Repository: neondatabase/neon Branch: main Commit: 39e4f234633f Files: 1874 Total size: 20.3 MB Directory structure: gitextract_l6m7x00t/ ├── .cargo/ │ └── config.toml ├── .config/ │ ├── hakari.toml │ └── nextest.toml ├── .dockerignore ├── .git-blame-ignore-revs ├── .gitattributes ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug-template.md │ │ ├── config.yml │ │ └── epic-template.md │ ├── actionlint.yml │ ├── actions/ │ │ ├── allure-report-generate/ │ │ │ └── action.yml │ │ ├── allure-report-store/ │ │ │ └── action.yml │ │ ├── download/ │ │ │ └── action.yml │ │ ├── neon-branch-create/ │ │ │ └── action.yml │ │ ├── neon-branch-delete/ │ │ │ └── action.yml │ │ ├── neon-project-create/ │ │ │ └── action.yml │ │ ├── neon-project-delete/ │ │ │ └── action.yml │ │ ├── prepare-for-subzero/ │ │ │ └── action.yml │ │ ├── run-python-test-set/ │ │ │ └── action.yml │ │ ├── save-coverage-data/ │ │ │ └── action.yml │ │ └── upload/ │ │ └── action.yml │ ├── file-filters.yaml │ ├── pull_request_template.md │ ├── scripts/ │ │ ├── generate_image_maps.py │ │ ├── lint-release-pr.sh │ │ ├── previous-releases.jq │ │ └── push_with_image_map.py │ └── workflows/ │ ├── _benchmarking_preparation.yml │ ├── _build-and-test-locally.yml │ ├── _check-codestyle-python.yml │ ├── _check-codestyle-rust.yml │ ├── _meta.yml │ ├── _push-to-container-registry.yml │ ├── actionlint.yml │ ├── approved-for-ci-run.yml │ ├── benchbase_tpcc.yml │ ├── benchmarking.yml │ ├── build-build-tools-image.yml │ ├── build-macos.yml │ ├── build_and_run_selected_test.yml │ ├── build_and_test.yml │ ├── build_and_test_fully.yml │ ├── build_and_test_with_sanitizers.yml │ ├── cargo-deny.yml │ ├── check-permissions.yml │ ├── cleanup-caches-by-a-branch.yml │ ├── cloud-extensions.yml │ ├── cloud-regress.yml │ ├── fast-forward.yml │ ├── force-test-extensions-upgrade.yml │ ├── ingest_benchmark.yml │ ├── label-for-external-users.yml │ ├── large_oltp_benchmark.yml │ ├── large_oltp_growth.yml │ ├── lint-release-pr.yml │ ├── 
neon_extra_builds.yml │ ├── periodic_pagebench.yml │ ├── pg-clients.yml │ ├── pin-build-tools-image.yml │ ├── pre-merge-checks.yml │ ├── proxy-benchmark.yml │ ├── random-ops-test.yml │ ├── regenerate-pg-setting.yml │ ├── release-compute.yml │ ├── release-notify.yml │ ├── release-proxy.yml │ ├── release-storage.yml │ ├── release.yml │ ├── report-workflow-stats-batch.yml │ └── trigger-e2e-tests.yml ├── .gitignore ├── .gitmodules ├── .neon_clippy_args ├── CODEOWNERS ├── CONTRIBUTING.md ├── Cargo.toml ├── Dockerfile ├── LICENSE ├── Makefile ├── NOTICE ├── README.md ├── build-tools/ │ ├── Dockerfile │ ├── package.json │ └── patches/ │ └── pgcopydbv017.patch ├── clippy.toml ├── compute/ │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── compute-node.Dockerfile │ ├── etc/ │ │ ├── README.md │ │ ├── ld.so.conf.d/ │ │ │ └── 00-neon.conf │ │ ├── neon_collector.jsonnet │ │ ├── neon_collector_autoscaling.jsonnet │ │ ├── pgbouncer.ini │ │ ├── postgres_exporter.yml │ │ ├── sql_exporter/ │ │ │ ├── checkpoints_req.17.sql │ │ │ ├── checkpoints_req.libsonnet │ │ │ ├── checkpoints_req.sql │ │ │ ├── checkpoints_timed.17.sql │ │ │ ├── checkpoints_timed.libsonnet │ │ │ ├── checkpoints_timed.sql │ │ │ ├── compute_backpressure_throttling_seconds_total.libsonnet │ │ │ ├── compute_backpressure_throttling_seconds_total.sql │ │ │ ├── compute_current_lsn.libsonnet │ │ │ ├── compute_current_lsn.sql │ │ │ ├── compute_getpage_max_inflight_stuck_time_ms.libsonnet │ │ │ ├── compute_getpage_stuck_requests_total.libsonnet │ │ │ ├── compute_logical_snapshot_files.libsonnet │ │ │ ├── compute_logical_snapshot_files.sql │ │ │ ├── compute_logical_snapshots_bytes.15.sql │ │ │ ├── compute_logical_snapshots_bytes.libsonnet │ │ │ ├── compute_logical_snapshots_bytes.sql │ │ │ ├── compute_max_connections.libsonnet │ │ │ ├── compute_max_connections.sql │ │ │ ├── compute_pg_oldest_frozen_xid_age.libsonnet │ │ │ ├── compute_pg_oldest_frozen_xid_age.sql │ │ │ ├── compute_pg_oldest_mxid_age.libsonnet │ │ │ ├── 
compute_pg_oldest_mxid_age.sql │ │ │ ├── compute_receive_lsn.libsonnet │ │ │ ├── compute_receive_lsn.sql │ │ │ ├── compute_subscriptions_count.libsonnet │ │ │ ├── compute_subscriptions_count.sql │ │ │ ├── connection_counts.libsonnet │ │ │ ├── connection_counts.sql │ │ │ ├── db_total_size.libsonnet │ │ │ ├── db_total_size.sql │ │ │ ├── file_cache_read_wait_seconds_bucket.libsonnet │ │ │ ├── file_cache_read_wait_seconds_bucket.sql │ │ │ ├── file_cache_read_wait_seconds_count.libsonnet │ │ │ ├── file_cache_read_wait_seconds_sum.libsonnet │ │ │ ├── file_cache_write_wait_seconds_bucket.libsonnet │ │ │ ├── file_cache_write_wait_seconds_bucket.sql │ │ │ ├── file_cache_write_wait_seconds_count.libsonnet │ │ │ ├── file_cache_write_wait_seconds_sum.libsonnet │ │ │ ├── getpage_prefetch_discards_total.libsonnet │ │ │ ├── getpage_prefetch_misses_total.libsonnet │ │ │ ├── getpage_prefetch_requests_total.libsonnet │ │ │ ├── getpage_prefetches_buffered.libsonnet │ │ │ ├── getpage_sync_requests_total.libsonnet │ │ │ ├── getpage_wait_seconds_bucket.libsonnet │ │ │ ├── getpage_wait_seconds_bucket.sql │ │ │ ├── getpage_wait_seconds_count.libsonnet │ │ │ ├── getpage_wait_seconds_sum.libsonnet │ │ │ ├── lfc_approximate_working_set_size.libsonnet │ │ │ ├── lfc_approximate_working_set_size.sql │ │ │ ├── lfc_approximate_working_set_size_windows.autoscaling.libsonnet │ │ │ ├── lfc_approximate_working_set_size_windows.autoscaling.sql │ │ │ ├── lfc_approximate_working_set_size_windows.libsonnet │ │ │ ├── lfc_approximate_working_set_size_windows.sql │ │ │ ├── lfc_cache_size_limit.libsonnet │ │ │ ├── lfc_cache_size_limit.sql │ │ │ ├── lfc_chunk_size.libsonnet │ │ │ ├── lfc_chunk_size.sql │ │ │ ├── lfc_hits.libsonnet │ │ │ ├── lfc_hits.sql │ │ │ ├── lfc_misses.libsonnet │ │ │ ├── lfc_misses.sql │ │ │ ├── lfc_used.libsonnet │ │ │ ├── lfc_used.sql │ │ │ ├── lfc_used_pages.libsonnet │ │ │ ├── lfc_used_pages.sql │ │ │ ├── lfc_writes.libsonnet │ │ │ ├── lfc_writes.sql │ │ │ ├── 
logical_slot_restart_lsn.libsonnet │ │ │ ├── logical_slot_restart_lsn.sql │ │ │ ├── max_cluster_size.libsonnet │ │ │ ├── max_cluster_size.sql │ │ │ ├── neon_perf_counters.sql │ │ │ ├── pageserver_disconnects_total.libsonnet │ │ │ ├── pageserver_open_requests.libsonnet │ │ │ ├── pageserver_requests_sent_total.libsonnet │ │ │ ├── pageserver_send_flushes_total.libsonnet │ │ │ ├── pg_stats_userdb.libsonnet │ │ │ ├── pg_stats_userdb.sql │ │ │ ├── replication_delay_bytes.libsonnet │ │ │ ├── replication_delay_bytes.sql │ │ │ ├── replication_delay_seconds.libsonnet │ │ │ ├── replication_delay_seconds.sql │ │ │ ├── retained_wal.libsonnet │ │ │ ├── retained_wal.sql │ │ │ ├── wal_is_lost.libsonnet │ │ │ └── wal_is_lost.sql │ │ └── sql_exporter.jsonnet │ ├── jsonnet/ │ │ └── neon.libsonnet │ ├── manifest.schema.json │ ├── manifest.yaml │ ├── patches/ │ │ ├── anon_v2.patch │ │ ├── cloud_regress_pg16.patch │ │ ├── cloud_regress_pg17.patch │ │ ├── contrib_pg16.patch │ │ ├── contrib_pg17.patch │ │ ├── duckdb_v113.patch │ │ ├── duckdb_v120.patch │ │ ├── onnxruntime.patch │ │ ├── pg_cron.patch │ │ ├── pg_duckdb_v031.patch │ │ ├── pg_graphql.patch │ │ ├── pg_hint_plan_v16.patch │ │ ├── pg_hint_plan_v17.patch │ │ ├── pg_repack.patch │ │ ├── pg_stat_statements_pg14-16.patch │ │ ├── pg_stat_statements_pg17.patch │ │ ├── pgaudit-parallel_workers-v14.patch │ │ ├── pgaudit-parallel_workers-v15.patch │ │ ├── pgaudit-parallel_workers-v16.patch │ │ ├── pgaudit-parallel_workers-v17.patch │ │ ├── pgvector.patch │ │ ├── plv8_v3.1.10.patch │ │ ├── plv8_v3.2.3.patch │ │ ├── postgres_fdw.patch │ │ └── rum.patch │ ├── vm-image-spec-bookworm.yaml │ └── vm-image-spec-bullseye.yaml ├── compute_tools/ │ ├── .dockerignore │ ├── .gitignore │ ├── Cargo.toml │ ├── README.md │ ├── rustfmt.toml │ ├── src/ │ │ ├── bin/ │ │ │ ├── compute_ctl.rs │ │ │ ├── fast_import/ │ │ │ │ ├── aws_s3_sync.rs │ │ │ │ ├── child_stdio_to_log.rs │ │ │ │ └── s3_uri.rs │ │ │ └── fast_import.rs │ │ ├── catalog.rs │ │ ├── checker.rs 
│ │ ├── communicator_socket_client.rs │ │ ├── compute.rs │ │ ├── compute_prewarm.rs │ │ ├── compute_promote.rs │ │ ├── config.rs │ │ ├── config_template/ │ │ │ ├── compute_audit_rsyslog_template.conf │ │ │ └── compute_rsyslog_postgres_export_template.conf │ │ ├── configurator.rs │ │ ├── disk_quota.rs │ │ ├── extension_server.rs │ │ ├── hadron_metrics.rs │ │ ├── http/ │ │ │ ├── extract/ │ │ │ │ ├── json.rs │ │ │ │ ├── mod.rs │ │ │ │ ├── path.rs │ │ │ │ ├── query.rs │ │ │ │ └── request_id.rs │ │ │ ├── headers.rs │ │ │ ├── middleware/ │ │ │ │ ├── authorize.rs │ │ │ │ ├── mod.rs │ │ │ │ └── request_id.rs │ │ │ ├── mod.rs │ │ │ ├── openapi_spec.yaml │ │ │ ├── routes/ │ │ │ │ ├── check_writability.rs │ │ │ │ ├── configure.rs │ │ │ │ ├── database_schema.rs │ │ │ │ ├── dbs_and_roles.rs │ │ │ │ ├── extension_server.rs │ │ │ │ ├── extensions.rs │ │ │ │ ├── failpoints.rs │ │ │ │ ├── grants.rs │ │ │ │ ├── hadron_liveness_probe.rs │ │ │ │ ├── insights.rs │ │ │ │ ├── lfc.rs │ │ │ │ ├── metrics.rs │ │ │ │ ├── metrics_json.rs │ │ │ │ ├── mod.rs │ │ │ │ ├── promote.rs │ │ │ │ ├── refresh_configuration.rs │ │ │ │ ├── status.rs │ │ │ │ └── terminate.rs │ │ │ └── server.rs │ │ ├── installed_extensions.rs │ │ ├── lib.rs │ │ ├── local_proxy.rs │ │ ├── logger.rs │ │ ├── lsn_lease.rs │ │ ├── metrics.rs │ │ ├── migration.rs │ │ ├── migrations/ │ │ │ ├── 0001-add_bypass_rls_to_privileged_role.sql │ │ │ ├── 0002-alter_roles.sql │ │ │ ├── 0003-grant_pg_create_subscription_to_privileged_role.sql │ │ │ ├── 0004-grant_pg_monitor_to_privileged_role.sql │ │ │ ├── 0005-grant_all_on_tables_to_privileged_role.sql │ │ │ ├── 0006-grant_all_on_sequences_to_privileged_role.sql │ │ │ ├── 0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql │ │ │ ├── 0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql │ │ │ ├── 0009-revoke_replication_for_previously_allowed_roles.sql │ │ │ ├── 0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql │ │ │ ├── 
0011-grant_pg_show_replication_origin_status_to_privileged_role.sql │ │ │ ├── 0012-grant_pg_signal_backend_to_privileged_role.sql │ │ │ └── tests/ │ │ │ ├── 0001-add_bypass_rls_to_privileged_role.sql │ │ │ ├── 0002-alter_roles.sql │ │ │ ├── 0003-grant_pg_create_subscription_to_privileged_role.sql │ │ │ ├── 0004-grant_pg_monitor_to_privileged_role.sql │ │ │ ├── 0005-grant_all_on_tables_to_privileged_role.sql │ │ │ ├── 0006-grant_all_on_sequences_to_privileged_role.sql │ │ │ ├── 0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql │ │ │ ├── 0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql │ │ │ ├── 0009-revoke_replication_for_previously_allowed_roles.sql │ │ │ ├── 0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql │ │ │ ├── 0011-grant_pg_show_replication_origin_status_to_privileged_role.sql │ │ │ └── 0012-grant_pg_signal_backend_to_privileged_role.sql │ │ ├── monitor.rs │ │ ├── params.rs │ │ ├── pg_helpers.rs │ │ ├── pg_isready.rs │ │ ├── pgbouncer.rs │ │ ├── rsyslog.rs │ │ ├── spec.rs │ │ ├── spec_apply.rs │ │ ├── sql/ │ │ │ ├── add_availabilitycheck_tables.sql │ │ │ ├── alter_databricks_reader_roles_timeout.sql │ │ │ ├── create_databricks_misc.sql │ │ │ ├── create_privileged_role.sql │ │ │ ├── default_grants.sql │ │ │ ├── drop_subscriptions.sql │ │ │ ├── finalize_drop_subscriptions.sql │ │ │ ├── pre_drop_role_revoke_privileges.sql │ │ │ ├── set_public_schema_owner.sql │ │ │ └── unset_template_for_drop_dbs.sql │ │ ├── swap.rs │ │ ├── sync_sk.rs │ │ └── tls.rs │ └── tests/ │ ├── README.md │ ├── cluster_spec.json │ ├── config_test.rs │ └── pg_helpers_tests.rs ├── control_plane/ │ ├── .gitignore │ ├── Cargo.toml │ ├── README.md │ ├── safekeepers.conf │ ├── simple.conf │ ├── src/ │ │ ├── background_process.rs │ │ ├── bin/ │ │ │ └── neon_local.rs │ │ ├── branch_mappings.rs │ │ ├── broker.rs │ │ ├── endpoint.rs │ │ ├── endpoint_storage.rs │ │ ├── lib.rs │ │ ├── local_env.rs │ │ ├── pageserver.rs │ │ ├── postgresql_conf.rs │ │ 
├── safekeeper.rs │ │ └── storage_controller.rs │ └── storcon_cli/ │ ├── Cargo.toml │ └── src/ │ └── main.rs ├── deny.toml ├── diesel.toml ├── docker-compose/ │ ├── README.md │ ├── compute_wrapper/ │ │ ├── Dockerfile │ │ ├── private-key.pem │ │ ├── public-key.der │ │ ├── public-key.pem │ │ ├── shell/ │ │ │ └── compute.sh │ │ └── var/ │ │ └── db/ │ │ └── postgres/ │ │ └── configs/ │ │ └── config.json │ ├── docker-compose.yml │ ├── docker_compose_test.sh │ ├── ext-src/ │ │ ├── README.md │ │ ├── alter_db.sh │ │ ├── h3-pg-src/ │ │ │ ├── neon-test.sh │ │ │ └── test-upgrade.sh │ │ ├── hll-src/ │ │ │ ├── regular-test.sh │ │ │ └── test-upgrade.sh │ │ ├── hypopg-src/ │ │ │ ├── regular-test.sh │ │ │ ├── test-upgrade.patch │ │ │ └── test-upgrade.sh │ │ ├── ip4r-src/ │ │ │ ├── regular-test.sh │ │ │ ├── test-upgrade.patch │ │ │ └── test-upgrade.sh │ │ ├── online_advisor-src/ │ │ │ ├── neon-test.sh │ │ │ └── regular-test.sh │ │ ├── pg_cron-src/ │ │ │ ├── regular-test.sh │ │ │ ├── test-upgrade.patch │ │ │ └── test-upgrade.sh │ │ ├── pg_graphql-src/ │ │ │ ├── neon-test.sh │ │ │ └── regular-test.sh │ │ ├── pg_hint_plan-src/ │ │ │ └── regular-test.sh │ │ ├── pg_ivm-src/ │ │ │ ├── regular-test.sh │ │ │ ├── regular.patch │ │ │ ├── test-upgrade.patch │ │ │ └── test-upgrade.sh │ │ ├── pg_jsonschema-src/ │ │ │ ├── Makefile │ │ │ ├── expected/ │ │ │ │ ├── jsonschema_edge_cases.out │ │ │ │ └── jsonschema_valid_api.out │ │ │ └── sql/ │ │ │ ├── jsonschema_edge_cases.sql │ │ │ └── jsonschema_valid_api.sql │ │ ├── pg_repack-src/ │ │ │ └── test-upgrade.sh │ │ ├── pg_roaringbitmap-src/ │ │ │ ├── regular-test.sh │ │ │ ├── test-upgrade.patch │ │ │ └── test-upgrade.sh │ │ ├── pg_semver-src/ │ │ │ ├── regular-test.sh │ │ │ ├── test-upgrade-v16.patch │ │ │ ├── test-upgrade-v17.patch │ │ │ └── test-upgrade.sh │ │ ├── pg_session_jwt-src/ │ │ │ ├── Makefile │ │ │ ├── expected/ │ │ │ │ └── basic_functions.out │ │ │ └── sql/ │ │ │ └── basic_functions.sql │ │ ├── pg_tiktoken-src/ │ │ │ ├── Makefile │ │ │ 
├── expected/ │ │ │ │ └── pg_tiktoken.out │ │ │ └── sql/ │ │ │ └── pg_tiktoken.sql │ │ ├── pg_uuidv7-src/ │ │ │ ├── regular-test.sh │ │ │ └── test-upgrade.sh │ │ ├── pgjwt-src/ │ │ │ ├── neon-test.sh │ │ │ ├── test-upgrade.patch │ │ │ └── test-upgrade.sh │ │ ├── pgrag-src/ │ │ │ ├── Makefile │ │ │ ├── expected/ │ │ │ │ ├── api_keys.out │ │ │ │ ├── basic_functions.out │ │ │ │ ├── chunking_functions.out │ │ │ │ ├── document_processing.out │ │ │ │ ├── embedding_api_functions.out │ │ │ │ ├── embedding_functions.out │ │ │ │ ├── text_processing.out │ │ │ │ └── voyageai_functions.out │ │ │ ├── regular-test.sh │ │ │ └── sql/ │ │ │ ├── api_keys.sql │ │ │ ├── basic_functions.sql │ │ │ ├── chunking_functions.sql │ │ │ ├── document_processing.sql │ │ │ ├── embedding_api_functions.sql │ │ │ ├── text_processing.sql │ │ │ └── voyageai_functions.sql │ │ ├── pgtap-src/ │ │ │ ├── regular-test.sh │ │ │ ├── test-upgrade.patch │ │ │ └── test-upgrade.sh │ │ ├── pgvector-src/ │ │ │ ├── regular-test.sh │ │ │ └── test-upgrade.sh │ │ ├── pgx_ulid-src/ │ │ │ ├── Makefile │ │ │ ├── expected/ │ │ │ │ ├── 00_ulid_generation.out │ │ │ │ ├── 01_ulid_conversions.out │ │ │ │ ├── 02_ulid_conversions.out │ │ │ │ └── 03_ulid_errors.out │ │ │ └── sql/ │ │ │ ├── 00_ulid_generation.sql │ │ │ ├── 01_ulid_conversions.sql │ │ │ ├── 02_ulid_conversions.sql │ │ │ └── 03_ulid_errors.sql │ │ ├── plv8-src/ │ │ │ ├── regular-test.sh │ │ │ └── test-upgrade.sh │ │ ├── postgis-src/ │ │ │ ├── README-Neon.md │ │ │ ├── neon-test.sh │ │ │ ├── postgis-common-v16.patch │ │ │ ├── postgis-common-v17.patch │ │ │ ├── postgis-regular-v16.patch │ │ │ ├── postgis-regular-v17.patch │ │ │ ├── raster_outdb_template.sql │ │ │ └── regular-test.sh │ │ ├── postgresql-unit-src/ │ │ │ ├── regular-test.sh │ │ │ └── test-upgrade.sh │ │ ├── prefix-src/ │ │ │ ├── regular-test.sh │ │ │ └── test-upgrade.sh │ │ ├── rag_bge_small_en_v15-src/ │ │ │ ├── Makefile │ │ │ ├── expected/ │ │ │ │ ├── basic_functions.out │ │ │ │ ├── 
basic_functions_enhanced.out │ │ │ │ ├── embedding_functions.out │ │ │ │ └── embedding_functions_enhanced.out │ │ │ └── sql/ │ │ │ ├── basic_functions.sql │ │ │ ├── basic_functions_enhanced.sql │ │ │ ├── embedding_functions.sql │ │ │ └── embedding_functions_enhanced.sql │ │ ├── rag_jina_reranker_v1_tiny_en-src/ │ │ │ ├── Makefile │ │ │ ├── expected/ │ │ │ │ ├── reranking_functions.out │ │ │ │ └── reranking_functions_enhanced.out │ │ │ └── sql/ │ │ │ ├── reranking_functions.sql │ │ │ └── reranking_functions_enhanced.sql │ │ └── rum-src/ │ │ ├── regular-test.sh │ │ ├── test-upgrade.patch │ │ └── test-upgrade.sh │ ├── pageserver_config/ │ │ ├── identity.toml │ │ └── pageserver.toml │ ├── run-tests.sh │ └── test_extensions_upgrade.sh ├── docs/ │ ├── .gitignore │ ├── SUMMARY.md │ ├── authentication.md │ ├── book.toml │ ├── consumption_metrics.md │ ├── core_changes.md │ ├── docker.md │ ├── error-handling.md │ ├── glossary.md │ ├── multitenancy.md │ ├── pageserver-compaction.md │ ├── pageserver-page-service.md │ ├── pageserver-pagecache.md │ ├── pageserver-processing-getpage.md │ ├── pageserver-processing-wal.md │ ├── pageserver-services.md │ ├── pageserver-storage.md │ ├── pageserver-tenant-migration.md │ ├── pageserver-thread-mgmt.md │ ├── pageserver-walredo.md │ ├── pageserver.md │ ├── rfcs/ │ │ ├── 001-cluster-size-limits.md │ │ ├── 002-storage.md │ │ ├── 003-laptop-cli.md │ │ ├── 004-durability.md │ │ ├── 005-zenith_local.md │ │ ├── 006-laptop-cli-v2-CLI.md │ │ ├── 006-laptop-cli-v2-repository-structure.md │ │ ├── 007-serverless-on-laptop.md │ │ ├── 008-push-pull.md │ │ ├── 009-snapshot-first-storage-cli.md │ │ ├── 009-snapshot-first-storage-pitr.md │ │ ├── 009-snapshot-first-storage.md │ │ ├── 010-storage_details.md │ │ ├── 011-retention-policy.md │ │ ├── 012-background-tasks.md │ │ ├── 013-term-history.md │ │ ├── 014-safekeepers-gossip.md │ │ ├── 014-storage-lsm.md │ │ ├── 015-storage-messaging.md │ │ ├── 016-connection-routing.md │ │ ├── 017-console-split.md │ │ 
├── 017-timeline-data-management.md │ │ ├── 018-storage-messaging-2.md │ │ ├── 019-tenant-timeline-lifecycles.md │ │ ├── 020-pageserver-s3-coordination.md │ │ ├── 021-metering.md │ │ ├── 022-pageserver-delete-from-s3.md │ │ ├── 023-the-state-of-pageserver-tenant-relocation.md │ │ ├── 024-extension-loading.md │ │ ├── 024-user-mgmt.md │ │ ├── 025-generation-numbers.md │ │ ├── 026-pageserver-s3-mvcc.md │ │ ├── 027-crash-consistent-layer-map-through-index-part.md │ │ ├── 028-pageserver-migration.md │ │ ├── 029-getpage-throttling.md │ │ ├── 029-pageserver-wal-disaster-recovery.md │ │ ├── 030-vectored-timeline-get.md │ │ ├── 031-sharding-static.md │ │ ├── 032-shard-splitting.md │ │ ├── 033-storage-controller-drain-and-fill.md │ │ ├── 034-ancestor-deletion.md │ │ ├── 035-safekeeper-dynamic-membership-change.md │ │ ├── 035-timeline-archive.md │ │ ├── 036-physical-replication.md │ │ ├── 037-storage-controller-restarts.md │ │ ├── 038-aux-file-v2.md │ │ ├── 038-independent-compute-release.md │ │ ├── 040-Endpoint-Persistent-Unlogged-Files-Storage.md │ │ ├── 040-profiling.md │ │ ├── 041-rel-sparse-keyspace.md │ │ ├── 041-sharded-ingest.md │ │ ├── 043-bottom-most-gc-compaction.md │ │ ├── 044-feature-flag.md │ │ ├── 2025-02-14-storage-controller.md │ │ ├── 2025-03-17-compute-prewarm.md │ │ ├── 2025-04-30-direct-io-for-pageserver.md │ │ ├── 2025-04-30-pageserver-concurrent-io-on-read-path.md │ │ ├── 2025-07-07-node-deletion-api-improvement.md │ │ ├── README.md │ │ └── YYYY-MM-DD-copy-me.md │ ├── safekeeper-protocol.md │ ├── separation-compute-storage.md │ ├── settings.md │ ├── sourcetree.md │ ├── storage_broker.md │ ├── storage_controller.md │ ├── synthetic-size.md │ ├── tools.md │ ├── updating-postgres.md │ └── walservice.md ├── endpoint_storage/ │ ├── Cargo.toml │ └── src/ │ ├── app.rs │ ├── claims.rs │ ├── lib.rs │ ├── main.rs │ └── openapi_spec.yml ├── libs/ │ ├── compute_api/ │ │ ├── Cargo.toml │ │ ├── src/ │ │ │ ├── lib.rs │ │ │ ├── privilege.rs │ │ │ ├── requests.rs │ │ │ 
├── responses.rs │ │ │ └── spec.rs │ │ └── tests/ │ │ └── cluster_spec.json │ ├── consumption_metrics/ │ │ ├── Cargo.toml │ │ └── src/ │ │ └── lib.rs │ ├── desim/ │ │ ├── Cargo.toml │ │ ├── README.md │ │ ├── src/ │ │ │ ├── chan.rs │ │ │ ├── executor.rs │ │ │ ├── lib.rs │ │ │ ├── network.rs │ │ │ ├── node_os.rs │ │ │ ├── options.rs │ │ │ ├── proto.rs │ │ │ ├── time.rs │ │ │ └── world.rs │ │ └── tests/ │ │ └── reliable_copy_test.rs │ ├── http-utils/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── endpoint.rs │ │ ├── error.rs │ │ ├── failpoints.rs │ │ ├── json.rs │ │ ├── lib.rs │ │ ├── request.rs │ │ ├── server.rs │ │ └── tls_certs.rs │ ├── metrics/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── hll.rs │ │ ├── launch_timestamp.rs │ │ ├── lib.rs │ │ ├── more_process_metrics.rs │ │ └── wrappers.rs │ ├── neon-shmem/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── hash/ │ │ │ ├── core.rs │ │ │ ├── entry.rs │ │ │ └── tests.rs │ │ ├── hash.rs │ │ ├── lib.rs │ │ ├── shmem.rs │ │ └── sync.rs │ ├── pageserver_api/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── config/ │ │ │ └── tests.rs │ │ ├── config.rs │ │ ├── controller_api.rs │ │ ├── key.rs │ │ ├── keyspace.rs │ │ ├── lib.rs │ │ ├── models/ │ │ │ ├── detach_ancestor.rs │ │ │ ├── partitioning.rs │ │ │ └── utilization.rs │ │ ├── models.rs │ │ ├── pagestream_api.rs │ │ ├── reltag.rs │ │ ├── shard.rs │ │ └── upcall_api.rs │ ├── postgres_backend/ │ │ ├── Cargo.toml │ │ ├── src/ │ │ │ └── lib.rs │ │ └── tests/ │ │ ├── cert.pem │ │ ├── key.pem │ │ └── simple_select.rs │ ├── postgres_connection/ │ │ ├── Cargo.toml │ │ └── src/ │ │ └── lib.rs │ ├── postgres_ffi/ │ │ ├── Cargo.toml │ │ ├── README.md │ │ ├── benches/ │ │ │ ├── README.md │ │ │ └── waldecoder.rs │ │ ├── bindgen_deps.h │ │ ├── build.rs │ │ ├── samples/ │ │ │ └── pg_hba.conf │ │ ├── src/ │ │ │ ├── controlfile_utils.rs │ │ │ ├── lib.rs │ │ │ ├── nonrelfile_utils.rs │ │ │ ├── pg_constants.rs │ │ │ ├── pg_constants_v14.rs │ │ │ ├── pg_constants_v15.rs │ │ │ ├── pg_constants_v16.rs │ │ │ ├── 
pg_constants_v17.rs │ │ │ ├── relfile_utils.rs │ │ │ ├── wal_craft_test_export.rs │ │ │ ├── wal_generator.rs │ │ │ ├── waldecoder_handler.rs │ │ │ ├── walrecord.rs │ │ │ └── xlog_utils.rs │ │ └── wal_craft/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── bin/ │ │ │ └── wal_craft.rs │ │ ├── lib.rs │ │ └── xlog_utils_test.rs │ ├── postgres_ffi_types/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── constants.rs │ │ ├── forknum.rs │ │ └── lib.rs │ ├── postgres_initdb/ │ │ ├── Cargo.toml │ │ └── src/ │ │ └── lib.rs │ ├── postgres_versioninfo/ │ │ ├── Cargo.toml │ │ └── src/ │ │ └── lib.rs │ ├── posthog_client_lite/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── background_loop.rs │ │ └── lib.rs │ ├── pq_proto/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── framed.rs │ │ └── lib.rs │ ├── proxy/ │ │ ├── README.md │ │ ├── json/ │ │ │ ├── Cargo.toml │ │ │ └── src/ │ │ │ ├── lib.rs │ │ │ ├── macros.rs │ │ │ ├── str.rs │ │ │ └── value.rs │ │ ├── postgres-protocol2/ │ │ │ ├── Cargo.toml │ │ │ └── src/ │ │ │ ├── authentication/ │ │ │ │ ├── mod.rs │ │ │ │ └── sasl.rs │ │ │ ├── escape/ │ │ │ │ ├── mod.rs │ │ │ │ └── test.rs │ │ │ ├── lib.rs │ │ │ ├── message/ │ │ │ │ ├── backend.rs │ │ │ │ ├── frontend.rs │ │ │ │ └── mod.rs │ │ │ ├── password/ │ │ │ │ ├── mod.rs │ │ │ │ └── test.rs │ │ │ └── types/ │ │ │ ├── mod.rs │ │ │ └── test.rs │ │ ├── postgres-types2/ │ │ │ ├── Cargo.toml │ │ │ └── src/ │ │ │ ├── lib.rs │ │ │ ├── private.rs │ │ │ └── type_gen.rs │ │ ├── subzero_core/ │ │ │ ├── .gitignore │ │ │ ├── Cargo.toml │ │ │ └── src/ │ │ │ └── lib.rs │ │ └── tokio-postgres2/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── cancel_query.rs │ │ ├── cancel_query_raw.rs │ │ ├── cancel_token.rs │ │ ├── client.rs │ │ ├── codec.rs │ │ ├── config.rs │ │ ├── connect.rs │ │ ├── connect_raw.rs │ │ ├── connect_socket.rs │ │ ├── connect_tls.rs │ │ ├── connection.rs │ │ ├── error/ │ │ │ ├── mod.rs │ │ │ └── sqlstate.rs │ │ ├── generic_client.rs │ │ ├── lib.rs │ │ ├── maybe_tls_stream.rs │ │ ├── prepare.rs │ │ ├── query.rs │ │ ├── row.rs 
│ │ ├── simple_query.rs │ │ ├── statement.rs │ │ ├── tls.rs │ │ ├── transaction.rs │ │ ├── transaction_builder.rs │ │ └── types.rs │ ├── remote_storage/ │ │ ├── Cargo.toml │ │ ├── src/ │ │ │ ├── azure_blob.rs │ │ │ ├── config.rs │ │ │ ├── error.rs │ │ │ ├── gcs_bucket.rs │ │ │ ├── lib.rs │ │ │ ├── local_fs.rs │ │ │ ├── metrics.rs │ │ │ ├── s3_bucket.rs │ │ │ ├── simulate_failures.rs │ │ │ └── support.rs │ │ └── tests/ │ │ ├── common/ │ │ │ ├── mod.rs │ │ │ └── tests.rs │ │ ├── test_real_azure.rs │ │ ├── test_real_gcs.rs │ │ └── test_real_s3.rs │ ├── safekeeper_api/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── lib.rs │ │ ├── membership.rs │ │ └── models.rs │ ├── tenant_size_model/ │ │ ├── .gitignore │ │ ├── Cargo.toml │ │ ├── Makefile │ │ ├── README.md │ │ ├── src/ │ │ │ ├── calculation.rs │ │ │ ├── lib.rs │ │ │ └── svg.rs │ │ └── tests/ │ │ └── tests.rs │ ├── tracing-utils/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── http.rs │ │ ├── lib.rs │ │ └── perf_span.rs │ ├── utils/ │ │ ├── Cargo.toml │ │ ├── benches/ │ │ │ ├── README.md │ │ │ └── benchmarks.rs │ │ ├── scripts/ │ │ │ ├── restore_from_wal.sh │ │ │ └── restore_from_wal_initdb.sh │ │ ├── src/ │ │ │ ├── auth.rs │ │ │ ├── backoff.rs │ │ │ ├── bin_ser.rs │ │ │ ├── circuit_breaker.rs │ │ │ ├── completion.rs │ │ │ ├── crashsafe.rs │ │ │ ├── elapsed_accum.rs │ │ │ ├── env.rs │ │ │ ├── error.rs │ │ │ ├── failpoint_support.rs │ │ │ ├── fs_ext/ │ │ │ │ └── rename_noreplace.rs │ │ │ ├── fs_ext.rs │ │ │ ├── generation.rs │ │ │ ├── guard_arc_swap.rs │ │ │ ├── hex.rs │ │ │ ├── id.rs │ │ │ ├── ip_address.rs │ │ │ ├── leaky_bucket.rs │ │ │ ├── lib.rs │ │ │ ├── linux_socket_ioctl.rs │ │ │ ├── lock_file.rs │ │ │ ├── logging.rs │ │ │ ├── lsn.rs │ │ │ ├── measured_stream.rs │ │ │ ├── metrics_collector.rs │ │ │ ├── pageserver_feedback.rs │ │ │ ├── pid_file.rs │ │ │ ├── poison.rs │ │ │ ├── postgres_client.rs │ │ │ ├── rate_limit.rs │ │ │ ├── sentry_init.rs │ │ │ ├── seqwait.rs │ │ │ ├── serde_percent.rs │ │ │ ├── serde_regex.rs │ │ │ ├── 
serde_system_time.rs │ │ │ ├── shard.rs │ │ │ ├── signals.rs │ │ │ ├── simple_rcu.rs │ │ │ ├── span.rs │ │ │ ├── sync/ │ │ │ │ ├── duplex/ │ │ │ │ │ └── mpsc.rs │ │ │ │ ├── duplex.rs │ │ │ │ ├── gate.rs │ │ │ │ ├── heavier_once_cell.rs │ │ │ │ └── spsc_fold.rs │ │ │ ├── sync.rs │ │ │ ├── tcp_listener.rs │ │ │ ├── timeout.rs │ │ │ ├── toml_edit_ext.rs │ │ │ ├── tracing_span_assert.rs │ │ │ ├── try_rcu.rs │ │ │ ├── vec_map.rs │ │ │ ├── yielding_loop.rs │ │ │ └── zstd.rs │ │ └── tests/ │ │ └── bin_ser_test.rs │ ├── vm_monitor/ │ │ ├── Cargo.toml │ │ ├── README.md │ │ └── src/ │ │ ├── bin/ │ │ │ └── monitor.rs │ │ ├── cgroup.rs │ │ ├── dispatcher.rs │ │ ├── filecache.rs │ │ ├── lib.rs │ │ ├── protocol.rs │ │ └── runner.rs │ ├── wal_decoder/ │ │ ├── Cargo.toml │ │ ├── benches/ │ │ │ ├── README.md │ │ │ └── bench_interpret_wal.rs │ │ ├── build.rs │ │ ├── proto/ │ │ │ └── interpreted_wal.proto │ │ └── src/ │ │ ├── decoder.rs │ │ ├── lib.rs │ │ ├── models/ │ │ │ ├── record.rs │ │ │ └── value.rs │ │ ├── models.rs │ │ ├── serialized_batch.rs │ │ └── wire_format.rs │ └── walproposer/ │ ├── Cargo.toml │ ├── bindgen_deps.h │ ├── build.rs │ └── src/ │ ├── api_bindings.rs │ ├── lib.rs │ └── walproposer.rs ├── pageserver/ │ ├── Cargo.toml │ ├── benches/ │ │ ├── README.md │ │ ├── bench_ingest.rs │ │ ├── bench_layer_map.rs │ │ ├── bench_metrics.rs │ │ ├── bench_walredo.rs │ │ ├── large-layer-map-layernames.txt │ │ ├── odd-brook-layernames.txt │ │ └── upload_queue.rs │ ├── client/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── lib.rs │ │ ├── mgmt_api/ │ │ │ └── util.rs │ │ ├── mgmt_api.rs │ │ └── page_service.rs │ ├── client_grpc/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── client.rs │ │ ├── lib.rs │ │ ├── pool.rs │ │ └── retry.rs │ ├── compaction/ │ │ ├── Cargo.toml │ │ ├── TODO.md │ │ ├── src/ │ │ │ ├── bin/ │ │ │ │ └── compaction-simulator.rs │ │ │ ├── compact_tiered.rs │ │ │ ├── helpers.rs │ │ │ ├── identify_levels.rs │ │ │ ├── interface.rs │ │ │ ├── lib.rs │ │ │ ├── simulator/ │ │ │ │ └── 
draw.rs │ │ │ └── simulator.rs │ │ └── tests/ │ │ └── tests.rs │ ├── ctl/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── download_remote_object.rs │ │ ├── draw_timeline_dir.rs │ │ ├── index_part.rs │ │ ├── key.rs │ │ ├── layer_map_analyzer.rs │ │ ├── layers.rs │ │ ├── main.rs │ │ └── page_trace.rs │ ├── page_api/ │ │ ├── Cargo.toml │ │ ├── build.rs │ │ ├── proto/ │ │ │ └── page_service.proto │ │ └── src/ │ │ ├── client.rs │ │ ├── lib.rs │ │ ├── model.rs │ │ └── split.rs │ ├── pagebench/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── cmd/ │ │ │ ├── aux_files.rs │ │ │ ├── basebackup.rs │ │ │ ├── getpage_latest_lsn.rs │ │ │ ├── idle_streams.rs │ │ │ ├── ondemand_download_churn.rs │ │ │ └── trigger_initial_size_calculation.rs │ │ ├── main.rs │ │ └── util/ │ │ ├── cli/ │ │ │ └── targets.rs │ │ ├── request_stats.rs │ │ └── tokio_thread_local_stats.rs │ ├── src/ │ │ ├── assert_u64_eq_usize.rs │ │ ├── auth.rs │ │ ├── aux_file.rs │ │ ├── basebackup.rs │ │ ├── basebackup_cache.rs │ │ ├── bin/ │ │ │ ├── pageserver.rs │ │ │ └── test_helper_slow_client_reads.rs │ │ ├── config/ │ │ │ └── ignored_fields.rs │ │ ├── config.rs │ │ ├── consumption_metrics/ │ │ │ ├── disk_cache.rs │ │ │ ├── metrics/ │ │ │ │ └── tests.rs │ │ │ ├── metrics.rs │ │ │ └── upload.rs │ │ ├── consumption_metrics.rs │ │ ├── context.rs │ │ ├── controller_upcall_client.rs │ │ ├── deletion_queue/ │ │ │ ├── deleter.rs │ │ │ ├── list_writer.rs │ │ │ └── validator.rs │ │ ├── deletion_queue.rs │ │ ├── disk_usage_eviction_task.rs │ │ ├── feature_resolver.rs │ │ ├── http/ │ │ │ ├── mod.rs │ │ │ ├── openapi_spec.yml │ │ │ └── routes.rs │ │ ├── import_datadir.rs │ │ ├── l0_flush.rs │ │ ├── lib.rs │ │ ├── metrics.rs │ │ ├── page_cache.rs │ │ ├── page_service.rs │ │ ├── pgdatadir_mapping.rs │ │ ├── span.rs │ │ ├── statvfs.rs │ │ ├── task_mgr.rs │ │ ├── tenant/ │ │ │ ├── blob_io.rs │ │ │ ├── block_io.rs │ │ │ ├── checks.rs │ │ │ ├── config.rs │ │ │ ├── debug.rs │ │ │ ├── disk_btree.rs │ │ │ ├── disk_btree_test_data.rs │ │ │ ├── 
ephemeral_file.rs │ │ │ ├── gc_block.rs │ │ │ ├── gc_result.rs │ │ │ ├── layer_map/ │ │ │ │ ├── historic_layer_coverage.rs │ │ │ │ └── layer_coverage.rs │ │ │ ├── layer_map.rs │ │ │ ├── metadata.rs │ │ │ ├── mgr.rs │ │ │ ├── remote_timeline_client/ │ │ │ │ ├── download.rs │ │ │ │ ├── index.rs │ │ │ │ ├── manifest.rs │ │ │ │ └── upload.rs │ │ │ ├── remote_timeline_client.rs │ │ │ ├── secondary/ │ │ │ │ ├── downloader.rs │ │ │ │ ├── heatmap.rs │ │ │ │ ├── heatmap_uploader.rs │ │ │ │ └── scheduler.rs │ │ │ ├── secondary.rs │ │ │ ├── size.rs │ │ │ ├── storage_layer/ │ │ │ │ ├── batch_split_writer.rs │ │ │ │ ├── delta_layer.rs │ │ │ │ ├── errors.rs │ │ │ │ ├── filter_iterator.rs │ │ │ │ ├── image_layer.rs │ │ │ │ ├── inmemory_layer/ │ │ │ │ │ └── vectored_dio_read.rs │ │ │ │ ├── inmemory_layer.rs │ │ │ │ ├── layer/ │ │ │ │ │ ├── failpoints.rs │ │ │ │ │ └── tests.rs │ │ │ │ ├── layer.rs │ │ │ │ ├── layer_desc.rs │ │ │ │ ├── layer_name.rs │ │ │ │ └── merge_iterator.rs │ │ │ ├── storage_layer.rs │ │ │ ├── tasks.rs │ │ │ ├── throttle.rs │ │ │ ├── timeline/ │ │ │ │ ├── analysis.rs │ │ │ │ ├── compaction.rs │ │ │ │ ├── delete.rs │ │ │ │ ├── detach_ancestor.rs │ │ │ │ ├── eviction_task.rs │ │ │ │ ├── handle.rs │ │ │ │ ├── heatmap_layers_downloader.rs │ │ │ │ ├── import_pgdata/ │ │ │ │ │ ├── flow.rs │ │ │ │ │ ├── importbucket_client.rs │ │ │ │ │ ├── importbucket_format.rs │ │ │ │ │ └── index_part_format.rs │ │ │ │ ├── import_pgdata.rs │ │ │ │ ├── init.rs │ │ │ │ ├── layer_manager.rs │ │ │ │ ├── logical_size.rs │ │ │ │ ├── offload.rs │ │ │ │ ├── span.rs │ │ │ │ ├── uninit.rs │ │ │ │ ├── walreceiver/ │ │ │ │ │ ├── connection_manager.rs │ │ │ │ │ └── walreceiver_connection.rs │ │ │ │ └── walreceiver.rs │ │ │ ├── timeline.rs │ │ │ ├── upload_queue.rs │ │ │ └── vectored_blob_io.rs │ │ ├── tenant.rs │ │ ├── utilization.rs │ │ ├── virtual_file/ │ │ │ ├── io_engine/ │ │ │ │ └── tokio_epoll_uring_ext.rs │ │ │ ├── io_engine.rs │ │ │ ├── metadata.rs │ │ │ ├── open_options.rs │ │ │ ├── 
owned_buffers_io/ │ │ │ │ ├── aligned_buffer/ │ │ │ │ │ ├── alignment.rs │ │ │ │ │ ├── buffer.rs │ │ │ │ │ ├── buffer_mut.rs │ │ │ │ │ ├── raw.rs │ │ │ │ │ └── slice.rs │ │ │ │ ├── aligned_buffer.rs │ │ │ │ ├── io_buf_aligned.rs │ │ │ │ ├── io_buf_ext.rs │ │ │ │ ├── slice.rs │ │ │ │ ├── write/ │ │ │ │ │ └── flush.rs │ │ │ │ └── write.rs │ │ │ └── temporary.rs │ │ ├── virtual_file.rs │ │ ├── walingest.rs │ │ ├── walredo/ │ │ │ ├── apply_neon.rs │ │ │ ├── process/ │ │ │ │ ├── no_leak_child.rs │ │ │ │ └── protocol.rs │ │ │ └── process.rs │ │ └── walredo.rs │ └── test_data/ │ ├── indices/ │ │ └── mixed_workload/ │ │ ├── README.md │ │ └── index_part.json │ ├── short_v14_redo.page │ └── sk_wal_segment_from_pgbench/ │ ├── 000000010000000000000001.zst │ └── initdb.tar.zst ├── pgxn/ │ ├── .dir-locals.el │ ├── .editorconfig │ ├── Makefile │ ├── neon/ │ │ ├── Makefile │ │ ├── README.md │ │ ├── bitmap.h │ │ ├── communicator/ │ │ │ ├── .gitignore │ │ │ ├── Cargo.toml │ │ │ ├── README.md │ │ │ ├── build.rs │ │ │ ├── cbindgen.toml │ │ │ └── src/ │ │ │ ├── lib.rs │ │ │ └── worker_process/ │ │ │ ├── callbacks.rs │ │ │ ├── control_socket.rs │ │ │ ├── lfc_metrics.rs │ │ │ ├── logging.rs │ │ │ ├── main_loop.rs │ │ │ ├── mod.rs │ │ │ └── worker_interface.rs │ │ ├── communicator.c │ │ ├── communicator.h │ │ ├── communicator_process.c │ │ ├── communicator_process.h │ │ ├── extension_server.c │ │ ├── extension_server.h │ │ ├── file_cache.c │ │ ├── file_cache.h │ │ ├── hll.c │ │ ├── hll.h │ │ ├── libpagestore.c │ │ ├── libpqwalproposer.h │ │ ├── logical_replication_monitor.c │ │ ├── logical_replication_monitor.h │ │ ├── neon--1.0--1.1.sql │ │ ├── neon--1.0.sql │ │ ├── neon--1.1--1.0.sql │ │ ├── neon--1.1--1.2.sql │ │ ├── neon--1.2--1.1.sql │ │ ├── neon--1.2--1.3.sql │ │ ├── neon--1.3--1.2.sql │ │ ├── neon--1.3--1.4.sql │ │ ├── neon--1.4--1.3.sql │ │ ├── neon--1.4--1.5.sql │ │ ├── neon--1.5--1.4.sql │ │ ├── neon--1.5--1.6.sql │ │ ├── neon--1.6--1.5.sql │ │ ├── neon.c │ │ ├── neon.control │ 
│ ├── neon.h │ │ ├── neon_ddl_handler.c │ │ ├── neon_ddl_handler.h │ │ ├── neon_lwlsncache.c │ │ ├── neon_lwlsncache.h │ │ ├── neon_perf_counters.c │ │ ├── neon_perf_counters.h │ │ ├── neon_pgversioncompat.c │ │ ├── neon_pgversioncompat.h │ │ ├── neon_utils.c │ │ ├── neon_utils.h │ │ ├── neon_walreader.c │ │ ├── neon_walreader.h │ │ ├── pagestore_client.h │ │ ├── pagestore_smgr.c │ │ ├── relsize_cache.c │ │ ├── unstable_extensions.c │ │ ├── unstable_extensions.h │ │ ├── walproposer.c │ │ ├── walproposer.h │ │ ├── walproposer_compat.c │ │ ├── walproposer_pg.c │ │ ├── walsender_hooks.c │ │ └── walsender_hooks.h │ ├── neon_rmgr/ │ │ ├── Makefile │ │ ├── neon_rmgr.c │ │ ├── neon_rmgr.control │ │ ├── neon_rmgr.h │ │ ├── neon_rmgr_decode.c │ │ └── neon_rmgr_desc.c │ ├── neon_test_utils/ │ │ ├── Makefile │ │ ├── neon_test_utils--1.3.sql │ │ ├── neon_test_utils.control │ │ └── neontest.c │ ├── neon_utils/ │ │ ├── Makefile │ │ ├── neon_utils--1.0.sql │ │ ├── neon_utils.c │ │ └── neon_utils.control │ └── neon_walredo/ │ ├── Makefile │ ├── inmem_smgr.c │ ├── inmem_smgr.h │ ├── neon_seccomp.h │ ├── seccomp.c │ └── walredoproc.c ├── postgres.mk ├── pre-commit.py ├── proxy/ │ ├── Cargo.toml │ ├── README.md │ └── src/ │ ├── auth/ │ │ ├── backend/ │ │ │ ├── classic.rs │ │ │ ├── console_redirect.rs │ │ │ ├── hacks.rs │ │ │ ├── jwt.rs │ │ │ ├── local.rs │ │ │ └── mod.rs │ │ ├── credentials.rs │ │ ├── flow.rs │ │ ├── mod.rs │ │ └── password_hack.rs │ ├── batch.rs │ ├── bin/ │ │ ├── local_proxy.rs │ │ ├── pg_sni_router.rs │ │ └── proxy.rs │ ├── binary/ │ │ ├── local_proxy.rs │ │ ├── mod.rs │ │ ├── pg_sni_router.rs │ │ └── proxy.rs │ ├── cache/ │ │ ├── common.rs │ │ ├── mod.rs │ │ ├── node_info.rs │ │ └── project_info.rs │ ├── cancellation.rs │ ├── compute/ │ │ ├── mod.rs │ │ └── tls.rs │ ├── compute_ctl/ │ │ └── mod.rs │ ├── config.rs │ ├── console_redirect_proxy.rs │ ├── context/ │ │ ├── mod.rs │ │ └── parquet.rs │ ├── control_plane/ │ │ ├── client/ │ │ │ ├── cplane_proxy_v1.rs │ │ │ 
├── mock.rs │ │ │ └── mod.rs │ │ ├── errors.rs │ │ ├── messages.rs │ │ ├── mgmt.rs │ │ └── mod.rs │ ├── error.rs │ ├── ext.rs │ ├── http/ │ │ ├── health_server.rs │ │ └── mod.rs │ ├── intern.rs │ ├── jemalloc.rs │ ├── lib.rs │ ├── logging.rs │ ├── metrics.rs │ ├── parse.rs │ ├── pglb/ │ │ ├── copy_bidirectional.rs │ │ ├── handshake.rs │ │ ├── inprocess.rs │ │ ├── mod.rs │ │ └── passthrough.rs │ ├── pqproto.rs │ ├── protocol2.rs │ ├── proxy/ │ │ ├── connect_auth.rs │ │ ├── connect_compute.rs │ │ ├── mod.rs │ │ ├── retry.rs │ │ ├── tests/ │ │ │ ├── mitm.rs │ │ │ └── mod.rs │ │ └── wake_compute.rs │ ├── rate_limiter/ │ │ ├── leaky_bucket.rs │ │ ├── limit_algorithm/ │ │ │ └── aimd.rs │ │ ├── limit_algorithm.rs │ │ ├── limiter.rs │ │ └── mod.rs │ ├── redis/ │ │ ├── connection_with_credentials_provider.rs │ │ ├── elasticache.rs │ │ ├── keys.rs │ │ ├── kv_ops.rs │ │ ├── mod.rs │ │ └── notifications.rs │ ├── sasl/ │ │ ├── channel_binding.rs │ │ ├── messages.rs │ │ ├── mod.rs │ │ └── stream.rs │ ├── scram/ │ │ ├── cache.rs │ │ ├── countmin.rs │ │ ├── exchange.rs │ │ ├── key.rs │ │ ├── messages.rs │ │ ├── mod.rs │ │ ├── pbkdf2.rs │ │ ├── secret.rs │ │ ├── signature.rs │ │ └── threadpool.rs │ ├── serverless/ │ │ ├── backend.rs │ │ ├── cancel_set.rs │ │ ├── conn_pool.rs │ │ ├── conn_pool_lib.rs │ │ ├── error.rs │ │ ├── http_conn_pool.rs │ │ ├── http_util.rs │ │ ├── json.rs │ │ ├── local_conn_pool.rs │ │ ├── mod.rs │ │ ├── rest.rs │ │ ├── sql_over_http.rs │ │ └── websocket.rs │ ├── signals.rs │ ├── stream.rs │ ├── tls/ │ │ ├── client_config.rs │ │ ├── mod.rs │ │ ├── postgres_rustls.rs │ │ └── server_config.rs │ ├── types.rs │ ├── url.rs │ ├── usage_metrics.rs │ ├── util.rs │ └── waiters.rs ├── pyproject.toml ├── pytest.ini ├── run_clippy.sh ├── rust-toolchain.toml ├── safekeeper/ │ ├── Cargo.toml │ ├── benches/ │ │ ├── README.md │ │ └── receive_wal.rs │ ├── client/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── lib.rs │ │ └── mgmt_api.rs │ ├── spec/ │ │ ├── .gitignore │ │ ├── 
MCProposerAcceptorReconfig.tla │ │ ├── MCProposerAcceptorStatic.tla │ │ ├── ProposerAcceptorReconfig.tla │ │ ├── ProposerAcceptorStatic.tla │ │ ├── modelcheck.sh │ │ ├── models/ │ │ │ ├── MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg │ │ │ ├── MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg │ │ │ ├── MCProposerAcceptorReconfig_p2_a3_t2_l2_c3.cfg │ │ │ ├── MCProposerAcceptorReconfig_p2_a4_t2_l2_c3.cfg │ │ │ ├── MCProposerAcceptorStatic_p2_a3_t2_l2.cfg │ │ │ ├── MCProposerAcceptorStatic_p2_a3_t3_l2.cfg │ │ │ ├── MCProposerAcceptorStatic_p2_a3_t3_l3.cfg │ │ │ ├── MCProposerAcceptorStatic_p2_a3_t4_l4.cfg │ │ │ ├── MCProposerAcceptorStatic_p2_a5_t2_l2.cfg │ │ │ ├── MCProposerAcceptorStatic_p2_a5_t3_l3.cfg │ │ │ └── MCProposerAcceptorStatic_p2_a5_t4_l3.cfg │ │ ├── readme.md │ │ ├── remove_interm_progress.awk │ │ ├── remove_interm_progress.sh │ │ └── tlc-results/ │ │ ├── MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg-2024-12-11--04-24-12.log │ │ ├── MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg-2024-12-11--04-26-11.log │ │ ├── MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log │ │ ├── MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log │ │ ├── MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log │ │ ├── MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log │ │ ├── MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log │ │ └── MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log │ ├── src/ │ │ ├── auth.rs │ │ ├── bin/ │ │ │ └── safekeeper.rs │ │ ├── broker.rs │ │ ├── control_file.rs │ │ ├── control_file_upgrade.rs │ │ ├── copy_timeline.rs │ │ ├── debug_dump.rs │ │ ├── hadron.rs │ │ ├── handler.rs │ │ ├── http/ │ │ │ ├── mod.rs │ │ │ ├── openapi_spec.yaml │ │ │ └── 
routes.rs │ │ ├── lib.rs │ │ ├── metrics.rs │ │ ├── patch_control_file.rs │ │ ├── pull_timeline.rs │ │ ├── rate_limit.rs │ │ ├── receive_wal.rs │ │ ├── recovery.rs │ │ ├── remove_wal.rs │ │ ├── safekeeper.rs │ │ ├── send_interpreted_wal.rs │ │ ├── send_wal.rs │ │ ├── state.rs │ │ ├── test_utils.rs │ │ ├── timeline.rs │ │ ├── timeline_eviction.rs │ │ ├── timeline_guard.rs │ │ ├── timeline_manager.rs │ │ ├── timelines_global_map.rs │ │ ├── timelines_set.rs │ │ ├── wal_backup.rs │ │ ├── wal_backup_partial.rs │ │ ├── wal_reader_stream.rs │ │ ├── wal_service.rs │ │ └── wal_storage.rs │ └── tests/ │ ├── misc_test.rs │ ├── random_test.rs │ ├── simple_test.rs │ └── walproposer_sim/ │ ├── block_storage.rs │ ├── log.rs │ ├── mod.rs │ ├── safekeeper.rs │ ├── safekeeper_disk.rs │ ├── simulation.rs │ ├── simulation_logs.rs │ ├── walproposer_api.rs │ └── walproposer_disk.rs ├── scripts/ │ ├── benchmark_durations.py │ ├── check_allowed_errors.sh │ ├── comment-test-report.js │ ├── coverage │ ├── download_basebackup.py │ ├── force_layer_download.py │ ├── generate_and_push_perf_report.sh │ ├── ingest_perf_test_result.py │ ├── ingest_regress_test_result-new-format.py │ ├── ninstall.sh │ ├── perf_report_template.html │ ├── proxy_bench_results_ingest.py │ ├── ps_ec2_setup_instance_store │ ├── pysync │ ├── pytest │ ├── reformat │ ├── sk_cleanup_tenants/ │ │ ├── readme.md │ │ ├── remote.yaml │ │ └── script.py │ └── sk_collect_dumps/ │ ├── .gitignore │ ├── ansible.cfg │ ├── pyproject.toml │ ├── readme.md │ ├── remote.yaml │ ├── ssh.cfg │ └── upload.sh ├── storage_broker/ │ ├── Cargo.toml │ ├── benches/ │ │ └── rps.rs │ ├── build.rs │ ├── proto/ │ │ ├── .gitignore │ │ └── broker.proto │ └── src/ │ ├── bin/ │ │ └── storage_broker.rs │ ├── lib.rs │ └── metrics.rs ├── storage_controller/ │ ├── Cargo.toml │ ├── client/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── control_api.rs │ │ └── lib.rs │ ├── migrations/ │ │ ├── .keep │ │ ├── 00000000000000_diesel_initial_setup/ │ │ │ ├── down.sql │ │ │ └── 
up.sql │ │ ├── 2024-01-07-211257_create_tenant_shards/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2024-01-07-212945_create_nodes/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2024-02-29-094122_generations_null/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2024-03-18-184429_rename_policy/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2024-03-27-133204_tenant_policies/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2024-07-23-191537_create_metadata_health/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2024-07-26-140924_create_leader/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2024-08-23-102952_safekeepers/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2024-08-23-170149_tenant_id_index/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2024-08-27-184400_pageserver_az/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2024-08-28-150530_pageserver_az_not_null/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2024-09-05-104500_tenant_shard_preferred_az/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2024-12-12-212515_safekeepers_scheduling_policy/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2025-01-09-160454_safekeepers_remove_active/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2025-01-15-181207_safekeepers_disabled_to_pause/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2025-02-11-144848_pageserver_use_https/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2025-02-14-160526_safekeeper_timelines/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2025-02-28-141741_safekeeper_use_https/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2025-03-18-103700_timeline_imports/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2025-06-01-201442_add_lifecycle_to_nodes/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2025-06-17-082247_pageserver_grpc_addr/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2025-07-02-170751_safekeeper_default_no_pause/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ ├── 2025-07-08-114340_sk_set_notified_generation/ │ │ │ ├── down.sql │ │ │ └── up.sql │ │ └── 2025-07-17-000001_hadron_safekeepers/ │ │ ├── down.sql │ │ 
└── up.sql │ └── src/ │ ├── auth.rs │ ├── background_node_operations.rs │ ├── compute_hook.rs │ ├── hadron_utils.rs │ ├── heartbeater.rs │ ├── http.rs │ ├── id_lock_map.rs │ ├── leadership.rs │ ├── lib.rs │ ├── main.rs │ ├── metrics.rs │ ├── node.rs │ ├── operation_utils.rs │ ├── pageserver_client.rs │ ├── peer_client.rs │ ├── persistence/ │ │ └── split_state.rs │ ├── persistence.rs │ ├── reconciler.rs │ ├── safekeeper.rs │ ├── safekeeper_client.rs │ ├── scheduler.rs │ ├── schema.rs │ ├── service/ │ │ ├── chaos_injector.rs │ │ ├── feature_flag.rs │ │ ├── safekeeper_reconciler.rs │ │ ├── safekeeper_service.rs │ │ └── tenant_shard_iterator.rs │ ├── service.rs │ ├── tenant_shard.rs │ └── timeline_import.rs ├── storage_scrubber/ │ ├── Cargo.toml │ ├── README.md │ └── src/ │ ├── checks.rs │ ├── cloud_admin_api.rs │ ├── find_large_objects.rs │ ├── garbage.rs │ ├── lib.rs │ ├── main.rs │ ├── metadata_stream.rs │ ├── pageserver_physical_gc.rs │ ├── scan_pageserver_metadata.rs │ ├── scan_safekeeper_metadata.rs │ └── tenant_snapshot.rs ├── test_runner/ │ ├── README.md │ ├── bin/ │ │ └── neon_local_create_deep_l0_stack.py │ ├── cloud_regress/ │ │ ├── README.md │ │ └── test_cloud_regress.py │ ├── conftest.py │ ├── fixtures/ │ │ ├── __init__.py │ │ ├── auth_tokens.py │ │ ├── benchmark_fixture.py │ │ ├── common_types.py │ │ ├── compare_fixtures.py │ │ ├── compute_migrations.py │ │ ├── compute_reconfigure.py │ │ ├── endpoint/ │ │ │ ├── __init__.py │ │ │ └── http.py │ │ ├── fast_import.py │ │ ├── h2server.py │ │ ├── httpserver.py │ │ ├── log_helper.py │ │ ├── metrics.py │ │ ├── neon_api.py │ │ ├── neon_cli.py │ │ ├── neon_fixtures.py │ │ ├── overlayfs.py │ │ ├── pageserver/ │ │ │ ├── __init__.py │ │ │ ├── allowed_errors.py │ │ │ ├── common_types.py │ │ │ ├── http.py │ │ │ ├── makelayers/ │ │ │ │ ├── __init__.py │ │ │ │ └── l0stack.py │ │ │ ├── many_tenants.py │ │ │ ├── remote_storage.py │ │ │ └── utils.py │ │ ├── parametrize.py │ │ ├── paths.py │ │ ├── pg_config.py │ │ ├── 
pg_stats.py │ │ ├── pg_version.py │ │ ├── port_distributor.py │ │ ├── remote_storage.py │ │ ├── reruns.py │ │ ├── safekeeper/ │ │ │ ├── __init__.py │ │ │ ├── http.py │ │ │ └── utils.py │ │ ├── safekeeper_utils.py │ │ ├── slow.py │ │ ├── storage_controller_proxy.py │ │ ├── utils.py │ │ └── workload.py │ ├── logical_repl/ │ │ ├── README.md │ │ ├── clickhouse/ │ │ │ └── docker-compose.yml │ │ ├── debezium/ │ │ │ └── docker-compose.yml │ │ ├── test_clickhouse.py │ │ └── test_debezium.py │ ├── performance/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── benchbase_tpc_c_helpers/ │ │ │ ├── generate_diagrams.py │ │ │ ├── generate_workload_size.py │ │ │ └── upload_results_to_perf_test_results.py │ │ ├── large_synthetic_oltp/ │ │ │ ├── IUD_one_transaction.sql │ │ │ ├── grow_action_blocks.sql │ │ │ ├── grow_action_kwargs.sql │ │ │ ├── grow_device_fingerprint_event.sql │ │ │ ├── grow_edges.sql │ │ │ ├── grow_hotel_rate_mapping.sql │ │ │ ├── grow_ocr_pipeline_results_version.sql │ │ │ ├── grow_priceline_raw_response.sql │ │ │ ├── grow_relabled_transactions.sql │ │ │ ├── grow_state_values.sql │ │ │ ├── grow_values.sql │ │ │ ├── grow_vertices.sql │ │ │ ├── insert_webhooks.sql │ │ │ ├── select_any_webhook_with_skew.sql │ │ │ ├── select_prefetch_webhook.sql │ │ │ ├── select_recent_webhook.sql │ │ │ ├── update_accounting_coding_body_tracking_category_selection.sql │ │ │ ├── update_action_blocks.sql │ │ │ ├── update_action_kwargs.sql │ │ │ ├── update_denormalized_approval_workflow.sql │ │ │ ├── update_device_fingerprint_event.sql │ │ │ ├── update_edges.sql │ │ │ ├── update_heron_transaction_enriched_log.sql │ │ │ ├── update_heron_transaction_enrichment_requests.sql │ │ │ ├── update_hotel_rate_mapping.sql │ │ │ ├── update_incoming_webhooks.sql │ │ │ ├── update_manual_transaction.sql │ │ │ ├── update_ml_receipt_matching_log.sql │ │ │ ├── update_ocr_pipeine_results_version.sql │ │ │ ├── update_orc_pipeline_step_results.sql │ │ │ ├── update_orc_pipeline_step_results_version.sql │ │ │ ├── 
update_priceline_raw_response.sql │ │ │ ├── update_quickbooks_transactions.sql │ │ │ ├── update_raw_finicity_transaction.sql │ │ │ ├── update_relabeled_transactions.sql │ │ │ ├── update_state_values.sql │ │ │ ├── update_stripe_authorization_event_log.sql │ │ │ ├── update_transaction.sql │ │ │ ├── update_values.sql │ │ │ └── update_vertices.sql │ │ ├── many_relations/ │ │ │ └── create_many_relations.sql │ │ ├── out_dir_to_csv.py │ │ ├── pageserver/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── interactive/ │ │ │ │ ├── __init__.py │ │ │ │ └── test_many_small_tenants.py │ │ │ ├── pagebench/ │ │ │ │ ├── __init__.py │ │ │ │ ├── test_large_slru_basebackup.py │ │ │ │ ├── test_ondemand_download_churn.py │ │ │ │ └── test_pageserver_max_throughput_getpage_at_latest_lsn.py │ │ │ ├── test_page_service_batching.py │ │ │ └── util.py │ │ ├── pgvector/ │ │ │ ├── HNSW_build.sql │ │ │ ├── IVFFLAT_build.sql │ │ │ ├── README.md │ │ │ ├── halfvec_build.sql │ │ │ ├── loaddata.py │ │ │ ├── pgbench_custom_script_pgvector_halfvec_queries.sql │ │ │ └── pgbench_custom_script_pgvector_hsnw_queries.sql │ │ ├── test_branch_creation.py │ │ ├── test_branching.py │ │ ├── test_bulk_insert.py │ │ ├── test_bulk_tenant_create.py │ │ ├── test_bulk_update.py │ │ ├── test_compaction.py │ │ ├── test_compare_pg_stats.py │ │ ├── test_compute_ctl_api.py │ │ ├── test_compute_startup.py │ │ ├── test_copy.py │ │ ├── test_cumulative_statistics_persistence.py │ │ ├── test_dup_key.py │ │ ├── test_gc_feedback.py │ │ ├── test_gist_build.py │ │ ├── test_hot_page.py │ │ ├── test_hot_table.py │ │ ├── test_ingest_insert_bulk.py │ │ ├── test_ingest_logical_message.py │ │ ├── test_latency.py │ │ ├── test_layer_map.py │ │ ├── test_lfc_prewarm.py │ │ ├── test_logical_replication.py │ │ ├── test_parallel_copy.py │ │ ├── test_parallel_copy_to.py │ │ ├── test_perf_ingest_using_pgcopydb.py │ │ ├── test_perf_many_relations.py │ │ ├── test_perf_olap.py │ │ ├── test_perf_oltp_large_tenant.py │ │ ├── test_perf_pgbench.py │ │ 
├── test_perf_pgvector_queries.py │ │ ├── test_physical_replication.py │ │ ├── test_random_writes.py │ │ ├── test_seqscans.py │ │ ├── test_sharded_ingest.py │ │ ├── test_sharding_autosplit.py │ │ ├── test_storage_controller_scale.py │ │ ├── test_wal_backpressure.py │ │ ├── test_write_amplification.py │ │ └── tpc-h/ │ │ ├── create-indexes.sql │ │ ├── create-schema.sql │ │ └── queries/ │ │ ├── 1.sql │ │ ├── 10.sql │ │ ├── 11.sql │ │ ├── 12.sql │ │ ├── 13.sql │ │ ├── 14.sql │ │ ├── 15.sql │ │ ├── 16.sql │ │ ├── 17.sql │ │ ├── 18.sql │ │ ├── 19.sql │ │ ├── 2.sql │ │ ├── 20.sql │ │ ├── 21.sql │ │ ├── 22.sql │ │ ├── 3.sql │ │ ├── 4.sql │ │ ├── 5.sql │ │ ├── 6.sql │ │ ├── 7.sql │ │ ├── 8.sql │ │ └── 9.sql │ ├── pg_clients/ │ │ ├── README.md │ │ ├── csharp/ │ │ │ └── npgsql/ │ │ │ ├── .dockerignore │ │ │ ├── .gitignore │ │ │ ├── Dockerfile │ │ │ ├── Program.cs │ │ │ └── csharp-npgsql.csproj │ │ ├── java/ │ │ │ └── jdbc/ │ │ │ ├── Dockerfile │ │ │ └── Example.java │ │ ├── python/ │ │ │ ├── asyncpg/ │ │ │ │ ├── Dockerfile │ │ │ │ ├── asyncpg_example.py │ │ │ │ └── requirements.txt │ │ │ └── pg8000/ │ │ │ ├── Dockerfile │ │ │ ├── pg8000_example.py │ │ │ └── requirements.txt │ │ ├── rust/ │ │ │ └── tokio-postgres/ │ │ │ ├── .dockerignore │ │ │ ├── .gitignore │ │ │ ├── Cargo.toml │ │ │ ├── Dockerfile │ │ │ └── src/ │ │ │ └── main.rs │ │ ├── swift/ │ │ │ ├── PostgresClientKitExample/ │ │ │ │ ├── .dockerignore │ │ │ │ ├── .gitignore │ │ │ │ ├── Dockerfile │ │ │ │ ├── Package.resolved │ │ │ │ ├── Package.swift │ │ │ │ └── Sources/ │ │ │ │ └── PostgresClientKitExample/ │ │ │ │ └── main.swift │ │ │ └── PostgresNIOExample/ │ │ │ ├── .dockerignore │ │ │ ├── .gitignore │ │ │ ├── Dockerfile │ │ │ ├── Package.resolved │ │ │ ├── Package.swift │ │ │ └── Sources/ │ │ │ └── PostgresNIOExample/ │ │ │ └── main.swift │ │ ├── test_pg_clients.py │ │ └── typescript/ │ │ ├── postgresql-client/ │ │ │ ├── .dockerignore │ │ │ ├── .gitignore │ │ │ ├── Dockerfile │ │ │ ├── index.js │ │ │ └── 
package.json │ │ └── serverless-driver/ │ │ ├── .dockerignore │ │ ├── .gitignore │ │ ├── Dockerfile │ │ ├── index.js │ │ └── package.json │ ├── random_ops/ │ │ ├── README.md │ │ └── test_random_ops.py │ ├── regress/ │ │ ├── data/ │ │ │ ├── test_event_trigger_extension/ │ │ │ │ ├── test_event_trigger_extension--1.0.sql │ │ │ │ └── test_event_trigger_extension.control │ │ │ ├── test_remote_extensions/ │ │ │ │ ├── test_extension_sql_only/ │ │ │ │ │ ├── sql/ │ │ │ │ │ │ ├── test_extension_sql_only--1.0--1.1.sql │ │ │ │ │ │ └── test_extension_sql_only--1.0.sql │ │ │ │ │ └── test_extension_sql_only.control │ │ │ │ └── test_extension_with_lib/ │ │ │ │ ├── sql/ │ │ │ │ │ ├── test_extension_with_lib--1.0--1.1.sql │ │ │ │ │ └── test_extension_with_lib--1.0.sql │ │ │ │ ├── src/ │ │ │ │ │ └── test_extension_with_lib.c │ │ │ │ └── test_extension_with_lib.control │ │ │ └── test_signed_char.out │ │ ├── test_ancestor_branch.py │ │ ├── test_attach_tenant_config.py │ │ ├── test_auth.py │ │ ├── test_auth_broker.py │ │ ├── test_backpressure.py │ │ ├── test_bad_connection.py │ │ ├── test_basebackup.py │ │ ├── test_basebackup_error.py │ │ ├── test_branch_and_gc.py │ │ ├── test_branch_behind.py │ │ ├── test_branching.py │ │ ├── test_broken_timeline.py │ │ ├── test_build_info_metric.py │ │ ├── test_change_pageserver.py │ │ ├── test_clog_truncate.py │ │ ├── test_close_fds.py │ │ ├── test_combocid.py │ │ ├── test_communicator_metrics_exporter.py │ │ ├── test_compaction.py │ │ ├── test_compatibility.py │ │ ├── test_compute_catalog.py │ │ ├── test_compute_http.py │ │ ├── test_compute_locales.py │ │ ├── test_compute_metrics.py │ │ ├── test_compute_migrations.py │ │ ├── test_compute_monitor.py │ │ ├── test_compute_reconfigure.py │ │ ├── test_compute_termination.py │ │ ├── test_config.py │ │ ├── test_crafted_wal_end.py │ │ ├── test_createdropdb.py │ │ ├── test_createuser.py │ │ ├── test_ddl_forwarding.py │ │ ├── test_disk_usage_eviction.py │ │ ├── test_download_extensions.py │ │ ├── 
test_endpoint_crash.py │ │ ├── test_endpoint_storage.py │ │ ├── test_event_trigger_extension.py │ │ ├── test_explain_with_lfc_stats.py │ │ ├── test_extensions.py │ │ ├── test_feature_flag.py │ │ ├── test_fsm_truncate.py │ │ ├── test_fullbackup.py │ │ ├── test_gc_aggressive.py │ │ ├── test_gin_redo.py │ │ ├── test_gist.py │ │ ├── test_hadron_ps_connectivity_metrics.py │ │ ├── test_hcc_handling_ps_data_loss.py │ │ ├── test_hot_standby.py │ │ ├── test_import.py │ │ ├── test_import_pgdata.py │ │ ├── test_ingestion_layer_size.py │ │ ├── test_large_schema.py │ │ ├── test_layer_bloating.py │ │ ├── test_layer_eviction.py │ │ ├── test_layer_writers_fail.py │ │ ├── test_layers_from_future.py │ │ ├── test_lfc_prefetch.py │ │ ├── test_lfc_prewarm.py │ │ ├── test_lfc_resize.py │ │ ├── test_lfc_working_set_approximation.py │ │ ├── test_local_file_cache.py │ │ ├── test_logging.py │ │ ├── test_logical_replication.py │ │ ├── test_lsn_mapping.py │ │ ├── test_multixact.py │ │ ├── test_nbtree_pagesplit_cycleid.py │ │ ├── test_neon_cli.py │ │ ├── test_neon_extension.py │ │ ├── test_neon_local_cli.py │ │ ├── test_neon_superuser.py │ │ ├── test_next_xid.py │ │ ├── test_normal_work.py │ │ ├── test_oid_overflow.py │ │ ├── test_old_request_lsn.py │ │ ├── test_ondemand_download.py │ │ ├── test_ondemand_slru_download.py │ │ ├── test_ondemand_wal_download.py │ │ ├── test_page_service_batching_regressions.py │ │ ├── test_pageserver_api.py │ │ ├── test_pageserver_catchup.py │ │ ├── test_pageserver_config.py │ │ ├── test_pageserver_crash_consistency.py │ │ ├── test_pageserver_generations.py │ │ ├── test_pageserver_getpage_throttle.py │ │ ├── test_pageserver_layer_rolling.py │ │ ├── test_pageserver_metric_collection.py │ │ ├── test_pageserver_reconnect.py │ │ ├── test_pageserver_restart.py │ │ ├── test_pageserver_restarts_under_workload.py │ │ ├── test_pageserver_secondary.py │ │ ├── test_pg_query_cancellation.py │ │ ├── test_pg_regress.py │ │ ├── test_pg_waldump.py │ │ ├── test_pgstat.py │ │ ├── 
test_physical_and_logical_replicaiton.py │ │ ├── test_physical_replication.py │ │ ├── test_pitr_gc.py │ │ ├── test_postgres_version.py │ │ ├── test_prefetch_buffer_resize.py │ │ ├── test_proxy.py │ │ ├── test_proxy_allowed_ips.py │ │ ├── test_proxy_metric_collection.py │ │ ├── test_proxy_websockets.py │ │ ├── test_read_validation.py │ │ ├── test_readonly_node.py │ │ ├── test_recovery.py │ │ ├── test_relations.py │ │ ├── test_remote_storage.py │ │ ├── test_replica_promotes.py │ │ ├── test_replica_start.py │ │ ├── test_rest_broker.py │ │ ├── test_role_grants.py │ │ ├── test_s3_restore.py │ │ ├── test_safekeeper_deletion.py │ │ ├── test_safekeeper_migration.py │ │ ├── test_setup.py │ │ ├── test_sharding.py │ │ ├── test_signed_char.py │ │ ├── test_sni_router.py │ │ ├── test_ssl.py │ │ ├── test_storage_controller.py │ │ ├── test_storage_scrubber.py │ │ ├── test_subscriber_branching.py │ │ ├── test_subscriber_restart.py │ │ ├── test_subxacts.py │ │ ├── test_tenant_conf.py │ │ ├── test_tenant_delete.py │ │ ├── test_tenant_detach.py │ │ ├── test_tenant_relocation.py │ │ ├── test_tenant_size.py │ │ ├── test_tenant_tasks.py │ │ ├── test_tenants.py │ │ ├── test_tenants_with_remote_storage.py │ │ ├── test_threshold_based_eviction.py │ │ ├── test_timeline_archive.py │ │ ├── test_timeline_delete.py │ │ ├── test_timeline_detach_ancestor.py │ │ ├── test_timeline_gc_blocking.py │ │ ├── test_timeline_size.py │ │ ├── test_truncate.py │ │ ├── test_twophase.py │ │ ├── test_unlogged.py │ │ ├── test_unstable_extensions.py │ │ ├── test_vm_bits.py │ │ ├── test_vm_truncate.py │ │ ├── test_wal_acceptor.py │ │ ├── test_wal_acceptor_async.py │ │ ├── test_wal_receiver.py │ │ ├── test_wal_restore.py │ │ └── test_walredo_not_left_behind_on_detach.py │ ├── sql_regress/ │ │ ├── .gitignore │ │ ├── README.md │ │ ├── expected/ │ │ │ ├── .gitignore │ │ │ ├── neon-cid.out │ │ │ ├── neon-clog.out │ │ │ ├── neon-event-triggers.out │ │ │ ├── neon-rel-truncate.out │ │ │ ├── neon-spgist.out │ │ │ ├── 
neon-subxacts.out │ │ │ ├── neon-test-utils.out │ │ │ └── neon-vacuum-full.out │ │ ├── parallel_schedule │ │ └── sql/ │ │ ├── .gitignore │ │ ├── neon-cid.sql │ │ ├── neon-clog.sql │ │ ├── neon-event-triggers.sql │ │ ├── neon-rel-truncate.sql │ │ ├── neon-spgist.sql │ │ ├── neon-subxacts.sql │ │ ├── neon-test-utils.sql │ │ └── neon-vacuum-full.sql │ ├── stubs/ │ │ └── h2/ │ │ ├── README.md │ │ ├── __init__.pyi │ │ ├── config.pyi │ │ ├── connection.pyi │ │ ├── errors.pyi │ │ ├── events.pyi │ │ ├── exceptions.pyi │ │ ├── frame_buffer.pyi │ │ ├── settings.pyi │ │ ├── stream.pyi │ │ ├── utilities.pyi │ │ └── windows.pyi │ ├── test_broken.py │ └── websocket_tunnel.py ├── vendor/ │ └── revisions.json └── workspace_hack/ ├── .gitattributes ├── Cargo.toml ├── build.rs └── src/ └── lib.rs ================================================ FILE CONTENTS ================================================ ================================================ FILE: .cargo/config.toml ================================================ [build] # This is only present for local builds, as it will be overridden # by the RUSTDOCFLAGS env var in CI. rustdocflags = ["-Arustdoc::private_intra_doc_links"] # Enable frame pointers. This may have a minor performance overhead, but makes it easier and more # efficient to obtain stack traces (and thus CPU/heap profiles). It may also avoid seg faults that # we've seen with libunwind-based profiling. See also: # # * # * # # NB: the RUSTFLAGS envvar will replace this. Make sure to update e.g. Dockerfile as well. rustflags = ["-Cforce-frame-pointers=yes"] [alias] build_testing = ["build", "--features", "testing"] neon = ["run", "--bin", "neon_local"] ================================================ FILE: .config/hakari.toml ================================================ # This file contains settings for `cargo hakari`. # See https://docs.rs/cargo-hakari/latest/cargo_hakari/config for a full list of options. 
hakari-package = "workspace_hack" # Format for `workspace-hack = ...` lines in other Cargo.tomls. Requires cargo-hakari 0.9.8 or above. dep-format-version = "4" # Setting workspace.resolver = "2" in the root Cargo.toml is HIGHLY recommended. # Hakari works much better with the new feature resolver. # For more about the new feature resolver, see: # https://blog.rust-lang.org/2021/03/25/Rust-1.51.0.html#cargos-new-feature-resolver # Have to keep the resolver still here since hakari requires this field, # even though it is now the default for 2021 edition & cargo. resolver = "2" # Add triples corresponding to platforms commonly used by developers here. # https://doc.rust-lang.org/rustc/platform-support.html platforms = [ # "x86_64-unknown-linux-gnu", # "x86_64-apple-darwin", # "x86_64-pc-windows-msvc", ] [final-excludes] workspace-members = [ # vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but # it is built primarily in a separate repo neondatabase/autoscaling and thus is excluded # from depending on workspace-hack because most of the dependencies are not used. "vm_monitor", # subzero-core is a stub crate that should be excluded from workspace-hack "subzero-core", # All of these exist in libs and are not usually built independently. # Putting workspace hack there adds a bottleneck for cargo builds. "compute_api", "consumption_metrics", "desim", "json", "metrics", "pageserver_api", "postgres_backend", "postgres_connection", "postgres_ffi", "pq_proto", "remote_storage", "safekeeper_api", "tenant_size_model", "tracing-utils", "utils", "wal_craft", "walproposer", "postgres-protocol2", "postgres-types2", "tokio-postgres2", ] # Write out exact versions rather than a semver range. (Defaults to false.) 
# exact-versions = true ================================================ FILE: .config/nextest.toml ================================================ [profile.default] slow-timeout = { period = "60s", terminate-after = 3 } ================================================ FILE: .dockerignore ================================================ * # Files !Cargo.lock !Cargo.toml !Makefile !postgres.mk !rust-toolchain.toml !scripts/ninstall.sh !docker-compose/run-tests.sh # Directories !.cargo/ !.config/ !compute/ !compute_tools/ !control_plane/ !docker-compose/ext-src !libs/ !pageserver/ !pgxn/ !proxy/ !endpoint_storage/ !storage_scrubber/ !safekeeper/ !storage_broker/ !storage_controller/ !vendor/postgres-*/ !workspace_hack/ !build-tools/patches ================================================ FILE: .git-blame-ignore-revs ================================================ 4c2bb43775947775401cbb9d774823c5723a91f8 ================================================ FILE: .gitattributes ================================================ # allows for nicer hunk headers with git show *.rs diff=rust ================================================ FILE: .github/ISSUE_TEMPLATE/bug-template.md ================================================ --- name: Bug Template about: Used for describing bugs title: '' labels: t/bug type: Bug assignees: '' --- ## Steps to reproduce ## Expected result ## Actual result ## Environment ## Logs, links - ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: true contact_links: - name: Feature request url: https://console.neon.tech/app/projects?modal=feedback about: For feature requests in the Neon product, please submit via the feedback form on `https://console.neon.tech` ================================================ FILE: .github/ISSUE_TEMPLATE/epic-template.md ================================================ --- name: Epic Template about: A set of 
related tasks contributing towards specific outcome, comprising of more than 1 week of work. title: 'Epic: ' labels: t/Epic type: Epic assignees: '' --- ## Motivation ## DoD ## Implementation ideas ## Tasks ```[tasklist] - [ ] Example Task ``` ## Other related tasks and Epics - ================================================ FILE: .github/actionlint.yml ================================================ self-hosted-runner: labels: - arm64 - large - large-arm64 - small - small-metal - small-arm64 - unit-perf - unit-perf-aws-arm - us-east-2 config-variables: - AWS_ECR_REGION - AZURE_DEV_CLIENT_ID - AZURE_DEV_REGISTRY_NAME - AZURE_DEV_SUBSCRIPTION_ID - AZURE_PROD_CLIENT_ID - AZURE_PROD_REGISTRY_NAME - AZURE_PROD_SUBSCRIPTION_ID - AZURE_TENANT_ID - BENCHMARK_INGEST_TARGET_PROJECTID - BENCHMARK_LARGE_OLTP_PROJECTID - BENCHMARK_PROJECT_ID_PUB - BENCHMARK_PROJECT_ID_SUB - DEV_AWS_OIDC_ROLE_ARN - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN - HETZNER_CACHE_BUCKET - HETZNER_CACHE_ENDPOINT - HETZNER_CACHE_REGION - NEON_DEV_AWS_ACCOUNT_ID - NEON_PROD_AWS_ACCOUNT_ID - PGREGRESS_PG16_PROJECT_ID - PGREGRESS_PG17_PROJECT_ID - PREWARM_PROJECT_ID - REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_REGION - SLACK_CICD_CHANNEL_ID - SLACK_COMPUTE_CHANNEL_ID - SLACK_ON_CALL_DEVPROD_STREAM - SLACK_ON_CALL_QA_STAGING_STREAM - SLACK_ON_CALL_STORAGE_STAGING_STREAM - SLACK_ONCALL_COMPUTE_GROUP - SLACK_ONCALL_PROXY_GROUP - SLACK_ONCALL_STORAGE_GROUP - SLACK_PROXY_CHANNEL_ID - SLACK_RUST_CHANNEL_ID - SLACK_STORAGE_CHANNEL_ID - SLACK_UPCOMING_RELEASE_CHANNEL_ID ================================================ FILE: .github/actions/allure-report-generate/action.yml ================================================ name: 'Create Allure report' description: 'Generate Allure report from uploaded by actions/allure-report-store tests results' inputs: store-test-results-into-db: description: 'Whether to store test results into the database. 
TEST_RESULT_CONNSTR/TEST_RESULT_CONNSTR_NEW should be set' type: boolean required: false default: false aws-oidc-role-arn: description: 'OIDC role arn to interact with S3' required: true outputs: base-url: description: 'Base URL for Allure report' value: ${{ steps.generate-report.outputs.base-url }} base-s3-url: description: 'Base S3 URL for Allure report' value: ${{ steps.generate-report.outputs.base-s3-url }} report-url: description: 'Allure report URL' value: ${{ steps.generate-report.outputs.report-url }} report-json-url: description: 'Allure report JSON URL' value: ${{ steps.generate-report.outputs.report-json-url }} runs: using: "composite" steps: # We're using some of the env variables quite often, so let's set them once. # # It would be nice to have them set in common runs.env[0] section, but it doesn't work[1] # # - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv # - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456 # - name: Set variables shell: bash -euxo pipefail {0} env: PR_NUMBER: ${{ github.event.pull_request.number }} BUCKET: neon-github-public-dev run: | if [ -n "${PR_NUMBER}" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \ [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then # Shortcut for special branches BRANCH_OR_PR=${GITHUB_REF_NAME} else BRANCH_OR_PR=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-") fi LOCK_FILE=reports/${BRANCH_OR_PR}/lock.txt WORKDIR=/tmp/${BRANCH_OR_PR}-$(date +%s) mkdir -p ${WORKDIR} echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV echo "LOCK_FILE=${LOCK_FILE}" >> $GITHUB_ENV echo "WORKDIR=${WORKDIR}" >> $GITHUB_ENV echo "BUCKET=${BUCKET}" >> $GITHUB_ENV # TODO: We can replace with a special docker image with Java and Allure pre-installed - uses: actions/setup-java@v4 with: distribution: 'temurin' java-version: '17' - 
name: Install Allure shell: bash -euxo pipefail {0} working-directory: /tmp run: | if ! which allure; then ALLURE_ZIP=allure-${ALLURE_VERSION}.zip wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP} echo "${ALLURE_ZIP_SHA256} ${ALLURE_ZIP}" | sha256sum --check unzip -q ${ALLURE_ZIP} echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH rm -f ${ALLURE_ZIP} fi env: ALLURE_VERSION: 2.32.2 ALLURE_ZIP_SHA256: 3f28885e2118f6317c92f667eaddcc6491400af1fb9773c1f3797a5fa5174953 - uses: aws-actions/configure-aws-credentials@v4 if: ${{ !cancelled() }} with: aws-region: eu-central-1 role-to-assume: ${{ inputs.aws-oidc-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this - name: Acquire lock shell: bash -euxo pipefail {0} run: | LOCK_TIMEOUT=300 # seconds LOCK_CONTENT="${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" echo ${LOCK_CONTENT} > ${WORKDIR}/lock.txt # Do it up to 5 times to avoid race condition for _ in $(seq 1 5); do for i in $(seq 1 ${LOCK_TIMEOUT}); do LOCK_ACQUIRED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true) # `date --date="..."` is supported only by gnu date (i.e. 
it doesn't work on BSD/macOS) if [ -z "${LOCK_ACQUIRED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ACQUIRED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then break fi sleep 1 done aws s3 mv --only-show-errors ${WORKDIR}/lock.txt "s3://${BUCKET}/${LOCK_FILE}" # Double-check that exactly THIS run has acquired the lock aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt if [ "$(cat lock.txt)" = "${LOCK_CONTENT}" ]; then break fi done - name: Generate and publish final Allure report id: generate-report shell: bash -euxo pipefail {0} run: | REPORT_PREFIX=reports/${BRANCH_OR_PR} RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID} BASE_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID} BASE_S3_URL=s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID} REPORT_URL=${BASE_URL}/index.html REPORT_JSON_URL=${BASE_URL}/data/suites.json # Get previously uploaded data for this run ZSTD_NBTHREADS=0 S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output '.Contents[]?.Key') if [ -z "$S3_FILEPATHS" ]; then # There's no previously uploaded data for this $GITHUB_RUN_ID exit 0 fi time aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${RAW_PREFIX}/" "${WORKDIR}/" for archive in $(find ${WORKDIR} -name "*.tar.zst"); do mkdir -p ${archive%.tar.zst} time tar -xf ${archive} -C ${archive%.tar.zst} rm -f ${archive} done # Get history trend time aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${WORKDIR}/latest/history" || true # Generate report time allure generate --clean --output ${WORKDIR}/report ${WORKDIR}/* # Replace a logo link with a redirect to the latest version of the report sed -i 's| ${WORKDIR}/index.html Redirecting to ${REPORT_URL} EOF time aws s3 cp --only-show-errors ${WORKDIR}/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html" echo "base-url=${BASE_URL}" >> $GITHUB_OUTPUT echo "base-s3-url=${BASE_S3_URL}" >> $GITHUB_OUTPUT echo 
"report-url=${REPORT_URL}" >> $GITHUB_OUTPUT echo "report-json-url=${REPORT_JSON_URL}" >> $GITHUB_OUTPUT echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY} - name: Release lock if: always() shell: bash -euxo pipefail {0} run: | aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0 if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" ]; then aws s3 rm "s3://${BUCKET}/${LOCK_FILE}" fi - name: Cache poetry deps uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Store Allure test stat in the DB (new) if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }} shell: bash -euxo pipefail {0} env: COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }} run: | if [ ! -d "${WORKDIR}/report/data/test-cases" ]; then exit 0 fi export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW} ./scripts/pysync poetry run python3 scripts/ingest_regress_test_result-new-format.py \ --reference ${GITHUB_REF} \ --revision ${COMMIT_SHA} \ --run-id ${GITHUB_RUN_ID} \ --run-attempt ${GITHUB_RUN_ATTEMPT} \ --test-cases-dir ${WORKDIR}/report/data/test-cases - name: Cleanup if: always() shell: bash -euxo pipefail {0} run: | if [ -d "${WORKDIR}" ]; then rm -rf ${WORKDIR} fi - uses: actions/github-script@v7 if: always() env: REPORT_URL: ${{ steps.generate-report.outputs.report-url }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 script: | const { REPORT_URL, COMMIT_SHA } = process.env await github.rest.repos.createCommitStatus({ owner: context.repo.owner, repo: context.repo.repo, sha: `${COMMIT_SHA}`, state: 'success', target_url: `${REPORT_URL}`, context: 'Allure report', }) ================================================ 
FILE: .github/actions/allure-report-store/action.yml ================================================ name: 'Store Allure results' description: 'Upload test results to be used by actions/allure-report-generate' inputs: report-dir: description: 'directory with test results generated by tests' required: true unique-key: description: 'string to distinguish different results in the same run' required: true aws-oidc-role-arn: description: 'OIDC role arn to interract with S3' required: true runs: using: "composite" steps: - name: Set variables shell: bash -euxo pipefail {0} env: PR_NUMBER: ${{ github.event.pull_request.number }} REPORT_DIR: ${{ inputs.report-dir }} run: | if [ -n "${PR_NUMBER}" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \ [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then # Shortcut for special branches BRANCH_OR_PR=${GITHUB_REF_NAME} else BRANCH_OR_PR=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-") fi echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV echo "REPORT_DIR=${REPORT_DIR}" >> $GITHUB_ENV - uses: aws-actions/configure-aws-credentials@v4 if: ${{ !cancelled() }} with: aws-region: eu-central-1 role-to-assume: ${{ inputs.aws-oidc-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report - name: Upload test results shell: bash -euxo pipefail {0} run: | REPORT_PREFIX=reports/${BRANCH_OR_PR} RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID} # Add metadata cat < ${REPORT_DIR}/executor.json { "name": "GitHub Actions", "type": "github", "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html", "buildOrder": ${GITHUB_RUN_ID}, "buildName": "GitHub Actions Run #${GITHUB_RUN_NUMBER}/${GITHUB_RUN_ATTEMPT}", "buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}", "reportUrl": 
"https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html", "reportName": "Allure Report" } EOF cat < ${REPORT_DIR}/environment.properties COMMIT_SHA=${COMMIT_SHA} EOF ARCHIVE="${UNIQUE_KEY}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst" ZSTD_NBTHREADS=0 time tar -C ${REPORT_DIR} -cf ${ARCHIVE} --zstd . time aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}" env: UNIQUE_KEY: ${{ inputs.unique-key }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} BUCKET: neon-github-public-dev - name: Cleanup if: always() shell: bash -euxo pipefail {0} run: | rm -rf ${REPORT_DIR} ================================================ FILE: .github/actions/download/action.yml ================================================ name: "Download an artifact" description: "Custom download action" inputs: name: description: "Artifact name" required: true path: description: "A directory to put artifact into" default: "." required: false skip-if-does-not-exist: description: "Allow to skip if file doesn't exist, fail otherwise" default: false required: false prefix: description: "S3 prefix. 
Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false aws-oidc-role-arn: description: 'OIDC role arn to interact with S3' required: true runs: using: "composite" steps: - uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 role-to-assume: ${{ inputs.aws-oidc-role-arn }} role-duration-seconds: 3600 - name: Download artifact id: download-artifact shell: bash -euxo pipefail {0} env: TARGET: ${{ inputs.path }} ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }} PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}/{2}', github.event.pull_request.head.sha || github.sha, github.run_id, github.run_attempt) }} run: | BUCKET=neon-github-public-dev FILENAME=$(basename $ARCHIVE) S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) if [ -z "${S3_KEY}" ]; then if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then echo 'SKIPPED=true' >> $GITHUB_OUTPUT exit 0 else echo >&2 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist" exit 1 fi fi echo 'SKIPPED=false' >> $GITHUB_OUTPUT mkdir -p $(dirname $ARCHIVE) time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} ${ARCHIVE} - name: Extract artifact if: ${{ steps.download-artifact.outputs.SKIPPED == 'false' }} shell: bash -euxo pipefail {0} env: TARGET: ${{ inputs.path }} ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst run: | mkdir -p ${TARGET} time tar -xf ${ARCHIVE} -C ${TARGET} rm -f ${ARCHIVE} ================================================ FILE: .github/actions/neon-branch-create/action.yml ================================================ name: 'Create Branch' description: 'Create Branch using API' inputs: api_key: description: 'Neon API key' required: true project_id: description: 'ID of the Project to create Branch in' required: true api_host: description: 
'Neon API host' default: console-stage.neon.build outputs: dsn: description: 'Created Branch DSN (for main database)' value: ${{ steps.change-password.outputs.dsn }} branch_id: description: 'Created Branch ID' value: ${{ steps.create-branch.outputs.branch_id }} runs: using: "composite" steps: - name: Create New Branch id: create-branch shell: bash -euxo pipefail {0} run: | for i in $(seq 1 10); do branch=$(curl \ "https://${API_HOST}/api/v2/projects/${PROJECT_ID}/branches" \ --header "Accept: application/json" \ --header "Content-Type: application/json" \ --header "Authorization: Bearer ${API_KEY}" \ --data "{ \"branch\": { \"name\": \"Created by actions/neon-branch-create; GITHUB_RUN_ID=${GITHUB_RUN_ID} at $(date +%s)\" }, \"endpoints\": [ { \"type\": \"read_write\" } ] }") if [ -z "${branch}" ]; then sleep 1 continue fi branch_id=$(echo $branch | jq --raw-output '.branch.id') if [ "${branch_id}" == "null" ]; then sleep 1 continue fi break done if [ -z "${branch_id}" ] || [ "${branch_id}" == "null" ]; then echo >&2 "Failed to create branch after 10 attempts, the latest response was: ${branch}" exit 1 fi branch_id=$(echo $branch | jq --raw-output '.branch.id') echo "branch_id=${branch_id}" >> $GITHUB_OUTPUT host=$(echo $branch | jq --raw-output '.endpoints[0].host') echo "host=${host}" >> $GITHUB_OUTPUT env: API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} PROJECT_ID: ${{ inputs.project_id }} - name: Get Role name id: role-name shell: bash -euxo pipefail {0} run: | roles=$(curl \ "https://${API_HOST}/api/v2/projects/${PROJECT_ID}/branches/${BRANCH_ID}/roles" \ --fail \ --header "Accept: application/json" \ --header "Content-Type: application/json" \ --header "Authorization: Bearer ${API_KEY}" ) role_name=$(echo "$roles" | jq --raw-output ' (.roles | map(select(.protected == false))) as $roles | if any($roles[]; .name == "neondb_owner") then "neondb_owner" else $roles[0].name end ') echo "role_name=${role_name}" >> $GITHUB_OUTPUT env: API_HOST: ${{ 
inputs.api_host }} API_KEY: ${{ inputs.api_key }} PROJECT_ID: ${{ inputs.project_id }} BRANCH_ID: ${{ steps.create-branch.outputs.branch_id }} - name: Change Password id: change-password # A shell without `set -x` so as not to expose password/dsn in logs shell: bash -euo pipefail {0} run: | for i in $(seq 1 10); do reset_password=$(curl \ "https://${API_HOST}/api/v2/projects/${PROJECT_ID}/branches/${BRANCH_ID}/roles/${ROLE_NAME}/reset_password" \ --request POST \ --header "Accept: application/json" \ --header "Content-Type: application/json" \ --header "Authorization: Bearer ${API_KEY}" ) if [ -z "${reset_password}" ]; then sleep $i continue fi password=$(echo $reset_password | jq --raw-output '.role.password') if [ "${password}" == "null" ]; then sleep $i # increasing backoff continue fi echo "::add-mask::${password}" break done if [ -z "${password}" ] || [ "${password}" == "null" ]; then echo >&2 "Failed to reset password after 10 attempts, the latest response was: ${reset_password}" exit 1 fi dsn="postgres://${ROLE_NAME}:${password}@${HOST}/neondb" echo "::add-mask::${dsn}" echo "dsn=${dsn}" >> $GITHUB_OUTPUT env: API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} PROJECT_ID: ${{ inputs.project_id }} BRANCH_ID: ${{ steps.create-branch.outputs.branch_id }} ROLE_NAME: ${{ steps.role-name.outputs.role_name }} HOST: ${{ steps.create-branch.outputs.host }} ================================================ FILE: .github/actions/neon-branch-delete/action.yml ================================================ name: 'Delete Branch' description: 'Delete Branch using API' inputs: api_key: description: 'Neon API key' required: true project_id: description: 'ID of the Project that contains the Branch to delete' required: true branch_id: description: 'ID of the branch to delete' required: true api_host: description: 'Neon API host' default: console-stage.neon.build runs: using: "composite" steps: - name: Delete Branch # Do not try to delete a branch if 
.github/actions/neon-project-create # or .github/actions/neon-branch-create failed before if: ${{ inputs.project_id != '' && inputs.branch_id != '' }} shell: bash -euxo pipefail {0} run: | for i in $(seq 1 10); do deleted_branch=$(curl \ "https://${API_HOST}/api/v2/projects/${PROJECT_ID}/branches/${BRANCH_ID}" \ --request DELETE \ --header "Accept: application/json" \ --header "Content-Type: application/json" \ --header "Authorization: Bearer ${API_KEY}" ) if [ -z "${deleted_branch}" ]; then sleep 1 continue fi branch_id=$(echo $deleted_branch | jq --raw-output '.branch.id') if [ "${branch_id}" == "null" ]; then sleep 1 continue fi break done if [ -z "${branch_id}" ] || [ "${branch_id}" == "null" ]; then echo >&2 "Failed to delete branch after 10 attempts, the latest response was: ${deleted_branch}" exit 1 fi env: API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} PROJECT_ID: ${{ inputs.project_id }} BRANCH_ID: ${{ inputs.branch_id }} ================================================ FILE: .github/actions/neon-project-create/action.yml ================================================ name: 'Create Neon Project' description: 'Create Neon Project using API' inputs: api_key: description: 'Neon API key' required: true region_id: description: 'Region ID, if not set the project will be created in the default region' default: aws-us-east-2 postgres_version: description: 'Postgres version; default is 16' default: '16' api_host: description: 'Neon API host' default: console-stage.neon.build compute_units: description: '[Min, Max] compute units' default: '[1, 1]' # settings below only needed if you want the project to be sharded from the beginning shard_split_project: description: 'by default new projects are not shard-split initially, but only when shard-split threshold is reached, specify true to explicitly shard-split initially' required: false default: 'false' disable_sharding: description: 'by default new projects use storage controller default policy to 
shard-split when shard-split threshold is reached, specify true to explicitly disable sharding' required: false default: 'false' admin_api_key: description: 'Admin API Key needed for shard-splitting. Must be specified if shard_split_project is true' required: false shard_count: description: 'Number of shards to split the project into, only applies if shard_split_project is true' required: false default: '8' stripe_size: description: 'Stripe size, optional, in 8kiB pages. e.g. set 2048 for 16MB stripes. Default is 128 MiB, only applies if shard_split_project is true' required: false default: '32768' psql_path: description: 'Path to psql binary - it is caller responsibility to provision the psql binary' required: false default: '/tmp/neon/pg_install/v16/bin/psql' libpq_lib_path: description: 'Path to directory containing libpq library - it is caller responsibility to provision the libpq library' required: false default: '/tmp/neon/pg_install/v16/lib' project_settings: description: 'A JSON object with project settings' required: false default: '{}' outputs: dsn: description: 'Created Project DSN (for main database)' value: ${{ steps.create-neon-project.outputs.dsn }} project_id: description: 'Created Project ID' value: ${{ steps.create-neon-project.outputs.project_id }} runs: using: "composite" steps: - name: Create Neon Project id: create-neon-project # A shell without `set -x` to not to expose password/dsn in logs shell: bash -euo pipefail {0} run: | res=$(curl \ "https://${API_HOST}/api/v2/projects" \ -w "%{http_code}" \ --header "Accept: application/json" \ --header "Content-Type: application/json" \ --header "Authorization: Bearer ${API_KEY}" \ --data "{ \"project\": { \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\", \"pg_version\": ${POSTGRES_VERSION}, \"region_id\": \"${REGION_ID}\", \"provisioner\": \"k8s-neonvm\", \"autoscaling_limit_min_cu\": ${MIN_CU}, \"autoscaling_limit_max_cu\": ${MAX_CU}, \"settings\": 
${PROJECT_SETTINGS} } }") code=${res: -3} if [[ ${code} -ge 400 ]]; then echo Request failed with error code ${code} echo ${res::-3} exit 1 else project=${res::-3} fi # Mask password echo "::add-mask::$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .password')" dsn=$(echo $project | jq --raw-output '.connection_uris[0].connection_uri') echo "::add-mask::${dsn}" echo "dsn=${dsn}" >> $GITHUB_OUTPUT project_id=$(echo $project | jq --raw-output '.project.id') echo "project_id=${project_id}" >> $GITHUB_OUTPUT echo "Project ${project_id} has been created" if [ "${SHARD_SPLIT_PROJECT}" = "true" ]; then # determine tenant ID TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"` echo "Splitting project ${project_id} with tenant_id ${TENANT_ID} into $((SHARD_COUNT)) shards with stripe size $((STRIPE_SIZE))" echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" echo "with body {\"new_shard_count\": $((SHARD_COUNT)), \"new_stripe_size\": $((STRIPE_SIZE))}" # we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set) curl -X PUT \ "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" \ -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ -d "{\"new_shard_count\": $SHARD_COUNT, \"new_stripe_size\": $STRIPE_SIZE}" fi if [ "${DISABLE_SHARDING}" = "true" ]; then # determine tenant ID TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"` echo "Explicitly disabling shard-splitting for project ${project_id} with tenant_id ${TENANT_ID}" echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy" echo "with body {\"scheduling\": \"Essential\"}" # we need an ADMIN API KEY to invoke storage controller API for 
shard splitting (bash -u above checks that the variable is set) curl -X PUT \ "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy" \ -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ -d "{\"scheduling\": \"Essential\"}" fi env: API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} REGION_ID: ${{ inputs.region_id }} POSTGRES_VERSION: ${{ inputs.postgres_version }} MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }} MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }} SHARD_SPLIT_PROJECT: ${{ inputs.shard_split_project }} DISABLE_SHARDING: ${{ inputs.disable_sharding }} ADMIN_API_KEY: ${{ inputs.admin_api_key }} SHARD_COUNT: ${{ inputs.shard_count }} STRIPE_SIZE: ${{ inputs.stripe_size }} PSQL: ${{ inputs.psql_path }} LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }} PROJECT_SETTINGS: ${{ inputs.project_settings }} ================================================ FILE: .github/actions/neon-project-delete/action.yml ================================================ name: 'Delete Neon Project' description: 'Delete Neon Project using API' inputs: api_key: description: 'Neon API key' required: true project_id: description: 'ID of the Project to delete' required: true api_host: description: 'Neon API host' default: console-stage.neon.build runs: using: "composite" steps: - name: Delete Neon Project # Do not try to delete a project if .github/actions/neon-project-create failed before if: ${{ inputs.project_id != '' }} shell: bash -euxo pipefail {0} run: | curl \ "https://${API_HOST}/api/v2/projects/${PROJECT_ID}" \ --fail \ --request DELETE \ --header "Accept: application/json" \ --header "Content-Type: application/json" \ --header "Authorization: Bearer ${API_KEY}" echo "Project ${PROJECT_ID} has been deleted" env: API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} PROJECT_ID: ${{ inputs.project_id }} 
================================================ FILE: .github/actions/prepare-for-subzero/action.yml ================================================ name: 'Prepare current job for subzero' description: > Set git token to access `neondatabase/subzero` from cargo build, and set `CARGO_NET_GIT_FETCH_WITH_CLI=true` env variable to use git CLI inputs: token: description: 'GitHub token with access to neondatabase/subzero' required: true runs: using: "composite" steps: - name: Set git token for neondatabase/subzero uses: pyTooling/Actions/with-post-step@2307b526df64d55e95884e072e49aac2a00a9afa # v5.1.0 env: SUBZERO_ACCESS_TOKEN: ${{ inputs.token }} with: main: | git config --global url."https://x-access-token:${SUBZERO_ACCESS_TOKEN}@github.com/neondatabase/subzero".insteadOf "https://github.com/neondatabase/subzero" cargo add -p proxy subzero-core --git https://github.com/neondatabase/subzero --rev 396264617e78e8be428682f87469bb25429af88a post: | git config --global --unset url."https://x-access-token:${SUBZERO_ACCESS_TOKEN}@github.com/neondatabase/subzero".insteadOf "https://github.com/neondatabase/subzero" - name: Set `CARGO_NET_GIT_FETCH_WITH_CLI=true` env variable shell: bash -euxo pipefail {0} run: echo "CARGO_NET_GIT_FETCH_WITH_CLI=true" >> ${GITHUB_ENV} ================================================ FILE: .github/actions/run-python-test-set/action.yml ================================================ name: 'Run python test' description: 'Runs a Neon python test set, performing all the required preparations before' inputs: build_type: description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug", or "remote" for the remote cluster' required: true test_selection: description: 'A python test suite to run' required: true extra_params: description: 'Arbitrary parameters to pytest. 
For example "-s" to prevent capturing stdout/stderr' required: false default: '' needs_postgres_source: description: 'Set to true if the test suite requires postgres source checked out' required: false default: 'false' run_in_parallel: description: 'Whether to run tests in parallel' required: false default: 'true' save_perf_report: description: 'Whether to upload the performance report, if true PERF_TEST_RESULT_CONNSTR env variable should be set' required: false default: 'false' run_with_real_s3: description: 'Whether to pass real s3 credentials to the test suite' required: false default: 'false' real_s3_bucket: description: 'Bucket name for real s3 tests' required: false default: '' real_s3_region: description: 'Region name for real s3 tests' required: false default: '' rerun_failed: description: 'Whether to rerun failed tests' required: false default: 'false' pg_version: description: 'Postgres version to use for tests' required: false default: 'v16' sanitizers: description: 'enabled or disabled' required: false default: 'disabled' type: string benchmark_durations: description: 'benchmark durations JSON' required: false default: '{}' aws-oidc-role-arn: description: 'OIDC role arn to interract with S3' required: true runs: using: "composite" steps: - name: Get Neon artifact if: inputs.build_type != 'remote' uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact path: /tmp/neon aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} - name: Download Neon binaries for the previous release if: inputs.build_type != 'remote' uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon-previous prefix: latest aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} - name: Download compatibility snapshot if: inputs.build_type != 'remote' uses: ./.github/actions/download with: name: 
compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }} path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} prefix: latest # The lack of compatibility snapshot (for example, for the new Postgres version) # shouldn't fail the whole job. Only relevant test should fail. skip-if-does-not-exist: true aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} - name: Checkout if: inputs.needs_postgres_source == 'true' uses: actions/checkout@v4 with: submodules: true - name: Cache poetry deps uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} run: ./scripts/pysync - name: Run pytest env: NEON_BIN: /tmp/neon/bin COMPATIBILITY_NEON_BIN: /tmp/neon-previous/bin COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install TEST_OUTPUT: /tmp/test_output BUILD_TYPE: ${{ inputs.build_type }} COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} RERUN_FAILED: ${{ inputs.rerun_failed }} PG_VERSION: ${{ inputs.pg_version }} SANITIZERS: ${{ inputs.sanitizers }} shell: bash -euxo pipefail {0} run: | # PLATFORM will be embedded in the perf test report # and it is needed to distinguish different environments export PLATFORM=${PLATFORM:-github-actions-selfhosted} export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} export DEFAULT_PG_VERSION=${PG_VERSION#v} export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-} export ASAN_OPTIONS=detect_leaks=0:detect_stack_use_after_return=0:abort_on_error=1:strict_string_checks=1:check_initialization_order=1:strict_init_order=1 export UBSAN_OPTIONS=abort_on_error=1:print_stacktrace=1 if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 fi PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)" echo 
"PERF_REPORT_DIR=${PERF_REPORT_DIR}" >> ${GITHUB_ENV} rm -rf $PERF_REPORT_DIR TEST_SELECTION="test_runner/${{ inputs.test_selection }}" EXTRA_PARAMS="${{ inputs.extra_params }}" if [ -z "$TEST_SELECTION" ]; then echo "test_selection must be set" exit 1 fi if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then # -n sets the number of parallel processes that pytest-xdist will run EXTRA_PARAMS="-n12 $EXTRA_PARAMS" # --dist=loadgroup points tests marked with @pytest.mark.xdist_group # to the same worker to make @pytest.mark.order work with xdist EXTRA_PARAMS="--dist=loadgroup $EXTRA_PARAMS" fi if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then echo "REAL S3 ENABLED" export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty export REMOTE_STORAGE_S3_BUCKET=${{ inputs.real_s3_bucket }} export REMOTE_STORAGE_S3_REGION=${{ inputs.real_s3_region }} fi if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then mkdir -p "$PERF_REPORT_DIR" EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" fi if [ "${RERUN_FAILED}" == "true" ]; then EXTRA_PARAMS="--reruns 2 $EXTRA_PARAMS" fi # We use pytest-split plugin to run benchmarks in parallel on different CI runners if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then mkdir -p $TEST_OUTPUT echo '${{ inputs.benchmark_durations || '{}' }}' > $TEST_OUTPUT/benchmark_durations.json EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS" fi if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then # We don't use code coverage for regression tests (the step is disabled), # so there's no need to collect it. 
# Ref https://github.com/neondatabase/neon/issues/4540 # cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) cov_prefix=() # Explicitly set LLVM_PROFILE_FILE to /dev/null to avoid writing *.profraw files export LLVM_PROFILE_FILE=/dev/null else cov_prefix=() fi # Wake up the cluster if we use remote neon instance if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then QUERIES+=("SHOW neon.tenant_id") QUERIES+=("SHOW neon.timeline_id") fi for q in "${QUERIES[@]}"; do ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "${q}" done fi # Run the tests. # # --alluredir saves test results in Allure format (in a specified directory) # --verbose prints name of each test (helpful when there are # multiple tests in one file) # -rA prints summary in the end # -s is not used to prevent pytest from capturing output, because tests are running # in parallel and logs are mixed between different tests # mkdir -p $TEST_OUTPUT/allure/results "${cov_prefix[@]}" ./scripts/pytest \ --alluredir=$TEST_OUTPUT/allure/results \ --tb=short \ --verbose \ -rA $TEST_SELECTION $EXTRA_PARAMS - name: Upload performance report if: ${{ !cancelled() && inputs.save_perf_report == 'true' }} shell: bash -euxo pipefail {0} run: | export REPORT_FROM="${PERF_REPORT_DIR}" scripts/generate_and_push_perf_report.sh - name: Upload compatibility snapshot # Note, that we use `github.base_ref` which is a target branch for a PR if: github.event_name == 'pull_request' && github.base_ref == 'release' uses: ./.github/actions/upload with: name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }} # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/ # The lack of compatibility snapshot shouldn't fail 
the job # (for example if we didn't run the test for non build-and-test workflow) skip-if-does-not-exist: true aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} - uses: aws-actions/configure-aws-credentials@v4 if: ${{ !cancelled() }} with: aws-region: eu-central-1 role-to-assume: ${{ inputs.aws-oidc-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report - name: Upload test results if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-store with: report-dir: /tmp/test_output/allure/results unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}-${{ runner.arch }} aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} ================================================ FILE: .github/actions/save-coverage-data/action.yml ================================================ name: 'Merge and upload coverage data' description: 'Compresses and uploads the coverage data as an artifact' runs: using: "composite" steps: - name: Merge coverage data shell: bash -euxo pipefail {0} run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge - name: Download previous coverage data into the same directory uses: ./.github/actions/download with: name: coverage-data-artifact path: /tmp/coverage skip-if-does-not-exist: true # skip if there's no previous coverage to download aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} - name: Upload coverage data uses: ./.github/actions/upload with: name: coverage-data-artifact path: /tmp/coverage aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} ================================================ FILE: .github/actions/upload/action.yml ================================================ name: "Upload an artifact" description: "Custom upload action" inputs: name: description: "Artifact name" required: true path: description: "A directory or file to upload" required: true skip-if-does-not-exist: description: "Allow to skip if path doesn't exist, fail otherwise" default: false required: false 
prefix: description: "S3 prefix. Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false aws-oidc-role-arn: description: "the OIDC role arn for aws auth" required: false default: "" runs: using: "composite" steps: - name: Prepare artifact id: prepare-artifact shell: bash -euxo pipefail {0} env: SOURCE: ${{ inputs.path }} ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }} run: | mkdir -p $(dirname $ARCHIVE) if [ -f ${ARCHIVE} ]; then echo >&2 "File ${ARCHIVE} already exist. Something went wrong before" exit 1 fi ZSTD_NBTHREADS=0 if [ -d ${SOURCE} ]; then time tar -C ${SOURCE} -cf ${ARCHIVE} --zstd . elif [ -f ${SOURCE} ]; then time tar -cf ${ARCHIVE} --zstd ${SOURCE} elif ! ls ${SOURCE} > /dev/null 2>&1; then if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then echo 'SKIPPED=true' >> $GITHUB_OUTPUT exit 0 else echo >&2 "${SOURCE} does not exist" exit 2 fi else echo >&2 "${SOURCE} is neither a directory nor a file, do not know how to handle it" exit 3 fi echo 'SKIPPED=false' >> $GITHUB_OUTPUT - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 role-to-assume: ${{ inputs.aws-oidc-role-arn }} role-duration-seconds: 3600 - name: Upload artifact if: ${{ steps.prepare-artifact.outputs.SKIPPED == 'false' }} shell: bash -euxo pipefail {0} env: SOURCE: ${{ inputs.path }} ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}/{2}', github.event.pull_request.head.sha || github.sha, github.run_id , github.run_attempt) }} run: | BUCKET=neon-github-public-dev FILENAME=$(basename $ARCHIVE) FILESIZE=$(du -sh ${ARCHIVE} | cut -f1) time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${FILENAME} # Ref https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#adding-a-job-summary echo 
"[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY} ================================================ FILE: .github/file-filters.yaml ================================================ rust_code: ['**/*.rs', '**/Cargo.toml', '**/Cargo.lock'] rust_dependencies: ['**/Cargo.lock'] v14: ['vendor/postgres-v14/**', 'Makefile', 'pgxn/**'] v15: ['vendor/postgres-v15/**', 'Makefile', 'pgxn/**'] v16: ['vendor/postgres-v16/**', 'Makefile', 'pgxn/**'] v17: ['vendor/postgres-v17/**', 'Makefile', 'pgxn/**'] rebuild_neon_extra: - .github/workflows/neon_extra_builds.yml rebuild_macos: - .github/workflows/build-macos.yml ================================================ FILE: .github/pull_request_template.md ================================================ ## Problem ## Summary of changes ================================================ FILE: .github/scripts/generate_image_maps.py ================================================ import itertools import json import os import sys source_tag = os.getenv("SOURCE_TAG") target_tag = os.getenv("TARGET_TAG") branch = os.getenv("BRANCH") dev_acr = os.getenv("DEV_ACR") prod_acr = os.getenv("PROD_ACR") dev_aws = os.getenv("DEV_AWS") prod_aws = os.getenv("PROD_AWS") aws_region = os.getenv("AWS_REGION") components = { "neon": ["neon"], "compute": [ "compute-node-v14", "compute-node-v15", "compute-node-v16", "compute-node-v17", "vm-compute-node-v14", "vm-compute-node-v15", "vm-compute-node-v16", "vm-compute-node-v17", ], } registries = { "dev": [ "docker.io/neondatabase", "ghcr.io/neondatabase", f"{dev_aws}.dkr.ecr.{aws_region}.amazonaws.com", f"{dev_acr}.azurecr.io/neondatabase", ], "prod": [ f"{prod_aws}.dkr.ecr.{aws_region}.amazonaws.com", f"{prod_acr}.azurecr.io/neondatabase", ], } release_branches = ["release", "release-proxy", "release-compute"] outputs: dict[str, dict[str, list[str]]] = {} target_tags = ( [target_tag, "latest"] if branch == "main" else [target_tag, "released"] if 
branch in release_branches else [target_tag] ) target_stages = ["dev", "prod"] if branch in release_branches else ["dev"] for component_name, component_images in components.items(): for stage in target_stages: outputs[f"{component_name}-{stage}"] = { f"ghcr.io/neondatabase/{component_image}:{source_tag}": [ f"{registry}/{component_image}:{tag}" for registry, tag in itertools.product(registries[stage], target_tags) if not (registry == "ghcr.io/neondatabase" and tag == source_tag) ] for component_image in component_images } with open(os.getenv("GITHUB_OUTPUT", "/dev/null"), "a") as f: for key, value in outputs.items(): f.write(f"{key}={json.dumps(value)}\n") print(f"Image map for {key}:\n{json.dumps(value, indent=2)}\n\n", file=sys.stderr) ================================================ FILE: .github/scripts/lint-release-pr.sh ================================================ #!/usr/bin/env bash set -euo pipefail DOCS_URL="https://docs.neon.build/overview/repositories/neon.html" message() { if [[ -n "${GITHUB_PR_NUMBER:-}" ]]; then gh pr comment --repo "${GITHUB_REPOSITORY}" "${GITHUB_PR_NUMBER}" --edit-last --body "$1" \ || gh pr comment --repo "${GITHUB_REPOSITORY}" "${GITHUB_PR_NUMBER}" --body "$1" fi echo "$1" } report_error() { message "❌ $1 For more details, see the documentation: ${DOCS_URL}" exit 1 } case "$RELEASE_BRANCH" in "release") COMPONENT="Storage" ;; "release-proxy") COMPONENT="Proxy" ;; "release-compute") COMPONENT="Compute" ;; *) report_error "Unknown release branch: ${RELEASE_BRANCH}" ;; esac # Identify main and release branches MAIN_BRANCH="origin/main" REMOTE_RELEASE_BRANCH="origin/${RELEASE_BRANCH}" # Find merge base MERGE_BASE=$(git merge-base "${MAIN_BRANCH}" "${REMOTE_RELEASE_BRANCH}") echo "Merge base of ${MAIN_BRANCH} and ${RELEASE_BRANCH}: ${MERGE_BASE}" # Get the HEAD commit (last commit in PR, expected to be the merge commit) LAST_COMMIT=$(git rev-parse HEAD) MERGE_COMMIT_MESSAGE=$(git log -1 --format=%s "${LAST_COMMIT}") 
EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2} UTC$" if ! [[ "${MERGE_COMMIT_MESSAGE}" =~ ${EXPECTED_MESSAGE_REGEX} ]]; then report_error "Merge commit message does not match expected pattern: ' release YYYY-MM-DD' Expected component: ${COMPONENT} Found: '${MERGE_COMMIT_MESSAGE}'" fi echo "✅ Merge commit message is correctly formatted: '${MERGE_COMMIT_MESSAGE}'" LAST_COMMIT_PARENTS=$(git cat-file -p "${LAST_COMMIT}" | jq -sR '[capture("parent (?[0-9a-f]{40})"; "g") | .parent]') if [[ "$(echo "${LAST_COMMIT_PARENTS}" | jq 'length')" -ne 2 ]]; then report_error "Last commit must be a merge commit with exactly two parents" fi EXPECTED_RELEASE_HEAD=$(git rev-parse "${REMOTE_RELEASE_BRANCH}") if echo "${LAST_COMMIT_PARENTS}" | jq -e --arg rel "${EXPECTED_RELEASE_HEAD}" 'index($rel) != null' > /dev/null; then LINEAR_HEAD=$(echo "${LAST_COMMIT_PARENTS}" | jq -r '[.[] | select(. != $rel)][0]' --arg rel "${EXPECTED_RELEASE_HEAD}") else report_error "Last commit must merge the release branch (${RELEASE_BRANCH})" fi echo "✅ Last commit correctly merges the previous commit and the release branch" echo "Top commit of linear history: ${LINEAR_HEAD}" MERGE_COMMIT_TREE=$(git rev-parse "${LAST_COMMIT}^{tree}") LINEAR_HEAD_TREE=$(git rev-parse "${LINEAR_HEAD}^{tree}") if [[ "${MERGE_COMMIT_TREE}" != "${LINEAR_HEAD_TREE}" ]]; then report_error "Tree of merge commit (${MERGE_COMMIT_TREE}) does not match tree of linear history head (${LINEAR_HEAD_TREE}) This indicates that the merge of ${RELEASE_BRANCH} into this branch was not performed using the merge strategy 'ours'" fi echo "✅ Merge commit tree matches the linear history head" EXPECTED_PREVIOUS_COMMIT="${LINEAR_HEAD}" # Now traverse down the history, ensuring each commit has exactly one parent CURRENT_COMMIT="${EXPECTED_PREVIOUS_COMMIT}" while [[ "${CURRENT_COMMIT}" != "${MERGE_BASE}" && "${CURRENT_COMMIT}" != "${EXPECTED_RELEASE_HEAD}" ]]; do CURRENT_COMMIT_PARENTS=$(git cat-file -p 
"${CURRENT_COMMIT}" | jq -sR '[capture("parent (?[0-9a-f]{40})"; "g") | .parent]') if [[ "$(echo "${CURRENT_COMMIT_PARENTS}" | jq 'length')" -ne 1 ]]; then report_error "Commit ${CURRENT_COMMIT} must have exactly one parent" fi NEXT_COMMIT=$(echo "${CURRENT_COMMIT_PARENTS}" | jq -r '.[0]') if [[ "${NEXT_COMMIT}" == "${MERGE_BASE}" ]]; then echo "✅ Reached merge base (${MERGE_BASE})" PR_BASE="${MERGE_BASE}" elif [[ "${NEXT_COMMIT}" == "${EXPECTED_RELEASE_HEAD}" ]]; then echo "✅ Reached release branch (${EXPECTED_RELEASE_HEAD})" PR_BASE="${EXPECTED_RELEASE_HEAD}" elif [[ -z "${NEXT_COMMIT}" ]]; then report_error "Unexpected end of commit history before reaching merge base" fi # Move to the next commit in the chain CURRENT_COMMIT="${NEXT_COMMIT}" done echo "✅ All commits are properly ordered and linear" echo "✅ Release PR structure is valid" echo message "Commits that are part of this release: $(git log --oneline "${PR_BASE}..${LINEAR_HEAD}")" ================================================ FILE: .github/scripts/previous-releases.jq ================================================ # Expects response from https://docs.github.com/en/rest/releases/releases?apiVersion=2022-11-28#list-releases as input, # with tag names `release` for storage, `release-compute` for compute and `release-proxy` for proxy releases. # Extract only the `tag_name` field from each release object [ .[].tag_name ] # Transform each tag name into a structured object using regex capture | reduce map( capture("^(?release(-(?proxy|compute))?-(?\\d+))$") | { component: (.component // "storage"), # Default to "storage" if no component is specified version: (.version | tonumber), # Convert the version number to an integer full: .full # Store the full tag name for final output } )[] as $entry # Loop over the transformed list # Accumulate the latest (highest-numbered) version for each component ({}; .[$entry.component] |= (if . == null or $entry.version > .version then $entry else . 
end)) # Ensure that each component exists, or fail | (["storage", "compute", "proxy"] - (keys)) as $missing | if ($missing | length) > 0 then "Error: Found no release for \($missing | join(", "))!\n" | halt_error(1) else . end # Convert the resulting object into an array of formatted strings | to_entries | map("\(.key)=\(.value.full)") # Output each string separately | .[] ================================================ FILE: .github/scripts/push_with_image_map.py ================================================ import json import os import subprocess RED = "\033[91m" RESET = "\033[0m" image_map = os.getenv("IMAGE_MAP") if not image_map: raise ValueError("IMAGE_MAP environment variable is not set") try: parsed_image_map: dict[str, list[str]] = json.loads(image_map) except json.JSONDecodeError as e: raise ValueError("Failed to parse IMAGE_MAP as JSON") from e failures = [] pending = [(source, target) for source, targets in parsed_image_map.items() for target in targets] while len(pending) > 0: if len(failures) > 10: print("Error: more than 10 failures!") for failure in failures: print(f'"{failure[0]}" failed with the following output:') print(failure[1]) raise RuntimeError("Retry limit reached.") source, target = pending.pop(0) cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source] print(f"Running: {' '.join(cmd)}") result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) if result.returncode != 0: failures.append((" ".join(cmd), result.stdout, target)) pending.append((source, target)) print( f"{RED}[RETRY]{RESET} Push failed for {target}. Retrying... 
(failure count: {len(failures)})" ) print(result.stdout) if len(failures) > 0 and (github_output := os.getenv("GITHUB_OUTPUT")): failed_targets = [target for _, _, target in failures] with open(github_output, "a") as f: f.write(f"push_failures={json.dumps(failed_targets)}\n") ================================================ FILE: .github/workflows/_benchmarking_preparation.yml ================================================ name: Prepare benchmarking databases by restoring dumps on: workflow_call: # no inputs needed defaults: run: shell: bash -euxo pipefail {0} permissions: contents: read jobs: setup-databases: permissions: contents: write statuses: write id-token: write # aws-actions/configure-aws-credentials strategy: fail-fast: false matrix: platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon, neon_pg17 ] database: [ clickbench, tpch, userexample ] env: LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib PLATFORM: ${{ matrix.platform }} PG_BINARIES: /tmp/neon/pg_install/v16/bin runs-on: [ self-hosted, us-east-2, x64 ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Set up Connection String id: set-up-prep-connstr run: | case "${PLATFORM}" in neon) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; neon_pg17) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} ;; aws-rds-postgres) CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} ;; aws-aurora-serverless-v2-postgres) CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }} ;; *) echo >&2 "Unknown PLATFORM=${PLATFORM}" exit 1 ;; esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials uses: 
aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # we create a table that has one row for each database that we want to restore with the status whether the restore is done - name: Create benchmark_restore_status table if it does not exist env: BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} DATABASE_NAME: ${{ matrix.database }} # to avoid a race condition of multiple jobs trying to create the table at the same time, # we use an advisory lock run: | ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c " SELECT pg_advisory_lock(4711); CREATE TABLE IF NOT EXISTS benchmark_restore_status ( databasename text primary key, restore_done boolean ); SELECT pg_advisory_unlock(4711); " - name: Check if restore is already done id: check-restore-done env: BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} DATABASE_NAME: ${{ matrix.database }} run: | skip=false if ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM benchmark_restore_status WHERE databasename='${{ env.DATABASE_NAME }}' AND restore_done=true;" | grep -q 1; then echo "Restore already done for database ${{ env.DATABASE_NAME }} on platform ${{ env.PLATFORM }}. Skipping this database." 
skip=true fi echo "skip=${skip}" | tee -a $GITHUB_OUTPUT - name: Check and create database if it does not exist if: steps.check-restore-done.outputs.skip != 'true' env: BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} DATABASE_NAME: ${{ matrix.database }} run: | DB_EXISTS=$(${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM pg_database WHERE datname='${{ env.DATABASE_NAME }}'") if [ "$DB_EXISTS" != "1" ]; then echo "Database ${{ env.DATABASE_NAME }} does not exist. Creating it..." ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "CREATE DATABASE \"${{ env.DATABASE_NAME }}\";" else echo "Database ${{ env.DATABASE_NAME }} already exists." fi - name: Download dump from S3 to /tmp/dumps if: steps.check-restore-done.outputs.skip != 'true' env: DATABASE_NAME: ${{ matrix.database }} run: | mkdir -p /tmp/dumps aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/ - name: Replace database name in connection string if: steps.check-restore-done.outputs.skip != 'true' id: replace-dbname env: DATABASE_NAME: ${{ matrix.database }} BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} run: | # Extract the part before the database name base_connstr="${BENCHMARK_CONNSTR%/*}" # Extract the query parameters (if any) after the database name query_params="${BENCHMARK_CONNSTR#*\?}" # Reconstruct the new connection string if [ "$query_params" != "$BENCHMARK_CONNSTR" ]; then new_connstr="${base_connstr}/${DATABASE_NAME}?${query_params}" else new_connstr="${base_connstr}/${DATABASE_NAME}" fi echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT - name: Restore dump if: steps.check-restore-done.outputs.skip != 'true' env: DATABASE_NAME: ${{ matrix.database }} DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }} # the following works only with larger computes: # PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7" # we add the || true 
because: # the dumps were created with Neon and contain neon extensions that are not # available in RDS, so we will always report an error, but we can ignore it run: | ${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \ -d "${DATABASE_CONNSTR}" /tmp/dumps/${DATABASE_NAME}.pg_dump || true - name: Update benchmark_restore_status table if: steps.check-restore-done.outputs.skip != 'true' env: BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} DATABASE_NAME: ${{ matrix.database }} run: | ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c " INSERT INTO benchmark_restore_status (databasename, restore_done) VALUES ('${{ env.DATABASE_NAME }}', true) ON CONFLICT (databasename) DO UPDATE SET restore_done = true; " ================================================ FILE: .github/workflows/_build-and-test-locally.yml ================================================ name: Build and Test Locally on: workflow_call: inputs: arch: description: 'x64 or arm64' required: true type: string build-tag: description: 'build tag' required: true type: string build-tools-image: description: 'build-tools image' required: true type: string build-type: description: 'debug or release' required: true type: string test-cfg: description: 'a json object of postgres versions and lfc states to run regression tests on' required: true type: string sanitizers: description: 'enabled or disabled' required: false default: 'disabled' type: string test-selection: description: 'specification of selected test(s) to run' required: false default: '' type: string test-run-count: description: 'number of runs to perform for selected tests' required: false default: 1 type: number rerun-failed: description: 'rerun failed tests to ignore flaky tests' required: false default: true type: boolean defaults: run: shell: bash -euxo pipefail {0} env: RUST_BACKTRACE: 1 COPT: '-Werror' permissions: contents: read jobs: build-neon: runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch 
== 'arm64' && 'large-arm64' || 'large')) }} permissions: id-token: write # aws-actions/configure-aws-credentials contents: read container: image: ${{ inputs.build-tools-image }} credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} # Raise locked memory limit for tokio-epoll-uring. # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12), # io_uring will account the memory of the CQ and SQ as locked. # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391 options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 env: BUILD_TYPE: ${{ inputs.build-type }} GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG: ${{ inputs.build-tag }} steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true - uses: ./.github/actions/prepare-for-subzero with: token: ${{ secrets.CI_ACCESS_TOKEN }} - name: Set pg 14 revision for caching id: pg_v14_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - name: Set pg 15 revision for caching id: pg_v15_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - name: Set pg 16 revision for caching id: pg_v16_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT - name: Set pg 17 revision for caching id: pg_v17_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT # Set some environment variables used by all the steps. # # CARGO_FLAGS is extra options to pass to all "cargo" subcommands. # # CARGO_PROFILE is passed to "cargo build", "cargo test" etc, but not to # "cargo metadata", because it doesn't accept --release or --debug options. # # We run tests with additional features, that are turned off by default (e.g.
in release builds), see # corresponding Cargo.toml files for their descriptions. - name: Set env variables env: ARCH: ${{ inputs.arch }} SANITIZERS: ${{ inputs.sanitizers }} run: | CARGO_FLAGS="--locked --features testing,rest_broker" if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" CARGO_PROFILE="" elif [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="" CARGO_PROFILE="" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" CARGO_PROFILE="--release" fi if [[ $SANITIZERS == 'enabled' ]]; then make_vars="WITH_SANITIZERS=yes" else make_vars="" fi { echo "cov_prefix=${cov_prefix}" echo "make_vars=${make_vars}" echo "CARGO_FLAGS=${CARGO_FLAGS}" echo "CARGO_PROFILE=${CARGO_PROFILE}" echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" } >> $GITHUB_ENV - name: Cache postgres v14 build id: cache_pg_14 uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} bucket: ${{ vars.HETZNER_CACHE_BUCKET }} accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} use-fallback: false path: pg_install/v14 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }} - name: Cache postgres v15 build id: cache_pg_15 uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} bucket: ${{ vars.HETZNER_CACHE_BUCKET }} accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} use-fallback: false path: pg_install/v15 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }} - name: Cache postgres 
v16 build id: cache_pg_16 uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} bucket: ${{ vars.HETZNER_CACHE_BUCKET }} accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} use-fallback: false path: pg_install/v16 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }} - name: Cache postgres v17 build id: cache_pg_17 uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} bucket: ${{ vars.HETZNER_CACHE_BUCKET }} accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} use-fallback: false path: pg_install/v17 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }} - name: Build all # Note: the Makefile picks up BUILD_TYPE and CARGO_PROFILE from the env variables run: mold -run make ${make_vars} all -j$(nproc) CARGO_BUILD_FLAGS="$CARGO_FLAGS" - name: Build walproposer-lib run: mold -run make ${make_vars} walproposer-lib -j$(nproc) - name: Build unit tests if: inputs.sanitizers != 'enabled' run: | export ASAN_OPTIONS=detect_leaks=0 ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_PROFILE --tests # Do install *before* running rust tests because they might recompile the # binaries with different features/flags. 
- name: Install rust binaries env: ARCH: ${{ inputs.arch }} SANITIZERS: ${{ inputs.sanitizers }} run: | # Install target binaries mkdir -p /tmp/neon/bin/ binaries=$( ${cov_prefix} cargo metadata $CARGO_FLAGS --format-version=1 --no-deps | jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' ) for bin in $binaries; do SRC=target/$BUILD_TYPE/$bin DST=/tmp/neon/bin/$bin cp "$SRC" "$DST" done # Install test executables and write list of all binaries (for code coverage) if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' && $SANITIZERS != 'enabled' ]]; then # Keep bloated coverage data files away from the rest of the artifact mkdir -p /tmp/coverage/ mkdir -p /tmp/neon/test_bin/ test_exe_paths=$( ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_PROFILE --message-format=json --no-run | jq -r '.executable | select(. != null)' ) for bin in $test_exe_paths; do SRC=$bin DST=/tmp/neon/test_bin/$(basename $bin) # We don't need debug symbols for code coverage, so strip them out to make # the artifact smaller. 
strip "$SRC" -o "$DST" echo "$DST" >> /tmp/coverage/binaries.list done for bin in $binaries; do echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list done fi - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours - name: Run rust tests if: ${{ inputs.sanitizers != 'enabled' }} env: NEXTEST_RETRIES: 3 run: | LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib export LD_LIBRARY_PATH #nextest does not yet support running doctests ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_PROFILE # run all non-pageserver tests ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E '!package(pageserver)' # run pageserver tests # (When developing new pageserver features gated by config fields, we commonly make the rust # unit tests sensitive to an environment variable NEON_PAGESERVER_UNIT_TEST_FEATURENAME. # Then run the nextest invocation below for all relevant combinations. Singling out the # pageserver tests from non-pageserver tests cuts down the time it takes for this CI step.) 
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=tokio-epoll-uring \ ${cov_prefix} \ cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E 'package(pageserver)' # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E 'package(remote_storage)' -E 'test(test_real_s3)' # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region export ENABLE_REAL_AZURE_REMOTE_STORAGE=y export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E 'package(remote_storage)' -E 'test(test_real_azure)' - name: Install postgres binaries run: | # Use tar to copy files matching the pattern, preserving the paths in the destination tar c \ pg_install/v* \ build/*/src/test/regress/*.so \ build/*/src/test/regress/pg_regress \ build/*/src/test/isolation/isolationtester \ build/*/src/test/isolation/pg_isolation_regress \ | tar x -C /tmp/neon - name: Upload Neon artifact uses: ./.github/actions/upload with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact path: /tmp/neon aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Check diesel schema if: inputs.build-type == 'release' && inputs.arch == 'x64' env: DATABASE_URL: postgresql://localhost:1235/storage_controller POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install run: | export ASAN_OPTIONS=detect_leaks=0 /tmp/neon/bin/neon_local init /tmp/neon/bin/neon_local storage_controller start diesel print-schema >
storage_controller/src/schema.rs if [ -n "$(git diff storage_controller/src/schema.rs)" ]; then echo >&2 "Uncommitted changes in diesel schema" git diff . exit 1 fi /tmp/neon/bin/neon_local storage_controller stop # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data if: inputs.build-type == 'debug' uses: ./.github/actions/save-coverage-data regress-tests: # Don't run regression tests on debug arm64 builds if: inputs.build-type != 'debug' || inputs.arch != 'arm64' permissions: id-token: write # aws-actions/configure-aws-credentials contents: read statuses: write needs: [ build-neon ] runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large-metal')) }} container: image: ${{ inputs.build-tools-image }} credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} # for changed limits, see comments on `options:` earlier in this file options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 strategy: fail-fast: false matrix: ${{ fromJSON(format('{{"include":{0}}}', inputs.test-cfg)) }} steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true - name: Pytest regression tests continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }} uses: ./.github/actions/run-python-test-set timeout-minutes: ${{ (inputs.build-type == 'release' && inputs.sanitizers != 'enabled') && 75 || 180 }} with: build_type: ${{ inputs.build-type }} test_selection: regress needs_postgres_source: true run_with_real_s3: true real_s3_bucket: neon-github-ci-tests real_s3_region: eu-central-1 rerun_failed: ${{ inputs.rerun-failed }} pg_version: ${{ matrix.pg_version }} sanitizers: ${{ inputs.sanitizers }} 
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds. # Attempt to stop tests gracefully to generate test reports # until they are forcibly stopped by the stricter `timeout-minutes` limit. extra_params: --session-timeout=${{ (inputs.build-type == 'release' && inputs.sanitizers != 'enabled') && 3000 || 10200 }} --count=${{ inputs.test-run-count }} ${{ inputs.test-selection != '' && format('-k "{0}"', inputs.test-selection) || '' }} env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ inputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 - name: Merge and upload coverage data if: | false && inputs.build-type == 'debug' && matrix.pg_version == 'v16' uses: ./.github/actions/save-coverage-data ================================================ FILE: .github/workflows/_check-codestyle-python.yml ================================================ name: Check Codestyle Python on: workflow_call: inputs: build-tools-image: description: 'build-tools image' required: true type: string defaults: run: shell: bash -euxo pipefail {0} permissions: contents: read jobs: check-codestyle-python: runs-on: [ self-hosted, small ] permissions: packages: read container: image: ${{ inputs.build-tools-image }} credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Cache poetry deps uses: 
tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} bucket: ${{ vars.HETZNER_CACHE_BUCKET }} accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} use-fallback: false path: ~/.cache/pypoetry/virtualenvs key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - run: ./scripts/pysync - run: poetry run ruff check . - run: poetry run ruff format --check . - run: poetry run mypy . ================================================ FILE: .github/workflows/_check-codestyle-rust.yml ================================================ name: Check Codestyle Rust on: workflow_call: inputs: build-tools-image: description: "build-tools image" required: true type: string archs: description: "Json array of architectures to run on" type: string defaults: run: shell: bash -euxo pipefail {0} # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
permissions: {} jobs: check-codestyle-rust: strategy: matrix: arch: ${{ fromJSON(inputs.archs) }} runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} permissions: packages: read container: image: ${{ inputs.build-tools-image }} credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true - uses: ./.github/actions/prepare-for-subzero with: token: ${{ secrets.CI_ACCESS_TOKEN }} - name: Cache cargo deps uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} bucket: ${{ vars.HETZNER_CACHE_BUCKET }} accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} use-fallback: false path: | ~/.cargo/registry !~/.cargo/registry/src ~/.cargo/git target key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers run: make postgres-headers -j$(nproc) # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. # This will catch compiler & clippy warnings in all feature combinations. # TODO: use cargo hack for build and test as well, but, that's quite expensive. # NB: keep clippy args in sync with ./run_clippy.sh # # The only difference between "clippy --debug" and "clippy --release" is that in --release mode, # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second # time just for that, so skip "clippy --release". 
- run: | CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" if [ "$CLIPPY_COMMON_ARGS" = "" ]; then echo "No clippy args found in .neon_clippy_args" exit 1 fi echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV - name: Run cargo clippy (debug) run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS - name: Check documentation generation run: cargo doc --workspace --no-deps --document-private-items env: RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" # Use `${{ !cancelled() }}` to run quick tests after the longer clippy run - name: Check formatting if: ${{ !cancelled() }} run: cargo fmt --all -- --check # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - name: Check rust dependencies if: ${{ !cancelled() }} run: | cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack ================================================ FILE: .github/workflows/_meta.yml ================================================ name: Generate run metadata on: workflow_call: inputs: github-event-name: type: string required: true github-event-json: type: string required: true outputs: build-tag: description: "Tag for the current workflow run" value: ${{ jobs.tags.outputs.build-tag }} release-tag: description: "Tag for the release if this is an RC PR run" value: ${{ jobs.tags.outputs.release-tag }} previous-storage-release: description: "Tag of the last storage release" value: ${{ jobs.tags.outputs.storage }} previous-proxy-release: description: "Tag of the last proxy release" value: ${{ jobs.tags.outputs.proxy }} previous-compute-release: description: "Tag of the last compute release" value: ${{ jobs.tags.outputs.compute }} run-kind: description: "The kind of run we're currently in. 
Will be one of `push-main`, `storage-release`, `compute-release`, `proxy-release`, `storage-rc-pr`, `compute-rc-pr`, `proxy-rc-pr`, `pr`, or `workflow-dispatch`" value: ${{ jobs.tags.outputs.run-kind }} release-pr-run-id: description: "Only available if `run-kind in [storage-release, proxy-release, compute-release]`. Contains the run ID of the `Build and Test` workflow, assuming one with the current commit can be found." value: ${{ jobs.tags.outputs.release-pr-run-id }} sha: description: "github.event.pull_request.head.sha on release PRs, github.sha otherwise" value: ${{ jobs.tags.outputs.sha }} permissions: {} defaults: run: shell: bash -euo pipefail {0} jobs: tags: runs-on: ubuntu-22.04 outputs: build-tag: ${{ steps.build-tag.outputs.build-tag }} release-tag: ${{ steps.build-tag.outputs.release-tag }} compute: ${{ steps.previous-releases.outputs.compute }} proxy: ${{ steps.previous-releases.outputs.proxy }} storage: ${{ steps.previous-releases.outputs.storage }} run-kind: ${{ steps.run-kind.outputs.run-kind }} release-pr-run-id: ${{ steps.release-pr-run-id.outputs.release-pr-run-id }} sha: ${{ steps.sha.outputs.sha }} permissions: contents: read steps: # Need `fetch-depth: 0` to count the number of commits in the branch - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Get run kind id: run-kind env: RUN_KIND: >- ${{ false || (inputs.github-event-name == 'push' && github.ref_name == 'main') && 'push-main' || (inputs.github-event-name == 'push' && github.ref_name == 'release') && 'storage-release' || (inputs.github-event-name == 'push' && github.ref_name == 'release-compute') && 'compute-release' || (inputs.github-event-name == 'push' && github.ref_name == 'release-proxy') && 'proxy-release' || (inputs.github-event-name == 'pull_request' && github.base_ref == 'release') && 'storage-rc-pr' || (inputs.github-event-name == 'pull_request' && 
github.base_ref == 'release-compute') && 'compute-rc-pr' || (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-proxy') && 'proxy-rc-pr' || (inputs.github-event-name == 'pull_request') && 'pr' || (inputs.github-event-name == 'workflow_dispatch') && 'workflow-dispatch' || 'unknown' }} run: | echo "run-kind=$RUN_KIND" | tee -a $GITHUB_OUTPUT - name: Get the right SHA id: sha env: SHA: > ${{ contains(fromJSON('["storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), steps.run-kind.outputs.run-kind) && fromJSON(inputs.github-event-json).pull_request.head.sha || github.sha }} run: | echo "sha=$SHA" | tee -a $GITHUB_OUTPUT - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 ref: ${{ steps.sha.outputs.sha }} - name: Get build tag id: build-tag env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }} CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} RUN_KIND: ${{ steps.run-kind.outputs.run-kind }} run: | case $RUN_KIND in push-main) echo "build-tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT ;; storage-release) echo "build-tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT ;; proxy-release) echo "build-tag=release-proxy-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT ;; compute-release) echo "build-tag=release-compute-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT ;; pr|storage-rc-pr|compute-rc-pr|proxy-rc-pr) BUILD_AND_TEST_RUN_ID=$(gh api --paginate \ -H "Accept: application/vnd.github+json" \ -H "X-GitHub-Api-Version: 2022-11-28" \ "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=${CURRENT_SHA}&branch=${CURRENT_BRANCH}" \ | jq '[.workflow_runs[] | select(.name == "Build and Test")][0].id // ("Error: No matching workflow run found." 
| halt_error(1))') echo "build-tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT case $RUN_KIND in storage-rc-pr) echo "release-tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT ;; proxy-rc-pr) echo "release-tag=release-proxy-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT ;; compute-rc-pr) echo "release-tag=release-compute-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT ;; esac ;; workflow-dispatch) echo "build-tag=$GITHUB_RUN_ID" | tee -a $GITHUB_OUTPUT ;; *) echo "Unexpected RUN_KIND ('${RUN_KIND}'), failing to assign build-tag!" exit 1 esac - name: Get the previous release-tags id: previous-releases env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | gh api --paginate \ -H "Accept: application/vnd.github+json" \ -H "X-GitHub-Api-Version: 2022-11-28" \ "/repos/${GITHUB_REPOSITORY}/releases" \ | jq -f .github/scripts/previous-releases.jq -r \ | tee -a "${GITHUB_OUTPUT}" - name: Get the release PR run ID id: release-pr-run-id if: ${{ contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), steps.run-kind.outputs.run-kind) }} env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} CURRENT_SHA: ${{ github.sha }} run: | RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release.*$"; "s"))] | first | .id // ("Failed to find Build and Test run from RC PR!" 
| halt_error(1))') echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT ================================================ FILE: .github/workflows/_push-to-container-registry.yml ================================================ name: Push images to Container Registry on: workflow_call: inputs: # Example: {"docker.io/neondatabase/neon:13196061314":["${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]} image-map: description: JSON map of images, mapping from a source image to an array of target images that should be pushed. required: true type: string aws-region: description: AWS region to log in to. Required when pushing to ECR. required: false type: string aws-account-id: description: AWS account ID to log in to for pushing to ECR. Required when pushing to ECR. required: false type: string aws-role-to-assume: description: AWS role to assume to for pushing to ECR. Required when pushing to ECR. required: false type: string azure-client-id: description: Client ID of Azure managed identity or Entra app. Required when pushing to ACR. required: false type: string azure-subscription-id: description: Azure subscription ID. Required when pushing to ACR. required: false type: string azure-tenant-id: description: Azure tenant ID. Required when pushing to ACR. required: false type: string acr-registry-name: description: ACR registry name. Required when pushing to ACR. 
required: false type: string permissions: {} defaults: run: shell: bash -euo pipefail {0} jobs: push-to-container-registry: runs-on: ubuntu-22.04 permissions: id-token: write # Required for aws/azure login packages: write # required for pushing to GHCR steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/scripts/push_with_image_map.py sparse-checkout-cone-mode: false - name: Print image-map run: echo '${{ inputs.image-map }}' | jq - name: Configure AWS credentials if: contains(inputs.image-map, 'amazonaws.com/') uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: "${{ inputs.aws-region }}" role-to-assume: "arn:aws:iam::${{ inputs.aws-account-id }}:role/${{ inputs.aws-role-to-assume }}" role-duration-seconds: 3600 - name: Login to ECR if: contains(inputs.image-map, 'amazonaws.com/') uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 with: registries: "${{ inputs.aws-account-id }}" - name: Configure Azure credentials if: contains(inputs.image-map, 'azurecr.io/') uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 with: client-id: ${{ inputs.azure-client-id }} subscription-id: ${{ inputs.azure-subscription-id }} tenant-id: ${{ inputs.azure-tenant-id }} - name: Login to ACR if: contains(inputs.image-map, 'azurecr.io/') run: | az acr login --name=${{ inputs.acr-registry-name }} - name: Login to GHCR if: contains(inputs.image-map, 'ghcr.io/') uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Log in to Docker Hub uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: username: ${{ 
secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - name: Copy docker images to target registries id: push run: python3 .github/scripts/push_with_image_map.py env: IMAGE_MAP: ${{ inputs.image-map }} - name: Notify Slack if container image pushing fails if: steps.push.outputs.push_failures || failure() uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 with: method: chat.postMessage token: ${{ secrets.SLACK_BOT_TOKEN }} payload: | channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }} text: > *Container image pushing ${{ steps.push.outcome == 'failure' && 'failed completely' || 'succeeded with some retries' }}* in <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> ${{ steps.push.outputs.push_failures && format( '*Failed targets:*\n• {0}', join(fromJson(steps.push.outputs.push_failures), '\n• ') ) || '' }} ================================================ FILE: .github/workflows/actionlint.yml ================================================ name: Lint GitHub Workflows on: push: branches: - main - release paths: - '.github/workflows/*.ya?ml' pull_request: paths: - '.github/workflows/*.ya?ml' concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{ github.event_name == 'pull_request' }} jobs: check-permissions: if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} uses: ./.github/workflows/check-permissions.yml with: github-event-name: ${{ github.event_name}} actionlint: needs: [ check-permissions ] runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: reviewdog/action-actionlint@a5524e1c19e62881d79c1f1b9b6f09f16356e281 # v1.65.2 env: # SC2046 - Quote this to prevent word splitting. 
- https://www.shellcheck.net/wiki/SC2046 # SC2086 - Double quote to prevent globbing and word splitting. - https://www.shellcheck.net/wiki/SC2086 SHELLCHECK_OPTS: --exclude=SC2046,SC2086 with: fail_level: error filter_mode: nofilter level: error - name: Disallow 'ubuntu-latest' runners run: | PAT='^\s*runs-on:.*-latest' if grep -ERq $PAT .github/workflows; then grep -ERl $PAT .github/workflows |\ while read -r f do l=$(grep -nE $PAT $f | awk -F: '{print $1}' | head -1) echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'" done exit 1 fi ================================================ FILE: .github/workflows/approved-for-ci-run.yml ================================================ name: Handle `approved-for-ci-run` label # This workflow helps to run CI pipeline for PRs made by external contributors (from forks). on: pull_request_target: branches: - main types: # Default types that trigger a workflow ([1]): # - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request - opened - synchronize - reopened # Types that we want to handle in addition to keep labels tidy: - closed # Actual magic happens here: - labeled concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number }} cancel-in-progress: false env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} BRANCH: "ci-run/pr-${{ github.event.pull_request.number }}" # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} defaults: run: shell: bash -euo pipefail {0} jobs: remove-label: # Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR. # The PR should be reviewed and labelled manually again. 
permissions: pull-requests: write # For `gh pr edit` if: | contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) && contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run') runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run" create-or-update-pr-for-ci-run: # Create local PR for an `approved-for-ci-run` labelled PR to run CI pipeline in it. permissions: pull-requests: write # for `gh pr edit` # For `git push` and `gh pr create` we use CI_ACCESS_TOKEN if: | github.event.action == 'labeled' && contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run') runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: ref: ${{ github.event.pull_request.head.sha }} token: ${{ secrets.CI_ACCESS_TOKEN }} - name: Look for existing PR id: get-pr env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')" echo "ALREADY_CREATED=${ALREADY_CREATED}" >> ${GITHUB_OUTPUT} - name: Get changed labels id: get-labels if: steps.get-pr.outputs.ALREADY_CREATED != '' env: ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }} GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | LABELS_TO_REMOVE=$(comm -23 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) | sort) \ <(gh pr --repo 
${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' | ( grep -E '^run' || true ) | sort ) |\ ( grep -v run-e2e-tests-in-draft || true ) | paste -sd , -) LABELS_TO_ADD=$(comm -13 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) |sort) \ <(gh pr --repo ${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' | ( grep -E '^run' || true ) | sort ) |\ paste -sd , -) echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT} echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT} - run: git checkout -b "${BRANCH}" - run: git push --force origin "${BRANCH}" if: steps.get-pr.outputs.ALREADY_CREATED == '' - name: Create a Pull Request for CI run (if required) if: steps.get-pr.outputs.ALREADY_CREATED == '' env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | cat << EOF > body.md This Pull Request is created automatically to run the CI pipeline for #${PR_NUMBER} Please do not alter or merge/close it. Feel free to review/comment/discuss the original PR #${PR_NUMBER}. 
EOF LABELS=$( (gh pr --repo "${GITHUB_REPOSITORY}" view ${PR_NUMBER} --json labels --jq '.labels.[].name'; echo run-e2e-tests-in-draft )| \ grep -E '^run' | paste -sd , -) # LABELS is a comma-separated list; it is quoted below so it always reaches `gh` as one argument gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \ --body-file "body.md" \ --head "${BRANCH}" \ --base "main" \ --label "${LABELS}" \ --draft - name: Modify the existing pull request (if required) if: steps.get-pr.outputs.ALREADY_CREATED != '' env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} LABELS_TO_ADD: ${{ steps.get-labels.outputs.LABELS_TO_ADD }} LABELS_TO_REMOVE: ${{ steps.get-labels.outputs.LABELS_TO_REMOVE }} ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }} run: | ADD_CMD= REMOVE_CMD= [ -z "${LABELS_TO_ADD}" ] || ADD_CMD="--add-label ${LABELS_TO_ADD}" [ -z "${LABELS_TO_REMOVE}" ] || REMOVE_CMD="--remove-label ${LABELS_TO_REMOVE}" if [ -n "${ADD_CMD}" ] || [ -n "${REMOVE_CMD}" ]; then gh pr --repo "${GITHUB_REPOSITORY}" edit ${ALREADY_CREATED} ${ADD_CMD} ${REMOVE_CMD} fi - run: git push --force origin "${BRANCH}" if: steps.get-pr.outputs.ALREADY_CREATED != '' cleanup: # Close PRs and delete branches if the original PR is closed. 
permissions: contents: write # for `--delete-branch` flag in `gh pr close` pull-requests: write # for `gh pr close` if: | github.event.action == 'closed' && github.event.pull_request.head.repo.full_name != github.repository runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch run: | CLOSED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --json 'closed' --jq '.[].closed')" if [ "${CLOSED}" == "false" ]; then gh pr --repo "${GITHUB_REPOSITORY}" close "${BRANCH}" --delete-branch fi ================================================ FILE: .github/workflows/benchbase_tpcc.yml ================================================ name: TPC-C like benchmark using benchbase on: schedule: # * is a special character in YAML so you have to quote this string # ┌───────────── minute (0 - 59) # │ ┌───────────── hour (0 - 23) # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '0 6 * * *' # run once a day at 6 AM UTC workflow_dispatch: # adds ability to run this manually defaults: run: shell: bash -euxo pipefail {0} concurrency: # Allow only one workflow globally because we do not want to be too noisy in production environment group: benchbase-tpcc-workflow cancel-in-progress: false permissions: contents: read jobs: benchbase-tpcc: strategy: fail-fast: false # allow other variants to continue even if one fails matrix: include: - warehouses: 50 # defines number of warehouses and is used to compute number of terminals max_rate: 800 # measured max TPS at scale factor based on experiments. 
Adjust if performance is better/worse min_cu: 0.25 # simulate free tier plan (0.25 -2 CU) max_cu: 2 - warehouses: 500 # serverless plan (2-8 CU) max_rate: 2000 min_cu: 2 max_cu: 8 - warehouses: 1000 # business plan (2-16 CU) max_rate: 2900 min_cu: 2 max_cu: 16 max-parallel: 1 # we want to run each workload size sequentially to avoid noisy neighbors permissions: contents: write statuses: write id-token: write # aws-actions/configure-aws-credentials env: PG_CONFIG: /tmp/neon/pg_install/v17/bin/pg_config PSQL: /tmp/neon/pg_install/v17/bin/psql PG_17_LIB_PATH: /tmp/neon/pg_install/v17/lib POSTGRES_VERSION: 17 runs-on: [ self-hosted, us-east-2, x64 ] timeout-minutes: 1440 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials # necessary to download artefacts uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project-tpcc uses: ./.github/actions/neon-project-create with: region_id: aws-us-east-2 postgres_version: ${{ env.POSTGRES_VERSION }} compute_units: '[${{ matrix.min_cu }}, ${{ matrix.max_cu }}]' api_key: ${{ secrets.NEON_PRODUCTION_API_KEY_4_BENCHMARKS }} api_host: console.neon.tech # production (!) 
- name: Initialize Neon project env: BENCHMARK_TPCC_CONNSTR: ${{ steps.create-neon-project-tpcc.outputs.dsn }} PROJECT_ID: ${{ steps.create-neon-project-tpcc.outputs.project_id }} run: | echo "Initializing Neon project with project_id: ${PROJECT_ID}" export LD_LIBRARY_PATH=${PG_17_LIB_PATH} # Retry logic for psql connection with 1 minute sleep between attempts for attempt in {1..3}; do echo "Attempt ${attempt}/3: Creating extensions in Neon project" if ${PSQL} "${BENCHMARK_TPCC_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;"; then echo "Successfully created extensions" break else echo "Failed to create extensions on attempt ${attempt}" if [ ${attempt} -lt 3 ]; then echo "Waiting 60 seconds before retry..." sleep 60 else echo "All attempts failed, exiting" exit 1 fi fi done echo "BENCHMARK_TPCC_CONNSTR=${BENCHMARK_TPCC_CONNSTR}" >> $GITHUB_ENV - name: Generate BenchBase workload configuration env: WAREHOUSES: ${{ matrix.warehouses }} MAX_RATE: ${{ matrix.max_rate }} run: | echo "Generating BenchBase configs for warehouses: ${WAREHOUSES}, max_rate: ${MAX_RATE}" # Extract hostname and password from connection string # Format: postgresql://username:password@hostname/database?params (no port for Neon) HOSTNAME=$(echo "${BENCHMARK_TPCC_CONNSTR}" | sed -n 's|.*://[^:]*:[^@]*@\([^/]*\)/.*|\1|p') PASSWORD=$(echo "${BENCHMARK_TPCC_CONNSTR}" | sed -n 's|.*://[^:]*:\([^@]*\)@.*|\1|p') echo "Extracted hostname: ${HOSTNAME}" # Use runner temp (NVMe) as working directory cd "${RUNNER_TEMP}" # Copy the generator script cp "${GITHUB_WORKSPACE}/test_runner/performance/benchbase_tpc_c_helpers/generate_workload_size.py" . 
# Generate configs and scripts (hostname and password are quoted so word splitting/globbing cannot alter them) python3 generate_workload_size.py \ --warehouses ${WAREHOUSES} \ --max-rate ${MAX_RATE} \ --hostname "${HOSTNAME}" \ --password "${PASSWORD}" \ --runner-arch ${{ runner.arch }} # Fix path mismatch: move generated configs and scripts to expected locations mv ../configs ./configs mv ../scripts ./scripts - name: Prepare database (load data) env: WAREHOUSES: ${{ matrix.warehouses }} run: | cd "${RUNNER_TEMP}" echo "Loading ${WAREHOUSES} warehouses into database..." # Run the loader script and capture output to log file while preserving stdout/stderr ./scripts/load_${WAREHOUSES}_warehouses.sh 2>&1 | tee "load_${WAREHOUSES}_warehouses.log" echo "Database loading completed" - name: Run TPC-C benchmark (warmup phase, then benchmark at 70% of configuredmax TPS) env: WAREHOUSES: ${{ matrix.warehouses }} run: | cd "${RUNNER_TEMP}" echo "Running TPC-C benchmark with ${WAREHOUSES} warehouses..." # Run the optimal rate benchmark ./scripts/execute_${WAREHOUSES}_warehouses_opt_rate.sh echo "Benchmark execution completed" - name: Run TPC-C benchmark (warmup phase, then ramp down TPS and up again in 5 minute intervals) env: WAREHOUSES: ${{ matrix.warehouses }} run: | cd "${RUNNER_TEMP}" echo "Running TPC-C ramp-down-up with ${WAREHOUSES} warehouses..." # Run the ramp-down/ramp-up benchmark ./scripts/execute_${WAREHOUSES}_warehouses_ramp_up.sh echo "Benchmark execution completed" - name: Process results (upload to test results database and generate diagrams) env: WAREHOUSES: ${{ matrix.warehouses }} MIN_CU: ${{ matrix.min_cu }} MAX_CU: ${{ matrix.max_cu }} PROJECT_ID: ${{ steps.create-neon-project-tpcc.outputs.project_id }} REVISION: ${{ github.sha }} PERF_DB_CONNSTR: ${{ secrets.PERF_TEST_RESULT_CONNSTR }} run: | cd "${RUNNER_TEMP}" echo "Creating temporary Python environment for results processing..." 
# Create temporary virtual environment python3 -m venv temp_results_env source temp_results_env/bin/activate # Install required packages in virtual environment pip install matplotlib pandas psycopg2-binary echo "Copying results processing scripts..." # Copy both processing scripts cp "${GITHUB_WORKSPACE}/test_runner/performance/benchbase_tpc_c_helpers/generate_diagrams.py" . cp "${GITHUB_WORKSPACE}/test_runner/performance/benchbase_tpc_c_helpers/upload_results_to_perf_test_results.py" . echo "Processing load phase metrics..." # Find and process load log LOAD_LOG=$(find . -name "load_${WAREHOUSES}_warehouses.log" -type f | head -1) if [ -n "$LOAD_LOG" ]; then echo "Processing load metrics from: $LOAD_LOG" python upload_results_to_perf_test_results.py \ --load-log "$LOAD_LOG" \ --run-type "load" \ --warehouses "${WAREHOUSES}" \ --min-cu "${MIN_CU}" \ --max-cu "${MAX_CU}" \ --project-id "${PROJECT_ID}" \ --revision "${REVISION}" \ --connection-string "${PERF_DB_CONNSTR}" else echo "Warning: Load log file not found: load_${WAREHOUSES}_warehouses.log" fi echo "Processing warmup results for optimal rate..." 
# Find and process warmup results WARMUP_CSV=$(find results_warmup -name "*.results.csv" -type f | head -1) WARMUP_JSON=$(find results_warmup -name "*.summary.json" -type f | head -1) if [ -n "$WARMUP_CSV" ] && [ -n "$WARMUP_JSON" ]; then echo "Generating warmup diagram from: $WARMUP_CSV" python generate_diagrams.py \ --input-csv "$WARMUP_CSV" \ --output-svg "warmup_${WAREHOUSES}_warehouses_performance.svg" \ --title-suffix "Warmup at max TPS" echo "Uploading warmup metrics from: $WARMUP_JSON" python upload_results_to_perf_test_results.py \ --summary-json "$WARMUP_JSON" \ --results-csv "$WARMUP_CSV" \ --run-type "warmup" \ --min-cu "${MIN_CU}" \ --max-cu "${MAX_CU}" \ --project-id "${PROJECT_ID}" \ --revision "${REVISION}" \ --connection-string "${PERF_DB_CONNSTR}" else echo "Warning: Missing warmup results files (CSV: $WARMUP_CSV, JSON: $WARMUP_JSON)" fi echo "Processing optimal rate results..." # Find and process optimal rate results OPTRATE_CSV=$(find results_opt_rate -name "*.results.csv" -type f | head -1) OPTRATE_JSON=$(find results_opt_rate -name "*.summary.json" -type f | head -1) if [ -n "$OPTRATE_CSV" ] && [ -n "$OPTRATE_JSON" ]; then echo "Generating optimal rate diagram from: $OPTRATE_CSV" python generate_diagrams.py \ --input-csv "$OPTRATE_CSV" \ --output-svg "benchmark_${WAREHOUSES}_warehouses_performance.svg" \ --title-suffix "70% of max TPS" echo "Uploading optimal rate metrics from: $OPTRATE_JSON" python upload_results_to_perf_test_results.py \ --summary-json "$OPTRATE_JSON" \ --results-csv "$OPTRATE_CSV" \ --run-type "opt-rate" \ --min-cu "${MIN_CU}" \ --max-cu "${MAX_CU}" \ --project-id "${PROJECT_ID}" \ --revision "${REVISION}" \ --connection-string "${PERF_DB_CONNSTR}" else echo "Warning: Missing optimal rate results files (CSV: $OPTRATE_CSV, JSON: $OPTRATE_JSON)" fi echo "Processing warmup 2 results for ramp down/up phase..." 
# Find and process warmup results WARMUP_CSV=$(find results_warmup -name "*.results.csv" -type f | tail -1) WARMUP_JSON=$(find results_warmup -name "*.summary.json" -type f | tail -1) if [ -n "$WARMUP_CSV" ] && [ -n "$WARMUP_JSON" ]; then echo "Generating warmup diagram from: $WARMUP_CSV" python generate_diagrams.py \ --input-csv "$WARMUP_CSV" \ --output-svg "warmup_2_${WAREHOUSES}_warehouses_performance.svg" \ --title-suffix "Warmup at max TPS" echo "Uploading warmup metrics from: $WARMUP_JSON" python upload_results_to_perf_test_results.py \ --summary-json "$WARMUP_JSON" \ --results-csv "$WARMUP_CSV" \ --run-type "warmup" \ --min-cu "${MIN_CU}" \ --max-cu "${MAX_CU}" \ --project-id "${PROJECT_ID}" \ --revision "${REVISION}" \ --connection-string "${PERF_DB_CONNSTR}" else echo "Warning: Missing warmup results files (CSV: $WARMUP_CSV, JSON: $WARMUP_JSON)" fi echo "Processing ramp results..." # Find and process ramp results RAMPUP_CSV=$(find results_ramp_up -name "*.results.csv" -type f | head -1) RAMPUP_JSON=$(find results_ramp_up -name "*.summary.json" -type f | head -1) if [ -n "$RAMPUP_CSV" ] && [ -n "$RAMPUP_JSON" ]; then echo "Generating ramp diagram from: $RAMPUP_CSV" python generate_diagrams.py \ --input-csv "$RAMPUP_CSV" \ --output-svg "ramp_${WAREHOUSES}_warehouses_performance.svg" \ --title-suffix "ramp TPS down and up in 5 minute intervals" echo "Uploading ramp metrics from: $RAMPUP_JSON" python upload_results_to_perf_test_results.py \ --summary-json "$RAMPUP_JSON" \ --results-csv "$RAMPUP_CSV" \ --run-type "ramp-up" \ --min-cu "${MIN_CU}" \ --max-cu "${MAX_CU}" \ --project-id "${PROJECT_ID}" \ --revision "${REVISION}" \ --connection-string "${PERF_DB_CONNSTR}" else echo "Warning: Missing ramp results files (CSV: $RAMPUP_CSV, JSON: $RAMPUP_JSON)" fi # Deactivate and clean up virtual environment deactivate rm -rf temp_results_env rm upload_results_to_perf_test_results.py echo "Results processing completed and environment cleaned up" - name: Set date for 
upload id: set-date run: echo "date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT - name: Configure AWS credentials # necessary to upload results uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: us-east-2 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 900 # 900 is minimum value - name: Upload benchmark results to S3 env: S3_BUCKET: neon-public-benchmark-results S3_PREFIX: benchbase-tpc-c/${{ steps.set-date.outputs.date }}/${{ github.run_id }}/${{ matrix.warehouses }}-warehouses run: | echo "Redacting passwords from configuration files before upload..." # Mask all passwords in XML config files: only the text inside <password>...</password> elements is replaced, everything else in the file is left untouched find "${RUNNER_TEMP}/configs" -name "*.xml" -type f -exec sed -i 's|<password>[^<]*</password>|<password>redacted</password>|g' {} \; echo "Uploading benchmark results to s3://${S3_BUCKET}/${S3_PREFIX}/" # Upload the entire benchmark directory recursively (destination quoted so the URL is always passed as a single argument) aws s3 cp --only-show-errors --recursive "${RUNNER_TEMP}" "s3://${S3_BUCKET}/${S3_PREFIX}/" echo "Upload completed" - name: Delete Neon Project if: ${{ always() }} uses: ./.github/actions/neon-project-delete with: project_id: ${{ steps.create-neon-project-tpcc.outputs.project_id }} api_key: ${{ secrets.NEON_PRODUCTION_API_KEY_4_BENCHMARKS }} api_host: console.neon.tech # production (!) ================================================ FILE: .github/workflows/benchmarking.yml ================================================ name: Benchmarking on: # uncomment to run on push for debugging your PR # push: # branches: [ your branch ] schedule: # * is a special character in YAML so you have to quote this string # ┌───────────── minute (0 - 59) # │ ┌───────────── hour (0 - 23) # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '0 3 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually inputs: region_id: description: 'Project region id. 
If not set, the default region will be used' required: false default: 'aws-us-east-2' save_perf_report: type: boolean description: 'Publish perf report. If not set, the report will be published only for the main branch' required: false collect_olap_explain: type: boolean description: 'Collect EXPLAIN ANALYZE for OLAP queries. If not set, EXPLAIN ANALYZE will not be collected' required: false default: false collect_pg_stat_statements: type: boolean description: 'Collect pg_stat_statements for OLAP queries. If not set, pg_stat_statements will not be collected' required: false default: false run_AWS_RDS_AND_AURORA: type: boolean description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch' required: false default: false run_only_pgvector_tests: type: boolean description: 'Run pgvector tests but no other tests. If not set, all tests including pgvector tests will be run' required: false default: false defaults: run: shell: bash -euxo pipefail {0} concurrency: # Allow only one workflow per any non-`main` branch. 
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} cancel-in-progress: true jobs: cleanup: # Deletes projects in ORG_ID that match SEARCH and whose compute was last active more than five days ago runs-on: [ self-hosted, us-east-2, x64 ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init env: ORG_ID: org-solitary-dew-09443886 LIMIT: 100 # passed as limit= to the single project-list request below SEARCH: "GITHUB_RUN_ID=" BASE_URL: https://console-stage.neon.build/api/v2 DRY_RUN: "false" # Set to "true" to just test out the workflow steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Cleanup inactive Neon projects left over from prior runs env: API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} run: | set -euo pipefail # Cutoff: Unix timestamp five days (5 * 86400 seconds) before now NOW=$(date -u +%s) DAYS_AGO=$((NOW - 5 * 86400)) # The search term is URL-encoded with jq's @uri filter before being placed in the query string REQUEST_URL="$BASE_URL/projects?limit=$LIMIT&search=$(printf '%s' "$SEARCH" | jq -sRr @uri)&org_id=$ORG_ID" echo "Requesting project list from:" echo "$REQUEST_URL" response=$(curl -s -X GET "$REQUEST_URL" \ --header "Accept: application/json" \ --header "Content-Type: application/json" \ --header "Authorization: Bearer ${API_KEY}" ) echo "Response:" echo "$response" | jq . # Keep only projects whose compute_last_active_at is set and earlier than the cutoff projects_to_delete=$(echo "$response" | jq --argjson cutoff "$DAYS_AGO" ' .projects[] | select(.compute_last_active_at != null) | select((.compute_last_active_at | fromdateiso8601) < $cutoff) | {id, name, compute_last_active_at} ') if [ -z "$projects_to_delete" ]; then echo "No projects eligible for deletion." 
exit 0 fi echo "Projects that will be deleted:" echo "$projects_to_delete" | jq -r '.id' # Issue one DELETE request per selected project id, unless DRY_RUN is enabled if [ "$DRY_RUN" = "false" ]; then echo "$projects_to_delete" | jq -r '.id' | while read -r project_id; do echo "Deleting project: $project_id" curl -s -X DELETE "$BASE_URL/projects/$project_id" \ --header "Accept: application/json" \ --header "Content-Type: application/json" \ --header "Authorization: Bearer ${API_KEY}" done else echo "Dry run enabled — no projects were deleted." fi bench: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} permissions: contents: write statuses: write id-token: write # aws-actions/configure-aws-credentials strategy: fail-fast: false matrix: include: - PG_VERSION: 16 PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} RUNNER: [ self-hosted, us-east-2, x64 ] - PG_VERSION: 17 PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} RUNNER: [ self-hosted, us-east-2, x64 ] - PG_VERSION: 16 PLATFORM: "azure-staging" region_id: 'azure-eastus2' RUNNER: [ self-hosted, eastus2, x64 ] env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install PG_VERSION: ${{ matrix.PG_VERSION }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.PLATFORM }} runs-on: ${{ matrix.RUNNER }} container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials # necessary on Azure runners 
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project uses: ./.github/actions/neon-project-create with: region_id: ${{ matrix.region_id }} postgres_version: ${{ env.PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Run benchmark uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} pg_version: ${{ env.PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # Set the --sparse-ordering option of the pytest-order plugin # to ensure tests run in the order in which they appear in the file. 
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests extra_params: -m remote_cluster --sparse-ordering --timeout 14400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py --ignore test_runner/performance/test_logical_replication.py --ignore test_runner/performance/test_physical_replication.py --ignore test_runner/performance/test_perf_ingest_using_pgcopydb.py --ignore test_runner/performance/test_cumulative_statistics_persistence.py --ignore test_runner/performance/test_perf_many_relations.py --ignore test_runner/performance/test_perf_oltp_large_tenant.py --ignore test_runner/performance/test_lfc_prewarm.py env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - name: Delete Neon Project if: ${{ always() }} uses: ./.github/actions/neon-project-delete with: project_id: ${{ steps.create-neon-project.outputs.project_id }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Create Allure report id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic perf testing: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} cumstats-test: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} permissions: contents: write statuses: write id-token: write # 
aws-actions/configure-aws-credentials env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install DEFAULT_PG_VERSION: 17 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: "neon-staging" runs-on: [ self-hosted, us-east-2, x64 ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Verify that cumulative statistics are preserved uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance/test_cumulative_statistics_persistence.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 3600 pg_version: ${{ env.DEFAULT_PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} replication-tests: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} permissions: contents: write statuses: write id-token: 
write # aws-actions/configure-aws-credentials env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: "neon-staging" runs-on: [ self-hosted, us-east-2, x64 ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Run Logical Replication benchmarks uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance/test_logical_replication.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 pg_version: ${{ env.DEFAULT_PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }} BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }} - name: Run Physical Replication benchmarks uses: ./.github/actions/run-python-test-set 
with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance/test_physical_replication.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 pg_version: ${{ env.DEFAULT_PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} - name: Create Allure report id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} # Post both success and failure to the Slack channel - name: Post to a Slack channel if: ${{ github.event.schedule && !cancelled() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream slack-message: | Periodic replication testing: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} prewarm-test: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} permissions: contents: write statuses: write id-token: write # aws-actions/configure-aws-credentials env: PROJECT_ID: ${{ vars.PREWARM_PROJECT_ID }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install DEFAULT_PG_VERSION: 17 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: "neon-staging" runs-on: [ self-hosted, us-east-2, x64 ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm 
credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Run prewarm benchmark uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance/test_lfc_prewarm.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 pg_version: ${{ env.DEFAULT_PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} - name: Create Allure report id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} generate-matrices: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) # # Available platforms: # - neonvm-captest-new: Freshly created project (1 CU) # - 
neonvm-captest-freetier: Use freetier-sized compute (0.25 CU) # - neonvm-captest-new-many-tables: Freshly created project (1 CU) with many relations created before the run # - neonvm-azure-captest-new: Freshly created project (1 CU) in azure region # - neonvm-azure-captest-freetier: Use freetier-sized compute (0.25 CU) in azure region # - neonvm-captest-reuse: Reusing existing project # - neonvm-captest-sharding-reuse: Reusing existing project (connection string from BENCHMARK_CAPTEST_SHARDING_CONNSTR) # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage env: RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }} DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} runs-on: ubuntu-22.04 outputs: pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }} olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }} tpch-compare-matrix: ${{ steps.tpch-compare-matrix.outputs.matrix }} steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Generate matrix for pgbench benchmark id: pgbench-compare-matrix run: | # Read the region from the job-level environment variable instead of expanding the workflow_dispatch input directly into the script text region_id_default="${DEFAULT_REGION_ID}" runner_default='["self-hosted", "us-east-2", "x64"]' runner_azure='["self-hosted", "eastus2", "x64"]' image_default="ghcr.io/neondatabase/build-tools:pinned-bookworm" # NOTE(review): "neonvm-captest-new" is listed twice in the base "platform" list below - confirm that this is intentional matrix='{ "pg_version" : [ 16 ], "region_id" : [ "'"$region_id_default"'" ], "platform": [ "neonvm-captest-new", "neonvm-captest-reuse", "neonvm-captest-new" ], "db_size": [ "10gb" ], "runner": ['"$runner_default"'], "image": [ "'"$image_default"'" ], "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": 
"'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' if [ "$(date +%A)" = "Saturday" ] || [ "${RUN_AWS_RDS_AND_AURORA}" = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "region_id": 
"'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> "$GITHUB_OUTPUT" - name: Generate matrix for OLAP benchmarks id: olap-compare-matrix run: | matrix='{ "platform": [ "neonvm-captest-reuse" ], "pg_version" : [ 16,17 ] }' if [ "$(date +%A)" = "Saturday" ] || [ "${RUN_AWS_RDS_AND_AURORA}" = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "platform": "rds-postgres" }, { "pg_version": 16, "platform": "rds-aurora" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> "$GITHUB_OUTPUT" - name: Generate matrix for TPC-H benchmarks id: tpch-compare-matrix run: | matrix='{ "platform": [ "neonvm-captest-reuse" ], "pg_version" : [ 16,17 ] }' if [ "$(date +%A)" = "Saturday" ] || [ "${RUN_AWS_RDS_AND_AURORA}" = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "platform": "rds-postgres" }, { "pg_version": 16, "platform": "rds-aurora" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> "$GITHUB_OUTPUT" prepare_AWS_RDS_databases: uses: ./.github/workflows/_benchmarking_preparation.yml secrets: inherit pgbench-compare: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} needs: [ generate-matrices, prepare_AWS_RDS_databases ] permissions: contents: write statuses: write id-token: write # aws-actions/configure-aws-credentials strategy: fail-fast: false matrix: ${{fromJSON(needs.generate-matrices.outputs.pgbench-compare-matrix)}} env: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: 
/tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} runs-on: ${{ matrix.runner }} container: image: ${{ matrix.image }} credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init # Increase timeout to 8h, default timeout is 6h timeout-minutes: 480 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project if: contains(fromJSON('["neonvm-captest-new", "neonvm-captest-new-many-tables", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: region_id: ${{ matrix.region_id }} postgres_version: ${{ env.PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }} - name: Set up Connection String id: set-up-connstr run: | case "${PLATFORM}" in neonvm-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; neonvm-captest-sharding-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} ;; neonvm-captest-new | neonvm-captest-new-many-tables | neonvm-captest-freetier | neonvm-azure-captest-new | 
neonvm-azure-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }} ;; rds-postgres) CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} ;; *) echo >&2 "Unknown PLATFORM=${PLATFORM}" exit 1 ;; esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB # without (neonvm-captest-new) # and with (neonvm-captest-new-many-tables) many relations in the database - name: Create many relations before the run if: contains(fromJSON('["neonvm-captest-new-many-tables"]'), matrix.platform) uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_perf_many_relations pg_version: ${{ env.PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_NUM_RELATIONS: 10000 - name: Benchmark init uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init pg_version: ${{ env.PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - name: Benchmark simple-update uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m 
remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update pg_version: ${{ env.PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - name: Benchmark select-only uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only pg_version: ${{ env.PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - name: Delete Neon Project if: ${{ steps.create-neon-project.outputs.project_id && always() }} uses: ./.github/actions/neon-project-delete with: project_id: ${{ steps.create-neon-project.outputs.project_id }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Create Allure report id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic perf testing on ${{ matrix.platform }}: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} pgbench-pgvector: permissions: contents: write statuses: write id-token: write # 
aws-actions/configure-aws-credentials strategy: fail-fast: false matrix: include: - PLATFORM: "neonvm-captest-pgvector" RUNNER: [ self-hosted, us-east-2, x64 ] postgres_version: 16 - PLATFORM: "neonvm-captest-pgvector-pg17" RUNNER: [ self-hosted, us-east-2, x64 ] postgres_version: 17 - PLATFORM: "azure-captest-pgvector" RUNNER: [ self-hosted, eastus2, x64 ] postgres_version: 16 env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" TEST_PG_BENCH_SCALES_MATRIX: "1" POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install PG_VERSION: ${{ matrix.postgres_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.PLATFORM }} runs-on: ${{ matrix.RUNNER }} container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr run: | case "${PLATFORM}" in neonvm-captest-pgvector) CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} ;; neonvm-captest-pgvector-pg17) CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_PG17 }} ;; azure-captest-pgvector) CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }} ;; *) echo >&2 "Unknown 
PLATFORM=${PLATFORM}" exit 1 ;; esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - name: Benchmark pgvector hnsw indexing uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance/test_perf_olap.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing pg_version: ${{ env.PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - name: Benchmark pgvector queries uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance/test_perf_pgvector_queries.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 pg_version: ${{ env.PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - name: Create Allure report id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic perf testing on ${{ env.PLATFORM }}: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} clickbench-compare: # ClickBench DB
for rds-aurora and rds-Postgres deployed to the same clusters # we use for performance testing in pgbench-compare. # Run this job only when pgbench-compare is finished to avoid the intersection. # We might change it after https://github.com/neondatabase/neon/issues/2900. # # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} permissions: contents: write statuses: write id-token: write # aws-actions/configure-aws-credentials needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ] strategy: fail-fast: false matrix: ${{ fromJSON(needs.generate-matrices.outputs.olap-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }} TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }} BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} runs-on: [ self-hosted, us-east-2, x64 ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init # Increase timeout to 12h, default timeout is 6h # we have regression in clickbench causing it to run 2-3x longer timeout-minutes: 720 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr run: | case "${PLATFORM}" in neonvm-captest-reuse) case "${PG_VERSION}" in 16) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }} ;; 17) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_CONNSTR_PG17 }} ;; *) echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" exit 1 ;; esac ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CLICKBENCH_10M_CONNSTR }} ;; rds-postgres) CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }} ;; *) echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance/test_perf_olap.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 43200 -k test_clickbench pg_version: ${{ env.PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain || 'false' }} TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements || 'false' }} BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} TEST_OLAP_SCALE: 10 - name: Create Allure report id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to 
a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic OLAP perf testing on ${{ matrix.platform }}: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} tpch-compare: # TPC-H DB for rds-aurora and rds-Postgres deployed to the same clusters # we use for performance testing in pgbench-compare & clickbench-compare. # Run this job only when clickbench-compare is finished to avoid the intersection. # We might change it after https://github.com/neondatabase/neon/issues/2900. # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} permissions: contents: write statuses: write id-token: write # aws-actions/configure-aws-credentials needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ] strategy: fail-fast: false matrix: ${{ fromJSON(needs.generate-matrices.outputs.tpch-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} runs-on: [ self-hosted, us-east-2, x64 ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses:
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Get Connstring Secret Name run: | case "${PLATFORM}" in neonvm-captest-reuse) case "${PG_VERSION}" in 16) CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_S10_CONNSTR" ;; 17) CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_CONNSTR_PG17" ;; *) echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" exit 1 ;; esac ;; rds-aurora) CONNSTR_SECRET_NAME="BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR" ;; rds-postgres) CONNSTR_SECRET_NAME="BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR" ;; *) echo >&2 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV - name: Set up Connection String id: set-up-connstr run: | CONNSTR=${{ secrets[env.CONNSTR_SECRET_NAME] }} echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - name: Run TPC-H benchmark uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance/test_perf_olap.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_tpch pg_version: ${{ env.PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} TEST_OLAP_SCALE: 10 - name: Create Allure report id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} user-examples-compare: # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} permissions: contents: write statuses: write id-token: write # aws-actions/configure-aws-credentials needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ] strategy: fail-fast: false matrix: ${{ 
fromJSON(needs.generate-matrices.outputs.olap-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} runs-on: [ self-hosted, us-east-2, x64 ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr run: | case "${PLATFORM}" in neonvm-captest-reuse) case "${PG_VERSION}" in 16) CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} ;; 17) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_USER_EXAMPLE_CONNSTR_PG17 }} ;; *) echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" exit 1 ;; esac ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_AURORA_CONNSTR }} ;; rds-postgres) CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }} ;; *) echo >&2 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - name: Run user examples uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance/test_perf_olap.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_user_examples pg_version: ${{ env.PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - name: Create Allure report id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic user examples perf testing on ${{ matrix.platform }}: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} ================================================ FILE: .github/workflows/build-build-tools-image.yml ================================================ name: Build build-tools image on: workflow_call: inputs: archs: description: "Json array of architectures to build" # Default values are set in `check-image` job, `set-variables` step type: string required: false debians: description: "Json array of Debian versions to build" # Default values are set in `check-image` job, `set-variables` step type: string required: false outputs: image-tag: description: "build-tools tag"
value: ${{ jobs.check-image.outputs.tag }} image: description: "build-tools image" value: ghcr.io/neondatabase/build-tools:${{ jobs.check-image.outputs.tag }} defaults: run: shell: bash -euo pipefail {0} # The initial idea was to prevent the waste of resources by not re-building the `build-tools` image # for the same tag in parallel workflow runs, and queue them to be skipped once we have # the first image pushed to Docker registry, but GitHub's concurrency mechanism is not working as expected. # GitHub can't have more than 1 job in a queue and removes the previous one, it causes failures in the dependent jobs. # # Ref https://github.com/orgs/community/discussions/41518 # # concurrency: # group: build-build-tools-image-${{ inputs.image-tag }} # cancel-in-progress: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} jobs: check-image: runs-on: ubuntu-22.04 outputs: archs: ${{ steps.set-variables.outputs.archs }} debians: ${{ steps.set-variables.outputs.debians }} tag: ${{ steps.set-variables.outputs.image-tag }} everything: ${{ steps.set-more-variables.outputs.everything }} found: ${{ steps.set-more-variables.outputs.found }} permissions: packages: read steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Set variables id: set-variables env: ARCHS: ${{ inputs.archs || '["x64","arm64"]' }} DEBIANS: ${{ inputs.debians || '["bullseye","bookworm"]' }} IMAGE_TAG: | ${{ hashFiles('build-tools/Dockerfile', '.github/workflows/build-build-tools-image.yml') }} run: | echo "archs=${ARCHS}" | tee -a ${GITHUB_OUTPUT} echo
"debians=${DEBIANS}" | tee -a ${GITHUB_OUTPUT} echo "image-tag=${IMAGE_TAG}" | tee -a ${GITHUB_OUTPUT} - name: Set more variables id: set-more-variables env: IMAGE_TAG: ${{ steps.set-variables.outputs.image-tag }} EVERYTHING: | ${{ contains(fromJSON(steps.set-variables.outputs.archs), 'x64') && contains(fromJSON(steps.set-variables.outputs.archs), 'arm64') && contains(fromJSON(steps.set-variables.outputs.debians), 'bullseye') && contains(fromJSON(steps.set-variables.outputs.debians), 'bookworm') }} run: | if docker manifest inspect ghcr.io/neondatabase/build-tools:${IMAGE_TAG}; then found=true else found=false fi echo "everything=${EVERYTHING}" | tee -a ${GITHUB_OUTPUT} echo "found=${found}" | tee -a ${GITHUB_OUTPUT} build-image: needs: [ check-image ] if: needs.check-image.outputs.found == 'false' strategy: matrix: arch: ${{ fromJSON(needs.check-image.outputs.archs) }} debian: ${{ fromJSON(needs.check-image.outputs.debians) }} permissions: packages: write runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 with: cache-binary: false - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: cache.neon.build 
username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} - uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0 with: file: build-tools/Dockerfile context: . attests: | type=provenance,mode=max type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1 push: true pull: true build-args: | DEBIAN_VERSION=${{ matrix.debian }} cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0}-{1},mode=max', matrix.debian, matrix.arch) || '' }} tags: | ghcr.io/neondatabase/build-tools:${{ needs.check-image.outputs.tag }}-${{ matrix.debian }}-${{ matrix.arch }} merge-images: needs: [ check-image, build-image ] runs-on: ubuntu-22.04 permissions: packages: write steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Create multi-arch image env: DEFAULT_DEBIAN_VERSION: bookworm ARCHS: ${{ join(fromJSON(needs.check-image.outputs.archs), ' ') }} DEBIANS: ${{ join(fromJSON(needs.check-image.outputs.debians), ' ') }} EVERYTHING: ${{ needs.check-image.outputs.everything }} IMAGE_TAG: ${{ needs.check-image.outputs.tag }} run: | for debian in ${DEBIANS}; do tags=("-t" "ghcr.io/neondatabase/build-tools:${IMAGE_TAG}-${debian}") if [ "${EVERYTHING}" == "true" ] && [ "${debian}" == "${DEFAULT_DEBIAN_VERSION}" ]; then tags+=("-t" "ghcr.io/neondatabase/build-tools:${IMAGE_TAG}") fi for arch in 
${ARCHS}; do tags+=("ghcr.io/neondatabase/build-tools:${IMAGE_TAG}-${debian}-${arch}") done docker buildx imagetools create "${tags[@]}" done ================================================ FILE: .github/workflows/build-macos.yml ================================================ name: Check neon with MacOS builds on: workflow_call: inputs: pg_versions: description: "Array of the pg versions to build for, for example: ['v14', 'v17']" type: string default: '[]' required: false rebuild_rust_code: description: "Rebuild Rust code" type: boolean default: false required: false rebuild_everything: description: "If true, rebuild for all versions" type: boolean default: false required: false env: RUST_BACKTRACE: 1 COPT: '-Werror' # TODO: move `check-*` and `files-changed` jobs to the "Caller" Workflow # We should care about that as Github has limitations: # - You can connect up to four levels of workflows # - You can call a maximum of 20 unique reusable workflows from a single workflow file. # https://docs.github.com/en/actions/sharing-automations/reusing-workflows#limitations permissions: contents: read jobs: make-all: if: | inputs.pg_versions != '[]' || inputs.rebuild_rust_code || inputs.rebuild_everything || contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' timeout-minutes: 60 runs-on: macos-15 env: # Use release build only, to have less debug info around # Hence keeping target/ (and general cache size) smaller BUILD_TYPE: release steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Checkout main repo uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true - uses: ./.github/actions/prepare-for-subzero with: token: ${{ secrets.CI_ACCESS_TOKEN }} - name: Install build dependencies 
run: | brew install flex bison openssl protobuf icu4c - name: Set extra env for macOS run: | echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - name: Restore "pg_install/" cache id: cache_pg uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 with: path: pg_install key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-install-v14-${{ hashFiles('Makefile', 'postgres.mk', 'vendor/revisions.json') }} - name: Checkout vendor/postgres submodules if: steps.cache_pg.outputs.cache-hit != 'true' run: | git submodule init git submodule update --depth 1 --recursive - name: Build Postgres if: steps.cache_pg.outputs.cache-hit != 'true' run: | make postgres -j$(sysctl -n hw.ncpu) # This isn't strictly necessary, but it makes the cached and non-cached builds more similar, # When pg_install is restored from cache, there is no 'build/' directory. By removing it # in a non-cached build too, we enforce that the rest of the steps don't depend on it, # so that we notice any build caching bugs earlier. - name: Remove build artifacts if: steps.cache_pg.outputs.cache-hit != 'true' run: | rm -rf build # Explicitly update the rust toolchain before running 'make'. The parallel make build can # invoke 'cargo build' more than once in parallel, for different crates. That's OK, 'cargo' # does its own locking to prevent concurrent builds from stepping on each other's # toes. However, it will first try to update the toolchain, and that step is not locked the # same way. To avoid two toolchain updates running in parallel and stepping on each other's # toes, ensure that the toolchain is up-to-date beforehand. 
- name: Update rust toolchain run: | rustup --version && rustup update && rustup show - name: Cache cargo deps uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 with: path: | ~/.cargo/registry !~/.cargo/registry/src ~/.cargo/git target key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust # Build the neon-specific postgres extensions, and all the Rust bits. # # Pass PG_INSTALL_CACHED=1 because PostgreSQL was already built and cached # separately. - name: Build all run: PG_INSTALL_CACHED=1 BUILD_TYPE=release make -j$(sysctl -n hw.ncpu) all - name: Check that no warnings are produced run: ./run_clippy.sh ================================================ FILE: .github/workflows/build_and_run_selected_test.yml ================================================ name: Build and Run Selected Test on: workflow_dispatch: inputs: test-selection: description: 'Specification of selected test(s), as accepted by pytest -k' required: true type: string run-count: description: 'Number of test runs to perform' required: true type: number archs: description: 'Archs to run tests on, e. g.: ["x64", "arm64"]' default: '["x64"]' required: true type: string build-types: description: 'Build types to run tests on, e. 
g.: ["debug", "release"]' default: '["release"]' required: true type: string pg-versions: description: 'Postgres versions to use for testing, e.g,: [{"pg_version":"v16"}, {"pg_version":"v17"}])' default: '[{"pg_version":"v17"}]' required: true type: string defaults: run: shell: bash -euxo pipefail {0} env: RUST_BACKTRACE: 1 COPT: '-Werror' jobs: meta: uses: ./.github/workflows/_meta.yml with: github-event-name: ${{ github.event_name }} github-event-json: ${{ toJSON(github.event) }} build-and-test-locally: needs: [ meta ] strategy: fail-fast: false matrix: arch: ${{ fromJson(inputs.archs) }} build-type: ${{ fromJson(inputs.build-types) }} uses: ./.github/workflows/_build-and-test-locally.yml with: arch: ${{ matrix.arch }} build-tools-image: ghcr.io/neondatabase/build-tools:pinned-bookworm build-tag: ${{ needs.meta.outputs.build-tag }} build-type: ${{ matrix.build-type }} test-cfg: ${{ inputs.pg-versions }} test-selection: ${{ inputs.test-selection }} test-run-count: ${{ fromJson(inputs.run-count) }} rerun-failed: false secrets: inherit create-test-report: needs: [ build-and-test-locally ] if: ${{ !cancelled() }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write pull-requests: write outputs: report-url: ${{ steps.create-allure-report.outputs.report-url }} runs-on: [ self-hosted, small ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Create Allure report if: ${{ !cancelled() }} id: create-allure-report uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: 
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_DEV }} - uses: actions/github-script@v7 if: ${{ !cancelled() }} with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 script: | const report = { reportUrl: "${{ steps.create-allure-report.outputs.report-url }}", reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}", } const coverage = {} const script = require("./scripts/comment-test-report.js") await script({ github, context, fetch, report, coverage, }) ================================================ FILE: .github/workflows/build_and_test.yml ================================================ name: Build and Test on: push: branches: - main - release - release-proxy - release-compute pull_request: defaults: run: shell: bash -euxo pipefail {0} concurrency: # Allow only one workflow per any non-`main` branch. group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} cancel-in-progress: true env: RUST_BACKTRACE: 1 COPT: '-Werror' # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} jobs: check-permissions: if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} uses: ./.github/workflows/check-permissions.yml with: github-event-name: ${{ github.event_name }} cancel-previous-e2e-tests: needs: [ check-permissions ] if: github.event_name == 'pull_request' runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Cancel previous e2e-tests runs for this PR env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | gh workflow --repo neondatabase/cloud \ run 
cancel-previous-in-concurrency-group.yml \ --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" files-changed: needs: [ check-permissions ] runs-on: [ self-hosted, small ] timeout-minutes: 3 outputs: check-rust-dependencies: ${{ steps.files-changed.outputs.rust_dependencies }} steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true - name: Check for file changes uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 id: files-changed with: token: ${{ secrets.GITHUB_TOKEN }} filters: .github/file-filters.yaml meta: needs: [ check-permissions ] uses: ./.github/workflows/_meta.yml with: github-event-name: ${{ github.event_name }} github-event-json: ${{ toJSON(github.event) }} build-build-tools-image: needs: [ check-permissions ] uses: ./.github/workflows/build-build-tools-image.yml secrets: inherit lint-yamls: needs: [ meta, check-permissions, build-build-tools-image ] # We do need to run this in `.*-rc-pr` because of hotfixes. if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - run: make -C compute manifest-schema-validation - run: make lint-openapi-spec check-codestyle-python: needs: [ meta, check-permissions, build-build-tools-image ] # No need to run on `main` because we run this in the merge queue.
We do need to run this in `.*-rc-pr` because of hotfixes. if: ${{ contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_check-codestyle-python.yml with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm secrets: inherit check-codestyle-jsonnet: needs: [ meta, check-permissions, build-build-tools-image ] # We do need to run this in `.*-rc-pr` because of hotfixes. if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Check Jsonnet code formatting run: | make -C compute jsonnetfmt-test # Check that the vendor/postgres-* submodules point to the # corresponding REL_*_STABLE_neon branches. 
check-submodules: needs: [ check-permissions ] runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true - uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 id: check-if-submodules-changed with: filters: | vendor: - 'vendor/**' - name: Check vendor/postgres-v14 submodule reference if: steps.check-if-submodules-changed.outputs.vendor == 'true' uses: jtmullen/submodule-branch-check-action@ab0d3a69278e3fa0a2d4f3be3199d2514b676e13 # v1.3.0 with: path: "vendor/postgres-v14" fetch_depth: "50" sub_fetch_depth: "50" pass_if_unchanged: true - name: Check vendor/postgres-v15 submodule reference if: steps.check-if-submodules-changed.outputs.vendor == 'true' uses: jtmullen/submodule-branch-check-action@ab0d3a69278e3fa0a2d4f3be3199d2514b676e13 # v1.3.0 with: path: "vendor/postgres-v15" fetch_depth: "50" sub_fetch_depth: "50" pass_if_unchanged: true - name: Check vendor/postgres-v16 submodule reference if: steps.check-if-submodules-changed.outputs.vendor == 'true' uses: jtmullen/submodule-branch-check-action@ab0d3a69278e3fa0a2d4f3be3199d2514b676e13 # v1.3.0 with: path: "vendor/postgres-v16" fetch_depth: "50" sub_fetch_depth: "50" pass_if_unchanged: true - name: Check vendor/postgres-v17 submodule reference if: steps.check-if-submodules-changed.outputs.vendor == 'true' uses: jtmullen/submodule-branch-check-action@ab0d3a69278e3fa0a2d4f3be3199d2514b676e13 # v1.3.0 with: path: "vendor/postgres-v17" fetch_depth: "50" sub_fetch_depth: "50" pass_if_unchanged: true check-codestyle-rust: needs: [ meta, check-permissions, build-build-tools-image ] # No need to run on `main` because we do this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
if: ${{ contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_check-codestyle-rust.yml with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm archs: '["x64", "arm64"]' secrets: inherit check-dependencies-rust: needs: [ meta, files-changed, build-build-tools-image ] # No need to run on `main` because we do this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes. if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/cargo-deny.yml with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm secrets: inherit build-and-test-locally: needs: [ meta, build-build-tools-image ] # We do need to run this in `.*-rc-pr` because of hotfixes. if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} strategy: fail-fast: false matrix: arch: [ x64, arm64 ] # Do not build or run tests in debug for release branches build-type: ${{ fromJSON((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }} include: - build-type: release arch: arm64 uses: ./.github/workflows/_build-and-test-locally.yml with: arch: ${{ matrix.arch }} build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm build-tag: ${{ needs.meta.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds. # Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled.
test-cfg: | ${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "with-lfc"}, {"pg_version":"v15", "lfc_state": "with-lfc"}, {"pg_version":"v16", "lfc_state": "with-lfc"}, {"pg_version":"v17", "lfc_state": "with-lfc"}, {"pg_version":"v17", "lfc_state": "without-lfc"}]' || '[{"pg_version":"v17", "lfc_state": "without-lfc" }]' }} secrets: inherit # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking get-benchmarks-durations: if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') outputs: json: ${{ steps.get-benchmark-durations.outputs.json }} needs: [ check-permissions, build-build-tools-image ] runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Cache poetry deps uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 with: endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }} bucket: ${{ vars.HETZNER_CACHE_BUCKET }} accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }} secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }} use-fallback: false path: ~/.cache/pypoetry/virtualenvs key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync - name: get benchmark durations id: get-benchmark-durations env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} run: | poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" \ --days 10 \ --output /tmp/benchmark_durations.json echo "json=$(jq 
--compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT benchmarks: # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `deploy` in PRs if: github.ref_name == 'main' || (contains(github.event.pull_request.labels.*.name, 'run-benchmarks') && !failure() && !cancelled()) needs: [ check-permissions, build-build-tools-image, get-benchmarks-durations, deploy ] permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write pull-requests: write runs-on: [ self-hosted, unit-perf-aws-arm ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} # for changed limits, see comments on `options:` earlier in this file options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 --ulimit nofile=65536:65536 --security-opt seccomp=unconfined strategy: fail-fast: false matrix: # the amount of groups (N) should be reflected in `extra_params: --splits N ...` pytest_split_group: [ 1, 2, 3, 4, 5 ] build_type: [ release ] steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Pytest benchmarks uses: ./.github/actions/run-python-test-set with: build_type: ${{ matrix.build_type }} test_selection: performance run_in_parallel: false save_perf_report: ${{ github.ref_name == 'main' }} # test_pageserver_max_throughput_getpage_at_latest_lsn is run in separate workflow periodic_pagebench.yml because it needs snapshots extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} --ignore=test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }} pg_version: 
v16 aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring SYNC_BETWEEN_TESTS: true # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones report-benchmarks-results-to-slack: needs: [ benchmarks, create-test-report ] if: github.ref_name == 'main' && !cancelled() && contains(fromJSON('["success", "failure"]'), needs.benchmarks.result) runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 with: method: chat.postMessage token: ${{ secrets.SLACK_BOT_TOKEN }} payload: | channel: "${{ vars.SLACK_ON_CALL_STORAGE_STAGING_STREAM }}" text: | Benchmarks on main: *${{ needs.benchmarks.result }}* - <${{ needs.create-test-report.outputs.report-url }}|Allure report> - <${{ github.event.head_commit.url }}|${{ github.sha }}> create-test-report: needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write pull-requests: write outputs: report-url: ${{ steps.create-allure-report.outputs.report-url }} runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: 
step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Create Allure report if: ${{ !cancelled() }} id: create-allure-report uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 if: ${{ !cancelled() }} with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 script: | const report = { reportUrl: "${{ steps.create-allure-report.outputs.report-url }}", reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}", } const coverage = { coverageUrl: "${{ needs.coverage-report.outputs.coverage-html }}", summaryJsonUrl: "${{ needs.coverage-report.outputs.coverage-json }}", } const script = require("./scripts/comment-test-report.js") await script({ github, context, fetch, report, coverage, }) coverage-report: if: ${{ !startsWith(github.ref_name, 'release') }} needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init strategy: fail-fast: false matrix: build_type: [ debug ] outputs: coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }} coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }} steps: # Need `fetch-depth: 0` for differential coverage (to get diff between two commits) - name: Harden the runner (Audit all outbound calls) uses: 
step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true fetch-depth: 0 - name: Get Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact path: /tmp/neon aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Get coverage artifact uses: ./.github/actions/download with: name: coverage-data-artifact path: /tmp/coverage aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Merge coverage data run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge - name: Build coverage report env: COMMIT_URL: ${{ github.server_url }}/${{ github.repository }}/commit/${{ github.event.pull_request.head.sha || github.sha }} run: | scripts/coverage --dir=/tmp/coverage \ report \ --input-objects=/tmp/coverage/binaries.list \ --commit-url=${COMMIT_URL} \ --format=github scripts/coverage --dir=/tmp/coverage \ report \ --input-objects=/tmp/coverage/binaries.list \ --format=lcov - name: Build coverage report NEW id: upload-coverage-report-new env: BUCKET: neon-github-public-dev # A differential coverage report is available only for PRs. # (i.e. for pushes into main/release branches we have a regular coverage report) COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }} run: | CURRENT="${COMMIT_SHA}" BASELINE="$(git merge-base $BASE_SHA $CURRENT)" cp /tmp/coverage/report/lcov.info ./${CURRENT}.info GENHTML_ARGS="--ignore-errors path,unmapped,empty --synthesize-missing --demangle-cpp rustfilt --output-directory lcov-html ${CURRENT}.info" # Use differential coverage if the baseline coverage exists. # It can be missing if the coverage report wasn't uploaded yet or tests have failed on the BASELINE commit.
if aws s3 cp --only-show-errors s3://${BUCKET}/code-coverage/${BASELINE}/lcov.info ./${BASELINE}.info; then git diff ${BASELINE} ${CURRENT} -- '*.rs' > baseline-current.diff GENHTML_ARGS="--baseline-file ${BASELINE}.info --diff-file baseline-current.diff ${GENHTML_ARGS}" fi genhtml ${GENHTML_ARGS} aws s3 cp --only-show-errors --recursive ./lcov-html/ s3://${BUCKET}/code-coverage/${COMMIT_SHA}/lcov REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/index.html echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 env: REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 script: | const { REPORT_URL_NEW, COMMIT_SHA } = process.env await github.rest.repos.createCommitStatus({ owner: context.repo.owner, repo: context.repo.repo, sha: `${COMMIT_SHA}`, state: 'success', target_url: `${REPORT_URL_NEW}`, context: 'Code coverage report NEW', }) trigger-e2e-tests: # !failure() && !cancelled() because it depends on jobs that can get skipped if: >- ${{ ( ( needs.meta.outputs.run-kind == 'pr' && ( !github.event.pull_request.draft || contains(github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') ) ) || contains(fromJSON('["push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) ) && !failure() && !cancelled() }} needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, meta ] uses: ./.github/workflows/trigger-e2e-tests.yml with: github-event-name: ${{ github.event_name }} github-event-json: ${{ toJSON(github.event) }} secrets: inherit neon-image-arch: needs: [ check-permissions, 
build-build-tools-image, meta ] if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} strategy: matrix: arch: [ x64, arm64 ] runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} permissions: packages: write steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true ref: ${{ needs.meta.outputs.sha }} - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 with: cache-binary: false - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: cache.neon.build username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} - uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0 with: context: . 
# ARM-specific flags are recommended for Graviton ≥ 2, these flags are also supported by Ampere Altra (Azure) # https://github.com/aws/aws-graviton-getting-started/blob/57dc813626d0266f1cc12ef83474745bb1f31fb4/rust.md build-args: | ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }} GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-bookworm DEBIAN_VERSION=bookworm secrets: | SUBZERO_ACCESS_TOKEN=${{ secrets.CI_ACCESS_TOKEN }} attests: | type=provenance,mode=max type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1 push: true pull: true file: Dockerfile cache-from: type=registry,ref=cache.neon.build/neon:cache-bookworm-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0}-{1},mode=max', 'bookworm', matrix.arch) || '' }} tags: | ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-${{ matrix.arch }} neon-image: needs: [ neon-image-arch, meta ] if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: ubuntu-22.04 permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: read packages: write steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Create multi-arch image run: | docker buildx imagetools create -t ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }} \ -t ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm \ ghcr.io/neondatabase/neon:${{ 
needs.meta.outputs.build-tag }}-bookworm-x64 \ ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-arm64 compute-node-image-arch: needs: [ check-permissions, meta ] if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: read packages: write strategy: fail-fast: false matrix: version: # Much data was already generated on old PG versions with bullseye's # libraries, the locales of which can cause data incompatibilities. # However, new PG versions should be built on newer images, # as that reduces the support burden of old and ancient distros. - pg: v14 debian: bullseye - pg: v15 debian: bullseye - pg: v16 debian: bullseye - pg: v17 debian: bookworm arch: [ x64, arm64 ] runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true ref: ${{ needs.meta.outputs.sha }} - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 with: cache-binary: false # Disable parallelism for docker buildkit. # As we already build everything with `make -j$(nproc)`, running it in an additional level of parallelism blows up the Runner.
buildkitd-config-inline: | [worker.oci] max-parallelism = 1 - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: cache.neon.build username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} - name: Build compute-node image uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0 with: context: . build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }} DEBIAN_VERSION=${{ matrix.version.debian }} attests: | type=provenance,mode=max type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1 push: true pull: true file: compute/compute-node.Dockerfile cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build neon extensions test image if: matrix.version.pg >= 'v16' uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0 with: context: . 
build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }} DEBIAN_VERSION=${{ matrix.version.debian }} attests: | type=provenance,mode=max type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1 push: true pull: true file: compute/compute-node.Dockerfile target: extension-tests cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} tags: | ghcr.io/neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.meta.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} compute-node-image: needs: [ compute-node-image-arch, meta ] if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: read packages: write runs-on: ubuntu-22.04 strategy: matrix: version: # see the comment for `compute-node-image-arch` job - pg: v14 debian: bullseye - pg: v15 debian: bullseye - pg: v16 debian: bullseye - pg: v17 debian: bookworm steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Create multi-arch compute-node image run: | docker buildx imagetools create -t ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ -t ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ 
ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch neon-test-extensions image if: matrix.version.pg >= 'v16' run: | docker buildx imagetools create -t ghcr.io/neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ -t ghcr.io/neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ ghcr.io/neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ ghcr.io/neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 vm-compute-node-image-arch: needs: [ check-permissions, meta, compute-node-image ] if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} permissions: contents: read packages: write strategy: fail-fast: false matrix: arch: [ amd64, arm64 ] version: - pg: v14 debian: bullseye - pg: v15 debian: bullseye - pg: v16 debian: bullseye - pg: v17 debian: bookworm env: VM_BUILDER_VERSION: v0.46.0 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Downloading vm-builder run: | curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-${{ matrix.arch }} -o vm-builder chmod +x vm-builder - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} password: 
${{ secrets.GITHUB_TOKEN }} # Note: we need a separate pull step here because otherwise vm-builder will try to pull, and # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | docker pull ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} - name: Build vm image run: | ./vm-builder \ -size=2G \ -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ -src=ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ -dst=ghcr.io/neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} \ -target-arch=linux/${{ matrix.arch }} - name: Pushing vm-compute-node image run: | docker push ghcr.io/neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} vm-compute-node-image: needs: [ vm-compute-node-image-arch, meta ] if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} permissions: packages: write runs-on: ubuntu-22.04 strategy: matrix: version: # see the comment for `compute-node-image-arch` job - pg: v14 - pg: v15 - pg: v16 - pg: v17 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Create multi-arch vm-compute-node image run: | docker buildx imagetools create -t ghcr.io/neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ ghcr.io/neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-amd64 \ ghcr.io/neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-arm64 test-images: needs: [
check-permissions, meta, neon-image, compute-node-image ] # Depends on jobs that can get skipped if: >- ${{ !failure() && !cancelled() && contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} strategy: fail-fast: false matrix: arch: [ x64, arm64 ] pg_version: [v16, v17] permissions: packages: read runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} # `ghcr.io/neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library. # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify. # Regular pageserver version string looks like # Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: [] # Bad versions might look like: # Neon page server git-env:local failpoints: true, features: ["testing"] # Ensure that we don't have bad versions.
- name: Verify image versions shell: bash # ensure no set -e for better error messages if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} run: | pageserver_version=$(docker run --rm ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") echo "Pageserver version string: $pageserver_version" if echo "$pageserver_version" | grep -q 'git-env:local' ; then echo "Pageserver version should not be the default Dockerfile one" exit 1 fi if echo "$pageserver_version" | grep -q '"testing"' ; then echo "Pageserver version should have no testing feature enabled" exit 1 fi - name: Verify docker-compose example and test extensions timeout-minutes: 60 env: PARALLEL_COMPUTES: 3 TAG: >- ${{ needs.meta.outputs.run-kind == 'compute-rc-pr' && needs.meta.outputs.previous-storage-release || needs.meta.outputs.build-tag }} COMPUTE_TAG: >- ${{ contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) && needs.meta.outputs.previous-compute-release || needs.meta.outputs.build-tag }} TEST_EXTENSIONS_TAG: >- ${{ contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) && needs.meta.outputs.previous-compute-release || needs.meta.outputs.build-tag }} TEST_VERSION_ONLY: ${{ matrix.pg_version }} run: ./docker-compose/docker_compose_test.sh - name: Print logs and clean up docker-compose test if: always() run: | docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down - name: Test extension upgrade timeout-minutes: 20 if: ${{ contains(fromJSON('["pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} env: TAG: >- ${{ false || needs.meta.outputs.run-kind == 'pr' && needs.meta.outputs.build-tag || needs.meta.outputs.run-kind == 'compute-rc-pr' && needs.meta.outputs.previous-storage-release }}
TEST_EXTENSIONS_TAG: ${{ needs.meta.outputs.previous-compute-release }} NEW_COMPUTE_TAG: ${{ needs.meta.outputs.build-tag }} OLD_COMPUTE_TAG: ${{ needs.meta.outputs.previous-compute-release }} run: ./docker-compose/test_extensions_upgrade.sh - name: Print logs and clean up if: always() run: | docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down generate-image-maps: needs: [ meta ] runs-on: ubuntu-22.04 outputs: neon-dev: ${{ steps.generate.outputs.neon-dev }} neon-prod: ${{ steps.generate.outputs.neon-prod }} compute-dev: ${{ steps.generate.outputs.compute-dev }} compute-prod: ${{ steps.generate.outputs.compute-prod }} steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/scripts/generate_image_maps.py sparse-checkout-cone-mode: false - name: Generate Image Maps id: generate run: python3 .github/scripts/generate_image_maps.py env: SOURCE_TAG: >- ${{ contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind) && needs.meta.outputs.release-pr-run-id || needs.meta.outputs.build-tag }} TARGET_TAG: ${{ needs.meta.outputs.build-tag }} BRANCH: "${{ github.ref_name }}" DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}" PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}" DEV_AWS: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" PROD_AWS: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" AWS_REGION: "${{ vars.AWS_ECR_REGION }}" push-neon-image-dev: needs: [ meta, generate-image-maps, neon-image ] if: ${{ !failure() && !cancelled() && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: 
./.github/workflows/_push-to-container-registry.yml permissions: id-token: write # Required for aws/azure login packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}' aws-region: ${{ vars.AWS_ECR_REGION }} aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} secrets: inherit push-compute-image-dev: needs: [ meta, generate-image-maps, vm-compute-node-image ] if: ${{ !failure() && !cancelled() && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_push-to-container-registry.yml permissions: id-token: write # Required for aws/azure login packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}' aws-region: ${{ vars.AWS_ECR_REGION }} aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} secrets: inherit push-neon-image-prod: needs: [ meta, generate-image-maps, neon-image, test-images ] # Depends on jobs that can get skipped if: ${{ !failure() && !cancelled() && contains(fromJSON('["storage-release", "proxy-release"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_push-to-container-registry.yml permissions: id-token: write # Required for aws/azure login packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}' aws-region: ${{ vars.AWS_ECR_REGION }} aws-account-id: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID 
}}" aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} secrets: inherit push-compute-image-prod: needs: [ meta, generate-image-maps, vm-compute-node-image, test-images ] # Depends on jobs that can get skipped if: ${{ !failure() && !cancelled() && needs.meta.outputs.run-kind == 'compute-release' }} uses: ./.github/workflows/_push-to-container-registry.yml permissions: id-token: write # Required for aws/azure login packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}' aws-region: ${{ vars.AWS_ECR_REGION }} aws-account-id: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} secrets: inherit push-neon-test-extensions-image-dockerhub: if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} needs: [ meta, compute-node-image ] uses: ./.github/workflows/_push-to-container-registry.yml permissions: packages: write id-token: write with: image-map: | { "ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [ "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}" ], "ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": [ "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}" ] } secrets: inherit add-latest-tag-to-neon-test-extensions-image: if: ${{ needs.meta.outputs.run-kind == 'push-main' }} needs: [ meta, compute-node-image ] uses: ./.github/workflows/_push-to-container-registry.yml permissions: packages: 
write id-token: write with: image-map: | { "ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [ "docker.io/neondatabase/neon-test-extensions-v16:latest", "ghcr.io/neondatabase/neon-test-extensions-v16:latest" ], "ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": [ "docker.io/neondatabase/neon-test-extensions-v17:latest", "ghcr.io/neondatabase/neon-test-extensions-v17:latest" ] } secrets: inherit add-release-tag-to-neon-test-extensions-image: if: ${{ needs.meta.outputs.run-kind == 'compute-release' }} needs: [ meta ] uses: ./.github/workflows/_push-to-container-registry.yml permissions: packages: write id-token: write with: image-map: | { "ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.release-pr-run-id }}": [ "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}", "ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}" ], "ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.release-pr-run-id }}": [ "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}", "ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}" ] } secrets: inherit trigger-custom-extensions-build-and-wait: needs: [ check-permissions, meta ] if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: ubuntu-22.04 permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write pull-requests: write steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Set PR's status to pending and request a remote CI test run: | COMMIT_SHA=${{ github.event.pull_request.head.sha || github.sha }} REMOTE_REPO="${{ github.repository_owner }}/build-custom-extensions" curl -f -X 
POST \ https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ -H "Accept: application/vnd.github.v3+json" \ --user "${{ secrets.CI_ACCESS_TOKEN }}" \ --data \ "{ \"state\": \"pending\", \"context\": \"build-and-upload-extensions\", \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" }" curl -f -X POST \ https://api.github.com/repos/$REMOTE_REPO/actions/workflows/build_and_upload_extensions.yml/dispatches \ -H "Accept: application/vnd.github.v3+json" \ --user "${{ secrets.CI_ACCESS_TOKEN }}" \ --data \ "{ \"ref\": \"main\", \"inputs\": { \"ci_job_name\": \"build-and-upload-extensions\", \"commit_hash\": \"$COMMIT_SHA\", \"remote_repo\": \"${{ github.repository }}\", \"compute_image_tag\": \"${{ needs.meta.outputs.build-tag }}\", \"remote_branch_name\": \"${{ github.ref_name }}\" } }" - name: Wait for extension build to finish env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | TIMEOUT=5400 # 90 minutes, usually it takes ~2-3 minutes, but if runners are busy, it might take longer INTERVAL=15 # try each N seconds last_status="" # a variable to carry the last status of the "build-and-upload-extensions" context for ((i=0; i <= TIMEOUT; i+=INTERVAL)); do sleep $INTERVAL # Get statuses for the latest commit in the PR / branch gh api \ -H "Accept: application/vnd.github+json" \ -H "X-GitHub-Api-Version: 2022-11-28" \ "/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha || github.sha }}" > statuses.json # Get the latest status for the "build-and-upload-extensions" context last_status=$(jq --raw-output '[.[] | select(.context == "build-and-upload-extensions")] | sort_by(.created_at)[-1].state' statuses.json) if [ "${last_status}" = "pending" ]; then # Extension build is still in progress. continue elif [ "${last_status}" = "success" ]; then # Extension build is successful. exit 0 else # Status is neither "pending" nor "success", exit the loop and fail the job. 
break fi done # Extension build failed, print `statuses.json` for debugging and fail the job. jq '.' statuses.json echo >&2 "Status of extension build is '${last_status}' != 'success'" exit 1 deploy: needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, trigger-custom-extensions-build-and-wait ] # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod` if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write runs-on: [ self-hosted, small ] container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/ansible:latest steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Create git tag and GitHub release if: ${{ contains(fromJSON('["storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) }} uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 env: TAG: "${{ needs.meta.outputs.build-tag }}" BRANCH: "${{ github.ref_name }}" PREVIOUS_RELEASE: >- ${{ false || needs.meta.outputs.run-kind == 'storage-release' && needs.meta.outputs.previous-storage-release || needs.meta.outputs.run-kind == 'proxy-release' && needs.meta.outputs.previous-proxy-release || needs.meta.outputs.run-kind == 'compute-release' && needs.meta.outputs.previous-compute-release || 'unknown' }} with: retries: 5 script: | const { TAG, BRANCH, PREVIOUS_RELEASE } = process.env try { const existingRef = await github.rest.git.getRef({ owner: 
context.repo.owner, repo: context.repo.repo, ref: `tags/${TAG}`, }); if (existingRef.data.object.sha !== context.sha) { throw new Error(`Tag ${TAG} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`); } console.log(`Tag ${TAG} already exists and points to ${context.sha} as expected.`); } catch (error) { if (error.status !== 404) { throw error; } console.log(`Tag ${TAG} does not exist. Creating it...`); await github.rest.git.createRef({ owner: context.repo.owner, repo: context.repo.repo, ref: `refs/tags/${TAG}`, sha: context.sha, }); console.log(`Tag ${TAG} created successfully.`); } try { const existingRelease = await github.rest.repos.getReleaseByTag({ owner: context.repo.owner, repo: context.repo.repo, tag: TAG, }); console.log(`Release for tag ${TAG} already exists (ID: ${existingRelease.data.id}).`); } catch (error) { if (error.status !== 404) { throw error; } console.log(`Release for tag ${TAG} does not exist. Creating it...`); // Find the PR number using the commit SHA const pullRequests = await github.rest.pulls.list({ owner: context.repo.owner, repo: context.repo.repo, state: 'closed', base: BRANCH, }); const pr = pullRequests.data.find(pr => pr.merge_commit_sha === context.sha); const prNumber = pr ? pr.number : null; const releaseNotes = [ prNumber ? 
`Release PR https://github.com/${context.repo.owner}/${context.repo.repo}/pull/${prNumber}.` : 'Release PR not found.', `Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${PREVIOUS_RELEASE}...${TAG}.` ].join('\n\n'); await github.rest.repos.createRelease({ owner: context.repo.owner, repo: context.repo.repo, tag_name: TAG, body: releaseNotes, }); console.log(`Release for tag ${TAG} created successfully.`); } - name: Trigger deploy workflow env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} RUN_KIND: ${{ needs.meta.outputs.run-kind }} run: | case ${RUN_KIND} in push-main) gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.meta.outputs.build-tag}} -f deployPreprodRegion=false ;; storage-release) gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=false \ -f deployProxy=false \ -f deployStorage=true \ -f deployStorageBroker=false \ -f deployStorageController=true \ -f branch=main \ -f dockerTag=${{needs.meta.outputs.build-tag}} \ -f deployPreprodRegion=true gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \ -f deployStorage=true \ -f deployStorageBroker=false \ -f deployStorageController=true \ -f branch=main \ -f dockerTag=${{needs.meta.outputs.build-tag}} ;; proxy-release) gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=true \ -f deployProxy=true \ -f deployStorage=false \ -f deployStorageBroker=false \ -f deployStorageController=false \ -f branch=main \ -f dockerTag=${{needs.meta.outputs.build-tag}} \ -f deployPreprodRegion=true gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \ -f deployPgSniRouter=true \ -f deployProxyLink=true \ -f deployPrivatelinkProxy=true \ -f deployProxyScram=true \ -f deployProxyAuthBroker=true \ -f branch=main \ -f dockerTag=${{needs.meta.outputs.build-tag}} ;; compute-release) gh workflow --repo neondatabase/infra 
run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.meta.outputs.build-tag}} ;; *) echo "RUN_KIND (value '${RUN_KIND}') is not set to either 'push-main', 'storage-release', 'proxy-release' or 'compute-release'" exit 1 ;; esac notify-release-deploy-failure: needs: [ meta, deploy ] # We want this to run even if (transitive) dependencies are skipped, because deploy should really be successful on release branch workflow runs. if: contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind) && needs.deploy.result != 'success' && always() runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Post release-deploy failure to team slack channel uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 env: TEAM_ONCALL: >- ${{ fromJSON(format('{ "storage-release": "<!subteam^{0}>", "compute-release": "<!subteam^{1}>", "proxy-release": "<!subteam^{2}>" }', vars.SLACK_ONCALL_STORAGE_GROUP, vars.SLACK_ONCALL_COMPUTE_GROUP, vars.SLACK_ONCALL_PROXY_GROUP ))[needs.meta.outputs.run-kind] }} CHANNEL: >- ${{ fromJSON(format('{ "storage-release": "{0}", "compute-release": "{1}", "proxy-release": "{2}" }', vars.SLACK_STORAGE_CHANNEL_ID, vars.SLACK_COMPUTE_CHANNEL_ID, vars.SLACK_PROXY_CHANNEL_ID ))[needs.meta.outputs.run-kind] }} with: method: chat.postMessage token: ${{ secrets.SLACK_BOT_TOKEN }} payload: | channel: ${{ env.CHANNEL }} text: | 🔴 ${{ env.TEAM_ONCALL }}: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. 
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: needs: [ meta, deploy ] permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: read # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod` if: github.ref_name == 'release' && !failure() && !cancelled() runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 3600 - name: Promote compatibility snapshot and Neon artifact env: BUCKET: neon-github-public-dev AWS_REGION: eu-central-1 COMMIT_SHA: ${{ github.sha }} RUN_ID: ${{ needs.meta.outputs.release-pr-run-id }} run: | old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}" new_prefix="artifacts/latest" files_to_promote=() files_on_s3=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${old_prefix} | jq -r '.Contents[]?.Key' || true) for arch in X64 ARM64; do for build_type in debug release; do neon_artifact_filename="neon-Linux-${arch}-${build_type}-artifact.tar.zst" s3_key=$(echo "${files_on_s3}" | grep ${neon_artifact_filename} | sort --version-sort | tail -1 || true) if [ -z "${s3_key}" ]; then echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${neon_artifact_filename} nor its version from previous attempts exist" exit 1 fi files_to_promote+=("s3://${BUCKET}/${s3_key}") for pg_version in v14 v15 v16 v17; do # We run less tests for debug builds, so we don't need to promote them if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v17" ] ; }; 
then continue fi compatibility_data_filename="compatibility-snapshot-${arch}-${build_type}-pg${pg_version}.tar.zst" s3_key=$(echo "${files_on_s3}" | grep ${compatibility_data_filename} | sort --version-sort | tail -1 || true) if [ -z "${s3_key}" ]; then echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${compatibility_data_filename} nor its version from previous attempts exist" exit 1 fi files_to_promote+=("s3://${BUCKET}/${s3_key}") done done done for f in "${files_to_promote[@]}"; do time aws s3 cp --only-show-errors ${f} s3://${BUCKET}/${new_prefix}/ done pin-build-tools-image: needs: [ build-build-tools-image, test-images, build-and-test-locally ] # `!failure() && !cancelled()` is required because the job (transitively) depends on jobs that can be skipped if: github.ref_name == 'main' && !failure() && !cancelled() uses: ./.github/workflows/pin-build-tools-image.yml with: from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }} secrets: inherit # This job simplifies setting branch protection rules (in the GitHub UI) # by allowing us to require only this job instead of listing many others. # It also makes it easier to rename or parametrise jobs (using matrix), # which would otherwise require changes in branch protection rules. # # Note that we can't add external checks (like `neon-cloud-e2e`) here; we still need to use the GitHub UI for those. # # https://github.com/neondatabase/neon/settings/branch_protection_rules conclusion: if: always() # Format `needs` differently to make the list more readable. 
# Usually we do `needs: [...]` needs: - meta - build-and-test-locally - check-codestyle-python - check-codestyle-rust - check-dependencies-rust - files-changed - push-compute-image-dev - push-neon-image-dev - test-images - trigger-custom-extensions-build-and-wait runs-on: ubuntu-22.04 steps: # The list of possible results: # https://docs.github.com/en/actions/learn-github-actions/contexts#needs-context - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Fail the job if any of the dependencies do not succeed run: exit 1 if: | contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) || (needs.build-and-test-locally.result == 'skipped' && contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) || (needs.check-codestyle-python.result == 'skipped' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) || (needs.check-codestyle-rust.result == 'skipped' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) || needs.files-changed.result == 'skipped' || (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) || (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)) || (needs.test-images.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), 
needs.meta.outputs.run-kind)) || (needs.trigger-custom-extensions-build-and-wait.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) ================================================ FILE: .github/workflows/build_and_test_fully.yml ================================================ name: Build and Test Fully on: schedule: # * is a special character in YAML so you have to quote this string # ┌───────────── minute (0 - 59) # │ ┌───────────── hour (0 - 23) # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '0 3 * * *' # run once a day, timezone is utc workflow_dispatch: defaults: run: shell: bash -euxo pipefail {0} concurrency: # Allow only one workflow per any non-`main` branch. group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} cancel-in-progress: true env: RUST_BACKTRACE: 1 COPT: '-Werror' jobs: tag: runs-on: [ self-hosted, small ] container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} steps: # Need `fetch-depth: 0` to count the number of commits in the branch - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 - name: Get build tag run: | echo run:$GITHUB_RUN_ID echo ref:$GITHUB_REF_NAME echo rev:$(git rev-list --count HEAD) if [[ "$GITHUB_REF_NAME" == "main" ]]; then echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release" ]]; then echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then echo 
"tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'" echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT fi shell: bash id: build-tag build-build-tools-image: uses: ./.github/workflows/build-build-tools-image.yml secrets: inherit build-and-test-locally: needs: [ tag, build-build-tools-image ] strategy: fail-fast: false matrix: arch: [ x64, arm64 ] build-type: [ debug, release ] uses: ./.github/workflows/_build-and-test-locally.yml with: arch: ${{ matrix.arch }} build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} rerun-failed: false test-cfg: '[{"pg_version":"v14", "lfc_state": "with-lfc"}, {"pg_version":"v15", "lfc_state": "with-lfc"}, {"pg_version":"v16", "lfc_state": "with-lfc"}, {"pg_version":"v17", "lfc_state": "with-lfc"}, {"pg_version":"v14", "lfc_state": "without-lfc"}, {"pg_version":"v15", "lfc_state": "without-lfc"}, {"pg_version":"v16", "lfc_state": "without-lfc"}, {"pg_version":"v17", "lfc_state": "without-lfc"}]' secrets: inherit create-test-report: needs: [ build-and-test-locally, build-build-tools-image ] if: ${{ !cancelled() }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write pull-requests: write outputs: report-url: ${{ steps.create-allure-report.outputs.report-url }} runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: 
egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Create Allure report if: ${{ !cancelled() }} id: create-allure-report uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 if: ${{ !cancelled() }} with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 script: | const report = { reportUrl: "${{ steps.create-allure-report.outputs.report-url }}", reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}", } const coverage = {} const script = require("./scripts/comment-test-report.js") await script({ github, context, fetch, report, coverage, }) ================================================ FILE: .github/workflows/build_and_test_with_sanitizers.yml ================================================ name: Build and Test with Sanitizers on: schedule: # * is a special character in YAML so you have to quote this string # ┌───────────── minute (0 - 59) # │ ┌───────────── hour (0 - 23) # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '0 1 * * *' # run once a day, timezone is utc workflow_dispatch: defaults: run: shell: bash -euxo pipefail {0} concurrency: # Allow only one workflow per any non-`main` branch. 
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} cancel-in-progress: true env: RUST_BACKTRACE: 1 COPT: '-Werror' jobs: tag: runs-on: [ self-hosted, small ] container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} steps: # Need `fetch-depth: 0` to count the number of commits in the branch - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 - name: Get build tag run: | echo run:$GITHUB_RUN_ID echo ref:$GITHUB_REF_NAME echo rev:$(git rev-list --count HEAD) if [[ "$GITHUB_REF_NAME" == "main" ]]; then echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release" ]]; then echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'" echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT fi shell: bash id: build-tag build-build-tools-image: uses: ./.github/workflows/build-build-tools-image.yml secrets: inherit build-and-test-locally: needs: [ tag, build-build-tools-image ] strategy: fail-fast: false matrix: arch: [ x64, arm64 ] build-type: [ release ] uses: ./.github/workflows/_build-and-test-locally.yml with: arch: ${{ matrix.arch }} build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type 
}} rerun-failed: false test-cfg: '[{"pg_version":"v17"}]' sanitizers: enabled secrets: inherit create-test-report: needs: [ build-and-test-locally, build-build-tools-image ] if: ${{ !cancelled() }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write pull-requests: write outputs: report-url: ${{ steps.create-allure-report.outputs.report-url }} runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Create Allure report if: ${{ !cancelled() }} id: create-allure-report uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 if: ${{ !cancelled() }} with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 script: | const report = { reportUrl: "${{ steps.create-allure-report.outputs.report-url }}", reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}", } const coverage = {} const script = require("./scripts/comment-test-report.js") await script({ github, context, fetch, report, coverage, }) ================================================ FILE: .github/workflows/cargo-deny.yml ================================================ name: cargo deny checks on: workflow_call: inputs: build-tools-image: required: false type: string schedule: - cron: '0 10 * * *' permissions: contents: read jobs: cargo-deny: strategy: matrix: 
ref: >- ${{ fromJSON( github.event_name == 'schedule' && '["main","release","release-proxy","release-compute"]' || format('["{0}"]', github.sha) ) }} runs-on: [self-hosted, small] permissions: packages: read container: image: ${{ inputs.build-tools-image || 'ghcr.io/neondatabase/build-tools:pinned' }} credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: ref: ${{ matrix.ref }} - name: Check rust licenses/bans/advisories/sources env: CARGO_DENY_TARGET: >- ${{ github.event_name == 'schedule' && 'advisories' || 'all' }} run: cargo deny check --hide-inclusion-graph $CARGO_DENY_TARGET - name: Post to a Slack channel if: ${{ github.event_name == 'schedule' && failure() }} uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 with: method: chat.postMessage token: ${{ secrets.SLACK_BOT_TOKEN }} payload: | channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }} text: | Periodic cargo-deny on ${{ matrix.ref }}: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> Fixing the problem should be fairly straight forward from the logs. If not, <#${{ vars.SLACK_RUST_CHANNEL_ID }}> is there to help. Pinging . ================================================ FILE: .github/workflows/check-permissions.yml ================================================ name: Check Permissions on: workflow_call: inputs: github-event-name: required: true type: string defaults: run: shell: bash -euo pipefail {0} # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
permissions: {} jobs: check-permissions: runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 with: egress-policy: audit - name: Disallow CI runs on PRs from forks if: | inputs.github-event-name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository run: | if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork" else MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run" fi # TODO: use actions/github-script to post this message as a PR comment echo >&2 "We don't run CI for PRs from forks" echo >&2 "${MESSAGE}" exit 1 ================================================ FILE: .github/workflows/cleanup-caches-by-a-branch.yml ================================================ # A workflow from # https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#force-deleting-cache-entries name: cleanup caches by a branch on: pull_request: types: - closed jobs: cleanup: runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 with: egress-policy: audit - name: Cleanup run: | gh extension install actions/gh-actions-cache echo "Fetching list of cache key" cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH -L 100 | cut -f 1 ) ## Setting this to not fail the workflow while deleting cache keys. set +e echo "Deleting caches..." 
for cacheKey in $cacheKeysForPR do gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm done echo "Done" env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} REPO: ${{ github.repository }} BRANCH: refs/pull/${{ github.event.pull_request.number }}/merge ================================================ FILE: .github/workflows/cloud-extensions.yml ================================================ name: Cloud Extensions Test on: schedule: # * is a special character in YAML so you have to quote this string # ┌───────────── minute (0 - 59) # │ ┌───────────── hour (0 - 23) # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '45 1 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually inputs: region_id: description: 'Project region id. If not set, the default region will be used' required: false default: 'aws-us-east-2' defaults: run: shell: bash -euxo pipefail {0} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write jobs: regress: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote strategy: fail-fast: false matrix: pg-version: [16, 17] runs-on: us-east-2 container: # We use the neon-test-extensions image here as it contains the source code for the extensions. 
image: ghcr.io/neondatabase/neon-test-extensions-v${{ matrix.pg-version }}:latest credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Evaluate the settings id: project-settings run: | if [[ $((${{ matrix.pg-version }})) -lt 17 ]]; then ULID=ulid else ULID=pgx_ulid fi LIBS=timescaledb:rag_bge_small_en_v15,rag_jina_reranker_v1_tiny_en:$ULID settings=$(jq -c -n --arg libs $LIBS '{preload_libraries:{use_defaults:false,enabled_libraries:($libs| split(":"))}}') echo settings=$settings >> $GITHUB_OUTPUT - name: Create Neon Project id: create-neon-project uses: ./.github/actions/neon-project-create with: region_id: ${{ inputs.region_id || 'aws-us-east-2' }} postgres_version: ${{ matrix.pg-version }} project_settings: ${{ steps.project-settings.outputs.settings }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Run the regression tests run: /run-tests.sh -r /ext-src env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} SKIP: "pg_hint_plan-src,pg_repack-src,pg_cron-src,plpgsql_check-src" - name: Delete Neon Project if: ${{ always() }} uses: ./.github/actions/neon-project-delete with: project_id: ${{ steps.create-neon-project.outputs.project_id }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }} slack-message: | Periodic extensions test on staging: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} 
================================================ FILE: .github/workflows/cloud-regress.yml ================================================ name: Cloud Regression Test on: schedule: # * is a special character in YAML so you have to quote this string # ┌───────────── minute (0 - 59) # │ ┌───────────── hour (0 - 23) # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '45 1 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually defaults: run: shell: bash -euxo pipefail {0} concurrency: # Allow only one workflow group: ${{ github.workflow }} cancel-in-progress: true permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write jobs: regress: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote strategy: fail-fast: false matrix: pg-version: [16, 17] runs-on: us-east-2 container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true - name: Patch the test env: PG_VERSION: ${{matrix.pg-version}} run: | cd "vendor/postgres-v${PG_VERSION}" patch -p1 < "../../compute/patches/cloud_regress_pg${PG_VERSION}.patch" - name: Generate a random password id: pwgen run: | set +x DBPASS=$(dd if=/dev/random bs=48 count=1 2>/dev/null | base64) echo "::add-mask::${DBPASS//\//}" echo DBPASS="${DBPASS//\//}" >> "${GITHUB_OUTPUT}" - name: Change tests according to the generated password env: DBPASS: ${{ steps.pwgen.outputs.DBPASS }} PG_VERSION: ${{matrix.pg-version}} run: | cd 
vendor/postgres-v"${PG_VERSION}"/src/test/regress for fname in sql/*.sql expected/*.out; do sed -i.bak s/NEON_PASSWORD_PLACEHOLDER/"'${DBPASS}'"/ "${fname}" done for ph in $(grep NEON_MD5_PLACEHOLDER expected/password.out | awk '{print $3;}' | sort | uniq); do USER=$(echo "${ph}" | cut -c 22-) MD5=md5$(echo -n "${DBPASS}${USER}" | md5sum | awk '{print $1;}') sed -i.bak "s/${ph}/${MD5}/" expected/password.out done - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create a new branch id: create-branch uses: ./.github/actions/neon-branch-create with: api_key: ${{ secrets.NEON_STAGING_API_KEY }} project_id: ${{ vars[format('PGREGRESS_PG{0}_PROJECT_ID', matrix.pg-version)] }} - name: Run the regression tests uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: cloud_regress pg_version: ${{matrix.pg-version}} extra_params: -m remote_cluster aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{steps.create-branch.outputs.dsn}} - name: Delete branch if: always() uses: ./.github/actions/neon-branch-delete with: api_key: ${{ secrets.NEON_STAGING_API_KEY }} project_id: ${{ vars[format('PGREGRESS_PG{0}_PROJECT_ID', matrix.pg-version)] }} branch_id: ${{steps.create-branch.outputs.branch_id}} - name: Create Allure report id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }} slack-message: | Periodic pg_regress on staging: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ 
github.run_id }}|GitHub Run> <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} ================================================ FILE: .github/workflows/fast-forward.yml ================================================ name: Fast forward merge on: pull_request: types: [labeled] branches: - release - release-proxy - release-compute jobs: fast-forward: if: ${{ github.event.label.name == 'fast-forward' }} runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 with: egress-policy: audit - name: Remove fast-forward label to PR env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | gh pr edit ${{ github.event.pull_request.number }} --repo "${GITHUB_REPOSITORY}" --remove-label "fast-forward" - name: Fast forwarding uses: sequoia-pgp/fast-forward@ea7628bedcb0b0b96e94383ada458d812fca4979 # See https://docs.github.com/en/graphql/reference/enums#mergestatestatus if: ${{ contains(fromJSON('["clean", "unstable"]'), github.event.pull_request.mergeable_state) }} with: merge: true comment: on-error github_token: ${{ secrets.CI_ACCESS_TOKEN }} - name: Comment if mergeable_state is not clean if: ${{ !contains(fromJSON('["clean", "unstable"]'), github.event.pull_request.mergeable_state) }} env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | gh pr comment ${{ github.event.pull_request.number }} \ --repo "${GITHUB_REPOSITORY}" \ --body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\` or \`unstable\`." 
================================================ FILE: .github/workflows/force-test-extensions-upgrade.yml ================================================ name: Force Test Upgrading of Extension on: schedule: # * is a special character in YAML so you have to quote this string # ┌───────────── minute (0 - 59) # │ ┌───────────── hour (0 - 23) # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '45 2 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually defaults: run: shell: bash -euxo pipefail {0} concurrency: # Allow only one workflow group: ${{ github.workflow }} cancel-in-progress: true permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: read jobs: regress: strategy: fail-fast: false matrix: pg-version: [16, 17] runs-on: small steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: false - name: Get the last compute release tag id: get-last-compute-release-tag env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\ -H "Accept: application/vnd.github+json" \ -H "X-GitHub-Api-Version: 2022-11-28" \ "/repos/${GITHUB_REPOSITORY}/releases") echo tag=${tag} >> ${GITHUB_OUTPUT} - name: Test extension upgrade timeout-minutes: 60 env: NEW_COMPUTE_TAG: latest OLD_COMPUTE_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} TEST_EXTENSIONS_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} PG_VERSION: ${{ matrix.pg-version }} FORCE_ALL_UPGRADE_TESTS: true run: ./docker-compose/test_extensions_upgrade.sh - name: Print logs and clean up if: always() run: | docker compose --profile 
test-extensions -f ./docker-compose/docker-compose.yml logs || true docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down - name: Post to the Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }} slack-message: | Test upgrading of extensions: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} ================================================ FILE: .github/workflows/ingest_benchmark.yml ================================================ name: benchmarking ingest on: # uncomment to run on push for debugging your PR # push: # branches: [ your branch ] schedule: # * is a special character in YAML so you have to quote this string # ┌───────────── minute (0 - 59) # │ ┌───────────── hour (0 - 23) # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '0 9 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually defaults: run: shell: bash -euxo pipefail {0} concurrency: # Allow only one workflow globally because we need dedicated resources which only exist once group: ingest-bench-workflow cancel-in-progress: true permissions: contents: read jobs: ingest: strategy: fail-fast: false # allow other variants to continue even if one fails matrix: include: - target_project: new_empty_project_stripe_size_2048 stripe_size: 2048 # 16 MiB postgres_version: 16 disable_sharding: false - target_project: new_empty_project_stripe_size_32768 stripe_size: 32768 # 256 MiB # note that this is different from null because using null will shard_split the project only if it reaches the threshold # while here it is sharded from the beginning with a shard 
size of 256 MiB disable_sharding: false postgres_version: 16 - target_project: new_empty_project stripe_size: null # run with neon defaults which will shard split only when reaching the threshold disable_sharding: false postgres_version: 16 - target_project: new_empty_project stripe_size: null # run with neon defaults which will shard split only when reaching the threshold disable_sharding: false postgres_version: 17 - target_project: large_existing_project stripe_size: null # cannot re-shard or choose a different stripe size for an existing, already sharded project disable_sharding: false postgres_version: 16 - target_project: new_empty_project_unsharded stripe_size: null # run with neon defaults which will shard split only when reaching the threshold disable_sharding: true postgres_version: 16 max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results permissions: contents: write statuses: write id-token: write # aws-actions/configure-aws-credentials env: PG_CONFIG: /tmp/neon/pg_install/v16/bin/pg_config PSQL: /tmp/neon/pg_install/v16/bin/psql PG_16_LIB_PATH: /tmp/neon/pg_install/v16/lib PGCOPYDB: /pgcopydb/bin/pgcopydb PGCOPYDB_LIB_PATH: /pgcopydb/lib runs-on: [ self-hosted, us-east-2, x64 ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init timeout-minutes: 1440 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials # necessary to download artefacts uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours is currently max associated with IAM 
role - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} id: create-neon-project-ingest-target uses: ./.github/actions/neon-project-create with: region_id: aws-us-east-2 postgres_version: ${{ matrix.postgres_version }} compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck api_key: ${{ secrets.NEON_STAGING_API_KEY }} shard_split_project: ${{ matrix.stripe_size != null && 'true' || 'false' }} admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} shard_count: 8 stripe_size: ${{ matrix.stripe_size }} disable_sharding: ${{ matrix.disable_sharding }} - name: Initialize Neon project if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} env: BENCHMARK_INGEST_TARGET_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }} NEW_PROJECT_ID: ${{ steps.create-neon-project-ingest-target.outputs.project_id }} run: | echo "Initializing Neon project with project_id: ${NEW_PROJECT_ID}" export LD_LIBRARY_PATH=${PG_16_LIB_PATH} ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;" echo "BENCHMARK_INGEST_TARGET_CONNSTR=${BENCHMARK_INGEST_TARGET_CONNSTR}" >> $GITHUB_ENV - name: Create Neon Branch for large tenant if: ${{ matrix.target_project == 'large_existing_project' }} id: create-neon-branch-ingest-target uses: ./.github/actions/neon-branch-create with: project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Initialize Neon project if: ${{ matrix.target_project == 'large_existing_project' }} env: BENCHMARK_INGEST_TARGET_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }} NEW_BRANCH_ID: ${{ 
steps.create-neon-branch-ingest-target.outputs.branch_id }} run: | echo "Initializing Neon branch with branch_id: ${NEW_BRANCH_ID}" export LD_LIBRARY_PATH=${PG_16_LIB_PATH} # Extract the part before the database name base_connstr="${BENCHMARK_INGEST_TARGET_CONNSTR%/*}" # Extract the query parameters (if any) after the database name query_params="${BENCHMARK_INGEST_TARGET_CONNSTR#*\?}" # Reconstruct the new connection string if [ "$query_params" != "$BENCHMARK_INGEST_TARGET_CONNSTR" ]; then new_connstr="${base_connstr}/neondb?${query_params}" else new_connstr="${base_connstr}/neondb" fi ${PSQL} "${new_connstr}" -c "drop database ludicrous;" ${PSQL} "${new_connstr}" -c "CREATE DATABASE ludicrous;" if [ "$query_params" != "$BENCHMARK_INGEST_TARGET_CONNSTR" ]; then BENCHMARK_INGEST_TARGET_CONNSTR="${base_connstr}/ludicrous?${query_params}" else BENCHMARK_INGEST_TARGET_CONNSTR="${base_connstr}/ludicrous" fi ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;" echo "BENCHMARK_INGEST_TARGET_CONNSTR=${BENCHMARK_INGEST_TARGET_CONNSTR}" >> $GITHUB_ENV - name: Invoke pgcopydb uses: ./.github/actions/run-python-test-set with: build_type: remote test_selection: performance/test_perf_ingest_using_pgcopydb.py run_in_parallel: false extra_params: -s -m remote_cluster --timeout 86400 -k test_ingest_performance_using_pgcopydb pg_version: v${{ matrix.postgres_version }} save_perf_report: true aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }} TARGET_PROJECT_TYPE: ${{ matrix.target_project }} # we report PLATFORM in zenbenchmark NeonBenchmarker perf database and want to distinguish between new project and large tenant PLATFORM: "${{ matrix.target_project }}-us-east-2-staging" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - name: show tables sizes after ingest run: | export 
LD_LIBRARY_PATH=${PG_16_LIB_PATH} ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "\dt+" - name: Delete Neon Project if: ${{ always() && startsWith(matrix.target_project, 'new_empty_project') }} uses: ./.github/actions/neon-project-delete with: project_id: ${{ steps.create-neon-project-ingest-target.outputs.project_id }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Delete Neon Branch for large tenant if: ${{ always() && matrix.target_project == 'large_existing_project' }} uses: ./.github/actions/neon-branch-delete with: project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }} branch_id: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} ================================================ FILE: .github/workflows/label-for-external-users.yml ================================================ name: Add `external` label to issues and PRs created by external users on: issues: types: - opened pull_request_target: types: - opened workflow_dispatch: inputs: github-actor: description: 'GitHub username. If empty, the username of the current user will be used' required: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
permissions: {} env: LABEL: external jobs: check-user: runs-on: ubuntu-22.04 outputs: is-member: ${{ steps.check-user.outputs.is-member }} steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 with: egress-policy: audit - name: Check whether `${{ github.actor }}` is a member of `${{ github.repository_owner }}` id: check-user env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} ACTOR: ${{ inputs.github-actor || github.actor }} run: | expected_error="User does not exist or is not a member of the organization" output_file=output.txt for i in $(seq 1 10); do if gh api "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${ACTOR}" \ -H "Accept: application/vnd.github+json" \ -H "X-GitHub-Api-Version: 2022-11-28" > ${output_file}; then is_member=true break elif grep -q "${expected_error}" ${output_file}; then is_member=false break elif [ $i -eq 10 ]; then title="Failed to get membership status for ${ACTOR}" message="The latest GitHub API error message: '$(cat ${output_file})'" echo "::error file=.github/workflows/label-for-external-users.yml,title=${title}::${message}" exit 1 fi sleep 1 done echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT} add-label: if: needs.check-user.outputs.is-member == 'false' needs: [ check-user ] runs-on: ubuntu-22.04 permissions: pull-requests: write # for `gh pr edit` issues: write # for `gh issue edit` steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 with: egress-policy: audit - name: Add `${{ env.LABEL }}` label env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request_target' && 'pull_request' || 'issue'].number }} GH_CLI_COMMAND: ${{ github.event_name == 'pull_request_target' && 'pr' || 'issue' }} run: | gh ${GH_CLI_COMMAND} --repo ${GITHUB_REPOSITORY} edit --add-label=${LABEL} ${ITEM_NUMBER} 
================================================ FILE: .github/workflows/large_oltp_benchmark.yml ================================================ name: large oltp benchmark on: # uncomment to run on push for debugging your PR #push: # branches: [ bodobolero/synthetic_oltp_workload ] schedule: # * is a special character in YAML so you have to quote this string # ┌───────────── minute (0 - 59) # │ ┌───────────── hour (0 - 23) # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '0 15 * * 0,2,4' # run on Sunday, Tuesday, Thursday at 3 PM UTC workflow_dispatch: # adds ability to run this manually defaults: run: shell: bash -euxo pipefail {0} concurrency: # Allow only one workflow globally because we need dedicated resources which only exist once group: large-oltp-bench-workflow cancel-in-progress: false permissions: contents: read jobs: oltp: strategy: fail-fast: false # allow other variants to continue even if one fails matrix: include: # test only read-only custom scripts in new branch without database maintenance - target: new_branch custom_scripts: select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 test_maintenance: false # test all custom scripts in new branch with database maintenance - target: new_branch custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100 test_maintenance: true # test all custom scripts in reuse branch with database maintenance - target: reuse_branch custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100 test_maintenance: true max-parallel: 1 # we want to run each benchmark sequentially to not have noisy neighbors on shared storage (PS, SK) permissions: contents: write 
statuses: write id-token: write # aws-actions/configure-aws-credentials env: TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ matrix.custom_scripts }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install PG_VERSION: 16 # pre-determined by pre-determined project TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote PLATFORM: ${{ matrix.target }} runs-on: [ self-hosted, us-east-2, x64 ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init # Increase timeout to 2 days, default timeout is 6h - database maintenance can take a long time # (normally 1h pgbench, 3h vacuum analyze 3.5h re-index) x 2 = 15h, leave some buffer for regressions # in one run vacuum didn't finish within 12 hours timeout-minutes: 2880 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials # necessary to download artefacts uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Branch for large tenant if: ${{ matrix.target == 'new_branch' }} id: create-neon-branch-oltp-target uses: ./.github/actions/neon-branch-create with: project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Set up Connection String id: set-up-connstr run: | case "${{ matrix.target }}" in 
new_branch) CONNSTR="${{ steps.create-neon-branch-oltp-target.outputs.dsn }}" ;; reuse_branch) CONNSTR="${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}" ;; *) echo >&2 "Unknown target=${{ matrix.target }}" exit 1 ;; esac CONNSTR_WITHOUT_POOLER="${CONNSTR//-pooler/}" echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT echo "connstr_without_pooler=${CONNSTR_WITHOUT_POOLER}" >> $GITHUB_OUTPUT - name: Delete rows from prior runs in reuse branch if: ${{ matrix.target == 'reuse_branch' }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr_without_pooler }} PG_CONFIG: /tmp/neon/pg_install/v16/bin/pg_config PSQL: /tmp/neon/pg_install/v16/bin/psql PG_16_LIB_PATH: /tmp/neon/pg_install/v16/lib run: | echo "$(date '+%Y-%m-%d %H:%M:%S') - Deleting rows in table webhook.incoming_webhooks from prior runs" export LD_LIBRARY_PATH=${PG_16_LIB_PATH} ${PSQL} "${BENCHMARK_CONNSTR}" -c "SET statement_timeout = 0; DELETE FROM webhook.incoming_webhooks WHERE created_at > '2025-02-27 23:59:59+00';" echo "$(date '+%Y-%m-%d %H:%M:%S') - Finished deleting rows in table webhook.incoming_webhooks from prior runs" - name: Benchmark pgbench with custom-scripts uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false save_perf_report: true extra_params: -m remote_cluster --timeout 7200 -k test_perf_oltp_large_tenant_pgbench pg_version: ${{ env.PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - name: Benchmark database maintenance if: ${{ matrix.test_maintenance }} uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false save_perf_report: true extra_params: -m remote_cluster --timeout 172800 -k 
test_perf_oltp_large_tenant_maintenance pg_version: ${{ env.PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr_without_pooler }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - name: Delete Neon Branch for large tenant if: ${{ always() && matrix.target == 'new_branch' }} uses: ./.github/actions/neon-branch-delete with: project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }} branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Configure AWS credentials # again because prior steps could have exceeded 5 hours uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours - name: Create Allure report id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic large oltp perf testing: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} ================================================ FILE: .github/workflows/large_oltp_growth.yml ================================================ name: large oltp growth # workflow to grow the reuse branch of large oltp benchmark continuously (about 16 GB per run) on: schedule: # * is a special character in YAML so you have to quote this 
string # ┌───────────── minute (0 - 59) # │ ┌───────────── hour (0 - 23) # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '0 6 * * *' # 06:00 UTC - cron: '0 8 * * *' # 08:00 UTC - cron: '0 10 * * *' # 10:00 UTC - cron: '0 12 * * *' # 12:00 UTC - cron: '0 14 * * *' # 14:00 UTC - cron: '0 16 * * *' # 16:00 UTC workflow_dispatch: # adds ability to run this manually defaults: run: shell: bash -euxo pipefail {0} concurrency: # Allow only one workflow globally because we need dedicated resources which only exist once group: large-oltp-growth cancel-in-progress: true permissions: contents: read jobs: oltp: strategy: fail-fast: false # allow other variants to continue even if one fails matrix: include: # for now only grow the reuse branch, not the other branches. - target: reuse_branch custom_scripts: - grow_action_blocks.sql - grow_action_kwargs.sql - grow_device_fingerprint_event.sql - grow_edges.sql - grow_hotel_rate_mapping.sql - grow_ocr_pipeline_results_version.sql - grow_priceline_raw_response.sql - grow_relabled_transactions.sql - grow_state_values.sql - grow_values.sql - grow_vertices.sql - update_accounting_coding_body_tracking_category_selection.sql - update_action_blocks.sql - update_action_kwargs.sql - update_denormalized_approval_workflow.sql - update_device_fingerprint_event.sql - update_edges.sql - update_heron_transaction_enriched_log.sql - update_heron_transaction_enrichment_requests.sql - update_hotel_rate_mapping.sql - update_incoming_webhooks.sql - update_manual_transaction.sql - update_ml_receipt_matching_log.sql - update_ocr_pipeine_results_version.sql - update_orc_pipeline_step_results.sql - update_orc_pipeline_step_results_version.sql - update_priceline_raw_response.sql - update_quickbooks_transactions.sql - update_raw_finicity_transaction.sql - update_relabeled_transactions.sql - update_state_values.sql - 
update_stripe_authorization_event_log.sql - update_transaction.sql - update_values.sql - update_vertices.sql max-parallel: 1 # we want to run each growth workload sequentially (for now there is just one) permissions: contents: write statuses: write id-token: write # aws-actions/configure-aws-credentials env: TEST_PG_BENCH_DURATIONS_MATRIX: "1h" TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ join(matrix.custom_scripts, ' ') }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install PG_VERSION: 16 # pre-determined by pre-determined project TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote PLATFORM: ${{ matrix.target }} runs-on: [ self-hosted, us-east-2, x64 ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials # necessary to download artefacts uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr run: | case "${{ matrix.target }}" in reuse_branch) CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }} ;; *) echo >&2 "Unknown target=${{ matrix.target }}" exit 1 ;; esac CONNSTR_WITHOUT_POOLER="${CONNSTR//-pooler/}" echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT echo "connstr_without_pooler=${CONNSTR_WITHOUT_POOLER}" >> $GITHUB_OUTPUT - name: 
pgbench with custom-scripts uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false save_perf_report: true extra_params: -m remote_cluster --timeout 7200 -k test_perf_oltp_large_tenant_growth pg_version: ${{ env.PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - name: Create Allure report id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic large oltp tenant growth increase: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} ================================================ FILE: .github/workflows/lint-release-pr.yml ================================================ name: Lint Release PR on: pull_request: branches: - release - release-proxy - release-compute permissions: contents: read jobs: lint-release-pr: runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Checkout PR branch uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 # Fetch full history for git operations ref: ${{ github.event.pull_request.head.ref }} - name: Run lint script env: RELEASE_BRANCH: ${{ 
github.base_ref }} run: | ./.github/scripts/lint-release-pr.sh ================================================ FILE: .github/workflows/neon_extra_builds.yml ================================================ name: Check neon with extra platform builds on: push: branches: - main pull_request: defaults: run: shell: bash -euxo pipefail {0} concurrency: # Allow only one workflow per any non-`main` branch. group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} cancel-in-progress: true env: RUST_BACKTRACE: 1 COPT: '-Werror' jobs: check-permissions: if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} uses: ./.github/workflows/check-permissions.yml with: github-event-name: ${{ github.event_name}} build-build-tools-image: needs: [ check-permissions ] uses: ./.github/workflows/build-build-tools-image.yml secrets: inherit files-changed: name: Detect what files changed runs-on: ubuntu-22.04 timeout-minutes: 3 outputs: v17: ${{ steps.files_changed.outputs.v17 }} postgres_changes: ${{ steps.postgres_changes.outputs.changes }} rebuild_rust_code: ${{ steps.files_changed.outputs.rust_code }} rebuild_everything: ${{ steps.files_changed.outputs.rebuild_neon_extra || steps.files_changed.outputs.rebuild_macos }} steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true - name: Check for Postgres changes uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3 id: files_changed with: token: ${{ github.token }} filters: .github/file-filters.yaml base: ${{ github.event_name != 'pull_request' && (github.event.merge_group.base_ref || github.ref_name) || '' }} ref: ${{ github.event_name != 'pull_request' && (github.event.merge_group.head_ref || github.ref) || '' }} - name: 
Filter out only v-string for build matrix id: postgres_changes env: CHANGES: ${{ steps.files_changed.outputs.changes }} run: | v_strings_only_as_json_array=$(echo ${CHANGES} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c) echo "changes=${v_strings_only_as_json_array}" | tee -a "${GITHUB_OUTPUT}" check-macos-build: needs: [ check-permissions, files-changed ] uses: ./.github/workflows/build-macos.yml secrets: inherit with: pg_versions: ${{ needs.files-changed.outputs.postgres_changes }} rebuild_rust_code: ${{ fromJSON(needs.files-changed.outputs.rebuild_rust_code) }} rebuild_everything: ${{ fromJSON(needs.files-changed.outputs.rebuild_everything) }} gather-rust-build-stats: needs: [ check-permissions, build-build-tools-image, files-changed ] permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write if: | (needs.files-changed.outputs.v17 == 'true' || needs.files-changed.outputs.rebuild_everything == 'true') && ( contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' ) runs-on: [ self-hosted, large ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init env: BUILD_TYPE: release # build with incremental compilation produce partial results # so do not attempt to cache this build, also disable the incremental compilation CARGO_INCREMENTAL: 0 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers run: make postgres-headers -j$(nproc) - name: Build walproposer-lib run: make 
walproposer-lib -j$(nproc) - name: Produce the build stats run: cargo build --all --release --timings -j$(nproc) - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 3600 - name: Upload the build stats id: upload-stats env: BUCKET: neon-github-public-dev SHA: ${{ github.event.pull_request.head.sha || github.sha }} run: | REPORT_URL=https://${BUCKET}.s3.amazonaws.com/build-stats/${SHA}/${GITHUB_RUN_ID}/cargo-timing.html aws s3 cp --only-show-errors ./target/cargo-timings/cargo-timing.html "s3://${BUCKET}/build-stats/${SHA}/${GITHUB_RUN_ID}/" echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT - name: Publish build stats report uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 env: REPORT_URL: ${{ steps.upload-stats.outputs.report-url }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 script: | const { REPORT_URL, SHA } = process.env await github.rest.repos.createCommitStatus({ owner: context.repo.owner, repo: context.repo.repo, sha: `${SHA}`, state: 'success', target_url: `${REPORT_URL}`, context: `Build stats (release)`, }) ================================================ FILE: .github/workflows/periodic_pagebench.yml ================================================ name: Periodic pagebench performance test on unit-perf-aws-arm runners on: schedule: # * is a special character in YAML so you have to quote this string # ┌───────────── minute (0 - 59) # │ ┌───────────── hour (0 - 23) # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '0 */4 * * *' # Runs every 4 hours workflow_dispatch: # Allows manual triggering of the workflow inputs: 
commit_hash: type: string description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.' required: false default: '' recreate_snapshots: type: boolean description: 'Recreate snapshots - !!!WARNING!!! We should only recreate snapshots if the previous ones are no longer compatible. Otherwise benchmarking results are not comparable across runs.' required: false default: false defaults: run: shell: bash -euo pipefail {0} concurrency: group: ${{ github.workflow }} cancel-in-progress: false permissions: contents: read jobs: run_periodic_pagebench_test: permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write pull-requests: write runs-on: [ self-hosted, unit-perf-aws-arm ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init timeout-minutes: 360 # Set the timeout to 6 hours env: RUN_ID: ${{ github.run_id }} DEFAULT_PG_VERSION: 16 BUILD_TYPE: release RUST_BACKTRACE: 1 # NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS: 1 - doesn't work without root in container S3_BUCKET: neon-github-public-dev PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" steps: # we don't need the neon source code because we run everything remotely # however we still need the local github actions to run the allure step below - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Set up the environment which depends on $RUNNER_TEMP on nvme drive id: set-env shell: bash -euxo pipefail {0} run: | { echo "NEON_DIR=${RUNNER_TEMP}/neon" echo "NEON_BIN=${RUNNER_TEMP}/neon/bin" echo "POSTGRES_DISTRIB_DIR=${RUNNER_TEMP}/neon/pg_install" echo "LD_LIBRARY_PATH=${RUNNER_TEMP}/neon/pg_install/v${DEFAULT_PG_VERSION}/lib" echo "BACKUP_DIR=${RUNNER_TEMP}/instance_store/saved_snapshots" echo 
"TEST_OUTPUT=${RUNNER_TEMP}/neon/test_output" echo "PERF_REPORT_DIR=${RUNNER_TEMP}/neon/test_output/perf-report-local" echo "ALLURE_DIR=${RUNNER_TEMP}/neon/test_output/allure-results" echo "ALLURE_RESULTS_DIR=${RUNNER_TEMP}/neon/test_output/allure-results/results" } >> "$GITHUB_ENV" echo "allure_results_dir=${RUNNER_TEMP}/neon/test_output/allure-results/results" >> "$GITHUB_OUTPUT" - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # max 5 hours (needed in case commit hash is still being built) - name: Determine commit hash id: commit_hash shell: bash -euxo pipefail {0} env: INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }} run: | if [[ -z "${INPUT_COMMIT_HASH}" ]]; then COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha') echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT" echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV else COMMIT_HASH="${INPUT_COMMIT_HASH}" echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT" echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV fi - name: Checkout the neon repository at given commit hash uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: ref: ${{ steps.commit_hash.outputs.commit_hash }} # does not reuse ./.github/actions/download because we need to download the artifact for the given commit hash # example artifact # s3://neon-github-public-dev/artifacts/48b870bc078bd2c450eb7b468e743b9c118549bf/15036827400/1/neon-Linux-X64-release-artifact.tar.zst /instance_store/artifacts/neon-Linux-release-artifact.tar.zst - name: Determine artifact S3_KEY for given commit hash and download and extract artifact id: artifact_prefix shell: bash -euxo pipefail {0} env: ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ 
runner.arch }}-release-artifact.tar.zst COMMIT_HASH: ${{ env.COMMIT_HASH }} COMMIT_HASH_TYPE: ${{ env.COMMIT_HASH_TYPE }} run: | attempt=0 max_attempts=24 # 5 minutes * 24 = 2 hours while [[ $attempt -lt $max_attempts ]]; do # the following command will fail until the artifacts are available ... S3_KEY=$(aws s3api list-objects-v2 --bucket "$S3_BUCKET" --prefix "artifacts/$COMMIT_HASH/" \ | jq -r '.Contents[]?.Key' \ | grep "neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst" \ | sort --version-sort \ | tail -1) || true # ... thus ignore errors from the command if [[ -n "${S3_KEY}" ]]; then echo "Artifact found: $S3_KEY" echo "S3_KEY=$S3_KEY" >> $GITHUB_ENV break fi # Increment attempt counter and sleep for 5 minutes attempt=$((attempt + 1)) echo "Attempt $attempt of $max_attempts to find artifacts in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH failed. Retrying in 5 minutes..." sleep 300 # Sleep for 5 minutes done # No artifact after all retries: fail the step instead of continuing without binaries if [[ -z "${S3_KEY}" ]]; then echo >&2 "Error: artifact not found in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH after 2 hours" exit 1 else mkdir -p $(dirname $ARCHIVE) time aws s3 cp --only-show-errors s3://$S3_BUCKET/${S3_KEY} ${ARCHIVE} mkdir -p ${NEON_DIR} time tar -xf ${ARCHIVE} -C ${NEON_DIR} rm -f ${ARCHIVE} fi - name: Download snapshots from S3 if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.recreate_snapshots == 'false' || github.event.inputs.recreate_snapshots == '' }} id: download_snapshots shell: bash -euxo pipefail {0} run: | # Download the snapshots from S3 mkdir -p ${TEST_OUTPUT} mkdir -p $BACKUP_DIR cd $BACKUP_DIR mkdir parts cd parts PART=$(aws s3api list-objects-v2 --bucket $S3_BUCKET --prefix performance/pagebench/ \ | jq -r '.Contents[]?.Key' \ | grep -E 'shared-snapshots-[0-9]{4}-[0-9]{2}-[0-9]{2}' \ | sort \ | tail -1) echo "Latest PART: $PART" if [[ -z "$PART" ]]; then echo "ERROR: No matching S3 key found" >&2 exit 1 fi S3_KEY=$(dirname $PART) time aws s3 cp --only-show-errors --recursive
s3://${S3_BUCKET}/$S3_KEY/ . cd $TEST_OUTPUT time cat $BACKUP_DIR/parts/* | zstdcat | tar --extract --preserve-permissions rm -rf ${BACKUP_DIR} - name: Cache poetry deps uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} run: ./scripts/pysync # we need high number of open files for pagebench - name: show ulimits shell: bash -euxo pipefail {0} run: | ulimit -a - name: Run pagebench testcase shell: bash -euxo pipefail {0} env: CI: false # need to override this env variable set by github to enforce using snapshots run: | export PLATFORM=hetzner-unit-perf-${COMMIT_HASH_TYPE} # report the commit hash of the neon repository in the revision of the test results export GITHUB_SHA=${COMMIT_HASH} rm -rf ${PERF_REPORT_DIR} rm -rf ${ALLURE_RESULTS_DIR} mkdir -p ${PERF_REPORT_DIR} mkdir -p ${ALLURE_RESULTS_DIR} PARAMS="--alluredir=${ALLURE_RESULTS_DIR} --tb=short --verbose -rA" EXTRA_PARAMS="--out-dir ${PERF_REPORT_DIR} --durations-path $TEST_OUTPUT/benchmark_durations.json" # run only two selected tests # environment set by parent: # RUST_BACKTRACE=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_throughput_with_n_tenants ${EXTRA_PARAMS} ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant ${EXTRA_PARAMS} - name: upload the performance metrics to the Neon performance database which is used by grafana dashboards to display the results shell: bash -euxo pipefail {0} run: | export REPORT_FROM="$PERF_REPORT_DIR" export GITHUB_SHA=${COMMIT_HASH} time ./scripts/generate_and_push_perf_report.sh - 
name: Upload test results if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-store with: report-dir: ${{ steps.set-env.outputs.allure_results_dir }} unique-key: ${{ env.BUILD_TYPE }}-${{ env.DEFAULT_PG_VERSION }}-${{ runner.arch }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Allure report id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Upload snapshots if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.recreate_snapshots != 'false' && github.event.inputs.recreate_snapshots != '' }} id: upload_snapshots shell: bash -euxo pipefail {0} run: | mkdir -p $BACKUP_DIR cd $TEST_OUTPUT tar --create --preserve-permissions --file - shared-snapshots | zstd -o $BACKUP_DIR/shared_snapshots.tar.zst cd $BACKUP_DIR mkdir parts split -b 1G shared_snapshots.tar.zst ./parts/shared_snapshots.tar.zst.part. SNAPSHOT_DATE=$(date +%F) # YYYY-MM-DD cd parts time aws s3 cp --recursive . 
s3://${S3_BUCKET}/performance/pagebench/shared-snapshots-${SNAPSHOT_DATE}/ - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} - name: Cleanup Test Resources if: always() shell: bash -euxo pipefail {0} env: ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst run: | # Cleanup the test resources if [[ -d "${BACKUP_DIR}" ]]; then rm -rf ${BACKUP_DIR} fi if [[ -d "${TEST_OUTPUT}" ]]; then rm -rf ${TEST_OUTPUT} fi if [[ -d "${NEON_DIR}" ]]; then rm -rf ${NEON_DIR} fi rm -rf $(dirname $ARCHIVE) ================================================ FILE: .github/workflows/pg-clients.yml ================================================ name: Test Postgres client libraries on: schedule: # * is a special character in YAML so you have to quote this string # ┌───────────── minute (0 - 59) # │ ┌───────────── hour (0 - 23) # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '23 02 * * *' # run once a day, timezone is utc pull_request: paths: - '.github/workflows/pg-clients.yml' - 'test_runner/pg_clients/**/*.py' - 'test_runner/logical_repl/**/*.py' - 'poetry.lock' workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref_name }} cancel-in-progress: ${{ github.event_name == 'pull_request' }} defaults: run: shell: bash -euxo pipefail {0} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write # require for posting a status update env: DEFAULT_PG_VERSION: 17 PLATFORM: neon-captest-new 
AWS_DEFAULT_REGION: eu-central-1 jobs: check-permissions: if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} uses: ./.github/workflows/check-permissions.yml with: github-event-name: ${{ github.event_name }} build-build-tools-image: permissions: packages: write needs: [ check-permissions ] uses: ./.github/workflows/build-build-tools-image.yml secrets: inherit generate-ch-tmppw: runs-on: ubuntu-22.04 outputs: tmp_val: ${{ steps.pwgen.outputs.tmp_val }} steps: - name: Generate a random password id: pwgen run: | set +x p=$(dd if=/dev/random bs=14 count=1 2>/dev/null | base64) echo tmp_val="${p//\//}" >> "${GITHUB_OUTPUT}" test-logical-replication: needs: [ build-build-tools-image, generate-ch-tmppw ] runs-on: ubuntu-22.04 container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init --user root services: clickhouse: image: clickhouse/clickhouse-server:25.6 env: CLICKHOUSE_PASSWORD: ${{ needs.generate-ch-tmppw.outputs.tmp_val }} PGSSLCERT: /tmp/postgresql.crt ports: - 9000:9000 - 8123:8123 zookeeper: image: quay.io/debezium/zookeeper:3.1.3.Final ports: - 2181:2181 - 2888:2888 - 3888:3888 kafka: image: quay.io/debezium/kafka:3.1.3.Final env: ZOOKEEPER_CONNECT: "zookeeper:2181" KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 KAFKA_BROKER_ID: 1 KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 KAFKA_JMX_PORT: 9991 ports: - 9092:9092 debezium: image: quay.io/debezium/connect:3.1.3.Final env: BOOTSTRAP_SERVERS: kafka:9092 GROUP_ID: 1 CONFIG_STORAGE_TOPIC: debezium-config OFFSET_STORAGE_TOPIC: debezium-offset STATUS_STORAGE_TOPIC: debezium-status DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector ports: - 8083:8083 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: 
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project uses: ./.github/actions/neon-project-create with: api_key: ${{ secrets.NEON_STAGING_API_KEY }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} project_settings: >- {"enable_logical_replication": true} - name: Run tests uses: ./.github/actions/run-python-test-set with: build_type: remote test_selection: logical_repl run_in_parallel: false extra_params: -m remote_cluster pg_version: ${{ env.DEFAULT_PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} CLICKHOUSE_PASSWORD: ${{ needs.generate-ch-tmppw.outputs.tmp_val }} - name: Delete Neon Project if: always() uses: ./.github/actions/neon-project-delete with: project_id: ${{ steps.create-neon-project.outputs.project_id }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Create Allure report if: ${{ !cancelled() }} id: create-allure-report uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - name: Post to a Slack channel if: github.event.schedule && failure() uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>) env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} test-postgres-client-libs: needs: [ build-build-tools-image ] 
runs-on: ubuntu-22.04 container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init --user root steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project uses: ./.github/actions/neon-project-create with: api_key: ${{ secrets.NEON_STAGING_API_KEY }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} - name: Run tests uses: ./.github/actions/run-python-test-set with: build_type: remote test_selection: pg_clients run_in_parallel: false extra_params: -m remote_cluster pg_version: ${{ env.DEFAULT_PG_VERSION }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} - name: Delete Neon Project if: always() uses: ./.github/actions/neon-project-delete with: project_id: ${{ steps.create-neon-project.outputs.project_id }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Create Allure report if: ${{ !cancelled() }} id: create-allure-report uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - name: Post to a Slack channel if: github.event.schedule && failure() uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 with: channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Testing Postgres clients: <${{ github.server_url }}/${{ 
github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>) env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} ================================================ FILE: .github/workflows/pin-build-tools-image.yml ================================================ name: 'Pin build-tools image' on: workflow_dispatch: inputs: from-tag: description: 'Source tag' required: true type: string force: description: 'Force the image to be pinned' default: false type: boolean workflow_call: inputs: from-tag: description: 'Source tag' required: true type: string force: description: 'Force the image to be pinned' default: false type: boolean defaults: run: shell: bash -euo pipefail {0} concurrency: group: pin-build-tools-image-${{ inputs.from-tag }} cancel-in-progress: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} jobs: check-manifests: runs-on: ubuntu-22.04 outputs: skip: ${{ steps.check-manifests.outputs.skip }} steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 with: egress-policy: audit - name: Check if we really need to pin the image id: check-manifests env: FROM_TAG: ${{ inputs.from-tag }} TO_TAG: pinned run: | docker manifest inspect "ghcr.io/neondatabase/build-tools:${FROM_TAG}" > "${FROM_TAG}.json" docker manifest inspect "ghcr.io/neondatabase/build-tools:${TO_TAG}" > "${TO_TAG}.json" if diff "${FROM_TAG}.json" "${TO_TAG}.json"; then skip=true else skip=false fi echo "skip=${skip}" | tee -a $GITHUB_OUTPUT tag-image: needs: check-manifests # use format(..) 
to catch both inputs.force = true AND inputs.force = 'true' if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true' permissions: id-token: write # Required for aws/azure login packages: write # required for pushing to GHCR uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: | { "ghcr.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [ "docker.io/neondatabase/build-tools:pinned-bullseye", "ghcr.io/neondatabase/build-tools:pinned-bullseye", "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bullseye", "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bullseye" ], "ghcr.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [ "docker.io/neondatabase/build-tools:pinned-bookworm", "docker.io/neondatabase/build-tools:pinned", "ghcr.io/neondatabase/build-tools:pinned-bookworm", "ghcr.io/neondatabase/build-tools:pinned", "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bookworm", "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned", "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bookworm", "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned" ] } aws-region: ${{ vars.AWS_ECR_REGION }} aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} secrets: inherit ================================================ FILE: .github/workflows/pre-merge-checks.yml ================================================ name: Pre-merge checks on: pull_request: paths: - .github/workflows/_check-codestyle-python.yml - 
.github/workflows/_check-codestyle-rust.yml
      - .github/workflows/build-build-tools-image.yml
      - .github/workflows/pre-merge-checks.yml
  merge_group:

defaults:
  run:
    shell: bash -euxo pipefail {0}

# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}

jobs:
  # Computes which source sets changed and, for merge-group runs, extracts the
  # target branch and PR number from the merge queue ref.
  meta:
    runs-on: ubuntu-22.04
    permissions:
      contents: read
    outputs:
      python-changed: ${{ steps.python-src.outputs.any_changed }}
      rust-changed: ${{ steps.rust-src.outputs.any_changed }}
      branch: ${{ steps.group-metadata.outputs.branch }}
      pr-number: ${{ steps.group-metadata.outputs.pr-number }}
    steps:
      - name: Harden the runner (Audit all outbound calls)
        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
        with:
          egress-policy: audit

      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        id: python-src
        with:
          files: |
            .github/workflows/_check-codestyle-python.yml
            .github/workflows/build-build-tools-image.yml
            .github/workflows/pre-merge-checks.yml
            **/**.py
            poetry.lock
            pyproject.toml

      - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        id: rust-src
        with:
          files: |
            .github/workflows/_check-codestyle-rust.yml
            .github/workflows/build-build-tools-image.yml
            .github/workflows/pre-merge-checks.yml
            **/**.rs
            **/Cargo.toml
            Cargo.toml
            Cargo.lock

      - name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES
        env:
          PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }}
          RUST_CHANGED_FILES: ${{ steps.rust-src.outputs.all_changed_files }}
        run: |
          echo "${PYTHON_CHANGED_FILES}"
          echo "${RUST_CHANGED_FILES}"

      - name: Merge group metadata
        if: ${{ github.event_name == 'merge_group' }}
        id: group-metadata
        env:
          MERGE_QUEUE_REF: ${{ github.event.merge_group.head_ref }}
        run: |
          # The named groups are required: `capture` returns an object keyed by
          # group name, and the expression reads `.branch` and `.pr_number`
          # from it to emit the `branch=` / `pr-number=` step outputs.
          echo $MERGE_QUEUE_REF | jq -Rr 'capture("refs/heads/gh-readonly-queue/(?<branch>.*)/pr-(?<pr_number>[0-9]+)-[0-9a-f]{40}") | ["branch=" + .branch,
"pr-number=" + .pr_number] | .[]' | tee -a "${GITHUB_OUTPUT}" build-build-tools-image: if: | false || needs.meta.outputs.python-changed == 'true' || needs.meta.outputs.rust-changed == 'true' needs: [ meta ] permissions: contents: read packages: write uses: ./.github/workflows/build-build-tools-image.yml with: # Build only one combination to save time archs: '["x64"]' debians: '["bookworm"]' secrets: inherit check-codestyle-python: if: needs.meta.outputs.python-changed == 'true' needs: [ meta, build-build-tools-image ] permissions: contents: read packages: read uses: ./.github/workflows/_check-codestyle-python.yml with: # `-bookworm-x64` suffix should match the combination in `build-build-tools-image` build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64 secrets: inherit check-codestyle-rust: if: needs.meta.outputs.rust-changed == 'true' needs: [ meta, build-build-tools-image ] permissions: contents: read packages: read uses: ./.github/workflows/_check-codestyle-rust.yml with: # `-bookworm-x64` suffix should match the combination in `build-build-tools-image` build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64 archs: '["x64"]' secrets: inherit # To get items from the merge queue merged into main we need to satisfy "Status checks that are required". 
# Currently we require 2 jobs (checks with exact name): # - conclusion # - neon-cloud-e2e conclusion: # Do not run job on Pull Requests as it interferes with the `conclusion` job from the `build_and_test` workflow if: always() && github.event_name == 'merge_group' permissions: statuses: write # for `github.repos.createCommitStatus(...)` contents: write needs: - meta - check-codestyle-python - check-codestyle-rust runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Create fake `neon-cloud-e2e` check uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 script: | const { repo, owner } = context.repo; const targetUrl = `${context.serverUrl}/${owner}/${repo}/actions/runs/${context.runId}`; await github.rest.repos.createCommitStatus({ owner: owner, repo: repo, sha: context.sha, context: `neon-cloud-e2e`, state: `success`, target_url: targetUrl, description: `fake check for merge queue`, }); - name: Fail the job if any of the dependencies do not succeed or skipped run: exit 1 if: | false || (github.event_name == 'merge_group' && needs.meta.outputs.branch != 'main') || (needs.check-codestyle-python.result == 'skipped' && needs.meta.outputs.python-changed == 'true') || (needs.check-codestyle-rust.result == 'skipped' && needs.meta.outputs.rust-changed == 'true') || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') - name: Add fast-forward label to PR to trigger fast-forward merge if: >- ${{ always() && github.event_name == 'merge_group' && contains(fromJSON('["release", "release-proxy", "release-compute"]'), needs.meta.outputs.branch) }} env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: >- gh pr edit ${{ needs.meta.outputs.pr-number }} --repo "${GITHUB_REPOSITORY}" 
--add-label "fast-forward" ================================================ FILE: .github/workflows/proxy-benchmark.yml ================================================ name: Periodic proxy performance test on unit-perf-aws-arm runners on: push: # TODO: remove after testing branches: - test-proxy-bench # Runs on pushes to test-proxy-bench branch # schedule: # * is a special character in YAML so you have to quote this string # ┌───────────── minute (0 - 59) # │ ┌───────────── hour (0 - 23) # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) # - cron: '0 5 * * *' # Runs at 5 UTC once a day workflow_dispatch: # adds an ability to run this manually defaults: run: shell: bash -euo pipefail {0} concurrency: group: ${{ github.workflow }} cancel-in-progress: false permissions: contents: read jobs: run_periodic_proxybench_test: permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write pull-requests: write runs-on: [ self-hosted, unit-perf-aws-arm ] timeout-minutes: 60 # 1h timeout container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Checkout proxy-bench Repo uses: actions/checkout@v4 with: repository: neondatabase/proxy-bench path: proxy-bench - name: Set up the environment which depends on $RUNNER_TEMP on nvme drive id: set-env shell: bash -euxo pipefail {0} run: | PROXY_BENCH_PATH=$(realpath ./proxy-bench) { echo "PROXY_BENCH_PATH=$PROXY_BENCH_PATH" echo "NEON_DIR=${RUNNER_TEMP}/neon" echo "NEON_PROXY_PATH=${RUNNER_TEMP}/neon/bin/proxy" echo "TEST_OUTPUT=${PROXY_BENCH_PATH}/test_output" echo "" } >> "$GITHUB_ENV" - name: Cache poetry deps uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - 
name: Install Python deps
        shell: bash -euxo pipefail {0}
        run: ./scripts/pysync

      - name: show ulimits
        shell: bash -euxo pipefail {0}
        run: |
          ulimit -a

      - name: Run proxy-bench
        working-directory: ${{ env.PROXY_BENCH_PATH }}
        run: ./run.sh --with-grafana --bare-metal

      - name: Ingest Bench Results
        if: always()
        working-directory: ${{ env.NEON_DIR }}
        run: |
          mkdir -p $TEST_OUTPUT
          python $NEON_DIR/scripts/proxy_bench_results_ingest.py --out $TEST_OUTPUT

      - name: Push Metrics to Proxy perf database
        shell: bash -euxo pipefail {0}
        if: always()
        env:
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PROXY_TEST_RESULT_CONNSTR }}"
          # Values in a step's `env:` map are not shell-expanded, so a bare
          # `$TEST_OUTPUT` would be passed as that literal string. Use the
          # `env` context instead, like `working-directory` below does.
          REPORT_FROM: ${{ env.TEST_OUTPUT }}
        working-directory: ${{ env.NEON_DIR }}
        run: $NEON_DIR/scripts/generate_and_push_perf_report.sh

      - name: Notify Failure
        if: failure()
        run: echo "Proxy bench job failed" && exit 1

      - name: Cleanup Test Resources
        if: always()
        shell: bash -euxo pipefail {0}
        run: |
          # Cleanup the test resources
          if [[ -d "${TEST_OUTPUT}" ]]; then
            rm -rf ${TEST_OUTPUT}
          fi
          if [[ -d "${PROXY_BENCH_PATH}/test_output" ]]; then
            rm -rf ${PROXY_BENCH_PATH}/test_output
          fi

================================================
FILE: .github/workflows/random-ops-test.yml
================================================
name: Random Operations Test

on:
  schedule:
    # * is a special character in YAML so you have to quote this string
    #        ┌───────────── minute (0 - 59)
    #        │ ┌───────────── hour (0 - 23)
    #        │ │ ┌───────────── day of the month (1 - 31)
    #        │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
    #        │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
    - cron: '23 */2 * * *' # runs every 2 hours
  workflow_dispatch:
    inputs:
      random_seed:
        type: number
        description: 'The random seed'
        required: false
        default: 0
      num_operations:
        type: number
        description: "The number of operations to test"
        default: 250

defaults:
  run:
    shell: bash -euxo pipefail {0}

permissions: {}

env:
  DEFAULT_PG_VERSION: 16
  PLATFORM: neon-captest-new
  AWS_DEFAULT_REGION: eu-central-1

jobs:
  run-random-rests:
    env:
      POSTGRES_DISTRIB_DIR:
/tmp/neon/pg_install runs-on: small permissions: id-token: write statuses: write strategy: fail-fast: false matrix: pg-version: [16, 17] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Download Neon artifact uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Run tests uses: ./.github/actions/run-python-test-set with: build_type: remote test_selection: random_ops run_in_parallel: false extra_params: -m remote_cluster pg_version: ${{ matrix.pg-version }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} RANDOM_SEED: ${{ inputs.random_seed }} NUM_OPERATIONS: ${{ inputs.num_operations }} - name: Create Allure report if: ${{ !cancelled() }} id: create-allure-report uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} ================================================ FILE: .github/workflows/regenerate-pg-setting.yml ================================================ name: Regenerate Postgres Settings on: pull_request: types: - opened - synchronize - reopened paths: - pgxn/neon/**.c - vendor/postgres-v* - vendor/revisions.json concurrency: group: ${{ github.workflow }}-${{ github.head_ref }} cancel-in-progress: true permissions: pull-requests: write jobs: regenerate-pg-settings: runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all 
outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Add comment uses: thollander/actions-comment-pull-request@65f9e5c9a1f2cd378bd74b2e057c9736982a8e74 # v3 with: comment-tag: ${{ github.job }} pr-number: ${{ github.event.number }} message: | If this PR added a GUC in the Postgres fork or `neon` extension, please regenerate the Postgres settings in the `cloud` repo: ``` make NEON_WORKDIR=path/to/neon/checkout \ -C goapp/internal/shareddomain/postgres generate ``` If you're an external contributor, a Neon employee will assist in making sure this step is done. ================================================ FILE: .github/workflows/release-compute.yml ================================================ name: Create compute release PR on: schedule: - cron: '0 7 * * FRI' jobs: create-release-pr: uses: ./.github/workflows/release.yml with: component: compute secrets: inherit ================================================ FILE: .github/workflows/release-notify.yml ================================================ name: Notify Slack channel about upcoming release concurrency: group: ${{ github.workflow }}-${{ github.event.number }} cancel-in-progress: true on: pull_request: branches: - release types: # Default types that triggers a workflow: # - https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request - opened - synchronize - reopened # Additional types that we want to handle: - closed jobs: notify: runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: neondatabase/dev-actions/release-pr-notify@483a843f2a8bcfbdc4c69d27630528a3ddc4e14b # main with: slack-token: ${{ secrets.SLACK_BOT_TOKEN }} slack-channel-id: ${{ vars.SLACK_UPCOMING_RELEASE_CHANNEL_ID || 'C05QQ9J1BRC' }} # if not set, then 
`#test-release-notifications` github-token: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .github/workflows/release-proxy.yml ================================================ name: Create proxy release PR on: schedule: - cron: '0 6 * * TUE' jobs: create-release-pr: uses: ./.github/workflows/release.yml with: component: proxy secrets: inherit ================================================ FILE: .github/workflows/release-storage.yml ================================================ name: Create storage release PR on: schedule: - cron: '0 6 * * FRI' jobs: create-release-pr: uses: ./.github/workflows/release.yml with: component: storage secrets: inherit ================================================ FILE: .github/workflows/release.yml ================================================ name: Create release PR on: workflow_dispatch: inputs: component: description: "Component to release" required: true type: choice options: - compute - proxy - storage cherry-pick: description: "Commits to cherry-pick (space separated, makes this a hotfix based on previous release)" required: false type: string default: '' workflow_call: inputs: component: description: "Component to release" required: true type: string cherry-pick: description: "Commits to cherry-pick (space separated, makes this a hotfix based on previous release)" required: false type: string default: '' # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
permissions: {} defaults: run: shell: bash -euo pipefail {0} jobs: create-release-pr: runs-on: ubuntu-22.04 permissions: contents: write steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 - name: Configure git run: | git config user.name "github-actions[bot]" git config user.email "41898282+github-actions[bot]@users.noreply.github.com" - name: Create release PR uses: neondatabase/dev-actions/release-pr@290dec821d86fa8a93f019e8c69720f5865b5677 with: component: ${{ inputs.component }} cherry-pick: ${{ inputs.cherry-pick }} env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} ================================================ FILE: .github/workflows/report-workflow-stats-batch.yml ================================================ name: Report Workflow Stats Batch on: schedule: - cron: '*/15 * * * *' - cron: '25 0 * * *' - cron: '25 1 * * 6' permissions: contents: read jobs: gh-workflow-stats-batch-2h: name: GitHub Workflow Stats Batch 2 hours if: github.event.schedule == '*/15 * * * *' runs-on: ubuntu-22.04 permissions: actions: read steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Export Workflow Run for the past 2 hours uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2 with: db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} db_table: "gh_workflow_stats_neon" gh_token: ${{ secrets.GITHUB_TOKEN }} duration: '2h' gh-workflow-stats-batch-48h: name: GitHub Workflow Stats Batch 48 hours if: github.event.schedule == '25 0 * * *' runs-on: ubuntu-22.04 permissions: actions: read steps: - name: Harden the runner (Audit all outbound calls) uses: 
step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Export Workflow Run for the past 48 hours uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2 with: db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} db_table: "gh_workflow_stats_neon" gh_token: ${{ secrets.GITHUB_TOKEN }} duration: '48h' gh-workflow-stats-batch-30d: name: GitHub Workflow Stats Batch 30 days if: github.event.schedule == '25 1 * * 6' runs-on: ubuntu-22.04 permissions: actions: read steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit - name: Export Workflow Run for the past 30 days uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2 with: db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} db_table: "gh_workflow_stats_neon" gh_token: ${{ secrets.GITHUB_TOKEN }} duration: '720h' ================================================ FILE: .github/workflows/trigger-e2e-tests.yml ================================================ name: Trigger E2E Tests on: pull_request: types: - ready_for_review workflow_call: inputs: github-event-name: type: string required: true github-event-json: type: string required: true defaults: run: shell: bash -euxo pipefail {0} env: # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} jobs: check-permissions: if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} uses: ./.github/workflows/check-permissions.yml with: github-event-name: ${{ inputs.github-event-name || github.event_name }} cancel-previous-e2e-tests: needs: [ check-permissions ] if: github.event_name == 'pull_request' runs-on: 
ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 with: egress-policy: audit - name: Cancel previous e2e-tests runs for this PR env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | gh workflow --repo neondatabase/cloud \ run cancel-previous-in-concurrency-group.yml \ --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" meta: uses: ./.github/workflows/_meta.yml with: github-event-name: ${{ inputs.github-event-name || github.event_name }} github-event-json: ${{ inputs.github-event-json || toJSON(github.event) }} trigger-e2e-tests: needs: [ meta ] runs-on: ubuntu-22.04 env: EVENT_ACTION: ${{ github.event.action }} GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} TAG: >- ${{ contains(fromJSON('["compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) && needs.meta.outputs.previous-storage-release || needs.meta.outputs.build-tag }} COMPUTE_TAG: >- ${{ contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) && needs.meta.outputs.previous-compute-release || needs.meta.outputs.build-tag }} steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 with: egress-policy: audit - name: Wait for `push-{neon,compute}-image-dev` job to finish # It's important to have a timeout here, the script in the step can run infinitely timeout-minutes: 60 run: | if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then exit 0 fi # For PRs we use the run id as the tag BUILD_AND_TEST_RUN_ID=${TAG} while true; do gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '[.jobs[] | select((.name | startswith("push-neon-image-dev")) or (.name | startswith("push-compute-image-dev"))) | {"name": .name, "conclusion": .conclusion, "url": .url}]' > jobs.json if [ $(jq 
'[.[] | select(.conclusion == "success")] | length' jobs.json) -eq 2 ]; then
              break
            fi

            jq -c '.[]' jobs.json | while read -r job; do
              # `-r` prints the raw string (failure) rather than its JSON
              # encoding ("failure"); without it none of the bare patterns
              # below can match and a failed job is never detected here.
              case $(echo $job | jq -r .conclusion) in
                failure | cancelled | skipped)
                  echo "The '$(echo $job | jq .name)' job didn't succeed: '$(echo $job | jq .conclusion)'. See log in '$(echo $job | jq .url)' Exiting..."
                  exit 1
                  ;;
              esac
            done

            echo "The 'push-{neon,compute}-image-dev' jobs haven't succeeded yet. Waiting..."
            sleep 60
          done

      - name: Set e2e-platforms
        id: e2e-platforms
        env:
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Default set of platforms to run e2e tests on
          platforms='["docker", "k8s"]'

          # If a PR changes anything that affects computes, add k8s-neonvm to the list of platforms.
          # If the workflow run is not a pull request, add k8s-neonvm to the list.
          if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
            for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
              case "$f" in
                # List of directories that contain code which affect compute images.
                #
                # This isn't exhaustive, just the paths that are most directly compute-related.
                # For example, compute_ctl also depends on libs/utils, but we don't trigger
                # an e2e run on that.
                vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/compute-node.Dockerfile)
                  platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
                  ;;
                *)
                  # no-op
                  ;;
              esac
            done
          else
            platforms=$(echo "${platforms}" | jq --compact-output '.
+= ["k8s-neonvm"] | unique') fi echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT - name: Set PR's status to pending and request a remote CI test env: E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud" gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \ --method POST \ --raw-field "state=pending" \ --raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \ --raw-field "context=neon-cloud-e2e" gh workflow --repo ${REMOTE_REPO} \ run testing.yml \ --ref "main" \ --raw-field "ci_job_name=neon-cloud-e2e" \ --raw-field "commit_hash=$COMMIT_SHA" \ --raw-field "remote_repo=${GITHUB_REPOSITORY}" \ --raw-field "storage_image_tag=${TAG}" \ --raw-field "compute_image_tag=${COMPUTE_TAG}" \ --raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \ --raw-field "e2e-platforms=${E2E_PLATFORMS}" ================================================ FILE: .gitignore ================================================ /artifact_cache /build /pg_install /target /tmp_check /tmp_check_cli __pycache__/ test_output/ neon_previous/ .vscode .idea *.swp tags neon.iml /.neon /integration_tests/.neon compaction-suite-results.* docker-compose/docker-compose-parallel.yml # Coverage *.profraw *.profdata *.key *.crt *.o *.so *.Po *.pid # pgindent typedef lists *.list # Node **/node_modules/ # various files for local testing /proxy/.subzero local_proxy.json ================================================ FILE: .gitmodules ================================================ [submodule "vendor/postgres-v14"] path = vendor/postgres-v14 url = ../postgres.git branch = REL_14_STABLE_neon [submodule "vendor/postgres-v15"] path = vendor/postgres-v15 url = ../postgres.git branch = REL_15_STABLE_neon [submodule "vendor/postgres-v16"] path = vendor/postgres-v16 url = ../postgres.git branch = REL_16_STABLE_neon 
[submodule "vendor/postgres-v17"] path = vendor/postgres-v17 url = ../postgres.git branch = REL_17_STABLE_neon ================================================ FILE: .neon_clippy_args ================================================ # * `-A unknown_lints` – do not warn about unknown lint suppressions # that people with newer toolchains might use # * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) # * `-D clippy::todo` - don't let `todo!()` slip into `main` export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings -D clippy::todo" ================================================ FILE: CODEOWNERS ================================================ # Autoscaling /libs/vm_monitor/ @neondatabase/autoscaling # DevProd & PerfCorr /.github/ @neondatabase/developer-productivity @neondatabase/performance-correctness # Compute /pgxn/ @neondatabase/compute /vendor/ @neondatabase/compute /compute/ @neondatabase/compute /compute_tools/ @neondatabase/compute # Proxy /libs/proxy/ @neondatabase/proxy /proxy/ @neondatabase/proxy # Storage /pageserver/ @neondatabase/storage /safekeeper/ @neondatabase/storage /storage_controller @neondatabase/storage /storage_scrubber @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage /libs/remote_storage/ @neondatabase/storage /libs/safekeeper_api/ @neondatabase/storage # Shared /pgxn/neon/ @neondatabase/compute @neondatabase/storage /libs/compute_api/ @neondatabase/compute @neondatabase/control-plane /libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage ================================================ FILE: CONTRIBUTING.md ================================================ # How to contribute Howdy! Usual good software engineering practices apply. Write tests. Write comments. Follow standard Rust coding practices where possible. Use `cargo fmt` and `cargo clippy` to tidy up formatting. 
There are soft spots in the code, which could use cleanup, refactoring, additional comments, and so forth. Let's try to raise the bar, and clean things up as we go. Try to leave code in a better shape than it was before. ## Pre-commit hook We have a sample pre-commit hook in `pre-commit.py`. To set it up, run: ```bash ln -s ../../pre-commit.py .git/hooks/pre-commit ``` This will run the following checks on staged files before each commit: - `rustfmt` - checks for Python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks). There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date. If you want to skip the hook, run `git commit` with the `--no-verify` option. ## Submitting changes 1. Get at least one +1 on your PR before you push. For simple patches, it will only take a minute for someone to review it. 2. Don't force push small changes after making the PR ready for review. Doing so will force readers to re-read your entire PR, which will delay the review process. 3. Always keep the CI green. Do not push if the CI failed on your PR, even if you think it's not your patch's fault. Help to fix the root cause if something else has broken the CI, before pushing. *Happy Hacking!* # How to run a CI pipeline on Pull Requests from external contributors _An instruction for maintainers_ ## TL;DR: - Review the PR - If and only if it looks **safe** (i.e. it doesn't contain any malicious code which could expose secrets or harm the CI), then: - Press the "Approve and run" button in GitHub UI - Add the `approved-for-ci-run` label to the PR - Currently, draft PRs skip e2e tests (only for internal contributors).
After the PR is marked 'Ready for review', CI will trigger the e2e tests - Add the `run-e2e-tests-in-draft` label to run e2e tests in a draft PR (overrides the behaviour above) - The `approved-for-ci-run` workflow will add `run-e2e-tests-in-draft` automatically to run e2e tests for external contributors Repeat all steps after any change to the PR. - When the changes are ready to get merged — merge the original PR (not the internal one) ## Longer version: GitHub Actions triggered by the `pull_request` event don't share repository secrets with the forks (for security reasons). So, passing the CI pipeline on Pull Requests from external contributors is impossible. We're using the following approach to make it work: - After the review, assign the `approved-for-ci-run` label to the PR if changes look safe - A GitHub Action will create an internal branch and a new PR with the same changes (for example, for a PR `#1234`, it'll be a branch `ci-run/pr-1234`) - Because the PR is created from the internal branch, it is able to access repository secrets (that's why it's crucial to make sure that the PR doesn't contain any malicious code that could expose our secrets or intentionally harm the CI) - The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review) For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml) ## How do I make build-tools image "pinned" It's possible to update the `pinned` tag of the `build-tools` image using the `pin-build-tools-image.yml` workflow.
```bash gh workflow -R neondatabase/neon run pin-build-tools-image.yml \ -f from-tag=cc98d9b00d670f182c507ae3783342bd7e64c31e ``` ================================================ FILE: Cargo.toml ================================================ [workspace] resolver = "2" members = [ "compute_tools", "control_plane", "control_plane/storcon_cli", "pageserver", "pageserver/compaction", "pageserver/ctl", "pageserver/client", "pageserver/client_grpc", "pageserver/pagebench", "pageserver/page_api", "proxy", "safekeeper", "safekeeper/client", "storage_broker", "storage_controller", "storage_controller/client", "storage_scrubber", "workspace_hack", "libs/compute_api", "libs/http-utils", "libs/pageserver_api", "libs/postgres_ffi", "libs/postgres_ffi_types", "libs/postgres_versioninfo", "libs/safekeeper_api", "libs/desim", "libs/neon-shmem", "libs/utils", "libs/consumption_metrics", "libs/postgres_backend", "libs/posthog_client_lite", "libs/pq_proto", "libs/tenant_size_model", "libs/metrics", "libs/postgres_connection", "libs/remote_storage", "libs/tracing-utils", "libs/postgres_ffi/wal_craft", "libs/vm_monitor", "libs/walproposer", "libs/wal_decoder", "libs/postgres_initdb", "libs/proxy/json", "libs/proxy/postgres-protocol2", "libs/proxy/postgres-types2", "libs/proxy/subzero_core", "libs/proxy/tokio-postgres2", "endpoint_storage", "pgxn/neon/communicator", ] [workspace.package] edition = "2024" license = "Apache-2.0" ## All dependency versions, used in the project [workspace.dependencies] ahash = "0.8" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.7" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" flate2 = "1.0.26" assert-json-diff = "2" async-stream = "0.3" async-trait = "0.1" aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] } aws-sdk-s3 = "1.52" aws-sdk-iam = "1.46.0" aws-sdk-kms = "1.47.0" aws-smithy-async = { version = "1.2.1", default-features = false, 
features=["rt-tokio"] } aws-smithy-types = "1.2" aws-credential-types = "1.2.0" aws-sigv4 = { version = "1.2", features = ["sign-http"] } aws-types = "1.3" axum = { version = "0.8.1", features = ["ws"] } axum-extra = { version = "0.10.0", features = ["typed-header", "query"] } base64 = "0.22" bincode = "1.3" bindgen = "0.71" bit_field = "0.10.2" bstr = "1.0" byteorder = "1.4" bytes = "1.9" camino = "1.1.6" cfg-if = "1.0.0" cron = "0.15" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive", "env"] } clashmap = { version = "1.0", features = ["raw-api"] } comfy-table = "7.1" const_format = "0.2" crc32c = "0.6" diatomic-waker = { version = "0.2.3" } either = "1.8" enum-map = "2.4.2" enumset = "1.0.12" fail = "0.5.0" fallible-iterator = "0.2" framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" } futures = "0.3" futures-core = "0.3" futures-util = "0.3" git-version = "0.3" governor = "0.8" hashbrown = "0.14" hashlink = "0.9.1" hdrhistogram = "7.5.2" hex = "0.4" hex-literal = "0.4" hmac = "0.12.1" hostname = "0.4" http = {version = "1.1.0", features = ["std"]} http-types = { version = "2", default-features = false } http-body-util = "0.1.2" humantime = "2.2" humantime-serde = "1.1.1" hyper0 = { package = "hyper", version = "0.14" } hyper = "1.4" hyper-util = "0.1" tokio-tungstenite = "0.21.0" indexmap = { version = "2", features = ["serde"] } indoc = "2" ipnet = "2.10.0" itertools = "0.10" itoa = "1.0.11" jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] } jsonwebtoken = "9" lasso = "0.7" libc = "0.2" lock_api = "0.4.13" md5 = "0.7.0" measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } moka = { version = "0.12", features = ["sync"] } nix = { version = "0.30.1", features = ["dir", "fs", "mman", "process", "socket", "signal", "poll"] } # Do not update to >= 7.0.0, at least. 
The update will have a significant impact # on compute startup metrics (start_postgres_ms), >= 25% degradation. notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.19" once_cell = "1.13" opentelemetry = "0.30" opentelemetry_sdk = "0.30" opentelemetry-otlp = { version = "0.30", default-features = false, features = ["http-proto", "trace", "http", "reqwest-blocking-client"] } opentelemetry-semantic-conventions = "0.30" parking_lot = "0.12" parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pem = "3.0.3" pin-project-lite = "0.2" pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] } procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.13.5" prost-types = "0.13.5" rand = "0.9" # Remove after p256 is updated to 0.14. rand_core = "=0.6" redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_30"] } reqwest-middleware = "0.4" reqwest-retry = "0.7" routerify = "3" rpds = "0.13" rustc-hash = "2.1.1" rustls = { version = "0.23.16", default-features = false } rustls-pemfile = "2" rustls-pki-types = "1.11" scopeguard = "1.1" sysinfo = "0.29.2" sd-notify = "0.4.1" send-future = "0.1.0" sentry = { version = "0.37", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_path_to_error = "0.1" serde_with = { version = "3", features = [ "base64" ] } serde_assert = "0.5.0" serde_repr = "0.1.20" sha2 = "0.10.2" signal-hook = "0.3" smallvec = "1.11" smol_str = { version = "0.2.0", features = ["serde"] } socket2 = "0.5" spki = "0.7.3" strum = "0.26" 
strum_macros = "0.26" "subtle" = "2.5.0" svg_fmt = "0.4.3" sync_wrapper = "0.1.2" tar = "0.4" test-context = "0.3" thiserror = "1.0" tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] } tokio = { version = "1.43.1", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.12.0" tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]} tokio-stream = { version = "0.1", features = ["sync"] } tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] } toml = "0.8" toml_edit = "0.22" tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "gzip", "prost", "router", "server", "tls-ring", "tls-native-roots", "zstd"] } tonic-reflection = { version = "0.13.1", features = ["server"] } tower = { version = "0.5.2", default-features = false } tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] } tower-otel = { version = "0.6", features = ["axum"] } tower-service = "0.3.3" tracing = "0.1" tracing-error = "0.2" tracing-log = "0.2" tracing-opentelemetry = "0.31" tracing-serde = "0.2.0" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } tracing-appender = "0.2.3" try-lock = "0.2.5" test-log = { version = "0.2.17", default-features = false, features = ["log"] } twox-hash = { version = "1.6.3", default-features = false } typed-json = "0.1" url = "2.2" urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" rustls-native-certs = "0.8" whoami = "1.5.1" json-structural-diff = { version = "0.2.0" } x509-cert = { version = "0.2.5" } zerocopy = { version = "0.8", features = ["derive", 
"simd"] } zeroize = "1.8" ## TODO replace this with tracing env_logger = "0.11" log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } ## Azure SDK crates azure_core = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] } azure_identity = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] } azure_storage = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] } azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] } ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } desim = { version = "0.1", path = "./libs/desim" } endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" } http-utils = { version = "0.1", path = "./libs/http-utils/" } metrics = { version = "0.1", path = "./libs/metrics/" } neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" } pageserver = { path = "./pageserver" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } pageserver_client = { path = "./pageserver/client" } pageserver_client_grpc = { path = "./pageserver/client_grpc" } pageserver_compaction = { version = "0.1", path = 
"./pageserver/compaction/" } pageserver_page_api = { path = "./pageserver/page_api" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } postgres_ffi_types = { version = "0.1", path = "./libs/postgres_ffi_types/" } postgres_versioninfo = { version = "0.1", path = "./libs/postgres_versioninfo/" } postgres_initdb = { path = "./libs/postgres_initdb" } posthog_client_lite = { version = "0.1", path = "./libs/posthog_client_lite" } pq_proto = { version = "0.1", path = "./libs/pq_proto/" } remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } safekeeper_client = { path = "./safekeeper/client" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy. storage_controller_client = { path = "./storage_controller/client" } tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" } tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } utils = { version = "0.1", path = "./libs/utils/" } vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" } wal_decoder = { version = "0.1", path = "./libs/wal_decoder" } walproposer = { version = "0.1", path = "./libs/walproposer/" } ## Common library dependency workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies cbindgen = "0.29.0" criterion = "0.5.1" rcgen = "0.13" rstest = "0.18" camino-tempfile = "1.0.2" tonic-build = "0.13.1" [patch.crates-io] # Needed to get `tokio-postgres-rustls` to depend on our fork. 
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } ################# Binary contents sections [profile.release] # This is useful for profiling and, to some extent, debug. # Besides, debug info should not affect the performance. # # NB: we also enable frame pointers for improved profiling, see .cargo/config.toml. debug = true # disable debug symbols for all packages except this one to decrease binaries size [profile.release.package."*"] debug = false [profile.release-line-debug] inherits = "release" debug = 1 # true = 2 = all symbols, 1 = line only [profile.release-line-debug-lto] inherits = "release" debug = 1 # true = 2 = all symbols, 1 = line only lto = true [profile.release-line-debug-size] inherits = "release" debug = 1 # true = 2 = all symbols, 1 = line only opt-level = "s" [profile.release-line-debug-zize] inherits = "release" debug = 1 # true = 2 = all symbols, 1 = line only opt-level = "z" [profile.release-line-debug-size-lto] inherits = "release" debug = 1 # true = 2 = all symbols, 1 = line only opt-level = "s" lto = true [profile.release-line-debug-zize-lto] inherits = "release" debug = 1 # true = 2 = all symbols, 1 = line only opt-level = "z" lto = true [profile.release-no-debug] inherits = "release" debug = false # true = 2 = all symbols, 1 = line only [profile.release-no-debug-size] inherits = "release" debug = false # true = 2 = all symbols, 1 = line only opt-level = "s" [profile.release-no-debug-zize] inherits = "release" debug = false # true = 2 = all symbols, 1 = line only opt-level = "z" [profile.release-no-debug-size-lto] inherits = "release" debug = false # true = 2 = all symbols, 1 = line only opt-level = "s" lto = true [profile.release-no-debug-zize-lto] inherits = "release" debug = false # true = 2 = all symbols, 1 = line only opt-level = "z" lto = true ================================================ FILE: Dockerfile ================================================ ### Creates a storage Docker 
image with postgres, pageserver, safekeeper and proxy binaries. ### The image itself is mainly used as a container for the binaries and for starting e2e tests with custom parameters. ### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used ### inside this image in real deployments. ARG REPOSITORY=ghcr.io/neondatabase ARG IMAGE=build-tools ARG TAG=pinned ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim # Here are the INDEX DIGESTS for the images we use. # For now, you can obtain them with the following steps: # 1. Get an authentication token from DockerHub: # TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token) # 2. Using that token, query the index for the given tag: # curl -s -H "Authorization: Bearer $TOKEN" \ # -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \ # "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \ # -I | grep -i docker-content-digest # 3. As a next step, TODO(fedordikarev): create a script and schedule a workflow to run these checks # and updates on a regular basis and in an automated way. ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 # Here we use the ${var/search/replace} syntax to check # whether the base image is one of the images we pin the image index for. # If var matches one of the known images, it is replaced with the known sha. # If there is no match, then the value is unaffected, and the build proceeds with a non-pinned image. ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR} ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA} ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA} # Naive way: # # 1. COPY . . # 2. make neon-pg-ext # 3. 
cargo build
#
# But to enable docker to cache intermediate layers, we perform a few preparatory steps:
#
# - Build all postgres versions, depending on just the contents of vendor/
# - Use cargo chef to build all rust dependencies

# 1. Build all postgres versions
#
# Only the vendored Postgres trees, the two makefiles and the install helper
# script are copied into this stage; nothing else from the build context is
# visible to it.
FROM $REPOSITORY/$IMAGE:$TAG AS pg-build
WORKDIR /home/nonroot

COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17
COPY --chown=nonroot Makefile Makefile
COPY --chown=nonroot postgres.mk postgres.mk
COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh

# BUILD_TYPE reaches make through the environment. `make` runs under `mold -run`,
# silently (`-s`) and with one job per CPU reported by `nproc`.
ENV BUILD_TYPE=release
RUN set -e \
    && mold -run make -j $(nproc) -s postgres

# 2. Prepare cargo-chef recipe
#
# When the SUBZERO_ACCESS_TOKEN build secret is mounted and non-empty (`[ -s ... ]`),
# git is configured to rewrite the subzero repository URL to one that carries the
# token, and `subzero-core` is added to the `proxy` package at a pinned revision
# before the recipe is generated. Without the secret only `cargo chef prepare` runs.
# NOTE(review): `git config --global` persists the token-bearing URL in this stage's
# git configuration; confirm that layers of this stage are never pushed or shared.
FROM $REPOSITORY/$IMAGE:$TAG AS plan
WORKDIR /home/nonroot

COPY --chown=nonroot . .

RUN --mount=type=secret,uid=1000,id=SUBZERO_ACCESS_TOKEN \
    set -e \
    && if [ -s /run/secrets/SUBZERO_ACCESS_TOKEN ]; then \
        export CARGO_NET_GIT_FETCH_WITH_CLI=true && \
        git config --global url."https://$(cat /run/secrets/SUBZERO_ACCESS_TOKEN)@github.com/neondatabase/subzero".insteadOf "https://github.com/neondatabase/subzero" && \
        cargo add -p proxy subzero-core --git https://github.com/neondatabase/subzero --rev 396264617e78e8be428682f87469bb25429af88a; \
    fi \
    && cargo chef prepare --recipe-path recipe.json

# Main build image
#
# ADDITIONAL_RUSTFLAGS is appended to RUSTFLAGS, and IO_ALIGNMENT selects an
# `io-align-*` cargo feature, in the RUN steps of this stage. CARGO_FEATURES is the
# base feature list those steps extend.
FROM $REPOSITORY/$IMAGE:$TAG AS build
WORKDIR /home/nonroot
ARG GIT_VERSION=local
ARG BUILD_TAG
ARG ADDITIONAL_RUSTFLAGS=""
ARG IO_ALIGNMENT=512
ENV CARGO_FEATURES="default"

# 3. Build cargo dependencies. Note that this step doesn't depend on anything else than
# `recipe.json`, so the layer can be reused as long as none of the dependencies change.
COPY --from=plan /home/nonroot/recipe.json recipe.json
# Pre-build ("cook") the dependencies listed in recipe.json. If the
# SUBZERO_ACCESS_TOKEN secret is mounted and non-empty, git is first configured to
# rewrite the subzero repository URL to one carrying the token so cargo can fetch it.
RUN --mount=type=secret,uid=1000,id=SUBZERO_ACCESS_TOKEN \
    set -e \
    && if [ -s /run/secrets/SUBZERO_ACCESS_TOKEN ]; then \
        export CARGO_NET_GIT_FETCH_WITH_CLI=true && \
        git config --global url."https://$(cat /run/secrets/SUBZERO_ACCESS_TOKEN)@github.com/neondatabase/subzero".insteadOf "https://github.com/neondatabase/subzero"; \
    fi \
    && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo chef cook --locked --release --recipe-path recipe.json

# Perform the main build. We reuse the Postgres build artifacts from the intermediate 'pg-build'
# layer, and the cargo dependencies built in the previous step.
COPY --chown=nonroot --from=pg-build /home/nonroot/pg_install/ pg_install
COPY --chown=nonroot . .
# proxy/Cargo.toml and Cargo.lock are copied from the 'plan' stage after the build
# context, so the 'plan' versions win (presumably because 'plan' may have edited them
# with `cargo add`).
COPY --chown=nonroot --from=plan /home/nonroot/proxy/Cargo.toml proxy/Cargo.toml
COPY --chown=nonroot --from=plan /home/nonroot/Cargo.lock Cargo.lock

# Feature selection for the main build:
#   - `rest_broker` is appended when the SUBZERO_ACCESS_TOKEN secret is non-empty;
#   - `io-align-4k` or `io-align-512` is appended according to IO_ALIGNMENT; any other
#     value appends no alignment feature.
# After the Rust binaries are built, the neon Postgres extensions are built with make.
RUN --mount=type=secret,uid=1000,id=SUBZERO_ACCESS_TOKEN \
    set -e \
    && if [ -s /run/secrets/SUBZERO_ACCESS_TOKEN ]; then \
        export CARGO_FEATURES="${CARGO_FEATURES},rest_broker"; \
    fi \
    && if [ "$IO_ALIGNMENT" = "4k" ]; then \
        export CARGO_FEATURES="${CARGO_FEATURES},io-align-4k"; \
    elif [ "$IO_ALIGNMENT" = "512" ]; then \
        export CARGO_FEATURES="${CARGO_FEATURES},io-align-512"; \
    fi \
    && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo auditable build \
      --features $CARGO_FEATURES \
      --bin pg_sni_router  \
      --bin pageserver \
      --bin pagectl \
      --bin safekeeper \
      --bin storage_broker \
      --bin storage_controller \
      --bin proxy \
      --bin endpoint_storage \
      --bin neon_local \
      --bin storage_scrubber \
      --locked --release \
    && mold -run make -j $(nproc) -s neon-pg-ext

# Assemble the final image
FROM $BASE_IMAGE_SHA
WORKDIR /data

# Install runtime packages and the AWS CLI, then create the `neon` user.
#   - `apt-get` is used instead of `apt`: `apt` is an interactive front end whose
#     command-line interface is not guaranteed to be stable for scripts.
#   - `curl --fail` makes an HTTP error abort the build at the download step instead
#     of saving the error page as awscliv2.zip and failing later in `unzip`.
#   - The temporary apt retry setting and the package lists are removed again in the
#     same layer.
RUN set -e \
    && echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \
    && apt-get update \
    && apt-get install -y \
        libreadline-dev \
        libseccomp-dev \
        ca-certificates \
        openssl \
        unzip \
        curl \
    && ARCH=$(uname -m) \
    && if [ "$ARCH" = "x86_64" ]; then \
        curl --fail "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"; \
    elif [ "$ARCH" = "aarch64" ]; then \
        curl --fail "https://awscli.amazonaws.com/awscli-exe-linux-aarch64.zip" -o "awscliv2.zip"; \
    else \
        echo "Unsupported architecture: $ARCH" && exit 1; \
    fi \
    && unzip awscliv2.zip \
    && ./aws/install \
    && rm -rf aws awscliv2.zip \
    && rm -f /etc/apt/apt.conf.d/80-retries \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
    && useradd -d /data neon \
    && chown -R neon:neon /data

# Binaries produced by the 'build' stage, owned by the `neon` user.
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router       /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller  /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/endpoint_storage    /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber    /usr/local/bin

# Per-version Postgres installations produced by the 'build' stage.
COPY --from=build /home/nonroot/pg_install/v14 /usr/local/v14/
COPY --from=build /home/nonroot/pg_install/v15 /usr/local/v15/
COPY --from=build /home/nonroot/pg_install/v16 /usr/local/v16/
COPY --from=build /home/nonroot/pg_install/v17 /usr/local/v17/

# Deprecated: Old deployment scripts use this tarball which contains all the Postgres binaries.
# That's obsolete, since all the same files are also present under /usr/local/v*. But to keep the # old scripts working for now, create the tarball. RUN tar -C /usr/local -cvzf /data/postgres_install.tar.gz v14 v15 v16 v17 # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. # Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values. RUN mkdir -p /data/.neon/ && \ echo "id=1234" > "/data/.neon/identity.toml" && \ echo "broker_endpoint='http://storage_broker:50051'\n" \ "pg_distrib_dir='/usr/local/'\n" \ "listen_pg_addr='0.0.0.0:6400'\n" \ "listen_http_addr='0.0.0.0:9898'\n" \ "availability_zone='local'\n" \ > /data/.neon/pageserver.toml && \ chown -R neon:neon /data/.neon VOLUME ["/data"] USER neon EXPOSE 6400 EXPOSE 9898 CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"] ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: Makefile ================================================ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Where to install Postgres, default is ./pg_install, maybe useful for package # managers. POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install # Supported PostgreSQL versions POSTGRES_VERSIONS = v17 v16 v15 v14 # CARGO_BUILD_FLAGS: Extra flags to pass to `cargo build`. `--locked` # and `--features testing` are popular examples. # # CARGO_PROFILE: Set to override the cargo profile to use. By default, # it is derived from BUILD_TYPE. # All intermediate build artifacts are stored here. BUILD_DIR := $(ROOT_PROJECT_DIR)/build ICU_PREFIX_DIR := /usr/local/icu # # We differentiate between release / debug build types using the BUILD_TYPE # environment variable. # BUILD_TYPE ?= debug WITH_SANITIZERS ?= no PG_CFLAGS = -fsigned-char ifeq ($(BUILD_TYPE),release) PG_CONFIGURE_OPTS = --enable-debug --with-openssl PG_CFLAGS += -O2 -g3 $(CFLAGS) PG_LDFLAGS = $(LDFLAGS) CARGO_PROFILE ?= --profile=release # NEON_CARGO_ARTIFACT_TARGET_DIR is the directory where `cargo build` places # the final build artifacts. There is unfortunately no easy way of changing # it to a fully predictable path, nor to extract the path with a simple # command. See https://github.com/rust-lang/cargo/issues/9661 and # https://github.com/rust-lang/cargo/issues/6790. 
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
else ifeq ($(BUILD_TYPE),debug)
    # Debug builds: assertions and dependency tracking on, optimisation off.
    PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
    PG_CFLAGS += -O0 -g3 $(CFLAGS)
    PG_LDFLAGS = $(LDFLAGS)
    CARGO_PROFILE ?= --profile=dev
    NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
else
    # Any BUILD_TYPE other than 'release' or 'debug' aborts makefile parsing.
    $(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
endif

# Sanitizer builds (WITH_SANITIZERS=yes): compile Postgres with ASan and UBSan,
# link the sanitizer runtimes statically, force gcc and turn leak detection off.
ifeq ($(WITH_SANITIZERS),yes)
    PG_CFLAGS += -fsanitize=address -fsanitize=undefined -fno-sanitize-recover
    COPT += -Wno-error # to avoid failing on warnings induced by sanitizers
    PG_LDFLAGS = -fsanitize=address -fsanitize=undefined -static-libasan -static-libubsan $(LDFLAGS)
    export CC := gcc
    export ASAN_OPTIONS := detect_leaks=0
endif

# The marker file /home/nonroot/.docker_build is tested at parse time; ICU options
# are added only when it exists.
ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes)
    # Exclude static build openssl, icu for local build (MacOS, Linux)
    # Only keep for build type release and debug
    PG_CONFIGURE_OPTS += --with-icu
    # ICU_PREFIX_DIR is already an absolute path, so it is used directly after -I
    # (an extra '/' here would produce an include path starting with '//').
    PG_CONFIGURE_OPTS += ICU_CFLAGS='-I$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION'
    PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm'
endif

# Platform-specific configuration, keyed on the kernel name reported by `uname -s`.
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
    # Seccomp BPF is only available for Linux
    # (and is left out of sanitizer builds)
    ifneq ($(WITH_SANITIZERS),yes)
        PG_CONFIGURE_OPTS += --with-libseccomp
    endif
else ifeq ($(UNAME_S),Darwin)
    PG_CFLAGS += -DUSE_PREFETCH
    ifndef DISABLE_HOMEBREW
        # macOS with brew-installed openssl requires explicit paths
        # It can be configured with OPENSSL_PREFIX variable
        OPENSSL_PREFIX := $(shell brew --prefix openssl@3)
        PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
        PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
        # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
        # brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/: endif endif # Use -C option so that when PostgreSQL "make install" installs the # headers, the mtime of the headers are not changed when there have # been no changes to the files. Changing the mtime triggers an # unnecessary rebuild of 'postgres_ffi'. PG_CONFIGURE_OPTS += INSTALL='$(ROOT_PROJECT_DIR)/scripts/ninstall.sh -C' # Choose whether we should be silent or verbose CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose) # Fix for a corner case when make doesn't pass a jobserver CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS)) # This option has a side effect of passing make jobserver to cargo. # However, we shouldn't do this if `make -n` (--dry-run) has been asked. CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) # Force cargo not to print progress bar CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55" # # Top level Makefile to build Neon and PostgreSQL # .PHONY: all all: neon postgres-install neon-pg-ext ### Neon Rust bits # # The 'postgres_ffi' crate depends on the Postgres headers. 
# Build the Rust workspace with `cargo build`, passing CARGO_BUILD_FLAGS and
# CARGO_PROFILE. CARGO_CMD_PREFIX is prepended to the cargo command line.
.PHONY: neon
neon: postgres-headers-install walproposer-lib cargo-target-dir
	+@echo "Compiling Neon"
	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE)

# Make sure target/ exists and contains a CACHEDIR.TAG file; the tag is
# written only when it is not already present.
.PHONY: cargo-target-dir
cargo-target-dir:
	# https://github.com/rust-lang/cargo/issues/14281
	mkdir -p target
	test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG

# Build and install the extensions driven by pgxn/Makefile for one Postgres
# version. `$*` is the pattern stem (e.g. v17); the sub-make runs inside
# $(BUILD_DIR)/pgxn-$* against that version's pg_config.
.PHONY: neon-pg-ext-%
neon-pg-ext-%: postgres-install-% cargo-target-dir
	+@echo "Compiling neon-specific Postgres extensions for $*"
	mkdir -p $(BUILD_DIR)/pgxn-$*
	$(MAKE) PG_CONFIG="$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config" COPT='$(COPT)' \
		NEON_CARGO_ARTIFACT_TARGET_DIR="$(NEON_CARGO_ARTIFACT_TARGET_DIR)" \
		CARGO_BUILD_FLAGS="$(CARGO_BUILD_FLAGS)" \
		CARGO_PROFILE="$(CARGO_PROFILE)" \
		-C $(BUILD_DIR)/pgxn-$*\
		-f $(ROOT_PROJECT_DIR)/pgxn/Makefile install

# Build walproposer as a static library. walproposer source code is located
# in the pgxn/neon directory.
#
# We also need to include libpgport.a and libpgcommon.a, because walproposer
# uses some functions from those libraries.
#
# Some object files are removed from libpgport.a and libpgcommon.a because
# they depend on openssl and other libraries that are not included in our
# Rust build.
.PHONY: walproposer-lib
walproposer-lib: neon-pg-ext-v17
	+@echo "Compiling walproposer-lib"
	mkdir -p $(BUILD_DIR)/walproposer-lib
	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config COPT='$(COPT)' \
		-C $(BUILD_DIR)/walproposer-lib \
		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgport.a $(BUILD_DIR)/walproposer-lib
	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgcommon.a $(BUILD_DIR)/walproposer-lib
	$(AR) d $(BUILD_DIR)/walproposer-lib/libpgport.a \
		pg_strong_random.o
	$(AR) d $(BUILD_DIR)/walproposer-lib/libpgcommon.a \
		checksum_helper.o \
		cryptohash_openssl.o \
		hmac_openssl.o \
		md5_common.o \
		parse_manifest.o \
		scram-common.o
ifeq ($(UNAME_S),Linux)
	$(AR) d $(BUILD_DIR)/walproposer-lib/libpgcommon.a \
		pg_crc32c.o
endif

# Shorthand to call neon-pg-ext-% target for all Postgres versions
.PHONY: neon-pg-ext
neon-pg-ext: $(foreach pg_version,$(POSTGRES_VERSIONS),neon-pg-ext-$(pg_version))

# This removes everything
.PHONY: distclean
distclean:
	$(RM) -r $(POSTGRES_INSTALL_DIR) $(BUILD_DIR)
	$(CARGO_CMD_PREFIX) cargo clean

# Run the repository's pre-commit script with --fix-inplace.
.PHONY: fmt
fmt:
	./pre-commit.py --fix-inplace

# Build pg_bsd_indent inside the build tree of the given Postgres version.
postgres-%-pg-bsd-indent: postgres-%
	+@echo "Compiling pg_bsd_indent"
	$(MAKE) -C $(BUILD_DIR)/$*/src/tools/pg_bsd_indent/

# Create typedef list for the core. Note that generally it should be combined with
# buildfarm one to cover platform specific stuff.
# https://wiki.postgresql.org/wiki/Running_pgindent_on_non-core_code_or_development_code
postgres-%-typedefs.list: postgres-%
	$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/find_typedef $(POSTGRES_INSTALL_DIR)/$*/bin > $@

# Indent postgres. See src/tools/pgindent/README for details.
.PHONY: postgres-%-pgindent
postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
	+@echo merge with buildfarm typedef to cover all platforms
	+@echo note: I first tried to download from pgbuildfarm.org, but for unclear reason e.g. \
		REL_16_STABLE list misses PGSemaphoreData
	# wget -q -O - "http://www.pgbuildfarm.org/cgi-bin/typedefs.pl?branch=REL_16_STABLE" |\
	# cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
	cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\
		cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
	+@echo note: you might want to run it on selected files/dirs instead.
	INDENT=$(BUILD_DIR)/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
		--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
	$(RM) pg*.BAK

# Indent pgxn/neon.
.PHONY: neon-pgindent
neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config COPT='$(COPT)' \
		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/find_typedef \
		INDENT=$(BUILD_DIR)/v17/src/tools/pg_bsd_indent/pg_bsd_indent \
		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/pgindent/pgindent \
		-C $(BUILD_DIR)/pgxn-v17/neon \
		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent

# Symlink pre-commit.py into .git/hooks (the relative hook path means this
# must be run from the repository root).
.PHONY: setup-pre-commit-hook
setup-pre-commit-hook:
	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit

# Install the node tooling; `npm ci` is used when CI is set, `npm install`
# otherwise. The directory is touched so its mtime is newer than package.json.
build-tools/node_modules: build-tools/package.json
	cd build-tools && $(if $(CI),npm ci,npm install)
	touch build-tools/node_modules

.PHONY: lint-openapi-spec
lint-openapi-spec: build-tools/node_modules
	# operation-2xx-response: pageserver timeline delete returns 404 on success
	find . -iname "openapi_spec.y*ml" -exec\
		npx --prefix=build-tools/ redocly\
		--skip-rule=operation-operationId --skip-rule=operation-summary --extends=minimal\
		--skip-rule=no-server-example.com --skip-rule=operation-2xx-response\
		lint {} \+

# Targets for building PostgreSQL are defined in postgres.mk.
#
# But if the caller has indicated that PostgreSQL is already
# installed, by setting the PG_INSTALL_CACHED variable, skip it.
ifdef PG_INSTALL_CACHED
# Stub targets: every postgres-install* target just depends on skip-install,
# and the headers target only prints a message.
postgres-install: skip-install
$(foreach pg_version,$(POSTGRES_VERSIONS),postgres-install-$(pg_version)): skip-install
postgres-headers-install:
	+@echo "Skipping installation of PostgreSQL headers because PG_INSTALL_CACHED is set"
skip-install:
	+@echo "Skipping PostgreSQL installation because PG_INSTALL_CACHED is set"

else
include postgres.mk
endif
================================================ FILE: NOTICE ================================================ Neon Copyright 2022 - 2024 Neon Inc. The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license. See vendor/postgres-vX/COPYRIGHT for details. ================================================ FILE: README.md ================================================ [![Neon](https://github.com/user-attachments/assets/fd91da5f-44a9-41c7-9075-36a5b5608083)](https://neon.com) # Neon Neon is an open-source serverless Postgres database platform. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes. ## Quick start Try the [Neon Free Tier](https://neon.com/signup) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.com/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.com/docs/connect/connect-from-any-app/) for connection instructions. Alternatively, compile and run the project [locally](#running-local-installation). ## Architecture overview A Neon installation consists of compute nodes and the Neon storage engine. Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine. The Neon storage engine consists of two major components: - Pageserver: Scalable storage backend for the compute nodes.
- Safekeepers: The safekeepers form a redundant WAL service that receives WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage. See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information. ## Running a local development environment Neon can be run on a workstation for small experiments and to test code changes, by following these instructions. #### Installing dependencies on Linux 1. Install build dependencies and other applicable packages * On Ubuntu or Debian, this set of packages should be sufficient to build the code: ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \ libprotobuf-dev libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev ``` * On Fedora, these packages are needed: ```bash dnf install flex bison readline-devel zlib-devel openssl-devel \ libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \ protobuf-devel libcurl-devel openssl poetry lsof libicu-devel libpq-devel python3-devel \ libffi-devel ``` * On Arch based systems, these packages are needed: ```bash pacman -S base-devel readline zlib libseccomp openssl clang \ postgresql-libs cmake postgresql protobuf curl lsof ``` Building Neon requires version 3.15 or newer of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases). 2. [Install Rust](https://www.rust-lang.org/tools/install) ``` # recommended approach from https://www.rust-lang.org/tools/install curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh ``` #### Installing dependencies on macOS (12.3.1) 1.
Install XCode and dependencies ``` xcode-select --install brew install protobuf openssl flex bison icu4c pkg-config m4 # add openssl to PATH, required for ed25519 keys generation in neon_local echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc ``` If you get errors about missing `m4` you may have to install it manually: ``` brew install m4 brew link --force m4 ``` 2. [Install Rust](https://www.rust-lang.org/tools/install) ``` # recommended approach from https://www.rust-lang.org/tools/install curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh ``` 3. Install PostgreSQL Client ``` # from https://stackoverflow.com/questions/44654216/correct-way-to-install-psql-without-full-postgres-on-macos brew install libpq brew link --force libpq ``` #### Rustc version The project uses [rust toolchain file](./rust-toolchain.toml) to define the version it's built with in CI for testing and local builds. This file is automatically picked up by [`rustup`](https://rust-lang.github.io/rustup/overrides.html#the-toolchain-file) that installs (if absent) and uses the toolchain version pinned in the file. rustup users who want to build with another toolchain can use the [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory. non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify that their toolchain matches the version in the file. Newer rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates. #### Building on Linux 1. Build neon and patched postgres ``` # Note: The path to the neon sources can not contain a space. git clone --recursive https://github.com/neondatabase/neon.git cd neon # The preferred and default is to make a debug build. 
This will create a # demonstrably slower build than a release build. For a release build, # use "BUILD_TYPE=release make -j`nproc` -s" # Remove -s for the verbose build log make -j`nproc` -s ``` #### Building on OSX 1. Build neon and patched postgres ``` # Note: The path to the neon sources can not contain a space. git clone --recursive https://github.com/neondatabase/neon.git cd neon # The preferred and default is to make a debug build. This will create a # demonstrably slower build than a release build. For a release build, # use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu` -s" # Remove -s for the verbose build log make -j`sysctl -n hw.logicalcpu` -s ``` #### Dependency installation notes To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively. To run the integration tests or Python scripts (not required to use the code), install Python (3.11 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory. #### Running neon database 1. Start pageserver and postgres on top of it (should be called from repo root): ```sh # Create repository in .neon with proper paths to binaries and data # Later that would be responsibility of a package install script > cargo neon init Initializing pageserver node 1 at '127.0.0.1:64000' in ".neon" # start pageserver, safekeeper, and broker for their intercommunication > cargo neon start Starting neon broker at 127.0.0.1:50051. storage_broker started, pid: 2918372 Starting pageserver node 1 at '127.0.0.1:64000' in ".neon". pageserver started, pid: 2918386 Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'. 
safekeeper 1 started, pid: 2918437 # create initial tenant and use it as a default for every future neon_local invocation > cargo neon tenant create --set-default tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one # create postgres compute node > cargo neon endpoint create main # start postgres compute node > cargo neon endpoint start main Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ... Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres' # check list of running postgres instances > cargo neon endpoint list ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running ``` 2. Now, it is possible to connect to postgres and run some queries: ```text > psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# CREATE TABLE t(key int primary key, value text); CREATE TABLE postgres=# insert into t values(1,1); INSERT 0 1 postgres=# select * from t; key | value -----+------- 1 | 1 (1 row) ``` 3. And create branches and run postgres on them: ```sh # create branch named migration_check > cargo neon timeline branch --branch-name migration_check Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main' # check branches tree > cargo neon timeline list (L) main [de200bd42b49cc1814412c7e592dd6e9] (L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601] # create postgres on that branch > cargo neon endpoint create migration_check --branch-name migration_check # start postgres on that branch > cargo neon endpoint start migration_check Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ... 
Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres' # check the new list of running postgres instances > cargo neon endpoint list ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running migration_check 127.0.0.1:55434 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running # this new postgres instance will have all the data from 'main' postgres, # but all modifications would not affect data in original postgres > psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- 1 | 1 (1 row) postgres=# insert into t values(2,2); INSERT 0 1 # check that the new change doesn't affect the 'main' postgres > psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- 1 | 1 (1 row) ``` 4. If you want to run tests afterwards (see below), you must stop all the running pageserver, safekeeper, and postgres instances you have just started. You can terminate them all with one command: ```sh > cargo neon stop ``` More advanced usages can be found at [Local Development Control Plane (`neon_local`)](./control_plane/README.md). #### Handling build failures If you encounter errors while setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again. ## Running tests ### Rust unit tests We are using [`cargo-nextest`](https://nexte.st/) to run the tests in GitHub Workflows. Some crates do not support running plain `cargo test` anymore; prefer `cargo nextest run` instead. You can install `cargo-nextest` with `cargo install cargo-nextest`. ### Integration tests Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
```sh git clone --recursive https://github.com/neondatabase/neon.git CARGO_BUILD_FLAGS="--features=testing" make ./scripts/pytest ``` By default, this runs both debug and release modes, and all supported postgres versions. When testing locally, it is convenient to run just one set of permutations, like this: ```sh DEFAULT_PG_VERSION=17 BUILD_TYPE=release ./scripts/pytest ``` ## Flamegraphs You may find yourself in need of flamegraphs for software in this repository. You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or the original [`flamegraph.pl`](https://github.com/brendangregg/FlameGraph). Your choice! >[!IMPORTANT] > If you're using `lld` or `mold`, you need the `--no-rosegment` linker argument. > It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository. > See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764). ## Cleanup For cleaning up the source tree from build artifacts, run `make clean` in the source directory. For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directory will remove your database, with all data in it. You have been warned! ## Documentation [docs](/docs) Contains a top-level overview of all available markdown documentation. - [sourcetree.md](/docs/sourcetree.md) contains overview of source tree layout. To view your `rustdoc` documentation in a browser, try running `cargo doc --no-deps --open` See also README files in some source directories, and `rustdoc` style documentation comments. 
Other resources: - [SELECT 'Hello, World'](https://neon.com/blog/hello-world/): Blog post by Nikita Shamgunov on the high level architecture - [Architecture decisions in Neon](https://neon.com/blog/architecture-decisions-in-neon/): Blog post by Heikki Linnakangas - [Neon: Serverless PostgreSQL!](https://www.youtube.com/watch?v=rES0yzeERns): Presentation on storage system by Heikki Linnakangas in the CMU Database Group seminar series ### Postgres-specific terms Due to Neon's very close relation with PostgreSQL internals, numerous specific terms are used. The same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, while MiB would be technically more correct, it's inconsistent with what PostgreSQL code and its documentation use. To get more familiar with this aspect, refer to: - [Neon glossary](/docs/glossary.md) - [PostgreSQL glossary](https://www.postgresql.org/docs/14/glossary.html) - Other PostgreSQL documentation and sources (Neon fork sources can be found [here](https://github.com/neondatabase/postgres)) ## Join the development - Read [CONTRIBUTING.md](/CONTRIBUTING.md) to learn about project code style and practices. - To get familiar with a source tree layout, use [sourcetree.md](/docs/sourcetree.md). - To learn more about PostgreSQL internals, check http://www.interdb.jp/pg/index.html ================================================ FILE: build-tools/Dockerfile ================================================
ARG DEBIAN_VERSION=bookworm
ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim

# Here are the INDEX DIGESTS for the images we use.
# For now, you can get them with the following steps:
# 1. Get an authentication token from DockerHub:
#    TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token)
# 2. Using that token, query index for the given tag:
#    curl -s -H "Authorization: Bearer $TOKEN" \
#       -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \
#       "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \
#       -I | grep -i docker-content-digest
# 3. As a next step, TODO(fedordikarev): create a script and schedule a workflow to run these checks
#    and updates on a regular basis and in an automated way.
ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7
ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1

# Here we use the ${var/search/replace} syntax to check
# whether the base image is one of the images we pin an image index for.
# If var matches one of the known images, it is replaced with the known sha.
# If there is no match, the value is left unchanged and the build proceeds with a non-pinned image.
ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA}

# First stage: builds pgcopydb (bookworm only; other versions get placeholder files).
FROM $BASE_IMAGE_SHA AS pgcopydb_builder
ARG DEBIAN_VERSION

# Use strict mode for bash to catch errors early
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

# By default, /bin/sh used in debian images will treat '\n' as eol,
# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that.
# Configure retries/timeouts for apt, wget and curl for the rest of this stage.
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
    echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
    echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc

COPY build-tools/patches/pgcopydbv017.patch /pgcopydbv017.patch

# On bookworm: install the PGDG apt repository, build the patched pgcopydb v0.17
# and stash libpq.so.5 next to it. On any other Debian version: create empty
# placeholder files so that the COPY --from instructions in the next stage succeed.
RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
        set -e && \
        apt-get update && \
        apt-get install -y --no-install-recommends \
        ca-certificates wget gpg && \
        wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor -o /usr/share/keyrings/postgresql-keyring.gpg && \
        echo "deb [signed-by=/usr/share/keyrings/postgresql-keyring.gpg] http://apt.postgresql.org/pub/repos/apt bookworm-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \
        apt-get update && \
        apt-get install -y --no-install-recommends \
        build-essential \
        autotools-dev \
        libedit-dev \
        libgc-dev \
        libpam0g-dev \
        libreadline-dev \
        libselinux1-dev \
        libxslt1-dev \
        libssl-dev \
        libkrb5-dev \
        zlib1g-dev \
        liblz4-dev \
        libpq5 \
        libpq-dev \
        libzstd-dev \
        postgresql-16 \
        postgresql-server-dev-16 \
        postgresql-common \
        python3-sphinx && \
        wget -O /tmp/pgcopydb.tar.gz https://github.com/dimitri/pgcopydb/archive/refs/tags/v0.17.tar.gz && \
        mkdir /tmp/pgcopydb && \
        tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \
        cd /tmp/pgcopydb && \
        patch -p1 < /pgcopydbv017.patch && \
        make -s clean && \
        make -s -j12 install && \
        libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \
        mkdir -p /pgcopydb/lib && \
        cp "$libpq_path" /pgcopydb/lib/; \
    else \
        # copy command below will fail if we don't have dummy files, so we create them for other debian versions
        mkdir -p /usr/lib/postgresql/16/bin && touch /usr/lib/postgresql/16/bin/pgcopydb && \
        mkdir -p /pgcopydb/lib && touch /pgcopydb/lib/libpq.so.5; \
    fi

FROM $BASE_IMAGE_SHA AS build_tools
ARG DEBIAN_VERSION

# Add nonroot user
RUN useradd -ms /bin/bash nonroot -b /home

# Use strict mode for bash to catch errors early
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

RUN mkdir -p /pgcopydb/{bin,lib} && \
    chmod -R 755 /pgcopydb && \
    chown -R nonroot:nonroot /pgcopydb

COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb
COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5

RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
    echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
    echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc

# System deps
#
# 'gdb' is included so that we get backtraces of core dumps produced in
# regression tests
RUN set -e \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
    autoconf \
    automake \
    bison \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    flex \
    gdb \
    git \
    gnupg \
    gzip \
    jq \
    jsonnet \
    libcurl4-openssl-dev \
    libbz2-dev \
    libffi-dev \
    liblzma-dev \
    libncurses5-dev \
    libncursesw5-dev \
    libreadline-dev \
    libseccomp-dev \
    libsqlite3-dev \
    libssl-dev \
    $([[ "${DEBIAN_VERSION}" = "bullseye" ]] && echo libstdc++-10-dev || echo libstdc++-11-dev) \
    libtool \
    libxml2-dev \
    libxmlsec1-dev \
    libxxhash-dev \
    lsof \
    make \
    netcat-openbsd \
    net-tools \
    openssh-client \
    parallel \
    pkg-config \
    unzip \
    wget \
    xz-utils \
    zlib1g-dev \
    zstd \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# sql_exporter
# Keep the version the same as in compute/compute-node.Dockerfile and
# test_runner/regress/test_compute_metrics.py.
ENV SQL_EXPORTER_VERSION=0.17.3
# The `case` maps `uname -m` output to the architecture suffix used in the release file name.
RUN curl -fsSL \
    "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
    --output sql_exporter.tar.gz \
    && mkdir /tmp/sql_exporter \
    && tar xzvf sql_exporter.tar.gz -C /tmp/sql_exporter --strip-components=1 \
    && mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter \
    && rm sql_exporter.tar.gz

# protobuf-compiler (protoc)
# Keep the version the same as in compute/compute-node.Dockerfile
ENV PROTOC_VERSION=25.1
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
    && unzip -q protoc.zip -d protoc \
    && mv protoc/bin/protoc /usr/local/bin/protoc \
    && mv protoc/include/google /usr/local/include/google \
    && rm -rf protoc.zip protoc

# s5cmd
ENV S5CMD_VERSION=2.3.0
RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
    && chmod +x s5cmd \
    && mv s5cmd /usr/local/bin/s5cmd

# LLVM
ENV LLVM_VERSION=20
# After installing, symlink every versioned clang*/llvm* binary to its unversioned name.
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
    && echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
    && apt-get update \
    && apt-get install -y --no-install-recommends clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
    && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Install node
ENV NODE_VERSION=24
RUN curl -fsSL https://deb.nodesource.com/setup_${NODE_VERSION}.x | bash - \
    && apt-get install -y --no-install-recommends nodejs \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Install docker
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
    && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION} stable" > /etc/apt/sources.list.d/docker.list \
    && apt-get update \
    && apt-get install -y --no-install-recommends docker-ce docker-ce-cli \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Configure sudo & docker
RUN usermod -aG sudo nonroot && \
    echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \
    usermod -aG docker nonroot

# AWS CLI
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
    && unzip -q awscliv2.zip \
    && ./aws/install \
    && rm awscliv2.zip

# Mold: A Modern Linker
ENV MOLD_VERSION=v2.37.1
# The build runs in mold/build, so step up two levels before deleting the
# checkout; a single `cd ..` would leave us inside mold/ and `rm -rf mold`
# would silently remove nothing.
RUN set -e \
    && git clone -b "${MOLD_VERSION}" --depth 1 https://github.com/rui314/mold.git \
    && mkdir mold/build \
    && cd mold/build \
    && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \
    && cmake --build . -j "$(nproc)" \
    && cmake --install . \
    && cd ../.. \
    && rm -rf mold

# LCOV
# Build lcov from a fork:
# It includes several bug fixes on top of v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
# And patches from us:
# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz)
# pipefail is switched off around the `yes | perl` pipelines and restored afterwards.
RUN set +o pipefail && \
    for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do \
        yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')";\
    done && \
    set -o pipefail
# Split into separate step to debug flaky failures here
RUN wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
    && ls -laht lcov.tar.gz && sha256sum lcov.tar.gz \
    && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \
    && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
    && cd lcov \
    && make install \
    && rm -rf ../lcov.tar.gz

# Use the same version of libicu as the compute nodes so that
# clusters created using initdb on pageserver can be used by computes.
#
# TODO: at this time, compute-node.Dockerfile uses the debian bullseye libicu
# package, which is 67.1. We're duplicating that knowledge here, and also, technically,
# Debian has a few patches on top of 67.1 that we're not adding here.
ENV ICU_VERSION=67.1
ENV ICU_PREFIX=/usr/local/icu

# Download and build static ICU
# ${ICU_VERSION//./-} and ${ICU_VERSION//./_} rewrite the dots in the version
# to match the release tag and the tarball name respectively.
RUN wget -O "/tmp/libicu-${ICU_VERSION}.tgz" https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION//./-}/icu4c-${ICU_VERSION//./_}-src.tgz && \
    echo "94a80cd6f251a53bd2a997f6f1b5ac6653fe791dfab66e1eb0227740fb86d5dc /tmp/libicu-${ICU_VERSION}.tgz" | sha256sum --check && \
    mkdir /tmp/icu && \
    pushd /tmp/icu && \
    tar -xzf /tmp/libicu-${ICU_VERSION}.tgz && \
    pushd icu/source && \
    ./configure --prefix=${ICU_PREFIX} --enable-static --enable-shared=no CXXFLAGS="-fPIC" CFLAGS="-fPIC" && \
    make -j "$(nproc)" && \
    make install && \
    popd && \
    rm -rf icu && \
    rm -f /tmp/libicu-${ICU_VERSION}.tgz

# Switch to nonroot user
USER nonroot:nonroot
WORKDIR /home/nonroot

# Same curl retry settings as written to /root/.curlrc earlier, now for the nonroot user.
RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc

# Python
ENV PYTHON_VERSION=3.11.12 \
    PYENV_ROOT=/home/nonroot/.pyenv \
    PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
RUN set -e \
    && cd "$HOME" \
    && curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \
    && chmod +x pyenv-installer \
    && ./pyenv-installer \
    && export PYENV_ROOT=/home/nonroot/.pyenv \
    && export PATH="$PYENV_ROOT/bin:$PATH" \
    && export PATH="$PYENV_ROOT/shims:$PATH" \
    && pyenv install "${PYTHON_VERSION}" \
    && pyenv global "${PYTHON_VERSION}" \
    && python --version \
    && pip install --no-cache-dir --upgrade pip \
    && pip --version \
    && pip install --no-cache-dir pipenv wheel poetry

# Switch to nonroot user (again)
USER nonroot:nonroot
WORKDIR /home/nonroot

# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.88.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
ARG CARGO_AUDITABLE_VERSION=0.7.0
ARG RUSTFILT_VERSION=0.2.1
ARG CARGO_HAKARI_VERSION=0.9.36
ARG CARGO_DENY_VERSION=0.18.2
ARG CARGO_HACK_VERSION=0.6.36
ARG CARGO_NEXTEST_VERSION=0.9.94
ARG CARGO_CHEF_VERSION=0.1.71
ARG CARGO_DIESEL_CLI_VERSION=2.2.9
# cargo-auditable is installed with plain `cargo install` first, then re-installed
# with --force through `cargo auditable`, which is also used for the remaining tools.
# The cargo registry and git caches are removed at the end.
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
    chmod +x rustup-init && \
    ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
    rm rustup-init && \
    export PATH="$HOME/.cargo/bin:$PATH" && \
    . "$HOME/.cargo/env" && \
    cargo --version && rustup --version && \
    rustup component add llvm-tools rustfmt clippy && \
    cargo install cargo-auditable --locked --version "${CARGO_AUDITABLE_VERSION}" && \
    cargo auditable install cargo-auditable --locked --version "${CARGO_AUDITABLE_VERSION}" --force && \
    cargo auditable install rustfilt --version "${RUSTFILT_VERSION}" && \
    cargo auditable install cargo-hakari --locked --version "${CARGO_HAKARI_VERSION}" && \
    cargo auditable install cargo-deny --locked --version "${CARGO_DENY_VERSION}" && \
    cargo auditable install cargo-hack --locked --version "${CARGO_HACK_VERSION}" && \
    cargo auditable install cargo-nextest --locked --version "${CARGO_NEXTEST_VERSION}" && \
    cargo auditable install cargo-chef --locked --version "${CARGO_CHEF_VERSION}" && \
    cargo auditable install diesel_cli --locked --version "${CARGO_DIESEL_CLI_VERSION}" \
        --features postgres-bundled --no-default-features && \
    rm -rf /home/nonroot/.cargo/registry && \
    rm -rf /home/nonroot/.cargo/git

# Show versions
RUN whoami \
    && python --version \
    && pip --version \
    && cargo --version --verbose \
    && rustup --version --verbose \
    && rustc --version --verbose \
    && clang --version

# pgcopydb is only built for bookworm (other versions carry empty placeholder
# files), so only run it there.
RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
    LD_LIBRARY_PATH=/pgcopydb/lib /pgcopydb/bin/pgcopydb --version; \
else \
    echo "pgcopydb is not available for ${DEBIAN_VERSION}"; \
fi

# Set the following flag so the Makefile can check whether it's running in Docker
RUN touch /home/nonroot/.docker_build
================================================ FILE: build-tools/package.json ================================================
{ "name": "build-tools", "private": true, "devDependencies": { "@redocly/cli": "1.34.5", "@sourcemeta/jsonschema": "10.0.0" } } ================================================ FILE: build-tools/patches/pgcopydbv017.patch ================================================ diff --git a/src/bin/pgcopydb/copydb.c b/src/bin/pgcopydb/copydb.c index d730b03..69a9be9 100644 --- a/src/bin/pgcopydb/copydb.c +++ b/src/bin/pgcopydb/copydb.c @@ -44,6 +44,7 @@ GUC dstSettings[] = { { "synchronous_commit", "'off'" }, { "statement_timeout", "0" }, { "lock_timeout", "0" }, + { "idle_in_transaction_session_timeout", "0" }, { NULL, NULL }, }; diff --git a/src/bin/pgcopydb/pgsql.c b/src/bin/pgcopydb/pgsql.c index 94f2f46..e051ba8 100644 --- a/src/bin/pgcopydb/pgsql.c +++ b/src/bin/pgcopydb/pgsql.c @@ -2319,6 +2319,11 @@ pgsql_execute_log_error(PGSQL *pgsql, LinesBuffer lbuf = { 0 }; + if (message != NULL){ + // make sure message is writable by splitLines + message = strdup(message); + } + if (!splitLines(&lbuf, message)) { /* errors have already been logged */ @@ -2332,6 +2337,7 @@ pgsql_execute_log_error(PGSQL *pgsql, PQbackendPID(pgsql->connection), lbuf.lines[lineNumber]); } + free(message); // free copy of message we created above if (pgsql->logSQL) { @@ -3174,11 +3180,18 @@ pgcopy_log_error(PGSQL *pgsql, PGresult *res, const char *context) /* errors have already been logged */ return; } - if (res != NULL) { char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE); - strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate)); + if (sqlstate == NULL) + { + // PQresultErrorField returned NULL! 
+ pgsql->sqlstate[0] = '\0'; // Set to an empty string to avoid segfault + } + else + { + strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate)); + } } char *endpoint = ================================================ FILE: clippy.toml ================================================ disallowed-methods = [ "tokio::task::block_in_place", # Allow this for now, to deny it later once we stop using Handle::block_on completely # "tokio::runtime::Handle::block_on", # tokio-epoll-uring: # - allow-invalid because the method doesn't exist on macOS { path = "tokio_epoll_uring::thread_local_system", replacement = "tokio_epoll_uring_ext module inside pageserver crate", allow-invalid = true } ] disallowed-macros = [ # use std::pin::pin "futures::pin_mut", # cannot disallow this, because clippy finds used from tokio macros #"tokio::pin", ] allow-unwrap-in-tests = true ================================================ FILE: compute/.gitignore ================================================ # sql_exporter config files generated from Jsonnet etc/neon_collector.yml etc/neon_collector_autoscaling.yml etc/sql_exporter.yml etc/sql_exporter_autoscaling.yml # Node.js dependencies node_modules/ ================================================ FILE: compute/Makefile ================================================ jsonnet_files = $(wildcard \ etc/*.jsonnet \ etc/sql_exporter/*.libsonnet) .PHONY: all all: neon_collector.yml neon_collector_autoscaling.yml sql_exporter.yml sql_exporter_autoscaling.yml neon_collector.yml: $(jsonnet_files) JSONNET_PATH=jsonnet:etc jsonnet \ --output-file etc/$@ \ --ext-str pg_version=$(PG_VERSION) \ etc/neon_collector.jsonnet neon_collector_autoscaling.yml: $(jsonnet_files) JSONNET_PATH=jsonnet:etc jsonnet \ --output-file etc/$@ \ --ext-str pg_version=$(PG_VERSION) \ etc/neon_collector_autoscaling.jsonnet sql_exporter.yml: $(jsonnet_files) JSONNET_PATH=etc jsonnet \ --output-file etc/$@ \ --tla-str collector_name=neon_collector \ --tla-str 
collector_file=neon_collector.yml \ --tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter&pgaudit.log=none' \ etc/sql_exporter.jsonnet sql_exporter_autoscaling.yml: $(jsonnet_files) JSONNET_PATH=etc jsonnet \ --output-file etc/$@ \ --tla-str collector_name=neon_collector_autoscaling \ --tla-str collector_file=neon_collector_autoscaling.yml \ --tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling&pgaudit.log=none' \ etc/sql_exporter.jsonnet .PHONY: clean clean: $(RM) \ etc/neon_collector.yml \ etc/neon_collector_autoscaling.yml \ etc/sql_exporter.yml \ etc/sql_exporter_autoscaling.yml .PHONY: jsonnetfmt-test jsonnetfmt-test: jsonnetfmt --test $(jsonnet_files) .PHONY: jsonnetfmt-format jsonnetfmt-format: jsonnetfmt --in-place $(jsonnet_files) .PHONY: manifest-schema-validation manifest-schema-validation: ../build-tools/node_modules npx --prefix=../build-tools/ jsonschema validate -d https://json-schema.org/draft/2020-12/schema manifest.schema.json manifest.yaml ../build-tools/node_modules: ../build-tools/package.json cd ../build-tools && $(if $(CI),npm ci,npm install) touch ../build-tools/node_modules ================================================ FILE: compute/README.md ================================================ This directory contains files that are needed to build the compute images, or included in the compute images. compute-node.Dockerfile To build the compute image vm-image-spec.yaml Instructions for vm-builder, to turn the compute-node image into corresponding vm-compute-node image. etc/ Configuration files included in /etc in the compute image patches/ Some extensions need to be patched to work with Neon. This directory contains such patches. 
They are applied to the extension sources in compute-node.Dockerfile In addition to these, postgres itself, the neon postgres extension, and compute_ctl are built and copied into the compute image by compute-node.Dockerfile. ================================================ FILE: compute/compute-node.Dockerfile ================================================ # # This Dockerfile builds the compute image. It is built multiple times to produce # different images for each PostgreSQL major version. # # We use Debian as the base for all the steps. The production images use Debian bookworm # for v17, and Debian bullseye for older PostgreSQL versions. # # ## Intermediary layers # # build-tools: This contains Rust compiler toolchain and other tools needed at compile # time. This is also used for the storage builds. This image is defined in # build-tools/Dockerfile. # # build-deps: Contains C compiler, other build tools, and compile-time dependencies # needed to compile PostgreSQL and most extensions. (Some extensions need # extra tools and libraries that are not included in this image. They are # installed in the extension-specific build stages.) # # pg-build: Result of compiling PostgreSQL. The PostgreSQL binaries are copied from # this to the final image. This is also used as the base for compiling all # the extensions. # # compute-tools: This contains compute_ctl, the launcher program that starts Postgres # in Neon. It also contains a few other tools that are built from the # sources from this repository and used in compute VMs: 'fast_import' and # 'local_proxy' # # ## Extensions # # By convention, the build of each extension consists of two layers: # # {extension}-src: Contains the source tarball, possible neon-specific patches, and # the extracted tarball with the patches applied. All of these are # under the /ext-src/ directory. 
# # {extension}-build: Contains the installed extension files, under /usr/local/pgsql # (in addition to the PostgreSQL binaries inherited from the pg-build # image). A few extensions need extra libraries or other files # installed elsewhere in the filesystem. They are installed by ONBUILD # directives. # # These are merged together into two layers: # # all-extensions: All the extension -build layers merged together # # extension-tests: All the extension -src layers merged together. This is used by the # extension tests. The tests are executed against the compiled image, # but the tests need test scripts, expected result files etc. from the # original sources, which are not included in the binary image. # # ## Extra components # # These are extras included in the compute image, but are not directly used by PostgreSQL # itself. # # pgbouncer: pgbouncer and its configuration # # sql_exporter: Metrics exporter daemon. # # postgres_exporter: Another metrics exporter daemon, for different sets of metrics. # # The configuration files for the metrics exporters are under the etc/ directory. We use # a templating system to handle variations between different PostgreSQL versions, # building slightly different config files for each PostgreSQL version. # # # ## Final image # # The final image puts together the PostgreSQL binaries (pg-build), the compute tools # (compute-tools), all the extensions (all-extensions) and the extra components into # one image. # # VM image: The final image built by this dockerfile isn't actually the final image that # we use in compute VMs. There's an extra step that adds some files and makes other # small adjustments, and builds the QCOW2 filesystem image suitable for using in a VM. # That step is done by the 'vm-builder' tool. See the vm-compute-node-image job in the # build_and_test.yml github workflow for how that's done.
ARG PG_VERSION ARG BUILD_TAG ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim # Here are the INDEX DIGESTS for the images we use. # For now, you can get them by following these steps: # 1. Get an authentication token from DockerHub: # TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token) # 2. Using that token, query index for the given tag: # curl -s -H "Authorization: Bearer $TOKEN" \ # -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \ # "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \ # -I | grep -i docker-content-digest # 3. As a next step, TODO(fedordikarev): create a script and schedule a workflow to run these checks # and updates on a regular basis and in an automated way. ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 # Here we use the ${var/search/replace} syntax to check # whether the base image is one of the images we pin the image index for. # If var matches one of the known images, we replace it with the known sha. # If there is no match, the value is left unaffected and the build proceeds with an unpinned image. ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR} ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA} ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA} # By default, build all PostgreSQL extensions.
For quick local testing when you don't # care about the extensions, pass EXTENSIONS=none or EXTENSIONS=minimal ARG EXTENSIONS=all ######################################################################################### # # Layer "build-deps" # ######################################################################################### FROM $BASE_IMAGE_SHA AS build-deps ARG DEBIAN_VERSION # Keep in sync with build-tools/Dockerfile ENV PROTOC_VERSION=25.1 # Use strict mode for bash to catch errors early SHELL ["/bin/bash", "-euo", "pipefail", "-c"] # By default, /bin/sh used in debian images will treat '\n' as eol, # but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that. RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ echo -e "retry_connrefused = on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc RUN case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. # Install newer version (3.25) from backports. 
# libstdc++-10-dev is required for plv8 bullseye) \ echo "deb http://archive.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \ VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports libstdc++-10-dev"; \ ;; \ # Version-specific installs for Bookworm (PG17): bookworm) \ VERSION_INSTALLS="cmake libstdc++-12-dev"; \ ;; \ *) \ echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ ;; \ esac && \ apt update && \ apt install --no-install-recommends --no-install-suggests -y \ ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \ libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \ libclang-dev \ jsonnet \ $VERSION_INSTALLS \ && apt clean && rm -rf /var/lib/apt/lists/* \ && useradd -ms /bin/bash nonroot -b /home \ # Install protoc from binary release, since Debian's versions are too old. && curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ && unzip -q protoc.zip -d protoc \ && mv protoc/bin/protoc /usr/local/bin/protoc \ && mv protoc/include/google /usr/local/include/google \ && rm -rf protoc.zip protoc ######################################################################################### # # Layer "pg-build" # Build Postgres from the neon postgres repository. # ######################################################################################### FROM build-deps AS pg-build ARG PG_VERSION COPY vendor/postgres-${PG_VERSION:?} postgres COPY compute/patches/postgres_fdw.patch . COPY compute/patches/pg_stat_statements_pg14-16.patch . COPY compute/patches/pg_stat_statements_pg17.patch . 
RUN cd postgres && \ # Apply patches to some contrib extensions # For example, we need to grant EXECUTE on pg_stat_statements_reset() to {privileged_role_name}. # In vanilla Postgres this function is limited to Postgres role superuser. # In Neon we have {privileged_role_name} role that is not a superuser but replaces superuser in some cases. # We could add the additional grant statements to the Postgres repository but it would be hard to maintain, # whenever we need to pick up a new Postgres version and we want to limit the changes in our Postgres fork, # so we do it here. case "${PG_VERSION}" in \ "v14" | "v15" | "v16") \ patch -p1 < /pg_stat_statements_pg14-16.patch; \ ;; \ "v17") \ patch -p1 < /pg_stat_statements_pg17.patch; \ ;; \ *) \ # Fail explicitly so that we do not forget to migrate the patches to the next major version echo "No contrib patches for this PostgreSQL version" && exit 1;; \ esac && \ patch -p1 < /postgres_fdw.patch && \ export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3 -fsigned-char' --enable-debug --with-openssl --with-uuid=ossp \ --with-icu --with-libxml --with-libxslt --with-lz4" && \ if [ "${PG_VERSION:?}" != "v14" ]; then \ # zstd is available only from PG15 export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \ fi && \ eval $CONFIGURE_CMD && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ # Mark some of the contrib extensions as trusted by appending to their control files echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgres_fdw.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \ echo 'trusted = true' >>
/usr/local/pgsql/share/extension/intagg.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_stat_statements.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control # Set PATH for all the subsequent build steps ENV PATH="/usr/local/pgsql/bin:$PATH" ######################################################################################### # # Layer "postgis-build" # Build PostGIS from the upstream PostGIS mirror. # ######################################################################################### FROM build-deps AS postgis-src ARG DEBIAN_VERSION ARG PG_VERSION # Postgis 3.5.0 requires SFCGAL 1.4+ # # It would be nice to update all versions together, but we must solve the SFCGAL dependency first. # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2 # and also we must check backward compatibility with older versions of PostGIS. # # The SFCGAL version is selected by Debian release: the new version is used only on bookworm (i.e. for v17) WORKDIR /ext-src RUN case "${DEBIAN_VERSION}" in \ "bookworm") \ export SFCGAL_VERSION=1.4.1 \ export SFCGAL_CHECKSUM=1800c8a26241588f11cddcf433049e9b9aea902e923414d2ecef33a3295626c3 \ ;; \ "bullseye") \ export SFCGAL_VERSION=1.3.10 \ export SFCGAL_CHECKSUM=4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 \ ;; \ *) \ # This case switches on DEBIAN_VERSION, so report the Debian version (not the PostgreSQL one) echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ ;; \ esac && \ wget https://gitlab.com/sfcgal/SFCGAL/-/archive/v${SFCGAL_VERSION}/SFCGAL-v${SFCGAL_VERSION}.tar.gz -O SFCGAL.tar.gz && \ echo "${SFCGAL_CHECKSUM} SFCGAL.tar.gz" | sha256sum --check && \ mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C .
# Postgis 3.5.0 supports v17 WORKDIR /ext-src RUN case "${PG_VERSION:?}" in \ "v17") \ export POSTGIS_VERSION=3.5.0 \ export POSTGIS_CHECKSUM=ca698a22cc2b2b3467ac4e063b43a28413f3004ddd505bdccdd74c56a647f510 \ ;; \ "v14" | "v15" | "v16") \ export POSTGIS_VERSION=3.3.3 \ export POSTGIS_CHECKSUM=74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 \ ;; \ *) \ echo "unexpected PostgreSQL version" && exit 1 \ ;; \ esac && \ wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \ echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . # This is reused for pgrouting FROM pg-build AS postgis-build-deps RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y \ gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ protobuf-c-compiler xsltproc \ && apt clean && rm -rf /var/lib/apt/lists/* FROM postgis-build-deps AS postgis-build COPY --from=postgis-src /ext-src/ /ext-src/ WORKDIR /ext-src/sfcgal-src RUN cmake -DCMAKE_BUILD_TYPE=Release -GNinja . 
&& ninja -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \ ninja clean && cp -R /sfcgal/* / WORKDIR /ext-src/postgis-src RUN ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ make staged-install && \ cd extensions/postgis && \ make clean && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_sfcgal.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \ mkdir -p /extensions/postgis && \ cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \ cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \ cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \ cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \ cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \ cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \ cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis ######################################################################################### # # Layer "pgrouting-build" # Build pgrouting. 
Note: This depends on the postgis-build-deps layer built above # ######################################################################################### # Uses versioned libraries, i.e. libpgrouting-3.4 # and may introduce function signature changes between releases # i.e. release 3.5.0 has new signature for pg_dijkstra function # # Use new version only for v17 # last release v3.6.2 - Mar 30, 2024 FROM build-deps AS pgrouting-src ARG DEBIAN_VERSION ARG PG_VERSION WORKDIR /ext-src RUN case "${PG_VERSION:?}" in \ "v17") \ export PGROUTING_VERSION=3.6.2 \ export PGROUTING_CHECKSUM=f4a1ed79d6f714e52548eca3bb8e5593c6745f1bde92eb5fb858efd8984dffa2 \ ;; \ "v14" | "v15" | "v16") \ export PGROUTING_VERSION=3.4.2 \ export PGROUTING_CHECKSUM=cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e \ ;; \ *) \ echo "unexpected PostgreSQL version" && exit 1 \ ;; \ esac && \ wget https://github.com/pgRouting/pgrouting/archive/v${PGROUTING_VERSION}.tar.gz -O pgrouting.tar.gz && \ echo "${PGROUTING_CHECKSUM} pgrouting.tar.gz" | sha256sum --check && \ mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . FROM postgis-build-deps AS pgrouting-build COPY --from=pgrouting-src /ext-src/ /ext-src/ WORKDIR /ext-src/pgrouting-src RUN mkdir build && cd build && \ cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. && \ ninja -j $(getconf _NPROCESSORS_ONLN) && \ ninja -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control ######################################################################################### # # Layer "plv8-build" # Build plv8 # ######################################################################################### FROM build-deps AS plv8-src ARG PG_VERSION WORKDIR /ext-src COPY compute/patches/plv8* . 
# plv8 3.2.3 supports v17 # last release v3.2.3 - Sep 7, 2024 # # clone the repo instead of downloading the release tarball because plv8 has submodule dependencies # and the release tarball doesn't include them # # Use new version only for v17 # because since v3.2, plv8 doesn't include plcoffee and plls extensions RUN case "${PG_VERSION:?}" in \ "v17") \ export PLV8_TAG=v3.2.3 \ ;; \ "v14" | "v15" | "v16") \ export PLV8_TAG=v3.1.10 \ ;; \ *) \ echo "unexpected PostgreSQL version" && exit 1 \ ;; \ esac && \ git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ tar -czf plv8.tar.gz --exclude .git plv8-src && \ cd plv8-src && \ if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8_v3.1.10.patch; else patch -p1 < /ext-src/plv8_v3.2.3.patch; fi # Step 1: Build the vendored V8 engine. It doesn't depend on PostgreSQL, so use # 'build-deps' as the base. This enables caching and avoids unnecessary rebuilds. # (The V8 engine takes a very long time to build) FROM build-deps AS plv8-build ARG PG_VERSION WORKDIR /ext-src/plv8-src RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y \ ninja-build python3-dev libncurses5 binutils clang \ && apt clean && rm -rf /var/lib/apt/lists/* COPY --from=plv8-src /ext-src/ /ext-src/ RUN make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) v8 # Step 2: Build the PostgreSQL-dependent parts COPY --from=pg-build /usr/local/pgsql /usr/local/pgsql ENV PATH="/usr/local/pgsql/bin:$PATH" RUN \ # generate and copy upgrade scripts make generate_upgrades && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ # don't break computes with installed old version of plv8 cd /usr/local/pgsql/lib/ && \ case "${PG_VERSION:?}" in \ "v17") \ ln -s plv8-3.2.3.so plv8-3.1.8.so && \ ln -s plv8-3.2.3.so plv8-3.1.5.so && \ ln -s plv8-3.2.3.so 
plv8-3.1.10.so \ ;; \ "v14" | "v15" | "v16") \ ln -s plv8-3.1.10.so plv8-3.1.5.so && \ ln -s plv8-3.1.10.so plv8-3.1.8.so \ ;; \ esac && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control ######################################################################################### # # Layer "h3-pg-build" # Build h3_pg # ######################################################################################### FROM build-deps AS h3-pg-src ARG PG_VERSION WORKDIR /ext-src # not version-specific # last release v4.1.0 - Jan 18, 2023 RUN mkdir -p /h3/usr/ && \ wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . # not version-specific # last release v4.1.3 - Jul 26, 2023 WORKDIR /ext-src RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . FROM pg-build AS h3-pg-build COPY --from=h3-pg-src /ext-src/ /ext-src/ WORKDIR /ext-src/h3-src RUN mkdir build && cd build && \ cmake .. 
-GNinja -DBUILD_BENCHMARKS=0 -DCMAKE_BUILD_TYPE=Release \ -DBUILD_FUZZERS=0 -DBUILD_FILTERS=0 -DBUILD_GENERATORS=0 -DBUILD_TESTING=0 \ && ninja -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/h3 ninja install && \ cp -R /h3/usr / && \ rm -rf build WORKDIR /ext-src/h3-pg-src RUN ls -l && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control ######################################################################################### # # Layer "postgresql-unit-build" # compile unit extension # ######################################################################################### FROM build-deps AS postgresql-unit-src ARG PG_VERSION # not version-specific # last release 7.9 - Sep 15, 2024 WORKDIR /ext-src RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \ echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \ mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . FROM pg-build AS postgresql-unit-build COPY --from=postgresql-unit-src /ext-src/ /ext-src/ WORKDIR /ext-src/postgresql-unit-src RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. # We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path. # This one-liner removes pgsql/ part of the path. # NOTE: Other extensions that rely on MODULEDIR variable after building phase will need the same fix. 
find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control ######################################################################################### # # Layer "pgvector-build" # compile pgvector extension # ######################################################################################### FROM build-deps AS pgvector-src ARG PG_VERSION WORKDIR /ext-src COPY compute/patches/pgvector.patch . # By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. # # vector >0.7.4 supports v17 # last release v0.8.0 - Oct 30, 2024 RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O pgvector.tar.gz && \ echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . 
&& \ wget https://github.com/pgvector/pgvector/raw/refs/tags/v0.7.4/sql/vector.sql -O ./sql/vector--0.7.4.sql && \ echo "10218d05dc02299562252a9484775178b14a1d8edb92a2d1672ef488530f7778 ./sql/vector--0.7.4.sql" | sha256sum --check && \ patch -p1 < /ext-src/pgvector.patch FROM pg-build AS pgvector-build COPY --from=pgvector-src /ext-src/ /ext-src/ WORKDIR /ext-src/pgvector-src RUN make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control ######################################################################################### # # Layer "pgjwt-build" # compile pgjwt extension # ######################################################################################### FROM build-deps AS pgjwt-src ARG PG_VERSION # not version-specific # doesn't use releases, last commit f3d82fd - Mar 2, 2023 WORKDIR /ext-src RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \ echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . 
FROM pg-build AS pgjwt-build
COPY --from=pgjwt-src /ext-src/ /ext-src/
WORKDIR /ext-src/pgjwt-src
# Only `make install` is run for pgjwt; afterwards the extension is marked trusted.
RUN make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control

#########################################################################################
#
# Layer "hypopg-build"
# compile hypopg extension
#
#########################################################################################
FROM build-deps AS hypopg-src
ARG PG_VERSION

# HypoPG 1.4.1 supports v17
# last release 1.4.1 - Apr 28, 2024
WORKDIR /ext-src
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \
    echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \
    mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C .

FROM pg-build AS hypopg-build
COPY --from=hypopg-src /ext-src/ /ext-src/
WORKDIR /ext-src/hypopg-src
RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control

#########################################################################################
#
# Layer "online_advisor-build"
# compile online_advisor extension
#
#########################################################################################
FROM build-deps AS online_advisor-src
ARG PG_VERSION

# online_advisor supports all Postgres versions starting from PG14, but prior to PG17
# it has to be included in shared_preload_libraries.
# last release 1.0 - May 15, 2025
#
# Only v17 downloads the sources. For any other version the RUN exits successfully
# without creating online_advisor-src, which the build stage below checks for.
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
    "v17") \
    ;; \
    *) \
        echo "skipping the version of online_advisor for $PG_VERSION" && exit 0 \
    ;; \
    esac && \
    wget https://github.com/knizhnik/online_advisor/archive/refs/tags/1.0.tar.gz -O online_advisor.tar.gz && \
    echo "37dcadf8f7cc8d6cc1f8831276ee245b44f1b0274f09e511e47a67738ba9ed0f online_advisor.tar.gz" | sha256sum --check && \
    mkdir online_advisor-src && cd online_advisor-src && tar xzf ../online_advisor.tar.gz --strip-components=1 -C .

FROM pg-build AS online_advisor-build
COPY --from=online_advisor-src /ext-src/ /ext-src/
WORKDIR /ext-src/
# The source directory exists only when the -src stage did not skip this version.
# Pass an explicit job count: a bare `-j` lets make start an unlimited number of jobs.
RUN if [ -d online_advisor-src ]; then \
        cd online_advisor-src && \
        make -j $(getconf _NPROCESSORS_ONLN) install && \
        echo 'trusted = true' >> /usr/local/pgsql/share/extension/online_advisor.control; \
    fi

#########################################################################################
#
# Layer "pg_hashids-build"
# compile pg_hashids extension
#
#########################################################################################
FROM build-deps AS pg_hashids-src
ARG PG_VERSION

# not version-specific
# last release v1.2.1 - Jan 12, 2018
WORKDIR /ext-src
RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
    echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
    mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C .

FROM pg-build AS pg_hashids-build
COPY --from=pg_hashids-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_hashids-src
RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control

#########################################################################################
#
# Layer "rum-build"
# compile rum extension
#
#########################################################################################
FROM build-deps AS rum-src
ARG PG_VERSION

WORKDIR /ext-src
COPY compute/patches/rum.patch .

# rum gained v17 support in
# https://github.com/postgrespro/rum/commit/cb1edffc57736cd2a4455f8d0feab0d69928da25
# Upstream has not cut a release since 1.3.13 (Sep 19, 2022), so that commit
# from the master branch is what gets downloaded here.
RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0feab0d69928da25.tar.gz -O rum.tar.gz && \
    echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \
    mkdir rum-src && \
    cd rum-src && \
    tar xzf ../rum.tar.gz --strip-components=1 -C . && \
    patch -p1 < /ext-src/rum.patch

FROM pg-build AS rum-build
COPY --from=rum-src /ext-src/ /ext-src/
WORKDIR /ext-src/rum-src
RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control

#########################################################################################
#
# Layer "pgtap-build"
# compile pgTAP extension
#
#########################################################################################
FROM build-deps AS pgtap-src
ARG PG_VERSION

# pgtap 1.3.3 supports v17
# last release v1.3.3 - Apr 8, 2024
WORKDIR /ext-src
RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \
    echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \
    mkdir pgtap-src && \
    cd pgtap-src && \
    tar xzf ../pgtap.tar.gz --strip-components=1 -C .

FROM pg-build AS pgtap-build
COPY --from=pgtap-src /ext-src/ /ext-src/
WORKDIR /ext-src/pgtap-src
# Build, install, then mark the extension as trusted.
RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control

#########################################################################################
#
# Layer "ip4r-build"
# compile ip4r extension
#
#########################################################################################
FROM build-deps AS ip4r-src
ARG PG_VERSION

# not version-specific
# last release v2.4.2 - Jul 29, 2023
WORKDIR /ext-src
RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
    echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
    mkdir ip4r-src && \
    cd ip4r-src && \
    tar xzf ../ip4r.tar.gz --strip-components=1 -C .

FROM pg-build AS ip4r-build
COPY --from=ip4r-src /ext-src/ /ext-src/
WORKDIR /ext-src/ip4r-src
RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control

#########################################################################################
#
# Layer "prefix-build"
# compile Prefix extension
#
#########################################################################################
FROM build-deps AS prefix-src
ARG PG_VERSION

# not version-specific
# last release v1.2.10 - Jul 5, 2023
WORKDIR /ext-src
RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
    echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
    mkdir prefix-src && \
    cd prefix-src && \
    tar xzf ../prefix.tar.gz --strip-components=1 -C .

FROM pg-build AS prefix-build
COPY --from=prefix-src /ext-src/ /ext-src/
WORKDIR /ext-src/prefix-src
# Build, install, then mark the extension as trusted.
RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control

#########################################################################################
#
# Layer "hll-build"
# compile hll extension
#
#########################################################################################
FROM build-deps AS hll-src
ARG PG_VERSION

# not version-specific
# last release v2.18 - Aug 29, 2023
WORKDIR /ext-src
RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
    echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
    mkdir hll-src && \
    cd hll-src && \
    tar xzf ../hll.tar.gz --strip-components=1 -C .

FROM pg-build AS hll-build
COPY --from=hll-src /ext-src/ /ext-src/
WORKDIR /ext-src/hll-src
RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control

#########################################################################################
#
# Layer "plpgsql_check-build"
# compile plpgsql_check extension
#
#########################################################################################
FROM build-deps AS plpgsql_check-src
ARG PG_VERSION

# plpgsql_check v2.7.11 supports v17
# last release v2.7.11 - Sep 16, 2024
WORKDIR /ext-src
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \
    echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \
    mkdir plpgsql_check-src && \
    cd plpgsql_check-src && \
    tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C .

FROM pg-build AS plpgsql_check-build
COPY --from=plpgsql_check-src /ext-src/ /ext-src/
WORKDIR /ext-src/plpgsql_check-src
# Build and install through PGXS, then mark the extension as trusted.
RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control

#########################################################################################
#
# Layer "timescaledb-build"
# compile timescaledb extension
#
#########################################################################################
FROM build-deps AS timescaledb-src
ARG PG_VERSION

# The timescaledb release (and its tarball checksum) is selected per Postgres
# version. An unrecognized PG_VERSION fails the build right away; without the
# default branch the download below would run with an empty version and checksum.
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
    "v14" | "v15") \
        export TIMESCALEDB_VERSION=2.10.1 \
        export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
    ;; \
    "v16") \
        export TIMESCALEDB_VERSION=2.13.0 \
        export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
    ;; \
    "v17") \
        export TIMESCALEDB_VERSION=2.17.1 \
        export TIMESCALEDB_CHECKSUM=6277cf43f5695e23dae1c5cfeba00474d730b66ed53665a84b787a6bb1a57e28 \
    ;; \
    *) \
        echo "unexpected PostgreSQL version" && exit 1 \
    ;; \
    esac && \
    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
    echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
    mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C .

FROM pg-build AS timescaledb-build
COPY --from=timescaledb-src /ext-src/ /ext-src/
WORKDIR /ext-src/timescaledb-src
# Bootstrap with telemetry switched off and APACHE_ONLY enabled, then build and
# install from the generated build/ directory.
RUN ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
    cd build && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/timescaledb.control

#########################################################################################
#
# Layer "pg_hint_plan-build"
# compile pg_hint_plan extension
#
#########################################################################################
FROM build-deps AS pg_hint_plan-src
ARG PG_VERSION

# version-specific: upstream publishes a separate release for each Postgres version,
# so both the tag and the checksum are picked from PG_VERSION
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
    "v14") \
        export PG_HINT_PLAN_VERSION=14_1_4_1 \
        export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \
    ;; \
    "v15") \
        export PG_HINT_PLAN_VERSION=15_1_5_0 \
        export PG_HINT_PLAN_CHECKSUM=564cbbf4820973ffece63fbf76e3c0af62c4ab23543142c7caaa682bc48918be \
    ;; \
    "v16") \
        export PG_HINT_PLAN_VERSION=16_1_6_0 \
        export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \
    ;; \
    "v17") \
        export PG_HINT_PLAN_VERSION=17_1_7_0 \
        export PG_HINT_PLAN_CHECKSUM=06dd306328c67a4248f48403c50444f30959fb61ebe963248dbc2afb396fe600 \
    ;; \
    *) \
        echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \
    ;; \
    esac && \
    wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \
    echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \
    mkdir pg_hint_plan-src && \
    cd pg_hint_plan-src && \
    tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C .

FROM pg-build AS pg_hint_plan-build
COPY --from=pg_hint_plan-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_hint_plan-src
# Build, install, then mark the extension as trusted.
RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control

#########################################################################################
#
# Layer "pg_cron-build"
# compile pg_cron extension
#
#########################################################################################
FROM build-deps AS pg_cron-src
ARG PG_VERSION

# This is an experimental extension that we do not support on prod yet.
# !Do not remove!
# The library is listed in shared_preload_libraries, so computes will fail to
# start if it cannot be found.
WORKDIR /ext-src
COPY compute/patches/pg_cron.patch .
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \
    echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \
    mkdir pg_cron-src && \
    cd pg_cron-src && \
    tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
    patch < /ext-src/pg_cron.patch

FROM pg-build AS pg_cron-build
COPY --from=pg_cron-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_cron-src
RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control

#########################################################################################
#
# Layer "rdkit-build"
# compile rdkit extension
#
#########################################################################################
FROM build-deps AS rdkit-src
ARG PG_VERSION

# rdkit Release_2024_09_1 supports v17
# last release Release_2024_09_1 - Sep 27, 2024
#
# The new version is used only for v17, because Release_2024_09_1 has some
# backward incompatible changes:
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
    "v17") \
        export RDKIT_VERSION=Release_2024_09_1 \
        export RDKIT_CHECKSUM=034c00d6e9de323506834da03400761ed8c3721095114369d06805409747a60f \
    ;; \
    "v14" | "v15" | "v16") \
        export RDKIT_VERSION=Release_2023_03_3 \
        export RDKIT_CHECKSUM=bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d \
    ;; \
    *) \
        echo "unexpected PostgreSQL version" && exit 1 \
    ;; \
    esac && \
    wget https://github.com/rdkit/rdkit/archive/refs/tags/${RDKIT_VERSION}.tar.gz -O rdkit.tar.gz && \
    echo "${RDKIT_CHECKSUM} rdkit.tar.gz" | sha256sum --check && \
    mkdir rdkit-src && \
    cd rdkit-src && \
    tar xzf ../rdkit.tar.gz --strip-components=1 -C .

FROM pg-build AS rdkit-build

# Extra build dependencies (Boost and Eigen development packages) for rdkit.
RUN apt update && \
    apt install --no-install-recommends --no-install-suggests -y \
        libboost-iostreams1.74-dev \
        libboost-regex1.74-dev \
        libboost-serialization1.74-dev \
        libboost-system1.74-dev \
        libeigen3-dev \
        libboost-all-dev \
    && apt clean && rm -rf /var/lib/apt/lists/*

COPY --from=rdkit-src /ext-src/ /ext-src/
WORKDIR /ext-src/rdkit-src

# XXX: /usr/local/pgsql/bin is already in PATH, which should be enough to find
# pg_config. For some reason the rdkit cmake script doesn't work with just that,
# however. Adding /usr/local/pgsql as well makes it work, which is weird because
# there are no executables in that directory.
ENV PATH="/usr/local/pgsql:$PATH"
RUN cmake \
        -D RDK_BUILD_CAIRO_SUPPORT=OFF \
        -D RDK_BUILD_INCHI_SUPPORT=ON \
        -D RDK_BUILD_AVALON_SUPPORT=ON \
        -D RDK_BUILD_PYTHON_WRAPPERS=OFF \
        -D RDK_BUILD_DESCRIPTORS3D=OFF \
        -D RDK_BUILD_FREESASA_SUPPORT=OFF \
        -D RDK_BUILD_COORDGEN_SUPPORT=ON \
        -D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \
        -D RDK_BUILD_YAEHMOP_SUPPORT=OFF \
        -D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \
        -D RDK_TEST_MULTITHREADED=OFF \
        -D RDK_BUILD_CPP_TESTS=OFF \
        -D RDK_USE_URF=OFF \
        -D RDK_BUILD_PGSQL=ON \
        -D RDK_PGSQL_STATIC=ON \
        -D PostgreSQL_CONFIG=pg_config \
        -D PostgreSQL_INCLUDE_DIR=`pg_config --includedir` \
        -D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
        -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
        -D RDK_INSTALL_INTREE=OFF \
        -D RDK_INSTALL_COMIC_FONTS=OFF \
        -D RDK_BUILD_FREETYPE_SUPPORT=OFF \
        -D CMAKE_BUILD_TYPE=Release \
        -GNinja \
        . && \
    ninja -j $(getconf _NPROCESSORS_ONLN) && \
    ninja -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control

#########################################################################################
#
# Layer "pg_uuidv7-build"
# compile pg_uuidv7 extension
#
#########################################################################################
FROM build-deps AS pg_uuidv7-src
ARG PG_VERSION

# not version-specific
# last release v1.6.0 - Oct 9, 2024
WORKDIR /ext-src
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \
    echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \
    mkdir pg_uuidv7-src && \
    cd pg_uuidv7-src && \
    tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C .

FROM pg-build AS pg_uuidv7-build
COPY --from=pg_uuidv7-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_uuidv7-src
# Build, install, then mark the extension as trusted.
RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control

#########################################################################################
#
# Layer "pg_roaringbitmap-build"
# compile pg_roaringbitmap extension
#
#########################################################################################
FROM build-deps AS pg_roaringbitmap-src
ARG PG_VERSION

# not version-specific
# last release v0.5.4 - Jun 28, 2022
WORKDIR /ext-src
RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
    echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
    mkdir pg_roaringbitmap-src && \
    cd pg_roaringbitmap-src && \
    tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C .

FROM pg-build AS pg_roaringbitmap-build
COPY --from=pg_roaringbitmap-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_roaringbitmap-src
# Note that the control file written to is roaringbitmap.control (no pg_ prefix).
RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control

#########################################################################################
#
# Layer "pg_semver-build"
# compile pg_semver extension
#
#########################################################################################
FROM build-deps AS pg_semver-src
ARG PG_VERSION

# Release 0.40.0 breaks backward compatibility with previous versions, see the
# release note https://github.com/theory/pg-semver/releases/tag/v0.40.0
# The new version is therefore used only for v17.
#
# last release v0.40.0 - Jul 22, 2024
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
    "v17") \
        export SEMVER_VERSION=0.40.0 \
        export SEMVER_CHECKSUM=3e50bcc29a0e2e481e7b6d2bc937cadc5f5869f55d983b5a1aafeb49f5425cfc \
    ;; \
    "v14" | "v15" | "v16") \
        export SEMVER_VERSION=0.32.1 \
        export SEMVER_CHECKSUM=fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 \
    ;; \
    *) \
        echo "unexpected PostgreSQL version" && exit 1 \
    ;; \
    esac && \
    wget https://github.com/theory/pg-semver/archive/refs/tags/v${SEMVER_VERSION}.tar.gz -O pg_semver.tar.gz && \
    echo "${SEMVER_CHECKSUM} pg_semver.tar.gz" | sha256sum --check && \
    mkdir pg_semver-src && \
    cd pg_semver-src && \
    tar xzf ../pg_semver.tar.gz --strip-components=1 -C .

FROM pg-build AS pg_semver-build
COPY --from=pg_semver-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_semver-src
# Note that the control file written to is semver.control (no pg_ prefix).
RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control

#########################################################################################
#
# Layer "build-deps with Rust toolchain installed"
#
#########################################################################################
FROM build-deps AS build-deps-with-cargo

ENV HOME=/home/nonroot
ENV PATH="/home/nonroot/.cargo/bin:$PATH"
USER nonroot
WORKDIR /home/nonroot

# See the comment at the top of the file regarding `echo` and `\n`
RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc

# Fetch rustup-init for the current architecture, install the minimal profile of
# the stable toolchain without touching PATH (it is set above), then clean up.
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
    chmod +x rustup-init && \
    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
    rm rustup-init

#########################################################################################
#
# Layer "pg-build with Rust toolchain installed"
# This layer is base and common for layers with `pgrx`
#
#########################################################################################
FROM pg-build AS pg-build-with-cargo
ARG PG_VERSION

ENV HOME=/home/nonroot
ENV PATH="/home/nonroot/.cargo/bin:$PATH"
USER nonroot
WORKDIR /home/nonroot

# Reuse the home directory (including the Rust toolchain) prepared in build-deps-with-cargo.
COPY --from=build-deps-with-cargo /home/nonroot /home/nonroot

#########################################################################################
#
# Layer "rust extensions"
# This layer is used to build `pgrx` deps
#
#########################################################################################
FROM pg-build-with-cargo AS rust-extensions-build
ARG PG_VERSION

# For v17 this RUN exits early (successfully) without installing cargo-pgrx 0.11.3.
RUN case "${PG_VERSION:?}" in \
    'v17') \
        echo 'v17 is not supported yet by pgrx. Quit' && exit 0;; \
    esac && \
    cargo install --locked --version 0.11.3 cargo-pgrx && \
    /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'

USER root

#########################################################################################
#
# Layer "rust extensions pgrx12"
#
# pgrx started to support Postgres 17 since version 12,
# but some older extensions aren't compatible with it.
# This layer should be used as a base for new pgrx extensions,
# and eventually get merged with `rust-extensions-build`
#
#########################################################################################
FROM pg-build-with-cargo AS rust-extensions-build-pgrx12
ARG PG_VERSION

RUN cargo install --locked --version 0.12.9 cargo-pgrx && \
    /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'

USER root

#########################################################################################
#
# Layer "rust extensions pgrx14"
#
# pgrx version 14 is now required by a few extensions.
# This layer should be used as a base for new pgrx extensions,
# and eventually get merged with `rust-extensions-build`
#
#########################################################################################
FROM pg-build-with-cargo AS rust-extensions-build-pgrx14
ARG PG_VERSION

RUN cargo install --locked --version 0.14.1 cargo-pgrx && \
    /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'

USER root

#########################################################################################
#
# Layers "pg-onnx-build" and "pgrag-build"
# Compile "pgrag" extensions
#
#########################################################################################

FROM build-deps AS pgrag-src
ARG PG_VERSION

WORKDIR /ext-src
COPY compute/patches/onnxruntime.patch .

# NOTE(review): unlike the other downloads in this section, the onnxruntime tarball
# is not verified with `sha256sum --check` - consider pinning its checksum.
RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \
    mkdir onnxruntime-src && \
    cd onnxruntime-src && \
    tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \
    patch -p1 < /ext-src/onnxruntime.patch && \
    echo "#nothing to test here" > neon-test.sh

RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.2.tar.gz -O pgrag.tar.gz && \
    echo "7361654ea24f08cbb9db13c2ee1c0fe008f6114076401bb871619690dafc5225 pgrag.tar.gz" | sha256sum --check && \
    mkdir pgrag-src && \
    cd pgrag-src && \
    tar xzf ../pgrag.tar.gz --strip-components=1 -C .

FROM rust-extensions-build-pgrx14 AS pgrag-build
COPY --from=pgrag-src /ext-src/ /ext-src/

# Install build-time dependencies.
# cmake 3.26 or higher is required, so it is installed with pip (bullseye-backports has cmake 3.25).
# A virtual environment is used because Python 3.11 (the default version on
# Debian 12 (Bookworm)) complains otherwise.
WORKDIR /ext-src/onnxruntime-src
RUN apt update && apt install --no-install-recommends --no-install-suggests -y \
        python3 python3-pip python3-venv && \
    apt clean && rm -rf /var/lib/apt/lists/* && \
    python3 -m venv venv && \
    . venv/bin/activate && \
    python3 -m pip install cmake==3.30.5

RUN . venv/bin/activate && \
    ./build.sh --config Release --parallel --cmake_generator Ninja \
        --skip_submodule_sync --skip_tests --allow_running_as_root

WORKDIR /ext-src/pgrag-src
# Each pgrag extension gets the `unsafe-postgres` pgrx feature switched on in its
# Cargo.toml before it is installed and marked trusted.
RUN cd exts/rag && \
    sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control

RUN cd exts/rag_bge_small_en_v15 && \
    sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
    REMOTE_ONNX_URL=http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/pgrag-data/bge_small_en_v15.onnx \
    cargo pgrx install --release --features remote_onnx && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control

RUN cd exts/rag_jina_reranker_v1_tiny_en && \
    sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
    REMOTE_ONNX_URL=http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/pgrag-data/jina_reranker_v1_tiny_en.onnx \
    cargo pgrx install --release --features remote_onnx && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control

#########################################################################################
#
# Layer "pg_jsonschema-build"
# Compile "pg_jsonschema" extension
#
#########################################################################################

FROM build-deps AS pg_jsonschema-src
ARG PG_VERSION
# last release v0.3.3 - Oct 16, 2024
WORKDIR /ext-src
RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.gz -O pg_jsonschema.tar.gz && \
    echo "40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac pg_jsonschema.tar.gz" | sha256sum --check && \
    mkdir pg_jsonschema-src && \
    cd pg_jsonschema-src && \
    tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C .

FROM rust-extensions-build-pgrx12 AS pg_jsonschema-build
COPY --from=pg_jsonschema-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_jsonschema-src
RUN \
    # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8
    # The `unsafe-postgres` feature allows building pgx extensions against
    # postgres forks that decided to change their ABI name (like us).
    # With it we can build extensions without forking them, using stock pgx.
    # As this feature is new, a few manual version bumps were required.
    sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    sed -i 's/pgrx-tests = "0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \
    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control

#########################################################################################
#
# Layer "pg_graphql-build"
# Compile "pg_graphql" extension
#
#########################################################################################

FROM build-deps AS pg_graphql-src
ARG PG_VERSION

# last release v1.5.9 - Oct 16, 2024
WORKDIR /ext-src
COPY compute/patches/pg_graphql.patch .
RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \
    echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \
    mkdir pg_graphql-src && \
    cd pg_graphql-src && \
    tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "=0.12.9"/g' Cargo.toml && \
    patch -p1 < /ext-src/pg_graphql.patch

FROM rust-extensions-build-pgrx12 AS pg_graphql-build
COPY --from=pg_graphql-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_graphql-src
RUN cargo pgrx install --release && \
    # it's needed to enable extension because it uses untrusted C language
    sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control

#########################################################################################
#
# Layer "pg_tiktoken-build"
# Compile "pg_tiktoken" extension
#
#########################################################################################

FROM build-deps AS pg_tiktoken-src
ARG PG_VERSION

# doesn't use releases
# 9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7 - on Oct 29, 2024
WORKDIR /ext-src
RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \
    echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \
    mkdir pg_tiktoken-src && \
    cd pg_tiktoken-src && \
    tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = { version = "=0.12.6",/pgrx = { version = "0.12.9",/g' Cargo.toml && \
    sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml

FROM rust-extensions-build-pgrx12 AS pg_tiktoken-build
COPY --from=pg_tiktoken-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_tiktoken-src
RUN cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control

#########################################################################################
#
# Layer "pgx_ulid-build"
# Compile "pgx_ulid" extension for v16 and below
#
#########################################################################################

FROM build-deps AS pgx_ulid-src
ARG PG_VERSION

# Only v14-v16 download the sources. For other versions the RUN exits successfully
# without creating pgx_ulid-src, which the build stage below checks for.
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
    "v14" | "v15" | "v16") \
    ;; \
    *) \
        echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \
    ;; \
    esac && \
    wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \
    echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \
    mkdir pgx_ulid-src && \
    cd pgx_ulid-src && \
    tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml

FROM rust-extensions-build AS pgx_ulid-build
COPY --from=pgx_ulid-src /ext-src/ /ext-src/
WORKDIR /ext-src/
RUN if [ -d pgx_ulid-src ]; then \
        cd pgx_ulid-src && \
        cargo pgrx install --release && \
        echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control; \
    fi

#########################################################################################
#
# Layer "pgx_ulid-pgrx12-build"
# Compile "pgx_ulid" extension for v17 and up
#
#########################################################################################

FROM build-deps AS pgx_ulid-pgrx12-src
ARG PG_VERSION

# Only v17 downloads the sources. For other versions the RUN exits successfully
# without creating pgx_ulid-src, which the build stage below checks for.
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
    "v17") \
    ;; \
    *) \
        echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \
    ;; \
    esac && \
    wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.2.0.tar.gz -O pgx_ulid.tar.gz && \
    echo "cef6a9a2e5e7bd1a10a18989286586ee9e6c1c06005a4055cff190de41bf3e9f pgx_ulid.tar.gz" | sha256sum --check && \
    mkdir pgx_ulid-src && \
    cd pgx_ulid-src && \
    tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml

FROM rust-extensions-build-pgrx12 AS pgx_ulid-pgrx12-build
ARG PG_VERSION
WORKDIR /ext-src
COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/
# Note: this variant writes to pgx_ulid.control, whereas the pgx_ulid-build stage
# for older versions writes to ulid.control.
RUN if [ -d pgx_ulid-src ]; then \
        cd pgx_ulid-src && \
        cargo pgrx install --release && \
        echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control; \
    fi

#########################################################################################
#
# Layer "pg_session_jwt-build"
# Compile "pg_session_jwt" extension
#
#########################################################################################

FROM build-deps AS pg_session_jwt-src
ARG PG_VERSION

# NOTE: local_proxy depends on the version of pg_session_jwt
# Do not update without approval from the proxy team
# Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs
WORKDIR /ext-src
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.1.tar.gz -O pg_session_jwt.tar.gz && \
    echo "62fec9e472cb805c53ba24a0765afdb8ea2720cfc03ae7813e61687b36d1b0ad pg_session_jwt.tar.gz" | sha256sum --check && \
    mkdir pg_session_jwt-src && \
    cd pg_session_jwt-src && \
    tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \
    sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' pgrx-tests/Cargo.toml && \
    sed -i 's/pgrx-macros = "=0.12.6"/pgrx-macros = "=0.12.9"/g' pgrx-tests/Cargo.toml && \
    sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml

FROM rust-extensions-build-pgrx12 AS pg_session_jwt-build
COPY --from=pg_session_jwt-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_session_jwt-src
RUN cargo pgrx install --release

#########################################################################################
#
# Layer "pg-anon-pg-build"
# compile anon extension
#
#########################################################################################
FROM pg-build AS pg_anon-src
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
WORKDIR /ext-src
COPY compute/patches/anon_v2.patch .

# This is an experimental extension that never got to real production.
# !Do not remove! It can be present in shared_preload_libraries, and compute
# will fail to start if the library is not found.
ENV PATH="/usr/local/pgsql/bin/:$PATH"
# Besides unpacking and patching the sources, this records the list of files
# currently under /usr/local/pgsql (relative paths) in /before.txt.
RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/2.1.0/postgresql_anonymizer-latest.tar.gz -O pg_anon.tar.gz && \
    echo "48e7f5ae2f1ca516df3da86c5c739d48dd780a4e885705704ccaad0faa89d6c0 pg_anon.tar.gz" | sha256sum --check && \
    mkdir pg_anon-src && \
    cd pg_anon-src && \
    tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt && \
    sed -i 's/pgrx = "0.14.1"/pgrx = { version = "=0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    patch -p1 < /ext-src/anon_v2.patch

FROM rust-extensions-build-pgrx14 AS pg-anon-pg-build
ARG PG_VERSION
COPY --from=pg_anon-src /ext-src/ /ext-src/
WORKDIR /ext-src
# PGVER is derived from PG_VERSION by stripping the leading "v" (e.g. v17 -> pg17).
RUN cd pg_anon-src && \
    make -j $(getconf _NPROCESSORS_ONLN) extension PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \
    chmod -R a+r ../pg_anon-src && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control;

#########################################################################################
#
# Layer "wal2json-build"
# Compile "wal2json" extension
#
#########################################################################################

FROM build-deps AS wal2json-src
ARG PG_VERSION

# wal2json wal2json_2_6 supports v17
# last release wal2json_2_6 - Apr 25, 2024
WORKDIR /ext-src
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \
    echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \
    mkdir wal2json-src && \
    cd wal2json-src && \
    tar xzf ../wal2json.tar.gz --strip-components=1 -C .

# Compile and install wal2json from the sources unpacked by wal2json-src.
# No 'trusted = true' line is appended for this extension.
FROM pg-build AS wal2json-build
COPY --from=wal2json-src /ext-src/ /ext-src/
WORKDIR /ext-src/wal2json-src
RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install

#########################################################################################
#
# Layer "pg_ivm"
# compile pg_ivm extension
#
#########################################################################################

FROM build-deps AS pg_ivm-src
ARG PG_VERSION

# pg_ivm v1.9 supports v17
# last release v1.9 - Jul 31
WORKDIR /ext-src
# Download the release tarball, verify its sha256, and unpack it into pg_ivm-src.
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \
    echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \
    mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C .

# Build, install, and append 'trusted = true' to the installed control file.
FROM pg-build AS pg_ivm-build
COPY --from=pg_ivm-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_ivm-src
RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control

#########################################################################################
#
# Layer "pg_partman"
# compile pg_partman extension
#
#########################################################################################

FROM build-deps AS pg_partman-src
ARG PG_VERSION

# should support v17 https://github.com/pgpartman/pg_partman/discussions/693
# last release 5.1.0 Apr 2, 2024
WORKDIR /ext-src
# Download the release tarball, verify its sha256, and unpack it into pg_partman-src.
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \
    echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \
    mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C .
# Build, install, and append 'trusted = true' to the installed pg_partman control file.
FROM pg-build AS pg_partman-build
COPY --from=pg_partman-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_partman-src
RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control

#########################################################################################
#
# Layer "pg_mooncake"
# compile pg_mooncake extension
#
#########################################################################################
FROM build-deps AS pg_mooncake-src
ARG PG_VERSION
WORKDIR /ext-src
COPY compute/patches/duckdb_v113.patch .
# After unpacking, the bundled third_party/duckdb tree is patched with duckdb_v113.patch.
# The final two commands generate an executable neon-test.sh inside pg_mooncake-src that
# invokes the extension's `installcheck` target.
RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.2/pg_mooncake-0.1.2.tar.gz -O pg_mooncake.tar.gz && \
    echo "4550473784fcdd2e1e18062bc01eb9c286abd27cdf5e11a4399be6c0a426ba90 pg_mooncake.tar.gz" | sha256sum --check && \
    mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \
    cd third_party/duckdb && patch -p1 < /ext-src/duckdb_v113.patch && cd ../.. && \
    echo "make -f pg_mooncake-src/Makefile.build installcheck TEST_DIR=./test SQL_DIR=./sql SRC_DIR=./src" > neon-test.sh && \
    chmod a+x neon-test.sh

# Release build and install on the Rust-enabled build image, then mark the extension trusted.
FROM rust-extensions-build AS pg_mooncake-build
COPY --from=pg_mooncake-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_mooncake-src
RUN make release -j $(getconf _NPROCESSORS_ONLN) && \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control

#########################################################################################
#
# Layer "pg-duckdb-pg-build"
# compile pg_duckdb extension
#
#########################################################################################
FROM build-deps AS pg_duckdb-src
WORKDIR /ext-src
# Two patches are staged: one for pg_duckdb itself and one for its bundled duckdb submodule.
COPY compute/patches/pg_duckdb_v031.patch .
COPY compute/patches/duckdb_v120.patch .
# pg_duckdb build requires source dir to be a git repo to get submodules
# allow {privileged_role_name} to execute some functions that in pg_duckdb are available to superuser only:
# - extension management function duckdb.install_extension()
# - access to duckdb.extensions table and its sequence
#
# The clone is shallow (--depth 1) at tag v0.3.1; submodules are fetched recursively before
# the pg_duckdb patch is applied at the repo root and the duckdb patch inside third_party/duckdb.
RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \
    cd pg_duckdb-src && \
    git submodule update --init --recursive && \
    patch -p1 < /ext-src/pg_duckdb_v031.patch && \
    cd third_party/duckdb && \
    patch -p1 < /ext-src/duckdb_v120.patch

# Install directly (`make install` builds as needed) and mark the extension trusted.
FROM pg-build AS pg_duckdb-build
ARG PG_VERSION
COPY --from=pg_duckdb-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_duckdb-src
RUN make install -j $(getconf _NPROCESSORS_ONLN) && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control

#########################################################################################
#
# Layer "pg_repack"
# compile pg_repack extension
#
#########################################################################################

FROM build-deps AS pg_repack-src
ARG PG_VERSION
WORKDIR /ext-src
# Download the ver_1.5.2 tarball, verify its sha256, and unpack it into pg_repack-src.
RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \
    echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \
    mkdir pg_repack-src && cd pg_repack-src && tar xzf ../pg_repack.tar.gz --strip-components=1 -C .
# Compile and install pg_repack from the sources unpacked by pg_repack-src.
FROM rust-extensions-build AS pg_repack-build
COPY --from=pg_repack-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_repack-src
RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install

#########################################################################################
#
# Layer "pgaudit"
# compile pgaudit extension
#
#########################################################################################

FROM build-deps AS pgaudit-src
ARG PG_VERSION
WORKDIR /ext-src
# The patch file is selected per Postgres version via the PG_VERSION build arg.
COPY "compute/patches/pgaudit-parallel_workers-${PG_VERSION}.patch" .
# Each supported Postgres version maps to its own pgaudit release tag and tarball checksum;
# any other version aborts the build with exit 1.
# Because of the line continuations, each arm is a single `export A=... export B=...`
# command, i.e. one `export` invocation that sets both variables.
RUN case "${PG_VERSION}" in \
    "v14") \
        export PGAUDIT_VERSION=1.6.3 \
        export PGAUDIT_CHECKSUM=37a8f5a7cc8d9188e536d15cf0fdc457fcdab2547caedb54442c37f124110919 \
    ;; \
    "v15") \
        export PGAUDIT_VERSION=1.7.1 \
        export PGAUDIT_CHECKSUM=e9c8e6e092d82b2f901d72555ce0fe7780552f35f8985573796cd7e64b09d4ec \
    ;; \
    "v16") \
        export PGAUDIT_VERSION=16.1 \
        export PGAUDIT_CHECKSUM=3bae908ab70ba0c6f51224009dbcfff1a97bd6104c6273297a64292e1b921fee \
    ;; \
    "v17") \
        export PGAUDIT_VERSION=17.1 \
        export PGAUDIT_CHECKSUM=9c5f37504d393486cc75d2ced83f75f5899be64fa85f689d6babb833b4361e6c \
    ;; \
    *) \
        echo "pgaudit is not supported on this PostgreSQL version" && exit 1;; \
    esac && \
    wget https://github.com/pgaudit/pgaudit/archive/refs/tags/${PGAUDIT_VERSION}.tar.gz -O pgaudit.tar.gz && \
    echo "${PGAUDIT_CHECKSUM} pgaudit.tar.gz" | sha256sum --check && \
    mkdir pgaudit-src && cd pgaudit-src && tar xzf ../pgaudit.tar.gz --strip-components=1 -C . && \
    patch -p1 < "/ext-src/pgaudit-parallel_workers-${PG_VERSION}.patch"

# Built through PGXS (USE_PGXS=1), i.e. against the already-installed Postgres.
FROM pg-build AS pgaudit-build
COPY --from=pgaudit-src /ext-src/ /ext-src/
WORKDIR /ext-src/pgaudit-src
RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN)

#########################################################################################
#
# Layer "pgauditlogtofile"
# compile pgauditlogtofile extension
#
#########################################################################################

FROM build-deps AS pgauditlogtofile-src
ARG PG_VERSION
WORKDIR /ext-src
# A single release (v1.6.4) is used for v14 through v17; other versions abort with exit 1.
RUN case "${PG_VERSION}" in \
    "v14" | "v15" | "v16" | "v17") \
        export PGAUDITLOGTOFILE_VERSION=v1.6.4 \
        export PGAUDITLOGTOFILE_CHECKSUM=ef801eb09c26aaa935c0dabd92c81eb9ebe338930daa9674d420a280c6bc2d70 \
    ;; \
    *) \
        echo "pgauditlogtofile is not supported on this PostgreSQL version" && exit 1;; \
    esac && \
    wget https://github.com/fmbiete/pgauditlogtofile/archive/refs/tags/${PGAUDITLOGTOFILE_VERSION}.tar.gz -O pgauditlogtofile.tar.gz && \
    echo "${PGAUDITLOGTOFILE_CHECKSUM} pgauditlogtofile.tar.gz" | sha256sum --check && \
    mkdir pgauditlogtofile-src && cd pgauditlogtofile-src && tar xzf ../pgauditlogtofile.tar.gz --strip-components=1 -C .

FROM pg-build AS pgauditlogtofile-build
COPY --from=pgauditlogtofile-src /ext-src/ /ext-src/
WORKDIR /ext-src/pgauditlogtofile-src
RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN)

#########################################################################################
#
# Layer "neon-ext-build"
# compile neon extensions
#
#########################################################################################
FROM pg-build-with-cargo AS neon-ext-build
ARG PG_VERSION

USER root

# The whole build context is copied in; the build command follows as the next instruction.
COPY . .
# Run the `install-compute` target of the pgxn directory (quiet mode, -s) as a release
# build; cargo is invoked with --locked and its artifacts go to ./target/release.
RUN make -j $(getconf _NPROCESSORS_ONLN) -C pgxn -s install-compute \
    BUILD_TYPE=release CARGO_BUILD_FLAGS="--locked --release" NEON_CARGO_ARTIFACT_TARGET_DIR="$(pwd)/target/release"

#########################################################################################
#
# Layer "extensions-none"
#
#########################################################################################
FROM build-deps AS extensions-none

# Only an empty /usr/local/pgsql is created, so stages that copy that directory from
# `extensions-${EXTENSIONS}` still find it when EXTENSIONS=none.
RUN mkdir /usr/local/pgsql

#########################################################################################
#
# Layer "extensions-minimal"
#
# This subset of extensions includes the extensions that we have in
# shared_preload_libraries by default.
#
#########################################################################################
FROM build-deps AS extensions-minimal

COPY --from=pgrag-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=timescaledb-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_cron-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/

#########################################################################################
#
# Layer "extensions-all"
# Bundle together all the extensions
#
#########################################################################################
FROM build-deps AS extensions-all

# Public extensions
# Every build stage installs into /usr/local/pgsql, so the per-extension trees are merged
# by copying each one over the same destination. postgis and h3-pg additionally ship
# files outside that prefix (/sfcgal/* and /h3/usr), which are copied to /.
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=postgis-build /sfcgal/* /
COPY --from=pgrouting-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /h3/usr /
COPY --from=postgresql-unit-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgvector-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgjwt-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgrag-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_jsonschema-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_graphql-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_tiktoken-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=hypopg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=online_advisor-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_hashids-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=rum-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgtap-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=ip4r-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=prefix-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=hll-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=plpgsql_check-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=timescaledb-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_hint_plan-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_cron-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgx_ulid-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgx_ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_session_jwt-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=rdkit-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/
#########################################################################################
#
# Layer "neon-pg-ext-build"
# Includes Postgres and all the extensions chosen by EXTENSIONS arg.
#
#########################################################################################
# The EXTENSIONS build arg selects one of the extensions-* stages by name.
FROM extensions-${EXTENSIONS} AS neon-pg-ext-build

#########################################################################################
#
# Compile the Neon-specific `compute_ctl`, `fast_import`, and `local_proxy` binaries
#
#########################################################################################
FROM build-deps-with-cargo AS compute-tools
ARG BUILD_TAG
ENV BUILD_TAG=$BUILD_TAG

USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
# The cargo registry, git checkouts and target dir are BuildKit cache mounts, so they are
# not part of the image layer. That is why the three binaries are copied out of `target`
# into `target-bin` within the same RUN.
RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \
    --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/git \
    --mount=type=cache,uid=1000,target=/home/nonroot/target \
    cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy && \
    mkdir target-bin && \
    cp target/release-line-debug-size-lto/compute_ctl \
       target/release-line-debug-size-lto/fast_import \
       target/release-line-debug-size-lto/local_proxy \
       target-bin

#########################################################################################
#
# Layer "pgbouncer"
#
#########################################################################################

FROM $BASE_IMAGE_SHA AS pgbouncer
# Install the toolchain needed to build pgbouncer from a git checkout.
RUN set -e \
    && echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \
    && apt update \
    && apt install --no-install-suggests --no-install-recommends -y \
        build-essential \
        git \
        ca-certificates \
        autoconf \
        automake \
        libevent-dev \
        libtool \
        pkg-config \
        libcurl4-openssl-dev \
        libssl-dev \
    && apt clean && rm -rf /var/lib/apt/lists/*

# Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
ENV PGBOUNCER_TAG=pgbouncer_1_24_1
RUN set -e \
    && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
    && cd pgbouncer \
    && ./autogen.sh \
    && ./configure --prefix=/usr/local/pgbouncer \
    && make -j $(nproc) dist_man_MANS= \
    && make install dist_man_MANS=

#########################################################################################
#
# Layer "exporters"
#
#########################################################################################

FROM build-deps AS exporters
ARG TARGETARCH
# Keep sql_exporter version same as in build-tools/Dockerfile and
# test_runner/regress/test_compute_metrics.py
# See comment on the top of the file regarding `echo`, `-e` and `\n`
#
# Expected checksums are chosen per architecture (amd64 vs. everything else), the three
# release tarballs are streamed straight into tar, and the extracted binaries are then
# verified against those checksums.
RUN if [ "$TARGETARCH" = "amd64" ]; then\
        postgres_exporter_sha256='59aa4a7bb0f7d361f5e05732f5ed8c03cc08f78449cef5856eadec33a627694b';\
        pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\
        sql_exporter_sha256='9a41127a493e8bfebfe692bf78c7ed2872a58a3f961ee534d1b0da9ae584aaab';\
    else\
        postgres_exporter_sha256='d1dedea97f56c6d965837bfd1fbb3e35a3b4a4556f8cccee8bd513d8ee086124';\
        pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\
        sql_exporter_sha256='530e6afc77c043497ed965532c4c9dfa873bc2a4f0b3047fad367715c0081d6a';\
    fi\
    && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-${TARGETARCH}.tar.gz\
    | tar xzf - --strip-components=1 -C.\
    && curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\
    | tar xzf - --strip-components=1 -C.\
    && curl -sL https://github.com/burningalchemist/sql_exporter/releases/download/0.17.3/sql_exporter-0.17.3.linux-${TARGETARCH}.tar.gz\
    | tar xzf - --strip-components=1 -C.\
    && echo "${postgres_exporter_sha256} postgres_exporter" | sha256sum -c -\
    && echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\
    && echo "${sql_exporter_sha256} sql_exporter" | sha256sum -c -

#########################################################################################
#
# Clean up postgres folder before inclusion
#
#########################################################################################
FROM neon-ext-build AS postgres-cleanup-layer
COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql

# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
RUN cd /usr/local/pgsql/bin && rm -f ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp

# Remove headers that we won't need anymore - we've completed installation of all extensions
RUN rm -r /usr/local/pgsql/include

# Remove static postgresql libraries - all compilation is finished, so we
# can now remove these files - they must be included in other binaries by now
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a

#########################################################################################
#
# Preprocess the sql_exporter configuration files
#
#########################################################################################
FROM build-deps AS sql_exporter_preprocessor
ARG PG_VERSION

USER nonroot
WORKDIR /home/nonroot

COPY --chown=nonroot compute compute

# `${PG_VERSION:?}` makes the step fail if the build arg is unset or empty.
RUN make PG_VERSION="${PG_VERSION:?}" -C compute

#########################################################################################
#
# Layer extension-tests
#
#########################################################################################

FROM pg-build AS extension-tests
ARG PG_VERSION
# This is required for the PostGIS test
RUN apt-get update && case $DEBIAN_VERSION in \
    bullseye) \
        apt-get install -y libproj19 libgdal28 time; \
    ;; \
    bookworm) \
        apt-get install -y libgdal32 libproj25 time; \
    ;; \
    *) \
        echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \
    ;; \
    esac

COPY docker-compose/ext-src/ /ext-src/

# Sources of the extensions under test are collected into /ext-src; commented-out COPY
# lines are extensions whose sources are not brought into this image.
COPY --from=pg-build /postgres /postgres
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=postgis-build /ext-src/postgis-src /ext-src/postgis-src
COPY --from=postgis-build /sfcgal/* /usr
COPY --from=plv8-src /ext-src/ /ext-src/
COPY --from=h3-pg-src /ext-src/h3-pg-src /ext-src/h3-pg-src
COPY --from=postgresql-unit-src /ext-src/ /ext-src/
COPY --from=pgvector-src /ext-src/ /ext-src/
COPY --from=pgjwt-src /ext-src/ /ext-src/
#COPY --from=pgrag-src /ext-src/ /ext-src/
#COPY --from=pg_jsonschema-src /ext-src/ /ext-src/
COPY --from=pg_graphql-src /ext-src/ /ext-src/
#COPY --from=pg_tiktoken-src /ext-src/ /ext-src/
COPY --from=hypopg-src /ext-src/ /ext-src/
COPY --from=online_advisor-src /ext-src/ /ext-src/
COPY --from=pg_hashids-src /ext-src/ /ext-src/
COPY --from=rum-src /ext-src/ /ext-src/
COPY --from=pgtap-src /ext-src/ /ext-src/
COPY --from=ip4r-src /ext-src/ /ext-src/
COPY --from=prefix-src /ext-src/ /ext-src/
COPY --from=hll-src /ext-src/ /ext-src/
COPY --from=plpgsql_check-src /ext-src/ /ext-src/
#COPY --from=timescaledb-src /ext-src/ /ext-src/
COPY --from=pg_hint_plan-src /ext-src/ /ext-src/
COPY compute/patches/pg_hint_plan_${PG_VERSION:?}.patch /ext-src
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION:?}.patch
COPY --from=pg_cron-src /ext-src/ /ext-src/
#COPY --from=pgx_ulid-src /ext-src/ /ext-src/
#COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/
#COPY --from=pg_session_jwt-src /ext-src/ /ext-src/
#COPY --from=rdkit-src /ext-src/ /ext-src/
COPY --from=pg_uuidv7-src /ext-src/ /ext-src/
COPY --from=pg_roaringbitmap-src /ext-src/ /ext-src/
COPY --from=pg_semver-src /ext-src/ /ext-src/
#COPY --from=wal2json-src /ext-src/ /ext-src/
COPY --from=pg_ivm-src /ext-src/ /ext-src/
COPY --from=pg_partman-src /ext-src/ /ext-src/
#COPY --from=pg_mooncake-src /ext-src/ /ext-src/
COPY --from=pg_repack-src /ext-src/ /ext-src/
COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/
COPY compute/patches/pg_repack.patch /ext-src
# NOTE(review): in the text this was restored from, everything between the `<` of the
# patch redirect and the `>` of the ld.so.conf redirect had been lost, which left `patch`
# pointed at /etc/ld.so.conf.d/00-neon.conf. The patch input below is reconstructed from
# the COPY above (same pattern as the pg_hint_plan step). The library path written to
# 00-neon.conf is presumed to be this stage's install prefix -- confirm against the
# upstream Dockerfile, and restore any instruction that originally sat between these two
# RUN steps.
RUN cd /ext-src/pg_repack-src && patch -p1 < /ext-src/pg_repack.patch
RUN echo /usr/local/pgsql/lib > /etc/ld.so.conf.d/00-neon.conf && /sbin/ldconfig
# Tarballs and patch files are no longer needed once sources are unpacked and patched.
RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq parallel \
    && apt clean && rm -rf /ext-src/*.tar.gz /ext-src/*.patch /var/lib/apt/lists/*
# Default connection settings for the client tools run from this image.
ENV PATH=/usr/local/pgsql/bin:$PATH
ENV PGHOST=compute1
ENV PGPORT=55433
ENV PGUSER=cloud_admin
ENV PGDATABASE=postgres
ENV PG_VERSION=${PG_VERSION:?}

#########################################################################################
#
# Final layer
# Put it all together into the final image
#
#########################################################################################
FROM $BASE_IMAGE_SHA
ARG DEBIAN_VERSION

# Use strict mode for bash to catch errors early
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

# Install:
# libreadline8 for psql
# liblz4-1 for lz4
# libossp-uuid16 for extension ossp-uuid
# libgeos, libsfcgal1, and libprotobuf-c1 for PostGIS
# libxml2, libxslt1.1 for xml2
# libzstd1 for zstd
# libboost* for rdkit
# ca-certificates for communicating with s3 by compute_ctl
# libevent for pgbouncer
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
    echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc
RUN apt update && \
    case $DEBIAN_VERSION in \
      # Version-specific installs for Bullseye (PG14-PG16):
      # libicu67, locales for collations (including ICU and plpgsql_check)
      # libgdal28, libproj19 for PostGIS
      bullseye) \
        VERSION_INSTALLS="libicu67 libgdal28 libproj19"; \
      ;; \
      # Version-specific installs for Bookworm (PG17):
      # libicu72, locales for collations (including ICU and plpgsql_check)
      # libgdal32, libproj25 for PostGIS
      bookworm) \
        VERSION_INSTALLS="libicu72 libgdal32 libproj25"; \
      ;; \
      *) \
        echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \
      ;; \
    esac && \
    apt install --no-install-recommends -y \
        ca-certificates \
        gdb \
        iproute2 \
        libboost-iostreams1.74.0 \
        libboost-regex1.74.0 \
        libboost-serialization1.74.0 \
        libboost-system1.74.0 \
        libcurl4 \
        libevent-2.1-7 \
        libgeos-c1v5 \
        liblz4-1 \
        libossp-uuid16 \
        libprotobuf-c1 \
        libreadline8 \
        libsfcgal1 \
        libxml2 \
        libxslt1.1 \
        libzstd1 \
        locales \
        lsof \
        procps \
        rsyslog-gnutls \
        screen \
        tcpdump \
        $VERSION_INSTALLS && \
    apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

# Add user postgres
# NOTE(review): the password for `postgres` is a literal baked into the image; confirm
# that this is intentional for every environment the image is shipped to.
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
    echo "postgres:test_console_pass" | chpasswd && \
    mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
    mkdir /var/db/postgres/pgbouncer && \
    chown -R postgres:postgres /var/db/postgres && \
    chmod 0750 /var/db/postgres/compute && \
    chmod 0750 /var/db/postgres/pgbouncer && \
    # create folder for file cache
    mkdir -p -m 777 /neon/cache && \
    # Create remote extension download directory
    mkdir /usr/local/download_extensions && \
    chown -R postgres:postgres /usr/local/download_extensions

# pgbouncer and its config
COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini

# The cleaned-up /usr/local/pgsql tree is flattened into /usr/local (binaries end up in
# /usr/local/bin, libraries in /usr/local/lib).
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/compute_ctl /usr/local/bin/compute_ctl
COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/fast_import /usr/local/bin/fast_import

# local_proxy and its config
COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/local_proxy /usr/local/bin/local_proxy
RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy

# Metrics exporter binaries and configuration files
COPY --from=exporters ./postgres_exporter /bin/postgres_exporter
COPY --from=exporters ./pgbouncer_exporter /bin/pgbouncer_exporter
COPY --from=exporters ./sql_exporter /bin/sql_exporter

COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml

COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml

# Make the libraries we built available
COPY --chmod=0666 compute/etc/ld.so.conf.d/00-neon.conf /etc/ld.so.conf.d/00-neon.conf
RUN /sbin/ldconfig

# rsyslog config permissions
# directory for rsyslogd pid file
RUN mkdir /var/run/rsyslogd && \
    chown -R postgres:postgres /var/run/rsyslogd && \
    chown -R postgres:postgres /etc/rsyslog.d/

ENV LANG=en_US.utf8
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]


================================================
FILE: compute/etc/README.md
================================================
# Compute Configuration

These files are the configuration files for various other pieces of software
that will be running in the compute alongside Postgres.

## `sql_exporter`

### Adding a `sql_exporter` Metric

We use `sql_exporter` to export various metrics from Postgres. In order to add
a metric, you will need to create two files: a `libsonnet` and a `sql` file. You
will then import the `libsonnet` file in one of the collector files, and the
`sql` file will be imported in the `libsonnet` file.

In the event your statistic is an LSN, you may want to cast it to a `float8`
because Prometheus only supports floats. It's probably fine because `float8` can
store integers from `-2^53` to `+2^53` exactly.
================================================
FILE: compute/etc/ld.so.conf.d/00-neon.conf
================================================
/usr/local/lib



================================================
FILE: compute/etc/neon_collector.jsonnet
================================================
// Collector definition named 'neon_collector'.
// `metrics` is assembled from one imported .libsonnet file per metric; `queries` declares
// the named query 'neon_perf_counters', whose SQL text is loaded from a .sql file.
{
  collector_name: 'neon_collector',
  metrics: [
    import 'sql_exporter/checkpoints_req.libsonnet',
    import 'sql_exporter/checkpoints_timed.libsonnet',
    import 'sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet',
    import 'sql_exporter/compute_current_lsn.libsonnet',
    import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
    import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet',
    import 'sql_exporter/compute_max_connections.libsonnet',
    import 'sql_exporter/compute_pg_oldest_frozen_xid_age.libsonnet',
    import 'sql_exporter/compute_pg_oldest_mxid_age.libsonnet',
    import 'sql_exporter/compute_receive_lsn.libsonnet',
    import 'sql_exporter/compute_subscriptions_count.libsonnet',
    import 'sql_exporter/connection_counts.libsonnet',
    import 'sql_exporter/db_total_size.libsonnet',
    import 'sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet',
    import 'sql_exporter/file_cache_read_wait_seconds_count.libsonnet',
    import 'sql_exporter/file_cache_read_wait_seconds_sum.libsonnet',
    import 'sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet',
    import 'sql_exporter/file_cache_write_wait_seconds_count.libsonnet',
    import 'sql_exporter/file_cache_write_wait_seconds_sum.libsonnet',
    import 'sql_exporter/getpage_prefetch_discards_total.libsonnet',
    import 'sql_exporter/getpage_prefetch_misses_total.libsonnet',
    import 'sql_exporter/getpage_prefetch_requests_total.libsonnet',
    import 'sql_exporter/getpage_prefetches_buffered.libsonnet',
    import 'sql_exporter/getpage_sync_requests_total.libsonnet',
    import 'sql_exporter/compute_getpage_stuck_requests_total.libsonnet',
    import 'sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet',
    import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet',
    import 'sql_exporter/getpage_wait_seconds_count.libsonnet',
    import 'sql_exporter/getpage_wait_seconds_sum.libsonnet',
    import 'sql_exporter/lfc_approximate_working_set_size.libsonnet',
    import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet',
    import 'sql_exporter/lfc_cache_size_limit.libsonnet',
    import 'sql_exporter/lfc_chunk_size.libsonnet',
    import 'sql_exporter/lfc_hits.libsonnet',
    import 'sql_exporter/lfc_misses.libsonnet',
    import 'sql_exporter/lfc_used.libsonnet',
    import 'sql_exporter/lfc_used_pages.libsonnet',
    import 'sql_exporter/lfc_writes.libsonnet',
    import 'sql_exporter/logical_slot_restart_lsn.libsonnet',
    import 'sql_exporter/max_cluster_size.libsonnet',
    import 'sql_exporter/pageserver_disconnects_total.libsonnet',
    import 'sql_exporter/pageserver_requests_sent_total.libsonnet',
    import 'sql_exporter/pageserver_send_flushes_total.libsonnet',
    import 'sql_exporter/pageserver_open_requests.libsonnet',
    import 'sql_exporter/pg_stats_userdb.libsonnet',
    import 'sql_exporter/replication_delay_bytes.libsonnet',
    import 'sql_exporter/replication_delay_seconds.libsonnet',
    import 'sql_exporter/retained_wal.libsonnet',
    import 'sql_exporter/wal_is_lost.libsonnet',
  ],
  queries: [
    {
      query_name: 'neon_perf_counters',
      query: importstr 'sql_exporter/neon_perf_counters.sql',
    },
  ],
}



================================================
FILE: compute/etc/neon_collector_autoscaling.jsonnet
================================================
// Collector definition named 'neon_collector_autoscaling'.
// It carries only LFC metrics: an autoscaling-specific working-set-size definition plus
// five definitions that the main collector also imports.
{
  collector_name: 'neon_collector_autoscaling',
  metrics: [
    import 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet',
    import 'sql_exporter/lfc_cache_size_limit.libsonnet',
    import 'sql_exporter/lfc_hits.libsonnet',
    import 'sql_exporter/lfc_misses.libsonnet',
    import 'sql_exporter/lfc_used.libsonnet',
    import 'sql_exporter/lfc_writes.libsonnet',
  ],
}



================================================
FILE: compute/etc/pgbouncer.ini
================================================ [databases] ;; pgbouncer propagates application_name (if it's specified) to the server, but some ;; clients don't set it. We set default application_name=pgbouncer to make it ;; easier to identify pgbouncer connections in Postgres. If client sets ;; application_name, it will be used instead. *=host=localhost port=5432 auth_user=cloud_admin application_name=pgbouncer [pgbouncer] listen_port=6432 listen_addr=0.0.0.0 auth_type=scram-sha-256 auth_user=cloud_admin auth_dbname=postgres client_tls_sslmode=disable server_tls_sslmode=disable pool_mode=transaction max_client_conn=10000 default_pool_size=64 max_prepared_statements=0 admin_users=postgres unix_socket_dir=/tmp/ unix_socket_mode=0777 ; required for pgbouncer_exporter ignore_startup_parameters=extra_float_digits ; pidfile for graceful termination pidfile=/tmp/pgbouncer.pid ;; Disable connection logging. It produces a lot of logs that no one looks at, ;; and we can get similar log entries from the proxy too. We had incidents in ;; the past where the logging significantly stressed the log device or pgbouncer ;; itself. 
log_connections=0
log_disconnections=0

================================================
FILE: compute/etc/postgres_exporter.yml
================================================

================================================
FILE: compute/etc/sql_exporter/checkpoints_req.17.sql
================================================
SELECT num_requested AS checkpoints_req FROM pg_catalog.pg_stat_checkpointer;

================================================
FILE: compute/etc/sql_exporter/checkpoints_req.libsonnet
================================================
local neon = import 'neon.libsonnet';

// Two variants of the same query: one reading pg_stat_bgwriter, one reading
// pg_stat_checkpointer.
local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_req.sql';
local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_req.17.sql';

{
  metric_name: 'checkpoints_req',
  type: 'gauge',
  help: 'Number of requested checkpoints',
  key_labels: null,
  values: [
    'checkpoints_req',
  ],
  // Selected by neon.PG_MAJORVERSION_NUM: below 17 the pg_stat_bgwriter
  // variant is used, otherwise the pg_stat_checkpointer variant.
  query: if neon.PG_MAJORVERSION_NUM < 17 then pg_stat_bgwriter else pg_stat_checkpointer,
}

================================================
FILE: compute/etc/sql_exporter/checkpoints_req.sql
================================================
SELECT checkpoints_req FROM pg_catalog.pg_stat_bgwriter;

================================================
FILE: compute/etc/sql_exporter/checkpoints_timed.17.sql
================================================
SELECT num_timed AS checkpoints_timed FROM pg_catalog.pg_stat_checkpointer;

================================================
FILE: compute/etc/sql_exporter/checkpoints_timed.libsonnet
================================================
local neon = import 'neon.libsonnet';

// Two variants of the same query: one reading pg_stat_bgwriter, one reading
// pg_stat_checkpointer.
local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_timed.sql';
local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_timed.17.sql';

{
  metric_name: 'checkpoints_timed',
  type: 'gauge',
  help: 'Number of scheduled checkpoints',
  key_labels: null,
  values: [
    'checkpoints_timed',
  ],
  // Selected by neon.PG_MAJORVERSION_NUM: below 17 the pg_stat_bgwriter
  // variant is used, otherwise the pg_stat_checkpointer variant.
  query: if neon.PG_MAJORVERSION_NUM < 17 then pg_stat_bgwriter else pg_stat_checkpointer,
}
================================================ FILE: compute/etc/sql_exporter/checkpoints_timed.sql ================================================ SELECT checkpoints_timed FROM pg_catalog.pg_stat_bgwriter; ================================================ FILE: compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet ================================================ { metric_name: 'compute_backpressure_throttling_seconds_total', type: 'counter', help: 'Time compute has spent throttled', key_labels: null, values: [ 'throttled', ], query: importstr 'sql_exporter/compute_backpressure_throttling_seconds_total.sql', } ================================================ FILE: compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql ================================================ SELECT (neon.backpressure_throttling_time()::pg_catalog.float8 / 1000000) AS throttled; ================================================ FILE: compute/etc/sql_exporter/compute_current_lsn.libsonnet ================================================ { metric_name: 'compute_current_lsn', type: 'gauge', help: 'Current LSN of the database', key_labels: null, values: [ 'lsn', ], query: importstr 'sql_exporter/compute_current_lsn.sql', } ================================================ FILE: compute/etc/sql_exporter/compute_current_lsn.sql ================================================ SELECT CASE WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_replay_lsn() - '0/0')::pg_catalog.FLOAT8 ELSE (pg_catalog.pg_current_wal_lsn() - '0/0')::pg_catalog.FLOAT8 END AS lsn; ================================================ FILE: compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet ================================================ { metric_name: 'compute_getpage_max_inflight_stuck_time_ms', type: 'gauge', help: 'Max wait time for stuck requests among all backends. 
Includes only active stuck requests, terminated or disconnected ones are not accounted for', values: [ 'compute_getpage_max_inflight_stuck_time_ms', ], query_ref: 'neon_perf_counters', }

================================================
FILE: compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet
================================================
{
  metric_name: 'compute_getpage_stuck_requests_total',
  type: 'counter',
  help: 'Total number of Getpage requests left without an answer for more than pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout',
  values: [
    'compute_getpage_stuck_requests_total',
  ],
  query_ref: 'neon_perf_counters',
}

================================================
FILE: compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet
================================================
{
  metric_name: 'compute_logical_snapshot_files',
  type: 'gauge',
  // The directory name matches the one the query lists: pg_logical/snapshots.
  help: 'Number of snapshot files in pg_logical/snapshots',
  key_labels: [
    'timeline_id',
  ],
  values: [
    'num_logical_snapshot_files',
  ],
  query: importstr 'sql_exporter/compute_logical_snapshot_files.sql',
}

================================================
FILE: compute/etc/sql_exporter/compute_logical_snapshot_files.sql
================================================
SELECT
  (SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
  -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
  -- These temporary snapshot files are renamed to the actual snapshot files
  -- after they are completely built.
We only WAL-log the completely built -- snapshot files (SELECT COUNT(*) FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; ================================================ FILE: compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql ================================================ SELECT (SELECT pg_catalog.current_setting('neon.timeline_id')) AS timeline_id, -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. -- These temporary snapshot files are renamed to the actual snapshot files -- after they are completely built. We only WAL-log the completely built -- snapshot files (SELECT COALESCE(pg_catalog.sum(size), 0) FROM pg_catalog.pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes; ================================================ FILE: compute/etc/sql_exporter/compute_logical_snapshots_bytes.libsonnet ================================================ local neon = import 'neon.libsonnet'; local pg_ls_logicalsnapdir = importstr 'sql_exporter/compute_logical_snapshots_bytes.15.sql'; local pg_ls_dir = importstr 'sql_exporter/compute_logical_snapshots_bytes.sql'; { metric_name: 'compute_logical_snapshots_bytes', type: 'gauge', help: 'Size of the pg_logical/snapshots directory, not including temporary files', key_labels: [ 'timeline_id', ], values: [ 'logical_snapshots_bytes', ], query: if neon.PG_MAJORVERSION_NUM < 15 then pg_ls_dir else pg_ls_logicalsnapdir, } ================================================ FILE: compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql ================================================ SELECT (SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. -- These temporary snapshot files are renamed to the actual snapshot files -- after they are completely built. 
We only WAL-log the completely built -- snapshot files (SELECT COALESCE(pg_catalog.sum((pg_catalog.pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0) FROM (SELECT * FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name ) AS logical_snapshots_bytes; ================================================ FILE: compute/etc/sql_exporter/compute_max_connections.libsonnet ================================================ { metric_name: 'compute_max_connections', type: 'gauge', help: 'Max connections allowed for Postgres', key_labels: null, values: [ 'max_connections', ], query: importstr 'sql_exporter/compute_max_connections.sql', } ================================================ FILE: compute/etc/sql_exporter/compute_max_connections.sql ================================================ SELECT pg_catalog.current_setting('max_connections') AS max_connections; ================================================ FILE: compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.libsonnet ================================================ { metric_name: 'compute_pg_oldest_frozen_xid_age', type: 'gauge', help: 'Age of oldest XIDs that have not been frozen by VACUUM. 
An indicator of how long it has been since VACUUM last ran.', key_labels: [ 'database_name', ], value_label: 'metric', values: [ 'frozen_xid_age', ], query: importstr 'sql_exporter/compute_pg_oldest_frozen_xid_age.sql', } ================================================ FILE: compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql ================================================ SELECT datname database_name, pg_catalog.age(datfrozenxid) frozen_xid_age FROM pg_catalog.pg_database ORDER BY frozen_xid_age DESC LIMIT 10; ================================================ FILE: compute/etc/sql_exporter/compute_pg_oldest_mxid_age.libsonnet ================================================ { metric_name: 'compute_pg_oldest_mxid_age', type: 'gauge', help: 'Age of oldest MXIDs that have not been replaced by VACUUM. An indicator of how long it has been since VACUUM last ran.', key_labels: [ 'database_name', ], value_label: 'metric', values: [ 'min_mxid_age', ], query: importstr 'sql_exporter/compute_pg_oldest_mxid_age.sql', } ================================================ FILE: compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql ================================================ SELECT datname database_name, pg_catalog.mxid_age(datminmxid) min_mxid_age FROM pg_catalog.pg_database ORDER BY min_mxid_age DESC LIMIT 10; ================================================ FILE: compute/etc/sql_exporter/compute_receive_lsn.libsonnet ================================================ { metric_name: 'compute_receive_lsn', type: 'gauge', help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication', key_labels: null, values: [ 'lsn', ], query: importstr 'sql_exporter/compute_receive_lsn.sql', } ================================================ FILE: compute/etc/sql_exporter/compute_receive_lsn.sql ================================================ SELECT CASE WHEN pg_catalog.pg_is_in_recovery() THEN 
(pg_catalog.pg_last_wal_receive_lsn() - '0/0')::pg_catalog.FLOAT8 ELSE 0 END AS lsn; ================================================ FILE: compute/etc/sql_exporter/compute_subscriptions_count.libsonnet ================================================ { metric_name: 'compute_subscriptions_count', type: 'gauge', help: 'Number of logical replication subscriptions grouped by enabled/disabled', key_labels: [ 'enabled', ], values: [ 'subscriptions_count', ], query: importstr 'sql_exporter/compute_subscriptions_count.sql', } ================================================ FILE: compute/etc/sql_exporter/compute_subscriptions_count.sql ================================================ SELECT subenabled::pg_catalog.text AS enabled, pg_catalog.count(*) AS subscriptions_count FROM pg_catalog.pg_subscription GROUP BY subenabled; ================================================ FILE: compute/etc/sql_exporter/connection_counts.libsonnet ================================================ { metric_name: 'connection_counts', type: 'gauge', help: 'Connection counts', key_labels: [ 'datname', 'state', ], values: [ 'count', ], query: importstr 'sql_exporter/connection_counts.sql', } ================================================ FILE: compute/etc/sql_exporter/connection_counts.sql ================================================ SELECT datname, state, pg_catalog.count(*) AS count FROM pg_catalog.pg_stat_activity WHERE state <> '' GROUP BY datname, state; ================================================ FILE: compute/etc/sql_exporter/db_total_size.libsonnet ================================================ { metric_name: 'db_total_size', type: 'gauge', help: 'Size of all databases', key_labels: null, values: [ 'total', ], query: importstr 'sql_exporter/db_total_size.sql', } ================================================ FILE: compute/etc/sql_exporter/db_total_size.sql ================================================ SELECT pg_catalog.sum(pg_catalog.pg_database_size(datname)) AS total 
FROM pg_catalog.pg_database -- Ignore invalid databases, as we will likely have problems with -- getting their size from the Pageserver. WHERE datconnlimit != -2; ================================================ FILE: compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet ================================================ { metric_name: 'file_cache_read_wait_seconds_bucket', type: 'counter', help: 'Histogram buckets of LFC read operation latencies', key_labels: [ 'bucket_le', ], values: [ 'value', ], query: importstr 'sql_exporter/file_cache_read_wait_seconds_bucket.sql', } ================================================ FILE: compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql ================================================ SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'file_cache_read_wait_seconds_bucket'; ================================================ FILE: compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet ================================================ { metric_name: 'file_cache_read_wait_seconds_count', type: 'counter', help: 'Number of read operations in LFC', values: [ 'file_cache_read_wait_seconds_count', ], query_ref: 'neon_perf_counters', } ================================================ FILE: compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet ================================================ { metric_name: 'file_cache_read_wait_seconds_sum', type: 'counter', help: 'Time spent in LFC read operations', values: [ 'file_cache_read_wait_seconds_sum', ], query_ref: 'neon_perf_counters', } ================================================ FILE: compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet ================================================ { metric_name: 'file_cache_write_wait_seconds_bucket', type: 'counter', help: 'Histogram buckets of LFC write operation latencies', key_labels: [ 'bucket_le', ], values: [ 'value', ], query: importstr 
'sql_exporter/file_cache_write_wait_seconds_bucket.sql', } ================================================ FILE: compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql ================================================ SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'file_cache_write_wait_seconds_bucket'; ================================================ FILE: compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet ================================================ { metric_name: 'file_cache_write_wait_seconds_count', type: 'counter', help: 'Number of write operations in LFC', values: [ 'file_cache_write_wait_seconds_count', ], query_ref: 'neon_perf_counters', } ================================================ FILE: compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet ================================================ { metric_name: 'file_cache_write_wait_seconds_sum', type: 'counter', help: 'Time spent in LFC write operations', values: [ 'file_cache_write_wait_seconds_sum', ], query_ref: 'neon_perf_counters', } ================================================ FILE: compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet ================================================ { metric_name: 'getpage_prefetch_discards_total', type: 'counter', help: 'Number of prefetch responses issued but not used', values: [ 'getpage_prefetch_discards_total', ], query_ref: 'neon_perf_counters', } ================================================ FILE: compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet ================================================ { metric_name: 'getpage_prefetch_misses_total', type: 'counter', help: "Total number of readahead misses; consisting of either prefetches that don't satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read", values: [ 'getpage_prefetch_misses_total', ], query_ref: 'neon_perf_counters', } 
================================================ FILE: compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet ================================================ { metric_name: 'getpage_prefetch_requests_total', type: 'counter', help: 'Number of getpage issued for prefetching', values: [ 'getpage_prefetch_requests_total', ], query_ref: 'neon_perf_counters', } ================================================ FILE: compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet ================================================ { metric_name: 'getpage_prefetches_buffered', type: 'gauge', help: 'Number of prefetched pages buffered in neon', values: [ 'getpage_prefetches_buffered', ], query_ref: 'neon_perf_counters', } ================================================ FILE: compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet ================================================ { metric_name: 'getpage_sync_requests_total', type: 'counter', help: 'Number of synchronous getpage issued', values: [ 'getpage_sync_requests_total', ], query_ref: 'neon_perf_counters', } ================================================ FILE: compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet ================================================ { metric_name: 'getpage_wait_seconds_bucket', type: 'counter', help: 'Histogram buckets of getpage request latency', key_labels: [ 'bucket_le', ], values: [ 'value', ], query: importstr 'sql_exporter/getpage_wait_seconds_bucket.sql', } ================================================ FILE: compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql ================================================ SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket'; ================================================ FILE: compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet ================================================ { metric_name: 'getpage_wait_seconds_count', type: 'counter', help: 'Number of 
getpage requests', values: [ 'getpage_wait_seconds_count', ], query_ref: 'neon_perf_counters', }

================================================
FILE: compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet
================================================
{
  metric_name: 'getpage_wait_seconds_sum',
  type: 'counter',
  help: 'Time spent in getpage requests',
  values: [
    'getpage_wait_seconds_sum',
  ],
  query_ref: 'neon_perf_counters',
}

================================================
FILE: compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet
================================================
// DEPRECATED

{
  metric_name: 'lfc_approximate_working_set_size',
  type: 'gauge',
  help: 'Approximate working set size in pages of 8192 bytes',
  key_labels: null,
  values: [
    'approximate_working_set_size',
  ],
  query: importstr 'sql_exporter/lfc_approximate_working_set_size.sql',
}

================================================
FILE: compute/etc/sql_exporter/lfc_approximate_working_set_size.sql
================================================
SELECT neon.approximate_working_set_size(false) AS approximate_working_set_size;

================================================
FILE: compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet
================================================
{
  metric_name: 'lfc_approximate_working_set_size_windows',
  type: 'gauge',
  help: 'Approximate working set size in pages of 8192 bytes',
  key_labels: [
    'duration_seconds',
  ],
  values: [
    'size',
  ],
  query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql',
}

================================================
FILE: compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql
================================================
-- NOTE: This is the "internal" / "machine-readable" version. This outputs the
-- working set size looking back 1..60 minutes, one row per minute. Each row is
-- labeled with the window length in seconds (60, 120, ..., 3600), not minutes.
SELECT
  x::pg_catalog.text AS duration_seconds,
  neon.approximate_working_set_size_seconds(x) AS size
FROM
  (SELECT generate_series * 60 AS x FROM pg_catalog.generate_series(1, 60)) AS t (x);

================================================
FILE: compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet
================================================
{
  metric_name: 'lfc_approximate_working_set_size_windows',
  type: 'gauge',
  help: 'Approximate working set size in pages of 8192 bytes',
  key_labels: [
    'duration',
  ],
  values: [
    'size',
  ],
  query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.sql',
}

================================================
FILE: compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql
================================================
-- NOTE: This is the "public" / "human-readable" version. Here, we supply a
-- small selection of durations in a pretty-printed form.
SELECT
  x AS duration,
  neon.approximate_working_set_size_seconds(extract('epoch' FROM x::pg_catalog.interval)::pg_catalog.int4) AS size
FROM (
    VALUES ('5m'), ('15m'), ('1h')
  ) AS t (x);

================================================
FILE: compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet
================================================
{
  metric_name: 'lfc_cache_size_limit',
  type: 'gauge',
  help: 'LFC cache size limit in bytes',
  key_labels: null,
  values: [
    'lfc_cache_size_limit',
  ],
  query: importstr 'sql_exporter/lfc_cache_size_limit.sql',
}

================================================
FILE: compute/etc/sql_exporter/lfc_cache_size_limit.sql
================================================
SELECT pg_catalog.pg_size_bytes(pg_catalog.current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit;

================================================
FILE: compute/etc/sql_exporter/lfc_chunk_size.libsonnet
================================================
{
  metric_name: 'lfc_chunk_size',
  type: 'gauge',
  help: 'LFC chunk size, measured in 8KiB
pages', key_labels: null, values: [ 'lfc_chunk_size_pages', ], query: importstr 'sql_exporter/lfc_chunk_size.sql', } ================================================ FILE: compute/etc/sql_exporter/lfc_chunk_size.sql ================================================ SELECT lfc_value AS lfc_chunk_size_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_chunk_size_pages'; ================================================ FILE: compute/etc/sql_exporter/lfc_hits.libsonnet ================================================ { metric_name: 'lfc_hits', type: 'gauge', help: 'lfc_hits', key_labels: null, values: [ 'lfc_hits', ], query: importstr 'sql_exporter/lfc_hits.sql', } ================================================ FILE: compute/etc/sql_exporter/lfc_hits.sql ================================================ SELECT lfc_value AS lfc_hits FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_hits'; ================================================ FILE: compute/etc/sql_exporter/lfc_misses.libsonnet ================================================ { metric_name: 'lfc_misses', type: 'gauge', help: 'lfc_misses', key_labels: null, values: [ 'lfc_misses', ], query: importstr 'sql_exporter/lfc_misses.sql', } ================================================ FILE: compute/etc/sql_exporter/lfc_misses.sql ================================================ SELECT lfc_value AS lfc_misses FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_misses'; ================================================ FILE: compute/etc/sql_exporter/lfc_used.libsonnet ================================================ { metric_name: 'lfc_used', type: 'gauge', help: 'LFC chunks used (chunk = 1MB)', key_labels: null, values: [ 'lfc_used', ], query: importstr 'sql_exporter/lfc_used.sql', } ================================================ FILE: compute/etc/sql_exporter/lfc_used.sql ================================================ SELECT lfc_value AS lfc_used FROM neon.neon_lfc_stats WHERE lfc_key = 
'file_cache_used'; ================================================ FILE: compute/etc/sql_exporter/lfc_used_pages.libsonnet ================================================ { metric_name: 'lfc_used_pages', type: 'gauge', help: 'LFC pages used', key_labels: null, values: [ 'lfc_used_pages', ], query: importstr 'sql_exporter/lfc_used_pages.sql', } ================================================ FILE: compute/etc/sql_exporter/lfc_used_pages.sql ================================================ SELECT lfc_value AS lfc_used_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used_pages'; ================================================ FILE: compute/etc/sql_exporter/lfc_writes.libsonnet ================================================ { metric_name: 'lfc_writes', type: 'gauge', help: 'lfc_writes', key_labels: null, values: [ 'lfc_writes', ], query: importstr 'sql_exporter/lfc_writes.sql', } ================================================ FILE: compute/etc/sql_exporter/lfc_writes.sql ================================================ SELECT lfc_value AS lfc_writes FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_writes'; ================================================ FILE: compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet ================================================ // Number of slots is limited by max_replication_slots, so collecting position // for all of them shouldn't be bad. 
{ metric_name: 'logical_slot_restart_lsn', type: 'gauge', help: 'restart_lsn of logical slots', key_labels: [ 'slot_name', ], values: [ 'restart_lsn', ], query: importstr 'sql_exporter/logical_slot_restart_lsn.sql', } ================================================ FILE: compute/etc/sql_exporter/logical_slot_restart_lsn.sql ================================================ SELECT slot_name, (restart_lsn - '0/0')::pg_catalog.FLOAT8 AS restart_lsn FROM pg_catalog.pg_replication_slots WHERE slot_type = 'logical'; ================================================ FILE: compute/etc/sql_exporter/max_cluster_size.libsonnet ================================================ { metric_name: 'max_cluster_size', type: 'gauge', help: 'neon.max_cluster_size setting', key_labels: null, values: [ 'max_cluster_size', ], query: importstr 'sql_exporter/max_cluster_size.sql', } ================================================ FILE: compute/etc/sql_exporter/max_cluster_size.sql ================================================ SELECT setting::pg_catalog.int4 AS max_cluster_size FROM pg_catalog.pg_settings WHERE name = 'neon.max_cluster_size'; ================================================ FILE: compute/etc/sql_exporter/neon_perf_counters.sql ================================================ WITH c AS (SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters) SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d( file_cache_read_wait_seconds_count numeric, file_cache_read_wait_seconds_sum numeric, file_cache_write_wait_seconds_count numeric, file_cache_write_wait_seconds_sum numeric, getpage_wait_seconds_count numeric, getpage_wait_seconds_sum numeric, getpage_prefetch_requests_total numeric, getpage_sync_requests_total numeric, compute_getpage_stuck_requests_total numeric, compute_getpage_max_inflight_stuck_time_ms numeric, getpage_prefetch_misses_total numeric, getpage_prefetch_discards_total numeric, getpage_prefetches_buffered numeric, 
pageserver_requests_sent_total numeric, pageserver_disconnects_total numeric, pageserver_send_flushes_total numeric, pageserver_open_requests numeric ); ================================================ FILE: compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet ================================================ { metric_name: 'pageserver_disconnects_total', type: 'counter', help: 'Number of times that the connection to the pageserver was lost', values: [ 'pageserver_disconnects_total', ], query_ref: 'neon_perf_counters', } ================================================ FILE: compute/etc/sql_exporter/pageserver_open_requests.libsonnet ================================================ { metric_name: 'pageserver_open_requests', type: 'gauge', help: 'Number of open requests to PageServer', values: [ 'pageserver_open_requests', ], query_ref: 'neon_perf_counters', } ================================================ FILE: compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet ================================================ { metric_name: 'pageserver_requests_sent_total', type: 'counter', help: 'Number of all requests sent to the pageserver (not just GetPage requests)', values: [ 'pageserver_requests_sent_total', ], query_ref: 'neon_perf_counters', } ================================================ FILE: compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet ================================================ { metric_name: 'pageserver_send_flushes_total', type: 'counter', help: 'Number of flushes to the pageserver connection', values: [ 'pageserver_send_flushes_total', ], query_ref: 'neon_perf_counters', } ================================================ FILE: compute/etc/sql_exporter/pg_stats_userdb.libsonnet ================================================ { metric_name: 'pg_stats_userdb', type: 'gauge', help: 'Stats for several oldest non-system dbs', key_labels: [ 'datname', ], value_label: 'kind', values: [ 'db_size', 'deadlocks', // Rows 
'inserted', 'updated', 'deleted', ], query: importstr 'sql_exporter/pg_stats_userdb.sql', }

================================================
FILE: compute/etc/sql_exporter/pg_stats_userdb.sql
================================================
-- We export stats for 10 non-system databases. Without this limit it is too
-- easy to abuse the system by creating lots of databases.
SELECT pg_catalog.pg_database_size(datname) AS db_size,
       deadlocks,
       tup_inserted AS inserted,
       tup_updated AS updated,
       tup_deleted AS deleted,
       datname
FROM pg_catalog.pg_stat_database
WHERE datname IN (
    -- Catalog relations are schema-qualified here as well, matching the
    -- outer query, so name resolution does not depend on search_path.
    SELECT datname FROM pg_catalog.pg_database
    -- Ignore invalid databases, as we will likely have problems with
    -- getting their size from the Pageserver.
    WHERE datconnlimit != -2
        AND datname <> 'postgres'
        AND NOT datistemplate
    ORDER BY oid
    LIMIT 10
);

================================================
FILE: compute/etc/sql_exporter/replication_delay_bytes.libsonnet
================================================
{
  metric_name: 'replication_delay_bytes',
  type: 'gauge',
  help: 'Bytes between received and replayed LSN',
  key_labels: null,
  values: [
    'replication_delay_bytes',
  ],
  query: importstr 'sql_exporter/replication_delay_bytes.sql',
}

================================================
FILE: compute/etc/sql_exporter/replication_delay_bytes.sql
================================================
-- We use a GREATEST call here because this calculation can be negative. The
-- calculation is not atomic, meaning after we've gotten the receive LSN, the
-- replay LSN may have advanced past the receive LSN we are using for the
-- calculation.
SELECT GREATEST(0, pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_last_wal_receive_lsn(), pg_catalog.pg_last_wal_replay_lsn())) AS replication_delay_bytes; ================================================ FILE: compute/etc/sql_exporter/replication_delay_seconds.libsonnet ================================================ { metric_name: 'replication_delay_seconds', type: 'gauge', help: 'Time since last LSN was replayed', key_labels: null, values: [ 'replication_delay_seconds', ], query: importstr 'sql_exporter/replication_delay_seconds.sql', } ================================================ FILE: compute/etc/sql_exporter/replication_delay_seconds.sql ================================================ SELECT CASE WHEN pg_catalog.pg_last_wal_receive_lsn() = pg_catalog.pg_last_wal_replay_lsn() THEN 0 ELSE GREATEST(0, EXTRACT (EPOCH FROM pg_catalog.now() - pg_catalog.pg_last_xact_replay_timestamp())) END AS replication_delay_seconds; ================================================ FILE: compute/etc/sql_exporter/retained_wal.libsonnet ================================================ { metric_name: 'retained_wal', type: 'gauge', help: 'Retained WAL in inactive replication slots', key_labels: [ 'slot_name', ], values: [ 'retained_wal', ], query: importstr 'sql_exporter/retained_wal.sql', } ================================================ FILE: compute/etc/sql_exporter/retained_wal.sql ================================================ SELECT slot_name, pg_catalog.pg_wal_lsn_diff( CASE WHEN pg_catalog.pg_is_in_recovery() THEN pg_catalog.pg_last_wal_replay_lsn() ELSE pg_catalog.pg_current_wal_lsn() END, restart_lsn)::pg_catalog.FLOAT8 AS retained_wal FROM pg_catalog.pg_replication_slots WHERE active = false; ================================================ FILE: compute/etc/sql_exporter/wal_is_lost.libsonnet ================================================ { metric_name: 'wal_is_lost', type: 'gauge', help: 'Whether or not the replication slot wal_status is lost', key_labels: [ 
'slot_name', ], values: [ 'wal_is_lost', ], query: importstr 'sql_exporter/wal_is_lost.sql', } ================================================ FILE: compute/etc/sql_exporter/wal_is_lost.sql ================================================ SELECT slot_name, CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost FROM pg_catalog.pg_replication_slots; ================================================ FILE: compute/etc/sql_exporter.jsonnet ================================================ function(collector_name, collector_file, connection_string) { // Configuration for sql_exporter for autoscaling-agent // Global defaults. global: { // If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. scrape_timeout: '10s', // Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. scrape_timeout_offset: '500ms', // Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. min_interval: '0s', // Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, // as will concurrent scrapes. max_connections: 1, // Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should // always be the same as max_connections. max_idle_connections: 1, // Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. // If 0, connections are not closed due to a connection's age. max_connection_lifetime: '5m', }, // The target to monitor and the collectors to execute on it. target: { // Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) // the schema gets dropped or replaced to match the driver expected DSN format. data_source_name: connection_string, // Collectors (referenced by name) to execute on the target. 
// Glob patterns are supported (see https://pkg.go.dev/path/filepath#Match for syntax). collectors: [ collector_name, ], }, // Collector files specifies a list of globs. One collector definition is read from each matching file. // Glob patterns are supported (see https://pkg.go.dev/path/filepath#Match for syntax). collector_files: [ collector_file, ], } ================================================ FILE: compute/jsonnet/neon.libsonnet ================================================ local MIN_SUPPORTED_VERSION = 14; local MAX_SUPPORTED_VERSION = 17; local SUPPORTED_VERSIONS = std.range(MIN_SUPPORTED_VERSION, MAX_SUPPORTED_VERSION); # If we receive the pg_version with a leading "v", ditch it. local pg_version = std.strReplace(std.extVar('pg_version'), 'v', ''); local pg_version_num = std.parseInt(pg_version); assert std.setMember(pg_version_num, SUPPORTED_VERSIONS) : std.format('%s is an unsupported Postgres version: %s', [pg_version, std.toString(SUPPORTED_VERSIONS)]); { PG_MAJORVERSION: pg_version, PG_MAJORVERSION_NUM: pg_version_num, } ================================================ FILE: compute/manifest.schema.json ================================================ { "$schema": "https://json-schema.org/draft/2020-12/schema", "title": "Neon Compute Manifest Schema", "description": "Schema for Neon compute node configuration manifest", "type": "object", "properties": { "pg_settings": { "type": "object", "properties": { "common": { "type": "object", "properties": { "client_connection_check_interval": { "type": "string", "description": "Check for client disconnection interval in milliseconds" }, "effective_io_concurrency": { "type": "string", "description": "Effective IO concurrency setting" }, "fsync": { "type": "string", "enum": ["on", "off"], "description": "Whether to force fsync to disk" }, "hot_standby": { "type": "string", "enum": ["on", "off"], "description": "Whether hot standby is enabled" }, "idle_in_transaction_session_timeout": { "type": "string", "description": "Timeout for idle transactions in milliseconds" }, 
"listen_addresses": { "type": "string", "description": "Addresses to listen on" }, "log_connections": { "type": "string", "enum": ["on", "off"], "description": "Whether to log connections" }, "log_disconnections": { "type": "string", "enum": ["on", "off"], "description": "Whether to log disconnections" }, "log_temp_files": { "type": "string", "description": "Size threshold for logging temporary files in KB" }, "log_error_verbosity": { "type": "string", "enum": ["terse", "verbose", "default"], "description": "Error logging verbosity level" }, "log_min_error_statement": { "type": "string", "description": "Minimum error level for statement logging" }, "maintenance_io_concurrency": { "type": "string", "description": "Maintenance IO concurrency setting" }, "max_connections": { "type": "string", "description": "Maximum number of connections" }, "max_replication_flush_lag": { "type": "string", "description": "Maximum replication flush lag" }, "max_replication_slots": { "type": "string", "description": "Maximum number of replication slots" }, "max_replication_write_lag": { "type": "string", "description": "Maximum replication write lag" }, "max_wal_senders": { "type": "string", "description": "Maximum number of WAL senders" }, "max_wal_size": { "type": "string", "description": "Maximum WAL size" }, "neon.unstable_extensions": { "type": "string", "description": "List of unstable extensions" }, "neon.protocol_version": { "type": "string", "description": "Neon protocol version" }, "password_encryption": { "type": "string", "description": "Password encryption method" }, "restart_after_crash": { "type": "string", "enum": ["on", "off"], "description": "Whether to restart after crash" }, "superuser_reserved_connections": { "type": "string", "description": "Number of reserved connections for superuser" }, "synchronous_standby_names": { "type": "string", "description": "Names of synchronous standby servers" }, "wal_keep_size": { "type": "string", "description": "WAL keep size" }, 
"wal_level": { "type": "string", "description": "WAL level" }, "wal_log_hints": { "type": "string", "enum": ["on", "off"], "description": "Whether to log hints in WAL" }, "wal_sender_timeout": { "type": "string", "description": "WAL sender timeout in milliseconds" } }, "required": [ "client_connection_check_interval", "effective_io_concurrency", "fsync", "hot_standby", "idle_in_transaction_session_timeout", "listen_addresses", "log_connections", "log_disconnections", "log_temp_files", "log_error_verbosity", "log_min_error_statement", "maintenance_io_concurrency", "max_connections", "max_replication_flush_lag", "max_replication_slots", "max_replication_write_lag", "max_wal_senders", "max_wal_size", "neon.unstable_extensions", "neon.protocol_version", "password_encryption", "restart_after_crash", "superuser_reserved_connections", "synchronous_standby_names", "wal_keep_size", "wal_level", "wal_log_hints", "wal_sender_timeout" ] }, "replica": { "type": "object", "properties": { "hot_standby": { "type": "string", "enum": ["on", "off"], "description": "Whether hot standby is enabled for replicas" } }, "required": ["hot_standby"] }, "per_version": { "type": "object", "patternProperties": { "^1[4-7]$": { "type": "object", "properties": { "common": { "type": "object", "properties": { "io_combine_limit": { "type": "string", "description": "IO combine limit" } } }, "replica": { "type": "object", "properties": { "recovery_prefetch": { "type": "string", "enum": ["on", "off"], "description": "Whether to enable recovery prefetch for PostgreSQL replicas" } } } } } } } }, "required": ["common", "replica", "per_version"] } }, "required": ["pg_settings"] } ================================================ FILE: compute/manifest.yaml ================================================ pg_settings: # Common settings for primaries and replicas of all versions. common: # Check for client disconnection every 1 minute. 
By default, Postgres will detect the # loss of the connection only at the next interaction with the socket, when it waits # for, receives or sends data, so it will likely waste resources till the end of the # query execution. There should be no drawbacks in setting this for everyone, so enable # it by default. If anyone complains, we can allow editing it. # https://www.postgresql.org/docs/16/runtime-config-connection.html#GUC-CLIENT-CONNECTION-CHECK-INTERVAL client_connection_check_interval: "60000" # 1 minute # ---- IO ---- effective_io_concurrency: "20" maintenance_io_concurrency: "100" fsync: "off" hot_standby: "off" # We allow users to change this if needed, but by default we # just don't want to see long-lasting idle transactions, as they # prevent activity monitor from suspending projects. idle_in_transaction_session_timeout: "300000" # 5 minutes listen_addresses: "*" # --- LOGGING ---- helps investigations log_connections: "on" log_disconnections: "on" # 1GB, unit is KB log_temp_files: "1048576" # Disable dumping customer data to logs, both to increase data privacy # and to reduce the amount of logs. log_error_verbosity: "terse" log_min_error_statement: "panic" max_connections: "100" # --- WAL --- # - flush lag is the max amount of WAL that has been generated but not yet stored # to disk in the page server. A smaller value means less delay after a pageserver # restart, but if you set it too small you might again need to slow down writes if the # pageserver cannot flush incoming WAL to disk fast enough. This must be larger # than the pageserver's checkpoint interval, currently 1 GB! Otherwise you get # a deadlock where the compute node refuses to generate more WAL before the # old WAL has been uploaded to S3, but the pageserver is waiting for more WAL # to be generated before it is uploaded to S3. 
max_replication_flush_lag: "10GB" max_replication_slots: "10" # Backpressure configuration: # - write lag is the max amount of WAL that has been generated by Postgres but not yet # processed by the page server. Making this smaller reduces the worst case latency # of a GetPage request, if you request a page that was recently modified. On the other # hand, if this is too small, the compute node might need to wait on a write if there is a # hiccup in the network or page server so that the page server has temporarily fallen # behind. # # Previously it was set to 500 MB, but it caused compute to be unresponsive under load # https://github.com/neondatabase/neon/issues/2028 max_replication_write_lag: "500MB" max_wal_senders: "10" # A Postgres checkpoint is cheap in storage, as it doesn't involve any significant amount # of real I/O. Only the SLRU buffers and some other small files are flushed to disk. # However, as long as we have full_page_writes=on, page updates after a checkpoint # include full-page images which bloat the WAL. So we may want to bump max_wal_size to # reduce the WAL bloating, but at the same time it will increase pg_wal directory size on # compute and can lead to out of disk error on k8s nodes. max_wal_size: "1024" wal_keep_size: "0" wal_level: "replica" # Reduce amount of WAL generated by default. wal_log_hints: "off" # - without wal_sender_timeout set we don't get feedback messages, # required for backpressure. wal_sender_timeout: "10000" # We have some experimental extensions, which we don't want users to install unconsciously. # To install them, users would need to set the `neon.allow_unstable_extensions` setting. 
# There are three of them currently: # - `pgrag` - https://github.com/neondatabase-labs/pgrag - extension is actually called just `rag`, # and two dependencies: # - `rag_bge_small_en_v15` # - `rag_jina_reranker_v1_tiny_en` # - `pg_mooncake` - https://github.com/Mooncake-Labs/pg_mooncake/ # - `anon` neon.unstable_extensions: "rag,rag_bge_small_en_v15,rag_jina_reranker_v1_tiny_en,pg_mooncake,anon" neon.protocol_version: "3" password_encryption: "scram-sha-256" # This is important to prevent Postgres from trying to perform # a local WAL redo after backend crash. It should exit and let # systemd or k8s do a fresh startup with compute_ctl. restart_after_crash: "off" # By default 3. We have the following persistent connections in the VM: # * compute_activity_monitor (from compute_ctl) # * postgres-exporter (metrics collector; it has 2 connections) # * sql_exporter (metrics collector; we have 2 instances [1 for us & users; 1 for autoscaling]) # * vm-monitor (to query & change file cache size) # i.e. total of 6. Let's reserve 7, so there's still at least one left over. superuser_reserved_connections: "7" synchronous_standby_names: "walproposer" replica: hot_standby: "on" per_version: 17: common: # PostgreSQL 17 has a new IO system called "read stream", which can combine IOs up to some # size. It still has some issues with readahead, though, so we default to disabled/ # "no combining of IOs" to make sure we get the maximum prefetch depth. 
# See also: https://github.com/neondatabase/neon/pull/9860 io_combine_limit: "1" replica: # prefetching of blocks referenced in WAL doesn't make sense for us # Neon hot standby ignores pages that are not in the shared_buffers recovery_prefetch: "off" 16: common: {} replica: # prefetching of blocks referenced in WAL doesn't make sense for us # Neon hot standby ignores pages that are not in the shared_buffers recovery_prefetch: "off" 15: common: {} replica: # prefetching of blocks referenced in WAL doesn't make sense for us # Neon hot standby ignores pages that are not in the shared_buffers recovery_prefetch: "off" 14: common: {} replica: {} ================================================ FILE: compute/patches/anon_v2.patch ================================================ diff --git a/sql/anon.sql b/sql/anon.sql index 0cdc769..5eab1d6 100644 --- a/sql/anon.sql +++ b/sql/anon.sql @@ -1141,3 +1141,19 @@ $$ -- TODO : https://en.wikipedia.org/wiki/L-diversity -- TODO : https://en.wikipedia.org/wiki/T-closeness + +-- NEON Patches + +DO $$ +DECLARE + privileged_role_name text; +BEGIN + privileged_role_name := current_setting('neon.privileged_role_name'); + + EXECUTE format('GRANT ALL ON SCHEMA anon to %I', privileged_role_name); + EXECUTE format('GRANT ALL ON ALL TABLES IN SCHEMA anon TO %I', privileged_role_name); + + IF current_setting('server_version_num')::int >= 150000 THEN + EXECUTE format('GRANT SET ON PARAMETER anon.transparent_dynamic_masking TO %I', privileged_role_name); + END IF; +END $$; diff --git a/sql/init.sql b/sql/init.sql index 7da6553..9b6164b 100644 --- a/sql/init.sql +++ b/sql/init.sql @@ -74,50 +74,49 @@ $$ SECURITY LABEL FOR anon ON FUNCTION anon.load_csv IS 'UNTRUSTED'; --- load fake data from a given path -CREATE OR REPLACE FUNCTION anon.init( - datapath TEXT -) +CREATE OR REPLACE FUNCTION anon.load_fake_data() RETURNS BOOLEAN AS $$ DECLARE - datapath_check TEXT; success BOOLEAN; + sharedir TEXT; + datapath TEXT; BEGIN - IF anon.is_initialized() 
THEN - RAISE NOTICE 'The anon extension is already initialized.'; - RETURN TRUE; - END IF; + datapath := '/extension/anon/'; + -- find the local extension directory + SELECT setting INTO sharedir + FROM pg_catalog.pg_config + WHERE name = 'SHAREDIR'; SELECT bool_or(results) INTO success FROM unnest(array[ - anon.load_csv('anon.identifiers_category',datapath||'/identifiers_category.csv'), - anon.load_csv('anon.identifier',datapath ||'/identifier.csv'), - anon.load_csv('anon.address',datapath ||'/address.csv'), - anon.load_csv('anon.city',datapath ||'/city.csv'), - anon.load_csv('anon.company',datapath ||'/company.csv'), - anon.load_csv('anon.country',datapath ||'/country.csv'), - anon.load_csv('anon.email', datapath ||'/email.csv'), - anon.load_csv('anon.first_name',datapath ||'/first_name.csv'), - anon.load_csv('anon.iban',datapath ||'/iban.csv'), - anon.load_csv('anon.last_name',datapath ||'/last_name.csv'), - anon.load_csv('anon.postcode',datapath ||'/postcode.csv'), - anon.load_csv('anon.siret',datapath ||'/siret.csv'), - anon.load_csv('anon.lorem_ipsum',datapath ||'/lorem_ipsum.csv') + anon.load_csv('anon.identifiers_category',sharedir || datapath || '/identifiers_category.csv'), + anon.load_csv('anon.identifier',sharedir || datapath || '/identifier.csv'), + anon.load_csv('anon.address',sharedir || datapath || '/address.csv'), + anon.load_csv('anon.city',sharedir || datapath || '/city.csv'), + anon.load_csv('anon.company',sharedir || datapath || '/company.csv'), + anon.load_csv('anon.country',sharedir || datapath || '/country.csv'), + anon.load_csv('anon.email', sharedir || datapath || '/email.csv'), + anon.load_csv('anon.first_name',sharedir || datapath || '/first_name.csv'), + anon.load_csv('anon.iban',sharedir || datapath || '/iban.csv'), + anon.load_csv('anon.last_name',sharedir || datapath || '/last_name.csv'), + anon.load_csv('anon.postcode',sharedir || datapath || '/postcode.csv'), + anon.load_csv('anon.siret',sharedir || datapath || '/siret.csv'), + 
anon.load_csv('anon.lorem_ipsum',sharedir || datapath || '/lorem_ipsum.csv') ]) results; RETURN success; - END; $$ - LANGUAGE PLPGSQL + LANGUAGE plpgsql VOLATILE RETURNS NULL ON NULL INPUT - PARALLEL UNSAFE -- because load_csv is unsafe - SECURITY INVOKER + PARALLEL UNSAFE -- because of the EXCEPTION + SECURITY DEFINER SET search_path='' ; -SECURITY LABEL FOR anon ON FUNCTION anon.init(TEXT) IS 'UNTRUSTED'; + +SECURITY LABEL FOR anon ON FUNCTION anon.load_fake_data IS 'UNTRUSTED'; -- People tend to forget the anon.init() step -- This is a friendly notice for them @@ -144,7 +143,7 @@ SECURITY LABEL FOR anon ON FUNCTION anon.notice_if_not_init IS 'UNTRUSTED'; CREATE OR REPLACE FUNCTION anon.load(TEXT) RETURNS BOOLEAN AS $$ - SELECT anon.init($1); + SELECT anon.init(); $$ LANGUAGE SQL VOLATILE @@ -159,16 +158,16 @@ SECURITY LABEL FOR anon ON FUNCTION anon.load(TEXT) IS 'UNTRUSTED'; CREATE OR REPLACE FUNCTION anon.init() RETURNS BOOLEAN AS $$ - WITH conf AS ( - -- find the local extension directory - SELECT setting AS sharedir - FROM pg_catalog.pg_config - WHERE name = 'SHAREDIR' - ) - SELECT anon.init(conf.sharedir || '/extension/anon/') - FROM conf; +BEGIN + IF anon.is_initialized() THEN + RAISE NOTICE 'The anon extension is already initialized.'; + RETURN TRUE; + END IF; + + RETURN anon.load_fake_data(); +END; $$ - LANGUAGE SQL + LANGUAGE plpgsql VOLATILE PARALLEL UNSAFE -- because init is unsafe SECURITY INVOKER ================================================ FILE: compute/patches/cloud_regress_pg16.patch ================================================ diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 0c24f6afe4..dd808ac2b4 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -11,7 +11,8 @@ CREATE TABLE aggtest ( b float4 ); \set filename :abs_srcdir '/data/agg.data' -COPY aggtest FROM :'filename'; +\set command '\\copy aggtest FROM ' :'filename'; +:command 
ANALYZE aggtest; SELECT avg(four) AS avg_1 FROM onek; avg_1 diff --git a/src/test/regress/expected/alter_generic.out b/src/test/regress/expected/alter_generic.out index ae54cb254f..888e2ee8bc 100644 --- a/src/test/regress/expected/alter_generic.out +++ b/src/test/regress/expected/alter_generic.out @@ -15,9 +15,9 @@ DROP ROLE IF EXISTS regress_alter_generic_user1; DROP ROLE IF EXISTS regress_alter_generic_user2; DROP ROLE IF EXISTS regress_alter_generic_user3; RESET client_min_messages; -CREATE USER regress_alter_generic_user3; -CREATE USER regress_alter_generic_user2; -CREATE USER regress_alter_generic_user1 IN ROLE regress_alter_generic_user3; +CREATE USER regress_alter_generic_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_alter_generic_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_alter_generic_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE regress_alter_generic_user3; CREATE SCHEMA alt_nsp1; CREATE SCHEMA alt_nsp2; GRANT ALL ON SCHEMA alt_nsp1, alt_nsp2 TO public; @@ -370,7 +370,7 @@ ERROR: STORAGE cannot be specified in ALTER OPERATOR FAMILY DROP OPERATOR FAMILY alt_opf4 USING btree; -- Should fail. Need to be SUPERUSER to do ALTER OPERATOR FAMILY .. ADD / DROP BEGIN TRANSACTION; -CREATE ROLE regress_alter_generic_user5 NOSUPERUSER; +CREATE ROLE regress_alter_generic_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER NOSUPERUSER; CREATE OPERATOR FAMILY alt_opf5 USING btree; SET ROLE regress_alter_generic_user5; ALTER OPERATOR FAMILY alt_opf5 USING btree ADD OPERATOR 1 < (int4, int2), FUNCTION 1 btint42cmp(int4, int2); @@ -382,7 +382,7 @@ ERROR: current transaction is aborted, commands ignored until end of transactio ROLLBACK; -- Should fail. Need rights to namespace for ALTER OPERATOR FAMILY .. 
ADD / DROP BEGIN TRANSACTION; -CREATE ROLE regress_alter_generic_user6; +CREATE ROLE regress_alter_generic_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE SCHEMA alt_nsp6; REVOKE ALL ON SCHEMA alt_nsp6 FROM regress_alter_generic_user6; CREATE OPERATOR FAMILY alt_nsp6.alt_opf6 USING btree; diff --git a/src/test/regress/expected/alter_operator.out b/src/test/regress/expected/alter_operator.out index 71bd484282..066ea4ec0d 100644 --- a/src/test/regress/expected/alter_operator.out +++ b/src/test/regress/expected/alter_operator.out @@ -127,7 +127,7 @@ ERROR: operator attribute "Restrict" not recognized -- -- Test permission check. Must be owner to ALTER OPERATOR. -- -CREATE USER regress_alter_op_user; +CREATE USER regress_alter_op_user PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_alter_op_user; ALTER OPERATOR === (boolean, boolean) SET (RESTRICT = NONE); ERROR: must be owner of operator === diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 0e439a6488..393f316c3e 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -5,7 +5,7 @@ SET client_min_messages TO 'warning'; DROP ROLE IF EXISTS regress_alter_table_user1; RESET client_min_messages; -CREATE USER regress_alter_table_user1; +CREATE USER regress_alter_table_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- -- add attribute -- @@ -3896,8 +3896,8 @@ DROP TABLE fail_part; ALTER TABLE list_parted ATTACH PARTITION nonexistent FOR VALUES IN (1); ERROR: relation "nonexistent" does not exist -- check ownership of the source table -CREATE ROLE regress_test_me; -CREATE ROLE regress_test_not_me; +CREATE ROLE regress_test_me PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_not_me PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE not_owned_by_me (LIKE list_parted); ALTER TABLE not_owned_by_me OWNER TO regress_test_not_me; SET SESSION AUTHORIZATION regress_test_me; diff --git 
a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out index 57a283dc59..9672d526b4 100644 --- a/src/test/regress/expected/arrays.out +++ b/src/test/regress/expected/arrays.out @@ -18,7 +18,8 @@ CREATE TABLE array_op_test ( t text[] ); \set filename :abs_srcdir '/data/array.data' -COPY array_op_test FROM :'filename'; +\set command '\\copy array_op_test FROM ' :'filename'; +:command ANALYZE array_op_test; -- -- only the 'e' array is 0-based, the others are 1-based. diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out index 93ed5e8cc0..54bd7d535c 100644 --- a/src/test/regress/expected/btree_index.out +++ b/src/test/regress/expected/btree_index.out @@ -20,13 +20,17 @@ CREATE TABLE bt_f8_heap ( random int4 ); \set filename :abs_srcdir '/data/desc.data' -COPY bt_i4_heap FROM :'filename'; +\set command '\\copy bt_i4_heap FROM ' :'filename'; +:command \set filename :abs_srcdir '/data/hash.data' -COPY bt_name_heap FROM :'filename'; +\set command '\\copy bt_name_heap FROM ' :'filename'; +:command \set filename :abs_srcdir '/data/desc.data' -COPY bt_txt_heap FROM :'filename'; +\set command '\\copy bt_txt_heap FROM ' :'filename'; +:command \set filename :abs_srcdir '/data/hash.data' -COPY bt_f8_heap FROM :'filename'; +\set command '\\copy bt_f8_heap FROM ' :'filename'; +:command ANALYZE bt_i4_heap; ANALYZE bt_name_heap; ANALYZE bt_txt_heap; diff --git a/src/test/regress/expected/cluster.out b/src/test/regress/expected/cluster.out index 542c2e098c..0062d3024f 100644 --- a/src/test/regress/expected/cluster.out +++ b/src/test/regress/expected/cluster.out @@ -308,7 +308,7 @@ WHERE pg_class.oid=indexrelid -- Verify that toast tables are clusterable CLUSTER pg_toast.pg_toast_826 USING pg_toast_826_index; -- Verify that clustering all tables does in fact cluster the right ones -CREATE USER regress_clstr_user; +CREATE USER regress_clstr_user PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE clstr_1 (a INT PRIMARY 
KEY); CREATE TABLE clstr_2 (a INT PRIMARY KEY); CREATE TABLE clstr_3 (a INT PRIMARY KEY); @@ -497,7 +497,7 @@ DROP TABLE clstrpart; CREATE TABLE ptnowner(i int unique) PARTITION BY LIST (i); CREATE INDEX ptnowner_i_idx ON ptnowner(i); CREATE TABLE ptnowner1 PARTITION OF ptnowner FOR VALUES IN (1); -CREATE ROLE regress_ptnowner; +CREATE ROLE regress_ptnowner PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE ptnowner2 PARTITION OF ptnowner FOR VALUES IN (2); ALTER TABLE ptnowner1 OWNER TO regress_ptnowner; ALTER TABLE ptnowner OWNER TO regress_ptnowner; diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index 3f9a8f539c..0a51b52940 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1016,7 +1016,7 @@ select * from collate_test1 where b ilike 'ABC'; reset enable_seqscan; -- schema manipulation commands -CREATE ROLE regress_test_role; +CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE SCHEMA test_schema; -- We need to do this this way to cope with varying names for encodings: SET client_min_messages TO WARNING; diff --git a/src/test/regress/expected/constraints.out b/src/test/regress/expected/constraints.out index cf0b80d616..e8e2a14a4a 100644 --- a/src/test/regress/expected/constraints.out +++ b/src/test/regress/expected/constraints.out @@ -349,7 +349,8 @@ CREATE TABLE COPY_TBL (x INT, y TEXT, z INT, CONSTRAINT COPY_CON CHECK (x > 3 AND y <> 'check failed' AND x < 7 )); \set filename :abs_srcdir '/data/constro.data' -COPY COPY_TBL FROM :'filename'; +\set command '\\copy COPY_TBL FROM ' :'filename'; +:command SELECT * FROM COPY_TBL; x | y | z ---+---------------+--- @@ -358,7 +359,8 @@ SELECT * FROM COPY_TBL; (2 rows) \set filename :abs_srcdir '/data/constrf.data' -COPY COPY_TBL FROM :'filename'; +\set command '\\copy COPY_TBL FROM ' :'filename'; +:command ERROR: new row for relation "copy_tbl" violates check constraint 
"copy_con" DETAIL: Failing row contains (7, check failed, 6). CONTEXT: COPY copy_tbl, line 2: "7 check failed 6" @@ -799,7 +801,7 @@ DETAIL: Key (f1)=(3) conflicts with key (f1)=(3). DROP TABLE deferred_excl; -- Comments -- Setup a low-level role to enforce non-superuser checks. -CREATE ROLE regress_constraint_comments; +CREATE ROLE regress_constraint_comments PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_constraint_comments; CREATE TABLE constraint_comments_tbl (a int CONSTRAINT the_constraint CHECK (a > 0)); CREATE DOMAIN constraint_comments_dom AS int CONSTRAINT the_constraint CHECK (value > 0); @@ -819,7 +821,7 @@ COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS NULL; COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS NULL; -- unauthorized user RESET SESSION AUTHORIZATION; -CREATE ROLE regress_constraint_comments_noaccess; +CREATE ROLE regress_constraint_comments_noaccess PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_constraint_comments_noaccess; COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment'; ERROR: must be owner of relation constraint_comments_tbl diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out index d785f92561..16377e5ac9 100644 --- a/src/test/regress/expected/conversion.out +++ b/src/test/regress/expected/conversion.out @@ -15,7 +15,7 @@ SELECT FROM test_enc_setup(); CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea) AS :'regresslib', 'test_enc_conversion' LANGUAGE C STRICT; -CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE; +CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_conversion_user; CREATE CONVERSION myconv FOR 'LATIN1' TO 'UTF8' FROM iso8859_1_to_utf8; -- diff --git a/src/test/regress/expected/copy.out 
b/src/test/regress/expected/copy.out index b48365ec98..a6ef910055 100644 --- a/src/test/regress/expected/copy.out +++ b/src/test/regress/expected/copy.out @@ -15,9 +15,11 @@ insert into copytest values('Unix',E'abc\ndef',2); insert into copytest values('Mac',E'abc\rdef',3); insert into copytest values(E'esc\\ape',E'a\\r\\\r\\\n\\nb',4); \set filename :abs_builddir '/results/copytest.csv' -copy copytest to :'filename' csv; +\set command '\\copy copytest to ' :'filename' csv; +:command create temp table copytest2 (like copytest); -copy copytest2 from :'filename' csv; +\set command '\\copy copytest2 from ' :'filename' csv; +:command select * from copytest except select * from copytest2; style | test | filler -------+------+-------- @@ -25,8 +27,10 @@ select * from copytest except select * from copytest2; truncate copytest2; --- same test but with an escape char different from quote char -copy copytest to :'filename' csv quote '''' escape E'\\'; -copy copytest2 from :'filename' csv quote '''' escape E'\\'; +\set command '\\copy copytest to ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\''; +:command +\set command '\\copy copytest2 from ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\''; +:command select * from copytest except select * from copytest2; style | test | filler -------+------+-------- @@ -66,13 +70,16 @@ insert into parted_copytest select x,1,'One' from generate_series(1,1000) x; insert into parted_copytest select x,2,'Two' from generate_series(1001,1010) x; insert into parted_copytest select x,1,'One' from generate_series(1011,1020) x; \set filename :abs_builddir '/results/parted_copytest.csv' -copy (select * from parted_copytest order by a) to :'filename'; +\set command '\\copy (select * from parted_copytest order by a) to ' :'filename'; +:command truncate parted_copytest; -copy parted_copytest from :'filename'; +\set command '\\copy parted_copytest from ' :'filename'; +:command -- Ensure COPY FREEZE errors for partitioned 
tables. begin; truncate parted_copytest; -copy parted_copytest from :'filename' (freeze); +\set command '\\copy parted_copytest from ' :'filename' (freeze); +:command ERROR: cannot perform COPY FREEZE on a partitioned table rollback; select tableoid::regclass,count(*),sum(a) from parted_copytest @@ -94,7 +101,8 @@ create trigger part_ins_trig before insert on parted_copytest_a2 for each row execute procedure part_ins_func(); -copy parted_copytest from :'filename'; +\set command '\\copy parted_copytest from ' :'filename'; +:command select tableoid::regclass,count(*),sum(a) from parted_copytest group by tableoid order by tableoid::regclass::name; tableoid | count | sum @@ -106,7 +114,8 @@ group by tableoid order by tableoid::regclass::name; truncate table parted_copytest; create index on parted_copytest (b); drop trigger part_ins_trig on parted_copytest_a2; -copy parted_copytest from stdin; +\set command '\\copy parted_copytest from ' stdin; +:command -- Ensure index entries were properly added during the copy. select * from parted_copytest where b = 1; a | b | c @@ -170,9 +179,9 @@ INFO: progress: {"type": "PIPE", "command": "COPY FROM", "relname": "tab_progre -- Generate COPY FROM report with FILE, with some excluded tuples. 
truncate tab_progress_reporting; \set filename :abs_srcdir '/data/emp.data' -copy tab_progress_reporting from :'filename' - where (salary < 2000); -INFO: progress: {"type": "FILE", "command": "COPY FROM", "relname": "tab_progress_reporting", "has_bytes_total": true, "tuples_excluded": 1, "tuples_processed": 2, "has_bytes_processed": true} +\set command '\\copy tab_progress_reporting from ' :'filename' 'where (salary < 2000)'; +:command +INFO: progress: {"type": "PIPE", "command": "COPY FROM", "relname": "tab_progress_reporting", "has_bytes_total": false, "tuples_excluded": 1, "tuples_processed": 2, "has_bytes_processed": true} drop trigger check_after_tab_progress_reporting on tab_progress_reporting; drop function notice_after_tab_progress_reporting(); drop table tab_progress_reporting; @@ -281,7 +290,8 @@ CREATE TABLE parted_si_p_odd PARTITION OF parted_si FOR VALUES IN (1); -- https://postgr.es/m/18130-7a86a7356a75209d%40postgresql.org -- https://postgr.es/m/257696.1695670946%40sss.pgh.pa.us \set filename :abs_srcdir '/data/desc.data' -COPY parted_si(id, data) FROM :'filename'; +\set command '\\COPY parted_si(id, data) FROM ' :'filename'; +:command -- An earlier bug (see commit b1ecb9b3fcf) could end up using a buffer from -- the wrong partition. This test is *not* guaranteed to trigger that bug, but -- does so when shared_buffers is small enough. To test if we encountered the diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out index 9a74820ee8..22400a5551 100644 --- a/src/test/regress/expected/copy2.out +++ b/src/test/regress/expected/copy2.out @@ -553,8 +553,8 @@ select * from check_con_tbl; (2 rows) -- test with RLS enabled. 
-CREATE ROLE regress_rls_copy_user; -CREATE ROLE regress_rls_copy_user_colperms; +CREATE ROLE regress_rls_copy_user PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_rls_copy_user_colperms PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE rls_t1 (a int, b int, c int); COPY rls_t1 (a, b, c) from stdin; CREATE POLICY p1 ON rls_t1 FOR SELECT USING (a % 2 = 0); diff --git a/src/test/regress/expected/create_function_sql.out b/src/test/regress/expected/create_function_sql.out index 50aca5940f..42527142f6 100644 --- a/src/test/regress/expected/create_function_sql.out +++ b/src/test/regress/expected/create_function_sql.out @@ -4,7 +4,7 @@ -- Assorted tests using SQL-language functions -- -- All objects made in this test are in temp_func_test schema -CREATE USER regress_unpriv_user; +CREATE USER regress_unpriv_user PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE SCHEMA temp_func_test; GRANT ALL ON SCHEMA temp_func_test TO public; SET search_path TO temp_func_test, public; diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index acfd9d1f4f..0eeb64e47a 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -51,7 +51,8 @@ CREATE TABLE fast_emp4000 ( home_base box ); \set filename :abs_srcdir '/data/rect.data' -COPY slow_emp4000 FROM :'filename'; +\set command '\\copy slow_emp4000 FROM ' :'filename'; +:command INSERT INTO fast_emp4000 SELECT * FROM slow_emp4000; ANALYZE slow_emp4000; ANALYZE fast_emp4000; @@ -655,7 +656,8 @@ CREATE TABLE array_index_op_test ( t text[] ); \set filename :abs_srcdir '/data/array.data' -COPY array_index_op_test FROM :'filename'; +\set command '\\copy array_index_op_test FROM ' :'filename'; +:command ANALYZE array_index_op_test; SELECT * FROM array_index_op_test WHERE i = '{NULL}' ORDER BY seqno; seqno | i | t @@ -2822,7 +2824,7 @@ END; -- concurrently REINDEX SCHEMA CONCURRENTLY schema_to_reindex; -- Failure for unauthorized user -CREATE ROLE 
regress_reindexuser NOLOGIN; +CREATE ROLE regress_reindexuser NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION ROLE regress_reindexuser; REINDEX SCHEMA schema_to_reindex; ERROR: must be owner of schema schema_to_reindex diff --git a/src/test/regress/expected/create_procedure.out b/src/test/regress/expected/create_procedure.out index 2177ba3509..ae3ca94d00 100644 --- a/src/test/regress/expected/create_procedure.out +++ b/src/test/regress/expected/create_procedure.out @@ -421,7 +421,7 @@ ERROR: cp_testfunc1(integer) is not a procedure DROP PROCEDURE nonexistent(); ERROR: procedure nonexistent() does not exist -- privileges -CREATE USER regress_cp_user1; +CREATE USER regress_cp_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT INSERT ON cp_test TO regress_cp_user1; REVOKE EXECUTE ON PROCEDURE ptest1(text) FROM PUBLIC; SET ROLE regress_cp_user1; diff --git a/src/test/regress/expected/create_role.out b/src/test/regress/expected/create_role.out index 46d4f9efe9..fc2a28a2f6 100644 --- a/src/test/regress/expected/create_role.out +++ b/src/test/regress/expected/create_role.out @@ -1,28 +1,28 @@ -- ok, superuser can create users with any set of privileges -CREATE ROLE regress_role_super SUPERUSER; -CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS; +CREATE ROLE regress_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT CREATE ON DATABASE regression TO regress_role_admin WITH GRANT OPTION; -CREATE ROLE regress_role_limited_admin CREATEROLE; -CREATE ROLE regress_role_normal; +CREATE ROLE regress_role_limited_admin CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_role_normal PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, CREATEROLE user can't give away role attributes without having them SET SESSION AUTHORIZATION regress_role_limited_admin; -CREATE ROLE regress_nosuch_superuser SUPERUSER; +CREATE ROLE 
regress_nosuch_superuser SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: permission denied to create role DETAIL: Only roles with the SUPERUSER attribute may create roles with the SUPERUSER attribute. -CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS; +CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: permission denied to create role DETAIL: Only roles with the REPLICATION attribute may create roles with the REPLICATION attribute. -CREATE ROLE regress_nosuch_replication REPLICATION; +CREATE ROLE regress_nosuch_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: permission denied to create role DETAIL: Only roles with the REPLICATION attribute may create roles with the REPLICATION attribute. -CREATE ROLE regress_nosuch_bypassrls BYPASSRLS; +CREATE ROLE regress_nosuch_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: permission denied to create role DETAIL: Only roles with the BYPASSRLS attribute may create roles with the BYPASSRLS attribute. -CREATE ROLE regress_nosuch_createdb CREATEDB; +CREATE ROLE regress_nosuch_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: permission denied to create role DETAIL: Only roles with the CREATEDB attribute may create roles with the CREATEDB attribute. 
-- ok, can create a role without any special attributes -CREATE ROLE regress_role_limited; +CREATE ROLE regress_role_limited PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, can't give it in any of the restricted attributes ALTER ROLE regress_role_limited SUPERUSER; ERROR: permission denied to alter role @@ -39,10 +39,10 @@ DETAIL: Only roles with the BYPASSRLS attribute may change the BYPASSRLS attrib DROP ROLE regress_role_limited; -- ok, can give away these role attributes if you have them SET SESSION AUTHORIZATION regress_role_admin; -CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS; -CREATE ROLE regress_replication REPLICATION; -CREATE ROLE regress_bypassrls BYPASSRLS; -CREATE ROLE regress_createdb CREATEDB; +CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, can toggle these role attributes off and on if you have them ALTER ROLE regress_replication NOREPLICATION; ALTER ROLE regress_replication REPLICATION; @@ -58,48 +58,48 @@ ALTER ROLE regress_createdb NOSUPERUSER; ERROR: permission denied to alter role DETAIL: Only roles with the SUPERUSER attribute may change the SUPERUSER attribute. 
-- ok, having CREATEROLE is enough to create users with these privileges -CREATE ROLE regress_createrole CREATEROLE NOINHERIT; +CREATE ROLE regress_createrole CREATEROLE NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT CREATE ON DATABASE regression TO regress_createrole WITH GRANT OPTION; -CREATE ROLE regress_login LOGIN; -CREATE ROLE regress_inherit INHERIT; -CREATE ROLE regress_connection_limit CONNECTION LIMIT 5; -CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD 'foo'; -CREATE ROLE regress_password_null PASSWORD NULL; +CREATE ROLE regress_login LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_inherit INHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_connection_limit CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, backwards compatible noise words should be ignored -CREATE ROLE regress_noiseword SYSID 12345; +CREATE ROLE regress_noiseword SYSID 12345 PASSWORD NEON_PASSWORD_PLACEHOLDER; NOTICE: SYSID can no longer be specified -- fail, cannot grant membership in superuser role -CREATE ROLE regress_nosuch_super IN ROLE regress_role_super; +CREATE ROLE regress_nosuch_super IN ROLE regress_role_super PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: permission denied to grant role "regress_role_super" DETAIL: Only roles with the SUPERUSER attribute may grant roles with the SUPERUSER attribute. 
-- fail, database owner cannot have members -CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner; +CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: role "pg_database_owner" cannot have explicit members -- ok, can grant other users into a role CREATE ROLE regress_inroles ROLE regress_role_super, regress_createdb, regress_createrole, regress_login, - regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null; + regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, cannot grant a role into itself -CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive; +CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: role "regress_nosuch_recursive" is a member of role "regress_nosuch_recursive" -- ok, can grant other users into a role with admin option CREATE ROLE regress_adminroles ADMIN regress_role_super, regress_createdb, regress_createrole, regress_login, - regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null; + regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, cannot grant a role into itself with admin option -CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive; +CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: role "regress_nosuch_admin_recursive" is a member of role "regress_nosuch_admin_recursive" -- fail, regress_createrole does not have CREATEDB privilege SET SESSION AUTHORIZATION regress_createrole; CREATE DATABASE regress_nosuch_db; ERROR: permission denied to create database -- ok, regress_createrole can create new roles -CREATE ROLE regress_plainrole; +CREATE ROLE regress_plainrole PASSWORD 
NEON_PASSWORD_PLACEHOLDER; -- ok, roles with CREATEROLE can create new roles with it -CREATE ROLE regress_rolecreator CREATEROLE; +CREATE ROLE regress_rolecreator CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, roles with CREATEROLE can create new roles with different role -- attributes, including CREATEROLE -CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5; +CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, we should be able to modify a role we created COMMENT ON ROLE regress_hasprivs IS 'some comment'; ALTER ROLE regress_hasprivs RENAME TO regress_tenant; @@ -141,7 +141,7 @@ ERROR: permission denied to reassign objects DETAIL: Only roles with privileges of role "regress_tenant" may reassign objects owned by it. -- ok, create a role with a value for createrole_self_grant SET createrole_self_grant = 'set, inherit'; -CREATE ROLE regress_tenant2; +CREATE ROLE regress_tenant2 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT CREATE ON DATABASE regression TO regress_tenant2; -- ok, regress_tenant2 can create objects within the database SET SESSION AUTHORIZATION regress_tenant2; @@ -165,34 +165,34 @@ ALTER TABLE tenant2_table OWNER TO regress_tenant2; ERROR: must be able to SET ROLE "regress_tenant2" DROP TABLE tenant2_table; -- fail, CREATEROLE is not enough to create roles in privileged roles -CREATE ROLE regress_read_all_data IN ROLE pg_read_all_data; +CREATE ROLE regress_read_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_data; ERROR: permission denied to grant role "pg_read_all_data" DETAIL: Only roles with the ADMIN option on role "pg_read_all_data" may grant this role. 
-CREATE ROLE regress_write_all_data IN ROLE pg_write_all_data; +CREATE ROLE regress_write_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_all_data; ERROR: permission denied to grant role "pg_write_all_data" DETAIL: Only roles with the ADMIN option on role "pg_write_all_data" may grant this role. -CREATE ROLE regress_monitor IN ROLE pg_monitor; +CREATE ROLE regress_monitor PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_monitor; ERROR: permission denied to grant role "pg_monitor" DETAIL: Only roles with the ADMIN option on role "pg_monitor" may grant this role. -CREATE ROLE regress_read_all_settings IN ROLE pg_read_all_settings; +CREATE ROLE regress_read_all_settings PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_settings; ERROR: permission denied to grant role "pg_read_all_settings" DETAIL: Only roles with the ADMIN option on role "pg_read_all_settings" may grant this role. -CREATE ROLE regress_read_all_stats IN ROLE pg_read_all_stats; +CREATE ROLE regress_read_all_stats PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_stats; ERROR: permission denied to grant role "pg_read_all_stats" DETAIL: Only roles with the ADMIN option on role "pg_read_all_stats" may grant this role. -CREATE ROLE regress_stat_scan_tables IN ROLE pg_stat_scan_tables; +CREATE ROLE regress_stat_scan_tables PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_stat_scan_tables; ERROR: permission denied to grant role "pg_stat_scan_tables" DETAIL: Only roles with the ADMIN option on role "pg_stat_scan_tables" may grant this role. -CREATE ROLE regress_read_server_files IN ROLE pg_read_server_files; +CREATE ROLE regress_read_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_server_files; ERROR: permission denied to grant role "pg_read_server_files" DETAIL: Only roles with the ADMIN option on role "pg_read_server_files" may grant this role. 
-CREATE ROLE regress_write_server_files IN ROLE pg_write_server_files; +CREATE ROLE regress_write_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_server_files; ERROR: permission denied to grant role "pg_write_server_files" DETAIL: Only roles with the ADMIN option on role "pg_write_server_files" may grant this role. -CREATE ROLE regress_execute_server_program IN ROLE pg_execute_server_program; +CREATE ROLE regress_execute_server_program PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_execute_server_program; ERROR: permission denied to grant role "pg_execute_server_program" DETAIL: Only roles with the ADMIN option on role "pg_execute_server_program" may grant this role. -CREATE ROLE regress_signal_backend IN ROLE pg_signal_backend; +CREATE ROLE regress_signal_backend PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_signal_backend; ERROR: permission denied to grant role "pg_signal_backend" DETAIL: Only roles with the ADMIN option on role "pg_signal_backend" may grant this role. -- fail, role still owns database objects diff --git a/src/test/regress/expected/create_schema.out b/src/test/regress/expected/create_schema.out index 93302a07ef..1a73f083ac 100644 --- a/src/test/regress/expected/create_schema.out +++ b/src/test/regress/expected/create_schema.out @@ -2,7 +2,7 @@ -- CREATE_SCHEMA -- -- Schema creation with elements. -CREATE ROLE regress_create_schema_role SUPERUSER; +CREATE ROLE regress_create_schema_role SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Cases where schema creation fails as objects are qualified with a schema -- that does not match with what's expected. -- This checks all the object types that include schema qualifications. 
diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out index f551624afb..57f1e432d4 100644 --- a/src/test/regress/expected/create_view.out +++ b/src/test/regress/expected/create_view.out @@ -18,7 +18,8 @@ CREATE TABLE real_city ( outline path ); \set filename :abs_srcdir '/data/real_city.data' -COPY real_city FROM :'filename'; +\set command '\\copy real_city FROM ' :'filename'; +:command ANALYZE real_city; SELECT * INTO TABLE ramp diff --git a/src/test/regress/expected/database.out b/src/test/regress/expected/database.out index 4cbdbdf84d..573362850e 100644 --- a/src/test/regress/expected/database.out +++ b/src/test/regress/expected/database.out @@ -1,8 +1,6 @@ CREATE DATABASE regression_tbd ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0; ALTER DATABASE regression_tbd RENAME TO regression_utf8; -ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace; -ALTER DATABASE regression_utf8 RESET TABLESPACE; ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123; -- Test PgDatabaseToastTable. Doing this with GRANT would be slow. 
BEGIN; diff --git a/src/test/regress/expected/dependency.out b/src/test/regress/expected/dependency.out index 6d9498cdd1..692cf979d0 100644 --- a/src/test/regress/expected/dependency.out +++ b/src/test/regress/expected/dependency.out @@ -1,10 +1,10 @@ -- -- DEPENDENCIES -- -CREATE USER regress_dep_user; -CREATE USER regress_dep_user2; -CREATE USER regress_dep_user3; -CREATE GROUP regress_dep_group; +CREATE USER regress_dep_user PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_dep_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE GROUP regress_dep_group PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE deptest (f1 serial primary key, f2 text); GRANT SELECT ON TABLE deptest TO GROUP regress_dep_group; GRANT ALL ON TABLE deptest TO regress_dep_user, regress_dep_user2; @@ -41,9 +41,9 @@ ERROR: role "regress_dep_user3" cannot be dropped because some objects depend o DROP TABLE deptest; DROP USER regress_dep_user3; -- Test DROP OWNED -CREATE USER regress_dep_user0; -CREATE USER regress_dep_user1; -CREATE USER regress_dep_user2; +CREATE USER regress_dep_user0 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_dep_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_dep_user0; -- permission denied DROP OWNED BY regress_dep_user1; diff --git a/src/test/regress/expected/drop_if_exists.out b/src/test/regress/expected/drop_if_exists.out index 5e44c2c3ce..eb3bb329fb 100644 --- a/src/test/regress/expected/drop_if_exists.out +++ b/src/test/regress/expected/drop_if_exists.out @@ -64,9 +64,9 @@ ERROR: type "test_domain_exists" does not exist --- --- role/user/group --- -CREATE USER regress_test_u1; -CREATE ROLE regress_test_r1; -CREATE GROUP regress_test_g1; +CREATE USER regress_test_u1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_r1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE GROUP regress_test_g1 
PASSWORD NEON_PASSWORD_PLACEHOLDER; DROP USER regress_test_u2; ERROR: role "regress_test_u2" does not exist DROP USER IF EXISTS regress_test_u1, regress_test_u2; diff --git a/src/test/regress/expected/equivclass.out b/src/test/regress/expected/equivclass.out index 126f7047fe..0e2cc73426 100644 --- a/src/test/regress/expected/equivclass.out +++ b/src/test/regress/expected/equivclass.out @@ -384,7 +384,7 @@ set enable_nestloop = on; set enable_mergejoin = off; alter table ec1 enable row level security; create policy p1 on ec1 using (f1 < '5'::int8alias1); -create user regress_user_ectest; +create user regress_user_ectest PASSWORD NEON_PASSWORD_PLACEHOLDER; grant select on ec0 to regress_user_ectest; grant select on ec1 to regress_user_ectest; -- without any RLS, we'll treat {a.ff, b.ff, 43} as an EquivalenceClass diff --git a/src/test/regress/expected/event_trigger.out b/src/test/regress/expected/event_trigger.out index 5a10958df5..a578c06ebd 100644 --- a/src/test/regress/expected/event_trigger.out +++ b/src/test/regress/expected/event_trigger.out @@ -85,7 +85,7 @@ create event trigger regress_event_trigger2 on ddl_command_start -- OK comment on event trigger regress_event_trigger is 'test comment'; -- drop as non-superuser should fail -create role regress_evt_user; +create role regress_evt_user PASSWORD NEON_PASSWORD_PLACEHOLDER; set role regress_evt_user; create event trigger regress_event_trigger_noperms on ddl_command_start execute procedure test_event_trigger(); diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out index 6ed50fdcfa..caa00a345d 100644 --- a/src/test/regress/expected/foreign_data.out +++ b/src/test/regress/expected/foreign_data.out @@ -14,13 +14,13 @@ CREATE FUNCTION test_fdw_handler() SET client_min_messages TO 'warning'; DROP ROLE IF EXISTS regress_foreign_data_user, regress_test_role, regress_test_role2, regress_test_role_super, regress_test_indirect, regress_unprivileged_role; RESET 
client_min_messages; -CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER; +CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION 'regress_foreign_data_user'; -CREATE ROLE regress_test_role; -CREATE ROLE regress_test_role2; -CREATE ROLE regress_test_role_super SUPERUSER; -CREATE ROLE regress_test_indirect; -CREATE ROLE regress_unprivileged_role; +CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_indirect PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_unprivileged_role PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE FOREIGN DATA WRAPPER dummy; COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator; diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out index 84745b9f60..4883c12351 100644 --- a/src/test/regress/expected/foreign_key.out +++ b/src/test/regress/expected/foreign_key.out @@ -1985,7 +1985,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES ERROR: cannot ALTER TABLE "fk_partitioned_pk_61" because it is being used by active queries in this session DROP TABLE fk_partitioned_pk_6, fk_partitioned_fk_6; -- test the case when the referenced table is owned by a different user -create role regress_other_partitioned_fk_owner; +create role regress_other_partitioned_fk_owner PASSWORD NEON_PASSWORD_PLACEHOLDER; grant references on fk_notpartitioned_pk to regress_other_partitioned_fk_owner; set role regress_other_partitioned_fk_owner; create table other_partitioned_fk(a int, b int) partition by list (a); diff --git a/src/test/regress/expected/generated.out b/src/test/regress/expected/generated.out index 5881420388..4ae21aa43c 100644 --- 
a/src/test/regress/expected/generated.out +++ b/src/test/regress/expected/generated.out @@ -534,7 +534,7 @@ CREATE TABLE gtest10a (a int PRIMARY KEY, b int GENERATED ALWAYS AS (a * 2) STOR ALTER TABLE gtest10a DROP COLUMN b; INSERT INTO gtest10a (a) VALUES (1); -- privileges -CREATE USER regress_user11; +CREATE USER regress_user11 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE gtest11s (a int PRIMARY KEY, b int, c int GENERATED ALWAYS AS (b * 2) STORED); INSERT INTO gtest11s VALUES (1, 10), (2, 20); GRANT SELECT (a, c) ON gtest11s TO regress_user11; diff --git a/src/test/regress/expected/guc.out b/src/test/regress/expected/guc.out index 127c953297..e6f8272f99 100644 --- a/src/test/regress/expected/guc.out +++ b/src/test/regress/expected/guc.out @@ -584,7 +584,7 @@ PREPARE foo AS SELECT 1; LISTEN foo_event; SET vacuum_cost_delay = 13; CREATE TEMP TABLE tmp_foo (data text) ON COMMIT DELETE ROWS; -CREATE ROLE regress_guc_user; +CREATE ROLE regress_guc_user PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_guc_user; -- look changes SELECT pg_listening_channels(); diff --git a/src/test/regress/expected/hash_index.out b/src/test/regress/expected/hash_index.out index a2036a1597..805d73b9d2 100644 --- a/src/test/regress/expected/hash_index.out +++ b/src/test/regress/expected/hash_index.out @@ -20,10 +20,14 @@ CREATE TABLE hash_f8_heap ( random float8 ); \set filename :abs_srcdir '/data/hash.data' -COPY hash_i4_heap FROM :'filename'; -COPY hash_name_heap FROM :'filename'; -COPY hash_txt_heap FROM :'filename'; -COPY hash_f8_heap FROM :'filename'; +\set command '\\copy hash_i4_heap FROM ' :'filename'; +:command +\set command '\\copy hash_name_heap FROM ' :'filename'; +:command +\set command '\\copy hash_txt_heap FROM ' :'filename'; +:command +\set command '\\copy hash_f8_heap FROM ' :'filename'; +:command -- the data in this file has a lot of duplicates in the index key -- fields, leading to long bucket chains and lots of table expansion. 
-- this is therefore a stress test of the bucket overflow code (unlike diff --git a/src/test/regress/expected/identity.out b/src/test/regress/expected/identity.out index 1b74958de9..078187b542 100644 --- a/src/test/regress/expected/identity.out +++ b/src/test/regress/expected/identity.out @@ -520,7 +520,7 @@ ALTER TABLE itest7 ALTER COLUMN a SET GENERATED BY DEFAULT; ALTER TABLE itest7 ALTER COLUMN a RESTART; ALTER TABLE itest7 ALTER COLUMN a DROP IDENTITY; -- privileges -CREATE USER regress_identity_user1; +CREATE USER regress_identity_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE itest8 (a int GENERATED ALWAYS AS IDENTITY, b text); GRANT SELECT, INSERT ON itest8 TO regress_identity_user1; SET ROLE regress_identity_user1; diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index 8f831c95c3..ec681b52af 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -2636,7 +2636,7 @@ create index on permtest_parent (left(c, 3)); insert into permtest_parent select 1, 'a', left(fipshash(i::text), 5) from generate_series(0, 100) i; analyze permtest_parent; -create role regress_no_child_access; +create role regress_no_child_access PASSWORD NEON_PASSWORD_PLACEHOLDER; revoke all on permtest_grandchild from regress_no_child_access; grant select on permtest_parent to regress_no_child_access; set session authorization regress_no_child_access; diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index cf4b5221a8..fa6ccb639c 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -802,7 +802,7 @@ drop table mlparted5; -- appropriate key description (or none) in various situations create table key_desc (a int, b int) partition by list ((a+0)); create table key_desc_1 partition of key_desc for values in (1) partition by range (b); -create user regress_insert_other_user; +create user regress_insert_other_user PASSWORD 
NEON_PASSWORD_PLACEHOLDER; grant select (a) on key_desc_1 to regress_insert_other_user; grant insert on key_desc to regress_insert_other_user; set role regress_insert_other_user; @@ -914,7 +914,7 @@ DETAIL: Failing row contains (2, hi there). -- check that the message shows the appropriate column description in a -- situation where the partitioned table is not the primary ModifyTable node create table inserttest3 (f1 text default 'foo', f2 text default 'bar', f3 int); -create role regress_coldesc_role; +create role regress_coldesc_role PASSWORD NEON_PASSWORD_PLACEHOLDER; grant insert on inserttest3 to regress_coldesc_role; grant insert on brtrigpartcon to regress_coldesc_role; revoke select on brtrigpartcon from regress_coldesc_role; diff --git a/src/test/regress/expected/jsonb.out b/src/test/regress/expected/jsonb.out index f8a7dac960..64dcaf171c 100644 --- a/src/test/regress/expected/jsonb.out +++ b/src/test/regress/expected/jsonb.out @@ -4,7 +4,8 @@ CREATE TABLE testjsonb ( j jsonb ); \set filename :abs_srcdir '/data/jsonb.data' -COPY testjsonb FROM :'filename'; +\set command '\\copy testjsonb FROM ' :'filename'; +:command -- Strings. SELECT '""'::jsonb; -- OK. 
jsonb diff --git a/src/test/regress/expected/largeobject.out b/src/test/regress/expected/largeobject.out index 4921dd79ae..d18a3cdd66 100644 --- a/src/test/regress/expected/largeobject.out +++ b/src/test/regress/expected/largeobject.out @@ -7,7 +7,7 @@ -- ensure consistent test output regardless of the default bytea format SET bytea_output TO escape; -- Test ALTER LARGE OBJECT OWNER -CREATE ROLE regress_lo_user; +CREATE ROLE regress_lo_user PASSWORD NEON_PASSWORD_PLACEHOLDER; SELECT lo_create(42); lo_create ----------- @@ -346,7 +346,8 @@ SELECT lo_unlink(loid) from lotest_stash_values; TRUNCATE lotest_stash_values; \set filename :abs_srcdir '/data/tenk.data' -INSERT INTO lotest_stash_values (loid) SELECT lo_import(:'filename'); +\lo_import :filename +INSERT INTO lotest_stash_values (loid) VALUES (:LASTOID); BEGIN; UPDATE lotest_stash_values SET fd=lo_open(loid, CAST(x'20000' | x'40000' AS integer)); -- verify length of large object @@ -410,12 +411,8 @@ SELECT lo_close(fd) FROM lotest_stash_values; END; \set filename :abs_builddir '/results/lotest.txt' -SELECT lo_export(loid, :'filename') FROM lotest_stash_values; - lo_export ------------ - 1 -(1 row) - +SELECT loid FROM lotest_stash_values \gset +\lo_export :loid, :filename \lo_import :filename \set newloid :LASTOID -- just make sure \lo_export does not barf diff --git a/src/test/regress/expected/lock.out b/src/test/regress/expected/lock.out index ad137d3645..8dac447436 100644 --- a/src/test/regress/expected/lock.out +++ b/src/test/regress/expected/lock.out @@ -16,7 +16,7 @@ CREATE VIEW lock_view3 AS SELECT * from lock_view2; CREATE VIEW lock_view4 AS SELECT (select a from lock_tbl1a limit 1) from lock_tbl1; CREATE VIEW lock_view5 AS SELECT * from lock_tbl1 where a in (select * from lock_tbl1a); CREATE VIEW lock_view6 AS SELECT * from (select * from lock_tbl1) sub; -CREATE ROLE regress_rol_lock1; +CREATE ROLE regress_rol_lock1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ALTER ROLE regress_rol_lock1 SET search_path = 
lock_schema1; GRANT USAGE ON SCHEMA lock_schema1 TO regress_rol_lock1; -- Try all valid lock options; also try omitting the optional TABLE keyword. diff --git a/src/test/regress/expected/matview.out b/src/test/regress/expected/matview.out index 67a50bde3d..7eeafd2603 100644 --- a/src/test/regress/expected/matview.out +++ b/src/test/regress/expected/matview.out @@ -549,7 +549,7 @@ SELECT * FROM mvtest_mv_v; DROP TABLE mvtest_v CASCADE; NOTICE: drop cascades to materialized view mvtest_mv_v -- make sure running as superuser works when MV owned by another role (bug #11208) -CREATE ROLE regress_user_mvtest; +CREATE ROLE regress_user_mvtest PASSWORD NEON_PASSWORD_PLACEHOLDER; SET ROLE regress_user_mvtest; -- this test case also checks for ambiguity in the queries issued by -- refresh_by_match_merge(), by choosing column names that intentionally @@ -615,7 +615,7 @@ HINT: Use the REFRESH MATERIALIZED VIEW command. ROLLBACK; -- INSERT privileges if relation owner is not allowed to insert. CREATE SCHEMA matview_schema; -CREATE USER regress_matview_user; +CREATE USER regress_matview_user PASSWORD NEON_PASSWORD_PLACEHOLDER; ALTER DEFAULT PRIVILEGES FOR ROLE regress_matview_user REVOKE INSERT ON TABLES FROM regress_matview_user; GRANT ALL ON SCHEMA matview_schema TO public; diff --git a/src/test/regress/expected/merge.out b/src/test/regress/expected/merge.out index bc9a59803f..5b9ddf0626 100644 --- a/src/test/regress/expected/merge.out +++ b/src/test/regress/expected/merge.out @@ -1,9 +1,9 @@ -- -- MERGE -- -CREATE USER regress_merge_privs; -CREATE USER regress_merge_no_privs; -CREATE USER regress_merge_none; +CREATE USER regress_merge_privs PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_merge_no_privs PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_merge_none PASSWORD NEON_PASSWORD_PLACEHOLDER; DROP TABLE IF EXISTS target; NOTICE: table "target" does not exist, skipping DROP TABLE IF EXISTS source; diff --git a/src/test/regress/expected/misc.out 
b/src/test/regress/expected/misc.out index 6e816c57f1..6ef45b468e 100644 --- a/src/test/regress/expected/misc.out +++ b/src/test/regress/expected/misc.out @@ -59,9 +59,11 @@ DROP TABLE tmp; -- copy -- \set filename :abs_builddir '/results/onek.data' -COPY onek TO :'filename'; +\set command '\\copy onek TO ' :'filename'; +:command CREATE TEMP TABLE onek_copy (LIKE onek); -COPY onek_copy FROM :'filename'; +\set command '\\copy onek_copy FROM ' :'filename'; +:command SELECT * FROM onek EXCEPT ALL SELECT * FROM onek_copy; unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 ---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+--------- @@ -73,9 +75,11 @@ SELECT * FROM onek_copy EXCEPT ALL SELECT * FROM onek; (0 rows) \set filename :abs_builddir '/results/stud_emp.data' -COPY BINARY stud_emp TO :'filename'; +\set command '\\COPY BINARY stud_emp TO ' :'filename'; +:command CREATE TEMP TABLE stud_emp_copy (LIKE stud_emp); -COPY BINARY stud_emp_copy FROM :'filename'; +\set command '\\COPY BINARY stud_emp_copy FROM ' :'filename'; +:command SELECT * FROM stud_emp_copy; name | age | location | salary | manager | gpa | percent -------+-----+------------+--------+---------+-----+--------- diff --git a/src/test/regress/expected/misc_functions.out b/src/test/regress/expected/misc_functions.out index c669948370..47111b1d24 100644 --- a/src/test/regress/expected/misc_functions.out +++ b/src/test/regress/expected/misc_functions.out @@ -297,7 +297,7 @@ SELECT pg_log_backend_memory_contexts(pid) FROM pg_stat_activity t (1 row) -CREATE ROLE regress_log_memory; +CREATE ROLE regress_log_memory PASSWORD NEON_PASSWORD_PLACEHOLDER; SELECT has_function_privilege('regress_log_memory', 'pg_log_backend_memory_contexts(integer)', 'EXECUTE'); -- no has_function_privilege @@ -483,7 +483,7 @@ select count(*) > 0 from 
-- -- Test replication slot directory functions -- -CREATE ROLE regress_slot_dir_funcs; +CREATE ROLE regress_slot_dir_funcs PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Not available by default. SELECT has_function_privilege('regress_slot_dir_funcs', 'pg_ls_logicalsnapdir()', 'EXECUTE'); diff --git a/src/test/regress/expected/object_address.out b/src/test/regress/expected/object_address.out index fc42d418bf..e38f517574 100644 --- a/src/test/regress/expected/object_address.out +++ b/src/test/regress/expected/object_address.out @@ -5,7 +5,7 @@ SET client_min_messages TO 'warning'; DROP ROLE IF EXISTS regress_addr_user; RESET client_min_messages; -CREATE USER regress_addr_user; +CREATE USER regress_addr_user PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Test generic object addressing/identification functions CREATE SCHEMA addr_nsp; SET search_path TO 'addr_nsp'; diff --git a/src/test/regress/expected/password.out b/src/test/regress/expected/password.out index 8475231735..0653946337 100644 --- a/src/test/regress/expected/password.out +++ b/src/test/regress/expected/password.out @@ -12,11 +12,11 @@ SET password_encryption = 'md5'; -- ok SET password_encryption = 'scram-sha-256'; -- ok -- consistency of password entries SET password_encryption = 'md5'; -CREATE ROLE regress_passwd1 PASSWORD 'role_pwd1'; -CREATE ROLE regress_passwd2 PASSWORD 'role_pwd2'; +CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET password_encryption = 'scram-sha-256'; -CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3'; -CREATE ROLE regress_passwd4 PASSWORD NULL; +CREATE ROLE regress_passwd3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- check list of created entries -- -- The scram secret will look something like: @@ -30,10 +30,10 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ ORDER BY rolname, rolpassword; rolname | rolpassword_masked 
-----------------+--------------------------------------------------- - regress_passwd1 | md5783277baca28003b33453252be4dbb34 - regress_passwd2 | md54044304ba511dd062133eb5b4b84a2a3 + regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 + regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: - regress_passwd4 | + regress_passwd4 | SCRAM-SHA-256$4096:$: (4 rows) -- Rename a role @@ -54,24 +54,16 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; -- passwords. SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; --- already encrypted, use as they are -ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; -ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; --- already encrypted with MD5, use as it is -CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; --- This looks like a valid SCRAM-SHA-256 secret, but it is not --- so it should be hashed with SCRAM-SHA-256. -CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; --- These may look like valid MD5 secrets, but they are not, so they --- should be hashed with SCRAM-SHA-256. 
--- trailing garbage at the end -CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; --- invalid length -CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; +ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; +-- Neon does not support encrypted passwords, use unencrypted instead +CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; +-- Neon does not support encrypted passwords, use unencrypted instead +CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Changing the SCRAM iteration count SET scram_iterations = 1024; CREATE ROLE regress_passwd9 PASSWORD 'alterediterationcount'; @@ -81,11 +73,11 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ ORDER BY rolname, rolpassword; rolname | rolpassword_masked -----------------+--------------------------------------------------- - regress_passwd1 | md5cd3578025fe2c3d7ed1b9a9b26238b70 - regress_passwd2 | md5dfa155cadd5f4ad57860162f3fab9cdb + regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 + regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: regress_passwd4 | SCRAM-SHA-256$4096:$: - regress_passwd5 | md5e73a4b11df52a6068f8b39f90be36023 + regress_passwd5 | SCRAM-SHA-256$4096:$: regress_passwd6 | SCRAM-SHA-256$4096:$: regress_passwd7 | SCRAM-SHA-256$4096:$: regress_passwd8 | SCRAM-SHA-256$4096:$: @@ -95,23 +87,20 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ -- An empty password is not allowed, in any form CREATE ROLE regress_passwd_empty PASSWORD ''; NOTICE: empty string is not a valid password, clearing password +ERROR: Failed to get encrypted password: User "regress_passwd_empty" has no password assigned. 
ALTER ROLE regress_passwd_empty PASSWORD 'md585939a5ce845f1a1b620742e3c659e0a'; -NOTICE: empty string is not a valid password, clearing password +ERROR: role "regress_passwd_empty" does not exist ALTER ROLE regress_passwd_empty PASSWORD 'SCRAM-SHA-256$4096:hpFyHTUsSWcR7O9P$LgZFIt6Oqdo27ZFKbZ2nV+vtnYM995pDh9ca6WSi120=:qVV5NeluNfUPkwm7Vqat25RjSPLkGeoZBQs6wVv+um4='; -NOTICE: empty string is not a valid password, clearing password +ERROR: role "regress_passwd_empty" does not exist SELECT rolpassword FROM pg_authid WHERE rolname='regress_passwd_empty'; rolpassword ------------- - -(1 row) +(0 rows) --- Test with invalid stored and server keys. --- --- The first is valid, to act as a control. The others have too long --- stored/server keys. They will be re-hashed. -CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; +-- Neon does not support encrypted passwords, use unencrypted instead +CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Check that the invalid secrets were re-hashed. A re-hashed secret -- should not contain the original salt. 
SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassword_rehashed @@ -120,7 +109,7 @@ SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassw ORDER BY rolname; rolname | is_rolpassword_rehashed -------------------------+------------------------- - regress_passwd_sha_len0 | f + regress_passwd_sha_len0 | t regress_passwd_sha_len1 | t regress_passwd_sha_len2 | t (3 rows) @@ -135,6 +124,7 @@ DROP ROLE regress_passwd7; DROP ROLE regress_passwd8; DROP ROLE regress_passwd9; DROP ROLE regress_passwd_empty; +ERROR: role "regress_passwd_empty" does not exist DROP ROLE regress_passwd_sha_len0; DROP ROLE regress_passwd_sha_len1; DROP ROLE regress_passwd_sha_len2; diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 620fbe8c52..0570102357 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -20,19 +20,19 @@ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3 RESET client_min_messages; -- test proper begins here -CREATE USER regress_priv_user1; -CREATE USER regress_priv_user2; -CREATE USER regress_priv_user3; -CREATE USER regress_priv_user4; -CREATE USER regress_priv_user5; -CREATE USER regress_priv_user5; -- duplicate +CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- duplicate ERROR: role "regress_priv_user5" already exists -CREATE USER regress_priv_user6; -CREATE USER regress_priv_user7; -CREATE USER regress_priv_user8; -CREATE USER regress_priv_user9; -CREATE USER regress_priv_user10; -CREATE ROLE regress_priv_role; +CREATE USER 
regress_priv_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user7 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user8 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user9 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user10 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_priv_role PASSWORD NEON_PASSWORD_PLACEHOLDER; -- circular ADMIN OPTION grants should be disallowed GRANT regress_priv_user1 TO regress_priv_user2 WITH ADMIN OPTION; GRANT regress_priv_user1 TO regress_priv_user3 WITH ADMIN OPTION GRANTED BY regress_priv_user2; @@ -108,11 +108,11 @@ ERROR: role "regress_priv_user5" cannot be dropped because some objects depend DETAIL: privileges for membership of role regress_priv_user6 in role regress_priv_user1 DROP ROLE regress_priv_user1, regress_priv_user5; -- ok, despite order -- recreate the roles we just dropped -CREATE USER regress_priv_user1; -CREATE USER regress_priv_user2; -CREATE USER regress_priv_user3; -CREATE USER regress_priv_user4; -CREATE USER regress_priv_user5; +CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT pg_read_all_data TO regress_priv_user6; GRANT pg_write_all_data TO regress_priv_user7; GRANT pg_read_all_settings TO regress_priv_user8 WITH ADMIN OPTION; @@ -212,8 +212,8 @@ REVOKE pg_read_all_settings FROM regress_priv_user8; DROP USER regress_priv_user10; DROP USER regress_priv_user9; DROP USER regress_priv_user8; -CREATE GROUP regress_priv_group1; -CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 USER regress_priv_user2; +CREATE GROUP regress_priv_group1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 
PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2; ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4; GRANT regress_priv_group2 TO regress_priv_user2 GRANTED BY regress_priv_user1; SET SESSION AUTHORIZATION regress_priv_user3; @@ -246,12 +246,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre ERROR: permission denied to grant privileges as role "regress_priv_role" DETAIL: The grantor must have the ADMIN option on role "regress_priv_role". GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY CURRENT_ROLE; +ERROR: permission denied to grant privileges as role "neondb_owner" +DETAIL: The grantor must have the ADMIN option on role "regress_priv_role". REVOKE ADMIN OPTION FOR regress_priv_role FROM regress_priv_user1 GRANTED BY foo; -- error ERROR: role "foo" does not exist REVOKE ADMIN OPTION FOR regress_priv_role FROM regress_priv_user1 GRANTED BY regress_priv_user2; -- warning, noop WARNING: role "regress_priv_user1" has not been granted membership in role "regress_priv_role" by role "regress_priv_user2" REVOKE ADMIN OPTION FOR regress_priv_role FROM regress_priv_user1 GRANTED BY CURRENT_USER; +WARNING: role "regress_priv_user1" has not been granted membership in role "regress_priv_role" by role "neondb_owner" REVOKE regress_priv_role FROM regress_priv_user1 GRANTED BY CURRENT_ROLE; +WARNING: role "regress_priv_user1" has not been granted membership in role "regress_priv_role" by role "neondb_owner" DROP ROLE regress_priv_role; SET SESSION AUTHORIZATION regress_priv_user1; SELECT session_user, current_user; @@ -1783,7 +1787,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP -- security-restricted operations \c - -CREATE ROLE regress_sro_user; +CREATE ROLE regress_sro_user PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Check that index expressions and predicates are run as the table's owner -- A dummy index function checking current_user CREATE FUNCTION 
sro_ifun(int) RETURNS int AS $$ @@ -2675,8 +2679,8 @@ drop cascades to function testns.priv_testagg(integer) drop cascades to function testns.priv_testproc(integer) -- Change owner of the schema & and rename of new schema owner \c - -CREATE ROLE regress_schemauser1 superuser login; -CREATE ROLE regress_schemauser2 superuser login; +CREATE ROLE regress_schemauser1 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_schemauser2 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION ROLE regress_schemauser1; CREATE SCHEMA testns; SELECT nspname, rolname FROM pg_namespace, pg_roles WHERE pg_namespace.nspname = 'testns' AND pg_namespace.nspowner = pg_roles.oid; @@ -2799,7 +2803,7 @@ DROP USER regress_priv_user7; DROP USER regress_priv_user8; -- does not exist ERROR: role "regress_priv_user8" does not exist -- permissions with LOCK TABLE -CREATE USER regress_locktable_user; +CREATE USER regress_locktable_user PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE lock_table (a int); -- LOCK TABLE and SELECT permission GRANT SELECT ON lock_table TO regress_locktable_user; @@ -2881,7 +2885,7 @@ DROP USER regress_locktable_user; -- pg_backend_memory_contexts. 
-- switch to superuser \c - -CREATE ROLE regress_readallstats; +CREATE ROLE regress_readallstats PASSWORD NEON_PASSWORD_PLACEHOLDER; SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no has_table_privilege --------------------- @@ -2925,10 +2929,10 @@ RESET ROLE; -- clean up DROP ROLE regress_readallstats; -- test role grantor machinery -CREATE ROLE regress_group; -CREATE ROLE regress_group_direct_manager; -CREATE ROLE regress_group_indirect_manager; -CREATE ROLE regress_group_member; +CREATE ROLE regress_group PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_group_direct_manager PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_group_indirect_manager PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_group_member PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE; GRANT regress_group_direct_manager TO regress_group_indirect_manager; SET SESSION AUTHORIZATION regress_group_direct_manager; @@ -2957,9 +2961,9 @@ DROP ROLE regress_group_direct_manager; DROP ROLE regress_group_indirect_manager; DROP ROLE regress_group_member; -- test SET and INHERIT options with object ownership changes -CREATE ROLE regress_roleoption_protagonist; -CREATE ROLE regress_roleoption_donor; -CREATE ROLE regress_roleoption_recipient; +CREATE ROLE regress_roleoption_protagonist PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_roleoption_donor PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_roleoption_recipient PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE SCHEMA regress_roleoption; GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC; GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE; diff --git a/src/test/regress/expected/psql.out b/src/test/regress/expected/psql.out index 7cd0c27cca..d7a124ed68 100644 --- a/src/test/regress/expected/psql.out +++ b/src/test/regress/expected/psql.out @@ -2857,7 
+2857,7 @@ Type | func -- check conditional am display \pset expanded off CREATE SCHEMA tableam_display; -CREATE ROLE regress_display_role; +CREATE ROLE regress_display_role PASSWORD NEON_PASSWORD_PLACEHOLDER; ALTER SCHEMA tableam_display OWNER TO regress_display_role; SET search_path TO tableam_display; CREATE ACCESS METHOD heap_psql TYPE TABLE HANDLER heap_tableam_handler; @@ -4808,7 +4808,7 @@ last error message: division by zero last error code: 22012 \unset FETCH_COUNT create schema testpart; -create role regress_partitioning_role; +create role regress_partitioning_role PASSWORD NEON_PASSWORD_PLACEHOLDER; alter schema testpart owner to regress_partitioning_role; set role to regress_partitioning_role; -- run test inside own schema and hide other partitions @@ -5260,7 +5260,7 @@ reset work_mem; -- check \df+ -- we have to use functions with a predictable owner name, so make a role -create role regress_psql_user superuser; +create role regress_psql_user superuser PASSWORD NEON_PASSWORD_PLACEHOLDER; begin; set session authorization regress_psql_user; create function psql_df_internal (float8) @@ -5544,11 +5544,14 @@ CREATE TEMPORARY TABLE reload_output( line text ); SELECT 1 AS a \g :g_out_file -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command SELECT 2 AS b\; SELECT 3 AS c\; SELECT 4 AS d \g :g_out_file -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command COPY (SELECT 'foo') TO STDOUT \; COPY (SELECT 'bar') TO STDOUT \g :g_out_file -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; line --------- @@ -5587,13 +5590,15 @@ SELECT 1 AS a\; SELECT 2 AS b\; SELECT 3 AS c; -- COPY TO file -- The data goes to :g_out_file and the status to :o_out_file \set QUIET false -COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 
10) TO :'g_out_file'; +\set command '\\COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO ' :'g_out_file'; +:command -- DML command status UPDATE onek SET unique1 = unique1 WHERE false; \set QUIET true \o -- Check the contents of the files generated. -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; line ------ @@ -5610,7 +5615,8 @@ SELECT line FROM reload_output ORDER BY lineno; (10 rows) TRUNCATE TABLE reload_output; -COPY reload_output(line) FROM :'o_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'o_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; line ---------- @@ -5647,7 +5653,8 @@ COPY (SELECT 'foo1') TO STDOUT \; COPY (SELECT 'bar1') TO STDOUT; COPY (SELECT 'foo2') TO STDOUT \; COPY (SELECT 'bar2') TO STDOUT \g :g_out_file \o -- Check the contents of the files generated. -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; line ------ @@ -5656,7 +5663,8 @@ SELECT line FROM reload_output ORDER BY lineno; (2 rows) TRUNCATE TABLE reload_output; -COPY reload_output(line) FROM :'o_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'o_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; line ------ @@ -6619,10 +6627,10 @@ cross-database references are not implemented: "no.such.database"."no.such.schem \dX "no.such.database"."no.such.schema"."no.such.extended.statistics" cross-database references are not implemented: "no.such.database"."no.such.schema"."no.such.extended.statistics" -- check \drg and \du -CREATE ROLE regress_du_role0; -CREATE ROLE regress_du_role1; -CREATE ROLE regress_du_role2; -CREATE ROLE regress_du_admin; +CREATE ROLE regress_du_role0 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_du_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE 
regress_du_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_du_admin PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT regress_du_role0 TO regress_du_admin WITH ADMIN TRUE; GRANT regress_du_role1 TO regress_du_admin WITH ADMIN TRUE; GRANT regress_du_role2 TO regress_du_admin WITH ADMIN TRUE; diff --git a/src/test/regress/expected/publication.out b/src/test/regress/expected/publication.out index 69dc6cfd85..68390cc18a 100644 --- a/src/test/regress/expected/publication.out +++ b/src/test/regress/expected/publication.out @@ -1,9 +1,9 @@ -- -- PUBLICATION -- -CREATE ROLE regress_publication_user LOGIN SUPERUSER; -CREATE ROLE regress_publication_user2; -CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER; +CREATE ROLE regress_publication_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_publication_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION 'regress_publication_user'; -- suppress warning that depends on wal_level SET client_min_messages = 'ERROR'; @@ -1211,7 +1211,7 @@ ALTER PUBLICATION testpub2 ADD TABLE testpub_tbl1; -- ok DROP PUBLICATION testpub2; DROP PUBLICATION testpub3; SET ROLE regress_publication_user; -CREATE ROLE regress_publication_user3; +CREATE ROLE regress_publication_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT regress_publication_user2 TO regress_publication_user3; SET client_min_messages = 'ERROR'; CREATE PUBLICATION testpub4 FOR TABLES IN SCHEMA pub_test; diff --git a/src/test/regress/expected/regproc.out b/src/test/regress/expected/regproc.out index a9420850b8..bd3b5f312d 100644 --- a/src/test/regress/expected/regproc.out +++ b/src/test/regress/expected/regproc.out @@ -2,7 +2,7 @@ -- regproc -- /* If objects exist, return oids */ -CREATE ROLE regress_regrole_test; +CREATE ROLE regress_regrole_test PASSWORD NEON_PASSWORD_PLACEHOLDER; -- without schemaname SELECT regoper('||/'); 
regoper diff --git a/src/test/regress/expected/roleattributes.out b/src/test/regress/expected/roleattributes.out index 5e6969b173..2c4d52237f 100644 --- a/src/test/regress/expected/roleattributes.out +++ b/src/test/regress/expected/roleattributes.out @@ -1,233 +1,233 @@ -- default for superuser is false -CREATE ROLE regress_test_def_superuser; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_def_superuser | f | t | f | f | f | f | f | -1 | | +CREATE ROLE regress_test_def_superuser PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_def_superuser | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -CREATE ROLE regress_test_superuser WITH SUPERUSER; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, 
rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_superuser | t | t | f | f | f | f | f | -1 | | +CREATE ROLE regress_test_superuser WITH SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_superuser | t | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_superuser WITH NOSUPERUSER; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_superuser | f | t | f | f | f | f | f | -1 | | +SELECT rolname, 
rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_superuser | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_superuser WITH SUPERUSER; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_superuser | t | t | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil 
+------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_superuser | t | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -- default for inherit is true -CREATE ROLE regress_test_def_inherit; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_def_inherit | f | t | f | f | f | f | f | -1 | | +CREATE ROLE regress_test_def_inherit PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_def_inherit | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -CREATE ROLE regress_test_inherit WITH NOINHERIT; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, 
rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_inherit | f | f | f | f | f | f | f | -1 | | +CREATE ROLE regress_test_inherit WITH NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_inherit | f | f | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_inherit WITH INHERIT; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_inherit | f | t | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, 
rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_inherit | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_inherit WITH NOINHERIT; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_inherit | f | f | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil 
+----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_inherit | f | f | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -- default for create role is false -CREATE ROLE regress_test_def_createrole; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_def_createrole | f | t | f | f | f | f | f | -1 | | +CREATE ROLE regress_test_def_createrole PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_def_createrole | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -CREATE ROLE regress_test_createrole WITH CREATEROLE; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, 
rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil --------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_createrole | f | t | t | f | f | f | f | -1 | | +CREATE ROLE regress_test_createrole WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_createrole | f | t | t | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_createrole WITH NOCREATEROLE; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil --------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - 
regress_test_createrole | f | t | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_createrole | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_createrole WITH CREATEROLE; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil --------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_createrole | f | t | t | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil 
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_createrole | f | t | t | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -- default for create database is false -CREATE ROLE regress_test_def_createdb; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_def_createdb | f | t | f | f | f | f | f | -1 | | +CREATE ROLE regress_test_def_createdb PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_def_createdb | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -CREATE ROLE regress_test_createdb WITH CREATEDB; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, 
rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_createdb | f | t | f | t | f | f | f | -1 | | +CREATE ROLE regress_test_createdb WITH CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_createdb | f | t | f | t | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_createdb WITH NOCREATEDB; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_createdb | f | t | f | f | f | f | f | -1 | | 
+SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_createdb | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_createdb WITH CREATEDB; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_createdb | f | t | f | t | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil 
+-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_createdb | f | t | f | t | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -- default for can login is false for role -CREATE ROLE regress_test_def_role_canlogin; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ---------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_def_role_canlogin | f | t | f | f | f | f | f | -1 | | +CREATE ROLE regress_test_def_role_canlogin PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +--------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_def_role_canlogin | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -CREATE ROLE regress_test_role_canlogin WITH LOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, 
rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_role_canlogin | f | t | f | f | t | f | f | -1 | | +CREATE ROLE regress_test_role_canlogin WITH LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_role_canlogin | f | t | f | f | t | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_role_canlogin WITH NOLOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_role_canlogin | f | t | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_role_canlogin | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_role_canlogin WITH LOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_role_canlogin | f | t | f | f | t | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 
'regress_test_role_canlogin'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_role_canlogin | f | t | f | f | t | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -- default for can login is true for user -CREATE USER regress_test_def_user_canlogin; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ---------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_def_user_canlogin | f | t | f | f | t | f | f | -1 | | +CREATE USER regress_test_def_user_canlogin PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +--------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + 
regress_test_def_user_canlogin | f | t | f | f | t | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -CREATE USER regress_test_user_canlogin WITH NOLOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_user_canlogin | f | t | f | f | f | f | f | -1 | | +CREATE USER regress_test_user_canlogin WITH NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_user_canlogin | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER USER regress_test_user_canlogin WITH LOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | 
rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_user_canlogin | f | t | f | f | t | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_user_canlogin | f | t | f | f | t | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER USER regress_test_user_canlogin WITH NOLOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_user_canlogin | f | t | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), 
rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_user_canlogin | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -- default for replication is false -CREATE ROLE regress_test_def_replication; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_def_replication | f | t | f | f | f | f | f | -1 | | +CREATE ROLE regress_test_def_replication PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil 
+------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_def_replication | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -CREATE ROLE regress_test_replication WITH REPLICATION; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_replication | f | t | f | f | f | t | f | -1 | | +CREATE ROLE regress_test_replication WITH REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_replication | f | t | f | f | f | t | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_replication WITH NOREPLICATION; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, 
rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_replication | f | t | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_replication | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_replication WITH REPLICATION; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_replication | f | t | f | f | f | t | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, 
rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_replication | f | t | f | f | f | t | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -- default for bypassrls is false -CREATE ROLE regress_test_def_bypassrls; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_def_bypassrls | f | t | f | f | f | f | f | -1 | | +CREATE ROLE regress_test_def_bypassrls PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil 
+----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_def_bypassrls | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -CREATE ROLE regress_test_bypassrls WITH BYPASSRLS; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_bypassrls | f | t | f | f | f | f | t | -1 | | +CREATE ROLE regress_test_bypassrls WITH BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_bypassrls | f | t | f | f | f | f | t | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_bypassrls WITH NOBYPASSRLS; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, 
rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_bypassrls | f | t | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_bypassrls | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_bypassrls WITH BYPASSRLS; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_bypassrls | f | t | f | f | f | f | t | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, 
rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_bypassrls | f | t | f | f | f | f | t | -1 | SCRAM-SHA-256$4096:$: | (1 row) -- clean up roles diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out index 218c0c2863..f7af0cfb12 100644 --- a/src/test/regress/expected/rowsecurity.out +++ b/src/test/regress/expected/rowsecurity.out @@ -14,13 +14,13 @@ DROP ROLE IF EXISTS regress_rls_group2; DROP SCHEMA IF EXISTS regress_rls_schema CASCADE; RESET client_min_messages; -- initial setup -CREATE USER regress_rls_alice NOLOGIN; -CREATE USER regress_rls_bob NOLOGIN; -CREATE USER regress_rls_carol NOLOGIN; -CREATE USER regress_rls_dave NOLOGIN; -CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN; -CREATE ROLE regress_rls_group1 NOLOGIN; -CREATE ROLE regress_rls_group2 NOLOGIN; +CREATE USER regress_rls_alice NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_rls_bob NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_rls_carol NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_rls_dave NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_rls_group1 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_rls_group2 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT regress_rls_group1 TO regress_rls_bob; GRANT regress_rls_group2 TO 
regress_rls_carol; CREATE SCHEMA regress_rls_schema; @@ -4352,8 +4352,8 @@ SELECT count(*) = 0 FROM pg_depend -- DROP OWNED BY testing RESET SESSION AUTHORIZATION; -CREATE ROLE regress_rls_dob_role1; -CREATE ROLE regress_rls_dob_role2; +CREATE ROLE regress_rls_dob_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_rls_dob_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE dob_t1 (c1 int); CREATE TABLE dob_t2 (c1 int) PARTITION BY RANGE (c1); CREATE POLICY p1 ON dob_t1 TO regress_rls_dob_role1 USING (true); diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 09a255649b..15895f0c53 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -3708,7 +3708,7 @@ DROP TABLE ruletest2; -- Test non-SELECT rule on security invoker view. -- Should use view owner's permissions. -- -CREATE USER regress_rule_user1; +CREATE USER regress_rule_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE ruletest_t1 (x int); CREATE TABLE ruletest_t2 (x int); CREATE VIEW ruletest_v1 WITH (security_invoker=true) AS diff --git a/src/test/regress/expected/security_label.out b/src/test/regress/expected/security_label.out index a8e01a6220..83543b250a 100644 --- a/src/test/regress/expected/security_label.out +++ b/src/test/regress/expected/security_label.out @@ -6,8 +6,8 @@ SET client_min_messages TO 'warning'; DROP ROLE IF EXISTS regress_seclabel_user1; DROP ROLE IF EXISTS regress_seclabel_user2; RESET client_min_messages; -CREATE USER regress_seclabel_user1 WITH CREATEROLE; -CREATE USER regress_seclabel_user2; +CREATE USER regress_seclabel_user1 WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_seclabel_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE seclabel_tbl1 (a int, b text); CREATE TABLE seclabel_tbl2 (x int, y text); CREATE VIEW seclabel_view1 AS SELECT * FROM seclabel_tbl2; diff --git a/src/test/regress/expected/select_into.out 
b/src/test/regress/expected/select_into.out index b79fe9a1c0..e29fab88ab 100644 --- a/src/test/regress/expected/select_into.out +++ b/src/test/regress/expected/select_into.out @@ -15,7 +15,7 @@ DROP TABLE sitmp1; -- SELECT INTO and INSERT permission, if owner is not allowed to insert. -- CREATE SCHEMA selinto_schema; -CREATE USER regress_selinto_user; +CREATE USER regress_selinto_user PASSWORD NEON_PASSWORD_PLACEHOLDER; ALTER DEFAULT PRIVILEGES FOR ROLE regress_selinto_user REVOKE INSERT ON TABLES FROM regress_selinto_user; GRANT ALL ON SCHEMA selinto_schema TO public; diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out index afc6ab08c2..dfcd891af3 100644 --- a/src/test/regress/expected/select_parallel.out +++ b/src/test/regress/expected/select_parallel.out @@ -1220,7 +1220,7 @@ SELECT 1 FROM tenk1_vw_sec rollback; -- test that function option SET ROLE works in parallel workers. -create role regress_parallel_worker; +create role regress_parallel_worker PASSWORD NEON_PASSWORD_PLACEHOLDER; create function set_and_report_role() returns text as $$ select current_setting('role') $$ language sql parallel safe set role = regress_parallel_worker; diff --git a/src/test/regress/expected/select_views.out b/src/test/regress/expected/select_views.out index 1aeed8452b..7d9427d070 100644 --- a/src/test/regress/expected/select_views.out +++ b/src/test/regress/expected/select_views.out @@ -1250,7 +1250,7 @@ SELECT * FROM toyemp WHERE name = 'sharon'; -- -- Test for Leaky view scenario -- -CREATE ROLE regress_alice; +CREATE ROLE regress_alice PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE FUNCTION f_leak (text) RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001 AS 'BEGIN RAISE NOTICE ''f_leak => %'', $1; RETURN true; END'; diff --git a/src/test/regress/expected/sequence.out b/src/test/regress/expected/sequence.out index f02f020542..c9e0fda350 100644 --- a/src/test/regress/expected/sequence.out +++ 
b/src/test/regress/expected/sequence.out @@ -22,7 +22,7 @@ CREATE SEQUENCE sequence_testx OWNED BY pg_class_oid_index.oid; -- not a table ERROR: sequence cannot be owned by relation "pg_class_oid_index" DETAIL: This operation is not supported for indexes. CREATE SEQUENCE sequence_testx OWNED BY pg_class.relname; -- not same schema -ERROR: sequence must be in same schema as table it is linked to +ERROR: sequence must have same owner as table it is linked to CREATE TABLE sequence_test_table (a int); CREATE SEQUENCE sequence_testx OWNED BY sequence_test_table.b; -- wrong column ERROR: column "b" of relation "sequence_test_table" does not exist @@ -639,7 +639,7 @@ SELECT setval('sequence_test2', 1); -- error ERROR: cannot execute setval() in a read-only transaction ROLLBACK; -- privileges tests -CREATE USER regress_seq_user; +CREATE USER regress_seq_user PASSWORD NEON_PASSWORD_PLACEHOLDER; -- nextval BEGIN; SET LOCAL SESSION AUTHORIZATION regress_seq_user; diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out index 94187e59cf..72346e2c71 100644 --- a/src/test/regress/expected/stats.out +++ b/src/test/regress/expected/stats.out @@ -1283,37 +1283,6 @@ SELECT current_setting('fsync') = 'off' t (1 row) --- Change the tablespace so that the table is rewritten directly, then SELECT --- from it to cause it to be read back into shared buffers. -SELECT sum(reads) AS io_sum_shared_before_reads - FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset --- Do this in a transaction to prevent spurious failures due to concurrent accesses to our newly --- rewritten table, e.g. by autovacuum. -BEGIN; -ALTER TABLE test_io_shared SET TABLESPACE regress_tblspace; --- SELECT from the table so that the data is read into shared buffers and --- context 'normal', object 'relation' reads are counted. 
-SELECT COUNT(*) FROM test_io_shared; - count -------- - 100 -(1 row) - -COMMIT; -SELECT pg_stat_force_next_flush(); - pg_stat_force_next_flush --------------------------- - -(1 row) - -SELECT sum(reads) AS io_sum_shared_after_reads - FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset -SELECT :io_sum_shared_after_reads > :io_sum_shared_before_reads; - ?column? ----------- - t -(1 row) - SELECT sum(hits) AS io_sum_shared_before_hits FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset -- Select from the table again to count hits. @@ -1415,6 +1384,7 @@ SELECT :io_sum_local_after_evictions > :io_sum_local_before_evictions, -- local buffers, exercising a different codepath than standard local buffer -- writes. ALTER TABLE test_io_local SET TABLESPACE regress_tblspace; +ERROR: tablespace "regress_tblspace" does not exist SELECT pg_stat_force_next_flush(); pg_stat_force_next_flush -------------------------- @@ -1426,7 +1396,7 @@ SELECT sum(writes) AS io_sum_local_new_tblspc_writes SELECT :io_sum_local_new_tblspc_writes > :io_sum_local_after_writes; ?column? 
---------- - t + f (1 row) RESET temp_buffers; diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index b4c85613de..d32a9a69ad 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -70,7 +70,7 @@ DROP TABLE ext_stats_test; CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER); CREATE STATISTICS IF NOT EXISTS ab1_a_b_stats ON a, b FROM ab1; COMMENT ON STATISTICS ab1_a_b_stats IS 'new comment'; -CREATE ROLE regress_stats_ext; +CREATE ROLE regress_stats_ext PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_stats_ext; COMMENT ON STATISTICS ab1_a_b_stats IS 'changed comment'; ERROR: must be owner of statistics object ab1_a_b_stats @@ -3214,7 +3214,7 @@ set search_path to public, stts_s1; stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined (10 rows) -create role regress_stats_ext nosuperuser; +create role regress_stats_ext nosuperuser PASSWORD NEON_PASSWORD_PLACEHOLDER; set role regress_stats_ext; \dX List of extended statistics @@ -3237,7 +3237,7 @@ drop schema stts_s1, stts_s2 cascade; drop user regress_stats_ext; reset search_path; -- User with no access -CREATE USER regress_stats_user1; +CREATE USER regress_stats_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT USAGE ON SCHEMA tststats TO regress_stats_user1; SET SESSION AUTHORIZATION regress_stats_user1; SELECT * FROM tststats.priv_test_tbl; -- Permission denied diff --git a/src/test/regress/expected/subscription.out b/src/test/regress/expected/subscription.out index b15eddbff3..e9ba4568eb 100644 --- a/src/test/regress/expected/subscription.out +++ b/src/test/regress/expected/subscription.out @@ -1,10 +1,10 @@ -- -- SUBSCRIPTION -- -CREATE ROLE regress_subscription_user LOGIN SUPERUSER; -CREATE ROLE regress_subscription_user2; -CREATE ROLE regress_subscription_user3 IN ROLE pg_create_subscription; -CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER; +CREATE ROLE 
regress_subscription_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_subscription_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_subscription_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_create_subscription; +CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION 'regress_subscription_user'; -- fail - no publications CREATE SUBSCRIPTION regress_testsub CONNECTION 'foo'; diff --git a/src/test/regress/expected/test_setup.out b/src/test/regress/expected/test_setup.out index 5d9e6bf12b..c5fddfdca6 100644 --- a/src/test/regress/expected/test_setup.out +++ b/src/test/regress/expected/test_setup.out @@ -21,6 +21,7 @@ GRANT ALL ON SCHEMA public TO public; -- Create a tablespace we can use in tests. SET allow_in_place_tablespaces = true; CREATE TABLESPACE regress_tblspace LOCATION ''; +ERROR: CREATE TABLESPACE is not supported on Neon -- -- These tables have traditionally been referenced by many tests, -- so create and populate them. Insert only non-error values here. 
@@ -111,7 +112,8 @@ CREATE TABLE onek ( string4 name ); \set filename :abs_srcdir '/data/onek.data' -COPY onek FROM :'filename'; +\set command '\\copy onek FROM ' :'filename'; +:command VACUUM ANALYZE onek; CREATE TABLE onek2 AS SELECT * FROM onek; VACUUM ANALYZE onek2; @@ -134,7 +136,8 @@ CREATE TABLE tenk1 ( string4 name ); \set filename :abs_srcdir '/data/tenk.data' -COPY tenk1 FROM :'filename'; +\set command '\\copy tenk1 FROM ' :'filename'; +:command VACUUM ANALYZE tenk1; CREATE TABLE tenk2 AS SELECT * FROM tenk1; VACUUM ANALYZE tenk2; @@ -144,20 +147,23 @@ CREATE TABLE person ( location point ); \set filename :abs_srcdir '/data/person.data' -COPY person FROM :'filename'; +\set command '\\copy person FROM ' :'filename'; +:command VACUUM ANALYZE person; CREATE TABLE emp ( salary int4, manager name ) INHERITS (person); \set filename :abs_srcdir '/data/emp.data' -COPY emp FROM :'filename'; +\set command '\\copy emp FROM ' :'filename'; +:command VACUUM ANALYZE emp; CREATE TABLE student ( gpa float8 ) INHERITS (person); \set filename :abs_srcdir '/data/student.data' -COPY student FROM :'filename'; +\set command '\\copy student FROM ' :'filename'; +:command VACUUM ANALYZE student; CREATE TABLE stud_emp ( percent int4 @@ -166,14 +172,16 @@ NOTICE: merging multiple inherited definitions of column "name" NOTICE: merging multiple inherited definitions of column "age" NOTICE: merging multiple inherited definitions of column "location" \set filename :abs_srcdir '/data/stud_emp.data' -COPY stud_emp FROM :'filename'; +\set command '\\copy stud_emp FROM ' :'filename'; +:command VACUUM ANALYZE stud_emp; CREATE TABLE road ( name text, thepath path ); \set filename :abs_srcdir '/data/streets.data' -COPY road FROM :'filename'; +\set command '\\copy road FROM ' :'filename'; +:command VACUUM ANALYZE road; CREATE TABLE ihighway () INHERITS (road); INSERT INTO ihighway diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index 
9fad6c8b04..a1b8e82389 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -63,7 +63,8 @@ CREATE TABLE test_tsvector( a tsvector ); \set filename :abs_srcdir '/data/tsearch.data' -COPY test_tsvector FROM :'filename'; +\set command '\\copy test_tsvector FROM ' :'filename'; +:command ANALYZE test_tsvector; -- test basic text search behavior without indexes, then with SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh'; diff --git a/src/test/regress/expected/updatable_views.out b/src/test/regress/expected/updatable_views.out index ba46c32029..eac3017bac 100644 --- a/src/test/regress/expected/updatable_views.out +++ b/src/test/regress/expected/updatable_views.out @@ -999,9 +999,9 @@ NOTICE: drop cascades to 2 other objects DETAIL: drop cascades to view rw_view1 drop cascades to function rw_view1_aa(rw_view1) -- permissions checks -CREATE USER regress_view_user1; -CREATE USER regress_view_user2; -CREATE USER regress_view_user3; +CREATE USER regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_view_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_view_user1; CREATE TABLE base_tbl(a int, b text, c float); INSERT INTO base_tbl VALUES (1, 'Row 1', 1.0); @@ -3094,8 +3094,8 @@ DETAIL: View columns that are not columns of their base relation are not updata drop view uv_iocu_view; drop table uv_iocu_tab; -- ON CONFLICT DO UPDATE permissions checks -create user regress_view_user1; -create user regress_view_user2; +create user regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +create user regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; set session authorization regress_view_user1; create table base_tbl(a int unique, b text, c float); insert into base_tbl values (1,'xxx',1.0); diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out index c809f88f54..d1d57852d4 100644 --- 
a/src/test/regress/expected/update.out +++ b/src/test/regress/expected/update.out @@ -602,7 +602,7 @@ DROP FUNCTION func_parted_mod_b(); -- RLS policies with update-row-movement ----------------------------------------- ALTER TABLE range_parted ENABLE ROW LEVEL SECURITY; -CREATE USER regress_range_parted_user; +CREATE USER regress_range_parted_user PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT ALL ON range_parted, mintab TO regress_range_parted_user; CREATE POLICY seeall ON range_parted AS PERMISSIVE FOR SELECT USING (true); CREATE POLICY policy_range_parted ON range_parted for UPDATE USING (true) WITH CHECK (c % 2 = 0); diff --git a/src/test/regress/expected/vacuum.out b/src/test/regress/expected/vacuum.out index 4aaf4f025d..40a339758a 100644 --- a/src/test/regress/expected/vacuum.out +++ b/src/test/regress/expected/vacuum.out @@ -433,7 +433,7 @@ CREATE TABLE vacowned (a int); CREATE TABLE vacowned_parted (a int) PARTITION BY LIST (a); CREATE TABLE vacowned_part1 PARTITION OF vacowned_parted FOR VALUES IN (1); CREATE TABLE vacowned_part2 PARTITION OF vacowned_parted FOR VALUES IN (2); -CREATE ROLE regress_vacuum; +CREATE ROLE regress_vacuum PASSWORD NEON_PASSWORD_PLACEHOLDER; SET ROLE regress_vacuum; -- Simple table VACUUM vacowned; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 3d14bf4e4f..87f351b1d1 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -130,4 +130,4 @@ test: fast_default # run tablespace test at the end because it drops the tablespace created during # setup that other tests may use. 
-test: tablespace +#test: tablespace diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index f51726e8ed..8854104eff 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -15,7 +15,8 @@ CREATE TABLE aggtest ( ); \set filename :abs_srcdir '/data/agg.data' -COPY aggtest FROM :'filename'; +\set command '\\copy aggtest FROM ' :'filename'; +:command ANALYZE aggtest; diff --git a/src/test/regress/sql/alter_generic.sql b/src/test/regress/sql/alter_generic.sql index de58d268d3..9d38df7f42 100644 --- a/src/test/regress/sql/alter_generic.sql +++ b/src/test/regress/sql/alter_generic.sql @@ -22,9 +22,9 @@ DROP ROLE IF EXISTS regress_alter_generic_user3; RESET client_min_messages; -CREATE USER regress_alter_generic_user3; -CREATE USER regress_alter_generic_user2; -CREATE USER regress_alter_generic_user1 IN ROLE regress_alter_generic_user3; +CREATE USER regress_alter_generic_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_alter_generic_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_alter_generic_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE regress_alter_generic_user3; CREATE SCHEMA alt_nsp1; CREATE SCHEMA alt_nsp2; @@ -316,7 +316,7 @@ DROP OPERATOR FAMILY alt_opf4 USING btree; -- Should fail. Need to be SUPERUSER to do ALTER OPERATOR FAMILY .. ADD / DROP BEGIN TRANSACTION; -CREATE ROLE regress_alter_generic_user5 NOSUPERUSER; +CREATE ROLE regress_alter_generic_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER NOSUPERUSER; CREATE OPERATOR FAMILY alt_opf5 USING btree; SET ROLE regress_alter_generic_user5; ALTER OPERATOR FAMILY alt_opf5 USING btree ADD OPERATOR 1 < (int4, int2), FUNCTION 1 btint42cmp(int4, int2); @@ -326,7 +326,7 @@ ROLLBACK; -- Should fail. Need rights to namespace for ALTER OPERATOR FAMILY .. 
ADD / DROP BEGIN TRANSACTION; -CREATE ROLE regress_alter_generic_user6; +CREATE ROLE regress_alter_generic_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE SCHEMA alt_nsp6; REVOKE ALL ON SCHEMA alt_nsp6 FROM regress_alter_generic_user6; CREATE OPERATOR FAMILY alt_nsp6.alt_opf6 USING btree; diff --git a/src/test/regress/sql/alter_operator.sql b/src/test/regress/sql/alter_operator.sql index fd40370165..ca8055e06d 100644 --- a/src/test/regress/sql/alter_operator.sql +++ b/src/test/regress/sql/alter_operator.sql @@ -87,7 +87,7 @@ ALTER OPERATOR & (bit, bit) SET ("Restrict" = _int_contsel, "Join" = _int_contjo -- -- Test permission check. Must be owner to ALTER OPERATOR. -- -CREATE USER regress_alter_op_user; +CREATE USER regress_alter_op_user PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_alter_op_user; ALTER OPERATOR === (boolean, boolean) SET (RESTRICT = NONE); diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index d2845abc97..a0719b8d0e 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -7,7 +7,7 @@ SET client_min_messages TO 'warning'; DROP ROLE IF EXISTS regress_alter_table_user1; RESET client_min_messages; -CREATE USER regress_alter_table_user1; +CREATE USER regress_alter_table_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- -- add attribute @@ -2397,8 +2397,8 @@ DROP TABLE fail_part; ALTER TABLE list_parted ATTACH PARTITION nonexistent FOR VALUES IN (1); -- check ownership of the source table -CREATE ROLE regress_test_me; -CREATE ROLE regress_test_not_me; +CREATE ROLE regress_test_me PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_not_me PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE not_owned_by_me (LIKE list_parted); ALTER TABLE not_owned_by_me OWNER TO regress_test_not_me; SET SESSION AUTHORIZATION regress_test_me; diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql index e414fa560d..79a75a0e57 100644 --- 
a/src/test/regress/sql/arrays.sql +++ b/src/test/regress/sql/arrays.sql @@ -22,7 +22,8 @@ CREATE TABLE array_op_test ( ); \set filename :abs_srcdir '/data/array.data' -COPY array_op_test FROM :'filename'; +\set command '\\copy array_op_test FROM ' :'filename'; +:command ANALYZE array_op_test; -- diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql index 239f4a4755..f29d87bdff 100644 --- a/src/test/regress/sql/btree_index.sql +++ b/src/test/regress/sql/btree_index.sql @@ -26,16 +26,20 @@ CREATE TABLE bt_f8_heap ( ); \set filename :abs_srcdir '/data/desc.data' -COPY bt_i4_heap FROM :'filename'; +\set command '\\copy bt_i4_heap FROM ' :'filename'; +:command \set filename :abs_srcdir '/data/hash.data' -COPY bt_name_heap FROM :'filename'; +\set command '\\copy bt_name_heap FROM ' :'filename'; +:command \set filename :abs_srcdir '/data/desc.data' -COPY bt_txt_heap FROM :'filename'; +\set command '\\copy bt_txt_heap FROM ' :'filename'; +:command \set filename :abs_srcdir '/data/hash.data' -COPY bt_f8_heap FROM :'filename'; +\set command '\\copy bt_f8_heap FROM ' :'filename'; +:command ANALYZE bt_i4_heap; ANALYZE bt_name_heap; diff --git a/src/test/regress/sql/cluster.sql b/src/test/regress/sql/cluster.sql index 6cb9c926c0..5e689e4062 100644 --- a/src/test/regress/sql/cluster.sql +++ b/src/test/regress/sql/cluster.sql @@ -108,7 +108,7 @@ WHERE pg_class.oid=indexrelid CLUSTER pg_toast.pg_toast_826 USING pg_toast_826_index; -- Verify that clustering all tables does in fact cluster the right ones -CREATE USER regress_clstr_user; +CREATE USER regress_clstr_user PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE clstr_1 (a INT PRIMARY KEY); CREATE TABLE clstr_2 (a INT PRIMARY KEY); CREATE TABLE clstr_3 (a INT PRIMARY KEY); @@ -233,7 +233,7 @@ DROP TABLE clstrpart; CREATE TABLE ptnowner(i int unique) PARTITION BY LIST (i); CREATE INDEX ptnowner_i_idx ON ptnowner(i); CREATE TABLE ptnowner1 PARTITION OF ptnowner FOR VALUES IN (1); -CREATE ROLE 
regress_ptnowner; +CREATE ROLE regress_ptnowner PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE ptnowner2 PARTITION OF ptnowner FOR VALUES IN (2); ALTER TABLE ptnowner1 OWNER TO regress_ptnowner; ALTER TABLE ptnowner OWNER TO regress_ptnowner; diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index 8aa902d5ab..24bb823b86 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -353,7 +353,7 @@ reset enable_seqscan; -- schema manipulation commands -CREATE ROLE regress_test_role; +CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE SCHEMA test_schema; -- We need to do this this way to cope with varying names for encodings: diff --git a/src/test/regress/sql/constraints.sql b/src/test/regress/sql/constraints.sql index e3e3bea709..fa86ddc326 100644 --- a/src/test/regress/sql/constraints.sql +++ b/src/test/regress/sql/constraints.sql @@ -243,12 +243,14 @@ CREATE TABLE COPY_TBL (x INT, y TEXT, z INT, CHECK (x > 3 AND y <> 'check failed' AND x < 7 )); \set filename :abs_srcdir '/data/constro.data' -COPY COPY_TBL FROM :'filename'; +\set command '\\copy COPY_TBL FROM ' :'filename'; +:command SELECT * FROM COPY_TBL; \set filename :abs_srcdir '/data/constrf.data' -COPY COPY_TBL FROM :'filename'; +\set command '\\copy COPY_TBL FROM ' :'filename'; +:command SELECT * FROM COPY_TBL; @@ -599,7 +601,7 @@ DROP TABLE deferred_excl; -- Comments -- Setup a low-level role to enforce non-superuser checks. 
-CREATE ROLE regress_constraint_comments; +CREATE ROLE regress_constraint_comments PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_constraint_comments; CREATE TABLE constraint_comments_tbl (a int CONSTRAINT the_constraint CHECK (a > 0)); @@ -621,7 +623,7 @@ COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS NULL; -- unauthorized user RESET SESSION AUTHORIZATION; -CREATE ROLE regress_constraint_comments_noaccess; +CREATE ROLE regress_constraint_comments_noaccess PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_constraint_comments_noaccess; COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment'; COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS 'no, another comment'; diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql index b567a1a572..4d1ac2e631 100644 --- a/src/test/regress/sql/conversion.sql +++ b/src/test/regress/sql/conversion.sql @@ -17,7 +17,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r AS :'regresslib', 'test_enc_conversion' LANGUAGE C STRICT; -CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE; +CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_conversion_user; CREATE CONVERSION myconv FOR 'LATIN1' TO 'UTF8' FROM iso8859_1_to_utf8; -- diff --git a/src/test/regress/sql/copy.sql b/src/test/regress/sql/copy.sql index 43d2e906dd..6c993d70f0 100644 --- a/src/test/regress/sql/copy.sql +++ b/src/test/regress/sql/copy.sql @@ -20,11 +20,13 @@ insert into copytest values('Mac',E'abc\rdef',3); insert into copytest values(E'esc\\ape',E'a\\r\\\r\\\n\\nb',4); \set filename :abs_builddir '/results/copytest.csv' -copy copytest to :'filename' csv; +\set command '\\copy copytest to ' :'filename' csv; +:command create temp table copytest2 (like copytest); -copy copytest2 from :'filename' 
csv; +\set command '\\copy copytest2 from ' :'filename' csv; +:command select * from copytest except select * from copytest2; @@ -32,9 +34,11 @@ truncate copytest2; --- same test but with an escape char different from quote char -copy copytest to :'filename' csv quote '''' escape E'\\'; +\set command '\\copy copytest to ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\''; +:command -copy copytest2 from :'filename' csv quote '''' escape E'\\'; +\set command '\\copy copytest2 from ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\''; +:command select * from copytest except select * from copytest2; @@ -86,16 +90,19 @@ insert into parted_copytest select x,2,'Two' from generate_series(1001,1010) x; insert into parted_copytest select x,1,'One' from generate_series(1011,1020) x; \set filename :abs_builddir '/results/parted_copytest.csv' -copy (select * from parted_copytest order by a) to :'filename'; +\set command '\\copy (select * from parted_copytest order by a) to ' :'filename'; +:command truncate parted_copytest; -copy parted_copytest from :'filename'; +\set command '\\copy parted_copytest from ' :'filename'; +:command -- Ensure COPY FREEZE errors for partitioned tables. 
begin; truncate parted_copytest; -copy parted_copytest from :'filename' (freeze); +\set command '\\copy parted_copytest from ' :'filename' (freeze); +:command rollback; select tableoid::regclass,count(*),sum(a) from parted_copytest @@ -115,7 +122,8 @@ create trigger part_ins_trig for each row execute procedure part_ins_func(); -copy parted_copytest from :'filename'; +\set command '\\copy parted_copytest from ' :'filename'; +:command select tableoid::regclass,count(*),sum(a) from parted_copytest group by tableoid order by tableoid::regclass::name; @@ -124,7 +132,8 @@ truncate table parted_copytest; create index on parted_copytest (b); drop trigger part_ins_trig on parted_copytest_a2; -copy parted_copytest from stdin; +\set command '\\copy parted_copytest from ' stdin; +:command 1 1 str1 2 2 str2 \. @@ -191,8 +200,8 @@ bill 20 (11,10) 1000 sharon -- Generate COPY FROM report with FILE, with some excluded tuples. truncate tab_progress_reporting; \set filename :abs_srcdir '/data/emp.data' -copy tab_progress_reporting from :'filename' - where (salary < 2000); +\set command '\\copy tab_progress_reporting from ' :'filename' 'where (salary < 2000)'; +:command drop trigger check_after_tab_progress_reporting on tab_progress_reporting; drop function notice_after_tab_progress_reporting(); @@ -311,7 +320,8 @@ CREATE TABLE parted_si_p_odd PARTITION OF parted_si FOR VALUES IN (1); -- https://postgr.es/m/18130-7a86a7356a75209d%40postgresql.org -- https://postgr.es/m/257696.1695670946%40sss.pgh.pa.us \set filename :abs_srcdir '/data/desc.data' -COPY parted_si(id, data) FROM :'filename'; +\set command '\\COPY parted_si(id, data) FROM ' :'filename'; +:command -- An earlier bug (see commit b1ecb9b3fcf) could end up using a buffer from -- the wrong partition. 
This test is *not* guaranteed to trigger that bug, but diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql index cf3828c16e..cf3ca38175 100644 --- a/src/test/regress/sql/copy2.sql +++ b/src/test/regress/sql/copy2.sql @@ -365,8 +365,8 @@ copy check_con_tbl from stdin; select * from check_con_tbl; -- test with RLS enabled. -CREATE ROLE regress_rls_copy_user; -CREATE ROLE regress_rls_copy_user_colperms; +CREATE ROLE regress_rls_copy_user PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_rls_copy_user_colperms PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE rls_t1 (a int, b int, c int); COPY rls_t1 (a, b, c) from stdin; diff --git a/src/test/regress/sql/create_function_sql.sql b/src/test/regress/sql/create_function_sql.sql index 89e9af3a49..2b86fe2285 100644 --- a/src/test/regress/sql/create_function_sql.sql +++ b/src/test/regress/sql/create_function_sql.sql @@ -6,7 +6,7 @@ -- All objects made in this test are in temp_func_test schema -CREATE USER regress_unpriv_user; +CREATE USER regress_unpriv_user PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE SCHEMA temp_func_test; GRANT ALL ON SCHEMA temp_func_test TO public; diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index d49ce9f300..47fa813bc8 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -71,7 +71,8 @@ CREATE TABLE fast_emp4000 ( ); \set filename :abs_srcdir '/data/rect.data' -COPY slow_emp4000 FROM :'filename'; +\set command '\\copy slow_emp4000 FROM ' :'filename'; +:command INSERT INTO fast_emp4000 SELECT * FROM slow_emp4000; @@ -269,7 +270,8 @@ CREATE TABLE array_index_op_test ( ); \set filename :abs_srcdir '/data/array.data' -COPY array_index_op_test FROM :'filename'; +\set command '\\copy array_index_op_test FROM ' :'filename'; +:command ANALYZE array_index_op_test; SELECT * FROM array_index_op_test WHERE i = '{NULL}' ORDER BY seqno; @@ -1246,7 +1248,7 @@ END; REINDEX SCHEMA CONCURRENTLY 
schema_to_reindex; -- Failure for unauthorized user -CREATE ROLE regress_reindexuser NOLOGIN; +CREATE ROLE regress_reindexuser NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION ROLE regress_reindexuser; REINDEX SCHEMA schema_to_reindex; -- Permission failures with toast tables and indexes (pg_authid here) diff --git a/src/test/regress/sql/create_procedure.sql b/src/test/regress/sql/create_procedure.sql index 069a3727ce..faeeb3f744 100644 --- a/src/test/regress/sql/create_procedure.sql +++ b/src/test/regress/sql/create_procedure.sql @@ -255,7 +255,7 @@ DROP PROCEDURE nonexistent(); -- privileges -CREATE USER regress_cp_user1; +CREATE USER regress_cp_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT INSERT ON cp_test TO regress_cp_user1; REVOKE EXECUTE ON PROCEDURE ptest1(text) FROM PUBLIC; SET ROLE regress_cp_user1; diff --git a/src/test/regress/sql/create_role.sql b/src/test/regress/sql/create_role.sql index 4491a28a8a..3045434865 100644 --- a/src/test/regress/sql/create_role.sql +++ b/src/test/regress/sql/create_role.sql @@ -1,20 +1,20 @@ -- ok, superuser can create users with any set of privileges -CREATE ROLE regress_role_super SUPERUSER; -CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS; +CREATE ROLE regress_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT CREATE ON DATABASE regression TO regress_role_admin WITH GRANT OPTION; -CREATE ROLE regress_role_limited_admin CREATEROLE; -CREATE ROLE regress_role_normal; +CREATE ROLE regress_role_limited_admin CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_role_normal PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, CREATEROLE user can't give away role attributes without having them SET SESSION AUTHORIZATION regress_role_limited_admin; -CREATE ROLE regress_nosuch_superuser SUPERUSER; -CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS; 
-CREATE ROLE regress_nosuch_replication REPLICATION; -CREATE ROLE regress_nosuch_bypassrls BYPASSRLS; -CREATE ROLE regress_nosuch_createdb CREATEDB; +CREATE ROLE regress_nosuch_superuser SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_nosuch_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_nosuch_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_nosuch_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, can create a role without any special attributes -CREATE ROLE regress_role_limited; +CREATE ROLE regress_role_limited PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, can't give it in any of the restricted attributes ALTER ROLE regress_role_limited SUPERUSER; @@ -25,10 +25,10 @@ DROP ROLE regress_role_limited; -- ok, can give away these role attributes if you have them SET SESSION AUTHORIZATION regress_role_admin; -CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS; -CREATE ROLE regress_replication REPLICATION; -CREATE ROLE regress_bypassrls BYPASSRLS; -CREATE ROLE regress_createdb CREATEDB; +CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, can toggle these role attributes off and on if you have them ALTER ROLE regress_replication NOREPLICATION; @@ -43,52 +43,52 @@ ALTER ROLE regress_createdb SUPERUSER; ALTER ROLE regress_createdb NOSUPERUSER; -- ok, having CREATEROLE is enough to create users with these privileges -CREATE ROLE regress_createrole CREATEROLE NOINHERIT; +CREATE ROLE regress_createrole CREATEROLE NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT CREATE ON DATABASE 
regression TO regress_createrole WITH GRANT OPTION; -CREATE ROLE regress_login LOGIN; -CREATE ROLE regress_inherit INHERIT; -CREATE ROLE regress_connection_limit CONNECTION LIMIT 5; -CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD 'foo'; -CREATE ROLE regress_password_null PASSWORD NULL; +CREATE ROLE regress_login LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_inherit INHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_connection_limit CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, backwards compatible noise words should be ignored -CREATE ROLE regress_noiseword SYSID 12345; +CREATE ROLE regress_noiseword SYSID 12345 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, cannot grant membership in superuser role -CREATE ROLE regress_nosuch_super IN ROLE regress_role_super; +CREATE ROLE regress_nosuch_super IN ROLE regress_role_super PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, database owner cannot have members -CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner; +CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, can grant other users into a role CREATE ROLE regress_inroles ROLE regress_role_super, regress_createdb, regress_createrole, regress_login, - regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null; + regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, cannot grant a role into itself -CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive; +CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, can grant other users into a role with admin option CREATE ROLE regress_adminroles ADMIN regress_role_super, 
regress_createdb, regress_createrole, regress_login, - regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null; + regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, cannot grant a role into itself with admin option -CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive; +CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, regress_createrole does not have CREATEDB privilege SET SESSION AUTHORIZATION regress_createrole; CREATE DATABASE regress_nosuch_db; -- ok, regress_createrole can create new roles -CREATE ROLE regress_plainrole; +CREATE ROLE regress_plainrole PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, roles with CREATEROLE can create new roles with it -CREATE ROLE regress_rolecreator CREATEROLE; +CREATE ROLE regress_rolecreator CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, roles with CREATEROLE can create new roles with different role -- attributes, including CREATEROLE -CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5; +CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, we should be able to modify a role we created COMMENT ON ROLE regress_hasprivs IS 'some comment'; @@ -123,7 +123,7 @@ REASSIGN OWNED BY regress_tenant TO regress_createrole; -- ok, create a role with a value for createrole_self_grant SET createrole_self_grant = 'set, inherit'; -CREATE ROLE regress_tenant2; +CREATE ROLE regress_tenant2 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT CREATE ON DATABASE regression TO regress_tenant2; -- ok, regress_tenant2 can create objects within the database @@ -150,16 +150,16 @@ ALTER TABLE tenant2_table OWNER TO regress_tenant2; DROP TABLE tenant2_table; -- fail, CREATEROLE is not enough to create roles in privileged roles -CREATE ROLE 
regress_read_all_data IN ROLE pg_read_all_data; -CREATE ROLE regress_write_all_data IN ROLE pg_write_all_data; -CREATE ROLE regress_monitor IN ROLE pg_monitor; -CREATE ROLE regress_read_all_settings IN ROLE pg_read_all_settings; -CREATE ROLE regress_read_all_stats IN ROLE pg_read_all_stats; -CREATE ROLE regress_stat_scan_tables IN ROLE pg_stat_scan_tables; -CREATE ROLE regress_read_server_files IN ROLE pg_read_server_files; -CREATE ROLE regress_write_server_files IN ROLE pg_write_server_files; -CREATE ROLE regress_execute_server_program IN ROLE pg_execute_server_program; -CREATE ROLE regress_signal_backend IN ROLE pg_signal_backend; +CREATE ROLE regress_read_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_data; +CREATE ROLE regress_write_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_all_data; +CREATE ROLE regress_monitor PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_monitor; +CREATE ROLE regress_read_all_settings PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_settings; +CREATE ROLE regress_read_all_stats PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_stats; +CREATE ROLE regress_stat_scan_tables PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_stat_scan_tables; +CREATE ROLE regress_read_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_server_files; +CREATE ROLE regress_write_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_server_files; +CREATE ROLE regress_execute_server_program PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_execute_server_program; +CREATE ROLE regress_signal_backend PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_signal_backend; -- fail, role still owns database objects DROP ROLE regress_tenant; diff --git a/src/test/regress/sql/create_schema.sql b/src/test/regress/sql/create_schema.sql index 1b7064247a..be5b662ce1 100644 --- a/src/test/regress/sql/create_schema.sql +++ b/src/test/regress/sql/create_schema.sql @@ -4,7 +4,7 @@ -- Schema creation with elements. 
-CREATE ROLE regress_create_schema_role SUPERUSER; +CREATE ROLE regress_create_schema_role SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Cases where schema creation fails as objects are qualified with a schema -- that does not match with what's expected. diff --git a/src/test/regress/sql/create_view.sql b/src/test/regress/sql/create_view.sql index ae6841308b..47bc792e30 100644 --- a/src/test/regress/sql/create_view.sql +++ b/src/test/regress/sql/create_view.sql @@ -23,7 +23,8 @@ CREATE TABLE real_city ( ); \set filename :abs_srcdir '/data/real_city.data' -COPY real_city FROM :'filename'; +\set command '\\copy real_city FROM ' :'filename'; +:command ANALYZE real_city; SELECT * diff --git a/src/test/regress/sql/database.sql b/src/test/regress/sql/database.sql index 46ad263478..eb05584ed5 100644 --- a/src/test/regress/sql/database.sql +++ b/src/test/regress/sql/database.sql @@ -1,8 +1,6 @@ CREATE DATABASE regression_tbd ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0; ALTER DATABASE regression_tbd RENAME TO regression_utf8; -ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace; -ALTER DATABASE regression_utf8 RESET TABLESPACE; ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123; -- Test PgDatabaseToastTable. Doing this with GRANT would be slow. 
diff --git a/src/test/regress/sql/dependency.sql b/src/test/regress/sql/dependency.sql index 2559c62d0b..06c3aa1a36 100644 --- a/src/test/regress/sql/dependency.sql +++ b/src/test/regress/sql/dependency.sql @@ -2,10 +2,10 @@ -- DEPENDENCIES -- -CREATE USER regress_dep_user; -CREATE USER regress_dep_user2; -CREATE USER regress_dep_user3; -CREATE GROUP regress_dep_group; +CREATE USER regress_dep_user PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_dep_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE GROUP regress_dep_group PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE deptest (f1 serial primary key, f2 text); @@ -45,9 +45,9 @@ DROP TABLE deptest; DROP USER regress_dep_user3; -- Test DROP OWNED -CREATE USER regress_dep_user0; -CREATE USER regress_dep_user1; -CREATE USER regress_dep_user2; +CREATE USER regress_dep_user0 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_dep_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_dep_user0; -- permission denied DROP OWNED BY regress_dep_user1; diff --git a/src/test/regress/sql/drop_if_exists.sql b/src/test/regress/sql/drop_if_exists.sql index ac6168b91f..4270062ec7 100644 --- a/src/test/regress/sql/drop_if_exists.sql +++ b/src/test/regress/sql/drop_if_exists.sql @@ -86,9 +86,9 @@ DROP DOMAIN test_domain_exists; --- role/user/group --- -CREATE USER regress_test_u1; -CREATE ROLE regress_test_r1; -CREATE GROUP regress_test_g1; +CREATE USER regress_test_u1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_r1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE GROUP regress_test_g1 PASSWORD NEON_PASSWORD_PLACEHOLDER; DROP USER regress_test_u2; diff --git a/src/test/regress/sql/equivclass.sql b/src/test/regress/sql/equivclass.sql index 247b0a3105..bf018fd3a1 100644 --- a/src/test/regress/sql/equivclass.sql +++ b/src/test/regress/sql/equivclass.sql @@ 
-230,7 +230,7 @@ set enable_mergejoin = off; alter table ec1 enable row level security; create policy p1 on ec1 using (f1 < '5'::int8alias1); -create user regress_user_ectest; +create user regress_user_ectest PASSWORD NEON_PASSWORD_PLACEHOLDER; grant select on ec0 to regress_user_ectest; grant select on ec1 to regress_user_ectest; diff --git a/src/test/regress/sql/event_trigger.sql b/src/test/regress/sql/event_trigger.sql index 1aeaddbe71..89a410ec4a 100644 --- a/src/test/regress/sql/event_trigger.sql +++ b/src/test/regress/sql/event_trigger.sql @@ -86,7 +86,7 @@ create event trigger regress_event_trigger2 on ddl_command_start comment on event trigger regress_event_trigger is 'test comment'; -- drop as non-superuser should fail -create role regress_evt_user; +create role regress_evt_user PASSWORD NEON_PASSWORD_PLACEHOLDER; set role regress_evt_user; create event trigger regress_event_trigger_noperms on ddl_command_start execute procedure test_event_trigger(); diff --git a/src/test/regress/sql/foreign_data.sql b/src/test/regress/sql/foreign_data.sql index aa147b14a9..370e0dd570 100644 --- a/src/test/regress/sql/foreign_data.sql +++ b/src/test/regress/sql/foreign_data.sql @@ -22,14 +22,14 @@ DROP ROLE IF EXISTS regress_foreign_data_user, regress_test_role, regress_test_r RESET client_min_messages; -CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER; +CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION 'regress_foreign_data_user'; -CREATE ROLE regress_test_role; -CREATE ROLE regress_test_role2; -CREATE ROLE regress_test_role_super SUPERUSER; -CREATE ROLE regress_test_indirect; -CREATE ROLE regress_unprivileged_role; +CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_indirect PASSWORD NEON_PASSWORD_PLACEHOLDER; 
+CREATE ROLE regress_unprivileged_role PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE FOREIGN DATA WRAPPER dummy; COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql index 9f4210b26e..620d3fc87e 100644 --- a/src/test/regress/sql/foreign_key.sql +++ b/src/test/regress/sql/foreign_key.sql @@ -1435,7 +1435,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES DROP TABLE fk_partitioned_pk_6, fk_partitioned_fk_6; -- test the case when the referenced table is owned by a different user -create role regress_other_partitioned_fk_owner; +create role regress_other_partitioned_fk_owner PASSWORD NEON_PASSWORD_PLACEHOLDER; grant references on fk_notpartitioned_pk to regress_other_partitioned_fk_owner; set role regress_other_partitioned_fk_owner; create table other_partitioned_fk(a int, b int) partition by list (a); diff --git a/src/test/regress/sql/generated.sql b/src/test/regress/sql/generated.sql index 298f6b3aa8..f058913ae0 100644 --- a/src/test/regress/sql/generated.sql +++ b/src/test/regress/sql/generated.sql @@ -263,7 +263,7 @@ ALTER TABLE gtest10a DROP COLUMN b; INSERT INTO gtest10a (a) VALUES (1); -- privileges -CREATE USER regress_user11; +CREATE USER regress_user11 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE gtest11s (a int PRIMARY KEY, b int, c int GENERATED ALWAYS AS (b * 2) STORED); INSERT INTO gtest11s VALUES (1, 10), (2, 20); diff --git a/src/test/regress/sql/guc.sql b/src/test/regress/sql/guc.sql index dc79761955..a9ead75349 100644 --- a/src/test/regress/sql/guc.sql +++ b/src/test/regress/sql/guc.sql @@ -188,7 +188,7 @@ PREPARE foo AS SELECT 1; LISTEN foo_event; SET vacuum_cost_delay = 13; CREATE TEMP TABLE tmp_foo (data text) ON COMMIT DELETE ROWS; -CREATE ROLE regress_guc_user; +CREATE ROLE regress_guc_user PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_guc_user; -- look changes SELECT pg_listening_channels(); diff 
--git a/src/test/regress/sql/hash_index.sql b/src/test/regress/sql/hash_index.sql index 527024f710..de49c0b85f 100644 --- a/src/test/regress/sql/hash_index.sql +++ b/src/test/regress/sql/hash_index.sql @@ -26,10 +26,14 @@ CREATE TABLE hash_f8_heap ( ); \set filename :abs_srcdir '/data/hash.data' -COPY hash_i4_heap FROM :'filename'; -COPY hash_name_heap FROM :'filename'; -COPY hash_txt_heap FROM :'filename'; -COPY hash_f8_heap FROM :'filename'; +\set command '\\copy hash_i4_heap FROM ' :'filename'; +:command +\set command '\\copy hash_name_heap FROM ' :'filename'; +:command +\set command '\\copy hash_txt_heap FROM ' :'filename'; +:command +\set command '\\copy hash_f8_heap FROM ' :'filename'; +:command -- the data in this file has a lot of duplicates in the index key -- fields, leading to long bucket chains and lots of table expansion. diff --git a/src/test/regress/sql/identity.sql b/src/test/regress/sql/identity.sql index 7537258a75..9041e35e34 100644 --- a/src/test/regress/sql/identity.sql +++ b/src/test/regress/sql/identity.sql @@ -287,7 +287,7 @@ ALTER TABLE itest7 ALTER COLUMN a RESTART; ALTER TABLE itest7 ALTER COLUMN a DROP IDENTITY; -- privileges -CREATE USER regress_identity_user1; +CREATE USER regress_identity_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE itest8 (a int GENERATED ALWAYS AS IDENTITY, b text); GRANT SELECT, INSERT ON itest8 TO regress_identity_user1; SET ROLE regress_identity_user1; diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql index b5b554a125..109889ad24 100644 --- a/src/test/regress/sql/inherit.sql +++ b/src/test/regress/sql/inherit.sql @@ -958,7 +958,7 @@ create index on permtest_parent (left(c, 3)); insert into permtest_parent select 1, 'a', left(fipshash(i::text), 5) from generate_series(0, 100) i; analyze permtest_parent; -create role regress_no_child_access; +create role regress_no_child_access PASSWORD NEON_PASSWORD_PLACEHOLDER; revoke all on permtest_grandchild from 
regress_no_child_access; grant select on permtest_parent to regress_no_child_access; set session authorization regress_no_child_access; diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index 2b086eeb6d..913d8a0aed 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -513,7 +513,7 @@ drop table mlparted5; create table key_desc (a int, b int) partition by list ((a+0)); create table key_desc_1 partition of key_desc for values in (1) partition by range (b); -create user regress_insert_other_user; +create user regress_insert_other_user PASSWORD NEON_PASSWORD_PLACEHOLDER; grant select (a) on key_desc_1 to regress_insert_other_user; grant insert on key_desc to regress_insert_other_user; @@ -597,7 +597,7 @@ insert into brtrigpartcon1 values (1, 'hi there'); -- check that the message shows the appropriate column description in a -- situation where the partitioned table is not the primary ModifyTable node create table inserttest3 (f1 text default 'foo', f2 text default 'bar', f3 int); -create role regress_coldesc_role; +create role regress_coldesc_role PASSWORD NEON_PASSWORD_PLACEHOLDER; grant insert on inserttest3 to regress_coldesc_role; grant insert on brtrigpartcon to regress_coldesc_role; revoke select on brtrigpartcon from regress_coldesc_role; diff --git a/src/test/regress/sql/jsonb.sql b/src/test/regress/sql/jsonb.sql index 6dae715afd..aa320ba7be 100644 --- a/src/test/regress/sql/jsonb.sql +++ b/src/test/regress/sql/jsonb.sql @@ -6,7 +6,8 @@ CREATE TABLE testjsonb ( ); \set filename :abs_srcdir '/data/jsonb.data' -COPY testjsonb FROM :'filename'; +\set command '\\copy testjsonb FROM ' :'filename'; +:command -- Strings. SELECT '""'::jsonb; -- OK. 
diff --git a/src/test/regress/sql/largeobject.sql b/src/test/regress/sql/largeobject.sql index a4aee02e3a..8839c9496a 100644 --- a/src/test/regress/sql/largeobject.sql +++ b/src/test/regress/sql/largeobject.sql @@ -10,7 +10,7 @@ SET bytea_output TO escape; -- Test ALTER LARGE OBJECT OWNER -CREATE ROLE regress_lo_user; +CREATE ROLE regress_lo_user PASSWORD NEON_PASSWORD_PLACEHOLDER; SELECT lo_create(42); ALTER LARGE OBJECT 42 OWNER TO regress_lo_user; @@ -189,7 +189,8 @@ SELECT lo_unlink(loid) from lotest_stash_values; TRUNCATE lotest_stash_values; \set filename :abs_srcdir '/data/tenk.data' -INSERT INTO lotest_stash_values (loid) SELECT lo_import(:'filename'); +\lo_import :filename +INSERT INTO lotest_stash_values (loid) VALUES (:LASTOID); BEGIN; UPDATE lotest_stash_values SET fd=lo_open(loid, CAST(x'20000' | x'40000' AS integer)); @@ -219,8 +220,8 @@ SELECT lo_close(fd) FROM lotest_stash_values; END; \set filename :abs_builddir '/results/lotest.txt' -SELECT lo_export(loid, :'filename') FROM lotest_stash_values; - +SELECT loid FROM lotest_stash_values \gset +\lo_export :loid, :filename \lo_import :filename \set newloid :LASTOID diff --git a/src/test/regress/sql/lock.sql b/src/test/regress/sql/lock.sql index b88488c6d0..78b31e6dd3 100644 --- a/src/test/regress/sql/lock.sql +++ b/src/test/regress/sql/lock.sql @@ -19,7 +19,7 @@ CREATE VIEW lock_view3 AS SELECT * from lock_view2; CREATE VIEW lock_view4 AS SELECT (select a from lock_tbl1a limit 1) from lock_tbl1; CREATE VIEW lock_view5 AS SELECT * from lock_tbl1 where a in (select * from lock_tbl1a); CREATE VIEW lock_view6 AS SELECT * from (select * from lock_tbl1) sub; -CREATE ROLE regress_rol_lock1; +CREATE ROLE regress_rol_lock1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ALTER ROLE regress_rol_lock1 SET search_path = lock_schema1; GRANT USAGE ON SCHEMA lock_schema1 TO regress_rol_lock1; diff --git a/src/test/regress/sql/matview.sql b/src/test/regress/sql/matview.sql index 235123de1e..58e73cec5d 100644 --- 
a/src/test/regress/sql/matview.sql +++ b/src/test/regress/sql/matview.sql @@ -209,7 +209,7 @@ SELECT * FROM mvtest_mv_v; DROP TABLE mvtest_v CASCADE; -- make sure running as superuser works when MV owned by another role (bug #11208) -CREATE ROLE regress_user_mvtest; +CREATE ROLE regress_user_mvtest PASSWORD NEON_PASSWORD_PLACEHOLDER; SET ROLE regress_user_mvtest; -- this test case also checks for ambiguity in the queries issued by -- refresh_by_match_merge(), by choosing column names that intentionally @@ -264,7 +264,7 @@ ROLLBACK; -- INSERT privileges if relation owner is not allowed to insert. CREATE SCHEMA matview_schema; -CREATE USER regress_matview_user; +CREATE USER regress_matview_user PASSWORD NEON_PASSWORD_PLACEHOLDER; ALTER DEFAULT PRIVILEGES FOR ROLE regress_matview_user REVOKE INSERT ON TABLES FROM regress_matview_user; GRANT ALL ON SCHEMA matview_schema TO public; diff --git a/src/test/regress/sql/merge.sql b/src/test/regress/sql/merge.sql index 2a220a248f..91a404d51e 100644 --- a/src/test/regress/sql/merge.sql +++ b/src/test/regress/sql/merge.sql @@ -2,9 +2,9 @@ -- MERGE -- -CREATE USER regress_merge_privs; -CREATE USER regress_merge_no_privs; -CREATE USER regress_merge_none; +CREATE USER regress_merge_privs PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_merge_no_privs PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_merge_none PASSWORD NEON_PASSWORD_PLACEHOLDER; DROP TABLE IF EXISTS target; DROP TABLE IF EXISTS source; diff --git a/src/test/regress/sql/misc.sql b/src/test/regress/sql/misc.sql index 165a2e175f..08d7096e2c 100644 --- a/src/test/regress/sql/misc.sql +++ b/src/test/regress/sql/misc.sql @@ -74,22 +74,26 @@ DROP TABLE tmp; -- copy -- \set filename :abs_builddir '/results/onek.data' -COPY onek TO :'filename'; +\set command '\\copy onek TO ' :'filename'; +:command CREATE TEMP TABLE onek_copy (LIKE onek); -COPY onek_copy FROM :'filename'; +\set command '\\copy onek_copy FROM ' :'filename'; +:command SELECT * FROM onek EXCEPT 
ALL SELECT * FROM onek_copy; SELECT * FROM onek_copy EXCEPT ALL SELECT * FROM onek; \set filename :abs_builddir '/results/stud_emp.data' -COPY BINARY stud_emp TO :'filename'; +\set command '\\COPY BINARY stud_emp TO ' :'filename'; +:command CREATE TEMP TABLE stud_emp_copy (LIKE stud_emp); -COPY BINARY stud_emp_copy FROM :'filename'; +\set command '\\COPY BINARY stud_emp_copy FROM ' :'filename'; +:command SELECT * FROM stud_emp_copy; diff --git a/src/test/regress/sql/misc_functions.sql b/src/test/regress/sql/misc_functions.sql index b57f01f3e9..3e05aa6400 100644 --- a/src/test/regress/sql/misc_functions.sql +++ b/src/test/regress/sql/misc_functions.sql @@ -82,7 +82,7 @@ SELECT pg_log_backend_memory_contexts(pg_backend_pid()); SELECT pg_log_backend_memory_contexts(pid) FROM pg_stat_activity WHERE backend_type = 'checkpointer'; -CREATE ROLE regress_log_memory; +CREATE ROLE regress_log_memory PASSWORD NEON_PASSWORD_PLACEHOLDER; SELECT has_function_privilege('regress_log_memory', 'pg_log_backend_memory_contexts(integer)', 'EXECUTE'); -- no @@ -169,7 +169,7 @@ select count(*) > 0 from -- -- Test replication slot directory functions -- -CREATE ROLE regress_slot_dir_funcs; +CREATE ROLE regress_slot_dir_funcs PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Not available by default. 
SELECT has_function_privilege('regress_slot_dir_funcs', 'pg_ls_logicalsnapdir()', 'EXECUTE'); diff --git a/src/test/regress/sql/object_address.sql b/src/test/regress/sql/object_address.sql index 1a6c61f49d..1c31ac6a53 100644 --- a/src/test/regress/sql/object_address.sql +++ b/src/test/regress/sql/object_address.sql @@ -7,7 +7,7 @@ SET client_min_messages TO 'warning'; DROP ROLE IF EXISTS regress_addr_user; RESET client_min_messages; -CREATE USER regress_addr_user; +CREATE USER regress_addr_user PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Test generic object addressing/identification functions CREATE SCHEMA addr_nsp; diff --git a/src/test/regress/sql/password.sql b/src/test/regress/sql/password.sql index 53e86b0b6c..0303fdfe96 100644 --- a/src/test/regress/sql/password.sql +++ b/src/test/regress/sql/password.sql @@ -10,11 +10,11 @@ SET password_encryption = 'scram-sha-256'; -- ok -- consistency of password entries SET password_encryption = 'md5'; -CREATE ROLE regress_passwd1 PASSWORD 'role_pwd1'; -CREATE ROLE regress_passwd2 PASSWORD 'role_pwd2'; +CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET password_encryption = 'scram-sha-256'; -CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3'; -CREATE ROLE regress_passwd4 PASSWORD NULL; +CREATE ROLE regress_passwd3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- check list of created entries -- @@ -42,26 +42,18 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; --- already encrypted, use as they are -ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; -ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD 
NEON_PASSWORD_PLACEHOLDER; SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; --- already encrypted with MD5, use as it is -CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; +-- Neon does not support encrypted passwords, use unencrypted instead +CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; --- This looks like a valid SCRAM-SHA-256 secret, but it is not --- so it should be hashed with SCRAM-SHA-256. -CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; --- These may look like valid MD5 secrets, but they are not, so they --- should be hashed with SCRAM-SHA-256. --- trailing garbage at the end -CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; --- invalid length -CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; +-- Neon does not support encrypted passwords, use unencrypted instead +CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Changing the SCRAM iteration count SET scram_iterations = 1024; @@ -78,13 +70,10 @@ ALTER ROLE regress_passwd_empty PASSWORD 'md585939a5ce845f1a1b620742e3c659e0a'; ALTER ROLE regress_passwd_empty PASSWORD 'SCRAM-SHA-256$4096:hpFyHTUsSWcR7O9P$LgZFIt6Oqdo27ZFKbZ2nV+vtnYM995pDh9ca6WSi120=:qVV5NeluNfUPkwm7Vqat25RjSPLkGeoZBQs6wVv+um4='; SELECT rolpassword FROM pg_authid WHERE rolname='regress_passwd_empty'; --- Test with invalid stored and server keys. --- --- The first is valid, to act as a control. The others have too long --- stored/server keys. They will be re-hashed. 
-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; +-- Neon does not support encrypted passwords, use unencrypted instead +CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Check that the invalid secrets were re-hashed. A re-hashed secret -- should not contain the original salt. 
diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index 259f1aedd1..6e1a3d17b7 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -24,18 +24,18 @@ RESET client_min_messages; -- test proper begins here -CREATE USER regress_priv_user1; -CREATE USER regress_priv_user2; -CREATE USER regress_priv_user3; -CREATE USER regress_priv_user4; -CREATE USER regress_priv_user5; -CREATE USER regress_priv_user5; -- duplicate -CREATE USER regress_priv_user6; -CREATE USER regress_priv_user7; -CREATE USER regress_priv_user8; -CREATE USER regress_priv_user9; -CREATE USER regress_priv_user10; -CREATE ROLE regress_priv_role; +CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- duplicate +CREATE USER regress_priv_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user7 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user8 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user9 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user10 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_priv_role PASSWORD NEON_PASSWORD_PLACEHOLDER; -- circular ADMIN OPTION grants should be disallowed GRANT regress_priv_user1 TO regress_priv_user2 WITH ADMIN OPTION; @@ -84,11 +84,11 @@ DROP ROLE regress_priv_user5; -- should fail, dependency DROP ROLE regress_priv_user1, regress_priv_user5; -- ok, despite order -- recreate the roles we just dropped -CREATE USER regress_priv_user1; -CREATE USER regress_priv_user2; -CREATE USER regress_priv_user3; -CREATE USER regress_priv_user4; -CREATE USER regress_priv_user5; +CREATE 
USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT pg_read_all_data TO regress_priv_user6; GRANT pg_write_all_data TO regress_priv_user7; @@ -163,8 +163,8 @@ DROP USER regress_priv_user10; DROP USER regress_priv_user9; DROP USER regress_priv_user8; -CREATE GROUP regress_priv_group1; -CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 USER regress_priv_user2; +CREATE GROUP regress_priv_group1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2; ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4; @@ -1160,7 +1160,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP -- security-restricted operations \c - -CREATE ROLE regress_sro_user; +CREATE ROLE regress_sro_user PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Check that index expressions and predicates are run as the table's owner @@ -1656,8 +1656,8 @@ DROP SCHEMA testns CASCADE; -- Change owner of the schema & and rename of new schema owner \c - -CREATE ROLE regress_schemauser1 superuser login; -CREATE ROLE regress_schemauser2 superuser login; +CREATE ROLE regress_schemauser1 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_schemauser2 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION ROLE regress_schemauser1; CREATE SCHEMA testns; @@ -1751,7 +1751,7 @@ DROP USER regress_priv_user8; -- does not exist -- permissions with LOCK TABLE -CREATE USER regress_locktable_user; +CREATE USER regress_locktable_user PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE lock_table (a int); -- LOCK TABLE and SELECT permission @@ -1839,7 +1839,7 @@ DROP USER 
regress_locktable_user; -- switch to superuser \c - -CREATE ROLE regress_readallstats; +CREATE ROLE regress_readallstats PASSWORD NEON_PASSWORD_PLACEHOLDER; SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no @@ -1859,10 +1859,10 @@ RESET ROLE; DROP ROLE regress_readallstats; -- test role grantor machinery -CREATE ROLE regress_group; -CREATE ROLE regress_group_direct_manager; -CREATE ROLE regress_group_indirect_manager; -CREATE ROLE regress_group_member; +CREATE ROLE regress_group PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_group_direct_manager PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_group_indirect_manager PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_group_member PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE; GRANT regress_group_direct_manager TO regress_group_indirect_manager; @@ -1884,9 +1884,9 @@ DROP ROLE regress_group_indirect_manager; DROP ROLE regress_group_member; -- test SET and INHERIT options with object ownership changes -CREATE ROLE regress_roleoption_protagonist; -CREATE ROLE regress_roleoption_donor; -CREATE ROLE regress_roleoption_recipient; +CREATE ROLE regress_roleoption_protagonist PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_roleoption_donor PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_roleoption_recipient PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE SCHEMA regress_roleoption; GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC; GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE; diff --git a/src/test/regress/sql/psql.sql b/src/test/regress/sql/psql.sql index f3bc6cd07e..f1a2f58069 100644 --- a/src/test/regress/sql/psql.sql +++ b/src/test/regress/sql/psql.sql @@ -496,7 +496,7 @@ select 1 where false; \pset expanded off CREATE 
SCHEMA tableam_display; -CREATE ROLE regress_display_role; +CREATE ROLE regress_display_role PASSWORD NEON_PASSWORD_PLACEHOLDER; ALTER SCHEMA tableam_display OWNER TO regress_display_role; SET search_path TO tableam_display; CREATE ACCESS METHOD heap_psql TYPE TABLE HANDLER heap_tableam_handler; @@ -1174,7 +1174,7 @@ select 1/(15-unique2) from tenk1 order by unique2 limit 19; \unset FETCH_COUNT create schema testpart; -create role regress_partitioning_role; +create role regress_partitioning_role PASSWORD NEON_PASSWORD_PLACEHOLDER; alter schema testpart owner to regress_partitioning_role; @@ -1285,7 +1285,7 @@ reset work_mem; -- check \df+ -- we have to use functions with a predictable owner name, so make a role -create role regress_psql_user superuser; +create role regress_psql_user superuser PASSWORD NEON_PASSWORD_PLACEHOLDER; begin; set session authorization regress_psql_user; @@ -1431,11 +1431,14 @@ CREATE TEMPORARY TABLE reload_output( ); SELECT 1 AS a \g :g_out_file -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command SELECT 2 AS b\; SELECT 3 AS c\; SELECT 4 AS d \g :g_out_file -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command COPY (SELECT 'foo') TO STDOUT \; COPY (SELECT 'bar') TO STDOUT \g :g_out_file -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; TRUNCATE TABLE reload_output; @@ -1452,17 +1455,20 @@ SELECT 1 AS a\; SELECT 2 AS b\; SELECT 3 AS c; -- COPY TO file -- The data goes to :g_out_file and the status to :o_out_file \set QUIET false -COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO :'g_out_file'; +\set command '\\COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO ' :'g_out_file'; +:command -- DML command status UPDATE onek SET unique1 = unique1 WHERE false; \set QUIET true \o 
-- Check the contents of the files generated. -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; TRUNCATE TABLE reload_output; -COPY reload_output(line) FROM :'o_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'o_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; TRUNCATE TABLE reload_output; @@ -1475,10 +1481,12 @@ COPY (SELECT 'foo2') TO STDOUT \; COPY (SELECT 'bar2') TO STDOUT \g :g_out_file \o -- Check the contents of the files generated. -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; TRUNCATE TABLE reload_output; -COPY reload_output(line) FROM :'o_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'o_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; DROP TABLE reload_output; @@ -1825,10 +1833,10 @@ DROP FUNCTION psql_error; \dX "no.such.database"."no.such.schema"."no.such.extended.statistics" -- check \drg and \du -CREATE ROLE regress_du_role0; -CREATE ROLE regress_du_role1; -CREATE ROLE regress_du_role2; -CREATE ROLE regress_du_admin; +CREATE ROLE regress_du_role0 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_du_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_du_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_du_admin PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT regress_du_role0 TO regress_du_admin WITH ADMIN TRUE; GRANT regress_du_role1 TO regress_du_admin WITH ADMIN TRUE; diff --git a/src/test/regress/sql/publication.sql b/src/test/regress/sql/publication.sql index d5051a5e74..b32d729271 100644 --- a/src/test/regress/sql/publication.sql +++ b/src/test/regress/sql/publication.sql @@ -1,9 +1,9 @@ -- -- PUBLICATION -- -CREATE ROLE regress_publication_user LOGIN SUPERUSER; -CREATE ROLE regress_publication_user2; -CREATE ROLE 
regress_publication_user_dummy LOGIN NOSUPERUSER; +CREATE ROLE regress_publication_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_publication_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION 'regress_publication_user'; -- suppress warning that depends on wal_level @@ -801,7 +801,7 @@ DROP PUBLICATION testpub2; DROP PUBLICATION testpub3; SET ROLE regress_publication_user; -CREATE ROLE regress_publication_user3; +CREATE ROLE regress_publication_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT regress_publication_user2 TO regress_publication_user3; SET client_min_messages = 'ERROR'; CREATE PUBLICATION testpub4 FOR TABLES IN SCHEMA pub_test; diff --git a/src/test/regress/sql/regproc.sql b/src/test/regress/sql/regproc.sql index de2aa881a8..41a675fd35 100644 --- a/src/test/regress/sql/regproc.sql +++ b/src/test/regress/sql/regproc.sql @@ -4,7 +4,7 @@ /* If objects exist, return oids */ -CREATE ROLE regress_regrole_test; +CREATE ROLE regress_regrole_test PASSWORD NEON_PASSWORD_PLACEHOLDER; -- without schemaname diff --git a/src/test/regress/sql/roleattributes.sql b/src/test/regress/sql/roleattributes.sql index c961b2d730..0859b89c4f 100644 --- a/src/test/regress/sql/roleattributes.sql +++ b/src/test/regress/sql/roleattributes.sql @@ -1,83 +1,83 @@ -- default for superuser is false -CREATE ROLE regress_test_def_superuser; +CREATE ROLE regress_test_def_superuser PASSWORD NEON_PASSWORD_PLACEHOLDER; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser'; -CREATE ROLE regress_test_superuser WITH SUPERUSER; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE 
rolname = 'regress_test_superuser'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser'; +CREATE ROLE regress_test_superuser WITH SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; ALTER ROLE regress_test_superuser WITH NOSUPERUSER; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; ALTER ROLE regress_test_superuser WITH SUPERUSER; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; -- default for inherit is true -CREATE ROLE regress_test_def_inherit; -SELECT rolname, 
rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit'; -CREATE ROLE regress_test_inherit WITH NOINHERIT; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; +CREATE ROLE regress_test_def_inherit PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit'; +CREATE ROLE regress_test_inherit WITH NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; ALTER ROLE regress_test_inherit WITH INHERIT; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; ALTER ROLE regress_test_inherit WITH NOINHERIT; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, 
rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; -- default for create role is false -CREATE ROLE regress_test_def_createrole; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole'; -CREATE ROLE regress_test_createrole WITH CREATEROLE; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; +CREATE ROLE regress_test_def_createrole PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole'; +CREATE ROLE regress_test_createrole WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; ALTER ROLE regress_test_createrole WITH NOCREATEROLE; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 
'regress_test_createrole'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; ALTER ROLE regress_test_createrole WITH CREATEROLE; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; -- default for create database is false -CREATE ROLE regress_test_def_createdb; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb'; -CREATE ROLE regress_test_createdb WITH CREATEDB; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; +CREATE ROLE regress_test_def_createdb PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb'; +CREATE ROLE regress_test_createdb WITH CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, 
rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; ALTER ROLE regress_test_createdb WITH NOCREATEDB; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; ALTER ROLE regress_test_createdb WITH CREATEDB; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; -- default for can login is false for role -CREATE ROLE regress_test_def_role_canlogin; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin'; -CREATE ROLE regress_test_role_canlogin WITH LOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; +CREATE 
ROLE regress_test_def_role_canlogin PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin'; +CREATE ROLE regress_test_role_canlogin WITH LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; ALTER ROLE regress_test_role_canlogin WITH NOLOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; ALTER ROLE regress_test_role_canlogin WITH LOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; -- default for can login 
is true for user -CREATE USER regress_test_def_user_canlogin; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin'; -CREATE USER regress_test_user_canlogin WITH NOLOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; +CREATE USER regress_test_def_user_canlogin PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin'; +CREATE USER regress_test_user_canlogin WITH NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; ALTER USER regress_test_user_canlogin WITH LOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; ALTER USER regress_test_user_canlogin WITH NOLOGIN; 
-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; -- default for replication is false -CREATE ROLE regress_test_def_replication; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication'; -CREATE ROLE regress_test_replication WITH REPLICATION; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; +CREATE ROLE regress_test_def_replication PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication'; +CREATE ROLE regress_test_replication WITH REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; ALTER ROLE regress_test_replication WITH NOREPLICATION; -SELECT rolname, rolsuper, rolinherit, 
rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; ALTER ROLE regress_test_replication WITH REPLICATION; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; -- default for bypassrls is false -CREATE ROLE regress_test_def_bypassrls; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls'; -CREATE ROLE regress_test_bypassrls WITH BYPASSRLS; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; +CREATE ROLE regress_test_def_bypassrls PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 
'regress_test_def_bypassrls'; +CREATE ROLE regress_test_bypassrls WITH BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; ALTER ROLE regress_test_bypassrls WITH NOBYPASSRLS; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; ALTER ROLE regress_test_bypassrls WITH BYPASSRLS; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; -- clean up roles DROP ROLE regress_test_def_superuser; diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index d3bfd53e23..919ce1d0c6 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -20,13 +20,13 @@ DROP SCHEMA IF EXISTS regress_rls_schema CASCADE; RESET client_min_messages; -- initial setup -CREATE USER regress_rls_alice 
NOLOGIN; -CREATE USER regress_rls_bob NOLOGIN; -CREATE USER regress_rls_carol NOLOGIN; -CREATE USER regress_rls_dave NOLOGIN; -CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN; -CREATE ROLE regress_rls_group1 NOLOGIN; -CREATE ROLE regress_rls_group2 NOLOGIN; +CREATE USER regress_rls_alice NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_rls_bob NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_rls_carol NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_rls_dave NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_rls_group1 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_rls_group2 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT regress_rls_group1 TO regress_rls_bob; GRANT regress_rls_group2 TO regress_rls_carol; @@ -2065,8 +2065,8 @@ SELECT count(*) = 0 FROM pg_depend -- DROP OWNED BY testing RESET SESSION AUTHORIZATION; -CREATE ROLE regress_rls_dob_role1; -CREATE ROLE regress_rls_dob_role2; +CREATE ROLE regress_rls_dob_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_rls_dob_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE dob_t1 (c1 int); CREATE TABLE dob_t2 (c1 int) PARTITION BY RANGE (c1); diff --git a/src/test/regress/sql/rules.sql b/src/test/regress/sql/rules.sql index 8b7e255dcd..c58d095c05 100644 --- a/src/test/regress/sql/rules.sql +++ b/src/test/regress/sql/rules.sql @@ -1356,7 +1356,7 @@ DROP TABLE ruletest2; -- Test non-SELECT rule on security invoker view. -- Should use view owner's permissions. 
-- -CREATE USER regress_rule_user1; +CREATE USER regress_rule_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE ruletest_t1 (x int); CREATE TABLE ruletest_t2 (x int); diff --git a/src/test/regress/sql/security_label.sql b/src/test/regress/sql/security_label.sql index 98e6a5f211..68c868fef2 100644 --- a/src/test/regress/sql/security_label.sql +++ b/src/test/regress/sql/security_label.sql @@ -10,8 +10,8 @@ DROP ROLE IF EXISTS regress_seclabel_user2; RESET client_min_messages; -CREATE USER regress_seclabel_user1 WITH CREATEROLE; -CREATE USER regress_seclabel_user2; +CREATE USER regress_seclabel_user1 WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_seclabel_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE seclabel_tbl1 (a int, b text); CREATE TABLE seclabel_tbl2 (x int, y text); diff --git a/src/test/regress/sql/select_into.sql b/src/test/regress/sql/select_into.sql index 689c448cc2..223ceb1d75 100644 --- a/src/test/regress/sql/select_into.sql +++ b/src/test/regress/sql/select_into.sql @@ -20,7 +20,7 @@ DROP TABLE sitmp1; -- SELECT INTO and INSERT permission, if owner is not allowed to insert. -- CREATE SCHEMA selinto_schema; -CREATE USER regress_selinto_user; +CREATE USER regress_selinto_user PASSWORD NEON_PASSWORD_PLACEHOLDER; ALTER DEFAULT PRIVILEGES FOR ROLE regress_selinto_user REVOKE INSERT ON TABLES FROM regress_selinto_user; GRANT ALL ON SCHEMA selinto_schema TO public; diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql index 33d78e16dc..cb193c9b27 100644 --- a/src/test/regress/sql/select_parallel.sql +++ b/src/test/regress/sql/select_parallel.sql @@ -464,7 +464,7 @@ SELECT 1 FROM tenk1_vw_sec rollback; -- test that function option SET ROLE works in parallel workers. 
-create role regress_parallel_worker; +create role regress_parallel_worker PASSWORD NEON_PASSWORD_PLACEHOLDER; create function set_and_report_role() returns text as $$ select current_setting('role') $$ language sql parallel safe diff --git a/src/test/regress/sql/select_views.sql b/src/test/regress/sql/select_views.sql index e742f13699..7bd0255df8 100644 --- a/src/test/regress/sql/select_views.sql +++ b/src/test/regress/sql/select_views.sql @@ -12,7 +12,7 @@ SELECT * FROM toyemp WHERE name = 'sharon'; -- -- Test for Leaky view scenario -- -CREATE ROLE regress_alice; +CREATE ROLE regress_alice PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE FUNCTION f_leak (text) RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001 diff --git a/src/test/regress/sql/sequence.sql b/src/test/regress/sql/sequence.sql index 793f1415f6..ec07c1f193 100644 --- a/src/test/regress/sql/sequence.sql +++ b/src/test/regress/sql/sequence.sql @@ -293,7 +293,7 @@ ROLLBACK; -- privileges tests -CREATE USER regress_seq_user; +CREATE USER regress_seq_user PASSWORD NEON_PASSWORD_PLACEHOLDER; -- nextval BEGIN; diff --git a/src/test/regress/sql/stats.sql b/src/test/regress/sql/stats.sql index 1e21e55c6d..2251f50c5e 100644 --- a/src/test/regress/sql/stats.sql +++ b/src/test/regress/sql/stats.sql @@ -622,23 +622,6 @@ SELECT :io_sum_shared_after_writes > :io_sum_shared_before_writes; SELECT current_setting('fsync') = 'off' OR :io_sum_shared_after_fsyncs > :io_sum_shared_before_fsyncs; --- Change the tablespace so that the table is rewritten directly, then SELECT --- from it to cause it to be read back into shared buffers. -SELECT sum(reads) AS io_sum_shared_before_reads - FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset --- Do this in a transaction to prevent spurious failures due to concurrent accesses to our newly --- rewritten table, e.g. by autovacuum. 
-BEGIN; -ALTER TABLE test_io_shared SET TABLESPACE regress_tblspace; --- SELECT from the table so that the data is read into shared buffers and --- context 'normal', object 'relation' reads are counted. -SELECT COUNT(*) FROM test_io_shared; -COMMIT; -SELECT pg_stat_force_next_flush(); -SELECT sum(reads) AS io_sum_shared_after_reads - FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset -SELECT :io_sum_shared_after_reads > :io_sum_shared_before_reads; - SELECT sum(hits) AS io_sum_shared_before_hits FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset -- Select from the table again to count hits. diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 1b80d3687b..4d8798b0b1 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -50,7 +50,7 @@ DROP TABLE ext_stats_test; CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER); CREATE STATISTICS IF NOT EXISTS ab1_a_b_stats ON a, b FROM ab1; COMMENT ON STATISTICS ab1_a_b_stats IS 'new comment'; -CREATE ROLE regress_stats_ext; +CREATE ROLE regress_stats_ext PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_stats_ext; COMMENT ON STATISTICS ab1_a_b_stats IS 'changed comment'; DROP STATISTICS ab1_a_b_stats; @@ -1607,7 +1607,7 @@ drop statistics stts_t1_expr_expr_stat; set search_path to public, stts_s1; \dX -create role regress_stats_ext nosuperuser; +create role regress_stats_ext nosuperuser PASSWORD NEON_PASSWORD_PLACEHOLDER; set role regress_stats_ext; \dX reset role; @@ -1618,7 +1618,7 @@ drop user regress_stats_ext; reset search_path; -- User with no access -CREATE USER regress_stats_user1; +CREATE USER regress_stats_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT USAGE ON SCHEMA tststats TO regress_stats_user1; SET SESSION AUTHORIZATION regress_stats_user1; SELECT * FROM tststats.priv_test_tbl; -- Permission denied diff --git a/src/test/regress/sql/subscription.sql 
b/src/test/regress/sql/subscription.sql index 444e563ff3..1a538a98a0 100644 --- a/src/test/regress/sql/subscription.sql +++ b/src/test/regress/sql/subscription.sql @@ -2,10 +2,10 @@ -- SUBSCRIPTION -- -CREATE ROLE regress_subscription_user LOGIN SUPERUSER; -CREATE ROLE regress_subscription_user2; -CREATE ROLE regress_subscription_user3 IN ROLE pg_create_subscription; -CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER; +CREATE ROLE regress_subscription_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_subscription_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_subscription_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_create_subscription; +CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION 'regress_subscription_user'; -- fail - no publications diff --git a/src/test/regress/sql/test_setup.sql b/src/test/regress/sql/test_setup.sql index 1b2d434683..b765c748b8 100644 --- a/src/test/regress/sql/test_setup.sql +++ b/src/test/regress/sql/test_setup.sql @@ -135,7 +135,8 @@ CREATE TABLE onek ( ); \set filename :abs_srcdir '/data/onek.data' -COPY onek FROM :'filename'; +\set command '\\copy onek FROM ' :'filename'; +:command VACUUM ANALYZE onek; CREATE TABLE onek2 AS SELECT * FROM onek; @@ -161,7 +162,8 @@ CREATE TABLE tenk1 ( ); \set filename :abs_srcdir '/data/tenk.data' -COPY tenk1 FROM :'filename'; +\set command '\\copy tenk1 FROM ' :'filename'; +:command VACUUM ANALYZE tenk1; CREATE TABLE tenk2 AS SELECT * FROM tenk1; @@ -174,7 +176,8 @@ CREATE TABLE person ( ); \set filename :abs_srcdir '/data/person.data' -COPY person FROM :'filename'; +\set command '\\copy person FROM ' :'filename'; +:command VACUUM ANALYZE person; CREATE TABLE emp ( @@ -183,7 +186,8 @@ CREATE TABLE emp ( ) INHERITS (person); \set filename :abs_srcdir '/data/emp.data' -COPY emp FROM :'filename'; +\set command '\\copy emp FROM ' :'filename'; +:command VACUUM ANALYZE 
emp; CREATE TABLE student ( @@ -191,7 +195,8 @@ CREATE TABLE student ( ) INHERITS (person); \set filename :abs_srcdir '/data/student.data' -COPY student FROM :'filename'; +\set command '\\copy student FROM ' :'filename'; +:command VACUUM ANALYZE student; CREATE TABLE stud_emp ( @@ -199,7 +204,8 @@ CREATE TABLE stud_emp ( ) INHERITS (emp, student); \set filename :abs_srcdir '/data/stud_emp.data' -COPY stud_emp FROM :'filename'; +\set command '\\copy stud_emp FROM ' :'filename'; +:command VACUUM ANALYZE stud_emp; CREATE TABLE road ( @@ -208,7 +214,8 @@ CREATE TABLE road ( ); \set filename :abs_srcdir '/data/streets.data' -COPY road FROM :'filename'; +\set command '\\copy road FROM ' :'filename'; +:command VACUUM ANALYZE road; CREATE TABLE ihighway () INHERITS (road); diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql index fbd26cdba4..7ec2d78eee 100644 --- a/src/test/regress/sql/tsearch.sql +++ b/src/test/regress/sql/tsearch.sql @@ -49,7 +49,8 @@ CREATE TABLE test_tsvector( ); \set filename :abs_srcdir '/data/tsearch.data' -COPY test_tsvector FROM :'filename'; +\set command '\\copy test_tsvector FROM ' :'filename'; +:command ANALYZE test_tsvector; diff --git a/src/test/regress/sql/updatable_views.sql b/src/test/regress/sql/updatable_views.sql index 0a3176e25d..7744ef68f5 100644 --- a/src/test/regress/sql/updatable_views.sql +++ b/src/test/regress/sql/updatable_views.sql @@ -425,9 +425,9 @@ DROP TABLE base_tbl CASCADE; -- permissions checks -CREATE USER regress_view_user1; -CREATE USER regress_view_user2; -CREATE USER regress_view_user3; +CREATE USER regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_view_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_view_user1; CREATE TABLE base_tbl(a int, b text, c float); @@ -1586,8 +1586,8 @@ drop view uv_iocu_view; drop table uv_iocu_tab; -- ON CONFLICT DO UPDATE permissions 
checks -create user regress_view_user1; -create user regress_view_user2; +create user regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +create user regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; set session authorization regress_view_user1; create table base_tbl(a int unique, b text, c float); diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql index 7a7bee77b9..07b480cd59 100644 --- a/src/test/regress/sql/update.sql +++ b/src/test/regress/sql/update.sql @@ -339,7 +339,7 @@ DROP FUNCTION func_parted_mod_b(); ----------------------------------------- ALTER TABLE range_parted ENABLE ROW LEVEL SECURITY; -CREATE USER regress_range_parted_user; +CREATE USER regress_range_parted_user PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT ALL ON range_parted, mintab TO regress_range_parted_user; CREATE POLICY seeall ON range_parted AS PERMISSIVE FOR SELECT USING (true); CREATE POLICY policy_range_parted ON range_parted for UPDATE USING (true) WITH CHECK (c % 2 = 0); diff --git a/src/test/regress/sql/vacuum.sql b/src/test/regress/sql/vacuum.sql index ae36b54641..5612b8e162 100644 --- a/src/test/regress/sql/vacuum.sql +++ b/src/test/regress/sql/vacuum.sql @@ -335,7 +335,7 @@ CREATE TABLE vacowned (a int); CREATE TABLE vacowned_parted (a int) PARTITION BY LIST (a); CREATE TABLE vacowned_part1 PARTITION OF vacowned_parted FOR VALUES IN (1); CREATE TABLE vacowned_part2 PARTITION OF vacowned_parted FOR VALUES IN (2); -CREATE ROLE regress_vacuum; +CREATE ROLE regress_vacuum PASSWORD NEON_PASSWORD_PLACEHOLDER; SET ROLE regress_vacuum; -- Simple table VACUUM vacowned; ================================================ FILE: compute/patches/cloud_regress_pg17.patch ================================================ diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 1c1ca7573a..6dfe537647 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -11,7 +11,8 @@ 
CREATE TABLE aggtest ( b float4 ); \set filename :abs_srcdir '/data/agg.data' -COPY aggtest FROM :'filename'; +\set command '\\copy aggtest FROM ' :'filename'; +:command ANALYZE aggtest; SELECT avg(four) AS avg_1 FROM onek; avg_1 diff --git a/src/test/regress/expected/alter_generic.out b/src/test/regress/expected/alter_generic.out index ae54cb254f..888e2ee8bc 100644 --- a/src/test/regress/expected/alter_generic.out +++ b/src/test/regress/expected/alter_generic.out @@ -15,9 +15,9 @@ DROP ROLE IF EXISTS regress_alter_generic_user1; DROP ROLE IF EXISTS regress_alter_generic_user2; DROP ROLE IF EXISTS regress_alter_generic_user3; RESET client_min_messages; -CREATE USER regress_alter_generic_user3; -CREATE USER regress_alter_generic_user2; -CREATE USER regress_alter_generic_user1 IN ROLE regress_alter_generic_user3; +CREATE USER regress_alter_generic_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_alter_generic_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_alter_generic_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE regress_alter_generic_user3; CREATE SCHEMA alt_nsp1; CREATE SCHEMA alt_nsp2; GRANT ALL ON SCHEMA alt_nsp1, alt_nsp2 TO public; @@ -370,7 +370,7 @@ ERROR: STORAGE cannot be specified in ALTER OPERATOR FAMILY DROP OPERATOR FAMILY alt_opf4 USING btree; -- Should fail. Need to be SUPERUSER to do ALTER OPERATOR FAMILY .. ADD / DROP BEGIN TRANSACTION; -CREATE ROLE regress_alter_generic_user5 NOSUPERUSER; +CREATE ROLE regress_alter_generic_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER NOSUPERUSER; CREATE OPERATOR FAMILY alt_opf5 USING btree; SET ROLE regress_alter_generic_user5; ALTER OPERATOR FAMILY alt_opf5 USING btree ADD OPERATOR 1 < (int4, int2), FUNCTION 1 btint42cmp(int4, int2); @@ -382,7 +382,7 @@ ERROR: current transaction is aborted, commands ignored until end of transactio ROLLBACK; -- Should fail. Need rights to namespace for ALTER OPERATOR FAMILY .. 
ADD / DROP BEGIN TRANSACTION; -CREATE ROLE regress_alter_generic_user6; +CREATE ROLE regress_alter_generic_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE SCHEMA alt_nsp6; REVOKE ALL ON SCHEMA alt_nsp6 FROM regress_alter_generic_user6; CREATE OPERATOR FAMILY alt_nsp6.alt_opf6 USING btree; diff --git a/src/test/regress/expected/alter_operator.out b/src/test/regress/expected/alter_operator.out index 4217ba15de..d28e3ff86e 100644 --- a/src/test/regress/expected/alter_operator.out +++ b/src/test/regress/expected/alter_operator.out @@ -119,7 +119,7 @@ ERROR: operator attribute "Restrict" not recognized -- -- Test permission check. Must be owner to ALTER OPERATOR. -- -CREATE USER regress_alter_op_user; +CREATE USER regress_alter_op_user PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_alter_op_user; ALTER OPERATOR === (boolean, boolean) SET (RESTRICT = NONE); ERROR: must be owner of operator === diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 6de74a26a9..cd59809194 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -5,7 +5,7 @@ SET client_min_messages TO 'warning'; DROP ROLE IF EXISTS regress_alter_table_user1; RESET client_min_messages; -CREATE USER regress_alter_table_user1; +CREATE USER regress_alter_table_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- -- add attribute -- @@ -3928,8 +3928,8 @@ DROP TABLE fail_part; ALTER TABLE list_parted ATTACH PARTITION nonexistent FOR VALUES IN (1); ERROR: relation "nonexistent" does not exist -- check ownership of the source table -CREATE ROLE regress_test_me; -CREATE ROLE regress_test_not_me; +CREATE ROLE regress_test_me PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_not_me PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE not_owned_by_me (LIKE list_parted); ALTER TABLE not_owned_by_me OWNER TO regress_test_not_me; SET SESSION AUTHORIZATION regress_test_me; diff --git 
a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out index a6d81fd5f9..afefd761cb 100644 --- a/src/test/regress/expected/arrays.out +++ b/src/test/regress/expected/arrays.out @@ -18,7 +18,8 @@ CREATE TABLE array_op_test ( t text[] ); \set filename :abs_srcdir '/data/array.data' -COPY array_op_test FROM :'filename'; +\set command '\\copy array_op_test FROM ' :'filename'; +:command ANALYZE array_op_test; -- -- only the 'e' array is 0-based, the others are 1-based. diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out index 510646cbce..0b3ca1f720 100644 --- a/src/test/regress/expected/btree_index.out +++ b/src/test/regress/expected/btree_index.out @@ -20,13 +20,17 @@ CREATE TABLE bt_f8_heap ( random int4 ); \set filename :abs_srcdir '/data/desc.data' -COPY bt_i4_heap FROM :'filename'; +\set command '\\copy bt_i4_heap FROM ' :'filename'; +:command \set filename :abs_srcdir '/data/hash.data' -COPY bt_name_heap FROM :'filename'; +\set command '\\copy bt_name_heap FROM ' :'filename'; +:command \set filename :abs_srcdir '/data/desc.data' -COPY bt_txt_heap FROM :'filename'; +\set command '\\copy bt_txt_heap FROM ' :'filename'; +:command \set filename :abs_srcdir '/data/hash.data' -COPY bt_f8_heap FROM :'filename'; +\set command '\\copy bt_f8_heap FROM ' :'filename'; +:command ANALYZE bt_i4_heap; ANALYZE bt_name_heap; ANALYZE bt_txt_heap; diff --git a/src/test/regress/expected/cluster.out b/src/test/regress/expected/cluster.out index a13aafff0b..f0289b5c06 100644 --- a/src/test/regress/expected/cluster.out +++ b/src/test/regress/expected/cluster.out @@ -308,7 +308,7 @@ WHERE pg_class.oid=indexrelid -- Verify that toast tables are clusterable CLUSTER pg_toast.pg_toast_826 USING pg_toast_826_index; -- Verify that clustering all tables does in fact cluster the right ones -CREATE USER regress_clstr_user; +CREATE USER regress_clstr_user PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE clstr_1 (a INT PRIMARY 
KEY); CREATE TABLE clstr_2 (a INT PRIMARY KEY); CREATE TABLE clstr_3 (a INT PRIMARY KEY); @@ -499,7 +499,7 @@ DROP TABLE clstrpart; CREATE TABLE ptnowner(i int unique) PARTITION BY LIST (i); CREATE INDEX ptnowner_i_idx ON ptnowner(i); CREATE TABLE ptnowner1 PARTITION OF ptnowner FOR VALUES IN (1); -CREATE ROLE regress_ptnowner; +CREATE ROLE regress_ptnowner PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE ptnowner2 PARTITION OF ptnowner FOR VALUES IN (2); ALTER TABLE ptnowner1 OWNER TO regress_ptnowner; SET SESSION AUTHORIZATION regress_ptnowner; diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index 7a425afe1f..2756fb2d55 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1016,7 +1016,7 @@ select * from collate_test1 where b ilike 'ABC'; reset enable_seqscan; -- schema manipulation commands -CREATE ROLE regress_test_role; +CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE SCHEMA test_schema; -- We need to do this this way to cope with varying names for encodings: SET client_min_messages TO WARNING; diff --git a/src/test/regress/expected/constraints.out b/src/test/regress/expected/constraints.out index cf0b80d616..e8e2a14a4a 100644 --- a/src/test/regress/expected/constraints.out +++ b/src/test/regress/expected/constraints.out @@ -349,7 +349,8 @@ CREATE TABLE COPY_TBL (x INT, y TEXT, z INT, CONSTRAINT COPY_CON CHECK (x > 3 AND y <> 'check failed' AND x < 7 )); \set filename :abs_srcdir '/data/constro.data' -COPY COPY_TBL FROM :'filename'; +\set command '\\copy COPY_TBL FROM ' :'filename'; +:command SELECT * FROM COPY_TBL; x | y | z ---+---------------+--- @@ -358,7 +359,8 @@ SELECT * FROM COPY_TBL; (2 rows) \set filename :abs_srcdir '/data/constrf.data' -COPY COPY_TBL FROM :'filename'; +\set command '\\copy COPY_TBL FROM ' :'filename'; +:command ERROR: new row for relation "copy_tbl" violates check constraint "copy_con" 
DETAIL: Failing row contains (7, check failed, 6). CONTEXT: COPY copy_tbl, line 2: "7 check failed 6" @@ -799,7 +801,7 @@ DETAIL: Key (f1)=(3) conflicts with key (f1)=(3). DROP TABLE deferred_excl; -- Comments -- Setup a low-level role to enforce non-superuser checks. -CREATE ROLE regress_constraint_comments; +CREATE ROLE regress_constraint_comments PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_constraint_comments; CREATE TABLE constraint_comments_tbl (a int CONSTRAINT the_constraint CHECK (a > 0)); CREATE DOMAIN constraint_comments_dom AS int CONSTRAINT the_constraint CHECK (value > 0); @@ -819,7 +821,7 @@ COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS NULL; COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS NULL; -- unauthorized user RESET SESSION AUTHORIZATION; -CREATE ROLE regress_constraint_comments_noaccess; +CREATE ROLE regress_constraint_comments_noaccess PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_constraint_comments_noaccess; COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment'; ERROR: must be owner of relation constraint_comments_tbl diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out index d785f92561..16377e5ac9 100644 --- a/src/test/regress/expected/conversion.out +++ b/src/test/regress/expected/conversion.out @@ -15,7 +15,7 @@ SELECT FROM test_enc_setup(); CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea) AS :'regresslib', 'test_enc_conversion' LANGUAGE C STRICT; -CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE; +CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_conversion_user; CREATE CONVERSION myconv FOR 'LATIN1' TO 'UTF8' FROM iso8859_1_to_utf8; -- diff --git a/src/test/regress/expected/copy.out b/src/test/regress/expected/copy.out 
index 44114089a6..fc1894a0f2 100644 --- a/src/test/regress/expected/copy.out +++ b/src/test/regress/expected/copy.out @@ -15,9 +15,11 @@ insert into copytest values('Unix',E'abc\ndef',2); insert into copytest values('Mac',E'abc\rdef',3); insert into copytest values(E'esc\\ape',E'a\\r\\\r\\\n\\nb',4); \set filename :abs_builddir '/results/copytest.csv' -copy copytest to :'filename' csv; +\set command '\\copy copytest to ' :'filename' csv; +:command create temp table copytest2 (like copytest); -copy copytest2 from :'filename' csv; +\set command '\\copy copytest2 from ' :'filename' csv; +:command select * from copytest except select * from copytest2; style | test | filler -------+------+-------- @@ -25,8 +27,10 @@ select * from copytest except select * from copytest2; truncate copytest2; --- same test but with an escape char different from quote char -copy copytest to :'filename' csv quote '''' escape E'\\'; -copy copytest2 from :'filename' csv quote '''' escape E'\\'; +\set command '\\copy copytest to ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\''; +:command +\set command '\\copy copytest2 from ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\''; +:command select * from copytest except select * from copytest2; style | test | filler -------+------+-------- @@ -66,13 +70,16 @@ insert into parted_copytest select x,1,'One' from generate_series(1,1000) x; insert into parted_copytest select x,2,'Two' from generate_series(1001,1010) x; insert into parted_copytest select x,1,'One' from generate_series(1011,1020) x; \set filename :abs_builddir '/results/parted_copytest.csv' -copy (select * from parted_copytest order by a) to :'filename'; +\set command '\\copy (select * from parted_copytest order by a) to ' :'filename'; +:command truncate parted_copytest; -copy parted_copytest from :'filename'; +\set command '\\copy parted_copytest from ' :'filename'; +:command -- Ensure COPY FREEZE errors for partitioned tables. 
begin; truncate parted_copytest; -copy parted_copytest from :'filename' (freeze); +\set command '\\copy parted_copytest from ' :'filename' (freeze); +:command ERROR: cannot perform COPY FREEZE on a partitioned table rollback; select tableoid::regclass,count(*),sum(a) from parted_copytest @@ -94,7 +101,8 @@ create trigger part_ins_trig before insert on parted_copytest_a2 for each row execute procedure part_ins_func(); -copy parted_copytest from :'filename'; +\set command '\\copy parted_copytest from ' :'filename'; +:command select tableoid::regclass,count(*),sum(a) from parted_copytest group by tableoid order by tableoid::regclass::name; tableoid | count | sum @@ -106,7 +114,8 @@ group by tableoid order by tableoid::regclass::name; truncate table parted_copytest; create index on parted_copytest (b); drop trigger part_ins_trig on parted_copytest_a2; -copy parted_copytest from stdin; +\set command '\\copy parted_copytest from ' stdin; +:command -- Ensure index entries were properly added during the copy. select * from parted_copytest where b = 1; a | b | c @@ -170,9 +179,9 @@ INFO: progress: {"type": "PIPE", "command": "COPY FROM", "relname": "tab_progre -- Generate COPY FROM report with FILE, with some excluded tuples. 
truncate tab_progress_reporting; \set filename :abs_srcdir '/data/emp.data' -copy tab_progress_reporting from :'filename' - where (salary < 2000); -INFO: progress: {"type": "FILE", "command": "COPY FROM", "relname": "tab_progress_reporting", "has_bytes_total": true, "tuples_excluded": 1, "tuples_processed": 2, "has_bytes_processed": true} +\set command '\\copy tab_progress_reporting from ' :'filename' 'where (salary < 2000)'; +:command +INFO: progress: {"type": "PIPE", "command": "COPY FROM", "relname": "tab_progress_reporting", "has_bytes_total": false, "tuples_excluded": 1, "tuples_processed": 2, "has_bytes_processed": true} drop trigger check_after_tab_progress_reporting on tab_progress_reporting; drop function notice_after_tab_progress_reporting(); drop table tab_progress_reporting; @@ -281,7 +290,8 @@ CREATE TABLE parted_si_p_odd PARTITION OF parted_si FOR VALUES IN (1); -- https://postgr.es/m/18130-7a86a7356a75209d%40postgresql.org -- https://postgr.es/m/257696.1695670946%40sss.pgh.pa.us \set filename :abs_srcdir '/data/desc.data' -COPY parted_si(id, data) FROM :'filename'; +\set command '\\COPY parted_si(id, data) FROM ' :'filename'; +:command -- An earlier bug (see commit b1ecb9b3fcf) could end up using a buffer from -- the wrong partition. This test is *not* guaranteed to trigger that bug, but -- does so when shared_buffers is small enough. To test if we encountered the diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out index 695b1b2d63..9c9addead6 100644 --- a/src/test/regress/expected/copy2.out +++ b/src/test/regress/expected/copy2.out @@ -631,8 +631,8 @@ select * from check_con_tbl; (2 rows) -- test with RLS enabled. 
-CREATE ROLE regress_rls_copy_user; -CREATE ROLE regress_rls_copy_user_colperms; +CREATE ROLE regress_rls_copy_user PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_rls_copy_user_colperms PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE rls_t1 (a int, b int, c int); COPY rls_t1 (a, b, c) from stdin; CREATE POLICY p1 ON rls_t1 FOR SELECT USING (a % 2 = 0); diff --git a/src/test/regress/expected/create_function_sql.out b/src/test/regress/expected/create_function_sql.out index 50aca5940f..42527142f6 100644 --- a/src/test/regress/expected/create_function_sql.out +++ b/src/test/regress/expected/create_function_sql.out @@ -4,7 +4,7 @@ -- Assorted tests using SQL-language functions -- -- All objects made in this test are in temp_func_test schema -CREATE USER regress_unpriv_user; +CREATE USER regress_unpriv_user PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE SCHEMA temp_func_test; GRANT ALL ON SCHEMA temp_func_test TO public; SET search_path TO temp_func_test, public; diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index cf6eac5734..3e56ea09d7 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -51,7 +51,8 @@ CREATE TABLE fast_emp4000 ( home_base box ); \set filename :abs_srcdir '/data/rect.data' -COPY slow_emp4000 FROM :'filename'; +\set command '\\copy slow_emp4000 FROM ' :'filename'; +:command INSERT INTO fast_emp4000 SELECT * FROM slow_emp4000; ANALYZE slow_emp4000; ANALYZE fast_emp4000; @@ -655,7 +656,8 @@ CREATE TABLE array_index_op_test ( t text[] ); \set filename :abs_srcdir '/data/array.data' -COPY array_index_op_test FROM :'filename'; +\set command '\\copy array_index_op_test FROM ' :'filename'; +:command ANALYZE array_index_op_test; SELECT * FROM array_index_op_test WHERE i = '{NULL}' ORDER BY seqno; seqno | i | t @@ -2966,7 +2968,7 @@ END; -- concurrently REINDEX SCHEMA CONCURRENTLY schema_to_reindex; -- Failure for unauthorized user -CREATE ROLE 
regress_reindexuser NOLOGIN; +CREATE ROLE regress_reindexuser NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION ROLE regress_reindexuser; REINDEX SCHEMA schema_to_reindex; ERROR: must be owner of schema schema_to_reindex diff --git a/src/test/regress/expected/create_procedure.out b/src/test/regress/expected/create_procedure.out index 2177ba3509..ae3ca94d00 100644 --- a/src/test/regress/expected/create_procedure.out +++ b/src/test/regress/expected/create_procedure.out @@ -421,7 +421,7 @@ ERROR: cp_testfunc1(integer) is not a procedure DROP PROCEDURE nonexistent(); ERROR: procedure nonexistent() does not exist -- privileges -CREATE USER regress_cp_user1; +CREATE USER regress_cp_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT INSERT ON cp_test TO regress_cp_user1; REVOKE EXECUTE ON PROCEDURE ptest1(text) FROM PUBLIC; SET ROLE regress_cp_user1; diff --git a/src/test/regress/expected/create_role.out b/src/test/regress/expected/create_role.out index 46d4f9efe9..fc2a28a2f6 100644 --- a/src/test/regress/expected/create_role.out +++ b/src/test/regress/expected/create_role.out @@ -1,28 +1,28 @@ -- ok, superuser can create users with any set of privileges -CREATE ROLE regress_role_super SUPERUSER; -CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS; +CREATE ROLE regress_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT CREATE ON DATABASE regression TO regress_role_admin WITH GRANT OPTION; -CREATE ROLE regress_role_limited_admin CREATEROLE; -CREATE ROLE regress_role_normal; +CREATE ROLE regress_role_limited_admin CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_role_normal PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, CREATEROLE user can't give away role attributes without having them SET SESSION AUTHORIZATION regress_role_limited_admin; -CREATE ROLE regress_nosuch_superuser SUPERUSER; +CREATE ROLE 
regress_nosuch_superuser SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: permission denied to create role DETAIL: Only roles with the SUPERUSER attribute may create roles with the SUPERUSER attribute. -CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS; +CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: permission denied to create role DETAIL: Only roles with the REPLICATION attribute may create roles with the REPLICATION attribute. -CREATE ROLE regress_nosuch_replication REPLICATION; +CREATE ROLE regress_nosuch_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: permission denied to create role DETAIL: Only roles with the REPLICATION attribute may create roles with the REPLICATION attribute. -CREATE ROLE regress_nosuch_bypassrls BYPASSRLS; +CREATE ROLE regress_nosuch_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: permission denied to create role DETAIL: Only roles with the BYPASSRLS attribute may create roles with the BYPASSRLS attribute. -CREATE ROLE regress_nosuch_createdb CREATEDB; +CREATE ROLE regress_nosuch_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: permission denied to create role DETAIL: Only roles with the CREATEDB attribute may create roles with the CREATEDB attribute. 
-- ok, can create a role without any special attributes -CREATE ROLE regress_role_limited; +CREATE ROLE regress_role_limited PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, can't give it in any of the restricted attributes ALTER ROLE regress_role_limited SUPERUSER; ERROR: permission denied to alter role @@ -39,10 +39,10 @@ DETAIL: Only roles with the BYPASSRLS attribute may change the BYPASSRLS attrib DROP ROLE regress_role_limited; -- ok, can give away these role attributes if you have them SET SESSION AUTHORIZATION regress_role_admin; -CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS; -CREATE ROLE regress_replication REPLICATION; -CREATE ROLE regress_bypassrls BYPASSRLS; -CREATE ROLE regress_createdb CREATEDB; +CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, can toggle these role attributes off and on if you have them ALTER ROLE regress_replication NOREPLICATION; ALTER ROLE regress_replication REPLICATION; @@ -58,48 +58,48 @@ ALTER ROLE regress_createdb NOSUPERUSER; ERROR: permission denied to alter role DETAIL: Only roles with the SUPERUSER attribute may change the SUPERUSER attribute. 
-- ok, having CREATEROLE is enough to create users with these privileges -CREATE ROLE regress_createrole CREATEROLE NOINHERIT; +CREATE ROLE regress_createrole CREATEROLE NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT CREATE ON DATABASE regression TO regress_createrole WITH GRANT OPTION; -CREATE ROLE regress_login LOGIN; -CREATE ROLE regress_inherit INHERIT; -CREATE ROLE regress_connection_limit CONNECTION LIMIT 5; -CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD 'foo'; -CREATE ROLE regress_password_null PASSWORD NULL; +CREATE ROLE regress_login LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_inherit INHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_connection_limit CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, backwards compatible noise words should be ignored -CREATE ROLE regress_noiseword SYSID 12345; +CREATE ROLE regress_noiseword SYSID 12345 PASSWORD NEON_PASSWORD_PLACEHOLDER; NOTICE: SYSID can no longer be specified -- fail, cannot grant membership in superuser role -CREATE ROLE regress_nosuch_super IN ROLE regress_role_super; +CREATE ROLE regress_nosuch_super IN ROLE regress_role_super PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: permission denied to grant role "regress_role_super" DETAIL: Only roles with the SUPERUSER attribute may grant roles with the SUPERUSER attribute. 
-- fail, database owner cannot have members -CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner; +CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: role "pg_database_owner" cannot have explicit members -- ok, can grant other users into a role CREATE ROLE regress_inroles ROLE regress_role_super, regress_createdb, regress_createrole, regress_login, - regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null; + regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, cannot grant a role into itself -CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive; +CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: role "regress_nosuch_recursive" is a member of role "regress_nosuch_recursive" -- ok, can grant other users into a role with admin option CREATE ROLE regress_adminroles ADMIN regress_role_super, regress_createdb, regress_createrole, regress_login, - regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null; + regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, cannot grant a role into itself with admin option -CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive; +CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER; ERROR: role "regress_nosuch_admin_recursive" is a member of role "regress_nosuch_admin_recursive" -- fail, regress_createrole does not have CREATEDB privilege SET SESSION AUTHORIZATION regress_createrole; CREATE DATABASE regress_nosuch_db; ERROR: permission denied to create database -- ok, regress_createrole can create new roles -CREATE ROLE regress_plainrole; +CREATE ROLE regress_plainrole PASSWORD 
NEON_PASSWORD_PLACEHOLDER; -- ok, roles with CREATEROLE can create new roles with it -CREATE ROLE regress_rolecreator CREATEROLE; +CREATE ROLE regress_rolecreator CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, roles with CREATEROLE can create new roles with different role -- attributes, including CREATEROLE -CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5; +CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, we should be able to modify a role we created COMMENT ON ROLE regress_hasprivs IS 'some comment'; ALTER ROLE regress_hasprivs RENAME TO regress_tenant; @@ -141,7 +141,7 @@ ERROR: permission denied to reassign objects DETAIL: Only roles with privileges of role "regress_tenant" may reassign objects owned by it. -- ok, create a role with a value for createrole_self_grant SET createrole_self_grant = 'set, inherit'; -CREATE ROLE regress_tenant2; +CREATE ROLE regress_tenant2 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT CREATE ON DATABASE regression TO regress_tenant2; -- ok, regress_tenant2 can create objects within the database SET SESSION AUTHORIZATION regress_tenant2; @@ -165,34 +165,34 @@ ALTER TABLE tenant2_table OWNER TO regress_tenant2; ERROR: must be able to SET ROLE "regress_tenant2" DROP TABLE tenant2_table; -- fail, CREATEROLE is not enough to create roles in privileged roles -CREATE ROLE regress_read_all_data IN ROLE pg_read_all_data; +CREATE ROLE regress_read_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_data; ERROR: permission denied to grant role "pg_read_all_data" DETAIL: Only roles with the ADMIN option on role "pg_read_all_data" may grant this role. 
-CREATE ROLE regress_write_all_data IN ROLE pg_write_all_data; +CREATE ROLE regress_write_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_all_data; ERROR: permission denied to grant role "pg_write_all_data" DETAIL: Only roles with the ADMIN option on role "pg_write_all_data" may grant this role. -CREATE ROLE regress_monitor IN ROLE pg_monitor; +CREATE ROLE regress_monitor PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_monitor; ERROR: permission denied to grant role "pg_monitor" DETAIL: Only roles with the ADMIN option on role "pg_monitor" may grant this role. -CREATE ROLE regress_read_all_settings IN ROLE pg_read_all_settings; +CREATE ROLE regress_read_all_settings PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_settings; ERROR: permission denied to grant role "pg_read_all_settings" DETAIL: Only roles with the ADMIN option on role "pg_read_all_settings" may grant this role. -CREATE ROLE regress_read_all_stats IN ROLE pg_read_all_stats; +CREATE ROLE regress_read_all_stats PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_stats; ERROR: permission denied to grant role "pg_read_all_stats" DETAIL: Only roles with the ADMIN option on role "pg_read_all_stats" may grant this role. -CREATE ROLE regress_stat_scan_tables IN ROLE pg_stat_scan_tables; +CREATE ROLE regress_stat_scan_tables PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_stat_scan_tables; ERROR: permission denied to grant role "pg_stat_scan_tables" DETAIL: Only roles with the ADMIN option on role "pg_stat_scan_tables" may grant this role. -CREATE ROLE regress_read_server_files IN ROLE pg_read_server_files; +CREATE ROLE regress_read_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_server_files; ERROR: permission denied to grant role "pg_read_server_files" DETAIL: Only roles with the ADMIN option on role "pg_read_server_files" may grant this role. 
-CREATE ROLE regress_write_server_files IN ROLE pg_write_server_files; +CREATE ROLE regress_write_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_server_files; ERROR: permission denied to grant role "pg_write_server_files" DETAIL: Only roles with the ADMIN option on role "pg_write_server_files" may grant this role. -CREATE ROLE regress_execute_server_program IN ROLE pg_execute_server_program; +CREATE ROLE regress_execute_server_program PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_execute_server_program; ERROR: permission denied to grant role "pg_execute_server_program" DETAIL: Only roles with the ADMIN option on role "pg_execute_server_program" may grant this role. -CREATE ROLE regress_signal_backend IN ROLE pg_signal_backend; +CREATE ROLE regress_signal_backend PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_signal_backend; ERROR: permission denied to grant role "pg_signal_backend" DETAIL: Only roles with the ADMIN option on role "pg_signal_backend" may grant this role. -- fail, role still owns database objects diff --git a/src/test/regress/expected/create_schema.out b/src/test/regress/expected/create_schema.out index 93302a07ef..1a73f083ac 100644 --- a/src/test/regress/expected/create_schema.out +++ b/src/test/regress/expected/create_schema.out @@ -2,7 +2,7 @@ -- CREATE_SCHEMA -- -- Schema creation with elements. -CREATE ROLE regress_create_schema_role SUPERUSER; +CREATE ROLE regress_create_schema_role SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Cases where schema creation fails as objects are qualified with a schema -- that does not match with what's expected. -- This checks all the object types that include schema qualifications. 
diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out index f551624afb..57f1e432d4 100644 --- a/src/test/regress/expected/create_view.out +++ b/src/test/regress/expected/create_view.out @@ -18,7 +18,8 @@ CREATE TABLE real_city ( outline path ); \set filename :abs_srcdir '/data/real_city.data' -COPY real_city FROM :'filename'; +\set command '\\copy real_city FROM ' :'filename'; +:command ANALYZE real_city; SELECT * INTO TABLE ramp diff --git a/src/test/regress/expected/database.out b/src/test/regress/expected/database.out index 4cbdbdf84d..573362850e 100644 --- a/src/test/regress/expected/database.out +++ b/src/test/regress/expected/database.out @@ -1,8 +1,6 @@ CREATE DATABASE regression_tbd ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0; ALTER DATABASE regression_tbd RENAME TO regression_utf8; -ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace; -ALTER DATABASE regression_utf8 RESET TABLESPACE; ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123; -- Test PgDatabaseToastTable. Doing this with GRANT would be slow. 
BEGIN; diff --git a/src/test/regress/expected/dependency.out b/src/test/regress/expected/dependency.out index 74d9ff2998..fad0151614 100644 --- a/src/test/regress/expected/dependency.out +++ b/src/test/regress/expected/dependency.out @@ -1,10 +1,10 @@ -- -- DEPENDENCIES -- -CREATE USER regress_dep_user; -CREATE USER regress_dep_user2; -CREATE USER regress_dep_user3; -CREATE GROUP regress_dep_group; +CREATE USER regress_dep_user PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_dep_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE GROUP regress_dep_group PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE deptest (f1 serial primary key, f2 text); GRANT SELECT ON TABLE deptest TO GROUP regress_dep_group; GRANT ALL ON TABLE deptest TO regress_dep_user, regress_dep_user2; @@ -41,9 +41,9 @@ ERROR: role "regress_dep_user3" cannot be dropped because some objects depend o DROP TABLE deptest; DROP USER regress_dep_user3; -- Test DROP OWNED -CREATE USER regress_dep_user0; -CREATE USER regress_dep_user1; -CREATE USER regress_dep_user2; +CREATE USER regress_dep_user0 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_dep_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_dep_user0; -- permission denied DROP OWNED BY regress_dep_user1; diff --git a/src/test/regress/expected/drop_if_exists.out b/src/test/regress/expected/drop_if_exists.out index 5e44c2c3ce..eb3bb329fb 100644 --- a/src/test/regress/expected/drop_if_exists.out +++ b/src/test/regress/expected/drop_if_exists.out @@ -64,9 +64,9 @@ ERROR: type "test_domain_exists" does not exist --- --- role/user/group --- -CREATE USER regress_test_u1; -CREATE ROLE regress_test_r1; -CREATE GROUP regress_test_g1; +CREATE USER regress_test_u1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_r1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE GROUP regress_test_g1 
PASSWORD NEON_PASSWORD_PLACEHOLDER; DROP USER regress_test_u2; ERROR: role "regress_test_u2" does not exist DROP USER IF EXISTS regress_test_u1, regress_test_u2; diff --git a/src/test/regress/expected/equivclass.out b/src/test/regress/expected/equivclass.out index 126f7047fe..0e2cc73426 100644 --- a/src/test/regress/expected/equivclass.out +++ b/src/test/regress/expected/equivclass.out @@ -384,7 +384,7 @@ set enable_nestloop = on; set enable_mergejoin = off; alter table ec1 enable row level security; create policy p1 on ec1 using (f1 < '5'::int8alias1); -create user regress_user_ectest; +create user regress_user_ectest PASSWORD NEON_PASSWORD_PLACEHOLDER; grant select on ec0 to regress_user_ectest; grant select on ec1 to regress_user_ectest; -- without any RLS, we'll treat {a.ff, b.ff, 43} as an EquivalenceClass diff --git a/src/test/regress/expected/event_trigger.out b/src/test/regress/expected/event_trigger.out index 7b2198eac6..39919697ad 100644 --- a/src/test/regress/expected/event_trigger.out +++ b/src/test/regress/expected/event_trigger.out @@ -85,7 +85,7 @@ create event trigger regress_event_trigger2 on ddl_command_start -- OK comment on event trigger regress_event_trigger is 'test comment'; -- drop as non-superuser should fail -create role regress_evt_user; +create role regress_evt_user PASSWORD NEON_PASSWORD_PLACEHOLDER; set role regress_evt_user; create event trigger regress_event_trigger_noperms on ddl_command_start execute procedure test_event_trigger(); diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out index 6ed50fdcfa..caa00a345d 100644 --- a/src/test/regress/expected/foreign_data.out +++ b/src/test/regress/expected/foreign_data.out @@ -14,13 +14,13 @@ CREATE FUNCTION test_fdw_handler() SET client_min_messages TO 'warning'; DROP ROLE IF EXISTS regress_foreign_data_user, regress_test_role, regress_test_role2, regress_test_role_super, regress_test_indirect, regress_unprivileged_role; RESET 
client_min_messages; -CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER; +CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION 'regress_foreign_data_user'; -CREATE ROLE regress_test_role; -CREATE ROLE regress_test_role2; -CREATE ROLE regress_test_role_super SUPERUSER; -CREATE ROLE regress_test_indirect; -CREATE ROLE regress_unprivileged_role; +CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_indirect PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_unprivileged_role PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE FOREIGN DATA WRAPPER dummy; COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator; diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out index fe6a1015f2..614b387b7d 100644 --- a/src/test/regress/expected/foreign_key.out +++ b/src/test/regress/expected/foreign_key.out @@ -1985,7 +1985,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES ERROR: cannot ALTER TABLE "fk_partitioned_pk_61" because it is being used by active queries in this session DROP TABLE fk_partitioned_pk_6, fk_partitioned_fk_6; -- test the case when the referenced table is owned by a different user -create role regress_other_partitioned_fk_owner; +create role regress_other_partitioned_fk_owner PASSWORD NEON_PASSWORD_PLACEHOLDER; grant references on fk_notpartitioned_pk to regress_other_partitioned_fk_owner; set role regress_other_partitioned_fk_owner; create table other_partitioned_fk(a int, b int) partition by list (a); diff --git a/src/test/regress/expected/generated.out b/src/test/regress/expected/generated.out index 499072e14c..bd7a8b3f18 100644 --- 
a/src/test/regress/expected/generated.out +++ b/src/test/regress/expected/generated.out @@ -534,7 +534,7 @@ CREATE TABLE gtest10a (a int PRIMARY KEY, b int GENERATED ALWAYS AS (a * 2) STOR ALTER TABLE gtest10a DROP COLUMN b; INSERT INTO gtest10a (a) VALUES (1); -- privileges -CREATE USER regress_user11; +CREATE USER regress_user11 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE gtest11s (a int PRIMARY KEY, b int, c int GENERATED ALWAYS AS (b * 2) STORED); INSERT INTO gtest11s VALUES (1, 10), (2, 20); GRANT SELECT (a, c) ON gtest11s TO regress_user11; diff --git a/src/test/regress/expected/guc.out b/src/test/regress/expected/guc.out index 455b6d6c0c..12fa350c6d 100644 --- a/src/test/regress/expected/guc.out +++ b/src/test/regress/expected/guc.out @@ -584,7 +584,7 @@ PREPARE foo AS SELECT 1; LISTEN foo_event; SET vacuum_cost_delay = 13; CREATE TEMP TABLE tmp_foo (data text) ON COMMIT DELETE ROWS; -CREATE ROLE regress_guc_user; +CREATE ROLE regress_guc_user PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_guc_user; -- look changes SELECT pg_listening_channels(); diff --git a/src/test/regress/expected/hash_index.out b/src/test/regress/expected/hash_index.out index 0d4bdb2ade..9a5a9b5407 100644 --- a/src/test/regress/expected/hash_index.out +++ b/src/test/regress/expected/hash_index.out @@ -20,10 +20,14 @@ CREATE TABLE hash_f8_heap ( random float8 ); \set filename :abs_srcdir '/data/hash.data' -COPY hash_i4_heap FROM :'filename'; -COPY hash_name_heap FROM :'filename'; -COPY hash_txt_heap FROM :'filename'; -COPY hash_f8_heap FROM :'filename'; +\set command '\\copy hash_i4_heap FROM ' :'filename'; +:command +\set command '\\copy hash_name_heap FROM ' :'filename'; +:command +\set command '\\copy hash_txt_heap FROM ' :'filename'; +:command +\set command '\\copy hash_f8_heap FROM ' :'filename'; +:command -- the data in this file has a lot of duplicates in the index key -- fields, leading to long bucket chains and lots of table expansion. 
-- this is therefore a stress test of the bucket overflow code (unlike diff --git a/src/test/regress/expected/identity.out b/src/test/regress/expected/identity.out index f14bfccfb1..bbb2092df9 100644 --- a/src/test/regress/expected/identity.out +++ b/src/test/regress/expected/identity.out @@ -520,7 +520,7 @@ ALTER TABLE itest7 ALTER COLUMN a SET GENERATED BY DEFAULT; ALTER TABLE itest7 ALTER COLUMN a RESTART; ALTER TABLE itest7 ALTER COLUMN a DROP IDENTITY; -- privileges -CREATE USER regress_identity_user1; +CREATE USER regress_identity_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE itest8 (a int GENERATED ALWAYS AS IDENTITY, b text); GRANT SELECT, INSERT ON itest8 TO regress_identity_user1; SET ROLE regress_identity_user1; diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index 85240a9b0b..5294f7557d 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -2055,8 +2055,8 @@ NOTICE: drop cascades to table cnullchild -- -- Mixed ownership inheritance tree -- -create role regress_alice; -create role regress_bob; +create role regress_alice password NEON_PASSWORD_PLACEHOLDER; +create role regress_bob password NEON_PASSWORD_PLACEHOLDER; grant all on schema public to regress_alice, regress_bob; grant regress_alice to regress_bob; set session authorization regress_alice; @@ -2789,7 +2789,7 @@ create index on permtest_parent (left(c, 3)); insert into permtest_parent select 1, 'a', left(fipshash(i::text), 5) from generate_series(0, 100) i; analyze permtest_parent; -create role regress_no_child_access; +create role regress_no_child_access PASSWORD NEON_PASSWORD_PLACEHOLDER; revoke all on permtest_grandchild from regress_no_child_access; grant select on permtest_parent to regress_no_child_access; set session authorization regress_no_child_access; diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index cf4b5221a8..fa6ccb639c 100644 --- 
a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -802,7 +802,7 @@ drop table mlparted5; -- appropriate key description (or none) in various situations create table key_desc (a int, b int) partition by list ((a+0)); create table key_desc_1 partition of key_desc for values in (1) partition by range (b); -create user regress_insert_other_user; +create user regress_insert_other_user PASSWORD NEON_PASSWORD_PLACEHOLDER; grant select (a) on key_desc_1 to regress_insert_other_user; grant insert on key_desc to regress_insert_other_user; set role regress_insert_other_user; @@ -914,7 +914,7 @@ DETAIL: Failing row contains (2, hi there). -- check that the message shows the appropriate column description in a -- situation where the partitioned table is not the primary ModifyTable node create table inserttest3 (f1 text default 'foo', f2 text default 'bar', f3 int); -create role regress_coldesc_role; +create role regress_coldesc_role PASSWORD NEON_PASSWORD_PLACEHOLDER; grant insert on inserttest3 to regress_coldesc_role; grant insert on brtrigpartcon to regress_coldesc_role; revoke select on brtrigpartcon from regress_coldesc_role; diff --git a/src/test/regress/expected/jsonb.out b/src/test/regress/expected/jsonb.out index e66d760189..86348fd416 100644 --- a/src/test/regress/expected/jsonb.out +++ b/src/test/regress/expected/jsonb.out @@ -4,7 +4,8 @@ CREATE TABLE testjsonb ( j jsonb ); \set filename :abs_srcdir '/data/jsonb.data' -COPY testjsonb FROM :'filename'; +\set command '\\copy testjsonb FROM ' :'filename'; +:command -- Strings. SELECT '""'::jsonb; -- OK. 
jsonb diff --git a/src/test/regress/expected/largeobject.out b/src/test/regress/expected/largeobject.out index 4921dd79ae..d18a3cdd66 100644 --- a/src/test/regress/expected/largeobject.out +++ b/src/test/regress/expected/largeobject.out @@ -7,7 +7,7 @@ -- ensure consistent test output regardless of the default bytea format SET bytea_output TO escape; -- Test ALTER LARGE OBJECT OWNER -CREATE ROLE regress_lo_user; +CREATE ROLE regress_lo_user PASSWORD NEON_PASSWORD_PLACEHOLDER; SELECT lo_create(42); lo_create ----------- @@ -346,7 +346,8 @@ SELECT lo_unlink(loid) from lotest_stash_values; TRUNCATE lotest_stash_values; \set filename :abs_srcdir '/data/tenk.data' -INSERT INTO lotest_stash_values (loid) SELECT lo_import(:'filename'); +\lo_import :filename +INSERT INTO lotest_stash_values (loid) VALUES (:LASTOID); BEGIN; UPDATE lotest_stash_values SET fd=lo_open(loid, CAST(x'20000' | x'40000' AS integer)); -- verify length of large object @@ -410,12 +411,8 @@ SELECT lo_close(fd) FROM lotest_stash_values; END; \set filename :abs_builddir '/results/lotest.txt' -SELECT lo_export(loid, :'filename') FROM lotest_stash_values; - lo_export ------------ - 1 -(1 row) - +SELECT loid FROM lotest_stash_values \gset +\lo_export :loid, :filename \lo_import :filename \set newloid :LASTOID -- just make sure \lo_export does not barf diff --git a/src/test/regress/expected/lock.out b/src/test/regress/expected/lock.out index ad137d3645..8dac447436 100644 --- a/src/test/regress/expected/lock.out +++ b/src/test/regress/expected/lock.out @@ -16,7 +16,7 @@ CREATE VIEW lock_view3 AS SELECT * from lock_view2; CREATE VIEW lock_view4 AS SELECT (select a from lock_tbl1a limit 1) from lock_tbl1; CREATE VIEW lock_view5 AS SELECT * from lock_tbl1 where a in (select * from lock_tbl1a); CREATE VIEW lock_view6 AS SELECT * from (select * from lock_tbl1) sub; -CREATE ROLE regress_rol_lock1; +CREATE ROLE regress_rol_lock1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ALTER ROLE regress_rol_lock1 SET search_path = 
lock_schema1; GRANT USAGE ON SCHEMA lock_schema1 TO regress_rol_lock1; -- Try all valid lock options; also try omitting the optional TABLE keyword. diff --git a/src/test/regress/expected/matview.out b/src/test/regress/expected/matview.out index 038ab73517..bd471f9fac 100644 --- a/src/test/regress/expected/matview.out +++ b/src/test/regress/expected/matview.out @@ -549,7 +549,7 @@ SELECT * FROM mvtest_mv_v; DROP TABLE mvtest_v CASCADE; NOTICE: drop cascades to materialized view mvtest_mv_v -- make sure running as superuser works when MV owned by another role (bug #11208) -CREATE ROLE regress_user_mvtest; +CREATE ROLE regress_user_mvtest PASSWORD NEON_PASSWORD_PLACEHOLDER; SET ROLE regress_user_mvtest; -- this test case also checks for ambiguity in the queries issued by -- refresh_by_match_merge(), by choosing column names that intentionally @@ -617,7 +617,7 @@ HINT: Use the REFRESH MATERIALIZED VIEW command. ROLLBACK; -- INSERT privileges if relation owner is not allowed to insert. CREATE SCHEMA matview_schema; -CREATE USER regress_matview_user; +CREATE USER regress_matview_user PASSWORD NEON_PASSWORD_PLACEHOLDER; ALTER DEFAULT PRIVILEGES FOR ROLE regress_matview_user REVOKE INSERT ON TABLES FROM regress_matview_user; GRANT ALL ON SCHEMA matview_schema TO public; diff --git a/src/test/regress/expected/merge.out b/src/test/regress/expected/merge.out index 521d70a891..7fd218f3d8 100644 --- a/src/test/regress/expected/merge.out +++ b/src/test/regress/expected/merge.out @@ -1,9 +1,9 @@ -- -- MERGE -- -CREATE USER regress_merge_privs; -CREATE USER regress_merge_no_privs; -CREATE USER regress_merge_none; +CREATE USER regress_merge_privs PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_merge_no_privs PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_merge_none PASSWORD NEON_PASSWORD_PLACEHOLDER; DROP TABLE IF EXISTS target; NOTICE: table "target" does not exist, skipping DROP TABLE IF EXISTS source; diff --git a/src/test/regress/expected/misc.out 
b/src/test/regress/expected/misc.out index 6e816c57f1..6ef45b468e 100644 --- a/src/test/regress/expected/misc.out +++ b/src/test/regress/expected/misc.out @@ -59,9 +59,11 @@ DROP TABLE tmp; -- copy -- \set filename :abs_builddir '/results/onek.data' -COPY onek TO :'filename'; +\set command '\\copy onek TO ' :'filename'; +:command CREATE TEMP TABLE onek_copy (LIKE onek); -COPY onek_copy FROM :'filename'; +\set command '\\copy onek_copy FROM ' :'filename'; +:command SELECT * FROM onek EXCEPT ALL SELECT * FROM onek_copy; unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 ---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+--------- @@ -73,9 +75,11 @@ SELECT * FROM onek_copy EXCEPT ALL SELECT * FROM onek; (0 rows) \set filename :abs_builddir '/results/stud_emp.data' -COPY BINARY stud_emp TO :'filename'; +\set command '\\COPY BINARY stud_emp TO ' :'filename'; +:command CREATE TEMP TABLE stud_emp_copy (LIKE stud_emp); -COPY BINARY stud_emp_copy FROM :'filename'; +\set command '\\COPY BINARY stud_emp_copy FROM ' :'filename'; +:command SELECT * FROM stud_emp_copy; name | age | location | salary | manager | gpa | percent -------+-----+------------+--------+---------+-----+--------- diff --git a/src/test/regress/expected/misc_functions.out b/src/test/regress/expected/misc_functions.out index d94056862a..f8270d8343 100644 --- a/src/test/regress/expected/misc_functions.out +++ b/src/test/regress/expected/misc_functions.out @@ -297,7 +297,7 @@ SELECT pg_log_backend_memory_contexts(pid) FROM pg_stat_activity t (1 row) -CREATE ROLE regress_log_memory; +CREATE ROLE regress_log_memory PASSWORD NEON_PASSWORD_PLACEHOLDER; SELECT has_function_privilege('regress_log_memory', 'pg_log_backend_memory_contexts(integer)', 'EXECUTE'); -- no has_function_privilege @@ -483,7 +483,7 @@ select count(*) > 0 from 
-- -- Test replication slot directory functions -- -CREATE ROLE regress_slot_dir_funcs; +CREATE ROLE regress_slot_dir_funcs PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Not available by default. SELECT has_function_privilege('regress_slot_dir_funcs', 'pg_ls_logicalsnapdir()', 'EXECUTE'); @@ -671,7 +671,7 @@ FROM pg_walfile_name_offset('0/0'::pg_lsn + :segment_size - 1), (1 row) -- pg_current_logfile -CREATE ROLE regress_current_logfile; +CREATE ROLE regress_current_logfile PASSWORD NEON_PASSWORD_PLACEHOLDER; -- not available by default SELECT has_function_privilege('regress_current_logfile', 'pg_current_logfile()', 'EXECUTE'); diff --git a/src/test/regress/expected/multirangetypes.out b/src/test/regress/expected/multirangetypes.out index c6363ebeb2..8f43732404 100644 --- a/src/test/regress/expected/multirangetypes.out +++ b/src/test/regress/expected/multirangetypes.out @@ -3118,7 +3118,7 @@ drop type textrange2; -- Multiranges don't have their own ownership or permissions. -- create type textrange1 as range(subtype=text, multirange_type_name=multitextrange1, collation="C"); -create role regress_multirange_owner; +create role regress_multirange_owner password NEON_PASSWORD_PLACEHOLDER; alter type multitextrange1 owner to regress_multirange_owner; -- fail ERROR: cannot alter multirange type multitextrange1 HINT: You can alter type textrange1, which will alter the multirange type as well. 
diff --git a/src/test/regress/expected/object_address.out b/src/test/regress/expected/object_address.out index fc42d418bf..e38f517574 100644 --- a/src/test/regress/expected/object_address.out +++ b/src/test/regress/expected/object_address.out @@ -5,7 +5,7 @@ SET client_min_messages TO 'warning'; DROP ROLE IF EXISTS regress_addr_user; RESET client_min_messages; -CREATE USER regress_addr_user; +CREATE USER regress_addr_user PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Test generic object addressing/identification functions CREATE SCHEMA addr_nsp; SET search_path TO 'addr_nsp'; diff --git a/src/test/regress/expected/password.out b/src/test/regress/expected/password.out index 924d6e001d..7fdda73439 100644 --- a/src/test/regress/expected/password.out +++ b/src/test/regress/expected/password.out @@ -12,13 +12,11 @@ SET password_encryption = 'md5'; -- ok SET password_encryption = 'scram-sha-256'; -- ok -- consistency of password entries SET password_encryption = 'md5'; -CREATE ROLE regress_passwd1; -ALTER ROLE regress_passwd1 PASSWORD 'role_pwd1'; -CREATE ROLE regress_passwd2; -ALTER ROLE regress_passwd2 PASSWORD 'role_pwd2'; +CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET password_encryption = 'scram-sha-256'; -CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3'; -CREATE ROLE regress_passwd4 PASSWORD NULL; +CREATE ROLE regress_passwd3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- check list of created entries -- -- The scram secret will look something like: @@ -32,10 +30,10 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ ORDER BY rolname, rolpassword; rolname | rolpassword_masked -----------------+--------------------------------------------------- - regress_passwd1 | md5783277baca28003b33453252be4dbb34 - regress_passwd2 | md54044304ba511dd062133eb5b4b84a2a3 + regress_passwd1 | 
NEON_MD5_PLACEHOLDER:regress_passwd1 + regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: - regress_passwd4 | + regress_passwd4 | SCRAM-SHA-256$4096:$: (4 rows) -- Rename a role @@ -56,24 +54,17 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; -- passwords. SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; --- already encrypted, use as they are -ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; -ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- already encrypted with MD5, use as it is -CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; --- This looks like a valid SCRAM-SHA-256 secret, but it is not --- so it should be hashed with SCRAM-SHA-256. -CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; --- These may look like valid MD5 secrets, but they are not, so they --- should be hashed with SCRAM-SHA-256. 
--- trailing garbage at the end -CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; --- invalid length -CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; +-- Neon does not support encrypted passwords, use unencrypted instead +CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; +-- Neon does not support encrypted passwords, use unencrypted instead +CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Changing the SCRAM iteration count SET scram_iterations = 1024; CREATE ROLE regress_passwd9 PASSWORD 'alterediterationcount'; @@ -83,11 +74,11 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ ORDER BY rolname, rolpassword; rolname | rolpassword_masked -----------------+--------------------------------------------------- - regress_passwd1 | md5cd3578025fe2c3d7ed1b9a9b26238b70 - regress_passwd2 | md5dfa155cadd5f4ad57860162f3fab9cdb + regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 + regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: regress_passwd4 | SCRAM-SHA-256$4096:$: - regress_passwd5 | md5e73a4b11df52a6068f8b39f90be36023 + regress_passwd5 | SCRAM-SHA-256$4096:$: regress_passwd6 | SCRAM-SHA-256$4096:$: regress_passwd7 | SCRAM-SHA-256$4096:$: regress_passwd8 | SCRAM-SHA-256$4096:$: @@ -97,23 +88,20 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ -- An empty password is not allowed, in any form CREATE ROLE regress_passwd_empty PASSWORD ''; NOTICE: empty string is not a valid password, clearing password +ERROR: Failed to get encrypted password: User "regress_passwd_empty" has no password assigned. 
ALTER ROLE regress_passwd_empty PASSWORD 'md585939a5ce845f1a1b620742e3c659e0a'; -NOTICE: empty string is not a valid password, clearing password +ERROR: role "regress_passwd_empty" does not exist ALTER ROLE regress_passwd_empty PASSWORD 'SCRAM-SHA-256$4096:hpFyHTUsSWcR7O9P$LgZFIt6Oqdo27ZFKbZ2nV+vtnYM995pDh9ca6WSi120=:qVV5NeluNfUPkwm7Vqat25RjSPLkGeoZBQs6wVv+um4='; -NOTICE: empty string is not a valid password, clearing password +ERROR: role "regress_passwd_empty" does not exist SELECT rolpassword FROM pg_authid WHERE rolname='regress_passwd_empty'; rolpassword ------------- - -(1 row) +(0 rows) --- Test with invalid stored and server keys. --- --- The first is valid, to act as a control. The others have too long --- stored/server keys. They will be re-hashed. -CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; +-- Neon does not support encrypted passwords, use unencrypted instead +CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Check that the invalid secrets were re-hashed. A re-hashed secret -- should not contain the original salt. 
SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassword_rehashed @@ -122,7 +110,7 @@ SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassw ORDER BY rolname; rolname | is_rolpassword_rehashed -------------------------+------------------------- - regress_passwd_sha_len0 | f + regress_passwd_sha_len0 | t regress_passwd_sha_len1 | t regress_passwd_sha_len2 | t (3 rows) @@ -137,6 +125,7 @@ DROP ROLE regress_passwd7; DROP ROLE regress_passwd8; DROP ROLE regress_passwd9; DROP ROLE regress_passwd_empty; +ERROR: role "regress_passwd_empty" does not exist DROP ROLE regress_passwd_sha_len0; DROP ROLE regress_passwd_sha_len1; DROP ROLE regress_passwd_sha_len2; diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index e8c668e0a1..03be5c2120 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -20,19 +20,19 @@ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3 RESET client_min_messages; -- test proper begins here -CREATE USER regress_priv_user1; -CREATE USER regress_priv_user2; -CREATE USER regress_priv_user3; -CREATE USER regress_priv_user4; -CREATE USER regress_priv_user5; -CREATE USER regress_priv_user5; -- duplicate +CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- duplicate ERROR: role "regress_priv_user5" already exists -CREATE USER regress_priv_user6; -CREATE USER regress_priv_user7; -CREATE USER regress_priv_user8; -CREATE USER regress_priv_user9; -CREATE USER regress_priv_user10; -CREATE ROLE regress_priv_role; +CREATE USER 
regress_priv_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user7 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user8 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user9 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user10 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_priv_role PASSWORD NEON_PASSWORD_PLACEHOLDER; -- circular ADMIN OPTION grants should be disallowed GRANT regress_priv_user1 TO regress_priv_user2 WITH ADMIN OPTION; GRANT regress_priv_user1 TO regress_priv_user3 WITH ADMIN OPTION GRANTED BY regress_priv_user2; @@ -108,11 +108,11 @@ ERROR: role "regress_priv_user5" cannot be dropped because some objects depend DETAIL: privileges for membership of role regress_priv_user6 in role regress_priv_user1 DROP ROLE regress_priv_user1, regress_priv_user5; -- ok, despite order -- recreate the roles we just dropped -CREATE USER regress_priv_user1; -CREATE USER regress_priv_user2; -CREATE USER regress_priv_user3; -CREATE USER regress_priv_user4; -CREATE USER regress_priv_user5; +CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT pg_read_all_data TO regress_priv_user6; GRANT pg_write_all_data TO regress_priv_user7; GRANT pg_read_all_settings TO regress_priv_user8 WITH ADMIN OPTION; @@ -212,8 +212,8 @@ REVOKE pg_read_all_settings FROM regress_priv_user8; DROP USER regress_priv_user10; DROP USER regress_priv_user9; DROP USER regress_priv_user8; -CREATE GROUP regress_priv_group1; -CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 USER regress_priv_user2; +CREATE GROUP regress_priv_group1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 
PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2; ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4; GRANT regress_priv_group2 TO regress_priv_user2 GRANTED BY regress_priv_user1; SET SESSION AUTHORIZATION regress_priv_user3; @@ -246,12 +246,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre ERROR: permission denied to grant privileges as role "regress_priv_role" DETAIL: The grantor must have the ADMIN option on role "regress_priv_role". GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY CURRENT_ROLE; +ERROR: permission denied to grant privileges as role "neondb_owner" +DETAIL: The grantor must have the ADMIN option on role "regress_priv_role". REVOKE ADMIN OPTION FOR regress_priv_role FROM regress_priv_user1 GRANTED BY foo; -- error ERROR: role "foo" does not exist REVOKE ADMIN OPTION FOR regress_priv_role FROM regress_priv_user1 GRANTED BY regress_priv_user2; -- warning, noop WARNING: role "regress_priv_user1" has not been granted membership in role "regress_priv_role" by role "regress_priv_user2" REVOKE ADMIN OPTION FOR regress_priv_role FROM regress_priv_user1 GRANTED BY CURRENT_USER; +WARNING: role "regress_priv_user1" has not been granted membership in role "regress_priv_role" by role "neondb_owner" REVOKE regress_priv_role FROM regress_priv_user1 GRANTED BY CURRENT_ROLE; +WARNING: role "regress_priv_user1" has not been granted membership in role "regress_priv_role" by role "neondb_owner" DROP ROLE regress_priv_role; SET SESSION AUTHORIZATION regress_priv_user1; SELECT session_user, current_user; @@ -1783,7 +1787,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP -- security-restricted operations \c - -CREATE ROLE regress_sro_user; +CREATE ROLE regress_sro_user PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Check that index expressions and predicates are run as the table's owner -- A dummy index function checking current_user CREATE FUNCTION 
sro_ifun(int) RETURNS int AS $$ @@ -2675,8 +2679,8 @@ drop cascades to function testns.priv_testagg(integer) drop cascades to function testns.priv_testproc(integer) -- Change owner of the schema & and rename of new schema owner \c - -CREATE ROLE regress_schemauser1 superuser login; -CREATE ROLE regress_schemauser2 superuser login; +CREATE ROLE regress_schemauser1 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_schemauser2 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION ROLE regress_schemauser1; CREATE SCHEMA testns; SELECT nspname, rolname FROM pg_namespace, pg_roles WHERE pg_namespace.nspname = 'testns' AND pg_namespace.nspowner = pg_roles.oid; @@ -2799,7 +2803,7 @@ DROP USER regress_priv_user7; DROP USER regress_priv_user8; -- does not exist ERROR: role "regress_priv_user8" does not exist -- permissions with LOCK TABLE -CREATE USER regress_locktable_user; +CREATE USER regress_locktable_user PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE lock_table (a int); -- LOCK TABLE and SELECT permission GRANT SELECT ON lock_table TO regress_locktable_user; @@ -2895,7 +2899,7 @@ DROP USER regress_locktable_user; -- pg_backend_memory_contexts. 
-- switch to superuser \c - -CREATE ROLE regress_readallstats; +CREATE ROLE regress_readallstats PASSWORD NEON_PASSWORD_PLACEHOLDER; SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no has_table_privilege --------------------- @@ -2939,10 +2943,10 @@ RESET ROLE; -- clean up DROP ROLE regress_readallstats; -- test role grantor machinery -CREATE ROLE regress_group; -CREATE ROLE regress_group_direct_manager; -CREATE ROLE regress_group_indirect_manager; -CREATE ROLE regress_group_member; +CREATE ROLE regress_group PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_group_direct_manager PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_group_indirect_manager PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_group_member PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE; GRANT regress_group_direct_manager TO regress_group_indirect_manager; SET SESSION AUTHORIZATION regress_group_direct_manager; @@ -2971,9 +2975,9 @@ DROP ROLE regress_group_direct_manager; DROP ROLE regress_group_indirect_manager; DROP ROLE regress_group_member; -- test SET and INHERIT options with object ownership changes -CREATE ROLE regress_roleoption_protagonist; -CREATE ROLE regress_roleoption_donor; -CREATE ROLE regress_roleoption_recipient; +CREATE ROLE regress_roleoption_protagonist PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_roleoption_donor PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_roleoption_recipient PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE SCHEMA regress_roleoption; GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC; GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE; @@ -3002,9 +3006,9 @@ DROP ROLE regress_roleoption_protagonist; DROP ROLE regress_roleoption_donor; DROP ROLE regress_roleoption_recipient; -- MAINTAIN -CREATE ROLE regress_no_maintain; -CREATE ROLE regress_maintain; 
-CREATE ROLE regress_maintain_all IN ROLE pg_maintain; +CREATE ROLE regress_no_maintain PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_maintain PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_maintain_all IN ROLE pg_maintain PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE maintain_test (a INT); CREATE INDEX ON maintain_test (a); GRANT MAINTAIN ON maintain_test TO regress_maintain; diff --git a/src/test/regress/expected/psql.out b/src/test/regress/expected/psql.out index 3bbe4c5f97..e742a46a63 100644 --- a/src/test/regress/expected/psql.out +++ b/src/test/regress/expected/psql.out @@ -2862,7 +2862,7 @@ Type | func -- check conditional am display \pset expanded off CREATE SCHEMA tableam_display; -CREATE ROLE regress_display_role; +CREATE ROLE regress_display_role PASSWORD NEON_PASSWORD_PLACEHOLDER; ALTER SCHEMA tableam_display OWNER TO regress_display_role; SET search_path TO tableam_display; CREATE ACCESS METHOD heap_psql TYPE TABLE HANDLER heap_tableam_handler; @@ -4817,7 +4817,7 @@ last error code: 22012 reset debug_parallel_query; \unset FETCH_COUNT create schema testpart; -create role regress_partitioning_role; +create role regress_partitioning_role PASSWORD NEON_PASSWORD_PLACEHOLDER; alter schema testpart owner to regress_partitioning_role; set role to regress_partitioning_role; -- run test inside own schema and hide other partitions @@ -5269,7 +5269,7 @@ reset work_mem; -- check \df+ -- we have to use functions with a predictable owner name, so make a role -create role regress_psql_user superuser; +create role regress_psql_user superuser PASSWORD NEON_PASSWORD_PLACEHOLDER; begin; set session authorization regress_psql_user; create function psql_df_internal (float8) @@ -5557,11 +5557,14 @@ CREATE TEMPORARY TABLE reload_output( line text ); SELECT 1 AS a \g :g_out_file -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command SELECT 2 AS b\; SELECT 3 AS c\; SELECT 4 AS d \g 
:g_out_file -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command COPY (SELECT 'foo') TO STDOUT \; COPY (SELECT 'bar') TO STDOUT \g :g_out_file -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; line --------- @@ -5600,13 +5603,15 @@ SELECT 1 AS a\; SELECT 2 AS b\; SELECT 3 AS c; -- COPY TO file -- The data goes to :g_out_file and the status to :o_out_file \set QUIET false -COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO :'g_out_file'; +\set command '\\COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO ' :'g_out_file'; +:command -- DML command status UPDATE onek SET unique1 = unique1 WHERE false; \set QUIET true \o -- Check the contents of the files generated. -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; line ------ @@ -5623,7 +5628,8 @@ SELECT line FROM reload_output ORDER BY lineno; (10 rows) TRUNCATE TABLE reload_output; -COPY reload_output(line) FROM :'o_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'o_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; line ---------- @@ -5660,7 +5666,8 @@ COPY (SELECT 'foo1') TO STDOUT \; COPY (SELECT 'bar1') TO STDOUT; COPY (SELECT 'foo2') TO STDOUT \; COPY (SELECT 'bar2') TO STDOUT \g :g_out_file \o -- Check the contents of the files generated. 
-COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; line ------ @@ -5669,7 +5676,8 @@ SELECT line FROM reload_output ORDER BY lineno; (2 rows) TRUNCATE TABLE reload_output; -COPY reload_output(line) FROM :'o_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'o_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; line ------ @@ -6633,10 +6641,10 @@ cross-database references are not implemented: "no.such.database"."no.such.schem \dX "no.such.database"."no.such.schema"."no.such.extended.statistics" cross-database references are not implemented: "no.such.database"."no.such.schema"."no.such.extended.statistics" -- check \drg and \du -CREATE ROLE regress_du_role0; -CREATE ROLE regress_du_role1; -CREATE ROLE regress_du_role2; -CREATE ROLE regress_du_admin; +CREATE ROLE regress_du_role0 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_du_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_du_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_du_admin PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT regress_du_role0 TO regress_du_admin WITH ADMIN TRUE; GRANT regress_du_role1 TO regress_du_admin WITH ADMIN TRUE; GRANT regress_du_role2 TO regress_du_admin WITH ADMIN TRUE; diff --git a/src/test/regress/expected/publication.out b/src/test/regress/expected/publication.out index 30b6371134..cc01076c22 100644 --- a/src/test/regress/expected/publication.out +++ b/src/test/regress/expected/publication.out @@ -1,9 +1,9 @@ -- -- PUBLICATION -- -CREATE ROLE regress_publication_user LOGIN SUPERUSER; -CREATE ROLE regress_publication_user2; -CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER; +CREATE ROLE regress_publication_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_publication_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER 
PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION 'regress_publication_user'; -- suppress warning that depends on wal_level SET client_min_messages = 'ERROR'; @@ -1221,7 +1221,7 @@ ALTER PUBLICATION testpub2 ADD TABLE testpub_tbl1; -- ok DROP PUBLICATION testpub2; DROP PUBLICATION testpub3; SET ROLE regress_publication_user; -CREATE ROLE regress_publication_user3; +CREATE ROLE regress_publication_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT regress_publication_user2 TO regress_publication_user3; SET client_min_messages = 'ERROR'; CREATE PUBLICATION testpub4 FOR TABLES IN SCHEMA pub_test; diff --git a/src/test/regress/expected/regproc.out b/src/test/regress/expected/regproc.out index 97b917502c..e9428535cb 100644 --- a/src/test/regress/expected/regproc.out +++ b/src/test/regress/expected/regproc.out @@ -2,7 +2,7 @@ -- regproc -- /* If objects exist, return oids */ -CREATE ROLE regress_regrole_test; +CREATE ROLE regress_regrole_test PASSWORD NEON_PASSWORD_PLACEHOLDER; -- without schemaname SELECT regoper('||/'); regoper diff --git a/src/test/regress/expected/roleattributes.out b/src/test/regress/expected/roleattributes.out index 5e6969b173..2c4d52237f 100644 --- a/src/test/regress/expected/roleattributes.out +++ b/src/test/regress/expected/roleattributes.out @@ -1,233 +1,233 @@ -- default for superuser is false -CREATE ROLE regress_test_def_superuser; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_def_superuser | f | t | f | f | f | f | f | -1 | | 
+CREATE ROLE regress_test_def_superuser PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_def_superuser | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -CREATE ROLE regress_test_superuser WITH SUPERUSER; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_superuser | t | t | f | f | f | f | f | -1 | | +CREATE ROLE regress_test_superuser WITH SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | 
rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_superuser | t | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_superuser WITH NOSUPERUSER; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_superuser | f | t | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_superuser | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_superuser WITH SUPERUSER; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, 
rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_superuser | t | t | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_superuser | t | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -- default for inherit is true -CREATE ROLE regress_test_def_inherit; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_def_inherit | f | t | f | f | f | f | f | -1 | | +CREATE ROLE regress_test_def_inherit PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, 
rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_def_inherit | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -CREATE ROLE regress_test_inherit WITH NOINHERIT; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_inherit | f | f | f | f | f | f | f | -1 | | +CREATE ROLE regress_test_inherit WITH NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil 
+----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_inherit | f | f | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_inherit WITH INHERIT; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_inherit | f | t | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_inherit | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_inherit WITH NOINHERIT; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; - rolname | rolsuper | rolinherit | 
rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_inherit | f | f | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_inherit | f | f | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -- default for create role is false -CREATE ROLE regress_test_def_createrole; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_def_createrole | f | t | f | f | f | f | f | -1 | | +CREATE ROLE regress_test_def_createrole PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, 
rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_def_createrole | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -CREATE ROLE regress_test_createrole WITH CREATEROLE; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil --------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_createrole | f | t | t | f | f | f | f | -1 | | +CREATE ROLE regress_test_createrole WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil 
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_createrole | f | t | t | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_createrole WITH NOCREATEROLE; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil --------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_createrole | f | t | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_createrole | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_createrole WITH CREATEROLE; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; - 
rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil --------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_createrole | f | t | t | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_createrole | f | t | t | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -- default for create database is false -CREATE ROLE regress_test_def_createdb; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_def_createdb | f | t | f | f | f | f | f | -1 | | +CREATE ROLE regress_test_def_createdb PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, 
rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_def_createdb | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -CREATE ROLE regress_test_createdb WITH CREATEDB; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_createdb | f | t | f | t | f | f | f | -1 | | +CREATE ROLE regress_test_createdb WITH CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil 
+-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_createdb | f | t | f | t | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_createdb WITH NOCREATEDB; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_createdb | f | t | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_createdb | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_createdb WITH CREATEDB; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; - rolname | rolsuper | 
rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_createdb | f | t | f | t | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_createdb | f | t | f | t | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -- default for can login is false for role -CREATE ROLE regress_test_def_role_canlogin; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ---------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_def_role_canlogin | f | t | f | f | f | f | f | -1 | | +CREATE ROLE regress_test_def_role_canlogin PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, 
rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +--------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_def_role_canlogin | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -CREATE ROLE regress_test_role_canlogin WITH LOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_role_canlogin | f | t | f | f | t | f | f | -1 | | +CREATE ROLE regress_test_role_canlogin WITH LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil 
+----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_role_canlogin | f | t | f | f | t | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_role_canlogin WITH NOLOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_role_canlogin | f | t | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_role_canlogin | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_role_canlogin WITH LOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 
'regress_test_role_canlogin'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_role_canlogin | f | t | f | f | t | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_role_canlogin | f | t | f | f | t | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -- default for can login is true for user -CREATE USER regress_test_def_user_canlogin; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ---------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_def_user_canlogin | f | t | f | f | t | f | f | -1 | | +CREATE USER regress_test_def_user_canlogin PASSWORD 
NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +--------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_def_user_canlogin | f | t | f | f | t | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -CREATE USER regress_test_user_canlogin WITH NOLOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_user_canlogin | f | t | f | f | f | f | f | -1 | | +CREATE USER regress_test_user_canlogin WITH NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | 
rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_user_canlogin | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER USER regress_test_user_canlogin WITH LOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_user_canlogin | f | t | f | f | t | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_user_canlogin | f | t | f | f | t | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER USER regress_test_user_canlogin WITH NOLOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, 
rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_user_canlogin | f | t | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_user_canlogin | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -- default for replication is false -CREATE ROLE regress_test_def_replication; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_def_replication | f | t | f | f | f | f | f | -1 | | +CREATE ROLE 
regress_test_def_replication PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_def_replication | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -CREATE ROLE regress_test_replication WITH REPLICATION; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_replication | f | t | f | f | f | t | f | -1 | | +CREATE ROLE regress_test_replication WITH REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | 
rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_replication | f | t | f | f | f | t | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_replication WITH NOREPLICATION; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_replication | f | t | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_replication | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_replication WITH REPLICATION; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, 
rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil ---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_replication | f | t | f | f | f | t | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_replication | f | t | f | f | f | t | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -- default for bypassrls is false -CREATE ROLE regress_test_def_bypassrls; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_def_bypassrls | f | t | f | f | f | f | f | -1 | | +CREATE ROLE 
regress_test_def_bypassrls PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_def_bypassrls | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) -CREATE ROLE regress_test_bypassrls WITH BYPASSRLS; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_bypassrls | f | t | f | f | f | f | t | -1 | | +CREATE ROLE regress_test_bypassrls WITH BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | 
rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_bypassrls | f | t | f | f | f | f | t | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_bypassrls WITH NOBYPASSRLS; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_bypassrls | f | t | f | f | f | f | f | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_bypassrls | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | (1 row) ALTER ROLE regress_test_bypassrls WITH BYPASSRLS; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM 
pg_authid WHERE rolname = 'regress_test_bypassrls'; - rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil -------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- - regress_test_bypassrls | f | t | f | f | f | f | t | -1 | | +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; + rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil +------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- + regress_test_bypassrls | f | t | f | f | f | f | t | -1 | SCRAM-SHA-256$4096:$: | (1 row) -- clean up roles diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out index 51bba175ec..45355a9c66 100644 --- a/src/test/regress/expected/rowsecurity.out +++ b/src/test/regress/expected/rowsecurity.out @@ -14,13 +14,13 @@ DROP ROLE IF EXISTS regress_rls_group2; DROP SCHEMA IF EXISTS regress_rls_schema CASCADE; RESET client_min_messages; -- initial setup -CREATE USER regress_rls_alice NOLOGIN; -CREATE USER regress_rls_bob NOLOGIN; -CREATE USER regress_rls_carol NOLOGIN; -CREATE USER regress_rls_dave NOLOGIN; -CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN; -CREATE ROLE regress_rls_group1 NOLOGIN; -CREATE ROLE regress_rls_group2 NOLOGIN; +CREATE USER regress_rls_alice NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE 
USER regress_rls_bob NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_rls_carol NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_rls_dave NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_rls_group1 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_rls_group2 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT regress_rls_group1 TO regress_rls_bob; GRANT regress_rls_group2 TO regress_rls_carol; CREATE SCHEMA regress_rls_schema; @@ -4423,8 +4423,8 @@ SELECT count(*) = 0 FROM pg_depend -- DROP OWNED BY testing RESET SESSION AUTHORIZATION; -CREATE ROLE regress_rls_dob_role1; -CREATE ROLE regress_rls_dob_role2; +CREATE ROLE regress_rls_dob_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_rls_dob_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE dob_t1 (c1 int); CREATE TABLE dob_t2 (c1 int) PARTITION BY RANGE (c1); CREATE POLICY p1 ON dob_t1 TO regress_rls_dob_role1 USING (true); diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 13178e2b3d..9a3ebfea3c 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -3799,7 +3799,7 @@ DROP TABLE ruletest2; -- Test non-SELECT rule on security invoker view. -- Should use view owner's permissions. 
-- -CREATE USER regress_rule_user1; +CREATE USER regress_rule_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE ruletest_t1 (x int); CREATE TABLE ruletest_t2 (x int); CREATE VIEW ruletest_v1 WITH (security_invoker=true) AS diff --git a/src/test/regress/expected/security_label.out b/src/test/regress/expected/security_label.out index a8e01a6220..83543b250a 100644 --- a/src/test/regress/expected/security_label.out +++ b/src/test/regress/expected/security_label.out @@ -6,8 +6,8 @@ SET client_min_messages TO 'warning'; DROP ROLE IF EXISTS regress_seclabel_user1; DROP ROLE IF EXISTS regress_seclabel_user2; RESET client_min_messages; -CREATE USER regress_seclabel_user1 WITH CREATEROLE; -CREATE USER regress_seclabel_user2; +CREATE USER regress_seclabel_user1 WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_seclabel_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE seclabel_tbl1 (a int, b text); CREATE TABLE seclabel_tbl2 (x int, y text); CREATE VIEW seclabel_view1 AS SELECT * FROM seclabel_tbl2; diff --git a/src/test/regress/expected/select_into.out b/src/test/regress/expected/select_into.out index b79fe9a1c0..e29fab88ab 100644 --- a/src/test/regress/expected/select_into.out +++ b/src/test/regress/expected/select_into.out @@ -15,7 +15,7 @@ DROP TABLE sitmp1; -- SELECT INTO and INSERT permission, if owner is not allowed to insert. 
-- CREATE SCHEMA selinto_schema; -CREATE USER regress_selinto_user; +CREATE USER regress_selinto_user PASSWORD NEON_PASSWORD_PLACEHOLDER; ALTER DEFAULT PRIVILEGES FOR ROLE regress_selinto_user REVOKE INSERT ON TABLES FROM regress_selinto_user; GRANT ALL ON SCHEMA selinto_schema TO public; diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out index 496ddb1289..a4fea8e367 100644 --- a/src/test/regress/expected/select_parallel.out +++ b/src/test/regress/expected/select_parallel.out @@ -1295,7 +1295,7 @@ SELECT 1 FROM tenk1_vw_sec rollback; -- test that function option SET ROLE works in parallel workers. -create role regress_parallel_worker; +create role regress_parallel_worker PASSWORD NEON_PASSWORD_PLACEHOLDER; create function set_and_report_role() returns text as $$ select current_setting('role') $$ language sql parallel safe set role = regress_parallel_worker; diff --git a/src/test/regress/expected/select_views.out b/src/test/regress/expected/select_views.out index 1aeed8452b..7d9427d070 100644 --- a/src/test/regress/expected/select_views.out +++ b/src/test/regress/expected/select_views.out @@ -1250,7 +1250,7 @@ SELECT * FROM toyemp WHERE name = 'sharon'; -- -- Test for Leaky view scenario -- -CREATE ROLE regress_alice; +CREATE ROLE regress_alice PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE FUNCTION f_leak (text) RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001 AS 'BEGIN RAISE NOTICE ''f_leak => %'', $1; RETURN true; END'; diff --git a/src/test/regress/expected/sequence.out b/src/test/regress/expected/sequence.out index fa8059dbcd..190d41afc7 100644 --- a/src/test/regress/expected/sequence.out +++ b/src/test/regress/expected/sequence.out @@ -22,7 +22,7 @@ CREATE SEQUENCE sequence_testx OWNED BY pg_class_oid_index.oid; -- not a table ERROR: sequence cannot be owned by relation "pg_class_oid_index" DETAIL: This operation is not supported for indexes. 
CREATE SEQUENCE sequence_testx OWNED BY pg_class.relname; -- not same schema -ERROR: sequence must be in same schema as table it is linked to +ERROR: sequence must have same owner as table it is linked to CREATE TABLE sequence_test_table (a int); CREATE SEQUENCE sequence_testx OWNED BY sequence_test_table.b; -- wrong column ERROR: column "b" of relation "sequence_test_table" does not exist @@ -640,7 +640,7 @@ SELECT setval('sequence_test2', 1); -- error ERROR: cannot execute setval() in a read-only transaction ROLLBACK; -- privileges tests -CREATE USER regress_seq_user; +CREATE USER regress_seq_user PASSWORD NEON_PASSWORD_PLACEHOLDER; -- nextval BEGIN; SET LOCAL SESSION AUTHORIZATION regress_seq_user; diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out index 6e08898b18..7eb5385b7a 100644 --- a/src/test/regress/expected/stats.out +++ b/src/test/regress/expected/stats.out @@ -1301,37 +1301,6 @@ SELECT current_setting('fsync') = 'off' t (1 row) --- Change the tablespace so that the table is rewritten directly, then SELECT --- from it to cause it to be read back into shared buffers. -SELECT sum(reads) AS io_sum_shared_before_reads - FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset --- Do this in a transaction to prevent spurious failures due to concurrent accesses to our newly --- rewritten table, e.g. by autovacuum. -BEGIN; -ALTER TABLE test_io_shared SET TABLESPACE regress_tblspace; --- SELECT from the table so that the data is read into shared buffers and --- context 'normal', object 'relation' reads are counted. -SELECT COUNT(*) FROM test_io_shared; - count -------- - 100 -(1 row) - -COMMIT; -SELECT pg_stat_force_next_flush(); - pg_stat_force_next_flush --------------------------- - -(1 row) - -SELECT sum(reads) AS io_sum_shared_after_reads - FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset -SELECT :io_sum_shared_after_reads > :io_sum_shared_before_reads; - ?column? 
----------- - t -(1 row) - SELECT sum(hits) AS io_sum_shared_before_hits FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset -- Select from the table again to count hits. @@ -1433,6 +1402,7 @@ SELECT :io_sum_local_after_evictions > :io_sum_local_before_evictions, -- local buffers, exercising a different codepath than standard local buffer -- writes. ALTER TABLE test_io_local SET TABLESPACE regress_tblspace; +ERROR: tablespace "regress_tblspace" does not exist SELECT pg_stat_force_next_flush(); pg_stat_force_next_flush -------------------------- @@ -1444,7 +1414,7 @@ SELECT sum(writes) AS io_sum_local_new_tblspc_writes SELECT :io_sum_local_new_tblspc_writes > :io_sum_local_after_writes; ?column? ---------- - t + f (1 row) RESET temp_buffers; diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index 8c4da95508..346961f92a 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -70,7 +70,7 @@ DROP TABLE ext_stats_test; CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER); CREATE STATISTICS IF NOT EXISTS ab1_a_b_stats ON a, b FROM ab1; COMMENT ON STATISTICS ab1_a_b_stats IS 'new comment'; -CREATE ROLE regress_stats_ext; +CREATE ROLE regress_stats_ext PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_stats_ext; COMMENT ON STATISTICS ab1_a_b_stats IS 'changed comment'; ERROR: must be owner of statistics object ab1_a_b_stats @@ -3214,7 +3214,7 @@ set search_path to public, stts_s1; stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined (10 rows) -create role regress_stats_ext nosuperuser; +create role regress_stats_ext nosuperuser PASSWORD NEON_PASSWORD_PLACEHOLDER; set role regress_stats_ext; \dX List of extended statistics @@ -3237,7 +3237,7 @@ drop schema stts_s1, stts_s2 cascade; drop user regress_stats_ext; reset search_path; -- User with no access -CREATE USER regress_stats_user1; +CREATE USER regress_stats_user1 PASSWORD 
NEON_PASSWORD_PLACEHOLDER; GRANT USAGE ON SCHEMA tststats TO regress_stats_user1; SET SESSION AUTHORIZATION regress_stats_user1; SELECT * FROM tststats.priv_test_tbl; -- Permission denied diff --git a/src/test/regress/expected/subscription.out b/src/test/regress/expected/subscription.out index 0f2a25cdc1..de168e39d9 100644 --- a/src/test/regress/expected/subscription.out +++ b/src/test/regress/expected/subscription.out @@ -1,10 +1,10 @@ -- -- SUBSCRIPTION -- -CREATE ROLE regress_subscription_user LOGIN SUPERUSER; -CREATE ROLE regress_subscription_user2; -CREATE ROLE regress_subscription_user3 IN ROLE pg_create_subscription; -CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER; +CREATE ROLE regress_subscription_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_subscription_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_subscription_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_create_subscription; +CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION 'regress_subscription_user'; -- fail - no publications CREATE SUBSCRIPTION regress_testsub CONNECTION 'foo'; diff --git a/src/test/regress/expected/test_setup.out b/src/test/regress/expected/test_setup.out index 3d0eeec996..2c3932139d 100644 --- a/src/test/regress/expected/test_setup.out +++ b/src/test/regress/expected/test_setup.out @@ -21,6 +21,7 @@ GRANT ALL ON SCHEMA public TO public; -- Create a tablespace we can use in tests. SET allow_in_place_tablespaces = true; CREATE TABLESPACE regress_tblspace LOCATION ''; +ERROR: CREATE TABLESPACE is not supported on Neon -- -- These tables have traditionally been referenced by many tests, -- so create and populate them. Insert only non-error values here. 
@@ -111,7 +112,8 @@ CREATE TABLE onek ( string4 name ); \set filename :abs_srcdir '/data/onek.data' -COPY onek FROM :'filename'; +\set command '\\copy onek FROM ' :'filename'; +:command VACUUM ANALYZE onek; CREATE TABLE onek2 AS SELECT * FROM onek; VACUUM ANALYZE onek2; @@ -134,7 +136,8 @@ CREATE TABLE tenk1 ( string4 name ); \set filename :abs_srcdir '/data/tenk.data' -COPY tenk1 FROM :'filename'; +\set command '\\copy tenk1 FROM ' :'filename'; +:command VACUUM ANALYZE tenk1; CREATE TABLE tenk2 AS SELECT * FROM tenk1; VACUUM ANALYZE tenk2; @@ -144,20 +147,23 @@ CREATE TABLE person ( location point ); \set filename :abs_srcdir '/data/person.data' -COPY person FROM :'filename'; +\set command '\\copy person FROM ' :'filename'; +:command VACUUM ANALYZE person; CREATE TABLE emp ( salary int4, manager name ) INHERITS (person); \set filename :abs_srcdir '/data/emp.data' -COPY emp FROM :'filename'; +\set command '\\copy emp FROM ' :'filename'; +:command VACUUM ANALYZE emp; CREATE TABLE student ( gpa float8 ) INHERITS (person); \set filename :abs_srcdir '/data/student.data' -COPY student FROM :'filename'; +\set command '\\copy student FROM ' :'filename'; +:command VACUUM ANALYZE student; CREATE TABLE stud_emp ( percent int4 @@ -166,14 +172,16 @@ NOTICE: merging multiple inherited definitions of column "name" NOTICE: merging multiple inherited definitions of column "age" NOTICE: merging multiple inherited definitions of column "location" \set filename :abs_srcdir '/data/stud_emp.data' -COPY stud_emp FROM :'filename'; +\set command '\\copy stud_emp FROM ' :'filename'; +:command VACUUM ANALYZE stud_emp; CREATE TABLE road ( name text, thepath path ); \set filename :abs_srcdir '/data/streets.data' -COPY road FROM :'filename'; +\set command '\\copy road FROM ' :'filename'; +:command VACUUM ANALYZE road; CREATE TABLE ihighway () INHERITS (road); INSERT INTO ihighway diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index 
9fad6c8b04..a1b8e82389 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -63,7 +63,8 @@ CREATE TABLE test_tsvector( a tsvector ); \set filename :abs_srcdir '/data/tsearch.data' -COPY test_tsvector FROM :'filename'; +\set command '\\copy test_tsvector FROM ' :'filename'; +:command ANALYZE test_tsvector; -- test basic text search behavior without indexes, then with SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh'; diff --git a/src/test/regress/expected/updatable_views.out b/src/test/regress/expected/updatable_views.out index 442b55120c..7224709d6f 100644 --- a/src/test/regress/expected/updatable_views.out +++ b/src/test/regress/expected/updatable_views.out @@ -1338,9 +1338,9 @@ NOTICE: drop cascades to 2 other objects DETAIL: drop cascades to view rw_view1 drop cascades to function rw_view1_aa(rw_view1) -- permissions checks -CREATE USER regress_view_user1; -CREATE USER regress_view_user2; -CREATE USER regress_view_user3; +CREATE USER regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_view_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_view_user1; CREATE TABLE base_tbl(a int, b text, c float); INSERT INTO base_tbl VALUES (1, 'Row 1', 1.0); @@ -3734,8 +3734,8 @@ DETAIL: View columns that are not columns of their base relation are not updata drop view uv_iocu_view; drop table uv_iocu_tab; -- ON CONFLICT DO UPDATE permissions checks -create user regress_view_user1; -create user regress_view_user2; +create user regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +create user regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; set session authorization regress_view_user1; create table base_tbl(a int unique, b text, c float); insert into base_tbl values (1,'xxx',1.0); diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out index 1b27d132d7..25b109d609 100644 --- 
a/src/test/regress/expected/update.out +++ b/src/test/regress/expected/update.out @@ -608,7 +608,7 @@ DROP FUNCTION func_parted_mod_b(); -- RLS policies with update-row-movement ----------------------------------------- ALTER TABLE range_parted ENABLE ROW LEVEL SECURITY; -CREATE USER regress_range_parted_user; +CREATE USER regress_range_parted_user PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT ALL ON range_parted, mintab TO regress_range_parted_user; CREATE POLICY seeall ON range_parted AS PERMISSIVE FOR SELECT USING (true); CREATE POLICY policy_range_parted ON range_parted for UPDATE USING (true) WITH CHECK (c % 2 = 0); diff --git a/src/test/regress/expected/vacuum.out b/src/test/regress/expected/vacuum.out index 2eba712887..d46877aca9 100644 --- a/src/test/regress/expected/vacuum.out +++ b/src/test/regress/expected/vacuum.out @@ -433,7 +433,7 @@ CREATE TABLE vacowned (a int); CREATE TABLE vacowned_parted (a int) PARTITION BY LIST (a); CREATE TABLE vacowned_part1 PARTITION OF vacowned_parted FOR VALUES IN (1); CREATE TABLE vacowned_part2 PARTITION OF vacowned_parted FOR VALUES IN (2); -CREATE ROLE regress_vacuum; +CREATE ROLE regress_vacuum PASSWORD NEON_PASSWORD_PLACEHOLDER; SET ROLE regress_vacuum; -- Simple table VACUUM vacowned; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index f53a526f7c..c07b093476 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -135,4 +135,4 @@ test: fast_default # run tablespace test at the end because it drops the tablespace created during # setup that other tests may use. 
-test: tablespace +#test: tablespace diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 1a18ca3d8f..b2009628d0 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -15,7 +15,8 @@ CREATE TABLE aggtest ( ); \set filename :abs_srcdir '/data/agg.data' -COPY aggtest FROM :'filename'; +\set command '\\copy aggtest FROM ' :'filename'; +:command ANALYZE aggtest; diff --git a/src/test/regress/sql/alter_generic.sql b/src/test/regress/sql/alter_generic.sql index de58d268d3..9d38df7f42 100644 --- a/src/test/regress/sql/alter_generic.sql +++ b/src/test/regress/sql/alter_generic.sql @@ -22,9 +22,9 @@ DROP ROLE IF EXISTS regress_alter_generic_user3; RESET client_min_messages; -CREATE USER regress_alter_generic_user3; -CREATE USER regress_alter_generic_user2; -CREATE USER regress_alter_generic_user1 IN ROLE regress_alter_generic_user3; +CREATE USER regress_alter_generic_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_alter_generic_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_alter_generic_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE regress_alter_generic_user3; CREATE SCHEMA alt_nsp1; CREATE SCHEMA alt_nsp2; @@ -316,7 +316,7 @@ DROP OPERATOR FAMILY alt_opf4 USING btree; -- Should fail. Need to be SUPERUSER to do ALTER OPERATOR FAMILY .. ADD / DROP BEGIN TRANSACTION; -CREATE ROLE regress_alter_generic_user5 NOSUPERUSER; +CREATE ROLE regress_alter_generic_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER NOSUPERUSER; CREATE OPERATOR FAMILY alt_opf5 USING btree; SET ROLE regress_alter_generic_user5; ALTER OPERATOR FAMILY alt_opf5 USING btree ADD OPERATOR 1 < (int4, int2), FUNCTION 1 btint42cmp(int4, int2); @@ -326,7 +326,7 @@ ROLLBACK; -- Should fail. Need rights to namespace for ALTER OPERATOR FAMILY .. 
ADD / DROP BEGIN TRANSACTION; -CREATE ROLE regress_alter_generic_user6; +CREATE ROLE regress_alter_generic_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE SCHEMA alt_nsp6; REVOKE ALL ON SCHEMA alt_nsp6 FROM regress_alter_generic_user6; CREATE OPERATOR FAMILY alt_nsp6.alt_opf6 USING btree; diff --git a/src/test/regress/sql/alter_operator.sql b/src/test/regress/sql/alter_operator.sql index 8faecf7830..bb8b8e14ea 100644 --- a/src/test/regress/sql/alter_operator.sql +++ b/src/test/regress/sql/alter_operator.sql @@ -83,7 +83,7 @@ ALTER OPERATOR & (bit, bit) SET ("Restrict" = _int_contsel, "Join" = _int_contjo -- -- Test permission check. Must be owner to ALTER OPERATOR. -- -CREATE USER regress_alter_op_user; +CREATE USER regress_alter_op_user PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_alter_op_user; ALTER OPERATOR === (boolean, boolean) SET (RESTRICT = NONE); diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index da12724473..86f5ae5444 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -7,7 +7,7 @@ SET client_min_messages TO 'warning'; DROP ROLE IF EXISTS regress_alter_table_user1; RESET client_min_messages; -CREATE USER regress_alter_table_user1; +CREATE USER regress_alter_table_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- -- add attribute @@ -2404,8 +2404,8 @@ DROP TABLE fail_part; ALTER TABLE list_parted ATTACH PARTITION nonexistent FOR VALUES IN (1); -- check ownership of the source table -CREATE ROLE regress_test_me; -CREATE ROLE regress_test_not_me; +CREATE ROLE regress_test_me PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_not_me PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE not_owned_by_me (LIKE list_parted); ALTER TABLE not_owned_by_me OWNER TO regress_test_not_me; SET SESSION AUTHORIZATION regress_test_me; diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql index 47058dfde5..f8962592e4 100644 --- 
a/src/test/regress/sql/arrays.sql +++ b/src/test/regress/sql/arrays.sql @@ -22,7 +22,8 @@ CREATE TABLE array_op_test ( ); \set filename :abs_srcdir '/data/array.data' -COPY array_op_test FROM :'filename'; +\set command '\\copy array_op_test FROM ' :'filename'; +:command ANALYZE array_op_test; -- diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql index 0d2a33f370..df86e6b050 100644 --- a/src/test/regress/sql/btree_index.sql +++ b/src/test/regress/sql/btree_index.sql @@ -26,16 +26,20 @@ CREATE TABLE bt_f8_heap ( ); \set filename :abs_srcdir '/data/desc.data' -COPY bt_i4_heap FROM :'filename'; +\set command '\\copy bt_i4_heap FROM ' :'filename'; +:command \set filename :abs_srcdir '/data/hash.data' -COPY bt_name_heap FROM :'filename'; +\set command '\\copy bt_name_heap FROM ' :'filename'; +:command \set filename :abs_srcdir '/data/desc.data' -COPY bt_txt_heap FROM :'filename'; +\set command '\\copy bt_txt_heap FROM ' :'filename'; +:command \set filename :abs_srcdir '/data/hash.data' -COPY bt_f8_heap FROM :'filename'; +\set command '\\copy bt_f8_heap FROM ' :'filename'; +:command ANALYZE bt_i4_heap; ANALYZE bt_name_heap; diff --git a/src/test/regress/sql/cluster.sql b/src/test/regress/sql/cluster.sql index b7115f8610..a753f2c794 100644 --- a/src/test/regress/sql/cluster.sql +++ b/src/test/regress/sql/cluster.sql @@ -108,7 +108,7 @@ WHERE pg_class.oid=indexrelid CLUSTER pg_toast.pg_toast_826 USING pg_toast_826_index; -- Verify that clustering all tables does in fact cluster the right ones -CREATE USER regress_clstr_user; +CREATE USER regress_clstr_user PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE clstr_1 (a INT PRIMARY KEY); CREATE TABLE clstr_2 (a INT PRIMARY KEY); CREATE TABLE clstr_3 (a INT PRIMARY KEY); @@ -235,7 +235,7 @@ DROP TABLE clstrpart; CREATE TABLE ptnowner(i int unique) PARTITION BY LIST (i); CREATE INDEX ptnowner_i_idx ON ptnowner(i); CREATE TABLE ptnowner1 PARTITION OF ptnowner FOR VALUES IN (1); -CREATE ROLE 
regress_ptnowner; +CREATE ROLE regress_ptnowner PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE ptnowner2 PARTITION OF ptnowner FOR VALUES IN (2); ALTER TABLE ptnowner1 OWNER TO regress_ptnowner; SET SESSION AUTHORIZATION regress_ptnowner; diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index 4eb1adf028..28636ec711 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -353,7 +353,7 @@ reset enable_seqscan; -- schema manipulation commands -CREATE ROLE regress_test_role; +CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE SCHEMA test_schema; -- We need to do this this way to cope with varying names for encodings: diff --git a/src/test/regress/sql/constraints.sql b/src/test/regress/sql/constraints.sql index e3e3bea709..fa86ddc326 100644 --- a/src/test/regress/sql/constraints.sql +++ b/src/test/regress/sql/constraints.sql @@ -243,12 +243,14 @@ CREATE TABLE COPY_TBL (x INT, y TEXT, z INT, CHECK (x > 3 AND y <> 'check failed' AND x < 7 )); \set filename :abs_srcdir '/data/constro.data' -COPY COPY_TBL FROM :'filename'; +\set command '\\copy COPY_TBL FROM ' :'filename'; +:command SELECT * FROM COPY_TBL; \set filename :abs_srcdir '/data/constrf.data' -COPY COPY_TBL FROM :'filename'; +\set command '\\copy COPY_TBL FROM ' :'filename'; +:command SELECT * FROM COPY_TBL; @@ -599,7 +601,7 @@ DROP TABLE deferred_excl; -- Comments -- Setup a low-level role to enforce non-superuser checks. 
-CREATE ROLE regress_constraint_comments; +CREATE ROLE regress_constraint_comments PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_constraint_comments; CREATE TABLE constraint_comments_tbl (a int CONSTRAINT the_constraint CHECK (a > 0)); @@ -621,7 +623,7 @@ COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS NULL; -- unauthorized user RESET SESSION AUTHORIZATION; -CREATE ROLE regress_constraint_comments_noaccess; +CREATE ROLE regress_constraint_comments_noaccess PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_constraint_comments_noaccess; COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment'; COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS 'no, another comment'; diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql index b567a1a572..4d1ac2e631 100644 --- a/src/test/regress/sql/conversion.sql +++ b/src/test/regress/sql/conversion.sql @@ -17,7 +17,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r AS :'regresslib', 'test_enc_conversion' LANGUAGE C STRICT; -CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE; +CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_conversion_user; CREATE CONVERSION myconv FOR 'LATIN1' TO 'UTF8' FROM iso8859_1_to_utf8; -- diff --git a/src/test/regress/sql/copy.sql b/src/test/regress/sql/copy.sql index e2dd24cb35..4a186750f8 100644 --- a/src/test/regress/sql/copy.sql +++ b/src/test/regress/sql/copy.sql @@ -20,11 +20,13 @@ insert into copytest values('Mac',E'abc\rdef',3); insert into copytest values(E'esc\\ape',E'a\\r\\\r\\\n\\nb',4); \set filename :abs_builddir '/results/copytest.csv' -copy copytest to :'filename' csv; +\set command '\\copy copytest to ' :'filename' csv; +:command create temp table copytest2 (like copytest); -copy copytest2 from :'filename' 
csv; +\set command '\\copy copytest2 from ' :'filename' csv; +:command select * from copytest except select * from copytest2; @@ -32,9 +34,11 @@ truncate copytest2; --- same test but with an escape char different from quote char -copy copytest to :'filename' csv quote '''' escape E'\\'; +\set command '\\copy copytest to ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\''; +:command -copy copytest2 from :'filename' csv quote '''' escape E'\\'; +\set command '\\copy copytest2 from ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\''; +:command select * from copytest except select * from copytest2; @@ -86,16 +90,19 @@ insert into parted_copytest select x,2,'Two' from generate_series(1001,1010) x; insert into parted_copytest select x,1,'One' from generate_series(1011,1020) x; \set filename :abs_builddir '/results/parted_copytest.csv' -copy (select * from parted_copytest order by a) to :'filename'; +\set command '\\copy (select * from parted_copytest order by a) to ' :'filename'; +:command truncate parted_copytest; -copy parted_copytest from :'filename'; +\set command '\\copy parted_copytest from ' :'filename'; +:command -- Ensure COPY FREEZE errors for partitioned tables. 
begin; truncate parted_copytest; -copy parted_copytest from :'filename' (freeze); +\set command '\\copy parted_copytest from ' :'filename' (freeze); +:command rollback; select tableoid::regclass,count(*),sum(a) from parted_copytest @@ -115,7 +122,8 @@ create trigger part_ins_trig for each row execute procedure part_ins_func(); -copy parted_copytest from :'filename'; +\set command '\\copy parted_copytest from ' :'filename'; +:command select tableoid::regclass,count(*),sum(a) from parted_copytest group by tableoid order by tableoid::regclass::name; @@ -124,7 +132,8 @@ truncate table parted_copytest; create index on parted_copytest (b); drop trigger part_ins_trig on parted_copytest_a2; -copy parted_copytest from stdin; +\set command '\\copy parted_copytest from ' stdin; +:command 1 1 str1 2 2 str2 \. @@ -191,8 +200,8 @@ bill 20 (11,10) 1000 sharon -- Generate COPY FROM report with FILE, with some excluded tuples. truncate tab_progress_reporting; \set filename :abs_srcdir '/data/emp.data' -copy tab_progress_reporting from :'filename' - where (salary < 2000); +\set command '\\copy tab_progress_reporting from ' :'filename' 'where (salary < 2000)'; +:command drop trigger check_after_tab_progress_reporting on tab_progress_reporting; drop function notice_after_tab_progress_reporting(); @@ -311,7 +320,8 @@ CREATE TABLE parted_si_p_odd PARTITION OF parted_si FOR VALUES IN (1); -- https://postgr.es/m/18130-7a86a7356a75209d%40postgresql.org -- https://postgr.es/m/257696.1695670946%40sss.pgh.pa.us \set filename :abs_srcdir '/data/desc.data' -COPY parted_si(id, data) FROM :'filename'; +\set command '\\COPY parted_si(id, data) FROM ' :'filename'; +:command -- An earlier bug (see commit b1ecb9b3fcf) could end up using a buffer from -- the wrong partition. 
This test is *not* guaranteed to trigger that bug, but diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql index 6b75b6c7ea..f3655b413c 100644 --- a/src/test/regress/sql/copy2.sql +++ b/src/test/regress/sql/copy2.sql @@ -407,8 +407,8 @@ copy check_con_tbl from stdin; select * from check_con_tbl; -- test with RLS enabled. -CREATE ROLE regress_rls_copy_user; -CREATE ROLE regress_rls_copy_user_colperms; +CREATE ROLE regress_rls_copy_user PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_rls_copy_user_colperms PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE rls_t1 (a int, b int, c int); COPY rls_t1 (a, b, c) from stdin; diff --git a/src/test/regress/sql/create_function_sql.sql b/src/test/regress/sql/create_function_sql.sql index 89e9af3a49..2b86fe2285 100644 --- a/src/test/regress/sql/create_function_sql.sql +++ b/src/test/regress/sql/create_function_sql.sql @@ -6,7 +6,7 @@ -- All objects made in this test are in temp_func_test schema -CREATE USER regress_unpriv_user; +CREATE USER regress_unpriv_user PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE SCHEMA temp_func_test; GRANT ALL ON SCHEMA temp_func_test TO public; diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index e296891cab..70cea565e4 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -71,7 +71,8 @@ CREATE TABLE fast_emp4000 ( ); \set filename :abs_srcdir '/data/rect.data' -COPY slow_emp4000 FROM :'filename'; +\set command '\\copy slow_emp4000 FROM ' :'filename'; +:command INSERT INTO fast_emp4000 SELECT * FROM slow_emp4000; @@ -269,7 +270,8 @@ CREATE TABLE array_index_op_test ( ); \set filename :abs_srcdir '/data/array.data' -COPY array_index_op_test FROM :'filename'; +\set command '\\copy array_index_op_test FROM ' :'filename'; +:command ANALYZE array_index_op_test; SELECT * FROM array_index_op_test WHERE i = '{NULL}' ORDER BY seqno; @@ -1298,7 +1300,7 @@ END; REINDEX SCHEMA CONCURRENTLY 
schema_to_reindex; -- Failure for unauthorized user -CREATE ROLE regress_reindexuser NOLOGIN; +CREATE ROLE regress_reindexuser NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION ROLE regress_reindexuser; REINDEX SCHEMA schema_to_reindex; -- Permission failures with toast tables and indexes (pg_authid here) diff --git a/src/test/regress/sql/create_procedure.sql b/src/test/regress/sql/create_procedure.sql index 069a3727ce..faeeb3f744 100644 --- a/src/test/regress/sql/create_procedure.sql +++ b/src/test/regress/sql/create_procedure.sql @@ -255,7 +255,7 @@ DROP PROCEDURE nonexistent(); -- privileges -CREATE USER regress_cp_user1; +CREATE USER regress_cp_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT INSERT ON cp_test TO regress_cp_user1; REVOKE EXECUTE ON PROCEDURE ptest1(text) FROM PUBLIC; SET ROLE regress_cp_user1; diff --git a/src/test/regress/sql/create_role.sql b/src/test/regress/sql/create_role.sql index 4491a28a8a..3045434865 100644 --- a/src/test/regress/sql/create_role.sql +++ b/src/test/regress/sql/create_role.sql @@ -1,20 +1,20 @@ -- ok, superuser can create users with any set of privileges -CREATE ROLE regress_role_super SUPERUSER; -CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS; +CREATE ROLE regress_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT CREATE ON DATABASE regression TO regress_role_admin WITH GRANT OPTION; -CREATE ROLE regress_role_limited_admin CREATEROLE; -CREATE ROLE regress_role_normal; +CREATE ROLE regress_role_limited_admin CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_role_normal PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, CREATEROLE user can't give away role attributes without having them SET SESSION AUTHORIZATION regress_role_limited_admin; -CREATE ROLE regress_nosuch_superuser SUPERUSER; -CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS; 
-CREATE ROLE regress_nosuch_replication REPLICATION; -CREATE ROLE regress_nosuch_bypassrls BYPASSRLS; -CREATE ROLE regress_nosuch_createdb CREATEDB; +CREATE ROLE regress_nosuch_superuser SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_nosuch_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_nosuch_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_nosuch_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, can create a role without any special attributes -CREATE ROLE regress_role_limited; +CREATE ROLE regress_role_limited PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, can't give it in any of the restricted attributes ALTER ROLE regress_role_limited SUPERUSER; @@ -25,10 +25,10 @@ DROP ROLE regress_role_limited; -- ok, can give away these role attributes if you have them SET SESSION AUTHORIZATION regress_role_admin; -CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS; -CREATE ROLE regress_replication REPLICATION; -CREATE ROLE regress_bypassrls BYPASSRLS; -CREATE ROLE regress_createdb CREATEDB; +CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, can toggle these role attributes off and on if you have them ALTER ROLE regress_replication NOREPLICATION; @@ -43,52 +43,52 @@ ALTER ROLE regress_createdb SUPERUSER; ALTER ROLE regress_createdb NOSUPERUSER; -- ok, having CREATEROLE is enough to create users with these privileges -CREATE ROLE regress_createrole CREATEROLE NOINHERIT; +CREATE ROLE regress_createrole CREATEROLE NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT CREATE ON DATABASE 
regression TO regress_createrole WITH GRANT OPTION; -CREATE ROLE regress_login LOGIN; -CREATE ROLE regress_inherit INHERIT; -CREATE ROLE regress_connection_limit CONNECTION LIMIT 5; -CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD 'foo'; -CREATE ROLE regress_password_null PASSWORD NULL; +CREATE ROLE regress_login LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_inherit INHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_connection_limit CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, backwards compatible noise words should be ignored -CREATE ROLE regress_noiseword SYSID 12345; +CREATE ROLE regress_noiseword SYSID 12345 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, cannot grant membership in superuser role -CREATE ROLE regress_nosuch_super IN ROLE regress_role_super; +CREATE ROLE regress_nosuch_super IN ROLE regress_role_super PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, database owner cannot have members -CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner; +CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, can grant other users into a role CREATE ROLE regress_inroles ROLE regress_role_super, regress_createdb, regress_createrole, regress_login, - regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null; + regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, cannot grant a role into itself -CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive; +CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, can grant other users into a role with admin option CREATE ROLE regress_adminroles ADMIN regress_role_super, 
regress_createdb, regress_createrole, regress_login, - regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null; + regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, cannot grant a role into itself with admin option -CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive; +CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER; -- fail, regress_createrole does not have CREATEDB privilege SET SESSION AUTHORIZATION regress_createrole; CREATE DATABASE regress_nosuch_db; -- ok, regress_createrole can create new roles -CREATE ROLE regress_plainrole; +CREATE ROLE regress_plainrole PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, roles with CREATEROLE can create new roles with it -CREATE ROLE regress_rolecreator CREATEROLE; +CREATE ROLE regress_rolecreator CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, roles with CREATEROLE can create new roles with different role -- attributes, including CREATEROLE -CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5; +CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- ok, we should be able to modify a role we created COMMENT ON ROLE regress_hasprivs IS 'some comment'; @@ -123,7 +123,7 @@ REASSIGN OWNED BY regress_tenant TO regress_createrole; -- ok, create a role with a value for createrole_self_grant SET createrole_self_grant = 'set, inherit'; -CREATE ROLE regress_tenant2; +CREATE ROLE regress_tenant2 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT CREATE ON DATABASE regression TO regress_tenant2; -- ok, regress_tenant2 can create objects within the database @@ -150,16 +150,16 @@ ALTER TABLE tenant2_table OWNER TO regress_tenant2; DROP TABLE tenant2_table; -- fail, CREATEROLE is not enough to create roles in privileged roles -CREATE ROLE 
regress_read_all_data IN ROLE pg_read_all_data; -CREATE ROLE regress_write_all_data IN ROLE pg_write_all_data; -CREATE ROLE regress_monitor IN ROLE pg_monitor; -CREATE ROLE regress_read_all_settings IN ROLE pg_read_all_settings; -CREATE ROLE regress_read_all_stats IN ROLE pg_read_all_stats; -CREATE ROLE regress_stat_scan_tables IN ROLE pg_stat_scan_tables; -CREATE ROLE regress_read_server_files IN ROLE pg_read_server_files; -CREATE ROLE regress_write_server_files IN ROLE pg_write_server_files; -CREATE ROLE regress_execute_server_program IN ROLE pg_execute_server_program; -CREATE ROLE regress_signal_backend IN ROLE pg_signal_backend; +CREATE ROLE regress_read_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_data; +CREATE ROLE regress_write_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_all_data; +CREATE ROLE regress_monitor PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_monitor; +CREATE ROLE regress_read_all_settings PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_settings; +CREATE ROLE regress_read_all_stats PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_stats; +CREATE ROLE regress_stat_scan_tables PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_stat_scan_tables; +CREATE ROLE regress_read_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_server_files; +CREATE ROLE regress_write_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_server_files; +CREATE ROLE regress_execute_server_program PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_execute_server_program; +CREATE ROLE regress_signal_backend PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_signal_backend; -- fail, role still owns database objects DROP ROLE regress_tenant; diff --git a/src/test/regress/sql/create_schema.sql b/src/test/regress/sql/create_schema.sql index 1b7064247a..be5b662ce1 100644 --- a/src/test/regress/sql/create_schema.sql +++ b/src/test/regress/sql/create_schema.sql @@ -4,7 +4,7 @@ -- Schema creation with elements. 
-CREATE ROLE regress_create_schema_role SUPERUSER; +CREATE ROLE regress_create_schema_role SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Cases where schema creation fails as objects are qualified with a schema -- that does not match with what's expected. diff --git a/src/test/regress/sql/create_view.sql b/src/test/regress/sql/create_view.sql index ae6841308b..47bc792e30 100644 --- a/src/test/regress/sql/create_view.sql +++ b/src/test/regress/sql/create_view.sql @@ -23,7 +23,8 @@ CREATE TABLE real_city ( ); \set filename :abs_srcdir '/data/real_city.data' -COPY real_city FROM :'filename'; +\set command '\\copy real_city FROM ' :'filename'; +:command ANALYZE real_city; SELECT * diff --git a/src/test/regress/sql/database.sql b/src/test/regress/sql/database.sql index 46ad263478..eb05584ed5 100644 --- a/src/test/regress/sql/database.sql +++ b/src/test/regress/sql/database.sql @@ -1,8 +1,6 @@ CREATE DATABASE regression_tbd ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0; ALTER DATABASE regression_tbd RENAME TO regression_utf8; -ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace; -ALTER DATABASE regression_utf8 RESET TABLESPACE; ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123; -- Test PgDatabaseToastTable. Doing this with GRANT would be slow. 
diff --git a/src/test/regress/sql/dependency.sql b/src/test/regress/sql/dependency.sql index 8d74ed7122..293194615e 100644 --- a/src/test/regress/sql/dependency.sql +++ b/src/test/regress/sql/dependency.sql @@ -2,10 +2,10 @@ -- DEPENDENCIES -- -CREATE USER regress_dep_user; -CREATE USER regress_dep_user2; -CREATE USER regress_dep_user3; -CREATE GROUP regress_dep_group; +CREATE USER regress_dep_user PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_dep_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE GROUP regress_dep_group PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE deptest (f1 serial primary key, f2 text); @@ -45,9 +45,9 @@ DROP TABLE deptest; DROP USER regress_dep_user3; -- Test DROP OWNED -CREATE USER regress_dep_user0; -CREATE USER regress_dep_user1; -CREATE USER regress_dep_user2; +CREATE USER regress_dep_user0 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_dep_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_dep_user0; -- permission denied DROP OWNED BY regress_dep_user1; diff --git a/src/test/regress/sql/drop_if_exists.sql b/src/test/regress/sql/drop_if_exists.sql index ac6168b91f..4270062ec7 100644 --- a/src/test/regress/sql/drop_if_exists.sql +++ b/src/test/regress/sql/drop_if_exists.sql @@ -86,9 +86,9 @@ DROP DOMAIN test_domain_exists; --- role/user/group --- -CREATE USER regress_test_u1; -CREATE ROLE regress_test_r1; -CREATE GROUP regress_test_g1; +CREATE USER regress_test_u1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_r1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE GROUP regress_test_g1 PASSWORD NEON_PASSWORD_PLACEHOLDER; DROP USER regress_test_u2; diff --git a/src/test/regress/sql/equivclass.sql b/src/test/regress/sql/equivclass.sql index 247b0a3105..bf018fd3a1 100644 --- a/src/test/regress/sql/equivclass.sql +++ b/src/test/regress/sql/equivclass.sql @@ 
-230,7 +230,7 @@ set enable_mergejoin = off; alter table ec1 enable row level security; create policy p1 on ec1 using (f1 < '5'::int8alias1); -create user regress_user_ectest; +create user regress_user_ectest PASSWORD NEON_PASSWORD_PLACEHOLDER; grant select on ec0 to regress_user_ectest; grant select on ec1 to regress_user_ectest; diff --git a/src/test/regress/sql/event_trigger.sql b/src/test/regress/sql/event_trigger.sql index 013546b830..616a46da1d 100644 --- a/src/test/regress/sql/event_trigger.sql +++ b/src/test/regress/sql/event_trigger.sql @@ -86,7 +86,7 @@ create event trigger regress_event_trigger2 on ddl_command_start comment on event trigger regress_event_trigger is 'test comment'; -- drop as non-superuser should fail -create role regress_evt_user; +create role regress_evt_user PASSWORD NEON_PASSWORD_PLACEHOLDER; set role regress_evt_user; create event trigger regress_event_trigger_noperms on ddl_command_start execute procedure test_event_trigger(); diff --git a/src/test/regress/sql/foreign_data.sql b/src/test/regress/sql/foreign_data.sql index aa147b14a9..370e0dd570 100644 --- a/src/test/regress/sql/foreign_data.sql +++ b/src/test/regress/sql/foreign_data.sql @@ -22,14 +22,14 @@ DROP ROLE IF EXISTS regress_foreign_data_user, regress_test_role, regress_test_r RESET client_min_messages; -CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER; +CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION 'regress_foreign_data_user'; -CREATE ROLE regress_test_role; -CREATE ROLE regress_test_role2; -CREATE ROLE regress_test_role_super SUPERUSER; -CREATE ROLE regress_test_indirect; -CREATE ROLE regress_unprivileged_role; +CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_test_indirect PASSWORD NEON_PASSWORD_PLACEHOLDER; 
+CREATE ROLE regress_unprivileged_role PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE FOREIGN DATA WRAPPER dummy; COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql index 8c4e4c7c83..e946cd2119 100644 --- a/src/test/regress/sql/foreign_key.sql +++ b/src/test/regress/sql/foreign_key.sql @@ -1435,7 +1435,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES DROP TABLE fk_partitioned_pk_6, fk_partitioned_fk_6; -- test the case when the referenced table is owned by a different user -create role regress_other_partitioned_fk_owner; +create role regress_other_partitioned_fk_owner PASSWORD NEON_PASSWORD_PLACEHOLDER; grant references on fk_notpartitioned_pk to regress_other_partitioned_fk_owner; set role regress_other_partitioned_fk_owner; create table other_partitioned_fk(a int, b int) partition by list (a); diff --git a/src/test/regress/sql/generated.sql b/src/test/regress/sql/generated.sql index cb55d77821..9c15ae954c 100644 --- a/src/test/regress/sql/generated.sql +++ b/src/test/regress/sql/generated.sql @@ -263,7 +263,7 @@ ALTER TABLE gtest10a DROP COLUMN b; INSERT INTO gtest10a (a) VALUES (1); -- privileges -CREATE USER regress_user11; +CREATE USER regress_user11 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE gtest11s (a int PRIMARY KEY, b int, c int GENERATED ALWAYS AS (b * 2) STORED); INSERT INTO gtest11s VALUES (1, 10), (2, 20); diff --git a/src/test/regress/sql/guc.sql b/src/test/regress/sql/guc.sql index dc79761955..a9ead75349 100644 --- a/src/test/regress/sql/guc.sql +++ b/src/test/regress/sql/guc.sql @@ -188,7 +188,7 @@ PREPARE foo AS SELECT 1; LISTEN foo_event; SET vacuum_cost_delay = 13; CREATE TEMP TABLE tmp_foo (data text) ON COMMIT DELETE ROWS; -CREATE ROLE regress_guc_user; +CREATE ROLE regress_guc_user PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_guc_user; -- look changes SELECT pg_listening_channels(); diff 
--git a/src/test/regress/sql/hash_index.sql b/src/test/regress/sql/hash_index.sql index 219da82981..bf99d2ec4c 100644 --- a/src/test/regress/sql/hash_index.sql +++ b/src/test/regress/sql/hash_index.sql @@ -26,10 +26,14 @@ CREATE TABLE hash_f8_heap ( ); \set filename :abs_srcdir '/data/hash.data' -COPY hash_i4_heap FROM :'filename'; -COPY hash_name_heap FROM :'filename'; -COPY hash_txt_heap FROM :'filename'; -COPY hash_f8_heap FROM :'filename'; +\set command '\\copy hash_i4_heap FROM ' :'filename'; +:command +\set command '\\copy hash_name_heap FROM ' :'filename'; +:command +\set command '\\copy hash_txt_heap FROM ' :'filename'; +:command +\set command '\\copy hash_f8_heap FROM ' :'filename'; +:command -- the data in this file has a lot of duplicates in the index key -- fields, leading to long bucket chains and lots of table expansion. diff --git a/src/test/regress/sql/identity.sql b/src/test/regress/sql/identity.sql index cb0e05a2f1..b11492bd31 100644 --- a/src/test/regress/sql/identity.sql +++ b/src/test/regress/sql/identity.sql @@ -287,7 +287,7 @@ ALTER TABLE itest7 ALTER COLUMN a RESTART; ALTER TABLE itest7 ALTER COLUMN a DROP IDENTITY; -- privileges -CREATE USER regress_identity_user1; +CREATE USER regress_identity_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE itest8 (a int GENERATED ALWAYS AS IDENTITY, b text); GRANT SELECT, INSERT ON itest8 TO regress_identity_user1; SET ROLE regress_identity_user1; diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql index 51251b0e51..3492f1cfef 100644 --- a/src/test/regress/sql/inherit.sql +++ b/src/test/regress/sql/inherit.sql @@ -770,8 +770,8 @@ drop table cnullparent cascade; -- -- Mixed ownership inheritance tree -- -create role regress_alice; -create role regress_bob; +create role regress_alice password NEON_PASSWORD_PLACEHOLDER; +create role regress_bob password NEON_PASSWORD_PLACEHOLDER; grant all on schema public to regress_alice, regress_bob; grant regress_alice to regress_bob; 
set session authorization regress_alice; @@ -1031,7 +1031,7 @@ create index on permtest_parent (left(c, 3)); insert into permtest_parent select 1, 'a', left(fipshash(i::text), 5) from generate_series(0, 100) i; analyze permtest_parent; -create role regress_no_child_access; +create role regress_no_child_access PASSWORD NEON_PASSWORD_PLACEHOLDER; revoke all on permtest_grandchild from regress_no_child_access; grant select on permtest_parent to regress_no_child_access; set session authorization regress_no_child_access; diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index 2b086eeb6d..913d8a0aed 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -513,7 +513,7 @@ drop table mlparted5; create table key_desc (a int, b int) partition by list ((a+0)); create table key_desc_1 partition of key_desc for values in (1) partition by range (b); -create user regress_insert_other_user; +create user regress_insert_other_user PASSWORD NEON_PASSWORD_PLACEHOLDER; grant select (a) on key_desc_1 to regress_insert_other_user; grant insert on key_desc to regress_insert_other_user; @@ -597,7 +597,7 @@ insert into brtrigpartcon1 values (1, 'hi there'); -- check that the message shows the appropriate column description in a -- situation where the partitioned table is not the primary ModifyTable node create table inserttest3 (f1 text default 'foo', f2 text default 'bar', f3 int); -create role regress_coldesc_role; +create role regress_coldesc_role PASSWORD NEON_PASSWORD_PLACEHOLDER; grant insert on inserttest3 to regress_coldesc_role; grant insert on brtrigpartcon to regress_coldesc_role; revoke select on brtrigpartcon from regress_coldesc_role; diff --git a/src/test/regress/sql/jsonb.sql b/src/test/regress/sql/jsonb.sql index 97bc2242a1..88c8b1dcdb 100644 --- a/src/test/regress/sql/jsonb.sql +++ b/src/test/regress/sql/jsonb.sql @@ -6,7 +6,8 @@ CREATE TABLE testjsonb ( ); \set filename :abs_srcdir '/data/jsonb.data' -COPY 
testjsonb FROM :'filename'; +\set command '\\copy testjsonb FROM ' :'filename'; +:command -- Strings. SELECT '""'::jsonb; -- OK. diff --git a/src/test/regress/sql/largeobject.sql b/src/test/regress/sql/largeobject.sql index a4aee02e3a..8839c9496a 100644 --- a/src/test/regress/sql/largeobject.sql +++ b/src/test/regress/sql/largeobject.sql @@ -10,7 +10,7 @@ SET bytea_output TO escape; -- Test ALTER LARGE OBJECT OWNER -CREATE ROLE regress_lo_user; +CREATE ROLE regress_lo_user PASSWORD NEON_PASSWORD_PLACEHOLDER; SELECT lo_create(42); ALTER LARGE OBJECT 42 OWNER TO regress_lo_user; @@ -189,7 +189,8 @@ SELECT lo_unlink(loid) from lotest_stash_values; TRUNCATE lotest_stash_values; \set filename :abs_srcdir '/data/tenk.data' -INSERT INTO lotest_stash_values (loid) SELECT lo_import(:'filename'); +\lo_import :filename +INSERT INTO lotest_stash_values (loid) VALUES (:LASTOID); BEGIN; UPDATE lotest_stash_values SET fd=lo_open(loid, CAST(x'20000' | x'40000' AS integer)); @@ -219,8 +220,8 @@ SELECT lo_close(fd) FROM lotest_stash_values; END; \set filename :abs_builddir '/results/lotest.txt' -SELECT lo_export(loid, :'filename') FROM lotest_stash_values; - +SELECT loid FROM lotest_stash_values \gset +\lo_export :loid, :filename \lo_import :filename \set newloid :LASTOID diff --git a/src/test/regress/sql/lock.sql b/src/test/regress/sql/lock.sql index b88488c6d0..78b31e6dd3 100644 --- a/src/test/regress/sql/lock.sql +++ b/src/test/regress/sql/lock.sql @@ -19,7 +19,7 @@ CREATE VIEW lock_view3 AS SELECT * from lock_view2; CREATE VIEW lock_view4 AS SELECT (select a from lock_tbl1a limit 1) from lock_tbl1; CREATE VIEW lock_view5 AS SELECT * from lock_tbl1 where a in (select * from lock_tbl1a); CREATE VIEW lock_view6 AS SELECT * from (select * from lock_tbl1) sub; -CREATE ROLE regress_rol_lock1; +CREATE ROLE regress_rol_lock1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ALTER ROLE regress_rol_lock1 SET search_path = lock_schema1; GRANT USAGE ON SCHEMA lock_schema1 TO regress_rol_lock1; diff --git 
a/src/test/regress/sql/matview.sql b/src/test/regress/sql/matview.sql index b74ee305e0..33b8b690fc 100644 --- a/src/test/regress/sql/matview.sql +++ b/src/test/regress/sql/matview.sql @@ -209,7 +209,7 @@ SELECT * FROM mvtest_mv_v; DROP TABLE mvtest_v CASCADE; -- make sure running as superuser works when MV owned by another role (bug #11208) -CREATE ROLE regress_user_mvtest; +CREATE ROLE regress_user_mvtest PASSWORD NEON_PASSWORD_PLACEHOLDER; SET ROLE regress_user_mvtest; -- this test case also checks for ambiguity in the queries issued by -- refresh_by_match_merge(), by choosing column names that intentionally @@ -266,7 +266,7 @@ ROLLBACK; -- INSERT privileges if relation owner is not allowed to insert. CREATE SCHEMA matview_schema; -CREATE USER regress_matview_user; +CREATE USER regress_matview_user PASSWORD NEON_PASSWORD_PLACEHOLDER; ALTER DEFAULT PRIVILEGES FOR ROLE regress_matview_user REVOKE INSERT ON TABLES FROM regress_matview_user; GRANT ALL ON SCHEMA matview_schema TO public; diff --git a/src/test/regress/sql/merge.sql b/src/test/regress/sql/merge.sql index 5ddcca84f8..99f4cef9ef 100644 --- a/src/test/regress/sql/merge.sql +++ b/src/test/regress/sql/merge.sql @@ -2,9 +2,9 @@ -- MERGE -- -CREATE USER regress_merge_privs; -CREATE USER regress_merge_no_privs; -CREATE USER regress_merge_none; +CREATE USER regress_merge_privs PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_merge_no_privs PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_merge_none PASSWORD NEON_PASSWORD_PLACEHOLDER; DROP TABLE IF EXISTS target; DROP TABLE IF EXISTS source; diff --git a/src/test/regress/sql/misc.sql b/src/test/regress/sql/misc.sql index 165a2e175f..08d7096e2c 100644 --- a/src/test/regress/sql/misc.sql +++ b/src/test/regress/sql/misc.sql @@ -74,22 +74,26 @@ DROP TABLE tmp; -- copy -- \set filename :abs_builddir '/results/onek.data' -COPY onek TO :'filename'; +\set command '\\copy onek TO ' :'filename'; +:command CREATE TEMP TABLE onek_copy (LIKE onek); -COPY 
onek_copy FROM :'filename'; +\set command '\\copy onek_copy FROM ' :'filename'; +:command SELECT * FROM onek EXCEPT ALL SELECT * FROM onek_copy; SELECT * FROM onek_copy EXCEPT ALL SELECT * FROM onek; \set filename :abs_builddir '/results/stud_emp.data' -COPY BINARY stud_emp TO :'filename'; +\set command '\\COPY BINARY stud_emp TO ' :'filename'; +:command CREATE TEMP TABLE stud_emp_copy (LIKE stud_emp); -COPY BINARY stud_emp_copy FROM :'filename'; +\set command '\\COPY BINARY stud_emp_copy FROM ' :'filename'; +:command SELECT * FROM stud_emp_copy; diff --git a/src/test/regress/sql/misc_functions.sql b/src/test/regress/sql/misc_functions.sql index 76470fcb3f..09746de223 100644 --- a/src/test/regress/sql/misc_functions.sql +++ b/src/test/regress/sql/misc_functions.sql @@ -82,7 +82,7 @@ SELECT pg_log_backend_memory_contexts(pg_backend_pid()); SELECT pg_log_backend_memory_contexts(pid) FROM pg_stat_activity WHERE backend_type = 'checkpointer'; -CREATE ROLE regress_log_memory; +CREATE ROLE regress_log_memory PASSWORD NEON_PASSWORD_PLACEHOLDER; SELECT has_function_privilege('regress_log_memory', 'pg_log_backend_memory_contexts(integer)', 'EXECUTE'); -- no @@ -169,7 +169,7 @@ select count(*) > 0 from -- -- Test replication slot directory functions -- -CREATE ROLE regress_slot_dir_funcs; +CREATE ROLE regress_slot_dir_funcs PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Not available by default. 
SELECT has_function_privilege('regress_slot_dir_funcs', 'pg_ls_logicalsnapdir()', 'EXECUTE'); @@ -252,7 +252,7 @@ FROM pg_walfile_name_offset('0/0'::pg_lsn + :segment_size - 1), pg_split_walfile_name(file_name); -- pg_current_logfile -CREATE ROLE regress_current_logfile; +CREATE ROLE regress_current_logfile PASSWORD NEON_PASSWORD_PLACEHOLDER; -- not available by default SELECT has_function_privilege('regress_current_logfile', 'pg_current_logfile()', 'EXECUTE'); diff --git a/src/test/regress/sql/multirangetypes.sql b/src/test/regress/sql/multirangetypes.sql index 41d5524285..373be031a2 100644 --- a/src/test/regress/sql/multirangetypes.sql +++ b/src/test/regress/sql/multirangetypes.sql @@ -704,7 +704,7 @@ drop type textrange2; -- Multiranges don't have their own ownership or permissions. -- create type textrange1 as range(subtype=text, multirange_type_name=multitextrange1, collation="C"); -create role regress_multirange_owner; +create role regress_multirange_owner password NEON_PASSWORD_PLACEHOLDER; alter type multitextrange1 owner to regress_multirange_owner; -- fail alter type textrange1 owner to regress_multirange_owner; diff --git a/src/test/regress/sql/object_address.sql b/src/test/regress/sql/object_address.sql index 1a6c61f49d..1c31ac6a53 100644 --- a/src/test/regress/sql/object_address.sql +++ b/src/test/regress/sql/object_address.sql @@ -7,7 +7,7 @@ SET client_min_messages TO 'warning'; DROP ROLE IF EXISTS regress_addr_user; RESET client_min_messages; -CREATE USER regress_addr_user; +CREATE USER regress_addr_user PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Test generic object addressing/identification functions CREATE SCHEMA addr_nsp; diff --git a/src/test/regress/sql/password.sql b/src/test/regress/sql/password.sql index bb82aa4aa2..dd8a05e24d 100644 --- a/src/test/regress/sql/password.sql +++ b/src/test/regress/sql/password.sql @@ -10,13 +10,11 @@ SET password_encryption = 'scram-sha-256'; -- ok -- consistency of password entries SET password_encryption = 
'md5'; -CREATE ROLE regress_passwd1; -ALTER ROLE regress_passwd1 PASSWORD 'role_pwd1'; -CREATE ROLE regress_passwd2; -ALTER ROLE regress_passwd2 PASSWORD 'role_pwd2'; +CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET password_encryption = 'scram-sha-256'; -CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3'; -CREATE ROLE regress_passwd4 PASSWORD NULL; +CREATE ROLE regress_passwd3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- check list of created entries -- @@ -44,26 +42,19 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; --- already encrypted, use as they are -ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; -ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- already encrypted with MD5, use as it is -CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +-- Neon does not support encrypted passwords, use unencrypted instead +CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; --- This looks like a valid SCRAM-SHA-256 secret, but it is not --- so it should be hashed with SCRAM-SHA-256. -CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; --- These may look like valid MD5 secrets, but they are not, so they --- should be hashed with SCRAM-SHA-256. 
--- trailing garbage at the end -CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; --- invalid length -CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; +-- Neon does not support encrypted passwords, use unencrypted instead +CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Changing the SCRAM iteration count SET scram_iterations = 1024; @@ -80,13 +71,10 @@ ALTER ROLE regress_passwd_empty PASSWORD 'md585939a5ce845f1a1b620742e3c659e0a'; ALTER ROLE regress_passwd_empty PASSWORD 'SCRAM-SHA-256$4096:hpFyHTUsSWcR7O9P$LgZFIt6Oqdo27ZFKbZ2nV+vtnYM995pDh9ca6WSi120=:qVV5NeluNfUPkwm7Vqat25RjSPLkGeoZBQs6wVv+um4='; SELECT rolpassword FROM pg_authid WHERE rolname='regress_passwd_empty'; --- Test with invalid stored and server keys. --- --- The first is valid, to act as a control. The others have too long --- stored/server keys. They will be re-hashed. 
-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; +-- Neon does not support encrypted passwords, use unencrypted instead +CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Check that the invalid secrets were re-hashed. A re-hashed secret -- should not contain the original salt. 
diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index b7e1cb6cdd..6e5a2217f1 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -24,18 +24,18 @@ RESET client_min_messages; -- test proper begins here -CREATE USER regress_priv_user1; -CREATE USER regress_priv_user2; -CREATE USER regress_priv_user3; -CREATE USER regress_priv_user4; -CREATE USER regress_priv_user5; -CREATE USER regress_priv_user5; -- duplicate -CREATE USER regress_priv_user6; -CREATE USER regress_priv_user7; -CREATE USER regress_priv_user8; -CREATE USER regress_priv_user9; -CREATE USER regress_priv_user10; -CREATE ROLE regress_priv_role; +CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- duplicate +CREATE USER regress_priv_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user7 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user8 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user9 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user10 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_priv_role PASSWORD NEON_PASSWORD_PLACEHOLDER; -- circular ADMIN OPTION grants should be disallowed GRANT regress_priv_user1 TO regress_priv_user2 WITH ADMIN OPTION; @@ -84,11 +84,11 @@ DROP ROLE regress_priv_user5; -- should fail, dependency DROP ROLE regress_priv_user1, regress_priv_user5; -- ok, despite order -- recreate the roles we just dropped -CREATE USER regress_priv_user1; -CREATE USER regress_priv_user2; -CREATE USER regress_priv_user3; -CREATE USER regress_priv_user4; -CREATE USER regress_priv_user5; +CREATE 
USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT pg_read_all_data TO regress_priv_user6; GRANT pg_write_all_data TO regress_priv_user7; @@ -163,8 +163,8 @@ DROP USER regress_priv_user10; DROP USER regress_priv_user9; DROP USER regress_priv_user8; -CREATE GROUP regress_priv_group1; -CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 USER regress_priv_user2; +CREATE GROUP regress_priv_group1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2; ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4; @@ -1160,7 +1160,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP -- security-restricted operations \c - -CREATE ROLE regress_sro_user; +CREATE ROLE regress_sro_user PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Check that index expressions and predicates are run as the table's owner @@ -1656,8 +1656,8 @@ DROP SCHEMA testns CASCADE; -- Change owner of the schema & and rename of new schema owner \c - -CREATE ROLE regress_schemauser1 superuser login; -CREATE ROLE regress_schemauser2 superuser login; +CREATE ROLE regress_schemauser1 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_schemauser2 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION ROLE regress_schemauser1; CREATE SCHEMA testns; @@ -1751,7 +1751,7 @@ DROP USER regress_priv_user8; -- does not exist -- permissions with LOCK TABLE -CREATE USER regress_locktable_user; +CREATE USER regress_locktable_user PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE lock_table (a int); -- LOCK TABLE and SELECT permission @@ -1854,7 +1854,7 @@ DROP USER 
regress_locktable_user; -- switch to superuser \c - -CREATE ROLE regress_readallstats; +CREATE ROLE regress_readallstats PASSWORD NEON_PASSWORD_PLACEHOLDER; SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no @@ -1874,10 +1874,10 @@ RESET ROLE; DROP ROLE regress_readallstats; -- test role grantor machinery -CREATE ROLE regress_group; -CREATE ROLE regress_group_direct_manager; -CREATE ROLE regress_group_indirect_manager; -CREATE ROLE regress_group_member; +CREATE ROLE regress_group PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_group_direct_manager PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_group_indirect_manager PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_group_member PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE; GRANT regress_group_direct_manager TO regress_group_indirect_manager; @@ -1899,9 +1899,9 @@ DROP ROLE regress_group_indirect_manager; DROP ROLE regress_group_member; -- test SET and INHERIT options with object ownership changes -CREATE ROLE regress_roleoption_protagonist; -CREATE ROLE regress_roleoption_donor; -CREATE ROLE regress_roleoption_recipient; +CREATE ROLE regress_roleoption_protagonist PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_roleoption_donor PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_roleoption_recipient PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE SCHEMA regress_roleoption; GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC; GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE; @@ -1929,9 +1929,9 @@ DROP ROLE regress_roleoption_donor; DROP ROLE regress_roleoption_recipient; -- MAINTAIN -CREATE ROLE regress_no_maintain; -CREATE ROLE regress_maintain; -CREATE ROLE regress_maintain_all IN ROLE pg_maintain; +CREATE ROLE 
regress_no_maintain PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_maintain PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_maintain_all IN ROLE pg_maintain PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE maintain_test (a INT); CREATE INDEX ON maintain_test (a); GRANT MAINTAIN ON maintain_test TO regress_maintain; diff --git a/src/test/regress/sql/psql.sql b/src/test/regress/sql/psql.sql index 3b3c6f6e29..b09d6231f8 100644 --- a/src/test/regress/sql/psql.sql +++ b/src/test/regress/sql/psql.sql @@ -500,7 +500,7 @@ select 1 where false; \pset expanded off CREATE SCHEMA tableam_display; -CREATE ROLE regress_display_role; +CREATE ROLE regress_display_role PASSWORD NEON_PASSWORD_PLACEHOLDER; ALTER SCHEMA tableam_display OWNER TO regress_display_role; SET search_path TO tableam_display; CREATE ACCESS METHOD heap_psql TYPE TABLE HANDLER heap_tableam_handler; @@ -1182,7 +1182,7 @@ reset debug_parallel_query; \unset FETCH_COUNT create schema testpart; -create role regress_partitioning_role; +create role regress_partitioning_role PASSWORD NEON_PASSWORD_PLACEHOLDER; alter schema testpart owner to regress_partitioning_role; @@ -1293,7 +1293,7 @@ reset work_mem; -- check \df+ -- we have to use functions with a predictable owner name, so make a role -create role regress_psql_user superuser; +create role regress_psql_user superuser PASSWORD NEON_PASSWORD_PLACEHOLDER; begin; set session authorization regress_psql_user; @@ -1439,11 +1439,14 @@ CREATE TEMPORARY TABLE reload_output( ); SELECT 1 AS a \g :g_out_file -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command SELECT 2 AS b\; SELECT 3 AS c\; SELECT 4 AS d \g :g_out_file -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command COPY (SELECT 'foo') TO STDOUT \; COPY (SELECT 'bar') TO STDOUT \g :g_out_file -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY 
reload_output(line) FROM ' :'g_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; TRUNCATE TABLE reload_output; @@ -1460,17 +1463,20 @@ SELECT 1 AS a\; SELECT 2 AS b\; SELECT 3 AS c; -- COPY TO file -- The data goes to :g_out_file and the status to :o_out_file \set QUIET false -COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO :'g_out_file'; +\set command '\\COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO ' :'g_out_file'; +:command -- DML command status UPDATE onek SET unique1 = unique1 WHERE false; \set QUIET true \o -- Check the contents of the files generated. -COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; TRUNCATE TABLE reload_output; -COPY reload_output(line) FROM :'o_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'o_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; TRUNCATE TABLE reload_output; @@ -1483,10 +1489,12 @@ COPY (SELECT 'foo2') TO STDOUT \; COPY (SELECT 'bar2') TO STDOUT \g :g_out_file \o -- Check the contents of the files generated. 
-COPY reload_output(line) FROM :'g_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; TRUNCATE TABLE reload_output; -COPY reload_output(line) FROM :'o_out_file'; +\set command '\\COPY reload_output(line) FROM ' :'o_out_file'; +:command SELECT line FROM reload_output ORDER BY lineno; DROP TABLE reload_output; @@ -1834,10 +1842,10 @@ DROP FUNCTION psql_error; \dX "no.such.database"."no.such.schema"."no.such.extended.statistics" -- check \drg and \du -CREATE ROLE regress_du_role0; -CREATE ROLE regress_du_role1; -CREATE ROLE regress_du_role2; -CREATE ROLE regress_du_admin; +CREATE ROLE regress_du_role0 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_du_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_du_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_du_admin PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT regress_du_role0 TO regress_du_admin WITH ADMIN TRUE; GRANT regress_du_role1 TO regress_du_admin WITH ADMIN TRUE; diff --git a/src/test/regress/sql/publication.sql b/src/test/regress/sql/publication.sql index 479d4f3264..6d348a93e7 100644 --- a/src/test/regress/sql/publication.sql +++ b/src/test/regress/sql/publication.sql @@ -1,9 +1,9 @@ -- -- PUBLICATION -- -CREATE ROLE regress_publication_user LOGIN SUPERUSER; -CREATE ROLE regress_publication_user2; -CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER; +CREATE ROLE regress_publication_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_publication_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION 'regress_publication_user'; -- suppress warning that depends on wal_level @@ -810,7 +810,7 @@ DROP PUBLICATION testpub2; DROP PUBLICATION testpub3; SET ROLE regress_publication_user; -CREATE ROLE regress_publication_user3; +CREATE ROLE regress_publication_user3 
PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT regress_publication_user2 TO regress_publication_user3; SET client_min_messages = 'ERROR'; CREATE PUBLICATION testpub4 FOR TABLES IN SCHEMA pub_test; diff --git a/src/test/regress/sql/regproc.sql b/src/test/regress/sql/regproc.sql index 232289ac39..d967ef0cd3 100644 --- a/src/test/regress/sql/regproc.sql +++ b/src/test/regress/sql/regproc.sql @@ -4,7 +4,7 @@ /* If objects exist, return oids */ -CREATE ROLE regress_regrole_test; +CREATE ROLE regress_regrole_test PASSWORD NEON_PASSWORD_PLACEHOLDER; -- without schemaname diff --git a/src/test/regress/sql/roleattributes.sql b/src/test/regress/sql/roleattributes.sql index c961b2d730..0859b89c4f 100644 --- a/src/test/regress/sql/roleattributes.sql +++ b/src/test/regress/sql/roleattributes.sql @@ -1,83 +1,83 @@ -- default for superuser is false -CREATE ROLE regress_test_def_superuser; +CREATE ROLE regress_test_def_superuser PASSWORD NEON_PASSWORD_PLACEHOLDER; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser'; -CREATE ROLE regress_test_superuser WITH SUPERUSER; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser'; +CREATE ROLE regress_test_superuser WITH SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, 
'(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; ALTER ROLE regress_test_superuser WITH NOSUPERUSER; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; ALTER ROLE regress_test_superuser WITH SUPERUSER; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; -- default for inherit is true -CREATE ROLE regress_test_def_inherit; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit'; -CREATE ROLE regress_test_inherit WITH NOINHERIT; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; +CREATE ROLE regress_test_def_inherit PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, 
rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit'; +CREATE ROLE regress_test_inherit WITH NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; ALTER ROLE regress_test_inherit WITH INHERIT; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; ALTER ROLE regress_test_inherit WITH NOINHERIT; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; -- default for create role is false -CREATE ROLE regress_test_def_createrole; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, 
rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole'; -CREATE ROLE regress_test_createrole WITH CREATEROLE; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; +CREATE ROLE regress_test_def_createrole PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole'; +CREATE ROLE regress_test_createrole WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; ALTER ROLE regress_test_createrole WITH NOCREATEROLE; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; ALTER ROLE regress_test_createrole WITH CREATEROLE; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; 
+SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; -- default for create database is false -CREATE ROLE regress_test_def_createdb; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb'; -CREATE ROLE regress_test_createdb WITH CREATEDB; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; +CREATE ROLE regress_test_def_createdb PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb'; +CREATE ROLE regress_test_createdb WITH CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; ALTER ROLE regress_test_createdb WITH NOCREATEDB; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, 
rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; ALTER ROLE regress_test_createdb WITH CREATEDB; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; -- default for can login is false for role -CREATE ROLE regress_test_def_role_canlogin; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin'; -CREATE ROLE regress_test_role_canlogin WITH LOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; +CREATE ROLE regress_test_def_role_canlogin PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin'; +CREATE ROLE regress_test_role_canlogin WITH LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, 
regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; ALTER ROLE regress_test_role_canlogin WITH NOLOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; ALTER ROLE regress_test_role_canlogin WITH LOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; -- default for can login is true for user -CREATE USER regress_test_def_user_canlogin; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin'; -CREATE USER regress_test_user_canlogin WITH NOLOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; +CREATE USER regress_test_def_user_canlogin PASSWORD 
NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin'; +CREATE USER regress_test_user_canlogin WITH NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; ALTER USER regress_test_user_canlogin WITH LOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; ALTER USER regress_test_user_canlogin WITH NOLOGIN; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; -- default for replication is false -CREATE ROLE 
regress_test_def_replication; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication'; -CREATE ROLE regress_test_replication WITH REPLICATION; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; +CREATE ROLE regress_test_def_replication PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication'; +CREATE ROLE regress_test_replication WITH REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; ALTER ROLE regress_test_replication WITH NOREPLICATION; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; ALTER ROLE regress_test_replication WITH REPLICATION; -SELECT rolname, rolsuper, rolinherit, 
rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; -- default for bypassrls is false -CREATE ROLE regress_test_def_bypassrls; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls'; -CREATE ROLE regress_test_bypassrls WITH BYPASSRLS; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; +CREATE ROLE regress_test_def_bypassrls PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls'; +CREATE ROLE regress_test_bypassrls WITH BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; ALTER ROLE regress_test_bypassrls WITH NOBYPASSRLS; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, 
rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; ALTER ROLE regress_test_bypassrls WITH BYPASSRLS; -SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; +SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; -- clean up roles DROP ROLE regress_test_def_superuser; diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index eab7d99003..0cf1139e01 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -20,13 +20,13 @@ DROP SCHEMA IF EXISTS regress_rls_schema CASCADE; RESET client_min_messages; -- initial setup -CREATE USER regress_rls_alice NOLOGIN; -CREATE USER regress_rls_bob NOLOGIN; -CREATE USER regress_rls_carol NOLOGIN; -CREATE USER regress_rls_dave NOLOGIN; -CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN; -CREATE ROLE regress_rls_group1 NOLOGIN; -CREATE ROLE regress_rls_group2 NOLOGIN; +CREATE USER regress_rls_alice NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_rls_bob NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_rls_carol NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_rls_dave NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_rls_exempt_user 
BYPASSRLS NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_rls_group1 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_rls_group2 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT regress_rls_group1 TO regress_rls_bob; GRANT regress_rls_group2 TO regress_rls_carol; @@ -2105,8 +2105,8 @@ SELECT count(*) = 0 FROM pg_depend -- DROP OWNED BY testing RESET SESSION AUTHORIZATION; -CREATE ROLE regress_rls_dob_role1; -CREATE ROLE regress_rls_dob_role2; +CREATE ROLE regress_rls_dob_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_rls_dob_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE dob_t1 (c1 int); CREATE TABLE dob_t2 (c1 int) PARTITION BY RANGE (c1); diff --git a/src/test/regress/sql/rules.sql b/src/test/regress/sql/rules.sql index 4a5fa50585..a9e9eab77d 100644 --- a/src/test/regress/sql/rules.sql +++ b/src/test/regress/sql/rules.sql @@ -1390,7 +1390,7 @@ DROP TABLE ruletest2; -- Test non-SELECT rule on security invoker view. -- Should use view owner's permissions. 
-- -CREATE USER regress_rule_user1; +CREATE USER regress_rule_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE ruletest_t1 (x int); CREATE TABLE ruletest_t2 (x int); diff --git a/src/test/regress/sql/security_label.sql b/src/test/regress/sql/security_label.sql index 98e6a5f211..68c868fef2 100644 --- a/src/test/regress/sql/security_label.sql +++ b/src/test/regress/sql/security_label.sql @@ -10,8 +10,8 @@ DROP ROLE IF EXISTS regress_seclabel_user2; RESET client_min_messages; -CREATE USER regress_seclabel_user1 WITH CREATEROLE; -CREATE USER regress_seclabel_user2; +CREATE USER regress_seclabel_user1 WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_seclabel_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE TABLE seclabel_tbl1 (a int, b text); CREATE TABLE seclabel_tbl2 (x int, y text); diff --git a/src/test/regress/sql/select_into.sql b/src/test/regress/sql/select_into.sql index 689c448cc2..223ceb1d75 100644 --- a/src/test/regress/sql/select_into.sql +++ b/src/test/regress/sql/select_into.sql @@ -20,7 +20,7 @@ DROP TABLE sitmp1; -- SELECT INTO and INSERT permission, if owner is not allowed to insert. -- CREATE SCHEMA selinto_schema; -CREATE USER regress_selinto_user; +CREATE USER regress_selinto_user PASSWORD NEON_PASSWORD_PLACEHOLDER; ALTER DEFAULT PRIVILEGES FOR ROLE regress_selinto_user REVOKE INSERT ON TABLES FROM regress_selinto_user; GRANT ALL ON SCHEMA selinto_schema TO public; diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql index 3e4bfcb71f..99757eff3c 100644 --- a/src/test/regress/sql/select_parallel.sql +++ b/src/test/regress/sql/select_parallel.sql @@ -498,7 +498,7 @@ SELECT 1 FROM tenk1_vw_sec rollback; -- test that function option SET ROLE works in parallel workers. 
-create role regress_parallel_worker; +create role regress_parallel_worker PASSWORD NEON_PASSWORD_PLACEHOLDER; create function set_and_report_role() returns text as $$ select current_setting('role') $$ language sql parallel safe diff --git a/src/test/regress/sql/select_views.sql b/src/test/regress/sql/select_views.sql index e742f13699..7bd0255df8 100644 --- a/src/test/regress/sql/select_views.sql +++ b/src/test/regress/sql/select_views.sql @@ -12,7 +12,7 @@ SELECT * FROM toyemp WHERE name = 'sharon'; -- -- Test for Leaky view scenario -- -CREATE ROLE regress_alice; +CREATE ROLE regress_alice PASSWORD NEON_PASSWORD_PLACEHOLDER; CREATE FUNCTION f_leak (text) RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001 diff --git a/src/test/regress/sql/sequence.sql b/src/test/regress/sql/sequence.sql index 793f1415f6..ec07c1f193 100644 --- a/src/test/regress/sql/sequence.sql +++ b/src/test/regress/sql/sequence.sql @@ -293,7 +293,7 @@ ROLLBACK; -- privileges tests -CREATE USER regress_seq_user; +CREATE USER regress_seq_user PASSWORD NEON_PASSWORD_PLACEHOLDER; -- nextval BEGIN; diff --git a/src/test/regress/sql/stats.sql b/src/test/regress/sql/stats.sql index d8ac0d06f4..c9cfcea208 100644 --- a/src/test/regress/sql/stats.sql +++ b/src/test/regress/sql/stats.sql @@ -631,23 +631,6 @@ SELECT :io_sum_shared_after_writes > :io_sum_shared_before_writes; SELECT current_setting('fsync') = 'off' OR :io_sum_shared_after_fsyncs > :io_sum_shared_before_fsyncs; --- Change the tablespace so that the table is rewritten directly, then SELECT --- from it to cause it to be read back into shared buffers. -SELECT sum(reads) AS io_sum_shared_before_reads - FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset --- Do this in a transaction to prevent spurious failures due to concurrent accesses to our newly --- rewritten table, e.g. by autovacuum. 
-BEGIN; -ALTER TABLE test_io_shared SET TABLESPACE regress_tblspace; --- SELECT from the table so that the data is read into shared buffers and --- context 'normal', object 'relation' reads are counted. -SELECT COUNT(*) FROM test_io_shared; -COMMIT; -SELECT pg_stat_force_next_flush(); -SELECT sum(reads) AS io_sum_shared_after_reads - FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset -SELECT :io_sum_shared_after_reads > :io_sum_shared_before_reads; - SELECT sum(hits) AS io_sum_shared_before_hits FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset -- Select from the table again to count hits. diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 0c08a6cc42..7a5b1036d8 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -50,7 +50,7 @@ DROP TABLE ext_stats_test; CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER); CREATE STATISTICS IF NOT EXISTS ab1_a_b_stats ON a, b FROM ab1; COMMENT ON STATISTICS ab1_a_b_stats IS 'new comment'; -CREATE ROLE regress_stats_ext; +CREATE ROLE regress_stats_ext PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_stats_ext; COMMENT ON STATISTICS ab1_a_b_stats IS 'changed comment'; DROP STATISTICS ab1_a_b_stats; @@ -1607,7 +1607,7 @@ drop statistics stts_t1_expr_expr_stat; set search_path to public, stts_s1; \dX -create role regress_stats_ext nosuperuser; +create role regress_stats_ext nosuperuser PASSWORD NEON_PASSWORD_PLACEHOLDER; set role regress_stats_ext; \dX reset role; @@ -1618,7 +1618,7 @@ drop user regress_stats_ext; reset search_path; -- User with no access -CREATE USER regress_stats_user1; +CREATE USER regress_stats_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT USAGE ON SCHEMA tststats TO regress_stats_user1; SET SESSION AUTHORIZATION regress_stats_user1; SELECT * FROM tststats.priv_test_tbl; -- Permission denied diff --git a/src/test/regress/sql/subscription.sql 
b/src/test/regress/sql/subscription.sql index 3e5ba4cb8c..a35f030908 100644 --- a/src/test/regress/sql/subscription.sql +++ b/src/test/regress/sql/subscription.sql @@ -2,10 +2,10 @@ -- SUBSCRIPTION -- -CREATE ROLE regress_subscription_user LOGIN SUPERUSER; -CREATE ROLE regress_subscription_user2; -CREATE ROLE regress_subscription_user3 IN ROLE pg_create_subscription; -CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER; +CREATE ROLE regress_subscription_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_subscription_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_subscription_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_create_subscription; +CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION 'regress_subscription_user'; -- fail - no publications diff --git a/src/test/regress/sql/test_setup.sql b/src/test/regress/sql/test_setup.sql index 06b0e2121f..01444f9426 100644 --- a/src/test/regress/sql/test_setup.sql +++ b/src/test/regress/sql/test_setup.sql @@ -135,7 +135,8 @@ CREATE TABLE onek ( ); \set filename :abs_srcdir '/data/onek.data' -COPY onek FROM :'filename'; +\set command '\\copy onek FROM ' :'filename'; +:command VACUUM ANALYZE onek; CREATE TABLE onek2 AS SELECT * FROM onek; @@ -161,7 +162,8 @@ CREATE TABLE tenk1 ( ); \set filename :abs_srcdir '/data/tenk.data' -COPY tenk1 FROM :'filename'; +\set command '\\copy tenk1 FROM ' :'filename'; +:command VACUUM ANALYZE tenk1; CREATE TABLE tenk2 AS SELECT * FROM tenk1; @@ -174,7 +176,8 @@ CREATE TABLE person ( ); \set filename :abs_srcdir '/data/person.data' -COPY person FROM :'filename'; +\set command '\\copy person FROM ' :'filename'; +:command VACUUM ANALYZE person; CREATE TABLE emp ( @@ -183,7 +186,8 @@ CREATE TABLE emp ( ) INHERITS (person); \set filename :abs_srcdir '/data/emp.data' -COPY emp FROM :'filename'; +\set command '\\copy emp FROM ' :'filename'; +:command VACUUM ANALYZE 
emp; CREATE TABLE student ( @@ -191,7 +195,8 @@ CREATE TABLE student ( ) INHERITS (person); \set filename :abs_srcdir '/data/student.data' -COPY student FROM :'filename'; +\set command '\\copy student FROM ' :'filename'; +:command VACUUM ANALYZE student; CREATE TABLE stud_emp ( @@ -199,7 +204,8 @@ CREATE TABLE stud_emp ( ) INHERITS (emp, student); \set filename :abs_srcdir '/data/stud_emp.data' -COPY stud_emp FROM :'filename'; +\set command '\\copy stud_emp FROM ' :'filename'; +:command VACUUM ANALYZE stud_emp; CREATE TABLE road ( @@ -208,7 +214,8 @@ CREATE TABLE road ( ); \set filename :abs_srcdir '/data/streets.data' -COPY road FROM :'filename'; +\set command '\\copy road FROM ' :'filename'; +:command VACUUM ANALYZE road; CREATE TABLE ihighway () INHERITS (road); diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql index fbd26cdba4..7ec2d78eee 100644 --- a/src/test/regress/sql/tsearch.sql +++ b/src/test/regress/sql/tsearch.sql @@ -49,7 +49,8 @@ CREATE TABLE test_tsvector( ); \set filename :abs_srcdir '/data/tsearch.data' -COPY test_tsvector FROM :'filename'; +\set command '\\copy test_tsvector FROM ' :'filename'; +:command ANALYZE test_tsvector; diff --git a/src/test/regress/sql/updatable_views.sql b/src/test/regress/sql/updatable_views.sql index 93b693ae83..2983475265 100644 --- a/src/test/regress/sql/updatable_views.sql +++ b/src/test/regress/sql/updatable_views.sql @@ -569,9 +569,9 @@ DROP TABLE base_tbl CASCADE; -- permissions checks -CREATE USER regress_view_user1; -CREATE USER regress_view_user2; -CREATE USER regress_view_user3; +CREATE USER regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE USER regress_view_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET SESSION AUTHORIZATION regress_view_user1; CREATE TABLE base_tbl(a int, b text, c float); @@ -1909,8 +1909,8 @@ drop view uv_iocu_view; drop table uv_iocu_tab; -- ON CONFLICT DO UPDATE permissions 
checks -create user regress_view_user1; -create user regress_view_user2; +create user regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +create user regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; set session authorization regress_view_user1; create table base_tbl(a int unique, b text, c float); diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql index 8b4707eb9c..b9041f8134 100644 --- a/src/test/regress/sql/update.sql +++ b/src/test/regress/sql/update.sql @@ -342,7 +342,7 @@ DROP FUNCTION func_parted_mod_b(); ----------------------------------------- ALTER TABLE range_parted ENABLE ROW LEVEL SECURITY; -CREATE USER regress_range_parted_user; +CREATE USER regress_range_parted_user PASSWORD NEON_PASSWORD_PLACEHOLDER; GRANT ALL ON range_parted, mintab TO regress_range_parted_user; CREATE POLICY seeall ON range_parted AS PERMISSIVE FOR SELECT USING (true); CREATE POLICY policy_range_parted ON range_parted for UPDATE USING (true) WITH CHECK (c % 2 = 0); diff --git a/src/test/regress/sql/vacuum.sql b/src/test/regress/sql/vacuum.sql index 548cd7acca..5b15d4dab0 100644 --- a/src/test/regress/sql/vacuum.sql +++ b/src/test/regress/sql/vacuum.sql @@ -335,7 +335,7 @@ CREATE TABLE vacowned (a int); CREATE TABLE vacowned_parted (a int) PARTITION BY LIST (a); CREATE TABLE vacowned_part1 PARTITION OF vacowned_parted FOR VALUES IN (1); CREATE TABLE vacowned_part2 PARTITION OF vacowned_parted FOR VALUES IN (2); -CREATE ROLE regress_vacuum; +CREATE ROLE regress_vacuum PASSWORD NEON_PASSWORD_PLACEHOLDER; SET ROLE regress_vacuum; -- Simple table VACUUM vacowned; ================================================ FILE: compute/patches/contrib_pg16.patch ================================================ diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out index 979e5e8..2375b45 100644 --- a/contrib/amcheck/expected/check_heap.out +++ b/contrib/amcheck/expected/check_heap.out @@ -80,12 +80,9 @@ INSERT INTO 
heaptest (a, b) -- same transaction. The heaptest table is smaller than the default -- wal_skip_threshold, so a wal_level=minimal commit reads the table into -- shared_buffers. A transaction delays that and excludes any autovacuum. -SET allow_in_place_tablespaces = true; -CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; SELECT sum(reads) AS stats_bulkreads_before FROM pg_stat_io WHERE context = 'bulkread' \gset BEGIN; -ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; -- Check that valid options are not rejected nor corruption reported -- for a non-empty table SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); @@ -118,14 +115,6 @@ SELECT pg_stat_force_next_flush(); (1 row) -SELECT sum(reads) AS stats_bulkreads_after - FROM pg_stat_io WHERE context = 'bulkread' \gset -SELECT :stats_bulkreads_after > :stats_bulkreads_before; - ?column? ----------- - t -(1 row) - CREATE ROLE regress_heaptest_role; -- verify permissions are checked (error due to function not callable) SET ROLE regress_heaptest_role; @@ -233,7 +222,6 @@ ERROR: cannot check relation "test_foreign_table" DETAIL: This operation is not supported for foreign tables. -- cleanup DROP TABLE heaptest; -DROP TABLESPACE regress_test_stats_tblspc; DROP TABLE test_partition; DROP TABLE test_partitioned; DROP OWNED BY regress_heaptest_role; -- permissions diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql index 1745bae..3b429c3 100644 --- a/contrib/amcheck/sql/check_heap.sql +++ b/contrib/amcheck/sql/check_heap.sql @@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b) -- same transaction. The heaptest table is smaller than the default -- wal_skip_threshold, so a wal_level=minimal commit reads the table into -- shared_buffers. A transaction delays that and excludes any autovacuum. 
-SET allow_in_place_tablespaces = true; -CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; SELECT sum(reads) AS stats_bulkreads_before FROM pg_stat_io WHERE context = 'bulkread' \gset BEGIN; -ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; -- Check that valid options are not rejected nor corruption reported -- for a non-empty table SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); @@ -58,9 +55,6 @@ COMMIT; -- ALTER TABLE ... SET TABLESPACE ... -- causing an additional bulkread, which should be reflected in pg_stat_io. SELECT pg_stat_force_next_flush(); -SELECT sum(reads) AS stats_bulkreads_after - FROM pg_stat_io WHERE context = 'bulkread' \gset -SELECT :stats_bulkreads_after > :stats_bulkreads_before; CREATE ROLE regress_heaptest_role; @@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table', -- cleanup DROP TABLE heaptest; -DROP TABLESPACE regress_test_stats_tblspc; DROP TABLE test_partition; DROP TABLE test_partitioned; DROP OWNED BY regress_heaptest_role; -- permissions diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out index 33be13a..70a406c 100644 --- a/contrib/citext/expected/create_index_acl.out +++ b/contrib/citext/expected/create_index_acl.out @@ -5,9 +5,6 @@ -- owner having as few applicable privileges as possible. (The privileges.sql -- regress_sro_user tests look for the opposite defect; they confirm that -- DefineIndex() uses the table owner userid where necessary.) 
-SET allow_in_place_tablespaces = true; -CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; -RESET allow_in_place_tablespaces; BEGIN; CREATE ROLE regress_minimal; CREATE SCHEMA s; @@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; -- Empty-table DefineIndex() CREATE UNIQUE INDEX u0rows ON s.x USING btree ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) - TABLESPACE regress_create_idx_tblspace WHERE s.index_row_if(y); ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) - USING INDEX TABLESPACE regress_create_idx_tblspace WHERE (s.index_row_if(y)); -- Make the table nonempty. INSERT INTO s.x VALUES ('foo'), ('bar'); @@ -66,11 +61,9 @@ RESET search_path; GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; CREATE UNIQUE INDEX u2rows ON s.x USING btree ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) - TABLESPACE regress_create_idx_tblspace WHERE s.index_row_if(y); ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) - USING INDEX TABLESPACE regress_create_idx_tblspace WHERE (s.index_row_if(y)); -- Shall not find s.coll via search_path, despite the s.const->public.setter -- call having set search_path=s during expression planning. 
Suppress the @@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree \set VERBOSITY sqlstate ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) - USING INDEX TABLESPACE regress_create_idx_tblspace WHERE (s.index_row_if(y)); ERROR: 42704 \set VERBOSITY default ROLLBACK; -DROP TABLESPACE regress_create_idx_tblspace; diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql index 10b5225..ae442e1 100644 --- a/contrib/citext/sql/create_index_acl.sql +++ b/contrib/citext/sql/create_index_acl.sql @@ -6,10 +6,6 @@ -- regress_sro_user tests look for the opposite defect; they confirm that -- DefineIndex() uses the table owner userid where necessary.) -SET allow_in_place_tablespaces = true; -CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; -RESET allow_in_place_tablespaces; - BEGIN; CREATE ROLE regress_minimal; CREATE SCHEMA s; @@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; -- Empty-table DefineIndex() CREATE UNIQUE INDEX u0rows ON s.x USING btree ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) - TABLESPACE regress_create_idx_tblspace WHERE s.index_row_if(y); ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) - USING INDEX TABLESPACE regress_create_idx_tblspace WHERE (s.index_row_if(y)); -- Make the table nonempty. 
INSERT INTO s.x VALUES ('foo'), ('bar'); @@ -68,11 +62,9 @@ RESET search_path; GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; CREATE UNIQUE INDEX u2rows ON s.x USING btree ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) - TABLESPACE regress_create_idx_tblspace WHERE s.index_row_if(y); ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) - USING INDEX TABLESPACE regress_create_idx_tblspace WHERE (s.index_row_if(y)); -- Shall not find s.coll via search_path, despite the s.const->public.setter -- call having set search_path=s during expression planning. Suppress the @@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree \set VERBOSITY sqlstate ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) - USING INDEX TABLESPACE regress_create_idx_tblspace WHERE (s.index_row_if(y)); \set VERBOSITY default ROLLBACK; -DROP TABLESPACE regress_create_idx_tblspace; diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out index 72304e0..ebe131b 100644 --- a/contrib/file_fdw/expected/file_fdw.out +++ b/contrib/file_fdw/expected/file_fdw.out @@ -4,6 +4,7 @@ -- directory paths are passed to us in environment variables \getenv abs_srcdir PG_ABS_SRCDIR -- Clean up in case a prior regression run failed +SET compute_query_id TO 'off'; SET client_min_messages TO 'warning'; DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; RESET client_min_messages; diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql index f0548e1..848a08c 100644 --- a/contrib/file_fdw/sql/file_fdw.sql +++ b/contrib/file_fdw/sql/file_fdw.sql @@ -6,6 +6,7 @@ \getenv abs_srcdir PG_ABS_SRCDIR -- Clean up in case a prior regression run failed +SET compute_query_id TO 'off'; SET client_min_messages TO 'warning'; DROP ROLE IF 
EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; RESET client_min_messages; diff --git a/contrib/pageinspect/expected/gist.out b/contrib/pageinspect/expected/gist.out index d1adbab..38b52ac 100644 --- a/contrib/pageinspect/expected/gist.out +++ b/contrib/pageinspect/expected/gist.out @@ -10,25 +10,6 @@ BEGIN; CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM generate_series(1,1000) i; CREATE INDEX test_gist_idx ON test_gist USING gist (p); --- Page 0 is the root, the rest are leaf pages -SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0)); - lsn | nsn | rightlink | flags ------+-----+------------+------- - 0/1 | 0/0 | 4294967295 | {} -(1 row) - -SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1)); - lsn | nsn | rightlink | flags ------+-----+------------+-------- - 0/1 | 0/0 | 4294967295 | {leaf} -(1 row) - -SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); - lsn | nsn | rightlink | flags ------+-----+-----------+-------- - 0/1 | 0/0 | 1 | {leaf} -(1 row) - COMMIT; SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); itemoffset | ctid | itemlen | dead | keys diff --git a/contrib/pageinspect/sql/gist.sql b/contrib/pageinspect/sql/gist.sql index d263542..607992f 100644 --- a/contrib/pageinspect/sql/gist.sql +++ b/contrib/pageinspect/sql/gist.sql @@ -12,11 +12,6 @@ CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM generate_series(1,1000) i; CREATE INDEX test_gist_idx ON test_gist USING gist (p); --- Page 0 is the root, the rest are leaf pages -SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0)); -SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1)); -SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); - COMMIT; SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); ================================================ FILE: 
compute/patches/contrib_pg17.patch ================================================ diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out index 979e5e8..2375b45 100644 --- a/contrib/amcheck/expected/check_heap.out +++ b/contrib/amcheck/expected/check_heap.out @@ -80,12 +80,9 @@ INSERT INTO heaptest (a, b) -- same transaction. The heaptest table is smaller than the default -- wal_skip_threshold, so a wal_level=minimal commit reads the table into -- shared_buffers. A transaction delays that and excludes any autovacuum. -SET allow_in_place_tablespaces = true; -CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; SELECT sum(reads) AS stats_bulkreads_before FROM pg_stat_io WHERE context = 'bulkread' \gset BEGIN; -ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; -- Check that valid options are not rejected nor corruption reported -- for a non-empty table SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); @@ -118,14 +115,6 @@ SELECT pg_stat_force_next_flush(); (1 row) -SELECT sum(reads) AS stats_bulkreads_after - FROM pg_stat_io WHERE context = 'bulkread' \gset -SELECT :stats_bulkreads_after > :stats_bulkreads_before; - ?column? ----------- - t -(1 row) - CREATE ROLE regress_heaptest_role; -- verify permissions are checked (error due to function not callable) SET ROLE regress_heaptest_role; @@ -233,7 +222,6 @@ ERROR: cannot check relation "test_foreign_table" DETAIL: This operation is not supported for foreign tables. -- cleanup DROP TABLE heaptest; -DROP TABLESPACE regress_test_stats_tblspc; DROP TABLE test_partition; DROP TABLE test_partitioned; DROP OWNED BY regress_heaptest_role; -- permissions diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql index 1745bae..3b429c3 100644 --- a/contrib/amcheck/sql/check_heap.sql +++ b/contrib/amcheck/sql/check_heap.sql @@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b) -- same transaction. 
The heaptest table is smaller than the default -- wal_skip_threshold, so a wal_level=minimal commit reads the table into -- shared_buffers. A transaction delays that and excludes any autovacuum. -SET allow_in_place_tablespaces = true; -CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; SELECT sum(reads) AS stats_bulkreads_before FROM pg_stat_io WHERE context = 'bulkread' \gset BEGIN; -ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; -- Check that valid options are not rejected nor corruption reported -- for a non-empty table SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); @@ -58,9 +55,6 @@ COMMIT; -- ALTER TABLE ... SET TABLESPACE ... -- causing an additional bulkread, which should be reflected in pg_stat_io. SELECT pg_stat_force_next_flush(); -SELECT sum(reads) AS stats_bulkreads_after - FROM pg_stat_io WHERE context = 'bulkread' \gset -SELECT :stats_bulkreads_after > :stats_bulkreads_before; CREATE ROLE regress_heaptest_role; @@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table', -- cleanup DROP TABLE heaptest; -DROP TABLESPACE regress_test_stats_tblspc; DROP TABLE test_partition; DROP TABLE test_partitioned; DROP OWNED BY regress_heaptest_role; -- permissions diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out index 33be13a..70a406c 100644 --- a/contrib/citext/expected/create_index_acl.out +++ b/contrib/citext/expected/create_index_acl.out @@ -5,9 +5,6 @@ -- owner having as few applicable privileges as possible. (The privileges.sql -- regress_sro_user tests look for the opposite defect; they confirm that -- DefineIndex() uses the table owner userid where necessary.) 
-SET allow_in_place_tablespaces = true; -CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; -RESET allow_in_place_tablespaces; BEGIN; CREATE ROLE regress_minimal; CREATE SCHEMA s; @@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; -- Empty-table DefineIndex() CREATE UNIQUE INDEX u0rows ON s.x USING btree ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) - TABLESPACE regress_create_idx_tblspace WHERE s.index_row_if(y); ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) - USING INDEX TABLESPACE regress_create_idx_tblspace WHERE (s.index_row_if(y)); -- Make the table nonempty. INSERT INTO s.x VALUES ('foo'), ('bar'); @@ -66,11 +61,9 @@ RESET search_path; GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; CREATE UNIQUE INDEX u2rows ON s.x USING btree ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) - TABLESPACE regress_create_idx_tblspace WHERE s.index_row_if(y); ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) - USING INDEX TABLESPACE regress_create_idx_tblspace WHERE (s.index_row_if(y)); -- Shall not find s.coll via search_path, despite the s.const->public.setter -- call having set search_path=s during expression planning. 
Suppress the @@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree \set VERBOSITY sqlstate ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) - USING INDEX TABLESPACE regress_create_idx_tblspace WHERE (s.index_row_if(y)); ERROR: 42704 \set VERBOSITY default ROLLBACK; -DROP TABLESPACE regress_create_idx_tblspace; diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql index 10b5225..ae442e1 100644 --- a/contrib/citext/sql/create_index_acl.sql +++ b/contrib/citext/sql/create_index_acl.sql @@ -6,10 +6,6 @@ -- regress_sro_user tests look for the opposite defect; they confirm that -- DefineIndex() uses the table owner userid where necessary.) -SET allow_in_place_tablespaces = true; -CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; -RESET allow_in_place_tablespaces; - BEGIN; CREATE ROLE regress_minimal; CREATE SCHEMA s; @@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; -- Empty-table DefineIndex() CREATE UNIQUE INDEX u0rows ON s.x USING btree ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) - TABLESPACE regress_create_idx_tblspace WHERE s.index_row_if(y); ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) - USING INDEX TABLESPACE regress_create_idx_tblspace WHERE (s.index_row_if(y)); -- Make the table nonempty. 
INSERT INTO s.x VALUES ('foo'), ('bar'); @@ -68,11 +62,9 @@ RESET search_path; GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; CREATE UNIQUE INDEX u2rows ON s.x USING btree ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) - TABLESPACE regress_create_idx_tblspace WHERE s.index_row_if(y); ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) - USING INDEX TABLESPACE regress_create_idx_tblspace WHERE (s.index_row_if(y)); -- Shall not find s.coll via search_path, despite the s.const->public.setter -- call having set search_path=s during expression planning. Suppress the @@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree \set VERBOSITY sqlstate ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) - USING INDEX TABLESPACE regress_create_idx_tblspace WHERE (s.index_row_if(y)); \set VERBOSITY default ROLLBACK; -DROP TABLESPACE regress_create_idx_tblspace; diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out index 86c148a..81bdb2c 100644 --- a/contrib/file_fdw/expected/file_fdw.out +++ b/contrib/file_fdw/expected/file_fdw.out @@ -4,6 +4,7 @@ -- directory paths are passed to us in environment variables \getenv abs_srcdir PG_ABS_SRCDIR -- Clean up in case a prior regression run failed +SET compute_query_id TO 'off'; SET client_min_messages TO 'warning'; DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; RESET client_min_messages; diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql index f0548e1..848a08c 100644 --- a/contrib/file_fdw/sql/file_fdw.sql +++ b/contrib/file_fdw/sql/file_fdw.sql @@ -6,6 +6,7 @@ \getenv abs_srcdir PG_ABS_SRCDIR -- Clean up in case a prior regression run failed +SET compute_query_id TO 'off'; SET client_min_messages TO 'warning'; DROP ROLE IF 
EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; RESET client_min_messages; ================================================ FILE: compute/patches/duckdb_v113.patch ================================================ diff --git a/libduckdb.map b/libduckdb.map new file mode 100644 index 0000000000..3b56f00cd7 --- /dev/null +++ b/libduckdb.map @@ -0,0 +1,6 @@ +DUCKDB_1.1.3 { + global: + *duckdb*; + local: + *; +}; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3e757a4bcc..88ab4005b9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -135,6 +135,8 @@ else() target_link_libraries(duckdb ${DUCKDB_LINK_LIBS}) link_threads(duckdb) link_extension_libraries(duckdb) + target_link_options(duckdb PRIVATE + -Wl,--version-script=${CMAKE_SOURCE_DIR}/libduckdb.map) add_library(duckdb_static STATIC ${ALL_OBJECT_FILES}) target_link_libraries(duckdb_static ${DUCKDB_LINK_LIBS}) ================================================ FILE: compute/patches/duckdb_v120.patch ================================================ diff --git a/libduckdb_pg_duckdb.map b/libduckdb_pg_duckdb.map new file mode 100644 index 0000000000..0872978b48 --- /dev/null +++ b/libduckdb_pg_duckdb.map @@ -0,0 +1,6 @@ +DUCKDB_1.2.0 { + global: + *duckdb*; + local: + *; +}; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 58adef3fc0..2c522f91be 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -59,7 +59,7 @@ endfunction() if(AMALGAMATION_BUILD) - add_library(duckdb SHARED "${PROJECT_SOURCE_DIR}/src/amalgamation/duckdb.cpp") + add_library(duckdb_pg_duckdb SHARED "${PROJECT_SOURCE_DIR}/src/amalgamation/duckdb.cpp") target_link_libraries(duckdb ${DUCKDB_SYSTEM_LIBS}) link_threads(duckdb) link_extension_libraries(duckdb) @@ -109,7 +109,7 @@ else() duckdb_yyjson duckdb_zstd) - add_library(duckdb SHARED ${ALL_OBJECT_FILES}) + add_library(duckdb_pg_duckdb SHARED ${ALL_OBJECT_FILES}) if(WIN32 AND NOT MINGW) ensure_variable_is_number(DUCKDB_MAJOR_VERSION 
RC_MAJOR_VERSION) @@ -131,9 +131,11 @@ else() target_sources(duckdb PRIVATE version.rc) endif() - target_link_libraries(duckdb ${DUCKDB_LINK_LIBS}) - link_threads(duckdb) - link_extension_libraries(duckdb) + target_link_libraries(duckdb_pg_duckdb ${DUCKDB_LINK_LIBS}) + link_threads(duckdb_pg_duckdb) + link_extension_libraries(duckdb_pg_duckdb) + target_link_options(duckdb_pg_duckdb PRIVATE + -Wl,--version-script=${CMAKE_SOURCE_DIR}/libduckdb_pg_duckdb.map) add_library(duckdb_static STATIC ${ALL_OBJECT_FILES}) target_link_libraries(duckdb_static ${DUCKDB_LINK_LIBS}) @@ -141,7 +143,7 @@ else() link_extension_libraries(duckdb_static) target_include_directories( - duckdb PUBLIC $ + duckdb_pg_duckdb PUBLIC $ $) target_include_directories( @@ -161,7 +163,7 @@ else() endif() install( - TARGETS duckdb duckdb_static + TARGETS duckdb_pg_duckdb duckdb_static EXPORT "${DUCKDB_EXPORT_SET}" LIBRARY DESTINATION "${INSTALL_LIB_DIR}" ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" ================================================ FILE: compute/patches/onnxruntime.patch ================================================ diff --git a/cmake/deps.txt b/cmake/deps.txt index d213b09034..229de2ebf0 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -22,7 +22,9 @@ dlpack;https://github.com/dmlc/dlpack/archive/refs/tags/v0.6.zip;4d565dd2e5b3132 # it contains changes on top of 3.4.0 which are required to fix build issues. # Until the 3.4.1 release this is the best option we have. 
# Issue link: https://gitlab.com/libeigen/eigen/-/issues/2744 -eigen;https://gitlab.com/libeigen/eigen/-/archive/e7248b26a1ed53fa030c5c459f7ea095dfd276ac/eigen-e7248b26a1ed53fa030c5c459f7ea095dfd276ac.zip;be8be39fdbc6e60e94fa7870b280707069b5b81a +# Moved to github mirror to avoid gitlab issues. +# Issue link: https://github.com/bazelbuild/bazel-central-registry/issues/4355 +eigen;https://github.com/eigen-mirror/eigen/archive/e7248b26a1ed53fa030c5c459f7ea095dfd276ac/eigen-e7248b26a1ed53fa030c5c459f7ea095dfd276ac.zip;61418a349000ba7744a3ad03cf5071f22ebf860a flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v23.5.26.zip;59422c3b5e573dd192fead2834d25951f1c1670c fp16;https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip;b985f6985a05a1c03ff1bb71190f66d8f98a1494 fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.zip;a5658f4036402dbca7cebee32be57fb8149811e1 ================================================ FILE: compute/patches/pg_cron.patch ================================================ commit b3ea51ee158f113f2f82d0b97c12c54343c9a695 (HEAD -> master) Author: Alexey Masterov Date: Fri Jun 7 19:23:42 2024 +0000 Disable REGRESS_OPTIONS causing initdb diff --git a/ext-src/pg_cron-src/Makefile b/ext-src/pg_cron-src/Makefile index 053314c..fbd5fb5 100644 --- a/ext-src/pg_cron-src/Makefile +++ b/ext-src/pg_cron-src/Makefile @@ -5,7 +5,7 @@ EXTENSION = pg_cron DATA_built = $(EXTENSION)--1.0.sql DATA = $(wildcard $(EXTENSION)--*--*.sql) -REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check +#REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check REGRESS = pg_cron-test # compilation configuration ================================================ FILE: compute/patches/pg_duckdb_v031.patch ================================================ diff --git a/Makefile b/Makefile index 3235cc8..6b892bc 100644 --- a/Makefile +++ b/Makefile @@
-32,7 +32,7 @@ else DUCKDB_BUILD_TYPE = release endif -DUCKDB_LIB = libduckdb$(DLSUFFIX) +DUCKDB_LIB = libduckdb_pg_duckdb$(DLSUFFIX) FULL_DUCKDB_LIB = third_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src/$(DUCKDB_LIB) ERROR_ON_WARNING ?= @@ -54,7 +54,7 @@ override PG_CXXFLAGS += -std=c++17 ${DUCKDB_BUILD_CXX_FLAGS} ${COMPILER_FLAGS} - # changes to the vendored code in one place. override PG_CFLAGS += -Wno-declaration-after-statement -SHLIB_LINK += -Wl,-rpath,$(PG_LIB)/ -lpq -Lthird_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src -L$(PG_LIB) -lduckdb -lstdc++ -llz4 +SHLIB_LINK += -Wl,-rpath,$(PG_LIB)/ -lpq -Lthird_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src -L$(PG_LIB) -lduckdb_pg_duckdb -lstdc++ -llz4 include Makefile.global diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql index d777d76..3b54396 100644 --- a/sql/pg_duckdb--0.2.0--0.3.0.sql +++ b/sql/pg_duckdb--0.2.0--0.3.0.sql @@ -1056,3 +1056,14 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC; GRANT ALL ON FUNCTION duckdb.cache_info() TO PUBLIC; GRANT ALL ON FUNCTION duckdb.cache_delete(TEXT) TO PUBLIC; GRANT ALL ON PROCEDURE duckdb.recycle_ddb() TO PUBLIC; + +DO $$ +DECLARE + privileged_role_name text; +BEGIN + privileged_role_name := current_setting('neon.privileged_role_name'); + + EXECUTE format('GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO %I', privileged_role_name); + EXECUTE format('GRANT ALL ON TABLE duckdb.extensions TO %I', privileged_role_name); + EXECUTE format('GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO %I', privileged_role_name); +END $$; ================================================ FILE: compute/patches/pg_graphql.patch ================================================ commit ec6a491d126882966a696f9ad5d3698935361d55 Author: Alexey Masterov Date: Tue Dec 17 10:25:00 2024 +0100 Changes required to run tests on Neon diff --git a/test/expected/permissions_functions.out b/test/expected/permissions_functions.out index 1e9fbc2..94cbe25 100644 --- 
a/test/expected/permissions_functions.out +++ b/test/expected/permissions_functions.out @@ -64,7 +64,7 @@ begin; select current_user; current_user -------------- - postgres + cloud_admin (1 row) -- revoke default access from the public role for new functions ================================================ FILE: compute/patches/pg_hint_plan_v16.patch ================================================ diff --git a/expected/ut-A.out b/expected/ut-A.out index da723b8..5328114 100644 --- a/expected/ut-A.out +++ b/expected/ut-A.out @@ -3175,6 +3178,7 @@ SELECT s.query, s.calls FROM public.pg_stat_statements s JOIN pg_catalog.pg_database d ON (s.dbid = d.oid) + WHERE s.query LIKE 'SELECT * FROM s1.t1%' OR s.query LIKE '%pg_stat_statements_reset%' ORDER BY 1; query | calls --------------------------------------+------- diff --git a/sql/ut-A.sql b/sql/ut-A.sql index 7c7d58a..4fd1a07 100644 --- a/sql/ut-A.sql +++ b/sql/ut-A.sql @@ -963,6 +963,7 @@ SELECT s.query, s.calls FROM public.pg_stat_statements s JOIN pg_catalog.pg_database d ON (s.dbid = d.oid) + WHERE s.query LIKE 'SELECT * FROM s1.t1%' OR s.query LIKE '%pg_stat_statements_reset%' ORDER BY 1; ---- ================================================ FILE: compute/patches/pg_hint_plan_v17.patch ================================================ diff --git a/expected/ut-J.out b/expected/ut-J.out index 2fa3c70..314e929 100644 --- a/expected/ut-J.out +++ b/expected/ut-J.out @@ -789,38 +789,6 @@ NestLoop(st1 st2) MergeJoin(t1 t2) not used hint: duplication hint: -error hint: - -LOG: pg_hint_plan: -used hint: -not used hint: -NestLoop(st1 st2) -MergeJoin(t1 t2) -duplication hint: -error hint: - -LOG: pg_hint_plan: -used hint: -not used hint: -NestLoop(st1 st2) -MergeJoin(t1 t2) -duplication hint: -error hint: - -LOG: pg_hint_plan: -used hint: -not used hint: -NestLoop(st1 st2) -MergeJoin(t1 t2) -duplication hint: -error hint: - -LOG: pg_hint_plan: -used hint: -not used hint: -NestLoop(st1 st2) -MergeJoin(t1 t2) -duplication 
hint: error hint: explain_filter diff --git a/expected/ut-S.out b/expected/ut-S.out index 0bfcfb8..e75f581 100644 --- a/expected/ut-S.out +++ b/expected/ut-S.out @@ -4415,34 +4415,6 @@ used hint: IndexScan(ti1 ti1_pred) not used hint: duplication hint: -error hint: - -LOG: pg_hint_plan: -used hint: -not used hint: -IndexScan(ti1 ti1_pred) -duplication hint: -error hint: - -LOG: pg_hint_plan: -used hint: -not used hint: -IndexScan(ti1 ti1_pred) -duplication hint: -error hint: - -LOG: pg_hint_plan: -used hint: -not used hint: -IndexScan(ti1 ti1_pred) -duplication hint: -error hint: - -LOG: pg_hint_plan: -used hint: -not used hint: -IndexScan(ti1 ti1_pred) -duplication hint: error hint: explain_filter diff --git a/expected/ut-W.out b/expected/ut-W.out index a09bd34..0ad227c 100644 --- a/expected/ut-W.out +++ b/expected/ut-W.out @@ -1341,54 +1341,6 @@ IndexScan(ft1) IndexScan(t) Parallel(s1 3 hard) duplication hint: -error hint: - -LOG: pg_hint_plan: -used hint: -not used hint: -IndexScan(*VALUES*) -SeqScan(cte1) -IndexScan(ft1) -IndexScan(t) -Parallel(p1 5 hard) -Parallel(s1 3 hard) -duplication hint: -error hint: - -LOG: pg_hint_plan: -used hint: -not used hint: -IndexScan(*VALUES*) -SeqScan(cte1) -IndexScan(ft1) -IndexScan(t) -Parallel(p1 5 hard) -Parallel(s1 3 hard) -duplication hint: -error hint: - -LOG: pg_hint_plan: -used hint: -not used hint: -IndexScan(*VALUES*) -SeqScan(cte1) -IndexScan(ft1) -IndexScan(t) -Parallel(p1 5 hard) -Parallel(s1 3 hard) -duplication hint: -error hint: - -LOG: pg_hint_plan: -used hint: -not used hint: -IndexScan(*VALUES*) -SeqScan(cte1) -IndexScan(ft1) -IndexScan(t) -Parallel(p1 5 hard) -Parallel(s1 3 hard) -duplication hint: error hint: explain_filter ================================================ FILE: compute/patches/pg_repack.patch ================================================ commit 5eb393810cf7c7bafa4e394dad2e349e2a8cb2cb Author: Alexey Masterov Date: Mon Jul 28 18:11:02 2025 +0200 Patch for pg_repack diff --git 
a/regress/Makefile b/regress/Makefile index bf6edcb..110e734 100644 --- a/regress/Makefile +++ b/regress/Makefile @@ -17,7 +17,7 @@ INTVERSION := $(shell echo $$(($$(echo $(VERSION).0 | sed 's/\([[:digit:]]\{1,\} # Test suite # -REGRESS := init-extension repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper tablespace get_order_by trigger +REGRESS := init-extension noautovacuum repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger autovacuum USE_PGXS = 1 # use pgxs if not in contrib directory PGXS := $(shell $(PG_CONFIG) --pgxs) diff --git a/regress/expected/autovacuum.out b/regress/expected/autovacuum.out new file mode 100644 index 0000000..e7f2363 --- /dev/null +++ b/regress/expected/autovacuum.out @@ -0,0 +1,7 @@ +ALTER SYSTEM SET autovacuum='on'; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + diff --git a/regress/expected/noautovacuum.out b/regress/expected/noautovacuum.out new file mode 100644 index 0000000..fc7978e --- /dev/null +++ b/regress/expected/noautovacuum.out @@ -0,0 +1,7 @@ +ALTER SYSTEM SET autovacuum='off'; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + diff --git a/regress/expected/nosuper.out b/regress/expected/nosuper.out index 8d0a94e..63b68bf 100644 --- a/regress/expected/nosuper.out +++ b/regress/expected/nosuper.out @@ -4,22 +4,22 @@ SET client_min_messages = error; DROP ROLE IF EXISTS nosuper; SET client_min_messages = warning; -CREATE ROLE nosuper WITH LOGIN; +CREATE ROLE nosuper WITH LOGIN PASSWORD 'NoSuPeRpAsSwOrD'; -- => OK \! pg_repack --dbname=contrib_regression --table=tbl_cluster --no-superuser-check INFO: repacking table "public.tbl_cluster" -- => ERROR -\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper +\! 
PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper ERROR: pg_repack failed with error: You must be a superuser to use pg_repack -- => ERROR -\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check +\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ERROR: pg_repack failed with error: ERROR: permission denied for schema repack LINE 1: select repack.version(), repack.version_sql() ^ GRANT ALL ON ALL TABLES IN SCHEMA repack TO nosuper; GRANT USAGE ON SCHEMA repack TO nosuper; -- => ERROR -\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check +\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check INFO: repacking table "public.tbl_cluster" ERROR: query failed: ERROR: current transaction is aborted, commands ignored until end of transaction block DETAIL: query was: RESET lock_timeout diff --git a/regress/sql/autovacuum.sql b/regress/sql/autovacuum.sql new file mode 100644 index 0000000..a8eda63 --- /dev/null +++ b/regress/sql/autovacuum.sql @@ -0,0 +1,2 @@ +ALTER SYSTEM SET autovacuum='on'; +SELECT pg_reload_conf(); diff --git a/regress/sql/noautovacuum.sql b/regress/sql/noautovacuum.sql new file mode 100644 index 0000000..13d4836 --- /dev/null +++ b/regress/sql/noautovacuum.sql @@ -0,0 +1,2 @@ +ALTER SYSTEM SET autovacuum='off'; +SELECT pg_reload_conf(); diff --git a/regress/sql/nosuper.sql b/regress/sql/nosuper.sql index 072f0fa..dbe60f8 100644 --- a/regress/sql/nosuper.sql +++ b/regress/sql/nosuper.sql @@ -4,19 +4,19 @@ SET client_min_messages = error; DROP ROLE IF EXISTS nosuper; SET client_min_messages = warning; -CREATE ROLE nosuper WITH LOGIN; +CREATE ROLE nosuper WITH LOGIN PASSWORD 'NoSuPeRpAsSwOrD'; -- => OK \! 
pg_repack --dbname=contrib_regression --table=tbl_cluster --no-superuser-check -- => ERROR -\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper +\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper -- => ERROR -\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check +\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check GRANT ALL ON ALL TABLES IN SCHEMA repack TO nosuper; GRANT USAGE ON SCHEMA repack TO nosuper; -- => ERROR -\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check +\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check REVOKE ALL ON ALL TABLES IN SCHEMA repack FROM nosuper; REVOKE USAGE ON SCHEMA repack FROM nosuper; ================================================ FILE: compute/patches/pg_stat_statements_pg14-16.patch ================================================ diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql index 58cdf600fce..8be57a996f6 100644 --- a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql +++ b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql @@ -46,3 +46,12 @@ GRANT SELECT ON pg_stat_statements TO PUBLIC; -- Don't want this to be available to non-superusers. 
REVOKE ALL ON FUNCTION pg_stat_statements_reset() FROM PUBLIC; + +DO $$ +DECLARE + privileged_role_name text; +BEGIN + privileged_role_name := current_setting('neon.privileged_role_name'); + + EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO %I', privileged_role_name); +END $$; diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql index 6fc3fed4c93..256345a8f79 100644 --- a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql +++ b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql @@ -20,3 +20,12 @@ LANGUAGE C STRICT PARALLEL SAFE; -- Don't want this to be available to non-superusers. REVOKE ALL ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) FROM PUBLIC; + +DO $$ +DECLARE + privileged_role_name text; +BEGIN + privileged_role_name := current_setting('neon.privileged_role_name'); + + EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO %I', privileged_role_name); +END $$; ================================================ FILE: compute/patches/pg_stat_statements_pg17.patch ================================================ diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql b/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql index 0bb2c397711..32764db1d8b 100644 --- a/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql +++ b/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql @@ -80,3 +80,12 @@ LANGUAGE C STRICT PARALLEL SAFE; -- Don't want this to be available to non-superusers. 
REVOKE ALL ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) FROM PUBLIC; + +DO $$ +DECLARE + privileged_role_name text; +BEGIN + privileged_role_name := current_setting('neon.privileged_role_name'); + + EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO %I', privileged_role_name); +END $$; \ No newline at end of file diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql index 58cdf600fce..8be57a996f6 100644 --- a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql +++ b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql @@ -46,3 +46,12 @@ GRANT SELECT ON pg_stat_statements TO PUBLIC; -- Don't want this to be available to non-superusers. REVOKE ALL ON FUNCTION pg_stat_statements_reset() FROM PUBLIC; + +DO $$ +DECLARE + privileged_role_name text; +BEGIN + privileged_role_name := current_setting('neon.privileged_role_name'); + + EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO %I', privileged_role_name); +END $$; diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql index 6fc3fed4c93..256345a8f79 100644 --- a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql +++ b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql @@ -20,3 +20,12 @@ LANGUAGE C STRICT PARALLEL SAFE; -- Don't want this to be available to non-superusers. 
REVOKE ALL ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) FROM PUBLIC; + +DO $$ +DECLARE + privileged_role_name text; +BEGIN + privileged_role_name := current_setting('neon.privileged_role_name'); + + EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO %I', privileged_role_name); +END $$; ================================================ FILE: compute/patches/pgaudit-parallel_workers-v14.patch ================================================ commit 7220bb3a3f23fa27207d77562dcc286f9a123313 Author: Tristan Partin Date: 2025-06-23 02:09:31 +0000 Disable logging in parallel workers When a query uses parallel workers, pgaudit will log the same query for every parallel worker. This is undesireable since it can result in log amplification for queries that use parallel workers. Signed-off-by: Tristan Partin diff --git a/expected/pgaudit.out b/expected/pgaudit.out index baa8011..a601375 100644 --- a/expected/pgaudit.out +++ b/expected/pgaudit.out @@ -2563,6 +2563,37 @@ COMMIT; NOTICE: AUDIT: SESSION,12,4,MISC,COMMIT,,,COMMIT;, DROP TABLE part_test; NOTICE: AUDIT: SESSION,13,1,DDL,DROP TABLE,,,DROP TABLE part_test;, +-- +-- Test logging in parallel workers +SET pgaudit.log = 'read'; +SET pgaudit.log_client = on; +SET pgaudit.log_level = 'notice'; +-- Force parallel execution for testing +SET max_parallel_workers_per_gather = 2; +SET parallel_tuple_cost = 0; +SET parallel_setup_cost = 0; +SET min_parallel_table_scan_size = 0; +SET min_parallel_index_scan_size = 0; +-- Create table with enough data to trigger parallel execution +CREATE TABLE parallel_test (id int, data text); +INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; +SELECT count(*) FROM parallel_test; +NOTICE: AUDIT: SESSION,14,1,READ,SELECT,,,SELECT count(*) FROM parallel_test;, + count +------- + 1000 +(1 row) + +-- Cleanup parallel test +DROP TABLE parallel_test; +RESET max_parallel_workers_per_gather; +RESET parallel_tuple_cost; +RESET 
parallel_setup_cost; +RESET min_parallel_table_scan_size; +RESET min_parallel_index_scan_size; +RESET pgaudit.log; +RESET pgaudit.log_client; +RESET pgaudit.log_level; -- Cleanup -- Set client_min_messages up to warning to avoid noise SET client_min_messages = 'warning'; diff --git a/pgaudit.c b/pgaudit.c index 5e6fd38..ac9ded2 100644 --- a/pgaudit.c +++ b/pgaudit.c @@ -11,6 +11,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "access/parallel.h" #include "access/sysattr.h" #include "access/xact.h" #include "access/relation.h" @@ -1303,7 +1304,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags) { AuditEventStackItem *stackItem = NULL; - if (!internalStatement) + if (!internalStatement && !IsParallelWorker()) { /* Push the audit even onto the stack */ stackItem = stack_push(); @@ -1384,7 +1385,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, bool abort) /* Log DML if the audit role is valid or session logging is enabled */ if ((auditOid != InvalidOid || auditLogBitmap != 0) && - !IsAbortedTransactionBlockState()) + !IsAbortedTransactionBlockState() && !IsParallelWorker()) { /* If auditLogRows is on, wait for rows processed to be set */ if (auditLogRows && auditEventStack != NULL) @@ -1438,7 +1439,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c else standard_ExecutorRun(queryDesc, direction, count, execute_once); - if (auditLogRows && !internalStatement) + if (auditLogRows && !internalStatement && !IsParallelWorker()) { /* Find an item from the stack by the query memory context */ stackItem = stack_find_context(queryDesc->estate->es_query_cxt); @@ -1458,7 +1459,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc) AuditEventStackItem *stackItem = NULL; AuditEventStackItem *auditEventStackFull = NULL; - if (auditLogRows && !internalStatement) + if (auditLogRows && !internalStatement && !IsParallelWorker()) { /* Find an item from the stack by the query memory context */ stackItem = 
stack_find_context(queryDesc->estate->es_query_cxt); diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql index cc1374a..1870a60 100644 --- a/sql/pgaudit.sql +++ b/sql/pgaudit.sql @@ -1612,6 +1612,36 @@ COMMIT; DROP TABLE part_test; +-- +-- Test logging in parallel workers +SET pgaudit.log = 'read'; +SET pgaudit.log_client = on; +SET pgaudit.log_level = 'notice'; + +-- Force parallel execution for testing +SET max_parallel_workers_per_gather = 2; +SET parallel_tuple_cost = 0; +SET parallel_setup_cost = 0; +SET min_parallel_table_scan_size = 0; +SET min_parallel_index_scan_size = 0; + +-- Create table with enough data to trigger parallel execution +CREATE TABLE parallel_test (id int, data text); +INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; + +SELECT count(*) FROM parallel_test; + +-- Cleanup parallel test +DROP TABLE parallel_test; +RESET max_parallel_workers_per_gather; +RESET parallel_tuple_cost; +RESET parallel_setup_cost; +RESET min_parallel_table_scan_size; +RESET min_parallel_index_scan_size; +RESET pgaudit.log; +RESET pgaudit.log_client; +RESET pgaudit.log_level; + -- Cleanup -- Set client_min_messages up to warning to avoid noise SET client_min_messages = 'warning'; ================================================ FILE: compute/patches/pgaudit-parallel_workers-v15.patch ================================================ commit 29dc2847f6255541992f18faf8a815dfab79631a Author: Tristan Partin Date: 2025-06-23 02:09:31 +0000 Disable logging in parallel workers When a query uses parallel workers, pgaudit will log the same query for every parallel worker. This is undesireable since it can result in log amplification for queries that use parallel workers. 
Signed-off-by: Tristan Partin diff --git a/expected/pgaudit.out b/expected/pgaudit.out index b22560b..73f0327 100644 --- a/expected/pgaudit.out +++ b/expected/pgaudit.out @@ -2563,6 +2563,37 @@ COMMIT; NOTICE: AUDIT: SESSION,12,4,MISC,COMMIT,,,COMMIT;, DROP TABLE part_test; NOTICE: AUDIT: SESSION,13,1,DDL,DROP TABLE,,,DROP TABLE part_test;, +-- +-- Test logging in parallel workers +SET pgaudit.log = 'read'; +SET pgaudit.log_client = on; +SET pgaudit.log_level = 'notice'; +-- Force parallel execution for testing +SET max_parallel_workers_per_gather = 2; +SET parallel_tuple_cost = 0; +SET parallel_setup_cost = 0; +SET min_parallel_table_scan_size = 0; +SET min_parallel_index_scan_size = 0; +-- Create table with enough data to trigger parallel execution +CREATE TABLE parallel_test (id int, data text); +INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; +SELECT count(*) FROM parallel_test; +NOTICE: AUDIT: SESSION,14,1,READ,SELECT,,,SELECT count(*) FROM parallel_test;, + count +------- + 1000 +(1 row) + +-- Cleanup parallel test +DROP TABLE parallel_test; +RESET max_parallel_workers_per_gather; +RESET parallel_tuple_cost; +RESET parallel_setup_cost; +RESET min_parallel_table_scan_size; +RESET min_parallel_index_scan_size; +RESET pgaudit.log; +RESET pgaudit.log_client; +RESET pgaudit.log_level; -- Cleanup -- Set client_min_messages up to warning to avoid noise SET client_min_messages = 'warning'; diff --git a/pgaudit.c b/pgaudit.c index 5e6fd38..ac9ded2 100644 --- a/pgaudit.c +++ b/pgaudit.c @@ -11,6 +11,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "access/parallel.h" #include "access/sysattr.h" #include "access/xact.h" #include "access/relation.h" @@ -1303,7 +1304,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags) { AuditEventStackItem *stackItem = NULL; - if (!internalStatement) + if (!internalStatement && !IsParallelWorker()) { /* Push the audit even onto the stack */ stackItem = stack_push(); @@ -1384,7 
+1385,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, bool abort) /* Log DML if the audit role is valid or session logging is enabled */ if ((auditOid != InvalidOid || auditLogBitmap != 0) && - !IsAbortedTransactionBlockState()) + !IsAbortedTransactionBlockState() && !IsParallelWorker()) { /* If auditLogRows is on, wait for rows processed to be set */ if (auditLogRows && auditEventStack != NULL) @@ -1438,7 +1439,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c else standard_ExecutorRun(queryDesc, direction, count, execute_once); - if (auditLogRows && !internalStatement) + if (auditLogRows && !internalStatement && !IsParallelWorker()) { /* Find an item from the stack by the query memory context */ stackItem = stack_find_context(queryDesc->estate->es_query_cxt); @@ -1458,7 +1459,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc) AuditEventStackItem *stackItem = NULL; AuditEventStackItem *auditEventStackFull = NULL; - if (auditLogRows && !internalStatement) + if (auditLogRows && !internalStatement && !IsParallelWorker()) { /* Find an item from the stack by the query memory context */ stackItem = stack_find_context(queryDesc->estate->es_query_cxt); diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql index 8052426..7f0667b 100644 --- a/sql/pgaudit.sql +++ b/sql/pgaudit.sql @@ -1612,6 +1612,36 @@ COMMIT; DROP TABLE part_test; +-- +-- Test logging in parallel workers +SET pgaudit.log = 'read'; +SET pgaudit.log_client = on; +SET pgaudit.log_level = 'notice'; + +-- Force parallel execution for testing +SET max_parallel_workers_per_gather = 2; +SET parallel_tuple_cost = 0; +SET parallel_setup_cost = 0; +SET min_parallel_table_scan_size = 0; +SET min_parallel_index_scan_size = 0; + +-- Create table with enough data to trigger parallel execution +CREATE TABLE parallel_test (id int, data text); +INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; + +SELECT count(*) FROM parallel_test; + +-- Cleanup parallel test +DROP 
TABLE parallel_test; +RESET max_parallel_workers_per_gather; +RESET parallel_tuple_cost; +RESET parallel_setup_cost; +RESET min_parallel_table_scan_size; +RESET min_parallel_index_scan_size; +RESET pgaudit.log; +RESET pgaudit.log_client; +RESET pgaudit.log_level; + -- Cleanup -- Set client_min_messages up to warning to avoid noise SET client_min_messages = 'warning'; ================================================ FILE: compute/patches/pgaudit-parallel_workers-v16.patch ================================================ commit cc708dde7ef2af2a8120d757102d2e34c0463a0f Author: Tristan Partin Date: 2025-06-23 02:09:31 +0000 Disable logging in parallel workers When a query uses parallel workers, pgaudit will log the same query for every parallel worker. This is undesireable since it can result in log amplification for queries that use parallel workers. Signed-off-by: Tristan Partin diff --git a/expected/pgaudit.out b/expected/pgaudit.out index 8772054..9b66ac6 100644 --- a/expected/pgaudit.out +++ b/expected/pgaudit.out @@ -2556,6 +2556,37 @@ DROP SERVER fdw_server; NOTICE: AUDIT: SESSION,11,1,DDL,DROP SERVER,,,DROP SERVER fdw_server;, DROP EXTENSION postgres_fdw; NOTICE: AUDIT: SESSION,12,1,DDL,DROP EXTENSION,,,DROP EXTENSION postgres_fdw;, +-- +-- Test logging in parallel workers +SET pgaudit.log = 'read'; +SET pgaudit.log_client = on; +SET pgaudit.log_level = 'notice'; +-- Force parallel execution for testing +SET max_parallel_workers_per_gather = 2; +SET parallel_tuple_cost = 0; +SET parallel_setup_cost = 0; +SET min_parallel_table_scan_size = 0; +SET min_parallel_index_scan_size = 0; +-- Create table with enough data to trigger parallel execution +CREATE TABLE parallel_test (id int, data text); +INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; +SELECT count(*) FROM parallel_test; +NOTICE: AUDIT: SESSION,13,1,READ,SELECT,,,SELECT count(*) FROM parallel_test;, + count +------- + 1000 +(1 row) + +-- Cleanup parallel test +DROP TABLE 
parallel_test; +RESET max_parallel_workers_per_gather; +RESET parallel_tuple_cost; +RESET parallel_setup_cost; +RESET min_parallel_table_scan_size; +RESET min_parallel_index_scan_size; +RESET pgaudit.log; +RESET pgaudit.log_client; +RESET pgaudit.log_level; -- Cleanup -- Set client_min_messages up to warning to avoid noise SET client_min_messages = 'warning'; diff --git a/pgaudit.c b/pgaudit.c index 004d1f9..f061164 100644 --- a/pgaudit.c +++ b/pgaudit.c @@ -11,6 +11,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "access/parallel.h" #include "access/sysattr.h" #include "access/xact.h" #include "access/relation.h" @@ -1339,7 +1340,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags) { AuditEventStackItem *stackItem = NULL; - if (!internalStatement) + if (!internalStatement && !IsParallelWorker()) { /* Push the audit even onto the stack */ stackItem = stack_push(); @@ -1420,7 +1421,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, List *permInfos, bool abort) /* Log DML if the audit role is valid or session logging is enabled */ if ((auditOid != InvalidOid || auditLogBitmap != 0) && - !IsAbortedTransactionBlockState()) + !IsAbortedTransactionBlockState() && !IsParallelWorker()) { /* If auditLogRows is on, wait for rows processed to be set */ if (auditLogRows && auditEventStack != NULL) @@ -1475,7 +1476,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c else standard_ExecutorRun(queryDesc, direction, count, execute_once); - if (auditLogRows && !internalStatement) + if (auditLogRows && !internalStatement && !IsParallelWorker()) { /* Find an item from the stack by the query memory context */ stackItem = stack_find_context(queryDesc->estate->es_query_cxt); @@ -1495,7 +1496,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc) AuditEventStackItem *stackItem = NULL; AuditEventStackItem *auditEventStackFull = NULL; - if (auditLogRows && !internalStatement) + if (auditLogRows && !internalStatement && 
!IsParallelWorker()) { /* Find an item from the stack by the query memory context */ stackItem = stack_find_context(queryDesc->estate->es_query_cxt); diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql index 6aae88b..de6d7fd 100644 --- a/sql/pgaudit.sql +++ b/sql/pgaudit.sql @@ -1631,6 +1631,36 @@ DROP USER MAPPING FOR regress_user1 SERVER fdw_server; DROP SERVER fdw_server; DROP EXTENSION postgres_fdw; +-- +-- Test logging in parallel workers +SET pgaudit.log = 'read'; +SET pgaudit.log_client = on; +SET pgaudit.log_level = 'notice'; + +-- Force parallel execution for testing +SET max_parallel_workers_per_gather = 2; +SET parallel_tuple_cost = 0; +SET parallel_setup_cost = 0; +SET min_parallel_table_scan_size = 0; +SET min_parallel_index_scan_size = 0; + +-- Create table with enough data to trigger parallel execution +CREATE TABLE parallel_test (id int, data text); +INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; + +SELECT count(*) FROM parallel_test; + +-- Cleanup parallel test +DROP TABLE parallel_test; +RESET max_parallel_workers_per_gather; +RESET parallel_tuple_cost; +RESET parallel_setup_cost; +RESET min_parallel_table_scan_size; +RESET min_parallel_index_scan_size; +RESET pgaudit.log; +RESET pgaudit.log_client; +RESET pgaudit.log_level; + -- Cleanup -- Set client_min_messages up to warning to avoid noise SET client_min_messages = 'warning'; ================================================ FILE: compute/patches/pgaudit-parallel_workers-v17.patch ================================================ commit 8d02e4c6c5e1e8676251b0717a46054267091cb4 Author: Tristan Partin Date: 2025-06-23 02:09:31 +0000 Disable logging in parallel workers When a query uses parallel workers, pgaudit will log the same query for every parallel worker. This is undesireable since it can result in log amplification for queries that use parallel workers. 
Signed-off-by: Tristan Partin diff --git a/expected/pgaudit.out b/expected/pgaudit.out index d696287..4b1059a 100644 --- a/expected/pgaudit.out +++ b/expected/pgaudit.out @@ -2568,6 +2568,37 @@ DROP SERVER fdw_server; NOTICE: AUDIT: SESSION,11,1,DDL,DROP SERVER,,,DROP SERVER fdw_server, DROP EXTENSION postgres_fdw; NOTICE: AUDIT: SESSION,12,1,DDL,DROP EXTENSION,,,DROP EXTENSION postgres_fdw, +-- +-- Test logging in parallel workers +SET pgaudit.log = 'read'; +SET pgaudit.log_client = on; +SET pgaudit.log_level = 'notice'; +-- Force parallel execution for testing +SET max_parallel_workers_per_gather = 2; +SET parallel_tuple_cost = 0; +SET parallel_setup_cost = 0; +SET min_parallel_table_scan_size = 0; +SET min_parallel_index_scan_size = 0; +-- Create table with enough data to trigger parallel execution +CREATE TABLE parallel_test (id int, data text); +INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; +SELECT count(*) FROM parallel_test; +NOTICE: AUDIT: SESSION,13,1,READ,SELECT,,,SELECT count(*) FROM parallel_test, + count +------- + 1000 +(1 row) + +-- Cleanup parallel test +DROP TABLE parallel_test; +RESET max_parallel_workers_per_gather; +RESET parallel_tuple_cost; +RESET parallel_setup_cost; +RESET min_parallel_table_scan_size; +RESET min_parallel_index_scan_size; +RESET pgaudit.log; +RESET pgaudit.log_client; +RESET pgaudit.log_level; -- Cleanup -- Set client_min_messages up to warning to avoid noise SET client_min_messages = 'warning'; diff --git a/pgaudit.c b/pgaudit.c index 1764af1..0e48875 100644 --- a/pgaudit.c +++ b/pgaudit.c @@ -11,6 +11,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "access/parallel.h" #include "access/sysattr.h" #include "access/xact.h" #include "access/relation.h" @@ -1406,7 +1407,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags) { AuditEventStackItem *stackItem = NULL; - if (!internalStatement) + if (!internalStatement && !IsParallelWorker()) { /* Push the audit event onto 
the stack */ stackItem = stack_push(); @@ -1489,7 +1490,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, List *permInfos, bool abort) /* Log DML if the audit role is valid or session logging is enabled */ if ((auditOid != InvalidOid || auditLogBitmap != 0) && - !IsAbortedTransactionBlockState()) + !IsAbortedTransactionBlockState() && !IsParallelWorker()) { /* If auditLogRows is on, wait for rows processed to be set */ if (auditLogRows && auditEventStack != NULL) @@ -1544,7 +1545,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c else standard_ExecutorRun(queryDesc, direction, count, execute_once); - if (auditLogRows && !internalStatement) + if (auditLogRows && !internalStatement && !IsParallelWorker()) { /* Find an item from the stack by the query memory context */ stackItem = stack_find_context(queryDesc->estate->es_query_cxt); @@ -1564,7 +1565,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc) AuditEventStackItem *stackItem = NULL; AuditEventStackItem *auditEventStackFull = NULL; - if (auditLogRows && !internalStatement) + if (auditLogRows && !internalStatement && !IsParallelWorker()) { /* Find an item from the stack by the query memory context */ stackItem = stack_find_context(queryDesc->estate->es_query_cxt); diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql index e161f01..c873098 100644 --- a/sql/pgaudit.sql +++ b/sql/pgaudit.sql @@ -1637,6 +1637,36 @@ DROP USER MAPPING FOR regress_user1 SERVER fdw_server; DROP SERVER fdw_server; DROP EXTENSION postgres_fdw; +-- +-- Test logging in parallel workers +SET pgaudit.log = 'read'; +SET pgaudit.log_client = on; +SET pgaudit.log_level = 'notice'; + +-- Force parallel execution for testing +SET max_parallel_workers_per_gather = 2; +SET parallel_tuple_cost = 0; +SET parallel_setup_cost = 0; +SET min_parallel_table_scan_size = 0; +SET min_parallel_index_scan_size = 0; + +-- Create table with enough data to trigger parallel execution +CREATE TABLE parallel_test (id int, data text); 
+INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; + +SELECT count(*) FROM parallel_test; + +-- Cleanup parallel test +DROP TABLE parallel_test; +RESET max_parallel_workers_per_gather; +RESET parallel_tuple_cost; +RESET parallel_setup_cost; +RESET min_parallel_table_scan_size; +RESET min_parallel_index_scan_size; +RESET pgaudit.log; +RESET pgaudit.log_client; +RESET pgaudit.log_level; + -- Cleanup -- Set client_min_messages up to warning to avoid noise SET client_min_messages = 'warning'; ================================================ FILE: compute/patches/pgvector.patch ================================================ diff --git a/Makefile b/Makefile index 7a4b88c..56678af 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,10 @@ EXTVERSION = 0.8.0 MODULE_big = vector DATA = $(wildcard sql/*--*--*.sql) -DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql +# This change is needed to install different per-version SQL files +# like pgvector--0.8.0.sql and pgvector--0.7.4.sql +# The corresponding file is downloaded during the Docker image build process +DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql sql/vector--0.7.4.sql OBJS = src/bitutils.o src/bitvec.o src/halfutils.o src/halfvec.o src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/sparsevec.o src/vector.o HEADERS = src/halfvec.h src/sparsevec.h src/vector.h diff --git a/src/hnswbuild.c b/src/hnswbuild.c index b667478..1298aa1 100644 --- a/src/hnswbuild.c +++ b/src/hnswbuild.c @@ -843,9 +843,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false); +#ifdef NEON_SMGR + smgr_start_unlogged_build(RelationGetSmgr(indexRel)); +#endif + /* Perform inserts */ HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false); +#ifdef NEON_SMGR + 
smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel)); +#endif + /* Close relations within worker */ index_close(indexRel, indexLockmode); table_close(heapRel, heapLockmode); @@ -1100,13 +1108,25 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, SeedRandom(42); #endif +#ifdef NEON_SMGR + smgr_start_unlogged_build(RelationGetSmgr(index)); +#endif + InitBuildState(buildstate, heap, index, indexInfo, forkNum); BuildGraph(buildstate, forkNum); +#ifdef NEON_SMGR + smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); +#endif + if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocksInFork(index, forkNum), true); +#ifdef NEON_SMGR + smgr_end_unlogged_build(RelationGetSmgr(index)); +#endif + FreeBuildState(buildstate); } ================================================ FILE: compute/patches/plv8_v3.1.10.patch ================================================ diff --git a/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch new file mode 100644 index 0000000..fae1cb3 --- /dev/null +++ b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch @@ -0,0 +1,30 @@ +From 84cf3230a9680aac3b73c410c2b758760b6d3066 Mon Sep 17 00:00:00 2001 +From: Michael Lippautz +Date: Thu, 27 Jan 2022 14:14:11 +0100 +Subject: [PATCH] cppgc: Fix include + +Add to cover for std::exchange. 
+ +Bug: v8:12585 +Change-Id: Ida65144e93e466be8914527d0e646f348c136bcb +Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3420309 +Auto-Submit: Michael Lippautz +Reviewed-by: Omer Katz +Commit-Queue: Michael Lippautz +Cr-Commit-Position: refs/heads/main@{#78820} +--- + src/heap/cppgc/prefinalizer-handler.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/src/heap/cppgc/prefinalizer-handler.h b/src/heap/cppgc/prefinalizer-handler.h +index bc17c99b1838..c82c91ff5a45 100644 +--- a/src/heap/cppgc/prefinalizer-handler.h ++++ b/src/heap/cppgc/prefinalizer-handler.h +@@ -5,6 +5,7 @@ + #ifndef V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_ + #define V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_ + ++#include + #include + + #include "include/cppgc/prefinalizer.h" diff --git a/plv8.cc b/plv8.cc index c1ce883..6e47e94 100644 --- a/plv8.cc +++ b/plv8.cc @@ -379,7 +379,7 @@ _PG_init(void) NULL, &plv8_v8_flags, NULL, - PGC_USERSET, 0, + PGC_SUSET, 0, #if PG_VERSION_NUM >= 90100 NULL, #endif ================================================ FILE: compute/patches/plv8_v3.2.3.patch ================================================ diff --git a/plv8.cc b/plv8.cc index edfa2aa..623e7f2 100644 --- a/plv8.cc +++ b/plv8.cc @@ -385,7 +385,7 @@ _PG_init(void) NULL, &plv8_v8_flags, NULL, - PGC_USERSET, 0, + PGC_SUSET, 0, #if PG_VERSION_NUM >= 90100 NULL, #endif ================================================ FILE: compute/patches/postgres_fdw.patch ================================================ diff --git a/contrib/postgres_fdw/postgres_fdw--1.0.sql b/contrib/postgres_fdw/postgres_fdw--1.0.sql index a0f0fc1bf45..ee077f2eea6 100644 --- a/contrib/postgres_fdw/postgres_fdw--1.0.sql +++ b/contrib/postgres_fdw/postgres_fdw--1.0.sql @@ -16,3 +16,12 @@ LANGUAGE C STRICT; CREATE FOREIGN DATA WRAPPER postgres_fdw HANDLER postgres_fdw_handler VALIDATOR postgres_fdw_validator; + +DO $$ +DECLARE + privileged_role_name text; +BEGIN + privileged_role_name := 
current_setting('neon.privileged_role_name'); + + EXECUTE format('GRANT USAGE ON FOREIGN DATA WRAPPER postgres_fdw TO %I', privileged_role_name); +END $$; ================================================ FILE: compute/patches/rum.patch ================================================ diff --git a/src/ruminsert.c b/src/ruminsert.c index 255e616..1c6edb7 100644 --- a/src/ruminsert.c +++ b/src/ruminsert.c @@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); +#ifdef NEON_SMGR + smgr_start_unlogged_build(RelationGetSmgr(index)); +#endif + initRumState(&buildstate.rumstate, index); buildstate.rumstate.isBuild = true; buildstate.indtuples = 0; @@ -693,6 +697,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild); +#ifdef NEON_SMGR + smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); +#endif + /* * Write index to xlog */ @@ -713,6 +721,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) UnlockReleaseBuffer(buffer); } +#ifdef NEON_SMGR + smgr_end_unlogged_build(RelationGetSmgr(index)); +#endif + /* * Return statistics */ ================================================ FILE: compute/vm-image-spec-bookworm.yaml ================================================ # Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image. --- commands: - name: cgconfigparser user: root sysvInitAction: sysinit shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664' # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for # running it as root. 
- name: chmod-resize-swap user: root sysvInitAction: sysinit shell: 'chmod 711 /neonvm/bin/resize-swap' - name: chmod-set-disk-quota user: root sysvInitAction: sysinit shell: 'chmod 711 /neonvm/bin/set-disk-quota' - name: pgbouncer user: postgres sysvInitAction: respawn shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0' - name: local_proxy user: postgres sysvInitAction: respawn shell: 'RUST_LOG="error" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' - name: postgres-exporter user: nobody sysvInitAction: respawn # Turn off database collector (`--no-collector.database`), we don't use `pg_database_size_bytes` metric anyway, see # https://github.com/neondatabase/flux-fleet/blob/5e19b3fd897667b70d9a7ad4aa06df0ca22b49ff/apps/base/compute-metrics/scrape-compute-pg-exporter-neon.yaml#L29 # but it's enabled by default and it doesn't filter out invalid databases, see # https://github.com/prometheus-community/postgres_exporter/blob/06a553c8166512c9d9c5ccf257b0f9bba8751dbc/collector/pg_database.go#L67 # so if it hits one, it starts spamming logs # ERROR: [NEON_SMGR] [reqid d9700000018] could not read db size of db 705302 from page server at lsn 5/A2457EB0 shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter pgaudit.log=none" /bin/postgres_exporter --no-collector.database --config.file=/etc/postgres_exporter.yml' - name: pgbouncer-exporter user: postgres sysvInitAction: respawn shell: '/bin/pgbouncer_exporter --pgBouncer.connectionString="postgres:///pgbouncer?host=/tmp&port=6432&dbname=pgbouncer&user=pgbouncer"' - name: sql-exporter user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399' - name: sql-exporter-autoscaling user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml 
-web.listen-address=:9499' # Rsyslog by default creates a unix socket under /dev/log . That's where Postgres sends logs also. # We run syslog with postgres user so it can't create /dev/log. Instead we configure rsyslog to # use a different path for the socket. The symlink actually points to our custom path. - name: rsyslogd-socket-symlink user: root sysvInitAction: sysinit shell: "ln -s /var/db/postgres/rsyslogpipe /dev/log" - name: rsyslogd user: postgres sysvInitAction: respawn shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: - filename: compute_ctl-sudoers content: | # Reverse hostname lookup doesn't currently work, and isn't needed anyway when all # the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to # resolve host" log messages that they generate. Defaults !fqdn # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) # # Also allow it to shut down the VM. The fast_import job does that when it's finished. postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes group neon-postgres { perm { admin { uid = postgres; } task { gid = users; } } memory {} } # Create dummy rsyslog config, because it refuses to start without at least one action configured. # compute_ctl will rewrite this file with the actual configuration, if needed. - filename: compute_rsyslog.conf content: | # SysSock.Name specifies a non-default pipe location that is writeable for the postgres user.
module(load="imuxsock" SysSock.Name="/var/db/postgres/rsyslogpipe") # provides support for local system logging *.* /dev/null $IncludeConfig /etc/rsyslog.d/*.conf build: | # Build cgroup-tools # # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor # requires cgroup v2, so we'll build cgroup-tools ourselves. # # At time of migration to bookworm (2024-10-09), debian has a version of libcgroup/cgroup-tools 2.0.2, # and it _probably_ can be used as-is. However, we'll build it ourselves to minimise the changeset # for debian version migration. ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 FROM debian@$BOOKWORM_SLIM_SHA as libcgroup-builder ENV LIBCGROUP_VERSION=v2.0.3 RUN set -exu \ && apt update \ && apt install --no-install-recommends -y \ git \ ca-certificates \ automake \ cmake \ make \ gcc \ byacc \ flex \ libtool \ libpam0g-dev \ && git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \ && INSTALL_DIR="/libcgroup-install" \ && mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \ && cd libcgroup \ # extracted from bootstrap.sh, with modified flags: && (test -d m4 || mkdir m4) \ && autoreconf -fi \ && rm -rf autom4te.cache \ && CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \ # actually build the thing... && make install merge: | # tweak nofile limits RUN set -e \ && echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \ && test ! -e /etc/security || ( \ echo '* - nofile 1048576' >>/etc/security/limits.conf \ && echo 'root - nofile 1048576' >>/etc/security/limits.conf \ ) # Allow postgres user (compute_ctl) to run swap resizer. # Need to install sudo in order to allow this. # # Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe. 
RUN set -e \ && apt update \ && apt install --no-install-recommends -y \ sudo \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* COPY compute_ctl-sudoers /etc/sudoers.d/compute_ctl-sudoers COPY cgconfig.conf /etc/cgconfig.conf RUN set -e \ && chmod 0644 /etc/cgconfig.conf COPY compute_rsyslog.conf /etc/compute_rsyslog.conf RUN chmod 0666 /etc/compute_rsyslog.conf RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ ================================================ FILE: compute/vm-image-spec-bullseye.yaml ================================================ # Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image. --- commands: - name: cgconfigparser user: root sysvInitAction: sysinit shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664' # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for # running it as root. 
- name: chmod-resize-swap user: root sysvInitAction: sysinit shell: 'chmod 711 /neonvm/bin/resize-swap' - name: chmod-set-disk-quota user: root sysvInitAction: sysinit shell: 'chmod 711 /neonvm/bin/set-disk-quota' - name: pgbouncer user: postgres sysvInitAction: respawn shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0' - name: local_proxy user: postgres sysvInitAction: respawn shell: 'RUST_LOG="error" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' - name: postgres-exporter user: nobody sysvInitAction: respawn # Turn off database collector (`--no-collector.database`), we don't use `pg_database_size_bytes` metric anyway, see # https://github.com/neondatabase/flux-fleet/blob/5e19b3fd897667b70d9a7ad4aa06df0ca22b49ff/apps/base/compute-metrics/scrape-compute-pg-exporter-neon.yaml#L29 # but it's enabled by default and it doesn't filter out invalid databases, see # https://github.com/prometheus-community/postgres_exporter/blob/06a553c8166512c9d9c5ccf257b0f9bba8751dbc/collector/pg_database.go#L67 # so if it hits one, it starts spamming logs # ERROR: [NEON_SMGR] [reqid d9700000018] could not read db size of db 705302 from page server at lsn 5/A2457EB0 shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter pgaudit.log=none" /bin/postgres_exporter --no-collector.database --config.file=/etc/postgres_exporter.yml' - name: pgbouncer-exporter user: postgres sysvInitAction: respawn shell: '/bin/pgbouncer_exporter --pgBouncer.connectionString="postgres:///pgbouncer?host=/tmp&port=6432&dbname=pgbouncer&user=pgbouncer"' - name: sql-exporter user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399' - name: sql-exporter-autoscaling user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml 
-web.listen-address=:9499' # Rsyslog by default creates a unix socket under /dev/log . That's where Postgres sends logs also. # We run syslog with postgres user so it can't create /dev/log. Instead we configure rsyslog to # use a different path for the socket. The symlink actually points to our custom path. - name: rsyslogd-socket-symlink user: root sysvInitAction: sysinit shell: "ln -s /var/db/postgres/rsyslogpipe /dev/log" - name: rsyslogd user: postgres sysvInitAction: respawn shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: - filename: compute_ctl-sudoers content: | # Reverse hostname lookup doesn't currently work, and isn't needed anyway when all # the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to # resolve host" log messages that they generate. Defaults !fqdn # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) # # Also allow it to shut down the VM. The fast_import job does that when it's finished. postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes group neon-postgres { perm { admin { uid = postgres; } task { gid = users; } } memory {} } # Create dummy rsyslog config, because it refuses to start without at least one action configured. # compute_ctl will rewrite this file with the actual configuration, if needed. - filename: compute_rsyslog.conf content: | # SysSock.Name specifies a non-default pipe location that is writeable for the postgres user.
module(load="imuxsock" SysSock.Name="/var/db/postgres/rsyslogpipe") # provides support for local system logging *.* /dev/null $IncludeConfig /etc/rsyslog.d/*.conf build: | # Build cgroup-tools # # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor # requires cgroup v2, so we'll build cgroup-tools ourselves. ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 FROM debian@$BULLSEYE_SLIM_SHA as libcgroup-builder ENV LIBCGROUP_VERSION=v2.0.3 RUN set -exu \ && apt update \ && apt install --no-install-recommends -y \ git \ ca-certificates \ automake \ cmake \ make \ gcc \ byacc \ flex \ libtool \ libpam0g-dev \ && git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \ && INSTALL_DIR="/libcgroup-install" \ && mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \ && cd libcgroup \ # extracted from bootstrap.sh, with modified flags: && (test -d m4 || mkdir m4) \ && autoreconf -fi \ && rm -rf autom4te.cache \ && CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \ # actually build the thing... && make install merge: | # tweak nofile limits RUN set -e \ && echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \ && test ! -e /etc/security || ( \ echo '* - nofile 1048576' >>/etc/security/limits.conf \ && echo 'root - nofile 1048576' >>/etc/security/limits.conf \ ) # Allow postgres user (compute_ctl) to run swap resizer. # Need to install sudo in order to allow this. # # Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe. 
RUN set -e \ && apt update \ && apt install --no-install-recommends -y \ sudo \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* COPY compute_ctl-sudoers /etc/sudoers.d/compute_ctl-sudoers COPY cgconfig.conf /etc/cgconfig.conf RUN set -e \ && chmod 0644 /etc/cgconfig.conf COPY compute_rsyslog.conf /etc/compute_rsyslog.conf RUN chmod 0666 /etc/compute_rsyslog.conf RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ ================================================ FILE: compute_tools/.dockerignore ================================================ target ================================================ FILE: compute_tools/.gitignore ================================================ target ================================================ FILE: compute_tools/Cargo.toml ================================================ [package] name = "compute_tools" version = "0.1.0" edition = "2024" license.workspace = true [features] default = [] # Enables test specific features. 
testing = ["fail/failpoints"] [dependencies] async-compression.workspace = true base64.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true aws-sdk-kms.workspace = true aws-smithy-types.workspace = true anyhow.workspace = true axum = { workspace = true, features = [] } axum-extra.workspace = true camino.workspace = true chrono.workspace = true cfg-if.workspace = true clap.workspace = true fail.workspace = true flate2.workspace = true futures.workspace = true http.workspace = true http-body-util.workspace = true hostname-validator = "1.1" hyper.workspace = true hyper-util.workspace = true indexmap.workspace = true itertools.workspace = true jsonwebtoken.workspace = true metrics.workspace = true nix.workspace = true notify.workspace = true num_cpus.workspace = true once_cell.workspace = true opentelemetry.workspace = true opentelemetry_sdk.workspace = true p256 = { version = "0.13", features = ["pem"] } pageserver_page_api.workspace = true postgres.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["json"] } ring = "0.17" scopeguard.workspace = true serde.workspace = true serde_with.workspace = true serde_json.workspace = true signal-hook.workspace = true tar.workspace = true tower.workspace = true tower-http.workspace = true tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tokio-util.workspace = true tokio-stream.workspace = true tonic.workspace = true tower-otel.workspace = true tracing.workspace = true tracing-appender.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true url.workspace = true uuid.workspace = true walkdir.workspace = true x509-cert.workspace = true postgres-types.workspace = true postgres_versioninfo.workspace = true postgres_initdb.workspace = true compute_api.workspace = true utils.workspace = true workspace_hack.workspace = true remote_storage = { 
version = "0.1", path = "../libs/remote_storage/" } vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" } zstd = "0.13" bytes = "1.0" rust-ini = "0.20.0" rlimit = "0.10.1" ================================================ FILE: compute_tools/README.md ================================================ # Compute node tools Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd` `ExecStart` option. It will handle all the `Neon` specifics during compute node initialization: - `compute_ctl` accepts cluster (compute node) specification as a JSON file. - Every start is a fresh start, so the data directory is removed and initialized again on each run. - Next it will put configuration files into the `PGDATA` directory. - Sync safekeepers and get commit LSN. - Get `basebackup` from pageserver using the LSN returned by the previous step. - Try to start `postgres` and wait until it is ready to accept connections. - Check and alter/drop/create roles and databases. - Hang waiting on the `postmaster` process to exit. Also `compute_ctl` spawns two separate service threads: - `compute-monitor` checks the last Postgres activity timestamp and saves it into the shared `ComputeNode`; - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the last activity requests. If the `AUTOSCALING` environment variable is set, `compute_ctl` will start the `vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes, `vm-monitor` communicates with the VM autoscaling system. It coordinates downscaling and requests immediate upscaling under resource pressure. Usage example: ```sh compute_ctl -D /var/db/postgres/compute \ -C 'postgresql://cloud_admin@localhost/postgres' \ -S /var/db/postgres/specs/current.json \ -b /usr/local/bin/postgres ``` ## State Diagram Computes can be in various states. Below is a diagram that details how a compute moves between states.
```mermaid %% https://mermaid.js.org/syntax/stateDiagram.html stateDiagram-v2 [*] --> Empty : Compute spawned Empty --> ConfigurationPending : Waiting for compute spec ConfigurationPending --> Configuration : Received compute spec Configuration --> Failed : Failed to configure the compute Configuration --> Running : Compute has been configured Empty --> Init : Compute spec is immediately available Empty --> TerminationPendingFast : Requested termination Empty --> TerminationPendingImmediate : Requested termination Init --> Failed : Failed to start Postgres Init --> Running : Started Postgres Running --> TerminationPendingFast : Requested termination Running --> TerminationPendingImmediate : Requested termination Running --> ConfigurationPending : Received a /configure request with spec Running --> RefreshConfigurationPending : Received a /refresh_configuration request, compute node will pull a new spec and reconfigure RefreshConfigurationPending --> RefreshConfiguration : Received compute spec and started configuration RefreshConfiguration --> Running : Compute has been re-configured RefreshConfiguration --> RefreshConfigurationPending : Configuration failed and is to be retried TerminationPendingFast --> Terminated : Terminated compute with 30s delay for cplane to inspect status TerminationPendingImmediate --> Terminated : Terminated compute immediately Failed --> RefreshConfigurationPending : Received a /refresh_configuration request Failed --> [*] : Compute exited Terminated --> [*] : Compute exited ``` ## Tests Cargo formatter: ```sh cargo fmt ``` Run tests: ```sh cargo test ``` Clippy linter: ```sh cargo clippy --all --all-targets -- -Dwarnings -Drust-2018-idioms ``` ## Cross-platform compilation Imagine that you are on macOS (x86) and you want a Linux GNU (`x86_64-unknown-linux-gnu` platform in `rust` terminology) executable.
### Using docker You can use a throw-away Docker container ([rustlang/rust](https://hub.docker.com/r/rustlang/rust/) image) for doing that: ```sh docker run --rm \ -v $(pwd):/compute_tools \ -w /compute_tools \ -t rustlang/rust:nightly cargo build --release --target=x86_64-unknown-linux-gnu ``` or as a one-liner: ```sh docker run --rm -v $(pwd):/compute_tools -w /compute_tools -t rust:latest cargo build --release --target=x86_64-unknown-linux-gnu ``` ### Using rust native cross-compilation Another way is to add the `x86_64-unknown-linux-gnu` target on your host system: ```sh rustup target add x86_64-unknown-linux-gnu ``` Install the macOS cross-compiler toolchain: ```sh brew tap SergioBenitez/osxct brew install x86_64-unknown-linux-gnu ``` And finally run `cargo build`: ```sh CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_LINKER=x86_64-unknown-linux-gnu-gcc cargo build --target=x86_64-unknown-linux-gnu --release ``` ================================================ FILE: compute_tools/rustfmt.toml ================================================ max_width = 100 ================================================ FILE: compute_tools/src/bin/compute_ctl.rs ================================================ //! //! Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd` //! `ExecStart` option. It will handle all the `Neon` specifics during compute node //! initialization: //! - `compute_ctl` accepts cluster (compute node) specification as a JSON file. //! - Every start is a fresh start, so the data directory is removed and //! initialized again on each run. //! - If remote_extension_config is provided, it will be used to fetch the extensions list //! and download `shared_preload_libraries` from the remote storage. //! - Next it will put configuration files into the `PGDATA` directory. //! - Sync safekeepers and get commit LSN. //! - Get `basebackup` from pageserver using the LSN returned by the previous step. //!
- Try to start `postgres` and wait until it is ready to accept connections. //! - Check and alter/drop/create roles and databases. //! - Hang waiting on the `postmaster` process to exit. //! //! Also `compute_ctl` spawns two separate service threads: //! - `compute-monitor` checks the last Postgres activity timestamp and saves it //! into the shared `ComputeNode`; //! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the //! last activity requests. //! //! If `AUTOSCALING` environment variable is set, `compute_ctl` will start the //! `vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes, //! `vm-monitor` communicates with the VM autoscaling system. It coordinates //! downscaling and requests immediate upscaling under resource pressure. //! //! Usage example: //! ```sh //! compute_ctl -D /var/db/postgres/compute \ //! -C 'postgresql://cloud_admin@localhost/postgres' \ //! -c /var/db/postgres/configs/config.json \ //! -b /usr/local/bin/postgres \ //! -r http://pg-ext-s3-gateway \ //! 
``` use std::ffi::OsString; use std::fs::File; use std::process::exit; use std::sync::Arc; use std::sync::atomic::AtomicU64; use std::sync::mpsc; use std::thread; use std::time::Duration; use anyhow::{Context, Result, bail}; use clap::Parser; use compute_api::responses::ComputeConfig; use compute_tools::compute::{ BUILD_TAG, ComputeNode, ComputeNodeParams, forward_termination_signal, }; use compute_tools::extension_server::get_pg_version_string; use compute_tools::params::*; use compute_tools::pg_isready::get_pg_isready_bin; use compute_tools::spec::*; use compute_tools::{hadron_metrics, installed_extensions, logger::*}; use rlimit::{Resource, setrlimit}; use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM}; use signal_hook::iterator::Signals; use tracing::{error, info}; use url::Url; use utils::failpoint_support; #[derive(Debug, Parser)] #[command(rename_all = "kebab-case")] struct Cli { #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")] pub pgbin: String, /// The base URL for the remote extension storage proxy gateway. #[arg(short = 'r', long, value_parser = Self::parse_remote_ext_base_url)] pub remote_ext_base_url: Option, /// The port to bind the external listening HTTP server to. Clients running /// outside the compute will talk to the compute through this port. Keep /// the previous name for this argument around for a smoother release /// with the control plane. #[arg(long, default_value_t = 3080)] pub external_http_port: u16, /// The port to bind the internal listening HTTP server to. Clients include /// the neon extension (for installing remote extensions) and local_proxy. #[arg(long, default_value_t = 3081)] pub internal_http_port: u16, /// Backwards-compatible --http-port for Hadron deployments. Functionally the /// same as --external-http-port. 
#[arg( long, conflicts_with = "external_http_port", conflicts_with = "internal_http_port" )] pub http_port: Option, #[arg(short = 'D', long, value_name = "DATADIR")] pub pgdata: String, #[arg(short = 'C', long, value_name = "DATABASE_URL")] pub connstr: String, #[arg( long, default_value = "neon_superuser", value_name = "PRIVILEGED_ROLE_NAME", value_parser = Self::parse_privileged_role_name )] pub privileged_role_name: String, #[cfg(target_os = "linux")] #[arg(long, default_value = "neon-postgres")] pub cgroup: String, #[cfg(target_os = "linux")] #[arg( long, default_value = "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor" )] pub filecache_connstr: String, #[cfg(target_os = "linux")] #[arg(long, default_value = "0.0.0.0:10301")] pub vm_monitor_addr: String, #[arg(long, action = clap::ArgAction::SetTrue)] pub resize_swap_on_bind: bool, #[arg(long)] pub set_disk_quota_for_fs: Option, #[arg(short = 'c', long)] pub config: Option, #[arg(short = 'i', long, group = "compute-id")] pub compute_id: String, #[arg( short = 'p', long, conflicts_with = "config", value_name = "CONTROL_PLANE_API_BASE_URL", requires = "compute-id" )] pub control_plane_uri: Option, /// Interval in seconds for collecting installed extensions statistics #[arg(long, default_value = "3600")] pub installed_extensions_collection_interval: u64, /// Run in development mode, skipping VM-specific operations like process termination #[arg(long, action = clap::ArgAction::SetTrue)] pub dev: bool, #[arg(long)] pub pg_init_timeout: Option, #[arg(long, default_value_t = false, action = clap::ArgAction::Set)] pub lakebase_mode: bool, } impl Cli { /// Parse a URL from an argument. By default, this isn't necessary, but we /// want to do some sanity checking. fn parse_remote_ext_base_url(value: &str) -> Result { // Remove extra trailing slashes, and add one. We use Url::join() later // when downloading remote extensions. 
If the base URL is something like // http://example.com/pg-ext-s3-gateway, and join() is called with // something like "xyz", the resulting URL is http://example.com/xyz. let value = value.trim_end_matches('/').to_owned() + "/"; let url = Url::parse(&value)?; if url.query_pairs().count() != 0 { bail!("parameters detected in remote extensions base URL") } Ok(url) } /// For simplicity, we do not escape `privileged_role_name` anywhere in the code. /// Since it's a system role, which we fully control, that's fine. Still, let's /// validate it to avoid any surprises. fn parse_privileged_role_name(value: &str) -> Result { use regex::Regex; let pattern = Regex::new(r"^[a-z_]+$").unwrap(); if !pattern.is_match(value) { bail!("--privileged-role-name can only contain lowercase letters and underscores") } Ok(value.to_string()) } } // Hadron helpers to get compatible compute_ctl http ports from Cli. The old `--http-port` // arg is used and acts the same as `--external-http-port`. The internal http port is defined // to be http_port + 1. Hadron runs in the dblet environment which uses the host network, so // we need to be careful with the ports to choose. fn get_external_http_port(cli: &Cli) -> u16 { if cli.lakebase_mode { return cli.http_port.unwrap_or(cli.external_http_port); } cli.external_http_port } fn get_internal_http_port(cli: &Cli) -> u16 { if cli.lakebase_mode { return cli .http_port .map(|p| p + 1) .unwrap_or(cli.internal_http_port); } cli.internal_http_port } fn main() -> Result<()> { let cli = Cli::parse(); let scenario = failpoint_support::init(); // For historical reasons, the main thread that processes the config and launches postgres // is synchronous, but we always have this tokio runtime available and we "enter" it so // that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...) // from all parts of compute_ctl. 
let runtime = tokio::runtime::Builder::new_multi_thread() .enable_all() .build()?; let _rt_guard = runtime.enter(); let mut log_dir = None; if cli.lakebase_mode { log_dir = std::env::var("COMPUTE_CTL_LOG_DIRECTORY").ok(); } let (tracing_provider, _file_logs_guard) = init(cli.dev, log_dir)?; // enable core dumping for all child processes setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; if cli.lakebase_mode { installed_extensions::initialize_metrics(); hadron_metrics::initialize_metrics(); } let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; let config = get_config(&cli)?; let external_http_port = get_external_http_port(&cli); let internal_http_port = get_internal_http_port(&cli); let compute_node = ComputeNode::new( ComputeNodeParams { compute_id: cli.compute_id, connstr, privileged_role_name: cli.privileged_role_name.clone(), pgdata: cli.pgdata.clone(), pgbin: cli.pgbin.clone(), pgversion: get_pg_version_string(&cli.pgbin), external_http_port, internal_http_port, remote_ext_base_url: cli.remote_ext_base_url.clone(), resize_swap_on_bind: cli.resize_swap_on_bind, set_disk_quota_for_fs: cli.set_disk_quota_for_fs, #[cfg(target_os = "linux")] filecache_connstr: cli.filecache_connstr, #[cfg(target_os = "linux")] cgroup: cli.cgroup, #[cfg(target_os = "linux")] vm_monitor_addr: cli.vm_monitor_addr, installed_extensions_collection_interval: Arc::new(AtomicU64::new( cli.installed_extensions_collection_interval, )), pg_init_timeout: cli.pg_init_timeout.map(Duration::from_secs), pg_isready_bin: get_pg_isready_bin(&cli.pgbin), instance_id: std::env::var("INSTANCE_ID").ok(), lakebase_mode: cli.lakebase_mode, build_tag: BUILD_TAG.to_string(), control_plane_uri: cli.control_plane_uri, config_path_test_only: cli.config, }, config, )?; let exit_code = compute_node.run().context("running compute node")?; scenario.teardown(); deinit_and_exit(tracing_provider, exit_code); } fn init( dev_mode: bool, log_dir: Option, ) -> Result<( Option, 
Option, )> { let (provider, file_logs_guard) = init_tracing_and_logging(DEFAULT_LOG_LEVEL, &log_dir)?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; thread::spawn(move || { for sig in signals.forever() { handle_exit_signal(sig, dev_mode); } }); info!("compute build_tag: {}", &BUILD_TAG.to_string()); Ok((provider, file_logs_guard)) } fn get_config(cli: &Cli) -> Result { // First, read the config from the path if provided if let Some(ref config) = cli.config { let file = File::open(config)?; return Ok(serde_json::from_reader(&file)?); } // If the config wasn't provided in the CLI arguments, then retrieve it from // the control plane match get_config_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) { Ok(config) => Ok(config), Err(e) => { error!( "cannot get response from control plane: {}\n\ neither spec nor confirmation that compute is in the Empty state was received", e ); Err(e) } } } fn deinit_and_exit(tracing_provider: Option, exit_code: Option) -> ! { if let Some(p) = tracing_provider { // Shutdown trace pipeline gracefully, so that it has a chance to send any // pending traces before we exit. Shutting down OTEL tracing provider may // hang for quite some time, see, for example: // - https://github.com/open-telemetry/opentelemetry-rust/issues/868 // - and our problems with staging https://github.com/neondatabase/cloud/issues/3707#issuecomment-1493983636 // // Yet, we want computes to shut down fast enough, as we may need a new one // for the same timeline ASAP. So wait no longer than 2s for the shutdown to // complete, then just error out and exit the main thread. 
info!("shutting down tracing"); let (sender, receiver) = mpsc::channel(); let _ = thread::spawn(move || { _ = p.shutdown(); sender.send(()).ok() }); let shutdown_res = receiver.recv_timeout(Duration::from_millis(2000)); if shutdown_res.is_err() { error!("timed out while shutting down tracing, exiting anyway"); } } info!("shutting down"); exit(exit_code.unwrap_or(1)) } /// When compute_ctl is killed, send also termination signal to sync-safekeepers /// to prevent leakage. TODO: it is better to convert compute_ctl to async and /// wait for termination which would be easy then. fn handle_exit_signal(sig: i32, dev_mode: bool) { info!("received {sig} termination signal"); forward_termination_signal(dev_mode); exit(1); } #[cfg(test)] mod test { use clap::{CommandFactory, Parser}; use url::Url; use super::Cli; #[test] fn verify_cli() { Cli::command().debug_assert() } #[test] fn verify_remote_ext_base_url() { let cli = Cli::parse_from([ "compute_ctl", "--pgdata=test", "--connstr=test", "--compute-id=test", "--remote-ext-base-url", "https://example.com/subpath", ]); assert_eq!( cli.remote_ext_base_url.unwrap(), Url::parse("https://example.com/subpath/").unwrap() ); let cli = Cli::parse_from([ "compute_ctl", "--pgdata=test", "--connstr=test", "--compute-id=test", "--remote-ext-base-url", "https://example.com//", ]); assert_eq!( cli.remote_ext_base_url.unwrap(), Url::parse("https://example.com").unwrap() ); Cli::try_parse_from([ "compute_ctl", "--pgdata=test", "--connstr=test", "--compute-id=test", "--remote-ext-base-url", "https://example.com?hello=world", ]) .expect_err("URL parameters are not allowed"); } #[test] fn verify_privileged_role_name() { // Valid name let cli = Cli::parse_from([ "compute_ctl", "--pgdata=test", "--connstr=test", "--compute-id=test", "--privileged-role-name", "my_superuser", ]); assert_eq!(cli.privileged_role_name, "my_superuser"); // Invalid names Cli::try_parse_from([ "compute_ctl", "--pgdata=test", "--connstr=test", "--compute-id=test", 
"--privileged-role-name", "NeonSuperuser", ]) .expect_err("uppercase letters are not allowed"); Cli::try_parse_from([ "compute_ctl", "--pgdata=test", "--connstr=test", "--compute-id=test", "--privileged-role-name", "$'neon_superuser", ]) .expect_err("special characters are not allowed"); Cli::try_parse_from([ "compute_ctl", "--pgdata=test", "--connstr=test", "--compute-id=test", "--privileged-role-name", "", ]) .expect_err("empty name is not allowed"); } } ================================================ FILE: compute_tools/src/bin/fast_import/aws_s3_sync.rs ================================================ use camino::{Utf8Path, Utf8PathBuf}; use tokio::task::JoinSet; use tracing::{info, warn}; use walkdir::WalkDir; use super::s3_uri::S3Uri; const MAX_PARALLEL_UPLOADS: usize = 10; /// Upload all files from 'local' to 'remote' pub(crate) async fn upload_dir_recursive( s3_client: &aws_sdk_s3::Client, local: &Utf8Path, remote: &S3Uri, ) -> anyhow::Result<()> { // Recursively scan directory let mut dirwalker = WalkDir::new(local) .into_iter() .map(|entry| { let entry = entry?; let file_type = entry.file_type(); let path = <&Utf8Path>::try_from(entry.path())?.to_path_buf(); Ok((file_type, path)) }) .filter_map(|e: anyhow::Result<(std::fs::FileType, Utf8PathBuf)>| { match e { Ok((file_type, path)) if file_type.is_file() => Some(Ok(path)), Ok((file_type, _path)) if file_type.is_dir() => { // The WalkDir iterator will recurse into directories, but we don't want // to do anything with directories as such. There's no concept of uploading // an empty directory to S3. None } Ok((file_type, path)) if file_type.is_symlink() => { // huh, didn't expect a symlink. Can't upload that to S3. Warn and skip. warn!("cannot upload symlink ({})", path); None } Ok((_file_type, path)) => { // should not happen warn!("directory entry has unexpected type ({})", path); None } Err(e) => Some(Err(e)), } }); // Spawn upload tasks for each file, keeping MAX_PARALLEL_UPLOADS active in // parallel. 
let mut joinset = JoinSet::new(); loop { // Could we upload more? while joinset.len() < MAX_PARALLEL_UPLOADS { if let Some(full_local_path) = dirwalker.next() { let full_local_path = full_local_path?; let relative_local_path = full_local_path .strip_prefix(local) .expect("all paths start from the walkdir root"); let remote_path = remote.append(relative_local_path.as_str()); info!( "starting upload of {} to {}", &full_local_path, &remote_path ); let upload_task = upload_file(s3_client.clone(), full_local_path, remote_path); joinset.spawn(upload_task); } else { info!("draining upload tasks"); break; } } // Wait for an upload to complete if let Some(res) = joinset.join_next().await { let _ = res?; } else { // all done! break; } } Ok(()) } pub(crate) async fn upload_file( s3_client: aws_sdk_s3::Client, local_path: Utf8PathBuf, remote: S3Uri, ) -> anyhow::Result<()> { use aws_smithy_types::byte_stream::ByteStream; let stream = ByteStream::from_path(&local_path).await?; let _result = s3_client .put_object() .bucket(remote.bucket) .key(&remote.key) .body(stream) .send() .await?; info!("upload of {} to {} finished", &local_path, &remote.key); Ok(()) } ================================================ FILE: compute_tools/src/bin/fast_import/child_stdio_to_log.rs ================================================ use tokio::io::{AsyncBufReadExt, BufReader}; use tokio::process::{ChildStderr, ChildStdout}; use tracing::info; /// Asynchronously relays the output from a child process's `stdout` and `stderr` to the tracing log. /// Each line is read and logged individually, with lossy UTF-8 conversion. /// /// # Arguments /// /// * `stdout`: An `Option` from the child process. /// * `stderr`: An `Option` from the child process. 
/// pub(crate) async fn relay_process_output(stdout: Option, stderr: Option) { let stdout_fut = async { if let Some(stdout) = stdout { let reader = BufReader::new(stdout); let mut lines = reader.lines(); while let Ok(Some(line)) = lines.next_line().await { info!(fd = "stdout", "{}", line); } } }; let stderr_fut = async { if let Some(stderr) = stderr { let reader = BufReader::new(stderr); let mut lines = reader.lines(); while let Ok(Some(line)) = lines.next_line().await { info!(fd = "stderr", "{}", line); } } }; tokio::join!(stdout_fut, stderr_fut); } ================================================ FILE: compute_tools/src/bin/fast_import/s3_uri.rs ================================================ use std::str::FromStr; use anyhow::Result; /// Struct to hold parsed S3 components #[derive(Debug, Clone, PartialEq, Eq)] pub struct S3Uri { pub bucket: String, pub key: String, } impl FromStr for S3Uri { type Err = anyhow::Error; /// Parse an S3 URI into a bucket and key fn from_str(uri: &str) -> Result { // Ensure the URI starts with "s3://" if !uri.starts_with("s3://") { return Err(anyhow::anyhow!("Invalid S3 URI scheme")); } // Remove the "s3://" prefix let stripped_uri = &uri[5..]; // Split the remaining string into bucket and key parts if let Some((bucket, key)) = stripped_uri.split_once('/') { Ok(S3Uri { bucket: bucket.to_string(), key: key.to_string(), }) } else { Err(anyhow::anyhow!( "Invalid S3 URI format, missing bucket or key" )) } } } impl S3Uri { pub fn append(&self, suffix: &str) -> Self { Self { bucket: self.bucket.clone(), key: format!("{}{}", self.key, suffix), } } } impl std::fmt::Display for S3Uri { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "s3://{}/{}", self.bucket, self.key) } } impl clap::builder::TypedValueParser for S3Uri { type Value = Self; fn parse_ref( &self, _cmd: &clap::Command, _arg: Option<&clap::Arg>, value: &std::ffi::OsStr, ) -> Result { let value_str = value.to_str().ok_or_else(|| { clap::Error::raw( 
clap::error::ErrorKind::InvalidUtf8, "Invalid UTF-8 sequence", ) })?; S3Uri::from_str(value_str).map_err(|e| { clap::Error::raw( clap::error::ErrorKind::InvalidValue, format!("Failed to parse S3 URI: {e}"), ) }) } } ================================================ FILE: compute_tools/src/bin/fast_import.rs ================================================ //! This program dumps a remote Postgres database into a local Postgres database //! and uploads the resulting PGDATA into object storage for import into a Timeline. //! //! # Context, Architecture, Design //! //! See cloud.git Fast Imports RFC () //! for the full picture. //! The RFC describing the storage pieces of importing the PGDATA dump into a Timeline //! is publicly accessible at . //! //! # This is a Prototype! //! //! This program is part of a prototype feature and not yet used in production. //! //! The cloud.git RFC contains lots of suggestions for improving e2e throughput //! of this step of the timeline import process. //! //! # Local Testing //! //! - Comment out most of the pgxns in compute-node.Dockerfile to speed up the build. //! - Build the image with the following command: //! //! ```bash //! docker buildx build --platform linux/amd64 --build-arg DEBIAN_VERSION=bullseye --build-arg GIT_VERSION=local --build-arg PG_VERSION=v14 --build-arg BUILD_TAG="$(date --iso-8601=s -u)" -t localhost:3030/localregistry/compute-node-v14:latest -f compute/compute-node.Dockerfile . //! docker push localhost:3030/localregistry/compute-node-v14:latest //! 
``` use anyhow::{Context, bail}; use aws_config::BehaviorVersion; use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; use compute_tools::extension_server::get_pg_version; use nix::unistd::Pid; use std::ops::Not; use tracing::{Instrument, error, info, info_span, warn}; use utils::fs_ext::is_directory_empty; #[path = "fast_import/aws_s3_sync.rs"] mod aws_s3_sync; #[path = "fast_import/child_stdio_to_log.rs"] mod child_stdio_to_log; #[path = "fast_import/s3_uri.rs"] mod s3_uri; const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600); const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300); #[derive(Subcommand, Debug, Clone, serde::Serialize)] enum Command { /// Runs local postgres (neon binary), restores into it, /// uploads pgdata to s3 to be consumed by pageservers Pgdata { /// Raw connection string to the source database. Used only in tests, /// real scenario uses encrypted connection string in spec.json from s3. #[clap(long)] source_connection_string: Option, /// If specified, will not shut down the local postgres after the import. Used in local testing #[clap(short, long)] interactive: bool, /// Port to run postgres on. Default is 5432. #[clap(long, default_value_t = 5432)] pg_port: u16, // port to run postgres on, 5432 is default /// Number of CPUs in the system. This is used to configure # of /// parallel worker processes, for index creation. #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")] num_cpus: Option, /// Amount of RAM in the system. This is used to configure shared_buffers /// and maintenance_work_mem. #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")] memory_mb: Option, }, /// Runs pg_dump-pg_restore from source to destination without running local postgres. DumpRestore { /// Raw connection string to the source database. Used only in tests, /// real scenario uses encrypted connection string in spec.json from s3. 
#[clap(long)] source_connection_string: Option, /// Raw connection string to the destination database. Used only in tests, /// real scenario uses encrypted connection string in spec.json from s3. #[clap(long)] destination_connection_string: Option, }, } impl Command { fn as_str(&self) -> &'static str { match self { Command::Pgdata { .. } => "pgdata", Command::DumpRestore { .. } => "dump-restore", } } } #[derive(clap::Parser)] struct Args { #[clap(long, env = "NEON_IMPORTER_WORKDIR")] working_directory: Utf8PathBuf, #[clap(long, env = "NEON_IMPORTER_S3_PREFIX")] s3_prefix: Option, #[clap(long, env = "NEON_IMPORTER_PG_BIN_DIR")] pg_bin_dir: Utf8PathBuf, #[clap(long, env = "NEON_IMPORTER_PG_LIB_DIR")] pg_lib_dir: Utf8PathBuf, #[clap(subcommand)] command: Command, } #[serde_with::serde_as] #[derive(serde::Deserialize)] struct Spec { encryption_secret: EncryptionSecret, #[serde_as(as = "serde_with::base64::Base64")] source_connstring_ciphertext_base64: Vec, #[serde_as(as = "Option")] destination_connstring_ciphertext_base64: Option>, } #[derive(serde::Deserialize)] enum EncryptionSecret { #[allow(clippy::upper_case_acronyms)] KMS { key_id: String }, } // copied from pageserver_api::config::defaults::DEFAULT_LOCALE to avoid dependency just for a constant const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") { "C" } else { "C.UTF-8" }; async fn decode_connstring( kms_client: &aws_sdk_kms::Client, key_id: &String, connstring_ciphertext_base64: Vec, ) -> Result { let mut output = kms_client .decrypt() .key_id(key_id) .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( connstring_ciphertext_base64, )) .send() .await .context("decrypt connection string")?; let plaintext = output .plaintext .take() .context("get plaintext connection string")?; String::from_utf8(plaintext.into_inner()).context("parse connection string as utf8") } struct PostgresProcess { pgdata_dir: Utf8PathBuf, pg_bin_dir: Utf8PathBuf, pgbin: Utf8PathBuf, pg_lib_dir: Utf8PathBuf, postgres_proc: Option, } 
impl PostgresProcess { fn new(pgdata_dir: Utf8PathBuf, pg_bin_dir: Utf8PathBuf, pg_lib_dir: Utf8PathBuf) -> Self { Self { pgdata_dir, pgbin: pg_bin_dir.join("postgres"), pg_bin_dir, pg_lib_dir, postgres_proc: None, } } async fn prepare(&self, initdb_user: &str) -> Result<(), anyhow::Error> { tokio::fs::create_dir(&self.pgdata_dir) .await .context("create pgdata directory")?; let pg_version = get_pg_version(self.pgbin.as_ref()); postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { superuser: initdb_user, locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded, pg_version, initdb_bin: self.pg_bin_dir.join("initdb").as_ref(), library_search_path: &self.pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. pgdata: &self.pgdata_dir, }) .await .context("initdb") } async fn start( &mut self, initdb_user: &str, port: u16, nproc: usize, memory_mb: usize, ) -> Result<&tokio::process::Child, anyhow::Error> { self.prepare(initdb_user).await?; // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for // maintenance_work_mem (i.e. for sorting during index creation), and leave the rest // available for misc other stuff that PostgreSQL uses memory for. 
let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize; let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize; // // Launch postgres process // let mut proc = tokio::process::Command::new(&self.pgbin) .arg("-D") .arg(&self.pgdata_dir) .args(["-p", &format!("{port}")]) .args(["-c", "wal_level=minimal"]) .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")]) .args(["-c", "max_wal_senders=0"]) .args(["-c", "fsync=off"]) .args(["-c", "full_page_writes=off"]) .args(["-c", "synchronous_commit=off"]) .args([ "-c", &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"), ]) .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) .args(["-c", &format!("max_parallel_workers={nproc}")]) .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) .args(["-c", &format!("max_worker_processes={nproc}")]) .args(["-c", "effective_io_concurrency=100"]) .env_clear() .env("LD_LIBRARY_PATH", &self.pg_lib_dir) .env( "ASAN_OPTIONS", std::env::var("ASAN_OPTIONS").unwrap_or_default(), ) .env( "UBSAN_OPTIONS", std::env::var("UBSAN_OPTIONS").unwrap_or_default(), ) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) .spawn() .context("spawn postgres")?; info!("spawned postgres, waiting for it to become ready"); tokio::spawn( child_stdio_to_log::relay_process_output(proc.stdout.take(), proc.stderr.take()) .instrument(info_span!("postgres")), ); self.postgres_proc = Some(proc); Ok(self.postgres_proc.as_ref().unwrap()) } async fn shutdown(&mut self) -> Result<(), anyhow::Error> { let proc: &mut tokio::process::Child = self.postgres_proc.as_mut().unwrap(); info!("shutdown postgres"); nix::sys::signal::kill( Pid::from_raw(i32::try_from(proc.id().unwrap()).expect("convert child pid to i32")), nix::sys::signal::SIGTERM, ) .context("signal postgres to shut down")?; proc.wait() .await .context("wait for postgres to shut down") .map(|_| ()) } } async fn wait_until_ready(connstring: String, create_dbname: String) { // Create 
neondb database in the running postgres let start_time = std::time::Instant::now(); loop { if start_time.elapsed() > PG_WAIT_TIMEOUT { error!( "timeout exceeded: failed to poll postgres and create database within 10 minutes" ); std::process::exit(1); } match tokio_postgres::connect( &connstring.replace("dbname=neondb", "dbname=postgres"), tokio_postgres::NoTls, ) .await { Ok((client, connection)) => { // Spawn the connection handling task to maintain the connection tokio::spawn(async move { if let Err(e) = connection.await { warn!("connection error: {}", e); } }); match client .simple_query(format!("CREATE DATABASE {create_dbname};").as_str()) .await { Ok(_) => { info!("created {} database", create_dbname); break; } Err(e) => { warn!( "failed to create database: {}, retying in {}s", e, PG_WAIT_RETRY_INTERVAL.as_secs_f32() ); tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await; continue; } } } Err(_) => { info!( "postgres not ready yet, retrying in {}s", PG_WAIT_RETRY_INTERVAL.as_secs_f32() ); tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await; continue; } } } } async fn run_dump_restore( workdir: Utf8PathBuf, pg_bin_dir: Utf8PathBuf, pg_lib_dir: Utf8PathBuf, source_connstring: String, destination_connstring: String, ) -> Result<(), anyhow::Error> { let dumpdir = workdir.join("dumpdir"); let num_jobs = num_cpus::get().to_string(); info!("using {num_jobs} jobs for dump/restore"); let common_args = [ // schema mapping (prob suffices to specify them on one side) "--no-owner".to_string(), "--no-privileges".to_string(), "--no-publications".to_string(), "--no-security-labels".to_string(), "--no-subscriptions".to_string(), "--no-tablespaces".to_string(), "--no-event-triggers".to_string(), // format "--format".to_string(), "directory".to_string(), // concurrency "--jobs".to_string(), num_jobs, // progress updates "--verbose".to_string(), ]; info!("dump into the working directory"); { let mut pg_dump = tokio::process::Command::new(pg_bin_dir.join("pg_dump")) .args(&common_args) 
.arg("-f") .arg(&dumpdir) .arg("--no-sync") // POSITIONAL args // source db (db name included in connection string) .arg(&source_connstring) // how we run it .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir) .env( "ASAN_OPTIONS", std::env::var("ASAN_OPTIONS").unwrap_or_default(), ) .env( "UBSAN_OPTIONS", std::env::var("UBSAN_OPTIONS").unwrap_or_default(), ) .kill_on_drop(true) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) .spawn() .context("spawn pg_dump")?; info!(pid=%pg_dump.id().unwrap(), "spawned pg_dump"); tokio::spawn( child_stdio_to_log::relay_process_output(pg_dump.stdout.take(), pg_dump.stderr.take()) .instrument(info_span!("pg_dump")), ); let st = pg_dump.wait().await.context("wait for pg_dump")?; info!(status=?st, "pg_dump exited"); if !st.success() { error!(status=%st, "pg_dump failed, restore will likely fail as well"); bail!("pg_dump failed"); } } // TODO: maybe do it in a streaming way, plenty of internal research done on this already // TODO: do the unlogged table trick { let mut pg_restore = tokio::process::Command::new(pg_bin_dir.join("pg_restore")) .args(&common_args) .arg("-d") .arg(&destination_connstring) // POSITIONAL args .arg(&dumpdir) // how we run it .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir) .env( "ASAN_OPTIONS", std::env::var("ASAN_OPTIONS").unwrap_or_default(), ) .env( "UBSAN_OPTIONS", std::env::var("UBSAN_OPTIONS").unwrap_or_default(), ) .kill_on_drop(true) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) .spawn() .context("spawn pg_restore")?; info!(pid=%pg_restore.id().unwrap(), "spawned pg_restore"); tokio::spawn( child_stdio_to_log::relay_process_output( pg_restore.stdout.take(), pg_restore.stderr.take(), ) .instrument(info_span!("pg_restore")), ); let st = pg_restore.wait().await.context("wait for pg_restore")?; info!(status=?st, "pg_restore exited"); if !st.success() { error!(status=%st, "pg_restore failed, restore will likely fail as well"); bail!("pg_restore 
failed"); } } Ok(()) } #[allow(clippy::too_many_arguments)] async fn cmd_pgdata( s3_client: Option<&aws_sdk_s3::Client>, kms_client: Option, maybe_s3_prefix: Option, maybe_spec: Option, source_connection_string: Option, interactive: bool, pg_port: u16, workdir: Utf8PathBuf, pg_bin_dir: Utf8PathBuf, pg_lib_dir: Utf8PathBuf, num_cpus: Option, memory_mb: Option, ) -> Result<(), anyhow::Error> { if maybe_spec.is_none() && source_connection_string.is_none() { bail!("spec must be provided for pgdata command"); } if maybe_spec.is_some() && source_connection_string.is_some() { bail!("only one of spec or source_connection_string can be provided"); } let source_connection_string = if let Some(spec) = maybe_spec { match spec.encryption_secret { EncryptionSecret::KMS { key_id } => { decode_connstring( kms_client.as_ref().unwrap(), &key_id, spec.source_connstring_ciphertext_base64, ) .await? } } } else { source_connection_string.unwrap() }; let superuser = "cloud_admin"; let destination_connstring = format!("host=localhost port={pg_port} user={superuser} dbname=neondb"); let pgdata_dir = workdir.join("pgdata"); let mut proc = PostgresProcess::new(pgdata_dir.clone(), pg_bin_dir.clone(), pg_lib_dir.clone()); let nproc = num_cpus.unwrap_or_else(num_cpus::get); let memory_mb = memory_mb.unwrap_or(256); proc.start(superuser, pg_port, nproc, memory_mb).await?; wait_until_ready(destination_connstring.clone(), "neondb".to_string()).await; run_dump_restore( workdir.clone(), pg_bin_dir, pg_lib_dir, source_connection_string, destination_connstring, ) .await?; // If interactive mode, wait for Ctrl+C if interactive { info!("Running in interactive mode. 
Press Ctrl+C to shut down."); tokio::signal::ctrl_c().await.context("wait for ctrl-c")?; } proc.shutdown().await?; // Only sync if s3_prefix was specified if let Some(s3_prefix) = maybe_s3_prefix { info!("upload pgdata"); aws_s3_sync::upload_dir_recursive( s3_client.unwrap(), Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"), ) .await .context("sync dump directory to destination")?; info!("write pgdata status to s3"); { let status_dir = workdir.join("status"); std::fs::create_dir(&status_dir).context("create status directory")?; let status_file = status_dir.join("pgdata"); std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) .context("write status file")?; aws_s3_sync::upload_dir_recursive( s3_client.as_ref().unwrap(), &status_dir, &s3_prefix.append("/status/"), ) .await .context("sync status directory to destination")?; } } Ok(()) } async fn cmd_dumprestore( kms_client: Option, maybe_spec: Option, source_connection_string: Option, destination_connection_string: Option, workdir: Utf8PathBuf, pg_bin_dir: Utf8PathBuf, pg_lib_dir: Utf8PathBuf, ) -> Result<(), anyhow::Error> { let (source_connstring, destination_connstring) = if let Some(spec) = maybe_spec { match spec.encryption_secret { EncryptionSecret::KMS { key_id } => { let source = decode_connstring( kms_client.as_ref().unwrap(), &key_id, spec.source_connstring_ciphertext_base64, ) .await .context("decrypt source connection string")?; let dest = if let Some(dest_ciphertext) = spec.destination_connstring_ciphertext_base64 { decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext) .await .context("decrypt destination connection string")? 
} else {
                    bail!(
                        "destination connection string must be provided in spec for dump_restore command"
                    );
                };

                (source, dest)
            }
        }
    } else {
        // Without a spec both raw connection strings are mandatory; report a missing
        // one as an error instead of panicking on `unwrap()`.
        let Some(source) = source_connection_string else {
            bail!("source connection string must be provided for dump_restore command");
        };
        let Some(dest) = destination_connection_string else {
            bail!("destination connection string must be provided for dump_restore command");
        };
        (source, dest)
    };

    run_dump_restore(
        workdir,
        pg_bin_dir,
        pg_lib_dir,
        source_connstring,
        destination_connstring,
    )
    .await
}

/// Entry point: parse arguments, optionally fetch the spec from S3, run the selected
/// command and publish the outcome.
///
/// With an S3 prefix the outcome (success or error text) is uploaded as
/// `status/fast_import` and the function returns `Ok` once that upload succeeds.
/// Without an S3 prefix there is nowhere to publish the status, so a command failure
/// is returned to the caller and reaches the process exit status.
#[tokio::main]
pub(crate) async fn main() -> anyhow::Result<()> {
    utils::logging::init(
        utils::logging::LogFormat::Json,
        utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
        utils::logging::Output::Stdout,
    )?;

    info!("starting");

    let args = Args::parse();

    // Initialize AWS clients only if s3_prefix is specified
    let (s3_client, kms_client) = if args.s3_prefix.is_some() {
        // Create AWS config with enhanced retry settings
        let config = aws_config::defaults(BehaviorVersion::v2024_03_28())
            .retry_config(
                aws_config::retry::RetryConfig::standard()
                    .with_max_attempts(5) // Retry up to 5 times
                    .with_initial_backoff(std::time::Duration::from_millis(200)) // Start with 200ms delay
                    .with_max_backoff(std::time::Duration::from_secs(5)), // Cap at 5 seconds
            )
            .load()
            .await;

        // Create clients from the config with enhanced retry settings
        let s3_client = aws_sdk_s3::Client::new(&config);
        let kms = aws_sdk_kms::Client::new(&config);
        (Some(s3_client), Some(kms))
    } else {
        (None, None)
    };

    // Capture everything from spec assignment onwards to handle errors
    let res = async {
        let spec: Option = if let Some(s3_prefix) = &args.s3_prefix {
            let spec_key = s3_prefix.append("/spec.json");
            let object = s3_client
                .as_ref()
                .unwrap()
                .get_object()
                .bucket(&spec_key.bucket)
                .key(spec_key.key)
                .send()
                .await
                .context("get spec from s3")?
                .body
                .collect()
                .await
                .context("download spec body")?;
            serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
        } else {
            None
        };

        // The working directory may pre-exist, but only if it is empty.
        match tokio::fs::create_dir(&args.working_directory).await {
            Ok(()) => {}
            Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
                if !is_directory_empty(&args.working_directory)
                    .await
                    .context("check if working directory is empty")?
                {
                    bail!("working directory is not empty");
                } else {
                    // ok
                }
            }
            Err(e) => return Err(anyhow::Error::new(e).context("create working directory")),
        }

        match args.command.clone() {
            Command::Pgdata {
                source_connection_string,
                interactive,
                pg_port,
                num_cpus,
                memory_mb,
            } => {
                cmd_pgdata(
                    s3_client.as_ref(),
                    kms_client,
                    args.s3_prefix.clone(),
                    spec,
                    source_connection_string,
                    interactive,
                    pg_port,
                    args.working_directory.clone(),
                    args.pg_bin_dir,
                    args.pg_lib_dir,
                    num_cpus,
                    memory_mb,
                )
                .await
            }
            Command::DumpRestore {
                source_connection_string,
                destination_connection_string,
            } => {
                cmd_dumprestore(
                    kms_client,
                    spec,
                    source_connection_string,
                    destination_connection_string,
                    args.working_directory.clone(),
                    args.pg_bin_dir,
                    args.pg_lib_dir,
                )
                .await
            }
        }
    }
    .await;

    let Some(s3_prefix) = args.s3_prefix else {
        // No status file can be published, so do not swallow the error: hand it to
        // the caller so that the failure is visible in the exit status.
        return res;
    };

    info!("write job status to s3");
    {
        let status_dir = args.working_directory.join("status");
        if std::fs::exists(&status_dir)?.not() {
            std::fs::create_dir(&status_dir).context("create status directory")?;
        }
        let status_file = status_dir.join("fast_import");
        let res_obj = match res {
            Ok(_) => serde_json::json!({"command": args.command.as_str(), "done": true}),
            Err(err) => {
                serde_json::json!({"command": args.command.as_str(), "done": false, "error": err.to_string()})
            }
        };
        std::fs::write(&status_file, res_obj.to_string()).context("write status file")?;
        aws_s3_sync::upload_dir_recursive(
            s3_client.as_ref().unwrap(),
            &status_dir,
            &s3_prefix.append("/status/"),
        )
        .await
        .context("sync status directory to destination")?;
    }

    Ok(())
}

================================================
FILE: compute_tools/src/catalog.rs
================================================
use std::path::Path;
use std::process::Stdio;
use std::result::Result;
use
std::sync::Arc; use compute_api::responses::CatalogObjects; use futures::Stream; use postgres::NoTls; use tokio::io::{AsyncBufReadExt, BufReader}; use tokio::process::Command; use tokio::spawn; use tokio_stream::{self as stream, StreamExt}; use tokio_util::codec::{BytesCodec, FramedRead}; use tracing::warn; use crate::compute::ComputeNode; use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async, postgres_conf_for_db}; pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { let conf = compute.get_tokio_conn_conf(Some("compute_ctl:get_dbs_and_roles")); let (client, connection): (tokio_postgres::Client, _) = conf.connect(NoTls).await?; spawn(async move { if let Err(e) = connection.await { eprintln!("connection error: {e}"); } }); let roles = get_existing_roles_async(&client).await?; let databases = get_existing_dbs_async(&client) .await? .into_values() .collect(); Ok(CatalogObjects { roles, databases }) } #[derive(Debug, thiserror::Error)] pub enum SchemaDumpError { #[error("database does not exist")] DatabaseDoesNotExist, #[error("failed to execute pg_dump")] IO(#[from] std::io::Error), #[error("unexpected I/O error")] Unexpected, } // It uses the pg_dump utility to dump the schema of the specified database. // The output is streamed back to the caller and supposed to be streamed via HTTP. // // Before return the result with the output, it checks that pg_dump produced any output. // If not, it tries to parse the stderr output to determine if the database does not exist // and special error is returned. // // To make sure that the process is killed when the caller drops the stream, we use tokio kill_on_drop feature. pub async fn get_database_schema( compute: &Arc, dbname: &str, ) -> Result> + use<>, SchemaDumpError> { let pgbin = &compute.params.pgbin; let basepath = Path::new(pgbin).parent().unwrap(); let pgdump = basepath.join("pg_dump"); // Replace the DB in the connection string and disable it to parts. 
// This is the only option to handle DBs with special characters. let conf = postgres_conf_for_db(&compute.params.connstr, dbname) .map_err(|_| SchemaDumpError::Unexpected)?; let host = conf .get_hosts() .first() .ok_or(SchemaDumpError::Unexpected)?; let host = match host { tokio_postgres::config::Host::Tcp(ip) => ip.to_string(), #[cfg(unix)] tokio_postgres::config::Host::Unix(path) => path.to_string_lossy().to_string(), }; let port = conf .get_ports() .first() .ok_or(SchemaDumpError::Unexpected)?; let user = conf.get_user().ok_or(SchemaDumpError::Unexpected)?; let dbname = conf.get_dbname().ok_or(SchemaDumpError::Unexpected)?; let mut cmd = Command::new(pgdump) // XXX: this seems to be the only option to deal with DBs with `=` in the name // See .env("PGDATABASE", dbname) .arg("--host") .arg(host) .arg("--port") .arg(port.to_string()) .arg("--username") .arg(user) .arg("--schema-only") .stdout(Stdio::piped()) .stderr(Stdio::piped()) .kill_on_drop(true) .spawn()?; let stdout = cmd .stdout .take() .ok_or_else(|| std::io::Error::other("Failed to capture stdout."))?; let stderr = cmd .stderr .take() .ok_or_else(|| std::io::Error::other("Failed to capture stderr."))?; let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new()); let stderr_reader = BufReader::new(stderr); let first_chunk = match stdout_reader.next().await { Some(Ok(bytes)) if !bytes.is_empty() => bytes, Some(Err(e)) => { return Err(SchemaDumpError::IO(e)); } _ => { let mut lines = stderr_reader.lines(); if let Some(line) = lines.next_line().await? 
{ if line.contains(&format!("FATAL: database \"{dbname}\" does not exist")) { return Err(SchemaDumpError::DatabaseDoesNotExist); } warn!("pg_dump stderr: {}", line) } tokio::spawn(async move { while let Ok(Some(line)) = lines.next_line().await { warn!("pg_dump stderr: {}", line) } }); return Err(SchemaDumpError::IO(std::io::Error::other( "failed to start pg_dump", ))); } }; let initial_stream = stream::once(Ok(first_chunk.freeze())); // Consume stderr and log warnings tokio::spawn(async move { let mut lines = stderr_reader.lines(); while let Ok(Some(line)) = lines.next_line().await { warn!("pg_dump stderr: {}", line) } }); #[allow(dead_code)] struct SchemaStream { // We keep a reference to the child process to ensure it stays alive // while the stream is being consumed. When SchemaStream is dropped, // cmd will be dropped, which triggers kill_on_drop and terminates pg_dump cmd: tokio::process::Child, stream: S, } impl Stream for SchemaStream where S: Stream> + Unpin, { type Item = Result; fn poll_next( mut self: std::pin::Pin<&mut Self>, cx: &mut std::task::Context<'_>, ) -> std::task::Poll> { Stream::poll_next(std::pin::Pin::new(&mut self.stream), cx) } } let schema_stream = SchemaStream { cmd, stream: initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))), }; Ok(schema_stream) } ================================================ FILE: compute_tools/src/checker.rs ================================================ use anyhow::{Ok, Result, anyhow}; use tokio_postgres::NoTls; use tracing::{error, instrument, warn}; use crate::compute::ComputeNode; /// Update timestamp in a row in a special service table to check /// that we can actually write some data in this particular timeline. #[instrument(skip_all)] pub async fn check_writability(compute: &ComputeNode) -> Result<()> { // Connect to the database. 
let conf = compute.get_tokio_conn_conf(Some("compute_ctl:availability_checker")); let (client, connection) = conf.connect(NoTls).await?; if client.is_closed() { return Err(anyhow!("connection to postgres closed")); } // The connection object performs the actual communication with the database, // so spawn it off to run on its own. tokio::spawn(async move { if let Err(e) = connection.await { error!("connection error: {}", e); } }); let query = " INSERT INTO public.health_check VALUES (1, pg_catalog.now()) ON CONFLICT (id) DO UPDATE SET updated_at = pg_catalog.now();"; match client.simple_query(query).await { Result::Ok(result) => { if result.len() != 1 { return Err(anyhow::anyhow!( "expected 1 query results, but got {}", result.len() )); } } Err(err) => { if let Some(state) = err.code() { if state == &tokio_postgres::error::SqlState::DISK_FULL { warn!("Tenant disk is full"); return Ok(()); } } return Err(err.into()); } } Ok(()) } ================================================ FILE: compute_tools/src/communicator_socket_client.rs ================================================ //! Client for making request to a running Postgres server's communicator control socket. //! //! The storage communicator process that runs inside Postgres exposes an HTTP endpoint in //! a Unix Domain Socket in the Postgres data directory. This provides access to it. use std::path::Path; use anyhow::Context; use hyper::client::conn::http1::SendRequest; use hyper_util::rt::TokioIo; /// Name of the socket within the Postgres data directory. This better match that in /// `pgxn/neon/communicator/src/lib.rs`. const NEON_COMMUNICATOR_SOCKET_NAME: &str = "neon-communicator.socket"; /// Open a connection to the communicator's control socket, prepare to send requests to it /// with hyper. 
pub async fn connect_communicator_socket(pgdata: &Path) -> anyhow::Result> where B: hyper::body::Body + 'static + Send, B::Data: Send, B::Error: Into>, { let socket_path = pgdata.join(NEON_COMMUNICATOR_SOCKET_NAME); let socket_path_len = socket_path.display().to_string().len(); // There is a limit of around 100 bytes (108 on Linux?) on the length of the path to a // Unix Domain socket. The limit is on the connect(2) function used to open the // socket, not on the absolute path itself. Postgres changes the current directory to // the data directory and uses a relative path to bind to the socket, and the relative // path "./neon-communicator.socket" is always short, but when compute_ctl needs to // open the socket, we need to use a full path, which can be arbitrarily long. // // There are a few ways we could work around this: // // 1. Change the current directory to the Postgres data directory and use a relative // path in the connect(2) call. That's problematic because the current directory // applies to the whole process. We could change the current directory early in // compute_ctl startup, and that might be a good idea anyway for other reasons too: // it would be more robust if the data directory is moved around or unlinked for // some reason, and you would be less likely to accidentally litter other parts of // the filesystem with e.g. temporary files. However, that's a pretty invasive // change. // // 2. On Linux, you could open() the data directory, and refer to the the socket // inside it as "/proc/self/fd//neon-communicator.socket". But that's // Linux-only. // // 3. Create a symbolic link to the socket with a shorter path, and use that. // // We use the symbolic link approach here. Hopefully the paths we use in production // are shorter, so that we can open the socket directly, so that this hack is needed // only in development. let connect_result = if socket_path_len < 100 { // We can open the path directly with no hacks. 
tokio::net::UnixStream::connect(socket_path).await } else { // The path to the socket is too long. Create a symlink to it with a shorter path. let short_path = std::env::temp_dir().join(format!( "compute_ctl.short-socket.{}.{}", std::process::id(), tokio::task::id() )); std::os::unix::fs::symlink(&socket_path, &short_path)?; // Delete the symlink as soon as we have connected to it. There's a small chance // of leaking if the process dies before we remove it, so try to keep that window // as small as possible. scopeguard::defer! { if let Err(err) = std::fs::remove_file(&short_path) { tracing::warn!("could not remove symlink \"{}\" created for socket: {}", short_path.display(), err); } } tracing::info!( "created symlink \"{}\" for socket \"{}\", opening it now", short_path.display(), socket_path.display() ); tokio::net::UnixStream::connect(&short_path).await }; let stream = connect_result.context("connecting to communicator control socket")?; let io = TokioIo::new(stream); let (request_sender, connection) = hyper::client::conn::http1::handshake(io).await?; // spawn a task to poll the connection and drive the HTTP state tokio::spawn(async move { if let Err(err) = connection.await { eprintln!("Error in connection: {err}"); } }); Ok(request_sender) } ================================================ FILE: compute_tools/src/compute.rs ================================================ use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use compute_api::privilege::Privilege; use compute_api::responses::{ ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState, LfcPrewarmState, PromoteState, TlsConfig, }; use compute_api::spec::{ ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, GenericOption, PageserverConnectionInfo, PageserverProtocol, PgIdent, Role, }; use futures::StreamExt; use futures::future::join_all; use futures::stream::FuturesUnordered; use itertools::Itertools; use nix::sys::signal::{Signal, kill}; use 
nix::unistd::Pid;
use once_cell::sync::Lazy;
use pageserver_page_api::{self as page_api, BaseBackupCompression};
use postgres;
use postgres::NoTls;
use postgres::error::SqlState;
use remote_storage::{DownloadError, RemotePath};
use std::collections::{HashMap, HashSet};
use std::ffi::OsString;
use std::os::unix::fs::{PermissionsExt, symlink};
use std::path::Path;
use std::process::{Command, Stdio};
use std::str::FromStr;
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
use std::sync::{Arc, Condvar, Mutex, RwLock};
use std::time::{Duration, Instant};
use std::{env, fs};
use tokio::{spawn, sync::watch, task::JoinHandle, time};
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, debug, error, info, instrument, warn};
use url::Url;
use utils::backoff::{
    DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff_duration,
};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::measured_stream::MeasuredReader;
use utils::pid_file;
use utils::shard::{ShardIndex, ShardNumber, ShardStripeSize};

use crate::configurator::launch_configurator;
use crate::disk_quota::set_disk_quota;
use crate::hadron_metrics::COMPUTE_ATTACHED;
use crate::installed_extensions::get_installed_extensions;
use crate::logger::{self, startup_context_from_env};
use crate::lsn_lease::launch_lsn_lease_bg_task_for_static;
use crate::metrics::COMPUTE_CTL_UP;
use crate::monitor::launch_monitor;
use crate::pg_helpers::*;
use crate::pgbouncer::*;
use crate::rsyslog::{
    PostgresLogsRsyslogConfig, configure_audit_rsyslog, configure_postgres_logs_export,
    launch_pgaudit_gc,
};
use crate::spec::*;
use crate::swap::resize_swap;
use crate::sync_sk::{check_if_synced, ping_safekeeper};
use crate::tls::watch_cert_for_changes;
use crate::{config, extension_server, local_proxy};

pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
pub static PG_PID: AtomicU32 = AtomicU32::new(0);
// This is an arbitrary build tag. Fine as a default / for testing purposes
// in case the environment variable is not set.
const BUILD_TAG_DEFAULT: &str = "latest";
/// Build tag/version of the compute node binaries/image. It's tricky and ugly
/// to pass it everywhere as a part of `ComputeNodeParams`, so we use a
/// global static variable.
///
/// Taken from the `BUILD_TAG` environment variable at compile time
/// (`option_env!`), falling back to `BUILD_TAG_DEFAULT`.
pub static BUILD_TAG: Lazy = Lazy::new(|| {
    option_env!("BUILD_TAG")
        .unwrap_or(BUILD_TAG_DEFAULT)
        .to_string()
});
const DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL: u64 = 3600;

/// Static configuration params that don't change after startup. These mostly
/// come from the CLI args, or are derived from them.
#[derive(Clone, Debug)]
pub struct ComputeNodeParams {
    /// The ID of the compute
    pub compute_id: String,

    /// Url type maintains proper escaping
    pub connstr: url::Url,

    /// The name of the 'weak' superuser role, which we give to the users.
    /// It follows the allow list approach, i.e., we take a standard role
    /// and grant it extra permissions with explicit GRANTs here and there,
    /// and core patches.
    pub privileged_role_name: String,

    pub resize_swap_on_bind: bool,
    pub set_disk_quota_for_fs: Option,

    // VM monitor parameters
    #[cfg(target_os = "linux")]
    pub filecache_connstr: String,
    #[cfg(target_os = "linux")]
    pub cgroup: String,
    #[cfg(target_os = "linux")]
    pub vm_monitor_addr: String,

    pub pgdata: String,
    pub pgbin: String,
    pub pgversion: String,

    /// The port that the compute's external HTTP server listens on
    pub external_http_port: u16,
    /// The port that the compute's internal HTTP server listens on
    pub internal_http_port: u16,

    /// the address of extension storage proxy gateway
    pub remote_ext_base_url: Option,

    /// Interval for installed extensions collection
    pub installed_extensions_collection_interval: Arc,
    /// Hadron instance ID of the compute node.
    pub instance_id: Option,
    /// Timeout of PG compute startup in the Init state.
    pub pg_init_timeout: Option,
    // Path to the `pg_isready` binary.
    pub pg_isready_bin: String,
    pub lakebase_mode: bool,

    pub build_tag: String,
    pub control_plane_uri: Option,
    pub config_path_test_only: Option,
}

type TaskHandle = Mutex>>;

/// Compute node info shared across several `compute_ctl` threads.
pub struct ComputeNode {
    pub params: ComputeNodeParams,

    // We connect to Postgres from many different places, so build configs once
    // and reuse them where needed. These are derived from 'params.connstr'
    pub conn_conf: postgres::config::Config,
    pub tokio_conn_conf: tokio_postgres::config::Config,

    /// Volatile part of the `ComputeNode`, which should be used under `Mutex`.
    /// To allow the HTTP API server to serve status requests while configuration
    /// is in progress, the lock should be held only for short periods of time to do
    /// read/write, not the whole configuration process.
    pub state: Mutex,
    /// `Condvar` to allow notifying waiters about state changes.
    pub state_changed: Condvar,

    // key: ext_archive_name, value: started download time, download_completed?
    pub ext_download_progress: RwLock, bool)>>,
    pub compute_ctl_config: ComputeCtlConfig,

    /// Handle to the extension stats collection task
    extension_stats_task: TaskHandle,
    lfc_offload_task: TaskHandle,
}

// store some metrics about download size that might impact startup time
#[derive(Clone, Debug)]
pub struct RemoteExtensionMetrics {
    num_ext_downloaded: u64,
    largest_ext_size: u64,
    total_ext_download_size: u64,
}

#[derive(Clone, Debug)]
pub struct ComputeState {
    pub start_time: DateTime,
    pub pg_start_time: Option>,
    pub status: ComputeStatus,
    /// Timestamp of the last Postgres activity. It could be `None` if
    /// compute wasn't used since start.
    pub last_active: Option>,
    pub error: Option,

    /// Compute spec. This can be received from the CLI or - more likely -
    /// passed by the control plane with a /configure HTTP request.
    pub pspec: Option,

    /// If the spec is passed by a /configure request, 'startup_span' is the
    /// /configure request's tracing span. The main thread enters it when it
    /// processes the compute startup, so that the compute startup is considered
    /// to be part of the /configure request for tracing purposes.
    ///
    /// If the request handling thread/task called startup_compute() directly,
    /// it would automatically be a child of the request handling span, and we
    /// wouldn't need this. But because we use the main thread to perform the
    /// startup, and the /configure task just waits for it to finish, we need to
    /// set up the span relationship ourselves.
    pub startup_span: Option,

    pub lfc_prewarm_state: LfcPrewarmState,
    pub lfc_prewarm_token: CancellationToken,
    pub lfc_offload_state: LfcOffloadState,

    /// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
    /// mode == ComputeMode::Primary. None otherwise
    pub terminate_flush_lsn: Option,
    pub promote_state: Option>,

    pub metrics: ComputeMetrics,
}

impl ComputeState {
    /// Fresh state for a compute that has not received a spec yet: status `Empty`,
    /// `start_time` set to now, everything else unset or default.
    pub fn new() -> Self {
        Self {
            start_time: Utc::now(),
            pg_start_time: None,
            status: ComputeStatus::Empty,
            last_active: None,
            error: None,
            pspec: None,
            startup_span: None,
            metrics: ComputeMetrics::default(),
            lfc_prewarm_state: LfcPrewarmState::default(),
            lfc_offload_state: LfcOffloadState::default(),
            terminate_flush_lsn: None,
            promote_state: None,
            lfc_prewarm_token: CancellationToken::new(),
        }
    }

    /// Switch to `status`, wake every waiter on `state_changed` and republish the
    /// `COMPUTE_CTL_UP` gauge so that only the new status label is set.
    pub fn set_status(&mut self, status: ComputeStatus, state_changed: &Condvar) {
        let prev = self.status;
        info!("Changing compute status from {} to {}", prev, status);
        self.status = status;
        state_changed.notify_all();

        // Drop the series for the previous status before setting the new one.
        COMPUTE_CTL_UP.reset();
        COMPUTE_CTL_UP
            .with_label_values(&[&BUILD_TAG, status.to_string().as_str()])
            .set(1);
    }

    /// Record `err` (debug-formatted) as the failure reason and switch to `Failed`.
    pub fn set_failed_status(&mut self, err: anyhow::Error, state_changed: &Condvar) {
        self.error = Some(format!("{err:?}"));
        self.set_status(ComputeStatus::Failed, state_changed);
    }
}

impl Default for ComputeState {
    fn default() -> Self {
        Self::new()
    }
}

#[derive(Clone, Debug)]
pub struct ParsedSpec {
    pub spec: ComputeSpec,
    pub tenant_id: TenantId,
    pub
timeline_id: TimelineId,
    pub pageserver_conninfo: PageserverConnectionInfo,
    pub safekeeper_connstrings: Vec,
    pub storage_auth_token: Option,
    /// k8s dns name and port
    pub endpoint_storage_addr: Option,
    pub endpoint_storage_token: Option,
}

impl ParsedSpec {
    /// Check the parts of the spec that can be validated locally.
    ///
    /// Currently this only covers `safekeeper_connstrings`, and only for `Primary`
    /// computes: the list must be non-empty and free of duplicates. The error is a
    /// human-readable description of the first problem found.
    pub fn validate(&self) -> Result<(), String> {
        // Only Primary nodes are using safekeeper_connstrings, and at the moment
        // this method only validates that part of the specs.
        if self.spec.mode != ComputeMode::Primary {
            return Ok(());
        }

        // While it seems like a good idea to check for an odd number of entries in
        // the safekeepers connection string, changes to the list of safekeepers might
        // incur appending a new server to a list of 3, in which case a list of 4
        // entries is okay in production.
        //
        // Still we want unique entries, and at least one entry in the vector
        if self.safekeeper_connstrings.is_empty() {
            return Err(String::from("safekeeper_connstrings is empty"));
        }

        // check for uniqueness of the connection strings in the set: after sorting,
        // duplicates are adjacent, so it is enough to compare neighbours.
        let mut connstrings = self.safekeeper_connstrings.clone();
        connstrings.sort();
        if let Some(pair) = connstrings.windows(2).find(|pair| pair[0] == pair[1]) {
            let current = &pair[1];
            return Err(format!(
                "duplicate entry in safekeeper_connstrings: {current}!",
            ));
        }

        Ok(())
    }
}

impl TryFrom for ParsedSpec {
    type Error = anyhow::Error;

    /// Extract the storage connection details from `spec`, falling back to legacy
    /// fields and GUCs in `cluster.settings` where the dedicated fields are missing,
    /// then validate the result.
    fn try_from(spec: ComputeSpec) -> Result {
        // Extract the options from the spec file that are needed to connect to
        // the storage system.
        //
        // In compute specs generated by old control plane versions, the spec file might
        // be missing the `pageserver_connection_info` field. In that case, we need to dig
        // the pageserver connection info from the `pageserver_connstr` field instead, or
        // if that's missing too, from the GUC in the cluster.settings field.
        let mut pageserver_conninfo = spec.pageserver_connection_info.clone();
        if pageserver_conninfo.is_none() {
            if let Some(pageserver_connstr_field) = &spec.pageserver_connstring {
                pageserver_conninfo = Some(PageserverConnectionInfo::from_connstr(
                    pageserver_connstr_field,
                    spec.shard_stripe_size,
                )?);
            }
        }
        if pageserver_conninfo.is_none() {
            if let Some(guc) = spec.cluster.settings.find("neon.pageserver_connstring") {
                let stripe_size = if let Some(guc) = spec.cluster.settings.find("neon.stripe_size")
                {
                    Some(ShardStripeSize(u32::from_str(&guc)?))
                } else {
                    None
                };
                pageserver_conninfo =
                    Some(PageserverConnectionInfo::from_connstr(&guc, stripe_size)?);
            }
        }
        let pageserver_conninfo = pageserver_conninfo.ok_or_else(|| {
            anyhow::anyhow!("pageserver connection information should be provided")
        })?;

        // Similarly for safekeeper connection strings
        let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
            if matches!(spec.mode, ComputeMode::Primary) {
                spec.cluster
                    .settings
                    .find("neon.safekeepers")
                    .ok_or_else(|| anyhow::anyhow!("safekeeper connstrings should be provided"))?
                    .split(',')
                    .map(|connstr| connstr.to_string())
                    .collect()
            } else {
                vec![]
            }
        } else {
            spec.safekeeper_connstrings.clone()
        };

        let storage_auth_token = spec.storage_auth_token.clone();
        let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
            tenant_id
        } else {
            let guc = spec
                .cluster
                .settings
                .find("neon.tenant_id")
                .ok_or_else(|| anyhow::anyhow!("tenant id should be provided"))?;
            TenantId::from_str(&guc).context("invalid tenant id")?
        };
        let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id {
            timeline_id
        } else {
            let guc = spec
                .cluster
                .settings
                .find("neon.timeline_id")
                .ok_or_else(|| anyhow::anyhow!("timeline id should be provided"))?;
            TimelineId::from_str(&guc).context("invalid timeline id")?
        };

        let endpoint_storage_addr: Option = spec
            .endpoint_storage_addr
            .clone()
            .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_addr"));
        let endpoint_storage_token = spec
            .endpoint_storage_token
            .clone()
            .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_token"));

        let res = ParsedSpec {
            spec,
            pageserver_conninfo,
            safekeeper_connstrings,
            storage_auth_token,
            tenant_id,
            timeline_id,
            endpoint_storage_addr,
            endpoint_storage_token,
        };

        // Now check validity of the parsed specification
        res.validate().map_err(anyhow::Error::msg)?;
        Ok(res)
    }
}

/// If we are a VM, returns a [`Command`] that will run in the `neon-postgres`
/// cgroup. Otherwise returns the default `Command::new(cmd)`
///
/// This function should be used to start postgres, as it will start it in the
/// neon-postgres cgroup if we are a VM. This allows autoscaling to control
/// postgres' resource usage. The cgroup will exist in VMs because vm-builder
/// creates it during the sysinit phase of its inittab.
fn maybe_cgexec(cmd: &str) -> Command {
    // The cplane sets this env var for autoscaling computes.
    // use `var_os` so we don't have to worry about the variable being valid
    // unicode. Should never be a concern . . . but just in case
    if env::var_os("AUTOSCALING").is_some() {
        let mut command = Command::new("cgexec");
        command.args(["-g", "memory:neon-postgres"]);
        command.arg(cmd);
        command
    } else {
        Command::new(cmd)
    }
}

struct PostgresHandle {
    postgres: std::process::Child,
    log_collector: JoinHandle>,
}

impl PostgresHandle {
    /// Return PID of the postgres (postmaster) process
    fn pid(&self) -> Pid {
        Pid::from_raw(self.postgres.id() as i32)
    }
}

struct StartVmMonitorResult {
    #[cfg(target_os = "linux")]
    token: tokio_util::sync::CancellationToken,
    #[cfg(target_os = "linux")]
    vm_monitor: Option>>,
}

// BEGIN_HADRON
/// This function creates roles that are used by Databricks.
/// These roles are not needs to be botostrapped at PG Compute provisioning time.
/// The auth method for these roles are configured in databricks_pg_hba.conf in universe repository. pub(crate) fn create_databricks_roles() -> Vec { let roles = vec![ // Role for prometheus_stats_exporter Role { name: "databricks_monitor".to_string(), // This uses "local" connection and auth method for that is "trust", so no password is needed. encrypted_password: None, options: Some(vec![GenericOption { name: "IN ROLE pg_monitor".to_string(), value: None, vartype: "string".to_string(), }]), }, // Role for brickstore control plane Role { name: "databricks_control_plane".to_string(), // Certificate user does not need password. encrypted_password: None, options: Some(vec![GenericOption { name: "SUPERUSER".to_string(), value: None, vartype: "string".to_string(), }]), }, // Role for brickstore httpgateway. Role { name: "databricks_gateway".to_string(), // Certificate user does not need password. encrypted_password: None, options: None, }, ]; roles .into_iter() .map(|role| { let query = format!( r#" DO $$ BEGIN IF NOT EXISTS ( SELECT FROM pg_catalog.pg_roles WHERE rolname = '{}') THEN CREATE ROLE {} {}; END IF; END $$;"#, role.name, role.name.pg_quote(), role.to_pg_options(), ); query }) .collect() } /// Databricks-specific environment variables to be passed to the `postgres` sub-process. pub struct DatabricksEnvVars { /// The Databricks "endpoint ID" of the compute instance. Used by `postgres` to check /// the token scopes of internal auth tokens. pub endpoint_id: String, /// Hostname of the Databricks workspace URL this compute instance belongs to. /// Used by postgres to verify Databricks PAT tokens. pub workspace_host: String, pub lakebase_mode: bool, } impl DatabricksEnvVars { pub fn new( compute_spec: &ComputeSpec, compute_id: Option<&String>, instance_id: Option, lakebase_mode: bool, ) -> Self { let endpoint_id = if let Some(instance_id) = instance_id { // Use instance_id as endpoint_id if it is set. This code path is for PuPr model. 
instance_id } else { // Use compute_id as endpoint_id if instance_id is not set. The code path is for PrPr model. // compute_id is a string format of "{endpoint_id}/{compute_idx}" // endpoint_id is a uuid. We only need to pass down endpoint_id to postgres. // Panics if compute_id is not set or not in the expected format. compute_id.unwrap().split('/').next().unwrap().to_string() }; let workspace_host = compute_spec .databricks_settings .as_ref() .map(|s| s.databricks_workspace_host.clone()) .unwrap_or("".to_string()); Self { endpoint_id, workspace_host, lakebase_mode, } } /// Constants for the names of Databricks-specific postgres environment variables. const DATABRICKS_ENDPOINT_ID_ENVVAR: &'static str = "DATABRICKS_ENDPOINT_ID"; const DATABRICKS_WORKSPACE_HOST_ENVVAR: &'static str = "DATABRICKS_WORKSPACE_HOST"; /// Convert DatabricksEnvVars to a list of string pairs that can be passed as env vars. Consumes `self`. pub fn to_env_var_list(self) -> Vec<(String, String)> { if !self.lakebase_mode { // In neon env, we don't need to pass down the env vars to postgres. return vec![]; } vec![ ( Self::DATABRICKS_ENDPOINT_ID_ENVVAR.to_string(), self.endpoint_id.clone(), ), ( Self::DATABRICKS_WORKSPACE_HOST_ENVVAR.to_string(), self.workspace_host.clone(), ), ] } } impl ComputeNode { pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result { let connstr = params.connstr.as_str(); let mut conn_conf = postgres::config::Config::from_str(connstr) .context("cannot build postgres config from connstr")?; let mut tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr) .context("cannot build tokio postgres config from connstr")?; // Users can set some configuration parameters per database with // ALTER DATABASE ... SET ... 
// // There are at least these parameters: // // - role=some_other_role // - default_transaction_read_only=on // - statement_timeout=1, i.e., 1ms, which will cause most of the queries to fail // - search_path=non_public_schema, this should be actually safe because // we don't call any functions in user databases, but better to always reset // it to public. // // that can affect `compute_ctl` and prevent it from properly configuring the database schema. // Unset them via connection string options before connecting to the database. // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`. const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0 -c pgaudit.log=none"; let options = match conn_conf.get_options() { // Allow the control plane to override any options set by the // compute Some(options) => format!("{EXTRA_OPTIONS} {options}"), None => EXTRA_OPTIONS.to_string(), }; conn_conf.options(&options); tokio_conn_conf.options(&options); let mut new_state = ComputeState::new(); if let Some(spec) = config.spec { let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?; if params.lakebase_mode { ComputeNode::set_spec(¶ms, &mut new_state, pspec); } else { new_state.pspec = Some(pspec); } } Ok(ComputeNode { params, conn_conf, tokio_conn_conf, state: Mutex::new(new_state), state_changed: Condvar::new(), ext_download_progress: RwLock::new(HashMap::new()), compute_ctl_config: config.compute_ctl_config, extension_stats_task: Mutex::new(None), lfc_offload_task: Mutex::new(None), }) } /// Top-level control flow of compute_ctl. Returns a process exit code we should /// exit with. pub fn run(self) -> Result> { let this = Arc::new(self); let cli_spec = this.state.lock().unwrap().pspec.clone(); // If this is a pooled VM, prewarm before starting HTTP server and becoming // available for binding. 
Prewarming helps Postgres start quicker later, // because QEMU will already have its memory allocated from the host, and // the necessary binaries will already be cached. if cli_spec.is_none() { this.prewarm_postgres_vm_memory()?; } // Set the up metric with Empty status before starting the HTTP server. // That way on the first metric scrape, an external observer will see us // as 'up' and 'empty' (unless the compute was started with a spec or // already configured by control plane). COMPUTE_CTL_UP .with_label_values(&[&BUILD_TAG, ComputeStatus::Empty.to_string().as_str()]) .set(1); // Launch the external HTTP server first, so that we can serve control plane // requests while configuration is still in progress. crate::http::server::Server::External { port: this.params.external_http_port, config: this.compute_ctl_config.clone(), compute_id: this.params.compute_id.clone(), instance_id: this.params.instance_id.clone(), } .launch(&this); // The internal HTTP server could be launched later, but there isn't much // sense in waiting. crate::http::server::Server::Internal { port: this.params.internal_http_port, } .launch(&this); // If we got a spec from the CLI already, use that. Otherwise wait for the // control plane to pass it to us with a /configure HTTP request let pspec = if let Some(cli_spec) = cli_spec { cli_spec } else { this.wait_spec()? }; launch_lsn_lease_bg_task_for_static(&this); // We have a spec, start the compute let mut delay_exit = false; let mut vm_monitor = None; let mut pg_process: Option = None; match this.start_compute(&mut pg_process) { Ok(()) => { // Success! Launch remaining services (just vm-monitor currently) vm_monitor = Some(this.start_vm_monitor(pspec.spec.disable_lfc_resizing.unwrap_or(false))); } Err(err) => { // Something went wrong with the startup. Log it and expose the error to // HTTP status requests. 
error!("could not start the compute node: {:#}", err); this.set_failed_status(err); delay_exit = true; // If the error happened after starting PostgreSQL, kill it if let Some(ref pg_process) = pg_process { kill(pg_process.pid(), Signal::SIGQUIT).ok(); } } } // If startup was successful, or it failed in the late stages, // PostgreSQL is now running. Wait until it exits. let exit_code = if let Some(pg_handle) = pg_process { let exit_status = this.wait_postgres(pg_handle); info!("Postgres exited with code {}, shutting down", exit_status); exit_status.code() } else { None }; this.terminate_extension_stats_task(); this.terminate_lfc_offload_task(); // Terminate the vm_monitor so it releases the file watcher on // /sys/fs/cgroup/neon-postgres. // Note: the vm-monitor only runs on linux because it requires cgroups. if let Some(vm_monitor) = vm_monitor { cfg_if::cfg_if! { if #[cfg(target_os = "linux")] { // Kills all threads spawned by the monitor vm_monitor.token.cancel(); if let Some(handle) = vm_monitor.vm_monitor { // Kills the actual task running the monitor handle.abort(); } } else { _ = vm_monitor; // appease unused lint on macOS } } } // Reap the postgres process delay_exit |= this.cleanup_after_postgres_exit()?; // /terminate returns LSN. If we don't sleep at all, connection will break and we // won't get result. If we sleep too much, tests will take significantly longer // and Github Action run will error out let sleep_duration = if delay_exit { Duration::from_secs(30) } else { Duration::from_millis(300) }; // If launch failed, keep serving HTTP requests for a while, so the cloud // control plane can get the actual error. 
if delay_exit { info!("giving control plane 30s to collect the error before shutdown"); } std::thread::sleep(sleep_duration); Ok(exit_code) } pub fn wait_spec(&self) -> Result { info!("no compute spec provided, waiting"); let mut state = self.state.lock().unwrap(); while state.status != ComputeStatus::ConfigurationPending { state = self.state_changed.wait(state).unwrap(); } info!("got spec, continue configuration"); let spec = state.pspec.as_ref().unwrap().clone(); // Record for how long we slept waiting for the spec. let now = Utc::now(); state.metrics.wait_for_spec_ms = now .signed_duration_since(state.start_time) .to_std() .unwrap() .as_millis() as u64; // Reset start time, so that the total startup time that is calculated later will // not include the time that we waited for the spec. state.start_time = now; Ok(spec) } /// Start compute. /// /// Prerequisites: /// - the compute spec has been placed in self.state.pspec /// /// On success: /// - status is set to ComputeStatus::Running /// - self.running_postgres is set /// /// On error: /// - status is left in ComputeStatus::Init. The caller is responsible for setting it to Failed /// - if Postgres was started before the fatal error happened, self.running_postgres is /// set. The caller is responsible for killing it. /// /// Note that this is in the critical path of a compute cold start. Keep this fast. /// Try to do things concurrently, to hide the latencies. fn start_compute(self: &Arc, pg_handle: &mut Option) -> Result<()> { let compute_state: ComputeState; let start_compute_span; let _this_entered; { let mut state_guard = self.state.lock().unwrap(); // Create a tracing span for the startup operation. // // We could otherwise just annotate the function with #[instrument], but if // we're being configured from a /configure HTTP request, we want the // startup to be considered part of the /configure request. // // Similarly, if a trace ID was passed in env variables, attach it to the span. 
start_compute_span = { // Temporarily enter the parent span, so that the new span becomes its child. if let Some(p) = state_guard.startup_span.take() { let _parent_entered = p.entered(); tracing::info_span!("start_compute") } else if let Some(otel_context) = startup_context_from_env() { use tracing_opentelemetry::OpenTelemetrySpanExt; let span = tracing::info_span!("start_compute"); span.set_parent(otel_context); span } else { tracing::info_span!("start_compute") } }; _this_entered = start_compute_span.enter(); // Hadron: Record postgres start time (used to enforce pg_init_timeout). state_guard.pg_start_time.replace(Utc::now()); state_guard.set_status(ComputeStatus::Init, &self.state_changed); compute_state = state_guard.clone() } let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( "starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}", pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), pspec.spec.operation_uuid.as_deref().unwrap_or("None"), pspec.tenant_id, pspec.timeline_id, pspec.spec.project_id.as_deref().unwrap_or("None"), pspec.spec.branch_id.as_deref().unwrap_or("None"), pspec.spec.endpoint_id.as_deref().unwrap_or("None"), pspec.spec.features, pspec.spec.remote_extensions, ); ////// PRE-STARTUP PHASE: things that need to be finished before we start the Postgres process // Collect all the tasks that must finish here let mut pre_tasks = tokio::task::JoinSet::new(); // Make sure TLS certificates are properly loaded and in the right place. 
if self.compute_ctl_config.tls.is_some() { let this = self.clone(); pre_tasks.spawn(async move { this.watch_cert_for_changes().await; Ok::<(), anyhow::Error>(()) }); } let tls_config = self.tls_config(&pspec.spec); // If there are any remote extensions in shared_preload_libraries, start downloading them if pspec.spec.remote_extensions.is_some() { let (this, spec) = (self.clone(), pspec.spec.clone()); pre_tasks.spawn(async move { this.download_preload_extensions(&spec) .in_current_span() .await }); } // Prepare pgdata directory. This downloads the basebackup, among other things. { let (this, cs) = (self.clone(), compute_state.clone()); pre_tasks.spawn_blocking_child(move || this.prepare_pgdata(&cs)); } // Resize swap to the desired size if the compute spec says so if let (Some(size_bytes), true) = (pspec.spec.swap_size_bytes, self.params.resize_swap_on_bind) { pre_tasks.spawn_blocking_child(move || { // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion // *before* starting postgres. // // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets // OOM-killed during startup because swap wasn't available yet. resize_swap(size_bytes).context("failed to resize swap")?; let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. info!(%size_bytes, %size_mib, "resized swap"); Ok::<(), anyhow::Error>(()) }); } // Set disk quota if the compute spec says so if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) = ( pspec.spec.disk_quota_bytes, self.params.set_disk_quota_for_fs.as_ref(), ) { let disk_quota_fs_mountpoint = disk_quota_fs_mountpoint.clone(); pre_tasks.spawn_blocking_child(move || { set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) .context("failed to set disk quota")?; let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. 
info!(%disk_quota_bytes, %size_mib, "set disk quota"); Ok::<(), anyhow::Error>(()) }); } // tune pgbouncer if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings { info!("tuning pgbouncer"); let pgbouncer_settings = pgbouncer_settings.clone(); let tls_config = tls_config.clone(); // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. let _handle = tokio::spawn(async move { let res = tune_pgbouncer(pgbouncer_settings, tls_config).await; if let Err(err) = res { error!("error while tuning pgbouncer: {err:?}"); // Continue with the startup anyway } }); } // configure local_proxy if let Some(local_proxy) = &pspec.spec.local_proxy_config { info!("configuring local_proxy"); // Spawn a background task to do the configuration, // so that we don't block the main thread that starts Postgres. let mut local_proxy = local_proxy.clone(); local_proxy.tls = tls_config.clone(); let _handle = tokio::spawn(async move { if let Err(err) = local_proxy::configure(&local_proxy) { error!("error while configuring local_proxy: {err:?}"); // Continue with the startup anyway } }); } // Configure and start rsyslog for compliance audit logging match pspec.spec.audit_log_level { ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => { let remote_tls_endpoint = std::env::var("AUDIT_LOGGING_TLS_ENDPOINT").unwrap_or("".to_string()); let remote_plain_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); if remote_plain_endpoint.is_empty() && remote_tls_endpoint.is_empty() { anyhow::bail!( "AUDIT_LOGGING_ENDPOINT and AUDIT_LOGGING_TLS_ENDPOINT are both empty" ); } let log_directory_path = Path::new(&self.params.pgdata).join("log"); let log_directory_path = log_directory_path.to_string_lossy().to_string(); // Add project_id,endpoint_id to identify the logs. 
// // These ids are passed from cplane, let endpoint_id = pspec.spec.endpoint_id.as_deref().unwrap_or(""); let project_id = pspec.spec.project_id.as_deref().unwrap_or(""); configure_audit_rsyslog( log_directory_path.clone(), endpoint_id, project_id, &remote_plain_endpoint, &remote_tls_endpoint, )?; // Launch a background task to clean up the audit logs launch_pgaudit_gc(log_directory_path); } _ => {} } // Configure and start rsyslog for Postgres logs export let conf = PostgresLogsRsyslogConfig::new(pspec.spec.logs_export_host.as_deref()); configure_postgres_logs_export(conf)?; // Launch remaining service threads let _monitor_handle = launch_monitor(self); let _configurator_handle = launch_configurator(self); // Wait for all the pre-tasks to finish before starting postgres let rt = tokio::runtime::Handle::current(); while let Some(res) = rt.block_on(pre_tasks.join_next()) { res??; } ////// START POSTGRES let start_time = Utc::now(); let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; let postmaster_pid = pg_process.pid(); *pg_handle = Some(pg_process); // If this is a primary endpoint, perform some post-startup configuration before // opening it up for the world. let config_time = Utc::now(); if pspec.spec.mode == ComputeMode::Primary { self.configure_as_primary(&compute_state)?; let conf = self.get_tokio_conn_conf(None); tokio::task::spawn(async { let _ = installed_extensions(conf).await; }); } // All done! 
let startup_end_time = Utc::now(); let metrics = { let mut state = self.state.lock().unwrap(); state.metrics.start_postgres_ms = config_time .signed_duration_since(start_time) .to_std() .unwrap() .as_millis() as u64; state.metrics.config_ms = startup_end_time .signed_duration_since(config_time) .to_std() .unwrap() .as_millis() as u64; state.metrics.total_startup_ms = startup_end_time .signed_duration_since(compute_state.start_time) .to_std() .unwrap() .as_millis() as u64; state.metrics.clone() }; self.set_status(ComputeStatus::Running); // Log metrics so that we can search for slow operations in logs info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished"); self.spawn_extension_stats_task(); if pspec.spec.autoprewarm { info!("autoprewarming on startup as requested"); self.prewarm_lfc(None); } if let Some(seconds) = pspec.spec.offload_lfc_interval_seconds { self.spawn_lfc_offload_task(Duration::from_secs(seconds.into())); }; Ok(()) } #[instrument(skip_all)] async fn download_preload_extensions(&self, spec: &ComputeSpec) -> Result<()> { let remote_extensions = if let Some(remote_extensions) = &spec.remote_extensions { remote_extensions } else { return Ok(()); }; // First, create control files for all available extensions extension_server::create_control_files(remote_extensions, &self.params.pgbin); let library_load_start_time = Utc::now(); let remote_ext_metrics = self.prepare_preload_libraries(spec).await?; let library_load_time = Utc::now() .signed_duration_since(library_load_start_time) .to_std() .unwrap() .as_millis() as u64; let mut state = self.state.lock().unwrap(); state.metrics.load_ext_ms = library_load_time; state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded; state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size; state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size; info!( "Loading shared_preload_libraries took {:?}ms", library_load_time ); info!("{:?}", 
remote_ext_metrics); Ok(()) } /// Start the vm-monitor if directed to. The vm-monitor only runs on linux /// because it requires cgroups. fn start_vm_monitor(&self, disable_lfc_resizing: bool) -> StartVmMonitorResult { cfg_if::cfg_if! { if #[cfg(target_os = "linux")] { use std::env; use tokio_util::sync::CancellationToken; // This token is used internally by the monitor to clean up all threads let token = CancellationToken::new(); // don't pass postgres connection string to vm-monitor if we don't want it to resize LFC let pgconnstr = if disable_lfc_resizing { None } else { Some(self.params.filecache_connstr.clone()) }; let vm_monitor = if env::var_os("AUTOSCALING").is_some() { let vm_monitor = tokio::spawn(vm_monitor::start( Box::leak(Box::new(vm_monitor::Args { cgroup: Some(self.params.cgroup.clone()), pgconnstr, addr: self.params.vm_monitor_addr.clone(), })), token.clone(), )); Some(vm_monitor) } else { None }; StartVmMonitorResult { token, vm_monitor } } else { _ = disable_lfc_resizing; // appease unused lint on macOS StartVmMonitorResult { } } } } fn cleanup_after_postgres_exit(&self) -> Result { // Maybe sync safekeepers again, to speed up next startup let compute_state = self.state.lock().unwrap().clone(); let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let lsn = if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) { info!("syncing safekeepers on shutdown"); let storage_auth_token = pspec.storage_auth_token.clone(); let lsn = self.sync_safekeepers(storage_auth_token)?; info!(%lsn, "synced safekeepers"); Some(lsn) } else { info!("not primary, not syncing safekeepers"); None }; let mut state = self.state.lock().unwrap(); state.terminate_flush_lsn = lsn; let delay_exit = state.status == ComputeStatus::TerminationPendingFast; if state.status == ComputeStatus::TerminationPendingFast || state.status == ComputeStatus::TerminationPendingImmediate { info!( "Changing compute status from {} to {}", state.status, 
ComputeStatus::Terminated ); state.status = ComputeStatus::Terminated; self.state_changed.notify_all(); } drop(state); if let Err(err) = self.check_for_core_dumps() { error!("error while checking for core dumps: {err:?}"); } Ok(delay_exit) } /// Check that compute node has corresponding feature enabled. pub fn has_feature(&self, feature: ComputeFeature) -> bool { let state = self.state.lock().unwrap(); if let Some(s) = state.pspec.as_ref() { s.spec.features.contains(&feature) } else { false } } pub fn set_status(&self, status: ComputeStatus) { let mut state = self.state.lock().unwrap(); state.set_status(status, &self.state_changed); } pub fn set_failed_status(&self, err: anyhow::Error) { let mut state = self.state.lock().unwrap(); state.set_failed_status(err, &self.state_changed); } pub fn get_status(&self) -> ComputeStatus { self.state.lock().unwrap().status } pub fn get_timeline_id(&self) -> Option { self.state .lock() .unwrap() .pspec .as_ref() .map(|s| s.timeline_id) } // Remove `pgdata` directory and create it again with right permissions. fn create_pgdata(&self) -> Result<()> { // Ignore removal error, likely it is a 'No such file or directory (os error 2)'. // If it is something different then create_dir() will error out anyway. let pgdata = &self.params.pgdata; let _ok = fs::remove_dir_all(pgdata); if self.params.lakebase_mode { // Ignore creation errors if the directory already exists (e.g. mounting it ahead of time). // If it is something different then PG startup will error out anyway. let _ok = fs::create_dir(pgdata); } else { fs::create_dir(pgdata)?; } fs::set_permissions(pgdata, fs::Permissions::from_mode(0o700))?; Ok(()) } /// Fetches a basebackup from the Pageserver using the compute state's Pageserver connstring and /// unarchives it to `pgdata` directory, replacing any existing contents. 
#[instrument(skip_all, fields(%lsn))] fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let spec = compute_state.pspec.as_ref().expect("spec must be set"); let started = Instant::now(); let (connected, size) = match spec.pageserver_conninfo.prefer_protocol { PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?, PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?, }; self.fix_zenith_signal_neon_signal()?; let mut state = self.state.lock().unwrap(); state.metrics.pageserver_connect_micros = connected.duration_since(started).as_micros() as u64; state.metrics.basebackup_bytes = size as u64; state.metrics.basebackup_ms = started.elapsed().as_millis() as u64; Ok(()) } /// Move the Zenith signal file to Neon signal file location. /// This makes Compute compatible with older PageServers that don't yet /// know about the Zenith->Neon rename. fn fix_zenith_signal_neon_signal(&self) -> Result<()> { let datadir = Path::new(&self.params.pgdata); let neonsig = datadir.join("neon.signal"); if neonsig.is_file() { return Ok(()); } let zenithsig = datadir.join("zenith.signal"); if zenithsig.is_file() { fs::copy(zenithsig, neonsig)?; } Ok(()) } /// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when /// the connection was established, and the (compressed) size of the basebackup. fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> { let shard0_index = ShardIndex { shard_number: ShardNumber(0), shard_count: spec.pageserver_conninfo.shard_count, }; let shard0_url = spec .pageserver_conninfo .shard_url(ShardNumber(0), PageserverProtocol::Grpc)? 
.to_owned(); let (reader, connected) = tokio::runtime::Handle::current().block_on(async move { let mut client = page_api::Client::connect( shard0_url, spec.tenant_id, spec.timeline_id, shard0_index, spec.storage_auth_token.clone(), None, // NB: base backups use payload compression ) .await?; let connected = Instant::now(); let reader = client .get_base_backup(page_api::GetBaseBackupRequest { lsn: (lsn != Lsn(0)).then_some(lsn), compression: BaseBackupCompression::Gzip, replica: spec.spec.mode != ComputeMode::Primary, full: false, }) .await?; anyhow::Ok((reader, connected)) })?; let mut reader = MeasuredReader::new(tokio_util::io::SyncIoBridge::new(reader)); // Set `ignore_zeros` so that unpack() reads the entire stream and doesn't just stop at the // end-of-archive marker. If the server errors, the tar::Builder drop handler will write an // end-of-archive marker before the error is emitted, and we would not see the error. let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut reader)); ar.set_ignore_zeros(true); ar.unpack(&self.params.pgdata)?; Ok((connected, reader.get_byte_count())) } /// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp /// when the connection was established, and the (compressed) size of the basebackup. fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> { let shard0_connstr = spec .pageserver_conninfo .shard_url(ShardNumber(0), PageserverProtocol::Libpq)?; let mut config = postgres::Config::from_str(shard0_connstr)?; // Use the storage auth token from the config file, if given. // Note: this overrides any password set in the connection string. 
if let Some(storage_auth_token) = &spec.storage_auth_token { info!("Got storage auth token from spec file"); config.password(storage_auth_token); } else { info!("Storage auth token not set"); } config.application_name("compute_ctl"); config.options(&format!( "-c neon.compute_mode={}", spec.spec.mode.to_type_str() )); // Connect to pageserver let mut client = config.connect(NoTls)?; let connected = Instant::now(); let basebackup_cmd = match lsn { Lsn(0) => { if spec.spec.mode != ComputeMode::Primary { format!( "basebackup {} {} --gzip --replica", spec.tenant_id, spec.timeline_id ) } else { format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id) } } _ => { if spec.spec.mode != ComputeMode::Primary { format!( "basebackup {} {} {} --gzip --replica", spec.tenant_id, spec.timeline_id, lsn ) } else { format!( "basebackup {} {} {} --gzip", spec.tenant_id, spec.timeline_id, lsn ) } } }; let copyreader = client.copy_out(basebackup_cmd.as_str())?; let mut measured_reader = MeasuredReader::new(copyreader); let mut bufreader = std::io::BufReader::new(&mut measured_reader); // Read the archive directly from the `CopyOutReader` // // Set `ignore_zeros` so that unpack() reads all the Copy data and // doesn't stop at the end-of-archive marker. Otherwise, if the server // sends an Error after finishing the tarball, we will not notice it. // The tar::Builder drop handler will write an end-of-archive marker // before emitting the error, and we would not see it otherwise. 
let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader)); ar.set_ignore_zeros(true); ar.unpack(&self.params.pgdata)?; Ok((connected, measured_reader.get_byte_count())) } // Gets the basebackup in a retry loop #[instrument(skip_all, fields(%lsn))] pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let mut retry_period_ms = 500.0; let mut attempts = 0; const DEFAULT_ATTEMPTS: u16 = 10; #[cfg(feature = "testing")] let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") { u16::from_str(&v).unwrap() } else { DEFAULT_ATTEMPTS }; #[cfg(not(feature = "testing"))] let max_attempts = DEFAULT_ATTEMPTS; loop { let result = self.try_get_basebackup(compute_state, lsn); match result { Ok(_) => { return result; } Err(ref e) if attempts < max_attempts => { warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})"); std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64)); retry_period_ms *= 1.5; } Err(_) => { return result; } } attempts += 1; } } pub async fn check_safekeepers_synced_async( &self, compute_state: &ComputeState, ) -> Result> { // Construct a connection config for each safekeeper let pspec: ParsedSpec = compute_state .pspec .as_ref() .expect("spec must be set") .clone(); let sk_connstrs: Vec = pspec.safekeeper_connstrings.clone(); let sk_configs = sk_connstrs.into_iter().map(|connstr| { // Format connstr let id = connstr.clone(); let connstr = format!("postgresql://no_user@{connstr}"); let options = format!( "-c timeline_id={} tenant_id={}", pspec.timeline_id, pspec.tenant_id ); // Construct client let mut config = tokio_postgres::Config::from_str(&connstr).unwrap(); config.options(&options); if let Some(storage_auth_token) = pspec.storage_auth_token.clone() { config.password(storage_auth_token); } (id, config) }); // Create task set to query all safekeepers let mut tasks = FuturesUnordered::new(); let quorum = sk_configs.len() / 2 + 1; for (id, 
config) in sk_configs { let timeout = tokio::time::Duration::from_millis(100); let task = tokio::time::timeout(timeout, ping_safekeeper(id, config)); tasks.push(tokio::spawn(task)); } // Get a quorum of responses or errors let mut responses = Vec::new(); let mut join_errors = Vec::new(); let mut task_errors = Vec::new(); let mut timeout_errors = Vec::new(); while let Some(response) = tasks.next().await { match response { Ok(Ok(Ok(r))) => responses.push(r), Ok(Ok(Err(e))) => task_errors.push(e), Ok(Err(e)) => timeout_errors.push(e), Err(e) => join_errors.push(e), }; if responses.len() >= quorum { break; } if join_errors.len() + task_errors.len() + timeout_errors.len() >= quorum { break; } } // In case of error, log and fail the check, but don't crash. // We're playing it safe because these errors could be transient // and we don't yet retry. if responses.len() < quorum { error!( "failed sync safekeepers check {:?} {:?} {:?}", join_errors, task_errors, timeout_errors ); return Ok(None); } Ok(check_if_synced(responses)) } // Fast path for sync_safekeepers. If they're already synced we get the lsn // in one roundtrip. If not, we should do a full sync_safekeepers. #[instrument(skip_all)] pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result> { let start_time = Utc::now(); let rt = tokio::runtime::Handle::current(); let result = rt.block_on(self.check_safekeepers_synced_async(compute_state)); // Record runtime self.state.lock().unwrap().metrics.sync_sk_check_ms = Utc::now() .signed_duration_since(start_time) .to_std() .unwrap() .as_millis() as u64; result } // Run `postgres` in a special mode with `--sync-safekeepers` argument // and return the reported LSN back to the caller. 
#[instrument(skip_all)] pub fn sync_safekeepers(&self, storage_auth_token: Option) -> Result { let start_time = Utc::now(); let mut sync_handle = maybe_cgexec(&self.params.pgbin) .args(["--sync-safekeepers"]) .env("PGDATA", &self.params.pgdata) // we cannot use -D in this mode .envs(if let Some(storage_auth_token) = &storage_auth_token { vec![("NEON_AUTH_TOKEN", storage_auth_token)] } else { vec![] }) .stdout(Stdio::piped()) .stderr(Stdio::piped()) .spawn() .expect("postgres --sync-safekeepers failed to start"); SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst); // `postgres --sync-safekeepers` will print all log output to stderr and // final LSN to stdout. So we leave stdout to collect LSN, while stderr logs // will be collected in a child thread. let stderr = sync_handle .stderr .take() .expect("stderr should be captured"); let logs_handle = handle_postgres_logs(stderr); let sync_output = sync_handle .wait_with_output() .expect("postgres --sync-safekeepers failed"); SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst); // Process has exited, so we can join the logs thread. let _ = tokio::runtime::Handle::current() .block_on(logs_handle) .map_err(|e| tracing::error!("log task panicked: {:?}", e)); if !sync_output.status.success() { anyhow::bail!( "postgres --sync-safekeepers exited with non-zero status: {}. 
stdout: {}", sync_output.status, String::from_utf8(sync_output.stdout) .expect("postgres --sync-safekeepers exited, and stdout is not utf-8"), ); } self.state.lock().unwrap().metrics.sync_safekeepers_ms = Utc::now() .signed_duration_since(start_time) .to_std() .unwrap() .as_millis() as u64; let lsn = Lsn::from_str(String::from_utf8(sync_output.stdout)?.trim())?; Ok(lsn) } fn sync_safekeepers_with_retries(&self, storage_auth_token: Option) -> Result { let max_retries = 5; let mut attempts = 0; loop { let result = self.sync_safekeepers(storage_auth_token.clone()); match &result { Ok(_) => { if attempts > 0 { tracing::info!("sync_safekeepers succeeded after {attempts} retries"); } return result; } Err(e) if attempts < max_retries => { tracing::info!( "sync_safekeepers failed, will retry (attempt {attempts}): {e:#}" ); } Err(err) => { tracing::warn!( "sync_safekeepers still failed after {attempts} retries, giving up: {err:?}" ); return result; } } // sleep and retry let backoff = exponential_backoff_duration( attempts, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, ); std::thread::sleep(backoff); attempts += 1; } } /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. #[instrument(skip_all)] pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let spec = &pspec.spec; let pgdata_path = Path::new(&self.params.pgdata); let tls_config = self.tls_config(&pspec.spec); let databricks_settings = spec.databricks_settings.as_ref(); let postgres_port = self.params.connstr.port(); // Remove/create an empty pgdata directory and put configuration there. 
self.create_pgdata()?; config::write_postgres_conf( pgdata_path, &self.params, &pspec.spec, postgres_port, self.params.internal_http_port, tls_config, databricks_settings, self.params.lakebase_mode, )?; // Syncing safekeepers is only safe with primary nodes: if a primary // is already connected it will be kicked out, so a secondary (standby) // cannot sync safekeepers. let lsn = match spec.mode { ComputeMode::Primary => { info!("checking if safekeepers are synced"); let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) { lsn } else { info!("starting safekeepers syncing"); self.sync_safekeepers_with_retries(pspec.storage_auth_token.clone()) .with_context(|| "failed to sync safekeepers")? }; info!("safekeepers synced at LSN {}", lsn); lsn } ComputeMode::Static(lsn) => { info!("Starting read-only node at static LSN {}", lsn); lsn } ComputeMode::Replica => { info!("Initializing standby from latest Pageserver LSN"); Lsn(0) } }; self.get_basebackup(compute_state, lsn) .with_context(|| format!("failed to get basebackup@{lsn}"))?; if let Some(settings) = databricks_settings { copy_tls_certificates( &settings.pg_compute_tls_settings.key_file, &settings.pg_compute_tls_settings.cert_file, pgdata_path, )?; // Update pg_hba.conf received with basebackup including additional databricks settings. update_pg_hba(pgdata_path, Some(&settings.databricks_pg_hba))?; update_pg_ident(pgdata_path, Some(&settings.databricks_pg_ident))?; } else { // Update pg_hba.conf received with basebackup. update_pg_hba(pgdata_path, None)?; } if let Some(databricks_settings) = spec.databricks_settings.as_ref() { copy_tls_certificates( &databricks_settings.pg_compute_tls_settings.key_file, &databricks_settings.pg_compute_tls_settings.cert_file, pgdata_path, )?; } // Place pg_dynshmem under /dev/shm. This allows us to use // 'dynamic_shared_memory_type = mmap' so that the files are placed in // /dev/shm, similar to how 'dynamic_shared_memory_type = posix' works. 
// // Why on earth don't we just stick to the 'posix' default, you might // ask. It turns out that making large allocations with 'posix' doesn't // work very well with autoscaling. The behavior we want is that: // // 1. You can make large DSM allocations, larger than the current RAM // size of the VM, without errors // // 2. If the allocated memory is really used, the VM is scaled up // automatically to accommodate that // // We try to make that possible by having swap in the VM. But with the // default 'posix' DSM implementation, we fail step 1, even when there's // plenty of swap available. PostgreSQL uses posix_fallocate() to create // the shmem segment, which is really just a file in /dev/shm in Linux, // but posix_fallocate() on tmpfs returns ENOMEM if the size is larger // than available RAM. // // Using 'dynamic_shared_memory_type = mmap' works around that, because // the Postgres 'mmap' DSM implementation doesn't use // posix_fallocate(). Instead, it uses repeated calls to write(2) to // fill the file with zeros. It's weird that that differs between // 'posix' and 'mmap', but we take advantage of it. When the file is // filled slowly with write(2), the kernel allows it to grow larger, as // long as there's swap available. // // In short, using 'dynamic_shared_memory_type = mmap' allows us one DSM // segment to be larger than currently available RAM. But because we // don't want to store it on a real file, which the kernel would try to // flush to disk, so symlink pg_dynshm to /dev/shm. // // We don't set 'dynamic_shared_memory_type = mmap' here, we let the // control plane control that option. If 'mmap' is not used, this // symlink doesn't affect anything. // // See https://github.com/neondatabase/autoscaling/issues/800 std::fs::remove_dir_all(pgdata_path.join("pg_dynshmem"))?; symlink("/dev/shm/", pgdata_path.join("pg_dynshmem"))?; match spec.mode { ComputeMode::Primary => {} ComputeMode::Replica | ComputeMode::Static(..) 
=> { add_standby_signal(pgdata_path)?; } } Ok(()) } /// Start and stop a postgres process to warm up the VM for startup. pub fn prewarm_postgres_vm_memory(&self) -> Result<()> { if self.params.lakebase_mode { // We are running in Hadron mode. Disabling this prewarming step for now as it could run // into dblet port conflicts and also doesn't add much value with our current infra. info!("Skipping postgres prewarming in Hadron mode"); return Ok(()); } info!("prewarming VM memory"); // Create pgdata let pgdata = &format!("{}.warmup", self.params.pgdata); create_pgdata(pgdata)?; // Run initdb to completion info!("running initdb"); let initdb_bin = Path::new(&self.params.pgbin) .parent() .unwrap() .join("initdb"); Command::new(initdb_bin) .args(["--pgdata", pgdata]) .output() .expect("cannot start initdb process"); // Write conf use std::io::Write; let conf_path = Path::new(pgdata).join("postgresql.conf"); let mut file = std::fs::File::create(conf_path)?; writeln!(file, "shared_buffers=65536")?; writeln!(file, "port=51055")?; // Nobody should be connecting writeln!(file, "shared_preload_libraries = 'neon'")?; // Start postgres info!("starting postgres"); let mut pg = maybe_cgexec(&self.params.pgbin) .args(["-D", pgdata]) .spawn() .expect("cannot start postgres process"); // Stop it when it's ready info!("waiting for postgres"); wait_for_postgres(&mut pg, Path::new(pgdata))?; // SIGQUIT orders postgres to exit immediately. We don't want to SIGKILL // it to avoid orphaned processes prowling around while datadir is // wiped. let pm_pid = Pid::from_raw(pg.id() as i32); kill(pm_pid, Signal::SIGQUIT)?; info!("sent SIGQUIT signal"); pg.wait()?; info!("done prewarming vm memory"); // clean up let _ok = fs::remove_dir_all(pgdata); Ok(()) } /// Start Postgres as a child process and wait for it to start accepting /// connections. /// /// Returns a handle to the child process and a handle to the logs thread. 
#[instrument(skip_all)] pub fn start_postgres(&self, storage_auth_token: Option) -> Result { let pgdata_path = Path::new(&self.params.pgdata); let env_vars: Vec<(String, String)> = if self.params.lakebase_mode { let databricks_env_vars = { let state = self.state.lock().unwrap(); let spec = &state.pspec.as_ref().unwrap().spec; DatabricksEnvVars::new( spec, Some(&self.params.compute_id), self.params.instance_id.clone(), self.params.lakebase_mode, ) }; info!( "Starting Postgres for databricks endpoint id: {}", &databricks_env_vars.endpoint_id ); let mut env_vars = databricks_env_vars.to_env_var_list(); env_vars.extend(storage_auth_token.map(|t| ("NEON_AUTH_TOKEN".to_string(), t))); env_vars } else if let Some(storage_auth_token) = &storage_auth_token { vec![("NEON_AUTH_TOKEN".to_owned(), storage_auth_token.to_owned())] } else { vec![] }; // Run postgres as a child process. let mut pg = maybe_cgexec(&self.params.pgbin) .args(["-D", &self.params.pgdata]) .envs(env_vars) .stderr(Stdio::piped()) .spawn() .expect("cannot start postgres process"); PG_PID.store(pg.id(), Ordering::SeqCst); // Start a task to collect logs from stderr. let stderr = pg.stderr.take().expect("stderr should be captured"); let logs_handle = handle_postgres_logs(stderr); wait_for_postgres(&mut pg, pgdata_path)?; Ok(PostgresHandle { postgres: pg, log_collector: logs_handle, }) } /// Wait for the child Postgres process forever. In this state Ctrl+C will /// propagate to Postgres and it will be shut down as well. fn wait_postgres(&self, mut pg_handle: PostgresHandle) -> std::process::ExitStatus { info!(postmaster_pid = %pg_handle.postgres.id(), "Waiting for Postgres to exit"); let ecode = pg_handle .postgres .wait() .expect("failed to start waiting on Postgres process"); PG_PID.store(0, Ordering::SeqCst); // Process has exited. Wait for the log collecting task to finish. 
let _ = tokio::runtime::Handle::current() .block_on(pg_handle.log_collector) .map_err(|e| tracing::error!("log task panicked: {:?}", e)); ecode } /// Do post configuration of the already started Postgres. This function spawns a background task to /// configure the database after applying the compute spec. Currently, it upgrades the neon extension /// version. In the future, it may upgrade all 3rd-party extensions. #[instrument(skip_all)] pub fn post_apply_config(&self) -> Result<()> { let conf = self.get_tokio_conn_conf(Some("compute_ctl:post_apply_config")); tokio::spawn(async move { let res = async { let (mut client, connection) = conf.connect(NoTls).await?; tokio::spawn(async move { if let Err(e) = connection.await { eprintln!("connection error: {e}"); } }); handle_neon_extension_upgrade(&mut client) .await .context("handle_neon_extension_upgrade")?; Ok::<_, anyhow::Error>(()) } .await; if let Err(err) = res { error!("error while post_apply_config: {err:#}"); } }); Ok(()) } pub fn get_conn_conf(&self, application_name: Option<&str>) -> postgres::Config { let mut conf = self.conn_conf.clone(); if let Some(application_name) = application_name { conf.application_name(application_name); } conf } pub fn get_tokio_conn_conf(&self, application_name: Option<&str>) -> tokio_postgres::Config { let mut conf = self.tokio_conn_conf.clone(); if let Some(application_name) = application_name { conf.application_name(application_name); } conf } pub async fn get_maintenance_client( conf: &tokio_postgres::Config, ) -> Result { let mut conf = conf.clone(); conf.application_name("compute_ctl:apply_config"); let (client, conn) = match conf.connect(NoTls).await { // If connection fails, it may be the old node with `zenith_admin` superuser. // // In this case we need to connect with old `zenith_admin` name // and create new user. We cannot simply rename connected user, // but we can create a new one and grant it all privileges. 
Err(e) => match e.code() { Some(&SqlState::INVALID_PASSWORD) | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => { // Connect with `zenith_admin` if `cloud_admin` could not authenticate info!( "cannot connect to Postgres: {}, retrying with 'zenith_admin' username", e ); let mut zenith_admin_conf = postgres::config::Config::from(conf.clone()); zenith_admin_conf.application_name("compute_ctl:apply_config"); zenith_admin_conf.user("zenith_admin"); // It doesn't matter what were the options before, here we just want // to connect and create a new superuser role. const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0"; zenith_admin_conf.options(ZENITH_OPTIONS); let mut client = zenith_admin_conf.connect(NoTls) .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; // Disable forwarding so that users don't get a cloud_admin role let mut func = || { client.simple_query("SET neon.forward_ddl = false")?; client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; client.simple_query("GRANT zenith_admin TO cloud_admin")?; Ok::<_, anyhow::Error>(()) }; func().context("apply_config setup cloud_admin")?; drop(client); // Reconnect with connstring with expected name conf.connect(NoTls).await? } _ => return Err(e.into()), }, Ok((client, conn)) => (client, conn), }; spawn(async move { if let Err(e) = conn.await { error!("maintenance client connection error: {}", e); } }); // Disable DDL forwarding because control plane already knows about the roles/databases // we're about to modify. client .simple_query("SET neon.forward_ddl = false") .await .context("apply_config SET neon.forward_ddl = false")?; Ok(client) } /// Do initial configuration of the already started Postgres. 
#[instrument(skip_all)] pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:apply_config")); if self.params.lakebase_mode { // Set a 2-minute statement_timeout for the session applying config. The individual SQL statements // used in apply_spec_sql() should not take long (they are just creating users and installing // extensions). If any of them are stuck for an extended period of time it usually indicates a // pageserver connectivity problem and we should bail out. conf.options("-c statement_timeout=2min"); } let conf = Arc::new(conf); let spec = Arc::new( compute_state .pspec .as_ref() .expect("spec must be set") .spec .clone(), ); let mut tls_config = None::; if spec.features.contains(&ComputeFeature::TlsExperimental) { tls_config = self.compute_ctl_config.tls.clone(); } self.update_installed_extensions_collection_interval(&spec); let max_concurrent_connections = self.max_service_connections(compute_state, &spec); // Merge-apply spec & changes to PostgreSQL state. 
self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?; if let Some(local_proxy) = &spec.clone().local_proxy_config { let mut local_proxy = local_proxy.clone(); local_proxy.tls = tls_config.clone(); info!("configuring local_proxy"); local_proxy::configure(&local_proxy).context("apply_config local_proxy")?; } // Run migrations separately to not hold up cold starts let lakebase_mode = self.params.lakebase_mode; let params = self.params.clone(); tokio::spawn(async move { let mut conf = conf.as_ref().clone(); conf.application_name("compute_ctl:migrations"); match conf.connect(NoTls).await { Ok((mut client, connection)) => { tokio::spawn(async move { if let Err(e) = connection.await { eprintln!("connection error: {e}"); } }); if let Err(e) = handle_migrations(params, &mut client, lakebase_mode).await { error!("Failed to run migrations: {}", e); } } Err(e) => { error!( "Failed to connect to the compute for running migrations: {}", e ); } }; }); Ok::<(), anyhow::Error>(()) } // Signal to the configurator to refresh the configuration by pulling a new spec from the HCC. // Note that this merely triggers a notification on a condition variable the configurator thread // waits on. The configurator thread (in configurator.rs) pulls the new spec from the HCC and // applies it. pub async fn signal_refresh_configuration(&self) -> Result<()> { let states_allowing_configuration_refresh = [ ComputeStatus::Running, ComputeStatus::Failed, ComputeStatus::RefreshConfigurationPending, ]; let mut state = self.state.lock().expect("state lock poisoned"); if states_allowing_configuration_refresh.contains(&state.status) { state.status = ComputeStatus::RefreshConfigurationPending; self.state_changed.notify_all(); Ok(()) } else if state.status == ComputeStatus::Init { // If the compute is in Init state, we can't refresh the configuration immediately, // but we should be able to do that soon. 
Ok(()) } else { Err(anyhow::anyhow!( "Cannot refresh compute configuration in state {:?}", state.status )) } } // Wrapped this around `pg_ctl reload`, but right now we don't use // `pg_ctl` for start / stop. #[instrument(skip_all)] fn pg_reload_conf(&self) -> Result<()> { let pgctl_bin = Path::new(&self.params.pgbin) .parent() .unwrap() .join("pg_ctl"); Command::new(pgctl_bin) .args(["reload", "-D", &self.params.pgdata]) .output() .expect("cannot run pg_ctl process"); Ok(()) } /// Similar to `apply_config()`, but does a bit different sequence of operations, /// as it's used to reconfigure a previously started and configured Postgres node. #[instrument(skip_all)] pub fn reconfigure(&self) -> Result<()> { let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec; let tls_config = self.tls_config(&spec); self.update_installed_extensions_collection_interval(&spec); if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings { info!("tuning pgbouncer"); let pgbouncer_settings = pgbouncer_settings.clone(); let tls_config = tls_config.clone(); // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. tokio::spawn(async move { let res = tune_pgbouncer(pgbouncer_settings, tls_config).await; if let Err(err) = res { error!("error while tuning pgbouncer: {err:?}"); } }); } if let Some(ref local_proxy) = spec.local_proxy_config { info!("configuring local_proxy"); // Spawn a background task to do the configuration, // so that we don't block the main thread that starts Postgres. 
let mut local_proxy = local_proxy.clone(); local_proxy.tls = tls_config.clone(); tokio::spawn(async move { if let Err(err) = local_proxy::configure(&local_proxy) { error!("error while configuring local_proxy: {err:?}"); } }); } // Reconfigure rsyslog for Postgres logs export let conf = PostgresLogsRsyslogConfig::new(spec.logs_export_host.as_deref()); configure_postgres_logs_export(conf)?; // Write new config let pgdata_path = Path::new(&self.params.pgdata); let postgres_port = self.params.connstr.port(); config::write_postgres_conf( pgdata_path, &self.params, &spec, postgres_port, self.params.internal_http_port, tls_config, spec.databricks_settings.as_ref(), self.params.lakebase_mode, )?; self.pg_reload_conf()?; if !spec.skip_pg_catalog_updates { let max_concurrent_connections = spec.reconfigure_concurrency; // Temporarily reset max_cluster_size in config // to avoid the possibility of hitting the limit, while we are reconfiguring: // creating new extensions, roles, etc. config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { self.pg_reload_conf()?; if spec.mode == ComputeMode::Primary { let conf = self.get_tokio_conn_conf(Some("compute_ctl:reconfigure")); let conf = Arc::new(conf); let spec = Arc::new(spec.clone()); self.apply_spec_sql(spec, conf, max_concurrent_connections)?; } Ok(()) })?; self.pg_reload_conf()?; } let unknown_op = "unknown".to_string(); let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op); info!( "finished reconfiguration of compute node for operation {}", op_id ); Ok(()) } #[instrument(skip_all)] pub fn configure_as_primary(&self, compute_state: &ComputeState) -> Result<()> { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); assert!(pspec.spec.mode == ComputeMode::Primary); if !pspec.spec.skip_pg_catalog_updates { let pgdata_path = Path::new(&self.params.pgdata); // temporarily reset max_cluster_size in config // to avoid the possibility of hitting the limit, while we are applying 
config: // creating new extensions, roles, etc... config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { self.pg_reload_conf()?; self.apply_config(compute_state)?; Ok(()) })?; let postgresql_conf_path = pgdata_path.join("postgresql.conf"); if config::line_in_file( &postgresql_conf_path, "neon.disable_logical_replication_subscribers=false", )? { info!( "updated postgresql.conf to set neon.disable_logical_replication_subscribers=false" ); } self.pg_reload_conf()?; } self.post_apply_config()?; Ok(()) } pub async fn watch_cert_for_changes(self: Arc) { // update status on cert renewal if let Some(tls_config) = &self.compute_ctl_config.tls { let tls_config = tls_config.clone(); // wait until the cert exists. let mut cert_watch = watch_cert_for_changes(tls_config.cert_path.clone()).await; tokio::task::spawn_blocking(move || { let handle = tokio::runtime::Handle::current(); 'cert_update: loop { // let postgres/pgbouncer/local_proxy know the new cert/key exists. // we need to wait until it's configurable first. 
let mut state = self.state.lock().unwrap(); 'status_update: loop { match state.status { // let's update the state to config pending ComputeStatus::ConfigurationPending | ComputeStatus::Running => { state.set_status( ComputeStatus::ConfigurationPending, &self.state_changed, ); break 'status_update; } // exit loop ComputeStatus::Failed | ComputeStatus::TerminationPendingFast | ComputeStatus::TerminationPendingImmediate | ComputeStatus::Terminated => break 'cert_update, // wait ComputeStatus::Init | ComputeStatus::Configuration | ComputeStatus::RefreshConfiguration | ComputeStatus::RefreshConfigurationPending | ComputeStatus::Empty => { state = self.state_changed.wait(state).unwrap(); } } } drop(state); // wait for a new certificate update if handle.block_on(cert_watch.changed()).is_err() { break; } } }); } } pub fn tls_config(&self, spec: &ComputeSpec) -> &Option { if spec.features.contains(&ComputeFeature::TlsExperimental) { &self.compute_ctl_config.tls } else { &None:: } } /// Update the `last_active` in the shared state, but ensure that it's a more recent one. pub fn update_last_active(&self, last_active: Option>) { let mut state = self.state.lock().unwrap(); // NB: `Some()` is always greater than `None`. if last_active > state.last_active { state.last_active = last_active; debug!("set the last compute activity time to: {:?}", last_active); } } // Look for core dumps and collect backtraces. // // EKS worker nodes have following core dump settings: // /proc/sys/kernel/core_pattern -> core // /proc/sys/kernel/core_uses_pid -> 1 // ulimit -c -> unlimited // which results in core dumps being written to postgres data directory as core.. // // Use that as a default location and pattern, except macos where core dumps are written // to /cores/ directory by default. // // With default Linux settings, the core dump file is called just "core", so check for // that too. 
pub fn check_for_core_dumps(&self) -> Result<()> { let core_dump_dir = match std::env::consts::OS { "macos" => Path::new("/cores/"), // BEGIN HADRON // NB: Read core dump files from a fixed location outside of // the data directory since `compute_ctl` wipes the data directory // across container restarts. _ => { if self.params.lakebase_mode { Path::new("/databricks/logs/brickstore") } else { Path::new(&self.params.pgdata) } } // END HADRON }; // Collect core dump paths if any info!("checking for core dumps in {}", core_dump_dir.display()); let files = fs::read_dir(core_dump_dir)?; let cores = files.filter_map(|entry| { let entry = entry.ok()?; let is_core_dump = match entry.file_name().to_str()? { n if n.starts_with("core.") => true, "core" => true, _ => false, }; if is_core_dump { Some(entry.path()) } else { None } }); // Print backtrace for each core dump for core_path in cores { warn!( "core dump found: {}, collecting backtrace", core_path.display() ); // Try first with gdb let backtrace = Command::new("gdb") .args(["--batch", "-q", "-ex", "bt", &self.params.pgbin]) .arg(&core_path) .output(); // Try lldb if no gdb is found -- that is handy for local testing on macOS let backtrace = match backtrace { Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => { warn!("cannot find gdb, trying lldb"); Command::new("lldb") .arg("-c") .arg(&core_path) .args(["--batch", "-o", "bt all", "-o", "quit"]) .output() } _ => backtrace, }?; warn!( "core dump backtrace: {}", String::from_utf8_lossy(&backtrace.stdout) ); warn!( "debugger stderr: {}", String::from_utf8_lossy(&backtrace.stderr) ); } Ok(()) } /// Select `pg_stat_statements` data and return it as a stringified JSON pub async fn collect_insights(&self) -> String { let mut result_rows: Vec = Vec::new(); let conf = self.get_tokio_conn_conf(Some("compute_ctl:collect_insights")); let connect_result = conf.connect(NoTls).await; let (client, connection) = connect_result.unwrap(); tokio::spawn(async move { if let Err(e) = 
connection.await { eprintln!("connection error: {e}"); } }); let result = client .simple_query( "SELECT pg_catalog.row_to_json(pss) FROM public.pg_stat_statements pss WHERE pss.userid != 'cloud_admin'::pg_catalog.regrole::pg_catalog.oid ORDER BY (pss.mean_exec_time + pss.mean_plan_time) DESC LIMIT 100", ) .await; if let Ok(raw_rows) = result { for message in raw_rows.iter() { if let postgres::SimpleQueryMessage::Row(row) = message { if let Some(json) = row.get(0) { result_rows.push(json.to_string()); } } } format!("{{\"pg_stat_statements\": [{}]}}", result_rows.join(",")) } else { "{{\"pg_stat_statements\": []}}".to_string() } } // download an archive, unzip and place files in correct locations pub async fn download_extension( &self, real_ext_name: String, ext_path: RemotePath, ) -> Result { let remote_ext_base_url = self.params .remote_ext_base_url .as_ref() .ok_or(DownloadError::BadInput(anyhow::anyhow!( "Remote extensions storage is not configured", )))?; let ext_archive_name = ext_path.object_name().expect("bad path"); let mut first_try = false; if !self .ext_download_progress .read() .expect("lock err") .contains_key(ext_archive_name) { self.ext_download_progress .write() .expect("lock err") .insert(ext_archive_name.to_string(), (Utc::now(), false)); first_try = true; } let (download_start, download_completed) = self.ext_download_progress.read().expect("lock err")[ext_archive_name]; let start_time_delta = Utc::now() .signed_duration_since(download_start) .to_std() .unwrap() .as_millis() as u64; // how long to wait for extension download if it was started by another process const HANG_TIMEOUT: u64 = 3000; // milliseconds if download_completed { info!("extension already downloaded, skipping re-download"); return Ok(0); } else if start_time_delta < HANG_TIMEOUT && !first_try { info!( "download {ext_archive_name} already started by another process, hanging untill completion or timeout" ); let mut interval = 
tokio::time::interval(tokio::time::Duration::from_millis(500)); loop { info!("waiting for download"); interval.tick().await; let (_, download_completed_now) = self.ext_download_progress.read().expect("lock")[ext_archive_name]; if download_completed_now { info!("download finished by whoever else downloaded it"); return Ok(0); } } // NOTE: the above loop will get terminated // based on the timeout of the download function } // if extension hasn't been downloaded before or the previous // attempt to download was at least HANG_TIMEOUT ms ago // then we try to download it here info!("downloading new extension {ext_archive_name}"); let download_size = extension_server::download_extension( &real_ext_name, &ext_path, remote_ext_base_url, &self.params.pgbin, ) .await .map_err(DownloadError::Other); if download_size.is_ok() { self.ext_download_progress .write() .expect("bad lock") .insert(ext_archive_name.to_string(), (download_start, true)); } download_size } pub async fn set_role_grants( &self, db_name: &PgIdent, schema_name: &PgIdent, privileges: &[Privilege], role_name: &PgIdent, ) -> Result<()> { use tokio_postgres::NoTls; let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:set_role_grants")); conf.dbname(db_name); let (db_client, conn) = conf .connect(NoTls) .await .context("Failed to connect to the database")?; tokio::spawn(conn); // TODO: support other types of grants apart from schemas? // check the role grants first - to gracefully handle read-replicas. 
let select = "SELECT privilege_type FROM pg_catalog.pg_namespace JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) AS acl ON true JOIN pg_catalog.pg_user users ON acl.grantee = users.usesysid WHERE users.usename OPERATOR(pg_catalog.=) $1::pg_catalog.name AND nspname OPERATOR(pg_catalog.=) $2::pg_catalog.name"; let rows = db_client .query(select, &[role_name, schema_name]) .await .with_context(|| format!("Failed to execute query: {select}"))?; let already_granted: HashSet = rows.into_iter().map(|row| row.get(0)).collect(); let grants = privileges .iter() .filter(|p| !already_granted.contains(p.as_str())) // should not be quoted as it's part of the command. // is already sanitized so it's ok .map(|p| p.as_str()) .join(", "); if !grants.is_empty() { // quote the schema and role name as identifiers to sanitize them. let schema_name = schema_name.pg_quote(); let role_name = role_name.pg_quote(); let query = format!("GRANT {grants} ON SCHEMA {schema_name} TO {role_name}",); db_client .simple_query(&query) .await .with_context(|| format!("Failed to execute query: {query}"))?; } Ok(()) } pub async fn install_extension( &self, ext_name: &PgIdent, db_name: &PgIdent, ext_version: ExtVersion, ) -> Result { use tokio_postgres::NoTls; let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:install_extension")); conf.dbname(db_name); let (db_client, conn) = conf .connect(NoTls) .await .context("Failed to connect to the database")?; tokio::spawn(conn); let version_query = "SELECT extversion FROM pg_extension WHERE extname = $1"; let version: Option = db_client .query_opt(version_query, &[&ext_name]) .await .with_context(|| format!("Failed to execute query: {version_query}"))? .map(|row| row.get(0)); // sanitize the inputs as postgres idents. 
let ext_name: String = ext_name.pg_quote(); let quoted_version: String = ext_version.pg_quote(); if let Some(installed_version) = version { if installed_version == ext_version { return Ok(installed_version); } let query = format!("ALTER EXTENSION {ext_name} UPDATE TO {quoted_version}"); db_client .simple_query(&query) .await .with_context(|| format!("Failed to execute query: {query}"))?; } else { let query = format!( "CREATE EXTENSION IF NOT EXISTS {ext_name} WITH SCHEMA public VERSION {quoted_version}" ); db_client .simple_query(&query) .await .with_context(|| format!("Failed to execute query: {query}"))?; } Ok(ext_version) } pub async fn prepare_preload_libraries( &self, spec: &ComputeSpec, ) -> Result { if self.params.remote_ext_base_url.is_none() { return Ok(RemoteExtensionMetrics { num_ext_downloaded: 0, largest_ext_size: 0, total_ext_download_size: 0, }); } let remote_extensions = spec .remote_extensions .as_ref() .ok_or(anyhow::anyhow!("Remote extensions are not configured"))?; info!("parse shared_preload_libraries from spec.cluster.settings"); let mut libs_vec = Vec::new(); if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { libs_vec = libs .split(&[',', '\'', ' ']) .filter(|s| *s != "neon" && *s != "databricks_auth" && !s.is_empty()) .map(str::to_string) .collect(); } info!("parse shared_preload_libraries from provided postgresql.conf"); // that is used in neon_local and python tests if let Some(conf) = &spec.cluster.postgresql_conf { let conf_lines = conf.split('\n').collect::>(); let mut shared_preload_libraries_line = ""; for line in conf_lines { if line.starts_with("shared_preload_libraries") { shared_preload_libraries_line = line; } } let mut preload_libs_vec = Vec::new(); if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) { preload_libs_vec = libs .split(&[',', '\'', ' ']) .filter(|s| *s != "neon" && *s != "databricks_auth" && !s.is_empty()) .map(str::to_string) .collect(); } 
libs_vec.extend(preload_libs_vec); } // Don't try to download libraries that are not in the index. // Assume that they are already present locally. libs_vec.retain(|lib| remote_extensions.library_index.contains_key(lib)); info!("Downloading to shared preload libraries: {:?}", &libs_vec); let mut download_tasks = Vec::new(); for library in &libs_vec { let (ext_name, ext_path) = remote_extensions.get_ext(library, true, &BUILD_TAG, &self.params.pgversion)?; download_tasks.push(self.download_extension(ext_name, ext_path)); } let results = join_all(download_tasks).await; let mut remote_ext_metrics = RemoteExtensionMetrics { num_ext_downloaded: 0, largest_ext_size: 0, total_ext_download_size: 0, }; for result in results { let download_size = match result { Ok(res) => { remote_ext_metrics.num_ext_downloaded += 1; res } Err(err) => { // if we failed to download an extension, we don't want to fail the whole // process, but we do want to log the error error!("Failed to download extension: {}", err); 0 } }; remote_ext_metrics.largest_ext_size = std::cmp::max(remote_ext_metrics.largest_ext_size, download_size); remote_ext_metrics.total_ext_download_size += download_size; } Ok(remote_ext_metrics) } /// Waits until current thread receives a state changed notification and /// the pageserver connection strings has changed. /// /// The operation will time out after a specified duration. 
pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) { let state = self.state.lock().unwrap(); let old_pageserver_conninfo = state .pspec .as_ref() .expect("spec must be set") .pageserver_conninfo .clone(); let mut unchanged = true; let _ = self .state_changed .wait_timeout_while(state, duration, |s| { let pageserver_conninfo = &s .pspec .as_ref() .expect("spec must be set") .pageserver_conninfo; unchanged = pageserver_conninfo == &old_pageserver_conninfo; unchanged }) .unwrap(); if !unchanged { info!("Pageserver config changed"); } } pub fn spawn_extension_stats_task(&self) { self.terminate_extension_stats_task(); let conf = self.tokio_conn_conf.clone(); let atomic_interval = self.params.installed_extensions_collection_interval.clone(); let mut installed_extensions_collection_interval = 2 * atomic_interval.load(std::sync::atomic::Ordering::SeqCst); info!( "[NEON_EXT_SPAWN] Spawning background installed extensions worker with Timeout: {}", installed_extensions_collection_interval ); let handle = tokio::spawn(async move { loop { info!( "[NEON_EXT_INT_SLEEP]: Interval: {}", installed_extensions_collection_interval ); // Sleep at the start of the loop to ensure that two collections don't happen at the same time. // The first collection happens during compute startup. 
tokio::time::sleep(tokio::time::Duration::from_secs( installed_extensions_collection_interval, )) .await; let _ = installed_extensions(conf.clone()).await; // Acquire a read lock on the compute spec and then update the interval if necessary installed_extensions_collection_interval = std::cmp::max( installed_extensions_collection_interval, 2 * atomic_interval.load(std::sync::atomic::Ordering::SeqCst), ); } }); // Store the new task handle *self.extension_stats_task.lock().unwrap() = Some(handle); } fn terminate_extension_stats_task(&self) { if let Some(h) = self.extension_stats_task.lock().unwrap().take() { h.abort() } } pub fn spawn_lfc_offload_task(self: &Arc, interval: Duration) { self.terminate_lfc_offload_task(); let secs = interval.as_secs(); let this = self.clone(); info!("spawning LFC offload worker with {secs}s interval"); let handle = spawn(async move { let mut interval = time::interval(interval); interval.tick().await; // returns immediately loop { interval.tick().await; let prewarm_state = this.state.lock().unwrap().lfc_prewarm_state.clone(); // Do not offload LFC state if we are currently prewarming or any issue occurred. // If we'd do that, we might override the LFC state in endpoint storage with some // incomplete state. Imagine a situation: // 1. Endpoint started with `autoprewarm: true` // 2. While prewarming is not completed, we upload the new incomplete state // 3. Compute gets interrupted and restarts // 4. We start again and try to prewarm with the state from 2. instead of the previous complete state if matches!( prewarm_state, LfcPrewarmState::Completed { .. 
} | LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Skipped ) { this.offload_lfc_async().await; } } }); *self.lfc_offload_task.lock().unwrap() = Some(handle); } fn terminate_lfc_offload_task(&self) { if let Some(h) = self.lfc_offload_task.lock().unwrap().take() { h.abort() } } fn update_installed_extensions_collection_interval(&self, spec: &ComputeSpec) { // Update the interval for collecting installed extensions statistics // If the value is -1, we never suspend so set the value to default collection. // If the value is 0, it means default, we will just continue to use the default. if spec.suspend_timeout_seconds == -1 || spec.suspend_timeout_seconds == 0 { self.params.installed_extensions_collection_interval.store( DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL, std::sync::atomic::Ordering::SeqCst, ); } else { self.params.installed_extensions_collection_interval.store( spec.suspend_timeout_seconds as u64, std::sync::atomic::Ordering::SeqCst, ); } } /// Set the compute spec and update related metrics. /// This is the central place where pspec is updated. pub fn set_spec(params: &ComputeNodeParams, state: &mut ComputeState, pspec: ParsedSpec) { state.pspec = Some(pspec); ComputeNode::update_attached_metric(params, state); let _ = logger::update_ids(¶ms.instance_id, &Some(params.compute_id.clone())); } pub fn update_attached_metric(params: &ComputeNodeParams, state: &mut ComputeState) { // Update the pg_cctl_attached gauge when all identifiers are available. 
if let Some(instance_id) = ¶ms.instance_id { if let Some(pspec) = &state.pspec { // Clear all values in the metric COMPUTE_ATTACHED.reset(); // Set new metric value COMPUTE_ATTACHED .with_label_values(&[ ¶ms.compute_id, instance_id, &pspec.tenant_id.to_string(), &pspec.timeline_id.to_string(), ]) .set(1); } } } } pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> { let res = get_installed_extensions(conf).await; match res { Ok(extensions) => { info!( "[NEON_EXT_STAT] {}", serde_json::to_string(&extensions).expect("failed to serialize extensions list") ); } Err(err) => error!("could not get installed extensions: {err}"), } Ok(()) } pub fn forward_termination_signal(dev_mode: bool) { let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst); if ss_pid != 0 { let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32); kill(ss_pid, Signal::SIGTERM).ok(); } if !dev_mode { // Terminate pgbouncer with SIGKILL match pid_file::read(PGBOUNCER_PIDFILE.into()) { Ok(pid_file::PidFileRead::LockedByOtherProcess(pid)) => { info!("sending SIGKILL to pgbouncer process pid: {}", pid); if let Err(e) = kill(pid, Signal::SIGKILL) { error!("failed to terminate pgbouncer: {}", e); } } // pgbouncer does not lock the pid file, so we read and kill the process directly Ok(pid_file::PidFileRead::NotHeldByAnyProcess(_)) => { if let Ok(pid_str) = std::fs::read_to_string(PGBOUNCER_PIDFILE) { if let Ok(pid) = pid_str.trim().parse::() { info!( "sending SIGKILL to pgbouncer process pid: {} (from unlocked pid file)", pid ); if let Err(e) = kill(Pid::from_raw(pid), Signal::SIGKILL) { error!("failed to terminate pgbouncer: {}", e); } } } else { info!("pgbouncer pid file exists but process not running"); } } Ok(pid_file::PidFileRead::NotExist) => { info!("pgbouncer pid file not found, process may not be running"); } Err(e) => { error!("error reading pgbouncer pid file: {}", e); } } // Terminate local_proxy match pid_file::read("/etc/local_proxy/pid".into()) { 
Ok(pid_file::PidFileRead::LockedByOtherProcess(pid)) => { info!("sending SIGTERM to local_proxy process pid: {}", pid); if let Err(e) = kill(pid, Signal::SIGTERM) { error!("failed to terminate local_proxy: {}", e); } } Ok(pid_file::PidFileRead::NotHeldByAnyProcess(_)) => { info!("local_proxy PID file exists but process not running"); } Ok(pid_file::PidFileRead::NotExist) => { info!("local_proxy PID file not found, process may not be running"); } Err(e) => { error!("error reading local_proxy PID file: {}", e); } } } else { info!("Skipping pgbouncer and local_proxy termination because in dev mode"); } let pg_pid = PG_PID.load(Ordering::SeqCst); if pg_pid != 0 { let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); // Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for // ROs to get a list of running xacts faster instead of going through the CLOG. // See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals. kill(pg_pid, Signal::SIGINT).ok(); } } // helper trait to call JoinSet::spawn_blocking(f), but propagates the current // tracing span to the thread. 
trait JoinSetExt { fn spawn_blocking_child(&mut self, f: F) -> tokio::task::AbortHandle where F: FnOnce() -> T + Send + 'static, T: Send; } impl JoinSetExt for tokio::task::JoinSet { fn spawn_blocking_child(&mut self, f: F) -> tokio::task::AbortHandle where F: FnOnce() -> T + Send + 'static, T: Send, { let sp = tracing::Span::current(); self.spawn_blocking(move || { let _e = sp.enter(); f() }) } } #[cfg(test)] mod tests { use std::fs::File; use super::*; #[test] fn duplicate_safekeeper_connstring() { let file = File::open("tests/cluster_spec.json").unwrap(); let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); match ParsedSpec::try_from(spec.clone()) { Ok(_p) => panic!("Failed to detect duplicate entry"), Err(e) => assert!( e.to_string() .starts_with("duplicate entry in safekeeper_connstrings:") ), }; } } ================================================ FILE: compute_tools/src/compute_prewarm.rs ================================================ use crate::compute::ComputeNode; use anyhow::{Context, Result, bail}; use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder}; use compute_api::responses::LfcOffloadState; use compute_api::responses::LfcPrewarmState; use http::StatusCode; use reqwest::Client; use std::mem::replace; use std::sync::Arc; use std::time::Instant; use tokio::{io::AsyncReadExt, select, spawn}; use tokio_util::sync::CancellationToken; use tracing::{error, info}; /// A pair of url and a token to query endpoint storage for LFC prewarm-related tasks struct EndpointStoragePair { url: String, token: String, } const KEY: &str = "lfc_state"; impl EndpointStoragePair { /// endpoint_id is set to None while prewarming from other endpoint, see compute_promote.rs /// If not None, takes precedence over pspec.spec.endpoint_id fn from_spec_and_endpoint( pspec: &crate::compute::ParsedSpec, endpoint_id: Option, ) -> Result { let endpoint_id = endpoint_id.as_ref().or(pspec.spec.endpoint_id.as_ref()); let Some(ref endpoint_id) = endpoint_id else { 
bail!("pspec.endpoint_id missing, other endpoint_id not provided") }; let Some(ref base_uri) = pspec.endpoint_storage_addr else { bail!("pspec.endpoint_storage_addr missing") }; let tenant_id = pspec.tenant_id; let timeline_id = pspec.timeline_id; let url = format!("http://{base_uri}/{tenant_id}/{timeline_id}/{endpoint_id}/{KEY}"); let Some(ref token) = pspec.endpoint_storage_token else { bail!("pspec.endpoint_storage_token missing") }; let token = token.clone(); Ok(EndpointStoragePair { url, token }) } } impl ComputeNode { pub async fn lfc_prewarm_state(&self) -> LfcPrewarmState { self.state.lock().unwrap().lfc_prewarm_state.clone() } pub fn lfc_offload_state(&self) -> LfcOffloadState { self.state.lock().unwrap().lfc_offload_state.clone() } /// If there is a prewarm request ongoing, return `false`, `true` otherwise. /// Has a failpoint "compute-prewarm" pub fn prewarm_lfc(self: &Arc, from_endpoint: Option) -> bool { let token: CancellationToken; { let state = &mut self.state.lock().unwrap(); token = state.lfc_prewarm_token.clone(); if let LfcPrewarmState::Prewarming = replace(&mut state.lfc_prewarm_state, LfcPrewarmState::Prewarming) { return false; } } crate::metrics::LFC_PREWARMS.inc(); let this = self.clone(); spawn(async move { let prewarm_state = match this.prewarm_impl(from_endpoint, token).await { Ok(state) => state, Err(err) => { crate::metrics::LFC_PREWARM_ERRORS.inc(); error!(%err, "could not prewarm LFC"); let error = format!("{err:#}"); LfcPrewarmState::Failed { error } } }; let state = &mut this.state.lock().unwrap(); if let LfcPrewarmState::Cancelled = prewarm_state { state.lfc_prewarm_token = CancellationToken::new(); } state.lfc_prewarm_state = prewarm_state; }); true } /// from_endpoint: None for endpoint managed by this compute_ctl fn endpoint_storage_pair(&self, from_endpoint: Option) -> Result { let state = self.state.lock().unwrap(); EndpointStoragePair::from_spec_and_endpoint(state.pspec.as_ref().unwrap(), from_endpoint) } /// Request LFC 
state from endpoint storage and load corresponding pages into Postgres. async fn prewarm_impl( &self, from_endpoint: Option, token: CancellationToken, ) -> Result { let EndpointStoragePair { url, token: storage_token, } = self.endpoint_storage_pair(from_endpoint)?; #[cfg(feature = "testing")] fail::fail_point!("compute-prewarm", |_| bail!("compute-prewarm failpoint")); info!(%url, "requesting LFC state from endpoint storage"); let mut now = Instant::now(); let request = Client::new().get(&url).bearer_auth(storage_token); let response = select! { _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled), response = request.send() => response } .context("querying endpoint storage")?; match response.status() { StatusCode::OK => (), StatusCode::NOT_FOUND => return Ok(LfcPrewarmState::Skipped), status => bail!("{status} querying endpoint storage"), } let state_download_time_ms = now.elapsed().as_millis() as u32; now = Instant::now(); let mut uncompressed = Vec::new(); let lfc_state = select! { _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled), lfc_state = response.bytes() => lfc_state } .context("getting request body from endpoint storage")?; let mut decoder = ZstdDecoder::new(lfc_state.iter().as_slice()); select! { _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled), read = decoder.read_to_end(&mut uncompressed) => read } .context("decoding LFC state")?; let uncompress_time_ms = now.elapsed().as_millis() as u32; now = Instant::now(); let uncompressed_len = uncompressed.len(); info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}"); // Client connection and prewarm info querying are fast and therefore don't need // cancellation let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf) .await .context("connecting to postgres")?; let pg_token = client.cancel_token(); let params: Vec<&(dyn postgres_types::ToSql + Sync)> = vec![&uncompressed]; select! 
{ res = client.query_one("select neon.prewarm_local_cache($1)", ¶ms) => res, _ = token.cancelled() => { pg_token.cancel_query(postgres::NoTls).await .context("cancelling neon.prewarm_local_cache()")?; return Ok(LfcPrewarmState::Cancelled) } } .context("loading LFC state into postgres") .map(|_| ())?; let prewarm_time_ms = now.elapsed().as_millis() as u32; let row = client .query_one("select * from neon.get_prewarm_info()", &[]) .await .context("querying prewarm info")?; let total = row.try_get(0).unwrap_or_default(); let prewarmed = row.try_get(1).unwrap_or_default(); let skipped = row.try_get(2).unwrap_or_default(); Ok(LfcPrewarmState::Completed { total, prewarmed, skipped, state_download_time_ms, uncompress_time_ms, prewarm_time_ms, }) } /// If offload request is ongoing, return false, true otherwise pub fn offload_lfc(self: &Arc) -> bool { { let state = &mut self.state.lock().unwrap().lfc_offload_state; if matches!( replace(state, LfcOffloadState::Offloading), LfcOffloadState::Offloading ) { return false; } } let cloned = self.clone(); spawn(async move { cloned.offload_lfc_with_state_update().await }); true } pub async fn offload_lfc_async(self: &Arc) { { let state = &mut self.state.lock().unwrap().lfc_offload_state; if matches!( replace(state, LfcOffloadState::Offloading), LfcOffloadState::Offloading ) { return; } } self.offload_lfc_with_state_update().await } async fn offload_lfc_with_state_update(&self) { crate::metrics::LFC_OFFLOADS.inc(); let state = match self.offload_lfc_impl().await { Ok(state) => state, Err(err) => { crate::metrics::LFC_OFFLOAD_ERRORS.inc(); error!(%err, "could not offload LFC"); let error = format!("{err:#}"); LfcOffloadState::Failed { error } } }; self.state.lock().unwrap().lfc_offload_state = state; } async fn offload_lfc_impl(&self) -> Result { let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?; info!(%url, "requesting LFC state from Postgres"); let mut now = Instant::now(); let row = 
ComputeNode::get_maintenance_client(&self.tokio_conn_conf) .await .context("connecting to postgres")? .query_one("select neon.get_local_cache_state()", &[]) .await .context("querying LFC state")?; let state = row .try_get::>(0) .context("deserializing LFC state")?; let Some(state) = state else { info!(%url, "empty LFC state, not exporting"); return Ok(LfcOffloadState::Skipped); }; let state_query_time_ms = now.elapsed().as_millis() as u32; now = Instant::now(); let mut compressed = Vec::new(); ZstdEncoder::new(state) .read_to_end(&mut compressed) .await .context("compressing LFC state")?; let compress_time_ms = now.elapsed().as_millis() as u32; now = Instant::now(); let compressed_len = compressed.len(); info!(%url, "downloaded LFC state, compressed size {compressed_len}"); let request = Client::new().put(url).bearer_auth(token).body(compressed); let response = request .send() .await .context("writing to endpoint storage")?; let state_upload_time_ms = now.elapsed().as_millis() as u32; let status = response.status(); if status != StatusCode::OK { bail!("request to endpoint storage failed: {status}"); } Ok(LfcOffloadState::Completed { compress_time_ms, state_query_time_ms, state_upload_time_ms, }) } pub fn cancel_prewarm(self: &Arc) { self.state.lock().unwrap().lfc_prewarm_token.cancel(); } } ================================================ FILE: compute_tools/src/compute_promote.rs ================================================ use crate::compute::ComputeNode; use anyhow::{Context, bail}; use compute_api::responses::{LfcPrewarmState, PromoteConfig, PromoteState}; use std::time::Instant; use tracing::info; impl ComputeNode { /// Returns only when promote fails or succeeds. If http client calling this function /// disconnects, this does not stop promotion, and subsequent calls block until promote finishes. 
/// Called by control plane on secondary after primary endpoint is terminated
    /// Has a failpoint "compute-promotion"
    pub async fn promote(self: &std::sync::Arc<Self>, cfg: PromoteConfig) -> PromoteState {
        let this = self.clone();
        let promote_fn = async move || match this.promote_impl(cfg).await {
            Ok(state) => state,
            Err(err) => {
                tracing::error!(%err, "promoting replica");
                let error = format!("{err:#}");
                PromoteState::Failed { error }
            }
        };
        // Runs the promotion on its own task and publishes the outcome through
        // a watch channel, so it keeps running if this caller goes away.
        let start_promotion = || {
            let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted);
            tokio::spawn(async move { tx.send(promote_fn().await) });
            rx
        };

        let mut task;
        // promote_impl locks self.state so we need to unlock it before calling task.changed()
        {
            let promote_state = &mut self.state.lock().unwrap().promote_state;
            // Only the first caller starts a promotion; later callers get a
            // clone of the same receiver.
            task = promote_state.get_or_insert_with(start_promotion).clone()
        }
        if task.changed().await.is_err() {
            let error = "promote sender dropped".to_string();
            return PromoteState::Failed { error };
        }
        task.borrow().clone()
    }

    /// Promote this replica to primary.
    ///
    /// Steps: check the compute is a replica and not in the middle of (or
    /// before) prewarm; wait for replay to reach `cfg.wal_flush_lsn`; point
    /// postgres at the new safekeepers and reload its config; run
    /// `pg_promote()`; verify the instance left read-only mode; install
    /// `cfg.spec` as the new spec and reconfigure.
    async fn promote_impl(&self, cfg: PromoteConfig) -> anyhow::Result<PromoteState> {
        {
            let state = self.state.lock().unwrap();
            // Report a missing spec as an error: panicking here would poison
            // the state mutex while it is held.
            let pspec = state.pspec.as_ref().context("compute spec is not set")?;
            let mode = &pspec.spec.mode;
            if *mode != compute_api::spec::ComputeMode::Replica {
                bail!("compute mode \"{}\" is not replica", mode.to_type_str());
            }

            match &state.lfc_prewarm_state {
                status @ (LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming) => {
                    bail!("compute {status}")
                }
                LfcPrewarmState::Failed { error } => {
                    tracing::warn!(%error, "compute prewarm failed")
                }
                _ => {}
            }
        }

        let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
            .await
            .context("connecting to postgres")?;
        let mut now = Instant::now();

        let primary_lsn = cfg.wal_flush_lsn;
        let mut standby_lsn = utils::lsn::Lsn::INVALID;
        const RETRIES: i32 = 20;
        for i in 0..=RETRIES {
            let row = client
                .query_one("SELECT pg_catalog.pg_last_wal_replay_lsn()", &[])
                .await
                .context("getting last replay lsn")?;
            let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
            standby_lsn = lsn.into();
            if standby_lsn >= primary_lsn {
                break;
            }
            info!(%standby_lsn, %primary_lsn, "catching up, try {i}");
            // No point in sleeping after the final attempt: nothing re-checks
            // the LSN afterwards.
            if i < RETRIES {
                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
            }
        }
        if standby_lsn < primary_lsn {
            bail!("didn't catch up with primary in {RETRIES} retries");
        }
        let lsn_wait_time_ms = now.elapsed().as_millis() as u32;
        now = Instant::now();

        // using $1 doesn't work with ALTER SYSTEM SET, so the value has to be
        // inlined. Double any single quote so the value cannot terminate the
        // string literal early.
        let safekeepers = cfg
            .spec
            .safekeeper_connstrings
            .join(",")
            .replace('\'', "''");
        let safekeepers_sql = format!("ALTER SYSTEM SET neon.safekeepers='{safekeepers}'");
        client
            .query(&safekeepers_sql, &[])
            .await
            .context("setting safekeepers")?;
        client
            .query(
                "ALTER SYSTEM SET synchronous_standby_names=walproposer",
                &[],
            )
            .await
            .context("setting synchronous_standby_names")?;
        client
            .query("SELECT pg_catalog.pg_reload_conf()", &[])
            .await
            .context("reloading postgres config")?;

        #[cfg(feature = "testing")]
        fail::fail_point!("compute-promotion", |_| bail!(
            "compute-promotion failpoint"
        ));

        let row = client
            .query_one("SELECT * FROM pg_catalog.pg_promote()", &[])
            .await
            .context("pg_promote")?;
        if !row.get::<usize, bool>(0) {
            bail!("pg_promote() failed");
        }
        let pg_promote_time_ms = now.elapsed().as_millis() as u32;
        let now = Instant::now();

        let row = client
            .query_one("SHOW transaction_read_only", &[])
            .await
            .context("getting transaction_read_only")?;
        if row.get::<usize, &str>(0) == "on" {
            bail!("replica in read only mode after promotion");
        }

        // Already checked validity in http handler
        #[allow(unused_mut)]
        let mut new_pspec = crate::compute::ParsedSpec::try_from(cfg.spec).expect("invalid spec");
        {
            let mut state = self.state.lock().unwrap();

            // Local setup has different ports for pg process (port=) for primary and secondary.
            // Primary is stopped so we need secondary's "port" value
            #[cfg(feature = "testing")]
            {
                // Everything in this block reports problems with `bail!`
                // rather than panicking, because the state mutex is held.
                let Some(old_pspec) = state.pspec.as_ref() else {
                    bail!("compute spec is not set");
                };
                let Some(old_conf) = old_pspec.spec.cluster.postgresql_conf.as_ref() else {
                    bail!("pspec.spec.cluster.postgresql_conf missing for endpoint");
                };
                // Lines without '=' (blank lines, comments) are skipped.
                let set: std::collections::HashMap<&str, &str> = old_conf
                    .split_terminator('\n')
                    .filter_map(|e| e.split_once('='))
                    .collect();
                let Some(port) = set.get("port") else {
                    bail!("port missing from pspec.spec.cluster.postgresql_conf");
                };

                let Some(new_conf) = new_pspec.spec.cluster.postgresql_conf.as_mut() else {
                    bail!("pspec.spec.cluster.postgresql_conf missing for supplied config");
                };
                new_conf.push_str(&format!("port={}\n", port));
            }

            tracing::debug!("applied spec: {:#?}", new_pspec.spec);
            if self.params.lakebase_mode {
                ComputeNode::set_spec(&self.params, &mut state, new_pspec);
            } else {
                state.pspec = Some(new_pspec);
            }
        }

        info!("applied new spec, reconfiguring as primary");
        self.reconfigure()?;
        let reconfigure_time_ms = now.elapsed().as_millis() as u32;

        Ok(PromoteState::Completed {
            lsn_wait_time_ms,
            pg_promote_time_ms,
            reconfigure_time_ms,
        })
    }
}

================================================
FILE: compute_tools/src/config.rs
================================================
use anyhow::Result;
use std::fmt::Write as FmtWrite;
use std::fs::{File, OpenOptions};
use std::io;
use std::io::Write;
use std::io::prelude::*;
use std::path::Path;

use compute_api::responses::TlsConfig;
use compute_api::spec::{
    ComputeAudit, ComputeMode, ComputeSpec, DatabricksSettings, GenericOption,
};

use crate::compute::ComputeNodeParams;
use crate::pg_helpers::{
    DatabricksSettingsExt as _, GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize,
    escape_conf_value,
};
use crate::tls::{self, SERVER_CRT, SERVER_KEY};

use utils::shard::{ShardIndex, ShardNumber};

/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.
pub fn line_in_file(path: &Path, line: &str) -> Result { let mut file = OpenOptions::new() .read(true) .write(true) .create(true) .append(false) .truncate(false) .open(path)?; let buf = io::BufReader::new(&file); let mut count: usize = 0; for l in buf.lines() { if l? == line { return Ok(false); } count = 1; } write!(file, "{}{}", "\n".repeat(count), line)?; Ok(true) } /// Create or completely rewrite configuration file specified by `path` #[allow(clippy::too_many_arguments)] pub fn write_postgres_conf( pgdata_path: &Path, params: &ComputeNodeParams, spec: &ComputeSpec, postgres_port: Option, extension_server_port: u16, tls_config: &Option, databricks_settings: Option<&DatabricksSettings>, lakebase_mode: bool, ) -> Result<()> { let path = pgdata_path.join("postgresql.conf"); // File::create() destroys the file content if it exists. let mut file = File::create(path)?; // Write the postgresql.conf content from the spec file as is. if let Some(conf) = &spec.cluster.postgresql_conf { writeln!(file, "{conf}")?; } // Add options for connecting to storage writeln!(file, "# Neon storage settings")?; writeln!(file)?; if let Some(conninfo) = &spec.pageserver_connection_info { // Stripe size GUC should be defined prior to connection string if let Some(stripe_size) = conninfo.stripe_size { writeln!( file, "# from compute spec's pageserver_connection_info.stripe_size field" )?; writeln!(file, "neon.stripe_size={stripe_size}")?; } let mut libpq_urls: Option> = Some(Vec::new()); let num_shards = if conninfo.shard_count.0 == 0 { 1 // unsharded, treat it as a single shard } else { conninfo.shard_count.0 }; for shard_number in 0..num_shards { let shard_index = ShardIndex { shard_number: ShardNumber(shard_number), shard_count: conninfo.shard_count, }; let info = conninfo.shards.get(&shard_index).ok_or_else(|| { anyhow::anyhow!( "shard {shard_index} missing from pageserver_connection_info shard map" ) })?; let first_pageserver = info .pageservers .first() .expect("must have at least 
one pageserver"); // Add the libpq URL to the array, or if the URL is missing, reset the array // forgetting any previous entries. All servers must have a libpq URL, or none // at all. if let Some(url) = &first_pageserver.libpq_url { if let Some(ref mut urls) = libpq_urls { urls.push(url.clone()); } } else { libpq_urls = None } } if let Some(libpq_urls) = libpq_urls { writeln!( file, "# derived from compute spec's pageserver_connection_info field" )?; writeln!( file, "neon.pageserver_connstring={}", escape_conf_value(&libpq_urls.join(",")) )?; } else { writeln!(file, "# no neon.pageserver_connstring")?; } } else { // Stripe size GUC should be defined prior to connection string if let Some(stripe_size) = spec.shard_stripe_size { writeln!(file, "# from compute spec's shard_stripe_size field")?; writeln!(file, "neon.stripe_size={stripe_size}")?; } if let Some(s) = &spec.pageserver_connstring { writeln!(file, "# from compute spec's pageserver_connstring field")?; writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?; } } if !spec.safekeeper_connstrings.is_empty() { let mut neon_safekeepers_value = String::new(); tracing::info!( "safekeepers_connstrings is not zero, gen: {:?}", spec.safekeepers_generation ); // If generation is given, prepend sk list with g#number: if let Some(generation) = spec.safekeepers_generation { write!(neon_safekeepers_value, "g#{generation}:")?; } neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(",")); writeln!( file, "neon.safekeepers={}", escape_conf_value(&neon_safekeepers_value) )?; } if let Some(s) = &spec.tenant_id { writeln!(file, "neon.tenant_id={}", escape_conf_value(&s.to_string()))?; } if let Some(s) = &spec.timeline_id { writeln!( file, "neon.timeline_id={}", escape_conf_value(&s.to_string()) )?; } if let Some(s) = &spec.project_id { writeln!(file, "neon.project_id={}", escape_conf_value(s))?; } if let Some(s) = &spec.branch_id { writeln!(file, "neon.branch_id={}", escape_conf_value(s))?; } if let 
Some(s) = &spec.endpoint_id { writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?; } // tls if let Some(tls_config) = tls_config { writeln!(file, "ssl = on")?; // postgres requires the keyfile to be in a secure file, // currently too complicated to ensure that at the VM level, // so we just copy them to another file instead. :shrug: tls::update_key_path_blocking(pgdata_path, tls_config); // these are the default, but good to be explicit. writeln!(file, "ssl_cert_file = '{SERVER_CRT}'")?; writeln!(file, "ssl_key_file = '{SERVER_KEY}'")?; } // Locales if cfg!(target_os = "macos") { writeln!(file, "lc_messages='C'")?; writeln!(file, "lc_monetary='C'")?; writeln!(file, "lc_time='C'")?; writeln!(file, "lc_numeric='C'")?; } else { writeln!(file, "lc_messages='C.UTF-8'")?; writeln!(file, "lc_monetary='C.UTF-8'")?; writeln!(file, "lc_time='C.UTF-8'")?; writeln!(file, "lc_numeric='C.UTF-8'")?; } writeln!(file, "neon.compute_mode={}", spec.mode.to_type_str())?; match spec.mode { ComputeMode::Primary => {} ComputeMode::Static(lsn) => { // hot_standby is 'on' by default, but let's be explicit writeln!(file, "hot_standby=on")?; writeln!(file, "recovery_target_lsn='{lsn}'")?; } ComputeMode::Replica => { // hot_standby is 'on' by default, but let's be explicit writeln!(file, "hot_standby=on")?; } } if cfg!(target_os = "linux") { // Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is // disabled), then the control plane has enabled swap and we should set // dynamic_shared_memory_type = 'mmap'. // // This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047. let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory") // ignore any errors - they may be expected to occur under certain situations (e.g. when // not running in Linux). 
.unwrap_or_else(|_| String::new()); if overcommit_memory_contents.trim() == "2" { let opt = GenericOption { name: "dynamic_shared_memory_type".to_owned(), value: Some("mmap".to_owned()), vartype: "enum".to_owned(), }; writeln!(file, "{}", opt.to_pg_setting())?; } } writeln!( file, "neon.privileged_role_name={}", escape_conf_value(params.privileged_role_name.as_str()) )?; // If there are any extra options in the 'settings' field, append those if spec.cluster.settings.is_some() { writeln!(file, "# Managed by compute_ctl: begin")?; write!(file, "{}", spec.cluster.settings.as_pg_settings())?; writeln!(file, "# Managed by compute_ctl: end")?; } // If base audit logging is enabled, configure it. // In this setup, the audit log will be written to the standard postgresql log. // // If compliance audit logging is enabled, configure pgaudit. // // Note, that this is called after the settings from spec are written. // This way we always override the settings from the spec // and don't allow the user or the control plane admin to change them. match spec.audit_log_level { ComputeAudit::Disabled => {} ComputeAudit::Log | ComputeAudit::Base => { writeln!(file, "# Managed by compute_ctl base audit settings: start")?; writeln!(file, "pgaudit.log='ddl,role'")?; // Disable logging of catalog queries to reduce the noise writeln!(file, "pgaudit.log_catalog=off")?; if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { let mut extra_shared_preload_libraries = String::new(); if !libs.contains("pgaudit") { extra_shared_preload_libraries.push_str(",pgaudit"); } writeln!( file, "shared_preload_libraries='{libs}{extra_shared_preload_libraries}'" )?; } else { // Typically, this should be unreacheable, // because we always set at least some shared_preload_libraries in the spec // but let's handle it explicitly anyway. 
writeln!(file, "shared_preload_libraries='neon,pgaudit'")?; } writeln!(file, "# Managed by compute_ctl base audit settings: end")?; } ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => { writeln!( file, "# Managed by compute_ctl compliance audit settings: begin" )?; // Enable logging of parameters. // This is very verbose and may contain sensitive data. if spec.audit_log_level == ComputeAudit::Full { writeln!(file, "pgaudit.log_parameter=on")?; writeln!(file, "pgaudit.log='all'")?; } else { writeln!(file, "pgaudit.log_parameter=off")?; writeln!(file, "pgaudit.log='all, -misc'")?; } // Disable logging of catalog queries // The catalog doesn't contain sensitive data, so we don't need to audit it. writeln!(file, "pgaudit.log_catalog=off")?; // Set log rotation to 5 minutes // TODO: tune this after performance testing writeln!(file, "pgaudit.log_rotation_age=5")?; // Enable audit logs for pg_session_jwt extension // TODO: Consider a good approach for shipping pg_session_jwt logs to the same sink as // pgAudit - additional context in https://github.com/neondatabase/cloud/issues/28863 // // writeln!(file, "pg_session_jwt.audit_log=on")?; // Add audit shared_preload_libraries, if they are not present. // // The caller who sets the flag is responsible for ensuring that the necessary // shared_preload_libraries are present in the compute image, // otherwise the compute start will fail. if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { let mut extra_shared_preload_libraries = String::new(); if !libs.contains("pgaudit") { extra_shared_preload_libraries.push_str(",pgaudit"); } if !libs.contains("pgauditlogtofile") { extra_shared_preload_libraries.push_str(",pgauditlogtofile"); } writeln!( file, "shared_preload_libraries='{libs}{extra_shared_preload_libraries}'" )?; } else { // Typically, this should be unreacheable, // because we always set at least some shared_preload_libraries in the spec // but let's handle it explicitly anyway. 
writeln!( file, "shared_preload_libraries='neon,pgaudit,pgauditlogtofile'" )?; } writeln!( file, "# Managed by compute_ctl compliance audit settings: end" )?; } } writeln!(file, "neon.extension_server_port={extension_server_port}")?; if spec.drop_subscriptions_before_start { writeln!(file, "neon.disable_logical_replication_subscribers=true")?; } else { // be explicit about the default value writeln!(file, "neon.disable_logical_replication_subscribers=false")?; } // We need Postgres to send logs to rsyslog so that we can forward them // further to customers' log aggregation systems. if spec.logs_export_host.is_some() { writeln!(file, "log_destination='stderr,syslog'")?; } if lakebase_mode { // Explicitly set the port based on the connstr, overriding any previous port setting. // Note: It is important that we don't specify a different port again after this. let port = postgres_port.expect("port must be present in connstr"); writeln!(file, "port = {port}")?; // This is databricks specific settings. // This should be at the end of the file but before `compute_ctl_temp_override.conf` below // so that it can override any settings above. // `compute_ctl_temp_override.conf` is intended to override any settings above during specific operations. // To prevent potential breakage in the future, we keep it above `compute_ctl_temp_override.conf`. writeln!(file, "# Databricks settings start")?; if let Some(settings) = databricks_settings { writeln!(file, "{}", settings.as_pg_settings())?; } writeln!(file, "# Databricks settings end")?; } // This is essential to keep this line at the end of the file, // because it is intended to override any settings above. 
writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?; Ok(()) }

/// Run `exec` while `options` is present in `compute_ctl_temp_override.conf`
/// inside `pgdata_path`, then empty that file again.
///
/// The override file is created (or truncated if it already exists) and filled
/// with `options` before `exec` is invoked. After `exec` returns, the file is
/// truncated to zero length and the result of `exec` is handed back.
///
/// Errors from creating, writing or truncating the file are returned directly;
/// note that a truncation error takes precedence over an error from `exec`.
pub fn with_compute_ctl_tmp_override<F>(pgdata_path: &Path, options: &str, exec: F) -> Result<()>
where
    F: FnOnce() -> Result<()>,
{
    let override_path = pgdata_path.join("compute_ctl_temp_override.conf");
    let mut override_file = File::create(override_path)?;
    override_file.write_all(options.as_bytes())?;

    // Keep the outcome around so the file can be emptied before reporting it.
    let outcome = exec();

    override_file.set_len(0)?;
    outcome
}

================================================
FILE: compute_tools/src/config_template/compute_audit_rsyslog_template.conf
================================================
# Load imfile module to read log files module(load="imfile") # Input configuration for log files in the specified directory # The messages can be multiline. The start of the message is a timestamp # in "%Y-%m-%d %H:%M:%S.%3N GMT" (so timezone hardcoded). # Replace log_directory with the directory containing the log files input(type="imfile" File="{log_directory}/*.log" Tag="pgaudit_log" Severity="info" Facility="local5" startmsg.regex="^[[:digit:]]{{4}}-[[:digit:]]{{2}}-[[:digit:]]{{2}} [[:digit:]]{{2}}:[[:digit:]]{{2}}:[[:digit:]]{{2}}.[[:digit:]]{{3}} GMT,") # the directory to store rsyslog state files global( workDirectory="/var/log/rsyslog" DefaultNetstreamDriverCAFile="/etc/ssl/certs/ca-certificates.crt" ) # Whether the remote syslog receiver uses tls set $.remote_syslog_tls = "{remote_syslog_tls}"; # Construct json, endpoint_id and project_id as additional metadata set $.json_log!endpoint_id = "{endpoint_id}"; set $.json_log!project_id = "{project_id}"; set $.json_log!msg = $msg; # Template suitable for rfc5424 syslog format template(name="PgAuditLog" type="string" string="<%PRI%>1 %TIMESTAMP:::date-rfc3339% %HOSTNAME% - - - - %$.json_log%") # Forward to remote syslog receiver (over TLS) if ( $syslogtag == 'pgaudit_log' ) then {{ if ( $.remote_syslog_tls == 'true' ) then {{ action(type="omfwd" target="{remote_syslog_host}" port="{remote_syslog_port}" protocol="tcp" template="PgAuditLog" queue.type="linkedList"
queue.size="1000" action.ResumeRetryCount="10" StreamDriver="gtls" StreamDriverMode="1" StreamDriverAuthMode="x509/name" StreamDriverPermittedPeers="{remote_syslog_host}" StreamDriver.CheckExtendedKeyPurpose="on" StreamDriver.PermitExpiredCerts="off" ) stop }} else {{ action(type="omfwd" target="{remote_syslog_host}" port="{remote_syslog_port}" protocol="tcp" template="PgAuditLog" queue.type="linkedList" queue.size="1000" action.ResumeRetryCount="10" ) stop }} }}

================================================
FILE: compute_tools/src/config_template/compute_rsyslog_postgres_export_template.conf
================================================
# Program name comes from postgres' syslog_facility configuration: https://www.postgresql.org/docs/current/runtime-config-logging.html#GUC-SYSLOG-IDENT # Default value is 'postgres'. if $programname == 'postgres' then {{ # Forward Postgres logs to telemetry otel collector action(type="omfwd" target="{logs_export_target}" port="{logs_export_port}" protocol="tcp" template="RSYSLOG_SyslogProtocol23Format" action.resumeRetryCount="3" queue.type="linkedList" queue.size="1000") stop }}

================================================
FILE: compute_tools/src/configurator.rs
================================================
use std::fs::File;
use std::thread;
use std::{path::Path, sync::Arc};

use anyhow::Result;
use compute_api::responses::{ComputeConfig, ComputeStatus};
use tracing::{error, info, instrument};

use crate::compute::{ComputeNode, ParsedSpec};
use crate::spec::get_config_from_control_plane;

// Main loop of the configurator thread.
//
// Each iteration takes the `compute.state` lock, waits on `compute.state_changed`
// and then dispatches on the observed status:
//   - `ConfigurationPending`: run `compute.reconfigure()` and publish `Running`
//     on success or `Failed` on error.
//   - `RefreshConfigurationPending`: fetch a fresh config (from the test-only
//     config path or from the control plane), install the parsed spec and run
//     `compute.reconfigure()`; on failure the status goes back to
//     `RefreshConfigurationPending` so the refresh is attempted again.
//   - `Failed`: leave the loop, which ends the thread.
//   - anything else: log and go around the loop again.
#[instrument(skip_all)]
fn configurator_main_loop(compute: &Arc) {
    info!("waiting for reconfiguration requests");
    loop {
        let mut state = compute.state.lock().unwrap();
        /* BEGIN_HADRON */
        // RefreshConfiguration should only be used inside the loop
        assert_ne!(state.status, ComputeStatus::RefreshConfiguration);
        /* END_HADRON */

        if compute.params.lakebase_mode {
            // In lakebase mode keep waiting until one of the three statuses
            // handled below shows up; every other wakeup just logs and waits again.
            while state.status != ComputeStatus::ConfigurationPending
                && state.status != ComputeStatus::RefreshConfigurationPending
                && state.status != ComputeStatus::Failed
            {
                info!("configurator: compute status: {:?}, sleeping", state.status);
                state = compute.state_changed.wait(state).unwrap();
            }
        } else {
            // We have to re-check the status after re-acquiring the lock because it could be that
            // the status has changed while we were waiting for the lock, and we might not need to
            // wait on the condition variable. Otherwise, we might end up in some soft-/deadlock, i.e.
            // we are waiting for a condition variable that will never be signaled.
            //
            // Note that this is a single wait, not a loop: a wakeup with any other
            // status falls through to the dispatch below and ends up in its final
            // `else` branch, after which the outer loop starts over.
            if state.status != ComputeStatus::ConfigurationPending {
                state = compute.state_changed.wait(state).unwrap();
            }
        }

        // Re-check the status after waking up
        if state.status == ComputeStatus::ConfigurationPending {
            info!("got configuration request");
            state.set_status(ComputeStatus::Configuration, &compute.state_changed);
            // Release the lock before reconfiguring.
            drop(state);

            // Assume failure until `reconfigure()` reports success.
            let mut new_status = ComputeStatus::Failed;
            if let Err(e) = compute.reconfigure() {
                error!("could not configure compute node: {}", e);
            } else {
                new_status = ComputeStatus::Running;
                info!("compute node configured");
            }

            // XXX: used to test that API is blocking
            // std::thread::sleep(std::time::Duration::from_millis(10000));

            compute.set_status(new_status);
        } else if state.status == ComputeStatus::RefreshConfigurationPending {
            info!(
                "compute node suspects its configuration is out of date, now refreshing configuration"
            );
            state.set_status(ComputeStatus::RefreshConfiguration, &compute.state_changed);
            // Drop the lock guard here to avoid holding the lock while downloading config from the control plane / HCC.
            // This is the only thread that can move compute_ctl out of the `RefreshConfiguration` state, so it
            // is safe to drop the lock like this.
            drop(state);

            // Source of the new config, in priority order:
            //   1. `config_path_test_only`, read and parsed as JSON;
            //   2. the control plane at `control_plane_uri`;
            //   3. otherwise an error.
            let get_config_result: anyhow::Result = if let Some(config_path) = &compute.params.config_path_test_only {
                // This path is only to make testing easier. In production we always get the config from the HCC.
                info!(
                    "reloading config.json from path: {}",
                    config_path.to_string_lossy()
                );
                let path = Path::new(config_path);
                if let Ok(file) = File::open(path) {
                    match serde_json::from_reader::(file) {
                        Ok(config) => Ok(config),
                        Err(e) => {
                            error!("could not parse config file: {}", e);
                            Err(anyhow::anyhow!("could not parse config file: {}", e))
                        }
                    }
                } else {
                    error!(
                        "could not open config file at path: {:?}",
                        config_path.to_string_lossy()
                    );
                    Err(anyhow::anyhow!(
                        "could not open config file at path: {}",
                        config_path.to_string_lossy()
                    ))
                }
            } else if let Some(control_plane_uri) = &compute.params.control_plane_uri {
                get_config_from_control_plane(control_plane_uri, &compute.params.compute_id)
            } else {
                // Reached when neither `config_path_test_only` nor `control_plane_uri` is set.
                Err(anyhow::anyhow!("config_path_test_only is not set"))
            };

            // Parse any received ComputeSpec and transpose the result into a
            // `Result<Option<ParsedSpec>>`: `Ok(None)` means the config carried no spec,
            // `Err` covers both a failed fetch and a spec that could not be parsed.
            let parsed_spec_result: Result> = get_config_result.and_then(|config| {
                if let Some(spec) = config.spec {
                    if let Ok(pspec) = ParsedSpec::try_from(spec) {
                        Ok(Some(pspec))
                    } else {
                        Err(anyhow::anyhow!("could not parse spec"))
                    }
                } else {
                    Ok(None)
                }
            });

            let new_status: ComputeStatus;
            match parsed_spec_result {
                // Control plane (HCM) returned a spec and we were able to parse it.
                Ok(Some(pspec)) => {
                    {
                        let mut state = compute.state.lock().unwrap();
                        // Defensive programming to make sure this thread is indeed the only one that can move the compute
                        // node out of the `RefreshConfiguration` state. Would be nice if we can encode this invariant
                        // into the type system.
                        assert_eq!(state.status, ComputeStatus::RefreshConfiguration);

                        // Only `pageserver_conninfo` is compared here; other fields of the
                        // spec do not take part in the "same spec" decision.
                        if state
                            .pspec
                            .as_ref()
                            .map(|ps| ps.pageserver_conninfo.clone())
                            == Some(pspec.pageserver_conninfo.clone())
                        {
                            info!(
                                "Refresh configuration: Retrieved spec is the same as the current spec. Waiting for control plane to update the spec before attempting reconfiguration."
                            );
                            // Go back to `Running`, wake any waiters, and pause for 5 seconds
                            // (with the lock released) before the next loop iteration.
                            state.status = ComputeStatus::Running;
                            compute.state_changed.notify_all();
                            drop(state);
                            std::thread::sleep(std::time::Duration::from_secs(5));
                            continue;
                        }
                        // state.pspec is consumed by compute.reconfigure() below. Note that compute.reconfigure() will acquire
                        // the compute.state lock again so we need to have the lock guard go out of scope here. We could add a
                        // "locked" variant of compute.reconfigure() that takes the lock guard as an argument to make this cleaner,
                        // but it's not worth forking the codebase too much for this minor point alone right now.
                        state.pspec = Some(pspec);
                    }
                    match compute.reconfigure() {
                        Ok(_) => {
                            info!("Refresh configuration: compute node configured");
                            new_status = ComputeStatus::Running;
                        }
                        Err(e) => {
                            error!(
                                "Refresh configuration: could not configure compute node: {}",
                                e
                            );
                            // Set the compute node back to the `RefreshConfigurationPending` state if the configuration
                            // was not successful. It should be okay to treat this situation the same as if the loop
                            // hasn't executed yet as long as the detection side keeps notifying.
                            new_status = ComputeStatus::RefreshConfigurationPending;
                        }
                    }
                }
                // Control plane (HCM)'s response does not contain a spec. This is the "Empty" attachment case.
                Ok(None) => {
                    info!(
                        "Compute Manager signaled that this compute is no longer attached to any storage. Exiting."
                    );
                    // We just immediately terminate the whole compute_ctl in this case. It's not necessary to attempt a
                    // clean shutdown as Postgres is probably not responding anyway (which is why we are in this refresh
                    // configuration state).
                    std::process::exit(1);
                }
                // Various error cases:
                // - The request to the control plane (HCM) either failed or returned a malformed spec.
                // - compute_ctl itself is configured incorrectly (e.g., compute_id is not set).
                Err(e) => {
                    error!(
                        "Refresh configuration: error getting a parsed spec: {:?}",
                        e
                    );
                    new_status = ComputeStatus::RefreshConfigurationPending;
                    // We may be dealing with an overloaded HCM if we end up in this path. Backoff 5 seconds before
                    // retrying to avoid hammering the HCM.
                    std::thread::sleep(std::time::Duration::from_secs(5));
                }
            }
            compute.set_status(new_status);
        } else if state.status == ComputeStatus::Failed {
            info!("compute node is now in Failed state, exiting");
            break;
        } else {
            info!("woken up for compute status: {:?}, sleeping", state.status);
        }
    }
}

// Spawn the "compute-configurator" OS thread running `configurator_main_loop`.
//
// The handle of the current tokio runtime is captured on the calling thread and
// entered inside the new thread before the loop starts. Panics if the thread
// cannot be spawned.
pub fn launch_configurator(compute: &Arc) -> thread::JoinHandle<()> {
    let compute = Arc::clone(compute);

    let runtime = tokio::runtime::Handle::current();

    thread::Builder::new()
        .name("compute-configurator".into())
        .spawn(move || {
            let _rt_guard = runtime.enter();
            configurator_main_loop(&compute);
            info!("configurator thread is exited");
        })
        .expect("cannot launch configurator thread")
}

================================================
FILE: compute_tools/src/disk_quota.rs
================================================
use anyhow::Context;
use tracing::instrument;

pub const DISK_QUOTA_BIN: &str = "/neonvm/bin/set-disk-quota";

/// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes.
/// `fs_mountpoint` should point to the mountpoint of the filesystem where the quota should be set.
#[instrument] pub fn set_disk_quota(size_bytes: u64, fs_mountpoint: &str) -> anyhow::Result<()> { let size_kb = size_bytes / 1024; // run `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}` let child_result = std::process::Command::new("/usr/bin/sudo") .arg(DISK_QUOTA_BIN) .arg(size_kb.to_string()) .arg(fs_mountpoint) .spawn(); child_result .context("spawn() failed") .and_then(|mut child| child.wait().context("wait() failed")) .and_then(|status| match status.success() { true => Ok(()), false => Err(anyhow::anyhow!("process exited with {status}")), }) // wrap any prior error with the overall context that we couldn't run the command .with_context(|| format!("could not run `/usr/bin/sudo {DISK_QUOTA_BIN}`")) } ================================================ FILE: compute_tools/src/extension_server.rs ================================================ // Download extension files from the extension store // and put them in the right place in the postgres directory (share / lib) /* The layout of the S3 bucket is as follows: 5615610098 // this is an extension build number ├── v14 │   ├── extensions │   │   ├── anon.tar.zst │   │   └── embedding.tar.zst │   └── ext_index.json └── v15 ├── extensions │   ├── anon.tar.zst │   └── embedding.tar.zst └── ext_index.json 5615261079 ├── v14 │   ├── extensions │   │   └── anon.tar.zst │   └── ext_index.json └── v15 ├── extensions │   └── anon.tar.zst └── ext_index.json 5623261088 ├── v14 │   ├── extensions │   │   └── embedding.tar.zst │   └── ext_index.json └── v15 ├── extensions │   └── embedding.tar.zst └── ext_index.json Note that build number cannot be part of prefix because we might need extensions from other build numbers. ext_index.json stores the control files and location of extension archives It also stores a list of public extensions and a library_index We don't need to duplicate extension.tar.zst files. We only need to upload a new one if it is updated. 
(Although currently we just upload every time anyways, hopefully will change this sometime) *access* is controlled by spec More specifically, here is an example ext_index.json { "public_extensions": [ "anon", "pg_buffercache" ], "library_index": { "anon": "anon", "pg_buffercache": "pg_buffercache" }, "extension_data": { "pg_buffercache": { "control_data": { "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true" }, "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst" }, "anon": { "control_data": { "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n" }, "archive_path": "5670669815/v14/extensions/anon.tar.zst" } } } */ use std::path::Path; use std::str; use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; use anyhow::{Context, Result, bail}; use bytes::Bytes; use compute_api::spec::RemoteExtSpec; use postgres_versioninfo::PgMajorVersion; use regex::Regex; use remote_storage::*; use reqwest::StatusCode; use tar::Archive; use tracing::info; use tracing::log::warn; use url::Url; use zstd::stream::read::Decoder; fn get_pg_config(argument: &str, pgbin: &str) -> String { // gives the result of `pg_config [argument]` // where argument is a flag like `--version` or `--sharedir` let pgconfig = pgbin .strip_suffix("postgres") .expect("bad pgbin") .to_owned() + "/pg_config"; let config_output = std::process::Command::new(pgconfig) .arg(argument) .output() .expect("pg_config error"); std::str::from_utf8(&config_output.stdout) .expect("pg_config error") .trim() .to_string() } pub fn get_pg_version(pgbin: &str) -> PgMajorVersion { // pg_config --version returns a (platform specific) 
human readable string // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc. let human_version = get_pg_config("--version", pgbin); parse_pg_version(&human_version) } pub fn get_pg_version_string(pgbin: &str) -> String { get_pg_version(pgbin).v_str() } fn parse_pg_version(human_version: &str) -> PgMajorVersion { use PgMajorVersion::*; // Normal releases have version strings like "PostgreSQL 15.4". But there // are also pre-release versions like "PostgreSQL 17devel" or "PostgreSQL // 16beta2" or "PostgreSQL 17rc1". And with the --with-extra-version // configure option, you can tack any string to the version number, // e.g. "PostgreSQL 15.4foobar". match Regex::new(r"^PostgreSQL (?\d+).+") .unwrap() .captures(human_version) { Some(captures) if captures.len() == 2 => match &captures["major"] { "14" => return PG14, "15" => return PG15, "16" => return PG16, "17" => return PG17, _ => {} }, _ => {} } panic!("Unsuported postgres version {human_version}"); } // download the archive for a given extension, // unzip it, and place files in the appropriate locations (share/lib) pub async fn download_extension( ext_name: &str, ext_path: &RemotePath, remote_ext_base_url: &Url, pgbin: &str, ) -> Result { info!("Download extension {:?} from {:?}", ext_name, ext_path); // TODO add retry logic let download_buffer = match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await { Ok(buffer) => buffer, Err(error_message) => { return Err(anyhow::anyhow!( "error downloading extension {:?}: {:?}", ext_name, error_message )); } }; let download_size = download_buffer.len() as u64; info!("Download size {:?}", download_size); // it's unclear whether it is more performant to decompress into memory or not // TODO: decompressing into memory can be avoided let decoder = Decoder::new(download_buffer.as_ref())?; let mut archive = Archive::new(decoder); let unzip_dest = pgbin .strip_suffix("/bin/postgres") .expect("bad pgbin") .to_string() + "/download_extensions"; 
archive.unpack(&unzip_dest)?; info!("Download + unzip {:?} completed successfully", &ext_path); let sharedir_paths = ( unzip_dest.to_string() + "/share/extension", Path::new(&get_pg_config("--sharedir", pgbin)).join("extension"), ); let libdir_paths = ( unzip_dest.to_string() + "/lib", Path::new(&get_pg_config("--pkglibdir", pgbin)).to_path_buf(), ); // move contents of the libdir / sharedir in unzipped archive to the correct local paths for paths in [sharedir_paths, libdir_paths] { let (zip_dir, real_dir) = paths; let dir = match std::fs::read_dir(&zip_dir) { Ok(dir) => dir, Err(e) => match e.kind() { // In the event of a SQL-only extension, there would be nothing // to move from the lib/ directory, so note that in the log and // move on. std::io::ErrorKind::NotFound => { info!("nothing to move from {}", zip_dir); continue; } _ => return Err(anyhow::anyhow!(e)), }, }; info!("mv {zip_dir:?}/* {real_dir:?}"); for file in dir { let old_file = file?.path(); let new_file = Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?); info!("moving {old_file:?} to {new_file:?}"); // extension download failed: Directory not empty (os error 39) match std::fs::rename(old_file, new_file) { Ok(()) => info!("move succeeded"), Err(e) => { warn!("move failed, probably because the extension already exists: {e}") } } } } info!("done moving extension {ext_name}"); Ok(download_size) } // Create extension control files from spec pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension"); for (ext_name, ext_data) in remote_extensions.extension_data.iter() { // Check if extension is present in public or custom. // If not, then it is not allowed to be used by this compute. 
if let Some(public_extensions) = &remote_extensions.public_extensions { if !public_extensions.contains(ext_name) { if let Some(custom_extensions) = &remote_extensions.custom_extensions { if !custom_extensions.contains(ext_name) { continue; // skip this extension, it is not allowed } } } } for (control_name, control_content) in &ext_data.control_data { let control_path = local_sharedir.join(control_name); if !control_path.exists() { info!("writing file {:?}{:?}", control_path, control_content); std::fs::write(control_path, control_content).unwrap(); } else { warn!( "control file {:?} exists both locally and remotely. ignoring the remote version.", control_path ); } } } } // Do request to extension storage proxy, e.g., // curl http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/latest/v15/extensions/anon.tar.zst // using HTTP GET and return the response body as bytes. async fn download_extension_tar(remote_ext_base_url: &Url, ext_path: &str) -> Result { let uri = remote_ext_base_url.join(ext_path).with_context(|| { format!( "failed to create the remote extension URI for {ext_path} using {remote_ext_base_url}" ) })?; let filename = Path::new(ext_path) .file_name() .unwrap_or_else(|| std::ffi::OsStr::new("unknown")) .to_str() .unwrap_or("unknown") .to_string(); info!("Downloading extension file '{}' from uri {}", filename, uri); match do_extension_server_request(uri).await { Ok(resp) => { info!("Successfully downloaded remote extension data {}", ext_path); REMOTE_EXT_REQUESTS_TOTAL .with_label_values(&[&StatusCode::OK.to_string(), &filename]) .inc(); Ok(resp) } Err((msg, status)) => { REMOTE_EXT_REQUESTS_TOTAL .with_label_values(&[&status, &filename]) .inc(); bail!(msg); } } } // Do a single remote extensions server request. // Return result or (error message + stringified status code) in case of any failures. 
// Performs one HTTP GET against `uri`.
//
// On a 200 OK response the body bytes are returned. Every failure is reported as
// a pair of (human readable message, status label):
//   - the request itself failed: label is UNKNOWN_HTTP_STATUS;
//   - 200 OK but the body could not be read: label is the 200 status;
//   - 503 Service Unavailable: dedicated "temporarily unavailable" message;
//   - any other status: "unexpected ... status code" message.
async fn do_extension_server_request(uri: Url) -> Result {
    let resp = reqwest::get(uri).await.map_err(|e| {
        (
            format!("could not perform remote extensions server request: {e:?}"),
            UNKNOWN_HTTP_STATUS.to_string(),
        )
    })?;
    let status = resp.status();

    match status {
        StatusCode::OK => match resp.bytes().await {
            Ok(resp) => Ok(resp),
            Err(e) => Err((
                format!("could not read remote extensions server response: {e:?}"),
                // It's fine to return and report error with status as 200 OK,
                // because we still failed to read the response.
                status.to_string(),
            )),
        },
        StatusCode::SERVICE_UNAVAILABLE => Err((
            "remote extensions server is temporarily unavailable".to_string(),
            status.to_string(),
        )),
        _ => Err((
            format!("unexpected remote extensions server response status code: {status}"),
            status.to_string(),
        )),
    }
}

#[cfg(test)]
mod tests {
    use super::parse_pg_version;

    // Release, distribution-suffixed and pre-release version strings all map to
    // their major version.
    #[test]
    fn test_parse_pg_version() {
        use postgres_versioninfo::PgMajorVersion::*;
        assert_eq!(parse_pg_version("PostgreSQL 15.4"), PG15);
        assert_eq!(parse_pg_version("PostgreSQL 15.14"), PG15);
        assert_eq!(
            parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
            PG15
        );

        assert_eq!(parse_pg_version("PostgreSQL 14.15"), PG14);
        assert_eq!(parse_pg_version("PostgreSQL 14.0"), PG14);
        assert_eq!(
            parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
            PG14
        );

        assert_eq!(parse_pg_version("PostgreSQL 16devel"), PG16);
        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), PG16);
        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), PG16);
        assert_eq!(parse_pg_version("PostgreSQL 16extra"), PG16);
    }

    // A well-formed string with a major version outside the supported set panics.
    #[test]
    #[should_panic]
    fn test_parse_pg_unsupported_version() {
        parse_pg_version("PostgreSQL 13.14");
    }

    // A major version with nothing after it is treated as malformed and panics.
    #[test]
    #[should_panic]
    fn test_parse_pg_incorrect_version_format() {
        parse_pg_version("PostgreSQL 14");
    }
}

================================================
FILE: compute_tools/src/hadron_metrics.rs
================================================
use metrics::{
    IntCounter, IntGaugeVec, core::Collector, proto::MetricFamily, register_int_counter,
    register_int_gauge_vec,
};
use once_cell::sync::Lazy;

// Counter keeping track of the number of PageStream request errors reported by Postgres.
// An error is registered every time Postgres calls compute_ctl's /refresh_configuration API.
// Postgres will invoke this API if it detected trouble with PageStream requests (get_page@lsn,
// get_base_backup, etc.) it sends to any pageserver. An increase in this counter value typically
// indicates Postgres downtime, as PageStream requests are critical for Postgres to function.
pub static POSTGRES_PAGESTREAM_REQUEST_ERRORS: Lazy = Lazy::new(|| {
    register_int_counter!(
        "pg_cctl_pagestream_request_errors_total",
        "Number of PageStream request errors reported by the postgres process"
    )
    .expect("failed to define a metric")
});

// Counter keeping track of the number of compute configuration errors due to Postgres statement
// timeouts. An error is registered every time `ComputeNode::reconfigure()` fails due to Postgres
// error code 57014 (query cancelled). This statement timeout typically occurs when Postgres is
// stuck in a problematic retry loop because the PS is rejecting its connection requests (usually
// due to PG pointing at the wrong PS). We should investigate the root cause when this counter
// value increases by checking PG and PS logs.
pub static COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS: Lazy = Lazy::new(|| {
    register_int_counter!(
        "pg_cctl_configure_statement_timeout_errors_total",
        "Number of compute configuration errors due to Postgres statement timeouts."
    )
    .expect("failed to define a metric")
});

// Gauge describing whether the compute node is attached (1 if attached, per the
// metric help text), labelled by compute id, instance id, tenant id and timeline id.
pub static COMPUTE_ATTACHED: Lazy = Lazy::new(|| {
    register_int_gauge_vec!(
        "pg_cctl_attached",
        "Compute node attached status (1 if attached)",
        &[
            "pg_compute_id",
            "pg_instance_id",
            "tenant_id",
            "timeline_id"
        ]
    )
    .expect("failed to define a metric")
});

// Gather the current metric families of all three metrics defined in this file.
pub fn collect() -> Vec {
    let mut metrics = Vec::new();
    metrics.extend(POSTGRES_PAGESTREAM_REQUEST_ERRORS.collect());
    metrics.extend(COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS.collect());
    metrics.extend(COMPUTE_ATTACHED.collect());
    metrics
}

// Force the lazily-initialized statics so the metrics get registered right away
// instead of on first use.
pub fn initialize_metrics() {
    Lazy::force(&POSTGRES_PAGESTREAM_REQUEST_ERRORS);
    Lazy::force(&COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS);
    Lazy::force(&COMPUTE_ATTACHED);
}

================================================
FILE: compute_tools/src/http/extract/json.rs
================================================
use std::ops::{Deref, DerefMut};

use axum::extract::rejection::JsonRejection;
use axum::extract::{FromRequest, Request};
use compute_api::responses::GenericAPIError;
use http::StatusCode;

/// Custom `Json` extractor, so that we can format errors into
/// `JsonResponse`.
#[derive(Debug, Clone, Copy, Default)] pub(crate) struct Json(pub T); impl FromRequest for Json where axum::Json: FromRequest, S: Send + Sync, { type Rejection = (StatusCode, axum::Json); async fn from_request(req: Request, state: &S) -> Result { match axum::Json::::from_request(req, state).await { Ok(value) => Ok(Self(value.0)), Err(rejection) => Err(( rejection.status(), axum::Json(GenericAPIError { error: rejection.body_text().to_lowercase(), }), )), } } } impl Deref for Json { type Target = T; fn deref(&self) -> &Self::Target { &self.0 } } impl DerefMut for Json { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } } ================================================ FILE: compute_tools/src/http/extract/mod.rs ================================================ pub(crate) mod json; pub(crate) mod path; pub(crate) mod query; pub(crate) mod request_id; pub(crate) use json::Json; pub(crate) use path::Path; pub(crate) use query::Query; #[allow(unused)] pub(crate) use request_id::RequestId; ================================================ FILE: compute_tools/src/http/extract/path.rs ================================================ use std::ops::{Deref, DerefMut}; use axum::extract::FromRequestParts; use axum::extract::rejection::PathRejection; use compute_api::responses::GenericAPIError; use http::StatusCode; use http::request::Parts; /// Custom `Path` extractor, so that we can format errors into /// `JsonResponse`. 
#[derive(Debug, Clone, Copy, Default)] pub(crate) struct Path(pub T); impl FromRequestParts for Path where axum::extract::Path: FromRequestParts, S: Send + Sync, { type Rejection = (StatusCode, axum::Json); async fn from_request_parts(parts: &mut Parts, state: &S) -> Result { match axum::extract::Path::::from_request_parts(parts, state).await { Ok(value) => Ok(Self(value.0)), Err(rejection) => Err(( rejection.status(), axum::Json(GenericAPIError { error: rejection.body_text().to_ascii_lowercase(), }), )), } } } impl Deref for Path { type Target = T; fn deref(&self) -> &Self::Target { &self.0 } } impl DerefMut for Path { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } } ================================================ FILE: compute_tools/src/http/extract/query.rs ================================================ use std::ops::{Deref, DerefMut}; use axum::extract::FromRequestParts; use axum::extract::rejection::QueryRejection; use compute_api::responses::GenericAPIError; use http::StatusCode; use http::request::Parts; /// Custom `Query` extractor, so that we can format errors into /// `JsonResponse`. 
#[derive(Debug, Clone, Copy, Default)] pub(crate) struct Query(pub T); impl FromRequestParts for Query where axum::extract::Query: FromRequestParts, S: Send + Sync, { type Rejection = (StatusCode, axum::Json); async fn from_request_parts(parts: &mut Parts, state: &S) -> Result { match axum::extract::Query::::from_request_parts(parts, state).await { Ok(value) => Ok(Self(value.0)), Err(rejection) => Err(( rejection.status(), axum::Json(GenericAPIError { error: rejection.body_text().to_ascii_lowercase(), }), )), } } } impl Deref for Query { type Target = T; fn deref(&self) -> &Self::Target { &self.0 } } impl DerefMut for Query { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } } ================================================ FILE: compute_tools/src/http/extract/request_id.rs ================================================ use std::{ fmt::Display, ops::{Deref, DerefMut}, }; use axum::{extract::FromRequestParts, response::IntoResponse}; use http::{StatusCode, request::Parts}; use crate::http::{JsonResponse, headers::X_REQUEST_ID}; /// Extract the request ID from the `X-Request-Id` header. #[derive(Debug, Clone, Default)] pub(crate) struct RequestId(pub String); #[derive(Debug)] /// Rejection used for [`RequestId`]. /// /// Contains one variant for each way the [`RequestId`] extractor can /// fail. pub(crate) enum RequestIdRejection { /// The request is missing the header. MissingRequestId, /// The value of the header is invalid UTF-8. 
InvalidUtf8, } impl RequestIdRejection { pub fn status(&self) -> StatusCode { match self { RequestIdRejection::MissingRequestId => StatusCode::INTERNAL_SERVER_ERROR, RequestIdRejection::InvalidUtf8 => StatusCode::BAD_REQUEST, } } pub fn message(&self) -> String { match self { RequestIdRejection::MissingRequestId => "request ID is missing", RequestIdRejection::InvalidUtf8 => "request ID is invalid UTF-8", } .to_string() } } impl IntoResponse for RequestIdRejection { fn into_response(self) -> axum::response::Response { JsonResponse::error(self.status(), self.message()) } } impl FromRequestParts for RequestId where S: Send + Sync, { type Rejection = RequestIdRejection; async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result { match parts.headers.get(X_REQUEST_ID) { Some(value) => match value.to_str() { Ok(request_id) => Ok(Self(request_id.to_string())), Err(_) => Err(RequestIdRejection::InvalidUtf8), }, None => Err(RequestIdRejection::MissingRequestId), } } } impl Deref for RequestId { type Target = String; fn deref(&self) -> &Self::Target { &self.0 } } impl DerefMut for RequestId { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } } impl Display for RequestId { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.write_str(&self.0) } } ================================================ FILE: compute_tools/src/http/headers.rs ================================================ /// Constant for `X-Request-Id` header. 
pub const X_REQUEST_ID: &str = "x-request-id"; ================================================ FILE: compute_tools/src/http/middleware/authorize.rs ================================================ use anyhow::{Result, anyhow}; use axum::{RequestExt, body::Body}; use axum_extra::{ TypedHeader, headers::{Authorization, authorization::Bearer}, }; use compute_api::requests::{COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope}; use futures::future::BoxFuture; use http::{Request, Response, StatusCode}; use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet}; use tower_http::auth::AsyncAuthorizeRequest; use tracing::{debug, warn}; use crate::http::JsonResponse; #[derive(Clone, Debug)] pub(in crate::http) struct Authorize { compute_id: String, // BEGIN HADRON // Hadron instance ID. Only set if it's a Lakebase V1 a.k.a. Hadron instance. instance_id: Option, // END HADRON jwks: JwkSet, validation: Validation, } impl Authorize { pub fn new(compute_id: String, instance_id: Option, jwks: JwkSet) -> Self { let mut validation = Validation::new(Algorithm::EdDSA); // BEGIN HADRON let use_rsa = jwks.keys.iter().any(|jwk| { jwk.common .key_algorithm .is_some_and(|alg| alg == jsonwebtoken::jwk::KeyAlgorithm::RS256) }); if use_rsa { validation = Validation::new(Algorithm::RS256); } // END HADRON validation.validate_exp = true; // Unused by the control plane validation.validate_nbf = false; // Unused by the control plane validation.validate_aud = false; validation.set_audience(&[COMPUTE_AUDIENCE]); // Nothing is currently required validation.set_required_spec_claims(&[] as &[&str; 0]); Self { compute_id, instance_id, jwks, validation, } } } impl AsyncAuthorizeRequest for Authorize { type RequestBody = Body; type ResponseBody = Body; type Future = BoxFuture<'static, Result, Response>>; fn authorize(&mut self, mut request: Request) -> Self::Future { let compute_id = self.compute_id.clone(); let is_hadron_instance = self.instance_id.is_some(); let jwks = 
self.jwks.clone(); let validation = self.validation.clone(); Box::pin(async move { // BEGIN HADRON // In Hadron deployments the "external" HTTP endpoint on compute_ctl can only be // accessed by trusted components (enforced by dblet network policy), so we can bypass // all auth here. if is_hadron_instance { return Ok(request); } // END HADRON let TypedHeader(Authorization(bearer)) = request .extract_parts::>>() .await .map_err(|_| { JsonResponse::error(StatusCode::BAD_REQUEST, "invalid authorization token") })?; let data = match Self::verify(&jwks, bearer.token(), &validation) { Ok(claims) => claims, Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)), }; match data.claims.scope { // TODO: We should validate audience for every token, but // instead of this ad-hoc validation, we should turn // [`Validation::validate_aud`] on. This is merely a stopgap // while we roll out `aud` deployment. We return a 401 // Unauthorized because when we eventually do use // [`Validation`], we will hit the above `Err` match arm which // returns 401 Unauthorized. 
Some(ComputeClaimsScope::Admin) => { let Some(ref audience) = data.claims.audience else { return Err(JsonResponse::error( StatusCode::UNAUTHORIZED, "missing audience in authorization token claims", )); }; if !audience.iter().any(|a| a == COMPUTE_AUDIENCE) { return Err(JsonResponse::error( StatusCode::UNAUTHORIZED, "invalid audience in authorization token claims", )); } } // If the scope is not [`ComputeClaimsScope::Admin`], then we // must validate the compute_id _ => { let Some(ref claimed_compute_id) = data.claims.compute_id else { return Err(JsonResponse::error( StatusCode::FORBIDDEN, "missing compute_id in authorization token claims", )); }; if *claimed_compute_id != compute_id { return Err(JsonResponse::error( StatusCode::FORBIDDEN, "invalid compute ID in authorization token claims", )); } } } // Make claims available to any subsequent middleware or request // handlers request.extensions_mut().insert(data.claims); Ok(request) }) } } impl Authorize { /// Verify the token using the JSON Web Key set and return the token data. 
fn verify( jwks: &JwkSet, token: &str, validation: &Validation, ) -> Result> { debug_assert!(!jwks.keys.is_empty()); debug!("verifying token {}", token); for jwk in jwks.keys.iter() { let decoding_key = match DecodingKey::from_jwk(jwk) { Ok(key) => key, Err(e) => { warn!( "failed to construct decoding key from {}: {}", jwk.common.key_id.as_ref().unwrap(), e ); continue; } }; match jsonwebtoken::decode::(token, &decoding_key, validation) { Ok(data) => return Ok(data), Err(e) => { warn!( "failed to decode authorization token using {}: {}", jwk.common.key_id.as_ref().unwrap(), e ); continue; } } } Err(anyhow!("failed to verify authorization token")) } } ================================================ FILE: compute_tools/src/http/middleware/mod.rs ================================================ pub(in crate::http) mod authorize; pub(in crate::http) mod request_id; ================================================ FILE: compute_tools/src/http/middleware/request_id.rs ================================================ use axum::{extract::Request, middleware::Next, response::Response}; use uuid::Uuid; use crate::http::headers::X_REQUEST_ID; /// This middleware function allows compute_ctl to generate its own request ID /// if one isn't supplied. The control plane will always send one as a UUID. The /// neon Postgres extension on the other hand does not send one. 
pub async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
    let headers = request.headers_mut();
    // Only generate an ID when the caller did not provide one; an existing
    // header value is left untouched.
    if !headers.contains_key(X_REQUEST_ID) {
        headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
    }

    next.run(request).await
}

================================================
FILE: compute_tools/src/http/mod.rs
================================================
use axum::body::Body;
use axum::response::Response;
use compute_api::responses::{ComputeStatus, GenericAPIError};
use http::StatusCode;
use http::header::CONTENT_TYPE;
use serde::Serialize;
use tracing::error;

mod extract;
mod headers;
mod middleware;
mod routes;
pub mod server;

/// Convenience response builder for JSON responses
struct JsonResponse;

impl JsonResponse {
    /// Helper for actually creating a response
    ///
    /// Serializes `body` to JSON and sets the `Content-Type` header to
    /// `application/json`. Panics if serialization or response construction
    /// fails.
    fn create_response(code: StatusCode, body: impl Serialize) -> Response {
        Response::builder()
            .status(code)
            .header(CONTENT_TYPE.as_str(), "application/json")
            .body(Body::from(serde_json::to_string(&body).unwrap()))
            .unwrap()
    }

    /// Create a successful response
    ///
    /// Panics if `code` is not in the 2xx range.
    pub(self) fn success(code: StatusCode, body: impl Serialize) -> Response {
        assert!({
            let code = code.as_u16();
            (200..300).contains(&code)
        });

        Self::create_response(code, body)
    }

    /// Create an error response
    ///
    /// The error text is logged at error level and returned to the caller in
    /// a [`GenericAPIError`] body. Panics if `code` is below 400.
    pub(self) fn error(code: StatusCode, error: impl ToString) -> Response {
        assert!(code.as_u16() >= 400);

        let message = error.to_string();
        error!(message);

        Self::create_response(code, &GenericAPIError { error: message })
    }

    /// Create an error response related to the compute being in an invalid state
    ///
    /// Always uses 412 Precondition Failed.
    pub(self) fn invalid_status(status: ComputeStatus) -> Response {
        Self::error(
            StatusCode::PRECONDITION_FAILED,
            format!("invalid compute status: {status}"),
        )
    }
}

================================================
FILE: compute_tools/src/http/openapi_spec.yaml
================================================
openapi: "3.0.2" info: title: Compute node control API version: "1.0" servers: - url:
"http://localhost:3080" paths: /status: get: tags: - Info summary: Get compute node internal status. description: "" operationId: getComputeStatus responses: 200: description: ComputeState content: application/json: schema: $ref: "#/components/schemas/ComputeState" /metrics.json: get: tags: - Info summary: Get compute node startup metrics in JSON format. description: "" operationId: getComputeMetricsJSON responses: 200: description: ComputeMetrics content: application/json: schema: $ref: "#/components/schemas/ComputeMetrics" /metrics: get: tags: - Info summary: Get compute node metrics in text format. description: "" operationId: getComputeMetrics responses: 200: description: ComputeMetrics content: text/plain: schema: type: string description: Metrics in text format. /insights: get: tags: - Info summary: Get current compute insights in JSON format. description: | Note, that this doesn't include any historical data. operationId: getComputeInsights responses: 200: description: Compute insights content: application/json: schema: $ref: "#/components/schemas/ComputeInsights" /dbs_and_roles: get: tags: - Info summary: Get databases and roles in the catalog. 
description: "" operationId: getDbsAndRoles responses: 200: description: Compute schema objects content: application/json: schema: $ref: "#/components/schemas/DbsAndRoles" /promote: post: tags: - Promotion summary: Promote secondary replica to primary description: "" operationId: promoteReplica requestBody: description: Promote requests data required: true content: application/json: schema: $ref: "#/components/schemas/ComputeSchemaWithLsn" responses: 200: description: Promote succeeded or wasn't started content: application/json: schema: $ref: "#/components/schemas/PromoteState" 500: description: Promote failed content: application/json: schema: $ref: "#/components/schemas/PromoteState" /lfc/prewarm: post: summary: Request LFC Prewarm parameters: - name: from_endpoint in: query schema: type: string description: "" operationId: lfcPrewarm responses: 202: description: LFC prewarm started 429: description: LFC prewarm ongoing get: tags: - Prewarm summary: Get LFC prewarm state description: "" operationId: getLfcPrewarmState responses: 200: description: Prewarm state content: application/json: schema: $ref: "#/components/schemas/LfcPrewarmState" delete: tags: - Prewarm summary: Cancel ongoing LFC prewarm description: "" operationId: cancelLfcPrewarm responses: 202: description: Prewarm cancelled /lfc/offload: post: summary: Request LFC offload description: "" operationId: lfcOffload responses: 202: description: LFC offload started 429: description: LFC offload ongoing get: tags: - Prewarm summary: Get LFC offloading state description: "" operationId: getLfcOffloadState responses: 200: description: Offload state content: application/json: schema: $ref: "#/components/schemas/LfcOffloadState" /database_schema: get: tags: - Info summary: Get schema dump parameters: - name: database in: query description: Database name to dump. required: true schema: type: string example: "postgres" description: Get schema dump in SQL format. 
operationId: getDatabaseSchema responses: 200: description: Schema dump content: text/plain: schema: type: string description: Schema dump in SQL format. 404: description: Non existing database. content: application/json: schema: $ref: "#/components/schemas/GenericError" /grants: post: tags: - Grants summary: Apply grants to the database. description: "" operationId: setRoleGrants requestBody: description: Grants request. required: true content: application/json: schema: $ref: "#/components/schemas/SetRoleGrantsRequest" responses: 200: description: Grants applied. content: application/json: schema: $ref: "#/components/schemas/SetRoleGrantsResponse" 412: description: | Compute is not in the right state for processing the request. content: application/json: schema: $ref: "#/components/schemas/GenericError" 500: description: Error occurred during grants application. content: application/json: schema: $ref: "#/components/schemas/GenericError" /check_writability: post: tags: - Check summary: Check that we can write new data on this compute. description: "" operationId: checkComputeWritability responses: 200: description: Check result content: text/plain: schema: type: string description: Error text or 'true' if check passed. example: "true" /extensions: post: tags: - Extensions summary: Install extension if possible. description: "" operationId: installExtension requestBody: description: Extension name and database to install it to. required: true content: application/json: schema: $ref: "#/components/schemas/ExtensionInstallRequest" responses: 200: description: Result from extension installation content: application/json: schema: $ref: "#/components/schemas/ExtensionInstallResult" 412: description: | Compute is in the wrong state for processing the request. content: application/json: schema: $ref: "#/components/schemas/GenericError" 500: description: Error during extension installation. 
content: application/json: schema: $ref: "#/components/schemas/GenericError" /configure: post: tags: - Configure summary: Perform compute node configuration. description: | This is a blocking API endpoint, i.e. it blocks waiting until compute is finished configuration and is in `Running` state. Optional non-blocking mode could be added later. operationId: configureCompute requestBody: description: Configuration request. required: true content: application/json: schema: $ref: "#/components/schemas/ComputeSchema" responses: 200: description: Compute configuration finished. content: application/json: schema: $ref: "#/components/schemas/ComputeState" 400: description: Provided spec is invalid. content: application/json: schema: $ref: "#/components/schemas/GenericError" 412: description: | It's not possible to do live-configuration of the compute. It's either in the wrong state, or compute doesn't use pull mode of configuration. content: application/json: schema: $ref: "#/components/schemas/GenericError" 500: description: | Compute configuration request was processed, but error occurred. Compute will likely shutdown soon. content: application/json: schema: $ref: "#/components/schemas/GenericError" /extension_server: post: tags: - Extension summary: Download extension from S3 to local folder. description: "" operationId: downloadExtension responses: 200: description: Extension downloaded content: text/plain: schema: type: string description: Error text or 'OK' if download succeeded. example: "OK" 400: description: Request is invalid. content: application/json: schema: $ref: "#/components/schemas/GenericError" 500: description: Extension download request failed. 
content: application/json: schema: $ref: "#/components/schemas/GenericError" /terminate: post: tags: - Terminate summary: Terminate Postgres and wait for it to exit description: "" operationId: terminate parameters: - name: mode in: query description: "Terminate mode: fast (wait 30s before returning) and immediate" required: false schema: type: string enum: ["fast", "immediate"] default: fast responses: 200: description: Result content: application/json: schema: $ref: "#/components/schemas/TerminateResponse" 201: description: Result if compute is already terminated content: application/json: schema: $ref: "#/components/schemas/TerminateResponse" 412: description: "wrong state" content: application/json: schema: $ref: "#/components/schemas/GenericError" 500: description: "Unexpected error" content: application/json: schema: $ref: "#/components/schemas/GenericError" components: securitySchemes: JWT: type: http scheme: bearer bearerFormat: JWT schemas: ComputeMetrics: type: object description: Compute startup metrics. 
required: - wait_for_spec_ms - sync_safekeepers_ms - basebackup_ms - config_ms - total_startup_ms properties: wait_for_spec_ms: type: integer sync_safekeepers_ms: type: integer basebackup_ms: type: integer config_ms: type: integer total_startup_ms: type: integer DbsAndRoles: type: object description: Databases and Roles required: - roles - databases properties: roles: type: array items: $ref: "#/components/schemas/Role" databases: type: array items: $ref: "#/components/schemas/Database" Database: type: object description: Database required: - name - owner - restrict_conn - invalid properties: name: type: string owner: type: string options: type: array items: $ref: "#/components/schemas/GenericOption" restrict_conn: type: boolean invalid: type: boolean Role: type: object description: Role required: - name properties: name: type: string encrypted_password: type: string options: type: array items: $ref: "#/components/schemas/GenericOption" GenericOption: type: object description: Schema Generic option required: - name - vartype properties: name: type: string value: type: string vartype: type: string ComputeState: type: object required: - start_time - status properties: start_time: type: string description: | Time when compute was started. If initially compute was started in the `empty` state and then provided with valid spec, `start_time` will be reset to the moment, when spec was received. example: "2022-10-12T07:20:50.52Z" status: $ref: "#/components/schemas/ComputeStatus" last_active: type: string description: | The last detected compute activity timestamp in UTC and RFC3339 format. It could be empty if compute was never used by user since start. example: "2022-10-12T07:20:50.52Z" error: type: string description: Text of the error during compute startup or reconfiguration, if any. example: "" tenant: type: string description: Identifier of the current tenant served by compute node, if any. 
example: c9269c359e9a199fad1ea0981246a78f timeline: type: string description: Identifier of the current timeline served by compute node, if any. example: ece7de74d4b8cbe5433a68ce4d1b97b4 ComputeInsights: type: object properties: pg_stat_statements: description: Contains raw output from pg_stat_statements in JSON format. type: array items: type: object ComputeStatus: type: string enum: - empty - configuration_pending - init - running - configuration - failed - termination_pending_fast - termination_pending_immediate - terminated example: running ExtensionInstallRequest: type: object required: - extension - database - version properties: extension: type: string description: Extension name. example: "pg_session_jwt" version: type: string description: Version of the extension. example: "1.0.0" database: type: string description: Database name. example: "neondb" ExtensionInstallResult: type: object properties: extension: description: Name of the extension. type: string example: "pg_session_jwt" version: description: Version of the extension. 
type: string example: "1.0.0" ComputeSchema: type: object required: - spec properties: spec: type: object ComputeSchemaWithLsn: type: object required: - spec - wal_flush_lsn properties: spec: $ref: "#/components/schemas/ComputeState" wal_flush_lsn: type: string description: "last WAL flush LSN" example: "0/028F10D8" LfcPrewarmState: type: object required: - status properties: status: description: LFC prewarm status enum: [not_prewarmed, prewarming, completed, failed, skipped] type: string error: description: LFC prewarm error, if any type: string total: description: Total pages processed type: integer prewarmed: description: Total pages prewarmed type: integer skipped: description: Pages processed but not prewarmed type: integer state_download_time_ms: description: Time it takes to download LFC state to compute type: integer uncompress_time_ms: description: Time it takes to uncompress LFC state type: integer prewarm_time_ms: description: Time it takes to prewarm LFC state in Postgres type: integer LfcOffloadState: type: object required: - status properties: status: description: LFC offload status enum: [not_offloaded, offloading, completed, skipped, failed] type: string error: description: LFC offload error, if any type: string state_query_time_ms: description: Time it takes to get LFC state from Postgres type: integer compress_time_ms: description: Time it takes to compress LFC state type: integer state_upload_time_ms: description: Time it takes to upload LFC state to endpoint storage type: integer PromoteState: type: object required: - status properties: status: description: Promote result enum: [not_promoted, completed, failed] type: string error: description: Promote error, if any type: string lsn_wait_time_ms: description: Time it takes for secondary to catch up with primary WAL flush LSN type: integer pg_promote_time_ms: description: Time it takes to call pg_promote on secondary type: integer reconfigure_time_ms: description: Time it takes to reconfigure 
promoted secondary type: integer SetRoleGrantsRequest: type: object required: - database - schema - privileges - role properties: database: type: string description: Database name. example: "neondb" schema: type: string description: Schema name. example: "public" privileges: type: array items: type: string description: List of privileges to set. example: ["SELECT", "INSERT"] role: type: string description: Role name. example: "neon" TerminateResponse: type: object required: - lsn properties: lsn: type: string nullable: true description: "last WAL flush LSN" example: "0/028F10D8" SetRoleGrantsResponse: type: object required: - database - schema - privileges - role properties: database: type: string description: Database name. example: "neondb" schema: type: string description: Schema name. example: "public" privileges: type: array items: type: string description: List of privileges set. example: ["SELECT", "INSERT"] role: type: string description: Role name. example: "neon" # # Errors # GenericError: type: object required: - error properties: error: type: string security: - JWT: [] ================================================ FILE: compute_tools/src/http/routes/check_writability.rs ================================================ use std::sync::Arc; use axum::extract::State; use axum::response::Response; use compute_api::responses::ComputeStatus; use http::StatusCode; use crate::checker::check_writability; use crate::compute::ComputeNode; use crate::http::JsonResponse; /// Check that the compute is currently running. 
pub(in crate::http) async fn is_writable(State(compute): State>) -> Response { let status = compute.get_status(); if status != ComputeStatus::Running { return JsonResponse::invalid_status(status); } match check_writability(&compute).await { Ok(_) => JsonResponse::success(StatusCode::OK, true), Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e), } } ================================================ FILE: compute_tools/src/http/routes/configure.rs ================================================ use std::sync::Arc; use axum::extract::State; use axum::response::Response; use compute_api::requests::ConfigurationRequest; use compute_api::responses::{ComputeStatus, ComputeStatusResponse}; use http::StatusCode; use tokio::task; use tracing::info; use crate::compute::{ComputeNode, ParsedSpec}; use crate::http::JsonResponse; use crate::http::extract::Json; // Accept spec in JSON format and request compute configuration. If anything // goes wrong after we set the compute status to `ConfigurationPending` and // update compute state with new spec, we basically leave compute in the // potentially wrong state. That said, it's control-plane's responsibility to // watch compute state after reconfiguration request and to clean restart in // case of errors. pub(in crate::http) async fn configure( State(compute): State>, request: Json, ) -> Response { let pspec = match ParsedSpec::try_from(request.0.spec) { Ok(p) => p, Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e), }; // XXX: wrap state update under lock in a code block. 
Otherwise, we will try // to `Send` `mut state` into the spawned thread bellow, which will cause // the following rustc error: // // error: future cannot be sent between threads safely { let mut state = compute.state.lock().unwrap(); if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) { return JsonResponse::invalid_status(state.status); } // Pass the tracing span to the main thread that performs the startup, // so that the start_compute operation is considered a child of this // configure request for tracing purposes. state.startup_span = Some(tracing::Span::current()); if compute.params.lakebase_mode { ComputeNode::set_spec(&compute.params, &mut state, pspec); } else { state.pspec = Some(pspec); } state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed); drop(state); } // Spawn a blocking thread to wait for compute to become Running. This is // needed to not block the main pool of workers and to be able to serve // other requests while some particular request is waiting for compute to // finish configuration. let c = compute.clone(); let completed = task::spawn_blocking(move || { let mut state = c.state.lock().unwrap(); while state.status != ComputeStatus::Running { state = c.state_changed.wait(state).unwrap(); info!( "waiting for compute to become {}, current status: {}", ComputeStatus::Running, state.status ); if state.status == ComputeStatus::Failed { let err = state.error.as_ref().map_or("unknown error", |x| x); let msg = format!("compute configuration failed: {err:?}"); return Err(msg); } } Ok(()) }) .await .unwrap(); if let Err(e) = completed { return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e); } // Return current compute state if everything went well. 
let state = compute.state.lock().unwrap().clone(); let body = ComputeStatusResponse::from(&state); JsonResponse::success(StatusCode::OK, body) } ================================================ FILE: compute_tools/src/http/routes/database_schema.rs ================================================ use std::sync::Arc; use axum::body::Body; use axum::extract::State; use axum::response::Response; use http::StatusCode; use http::header::CONTENT_TYPE; use serde::Deserialize; use crate::catalog::{SchemaDumpError, get_database_schema}; use crate::compute::ComputeNode; use crate::http::JsonResponse; use crate::http::extract::Query; #[derive(Debug, Clone, Deserialize)] pub(in crate::http) struct DatabaseSchemaParams { database: String, } /// Get a schema dump of the requested database. pub(in crate::http) async fn get_schema_dump( params: Query, State(compute): State>, ) -> Response { match get_database_schema(&compute, ¶ms.database).await { Ok(schema) => Response::builder() .status(StatusCode::OK) .header(CONTENT_TYPE.as_str(), "application/json") .body(Body::from_stream(schema)) .unwrap(), Err(SchemaDumpError::DatabaseDoesNotExist) => { JsonResponse::error(StatusCode::NOT_FOUND, SchemaDumpError::DatabaseDoesNotExist) } Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e), } } ================================================ FILE: compute_tools/src/http/routes/dbs_and_roles.rs ================================================ use std::sync::Arc; use axum::extract::State; use axum::response::Response; use http::StatusCode; use crate::catalog::get_dbs_and_roles; use crate::compute::ComputeNode; use crate::http::JsonResponse; /// Get the databases and roles from the compute. 
pub(in crate::http) async fn get_catalog_objects( State(compute): State>, ) -> Response { match get_dbs_and_roles(&compute).await { Ok(catalog_objects) => JsonResponse::success(StatusCode::OK, catalog_objects), Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e), } } ================================================ FILE: compute_tools/src/http/routes/extension_server.rs ================================================ use std::sync::Arc; use axum::extract::State; use axum::response::{IntoResponse, Response}; use http::StatusCode; use serde::Deserialize; use crate::compute::{BUILD_TAG, ComputeNode}; use crate::http::JsonResponse; use crate::http::extract::{Path, Query}; #[derive(Debug, Clone, Deserialize)] pub(in crate::http) struct ExtensionServerParams { #[serde(default)] is_library: bool, } /// Download a remote extension. pub(in crate::http) async fn download_extension( Path(filename): Path, ext_server_params: Query, State(compute): State>, ) -> Response { // Don't even try to download extensions if no remote storage is configured if compute.params.remote_ext_base_url.is_none() { return JsonResponse::error( StatusCode::PRECONDITION_FAILED, "remote storage is not configured", ); } let ext = { let state = compute.state.lock().unwrap(); let pspec = state.pspec.as_ref().unwrap(); let spec = &pspec.spec; let remote_extensions = match spec.remote_extensions.as_ref() { Some(r) => r, None => { return JsonResponse::error( StatusCode::CONFLICT, "information about remote extensions is unavailable", ); } }; remote_extensions.get_ext( &filename, ext_server_params.is_library, &BUILD_TAG, &compute.params.pgversion, ) }; match ext { Ok((ext_name, ext_path)) => match compute.download_extension(ext_name, ext_path).await { Ok(_) => StatusCode::OK.into_response(), Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e), }, Err(e) => JsonResponse::error(StatusCode::NOT_FOUND, e), } } ================================================ FILE: 
compute_tools/src/http/routes/extensions.rs
================================================
use std::sync::Arc;

use axum::extract::State;
use axum::response::Response;
use compute_api::requests::ExtensionInstallRequest;
use compute_api::responses::{ComputeStatus, ExtensionInstallResponse};
use http::StatusCode;

use crate::compute::ComputeNode;
use crate::http::JsonResponse;
use crate::http::extract::Json;

/// Install an extension.
///
/// Requires the compute to be `Running`. Responds with 201 and the installed
/// version on success, or 500 if the installation fails.
pub(in crate::http) async fn install_extension(
    State(compute): State<Arc<ComputeNode>>,
    request: Json<ExtensionInstallRequest>,
) -> Response {
    let status = compute.get_status();
    if status != ComputeStatus::Running {
        return JsonResponse::invalid_status(status);
    }

    match compute
        .install_extension(
            &request.extension,
            &request.database,
            request.version.to_string(),
        )
        .await
    {
        Ok(version) => JsonResponse::success(
            StatusCode::CREATED,
            Some(ExtensionInstallResponse {
                extension: request.extension.clone(),
                version,
            }),
        ),
        Err(e) => JsonResponse::error(
            StatusCode::INTERNAL_SERVER_ERROR,
            format!("failed to install extension: {e}"),
        ),
    }
}

================================================
FILE: compute_tools/src/http/routes/failpoints.rs
================================================
use axum::response::{IntoResponse, Response};
use http::StatusCode;
use serde::{Deserialize, Serialize};
use tracing::info;
use utils::failpoint_support::apply_failpoint;

/// Request body of the failpoints endpoint: a list of failpoint configurations.
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;

/// Information for configuring a single fail point
#[derive(Debug, Serialize, Deserialize)]
pub struct FailpointConfig {
    /// Name of the fail point
    pub name: String,
    /// List of actions to take, using the format described in `fail::cfg`
    ///
    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
    pub actions: String,
}

use crate::http::JsonResponse;
use crate::http::extract::Json;

/// Configure failpoints for testing purposes.
pub(in crate::http) async fn configure_failpoints( failpoints: Json, ) -> Response { if !fail::has_failpoints() { return JsonResponse::error( StatusCode::PRECONDITION_FAILED, "Cannot manage failpoints because neon was compiled without failpoints support", ); } for fp in &*failpoints { info!("cfg failpoint: {} {}", fp.name, fp.actions); // We recognize one extra "action" that's not natively recognized // by the failpoints crate: exit, to immediately kill the process let cfg_result = apply_failpoint(&fp.name, &fp.actions); if let Err(e) = cfg_result { return JsonResponse::error( StatusCode::BAD_REQUEST, format!("failed to configure failpoints: {e}"), ); } } StatusCode::OK.into_response() } ================================================ FILE: compute_tools/src/http/routes/grants.rs ================================================ use std::sync::Arc; use axum::extract::State; use axum::response::Response; use compute_api::requests::SetRoleGrantsRequest; use compute_api::responses::{ComputeStatus, SetRoleGrantsResponse}; use http::StatusCode; use crate::compute::ComputeNode; use crate::http::JsonResponse; use crate::http::extract::Json; /// Add grants for a role. 
pub(in crate::http) async fn add_grant( State(compute): State>, request: Json, ) -> Response { let status = compute.get_status(); if status != ComputeStatus::Running { return JsonResponse::invalid_status(status); } match compute .set_role_grants( &request.database, &request.schema, &request.privileges, &request.role, ) .await { Ok(()) => JsonResponse::success( StatusCode::CREATED, Some(SetRoleGrantsResponse { database: request.database.clone(), schema: request.schema.clone(), role: request.role.clone(), privileges: request.privileges.clone(), }), ), Err(e) => JsonResponse::error( StatusCode::INTERNAL_SERVER_ERROR, format!("failed to grant role privileges to the schema: {e}"), ), } } ================================================ FILE: compute_tools/src/http/routes/hadron_liveness_probe.rs ================================================ use crate::pg_isready::pg_isready; use crate::{compute::ComputeNode, http::JsonResponse}; use axum::{extract::State, http::StatusCode, response::Response}; use std::sync::Arc; /// NOTE: NOT ENABLED YET /// Detect if the compute is alive. /// Called by the liveness probe of the compute container. pub(in crate::http) async fn hadron_liveness_probe( State(compute): State>, ) -> Response { let port = match compute.params.connstr.port() { Some(port) => port, None => { return JsonResponse::error( StatusCode::INTERNAL_SERVER_ERROR, "Failed to get the port from the connection string", ); } }; match pg_isready(&compute.params.pg_isready_bin, port) { Ok(_) => { // The connection is successful, so the compute is alive. // Return a 200 OK response. JsonResponse::success(StatusCode::OK, "ok") } Err(e) => { tracing::error!("Hadron liveness probe failed: {}", e); // The connection failed, so the compute is not alive. // Return a 500 Internal Server Error response. 
JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e) } } } ================================================ FILE: compute_tools/src/http/routes/insights.rs ================================================ use std::sync::Arc; use axum::extract::State; use axum::response::Response; use compute_api::responses::ComputeStatus; use http::StatusCode; use crate::compute::ComputeNode; use crate::http::JsonResponse; /// Collect current Postgres usage insights. pub(in crate::http) async fn get_insights(State(compute): State>) -> Response { let status = compute.get_status(); if status != ComputeStatus::Running { return JsonResponse::invalid_status(status); } let insights = compute.collect_insights().await; JsonResponse::success(StatusCode::OK, insights) } ================================================ FILE: compute_tools/src/http/routes/lfc.rs ================================================ use crate::http::JsonResponse; use axum::response::{IntoResponse, Response}; use axum::{Json, http::StatusCode}; use axum_extra::extract::OptionalQuery; use compute_api::responses::{LfcOffloadState, LfcPrewarmState}; type Compute = axum::extract::State>; pub(in crate::http) async fn prewarm_state(compute: Compute) -> Json { Json(compute.lfc_prewarm_state().await) } // Following functions are marked async for axum, as it's more convenient than wrapping these // in async lambdas at call site pub(in crate::http) async fn offload_state(compute: Compute) -> Json { Json(compute.lfc_offload_state()) } #[derive(serde::Deserialize)] pub struct PrewarmQuery { pub from_endpoint: String, } pub(in crate::http) async fn prewarm( compute: Compute, OptionalQuery(query): OptionalQuery, ) -> Response { if compute.prewarm_lfc(query.map(|q| q.from_endpoint)) { StatusCode::ACCEPTED.into_response() } else { JsonResponse::error( StatusCode::TOO_MANY_REQUESTS, "Multiple requests for prewarm are not allowed", ) } } pub(in crate::http) async fn offload(compute: Compute) -> Response { if compute.offload_lfc() 
{ StatusCode::ACCEPTED.into_response() } else { JsonResponse::error( StatusCode::TOO_MANY_REQUESTS, "Multiple requests for prewarm offload are not allowed", ) } } pub(in crate::http) async fn cancel_prewarm(compute: Compute) -> StatusCode { compute.cancel_prewarm(); StatusCode::ACCEPTED } ================================================ FILE: compute_tools/src/http/routes/metrics.rs ================================================ use std::path::Path; use std::sync::Arc; use anyhow::Context; use axum::body::Body; use axum::extract::State; use axum::response::Response; use http::header::CONTENT_TYPE; use http_body_util::BodyExt; use hyper::{Request, StatusCode}; use metrics::proto::MetricFamily; use metrics::{Encoder, TextEncoder}; use crate::communicator_socket_client::connect_communicator_socket; use crate::compute::ComputeNode; use crate::hadron_metrics; use crate::http::JsonResponse; use crate::metrics::collect; /// Expose Prometheus metrics. pub(in crate::http) async fn get_metrics() -> Response { // When we call TextEncoder::encode() below, it will immediately return an // error if a metric family has no metrics, so we need to preemptively // filter out metric families with no metrics. let mut metrics = collect() .into_iter() .filter(|m| !m.get_metric().is_empty()) .collect::>(); // Add Hadron metrics. let hadron_metrics: Vec = hadron_metrics::collect() .into_iter() .filter(|m| !m.get_metric().is_empty()) .collect(); metrics.extend(hadron_metrics); let encoder = TextEncoder::new(); let mut buffer = vec![]; if let Err(e) = encoder.encode(&metrics, &mut buffer) { return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e); } Response::builder() .status(StatusCode::OK) .header(CONTENT_TYPE, encoder.format_type()) .body(Body::from(buffer)) .unwrap() } /// Fetch and forward metrics from the Postgres neon extension's metrics /// exporter that are used by autoscaling-agent. 
/// /// The neon extension exposes these metrics over a Unix domain socket /// in the data directory. That's not accessible directly from the outside /// world, so we have this endpoint in compute_ctl to expose it pub(in crate::http) async fn get_autoscaling_metrics( State(compute): State>, ) -> Result { let pgdata = Path::new(&compute.params.pgdata); // Connect to the communicator process's metrics socket let mut metrics_client = connect_communicator_socket(pgdata) .await .map_err(|e| JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, format!("{e:#}")))?; // Make a request for /autoscaling_metrics let request = Request::builder() .method("GET") .uri("/autoscaling_metrics") .header("Host", "localhost") // hyper requires Host, even though the server won't care .body(Body::from("")) .unwrap(); let resp = metrics_client .send_request(request) .await .context("fetching metrics from Postgres metrics service") .map_err(|e| JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, format!("{e:#}")))?; // Build a response that just forwards the response we got. let mut response = Response::builder(); response = response.status(resp.status()); if let Some(content_type) = resp.headers().get(CONTENT_TYPE) { response = response.header(CONTENT_TYPE, content_type); } let body = tonic::service::AxumBody::from_stream(resp.into_body().into_data_stream()); Ok(response.body(body).unwrap()) } ================================================ FILE: compute_tools/src/http/routes/metrics_json.rs ================================================ use std::sync::Arc; use axum::extract::State; use axum::response::Response; use http::StatusCode; use crate::compute::ComputeNode; use crate::http::JsonResponse; /// Get startup metrics. 
pub(in crate::http) async fn get_metrics(State(compute): State>) -> Response { let metrics = compute.state.lock().unwrap().metrics.clone(); JsonResponse::success(StatusCode::OK, metrics) } ================================================ FILE: compute_tools/src/http/routes/mod.rs ================================================ use compute_api::responses::ComputeStatusResponse; use crate::compute::ComputeState; pub(in crate::http) mod check_writability; pub(in crate::http) mod configure; pub(in crate::http) mod database_schema; pub(in crate::http) mod dbs_and_roles; pub(in crate::http) mod extension_server; pub(in crate::http) mod extensions; pub(in crate::http) mod failpoints; pub(in crate::http) mod grants; pub(in crate::http) mod hadron_liveness_probe; pub(in crate::http) mod insights; pub(in crate::http) mod lfc; pub(in crate::http) mod metrics; pub(in crate::http) mod metrics_json; pub(in crate::http) mod promote; pub(in crate::http) mod refresh_configuration; pub(in crate::http) mod status; pub(in crate::http) mod terminate; impl From<&ComputeState> for ComputeStatusResponse { fn from(state: &ComputeState) -> Self { ComputeStatusResponse { start_time: state.start_time, tenant: state .pspec .as_ref() .map(|pspec| pspec.tenant_id.to_string()), timeline: state .pspec .as_ref() .map(|pspec| pspec.timeline_id.to_string()), status: state.status, last_active: state.last_active, error: state.error.clone(), } } } ================================================ FILE: compute_tools/src/http/routes/promote.rs ================================================ use crate::http::JsonResponse; use axum::extract::Json; use compute_api::responses::PromoteConfig; use http::StatusCode; pub(in crate::http) async fn promote( compute: axum::extract::State>, Json(cfg): Json, ) -> axum::response::Response { // Return early at the cost of extra parsing spec let pspec = match crate::compute::ParsedSpec::try_from(cfg.spec) { Ok(p) => p, Err(e) => return 
JsonResponse::error(StatusCode::BAD_REQUEST, e), }; let cfg = PromoteConfig { spec: pspec.spec, wal_flush_lsn: cfg.wal_flush_lsn, }; let state = compute.promote(cfg).await; if let compute_api::responses::PromoteState::Failed { error: _ } = state { return JsonResponse::create_response(StatusCode::INTERNAL_SERVER_ERROR, state); } JsonResponse::success(StatusCode::OK, state) } ================================================ FILE: compute_tools/src/http/routes/refresh_configuration.rs ================================================ // This file is added by Hadron use std::sync::Arc; use axum::{ extract::State, response::{IntoResponse, Response}, }; use http::StatusCode; use crate::compute::ComputeNode; use crate::hadron_metrics::POSTGRES_PAGESTREAM_REQUEST_ERRORS; use crate::http::JsonResponse; /// The /refresh_configuration POST method is used to nudge compute_ctl to pull a new spec /// from the HCC and attempt to reconfigure Postgres with the new spec. The method does not wait /// for the reconfiguration to complete. Rather, it simply delivers a signal that will cause /// configuration to be reloaded in a best effort manner. Invocation of this method does not /// guarantee that a reconfiguration will occur. The caller should consider keep sending this /// request while it believes that the compute configuration is out of date. 
pub(in crate::http) async fn refresh_configuration( State(compute): State>, ) -> Response { POSTGRES_PAGESTREAM_REQUEST_ERRORS.inc(); match compute.signal_refresh_configuration().await { Ok(_) => StatusCode::OK.into_response(), Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e), } } ================================================ FILE: compute_tools/src/http/routes/status.rs ================================================ use std::ops::Deref; use std::sync::Arc; use axum::extract::State; use axum::http::StatusCode; use axum::response::Response; use compute_api::responses::ComputeStatusResponse; use crate::compute::ComputeNode; use crate::http::JsonResponse; /// Retrieve the state of the comute. pub(in crate::http) async fn get_status(State(compute): State>) -> Response { let state = compute.state.lock().unwrap(); let body = ComputeStatusResponse::from(state.deref()); JsonResponse::success(StatusCode::OK, body) } ================================================ FILE: compute_tools/src/http/routes/terminate.rs ================================================ use crate::compute::{ComputeNode, forward_termination_signal}; use crate::http::JsonResponse; use axum::extract::State; use axum::response::{IntoResponse, Response}; use axum_extra::extract::OptionalQuery; use compute_api::responses::{ComputeStatus, TerminateMode, TerminateResponse}; use http::StatusCode; use serde::Deserialize; use std::sync::Arc; use tokio::task; use tracing::info; #[derive(Deserialize, Default)] pub struct TerminateQuery { mode: TerminateMode, } /// Terminate the compute. 
pub(in crate::http) async fn terminate( State(compute): State>, OptionalQuery(terminate): OptionalQuery, ) -> Response { let mode = terminate.unwrap_or_default().mode; { let mut state = compute.state.lock().unwrap(); if state.status == ComputeStatus::Terminated { let response = TerminateResponse { lsn: state.terminate_flush_lsn, }; return JsonResponse::success(StatusCode::CREATED, response); } if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) { return JsonResponse::invalid_status(state.status); } // If compute is Empty, there's no Postgres to terminate. The regular compute_ctl termination path // assumes Postgres to be configured and running, so we just special-handle this case by exiting // the process directly. if compute.params.lakebase_mode && state.status == ComputeStatus::Empty { drop(state); info!("terminating empty compute - will exit process"); // Queue a task to exit the process after 5 seconds. The 5-second delay aims to // give enough time for the HTTP response to be sent so that HCM doesn't get an abrupt // connection termination. tokio::spawn(async { tokio::time::sleep(tokio::time::Duration::from_secs(5)).await; info!("exiting process after terminating empty compute"); std::process::exit(0); }); return StatusCode::OK.into_response(); } // For Running status, proceed with normal termination state.set_status(mode.into(), &compute.state_changed); drop(state); } forward_termination_signal(false); info!("sent signal and notified waiters"); // Spawn a blocking thread to wait for compute to become Terminated. // This is needed to do not block the main pool of workers and // be able to serve other requests while some particular request // is waiting for compute to finish configuration. 
let c = compute.clone(); let lsn = task::spawn_blocking(move || { let mut state = c.state.lock().unwrap(); while state.status != ComputeStatus::Terminated { state = c.state_changed.wait(state).unwrap(); info!( "waiting for compute to become {}, current status: {:?}", ComputeStatus::Terminated, state.status ); } state.terminate_flush_lsn }) .await .unwrap(); info!("terminated Postgres"); JsonResponse::success(StatusCode::OK, TerminateResponse { lsn }) } ================================================ FILE: compute_tools/src/http/server.rs ================================================ use std::fmt::Display; use std::net::{IpAddr, Ipv6Addr, SocketAddr}; use std::sync::Arc; use std::time::Duration; use anyhow::Result; use axum::Router; use axum::middleware::{self}; use axum::response::IntoResponse; use axum::routing::{get, post}; use compute_api::responses::ComputeCtlConfig; use http::StatusCode; use tokio::net::TcpListener; use tower::ServiceBuilder; use tower_http::{ auth::AsyncRequireAuthorizationLayer, request_id::PropagateRequestIdLayer, trace::TraceLayer, }; use tracing::{Span, error, info}; use super::middleware::request_id::maybe_add_request_id_header; use super::{ headers::X_REQUEST_ID, middleware::authorize::Authorize, routes::{ check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, grants, hadron_liveness_probe, insights, lfc, metrics, metrics_json, promote, refresh_configuration, status, terminate, }, }; use crate::compute::ComputeNode; /// `compute_ctl` has two servers: internal and external. The internal server /// binds to the loopback interface and handles communication from clients on /// the compute. The external server is what receives communication from the /// control plane, the metrics scraper, etc. We make the distinction because /// certain routes in `compute_ctl` only need to be exposed to local processes /// like Postgres via the neon extension and local_proxy. 
#[derive(Clone, Debug)] pub enum Server { Internal { port: u16, }, External { port: u16, config: ComputeCtlConfig, compute_id: String, instance_id: Option, }, } impl Display for Server { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Server::Internal { .. } => f.write_str("internal"), Server::External { .. } => f.write_str("external"), } } } impl From<&Server> for Router> { fn from(server: &Server) -> Self { let mut router = Router::>::new(); router = match server { Server::Internal { .. } => { router = router .route( "/extension_server/{*filename}", post(extension_server::download_extension), ) .route("/extensions", post(extensions::install_extension)) .route("/grants", post(grants::add_grant)) // Hadron: Compute-initiated configuration refresh .route( "/refresh_configuration", post(refresh_configuration::refresh_configuration), ); // Add in any testing support if cfg!(feature = "testing") { use super::routes::failpoints; router = router.route("/failpoints", post(failpoints::configure_failpoints)); } router } Server::External { config, compute_id, instance_id, .. 
} => { let unauthenticated_router = Router::>::new() .route("/metrics", get(metrics::get_metrics)) .route( "/autoscaling_metrics", get(metrics::get_autoscaling_metrics), ); let authenticated_router = Router::>::new() .route( "/lfc/prewarm", get(lfc::prewarm_state) .post(lfc::prewarm) .delete(lfc::cancel_prewarm), ) .route("/lfc/offload", get(lfc::offload_state).post(lfc::offload)) .route("/promote", post(promote::promote)) .route("/check_writability", post(check_writability::is_writable)) .route("/configure", post(configure::configure)) .route("/database_schema", get(database_schema::get_schema_dump)) .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) .route("/insights", get(insights::get_insights)) .route("/metrics.json", get(metrics_json::get_metrics)) .route("/status", get(status::get_status)) .route("/terminate", post(terminate::terminate)) .route( "/hadron_liveness_probe", get(hadron_liveness_probe::hadron_liveness_probe), ) .layer(AsyncRequireAuthorizationLayer::new(Authorize::new( compute_id.clone(), instance_id.clone(), config.jwks.clone(), ))); router .merge(unauthenticated_router) .merge(authenticated_router) } }; router .fallback(Server::handle_404) .method_not_allowed_fallback(Server::handle_405) .layer( ServiceBuilder::new() .layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO)) // Add this middleware since we assume the request ID exists .layer(middleware::from_fn(maybe_add_request_id_header)) .layer( TraceLayer::new_for_http() .on_request(|request: &http::Request<_>, _span: &Span| { let request_id = request .headers() .get(X_REQUEST_ID) .unwrap() .to_str() .unwrap(); info!(%request_id, "{} {}", request.method(), request.uri()); }) .on_response( |response: &http::Response<_>, latency: Duration, _span: &Span| { let request_id = response .headers() .get(X_REQUEST_ID) .unwrap() .to_str() .unwrap(); info!( %request_id, code = response.status().as_u16(), latency = latency.as_millis() ); }, ), ) 
.layer(PropagateRequestIdLayer::x_request_id()), ) } } impl Server { async fn handle_404() -> impl IntoResponse { StatusCode::NOT_FOUND } async fn handle_405() -> impl IntoResponse { StatusCode::METHOD_NOT_ALLOWED } async fn listener(&self) -> Result { let addr = SocketAddr::new(self.ip(), self.port()); let listener = TcpListener::bind(&addr).await?; Ok(listener) } fn ip(&self) -> IpAddr { match self { // TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners // allow binding to localhost Server::Internal { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED), Server::External { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED), } } fn port(&self) -> u16 { match self { Server::Internal { port, .. } => *port, Server::External { port, .. } => *port, } } async fn serve(self, compute: Arc) { let listener = self.listener().await.unwrap_or_else(|e| { // If we can't bind, the compute cannot operate correctly panic!( "failed to bind the compute_ctl {} HTTP server to {}: {}", self, SocketAddr::new(self.ip(), self.port()), e ); }); if tracing::enabled!(tracing::Level::INFO) { let local_addr = match listener.local_addr() { Ok(local_addr) => local_addr, Err(_) => SocketAddr::new(self.ip(), self.port()), }; info!( "compute_ctl {} HTTP server listening at {}", self, local_addr ); } let router = Router::from(&self) .with_state(compute) .into_make_service_with_connect_info::(); if let Err(e) = axum::serve(listener, router).await { error!("compute_ctl {} HTTP server error: {}", self, e); } } pub fn launch(self, compute: &Arc) { let state = Arc::clone(compute); info!("Launching the {} server", self); tokio::spawn(self.serve(state)); } } ================================================ FILE: compute_tools/src/installed_extensions.rs ================================================ use std::collections::HashMap; use anyhow::Result; use compute_api::responses::{InstalledExtension, InstalledExtensions}; use once_cell::sync::Lazy; use tokio_postgres::error::Error as PostgresError; use 
tokio_postgres::{Client, Config, NoTls}; use crate::metrics::INSTALLED_EXTENSIONS; /// We don't reuse get_existing_dbs() just for code clarity /// and to make database listing query here more explicit. /// /// Limit the number of databases to 500 to avoid excessive load. async fn list_dbs(client: &mut Client) -> Result, PostgresError> { // `pg_database.datconnlimit = -2` means that the database is in the // invalid state let databases = client .query( "SELECT datname FROM pg_catalog.pg_database WHERE datallowconn AND datconnlimit OPERATOR(pg_catalog.<>) (OPERATOR(pg_catalog.-) 2::pg_catalog.int4) LIMIT 500", &[], ) .await? .iter() .map(|row| { let db: String = row.get("datname"); db }) .collect(); Ok(databases) } /// Connect to every database (see list_dbs above) and get the list of installed extensions. /// /// Same extension can be installed in multiple databases with different versions, /// so we report a separate metric (number of databases where it is installed) /// for each extension version. pub async fn get_installed_extensions( mut conf: Config, ) -> Result { conf.application_name("compute_ctl:get_installed_extensions"); let databases: Vec = { let (mut client, connection) = conf.connect(NoTls).await?; tokio::spawn(async move { if let Err(e) = connection.await { eprintln!("connection error: {e}"); } }); list_dbs(&mut client).await? }; let mut extensions_map: HashMap<(String, String, String), InstalledExtension> = HashMap::new(); for db in databases.iter() { conf.dbname(db); let (client, connection) = conf.connect(NoTls).await?; tokio::spawn(async move { if let Err(e) = connection.await { eprintln!("connection error: {e}"); } }); let extensions: Vec<(String, String, i32)> = client .query( "SELECT extname, extversion, extowner::pg_catalog.int4 FROM pg_catalog.pg_extension", &[], ) .await? 
.iter() .map(|row| { ( row.get("extname"), row.get("extversion"), row.get("extowner"), ) }) .collect(); for (extname, v, extowner) in extensions.iter() { let version = v.to_string(); // check if the extension is owned by superuser // 10 is the oid of superuser let owned_by_superuser = if *extowner == 10 { "1" } else { "0" }; extensions_map .entry(( extname.to_string(), version.clone(), owned_by_superuser.to_string(), )) .and_modify(|e| { // count the number of databases where the extension is installed e.n_databases += 1; }) .or_insert(InstalledExtension { extname: extname.to_string(), version: version.clone(), n_databases: 1, owned_by_superuser: owned_by_superuser.to_string(), }); } } for (key, ext) in extensions_map.iter() { let (extname, version, owned_by_superuser) = key; let n_databases = ext.n_databases as u64; INSTALLED_EXTENSIONS .with_label_values(&[extname, version, owned_by_superuser]) .set(n_databases); } Ok(InstalledExtensions { extensions: extensions_map.into_values().collect(), }) } pub fn initialize_metrics() { Lazy::force(&INSTALLED_EXTENSIONS); } ================================================ FILE: compute_tools/src/lib.rs ================================================ //! Various tools and helpers to handle cluster / compute node (Postgres) //! configuration. 
#![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] pub mod checker; pub mod communicator_socket_client; pub mod config; pub mod configurator; pub mod http; #[macro_use] pub mod logger; pub mod catalog; pub mod compute; pub mod compute_prewarm; pub mod compute_promote; pub mod disk_quota; pub mod extension_server; pub mod hadron_metrics; pub mod installed_extensions; pub mod local_proxy; pub mod lsn_lease; pub mod metrics; mod migration; pub mod monitor; pub mod params; pub mod pg_helpers; pub mod pg_isready; pub mod pgbouncer; pub mod rsyslog; pub mod spec; mod spec_apply; pub mod swap; pub mod sync_sk; pub mod tls; ================================================ FILE: compute_tools/src/local_proxy.rs ================================================ //! Local Proxy is a feature of our BaaS Neon Authorize project. //! //! Local Proxy validates JWTs and manages the pg_session_jwt extension. //! It also maintains a connection pool to postgres. use anyhow::{Context, Result}; use camino::Utf8Path; use compute_api::spec::LocalProxySpec; use nix::sys::signal::Signal; use utils::pid_file::{self, PidFileRead}; pub fn configure(local_proxy: &LocalProxySpec) -> Result<()> { write_local_proxy_conf("/etc/local_proxy/config.json".as_ref(), local_proxy)?; notify_local_proxy("/etc/local_proxy/pid".as_ref())?; Ok(()) } /// Create or completely rewrite configuration file specified by `path` fn write_local_proxy_conf(path: &Utf8Path, local_proxy: &LocalProxySpec) -> Result<()> { let config = serde_json::to_string_pretty(local_proxy).context("serializing LocalProxySpec to json")?; std::fs::write(path, config).with_context(|| format!("writing {path}"))?; Ok(()) } /// Notify local proxy about a new config file. fn notify_local_proxy(path: &Utf8Path) -> Result<()> { match pid_file::read(path)? 
{ // if the file doesn't exist, or isn't locked, local_proxy isn't running // and will naturally pick up our config later PidFileRead::NotExist | PidFileRead::NotHeldByAnyProcess(_) => {} PidFileRead::LockedByOtherProcess(pid) => { // From the pid_file docs: // // > 1. The other process might exit at any time, turning the given PID stale. // > 2. There is a small window in which `claim_for_current_process` has already // > locked the file but not yet updates its contents. [`read`] will return // > this variant here, but with the old file contents, i.e., a stale PID. // > // > The kernel is free to recycle PID once it has been `wait(2)`ed upon by // > its creator. Thus, acting upon a stale PID, e.g., by issuing a `kill` // > system call on it, bears the risk of killing an unrelated process. // > This is an inherent limitation of using pidfiles. // > The only race-free solution is to have a supervisor-process with a lifetime // > that exceeds that of all of its child-processes (e.g., `runit`, `supervisord`). // // This is an ok risk as we only send a SIGHUP which likely won't actually // kill the process, only reload config. nix::sys::signal::kill(pid, Signal::SIGHUP).context("sending signal to local_proxy")?; } } Ok(()) } ================================================ FILE: compute_tools/src/logger.rs ================================================ use std::collections::HashMap; use std::sync::{LazyLock, RwLock}; use tracing::Subscriber; use tracing::info; use tracing_appender; use tracing_subscriber::prelude::*; use tracing_subscriber::{fmt, layer::SubscriberExt, registry::LookupSpan}; /// Initialize logging to stderr, and OpenTelemetry tracing and exporter. /// /// Logging is configured using either `default_log_level` or /// `RUST_LOG` environment variable as default log level. /// /// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up /// configuration from environment variables. 
For example, to change the destination, /// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See /// `tracing-utils` package description. /// pub fn init_tracing_and_logging( default_log_level: &str, log_dir_opt: &Option, ) -> anyhow::Result<( Option, Option, )> { // Initialize Logging let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level)); // Standard output streams let fmt_layer = tracing_subscriber::fmt::layer() .with_ansi(false) .with_target(false) .with_writer(std::io::stderr); // Logs with file rotation. Files in `$log_dir/pgcctl.yyyy-MM-dd` let (json_to_file_layer, _file_logs_guard) = if let Some(log_dir) = log_dir_opt { std::fs::create_dir_all(log_dir)?; let file_logs_appender = tracing_appender::rolling::RollingFileAppender::builder() .rotation(tracing_appender::rolling::Rotation::DAILY) .filename_prefix("pgcctl") // Lib appends to existing files, so we will keep files for up to 2 days even on restart loops. // At minimum, log-daemon will have 1 day to detect and upload a file (if created right before midnight). 
.max_log_files(2) .build(log_dir) .expect("Initializing rolling file appender should succeed"); let (file_logs_writer, _file_logs_guard) = tracing_appender::non_blocking(file_logs_appender); let json_to_file_layer = tracing_subscriber::fmt::layer() .with_ansi(false) .with_target(false) .event_format(PgJsonLogShapeFormatter) .with_writer(file_logs_writer); (Some(json_to_file_layer), Some(_file_logs_guard)) } else { (None, None) }; // Initialize OpenTelemetry let provider = tracing_utils::init_tracing("compute_ctl", tracing_utils::ExportConfig::default()); let otlp_layer = provider.as_ref().map(tracing_utils::layer); // Put it all together tracing_subscriber::registry() .with(env_filter) .with(otlp_layer) .with(fmt_layer) .with(json_to_file_layer) .init(); tracing::info!("logging and tracing started"); utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); Ok((provider, _file_logs_guard)) } /// Replace all newline characters with a special character to make it /// easier to grep for log messages. pub fn inlinify(s: &str) -> String { s.replace('\n', "\u{200B}") } pub fn startup_context_from_env() -> Option { // Extract OpenTelemetry context for the startup actions from the // TRACEPARENT and TRACESTATE env variables, and attach it to the current // tracing context. // // This is used to propagate the context for the 'start_compute' operation // from the neon control plane. This allows linking together the wider // 'start_compute' operation that creates the compute container, with the // startup actions here within the container. // // There is no standard for passing context in env variables, but a lot of // tools use TRACEPARENT/TRACESTATE, so we use that convention too. See // https://github.com/open-telemetry/opentelemetry-specification/issues/740 // // Switch to the startup context here, and exit it once the startup has // completed and Postgres is up and running. 
// // If this pod is pre-created without binding it to any particular endpoint // yet, this isn't the right place to enter the startup context. In that // case, the control plane should pass the tracing context as part of the // /configure API call. // // NOTE: This is supposed to only cover the *startup* actions. Once // postgres is configured and up-and-running, we exit this span. Any other // actions that are performed on incoming HTTP requests, for example, are // performed in separate spans. // // XXX: If the pod is restarted, we perform the startup actions in the same // context as the original startup actions, which probably doesn't make // sense. let mut startup_tracing_carrier: HashMap = HashMap::new(); if let Ok(val) = std::env::var("TRACEPARENT") { startup_tracing_carrier.insert("traceparent".to_string(), val); } if let Ok(val) = std::env::var("TRACESTATE") { startup_tracing_carrier.insert("tracestate".to_string(), val); } if !startup_tracing_carrier.is_empty() { use opentelemetry::propagation::TextMapPropagator; use opentelemetry_sdk::propagation::TraceContextPropagator; info!("got startup tracing context from env variables"); Some(TraceContextPropagator::new().extract(&startup_tracing_carrier)) } else { None } } /// Track relevant id's const UNKNOWN_IDS: &str = r#""pg_instance_id": "", "pg_compute_id": """#; static IDS: LazyLock> = LazyLock::new(|| RwLock::new(UNKNOWN_IDS.to_string())); pub fn update_ids(instance_id: &Option, compute_id: &Option) -> anyhow::Result<()> { let ids = format!( r#""pg_instance_id": "{}", "pg_compute_id": "{}""#, instance_id.as_ref().map(|s| s.as_str()).unwrap_or_default(), compute_id.as_ref().map(|s| s.as_str()).unwrap_or_default() ); let mut guard = IDS .write() .map_err(|e| anyhow::anyhow!("Log set id's rwlock poisoned: {}", e))?; *guard = ids; Ok(()) } /// Massage compute_ctl logs into PG json log shape so we can use the same Lumberjack setup. 
struct PgJsonLogShapeFormatter; impl fmt::format::FormatEvent for PgJsonLogShapeFormatter where S: Subscriber + for<'a> LookupSpan<'a>, N: for<'a> fmt::format::FormatFields<'a> + 'static, { fn format_event( &self, ctx: &fmt::FmtContext<'_, S, N>, mut writer: fmt::format::Writer<'_>, event: &tracing::Event<'_>, ) -> std::fmt::Result { // Format values from the event's metadata, and open message string let metadata = event.metadata(); { let ids_guard = IDS.read(); let ids = ids_guard .as_ref() .map(|guard| guard.as_str()) // Surpress so that we don't lose all uploaded/ file logs if something goes super wrong. We would notice the missing id's. .unwrap_or(UNKNOWN_IDS); write!( &mut writer, r#"{{"timestamp": "{}", "error_severity": "{}", "file_name": "{}", "backend_type": "compute_ctl_self", {}, "message": "#, chrono::Utc::now().format("%Y-%m-%d %H:%M:%S%.3f GMT"), metadata.level(), metadata.target(), ids )?; } let mut message = String::new(); let message_writer = fmt::format::Writer::new(&mut message); // Gather the message ctx.field_format().format_fields(message_writer, event)?; // TODO: any better options than to copy-paste this OSS span formatter? // impl FormatEvent for Format // https://docs.rs/tracing-subscriber/latest/tracing_subscriber/fmt/trait.FormatEvent.html#impl-FormatEvent%3CS,+N%3E-for-Format%3CFull,+T%3E // write message, close bracket, and new line writeln!(writer, "{}}}", serde_json::to_string(&message).unwrap()) } } #[cfg(feature = "testing")] #[cfg(test)] mod test { use super::*; use std::{cell::RefCell, io}; // Use thread_local! instead of Mutex for test isolation thread_local! 
{ static WRITER_OUTPUT: RefCell = const { RefCell::new(String::new()) }; } #[derive(Clone, Default)] struct StaticStringWriter; impl io::Write for StaticStringWriter { fn write(&mut self, buf: &[u8]) -> io::Result { let output = String::from_utf8(buf.to_vec()).expect("Invalid UTF-8 in test output"); WRITER_OUTPUT.with(|s| s.borrow_mut().push_str(&output)); Ok(buf.len()) } fn flush(&mut self) -> io::Result<()> { Ok(()) } } impl fmt::MakeWriter<'_> for StaticStringWriter { type Writer = Self; fn make_writer(&self) -> Self::Writer { Self } } #[test] fn test_log_pg_json_shape_formatter() { // Use a scoped subscriber to prevent global state pollution let subscriber = tracing_subscriber::registry().with( tracing_subscriber::fmt::layer() .with_ansi(false) .with_target(false) .event_format(PgJsonLogShapeFormatter) .with_writer(StaticStringWriter), ); let _ = update_ids(&Some("000".to_string()), &Some("111".to_string())); // Clear any previous test state WRITER_OUTPUT.with(|s| s.borrow_mut().clear()); let messages = [ "test message", r#"json escape check: name="BatchSpanProcessor.Flush.ExportError" reason="Other(reqwest::Error { kind: Request, url: \"http://localhost:4318/v1/traces\", source: hyper_ util::client::legacy::Error(Connect, ConnectError(\"tcp connect error\", Os { code: 111, kind: ConnectionRefused, message: \"Connection refused\" })) })" Failed during the export process"#, ]; tracing::subscriber::with_default(subscriber, || { for message in messages { tracing::info!(message); } }); tracing::info!("not test message"); // Get captured output let output = WRITER_OUTPUT.with(|s| s.borrow().clone()); let json_strings: Vec<&str> = output.lines().collect(); assert_eq!( json_strings.len(), messages.len(), "Log didn't have the expected number of json strings." 
); let json_string_shape_regex = regex::Regex::new( r#"\{"timestamp": "\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3} GMT", "error_severity": "INFO", "file_name": ".+", "backend_type": "compute_ctl_self", "pg_instance_id": "000", "pg_compute_id": "111", "message": ".+"\}"# ).unwrap(); for (i, expected_message) in messages.iter().enumerate() { let json_string = json_strings[i]; assert!( json_string_shape_regex.is_match(json_string), "Json log didn't match expected pattern:\n{json_string}", ); let parsed_json: serde_json::Value = serde_json::from_str(json_string).unwrap(); let actual_message = parsed_json["message"].as_str().unwrap(); assert_eq!(*expected_message, actual_message); } } } ================================================ FILE: compute_tools/src/lsn_lease.rs ================================================ use std::str::FromStr; use std::sync::Arc; use std::thread; use std::time::{Duration, SystemTime}; use anyhow::{Result, bail}; use compute_api::spec::{ComputeMode, PageserverConnectionInfo, PageserverProtocol}; use pageserver_page_api as page_api; use postgres::{NoTls, SimpleQueryMessage}; use tracing::{info, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use utils::shard::TenantShardId; use crate::compute::ComputeNode; /// Spawns a background thread to periodically renew LSN leases for static compute. /// Do nothing if the compute is not in static mode. 
pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc) { let (tenant_id, timeline_id, lsn) = { let state = compute.state.lock().unwrap(); let spec = state.pspec.as_ref().expect("Spec must be set"); match spec.spec.mode { ComputeMode::Static(lsn) => (spec.tenant_id, spec.timeline_id, lsn), _ => return, } }; let compute = compute.clone(); let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn); thread::spawn(move || { let _entered = span.entered(); if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) { // TODO: might need stronger error feedback than logging an warning. warn!("Exited with error: {e}"); } }); } /// Renews lsn lease periodically so static compute are not affected by GC. fn lsn_lease_bg_task( compute: Arc, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn, ) -> Result<()> { loop { let valid_until = acquire_lsn_lease_with_retry(&compute, tenant_id, timeline_id, lsn)?; let valid_duration = valid_until .duration_since(SystemTime::now()) .unwrap_or(Duration::ZERO); // Sleep for 60 seconds less than the valid duration but no more than half of the valid duration. let sleep_duration = valid_duration .saturating_sub(Duration::from_secs(60)) .max(valid_duration / 2); info!( "Request succeeded, sleeping for {} seconds", sleep_duration.as_secs() ); compute.wait_timeout_while_pageserver_connstr_unchanged(sleep_duration); } } /// Acquires lsn lease in a retry loop. Returns the expiration time if a lease is granted. /// Returns an error if a lease is explicitly not granted. Otherwise, we keep sending requests. fn acquire_lsn_lease_with_retry( compute: &Arc, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn, ) -> Result { let mut attempts = 0usize; let mut retry_period_ms: f64 = 500.0; const MAX_RETRY_PERIOD_MS: f64 = 60.0 * 1000.0; loop { // Note: List of pageservers is dynamic, need to re-read configs before each attempt. 
let (conninfo, auth) = { let state = compute.state.lock().unwrap(); let spec = state.pspec.as_ref().expect("spec must be set"); ( spec.pageserver_conninfo.clone(), spec.storage_auth_token.clone(), ) }; let result = try_acquire_lsn_lease(conninfo, auth.as_deref(), tenant_id, timeline_id, lsn); match result { Ok(Some(res)) => { return Ok(res); } Ok(None) => { bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff"); } Err(e) => { warn!("Failed to acquire lsn lease: {e} (attempt {attempts})"); compute.wait_timeout_while_pageserver_connstr_unchanged(Duration::from_millis( retry_period_ms as u64, )); retry_period_ms *= 1.5; retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS); } } attempts += 1; } } /// Tries to acquire LSN leases on all Pageserver shards. fn try_acquire_lsn_lease( conninfo: PageserverConnectionInfo, auth: Option<&str>, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn, ) -> Result> { let mut leases = Vec::new(); for (shard_index, shard) in conninfo.shards.into_iter() { let tenant_shard_id = TenantShardId { tenant_id, shard_number: shard_index.shard_number, shard_count: shard_index.shard_count, }; // XXX: If there are more than pageserver for the one shard, do we need to get a // leas on all of them? Currently, that's what we assume, but this is hypothetical // as of this writing, as we never pass the info for more than one pageserver per // shard. for pageserver in shard.pageservers { let lease = match conninfo.prefer_protocol { PageserverProtocol::Grpc => acquire_lsn_lease_grpc( &pageserver.grpc_url.unwrap(), auth, tenant_shard_id, timeline_id, lsn, )?, PageserverProtocol::Libpq => acquire_lsn_lease_libpq( &pageserver.libpq_url.unwrap(), auth, tenant_shard_id, timeline_id, lsn, )?, }; leases.push(lease); } } Ok(leases.into_iter().min().flatten()) } /// Acquires an LSN lease on a single shard, using the libpq API. The connstring must use a /// postgresql:// scheme. 
fn acquire_lsn_lease_libpq( connstring: &str, auth: Option<&str>, tenant_shard_id: TenantShardId, timeline_id: TimelineId, lsn: Lsn, ) -> Result> { let mut config = postgres::Config::from_str(connstring)?; if let Some(auth) = auth { config.password(auth); } let mut client = config.connect(NoTls)?; let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} "); let res = client.simple_query(&cmd)?; let msg = match res.first() { Some(msg) => msg, None => bail!("empty response"), }; let row = match msg { SimpleQueryMessage::Row(row) => row, _ => bail!("error parsing lsn lease response"), }; // Note: this will be None if a lease is explicitly not granted. let valid_until_str = row.get("valid_until"); let valid_until = valid_until_str.map(|s| { SystemTime::UNIX_EPOCH .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64)) .expect("Time larger than max SystemTime could handle") }); Ok(valid_until) } /// Acquires an LSN lease on a single shard, using the gRPC API. The connstring must use a /// grpc:// scheme. fn acquire_lsn_lease_grpc( connstring: &str, auth: Option<&str>, tenant_shard_id: TenantShardId, timeline_id: TimelineId, lsn: Lsn, ) -> Result> { tokio::runtime::Handle::current().block_on(async move { let mut client = page_api::Client::connect( connstring.to_string(), tenant_shard_id.tenant_id, timeline_id, tenant_shard_id.to_index(), auth.map(String::from), None, ) .await?; let req = page_api::LeaseLsnRequest { lsn }; match client.lease_lsn(req).await { Ok(expires) => Ok(Some(expires)), // Lease couldn't be acquired because the LSN has been garbage collected. 
Err(err) if err.code() == tonic::Code::FailedPrecondition => Ok(None), Err(err) => Err(err.into()), } }) } ================================================ FILE: compute_tools/src/metrics.rs ================================================ use metrics::core::{AtomicF64, AtomicU64, Collector, GenericCounter, GenericGauge}; use metrics::proto::MetricFamily; use metrics::{ IntCounter, IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter, register_int_counter_vec, register_int_gauge_vec, register_uint_gauge_vec, }; use once_cell::sync::Lazy; pub(crate) static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "compute_installed_extensions", "Number of databases where the version of extension is installed", &["extension_name", "version", "owned_by_superuser"] ) .expect("failed to define a metric") }); // Normally, any HTTP API request is described by METHOD (e.g. GET, POST, etc.) + PATH, // but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec. // And it's fair to call it a 'RPC' (Remote Procedure Call). pub enum CPlaneRequestRPC { GetConfig, } impl CPlaneRequestRPC { pub fn as_str(&self) -> &str { match self { CPlaneRequestRPC::GetConfig => "GetConfig", } } } pub const UNKNOWN_HTTP_STATUS: &str = "unknown"; pub(crate) static CPLANE_REQUESTS_TOTAL: Lazy = Lazy::new(|| { register_int_counter_vec!( "compute_ctl_cplane_requests_total", "Total number of control plane requests made by compute_ctl by status", &["rpc", "http_status"] ) .expect("failed to define a metric") }); /// Total number of failed database migrations. Per-compute, this is actually a boolean metric, /// either empty or with a single value (1, migration_id) because we stop at the first failure. /// Yet, the sum over the fleet will provide the total number of failures. 
pub(crate) static DB_MIGRATION_FAILED: Lazy = Lazy::new(|| { register_int_counter_vec!( "compute_ctl_db_migration_failed_total", "Total number of failed database migrations", &["migration_id"] ) .expect("failed to define a metric") }); pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy = Lazy::new(|| { register_int_counter_vec!( "compute_ctl_remote_ext_requests_total", "Total number of requests made by compute_ctl to download extensions from S3 proxy by status", &["http_status", "filename"] ) .expect("failed to define a metric") }); // Size of audit log directory in bytes pub(crate) static AUDIT_LOG_DIR_SIZE: Lazy> = Lazy::new(|| { register_gauge!( "compute_audit_log_dir_size", "Size of audit log directory in bytes", ) .expect("failed to define a metric") }); // Report that `compute_ctl` is up and what's the current compute status. pub(crate) static COMPUTE_CTL_UP: Lazy = Lazy::new(|| { register_int_gauge_vec!( "compute_ctl_up", "Whether compute_ctl is running", &["build_tag", "status"] ) .expect("failed to define a metric") }); pub(crate) static PG_CURR_DOWNTIME_MS: Lazy> = Lazy::new(|| { register_gauge!( "compute_pg_current_downtime_ms", "Non-cumulative duration of Postgres downtime in ms; resets after successful check", ) .expect("failed to define a metric") }); pub(crate) static PG_TOTAL_DOWNTIME_MS: Lazy> = Lazy::new(|| { register_int_counter!( "compute_pg_downtime_ms_total", "Cumulative duration of Postgres downtime in ms", ) .expect("failed to define a metric") }); pub(crate) static LFC_PREWARMS: Lazy = Lazy::new(|| { register_int_counter!( "compute_ctl_lfc_prewarms_total", "Total number of LFC prewarms requested by compute_ctl or autoprewarm option", ) .expect("failed to define a metric") }); pub(crate) static LFC_PREWARM_ERRORS: Lazy = Lazy::new(|| { register_int_counter!( "compute_ctl_lfc_prewarm_errors_total", "Total number of LFC prewarm errors", ) .expect("failed to define a metric") }); pub(crate) static LFC_OFFLOADS: Lazy = Lazy::new(|| { 
register_int_counter!( "compute_ctl_lfc_offloads_total", "Total number of LFC offloads requested by compute_ctl or lfc_offload_period_seconds option", ) .expect("failed to define a metric") }); pub(crate) static LFC_OFFLOAD_ERRORS: Lazy = Lazy::new(|| { register_int_counter!( "compute_ctl_lfc_offload_errors_total", "Total number of LFC offload errors", ) .expect("failed to define a metric") }); pub fn collect() -> Vec { let mut metrics = COMPUTE_CTL_UP.collect(); metrics.extend(INSTALLED_EXTENSIONS.collect()); metrics.extend(CPLANE_REQUESTS_TOTAL.collect()); metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect()); metrics.extend(DB_MIGRATION_FAILED.collect()); metrics.extend(AUDIT_LOG_DIR_SIZE.collect()); metrics.extend(PG_CURR_DOWNTIME_MS.collect()); metrics.extend(PG_TOTAL_DOWNTIME_MS.collect()); metrics.extend(LFC_PREWARMS.collect()); metrics.extend(LFC_PREWARM_ERRORS.collect()); metrics.extend(LFC_OFFLOADS.collect()); metrics.extend(LFC_OFFLOAD_ERRORS.collect()); metrics } ================================================ FILE: compute_tools/src/migration.rs ================================================ use anyhow::{Context, Result}; use fail::fail_point; use tokio_postgres::{Client, Transaction}; use tracing::{error, info}; use crate::metrics::DB_MIGRATION_FAILED; /// Runs a series of migrations on a target database pub(crate) struct MigrationRunner<'m> { client: &'m mut Client, migrations: &'m [&'m str], lakebase_mode: bool, } impl<'m> MigrationRunner<'m> { /// Create a new migration runner pub fn new(client: &'m mut Client, migrations: &'m [&'m str], lakebase_mode: bool) -> Self { // The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64 assert!(migrations.len() + 1 < i64::MAX as usize); Self { client, migrations, lakebase_mode, } } /// Get the current value neon_migration.migration_id async fn get_migration_id(&mut self) -> Result { let row = self .client .query_one("SELECT id FROM neon_migration.migration_id", &[]) .await?; 
Ok(row.get::<&str, i64>("id")) } /// Update the neon_migration.migration_id value /// /// This function has a fail point called compute-migration, which can be /// used if you would like to fail the application of a series of migrations /// at some point. async fn update_migration_id(txn: &mut Transaction<'_>, migration_id: i64) -> Result<()> { // We use this fail point in order to check that failing in the // middle of applying a series of migrations fails in an expected // manner if cfg!(feature = "testing") { let fail = (|| { fail_point!("compute-migration", |fail_migration_id| { migration_id == fail_migration_id.unwrap().parse::().unwrap() }); false })(); if fail { return Err(anyhow::anyhow!(format!( "migration {} was configured to fail because of a failpoint", migration_id ))); } } txn.query( "UPDATE neon_migration.migration_id SET id = $1", &[&migration_id], ) .await .with_context(|| format!("update neon_migration.migration_id to {migration_id}"))?; Ok(()) } /// Prepare the migrations the target database for handling migrations async fn prepare_database(&mut self) -> Result<()> { self.client .simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration") .await?; self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key pg_catalog.int4 NOT NULL PRIMARY KEY, id pg_catalog.int8 NOT NULL DEFAULT 0)").await?; self.client .simple_query( "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING", ) .await?; self.client .simple_query("ALTER SCHEMA neon_migration OWNER TO cloud_admin") .await?; self.client .simple_query("REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC") .await?; Ok(()) } /// Run an individual migration in a separate transaction block. 
async fn run_migration(client: &mut Client, migration_id: i64, migration: &str) -> Result<()> { let mut txn = client .transaction() .await .with_context(|| format!("begin transaction for migration {migration_id}"))?; if migration.starts_with("-- SKIP") { info!("Skipping migration id={}", migration_id); // Even though we are skipping the migration, updating the // migration ID should help keep logic easy to understand when // trying to understand the state of a cluster. Self::update_migration_id(&mut txn, migration_id).await?; } else { info!("Running migration id={}:\n{}\n", migration_id, migration); txn.simple_query(migration) .await .with_context(|| format!("apply migration {migration_id}"))?; Self::update_migration_id(&mut txn, migration_id).await?; } txn.commit() .await .with_context(|| format!("commit transaction for migration {migration_id}"))?; Ok(()) } /// Run the configured set of migrations pub async fn run_migrations(mut self) -> Result<()> { self.prepare_database() .await .context("prepare database to handle migrations")?; let mut current_migration = self.get_migration_id().await? 
as usize; while current_migration < self.migrations.len() { // The index lags the migration ID by 1, so the current migration // ID is also the next index let migration_id = (current_migration + 1) as i64; let migration = self.migrations[current_migration]; let migration = if self.lakebase_mode { migration.replace("neon_superuser", "databricks_superuser") } else { migration.to_string() }; match Self::run_migration(self.client, migration_id, &migration).await { Ok(_) => { info!("Finished migration id={}", migration_id); } Err(e) => { error!("Failed to run migration id={}: {:?}", migration_id, e); DB_MIGRATION_FAILED .with_label_values(&[migration_id.to_string().as_str()]) .inc(); return Err(e); } } current_migration += 1; } Ok(()) } } ================================================ FILE: compute_tools/src/migrations/0001-add_bypass_rls_to_privileged_role.sql ================================================ ALTER ROLE {privileged_role_name} BYPASSRLS; ================================================ FILE: compute_tools/src/migrations/0002-alter_roles.sql ================================================ -- On December 8th, 2023, an engineering escalation (INC-110) was opened after -- it was found that BYPASSRLS was being applied to all roles. -- -- PR that introduced the issue: https://github.com/neondatabase/neon/pull/5657 -- Subsequent commit on main: https://github.com/neondatabase/neon/commit/ad99fa5f0393e2679e5323df653c508ffa0ac072 -- -- NOBYPASSRLS and INHERIT are the defaults for a Postgres role, but because it -- isn't easy to know if a Postgres cluster is affected by the issue, we need to -- keep the migration around for a long time, if not indefinitely, so any -- cluster can be fixed. -- -- Branching is the gift that keeps on giving... 
DO $$ DECLARE role_name text; BEGIN FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member') LOOP RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', pg_catalog.quote_ident(role_name); EXECUTE pg_catalog.format('ALTER ROLE %I INHERIT;', role_name); END LOOP; FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE NOT pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT pg_catalog.starts_with(rolname, 'pg_') LOOP RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', pg_catalog.quote_ident(role_name); EXECUTE pg_catalog.format('ALTER ROLE %I NOBYPASSRLS;', role_name); END LOOP; END $$; ================================================ FILE: compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql ================================================ DO $$ BEGIN IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name = 'server_version_num') THEN EXECUTE 'GRANT pg_create_subscription TO {privileged_role_name}'; END IF; END $$; ================================================ FILE: compute_tools/src/migrations/0004-grant_pg_monitor_to_privileged_role.sql ================================================ GRANT pg_monitor TO {privileged_role_name} WITH ADMIN OPTION; ================================================ FILE: compute_tools/src/migrations/0005-grant_all_on_tables_to_privileged_role.sql ================================================ -- SKIP: Deemed insufficient for allowing relations created by extensions to be -- interacted with by {privileged_role_name} without permission issues. 
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO {privileged_role_name}; ================================================ FILE: compute_tools/src/migrations/0006-grant_all_on_sequences_to_privileged_role.sql ================================================ -- SKIP: Deemed insufficient for allowing relations created by extensions to be -- interacted with by {privileged_role_name} without permission issues. ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO {privileged_role_name}; ================================================ FILE: compute_tools/src/migrations/0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql ================================================ -- SKIP: Moved inline to the handle_grants() functions. ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO {privileged_role_name} WITH GRANT OPTION; ================================================ FILE: compute_tools/src/migrations/0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql ================================================ -- SKIP: Moved inline to the handle_grants() functions. ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO {privileged_role_name} WITH GRANT OPTION; ================================================ FILE: compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql ================================================ -- SKIP: The original goal of this migration was to prevent creating -- subscriptions, but this migration was insufficient. 
DO $$
DECLARE
    role_name TEXT;
BEGIN
    FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE rolreplication IS TRUE
    LOOP
        RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', pg_catalog.quote_ident(role_name);
        EXECUTE pg_catalog.format('ALTER ROLE %I NOREPLICATION;', role_name);
    END LOOP;
END $$;

================================================
FILE: compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql
================================================
DO $$
BEGIN
    IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name OPERATOR(pg_catalog.=) 'server_version_num'::pg_catalog.text) THEN
        EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO {privileged_role_name}';
        EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO {privileged_role_name}';
    END IF;
END $$;

================================================
FILE: compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql
================================================
GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO {privileged_role_name};

================================================
FILE: compute_tools/src/migrations/0012-grant_pg_signal_backend_to_privileged_role.sql
================================================
GRANT pg_signal_backend TO {privileged_role_name} WITH ADMIN OPTION;

================================================
FILE: compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql
================================================
-- Fails unless neon_superuser has the BYPASSRLS attribute.
DO $$
DECLARE
    bypassrls boolean;
BEGIN
    SELECT rolbypassrls INTO bypassrls FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser';
    IF NOT bypassrls THEN
        RAISE EXCEPTION 'neon_superuser cannot bypass RLS';
    END IF;
END $$;

================================================
FILE: compute_tools/src/migrations/tests/0002-alter_roles.sql
================================================
-- Fails if a member of neon_superuser lacks INHERIT, or if a role that is
-- neither a member of neon_superuser nor a pg_* role has BYPASSRLS.
DO $$
DECLARE
    role record;
BEGIN
    FOR role IN
        SELECT rolname AS name, rolinherit AS inherit
        FROM pg_catalog.pg_roles
        WHERE pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member')
    LOOP
        IF NOT role.inherit THEN
            -- Schema-qualified like every other call in these scripts.
            RAISE EXCEPTION '% cannot inherit', pg_catalog.quote_ident(role.name);
        END IF;
    END LOOP;

    FOR role IN
        SELECT rolname AS name, rolbypassrls AS bypassrls
        FROM pg_catalog.pg_roles
        WHERE NOT pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member')
            AND NOT pg_catalog.starts_with(rolname, 'pg_')
    LOOP
        IF role.bypassrls THEN
            RAISE EXCEPTION '% can bypass RLS', pg_catalog.quote_ident(role.name);
        END IF;
    END LOOP;
END $$;

================================================
FILE: compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql
================================================
-- On server versions >= 160000, fails unless neon_superuser is a member of
-- pg_create_subscription. Older versions are skipped.
DO $$
BEGIN
    IF (SELECT pg_catalog.current_setting('server_version_num')::pg_catalog.numeric < 160000) THEN
        RETURN;
    END IF;

    IF NOT (SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN
        RAISE EXCEPTION 'neon_superuser cannot execute pg_create_subscription';
    END IF;
END $$;

================================================
FILE: compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql
================================================
-- Fails unless neon_superuser is a member of pg_monitor with the admin option.
DO $$
DECLARE
    monitor record;
BEGIN
    SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member,
            admin_option AS admin
        INTO monitor
        FROM pg_catalog.pg_auth_members
        WHERE roleid = 'pg_monitor'::pg_catalog.regrole
            AND member = 'neon_superuser'::pg_catalog.regrole;

    IF monitor IS NULL THEN
        RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor';
    END IF;

    -- Check the membership flag itself; the admin flag is checked separately below.
    IF monitor.member IS NULL OR NOT monitor.member THEN
        RAISE EXCEPTION 'neon_superuser is not a member of pg_monitor';
    END IF;

    IF monitor.admin IS NULL OR NOT monitor.admin THEN
        RAISE EXCEPTION 'neon_superuser cannot grant pg_monitor';
    END IF;
END $$;

================================================
FILE:
compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_privileged_role.sql ================================================ -- This test was never written because at the time migration tests were added -- the accompanying migration was already skipped. ================================================ FILE: compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_privileged_role.sql ================================================ -- This test was never written because at the time migration tests were added -- the accompanying migration was already skipped. ================================================ FILE: compute_tools/src/migrations/tests/0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql ================================================ -- This test was never written because at the time migration tests were added -- the accompanying migration was already skipped. ================================================ FILE: compute_tools/src/migrations/tests/0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql ================================================ -- This test was never written because at the time migration tests were added -- the accompanying migration was already skipped. ================================================ FILE: compute_tools/src/migrations/tests/0009-revoke_replication_for_previously_allowed_roles.sql ================================================ -- This test was never written because at the time migration tests were added -- the accompanying migration was already skipped.
================================================ FILE: compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql ================================================ DO $$ DECLARE can_execute boolean; BEGIN SELECT pg_catalog.bool_and(pg_catalog.has_function_privilege('neon_superuser', oid, 'execute')) INTO can_execute FROM pg_catalog.pg_proc WHERE proname IN ('pg_export_snapshot', 'pg_log_standby_snapshot') AND pronamespace = 'pg_catalog'::pg_catalog.regnamespace; IF NOT can_execute THEN RAISE EXCEPTION 'neon_superuser cannot execute both pg_export_snapshot and pg_log_standby_snapshot'; END IF; END $$; ================================================ FILE: compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql ================================================ DO $$ DECLARE can_execute boolean; BEGIN SELECT pg_catalog.has_function_privilege('neon_superuser', oid, 'execute') INTO can_execute FROM pg_catalog.pg_proc WHERE proname = 'pg_show_replication_origin_status' AND pronamespace = 'pg_catalog'::regnamespace; IF NOT can_execute THEN RAISE EXCEPTION 'neon_superuser cannot execute pg_show_replication_origin_status'; END IF; END $$; ================================================ FILE: compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql ================================================ DO $$ DECLARE signal_backend record; BEGIN SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member, admin_option AS admin INTO signal_backend FROM pg_catalog.pg_auth_members WHERE roleid = 'pg_signal_backend'::regrole AND member = 'neon_superuser'::regrole; IF signal_backend IS NULL THEN RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_signal_backend'; END IF; IF signal_backend.member IS NULL OR NOT signal_backend.member THEN RAISE EXCEPTION 'neon_superuser is not a member of pg_signal_backend'; END IF; IF 
signal_backend.admin IS NULL OR NOT signal_backend.admin THEN RAISE EXCEPTION 'neon_superuser cannot grant pg_signal_backend'; END IF; END $$; ================================================ FILE: compute_tools/src/monitor.rs ================================================ use std::sync::Arc; use std::thread; use std::time::Duration; use chrono::{DateTime, Utc}; use compute_api::responses::ComputeStatus; use compute_api::spec::ComputeFeature; use postgres::{Client, NoTls}; use tracing::{Level, error, info, instrument, span}; use crate::compute::ComputeNode; use crate::metrics::{PG_CURR_DOWNTIME_MS, PG_TOTAL_DOWNTIME_MS}; const PG_DEFAULT_INIT_TIMEOUIT: Duration = Duration::from_secs(60); const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500); /// Struct to store runtime state of the compute monitor thread. /// In theory, this could be a part of `Compute`, but i) /// this state is expected to be accessed only by single thread, /// so we don't need to care about locking; ii) `Compute` is /// already quite big. Thus, it seems to be a good idea to keep /// all the activity/health monitoring parts here. struct ComputeMonitor { compute: Arc, /// The moment when Postgres had some activity, /// that should prevent compute from being suspended. last_active: Option>, /// The moment when we last tried to check Postgres. last_checked: DateTime, /// The last moment we did a successful Postgres check. last_up: DateTime, /// Only used for internal statistics change tracking /// between monitor runs and can be outdated. active_time: Option, /// Only used for internal statistics change tracking /// between monitor runs and can be outdated. sessions: Option, /// Use experimental statistics-based activity monitor. It's no longer /// 'experimental' per se, as it's enabled for everyone, but we still /// keep the flag as an option to turn it off in some cases if it will /// misbehave. 
experimental: bool, } impl ComputeMonitor { fn report_down(&self) { let now = Utc::now(); // Calculate and report current downtime // (since the last time Postgres was up) let downtime = now.signed_duration_since(self.last_up); PG_CURR_DOWNTIME_MS.set(downtime.num_milliseconds() as f64); // Calculate and update total downtime // (cumulative duration of Postgres downtime in ms) let inc = now .signed_duration_since(self.last_checked) .num_milliseconds(); PG_TOTAL_DOWNTIME_MS.inc_by(inc as u64); } fn report_up(&mut self) { self.last_up = Utc::now(); PG_CURR_DOWNTIME_MS.set(0.0); } fn downtime_info(&self) -> String { format!( "total_ms: {}, current_ms: {}, last_up: {}", PG_TOTAL_DOWNTIME_MS.get(), PG_CURR_DOWNTIME_MS.get(), self.last_up ) } /// Check if compute is in some terminal or soon-to-be-terminal /// state, then return `true`, signalling the caller that it /// should exit gracefully. Otherwise, return `false`. fn check_interrupts(&mut self) -> bool { let compute_status = self.compute.get_status(); if matches!( compute_status, ComputeStatus::Terminated | ComputeStatus::TerminationPendingFast | ComputeStatus::TerminationPendingImmediate | ComputeStatus::Failed ) { info!( "compute is in {} status, stopping compute monitor", compute_status ); return true; } false } /// Spin in a loop and figure out the last activity time in the Postgres. /// Then update it in the shared state. This function currently never /// errors out explicitly, but there is a graceful termination path. /// Every time we receive an error trying to check Postgres, we use /// [`ComputeMonitor::check_interrupts()`] because it could be that /// compute is being terminated already, then we can exit gracefully /// to not produce errors' noise in the log. /// NB: the only expected panic is at `Mutex` unwrap(), all other errors /// should be handled gracefully. 
#[instrument(skip_all)]
pub fn run(&mut self) -> anyhow::Result<()> {
    // Suppose that `connstr` doesn't change
    let connstr = self.compute.params.connstr.clone();
    let conf = self
        .compute
        .get_conn_conf(Some("compute_ctl:compute_monitor"));

    // During startup and configuration we connect to every Postgres database,
    // but we don't want to count this as some user activity. So wait until
    // the compute fully started before monitoring activity.
    wait_for_postgres_start(&self.compute);

    // Define `client` outside of the loop to reuse existing connection if it's active.
    let mut client = conf.connect(NoTls);

    info!("starting compute monitor for {}", connstr);

    loop {
        if self.check_interrupts() {
            break;
        }

        // `true` only when the check succeeded on a live connection. Every
        // failure mode below shares the same recovery sequence after the match.
        let postgres_ok = match &mut client {
            Ok(cli) if cli.is_closed() => {
                info!(
                    downtime_info = self.downtime_info(),
                    "connection to Postgres is closed, trying to reconnect"
                );
                false
            }
            Ok(cli) => match self.check(cli) {
                Ok(_) => {
                    self.report_up();
                    self.compute.update_last_active(self.last_active);
                    true
                }
                Err(e) => {
                    // Although we have many places where we can return errors in `check()`,
                    // normally it shouldn't happen. I.e., we will likely return error if
                    // connection got broken, query timed out, Postgres returned invalid data, etc.
                    // In all such cases it's suspicious, so let's report this as downtime.
                    error!(
                        downtime_info = self.downtime_info(),
                        "could not check Postgres: {}", e
                    );
                    false
                }
            },
            Err(e) => {
                info!(
                    downtime_info = self.downtime_info(),
                    "could not connect to Postgres: {}, retrying", e
                );
                false
            }
        };

        if !postgres_ok {
            // The compute may already be shutting down; if so, leave quietly
            // instead of reporting downtime.
            if self.check_interrupts() {
                break;
            }

            self.report_down();

            // Reconnect to Postgres just in case. During tests, I noticed
            // that queries in `check()` can fail with `connection closed`,
            // but `cli.is_closed()` above doesn't detect it. Even if old
            // connection is still alive, it will be dropped when we reassign
            // `client` to a new connection.
            client = conf.connect(NoTls);
        }

        // Reset the `last_checked` timestamp and sleep before the next iteration.
        self.last_checked = Utc::now();
        thread::sleep(MONITOR_CHECK_INTERVAL);
    }

    // Graceful termination path
    Ok(())
}

/// Run one round of activity detection over `cli` and update
/// `self.last_active` when activity is found.
///
/// The checks run in order and the function returns at the first one that
/// detects activity: database statistics (only when `self.experimental` is
/// set), client backend state changes, walsenders, active logical
/// replication subscriptions, and autovacuum workers.
///
/// Returns an error if any of the queries fails or its result cannot be read.
#[instrument(skip_all)]
fn check(&mut self, cli: &mut Client) -> anyhow::Result<()> {
    // This is new logic, only enable if the feature flag is set.
    // TODO: remove this once we are sure that it works OR drop it altogether.
    if self.experimental {
        // Check if the total active time or sessions across all databases has changed.
        // If it did, it means that user executed some queries. In theory, it can even go down if
        // some databases were dropped, but it's still user activity.
        let (active_time, sessions) = get_database_stats(cli)
            .map_err(|e| anyhow::anyhow!("could not get database statistics: {}", e))?;

        // Compare against the previous sample; there is nothing to compare
        // with on the very first run.
        let active_time_changed = self.active_time.is_some_and(|prev| prev != active_time);
        let sessions_changed = self.sessions.is_some_and(|prev| prev != sessions);
        self.active_time = Some(active_time);
        self.sessions = Some(sessions);

        if active_time_changed || sessions_changed {
            // Update the last active time and continue, we don't need to
            // check backends state change.
            self.last_active = Some(Utc::now());
            return Ok(());
        }
    }

    // If database statistics are the same, check all backends for state changes.
    // Maybe there are some with more recent activity. `get_backends_state_change()`
    // can return None or stale timestamp, so it's `compute.update_last_active()`
    // responsibility to check if the new timestamp is more recent than the current one.
    // This helps us to discover new sessions that have not done anything yet.
    let backends_last_active = get_backends_state_change(cli)
        .map_err(|e| anyhow::anyhow!("could not get backends state change: {}", e))?;
    if let Some(candidate) = backends_last_active {
        let is_newer = match self.last_active {
            Some(prev_last_active) => candidate > prev_last_active,
            None => true,
        };
        if is_newer {
            self.last_active = Some(candidate);
            return Ok(());
        }
    }

    // If there are existing (logical) walsenders, do not suspend.
    //
    // N.B. walproposer doesn't currently show up in pg_stat_replication,
    // but protect if it will.
    const WS_COUNT_QUERY: &str =
        "select count(*) from pg_stat_replication where application_name != 'walproposer';";
    let num_ws = cli
        .query_one(WS_COUNT_QUERY, &[])
        .map_err(|e| anyhow::anyhow!("failed to get list of walsenders: {}", e))?
        .try_get::<&str, i64>("count")
        .map_err(|e| anyhow::Error::from(e).context("failed to parse walsenders count"))?;
    if num_ws > 0 {
        self.last_active = Some(Utc::now());
        return Ok(());
    }

    // Don't suspend compute if there is an active logical replication subscription
    //
    // `where pid is not null` – to filter out read only computes and subscription on branches
    const LOGICAL_SUBSCRIPTIONS_QUERY: &str =
        "select count(*) from pg_stat_subscription where pid is not null;";
    let num_subscribers = cli
        .query_one(LOGICAL_SUBSCRIPTIONS_QUERY, &[])
        .map_err(|e| {
            anyhow::anyhow!(
                "failed to get list of active logical replication subscriptions: {}",
                e
            )
        })?
        .try_get::<&str, i64>("count")
        .map_err(|e| anyhow::anyhow!("failed to parse 'pg_stat_subscription' count: {}", e))?;
    if num_subscribers > 0 {
        self.last_active = Some(Utc::now());
        return Ok(());
    }

    // Do not suspend compute if autovacuum is running
    const AUTOVACUUM_COUNT_QUERY: &str =
        "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";
    let num_workers = cli
        .query_one(AUTOVACUUM_COUNT_QUERY, &[])
        .map_err(|e| anyhow::anyhow!("failed to get list of autovacuum workers: {}", e))?
        .try_get::<&str, i64>("count")
        .map_err(|e| anyhow::anyhow!("failed to parse autovacuum workers count: {}", e))?;
    if num_workers > 0 {
        self.last_active = Some(Utc::now());
        return Ok(());
    }

    Ok(())
}
} // end of `impl ComputeMonitor`

// Hang on condition variable waiting until the compute status is `Running`.
//
// Outside of lakebase mode this waits indefinitely. In lakebase mode it wakes
// up at least every 5 seconds and exits the whole process if Postgres was
// started but the compute did not reach `Running` within the init timeout
// (`params.pg_init_timeout`, falling back to the 60 second default).
fn wait_for_postgres_start(compute: &ComputeNode) {
    let mut state = compute.state.lock().unwrap();
    let pg_init_timeout = compute
        .params
        .pg_init_timeout
        .unwrap_or(PG_DEFAULT_INIT_TIMEOUIT);

    while state.status != ComputeStatus::Running {
        info!("compute is not running, waiting before monitoring activity");
        if !compute.params.lakebase_mode {
            // Plain blocking wait; the loop condition re-checks the status.
            state = compute.state_changed.wait(state).unwrap();
            continue;
        }

        // A negative elapsed time (clock stepped back) is treated as zero.
        let init_timed_out = state.pg_start_time.is_some_and(|pg_start_time| {
            Utc::now()
                .signed_duration_since(pg_start_time)
                .to_std()
                .unwrap_or_default()
                > pg_init_timeout
        });
        if init_timed_out {
            // If Postgres isn't up and running with working PS/SK connections within `pg_init_timeout`, it is
            // possible that we started Postgres with a wrong spec (so it is talking to the wrong PS/SK nodes). To prevent
            // deadends we simply exit the compute node process so it can restart with the latest spec.
            //
            // NB: We skip this check if we have not attempted to start PG yet (indicated by `state.pg_start_time == None`).
            // This is to make sure the more appropriate errors are surfaced if we encounter issues before we even attempt
            // to start PG (e.g., if we can't pull the spec, can't sync safekeepers, or can't get the basebackup).
            error!(
                "compute did not enter Running state in {} seconds, exiting",
                pg_init_timeout.as_secs()
            );
            std::process::exit(1);
        }
        state = compute
            .state_changed
            .wait_timeout(state, Duration::from_secs(5))
            .unwrap()
            .0;
    }
}

// Figure out the total active time and sessions across all non-system databases.
// Returned tuple is `(active_time, sessions)`.
// It can return `0.0` active time or `0` sessions, which means no user databases exist OR
// it was a start with skipped `pg_catalog` updates and user didn't do any queries
// (or open any sessions) yet.
fn get_database_stats(cli: &mut Client) -> anyhow::Result<(f64, i64)> {
    // Filter out `postgres` database as `compute_ctl` and other monitoring tools
    // like `postgres_exporter` use it to query Postgres statistics.
    // Use explicit 8 bytes type casts to match Rust types.
    //
    // NB: `COALESCE` is SQL syntax rather than a function stored in
    // `pg_catalog`, so it must not be schema-qualified: `pg_catalog.coalesce(..)`
    // is looked up as an ordinary function and fails.
    let stats = cli
        .query_one(
            "SELECT coalesce(pg_catalog.sum(active_time), 0.0)::pg_catalog.float8 AS total_active_time, \
             coalesce(pg_catalog.sum(sessions), 0)::pg_catalog.bigint AS total_sessions \
             FROM pg_catalog.pg_stat_database \
             WHERE datname NOT IN ('postgres', 'template0', 'template1');",
            &[],
        )
        .map_err(|e| anyhow::anyhow!("could not query active_time: {}", e))?;

    let active_time: f64 = stats
        .try_get("total_active_time")
        .map_err(|e| anyhow::anyhow!("could not get total_active_time: {}", e))?;

    let sessions: i64 = stats
        .try_get("total_sessions")
        .map_err(|e| anyhow::anyhow!("could not get total_sessions: {}", e))?;

    Ok((active_time, sessions))
}

// Figure out the most recent state change time across all client backends.
// If there is currently active backend, timestamp will be `Utc::now()`.
// It can return `None`, which means no client backends exist or we were
// unable to parse the timestamp.
fn get_backends_state_change(cli: &mut Client) -> anyhow::Result>> { let mut last_active: Option> = None; // Get all running client backends except ourself, use RFC3339 DateTime format. let backends = cli.query( "SELECT state, pg_catalog.to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"'::pg_catalog.text) AS state_change FROM pg_stat_activity WHERE backend_type OPERATOR(pg_catalog.=) 'client backend'::pg_catalog.text AND pid OPERATOR(pg_catalog.!=) pg_catalog.pg_backend_pid() AND usename OPERATOR(pg_catalog.!=) 'cloud_admin'::pg_catalog.name;", // XXX: find a better way to filter other monitors? &[], ); match backends { Ok(backs) => { let mut idle_backs: Vec> = vec![]; for b in backs.into_iter() { let state: String = match b.try_get("state") { Ok(state) => state, Err(_) => continue, }; if state == "idle" { let change: String = match b.try_get("state_change") { Ok(state_change) => state_change, Err(_) => continue, }; let change = DateTime::parse_from_rfc3339(&change); match change { Ok(t) => idle_backs.push(t.with_timezone(&Utc)), Err(e) => { info!("cannot parse backend state_change DateTime: {}", e); continue; } } } else { // Found non-idle backend, so the last activity is NOW. // Return immediately, no need to check other backends. return Ok(Some(Utc::now())); } } // Get idle backend `state_change` with the max timestamp. if let Some(last) = idle_backs.iter().max() { last_active = Some(*last); } } Err(e) => { return Err(anyhow::anyhow!("could not query backends: {}", e)); } } Ok(last_active) } /// Launch a separate compute monitor thread and return its `JoinHandle`. 
///
/// The monitor starts with no recorded activity and with both `last_checked`
/// and `last_up` set to the launch time. The statistics-based check is
/// enabled when the compute has the `ActivityMonitorExperimental` feature.
///
/// Panics if the thread cannot be spawned.
pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
    let compute = Arc::clone(compute);
    let experimental = compute.has_feature(ComputeFeature::ActivityMonitorExperimental);
    let now = Utc::now();
    let mut monitor = ComputeMonitor {
        compute,
        last_active: None,
        last_checked: now,
        last_up: now,
        active_time: None,
        sessions: None,
        experimental,
    };

    thread::Builder::new()
        .name("compute-monitor".into())
        .spawn(move || {
            // Keep the span entered for the whole lifetime of the thread.
            let span = span!(Level::INFO, "compute_monitor");
            let _enter = span.enter();
            match monitor.run() {
                Ok(_) => info!("compute monitor thread terminated gracefully"),
                Err(err) => error!("compute monitor thread terminated abnormally {:?}", err),
            }
        })
        .expect("cannot launch compute monitor thread")
}

================================================
FILE: compute_tools/src/params.rs
================================================
pub const DEFAULT_LOG_LEVEL: &str = "info";
// From Postgres docs:
// To ease transition from the md5 method to the newer SCRAM method, if md5 is specified
// as a method in pg_hba.conf but the user's password on the server is encrypted for SCRAM
// (see below), then SCRAM-based authentication will automatically be chosen instead.
// https://www.postgresql.org/docs/15/auth-password.html
//
// So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles.
pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\tall\t\tmd5"; ================================================ FILE: compute_tools/src/pg_helpers.rs ================================================ use std::collections::HashMap; use std::fmt::Write; use std::fs; use std::fs::File; use std::io::{BufRead, BufReader}; use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::Child; use std::str::FromStr; use std::time::{Duration, Instant}; use anyhow::{Result, bail}; use compute_api::responses::TlsConfig; use compute_api::spec::{ Database, DatabricksSettings, GenericOption, GenericOptions, PgIdent, Role, }; use futures::StreamExt; use indexmap::IndexMap; use ini::Ini; use notify::{RecursiveMode, Watcher}; use postgres::config::Config; use tokio::io::AsyncBufReadExt; use tokio::task::JoinHandle; use tokio::time::timeout; use tokio_postgres; use tokio_postgres::NoTls; use tracing::{debug, error, info, instrument}; const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds /// Escape a string for including it in a SQL literal. /// /// Wrapping the result with `E'{}'` or `'{}'` is not required, /// as it returns a ready-to-use SQL string literal, e.g. `'db'''` or `E'db\\'`. /// See /// for the original implementation. pub fn escape_literal(s: &str) -> String { let res = s.replace('\'', "''").replace('\\', "\\\\"); if res.contains('\\') { format!("E'{res}'") } else { format!("'{res}'") } } /// Escape a string so that it can be used in postgresql.conf. Wrapping the result /// with `'{}'` is not required, as it returns a ready-to-use config string. pub fn escape_conf_value(s: &str) -> String { let res = s.replace('\'', "''").replace('\\', "\\\\"); format!("'{res}'") } pub trait GenericOptionExt { fn to_pg_option(&self) -> String; fn to_pg_setting(&self) -> String; } impl GenericOptionExt for GenericOption { /// Represent `GenericOption` as SQL statement parameter. 
fn to_pg_option(&self) -> String { if let Some(val) = &self.value { match self.vartype.as_ref() { "string" => format!("{} {}", self.name, escape_literal(val)), _ => format!("{} {}", self.name, val), } } else { self.name.to_owned() } } /// Represent `GenericOption` as configuration option. fn to_pg_setting(&self) -> String { if let Some(val) = &self.value { match self.vartype.as_ref() { "string" => format!("{} = {}", self.name, escape_conf_value(val)), _ => format!("{} = {}", self.name, val), } } else { self.name.to_owned() } } } pub trait PgOptionsSerialize { fn as_pg_options(&self) -> String; fn as_pg_settings(&self) -> String; } impl PgOptionsSerialize for GenericOptions { /// Serialize an optional collection of `GenericOption`'s to /// Postgres SQL statement arguments. fn as_pg_options(&self) -> String { if let Some(ops) = &self { ops.iter() .map(|op| op.to_pg_option()) .collect::>() .join(" ") } else { "".to_string() } } /// Serialize an optional collection of `GenericOption`'s to /// `postgresql.conf` compatible format. fn as_pg_settings(&self) -> String { if let Some(ops) = &self { ops.iter() .map(|op| op.to_pg_setting()) .collect::>() .join("\n") + "\n" // newline after last setting } else { "".to_string() } } } pub trait GenericOptionsSearch { fn find(&self, name: &str) -> Option; fn find_ref(&self, name: &str) -> Option<&GenericOption>; } impl GenericOptionsSearch for GenericOptions { /// Lookup option by name fn find(&self, name: &str) -> Option { let ops = self.as_ref()?; let op = ops.iter().find(|s| s.name == name)?; op.value.clone() } /// Lookup option by name, returning ref fn find_ref(&self, name: &str) -> Option<&GenericOption> { let ops = self.as_ref()?; ops.iter().find(|s| s.name == name) } } pub trait RoleExt { fn to_pg_options(&self) -> String; } impl RoleExt for Role { /// Serialize a list of role parameters into a Postgres-acceptable /// string of arguments. 
fn to_pg_options(&self) -> String { // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane. let mut params: String = self.options.as_pg_options(); params.push_str(" LOGIN"); if let Some(pass) = &self.encrypted_password { // Some time ago we supported only md5 and treated all encrypted_password as md5. // Now we also support SCRAM-SHA-256 and to preserve compatibility // we treat all encrypted_password as md5 unless they starts with SCRAM-SHA-256. if pass.starts_with("SCRAM-SHA-256") { write!(params, " PASSWORD '{pass}'") .expect("String is documented to not to error during write operations"); } else { write!(params, " PASSWORD 'md5{pass}'") .expect("String is documented to not to error during write operations"); } } else { params.push_str(" PASSWORD NULL"); } params } } pub trait DatabaseExt { fn to_pg_options(&self) -> String; } impl DatabaseExt for Database { /// Serialize a list of database parameters into a Postgres-acceptable /// string of arguments. /// NB: `TEMPLATE` is actually also an identifier, but so far we only need /// to use `template0` and `template1`, so it is not a problem. Yet in the future /// it may require a proper quoting too. 
fn to_pg_options(&self) -> String {
    let mut params: String = self.options.as_pg_options();
    // The owner is an identifier, so it is double-quoted rather than
    // rendered as a string literal.
    write!(params, " OWNER {}", &self.owner.pg_quote())
        .expect("String is documented to not to error during write operations");

    params
}
} // end of `impl DatabaseExt for Database`

/// Rendering of `DatabricksSettings` as `postgresql.conf` lines.
pub trait DatabricksSettingsExt {
    fn as_pg_settings(&self) -> String;
}

impl DatabricksSettingsExt for DatabricksSettings {
    /// Render the settings as newline-terminated `name = value` lines.
    fn as_pg_settings(&self) -> String {
        // Postgres GUCs rendered from DatabricksSettings
        vec![
            // ssl_ca_file
            Some(format!(
                "ssl_ca_file = '{}'",
                self.pg_compute_tls_settings.ca_file
            )),
            // [Optional] databricks.workspace_url
            Some(format!(
                "databricks.workspace_url = '{}'",
                &self.databricks_workspace_host
            )),
            // todo(vikas.jain): these are not required anymore as they are moved to static
            // conf but keeping these to avoid image mismatch between hcc and pg.
            // Once hcc and pg are in sync, we can remove these.
            //
            // databricks.enable_databricks_identity_login
            Some("databricks.enable_databricks_identity_login = true".to_string()),
            // databricks.enable_sql_restrictions
            Some("databricks.enable_sql_restrictions = true".to_string()),
        ]
        .into_iter()
        // Removes `None`s
        .flatten()
        .collect::<Vec<String>>()
        .join("\n")
            + "\n"
    }
}

/// Generic trait used to provide quoting / encoding for strings used in the
/// Postgres SQL queries and DATABASE_URL.
pub trait Escaping {
    fn pg_quote(&self) -> String;
    fn pg_quote_dollar(&self) -> (String, String);
}

impl Escaping for PgIdent {
    /// This is intended to mimic Postgres quote_ident(), but for simplicity it
    /// always quotes provided string with `""` and escapes every `"`.
    /// **Not idempotent**, i.e. if string is already escaped it will be escaped again.
    /// N.B. it's not useful for escaping identifiers that are used inside WHERE
    /// clause, use `escape_literal()` instead.
    fn pg_quote(&self) -> String {
        format!("\"{}\"", self.replace('"', "\"\""))
    }

    /// This helper is intended to be used for dollar-escaping strings for usage
    /// inside PL/pgSQL procedures.
/// In addition to dollar-escaping the string,
/// it also returns a tag that is intended to be used inside the outer
/// PL/pgSQL procedure. If you do not need an outer tag, just discard it.
/// Here we somewhat mimic the logic of Postgres' `pg_get_functiondef()`.
fn pg_quote_dollar(&self) -> (String, String) {
    let mut tag: String = "x".to_string();

    // Find the first suitable tag that is not present in the string.
    // Postgres' max role/DB name length is 63 bytes, so even in the
    // worst case it won't take long.
    while self.contains(&tag) {
        tag += "x";
    }

    // Outer tag is always `tag + "x"`, so if `tag` is not present in the
    // string, `outer_tag` is not present in the string either.
    let outer_tag = format!("{tag}x");
    let escaped = format!("${tag}${self}${tag}$");

    (escaped, outer_tag)
}
} // end of `impl Escaping for PgIdent`

/// Build a list of existing Postgres roles
///
/// Rows that fail to be fetched are silently dropped from the result.
pub async fn get_existing_roles_async(client: &tokio_postgres::Client) -> Result<Vec<Role>> {
    // NOTE(review): the turbofish arguments were lost in extraction; they are
    // restored here as the usual "no parameters" form for `query_raw`, confirm
    // against the upstream file.
    let postgres_roles = client
        .query_raw::<str, &String, &[String; 0]>(
            "SELECT rolname, rolpassword FROM pg_catalog.pg_authid",
            &[],
        )
        .await?
        .filter_map(|row| async { row.ok() })
        .map(|row| Role {
            name: row.get("rolname"),
            encrypted_password: row.get("rolpassword"),
            options: None,
        })
        .collect()
        .await;

    Ok(postgres_roles)
}

/// Build a list of existing Postgres databases
///
/// The result is keyed by database name. Rows that fail to be fetched are
/// silently dropped from the result.
pub async fn get_existing_dbs_async(
    client: &tokio_postgres::Client,
) -> Result<HashMap<String, Database>> {
    // `pg_database.datconnlimit = -2` means that the database is in the
    // invalid state. See:
    //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
    let rowstream = client
        // We use a subquery instead of a fancy `datdba::regrole::text AS owner`,
        // because the latter automatically wraps the result in double quotes,
        // if the role name contains special characters.
        // NOTE(review): turbofish restored after extraction loss, see
        // `get_existing_roles_async`.
        .query_raw::<str, &String, &[String; 0]>(
            "SELECT datname AS name, \
             (SELECT rolname FROM pg_catalog.pg_roles WHERE oid OPERATOR(pg_catalog.=) datdba) AS owner, \
             NOT datallowconn AS restrict_conn, \
             datconnlimit OPERATOR(pg_catalog.=) (OPERATOR(pg_catalog.-) 2) AS invalid \
             FROM pg_catalog.pg_database;",
            &[],
        )
        .await?;

    let dbs_map = rowstream
        .filter_map(|r| async { r.ok() })
        .map(|row| Database {
            name: row.get("name"),
            owner: row.get("owner"),
            restrict_conn: row.get("restrict_conn"),
            invalid: row.get("invalid"),
            options: None,
        })
        .map(|db| (db.name.clone(), db.clone()))
        .collect::<HashMap<_, _>>()
        .await;

    Ok(dbs_map)
}

/// Wait for Postgres to become ready to accept connections. It's ready to
/// accept connections when the state-field in `pgdata/postmaster.pid` says
/// 'ready'.
///
/// Returns an error if the Postgres process exits while we wait, if the file
/// watcher cannot be set up, or if `POSTGRES_WAIT_TIMEOUT` elapses first.
#[instrument(skip_all, fields(pgdata = %pgdata.display()))]
pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
    let pid_path = pgdata.join("postmaster.pid");

    // PostgreSQL writes line "ready" to the postmaster.pid file, when it has
    // completed initialization and is ready to accept connections. We want to
    // react quickly and perform the rest of our initialization as soon as
    // PostgreSQL starts accepting connections. Use 'notify' to be notified
    // whenever the PID file is changed, and whenever it changes, read it to
    // check if it's now "ready".
    //
    // You cannot actually watch a file before it exists, so we first watch the
    // data directory, and once the postmaster.pid file appears, we switch to
    // watch the file instead. We also wake up every 100 ms to poll, just in
    // case we miss some events for some reason. Not strictly necessary, but
    // better safe than sorry.
    let (tx, rx) = std::sync::mpsc::channel();
    let watcher_res = notify::recommended_watcher(move |res| {
        let _ = tx.send(res);
    });
    let (mut watcher, rx): (Box<dyn Watcher>, _) = match watcher_res {
        Ok(watcher) => (Box::new(watcher), rx),
        Err(e) => {
            match e.kind {
                notify::ErrorKind::Io(os) if os.raw_os_error() == Some(38) => {
                    // docker on m1 macs does not support recommended_watcher
                    // but return "Function not implemented (os error 38)"
                    // see https://github.com/notify-rs/notify/issues/423
                    let (tx, rx) = std::sync::mpsc::channel();

                    // let's poll it faster than what we check the results for (100ms)
                    let config =
                        notify::Config::default().with_poll_interval(Duration::from_millis(50));

                    let watcher = notify::PollWatcher::new(
                        move |res| {
                            let _ = tx.send(res);
                        },
                        config,
                    )?;

                    (Box::new(watcher), rx)
                }
                _ => return Err(e.into()),
            }
        }
    };

    watcher.watch(pgdata, RecursiveMode::NonRecursive)?;

    let started_at = Instant::now();
    let mut postmaster_pid_seen = false;
    loop {
        if let Ok(Some(status)) = pg.try_wait() {
            // Postgres exited, that is not what we expected, bail out earlier.
            let code = status.code().unwrap_or(-1);
            bail!("Postgres exited unexpectedly with code {}", code);
        }

        let res = rx.recv_timeout(Duration::from_millis(100));
        debug!("woken up by notify: {res:?}");
        // If there are multiple events in the channel already, we only need to be
        // check once. Swallow the extra events before we go ahead to check the
        // pid file.
        while let Ok(res) = rx.try_recv() {
            debug!("swallowing extra event: {res:?}");
        }

        // Check that we can open pid file first.
        if let Ok(file) = File::open(&pid_path) {
            if !postmaster_pid_seen {
                debug!("postmaster.pid appeared");
                watcher
                    .unwatch(pgdata)
                    .expect("Failed to remove pgdata dir watch");
                watcher
                    .watch(&pid_path, RecursiveMode::NonRecursive)
                    .expect("Failed to add postmaster.pid file watch");
                postmaster_pid_seen = true;
            }

            let file = BufReader::new(file);
            let last_line = file.lines().last();

            // Pid file could be there and we could read it, but it could be empty, for example.
            if let Some(Ok(line)) = last_line {
                let status = line.trim();
                debug!("last line of postmaster.pid: {status:?}");

                // Now Postgres is ready to accept connections
                if status == "ready" {
                    break;
                }
            }
        }

        // Give up after POSTGRES_WAIT_TIMEOUT.
        let duration = started_at.elapsed();
        if duration >= POSTGRES_WAIT_TIMEOUT {
            bail!("timed out while waiting for Postgres to start");
        }
    }

    tracing::info!("PostgreSQL is now running, continuing to configure it");

    Ok(())
}

/// Remove `pgdata` directory and create it again with right permissions.
pub fn create_pgdata(pgdata: &str) -> Result<()> {
    // Ignore removal error, likely it is a 'No such file or directory (os error 2)'.
    // If it is something different then create_dir() will error out anyway.
    let _ok = fs::remove_dir_all(pgdata);
    fs::create_dir(pgdata)?;
    fs::set_permissions(pgdata, fs::Permissions::from_mode(0o700))?;

    Ok(())
}

/// Update pgbouncer.ini with provided options
///
/// Every entry of `pgbouncer_config` is written into the `[pgbouncer]`
/// section of the file at `pgbouncer_ini_path`. Returns an error if the file
/// cannot be loaded or written, or if it has no `[pgbouncer]` section.
fn update_pgbouncer_ini(
    pgbouncer_config: IndexMap<String, String>,
    pgbouncer_ini_path: &str,
) -> Result<()> {
    let mut conf = Ini::load_from_file(pgbouncer_ini_path)?;
    // A missing section is reported as an error instead of a panic, since
    // this function already returns `Result`.
    let Some(section) = conf.section_mut(Some("pgbouncer")) else {
        bail!(
            "no [pgbouncer] section found in {}",
            pgbouncer_ini_path
        );
    };

    for (option_name, value) in pgbouncer_config.iter() {
        section.insert(option_name, value);
        debug!(
            "Updating pgbouncer.ini with new values {}={}",
            option_name, value
        );
    }

    conf.write_to_file(pgbouncer_ini_path)?;
    Ok(())
}

/// Tune pgbouncer.
/// 1. Apply new config using pgbouncer admin console
/// 2.
Add new values to pgbouncer.ini to preserve them after restart pub async fn tune_pgbouncer( mut pgbouncer_config: IndexMap, tls_config: Option, ) -> Result<()> { let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() { // for VMs use pgbouncer specific way to connect to // pgbouncer admin console without password // when pgbouncer is running under the same user. "host=/tmp port=6432 dbname=pgbouncer user=pgbouncer".to_string() } else { // for k8s use normal connection string with password // to connect to pgbouncer admin console let mut pgbouncer_connstr = "host=localhost port=6432 dbname=pgbouncer user=postgres sslmode=disable".to_string(); if let Ok(pass) = std::env::var("PGBOUNCER_PASSWORD") { pgbouncer_connstr.push_str(format!(" password={pass}").as_str()); } pgbouncer_connstr }; info!( "Connecting to pgbouncer with connection string: {}", pgbouncer_connstr ); // connect to pgbouncer, retrying several times // because pgbouncer may not be ready yet let mut retries = 3; let client = loop { match tokio_postgres::connect(&pgbouncer_connstr, NoTls).await { Ok((client, connection)) => { tokio::spawn(async move { if let Err(e) = connection.await { eprintln!("connection error: {e}"); } }); break client; } Err(e) => { if retries == 0 { return Err(e.into()); } error!("Failed to connect to pgbouncer: pgbouncer_connstr {}", e); retries -= 1; tokio::time::sleep(Duration::from_secs(1)).await; } } }; if let Some(tls_config) = tls_config { // pgbouncer starts in a half-ok state if it cannot find these files. // It will default to client_tls_sslmode=deny, which causes proxy to error. // There is a small window at startup where these files don't yet exist in the VM. // Best to wait until it exists. 
loop { if let Ok(true) = tokio::fs::try_exists(&tls_config.key_path).await { break; } tokio::time::sleep(Duration::from_millis(500)).await } pgbouncer_config.insert("client_tls_cert_file".to_string(), tls_config.cert_path); pgbouncer_config.insert("client_tls_key_file".to_string(), tls_config.key_path); pgbouncer_config.insert("client_tls_sslmode".to_string(), "allow".to_string()); } // save values to pgbouncer.ini // so that they are preserved after pgbouncer restart let pgbouncer_ini_path = if std::env::var_os("AUTOSCALING").is_some() { // in VMs we use /etc/pgbouncer.ini "/etc/pgbouncer.ini".to_string() } else { // in pods we use /var/db/postgres/pgbouncer/pgbouncer.ini // this is a shared volume between pgbouncer and postgres containers // FIXME: fix permissions for this file "/var/db/postgres/pgbouncer/pgbouncer.ini".to_string() }; update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?; info!("Applying pgbouncer setting change"); if let Err(err) = client.simple_query("RELOAD").await { // Don't fail on error, just print it into log error!("Failed to apply pgbouncer setting change, {err}",); }; Ok(()) } /// Spawn a task that will read Postgres logs from `stderr`, join multiline logs /// and send them to the logger. In the future we may also want to add context to /// these logs. pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle> { tokio::spawn(async move { let stderr = tokio::process::ChildStderr::from_std(stderr)?; handle_postgres_logs_async(stderr).await }) } /// Read Postgres logs from `stderr` until EOF. 
Buffer is flushed on one of the following conditions: /// - next line starts with timestamp /// - EOF /// - no new lines were written for the last 100 milliseconds async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> { let mut lines = tokio::io::BufReader::new(stderr).lines(); let timeout_duration = Duration::from_millis(100); let ts_regex = regex::Regex::new(r"^\d+-\d{2}-\d{2} \d{2}:\d{2}:\d{2}").expect("regex is valid"); let mut buf = vec![]; loop { let next_line = timeout(timeout_duration, lines.next_line()).await; // we should flush lines from the buffer if we cannot continue reading multiline message let should_flush_buf = match next_line { // Flushing if new line starts with timestamp Ok(Ok(Some(ref line))) => ts_regex.is_match(line), // Flushing on EOF, timeout or error _ => true, }; if !buf.is_empty() && should_flush_buf { // join multiline message into a single line, separated by unicode Zero Width Space. // "PG:" suffix is used to distinguish postgres logs from other logs. let combined = format!("PG:{}\n", buf.join("\u{200B}")); buf.clear(); // sync write to stderr to avoid interleaving with other logs use std::io::Write; let res = std::io::stderr().lock().write_all(combined.as_bytes()); if let Err(e) = res { tracing::error!("error while writing to stderr: {}", e); } } // if not timeout, append line to the buffer if next_line.is_ok() { match next_line?? { Some(line) => buf.push(line), // EOF None => break, }; } } Ok(()) } /// `Postgres::config::Config` handles database names with whitespaces /// and special characters properly. 
pub fn postgres_conf_for_db(connstr: &url::Url, dbname: &str) -> Result<Config> {
    // Parse the base connection string, then override only the database name.
    let mut conf = Config::from_str(connstr.as_str())?;
    conf.dbname(dbname);
    Ok(conf)
}

================================================
FILE: compute_tools/src/pg_isready.rs
================================================
use anyhow::{Context, anyhow};

// Run `/usr/local/bin/pg_isready -p {port}`
// Check the connectivity of PG
// Success means PG is listening on the port and accepting connections
// Note that PG does not need to authenticate the connection, nor reserve a connection quota for it.
// See https://www.postgresql.org/docs/current/app-pg-isready.html
//
// Returns an error if the process cannot be spawned or waited for, or if it
// exits with a non-success status.
pub fn pg_isready(bin: &str, port: u16) -> anyhow::Result<()> {
    let child_result = std::process::Command::new(bin)
        .arg("-p")
        .arg(port.to_string())
        .spawn();

    child_result
        .context("spawn() failed")
        .and_then(|mut child| child.wait().context("wait() failed"))
        .and_then(|status| {
            if status.success() {
                Ok(())
            } else {
                Err(anyhow!("process exited with {status}"))
            }
        })
        // wrap any prior error with the overall context that we couldn't run the command
        .with_context(|| format!("could not run `{bin} --port {port}`"))
}

// It's safe to assume pg_isready is under the same directory with postgres,
// because it is a PG util bin installed along with postgres
//
// Replaces the last path component of `pgbin` with `pg_isready`. A `pgbin`
// without any directory part (e.g. just `postgres`) yields a bare
// `pg_isready` rather than `/pg_isready`, so that it is not looked up in the
// filesystem root.
pub fn get_pg_isready_bin(pgbin: &str) -> String {
    match pgbin.rsplit_once('/') {
        Some((dir, _binary)) => format!("{dir}/pg_isready"),
        None => "pg_isready".to_string(),
    }
}

================================================
FILE: compute_tools/src/pgbouncer.rs
================================================
pub const PGBOUNCER_PIDFILE: &str = "/tmp/pgbouncer.pid";

================================================
FILE: compute_tools/src/rsyslog.rs
================================================
use std::fs;
use std::io::ErrorKind;
use std::path::Path;
use std::process::Command;
use std::time::Duration;
use std::{fs::OpenOptions, io::Write};
use url::{Host, Url};

use anyhow::{Context, Result, anyhow};
use
hostname_validator;
use tracing::{error, info, instrument, warn};

const POSTGRES_LOGS_CONF_PATH: &str = "/etc/rsyslog.d/postgres_logs.conf";

/// Look up the PID of a running `rsyslogd` using `pgrep`.
///
/// Returns the trimmed `pgrep` output, or `None` if `pgrep` printed nothing.
/// Panics if `pgrep` cannot be executed or prints invalid UTF-8.
fn get_rsyslog_pid() -> Option<String> {
    let output = Command::new("pgrep")
        .arg("rsyslogd")
        .output()
        .expect("Failed to execute pgrep");

    if !output.stdout.is_empty() {
        let pid = std::str::from_utf8(&output.stdout)
            .expect("Invalid UTF-8 in process output")
            .trim()
            .to_string();
        Some(pid)
    } else {
        None
    }
}

/// Poll for a running `rsyslogd` with exponential backoff.
///
/// Starts with a 2 ms sleep and doubles it after every failed attempt. Gives
/// up with an error once at least 5 seconds have passed since the first check.
fn wait_for_rsyslog_pid() -> Result<String> {
    const MAX_WAIT: Duration = Duration::from_secs(5);
    const INITIAL_SLEEP: Duration = Duration::from_millis(2);

    let mut sleep_duration = INITIAL_SLEEP;
    let start = std::time::Instant::now();
    let mut attempts: u32 = 0;

    loop {
        attempts += 1;

        if let Some(pid) = get_rsyslog_pid() {
            return Ok(pid);
        }

        if start.elapsed() >= MAX_WAIT {
            break;
        }

        info!(
            "rsyslogd is not running, attempt {}. Sleeping for {} ms",
            attempts,
            sleep_duration.as_millis()
        );
        std::thread::sleep(sleep_duration);
        sleep_duration *= 2;
    }

    // The arguments must follow the order of the placeholders:
    // elapsed seconds first, then the number of attempts.
    Err(anyhow::anyhow!(
        "rsyslogd is not running after waiting for {} seconds and {} attempts",
        start.elapsed().as_secs(),
        attempts
    ))
}

// Restart rsyslogd to apply the new configuration.
// This is necessary, because there is no other way to reload the rsyslog configuration.
//
// Rsyslogd shouldn't lose any messages, because of the restart,
// because it tracks the last read position in the log files
// and will continue reading from that position.
// TODO: test it properly // fn restart_rsyslog() -> Result<()> { // kill it to restart let _ = Command::new("pkill") .arg("rsyslogd") .output() .context("Failed to restart rsyslogd")?; // ensure rsyslogd is running wait_for_rsyslog_pid()?; Ok(()) } fn parse_audit_syslog_address( remote_plain_endpoint: &str, remote_tls_endpoint: &str, ) -> Result<(String, u16, String)> { let tls; let remote_endpoint = if !remote_tls_endpoint.is_empty() { tls = "true".to_string(); remote_tls_endpoint } else { tls = "false".to_string(); remote_plain_endpoint }; // Urlify the remote_endpoint, so parsing can be done with url::Url. let url_str = format!("http://{remote_endpoint}"); let url = Url::parse(&url_str).map_err(|err| { anyhow!("Error parsing {remote_endpoint}, expected host:port, got {err:?}") })?; let is_valid = url.scheme() == "http" && url.path() == "/" && url.query().is_none() && url.fragment().is_none() && url.username() == "" && url.password().is_none(); if !is_valid { return Err(anyhow!( "Invalid address format {remote_endpoint}, expected host:port" )); } let host = match url.host() { Some(Host::Domain(h)) if hostname_validator::is_valid(h) => h.to_string(), Some(Host::Ipv4(ip4)) => ip4.to_string(), Some(Host::Ipv6(ip6)) => ip6.to_string(), _ => return Err(anyhow!("Invalid host")), }; let port = url .port() .ok_or_else(|| anyhow!("Invalid port in {remote_endpoint}"))?; Ok((host, port, tls)) } fn generate_audit_rsyslog_config( log_directory: String, endpoint_id: &str, project_id: &str, remote_syslog_host: &str, remote_syslog_port: u16, remote_syslog_tls: &str, ) -> String { format!( include_str!("config_template/compute_audit_rsyslog_template.conf"), log_directory = log_directory, endpoint_id = endpoint_id, project_id = project_id, remote_syslog_host = remote_syslog_host, remote_syslog_port = remote_syslog_port, remote_syslog_tls = remote_syslog_tls ) } pub fn configure_audit_rsyslog( log_directory: String, endpoint_id: &str, project_id: &str, remote_endpoint: &str, 
remote_tls_endpoint: &str, ) -> Result<()> { let (remote_syslog_host, remote_syslog_port, remote_syslog_tls) = parse_audit_syslog_address(remote_endpoint, remote_tls_endpoint).unwrap(); let config_content = generate_audit_rsyslog_config( log_directory, endpoint_id, project_id, &remote_syslog_host, remote_syslog_port, &remote_syslog_tls, ); info!("rsyslog config_content: {}", config_content); let rsyslog_conf_path = "/etc/rsyslog.d/compute_audit_rsyslog.conf"; let mut file = OpenOptions::new() .create(true) .write(true) .truncate(true) .open(rsyslog_conf_path)?; file.write_all(config_content.as_bytes())?; info!( "rsyslog configuration file {} added successfully. Starting rsyslogd", rsyslog_conf_path ); // start the service, using the configuration restart_rsyslog()?; Ok(()) } /// Configuration for enabling Postgres logs forwarding from rsyslogd pub struct PostgresLogsRsyslogConfig<'a> { pub host: Option<&'a str>, } impl<'a> PostgresLogsRsyslogConfig<'a> { pub fn new(host: Option<&'a str>) -> Self { Self { host } } pub fn build(&self) -> Result { match self.host { Some(host) => { if let Some((target, port)) = host.split_once(":") { Ok(format!( include_str!( "config_template/compute_rsyslog_postgres_export_template.conf" ), logs_export_target = target, logs_export_port = port, )) } else { Err(anyhow!("Invalid host format for Postgres logs export")) } } None => Ok("".to_string()), } } fn current_config() -> Result { let config_content = match std::fs::read_to_string(POSTGRES_LOGS_CONF_PATH) { Ok(c) => c, Err(err) if err.kind() == ErrorKind::NotFound => String::new(), Err(err) => return Err(err.into()), }; Ok(config_content) } } /// Writes rsyslogd configuration for Postgres logs export and restarts rsyslog. 
pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result<()> { let new_config = conf.build()?; let current_config = PostgresLogsRsyslogConfig::current_config()?; if new_config == current_config { info!("postgres logs rsyslog configuration is up-to-date"); return Ok(()); } // Nothing to configure if new_config.is_empty() { // When the configuration is removed, PostgreSQL will stop sending data // to the files watched by rsyslog, so restarting rsyslog is more effort // than just ignoring this change. return Ok(()); } info!( "configuring rsyslog for postgres logs export to: {:?}", conf.host ); let mut file = OpenOptions::new() .create(true) .write(true) .truncate(true) .open(POSTGRES_LOGS_CONF_PATH)?; file.write_all(new_config.as_bytes())?; info!( "rsyslog configuration file {} added successfully. Starting rsyslogd", POSTGRES_LOGS_CONF_PATH ); restart_rsyslog()?; Ok(()) } #[instrument(skip_all)] async fn pgaudit_gc_main_loop(log_directory: String) -> Result<()> { info!("running pgaudit GC main loop"); loop { // Check log_directory for old pgaudit logs and delete them. // New log files are checked every 5 minutes, as set in pgaudit.log_rotation_age // Find files that were not modified in the last 15 minutes and delete them. // This should be enough time for rsyslog to process the logs and for us to catch the alerts. // // In case of a very high load, we might need to adjust this value and pgaudit.log_rotation_age. // // TODO: add some smarter logic to delete the files that are fully streamed according to rsyslog // imfile-state files, but for now just do a simple GC to avoid filling up the disk. let _ = Command::new("find") .arg(&log_directory) .arg("-name") .arg("audit*.log") .arg("-mmin") .arg("+15") .arg("-delete") .output()?; // also collect the metric for the size of the log directory async fn get_log_files_size(path: &Path) -> Result { let mut total_size = 0; for entry in fs::read_dir(path)? 
{ let entry = entry?; let entry_path = entry.path(); if entry_path.is_file() && entry_path.to_string_lossy().ends_with("log") { total_size += entry.metadata()?.len(); } } Ok(total_size) } let log_directory_size = get_log_files_size(Path::new(&log_directory)) .await .unwrap_or_else(|e| { warn!("Failed to get log directory size: {}", e); 0 }); crate::metrics::AUDIT_LOG_DIR_SIZE.set(log_directory_size as f64); tokio::time::sleep(Duration::from_secs(60)).await; } } // launch pgaudit GC thread to clean up the old pgaudit logs stored in the log_directory pub fn launch_pgaudit_gc(log_directory: String) { tokio::spawn(async move { if let Err(e) = pgaudit_gc_main_loop(log_directory).await { error!("pgaudit GC main loop failed: {}", e); } }); } #[cfg(test)] mod tests { use crate::rsyslog::PostgresLogsRsyslogConfig; use super::{generate_audit_rsyslog_config, parse_audit_syslog_address}; #[test] fn test_postgres_logs_config() { { // Verify empty config let conf = PostgresLogsRsyslogConfig::new(None); let res = conf.build(); assert!(res.is_ok()); let conf_str = res.unwrap(); assert_eq!(&conf_str, ""); } { // Verify config let conf = PostgresLogsRsyslogConfig::new(Some("collector.cvc.local:514")); let res = conf.build(); assert!(res.is_ok()); let conf_str = res.unwrap(); assert!(conf_str.contains("omfwd")); assert!(conf_str.contains(r#"target="collector.cvc.local""#)); assert!(conf_str.contains(r#"port="514""#)); } { // Verify invalid config let conf = PostgresLogsRsyslogConfig::new(Some("invalid")); let res = conf.build(); assert!(res.is_err()); } } #[test] fn test_parse_audit_syslog_address() { { // host:port format (plaintext) let parsed = parse_audit_syslog_address("collector.host.tld:5555", ""); assert!(parsed.is_ok()); assert_eq!( parsed.unwrap(), ( String::from("collector.host.tld"), 5555, String::from("false") ) ); } { // host:port format with ipv4 ip address (plaintext) let parsed = parse_audit_syslog_address("10.0.0.1:5555", ""); assert!(parsed.is_ok()); assert_eq!( 
parsed.unwrap(), (String::from("10.0.0.1"), 5555, String::from("false")) ); } { // host:port format with ipv6 ip address (plaintext) let parsed = parse_audit_syslog_address("[7e60:82ed:cb2e:d617:f904:f395:aaca:e252]:5555", ""); assert_eq!( parsed.unwrap(), ( String::from("7e60:82ed:cb2e:d617:f904:f395:aaca:e252"), 5555, String::from("false") ) ); } { // Only TLS host:port defined let parsed = parse_audit_syslog_address("", "tls.host.tld:5556"); assert_eq!( parsed.unwrap(), (String::from("tls.host.tld"), 5556, String::from("true")) ); } { // tls host should take precedence, when both defined let parsed = parse_audit_syslog_address("plaintext.host.tld:5555", "tls.host.tld:5556"); assert_eq!( parsed.unwrap(), (String::from("tls.host.tld"), 5556, String::from("true")) ); } { // host without port (plaintext) let parsed = parse_audit_syslog_address("collector.host.tld", ""); assert!(parsed.is_err()); } { // port without host let parsed = parse_audit_syslog_address(":5555", ""); assert!(parsed.is_err()); } { // valid host with invalid port let parsed = parse_audit_syslog_address("collector.host.tld:90001", ""); assert!(parsed.is_err()); } { // invalid hostname with valid port let parsed = parse_audit_syslog_address("-collector.host.tld:5555", ""); assert!(parsed.is_err()); } { // parse error let parsed = parse_audit_syslog_address("collector.host.tld:::5555", ""); assert!(parsed.is_err()); } } #[test] fn test_generate_audit_rsyslog_config() { { // plaintext version let log_directory = "/tmp/log".to_string(); let endpoint_id = "ep-test-endpoint-id"; let project_id = "test-project-id"; let remote_syslog_host = "collector.host.tld"; let remote_syslog_port = 5555; let remote_syslog_tls = "false"; let conf_str = generate_audit_rsyslog_config( log_directory, endpoint_id, project_id, remote_syslog_host, remote_syslog_port, remote_syslog_tls, ); assert!(conf_str.contains(r#"set $.remote_syslog_tls = "false";"#)); assert!(conf_str.contains(r#"type="omfwd""#)); 
assert!(conf_str.contains(r#"target="collector.host.tld""#)); assert!(conf_str.contains(r#"port="5555""#)); assert!(conf_str.contains(r#"StreamDriverPermittedPeers="collector.host.tld""#)); } { // TLS version let log_directory = "/tmp/log".to_string(); let endpoint_id = "ep-test-endpoint-id"; let project_id = "test-project-id"; let remote_syslog_host = "collector.host.tld"; let remote_syslog_port = 5556; let remote_syslog_tls = "true"; let conf_str = generate_audit_rsyslog_config( log_directory, endpoint_id, project_id, remote_syslog_host, remote_syslog_port, remote_syslog_tls, ); assert!(conf_str.contains(r#"set $.remote_syslog_tls = "true";"#)); assert!(conf_str.contains(r#"type="omfwd""#)); assert!(conf_str.contains(r#"target="collector.host.tld""#)); assert!(conf_str.contains(r#"port="5556""#)); assert!(conf_str.contains(r#"StreamDriverPermittedPeers="collector.host.tld""#)); } } } ================================================ FILE: compute_tools/src/spec.rs ================================================ use std::fs::File; use std::fs::{self, Permissions}; use std::os::unix::fs::PermissionsExt; use std::path::Path; use anyhow::{Result, anyhow, bail}; use compute_api::responses::{ ComputeConfig, ControlPlaneComputeStatus, ControlPlaneConfigResponse, }; use reqwest::StatusCode; use tokio_postgres::Client; use tracing::{error, info, instrument}; use crate::compute::ComputeNodeParams; use crate::config; use crate::metrics::{CPLANE_REQUESTS_TOTAL, CPlaneRequestRPC, UNKNOWN_HTTP_STATUS}; use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; // Do control plane request and return response if any. In case of error it // returns a bool flag indicating whether it makes sense to retry the request // and a string with error message. 
fn do_control_plane_request( uri: &str, jwt: &str, ) -> Result { let resp = reqwest::blocking::Client::new() .get(uri) .header("Authorization", format!("Bearer {jwt}")) .send() .map_err(|e| { ( true, format!("could not perform request to control plane: {e:?}"), UNKNOWN_HTTP_STATUS.to_string(), ) })?; let status = resp.status(); match status { StatusCode::OK => match resp.json::() { Ok(spec_resp) => Ok(spec_resp), Err(e) => Err(( true, format!("could not deserialize control plane response: {e:?}"), status.to_string(), )), }, StatusCode::SERVICE_UNAVAILABLE => Err(( true, "control plane is temporarily unavailable".to_string(), status.to_string(), )), StatusCode::BAD_GATEWAY => { // We have a problem with intermittent 502 errors now // https://github.com/neondatabase/cloud/issues/2353 // It's fine to retry GET request in this case. Err(( true, "control plane request failed with 502".to_string(), status.to_string(), )) } // Another code, likely 500 or 404, means that compute is unknown to the control plane // or some internal failure happened. Doesn't make much sense to retry in this case. _ => Err(( false, format!("unexpected control plane response status code: {status}"), status.to_string(), )), } } /// Request config from the control-plane by compute_id. If /// `NEON_CONTROL_PLANE_TOKEN` env variable is set, it will be used for /// authorization. 
pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result { let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec"); let jwt: String = std::env::var("NEON_CONTROL_PLANE_TOKEN").unwrap_or_default(); let mut attempt = 1; info!("getting config from control plane: {}", cp_uri); // Do 3 attempts to get spec from the control plane using the following logic: // - network error -> then retry // - compute id is unknown or any other error -> bail out // - no spec for compute yet (Empty state) -> return Ok(None) // - got config -> return Ok(Some(config)) while attempt < 4 { let result = match do_control_plane_request(&cp_uri, &jwt) { Ok(config_resp) => { CPLANE_REQUESTS_TOTAL .with_label_values(&[ CPlaneRequestRPC::GetConfig.as_str(), &StatusCode::OK.to_string(), ]) .inc(); match config_resp.status { ControlPlaneComputeStatus::Empty => Ok(config_resp.into()), ControlPlaneComputeStatus::Attached => { if config_resp.spec.is_some() { Ok(config_resp.into()) } else { bail!("compute is attached, but spec is empty") } } } } Err((retry, msg, status)) => { CPLANE_REQUESTS_TOTAL .with_label_values(&[CPlaneRequestRPC::GetConfig.as_str(), &status]) .inc(); if retry { Err(anyhow!(msg)) } else { bail!(msg); } } }; if let Err(e) = &result { error!("attempt {} to get config failed with: {}", attempt, e); } else { return result; } attempt += 1; std::thread::sleep(std::time::Duration::from_millis(100)); } // All attempts failed, return error. Err(anyhow::anyhow!( "Exhausted all attempts to retrieve the config from the control plane" )) } /// Check `pg_hba.conf` and update if needed to allow external connections. 
pub fn update_pg_hba(pgdata_path: &Path, databricks_pg_hba: Option<&String>) -> Result<()> { // XXX: consider making it a part of config.json let pghba_path = pgdata_path.join("pg_hba.conf"); // Update pg_hba to contains databricks specfic settings before adding neon settings // PG uses the first record that matches to perform authentication, so we need to have // our rules before the default ones from neon. // See https://www.postgresql.org/docs/current/auth-pg-hba-conf.html if let Some(databricks_pg_hba) = databricks_pg_hba { if config::line_in_file( &pghba_path, &format!("include_if_exists {}\n", *databricks_pg_hba), )? { info!("updated pg_hba.conf to include databricks_pg_hba.conf"); } else { info!("pg_hba.conf already included databricks_pg_hba.conf"); } } if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? { info!("updated pg_hba.conf to allow external connections"); } else { info!("pg_hba.conf is up-to-date"); } Ok(()) } /// Check `pg_ident.conf` and update if needed to allow databricks config. pub fn update_pg_ident(pgdata_path: &Path, databricks_pg_ident: Option<&String>) -> Result<()> { info!("checking pg_ident.conf"); let pghba_path = pgdata_path.join("pg_ident.conf"); // Update pg_ident to contains databricks specfic settings if let Some(databricks_pg_ident) = databricks_pg_ident { if config::line_in_file( &pghba_path, &format!("include_if_exists {}\n", *databricks_pg_ident), )? { info!("updated pg_ident.conf to include databricks_pg_ident.conf"); } else { info!("pg_ident.conf already included databricks_pg_ident.conf"); } } Ok(()) } /// Copy tls key_file and cert_file from k8s secret mount directory /// to pgdata and set private key file permissions as expected by Postgres. /// See this doc for expected permission /// K8s secrets mount on dblet does not honor permission and ownership /// specified in the Volume or VolumeMount. So we need to explicitly copy the file and set the permissions. 
pub fn copy_tls_certificates( key_file: &String, cert_file: &String, pgdata_path: &Path, ) -> Result<()> { let files = [cert_file, key_file]; for file in files.iter() { let source = Path::new(file); let dest = pgdata_path.join(source.file_name().unwrap()); if !dest.exists() { std::fs::copy(source, &dest)?; info!( "Copying tls file: {} to {}", &source.display(), &dest.display() ); } if *file == key_file { // Postgres requires private key to be readable only by the owner by having // chmod 600 permissions. let permissions = Permissions::from_mode(0o600); fs::set_permissions(&dest, permissions)?; info!("Setting permission on {}.", &dest.display()); } } Ok(()) } /// Create a standby.signal file pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { // XXX: consider making it a part of config.json let signalfile = pgdata_path.join("standby.signal"); if !signalfile.exists() { File::create(signalfile)?; info!("created standby.signal"); } else { info!("reused pre-existing standby.signal"); } Ok(()) } #[instrument(skip_all)] pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { let query = "ALTER EXTENSION neon UPDATE"; info!("update neon extension version with query: {}", query); client.simple_query(query).await?; Ok(()) } #[instrument(skip_all)] pub async fn handle_migrations( params: ComputeNodeParams, client: &mut Client, lakebase_mode: bool, ) -> Result<()> { info!("handle migrations"); // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! // !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN! // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! // Add new migrations in numerical order. 
let migrations = [ &format!( include_str!("./migrations/0001-add_bypass_rls_to_privileged_role.sql"), privileged_role_name = params.privileged_role_name ), &format!( include_str!("./migrations/0002-alter_roles.sql"), privileged_role_name = params.privileged_role_name ), &format!( include_str!("./migrations/0003-grant_pg_create_subscription_to_privileged_role.sql"), privileged_role_name = params.privileged_role_name ), &format!( include_str!("./migrations/0004-grant_pg_monitor_to_privileged_role.sql"), privileged_role_name = params.privileged_role_name ), &format!( include_str!("./migrations/0005-grant_all_on_tables_to_privileged_role.sql"), privileged_role_name = params.privileged_role_name ), &format!( include_str!("./migrations/0006-grant_all_on_sequences_to_privileged_role.sql"), privileged_role_name = params.privileged_role_name ), &format!( include_str!( "./migrations/0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql" ), privileged_role_name = params.privileged_role_name ), &format!( include_str!( "./migrations/0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql" ), privileged_role_name = params.privileged_role_name ), include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"), &format!( include_str!( "./migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql" ), privileged_role_name = params.privileged_role_name ), &format!( include_str!( "./migrations/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql" ), privileged_role_name = params.privileged_role_name ), &format!( include_str!("./migrations/0012-grant_pg_signal_backend_to_privileged_role.sql"), privileged_role_name = params.privileged_role_name ), ]; MigrationRunner::new(client, &migrations, lakebase_mode) .run_migrations() .await?; Ok(()) } ================================================ FILE: compute_tools/src/spec_apply.rs ================================================ use std::collections::{HashMap, 
HashSet};
use std::fmt::{Debug, Formatter};
use std::future::Future;
use std::iter::{empty, once};
use std::sync::Arc;

use anyhow::{Context, Result};
use compute_api::responses::ComputeStatus;
use compute_api::spec::{ComputeAudit, ComputeSpec, Database, PgIdent, Role};
use futures::future::join_all;
use tokio::sync::RwLock;
use tokio_postgres::Client;
use tokio_postgres::error::SqlState;
use tracing::{Instrument, debug, error, info, info_span, instrument, warn};

use crate::compute::{ComputeNode, ComputeNodeParams, ComputeState, create_databricks_roles};
use crate::hadron_metrics::COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS;
use crate::pg_helpers::{
    DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async,
    get_existing_roles_async,
};
use crate::spec_apply::ApplySpecPhase::{
    AddDatabricksGrants, AlterDatabricksRoles, CreateAndAlterDatabases, CreateAndAlterRoles,
    CreateAvailabilityCheck, CreateDatabricksMisc, CreateDatabricksRoles, CreatePgauditExtension,
    CreatePgauditlogtofileExtension, CreatePrivilegedRole, CreateSchemaNeon,
    DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
    HandleDatabricksAuthExtension, HandleNeonExtension, HandleOtherExtensions,
    RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase,
};
use crate::spec_apply::PerDatabasePhase::{
    ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions,
};

impl ComputeNode {
    /// Apply the spec to the running PostgreSQL instance.
    /// The caller can decide to run with multiple clients in parallel, or
    /// single mode. Either way, the commands executed will be the same, and
    /// only commands run in different databases are parallelized.
    ///
    /// The work runs in four stages, in this order:
    /// 1. a pre-drop `DropLogicalSubscriptions` pass over every database named
    ///    in a `delete_db` delta operation,
    /// 2. the cluster-wide role/database phases on the maintenance connection,
    /// 3. the per-database phases, one spawned task per database in the spec
    ///    plus the system database, limited by a `concurrency`-sized semaphore,
    /// 4. the extension / cleanup / audit phases on the maintenance connection.
    ///
    /// This is a synchronous function: it blocks on the current Tokio runtime
    /// handle until all stages finished or the first error surfaced.
    #[instrument(skip_all)]
    pub fn apply_spec_sql(
        &self,
        spec: Arc<ComputeSpec>,
        conf: Arc<tokio_postgres::Config>,
        concurrency: usize,
    ) -> Result<()> {
        info!("Applying config with max {} concurrency", concurrency);
        debug!("Config: {:?}", spec);

        let rt = tokio::runtime::Handle::current();
        rt.block_on(async {
            // Proceed with post-startup configuration. Note, that order of operations is important.
            let client = Self::get_maintenance_client(&conf).await?;
            let spec = spec.clone();
            let params = Arc::new(self.params.clone());

            let databases = get_existing_dbs_async(&client).await?;
            let roles = get_existing_roles_async(&client)
                .await?
                .into_iter()
                .map(|role| (role.name.clone(), role))
                .collect::<HashMap<String, Role>>();

            // Check if we need to drop subscriptions before starting the endpoint.
            //
            // It is important to do this operation exactly once when endpoint starts on a new branch.
            // Otherwise, we may drop not inherited, but newly created subscriptions.
            //
            // We cannot rely only on spec.drop_subscriptions_before_start flag,
            // because if for some reason compute restarts inside VM,
            // it will start again with the same spec and flag value.
            //
            // To handle this, we save the fact of the operation in the database
            // in the neon.drop_subscriptions_done table.
            // If the table does not exist, we assume that the operation was never performed, so we must do it.
            // If table exists, we check if the operation was performed on the current timeline.
            //
            let mut drop_subscriptions_done = false;

            if spec.drop_subscriptions_before_start {
                let timeline_id = self.get_timeline_id().context("timeline_id must be set")?;

                info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id);

                drop_subscriptions_done = match client.query("select 1 from neon.drop_subscriptions_done where timeline_id OPERATOR(pg_catalog.=) $1", &[&timeline_id.to_string()]).await {
                    Ok(result) => !result.is_empty(),
                    Err(e) => {
                        match e.code() {
                            // A missing table means the operation never ran.
                            Some(&SqlState::UNDEFINED_TABLE) => false,
                            _ => {
                                // We don't expect any other error here, except for the schema/table not existing
                                error!("Error checking if drop subscription operation was already performed: {}", e);
                                return Err(e.into());
                            }
                        }
                    }
                }
            };

            // Role names referenced by any JWKS setting of the local proxy config.
            let jwks_roles = Arc::new(
                spec.as_ref()
                    .local_proxy_config
                    .iter()
                    .flat_map(|it| &it.jwks)
                    .flatten()
                    .flat_map(|setting| &setting.role_names)
                    .cloned()
                    .collect::<HashSet<_>>(),
            );

            let ctx = Arc::new(tokio::sync::RwLock::new(MutableApplyContext {
                roles,
                dbs: databases,
            }));

            // Apply special pre drop database phase.
            // NOTE: we use the code of RunInEachDatabase phase for parallelism
            // and connection management, but we don't really run it in *each* database,
            // only in databases, we're about to drop.
            info!("Applying PerDatabase (pre-dropdb) phase");
            let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));

            // Run the phase for each database that we're about to drop.
            let db_processes = spec
                .delta_operations
                .iter()
                .flatten()
                .filter_map(move |op| {
                    if op.action.as_str() == "delete_db" {
                        Some(op.name.clone())
                    } else {
                        None
                    }
                })
                .map(|dbname| {
                    let spec = spec.clone();
                    let ctx = ctx.clone();
                    let jwks_roles = jwks_roles.clone();
                    let mut conf = conf.as_ref().clone();
                    let concurrency_token = concurrency_token.clone();
                    // We only need dbname field for this phase, so set other fields to dummy values
                    let db = DB::UserDB(Database {
                        name: dbname.clone(),
                        owner: "cloud_admin".to_string(),
                        options: None,
                        restrict_conn: false,
                        invalid: false,
                    });

                    debug!("Applying per-database phases for Database {:?}", &db);

                    match &db {
                        DB::SystemDB => {}
                        DB::UserDB(db) => {
                            conf.dbname(db.name.as_str());
                        }
                    }

                    let conf = Arc::new(conf);
                    let fut = Self::apply_spec_sql_db(
                        params.clone(),
                        spec.clone(),
                        conf,
                        ctx.clone(),
                        jwks_roles.clone(),
                        concurrency_token.clone(),
                        db,
                        [DropLogicalSubscriptions].to_vec(),
                        self.params.lakebase_mode,
                    );

                    Ok(tokio::spawn(fut))
                })
                .collect::<Vec<Result<_, anyhow::Error>>>();

            for process in db_processes.into_iter() {
                let handle = process?;
                if let Err(e) = handle.await? {
                    // Handle the error case where the database does not exist
                    // We do not check whether the DB exists or not in the deletion phase,
                    // so we shouldn't be strict about it in pre-deletion cleanup as well.
                    if e.to_string().contains("does not exist") {
                        warn!("Error dropping subscription: {}", e);
                    } else {
                        return Err(e);
                    }
                };
            }

            let phases = if self.params.lakebase_mode {
                vec![
                    CreatePrivilegedRole,
                    // BEGIN_HADRON
                    CreateDatabricksRoles,
                    AlterDatabricksRoles,
                    // END_HADRON
                    DropInvalidDatabases,
                    RenameRoles,
                    CreateAndAlterRoles,
                    RenameAndDeleteDatabases,
                    CreateAndAlterDatabases,
                    CreateSchemaNeon,
                ]
            } else {
                vec![
                    CreatePrivilegedRole,
                    DropInvalidDatabases,
                    RenameRoles,
                    CreateAndAlterRoles,
                    RenameAndDeleteDatabases,
                    CreateAndAlterDatabases,
                    CreateSchemaNeon,
                ]
            };

            for phase in phases {
                info!("Applying phase {:?}", &phase);
                apply_operations(
                    params.clone(),
                    spec.clone(),
                    ctx.clone(),
                    jwks_roles.clone(),
                    phase,
                    || async { Ok(&client) },
                    self.params.lakebase_mode,
                )
                .await?;
            }

            info!("Applying RunInEachDatabase2 phase");
            let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));

            let db_processes = spec
                .cluster
                .databases
                .iter()
                .map(|db| DB::new(db.clone()))
                // include the system database as well
                .chain(once(DB::SystemDB))
                .map(|db| {
                    let spec = spec.clone();
                    let ctx = ctx.clone();
                    let jwks_roles = jwks_roles.clone();
                    let mut conf = conf.as_ref().clone();
                    let concurrency_token = concurrency_token.clone();
                    let db = db.clone();

                    debug!("Applying per-database phases for Database {:?}", &db);

                    match &db {
                        DB::SystemDB => {}
                        DB::UserDB(db) => {
                            conf.dbname(db.name.as_str());
                        }
                    }

                    let conf = Arc::new(conf);
                    let mut phases = vec![
                        DeleteDBRoleReferences,
                        ChangeSchemaPerms,
                    ];

                    if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
                        info!("Adding DropLogicalSubscriptions phase because drop_subscriptions_before_start is set");
                        phases.push(DropLogicalSubscriptions);
                    }

                    let fut = Self::apply_spec_sql_db(
                        params.clone(),
                        spec.clone(),
                        conf,
                        ctx.clone(),
                        jwks_roles.clone(),
                        concurrency_token.clone(),
                        db,
                        phases,
                        self.params.lakebase_mode,
                    );

                    Ok(tokio::spawn(fut))
                })
                .collect::<Vec<Result<_, anyhow::Error>>>();

            for process in db_processes.into_iter() {
                let handle = process?;
                // First `?` is the task join result, second the phase result.
                handle.await??;
            }

            let mut phases = if self.params.lakebase_mode {
                vec![
                    HandleOtherExtensions,
                    HandleNeonExtension, // This step depends on CreateSchemaNeon
                    // BEGIN_HADRON
                    HandleDatabricksAuthExtension,
                    // END_HADRON
                    CreateAvailabilityCheck,
                    DropRoles,
                    // BEGIN_HADRON
                    AddDatabricksGrants,
                    CreateDatabricksMisc,
                    // END_HADRON
                ]
            } else {
                vec![
                    HandleOtherExtensions,
                    HandleNeonExtension, // This step depends on CreateSchemaNeon
                    CreateAvailabilityCheck,
                    DropRoles,
                ]
            };

            // This step depends on CreateSchemaNeon
            if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
                info!("Adding FinalizeDropLogicalSubscriptions phase because drop_subscriptions_before_start is set");
                phases.push(FinalizeDropLogicalSubscriptions);
            }

            // Keep DisablePostgresDBPgAudit phase at the end,
            // so that all config operations are audit logged.
            match spec.audit_log_level {
                ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
                    phases.push(CreatePgauditExtension);
                    phases.push(CreatePgauditlogtofileExtension);
                    phases.push(DisablePostgresDBPgAudit);
                }
                ComputeAudit::Log | ComputeAudit::Base => {
                    phases.push(CreatePgauditExtension);
                    phases.push(DisablePostgresDBPgAudit);
                }
                ComputeAudit::Disabled => {}
            }

            for phase in phases {
                debug!("Applying phase {:?}", &phase);
                apply_operations(
                    params.clone(),
                    spec.clone(),
                    ctx.clone(),
                    jwks_roles.clone(),
                    phase,
                    || async { Ok(&client) },
                    self.params.lakebase_mode,
                )
                .await?;
            }

            Ok::<(), anyhow::Error>(())
        })?;

        Ok(())
    }

    /// Apply SQL migrations of the RunInEachDatabase phase.
    ///
    /// May opt to not connect to databases that don't have any scheduled
    /// operations. The function is concurrency-controlled with the provided
    /// semaphore. The caller has to make sure the semaphore isn't exhausted.
#[allow(clippy::too_many_arguments)] // TODO: needs bigger refactoring
    async fn apply_spec_sql_db(
        params: Arc<ComputeNodeParams>,
        spec: Arc<ComputeSpec>,
        conf: Arc<tokio_postgres::Config>,
        ctx: Arc<RwLock<MutableApplyContext>>,
        jwks_roles: Arc<HashSet<String>>,
        concurrency_token: Arc<tokio::sync::Semaphore>,
        db: DB,
        subphases: Vec<PerDatabasePhase>,
        lakebase_mode: bool,
    ) -> Result<()> {
        // Hold one permit for the whole lifetime of this database's work.
        let _permit = concurrency_token.acquire().await?;

        // Connection to this database, opened lazily and then shared by all subphases.
        let mut client_conn = None;

        for subphase in subphases {
            apply_operations(
                params.clone(),
                spec.clone(),
                ctx.clone(),
                jwks_roles.clone(),
                RunInEachDatabase {
                    db: db.clone(),
                    subphase,
                },
                // Only connect if apply_operation actually wants a connection.
                // It's quite possible this database doesn't need any queries,
                // so by not connecting we save time and effort connecting to
                // that database.
                || async {
                    if client_conn.is_none() {
                        let db_client = Self::get_maintenance_client(&conf).await?;
                        client_conn.replace(db_client);
                    }
                    let client = client_conn.as_ref().unwrap();
                    Ok(client)
                },
                lakebase_mode,
            )
            .await?;
        }

        drop(client_conn);

        Ok::<(), anyhow::Error>(())
    }

    /// Choose how many concurrent connections to use for applying the spec changes.
    ///
    /// * `Init` status: derived from `max_connections`, taken from the cluster
    ///   settings or, failing that, from the first parsable
    ///   `max_connections = N` line of `postgresql_conf`; 10 when neither is found.
    /// * any other status: `superuser_reserved_connections - 1` (at least 1),
    ///   or 3 when the setting is absent or not a number.
    ///
    /// The result is always at least 1.
    pub fn max_service_connections(
        &self,
        compute_state: &ComputeState,
        spec: &ComputeSpec,
    ) -> usize {
        // If the cluster is in Init state we don't have to deal with user connections,
        // and can thus use all `max_connections` connection slots. However, that's generally not
        // very efficient, so we generally still limit it to a smaller number.
        if compute_state.status == ComputeStatus::Init {
            // If the settings contain 'max_connections', use that as template
            if let Some(config) = spec.cluster.settings.find("max_connections") {
                config.parse::<usize>().ok()
            } else {
                // Otherwise, try to find the setting in the postgresql_conf string
                spec.cluster
                    .postgresql_conf
                    .iter()
                    .flat_map(|conf| conf.split("\n"))
                    .filter_map(|line| {
                        if !line.contains("max_connections") {
                            return None;
                        }

                        let (key, value) = line.split_once("=")?;
                        let key = key
                            .trim_start_matches(char::is_whitespace)
                            .trim_end_matches(char::is_whitespace);

                        let value = value
                            .trim_start_matches(char::is_whitespace)
                            .trim_end_matches(char::is_whitespace);

                        // Reject lines that merely mention the name, e.g. a
                        // different key containing it as a substring.
                        if key != "max_connections" {
                            return None;
                        }

                        value.parse::<usize>().ok()
                    })
                    .next()
            }
            // If max_connections is present, use at most 1/3rd of that.
            // When max_connections is lower than 30, try to use at least 10 connections, but
            // never more than max_connections.
            .map(|limit| match limit {
                // A limit of zero is not a usable concurrency: presumably the
                // result sizes the semaphore used while applying the spec
                // (TODO confirm against the caller), and a zero-permit
                // semaphore would never hand out a permit. Always allow one.
                0 => 1,
                1..10 => limit,
                10..30 => 10,
                30..300 => limit / 3,
                300.. => 100,
            })
            // If we didn't find max_connections, default to 10 concurrent connections.
            .unwrap_or(10)
        } else {
            // state == Running
            // Because the cluster is already in the Running state, we should assume users are
            // already connected to the cluster, and high concurrency could negatively
            // impact user connectivity. Therefore, we can limit concurrency to the number of
            // reserved superuser connections, which users wouldn't be able to use anyway.
            spec.cluster
                .settings
                .find("superuser_reserved_connections")
                .iter()
                .filter_map(|val| val.parse::<usize>().ok())
                .map(|val| if val > 1 { val - 1 } else { 1 })
                .next_back()
                .unwrap_or(3)
        }
    }
}

/// Target database of a per-database phase.
#[derive(Clone)]
pub enum DB {
    /// The system database; handled on the connection's default database.
    SystemDB,
    /// A database from the spec.
    UserDB(Database),
}

impl DB {
    /// Wrap a spec database as a `UserDB`.
    pub fn new(db: Database) -> DB {
        Self::UserDB(db)
    }

    /// Whether `role` is the owner of this database; always `false` for `SystemDB`.
    pub fn is_owned_by(&self, role: &PgIdent) -> bool {
        match self {
            DB::SystemDB => false,
            DB::UserDB(db) => &db.owner == role,
        }
    }
}

impl Debug for DB {
    /// Prints only the variant and, for user databases, the database name.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self {
            DB::SystemDB => f.debug_tuple("SystemDB").finish(),
            DB::UserDB(db) => f.debug_tuple("UserDB").field(&db.name).finish(),
        }
    }
}

/// Sub-steps of `ApplySpecPhase::RunInEachDatabase`.
#[derive(Copy, Clone, Debug)]
pub enum PerDatabasePhase {
    DeleteDBRoleReferences,
    ChangeSchemaPerms,
    /// This is a shared phase, used for both i) dropping dangling LR subscriptions
    /// before dropping the DB, and ii) dropping all subscriptions after creating
    /// a fresh branch.
    /// N.B. we will skip all DBs that are not present in Postgres, invalid, or
    /// have `datallowconn = false` (`restrict_conn`).
    DropLogicalSubscriptions,
}

/// Phases of applying a spec. The order in which they run is decided by
/// `ComputeNode::apply_spec_sql`, not by the declaration order here.
#[derive(Clone, Debug)]
pub enum ApplySpecPhase {
    CreatePrivilegedRole,
    // BEGIN_HADRON
    CreateDatabricksRoles,
    AlterDatabricksRoles,
    // END_HADRON
    DropInvalidDatabases,
    RenameRoles,
    CreateAndAlterRoles,
    RenameAndDeleteDatabases,
    CreateAndAlterDatabases,
    CreateSchemaNeon,
    RunInEachDatabase { db: DB, subphase: PerDatabasePhase },
    CreatePgauditExtension,
    CreatePgauditlogtofileExtension,
    DisablePostgresDBPgAudit,
    HandleOtherExtensions,
    HandleNeonExtension,
    // BEGIN_HADRON
    HandleDatabricksAuthExtension,
    // END_HADRON
    CreateAvailabilityCheck,
    // BEGIN_HADRON
    AddDatabricksGrants,
    CreateDatabricksMisc,
    // END_HADRON
    DropRoles,
    FinalizeDropLogicalSubscriptions,
}

/// One SQL statement to run, with an optional description used for tracing.
pub struct Operation {
    pub query: String,
    pub comment: Option<String>,
}

/// Roles and databases known so far, updated by the phases as they plan
/// renames, creations and drops.
pub struct MutableApplyContext {
    pub roles: HashMap<String, Role>,
    pub dbs: HashMap<String, Database>,
}

/// Apply the operations that belong to the given spec apply phase.
/// /// Commands within a single phase are executed in order of Iterator yield. /// Commands of ApplySpecPhase::RunInEachDatabase will execute in the database /// indicated by its `db` field, and can share a single client for all changes /// to that database. /// /// Notes: /// - Commands are pipelined, and thus may cause incomplete apply if one /// command of many fails. /// - Failing commands will fail the phase's apply step once the return value /// is processed. /// - No timeouts have (yet) been implemented. /// - The caller is responsible for limiting and/or applying concurrency. pub async fn apply_operations<'a, Fut, F>( params: Arc, spec: Arc, ctx: Arc>, jwks_roles: Arc>, apply_spec_phase: ApplySpecPhase, client: F, lakebase_mode: bool, ) -> Result<()> where F: FnOnce() -> Fut, Fut: Future>, { debug!("Starting phase {:?}", &apply_spec_phase); let span = info_span!("db_apply_changes", phase=?apply_spec_phase); let span2 = span.clone(); async move { debug!("Processing phase {:?}", &apply_spec_phase); let ctx = ctx; let mut ops = get_operations(¶ms, &spec, &ctx, &jwks_roles, &apply_spec_phase) .await? .peekable(); // Return (and by doing so, skip requesting the PostgreSQL client) if // we don't have any operations scheduled. if ops.peek().is_none() { return Ok(()); } let client = client().await?; debug!("Applying phase {:?}", &apply_spec_phase); let active_queries = ops .map(|op| { let Operation { comment, query } = op; let inspan = match comment { None => span.clone(), Some(comment) => info_span!("phase {}: {}", comment), }; async { let query = query; let res = client.simple_query(&query).await; debug!( "{} {}", if res.is_ok() { "successfully executed" } else { "failed to execute" }, query ); if !lakebase_mode { return res; } // BEGIN HADRON if let Err(e) = res.as_ref() { if let Some(sql_state) = e.code() { if sql_state.code() == "57014" { // SQL State 57014 (ERRCODE_QUERY_CANCELED) is used for statement timeouts. 
// Increment the counter whenever a statement timeout occurs. Timeouts on // this configuration path can only occur due to PS connectivity problems that // Postgres failed to recover from. COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS.inc(); } } } // END HADRON res } .instrument(inspan) }) .collect::>(); drop(ctx); for it in join_all(active_queries).await { drop(it?); } debug!("Completed phase {:?}", &apply_spec_phase); Ok(()) } .instrument(span2) .await } /// Create a stream of operations to be executed for that phase of applying /// changes. /// /// In the future we may generate a single stream of changes and then /// sort/merge/batch execution, but for now this is a nice way to improve /// batching behavior of the commands. async fn get_operations<'a>( params: &'a ComputeNodeParams, spec: &'a ComputeSpec, ctx: &'a RwLock, jwks_roles: &'a HashSet, apply_spec_phase: &'a ApplySpecPhase, ) -> Result + 'a + Send>> { match apply_spec_phase { ApplySpecPhase::CreatePrivilegedRole => Ok(Box::new(once(Operation { query: format!( include_str!("sql/create_privileged_role.sql"), privileged_role_name = params.privileged_role_name, privileges = if params.lakebase_mode { "CREATEDB CREATEROLE NOLOGIN BYPASSRLS" } else { "CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS" } ), comment: None, }))), // BEGIN_HADRON // New Hadron phase ApplySpecPhase::CreateDatabricksRoles => { let queries = create_databricks_roles(); let operations = queries.into_iter().map(|query| Operation { query, comment: None, }); Ok(Box::new(operations)) } // Backfill existing databricks_reader_* roles with statement timeout from GUC ApplySpecPhase::AlterDatabricksRoles => { let query = String::from(include_str!( "sql/alter_databricks_reader_roles_timeout.sql" )); let operations = once(Operation { query, comment: Some( "Backfill existing databricks_reader_* roles with statement timeout" .to_string(), ), }); Ok(Box::new(operations)) } // End of new Hadron Phase // END_HADRON ApplySpecPhase::DropInvalidDatabases 
=> { let mut ctx = ctx.write().await; let databases = &mut ctx.dbs; let keys: Vec<_> = databases .iter() .filter(|(_, db)| db.invalid) .map(|(dbname, _)| dbname.clone()) .collect(); // After recent commit in Postgres, interrupted DROP DATABASE // leaves the database in the invalid state. According to the // commit message, the only option for user is to drop it again. // See: // https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9 // // Postgres Neon extension is done the way, that db is de-registered // in the control plane metadata only after it is dropped. So there is // a chance that it still thinks that the db should exist. This means // that it will be re-created by the `CreateDatabases` phase. This // is fine, as user can just drop the table again (in vanilla // Postgres they would need to do the same). let operations = keys .into_iter() .filter_map(move |dbname| ctx.dbs.remove(&dbname)) .map(|db| Operation { query: format!("DROP DATABASE IF EXISTS {}", db.name.pg_quote()), comment: Some(format!("Dropping invalid database {}", db.name)), }); Ok(Box::new(operations)) } ApplySpecPhase::RenameRoles => { let mut ctx = ctx.write().await; let operations = spec .delta_operations .iter() .flatten() .filter(|op| op.action == "rename_role") .filter_map(move |op| { let roles = &mut ctx.roles; if roles.contains_key(op.name.as_str()) { None } else { let new_name = op.new_name.as_ref().unwrap(); let mut role = roles.remove(op.name.as_str()).unwrap(); role.name = new_name.clone(); role.encrypted_password = None; roles.insert(role.name.clone(), role); Some(Operation { query: format!( "ALTER ROLE {} RENAME TO {}", op.name.pg_quote(), new_name.pg_quote() ), comment: Some(format!("renaming role '{}' to '{}'", op.name, new_name)), }) } }); Ok(Box::new(operations)) } ApplySpecPhase::CreateAndAlterRoles => { let mut ctx = ctx.write().await; let operations = spec.cluster.roles .iter() .filter_map(move |role| { let roles = &mut ctx.roles; let db_role 
= roles.get(&role.name); match db_role { Some(db_role) => { if db_role.encrypted_password != role.encrypted_password { // This can be run on /every/ role! Not just ones created through the console. // This means that if you add some funny ALTER here that adds a permission, // this will get run even on user-created roles! This will result in different // behavior before and after a spec gets reapplied. The below ALTER as it stands // now only grants LOGIN and changes the password. Please do not allow this branch // to do anything silly. Some(Operation { query: format!( "ALTER ROLE {} {}", role.name.pg_quote(), role.to_pg_options(), ), comment: None, }) } else { None } } None => { let query = if !jwks_roles.contains(role.name.as_str()) { format!( "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE {} {}", role.name.pg_quote(), params.privileged_role_name, role.to_pg_options(), ) } else { format!( "CREATE ROLE {} {}", role.name.pg_quote(), role.to_pg_options(), ) }; Some(Operation { query, comment: Some(format!("creating role {}", role.name)), }) } } }); Ok(Box::new(operations)) } ApplySpecPhase::RenameAndDeleteDatabases => { let mut ctx = ctx.write().await; let operations = spec .delta_operations .iter() .flatten() .filter_map(move |op| { let databases = &mut ctx.dbs; match op.action.as_str() { // We do not check whether the DB exists or not, // Postgres will take care of it for us "delete_db" => { let (db_name, outer_tag) = op.name.pg_quote_dollar(); // In Postgres we can't drop a database if it is a template. // So we need to unset the template flag first, but it could // be a retry, so we could've already dropped the database. // Check that database exists first to make it idempotent. let unset_template_query: String = format!( include_str!("sql/unset_template_for_drop_dbs.sql"), datname = db_name, outer_tag = outer_tag, ); // Use FORCE to drop database even if there are active connections. 
// We run this from `cloud_admin`, so it should have enough privileges. // // NB: there could be other db states, which prevent us from dropping // the database. For example, if db is used by any active subscription // or replication slot. // Such cases are handled in the DropLogicalSubscriptions // phase. We do all the cleanup before actually dropping the database. let drop_db_query: String = format!( "DROP DATABASE IF EXISTS {} WITH (FORCE)", &op.name.pg_quote() ); databases.remove(&op.name); Some(vec![ Operation { query: unset_template_query, comment: Some(format!( "optionally clearing template flags for DB {}", op.name, )), }, Operation { query: drop_db_query, comment: Some(format!("deleting database {}", op.name,)), }, ]) } "rename_db" => { if let Some(mut db) = databases.remove(&op.name) { // update state of known databases let new_name = op.new_name.as_ref().unwrap(); db.name = new_name.clone(); databases.insert(db.name.clone(), db); Some(vec![Operation { query: format!( "ALTER DATABASE {} RENAME TO {}", op.name.pg_quote(), new_name.pg_quote(), ), comment: Some(format!( "renaming database '{}' to '{}'", op.name, new_name )), }]) } else { None } } _ => None, } }) .flatten(); Ok(Box::new(operations)) } ApplySpecPhase::CreateAndAlterDatabases => { let mut ctx = ctx.write().await; let operations = spec .cluster .databases .iter() .filter_map(move |db| { let databases = &mut ctx.dbs; if let Some(edb) = databases.get_mut(&db.name) { let change_owner = if edb.owner.starts_with('"') { db.owner.pg_quote() != edb.owner } else { db.owner != edb.owner }; edb.owner = db.owner.clone(); if change_owner { Some(vec![Operation { query: format!( "ALTER DATABASE {} OWNER TO {}", db.name.pg_quote(), db.owner.pg_quote() ), comment: Some(format!( "changing database owner of database {} to {}", db.name, db.owner )), }]) } else { None } } else { databases.insert(db.name.clone(), db.clone()); Some(vec![ Operation { query: format!( "CREATE DATABASE {} {}", db.name.pg_quote(), 
db.to_pg_options(), ), comment: None, }, Operation { // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database // (see https://www.postgresql.org/docs/current/ddl-priv.html) query: format!( "GRANT ALL PRIVILEGES ON DATABASE {} TO {}", db.name.pg_quote(), params.privileged_role_name ), comment: None, }, ]) } }) .flatten(); Ok(Box::new(operations)) } ApplySpecPhase::CreateSchemaNeon => Ok(Box::new(once(Operation { query: String::from("CREATE SCHEMA IF NOT EXISTS neon"), comment: Some(String::from( "create schema for neon extension and utils tables", )), }))), ApplySpecPhase::RunInEachDatabase { db, subphase } => { // Do some checks that user DB exists and we can access it. // // During the phases like DropLogicalSubscriptions, DeleteDBRoleReferences, // which happen before dropping the DB, the current run could be a retry, // so it's a valid case when DB is absent already. The case of // `pg_database.datallowconn = false`/`restrict_conn` is a bit tricky, as // in theory user can have some dangling objects there, so we will fail at // the actual drop later. Yet, to fix that in the current code we would need // to ALTER DATABASE, and then check back, but that even more invasive, so // that's not what we really want to do here. // // For ChangeSchemaPerms, skipping DBs we cannot access is totally fine. 
if let DB::UserDB(db) = db { let databases = &ctx.read().await.dbs; let edb = match databases.get(&db.name) { Some(edb) => edb, None => { warn!( "skipping RunInEachDatabase phase {:?}, database {} doesn't exist in PostgreSQL", subphase, db.name ); return Ok(Box::new(empty())); } }; if edb.restrict_conn || edb.invalid { warn!( "skipping RunInEachDatabase phase {:?}, database {} is (restrict_conn={}, invalid={})", subphase, db.name, edb.restrict_conn, edb.invalid ); return Ok(Box::new(empty())); } } match subphase { PerDatabasePhase::DropLogicalSubscriptions => { match &db { DB::UserDB(db) => { let (db_name, outer_tag) = db.name.pg_quote_dollar(); let drop_subscription_query: String = format!( include_str!("sql/drop_subscriptions.sql"), datname_str = db_name, outer_tag = outer_tag, ); let operations = vec![Operation { query: drop_subscription_query, comment: Some(format!( "optionally dropping subscriptions for DB {}", db.name, )), }] .into_iter(); Ok(Box::new(operations)) } // skip this cleanup for the system databases // because users can't drop them DB::SystemDB => Ok(Box::new(empty())), } } PerDatabasePhase::DeleteDBRoleReferences => { let ctx = ctx.read().await; let operations = spec .delta_operations .iter() .flatten() .filter(|op| op.action == "delete_role") .filter_map(move |op| { if db.is_owned_by(&op.name) { return None; } if !ctx.roles.contains_key(&op.name) { return None; } let quoted = op.name.pg_quote(); let new_owner = match &db { DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(), DB::UserDB(db) => db.owner.pg_quote(), }; let (escaped_role, outer_tag) = op.name.pg_quote_dollar(); Some(vec![ // This will reassign all dependent objects to the db owner Operation { query: format!("REASSIGN OWNED BY {quoted} TO {new_owner}",), comment: None, }, // Revoke some potentially blocking privileges (Neon-specific currently) Operation { query: format!( include_str!("sql/pre_drop_role_revoke_privileges.sql"), // N.B. 
this has to be properly dollar-escaped with `pg_quote_dollar()` role_name = escaped_role, outer_tag = outer_tag, ) // HADRON change: .replace("neon_superuser", ¶ms.privileged_role_name), // HADRON change end , comment: None, }, // This now will only drop privileges of the role // TODO: this is obviously not 100% true because of the above case, // there could be still some privileges that are not revoked. Maybe this // only drops privileges that were granted *by this* role, not *to this* role, // but this has to be checked. Operation { query: format!("DROP OWNED BY {quoted}"), comment: None, }, ]) }) .flatten(); Ok(Box::new(operations)) } PerDatabasePhase::ChangeSchemaPerms => { let db = match &db { // ignore schema permissions on the system database DB::SystemDB => return Ok(Box::new(empty())), DB::UserDB(db) => db, }; let (db_owner, outer_tag) = db.owner.pg_quote_dollar(); let operations = vec![ Operation { query: format!( include_str!("sql/set_public_schema_owner.sql"), db_owner = db_owner, outer_tag = outer_tag, ), comment: None, }, Operation { query: String::from(include_str!("sql/default_grants.sql")) .replace("neon_superuser", ¶ms.privileged_role_name), comment: None, }, ] .into_iter(); Ok(Box::new(operations)) } } } // Interestingly, we only install p_s_s in the main database, even when // it's preloaded. 
ApplySpecPhase::HandleOtherExtensions => { if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { if libs.contains("pg_stat_statements") { return Ok(Box::new(once(Operation { query: String::from( "CREATE EXTENSION IF NOT EXISTS pg_stat_statements WITH SCHEMA public", ), comment: Some(String::from("create system extensions")), }))); } } Ok(Box::new(empty())) } ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation { query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit WITH SCHEMA public"), comment: Some(String::from("create pgaudit extensions")), }))), ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation { query: String::from( "CREATE EXTENSION IF NOT EXISTS pgauditlogtofile WITH SCHEMA public", ), comment: Some(String::from("create pgauditlogtofile extensions")), }))), // Disable pgaudit logging for postgres database. // Postgres is neon system database used by monitors // and compute_ctl tuning functions and thus generates a lot of noise. // We do not consider data stored in this database as sensitive. 
ApplySpecPhase::DisablePostgresDBPgAudit => { let query = "ALTER DATABASE postgres SET pgaudit.log to 'none'"; Ok(Box::new(once(Operation { query: query.to_string(), comment: Some(query.to_string()), }))) } ApplySpecPhase::HandleNeonExtension => { let operations = vec![ Operation { query: String::from("CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon"), comment: Some(String::from( "init: install the extension if not already installed", )), }, Operation { query: String::from( "UPDATE pg_catalog.pg_extension SET extrelocatable = true WHERE extname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name AND extrelocatable OPERATOR(pg_catalog.=) false", ), comment: Some(String::from("compat/fix: make neon relocatable")), }, Operation { query: String::from("ALTER EXTENSION neon SET SCHEMA neon"), comment: Some(String::from("compat/fix: alter neon extension schema")), }, Operation { query: String::from("ALTER EXTENSION neon UPDATE"), comment: Some(String::from("compat/update: update neon extension version")), }, ] .into_iter(); Ok(Box::new(operations)) } // BEGIN_HADRON // Note: we may want to version the extension someday, but for now we just drop it and recreate it. 
ApplySpecPhase::HandleDatabricksAuthExtension => { let operations = vec![ Operation { query: String::from("DROP EXTENSION IF EXISTS databricks_auth"), comment: Some(String::from("dropping existing databricks_auth extension")), }, Operation { query: String::from("CREATE EXTENSION databricks_auth"), comment: Some(String::from("creating databricks_auth extension")), }, Operation { query: String::from("GRANT SELECT ON databricks_auth_metrics TO pg_monitor"), comment: Some(String::from("grant select on databricks auth counters")), }, ] .into_iter(); Ok(Box::new(operations)) } // END_HADRON ApplySpecPhase::CreateAvailabilityCheck => Ok(Box::new(once(Operation { query: String::from(include_str!("sql/add_availabilitycheck_tables.sql")), comment: None, }))), ApplySpecPhase::DropRoles => { let operations = spec .delta_operations .iter() .flatten() .filter(|op| op.action == "delete_role") .map(|op| Operation { query: format!("DROP ROLE IF EXISTS {}", op.name.pg_quote()), comment: None, }); Ok(Box::new(operations)) } // BEGIN_HADRON // New Hadron phases // // Grants permissions to roles that are used by Databricks. 
ApplySpecPhase::AddDatabricksGrants => { let operations = vec![ Operation { query: String::from("GRANT USAGE ON SCHEMA neon TO databricks_monitor"), comment: Some(String::from( "Permissions needed to execute neon.* functions (in the postgres database)", )), }, Operation { query: String::from( "GRANT SELECT, INSERT, UPDATE ON health_check TO databricks_monitor", ), comment: Some(String::from("Permissions needed for read and write probes")), }, Operation { query: String::from( "GRANT EXECUTE ON FUNCTION pg_ls_dir(text) TO databricks_monitor", ), comment: Some(String::from( "Permissions needed to monitor .snap file counts", )), }, Operation { query: String::from( "GRANT SELECT ON neon.neon_perf_counters TO databricks_monitor", ), comment: Some(String::from( "Permissions needed to access neon performance counters view", )), }, Operation { query: String::from( "GRANT EXECUTE ON FUNCTION neon.get_perf_counters() TO databricks_monitor", ), comment: Some(String::from( "Permissions needed to execute the underlying performance counters function", )), }, ] .into_iter(); Ok(Box::new(operations)) } // Creates minor objects that are used by Databricks. 
ApplySpecPhase::CreateDatabricksMisc => Ok(Box::new(once(Operation { query: String::from(include_str!("sql/create_databricks_misc.sql")), comment: Some(String::from( "The function databricks_monitor uses to convert exception to 0 or 1", )), }))), // End of new Hadron phases // END_HADRON ApplySpecPhase::FinalizeDropLogicalSubscriptions => Ok(Box::new(once(Operation { query: String::from(include_str!("sql/finalize_drop_subscriptions.sql")), comment: None, }))), } } ================================================ FILE: compute_tools/src/sql/add_availabilitycheck_tables.sql ================================================ DO $$ BEGIN IF NOT EXISTS( SELECT 1 FROM pg_catalog.pg_tables WHERE tablename::pg_catalog.name OPERATOR(pg_catalog.=) 'health_check'::pg_catalog.name AND schemaname::pg_catalog.name OPERATOR(pg_catalog.=) 'public'::pg_catalog.name ) THEN CREATE TABLE public.health_check ( id pg_catalog.int4 primary key generated by default as identity, updated_at pg_catalog.timestamptz default pg_catalog.now() ); INSERT INTO public.health_check VALUES (1, pg_catalog.now()) ON CONFLICT (id) DO UPDATE SET updated_at = pg_catalog.now(); END IF; END $$ ================================================ FILE: compute_tools/src/sql/alter_databricks_reader_roles_timeout.sql ================================================ DO $$ DECLARE reader_role RECORD; timeout_value TEXT; BEGIN -- Get the current GUC setting for reader statement timeout SELECT current_setting('databricks.reader_statement_timeout', true) INTO timeout_value; -- Only proceed if timeout_value is not null/empty and not '0' (disabled) IF timeout_value IS NOT NULL AND timeout_value != '' AND timeout_value != '0' THEN -- Find all databricks_reader_* roles and update their statement_timeout FOR reader_role IN SELECT r.rolname FROM pg_roles r WHERE r.rolname ~ '^databricks_reader_\d+$' LOOP -- Apply the timeout setting to the role (will overwrite existing setting) EXECUTE format('ALTER ROLE %I SET statement_timeout 
= %L', reader_role.rolname, timeout_value);

            RAISE LOG 'Updated statement_timeout = % for role %', timeout_value, reader_role.rolname;
        END LOOP;
    END IF;
END
$$;

================================================
FILE: compute_tools/src/sql/create_databricks_misc.sql
================================================
-- Cap the run time of any statement issued by the databricks_monitor role at 60 seconds.
ALTER ROLE databricks_monitor SET statement_timeout = '60s';

-- Write probe: upserts row id 1 of health_check with the current time and returns 1.
-- On any error the handler re-raises the failure with a [DATABRICKS_SMGR] prefix that
-- carries the original SQLSTATE and SQLERRM.
CREATE OR REPLACE FUNCTION health_check_write_succeeds()
RETURNS INTEGER AS $$
BEGIN
    INSERT INTO health_check VALUES (1, now())
        ON CONFLICT (id) DO UPDATE
        SET updated_at = now();

    RETURN 1;
EXCEPTION WHEN OTHERS THEN
    RAISE EXCEPTION '[DATABRICKS_SMGR] health_check failed: [%] %', SQLSTATE, SQLERRM;
    -- NOTE(review): unreachable, RAISE EXCEPTION above already aborts the function,
    -- so callers never observe a 0 result.
    RETURN 0;
END;
$$ LANGUAGE plpgsql;

================================================
FILE: compute_tools/src/sql/create_privileged_role.sql
================================================
-- Create the privileged role as a member of pg_read_all_data and pg_write_all_data,
-- but only when no role with that name exists yet.
-- The role name and its privilege list are placeholders; presumably the caller
-- substitutes them before the script is executed (confirm against the call site).
DO $$
    BEGIN
        IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname OPERATOR(pg_catalog.=) '{privileged_role_name}'::pg_catalog.name)
        THEN
            CREATE ROLE {privileged_role_name} {privileges} IN ROLE pg_read_all_data, pg_write_all_data;
        END IF;
    END
$$;

================================================
FILE: compute_tools/src/sql/default_grants.sql
================================================
DO
$$
    BEGIN
        -- On PostgreSQL 15 and later, let web_access create objects in schema public,
        -- provided both the schema and the role exist.
        IF EXISTS(
            SELECT nspname
            FROM pg_catalog.pg_namespace
            WHERE nspname OPERATOR(pg_catalog.=) 'public'
        ) AND
           pg_catalog.current_setting('server_version_num')::int OPERATOR(pg_catalog./) 10000 OPERATOR(pg_catalog.>=) 15
        THEN
            IF EXISTS(
                SELECT rolname
                FROM pg_catalog.pg_roles
                WHERE rolname OPERATOR(pg_catalog.=) 'web_access'
            )
            THEN
                GRANT CREATE ON SCHEMA public TO web_access;
            END IF;
        END IF;

        -- When schema public exists, give neon_superuser ALL privileges (with grant option)
        -- on tables and sequences that the current role creates in it from now on.
        IF EXISTS(
            SELECT nspname
            FROM pg_catalog.pg_namespace
            WHERE nspname OPERATOR(pg_catalog.=) 'public'
        )
        THEN
            ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;
            ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;
        END IF;
    END
$$;
================================================
FILE: compute_tools/src/sql/drop_subscriptions.sql
================================================
-- Drop every subscription registered for the database named by the datname placeholder.
-- Each subscription is first disabled and detached from its replication slot
-- (slot_name = NONE), so that DROP SUBSCRIPTION does not attempt to drop a remote slot.
-- All catalog objects are schema-qualified so that name resolution does not depend
-- on the session search_path.
DO ${outer_tag}$
DECLARE
    subname TEXT;
BEGIN
    LOCK TABLE pg_catalog.pg_subscription IN ACCESS EXCLUSIVE MODE;
    FOR subname IN
        SELECT pg_subscription.subname
        FROM pg_catalog.pg_subscription
        WHERE subdbid OPERATOR(pg_catalog.=) (
            SELECT oid FROM pg_catalog.pg_database WHERE datname OPERATOR(pg_catalog.=) {datname_str}::pg_catalog.name
        )
    LOOP
        EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I DISABLE;', subname);
        EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
        EXECUTE pg_catalog.format('DROP SUBSCRIPTION %I;', subname);
    END LOOP;
END;
${outer_tag}$;

================================================
FILE: compute_tools/src/sql/finalize_drop_subscriptions.sql
================================================
-- Record, in neon.drop_subscriptions_done, the timeline for which subscriptions were dropped.
-- The bookkeeping table is created on first use.
DO $$
BEGIN
    IF NOT EXISTS(
        SELECT 1
        FROM pg_catalog.pg_tables
        WHERE tablename OPERATOR(pg_catalog.=) 'drop_subscriptions_done'::pg_catalog.name
        AND schemaname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name
    )
    THEN
        CREATE TABLE neon.drop_subscriptions_done
        (id pg_catalog.int4 primary key generated by default as identity,
         timeline_id pg_catalog.text);
    END IF;

    -- preserve the timeline_id of the last drop_subscriptions run
    -- to ensure that the cleanup of a timeline is executed only once.
    -- use upsert to avoid the table bloat in case of cascade branching (branch of a branch)
    INSERT INTO neon.drop_subscriptions_done VALUES (1, pg_catalog.current_setting('neon.timeline_id'))
    ON CONFLICT (id) DO UPDATE
    SET timeline_id = pg_catalog.current_setting('neon.timeline_id')::pg_catalog.text;
END
$$

================================================
FILE: compute_tools/src/sql/pre_drop_role_revoke_privileges.sql
================================================
-- Revoke the table privileges held by the role that is about to be dropped.
-- For every grantor that granted table privileges to the role, the block switches
-- to that grantor with SET LOCAL ROLE and revokes everything it granted on all
-- tables of the listed schemas.
DO ${outer_tag}$
DECLARE
    schema TEXT;
    grantor TEXT;
    revoke_query TEXT;
BEGIN
    FOR schema IN
        SELECT schema_name
        FROM information_schema.schemata
        -- So far, we only had issues with 'public' schema. Probably, because we do some additional grants,
        -- e.g., make DB owner the owner of 'public' schema automatically (when created via API).
        -- See https://github.com/neondatabase/cloud/issues/13582 for the context.
        -- Still, keep the loop because i) it efficiently handles the case when there is no 'public' schema,
        -- ii) it's easy to add more schemas to the list if needed.
        WHERE schema_name IN ('public')
    LOOP
        FOR grantor IN EXECUTE
            pg_catalog.format(
                'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee OPERATOR(pg_catalog.=) %s',
                -- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
                pg_catalog.quote_literal({role_name})
            )
        LOOP
            EXECUTE pg_catalog.format('SET LOCAL ROLE %I', grantor);

            revoke_query := pg_catalog.format(
                'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I',
                schema,
                -- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
                {role_name},
                grantor
            );

            EXECUTE revoke_query;
        END LOOP;
    END LOOP;
END;
${outer_tag}$;

================================================
FILE: compute_tools/src/sql/set_public_schema_owner.sql
================================================
-- Hand schema public over to the database owner, but only while it is still owned
-- by one of the built-in admin roles (cloud_admin or zenith_admin).
DO ${outer_tag}$
    DECLARE
        schema_owner TEXT;
    BEGIN
        IF EXISTS(
            SELECT nspname
            FROM pg_catalog.pg_namespace
            WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.name
        )
        THEN
            SELECT nspowner::regrole::text
            FROM pg_catalog.pg_namespace
            WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.text
            INTO schema_owner;

            IF schema_owner OPERATOR(pg_catalog.=) 'cloud_admin'::pg_catalog.text OR schema_owner OPERATOR(pg_catalog.=) 'zenith_admin'::pg_catalog.text
            THEN
                EXECUTE pg_catalog.format('ALTER SCHEMA public OWNER TO %I', {db_owner});
            END IF;
        END IF;
    END
${outer_tag}$;

================================================
FILE: compute_tools/src/sql/unset_template_for_drop_dbs.sql
================================================
-- Clear the is_template flag of the named database, if that database exists.
DO ${outer_tag}$
    BEGIN
        IF EXISTS(
            SELECT 1
            FROM pg_catalog.pg_database
            WHERE datname OPERATOR(pg_catalog.=) {datname}::pg_catalog.name
        )
        THEN
            EXECUTE pg_catalog.format('ALTER DATABASE %I is_template false', {datname});
        END IF;
    END
${outer_tag}$;

================================================
FILE: compute_tools/src/swap.rs
================================================
use std::path::Path;

use anyhow::{Context, anyhow};
use tracing::{instrument, warn};

pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap";

#[instrument]
pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
    // run `/neonvm/bin/resize-swap --once {size_bytes}`
    //
    // Passing '--once' causes resize-swap to delete itself after successful completion, which
    // means that if compute_ctl restarts later, we won't end up calling 'swapoff' while
    // postgres is running.
    //
    // NOTE: resize-swap is not very clever. If present, --once MUST be the first arg.
let child_result = std::process::Command::new("/usr/bin/sudo")
        .arg(RESIZE_SWAP_BIN)
        .arg("--once")
        .arg(size_bytes.to_string())
        .spawn();

    // Chain the three failure points (spawn, wait, non-zero exit) into one Result so
    // that the final with_context() below wraps whichever of them went wrong.
    child_result
        .context("spawn() failed")
        .and_then(|mut child| child.wait().context("wait() failed"))
        .and_then(|status| match status.success() {
            true => Ok(()),
            false => {
                // The command failed. Maybe it was because the resize-swap file doesn't exist?
                // The --once flag causes it to delete itself on success so we don't disable swap
                // while postgres is running; maybe this is fine.
                match Path::new(RESIZE_SWAP_BIN).try_exists() {
                    // Binary is present, or its presence could not be determined:
                    // report the failed exit status.
                    Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")),
                    // The path doesn't exist; we're actually ok
                    Ok(false) => {
                        warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
                        Ok(())
                    },
                }
            }
        })
        // wrap any prior error with the overall context that we couldn't run the command
        .with_context(|| {
            format!("could not run `/usr/bin/sudo {RESIZE_SWAP_BIN} --once {size_bytes}`")
        })
}

================================================
FILE: compute_tools/src/sync_sk.rs
================================================
// Utils for running sync_safekeepers
use anyhow::Result;
use tracing::info;
use utils::lsn::Lsn;

/// Result of asking one safekeeper for the status of our timeline.
#[derive(Copy, Clone, Debug)]
pub enum TimelineStatusResponse {
    /// The safekeeper's reply did not start with a data row.
    NotFound,
    /// The safekeeper returned a row with the timeline's LSNs.
    Ok(TimelineStatusOkResponse),
}

/// LSNs reported by a safekeeper for the timeline.
#[derive(Copy, Clone, Debug)]
pub struct TimelineStatusOkResponse {
    // Parsed from the `flush_lsn` column of the reply.
    flush_lsn: Lsn,
    // Parsed from the `commit_lsn` column of the reply.
    commit_lsn: Lsn,
}

/// Get a safekeeper's metadata for our timeline.
The id is only used for logging pub async fn ping_safekeeper( id: String, config: tokio_postgres::Config, ) -> Result { // TODO add retries // Connect info!("connecting to {}", id); let (client, conn) = config.connect(tokio_postgres::NoTls).await?; tokio::spawn(async move { if let Err(e) = conn.await { eprintln!("connection error: {e}"); } }); // Query info!("querying {}", id); let result = client.simple_query("TIMELINE_STATUS").await?; // Parse result info!("done with {}", id); if let postgres::SimpleQueryMessage::Row(row) = &result[0] { use std::str::FromStr; let response = TimelineStatusResponse::Ok(TimelineStatusOkResponse { flush_lsn: Lsn::from_str(row.get("flush_lsn").unwrap())?, commit_lsn: Lsn::from_str(row.get("commit_lsn").unwrap())?, }); Ok(response) } else { // Timeline doesn't exist Ok(TimelineStatusResponse::NotFound) } } /// Given a quorum of responses, check if safekeepers are synced at some Lsn pub fn check_if_synced(responses: Vec) -> Option { // Check if all responses are ok let ok_responses: Vec = responses .iter() .filter_map(|r| match r { TimelineStatusResponse::Ok(ok_response) => Some(ok_response), _ => None, }) .cloned() .collect(); if ok_responses.len() < responses.len() { info!( "not synced. Only {} out of {} know about this timeline", ok_responses.len(), responses.len() ); return None; } // Get the min and the max of everything let commit: Vec = ok_responses.iter().map(|r| r.commit_lsn).collect(); let flush: Vec = ok_responses.iter().map(|r| r.flush_lsn).collect(); let commit_max = commit.iter().max().unwrap(); let commit_min = commit.iter().min().unwrap(); let flush_max = flush.iter().max().unwrap(); let flush_min = flush.iter().min().unwrap(); // Check that all values are equal if commit_min != commit_max { info!("not synced. {:?} {:?}", commit_min, commit_max); return None; } if flush_min != flush_max { info!("not synced. 
{:?} {:?}", flush_min, flush_max); return None; } // Check that commit == flush if commit_max != flush_max { info!("not synced. {:?} {:?}", commit_max, flush_max); return None; } Some(*commit_max) } ================================================ FILE: compute_tools/src/tls.rs ================================================ use std::{io::Write, os::unix::fs::OpenOptionsExt, path::Path, time::Duration}; use anyhow::{Context, Result, bail}; use compute_api::responses::TlsConfig; use ring::digest; use x509_cert::Certificate; #[derive(Clone, Copy)] pub struct CertDigest(digest::Digest); pub async fn watch_cert_for_changes(cert_path: String) -> tokio::sync::watch::Receiver { let mut digest = compute_digest(&cert_path).await; let (tx, rx) = tokio::sync::watch::channel(digest); tokio::spawn(async move { while !tx.is_closed() { let new_digest = compute_digest(&cert_path).await; if digest.0.as_ref() != new_digest.0.as_ref() { digest = new_digest; _ = tx.send(digest); } tokio::time::sleep(Duration::from_secs(60)).await } }); rx } async fn compute_digest(cert_path: &str) -> CertDigest { loop { match try_compute_digest(cert_path).await { Ok(d) => break d, Err(e) => { tracing::error!("could not read cert file {e:?}"); tokio::time::sleep(Duration::from_secs(1)).await } } } } async fn try_compute_digest(cert_path: &str) -> Result { let data = tokio::fs::read(cert_path).await?; // sha256 is extremely collision resistent. can safely assume the digest to be unique Ok(CertDigest(digest::digest(&digest::SHA256, &data))) } pub const SERVER_CRT: &str = "server.crt"; pub const SERVER_KEY: &str = "server.key"; pub fn update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) { loop { match try_update_key_path_blocking(pg_data, tls_config) { Ok(()) => break, Err(e) => { tracing::error!(error = ?e, "could not create key file"); std::thread::sleep(Duration::from_secs(1)) } } } } // Postgres requires the keypath be "secure". This means // 1. Owned by the postgres user. // 2. 
//    Have permission 600.
//
// Reads the key and certificate named in `tls_config`, checks that they belong
// together, and writes them to `server.key` / `server.crt` inside `pg_data`.
// Files are created with mode 0o600 and truncated if they already exist.
fn try_update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) -> Result<()> {
    let key_pem = std::fs::read_to_string(&tls_config.key_path)?;
    let cert_pem = std::fs::read_to_string(&tls_config.cert_path)?;

    // to mitigate a race condition during renewal.
    verify_key_cert(&key_pem, &cert_pem)?;

    // Both destination files are opened with the same options, key first.
    let mut open_opts = std::fs::OpenOptions::new();
    open_opts
        .write(true)
        .create(true)
        .truncate(true)
        .mode(0o600);

    let mut key_out = open_opts.open(pg_data.join(SERVER_KEY))?;
    let mut cert_out = open_opts.open(pg_data.join(SERVER_CRT))?;

    key_out.write_all(key_pem.as_bytes())?;
    cert_out.write_all(cert_pem.as_bytes())?;

    Ok(())
}

/// Check that the PEM-encoded private `key` matches the first certificate in the
/// PEM chain `cert`.
///
/// Only certificates signed with ECDSA-with-SHA256 are accepted; the key is parsed
/// as a SEC1 P-256 key and its public half is compared with the certificate's
/// subject public key.
fn verify_key_cert(key: &str, cert: &str) -> Result<()> {
    use x509_cert::der::oid::db::rfc5912::ECDSA_WITH_SHA_256;

    let chain = Certificate::load_pem_chain(cert.as_bytes())
        .context("decoding PEM encoded certificates")?;

    // First certificate is our server-cert,
    // all the rest of the certs are the CA cert chain.
    let server_cert = match chain.first() {
        Some(first) => first,
        None => bail!("no certificates found"),
    };

    if server_cert.signature_algorithm.oid != ECDSA_WITH_SHA_256 {
        bail!("unknown TLS key type");
    }

    let secret = p256::SecretKey::from_sec1_pem(key).context("parse key")?;
    let derived_public = secret.public_key().to_sec1_bytes();
    let embedded_public = server_cert
        .tbs_certificate
        .subject_public_key_info
        .subject_public_key
        .raw_bytes();

    if *derived_public != *embedded_public {
        bail!("private key file does not match certificate")
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::verify_key_cert;

    /// Real certificate chain file, generated by cert-manager in dev.
    /// The server auth certificate has expired since 2025-04-24T15:41:35Z.
const CERT: &str = " -----BEGIN CERTIFICATE----- MIICCDCCAa+gAwIBAgIQKhLomFcNULbZA/bPdGzaSzAKBggqhkjOPQQDAjBEMQsw CQYDVQQGEwJVUzESMBAGA1UEChMJTmVvbiBJbmMuMSEwHwYDVQQDExhOZW9uIEs4 cyBJbnRlcm1lZGlhdGUgQ0EwHhcNMjUwNDIzMTU0MTM1WhcNMjUwNDI0MTU0MTM1 WjBBMT8wPQYDVQQDEzZjb21wdXRlLXdpc3B5LWdyYXNzLXcwY21laWp3LmRlZmF1 bHQuc3ZjLmNsdXN0ZXIubG9jYWwwWTATBgcqhkjOPQIBBggqhkjOPQMBBwNCAATF QCcG2m/EVHAiZtSsYgVnHgoTjUL/Jtwfdrpvz2t0bVRZmBmSKhlo53uPV9Y5eKFG AmR54p9/gT2eO3xU7vAgo4GFMIGCMA4GA1UdDwEB/wQEAwIFoDAMBgNVHRMBAf8E AjAAMB8GA1UdIwQYMBaAFFR2JAhXkeiNQNEixTvAYIwxUu3QMEEGA1UdEQQ6MDiC NmNvbXB1dGUtd2lzcHktZ3Jhc3MtdzBjbWVpancuZGVmYXVsdC5zdmMuY2x1c3Rl ci5sb2NhbDAKBggqhkjOPQQDAgNHADBEAiBLG22wKG8XS9e9RxBT+kmUx/kIThcP DIpp7jx0PrFcdQIgEMTdnXpx5Cv/Z0NIEDxtMHUD7G0vuRPfztki36JuakM= -----END CERTIFICATE----- -----BEGIN CERTIFICATE----- MIICFzCCAb6gAwIBAgIUbbX98N2Ip6lWAONRk8dU9hSz+YIwCgYIKoZIzj0EAwIw RDELMAkGA1UEBhMCVVMxEjAQBgNVBAoTCU5lb24gSW5jLjEhMB8GA1UEAxMYTmVv biBBV1MgSW50ZXJtZWRpYXRlIENBMB4XDTI1MDQyMjE1MTAxMFoXDTI1MDcyMTE1 MTAxMFowRDELMAkGA1UEBhMCVVMxEjAQBgNVBAoTCU5lb24gSW5jLjEhMB8GA1UE AxMYTmVvbiBLOHMgSW50ZXJtZWRpYXRlIENBMFkwEwYHKoZIzj0CAQYIKoZIzj0D AQcDQgAE5++m5owqNI4BPMTVNIUQH0qvU7pYhdpHGVGhdj/Lgars6ROvE6uSNQV4 SAmJN5HBzj5/6kLQaTPWpXW7EHXjK6OBjTCBijAOBgNVHQ8BAf8EBAMCAQYwEgYD VR0TAQH/BAgwBgEB/wIBADAdBgNVHQ4EFgQUVHYkCFeR6I1A0SLFO8BgjDFS7dAw HwYDVR0jBBgwFoAUgHfNXfyKtHO0V9qoLOWCjkNiaI8wJAYDVR0eAQH/BBowGKAW MBSCEi5zdmMuY2x1c3Rlci5sb2NhbDAKBggqhkjOPQQDAgNHADBEAiBObVFFdXaL QpOXmN60dYUNnQRwjKreFduEkQgOdOlssgIgVAdJJQFgvlrvEOBhY8j5WyeKRwUN k/ALs6KpgaFBCGY= -----END CERTIFICATE----- -----BEGIN CERTIFICATE----- MIIB4jCCAYegAwIBAgIUFlxWFn/11yoGdmD+6gf+yQMToS0wCgYIKoZIzj0EAwIw ODELMAkGA1UEBhMCVVMxEjAQBgNVBAoTCU5lb24gSW5jLjEVMBMGA1UEAxMMTmVv biBSb290IENBMB4XDTI1MDQwMzA3MTUyMloXDTI2MDQwMzA3MTUyMlowRDELMAkG A1UEBhMCVVMxEjAQBgNVBAoTCU5lb24gSW5jLjEhMB8GA1UEAxMYTmVvbiBBV1Mg SW50ZXJtZWRpYXRlIENBMFkwEwYHKoZIzj0CAQYIKoZIzj0DAQcDQgAEqonG/IQ6 ZxtEtOUTkkoNopPieXDO5CBKUkNFTGeJEB7OxRlSpYJgsBpaYIaD6Vc4sVk3thIF 
p+pLw52idQOIN6NjMGEwDgYDVR0PAQH/BAQDAgEGMA8GA1UdEwEB/wQFMAMBAf8w HQYDVR0OBBYEFIB3zV38irRztFfaqCzlgo5DYmiPMB8GA1UdIwQYMBaAFKh7M4/G FHvr/ORDQZt4bMLlJvHCMAoGCCqGSM49BAMCA0kAMEYCIQCbS4x7QPslONzBYbjC UQaQ0QLDW4CJHvQ4u4gbWFG87wIhAJMsHQHjP9qTT27Q65zQCR7O8QeLAfha1jrH Ag/LsxSr -----END CERTIFICATE----- "; /// The key corresponding to [`CERT`] const KEY: &str = " -----BEGIN EC PRIVATE KEY----- MHcCAQEEIDnAnrqmIJjndCLWP1iIO5X3X63Aia48TGpGuMXwvm6IoAoGCCqGSM49 AwEHoUQDQgAExUAnBtpvxFRwImbUrGIFZx4KE41C/ybcH3a6b89rdG1UWZgZkioZ aOd7j1fWOXihRgJkeeKff4E9njt8VO7wIA== -----END EC PRIVATE KEY----- "; /// An incorrect key. const INCORRECT_KEY: &str = " -----BEGIN EC PRIVATE KEY----- MHcCAQEEIL6WqqBDyvM0HWz7Ir5M5+jhFWB7IzOClGn26OPrzHCXoAoGCCqGSM49 AwEHoUQDQgAE7XVvdOy5lfwtNKb+gJEUtnG+DrnnXLY5LsHDeGQKV9PTRcEMeCrG YZzHyML4P6Sr4yi2ts+4B9i47uvAG8+XwQ== -----END EC PRIVATE KEY----- "; #[test] fn certificate_verification() { verify_key_cert(KEY, CERT).unwrap(); } #[test] #[should_panic(expected = "private key file does not match certificate")] fn certificate_verification_fail() { verify_key_cert(INCORRECT_KEY, CERT).unwrap(); } } ================================================ FILE: compute_tools/tests/README.md ================================================ ### Test files The file `cluster_spec.json` has been copied over from libs/compute_api tests, with some edits: - the neon.safekeepers setting contains a duplicate value ================================================ FILE: compute_tools/tests/cluster_spec.json ================================================ { "format_version": 1.0, "timestamp": "2021-05-23T18:25:43.511Z", "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b", "suspend_timeout_seconds": 3600, "cluster": { "cluster_id": "test-cluster-42", "name": "Zenith Test", "state": "restarted", "roles": [ { "name": "postgres", "encrypted_password": "6b1d16b78004bbd51fa06af9eda75972", "options": null }, { "name": "alexk", "encrypted_password": null, "options": null }, { "name": 
"zenith \"new\"", "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972", "options": null }, { "name": "zen", "encrypted_password": "9b1d16b78004bbd51fa06af9eda75972" }, { "name": "\"name\";\\n select 1;", "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972" }, { "name": "MyRole", "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972" } ], "databases": [ { "name": "DB2", "owner": "alexk", "options": [ { "name": "LC_COLLATE", "value": "C", "vartype": "string" }, { "name": "LC_CTYPE", "value": "C", "vartype": "string" }, { "name": "TEMPLATE", "value": "template0", "vartype": "enum" } ] }, { "name": "zenith", "owner": "MyRole" }, { "name": "zen", "owner": "zen" } ], "settings": [ { "name": "fsync", "value": "off", "vartype": "bool" }, { "name": "wal_level", "value": "logical", "vartype": "enum" }, { "name": "hot_standby", "value": "on", "vartype": "bool" }, { "name": "prewarm_lfc_on_startup", "value": "off", "vartype": "bool" }, { "name": "neon.safekeepers", "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501,127.0.0.1:6502", "vartype": "string" }, { "name": "wal_log_hints", "value": "on", "vartype": "bool" }, { "name": "log_connections", "value": "on", "vartype": "bool" }, { "name": "shared_buffers", "value": "32768", "vartype": "integer" }, { "name": "port", "value": "55432", "vartype": "integer" }, { "name": "max_connections", "value": "100", "vartype": "integer" }, { "name": "max_wal_senders", "value": "10", "vartype": "integer" }, { "name": "listen_addresses", "value": "0.0.0.0", "vartype": "string" }, { "name": "wal_sender_timeout", "value": "0", "vartype": "integer" }, { "name": "password_encryption", "value": "md5", "vartype": "enum" }, { "name": "maintenance_work_mem", "value": "65536", "vartype": "integer" }, { "name": "max_parallel_workers", "value": "8", "vartype": "integer" }, { "name": "max_worker_processes", "value": "8", "vartype": "integer" }, { "name": "neon.tenant_id", "value": "b0554b632bd4d547a63b86c3630317e8", "vartype": "string" }, { 
"name": "max_replication_slots", "value": "10", "vartype": "integer" }, { "name": "neon.timeline_id", "value": "2414a61ffc94e428f14b5758fe308e13", "vartype": "string" }, { "name": "shared_preload_libraries", "value": "neon", "vartype": "string" }, { "name": "synchronous_standby_names", "value": "walproposer", "vartype": "string" }, { "name": "neon.pageserver_connstring", "value": "host=127.0.0.1 port=6400", "vartype": "string" }, { "name": "test.escaping", "value": "here's a backslash \\ and a quote ' and a double-quote \" hooray", "vartype": "string" } ] }, "delta_operations": [ { "action": "delete_db", "name": "zenith_test" }, { "action": "rename_db", "name": "DB", "new_name": "DB2" }, { "action": "delete_role", "name": "zenith2" }, { "action": "rename_role", "name": "zenith new", "new_name": "zenith \"new\"" } ], "remote_extensions": { "library_index": { "postgis-3": "postgis", "libpgrouting-3.4": "postgis", "postgis_raster-3": "postgis", "postgis_sfcgal-3": "postgis", "postgis_topology-3": "postgis", "address_standardizer-3": "postgis" }, "extension_data": { "postgis": { "archive_path": "5834329303/v15/extensions/postgis.tar.zst", "control_data": { "postgis.control": "# postgis extension\ncomment = ''PostGIS geometry and geography spatial types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis-3''\nrelocatable = false\ntrusted = true\n", "pgrouting.control": "# pgRouting Extension\ncomment = ''pgRouting Extension''\ndefault_version = ''3.4.2''\nmodule_pathname = ''$libdir/libpgrouting-3.4''\nrelocatable = true\nrequires = ''plpgsql''\nrequires = ''postgis''\ntrusted = true\n", "postgis_raster.control": "# postgis_raster extension\ncomment = ''PostGIS raster types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis_raster-3''\nrelocatable = false\nrequires = postgis\ntrusted = true\n", "postgis_sfcgal.control": "# postgis topology extension\ncomment = ''PostGIS SFCGAL functions''\ndefault_version = 
''3.3.2''\nrelocatable = true\nrequires = postgis\ntrusted = true\n", "postgis_topology.control": "# postgis topology extension\ncomment = ''PostGIS topology spatial types and functions''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = topology\nrequires = postgis\ntrusted = true\n", "address_standardizer.control": "# address_standardizer extension\ncomment = ''Used to parse an address into constituent elements. Generally used to support geocoding address normalization step.''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n", "postgis_tiger_geocoder.control": "# postgis tiger geocoder extension\ncomment = ''PostGIS tiger geocoder and reverse geocoder''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = tiger\nrequires = ''postgis,fuzzystrmatch''\nsuperuser= false\ntrusted = true\n", "address_standardizer_data_us.control": "# address standardizer us dataset\ncomment = ''Address Standardizer US dataset example''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n" } } }, "custom_extensions": [], "public_extensions": ["postgis"] }, "pgbouncer_settings": { "default_pool_size": "42", "pool_mode": "session" } } ================================================ FILE: compute_tools/tests/config_test.rs ================================================ #[cfg(test)] mod config_tests { use std::fs::{File, remove_file}; use std::io::{Read, Write}; use std::path::Path; use compute_tools::config::*; fn write_test_file(path: &Path, content: &str) { let mut file = File::create(path).unwrap(); file.write_all(content.as_bytes()).unwrap(); } fn check_file_content(path: &Path, expected_content: &str) { let mut file = File::open(path).unwrap(); let mut content = String::new(); file.read_to_string(&mut content).unwrap(); assert_eq!(content, expected_content); } #[test] fn test_line_in_file() { let path = Path::new("./tests/tmp/config_test.txt"); write_test_file(path, "line1\nline2.1\t line2.2\nline3"); let line = "line2.1\t line2.2"; 
let result = line_in_file(path, line).unwrap(); assert!(!result); check_file_content(path, "line1\nline2.1\t line2.2\nline3"); let line = "line4"; let result = line_in_file(path, line).unwrap(); assert!(result); check_file_content(path, "line1\nline2.1\t line2.2\nline3\nline4"); remove_file(path).unwrap(); let path = Path::new("./tests/tmp/new_config_test.txt"); let line = "line4"; let result = line_in_file(path, line).unwrap(); assert!(result); check_file_content(path, "line4"); remove_file(path).unwrap(); } } ================================================ FILE: compute_tools/tests/pg_helpers_tests.rs ================================================ #[cfg(test)] mod pg_helpers_tests { use std::fs::File; use compute_api::spec::{ComputeSpec, GenericOption, GenericOptions, PgIdent}; use compute_tools::pg_helpers::*; #[test] fn params_serialize() { let file = File::open("../libs/compute_api/tests/cluster_spec.json").unwrap(); let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); assert_eq!( spec.cluster.databases.first().unwrap().to_pg_options(), "LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0 OWNER \"alexk\"" ); assert_eq!( spec.cluster.roles.first().unwrap().to_pg_options(), " LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'" ); } #[test] fn settings_serialize() { let file = File::open("../libs/compute_api/tests/cluster_spec.json").unwrap(); let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); assert_eq!( spec.cluster.settings.as_pg_settings(), r#"fsync = off wal_level = logical hot_standby = on autoprewarm = off offload_lfc_interval_seconds = 20 neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501' wal_log_hints = on log_connections = on shared_buffers = 32768 port = 55432 max_connections = 100 max_wal_senders = 10 listen_addresses = '0.0.0.0' wal_sender_timeout = 0 password_encryption = md5 maintenance_work_mem = 65536 max_parallel_workers = 8 max_worker_processes = 8 neon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8' 
max_replication_slots = 10 neon.timeline_id = '2414a61ffc94e428f14b5758fe308e13' shared_preload_libraries = 'neon' synchronous_standby_names = 'walproposer' neon.pageserver_connstring = 'host=127.0.0.1 port=6400' test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hooray' "# ); } #[test] fn ident_pg_quote() { let ident: PgIdent = PgIdent::from("\"name\";\\n select 1;"); assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\""); } #[test] fn ident_pg_quote_dollar() { let test_cases = vec![ ("name", ("$x$name$x$", "xx")), ("name$", ("$x$name$$x$", "xx")), ("name$$", ("$x$name$$$x$", "xx")), ("name$$$", ("$x$name$$$$x$", "xx")), ("name$$$$", ("$x$name$$$$$x$", "xx")), ("name$x$", ("$xx$name$x$$xx$", "xxx")), ("x", ("$xx$x$xx$", "xxx")), ("xx", ("$xxx$xx$xxx$", "xxxx")), ("$x", ("$xx$$x$xx$", "xxx")), ("x$", ("$xx$x$$xx$", "xxx")), ("$x$", ("$xx$$x$$xx$", "xxx")), ("xx$", ("$xxx$xx$$xxx$", "xxxx")), ("$xx", ("$xxx$$xx$xxx$", "xxxx")), ("$xx$", ("$xxx$$xx$$xxx$", "xxxx")), ]; for (input, expected) in test_cases { let (escaped, tag) = PgIdent::from(input).pg_quote_dollar(); assert_eq!(escaped, expected.0); assert_eq!(tag, expected.1); } } #[test] fn generic_options_search() { let generic_options: GenericOptions = Some(vec![ GenericOption { name: "present_value".into(), value: Some("value".into()), vartype: "string".into(), }, GenericOption { name: "missed_value".into(), value: None, vartype: "int".into(), }, ]); assert_eq!(generic_options.find("present_value"), Some("value".into())); assert_eq!(generic_options.find("missed_value"), None); assert_eq!(generic_options.find("invalid_value"), None); let empty_generic_options: GenericOptions = Some(vec![]); assert_eq!(empty_generic_options.find("present_value"), None); assert_eq!(empty_generic_options.find("missed_value"), None); assert_eq!(empty_generic_options.find("invalid_value"), None); let none_generic_options: GenericOptions = None; assert_eq!(none_generic_options.find("present_value"), 
None); assert_eq!(none_generic_options.find("missed_value"), None); assert_eq!(none_generic_options.find("invalid_value"), None); } #[test] fn test_escape_literal() { assert_eq!(escape_literal("test"), "'test'"); assert_eq!(escape_literal("test'"), "'test'''"); assert_eq!(escape_literal("test\\'"), "E'test\\\\'''"); assert_eq!(escape_literal("test\\'\\'"), "E'test\\\\''\\\\'''"); } } ================================================ FILE: control_plane/.gitignore ================================================ tmp_check/ ================================================ FILE: control_plane/Cargo.toml ================================================ [package] name = "control_plane" version = "0.1.0" edition.workspace = true license.workspace = true [dependencies] anyhow.workspace = true base64.workspace = true camino.workspace = true clap.workspace = true comfy-table.workspace = true futures.workspace = true humantime.workspace = true jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true pem.workspace = true humantime-serde.workspace = true hyper0.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["blocking", "json"] } scopeguard.workspace = true serde.workspace = true serde_json.workspace = true sha2.workspace = true spki.workspace = true thiserror.workspace = true toml.workspace = true toml_edit.workspace = true tokio.workspace = true tokio-postgres.workspace = true tokio-util.workspace = true url.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_backend.workspace = true safekeeper_api.workspace = true safekeeper_client.workspace = true postgres_connection.workspace = true storage_broker.workspace = true http-utils.workspace = true utils.workspace = true whoami.workspace = true endpoint_storage.workspace = true compute_api.workspace = true workspace_hack.workspace = true tracing.workspace = true ================================================ FILE: 
control_plane/README.md ================================================ # Local Development Control Plane (`neon_local`) This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command. This is a convenience to invoke the `neon_local` binary. **Note**: this is a dev/test tool -- a minimal control plane suitable for testing code changes locally, but not suitable for running production systems. ## Example: Start with Postgres 16 To create and start a local development environment with Postgres 16, you will need to provide the `--pg-version` flag to two of the start-up commands. ```shell cargo neon init cargo neon start cargo neon tenant create --set-default --pg-version 16 cargo neon endpoint create main --pg-version 16 cargo neon endpoint start main ``` ## Example: Create Test User and Database By default, `cargo neon` starts an endpoint with `cloud_admin` and the `postgres` database. If you want to have a role and a database similar to what we have on the cloud service, you can do it with the following commands when starting an endpoint. ```shell cargo neon endpoint create main --pg-version 16 --update-catalog true cargo neon endpoint start main --create-test-user true ``` The first command creates `neon_superuser` and the necessary roles. The second command creates the `test` user and the `neondb` database. You will see a connection string that connects you to the test user after running the second command. ================================================ FILE: control_plane/safekeepers.conf ================================================ # Page server and three safekeepers.
[pageserver] listen_pg_addr = '127.0.0.1:64000' listen_http_addr = '127.0.0.1:9898' listen_grpc_addr = '127.0.0.1:51051' pg_auth_type = 'Trust' http_auth_type = 'Trust' grpc_auth_type = 'Trust' [[safekeepers]] id = 1 pg_port = 5454 http_port = 7676 [[safekeepers]] id = 2 pg_port = 5455 http_port = 7677 [[safekeepers]] id = 3 pg_port = 5456 http_port = 7678 ================================================ FILE: control_plane/simple.conf ================================================ # Minimal neon environment with one safekeeper. This is equivalent to the built-in # defaults that you get with no --config [[pageservers]] id=1 listen_pg_addr = '127.0.0.1:64000' listen_http_addr = '127.0.0.1:9898' listen_grpc_addr = '127.0.0.1:51051' pg_auth_type = 'Trust' http_auth_type = 'Trust' grpc_auth_type = 'Trust' [[safekeepers]] id = 1 pg_port = 5454 http_port = 7676 [broker] listen_addr = '127.0.0.1:50051' ================================================ FILE: control_plane/src/background_process.rs ================================================ //! Spawns and kills background processes that are needed by Neon CLI. //! Applies common set-up such as log and pid files (if needed) to every process. //! //! Neon CLI does not run in background, so it needs to store the information about //! spawned processes, which it does in this module. //! We do that by storing the pid of the process in the "${process_name}.pid" file. //! The pid file can be created by the process itself //! (Neon storage binaries do that and also ensure that a lock is taken onto that file) //! or we create such file after starting the process //! (non-Neon binaries don't necessarily follow our pidfile conventions). //! The pid stored in the file is later used to stop the service. //! //! See the [`lock_file`](utils::lock_file) module for more info. 
use std::ffi::OsStr; use std::io::Write; use std::os::fd::AsFd; use std::os::unix::process::CommandExt; use std::path::Path; use std::process::Command; use std::time::Duration; use std::{fs, io, thread}; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use nix::errno::Errno; use nix::fcntl::{FcntlArg, FdFlag}; use nix::sys::signal::{Signal, kill}; use nix::unistd::Pid; use utils::pid_file::{self, PidFileRead}; // These constants control the loop used to poll for process start / stop. // // The loop waits for at most 10 seconds, polling every 100 ms. // Once a second, it prints a dot ("."), to give the user an indication that // it's waiting. If the process hasn't started/stopped after 5 seconds, // it prints a notice that it's taking long, but keeps waiting. // const STOP_RETRY_TIMEOUT: Duration = Duration::from_secs(10); const STOP_RETRIES: u128 = STOP_RETRY_TIMEOUT.as_millis() / RETRY_INTERVAL.as_millis(); const RETRY_INTERVAL: Duration = Duration::from_millis(100); const DOT_EVERY_RETRIES: u128 = 10; const NOTICE_AFTER_RETRIES: u128 = 50; /// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates /// it itself. pub enum InitialPidFile { /// Create a pidfile, to allow future CLI invocations to manipulate the process. Create(Utf8PathBuf), /// The process will create the pidfile itself, need to wait for that event. Expect(Utf8PathBuf), } /// Start a background child process using the parameters given. 
#[allow(clippy::too_many_arguments)] pub async fn start_process( process_name: &str, datadir: &Path, command: &Path, args: AI, envs: EI, initial_pid_file: InitialPidFile, retry_timeout: &Duration, process_status_check: F, ) -> anyhow::Result<()> where F: Fn() -> Fut, Fut: std::future::Future>, AI: IntoIterator, A: AsRef, // Not generic AsRef, otherwise empty `envs` prevents type inference EI: IntoIterator, { let retries: u128 = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis(); if !datadir.metadata().context("stat datadir")?.is_dir() { anyhow::bail!("`datadir` must be a directory when calling this function: {datadir:?}"); } let log_path = datadir.join(format!("{process_name}.log")); let process_log_file = fs::OpenOptions::new() .create(true) .append(true) .open(&log_path) .with_context(|| { format!("Could not open {process_name} log file {log_path:?} for writing") })?; let same_file_for_stderr = process_log_file.try_clone().with_context(|| { format!("Could not reuse {process_name} log file {log_path:?} for writing stderr") })?; let mut command = Command::new(command); let background_command = command .stdout(process_log_file) .stderr(same_file_for_stderr) .args(args) // spawn all child processes in their datadir, useful for all kinds of things, // not least cleaning up child processes e.g. 
after an unclean exit from the test suite: // ``` // lsof -d cwd -a +D Users/cs/src/neon/test_output // ``` .current_dir(datadir); let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars( fill_rust_env_vars(background_command), )); filled_cmd.envs(envs); let pid_file_to_check = match &initial_pid_file { InitialPidFile::Create(path) => { pre_exec_create_pidfile(filled_cmd, path); path } InitialPidFile::Expect(path) => path, }; let spawned_process = filled_cmd.spawn().with_context(|| { format!("Could not spawn {process_name}, see console output and log files for details.") })?; let pid = spawned_process.id(); let pid = Pid::from_raw( i32::try_from(pid) .with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?, ); // set up a scopeguard to kill & wait for the child in case we panic or bail below let spawned_process = scopeguard::guard(spawned_process, |mut spawned_process| { println!("SIGKILL & wait the started process"); (|| { // TODO: use another signal that can be caught by the child so it can clean up any children it spawned (e.g., walredo). spawned_process.kill().context("SIGKILL child")?; spawned_process.wait().context("wait() for child process")?; anyhow::Ok(()) })() .with_context(|| format!("scopeguard kill&wait child {process_name:?}")) .unwrap(); }); for retries in 0..retries { match process_started(pid, pid_file_to_check, &process_status_check).await { Ok(true) => { println!("\n{process_name} started and passed status check, pid: {pid}"); // leak the child process, it'll outlive this neon_local invocation drop(scopeguard::ScopeGuard::into_inner(spawned_process)); return Ok(()); } Ok(false) => { if retries == NOTICE_AFTER_RETRIES { // The process is taking a long time to start up.
Keep waiting, but // print a message print!("\n{process_name} has not started yet, continuing to wait"); } if retries % DOT_EVERY_RETRIES == 0 { print!("."); io::stdout().flush().unwrap(); } tokio::time::sleep(RETRY_INTERVAL).await; } Err(e) => { println!("error starting process {process_name:?}: {e:#}"); return Err(e); } } } println!(); anyhow::bail!(format!( "{} did not start+pass status checks within {:?} seconds", process_name, retry_timeout )); } /// Stops the process, using the pid file given. Also returns Ok if the process is already not running. pub fn stop_process( immediate: bool, process_name: &str, pid_file: &Utf8Path, ) -> anyhow::Result<()> { let pid = match pid_file::read(pid_file) .with_context(|| format!("read pid_file {pid_file:?}"))? { PidFileRead::NotExist => { println!("{process_name} is already stopped: no pid file present at {pid_file:?}"); return Ok(()); } PidFileRead::NotHeldByAnyProcess(_) => { // Don't try to kill according to file contents because the pid might have been re-used by another process. // Don't delete the file either, it can race with new pid file creation. // Read `pid_file` module comment for details. println!( "No process is holding the pidfile. The process must have already exited. Leave in place to avoid race conditions: {pid_file:?}" ); return Ok(()); } PidFileRead::LockedByOtherProcess(pid) => pid, }; // XXX the pid could become invalid (and recycled) at any time before the kill() below. // send signal let sig = if immediate { print!("Stopping {process_name} with pid {pid} immediately.."); Signal::SIGQUIT } else { print!("Stopping {process_name} with pid {pid} gracefully.."); Signal::SIGTERM }; io::stdout().flush().unwrap(); match kill(pid, sig) { Ok(()) => (), Err(Errno::ESRCH) => { // Again, don't delete the pid file. The unlink can race with a new pid file being created. println!( "{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found. Likely the pid got recycled.
Lucky we didn't harm anyone." ); return Ok(()); } Err(e) => anyhow::bail!("Failed to send signal to {process_name} with pid {pid}: {e}"), } // Wait until process is gone wait_until_stopped(process_name, pid)?; Ok(()) } pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> { for retries in 0..STOP_RETRIES { match process_has_stopped(pid) { Ok(true) => { println!("\n{process_name} stopped"); return Ok(()); } Ok(false) => { if retries == NOTICE_AFTER_RETRIES { // The process is taking a long time to stop. Keep waiting, but // print a message print!("\n{process_name} has not stopped yet, continuing to wait"); } if retries % DOT_EVERY_RETRIES == 0 { print!("."); io::stdout().flush().unwrap(); } thread::sleep(RETRY_INTERVAL); } Err(e) => { println!("{process_name} with pid {pid} failed to stop: {e:#}"); return Err(e); } } } println!(); anyhow::bail!(format!( "{} with pid {} did not stop in {:?} seconds", process_name, pid, STOP_RETRY_TIMEOUT )); } fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { // If RUST_BACKTRACE is set, pass it through. But if it's not set, default // to RUST_BACKTRACE=1. let backtrace_setting = std::env::var_os("RUST_BACKTRACE"); let backtrace_setting = backtrace_setting .as_deref() .unwrap_or_else(|| OsStr::new("1")); let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", backtrace_setting); // Pass through these environment variables to the command for var in [ "LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG", "ASAN_OPTIONS", "UBSAN_OPTIONS", ] { if let Some(val) = std::env::var_os(var) { filled_cmd = filled_cmd.env(var, val); } } filled_cmd } fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { for env_key in [ "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN", "AWS_PROFILE", // HOME is needed in combination with `AWS_PROFILE` to pick up the SSO sessions.
"HOME", "AZURE_STORAGE_ACCOUNT", "AZURE_STORAGE_ACCESS_KEY", ] { if let Ok(value) = std::env::var(env_key) { cmd = cmd.env(env_key, value); } } cmd } fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command { for (var, val) in std::env::vars() { if var.starts_with("NEON_") { cmd = cmd.env(var, val); } } cmd } /// Add a `pre_exec` to the cmd that, inbetween fork() and exec(), /// 1. Claims a pidfile with a fcntl lock on it and /// 2. Sets up the pidfile's file descriptor so that it (and the lock) /// will remain held until the cmd exits. fn pre_exec_create_pidfile

(cmd: &mut Command, path: P) -> &mut Command where P: Into, { let path: Utf8PathBuf = path.into(); // SAFETY: // pre_exec is marked unsafe because it runs between fork and exec. // Why is that dangerous in various ways? // Long answer: https://github.com/rust-lang/rust/issues/39575 // Short answer: in a multi-threaded program, other threads may have // been inside of critical sections at the time of fork. In the // original process, that was all right, assuming they protected // the critical sections appropriately, e.g., through locks. // Fork adds another process to the mix that // 1. Has a single thread T // 2. In an exact copy of the address space at the time of fork. // A variety of problems can occur now: // 1. T tries to grab a lock that was locked at the time of fork. // It will wait forever since in its address space, the lock // is in state 'taken' but the thread that would unlock it is // not there. // 2. A Rust object that represented some external resource in the // parent now got implicitly copied by the fork, even though // the object's type is not `Copy`. The parent program may use // non-copyability as a way to enforce unique ownership of an // external resource in the typesystem. The fork breaks that // assumption, as now both parent and child process have an // owned instance of the object that represents the same // underlying resource. // While these seem like niche problems, (1) in particular is // highly relevant. For example, `malloc()` may grab a mutex internally, // and so, if we forked while another thread was mallocing and our // pre_exec closure allocates as well, it will block on the malloc // mutex forever. // // The proper solution is to only use C library functions that are marked // "async-signal-safe": https://man7.org/linux/man-pages/man7/signal-safety.7.html // // With this specific pre_exec() closure, the non-error path doesn't allocate. // The error path uses `anyhow`, and hence does allocate.
// We take our chances there, hoping that any potential disaster is constrained // to the child process (e.g., malloc has no state outside of the child process). // Last, `expect` prints to stderr, and stdio is not async-signal-safe. // Again, we take our chances, making the same assumptions as for malloc. unsafe { cmd.pre_exec(move || { let file = pid_file::claim_for_current_process(&path).expect("claim pid file"); // Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile // remains locked after exec. nix::fcntl::fcntl(file.as_fd(), FcntlArg::F_SETFD(FdFlag::empty())) .expect("remove FD_CLOEXEC"); // Don't run drop(file), it would close the file before we actually exec. std::mem::forget(file); Ok(()) }); } cmd } async fn process_started( pid: Pid, pid_file_to_check: &Utf8Path, status_check: &F, ) -> anyhow::Result where F: Fn() -> Fut, Fut: std::future::Future>, { match status_check().await { Ok(true) => match pid_file::read(pid_file_to_check)? { PidFileRead::NotExist => Ok(false), PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid), PidFileRead::NotHeldByAnyProcess(_) => Ok(false), }, Ok(false) => Ok(false), Err(e) => anyhow::bail!("process failed to start: {e}"), } } pub(crate) fn process_has_stopped(pid: Pid) -> anyhow::Result { match kill(pid, None) { // Process exists, keep waiting Ok(_) => Ok(false), // Process not found, we're done Err(Errno::ESRCH) => Ok(true), Err(err) => anyhow::bail!("Failed to send signal to process with pid {pid}: {err}"), } } ================================================ FILE: control_plane/src/bin/neon_local.rs ================================================ //! //! `neon_local` is an executable that can be used to create a local //! Neon environment, for testing purposes. The local environment is //! quite different from the cloud environment with Kubernetes, but it is //! easier to work with locally. The Python tests in `test_runner` //!
rely on `neon_local` to set up the environment for each test. //! use std::borrow::Cow; use std::collections::{BTreeSet, HashMap}; use std::fs::File; use std::path::PathBuf; use std::process::exit; use std::str::FromStr; use std::time::Duration; use anyhow::{Context, Result, anyhow, bail}; use clap::Parser; use compute_api::requests::ComputeClaimsScope; use compute_api::spec::{ComputeMode, PageserverProtocol}; use control_plane::broker::StorageBroker; use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode}; use control_plane::endpoint::{ local_pageserver_conf_to_conn_info, tenant_locate_response_to_conn_info, }; use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage}; use control_plane::local_env; use control_plane::local_env::{ EndpointStorageConf, InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf, SafekeeperConf, }; use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::{ NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController, }; use nix::fcntl::{Flock, FlockArg}; use pageserver_api::config::{ DEFAULT_GRPC_LISTEN_PORT as DEFAULT_PAGESERVER_GRPC_PORT, DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, }; use pageserver_api::controller_api::{ NodeAvailabilityWrapper, PlacementPolicy, TenantCreateRequest, }; use pageserver_api::models::{ ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo, }; use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; use safekeeper_api::membership::{SafekeeperGeneration, SafekeeperId}; use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, PgMajorVersion, PgVersionId, }; use storage_broker::DEFAULT_LISTEN_ADDR as 
DEFAULT_BROKER_ADDR; use tokio::task::JoinSet; use utils::auth::{Claims, Scope}; use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; use utils::project_git_version; // Default id of a safekeeper node, if not specified on the command line. const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1); const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); #[allow(dead_code)] const DEFAULT_PG_VERSION: PgMajorVersion = PgMajorVersion::PG17; const DEFAULT_PG_VERSION_NUM: &str = "17"; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; /// Neon CLI. #[derive(clap::Parser)] #[command(version = GIT_VERSION, name = "Neon CLI")] struct Cli { #[command(subcommand)] command: NeonLocalCmd, } #[derive(clap::Subcommand)] enum NeonLocalCmd { Init(InitCmdArgs), #[command(subcommand)] Tenant(TenantCmd), #[command(subcommand)] Timeline(TimelineCmd), #[command(subcommand)] Pageserver(PageserverCmd), #[command(subcommand)] #[clap(alias = "storage_controller")] StorageController(StorageControllerCmd), #[command(subcommand)] #[clap(alias = "storage_broker")] StorageBroker(StorageBrokerCmd), #[command(subcommand)] Safekeeper(SafekeeperCmd), #[command(subcommand)] EndpointStorage(EndpointStorageCmd), #[command(subcommand)] Endpoint(EndpointCmd), #[command(subcommand)] Mappings(MappingsCmd), Start(StartCmdArgs), Stop(StopCmdArgs), } /// Initialize a new Neon repository, preparing configs for services to start with. #[derive(clap::Args)] struct InitCmdArgs { /// How many pageservers to create (default 1). #[clap(long)] num_pageservers: Option, #[clap(long)] config: Option, /// Force initialization even if the repository is not empty. #[clap(long, default_value = "must-not-exist")] #[arg(value_parser)] force: InitForceMode, } /// Start pageserver and safekeepers. 
#[derive(clap::Args)] struct StartCmdArgs { #[clap(long = "start-timeout", default_value = "10s")] timeout: humantime::Duration, } /// Stop pageserver and safekeepers. #[derive(clap::Args)] struct StopCmdArgs { #[arg(value_enum)] #[clap(long, default_value_t = StopMode::Fast)] mode: StopMode, } #[derive(Clone, Copy, clap::ValueEnum)] enum StopMode { Fast, Immediate, } /// Manage tenants. #[derive(clap::Subcommand)] enum TenantCmd { List, Create(TenantCreateCmdArgs), SetDefault(TenantSetDefaultCmdArgs), Config(TenantConfigCmdArgs), Import(TenantImportCmdArgs), } #[derive(clap::Args)] struct TenantCreateCmdArgs { /// Tenant ID, as a 32-byte hexadecimal string. #[clap(long = "tenant-id")] tenant_id: Option, /// Use a specific timeline id when creating a tenant and its initial timeline. #[clap(long)] timeline_id: Option, #[clap(short = 'c')] config: Vec, /// Postgres version to use for the initial timeline. #[arg(default_value = DEFAULT_PG_VERSION_NUM)] #[clap(long)] pg_version: PgMajorVersion, /// Use this tenant in future CLI commands where tenant_id is needed, but not specified. #[clap(long)] set_default: bool, /// Number of shards in the new tenant. #[clap(long)] #[arg(default_value_t = 0)] shard_count: u8, /// Sharding stripe size in pages. #[clap(long)] shard_stripe_size: Option, /// Placement policy shards in this tenant. #[clap(long)] #[arg(value_parser = parse_placement_policy)] placement_policy: Option, } fn parse_placement_policy(s: &str) -> anyhow::Result { Ok(serde_json::from_str::(s)?) } /// Set a particular tenant as default in future CLI commands where tenant_id is needed, but not /// specified. #[derive(clap::Args)] struct TenantSetDefaultCmdArgs { /// Tenant ID, as a 32-byte hexadecimal string. #[clap(long = "tenant-id")] tenant_id: TenantId, } #[derive(clap::Args)] struct TenantConfigCmdArgs { /// Tenant ID, as a 32-byte hexadecimal string. 
#[clap(long = "tenant-id")] tenant_id: Option, #[clap(short = 'c')] config: Vec, } /// Import a tenant that is present in remote storage, and create branches for its timelines. #[derive(clap::Args)] struct TenantImportCmdArgs { /// Tenant ID, as a 32-byte hexadecimal string. #[clap(long = "tenant-id")] tenant_id: TenantId, } /// Manage timelines. #[derive(clap::Subcommand)] enum TimelineCmd { List(TimelineListCmdArgs), Branch(TimelineBranchCmdArgs), Create(TimelineCreateCmdArgs), Import(TimelineImportCmdArgs), } /// List all timelines available to this pageserver. #[derive(clap::Args)] struct TimelineListCmdArgs { /// Tenant ID, as a 32-byte hexadecimal string. #[clap(long = "tenant-id")] tenant_shard_id: Option, } /// Create a new timeline, branching off from another timeline. #[derive(clap::Args)] struct TimelineBranchCmdArgs { /// Tenant ID, as a 32-byte hexadecimal string. #[clap(long = "tenant-id")] tenant_id: Option, /// New timeline's ID, as a 32-byte hexadecimal string. #[clap(long)] timeline_id: Option, /// Human-readable alias for the new timeline. #[clap(long)] branch_name: String, /// Use last Lsn of another timeline (and its data) as base when creating the new timeline. The /// timeline gets resolved by its branch name. #[clap(long)] ancestor_branch_name: Option, /// When using another timeline as base, use a specific Lsn in it instead of the latest one. #[clap(long)] ancestor_start_lsn: Option, } /// Create a new blank timeline. #[derive(clap::Args)] struct TimelineCreateCmdArgs { /// Tenant ID, as a 32-byte hexadecimal string. #[clap(long = "tenant-id")] tenant_id: Option, /// New timeline's ID, as a 32-byte hexadecimal string. #[clap(long)] timeline_id: Option, /// Human-readable alias for the new timeline. #[clap(long)] branch_name: String, /// Postgres version. #[arg(default_value = DEFAULT_PG_VERSION_NUM)] #[clap(long)] pg_version: PgMajorVersion, } /// Import a timeline from a basebackup directory. 
#[derive(clap::Args)] struct TimelineImportCmdArgs { /// Tenant ID, as a 32-byte hexadecimal string. #[clap(long = "tenant-id")] tenant_id: Option, /// New timeline's ID, as a 32-byte hexadecimal string. #[clap(long)] timeline_id: TimelineId, /// Human-readable alias for the new timeline. #[clap(long)] branch_name: String, /// Basebackup tarfile to import. #[clap(long)] base_tarfile: PathBuf, /// LSN the basebackup starts at. #[clap(long)] base_lsn: Lsn, /// WAL to add after base. #[clap(long)] wal_tarfile: Option, /// LSN the basebackup ends at. #[clap(long)] end_lsn: Option, /// Postgres version of the basebackup being imported. #[arg(default_value = DEFAULT_PG_VERSION_NUM)] #[clap(long)] pg_version: PgMajorVersion, } /// Manage pageservers. #[derive(clap::Subcommand)] enum PageserverCmd { Status(PageserverStatusCmdArgs), Start(PageserverStartCmdArgs), Stop(PageserverStopCmdArgs), Restart(PageserverRestartCmdArgs), } /// Show status of a local pageserver. #[derive(clap::Args)] struct PageserverStatusCmdArgs { /// Pageserver ID. #[clap(long = "id")] pageserver_id: Option, } /// Start local pageserver. #[derive(clap::Args)] struct PageserverStartCmdArgs { /// Pageserver ID. #[clap(long = "id")] pageserver_id: Option, /// Timeout until we fail the command. #[clap(short = 't', long)] #[arg(default_value = "10s")] start_timeout: humantime::Duration, } /// Stop local pageserver. #[derive(clap::Args)] struct PageserverStopCmdArgs { /// Pageserver ID. #[clap(long = "id")] pageserver_id: Option, /// If 'immediate', don't flush repository data at shutdown #[clap(short = 'm')] #[arg(value_enum, default_value = "fast")] stop_mode: StopMode, } /// Restart local pageserver. #[derive(clap::Args)] struct PageserverRestartCmdArgs { /// Pageserver ID. #[clap(long = "id")] pageserver_id: Option, /// Timeout until we fail the command. #[clap(short = 't', long)] #[arg(default_value = "10s")] start_timeout: humantime::Duration, } /// Manage storage controller. 
#[derive(clap::Subcommand)]
enum StorageControllerCmd {
    Start(StorageControllerStartCmdArgs),
    Stop(StorageControllerStopCmdArgs),
}

/// Start storage controller.
#[derive(clap::Args)]
struct StorageControllerStartCmdArgs {
    /// Timeout until we fail the command.
    // Parsed as a humantime duration string (e.g. the default "10s").
    #[clap(short = 't', long)]
    #[arg(default_value = "10s")]
    start_timeout: humantime::Duration,
    /// Identifier used to distinguish storage controller instances.
    // Defaults to 1, matching the default of the `stop` subcommand below.
    #[clap(long)]
    #[arg(default_value_t = 1)]
    instance_id: u8,
    /// Base port for the storage controller instance identified by instance-id (defaults to
    /// pageserver cplane api).
    #[clap(long)]
    base_port: Option,

    /// Whether the storage controller should handle pageserver-reported local disk loss events.
    // Optional: leaving the flag out is distinct from passing an explicit value.
    #[clap(long)]
    handle_ps_local_disk_loss: Option,
}

/// Stop storage controller.
#[derive(clap::Args)]
struct StorageControllerStopCmdArgs {
    /// If 'immediate', don't flush repository data at shutdown
    // Short flag only (`-m`); accepts the `StopMode` value names, default "fast".
    #[clap(short = 'm')]
    #[arg(value_enum, default_value = "fast")]
    stop_mode: StopMode,
    /// Identifier used to distinguish storage controller instances.
    #[clap(long)]
    #[arg(default_value_t = 1)]
    instance_id: u8,
}

/// Manage storage broker.
#[derive(clap::Subcommand)]
enum StorageBrokerCmd {
    Start(StorageBrokerStartCmdArgs),
    Stop(StorageBrokerStopCmdArgs),
}

/// Start broker.
#[derive(clap::Args)]
struct StorageBrokerStartCmdArgs {
    /// Timeout until we fail the command.
    // Same flag and default as the other `start` subcommands, declared in a single attribute.
    #[clap(short = 't', long, default_value = "10s")]
    start_timeout: humantime::Duration,
}

/// Stop broker.
#[derive(clap::Args)]
struct StorageBrokerStopCmdArgs {
    /// If 'immediate', don't flush repository data on shutdown.
    // Short flag only (`-m`); accepts the `StopMode` value names, default "fast".
    #[clap(short = 'm')]
    #[arg(value_enum, default_value = "fast")]
    stop_mode: StopMode,
}

/// Manage safekeepers.
#[derive(clap::Subcommand)]
enum SafekeeperCmd {
    Start(SafekeeperStartCmdArgs),
    Stop(SafekeeperStopCmdArgs),
    Restart(SafekeeperRestartCmdArgs),
}

/// Manage object storage.
#[derive(clap::Subcommand)]
enum EndpointStorageCmd {
    Start(EndpointStorageStartCmd),
    Stop(EndpointStorageStopCmd),
}

/// Start object storage.
#[derive(clap::Args)]
struct EndpointStorageStartCmd {
    /// Timeout until we fail the command.
    // Parsed as a humantime duration string (e.g. the default "10s").
    #[clap(short = 't', long)]
    #[arg(default_value = "10s")]
    start_timeout: humantime::Duration,
}

/// Stop object storage.
#[derive(clap::Args)]
struct EndpointStorageStopCmd {
    /// If 'immediate', don't flush repository data on shutdown.
    // Short flag only (`-m`); accepts the `StopMode` value names, default "fast".
    #[clap(short = 'm')]
    #[arg(value_enum, default_value = "fast")]
    stop_mode: StopMode,
}

/// Start local safekeeper.
#[derive(clap::Args)]
struct SafekeeperStartCmdArgs {
    /// Safekeeper ID.
    // No `long`/`short` attribute: this is a positional argument that defaults to NodeId(1).
    #[arg(default_value_t = NodeId(1))]
    id: NodeId,

    /// Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo.
    // Repeatable: every occurrence of `-e` / `--safekeeper-extra-opt` adds one entry.
    #[clap(short = 'e', long = "safekeeper-extra-opt")]
    extra_opt: Vec,

    /// Timeout until we fail the command.
    #[clap(short = 't', long)]
    #[arg(default_value = "10s")]
    start_timeout: humantime::Duration,
}

/// Stop local safekeeper.
#[derive(clap::Args)]
struct SafekeeperStopCmdArgs {
    /// Safekeeper ID.
    // No `long`/`short` attribute: this is a positional argument that defaults to NodeId(1).
    #[arg(default_value_t = NodeId(1))]
    id: NodeId,

    /// If 'immediate', don't flush repository data on shutdown.
    #[arg(value_enum, default_value = "fast")]
    #[clap(short = 'm')]
    stop_mode: StopMode,
}

/// Restart local safekeeper.
// Carries the union of the `stop` options (stop_mode) and the `start` options (extra_opt,
// start_timeout) declared above.
#[derive(clap::Args)]
struct SafekeeperRestartCmdArgs {
    /// Safekeeper ID.
    // No `long`/`short` attribute: this is a positional argument that defaults to NodeId(1).
    #[arg(default_value_t = NodeId(1))]
    id: NodeId,

    /// If 'immediate', don't flush repository data on shutdown.
    #[arg(value_enum, default_value = "fast")]
    #[clap(short = 'm')]
    stop_mode: StopMode,

    /// Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo.
    #[clap(short = 'e', long = "safekeeper-extra-opt")]
    extra_opt: Vec,

    /// Timeout until we fail the command.
    #[clap(short = 't', long)]
    #[arg(default_value = "10s")]
    start_timeout: humantime::Duration,
}

/// Manage Postgres instances.
#[derive(clap::Subcommand)]
enum EndpointCmd {
    List(EndpointListCmdArgs),
    Create(EndpointCreateCmdArgs),
    Start(EndpointStartCmdArgs),
    Reconfigure(EndpointReconfigureCmdArgs),
    RefreshConfiguration(EndpointRefreshConfigurationArgs),
    Stop(EndpointStopCmdArgs),
    UpdatePageservers(EndpointUpdatePageserversCmdArgs),
    GenerateJwt(EndpointGenerateJwtCmdArgs),
}

/// List endpoints.
#[derive(clap::Args)]
struct EndpointListCmdArgs {
    /// Tenant ID, as a 32-byte hexadecimal string.
    // The flag is spelled `--tenant-id`, but the parsed value is stored as `tenant_shard_id` and
    // resolved through `get_tenant_shard_id`.
    #[clap(long = "tenant-id")]
    tenant_shard_id: Option,
}

/// Create a compute endpoint.
#[derive(clap::Args)]
struct EndpointCreateCmdArgs {
    /// Tenant ID, as a 32-byte hexadecimal string.
    #[clap(long = "tenant-id")]
    tenant_id: Option,
    /// Postgres endpoint ID.
    // Positional. When omitted, `handle_endpoint` derives it as "ep-{branch_name}".
    endpoint_id: Option,
    /// Name of the branch the endpoint will run on.
    // When omitted, `handle_endpoint` uses DEFAULT_BRANCH_NAME.
    #[clap(long)]
    branch_name: Option,
    /// Specify LSN on the timeline to start from. By default, end of the timeline would be used.
    // Setting this selects `ComputeMode::Static`; it cannot be combined with `--hot-standby`.
    #[clap(long)]
    lsn: Option,
    #[clap(long)]
    pg_port: Option,
    // Also accepted under the alias `--http-port`.
    #[clap(long, alias = "http-port")]
    external_http_port: Option,
    #[clap(long)]
    internal_http_port: Option,
    #[clap(long = "pageserver-id")]
    endpoint_pageserver_id: Option,
    /// Don't do basebackup, create endpoint directory with only config files.
    // `ArgAction::Set` means the flag takes an explicit value (`--config-only true`).
    #[clap(long, action = clap::ArgAction::Set, default_value_t = false)]
    config_only: bool,
    /// Postgres version.
    #[arg(default_value = DEFAULT_PG_VERSION_NUM)]
    #[clap(long)]
    pg_version: PgMajorVersion,
    /// Use gRPC to communicate with Pageservers, by generating grpc:// connstrings.
    ///
    /// Specified on creation such that it's retained across reconfiguration and restarts.
    ///
    /// NB: not yet supported by computes.
    #[clap(long)]
    grpc: bool,
    /// If set, the node will be a hot replica on the specified timeline.
    // `ArgAction::Set` means the flag takes an explicit value (`--hot-standby true`).
    #[clap(long, action = clap::ArgAction::Set, default_value_t = false)]
    hot_standby: bool,
    /// If set, will set up the catalog for neon_superuser.
    // Passed on inverted: `handle_endpoint` hands `!update_catalog` to `new_endpoint`.
    #[clap(long)]
    update_catalog: bool,
    /// Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but
    /// useful for tests.
    #[clap(long)]
    allow_multiple: bool,
    /// Name of the privileged role for the endpoint.
    // Only allow changing it on creation.
    #[clap(long)]
    privileged_role_name: Option,
}

/// Start Postgres. If the endpoint doesn't exist yet, it is created.
// NOTE(review): the `Start` arm of `handle_endpoint` fails with "endpoint ... not found" for an
// unknown endpoint; it does not create one. Confirm whether the help text above is still accurate.
#[derive(clap::Args)]
struct EndpointStartCmdArgs {
    /// Postgres endpoint ID.
    endpoint_id: String,
    /// Pageserver ID.
    // When omitted, the pageserver location is looked up through the storage controller.
    #[clap(long = "pageserver-id")]
    endpoint_pageserver_id: Option,
    /// Safekeepers membership generation to prefix neon.safekeepers with.
    // When omitted, generation 1 is used if `timelines_onto_safekeepers` is enabled in the config.
    #[clap(long)]
    safekeepers_generation: Option,
    /// List of safekeepers endpoint will talk to.
    // Comma-separated node IDs, parsed by `parse_safekeepers`; defaults to all from the env.
    #[clap(long)]
    safekeepers: Option,
    /// Configure the remote extensions storage proxy gateway URL to request for extensions.
    // Also accepted under the alias `--remote-ext-config`.
    #[clap(long, alias = "remote-ext-config")]
    remote_ext_base_url: Option,
    /// If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`
    #[clap(long)]
    create_test_user: bool,
    /// Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but
    /// useful for tests.
    #[clap(long)]
    allow_multiple: bool,
    /// Timeout until we fail the command.
    // Unlike the other start commands this parses straight into `std::time::Duration` via
    // `humantime::parse_duration`, and defaults to 90s rather than 10s.
    #[clap(short = 't', long, value_parser= humantime::parse_duration)]
    #[arg(default_value = "90s")]
    start_timeout: Duration,

    /// Download LFC cache from endpoint storage on endpoint startup
    #[clap(long, default_value = "false")]
    autoprewarm: bool,

    /// Upload LFC cache to endpoint storage periodically
    #[clap(long)]
    offload_lfc_interval_seconds: Option,

    /// Run in development mode, skipping VM-specific operations like process termination
    #[clap(long, action = clap::ArgAction::SetTrue)]
    dev: bool,
}

/// Reconfigure an endpoint.
#[derive(clap::Args)]
struct EndpointReconfigureCmdArgs {
    /// Tenant id. Represented as a hexadecimal string 32 symbols length
    // NOTE(review): the `Reconfigure` arm of `handle_endpoint` never reads this field; the tenant
    // is taken from the endpoint itself.
    #[clap(long = "tenant-id")]
    tenant_id: Option,
    /// Postgres endpoint ID.
    endpoint_id: String,
    /// Pageserver ID.
    #[clap(long = "pageserver-id")]
    endpoint_pageserver_id: Option,
    // Comma-separated node IDs, parsed by `parse_safekeepers`.
    #[clap(long)]
    safekeepers: Option,
}

/// Refresh the endpoint's configuration by forcing it to reload its spec
#[derive(clap::Args)]
struct EndpointRefreshConfigurationArgs {
    /// Postgres endpoint id
    endpoint_id: String,
}

/// Stop an endpoint.
#[derive(clap::Args)]
struct EndpointStopCmdArgs {
    /// Postgres endpoint ID.
    endpoint_id: String,
    /// Also delete data directory (now optional, should be default in future).
    #[clap(long)]
    destroy: bool,

    /// Postgres shutdown mode, passed to `pg_ctl -m <mode>`.
    #[clap(long)]
    #[clap(default_value = "fast")]
    mode: EndpointTerminateMode,
}

/// Update the pageservers in the spec file of the compute endpoint
#[derive(clap::Args)]
struct EndpointUpdatePageserversCmdArgs {
    /// Postgres endpoint id
    endpoint_id: String,

    /// Specified pageserver id
    // When omitted, the pageserver location is looked up through the storage controller.
    #[clap(short = 'p', long)]
    pageserver_id: Option,
}

/// Generate a JWT for an endpoint.
#[derive(clap::Args)]
struct EndpointGenerateJwtCmdArgs {
    /// Postgres endpoint ID.
    endpoint_id: String,
    /// Scope to generate the JWT with.
    // Parsed from the command line with `ComputeClaimsScope::from_str`.
    #[clap(short = 's', long, value_parser = ComputeClaimsScope::from_str)]
    scope: Option,
}

/// Manage neon_local branch name mappings.
#[derive(clap::Subcommand)]
enum MappingsCmd {
    Map(MappingsMapCmdArgs),
}

/// Create new mapping which cannot exist already.
// All three values are mandatory; there is no default-tenant fallback for this command.
#[derive(clap::Args)]
struct MappingsMapCmdArgs {
    /// Tenant ID, as a 32-byte hexadecimal string.
    #[clap(long)]
    tenant_id: TenantId,
    /// Timeline ID, as a 32-byte hexadecimal string.
    #[clap(long)]
    timeline_id: TimelineId,
    /// Branch name to give to the timeline.
    #[clap(long)]
    branch_name: String,
}

///
/// Timelines tree element used as a value in the HashMap.
///
// Built by `print_timelines_tree` (one element per timeline, keyed by timeline ID) and walked
// recursively by `print_timeline`.
struct TimelineTreeEl {
    /// `TimelineInfo` received from the `pageserver` via the `timeline_list` http API call.
    pub info: TimelineInfo,
    /// Name, recovered from neon config mappings
    pub name: Option,
    /// Holds all direct children of this timeline referenced using `timeline_id`.
    // A BTreeSet, so children are visited in sorted order when the tree is printed.
    pub children: BTreeSet,
}

/// A flock-based guard over the neon_local repository directory
// The lock is tied to the `Flock` value: keeping a `RepoLock` alive keeps the lock held.
struct RepoLock {
    _file: Flock,
}

impl RepoLock {
    // Opens the neon_local base directory and takes an exclusive flock on it.
    // `FlockArg::LockExclusive` is the blocking variant, so this waits for a concurrent holder.
    fn new() -> Result {
        let repo_dir = File::open(local_env::base_path())?;

        match Flock::lock(repo_dir, FlockArg::LockExclusive) {
            Ok(f) => Ok(Self { _file: f }),
            // On failure `Flock::lock` hands back the file together with the errno; only the
            // errno is kept.
            Err((_, e)) => Err(e).context("flock error"),
        }
    }
}

// Main entry point for the 'neon_local' CLI utility
//
// This utility helps to manage neon installation. That includes following:
//   * Management of local postgres installations running on top of the
//     pageserver.
//   * Providing CLI api to the pageserver
//   * TODO: export/import to/from usual postgres
//
// Flow: parse the command line, run the selected handler, and persist the config if (and only
// if) the handler changed the in-memory `LocalEnv`. A failing handler prints the error and
// exits with status 1.
fn main() -> Result<()> {
    let cli = Cli::parse();

    // Check for 'neon init' command first.
    // `init` creates the environment, so it runs without the repository lock and without loading
    // an existing config. `_lock` only exists to keep the guard alive until the end of `main`.
    let (subcommand_result, _lock) = if let NeonLocalCmd::Init(args) = cli.command {
        (handle_init(&args).map(|env| Some(Cow::Owned(env))), None)
    } else {
        // This tool uses a collection of simple files to store its state, and consequently
        // it is not generally safe to run multiple commands concurrently.  Rather than expect
        // all callers to know this, use a lock file to protect against concurrent execution.
        // NOTE(review): `unwrap()` turns a locking failure (e.g. missing base directory) into a
        // panic instead of a regular error; consider `.context(..)?`.
        let _repo_lock = RepoLock::new().unwrap();

        // all other commands need an existing config
        let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
        // Snapshot taken before any handler runs, used below to detect modifications.
        let original_env = env.clone();
        // Leaking yields a `&'static mut LocalEnv` that can be lent to every handler; the
        // allocation is never freed, which is harmless for a short-lived CLI process.
        let env = Box::leak(Box::new(env));
        // Single-threaded runtime; async handlers are driven to completion with `block_on`.
        let rt = tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
            .unwrap();

        let subcommand_result = match cli.command {
            NeonLocalCmd::Init(_) => unreachable!("init was handled earlier already"),
            NeonLocalCmd::Start(args) => rt.block_on(handle_start_all(&args, env)),
            NeonLocalCmd::Stop(args) => rt.block_on(handle_stop_all(&args, env)),
            NeonLocalCmd::Tenant(subcmd) => rt.block_on(handle_tenant(&subcmd, env)),
            NeonLocalCmd::Timeline(subcmd) => rt.block_on(handle_timeline(&subcmd, env)),
            NeonLocalCmd::Pageserver(subcmd) => rt.block_on(handle_pageserver(&subcmd, env)),
            NeonLocalCmd::StorageController(subcmd) => {
                rt.block_on(handle_storage_controller(&subcmd, env))
            }
            NeonLocalCmd::StorageBroker(subcmd) => rt.block_on(handle_storage_broker(&subcmd, env)),
            NeonLocalCmd::Safekeeper(subcmd) => rt.block_on(handle_safekeeper(&subcmd, env)),
            NeonLocalCmd::EndpointStorage(subcmd) => {
                rt.block_on(handle_endpoint_storage(&subcmd, env))
            }
            NeonLocalCmd::Endpoint(subcmd) => rt.block_on(handle_endpoint(&subcmd, env)),
            // The only synchronous handler.
            NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env),
        };

        // Only hand the env on for persisting when the handler actually changed it.
        let subcommand_result = if &original_env != env {
            subcommand_result.map(|()| Some(Cow::Borrowed(env)))
        } else {
            subcommand_result.map(|()| None)
        };
        (subcommand_result, Some(_repo_lock))
    };

    match subcommand_result {
        Ok(Some(updated_env)) => updated_env.persist_config()?,
        Ok(None) => (),
        Err(e) => {
            // `{e:?}` prints the error in its Debug form.
            eprintln!("command failed: {e:?}");
            exit(1);
        }
    }
    Ok(())
}

///
/// Prints timelines list as a tree-like structure.
/// fn print_timelines_tree( timelines: Vec, mut timeline_name_mappings: HashMap, ) -> Result<()> { let mut timelines_hash = timelines .iter() .map(|t| { ( t.timeline_id, TimelineTreeEl { info: t.clone(), children: BTreeSet::new(), name: timeline_name_mappings .remove(&TenantTimelineId::new(t.tenant_id.tenant_id, t.timeline_id)), }, ) }) .collect::>(); // Memorize all direct children of each timeline. for timeline in timelines.iter() { if let Some(ancestor_timeline_id) = timeline.ancestor_timeline_id { timelines_hash .get_mut(&ancestor_timeline_id) .context("missing timeline info in the HashMap")? .children .insert(timeline.timeline_id); } } for timeline in timelines_hash.values() { // Start with root local timelines (no ancestors) first. if timeline.info.ancestor_timeline_id.is_none() { print_timeline(0, &Vec::from([true]), timeline, &timelines_hash)?; } } Ok(()) } /// /// Recursively prints timeline info with all its children. /// fn print_timeline( nesting_level: usize, is_last: &[bool], timeline: &TimelineTreeEl, timelines: &HashMap, ) -> Result<()> { if nesting_level > 0 { let ancestor_lsn = match timeline.info.ancestor_lsn { Some(lsn) => lsn.to_string(), None => "Unknown Lsn".to_string(), }; let mut br_sym = "┣━"; // Draw each nesting padding with proper style // depending on whether its timeline ended or not. 
if nesting_level > 1 { for l in &is_last[1..is_last.len() - 1] { if *l { print!(" "); } else { print!("┃ "); } } } // We are the last in this sub-timeline if *is_last.last().unwrap() { br_sym = "┗━"; } print!("{br_sym} @{ancestor_lsn}: "); } // Finally print a timeline id and name with new line println!( "{} [{}]", timeline.name.as_deref().unwrap_or("_no_name_"), timeline.info.timeline_id ); let len = timeline.children.len(); let mut i: usize = 0; let mut is_last_new = Vec::from(is_last); is_last_new.push(false); for child in &timeline.children { i += 1; // Mark that the last padding is the end of the timeline if i == len { if let Some(last) = is_last_new.last_mut() { *last = true; } } print_timeline( nesting_level + 1, &is_last_new, timelines .get(child) .context("missing timeline info in the HashMap")?, timelines, )?; } Ok(()) } /// Helper function to get tenant id from an optional --tenant_id option or from the config file fn get_tenant_id( tenant_id_arg: Option, env: &local_env::LocalEnv, ) -> anyhow::Result { if let Some(tenant_id_from_arguments) = tenant_id_arg { Ok(tenant_id_from_arguments) } else if let Some(default_id) = env.default_tenant_id { Ok(default_id) } else { anyhow::bail!("No tenant id. Use --tenant-id, or set a default tenant"); } } /// Helper function to get tenant-shard ID from an optional --tenant_id option or from the config file, /// for commands that accept a shard suffix fn get_tenant_shard_id( tenant_shard_id_arg: Option, env: &local_env::LocalEnv, ) -> anyhow::Result { if let Some(tenant_id_from_arguments) = tenant_shard_id_arg { Ok(tenant_id_from_arguments) } else if let Some(default_id) = env.default_tenant_id { Ok(TenantShardId::unsharded(default_id)) } else { anyhow::bail!("No tenant shard id. Use --tenant-id, or set a default tenant"); } } fn handle_init(args: &InitCmdArgs) -> anyhow::Result { // Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`. 
let init_conf: NeonLocalInitConf = if let Some(config_path) = &args.config { // User (likely the Python test suite) provided a description of the environment. if args.num_pageservers.is_some() { bail!( "Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead" ); } // load and parse the file let contents = std::fs::read_to_string(config_path).with_context(|| { format!( "Could not read configuration file '{}'", config_path.display() ) })?; toml_edit::de::from_str(&contents)? } else { // User (likely interactive) did not provide a description of the environment, give them the default NeonLocalInitConf { control_plane_api: Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap()), broker: NeonBroker { listen_addr: Some(DEFAULT_BROKER_ADDR.parse().unwrap()), listen_https_addr: None, }, safekeepers: vec![SafekeeperConf { id: DEFAULT_SAFEKEEPER_ID, pg_port: DEFAULT_SAFEKEEPER_PG_PORT, http_port: DEFAULT_SAFEKEEPER_HTTP_PORT, ..Default::default() }], pageservers: (0..args.num_pageservers.unwrap_or(1)) .map(|i| { let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64); let pg_port = DEFAULT_PAGESERVER_PG_PORT + i; let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i; let grpc_port = DEFAULT_PAGESERVER_GRPC_PORT + i; NeonLocalInitPageserverConf { id: pageserver_id, listen_pg_addr: format!("127.0.0.1:{pg_port}"), listen_http_addr: format!("127.0.0.1:{http_port}"), listen_https_addr: None, listen_grpc_addr: Some(format!("127.0.0.1:{grpc_port}")), pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, grpc_auth_type: AuthType::Trust, other: Default::default(), // Typical developer machines use disks with slow fsync, and we don't care // about data integrity: disable disk syncs. 
no_sync: true, } }) .collect(), endpoint_storage: EndpointStorageConf { listen_addr: ENDPOINT_STORAGE_DEFAULT_ADDR, }, pg_distrib_dir: None, neon_distrib_dir: None, default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)), storage_controller: None, control_plane_hooks_api: None, generate_local_ssl_certs: false, } }; LocalEnv::init(init_conf, &args.force) .context("materialize initial neon_local environment on disk")?; Ok(LocalEnv::load_config(&local_env::base_path()) .expect("freshly written config should be loadable")) } /// The default pageserver is the one where CLI tenant/timeline operations are sent by default. /// For typical interactive use, one would just run with a single pageserver. Scenarios with /// tenant/timeline placement across multiple pageservers are managed by python test code rather /// than this CLI. fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode { let ps_conf = env .pageservers .first() .expect("Config is validated to contain at least one pageserver"); PageServerNode::from_env(env, ps_conf) } async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> anyhow::Result<()> { let pageserver = get_default_pageserver(env); match subcmd { TenantCmd::List => { for t in pageserver.tenant_list().await? 
{ println!("{} {:?}", t.id, t.state); } } TenantCmd::Import(args) => { let tenant_id = args.tenant_id; let storage_controller = StorageController::from_env(env); let create_response = storage_controller.tenant_import(tenant_id).await?; let shard_zero = create_response .shards .first() .expect("Import response omitted shards"); let attached_pageserver_id = shard_zero.node_id; let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?); println!( "Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}" ); let timelines = pageserver .http_client .list_timelines(shard_zero.shard_id) .await?; // Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names let main_timeline = timelines .iter() .find(|t| t.ancestor_timeline_id.is_none()) .expect("No timelines found") .timeline_id; let mut branch_i = 0; for timeline in timelines.iter() { let branch_name = if timeline.timeline_id == main_timeline { "main".to_string() } else { branch_i += 1; format!("branch_{branch_i}") }; println!( "Importing timeline {tenant_id}/{} as branch {branch_name}", timeline.timeline_id ); env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?; } } TenantCmd::Create(args) => { let tenant_conf: HashMap<_, _> = args.config.iter().flat_map(|c| c.split_once(':')).collect(); let tenant_conf = PageServerNode::parse_config(tenant_conf)?; // If tenant ID was not specified, generate one let tenant_id = args.tenant_id.unwrap_or_else(TenantId::generate); // We must register the tenant with the storage controller, so // that when the pageserver restarts, it will be re-attached. 
let storage_controller = StorageController::from_env(env); storage_controller .tenant_create(TenantCreateRequest { // Note that ::unsharded here isn't actually because the tenant is unsharded, its because the // storage controller expects a shard-naive tenant_id in this attribute, and the TenantCreateRequest // type is used both in the storage controller (for creating tenants) and in the pageserver (for // creating shards) new_tenant_id: TenantShardId::unsharded(tenant_id), generation: None, shard_parameters: ShardParameters { count: ShardCount::new(args.shard_count), stripe_size: args .shard_stripe_size .map(ShardStripeSize) .unwrap_or(DEFAULT_STRIPE_SIZE), }, placement_policy: args.placement_policy.clone(), config: tenant_conf, }) .await?; println!("tenant {tenant_id} successfully created on the pageserver"); // Create an initial timeline for the new tenant let new_timeline_id = args.timeline_id.unwrap_or(TimelineId::generate()); // FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have // different shards picking different start lsns. Maybe we have to teach storage controller // to let shard 0 branch first and then propagate the chosen LSN to other shards. 
storage_controller .tenant_timeline_create( tenant_id, TimelineCreateRequest { new_timeline_id, mode: pageserver_api::models::TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id: None, pg_version: Some(args.pg_version), }, }, ) .await?; env.register_branch_mapping( DEFAULT_BRANCH_NAME.to_string(), tenant_id, new_timeline_id, )?; println!("Created an initial timeline '{new_timeline_id}' for tenant: {tenant_id}",); if args.set_default { println!("Setting tenant {tenant_id} as a default one"); env.default_tenant_id = Some(tenant_id); } } TenantCmd::SetDefault(args) => { println!("Setting tenant {} as a default one", args.tenant_id); env.default_tenant_id = Some(args.tenant_id); } TenantCmd::Config(args) => { let tenant_id = get_tenant_id(args.tenant_id, env)?; let tenant_conf: HashMap<_, _> = args.config.iter().flat_map(|c| c.split_once(':')).collect(); let config = PageServerNode::parse_config(tenant_conf)?; let req = TenantConfigRequest { tenant_id, config }; let storage_controller = StorageController::from_env(env); storage_controller .set_tenant_config(&req) .await .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; println!("tenant {tenant_id} successfully configured via storcon"); } } Ok(()) } async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Result<()> { let pageserver = get_default_pageserver(env); match cmd { TimelineCmd::List(args) => { // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller // where shard 0 is attached, and query there. 
let tenant_shard_id = get_tenant_shard_id(args.tenant_shard_id, env)?; let timelines = pageserver.timeline_list(&tenant_shard_id).await?; print_timelines_tree(timelines, env.timeline_name_mappings())?; } TimelineCmd::Create(args) => { let tenant_id = get_tenant_id(args.tenant_id, env)?; let new_branch_name = &args.branch_name; let new_timeline_id_opt = args.timeline_id; let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate()); let storage_controller = StorageController::from_env(env); let create_req = TimelineCreateRequest { new_timeline_id, mode: pageserver_api::models::TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id: None, pg_version: Some(args.pg_version), }, }; let timeline_info = storage_controller .tenant_timeline_create(tenant_id, create_req) .await?; let last_record_lsn = timeline_info.last_record_lsn; env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; println!( "Created timeline '{}' at Lsn {last_record_lsn} for tenant: {tenant_id}", timeline_info.timeline_id ); } // TODO: rename to import-basebackup-plus-wal TimelineCmd::Import(args) => { let tenant_id = get_tenant_id(args.tenant_id, env)?; let timeline_id = args.timeline_id; let branch_name = &args.branch_name; // Parse base inputs let base = (args.base_lsn, args.base_tarfile.clone()); // Parse pg_wal inputs let wal_tarfile = args.wal_tarfile.clone(); let end_lsn = args.end_lsn; // TODO validate both or none are provided let pg_wal = end_lsn.zip(wal_tarfile); println!("Importing timeline into pageserver ..."); pageserver .timeline_import(tenant_id, timeline_id, base, pg_wal, args.pg_version) .await?; if env.storage_controller.timelines_onto_safekeepers { println!("Creating timeline on safekeeper ..."); let timeline_info = pageserver .timeline_info( TenantShardId::unsharded(tenant_id), timeline_id, pageserver_client::mgmt_api::ForceAwaitLogicalSize::No, ) .await?; let default_sk = SafekeeperNode::from_env(env, 
env.safekeepers.first().unwrap()); let default_host = default_sk .conf .listen_addr .clone() .unwrap_or_else(|| "localhost".to_string()); let mconf = safekeeper_api::membership::Configuration { generation: SafekeeperGeneration::new(1), members: safekeeper_api::membership::MemberSet { m: vec![SafekeeperId { host: default_host, id: default_sk.conf.id, pg_port: default_sk.conf.pg_port, }], }, new_members: None, }; let pg_version = PgVersionId::from(args.pg_version); let req = safekeeper_api::models::TimelineCreateRequest { tenant_id, timeline_id, mconf, pg_version, system_id: None, wal_seg_size: None, start_lsn: timeline_info.last_record_lsn, commit_lsn: None, }; default_sk.create_timeline(&req).await?; } env.register_branch_mapping(branch_name.to_string(), tenant_id, timeline_id)?; println!("Done"); } TimelineCmd::Branch(args) => { let tenant_id = get_tenant_id(args.tenant_id, env)?; let new_timeline_id = args.timeline_id.unwrap_or(TimelineId::generate()); let new_branch_name = &args.branch_name; let ancestor_branch_name = args .ancestor_branch_name .clone() .unwrap_or(DEFAULT_BRANCH_NAME.to_owned()); let ancestor_timeline_id = env .get_branch_timeline_id(&ancestor_branch_name, tenant_id) .ok_or_else(|| { anyhow!("Found no timeline id for branch name '{ancestor_branch_name}'") })?; let start_lsn = args.ancestor_start_lsn; let storage_controller = StorageController::from_env(env); let create_req = TimelineCreateRequest { new_timeline_id, mode: pageserver_api::models::TimelineCreateRequestMode::Branch { ancestor_timeline_id, ancestor_start_lsn: start_lsn, read_only: false, pg_version: None, }, }; let timeline_info = storage_controller .tenant_timeline_create(tenant_id, create_req) .await?; let last_record_lsn = timeline_info.last_record_lsn; env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; println!( "Created timeline '{}' at Lsn {last_record_lsn} for tenant: {tenant_id}. 
Ancestor timeline: '{ancestor_branch_name}'", timeline_info.timeline_id ); } } Ok(()) } async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Result<()> { let mut cplane = ComputeControlPlane::load(env.clone())?; match subcmd { EndpointCmd::List(args) => { // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller // where shard 0 is attached, and query there. let tenant_shard_id = get_tenant_shard_id(args.tenant_shard_id, env)?; let timeline_name_mappings = env.timeline_name_mappings(); let mut table = comfy_table::Table::new(); table.load_preset(comfy_table::presets::NOTHING); table.set_header([ "ENDPOINT", "ADDRESS", "TIMELINE", "BRANCH NAME", "LSN", "STATUS", ]); for (endpoint_id, endpoint) in cplane .endpoints .iter() .filter(|(_, endpoint)| endpoint.tenant_id == tenant_shard_id.tenant_id) { let lsn_str = match endpoint.mode { ComputeMode::Static(lsn) => { // -> read-only endpoint // Use the node's LSN. lsn.to_string() } _ => { // As the LSN here refers to the one that the compute is started with, // we display nothing as it is a primary/hot standby compute. 
"---".to_string() } }; let branch_name = timeline_name_mappings .get(&TenantTimelineId::new( tenant_shard_id.tenant_id, endpoint.timeline_id, )) .map(|name| name.as_str()) .unwrap_or("?"); table.add_row([ endpoint_id.as_str(), &endpoint.pg_address.to_string(), &endpoint.timeline_id.to_string(), branch_name, lsn_str.as_str(), &format!("{}", endpoint.status()), ]); } println!("{table}"); } EndpointCmd::Create(args) => { let tenant_id = get_tenant_id(args.tenant_id, env)?; let branch_name = args .branch_name .clone() .unwrap_or(DEFAULT_BRANCH_NAME.to_owned()); let endpoint_id = args .endpoint_id .clone() .unwrap_or_else(|| format!("ep-{branch_name}")); let timeline_id = env .get_branch_timeline_id(&branch_name, tenant_id) .ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?; let mode = match (args.lsn, args.hot_standby) { (Some(lsn), false) => ComputeMode::Static(lsn), (None, true) => ComputeMode::Replica, (None, false) => ComputeMode::Primary, (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), }; match (mode, args.hot_standby) { (ComputeMode::Static(_), true) => { bail!( "Cannot start a node in hot standby mode when it is already configured as a static replica" ) } (ComputeMode::Primary, true) => { bail!( "Cannot start a node as a hot standby replica, it is already configured as primary node" ) } _ => {} } if !args.allow_multiple { cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; } cplane.new_endpoint( &endpoint_id, tenant_id, timeline_id, args.pg_port, args.external_http_port, args.internal_http_port, args.pg_version, mode, args.grpc, !args.update_catalog, false, args.privileged_role_name.clone(), )?; } EndpointCmd::Start(args) => { let endpoint_id = &args.endpoint_id; let pageserver_id = args.endpoint_pageserver_id; let remote_ext_base_url = &args.remote_ext_base_url; let default_generation = env .storage_controller .timelines_onto_safekeepers .then_some(1); let safekeepers_generation = args 
.safekeepers_generation .or(default_generation) .map(SafekeeperGeneration::new); // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? { safekeepers } else { env.safekeepers.iter().map(|sk| sk.id).collect() }; let endpoint = cplane .endpoints .get(endpoint_id.as_str()) .ok_or_else(|| anyhow!("endpoint {endpoint_id} not found"))?; if !args.allow_multiple { cplane.check_conflicting_endpoints( endpoint.mode, endpoint.tenant_id, endpoint.timeline_id, )?; } let prefer_protocol = if endpoint.grpc { PageserverProtocol::Grpc } else { PageserverProtocol::Libpq }; let mut pageserver_conninfo = if let Some(ps_id) = pageserver_id { let conf = env.get_pageserver_conf(ps_id).unwrap(); local_pageserver_conf_to_conn_info(conf)? } else { // Look up the currently attached location of the tenant, and its striping metadata, // to pass these on to postgres. let storage_controller = StorageController::from_env(env); let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?; assert!(!locate_result.shards.is_empty()); // Initialize LSN leases for static computes. if let ComputeMode::Static(lsn) = endpoint.mode { futures::future::try_join_all(locate_result.shards.iter().map( |shard| async move { let conf = env.get_pageserver_conf(shard.node_id).unwrap(); let pageserver = PageServerNode::from_env(env, conf); pageserver .http_client .timeline_init_lsn_lease(shard.shard_id, endpoint.timeline_id, lsn) .await }, )) .await?; } tenant_locate_response_to_conn_info(&locate_result)? }; pageserver_conninfo.prefer_protocol = prefer_protocol; let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?; let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) { let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant); Some(env.generate_auth_token(&claims)?) 
} else { None }; let exp = (std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)? + Duration::from_secs(86400)) .as_secs(); let claims = endpoint_storage::claims::EndpointStorageClaims { tenant_id: endpoint.tenant_id, timeline_id: endpoint.timeline_id, endpoint_id: endpoint_id.to_string(), exp, }; let endpoint_storage_token = env.generate_auth_token(&claims)?; let endpoint_storage_addr = env.endpoint_storage.listen_addr.to_string(); let args = control_plane::endpoint::EndpointStartArgs { auth_token, endpoint_storage_token, endpoint_storage_addr, safekeepers_generation, safekeepers, pageserver_conninfo, remote_ext_base_url: remote_ext_base_url.clone(), create_test_user: args.create_test_user, start_timeout: args.start_timeout, autoprewarm: args.autoprewarm, offload_lfc_interval_seconds: args.offload_lfc_interval_seconds, dev: args.dev, }; println!("Starting existing endpoint {endpoint_id}..."); endpoint.start(args).await?; } EndpointCmd::UpdatePageservers(args) => { let endpoint_id = &args.endpoint_id; let endpoint = cplane .endpoints .get(endpoint_id.as_str()) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; let prefer_protocol = if endpoint.grpc { PageserverProtocol::Grpc } else { PageserverProtocol::Libpq }; let mut pageserver_conninfo = match args.pageserver_id { Some(pageserver_id) => { let conf = env.get_pageserver_conf(pageserver_id)?; local_pageserver_conf_to_conn_info(conf)? } None => { let storage_controller = StorageController::from_env(env); let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?; tenant_locate_response_to_conn_info(&locate_result)? 
} }; pageserver_conninfo.prefer_protocol = prefer_protocol; endpoint .update_pageservers_in_config(&pageserver_conninfo) .await?; } EndpointCmd::Reconfigure(args) => { let endpoint_id = &args.endpoint_id; let endpoint = cplane .endpoints .get(endpoint_id.as_str()) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; let prefer_protocol = if endpoint.grpc { PageserverProtocol::Grpc } else { PageserverProtocol::Libpq }; let mut pageserver_conninfo = if let Some(ps_id) = args.endpoint_pageserver_id { let conf = env.get_pageserver_conf(ps_id)?; local_pageserver_conf_to_conn_info(conf)? } else { // Look up the currently attached location of the tenant, and its striping metadata, // to pass these on to postgres. let storage_controller = StorageController::from_env(env); let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?; tenant_locate_response_to_conn_info(&locate_result)? }; pageserver_conninfo.prefer_protocol = prefer_protocol; // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. 
let safekeepers = parse_safekeepers(&args.safekeepers)?; endpoint .reconfigure(Some(&pageserver_conninfo), safekeepers, None) .await?; } EndpointCmd::RefreshConfiguration(args) => { let endpoint_id = &args.endpoint_id; let endpoint = cplane .endpoints .get(endpoint_id.as_str()) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; endpoint.refresh_configuration().await?; } EndpointCmd::Stop(args) => { let endpoint_id = &args.endpoint_id; let endpoint = cplane .endpoints .get(endpoint_id) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; match endpoint.stop(args.mode, args.destroy).await?.lsn { Some(lsn) => println!("{lsn}"), None => println!("null"), } } EndpointCmd::GenerateJwt(args) => { let endpoint = { let endpoint_id = &args.endpoint_id; cplane .endpoints .get(endpoint_id) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))? }; let jwt = endpoint.generate_jwt(args.scope)?; print!("{jwt}"); } } Ok(()) } /// Parse --safekeepers as list of safekeeper ids. 
fn parse_safekeepers(safekeepers_str: &Option) -> Result>> { if let Some(safekeepers_str) = safekeepers_str { let mut safekeepers: Vec = Vec::new(); for sk_id in safekeepers_str.split(',').map(str::trim) { let sk_id = NodeId( u64::from_str(sk_id) .map_err(|_| anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list"))?, ); safekeepers.push(sk_id); } Ok(Some(safekeepers)) } else { Ok(None) } } fn handle_mappings(subcmd: &MappingsCmd, env: &mut local_env::LocalEnv) -> Result<()> { match subcmd { MappingsCmd::Map(args) => { env.register_branch_mapping( args.branch_name.to_owned(), args.tenant_id, args.timeline_id, )?; Ok(()) } } } fn get_pageserver( env: &local_env::LocalEnv, pageserver_id_arg: Option, ) -> Result { let node_id = pageserver_id_arg.unwrap_or(DEFAULT_PAGESERVER_ID); Ok(PageServerNode::from_env( env, env.get_pageserver_conf(node_id)?, )) } async fn handle_pageserver(subcmd: &PageserverCmd, env: &local_env::LocalEnv) -> Result<()> { match subcmd { PageserverCmd::Start(args) => { if let Err(e) = get_pageserver(env, args.pageserver_id)? .start(&args.start_timeout) .await { eprintln!("pageserver start failed: {e}"); exit(1); } } PageserverCmd::Stop(args) => { let immediate = match args.stop_mode { StopMode::Fast => false, StopMode::Immediate => true, }; if let Err(e) = get_pageserver(env, args.pageserver_id)?.stop(immediate) { eprintln!("pageserver stop failed: {e}"); exit(1); } } PageserverCmd::Restart(args) => { let pageserver = get_pageserver(env, args.pageserver_id)?; //TODO what shutdown strategy should we use here? if let Err(e) = pageserver.stop(false) { eprintln!("pageserver stop failed: {e}"); exit(1); } if let Err(e) = pageserver.start(&args.start_timeout).await { eprintln!("pageserver start failed: {e}"); exit(1); } } PageserverCmd::Status(args) => { match get_pageserver(env, args.pageserver_id)? 
.check_status() .await { Ok(_) => println!("Page server is up and running"), Err(err) => { eprintln!("Page server is not available: {err}"); exit(1); } } } } Ok(()) } async fn handle_storage_controller( subcmd: &StorageControllerCmd, env: &local_env::LocalEnv, ) -> Result<()> { let svc = StorageController::from_env(env); match subcmd { StorageControllerCmd::Start(args) => { let start_args = NeonStorageControllerStartArgs { instance_id: args.instance_id, base_port: args.base_port, start_timeout: args.start_timeout, handle_ps_local_disk_loss: args.handle_ps_local_disk_loss, }; if let Err(e) = svc.start(start_args).await { eprintln!("start failed: {e}"); exit(1); } } StorageControllerCmd::Stop(args) => { let stop_args = NeonStorageControllerStopArgs { instance_id: args.instance_id, immediate: match args.stop_mode { StopMode::Fast => false, StopMode::Immediate => true, }, }; if let Err(e) = svc.stop(stop_args).await { eprintln!("stop failed: {e}"); exit(1); } } } Ok(()) } fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result { if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) { Ok(SafekeeperNode::from_env(env, node)) } else { bail!("could not find safekeeper {id}") } } async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) -> Result<()> { match subcmd { SafekeeperCmd::Start(args) => { let safekeeper = get_safekeeper(env, args.id)?; if let Err(e) = safekeeper.start(&args.extra_opt, &args.start_timeout).await { eprintln!("safekeeper start failed: {e}"); exit(1); } } SafekeeperCmd::Stop(args) => { let safekeeper = get_safekeeper(env, args.id)?; let immediate = match args.stop_mode { StopMode::Fast => false, StopMode::Immediate => true, }; if let Err(e) = safekeeper.stop(immediate) { eprintln!("safekeeper stop failed: {e}"); exit(1); } } SafekeeperCmd::Restart(args) => { let safekeeper = get_safekeeper(env, args.id)?; let immediate = match args.stop_mode { StopMode::Fast => false, StopMode::Immediate => true, }; if 
let Err(e) = safekeeper.stop(immediate) { eprintln!("safekeeper stop failed: {e}"); exit(1); } if let Err(e) = safekeeper.start(&args.extra_opt, &args.start_timeout).await { eprintln!("safekeeper start failed: {e}"); exit(1); } } } Ok(()) } async fn handle_endpoint_storage( subcmd: &EndpointStorageCmd, env: &local_env::LocalEnv, ) -> Result<()> { use EndpointStorageCmd::*; let storage = EndpointStorage::from_env(env); // In tests like test_forward_compatibility or test_graceful_cluster_restart // old neon binaries (without endpoint_storage) are present if !storage.bin.exists() { eprintln!( "{} binary not found. Ignore if this is a compatibility test", storage.bin ); return Ok(()); } match subcmd { Start(EndpointStorageStartCmd { start_timeout }) => { if let Err(e) = storage.start(start_timeout).await { eprintln!("endpoint_storage start failed: {e}"); exit(1); } } Stop(EndpointStorageStopCmd { stop_mode }) => { let immediate = match stop_mode { StopMode::Fast => false, StopMode::Immediate => true, }; if let Err(e) = storage.stop(immediate) { eprintln!("proxy stop failed: {e}"); exit(1); } } }; Ok(()) } async fn handle_storage_broker(subcmd: &StorageBrokerCmd, env: &local_env::LocalEnv) -> Result<()> { match subcmd { StorageBrokerCmd::Start(args) => { let storage_broker = StorageBroker::from_env(env); if let Err(e) = storage_broker.start(&args.start_timeout).await { eprintln!("broker start failed: {e}"); exit(1); } } StorageBrokerCmd::Stop(_args) => { // FIXME: stop_mode unused let storage_broker = StorageBroker::from_env(env); if let Err(e) = storage_broker.stop() { eprintln!("broker stop failed: {e}"); exit(1); } } } Ok(()) } async fn handle_start_all( args: &StartCmdArgs, env: &'static local_env::LocalEnv, ) -> anyhow::Result<()> { // FIXME: this was called "retry_timeout", is it right? 
let Err(errors) = handle_start_all_impl(env, args.timeout).await else { neon_start_status_check(env, args.timeout.as_ref()) .await .context("status check after successful startup of all services")?; return Ok(()); }; eprintln!("startup failed because one or more services could not be started"); for e in errors { eprintln!("{e}"); let debug_repr = format!("{e:?}"); for line in debug_repr.lines() { eprintln!(" {line}"); } } try_stop_all(env, true).await; exit(2); } /// Returns Ok() if and only if all services could be started successfully. /// Otherwise, returns the list of errors that occurred during startup. async fn handle_start_all_impl( env: &'static local_env::LocalEnv, retry_timeout: humantime::Duration, ) -> Result<(), Vec> { // Endpoints are not started automatically let mut js = JoinSet::new(); // force infalliblity through closure #[allow(clippy::redundant_closure_call)] (|| { js.spawn(async move { let storage_broker = StorageBroker::from_env(env); storage_broker .start(&retry_timeout) .await .map_err(|e| e.context("start storage_broker")) }); js.spawn(async move { let storage_controller = StorageController::from_env(env); storage_controller .start(NeonStorageControllerStartArgs::with_default_instance_id( retry_timeout, )) .await .map_err(|e| e.context("start storage_controller")) }); for ps_conf in &env.pageservers { js.spawn(async move { let pageserver = PageServerNode::from_env(env, ps_conf); pageserver .start(&retry_timeout) .await .map_err(|e| e.context(format!("start pageserver {}", ps_conf.id))) }); } for node in env.safekeepers.iter() { js.spawn(async move { let safekeeper = SafekeeperNode::from_env(env, node); safekeeper .start(&[], &retry_timeout) .await .map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id))) }); } js.spawn(async move { EndpointStorage::from_env(env) .start(&retry_timeout) .await .map_err(|e| e.context("start endpoint_storage")) }); })(); let mut errors = Vec::new(); while let Some(result) = js.join_next().await { 
let result = result.expect("we don't panic or cancel the tasks"); if let Err(e) = result { errors.push(e); } } if !errors.is_empty() { return Err(errors); } Ok(()) } async fn neon_start_status_check( env: &local_env::LocalEnv, retry_timeout: &Duration, ) -> anyhow::Result<()> { const RETRY_INTERVAL: Duration = Duration::from_millis(100); const NOTICE_AFTER_RETRIES: Duration = Duration::from_secs(5); let storcon = StorageController::from_env(env); let retries = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis(); let notice_after_retries = retry_timeout.as_millis() / NOTICE_AFTER_RETRIES.as_millis(); println!("\nRunning neon status check"); for retry in 0..retries { if retry == notice_after_retries { println!("\nNeon status check has not passed yet, continuing to wait") } let mut passed = true; let mut nodes = storcon.node_list().await?; let mut pageservers = env.pageservers.clone(); if nodes.len() != pageservers.len() { continue; } nodes.sort_by_key(|ps| ps.id); pageservers.sort_by_key(|ps| ps.id); for (idx, pageserver) in pageservers.iter().enumerate() { let node = &nodes[idx]; if node.id != pageserver.id { passed = false; break; } if !matches!(node.availability, NodeAvailabilityWrapper::Active) { passed = false; break; } } if passed { println!("\nNeon started and passed status check"); return Ok(()); } tokio::time::sleep(RETRY_INTERVAL).await; } anyhow::bail!("\nNeon passed status check") } async fn handle_stop_all(args: &StopCmdArgs, env: &local_env::LocalEnv) -> Result<()> { let immediate = match args.mode { StopMode::Fast => false, StopMode::Immediate => true, }; try_stop_all(env, immediate).await; Ok(()) } async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { let mode = if immediate { EndpointTerminateMode::Immediate } else { EndpointTerminateMode::Fast }; // Stop all endpoints match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { for (_k, node) in cplane.endpoints { if let Err(e) = node.stop(mode, false).await { eprintln!("postgres 
stop failed: {e:#}"); } } } Err(e) => { eprintln!("postgres stop failed, could not restore control plane data from env: {e:#}") } } let storage = EndpointStorage::from_env(env); if let Err(e) = storage.stop(immediate) { eprintln!("endpoint_storage stop failed: {e:#}"); } for ps_conf in &env.pageservers { let pageserver = PageServerNode::from_env(env, ps_conf); if let Err(e) = pageserver.stop(immediate) { eprintln!("pageserver {} stop failed: {:#}", ps_conf.id, e); } } for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.stop(immediate) { eprintln!("safekeeper {} stop failed: {:#}", safekeeper.id, e); } } let storage_broker = StorageBroker::from_env(env); if let Err(e) = storage_broker.stop() { eprintln!("neon broker stop failed: {e:#}"); } // Stop all storage controller instances. In the most common case there's only one, // but iterate though the base data directory in order to discover the instances. let storcon_instances = env .storage_controller_instances() .await .expect("Must inspect data dir"); for (instance_id, _instance_dir_path) in storcon_instances { let storage_controller = StorageController::from_env(env); let stop_args = NeonStorageControllerStopArgs { instance_id, immediate, }; if let Err(e) = storage_controller.stop(stop_args).await { eprintln!("Storage controller instance {instance_id} stop failed: {e:#}"); } } } ================================================ FILE: control_plane/src/branch_mappings.rs ================================================ //! Branch mappings for convenience use std::collections::HashMap; use std::fs; use std::path::Path; use anyhow::{bail, Context}; use serde::{Deserialize, Serialize}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; /// Keep human-readable aliases in memory (and persist them to config XXX), to hide tenant/timeline hex strings from the user. 
#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)] #[serde(default, deny_unknown_fields)] pub struct BranchMappings { /// Default tenant ID to use with the 'neon_local' command line utility, when /// --tenant_id is not explicitly specified. This comes from the branches. pub default_tenant_id: Option, // A `HashMap>` would be more appropriate here, // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". pub mappings: HashMap>, } impl BranchMappings { pub fn register_branch_mapping( &mut self, branch_name: String, tenant_id: TenantId, timeline_id: TimelineId, ) -> anyhow::Result<()> { let existing_values = self.mappings.entry(branch_name.clone()).or_default(); let existing_ids = existing_values .iter() .find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id); if let Some((_, old_timeline_id)) = existing_ids { if old_timeline_id == &timeline_id { Ok(()) } else { bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}"); } } else { existing_values.push((tenant_id, timeline_id)); Ok(()) } } pub fn get_branch_timeline_id( &self, branch_name: &str, tenant_id: TenantId, ) -> Option { // If it looks like a timeline ID, return it as it is if let Ok(timeline_id) = branch_name.parse::() { return Some(timeline_id); } self.mappings .get(branch_name)? 
.iter() .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id) .map(|&(_, timeline_id)| timeline_id) .map(TimelineId::from) } pub fn timeline_name_mappings(&self) -> HashMap { self.mappings .iter() .flat_map(|(name, tenant_timelines)| { tenant_timelines.iter().map(|&(tenant_id, timeline_id)| { (TenantTimelineId::new(tenant_id, timeline_id), name.clone()) }) }) .collect() } pub fn persist(&self, path: &Path) -> anyhow::Result<()> { let content = &toml::to_string_pretty(self)?; fs::write(path, content).with_context(|| { format!( "Failed to write branch information into path '{}'", path.display() ) }) } pub fn load(path: &Path) -> anyhow::Result { let branches_file_contents = fs::read_to_string(path)?; Ok(toml::from_str(branches_file_contents.as_str())?) } } ================================================ FILE: control_plane/src/broker.rs ================================================ //! Code to manage the storage broker //! //! In the local test environment, the storage broker stores its data directly in //! //! ```text //! .neon/storage_broker //! ``` use std::time::Duration; use anyhow::Context; use camino::Utf8PathBuf; use crate::{background_process, local_env::LocalEnv}; pub struct StorageBroker { env: LocalEnv, } impl StorageBroker { /// Create a new `StorageBroker` instance from the environment. pub fn from_env(env: &LocalEnv) -> Self { Self { env: env.clone() } } pub fn initialize(&self) -> anyhow::Result<()> { if self.env.generate_local_ssl_certs { self.env.generate_ssl_cert( &self.env.storage_broker_data_dir().join("server.crt"), &self.env.storage_broker_data_dir().join("server.key"), )?; } Ok(()) } /// Start the storage broker process. 
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> { let broker = &self.env.broker; println!("Starting neon broker at {}", broker.client_url()); let mut args = Vec::new(); if let Some(addr) = &broker.listen_addr { args.push(format!("--listen-addr={addr}")); } if let Some(addr) = &broker.listen_https_addr { args.push(format!("--listen-https-addr={addr}")); } let client = self.env.create_http_client(); background_process::start_process( "storage_broker", &self.env.storage_broker_data_dir(), &self.env.storage_broker_bin(), args, [], background_process::InitialPidFile::Create(self.pid_file_path()), retry_timeout, || async { let url = broker.client_url(); let status_url = url.join("status").with_context(|| { format!("Failed to append /status path to broker endpoint {url}") })?; let request = client.get(status_url).build().with_context(|| { format!("Failed to construct request to broker endpoint {url}") })?; match client.execute(request).await { Ok(resp) => Ok(resp.status().is_success()), Err(_) => Ok(false), } }, ) .await .context("Failed to spawn storage_broker subprocess")?; Ok(()) } /// Stop the storage broker process. pub fn stop(&self) -> anyhow::Result<()> { background_process::stop_process(true, "storage_broker", &self.pid_file_path()) } /// Get the path to the PID file for the storage broker. fn pid_file_path(&self) -> Utf8PathBuf { Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_broker.pid")) .expect("non-Unicode path") } } ================================================ FILE: control_plane/src/endpoint.rs ================================================ //! Code to manage compute endpoints //! //! In the local test environment, the data for each endpoint is stored in //! //! ```text //! .neon/endpoints/ //! ``` //! //! Some basic information about the endpoint, like the tenant and timeline IDs, //! are stored in the `endpoint.json` file. The `endpoint.json` file is created //! 
//! when the endpoint is created, and doesn't change afterwards.
//!
//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
//! started, we launch `compute_ctl`. It synchronizes the safekeepers, downloads
//! the basebackup from the pageserver to initialize the data directory, and
//! finally launches the PostgreSQL process. It watches the PostgreSQL process
//! until it exits.
//!
//! When an endpoint is created, a `postgresql.conf` file is also created in
//! the endpoint's directory. The file can be modified before starting PostgreSQL.
//! However, the `postgresql.conf` file in the endpoint directory is not used directly
//! by PostgreSQL. It is passed to `compute_ctl`, and `compute_ctl` writes another
//! copy of it in the data directory.
//!
//! Directory contents:
//!
//! ```text
//! .neon/endpoints/main/
//!     compute.log               - log output of `compute_ctl` and `postgres`
//!     endpoint.json             - serialized `EndpointConf` struct
//!     postgresql.conf           - postgresql settings
//!     config.json               - passed to `compute_ctl`
//!     pgdata/
//!         postgresql.conf       - copy of postgresql.conf created by `compute_ctl`
//!         neon.signal
//!         zenith.signal         - copy of neon.signal, for backward compatibility
//!         <other PostgreSQL files>
//! ```
//!
use std::collections::{BTreeMap, HashMap}; use std::fmt::Display; use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream}; use std::path::PathBuf; use std::process::Command; use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant}; use anyhow::{Context, Result, anyhow, bail}; use base64::Engine; use base64::prelude::BASE64_URL_SAFE_NO_PAD; use compute_api::requests::{ COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope, ConfigurationRequest, }; use compute_api::responses::{ ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TerminateResponse, TlsConfig, }; use compute_api::spec::{ Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol, PageserverShardInfo, PgIdent, RemoteExtSpec, Role, }; // re-export these, because they're used in the reconfigure() function pub use compute_api::spec::{PageserverConnectionInfo, PageserverShardConnectionInfo}; use jsonwebtoken::jwk::{ AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations, OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse, }; use nix::sys::signal::{Signal, kill}; use pem::Pem; use reqwest::header::CONTENT_TYPE; use safekeeper_api::PgMajorVersion; use safekeeper_api::membership::SafekeeperGeneration; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; use spki::der::Decode; use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef}; use tracing::debug; use utils::id::{NodeId, TenantId, TimelineId}; use utils::shard::{ShardCount, ShardIndex, ShardNumber}; use pageserver_api::config::DEFAULT_GRPC_LISTEN_PORT as DEFAULT_PAGESERVER_GRPC_PORT; use postgres_connection::parse_host_port; use crate::local_env::LocalEnv; use crate::postgresql_conf::PostgresConf; // contents of a endpoint.json file #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] pub struct EndpointConf { endpoint_id: String, tenant_id: TenantId, timeline_id: TimelineId, mode: ComputeMode, pg_port: u16, 
external_http_port: u16, internal_http_port: u16, pg_version: PgMajorVersion, grpc: bool, skip_pg_catalog_updates: bool, reconfigure_concurrency: usize, drop_subscriptions_before_start: bool, features: Vec, cluster: Option, compute_ctl_config: ComputeCtlConfig, privileged_role_name: Option, } // // ComputeControlPlane // pub struct ComputeControlPlane { base_port: u16, // endpoint ID is the key pub endpoints: BTreeMap>, env: LocalEnv, } impl ComputeControlPlane { // Load current endpoints from the endpoints/ subdirectories pub fn load(env: LocalEnv) -> Result { let mut endpoints = BTreeMap::default(); for endpoint_dir in std::fs::read_dir(env.endpoints_path()) .with_context(|| format!("failed to list {}", env.endpoints_path().display()))? { let ep_res = Endpoint::from_dir_entry(endpoint_dir?, &env); let ep = match ep_res { Ok(ep) => ep, Err(e) => match e.downcast::() { Ok(e) => { // A parallel task could delete an endpoint while we have just scanned the directory if e.kind() == std::io::ErrorKind::NotFound { continue; } else { Err(e)? } } Err(e) => Err(e)?, }, }; endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep)); } Ok(ComputeControlPlane { base_port: 55431, endpoints, env, }) } fn get_port(&mut self) -> u16 { 1 + self .endpoints .values() .map(|ep| std::cmp::max(ep.pg_address.port(), ep.external_http_address.port())) .max() .unwrap_or(self.base_port) } /// Create a JSON Web Key Set. This ideally matches the way we create a JWKS /// from the production control plane. 
fn create_jwks_from_pem(pem: &Pem) -> Result { let spki: SubjectPublicKeyInfoRef = SubjectPublicKeyInfo::from_der(pem.contents())?; let public_key = spki.subject_public_key.raw_bytes(); let mut hasher = Sha256::new(); hasher.update(public_key); let key_hash = hasher.finalize(); Ok(JwkSet { keys: vec![Jwk { common: CommonParameters { public_key_use: Some(PublicKeyUse::Signature), key_operations: Some(vec![KeyOperations::Verify]), key_algorithm: Some(KeyAlgorithm::EdDSA), key_id: Some(BASE64_URL_SAFE_NO_PAD.encode(key_hash)), x509_url: None::, x509_chain: None::>, x509_sha1_fingerprint: None::, x509_sha256_fingerprint: None::, }, algorithm: AlgorithmParameters::OctetKeyPair(OctetKeyPairParameters { key_type: OctetKeyPairType::OctetKeyPair, curve: EllipticCurve::Ed25519, x: BASE64_URL_SAFE_NO_PAD.encode(public_key), }), }], }) } #[allow(clippy::too_many_arguments)] pub fn new_endpoint( &mut self, endpoint_id: &str, tenant_id: TenantId, timeline_id: TimelineId, pg_port: Option, external_http_port: Option, internal_http_port: Option, pg_version: PgMajorVersion, mode: ComputeMode, grpc: bool, skip_pg_catalog_updates: bool, drop_subscriptions_before_start: bool, privileged_role_name: Option, ) -> Result> { let pg_port = pg_port.unwrap_or_else(|| self.get_port()); let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1); let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1); let compute_ctl_config = ComputeCtlConfig { jwks: Self::create_jwks_from_pem(&self.env.read_public_key()?)?, tls: None::, }; let ep = Arc::new(Endpoint { endpoint_id: endpoint_id.to_owned(), pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), pg_port), external_http_address: SocketAddr::new( IpAddr::from(Ipv4Addr::UNSPECIFIED), external_http_port, ), internal_http_address: SocketAddr::new( IpAddr::from(Ipv4Addr::LOCALHOST), internal_http_port, ), env: self.env.clone(), timeline_id, mode, tenant_id, pg_version, // We don't setup roles 
and databases in the spec locally, so we don't need to // do catalog updates. Catalog updates also include check availability // data creation. Yet, we have tests that check that size and db dump // before and after start are the same. So, skip catalog updates, // with this we basically test a case of waking up an idle compute, where // we also skip catalog updates in the cloud. skip_pg_catalog_updates, drop_subscriptions_before_start, grpc, reconfigure_concurrency: 1, features: vec![], cluster: None, compute_ctl_config: compute_ctl_config.clone(), privileged_role_name: privileged_role_name.clone(), }); ep.create_endpoint_dir()?; std::fs::write( ep.endpoint_path().join("endpoint.json"), serde_json::to_string_pretty(&EndpointConf { endpoint_id: endpoint_id.to_string(), tenant_id, timeline_id, mode, external_http_port, internal_http_port, pg_port, pg_version, grpc, skip_pg_catalog_updates, drop_subscriptions_before_start, reconfigure_concurrency: 1, features: vec![], cluster: None, compute_ctl_config, privileged_role_name, })?, )?; std::fs::write( ep.endpoint_path().join("postgresql.conf"), ep.setup_pg_conf()?.to_string(), )?; self.endpoints .insert(ep.endpoint_id.clone(), Arc::clone(&ep)); Ok(ep) } pub fn check_conflicting_endpoints( &self, mode: ComputeMode, tenant_id: TenantId, timeline_id: TimelineId, ) -> Result<()> { if matches!(mode, ComputeMode::Primary) { // this check is not complete, as you could have a concurrent attempt at // creating another primary, both reading the state before checking it here, // but it's better than nothing. let mut duplicates = self.endpoints.iter().filter(|(_k, v)| { v.tenant_id == tenant_id && v.timeline_id == timeline_id && v.mode == mode && v.status() != EndpointStatus::Stopped }); if let Some((key, _)) = duplicates.next() { bail!( "attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported." 
); } } Ok(()) } } /////////////////////////////////////////////////////////////////////////////// pub struct Endpoint { /// used as the directory name endpoint_id: String, pub tenant_id: TenantId, pub timeline_id: TimelineId, pub mode: ComputeMode, /// If true, the endpoint should use gRPC to communicate with Pageservers. pub grpc: bool, // port and address of the Postgres server and `compute_ctl`'s HTTP APIs pub pg_address: SocketAddr, pub external_http_address: SocketAddr, pub internal_http_address: SocketAddr, // postgres major version in the format: 14, 15, etc. pg_version: PgMajorVersion, // These are not part of the endpoint as such, but the environment // the endpoint runs in. pub env: LocalEnv, // Optimizations skip_pg_catalog_updates: bool, drop_subscriptions_before_start: bool, reconfigure_concurrency: usize, // Feature flags features: Vec, // Cluster settings cluster: Option, /// The compute_ctl config for the endpoint's compute. compute_ctl_config: ComputeCtlConfig, /// The name of the privileged role for the endpoint. 
privileged_role_name: Option, } #[derive(PartialEq, Eq)] pub enum EndpointStatus { Running, Stopped, Crashed, RunningNoPidfile, } impl Display for EndpointStatus { fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result { writer.write_str(match self { Self::Running => "running", Self::Stopped => "stopped", Self::Crashed => "crashed", Self::RunningNoPidfile => "running, no pidfile", }) } } #[derive(Default, Clone, Copy, clap::ValueEnum)] pub enum EndpointTerminateMode { #[default] /// Use pg_ctl stop -m fast Fast, /// Use pg_ctl stop -m immediate Immediate, /// Use /terminate?mode=immediate ImmediateTerminate, } impl std::fmt::Display for EndpointTerminateMode { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.write_str(match &self { EndpointTerminateMode::Fast => "fast", EndpointTerminateMode::Immediate => "immediate", EndpointTerminateMode::ImmediateTerminate => "immediate-terminate", }) } } pub struct EndpointStartArgs { pub auth_token: Option, pub endpoint_storage_token: String, pub endpoint_storage_addr: String, pub safekeepers_generation: Option, pub safekeepers: Vec, pub pageserver_conninfo: PageserverConnectionInfo, pub remote_ext_base_url: Option, pub create_test_user: bool, pub start_timeout: Duration, pub autoprewarm: bool, pub offload_lfc_interval_seconds: Option, pub dev: bool, } impl Endpoint { fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result { if !entry.file_type()?.is_dir() { anyhow::bail!( "Endpoint::from_dir_entry failed: '{}' is not a directory", entry.path().display() ); } // parse data directory name let fname = entry.file_name(); let endpoint_id = fname.to_str().unwrap().to_string(); // Read the endpoint.json file let conf: EndpointConf = serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; debug!("serialized endpoint conf: {:?}", conf); Ok(Endpoint { pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), conf.pg_port), external_http_address: 
SocketAddr::new( IpAddr::from(Ipv4Addr::UNSPECIFIED), conf.external_http_port, ), internal_http_address: SocketAddr::new( IpAddr::from(Ipv4Addr::LOCALHOST), conf.internal_http_port, ), endpoint_id, env: env.clone(), timeline_id: conf.timeline_id, mode: conf.mode, tenant_id: conf.tenant_id, pg_version: conf.pg_version, grpc: conf.grpc, skip_pg_catalog_updates: conf.skip_pg_catalog_updates, reconfigure_concurrency: conf.reconfigure_concurrency, drop_subscriptions_before_start: conf.drop_subscriptions_before_start, features: conf.features, cluster: conf.cluster, compute_ctl_config: conf.compute_ctl_config, privileged_role_name: conf.privileged_role_name, }) } fn create_endpoint_dir(&self) -> Result<()> { std::fs::create_dir_all(self.endpoint_path()).with_context(|| { format!( "could not create endpoint directory {}", self.endpoint_path().display() ) }) } // Generate postgresql.conf with default configuration fn setup_pg_conf(&self) -> Result { let mut conf = PostgresConf::new(); conf.append("max_wal_senders", "10"); conf.append("wal_log_hints", "off"); conf.append("max_replication_slots", "10"); conf.append("hot_standby", "on"); // Set to 1MB to both exercise getPage requests/LFC, and still have enough room for // Postgres to operate. Everything smaller might be not enough for Postgres under load, // and can cause errors like 'no unpinned buffers available', see // conf.append("shared_buffers", "1MB"); // Postgres defaults to effective_io_concurrency=1, which does not exercise the pageserver's // batching logic. Set this to 2 so that we exercise the code a bit without letting // individual tests do a lot of concurrent work on underpowered test machines conf.append("effective_io_concurrency", "2"); conf.append("fsync", "off"); conf.append("max_connections", "100"); conf.append("wal_level", "logical"); // wal_sender_timeout is the maximum time to wait for WAL replication. // It also defines how often the walreceiver will send a feedback message to the wal sender. 
conf.append("wal_sender_timeout", "5s"); conf.append("listen_addresses", &self.pg_address.ip().to_string()); conf.append("port", &self.pg_address.port().to_string()); conf.append("wal_keep_size", "0"); // walproposer panics when basebackup is invalid, it is pointless to restart in this case. conf.append("restart_after_crash", "off"); // Load the 'neon' extension conf.append("shared_preload_libraries", "neon"); conf.append_line(""); // Replication-related configurations, such as WAL sending match &self.mode { ComputeMode::Primary => { // Configure backpressure // - Replication write lag depends on how fast the walreceiver can process incoming WAL. // This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec, // so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB. // Actually latency should be much smaller (better if < 1sec). But we assume that recently // updates pages are not requested from pageserver. // - Replication flush lag depends on speed of persisting data by checkpointer (creation of // delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to // remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long // recovery time (in case of pageserver crash) and disk space overflow at safekeepers. // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread. // To be able to restore database in case of pageserver node crash, safekeeper should not // remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers // (if they are not able to upload WAL to S3). 
conf.append("max_replication_write_lag", "15MB"); conf.append("max_replication_flush_lag", "10GB"); if !self.env.safekeepers.is_empty() { // Configure Postgres to connect to the safekeepers conf.append("synchronous_standby_names", "walproposer"); let safekeepers = self .env .safekeepers .iter() .map(|sk| format!("localhost:{}", sk.get_compute_port())) .collect::>() .join(","); conf.append("neon.safekeepers", &safekeepers); } else { // We only use setup without safekeepers for tests, // and don't care about data durability on pageserver, // so set more relaxed synchronous_commit. conf.append("synchronous_commit", "remote_write"); // Configure the node to stream WAL directly to the pageserver // This isn't really a supported configuration, but can be useful for // testing. conf.append("synchronous_standby_names", "pageserver"); } } ComputeMode::Static(lsn) => { conf.append("recovery_target_lsn", &lsn.to_string()); } ComputeMode::Replica => { assert!(!self.env.safekeepers.is_empty()); // TODO: use future host field from safekeeper spec // Pass the list of safekeepers to the replica so that it can connect to any of them, // whichever is available. 
let sk_ports = self .env .safekeepers .iter() .map(|x| x.get_compute_port().to_string()) .collect::>() .join(","); let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(","); let connstr = format!( "host={} port={} options='-c timeline_id={} tenant_id={}' application_name=replica replication=true", sk_hosts, sk_ports, &self.timeline_id.to_string(), &self.tenant_id.to_string(), ); let slot_name = format!("repl_{}_", self.timeline_id); conf.append("primary_conninfo", connstr.as_str()); conf.append("primary_slot_name", slot_name.as_str()); conf.append("hot_standby", "on"); // prefetching of blocks referenced in WAL doesn't make sense for us // Neon hot standby ignores pages that are not in the shared_buffers if self.pg_version >= PgMajorVersion::PG15 { conf.append("recovery_prefetch", "off"); } } } Ok(conf) } pub fn endpoint_path(&self) -> PathBuf { self.env.endpoints_path().join(&self.endpoint_id) } pub fn pgdata(&self) -> PathBuf { self.endpoint_path().join("pgdata") } pub fn status(&self) -> EndpointStatus { let timeout = Duration::from_millis(300); let has_pidfile = self.pgdata().join("postmaster.pid").exists(); let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok(); match (has_pidfile, can_connect) { (true, true) => EndpointStatus::Running, (false, false) => EndpointStatus::Stopped, (true, false) => EndpointStatus::Crashed, (false, true) => EndpointStatus::RunningNoPidfile, } } fn pg_ctl(&self, args: &[&str], auth_token: &Option) -> Result<()> { let pg_ctl_path = self.env.pg_bin_dir(self.pg_version)?.join("pg_ctl"); let mut cmd = Command::new(&pg_ctl_path); cmd.args( [ &[ "-D", self.pgdata().to_str().unwrap(), "-w", //wait till pg_ctl actually does what was asked ], args, ] .concat(), ) .env_clear() .env( "LD_LIBRARY_PATH", self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(), ) .env( "DYLD_LIBRARY_PATH", self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(), ); // Pass authentication token used for the connections to 
pageserver and safekeepers if let Some(token) = auth_token { cmd.env("NEON_AUTH_TOKEN", token); } let pg_ctl = cmd .output() .context(format!("{} failed", pg_ctl_path.display()))?; if !pg_ctl.status.success() { anyhow::bail!( "pg_ctl failed, exit code: {}, stdout: {}, stderr: {}", pg_ctl.status, String::from_utf8_lossy(&pg_ctl.stdout), String::from_utf8_lossy(&pg_ctl.stderr), ); } Ok(()) } fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> { // TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482 let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?; let pid = nix::unistd::Pid::from_raw(pid as i32); if send_sigterm { kill(pid, Signal::SIGTERM).ok(); } crate::background_process::wait_until_stopped("compute_ctl", pid)?; Ok(()) } fn read_postgresql_conf(&self) -> Result { // Slurp the endpoints//postgresql.conf file into // memory. We will include it in the spec file that we pass to // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf // in the data directory. let postgresql_conf_path = self.endpoint_path().join("postgresql.conf"); match std::fs::read(&postgresql_conf_path) { Ok(content) => Ok(String::from_utf8(content)?), Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok("".to_string()), Err(e) => Err(anyhow::Error::new(e).context(format!( "failed to read config file in {}", postgresql_conf_path.to_str().unwrap() ))), } } /// Map safekeepers ids to the actual connection strings. 
fn build_safekeepers_connstrs(&self, sk_ids: Vec) -> Result> { let mut safekeeper_connstrings = Vec::new(); if self.mode == ComputeMode::Primary { for sk_id in sk_ids { let sk = self .env .safekeepers .iter() .find(|node| node.id == sk_id) .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?; safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port())); } } Ok(safekeeper_connstrings) } /// Generate a JWT with the correct claims. pub fn generate_jwt(&self, scope: Option) -> Result { self.env.generate_auth_token(&ComputeClaims { audience: match scope { Some(ComputeClaimsScope::Admin) => Some(vec![COMPUTE_AUDIENCE.to_owned()]), _ => None, }, compute_id: match scope { Some(ComputeClaimsScope::Admin) => None, _ => Some(self.endpoint_id.clone()), }, scope, }) } pub async fn start(&self, args: EndpointStartArgs) -> Result<()> { if self.status() == EndpointStatus::Running { anyhow::bail!("The endpoint is already running"); } let postgresql_conf = self.read_postgresql_conf()?; // We always start the compute node from scratch, so if the Postgres // data dir exists from a previous launch, remove it first. if self.pgdata().exists() { std::fs::remove_dir_all(self.pgdata())?; } let safekeeper_connstrings = self.build_safekeepers_connstrs(args.safekeepers)?; // check for file remote_extensions_spec.json // if it is present, read it and pass to compute_ctl let remote_extensions_spec_path = self.endpoint_path().join("remote_extensions_spec.json"); let remote_extensions_spec = std::fs::File::open(remote_extensions_spec_path); let remote_extensions: Option; if let Ok(spec_file) = remote_extensions_spec { remote_extensions = serde_json::from_reader(spec_file).ok(); } else { remote_extensions = None; }; // For the sake of backwards-compatibility, also fill in 'pageserver_connstring' // // XXX: I believe this is not really needed, except to make // test_forward_compatibility happy. 
// // Use a closure so that we can conviniently return None in the middle of the // loop. let pageserver_connstring: Option = (|| { let num_shards = args.pageserver_conninfo.shard_count.count(); let mut connstrings = Vec::new(); for shard_no in 0..num_shards { let shard_index = ShardIndex { shard_count: args.pageserver_conninfo.shard_count, shard_number: ShardNumber(shard_no), }; let shard = args .pageserver_conninfo .shards .get(&shard_index) .ok_or_else(|| { anyhow!( "shard {} not found in pageserver_connection_info", shard_index ) })?; let pageserver = shard .pageservers .first() .ok_or(anyhow!("must have at least one pageserver"))?; if let Some(libpq_url) = &pageserver.libpq_url { connstrings.push(libpq_url.clone()); } else { return Ok::<_, anyhow::Error>(None); } } Ok(Some(connstrings.join(","))) })()?; // Create config file let config = { let mut spec = ComputeSpec { skip_pg_catalog_updates: self.skip_pg_catalog_updates, format_version: 1.0, operation_uuid: None, features: self.features.clone(), swap_size_bytes: None, disk_quota_bytes: None, disable_lfc_resizing: None, cluster: Cluster { cluster_id: None, // project ID: not used name: None, // project name: not used state: None, roles: if args.create_test_user { vec![Role { name: PgIdent::from_str("test").unwrap(), encrypted_password: None, options: None, }] } else { Vec::new() }, databases: if args.create_test_user { vec![Database { name: PgIdent::from_str("neondb").unwrap(), owner: PgIdent::from_str("test").unwrap(), options: None, restrict_conn: false, invalid: false, }] } else { Vec::new() }, settings: None, postgresql_conf: Some(postgresql_conf.clone()), }, delta_operations: None, tenant_id: Some(self.tenant_id), timeline_id: Some(self.timeline_id), project_id: None, branch_id: None, endpoint_id: Some(self.endpoint_id.clone()), mode: self.mode, pageserver_connection_info: Some(args.pageserver_conninfo.clone()), pageserver_connstring, safekeepers_generation: args.safekeepers_generation.map(|g| 
g.into_inner()), safekeeper_connstrings, storage_auth_token: args.auth_token.clone(), remote_extensions, pgbouncer_settings: None, shard_stripe_size: args.pageserver_conninfo.stripe_size, // redundant with pageserver_connection_info.stripe_size local_proxy_config: None, reconfigure_concurrency: self.reconfigure_concurrency, drop_subscriptions_before_start: self.drop_subscriptions_before_start, audit_log_level: ComputeAudit::Disabled, logs_export_host: None::, endpoint_storage_addr: Some(args.endpoint_storage_addr), endpoint_storage_token: Some(args.endpoint_storage_token), autoprewarm: args.autoprewarm, offload_lfc_interval_seconds: args.offload_lfc_interval_seconds, suspend_timeout_seconds: -1, // Only used in neon_local. databricks_settings: None, }; // this strange code is needed to support respec() in tests if self.cluster.is_some() { debug!("Cluster is already set in the endpoint spec, using it"); spec.cluster = self.cluster.clone().unwrap(); debug!("spec.cluster {:?}", spec.cluster); // fill missing fields again if args.create_test_user { spec.cluster.roles.push(Role { name: PgIdent::from_str("test").unwrap(), encrypted_password: None, options: None, }); spec.cluster.databases.push(Database { name: PgIdent::from_str("neondb").unwrap(), owner: PgIdent::from_str("test").unwrap(), options: None, restrict_conn: false, invalid: false, }); } spec.cluster.postgresql_conf = Some(postgresql_conf); } ComputeConfig { spec: Some(spec), compute_ctl_config: self.compute_ctl_config.clone(), } }; let config_path = self.endpoint_path().join("config.json"); std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?; // Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it. 
let logfile = std::fs::OpenOptions::new() .create(true) .append(true) .open(self.endpoint_path().join("compute.log"))?; // Launch compute_ctl let conn_str = self.connstr("cloud_admin", "postgres"); println!("Starting postgres node at '{conn_str}'"); if args.create_test_user { let conn_str = self.connstr("test", "neondb"); println!("Also at '{conn_str}'"); } let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); cmd.args([ "--external-http-port", &self.external_http_address.port().to_string(), ]) .args([ "--internal-http-port", &self.internal_http_address.port().to_string(), ]) .args(["--pgdata", self.pgdata().to_str().unwrap()]) .args(["--connstr", &conn_str]) .arg("--config") .arg(self.endpoint_path().join("config.json").as_os_str()) .args([ "--pgbin", self.env .pg_bin_dir(self.pg_version)? .join("postgres") .to_str() .unwrap(), ]) // TODO: It would be nice if we generated compute IDs with the same // algorithm as the real control plane. .args(["--compute-id", &self.endpoint_id]) .stdin(std::process::Stdio::null()) .stderr(logfile.try_clone()?) 
.stdout(logfile); if let Some(remote_ext_base_url) = args.remote_ext_base_url { cmd.args(["--remote-ext-base-url", &remote_ext_base_url]); } if args.dev { cmd.arg("--dev"); } if let Some(privileged_role_name) = self.privileged_role_name.clone() { cmd.args(["--privileged-role-name", &privileged_role_name]); } let child = cmd.spawn()?; // set up a scopeguard to kill & wait for the child in case we panic or bail below let child = scopeguard::guard(child, |mut child| { println!("SIGKILL & wait the started process"); (|| { // TODO: use another signal that can be caught by the child so it can clean up any children it spawned child.kill().context("SIGKILL child")?; child.wait().context("wait() for child process")?; anyhow::Ok(()) })() .with_context(|| format!("scopeguard kill&wait child {child:?}")) .unwrap(); }); // Write down the pid so we can wait for it when we want to stop // TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482 let pid = child.id(); let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); std::fs::write(pidfile_path, pid.to_string())?; // Wait for it to start const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100); let start_at = Instant::now(); loop { match self.get_status().await { Ok(state) => { match state.status { ComputeStatus::Init => { let timeout = args.start_timeout; if Instant::now().duration_since(start_at) > timeout { bail!( "compute startup timed out {:?}; still in Init state", timeout ); } // keep retrying } ComputeStatus::Running => { // All good! 
break; } ComputeStatus::Failed => { bail!( "compute startup failed: {}", state .error .as_deref() .unwrap_or("") ); } ComputeStatus::Empty | ComputeStatus::ConfigurationPending | ComputeStatus::Configuration | ComputeStatus::TerminationPendingFast | ComputeStatus::TerminationPendingImmediate | ComputeStatus::Terminated | ComputeStatus::RefreshConfigurationPending | ComputeStatus::RefreshConfiguration => { bail!("unexpected compute status: {:?}", state.status) } } } Err(e) => { if Instant::now().duration_since(start_at) > args.start_timeout { return Err(e).context(format!( "timed out {:?} waiting to connect to compute_ctl HTTP", args.start_timeout )); } } } tokio::time::sleep(ATTEMPT_INTERVAL).await; } // disarm the scopeguard, let the child outlive this function (and neon_local invoction) drop(scopeguard::ScopeGuard::into_inner(child)); Ok(()) } // Update the pageservers in the spec file of the endpoint. This is useful to test the spec refresh scenario. pub async fn update_pageservers_in_config( &self, pageserver_conninfo: &PageserverConnectionInfo, ) -> Result<()> { let config_path = self.endpoint_path().join("config.json"); let mut config: ComputeConfig = { let file = std::fs::File::open(&config_path)?; serde_json::from_reader(file)? }; let mut spec = config.spec.unwrap(); spec.pageserver_connection_info = Some(pageserver_conninfo.clone()); config.spec = Some(spec); let file = std::fs::File::create(&config_path)?; serde_json::to_writer_pretty(file, &config)?; Ok(()) } // Call the /status HTTP API pub async fn get_status(&self) -> Result { let client = reqwest::Client::new(); let response = client .request( reqwest::Method::GET, format!( "http://{}:{}/status", self.external_http_address.ip(), self.external_http_address.port() ), ) .bearer_auth(self.generate_jwt(None::)?) .send() .await?; // Interpret the response let status = response.status(); if !(status.is_client_error() || status.is_server_error()) { Ok(response.json().await?) 
} else { // reqwest does not export its error construction utility functions, so let's craft the message ourselves let url = response.url().to_owned(); let msg = match response.text().await { Ok(err_body) => format!("Error: {err_body}"), Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }; Err(anyhow::anyhow!(msg)) } } pub async fn reconfigure( &self, pageserver_conninfo: Option<&PageserverConnectionInfo>, safekeepers: Option>, safekeeper_generation: Option, ) -> Result<()> { let (mut spec, compute_ctl_config) = { let config_path = self.endpoint_path().join("config.json"); let file = std::fs::File::open(config_path)?; let config: ComputeConfig = serde_json::from_reader(file)?; (config.spec.unwrap(), config.compute_ctl_config) }; let postgresql_conf = self.read_postgresql_conf()?; spec.cluster.postgresql_conf = Some(postgresql_conf); if let Some(pageserver_conninfo) = pageserver_conninfo { // If pageservers are provided, we need to ensure that they are not empty. // This is a requirement for the compute_ctl configuration. anyhow::ensure!( !pageserver_conninfo.shards.is_empty(), "no pageservers provided" ); spec.pageserver_connection_info = Some(pageserver_conninfo.clone()); spec.shard_stripe_size = pageserver_conninfo.stripe_size; } // If safekeepers are not specified, don't change them. if let Some(safekeepers) = safekeepers { let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?; spec.safekeeper_connstrings = safekeeper_connstrings; if let Some(g) = safekeeper_generation { spec.safekeepers_generation = Some(g.into_inner()); } } let client = reqwest::Client::builder() .timeout(Duration::from_secs(120)) .build() .unwrap(); let response = client .post(format!( "http://{}:{}/configure", self.external_http_address.ip(), self.external_http_address.port() )) .header(CONTENT_TYPE.as_str(), "application/json") .bearer_auth(self.generate_jwt(None::)?) 
.body( serde_json::to_string(&ConfigurationRequest { spec, compute_ctl_config, }) .unwrap(), ) .send() .await?; let status = response.status(); if !(status.is_client_error() || status.is_server_error()) { Ok(()) } else { let url = response.url().to_owned(); let msg = match response.text().await { Ok(err_body) => format!("Error: {err_body}"), Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }; Err(anyhow::anyhow!(msg)) } } pub async fn reconfigure_pageservers( &self, pageservers: &PageserverConnectionInfo, ) -> Result<()> { self.reconfigure(Some(pageservers), None, None).await } pub async fn reconfigure_safekeepers( &self, safekeepers: Vec, generation: SafekeeperGeneration, ) -> Result<()> { self.reconfigure(None, Some(safekeepers), Some(generation)) .await } pub async fn stop( &self, mode: EndpointTerminateMode, destroy: bool, ) -> Result { // pg_ctl stop is fast but doesn't allow us to collect LSN. /terminate is // slow, and test runs time out. Solution: special mode "immediate-terminate" // which uses /terminate let response = if let EndpointTerminateMode::ImmediateTerminate = mode { let ip = self.external_http_address.ip(); let port = self.external_http_address.port(); let url = format!("http://{ip}:{port}/terminate?mode=immediate"); let token = self.generate_jwt(Some(ComputeClaimsScope::Admin))?; let request = reqwest::Client::new().post(url).bearer_auth(token); let response = request.send().await.context("/terminate")?; let text = response.text().await.context("/terminate result")?; serde_json::from_str(&text).with_context(|| format!("deserializing {text}"))? } else { self.pg_ctl(&["-m", &mode.to_string(), "stop"], &None)?; TerminateResponse { lsn: None } }; // Also wait for the compute_ctl process to die. It might have some // cleanup work to do after postgres stops, like syncing safekeepers, // etc. // // If destroying or stop mode is immediate, send it SIGTERM before // waiting. 
Sometimes we do *not* want this cleanup: tests intentionally // do stop when majority of safekeepers is down, so sync-safekeepers // would hang otherwise. This could be a separate flag though. let send_sigterm = destroy || !matches!(mode, EndpointTerminateMode::Fast); self.wait_for_compute_ctl_to_exit(send_sigterm)?; if destroy { println!( "Destroying postgres data directory '{}'", self.pgdata().to_str().unwrap() ); std::fs::remove_dir_all(self.endpoint_path())?; } Ok(response) } pub async fn refresh_configuration(&self) -> Result<()> { let client = reqwest::Client::builder() .timeout(Duration::from_secs(30)) .build() .unwrap(); let response = client .post(format!( "http://{}:{}/refresh_configuration", self.internal_http_address.ip(), self.internal_http_address.port() )) .send() .await?; let status = response.status(); if !(status.is_client_error() || status.is_server_error()) { Ok(()) } else { let url = response.url().to_owned(); let msg = match response.text().await { Ok(err_body) => format!("Error: {err_body}"), Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }; Err(anyhow::anyhow!(msg)) } } pub fn connstr(&self, user: &str, db_name: &str) -> String { format!( "postgresql://{}@{}:{}/{}", user, self.pg_address.ip(), self.pg_address.port(), db_name ) } } /// If caller is telling us what pageserver to use, this is not a tenant which is /// fully managed by storage controller, therefore not sharded. 
pub fn local_pageserver_conf_to_conn_info( conf: &crate::local_env::PageServerConf, ) -> Result { let libpq_url = { let (host, port) = parse_host_port(&conf.listen_pg_addr)?; let port = port.unwrap_or(5432); Some(format!("postgres://no_user@{host}:{port}")) }; let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr { let (host, port) = parse_host_port(grpc_addr)?; let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT); Some(format!("grpc://no_user@{host}:{port}")) } else { None }; let ps_conninfo = PageserverShardConnectionInfo { id: Some(conf.id), libpq_url, grpc_url, }; let shard_info = PageserverShardInfo { pageservers: vec![ps_conninfo], }; let shards: HashMap<_, _> = vec![(ShardIndex::unsharded(), shard_info)] .into_iter() .collect(); Ok(PageserverConnectionInfo { shard_count: ShardCount::unsharded(), stripe_size: None, shards, prefer_protocol: PageserverProtocol::default(), }) } pub fn tenant_locate_response_to_conn_info( response: &pageserver_api::controller_api::TenantLocateResponse, ) -> Result { let mut shards = HashMap::new(); for shard in response.shards.iter() { tracing::info!("parsing {}", shard.listen_pg_addr); let libpq_url = { let host = &shard.listen_pg_addr; let port = shard.listen_pg_port; Some(format!("postgres://no_user@{host}:{port}")) }; let grpc_url = if let Some(grpc_addr) = &shard.listen_grpc_addr { let host = grpc_addr; let port = shard.listen_grpc_port.expect("no gRPC port"); Some(format!("grpc://no_user@{host}:{port}")) } else { None }; let shard_info = PageserverShardInfo { pageservers: vec![PageserverShardConnectionInfo { id: Some(shard.node_id), libpq_url, grpc_url, }], }; shards.insert(shard.shard_id.to_index(), shard_info); } let stripe_size = if response.shard_params.count.is_unsharded() { None } else { Some(response.shard_params.stripe_size) }; Ok(PageserverConnectionInfo { shard_count: response.shard_params.count, stripe_size, shards, prefer_protocol: PageserverProtocol::default(), }) } 
================================================ FILE: control_plane/src/endpoint_storage.rs ================================================ use crate::background_process::{self, start_process, stop_process}; use crate::local_env::LocalEnv; use anyhow::{Context, Result}; use camino::Utf8PathBuf; use std::io::Write; use std::net::SocketAddr; use std::time::Duration; /// Directory within .neon which will be used by default for LocalFs remote storage. pub const ENDPOINT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/endpoint_storage"; pub const ENDPOINT_STORAGE_DEFAULT_ADDR: SocketAddr = SocketAddr::new(std::net::IpAddr::V4(std::net::Ipv4Addr::LOCALHOST), 9993); pub struct EndpointStorage { pub bin: Utf8PathBuf, pub data_dir: Utf8PathBuf, pub pemfile: Utf8PathBuf, pub addr: SocketAddr, } impl EndpointStorage { pub fn from_env(env: &LocalEnv) -> EndpointStorage { EndpointStorage { bin: Utf8PathBuf::from_path_buf(env.endpoint_storage_bin()).unwrap(), data_dir: Utf8PathBuf::from_path_buf(env.endpoint_storage_data_dir()).unwrap(), pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(), addr: env.endpoint_storage.listen_addr, } } fn config_path(&self) -> Utf8PathBuf { self.data_dir.join("endpoint_storage.json") } fn listen_addr(&self) -> Utf8PathBuf { format!("{}:{}", self.addr.ip(), self.addr.port()).into() } pub fn init(&self) -> Result<()> { println!("Initializing object storage in {:?}", self.data_dir); let parent = self.data_dir.parent().unwrap(); #[derive(serde::Serialize)] struct Cfg { listen: Utf8PathBuf, pemfile: Utf8PathBuf, local_path: Utf8PathBuf, r#type: String, } let cfg = Cfg { listen: self.listen_addr(), pemfile: parent.join(self.pemfile.clone()), local_path: parent.join(ENDPOINT_STORAGE_REMOTE_STORAGE_DIR), r#type: "LocalFs".to_string(), }; std::fs::create_dir_all(self.config_path().parent().unwrap())?; std::fs::write(self.config_path(), serde_json::to_string(&cfg)?) 
.context("write object storage config")?; Ok(()) } pub async fn start(&self, retry_timeout: &Duration) -> Result<()> { println!("Starting endpoint_storage at {}", self.listen_addr()); std::io::stdout().flush().context("flush stdout")?; let process_status_check = || async { let res = reqwest::Client::new().get(format!("http://{}/metrics", self.listen_addr())); match res.send().await { Ok(res) => Ok(res.status().is_success()), Err(_) => Ok(false), } }; let res = start_process( "endpoint_storage", &self.data_dir.clone().into_std_path_buf(), &self.bin.clone().into_std_path_buf(), vec![self.config_path().to_string()], vec![("RUST_LOG".into(), "debug".into())], background_process::InitialPidFile::Create(self.pid_file()), retry_timeout, process_status_check, ) .await; if res.is_err() { eprintln!("Logs:\n{}", std::fs::read_to_string(self.log_file())?); } res } pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { stop_process(immediate, "endpoint_storage", &self.pid_file()) } fn log_file(&self) -> Utf8PathBuf { self.data_dir.join("endpoint_storage.log") } fn pid_file(&self) -> Utf8PathBuf { self.data_dir.join("endpoint_storage.pid") } } ================================================ FILE: control_plane/src/lib.rs ================================================ //! Local control plane. //! //! Can start, configure and stop postgres instances running as a local processes. //! //! Intended to be used in integration tests and in CLI tools for //! local installations. #![deny(clippy::undocumented_unsafe_blocks)] mod background_process; pub mod broker; pub mod endpoint; pub mod endpoint_storage; pub mod local_env; pub mod pageserver; pub mod postgresql_conf; pub mod safekeeper; pub mod storage_controller; ================================================ FILE: control_plane/src/local_env.rs ================================================ //! This module is responsible for locating and loading paths in a local setup. //! //! 
//! Now it also provides init method which acts like a stub for proper installation
//! script which will use local paths.

use std::collections::HashMap;
use std::net::SocketAddr;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use std::time::Duration;
use std::{env, fs};

use anyhow::{Context, bail};
use clap::ValueEnum;
use pageserver_api::config::PostHogConfig;
use pem::Pem;
use postgres_backend::AuthType;
use reqwest::{Certificate, Url};
use safekeeper_api::PgMajorVersion;
use serde::{Deserialize, Serialize};
use utils::auth::encode_from_key_file;
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};

use crate::broker::StorageBroker;
use crate::endpoint_storage::{
    ENDPOINT_STORAGE_DEFAULT_ADDR, ENDPOINT_STORAGE_REMOTE_STORAGE_DIR, EndpointStorage,
};
use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode};
use crate::safekeeper::SafekeeperNode;

// Postgres major version used when none is specified.
pub const DEFAULT_PG_VERSION: u32 = 17;

//
// This data structure represents the neon_local CLI config.
//
// It is deserialized from the .neon/config file, or the config file passed
// to the 'neon_local init --config=' option. See control_plane/simple.conf for
// an example.
//
// NOTE(review): in this copy of the file the generic type arguments appear to
// have been stripped (e.g. `Option,` / `Vec,` / `HashMap>,` below). Restore them
// from the upstream file before compiling; they are left exactly as found here.
//
#[derive(PartialEq, Eq, Clone, Debug)]
pub struct LocalEnv {
    // Base directory for all the nodes (the pageserver, safekeepers and
    // compute endpoints).
    //
    // This is not stored in the config file. Rather, this is the path where the
    // config file itself is. It is read from the NEON_REPO_DIR env variable which
    // must be an absolute path. If the env var is not set, $PWD/.neon is used.
    pub base_data_dir: PathBuf,

    // Path to postgres distribution. It's expected that "bin", "include",
    // "lib", "share" from postgres distribution are there. If at some point
    // in time we will be able to run against vanilla postgres we may split that
    // to four separate paths and match OS-specific installation layout.
    pub pg_distrib_dir: PathBuf,

    // Path to pageserver binary.
    pub neon_distrib_dir: PathBuf,

    // Default tenant ID to use with the 'neon_local' command line utility, when
    // --tenant_id is not explicitly specified.
    pub default_tenant_id: Option,

    // used to issue tokens during e.g. pg start
    pub private_key_path: PathBuf,
    /// Path to environment's public key
    pub public_key_path: PathBuf,

    pub broker: NeonBroker,

    // Configuration for the storage controller (1 per neon_local environment)
    pub storage_controller: NeonStorageControllerConf,

    /// This Vec must always contain at least one pageserver
    /// Populated by [`Self::load_config`] from the individual `pageserver.toml`s.
    /// NB: not used anymore except for informing users that they need to change their `.neon/config`.
    pub pageservers: Vec,

    pub safekeepers: Vec,

    pub endpoint_storage: EndpointStorageConf,

    // Control plane upcall API for pageserver: if None, we will not run storage_controller.
    // If set, this will be propagated into each pageserver's configuration.
    // NOTE(review): the field is a plain `Url` here, so the "if None" wording
    // looks stale for this struct — confirm against the loading code.
    pub control_plane_api: Url,

    // Control plane upcall APIs for storage controller. If set, this will be propagated into the
    // storage controller's configuration.
    pub control_plane_hooks_api: Option,

    /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
    // A nested map (a `HashMap` whose values are themselves maps; the exact type
    // parameters are missing from this copy) would be more appropriate here,
    // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
    // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
    pub branch_name_mappings: HashMap>,

    /// Flag to generate SSL certificates for components that need it.
    /// Also generates root CA certificate that is used to sign all other certificates.
    pub generate_local_ssl_certs: bool,
}

/// On-disk state stored in `.neon/config`.
// NOTE(review): in this copy of the file the generic type arguments appear to
// have been stripped throughout the declarations below (e.g. `Option,`, `Vec,`,
// `HashMap>,`, `Result, D::Error>`). Restore them from the upstream file before
// compiling; they are left exactly as found here.
#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)]
#[serde(default, deny_unknown_fields)]
pub struct OnDiskConfig {
    pub pg_distrib_dir: PathBuf,
    pub neon_distrib_dir: PathBuf,
    pub default_tenant_id: Option,
    pub private_key_path: PathBuf,
    pub public_key_path: PathBuf,
    pub broker: NeonBroker,
    pub storage_controller: NeonStorageControllerConf,
    // Never written out, and deserialization fails with an explanatory error
    // if the field is present in the file.
    #[serde(
        skip_serializing,
        deserialize_with = "fail_if_pageservers_field_specified"
    )]
    pub pageservers: Vec,
    pub safekeepers: Vec,
    pub endpoint_storage: EndpointStorageConf,
    pub control_plane_api: Option,
    pub control_plane_hooks_api: Option,
    pub control_plane_compute_hook_api: Option,
    branch_name_mappings: HashMap>,
    // Note: skip serializing because in compat tests old storage controller fails
    // to load new config file. May be removed after this field is in release branch.
    // (`std::ops::Not::not` means the field is omitted whenever it is `false`.)
    #[serde(skip_serializing_if = "std::ops::Not::not")]
    pub generate_local_ssl_certs: bool,
}

// Deserializer for `OnDiskConfig::pageservers`: ignores its input and always
// returns an error telling the user to remove the field.
fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result, D::Error>
where
    D: serde::Deserializer<'de>,
{
    Err(serde::de::Error::custom(
        "The 'pageservers' field is no longer used; pageserver.toml is now authoritative; \
         Please remove the `pageservers` from your .neon/config.",
    ))
}

/// The description of the neon_local env to be initialized by `neon_local init --config`.
#[derive(Clone, Debug, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct NeonLocalInitConf {
    // TODO: do we need this? Seems unused
    pub pg_distrib_dir: Option,
    // TODO: do we need this? Seems unused
    pub neon_distrib_dir: Option,
    pub default_tenant_id: TenantId,
    pub broker: NeonBroker,
    pub storage_controller: Option,
    pub pageservers: Vec,
    pub safekeepers: Vec,
    pub endpoint_storage: EndpointStorageConf,
    pub control_plane_api: Option,
    pub control_plane_hooks_api: Option,
    pub generate_local_ssl_certs: bool,
}

// Configuration of the endpoint_storage service; `listen_addr` defaults to
// `ENDPOINT_STORAGE_DEFAULT_ADDR`.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct EndpointStorageConf {
    pub listen_addr: SocketAddr,
}

/// Broker config for cluster internal communication.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, Default)]
#[serde(default)]
pub struct NeonBroker {
    /// Broker listen HTTP address for storage nodes coordination, e.g. '127.0.0.1:50051'.
    /// At least one of listen_addr or listen_https_addr must be set.
    pub listen_addr: Option,
    /// Broker listen HTTPS address for storage nodes coordination, e.g. '127.0.0.1:50051'.
    /// At least one of listen_addr or listen_https_addr must be set.
    /// listen_https_addr is preferred over listen_addr in neon_local.
    pub listen_https_addr: Option,
}

/// A part of storage controller's config the neon_local knows about.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct NeonStorageControllerConf {
    /// Heartbeat timeout before marking a node offline
    #[serde(with = "humantime_serde")]
    pub max_offline: Duration,

    #[serde(with = "humantime_serde")]
    pub max_warming_up: Duration,

    pub start_as_candidate: bool,

    /// Database url used when running multiple storage controller instances
    pub database_url: Option,

    /// Thresholds for auto-splitting a tenant into shards.
    pub split_threshold: Option,
    pub max_split_shards: Option,
    pub initial_split_threshold: Option,
    pub initial_split_shards: Option,

    pub max_secondary_lag_bytes: Option,

    #[serde(with = "humantime_serde")]
    pub heartbeat_interval: Duration,

    #[serde(with = "humantime_serde")]
    pub long_reconcile_threshold: Option,

    pub use_https_pageserver_api: bool,

    pub timelines_onto_safekeepers: bool,

    pub use_https_safekeeper_api: bool,

    pub use_local_compute_notifications: bool,

    pub timeline_safekeeper_count: Option,

    pub posthog_config: Option,

    pub kick_secondary_downloads: Option,

    #[serde(with = "humantime_serde")]
    pub shard_split_request_timeout: Option,
}

impl NeonStorageControllerConf {
    // Use a shorter pageserver unavailability interval than the default to speed up tests.
    const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);

    const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30);

    // Very tight heartbeat interval to speed up tests
    const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(1000);
}

// Defaults: the three intervals come from the constants above;
// `timelines_onto_safekeepers` and `use_local_compute_notifications` are on,
// every other flag is off and every optional setting is unset.
impl Default for NeonStorageControllerConf {
    fn default() -> Self {
        Self {
            max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
            max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
            start_as_candidate: false,
            database_url: None,
            split_threshold: None,
            max_split_shards: None,
            initial_split_threshold: None,
            initial_split_shards: None,
            max_secondary_lag_bytes: None,
            heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
            long_reconcile_threshold: None,
            use_https_pageserver_api: false,
            timelines_onto_safekeepers: true,
            use_https_safekeeper_api: false,
            use_local_compute_notifications: true,
            timeline_safekeeper_count: None,
            posthog_config: None,
            kick_secondary_downloads: None,
            shard_split_request_timeout: None,
        }
    }
}

impl Default for EndpointStorageConf {
    fn default() -> Self {
        Self {
            listen_addr: ENDPOINT_STORAGE_DEFAULT_ADDR,
        }
    }
}

impl NeonBroker {
    // URL clients should use to reach the broker: `https://` on
    // `listen_https_addr` when that is set, otherwise `http://` on `listen_addr`.
    //
    // Panics if neither address is set, or if the resulting string does not
    // parse as a URL.
    pub fn client_url(&self) -> Url {
        let url = if let Some(addr) = self.listen_https_addr {
            format!("https://{addr}")
        } else {
            format!(
                "http://{}",
                self.listen_addr
                    .expect("at least one address should be set")
            )
        };

        Url::parse(&url).expect("failed to construct url")
    }
}

// neon_local needs to know this subset of pageserver configuration.
// For legacy reasons, this information is duplicated from `pageserver.toml` into `.neon/config`.
// It can get stale if `pageserver.toml` is changed.
// TODO(christian): don't store this at all in `.neon/config`, always load it from `pageserver.toml`
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default, deny_unknown_fields)]
pub struct PageServerConf {
    pub id: NodeId,
    pub listen_pg_addr: String,
    pub listen_http_addr: String,
    pub listen_https_addr: Option,
    pub listen_grpc_addr: Option,
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,
    pub grpc_auth_type: AuthType,
    pub no_sync: bool,
}

// Defaults: node id 0, empty listen addresses, no HTTPS/gRPC address,
// `AuthType::Trust` for all three auth types, `no_sync` off.
impl Default for PageServerConf {
    fn default() -> Self {
        Self {
            id: NodeId(0),
            listen_pg_addr: String::new(),
            listen_http_addr: String::new(),
            listen_https_addr: None,
            listen_grpc_addr: None,
            pg_auth_type: AuthType::Trust,
            http_auth_type: AuthType::Trust,
            grpc_auth_type: AuthType::Trust,
            no_sync: false,
        }
    }
}

/// The toml that can be passed to `neon_local init --config`.
/// This is a subset of the `pageserver.toml` configuration.
// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656) #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] pub struct NeonLocalInitPageserverConf { pub id: NodeId, pub listen_pg_addr: String, pub listen_http_addr: String, pub listen_https_addr: Option, pub listen_grpc_addr: Option, pub pg_auth_type: AuthType, pub http_auth_type: AuthType, pub grpc_auth_type: AuthType, #[serde(default, skip_serializing_if = "std::ops::Not::not")] pub no_sync: bool, #[serde(flatten)] pub other: HashMap, } impl From<&NeonLocalInitPageserverConf> for PageServerConf { fn from(conf: &NeonLocalInitPageserverConf) -> Self { let NeonLocalInitPageserverConf { id, listen_pg_addr, listen_http_addr, listen_https_addr, listen_grpc_addr, pg_auth_type, http_auth_type, grpc_auth_type, no_sync, other: _, } = conf; Self { id: *id, listen_pg_addr: listen_pg_addr.clone(), listen_http_addr: listen_http_addr.clone(), listen_https_addr: listen_https_addr.clone(), listen_grpc_addr: listen_grpc_addr.clone(), pg_auth_type: *pg_auth_type, grpc_auth_type: *grpc_auth_type, http_auth_type: *http_auth_type, no_sync: *no_sync, } } } #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct SafekeeperConf { pub id: NodeId, pub pg_port: u16, pub pg_tenant_only_port: Option, pub http_port: u16, pub https_port: Option, pub sync: bool, pub remote_storage: Option, pub backup_threads: Option, pub auth_enabled: bool, pub listen_addr: Option, } impl Default for SafekeeperConf { fn default() -> Self { Self { id: NodeId(0), pg_port: 0, pg_tenant_only_port: None, http_port: 0, https_port: None, sync: true, remote_storage: None, backup_threads: None, auth_enabled: false, listen_addr: None, } } } #[derive(Clone, Copy)] pub enum InitForceMode { MustNotExist, EmptyDirOk, RemoveAllContents, } impl ValueEnum for InitForceMode { fn value_variants<'a>() -> &'a [Self] { &[ Self::MustNotExist, Self::EmptyDirOk, Self::RemoveAllContents, ] } fn to_possible_value(&self) -> Option 
{ Some(clap::builder::PossibleValue::new(match self { InitForceMode::MustNotExist => "must-not-exist", InitForceMode::EmptyDirOk => "empty-dir-ok", InitForceMode::RemoveAllContents => "remove-all-contents", })) } } impl SafekeeperConf { /// Compute is served by port on which only tenant scoped tokens allowed, if /// it is configured. pub fn get_compute_port(&self) -> u16 { self.pg_tenant_only_port.unwrap_or(self.pg_port) } } impl LocalEnv { pub fn pg_distrib_dir_raw(&self) -> PathBuf { self.pg_distrib_dir.clone() } pub fn pg_distrib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result { let path = self.pg_distrib_dir.clone(); Ok(path.join(pg_version.v_str())) } pub fn pg_dir(&self, pg_version: PgMajorVersion, dir_name: &str) -> anyhow::Result { Ok(self.pg_distrib_dir(pg_version)?.join(dir_name)) } pub fn pg_bin_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result { self.pg_dir(pg_version, "bin") } pub fn pg_lib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result { self.pg_dir(pg_version, "lib") } pub fn endpoint_storage_bin(&self) -> PathBuf { self.neon_distrib_dir.join("endpoint_storage") } pub fn pageserver_bin(&self) -> PathBuf { self.neon_distrib_dir.join("pageserver") } pub fn storage_controller_bin(&self) -> PathBuf { // Irrespective of configuration, storage controller binary is always // run from the same location as neon_local. This means that for compatibility // tests that run old pageserver/safekeeper, they still run latest storage controller. 
let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned(); neon_local_bin_dir.join("storage_controller") } pub fn safekeeper_bin(&self) -> PathBuf { self.neon_distrib_dir.join("safekeeper") } pub fn storage_broker_bin(&self) -> PathBuf { self.neon_distrib_dir.join("storage_broker") } pub fn endpoints_path(&self) -> PathBuf { self.base_data_dir.join("endpoints") } pub fn storage_broker_data_dir(&self) -> PathBuf { self.base_data_dir.join("storage_broker") } pub fn pageserver_data_dir(&self, pageserver_id: NodeId) -> PathBuf { self.base_data_dir .join(format!("pageserver_{pageserver_id}")) } pub fn safekeeper_data_dir(&self, data_dir_name: &str) -> PathBuf { self.base_data_dir.join("safekeepers").join(data_dir_name) } pub fn endpoint_storage_data_dir(&self) -> PathBuf { self.base_data_dir.join("endpoint_storage") } pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> { if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) { Ok(conf) } else { let have_ids = self .pageservers .iter() .map(|node| format!("{}:{}", node.id, node.listen_http_addr)) .collect::>(); let joined = have_ids.join(","); bail!("could not find pageserver {id}, have ids {joined}") } } pub fn ssl_ca_cert_path(&self) -> Option { if self.generate_local_ssl_certs { Some(self.base_data_dir.join("rootCA.crt")) } else { None } } pub fn ssl_ca_key_path(&self) -> Option { if self.generate_local_ssl_certs { Some(self.base_data_dir.join("rootCA.key")) } else { None } } pub fn generate_ssl_ca_cert(&self) -> anyhow::Result<()> { let cert_path = self.ssl_ca_cert_path().unwrap(); let key_path = self.ssl_ca_key_path().unwrap(); if !fs::exists(cert_path.as_path())? 
{ generate_ssl_ca_cert(cert_path.as_path(), key_path.as_path())?; } Ok(()) } pub fn generate_ssl_cert(&self, cert_path: &Path, key_path: &Path) -> anyhow::Result<()> { self.generate_ssl_ca_cert()?; generate_ssl_cert( cert_path, key_path, self.ssl_ca_cert_path().unwrap().as_path(), self.ssl_ca_key_path().unwrap().as_path(), ) } /// Creates HTTP client with local SSL CA certificates. pub fn create_http_client(&self) -> reqwest::Client { let ssl_ca_certs = self.ssl_ca_cert_path().map(|ssl_ca_file| { let buf = std::fs::read(ssl_ca_file).expect("SSL CA file should exist"); Certificate::from_pem_bundle(&buf).expect("SSL CA file should be valid") }); let mut http_client = reqwest::Client::builder(); for ssl_ca_cert in ssl_ca_certs.unwrap_or_default() { http_client = http_client.add_root_certificate(ssl_ca_cert); } http_client .build() .expect("HTTP client should construct with no error") } /// Inspect the base data directory and extract the instance id and instance directory path /// for all storage controller instances pub async fn storage_controller_instances(&self) -> std::io::Result> { let mut instances = Vec::default(); let dir = std::fs::read_dir(self.base_data_dir.clone())?; for dentry in dir { let dentry = dentry?; let is_dir = dentry.metadata()?.is_dir(); let filename = dentry.file_name().into_string().unwrap(); let parsed_instance_id = match filename.strip_prefix("storage_controller_") { Some(suffix) => suffix.parse::().ok(), None => None, }; let is_instance_dir = is_dir && parsed_instance_id.is_some(); if !is_instance_dir { continue; } instances.push(( parsed_instance_id.expect("Checked previously"), dentry.path(), )); } Ok(instances) } pub fn register_branch_mapping( &mut self, branch_name: String, tenant_id: TenantId, timeline_id: TimelineId, ) -> anyhow::Result<()> { let existing_values = self .branch_name_mappings .entry(branch_name.clone()) .or_default(); let existing_ids = existing_values .iter() .find(|(existing_tenant_id, _)| existing_tenant_id == 
&tenant_id); if let Some((_, old_timeline_id)) = existing_ids { if old_timeline_id == &timeline_id { Ok(()) } else { bail!( "branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}" ); } } else { existing_values.push((tenant_id, timeline_id)); Ok(()) } } pub fn get_branch_timeline_id( &self, branch_name: &str, tenant_id: TenantId, ) -> Option { self.branch_name_mappings .get(branch_name)? .iter() .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id) .map(|&(_, timeline_id)| timeline_id) } pub fn timeline_name_mappings(&self) -> HashMap { self.branch_name_mappings .iter() .flat_map(|(name, tenant_timelines)| { tenant_timelines.iter().map(|&(tenant_id, timeline_id)| { (TenantTimelineId::new(tenant_id, timeline_id), name.clone()) }) }) .collect() } /// Construct `Self` from on-disk state. pub fn load_config(repopath: &Path) -> anyhow::Result { if !repopath.exists() { bail!( "Neon config is not found in {}. You need to run 'neon_local init' first", repopath.to_str().unwrap() ); } // TODO: check that it looks like a neon repository // load and parse file let config_file_contents = fs::read_to_string(repopath.join("config"))?; let on_disk_config: OnDiskConfig = toml::from_str(config_file_contents.as_str())?; let mut env = { let OnDiskConfig { pg_distrib_dir, neon_distrib_dir, default_tenant_id, private_key_path, public_key_path, broker, storage_controller, pageservers, safekeepers, control_plane_api, control_plane_hooks_api, control_plane_compute_hook_api: _, branch_name_mappings, generate_local_ssl_certs, endpoint_storage, } = on_disk_config; LocalEnv { base_data_dir: repopath.to_owned(), pg_distrib_dir, neon_distrib_dir, default_tenant_id, private_key_path, public_key_path, broker, storage_controller, pageservers, safekeepers, control_plane_api: control_plane_api.unwrap(), control_plane_hooks_api, branch_name_mappings, generate_local_ssl_certs, endpoint_storage, } }; // The source of truth for 
pageserver configuration is the pageserver.toml. assert!( env.pageservers.is_empty(), "we ensure this during deserialization" ); env.pageservers = { let iter = std::fs::read_dir(repopath).context("open dir")?; let mut pageservers = Vec::new(); for res in iter { let dentry = res?; const PREFIX: &str = "pageserver_"; let dentry_name = dentry .file_name() .into_string() .ok() .with_context(|| format!("non-utf8 dentry: {:?}", dentry.path())) .unwrap(); if !dentry_name.starts_with(PREFIX) { continue; } if !dentry.file_type().context("determine file type")?.is_dir() { anyhow::bail!("expected a directory, got {:?}", dentry.path()); } let id = dentry_name[PREFIX.len()..] .parse::() .with_context(|| format!("parse id from {:?}", dentry.path()))?; // TODO(christian): use pageserver_api::config::ConfigToml (PR #7656) #[derive(serde::Serialize, serde::Deserialize)] // (allow unknown fields, unlike PageServerConf) struct PageserverConfigTomlSubset { listen_pg_addr: String, listen_http_addr: String, listen_https_addr: Option, listen_grpc_addr: Option, pg_auth_type: AuthType, http_auth_type: AuthType, grpc_auth_type: AuthType, #[serde(default)] no_sync: bool, } let config_toml_path = dentry.path().join("pageserver.toml"); let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str( &std::fs::read_to_string(&config_toml_path) .with_context(|| format!("read {config_toml_path:?}"))?, ) .context("parse pageserver.toml")?; let identity_toml_path = dentry.path().join("identity.toml"); #[derive(serde::Serialize, serde::Deserialize)] struct IdentityTomlSubset { id: NodeId, } let identity_toml: IdentityTomlSubset = toml_edit::de::from_str( &std::fs::read_to_string(&identity_toml_path) .with_context(|| format!("read {identity_toml_path:?}"))?, ) .context("parse identity.toml")?; let PageserverConfigTomlSubset { listen_pg_addr, listen_http_addr, listen_https_addr, listen_grpc_addr, pg_auth_type, http_auth_type, grpc_auth_type, no_sync, } = config_toml; let IdentityTomlSubset { id: 
identity_toml_id, } = identity_toml; let conf = PageServerConf { id: { anyhow::ensure!( identity_toml_id == id, "id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}", ); id }, listen_pg_addr, listen_http_addr, listen_https_addr, listen_grpc_addr, pg_auth_type, http_auth_type, grpc_auth_type, no_sync, }; pageservers.push(conf); } pageservers }; Ok(env) } pub fn persist_config(&self) -> anyhow::Result<()> { Self::persist_config_impl( &self.base_data_dir, &OnDiskConfig { pg_distrib_dir: self.pg_distrib_dir.clone(), neon_distrib_dir: self.neon_distrib_dir.clone(), default_tenant_id: self.default_tenant_id, private_key_path: self.private_key_path.clone(), public_key_path: self.public_key_path.clone(), broker: self.broker.clone(), storage_controller: self.storage_controller.clone(), pageservers: vec![], // it's skip_serializing anyway safekeepers: self.safekeepers.clone(), control_plane_api: Some(self.control_plane_api.clone()), control_plane_hooks_api: self.control_plane_hooks_api.clone(), control_plane_compute_hook_api: None, branch_name_mappings: self.branch_name_mappings.clone(), generate_local_ssl_certs: self.generate_local_ssl_certs, endpoint_storage: self.endpoint_storage.clone(), }, ) } pub fn persist_config_impl(base_path: &Path, config: &OnDiskConfig) -> anyhow::Result<()> { let conf_content = &toml::to_string_pretty(config)?; let target_config_path = base_path.join("config"); fs::write(&target_config_path, conf_content).with_context(|| { format!( "Failed to write config file into path '{}'", target_config_path.display() ) }) } // this function is used only for testing purposes in CLI e g generate tokens during init pub fn generate_auth_token(&self, claims: &S) -> anyhow::Result { let key = self.read_private_key()?; encode_from_key_file(claims, &key) } /// Get the path to the private key. 
// (The methods below are still inside `impl LocalEnv`, which is opened earlier in the file.)
    pub fn get_private_key_path(&self) -> PathBuf {
        // A relative key path is resolved against the base data directory.
        if self.private_key_path.is_absolute() {
            self.private_key_path.to_path_buf()
        } else {
            self.base_data_dir.join(&self.private_key_path)
        }
    }

    /// Get the path to the public key.
    pub fn get_public_key_path(&self) -> PathBuf {
        // A relative key path is resolved against the base data directory.
        if self.public_key_path.is_absolute() {
            self.public_key_path.to_path_buf()
        } else {
            self.base_data_dir.join(&self.public_key_path)
        }
    }

    /// Read the contents of the private key file.
    // NOTE(review): the return type parameter was missing in this copy of the file;
    // `pem::Pem` is assumed because the value comes straight from `pem::parse` — confirm.
    pub fn read_private_key(&self) -> anyhow::Result<pem::Pem> {
        let private_key_path = self.get_private_key_path();
        let pem = pem::parse(fs::read(private_key_path)?)?;
        Ok(pem)
    }

    /// Read the contents of the public key file.
    // NOTE(review): return type parameter assumed to be `pem::Pem`, as above — confirm.
    pub fn read_public_key(&self) -> anyhow::Result<pem::Pem> {
        let public_key_path = self.get_public_key_path();
        let pem = pem::parse(fs::read(public_key_path)?)?;
        Ok(pem)
    }

    /// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`].
    ///
    /// Prepares the base directory according to `force`, generates the JWT key pair (and the
    /// local CA when enabled), initializes the broker, safekeepers, pageservers and endpoint
    /// storage, creates the local remote-storage directories and finally persists the config.
    ///
    /// # Errors
    ///
    /// Fails if the base directory is in a state that `force` does not allow, if
    /// `control_plane_api` is not specified (previously a panic), or if any initialization
    /// step fails.
    pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> {
        let base_path = base_path();
        assert_ne!(base_path, Path::new(""));
        let base_path = &base_path;

        // create base_path dir
        if base_path.exists() {
            match force {
                InitForceMode::MustNotExist => {
                    bail!(
                        "directory '{}' already exists. Perhaps already initialized?",
                        base_path.display()
                    );
                }
                InitForceMode::EmptyDirOk => {
                    if let Some(res) = std::fs::read_dir(base_path)?.next() {
                        res.context("check if directory is empty")?;
                        anyhow::bail!("directory not empty: {base_path:?}");
                    }
                }
                InitForceMode::RemoveAllContents => {
                    println!("removing all contents of '{}'", base_path.display());
                    // Instead of directly calling `remove_dir_all`, we keep the original dir and
                    // remove all contents inside. This helps if the developer symlinks another
                    // directory (e.g., S3 local SSD) to the `.neon` base directory.
                    for entry in std::fs::read_dir(base_path)? {
                        let entry = entry?;
                        let path = entry.path();
                        if path.is_dir() {
                            fs::remove_dir_all(&path)?;
                        } else {
                            fs::remove_file(&path)?;
                        }
                    }
                }
            }
        }
        if !base_path.exists() {
            fs::create_dir(base_path)?;
        }

        let NeonLocalInitConf {
            pg_distrib_dir,
            neon_distrib_dir,
            default_tenant_id,
            broker,
            storage_controller,
            pageservers,
            safekeepers,
            control_plane_api,
            generate_local_ssl_certs,
            control_plane_hooks_api,
            endpoint_storage,
        } = conf;

        // Find postgres binaries.
        // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
        // Note that later in the code we assume, that distrib dirs follow the same pattern
        // for all postgres versions.
        let pg_distrib_dir = pg_distrib_dir.unwrap_or_else(|| {
            if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
                postgres_bin.into()
            } else {
                let cwd = env::current_dir().unwrap();
                cwd.join("pg_install")
            }
        });

        // Find neon binaries.
        let neon_distrib_dir = neon_distrib_dir
            .unwrap_or_else(|| env::current_exe().unwrap().parent().unwrap().to_owned());

        // Generate keypair for JWT.
        //
        // The keypair is only needed if authentication is enabled in any of the
        // components. For convenience, we generate the keypair even if authentication
        // is not enabled, so that you can easily enable it after the initialization
        // step.
        generate_auth_keys(
            base_path.join("auth_private_key.pem").as_path(),
            base_path.join("auth_public_key.pem").as_path(),
        )
        .context("generate auth keys")?;
        // Stored relative; `get_private_key_path`/`get_public_key_path` resolve them against
        // the base data directory.
        let private_key_path = PathBuf::from("auth_private_key.pem");
        let public_key_path = PathBuf::from("auth_public_key.pem");

        // create the runtime type because the remaining initialization code below needs
        // a LocalEnv instance to operate on
        // TODO: refactor to avoid this, LocalEnv should only be constructed from on-disk state
        let env = LocalEnv {
            base_data_dir: base_path.clone(),
            pg_distrib_dir,
            neon_distrib_dir,
            default_tenant_id: Some(default_tenant_id),
            private_key_path,
            public_key_path,
            broker,
            storage_controller: storage_controller.unwrap_or_default(),
            pageservers: pageservers.iter().map(Into::into).collect(),
            safekeepers,
            control_plane_api: control_plane_api
                .context("control_plane_api must be specified for init")?,
            control_plane_hooks_api,
            branch_name_mappings: Default::default(),
            generate_local_ssl_certs,
            endpoint_storage,
        };

        if generate_local_ssl_certs {
            env.generate_ssl_ca_cert()?;
        }

        // create endpoints dir
        fs::create_dir_all(env.endpoints_path())?;

        // create storage broker dir
        fs::create_dir_all(env.storage_broker_data_dir())?;
        StorageBroker::from_env(&env)
            .initialize()
            .context("storage broker init failed")?;

        // create safekeeper dirs
        for safekeeper in &env.safekeepers {
            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?;
            SafekeeperNode::from_env(&env, safekeeper)
                .initialize()
                .context("safekeeper init failed")?;
        }

        // initialize pageserver state
        for (i, ps) in pageservers.into_iter().enumerate() {
            let runtime_ps = &env.pageservers[i];
            assert_eq!(&PageServerConf::from(&ps), runtime_ps);
            fs::create_dir(env.pageserver_data_dir(ps.id))?;
            PageServerNode::from_env(&env, runtime_ps)
                .initialize(ps)
                .context("pageserver init failed")?;
        }

        EndpointStorage::from_env(&env)
            .init()
            .context("object storage init failed")?;

        // set up the remote location for the default LocalFs remote storage
        std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
        std::fs::create_dir_all(env.base_data_dir.join(ENDPOINT_STORAGE_REMOTE_STORAGE_DIR))?;

        env.persist_config()
    }
}

/// Returns the absolute path of the neon_local base data directory.
///
/// Uses `NEON_REPO_DIR` when set, otherwise `.neon` under the canonicalized current directory.
///
/// # Panics
///
/// Panics if `NEON_REPO_DIR` is not absolute, or if the current directory cannot be
/// determined or canonicalized.
pub fn base_path() -> PathBuf {
    let path = match std::env::var_os("NEON_REPO_DIR") {
        Some(val) => {
            let path = PathBuf::from(val);
            if !path.is_absolute() {
                // repeat the env var in the error because our default is always absolute
                panic!("NEON_REPO_DIR must be an absolute path, got {path:?}");
            }
            path
        }
        None => {
            let pwd = std::env::current_dir()
                // technically this can fail but it's quite unlikely
                .expect("determine current directory");
            let pwd_abs = pwd.canonicalize().expect("canonicalize current directory");
            pwd_abs.join(".neon")
        }
    };
    assert!(path.is_absolute());
    path
}

/// Generate a public/private key pair for JWT authentication
///
/// Shells out to `openssl`; the paths are passed as OS strings, so they need not be UTF-8.
///
/// # Errors
///
/// Fails if `openssl` cannot be spawned or exits unsuccessfully (its stderr is included).
fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow::Result<()> {
    // Generate the key pair
    //
    // openssl genpkey -algorithm ed25519 -out auth_private_key.pem
    let keygen_output = Command::new("openssl")
        .arg("genpkey")
        .args(["-algorithm", "ed25519"])
        .arg("-out")
        .arg(private_key_path)
        .stdout(Stdio::null())
        .output()
        .context("failed to generate auth private key")?;
    if !keygen_output.status.success() {
        bail!(
            "openssl failed: '{}'",
            String::from_utf8_lossy(&keygen_output.stderr)
        );
    }

    // Extract the public key from the private key file
    //
    // openssl pkey -in auth_private_key.pem -pubout -out auth_public_key.pem
    let keygen_output = Command::new("openssl")
        .arg("pkey")
        .arg("-in")
        .arg(private_key_path)
        .arg("-pubout")
        .arg("-out")
        .arg(public_key_path)
        .output()
        .context("failed to extract public key from private key")?;
    if !keygen_output.status.success() {
        bail!(
            "openssl failed: '{}'",
            String::from_utf8_lossy(&keygen_output.stderr)
        );
    }

    Ok(())
}

/// Generates a self-signed root CA certificate and key with `openssl`.
///
/// # Errors
///
/// Fails if `openssl` cannot be spawned or exits unsuccessfully (its stderr is included).
fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()> {
    // openssl req -x509 -newkey ed25519 -nodes -subj "/CN=Neon Local CA" -days 36500 \
    // -out rootCA.crt -keyout rootCA.key
    let keygen_output = Command::new("openssl")
        .args([
            "req", "-x509", "-newkey", "ed25519", "-nodes", "-days", "36500",
        ])
        .args(["-subj", "/CN=Neon Local CA"])
        .arg("-out")
        .arg(cert_path)
        .arg("-keyout")
        .arg(key_path)
        .output()
        .context("failed to generate CA certificate")?;
    if !keygen_output.status.success() {
        bail!(
            "openssl failed: '{}'",
            String::from_utf8_lossy(&keygen_output.stderr)
        );
    }
    Ok(())
}

/// Generates a key and a certificate for `localhost`/`127.0.0.1`, signed by the given CA.
///
/// A CSR is written next to the certificate (same file name, `csr` extension) and removed
/// again once the certificate has been signed.
///
/// # Errors
///
/// Fails if `openssl` cannot be spawned or exits unsuccessfully, or if the CSR file cannot
/// be removed.
fn generate_ssl_cert(
    cert_path: &Path,
    key_path: &Path,
    ca_cert_path: &Path,
    ca_key_path: &Path,
) -> anyhow::Result<()> {
    // Generate Certificate Signing Request (CSR).
    let mut csr_path = cert_path.to_path_buf();
    // `set_extension` takes the extension without the leading dot; passing ".csr" would
    // produce a doubled dot ("server..csr").
    csr_path.set_extension("csr");

    // openssl req -new -nodes -newkey ed25519 -keyout server.key -out server.csr \
    // -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1"
    let keygen_output = Command::new("openssl")
        .args(["req", "-new", "-nodes"])
        .args(["-newkey", "ed25519"])
        .args(["-subj", "/CN=localhost"])
        .args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"])
        .arg("-keyout")
        .arg(key_path)
        .arg("-out")
        .arg(&csr_path)
        .output()
        .context("failed to generate CSR")?;
    if !keygen_output.status.success() {
        bail!(
            "openssl failed: '{}'",
            String::from_utf8_lossy(&keygen_output.stderr)
        );
    }

    // Sign CSR with CA key.
    //
    // openssl x509 -req -in server.csr -CA rootCA.crt -CAkey rootCA.key -CAcreateserial \
    // -out server.crt -days 36500 -copy_extensions copyall
    let keygen_output = Command::new("openssl")
        .args(["x509", "-req"])
        .arg("-in")
        .arg(&csr_path)
        .arg("-CA")
        .arg(ca_cert_path)
        .arg("-CAkey")
        .arg(ca_key_path)
        .arg("-CAcreateserial")
        .arg("-out")
        .arg(cert_path)
        .args(["-days", "36500"])
        .args(["-copy_extensions", "copyall"])
        .output()
        .context("failed to sign CSR")?;
    if !keygen_output.status.success() {
        bail!(
            "openssl failed: '{}'",
            String::from_utf8_lossy(&keygen_output.stderr)
        );
    }

    // Remove CSR file as it's not needed anymore.
    fs::remove_file(csr_path)?;

    Ok(())
}

================================================
FILE: control_plane/src/pageserver.rs
================================================
//! Code to manage pageservers
//!
//! In the local test environment, the data for each pageserver is stored in
//!
//! ```text
//! .neon/pageserver_
//! ```
//!
use std::collections::HashMap;
use std::io;
use std::io::Write;
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;

use anyhow::{Context, bail};
use camino::Utf8PathBuf;
use pageserver_api::config::{DEFAULT_GRPC_LISTEN_PORT, DEFAULT_HTTP_LISTEN_PORT};
use pageserver_api::models::{self, TenantInfo, TimelineInfo};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
use postgres_connection::{PgConnectionConfig, parse_host_port};
use safekeeper_api::PgMajorVersion;
use utils::auth::{Claims, Scope};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;

use crate::background_process;
use crate::local_env::{LocalEnv, NeonLocalInitPageserverConf, PageServerConf};

/// Directory within .neon which will be used by default for LocalFs remote storage.
pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver"; // // Control routines for pageserver. // // Used in CLI and tests. // #[derive(Debug)] pub struct PageServerNode { pub pg_connection_config: PgConnectionConfig, pub conf: PageServerConf, pub env: LocalEnv, pub http_client: mgmt_api::Client, } impl PageServerNode { pub fn from_env(env: &LocalEnv, conf: &PageServerConf) -> PageServerNode { let (host, port) = parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); let port = port.unwrap_or(5432); let endpoint = if env.storage_controller.use_https_pageserver_api { format!( "https://{}", conf.listen_https_addr.as_ref().expect( "listen https address should be specified if use_https_pageserver_api is on" ) ) } else { format!("http://{}", conf.listen_http_addr) }; Self { pg_connection_config: PgConnectionConfig::new_host_port(host, port), conf: conf.clone(), env: env.clone(), http_client: mgmt_api::Client::new( env.create_http_client(), endpoint, { match conf.http_auth_type { AuthType::Trust => None, AuthType::NeonJWT => Some( env.generate_auth_token(&Claims::new(None, Scope::PageServerApi)) .unwrap(), ), } } .as_deref(), ), } } fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::DocumentMut { toml_edit::DocumentMut::from_str(&format!("id={node_id}")).unwrap() } fn pageserver_init_make_toml( &self, conf: NeonLocalInitPageserverConf, ) -> anyhow::Result { assert_eq!( &PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully" ); // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656) // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. 
let pg_distrib_dir_param = format!( "pg_distrib_dir='{}'", self.env.pg_distrib_dir_raw().display() ); let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param]; overrides.push(format!( "control_plane_api='{}'", self.env.control_plane_api.as_str() )); // Storage controller uses the same auth as pageserver: if JWT is enabled // for us, we will also need it to talk to them. if matches!(conf.http_auth_type, AuthType::NeonJWT) { let jwt_token = self .env .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) .unwrap(); overrides.push(format!("control_plane_api_token='{jwt_token}'")); } if !conf.other.contains_key("remote_storage") { overrides.push(format!( "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}" )); } if [conf.http_auth_type, conf.pg_auth_type, conf.grpc_auth_type] .contains(&AuthType::NeonJWT) { // Keys are generated in the toplevel repo dir, pageservers' workdirs // are one level below that, so refer to keys with ../ overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned()); } if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() { overrides.push(format!("ssl_ca_file='{}'", ssl_ca_file.to_str().unwrap())); } // Apply the user-provided overrides overrides.push({ let mut doc = toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier"); // `id` is written out to `identity.toml` instead of `pageserver.toml` doc.remove("id").expect("it's part of the struct"); doc.to_string() }); // Turn `overrides` into a toml document. // TODO: above code is legacy code, it should be refactored to use toml_edit directly. 
let mut config_toml = toml_edit::DocumentMut::new(); for fragment_str in overrides { let fragment = toml_edit::DocumentMut::from_str(&fragment_str) .expect("all fragments in `overrides` are valid toml documents, this function controls that"); for (key, item) in fragment.iter() { config_toml.insert(key, item.clone()); } } Ok(config_toml) } /// Initializes a pageserver node by creating its config with the overrides provided. pub fn initialize(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { self.pageserver_init(conf) .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id)) } pub fn repo_path(&self) -> PathBuf { self.env.pageserver_data_dir(self.conf.id) } /// The pid file is created by the pageserver process, with its pid stored inside. /// Other pageservers cannot lock the same file and overwrite it for as long as the current /// pageserver runs. (Unless someone removes the file manually; never do that!) fn pid_file(&self) -> Utf8PathBuf { Utf8PathBuf::from_path_buf(self.repo_path().join("pageserver.pid")) .expect("non-Unicode path") } pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> { self.start_node(retry_timeout).await } fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { let datadir = self.repo_path(); let node_id = self.conf.id; println!( "Initializing pageserver node {} at '{}' in {:?}", node_id, self.pg_connection_config.raw_address(), datadir ); io::stdout().flush()?; // If the config file we got as a CLI argument includes the `availability_zone` // config, then use that to populate the `metadata.json` file for the pageserver. // In production the deployment orchestrator does this for us. 
let az_id = conf .other .get("availability_zone") .map(|toml| { let az_str = toml.to_string(); // Trim the (") chars from the toml representation if az_str.starts_with('"') && az_str.ends_with('"') { az_str[1..az_str.len() - 1].to_string() } else { az_str } }) .unwrap_or("local".to_string()); let config = self .pageserver_init_make_toml(conf) .context("make pageserver toml")?; let config_file_path = datadir.join("pageserver.toml"); let mut config_file = std::fs::OpenOptions::new() .create_new(true) .write(true) .open(&config_file_path) .with_context(|| format!("open pageserver toml for write: {config_file_path:?}"))?; config_file .write_all(config.to_string().as_bytes()) .context("write pageserver toml")?; drop(config_file); let identity_file_path = datadir.join("identity.toml"); let mut identity_file = std::fs::OpenOptions::new() .create_new(true) .write(true) .open(&identity_file_path) .with_context(|| format!("open identity toml for write: {identity_file_path:?}"))?; let identity_toml = self.pageserver_make_identity_toml(node_id); identity_file .write_all(identity_toml.to_string().as_bytes()) .context("write identity toml")?; drop(identity_toml); if self.env.generate_local_ssl_certs { self.env.generate_ssl_cert( datadir.join("server.crt").as_path(), datadir.join("server.key").as_path(), )?; } // TODO: invoke a TBD config-check command to validate that pageserver will start with the written config // Write metadata file, used by pageserver on startup to register itself with // the storage controller let metadata_path = datadir.join("metadata.json"); let http_host = "localhost".to_string(); let (_, http_port) = parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr"); let http_port = http_port.unwrap_or(DEFAULT_HTTP_LISTEN_PORT); let https_port = match self.conf.listen_https_addr.as_ref() { Some(https_addr) => { let (_https_host, https_port) = parse_host_port(https_addr).expect("Unable to parse listen_https_addr"); 
Some(https_port.unwrap_or(9899)) } None => None, }; let (mut grpc_host, mut grpc_port) = (None, None); if let Some(grpc_addr) = &self.conf.listen_grpc_addr { let (_, port) = parse_host_port(grpc_addr).expect("Unable to parse listen_grpc_addr"); grpc_host = Some("localhost".to_string()); grpc_port = Some(port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT)); } // Intentionally hand-craft JSON: this acts as an implicit format compat test // in case the pageserver-side structure is edited, and reflects the real life // situation: the metadata is written by some other script. std::fs::write( metadata_path, serde_json::to_vec(&pageserver_api::config::NodeMetadata { postgres_host: "localhost".to_string(), postgres_port: self.pg_connection_config.port(), grpc_host, grpc_port, http_host, http_port, https_port, other: HashMap::from([( "availability_zone_id".to_string(), serde_json::json!(az_id), )]), }) .unwrap(), ) .expect("Failed to write metadata file"); Ok(()) } async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> { // TODO: using a thread here because start_process() is not async but we need to call check_status() let datadir = self.repo_path(); println!( "Starting pageserver node {} at '{}' in {:?}, retrying for {:?}", self.conf.id, self.pg_connection_config.raw_address(), datadir, retry_timeout ); io::stdout().flush().context("flush stdout")?; let datadir_path_str = datadir.to_str().with_context(|| { format!( "Cannot start pageserver node {} in path that has no string representation: {:?}", self.conf.id, datadir, ) })?; let args = vec!["-D", datadir_path_str]; background_process::start_process( "pageserver", &datadir, &self.env.pageserver_bin(), args, self.pageserver_env_variables()?, background_process::InitialPidFile::Expect(self.pid_file()), retry_timeout, || async { let st = self.check_status().await; match st { Ok(()) => Ok(true), Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false), Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), } }, 
) .await?; Ok(()) } fn pageserver_env_variables(&self) -> anyhow::Result> { // FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper // needs a token, and how to generate that token, seems independent to whether // the pageserver requires a token in incoming requests. Ok(if self.conf.http_auth_type != AuthType::Trust { // Generate a token to connect from the pageserver to a safekeeper let token = self .env .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; vec![("NEON_AUTH_TOKEN".to_owned(), token)] } else { Vec::new() }) } /// /// Stop the server. /// /// If 'immediate' is true, we use SIGQUIT, killing the process immediately. /// Otherwise we use SIGTERM, triggering a clean shutdown /// /// If the server is not running, returns success /// pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { background_process::stop_process(immediate, "pageserver", &self.pid_file()) } pub async fn check_status(&self) -> mgmt_api::Result<()> { self.http_client.status().await } pub async fn tenant_list(&self) -> mgmt_api::Result> { self.http_client.list_tenants().await } pub fn parse_config(mut settings: HashMap<&str, &str>) -> anyhow::Result { let result = models::TenantConfig { checkpoint_distance: settings .remove("checkpoint_distance") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'checkpoint_distance' as an integer")?, checkpoint_timeout: settings .remove("checkpoint_timeout") .map(humantime::parse_duration) .transpose() .context("Failed to parse 'checkpoint_timeout' as duration")?, compaction_target_size: settings .remove("compaction_target_size") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_target_size' as an integer")?, compaction_period: settings .remove("compaction_period") .map(humantime::parse_duration) .transpose() .context("Failed to parse 'compaction_period' as duration")?, compaction_threshold: settings .remove("compaction_threshold") .map(|x| x.parse::()) .transpose() 
.context("Failed to parse 'compaction_threshold' as an integer")?, compaction_upper_limit: settings .remove("compaction_upper_limit") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_upper_limit' as an integer")?, compaction_algorithm: settings .remove("compaction_algorithm") .map(serde_json::from_str) .transpose() .context("Failed to parse 'compaction_algorithm' json")?, compaction_shard_ancestor: settings .remove("compaction_shard_ancestor") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_shard_ancestor' as a bool")?, compaction_l0_first: settings .remove("compaction_l0_first") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_l0_first' as a bool")?, compaction_l0_semaphore: settings .remove("compaction_l0_semaphore") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_l0_semaphore' as a bool")?, l0_flush_delay_threshold: settings .remove("l0_flush_delay_threshold") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'l0_flush_delay_threshold' as an integer")?, l0_flush_stall_threshold: settings .remove("l0_flush_stall_threshold") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'l0_flush_stall_threshold' as an integer")?, gc_horizon: settings .remove("gc_horizon") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_horizon' as an integer")?, gc_period: settings.remove("gc_period") .map(humantime::parse_duration) .transpose() .context("Failed to parse 'gc_period' as duration")?, image_creation_threshold: settings .remove("image_creation_threshold") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_threshold' as non zero integer")?, // HADRON image_layer_force_creation_period: settings .remove("image_layer_force_creation_period") .map(humantime::parse_duration) .transpose() .context("Failed to parse 'image_layer_force_creation_period' as duration")?, image_layer_creation_check_threshold: settings 
.remove("image_layer_creation_check_threshold") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_check_threshold' as integer")?, image_creation_preempt_threshold: settings .remove("image_creation_preempt_threshold") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_preempt_threshold' as integer")?, pitr_interval: settings.remove("pitr_interval") .map(humantime::parse_duration) .transpose() .context("Failed to parse 'pitr_interval' as duration")?, walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") .map(humantime::parse_duration) .transpose() .context("Failed to parse 'walreceiver_connect_timeout' as duration")?, lagging_wal_timeout: settings .remove("lagging_wal_timeout") .map(humantime::parse_duration) .transpose() .context("Failed to parse 'lagging_wal_timeout' as duration")?, max_lsn_wal_lag: settings .remove("max_lsn_wal_lag") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, eviction_policy: settings .remove("eviction_policy") .map(serde_json::from_str) .transpose() .context("Failed to parse 'eviction_policy' json")?, min_resident_size_override: settings .remove("min_resident_size_override") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'min_resident_size_override' as integer")?, evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") .map(humantime::parse_duration) .transpose() .context("Failed to parse 'evictions_low_residence_duration_metric_threshold' as duration")?, heatmap_period: settings .remove("heatmap_period") .map(humantime::parse_duration) .transpose() .context("Failed to parse 'heatmap_period' as duration")?, lazy_slru_download: settings .remove("lazy_slru_download") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'lazy_slru_download' as bool")?, timeline_get_throttle: settings .remove("timeline_get_throttle") 
.map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, lsn_lease_length: settings.remove("lsn_lease_length") .map(humantime::parse_duration) .transpose() .context("Failed to parse 'lsn_lease_length' as duration")?, lsn_lease_length_for_ts: settings .remove("lsn_lease_length_for_ts") .map(humantime::parse_duration) .transpose() .context("Failed to parse 'lsn_lease_length_for_ts' as duration")?, timeline_offloading: settings .remove("timeline_offloading") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'timeline_offloading' as bool")?, rel_size_v2_enabled: settings .remove("rel_size_v2_enabled") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'rel_size_v2_enabled' as bool")?, gc_compaction_enabled: settings .remove("gc_compaction_enabled") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_compaction_enabled' as bool")?, gc_compaction_verification: settings .remove("gc_compaction_verification") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_compaction_verification' as bool")?, gc_compaction_initial_threshold_kb: settings .remove("gc_compaction_initial_threshold_kb") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_compaction_initial_threshold_kb' as integer")?, gc_compaction_ratio_percent: settings .remove("gc_compaction_ratio_percent") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_compaction_ratio_percent' as integer")?, sampling_ratio: settings .remove("sampling_ratio") .map(serde_json::from_str) .transpose() .context("Failed to parse 'sampling_ratio'")?, relsize_snapshot_cache_capacity: settings .remove("relsize snapshot cache capacity") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'relsize_snapshot_cache_capacity' as integer")?, basebackup_cache_enabled: settings .remove("basebackup_cache_enabled") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'basebackup_cache_enabled' as bool")?, }; if 
!settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") } else { Ok(result) } } pub async fn tenant_config( &self, tenant_id: TenantId, settings: HashMap<&str, &str>, ) -> anyhow::Result<()> { let config = Self::parse_config(settings)?; self.http_client .set_tenant_config(&models::TenantConfigRequest { tenant_id, config }) .await?; Ok(()) } pub async fn timeline_list( &self, tenant_shard_id: &TenantShardId, ) -> anyhow::Result> { Ok(self.http_client.list_timelines(*tenant_shard_id).await?) } /// Import a basebackup prepared using either: /// a) `pg_basebackup -F tar`, or /// b) The `fullbackup` pageserver endpoint /// /// # Arguments /// * `tenant_id` - tenant to import into. Created if not exists /// * `timeline_id` - id to assign to imported timeline /// * `base` - (start lsn of basebackup, path to `base.tar` file) /// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`) pub async fn timeline_import( &self, tenant_id: TenantId, timeline_id: TimelineId, base: (Lsn, PathBuf), pg_wal: Option<(Lsn, PathBuf)>, pg_version: PgMajorVersion, ) -> anyhow::Result<()> { // Init base reader let (start_lsn, base_tarfile_path) = base; let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?; let base_tarfile = mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile)); // Init wal reader if necessary let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal { let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?; let wal_reader = mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile)); (end_lsn, Some(wal_reader)) } else { (start_lsn, None) }; // Import base self.http_client .import_basebackup( tenant_id, timeline_id, start_lsn, end_lsn, pg_version, base_tarfile, ) .await?; // Import wal if necessary if let Some(wal_reader) = wal_reader { self.http_client .import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader) .await?; } Ok(()) } pub 
async fn timeline_info( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, force_await_logical_size: mgmt_api::ForceAwaitLogicalSize, ) -> anyhow::Result { let timeline_info = self .http_client .timeline_info(tenant_shard_id, timeline_id, force_await_logical_size) .await?; Ok(timeline_info) } } ================================================ FILE: control_plane/src/postgresql_conf.rs ================================================ use std::collections::HashMap; use std::fmt; /// /// Module for parsing postgresql.conf file. /// /// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just /// enough to extract a few settings we need in Neon, assuming you don't do /// funny stuff like include-directives or funny escaping. use once_cell::sync::Lazy; use regex::Regex; /// In-memory representation of a postgresql.conf file #[derive(Default, Debug)] pub struct PostgresConf { lines: Vec, hash: HashMap, } impl PostgresConf { pub fn new() -> PostgresConf { PostgresConf::default() } /// Return the current value of 'option' pub fn get(&self, option: &str) -> Option<&str> { self.hash.get(option).map(|x| x.as_ref()) } /// /// Note: if you call this multiple times for the same option, the config /// file will a line for each call. It would be nice to have a function /// to change an existing line, but that's a TODO. /// pub fn append(&mut self, option: &str, value: &str) { self.lines .push(format!("{}={}\n", option, escape_str(value))); self.hash.insert(option.to_string(), value.to_string()); } /// Append an arbitrary non-setting line to the config file pub fn append_line(&mut self, line: &str) { self.lines.push(line.to_string()); } } impl fmt::Display for PostgresConf { /// Return the whole configuration file as a string fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { for line in self.lines.iter() { f.write_str(line)?; } Ok(()) } } /// Escape a value for putting in postgresql.conf. 
fn escape_str(s: &str) -> String { // If the string doesn't contain anything that needs quoting or escaping, return it // as it is. // // The first part of the regex, before the '|', matches the INTEGER rule in the // PostgreSQL flex grammar (guc-file.l). It matches plain integers like "123" and // "-123", and also accepts units like "10MB". The second part of the regex matches // the UNQUOTED_STRING rule, and accepts strings that contain a single word, beginning // with a letter. That covers words like "off" or "posix". Everything else is quoted. // // This regex is a bit more conservative than the rules in guc-file.l, so we quote some // strings that PostgreSQL would accept without quoting, but that's OK. static UNQUOTED_RE: Lazy = Lazy::new(|| Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap()); if UNQUOTED_RE.is_match(s) { s.to_string() } else { // Otherwise escape and quote it let s = s .replace('\\', "\\\\") .replace('\n', "\\n") .replace('\'', "''"); "\'".to_owned() + &s + "\'" } } #[test] fn test_postgresql_conf_escapes() -> anyhow::Result<()> { assert_eq!(escape_str("foo bar"), "'foo bar'"); // these don't need to be quoted assert_eq!(escape_str("foo"), "foo"); assert_eq!(escape_str("123"), "123"); assert_eq!(escape_str("+123"), "+123"); assert_eq!(escape_str("-10"), "-10"); assert_eq!(escape_str("1foo"), "1foo"); assert_eq!(escape_str("foo1"), "foo1"); assert_eq!(escape_str("10MB"), "10MB"); assert_eq!(escape_str("-10kB"), "-10kB"); // these need quoting and/or escaping assert_eq!(escape_str("foo bar"), "'foo bar'"); assert_eq!(escape_str("fo'o"), "'fo''o'"); assert_eq!(escape_str("fo\no"), "'fo\\no'"); assert_eq!(escape_str("fo\\o"), "'fo\\\\o'"); assert_eq!(escape_str("10 cats"), "'10 cats'"); Ok(()) } ================================================ FILE: control_plane/src/safekeeper.rs ================================================ //! Code to manage safekeepers //! //! 
In the local test environment, the data for each safekeeper is stored in //! //! ```text //! .neon/safekeepers/ //! ``` use std::error::Error as _; use std::io::Write; use std::path::PathBuf; use std::time::Duration; use std::{io, result}; use anyhow::Context; use camino::Utf8PathBuf; use postgres_connection::PgConnectionConfig; use safekeeper_api::models::TimelineCreateRequest; use safekeeper_client::mgmt_api; use thiserror::Error; use utils::auth::{Claims, Scope}; use utils::id::NodeId; use crate::background_process; use crate::local_env::{LocalEnv, SafekeeperConf}; #[derive(Error, Debug)] pub enum SafekeeperHttpError { #[error("request error: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] Transport(#[from] reqwest::Error), #[error("Error: {0}")] Response(String), } type Result = result::Result; fn err_from_client_err(err: mgmt_api::Error) -> SafekeeperHttpError { use mgmt_api::Error::*; match err { ApiError(_, str) => SafekeeperHttpError::Response(str), Cancelled => SafekeeperHttpError::Response("Cancelled".to_owned()), ReceiveBody(err) => SafekeeperHttpError::Transport(err), ReceiveErrorBody(err) => SafekeeperHttpError::Response(err), Timeout(str) => SafekeeperHttpError::Response(format!("timeout: {str}")), } } // // Control routines for safekeeper. // // Used in CLI and tests. 
// #[derive(Debug)] pub struct SafekeeperNode { pub id: NodeId, pub conf: SafekeeperConf, pub pg_connection_config: PgConnectionConfig, pub env: LocalEnv, pub http_client: mgmt_api::Client, pub listen_addr: String, } impl SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { let listen_addr = if let Some(ref listen_addr) = conf.listen_addr { listen_addr.clone() } else { "127.0.0.1".to_string() }; let jwt = None; let http_base_url = format!("http://{}:{}", listen_addr, conf.http_port); SafekeeperNode { id: conf.id, conf: conf.clone(), pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port), env: env.clone(), http_client: mgmt_api::Client::new(env.create_http_client(), http_base_url, jwt), listen_addr, } } /// Construct libpq connection string for connecting to this safekeeper. fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig { PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port) } pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf { env.safekeeper_data_dir(&format!("sk{sk_id}")) } pub fn datadir_path(&self) -> PathBuf { SafekeeperNode::datadir_path_by_id(&self.env, self.id) } pub fn pid_file(&self) -> Utf8PathBuf { Utf8PathBuf::from_path_buf(self.datadir_path().join("safekeeper.pid")) .expect("non-Unicode path") } /// Initializes a safekeeper node by creating all necessary files, /// e.g. SSL certificates and JWT token file. 
pub fn initialize(&self) -> anyhow::Result<()> { if self.env.generate_local_ssl_certs { self.env.generate_ssl_cert( &self.datadir_path().join("server.crt"), &self.datadir_path().join("server.key"), )?; } // Generate a token file for authentication with other safekeepers if self.conf.auth_enabled { let token = self .env .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; let token_path = self.datadir_path().join("peer_jwt_token"); std::fs::write(token_path, token)?; } Ok(()) } pub async fn start( &self, extra_opts: &[String], retry_timeout: &Duration, ) -> anyhow::Result<()> { println!( "Starting safekeeper at '{}' in '{}', retrying for {:?}", self.pg_connection_config.raw_address(), self.datadir_path().display(), retry_timeout, ); io::stdout().flush().unwrap(); let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port); let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port); let id = self.id; let datadir = self.datadir_path(); let id_string = id.to_string(); // TODO: add availability_zone to the config. // Right now we just specify any value here and use it to check metrics in tests. let availability_zone = format!("sk-{id_string}"); let mut args = vec![ "-D".to_owned(), datadir .to_str() .with_context(|| { format!("Datadir path {datadir:?} cannot be represented as a unicode string") })? 
.to_owned(), "--id".to_owned(), id_string, "--listen-pg".to_owned(), listen_pg, "--listen-http".to_owned(), listen_http, "--availability-zone".to_owned(), availability_zone, ]; if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port { let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port); args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]); } if !self.conf.sync { args.push("--no-sync".to_owned()); } let broker_endpoint = format!("{}", self.env.broker.client_url()); args.extend(["--broker-endpoint".to_owned(), broker_endpoint]); let mut backup_threads = String::new(); if let Some(threads) = self.conf.backup_threads { backup_threads = threads.to_string(); args.extend(["--backup-threads".to_owned(), backup_threads]); } else { drop(backup_threads); } if let Some(ref remote_storage) = self.conf.remote_storage { args.extend(["--remote-storage".to_owned(), remote_storage.clone()]); } let key_path = self.env.base_data_dir.join("auth_public_key.pem"); if self.conf.auth_enabled { let key_path_string = key_path .to_str() .with_context(|| { format!("Key path {key_path:?} cannot be represented as a unicode string") })? .to_owned(); args.extend([ "--pg-auth-public-key-path".to_owned(), key_path_string.clone(), ]); args.extend([ "--pg-tenant-only-auth-public-key-path".to_owned(), key_path_string.clone(), ]); args.extend([ "--http-auth-public-key-path".to_owned(), key_path_string.clone(), ]); } if let Some(https_port) = self.conf.https_port { args.extend([ "--listen-https".to_owned(), format!("{}:{}", self.listen_addr, https_port), ]); } if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() { args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap())); } if self.conf.auth_enabled { let token_path = self.datadir_path().join("peer_jwt_token"); let token_path_str = token_path .to_str() .with_context(|| { format!("Token path {token_path:?} cannot be represented as a unicode string") })? 
.to_owned(); args.extend(["--auth-token-path".to_owned(), token_path_str]); } args.extend_from_slice(extra_opts); let env_variables = Vec::new(); background_process::start_process( &format!("safekeeper-{id}"), &datadir, &self.env.safekeeper_bin(), &args, env_variables, background_process::InitialPidFile::Expect(self.pid_file()), retry_timeout, || async { match self.check_status().await { Ok(()) => Ok(true), Err(SafekeeperHttpError::Transport(_)) => Ok(false), Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), } }, ) .await } /// /// Stop the server. /// /// If 'immediate' is true, we use SIGQUIT, killing the process immediately. /// Otherwise we use SIGTERM, triggering a clean shutdown /// /// If the server is not running, returns success /// pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { background_process::stop_process( immediate, &format!("safekeeper {}", self.id), &self.pid_file(), ) } pub async fn check_status(&self) -> Result<()> { self.http_client .status() .await .map_err(err_from_client_err)?; Ok(()) } pub async fn create_timeline(&self, req: &TimelineCreateRequest) -> Result<()> { self.http_client .create_timeline(req) .await .map_err(err_from_client_err)?; Ok(()) } } ================================================ FILE: control_plane/src/storage_controller.rs ================================================ use std::ffi::OsStr; use std::fs; use std::path::PathBuf; use std::process::ExitStatus; use std::str::FromStr; use std::sync::OnceLock; use std::time::{Duration, Instant}; use crate::background_process; use crate::local_env::{LocalEnv, NeonStorageControllerConf}; use camino::{Utf8Path, Utf8PathBuf}; use hyper0::Uri; use nix::unistd::Pid; use pageserver_api::controller_api::{ NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantLocateResponse, }; use pageserver_api::models::{ TenantConfig, 
/// Arguments for stopping a storage controller instance.
pub struct NeonStorageControllerStopArgs {
    /// Which storage controller instance to stop.
    pub instance_id: u8,
    /// Whether to stop immediately.
    pub immediate: bool,
}

impl NeonStorageControllerStopArgs {
    /// Stop arguments for instance 1, which this module uses as the default instance id.
    pub fn with_default_instance_id(immediate: bool) -> Self {
        let instance_id = 1;
        Self {
            instance_id,
            immediate,
        }
    }
}
tenant_shard_id: TenantShardId, } #[derive(Serialize, Deserialize)] pub struct InspectResponse { pub attachment: Option<(u32, NodeId)>, } impl StorageController { pub fn from_env(env: &LocalEnv) -> Self { // Assume all pageservers have symmetric auth configuration: this service // expects to use one JWT token to talk to all of them. let ps_conf = env .pageservers .first() .expect("Config is validated to contain at least one pageserver"); let (private_key, public_key) = match ps_conf.http_auth_type { AuthType::Trust => (None, None), AuthType::NeonJWT => { let private_key_path = env.get_private_key_path(); let private_key = pem::parse(fs::read(private_key_path).expect("failed to read private key")) .expect("failed to parse PEM file"); // If pageserver auth is enabled, this implicitly enables auth for this service, // using the same credentials. let public_key_path = camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem")) .unwrap(); // This service takes keys as a string rather than as a path to a file/dir: read the key into memory. let public_key = if std::fs::metadata(&public_key_path) .expect("Can't stat public key") .is_dir() { // Our config may specify a directory: this is for the pageserver's ability to handle multiple // keys. We only use one key at a time, so, arbitrarily load the first one in the directory. 
let mut dir = std::fs::read_dir(&public_key_path).expect("Can't readdir public key path"); let dent = dir .next() .expect("Empty key dir") .expect("Error reading key dir"); pem::parse(std::fs::read_to_string(dent.path()).expect("Can't read public key")) .expect("Failed to parse PEM file") } else { pem::parse( std::fs::read_to_string(&public_key_path).expect("Can't read public key"), ) .expect("Failed to parse PEM file") }; (Some(private_key), Some(public_key)) } }; Self { env: env.clone(), private_key, public_key, client: env.create_http_client(), config: env.storage_controller.clone(), listen_port: OnceLock::default(), } } fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf { self.env .base_data_dir .join(format!("storage_controller_{instance_id}")) } fn pid_file(&self, instance_id: u8) -> Utf8PathBuf { Utf8PathBuf::from_path_buf( self.storage_controller_instance_dir(instance_id) .join("storage_controller.pid"), ) .expect("non-Unicode path") } /// Find the directory containing postgres subdirectories, such `bin` and `lib` /// /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back /// to other versions if that one isn't found. Some automated tests create circumstances /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result { const PREFER_VERSIONS: [PgMajorVersion; 5] = [ STORAGE_CONTROLLER_POSTGRES_VERSION, PgMajorVersion::PG16, PgMajorVersion::PG15, PgMajorVersion::PG14, PgMajorVersion::PG17, ]; for v in PREFER_VERSIONS { let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap(); if tokio::fs::try_exists(&path).await? 
{ return Ok(path); } } // Fall through anyhow::bail!( "Postgres directory '{}' not found in {}", dir_name, self.env.pg_distrib_dir.display(), ); } pub async fn get_pg_bin_dir(&self) -> anyhow::Result { self.get_pg_dir("bin").await } pub async fn get_pg_lib_dir(&self) -> anyhow::Result { self.get_pg_dir("lib").await } /// Readiness check for our postgres process async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result { let bin_path = pg_bin_dir.join("pg_isready"); let args = [ "-h", "localhost", "-U", &username(), "-d", DB_NAME, "-p", &format!("{postgres_port}"), ]; let pg_lib_dir = self.get_pg_lib_dir().await.unwrap(); let envs = [ ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ]; let exitcode = Command::new(bin_path) .args(args) .envs(envs) .spawn()? .wait() .await?; Ok(exitcode.success()) } /// Create our database if it doesn't exist /// /// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers /// who just want to run `cargo neon_local` without knowing about diesel. 
/// /// Returns the database url pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result { let database_url = format!( "postgresql://{}@localhost:{}/{DB_NAME}", &username(), postgres_port ); let pg_bin_dir = self.get_pg_bin_dir().await?; let createdb_path = pg_bin_dir.join("createdb"); let pg_lib_dir = self.get_pg_lib_dir().await.unwrap(); let envs = [ ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ]; let output = Command::new(&createdb_path) .args([ "-h", "localhost", "-p", &format!("{postgres_port}"), "-U", &username(), "-O", &username(), DB_NAME, ]) .envs(envs) .output() .await .expect("Failed to spawn createdb"); if !output.status.success() { let stderr = String::from_utf8(output.stderr).expect("Non-UTF8 output from createdb"); if stderr.contains("already exists") { tracing::info!("Database {DB_NAME} already exists"); } else { anyhow::bail!("createdb failed with status {}: {stderr}", output.status); } } Ok(database_url) } pub async fn connect_to_database( &self, postgres_port: u16, ) -> anyhow::Result<( tokio_postgres::Client, tokio_postgres::Connection, )> { tokio_postgres::Config::new() .host("localhost") .port(postgres_port) // The user is the ambient operating system user name. // That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400 // // Until we get there, use the ambient operating system user name. // Recent tokio-postgres versions default to this if the user isn't specified. 
// But tokio-postgres fork doesn't have this upstream commit: // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79 // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399 .user(&username()) .dbname(DB_NAME) .connect(tokio_postgres::NoTls) .await .map_err(anyhow::Error::new) } /// Wrapper for the pg_ctl binary, which we spawn as a short-lived subprocess when starting and stopping postgres async fn pg_ctl(&self, args: I) -> ExitStatus where I: IntoIterator, S: AsRef, { let pg_bin_dir = self.get_pg_bin_dir().await.unwrap(); let bin_path = pg_bin_dir.join("pg_ctl"); let pg_lib_dir = self.get_pg_lib_dir().await.unwrap(); let envs = [ ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ]; Command::new(bin_path) .args(args) .envs(envs) .spawn() .expect("Failed to spawn pg_ctl, binary_missing?") .wait() .await .expect("Failed to wait for pg_ctl termination") } pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> { let instance_dir = self.storage_controller_instance_dir(start_args.instance_id); if let Err(err) = tokio::fs::create_dir(&instance_dir).await { if err.kind() != std::io::ErrorKind::AlreadyExists { panic!("Failed to create instance dir {instance_dir:?}"); } } if self.env.generate_local_ssl_certs { self.env.generate_ssl_cert( &instance_dir.join("server.crt"), &instance_dir.join("server.key"), )?; } let listen_url = &self.env.control_plane_api; let scheme = listen_url.scheme(); let host = listen_url.host_str().unwrap(); let (listen_port, postgres_port) = if let Some(base_port) = start_args.base_port { ( base_port, self.config .database_url .expect("--base-port requires NeonStorageControllerConf::database_url") .port(), ) } else { let port = listen_url.port().unwrap(); (port, port + 1) }; self.listen_port .set(listen_port) .expect("StorageController::listen_port is only set here"); // Do we 
remove the pid file on stop? let pg_started = self.is_postgres_running().await?; let pg_lib_dir = self.get_pg_lib_dir().await?; if !pg_started { // Start a vanilla Postgres process used by the storage controller for persistence. let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) .unwrap() .join("storage_controller_db"); let pg_bin_dir = self.get_pg_bin_dir().await?; let pg_log_path = pg_data_path.join("postgres.log"); if !tokio::fs::try_exists(&pg_data_path).await? { let initdb_args = [ "--pgdata", pg_data_path.as_ref(), "--username", &username(), "--no-sync", "--no-instructions", ]; tracing::info!( "Initializing storage controller database with args: {:?}", initdb_args ); // Initialize empty database let initdb_path = pg_bin_dir.join("initdb"); let mut child = Command::new(&initdb_path) .envs(vec![ ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ]) .args(initdb_args) .spawn() .expect("Failed to spawn initdb"); let status = child.wait().await?; if !status.success() { anyhow::bail!("initdb failed with status {status}"); } }; // Write a minimal config file: // - Specify the port, since this is chosen dynamically // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing // the storage controller we don't want a slow local disk to interfere with that. // // NB: it's important that we rewrite this file on each start command so we propagate changes // from `LocalEnv`'s config file (`.neon/config`). 
tokio::fs::write( &pg_data_path.join("postgresql.conf"), format!("port = {postgres_port}\nfsync=off\n"), ) .await?; println!("Starting storage controller database..."); let db_start_args = [ "-w", "-D", pg_data_path.as_ref(), "-l", pg_log_path.as_ref(), "-U", &username(), "start", ]; tracing::info!( "Starting storage controller database with args: {:?}", db_start_args ); let db_start_status = self.pg_ctl(db_start_args).await; let start_timeout: Duration = start_args.start_timeout.into(); let db_start_deadline = Instant::now() + start_timeout; if !db_start_status.success() { return Err(anyhow::anyhow!( "Failed to start postgres {}", db_start_status.code().unwrap() )); } loop { if Instant::now() > db_start_deadline { return Err(anyhow::anyhow!("Timed out waiting for postgres to start")); } match self.pg_isready(&pg_bin_dir, postgres_port).await { Ok(true) => { tracing::info!("storage controller postgres is now ready"); break; } Ok(false) => { tokio::time::sleep(Duration::from_millis(100)).await; } Err(e) => { tracing::warn!("Failed to check postgres status: {e}") } } } self.setup_database(postgres_port).await?; } let database_url = format!("postgresql://localhost:{postgres_port}/{DB_NAME}"); // We support running a startup SQL script to fiddle with the database before we launch storcon. // This is used by the test suite. 
let startup_script_path = self .env .base_data_dir .join("storage_controller_db.startup.sql"); let startup_script = match tokio::fs::read_to_string(&startup_script_path).await { Ok(script) => { tokio::fs::remove_file(startup_script_path).await?; script } Err(e) => { if e.kind() == std::io::ErrorKind::NotFound { // always run some startup script so that this code path doesn't bit rot "BEGIN; COMMIT;".to_string() } else { anyhow::bail!("Failed to read startup script: {e}") } } }; let (mut client, conn) = self.connect_to_database(postgres_port).await?; let conn = tokio::spawn(conn); let tx = client.build_transaction(); let tx = tx.start().await?; tx.batch_execute(&startup_script).await?; tx.commit().await?; drop(client); conn.await??; let addr = format!("{host}:{listen_port}"); let address_for_peers = Uri::builder() .scheme(scheme) .authority(addr.clone()) .path_and_query("") .build() .unwrap(); let mut args = vec![ "--dev", "--database-url", &database_url, "--max-offline-interval", &humantime::Duration::from(self.config.max_offline).to_string(), "--max-warming-up-interval", &humantime::Duration::from(self.config.max_warming_up).to_string(), "--heartbeat-interval", &humantime::Duration::from(self.config.heartbeat_interval).to_string(), "--address-for-peers", &address_for_peers.to_string(), ] .into_iter() .map(|s| s.to_string()) .collect::>(); match scheme { "http" => args.extend(["--listen".to_string(), addr]), "https" => args.extend(["--listen-https".to_string(), addr]), _ => { panic!("Unexpected url scheme in control_plane_api: {scheme}"); } } if self.config.start_as_candidate { args.push("--start-as-candidate".to_string()); } if self.config.use_https_pageserver_api { args.push("--use-https-pageserver-api".to_string()); } if self.config.use_https_safekeeper_api { args.push("--use-https-safekeeper-api".to_string()); } if self.config.use_local_compute_notifications { args.push("--use-local-compute-notifications".to_string()); } if let Some(value) = 
self.config.kick_secondary_downloads { args.push(format!("--kick-secondary-downloads={value}")); } if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() { args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap())); } if let Some(private_key) = &self.private_key { let claims = Claims::new(None, Scope::PageServerApi); let jwt_token = encode_from_key_file(&claims, private_key).expect("failed to generate jwt token"); args.push(format!("--jwt-token={jwt_token}")); let peer_claims = Claims::new(None, Scope::Admin); let peer_jwt_token = encode_from_key_file(&peer_claims, private_key) .expect("failed to generate jwt token"); args.push(format!("--peer-jwt-token={peer_jwt_token}")); let claims = Claims::new(None, Scope::SafekeeperData); let jwt_token = encode_from_key_file(&claims, private_key).expect("failed to generate jwt token"); args.push(format!("--safekeeper-jwt-token={jwt_token}")); } if let Some(public_key) = &self.public_key { args.push(format!("--public-key=\"{public_key}\"")); } if let Some(control_plane_hooks_api) = &self.env.control_plane_hooks_api { args.push(format!("--control-plane-url={control_plane_hooks_api}")); } if let Some(split_threshold) = self.config.split_threshold.as_ref() { args.push(format!("--split-threshold={split_threshold}")) } if let Some(max_split_shards) = self.config.max_split_shards.as_ref() { args.push(format!("--max-split-shards={max_split_shards}")) } if let Some(initial_split_threshold) = self.config.initial_split_threshold.as_ref() { args.push(format!( "--initial-split-threshold={initial_split_threshold}" )) } if let Some(initial_split_shards) = self.config.initial_split_shards.as_ref() { args.push(format!("--initial-split-shards={initial_split_shards}")) } if let Some(lag) = self.config.max_secondary_lag_bytes.as_ref() { args.push(format!("--max-secondary-lag-bytes={lag}")) } if let Some(threshold) = self.config.long_reconcile_threshold { args.push(format!( "--long-reconcile-threshold={}", 
humantime::Duration::from(threshold) )) } args.push(format!( "--neon-local-repo-dir={}", self.env.base_data_dir.display() )); if self.env.safekeepers.iter().any(|sk| sk.auth_enabled) && self.private_key.is_none() { anyhow::bail!("Safekeeper set up for auth but no private key specified"); } if self.config.timelines_onto_safekeepers { args.push("--timelines-onto-safekeepers".to_string()); } // neon_local is used in test environments where we often have less than 3 safekeepers. if self.config.timeline_safekeeper_count.is_some() || self.env.safekeepers.len() < 3 { let sk_cnt = self .config .timeline_safekeeper_count .unwrap_or(self.env.safekeepers.len()); args.push(format!("--timeline-safekeeper-count={sk_cnt}")); } if let Some(duration) = self.config.shard_split_request_timeout { args.push(format!( "--shard-split-request-timeout={}", humantime::Duration::from(duration) )); } let mut envs = vec![ ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ]; if let Some(posthog_config) = &self.config.posthog_config { envs.push(( "POSTHOG_CONFIG".to_string(), serde_json::to_string(posthog_config)?, )); } println!("Starting storage controller at {scheme}://{host}:{listen_port}"); if start_args.handle_ps_local_disk_loss.unwrap_or_default() { args.push("--handle-ps-local-disk-loss".to_string()); } background_process::start_process( COMMAND, &instance_dir, &self.env.storage_controller_bin(), args, envs, background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)), &start_args.start_timeout, || async { match self.ready().await { Ok(_) => Ok(true), Err(_) => Ok(false), } }, ) .await?; if self.config.timelines_onto_safekeepers { self.register_safekeepers().await?; } Ok(()) } pub async fn stop(&self, stop_args: NeonStorageControllerStopArgs) -> anyhow::Result<()> { background_process::stop_process( stop_args.immediate, COMMAND, &self.pid_file(stop_args.instance_id), )?; let storcon_instances = 
self.env.storage_controller_instances().await?; for (instance_id, instanced_dir_path) in storcon_instances { if instance_id == stop_args.instance_id { continue; } let pid_file = instanced_dir_path.join("storage_controller.pid"); let pid = tokio::fs::read_to_string(&pid_file) .await .map_err(|err| { anyhow::anyhow!("Failed to read storcon pid file at {pid_file:?}: {err}") })? .parse::() .expect("pid is valid i32"); let other_proc_alive = !background_process::process_has_stopped(Pid::from_raw(pid))?; if other_proc_alive { // There is another storage controller instance running, so we return // and leave the database running. return Ok(()); } } let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); println!("Stopping storage controller database..."); let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"]; let stop_status = self.pg_ctl(pg_stop_args).await; if !stop_status.success() { match self.is_postgres_running().await { Ok(false) => { println!("Storage controller database is already stopped"); return Ok(()); } Ok(true) => { anyhow::bail!("Failed to stop storage controller database"); } Err(err) => { anyhow::bail!("Failed to stop storage controller database: {err}"); } } } Ok(()) } async fn is_postgres_running(&self) -> anyhow::Result { let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"]; let status_exitcode = self.pg_ctl(pg_status_args).await; // pg_ctl status returns this exit code if postgres is not running: in this case it is // fine that stop failed. Otherwise it is an error that stop failed. 
const PG_STATUS_NOT_RUNNING: i32 = 3; const PG_NO_DATA_DIR: i32 = 4; const PG_STATUS_RUNNING: i32 = 0; match status_exitcode.code() { Some(PG_STATUS_NOT_RUNNING) => Ok(false), Some(PG_NO_DATA_DIR) => Ok(false), Some(PG_STATUS_RUNNING) => Ok(true), Some(code) => Err(anyhow::anyhow!( "pg_ctl status returned unexpected status code: {:?}", code )), None => Err(anyhow::anyhow!("pg_ctl status returned no status code")), } } fn get_claims_for_path(path: &str) -> anyhow::Result> { let category = match path.find('/') { Some(idx) => &path[..idx], None => path, }; match category { "status" | "ready" => Ok(None), "control" | "debug" => Ok(Some(Claims::new(None, Scope::Admin))), "v1" => Ok(Some(Claims::new(None, Scope::PageServerApi))), _ => Err(anyhow::anyhow!("Failed to determine claims for {}", path)), } } /// Simple HTTP request wrapper for calling into storage controller async fn dispatch( &self, method: reqwest::Method, path: String, body: Option, ) -> anyhow::Result where RQ: Serialize + Sized, RS: DeserializeOwned + Sized, { let response = self.dispatch_inner(method, path, body).await?; Ok(response .json() .await .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?) } /// Simple HTTP request wrapper for calling into storage controller async fn dispatch_inner( &self, method: reqwest::Method, path: String, body: Option, ) -> anyhow::Result where RQ: Serialize + Sized, { // In the special case of the `storage_controller start` subcommand, we wish // to use the API endpoint of the newly started storage controller in order // to pass the readiness check. In this scenario [`Self::listen_port`] will // be set (see [`Self::start`]). // // Otherwise, we infer the storage controller api endpoint from the configured // control plane API. 
let port = if let Some(port) = self.listen_port.get() { *port } else { self.env.control_plane_api.port().unwrap() }; // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out // for general purpose API access. let url = Url::from_str(&format!( "{}://{}:{port}/{path}", self.env.control_plane_api.scheme(), self.env.control_plane_api.host_str().unwrap(), )) .unwrap(); let mut builder = self.client.request(method, url); if let Some(body) = body { builder = builder.json(&body) } if let Some(private_key) = &self.private_key { println!("Getting claims for path {path}"); if let Some(required_claims) = Self::get_claims_for_path(&path)? { println!("Got claims {required_claims:?} for path {path}"); let jwt_token = encode_from_key_file(&required_claims, private_key)?; builder = builder.header( reqwest::header::AUTHORIZATION, format!("Bearer {jwt_token}"), ); } } let response = builder.send().await?; let response = response.error_from_body().await?; Ok(response) } /// Register the safekeepers in the storage controller #[instrument(skip(self))] async fn register_safekeepers(&self) -> anyhow::Result<()> { for sk in self.env.safekeepers.iter() { let sk_id = sk.id; let body = serde_json::json!({ "id": sk_id, "created_at": "2023-10-25T09:11:25Z", "updated_at": "2024-08-28T11:32:43Z", "region_id": "aws-us-east-2", "host": "127.0.0.1", "port": sk.pg_port, "http_port": sk.http_port, "https_port": sk.https_port, "version": 5957, "availability_zone_id": format!("us-east-2b-{sk_id}"), }); self.upsert_safekeeper(sk_id, body).await?; self.safekeeper_scheduling_policy(sk_id, SkSchedulingPolicy::Active) .await?; } Ok(()) } /// Call into the attach_hook API, for use before handing out attachments to pageservers #[instrument(skip(self))] pub async fn attach_hook( &self, tenant_shard_id: TenantShardId, pageserver_id: NodeId, ) -> anyhow::Result> { let request = AttachHookRequest { tenant_shard_id, node_id: Some(pageserver_id), generation_override: None, 
config: None,
        };

        let response = self
            .dispatch::<_, AttachHookResponse>(
                Method::POST,
                "debug/v1/attach-hook".to_string(),
                Some(request),
            )
            .await?;

        Ok(response.generation)
    }

    /// Create or update the safekeeper record `node_id` by POSTing `request` to
    /// `control/v1/safekeeper/{node_id}`.
    ///
    /// Returns an error if the request fails or the response status is not a success.
    #[instrument(skip(self))]
    pub async fn upsert_safekeeper(
        &self,
        node_id: NodeId,
        request: serde_json::Value,
    ) -> anyhow::Result<()> {
        let resp = self
            .dispatch_inner::<serde_json::Value>(
                Method::POST,
                format!("control/v1/safekeeper/{node_id}"),
                Some(request),
            )
            .await?;
        if !resp.status().is_success() {
            // Name the operation that actually failed: this is the upsert, not the
            // scheduling-policy call.
            anyhow::bail!(
                "upserting safekeeper {node_id} unsuccessful: {}",
                resp.status()
            );
        }
        Ok(())
    }

    /// Set the scheduling policy of safekeeper `node_id` via
    /// `control/v1/safekeeper/{node_id}/scheduling_policy`.
    #[instrument(skip(self))]
    pub async fn safekeeper_scheduling_policy(
        &self,
        node_id: NodeId,
        scheduling_policy: SkSchedulingPolicy,
    ) -> anyhow::Result<()> {
        self.dispatch::<SafekeeperSchedulingPolicyRequest, ()>(
            Method::POST,
            format!("control/v1/safekeeper/{node_id}/scheduling_policy"),
            Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }),
        )
        .await
    }

    /// POST to `debug/v1/inspect` for `tenant_shard_id` and return the response's `attachment`.
    // NOTE(review): the generic arguments of the return type were lost in this copy of the
    // file; restore them from upstream.
    #[instrument(skip(self))]
    pub async fn inspect(
        &self,
        tenant_shard_id: TenantShardId,
    ) -> anyhow::Result> {
        let request = InspectRequest { tenant_shard_id };

        let response = self
            .dispatch::<_, InspectResponse>(
                Method::POST,
                "debug/v1/inspect".to_string(),
                Some(request),
            )
            .await?;

        Ok(response.attachment)
    }

    /// POST `req` to `v1/tenant`.
    // NOTE(review): the generic argument of the return type was lost in this copy of the
    // file; restore it from upstream.
    #[instrument(skip(self))]
    pub async fn tenant_create(
        &self,
        req: TenantCreateRequest,
    ) -> anyhow::Result {
        self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req))
            .await
    }

    /// POST to `debug/v1/tenant/{tenant_id}/import` with no request body.
    #[instrument(skip(self))]
    pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result<TenantCreateResponse> {
        self.dispatch::<(), TenantCreateResponse>(
            Method::POST,
            format!("debug/v1/tenant/{tenant_id}/import"),
            None,
        )
        .await
    }

    /// GET `debug/v1/tenant/{tenant_id}/locate`.
    // NOTE(review): the generic argument of the return type was lost in this copy of the
    // file; restore it from upstream.
    #[instrument(skip(self))]
    pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result {
        self.dispatch::<(), _>(
            Method::GET,
            format!("debug/v1/tenant/{tenant_id}/locate"),
            None,
        )
        .await
    }

    #[instrument(skip_all, fields(node_id=%req.node_id))]
    pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
        self.dispatch::<_, ()>(Method::POST,
"control/v1/node".to_string(), Some(req)) .await } #[instrument(skip_all, fields(node_id=%req.node_id))] pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> { self.dispatch::<_, ()>( Method::PUT, format!("control/v1/node/{}/config", req.node_id), Some(req), ) .await } pub async fn node_list(&self) -> anyhow::Result> { self.dispatch::<(), Vec>( Method::GET, "control/v1/node".to_string(), None, ) .await } #[instrument(skip(self))] pub async fn ready(&self) -> anyhow::Result<()> { self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None) .await } #[instrument(skip_all, fields(%tenant_id, timeline_id=%req.new_timeline_id))] pub async fn tenant_timeline_create( &self, tenant_id: TenantId, req: TimelineCreateRequest, ) -> anyhow::Result { self.dispatch( Method::POST, format!("v1/tenant/{tenant_id}/timeline"), Some(req), ) .await } pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> anyhow::Result<()> { self.dispatch(Method::PUT, "v1/tenant/config".to_string(), Some(req)) .await } } ================================================ FILE: control_plane/storcon_cli/Cargo.toml ================================================ [package] name = "storcon_cli" version = "0.1.0" edition.workspace = true license.workspace = true [dependencies] anyhow.workspace = true clap.workspace = true comfy-table.workspace = true futures.workspace = true humantime.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true reqwest.workspace = true safekeeper_api.workspace=true serde_json = { workspace = true, features = ["raw_value"] } storage_controller_client.workspace = true tokio.workspace = true tracing.workspace = true utils.workspace = true workspace_hack.workspace = true ================================================ FILE: control_plane/storcon_cli/src/main.rs ================================================ use std::collections::{HashMap, HashSet}; use std::path::PathBuf; use std::str::FromStr; use 
std::time::Duration; use clap::{Parser, Subcommand}; use futures::StreamExt; use pageserver_api::controller_api::{ AvailabilityZone, MigrationConfig, NodeAvailabilityWrapper, NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, NodeShardResponse, PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, TimelineSafekeeperMigrateRequest, }; use pageserver_api::models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantShardSplitRequest, TenantShardSplitResponse, }; use pageserver_api::shard::{ShardStripeSize, TenantShardId}; use pageserver_client::mgmt_api::{self}; use reqwest::{Certificate, Method, StatusCode, Url}; use safekeeper_api::models::TimelineLocateResponse; use storage_controller_client::control_api::Client; use utils::id::{NodeId, TenantId, TimelineId}; #[derive(Subcommand, Debug)] enum Command { /// Register a pageserver with the storage controller. This shouldn't usually be necessary, /// since pageservers auto-register when they start up NodeRegister { #[arg(long)] node_id: NodeId, #[arg(long)] listen_pg_addr: String, #[arg(long)] listen_pg_port: u16, #[arg(long)] listen_grpc_addr: Option, #[arg(long)] listen_grpc_port: Option, #[arg(long)] listen_http_addr: String, #[arg(long)] listen_http_port: u16, #[arg(long)] listen_https_port: Option, #[arg(long)] availability_zone_id: String, }, /// Modify a node's configuration in the storage controller NodeConfigure { #[arg(long)] node_id: NodeId, /// Availability is usually auto-detected based on heartbeats. 
Set 'offline' here to /// manually mark a node offline #[arg(long)] availability: Option, /// Scheduling policy controls whether tenant shards may be scheduled onto this node. #[arg(long)] scheduling: Option, }, /// Exists for backup usage and will be removed in future. /// Use [`Command::NodeStartDelete`] instead, if possible. NodeDelete { #[arg(long)] node_id: NodeId, }, /// Start deletion of the specified pageserver. NodeStartDelete { #[arg(long)] node_id: NodeId, /// When `force` is true, skip waiting for shards to prewarm during migration. /// This can significantly speed up node deletion since prewarming all shards /// can take considerable time, but may result in slower initial access to /// migrated shards until they warm up naturally. #[arg(long)] force: bool, }, /// Cancel deletion of the specified pageserver and wait for `timeout` /// for the operation to be canceled. May be retried. NodeCancelDelete { #[arg(long)] node_id: NodeId, #[arg(long)] timeout: humantime::Duration, }, /// Delete a tombstone of node from the storage controller. /// This is used when we want to allow the node to be re-registered. NodeDeleteTombstone { #[arg(long)] node_id: NodeId, }, /// Modify a tenant's policies in the storage controller TenantPolicy { #[arg(long)] tenant_id: TenantId, /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`), /// or is in the normal attached state with N secondary locations (`attached:N`) #[arg(long)] placement: Option, /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal, /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents /// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant /// unavailable, and are only for use in emergencies. 
#[arg(long)] scheduling: Option, }, /// List nodes known to the storage controller Nodes {}, /// List soft deleted nodes known to the storage controller NodeTombstones {}, /// List tenants known to the storage controller Tenants { /// If this field is set, it will list the tenants on a specific node node_id: Option, }, /// Create a new tenant in the storage controller, and by extension on pageservers. TenantCreate { #[arg(long)] tenant_id: TenantId, }, /// Delete a tenant in the storage controller, and by extension on pageservers. TenantDelete { #[arg(long)] tenant_id: TenantId, }, /// Split an existing tenant into a higher number of shards than its current shard count. TenantShardSplit { #[arg(long)] tenant_id: TenantId, #[arg(long)] shard_count: u8, /// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes. #[arg(long)] stripe_size: Option, }, /// Migrate the attached location for a tenant shard to a specific pageserver. TenantShardMigrate { #[arg(long)] tenant_shard_id: TenantShardId, #[arg(long)] node: NodeId, #[arg(long, default_value_t = true, action = clap::ArgAction::Set)] prewarm: bool, #[arg(long, default_value_t = false, action = clap::ArgAction::Set)] override_scheduler: bool, }, /// Watch the location of a tenant shard evolve, e.g. while expecting it to migrate TenantShardWatch { #[arg(long)] tenant_shard_id: TenantShardId, }, /// Migrate the secondary location for a tenant shard to a specific pageserver. TenantShardMigrateSecondary { #[arg(long)] tenant_shard_id: TenantShardId, #[arg(long)] node: NodeId, }, /// Cancel any ongoing reconciliation for this shard TenantShardCancelReconcile { #[arg(long)] tenant_shard_id: TenantShardId, }, /// Set the pageserver tenant configuration of a tenant: this is the configuration structure /// that is passed through to pageservers, and does not affect storage controller behavior. /// Any previous tenant configs are overwritten. 
SetTenantConfig {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
        config: String,
    },
    /// Patch the pageserver tenant configuration of a tenant. Any fields with null values in the
    /// provided JSON are unset from the tenant config and all fields with non-null values are set.
    /// Unspecified fields are not changed.
    PatchTenantConfig {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
        config: String,
    },
    /// Print details about a particular tenant, including all its shards' states.
    TenantDescribe {
        #[arg(long)]
        tenant_id: TenantId,
    },
    TenantSetPreferredAz {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
        preferred_az: Option,
    },
    /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers.  Appropriate
    /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
    TenantDrop {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
        unclean: bool,
    },
    NodeDrop {
        #[arg(long)]
        node_id: NodeId,
        #[arg(long)]
        unclean: bool,
    },
    TenantSetTimeBasedEviction {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
        period: humantime::Duration,
        #[arg(long)]
        threshold: humantime::Duration,
    },
    // Migrate away from a set of specified pageservers by moving the primary attachments to pageservers
    // outside of the specified set.
    BulkMigrate {
        // Set of pageserver node ids to drain.
        #[arg(long)]
        nodes: Vec,
        // Optional: migration concurrency (default is 8)
        #[arg(long)]
        concurrency: Option,
        // Optional: maximum number of shards to migrate
        #[arg(long)]
        max_shards: Option,
        // Optional: when set to true, nothing is migrated, but the plan is printed to stdout
        #[arg(long)]
        dry_run: Option,
    },
    /// Start draining the specified pageserver.
    /// The drain is complete when the scheduling policy returns to active.
    StartDrain {
        #[arg(long)]
        node_id: NodeId,
    },
    /// Cancel draining the specified pageserver and wait for `timeout`
    /// for the operation to be canceled. May be retried.
    CancelDrain {
        #[arg(long)]
        node_id: NodeId,
        #[arg(long)]
        timeout: humantime::Duration,
    },
    /// Start filling the specified pageserver.
    /// The fill is complete when the scheduling policy returns to active.
    StartFill {
        #[arg(long)]
        node_id: NodeId,
    },
    /// Cancel filling the specified pageserver and wait for `timeout`
    /// for the operation to be canceled. May be retried.
    CancelFill {
        #[arg(long)]
        node_id: NodeId,
        #[arg(long)]
        timeout: humantime::Duration,
    },
    /// List safekeepers known to the storage controller
    Safekeepers {},
    /// Set the scheduling policy of the specified safekeeper
    SafekeeperScheduling {
        #[arg(long)]
        node_id: NodeId,
        #[arg(long)]
        scheduling_policy: SkSchedulingPolicyArg,
    },
    /// Downloads any missing heatmap layers for all shards for a given timeline
    DownloadHeatmapLayers {
        /// Tenant ID or tenant shard ID. When an unsharded tenant ID is specified,
        /// the operation is performed on all shards. When a sharded tenant ID is
        /// specified, the operation is only performed on the specified shard.
        #[arg(long)]
        tenant_shard_id: TenantShardId,
        #[arg(long)]
        timeline_id: TimelineId,
        /// Optional: Maximum download concurrency (default is 16)
        #[arg(long)]
        concurrency: Option,
    },
    /// Locate safekeepers for a timeline from the storcon DB.
    TimelineLocate {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
        timeline_id: TimelineId,
    },
    /// Migrate a timeline to a new set of safekeepers
    TimelineSafekeeperMigrate {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
        timeline_id: TimelineId,
        /// Example: --new-sk-set 1,2,3
        #[arg(long, required = true, value_delimiter = ',')]
        new_sk_set: Vec,
    },
    /// Abort ongoing safekeeper migration.
    TimelineSafekeeperMigrateAbort {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
        timeline_id: TimelineId,
    },
}

#[derive(Parser)]
#[command(
    author,
    version,
    about,
    long_about = "CLI for Storage Controller Support/Debug"
)]
#[command(arg_required_else_help(true))]
struct Cli {
    #[arg(long)]
    /// URL to storage controller.  e.g.
http://127.0.0.1:1234 when using `neon_local` api: Url, #[arg(long)] /// JWT token for authenticating with storage controller. Depending on the API used, this /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint /// a token with both scopes to use with this tool. jwt: Option, #[arg(long)] /// Trusted root CA certificates to use in https APIs. ssl_ca_file: Option, #[command(subcommand)] command: Command, } #[derive(Debug, Clone)] struct PlacementPolicyArg(PlacementPolicy); impl FromStr for PlacementPolicyArg { type Err = anyhow::Error; fn from_str(s: &str) -> Result { match s { "detached" => Ok(Self(PlacementPolicy::Detached)), "secondary" => Ok(Self(PlacementPolicy::Secondary)), _ if s.starts_with("attached:") => { let mut splitter = s.split(':'); let _prefix = splitter.next().unwrap(); match splitter.next().and_then(|s| s.parse::().ok()) { Some(n) => Ok(Self(PlacementPolicy::Attached(n))), None => Err(anyhow::anyhow!( "Invalid format '{s}', a valid example is 'attached:1'" )), } } _ => Err(anyhow::anyhow!( "Unknown placement policy '{s}', try detached,secondary,attached:" )), } } } #[derive(Debug, Clone)] struct SkSchedulingPolicyArg(SkSchedulingPolicy); impl FromStr for SkSchedulingPolicyArg { type Err = anyhow::Error; fn from_str(s: &str) -> Result { SkSchedulingPolicy::from_str(s).map(Self) } } #[derive(Debug, Clone)] struct ShardSchedulingPolicyArg(ShardSchedulingPolicy); impl FromStr for ShardSchedulingPolicyArg { type Err = anyhow::Error; fn from_str(s: &str) -> Result { match s { "active" => Ok(Self(ShardSchedulingPolicy::Active)), "essential" => Ok(Self(ShardSchedulingPolicy::Essential)), "pause" => Ok(Self(ShardSchedulingPolicy::Pause)), "stop" => Ok(Self(ShardSchedulingPolicy::Stop)), _ => Err(anyhow::anyhow!( "Unknown scheduling policy '{s}', try active,essential,pause,stop" )), } } } #[derive(Debug, Clone)] struct NodeAvailabilityArg(NodeAvailabilityWrapper); impl FromStr for NodeAvailabilityArg { type Err = 
anyhow::Error;

    fn from_str(s: &str) -> Result {
        match s {
            "active" => Ok(Self(NodeAvailabilityWrapper::Active)),
            "offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
        }
    }
}

/// Poll the storage controller until the scheduling policy of `node_id`
/// satisfies the predicate `f`, or until `timeout` elapses.
///
/// Each poll is a `GET control/v1/node/{node_id}`. The first scheduling policy
/// accepted by `f` is returned. The call fails if a describe request returns
/// an error, or if `timeout` expires before `f` accepts a policy.
async fn wait_for_scheduling_policy<F>(
    client: Client,
    node_id: NodeId,
    timeout: Duration,
    f: F,
) -> anyhow::Result<NodeSchedulingPolicy>
where
    F: Fn(NodeSchedulingPolicy) -> bool,
{
    // Pause between polls so that a slow policy transition does not turn into
    // a back-to-back stream of requests against the controller.
    const POLL_INTERVAL: Duration = Duration::from_millis(500);

    let waiter = tokio::time::timeout(timeout, async move {
        loop {
            let node = client
                .dispatch::<(), NodeDescribeResponse>(
                    Method::GET,
                    format!("control/v1/node/{node_id}"),
                    None,
                )
                .await?;

            if f(node.scheduling) {
                return Ok::<NodeSchedulingPolicy, mgmt_api::Error>(node.scheduling);
            }

            // The overall deadline is still enforced by the surrounding
            // `timeout`, which cancels this sleep when it fires.
            tokio::time::sleep(POLL_INTERVAL).await;
        }
    });

    // First `?` surfaces the timeout, second `?` surfaces a request error.
    Ok(waiter.await??)
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();

    let ssl_ca_certs = match &cli.ssl_ca_file {
        Some(ssl_ca_file) => {
            let buf = tokio::fs::read(ssl_ca_file).await?;
            Certificate::from_pem_bundle(&buf)?
        }
        None => Vec::new(),
    };

    let mut http_client = reqwest::Client::builder();
    for ssl_ca_cert in ssl_ca_certs {
        http_client = http_client.add_root_certificate(ssl_ca_cert);
    }
    let http_client = http_client.build()?;

    let storcon_client = Client::new(http_client.clone(), cli.api.clone(), cli.jwt.clone());

    let mut trimmed = cli.api.to_string();
    trimmed.pop();
    let vps_client = mgmt_api::Client::new(http_client.clone(), trimmed, cli.jwt.as_deref());

    match cli.command {
        Command::NodeRegister {
            node_id,
            listen_pg_addr,
            listen_pg_port,
            listen_grpc_addr,
            listen_grpc_port,
            listen_http_addr,
            listen_http_port,
            listen_https_port,
            availability_zone_id,
        } => {
            storcon_client
                .dispatch::<_, ()>(
                    Method::POST,
                    "control/v1/node".to_string(),
                    Some(NodeRegisterRequest {
                        node_id,
                        listen_pg_addr,
                        listen_pg_port,
                        listen_grpc_addr,
                        listen_grpc_port,
                        listen_http_addr,
                        listen_http_port,
                        listen_https_port,
                        availability_zone_id: AvailabilityZone(availability_zone_id),
                        node_ip_addr: None,
                    }),
                )
                .await?;
        }
        Command::TenantCreate { tenant_id } => {
            storcon_client
                .dispatch::<_, ()>(
                    Method::POST,
"v1/tenant".to_string(), Some(TenantCreateRequest { new_tenant_id: TenantShardId::unsharded(tenant_id), generation: None, shard_parameters: ShardParameters::default(), placement_policy: Some(PlacementPolicy::Attached(1)), config: TenantConfig::default(), }), ) .await?; } Command::TenantDelete { tenant_id } => { let status = vps_client .tenant_delete(TenantShardId::unsharded(tenant_id)) .await?; tracing::info!("Delete status: {}", status); } Command::Nodes {} => { let mut resp = storcon_client .dispatch::<(), Vec>( Method::GET, "control/v1/node".to_string(), None, ) .await?; resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr)); let mut table = comfy_table::Table::new(); table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]); for node in resp { table.add_row([ format!("{}", node.id), node.listen_http_addr, node.availability_zone_id, format!("{:?}", node.scheduling), format!("{:?}", node.availability), ]); } println!("{table}"); } Command::NodeConfigure { node_id, availability, scheduling, } => { let req = NodeConfigureRequest { node_id, availability: availability.map(|a| a.0), scheduling, }; storcon_client .dispatch::<_, ()>( Method::PUT, format!("control/v1/node/{node_id}/config"), Some(req), ) .await?; } Command::Tenants { node_id: Some(node_id), } => { let describe_response = storcon_client .dispatch::<(), NodeShardResponse>( Method::GET, format!("control/v1/node/{node_id}/shards"), None, ) .await?; let shards = describe_response.shards; let mut table = comfy_table::Table::new(); table.set_header([ "Shard", "Intended Primary/Secondary", "Observed Primary/Secondary", ]); for shard in shards { table.add_row([ format!("{}", shard.tenant_shard_id), match shard.is_intended_secondary { None => "".to_string(), Some(true) => "Secondary".to_string(), Some(false) => "Primary".to_string(), }, match shard.is_observed_secondary { None => "".to_string(), Some(true) => "Secondary".to_string(), Some(false) => "Primary".to_string(), }, ]); } 
println!("{table}"); } Command::Tenants { node_id: None } => { // Set up output formatting let mut table = comfy_table::Table::new(); table.set_header([ "TenantId", "Preferred AZ", "ShardCount", "StripeSize", "Placement", "Scheduling", ]); // Pagination loop over listing API let mut start_after = None; const LIMIT: usize = 1000; loop { let path = match start_after { None => format!("control/v1/tenant?limit={LIMIT}"), Some(start_after) => { format!("control/v1/tenant?limit={LIMIT}&start_after={start_after}") } }; let resp = storcon_client .dispatch::<(), Vec>(Method::GET, path, None) .await?; if resp.is_empty() { // End of data reached break; } // Give some visual feedback while we're building up the table (comfy_table doesn't have // streaming output) if resp.len() >= LIMIT { eprint!("."); } start_after = Some(resp.last().unwrap().tenant_id); for tenant in resp { let shard_zero = tenant.shards.into_iter().next().unwrap(); table.add_row([ format!("{}", tenant.tenant_id), shard_zero .preferred_az_id .as_ref() .cloned() .unwrap_or("".to_string()), format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), format!("{:?}", tenant.stripe_size), format!("{:?}", tenant.policy), format!("{:?}", shard_zero.scheduling_policy), ]); } } // Terminate progress dots if table.row_count() > LIMIT { eprint!(""); } println!("{table}"); } Command::TenantPolicy { tenant_id, placement, scheduling, } => { let req = TenantPolicyRequest { scheduling: scheduling.map(|s| s.0), placement: placement.map(|p| p.0), }; storcon_client .dispatch::<_, ()>( Method::PUT, format!("control/v1/tenant/{tenant_id}/policy"), Some(req), ) .await?; } Command::TenantShardSplit { tenant_id, shard_count, stripe_size, } => { let req = TenantShardSplitRequest { new_shard_count: shard_count, new_stripe_size: stripe_size.map(ShardStripeSize), }; let response = storcon_client .dispatch::( Method::PUT, format!("control/v1/tenant/{tenant_id}/shard_split"), Some(req), ) .await?; println!( "Split tenant {} into {} 
shards: {}", tenant_id, shard_count, response .new_shards .iter() .map(|s| format!("{s:?}")) .collect::>() .join(",") ); } Command::TenantShardMigrate { tenant_shard_id, node, prewarm, override_scheduler, } => { let migration_config = MigrationConfig { prewarm, override_scheduler, ..Default::default() }; let req = TenantShardMigrateRequest { node_id: node, origin_node_id: None, migration_config, }; match storcon_client .dispatch::( Method::PUT, format!("control/v1/tenant/{tenant_shard_id}/migrate"), Some(req), ) .await { Err(mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg)) => { anyhow::bail!( "Migration to {node} rejected, may require `--force` ({}) ", msg ); } Err(e) => return Err(e.into()), Ok(_) => {} } watch_tenant_shard(storcon_client, tenant_shard_id, Some(node)).await?; } Command::TenantShardWatch { tenant_shard_id } => { watch_tenant_shard(storcon_client, tenant_shard_id, None).await?; } Command::TenantShardMigrateSecondary { tenant_shard_id, node, } => { let req = TenantShardMigrateRequest { node_id: node, origin_node_id: None, migration_config: MigrationConfig::default(), }; storcon_client .dispatch::( Method::PUT, format!("control/v1/tenant/{tenant_shard_id}/migrate_secondary"), Some(req), ) .await?; } Command::TenantShardCancelReconcile { tenant_shard_id } => { storcon_client .dispatch::<(), ()>( Method::PUT, format!("control/v1/tenant/{tenant_shard_id}/cancel_reconcile"), None, ) .await?; } Command::SetTenantConfig { tenant_id, config } => { let tenant_conf = serde_json::from_str(&config)?; vps_client .set_tenant_config(&TenantConfigRequest { tenant_id, config: tenant_conf, }) .await?; } Command::PatchTenantConfig { tenant_id, config } => { let tenant_conf = serde_json::from_str(&config)?; vps_client .patch_tenant_config(&TenantConfigPatchRequest { tenant_id, config: tenant_conf, }) .await?; } Command::TenantDescribe { tenant_id } => { let TenantDescribeResponse { tenant_id, shards, stripe_size, policy, config, } = storcon_client 
.dispatch::<(), TenantDescribeResponse>( Method::GET, format!("control/v1/tenant/{tenant_id}"), None, ) .await?; let nodes = storcon_client .dispatch::<(), Vec>( Method::GET, "control/v1/node".to_string(), None, ) .await?; let nodes = nodes .into_iter() .map(|n| (n.id, n)) .collect::>(); println!("Tenant {tenant_id}"); let mut table = comfy_table::Table::new(); table.add_row(["Policy", &format!("{policy:?}")]); table.add_row(["Stripe size", &format!("{stripe_size:?}")]); table.add_row(["Config", &serde_json::to_string_pretty(&config).unwrap()]); println!("{table}"); println!("Shards:"); let mut table = comfy_table::Table::new(); table.set_header([ "Shard", "Attached", "Attached AZ", "Secondary", "Last error", "status", ]); for shard in shards { let secondary = shard .node_secondary .iter() .map(|n| format!("{n}")) .collect::>() .join(","); let mut status_parts = Vec::new(); if shard.is_reconciling { status_parts.push("reconciling"); } if shard.is_pending_compute_notification { status_parts.push("pending_compute"); } if shard.is_splitting { status_parts.push("splitting"); } let status = status_parts.join(","); let attached_node = shard .node_attached .as_ref() .map(|id| nodes.get(id).expect("Shard references nonexistent node")); table.add_row([ format!("{}", shard.tenant_shard_id), attached_node .map(|n| format!("{} ({})", n.listen_http_addr, n.id)) .unwrap_or(String::new()), attached_node .map(|n| n.availability_zone_id.clone()) .unwrap_or(String::new()), secondary, shard.last_error, status, ]); } println!("{table}"); } Command::TenantSetPreferredAz { tenant_id, preferred_az, } => { // First learn about the tenant's shards let describe_response = storcon_client .dispatch::<(), TenantDescribeResponse>( Method::GET, format!("control/v1/tenant/{tenant_id}"), None, ) .await?; // Learn about nodes to validate the AZ ID let nodes = storcon_client .dispatch::<(), Vec>( Method::GET, "control/v1/node".to_string(), None, ) .await?; if let Some(preferred_az) = &preferred_az { 
let azs = nodes .into_iter() .map(|n| (n.availability_zone_id)) .collect::>(); if !azs.contains(preferred_az) { anyhow::bail!( "AZ {} not found on any node: known AZs are: {:?}", preferred_az, azs ); } } else { // Make it obvious to the user that since they've omitted an AZ, we're clearing it eprintln!("Clearing preferred AZ for tenant {tenant_id}"); } // Construct a request that modifies all the tenant's shards let req = ShardsPreferredAzsRequest { preferred_az_ids: describe_response .shards .into_iter() .map(|s| { ( s.tenant_shard_id, preferred_az.clone().map(AvailabilityZone), ) }) .collect(), }; storcon_client .dispatch::( Method::PUT, "control/v1/preferred_azs".to_string(), Some(req), ) .await?; } Command::TenantDrop { tenant_id, unclean } => { if !unclean { anyhow::bail!( "This command is not a tenant deletion, and uncleanly drops all controller state for the tenant. If you know what you're doing, add `--unclean` to proceed." ) } storcon_client .dispatch::<(), ()>( Method::POST, format!("debug/v1/tenant/{tenant_id}/drop"), None, ) .await?; } Command::NodeDrop { node_id, unclean } => { if !unclean { anyhow::bail!( "This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it. If you know what you're doing, add `--unclean` to proceed." 
) } storcon_client .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None) .await?; } Command::NodeDelete { node_id } => { eprintln!("Warning: This command is obsolete and will be removed in a future version"); eprintln!("Use `NodeStartDelete` instead, if possible"); storcon_client .dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None) .await?; } Command::NodeStartDelete { node_id, force } => { let query = if force { format!("control/v1/node/{node_id}/delete?force=true") } else { format!("control/v1/node/{node_id}/delete") }; storcon_client .dispatch::<(), ()>(Method::PUT, query, None) .await?; println!("Delete started for {node_id}"); } Command::NodeCancelDelete { node_id, timeout } => { storcon_client .dispatch::<(), ()>( Method::DELETE, format!("control/v1/node/{node_id}/delete"), None, ) .await?; println!("Waiting for node {node_id} to quiesce on scheduling policy ..."); let final_policy = wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| { !matches!(sched, NodeSchedulingPolicy::Deleting) }) .await?; println!( "Delete was cancelled for node {node_id}. 
Schedulling policy is now {final_policy:?}" ); } Command::NodeDeleteTombstone { node_id } => { storcon_client .dispatch::<(), ()>( Method::DELETE, format!("debug/v1/tombstone/{node_id}"), None, ) .await?; } Command::NodeTombstones {} => { let mut resp = storcon_client .dispatch::<(), Vec>( Method::GET, "debug/v1/tombstone".to_string(), None, ) .await?; resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr)); let mut table = comfy_table::Table::new(); table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]); for node in resp { table.add_row([ format!("{}", node.id), node.listen_http_addr, node.availability_zone_id, format!("{:?}", node.scheduling), format!("{:?}", node.availability), ]); } println!("{table}"); } Command::TenantSetTimeBasedEviction { tenant_id, period, threshold, } => { vps_client .set_tenant_config(&TenantConfigRequest { tenant_id, config: TenantConfig { eviction_policy: Some(EvictionPolicy::LayerAccessThreshold( EvictionPolicyLayerAccessThreshold { period: period.into(), threshold: threshold.into(), }, )), heatmap_period: Some(Duration::from_secs(300)), ..Default::default() }, }) .await?; } Command::BulkMigrate { nodes, concurrency, max_shards, dry_run, } => { // Load the list of nodes, split them up into the drained and filled sets, // and validate that draining is possible. 
let node_descs = storcon_client .dispatch::<(), Vec>( Method::GET, "control/v1/node".to_string(), None, ) .await?; let mut node_to_drain_descs = Vec::new(); let mut node_to_fill_descs = Vec::new(); for desc in node_descs { let to_drain = nodes.contains(&desc.id); if to_drain { node_to_drain_descs.push(desc); } else { node_to_fill_descs.push(desc); } } if nodes.len() != node_to_drain_descs.len() { anyhow::bail!("Bulk migration requested away from node which doesn't exist.") } node_to_fill_descs.retain(|desc| { matches!(desc.availability, NodeAvailabilityWrapper::Active) && matches!( desc.scheduling, NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Filling ) }); if node_to_fill_descs.is_empty() { anyhow::bail!("There are no nodes to migrate to") } // Set the node scheduling policy to draining for the nodes which // we plan to drain. for node_desc in node_to_drain_descs.iter() { let req = NodeConfigureRequest { node_id: node_desc.id, availability: None, scheduling: Some(NodeSchedulingPolicy::Draining), }; storcon_client .dispatch::<_, ()>( Method::PUT, format!("control/v1/node/{}/config", node_desc.id), Some(req), ) .await?; } // Perform the migration: move each tenant shard scheduled on a node to // be drained to a node which is being filled. A simple round robin // strategy is used to pick the new node. 
let tenants = storcon_client .dispatch::<(), Vec>( Method::GET, "control/v1/tenant".to_string(), None, ) .await?; let mut selected_node_idx = 0; struct MigrationMove { tenant_shard_id: TenantShardId, from: NodeId, to: NodeId, } let mut moves: Vec = Vec::new(); let shards = tenants .into_iter() .flat_map(|tenant| tenant.shards.into_iter()); for shard in shards { if let Some(max_shards) = max_shards { if moves.len() >= max_shards { println!( "Stop planning shard moves since the requested maximum was reached" ); break; } } let should_migrate = { if let Some(attached_to) = shard.node_attached { node_to_drain_descs .iter() .map(|desc| desc.id) .any(|id| id == attached_to) } else { false } }; if !should_migrate { continue; } moves.push(MigrationMove { tenant_shard_id: shard.tenant_shard_id, from: shard .node_attached .expect("We only migrate attached tenant shards"), to: node_to_fill_descs[selected_node_idx].id, }); selected_node_idx = (selected_node_idx + 1) % node_to_fill_descs.len(); } let total_moves = moves.len(); if dry_run == Some(true) { println!("Dryrun requested. 
Planned {total_moves} moves:"); for mv in &moves { println!("{}: {} -> {}", mv.tenant_shard_id, mv.from, mv.to) } return Ok(()); } const DEFAULT_MIGRATE_CONCURRENCY: usize = 8; let mut stream = futures::stream::iter(moves) .map(|mv| { let client = Client::new(http_client.clone(), cli.api.clone(), cli.jwt.clone()); async move { client .dispatch::( Method::PUT, format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id), Some(TenantShardMigrateRequest { node_id: mv.to, origin_node_id: Some(mv.from), migration_config: MigrationConfig::default(), }), ) .await .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e)) } }) .buffered(concurrency.unwrap_or(DEFAULT_MIGRATE_CONCURRENCY)); let mut success = 0; let mut failure = 0; while let Some(res) = stream.next().await { match res { Ok(_) => { success += 1; } Err((tenant_shard_id, from, to, error)) => { failure += 1; println!( "Failed to migrate {tenant_shard_id} from node {from} to node {to}: {error}" ); } } if (success + failure) % 20 == 0 { println!( "Processed {}/{} shards: {} succeeded, {} failed", success + failure, total_moves, success, failure ); } } println!( "Processed {}/{} shards: {} succeeded, {} failed", success + failure, total_moves, success, failure ); } Command::StartDrain { node_id } => { storcon_client .dispatch::<(), ()>( Method::PUT, format!("control/v1/node/{node_id}/drain"), None, ) .await?; println!("Drain started for {node_id}"); } Command::CancelDrain { node_id, timeout } => { storcon_client .dispatch::<(), ()>( Method::DELETE, format!("control/v1/node/{node_id}/drain"), None, ) .await?; println!("Waiting for node {node_id} to quiesce on scheduling policy ..."); let final_policy = wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| { use NodeSchedulingPolicy::*; matches!(sched, Active | PauseForRestart) }) .await?; println!( "Drain was cancelled for node {node_id}. 
Schedulling policy is now {final_policy:?}" ); } Command::StartFill { node_id } => { storcon_client .dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None) .await?; println!("Fill started for {node_id}"); } Command::CancelFill { node_id, timeout } => { storcon_client .dispatch::<(), ()>( Method::DELETE, format!("control/v1/node/{node_id}/fill"), None, ) .await?; println!("Waiting for node {node_id} to quiesce on scheduling policy ..."); let final_policy = wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| { use NodeSchedulingPolicy::*; matches!(sched, Active) }) .await?; println!( "Fill was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}" ); } Command::Safekeepers {} => { let mut resp = storcon_client .dispatch::<(), Vec>( Method::GET, "control/v1/safekeeper".to_string(), None, ) .await?; resp.sort_by(|a, b| a.id.cmp(&b.id)); let mut table = comfy_table::Table::new(); table.set_header([ "Id", "Version", "Host", "Port", "Http Port", "AZ Id", "Scheduling", ]); for sk in resp { table.add_row([ format!("{}", sk.id), format!("{}", sk.version), sk.host, format!("{}", sk.port), format!("{}", sk.http_port), sk.availability_zone_id.clone(), String::from(sk.scheduling_policy), ]); } println!("{table}"); } Command::SafekeeperScheduling { node_id, scheduling_policy, } => { let scheduling_policy = scheduling_policy.0; storcon_client .dispatch::( Method::POST, format!("control/v1/safekeeper/{node_id}/scheduling_policy"), Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }), ) .await?; println!( "Scheduling policy of {node_id} set to {}", String::from(scheduling_policy) ); } Command::DownloadHeatmapLayers { tenant_shard_id, timeline_id, concurrency, } => { let mut path = format!( "v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers", ); if let Some(c) = concurrency { path = format!("{path}?concurrency={c}"); } storcon_client .dispatch::<(), ()>(Method::POST, path, None) .await?; 
} Command::TimelineLocate { tenant_id, timeline_id, } => { let path = format!("debug/v1/tenant/{tenant_id}/timeline/{timeline_id}/locate"); let resp = storcon_client .dispatch::<(), TimelineLocateResponse>(Method::GET, path, None) .await?; let sk_set = resp.sk_set.iter().map(|id| id.0 as i64).collect::>(); let new_sk_set = resp .new_sk_set .as_ref() .map(|ids| ids.iter().map(|id| id.0 as i64).collect::>()); println!("generation = {}", resp.generation); println!("sk_set = {sk_set:?}"); println!("new_sk_set = {new_sk_set:?}"); } Command::TimelineSafekeeperMigrate { tenant_id, timeline_id, new_sk_set, } => { let path = format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate"); storcon_client .dispatch::<_, ()>( Method::POST, path, Some(TimelineSafekeeperMigrateRequest { new_sk_set }), ) .await?; } Command::TimelineSafekeeperMigrateAbort { tenant_id, timeline_id, } => { let path = format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate_abort"); storcon_client .dispatch::<(), ()>(Method::POST, path, None) .await?; } } Ok(()) } static WATCH_INTERVAL: Duration = Duration::from_secs(5); async fn watch_tenant_shard( storcon_client: Client, tenant_shard_id: TenantShardId, until_migrated_to: Option, ) -> anyhow::Result<()> { if let Some(until_migrated_to) = until_migrated_to { println!( "Waiting for tenant shard {tenant_shard_id} to be migrated to node {until_migrated_to}" ); } loop { let desc = storcon_client .dispatch::<(), TenantDescribeResponse>( Method::GET, format!("control/v1/tenant/{}", tenant_shard_id.tenant_id), None, ) .await?; // Output the current state of the tenant shard let shard = desc .shards .iter() .find(|s| s.tenant_shard_id == tenant_shard_id) .ok_or(anyhow::anyhow!("Tenant shard not found"))?; let summary = format!( "attached: {} secondary: {} {}", shard .node_attached .map(|n| format!("{n}")) .unwrap_or("none".to_string()), shard .node_secondary .iter() .map(|n| n.to_string()) .collect::>() .join(","), if 
shard.is_reconciling { "(reconciler active)" } else { "(reconciler idle)" } ); println!("{summary}"); // Maybe drop out if we finished migration if let Some(until_migrated_to) = until_migrated_to { if shard.node_attached == Some(until_migrated_to) && !shard.is_reconciling { println!("Tenant shard {tenant_shard_id} is now on node {until_migrated_to}"); break; } } tokio::time::sleep(WATCH_INTERVAL).await; } Ok(()) } ================================================ FILE: deny.toml ================================================ # This file was auto-generated using `cargo deny init`. # cargo-deny is a cargo plugin that lets you lint your project's # dependency graph to ensure all your dependencies conform # to your expectations and requirements. # Root options [graph] targets = [ { triple = "x86_64-unknown-linux-gnu" }, { triple = "aarch64-unknown-linux-gnu" }, { triple = "aarch64-apple-darwin" }, { triple = "x86_64-apple-darwin" }, ] all-features = false no-default-features = false [output] feature-depth = 1 # This section is considered when running `cargo deny check advisories` # More documentation for the advisories section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html [advisories] db-urls = ["https://github.com/rustsec/advisory-db"] yanked = "warn" [[advisories.ignore]] id = "RUSTSEC-2023-0071" reason = "the marvin attack only affects private key decryption, not public key signature verification" [[advisories.ignore]] id = "RUSTSEC-2024-0436" reason = "The paste crate is a build-only dependency with no runtime components. It is unlikely to have any security impact." 
# This section is considered when running `cargo deny check licenses` # More documentation for the licenses section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html [licenses] version = 2 allow = [ "0BSD", "Apache-2.0", "BSD-2-Clause", "BSD-3-Clause", "CC0-1.0", "CDDL-1.0", "ISC", "MIT", "MPL-2.0", "Unicode-3.0", ] confidence-threshold = 0.8 exceptions = [ # Zlib license has some restrictions if we decide to change something { allow = ["Zlib"], name = "const_format_proc_macros", version = "*" }, { allow = ["Zlib"], name = "const_format", version = "*" }, ] [licenses.private] ignore = true registries = [] # This section is considered when running `cargo deny check bans`. # More documentation about the 'bans' section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html [bans] multiple-versions = "allow" wildcards = "allow" highlight = "all" workspace-default-features = "allow" external-default-features = "allow" allow = [] skip = [] skip-tree = [] [[bans.deny]] # we use tokio, the same rationale applies for async-{io,waker,global-executor,executor,channel,lock}, smol # if you find yourself here while adding a dependency, try "default-features = false", ask around on #rust name = "async-std" [[bans.deny]] name = "async-io" [[bans.deny]] name = "async-waker" [[bans.deny]] name = "async-global-executor" [[bans.deny]] name = "async-executor" [[bans.deny]] name = "smol" [[bans.deny]] # We want to use rustls instead of the platform's native tls implementation. name = "native-tls" [[bans.deny]] name = "openssl" # This section is considered when running `cargo deny check sources`.
# More documentation about the 'sources' section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html [sources] unknown-registry = "warn" unknown-git = "warn" allow-registry = ["https://github.com/rust-lang/crates.io-index"] allow-git = [ # Crate pinned to commit in origin repo due to opentelemetry version. # TODO: Remove this once crate is fetched from crates.io again. "https://github.com/mattiapenati/tower-otel", ] [sources.allow-org] github = [ "neondatabase", ] gitlab = [] bitbucket = [] ================================================ FILE: diesel.toml ================================================ # For documentation on how to configure this file, # see https://diesel.rs/guides/configuring-diesel-cli [print_schema] file = "storage_controller/src/schema.rs" custom_type_derives = ["diesel::query_builder::QueryId"] [migrations_directory] dir = "storage_controller/migrations" ================================================ FILE: docker-compose/README.md ================================================ # Example docker compose configuration The configuration in this directory is used for testing Neon docker images: it is not intended for deploying a usable system. To run a development environment where you can experiment with a miniature Neon system, use `cargo neon` rather than container images. This configuration does not start the storage controller, because the controller needs a way to reconfigure running computes, and no such thing exists in this setup. 
## Generating the JWKS for a compute ```shell openssl genpkey -algorithm Ed25519 -out private-key.pem openssl pkey -in private-key.pem -pubout -out public-key.pem openssl pkey -pubin -inform pem -in public-key.pem -pubout -outform der -out public-key.der key="$(xxd -plain -cols 32 -s -32 public-key.der)" key_id="$(printf '%s' "$key" | sha256sum | awk '{ print $1 }' | basenc --base64url --wrap=0)" x="$(printf '%s' "$key" | basenc --base64url --wrap=0)" ``` ================================================ FILE: docker-compose/compute_wrapper/Dockerfile ================================================ ARG REPOSITORY=ghcr.io/neondatabase ARG COMPUTE_IMAGE=compute-node-v14 ARG TAG=latest FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG ARG COMPUTE_IMAGE USER root RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ apt-get update && \ apt-get install -y curl \ jq \ netcat-openbsd #This is required for the pg_hintplan test RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw /ext-src/postgis-src/ && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw /ext-src/postgis-src USER postgres ================================================ FILE: docker-compose/compute_wrapper/private-key.pem ================================================ -----BEGIN PRIVATE KEY----- MC4CAQAwBQYDK2VwBCIEIOmnRbzt2AJ0d+S3aU1hiYOl/tXpvz1FmWBfwHYBgOma -----END PRIVATE KEY----- ================================================ FILE: docker-compose/compute_wrapper/public-key.pem ================================================ -----BEGIN PUBLIC KEY----- MCowBQYDK2VwAyEADY0al/U0bgB3+9fUGk+3PKWnsck9OyxN5DjHIN6Xep0= -----END PUBLIC KEY----- ================================================ FILE: docker-compose/compute_wrapper/shell/compute.sh ================================================ #!/usr/bin/env bash set -eux # Generate a random tenant or timeline ID # # Takes a variable name as argument. The result is stored in that variable. 
generate_id() {
    # Nameref: writing to resvar assigns to the caller's variable named by $1.
    local -n resvar=${1}
    printf -v resvar '%08x%08x%08x%08x' ${SRANDOM} ${SRANDOM} ${SRANDOM} ${SRANDOM}
}

PG_VERSION=${PG_VERSION:-14}

readonly CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
readonly CONFIG_FILE=/tmp/config.json

# Test that the first library path that the dynamic loader looks in is the path
# that we use for custom compiled software
first_path="$(ldconfig --verbose 2>/dev/null \
    | grep --invert-match ^$'\t' \
    | cut --delimiter=: --fields=1 \
    | head --lines=1)"
test "${first_path}" = '/usr/local/lib'

# Block until the pageserver accepts TCP connections on its pg protocol port.
echo "Waiting pageserver become ready."
while ! nc -z pageserver 6400; do
    sleep 1
done
echo "Page server is ready."

# Work on a copy of the spec so the original config file stays untouched.
cp "${CONFIG_FILE_ORG}" "${CONFIG_FILE}"

if [[ -n "${TENANT_ID:-}" && -n "${TIMELINE_ID:-}" ]]; then
    # Both IDs were supplied through the environment: use them as-is.
    tenant_id=${TENANT_ID}
    timeline_id=${TIMELINE_ID}
else
    echo "Check if a tenant present"
    PARAMS=(
        -X GET
        -H "Content-Type: application/json"
        "http://pageserver:9898/v1/tenant"
    )
    # The jq filter is quoted so the shell cannot glob-expand `.[0].id`.
    tenant_id=$(curl "${PARAMS[@]}" | jq -r '.[0].id')
    if [[ -z "${tenant_id}" || "${tenant_id}" = null ]]; then
        echo "Create a tenant"
        generate_id tenant_id
        PARAMS=(
            -X PUT
            -H "Content-Type: application/json"
            -d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}"
            "http://pageserver:9898/v1/tenant/${tenant_id}/location_config"
        )
        result=$(curl "${PARAMS[@]}")
        printf '%s\n' "${result}" | jq .
    fi

    # When RUN_PARALLEL is "true" the lookup is skipped, so timeline_id stays
    # unset and a fresh timeline is created below.
    if [[ "${RUN_PARALLEL:-false}" != "true" ]]; then
        echo "Check if a timeline present"
        PARAMS=(
            -X GET
            -H "Content-Type: application/json"
            "http://pageserver:9898/v1/tenant/${tenant_id}/timeline"
        )
        timeline_id=$(curl "${PARAMS[@]}" | jq -r '.[0].timeline_id')
    fi

    if [[ -z "${timeline_id:-}" || "${timeline_id:-}" = null ]]; then
        generate_id timeline_id
        # `-sf`: silent, and fail on an HTTP error status. The previous `-sbf`
        # was parsed by curl as `-s -b f` (a cookie source named "f"), so the
        # fail flag never took effect.
        PARAMS=(
            -sf
            -X POST
            -H "Content-Type: application/json"
            -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}"
            "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
        )
        result=$(curl "${PARAMS[@]}")
        printf '%s\n' "${result}" | jq .
fi fi if [[ ${PG_VERSION} -ge 17 ]]; then ulid_extension=pgx_ulid else ulid_extension=ulid fi echo "Adding pgx_ulid" shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE}) sed -i "s|${shared_libraries}|${shared_libraries},${ulid_extension}|" ${CONFIG_FILE} echo "Overwrite tenant id and timeline id in spec file" sed -i "s|TENANT_ID|${tenant_id}|" ${CONFIG_FILE} sed -i "s|TIMELINE_ID|${timeline_id}|" ${CONFIG_FILE} cat ${CONFIG_FILE} echo "Start compute node" /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ -C "postgresql://cloud_admin@localhost:55433/postgres" \ -b /usr/local/bin/postgres \ --compute-id "compute-${RANDOM}" \ --config "${CONFIG_FILE}" --dev ================================================ FILE: docker-compose/compute_wrapper/var/db/postgres/configs/config.json ================================================ { "spec": { "format_version": 1.0, "timestamp": "2022-10-12T18:00:00.000Z", "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c", "suspend_timeout_seconds": -1, "cluster": { "cluster_id": "docker_compose", "name": "docker_compose_test", "state": "restarted", "roles": [ { "name": "cloud_admin", "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8", "options": null } ], "databases": [ ], "settings": [ { "name": "fsync", "value": "off", "vartype": "bool" }, { "name": "wal_level", "value": "logical", "vartype": "enum" }, { "name": "wal_log_hints", "value": "on", "vartype": "bool" }, { "name": "log_connections", "value": "on", "vartype": "bool" }, { "name": "port", "value": "55433", "vartype": "integer" }, { "name": "shared_buffers", "value": "1MB", "vartype": "string" }, { "name": "max_connections", "value": "100", "vartype": "integer" }, { "name": "listen_addresses", "value": "0.0.0.0", "vartype": "string" }, { "name": "max_wal_senders", "value": "10", "vartype": "integer" }, { "name": "max_replication_slots", "value": "10", "vartype": "integer" }, { "name": 
"wal_sender_timeout", "value": "5s", "vartype": "string" }, { "name": "wal_keep_size", "value": "0", "vartype": "integer" }, { "name": "password_encryption", "value": "md5", "vartype": "enum" }, { "name": "restart_after_crash", "value": "off", "vartype": "bool" }, { "name": "synchronous_standby_names", "value": "walproposer", "vartype": "string" }, { "name": "shared_preload_libraries", "value": "neon,pg_cron,timescaledb,pg_stat_statements", "vartype": "string" }, { "name": "neon.safekeepers", "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454", "vartype": "string" }, { "name": "neon.timeline_id", "value": "TIMELINE_ID", "vartype": "string" }, { "name": "neon.tenant_id", "value": "TENANT_ID", "vartype": "string" }, { "name": "neon.pageserver_connstring", "value": "host=pageserver port=6400", "vartype": "string" }, { "name": "max_replication_write_lag", "value": "500MB", "vartype": "string" }, { "name": "max_replication_flush_lag", "value": "10GB", "vartype": "string" }, { "name": "cron.database", "value": "postgres", "vartype": "string" } ] }, "delta_operations": [ ] }, "compute_ctl_config": { "jwks": { "keys": [ { "use": "sig", "key_ops": [ "verify" ], "alg": "EdDSA", "kid": "ZGIxMzAzOGY0YWQwODk2ODU1MTk1NzMxMDFkYmUyOWU2NzZkOWNjNjMyMGRkZGJjOWY0MjdjYWVmNzE1MjUyOAo=", "kty": "OKP", "crv": "Ed25519", "x": "MGQ4ZDFhOTdmNTM0NmUwMDc3ZmJkN2Q0MWE0ZmI3M2NhNWE3YjFjOTNkM2IyYzRkZTQzOGM3MjBkZTk3N2E5ZAo=" } ] } } } ================================================ FILE: docker-compose/docker-compose.yml ================================================ services: minio: restart: always image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z ports: - 9000:9000 - 9001:9001 environment: - MINIO_ROOT_USER=minio - MINIO_ROOT_PASSWORD=password command: server /data --address :9000 --console-address ":9001" minio_create_buckets: image: minio/mc environment: - MINIO_ROOT_USER=minio - MINIO_ROOT_PASSWORD=password entrypoint: - "/bin/sh" - "-c" command: - "until (/usr/bin/mc alias 
set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do echo 'Waiting to start minio...' && sleep 1; done; /usr/bin/mc mb minio/neon --region=eu-north-1; exit 0;" depends_on: - minio pageserver: restart: always image: ${REPOSITORY:-ghcr.io/neondatabase}/neon:${TAG:-latest} environment: - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 ports: #- 6400:6400 # pg protocol handler - 9898:9898 # http endpoints volumes: - ./pageserver_config:/data/.neon/ depends_on: - storage_broker - minio_create_buckets safekeeper1: restart: always image: ${REPOSITORY:-ghcr.io/neondatabase}/neon:${TAG:-latest} environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454 - SAFEKEEPER_ID=1 - BROKER_ENDPOINT=http://storage_broker:50051 - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 ports: #- 5454:5454 # pg protocol handler - 7676:7676 # http endpoints entrypoint: - "/bin/sh" - "-c" command: - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL --listen-http='0.0.0.0:7676' --id=$$SAFEKEEPER_ID --broker-endpoint=$$BROKER_ENDPOINT -D /data --remote-storage=\"{endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/safekeeper/'}\"" depends_on: - storage_broker - minio_create_buckets safekeeper2: restart: always image: ${REPOSITORY:-ghcr.io/neondatabase}/neon:${TAG:-latest} environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454 - SAFEKEEPER_ID=2 - BROKER_ENDPOINT=http://storage_broker:50051 - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 ports: #- 5454:5454 # pg protocol handler - 7677:7676 # http endpoints entrypoint: - "/bin/sh" - "-c" command: - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL --listen-http='0.0.0.0:7676' --id=$$SAFEKEEPER_ID --broker-endpoint=$$BROKER_ENDPOINT -D /data --remote-storage=\"{endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/safekeeper/'}\"" depends_on: - storage_broker 
- minio_create_buckets safekeeper3: restart: always image: ${REPOSITORY:-ghcr.io/neondatabase}/neon:${TAG:-latest} environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454 - SAFEKEEPER_ID=3 - BROKER_ENDPOINT=http://storage_broker:50051 - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 ports: #- 5454:5454 # pg protocol handler - 7678:7676 # http endpoints entrypoint: - "/bin/sh" - "-c" command: - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL --listen-http='0.0.0.0:7676' --id=$$SAFEKEEPER_ID --broker-endpoint=$$BROKER_ENDPOINT -D /data --remote-storage=\"{endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/safekeeper/'}\"" depends_on: - storage_broker - minio_create_buckets storage_broker: restart: always image: ${REPOSITORY:-ghcr.io/neondatabase}/neon:${TAG:-latest} ports: - 50051:50051 command: - "storage_broker" - "--listen-addr=0.0.0.0:50051" compute1: restart: always build: context: ./compute_wrapper/ args: - REPOSITORY=${REPOSITORY:-ghcr.io/neondatabase} - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16} - TAG=${COMPUTE_TAG:-${TAG:-latest}} - http_proxy=${http_proxy:-} - https_proxy=${https_proxy:-} image: built-compute environment: - PG_VERSION=${PG_VERSION:-16} - TENANT_ID=${TENANT_ID:-} - TIMELINE_ID=${TIMELINE_ID:-} #- RUST_BACKTRACE=1 # Mount the test files directly, for faster editing cycle. 
volumes: - ./compute_wrapper/var/db/postgres/configs/:/var/db/postgres/configs/ - ./compute_wrapper/shell/:/shell/ ports: - 55433:55433 # pg protocol handler - 3080:3080 # http endpoints entrypoint: - "/shell/compute.sh" # Add an alias for compute1 for compatibility networks: default: aliases: - compute depends_on: - safekeeper1 - safekeeper2 - safekeeper3 - pageserver compute_is_ready: image: postgres:latest environment: - PARALLEL_COMPUTES=1 entrypoint: - "/bin/sh" - "-c" command: - "for i in $(seq 1 $${PARALLEL_COMPUTES}); do until pg_isready -h compute$$i -p 55433 -U cloud_admin ; do sleep 1; done; done; echo All computes are started" depends_on: - compute1 neon-test-extensions: profiles: ["test-extensions"] image: ${REPOSITORY:-ghcr.io/neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-${PG_VERSION:-16}}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}} environment: - PGUSER=${PGUSER:-cloud_admin} - PGPASSWORD=${PGPASSWORD:-cloud_admin} entrypoint: - "/bin/bash" - "-c" command: - sleep 3600 depends_on: - compute1 ================================================ FILE: docker-compose/docker_compose_test.sh ================================================ #!/usr/bin/env bash # A basic test to ensure Docker images are built correctly. # Builds a wrapper around the compute, starts all services and runs a simple SQL query. # Repeats the process for all currently supported Postgres versions. # Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file. # Their defaults point at the `ghcr.io/neondatabase/neon:latest` image; override them # to verify custom image builds (e.g. pre-published ones). 
#
# A test script for postgres extensions
# Currently supports only v16+
#
set -eux -o pipefail

cd "$(dirname "${0}")"
export COMPOSE_FILE='docker-compose.yml'
export COMPOSE_PROFILES=test-extensions
export PARALLEL_COMPUTES=${PARALLEL_COMPUTES:-1}
# Printed by the compute_is_ready service once every compute accepts connections.
READY_MESSAGE="All computes are started"
# Service names compute1..computeN, one per parallel compute.
COMPUTES=()
for i in $(seq 1 "${PARALLEL_COMPUTES}"); do
  COMPUTES+=("compute${i}")
done
# Host-side staging area for files copied between containers; removed on exit
# together with the generated parallel compose file.
CURRENT_TMPDIR=$(mktemp -d)
trap 'rm -rf ${CURRENT_TMPDIR} docker-compose-parallel.yml' EXIT
if [[ ${PARALLEL_COMPUTES} -gt 1 ]]; then
  export COMPOSE_FILE=docker-compose-parallel.yml
  cp docker-compose.yml docker-compose-parallel.yml
  # Replace the environment variable PARALLEL_COMPUTES with the actual value
  yq eval -i ".services.compute_is_ready.environment |= map(select(. | test(\"^PARALLEL_COMPUTES=\") | not)) + [\"PARALLEL_COMPUTES=${PARALLEL_COMPUTES}\"]" ${COMPOSE_FILE}
  for i in $(seq 2 "${PARALLEL_COMPUTES}"); do
    # Duplicate compute1 as compute${i} for parallel execution
    yq eval -i ".services.compute${i} = .services.compute1" ${COMPOSE_FILE}
    # We don't need these sections, so delete them
    yq eval -i "(del .services.compute${i}.build) | (del .services.compute${i}.ports) | (del .services.compute${i}.networks)" ${COMPOSE_FILE}
    # Let the compute 1 be the only dependence
    yq eval -i ".services.compute${i}.depends_on = [\"compute1\"]" ${COMPOSE_FILE}
    # Set RUN_PARALLEL=true for compute2. They will generate tenant_id and timeline_id to avoid using the same as other computes
    yq eval -i ".services.compute${i}.environment += [\"RUN_PARALLEL=true\"]" ${COMPOSE_FILE}
    # Remove TENANT_ID and TIMELINE_ID from the environment variables of the generated computes
    # They will create new TENANT_ID and TIMELINE_ID anyway.
    yq eval -i ".services.compute${i}.environment |= map(select(. | (test(\"^TENANT_ID=\") or test(\"^TIMELINE_ID=\")) | not))" ${COMPOSE_FILE}
  done
fi
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"

# Show the running containers, then tear the compose project down.
function cleanup() {
    echo "show container information"
    docker ps
    echo "stop containers..."
    docker compose down
}

for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
    # Accept both "16" and "v16".
    pg_version=${pg_version/v/}
    echo "clean up containers if exist"
    cleanup
    # The test-extensions image is only used for v16+, so older versions fall back to 16.
    PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version))
    PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose build compute1
    PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose up --quiet-pull -d
    echo "wait until the compute is ready. timeout after 60s. "
    cnt=0
    while sleep 3; do
        # check timeout
        (( cnt += 3 ))
        if [[ ${cnt} -gt 60 ]]; then
            echo "timeout before the compute is ready."
            exit 1
        fi
        if docker compose logs compute_is_ready | grep -q "${READY_MESSAGE}"; then
            echo "OK. The compute is ready to connect."
            echo "execute simple queries."
            for compute in "${COMPUTES[@]}"; do
              docker compose exec "${compute}" /bin/bash -c "psql ${PSQL_OPTION} -c 'SELECT 1'"
            done
            break
        fi
    done

    if [[ ${pg_version} -ge 16 ]]; then
        # CURRENT_TMPDIR is created once and shared by every iteration of this
        # loop. Start from empty staging directories so that a second v16+
        # iteration does not abort in mkdir (set -e) on already existing
        # directories, and does not copy into leftovers of the previous one.
        rm -rf "${CURRENT_TMPDIR}"/{pg_hint_plan-src,file_fdw,postgis-src}
        mkdir "${CURRENT_TMPDIR}"/{pg_hint_plan-src,file_fdw,postgis-src}
        docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${CURRENT_TMPDIR}/postgis-src/test"
        docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${CURRENT_TMPDIR}/postgis-src/00-regress-install"
        docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${CURRENT_TMPDIR}/pg_hint_plan-src/data"
        docker compose cp neon-test-extensions:/postgres/contrib/file_fdw/data "${CURRENT_TMPDIR}/file_fdw/data"

        for compute in "${COMPUTES[@]}"; do
          # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
          # It cannot be moved to Dockerfile now because the database directory is created after the start of the container
          echo Adding dummy config on "${compute}"
          docker compose exec "${compute}" touch /var/db/postgres/compute/compute_ctl_temp_override.conf
          # Prepare for the PostGIS test
          docker compose exec "${compute}" mkdir -p /tmp/pgis_reg/pgis_reg_tmp /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install
          docker compose cp "${CURRENT_TMPDIR}/postgis-src/test" "${compute}":/ext-src/postgis-src/raster/test
          docker compose cp "${CURRENT_TMPDIR}/postgis-src/00-regress-install" "${compute}":/ext-src/postgis-src/regress
          # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
          docker compose cp "${CURRENT_TMPDIR}/pg_hint_plan-src/data" "${compute}":/ext-src/pg_hint_plan-src/
          # The following block does the same for the contrib/file_fdw test
          docker compose cp "${CURRENT_TMPDIR}/file_fdw/data" "${compute}":/postgres/contrib/file_fdw/data
        done
        # Apply patches
        docker compose exec -T neon-test-extensions bash -c "(cd /postgres && patch -p1)" <"../compute/patches/contrib_pg${pg_version}.patch"
        # We are running tests now
        rm -f testout.txt testout_contrib.txt
        # We want to run the longest tests first to better utilize parallelization and reduce overall test time.
        # Tests listed in the RUN_FIRST variable will be run before others.
        # If parallelization is not used, this environment variable will be ignored.

        docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
            -e RUN_FIRST=hll-src,postgis-src,pgtap-src -e PARALLEL_COMPUTES="${PARALLEL_COMPUTES}" \
            neon-test-extensions /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
        docker compose exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
            -e PARALLEL_COMPUTES="${PARALLEL_COMPUTES}" \
            neon-test-extensions /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0
        if [[ ${EXT_SUCCESS} -eq 0 || ${CONTRIB_SUCCESS} -eq 0 ]]; then
            CONTRIB_FAILED=
            FAILED=
            # The last line of each log is read as a whitespace-separated list
            # of failed test directories; prefix each field with its root.
            # Fields are numbered from 1: starting at 0 would also emit $0 (the
            # whole line) as a bogus first entry.
            [[ ${EXT_SUCCESS} -eq 0 ]] && FAILED=$(tail -1 testout.txt | awk '{for(i=1;i<=NF;i++){print "/ext-src/"$i;}}')
            [[ ${CONTRIB_SUCCESS} -eq 0 ]] && CONTRIB_FAILED=$(tail -1 testout_contrib.txt | awk '{for(i=1;i<=NF;i++){print "/postgres/contrib/"$i;}}')
            # Dump the regression diffs/outputs of every failed test; an exit
            # status of 1 from the dump itself is tolerated.
            for d in ${FAILED} ${CONTRIB_FAILED}; do
                docker compose exec neon-test-extensions bash -c 'for file in $(find '"${d}"' -name regression.diffs -o -name regression.out); do cat ${file}; done' || [[ ${?} -eq 1 ]]
            done
            exit 1
        fi
    fi
done

================================================
FILE: docker-compose/ext-src/README.md
================================================
# PostgreSQL Extensions for Testing

This directory contains PostgreSQL extensions used primarily for:

1. Testing extension upgrades between different Compute versions
2. 
Running regression tests with regular users (mostly for cloud instances) ## Directory Structure Each extension directory follows a standard structure: - `extension-name-src/` - Directory containing test files for the extension - `test-upgrade.sh` - Script for testing upgrade scenarios - `regular-test.sh` - Script for testing with regular users - Additional test files depending on the extension ## Available Extensions This directory includes the following extensions: - `hll-src` - HyperLogLog, a fixed-size data structure for approximating cardinality - `hypopg-src` - Extension to create hypothetical indexes - `ip4r-src` - IPv4/v6 and subnet data types - `pg_cron-src` - Run periodic jobs in PostgreSQL - `pg_graphql-src` - GraphQL support for PostgreSQL - `pg_hint_plan-src` - Execution plan hints - `pg_ivm-src` - Incremental view maintenance - `pg_jsonschema-src` - JSON Schema validation - `pg_repack-src` - Reorganize tables with minimal locks - `pg_roaringbitmap-src` - Roaring bitmap implementation - `pg_semver-src` - Semantic version data type - `pg_session_jwt-src` - JWT authentication for PostgreSQL - `pg_tiktoken-src` - OpenAI Tiktoken tokenizer - `pg_uuidv7-src` - UUIDv7 implementation for PostgreSQL - `pgjwt-src` - JWT tokens for PostgreSQL - `pgrag-src` - Retrieval Augmented Generation for PostgreSQL - `pgtap-src` - Unit testing framework for PostgreSQL - `pgvector-src` - Vector similarity search - `pgx_ulid-src` - ULID data type - `plv8-src` - JavaScript language for PostgreSQL stored procedures - `postgresql-unit-src` - SI units for PostgreSQL - `prefix-src` - Prefix matching for strings - `rag_bge_small_en_v15-src` - BGE embedding model for RAG - `rag_jina_reranker_v1_tiny_en-src` - Jina reranker model for RAG - `rum-src` - RUM access method for text search ## Usage ### Extension Upgrade Testing The extensions in this directory are used by the `test-upgrade.sh` script to test upgrading extensions between different versions of Neon Compute nodes. 
The script: 1. Creates a database with extensions installed on an old Compute version 2. Creates timelines for each extension 3. Switches to a new Compute version and tests the upgrade process 4. Verifies extension functionality after upgrade ### Regular User Testing For testing with regular users (particularly for cloud instances), each extension directory typically contains a `regular-test.sh` script that: 1. Drops the database if it exists 2. Creates a fresh test database 3. Installs the extension 4. Runs regression tests A note about pg_regress: Since pg_regress attempts to set `lc_messages` for the database by default, which is forbidden for regular users, we create databases manually and use the `--use-existing` option to bypass this limitation. ### CI Workflows Two main workflows use these extensions: 1. **Cloud Extensions Test** - Tests extensions on Neon cloud projects 2. **Force Test Upgrading of Extension** - Tests upgrading extensions between different Compute versions These workflows are integrated into the build-and-test pipeline through shell scripts: - `docker_compose_test.sh` - Tests extensions in a Docker Compose environment - `test_extensions_upgrade.sh` - Tests extension upgrades between different Compute versions ## Adding New Extensions To add a new extension for testing: 1. Create a directory named `extension-name-src` in this directory 2. Add at minimum: - `regular-test.sh` for testing with regular users - If `regular-test.sh` doesn't exist, the system will look for `neon-test.sh` - If neither exists, it will try to run `make installcheck` - `test-upgrade.sh` is only needed if you want to test upgrade scenarios 3. Update the list of extensions in the `test_extensions_upgrade.sh` script if needed for upgrade testing ### Patching Extension Sources If you need to patch the extension sources: 1. Place the patch file in the extension's directory 2. 
Apply the patch in the appropriate script (`test-upgrade.sh`, `neon-test.sh`, `regular-test.sh`, or `Makefile`)
3. The patch will be applied during the testing process

================================================
FILE: docker-compose/ext-src/alter_db.sh
================================================
#!/bin/bash
# We need these settings to get the expected output results.
# We cannot use the environment variables e.g. PGTZ due to
# https://github.com/neondatabase/neon/issues/1287
#
# Usage: alter_db.sh [DATABASE]   (DATABASE defaults to contrib_regression)

export DATABASE=${1:-contrib_regression}
# The last -c no longer ends with a dangling line continuation, which would
# silently swallow any line appended after it.
psql -c "ALTER DATABASE ${DATABASE} SET neon.allow_unstable_extensions='on'" \
     -c "ALTER DATABASE ${DATABASE} SET DateStyle='Postgres,MDY'" \
     -c "ALTER DATABASE ${DATABASE} SET TimeZone='America/Los_Angeles'"

================================================
FILE: docker-compose/ext-src/h3-pg-src/neon-test.sh
================================================
#!/usr/bin/env bash
# Runs the h3_postgis and then the h3 regression suites, each against a
# freshly created contrib_regression database.
set -ex
cd "$(dirname "${0}")"
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
dropdb --if-exists contrib_regression
createdb contrib_regression
cd h3_postgis/test
psql -d contrib_regression -c "CREATE EXTENSION postgis" -c "CREATE EXTENSION postgis_raster" -c "CREATE EXTENSION h3" -c "CREATE EXTENSION h3_postgis"
# Test names are the sql/*.sql file names without directory and suffix.
TESTS=$(echo sql/* | sed 's|sql/||g; s|\.sql||g')
${PG_REGRESS} --use-existing --dbname contrib_regression ${TESTS}
cd ../../h3/test
TESTS=$(echo sql/* | sed 's|sql/||g; s|\.sql||g')
dropdb --if-exists contrib_regression
createdb contrib_regression
psql -d contrib_regression -c "CREATE EXTENSION h3"
${PG_REGRESS} --use-existing --dbname contrib_regression ${TESTS}

================================================
FILE: docker-compose/ext-src/h3-pg-src/test-upgrade.sh
================================================
#!/bin/sh
# Runs the h3 regression suite against the existing contrib_regression database.
set -ex
# ${0} is quoted so a checkout path containing spaces does not break dirname.
cd "$(dirname "${0}")"
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
cd h3/test
TESTS=$(echo sql/* | sed 's|sql/||g; s|\.sql||g')
${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression ${TESTS}

================================================
FILE: docker-compose/ext-src/hll-src/regular-test.sh
================================================
#!/bin/sh
# Recreates contrib_regression and runs the hll regression tests against it.
set -ex
# ${0} is quoted so a checkout path containing spaces does not break dirname.
cd "$(dirname "${0}")"
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
dropdb --if-exists contrib_regression
createdb contrib_regression
${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression setup add_agg agg_oob auto_sparse card_op cast_shape copy_binary cumulative_add_cardinality_correction cumulative_add_comprehensive_promotion cumulative_add_sparse_edge cumulative_add_sparse_random cumulative_add_sparse_step cumulative_union_comprehensive cumulative_union_explicit_explicit cumulative_union_explicit_promotion cumulative_union_probabilistic_probabilistic cumulative_union_sparse_full_representation cumulative_union_sparse_promotion cumulative_union_sparse_sparse disable_hashagg equal explicit_thresh hash hash_any meta_func murmur_bigint murmur_bytea nosparse notequal scalar_oob storedproc transaction typmod typmod_insert union_op

================================================
FILE: docker-compose/ext-src/hll-src/test-upgrade.sh
================================================
#!/bin/sh
# Runs the hll regression tests against the existing contrib_regression database.
set -ex
# ${0} is quoted so a checkout path containing spaces does not break dirname.
cd "$(dirname "${0}")"
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression add_agg agg_oob auto_sparse card_op cast_shape copy_binary cumulative_add_cardinality_correction cumulative_add_comprehensive_promotion cumulative_add_sparse_edge cumulative_add_sparse_random cumulative_add_sparse_step cumulative_union_comprehensive cumulative_union_explicit_explicit cumulative_union_explicit_promotion cumulative_union_probabilistic_probabilistic cumulative_union_sparse_full_representation cumulative_union_sparse_promotion cumulative_union_sparse_sparse 
disable_hashagg equal explicit_thresh hash hash_any meta_func murmur_bigint murmur_bytea nosparse notequal scalar_oob storedproc transaction typmod typmod_insert union_op ================================================ FILE: docker-compose/ext-src/hypopg-src/regular-test.sh ================================================ #!/bin/sh set -ex cd "$(dirname ${0})" dropdb --if-exists contrib_regression createdb contrib_regression PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress ${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --inputdir=test --dbname=contrib_regression hypopg hypo_brin hypo_index_part hypo_include hypo_hash hypo_hide_index ================================================ FILE: docker-compose/ext-src/hypopg-src/test-upgrade.patch ================================================ diff --git a/expected/hypopg.out b/expected/hypopg.out index 90121d0..859260b 100644 --- a/expected/hypopg.out +++ b/expected/hypopg.out @@ -11,7 +11,8 @@ BEGIN END; $_$ LANGUAGE plpgsql; -CREATE EXTENSION hypopg; +CREATE EXTENSION IF NOT EXISTS hypopg; +NOTICE: extension "hypopg" already exists, skipping CREATE TABLE hypo (id integer, val text, "Id2" bigint); INSERT INTO hypo SELECT i, 'line ' || i FROM generate_series(1,100000) f(i); diff --git a/test/sql/hypopg.sql b/test/sql/hypopg.sql index 99722b0..8d6bacb 100644 --- a/test/sql/hypopg.sql +++ b/test/sql/hypopg.sql @@ -12,7 +12,7 @@ END; $_$ LANGUAGE plpgsql; -CREATE EXTENSION hypopg; +CREATE EXTENSION IF NOT EXISTS hypopg; CREATE TABLE hypo (id integer, val text, "Id2" bigint); ================================================ FILE: docker-compose/ext-src/hypopg-src/test-upgrade.sh ================================================ #!/bin/sh set -ex cd "$(dirname ${0})" patch -p1 Hello

'); markdown_from_html -------------------- Hello (1 row) SELECT array_length(rag.chunks_by_character_count('the cat sat on the mat', 10, 5), 1); array_length -------------- 3 (1 row) ================================================ FILE: docker-compose/ext-src/pgrag-src/expected/chunking_functions.out ================================================ -- Chunking function tests SELECT rag.chunks_by_character_count('the cat sat on the mat', 10, 5); chunks_by_character_count --------------------------------------- {"the cat","cat sat on","on the mat"} (1 row) SELECT rag.chunks_by_character_count('Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.', 20, 10); chunks_by_character_count ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- {"Lorem ipsum dolor","dolor sit amet,","amet, consectetur","adipiscing elit.","Sed do eiusmod","do eiusmod tempor","tempor incididunt ut","ut labore et dolore","et dolore magna","magna aliqua."} (1 row) SELECT (rag.chunks_by_character_count('the cat', 10, 0))[1]; chunks_by_character_count --------------------------- the cat (1 row) SELECT rag.chunks_by_character_count('', 10, 5); chunks_by_character_count --------------------------- {} (1 row) SELECT rag.chunks_by_character_count('a b c d e f g h i j k l m n o p', 5, 2); chunks_by_character_count ----------------------------------------------------------------- {"a b c","c d e","e f g","g h i","i j k","k l m","m n o","o p"} (1 row) ================================================ FILE: docker-compose/ext-src/pgrag-src/expected/document_processing.out ================================================ -- HTML to Markdown conversion tests SELECT rag.markdown_from_html('

Hello

'); markdown_from_html -------------------- Hello (1 row) SELECT rag.markdown_from_html('

Hello world

'); markdown_from_html -------------------- Hello _world_ (1 row) SELECT rag.markdown_from_html('

Title

Paragraph

'); markdown_from_html -------------------- # Title + + Paragraph (1 row) SELECT rag.markdown_from_html('
  • Item 1
  • Item 2
'); markdown_from_html -------------------- * Item 1 + * Item 2 (1 row) SELECT rag.markdown_from_html('
Link'); markdown_from_html ----------------------------- [Link](https://example.com) (1 row) -- Note: text_from_pdf and text_from_docx require binary input which is harder to test in regression tests -- We'll test that the functions exist and have the right signature SELECT 'text_from_pdf_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'text_from_pdf' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result ----------------------+-------- text_from_pdf_exists | t (1 row) SELECT 'text_from_docx_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'text_from_docx' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result -----------------------+-------- text_from_docx_exists | t (1 row) ================================================ FILE: docker-compose/ext-src/pgrag-src/expected/embedding_api_functions.out ================================================ -- Test embedding functions exist with correct signatures -- OpenAI embedding functions SELECT 'openai_text_embedding_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'openai_text_embedding' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result ------------------------------+-------- openai_text_embedding_exists | t (1 row) SELECT 'openai_text_embedding_3_small_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'openai_text_embedding_3_small' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result --------------------------------------+-------- openai_text_embedding_3_small_exists | t (1 row) SELECT 'openai_text_embedding_3_large_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'openai_text_embedding_3_large' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result --------------------------------------+-------- 
openai_text_embedding_3_large_exists | t (1 row) SELECT 'openai_text_embedding_ada_002_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'openai_text_embedding_ada_002' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result --------------------------------------+-------- openai_text_embedding_ada_002_exists | t (1 row) -- Fireworks embedding functions SELECT 'fireworks_nomic_embed_text_v1_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'fireworks_nomic_embed_text_v1' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result --------------------------------------+-------- fireworks_nomic_embed_text_v1_exists | t (1 row) SELECT 'fireworks_nomic_embed_text_v15_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'fireworks_nomic_embed_text_v15' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result ---------------------------------------+-------- fireworks_nomic_embed_text_v15_exists | t (1 row) SELECT 'fireworks_text_embedding_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'fireworks_text_embedding' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result ---------------------------------+-------- fireworks_text_embedding_exists | t (1 row) SELECT 'fireworks_text_embedding_thenlper_gte_base_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'fireworks_text_embedding_thenlper_gte_base' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result ---------------------------------------------------+-------- fireworks_text_embedding_thenlper_gte_base_exists | t (1 row) SELECT 'fireworks_text_embedding_thenlper_gte_large_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'fireworks_text_embedding_thenlper_gte_large' AND pronamespace = (SELECT oid FROM 
pg_namespace WHERE nspname = 'rag'); test_name | result ----------------------------------------------------+-------- fireworks_text_embedding_thenlper_gte_large_exists | t (1 row) SELECT 'fireworks_text_embedding_whereisai_uae_large_v1_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'fireworks_text_embedding_whereisai_uae_large_v1' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result --------------------------------------------------------+-------- fireworks_text_embedding_whereisai_uae_large_v1_exists | t (1 row) ================================================ FILE: docker-compose/ext-src/pgrag-src/expected/embedding_functions.out ================================================ BEGIN CREATE EXTENSION IF NOT EXISTS vector; DROP EXTENSION IF EXISTS rag CASCADE; CREATE EXTENSION rag CASCADE; test_name|result openai_embedding_dimensions_test|t test_name|result fireworks_embedding_dimensions_test|t COMMIT ================================================ FILE: docker-compose/ext-src/pgrag-src/expected/text_processing.out ================================================ -- Text processing function tests SELECT rag.markdown_from_html('

Hello world

'); markdown_from_html -------------------- Hello _world_ (1 row) SELECT rag.chunks_by_character_count('the cat sat on the mat', 10, 5); chunks_by_character_count --------------------------------------- {"the cat","cat sat on","on the mat"} (1 row) ================================================ FILE: docker-compose/ext-src/pgrag-src/expected/voyageai_functions.out ================================================ -- Test VoyageAI API key functions SELECT 'voyageai_api_key_test' AS test_name, (SELECT rag.voyageai_set_api_key('test_key') IS NULL) AS result; test_name | result -----------------------+-------- voyageai_api_key_test | t (1 row) SELECT 'voyageai_get_api_key_test' AS test_name, (SELECT rag.voyageai_get_api_key() = 'test_key') AS result; test_name | result ---------------------------+-------- voyageai_get_api_key_test | t (1 row) -- Test VoyageAI embedding functions exist SELECT 'voyageai_embedding_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_embedding' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result ---------------------------+-------- voyageai_embedding_exists | t (1 row) SELECT 'voyageai_embedding_3_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_embedding_3' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result -----------------------------+-------- voyageai_embedding_3_exists | t (1 row) SELECT 'voyageai_embedding_3_lite_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_embedding_3_lite' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result ----------------------------------+-------- voyageai_embedding_3_lite_exists | t (1 row) SELECT 'voyageai_embedding_code_2_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_embedding_code_2' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 
'rag'); test_name | result ----------------------------------+-------- voyageai_embedding_code_2_exists | t (1 row) SELECT 'voyageai_embedding_finance_2_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_embedding_finance_2' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result -------------------------------------+-------- voyageai_embedding_finance_2_exists | t (1 row) SELECT 'voyageai_embedding_law_2_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_embedding_law_2' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result ---------------------------------+-------- voyageai_embedding_law_2_exists | t (1 row) SELECT 'voyageai_embedding_multilingual_2_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_embedding_multilingual_2' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result ------------------------------------------+-------- voyageai_embedding_multilingual_2_exists | t (1 row) -- Test VoyageAI reranking functions exist SELECT 'voyageai_rerank_distance_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_rerank_distance' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result ---------------------------------+-------- voyageai_rerank_distance_exists | t (1 row) SELECT 'voyageai_rerank_score_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_rerank_score' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); test_name | result ------------------------------+-------- voyageai_rerank_score_exists | t (1 row) -- Test VoyageAI function signatures SELECT 'voyageai_embedding_signature' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_embedding' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag') 
AND pronargs = 3; test_name | result ------------------------------+-------- voyageai_embedding_signature | t (1 row) SELECT 'voyageai_rerank_distance_signature' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_rerank_distance' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag') AND pronargs IN (3, 4); test_name | result ------------------------------------+-------- voyageai_rerank_distance_signature | t (1 row) SELECT 'voyageai_rerank_score_signature' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_rerank_score' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag') AND pronargs IN (3, 4); test_name | result ---------------------------------+-------- voyageai_rerank_score_signature | t (1 row) ================================================ FILE: docker-compose/ext-src/pgrag-src/regular-test.sh ================================================ #!/bin/sh set -ex cd "$(dirname "${0}")" dropdb --if-exist contrib_regression createdb contrib_regression . 
../alter_db.sh psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag" PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress ${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --load-extension=vector --load-extension=rag --dbname=contrib_regression basic_functions text_processing api_keys chunking_functions document_processing embedding_api_functions voyageai_functions ================================================ FILE: docker-compose/ext-src/pgrag-src/sql/api_keys.sql ================================================ -- API key function tests SELECT rag.anthropic_set_api_key('test_key'); SELECT rag.anthropic_get_api_key(); SELECT rag.openai_set_api_key('test_key'); SELECT rag.openai_get_api_key(); SELECT rag.fireworks_set_api_key('test_key'); SELECT rag.fireworks_get_api_key(); SELECT rag.voyageai_set_api_key('test_key'); SELECT rag.voyageai_get_api_key(); ================================================ FILE: docker-compose/ext-src/pgrag-src/sql/basic_functions.sql ================================================ -- Basic function tests SELECT rag.markdown_from_html('

Hello

'); SELECT array_length(rag.chunks_by_character_count('the cat sat on the mat', 10, 5), 1); ================================================ FILE: docker-compose/ext-src/pgrag-src/sql/chunking_functions.sql ================================================ -- Chunking function tests SELECT rag.chunks_by_character_count('the cat sat on the mat', 10, 5); SELECT rag.chunks_by_character_count('Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.', 20, 10); SELECT (rag.chunks_by_character_count('the cat', 10, 0))[1]; SELECT rag.chunks_by_character_count('', 10, 5); SELECT rag.chunks_by_character_count('a b c d e f g h i j k l m n o p', 5, 2); ================================================ FILE: docker-compose/ext-src/pgrag-src/sql/document_processing.sql ================================================ -- HTML to Markdown conversion tests SELECT rag.markdown_from_html('

Hello

'); SELECT rag.markdown_from_html('

Hello world

'); SELECT rag.markdown_from_html('

Title

Paragraph

'); SELECT rag.markdown_from_html('
  • Item 1
  • Item 2
'); SELECT rag.markdown_from_html('Link'); -- Note: text_from_pdf and text_from_docx require binary input which is harder to test in regression tests -- We'll test that the functions exist and have the right signature SELECT 'text_from_pdf_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'text_from_pdf' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); SELECT 'text_from_docx_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'text_from_docx' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); ================================================ FILE: docker-compose/ext-src/pgrag-src/sql/embedding_api_functions.sql ================================================ -- Test embedding functions exist with correct signatures -- OpenAI embedding functions SELECT 'openai_text_embedding_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'openai_text_embedding' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); SELECT 'openai_text_embedding_3_small_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'openai_text_embedding_3_small' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); SELECT 'openai_text_embedding_3_large_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'openai_text_embedding_3_large' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); SELECT 'openai_text_embedding_ada_002_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'openai_text_embedding_ada_002' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); -- Fireworks embedding functions SELECT 'fireworks_nomic_embed_text_v1_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'fireworks_nomic_embed_text_v1' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); SELECT 'fireworks_nomic_embed_text_v15_exists' AS 
test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'fireworks_nomic_embed_text_v15' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); SELECT 'fireworks_text_embedding_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'fireworks_text_embedding' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); SELECT 'fireworks_text_embedding_thenlper_gte_base_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'fireworks_text_embedding_thenlper_gte_base' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); SELECT 'fireworks_text_embedding_thenlper_gte_large_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'fireworks_text_embedding_thenlper_gte_large' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); SELECT 'fireworks_text_embedding_whereisai_uae_large_v1_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'fireworks_text_embedding_whereisai_uae_large_v1' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); ================================================ FILE: docker-compose/ext-src/pgrag-src/sql/text_processing.sql ================================================ -- Text processing function tests SELECT rag.markdown_from_html('

Hello world

'); SELECT rag.chunks_by_character_count('the cat sat on the mat', 10, 5); ================================================ FILE: docker-compose/ext-src/pgrag-src/sql/voyageai_functions.sql ================================================ -- Test VoyageAI API key functions SELECT 'voyageai_api_key_test' AS test_name, (SELECT rag.voyageai_set_api_key('test_key') IS NULL) AS result; SELECT 'voyageai_get_api_key_test' AS test_name, (SELECT rag.voyageai_get_api_key() = 'test_key') AS result; -- Test VoyageAI embedding functions exist SELECT 'voyageai_embedding_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_embedding' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); SELECT 'voyageai_embedding_3_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_embedding_3' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); SELECT 'voyageai_embedding_3_lite_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_embedding_3_lite' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); SELECT 'voyageai_embedding_code_2_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_embedding_code_2' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); SELECT 'voyageai_embedding_finance_2_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_embedding_finance_2' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); SELECT 'voyageai_embedding_law_2_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_embedding_law_2' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); SELECT 'voyageai_embedding_multilingual_2_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_embedding_multilingual_2' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); -- 
Test VoyageAI reranking functions exist SELECT 'voyageai_rerank_distance_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_rerank_distance' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); SELECT 'voyageai_rerank_score_exists' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_rerank_score' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag'); -- Test VoyageAI function signatures SELECT 'voyageai_embedding_signature' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_embedding' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag') AND pronargs = 3; SELECT 'voyageai_rerank_distance_signature' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_rerank_distance' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag') AND pronargs IN (3, 4); SELECT 'voyageai_rerank_score_signature' AS test_name, count(*) > 0 AS result FROM pg_proc WHERE proname = 'voyageai_rerank_score' AND pronamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'rag') AND pronargs IN (3, 4); ================================================ FILE: docker-compose/ext-src/pgtap-src/regular-test.sh ================================================ #!/bin/sh set -ex cd "$(dirname ${0})" make installcheck || true dropdb --if-exist contrib_regression createdb contrib_regression PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress sed -i '/hastap/d' test/build/run.sch sed -Ei 's/\b(aretap|enumtap|ownership|privs|usergroup)\b//g' test/build/run.sch ${PG_REGRESS} --use-existing --dbname=contrib_regression --inputdir=./ --bindir='/usr/local/pgsql/bin' --inputdir=test --max-connections=879 --schedule test/schedule/main.sch --schedule test/build/run.sch ================================================ FILE: docker-compose/ext-src/pgtap-src/test-upgrade.patch ================================================ diff 
--git a/Makefile b/Makefile index f255fe6..0a0fa65 100644 --- a/Makefile +++ b/Makefile @@ -346,7 +346,7 @@ test: test-serial test-parallel TB_DIR = test/build GENERATED_SCHEDULE_DEPS = $(TB_DIR)/all_tests $(TB_DIR)/exclude_tests REGRESS = --schedule $(TB_DIR)/run.sch # Set this again just to be safe -REGRESS_OPTS = --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF) +REGRESS_OPTS = --use-existing --dbname=contrib_regression --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF) SETUP_SCH = test/schedule/main.sch # schedule to use for test setup; this can be forcibly changed by some targets! IGNORE_TESTS = $(notdir $(EXCLUDE_TEST_FILES:.sql=)) PARALLEL_TESTS = $(filter-out $(IGNORE_TESTS),$(filter-out $(SERIAL_TESTS),$(ALL_TESTS))) diff --git a/test/schedule/create.sql b/test/schedule/create.sql index ba355ed..7e250f5 100644 --- a/test/schedule/create.sql +++ b/test/schedule/create.sql @@ -1,3 +1,2 @@ \unset ECHO \i test/psql.sql -CREATE EXTENSION pgtap; diff --git a/test/schedule/main.sch b/test/schedule/main.sch index a8a5fbc..0463fc4 100644 --- a/test/schedule/main.sch +++ b/test/schedule/main.sch @@ -1,2 +1 @@ -test: build test: create ================================================ FILE: docker-compose/ext-src/pgtap-src/test-upgrade.sh ================================================ #!/bin/sh set -ex cd "$(dirname ${0})" patch -p1 =" 120),1) - TESTS += \ - $(top_srcdir)/regress/core/computed_columns -endif - ifeq ($(shell expr "$(POSTGIS_GEOS_VERSION)" ">=" 30700),1) # GEOS-3.7 adds: # ST_FrechetDistance diff --git a/regress/runtest.mk b/regress/runtest.mk index c051f03..010e493 100644 --- a/regress/runtest.mk +++ b/regress/runtest.mk @@ -24,16 +24,6 @@ check-regress: POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(RUNTESTFLAGS_INTERNAL) $(TESTS) - @if echo "$(RUNTESTFLAGS)" | grep -vq -- --upgrade; then \ - echo "Running upgrade 
test as RUNTESTFLAGS did not contain that"; \ - POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl \ - --upgrade \ - $(RUNTESTFLAGS) \ - $(RUNTESTFLAGS_INTERNAL) \ - $(TESTS); \ - else \ - echo "Skipping upgrade test as RUNTESTFLAGS already requested upgrades"; \ - fi check-long: $(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(TESTS) $(TESTS_SLOW) ================================================ FILE: docker-compose/ext-src/postgis-src/postgis-common-v17.patch ================================================ diff --git a/regress/core/tests.mk b/regress/core/tests.mk index 9e05244..90987df 100644 --- a/regress/core/tests.mk +++ b/regress/core/tests.mk @@ -143,8 +143,7 @@ TESTS += \ $(top_srcdir)/regress/core/oriented_envelope \ $(top_srcdir)/regress/core/point_coordinates \ $(top_srcdir)/regress/core/out_geojson \ - $(top_srcdir)/regress/core/wrapx \ - $(top_srcdir)/regress/core/computed_columns + $(top_srcdir)/regress/core/wrapx # Slow slow tests TESTS_SLOW = \ diff --git a/regress/runtest.mk b/regress/runtest.mk index 4b95b7e..449d5a2 100644 --- a/regress/runtest.mk +++ b/regress/runtest.mk @@ -24,16 +24,6 @@ check-regress: @POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(RUNTESTFLAGS_INTERNAL) $(TESTS) - @if echo "$(RUNTESTFLAGS)" | grep -vq -- --upgrade; then \ - echo "Running upgrade test as RUNTESTFLAGS did not contain that"; \ - POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl \ - --upgrade \ - $(RUNTESTFLAGS) \ - $(RUNTESTFLAGS_INTERNAL) \ - $(TESTS); \ - else \ - echo "Skipping upgrade test as RUNTESTFLAGS already requested upgrades"; \ - fi check-long: $(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(TESTS) $(TESTS_SLOW) ================================================ FILE: docker-compose/ext-src/postgis-src/postgis-regular-v16.patch ================================================ diff --git 
a/raster/test/regress/tests.mk b/raster/test/regress/tests.mk index 00918e1..7e2b6cd 100644 --- a/raster/test/regress/tests.mk +++ b/raster/test/regress/tests.mk @@ -17,9 +17,7 @@ override RUNTESTFLAGS_INTERNAL := \ $(RUNTESTFLAGS_INTERNAL) \ --after-upgrade-script $(top_srcdir)/raster/test/regress/hooks/hook-after-upgrade-raster.sql -RASTER_TEST_FIRST = \ - $(top_srcdir)/raster/test/regress/check_gdal \ - $(top_srcdir)/raster/test/regress/loader/load_outdb +RASTER_TEST_FIRST = RASTER_TEST_LAST = \ $(top_srcdir)/raster/test/regress/clean @@ -33,9 +31,7 @@ RASTER_TEST_IO = \ RASTER_TEST_BASIC_FUNC = \ $(top_srcdir)/raster/test/regress/rt_bytea \ - $(top_srcdir)/raster/test/regress/rt_wkb \ $(top_srcdir)/raster/test/regress/box3d \ - $(top_srcdir)/raster/test/regress/rt_addband \ $(top_srcdir)/raster/test/regress/rt_band \ $(top_srcdir)/raster/test/regress/rt_tile @@ -73,16 +69,10 @@ RASTER_TEST_BANDPROPS = \ $(top_srcdir)/raster/test/regress/rt_neighborhood \ $(top_srcdir)/raster/test/regress/rt_nearestvalue \ $(top_srcdir)/raster/test/regress/rt_pixelofvalue \ - $(top_srcdir)/raster/test/regress/rt_polygon \ - $(top_srcdir)/raster/test/regress/rt_setbandpath + $(top_srcdir)/raster/test/regress/rt_polygon RASTER_TEST_UTILITY = \ $(top_srcdir)/raster/test/regress/rt_utility \ - $(top_srcdir)/raster/test/regress/rt_fromgdalraster \ - $(top_srcdir)/raster/test/regress/rt_asgdalraster \ - $(top_srcdir)/raster/test/regress/rt_astiff \ - $(top_srcdir)/raster/test/regress/rt_asjpeg \ - $(top_srcdir)/raster/test/regress/rt_aspng \ $(top_srcdir)/raster/test/regress/rt_reclass \ $(top_srcdir)/raster/test/regress/rt_gdalwarp \ $(top_srcdir)/raster/test/regress/rt_gdalcontour \ @@ -120,21 +110,13 @@ RASTER_TEST_SREL = \ RASTER_TEST_BUGS = \ $(top_srcdir)/raster/test/regress/bug_test_car5 \ - $(top_srcdir)/raster/test/regress/permitted_gdal_drivers \ $(top_srcdir)/raster/test/regress/tickets RASTER_TEST_LOADER = \ $(top_srcdir)/raster/test/regress/loader/Basic \ 
$(top_srcdir)/raster/test/regress/loader/Projected \ $(top_srcdir)/raster/test/regress/loader/BasicCopy \ - $(top_srcdir)/raster/test/regress/loader/BasicFilename \ - $(top_srcdir)/raster/test/regress/loader/BasicOutDB \ - $(top_srcdir)/raster/test/regress/loader/Tiled10x10 \ - $(top_srcdir)/raster/test/regress/loader/Tiled10x10Copy \ - $(top_srcdir)/raster/test/regress/loader/Tiled8x8 \ - $(top_srcdir)/raster/test/regress/loader/TiledAuto \ - $(top_srcdir)/raster/test/regress/loader/TiledAutoSkipNoData \ - $(top_srcdir)/raster/test/regress/loader/TiledAutoCopyn + $(top_srcdir)/raster/test/regress/loader/BasicFilename RASTER_TESTS := $(RASTER_TEST_FIRST) \ $(RASTER_TEST_METADATA) $(RASTER_TEST_IO) $(RASTER_TEST_BASIC_FUNC) \ diff --git a/regress/core/binary.sql b/regress/core/binary.sql index 7a36b65..ad78fc7 100644 --- a/regress/core/binary.sql +++ b/regress/core/binary.sql @@ -1,4 +1,5 @@ SET client_min_messages TO warning; + CREATE SCHEMA tm; CREATE TABLE tm.geoms (id serial, g geometry); @@ -31,24 +32,39 @@ SELECT st_force4d(g) FROM tm.geoms WHERE id < 15 ORDER BY id; INSERT INTO tm.geoms(g) SELECT st_setsrid(g,4326) FROM tm.geoms ORDER BY id; -COPY tm.geoms TO :tmpfile WITH BINARY; +-- define temp file path +\set tmpfile '/tmp/postgis_binary_test.dat' + +-- export +\set command '\\copy tm.geoms TO ':tmpfile' WITH (FORMAT BINARY)' +:command + +-- import CREATE TABLE tm.geoms_in AS SELECT * FROM tm.geoms LIMIT 0; -COPY tm.geoms_in FROM :tmpfile WITH BINARY; -SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o WHERE i.id = o.id - AND ST_OrderingEquals(i.g, o.g); +\set command '\\copy tm.geoms_in FROM ':tmpfile' WITH (FORMAT BINARY)' +:command + +SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o +WHERE i.id = o.id AND ST_OrderingEquals(i.g, o.g); CREATE TABLE tm.geogs AS SELECT id,g::geography FROM tm.geoms WHERE geometrytype(g) NOT LIKE '%CURVE%' AND geometrytype(g) NOT LIKE '%CIRCULAR%' AND geometrytype(g) NOT LIKE '%SURFACE%' AND geometrytype(g) 
NOT LIKE 'TRIANGLE%' - AND geometrytype(g) NOT LIKE 'TIN%' -; + AND geometrytype(g) NOT LIKE 'TIN%'; -COPY tm.geogs TO :tmpfile WITH BINARY; +-- export +\set command '\\copy tm.geogs TO ':tmpfile' WITH (FORMAT BINARY)' +:command + +-- import CREATE TABLE tm.geogs_in AS SELECT * FROM tm.geogs LIMIT 0; -COPY tm.geogs_in FROM :tmpfile WITH BINARY; -SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o WHERE i.id = o.id - AND ST_OrderingEquals(i.g::geometry, o.g::geometry); +\set command '\\copy tm.geogs_in FROM ':tmpfile' WITH (FORMAT BINARY)' +:command + +SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o +WHERE i.id = o.id AND ST_OrderingEquals(i.g::geometry, o.g::geometry); DROP SCHEMA tm CASCADE; + diff --git a/regress/core/tests.mk b/regress/core/tests.mk index 64a9254..94903c3 100644 --- a/regress/core/tests.mk +++ b/regress/core/tests.mk @@ -23,7 +23,6 @@ current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) RUNTESTFLAGS_INTERNAL += \ --before-upgrade-script $(top_srcdir)/regress/hooks/hook-before-upgrade.sql \ --after-upgrade-script $(top_srcdir)/regress/hooks/hook-after-upgrade.sql \ - --after-create-script $(top_srcdir)/regress/hooks/hook-after-create.sql \ --before-uninstall-script $(top_srcdir)/regress/hooks/hook-before-uninstall.sql TESTS += \ @@ -40,7 +39,6 @@ TESTS += \ $(top_srcdir)/regress/core/dumppoints \ $(top_srcdir)/regress/core/dumpsegments \ $(top_srcdir)/regress/core/empty \ - $(top_srcdir)/regress/core/estimatedextent \ $(top_srcdir)/regress/core/forcecurve \ $(top_srcdir)/regress/core/flatgeobuf \ $(top_srcdir)/regress/core/geography \ @@ -55,7 +53,6 @@ TESTS += \ $(top_srcdir)/regress/core/out_marc21 \ $(top_srcdir)/regress/core/in_encodedpolyline \ $(top_srcdir)/regress/core/iscollection \ - $(top_srcdir)/regress/core/legacy \ $(top_srcdir)/regress/core/letters \ $(top_srcdir)/regress/core/long_xact \ $(top_srcdir)/regress/core/lwgeom_regress \ @@ -112,7 +109,6 @@ TESTS += \ $(top_srcdir)/regress/core/temporal_knn \ 
$(top_srcdir)/regress/core/tickets \ $(top_srcdir)/regress/core/twkb \ - $(top_srcdir)/regress/core/typmod \ $(top_srcdir)/regress/core/wkb \ $(top_srcdir)/regress/core/wkt \ $(top_srcdir)/regress/core/wmsservers \ diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk index 1fc77ac..c3cb9de 100644 --- a/regress/loader/tests.mk +++ b/regress/loader/tests.mk @@ -38,7 +38,5 @@ TESTS += \ $(top_srcdir)/regress/loader/Latin1 \ $(top_srcdir)/regress/loader/Latin1-implicit \ $(top_srcdir)/regress/loader/mfile \ - $(top_srcdir)/regress/loader/TestSkipANALYZE \ - $(top_srcdir)/regress/loader/TestANALYZE \ $(top_srcdir)/regress/loader/CharNoWidth diff --git a/regress/run_test.pl b/regress/run_test.pl index 0ec5b2d..1c331f4 100755 --- a/regress/run_test.pl +++ b/regress/run_test.pl @@ -147,7 +147,6 @@ $ENV{"LANG"} = "C"; # Add locale info to the psql options # Add pg12 precision suppression my $PGOPTIONS = $ENV{"PGOPTIONS"}; -$PGOPTIONS .= " -c lc_messages=C"; $PGOPTIONS .= " -c client_min_messages=NOTICE"; $PGOPTIONS .= " -c extra_float_digits=0"; $ENV{"PGOPTIONS"} = $PGOPTIONS; ================================================ FILE: docker-compose/ext-src/postgis-src/postgis-regular-v17.patch ================================================ diff --git a/raster/test/regress/tests.mk b/raster/test/regress/tests.mk index 00918e1..7e2b6cd 100644 --- a/raster/test/regress/tests.mk +++ b/raster/test/regress/tests.mk @@ -17,9 +17,7 @@ override RUNTESTFLAGS_INTERNAL := \ $(RUNTESTFLAGS_INTERNAL) \ --after-upgrade-script $(top_srcdir)/raster/test/regress/hooks/hook-after-upgrade-raster.sql -RASTER_TEST_FIRST = \ - $(top_srcdir)/raster/test/regress/check_gdal \ - $(top_srcdir)/raster/test/regress/loader/load_outdb +RASTER_TEST_FIRST = RASTER_TEST_LAST = \ $(top_srcdir)/raster/test/regress/clean @@ -33,9 +31,7 @@ RASTER_TEST_IO = \ RASTER_TEST_BASIC_FUNC = \ $(top_srcdir)/raster/test/regress/rt_bytea \ - $(top_srcdir)/raster/test/regress/rt_wkb \ 
$(top_srcdir)/raster/test/regress/box3d \ - $(top_srcdir)/raster/test/regress/rt_addband \ $(top_srcdir)/raster/test/regress/rt_band \ $(top_srcdir)/raster/test/regress/rt_tile @@ -73,16 +69,10 @@ RASTER_TEST_BANDPROPS = \ $(top_srcdir)/raster/test/regress/rt_neighborhood \ $(top_srcdir)/raster/test/regress/rt_nearestvalue \ $(top_srcdir)/raster/test/regress/rt_pixelofvalue \ - $(top_srcdir)/raster/test/regress/rt_polygon \ - $(top_srcdir)/raster/test/regress/rt_setbandpath + $(top_srcdir)/raster/test/regress/rt_polygon RASTER_TEST_UTILITY = \ $(top_srcdir)/raster/test/regress/rt_utility \ - $(top_srcdir)/raster/test/regress/rt_fromgdalraster \ - $(top_srcdir)/raster/test/regress/rt_asgdalraster \ - $(top_srcdir)/raster/test/regress/rt_astiff \ - $(top_srcdir)/raster/test/regress/rt_asjpeg \ - $(top_srcdir)/raster/test/regress/rt_aspng \ $(top_srcdir)/raster/test/regress/rt_reclass \ $(top_srcdir)/raster/test/regress/rt_gdalwarp \ $(top_srcdir)/raster/test/regress/rt_gdalcontour \ @@ -120,21 +110,13 @@ RASTER_TEST_SREL = \ RASTER_TEST_BUGS = \ $(top_srcdir)/raster/test/regress/bug_test_car5 \ - $(top_srcdir)/raster/test/regress/permitted_gdal_drivers \ $(top_srcdir)/raster/test/regress/tickets RASTER_TEST_LOADER = \ $(top_srcdir)/raster/test/regress/loader/Basic \ $(top_srcdir)/raster/test/regress/loader/Projected \ $(top_srcdir)/raster/test/regress/loader/BasicCopy \ - $(top_srcdir)/raster/test/regress/loader/BasicFilename \ - $(top_srcdir)/raster/test/regress/loader/BasicOutDB \ - $(top_srcdir)/raster/test/regress/loader/Tiled10x10 \ - $(top_srcdir)/raster/test/regress/loader/Tiled10x10Copy \ - $(top_srcdir)/raster/test/regress/loader/Tiled8x8 \ - $(top_srcdir)/raster/test/regress/loader/TiledAuto \ - $(top_srcdir)/raster/test/regress/loader/TiledAutoSkipNoData \ - $(top_srcdir)/raster/test/regress/loader/TiledAutoCopyn + $(top_srcdir)/raster/test/regress/loader/BasicFilename RASTER_TESTS := $(RASTER_TEST_FIRST) \ $(RASTER_TEST_METADATA) $(RASTER_TEST_IO) 
$(RASTER_TEST_BASIC_FUNC) \ diff --git a/regress/core/binary.sql b/regress/core/binary.sql index 7a36b65..ad78fc7 100644 --- a/regress/core/binary.sql +++ b/regress/core/binary.sql @@ -1,4 +1,5 @@ SET client_min_messages TO warning; + CREATE SCHEMA tm; CREATE TABLE tm.geoms (id serial, g geometry); @@ -31,24 +32,39 @@ SELECT st_force4d(g) FROM tm.geoms WHERE id < 15 ORDER BY id; INSERT INTO tm.geoms(g) SELECT st_setsrid(g,4326) FROM tm.geoms ORDER BY id; -COPY tm.geoms TO :tmpfile WITH BINARY; +-- define temp file path +\set tmpfile '/tmp/postgis_binary_test.dat' + +-- export +\set command '\\copy tm.geoms TO ':tmpfile' WITH (FORMAT BINARY)' +:command + +-- import CREATE TABLE tm.geoms_in AS SELECT * FROM tm.geoms LIMIT 0; -COPY tm.geoms_in FROM :tmpfile WITH BINARY; -SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o WHERE i.id = o.id - AND ST_OrderingEquals(i.g, o.g); +\set command '\\copy tm.geoms_in FROM ':tmpfile' WITH (FORMAT BINARY)' +:command + +SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o +WHERE i.id = o.id AND ST_OrderingEquals(i.g, o.g); CREATE TABLE tm.geogs AS SELECT id,g::geography FROM tm.geoms WHERE geometrytype(g) NOT LIKE '%CURVE%' AND geometrytype(g) NOT LIKE '%CIRCULAR%' AND geometrytype(g) NOT LIKE '%SURFACE%' AND geometrytype(g) NOT LIKE 'TRIANGLE%' - AND geometrytype(g) NOT LIKE 'TIN%' -; + AND geometrytype(g) NOT LIKE 'TIN%'; -COPY tm.geogs TO :tmpfile WITH BINARY; +-- export +\set command '\\copy tm.geogs TO ':tmpfile' WITH (FORMAT BINARY)' +:command + +-- import CREATE TABLE tm.geogs_in AS SELECT * FROM tm.geogs LIMIT 0; -COPY tm.geogs_in FROM :tmpfile WITH BINARY; -SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o WHERE i.id = o.id - AND ST_OrderingEquals(i.g::geometry, o.g::geometry); +\set command '\\copy tm.geogs_in FROM ':tmpfile' WITH (FORMAT BINARY)' +:command + +SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o +WHERE i.id = o.id AND ST_OrderingEquals(i.g::geometry, o.g::geometry); DROP 
SCHEMA tm CASCADE; + diff --git a/regress/core/tests.mk b/regress/core/tests.mk index 90987df..74fe3f1 100644 --- a/regress/core/tests.mk +++ b/regress/core/tests.mk @@ -16,14 +16,13 @@ POSTGIS_PGSQL_VERSION=170 POSTGIS_GEOS_VERSION=31101 HAVE_JSON=yes HAVE_SPGIST=yes -INTERRUPTTESTS=yes +INTERRUPTTESTS=no current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) RUNTESTFLAGS_INTERNAL += \ --before-upgrade-script $(top_srcdir)/regress/hooks/hook-before-upgrade.sql \ --after-upgrade-script $(top_srcdir)/regress/hooks/hook-after-upgrade.sql \ - --after-create-script $(top_srcdir)/regress/hooks/hook-after-create.sql \ --before-uninstall-script $(top_srcdir)/regress/hooks/hook-before-uninstall.sql TESTS += \ @@ -40,7 +39,6 @@ TESTS += \ $(top_srcdir)/regress/core/dumppoints \ $(top_srcdir)/regress/core/dumpsegments \ $(top_srcdir)/regress/core/empty \ - $(top_srcdir)/regress/core/estimatedextent \ $(top_srcdir)/regress/core/forcecurve \ $(top_srcdir)/regress/core/flatgeobuf \ $(top_srcdir)/regress/core/frechet \ @@ -60,7 +58,6 @@ TESTS += \ $(top_srcdir)/regress/core/out_marc21 \ $(top_srcdir)/regress/core/in_encodedpolyline \ $(top_srcdir)/regress/core/iscollection \ - $(top_srcdir)/regress/core/legacy \ $(top_srcdir)/regress/core/letters \ $(top_srcdir)/regress/core/lwgeom_regress \ $(top_srcdir)/regress/core/measures \ @@ -119,7 +116,6 @@ TESTS += \ $(top_srcdir)/regress/core/temporal_knn \ $(top_srcdir)/regress/core/tickets \ $(top_srcdir)/regress/core/twkb \ - $(top_srcdir)/regress/core/typmod \ $(top_srcdir)/regress/core/wkb \ $(top_srcdir)/regress/core/wkt \ $(top_srcdir)/regress/core/wmsservers \ diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk index ac4f8ad..4bad4fc 100644 --- a/regress/loader/tests.mk +++ b/regress/loader/tests.mk @@ -38,7 +38,5 @@ TESTS += \ $(top_srcdir)/regress/loader/Latin1 \ $(top_srcdir)/regress/loader/Latin1-implicit \ $(top_srcdir)/regress/loader/mfile \ - $(top_srcdir)/regress/loader/TestSkipANALYZE \ - 
$(top_srcdir)/regress/loader/TestANALYZE \ $(top_srcdir)/regress/loader/CharNoWidth \ diff --git a/regress/run_test.pl b/regress/run_test.pl index cac4b2e..4c7c82b 100755 --- a/regress/run_test.pl +++ b/regress/run_test.pl @@ -238,7 +238,6 @@ $ENV{"LANG"} = "C"; # Add locale info to the psql options # Add pg12 precision suppression my $PGOPTIONS = $ENV{"PGOPTIONS"}; -$PGOPTIONS .= " -c lc_messages=C"; $PGOPTIONS .= " -c client_min_messages=NOTICE"; $PGOPTIONS .= " -c extra_float_digits=0"; $ENV{"PGOPTIONS"} = $PGOPTIONS; diff --git a/topology/test/tests.mk b/topology/test/tests.mk index cbe2633..2c7c18f 100644 --- a/topology/test/tests.mk +++ b/topology/test/tests.mk @@ -46,9 +46,7 @@ TESTS += \ $(top_srcdir)/topology/test/regress/legacy_query.sql \ $(top_srcdir)/topology/test/regress/legacy_validate.sql \ $(top_srcdir)/topology/test/regress/polygonize.sql \ - $(top_srcdir)/topology/test/regress/populate_topology_layer.sql \ $(top_srcdir)/topology/test/regress/removeunusedprimitives.sql \ - $(top_srcdir)/topology/test/regress/renametopogeometrycolumn.sql \ $(top_srcdir)/topology/test/regress/renametopology.sql \ $(top_srcdir)/topology/test/regress/share_sequences.sql \ $(top_srcdir)/topology/test/regress/sqlmm.sql \ ================================================ FILE: docker-compose/ext-src/postgis-src/raster_outdb_template.sql ================================================ -- -- PostgreSQL database dump -- -- Dumped from database version 17.4 -- Dumped by pg_dump version 17.4 SET statement_timeout = 0; SET lock_timeout = 0; SET idle_in_transaction_session_timeout = 0; SET transaction_timeout = 0; SET client_encoding = 'UTF8'; SET standard_conforming_strings = on; SELECT pg_catalog.set_config('search_path', '', false); SET check_function_bodies = false; SET xmloption = content; SET client_min_messages = warning; -- -- Name: raster_outdb_template; Type: TABLE; Schema: public; Owner: cloud_admin -- CREATE TABLE public.raster_outdb_template ( rid integer, rast 
public.raster ); ALTER TABLE public.raster_outdb_template OWNER TO cloud_admin; -- -- Data for Name: raster_outdb_template; Type: TABLE DATA; Schema: public; Owner: cloud_admin -- COPY public.raster_outdb_template (rid, rast) FROM stdin; 1 0100000300000000000000F03F000000000000F0BF0000000000000000000000000000000000000000000000000000000000000000000000005A0032008400002F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E746966008400012F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E746966008400022F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E74696600 2 0100000300000000000000F03F000000000000F0BF0000000000000000000000000000000000000000000000000000000000000000000000005A0032008400002F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E746966008400012F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E746966008400022F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E74696600 3 
0100000200000000000000F03F000000000000F0BF0000000000000000000000000000000000000000000000000000000000000000000000005A003200440001010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101
01010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101
01010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101
01010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101
01010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101018400012F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E74696600 4 
0100000200000000000000F03F000000000000F0BF0000000000000000000000000000000000000000000000000000000000000000000000005A003200C4FF012F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E7469660044000101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101
01010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101
01010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101
01010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101
01010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101 \. 
-- -- PostgreSQL database dump complete -- ================================================ FILE: docker-compose/ext-src/postgis-src/regular-test.sh ================================================ #!/bin/bash set -ex cd "$(dirname "${0}")" dropdb --if-exist contrib_regression createdb contrib_regression psql -d contrib_regression -c "ALTER DATABASE contrib_regression SET TimeZone='UTC'" \ -c "ALTER DATABASE contrib_regression SET DateStyle='ISO, MDY'" \ -c "CREATE EXTENSION postgis SCHEMA public" \ -c "CREATE EXTENSION postgis_topology" \ -c "CREATE EXTENSION postgis_tiger_geocoder CASCADE" \ -c "CREATE EXTENSION postgis_raster SCHEMA public" \ -c "CREATE EXTENSION postgis_sfcgal SCHEMA public" patch -p1 <"postgis-common-${PG_VERSION}.patch" patch -p1 <"postgis-regular-${PG_VERSION}.patch" psql -d contrib_regression -f raster_outdb_template.sql trap 'patch -R -p1 0 AS result; test_name | result ----------------------------+-------- embedding_for_passage_test | t (1 row) SELECT 'embedding_for_query_test' AS test_name, vector_dims(rag_bge_small_en_v15.embedding_for_query('the cat sat on the mat')) > 0 AS result; test_name | result --------------------------+-------- embedding_for_query_test | t (1 row) ================================================ FILE: docker-compose/ext-src/rag_bge_small_en_v15-src/expected/embedding_functions_enhanced.out ================================================ -- Embedding function tests SELECT 'embedding_for_passage_test_1' AS test_name, vector_dims(rag_bge_small_en_v15.embedding_for_passage('the cat sat on the mat')) > 0 AS result; test_name | result ------------------------------+-------- embedding_for_passage_test_1 | t (1 row) SELECT 'embedding_for_passage_test_2' AS test_name, vector_dims(rag_bge_small_en_v15.embedding_for_passage('Lorem ipsum dolor sit amet')) > 0 AS result; test_name | result ------------------------------+-------- embedding_for_passage_test_2 | t (1 row) SELECT 'embedding_for_passage_test_3' AS test_name, 
vector_dims(rag_bge_small_en_v15.embedding_for_passage('')) > 0 AS result; test_name | result ------------------------------+-------- embedding_for_passage_test_3 | t (1 row) SELECT 'embedding_for_query_test_1' AS test_name, vector_dims(rag_bge_small_en_v15.embedding_for_query('the cat sat on the mat')) > 0 AS result; test_name | result ----------------------------+-------- embedding_for_query_test_1 | t (1 row) SELECT 'embedding_for_query_test_2' AS test_name, vector_dims(rag_bge_small_en_v15.embedding_for_query('Lorem ipsum dolor sit amet')) > 0 AS result; test_name | result ----------------------------+-------- embedding_for_query_test_2 | t (1 row) SELECT 'embedding_for_query_test_3' AS test_name, vector_dims(rag_bge_small_en_v15.embedding_for_query('')) > 0 AS result; test_name | result ----------------------------+-------- embedding_for_query_test_3 | t (1 row) -- Test that passage and query embeddings have the same dimensions SELECT 'embedding_dimensions_match' AS test_name, vector_dims(rag_bge_small_en_v15.embedding_for_passage('test')) = vector_dims(rag_bge_small_en_v15.embedding_for_query('test')) AS result; test_name | result ----------------------------+-------- embedding_dimensions_match | t (1 row) ================================================ FILE: docker-compose/ext-src/rag_bge_small_en_v15-src/sql/basic_functions.sql ================================================ -- Basic function tests SELECT rag_bge_small_en_v15.chunks_by_token_count('the cat sat on the mat', 3, 2); ================================================ FILE: docker-compose/ext-src/rag_bge_small_en_v15-src/sql/basic_functions_enhanced.sql ================================================ -- Basic function tests for chunks_by_token_count SELECT rag_bge_small_en_v15.chunks_by_token_count('the cat sat on the mat', 3, 2); SELECT rag_bge_small_en_v15.chunks_by_token_count('Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.', 5, 2); SELECT (rag_bge_small_en_v15.chunks_by_token_count('the cat', 5, 0))[1]; SELECT rag_bge_small_en_v15.chunks_by_token_count('', 5, 2); SELECT rag_bge_small_en_v15.chunks_by_token_count('a b c d e f g h i j k l m n o p', 3, 1); ================================================ FILE: docker-compose/ext-src/rag_bge_small_en_v15-src/sql/embedding_functions.sql ================================================ -- Embedding function tests SELECT 'embedding_for_passage_test' AS test_name, vector_dims(rag_bge_small_en_v15.embedding_for_passage('the cat sat on the mat')) > 0 AS result; SELECT 'embedding_for_query_test' AS test_name, vector_dims(rag_bge_small_en_v15.embedding_for_query('the cat sat on the mat')) > 0 AS result; ================================================ FILE: docker-compose/ext-src/rag_bge_small_en_v15-src/sql/embedding_functions_enhanced.sql ================================================ -- Embedding function tests SELECT 'embedding_for_passage_test_1' AS test_name, vector_dims(rag_bge_small_en_v15.embedding_for_passage('the cat sat on the mat')) > 0 AS result; SELECT 'embedding_for_passage_test_2' AS test_name, vector_dims(rag_bge_small_en_v15.embedding_for_passage('Lorem ipsum dolor sit amet')) > 0 AS result; SELECT 'embedding_for_passage_test_3' AS test_name, vector_dims(rag_bge_small_en_v15.embedding_for_passage('')) > 0 AS result; SELECT 'embedding_for_query_test_1' AS test_name, vector_dims(rag_bge_small_en_v15.embedding_for_query('the cat sat on the mat')) > 0 AS result; SELECT 'embedding_for_query_test_2' AS test_name, vector_dims(rag_bge_small_en_v15.embedding_for_query('Lorem ipsum dolor sit amet')) > 0 AS result; SELECT 'embedding_for_query_test_3' AS test_name, vector_dims(rag_bge_small_en_v15.embedding_for_query('')) > 0 AS result; -- Test that passage and query embeddings have the same dimensions SELECT 'embedding_dimensions_match' AS test_name, 
vector_dims(rag_bge_small_en_v15.embedding_for_passage('test')) = vector_dims(rag_bge_small_en_v15.embedding_for_query('test')) AS result;
================================================
FILE: docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile
================================================
# Regression-test driver for the rag_jina_reranker_v1_tiny_en extension.
# The only target defined here is `installcheck`, which runs pg_regress
# against an already-installed extension.
EXTENSION = rag_jina_reranker_v1_tiny_en
MODULE_big = rag_jina_reranker_v1_tiny_en
# One object name per Rust source file found under src/.
OBJS = $(patsubst %.rs,%.o,$(wildcard src/*.rs))
# Test names handed to pg_regress (sql/<name>.sql vs expected/<name>.out).
REGRESS = reranking_functions reranking_functions_enhanced

PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
# pg_regress is located relative to the directory holding the PGXS makefile.
PG_REGRESS := $(dir $(PGXS))../../src/test/regress/pg_regress

# `installcheck` produces no file of that name, so it must be phony.
# The colon has to follow .PHONY: the previous spelling
# (`.PHONY installcheck:`) declared two prerequisite-less targets and left
# `installcheck` non-phony, so a stray file called `installcheck` would
# have made the target look up to date and silently skipped the tests.
.PHONY: installcheck
installcheck:
	# Start from a fresh database so earlier runs cannot influence results.
	dropdb --if-exists contrib_regression
	createdb contrib_regression
	../alter_db.sh
	# The extensions are created up front; pg_regress then reuses this
	# database (--use-existing) instead of creating its own.
	psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag_jina_reranker_v1_tiny_en"
	$(PG_REGRESS) --use-existing --dbname=contrib_regression $(REGRESS)
================================================
FILE: docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/expected/reranking_functions.out
================================================
-- Reranking function tests SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', 'the baboon played with the balloon')::NUMERIC,4); round -------- 0.8989 (1 row) SELECT ARRAY(SELECT ROUND(x::NUMERIC,4) FROM unnest(rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', ARRAY['the baboon played with the balloon', 'the tanks fired at the buildings'])) AS x); array ----------------- {0.8989,1.3018} (1 row) SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', 'the baboon played with the balloon')::NUMERIC,4); round --------- -0.8989 (1 row) SELECT ARRAY(SELECT ROUND(x::NUMERIC,4) FROM unnest(rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', ARRAY['the baboon played with the balloon', 'the tanks fired at the buildings'])) as x); array ------------------- {-0.8989,-1.3018} (1 row)
================================================ FILE: docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/expected/reranking_functions_enhanced.out ================================================ -- Reranking function tests - single passage SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', 'the baboon played with the balloon')::NUMERIC,4); round -------- 0.8989 (1 row) SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', 'the tanks fired at the buildings')::NUMERIC,4); round -------- 1.3018 (1 row) SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_distance('query about cats', 'information about felines')::NUMERIC,4); round -------- 1.3133 (1 row) SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_distance('', 'empty query test')::NUMERIC,4); round -------- 0.7076 (1 row) -- Reranking function tests - array of passages SELECT ARRAY(SELECT ROUND(x::NUMERIC,4) FROM unnest(rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', ARRAY['the baboon played with the balloon', 'the tanks fired at the buildings'])) AS x); array ----------------- {0.8989,1.3018} (1 row) SELECT ARRAY(SELECT ROUND(x::NUMERIC,4) FROM unnest(rag_jina_reranker_v1_tiny_en.rerank_distance('query about programming', ARRAY['Python is a programming language', 'Java is also a programming language', 'SQL is used for databases'])) AS x); array ------------------------ {0.1659,0.3348,0.1013} (1 row) SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('empty array test', ARRAY[]::text[]); rerank_distance ----------------- {} (1 row) -- Reranking score function tests - single passage SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', 'the baboon played with the balloon')::NUMERIC,4); round --------- -0.8989 (1 row) SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', 'the tanks fired at the buildings')::NUMERIC,4); round --------- -1.3018 (1 row) SELECT 
ROUND(rag_jina_reranker_v1_tiny_en.rerank_score('query about cats', 'information about felines')::NUMERIC,4); round --------- -1.3133 (1 row) SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_score('', 'empty query test')::NUMERIC,4); round --------- -0.7076 (1 row) -- Reranking score function tests - array of passages SELECT ARRAY(SELECT ROUND(x::NUMERIC,4) FROM unnest(rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', ARRAY['the baboon played with the balloon', 'the tanks fired at the buildings'])) AS x); array ------------------- {-0.8989,-1.3018} (1 row) SELECT ARRAY(SELECT ROUND(x::NUMERIC,4) FROM unnest(rag_jina_reranker_v1_tiny_en.rerank_score('query about programming', ARRAY['Python is a programming language', 'Java is also a programming language', 'SQL is used for databases'])) AS x); array --------------------------- {-0.1659,-0.3348,-0.1013} (1 row) SELECT rag_jina_reranker_v1_tiny_en.rerank_score('empty array test', ARRAY[]::text[]); rerank_score -------------- {} (1 row) ================================================ FILE: docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/sql/reranking_functions.sql ================================================ -- Reranking function tests SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', 'the baboon played with the balloon')::NUMERIC,4); SELECT ARRAY(SELECT ROUND(x::NUMERIC,4) FROM unnest(rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', ARRAY['the baboon played with the balloon', 'the tanks fired at the buildings'])) AS x); SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', 'the baboon played with the balloon')::NUMERIC,4); SELECT ARRAY(SELECT ROUND(x::NUMERIC,4) FROM unnest(rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', ARRAY['the baboon played with the balloon', 'the tanks fired at the buildings'])) as x); ================================================ FILE: 
docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/sql/reranking_functions_enhanced.sql ================================================ -- Reranking function tests - single passage SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', 'the baboon played with the balloon')::NUMERIC,4); SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', 'the tanks fired at the buildings')::NUMERIC,4); SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_distance('query about cats', 'information about felines')::NUMERIC,4); SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_distance('', 'empty query test')::NUMERIC,4); -- Reranking function tests - array of passages SELECT ARRAY(SELECT ROUND(x::NUMERIC,4) FROM unnest(rag_jina_reranker_v1_tiny_en.rerank_distance('the cat sat on the mat', ARRAY['the baboon played with the balloon', 'the tanks fired at the buildings'])) AS x); SELECT ARRAY(SELECT ROUND(x::NUMERIC,4) FROM unnest(rag_jina_reranker_v1_tiny_en.rerank_distance('query about programming', ARRAY['Python is a programming language', 'Java is also a programming language', 'SQL is used for databases'])) AS x); SELECT rag_jina_reranker_v1_tiny_en.rerank_distance('empty array test', ARRAY[]::text[]); -- Reranking score function tests - single passage SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', 'the baboon played with the balloon')::NUMERIC,4); SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', 'the tanks fired at the buildings')::NUMERIC,4); SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_score('query about cats', 'information about felines')::NUMERIC,4); SELECT ROUND(rag_jina_reranker_v1_tiny_en.rerank_score('', 'empty query test')::NUMERIC,4); -- Reranking score function tests - array of passages SELECT ARRAY(SELECT ROUND(x::NUMERIC,4) FROM unnest(rag_jina_reranker_v1_tiny_en.rerank_score('the cat sat on the mat', ARRAY['the baboon played with the 
balloon', 'the tanks fired at the buildings'])) AS x); SELECT ARRAY(SELECT ROUND(x::NUMERIC,4) FROM unnest(rag_jina_reranker_v1_tiny_en.rerank_score('query about programming', ARRAY['Python is a programming language', 'Java is also a programming language', 'SQL is used for databases'])) AS x); SELECT rag_jina_reranker_v1_tiny_en.rerank_score('empty array test', ARRAY[]::text[]); ================================================ FILE: docker-compose/ext-src/rum-src/regular-test.sh ================================================
#!/bin/sh
# Runs the RUM regression tests with pg_regress against an already running
# server, reusing a freshly re-created contrib_regression database
# (--use-existing).
set -ex
# Quote ${0} so that a checkout path containing whitespace still resolves.
cd "$(dirname "${0}")"
# --if-exists is the documented spelling; the abbreviated --if-exist only
# works through getopt_long prefix matching.
dropdb --if-exists contrib_regression
createdb contrib_regression
. ../alter_db.sh
# pg_regress lives next to the PGXS makefiles of the installed server.
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
"${PG_REGRESS}" --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --dbname=contrib_regression \
    rum rum_hash ruminv timestamp orderby orderby_hash altorder altorder_hash limits \
    int2 int4 int8 float4 float8 money oid time timetz date interval \
    macaddr inet cidr text varchar char bytea bit varbit numeric \
    rum_weight expr array
================================================ FILE: docker-compose/ext-src/rum-src/test-upgrade.patch ================================================ diff --git a/expected/rum.out b/expected/rum.out index 5966d19..8860b79 100644 --- a/expected/rum.out +++ b/expected/rum.out @@ -1,4 +1,3 @@ -CREATE EXTENSION rum; CREATE TABLE test_rum( t text, a tsvector ); CREATE TRIGGER tsvectorupdate BEFORE UPDATE OR INSERT ON test_rum diff --git a/sql/rum.sql b/sql/rum.sql index 8414bb9..898e6ab 100644 --- a/sql/rum.sql +++ b/sql/rum.sql @@ -1,5 +1,3 @@ -CREATE EXTENSION rum; - CREATE TABLE test_rum( t text, a tsvector ); CREATE TRIGGER tsvectorupdate ================================================ FILE: docker-compose/ext-src/rum-src/test-upgrade.sh ================================================ #!/bin/sh set -ex cd "$(dirname ${0})" patch -p1 /dev/null; then exit 1 fi echo Running on \${PGHOST} if [[ -f ${extdir}/{}/neon-test.sh ]];
then echo Running from script ${extdir}/{}/neon-test.sh || echo {} >> ${FAILED_FILE}; else echo Running using make; USE_PGXS=1 make -C {} installcheck || echo {} >> ${FAILED_FILE}; fi" ::: ${ORDERED_LIST} [[ ! -f ${FAILED_FILE} ]] && exit 0 else for d in "${LIST[@]}"; do [ -d "${d}" ] || continue if ! psql -w -c "select 1" >/dev/null; then FAILED="${d} ${FAILED}" break fi if [[ ${REGULAR_USER} = true ]] && [ -f "${d}"/regular-test.sh ]; then "${d}/regular-test.sh" || FAILED="${d} ${FAILED}" continue fi if [ -f "${d}/neon-test.sh" ]; then "${d}/neon-test.sh" || FAILED="${d} ${FAILED}" else USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" fi done [[ -z ${FAILED} ]] && exit 0 fi for d in ${FAILED} $([[ ! -f ${FAILED_FILE} ]] || cat ${FAILED_FILE}); do cat "$(find $d -name regression.diffs)" done for postgis_diff in /tmp/pgis_reg/*_diff; do echo "${postgis_diff}:" cat "${postgis_diff}" done echo "${FAILED}" cat ${FAILED_FILE} exit 1 ================================================ FILE: docker-compose/test_extensions_upgrade.sh ================================================
#!/usr/bin/env bash
set -eux -o pipefail
cd "$(dirname "${0}")"

# Takes a variable name as argument. The result is stored in that variable.
# The value is 32 lowercase hex digits built from four $SRANDOM reads
# (SRANDOM is a bash 5.1+ variable; "local -n" makes resvar a nameref).
generate_id() {
    local -n resvar=$1
    printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
}

# Log the tags for debugging. The ":-" default keeps "set -u" from aborting
# right here with "unbound variable", which would make the explicit check
# below (and its readable error message) unreachable for unset variables.
echo "${OLD_COMPUTE_TAG:-}"
echo "${NEW_COMPUTE_TAG:-}"
echo "${TEST_EXTENSIONS_TAG:-}"
if [ -z "${OLD_COMPUTE_TAG:-}" ] || [ -z "${NEW_COMPUTE_TAG:-}" ] || [ -z "${TEST_EXTENSIONS_TAG:-}" ]; then
    echo OLD_COMPUTE_TAG, NEW_COMPUTE_TAG and TEST_EXTENSIONS_TAG must be set
    exit 1
fi
export PG_VERSION=${PG_VERSION:-16}
export PG_TEST_VERSION=${PG_VERSION}

# Waits until the compute node is ready.
# Polls the logs of the compute_is_ready service once per second until they
# contain "accepting connections". Exits the whole script with status 2 when
# that has not happened after about 300 polls. Uses the global counter TIME.
function wait_for_ready {
    TIME=0
    while ! docker compose logs compute_is_ready | grep -q "accepting connections" && [ ${TIME} -le 300 ] ; do
        ((TIME += 1 ))
        sleep 1
    done
    # The loop only leaves with TIME > 300 when the log line never showed up.
    if [ ${TIME} -gt 300 ]; then
        echo Time is out.
        exit 2
    fi
}

# Creates extensions.
# Gets a string with space-separated extensions as a parameter
function create_extensions() {
    # ${1} is deliberately unquoted: word splitting yields one extension per iteration.
    for ext in ${1}; do
        docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext} CASCADE"
    done
}

# Creates a new timeline. Gets the parent ID and an extension name as parameters.
# Saves the timeline ID in the variable EXT_TIMELINE
# Reads the globals tenant_id and PG_VERSION; sets the globals PARAMS and result.
function create_timeline() {
    generate_id new_timeline_id

    PARAMS=(
        # Was "-sbf": curl parses that as "-s -b f" (cookie source "f"), so
        # --fail was never enabled and HTTP errors went unnoticed.
        -sf
        -X POST
        -H "Content-Type: application/json"
        -d "{\"new_timeline_id\": \"${new_timeline_id}\", \"pg_version\": ${PG_VERSION}, \"ancestor_timeline_id\": \"${1}\"}"
        "http://127.0.0.1:9898/v1/tenant/${tenant_id}/timeline/"
    )
    result=$(curl "${PARAMS[@]}")
    # Quote the response so it reaches jq unmodified (no word splitting or globbing).
    echo "${result}" | jq .
    EXT_TIMELINE[${2}]=${new_timeline_id}
}

# Checks if the timeline ID of the compute node is expected. Gets the timeline ID as a parameter
# Exits the whole script with status 1 on a mismatch.
function check_timeline() {
    TID=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id")
    if [ "${TID}" != "${1}" ]; then
        echo Timeline mismatch
        exit 1
    fi
}

# Restarts the compute node with the required compute tag and timeline.
# Accepts the tag for the compute node and the timeline as parameters.
function restart_compute() {
    # Recreate only the compute containers; the storage services keep running.
    docker compose down compute1 compute_is_ready
    COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute1 compute_is_ready
    wait_for_ready
    # Make sure the restarted compute really is attached to the requested timeline.
    check_timeline "${2}"
}

# Maps a branch label ("main", "init" or an extension name) to its timeline ID.
declare -A EXT_TIMELINE
# Extensions under test: SQL extension name and its source directory under /ext-src.
EXTENSIONS='[
{"extname": "plv8", "extdir": "plv8-src"},
{"extname": "vector", "extdir": "pgvector-src"},
{"extname": "unit", "extdir": "postgresql-unit-src"},
{"extname": "hypopg", "extdir": "hypopg-src"},
{"extname": "rum", "extdir": "rum-src"},
{"extname": "ip4r", "extdir": "ip4r-src"},
{"extname": "prefix", "extdir": "prefix-src"},
{"extname": "hll", "extdir": "hll-src"},
{"extname": "pg_cron", "extdir": "pg_cron-src"},
{"extname": "pg_uuidv7", "extdir": "pg_uuidv7-src"},
{"extname": "roaringbitmap", "extdir": "pg_roaringbitmap-src"},
{"extname": "semver", "extdir": "pg_semver-src"},
{"extname": "pg_ivm", "extdir": "pg_ivm-src"},
{"extname": "pgjwt", "extdir": "pgjwt-src"},
{"extname": "pgtap", "extdir": "pgtap-src"},
{"extname": "pg_repack", "extdir": "pg_repack-src"},
{"extname": "h3", "extdir": "h3-pg-src"}
]'
# Space-separated list of extension names. EXTENSIONS is quoted so the JSON
# reaches jq without word splitting or pathname expansion.
EXTNAMES=$(echo "${EXTENSIONS}" | jq -r '.[].extname' | paste -sd ' ' -)

# Step 1: start the NEW compute image and record the extension versions it installs.
COMPUTE_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d
wait_for_ready
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
create_extensions "${EXTNAMES}"
query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')"
new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
docker compose --profile test-extensions down

# Step 2: start over with the OLD compute image.
COMPUTE_TAG=${OLD_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
wait_for_ready
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
docker compose exec \
neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
# Remember the tenant and the timeline the old compute is running on.
tenant_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.tenant_id")
EXT_TIMELINE["main"]=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id")
# Branch "init" off "main" and create every extension there with the old image.
create_timeline "${EXT_TIMELINE["main"]}" init
restart_compute "${OLD_COMPUTE_TAG}" "${EXT_TIMELINE["init"]}"
create_extensions "${EXTNAMES}"
if [ "${FORCE_ALL_UPGRADE_TESTS:-false}" = true ]; then
    exts="${EXTNAMES}"
else
    # Select only the extensions whose version on the old image differs from
    # the version recorded in new_vers.
    query="select pge.extname from pg_extension pge join (select key as extname, value as extversion from json_each_text('${new_vers}')) x on pge.extname=x.extname and pge.extversion <> x.extversion"
    exts=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
fi
if [ -z "${exts}" ]; then
    echo "No extensions were upgraded"
else
    # ${exts} is deliberately unquoted: one iteration per extension name.
    for ext in ${exts}; do
        echo "Testing ${ext}..."
        # Each extension gets its own branch of "main".
        create_timeline "${EXT_TIMELINE["main"]}" "${ext}"
        # Pass the name to jq as data (--arg) instead of splicing it into the filter text.
        EXTDIR=$(echo "${EXTENSIONS}" | jq -r --arg ext "${ext}" '.[] | select(.extname == $ext) | .extdir')
        # Create the extension with the old image, then switch to the new image.
        restart_compute "${OLD_COMPUTE_TAG}" "${EXT_TIMELINE[${ext}]}"
        docker compose exec neon-test-extensions psql -d contrib_regression -c "CREATE EXTENSION ${ext} CASCADE"
        restart_compute "${NEW_COMPUTE_TAG}" "${EXT_TIMELINE[${ext}]}"
        docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}"
        # Run the extension's tests before ALTER EXTENSION UPDATE; show the diffs on failure.
        if ! docker compose exec neon-test-extensions sh -c "/ext-src/${EXTDIR}/test-upgrade.sh"; then
            docker compose exec neon-test-extensions cat "/ext-src/${EXTDIR}/regression.diffs"
            exit 1
        fi
        docker compose exec neon-test-extensions psql -d contrib_regression -c "alter extension ${ext} update"
        docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}"
    done
fi
================================================ FILE: docs/.gitignore ================================================ book ================================================ FILE: docs/SUMMARY.md ================================================ # Summary # Looking for `neon.tech` docs?
This page links to a selection of technical content about the open source code in this repository. Please visit https://neon.tech/docs for documentation about using the Neon service, which is based on the code in this repository. # Architecture [Introduction]() - [Separation of Compute and Storage](./separation-compute-storage.md) - [Compute]() - [Postgres changes](./core_changes.md) - [Pageserver](./pageserver.md) - [Services](./pageserver-services.md) - [Thread management](./pageserver-thread-mgmt.md) - [WAL Redo](./pageserver-walredo.md) - [Page cache](./pageserver-pagecache.md) - [Storage](./pageserver-storage.md) - [Compaction](./pageserver-compaction.md) - [Processing a GetPage request](./pageserver-processing-getpage.md) - [Processing WAL](./pageserver-processing-wal.md) - [WAL Service](walservice.md) - [Consensus protocol](safekeeper-protocol.md) - [Source view](./sourcetree.md) - [docker.md](./docker.md) — Docker images and building pipeline. - [Error handling and logging](./error-handling.md) - [Glossary](./glossary.md) # Uncategorized - [authentication.md](./authentication.md) - [multitenancy.md](./multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI. - [settings.md](./settings.md) #FIXME: move these under sourcetree.md #- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) #- [test_runner/README.md](/test_runner/README.md) # RFCs Major changes are documented in RFCs: - See [RFCs](./rfcs/README.md) for more information - view the RFCs at https://github.com/neondatabase/neon/tree/main/docs/rfcs ================================================ FILE: docs/authentication.md ================================================ ## Authentication ### Overview We use JWT tokens in communication between almost all components (compute, pageserver, safekeeper, CLI) regardless of the protocol used (HTTP/PostgreSQL). storage_broker currently has no authentication. Authentication is optional and is disabled by default for easier debugging.
It is used in some tests, though. Note that we do not cover authentication with `pg.neon.tech` here. For HTTP connections we use the Bearer authentication scheme. For PostgreSQL connections we expect the token to be passed as a password. There is a caveat for `psql`: it silently truncates passwords to 100 symbols, so to correctly pass JWT via `psql` you have to either use `PGPASSWORD` environment variable, or store password in `psql`'s config file. Current token scopes are described in `utils::auth::Scope`. There are no expiration or rotation schemes. _TODO_: some scopes allow both access to server management API and to the data. These probably should be split into multiple scopes. Tokens should not occur in logs. They may sometimes occur in configuration files, although this is discouraged because configs may be parsed and dumped into logs. #### Tokens generation and validation JWT tokens are signed using a private key. Compute/pageserver/safekeeper use the private key's public counterpart to validate JWT tokens. These components should not have access to the private key and may only get tokens from their configuration or external clients. The key pair is generated once for an installation of compute/pageserver/safekeeper, e.g. by `neon_local init`. There is currently no way to rotate the key without bringing down all components. ### Best practices See [RFC 8725: JSON Web Token Best Current Practices](https://www.rfc-editor.org/rfc/rfc8725) ### Token format The JWT tokens in Neon use "EdDSA" as the algorithm (defined in [RFC8037](https://www.rfc-editor.org/rfc/rfc8037)). Example: Header: ``` { "alg": "EdDSA", "typ": "JWT" } ``` Payload: ``` { "scope": "tenant", # "tenant", "pageserverapi", or "safekeeperdata" "tenant_id": "5204921ff44f09de8094a1390a6a50f6", } ``` Meanings of scope: "tenant": Provides access to all data for a specific tenant "pageserverapi": Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs. Should only be used e.g. 
for status check/tenant creation/list. "safekeeperdata": Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs. Should only be used e.g. for status check. Currently also used for connection from any pageserver to any safekeeper. "generations_api": Provides access to the upcall APIs served by the storage controller or the control plane. "admin": Provides access to the control plane and admin APIs of the storage controller. ### CLI CLI generates a key pair during call to `neon_local init` with the following commands: ```bash openssl genpkey -algorithm ed25519 -out auth_private_key.pem openssl pkey -in auth_private_key.pem -pubout -out auth_public_key.pem ``` Configuration files for all components point to `public_key.pem` for JWT validation. However, authentication is disabled by default. There is no way to automatically enable it everywhere, you have to configure each component individually. CLI also generates signed token (full access to Pageserver) and saves it in the CLI's `config` file under `pageserver.auth_token`. Note that pageserver's config does not have any similar parameter. CLI is the only component which accesses that token. Technically it could generate it from the private key on each run, but it does not do that for some reason (_TODO_). ### Compute #### Overview Compute is a per-timeline PostgreSQL instance, so it should not have any access to data of other tenants. All tokens used by a compute are restricted to a specific tenant. There is no auth isolation from other timelines of the same tenant, but a non-rogue client never accesses another timeline even by an accident: timeline IDs are random and hard to guess. #### Incoming connections All incoming connections are from PostgreSQL clients. Their authentication is just plain PostgreSQL authentication and out of scope for this document. There is no administrative API except those provided by PostgreSQL. 
#### Outgoing connections Compute connects to Pageserver for getting pages. The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user@localhost:15028`. If the `$NEON_AUTH_TOKEN` environment variable is set, it is used as the password for the connection. (The pageserver uses JWT tokens for authentication, so the password is really a token.) Compute connects to Safekeepers to write and commit data. The list of safekeeper addresses is given in the `neon.safekeepers` GUC. The connections to the safekeepers take the password from the `$NEON_AUTH_TOKEN` environment variable, if set. The `compute_ctl` binary that runs before the PostgreSQL server, and launches PostgreSQL, also makes a connection to the pageserver. It uses it to fetch the initial "base backup" dump, to initialize the PostgreSQL data directory. It also uses `$NEON_AUTH_TOKEN` as the password for the connection. ### Pageserver #### Overview Pageserver keeps track of multiple tenants, each having multiple timelines. For each timeline, it connects to the corresponding Safekeeper. Information about "corresponding Safekeeper" is published by Safekeepers in the storage_broker, but they do not publish access tokens, otherwise what is the point of authentication. Pageserver keeps a connection to some set of Safekeepers, which may or may not correspond to active Computes. Hence, we cannot obtain a per-timeline access token from a Compute. E.g. if the timeline's Compute terminates before all WAL is consumed by the Pageserver, the Pageserver continues consuming WAL. Pageserver replicas' authentication is the same as the main's. #### Incoming connections Pageserver listens for connections from computes. Each compute should present a token valid for the timeline's tenant. Pageserver also has HTTP API: some parts are per-tenant, some parts are server-wide, these are different scopes. 
Authentication can be enabled separately for the HTTP mgmt API, and for the libpq connections from compute. The `http_auth_type` and `pg_auth_type` configuration variables in Pageserver's config may have one of these values: * `Trust` removes all authentication. * `NeonJWT` enables JWT validation. Tokens are validated using the public key which lies in a PEM file specified in the `auth_validation_public_key_path` config. #### Outgoing connections Pageserver makes a connection to a Safekeeper for each active timeline. As Pageserver may want to access any timeline it has on the disk, it is given a blanket JWT token to access any data on any Safekeeper. This token is passed through an environment variable called `NEON_AUTH_TOKEN` (non-configurable as of writing this text). A better way _may be_ to store JWT token for each timeline next to it, but may be not. ### Safekeeper #### Overview Safekeeper keeps track of multiple tenants, each having multiple timelines. #### Incoming connections Safekeeper accepts connections from Compute/Pageserver, each connection corresponds to a specific timeline and requires a corresponding JWT token. Safekeeper also has HTTP API: some parts are per-tenant, some parts are server-wide, these are different scopes. The `auth-validation-public-key-path` command line options controls the authentication mode: * If the option is missing, there is no authentication or JWT token validation. * If the option is present, it should be a path to the public key PEM file used for JWT token validation. #### Outgoing connections No connections are initiated by a Safekeeper. ### In the source code Tests do not use authentication by default. If you need it, you can enable it by configuring the test's environment: ```python neon_env_builder.auth_enabled = True ``` You will have to generate tokens if you want to access components inside the test directly, use `AuthKeys.generate_*_token` methods for that. 
If you create a new scope, please create a new method to prevent mistypes in scope's name. ================================================ FILE: docs/book.toml ================================================ [book] language = "en" multilingual = false src = "." title = "Neon architecture" ================================================ FILE: docs/consumption_metrics.md ================================================ ### Overview Pageserver and proxy periodically collect consumption metrics and push them to an HTTP endpoint. This doc describes current implementation details. For design details see [the RFC](./rfcs/021-metering.md) and [the discussion on GitHub](https://github.com/neondatabase/neon/pull/2884). - The metrics are collected in a separate thread, and the collection interval and endpoint are configurable. - Metrics are cached, so that we don't send unchanged metrics on every iteration. - Metrics are sent in batches of 1000 (see CHUNK_SIZE const) metrics max with no particular grouping guarantees. Batch format is ```json { "events" : [metric1, metric2, ...] } ``` See metric format examples below. - All metrics values are in bytes, unless otherwise specified. - Currently no retries are implemented. ### Pageserver metrics #### Configuration The endpoint and the collection interval are specified in the pageserver config file (or can be passed as command line arguments): `metric_collection_endpoint` defaults to None, which means that metric collection is disabled by default. `metric_collection_interval` defaults to 10min #### Metrics Currently, the following metrics are collected: - `written_size` Amount of WAL produced by a timeline, i.e. last_record_lsn. This is an absolute, per-timeline metric. - `remote_storage_size` Size of the remote storage (S3) directory. This is an absolute, per-tenant metric. - `timeline_logical_size` Logical size of the data in the timeline. This is an absolute, per-timeline metric.
- `synthetic_storage_size` Size of all tenant's branches including WAL. This is the same metric that `tenant/{tenant_id}/size` endpoint returns. This is an absolute, per-tenant metric. Synthetic storage size is calculated in a separate thread, so it might be slightly outdated. #### Format example ```json { "metric": "remote_storage_size", "type": "absolute", "time": "2022-12-28T11:07:19.317310284Z", "idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019", "value": 12345454, "tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d", "timeline_id": "a03ebb4f5922a1c56ff7485cc8854143", } ``` `idempotency_key` is a unique key for each metric, so that we can deduplicate metrics. It is a combination of the time, node_id and a random number. ### Proxy consumption metrics #### Configuration The endpoint and the collection interval can be passed as command line arguments for proxy: `metric_collection_endpoint` no default, which means that metric collection is disabled by default. `metric_collection_interval` no default #### Metrics Currently, only one proxy metric is collected: - `proxy_io_bytes_per_client` Outbound traffic per client. This is an incremental, per-endpoint metric. #### Format example ```json { "metric": "proxy_io_bytes_per_client", "type": "incremental", "start_time": "2022-12-28T11:07:19.317310284Z", "stop_time": "2022-12-28T11:07:19.317310284Z", "idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019", "value": 12345454, "endpoint_id": "5d07d9ce9237c4cd845ea7918c0afa7d", } ``` The metric is incremental, so the value is the difference between the current and the previous value. If there is no previous value, the value is the current value and the `start_time` equals `stop_time`. ### TODO - [ ] Handle errors better: currently if one tenant fails to gather metrics, the whole iteration fails and metrics are not sent for any tenant. 
- [ ] Add retries - [ ] Tune the interval ================================================ FILE: docs/core_changes.md ================================================ # Postgres core changes This lists all the changes that have been made to the PostgreSQL source tree, as a somewhat logical set of patches. The long-term goal is to eliminate all these changes, by submitting patches to upstream and refactoring code into extensions, so that you can run unmodified PostgreSQL against Neon storage. In Neon, we run PostgreSQL in the compute nodes, but we also run a special WAL redo process in the page server. We currently use the same binary for both, with --wal-redo runtime flag to launch it in the WAL redo mode. Some PostgreSQL changes are needed in the compute node, while others are just for the WAL redo process. In addition to core PostgreSQL changes, there is a Neon extension in the pgxn/neon directory that hooks into the smgr interface, and rmgr extension in pgxn/neon_rmgr. The extensions are loaded into the Postgres processes with shared_preload_libraries. Most of the Neon-specific code is in the extensions, and for any new features, that is preferred over modifying core PostgreSQL code. Below is a list of all the PostgreSQL source code changes, categorized into changes needed for compute, and changes needed for the WAL redo process: # Changes for Compute node ## Prefetching There are changes in many places to perform prefetching, for example for sequential scans. Neon doesn't benefit from OS readahead, and the latency to pageservers is quite high compared to local disk, so prefetching is critical for performance, also for sequential scans. ### How to get rid of the patch Upcoming "streaming read" work in v17 might simplify this. And async I/O work in v18 will hopefully do more. ## Add t_cid to heap WAL records ``` src/backend/access/heap/heapam.c | 26 +- src/include/access/heapam_xlog.h | 6 +- ``` We have added a new t_cid field to heap WAL records. 
This changes the WAL record format, making the Neon WAL format incompatible with vanilla PostgreSQL! ### Problem we're trying to solve The problem is that the XLOG_HEAP_INSERT record does not include the command id of the inserted row. And same with deletion/update. So in the primary, a row is inserted with current xmin + cmin. But in the replica, the cmin is always set to 1. That works in PostgreSQL, because the command id is only relevant to the inserting transaction itself. After commit/abort, no one cares about it anymore. But with Neon, we rely on WAL replay to reconstruct the page, even while the original transaction is still running. ### How to get rid of the patch Bite the bullet and submit the patch to PostgreSQL, to add the t_cid to the WAL records. It makes the WAL records larger, which could make this unpopular in the PostgreSQL community. However, it might simplify some logical decoding code; Andres Freund briefly mentioned in PGCon 2022 discussion on Heikki's Neon presentation that logical decoding currently needs to jump through some hoops to reconstruct the same information. Update from Heikki (2024-04-17): I tried to write an upstream patch for that, to use the t_cid field for logical decoding, but it was not as straightforward as it first sounded. ### Alternatives Perhaps we could write an extra WAL record with the t_cid information, when a page is evicted that contains rows that were touched by a transaction that's still running. However, that seems very complicated. ## Mark index builds that use buffer manager without logging explicitly ``` src/backend/access/gin/gininsert.c | 7 + src/backend/access/gist/gistbuild.c | 15 +- src/backend/access/spgist/spginsert.c | 8 +- also some changes in src/backend/storage/smgr/smgr.c ``` pgvector 0.6.0 also needs a similar change, which would be very nice to get rid of too.
When a GIN index is built, for example, it is built by inserting the entries into the index more or less normally, but without WAL-logging anything. After the index has been built, we iterate through all pages and write them to the WAL. That doesn't work for Neon, because if a page is not WAL-logged and is evicted from the buffer cache, it is lost. We have a check to catch that in the Neon extension. To fix that, we've added a few functions to track explicitly when we're performing such an operation: `smgr_start_unlogged_build`, `smgr_finish_unlogged_build_phase_1` and `smgr_end_unlogged_build`. ### How to get rid of the patch I think it would make sense to be more explicit about that in PostgreSQL too. So extract these changes to a patch and post to pgsql-hackers. Perhaps we could deduce that an unlogged index build has started when we see a page being evicted with zero LSN. How to be sure it's an unlogged index build rather than a bug? Currently we have a check for that and PANIC if we see a page with zero LSN being evicted. And how do we detect when the index build has finished? See https://github.com/neondatabase/neon/pull/7440 for an attempt at that. ## Track last-written page LSN ``` src/backend/commands/dbcommands.c | 17 +- Also one call to SetLastWrittenPageLSN() in spginsert.c, maybe elsewhere too ``` Whenever a page is evicted from the buffer cache, we remember its LSN, so that we can use the same LSN in the GetPage@LSN request when reading the page back from the page server. The value is conservative: it would be correct to always use the last-inserted LSN, but it would be slow because then the page server would need to wait for the recent WAL to be streamed and processed, before responding to any GetPage@LSN request. The last-written page LSN is mostly tracked in the smgrwrite() function, without core code changes, but there are a few exceptions where we've had to add explicit calls to the Neon-specific SetLastWrittenPageLSN() function.
There's an open PR to track the LSN in a more fine-grained fashion: https://github.com/neondatabase/postgres/pull/177 PostgreSQL v15 introduces a new method to do CREATE DATABASE that WAL-logs the database instead of relying on copying files and a checkpoint. With that method, we probably won't need any special handling. The old method is still available, though. ### How to get rid of the patch Wait until v15? ## Allow startup without reading checkpoint record In Neon, the compute node is stateless. So when we are launching a compute node, we need to provide some dummy PG_DATADIR. Relation pages can be requested on demand from the page server. But Postgres still needs some non-relational data: control and configuration files, SLRUs,... It is currently implemented using basebackup (do not mix with pg_basebackup) which is created by pageserver. It includes in this tarball config/control files, SLRUs and required directories. As pageserver does not have the original WAL segments, the basebackup tarball includes an empty WAL segment to bootstrap the WAL writing, but it doesn't contain the checkpoint record. There are some changes in xlog.c, to allow starting the compute node without reading the last checkpoint record from WAL. This includes code to read the `neon.signal` (also `zenith.signal`) file, which tells the startup code the LSN to start at. When the `neon.signal` file is present, the startup uses that LSN instead of the last checkpoint's LSN. The system is known to be consistent at that LSN, without any WAL redo. ### How to get rid of the patch ??? ### Alternatives Include a fake checkpoint record in the tarball. Creating fake WAL is a bit risky, though; I'm afraid it might accidentally get streamed to the safekeepers and overwrite or corrupt the real WAL.
## Disable sequence caching ``` diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 0415df9ccb..9f9db3c8bc 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -53,7 +53,9 @@ * so we pre-log a few fetches in advance. In the event of * crash we can lose (skip over) as many values as we pre-logged. */ -#define SEQ_LOG_VALS 32 +/* Neon XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */ +/* #define SEQ_LOG_VALS 32 */ +#define SEQ_LOG_VALS 0 ``` For performance reasons, Postgres doesn't want to log each fetching of a value from a sequence, so it pre-logs a few fetches in advance. In the event of a crash we can lose (skip over) as many values as we pre-logged. But with Neon, because the page with the sequence value can be evicted from the buffer cache, we can get a gap in sequence values even without a crash. ### How to get rid of the patch Maybe we can just remove it, and accept the gaps. Or add some special handling for sequence relations in the Neon extension, to WAL log the sequence page when it's about to be evicted. It would be weird if the sequence moved backwards though, think of PITR. Or add a GUC for the amount to pre-log to PostgreSQL, and force it to 1 in Neon. ## Make smgr interface available to extensions ``` src/backend/storage/smgr/smgr.c | 203 +++--- src/include/storage/smgr.h | 72 +- ``` ### How to get rid of the patch Submit to upstream. This could be useful for the Disk Encryption patches too, or for compression. We have submitted this to upstream, but it's moving at a glacial speed.
https://commitfest.postgresql.org/47/4428/ ## Added relpersistence argument to smgropen() ``` src/backend/access/heap/heapam_handler.c | 2 +- src/backend/catalog/storage.c | 10 +- src/backend/commands/tablecmds.c | 2 +- src/backend/storage/smgr/md.c | 4 +- src/include/utils/rel.h | 3 +- ``` Neon needs to treat unlogged relations differently from others, so the smgrread(), smgrwrite() etc. implementations need to know the 'relpersistence' of the relation. To get that information where it's needed, we added the 'relpersistence' field to smgropen(). ### How to get rid of the patch Maybe 'relpersistence' would be useful in PostgreSQL for debugging purposes? Or simply for the benefit of extensions like Neon. Should consider this in the patch to make smgr API usable to extensions. ## Alternatives Currently in Neon, unlogged tables live on local disk in the compute node, and are wiped away on compute node restart. One alternative would be to instead WAL-log even unlogged tables, essentially ignoring the UNLOGGED option. Or prohibit UNLOGGED tables completely. But would we still need the relpersistence argument to handle index builds? See item on "Mark index builds that use buffer manager without logging explicitly". ## Use smgr and dbsize_hook for size calculations ``` src/backend/utils/adt/dbsize.c | 61 +- ``` In PostgreSQL, the rel and db-size functions scan the data directory directly. That won't work in Neon. ### How to get rid of the patch Send patch to PostgreSQL, to use smgr API functions for relation size calculation instead. Maybe as part of the general smgr API patch. # WAL redo process changes Pageserver delegates complex WAL decoding duties to Postgres, which means that the latter might fall victim to carefully designed malicious WAL records and start doing harmful things to the system. To prevent this, the redo functions are executed in a separate process that is sandboxed with Linux Secure Computing mode (see seccomp(2) man page). 
As an alternative to having a separate WAL redo process, we could rewrite all redo handlers in Rust. This is infeasible, however: it would take a lot of effort to rewrite them, ensure that you've done the rewrite correctly, and once you've done that, it would be a lot of ongoing maintenance effort to keep the rewritten code in sync over time, across new PostgreSQL versions. That's why we want to leverage PostgreSQL code. Another alternative would be to harden all the PostgreSQL WAL redo functions so that it would be safe to call them directly from Rust code, without needing the security sandbox. That's not feasible for similar reasons as rewriting them in Rust. ## Don't replay change in XLogReadBufferForRedo that are not for the target page we're replaying ``` src/backend/access/gin/ginxlog.c | 19 +- Also some changes in xlog.c and xlogutils.c Example: @@ -415,21 +416,27 @@ ginRedoSplit(XLogReaderState *record) if (!isLeaf) ginRedoClearIncompleteSplit(record, 3); - if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 0, &lbuffer); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of left page"); ``` ### Problem we're trying to solve In PostgreSQL, if a WAL redo function calls XLogReadBufferForRedo() for a page that has a full-page image, it always succeeds. However, Neon WAL redo process is only concerned about replaying changes to a single page, so replaying any changes for other pages is a waste of cycles. We have modified XLogReadBufferForRedo() to return BLK_DONE for all other pages, to avoid the overhead. That is unexpected by code like the above. ### How to get rid of the patch Submit the changes to upstream, hope the community accepts them. There's no harm to PostgreSQL from these changes, although it doesn't have any benefit either.
To make these changes useful to upstream PostgreSQL, we could implement a feature to look ahead the WAL, and detect truncated relations. Even in PostgreSQL, it is a waste of cycles to replay changes to pages that are later truncated away, so we could have XLogReadBufferForRedo() return BLK_DONE or BLK_NOTFOUND for pages that are known to be truncated away later in the WAL stream. ### Alternatives Maybe we could revert this optimization, and restore pages other than the target page too. ## Add predefined_sysidentifier flag to initdb ``` src/backend/bootstrap/bootstrap.c | 13 +- src/bin/initdb/initdb.c | 4 + And some changes in xlog.c ``` This is used to help with restoring a database when you have all the WAL, all the way back to initdb, but no backup. You can reconstruct the missing backup by running initdb again, with the same sysidentifier. ### How to get rid of the patch Ignore it. This is only needed for disaster recovery, so once we've eliminated all other Postgres patches, we can just keep it around as a patch or as separate branch in a repo. ## pg_waldump flags to ignore errors After creating a new project or branch in Neon, the first timeline can begin in the middle of a WAL segment. pg_waldump chokes on that, so we added some flags to make it possible to ignore errors. ### How to get rid of the patch Like previous one, ignore it. ## Backpressure if pageserver doesn't ingest WAL fast enough ``` @@ -3200,6 +3202,7 @@ ProcessInterrupts(void) return; InterruptPending = false; +retry: if (ProcDiePending) { ProcDiePending = false; @@ -3447,6 +3450,13 @@ ProcessInterrupts(void) if (ParallelApplyMessagePending) HandleParallelApplyMessages(); + + /* Call registered callback if any */ + if (ProcessInterruptsCallback) + { + if (ProcessInterruptsCallback()) + goto retry; + } } ``` ### How to get rid of the patch Submit a patch to upstream, for a hook in ProcessInterrupts. Could be useful for other extensions too. 
## SLRU on-demand download ``` src/backend/access/transam/slru.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 92 insertions(+), 13 deletions(-) ``` ### Problem we're trying to solve Previously, SLRU files were included in the basebackup, but the total size of them can be large, several GB, and downloading them all made the startup time too long. ### Alternatives FUSE hook or LD_PRELOAD trick to intercept the reads on SLRU files ## WAL-log an all-zeros page as one large hole - In XLogRecordAssemble() ### Problem we're trying to solve This change was made in v16. Starting with v16, when PostgreSQL extends a relation, it first extends it with zeros, and it can extend the relation more than one block at a time. The all-zeros page is WAL-logged, but it's very wasteful to include 8 kB of zeros in the WAL for that. This hack was made so that we WAL logged a compact record with a whole-page "hole". However, PostgreSQL has assertions that prevent such WAL records from being replayed, so this breaks compatibility such that unmodified PostgreSQL cannot process Neon-generated WAL. ### How to get rid of the patch Find another compact representation for a full-page image of an all-zeros page. A compressed image perhaps. ## Shut down walproposer after checkpointer ``` + /* Neon: Also allow walproposer background worker to be treated like a WAL sender, so that it's shut down last */ + if ((bp->bkend_type == BACKEND_TYPE_NORMAL || bp->bkend_type == BACKEND_TYPE_BGWORKER) && ``` This change was needed so that postmaster shuts down the walproposer process only after the shutdown checkpoint record is written. Otherwise, the shutdown record will never make it to the safekeepers. ### How to get rid of the patch Do a bigger refactoring of the postmaster state machine, such that a background worker can specify the shutdown ordering by itself.
The postmaster state machine has grown pretty complicated, and would benefit from a refactoring for the sake of readability anyway. ## EXPLAIN changes for prefetch and LFC ### How to get rid of the patch Konstantin submitted a patch to -hackers already: https://commitfest.postgresql.org/47/4643/. Get that into a committable state. ## On-demand download of extensions ### How to get rid of the patch FUSE or LD_PRELOAD trickery to intercept reads? ## Publication superuser checks We have hacked CreatePublication so that also neon_superuser can create them. ### How to get rid of the patch Create an upstream patch with more fine-grained privileges for publications CREATE/DROP that can be GRANTed to users. ## WAL log replication slots ### How to get rid of the patch Utilize the upcoming v17 "slot sync worker", or a similar neon-specific background worker process, to periodically WAL-log the slots, or to export them somewhere else. ## WAL-log replication snapshots ### How to get rid of the patch WAL-log them periodically, from a background worker. ## WAL-log relmapper files Similarly to replication snapshot files, the CID mapping files generated during VACUUM FULL of a catalog table are WAL-logged. ### How to get rid of the patch WAL-log them periodically, from a background worker. ## XLogWaitForReplayOf() ?? # Not currently committed but proposed ## Disable ring buffer buffer manager strategies ### Why? Postgres tries to avoid cache flushing by bulk operations (copy, seqscan, vacuum,...). Even if there is free space in the buffer cache, pages may be evicted. The negative effect of this can be partly compensated by the file system cache, but in Neon, the cost of requesting a page from the page server is much higher. ### Alternatives? Instead of just prohibiting the ring buffer, we may try to implement a more flexible eviction policy, for example copying an evicted page from the ring buffer to some other buffer if there is free space in the buffer cache. ## Disable marking page as dirty when hint bits are set.
### Why? Postgres has to modify the page twice: the first time when some tuple is updated and the second time when hint bits are set. WAL-logging hint bit updates requires an FPI, which significantly increases the size of the WAL. ### Alternatives? Add special WAL record for setting page hints. ## Prewarming ### Why? Short downtime (or, in other words, fast compute node restart time) is one of the key features of Neon. But the overhead of a request-response round-trip for loading pages on demand can make warm-up of a newly started node quite slow. We can capture the state of the compute node's buffer cache and send a bulk request for these pages at startup. ================================================ FILE: docs/docker.md ================================================ # Docker images of Neon ## Images Currently we build two main images: - [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). - [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute/compute-node.Dockerfile](/compute/compute-node.Dockerfile). ## Build pipeline We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs 1. `neondatabase/compute-node-v17` (and -v16, -v15, -v14) 2. `neondatabase/neon` ## Docker Compose example You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following containers. - pageserver x 1 - safekeeper x 3 - storage_broker x 1 - compute x 1 - MinIO x 1 # This is Amazon S3 compatible object storage ### How to use 1.
create containers You can specify version of neon cluster using following environment values. - PG_VERSION: postgres version for compute (default is 16 as of this writing) - TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags), which is tagged in [CI test](/.github/workflows/build_and_test.yml). Default is 'latest' ``` $ cd docker-compose/ $ docker-compose down # remove the containers if exists $ PG_VERSION=16 TAG=latest docker-compose up --build -d # You can specify the postgres and image version Creating network "dockercompose_default" with the default driver Creating docker-compose_storage_broker_1 ... done (...omit...) ``` 2. connect compute node ``` $ psql postgresql://cloud_admin:cloud_admin@localhost:55433/postgres psql (16.3) Type "help" for help. postgres=# CREATE TABLE t(key int primary key, value text); CREATE TABLE postgres=# insert into t values(1, 1); INSERT 0 1 postgres=# select * from t; key | value -----+------- 1 | 1 (1 row) ``` 3. If you want to see the log, you can use `docker-compose logs` command. ``` # check the container name you want to see $ docker ps CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 3582f6d76227 docker-compose_compute "/shell/compute.sh" 2 minutes ago Up 2 minutes 0.0.0.0:3080->3080/tcp, :::3080->3080/tcp, 0.0.0.0:55433->55433/tcp, :::55433->55433/tcp docker-compose_compute_1 (...omit...) $ docker logs -f docker-compose_compute_1 2022-10-21 06:15:48.757 GMT [56] LOG: connection authorized: user=cloud_admin database=postgres application_name=psql 2022-10-21 06:17:00.307 GMT [56] LOG: [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400' (...omit...) ``` 4. If you want to see durable data in MinIO which is s3 compatible storage Access http://localhost:9001 and sign in. - Username: `minio` - Password: `password` You can see durable pages and WAL data in `neon` bucket. 
================================================ FILE: docs/error-handling.md ================================================ # Error handling and logging ## Logging errors The principle is that errors are logged when they are handled. If you just propagate an error to the caller in a function, you don't need to log it; the caller will. But if you consume an error in a function, you *must* log it (if it needs to be logged at all). For example: ```rust fn read_motd_file() -> std::io::Result<String> { let mut f = File::open("/etc/motd")?; let mut result = String::new(); f.read_to_string(&mut result)?; Ok(result) } ``` Opening or reading the file could fail, but there is no need to log the error here. The function merely propagates the error to the caller, and it is up to the caller to log the error or propagate it further, if the failure is not expected. But if, for example, it is normal that the "/etc/motd" file doesn't exist, the caller can choose to silently ignore the error, or log it as an INFO or DEBUG level message: ```rust fn get_message_of_the_day() -> String { // Get the motd from /etc/motd, or return the default proverb match read_motd_file() { Ok(motd) => motd, Err(err) => { // It's normal that /etc/motd doesn't exist, but if we fail to // read it for some other reason, that's unexpected. The message // of the day isn't very important though, so we just WARN and // continue with the default in any case. if err.kind() != std::io::ErrorKind::NotFound { tracing::warn!("could not read \"/etc/motd\": {err:?}"); } "An old error is always more popular than a new truth. - German proverb".to_string() } } } ``` ## Error types We use the `anyhow` crate widely. It contains many convenient macros like `bail!` and `ensure!` to construct and return errors, and to propagate many kinds of low-level errors, wrapped in `anyhow::Error`. A downside of `anyhow::Error` is that the caller cannot distinguish between different error cases.
Most errors are propagated all the way to the mgmt API handler function, or the main loop that handles a connection with the compute node, and they are all handled the same way: the error is logged and returned to the client as an HTTP or libpq error. But in some cases, we need to distinguish between errors and handle them differently. For example, attaching a tenant to the pageserver could fail either because the tenant has already been attached, or because we could not load its metadata from cloud storage. The first case is more or less expected. The console sends the Attach request to the pageserver, and the pageserver completes the operation, but the network connection might be lost before the console receives the response. The console will retry the operation in that case, but the tenant has already been attached. It is important that the pageserver responds with the HTTP 403 Already Exists error in that case, rather than a generic HTTP 500 Internal Server Error. If you need to distinguish between different kinds of errors, create a new `Error` type. The `thiserror` crate is useful for that. But in most cases `anyhow::Error` is good enough. ## Panics Depending on where a panic happens, it can cause the whole pageserver or safekeeper to restart, or just a single tenant. In either case, that is pretty bad and causes an outage. Avoid panics. Never use `unwrap()` or other calls that might panic, to verify inputs from the network or from disk. It is acceptable to use functions that might panic, like `unwrap()`, if it is obvious that it cannot panic. For example, if you have just checked that a variable is not None, it is OK to call `unwrap()` on it, but it is still preferable to use `expect("reason")` instead to explain why the function cannot fail. `assert!` and `panic!` are reserved for checking clear invariants and very obvious "can't happen" cases. When in doubt, use anyhow `ensure!` or `bail!` instead.
## Error levels `tracing::Level` doesn't provide very clear guidelines on what the different levels mean, or when to use which level. Here is how we use them: ### Error Examples: - could not open file "foobar" - invalid tenant id Errors are not expected to happen during normal operation. Incorrect inputs from a client can cause ERRORs. For example, if a client tries to call a mgmt API that doesn't exist, or if a compute node passes an LSN that has already been garbage collected away. These should *not* happen during normal operations. "Normal operations" is not a very precise concept. But for example, disk errors are not expected to happen when the system is working, so those count as Errors. However, if a TCP connection to a compute node is lost, that is not considered an Error, because it doesn't affect the pageserver's or safekeeper's operation in any way, and happens fairly frequently when compute nodes are shut down, or are killed abruptly because of errors in the compute. **Errors are monitored, and always need human investigation to determine the cause.** Whether something should be logged at ERROR, WARNING or INFO level can depend on the callers and clients. For example, it might be unexpected and a sign of a serious issue if the console calls the "timeline_detail" mgmt API for a timeline that doesn't exist. ERROR would be appropriate in that case. But if the console routinely calls the API after deleting a timeline, to check if the deletion has completed, then it would be totally normal and an INFO or DEBUG level message would be more appropriate. If a message is logged as an ERROR, but it in fact happens frequently in production and never requires any action, it should probably be demoted to an INFO level message. ### Warn Examples: - could not remove temporary file "foobar.temp" - unrecognized file "foobar" in timeline directory Warnings are similar to Errors, in that they should not happen when the system is operating normally.
The difference between Error and Warning is that an Error means that the operation failed, whereas Warning means that something unexpected happened, but the operation continued anyway. For example, if deleting a file fails because the file already didn't exist, it should be logged as Warning. > **Note:** The python regression tests, under `test_regress`, check the > pageserver log after each test for any ERROR and WARN lines. If there are > any ERRORs or WARNs that have not been explicitly listed in the test as > allowed, the test is marked as failed. This is to catch unexpected errors > e.g. in background operations, that don't cause immediate misbehaviour in > the tested functionality. ### Info Info level is used to log useful information when the system is operating normally. Info level is appropriate e.g. for logging state changes, background operations, and network connections. Examples: - "system is shutting down" - "tenant was created" - "retrying S3 upload" ### Debug & Trace Debug and Trace level messages are not printed to the log in our normal production configuration, but could be enabled for a specific server or tenant, to aid debugging. (Although we don't actually have that capability as of this writing). ## Context We use logging "spans" to hold context information about the current operation. Almost every operation happens on a particular tenant and timeline, so we enter a span with the "tenant_id" and "timeline_id" very early when processing an incoming API request, for example. All background operations should also run in a span containing at least those two fields, and any other parameters or information that might be useful when debugging an error that might happen when performing the operation. TODO: Spans are not captured in the Error when it is created, but when the error is logged. It would be more useful to capture them at Error creation. We should consider using `tracing_error::SpanTrace` to do that.
## Error message style ### PostgreSQL extensions PostgreSQL has a style guide for writing error messages: https://www.postgresql.org/docs/current/error-style-guide.html Follow that guide when writing error messages in the PostgreSQL extensions. ### Neon Rust code #### Anyhow Context When adding anyhow `context()`, use the form `present-tense-verb+action`. Example: - Bad: `file.metadata().context("could not get file metadata")?;` - Good: `file.metadata().context("get file metadata")?;` #### Logging Errors When logging any error `e`, use `could not {e:#}` or `failed to {e:#}`. If `e` is an `anyhow` error and you want to log the backtrace that it contains, use `{e:?}` instead of `{e:#}`. #### Rationale The `{:#}` ("alternate Display") of an `anyhow` error chain is the concatenation of the contexts, using `: `. For example, the following Rust code will result in output ``` ERROR failed to list users: load users from server: parse response: invalid json ``` This is more concise / less noisy than what happens if you do `.context("could not ...")?` at each level, i.e.: ``` ERROR could not list users: could not load users from server: could not parse response: invalid json ``` ```rust fn main() { match list_users().context("list users") { Ok(_) => ..., Err(e) => tracing::error!("failed to {e:#}"), } } fn list_users() { http_get_users().context("load users from server")?; } fn http_get_users() { let response = client....?; response.parse().context("parse response")?; // fails with serde error "invalid json" } ``` ================================================ FILE: docs/glossary.md ================================================ # Glossary ### Authentication ### Backpressure Backpressure is used to limit the lag between pageserver and compute node or WAL service. If compute node or WAL service run far ahead of Page Server, the time of serving page requests increases. This may lead to timeout errors.
To tune backpressure limits use `max_replication_write_lag`, `max_replication_flush_lag` and `max_replication_apply_lag` settings. When lag between current LSN (pg_current_wal_flush_lsn() at compute node) and minimal write/flush/apply position of replica exceeds the limit, backends performing writes are blocked until the replica is caught up. ### Base image (page image) ### Basebackup A tarball with files needed to bootstrap a compute node[] and a corresponding command to create it. NOTE: It has nothing to do with PostgreSQL pg_basebackup. ### Branch We can create a branch at a certain LSN using the `neon_local timeline branch` command. Each Branch lives in a corresponding timeline[] and has an ancestor[]. ### Checkpoint (PostgreSQL) NOTE: This is an overloaded term. A checkpoint record in the WAL marks a point in the WAL sequence at which it is guaranteed that all data files have been updated with all information from shared memory modified before that checkpoint. ### Checkpoint (Layered repository) NOTE: This is an overloaded term. Whenever enough WAL has been accumulated in memory, the page server [] writes out the changes from the in-memory layer into a new delta layer file. This process is called "checkpointing". Configuration parameter `checkpoint_distance` defines the distance from current LSN to perform checkpoint of in-memory layers. Default is `DEFAULT_CHECKPOINT_DISTANCE`. ### Compaction A background operation on layer files. Compaction takes a number of L0 layer files, each of which covers the whole key space and a range of LSN, and reshuffles the data in them into L1 files so that each file covers the whole LSN range, but only part of the key space. Compaction should also opportunistically leave out obsolete page versions from the L1 files, and materialize other page versions for faster access. That hasn't been implemented as of this writing, though. ### Compute node Stateless Postgres node that stores data in pageserver.
### Garbage collection The process of removing old on-disk layers that are not needed by any timeline anymore. ### Fork Each of the separate segmented file sets in which a relation is stored. The main fork is where the actual data resides. There also exist two secondary forks for metadata: the free space map and the visibility map. ### Layer A layer contains data needed to reconstruct any page versions within the layer's Segment and range of LSNs. There are two kinds of layers, in-memory and on-disk layers. In-memory layers are used to ingest incoming WAL, and provide fast access to the recent page versions. On-disk layers are stored as files on disk, and are immutable. See [pageserver-storage.md](./pageserver-storage.md) for more. ### Layer file (on-disk layer) Layered repository on-disk format is based on immutable files. The files are called "layer files". There are two kinds of layer files: image files and delta files. An image file contains a "snapshot" of a range of keys at a particular LSN, and a delta file contains WAL records applicable to a range of keys, in a range of LSNs. ### Layer map The layer map tracks what layers exist in a timeline. ### Layered repository Neon repository implementation that keeps data in layers. ### LSN The Log Sequence Number (LSN) is a unique identifier of the WAL record[] in the WAL log. The insert position is a byte offset into the logs, increasing monotonically with each new record. Internally, an LSN is a 64-bit integer, representing a byte position in the write-ahead log stream. It is printed as two hexadecimal numbers of up to 8 digits each, separated by a slash. Check also [PostgreSQL doc about pg_lsn type](https://www.postgresql.org/docs/devel/datatype-pg-lsn.html) Values can be compared to calculate the volume of WAL data that separates them, so they are used to measure the progress of replication and recovery. In Postgres and Neon LSNs are used to describe certain points in WAL handling. 
PostgreSQL LSNs and functions to monitor them: * `pg_current_wal_insert_lsn()` - Returns the current write-ahead log insert location. * `pg_current_wal_lsn()` - Returns the current write-ahead log write location. * `pg_current_wal_flush_lsn()` - Returns the current write-ahead log flush location. * `pg_last_wal_receive_lsn()` - Returns the last write-ahead log location that has been received and synced to disk by streaming replication. While streaming replication is in progress this will increase monotonically. * `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically. [source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html): Neon safekeeper LSNs. See [safekeeper protocol section](safekeeper-protocol.md) for more information. * `CommitLSN`: position in WAL confirmed by quorum safekeepers. * `RestartLSN`: position in WAL confirmed by all safekeepers. * `FlushLSN`: part of WAL persisted to the disk by safekeeper. * `VCL`: the largest LSN for which we can guarantee availability of all prior records. Neon pageserver LSNs: * `last_record_lsn` - the end of last processed WAL record. * `disk_consistent_lsn` - data is known to be fully flushed and fsync'd to local disk on pageserver up to this LSN. * `remote_consistent_lsn` - The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash. TODO: use this name consistently in remote storage code. Now `disk_consistent_lsn` is used and meaning depends on the context. * `ancestor_lsn` - LSN of the branch point (the LSN at which this branch was created) TODO: add table that describes mapping between PostgreSQL (compute), safekeeper and pageserver LSNs. ### Logical size The pageserver tracks the "logical size" of a timeline. It is the total size of all relations in all Postgres databases on the timeline. 
It includes all user and system tables, including their FSM and VM forks. But it does not include SLRUs, twophase files or any other such data or metadata that lives outside relations. The logical size is calculated by the pageserver, and is sent to PostgreSQL via feedback messages to the safekeepers. PostgreSQL uses the logical size to enforce the size limit in the free tier. The logical size is also shown to users in the web console. The logical size is not affected by branches or the physical layout of layer files in the pageserver. If you have a database with 1 GB logical size and you create a branch of it, both branches will have 1 GB logical size, even though the branch is copy-on-write and won't consume any extra physical disk space until you make changes to it. ### Page (block) The basic structure used to store relation data. All pages are of the same size. This is the unit of data exchange between compute node and pageserver. ### Pageserver Neon storage engine: repositories + wal receiver + page service + wal redo. ### Page service The Page Service listens for GetPage@LSN requests from the Compute Nodes, and responds with pages from the repository. ### PITR (Point-in-time-recovery) PostgreSQL's ability to restore up to a specified LSN. ### Primary node ### Proxy Postgres protocol proxy/router. This service listens on the psql port, can check auth via an external service and create new databases and accounts (control plane API in our case). ### Relation The generic term in PostgreSQL for all objects in a database that have a name and a list of attributes defined in a specific order. ### Replication slot ### Replica node ### Repository Repository stores multiple timelines, forked off from the same initial call to 'initdb' and has associated WAL redo service. One repository corresponds to one Tenant. ### Retention policy How much history do we need to keep around for PITR and read-only nodes? ### Segment A physical file that stores data for a given relation.
File segments are limited in size by a compile-time setting (1 gigabyte by default), so if a relation exceeds that size, it is split into multiple segments. ### SLRU SLRUs include pg_clog, pg_multixact/members, and pg_multixact/offsets. There are other SLRUs in PostgreSQL, but they don't need to be stored permanently (e.g. pg_subtrans), or we do not support them in neon yet (pg_commit_ts). ### Tenant (Multitenancy) Tenant represents a single customer, interacting with Neon. Wal redo[] activity, timelines[], layers[] are managed for each tenant independently. One pageserver[] can serve multiple tenants at once. One safekeeper See `docs/multitenancy.md` for more. ### Timeline Timeline accepts page changes and serves get_page_at_lsn() and get_rel_size() requests. The term "timeline" is used internally in the system, but to users they are exposed as "branches", with human-friendly names. NOTE: this has nothing to do with PostgreSQL WAL timelines. ### XLOG PostgreSQL alias for WAL[]. ### WAL (Write-ahead log) The journal that keeps track of the changes in the database cluster as user- and system-invoked operations take place. It comprises many individual WAL records[] written sequentially to WAL files[]. ### WAL acceptor, WAL proposer In the context of the consensus algorithm, the Postgres compute node is also known as the WAL proposer, and the safekeeper is also known as the acceptor. Those are the standard terms in the Paxos algorithm. ### WAL receiver (WAL decoder) The WAL receiver connects to the external WAL safekeeping service (or directly to the primary) using PostgreSQL physical streaming replication, and continuously receives WAL. It decodes the WAL records, and stores them to the repository. We keep one WAL receiver active per timeline. ### WAL record A low-level description of an individual data change. ### WAL redo A service that runs PostgreSQL in a special wal_redo mode to apply given WAL records over an old page image and return new page image. 
### WAL safekeeper One node that participates in the quorum. All the safekeepers together form the WAL service. ### WAL segment (WAL file) Also known as WAL segment or WAL segment file. Each of the sequentially-numbered files that provide storage space for WAL. The files are all of the same predefined size and are written in sequential order, interspersing changes as they occur in multiple simultaneous sessions. ### WAL service The service as whole that ensures that WAL is stored durably. ### Web console ================================================ FILE: docs/multitenancy.md ================================================ ## Multitenancy ### Overview Neon supports multitenancy. One pageserver can serve multiple tenants at once. Tenants can be managed via neon_local CLI. During page server setup tenant can be created using ```neon_local init --create-tenant``` Also tenants can be added into the system on the fly without pageserver restart. This can be done using the following cli command: ```neon_local tenant create``` Tenants use random identifiers which can be represented as a 32 symbols hexadecimal string. So neon_local tenant create accepts desired tenant id as an optional argument. The concept of timelines/branches is working independently per tenant. ### Tenants in other commands By default during `neon_local init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct argument `--tenant_id=` is provided. So generally tenant_id more frequently appears in internal pageserver interface. Its commands take tenant_id argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants. 
Examples for cli: ```sh neon_local tenant list neon_local tenant create // generates new id neon_local tenant create ee6016ec31116c1b7c33dfdfca38892f neon_local pg create main // default tenant from neon init neon_local pg create main --tenant_id=ee6016ec31116c1b7c33dfdfca38892f neon_local branch --tenant_id=ee6016ec31116c1b7c33dfdfca38892f ``` ### Data layout On the page server tenants introduce one level of indirection, so data directory structured the following way: ``` ├── pageserver.log ├── pageserver.pid ├── pageserver.toml └── tenants ├── 537cffa58a4fa557e49e19951b5a9d6b ├── de182bc61fb11a5a6b390a8aed3a804a └── ee6016ec31116c1b7c33dfdfca38891f ``` Wal redo activity and timelines are managed for each tenant independently. For local environment used for example in tests there also new level of indirection for tenants. It touches `pgdatadirs` directory. Now it contains `tenants` subdirectory so the structure looks the following way: ``` pgdatadirs └── tenants ├── de182bc61fb11a5a6b390a8aed3a804a │ └── main └── ee6016ec31116c1b7c33dfdfca38892f └── main ``` ### Changes to postgres Tenant id is passed to postgres via GUC the same way as the timeline. Tenant id is added to commands issued to pageserver, namely: pagestream, callmemaybe. Tenant id is also exists in ServerInfo structure, this is needed to pass the value to wal receiver to be able to forward it to the pageserver. ### Safety For now particular tenant can only appear on a particular pageserver. Set of safekeepers are also pinned to particular (tenant_id, timeline_id) pair so there can only be one writer for particular (tenant_id, timeline_id). ================================================ FILE: docs/pageserver-compaction.md ================================================ # Pageserver Compaction Lifted from . Updated 2025-03-26. ## Pages and WAL Postgres stores data in 8 KB pages, identified by a page number. 
The WAL contains a sequence of page writes: either images (complete page contents) or deltas (patches applied to images). Each write is identified by its byte position in the WAL, aka LSN. Each page version is thus identified by page@LSN. Postgres may read pages at past LSNs. Pageservers ingest WAL by writing WAL records into a key/value store keyed by page@LSN. Pageservers materialize pages for Postgres reads by finding the most recent page image and applying all subsequent page deltas, up to the read LSN. ## Compaction: Why? Pageservers store page@LSN keys in a key/value store using a custom variant of an LSM tree. Each timeline on each tenant shard has its own LSM tree. When Pageservers write new page@LSN entries, they are appended unordered to an ephemeral layer file. When the ephemeral layer file exceeds `checkpoint_distance` (default 256 MB), the key/value pairs are sorted by key and written out to a layer file (for efficient lookups). As WAL writes continue, more layer files accumulate. Reads must search through the layer files to find the page’s image and deltas. The more layer files accumulate, the more la yer files reads must search through before they find a page image, aka read amplification. Compaction’s job is to: - Reduce read amplification by reorganizing and combining layer files. - Remove old garbage from layer files. As part of this, it may combine several page deltas into a single page image where possible. ## Compaction: How? Neon uses a non-standard variant of an LSM tree made up of two levels of layer files: L0 and L1. Compaction runs in two phases: L0→L1 compaction, and L1 image compaction. L0 contains a stack of L0 layers at decreasing LSN ranges. These have been flushed sequentially from ephemeral layers. Each L0 layer covers the entire page space (page 0 to ~infinity) and the LSN range that was ingested into it. L0 layers are therefore particularly bad for read amp, since every read must search all L0 layers below the read LSN. 
For example: ``` | Page 0-99 @ LSN 0400-04ff | | Page 0-99 @ LSN 0300-03ff | | Page 0-99 @ LSN 0200-02ff | | Page 0-99 @ LSN 0100-01ff | | Page 0-99 @ LSN 0000-00ff | ``` L0→L1 compaction takes the bottom-most chunk of L0 layer files of between `compaction_threshold` (default 10) and `compaction_upper_limit` (default 20) layers. It uses merge-sort to write out sorted L1 delta layers of size `compaction_target_size` (default 128 MB). L1 typically consists of a “bed” of image layers with materialized page images at a specific LSN, and then delta layers of various page/LSN ranges above them with page deltas. For example: ``` Delta layers: | 30-84@0310-04ff | Delta layers: | 10-42@0200-02ff | | 65-92@0174-02aa | Image layers: | 0-39@0100 | 40-79@0100 | 80-99@0100 | ``` L1 image compaction scans across the L1 keyspace at some LSN, materializes page images by reading the image and delta layers below the LSN (via vectored reads), and writes out new sorted image layers of roughly size `compaction_target_size` (default 128 MB) at that LSN. Layer files below the new image files’ LSN can be garbage collected when they are no longer needed for PITR. Even though the old layer files are not immediately garbage collected, the new image layers help with read amp because reads can stop traversing the layer stack as soon as they encounter a page image. ## Compaction: When? Pageservers run a `compaction_loop` background task for each tenant shard. Every `compaction_period` (default 20 seconds) it will wake up and check if any of the shard’s timelines need compaction. Additionally, L0 layer flushes will eagerly wake the compaction loop if the L0 count exceeds `compaction_threshold` (default 10). L0 compaction runs if the number of L0 layers exceeds `compaction_threshold` (default 10). L1 image compaction runs across sections of the L1 keyspace that have at least `image_creation_threshold` (default 3) delta layers overlapping image layers. 
At most `CONCURRENT_BACKGROUND_TASKS` (default 3 / 4 * CPUs = 6) background tasks can run concurrently on a Pageserver, including compaction. Further compaction tasks must wait. Because L0 layers cause the most read amp (they overlap the entire keyspace and only contain page deltas), they are aggressively compacted down: - L0 is compacted down across all tenant timelines before L1 compaction is attempted (`compaction_l0_first`). - L0 compaction uses a separate concurrency limit of `CONCURRENT_L0_COMPACTION_TASKS` (default 3 / 4 * CPUs = 6) to avoid waiting for other tasks (`compaction_l0_semaphore`). - If L0 compaction is needed on any tenant timeline, L1 image compaction will yield to start an immediate L0 compaction run (except for compaction run via admin APIs). ## Backpressure With sustained heavy write loads, new L0 layers may be flushed faster than they can be compacted down. This can cause an unbounded buildup of read amplification and compaction debt, which can take hours to resolve even after the writes stop. To avoid this and allow compaction to keep up, layer flushes will slow writes down to apply backpressure on the workload: - At `l0_flush_delay_threshold` (default 30) L0 layers, layer flushes are delayed by the flush duration, such that they take 2x as long. - At `l0_flush_stall_threshold` (default disabled) L0 layers, layer flushes stall entirely until the L0 count falls back below the threshold. This is currently disabled because we don’t trust L0 compaction to be responsive enough. This backpressure is propagated to the compute by waiting for layer flushes when WAL ingestion rolls the ephemeral layer. 
The compute will significantly slow down WAL writes at: - `max_replication_write_lag` (default 500 MB), when Pageserver WAL ingestion lags - `max_replication_flush_lag` (default 10 GB), when Pageserver L0 flushes lag Combined, this means that the compute will backpressure when there are 30 L0 layers (30 * 256 MB = 7.7 GB) and the Pageserver WAL ingestion lags the compute by 500 MB, for a total of ~8 GB L0+ephemeral compaction debt on a single shard. Since we only delay L0 flushes by 2x when backpressuring, and haven’t enabled stalls, it is still possible for read amp to increase unbounded if compaction is too slow (although we haven’t seen this in practice). But this is considered better than stalling flushes and causing unavailability for as long as it takes L0 compaction to react, since we don’t trust it to be fast enough — at the expense of continually increasing read latency and CPU usage for this tenant. We should either enable stalls when we have enough confidence in L0 compaction, or scale the flush delay by the number of L0 layers to apply increasing backpressure. ## Circuit Breaker Compaction can fail, often repeatedly. This can happen e.g. due to data corruption, faulty hardware, S3 outages, etc. If compaction fails, the compaction loop will naïvely try and fail again almost immediately. It may only fail after doing a significant amount of wasted work, while holding onto the background task semaphore. To avoid repeatedly doing wasted work and starving out other compaction jobs, each tenant has a compaction circuit breaker. After 5 repeated compaction failures, the circuit breaker trips and disables compaction for the next 24 hours, before resetting the breaker and trying again. This disables compaction across all tenant timelines (faulty or not). Disabling compaction for a long time is dangerous, since it can lead to unbounded read amp and compaction debt, and continuous workload backpressure. However, continually failing would not help either. 
Tripped circuit breakers trigger an alert and must be investigated promptly. ================================================ FILE: docs/pageserver-page-service.md ================================================ # Page Service The Page Service listens for GetPage@LSN requests from the Compute Nodes, and responds with pages from the repository. On each GetPage@LSN request, it calls into the Repository function A separate thread is spawned for each incoming connection to the page service. The page service uses the libpq protocol to communicate with the client. The client is a Compute Postgres instance. ================================================ FILE: docs/pageserver-pagecache.md ================================================ # Page cache TODO: - shared across tenants - store pages from layer files - store pages from "in-memory layer" ================================================ FILE: docs/pageserver-processing-getpage.md ================================================ # Processing a GetPage request TODO: - sequence diagram that shows how a GetPage@LSN request is processed ================================================ FILE: docs/pageserver-processing-wal.md ================================================ # Processing WAL TODO: - diagram that shows how incoming WAL is processed - explain durability, what is fsync'd when, disk_consistent_lsn ================================================ FILE: docs/pageserver-services.md ================================================ # Services The Page Server consists of multiple threads that operate on a shared repository of page versions: ``` | WAL V +--------------+ | | | WAL receiver | | | +--------------+ ...... +---------+ +--------+ . . | | | | . . GetPage@LSN | | | backup | -------> . S3 . -------------> | Page | repository | | . . | Service | +--------+ . . page | | ...... 
<------------- | | +---------+ +-----------+ +--------------------+ | WAL redo | | Checkpointing, | +----------+ | processes | | Garbage collection | | | +-----------+ +--------------------+ | HTTP | | mgmt API | | | +----------+ Legend: +--+ | | A thread or multi-threaded service +--+ ---> Data flow <--- ``` ## Page Service The Page Service listens for GetPage@LSN requests from the Compute Nodes, and responds with pages from the repository. On each GetPage@LSN request, it calls into the Repository function A separate thread is spawned for each incoming connection to the page service. The page service uses the libpq protocol to communicate with the client. The client is a Compute Postgres instance. ## WAL Receiver The WAL receiver connects to the external WAL safekeeping service using PostgreSQL physical streaming replication, and continuously receives WAL. It decodes the WAL records, and stores them to the repository. ## Backup service The backup service, responsible for storing pageserver recovery data externally. Currently, pageserver stores its files in a filesystem directory it's pointed to. That working directory could be rather ephemeral for such cases as "a pageserver pod running in k8s with no persistent volumes attached". Therefore, the server interacts with external, more reliable storage to back up and restore its state. The code for storage support is extensible and can support arbitrary ones as long as they implement a certain Rust trait. There are the following implementations present: * local filesystem — to use in tests mainly * AWS S3 - to use in production The backup service is disabled by default and can be enabled to interact with a single remote storage. 
CLI examples: * Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"` * AWS S3 : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"` For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS. For local S3 installations, refer to their documentation for name format and credentials. Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets. Required sections are: ```toml [remote_storage] local_path = '/Users/someonetoignore/Downloads/tmp_dir/' ``` or ```toml [remote_storage] bucket_name = 'some-sample-bucket' bucket_region = 'eu-north-1' prefix_in_bucket = '/test_prefix/' ``` `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed. or ```toml [remote_storage] container_name = 'some-container-name' storage_account = 'somestorageaccnt' container_region = 'us-east' prefix_in_container = '/test-prefix/' ``` The `AZURE_STORAGE_ACCESS_KEY` env variable can be used to specify the azure credentials if needed. ## Repository background tasks The Repository also has a few different background threads and tokio tasks that perform background duties like dumping accumulated WAL data from memory to disk, reorganizing files for performance (compaction), and garbage collecting old files. Repository ---------- The repository stores all the page versions, or WAL records needed to reconstruct them. Each tenant has a separate Repository, which is stored in the .neon/tenants/ directory. Repository is an abstract trait, defined in `repository.rs`. 
It is implemented by the LayeredRepository object in `layered_repository.rs`. There is only that one implementation of the Repository trait, but it's still a useful abstraction that keeps the interface for the low-level storage functionality clean. The layered storage format is described in [pageserver-storage.md](./pageserver-storage.md). Each repository consists of multiple Timelines. Timeline is a workhorse that accepts page changes from the WAL, and serves get_page_at_lsn() and get_rel_size() requests. Note: this has nothing to do with PostgreSQL WAL timeline. The term "timeline" is mostly interchangeable with "branch", there is a one-to-one mapping from branch to timeline. A timeline has a unique ID within the tenant, represented as 16-byte hex string that never changes, whereas a branch is a user-given name for a timeline. Each repository also has a WAL redo manager associated with it, see `walredo.rs`. The WAL redo manager is used to replay PostgreSQL WAL records, whenever we need to reconstruct a page version from WAL to satisfy a GetPage@LSN request, or to avoid accumulating too much WAL for a page. The WAL redo manager uses a Postgres process running in special Neon wal-redo mode to do the actual WAL redo, and communicates with the process using a pipe. Checkpointing / Garbage Collection ---------------------------------- Periodically, the checkpointer thread wakes up and performs housekeeping duties on the repository. It has two duties: ### Checkpointing Flush WAL that has accumulated in memory to disk, so that the old WAL can be truncated away in the WAL safekeepers. Also, to free up memory for receiving new WAL. This process is called "checkpointing". It's similar to checkpointing in PostgreSQL or other DBMSs, but in the page server, checkpointing happens on a per-segment basis. 
### Garbage collection Remove old on-disk layer files that are no longer needed according to the PITR retention policy TODO: Sharding -------------------- We should be able to run multiple Page Servers that handle sharded data. ================================================ FILE: docs/pageserver-storage.md ================================================ # Pageserver storage The main responsibility of the Page Server is to process the incoming WAL, and reprocess it into a format that allows reasonably quick access to any page version. The page server slices the incoming WAL per relation and page, and packages the sliced WAL into suitably-sized "layer files". The layer files contain all the history of the database, back to some reasonable retention period. This system replaces the base backups and the WAL archive used in a traditional PostgreSQL installation. The layer files are immutable, they are not modified in-place after creation. New layer files are created for new incoming WAL, and old layer files are removed when they are no longer needed. The on-disk format is based on immutable files. The page server receives a stream of incoming WAL, parses the WAL records to determine which pages they apply to, and accumulates the incoming changes in memory. Whenever enough WAL has been accumulated in memory, it is written out to a new immutable file. That process accumulates "L0 delta files" on disk. When enough L0 files have been accumulated, they are merged and re-partitioned into L1 files, and old files that are no longer needed are removed by Garbage Collection (GC). The incoming WAL contains updates to arbitrary pages in the system. The distribution depends on the workload: the updates could be totally random, or there could be a long stream of updates to a single relation when data is bulk loaded, for example, or something in between. 
``` Cloud Storage Page Server Safekeeper L1 L0 Memory WAL +----+ +----+----+ |AAAA| |AAAA|AAAA| +---+-----+ | +----+ +----+----+ | | | |AA |BBBB| |BBBB|BBBB| |BB | AA | |BB +----+----+ +----+----+ |C | BB | |CC |CCCC|CCCC| <---- |CCCC|CCCC| <--- |D | CC | <--- |DDD <---- ADEBAABED +----+----+ +----+----+ | | DDD | |E |DDDD|DDDD| |DDDD|DDDD| |E | | | +----+----+ +----+----+ | | | |EEEE| |EEEE|EEEE| +---+-----+ +----+ +----+----+ ``` In this illustration, WAL is received as a stream from the Safekeeper, from the right. It is immediately captured by the page server and stored quickly in memory. The page server memory can be thought of as a quick "reorder buffer", used to hold the incoming WAL and reorder it so that we keep the WAL records for the same page and relation close to each other. From the page server memory, whenever enough WAL has been accumulated, it is flushed to disk into a new L0 layer file, and the memory is released. When enough L0 files have been accumulated, they are merged together and sliced per key-space, producing a new set of files where each file contains a more narrow key range, but larger LSN range. From the local disk, the layers are further copied to Cloud Storage, for long-term archival. After a layer has been copied to Cloud Storage, it can be removed from local disk, although we currently keep everything locally for fast access. If a layer is needed that isn't found locally, it is fetched from Cloud Storage and stored in local disk. L0 and L1 files are both uploaded to Cloud Storage. # Layer map The LayerMap tracks what layers exist in a timeline. Currently, the layer map is just a resizable array (Vec). On a GetPage@LSN or other read request, the layer map scans through the array to find the right layer that contains the data for the requested page. The read-code in LayeredTimeline is aware of the ancestor, and returns data from the ancestor timeline if it's not found on the current timeline. 
# Different kinds of layers A layer can be in different states: - Open - a layer where new WAL records can be appended to. - Closed - a layer that is read-only, no new WAL records can be appended to it - Historic: synonym for closed - InMemory: A layer that needs to be rebuilt from WAL on pageserver start. To avoid OOM errors, InMemory layers can be spilled to disk into ephemeral file. - OnDisk: A layer that is stored on disk. If its end-LSN is older than disk_consistent_lsn, it is known to be fully flushed and fsync'd to local disk. - Frozen layer: an in-memory layer that is Closed. TODO: Clarify the difference between Closed, Historic and Frozen. There are two kinds of OnDisk layers: - ImageLayer represents a snapshot of all the keys in a particular range, at one particular LSN. Any keys that are not present in the ImageLayer are known not to exist at that LSN. - DeltaLayer represents a collection of WAL records or page images in a range of LSNs, for a range of keys. # Layer life cycle LSN range defined by start_lsn and end_lsn: - start_lsn is inclusive. - end_lsn is exclusive. For an open in-memory layer, the end_lsn is MAX_LSN. For a frozen in-memory layer or a delta layer, it is a valid end bound. An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1 Every layer starts its life as an Open In-Memory layer. When the page server receives the first WAL record for a timeline, it creates a new In-Memory layer for it, and puts it to the layer map. Later, when the layer becomes full, its contents are written to disk, as an on-disk layers. Flushing a layer is a two-step process: First, the layer is marked as closed, so that it no longer accepts new WAL records, and a new in-memory layer is created to hold any WAL after that point. After this first step, the layer is a Closed InMemory state. This first step is called "freezing" the layer. 
In the second step, a new Delta layer is created, containing all the data from the Frozen InMemory layer.
The "level" of a file is not explicitly stored anywhere, you can only distinguish them by looking at the key range that a file covers. The read-path doesn't need to treat L0 and L1 files any differently. ## Notation used in this document FIXME: This is somewhat obsolete, the layer files cover a key-range rather than a particular relation nowadays. However, the description on how you find a page version, and how branching and GC works is still valid. The full path of a delta file looks like this: ``` .neon/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000 ``` For simplicity, the examples below use a simplified notation for the paths. The tenant ID is left out, the timeline ID is replaced with the human-readable branch name, and spcnode+dbnode+relnode+forkum+segno with a human-readable table name. The LSNs are also shorter. For example, a base image file at LSN 100 and a delta file between 100-200 for 'orders' table on 'main' branch is represented like this: ``` main/orders_100 main/orders_100_200 ``` # Creating layer files Let's start with a simple example with a system that contains one branch called 'main' and two tables, 'orders' and 'customers'. The end of WAL is currently at LSN 250. In this starting situation, you would have these files on disk: ``` main/orders_100 main/orders_100_200 main/orders_200 main/customers_100 main/customers_100_200 main/customers_200 ``` In addition to those files, the recent changes between LSN 200 and the end of WAL at 250 are kept in memory. If the page server crashes, the latest records between 200-250 need to be re-read from the WAL. Whenever enough WAL has been accumulated in memory, the page server writes out the changes in memory into new layer files. This process is called "checkpointing" (not to be confused with the PostgreSQL checkpoints, that's a different thing). 
The page server only creates layer files for relations that have been modified since the last checkpoint. For example, if the current end of WAL is at LSN 450, and the last checkpoint happened at LSN 400 but there hasn't been any recent changes to 'customers' table, you would have these files on disk: main/orders_100 main/orders_100_200 main/orders_200 main/orders_200_300 main/orders_300 main/orders_300_400 main/orders_400 main/customers_100 main/customers_100_200 main/customers_200 If the customers table is modified later, a new file is created for it at the next checkpoint. The new file will cover the "gap" from the last layer file, so the LSN ranges are always contiguous: ``` main/orders_100 main/orders_100_200 main/orders_200 main/orders_200_300 main/orders_300 main/orders_300_400 main/orders_400 main/customers_100 main/customers_100_200 main/customers_200 main/customers_200_500 main/customers_500 ``` ## Reading page versions Whenever a GetPage@LSN request comes in from the compute node, the page server needs to reconstruct the requested page, as it was at the requested LSN. To do that, the page server first checks the recent in-memory layer; if the requested page version is found there, it can be returned immediately without looking at the files on disk. Otherwise the page server needs to locate the layer file that contains the requested page version. For example, if a request comes in for table 'orders' at LSN 250, the page server would load the 'main/orders_200_300' file into memory, and reconstruct and return the requested page from it, as it was at LSN 250. Because the layer file consists of a full image of the relation at the start LSN and the WAL, reconstructing the page involves replaying any WAL records applicable to the page between LSNs 200-250, starting from the base image at LSN 200. 
# Multiple branches Imagine that a child branch is created at LSN 250: ``` @250 ----main--+--------------------------> \ +---child--------------> ``` Then, the 'orders' table is updated differently on the 'main' and 'child' branches. You now have this situation on disk: ``` main/orders_100 main/orders_100_200 main/orders_200 main/orders_200_300 main/orders_300 main/orders_300_400 main/orders_400 main/customers_100 main/customers_100_200 main/customers_200 child/orders_250_300 child/orders_300 child/orders_300_400 child/orders_400 ``` Because the 'customers' table hasn't been modified on the child branch, there is no file for it there. If you request a page for it on the 'child' branch, the page server will not find any layer file for it in the 'child' directory, so it will recurse to look into the parent 'main' branch instead. From the 'child' branch's point of view, the history for each relation is linear, and the request's LSN identifies unambiguously which file you need to look at. For example, the history for the 'orders' table on the 'main' branch consists of these files: ``` main/orders_100 main/orders_100_200 main/orders_200 main/orders_200_300 main/orders_300 main/orders_300_400 main/orders_400 ``` And from the 'child' branch's point of view, it consists of these files: ``` main/orders_100 main/orders_100_200 main/orders_200 main/orders_200_300 child/orders_250_300 child/orders_300 child/orders_300_400 child/orders_400 ``` The branch metadata includes the point where the child branch was created, LSN 250. If a page request comes with LSN 275, we read the page version from the 'child/orders_250_300' file. We might also need to reconstruct the page version as it was at LSN 250, in order to replay the WAL up to LSN 275, using 'main/orders_200_300' and 'main/orders_200'. The page versions between 250-300 in the 'main/orders_200_300' file are ignored when operating on the child branch. 
Note: It doesn't make any difference if the child branch is created when the end of the main branch was at LSN 250, or later when the tip of the main branch had already moved on. The latter case, creating a branch at a historic LSN, is how we support PITR in Neon. # Garbage collection In this scheme, we keep creating new layer files over time. We also need a mechanism to remove old files that are no longer needed, because disk space isn't infinite. What files are still needed? Currently, the page server supports PITR and branching from any branch at any LSN that is "recent enough" from the tip of the branch. "Recent enough" is defined as an LSN horizon, which by default is 64 MB. (See DEFAULT_GC_HORIZON). For this example, let's assume that the LSN horizon is 150 units. Let's look at the single branch scenario again. Imagine that the end of the branch is LSN 525, so that the GC horizon is currently at 525-150 = 375 ``` main/orders_100 main/orders_100_200 main/orders_200 main/orders_200_300 main/orders_300 main/orders_300_400 main/orders_400 main/orders_400_500 main/orders_500 main/customers_100 main/customers_100_200 main/customers_200 ``` We can remove the following files because the end LSNs of those files are older than GC horizon 375, and there are more recent layer files for the table: ``` main/orders_100 DELETE main/orders_100_200 DELETE main/orders_200 DELETE main/orders_200_300 DELETE main/orders_300 STILL NEEDED BY orders_300_400 main/orders_300_400 KEEP, NEWER THAN GC HORIZON main/orders_400 .. main/orders_400_500 .. main/orders_500 .. main/customers_100 DELETE main/customers_100_200 DELETE main/customers_200 KEEP, NO NEWER VERSION ``` 'main/customers_200' is old enough, but it cannot be removed because there is no newer layer file for the table. Things get slightly more complicated with multiple branches. All of the above still holds, but in addition to recent files we must also retain older snapshot files that are still needed by child branches. 
For example, if child branch is created at LSN 150, and the 'customers' table is updated on the branch, you would have these files: ``` main/orders_100 KEEP, NEEDED BY child BRANCH main/orders_100_200 KEEP, NEEDED BY child BRANCH main/orders_200 DELETE main/orders_200_300 DELETE main/orders_300 KEEP, NEWER THAN GC HORIZON main/orders_300_400 KEEP, NEWER THAN GC HORIZON main/orders_400 KEEP, NEWER THAN GC HORIZON main/orders_400_500 KEEP, NEWER THAN GC HORIZON main/orders_500 KEEP, NEWER THAN GC HORIZON main/customers_100 DELETE main/customers_100_200 DELETE main/customers_200 KEEP, NO NEWER VERSION child/customers_150_300 DELETE child/customers_300 KEEP, NO NEWER VERSION ``` In this situation, 'main/orders_100' and 'main/orders_100_200' cannot be removed, even though they are older than the GC horizon, because they are still needed by the child branch. 'main/orders_200' and 'main/orders_200_300' can still be removed. If 'orders' is modified later on the 'child' branch, we will create a new base image and delta file for it on the child: ``` main/orders_100 main/orders_100_200 main/orders_300 main/orders_300_400 main/orders_400 main/orders_400_500 main/orders_500 main/customers_200 child/customers_300 child/orders_150_400 child/orders_400 ``` After this, the 'main/orders_100' and 'main/orders_100_200' file could be removed. It is no longer needed by the child branch, because there is a newer layer file there. TODO: This optimization hasn't been implemented! The GC algorithm will currently keep the file on the 'main' branch anyway, for as long as the child branch exists. TODO: Describe GC and checkpoint interval settings. # TODO: On LSN ranges In principle, each relation can be checkpointed separately, i.e. the LSN ranges of the files don't need to line up. 
So this would be legal: ``` main/orders_100 main/orders_100_200 main/orders_200 main/orders_200_300 main/orders_300 main/orders_300_400 main/orders_400 main/customers_150 main/customers_150_250 main/customers_250 main/customers_250_500 main/customers_500 ``` However, the code currently always checkpoints all relations together. So that situation doesn't arise in practice. It would also be OK to have overlapping LSN ranges for the same relation: ``` main/orders_100 main/orders_100_200 main/orders_200 main/orders_200_300 main/orders_300 main/orders_250_350 main/orders_350 main/orders_300_400 main/orders_400 ``` The code that reads the layer files should cope with this, but this situation doesn't arise either, because the checkpointing code never does that. It could be useful, however, as a transient state when garbage collecting around branch points, or explicit recovery points. For example, if we start with this: ``` main/orders_100 main/orders_100_200 main/orders_200 main/orders_200_300 main/orders_300 ``` And there is a branch or explicit recovery point at LSN 150, we could replace 'main/orders_100_200' with 'main/orders_150' to keep a layer only at that exact point that's still needed, removing the other page versions around it. But such compaction has not been implemented yet. ================================================ FILE: docs/pageserver-tenant-migration.md ================================================ ## Pageserver tenant migration ### Overview This feature allows migrating a timeline from one pageserver to another by utilizing remote storage capability. ### Migration process Pageserver implements two new http handlers: timeline attach and timeline detach. Timeline migration is performed in the following way: 1. Timeline attach is called on a target pageserver. This asks the pageserver to download the latest checkpoint uploaded to s3. 2.
For now it is necessary to manually initialize replication stream via callmemaybe call so target pageserver initializes replication from safekeeper (it is desired to avoid this and initialize replication directly in attach handler, but this requires some refactoring (probably [#997](https://github.com/neondatabase/neon/issues/997)/[#1049](https://github.com/neondatabase/neon/issues/1049))). 3. Replication state can be tracked via timeline detail pageserver call. 4. Compute node should be restarted with new pageserver connection string. Issue with multiple compute nodes for one timeline is handled on the safekeeper consensus level. So this is not a problem here. Currently responsibility for rescheduling the compute with updated config lies with the external coordinator (console). 5. Timeline is detached from old pageserver. On-disk data is removed. ### Implementation details Now safekeeper needs to track which pageserver it is replicating to. This introduces complications into replication code: * We need to distinguish different pageservers (now this is done by connection string which is imperfect and is covered here: https://github.com/neondatabase/neon/issues/1105). Callmemaybe subscription management also needs to track that (this is already implemented). * We need to track which pageserver is the primary. This is needed to avoid reconnections to non-primary pageservers, because we shouldn't reconnect to them when they decide to stop their walreceiver. I.e., this can happen when there is a load on the compute and we are trying to detach timeline from old pageserver.
In this case callmemaybe will try to reconnect to it because replication termination condition is not met (page server with active compute could never catch up to the latest lsn, so there is always some wal tail) ================================================ FILE: docs/pageserver-thread-mgmt.md ================================================ ## Thread management The pageserver uses Tokio for handling concurrency. Everything runs in Tokio tasks, although some parts are written in blocking style and use spawn_blocking(). We currently use std blocking functions for disk I/O, however. The current model is that we consider disk I/Os to be short enough that we perform them while running in a Tokio task. Changing all the disk I/O calls to async is a TODO. Each Tokio task is tracked by the `task_mgr` module. It maintains a registry of tasks, and which tenant or timeline they are operating on. ### Handling shutdown When a tenant or timeline is deleted, we need to shut down all tasks operating on it, before deleting the data on disk. There's a function, `shutdown_tasks`, to request all tasks of a particular tenant or timeline to shutdown. It will also wait for them to finish. A task registered in the task registry can check if it has been requested to shut down, by calling `is_shutdown_requested()`. There's also a `shutdown_watcher()` Future that can be used with `tokio::select!` or similar, to wake up on shutdown. ### Async cancellation safety In async Rust, futures can be "cancelled" at any await point, by dropping the Future. For example, `tokio::select!` returns as soon as one of the Futures returns, and drops the others. `tokio::time::timeout` is another example. In the Rust ecosystem, some functions are cancellation-safe, meaning they can be safely dropped without side-effects, while others are not. See documentation of `tokio::select!` for examples. In the pageserver and safekeeper, async code is *not* cancellation-safe by default. 
Unless otherwise marked, any async function that you call cannot be assumed to be async cancellation-safe, and must be polled to completion. The downside of non-cancellation safe code is that you have to be very careful when using `tokio::select!`, `tokio::time::timeout`, and other such functions that can cause a Future to be dropped. They can only be used with functions that are explicitly documented to be cancellation-safe, or you need to spawn a separate task to shield from the cancellation. At the entry points to the code, we also take care to poll futures to completion, or shield the rest of the code from surprise cancellations by spawning a separate task. The code that handles incoming HTTP requests, for example, spawns a separate task for each request, because Hyper will drop the request-handling Future if the HTTP connection is lost. #### How to cancel, then? If our code is not cancellation-safe, how do you cancel long-running tasks? Use CancellationTokens. TODO: More details on that. And we have an ongoing discussion on what to do if cancellations might come from multiple sources. #### Exceptions Some library functions are cancellation-safe, and are explicitly marked as such. For example, `utils::seqwait`. #### Rationale The alternative would be to make all async code cancellation-safe, unless otherwise marked. That way, you could use `tokio::select!` more liberally. The reasons we didn't choose that are explained in this section. Writing code in a cancellation-safe manner is tedious, as you need to scrutinize every `.await` and ensure that if the `.await` call never returns, the system is in a safe, consistent state. In some ways, you need to do that with `?` and early `returns`, too, but `.await`s are easier to miss. It is also easier to perform cleanup tasks when a function returns an `Err` than when an `.await` simply never returns. You can use `scopeguard` and Drop guards to perform cleanup tasks, but it is more tedious. 
An `.await` that never returns is more similar to a panic. Note that even if you only use building blocks that themselves are cancellation-safe, it doesn't mean that the code as a whole is cancellation-safe. For example, consider the following code: ``` while let Some(i) = work_inbox.recv().await { if let Err(_) = results_outbox.send(i).await { println!("receiver dropped"); return; } } ``` It reads messages from one channel and sends them to another channel. If this code is cancelled at the `results_outbox.send(i).await`, the message read from the receiver is lost. That may or may not be OK, depending on the context. Another reason to not require cancellation-safety is historical: we already had a lot of async code that was not scrutinized for cancellation-safety when this issue was raised. Scrutinizing all existing code is no fun. ================================================ FILE: docs/pageserver-walredo.md ================================================ # WAL Redo To reconstruct a particular page version from an image of the page and some WAL records, the pageserver needs to replay the WAL records. This happens on-demand, when a GetPage@LSN request comes in, or as part of background jobs that reorganize data for faster access. It's important that data cannot leak from one tenant to another, and that a corrupt WAL record on one timeline doesn't affect other tenants or timelines. ## Multi-tenant security If you have direct access to the WAL directory, or if you have superuser access to a running PostgreSQL server, it's easy to construct a malicious or corrupt WAL record that causes the WAL redo functions to crash, or to execute arbitrary code. That is not a security problem for PostgreSQL; if you have superuser access, you have full access to the system anyway. The Neon pageserver, however, is multi-tenant. It needs to execute WAL belonging to different tenants in the same system, and malicious WAL in one tenant must not affect other tenants.
A separate WAL redo process is launched for each tenant, and the process uses the seccomp(2) system call to restrict its access to the bare minimum needed to replay WAL records. The process does not have access to the filesystem or network. It can only communicate with the parent pageserver process through a pipe. If an attacker creates a malicious WAL record and injects it into the WAL stream of a timeline, he can take control of the WAL redo process in the pageserver. However, the WAL redo process cannot access the rest of the system. And because there is a separate WAL redo process for each tenant, the hijacked WAL redo process can only see WAL and data belonging to the same tenant, which the attacker would have access to anyway. ## WAL-redo process communication The WAL redo process runs the 'postgres' executable, launched with a Neon-specific command-line option to put it into WAL-redo process mode. The pageserver controls the lifetime of the WAL redo processes, launching them as needed. If a tenant is detached from the pageserver, any WAL redo processes for that tenant are killed. The pageserver communicates with each WAL redo process over its stdin/stdout/stderr. It works in request-response model with a simple custom protocol, described in walredo.rs. To replay a set of WAL records for a page, the pageserver sends the "before" image of the page and the WAL records over 'stdin', followed by a command to perform the replay. The WAL redo process responds with an "after" image of the page. ## Special handling of some records Some WAL record types are handled directly in the pageserver, by bespoken Rust code, and are not sent over to the WAL redo process. This includes SLRU-related WAL records, like commit records. SLRUs don't use the standard Postgres buffer manager, so dealing with them in the Neon WAL redo mode would require quite a few changes to Postgres code and special handling in the protocol anyway. Some record types that include a full-page-image (e.g. 
XLOG_FPI) are also handled specially when incoming WAL is processed already, and are stored as page images rather than WAL records. ## Records that modify multiple pages Some Postgres WAL records modify multiple pages. Such WAL records are duplicated, so that a copy is stored for each affected page. This is somewhat wasteful, but because most WAL records only affect one page, the overhead is acceptable. The WAL redo always happens for one particular page. If the WAL record contains changes to other pages, they are ignored. ================================================ FILE: docs/pageserver.md ================================================ # Page server architecture The Page Server has a few different duties: - Respond to GetPage@LSN requests from the Compute Nodes - Receive WAL from WAL safekeeper, and store it - Upload data to S3 to make it durable, download files from S3 as needed S3 is the main fault-tolerant storage of all data, as there are no Page Server replicas. We use a separate fault-tolerant WAL service to reduce latency. It keeps track of WAL records which are not synced to S3 yet. ================================================ FILE: docs/rfcs/001-cluster-size-limits.md ================================================ Cluster size limits ================== ## Summary One of the resource consumption limits for free-tier users is a cluster size limit. To enforce it, we need to calculate the timeline size and check if the limit is reached before relation create/extend operations. If the limit is reached, the query must fail with some meaningful error/warning. We may want to exempt some operations from the quota to allow users free space to fit back into the limit. The stateless compute node that performs validation is separate from the storage that calculates the usage, so we need to exchange cluster size information between those components. 
## Motivation Limit the maximum size of a PostgreSQL instance to limit free tier users (and other tiers in the future). First of all, this is needed to control our free tier production costs. Another reason to limit resources is risk management — we haven't (fully) tested and optimized neon for big clusters, so we don't want to give users access to the functionality that we don't think is ready. ## Components * pageserver - calculate the size consumed by a timeline and add it to the feedback message. * safekeeper - pass feedback message from pageserver to compute. * compute - receive feedback message, enforce size limit based on GUC `neon.max_cluster_size`. * console - set and update `neon.max_cluster_size` setting ## Proposed implementation First of all, it's necessary to define timeline size. The current approach is to count all data, including SLRUs. (not including WAL) Here we think of it as a physical disk underneath the Postgres cluster. This is how the `LOGICAL_TIMELINE_SIZE` metric is implemented in the pageserver. Alternatively, we could count only relation data. As in pg_database_size(). This approach is somewhat more user-friendly because it is the data that is really affected by the user. On the other hand, it puts us in a weaker position than other services, i.e., RDS. We will need to refactor the timeline_size counter or add another counter to implement it. Timeline size is updated during wal digestion. It is not versioned and is valid at the last_received_lsn moment. Then this size should be reported to compute node. `current_timeline_size` value is included in the walreceiver's custom feedback message: `ReplicationFeedback.` (PR about protocol changes https://github.com/neondatabase/neon/pull/1037). This message is received by the safekeeper and propagated to compute node as a part of `AppendResponse`. Finally, when compute node receives the `current_timeline_size` from safekeeper (or from pageserver directly), it updates the global variable. 
And then every neon_extend() operation checks if limit is reached `(current_timeline_size > neon.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so. (see Postgres error codes [https://www.postgresql.org/docs/devel/errcodes-appendix.html](https://www.postgresql.org/docs/devel/errcodes-appendix.html)) TODO: We can allow autovacuum processes to bypass this check, simply checking `IsAutoVacuumWorkerProcess()`. It would be nice to allow manual VACUUM and VACUUM FULL to bypass the check, but it's uneasy to distinguish these operations at the low level. See issues https://github.com/neondatabase/neon/issues/1245 https://github.com/neondatabase/neon/issues/1445 TODO: We should warn users if the limit is soon to be reached. ### **Reliability, failure modes and corner cases** 1. `current_timeline_size` is valid at the last received and digested by pageserver lsn. If pageserver lags behind compute node, `current_timeline_size` will lag too. This lag can be tuned using backpressure, but it is not expected to be 0 all the time. So transactions that happen in this lsn range may cause limit overflow. Especially operations that generate (i.e., CREATE DATABASE) or free (i.e., TRUNCATE) a lot of data pages while generating a small amount of WAL. Are there other operations like this? Currently, CREATE DATABASE operations are restricted in the console. So this is not an issue. ### **Security implications** We treat compute as an untrusted component. That's why we try to isolate it with secure container runtime or a VM. Malicious users may change the `neon.max_cluster_size`, so we need an extra size limit check. To cover this case, we also monitor the compute node size in the console. 
================================================ FILE: docs/rfcs/002-storage.md ================================================ # Neon storage node — alternative ## **Design considerations** Simplify storage operations for people => Gain adoption/installs on laptops and small private installation => Attract customers to DBaaS by seamless integration between our tooling and cloud. Proposed architecture addresses: - High availability -- tolerates n/2 - 1 failures - Multi-tenancy -- one storage for all databases - Elasticity -- increase storage size on the go by adding nodes - Snapshots / backups / PITR with S3 offload - Compression Minuses are: - Quite a lot of work - Single page access may touch few disk pages - Some bloat in data — may slowdown sequential scans ## **Summary** Storage cluster is sharded key-value store with ordered keys. Key (****page_key****) is a tuple of `(pg_id, db_id, timeline_id, rel_id, forkno, segno, pageno, lsn)`. Value is either page or page diff/wal. Each chunk (chunk == shard) stores approx 50-100GB ~~and automatically splits in half when grows bigger then soft 100GB limit~~. by having a fixed range of pageno's it is responsible for. Chunks placement on storage nodes is stored in a separate metadata service, so chunk can be freely moved around the cluster if it is need. Chunk itself is a filesystem directory with following sub directories: ``` |-chunk_42/ |-store/ -- contains lsm with pages/pagediffs ranging from | page_key_lo to page_key_hi |-wal/ | |- db_1234/ db-specific wal files with pages from page_key_lo | to page_key_hi | |-chunk.meta -- small file with snapshot references (page_key_prefix+lsn+name) and PITR regions (page_key_start, page_key_end) ``` ## **Chunk** Chunk is responsible for storing pages potentially from different databases and relations. 
Each page is addressed by a lexicographically ordered tuple (****page_key****) with following fields: - `pg_id` -- unique id of given postgres instance (or postgres cluster as it is called in postgres docs) - `db_id` -- database that was created by 'CREATE DATABASE' in a given postgres instance - `db_timeline` -- used to create Copy-on-Write instances from snapshots, described later - `rel_id` -- tuple of (relation_id, 0) for tables and (indexed_relation_id, rel_id) for indices. Done this way so table indices were closer to table itself on our global key space. - `(forkno, segno, pageno)` -- page coordinates in postgres data files - `lsn_timeline` -- postgres feature, increments when PITR was done. - `lsn` -- lsn of current page version. Chunk stores pages and page diffs ranging from page_key_lo to page_key_hi. Processing node looks at page in wal record and sends record to a chunk responsible for this page range. When wal record arrives to a chunk it is initially stored in `chunk_id/wal/db_id/wal_segno.wal`. Then background process moves records from that wal files to the lsm tree in `chunk_id/store`. Or, more precisely, wal records would be materialized into lsm memtable and when that memtable is flushed to SSTable on disk we may trim the wal. That way some not durably (in the distributed sense) committed pages may enter the tree -- here we rely on processing node behavior: page request from processing node should contain proper lsm horizons so that storage node may respond with proper page version. LSM here is a usual LSM for variable-length values: at first data is stored in memory (we hold incoming wal records to be able to regenerate it after restart) at some balanced tree. When this tree grows big enough we dump it into disk file (SSTable) sorting records by key. Then SStables are mergesorted in the background to a different files. All file operation are sequential and do not require WAL for durability. 
Content of SSTable can be following: ```jsx (pg_id, db_id, ... , pageno=42, lsn=100) (full 8k page data) (pg_id, db_id, ... , pageno=42, lsn=150) (per-page diff) (pg_id, db_id, ... , pageno=42, lsn=180) (per-page diff) (pg_id, db_id, ... , pageno=42, lsn=200) (per-page diff) (pg_id, db_id, ... , pageno=42, lsn=220) (full 8k page data) (pg_id, db_id, ... , pageno=42, lsn=250) (per-page diff) (pg_id, db_id, ... , pageno=42, lsn=270) (per-page diff) (pg_id, db_id, ... , pageno=5000, lsn=100) (full 8k page data) ``` So query for `pageno=42 up to lsn=260` would need to find closest entry less then this key, iterate back to the latest full page and iterate forward to apply diffs. How often page is materialized in lsn-version sequence is up to us -- let's say each 5th version should be a full page. ### **Page deletion** To delete old pages we insert blind deletion marker `(pg_id, db_id, #trim_lsn < 150)` into a lsm tree. During merges such marker would indicate that all pages with smaller lsn should be discarded. Delete marker will travel down the tree levels hierarchy until it reaches last level. In non-PITR scenario where old page version are not needed at all such deletion marker would (in average) prevent old page versions propagation down the tree -- so all bloat would concentrate at higher tree layers without affecting bigger bottom layers. ### **Recovery** Upon storage node restart recent WAL files are applied to appropriate pages and resulting pages stored in lsm memtable. So this should be fast since we are not writing anything to disk. ### **Checkpointing** No such mechanism is needed. Or we may look at the storage node as at kind of continuous checkpointer. ### **Full page writes (torn page protection)** Storage node never updates individual pages, only merges SSTable, so torn pages is not an issue. 
### **Snapshot** That is the part that I like about this design -- snapshot creation is an instant and cheap operation that can have flexible granularity level: whole instance, database, table. Snapshot creation inserts a record in `chunk.meta` file with lsn of this snapshot and key prefix `(pg_id, db_id, db_timeline, rel_id, *)` that prohibits pages deletion within this range. Storage node may not know anything about page internals, but by changing number of fields in our prefix we may change snapshot granularity. It is again useful to remap `rel_id` to `(indexed_relation_id, rel_id)` so that snapshot of relation would include its indices. Also table snapshot would trickily interact with catalog. Probably all table snapshots should hold also a catalog snapshot. And when node is started with such snapshot it should check that only tables from snapshot are queried. I assume here that for snapshot reading one needs to start a new postgres instance. Storage consumed by snapshot is proportional to the amount of data changed. We may have some heuristic (calculated based on cost of different storages) about when to offload old snapshot to s3. For example, if current database has more than 40% of changed pages with respect to previous snapshot then we may offload that snapshot to s3, and release this space. **Starting db from snapshot** When we are starting database from snapshot it can be done in two ways. First, we may create new db_id, move all the data from snapshot to a new db and start a database. Second option is to create Copy-on-Write (CoW) instance out of snapshot and read old pages from old snapshot and store new pages separately. That is why there is `db_timeline` key field near `db_id` -- CoW (🐮) database should create new `db_timeline` and remember old `db_timeline`. Such a database can have a hashmap of pages that it has changed to query pages from proper snapshot on the first try.
`db_timeline` is located near `db_id` so that new page versions generated by new instance would not bloat data of initial snapshot. It is not clear whether it is possible to efficiently support "stacked" CoW snapshots, so we may disallow them. (Well, one way to support them is to move `db_timeline` close to `lsn` -- so we may scan neighboring pages and find the right one. But again that way we bloat snapshot with unrelated data and may slow down full scans that are happening in different database). **Snapshot export/import** Once we may start CoW instances it is easy to run auxiliary postgres instance on this snapshot and run `COPY FROM (...) TO stdout` or `pg_dump` and export data from the snapshot to some portable formats. Also we may start postgres on a new empty database and run `COPY FROM stdin`. This way we can initialize new non-CoW databases and transfer snapshots via network. ### **PITR area** In described scheme PITR is just a prohibition to delete any versions within some key prefix, whether it is a database or a table key prefix. So PITR may have different settings for different tables, databases, etc. PITR is quite bloaty, so we may aggressively offload it to s3 -- we may push same (or bigger) SSTables to s3 and maintain lsm structure there. ### **Compression** Since we are storing page diffs of variable sizes there is no structural dependency on a page size and we may compress it. Again that could be enabled only on pages with some key prefixes, so we may have this with db/table granularity. ### **Chunk metadata** Chunk metadata is a file in the chunk directory that stores info about current snapshots and PITR regions. Chunk should always consult this data when merging SSTables and applying delete markers. ### **Chunk splitting** *(NB: following paragraph is about how to avoid page splitting)* When a chunk hits some soft storage limit (let's say 100Gb) it should be split in half and global metadata about chunk boundaries should be updated.
Here I assume that chunk split is a local operation happening on single node. Process of chunk splitting should look like the following: 1. Find separation key and spawn two new chunks with [lo, mid) [mid, hi) boundaries. 2. Prohibit WAL deletion and old SSTables deletion on original chunk. 3. On each lsm layer we would need to split only one SSTable, all other would fit within left or right range. Symlink/split those files to new chunks. 4. Start WAL replay on new chunks. 5. Update global metadata about new chunk boundaries. 6. Eventually (metadata update should be pushed to processing node by metadata service) storage node will start sending WAL and page requests to the new nodes. 7. New chunk may start serving read queries when following conditions are met: a) it receives at least one WAL record from processing node b) it replayed all WAL up to the new received one c) checked by downlinks that there were no WAL gaps. Chunk split as it is described here is quite fast operation when it is happening on the local disk -- vast majority of files will be just moved without copying anything. I suggest keeping splits always local and not mixing them with chunk moving around cluster. So if we want to split some chunk but there is small amount of free space left on the device, we should first move some chunks away from the node and then proceed with splitting. ### Fixed chunks Alternative strategy is not to split at all and have pageno-fixed chunk boundaries. When table is created we first materialize this chunk by storing first new pages only and the chunk is small. Then chunk is growing while table is filled, but it can't grow substantially bigger than allowed pageno range, so at max it would be 1GB or whatever limit we want + some bloat due to snapshots and old page versions. ### **Chunk lsm internals** So how to implement chunk's lsm? - Write from scratch and use RocksDB to prototype/benchmark, then switch to own lsm implementation.
RocksDB can provide some sanity check for performance of home-brewed implementation and it would be easier to prototype. - Use postgres as lego constructor. We may model memtable with postgres B-tree referencing some in-memory log of incoming records. SSTable merging may reuse postgres external merging algorithm, etc. One thing that would definitely not fit (or I didn't come up with an idea how to fit that) -- is multi-tenancy. If we are storing pages from different databases we can't use postgres buffer pool, since there is no db_id in the page header. We can add new field there but IMO it would be no go for committing that to vanilla. Other possibility is not to try to fit few databases in one storage node. But that way it is no go for multi-tenant cloud installation: we would need to run a lot of storage node instances on one physical storage node, each with its own local page cache. So that would be much closer to ordinary managed RDS. Multi-tenant storage makes sense even on a laptop, when you work with different databases, running tests with temp database, etc. And when installation grows bigger it starts to make more and more sense, so it seems important. # Storage fleet # **Storage fleet** - When database is smaller than a chunk size we naturally can store them in one chunk (since their page_key would fit in some chunk's [lo, hi) range). Screenshot_2021-02-22_at_16 49 17 Few databases are stored in one chunk, replicated three times - When database can't fit into one storage node it can occupy lots of chunks that were split while database was growing. Chunk placement on nodes is controlled by us with some automatization, but we always may manually move chunks around the cluster. Screenshot_2021-02-22_at_16 49 10 Here one big database occupies two sets of nodes. Also some chunks were moved around to restore replication factor after disk failure. In this case we also have "sharded" storage for a big database and issue wal writes to different chunks in parallel.
## **Chunk placement strategies** There are few scenarios where we may want to move chunks around the cluster: - disk usage on some node is big - some disk experienced a failure - some node experienced a failure or need maintenance ## **Chunk replication** Chunk replication may be done by cloning page ranges with respect to some lsn from peer nodes, updating global metadata, waiting for WAL to come, replaying previous WAL and becoming online -- more or less like during chunk split. ================================================ FILE: docs/rfcs/003-laptop-cli.md ================================================ # Command line interface (end-user) Neon CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside neon distribution at least at the start. This proposal is focused on managing local installations. For cluster operations, different tooling would be needed. The point of integration between the two is storage URL: no matter how complex cluster setup is it may provide an endpoint where the user may push snapshots. The most important concept here is a snapshot, which can be created/pushed/pulled/exported. Also, we may start temporary read-only postgres instance over any local snapshot. A more complex scenario would consist of several basic operations over snapshots. 
# Possible usage scenarios ## Install neon, run a postgres ``` > brew install pg-neon > neon pg create # creates pgdata with default pattern pgdata$i > neon pg list ID PGDATA USED STORAGE ENDPOINT primary1 pgdata1 0G neon-local localhost:5432 ``` ## Import standalone postgres to neon ``` > neon snapshot import --from=basebackup://replication@localhost:5432/ oldpg [====================------------] 60% | 20MB/s > neon snapshot list ID SIZE PARENT oldpg 5G - > neon pg create --snapshot oldpg Started postgres on localhost:5432 > neon pg list ID PGDATA USED STORAGE ENDPOINT primary1 pgdata1 5G neon-local localhost:5432 > neon snapshot destroy oldpg Ok ``` Also, we may start snapshot import implicitly by looking at snapshot schema ``` > neon pg create --snapshot basebackup://replication@localhost:5432/ Downloading snapshot... Done. Started postgres on localhost:5432 Destroying snapshot... Done. ``` ## Pull snapshot with some publicly shared database Since we may export the whole snapshot as one big file (tar of basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage). ``` > neon pg create --snapshot http://learn-postgres.com/movies_db.neon movies ``` ## Create snapshot and push it to the cloud ``` > neon snapshot create pgdata1@snap1 > neon snapshot push --to ssh://stas@neon.tech pgdata1@snap1 ``` ## Rollback database to the snapshot One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. 
So to properly roll back the database to the older state we have `neon pg checkout`. ``` > neon pg list ID PGDATA USED STORAGE ENDPOINT primary1 pgdata1 5G neon-local localhost:5432 > neon snapshot create pgdata1@snap1 > neon snapshot list ID SIZE PARENT oldpg 5G - pgdata1@snap1 6G - pgdata1@CURRENT 6G - > neon pg checkout pgdata1@snap1 Stopping postgres on pgdata1. Rolling back pgdata1@CURRENT to pgdata1@snap1. Starting postgres on pgdata1. > neon snapshot list ID SIZE PARENT oldpg 5G - pgdata1@snap1 6G - pgdata1@HEAD{0} 6G - pgdata1@CURRENT 6G - ``` Some notes: pgdata1@CURRENT -- implicit snapshot representing the current state of the database in the data directory. When we are checking out some snapshot CURRENT will be set to this snapshot and the old CURRENT state will be named HEAD{0} (0 is the number of postgres timeline, it would be incremented after each such checkout). ## Configure PITR area (Point In Time Recovery). PITR area acts like a continuous snapshot where you can reset the database to any point in time within this area (by area I mean some TTL period or some size limit, both possibly infinite). ``` > neon pitr create --storage s3tank --ttl 30d --name pitr_last_month ``` Resetting the database to some state in the past would require creating a snapshot on some lsn / time in this PITR area. # Manual ## storage Storage is either neon pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default. **neon storage attach** -t [native|s3] -c key=value -n name Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=neon.tech/stas/mystore. Other possible term for native is 'zstore'. **neon storage list** Show currently attached storages.
For example: ``` > neon storage list NAME USED TYPE OPTIONS PATH local 5.1G neon-local /opt/neon/store/local local.compr 20.4G neon-local compression=on /opt/neon/store/local.compr zcloud 60G neon-remote neon.tech/stas/mystore s3tank 80G S3 ``` **neon storage detach** **neon storage show** ## pg Manages postgres data directories and can start postgres instances with proper configuration. An experienced user may avoid using that (except pg create) and configure/run postgres by themselves. Pg is a term for a single postgres running on some data. I'm trying to avoid separation of datadir management and postgres instance management -- both of those concepts are bundled here together. **neon pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata Creates (initializes) new data directory in given storage and starts postgres. I imagine that storage for this operation may be only local and data movement to remote location happens through snapshots/pitr. --no-start: just init datadir without starting postgres --snapshot snap: init from the snapshot. Snap is a name or URL (neon.tech/stas/mystore/snap1) --cow: initialize Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of a currently running database) **neon pg destroy** **neon pg start** [--replica] pgdata Start postgres with proper extensions preloaded/installed. **neon pg checkout** Rollback data directory to some previous snapshot. **neon pg stop** pg_id **neon pg list** ``` ROLE PGDATA USED STORAGE ENDPOINT primary my_pg 5.1G local localhost:5432 replica-1 localhost:5433 replica-2 localhost:5434 primary my_pg2 3.2G local.compr localhost:5435 - my_pg3 9.2G local.compr - ``` **neon pg show** ``` my_pg: storage: local space used on local: 5.1G space used on all storages: 15.1G snapshots: on local: snap1: 1G snap2: 1G on zcloud: snap2: 1G on s3tank: snap5: 2G pitr: on s3tank: pitr_one_month: 45G ``` **neon pg start-rest/graphql** pgdata Starts REST/GraphQL proxy on top of postgres master.
Not sure we should do that, just an idea. ## snapshot Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. Snapshot name looks like pgdata_name@tag_name. tag_name is set by the user during snapshot creation. There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before i-th checkout. **neon snapshot create** pgdata_name@snap_name Creates a new snapshot in the same storage where pgdata_name exists. **neon snapshot push** --to url pgdata_name@snap_name Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `neon snapshot recv` before push happens. If url has some special schema like neon:// receiving side may require auth start `neon snapshot recv` on the go. **neon snapshot recv** Starts a port listening for a basebackup stream, prints connection info to stdout (so that user may use that in push command), and expects data on that socket. **neon snapshot pull** --from url or path Connects to a remote neon/s3/file and pulls snapshot. The remote site should be neon service or files in our format. **neon snapshot import** --from basebackup://<...> or path Creates a new snapshot out of running postgres via basebackup protocol or basebackup files. **neon snapshot export** Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be neon own format which is handy for us (but I think just tar of basebackup would be okay). **neon snapshot diff** snap1 snap2 Shows size of data changed between two snapshots. We also may provide options to diff schema/data in tables. To do that start temp read-only postgreses. 
**neon snapshot destroy** ## pitr Pitr represents wal stream and ttl policy for that stream XXX: any suggestions on a better name? **neon pitr create** name --ttl = inf | period --size-limit = inf | limit --storage = storage_name **neon pitr extract-snapshot** pitr_name --lsn xxx Creates a snapshot out of some lsn in PITR area. The obtained snapshot may be managed with snapshot routines (move/send/export) **neon pitr gc** pitr_name Force garbage collection on some PITR area. **neon pitr list** **neon pitr destroy** ## console **neon console** Opens browser targeted at web console with the more or less same functionality as described here. ================================================ FILE: docs/rfcs/004-durability.md ================================================ Durability & Consensus ====================== When a transaction commits, a commit record is generated in the WAL. When do we consider the WAL record as durable, so that we can acknowledge the commit to the client and be reasonably certain that we will not lose the transaction? Neon uses a group of WAL safekeeper nodes to hold the generated WAL. A WAL record is considered durable, when it has been written to a majority of WAL safekeeper nodes. In this document, I use 5 safekeepers, because I have five fingers. A WAL record is durable, when at least 3 safekeepers have written it to disk. First, assume that only one primary node can be running at a time. This can be achieved by Kubernetes or etcd or some cloud-provider specific facility, or we can implement it ourselves. These options are discussed in later chapters. For now, assume that there is a Magic STONITH Fairy that ensures that. In addition to the WAL safekeeper nodes, the WAL is archived in S3. WAL that has been archived to S3 can be removed from the safekeepers, so the safekeepers don't need a lot of disk space. 
``` +----------------+ +-----> | WAL safekeeper | | +----------------+ | +----------------+ +-----> | WAL safekeeper | +------------+ | +----------------+ | Primary | | +----------------+ | Processing | ---------+-----> | WAL safekeeper | | Node | | +----------------+ +------------+ | +----------------+ \ +-----> | WAL safekeeper | \ | +----------------+ \ | +----------------+ \ +-----> | WAL safekeeper | \ +----------------+ \ \ \ \ \ +--------+ \ | | +------> | S3 | | | +--------+ ``` Every WAL safekeeper holds a section of WAL, and a VCL value. The WAL can be divided into three portions: ``` VCL LSN | | V V .................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX Archived WAL Completed WAL In-flight WAL ``` Note that all this WAL kept in a safekeeper is a contiguous section. This is different from Aurora: In Aurora, there can be holes in the WAL, and there is a Gossip protocol to fill the holes. That could be implemented in the future, but let's keep it simple for now. WAL needs to be written to a safekeeper in order. However, during crash recovery, In-flight WAL that has already been stored in a safekeeper can be truncated or overwritten. The Archived WAL has already been stored in S3, and can be removed from the safekeeper. The Completed WAL has been written to at least three safekeepers. The algorithm ensures that it is not lost, when at most two nodes fail at the same time. The In-flight WAL has been persisted in the safekeeper, but if a crash happens, it may still be overwritten or truncated. The VCL point is determined in the Primary. It is not strictly necessary to store it in the safekeepers, but it allows some optimizations and sanity checks and is probably generally useful for the system as whole. The VCL values stored in the safekeepers can lag behind the VCL computed by the primary. Primary node Normal operation ----------------------------- 1. Generate some WAL. 2. Send the WAL to all the safekeepers that you can reach. 3. 
As soon as a quorum of safekeepers have acknowledged that they have received and durably stored the WAL up to that LSN, update local VCL value in memory, and acknowledge commits to the clients. 4. Send the new VCL to all the safekeepers that were part of the quorum. (Optional) Primary Crash recovery ---------------------- When a new Primary node starts up, before it can generate any new WAL it needs to contact a majority of the WAL safekeepers to compute the VCL. Remember that there is a Magic STONITH fairy that ensures that only one node process can be doing this at a time. 1. Contact all WAL safekeepers. Find the Max((Epoch, LSN)) tuple among the ones you can reach. This is the Winner safekeeper, and its LSN becomes the new VCL. 2. Update the other safekeepers you can reach, by copying all the WAL from the Winner, starting from each safekeeper's old VCL point. Any old In-Flight WAL from previous Epoch is truncated away. 3. Increment Epoch, and send the new Epoch to the quorum of safekeepers. (This ensures that if any of the safekeepers that we could not reach later come back online, they will be considered as older than this in any future recovery) You can now start generating new WAL, starting from the newly-computed VCL. Optimizations ------------- As described, the Primary node sends all the WAL to all the WAL safekeepers. That can be a lot of network traffic. Instead of sending the WAL directly from Primary, some safekeepers can be daisy-chained off other safekeepers, or there can be a broadcast mechanism among them. There should still be a direct connection from each safekeeper to the Primary for the acknowledgments though. Similarly, the responsibility for archiving WAL to S3 can be delegated to one of the safekeepers, to reduce the load on the primary. Magic STONITH fairy ------------------- Now that we have a system that works as long as only one primary node is running at a time, how do we ensure that? 1. Use etcd to grant a lease on a key.
The primary node is only allowed to operate as primary when it's holding a valid lease. If the primary node dies, the lease expires after a timeout period, and a new node is allowed to become the primary. 2. Use S3 to store the lease. S3's consistency guarantees are more lenient, so in theory you cannot do this safely. In practice, it would probably be OK if you make the lease times and timeouts long enough. This has the advantage that we don't need to introduce a new component to the architecture. 3. Use Raft or Paxos, with the WAL safekeepers acting as the Acceptors to form the quorum. The next chapter describes this option. Built-in Paxos -------------- The WAL safekeepers act as PAXOS Acceptors, and the Processing nodes as both Proposers and Learners. Each WAL safekeeper holds an Epoch value in addition to the VCL and the WAL. Each request by the primary to safekeep WAL is accompanied by an Epoch value. If a safekeeper receives a request with Epoch that doesn't match its current Accepted Epoch, it must ignore (NACK) it. (In different Paxos papers, Epochs are called "terms" or "round numbers") When a node wants to become the primary, it generates a new Epoch value that is higher than any previously observed Epoch value, and globally unique. Accepted Epoch: 555 VCL LSN | | V V .................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX Archived WAL Completed WAL In-flight WAL Primary node startup: 1. Contact all WAL safekeepers that you can reach (if you cannot connect to a quorum of them, you can give up immediately). Find the latest Epoch among them. 2. Generate a new globally unique Epoch, greater than the latest Epoch found in previous step. 2. Send the new Epoch in a Prepare message to a quorum of safekeepers. (PAXOS Prepare message) 3. Each safekeeper responds with a Promise. If a safekeeper has already made a promise with a higher Epoch, it doesn't respond (or responds with a NACK). 
After making a promise, the safekeeper stops responding to any write requests with earlier Epoch. 4. Once you have received a majority of promises, you know that the VCL cannot advance on the old Epoch anymore. This effectively kills any old primary server. 5. Find the highest written LSN among the quorum of safekeepers (these can be included in the Promise messages already). This is the new VCL. If a new node starts the election process after this point, it will compute the same or higher VCL. 6. Copy the WAL from the safekeeper with the highest LSN to the other safekeepers in the quorum, using the new Epoch. (PAXOS Accept phase) 7. You can now start generating new WAL starting from the VCL. If another process starts the election process after this point and gains control of a majority of the safekeepers, we will no longer be able to advance the VCL. ================================================ FILE: docs/rfcs/005-zenith_local.md ================================================ # Neon local Here I list some objectives to keep in mind when discussing neon-local design and a proposal that brings all components together. Your comments on both parts are very welcome. #### Why do we need it? - For distribution - this easy to use binary will help us to build adoption among developers. - For internal use - to test all components together. In my understanding, we consider it to be just a mock-up version of neon-cloud. > Question: How much should we care about durability and security issues for a local setup? #### Why is it better than a simple local postgres? - Easy one-line setup. As simple as `cargo install neon && neon start` - Quick and cheap creation of compute nodes over the same storage. > Question: How can we describe a use-case for this feature? - Neon-local can work with S3 directly. - Push and pull images (snapshots) to remote S3 to exchange data with other users. - Quick and cheap snapshot checkouts to switch back and forth in the database history. 
> Question: Do we want it in the very first release? This feature seems quite complicated. #### Distribution: Ideally, just one binary that incorporates all elements we need. > Question: Let's discuss pros and cons of having a separate package with modified PostgreSQL. #### Components: - **neon-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way. CLI proposal is here https://github.com/neondatabase/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src/bin/cli - **neon-console** - WEB UI with same functionality as CLI. >Note: not for the first release. - **neon-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below. > Idea: spawn all other components as child processes, so that we could shut down everything by stopping neon-local. - **neon-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation). > Question: Probably, for local setup we should be able to bypass page-storage and interact directly with S3 to avoid double caching in shared buffers and page-server? WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src - **neon-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to neon. > Question: How should it operate in a local setup? Will we manage it ourselves or ask user to provide credentials for existing S3 object storage (e.g. minio)? > Question: Do we use it together with local page store or are they interchangeable? WIP code is ??? - **neon-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" has succeeded. > Question: How should it operate in a local setup? In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage).
The latter option seems meaningless (extra overhead and no gain), but it is still good to test the system. WIP code is here: https://github.com/neondatabase/postgres/tree/main/src/bin/safekeeper - **neon-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database. WIP code is in main branch and here: https://github.com/neondatabase/postgres/commits/compute_node #### REST API: Service endpoint: `http://localhost:3000` Resources: - /storages - Where data lives: neon-pageserver or neon-s3 - /pgs - Postgres - neon-computenode - /snapshots - snapshots **TODO** >Question: Do we want to extend this API to manage neon components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all? Methods and their mapping to CLI: - /storages - neon-pageserver or neon-s3 CLI | REST API ------------- | ------------- storage attach -n name --type [native\s3] --path=[datadir\URL] | PUT -d { "name": "name", "type": "native", "path": "/tmp" } /storages storage detach -n name | DELETE /storages/:storage_name storage list | GET /storages storage show -n name | GET /storages/:storage_name - /pgs - neon-computenode CLI | REST API ------------- | ------------- pg create -n name --s storage_name | PUT -d { "name": "name", "storage_name": "storage_name" } /pgs pg destroy -n name | DELETE /pgs/:pg_name pg start -n name --replica | POST -d {"action": "start", "is_replica":"replica"} /pgs/:pg_name /actions pg stop -n name | POST -d {"action": "stop"} /pgs/:pg_name /actions pg promote -n name | POST -d {"action": "promote"} /pgs/:pg_name /actions pg list | GET /pgs pg show -n name | GET /pgs/:pg_name - /snapshots **TODO** CLI | REST API ------------- | ------------- ================================================ FILE: docs/rfcs/006-laptop-cli-v2-CLI.md ================================================ Neon CLI allows you 
to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog". # CLI v2 (after chatting with Carl) Neon introduces the notion of a repository. ```bash neon init neon clone neon://neon.tech/piedpiper/northwind -- clones a repo to the northwind directory ``` Once you have a cluster catalog you can explore it ```bash neon log -- returns a list of commits neon status -- returns if there are changes in the catalog that can be committed neon commit -- commits the changes and generates a new commit hash neon branch experimental -- creates a branch called experimental based on a given commit hash ``` To make changes in the catalog you need to run compute nodes ```bash -- here is how you start a compute node neon start /home/pipedpiper/northwind:main -- starts a compute instance neon start neon://neon.tech/northwind:main -- starts a compute instance in the cloud -- you can start a compute node against any hash or branch neon start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port) -- you can start a compute node against any hash or branch neon start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on different port) -- After running some DML you can run -- neon status and see how there are two WAL streams one on top of -- the main branch neon status -- and another on top of the experimental branch neon status -b experimental -- you can commit each branch separately neon commit main -- or neon commit -c /home/pipedpiper/northwind:experimental ``` Starting compute instances against cloud environments ```bash -- you can start a compute instance against the cloud environment -- in this case all of the changes will be streamed into the cloud neon start https://neon.tech/pipedpiper/northwind:main neon start https://neon.tech/pipedpiper/northwind:main neon
status -c https://neon.tech/pipedpiper/northwind:main neon commit -c https://neon.tech/pipedpiper/northwind:main neon branch -c https://neon.tech/pipedpiper/northwind: experimental ``` Pushing data into the cloud ```bash -- pull all the commits from the cloud neon pull -- push all the commits to the cloud neon push ``` ================================================ FILE: docs/rfcs/006-laptop-cli-v2-repository-structure.md ================================================ # Repository format A Neon repository is similar to a traditional PostgreSQL backup archive, like a WAL-G bucket or pgbarman backup catalogue. It holds multiple versions of a PostgreSQL database cluster. The distinguishing feature is that you can launch a Neon Postgres server directly against a branch in the repository, without having to "restore" it first. Also, Neon manages the storage automatically, there is no separation between full and incremental backups nor WAL archive. Neon relies heavily on the WAL, and uses concepts similar to incremental backups and WAL archiving internally, but it is hidden from the user. ## Directory structure, version 1 This first version is pretty straightforward but not very efficient. Just something to get us started. The repository directory looks like this: .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/ .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots// .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history .neon/refs/branches/mybranch .neon/refs/tags/foo .neon/refs/tags/bar .neon/datadirs/ ### Timelines A timeline is similar to PostgreSQL's timeline, but is identified by a UUID instead of a 32-bit timeline Id. For user convenience, it can be given a name that refers to the UUID (called a branch). All WAL is generated on a timeline. You can launch a read-only node against a tag or arbitrary LSN on a timeline, but in order to write, you need to create a timeline.
Each timeline is stored in a directory under .neon/timelines. It consists of a WAL archive, containing all the WAL in the standard PostgreSQL format, under the wal/ subdirectory. The 'snapshots/' subdirectory contains "base backups" of the data directory at different LSNs. Each snapshot is simply a copy of the Postgres data directory. When a new timeline is forked from a previous timeline, the ancestor timeline's UUID is stored in the 'history' file. ### Refs There are two kinds of named objects in the repository: branches and tags. A branch is a human-friendly name for a timeline UUID, and a tag is a human-friendly name for a specific LSN on a timeline (timeline UUID + LSN). Like in git, these are just for user convenience; you can also use timeline UUIDs and LSNs directly. Refs do have one additional purpose though: naming a timeline or LSN prevents it from being automatically garbage collected. The refs directory contains a small text file for each tag/branch. It contains the UUID of the timeline (and LSN, for tags). ### Datadirs .neon/datadirs contains PostgreSQL data directories. You can launch a Postgres instance on one of them with: ``` postgres -D .neon/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c ``` All the actual data is kept in the timeline directories, under .neon/timelines. The data directories are only needed for active PostgreSQL instances. After an instance is stopped, the data directory can be safely removed. "neon start" will recreate it quickly from the data in .neon/timelines, if it's missing. ## Version 2 The format described above isn't very different from a traditional daily base backup + WAL archive configuration. The main difference is the nicer naming of branches and tags. That's not very efficient. For performance, we need something like incremental backups that don't require making a full copy of all data. So only store modified files or pages.
And instead of having to replay all WAL from the last snapshot, "slice" the WAL into per-relation WAL files and only recover what's needed when a table is accessed. In version 2, the file format in the "snapshots" subdirectory gets more advanced. The exact format is TODO. But it should support: - storing WAL records of individual relations/pages - storing a delta from an older snapshot - compression ## Operations ### Garbage collection When you run "neon gc", old timelines that are no longer needed are removed. That involves collecting the list of "unreachable" objects, starting from the named branches and tags. Also, if enough WAL has been generated on a timeline since last snapshot, a new snapshot or delta is created. ### neon push/pull Compare the tags and branches on both servers, and copy missing ones. For each branch, compare the timeline it points to in both servers. If one is behind the other, copy the missing parts. FIXME: how do you prevent confusion if you have two clones of the same repository, launch an instance on the same branch in both clones, and later try to push/pull between them? Perhaps create a new timeline every time you start up an instance? Then you would detect that the timelines have diverged. That would match with the "epoch" concept that we have in the WAL safekeeper ### neon checkout/commit In this format, there is no concept of a "working tree", and hence no concept of checking out or committing. All modifications are done on a branch or a timeline. As soon as you launch a server, the changes are appended to the timeline. You can easily fork off a temporary timeline to emulate a "working tree". You can later remove it and have it garbage collected, or to "commit", re-point the branch to the new timeline. If we want to have a worktree and "neon checkout/commit" concept, we can emulate that with a temporary timeline. Create the temporary timeline at "neon checkout", and have "neon commit" modify the branch to point to the new timeline.
================================================ FILE: docs/rfcs/007-serverless-on-laptop.md ================================================ How it works now ---------------- 1. Create repository, start page server on it ``` $ neon init ... created main branch new neon repository was created in .neon $ neon pageserver start Starting pageserver at '127.0.0.1:64000' in .neon Page server started ``` 2. Create a branch, and start a Postgres instance on it ``` $ neon branch heikki main branching at end of WAL: 0/15ECF68 $ neon pg create heikki Initializing Postgres on timeline 76cf9279915be7797095241638e64644... Extracting base backup to create postgres instance: path=.neon/pgdatadirs/pg1 port=55432 $ neon pg start pg1 Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki' waiting for server to start.... done server started ``` 3. Connect to it and run queries ``` $ psql "dbname=postgres port=55432" psql (14devel) Type "help" for help. postgres=# ``` Proposal: Serverless on your Laptop ----------------------------------- We've been talking about doing the "pg create" step automatically at "pg start", to eliminate that step. What if we go further, go serverless on your laptop, so that the workflow becomes just: 1. Create repository, start page server on it (same as before) ``` $ neon init ... created main branch new neon repository was created in .neon $ neon pageserver start Starting pageserver at '127.0.0.1:64000' in .neon Page server started ``` 2. Create branch ``` $ neon branch heikki main branching at end of WAL: 0/15ECF68 ``` 3. Connect to it: ``` $ psql "dbname=postgres port=5432 branch=heikki" psql (14devel) Type "help" for help. postgres=# ``` The trick behind the scenes is that when you launch the page server, it starts to listen on port 5432. When you connect to it with psql, it looks at the 'branch' parameter that you passed in the connection string. 
It automatically performs the "pg create" and "pg start" steps for that branch, and then forwards the connection to the Postgres instance that it launched. After you disconnect, if there are no more active connections to the server running on the branch, it can automatically shut it down again. This is how serverless would work in the cloud. We can do it on your laptop, too. ================================================ FILE: docs/rfcs/008-push-pull.md ================================================ # Push and pull between pageservers Here is a proposal about implementing push/pull mechanics between pageservers. We also want to be able to push/pull to S3 but that would depend on the exact storage format so we don't touch that in this proposal. ## Origin management The origin represents connection info for some remote pageserver. Let's use the same commands as git here, except with an explicit list subcommand (git uses `origin -v` for that). ``` neon origin add neon origin list neon origin remove ``` Connection URI is a string of the form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs or require ssh as transport or invent some other kind of transport. Behind the scenes, these commands may update a toml file inside the .neon directory. ## Push ### Pushing branch ``` neon push mybranch cloudserver # push to eponymous branch in cloudserver neon push mybranch cloudserver:otherbranch # push to a different branch in cloudserver ``` Exact mechanics would be slightly different in the following situations: 1) Destination branch does not exist. That is the simplest scenario. We can just create an empty branch (or timeline in internal terminology) and transfer all the pages/records that we have in our timeline. 
Right now each timeline is quite independent of other timelines so I suggest skipping any checks that there is a common ancestor and just fill it with data. Later, when CoW timelines land in the pageserver, we may add that check and decide whether this timeline belongs to this pageserver repository or not [*]. The exact mechanics may be the following: * CLI asks local pageserver to perform push and hands over connection uri: `perform_push `. * local pageserver connects to the remote pageserver and runs `branch_push `. Handler for branch_create would create destination timeline and switch connection to copyboth mode. * Sending pageserver may start an iterator on that timeline and send all the records as copy messages. 2) Destination branch exists and latest_valid_lsn is less than ours. In this case, we need to send missing records. To do that we need to find all pages that were changed since that remote LSN. Right now we don't have any tracking mechanism for that, so let's just iterate over all records and send ones that are newer than remote LSN. Later we probably should add a sparse bitmap that would track changed pages to avoid full scan. 3) Destination branch exists and latest_valid_lsn is bigger than ours. In this case, we can't push to that branch. We can only pull. ### Pulling branch Here we need to handle the same three cases, but also keep in mind that local pageserver can be behind NAT and we can't trivially re-use pushing by asking remote to 'perform_push' to our address. So we would need a new set of commands: * CLI calls `perform_pull ` on local pageserver. * local pageserver calls `branch_pull ` on remote pageserver. * remote pageserver sends records in our direction. But despite the different set of commands, the code that performs iteration over records and the receiving code that inserts those records can be the same for both pull and push. 
[*] It looks to me that there are two different possible approaches to handling unrelated timelines: 1) Allow storing unrelated timelines in one repo. Some timelines may have parents and some may not. 2) Transparently create and manage several repositories in one pageserver. But that is the topic for a separate RFC/discussion. ================================================ FILE: docs/rfcs/009-snapshot-first-storage-cli.md ================================================ While working on export/import commands, I understood that they fit really well into "snapshot-first design". We may think about backups as snapshots in a different format (i.e. plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use same storage API, the only difference is the code that packs/unpacks files. Even if neon aims to maintain durability using its own snapshots, backups will be useful for uploading data from postgres to neon. So here is an attempt to design a consistent CLI for different usage scenarios: #### 1. Start empty pageserver. That is what we have now. Init empty pageserver using `initdb` in temporary directory. `--storage_dest=FILE_PREFIX | S3_PREFIX |...` option defines object storage type, all other parameters are passed via env variables. Inspired by WAL-G style naming: https://wal-g.readthedocs.io/STORAGES/. Save `storage_dest` and other parameters in config. Push snapshots to `storage_dest` in background. ``` neon init --storage_dest=S3_PREFIX neon start ``` #### 2. Restart pageserver (manually or crash-recovery). Take `storage_dest` from pageserver config, start pageserver from latest snapshot in `storage_dest`. Push snapshots to `storage_dest` in background. ``` neon start ``` #### 3. Import. Start pageserver from existing snapshot. Path to snapshot provided via `--snapshot_path=FILE_PREFIX | S3_PREFIX | ...` Do not save `snapshot_path` and `snapshot_format` in config, as it is a one-time operation. 
Save`storage_dest` parameters in config. Push snapshots to `storage_dest` in background. ``` //I.e. we want to start neon on top of existing $PGDATA and use s3 as a persistent storage. neon init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX neon start ``` How to pass credentials needed for `snapshot_path`? #### 4. Export. Manually push snapshot to `snapshot_path` which differs from `storage_dest` Optionally set `snapshot_format`, which can be plain pgdata format or neon format. ``` neon export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata ``` #### Notes and questions - safekeeper s3_offload should use same (similar) syntax for storage. How to set it in UI? - Why do we need `neon init` as a separate command? Can't we init everything at first start? - We can think of better names for all options. - Export to plain postgres format will be useless, if we are not 100% compatible on page level. I can recall at least one such difference - PD_WAL_LOGGED flag in pages. ================================================ FILE: docs/rfcs/009-snapshot-first-storage-pitr.md ================================================ # Preface GetPage@LSN can be called with older LSNs, and the page server needs to be able to reconstruct older page versions. That's needed for having read-only replicas that lag behind the primary, or that are "anchored" at an older LSN, and internally in the page server when you branch at an older point in time. How do you do that? For now, I'm not considering incremental snapshots at all. I don't think that changes things. So whenever you create a snapshot or a snapshot file, it contains an image of all the pages, there is no need to look at an older snapshot file. Also, I'm imagining that this works on a per-relation basis, so that each snapshot file contains data for one relation. A "relation" is a fuzzy concept - it could actually be one 1 GB relation segment. 
Or it could include all the different "forks" of a relation, or you could treat each fork as a separate relation for storage purpose. And once the "non-relational" work is finished, a "relation" could actually mean some other versioned object kept in the PostgreSQL data directory. Let's ignore that for now. # Eric's RFC: Every now and then, you create a "snapshot". It means that you create a new snapshot file for each relation that was modified after the last snapshot, and write out the contents of the relation as it is/was at the snapshot LSN. Write-ahead log is stored separately in S3 by the WAL safekeeping service, in the original PostgreSQL WAL file format. SNAPSHOT @100 WAL . | . | . | . | SNAPSHOT @200 | . | . | . | . | SNAPSHOT @300 | . | . V IN-MEMORY @400 If a GetPage@LSN request comes from the primary, you return the latest page from the in-memory layer. If there is no trace of the page in memory, it means that it hasn't been modified since the last snapshot, so you return the page from the latest snapshot, at LSN 300 in the above example. PITR is implemented using the original WAL files: If a GetPage@LSN request comes from a read replica with LSN 250, you read the image of the page from the snapshot at LSN 200, and you also scan the WAL between 200 and 250, and apply all WAL records for the requested page, to reconstruct it at LSN 250. Scanning the WAL naively for every GetPage@LSN request would be expensive, so in practice you'd construct an in-memory data structure of all the WAL between 200 and 250 once that allows quickly looking up records for a given page. ## Problems/questions I think you'll need to store the list of snapshot LSNs on each timeline somewhere. If the latest snapshot of a relation is at LSN 100, and you request a page at LSN 1000000, how do you know if there are some modifications to it between 100 and 1000000 that you need to replay? You can scan all the WAL between 100 and 1000000, but that would be expensive. 
You can skip that, if you know that a snapshot was taken e.g. at LSN 999900. Then you know that the fact that there is no snapshot file at 999900 means that the relation hasn't been modified between 100-999900. Then you only need to scan the WAL between 999900 and 1000000. However, there is no trace of a snapshot happening at LSN 999900 in the snapshot file for this relation, so you need to get that information from somewhere else. Where do you get that information from? Perhaps you can scan all the other relations, and if you see a snapshot file for *any* relation at LSN 999900, you know that if there were modifications to this relation, there would be a newer snapshot file for it, too. In other words, the list of snapshots that have been taken can be constructed by scanning all relations and computing the union of all snapshot LSNs that you see for any relation. But that's expensive so at least you should keep that in memory, after computing it once. Also, if you rely on that, it's not possible to have snapshots at different intervals for different files. That seems limiting. Another option is to explicitly store a list of snapshot LSNs in a separate metadata file. # Current implementation in the 'layered_repo' branch: We store snapshot files like in the RFC, but each snapshot file also contains all the WAL in the range of LSNs, so that you don't need to fetch the WAL separately from S3. So you have "layers" like this: SNAPSHOT+WAL 100-200 | | | | SNAPSHOT+WAL 200-300 | | | | IN-MEMORY 300- Each "snapshot+WAL" is a file that contains a snapshot - i.e. full copy of each page in the relation, at the *start* LSN. In addition to that, it contains all the WAL applicable to the relation from the start LSN to the end LSN. With that, you can reconstruct any page version in the range that the file covers. ## Problems/questions I can see one potential performance issue here, compared to the RFC. Let's focus on a single relation for now. 
Imagine that you start from an empty relation, and you receive WAL from 100 to 200, containing a bunch of inserts and updates to the relation. You now have all that WAL in memory: memory: WAL from 100-200 We decide that it's time to materialize that to a snapshot file on disk. We materialize full image of the relation as it was at LSN 100 to the snapshot file, and include all of the WAL. Since the relation was initially empty, the "image" at the beginning of the range is empty too. So now you have one file on disk: SNAPSHOT+WAL 100-200 It contains a full image of the relation at LSN 100 and all WAL between 100-200. (It's actually stored as a serialized BTreeMap of page versions, with the page images and WAL records all stored together in the same BtreeMap. But for this story, that's not important.) We now receive more WAL updating the relation, up to LSN 300. We decide it's time to materialize a new snapshot file, and we now have two files: SNAPSHOT+WAL 100-200 SNAPSHOT+WAL 200-300 Note that the latest "full snapshot" that we store on disk always lags behind by one snapshot cycle. The first file contains a full image of the relation at LSN 100, the second at LSN 200. When we have received WAL up to LSN 300, we write a materialized image at LSN 200. That seems a bit silly. In the design per your RFC, you would write snapshots at LSNs 200 and 300, instead. That seems better. # Third option (not implemented yet) Store snapshot files like in the RFC, but also store per-relation WAL files that contain WAL in a range of LSNs for that relation. SNAPSHOT @100 WAL 100-200 . | . | . | . | SNAPSHOT @200 WAL 200-300 . | . | . | . | SNAPSHOT @300 . . IN-MEMORY 300- This could be the best of both worlds. The snapshot files would be independent of the PostgreSQL WAL format. When it's time to write snapshot file @300, you write a full image of the relation at LSN 300, and you write the WAL that you had accumulated between 200 and 300 to a separate file. 
That way, you don't "lag behind" for one snapshot cycle like in the current implementation. But you still have the WAL for a particular relation readily available alongside the snapshot files, and you don't need to track what snapshot LSNs exist separately. (If we wanted to minimize the number of files, you could include the snapshot @300 and the WAL between 200 and 300 in the same file, but I feel it's probably better to keep them separate) # Further thoughts There's no fundamental reason why the LSNs of the snapshot files and the ranges of the WAL files would need to line up. So this would be possible too: SNAPSHOT @100 WAL 100-150 . | . | . WAL 150-250 . | SNAPSHOT @200 | . | . WAL 250-400 . | . | SNAPSHOT @300 | . | . | IN-MEMORY 300- I'm not sure what the benefit of this would be. You could materialize additional snapshot files in the middle of a range covered by a WAL file, maybe? Might be useful to speed up access when you create a new branch in the middle of an LSN range or if there's some other reason to believe that a particular LSN is "interesting" and there will be a lot of requests using it. ================================================ FILE: docs/rfcs/009-snapshot-first-storage.md ================================================ # Snapshot-first storage architecture Goals: - Long-term storage of database pages. - Easy snapshots; simple snapshot and branch management. - Allow cloud-based snapshot/branch management. - Allow cloud-centric branching; decouple branch state from running pageserver. - Allow customer ownership of data via s3 permissions. - Provide same or better performance for typical workloads, vs plain postgres. Non-goals: - Service database reads from s3 (reads should be serviced from the pageserver cache). - Keep every version of every page / Implement point-in-time recovery (possibly a future paid feature, based on WAL replay from an existing snapshot). ## Principle of operation The database “lives in s3”. 
This means that all of the long term page storage is in s3, and the “live database”-- the version that lives in the pageserver-- is a set of “dirty pages” that haven’t yet been written back to s3. In practice, this is mostly similar to storing frequent snapshots to s3 of a database that lives primarily elsewhere. The main difference is that s3 is authoritative about which branches exist; pageservers consume branches, snapshots, and related metadata by reading them from s3. This allows cloud-based management of branches and snapshots, regardless of whether a pageserver is running or not. It’s expected that a pageserver should keep a copy of all pages, to shield users from s3 latency. A cheap/slow pageserver that falls back to s3 for some reads would be possible, but doesn’t seem very useful right now. Because s3 keeps all history, and the safekeeper(s) preserve any WAL records needed to reconstruct the most recent changes, the pageserver can store dirty pages in RAM or using non-durable local storage; this should allow very good write performance, since there is no need for fsync or journaling. Objects in s3 are immutable snapshots, never to be modified once written (only deleted). Objects in s3 are files, each containing a set of pages for some branch/relation/segment as of a specific time (LSN). A snapshot could be complete (meaning it has a copy of every page), or it could be incremental (containing only the pages that were modified since the previous snapshot). It’s expected that most snapshots are incremental to keep storage costs low. It’s expected that the pageserver would upload new snapshot objects frequently, e.g. somewhere between 30 seconds and 15 minutes, depending on cost/performance balance. 
No-longer needed snapshots can be “squashed”-- meaning snapshot N and snapshot N+1 can be read by some cloud agent software, which writes out a new object containing the combined set of pages (keeping only the newest version of each page) and then deletes the original snapshots. A pageserver only needs to store the set of pages needed to satisfy operations in flight: if a snapshot is still being written, the pageserver needs to hold historical pages so that snapshot captures a consistent moment in time (similar to what is needed to satisfy a slow replica). WAL records can be discarded once a snapshot has been stored to s3. (Unless we want to keep them longer as part of a point-in-time recovery feature.) ## Pageserver operation To start a pageserver from a stored snapshot, the pageserver downloads a set of snapshots sufficient to start handling requests. We assume this includes the latest copy of every page, though it might be possible to start handling requests early, and retrieve pages for the first time only when needed. To halt a pageserver, one final snapshot should be written containing all pending WAL updates; then the pageserver and safekeepers can shut down. It’s assumed there is some cloud management service that ensures only one pageserver is active and servicing writes to a given branch. The pageserver needs to be able to track whether a given page has been modified since the last snapshot, and should be able to produce the set of dirty pages efficiently to create a new snapshot. The pageserver need only store pages that are “reachable” from a particular LSN. For example, a page may be written four times, at LSN 100, 200, 300, and 400. If no snapshot is being created when LSN 200 is written, the page at LSN 100 can be discarded. If a snapshot is triggered when the pageserver is at LSN 299, the pageserver must preserve the page from LSN 200 until that snapshot is complete. 
As before, the page at LSN 300 can be discarded when the LSN 400 page is written (regardless of whether the LSN 200 snapshot has completed.) If the pageserver is servicing multiple branches, those branches may contain common history. While it would be possible to serve branches with zero knowledge of their common history, a pageserver could save a lot of space using an awareness of branch history to share the common set of pages. Computing the “liveness” of a historical page may be tricky in the face of multiple branches. The pageserver may store dirty pages to memory or to local block storage; any local block storage format is only temporary “overflow” storage, and is not expected to be readable by future software versions. The pageserver may store clean pages (those that are captured in a snapshot) any way it likes: in memory, in a local filesystem (possibly keeping a local copy of the snapshot file), or using some custom storage format. Reading pages from s3 would be functional, but is expected to be prohibitively slow. The mechanism for recovery after a pageserver failure is WAL redo. If we find that too slow in some situations (e.g. write-heavy workload causes long startup), we can write more frequent snapshots to keep the number of outstanding WAL records low. If that’s still not good enough, we could look at other options (e.g. redundant pageserver or an EBS page journal). A read-only pageserver is possible; such a pageserver could be a read-only cache of a specific snapshot, or could auto-update to the latest snapshot on some branch. Either way, no safekeeper is required. Multiple read-only pageservers could exist for a single branch or snapshot. 
## Cloud snapshot manager operation Cloud software may wish to do the following operations (commanded by a user, or based on some pre-programmed policy or other cloud agent): Create/delete/clone/rename a database Create a new branch (possibly from a historical snapshot) Start/stop the pageserver/safekeeper on a branch List databases/branches/snapshots that are visible to this user account Some metadata operations (e.g. list branches/snapshots of a particular db) could be performed by scanning the contents of a bucket and inspecting the file headers of each snapshot object. This might not be fast enough; it might be necessary to build a metadata service that can respond more quickly to some queries. This is especially true if there are public databases: there may be many thousands of buckets that are public, and scanning all of them is not a practical strategy for answering metadata queries. ## Snapshot names, deletion and concurrency There may be race conditions between operations-- in particular, a “squash” operation may replace two snapshot objects (A, B) with some combined object (C). Since C is logically equivalent to B, anything that attempts to access B should be able to seamlessly switch over to C. It’s assumed that concurrent delete won’t disrupt a read in flight, but it may be possible for some process to read B’s header, and then discover on the next operation that B is gone. For this reason, any attempted read should attempt a fallback procedure (list objects; search list for an equivalent object) if an attempted read fails. This requires a predictable naming scheme, e.g. `XXXX_YYYY_ZZZZ_DDDD`, where `XXXX` is the branch unique id, and `YYYY` and `ZZZZ` are the starting/ending LSN values. `DDDD` is a timestamp indicating when the object was created; this is used to disambiguate a series of empty snapshots, or to help a snapshot policy engine understand which snapshots should be kept or discarded. 
## Branching A user may request a new branch from the cloud user interface. There is a sequence of things that needs to happen: - If the branch is supposed to be based on the latest contents, the pageserver should perform an immediate snapshot. This is the parent snapshot for the new branch. - Cloud software should create the new branch, by generating a new (random) unique branch identifier, and creating a placeholder snapshot object. - The placeholder object is an empty snapshot containing only metadata (which anchors it to the right parent history) and no pages. - The placeholder can be discarded when the first snapshot (containing data) is completed. Discarding is equivalent to squashing, when the snapshot contains no data. - If the branch needs to be started immediately, a pageserver should be notified that it needs to start servicing the branch. This may not be the same pageserver that services the parent branch, though the common history may make it the best choice. Some of these steps could be combined into the pageserver, but that process would not be possible under all cases (e.g. if no pageserver is currently running, or if the branch is based on an older snapshot, or if a different pageserver will be serving the new branch). Regardless of which software drives the process, the result should look the same. ## Long-term file format Snapshot files (and any other object stored in s3) must be readable by future software versions. It should be possible to build multiple tools (in addition to the pageserver) that can read and write this file format-- for example, to allow cloud snapshot management. Files should contain the following metadata, in addition to the set of pages: - The version of the file format. - A unique identifier for this branch (should be worldwide-unique and unchanging). - Optionally, any human-readable names assigned to this branch (for management UI/debugging/logging). - For incremental snapshots, the identifier of the predecessor snapshot. 
For new branches, this will be the parent snapshot (the point at which history diverges). - The location of the predecessor branch snapshot, if different from this branch’s location. - The LSN range `(parent, latest]` for this snapshot. For complete snapshots, the parent LSN can be 0. - The UTC timestamp of the snapshot creation (which may be different from the time of its highest LSN, if the database is idle). - A SHA2 checksum over the entire file (excluding the checksum itself), to preserve file integrity. A file may contain no pages, and an empty LSN range (probably `(latest, latest]`?), which serves as a placeholder for either a newly-created branch, or a snapshot of an idle database. Any human-readable names stored in the file may fall out of date if database/branch renames are allowed; there may need to be a cloud metadata service to query (current name -> unique identifier). We may choose instead to not store human-readable names in the database, or treat them as debugging information only. ## S3 semantics, and other kinds of storage For development and testing, it may be easier to use other kinds of storage in place of s3. For example, a directory full of files can substitute for an s3 bucket with multiple objects. This mode is expected to match the s3 semantics (e.g. don’t edit existing files or use symlinks). Unit tests may omit files entirely and use an in-memory mock bucket. Some users may want to use a local or network filesystem in place of s3. This isn’t prohibited but it’s not a priority, either. Alternate implementations of s3 should be supported, including Google Cloud Storage. Azure Blob Storage should be supported. We assume (without evidence) that it’s semantically equivalent to s3 for this purpose. 
The properties of s3 that we depend on are: list objects streaming read of entire object read byte range from object streaming write new object (may use multipart upload for better reliability) delete object (that should not disrupt an already-started read). Uploaded files, restored backups, or s3 buckets controlled by users could contain malicious content. We should always validate that objects contain the content they’re supposed to. Incorrect, Corrupt or malicious-looking contents should cause software (cloud tools, pageserver) to fail gracefully. ## Notes Possible simplifications, for a first draft implementation: - Assume that dirty pages fit in pageserver RAM. Can use kernel virtual memory to page out to disk if needed. Can improve this later. - Don’t worry about the details of the squashing process yet. - Don’t implement cloud metadata service; try to make everything work using basic s3 list-objects and reads. - Don’t implement rename, delete at first. - Don’t implement public/private, just use s3 permissions. - Don’t worry about sharing history yet-- each user has their own bucket and a full copy of all data. - Don’t worry about history that spans multiple buckets. - Don’t worry about s3 regions. - Don’t support user-writeable s3 buckets; users get only read-only access at most. Open questions: - How important is point-in-time recovery? When should we add this? How should it work? - Should snapshot files use compression? - Should we use snapshots for async replication? A spare pageserver could stay mostly warmed up by consuming snapshots as they’re created. - Should manual snapshots, or snapshots triggered by branch creation, be named differently from snapshots that are triggered by a snapshot policy? - When a new branch is created, should it always be served by the same pageserver that owns its parent branch? When should we start a new pageserver? - How can pageserver software upgrade be done with minimal downtime? 
================================================ FILE: docs/rfcs/010-storage_details.md ================================================ # Storage details Here I tried to describe the current state of thinking about our storage subsystem as I understand it. Feel free to correct me. Also, I tried to address items from Heikki's TODO and be specific on some of the details. ## Overview ![storage](images/storage.jpeg) ### MemStore MemStore holds the data between `latest_snapshot_lsn` and `latest_lsn`. It consists of PageIndex that holds references to WAL records or pages, PageStore that stores recently materialized pages, and WalStore that stores recently received WAL. ### PageIndex PageIndex is an ordered collection that maps `(BufferTag, LSN)` to one of the following references (by reference I mean some information that is needed to access that data, e.g. file_id and offset): * PageStoreRef -- page offset in the PageStore * LocalStoreRef -- snapshot_id and page offset inside of that snapshot * WalStoreRef -- offset (and size optionally) of WalRecord in WalStore PageIndex holds information about all the pages in all incremental snapshots and in the latest full snapshot. If we aren't using page compression inside snapshots we actually can avoid storing references to the full snapshot and calculate page offsets based on relation sizes metadata in the full snapshot (assuming that full snapshot stores pages sorted by page number). However, I would suggest embracing page compression from the beginning and treat all pages as variable-sized. We assume that PageIndex is few orders of magnitude smaller than addressed data hence it should fit memory. We also don't care about crash tolerance as we can rebuild it from snapshots metadata and WAL records from WalStore or/and Safekeeper. ### WalStore WalStore is a queue of recent WalRecords. I imagine that we can store recent WAL the same way as Postgres does -- as 16MB files on disk. 
On top of that, we can add some fixed-size cache that would keep some amount of segments in memory. For now, we may rely on the Safekeeper to safely store that recent WAL. But generally, I think we can pack all S3 operations into the page server so that it would be also responsible for the recent WAL pushdown to S3 (and Safekeeper may just delete WAL that was confirmed as S3-durable by the page server). ### PageStore PageStore is storage for recently materialized pages (or in other words cache of getPage results). It can also be implemented as a file-based queue with some memory cache on top of it. There are a few possible options for PageStore: a) we just add all recently materialized pages there (so several versions of the same page can be stored there) -- that is more or less how it happens now with the current RocksDB implementation. b) overwrite older pages with the newer pages -- if there is no replica we probably don't need older pages. During page overwrite, we would also need to change PageStoreRef back to WalStoreRef in PageIndex. I imagine that newly created pages would just be added to the back of PageStore (again in queue-like fashion) and this way there wouldn't be any meaningful ordering inside of that queue. When we are forming a new incremental snapshot we may prohibit any updates to the current set of pages in PageStore (giving up on single page version rule) and cut off that whole set when snapshot creation is complete. With option b) we can also treat PageStore as an uncompleted incremental snapshot. ### LocalStore LocalStore keeps the latest full snapshot and set of incremental snapshots on top of it. We add new snapshots when the number of changed pages grows bigger than a certain threshold. ## Granularity By granularity, I mean a set of pages that goes into a certain full snapshot. Following things should be taken into account: * can we shard big databases between page servers? 
* how much time will we spend applying WAL to access certain pages with older LSN's? * how many files do we create for a single database? I can think of the following options here: 1. whole database goes to one full snapshot. * +: we never create a lot of files for one database * +: the approach is quite straightforward, moving data around is simple * -: can not be sharded * -: long recovery -- we always need to recover the whole database 2. table segment is the unit of snapshotting * +: straightforward for sharding * +: individual segment can be quickly recovered with sliced WAL * -: full snapshot can be really small (e.g. when the corresponding segment consists of a single page) and we can blow up the number of files. Then we would spend eternity in directory scans and the amount of metadata for sharding can be also quite big. 3. range-partitioned snapshots -- snapshot includes all pages between [BuffTagLo, BuffTagHi] mixing different relations, databases, and potentially clusters (albeit from one tenant only). When full snapshot outgrows a certain limit (could be also a few gigabytes) we split the snapshot in two during the next full snapshot write. That approach would also require pages sorted by BuffTag inside our snapshots. * +: addresses all mentioned issues * -: harder to implement I think it is okay to start with table segments granularity and just check how we will perform in cases of lots of small tables and check whether there is any way besides option 3 to deal with it. Both PageStore and WalStore should be "sharded" by this granularity level. ## Security We can generate different IAM keys for each tenant and potentially share them with users (in read-only mode?) or even allow users to provide their S3 buckets credentials. Also, S3 backups are usually encrypted by per-tenant private keys. I'm not sure in what threat model such encryption would improve something (taking into account per-tenant IAM keys), but it seems that everybody is doing that (both AMZN and YNDX).
Most likely that comes as a requirement about "cold backups" by some certification procedure. ## Dynamics ### WAL stream handling When a new WAL record is received we need to parse BufferTags in that record and insert them in PageIndex with WalStoreRef as a value. ### getPage queries Look up the page in PageIndex. If the value is a page reference then just respond with that page. If the referenced value is WAL record then find the most recent page with the same BuffTag (that is why we need ordering in PageIndex); recover it by applying WAL records; save it in PageStore; respond with that page. ### Starting page server without local data * build set of latest full snapshots and incremental snapshots on top of them * load all their metadata into PageIndex * Safekeeper should connect soon and we can ask for a WAL stream starting from the latest incremental snapshot * for databases that are connected to us through the Safekeeper we can start loading the set of the latest snapshots or we can do that lazily based on getPage request (I'd better avoid doing that lazily for now without some access stats from the previous run and just transfer all data for active database from S3 to LocalStore). ### Starting page server with local data (aka restart or reboot) * check that local snapshot files are consistent with S3 ### Snapshot creation Track size of future snapshots based on info in MemStore and when it exceeds some threshold (taking into account our granularity level) create a new incremental snapshot. Always emit incremental snapshots from MemStore. To create a new snapshot we need to walk through WalStore to get the list of all changed pages, sort it, and get the latest versions of that pages from PageStore or by WAL replay. It makes sense to maintain that set in memory while we are receiving the WAL stream to avoid parsing WAL during snapshot creation. Full snapshot creation can be done by GC (or we can call that entity differently -- e.g. merger?) 
by merging the previous full snapshot with several incremental snapshots. ### S3 pushdown When we have several full snapshots GC can push the old one with its increments to S3. ### Branch creation Create a new timeline and replay sliced WAL up to a requested point. When the page is not in PageIndex ask the parent timeline about a page. Relation sizes are tricky. ## File formats As far as I understand Bookfile/Aversion addresses versioning and serialization parts. As for exact data that should go to snapshots I think it is the following for each snapshot: * format version number * set of key/values to interpret content (e.g. is page compression enabled, is that a full or incremental snapshot, previous snapshot id, is there WAL at the end on file, etc) -- it is up to a reader to decide what to do if some keys are missing or some unknown key are present. If we add something backward compatible to the file we can keep the version number. * array of [BuffTag, corresponding offset in file] for pages -- IIUC that is analogous to ToC in Bookfile * array of [(BuffTag, LSN), corresponding offset in file] for the WAL records * pages, one by one * WAL records, one by one It is also important to be able to load metadata quickly since it would be one of the main factors impacting the time of page server start. E.g. if would store/cache about 10TB of data per page server, the size of uncompressed page references would be about 30GB (10TB / ( 8192 bytes page size / ( ~18 bytes per ObjectTag + 8 bytes offset in the file))). 1) Since our ToC/array of entries can be sorted by ObjectTag we can store the whole BufferTag only when relation_id is changed and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages would follow the same order and offset deltas would be small). 2) It makes sense to keep ToC at the beginning of the file to avoid extra seeks to locate it. 
Doesn't matter too much with the local files but matters on S3 -- if we are accessing a lot of ~1Gb files with the size of metadata ~ 1Mb then the time to transfer this metadata would be comparable with access latency itself (which is about a half of a second). So by slurping metadata with one read of file header instead of N reads we can improve the speed of page server start by this N factor. I think both of that optimizations can be done later, but that is something to keep in mind when we are designing our storage serialization routines. Also, there were some discussions about how to embed WAL in incremental snapshots. So far following ideas were mentioned: 1. snapshot lsn=200, includes WAL in range 200-300 2. snapshot lsn=200, includes WAL in range 100-200 3. data snapshots are separated from WAL snapshots Both options 2 and 3 look good. I'm inclined towards option 3 as it would allow us to apply different S3 pushdown strategies for data and WAL files (e.g. we may keep data snapshot until the next full snapshot, but we may push WAL snapshot to S3 just when they appeared if there are no replicas). ================================================ FILE: docs/rfcs/011-retention-policy.md ================================================ # User-visible timeline history The user can specify a retention policy. The retention policy is presented to the user as a PITR period and snapshots. The PITR period is the amount of recent history that needs to be retained, as minutes, hours, or days. Within that period, you can create a branch or snapshot at any point in time, open a compute node, and start running queries. Internally, a PITR period is represented as a range of LSNs The user can also create snapshots. A snapshot is a point in time, internally represented by an LSN. The user gives the snapshot a name. The user can also specify an interval, at which the system creates snapshots automatically. For example, create a snapshot every night at 2 AM. 
After some user-specified time, old automatically created snapshots are removed. Snapshot Snapshot PITR "Monday" "Tuesday" PITR ----######----------+-------------+-------------######> If there are multiple branches, you can specify different policies for different branches. The PITR period and user-visible snapshots together define the retention policy. NOTE: As presented here, this is probably overly flexible. In reality, we want to keep the user interface simple. Only allow a PITR period at the tip of a branch, for example. But that doesn't make much difference to the internals. # Retention policy behind the scenes The retention policy consists of points (for snapshots) and ranges (for PITR periods). The system must be able to reconstruct any page within the retention policy. Other page versions can be garbage collected away. We have a lot of flexibility on when to perform the garbage collection and how aggressive it is. # Base images and WAL slices The page versions are stored in two kinds of files: base images and WAL slices. A base image contains a dump of all the pages of one relation at a specific LSN. A WAL slice contains all the WAL in an LSN range. | | | | --Base img @100 + | | | | WAL slice | | 100-200 | | | --Base img @200 + | | | | WAL slice | | 200-300 | | | + | V To recover a page e.g. at LSN 150, you need the base image at LSN 100, and the WAL slice 100-200. All of this works at a per-relation or per-relation-segment basis. If a relation is updated very frequently, we create base images and WAL slices for it more quickly. For a relation that's updated infrequently, we hold the recent WAL for that relation longer, and only write it out when we need to release the disk space occupied by the original WAL. (We need a backstop like that, because until all the WAL/base images have been durably copied to S3, we must keep the original WAL for that period somewhere, in the WAL service or in S3.)
# Branching Internally, branch points are also "retention points", in addition to the user-visible snapshots. If a branch has been forked off at LSN 100, we need to be able to reconstruct any page on the parent branch at that LSN, because it is needed by the child branch. If a page is modified in the child, we don't need to keep that in the parent anymore, though. ================================================ FILE: docs/rfcs/012-background-tasks.md ================================================ # Eviction Write out in-memory layer to disk, into a delta layer. - To release memory - To make it possible to advance disk_consistent_lsn and allow the WAL service to release some WAL. - Triggered if we are short on memory - Or if the oldest in-memory layer is so old that it's holding back the WAL service from removing old WAL # Materialization Create a new image layer of a segment, by performing WAL redo - To reduce the amount of WAL that needs to be replayed on a GetPage request. - To allow garbage collection of old layers - Triggered by distance to last full image of a page # Coalescing Replace N consecutive layers of a segment with one larger layer. - To reduce the number of small files that needs to be uploaded to S3 # Bundling Zip together multiple small files belonging to different segments. - To reduce the number of small files that needs to be uploaded to S3 # Garbage collection Remove a layer that's older than the GC horizon, and isn't needed anymore. ================================================ FILE: docs/rfcs/013-term-history.md ================================================ # What Currently, apart from WAL safekeeper persistently stores only two logical clock counter (aka term) values, sourced from the same sequence. The first is bumped whenever safekeeper gives vote to proposer (or acknowledges already elected one) and e.g. prevents electing two proposers with the same term -- it is actually called `term` in the code. 
The second, called `epoch`, reflects progress of log receival and this might lag behind `term`; safekeeper switches to epoch `n` when it has received all committed log records from all `< n` terms. This roughly corresponds to what was proposed in https://github.com/neondatabase/rfcs/pull/3/files This makes our biggest difference from Raft. In Raft, every log record is stamped with term in which it was generated; while we essentially store in `epoch` only the term of the highest record on this safekeeper -- when we know it -- because during recovery generally we don't, and `epoch` is bumped directly to the term of the proposer who performs the recovery when it is finished. It is not immediately obvious that this simplification is safe. I thought and I still think it is; model checking confirmed that. However, some details now make me believe it is better to keep full term switching history (which is equivalent to knowing term of each record). # Why Without knowing full history (list of (term, start LSN) pairs) of terms it is hard to determine the exact divergence point, and if we don't perform truncation at that point safety becomes questionable. Consider the following history, with safekeepers A, B, C, D, E. n.m means record created by proposer in term n with LSN m; (t=x, e=y) means safekeeper currently has term x and epoch y. 1) P1 in term 1 writes 1.1 everywhere, which is committed, and some more only on A.
A(t=1, e=1) 1.1 1.2 1.3 1.4
B(t=1, e=1) 1.1
C(t=1, e=1) 1.1
D(t=1, e=1) 1.1
E(t=1, e=1) 1.1
2) P2 is elected by CDE in term 2, epochStartLsn is 2, and writes 2.2, 2.3 on CD:
A(t=1, e=1) 1.1 1.2 1.3 1.4
B(t=1, e=1) 1.1
C(t=2, e=2) 1.1 2.2 2.3
D(t=2, e=2) 1.1 2.2 2.3
E(t=2, e=1) 1.1
3) P3 is elected by CDE in term 3, epochStartLsn is 4, and writes 3.4 on D:
A(t=1, e=1) 1.1 1.2 1.3 1.4
B(t=1, e=1) 1.1
C(t=3, e=2) 1.1 2.2 2.3
D(t=3, e=3) 1.1 2.2 2.3 3.4
E(t=3, e=1) 1.1
Now, A gets back and P3 starts recovering it. How should it proceed? There are two options. ## Don't try to find divergence point at all ...start sending WAL conservatively since the horizon (1.1), and truncate obsolete part of WAL only when recovery is finished, i.e. epochStartLsn (4) is reached, i.e. 2.3 transferred -- that's what https://github.com/neondatabase/neon/pull/505 proposes. Then the following is possible: 4) P3 moves one record 2.2 to A.
A(t=1, e=1) 1.1 2.2 1.3 1.4
B(t=1, e=1) 1.1
C(t=3, e=2) 1.1 2.2 2.3
D(t=3, e=3) 1.1 2.2 2.3 3.4
E(t=3, e=1) 1.1
Now log of A is basically corrupted. Moreover, since ABE are all in epoch 1 and A's log is the longest one, they can elect P4 who will commit such log. Note that this particular history couldn't happen if we forbid to *create* new records in term n until majority of safekeepers switch to it. It would force CDE to switch to 2 before 2.2 is created, and A could never become donor while his log is corrupted. Generally with this additional barrier I believe the algorithm becomes safe, but - I don't like this kind of artificial barrier; - I also feel somewhat uncomfortable about even temporarily having intentionally corrupted WAL; - I'd still model check the idea. ## Find divergence point and truncate at it Then step 4 would delete 1.3 1.4 on A, and we are ok. The question is, how do we do that? Without term switching history we have to resort to sending again since the horizon and memcmp'ing records, which is inefficient and ugly. Or we can maintain full history and determine truncation point by comparing 'wrong' and 'right' histories -- much like pg_rewind does -- and perform truncation + start streaming right there. # Proposal - Add term history as array of pairs to safekeeper controlfile. - Return it to proposer with VoteResponse so 1) proposer can tell it to other nodes and 2) determine personal streaming starting point. However, since we don't append WAL and update controlfile atomically, let's first always update controlfile but send only the history of what we really have (up to highest term in history where begin_lsn >= end of wal; this highest term replaces current `epoch`). We also send end of wal as we do now to determine the donor. - Create ProposerAnnouncement message which proposer sends before starting streaming. It announces proposer as elected and 1) Truncates wrong part of WAL on safekeeper (divergence point is already calculated at proposer, but can be cross-verified here). 2) Communicates the 'right' history of its term (taken from donor).
Seems better to immediately put the history in the controlfile, though safekeeper might not have full WAL for previous terms in it -- this way is simpler, and we can't update WAL and controlfile atomically anyway. This also constitutes analogue of current epoch bump for those safekeepers which don't need recovery, which is important for sync-safekeepers (bump epoch without waiting records from new term). - After ProposerAnnouncement proposer streams WAL since calculated starting point -- only what is missing. pros/cons: + (more) clear safety of WAL truncation -- we get very close to Raft + no unnecessary data sending (faster recovery for not-oldest-safekeepers, matters only for 5+ nodes) + adds some observability at safekeepers - complexity, but not that much # Misc - During model checking I did truncation on first locally non existent or different record -- analogue of 'memcmp' variant described above. ================================================ FILE: docs/rfcs/014-safekeepers-gossip.md ================================================ # Safekeeper gossip Extracted from this [PR](https://github.com/neondatabase/rfcs/pull/13) ## Motivation In some situations, safekeeper (SK) needs coordination with other SK's that serve the same tenant: 1. WAL deletion. SK needs to know what WAL was already safely replicated to delete it. Now we keep WAL indefinitely. 2. Deciding on who is sending WAL to the pageserver. Now sending SK crash may lead to a livelock where nobody sends WAL to the pageserver. 3. To enable SK to SK direct recovery without involving the compute ## Summary Compute node has connection strings to each safekeeper. During each compute->safekeeper connection establishment, the compute node should pass down all that connection strings to each safekeeper. With that info, safekeepers may establish Postgres connections to each other and periodically send ping messages with LSN payload. 
## Components safekeeper, compute, compute<->safekeeper protocol, possibly console (group SK addresses) ## Proposed implementation Each safekeeper can periodically ping all its peers and share connectivity and liveness info. If the ping was not received for, let's say, four ping periods, we may consider the sending safekeeper dead. That would mean some of the alive safekeepers should connect to the pageserver. One way to decide which one exactly: `make_connection = my_node_id == min(alive_nodes)` Since safekeepers are multi-tenant, we may establish either per-tenant physical connections or per-safekeeper ones. So it makes sense to group "logical" connections between corresponding tenants on different nodes into a single physical connection. That means that we should implement an interconnect thread that maintains physical connections and periodically broadcasts info about all tenants. Right now console may assign any 3 SK addresses to a given compute node. That may lead to a high number of gossip connections between SK's. Instead, we can assign safekeeper triples to the compute node. But if we want to "break"/"change" group by an ad-hoc action, we can do it. ### Corner cases - Current safekeeper may be alive but may not have connectivity to the pageserver To address that, we need to gossip visibility info. Based on that info, we may define SK as alive only when it can connect to the pageserver. - Current safekeeper may be alive but may not have connectivity with the compute node. We may broadcast last_received_lsn and presence of compute connection and decide who is alive based on that. - It is tricky to decide when to shut down gossip connections because we need to be sure that pageserver got all the committed (in the distributed sense, so local SK info is not enough) records, and it may never lose them.
It is not a strict requirement since `--sync-safekeepers` that happen before the compute start will allow the pageserver to consume missing WAL, but it is better to do that in the background. So the condition may look like that: `majority_max(flush_lsn) == pageserver_s3_lsn` Here we rely on the two facts: - that `--sync-safekeepers` happened after the compute shutdown, and it advanced local commit_lsn's allowing pageserver to consume that WAL. - we wait for the `pageserver_s3_lsn` advancement to avoid pageserver's last_received_lsn/disk_consistent_lsn going backward due to the disk/hardware failure and subsequent S3 recovery If those conditions are not met, we will have some gossip activity (but that may be okay). ## Pros/cons Pros: - distributed, does not introduce new services (like etcd), does not add console as a storage dependency - lays the foundation for gossip-based recovery Cons: - Only compute knows a set of safekeepers, but they should communicate even without compute node. In case of safekeepers restart, we will lose that info and can't gossip anymore. Hence we can't trim some WAL tail until the compute node start. Also, it is ugly. - If the console assigns a random set of safekeepers to each Postgres, we may end up in a situation where each safekeeper needs to have a connection with all other safekeepers. We can group safekeepers into isolated triples in the console to avoid that. Then "mixing" would happen only if we do rebalancing. ## Alternative implementation We can have a selected node (e.g., console) with everybody reporting to it. ## Security implications We don't increase the attack surface here. Communication can happen in a private network that is not exposed to users. ## Scalability implications The only thing that may grow as we grow the number of computes is the number of gossip connections. But if we group safekeepers and assign a compute node to the random SK triple, the number of connections would be constant. 
================================================ FILE: docs/rfcs/014-storage-lsm.md ================================================ # Why LSM trees? In general, an LSM tree has the nice property that random updates are fast, but the disk writes are sequential. When a new file is created, it is immutable. New files are created and old ones are deleted, but existing files are never modified. That fits well with storing the files on S3. Currently, we create a lot of small files. That is mostly a problem with S3, because each GET/PUT operation is expensive, and LIST operation only returns 1000 objects at a time, and isn't free either. Currently, the files are "archived" together into larger checkpoint files before they're uploaded to S3 to alleviate that problem, but garbage collecting data from the archive files would be difficult and we have not implemented it. This proposal addresses that problem. # Overview ``` ^ LSN | | Memtable: +-----------------------------+ | | | | +-----------------------------+ | | | L0: +-----------------------------+ | | | | +-----------------------------+ | | +-----------------------------+ | | | | +-----------------------------+ | | +-----------------------------+ | | | | +-----------------------------+ | | +-----------------------------+ | | | | +-----------------------------+ | | | L1: +-------+ +-----+ +--+ +-+ | | | | | | | | | | | | | | | | | | | +-------+ +-----+ +--+ +-+ | | +----+ +-----+ +--+ +----+ | | | | | | | | | | | | | | | | | | | +----+ +-----+ +--+ +----+ | +--------------------------------------------------------------> Page ID +---+ | | Layer file +---+ ``` # Memtable When new WAL arrives, it is first put into the Memtable. Despite the name, the Memtable is not a purely in-memory data structure. It can spill to a temporary file on disk if the system is low on memory, and is accessed through a buffer cache. If the page server crashes, the Memtable is lost. 
It is rebuilt by processing again the WAL that's newer than the latest layer in L0. The size of the Memtable is configured by the "checkpoint distance" setting. Because anything that hasn't been flushed to disk and uploaded to S3 yet needs to be kept in the safekeeper, the "checkpoint distance" also determines the amount of WAL that needs to be kept in the safekeeper. # L0 When the Memtable fills up, it is written out to a new file in L0. The files are immutable; when a file is created, it is never modified. Each file in L0 is roughly 1 GB in size (*). Like the Memtable, each file in L0 covers the whole key range. When enough files have been accumulated in L0, compaction starts. Compaction processes all the files in L0 and reshuffles the data to create a new set of files in L1. (*) except in corner cases like if we want to shut down the page server and want to flush out the memtable to disk even though it's not full yet. # L1 L1 consists of ~ 1 GB files like L0. But each file covers only part of the overall key space, and a larger range of LSNs. This speeds up searches. When you're looking for a given page, you need to check all the files in L0, to see if they contain a page version for the requested page. But in L1, you only need to check the files whose key range covers the requested page. This is particularly important at cold start, when checking a file means downloading it from S3. Partitioning by key range also helps with garbage collection. If only a part of the database is updated, we will accumulate more files for the hot part in L1, and old files can be removed without affecting the cold part. # Image layers So far, we've only talked about delta layers. In addition to the delta layers, we create image layers, when "enough" WAL has been accumulated for some part of the database. Each image layer covers a 1 GB range of key space. It contains images of the pages at a single LSN, a snapshot if you will.
The exact heuristic for what "enough" means is not clear yet. Maybe create a new image layer when 10 GB of WAL has been accumulated for a 1 GB segment. The image layers limit the number of layers that a search needs to check. That puts a cap on read latency, and it also allows garbage collecting layers that are older than the GC horizon. # Partitioning scheme When compaction happens and creates a new set of files in L1, how do we partition the data into the files? - Goal is that each file is ~ 1 GB in size - Try to match partition boundaries at relation boundaries. (See [1] for how PebblesDB does this, and for why that's important) - Greedy algorithm # Additional Reading [1] Paper on PebblesDB and how it does partitioning. https://www.cs.utexas.edu/~rak/papers/sosp17-pebblesdb.pdf ================================================ FILE: docs/rfcs/015-storage-messaging.md ================================================ # Storage messaging Created on 19.01.22 Initially created [here](https://github.com/neondatabase/rfcs/pull/16) by @kelvich. It is an alternative to [014-safekeepers-gossip](014-safekeepers-gossip.md) ## Motivation As in 014-safekeeper-gossip we need to solve the following problems: * Trim WAL on safekeepers * Decide on which SK should push WAL to the S3 * Decide on which SK should forward WAL to the pageserver * Decide on when to shut down SK<->pageserver connection This RFC suggests a more generic and hopefully more manageable way to address those problems. However, unlike 014-safekeeper-gossip, it does not bring us any closer to safekeeper-to-safekeeper recovery but rather unties two sets of different issues we previously wanted to solve with gossip. Also, with this approach, we would not need "call me maybe" anymore, and the pageserver will have all the data required to understand that it needs to reconnect to another safekeeper. ## Summary Instead of p2p gossip, let's have a centralized broker where all the storage nodes report per-timeline state.
Each storage node should have a `--broker-url=1.2.3.4` CLI param. Here I propose two ways to do that. After a lot of arguing with myself, I'm leaning towards the etcd approach. My arguments for it are in the pros/cons section. Both options require adding a gRPC client in our codebase either directly or as an etcd dependency. ## Non-goals This RFC does *not* suggest moving the compute to pageserver and compute to safekeeper mappings out of the console. The console is still the only place in the cluster responsible for the persistency of that info. So I'm implying that each pageserver and safekeeper exactly knows what timelines it serves, as it currently is. We need some mechanism for a new pageserver to discover mapping info, but that is out of the scope of this RFC. ## Impacted components pageserver, safekeeper adds either etcd or console as a storage dependency ## Possible implementation: custom message broker in the console We've decided to go with an etcd approach instead of the message broker.
Original suggestion
We can add a Grpc service in the console that acts as a message broker since the console knows the addresses of all the components. The broker can ignore the payload and only redirect messages. So, for example, each safekeeper may send a message to the peering safekeepers or to the pageserver responsible for a given timeline. Message format could be `{sender, destination, payload}`. The destination is either: 1. `sk_#{tenant}_#{timeline}` -- to be broadcasted on all safekeepers, responsible for that timeline, or 2. `pserver_#{tenant}_#{timeline}` -- to be broadcasted on all pageservers, responsible for that timeline Sender is either: 1. `sk_#{sk_id}`, or 2. `pserver_#{pserver_id}` I can think of the following behavior to address our original problems: * WAL trimming Each safekeeper periodically broadcasts `(write_lsn, commit_lsn)` to all peering (peering == responsible for that timeline) safekeepers * Decide on which SK should push WAL to the S3 Each safekeeper periodically broadcasts `i_am_alive_#{current_timestamp}` message to all peering safekeepers. That way, safekeepers may maintain the vector of alive peers (loose one, with false negatives). Alive safekeeper with the minimal id pushes data to S3. * Decide on which SK should forward WAL to the pageserver Each safekeeper periodically sends (write_lsn, commit_lsn, compute_connected) to the relevant pageservers. With that info, pageserver can maintain a view of the safekeepers state, connect to a random one, and detect the moments (e.g., one the safekeepers is not making progress or down) when it needs to reconnect to another safekeeper. Pageserver should resolve exact IP addresses through the console, e.g., exchange `#sk_#{sk_id}` to `4.5.6.7:6400`. Pageserver connection to the safekeeper triggered by the state change `compute_connected: false -> true`. With that, we don't need "call me maybe" anymore. 
Also, we don't have a "peer address amnesia" problem as in the gossip approach (with gossip, after a simultaneous reboot, safekeepers wouldn't know each other addresses until the next compute connection). * Decide on when to shutdown sk<->pageserver connection Again, pageserver would have all the info to understand when to shut down the safekeeper connection. ### Scalability One node is enough (c) No, seriously, it is enough. ### High Availability Broker lives in the console, so we can rely on k8s maintaining the console app alive. If the console is down, we won't trim WAL and reconnect the pageserver to another safekeeper. But, at the same, if the console is down, we already can't accept new compute connections and start stopped computes, so we are making things a bit worse, but not dramatically. ### Interactions ``` .________________. sk_1 <-> | | <-> pserver_1 ... | Console broker | ... sk_n <-> |________________| <-> pserver_m ```
## Implementation: etcd state store Alternatively, we can set up `etcd` and maintain the following data structure in it: ```ruby "compute_#{tenant}_#{timeline}" => { safekeepers => { "sk_#{sk_id}" => { write_lsn: "0/AEDF130", commit_lsn: "0/AEDF100", compute_connected: true, last_updated: 1642621138, }, } } ``` As etcd doesn't support field updates in the nested objects that translates to the following set of keys: ```ruby "compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/write_lsn", "compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/commit_lsn", ... ``` Each storage node can subscribe to the relevant sets of keys and maintain a local view of that structure. So in terms of the data flow, everything is the same as in the previous approach. Still, we can avoid implementing the message broker and prevent runtime storage dependency on a console. ### Safekeeper address discovery During the startup safekeeper should publish the address he is listening on as the part of `{"sk_#{sk_id}" => ip_address}`. Then the pageserver can resolve `sk_#{sk_id}` to the actual address. This way it would work both locally and in the cloud setup. Safekeeper should have `--advertised-address` CLI option so that we can listen on e.g. 0.0.0.0 but advertise something more useful. ### Safekeeper behavior For each timeline safekeeper periodically broadcasts `compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/*` fields. It subscribes to changes of `compute_#{tenant}_#{timeline}` -- that way safekeeper will have an information about peering safekeepers. That amount of information is enough to properly trim WAL. To decide on who is pushing the data to S3 safekeeper may use etcd leases or broadcast a timestamp and hence track who is alive. ### Pageserver behavior Pageserver subscribes to `compute_#{tenant}_#{timeline}` for each tenant it owns. 
With that info, pageserver can maintain a view of the safekeepers' state, connect to a random one, and detect the moments (e.g., one of the safekeepers is not making progress or is down) when it needs to reconnect to another safekeeper. Pageserver should resolve exact IP addresses through the console, e.g., exchange `#sk_#{sk_id}` to `4.5.6.7:6400`. Pageserver connection to the safekeeper can be triggered by the state change `compute_connected: false -> true`. With that, we don't need "call me maybe" anymore. As an alternative to compute_connected, we can track the timestamp of the latest message that arrived at the safekeeper from compute. Usually compute broadcasts KeepAlive to all safekeepers every second, so it'll be updated every second when the connection is ok. Then the connection can be considered down when this timestamp isn't updated for several seconds. This will help to detect issues with a safekeeper faster (and switch to another) in the following cases: 1. when compute failed but the TCP connection stays alive until timeout (usually about a minute) 2. when safekeeper failed and didn't set compute_connected to false Another way to deal with [2] is to process (write_lsn, commit_lsn, compute_connected) as a KeepAlive on the pageserver side and detect issues when sk_id doesn't send anything for some time. This way is fully compliant with this RFC. Also, we don't have a "peer address amnesia" problem as in the gossip approach (with gossip, after a simultaneous reboot, safekeepers wouldn't know each other's addresses until the next compute connection). ### Interactions ``` .________________. sk_1 <-> | | <-> pserver_1 ... | etcd | ...
sk_n <-> |________________| <-> pserver_m ``` ### Sequence diagrams for different workflows #### Cluster startup ```mermaid sequenceDiagram autonumber participant C as Compute participant SK1 participant SK2 participant SK3 participant PS1 participant PS2 participant O as Orchestrator participant M as Metadata Service PS1->>M: subscribe to updates to state of timeline N C->>+SK1: WAL push loop constantly update current lsns SK1->>-M: I'm at lsn A end C->>+SK2: WAL push loop constantly update current lsns SK2->>-M: I'm at lsn B end C->>+SK3: WAL push loop constantly update current lsns SK3->>-M: I'm at lsn C end loop request pages C->>+PS1: get_page@lsn PS1->>-C: page image end M->>PS1: New compute appeared for timeline N. SK1 at A, SK2 at B, SK3 at C note over PS1: Say SK1 at A=200, SK2 at B=150 SK3 at C=100
so connect to SK1 because it is the most up to date one PS1->>SK1: start replication ``` #### Behaviour of services during typical operations ```mermaid sequenceDiagram autonumber participant C as Compute participant SK1 participant SK2 participant SK3 participant PS1 participant PS2 participant O as Orchestrator participant M as Metadata Service note over C,M: Scenario 1: Pageserver checkpoint note over PS1: Upload data to S3 PS1->>M: Update remote consistent lsn M->>SK1: propagate remote consistent lsn update note over SK1: truncate WAL up to remote consistent lsn M->>SK2: propagate remote consistent lsn update note over SK2: truncate WAL up to remote consistent lsn M->>SK3: propagate remote consistent lsn update note over SK3: truncate WAL up to remote consistent lsn note over C,M: Scenario 2: SK1 finds itself lagging behind MAX(150 (SK2), 200 (SK2)) - 100 (SK1) > THRESHOLD SK1->>SK2: Fetch WAL delta between 100 (SK1) and 200 (SK2) note over C,M: Scenario 3: PS1 detects that SK1 is lagging behind: Connection from SK1 is broken or there is no messages from it in 30 seconds. note over PS1: e.g. 
SK2 is at 150, SK3 is at 100, chose SK2 as a new replication source PS1->>SK2: start replication ``` #### Behaviour during timeline relocation ```mermaid sequenceDiagram autonumber participant C as Compute participant SK1 participant SK2 participant SK3 participant PS1 participant PS2 participant O as Orchestrator participant M as Metadata Service note over C,M: Timeline is being relocated from PS1 to PS2 O->>+PS2: Attach timeline PS2->>-O: 202 Accepted if timeline exists in S3 note over PS2: Download timeline from S3 note over O: Poll for timeline download (or subscribe to metadata service) loop wait for attach to complete O->>PS2: timeline detail should answer that timeline is ready end PS2->>M: Register downloaded timeline PS2->>M: Get safekeepers for timeline, subscribe to changes PS2->>SK1: Start replication to catch up note over O: PS2 caught up, time to switch compute O->>C: Restart compute with new pageserver url in config note over C: Wal push is restarted loop request pages C->>+PS2: get_page@lsn PS2->>-C: page image end O->>PS1: detach timeline note over C,M: Scenario 1: Attach call failed O--xPS2: Attach timeline note over O: The operation can be safely retried,
if we hit some threshold we can try another pageserver note over C,M: Scenario 2: Attach succeeded but pageserver failed to download the data or start replication loop wait for attach to complete O--xPS2: timeline detail should answer that timeline is ready end note over O: Can wait for a timeout, and then try another pageserver
there should be a limit on number of different pageservers to try note over C,M: Scenario 3: Detach fails O--xPS1: Detach timeline note over O: can be retried, if continues to fail might lead to data duplication in s3 ``` # Pros/cons ## Console broker/etcd vs gossip: Gossip pros: * gossip allows running storage without the console or etcd Console broker/etcd pros: * simpler * solves "call me maybe" as well * avoid possible N-to-N connection issues with gossip without grouping safekeepers in pre-defined triples ## Console broker vs. etcd: Initially, I wanted to avoid etcd as a dependency mostly because I've seen how painful the ZooKeeper dependency was for ClickHouse: in each chat, at each conference, people were complaining about configuration and maintenance barriers with ZooKeeper. It was so bad that ClickHouse re-implemented ZooKeeper to embed it: https://clickhouse.com/docs/en/operations/clickhouse-keeper/. But with etcd we are in a somewhat different situation: 1. We don't need persistence and strong consistency guarantees for the data we store in etcd 2. etcd uses gRPC as a protocol, and messages are pretty simple So it looks like implementing an in-memory store with the etcd interface is a straightforward thing _if we want that in the future_. At the same time, we can avoid implementing it right now, and we will be able to run a local neon installation with etcd running somewhere in the background (as opposed to building and running the console, which in turn requires Postgres). ================================================ FILE: docs/rfcs/016-connection-routing.md ================================================ # Dispatching a connection For each client connection, Neon service needs to authenticate the connection, and route it to the right PostgreSQL instance.
## Authentication There are three different ways to authenticate: - anonymous; no authentication needed - PostgreSQL authentication - github single sign-on using browser In anonymous access, the user doesn't need to perform any authentication at all. This can be used e.g. in interactive PostgreSQL documentation, allowing you to run the examples very quickly. Similar to sqlfiddle.com. PostgreSQL authentication works the same as always. All the different PostgreSQL authentication options like SCRAM, kerberos, etc. are available. [1] The third option is to authenticate with github single sign-on. When you open the connection in psql, you get a link that you open with your browser. Opening the link redirects you to github authentication, and lets the connection to proceed. This is also known as "Link auth" [2]. ## Routing the connection When a client starts a connection, it needs to be routed to the correct PostgreSQL instance. Routing can be done by the proxy, acting as a man-in-the-middle, or the connection can be routed at the network level based on the hostname or IP address. Either way, Neon needs to identify which PostgreSQL instance the connection should be routed to. If the instance is not already running, it needs to be started. Some connections always require a new PostgreSQL instance to be created, e.g. if you want to run a one-off query against a particular point-in-time. The PostgreSQL instance is identified by: - Neon account (possibly anonymous) - cluster (known as tenant in the storage?) - branch or snapshot name - timestamp (PITR) - primary or read-replica - one-off read replica - one-off writeable branch When you are using regular PostgreSQL authentication or anonymous access, the connection URL needs to contain all the information needed for the routing. With github single sign-on, the browser is involved and some details - the Neon account in particular - can be deduced from the authentication exchange. 
There are three methods for identifying the PostgreSQL instance: - Browser interaction (link auth) - Options in the connection URL and the domain name - A pre-defined endpoint, identified by domain name or IP address ### Link Auth postgres://<user>@start.neon.tech/<dbname> This gives you a link that you open in a browser. Clicking the link performs GitHub authentication, and the Neon account name is provided to the proxy behind the scenes. The proxy routes the connection to the primary PostgreSQL instance in the cluster called "main", branch "main". Further ideas: - You could pre-define a different target for link auth connections in the UI. - You could have a drop-down in the browser, allowing you to connect to any cluster you want. Link Auth can be like Teleport. ### Connection URL The connection URL looks like this: postgres://<user>@<cluster-id>.db.neon.tech/<dbname> By default, this connects you to the primary PostgreSQL instance running on the "main" branch in the named cluster [3]. However, you can change that by specifying options in the connection URL. The following options are supported: | option name | Description | Examples | | --- | --- | --- | | cluster | Cluster name | cluster:myproject | | branch | Branch name | branch:main | | timestamp | Connect to an instance at given point-in-time. | timestamp:2022-04-08 timestamp:2022-04-08T11:42:16Z | | lsn | Connect to an instance at given LSN | lsn:0/12FF0420 | | read-replica | Connect to a read-replica. If the parameter is 'new', a new instance is created for this session. | read-replica read-replica:new | For example, to read branch 'testing' as it was on Mar 31, 2022, you could specify a timestamp in the connection URL [4]: postgres://alice@cluster-1234.db.neon.tech/postgres?options=branch:testing,timestamp:2022-03-31 Connecting with cluster name and options can be disabled in the UI. If disabled, you can only connect using a pre-defined endpoint.
### Pre-defined Endpoint Instead of providing the cluster name, branch, and all those options in the connection URL, you can define a named endpoint with the same options. In the UI, click "create endpoint". Fill in the details: - Cluster name - Branch - timestamp or LSN - is this for the primary or for a read replica - etc. When you click Finish, a named endpoint is created. You can now use the endpoint ID to connect: postgres://<user>@<endpoint-id>.endpoint.neon.tech/<dbname> An endpoint can be assigned a static or dynamic IP address, so that you can connect to it with clients that don't support TLS SNI. Maybe bypass the proxy altogether, but that ought to be invisible to the user. You can limit the range of source IP addresses that are allowed to connect to an endpoint. An endpoint can also be exposed in an Amazon VPC, allowing direct connections from applications. # Footnotes [1] I'm not sure how feasible it is to set up and configure something like Kerberos or LDAP in a cloud environment. But in principle I think we should allow customers to have the full power of PostgreSQL, including all authentication options. However, it's up to the customer to configure it correctly. [2] Link is a way to both authenticate and route the connection. [3] This assumes that cluster-ids are globally unique, across all Neon accounts. [4] The syntax accepted in the connection URL is limited by libpq. The only way to pass arbitrary options to the server (or our proxy) is with the "options" keyword, and the options must be percent-encoded. I think the above would work, but I haven't tested it. ================================================ FILE: docs/rfcs/017-console-split.md ================================================ # Splitting cloud console Created on 17.06.2022 ## Summary Currently we have `cloud` repository that contains code implementing public API for our clients as well as code for managing storage and internal infrastructure services.
We can split everything user-related from everything storage-related to make it easier to test and maintain. This RFC proposes to introduce a new control-plane service with HTTP API. The overall architecture will look like this: ```markup . x external area x internal area (our clients) x (our services) x x ┌───────────────────────┐ x ┌───────────────┐ > ┌─────────────────────┐ │ Storage (EC2) │ x │ console db │ > │ control-plane db │ │ │ x └───────────────┘ > └─────────────────────┘ │ - safekeepers │ x ▲ > ▲ │ - pageservers │ x │ > │ │ │ ┌──────────────────┐ x ┌───────┴───────┐ > │ │ Dependencies │ │ browser UI ├──►│ │ > ┌──────────┴──────────┐ │ │ └──────────────────┘ x │ │ > │ │ │ - etcd │ x │ console ├───────►│ control-plane ├────►│ - S3 │ ┌──────────────────┐ x │ │ > │ (deployed in k8s) │ │ - more? │ │public API clients├──►│ │ > │ │ │ │ └──────────────────┘ x └───────┬───────┘ > └──────────┬──────────┘ └───────────────────────┘ x │ > ▲ │ ▲ x │ > │ │ │ x ┌───────┴───────┐ > │ │ ┌───────────┴───────────┐ x │ dependencies │ > │ │ │ │ x │- analytics │ > │ └───────────────►│ computes │ x │- auth │ > │ │ (deployed in k8s) │ x │- billing │ > │ │ │ x └───────────────┘ > │ └───────────────────────┘ x > │ ▲ x > ┌─────┴───────────────┐ │ ┌──────────────────┐ x > │ │ │ │ │ x > │ proxy ├─────────────────┘ │ postgres ├───────────────────────────►│ (deployed in k8s) │ │ users │ x > │ │ │ │ x > └─────────────────────┘ └──────────────────┘ x > > > closed-source > open-source > > ``` Notes: - diagram is simplified in the less-important places - directed arrows are strict and mean that connections in the reverse direction are forbidden This split is quite complex and this RFC proposes several smaller steps to achieve the larger goal: 1. Start by refactoring the console code, the goal is to have console and control-plane code in the different directories without dependencies on each other. 2. 
Do similar refactoring for tables in the console database, remove queries selecting data from both console and control-plane; move control-plane tables to a separate database. 3. Implement control-plane HTTP API serving on a separate TCP port; make all console→control-plane calls to go through that HTTP API. 4. Move control-plane source code to the neon repo; start control-plane as a separate service. ## Motivation These are the two most important problems we want to solve: - Publish open-source implementation of all our cloud/storage features - Make a unified control-plane that is used in all cloud (serverless) and local (tests) setups Right now we have some closed-source code in the cloud repo. That code contains implementation for running Neon computes in k8s and without that code it’s impossible to automatically scale PostgreSQL computes. That means that we don’t have an open-source serverless PostgreSQL at the moment. After splitting and open-sourcing control-plane service we will have source code and Docker images for all storage services. That control-plane service should have HTTP API for creating and managing tenants (including all our storage features), while proxy will listen for incoming connections and create computes on-demand. Improving our test suite is an important task, but requires a lot of prerequisites and may require a separate RFC. Possible implementation of that is described in the section [Next steps](#next-steps). Another piece of motivation can be a better involvement of storage development team into a control-plane. By splitting control-plane from the console, it can be more convenient to test and develop control-plane with paying less attention to “business” features, such as user management, billing and analytics. For example, console currently requires authentication providers such as GitHub OAuth to work at all, as well as nodejs to be able to build it locally. 
It will be more convenient to build and run it locally without these requirements. ## Proposed implementation ### Current state of things Let’s start with defining the current state of things at the moment of this proposal. We have three repositories containing source code: - open-source `postgres` — our fork of postgres - open-source `neon` — our main repository for storage source code - closed-source `cloud` — mostly console backend and UI frontend This proposal aims not to change anything at the existing code in `neon` and `postgres` repositories, but to create control-plane service and move it’s source code from `cloud` to the `neon` repository. That means that we need to split code in `cloud` repo only, and will consider only this repository for exploring its source code. Let’s look at the miscellaneous things in the `cloud` repo which are NOT part of the console application, i.e. NOT the Go source code that is compiled to the `./console` binary. There we have: - command-line tools, such as cloudbench, neonadmin - markdown documentation - cloud operations scripts (helm, terraform, ansible) - configs and other things - e2e python tests - incidents playbooks - UI frontend - Make build scripts, code generation scripts - database migrations - swagger definitions And also let’s take a look at what we have in the console source code, which is the service we’d like to split: - API Servers - Public API v2 - Management API v2 - Public API v1 - Admin API v1 (same port as Public API v1) - Management API v1 - Workers - Monitor Compute Activity - Watch Failed Operations - Availability Checker - Business Metrics Collector - Internal Services - Auth Middleware, UserIsAdmin, Cookies - Cable Websocket Server - Admin Services - Global Settings, Operations, Pageservers, Platforms, Projects, Safekeepers, Users - Authenticate Proxy - API Keys - App Controller, serving UI HTML - Auth Controller - Branches - Projects - Psql Connect + Passwordless login - Users - Cloud Metrics - User 
Metrics - Invites - Pageserver/Safekeeper management - Operations, k8s/docker/common logic - Platforms, Regions - Project State - Projects Roles, SCRAM - Global Settings - Other things - segment analytics integration - sentry integration - other common utilities packages ### Drawing the splitting line The most challenging and the most important thing is to define the line that will split the new control-plane service from the existing cloud service. If we don’t get it right, then we can end up having a lot more issues without many benefits. We propose to define that line as follows: - everything user-related stays in the console service - everything storage-related should be in the control-plane service - something that falls in between should be decided where to go, but most likely should stay in the console service - some similar parts should be in both services, such as admin/management/db_migrations We call user-related all requests that can be connected to some user. The general idea is to not have any user_id in the control-plane service and to operate exclusively on tenant_id+timeline_id, the same way as existing storage services work now (compute, safekeeper, pageserver). Storage-related things can be defined as doing any of the following: - using k8s API - doing requests to any of the storage services (proxy, compute, safekeeper, pageserver, etc.)
- tracking current status of tenants/timelines, managing lifetime of computes Based on that idea, we can say that new control-plane service should have the following components: - single HTTP API for everything - Create and manage tenants and timelines - Manage global settings and storage configuration (regions, platforms, safekeepers, pageservers) - Admin API for storage health inspection and debugging - Workers - Monitor Compute Activity - Watch Failed Operations - Availability Checker - Internal Services - Admin Services - Global Settings, Operations, Pageservers, Platforms, Tenants, Safekeepers - Authenticate Proxy - Branches - Psql Connect - Cloud Metrics - Pageserver/Safekeeper management - Operations, k8s/docker/common logic - Platforms, Regions - Tenant State - Compute Roles, SCRAM - Global Settings --- And other components should probably stay in the console service: - API Servers (no changes here) - Public API v2 - Management API v2 - Public API v1 - Admin API v1 (same port as Public API v1) - Management API v1 - Workers - Business Metrics Collector - Internal Services - Auth Middleware, UserIsAdmin, Cookies - Cable Websocket Server - Admin Services - Users admin stays the same - Other admin services can redirect requests to the control-plane - API Keys - App Controller, serving UI HTML - Auth Controller - Projects - User Metrics - Invites - Users - Passwordless login - Other things - segment analytics integration - sentry integration - other common utilities packages There are also miscellaneous things that are useful for all kinds of services. So we can say that these things can be in both services: - markdown documentation - e2e python tests - make build scripts, code generation scripts - database migrations - swagger definitions The single entrypoint to the storage should be control-plane API. After we define that API, we can have code-generated implementation for the client and for the server. 
The general idea is to move code implementing storage components from the console to the API implementation inside the new control-plane service. After the code is moved to the new service, we can fill the created void by making API calls to the new service: - authorization of the client - mapping user_id + project_id to the tenant_id - calling the control-plane API ### control-plane API Currently we have the following projects API in the console: ``` GET /projects/{project_id} PATCH /projects/{project_id} POST /projects/{project_id}/branches GET /projects/{project_id}/databases POST /projects/{project_id}/databases GET /projects/{project_id}/databases/{database_id} PUT /projects/{project_id}/databases/{database_id} DELETE /projects/{project_id}/databases/{database_id} POST /projects/{project_id}/delete GET /projects/{project_id}/issue_token GET /projects/{project_id}/operations GET /projects/{project_id}/operations/{operation_id} POST /projects/{project_id}/query GET /projects/{project_id}/roles POST /projects/{project_id}/roles GET /projects/{project_id}/roles/{role_name} DELETE /projects/{project_id}/roles/{role_name} POST /projects/{project_id}/roles/{role_name}/reset_password POST /projects/{project_id}/start POST /projects/{project_id}/stop POST /psql_session/{psql_session_id} ``` It looks fine and we probably already have clients relying on it. So we should not change it, at least for now. 
But most of these endpoints (if not all) are related to storage, and it can suggest us what control-plane API should look like: ``` GET /tenants/{tenant_id} PATCH /tenants/{tenant_id} POST /tenants/{tenant_id}/branches GET /tenants/{tenant_id}/databases POST /tenants/{tenant_id}/databases GET /tenants/{tenant_id}/databases/{database_id} PUT /tenants/{tenant_id}/databases/{database_id} DELETE /tenants/{tenant_id}/databases/{database_id} POST /tenants/{tenant_id}/delete GET /tenants/{tenant_id}/issue_token GET /tenants/{tenant_id}/operations GET /tenants/{tenant_id}/operations/{operation_id} POST /tenants/{tenant_id}/query GET /tenants/{tenant_id}/roles POST /tenants/{tenant_id}/roles GET /tenants/{tenant_id}/roles/{role_name} DELETE /tenants/{tenant_id}/roles/{role_name} POST /tenants/{tenant_id}/roles/{role_name}/reset_password POST /tenants/{tenant_id}/start POST /tenants/{tenant_id}/stop POST /psql_session/{psql_session_id} ``` One of the options here is to use gRPC instead of the HTTP, which has some useful features, but there are some strong points towards using plain HTTP: - HTTP API is easier to use for the clients - we already have HTTP API in pageserver/safekeeper/console - we probably want control-plane API to be similar to the console API, available in the cloud ### Getting updates from the storage There can be some valid cases, when we would like to know what is changed in the storage. For example, console might want to know when user has queried and started compute and when compute was scaled to zero after that, to know how much user should pay for the service. Another example is to get info about reaching the disk space limits. Yet another example is to do analytics, such as how many users had at least one active project in a month. All of the above cases can happen without using the console, just by accessing compute through the proxy. To solve this, we can have a log of events occurring in the storage (event logs). 
That is very similar to operations table we have right now, the only difference is that events are immutable and we cannot change them after saving to the database. For example, we might want to have events for the following activities: - We finished processing some HTTP API query, such as resetting the password - We changed some state, such as started or stopped a compute - Operation is created - Operation is started for the first time - Operation is failed for the first time - Operation is finished Once we save these events to the database, we can create HTTP API to subscribe to these events. That API can look like this: ``` GET /events/ { "events": [...], "next_cursor": 123 } ``` It should be possible to replay event logs from some point of time, to get a state of almost anything from the storage services. That means that if we maintain some state in the control-plane database and we have a reason to have the same state in the console database, it is possible by polling events from the control-plane API and changing the state in the console database according to the events. ### Next steps After implementing control-plane HTTP API and starting control-plane as a separate service, we might want to think of exploiting benefits of the new architecture, such as reorganizing test infrastructure. Possible options are listed in the [Next steps](#next-steps-1). ## Non Goals RFC doesn’t cover the actual cloud deployment scripts and schemas, such as terraform, ansible, k8s yaml’s and so on. ## Impacted components Mostly console, but can also affect some storage service. ## Scalability We should support starting several instances of the new control-plane service at the same time. At the same time, it should be possible to use only single instance of control-plane, which can be useful for local tests. ## Security implications New control-plane service is an internal service, so no external requests can reach it. 
But at the same time, it contains an API to do absolutely anything with any of the tenants. That means that a bad internal actor can potentially read and write all of the tenants. To make this safer, we can have one of these: - Simple option is to protect all requests with a single private key, so that no one can make requests without having that one key. - Another option is to have a separate token for every tenant and store these tokens in another secure place. This way it’s harder to access all tenants at once, because they have different tokens. ## Alternative implementation There was an idea to create a k8s operator for managing storage services and computes, but the author of this RFC is not really familiar with it. As for smaller alternatives, there are other options for the name of the new control-plane service: - storage-ctl - cloud - cloud-ctl ## Pros/cons of proposed approaches (TODO) Pros: - All storage features are completely open-source - Better test coverage, less difference between cloud and local setups - Easier to develop storage and cloud features, because there is no need to set up the console for that - Easier to deploy storage-only services to any cloud Cons: - All storage features are completely open-source - Distributed services mean more code to connect different services and potential network issues - Console needs to have a dependency on storage API, there can be complications with developing new feature in a branch - More code to JOIN data from different services (console and control-plane) ## Definition of Done We have a new control-plane service running in k8s. Source code for that control-plane service is located in the open-source neon repo. ## Next steps After we’ve reached DoD, we can make further improvements. The first thing that can benefit from the split is local testing. The same control-plane service can implement starting computes as local processes instead of k8s deployments.
If it will also support starting pageservers/safekeepers/proxy for the local setup, then it can completely replace `./neon_local` binary, which is currently used for testing. The local testing environment can look like this: ``` ┌─────────────────────┐ ┌───────────────────────┐ │ │ │ Storage (local) │ │ control-plane db │ │ │ │ (local process) │ │ - safekeepers │ │ │ │ - pageservers │ └──────────▲──────────┘ │ │ │ │ Dependencies │ ┌──────────┴──────────┐ │ │ │ │ │ - etcd │ │ control-plane ├────►│ - S3 │ │ (local process) │ │ - more? │ │ │ │ │ └──────────┬──────────┘ └───────────────────────┘ ▲ │ ▲ │ │ │ │ │ ┌───────────┴───────────┐ │ │ │ │ │ └───────────────►│ computes │ │ │ (local processes) │ │ │ │ ┌──────┴──────────────┐ └───────────────────────┘ │ │ ▲ │ proxy │ │ │ (local process) ├─────────────────┘ │ │ └─────────────────────┘ ``` The key thing here is that control-plane local service have the same API and almost the same implementation as the one deployed in the k8s. This allows to run the same e2e tests against both cloud and local setups. For the python test_runner tests everything can stay mostly the same. To do that, we just need to replace `./neon_local` cli commands with API calls to the control-plane. The benefit here will be in having fast local tests that are really close to our cloud setup. Bugs in k8s queries are still cannot be found when running computes as a local processes, but it should be really easy to start k8s locally (for example in k3s) and run the same tests with control-plane connected to the local k8s. Talking about console and UI tests, after the split there should be a way to test these without spinning up all the storage locally. New control-plane service has a well-defined API, allowing us to mock it. This way we can create UI tests to verify the right calls are issued after specific UI interactions and verify that we render correct messages when API returns errors. 
================================================ FILE: docs/rfcs/017-timeline-data-management.md ================================================ # Name Tenant and timeline data management in pageserver ## Summary This RFC attempts to describe timeline-related data management as it's done now in pageserver, highlight current complexities caused by this and propose a set of changes to mitigate them. The main goal is to prepare for future [on-demand layer downloads](https://github.com/neondatabase/neon/issues/2029), yet timeline data is one of the core primitives of pageserver, so a number of other RFCs are affected as well. Due to that, this document won't have a single implementation, rather requiring a set of code changes to achieve the final state. The RFC considers the repository at the `main` branch, commit [`28243d68e60ffc7e69f158522f589f7d2e09186d`](https://github.com/neondatabase/neon/tree/28243d68e60ffc7e69f158522f589f7d2e09186d) at the time of writing. ## Motivation In recent discussions, it became clearer that timeline-related code becomes harder to change: it consists of multiple disjoint modules, each requiring synchronization to access. The lower the code is, the more complex the sync gets since many concurrent processes are involved and require orchestration to keep the data consistent. As the number of modules and isolated data grows per timeline, more questions and corner cases arise: - https://github.com/neondatabase/neon/issues/1559 right now it's not straightened out what to do when the synchronization task fails too many times: every separate module's data has to be treated differently.
- https://github.com/neondatabase/neon/issues/1751 GC and compaction file activities are not well known outside their tasks code, causing race bugs - https://github.com/neondatabase/neon/issues/2003 Even the tenant management gets affected: we have to alter its state based on timeline state, yet the data for making the decision is separated and the synchronisation logic has bugs - more issues were brought in discussions, but apparently they were too specific to the code to mention them in the issues. For instance, `tenant_mgr` itself is a static object that we can not mock anyhow, which reduces our capabilities to test the data synchronization logic. In fact, we have zero Rust tests that cover the case of synchronizing more than one module's data. On demand layer downloads would require us to dynamically manage the layer files, which we are almost not doing at all on the module level, resulting in most of their APIs dealing with timelines, rather than the layer files. The disjoint data would require data synchronization with possibly a chain of lock acquisitions, some async and some sync, and it would be hard to unit test it with the current code state. This neither helps to easily start the on-demand download epic, nor makes it easy to add more timeline-related code on top, whatever the task is. We have to develop a vision on a number of topics before progressing safely: - timeline and tenant data structure and how should we access it - sync and async worlds and in what way that should evolve - unit tests for the complex logic This RFC aims to provide a general overview of the existing situation and propose ways to improve it. The changes proposed are quite big and no single PR is expected to do the adjustments, they should gradually be done during the on-demand download work later. ## What is a timeline and its data First, we need to define what data we want to manage per timeline. 
Currently, the data every timeline operates is: - a set of layer files, on the FS Never updated files, created after pageserver's checkpoints and compaction runs, can be removed from the local FS due to compaction, gc or timeline deletion. - a set of layer files, on the remote storage Identically named and placed in tenant subdirectories files on the remote storage (S3), copied by a special background sync thread - a `metadata` file, on the FS Updated after every checkpoint with the newer `disk_consistent_lsn` and `latest_gc_cutoff_lsn` values. Used to quickly restore timeline's basic metadata on pageserver restart. Also contains data about the ancestor, if the timeline was branched off another timeline. - an `index_part.json` file, on the remote storage Contains `metadata` file contents and a list of layer files, available in the current S3 "directory" for the timeline. Used to avoid potentially slow and expensive `S3 list` command, updated by the remote storage sync thread after every operation with the remote layer files. - LayerMap and PageCache, in memory Dynamic, used to store and retrieve the page data to users. - timeline info, in memory LSNs, walreceiver data, `RemoteTimelineIndex` and other data to share via HTTP API and internal processes. - metrics data, in memory Data to push or provide to Prometheus, Opentelemetry, etc. Besides the data, every timeline currently needs an etcd connection to receive WAL events and connect to safekeepers. Timeline could be an ancestor to another one, forming a dependency tree, which is implicit right now: every time relations are looked up in place, based on the corresponding `TimelineMetadata` struct contents. Yet, there's knowledge on a tenant as a group of timelines, belonging to a single user which is used in GC and compaction tasks, run on every tenant. `tenant_mgr` manages tenant creation and its task startup, along with the remote storage sync for timeline layers. 
Last file being managed per-tenant is the tenant config file, created and updated on the local FS to hold tenant-specific configuration between restarts. It's not yet anyhow synchronized with the remote storage, so only exists on the local FS. ### How the data is stored We have multiple places where timeline data is stored: - `tenant_mgr` [holds](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L43) a static `static ref TENANTS: RwLock>` with the `Tenant` having the `local_timelines: HashMap>` inside - same `Tenant` above has actually two references to timelines: another via its `repo: Arc` with `pub type RepositoryImpl = LayeredRepository;` that [holds](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/layered_repository.rs#L178) `Mutex>` - `RemoteTimelineIndex` [contains](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/storage_sync/index.rs#L84) the metadata about timelines on the remote storage (S3) for sync reasons and possible HTTP API queries - `walreceiver` [stores](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/walreceiver.rs#L60) the metadata for possible HTTP API queries and its [internal state](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/walreceiver/connection_manager.rs#L245) with a reference to the timeline, its current connections and etcd subscription (if any) - `PageCache` contains timeline-related data, and is created globally for the whole pageserver - implicitly, we also have files on local FS, that contain timeline state. We operate on those files and for some operations (GC, compaction) yet we don't anyhow synchronize the access to the files per se: there are more high-level locks, ensuring only one of a group of operations is running at a time. 
On practice though, `LayerMap` and layer files are tightly coupled together: current low-level code requires a timeline to be loaded into the memory to work with it, and the code removes the layer files after removing the entry from the `LayerMap` first. Based on this, a high-level pageserver's module diagram with data and entities could be: ![timeline tenant state diagram](./images/017-timeline-data-management/timeline_tenant_state.svg) A few comments on the diagram: - the diagram does not show all the data and replaces a few newtypes and type aliases (for example, completely ignores "unloaded" timelines due to reasons described below) It aims to show main data and means of synchronizing it. - modules tend to isolate their data inside and provide access to it via API Due to multitenancy, that results in a common pattern for storing both tenant and timeline data: `RwLock` or `Mutex` around the `HashMap`, gc and compaction tasks also use the same lock pattern to ensure no concurrent runs are happening. - part of the modules is asynchronous, while the other is not, that complicates the data access Currently, anything that's not related to tasks (walreceiver, storage sync, GC, compaction) is blocking. Async tasks that try to access the data in the sync world, have to call `std::sync::Mutex::lock` method, which blocks the thread the callee async task runs on, also blocking other async tasks running in the same thread. Methods of `std::sync::RwLock` have the same issues, forcing async tasks either to block or spawn another, "blocking" task on a separate thread. Sync tasks that try to access the data in the async world, cannot use `.await` hence have to have some `Runtime` doing those calls for them. [`tokio::sync::Mutex`](https://docs.rs/tokio/1.19.2/tokio/sync/struct.Mutex.html#method.blocking_lock) and [`tokio::sync::RwLock`](https://docs.rs/tokio/1.19.2/tokio/sync/struct.RwLock.html#method.blocking_read) provide an API to simplify such calls. 
Similarly, both `std::sync` and `tokio::sync` have channels that are able to communicate into one direction without blocking and requiring `.await` calls, hence can be used to connect both worlds without locking. Some modules are in transition, started as async "blocking" tasks and being fully synchronous in their entire code below the start. Current idea is to transfer them to the async further, but it's not yet done. - locks are used in two different ways: - `RwLock>` ones to hold the shared data and ensure its atomic updates - `Mutex<()>` for synchronizing the tasks, used to implicitly order the data access The "shared data" locks of the first kind are mainly accessed briefly to either look up or alter the data, yet there are a few notable exceptions, such as `latest_gc_cutoff_lsn: RwLock` that is explicitly held in a few places to prevent GC thread from progressing. Those are covered later in the data access diagrams. - some synchronizations are not yet implemented E.g. asynchronous storage sync module does not synchronize with almost synchronous GC and compaction tasks when the layer files are uploaded to the remote storage. That occasionally results in the files being deleted before the storage upload task is run for this layer, but due to the incremental nature of the layer files, we can handle such situations without issues. - `LayeredRepository` covers lots of responsibilities: GC and compaction task synchronisation, timeline access (`local_timelines` in `Tenant` is not used directly before the timeline from the repository is accessed), layer flushing to FS, layer sync to remote storage scheduling, etc. ### How is this data accessed? There are multiple ways the data is accessed, from different sources: 1. [HTTP requests](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/http/routes.rs) High-level CRUD API for managing tenants, timelines and getting data about them. 
Current API list (modified for readability): ```rust .get("/v1/status", status_handler) // pageserver status .get("/v1/tenant", tenant_list_handler) .post("/v1/tenant", tenant_create_handler) // can create "empty" timelines or branch off the existing ones .get("/v1/tenant/:tenant_id", tenant_status) // the only tenant public metadata .put("/v1/tenant/config", tenant_config_handler) // tenant config data and local file manager .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler) .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler) .post("/v1/tenant/:tenant_id/attach", tenant_attach_handler) // download entire tenant from the remote storage and load its timelines memory .post("/v1/tenant/:tenant_id/detach", tenant_detach_handler) // delete all tenant timelines from memory, remote corresponding storage and local FS files .get("/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler) .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler) .get("/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver", wal_receiver_get_handler) // get walreceiver stats metadata ``` Overall, neither HTTP operation goes below `LayeredRepository` level and does not interact with layers: instead, they manage tenant and timeline entities, their configuration and metadata. `GET` data is small (relative to layer files contents), updated via brief `.write()/.lock()` calls and read via copying/cloning the data to release the lock soon. It does not mean that the operations themselves are short, e.g. `tenant_attach_handler` downloads multiple files from the remote storage which might take time, yet the final data is inserted in memory via one brief write under the lock. 
Non-`GET` operations mostly follow the same rule, with two differences: - `tenant_detach_handler` has to wait for its background tasks to stop before shutting down, which requires more work with locks - `timeline_create_handler` currently requires GC to be paused before branching the timeline, which requires orchestrating too. This is the only HTTP operation, able to load the timeline into memory: rest of the operations are reading the metadata or, as in `tenant_attach_handler`, schedule a deferred task to download timeline and load it into memory. "Timeline data synchronization" section below describes both complex cases in more detail. 2. [libpq requests](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/page_service.rs) Is the main interface of pageserver, intended to handle libpq (and similar) requests. Operates on `LayeredTimeline` and, lower, `LayerMap` modules; all timelines accessed during the operation are loaded into memory immediately (if not loaded already), operations bail on timeline load errors. - `pagestream` Page requests: `get_rel_exists`, `get_rel_size`, `get_page_at_lsn`, `get_db_size` Main API points, intended to be used by `compute` to show the data to the user. All require requests to be made at certain Lsn, if this Lsn is not available in the memory, request processing is paused until that happens or bails after a timeout. - `basebackup` and `fullbackup` Options to generate postgres-compatible backup archives. - `import basebackup` - `import wal` Import the `pg_wal` section of the basebackup archive. - `get_last_record_rlsn`, `get_lsn_by_timestamp` "Metadata" retrieval methods, that still require internal knowledge about layers. - `set`, `failpoints`, `show` Utility methods to support various edge cases or help with debugging/testing. 
- `do_gc`, `compact`, `checkpoint` Manual triggers for corresponding tenant tasks (GC, compaction) and inmemory layer flushing on disk (checkpointing), with upload task scheduling as a follow-up. Apart from loading into memory, every timeline layer has to be accessed using specific set of locking primitives, especially if a write operations happens: otherwise, GC or compaction might spoil the data. User API is implicitly affected by this synchronization during branching, when a GC has to be orchestrated properly before the new timeline could be branched off the existing one. See "Timeline data synchronization" section for the united synchronization diagram on the topic. 3. internal access Entities within pageserver that update files on local FS and remote storage, metadata in memory; has to use internal data for those operations. Places that access internal, lower data are also required to have the corresponding timeline successfully loaded into memory and accessed with corresponding synchronization. If ancestors' data is accessed via its child branch, it means more than one timeline has to be loaded into memory entirely and more locking primitives usage involved. Right now, all ancestors are resolved in-place: every place that has to check timeline's ancestor has to lock the timelines map, check if one is loaded into the memory, load it there or bail if it's not present, and get the information required and so on. - periodic GC and compaction tasks Alter metadata (GC info), in-memory data (layer relations, page caches, etc.) and layer files on disk. Same as its libpq counterparts, needs full synchronization with the low level layer management code. - storage sync task Alters metadata (`RemoteTimelineIndex`), layer files on remote storage (upload, delete) and local FS (download) and in-memory data (registers downloaded timelines in the repository). 
Currently, does not know anything about layer files contents, rather focusing on the file structure and metadata file updates: due to the fact that the layer files cannot be updated (only created or deleted), storage sync is able to back up the files to the remote storage without further low-level synchronizations: only when the timeline is downloaded, a load operation is needed to run, possibly pausing GC and compaction tasks. - walreceiver and walingest task Per timeline, subscribes for etcd events from safekeeper and eventually spawns a walreceiver connection task to receive WAL from a safekeeper node. Fills memory with data, eventually triggering a checkpoint task that creates a new layer file in the local FS and schedules a remote storage sync upload task. During WAL receiving, also updates a separate in-memory data structure with the walreceiver stats, used later via HTTP API. Layer updates require low-level set of sync primitives used to preserve the data consistency. - checkpoint (layer freeze) task Periodic, short-lived tasks to generate a new layer file in the FS. Requires low level synchronization in the end, when the layer is being registered after creating and has additional mode to ensure only one concurrent compaction happens at a time. ### Timeline data synchronization Here's a high-level timeline data access diagram, considering the synchronization locks, based on the state diagram above. For brevity, diagrams do not show `RwLock>` data accesses, considering them almost instant to happen. `RwLock` is close to be an exception to the previous rule, since it's taken in a multiple places to ensure all layers are inserted correctly. Yet the only long operation in the current code is a `.write()` lock on the map during its creation, while all other lock usages tend to be short in the current code. 
Note though, that due to current "working with loaded timeline only", prevailing amount of the locks taken on the struct are `.write()` locks, not the `.read()` ones. To simplify the diagrams, these accesses are now considered "fast" data access, not the synchronization attempts. `write_lock` synchronization diagram: ![timeline data access synchronization(1)](./images/017-timeline-data-management/timeline_data_access_sync_1.svg) Comments: - `write_lock: Mutex<()>` ensures that all timeline data being written into **in-memory layers** is done without races, one concurrent write at a time - `layer_flush_lock: Mutex<()>` and layer flushing seems to be slightly bloated with various ways to create a layer on disk and write it in memory The lock itself seem to repeat `write_lock` purpose when it touches in-memory layers, and also to limit the on-disk layer creations. Yet the latter is not really done consistently, since remote storage sync manages to download and register the new layers without touching the locks - `freeze_inmem_layer(true)` that touches both `write_lock` and `layer_flush_lock` seems not very aligned with the rest of the locks to those primitives; it also now restricts the layer creation concurrency even more, yet there are various `freeze_inmem_layer(false)` that are ignoring those restrictions at the same time ![timeline data access synchronization(2)](./images/017-timeline-data-management/timeline_data_access_sync_2.svg) Comments: - `partitioning: Mutex<(KeyPartitioning, Lsn)>` lock is a data sync lock that's not used to synchronize the tasks (all other such kinds were considered "almost instant" and omitted on the diagram), yet is very similar to what `write_lock` and `layer_flush_lock` do: it ensures the timeline in-memory data is up-to-date with the layer files state on disk, which is what `LayerMap` is for. 
- there are multiple locks that do similar task management operations: - `gc_cs: Mutex<()>` and `latest_gc_cutoff_lsn: RwLock` ensures that branching and gc are not run concurrently - `layer_removal_cs: Mutex<()>` lock ensure gc, compaction and timeline deletion via HTTP API do not run concurrently - `file_lock: RwLock<()>` is used as a semaphore, to ensure "all" gc and compaction tasks are shut down and do not start Yet that lock does take only gc and compaction from internal loops: libpq call is not cancelled and waited upon. Those operations do not seem to belong to a timeline. Moreover, some of those could be eliminated entirely due to duplication of their tasks. ## Proposed implementation ### How to structure timeline data access better - adjust tenant state handling Current [`TenantState`](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L108) [changes](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L317) mainly indicates whether GC and compaction tasks are running or not; another state, `Broken` shows only in case any timeline does not load during startup. We could start both GC and compaction tasks at the time the tenant is created and adjust the tasks to throttle/sleep on timeline absence and wake up when the first one is added. The latter becomes more important on download on demand, since we won't have the entire timeline in reach to verify its correctness. Moreover, if any network connection happens, the timeline could fail temporarily and entire tenant should be marked as broken due to that. Since nothing verifies the `TenantState` via HTTP API currently, it makes sense to remove the whole state entirely and don't write the code to synchronize its changes. Instead, we could indicate internal issues for every timeline and have a better API to "stop" timeline processing without deleting its data, making our API less restrictive. 
- remove the "unloaded" status for the timeline Current approach to timeline management [assumes](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/layered_repository.rs#L486-L493) ```rust #[derive(Clone)] enum LayeredTimelineEntry { Loaded(Arc), Unloaded { id: ZTimelineId, metadata: TimelineMetadata, }, } ``` supposes that timelines have to be in `Unloaded` state. The difference between both variants is whether its layer map was loaded from disk and kept in memory (Loaded) or not (Unloaded). The idea behind such separation was to lazy load timelines in memory with all their layers only after its first access and potentially unload them later. Yet now there's no public API methods, that deal with unloaded timelines' layers: all of them either bail when such timeline is worked on, or load it into memory and continue working. Moreover, every timeline in the local FS is loaded on pageserver startup now, so only two places where `Unloaded` variant is used are branching and timeline attach, with both loading the timeline into memory before the end of the operation. Even if that loading into memory bails for some reason, next GC or compaction task periodic run would load such timeline into memory. There are a few timeline methods that return timeline metadata without loading its layers, but such metadata also comes from the `metadata` FS file, not the layer files (so no page info could be retrieved without loading the entire layer map first). With the layer on-demand download, it's not feasible anymore to wait for the entire layer map to be loaded into the memory, since it might not even be available on the local FS when requested: `LayerMap` needs to be changed to contain metadata to retrieve the missing layers and handle partially present on the local FS timeline state. 
To accommodate to that and move away from the redundant status, a timeline should always be "loaded" with its metadata read from the disk and its layer map prepared to be downloaded when requested, per layer. Layers in the layer map, on the other hand, could be in various state: loaded, unloaded, downloading, downloading failed, etc. and their state has to be handled instead, if we want to support on-demand download in the future. This way, tenants and timelines could always try to serve requests and do their internal tasks periodically, trying to recover. - scale down the remote storage sync to per layer file, not per timeline as now Due to the reasons from the previous bullet, current remote storage model needs its timeline download approach to be changed. Right now, a timeline is marked as "ready" only after all its layers on the remote storage are downloaded on the local storage. With the on-demand download approach, only remote storage timeline metadata should be downloaded from S3, leaving the rest of the layers ready for download if/when it's requested. Note: while the remote storage sync should operate per layer, it should stay global for all tenants, to better manage S3 limits and sync queue priorities. Yet the only place using remote storage should be the layer map. 
- encapsulate `tenant_mgr` logic into a regular Rust struct, unite with part of the `Repository` and anything else needed to manage the timeline data in a single place and to test it independently [`Repository`](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/repository.rs#L187) trait gets closer to `tenant_mgr` in terms of functionality: there are two background task-related functions, that are run on all timelines of a tenant: `gc_iteration` (it does allow running on a single timeline, but GC task runs it on all timelines) and `compaction_iteration` that are related to service tasks, not the data storage; and the metadata management functions, also not really related to the timeline contents. `tenant_mgr` proxies some of the `Repository` calls, yet both service tasks use `tenant_mgr` to access the data they need, creating a circular dependency between their APIs. To avoid excessive synchronization between components, taking multiple locks for that and static state, we can organize the data access and updates in one place. One potential benefit Rust gets from this is the ability to track and manage timeline resources, if all the related data is located in one place. - move `RemoteStorage` usage from `LayeredRepository` into `LayerMap`, as the rest of the layer-based entities (layer files, etc.) Layer == file in our model, since pageserver always either tries to load the LayerMap from disk for the timeline not in memory, or assumes the file contents matches its memory. `LayeredRepository` is one of the most loaded objects currently and not everything from it deserves unification with the `tenant_mgr`. In particular, layer files need to be better prepared for future download on demand functionality, where every layer could be dynamically loaded and unloaded from memory and local FS. 
Current amount of locks and sync-async separation would make it hard to implement truly dynamic (un)loading; moreover, we would need retries with backoffs, since the unloaded layer files are most probably not available on the local FS either and network is not always reliable. One of the solutions to the issue is already being developed for the remote storage sync: [SyncQueue](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/storage_sync.rs#L463) The queue is able to batch CRUD layer operations (both for local and remote FS contexts) and reorder them to increase the sync speed. Similar approach could be generalized for all layer modifications, including in-memory ones such as GC or compaction: this way, we could manage all layer modifications and reads in one place with lesser locks and tests that are closer to unit tests. - change the approach to locking synchronization A number of locks in the timeline seem to be used to coordinate gc, compaction tasks and related processes. It should be done in a task manager or other place, external to the timeline. Timeline contents still needs to be synchronized, considering the task work, so fields like `latest_gc_cutoff_lsn: RwLock` are expected to stay for that purpose, but general amount of locks should be reduced. ### Putting it all together If the proposal bullets applied to the diagrams above, the state could be represented as: ![timeline timeline tenant state](./images/017-timeline-data-management/proposed_timeline_tenant_state.svg) The reorders aim to put all tasks into separated modules, with strictly defined interfaces and as less knowledge about other components, as possible. This way, all timeline data is now in the `data_storage`, including the GC, walreceiver, `RemoteTimelineIndex`, `LayerMap`, etc. with some API to get the data in the way, more convenient for the data sync system inside. 
So far, it seems that a few maps with `Arc>` with actual data operations added inside each `SeparateData` struct, if needed. `page_cache` is proposed to placed into the same `data_storage` since it contains tenant timelines' data: this way, all metadata and data is in the same struct, simplifying things with Rust's borrow checker and allowing us to share internals between data modules and later might simplify timeline in-memory size tracking. `task_manager` is related to data storage and manages all tenant and timeline tasks, manages shared resources (runtimes, thread pools, etcd connection, etc.) and synchronizes tasks. All locks such as `gc_cs` belong to this module tree, as primitives inherently related to the task synchronization. Tasks have to access timelines and their metadata, but should do that through `data_storage` API and similar. `task_manager` should (re)start, stop and track all tasks that are run in it, selecting an appropriate runtime depending on a task kind (we have async/sync task separation, CPU and IO bound tasks separation, ...) Some locks such as `layer_removal_cs` one are not needed, if the only component that starts the tasks ensures they don't run concurrently. `LayeredTimeline` is still split into two parts, more high-level with whatever primitives needed to sync its state, and the actual state storage with `LayerMap` and other low level entities. Only `LayerMap` knows what storage it's layer files are taken from (inmem, local FS, etc.), and it's responsible for synchronizing the layers when needed, as also reacting to sync events, successful or not. Last but not least, `tenant config file` has to be backed into a remote storage, as tenant-specific information for all timelines. Tenant and timelines have volatile information that's now partially mixed with constant information (e.g. fields in `metadata` file), that model should be better split and handled, in case we want to properly support its backups and synchronization. 
![proposed timeline data access synchronization(1)](./images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg) There's still a need to keep inmemory layer buffer synchronized during layer freezing, yet that could happen on a layer level, not on a timeline level, as `write_lock` used to be, so we could lower the sync primitives one layer deeper, preparing us for download on demand feature, where multiple layers could be concurrently streamed and written from various data sources. Flushing the frozen layer requires creating a new layer on disk and further remote storage upload, so `LayerMap` has to get those flushed bytes and queue them later: no need to block in the timeline itself for anything again, rather locking on the layer level, if needed. ![proposed timeline data access synchronization(2)](./images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg) Lock diagrams legend: ![lock diagrams legend](./images/017-timeline-data-management/lock_legend.svg) After the frozen layers are flushed, something has to ensure that the layer structure is intact, so a repartitioning lock is needed still, and could also guard the layer map structure changes, since both are needed either way. This locking belongs to the `LowLevelLayeredTimeline` from the proposed data structure diagram, as the place with all such data being held. Similarly, branching is still required to be done after certain Lsn in our current model, but this needs only one lock to synchronize and that could be the `gc_cs: Mutex<()>` lock. It raises the question of where this lock has to be placed, it's the only place that requires pausing a GC task during external, HTTP request handling. The right place for the lock seems to be the `task_manager` that could manage GC in more fine-grained way to accommodate the incoming branching request. 
There's no explicit lock sync between GC, compaction or other mutually exclusive tasks: it is a job of the `task_manager` to ensure those are not run concurrently. ================================================ FILE: docs/rfcs/018-storage-messaging-2.md ================================================ # Storage messaging Safekeepers need to communicate to each other to * Trim WAL on safekeepers; * Decide on which SK should push WAL to the S3; * Decide on when to shut down SK<->pageserver connection; * Understand state of each other to perform peer recovery; Pageservers need to communicate to safekeepers to decide which SK should provide WAL to the pageserver. This is an iteration on [015-storage-messaging](https://github.com/neondatabase/neon/blob/main/docs/rfcs/015-storage-messaging.md) describing current situation, potential performance issue and ways to address it. ## Background What we have currently is very close to etcd variant described in 015-storage-messaging. Basically, we have single `SkTimelineInfo` message periodically sent by all safekeepers to etcd for each timeline. * Safekeepers subscribe to it to learn status of peers (currently they subscribe to 'everything', but they can and should fetch data only for timelines they hold). * Pageserver subscribes to it (separate watch per timeline) to learn safekeepers positions; based on that, it decides from which safekeepers to pull WAL. Also, safekeepers use etcd elections API to make sure only single safekeeper offloads WAL. It works, and callmemaybe is gone. However, this has a performance hazard. Currently deployed etcd can do about 6k puts per second (using its own `benchmark` tool); on my 6 core laptop, while running on tmpfs, this gets to 35k. Making benchmark closer to our usage [etcd watch bench](https://github.com/arssher/etcd-client/blob/watch-bench/examples/watch_bench.rs), I get ~10k received messages per second with various number of publisher-subscribers (laptop, tmpfs). 
Dividing this by 12 (3 sks generate msg, 1 ps + 3 sk consume them) we get about 800 active timelines, if message is sent each second. Not extremely low, but quite reachable. A lot of idle watches seem to be ok though -- which is good, as pageserver subscribes to all its timelines regardless of their activity. Also, running etcd with fsyncs disabled is messy -- data dir must be wiped on each restart or there is a risk of corruption errors. The reason is etcd doing much more than what we need; it is a fault tolerant store with strong consistency, but I claim all we need here is just simplest pub sub with best effort delivery, because * We already have centralized source of truth for long running data, like which tlis are on which nodes -- the console. * Momentary data (safekeeper/pageserver progress) doesn't make sense to persist. Instead of putting each change to the broker and expecting it to deliver it reliably, it is better to just have constant flow of data for active timelines: 1) they serve as natural heartbeats -- if node can't send, we shouldn't pull WAL from it 2) it is simpler -- no need to track delivery to/from the broker. Moreover, latency here is important: the faster we obtain fresh data, the faster we can switch to proper safekeeper after failure. * As for WAL offloading leader election, it is trivial to achieve through these heartbeats -- just take suitable node through deterministic rule (min node id). Once network is stable, this is a converging process (well, except complicated failure topology, but even then making it converge is not hard). Such elections bear some risk of several offloaders running concurrently for a short period of time, but that's harmless. Generally, if one needs strong consistency, electing leader per se is not enough; it must be accompanied with number (logical clock ts), checked at every action to track causality. s3 doesn't provide CAS, so it can't differentiate old/new leader, this must be solved differently.
We could use etcd CAS (its most powerful/useful primitive actually) to issue these leader numbers (and e.g. prefix files in s3), but currently I don't see a need for that. Obviously best effort pub sub is much simpler and performant; the one proposed is ## gRPC broker I took tonic and [prototyped](https://github.com/neondatabase/neon/blob/asher/neon-broker/broker/src/broker.rs) the replacement of functionality we currently use with grpc streams and tokio mpsc channels. The implementation description is at the file header. It is just 500 lines of code and core functionality is complete. 1-1 pub sub gives about 120k received messages per second; having multiple subscribers in different connections quickly scales to 1 million received messages per second. I had concerns about many concurrent streams in a single connection, but 2^20 subscribers still work (though eat memory, with 10 publishers 20GB are consumed; in this implementation each publisher holds full copy of all subscribers). There is `bench.rs` nearby which I used for testing. `SkTimelineInfo` is wired here, but another message can be added (e.g. if pageservers want to communicate with each other) with templating. ### Fault tolerance Since such broker is stateless, we can run it under k8s. Or add proxying to other members, with best-effort this is simple. ### Security implications Communication happens in a private network that is not exposed to users; additionally we can add auth to the broker. ## Alternative: get existing pub-sub We could take some existing pub sub solution, e.g. RabbitMQ, Redis. But in this case IMV simplicity of our own outweighs external dependency costs (RabbitMQ is much more complicated and needs VM; Redis Rust client maintenance is not ideal...). Also note that projects like CockroachDB and TiDB are based on gRPC as well. ## Alternative: direct communication Apart from being transport, broker solves one more task: discovery, i.e. letting safekeepers and pageservers find each other.
We can let safekeepers know, for each timeline, both other safekeepers for this timeline and pageservers serving it. In this case direct communication is possible: - each safekeeper pushes to each other safekeeper status of timelines residing on both of them, letting them remove WAL, decide who offloads, decide on peer recovery; - each safekeeper pushes to each pageserver status of timelines residing on both of them, letting pageserver choose from which sk to pull WAL; It was mostly described in [014-safekeeper-gossip](https://github.com/neondatabase/neon/blob/main/docs/rfcs/014-safekeepers-gossip.md), but I want to recap on that. The main pro is one less dependency: fewer moving parts, easier to run Neon locally/manually, fewer places to monitor. Fault tolerance for broker disappears, no kuber or something. To me this is a big thing. Also (though not a big thing) idle watches for inactive timelines disappear: naturally safekeepers learn about compute connection first and start pushing status to pageserver(s), notifying it should pull. Importantly, I think that eventually knowing and persisting peers and pageservers on safekeepers is inevitable: - Knowing peer safekeepers for the timeline is required for correct automatic membership change -- new member set must be hardened on old majority before proceeding. It is required to get rid of sync-safekeepers as well (peer recovery up to flush_lsn). - Knowing pageservers where the timeline is attached is needed to 1. Understand when to shut down activity on the timeline, i.e. push data to the broker. We can have a lot of timelines sleeping quietly which shouldn't occupy resources. 2. Preserve WAL for these (currently we offload to s3 and take it from there, but serving locally is better, and we get one less condition on which WAL can be removed from s3). I suppose this membership data should be passed to safekeepers directly from the console because 1.
Console is the original source of this data, conceptually this is the simplest way (rather than passing it through compute or something). 2. We already have similar code for deleting timeline on safekeepers (and attaching/detaching timeline on pageserver), this is a typical action -- queue operation against storage node and execute it until it completes (or timeline is dropped). Cons of direct communication are - It is more complicated: each safekeeper should maintain set of peers it talks to, and set of timelines for each such peer -- they ought to be multiplexed into single connection. - Totally, we have O(n^2) connections instead of O(n) with broker schema (still O(n) on each node). However, these are relatively stable, async and thus not very expensive, I don't think this is a big problem. Up to 10k storage nodes I doubt connection overhead would be noticeable. I'd use gRPC for direct communication, and in this sense gRPC based broker is a step towards it. ================================================ FILE: docs/rfcs/019-tenant-timeline-lifecycles.md ================================================ # Managing Tenant and Timeline lifecycles ## Summary The pageserver has a Tenant object in memory for each tenant it manages, and a Timeline for each timeline. There are a lot of tasks that operate on the tenants and timelines with references to those objects. We have some mechanisms to track which tasks are operating on each Tenant and Timeline, and to request them to shutdown when a tenant or timeline is deleted, but it does not cover all uses, and as a result we have many race conditions around tenant/timeline shutdown. ## Motivation We have a bunch of race conditions that can produce weird errors and can be hard to track down. ## Non Goals This RFC only covers the problem of ensuring that a task/thread isn't operating on a Tenant or Timeline. 
It does not cover what states, aside from Active and non-Active, each Tenant and Timeline should have, or when exactly the transitions should happen. ## Impacted components (e.g. pageserver, safekeeper, console, etc) Pageserver. Although I wonder if the safekeeper should have a similar mechanism. ## Current situation Most pageserver tasks are managed by task_mgr.rs: - LibpqEndpointListener - HttpEndPointListener - WalReceiverManager and -Connection - GarbageCollector and Compaction - InitialLogicalSizeCalculation In addition to those tasks, the walreceiver performs some direct tokio::spawn calls to spawn tasks that are not registered with 'task_mgr'. And all of these tasks can spawn extra operations with tokio spawn_blocking. Whenever a tenant or timeline is removed from the system, by pageserver shutdown, delete_timeline or tenant-detach operation, we rely on the task registry in 'task_mgr.rs' to wait until there are no tasks operating on the tenant or timeline, before its Tenant/Timeline object is removed. That relies on each task to register itself with the tenant/timeline ID in 'task_mgr.rs'. However, there are many gaps in that. For example, GarbageCollection and Compaction tasks are registered with the tenant, but when they proceed to operate on a particular timeline of the tenant, they don't register with the timeline ID. Because of that, the timeline can be deleted while GC or compaction is running on it, causing failures in the GC or compaction (see https://github.com/neondatabase/neon/issues/2442). Another problem is that the task registry only works for tokio Tasks. There is no way to register a piece of code that runs inside spawn_blocking(), for example. ## Proposed implementation This "voluntary" registration of tasks is fragile. Let's use Rust language features to enforce that a tenant/timeline cannot be removed from the system when there is still some code operating on it.
Let's introduce new Guard objects for Tenant and Timeline, and do all actions through the Guard object. Something like: TenantActiveGuard: Guard object over Arc. When you acquire the guard, the code checks that the tenant is in Active state. If it's not, you get an error. You can change the state of the tenant to Stopping while there are ActiveTenantGuard objects still on it, to prevent new ActiveTenantGuards from being acquired, but the Tenant cannot be removed until all the guards are gone. TenantMaintenanceGuard: Like ActiveTenantGuard, but can be held even when the tenant is not in Active state. Used for operations like attach/detach. Perhaps allow only one such guard on a Tenant at a time. Similarly for Timelines. We don't currently have a "state" on Timeline, but I think we need at least two states: Active and Stopping. The Stopping state is used at deletion, to prevent new TimelineActiveGuards from appearing, while you wait for existing TimelineActiveGuards to die out. The shutdown-signaling, using shutdown_watcher() and is_shutdown_requested(), probably also needs changes to deal with the new Guards. The rule is that if you have a TenantActiveGuard, and the tenant's state changes from Active to Stopping, the is_shutdown_requested() function should return true, and shutdown_watcher() future should return. This signaling doesn't necessarily need to cover all cases. For example, if you have a block of code in spawn_blocking(), it might be acceptable if is_shutdown_requested() doesn't return true even though the tenant is in Stopping state, as long as the code finishes reasonably fast. ================================================ FILE: docs/rfcs/020-pageserver-s3-coordination.md ================================================ # Coordinating access of multiple pageservers to the same s3 data ## Motivation There are some blind spots around coordinating access of multiple pageservers to the same s3 data. 
Currently this is applicable only to tenant relocation case, but in the future we'll need to solve similar problems for replica/standby pageservers. ## Impacted components (e.g. pageserver, safekeeper, console, etc) Pageserver ## The problem ### Relocation During relocation both pageservers can write to s3. This should be ok for all data except the `index_part.json`. For index part it causes problems during compaction/gc because they remove files from index/s3. Imagine this case: ```mermaid sequenceDiagram autonumber participant PS1 participant S3 participant PS2 PS1->>S3: Uploads L1, L2
Index contains L1 L2 PS2->>S3: Attach called, sees L1, L2 PS1->>S3: Compaction comes
Removes L1, adds L3 note over S3: Index now L2, L3 PS2->>S3: Uploads new layer L4
(added to previous view of the index) note over S3: Index now L1, L2, L4 ``` At this point it is not possible to restore from index, it contains L2 which is no longer available in s3 and doesn't contain L3 added by compaction by the first pageserver. So if any of the pageservers restart initial sync will fail (or in on-demand world it will fail a bit later during page request from missing layer) ### Standby pageserver Another related case is standby pageserver. In this case second pageserver can be used as a replica to scale reads and serve as a failover target in case first one fails. In this mode second pageserver needs to have the same picture of s3 files to be able to load layers on-demand. To accomplish that second pageserver cannot run gc/compaction jobs. Instead it needs to receive updates for index contents. (There is no need to run walreceiver on the second pageserver then). ## Observations - If both pageservers ingest wal then their layer set diverges, because layer file generation is not deterministic - If one of the pageservers does not ingest wal (and just picks up layer updates) then it lags behind and cannot really answer queries in the same pace as the primary one - Can compaction help make layers deterministic? E g we do not upload level zero layers and construction of higher levels should be deterministic. This way we can guarantee that layer creation by timeout wont mess things up. This way one pageserver uploads data and second one can just ingest it. But we still need some form of election ## Solutions ### Manual orchestration One possible solution for relocation case is to orchestrate background jobs from outside. The oracle who runs migration can turn off background jobs on PS1 before migration and then run migration -> enable them on PS2. The problem comes if migration fails. 
In this case in order to resume background jobs oracle needs to guarantee that PS2 doesn't run background jobs and if it doesn't respond then PS1 is stuck unable to run compaction/gc. This cannot be solved without human ensuring that no upload from PS2 can happen. In order to be able to resolve this automatically CAS is required on S3 side so pageserver can avoid overwriting index part if it is no longer the leading one Note that flag that disables background jobs needs to be persistent, because otherwise pageserver restart will clean it ### Avoid index_part.json Index part consists of two parts, list of layers and metadata. List of layers can be easily obtained by `ListObjects` S3 API method. But what to do with metadata? Create metadata instance for each checkpoint and add some counter to the file name? Back to potentially long s3 ls. ### Coordination based approach Do it like safekeepers chose leader for WAL upload. Ping each other and decide based on some heuristics e g smallest node id. During relocation PS1 sends "resign" ping message so others can start election without waiting for a timeout. This still leaves metadata question open and non deterministic layers are a problem as well ### Avoid metadata file One way to eliminate metadata file is to store it in layer files under some special key. This may resonate with intention to keep all relation sizes in some special segment to avoid initial download during size calculation. Maybe with that we can even store pre calculated value. As a downside each checkpoint gets 512 bytes larger. If we entirely avoid metadata file this opens up many approaches * * * During discussion it seems that we converged on the approach consisting of: - index files stored per pageserver in the same timeline directory. With that index file name starts to look like: `_index_part.json`. In such set up there are no concurrent overwrites of index file by different pageservers. 
- For replica pageservers the solution would be for primary to broadcast index changes to any followers with an ability to check index files in s3 and restore the full state. To properly merge changes with index files we can use a counter that is persisted in an index file, is incremented on every change to it and passed along with broadcasted change. This way we can determine whether we need to apply change to the index state or not. - Responsibility for running background jobs is assigned externally. Pageserver keeps locally persistent flag for each tenant that indicates whether this pageserver is considered as primary one or not. TODO what happens if we crash and cannot start for some extended period of time? Control plane can assign ownership to some other pageserver. Pageserver needs some way to check if its still the blessed one. Maybe by explicit request to control plane on start. Requirement for deterministic layer generation was considered overly strict because of two reasons: - It can limit possible optimizations e g when pageserver wants to reshuffle some data locally and doesn't want to coordinate this - The deterministic algorithm itself can change so during deployments for some time there will be two different version running at the same time which can cause non determinism ### External elections The above case with lost state in this schema with externally managed leadership is represented like this: Note that here we keep objects list in the index file. ```mermaid sequenceDiagram autonumber participant PS1 participant CP as Control Plane participant S3 participant PS2 note over PS1,PS2: PS1 starts up and still a leader PS1->>CP: Am I still the leader for Tenant X? activate CP CP->>PS1: Yes deactivate CP PS1->>S3: Fetch PS1 index. note over PS1: Continue operations, start background jobs note over PS1,PS2: PS1 starts up and still and is not a leader anymore PS1->>CP: Am I still the leader for Tenant X? 
CP->>PS1: No PS1->>PS2: Subscribe to index changes PS1->>S3: Fetch PS1 and PS2 indexes note over PS1: Combine index file to include layers
from both indexes to be able
to see newer files from leader (PS2) note over PS1: Continue operations, do not start background jobs ``` ### Internal elections To manage leadership internally we can use broker to exchange pings so nodes can decide on the leader roles. In case multiple pageservers are active leader is the one with lowest node id. Operations with internally managed elections: ```mermaid sequenceDiagram autonumber participant PS1 participant S3 note over PS1: Starts up note over PS1: Subscribes to changes, waits for two ping
timeouts to see if there is a leader PS1->>S3: Fetch indexes from s3 alt there is a leader note over PS1: do not start background jobs,
continue applying index updates else there is no leader note over PS1: start background jobs,
broadcast index changes end note over PS1,S3: Then the picture is similar to external elections
the difference is that follower can become a leader
if there are no pings after some timeout new leader gets elected ``` ### Eviction When two pageservers operate on a tenant for extended period of time follower doesn't perform write operations in s3. When layer is evicted follower relies on updates from primary to get info about layers it needs to cover range for evicted layer. Note that it wont match evicted layer exactly, so layers will overlap and lookup code needs to correctly handle that. ### Relocation flow Actions become: - Attach tenant to new pageserver - New pageserver becomes follower since previous one is still leading - New pageserver starts replicating from safekeepers but does not upload layers - Detach is called on the old one - New pageserver becomes leader after it realizes that old one disappeared ### Index File Using `s3 ls` on startup simplifies things, but we still need metadata, so we need to fetch index files anyway. If they contain list of files we can combine them and avoid costly `s3 ls` ### Remaining issues - More than one remote consistent lsn for safekeepers to know Anything else? ### Proposed solution To recap. On meeting we converged on approach with external elections but I think it will be overall harder to manage and will introduce a dependency on control plane for pageserver. Using separate index files for each pageserver consisting of log of operations and a metadata snapshot should be enough. ### What we need to get there? - Change index file structure to contain log of changes instead of just the file list - Implement pinging/elections for pageservers ================================================ FILE: docs/rfcs/021-metering.md ================================================ # Consumption tracking # Goals This proposal is made with two mostly but not entirely overlapping goals: * Collect info that is needed for consumption-based billing * Cross-check AWS bills # Metrics There are six metrics to collect: * CPU time. Wall clock seconds * the current number of cores. 
We have a fixed ratio of memory to cores, so the current memory size is the function of the number of cores. Measured per each `endpoint`. * Traffic. In/out traffic on the proxy. Measured per each `endpoint`. * Written size. Amount of data we write. That is different from both traffic and storage size, as only during the writing we a) occupy some disk bandwidth on safekeepers b) necessarily cross AZ boundaries delivering WAL to all safekeepers Each timeline/branch has at most one writer, so the data is collected per branch. * Synthetic storage size. That is what is exposed now with pageserver's `/v1/tenant/{}/size`. Looks like now it is per-tenant. (Side note: can we make it per branch to show as branch physical size in UI?) * Real storage size. That is the size of the tenant directory on the pageservers disk. Per-tenant. * S3 storage size. That is the size of the tenant data on S3. Per-tenant. That info should be enough to build an internal model that predicts AWS price (hence tracking `written data` and `real storage size`). As for the billing model we probably can get away with mentioning only `CPU time`, `synthetic storage size`, and `traffic` consumption. # Services participating in metrics collection ## Proxy For actual implementation details check `/docs/consumption_metrics.md` Proxy is the only place that knows about traffic flow, so it tracks it and reports it with quite a small interval, let's say 1 minute. A small interval is needed here since the proxy is stateless, and any restart will reset accumulated consumption. Also proxy should report deltas since the last report, not an absolute value of the counter. Such kind of events is easier to integrate over a period of time to get the amount of traffic during some time interval. 
Example event: ```json { "metric": "proxy_io_bytes_per_client", "type": "incremental", "start_time": "2022-12-28T11:07:19.317310284Z", "stop_time": "2022-12-28T11:07:19.317310284Z", "idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019", "value": 12345454, "endpoint_id": "5d07d9ce9237c4cd845ea7918c0afa7d", } ``` Since we report deltas over some period of time, it makes sense to include `event_start_time`/`event_stop_time` where `event_start_time` is the time of the previous report. That will allow us to identify metering gaps better (e.g., failed send/delivery). When there is no active connection proxy can avoid reporting anything. Also, deltas are additive, so several console instances serving the same user and endpoint can report traffic without coordination. ## Console The console knows about start/stop events, so it knows the amount of CPU time allocated to each endpoint. It also knows about operation successes and failures and can avoid billing clients after unsuccessful 'suspend' events. The console doesn't know the current compute size within the allowed limits on the endpoint. So with CPU time, we do the following: * While we don't yet have the autoscaling console can report `cpu time` as the number of seconds since the last `start_compute` event. * When we have autoscaling, `autoscaler-agent` can report `cpu time`*`compute_units_count` in the same increments as the proxy reports traffic. Example event: ```json { "metric": "effective_compute_seconds", "type": "increment", "endpoint_id": "blazing-warrior-34", "event_start_time": ..., "event_stop_time": ..., "value": 12345454, } ``` I'd also suggest reporting one value, `cpu time`*`compute_units_count`, instead of two separate fields as it makes event schema simpler (it is possible to treat it the same way as traffic) and preserves additivity. 
## Pageserver For actual implementation details check `/docs/consumption_metrics.md` Pageserver knows / has access to / can calculate the rest of the metrics: * Written size -- that is basically `last_received_lsn`, * Synthetic storage size -- there is a way to calculate it, albeit a costly one, * Real storage size -- there is a way to calculate it using a layer map or filesystem, * S3 storage size -- can calculate it by S3 API calls Some of those metrics are expensive to calculate, so the reporting period here is driven mainly by implementation details. We can set it to, for example, once per hour. Not a big deal since the pageserver is stateful, and all metrics can be reported as an absolute value, not increments. At the same time, a smaller reporting period improves UX, so it would be good to have something more real-time. `written size` is primarily a safekeeper-related metric, but since it is available on both pageserver and safekeeper, we can avoid reporting anything from the safekeeper. Example event: ```json { "metric": "remote_storage_size", "type": "absolute", "time": "2022-12-28T11:07:19.317310284Z", "idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019", "value": 12345454, "tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d", "timeline_id": "a03ebb4f5922a1c56ff7485cc8854143", } ``` # Data collection ## Push vs. pull We already have pull-based Prometheus metrics, so it is tempting to use them here too. However, in our setup, it is hard to tell when some metric changes. For example, garbage collection will constantly free some disk space over a week, even if the project is down for that week. We could also iterate through all existing tenants/branches/endpoints, but that means some amount of code to do that properly and most likely we will end up with some per-metric hacks in the collector to cut out some of the tenants that are surely not changing that metric. 
With the push model, it is easier to publish data only about actively changing metrics -- pageserver knows when it performs s3 offloads, garbage collection and starts/stops consuming data from the safekeeper; proxy knows about connected clients; console / autoscaler-agent knows about active cpu time. Hence, let's go with a push-based model. ## Common bus vs. proxying through the console We can implement such push systems in a few ways: a. Each component pushes its metrics to the "common bus", namely segment, Kafka, or something similar. That approach scales well, but it would be harder to test it locally, will introduce new dependencies, we will have to distribute secrets for that connection to all of the components, etc. We would also have to loop back some of the events and their aggregates to the console, as we want to show some of those metrics to the user in real-time. b. Each component can call HTTP `POST` with its events to the console, and the console can forward them to the segment for later integration with metronome / orb / onebill / etc. With that approach, only the console has to speak with segment. Also since that data passes through the console, the console can save the latest metrics values, so there is no need for constant feedback of those events back from the segment. # Implementation Each (proxy|pageserver|autoscaler-agent) sends consumption events to the single endpoint in the console: ```json POST /usage_events HTTP/1.1 Content-Type: application/json [ { "metric": "remote_storage_size", "type": "absolute", "time": "2022-12-28T11:07:19.317310284Z", "idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019", "value": 12345454, "tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d", "timeline_id": "a03ebb4f5922a1c56ff7485cc8854143", }, ... ] ``` ![data flow](./images/metering.jpg) Events could be either: * `incremental` -- change in consumption since the previous event or service restart.
That is `effective_cpu_seconds`, `traffic_in_bytes`, and `traffic_out_bytes`. * `absolute` -- that is the current value of a metric. All of the size-related metrics are absolute. Each service can post events at its own pace and bundle together data from different tenants/endpoints. The console algorithm upon receipt of events could be the following: 1. Create and send a segment event with the same content (possibly enriching it with tenant/timeline data for endpoint-based events). 2. Update the latest state of per-tenant and per-endpoint metrics in the database. 3. Check whether any of those metrics is above the allowed threshold and stop the project if necessary. Since all the data comes in batches, we can do the batch update to reduce the number of queries in the database. Proxy traffic is probably the most frequent metric, so with batching, we will have extra `number_of_proxies` requests to the database each minute. This is most likely fine for now but will generate many dead tuples in the console database. If that is the case, we can change step 2 to the following: 2.1. Check if there is a $tenant_$metric / $endpoint_$metric key in Redis 2.2. If no stored value is found and the metric is incremental, then fetch the current value from DWH (which keeps aggregated value for all the events) and publish it. 2.3. Publish a new value (absolute metric) or add an increment to the stored value (incremental metric) ## Consumption watchdog Since all the data goes through the console, we don't have to run any background thread/coroutines to check whether consumption is within the allowed limits. We only change consumption with `POST /usage_events`, so limit checks could be applied in the same handler. ## Extensibility If we need to add a new metric (e.g. s3 traffic or something else), the console code should, by default, process it and publish a segment event, even if the metric name is unknown to the console.
## Naming & schema Each metric name should end up with units -- now `_seconds` and `_bytes`, and segment event should always have `tenant_id` and `timeline_id`/`endpoint_id` where applicable. ================================================ FILE: docs/rfcs/022-pageserver-delete-from-s3.md ================================================ # Deleting pageserver part of tenants data from s3 Created on 08.03.23 ## Motivation Currently we don't delete pageserver part of the data from s3 when project is deleted. (The same is true for safekeepers, but this is outside of the scope of this RFC). This RFC aims to spin a discussion to come to a robust deletion solution that won't put us into a corner for features like postponed deletion (when we keep data for user to be able to restore a project if it was deleted by accident) ## Summary TLDR; There are two options, one based on control plane issuing actual delete requests to s3 and the other one that keeps s3 stuff bound to pageserver. Each one has its pros and cons. The decision is to stick with pageserver centric approach. For motivation see [Decision](#decision). ## Components pageserver, control-plane ## Requirements Deletion should successfully finish (eventually) without leaving dangling files in presence of: - component restarts - component outage - pageserver loss ## Proposed implementation Before the options are discussed, note that deletion can be quite a long process. For deletion from s3 the obvious choice is [DeleteObjects](https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html) API call. It allows batching the deletion of up to 1k objects in one API call. So deletion operation linearly depends on number of layer files. Another design limitation is that there is no cheap `mv` operation available for s3. `mv` from `aws s3 mv` uses `copy(src, dst) + delete(src)`.
So `mv`-like operation is not feasible as a building block because it actually amplifies the problem with both duration and resulting cost of the operation. The case when there are multiple pageservers handling the same tenants is largely out of scope of the RFC. We still consider case with migration from one PS to another, but do not consider case when tenant exists on multiple pageservers for extended period of time. The case with multiple pageservers can be reduced to case with one pageservers by calling detach on all pageservers except the last one, for it actual delete needs to be called. For simplicity lets look into deleting tenants. Differences in deletion process between tenants and timelines are mentioned in paragraph ["Differences between tenants and timelines"](#differences-between-tenants-and-timelines) ### 1. Pageserver owns deletion machinery #### The sequence TLDR; With this approach control plane needs to call delete on a tenant and poll for progress. As much as possible is handled on pageserver. Lets see the sequence. Happy path: ```mermaid sequenceDiagram autonumber participant CP as Control Plane participant PS as Pageserver participant S3 CP->>PS: Delete tenant PS->>S3: Create deleted mark file at
/tenant/meta/deleted PS->>PS: Create deleted mark file locally PS->>CP: Accepted PS->>PS: delete local files other than deleted mark loop Delete layers for each timeline PS->>S3: delete(..) CP->>PS: Finished? PS->>CP: False end PS->>S3: Delete mark file PS->>PS: Delete local mark file loop Poll for status CP->>PS: Finished? PS->>CP: True or False end ``` Why two mark files? Remote one is needed for cases when pageserver is lost during deletion so other pageserver can learn the deletion from s3 during attach. Why local mark file is needed? If we don't have one, we have two choices, delete local data before deleting the remote part or do that after. If we delete local data before remote then during restart pageserver wont pick up remote tenant at all because nothing is available locally (pageserver looks for remote counterparts of locally available tenants). If we delete local data after remote then at the end of the sequence when remote mark file is deleted if pageserver restart happens then the state is the same to situation when pageserver just missing data on remote without knowing the fact that this data is intended to be deleted. In this case the current behavior is upload everything local-only to remote. Thus we need local record of tenant being deleted as well. ##### Handle pageserver crashes Lets explore sequences with various crash points. Pageserver crashes before `deleted` mark file is persisted in s3: ```mermaid sequenceDiagram autonumber participant CP as Control Plane participant PS as Pageserver participant S3 CP->>PS: Delete tenant note over PS: Crash point 1. CP->>PS: Retry delete request PS->>S3: Create deleted mark file at
/tenant/meta/deleted PS->>PS: Create deleted mark file locally PS->>CP: Accepted PS->>PS: delete local files other than deleted mark loop Delete layers for each timeline PS->>S3: delete(..) CP->>PS: Finished? PS->>CP: False end PS->>S3: Delete mark file PS->>PS: Delete local mark file CP->>PS: Finished? PS->>CP: True ``` Pageserver crashed when deleted mark was about to be persisted in s3, before Control Plane gets a response: ```mermaid sequenceDiagram autonumber participant CP as Control Plane participant PS as Pageserver participant S3 CP->>PS: Delete tenant PS->>S3: Create deleted mark file at
/tenant/meta/deleted note over PS: Crash point 2. note over PS: During startup we reconcile
with remote and see
whether the remote mark exists alt Remote mark exists PS->>PS: create local mark if its missing PS->>PS: delete local files other than deleted mark loop Delete layers for each timeline PS->>S3: delete(..) end note over CP: Eventually console should
retry delete request CP->>PS: Retry delete tenant PS->>CP: Not modified else Mark is missing note over PS: Continue to operate the tenant as if deletion didn't happen note over CP: Eventually console should
retry delete request CP->>PS: Retry delete tenant PS->>S3: Create deleted mark file at
/tenant/meta/deleted PS->>CP: Delete tenant end PS->>PS: Continue with layer file deletions loop Delete layers for each timeline PS->>S3: delete(..) CP->>PS: Finished? PS->>CP: False end PS->>S3: Delete mark file PS->>PS: Delete local mark file CP->>PS: Finished? PS->>CP: True ``` Similar sequence applies when both local and remote marks were persisted but Control Plane still didn't receive a response. If pageserver crashes after both mark files were deleted then it will reply to control plane status poll request with 404 which should be treated by control plane as success. The same applies if pageserver crashes in the end, when remote mark is deleted but before local one gets deleted. In this case on restart pageserver moves forward with deletion of local mark and Control Plane will receive 404. ##### Differences between tenants and timelines For timeline the sequence is the same with the following differences: - remote delete mark file can be replaced with a boolean "deleted" flag in index_part.json - local deletion mark is not needed, because whole tenant is kept locally so situation described in motivation for local mark is impossible ##### Handle pageserver loss If pageserver is lost then the deleted tenant should be attached to different pageserver and delete request needs to be retried against new pageserver. Then attach logic is shared with one described for pageserver restarts (local deletion mark won't be available so needs to be created). ##### Restrictions for tenant that is in progress of being deleted I propose to add another state to tenant/timeline - PendingDelete. This state shouldn't allow executing any operations aside from polling the deletion status. #### Summary Pros: - Storage is not dependent on control plane. Storage can be restarted even if control plane is not working. - Allows for easier dogfooding, console can use Neon backed database as primary operational data store. 
If storage depends on control plane and control plane depends on storage we're stuck. - No need to share inner s3 workings with control plane. Pageserver presents api contract and S3 paths are not part of this contract. - No need to pass list of alive timelines to attach call. This will be solved by pageserver observing deleted flag. See Cons: - Logic is tricky, needs good testing - Anything else? ### 2. Control plane owns deletion machinery In this case the only action performed on pageserver is removal of local files. Everything else is done by control plane. The steps are as follows: 1. Control plane marks tenant as "delete pending" in its database 2. It lists the s3 for all the files and repeatedly calls delete until nothing is left behind 3. When no files are left marks deletion as completed In case of restart it selects all tenants marked as "delete pending" and continues the deletion. For tenants it is simple. For timelines there are caveats. Assume that the same workflow is used for timelines. If a tenant gets relocated during timeline deletion the attach call with its current logic will pick up deleted timeline in its half deleted state. Available options: - require list of alive timelines to be passed to attach call - use the same schema with flag in index_part.json (again part of the caveats around pageserver restart applies). In this case nothing stops pageserver from implementing deletion inside if we already have these deletion marks. With first option the following problem becomes apparent: Who is the source of truth regarding timeline liveness? Imagine: PS1 fails. PS2 gets assigned the tenant. New branch gets created PS1 starts up (is it possible or we just recycle it?) PS1 is unaware of the new branch. It can either fall back to s3 ls, or ask control plane. So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. 
If there is nothing on s3 that can answer that question storage needs to ask control plane. ### Summary Cons: - Potential thundering herd-like problem during storage restart (requests to control plane) - Potential increase in storage startup time (additional request to control plane) - Storage startup starts to depend on console - Erroneous attach call can attach tenant in half deleted state Pros: - Easier to reason about if you don't have to account for pageserver restarts ### Extra notes There was a concern that having deletion code in pageserver is a little bit scary, but we need to have this code somewhere. So to me it is equally scary to have that in whatever place it ends up at. Delayed deletion can be done with both approaches. As discussed with Anna (@stepashka) this is only relevant for tenants (projects) not for timelines. For first approach detach can be called immediately and deletion can be done later with attach + delete. With second approach control plane needs to start the deletion whenever necessary. ## Decision After discussion in comments I see that we settled on two options (though a bit different from ones described in rfc). First one is the same - pageserver owns as much as possible. The second option is that pageserver owns markers thing, but actual deletion happens in control plane by repeatedly calling ls + delete. To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise poking around integrating s3 library into control plane, configuring shared knowledge about paths in s3 - are the downsides. Another downside of relying on control plane is the testing process. Control plane resides in different repository so it is quite hard to test pageserver related changes there. e2e test suite there doesn't support shutting down pageservers, which are separate docker containers there instead of just processes. 
With pageserver owning everything we still give the retry logic to control plane but its easier to duplicate if needed compared to sharing inner s3 workings. We will have needed tests for retry logic in neon repo. So the decision is to proceed with pageserver centric approach. ================================================ FILE: docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md ================================================ # The state of pageserver tenant relocation Created on 17.03.23 ## Motivation There were previous write ups on the subject. The design of tenant relocation was planned at the time when we had quite different landscape. I e there was no on-demand download/eviction. They were on the horizon but we still planned for cases when they were not available. Some other things have changed. Now safekeepers offload wal to s3 so we're not risking overflowing their disks. Having all of the above, it makes sense to recap and take a look at the options we have now, which adjustments we'd like to make to original process, etc. Related (in chronological order): - Tracking issue with initial discussion: [#886](https://github.com/neondatabase/neon/issues/886) - [015. Storage Messaging](015-storage-messaging.md) - [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md) ## Summary The RFC consists of a walkthrough of prior art on tenant relocation and corresponding problems. It describes 3 approaches. 1. Simplistic approach that uses ignore and is the fastest to implement. The main downside is a requirement of short downtime. 2. More complicated approach that avoids even short downtime. 3. Even more complicated approach that will allow multiple pageservers to operate concurrently on the same tenant possibly allowing for HA cluster topologies and horizontal scaling of reads (i e compute talks to multiple pageservers). The order in which solutions are described is a bit different. 
We start from 2, then move to possible compromises (aka simplistic approach) and then move to discussing directions for solving HA/Pageserver replica case with 3. ## Components pageserver, control-plane, safekeepers (a bit) ## Requirements Relocation procedure should move tenant from one pageserver to another without downtime introduced by storage side. For now restarting compute for applying new configuration is fine. - component restarts - component outage - pageserver loss ## The original proposed implementation The starting point is this sequence: ```mermaid sequenceDiagram autonumber participant CP as Control Plane participant PS1 as Pageserver 1 participant PS2 as Pageserver 2 participant S3 CP->>PS2: Attach tenant X PS2->>S3: Fetch timelines, indexes for them PS2->>CP: Accepted CP->>CP: Change pageserver id in project CP->>PS1: Detach ``` Which problems do we have with naive approach? ### Concurrent GC and Compaction The problem is that they can run on both, PS1 and PS2. Consider this example from [Pageserver S3 Coordination RFC](020-pageserver-s3-coordination.md) ```mermaid sequenceDiagram autonumber participant PS1 participant S3 participant PS2 PS1->>S3: Uploads L1, L2
Index contains L1 L2 PS2->>S3: Attach called, sees L1, L2 PS1->>S3: Compaction comes
Removes L1, adds L3 note over S3: Index now L2, L3 PS2->>S3: Uploads new layer L4
(added to previous view of the index) note over S3: Index now L1, L2, L4 ``` At this point it is not possible to restore the state from index, it contains L2 which is no longer available in s3 and doesn't contain L3 added by compaction by the first pageserver. So if any of the pageservers restart, initial sync will fail (or in on-demand world it will fail a bit later during page request from missing layer) The problem lies in shared index_part.json. Having intersecting layers from append only edits is expected to work, though this is an uncharted territory without tests. #### Options There are several options on how to restrict concurrent access to index file. First and the simplest one is external orchestration. Control plane which runs migration can use special api call on pageserver to stop background processes (gc, compaction), and even possibly all uploads. So the sequence becomes: ```mermaid sequenceDiagram autonumber participant CP as Control Plane participant PS1 as Pageserver 1 participant PS2 as Pageserver 2 participant S3 CP->>PS1: Pause background jobs, pause uploading new layers. CP->>PS2: Attach tenant X. PS2->>S3: Fetch timelines, index, start background operations PS2->>CP: Accepted CP->>CP: Monitor PS2 last record lsn, ensure OK lag CP->>CP: Change pageserver id in project CP->>PS1: Detach ``` The downside of this sequence is the potential rollback process. What if something goes wrong on new pageserver? Can we safely roll back to source pageserver? There are two questions: #### How can we detect that something went wrong? We can run usual availability check (consists of compute startup and an update of one row). Note that we cant run separate compute for that before touching compute that client runs actual workload on, because we cant have two simultaneous computes running in read-write mode on the same timeline (enforced by safekeepers consensus algorithm). 
So we can either run some readonly check first (basebackup) and then change pageserver id and run availability check. If it failed we can roll it back to the old one. #### What can go wrong? And how we can safely roll-back? In the sequence above during attach we start background processes/uploads. They change state in remote storage so it is possible that after rollback remote state will be different from one that was observed by source pageserver. So if target pageserver goes wild then source pageserver may fail to start with changed remote state. Proposed option would be to implement a barrier (read-only) mode when pageserver does not update remote state. So the sequence for happy path becomes this one: ```mermaid sequenceDiagram autonumber participant CP as Control Plane participant PS1 as Pageserver 1 participant PS2 as Pageserver 2 participant S3 CP->>PS1: Pause background jobs, pause uploading new layers. CP->>PS2: Attach tenant X in remote readonly mode. PS2->>S3: Fetch timelines, index PS2->>CP: Accepted CP->>CP: Monitor PS2 last record lsn, ensure OK lag CP->>CP: Change pageserver id in project CP->>CP: Run successful availability check CP->>PS2: Start uploads, background tasks CP->>PS1: Detach ``` With this sequence we restrict any changes to remote storage to one pageserver. So there is no concurrent access at all, not only for index_part.json, but for everything else too. This approach makes it possible to roll back after failure on new pageserver. The sequence with roll back process: ```mermaid sequenceDiagram autonumber participant CP as Control Plane participant PS1 as Pageserver 1 participant PS2 as Pageserver 2 participant S3 CP->>PS1: Pause background jobs, pause uploading new layers. CP->>PS2: Attach tenant X in remote readonly mode. 
PS2->>S3: Fetch timelines, index PS2->>CP: Accepted CP->>CP: Monitor PS2 last record lsn, ensure OK lag CP->>CP: Change pageserver id in project CP->>CP: Availability check Failed CP->>CP: Change pageserver id back CP->>PS1: Resume remote operations CP->>PS2: Ignore (instead of detach for investigation purposes) ``` ## Concurrent branch creation Another problem is a possibility of concurrent branch creation calls. I e during migration create_branch can be called on old pageserver and newly created branch wont be seen on new pageserver. Prior art includes prototyping an approach of trying to mirror such branches, but currently it lost its importance, because now attach is fast because we don't need to download all data, and additionally to the best of my knowledge of control plane internals (cc @ololobus to confirm) operations on one project are executed sequentially, so it is not possible to have such case. So branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state. ## Simplistic approach The difference of simplistic approach from one described above is that it calls ignore on source tenant first and then calls attach on target pageserver. Approach above does it in opposite order thus opening a possibility for race conditions we strive to avoid. The approach largely follows this guide: The happy path sequence: ```mermaid sequenceDiagram autonumber participant CP as Control Plane participant PS1 as Pageserver 1 participant PS2 as Pageserver 2 participant SK as Safekeeper participant S3 CP->>CP: Enable maintenance mode CP->>PS1: Ignore CP->>PS2: Attach PS2->>CP: Accepted loop Delete layers for each timeline CP->>PS2: Get last record lsn CP->>SK: Get commit lsn CP->>CP: OK? Timed out? 
end CP->>CP: Change pageserver id in project CP->>CP: Run successful availability check CP->>CP: Disable maintenance mode CP->>PS1: Detach ignored ``` The sequence contains exactly the same rollback problems as in previous approach described above. They can be resolved the same way. Most probably we'd like to move forward without this safety measure and implement it on top of this approach to make progress towards the downtime-less one. ## Lease based approach In order to allow for concurrent operation on the same data on remote storage for multiple pageservers we need to go further than external orchestration. NOTE: [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md) discusses one more approach that relies on duplication of index_part.json for each pageserver operating on the timeline. This approach still requires external coordination which makes certain things easier but requires additional bookkeeping to account for multiple index_part.json files. Discussion/comparison with proposed lease based approach The problems are outlined in [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md) and suggested solution includes [Coordination based approach](020-pageserver-s3-coordination.md#coordination-based-approach). This way it will allow to do basic leader election for pageservers so they can decide which node will be responsible for running GC and compaction. The process is based on extensive communication via storage broker and consists of a lease that is taken by one of the pageservers that extends it to continue serving a leader role. There are two options for ingesting new data into pageserver in follower role. One option is to avoid WAL ingestion at all and rely on notifications from leader to discover new layers on s3. Main downside of this approach is that follower will always lag behind the primary node because it wont have the last layer until it is uploaded to remote storage. 
In case of a primary failure follower will be required to reingest last segment (up to 256Mb of WAL currently) which slows down recovery. Additionally if compute is connected to follower pageserver it will observe latest data with a delay. Queries from compute will likely experience bigger delays when recent lsn is required. The second option is to consume WAL stream on both pageservers. In this case the only problem is non deterministic layer generation. Additional bookkeeping will be required to deduplicate layers from primary with local ones. Some process needs to somehow merge them to remove duplicated data. Additionally we need to have good testing coverage to ensure that our implementation of `get_page@lsn` properly handles intersecting layers. There is another tradeoff. Approaches may be different in amount of traffic between system components. With first approach there can be increased traffic between follower and remote storage. But only in case follower has some activity that actually requests pages (!). With other approach traffic increase will be permanent and will be caused by two WAL streams instead of one. ## Summary Proposed implementation strategy: Go with the simplest approach for now. Then work on tech debt, increase test coverage. Then gradually move forward to second approach by implementing safety measures first, finishing with switch of order between ignore and attach operation. And only then go to lease based approach to solve HA/Pageserver replica use cases. ================================================ FILE: docs/rfcs/024-extension-loading.md ================================================ # Supporting custom user Extensions (Dynamic Extension Loading) Created 2023-05-03 ## Motivation There are many extensions in the PostgreSQL ecosystem, and not all extensions are of a quality that we can confidently support them. 
Additionally, our current extension inclusion mechanism has several problems because we build all extensions into the primary Compute image: We build the extensions every time we build the compute image regardless of whether we actually need to rebuild the image, and the inclusion of these extensions in the image adds a hard dependency on all supported extensions - thus increasing the image size, and with it the time it takes to download that image - increasing first start latency. This RFC proposes a dynamic loading mechanism that solves most of these problems. ## Summary `compute_ctl` is made responsible for loading extensions on-demand into the container's file system for dynamically loaded extensions, and will also make sure that the extensions in `shared_preload_libraries` are downloaded before the compute node starts. ## Components compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store ## Requirements Compute nodes with no extra extensions should not be negatively impacted by the existence of support for many extensions. Installing an extension into PostgreSQL should be easy. Non-preloaded extensions shouldn't impact startup latency. Uninstalled extensions shouldn't impact query latency. A small latency penalty for dynamically loaded extensions is acceptable in the first seconds of compute startup, but not in steady-state operations. ## Proposed implementation ### On-demand, JIT-loading of extensions Before postgres starts we download - control files for all extensions available to that compute node; - all `shared_preload_libraries`; After postgres is running, `compute_ctl` listens for requests to load files. When PostgreSQL requests a file, `compute_ctl` downloads it. 
PostgreSQL requests files in the following cases: - When loading a preload library set in `local_preload_libraries` - When explicitly loading a library with `LOAD` - When creating extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files))) #### Summary Pros: - Startup is only as slow as it takes to load all (shared_)preload_libraries - Supports BYO Extension Cons: - O(sizeof(extensions)) IO requirement for loading all extensions. ### Alternative solutions 1. Allow users to add their extensions to the base image Pros: - Easy to deploy Cons: - Doesn't scale - first start size is dependent on image size; - All extensions are shared across all users: It doesn't allow users to bring their own restrictive-licensed extensions 2. Bring Your Own compute image Pros: - Still easy to deploy - User can bring own patched version of PostgreSQL Cons: - First start latency is O(sizeof(extensions image)) - Warm instance pool for skipping pod schedule latency is not feasible with O(n) custom images - Support channels are difficult to manage 3. Download all user extensions in bulk on compute start Pros: - Easy to deploy - No startup latency issues for "clean" users. - Warm instance pool for skipping pod schedule latency is possible Cons: - Downloading all extensions in advance takes a lot of time, thus startup latency issues 4. Store user's extensions in persistent storage Pros: - Easy to deploy - No startup latency issues - Warm instance pool for skipping pod schedule latency is possible Cons: - EC2 instances have only limited number of attachments shared between EBS volumes, direct-attached NVMe drives, and ENIs. - Compute instance migration isn't trivially solved for EBS mounts (e.g. the device is unavailable whilst moving the mount between instances). - EBS can only mount on one instance at a time (except the expensive IO2 device type). 5. 
Store user's extensions in network drive Pros: - Easy to deploy - Few startup latency issues - Warm instance pool for skipping pod schedule latency is possible Cons: - We'd need networked drives, and a lot of them, which would store many duplicate extensions. - **UNCHECKED:** Compute instance migration may not work nicely with networked IOs ### Idea extensions The extension store does not have to be S3 directly, but could be a Node-local caching service on top of S3. This would reduce the load on the network for popular extensions. ## Extension Storage implementation The layout of the S3 bucket is as follows: ``` 5615610098 // this is an extension build number ├── v14 │   ├── extensions │   │   ├── anon.tar.zst │   │   └── embedding.tar.zst │   └── ext_index.json └── v15 ├── extensions │   ├── anon.tar.zst │   └── embedding.tar.zst └── ext_index.json 5615261079 ├── v14 │   ├── extensions │   │   └── anon.tar.zst │   └── ext_index.json └── v15 ├── extensions │   └── anon.tar.zst └── ext_index.json 5623261088 ├── v14 │   ├── extensions │   │   └── embedding.tar.zst │   └── ext_index.json └── v15 ├── extensions │   └── embedding.tar.zst └── ext_index.json ``` Note that build number cannot be part of prefix because we might need extensions from other build numbers. `ext_index.json` stores the control files and location of extension archives. It also stores a list of public extensions and a library_index We don't need to duplicate `extension.tar.zst`` files. We only need to upload a new one if it is updated. 
(Although currently we just upload every time anyways, hopefully will change this sometime) *access* is controlled by spec More specifically, here is an example ext_index.json ``` { "public_extensions": [ "anon", "pg_buffercache" ], "library_index": { "anon": "anon", "pg_buffercache": "pg_buffercache" // for more complex extensions like postgis // we might have something like: // address_standardizer: postgis // postgis_tiger: postgis }, "extension_data": { "pg_buffercache": { "control_data": { "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true" }, "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst" }, "anon": { "control_data": { "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n" }, "archive_path": "5670669815/v14/extensions/anon.tar.zst" } } } ``` ### How to add new extension to the Extension Storage? Simply upload build artifacts to the S3 bucket. Implement a CI step for that. Splitting it from compute-node-image build. ### How do we deal with extension versions and updates? Currently, we rebuild extensions on every compute-node-image build and store them in the prefix. This is needed to ensure that `/share` and `/lib` files are in sync. For extension updates, we rely on the PostgreSQL extension versioning mechanism (sql update scripts) and extension authors to not break backwards compatibility within one major version of PostgreSQL. ### Alternatives For extensions written on trusted languages we can also adopt `dbdev` PostgreSQL Package Manager based on `pg_tle` by Supabase. 
This will increase the number of supported extensions and decrease the amount of work required to support them. ================================================ FILE: docs/rfcs/024-user-mgmt.md ================================================ # Postgres user and database management (This supersedes the previous proposal that looked too complicated and desynchronization-prone) We've accumulated a bunch of problems with our approach to role and database management, namely: 1. we don't allow role and database creation from Postgres, and users are complaining about that 2. fine-grained role management is not possible both from Postgres and console Right now, we do store users and databases both in console and Postgres, and there are two main reasons for that: * we want to be able to authenticate users in proxy against the console without Postgres' involvement. Otherwise, malicious brute force attempts will wake up Postgres (expensive) and may exhaust the Postgres connections limit (denial of service). * it is handy when we can render console UI without waking up compute (e.g., show database list) This RFC doesn't talk about giving root access to the database, which is blocked by a secure runtime setup. ## Overview * Add Postgres extension that sends an HTTP request each time transaction that modifies users/databases is about to commit. * Add user management API to internal console API. Also, the console should put a JWT token into the compute so that it can access management API. ## Postgres behavior The default user role (@username) should have `CREATE ROLE`, `CREATE DB`, and `BYPASSRLS` privileges. We expose the Postgres port to the open internet, so we need to check password strength. Now console generates strong passwords, so there is no risk of having dumb passwords. With user-provided passwords, such risks exist. Since we store passwords in the console we should also send unencrypted password when role is created/changed. 
Hence communication with the console must be encrypted. Postgres also supports creating roles using hashes, in that case, we will not be able to get a raw password. So I can see the following options here: * roles created via SQL will *not* have raw passwords in the console * roles created via SQL will have raw passwords in the console, except ones that were created using hashes I'm leaning towards the second option here as it is a bit more consistent one -- if raw password storage is enabled then we store passwords in all cases where we can store them. To send data about roles and databases from Postgres to the console we can create the following Postgres extension: * Intercept role/database changes in `ProcessUtility_hook`. Here we have access to the query statement with the raw password. The hook handler itself should not dial the console immediately and rather stash info in some hashmap for later use. * When the transaction is about to commit we execute collected role modifications (all as one -- console should either accept all or reject all, and hence API shouldn't be REST-like). If the console request fails we can roll back the transaction. This way if the transaction is committed we know for sure that console has this information. We can use `XACT_EVENT_PRE_COMMIT` and `XACT_EVENT_PARALLEL_PRE_COMMIT` for that. * Extension should be mindful of the fact that it is possible to create and delete roles within the transaction. * We also need to track who is database owner, some coding around may be needed to get the current user when the database is created. ## Console user management API The current public API has REST API for role management. We need to have some analog for the internal API (called mgmt API in the console code). But unlike public API here we want to have an atomic way to create several roles/databases (in cases when several roles were created in the same transaction). 
So something like that may work: ``` curl -X PATCH /api/v1/roles_and_databases -d ' [ {"op":"create", "type":"role", "name": "kurt", "password":"lYgT3BlbkFJ2vBZrqv"}, {"op":"drop", "type":"role", "name": "trout"}, {"op":"alter", "type":"role", "name": "kilgore", "password":"3BlbkFJ2vB"}, {"op":"create", "type":"database", "name": "db2", "owner": "eliot"}, ] ' ``` Makes sense not to error out on duplicated create/delete operations (see failure modes) ## Managing users from the console Now console puts a spec file with the list of databases/roles and delta operations in all the compute pods. `compute_ctl` then picks up that file and stubbornly executes deltas and checks data in the spec file is the same as in the Postgres. This way if the user creates a role in the UI we restart compute with a new spec file and during the start databases/roles are created. So if Postgres send an HTTP call each time role is created we need to break recursion in that case. We can do that based on application_name or some GUC or user (local == no HTTP hook). Generally, we have several options when we are creating users via console: 1. restart compute with a new spec file, execute local SQL command; cut recursion in the extension 2. "push" spec files into running compute, execute local SQL command; cut recursion in the extension 3. "push" spec files into running compute, execute local SQL command; let extension create those roles in the console 4. avoid managing roles via spec files, send SQL commands to compute; let extension create those roles in the console The last option is the most straightforward one, but with the raw password storage opt-out, we will not have the password to establish an SQL connection. Also, we need a spec for provisioning purposes and to address potential desync (but that is quite unlikely). So I think the easiest approach would be: 1. keep role management like it is now and cut the recursion in the extension when SQL is executed by compute_ctl 2. 
add "push" endpoint to the compute_ctl to avoid compute restart during the `apply_config` operation -- that can be done as a follow up to avoid increasing scope too much ## Failure modes * during role creation via SQL role was created in the console but the connection was dropped before Postgres got acknowledgment or some error happened after acknowledgment (out of disk space, deadlock, etc): in that case, Postgres won't have a role that exists in the console. Compute restart will heal it (due to the spec file). Also if the console allows repeated creation/deletion user can repeat the transaction. # Scalability On my laptop, I can create 4200 roles per second. That corresponds to 363 million roles per day. Since each role creation ends up in the console database we can add some limit to the number of roles (could be reasonably big to not run into it often -- like 1k or 10k). ================================================ FILE: docs/rfcs/025-generation-numbers.md ================================================ # Pageserver: split-brain safety for remote storage through generation numbers ## Summary A scheme of logical "generation numbers" for tenant attachment to pageservers is proposed, along with changes to the remote storage format to include these generation numbers in S3 keys. Using the control plane as the issuer of these generation numbers enables strong anti-split-brain properties in the pageserver cluster without implementing a consensus mechanism directly in the pageservers. ## Motivation Currently, the pageserver's remote storage format does not provide a mechanism for addressing split brain conditions that may happen when replacing a node or when migrating a tenant from one pageserver to another. From a remote storage perspective, a split brain condition occurs whenever two nodes both think they have the same tenant attached, and both can write to S3. This can happen in the case of a network partition, pathologically long delays (e.g. 
suspended VM), or software bugs. In the current deployment model, control plane guarantees that a tenant is attached to one pageserver at a time, thereby ruling out split-brain conditions resulting from dual attachment (however, there is always the risk of a control plane bug). This control plane guarantee prevents robust response to failures, as if a pageserver is unresponsive we may not detach from it. The mechanism in this RFC fixes this, by making it safe to attach to a new, different pageserver even if an unresponsive pageserver may be running. Further, lack of safety during split-brain conditions blocks two important features where occasional split-brain conditions are part of the design assumptions: - seamless tenant migration ([RFC PR](https://github.com/neondatabase/neon/pull/5029)) - automatic pageserver instance failure handling (aka "failover") (RFC TBD) ### Prior art - 020-pageserver-s3-coordination.md - 023-the-state-of-pageserver-tenant-relocation.md - 026-pageserver-s3-mvcc.md This RFC has broad similarities to the proposal to implement a MVCC scheme in S3 object names, but this RFC avoids a general purpose transaction scheme in favour of more specialized "generations" that work like a transaction ID that always has the same lifetime as a pageserver process or tenant attachment, whichever is shorter. ## Requirements - Accommodate storage backends with no atomic or fencing capability (i.e. work within S3's limitation that there are no atomics and clients can't be fenced) - Don't depend on any STONITH or node fencing in the compute layer (i.e. we will not assume that we can reliably kill an EC2 instance and have it die) - Scoped per-tenant, not per-pageserver; for _seamless tenant migration_, we need per-tenant granularity, and for _failover_, we likely want to spread the workload of the failed pageserver instance to a number of peers, rather than monolithically moving the entire workload to another machine.
We do not rule out the latter case, but should not constrain ourselves to it. ## Design Tenets These are not requirements, but are ideas that guide the following design: - Avoid implementing another consensus system: we already have a strongly consistent database in the control plane that can do atomic operations where needed, and we also have a Paxos implementation in the safekeeper. - Avoid locking in to specific models of how failover will work (e.g. do not assume that all the tenants on a pageserver will fail over as a unit). - Be strictly correct when it comes to data integrity. Occasional failures of availability are tolerable, occasional data loss is not. ## Non Goals The changes in this RFC intentionally isolate the design decision of how to define logical generation numbers and object storage format in a way that is somewhat flexible with respect to how actual orchestration of failover works. This RFC intentionally does not cover: - Failure detection - Orchestration of failover - Standby modes to keep data ready for fast migration - Intentional multi-writer operation on tenants (multi-writer scenarios are assumed to be transient split-brain situations). - Sharding. The interaction between this RFC and those features is discussed in [Appendix B](#appendix-b-interoperability-with-other-features) ## Impacted Components pageserver, control plane, safekeeper (optional) ## Implementation Part 1: Correctness ### Summary - A per-tenant **generation number** is introduced to uniquely identify tenant attachments to pageserver processes. - This generation number increments each time the control plane modifies a tenant (`Project`)'s assigned pageserver, or when the assigned pageserver restarts. - the control plane is the authority for generation numbers: only it may increment a generation number.
- **Object keys are suffixed** with the generation number - **Safety for multiply-attached tenants** is provided by the generation number in the object key: the competing pageservers will not try to write to the same keys. - **Safety in split brain for multiple nodes running with the same node ID** is provided by the pageserver calling out to the control plane on startup, to re-attach and thereby increment the generations of any attached tenants - **Safety for deletions** is achieved by deferring the DELETE from S3 to a point in time where the deleting node has validated with control plane that no attachment with a higher generation has a reference to the to-be-DELETEd key. - **The control plane is used to issue generation numbers** to avoid the need for a built-in consensus system in the pageserver, although this could in principle be changed without changing the storage format. ### Generation numbers A generation number is associated with each tenant in the control plane, and each time the attachment status of the tenant changes, this is incremented. Changes in attachment status include: - Attaching the tenant to a different pageserver - A pageserver restarting, and "re-attaching" its tenants on startup These increments of attachment generation provide invariants we need to avoid split-brain issues in storage: - If two pageservers have the same tenant attached, the attachments are guaranteed to have different generation numbers, because the generation would increment while attaching the second one. - If there are multiple pageservers running with the same node ID, all the attachments on all pageservers are guaranteed to have different generation numbers, because the generation would increment when the second node started and re-attached its tenants. As long as the infrastructure does not transparently replace an underlying physical machine, we are totally safe. See the later [unsafe case](#unsafe-case-on-badly-behaved-infrastructure) section for details. 
### Object Key Changes #### Generation suffix All object keys (layer objects and index objects) will contain the attachment generation as a [suffix](#why-a-generation-suffix-rather-than-prefix). This suffix is the primary mechanism for protecting against split-brain situations, and enabling safe multi-attachment of tenants: - Two pageservers running with the same node ID (e.g. after a failure, where there is some rogue pageserver still running) will not try to write to the same objects, because at startup they will have re-attached tenants and thereby incremented generation numbers. - Multiple attachments (to different pageservers) of the same tenant will not try to write to the same objects, as each attachment would have a distinct generation. The generation is appended in hex format (8 byte string representing u32), to all our existing key names. A u32's range limit would permit 27 restarts _per second_ over a 5 year system lifetime: orders of magnitude more than is realistic. The exact meaning of the generation suffix can evolve over time if necessary, for example if we chose to implement a failover mechanism internally to the pageservers rather than going via the control plane. The storage format just sees it as a number, with the only semantic property being that the highest numbered index is the latest. #### Index changes Since object keys now include a generation suffix, the index of these keys must also be updated. IndexPart currently stores keys and LSNs sufficient to reconstruct key names: this would be extended to store the generation as well. This will increase the size of the file, but only modestly: layers are already encoded as their string-ized form, so the overhead is about 10 bytes per layer. This will be less if/when the index storage format is migrated to a binary format from JSON. 
#### Visibility _This section doesn't describe code changes, but extends on the consequences of the object key changes given above_ ##### Visibility of objects to pageservers Pageservers can of course list objects in S3 at any time, but in practice their visible set is based on the contents of their LayerMap, which is initialized from the `index_part.json.???` that they load. Starting with the `index_part` from the most recent previous generation (see [loading index_part](#finding-the-remote-indices-for-timelines)), a pageserver initially has visibility of all the objects that were referenced in the loaded index. These objects are guaranteed to remain visible until the current generation is superseded, via pageservers in older generations avoiding deletions (see [deletion](#deletion)). The "most recent previous generation" is _not_ necessarily the most recent in terms of walltime, it is the one that is readable at the time a new generation starts. Consider the following sequence of a tenant being re-attached to different pageserver nodes: - Create + attach on PS1 in generation 1 - PS1 Do some work, write out index_part.json-0001 - Attach to PS2 in generation 2 - Read index_part.json-0001 - PS2 starts doing some work... - Attach to PS3 in generation 3 - Read index_part.json-0001 - **...PS2 finishes its work: now it writes index_part.json-0002** - PS3 writes out index_part.json-0003 In the above sequence, the ancestry of indices is: ``` 0001 -> 0002 | -> 0003 ``` This is not an issue for safety: if the 0002 references some object that is not in 0001, then 0003 simply does not see it, and will re-do whatever work was required (e.g. ingesting WAL or doing compaction). Objects referenced by only the 0002 index will never be read by future attachment generations, and will eventually be cleaned up by a scrub (see [scrubbing](#cleaning-up-orphan-objects-scrubbing)). 
##### Visibility of LSNs to clients Because index_part.json is now written with a generation suffix, which data is visible depends on which generation the reader is operating in: - If one was passively reading from S3 from outside of a pageserver, the visibility of data would depend on which `index_part.json-<generation>` file one had chosen to read from. - If two pageservers have the same tenant attached, they may have different data visible as they're independently replaying the WAL, and maintaining independent LayerMaps that are written to independent index_part.json files. Data does not have to be remotely committed to be visible. - For a pageserver writing with a stale generation, historic LSNs remain readable until another pageserver (with a higher generation suffix) decides to execute GC deletions. At this point, we may think of the stale attachment's generation as having logically ended: during its existence the generation had a consistent view of the world. - For a newly attached pageserver, its highest visible LSN may appear to go backwards with respect to an earlier attachment, if that earlier attachment had not uploaded all data to S3 before the new attachment. ### Deletion #### Generation number validation While writes are de-conflicted by writers always using their own generation number in the key, deletions are slightly more challenging: if a pageserver A is isolated, and the true active node is pageserver B, then it is dangerous for A to do any object deletions, even of objects that it wrote itself, because pageserver B's metadata might reference those objects. We solve this by inserting a "generation validation" step between the write of a remote index that un-links a particular object from the index, and the actual deletion of the object, such that deletions strictly obey the following ordering: 1. Write out index_part.json: this guarantees that any subsequent reader of the metadata will not try and read the object we unlinked. 2.
Call out to control plane to validate that the generation which we use for our attachment is still the latest. 3. If step 2 passes, it is safe to delete the object. Why? The check-in with control plane together with our visibility rules guarantees that any later generation will use either the exact `index_part.json` that we uploaded in step 1, or a successor of it; not an earlier one. In both cases, the `index_part.json` doesn't reference the key we are deleting anymore, so, the key is invisible to any later attachment generation. Hence it's safe to delete it. Note that at step 2 we are only confirming that deletions of objects _no longer referenced by the specific `index_part.json` written in step 1_ are safe. If we were attempting other deletions concurrently, these would need their own generation validation step. If step 2 fails, we may leak the object. This is safe, but has a cost: see [scrubbing](#cleaning-up-orphan-objects-scrubbing). We may avoid this entirely outside of node failures, if we do proper flushing of deletions on clean shutdown and clean migration. To avoid doing a huge number of control plane requests to perform generation validation, validation of many tenants will be done in a single request, and deletions will be queued up prior to validation: see [Persistent deletion queue](#persistent-deletion-queue) for more. #### `remote_consistent_lsn` updates Remote objects are not the only kind of deletion the pageserver does: it also indirectly deletes WAL data, by feeding back remote_consistent_lsn to safekeepers, as a signal to the safekeepers that they may drop data below this LSN. For the same reasons that deletion of objects must be guarded by an attachment generation number validation step, updates to `remote_consistent_lsn` are subject to the same rules, using an ordering as follows: 1. upload the index_part that covers data up to LSN `L0` to S3 2. 
Call out to control plane to validate that the generation which we use for our attachment is still the latest. 3. advance the `remote_consistent_lsn` that we advertise to the safekeepers to `L0` If step 2 fails, then the `remote_consistent_lsn` advertised to safekeepers will not advance again until a pageserver with the latest generation is ready to do so. **Note:** at step 3 we are not advertising the _latest_ remote_consistent_lsn, we are advertising the value in the index_part that we uploaded in step 1. This provides a strong ordering guarantee. Internally to the pageserver, each timeline will have two remote_consistent_lsn values: the one that reflects its latest write to remote storage, and the one that reflects the most recent validation of generation number. It is only the latter value that may be advertised to the outside world (i.e. to the safekeeper). The control plane remains unaware of `remote_consistent_lsn`: it only has to validate the freshness of generation numbers, thereby granting the pageserver permission to share the information with the safekeeper. For convenience, in subsequent sections and RFCs we will use "deletion" to mean both deletion of objects in S3, and updates to the `remote_consistent_lsn`, as updates to the remote consistent LSN are de-facto deletions done via the safekeeper, and both kinds of deletion are subject to the same generation validation requirement. ### Pageserver attach/startup changes #### Attachment Calls to `/v1/tenant/{tenant_id}/attach` are augmented with an additional `generation` field in the body. The pageserver does not persist this: a generation is only good for the lifetime of a process. #### Finding the remote indices for timelines Because index files are now suffixed with generation numbers, the pageserver cannot always GET the remote index in one request, because it can't always know a-priori what the latest remote index is. 
Typically, the most recent generation to write an index would be our own generation minus 1. However, this might not be the case: the previous node might have started and acquired a generation number, and then crashed before writing out a remote index. In the general case and as a fallback, the pageserver may list all the `index_part.json` files for a timeline, sort them by generation, and pick the highest that is `<=` its current generation for this attachment. The tenant should never load an index with an attachment generation _newer_ than its own. These two rules combined ensure that objects written by later generations are never visible to earlier generations. Note that if a given attachment picks an index part from an earlier generation (say n-2), but crashes & restarts before it writes its own generation's index part, next time it tries to pick an index part there may be an index part from generation n-1. It would pick the n-1 index part in that case, because it's sorted higher than the previous one from generation n-2. So, the above rules do not guarantee determinism in selecting the index part. Tenants are allowed to be attached with stale attachment generations during a multiply-attached phase in a migration, and in this instance if the old location's pageserver restarts, it should not try and load the newer generation's index. To summarize, on starting a timeline, the pageserver will: 1. Issue a GET for `index_part.json-<generation>` 2. If 1 failed, issue a ListObjectsV2 request for index_part.json\* and pick the newest. One could optimize this further by using the control plane to record specifically which generation most recently wrote an index_part.json, if necessary, to increase the probability of finding the index_part.json in one GET. One could also improve the chances by having pageservers proactively write out index_part.json after they get a new generation ID.
#### Re-attachment on startup On startup, the pageserver will call out to an new control plane `/re-attach` API (see [Generation API](#generation-api)). This returns a list of tenants that should be attached to the pageserver, and their generation numbers, which the control plane will increment before returning. The pageserver should still scan its local disk on startup, but should _delete_ any local content for tenants not indicated in the `/re-attach` response: their absence is an implicit detach operation. **Note** if a tenant is omitted from the re-attach response, its local disk content will be deleted. This will change in subsequent work, when the control plane gains the concept of a secondary/standby location: a node with local content may revert to this status and retain some local content. #### Cleaning up previous generations' remote indices Deletion of old indices is not necessary for correctness, although it is necessary to avoid the ListObjects fallback in the previous section becoming ever more expensive. Once the new attachment has written out its index_part.json, it may asynchronously clean up historic index_part.json objects that were found. We may choose to implement this deletion either as an explicit step after we write out index_part for the first time in a pageserver's lifetime, or for simplicity just do it periodically as part of the background scrub (see [scrubbing](#cleaning-up-orphan-objects-scrubbing)); ### Control Plane Changes #### Store generations for attaching tenants - The `Project` table must store the generation number for use when attaching the tenant to a new pageserver. - The `/v1/tenant/:tenant_id/attach` pageserver API will require the generation number, which the control plane can supply by simply incrementing the `Project`'s generation number each time the tenant is attached to a different server: the same database transaction that changes the assigned pageserver should also change the generation number. 
#### Generation API This section describes an API that could be provided directly by the control plane, or built as a separate microservice. In earlier parts of the RFC, when we discuss the control plane providing generation numbers, we are referring to this API. The API endpoints used by the pageserver to acquire and validate generation numbers are quite simple, and only require access to some persistent and linearizable storage (such as a database). Building this into the control plane is proposed as a least-effort option to exploit existing infrastructure and implement generation number issuance in the same transaction that mandates it (i.e., the transaction that updates the `Project` assignment to another pageserver). However, this is not mandatory: this "Generation Number Issuer" could be built as a microservice. In practice, we will write such a miniature service anyway, to enable E2E pageserver/compute testing without control plane. The endpoints required by pageservers are: ##### `/re-attach` - Request: `{node_id: <node_id>}` - Response: - 200 `{tenants: [{id: <tenant_id>, gen: <generation>}]}` - 404: unknown node_id - (Future: 429: flapping detected, perhaps nodes are fighting for the same node ID, or perhaps this node was in a retry loop) - (On unknown tenants, omit tenant from `tenants` array) - Server behavior: query database for which tenants should be attached to this pageserver. - for each tenant that should be attached, increment the attachment generation and include the new generation in the response - Client behavior: - for all tenants in the response, activate with the new generation number - for any local disk content _not_ referenced in the response, act as if we had been asked to detach it (i.e. delete local files) **Note** the `node_id` in this request will change in future if we move to ephemeral node IDs, to be replaced with some correlation ID that helps the control plane realize if a process is running with the same storage as a previous pageserver process (e.g.
we might use EC2 instance ID, or we might just write some UUID to the disk the first time we use it) ##### `/validate` - Request: `{'tenants': [{tenant: <tenant_id>, attach_gen: <generation>}, ...]}` - Response: - 200 `{'tenants': [{tenant: <tenant_id>, status: <bool>}...]}` - (On unknown tenants, omit tenant from `tenants` array) - Purpose: enable the pageserver to discover for the given attachments whether they are still the latest. - Server behavior: this is a read-only operation: simply compare the generations in the request with the generations known to the server, and set status to `true` if they match. - Client behavior: clients must not do deletions within a tenant's remote data until they have received a response indicating the generation they hold for the attachment is current. #### Use of `/load` and `/ignore` APIs Because the pageserver will be changed to only attach tenants on startup based on the control plane's response to a `/re-attach` request, the load/ignore APIs no longer make sense in their current form. The `/load` API becomes functionally equivalent to attach, and will be removed: any location that used `/load` before should just attach instead. The `/ignore` API is equivalent to detaching, but without deleting local files. ### Timeline/Branch creation & deletion All of the previous arguments for safety have described operations within a timeline, where we may describe a sequence that includes updates to index_part.json, and where reads and writes are coming from a postgres endpoint (writes via the safekeeper). Creating or destroying a timeline is a bit different, because writes are coming from the control plane. We must be safe against scenarios such as: - A tenant is attached to pageserver B while pageserver A is in the middle of servicing an RPC from the control plane to create or delete a tenant. - A pageserver A has been sent a timeline creation request but becomes unresponsive.
The tenant is attached to a different pageserver B, and the timeline creation request is sent there too. #### Timeline Creation If some very slow node tries to do a timeline creation _after_ a more recent generation node has already created the timeline and written some data into it, that must not cause harm. This is provided in timeline creations by the way all the objects within the timeline's remote path include a generation suffix: a slow node in an old generation that attempts to "create" a timeline that already exists will just emit an index_part.json with an old generation suffix. Timeline IDs are never reused, so we don't have to worry about the case of create/delete/create cycles. If they were re-used during a disaster recovery "un-delete" of a timeline, that special case can be handled by calling out to all available pageservers to check that they return 404 for the timeline, and to flush their deletion queues in case they had any deletions pending from the timeline. The above makes it safe for control plane to change the assignment of tenant to pageserver in control plane while a timeline creation is ongoing. The reason is that the creation request against the new assigned pageserver uses a new generation number. However, care must be taken by control plane to ensure that a "timeline creation successful" response from some pageserver is checked for the pageserver's generation for that timeline's tenant still being the latest. If it is not the latest, the response does not constitute a successful timeline creation. It is acceptable to discard such responses, the scrubber will clean up the S3 state. It is better to issue a timeline deletion request to the stale attachment. #### Timeline Deletion Tenant/timeline deletion operations are exempt from generation validation on deletes, and therefore don't have to go through the same deletion queue as GC/compaction layer deletions. 
This is because once a delete is issued by the control plane, it is a promise that the control plane will keep trying until the deletion is done, so even stale pageservers are permitted to go ahead and delete the objects. The implications of this for control plane are: - During timeline/tenant deletion, the control plane must wait for the deletion to be truly complete (status 404) and also handle the case where the pageserver becomes unavailable, either by waiting for a replacement with the same node_id, or by *re-attaching the tenant elsewhere. - The control plane must persist its intent to delete a timeline/tenant before issuing any RPCs, and then once it starts, it must keep retrying until the tenant/timeline is gone. This is already handled by using a persistent `Operation` record that is retried indefinitely. Timeline deletion may result in a special kind of object leak, where the latest generation attachment completes a deletion (including erasing all objects in the timeline path), but some slow/partitioned node is writing into the timeline path with a stale generation number. This would not be caught by any per-timeline scrubbing (see [scrubbing](#cleaning-up-orphan-objects-scrubbing)), since scrubbing happens on the attached pageserver, and once the timeline is deleted it isn't attached anywhere. This scenario should be pretty rare, and the control plane can make it even rarer by ensuring that if a tenant is in a multi-attached state (e.g. during migration), we wait for that to complete before processing the deletion. Beyond that, we may implement some other top-level scrub of timelines in an external tool, to identify any tenant/timeline paths that are not found in the control plane database. 
#### Examples - Deletion, node restarts partway through: - By the time we returned 202, we have written a remote delete marker - Any subsequent incarnation of the same node_id will see the remote delete marker and continue to process the deletion - If the original pageserver is lost permanently and no replacement with the same node_id is available, then the control plane must recover by re-attaching the tenant to a different node. - Creation, node becomes unresponsive partway through. - Control plane will see HTTP request timeout, keep re-issuing request to whoever is the latest attachment point for the tenant until it succeeds. - Stale nodes may be trying to execute timeline creation: they will write out index_part.json files with stale attachment generation: these will be eventually cleaned up by the same mechanism as other old indices. ### Unsafe case on badly behaved infrastructure This section is only relevant if running on a different environment than EC2 machines with ephemeral disks. If we ever run pageservers on infrastructure that might transparently restart a pageserver while leaving an old process running (e.g. a VM gets rescheduled without the old one being fenced), then there is a risk of corruption, when the control plane attaches the tenant, as follows: - If the control plane sends an `/attach` request to node A, then node A dies and is replaced, and the control plane retries the request without incrementing the attachment generation, then it could end up with two physical nodes both using the same generation number. - This is not an issue when using EC2 instances with ephemeral storage, as long as the control plane never re-uses a node ID, but it would need re-examining if running on different infrastructure. - To robustly protect against this class of issue, we would either: - add a "node generation" to distinguish between different processes holding the same node_id.
- or, dispense with static node_id entirely and issue an ephemeral ID to each pageserver process when it starts. ## Implementation Part 2: Optimizations ### Persistent deletion queue Between writing out a new index_part.json that doesn't reference an object, and executing the deletion, an object passes through a window where it is only referenced in memory, and could be leaked if the pageserver is stopped uncleanly. That introduces conflicting incentives: on the one hand, we would like to delay and batch deletions to 1. minimize the cost of the mandatory validation calls to control plane, and 2. minimize cost for DeleteObjects requests. On the other hand we would also like to minimize leakage by executing deletions promptly. To resolve this, we may make the deletion queue persistent and then execute deletions in the background at a later time. _Note: The deletion queue's reason for existence is optimization rather than correctness, so there is a lot of flexibility in exactly how it should work, as long as it obeys the rule to validate generations before executing deletions, so the following details are not essential to the overall RFC._ #### Scope The deletion queue will be global per pageserver, not per-tenant. There are several reasons for this choice: - Use the queue as a central point to coalesce validation requests to the control plane: this avoids individual `Timeline` objects ever touching the control plane API, and avoids them having to know the rules about validating deletions. This separation of concerns will avoid burdening the already many-LoC `Timeline` type with even more responsibility. - Decouple the deletion queue from Tenant attachment lifetime: we may "hibernate" an inactive tenant by tearing down its `Tenant`/`Timeline` objects in the pageserver, without having to wait for deletions to be done. - Amortize the cost of I/O for the persistent queue, instead of having many tiny queues.
- Coalesce deletions into a smaller number of larger DeleteObjects calls Because of the cost of doing I/O for persistence, and the desire to coalesce generation validation requests across tenants, and coalesce deletions into larger DeleteObjects requests, there will be one deletion queue per pageserver rather than one per tenant. This has the added benefit that when deactivating a tenant, we do not have to drain their deletion queue: deletions can proceed for a tenant whose main `Tenant` object has been torn down. #### Flow of deletion The flow of a deletion becomes: 1. Need for deletion of an object (=> layer file) is identified. 2. Unlink the object from all the places that reference it (=> `index_part.json`). 3. Enqueue the deletion to a persistent queue. Each entry is `tenant_id, attachment_generation, S3 key`. 4. Validate & execute in batches: 4.1 For a batch of entries, call into control plane. 4.2 For the subset of entries that passed validation, execute a `DeleteObjects` S3 DELETE request for their S3 keys. As outlined in the Part 1 on correctness, it is critical that deletions are only executed once the key is not referenced anywhere in S3. This property is obviously upheld by the scheme above. #### We Accept Object Leakage In Acceptable Circumstances If we crash in the flow above between (2) and (3), we lose track of the unreferenced object. Further, enqueuing a single entry to the persistent queue may not be durable immediately, in order to amortize the cost of flushing to disk. This is acceptable for now: it can be caught by [the scrubber](#cleaning-up-orphan-objects-scrubbing). There are various measures we can take to improve this in the future. 1. Cap amount of time until enqueued entry becomes durable (timeout for flush-to-disk) 2. Proactively flush: - On graceful shutdown, as we anticipate that some or all of our attachments may be re-assigned while we are offline. - On tenant detach. 3. For each entry, keep track of whether it has passed (2).
Only admit entries to (4) once they have passed (2). This requires re-writing / two queue entries (intent, commit) per deletion. The important take-away with any of the above is that it's not disastrous to leak objects in exceptional circumstances. #### Operations that may skip the queue Deletions of an entire timeline are [exempt](#Timeline-Deletion) from generation number validation. Once the control plane sends the deletion request, there is no requirement to retain the readability of any data within the timeline, and all objects within the timeline path may be deleted at any time from the control plane's deletion request onwards. Since deletions of smaller timelines won't have enough objects to compose a full sized DeleteObjects request, it is still useful to send these through the last part of the deletion pipeline to coalesce with other executing deletions: to enable this, the deletion queue should expose two input channels: one for deletions that must be processed in a generation-aware way, and a fast path for timeline deletions, where that fast path may skip validation and the persistent queue. ### Cleaning up orphan objects (scrubbing) An orphan object is any object which is no longer referenced by a running node or by metadata. Examples of how orphan objects arise: - A node PUTs a layer object, then crashes before it writes the index_part.json that references that layer. - A stale node carries on running for some time, and writes out an unbounded number of objects while it believes itself to be the rightful writer for a tenant. - A pageserver crashes between un-linking an object from the index, and persisting the object to its deletion queue. Orphan objects are functionally harmless, but have a small cost due to S3 capacity consumed. We may clean them up at some time in the future, by doing a ListObjectsv2 operation and cross referencing with the latest metadata to identify objects which are not referenced.
Scrubbing will be done only by an attached pageserver (not some third party process), and deletions requested during scrub will go through the same validation as all other deletions: the attachment generation must be fresh. This avoids the possibility of a stale pageserver incorrectly thinking that an object written by a newer generation is stale, and deleting it. It is not strictly necessary that scrubbing be done by an attached pageserver: it could also be done externally. However, an external scrubber would still require the same validation procedure that a pageserver's deletion queue performs, before actually erasing objects. ## Operational impact ### Availability Coordination of generation numbers via the control plane introduces a dependency for certain operations: 1. Starting new pageservers (or activating pageservers after a restart) 2. Executing enqueued deletions 3. Advertising updated `remote_consistent_lsn` to enable WAL trimming Item 1. would mean that some in-place restarts that previously would have resumed service even if the control plane were unavailable, will now not resume service to users until the control plane is available. We could avoid this by having a timeout on communication with the control plane, and after some timeout, resume service with the previous generation numbers (assuming this was persisted to disk). However, this is unlikely to be needed as the control plane is already an essential & highly available component. Also, having a node re-use an old generation number would complicate reasoning about the system, as it would break the invariant that a generation number uniquely identifies a tenant's attachment to a given pageserver _process_: it would merely identify the tenant's attachment to the pageserver _machine_ or its _on-disk-state_. Item 2. is a non-issue operationally: it's harmless to delay deletions, the only impact of objects pending deletion is the S3 capacity cost. Item 3.
could be an issue if safekeepers are low on disk space and the control plane is unavailable for a long time. If this became an issue, we could adjust the safekeeper to delete segments from local disk sooner, as soon as they're uploaded to S3, rather than waiting for remote_consistent_lsn to advance. For a managed service, the general approach should be to make sure we are monitoring & respond fast enough that control plane outages are bounded in time. There is also the fact that control plane runs in a single region. The latency for distant regions is not a big concern for us because all request types added by this RFC are either infrequent or not in the way of the data path. However, we lose region isolation for the operations listed above. The ongoing work to split console and control will give us per-region control plane, and all operations in this RFC can be handled by these per-region control planes. With that in mind, we accept the trade-offs outlined in this paragraph. We will also implement an "escape hatch" config for generation numbers, where in a major disaster outage, we may manually run pageservers with a hand-selected generation number, so that we can bring them online independently of a control plane. ### Rollout Although there is coupling between components, we may deploy most of the new data plane components independently of the control plane: initially they can just use a static generation number. #### Phase 1 The pageserver is deployed with some special config to: - Always act like everything is generation 1 and do not wait for a control plane issued generation on attach - Skip the places in deletion and remote_consistent_lsn updates where we would call into control plane #### Phase 2 The control plane changes are deployed: control plane will now track and increment generation numbers.
#### Phase 3 The pageserver is deployed with its control-plane-dependent changes enabled: it will now require the control plane to service re-attach requests on startup, and handle generation validation requests. ### On-disk backward compatibility Backward compatibility with existing data is straightforward: - When reading the index, we may assume that any layer whose metadata doesn't include generations will have a path without generation suffix. - When locating the index file on attachment, we may use the "fallback" listing path and if there is only an index without generation suffix, that is the one we load. It is not necessary to re-write existing layers: even new index files will be able to represent generation-less layers. ### On-disk forward compatibility We will do a two phase rollout, probably over multiple releases because we will naturally have some of the read-side code ready before the overall functionality is ready: 1. Deploy pageservers which understand the new index format and generation suffixes in keys, but do not write objects with generation numbers in the keys. 2. Deploy pageservers that write objects with generation numbers in the keys. Old pageservers will be oblivious to generation numbers. That means that they can't read objects with generation numbers in the name. This is why the first step must deploy the ability to read, before the second step starts writing them. # Frequently Asked Questions ## Why a generation _suffix_ rather than _prefix_? The choice is motivated by object listing, since one can list by prefix but not suffix. In [finding remote indices](#finding-the-remote-indices-for-timelines), we rely on being able to do a prefix listing for `<tenant>/<timeline>/index_part.json*`. That relies on the prefix listing.
The converse case of using a generation prefix and listing by generation is not needed: one could imagine listing by generation while scrubbing (so that a particular generation's layers could be scrubbed), but this is not part of normal operations, and the [scrubber](#cleaning-up-orphan-objects-scrubbing) probably won't work that way anyway. ## Wouldn't it be simpler to have a separate deletion queue per timeline? Functionally speaking, we could. That's how RemoteTimelineClient currently works, but this approach does not map well to a long-lived persistent queue with generation validation. Anything we do per-timeline generates tiny random I/O, on a pageserver with tens of thousands of timelines operating: to be ready for high scale, we should: - A) Amortize costs where we can (e.g. a shared deletion queue) - B) Expect to put tenants into a quiescent state while they're not busy: i.e. we shouldn't keep a tenant alive to service its deletion queue. This was discussed in the [scope](#scope) part of the deletion queue section. # Appendix A: Examples of use in high availability/failover The generation numbers proposed in this RFC are adaptable to a variety of different failover scenarios and models. The sections below sketch how they would work in practice. ### In-place restart of a pageserver "In-place" here means that the restart is done before any other element in the system has taken action in response to the node being down. - After restart, the node issues a re-attach request to the control plane, and receives new generation numbers for all its attached tenants. - Tenants may be activated with the generation number in the re-attach response. - If any of its attachments were in fact stale (i.e. had been reassigned to another node while this node was offline), then - the re-attach response will inform the pageserver of this by _not_ incrementing the generation for that attachment.
- This will implicitly block deletions in the tenant, but as an optimization the pageserver should also proactively stop doing S3 uploads when it notices this stale-generation state. - The control plane is expected to eventually detach this tenant from the pageserver. If the control plane does not include a tenant in the re-attach response, but there is still local state for the tenant in the filesystem, the pageserver deletes the local state in response and does not load/activate the tenant. See the [earlier section on pageserver startup](#pageserver-attachstartup-changes) for details. Control plane can use this mechanism to clean up a pageserver that has been down for so long that all its tenants were migrated away before it came back up again and asked for re-attach. ### Failure of a pageserver In this context, read "failure" as the most ambiguous possible case, where a pageserver is unavailable to clients and control plane, but may still be executing and talking to S3. #### Case A: re-attachment to other nodes 1. Let's say node 0 becomes unresponsive in a cluster of three nodes 0, 1, 2. 2. Some external mechanism notices that the node is unavailable and initiates movement of all tenants attached to that node to a different node according to some distribution rule. In this example, it would mean incrementing the generation of all tenants that were attached to node 0, as each tenant's assigned pageserver changes. 3. A tenant which is now attached to node 1 will _also_ still be attached to node 0, from the perspective of node 0. Node 0 will still be using its old generation, node 1 will be using a newer generation. 4. S3 writes will continue from nodes 0 and 1: there will be an index_part.json-00000001 _and_ an index_part.json-00000002. Objects written under the old suffix after the new attachment was created do not matter from the rest of the system's perspective: the endpoints are reading from the new attachment location.
Objects written by node 0 are just garbage that can be cleaned up at leisure. Node 0 will not do any deletions because it can't synchronize with control plane, or if it could, its deletion queue processing would get errors for the validation requests. #### Case B: direct node replacement with same node_id and drive This is the scenario we would experience if running pageservers in some dynamic VM/container environment that would auto-replace a given node_id when it became unresponsive, with the node's storage supplied by some network block device that is attached to the replacement VM/container. 1. Let's say node 0 fails, and there may be some other peers but they aren't relevant. 2. Some external mechanism notices that the node is unavailable, and creates a "new node 0" (Node 0b) which is a physically separate server. The original node 0 (Node 0a) may still be running, because we do not assume the environment fences nodes. 3. On startup, node 0b re-attaches and gets higher generation numbers for all tenants. 4. S3 writes continue from nodes 0a and 0b, but the writes do not collide due to different generation in the suffix, and the writes from node 0a are not visible to the rest of the system because endpoints are reading only from node 0b. # Appendix B: interoperability with other features ## Sharded Keyspace The design in this RFC maps neatly to a sharded keyspace design where subsets of the key space for a tenant are assigned to different pageservers: - the "unit of work" for attachments becomes something like a TenantShard rather than a Tenant - TenantShards get generation numbers just as Tenants do. - Write workload (ingest, compaction) for a tenant is spread out across pageservers via TenantShards, but each TenantShard still has exactly one valid writer at a time. 
## Read replicas _This section is about a passive reader of S3 pageserver state, not a postgres read replica_ For historical reads to LSNs below the remote persistent LSN, any node may act as a reader at any time: remote data is logically immutable data, and the use of deferred deletion in this RFC helps mitigate the fact that remote data is not _physically_ immutable (i.e. the actual data for a given page moves around as compaction happens). A read replica needs to be aware of generations in remote data in order to read the latest metadata (find the index_part.json with the latest suffix). It may either query this from the control plane, or find it with ListObjectsv2 request ## Seamless migration To make tenant migration totally seamless, we will probably want to intentionally double-attach a tenant briefly, serving reads from the old node while waiting for the new node to be ready. This RFC enables that double-attachment: two nodes may be attached at the same time, with the migration destination having a higher generation number. The old node will be able to ingest and serve reads, but not do any deletes. The new node's attachment must also avoid deleting layers that the old node may still use. A new piece of state will be needed for this in the control plane's definition of an attachment. ## Warm secondary locations To enable faster tenant movement after a pageserver is lost, we will probably want to spend some disk capacity on keeping standby locations populated with local disk data. There's no conflict between this RFC and that: implementing warm secondary locations on a per-tenant basis would be a separate change to the control plane to store standby location(s) for a tenant. Because the standbys do not write to S3, they do not need to be assigned generation numbers. When a tenant is re-attached to a standby location, that would increment the tenant attachment generation and this would work the same as any other attachment change, but with a warm cache. 
## Ephemeral node IDs This RFC intentionally avoids changing anything fundamental about how pageservers are identified and registered with the control plane, to avoid coupling the implementation of pageserver split brain protection with more fundamental changes in the management of the pageservers. Moving to ephemeral node IDs would provide an extra layer of resilience in the system, as it would prevent the control plane accidentally attaching to two physical nodes with the same generation, if somehow there were two physical nodes with the same node IDs (currently we rely on EC2 guarantees to eliminate this scenario). With ephemeral node IDs, there would be no possibility of that happening, no matter the behavior of underlying infrastructure. Nothing fundamental in the pageserver's handling of generations needs to change to handle ephemeral node IDs, since we hardly use the `node_id` anywhere. The `/re-attach` API would be extended to enable the pageserver to obtain its ephemeral ID, and provide some correlation identifier (e.g. EC instance ID), to help the control plane re-attach tenants to the same physical server that previously had them attached. ================================================ FILE: docs/rfcs/026-pageserver-s3-mvcc.md ================================================ This is a copy from the [original Notion page](https://www.notion.so/neondatabase/Proposal-Pageserver-MVCC-S3-Storage-8a424c0c7ec5459e89d3e3f00e87657c?pvs=4), taken on 2023-08-16. This is for archival mostly. The RFC that we're likely to go with is https://github.com/neondatabase/neon/pull/4919. --- # Proposal: Pageserver MVCC S3 Storage tl;dr: this proposal enables Control Plane to attach a tenant to a new pageserver without being 100% certain that it has been detached from the old pageserver. This enables us to automate failover if a pageserver dies (no human in the loop). 
# Problem Statement The current Neon architecture requires the Control Plane to guarantee that a tenant is only attached to one pageserver at a time. If a tenant is attached to multiple pageservers simultaneously, the pageservers will overwrite each other’s changes in S3 for that tenant, resulting in data loss for that tenant. The above imposes limitations on tenant relocation and future designs for high availability. For instance, Control Plane cannot relocate a tenant to another pageserver before it is 100% certain that the tenant is detached from the source pageserver. If the source pageserver is unresponsive, the tenant detach procedure cannot proceed, and Control Plane has no choice but to wait for either the source to become responsive again, or rely on a node failure detection mechanism to detect that the source pageserver is dead, and give permission to skip the detachment step. Either way, the tenant is unavailable for an extended period, and we have no means to improve it in the current architecture. Note that there is no 100% correct node failure detection mechanism, and even techniques to accelerate failure detection, such as **shoot-the-other-node-in-the-head**, have their limits. So, we currently rely on humans as node failure detectors: they get alerted via PagerDuty, assess the situation under high stress, and make the decision. If they make the wrong call, or the apparently dead pageserver somehow resurrects later, we’ll have data loss. Also, by relying on humans, we’re [incurring needless unscalable toil](https://sre.google/sre-book/eliminating-toil/): as Neon grows, pageserver failures will become more and more frequent because our fleet grows. Each instance will need quick response time to minimize downtime for the affected tenants, which implies higher toil, higher resulting attrition, and/or higher personnel cost.
Lastly, there are foreseeable needs by operation and product such as zero-downtime relocation and automatic failover/HA. For such features, the ability to have a tenant purposefully or accidentally attached to more than one pageserver will greatly reduce risk of data loss, and improve availability. # High-Level Idea The core idea is to evolve the per-Tenant S3 state to an MVCC-like scheme, allowing multiple pageservers to operate on the same tenant S3 state without interference. To make changes to S3, pageservers acquire long-running transactions from Control Plane. After opening a transaction, Pageservers make PUTs directly against S3, but the keys include the transaction ID, so overwrites never happen. Periodically, pageservers talk back to Control Plane to commit their transaction. This is where Control Plane enforces strict linearizability, favoring availability over work-conservation: commit is only granted if no transaction started after the one that’s requesting commit. Garbage collection is done through deadlists, and it’s simplified tremendously by above commit grant/reject policy. Minimal changes are required for safekeepers to allow WAL for a single timeline to be consumed by more than one pageserver without premature truncation. **Above scheme makes it safe to attach tenants without a 100% correct node failure detection mechanism. Further, it makes it safe to interleave tenant-attachment to pageservers, unlocking new capabilities for (internal) product features:** - **Fast, Zero-Toil Failover on Network Partitions or Instance Failure**: if a pageserver is not reachable (network partition, hardware failure, overload) we want to spread its attached tenants to new pageservers to restore availability, within the range of *seconds*. We cannot afford gracious timeouts to maximize the probability that the unreachable pageserver has ceased writing to S3.
This proposal enables us to attach the tenants to the replacement pageservers, and redirect their computes, without having to wait for confirmation that the unreachable pageserver has ceased writing to S3. - **Zero-Downtime Relocation:** we want to be able to relocate tenants to different pageservers with minimal availability or latency impact. This proposal enables us to attach the relocating Tenant to the destination Pageserver before detaching it from the source Pageserver. This can help minimize downtime because we can wait for the destination to catch up on WAL processing before redirecting Computes. # Design The core idea is to evolve the per-Tenant S3 state to a per-tenant MVCC-like scheme. To make S3 changes for a given tenant, Pageserver requests a transaction ID from control plane for that tenant. Without a transaction ID, Pageserver does not write to S3. Once Pageserver received a transaction ID it is allowed to produce new objects and overwrite objects created in this transaction. Pageserver is not allowed to delete any objects; instead, it marks the object as deleted by appending the key to the transaction’s deadlist for later deletion. Commits of transactions are serialized through Control Plane: when Pageserver wants to commit a transaction, it sends an RPC to Control Plane. Control Plane responds with a commit grant or commit reject message. Commit grant means that the transaction’s changes are now visible to subsequent transactions. Commit reject means that the transaction’s changes are not and never will be visible to another Pageserver instance, and the rejected Pageserver is to cease further activity on that tenant. ## **Commit grant/reject policy** For the purposes of Pageserver, we want **linearizability** of a tenant’s S3 state.
Since our transactions are scoped per tenant, it is sufficient for linearizability to grant commit if and only if no other transaction has been started since the commit-requesting transaction started. For example, consider the case of a single tenant, attached to Pageserver A. Pageserver A has an open transaction but becomes unresponsive. Control Plane decides to relocate the tenant to another Pageserver B. It need *not* wait for A to be 100%-certainly down before B can start uploading to S3 for that tenant. Instead, B can start a new transaction right away, make progress, and get commit grants; What about A? The transaction is RejectPending in Control Plane until A eventually becomes responsive again, tries to commit, gets a rejection, acknowledges it, and thus its transaction becomes RejectAcknowledged. If A is definitively dead, an operator can also force-transition from state RejectPending to RejectAcknowledged. But critically, Control Plane doesn’t have to wait for A’s transaction to become RejectAcknowledged before attaching the tenant to B. ```mermaid sequenceDiagram participant CP participant A participant S3 participant B CP -->> A: attach tenant activate A A -->> CP: start txn CP -->> A: txn=23, last_committed_txn=22 Note over CP,A: network partition CP --x A: heartbeat CP --x A: heartbeat Note over CP: relocate tenant to avoid downtime CP -->> B: attach tenant activate B B -->> CP: start txn Note over CP: mark A's txn 23 as RejectPending CP -->> B: txn=24, last-committed txn is 22 B -->> S3: PUT X.layer.24
PUT index_part.json.24 referencing X.layer.24 B -->> CP: request commit CP -->> B: granted B -->> CP: start txn CP -->> B: txn=25, last_committed_txn=22 A -->> S3: PUT Y.layer.23
PUT index_part.json.23 referencing Y.layer.23 A --x CP: request commit A --x CP: request commit Note over CP,A: partition is over A -->> CP: request commit Note over CP: most recently started txn is 25, not 23, reject CP -->> A: reject A -->> CP: acknowledge reject Note over CP: mark A's txn 23 as RejectAcknowledged deactivate A B -->> S3: PUT 000-FFF_X-Y.layer.**************25**************
... deactivate B ``` If a Pageserver gets a rejection to a commit request, it acknowledges rejection and ceases further S3 uploads for the tenant, until it receives a `/detach` request for the tenant (control plane has most likely attached the tenant to another pageserver in the meantime). In practice, Control Plane will probably extend the commit grant/reject schema above, taking into account the pageserver to which it last attached the tenant. In the above example, Control Plane could remember that the pageserver that is supposed to host the tenant is pageserver B, and reject start-txn and commit requests from pageserver A. It would also use such requests from A as a signal that A is reachable again, and retry the `/detach` . ## **Visibility** We mentioned earlier that once a transaction commits, its changes are visible to subsequent transactions. But how does a given transaction know where to look for the data? There is no longer a single `index_part.json` per timeline, or a single `timelines/:timeline_id` prefix to look for; they’re all multi-versioned, suffixed by the txn number. The solution is: at transaction start, Pageserver receives the last-committed transaction ID from Control Plane (`last_committed_txn` in the diagram). last_committed_txn is the upper bound for what is visible for the current transaction. Control Plane keeps track of each open transaction’s last_committed_txn for purposes of garbage collection (see later paragraph). Equipped with last_committed_txn, Pageserver then discovers - the current index part of a timeline at `tenants/:tenant_id/timelines/:timeline_id/index_part.json.$last_committed_txn`. The `index_part.json.$last_committed_txn` has the exact same contents as the current architecture’s index_part.json, i.e. full list of layers. - the list of existent timelines as part of the `attach` RPC from CP; There is no other S3 state per tenant, so, that’s all the visibility required.
An alternative to receiving the list of existent timelines from CP is to introduce a proper **SetOfTimelines** object in S3, and multi-version it just like above. For example, we could have a `tenants/:tenant_id/timelines.json.$txn` file that references `index_part.json.$last_committed_txn` . It can be added later if more separation between CP and PS is desired. So, the only MVCC’ed object types in this proposal are LayerFile and IndexPart (=individual timeline), but not the SetOfTimelines in a given tenant. Is this a problem? For example, the Pageserver’s garbage collection code needs to know the full set of timelines of a tenant. Otherwise it’ll make incorrect decisions. What if Pageserver A knows about timelines {R,S}, but another Pageserver B created an additional branch T, so, its set of timelines is {R,S,T}. Both pageservers will run GC code, and so, PS A may decide to delete a layer that’s still needed for branch T. Not a problem with this proposal, because the effect of GC (i.e., layer deletion) is properly MVCC’ed. ## Longevity Of Transactions & Availability Pageserver depends on Control Plane to start a new transaction. If ControlPlane is down, no new transactions can be started. Pageservers commit transactions based on a maximum amount of uncommitted changes that have accumulated in S3. A lower maximum increases dependence and load on ControlPlane which decreases availability. A higher maximum risks losing more work in the event of failover; the work will have to be re-done in a new transaction on the new node. Pageservers persist the open txn id in local storage, so that they can resume the transaction after restart, without dependence on Control Plane.
## **Operations** **PUTs:** - **layer files** - current architecture: layer files are supposed to be write-once, but actually, there are edge-cases where we PUT the same layer file name twice; namely if we PUT the file to S3 but crash before uploading the index part that references it; then detach + attach, and re-run compaction, which is non-deterministic. - this proposal: with transactions, we can now upload layers and index_part.json concurrently, just need to make sure layer file upload is done before we request txn commit. - **index part** upload: `index_part.json.$txn` may be created and subsequently overwritten multiple times in a transaction; it is an availability/work-loss trade-off how often to request a commit from CP. **DELETEs**: for deletion, we maintain a deadlist per transaction. It is located at `tenants/:tenant_id/deadlist/deadlist.json.$txn`. It is PUT once before the pageserver requests commit, and not changed after sending request to commit. An object created in the current txn need not (but can) be on the deadlist — it can be DELETEd immediately because it’s not visible to other transactions. An example use case would be an L0 layer that gets compacted within one transaction; or, if we ever start MVCC’ing the set of timelines of a tenant, a short-lived branch that is created & destroyed within one transaction. ### Rationale For Deadlist.json Given that this proposal only MVCC’s layers and indexparts, one may ask why the deadlist isn’t part of indexpart. The reason is to not lose generality: the deadlist is just a list of keys; it is not necessary to understand the data format of the versioned object to process the deadlist. This is important for garbage collection / vacuuming, which we’ll come to in the next section. ## Garbage Collection / Vacuuming After a transaction has reached reject-acknowledged state, Control Plane initiates a garbage collection procedure for the aborted transaction.
Control Plane is in the unique position of knowing all transaction states. Here is a sketch of the exact transaction states and what Control Plane keeps track of. ``` struct Tenant { ... txns: HashMap, // the most recently started txn's id; only most recently started can win next_winner_txn: Option, } struct Transaction { id: TxnId, // immutable last_committed_txn: TxnId, // immutable; the most recent txn in state `Committed` // when self was started pageserver_id: PageserverId, state: enum { Open, Committed, RejectPending, RejectAcknowledged, // invariant: we know all S3 activity has ceded GarbageCollected, } } ``` Object creations & deletions by a rejected transaction have never been visible to other transactions. That is true for both RejectPending and RejectAcknowledged states. The difference is that, in RejectPending, the pageserver may still be uploading to S3, whereas in RejectAcknowledged, Control Plane can be certain that all S3 activity in the name of that transaction has ceased. So, once a transaction reaches the RejectAcknowledged state, it is safe to DELETE all objects created by that transaction, and discard the transaction’s deadlists. A transaction T in state Committed has subsequent transactions that may or may not reference the objects it created. None of the subsequent transactions can reference the objects on T’s deadlist, though, as per the Deadlist Invariant (see previous section). So, for garbage collection, we need to assess transactions in state Committed and RejectAcknowledged: - Committed: delete objects on the deadlist. - We don’t need a LIST request here, the deadlist is sufficient. So, it’s really cheap. - This is **not true MVCC garbage collection**; by deleting the objects on Committed transaction T’s deadlist, we might delete data referenced by other transactions that were concurrent with T, i.e., they started while T was still open.
However, the fact that T is committed means that the other transactions are RejectPending or RejectAcknowledged, so, they don’t matter. Pageservers executing these doomed RejectPending transactions must handle 404 for GETs gracefully, e.g., by trying to commit txn so they observe the rejection they’re destined to get anyways. 404s for RejectAcknowledged are handled below. - RejectAcknowledged: delete all objects created in that txn, and discard deadlists. - 404s / object-already-deleted type messages must be expected because of Committed garbage collection (see above) - How to get this list of objects created in a txn? Open but solvable design question; Ideas: - **Brute force**: within tenant prefix, search for all keys ending in `.$txn` and delete them. - **WAL for PUTs**: before a txn PUTs an object, it logs to S3, or some other equivalently durable storage, that it’s going to do it. If we log to S3, this means we have to do an additional WAL PUT per “real” PUT. - **LIST with reorg’ed S3 layout (preferred one right now):** layout S3 key space such that `$txn` comes first, i.e., `tenants/:tenant_id/$txn/timelines/:timeline_id/*.json.$txn` . That way, when we need to GC a RejectAcknowledged txn, we just LIST the entire `tenants/:tenant_id/$txn` prefix and delete it. The cost of GC for RejectAcknowledged transactions is thus proportional to the number of objects created in that transaction. ## Branches This proposal only MVCC’s layer files and index_part.json, but leaves the tenant object not-MVCCed. We argued earlier that it’s fine to ignore this for now, because 1. Control Plane can act as source-of-truth for the set of timelines, and 2. The only operation that makes decisions based on “set of timelines” is GC, which in turn only does layer deletions, and layer deletions ***are*** properly MVCC’ed. Now that we’ve introduced garbage collection, let’s elaborate a little more on (2).
Recall our example from earlier: Pageserver A knows about timelines {R,S}, but another Pageserver B created an additional branch T, so, its set of timelines is {R,S,T}. Both pageservers will run GC code, and so, PS A may decide to delete a layer that’s still needed for branch T. How does the MVCC’ing of layer files protect us here? If A decides to delete that layer, it’s just on A’s transaction’s deadlist, but still present in S3 and usable by B. If A commits first, B won’t be able to commit and the layers in timeline T will be vacuumed. If B commits first, A’s deadlist is discarded and the layer continues to exist. ## Safekeeper Changes We need to teach the safekeepers that there can be multiple pageservers requesting WAL for the same timeline, in order to prevent premature WAL truncation. In the current architecture, the Safekeeper service currently assumes only one Pageserver and is allowed to prune WAL older than that Pageserver’s `remote_consistent_lsn`. Safekeeper currently learns the `remote_consistent_lsn` through the walreceiver protocol. So, if we have a tenant attached to two pageservers at the same time, they will both try to stream WAL and the Safekeeper will get confused about which connection’s `remote_consistent_lsn` to use as a basis for WAL pruning. What do we need to change to make it work? We need to make sure that the Safekeepers only prune WAL up to the `remote_consistent_lsn` of the last-committed transaction. The straight-forward way to get it is to re-design WAL pruning as follows: 1. Pageserver reports remote_consistent_lsn as part of transaction commit to Control Plane. 2. Control Plane makes sure transaction state update is persisted. 3. Control Plane (asynchronous to transaction commit) reconciles with Safekeepers to ensure WAL pruning happens. 
The above requires non-trivial changes, but, in the light of other planned projects such as restore-tenant-from-safekeeper-wal-backups, I think Control Plane will need to get involved in WAL pruning anyways. # How This Proposal Unlocks Future Features Let us revisit the example from the introduction where we were thinking about handling network partitions. Network partitions need to be solved first, because they’re unavoidable in distributed systems. We did that. Now let’s see how we can solve actual product problems: ## **Fast, Zero-Toil Failover on Network Partitions or Instance Failure** The “Problem Statement” section outlined the current architecture’s problems with regards to network partitions or instance failure: it requires a 100% correct node-dead detector to make decisions, which doesn’t exist in reality. We rely instead on human toil: an oncall engineer has to inspect the situation and make a decision, which may be incorrect and in any case take time in the order of minutes, which means equivalent downtime for users. With this proposal, automatic failover for pageservers is trivial: If a pageserver is unresponsive from Control Plane’s / Compute’s perspective, Control Plane does the following: - attach all tenants of the unresponsive pageserver to new pageservers - switch over these tenants’ computes immediately; At this point, availability is restored and user pain relieved. What’s left is to somehow close the doomed transaction of the unresponsive pageserver, so that it becomes RejectAcknowledged, and GC can make progress. Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure: 1. Ensure the unresponsive pageserver is taken out of rotation for new attachments. That probably should happen as part of the routine above. 2. Make a human operator investigate decide what to do (next morning, NO ONCALL ALERT): 1. 
Inspect the instance, investigate logs, understand root cause. 2. Try to re-establish connectivity between pageserver and Control Plane so that pageserver can retry commits, get rejected, ack rejection ⇒ enable GC. 3. Use below procedure to decommission pageserver. ### Decommissioning A Pageserver (Dead or Alive-but-Unresponsive) The solution, enabled by this proposal: 1. Ensure that pageserver’s S3 credentials are revoked so that it cannot make new uploads, which wouldn’t be tracked anywhere. 2. Let enough time pass for the S3 credential revocation to propagate. Amazon doesn’t give a guarantee here. As stated earlier, we can easily afford to wait here. 3. Mark all Open and RejectPending transactions of that pageserver as RejectAcknowledged. Revocation of the S3 credentials is required so that, once we transition all the transactions of that pageserver to RejectAcknowledged, one garbage-collection pass is guaranteed to delete all objects that will ever exist for that pageserver. That way, we need not check *GarbageCollected* transactions ever again. ## Workflow: Zero-Downtime Relocation With zero-downtime relocation, the goal is to have the target pageserver warmed up, i.e., at the same `last_record_lsn` as the source pageserver, before switching over Computes from source to target pageserver. With this proposal, it works like so: 1. Grant source pageserver its last open transaction. This one is doomed to be rejected later, unless the relocation fails. 2. Grant target pageserver its first open transaction. 3. Have target pageserver catch up on WAL, streaming from last-committed-txn’s remote_consistent_lsn onwards. 4. Once target pageserver reports `last_record_lsn` close enough to source pageserver, target pageserver requests commit. 5. Drain compute traffic from source to target pageserver. (Source can still answer requests until it tries to commit and gets rejected, so, this will be quite smooth).
Note that as soon as we complete step (4), the source pageserver’s transaction is doomed to be rejected later. Conversely, if the target can’t catch up fast enough, the source will make a transaction commit earlier. This will generally happen if there is a lot of write traffic coming in. The design space to make things smooth here is large, but well explored in other areas of computing, e.g., VM live migration. We have all the important policy levers at hand, e.g., - delaying source commits if we see target making progress - slowing down source consumption (need some signalling mechanism for it) - slowing down compute wal generation - … It doesn’t really matter, what’s important is that two pageservers can overlap. # Additional Trade-Offs / Remarks Brought Up During Peer Review This proposal was read by and discussed with @Stas and @Dmitry Rodionov prior to publishing it with the broader team. (This does not mean they endorse this proposal!). Issues that we discussed: 1. **Frequency of transactions:** If even idle tenants commit every 10min or so, that’s quite a lot of load on Control Plane. Can we minimize it by Equating Transaction Commit Period to Attachment Period? I.e. start txn on attach, commit on detach? 1. Would be nice, but, if a tenant is attached for 1 month, then PS dies, we lose 1 month of work. 2. ⇒ my solution to this problem: Adjusted this proposal to make transaction commit frequency proportional to amount of uncommitted data. 1. It’s ok to spend resources on active users, they pay us money to do it! 2. The amount of work per transaction is minimal. 1. In current Control Plane, it’s a small database transaction that is super unlikely to conflict with other transactions. 2. I have very little concerns about scalability of the commit workload on CP side because it's trivially horizontally scalable by sharding by tenant. 3.
There's no super stringent availability requirement on control plane; if a txn can't commit because it can't reach the CP, PS can continue & retry in the background, speculating that it's CP downtime and not PS-partitioned-off scenario. 4. Without stringent availability requirement, there's flexibility for future changes to CP-side-implementation. 2. **Does this proposal address mirroring / no-performance-degradation failover ?** 1. No it doesn’t. It only provides the building block for attaching a tenant to a new pageserver without having to worry that the tenant is detached on the old pageserver. 2. A simple scheme to build no-performance-degradation failover on top of this proposal is to have an asynchronous read-only replica of a tenant on another pageserver in the same region. 3. Another more ambitious scheme to get no-performance-degradation would be [One-Pager: Layer File Spreading (Christian)](https://www.notion.so/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843?pvs=21); this proposal would be used in layer file spreading for risk-free automation of TenantLeader failover, which hasn’t been addressed there. 4. In any way, failover would restart from an older S3 state, and need to re-ingest WAL before being able to serve recently written pages. 1. Is that a show-stopper? I think not. 2. Is it suboptimal? Absolutely: if a pageserver instance fails, all its tenants will be distributed among the remaining pageservers (OK), and all these tenants will ask the safekeepers for WAL at the same time (BAD). So, pageserver instance failure will cause a load spike in safekeepers. 1. Personally I think that’s an OK trade-off to make. 2. There are countless options to avoid / mitigate the load spike. E.g., pro-actively streaming WAL to the standby read-only replica. 3.
********************************************Does this proposal allow multiple writers for a tenant?******************************************** 1. In abstract terms, this proposal provides a linearized history for a given S3 prefix. 2. In concrete terms, this proposal provides a linearized history per tenant. 3. There can be multiple writers at a given time, but only one of them will win to become part of the linearized history. 4. ************************************************************************************Alternative ideas mentioned during meetings that should be turned into a written proposal like this one:************************************************************************************ 1. @Dmitry Rodionov : having linearized storage of index_part.json in some database that allows serializable transactions / atomic compare-and-swap PUT 2. @Dmitry Rodionov : 3. @Stas : something like this scheme, but somehow find a way to equate attachment duration with transaction duration, without losing work if pageserver dies months after attachment. ================================================ FILE: docs/rfcs/027-crash-consistent-layer-map-through-index-part.md ================================================ # Crash-Consistent Layer Map Updates By Leveraging `index_part.json` * Created on: Aug 23, 2023 * Author: Christian Schwarz ## Summary This RFC describes a simple scheme to make layer map updates crash consistent by leveraging the `index_part.json` in remote storage. Without such a mechanism, crashes can induce certain edge cases in which broadly held assumptions about system invariants don't hold. ## Motivation ### Background We can currently easily make complex, atomic updates to the layer map by means of an RwLock. If we crash or restart pageserver, we reconstruct the layer map from: 1. local timeline directory contents 2. remote `index_part.json` contents. The function that is responsible for this is called `Timeline::load_layer_map()`. 
The reconciliation process's behavior is the following: * local-only files will become part of the layer map as local-only layers and rescheduled for upload * For a file name that, by its name, is present locally and in the remote `index_part.json`, but where the local file has a different size (future: checksum) than the remote file, we will delete the local file and leave the remote file as a `RemoteLayer` in the layer map. ### The Problem There are cases where we need to make an atomic update to the layer map that involves **more than one layer**. The best example is compaction, where we need to insert the L1 layers generated from the L0 layers, and remove the L0 layers. As stated above, making the update to the layer map in an atomic way is trivial. But, there is no system call API to make an atomic update to a directory that involves more than one file rename and deletion. Currently, we issue the system calls one by one and hope we don't crash. What happens if we crash and restart in the middle of that system call sequence? We will reconstruct the layer map according to the reconciliation process, taking as input whatever transitory state the timeline directory ended up in. We cannot roll back or complete the timeline directory update during which we crashed, because we keep no record of the changes we plan to make. ### Problem's Implications For Compaction The implications of the above are primarily problematic for compaction. Specifically, the part of it that compacts L0 layers into L1 layers. Remember that compaction takes a set of L0 layers and reshuffles the delta records in them into L1 layer files. Once the L1 layer files are written to disk, it atomically removes the L0 layers from the layer map and adds the L1 layers to the layer map. It then deletes the L0 layers locally, and schedules an upload of the L1 layers and an updated index part.
If we crash before deleting L0s, but after writing out L1s, the next compaction after restart will re-digest the L0s and produce new L1s. This means the compaction after restart will **overwrite** the previously written L1s. Currently we also schedule an S3 upload of the overwritten L1. If the compaction algorithm doesn't change between the two compaction runs, is deterministic, and uses the same set of L0s as input, then the second run will produce identical L1s and the overwrites will go unnoticed. *However*: 1. the file size of the overwritten L1s may not be identical, and 2. the bit pattern of the overwritten L1s may not be identical, and, 3. in the future, we may want to make the compaction code non-deterministic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite The items above are a problem for the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919) because it assumes that layer files in S3 are only ever deleted, but never replaced (overPUTted). For example, if an unresponsive node A becomes active again after control plane has relocated the tenant to a new node B, the node A may overwrite some L1s. But node B based its world view on the version of node A's `index_part.json` from _before_ the overwrite. That earlier `index_part.json` contained the file size of the pre-overwrite L1. If the overwritten L1 has a different file size, node B will refuse to read data from the overwritten L1. Effectively, the data in the L1 has become inaccessible to node B. If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same problem. If we ever introduce checksums instead of checking just the file size, then a mismatching bit pattern (2) will cause similar problems.
In case of (1) and (2), where we know that the logical content of the layers is still the same, we can recover by manually patching the `index_part.json` of the new node to the overwritten L1's file size / checksum. But if (3) ever happens, the logical content may be different, and, we could have truly lost data. Given the above considerations, we should avoid making correctness of split-brain protection dependent on overwrites preserving _logical_ layer file contents. **It is a much cleaner separation of concerns to require that layer files are truly immutable in S3, i.e., PUT once and then only DELETEd, never overwritten (overPUTted).** ## Design Instead of reconciling a layer map from local timeline directory contents and remote index part, this RFC proposes to view the remote index part as authoritative during timeline load. Local layer files will be recognized if they match what's listed in remote index part, and removed otherwise. During **timeline load**, the only thing that matters is the remote index part content. Essentially, timeline load becomes much like attach, except we don't need to prefix-list the remote timelines. The local timeline dir's `metadata` file does not matter. The layer files in the local timeline dir are seen as a nice-to-have cache of layer files that are in the remote index part. Any layer files in the local timeline dir that aren't in the remote index part are removed during startup. The `Timeline::load_layer_map()` no longer "merges" local timeline dir contents with the remote index part. Instead, it treats the remote index part as the authoritative layer map. If the local timeline dir contains a layer that is in the remote index part, that's nice, and we'll re-use it if file size (and in the future, check sum) match what's stated in the index part. If it doesn't match, we remove the file from the local timeline dir. After load, **at runtime**, nothing changes compared to what we did before this RFC. 
The procedure for single- and multi-object changes is reproduced here for reference: * For any new layers that the change adds: * Write them to a temporary location. * While holding layer map lock: * Move them to the final location. * Insert into layer map. * Make the S3 changes. We won't reproduce the remote timeline client method calls here because these are subject to change. Instead we reproduce the sequence of s3 changes that must result for a given single-/multi-object change: * PUT layer files inserted by the change. * PUT an index part that has insertions and deletions of the change. * DELETE the layer files that are deleted by the change. Note that it is safe for the DELETE to be deferred arbitrarily. * If it never happens, we leak the object, but, that's not a correctness concern. * As of #4938, we don't schedule the remote timeline client operation for deletion immediately, but, only when we drop the `LayerInner`. * With the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919), the deletions will be written to deletion queue for processing when it's safe to do so (see the RFC for details). ## How This Solves The Problem If we crash before we've finished the S3 changes, then timeline load will reset layer map to the state that's in the S3 index part. The S3 change sequence above is obviously crash-consistent. If we crash before the index part PUT, then we leak the inserted layer files to S3. If we crash after the index part PUT, we leak the to-be-DELETEd layer files to S3. Leaking is fine, it's a pre-existing condition and not addressed in this RFC. 
Multi-object changes that previously created and removed files in timeline dir are now atomic because the layer map updates are atomic and crash consistent: * atomic layer map update at runtime, currently by using an RwLock in write mode * atomic `index_part.json` update in S3, as per guarantee that S3 PUT is atomic * local timeline dir state: * irrelevant for layer map content => irrelevant for atomic updates / crash consistency * if we crash after index part PUT, local layer files will be used, so, no on-demand downloads needed for them * if we crash before index part PUT, local layer files will be deleted ## Trade-Offs ### Fundamental If we crash before finishing the index part PUT, we lose all the work that hasn't reached the S3 `index_part.json`: * wal ingest: we lose not-yet-uploaded L0s; load on the **safekeepers** + work for pageserver * compaction: we lose the entire compaction iteration work; need to re-do it * gc: no change to what we have today If the work is still deemed necessary after restart, the restarted pageserver will re-do this work. The amount of work to be re-done is capped to the lag of S3 changes to the local changes. Assuming upload queue allows for unlimited queue depth (that's what it does today), this means: * on-demand downloads that were needed to do the work: are likely still present, not lost * wal ingest: currently unbounded * L0 => L1 compaction: CPU time proportional to `O(sum(L0 size))` and upload work proportional to `O()` * Compaction threshold is 10 L0s and each L0 can be up to 256M in size. Target size for L1 is 128M. * In practice, most L0s are tiny due to 10minute `DEFAULT_CHECKPOINT_TIMEOUT`. * image layer generation: CPU time `O(sum(input data))` + upload work `O(sum(new image layer size))` * I have no intuition how expensive / long-running it is in reality.
* gc: `update_gc_info` work (not substantial, AFAIK) To limit the amount of lost upload work, and ingest work, we can limit the upload queue depth (see suggestions in the next sub-section). However, to limit the amount of lost CPU work, we would need a way to make the compaction/image-layer-generation algorithms interruptible & resumable. We aren't there yet, the need for it is tracked by ([#4580](https://github.com/neondatabase/neon/issues/4580)). However, this RFC is not constraining the design space either. ### Practical #### Pageserver Restarts Pageserver crashes are very rare; it would likely be acceptable to re-do the lost work in that case. However, regular pageserver restarts happen frequently, e.g., during weekly deploys. In general, pageserver restart faces the problem of tenants that "take too long" to shut down. They are a problem because other tenants that shut down quickly are unavailable while we wait for the slow tenants to shut down. We currently allot 10 seconds for graceful shutdown until we SIGKILL the pageserver process (as per `pageserver.service` unit file). A longer budget would expose tenants that are done early to a longer downtime. A short budget would risk throwing away more work that'd have to be re-done after restart. In the context of this RFC, killing the process would mean losing the work that hasn't made it to S3. We can mitigate this problem as follows: 0. initially, by accepting that we need to do the work again 1. short-term, introducing measures to cap the amount of in-flight work: - cap upload queue length, use backpressure to slow down compaction - disabling compaction/image-layer-generation X minutes before `systemctl restart pageserver` - introducing a read-only shutdown state for tenants that are fast to shut down; that state would be equivalent to the state of a tenant in hot standby / readonly mode. 2.
mid term, by not restarting pageserver in place, but using [*seamless tenant migration*](https://github.com/neondatabase/neon/pull/5029) to drain a pageserver's tenants before we restart it. #### `disk_consistent_lsn` can go backwards `disk_consistent_lsn` can go backwards across restarts if we crash before we've finished the index part PUT. Nobody should care about it, because the only thing that matters is `remote_consistent_lsn`. Compute certainly doesn't care about `disk_consistent_lsn`. ## Side-Effects Of This Design * local `metadata` is basically reduced to a cache of which timelines exist for this tenant; i.e., we can avoid a `ListObjects` requests for a tenant's timelines during tenant load. ## Limitations Multi-object changes that span multiple timelines aren't covered by this RFC. That's fine because we currently don't need them, as evidenced by the absence of a Pageserver operation that holds multiple timelines' layer map lock at a time. ## Impacted components Primarily pageservers. Safekeepers will experience more load when we need to re-ingest WAL because we've thrown away work. No changes to safekeepers are needed. ## Alternatives considered ### Alternative 1: WAL We could have a local WAL for timeline dir changes, as proposed here https://github.com/neondatabase/neon/issues/4418 and partially implemented here https://github.com/neondatabase/neon/pull/4422 . The WAL would be used to 1. make multi-object changes atomic 2. replace `reconcile_with_remote()` reconciliation: scheduling of layer upload would be part of WAL replay. The WAL is appealing in a local-first world, but, it's much more complex than the design described above: * New on-disk state to get right. * Forward- and backward-compatibility development costs in the future. ### Alternative 2: Flow Everything Through `index_part.json` We could have gone to the other extreme and **only** update the layer map whenever we've PUT `index_part.json`. 
I.e., layer map would always be the last-persisted S3 state. That's axiomatically beautiful, not least because it fully separates the layer file production and consumption path (=> [layer file spreading proposal](https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843?pvs=4)). And it might make hot standbys / read-only pageservers less of a special case in the future. But, I have some uncertainties with regard to WAL ingestion, because it needs to be able to do some reads for the logical size feedback to safekeepers. And it's silly that we wouldn't be able to use the results of compaction or image layer generation before we're done with the upload. Lastly, a temporarily clogged-up upload queue (e.g. S3 is down) shouldn't immediately render ingestion unavailable. ### Alternative 3: Sequence Numbers For Layers Instead of what's proposed in this RFC, we could use unique numbers to identify layer files: ``` # before tenants/$tenant/timelines/$timeline/$key_and_lsn_range # after tenants/$tenant/timelines/$timeline/$layer_file_id-$key_and_lsn_range ``` To guarantee uniqueness, the unique number is a sequence number, stored in `index_part.json`. This alternative does not solve atomic layer map updates. In our crash-during-compaction scenario above, the compaction run after the crash will not overwrite the L1s, but write/PUT new files with new sequence numbers. In fact, this alternative makes it worse because the data is now duplicated in the not-overwritten and overwritten L1 layer files. We'd need to write a deduplication pass that checks if perfectly overlapping layers have identical contents. However, this alternative is appealing because it systematically prevents overwrites at a lower level than this RFC. So, this alternative is sufficient for the needs of the split-brain safety RFC (immutable layer files locally and in S3). 
But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more acute. The proposed design in this RFC addresses both. So, if this alternative sounds appealing, we should implement the proposal in this RFC first, then implement this alternative on top. That way, we avoid a phase where the crash-during-compaction problem is acute. ## Related issues - https://github.com/neondatabase/neon/issues/4749 - https://github.com/neondatabase/neon/issues/4418 - https://github.com/neondatabase/neon/pull/4422 - https://github.com/neondatabase/neon/issues/5077 - https://github.com/neondatabase/neon/issues/4088 - (re)resolutions: - https://github.com/neondatabase/neon/pull/4696 - https://github.com/neondatabase/neon/pull/4094 - https://neondb.slack.com/archives/C033QLM5P7D/p1682519017949719 Note that the test case introduced in https://github.com/neondatabase/neon/pull/4696/files#diff-13114949d1deb49ae394405d4c49558adad91150ba8a34004133653a8a5aeb76 will produce L1s with the same logical content, but, as outlined in the last paragraph of the _Problem Statement_ section above, we don't want to make that assumption in order to fix the problem. ## Implementation Plan 1. Remove support for `remote_storage=None`, because we now rely on the existence of an index part. - The nasty part here is to fix all the tests that fiddle with the local timeline directory. Possibly they are just irrelevant with this change, but, each case will require inspection. 2. Implement the design above. - Initially, ship without the mitigations for restart and accept we will do some work twice. - Measure the impact and implement one of the mitigations. ================================================ FILE: docs/rfcs/028-pageserver-migration.md ================================================ # Seamless tenant migration - Author: john@neon.tech - Created on 2023-08-11 - Implemented on .. 
## Summary The preceding [generation numbers RFC](025-generation-numbers.md) may be thought of as "making tenant migration safe". Following that, this RFC is about how those migrations are to be done: 1. Seamlessly (without interruption to client availability) 2. Quickly (enabling faster operations) 3. Efficiently (minimizing I/O and $ cost) These points are in priority order: if we have to sacrifice efficiency to make a migration seamless for clients, we will do so, etc. This is accomplished by introducing two high level changes: - A dual-attached state for tenants, used in a control-plane-orchestrated migration procedure that preserves availability during a migration. - Warm secondary locations for tenants, where on-disk content is primed for a fast migration of the tenant from its current attachment to this secondary location. ## Motivation Migrating tenants between pageservers is essential to operating a service at scale, in several contexts: 1. Responding to a pageserver node failure by migrating tenants to other pageservers 2. Balancing load and capacity across pageservers, for example when a user expands their database and they need to migrate to a pageserver with more capacity. 3. Restarting pageservers for upgrades and maintenance The current steps for migration are: - detach from old node; skip if old node is dead; (the [skip part is still WIP](https://github.com/neondatabase/cloud/issues/5426)). - attach to new node - re-configure endpoints to use the new node Once [generation numbers](025-generation-numbers.md) are implemented, the detach step is no longer critical for correctness. So, we can - attach to a new node, - re-configure endpoints to use the new node, and then - detach from the old node. However, this still does not meet our seamless/fast/efficient goals: - Not fast: The new node will have to download potentially large amounts of data from S3, which may take many minutes. 
- Not seamless: If we attach to a new pageserver before detaching an old one, the new one might delete some objects that interrupt availability of reads on the old one. - Not efficient: the old pageserver will continue uploading S3 content during the migration that will never be read. The user expectations for availability are: - For planned maintenance, there should be zero availability gap. This expectation is fulfilled by this RFC. - For unplanned changes (e.g. node failures), there should be minimal availability gap. This RFC provides the _mechanism_ to fail over quickly, but does not provide the failure _detection_ nor failover _policy_. ## Non Goals - Defining service tiers with different storage strategies: the same level of HA & overhead will apply to all tenants. This doesn't rule out adding such tiers in future. - Enabling pageserver failover in the absence of a control plane: the control plane will remain the source of truth for what should be attached where. - Totally avoiding availability gaps on unplanned migrations during a failure (we expect a small, bounded window of read unavailability of very recent LSNs) - Workload balancing: this RFC defines the mechanism for moving tenants around, not the higher level logic for deciding who goes where. - Defining all possible configuration flows for tenants: the migration process defined in this RFC demonstrates the sufficiency of the pageserver API, but is not the only kind of configuration change the control plane will ever do. The APIs defined here should let the control plane move tenants around in whatever way is needed while preserving data safety and read availability. ## Impacted components Pageserver, control plane ## Terminology - **Attachment**: a tenant is _attached_ to a pageserver if it has been issued a generation number, and is running an instance of the `Tenant` type, ingesting the WAL, and available to serve page reads. - **Location**: locations are a superset of attachments. 
A location is a combination of a tenant and a pageserver. We may _attach_ at a _location_. - **Secondary location**: a location which is not currently attached. - **Warm secondary location**: a location which is not currently attached, but is endeavoring to maintain a warm local cache of layers. We avoid calling this a _warm standby_ to avoid confusion with similar postgres features. ## Implementation (high level) ### Warm secondary locations To enable faster migrations, we will identify at least one _secondary location_ for each tenant. This secondary location will keep a warm cache of layers for the tenant, so that if it is later attached, it can catch up with the latest LSN quickly: rather than downloading everything, it only has to replay the recent part of the WAL to advance from the remote_consistent_lsn to the most recent LSN in the WAL. The control plane is responsible for selecting secondary locations, and calling into pageservers to configure tenants into a secondary mode at this new location, as well as attaching the tenant in its existing primary location. The attached pageserver for a tenant will publish a [layer heatmap](#layer-heatmap) to advise secondaries of which layers should be downloaded. ### Location modes Currently, we consider a tenant to be in one of two states on a pageserver: - Attached: active `Tenant` object, and layers on local disk - Detached: no layers on local disk, no runtime state. We will extend this with finer-grained modes, whose purpose will become clear in later sections: - **AttachedSingle**: equivalent to the existing attached state. - **AttachedMulti**: like AttachedSingle, holds an up to date generation, but does not do deletions. - **AttachedStale**: like AttachedSingle, but holds a stale generation and does not do any remote storage operations. - **Secondary**: keep local state on disk, periodically update from S3. - **Detached**: equivalent to existing detached state. 
To control these finer grained states, a new pageserver API endpoint will be added. ### Cutover procedure Define old location and new location as "Node A" and "Node B". Consider the case where both nodes are available, and Node B was previously configured as a secondary location for the tenant we are migrating. The cutover procedure is orchestrated by the control plane, calling into the pageservers' APIs: 1. Call to Node A requesting it to flush to S3 and enter AttachedStale state 2. Increment generation, and call to Node B requesting it to enter AttachedMulti state with the new generation. 3. Call to Node B, requesting it to download the latest hot layers from remote storage, according to the latest heatmap flushed by Node A. 4. Wait for Node B's WAL ingestion to catch up with node A's 5. Update endpoints to use node B instead of node A 6. Call to node B requesting it to enter state AttachedSingle. 7. Call to node A requesting it to enter state Secondary The following table summarizes how the state of the system advances: | Step | Node A | Node B | Node used by endpoints | | :-----------: | :------------: | :------------: | :--------------------: | | 1 (_initial_) | AttachedSingle | Secondary | A | | 2 | AttachedStale | AttachedMulti | A | | 3 | AttachedStale | AttachedMulti | A | | 4 | AttachedStale | AttachedMulti | A | | 5 (_cutover_) | AttachedStale | AttachedMulti | B | | 6 | AttachedStale | AttachedSingle | B | | 7 (_final_) | Secondary | AttachedSingle | B | The procedure described for a clean handover from a live node to a secondary is also used for failure cases and for migrations to a location that is not configured as a secondary, by simply skipping irrelevant steps, as described in the following sections. #### Migration from an unresponsive node If node A is unavailable, then all calls into node A are skipped and we don't wait for B to catch up before updating the endpoints to use B. 
#### Migration to a location that is not a secondary If node B is initially in Detached state, the procedure is identical. Since Node B is coming from a Detached state rather than Secondary, the download of layers and catch up with WAL will take much longer. We might do this if: - Attached and secondary locations are both critically low on disk, and we need to migrate to a third node with more resources available. - We are migrating a tenant which does not use secondary locations to save on cost. #### Permanent migration away from a node In the final step of the migration, we generally request the original node to enter a Secondary state. This is typical if we are doing a planned migration during maintenance, or to balance CPU/network load away from a node. One might also want to permanently migrate away: this can be done by simply removing the secondary location after the migration is complete, or as an optimization by substituting the Detached state for the Secondary state in the final step. #### Cutover diagram ```mermaid sequenceDiagram participant CP as Control plane participant A as Node A participant B as Node B participant E as Endpoint CP->>A: PUT Flush & go to AttachedStale note right of A: A continues to ingest WAL CP->>B: PUT AttachedMulti CP->>B: PUT Download layers from latest heatmap note right of B: B downloads from S3 loop Poll until download complete CP->>B: GET download status end activate B note right of B: B ingests WAL loop Poll until catch up CP->>B: GET visible WAL CP->>A: GET visible WAL end deactivate B CP->>E: Configure to use Node B E->>B: Connect for reads CP->>B: PUT AttachedSingle CP->>A: PUT Secondary ``` #### Cutover from an unavailable pageserver This case is far simpler: we may skip straight to our intended end state. 
```mermaid sequenceDiagram participant A as Node A participant CP as Control plane participant B as Node B participant E as Endpoint note right of A: Node A offline activate A CP->>B: PUT AttachedSingle CP->>E: Configure to use Node B E->>B: Connect for reads deactivate A ``` ## Implementation (detail) ### Purpose of AttachedMulti, AttachedStale #### AttachedMulti Ordinarily, an attached pageserver whose generation is the latest may delete layers at will (e.g. during compaction). If a previous generation pageserver is also still attached, and in use by endpoints, then this layer deletion could lead to a loss of availability for the endpoint when reading from the previous generation pageserver. The _AttachedMulti_ state simply disables deletions. These will be enqueued in `RemoteTimelineClient` until the control plane transitions the node into AttachedSingle, which unblocks deletions. Other remote storage operations such as uploads are not blocked. AttachedMulti is not required for data safety, only to preserve availability on pageservers running with stale generations. A node enters AttachedMulti only when explicitly asked to by the control plane. It should only remain in this state for the duration of a migration. If a control plane bug leaves the node in AttachedMulti for a long time, then we must avoid unbounded memory use from enqueued deletions. This may be accomplished simply, by dropping enqueued deletions when some modest threshold of delayed deletions (e.g. 10k layers per tenant) is reached. As with all deletions, it is safe to skip them, and the leaked objects will be eventually cleaned up by scrub or by timeline deletion. During AttachedMulti, the Tenant is free to drop layers from local disk in response to disk pressure: only the deletion of remote layers is blocked. #### AttachedStale Currently, a pageserver with a stale generation number will continue to upload layers, but be prevented from completing deletions. 
This is safe, but inefficient: layers uploaded by this stale generation will not be read back by future generations of pageservers. The _AttachedStale_ state disables S3 uploads. The stale pageserver will continue to ingest the WAL and write layers to local disk, but not to do any uploads to S3. A node may enter AttachedStale in two ways: - Explicitly, when control plane calls into the node at the start of a migration. - Implicitly, when the node tries to validate some deletions and discovers that its generation is stale. The AttachedStale state also disables sending consumption metrics from that location: it is interpreted as an indication that some other pageserver is already attached or is about to be attached, and that new pageserver will be responsible for sending consumption metrics. #### Disk Pressure & AttachedStale Over long periods of time, a tenant location in AttachedStale will accumulate data on local disk, as it cannot evict any layers written since it entered the AttachedStale state. We rely on the control plane to revert the location to Secondary or Detached at the end of a migration. This scenario is particularly noteworthy when evacuating all tenants on a pageserver: since _all_ the attached tenants will go into AttachedStale, we will be doing no uploads at all, therefore ingested data will cause disk usage to increase continuously. Under nominal conditions, the available disk space on pageservers should be sufficient to complete the evacuation before this becomes a problem, but we must also handle the case where we hit a low disk situation while in this state. The concept of disk pressure already exists in the pageserver: the `disk_usage_eviction_task` touches each Tenant when it determines that a low-disk condition requires some layer eviction. Having selected layers for eviction, the eviction task calls `Timeline::evict_layers`. 
**Safety**: If evict_layers is called while in AttachedStale state, and some of the to-be-evicted layers are not yet uploaded to S3, then the block on uploads will be lifted. This will result in leaking some objects once a migration is complete, but will enable the node to manage its disk space properly: if a node is left with some tenants in AttachedStale indefinitely due to a network partition or control plane bug, these tenants will not cause a full disk condition. ### Warm secondary updates #### Layer heatmap The secondary location's job is to serve reads **with the same quality of service as the original location was serving them around the time of a migration**. This does not mean the secondary location needs the whole set of layers: inactive layers that might soon be evicted on the attached pageserver need not be downloaded by the secondary. A totally idle tenant only needs to maintain enough on-disk state to enable a fast cold start (i.e. the most recent image layers are typically sufficient). To enable this, we introduce the concept of a _layer heatmap_, which acts as an advisory input to secondary locations to decide which layers to download from S3. #### Attached pageserver The attached pageserver, if in state AttachedSingle, periodically uploads a serialized heat map to S3. It may skip this if there is no change since the last time it uploaded (e.g. if the tenant is totally idle). Additionally, when the tenant is flushed to remote storage prior to a migration (the first step in [cutover procedure](#cutover-procedure)), the heatmap is written out. This enables a future attached pageserver to get an up to date view when deciding which layers to download. #### Secondary location behavior Secondary warm locations run a simple loop, implemented separately from the main `Tenant` type, which represents attached tenants: - Download the layer heatmap - Select any "hot enough" layers to download, if there is sufficient free disk space. 
- Download layers, if they were not previously evicted (see below) - Download the latest index_part.json - Check if any layers currently on disk are no longer referenced by IndexPart & delete them Note that the heatmap is only advisory: if a secondary location has plenty of disk space, it may choose to retain layers that aren't referenced by the heatmap, as long as they are still referenced by the IndexPart. Conversely, if a node is very low on disk space, it might opt to raise the heat threshold required to download a layer, until more disk space is available. #### Secondary locations & disk pressure Secondary locations are subject to eviction on disk pressure, just as attached locations are. For eviction purposes, the access time of a layer in a secondary location will be the access time given in the heatmap, rather than the literal time at which the local layer file was accessed. The heatmap will indicate which layers are in local storage on the attached location. The secondary will always attempt to get back to having that set of layers on disk, but to avoid flapping, it will remember the access time of the layer it was most recently asked to evict, and layers whose access time is below that will not be re-downloaded. The resulting behavior is that after a layer is evicted from a secondary location, it is only re-downloaded once the attached pageserver accesses the layer and uploads a heatmap reflecting that access time. On a pageserver restart, the secondary location will attempt to download all layers in the heatmap again, if they are not on local disk. This behavior will be slightly different when secondary locations are used for "low energy tenants", but that is beyond the scope of this RFC. ### Location configuration API Currently, the `/tenant//config` API defines various tunables like compaction settings, which apply to the tenant irrespective of which pageserver it is running on. 
A new "location config" structure will be introduced, which defines configuration which is per-tenant, but local to a particular pageserver, such as the attachment mode and whether it is a secondary. The pageserver will expose a new per-tenant API for setting the state: `/tenant/<tenant_id>/location/config`. Body content: ``` { state: 'enum{Detached, Secondary, AttachedSingle, AttachedMulti, AttachedStale}', generation: Option, configuration: `Option` flush: bool } ``` Existing `/attach` and `/detach` endpoint will have the same behavior as calling `/location/config` with `AttachedSingle` and `Detached` states respectively. These endpoints will be deprecated and later removed. The generation attribute is mandatory for entering `AttachedSingle` or `AttachedMulti`. The configuration attribute is mandatory when entering any state other than `Detached`. This configuration is the same as the body for the existing `/tenant/<tenant_id>/config` endpoint. The `flush` argument indicates whether the pageservers should flush to S3 before proceeding: this only has any effect if the node is currently in AttachedSingle or AttachedMulti. This is used during the first phase of migration, when transitioning the old pageserver to AttachedStale. The `/re-attach` API response will be extended to include a `state` as well as a `generation`, enabling the pageserver to enter the correct state for each tenant on startup. ### Database schema for locations A new table `ProjectLocation`: - pageserver_id: int - tenant_id: TenantId - generation: Option - state: `enum(Secondary, AttachedSingle, AttachedMulti)` Notes: - It is legal for a Project to have zero `ProjectLocation`s - The `pageserver` column in `Project` now means "to which pageserver should endpoints connect", rather than simply which pageserver is attached. - The `generation` column in `Project` remains, and is incremented and used to set the generation of `ProjectLocation` rows when they are set into an attached state. 
- The `Detached` state is implicitly represented as the absence of a `ProjectLocation`. ### Executing migrations Migrations will be implemented as Go functions, within the existing `Operation` framework in the control plane. These operations are persistent, such that they will always keep trying until completion: this property is important to avoid leaving garbage behind on pageservers, such as AttachedStale locations. ### Recovery from failures during migration During migration, the control plane may encounter failures of either the original or new pageserver, or both: - If the original fails, skip past waiting for the new pageserver to catch up, and put it into AttachedSingle immediately. - If the new node fails, put the old pageserver into Secondary and then back into AttachedSingle (this has the effect of retaining on-disk state and granting it a fresh generation number). - If both nodes fail, keep trying until one of them is available again. ### Control plane -> Pageserver reconciliation A migration may be done while the old node is unavailable, in which case the old node may still be running in an AttachedStale state. In this case, it is undesirable to have the migration `Operation` stay alive until the old node eventually comes back online and can be cleaned up. To handle this, the control plane should run a background reconciliation process to compare a pageserver's attachments with the database, and clean up any that shouldn't be there any more. Note that there will be no work to do if the old node was really offline, as during startup it will call into `/re-attach` and be updated that way. The reconciliation will only be needed if the node was unavailable but still running. ## Alternatives considered ### Only enabling secondary locations for tenants on a higher service tier This will make sense in future, especially for tiny databases that may be downloaded from S3 in milliseconds when needed. 
However, it is not wise to do it immediately, because pageservers contain a mixture of higher and lower tier workloads. If we had 1 tenant with a secondary location and 9 without, then those other 9 tenants will do a lot of I/O as they try to recover from S3, which may degrade the service of the tenant which had a secondary location. Until we segregate tenant on different service tiers on different pageserver nodes, or implement & test QoS to ensure that tenants with secondaries are not harmed by tenants without, we should use the same failover approach for all the tenants. ### Hot secondary locations (continuous WAL replay) Instead of secondary locations populating their caches from S3, we could have them consume the WAL from safekeepers. The downsides of this would be: - Double load on safekeepers, which are a less scalable service than S3 - Secondary locations' on-disk state would end up subtly different to the remote state, which would make synchronizing with S3 more complex/expensive when going into attached state. The downside of only updating secondary locations from S3 is that we will have a delay during migration from replaying the LSN range between what's in S3 and what's in the pageserver. This range will be very small on planned migrations, as we have the old pageserver flush to S3 immediately before attaching the new pageserver. On unplanned migrations (old pageserver is unavailable), the range of LSNs to replay is bounded by the flush frequency on the old pageserver. However, the migration doesn't have to wait for the replay: it's just that not-yet-replayed LSNs will be unavailable for read until the new pageserver catches up. We expect that pageserver reads of the most recent LSNs will be relatively rare, as for an active endpoint those pages will usually still be in the postgres page cache: this leads us to prefer synchronizing from S3 on secondary locations, rather than consuming the WAL from safekeepers. 
### Cold secondary locations It is not functionally necessary to keep warm caches on secondary locations at all. However, if we do not, then we would experience a de-facto availability loss in unplanned migrations, as reads to the new node would take an extremely long time (many seconds, perhaps minutes). Warm caches on secondary locations are necessary to meet our availability goals. ### Pageserver-granularity failover Instead of migrating tenants individually, we could have entire spare nodes, and on a node death, move all its work to one of these spares. This approach is avoided for several reasons: - we would still need fine-grained tenant migration for other purposes such as balancing load - by sharing the spare capacity over many peers rather than one spare node, these peers may use the capacity for other purposes, until it is needed to handle migrated tenants. e.g. for keeping a deeper cache of their attached tenants. ### Readonly during migration We could simplify migrations by making both previous and new nodes go into a readonly state, then flush remote content from the previous node, then activate attachment on the secondary node. The downside to this approach is a potentially large gap in readability of recent LSNs while loading data onto the new node. To avoid this, it is worthwhile to incur the extra cost of double-replaying the WAL onto old and new nodes' local storage during a migration. ### Peer-to-peer pageserver communication Rather than uploading the heatmap to S3, attached pageservers could make it available to peers. Currently, pageservers have no peer to peer communication, so adding this for heatmaps would incur significant overhead in deployment and configuration of the service, and ensuring that when a new pageserver is deployed, other pageservers are updated to be aware of it. 
As well as simplifying implementation, putting heatmaps in S3 will be useful for future analytics purposes -- gathering aggregated statistics on activity patterns across many tenants may be done directly from data in S3. ================================================ FILE: docs/rfcs/029-getpage-throttling.md ================================================ # Per-Tenant GetPage@LSN Throttling Author: Christian Schwarz Date: Oct 24, 2023 ## Summary This RFC proposes per-tenant throttling of GetPage@LSN requests inside Pageserver and the interactions with its client, i.e., the neon_smgr component in Compute. The result of implementing & executing this RFC will be a fleet-wide upper limit for **"the highest GetPage/second that Pageserver can support for a single tenant/shard"**. ## Background ### GetPage@LSN Request Flow Pageserver exposes its `page_service.rs` as a libpq listener. The Computes' `neon_smgr` module connects to that libpq listener. Once a connection is established, the protocol allows Compute to request page images at a given LSN. We call these requests GetPage@LSN requests, or GetPage requests for short. Other request types can be sent, but these are low traffic compared to GetPage requests and are not the concern of this RFC. Pageserver associates one libpq connection with one tokio task. Per connection/task, the pq protocol is handled by the common `postgres_backend` crate. Its `run_message_loop` function invokes the `page_service` specific `impl postgres_backend::Handler for PageServerHandler`. Requests are processed in the order in which they arrive via the TCP-based pq protocol. So, there is no concurrent request processing within one connection/task. There is a degree of natural pipelining: Compute can "fill the pipe" by sending more than one GetPage request into the libpq TCP stream. And Pageserver can fill the pipe with responses in the other direction. Both directions are subject to the limit of tx/rx buffers, nodelay, TCP flow control, etc. 
### GetPage@LSN Access Pattern The Compute has its own hierarchy of caches, specifically `shared_buffers` and the `local file cache` (LFC). Compute only issues GetPage requests to Pageserver if it encounters a miss in these caches. If the working set stops fitting into Compute's caches, requests to Pageserver increase sharply -- the Compute starts *thrashing*. ## Motivation In INC-69, a tenant issued 155k GetPage/second for a period of 10 minutes and 60k GetPage/second for a period of 3h, then dropping to ca 18k GetPage/second for a period of 9h. We noticed this because of an internal GetPage latency SLO burn rate alert, i.e., the request latency profile during this period significantly exceeded what was acceptable according to the internal SLO. Sadly, we do not have the observability data to determine the impact of this tenant on other tenants on the same Pageserver. However, here are some illustrative data points for the 155k period: The tenant was responsible for >= 99% of the GetPage traffic and, frankly, the overall activity on this Pageserver instance. We were serving pages at 10 Gb/s (`155k x 8 kbyte (PAGE_SZ) per second is 1.12GiB/s = 9.4Gb/s.`) The CPU utilization of the instance was 75% user+system. Pageserver page cache served 1.75M accesses/second at a hit rate of ca 90%. The hit rate for materialized pages was ca. 40%. Curiously, IOPS to the Instance Store NVMe were very low, rarely exceeding 100. The fact that the IOPS were so low / the materialized page cache hit rate was so high suggests that **this tenant's compute's caches were thrashing**. The compute was of type `k8s-pod`; hence, auto-scaling could/would not have helped remediate the thrashing by provisioning more RAM. The consequence was that the **thrashing translated into excessive GetPage requests against Pageserver**. 
My claim is that it was **unhealthy to serve this workload at the pace we did**: * it is likely that other tenants were/would have experienced high latencies (again, we sadly don't have per-tenant latency data to confirm this) * more importantly, it was **unsustainable** to serve traffic at this pace for multiple reasons: * **predictability of performance**: when the working set grows, the pageserver materialized page cache hit rate drops. At some point, we're bound by the EC2 Instance Store NVMe drive's IOPS limit. The result is an **uneven** performance profile from the Compute perspective. * **economics**: Neon currently does not charge for IOPS, only capacity. **We cannot afford to undercut the market in IOPS/$ this drastically; it leads to adverse selection and perverse incentives.** For example, the 155k IOPS, which we served for 10min, would cost ca. 6.5k$/month when provisioned as an io2 EBS volume. Even the 18k IOPS, which we served for 9h, would cost ca. 1.1k$/month when provisioned as an io2 EBS volume. We charge 0$. It could be economically advantageous to keep using a low-DRAM compute because Pageserver IOPS are fast enough and free. Note: It is helpful to think of Pageserver as a disk, because it's precisely where `neon_smgr` sits: vanilla Postgres gets its pages from disk, Neon Postgres gets them from Pageserver. So, regarding the above performance & economic arguments, it is fair to say that we currently provide an "as-fast-as-possible-IOPS" disk that we charge for only by capacity. ## Solution: Throttling GetPage Requests **The consequence of the above analysis must be that Pageserver throttles GetPage@LSN requests**. That is, unless we want to start charging for provisioned GetPage@LSN/second. Throttling sets the correct incentive for a thrashing Compute to scale up its DRAM to the working set size. Neon Autoscaling will make this easy, [eventually](https://github.com/neondatabase/neon/pull/3913). 
## The Design Space What remains is the question about *policy* and *mechanism*: **Policy** concerns itself with the question of what limit applies to a given connection|timeline|tenant. Candidates are: * hard limit, same limit value per connection|timeline|tenant * Per-tenant will provide an upper bound for the impact of a tenant on a given Pageserver instance. This is a major operational pain point / risk right now. * hard limit, configurable per connection|timeline|tenant * This outsources policy to console/control plane, with obvious advantages for flexible structuring of what service we offer to customers. * Note that this is not a mechanism to guarantee a minimum provisioned rate, i.e., this is not a mechanism to guarantee a certain QoS for a tenant. * fair share among active connections|timelines|tenants per instance * example: each connection|timeline|tenant gets a fair fraction of the machine's GetPage/second capacity * NB: needs definition of "active", and knowledge of available GetPage/second capacity in advance * ... Regarding **mechanism**, it's clear that **backpressure** is the way to go. However, we must choose between * **implicit** backpressure through pq/TCP and * **explicit** rejection of requests + retries with exponential backoff Further, there is the question of how throttling GetPage@LSN will affect the **internal GetPage latency SLO**: where do we measure the SLI for Pageserver's internal getpage latency SLO? Before or after the throttling? And when we eventually move the measurement point into the Computes (to avoid coordinated omission), how do we avoid counting throttling-induced latency toward the internal getpage latency SLI/SLO? ## Scope Of This RFC **This RFC proposes introducing a hard GetPage@LSN/second limit per tenant, with the same value applying to each tenant on a Pageserver**. 
This proposal is easy to implement and significantly de-risks operating large Pageservers, based on the assumption that extremely-high-GetPage-rate-episodes like the one from the "Motivation" section are uncorrelated between tenants. For example, suppose we pick a limit that allows up to 10 tenants to go at limit rate. Suppose our Pageserver can serve 100k GetPage/second total at a 100% page cache miss rate. If each tenant gets a hard limit of 10k GetPage/second, we can serve up to 10 tenants at limit speed without latency degradation. The mechanism for backpressure will be TCP-based implicit backpressure. The compute team isn't concerned about prefetch queue depth. Pageserver will implement it by delaying the reading of requests from the libpq connection(s). The rate limit will be implemented using a per-tenant token bucket. The bucket will be shared among all connections to the tenant. The bucket implementation supports starvation-preventing `await`ing. The current candidate for the implementation is [`leaky_bucket`](https://docs.rs/leaky-bucket/). The getpage@lsn benchmark that's being added in https://github.com/neondatabase/neon/issues/5771 can be used to evaluate the overhead of sharing the bucket among connections of a tenant. A possible technique to mitigate the impact of sharing the bucket would be to maintain a buffer of a few tokens per connection handler. Regarding metrics / the internal GetPage latency SLO: we will measure the GetPage latency SLO _after_ the throttler and introduce a new metric to measure the amount of throttling, quantified by: - histogram that records the tenants' observations of queue depth before they start waiting (one such histogram per pageserver) - histogram that records the tenants' observations of time spent waiting (one such histogram per pageserver) Further observability measures: - an INFO log message at frequency 1/min if the tenant/timeline/connection was throttled in that last minute. 
The message will identify the tenant/timeline/connection to allow correlation with compute logs/stats. Rollout will happen as follows: - deploy 1: implementation + config: disabled by default, ability to enable it per tenant through tenant_conf - experimentation in staging and later production to study impact & interaction with auto-scaling - determination of a sensible global default value - the value will be chosen as high as possible ... - ... but low enough to work towards this RFC's goal that one tenant should not be able to dominate a pageserver instance. - deploy 2: implementation fixes if any + config: enabled by default with the aforementioned global default - reset of the experimental per-tenant overrides - gain experience & lower the limit over time - we stop lowering the limit as soon as this RFC's goal is achieved, i.e., once we decide that in practice the chosen value sufficiently de-risks operating large pageservers The per-tenant override will remain for emergencies and testing. But since Console doesn't preserve it during tenant migrations, it isn't durably configurable for the tenant. Toward the upper layers of the Neon stack, the resulting limit will be **"the highest GetPage/second that Pageserver can support for a single tenant"**. ### Rationale We decided against error + retry because of worries about starvation. ## Future Work Enable per-tenant emergency override of the limit via Console. Should be part of a more general framework to specify tenant config overrides. **NB:** this is **not** the right mechanism to _sell_ different max GetPage/second levels to users, or _auto-scale_ the GetPage/second levels. Such functionality will require a separate RFC that concerns itself with GetPage/second capacity planning. Compute-side metrics for GetPage latency. Back-channel to inform Compute/Autoscaling/ControlPlane that the project is being throttled. 
Compute-side neon_smgr improvements to avoid sending the same GetPage request multiple times if multiple backends experience a cache miss. Dealing with read-only endpoints: users use read-only endpoints to scale reads for a single tenant. Possibly there are also assumptions around read-only endpoints not affecting the primary read-write endpoint's performance. With per-tenant rate limiting, we will not meet that expectation. However, we can currently only scale per tenant. Soon, we will have sharding (#5505), which will apply the throttling on a per-shard basis. But, that's orthogonal to scaling reads: if many endpoints hit one shard, they share the same throttling limit. To solve this properly, I think we'll need replicas for tenants / shard. To performance-isolate a tenant's endpoints from each other, we'd then route them to different replicas. ================================================ FILE: docs/rfcs/029-pageserver-wal-disaster-recovery.md ================================================ # Name Created on: 2023-09-08 Author: Arpad Müller ## Summary Enable the pageserver to recover from data corruption events by implementing a feature to re-apply historic WAL records in parallel to the already occurring WAL replay. The feature is outside of the user-visible backup and history story, and only serves as a second-level backup for the case that there is a bug in the pageservers that corrupted the served pages. The RFC proposes the addition of two new features: * recover a broken branch from WAL (downtime is allowed) * a test recovery system to recover random branches to make sure recovery works ## Motivation The historic WAL is currently stored in S3 even after it has been replayed by the pageserver and thus been integrated into the pageserver's storage system. This is done to defend from data corruption failures inside the pageservers. 
However, application of this WAL in the disaster recovery setting is currently very manual and we want to automate this to make it easier. ### Use cases There are various use cases for this feature, like: * The main motivation is replaying in the instance of pageservers corrupting data. * We might want to, beyond the user-visible history features, through our support channels and upon customer request, in select instances, recover historic versions beyond the range of history that we officially support. * Running the recovery process in the background for random tenant timelines to figure out if there was a corruption of data (we would compare with what the pageserver stores for the "official" timeline). * Using the WAL to arrive at historic pages we can then back up to S3 so that WAL itself can be discarded, or at least not used for future replays. Again, this sounds a lot like what the pageserver is already doing, but the point is to provide a fallback to the service provided by the pageserver. ## Design ### Design constraints The main design constraint is that the feature needs to be *simple* enough that the number of bugs is as low, and reliability as high as possible: the main goal of this endeavour is to achieve higher correctness than the pageserver. For the background process, we cannot afford a downtime of the timeline that is being cloned, as we don't want to restrict ourselves to offline tenants only. In the scenario where we want to recover from disasters or roll back to a historic lsn through support staff, downtimes are more affordable, and inevitable if the original had been subject to the corruption. Ideally, the two code paths would share code, so the solution would be designed for not requiring downtimes. ### API endpoint changes This RFC proposes two API endpoint changes in the safekeeper and the pageserver. 
Remember, the pageserver timeline API creation endpoint is to this URL: ``` /v1/tenant/{tenant_id}/timeline/ ``` Where `{tenant_id}` is the ID of the tenant the timeline is created for, and specified as part of the URL. The timeline ID is passed via the POST request body as the only required parameter `new_timeline_id`. This proposal adds one optional parameter called `existing_initdb_timeline_id` to the request's json body. If the parameter is not specified, behaviour should be as existing, so the pageserver runs initdb. If the parameter is specified, it is expected to point to a timeline ID. In fact that ID might match `new_timeline_id`, what's important is that S3 storage contains a matching initdb under the URL matching the given tenant and timeline. Having both `ancestor_timeline_id` and `existing_initdb_timeline_id` specified is illegal and will result in an HTTP error. This feature is only meant for the "main" branch that doesn't have any ancestors of its own, as only here initdb is relevant. For the safekeeper, we propose the addition of the following copy endpoint: ``` /v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy ``` it is meant for POST requests with json, and the two URL parameters `tenant_id` and `source_timeline_id`. The json request body contains the two required parameters `target_timeline_id` and `until_lsn`. After invoking, the copy endpoint starts a copy process of the WAL from the source ID to the target ID. The lsn is updated according to the progress of the API call. ### Higher level features We want the API changes to support the following higher level features: * recovery-after-corruption DR of the main timeline of a tenant. This feature allows for downtime. * test DR of the main timeline into a special copy timeline. This feature is meant to run against selected production tenants in the background, without the user noticing, so it does not allow for downtime. The recovery-after-corruption DR only needs the pageserver changes. 
It works as follows: * delete the timeline from the pageservers via timeline deletion API * re-create it via timeline creation API (same ID as before) and set `existing_initdb_timeline_id` to the same timeline ID The test DR requires also the copy primitive and works as follows: * copy the WAL of the timeline to a new place * create a new timeline for the tenant ## Non Goals At the danger of being repetitive, the main goal of this feature is to be a backup method, so reliability is very important. This implies that other aspects like performance or space reduction are less important. ### Corrupt WAL The process suggested by this RFC assumes that the WAL is free of corruption. In some instances, corruption can make it into WAL, like for example when higher level components like postgres or the application first read corrupt data, and then execute a write with data derived from that earlier read. That written data might then contain the corruption. Common use cases can hit this quite easily. For example, an application reads some counter, increments it, and then writes the new counter value to the database. On a lower level, the compute might put FPIs (Full Page Images) into the WAL, which have corrupt data for rows unrelated to the write operation at hand. Separating corrupt writes from non-corrupt ones is a hard problem in general, and if the application was involved in making the corrupt write, a recovery would also involve the application. Therefore, corruption that has made it into the WAL is outside of the scope of this feature. However, the WAL replay can be issued to right before the point in time where the corruption occurred. Then the data loss is isolated to post-corruption writes only. ## Impacted components (e.g. pageserver, safekeeper, console, etc) Most changes would happen to the pageservers. For the higher level features, maybe other components like the console would be involved. 
We need to make sure that the shadow timelines are not subject to the usual limits and billing we apply to existing timelines. ## Proposed implementation The first problem to keep in mind is the reproducibility of `initdb`. So an initial step would be to upload `initdb` snapshots to S3. After that, we'd have the endpoint spawn a background process which performs the replay of the WAL to that new timeline. This process should follow the existing workflows as closely as possible, just using the WAL records of a different timeline. The timeline created will be in a special state that solely looks for WAL entries of the timeline it is trying to copy. Once the target LSN is reached, it turns into a normal timeline that also accepts writes to its own timeline ID. ### Scalability For now we want to run this entire process on a single node, and as it is by nature linear, it's hard to parallelize. However, for the verification workloads, we can easily start the WAL replay in parallel for different points in time. This is valuable especially for tenants with large WAL records. Compare this with the tricks to make addition circuits execute with lower latency by making them perform the addition for both possible values of the carry bit, and then, in a second step, taking the result for the carry bit that was actually obtained. The other scalability dimension to consider is the WAL length, which is a growing question as tenants accumulate changes. There are possible approaches to this, including creating snapshots of the page files and uploading them to S3, but if we do this for every single branch, we lose the cheap branching property. ### Implementation by component The proposed changes for the various components of the neon architecture are written up in this notion page: https://www.notion.so/neondatabase/Pageserver-disaster-recovery-one-pager-4ecfb5df16ce4f6bbfc3817ed1a6cbb2 ### Unresolved questions none known (outside of the mentioned ones). 
================================================ FILE: docs/rfcs/030-vectored-timeline-get.md ================================================ # Vectored Timeline Get Created on: 2024-01-02 Author: Christian Schwarz # Summary A brief RFC / GitHub Epic describing a vectored version of the `Timeline::get` method that is at the heart of Pageserver. **EDIT**: the implementation of this feature is described in [Vlad's (internal) tech talk](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link). # Motivation During basebackup, we issue many `Timeline::get` calls for SLRU pages that are *adjacent* in key space. For an example, see https://github.com/neondatabase/neon/blob/5c88213eaf1b1e29c610a078d0b380f69ed49a7e/pageserver/src/basebackup.rs#L281-L302. Each of these `Timeline::get` calls must traverse the layer map to gather reconstruct data (`Timeline::get_reconstruct_data`) for the requested page number (`blknum` in the example). For each layer visited by layer map traversal, we do a `DiskBtree` point lookup. If it's negative (no entry), we resume layer map traversal. If it's positive, we collect the result in our reconstruct data bag. If the reconstruct data bag contents suffice to reconstruct the page, we're done with `get_reconstruct_data` and move on to walredo. Otherwise, we resume layer map traversal. Doing this many `Timeline::get` calls is quite inefficient because: 1. We do the layer map traversal repeatedly, even if, e.g., all the data sits in the same image layer at the bottom of the stack. 2. We may visit many DiskBtree inner pages multiple times for point lookup of different keys. This is likely particularly bad for L0s which span the whole key space and hence must be visited by layer map traversal, but may not contain the data we're looking for. 3. Anecdotally, keys adjacent in keyspace and written simultaneously also end up physically adjacent in the layer files [^1]. 
So, to provide the reconstruct data for N adjacent keys, we would actually only _need_ to issue a single large read to the filesystem, instead of the N reads we currently do. The filesystem, in turn, ideally stores the layer file physically contiguously, so our large read will turn into one IOP toward the disk. [^1]: https://www.notion.so/neondatabase/Christian-Investigation-Slow-Basebackups-Early-2023-12-34ea5c7dcdc1485d9ac3731da4d2a6fc?pvs=4#15ee4e143392461fa64590679c8f54c9 # Solution We should have a vectored aka batched aka scatter-gather style alternative API for `Timeline::get`. Having such an API unlocks: * more efficient basebackup * batched IO during compaction (useful for strides of unchanged pages) * page_service: expose vectored get_page_at_lsn for compute (=> good for seqscan / prefetch) * if [on-demand SLRU downloads](https://github.com/neondatabase/neon/pull/6151) land before vectored Timeline::get, on-demand SLRU downloads will still benefit from this API # DoD There is a new variant of `Timeline::get`, called `Timeline::get_vectored`. It takes as arguments an `lsn: Lsn` and a `src: &[KeyVec]` where `struct KeyVec { base: Key, count: usize }`. It is up to the implementor to figure out a suitable and efficient way to return the reconstructed page images. It is sufficient to simply return a `Vec`, but, likely more efficient solutions can be found after studying all the callers of `Timeline::get`. Functionally, the behavior of `Timeline::get_vectored` is equivalent to ```rust let mut keys_iter: impl Iterator = src.map(|KeyVec{ base, count }| (base..base+count)).flatten(); let mut out = Vec::new(); for key in keys_iter { let data = Timeline::get(key, lsn)?; out.push(data); } return out; ``` However, unlike above, an ideal solution will * Visit each `struct Layer` at most once. * For each visited layer, call `Layer::get_value_reconstruct_data` at most once. * This means, read each `DiskBtree` page at most once. 
* Facilitate merging of the reads we issue to the OS and eventually NVMe. Each of these items above represents a significant amount of work. ## Performance Ideally, the **base performance** of a vectored get of a single page should be identical to the current `Timeline::get`. A reasonable constant overhead over current `Timeline::get` is acceptable. The performance improvement for the vectored use case is demonstrated in some way, e.g., using the `pagebench` basebackup benchmark against a tenant with a lot of SLRU segments. # Implementation High-level set of tasks / changes to be made: - **Get clarity on API**: - Define naive `Timeline::get_vectored` implementation & adopt it across pageserver. - The tricky thing here will be the return type (e.g. `Vec` vs `impl Stream`). - Start with something simple to explore the different usages of the API. Then iterate with peers until we have something that is good enough. - **Vectored Layer Map traversal** - Vectored `LayerMap::search` (take 1 LSN and N `Key`s instead of just 1 LSN and 1 `Key`) - Refactor `Timeline::get_reconstruct_data` to hold & return state for N `Key`s instead of 1 - The slightly tricky part here is what to do about `cont_lsn` [after we've found some reconstruct data for some keys](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2385) but need more. Likely we'll need to keep track of `cont_lsn` per key and continue next iteration at `max(cont_lsn)` of all keys that still need data. - **Vectored `Layer::get_value_reconstruct_data` / `DiskBtree`** - Current code calls it [here](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2384). - Delta layers use `DiskBtreeReader::visit()` to collect the `(offset,len)` pairs for delta record blobs to load. - Image layers use `DiskBtreeReader::get` to get the offset of the image blob to load. 
Underneath, that's just a `::visit()` call. - What needs to happen to `DiskBtree::visit()`? * Minimally * take a single `KeyVec` instead of a single `Key` as argument, i.e., take a single contiguous key range to visit. * Change the visit code to invoke the callback for all values in the `KeyVec`'s key range * This should be good enough for what we've seen when investigating basebackup slowness, because there, the key ranges are contiguous. * Ideally: * Take a `&[KeyVec]`, sort it; * during Btree traversal, peek at the next `KeyVec` range to determine whether we need to descend or back out. * NB: this should be a straight-forward extension of the minimal solution above, as we'll already be checking for "is there more key range in the requested `KeyVec`". - **Facilitate merging of the reads we issue to the OS and eventually NVMe.** - The `DiskBtree::visit` produces a set of offsets which we then read from a `VirtualFile` [here](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804) - [Delta layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804) - We hit (and rely on) `PageCache` and `VirtualFile` here (not great under pressure) - [Image layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/image_layer.rs#L429-L435) - What needs to happen is the **vectorization of the `blob_io` interface and then the `VirtualFile` API**. - That is tricky because - the `VirtualFile` API, which sits underneath `blob_io`, is being touched by ongoing [io_uring work](https://github.com/neondatabase/neon/pull/5824) - there's the question how IO buffers will be managed; currently this area relies heavily on `PageCache`, but there's controversy around the future of `PageCache`. 
- The guiding principle here should be to avoid coupling this work to the `PageCache`. - I.e., treat `PageCache` as an extra hop in the I/O chain, rather than as an integral part of buffer management. Let's see how we can improve by doing the first three items in above list first, then revisit. ## Rollout / Feature Flags No feature flags are required for this epic. At the end of this epic, `Timeline::get` forwards to `Timeline::get_vectored`, i.e., it's an all-or-nothing type of change. It is encouraged to deliver this feature incrementally, i.e., do many small PRs over multiple weeks. That will help isolate performance regressions across weekly releases. # Interaction With Sharding [Sharding](https://github.com/neondatabase/neon/pull/5432) splits up the key space, see functions `is_key_local` / `key_to_shard_number`. Just as with `Timeline::get`, callers of `Timeline::get_vectored` are responsible for ensuring that they only ask for blocks of the given `struct Timeline`'s shard. Given that this is already the case, there shouldn't be significant interaction/interference with sharding. However, let's have a safety check for this constraint (error or assertion) because there are currently few affordances at the higher layers of Pageserver for sharding<=>keyspace interaction. For example, `KeySpace` is not broken up by shard stripe, so if someone naively converted the compaction code to issue a vectored get for a keyspace range it would violate this constraint. ================================================ FILE: docs/rfcs/031-sharding-static.md ================================================ # Sharding Phase 1: Static Key-space Sharding ## Summary To enable databases with sizes approaching the capacity of a pageserver's disk, it is necessary to break up the storage for the database, or _shard_ it. Sharding in general is a complex area. 
This RFC aims to define an initial capability that will permit creating large-capacity databases using a static configuration defined at time of Tenant creation. ## Motivation Currently, all data for a Tenant, including all its timelines, is stored on a single pageserver. The local storage required may be several times larger than the actual database size, due to LSM write inflation. If a database is larger than what one pageserver can hold, then it becomes impossible for the pageserver to hold it in local storage, as it must do to provide service to clients. ### Prior art In Neon: - Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Konstantin-21fd9b11b618475da5f39c61dd8ab7a4 - Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843 - Key Space partitioning: https://www.notion.so/neondatabase/One-Pager-Key-Space-Partitioning-Stas-8e3a28a600a04a25a68523f42a170677 Prior art in other distributed systems is too broad to capture here: pretty much any scale out storage system does something like this. ## Requirements - Enable creating a large (for example, 16TiB) database without requiring dedicated pageserver nodes. - Share read/write bandwidth costs for large databases across pageservers, as well as storage capacity, in order to avoid large capacity databases acting as I/O hotspots that disrupt service to other tenants. - Our data distribution scheme should handle sparse/nonuniform keys well, since postgres does not write out a single contiguous range of page numbers. _Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database that a user might create on a current-gen enterprise SSD should also work well on Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the pageserver backend is not the limiting factor in the database size_. 
## Non Goals - Independently distributing timelines within the same tenant. If a tenant has many timelines, then sharding may be a less efficient mechanism for distributing load than sharing out timelines between pageservers. - Distributing work in the LSN dimension: this RFC focuses on the Key dimension only, based on the idea that separate mechanisms will make sense for each dimension. ## Impacted Components pageserver, control plane, postgres/smgr ## Terminology **Key**: a postgres page number, qualified by relation. In the sense that the pageserver is a versioned key-value store, the page number is the key in that store. `Key` is a literal data type in existing code. **LSN dimension**: this just means the range of LSNs (history), when talking about the range of keys and LSNs as a two dimensional space. ## Implementation ### Key sharding vs. LSN sharding When we think of sharding across the two dimensional key/lsn space, this is an opportunity to think about how the two dimensions differ: - Sharding the key space distributes the _write_ workload of ingesting data and compacting. This work must be carefully managed so that exactly one node owns a given key. - Sharding the LSN space distributes the _historical read_ workload. This work can be done by anyone without any special coordination, as long as they can see the remote index and layers. The key sharding is the harder part, and also the more urgent one, to support larger capacity databases. Because distributing historical LSN read work is a relatively simpler problem that most users don't have, we defer it to future work. It is anticipated that some quite simple P2P offload model will enable distributing work for historical reads: a node which is low on space can call out to peer to ask it to download and serve reads from a historical layer. ### Key mapping scheme Having decided to focus on key sharding, we must next decide how we will map keys to shards. 
It is proposed to use a "wide striping" approach, to obtain a good compromise between data locality and avoiding entire large relations mapping to the same shard. We will define two spaces: - Key space: unsigned integer - Shard space: integer from 0 to N-1, where we have N shards. ### Key -> Shard mapping Keys are currently defined in the pageserver's getpage@lsn interface as follows: ``` pub struct Key { pub field1: u8, pub field2: u32, pub field3: u32, pub field4: u32, pub field5: u8, pub field6: u32, } fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { Key { field1: 0x00, field2: rel.spcnode, field3: rel.dbnode, field4: rel.relnode, field5: rel.forknum, field6: blknum, } } ``` _Note: keys for relation metadata are ignored here, as this data will be mirrored to all shards. For distribution purposes, we only care about user data keys_ The properties we want from our Key->Shard mapping are: - Locality in `blknum`, such that adjacent `blknum` will usually map to the same stripe and consequently land on the same shard, even though the overall collection of blocks in a relation will be spread over many stripes and therefore many shards. - Avoid the same blknum on different relations landing on the same stripe, so that with many small relations we do not end up aliasing data to the same stripe/shard. - Avoid vulnerability to aliasing in the values of relation identity fields, such that if there are patterns in the value of `relnode`, these do not manifest as patterns in data placement. To accomplish this, the blknum is used to select a stripe, and stripes are assigned to shards in a pseudorandom order via a hash. The motivation for pseudo-random distribution (rather than sequential mapping of stripe to shard) is to avoid I/O hotspots when sequentially reading multiple relations: we don't want all relations' stripes to touch pageservers in the same order. To map a `Key` to a shard: - Hash the `Key` field 4 (relNode). 
- Divide field 6 (`blknum`) by the stripe size in pages, and combine the hash of this with the hash from the previous step. - The total hash modulo the shard count gives the shard holding this key. Why don't we use the other fields in the Key? - We ignore `forknum` for key mapping, because it distinguishes different classes of data in the same relation, and we would like to keep the data in a relation together. - We would like to use spcNode and dbNode, but cannot. Postgres database creation operations can refer to an existing database as a template, such that the created database's blocks differ only by spcNode and dbNode from the original. To enable running this type of creation without cross-pageserver communication, we must ensure that these blocks map to the same shard -- we do this by excluding spcNode and dbNode from the hash. ### Data placement examples For example, consider the extreme large databases cases of postgres data layout in a system with 8 shards and a stripe size of 32k pages: - A single large relation: `blknum` division will break the data up into 4096 stripes, which will be scattered across the shards. - 4096 relations of 32k pages each: each relation will map to exactly one stripe, and that stripe will be placed according to the hash of key field 4. The data placement will be statistically uniform across shards. Data placement will be more uneven on smaller databases: - A tenant with 2 shards and 2 relations of one stripe size each: there is a 50% chance that both relations land on the same shard and no data lands on the other shard. - A tenant with 8 shards and one relation of size 12 stripes: 4 shards will have double the data of the other four shards. 
These uneven cases for small amounts of data do not matter, as long as the stripe size is an order of magnitude smaller than the amount of data we are comfortable holding in a single shard: if our system handles shard sizes up to 10-100GB, then it is not an issue if a tenant has some shards with 256MB size and some shards with 512MB size, even though the standard deviation of shard size within the tenant is very high. Our key mapping scheme provides a statistical guarantee that as the tenant's overall data size increases, uniformity of placement will improve. ### Important Types #### `ShardIdentity` Provides the information needed to know whether a particular key belongs to a particular shard: - Layout version - Stripe size - Shard count - Shard index This structure's size is constant. Note that if we had used a different key mapping scheme such as consistent hashing with explicit hash ranges assigned to each shard, then the ShardIdentity's size would grow with the shard count: the simpler key mapping scheme used here enables a small fixed size ShardIdentity. ### Pageserver changes #### Structural Everywhere the Pageserver currently deals with Tenants, it will move to dealing with `TenantShard`s, which are just a `Tenant` plus a `ShardIdentity` telling it which part of the keyspace it owns. An un-sharded tenant is just a `TenantShard` whose `ShardIdentity` covers the whole keyspace. When the pageserver writes layers and index_part.json to remote storage, it must include the shard index & count in the name, to avoid collisions (the count is necessary for future-proofing: the count will vary in time). These keys will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work exactly the same for TenantShards as it does for Tenants today: each shard will have its own generation number.
#### Storage Format: Keys For tenants with >1 shard, layer files implicitly become sparse: within the key range described in the layer name, the layer file for a shard will only hold the content relevant to stripes assigned to the shard. For this reason, the LayerFileName within a tenant is no longer unique: different shards may use the same LayerFileName to refer to different data. We may solve this simply by including the shard number in the keys used for layers. The shard number will be included as a prefix (as part of tenant ID), like this: `pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/<layer file name>-<generation>` `pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/index_part.json-<generation>` Reasons for this particular format: - Use of a prefix is convenient for implementation (no need to carry the shard ID everywhere we construct a layer file name), and enables efficient listing of index_parts within a particular shard-timeline prefix. - Including the shard _count_ as well as shard number means that in future when we implement shard splitting, it will be possible for a parent shard and one of its children to write the same layer file without a name collision. For example, a parent shard 0_1 might split into two (0_2, 1_2), and in the process of splitting shard 0_2 could write a layer or index_part that is distinct from what shard 0_1 would have written at the same place. In practice, we expect shard counts to be relatively small, so a `u8` will be sufficient, and therefore the shard part of the path can be a fixed-length hex string like `{:02X}{:02X}`, for example a single-shard tenant's prefix will be `0001`. For backward compatibility, we may define a special `ShardIdentity` that has shard_count==0, and use this as a cue to construct paths with no prefix at all. #### Storage Format: Indices In the phase 1 described in this RFC, shards only reference layers they write themselves.
However, when we implement shard splitting in future, it will be useful to enable shards to reference layers written by other shards (specifically the parent shard during a split), so that shards don't have to exhaustively copy all data into their own shard-prefixed keys. To enable this, the `IndexPart` structure will be extended to store the (shard number, shard count) tuple on each layer, such that it can construct paths for layers written by other shards. This naturally raises the question of who "owns" such layers written by ancestral shards: this problem will be addressed in phase 2. For backward compatibility, any index entry without shard information will be assumed to be in the legacy shardidentity. #### WAL Ingest In Phase 1, all shards will subscribe to the safekeeper to download WAL content. They will filter it down to the pages relevant to their shard: - For ordinary user data writes, only retain a write if it matches the ShardIdentity - For metadata describing relations etc, all shards retain these writes. The pageservers must somehow give the safekeeper correct feedback on remote_consistent_lsn: one solution here is for the 0th shard to periodically peek at the IndexParts for all the other shards, and have only the 0th shard populate remote_consistent_lsn. However, this is relatively expensive: if the safekeeper can be made shard-aware then it could be taught to use the max() of all shards' remote_consistent_lsns to decide when to trim the WAL. #### Compaction/GC No changes needed. The pageserver doesn't have to do anything special during compaction or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity. This will result in sparse layer files, containing keys only in the stripes that this shard owns. Where optimizations currently exist in compaction for spotting "gaps" in the key range, these should be updated to ignore gaps that are due to sharding, to avoid spuriously splitting up layers into stripe-sized pieces.
### Compute Endpoints Compute endpoints will need to: - Accept a vector of connection strings as part of their configuration from the control plane - Route pageserver requests according to mapping the hash of key to the correct entry in the vector of connection strings. Doing this in compute rather than routing requests via a single pageserver is necessary to enable sharding tenants without adding latency from extra hops. ### Control Plane Tenants, or _Projects_ in the control plane, will each own a set of TenantShards (this will be 1 for small tenants). Logic for placement of tenant shards is just the same as the current logic for placing tenants. Tenant lifecycle operations like deletion will require fanning-out to all the shards in the tenant. The same goes for timeline creation and deletion: a timeline should not be considered created until it has been created in all shards. #### Selectively enabling sharding for large tenants Initially, we will explicitly enable sharding for large tenants only. In future, this hint mechanism will become optional when we implement automatic re-sharding of tenants. ## Future Phases This section exists to indicate what will likely come next after this phase. Phases 2a and 2b are amenable to execution in parallel. ### Phase 2a: WAL fan-out **Problem**: when all shards consume the whole WAL, the network bandwidth used for transmitting the WAL from safekeeper to pageservers is multiplied by a factor of the shard count. Network bandwidth is not our most pressing bottleneck, but it is likely to become a problem if we set a modest shard count (~8) on a significant number of tenants, especially as those larger tenants which we shard are also likely to have higher write bandwidth than average. ### Phase 2b: Shard Splitting **Problem**: the number of shards in a tenant is defined at creation time and cannot be changed. This causes excessive sharding for most small tenants, and an upper bound on scale for very large tenants. 
To address this, a _splitting_ feature will later be added. One shard can split its data into a number of children by doing a special compaction operation to generate image layers broken up child-shard-wise, and then writing out an `index_part.json` for each child. This will then require external coordination (by the control plane) to safely attach these new child shards and then move them around to distribute work. The opposite _merging_ operation can also be imagined, but is unlikely to be implemented: once a Tenant has been sharded, the marginal efficiency benefit of merging is unlikely to justify the risk/complexity of implementing such a rarely-encountered scenario. ### Phase N (future): distributed historical reads **Problem**: while sharding based on key is good for handling changes in overall database size, it is less suitable for spiky/unpredictable changes in the read workload to historical layers. Sudden increases in historical reads could result in sudden increases in local disk capacity required for a TenantShard. Example: the extreme case of this would be to run a tenant for a year, then create branches with ancestors at monthly intervals. This could lead to a sudden 12x inflation in the on-disk capacity footprint of a TenantShard, since it would be serving reads from all those disparate historical layers. If we can respond fast enough, then key-sharding a tenant more finely can help with this, but splitting may be a relatively expensive operation and the increased historical read load may be transient. A separate mechanism for handling heavy historical reads could be something like a gossip mechanism for pageservers to communicate about their workload, and then a getpageatlsn offload mechanism where one pageserver can ask another to go read the necessary layers from remote storage to serve the read. This requires relatively little coordination because it is read-only: any node can service any read.
All reads to a particular shard would still flow through one node, but the disk capacity & I/O impact of servicing the read would be distributed. ## FAQ/Alternatives ### Why stripe the data, rather than using contiguous ranges of keyspace for each shard? When a database is growing under a write workload, writes may predominantly hit the end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user is intensively re-writing a particular relation, if that relation lived in a particular shard then it would not achieve our goal of distributing the write work across shards. ### Why not proxy read requests through one pageserver, so that endpoints don't have to change? 1. This would not achieve scale-out of network bandwidth: a busy tenant with a large database would still cause a load hotspot on the pageserver routing its read requests. 2. The additional hop through the "proxy" pageserver would add latency and overall resource cost (CPU, network bandwidth) ### Layer File Spreading: use one pageserver as the owner of a tenant, and have it spread out work on a per-layer basis to peers In this model, there would be no explicit sharding of work, but the pageserver to which a tenant is attached would not hold all layers on its disk: instead, it would call out to peers to have them store some layers, and call out to those peers to request reads in those layers. This mechanism will work well for distributing work in the LSN dimension, but in the key space dimension it has the major limitation of requiring one node to handle all incoming writes, and compactions. Even if the write workload for a large database fits in one pageserver, it will still be a hotspot and such tenants may still de-facto require their own pageserver.
================================================ FILE: docs/rfcs/032-shard-splitting.md ================================================ # Shard splitting ## Summary This RFC describes a new pageserver API for splitting an existing tenant shard into multiple shards, and describes how to use this API to safely increase the total shard count of a tenant. ## Motivation In the [sharding RFC](031-sharding-static.md), a mechanism was introduced to scale tenants beyond the capacity of a single pageserver by breaking up the key space into stripes, and distributing these stripes across many pageservers. However, the shard count was defined once at tenant creation time and not varied thereafter. In practice, the expected size of a database is rarely known at creation time, and it is inefficient to enable sharding for very small tenants: we need to be able to create a tenant with a small number of shards (such as 1), and later expand when it becomes clear that the tenant has grown in size to a point where sharding is beneficial. ### Prior art Many distributed systems have the problem of choosing how many shards to create for tenants that do not specify an expected size up-front. There are a couple of general approaches: - Write to a key space in order, and start a new shard when the highest key advances past some point. This doesn't work well for Neon, because we write to our key space in many different contiguous ranges (per relation), rather than in one contiguous range. To adapt to this kind of model, we would need a sharding scheme where each relation had its own range of shards, which would be inefficient for the common case of databases with many small relations. - Monitor the system, and automatically re-shard at some size threshold. 
For example in Ceph, the [pg_autoscaler](https://github.com/ceph/ceph/blob/49c27499af4ee9a90f69fcc6bf3597999d6efc7b/src/pybind/mgr/pg_autoscaler/module.py) component monitors the size of each RADOS Pool, and adjusts the number of Placement Groups (Ceph's shard equivalent). ## Requirements - A configurable capacity limit per-shard is enforced. - Changes in shard count do not interrupt service beyond requiring postgres to reconnect (i.e. milliseconds). - Human being does not have to choose shard count ## Non Goals - Shard splitting is always a tenant-global operation: we will not enable splitting one shard while leaving others intact. - The inverse operation (shard merging) is not described in this RFC. This is a lower priority than splitting, because databases grow more often than they shrink, and a database with many shards will still work properly if the stored data shrinks, just with slightly more overhead (e.g. redundant WAL replication) - Shard splitting is only initiated based on capacity bounds, not load. Splitting a tenant based on load will make sense for some medium-capacity, high-load workloads, but is more complex to reason about and likely is not desirable until we have shard merging to reduce the shard count again if the database becomes less busy. ## Impacted Components pageserver, storage controller (the _storage controller_ is the evolution of what was called `attachment_service` in our test environment) ## Terminology **Parent** shards are the shards that exist before a split. **Child** shards are the new shards created during a split. **Shard** is synonymous with _tenant shard_. **Shard Index** is the 2-tuple of shard number and shard count, written in paths as {:02x}{:02x}, e.g. `0001`. ## Background In the implementation section, a couple of existing aspects of sharding are important to remember: - Shard identifiers contain the shard number and count, so that "shard 0 of 1" (`0001`) is a distinct shard from "shard 0 of 2" (`0002`). 
This is the case in key paths, local storage paths, and remote index metadata. - Remote layer file paths contain the shard index of the shard that created them, and remote indices contain the same index to enable building the layer file path. A shard's index may reference layers that were created by another shard. - Local tenant shard directories include the shard index. All layers downloaded by a tenant shard are stored in this shard-prefixed path, even if those layers were initially created by another shard: tenant shards do not read and write one another's paths. - The `Tenant` pageserver type represents one tenant _shard_, not the whole tenant. This is for historical reasons and will be cleaned up in future, but the existing name is used here to help comprehension when reading code. ## Implementation Note: this section focuses on the correctness of the core split process. This will be fairly inefficient in a naive implementation, and several important optimizations are described in a later section. There are broadly two parts to the implementation: 1. The pageserver split API, which splits one shard on one pageserver 2. The overall tenant split process which is coordinated by the storage controller, and calls into the pageserver split API as needed. ### Pageserver Split API The pageserver will expose a new API endpoint at `/v1/tenant/:tenant_shard_id/shard_split` that takes the new total shard count in the body. The pageserver split API operates on one tenant shard, on one pageserver. External coordination is required to use it safely, this is described in the later 'Split procedure' section. #### Preparation First identify the shard indices for the new child shards. These are deterministic, calculated from the parent shard's index, and the number of children being created (this is an input to the API, and validated to be a power of two). In a trivial example, splitting 0001 in two always results in 0002 and 0102.
Child shard indices are chosen such that the children's parts of the keyspace will be subsets of the parent's parts of the keyspace. #### Step 1: write new remote indices In remote storage, splitting is very simple: we may just write new index_part.json objects for each child shard, containing exactly the same layers as the parent shard. The children will have more data than they need, but this avoids any exhaustive re-writing or copying of layer files. The index key path includes a generation number: the parent shard's current attached generation number will also be used for the child shards' indices. This makes the operation safely retryable: if everything crashes and restarts, we may call the split API again on the parent shard, and the result will be some new remote indices for the child shards, under a higher generation number. #### Step 2: start new `Tenant` objects A new `Tenant` object may be instantiated for each child shard, while the parent shard still exists. When calling the tenant_spawn function for this object, the remote index from step 1 will be read, and the child shard will start to ingest WAL to catch up from whatever was in the remote storage at step 1. We now wait for child shards' WAL ingestion to catch up with the parent shard, so that we can safely tear down the parent shard without risking an availability gap to clients reading recent LSNs. #### Step 3: tear down parent `Tenant` object Once child shards are running and have caught up with WAL ingest, we no longer need the parent shard. Note that clients may still be using it -- when we shut it down, any page_service handlers will also shut down, causing clients to disconnect. When the client reconnects, it will re-lookup the tenant, and hit the child shard instead of the parent (shard lookup from page_service should bias toward higher ShardCount shards). Note that at this stage the page service client has not yet been notified of any split.
In the trivial single split example: - Shard 0001 is gone: Tenant object torn down - Shards 0002 and 0102 are running on the same pageserver where Shard 0001 used to live. - Clients will continue to connect to that server thinking that shard 0001 is there, and all requests will work, because any key that was in shard 0001 is definitely available in either shard 0002 or shard 0102. - Eventually, the storage controller (not the pageserver) will decide to migrate some child shards away: at that point it will do a live migration, ensuring that the client has an updated configuration before it detaches anything from the original server. #### Complete When we send a 200 response to the split request, we are promising the caller: - That the child shards are persistent in remote storage - That the parent shard has been shut down This enables the caller to proceed with the overall shard split operation, which may involve other shards on other pageservers. ### Storage Controller Split procedure Splitting a tenant requires calling the pageserver split API, and tracking enough state to ensure recovery + completion in the event of any component (pageserver or storage controller) crashing (or request timing out) during the split. 1. call the split API on all existing shards. Ensure that the resulting child shards are pinned to their pageservers until _all_ the split calls are done. This pinning may be implemented as a "split bit" on the tenant shards, that blocks any migrations, and also acts as a sign that if we restart, we must go through some recovery steps to resume the split. 2. Once all the split calls are done, we may unpin the child shards (clear the split bit). The split is now complete: subsequent steps are just migrations, not strictly part of the split. 3. Try to schedule new pageserver locations for the child shards, using a soft anti-affinity constraint to place shards from the same tenant onto different pageservers. 
Updating computes about the new shard count is not necessary until we migrate any of the child shards away from the parent's location. ### Recovering from failures #### Rolling back an incomplete split An incomplete shard split may be rolled back quite simply, by attaching the parent shards to pageservers, and detaching child shards. This will lose any WAL ingested into the children after the parents were detached earlier, but the parents will catch up. No special pageserver API is needed for this. From the storage controller's point of view, the procedure is: 1. For all parent shards in the tenant, ensure they are attached 2. For all child shards, ensure they are not attached 3. Drop child shards from the storage controller's database, and clear the split bit on the parent shards. Any remote storage content for child shards is left behind. This is similar to other cases where we may leave garbage objects in S3 (e.g. when we upload a layer but crash before uploading an index that references it). Future online scrub/cleanup functionality can remove these objects, or they will be removed when the tenant is deleted, as tenant deletion lists all objects in the prefix, which would include any child shards that were rolled back. If any timelines had been created on child shards, they will be lost when rolling back. To mitigate this, we will **block timeline creation during splitting**, so that we can safely roll back until the split is complete, without risking losing timelines.
Rolling back an incomplete split will happen automatically if a split fails due to some fatal reason, and will not be accessible via an API: - A pageserver fails to complete its split API request after too many retries - A pageserver returns a fatal unexpected error such as 400 or 500 - The storage controller database returns a non-retryable error - Some internal invariant is violated in the storage controller split code #### Rolling back a complete split A complete shard split may be rolled back similarly to an incomplete split, with the following modifications: - The parent shards will no longer exist in the storage controller database, so these must be re-synthesized somehow: the hard part of this is figuring the parent shards' generations. This may be accomplished either by probing in S3, or by retaining some tombstone state for deleted shards in the storage controller database. - Any timelines that were created after the split complete will disappear when rolling back to the tenant shards. For this reason, rolling back after a complete split should only be done due to serious issues where loss of recently created timelines is acceptable, or in cases where we have confirmed that no timelines were created in the intervening period. - Parent shards' layers must not have been deleted: this property will come "for free" when we first roll out sharding, by simply not implementing deletion of parent layers after a split. When we do implement such deletion (see "Cleaning up parent-shard layers" in the Optimizations section), it should apply a TTL to layers such that we have a defined walltime window in which rollback will be possible. The storage controller will expose an API for rolling back a complete split, for use in the field if we encounter some critical bug with a post-split tenant. 
#### Retrying API calls during Pageserver Restart When a pageserver restarts during a split API call, it may witness on-disk content for both parent and child shards from an ongoing split. This does not intrinsically break anything, and the pageserver may include all these shards in its `/re-attach` request to the storage controller. In order to support such restarts, it is important that the storage controller stores persistent records of each child shard before it calls into a pageserver, as these child shards may require generation increments via a `/re-attach` request. The pageserver restart will also result in a failed API call from the storage controller's point of view. Recall that if _any_ pageserver fails to split, the overall split operation may not complete, and all shards must remain pinned to their current pageserver locations until the split is done. The pageserver API calls during splitting will retry on transient errors, so that short availability gaps do not result in a failure of the overall operation. The split in progress will be automatically rolled back if the threshold for API retries is reached (e.g. if a pageserver stays offline for longer than a typical restart). #### Rollback on Storage Controller Restart On startup, the storage controller will inspect the split bit for tenant shards that it loads from the database. If any splits are in progress: - Database content will be reverted to the parent shards - Child shards will be dropped from memory - The parent and child shards will be included in the general startup reconciliation that the storage controller does: any child shards will be detached from pageservers because they don't exist in the storage controller's expected set of shards, and parent shards will be attached if they aren't already. 
#### Storage controller API request failures/retries The split request handler will implement idempotency: if the [`Tenant`] requested to split doesn't exist, we will check for the would-be child shards, and if they already exist, we consider the request complete. If a request is retried while the original request is still underway, then the split request handler will notice an InProgress marker in TenantManager, and return 503 to encourage the client to backoff/retry. This is the same as the general pageserver API handling for calls that try to act on an InProgress shard. #### Compute start/restart during a split If a compute starts up during split, it will be configured with the old sharding configuration. This will work for reads irrespective of the progress of the split as long as no child shards have been migrated away from their original location, and this is guaranteed in the split procedure (see earlier section). #### Pageserver fails permanently during a split If a pageserver permanently fails (i.e. the storage controller availability state for it goes to Offline) while a split is in progress, the splitting operation will roll back, and during the roll back it will skip any API calls to the offline pageserver. If the offline pageserver becomes available again, any stale locations will be cleaned up via the normal reconciliation process (the `/re-attach` API). ### Handling secondary locations For correctness, it is not necessary to split secondary locations. We can simply detach the secondary locations for parent shards, and then attach new secondary locations for child shards. Clearly this is not optimal, as it will result in re-downloads of layer files that were already present on disk. See "Splitting secondary locations" ### Conditions to trigger a split The pageserver will expose a new API for reporting on shards that are candidates for split: this will return a top-N report of the largest tenant shards by physical size (remote size).
This should exclude any tenants that are already at the maximum configured shard count. The API would look something like: `/v1/top_n_tenant?shard_count_lt=8&sort_by=resident_size` The storage controller will poll that API across all pageservers it manages at some appropriate interval (e.g. 60 seconds). A split operation will be started when the tenant exceeds some threshold. This threshold should be _less than_ how large we actually want shards to be, perhaps much less. That's to minimize the amount of work involved in splitting -- if we want 100GiB shards, we shouldn't wait for a tenant to exceed 100GiB before we split anything. Some data analysis of existing tenant size distribution may be useful here: if we can make a statement like "usually, if a tenant has exceeded 20GiB they're probably going to exceed 100GiB later", then we might make our policy to split a tenant at 20GiB. The finest split we can do is by factors of two, but we can do higher-cardinality splits too, and this will help to reduce the overhead of repeatedly re-splitting a tenant as it grows. An example of a very simple heuristic for early deployment of the splitting feature would be: "Split tenants into 8 shards when their physical size exceeds 64GiB": that would give us two kinds of tenant (1 shard and 8 shards), and the confidence that once we had split a tenant, it will not need re-splitting soon after. ## Optimizations ### Flush parent shard to remote storage during split Any data that is in WAL but not remote storage at time of split will need to be replayed by child shards when they start for the first time. To minimize this work, we may flush the parent shard to remote storage before writing the remote indices for child shards. It is important that this flush is subject to some time bounds: we may be splitting in response to a surge of write ingest, so it may be time-critical to split. 
A few seconds to flush latest data should be sufficient to optimize common cases without running the risk of holding up a split for a harmful length of time when a parent shard is being written heavily. If the flush doesn't complete in time, we may proceed to shut down the parent shard and carry on with the split. ### Hard linking parent layers into child shard directories Before we start the Tenant objects for child shards, we may pre-populate their local storage directories with hard links to the layer files already present in the parent shard's local directory. When the child shard starts and downloads its remote index, it will find all those layer files already present on local disk. This avoids wasting download capacity and makes splitting faster, but more importantly it avoids taking up a factor of N more disk space when splitting 1 shard into N. This mechanism will work well in typical flows where shards are migrated away promptly after a split, but for the general case including what happens when layers are evicted and re-downloaded after a split, see the 'Proactive compaction' section below. ### Filtering during compaction Compaction, especially image layer generation, should skip any keys that are present in a shard's layer files, but do not match the shard's ShardIdentity's is_key_local() check. This avoids carrying around data for longer than necessary in post-split compactions. This was already implemented in https://github.com/neondatabase/neon/pull/6246 ### Proactive compaction In remote storage, there is little reason to rewrite any data on a shard split: all the children can reference parent layers via the very cheap write of the child index_part.json. In local storage, things are more nuanced. During the initial split there is no capacity cost to duplicating parent layers, if we implement the hard linking optimization described above. 
However, as soon as any layers are evicted from local disk and re-downloaded, the downloaded layers will not be hard-links any more: they'll have real capacity footprint. That isn't a problem if we migrate child shards away from the parent node swiftly, but it risks a significant over-use of local disk space if we do not. For example, if we did an 8-way split of a shard, and then _didn't_ migrate 7 of the shards elsewhere, then churned all the layers in all the shards via eviction, then we would blow up the storage capacity used on the node by 8x. If we're splitting a 100GB shard, that could take the pageserver to the point of exhausting disk space. To avoid this scenario, we could implement a special compaction mode where we just read historic layers, drop unwanted keys, and write back the layer file. This is pretty expensive, but useful if we have split a large shard and are not going to migrate the child shards away. The heuristic conditions for triggering such a compaction are: - A) eviction plus time: if a child shard has existed for more than a time threshold, and has been requested to perform at least one eviction, then it becomes urgent for this child shard to execute a proactive compaction to reduce its storage footprint, at the cost of I/O load. - B) resident size plus time: we may inspect the resident layers and calculate how many of them include the overhead of storing pre-split keys. After some time threshold (different to the one in case A) we still have such layers occupying local disk space, then we should proactively compact them. ### Cleaning up parent-shard layers It is functionally harmless to leave parent shard layers in remote storage indefinitely. They would be cleaned up in the event of the tenant's deletion. As an optimization to avoid leaking remote storage capacity (which costs money), we may lazily clean up parent shard layers once no child shards reference them. This may be done _very_ lazily: e.g. check every PITR interval. 
The cleanup procedure is: - list all the key prefixes beginning with the tenant ID, and select those shard prefixes which do not belong to the most-recently-split set of shards (_ancestral shards_, i.e. `shard_count < max(shard_count) over all shards`), and those shard prefixes which do have the latest shard count (_current shards_) - If there are no _ancestral shard_ prefixes found, we have nothing to clean up and may drop out now. - find the latest-generation index for each _current shard_, read all and accumulate the set of layers belonging to ancestral shards referenced by these indices. - for all ancestral shards, list objects in the prefix and delete any layer which was not referenced by a current shard. If this cleanup is scheduled for 1-2 PITR periods after the split, there is a good chance that child shards will have written their own image layers covering the whole keyspace, such that all parent shard layers will be deletable. The cleanup may be done by the scrubber (external process), or we may choose to have the zeroth shard in the latest generation do the work -- there is no obstacle to one shard reading the other shard's indices at runtime, and we do not require visibility of the latest index writes. Cleanup should be artificially delayed by some period (for example 24 hours) to ensure that we retain the option to roll back a split in case of bugs. ### Splitting secondary locations We may implement a pageserver API similar to the main splitting API, which does a simpler operation for secondary locations: it would not write anything to S3, instead it would simply create the child shard directory on local disk, hard link in directories from the parent, and set up the in memory (TenantSlot) state for the children. Similar to attached locations, a subset of secondary locations will probably need re-locating after the split is complete, to avoid leaving multiple child shards on the same pageservers, where they may use excessive space for the tenant.
## FAQ/Alternatives ### What should the thresholds be set to? Shard size limit: the pre-sharding default capacity quota for databases was 200GiB, so this could be a starting point for the per-shard size limit. Max shard count: - The safekeeper overhead to sharding is currently O(N) network bandwidth because the un-filtered WAL is sent to all shards. To avoid this growing out of control, a limit of 8 shards should be temporarily imposed until WAL filtering is implemented on the safekeeper. - there is also little benefit to increasing the shard count beyond the number of pageservers in a region. ### Is it worth just rewriting all the data during a split to simplify reasoning about space? ================================================ FILE: docs/rfcs/033-storage-controller-drain-and-fill.md ================================================ # Graceful Restarts of Storage Controller Managed Clusters ## Summary This RFC describes new storage controller APIs for draining and filling tenant shards from/on pageserver nodes. It also covers how these new APIs should be used by an orchestrator (e.g. Ansible) in order to implement graceful cluster restarts. ## Motivation Pageserver restarts cause read availablity downtime for tenants. For example pageserver-3 @ us-east-1 was unavailable for a randomly picked tenant (which requested on-demand activation) for around 30 seconds during the restart at 2024-04-03 16:37 UTC. Note that lots of shutdowns on loaded pageservers do not finish within the [10 second systemd enforced timeout](https://github.com/neondatabase/infra/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse. This problem is not yet very acutely felt in storage controller managed pageservers since tenant density is much lower there. 
However, we are planning on eventually migrating all pageservers to storage controller management, so it makes sense to solve the issue proactively. ## Requirements - Pageserver re-deployments cause minimal downtime for tenants - The storage controller exposes HTTP API hooks for draining and filling tenant shards from a given pageserver. Said hooks can be used by an orchestrator process or a human operator. - The storage controller exposes some HTTP API to cancel draining and filling background operations. - Failures to drain or fill the node should not be fatal. In such cases, cluster restarts should proceed as usual (with downtime). - Progress of draining/filling is visible through metrics ## Non Goals - Integration with the control plane - Graceful restarts for large non-HA tenants. ## Impacted Components - storage controller - deployment orchestrator (i.e. Ansible) - pageserver (indirectly) ## Terminology ** Draining ** is the process through which all tenant shards that can be migrated from a given pageserver are distributed across the rest of the cluster. ** Filling ** is the symmetric opposite of draining. In this process tenant shards are migrated onto a given pageserver until the cluster reaches a reasonable, quiescent distribution of tenant shards across pageservers. ** Node scheduling policies ** act as constraints to the scheduler. For instance, when a node is set in the `Paused` policy, no further shards will be scheduled on it. ** Node ** is a pageserver. Term is used interchangeably in this RFC. ** Deployment orchestrator ** is a generic term for whatever drives our deployments. Currently, it's an Ansible playbook. ## Background ### Storage Controller Basics (skip if already familiar) Fundamentally, the storage controller is a reconciler which aims to move from the observed mapping between pageservers and tenant shards to an intended mapping.
Pageserver nodes and tenant shards metadata is durably persisted in a database, but note that the mapping between the two entities is not durably persisted. Instead, this mapping (*observed state*) is constructed at startup by sending `GET location_config` requests to registered pageservers. An internal scheduler maps tenant shards to pageservers while respecting certain constraints. The result of scheduling is the *intent state*. When the intent state changes, a *reconciliation* will inform pageservers about the new assignment via `PUT location_config` requests and will notify the compute via the configured hook. ### Background Optimizations The storage controller performs scheduling optimizations in the background. It will migrate attachments to warm secondaries and replace secondaries in order to balance the cluster out. ### Reconciliations Concurrency Limiting There's a hard limit on the number of reconciles that the storage controller can have in flight at any given time. To get an idea of scales, the limit is 128 at the time of writing. ## Implementation Note: this section focuses on the core functionality of the graceful restart process. It doesn't necessarily describe the most efficient approach. Optimizations are described separately in a later section. ### Overall Flow This section describes how to implement graceful restarts from the perspective of Ansible, the deployment orchestrator. Pageservers are already restarted sequentially. The orchestrator shall implement the following epilogue and prologue steps for each pageserver restart: #### Prologue The orchestrator shall first fetch the pageserver node id from the control plane or the pageserver it aims to restart directly. Next, it issues an HTTP request to the storage controller in order to start the drain of said pageserver node. All error responses are retried with a short back-off. When a 202 (Accepted) HTTP code is returned, the drain has started.
Now the orchestrator polls the node status endpoint exposed by the storage controller in order to await the end of the drain process. When the `policy` field of the node status response becomes `PauseForRestart`, the drain has completed and the orchestrator can proceed with restarting the pageserver. The prologue is subject to an overall timeout. It will have a value in the ballpark of minutes. As storage controller managed pageservers become more loaded this timeout will likely have to increase. #### Epilogue After restarting the pageserver, the orchestrator issues an HTTP request to the storage controller to kick off the filling process. This API call may be retried for all error codes with a short backoff. This also serves as a synchronization primitive as the fill will be refused if the pageserver has not yet re-attached to the storage controller. When a 202(Accepted) HTTP code is returned, the fill has started. Now the orchestrator polls the node status endpoint exposed by the storage controller in order to await the end of the filling process. When the `policy` field of the node status response becomes `Active`, the fill has completed and the orchestrator may proceed to the next pageserver. Again, the epilogue is subject to an overall timeout. We can start off with using the same timeout as for the prologue, but can also consider relying on the storage controller's background optimizations with a shorter timeout. In the case that the deployment orchestrator times out, it attempts to cancel the fill. This operation shall be retried with a short back-off. If it ultimately fails it will require manual intervention to set the nodes scheduling policy to `NodeSchedulingPolicy::Active`. Not doing that is not immediately problematic, but it constrains the scheduler as mentioned previously. ### Node Scheduling Policy State Machine The state machine below encodes the behaviours discussed above and the various failover situations described in a later section. 
Assuming no failures and/or timeouts the flow should be: `Active -> Draining -> PauseForRestart -> Active -> Filling -> Active` ``` Operator requested drain +-----------------------------------------+ | | +-------+-------+ +-------v-------+ | | | | | Pause | +-----------> Draining +----------+ | | | | | | +---------------+ | +-------+-------+ | | | | | | | Drain requested| | | | |Drain complete | Drain failed | | | Cancelled/PS reattach/Storcon restart | | | +-------+-------+ | | | | | | +-------------+ Active <-----------+------------------+ | | | | Fill requested | +---^---^-------+ | | | | | | | | | | | | | | Fill completed| | | | | |PS reattach | | | |after restart | +-------v-------+ | | +-------v-------+ | | | | | | | Filling +---------+ +-----------+PauseForRestart| | | | | +---------------+ +---------------+ ``` ### Draining/Filling APIs The storage controller API to trigger the draining of a given node is: `PUT /v1/control/node/:node_id/{drain,fill}`. The following HTTP non-success return codes are used. All of them are safely retriable from the perspective of the storage controller. - 404: Requested node was not found - 503: Requested node is known to the storage controller, but unavailable - 412: Drain precondition failed: there is no other node to drain to or the node's schedulling policy forbids draining - 409: A {drain, fill} is already in progress. Only one such background operation is allowed per node. When the drain is accepted and commenced a 202 HTTP code is returned. Drains and fills shall be cancellable by the deployment orchestrator or a human operator via: `DELETE /v1/control/node/:node_id/{drain,fill}`. A 200 response is returned when the cancelation is successful. Errors are retriable. 
### Drain Process Before accepting a drain request the following validations are applied: * Ensure that the node is known to the storage controller * Ensure that the scheduling policy is `NodeSchedulingPolicy::Active` or `NodeSchedulingPolicy::Pause` * Ensure that another drain or fill is not already running on the node * Ensure that a drain is possible (i.e. check that there is at least one schedulable node to drain to) After accepting the drain, the scheduling policy of the node is set to `NodeSchedulingPolicy::Draining` and persisted in both memory and the database. This disallows the optimizer from adding or removing shards from the node which is desirable to avoid them racing. Next, a separate Tokio task is spawned to manage the draining. For each tenant shard attached to the node being drained, demote the node to a secondary and attempt to schedule the node away. Scheduling might fail due to unsatisfiable constraints, but that is fine. Draining is a best effort process since it might not always be possible to cut over all shards. Importantly, this task manages the concurrency of issued reconciles in order to avoid drowning out the target pageservers and to allow other important reconciles to proceed. Once the triggered reconciles have finished or timed out, set the node's scheduling policy to `NodeSchedulingPolicy::PauseForRestart` to signal the end of the drain. A note on non HA tenants: These tenants do not have secondaries, so by the description above, they would not be migrated. It makes sense to skip them (especially the large ones) since, depending on tenant size, this might be more disruptive than the restart since the pageserver we've moved to will need to on-demand download the entire working set for the tenant. We can consider expanding to small non-HA tenants in the future.
### Fill Process Before accepting a fill request the following validations are applied: * Ensure that the node is known to the storage controller * Ensure that the scheduling policy is `NodeSchedulingPolicy::Active`. This is the only acceptable policy for the fill starting state. When a node re-attaches, it sets the scheduling policy to `NodeSchedulingPolicy::Active` if it was equal to `NodeSchedulingPolicy::PauseForRestart` or `NodeSchedulingPolicy::Draining` (possible end states for a node drain). * Ensure that another drain or fill is not already running on the node After accepting the fill, the scheduling policy of the node is set to `NodeSchedulingPolicy::Filling` and persisted in both memory and the database. This disallows the optimizer from adding or removing shards from the node which is desirable to avoid them racing. Next, a separate Tokio task is spawned to manage the filling. For each tenant shard where the filled node is a secondary, promote the secondary. This is done until we run out of shards or the counts of attached shards become balanced across the cluster. Like for draining, the concurrency of spawned reconciles is limited. ### Failure Modes & Handling Failures are generally handled by transition back into the `Active` (neutral) state. This simplifies the implementation greatly at the cost of adding transitions to the state machine. For example, we could detect the `Draining` state upon restart and proceed with a drain, but how should the storage controller know that's what the orchestrator needs still? #### Storage Controller Crash When the storage controller starts up reset the node scheduling policy of all nodes in states `Draining`, `Filling` or `PauseForRestart` to `Active`. The rationale is that when the storage controller restarts, we have lost context of what the deployment orchestrator wants. It also has the benefit of making things easier to reason about.
#### Pageserver Crash During Drain The pageserver will attempt to re-attach during restart at which point the node scheduling policy will be set back to `Active`, thus reenabling the scheduler to use the node. #### Non-drained Pageserver Crash During Drain What should happen when a pageserver we are draining to crashes during the process. Two reasonable options are: cancel the drain and focus on the failover *or* do both, but prioritise failover. Since the number of concurrent reconciles produced by drains/fills are limited, we get the latter behaviour for free. My suggestion is we take this approach, but the cancellation option is trivial to implement as well. #### Pageserver Crash During Fill The pageserver will attempt to re-attach during restart at which point the node scheduling policy will be set back to `Active`, thus reenabling the scheduler to use the node. #### Pageserver Goes unavailable During Drain/Fill The drain and fill jobs handle this by stopping early. When the pageserver is detected as online by storage controller heartbeats, reset its scheduling policy to `Active`. If a restart happens instead, see the pageserver crash failure mode. #### Orchestrator Drain Times Out Orchestrator will still proceed with the restart. When the pageserver re-attaches, the scheduling policy is set back to `Active`. #### Orchestrator Fill Times Out Orchestrator will attempt to cancel the fill operation. If that fails, the fill will continue until it quiesces and the node will be left in the `Filling` scheduling policy. This hinders the scheduler, but is otherwise harmless. A human operator can handle this by setting the scheduling policy to `Active`, or we can bake in a fill timeout into the storage controller. ## Optimizations ### Location Warmth When cutting over to a secondary, the storage controller will wait for it to become "warm" (i.e. download enough of the tenants data).
This means that some reconciliations can take significantly longer than others and hold up precious reconciliation units. As an optimization, the drain stage can only cut over tenants that are already "warm". Similarly, the fill stage can prioritise the "warmest" tenants in the fill. Given that the number of tenants managed by the storage controller will be fairly low for the foreseeable future, the first implementation could simply query the tenants for secondary status. This doesn't scale well with increasing tenant counts, so eventually we will need new pageserver API endpoints to report the sets of "warm" and "cold" nodes. ## Alternatives Considered ### Draining and Filling Purely as Scheduling Constraints At its core, the storage controller is a big background loop that detects changes in the environment and reacts on them. One could express draining and filling of nodes purely in terms of constraining the scheduler (as opposed to having such background tasks). While theoretically nice, I think that's harder to implement and more importantly operate and reason about. Consider cancellation of a drain/fill operation. We would have to update the scheduler state, create an entirely new schedule (intent state) and start work on applying that. It gets trickier if we wish to cancel the reconciliation tasks spawned by drain/fill nodes. How would we know which ones belong to the conceptual drain/fill? One could add labels to reconciliations, but it gets messy in my opinion. It would also mean that reconciliations themselves have side effects that persist in the database (persist something to the database when the drain is done), which I'm not conceptually fond of.
## Proof of Concept This RFC is accompanied by a POC which implements nearly everything mentioned here apart from the optimizations and some of the failure handling: https://github.com/neondatabase/neon/pull/7682 ================================================ FILE: docs/rfcs/034-ancestor-deletion.md ================================================ # Ancestor Timeline Deletion Created on: 2024-02-23 Author: John Spray # Summary When a tenant creates a new timeline that they will treat as their 'main' history, it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently this is necessary because it is forbidden to delete a timeline which has descendents. A new pageserver API is proposed to 'adopt' data from a parent timeline into one of its children, such that the link between ancestor and child can be severed, leaving the parent in a state where it may then be deleted. # Motivation Retaining parent timelines currently has two costs: - Cognitive load on users, who have to remember which is the "real" main timeline. - Storage capacity cost, as the parent timeline will retain layers up to the child's timeline point, even if the child fully covers its keyspace with image layers and will never actually read from the parent. # Solution A new pageserver API `PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor` will be added. The `timeline_id` in this URL is that of the _child_ timeline that we wish to detach from its parent. On success, this API will leave the following state: - The detached child timeline will no longer have an ancestor, and will contain all the data needed to service reads without recursing into an ancestor. - Any other children of the parent whose timeline points were at a lower LSN than the detached child timeline will be modified to have the child timeline as their new parent. - The parent timeline will still exist, but the child will no longer have it as an ancestor. 
If this was the last timeline that depended on the parent, then the parent will become deletable. This API's implementation will consist of a series of retryable steps, such that on failures/timeout it can safely be called again to reach the target state. ## Example ### Before The user has "rolled back" their project to LSN X, resulting in a "new main" timeline. The parent "old main" timeline still exists, and they would like to clean it up. They have two other timelines A and B. A is from before the rollback point, and B is from after the rollback point. ``` ----"old main" timeline-------X--------------------------------------------> | | | |-> child A | | |-> "new main" timeline | -> child B ``` ### After calling detach ancestor API The "new main" timeline is no longer dependent on old main, and neither is child A, because it had a branch point before X. The user may now choose to delete child B and "old main" to get to a pristine state. Child B is likely to be unwanted since the user chose to roll back to X, and it branches from after X. However, we don't assume this in the API; it is up to the user to delete it. ``` |----"old main" timeline----------------------------------------------------> | | | -> child B |----"new main" timeline---------> | |-> child A ``` ### After removing timelines We end up with a totally clean state that leaves no trace that a rollback ever happened: there is only one root timeline. ``` | ----"new main" timeline-----------> | |-> child A ``` ## Caveats Important things for API users to bear in mind: - this API does not delete the parent timeline: you must still do that explicitly. - if there are other child timelines ahead of the branch point of the detached child, the parent won't be deletable: you must either delete or detach those children. - do _not_ simply loop over all children and detach them all: this can have an extremely high storage cost. 
The detach ancestor API is intended for use on a single timeline to make it the new "main". - The detach ancestor API should also not be exposed directly to the user as button/API, because they might decide to click it for all the children and thereby generate many copies of the parent's data -- the detach ancestor API should be used as part of a high level "clean up after rollback" feature. ## `detach_ancestor` API implementation Terms used in the following sections: - "the child": the timeline whose ID is specified in the detach ancestor API URL, also called "new main" in the example. - "the parent": the parent of "the child". Also called "old main" in the example. - "the branch point": the ancestor_lsn of "the child" ### Phase 1: write out adopted layers to S3 The child will "adopt" layers from the parent, such that its end state contains all the parent's history as well as its own. For all layers in the parent's layer map whose high LSN is below the branch point, issue S3 CopyObject requests to duplicate them into the child timeline's prefix. Do not add them to the child's layer map yet. For delta layers in the parent's layer map which straddle the branch point, read them and write out only content up to the branch point into new layer objects. This is a long running operation if the parent has many layers: it should be implemented in a way that resumes rather than restarting from scratch, if the API times out and is called again. As an optimization, if there are no other timelines that will be adopted into the child, _and_ the child's image layers already fully cover the branch LSN, then we may skip adopting layers. ### Phase 2: update the child's index Having written out all needed layers in phase 1, atomically link them all into the child's IndexPart and upload to S3. This may be done while the child Timeline is still running. ### Phase 3: modify timelines ancestry Modify the child's ancestor to None, and upload its IndexPart to persist the change.
For all timelines which have the same parent as the child, and have a branch point lower than our branch point, switch their ancestor_timeline to the child, and upload their IndexPart to persist the change. ## Alternatives considered ### Generate full image layer on child, rather than adopting parent deltas This would work for the case of a single child, but would prevent re-targeting other timelines that depended on the parent. If we detached many children this way, the storage cost would become prohibitive (consider a 1TB database with 100 child timelines: it would cost 100TiB if they all generated their own image layers). ### Don't rewrite anything: just fake it in the API We could add a layer of indirection that let a child "pretend" that it had no ancestor, when in reality it still had the parent. The pageserver API could accept deletion of ancestor timelines, and just update child metadata to make them look like they have no ancestor. This would not achieve the desired reduction in storage cost, and may well be more complex to maintain than simply implementing the API described in this RFC. ### Avoid copying objects: enable child index to use parent layers directly We could teach IndexPart to store a TimelineId for each layer, such that a child timeline could reference a parent's layers directly, rather than copying them into the child's prefix. This would impose a cost for the normal case of indices that only target the timeline's own layers, add complexity, and break the useful simplifying invariant that timelines "own" their own path. If child timelines were referencing layers from the parent, we would have to ensure that the parent never runs GC/compaction again, which would make the API less flexible (the proposal in this RFC enables deletion of the parent but doesn't require it.) 
## Performance ### Adopting layers - CopyObject is a relatively cheap operation, but we may need to issue tens of thousands of such requests: this can take up to tens of seconds and will compete for RemoteStorage semaphore units with other activity on the pageserver. - If we are running on storage backend that doesn't implement CopyObject, then this part will be much more expensive as we would stream all layer content through the pageserver. This is no different to issuing a lot of reads to a timeline that does not have a warm local cache: it will move a lot of gigabytes, but that shouldn't break anything. - Generating truncated layers for delta that straddle the branch point will require streaming read/write of all the layers in question. ### Updating timeline ancestry The simplest way to update timeline ancestry will probably be to stop and start all the Timeline objects: this is preferable to the complexity of making their ancestry mutable at runtime. There will be a corresponding "stutter" in the availability of the timelines, of the order 10-100ms, which is the time taken to upload their IndexPart, and restart the Timeline. # Interaction with other features ## Concurrent timeline creation If new historic timelines are created using the parent as an ancestor while the detach ancestor API is running, they will not be re-parented to the child. This doesn't break anything, but it leaves the parent in a state where it might not be possible to delete it. Since timeline creations are an explicit user action, this is not something we need to worry about as the storage layer: a user who wants to delete their parent timeline will not create new children, and if they do, they can choose to delete those children to enable deleting the parent. 
For the least surprise to the user, before starting the detach ancestor branch operation, the control plane should wait until all branches are created and not allow any branches to be created before the branch point on the ancestor branch while the operation is ongoing. ## WAL based disaster recovery WAL based disaster recovery currently supports only restoring of the main branch. Enabling WAL based disaster recovery in the future requires that we keep a record which timeline generated the WAL and at which LSN was a parent detached. Keep a list of timeline ids and the LSN in which they were detached in the `index_part.json`. Limit the size of the list to 100 first entries, after which the WAL disaster recovery will not be possible. ## Sharded tenants For sharded tenants, calls to the detach ancestor API will pass through the storage controller, which will handle them the same as timeline creations: invoke first on shard zero, and then on all the other shards. ================================================ FILE: docs/rfcs/035-safekeeper-dynamic-membership-change.md ================================================ # Safekeeper dynamic membership change To quickly recover from safekeeper node failures and do rebalancing we need to be able to change set of safekeepers the timeline resides on. The procedure must be safe (not lose committed log) regardless of safekeepers and compute state. It should be able to progress if any majority of old safekeeper set, any majority of new safekeeper set and compute are up and connected. This is known as a consensus membership change. It always involves two phases: 1) switch old majority to old + new configuration, preventing commits without acknowledge from the new set 2) bootstrap the new set by ensuring majority of the new set has all data which ever could have been committed before the first phase completed; after that switch is safe to finish. 
Without two phases, the switch would be to a new set whose quorum might not intersect with the quorum of the old set (and typical case of ABC -> ABD switch is an example of that, because quorums AC and BD don't intersect). Furthermore, procedure is typically carried out by the consensus leader, and so enumeration of configurations which establishes order between them is done through consensus log. In our case consensus leader is compute (walproposer), and we don't want to wake up all computes for the change. Neither do we want to fully reimplement the leader logic a second time outside compute. Because of that the proposed algorithm relies for issuing configurations on the external fault tolerant (distributed) strongly consistent storage with simple API: CAS (compare-and-swap) on the single key. Properly configured postgres suits this. In the system consensus is implemented at the timeline level, so algorithm below applies to the single timeline. ## Algorithm ### Definitions A configuration is ``` struct Configuration { generation: SafekeeperGeneration, // a number uniquely identifying configuration sk_set: Vec<NodeId>, // current safekeeper set new_sk_set: Optional<Vec<NodeId>>, } ``` Configuration with `new_sk_set` present is used for the intermediate step during the change and called joint configuration. Generations establish order of configurations: we say `c1` is higher than `c2` if `c1.generation` > `c2.generation`. ### Persistently stored data changes Safekeeper starts storing its current configuration in the control file. Update of it is atomic, so in-memory value always matches the persistent one. External CAS providing storage (let's call it configuration storage here) also stores configuration for each timeline. It is initialized with generation 1 and initial set of safekeepers during timeline creation. Executed CAS on it must never be lost. ### Compute <-> safekeeper protocol changes `ProposerGreeting` message carries walproposer's configuration if it is already established (see below), else null.
`AcceptorGreeting` message carries safekeeper's current `Configuration`. All further messages (`VoteRequest`, `VoteResponse`, `ProposerElected`, `AppendRequest`, `AppendResponse`) carry generation number, of walproposer in case of wp->sk message or of safekeeper in case of sk->wp message. ### Safekeeper changes Basic rule: once safekeeper observes configuration higher than his own it immediately switches to it. It must refuse all messages with lower generation than his. It also refuses messages if it is not member of the current generation (that is, of either `sk_set` or `new_sk_set`), though it is likely not unsafe to process them (walproposer should ignore result anyway). If there is non null configuration in `ProposerGreeting` and it is higher than current safekeeper one, safekeeper switches to it. Safekeeper sends its current configuration in its first message to walproposer `AcceptorGreeting`. It refuses all other walproposer messages if the configuration generation in them is less than its current one. Namely, it refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In response it sends its current configuration generation to let walproposer know. Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/membership` accepting `Configuration`. Safekeeper switches to the given conf if it is higher than its current one and ignores it otherwise. In any case it replies with ``` struct TimelineMembershipSwitchResponse { conf: Configuration, term: Term, last_log_term: Term, flush_lsn: Lsn, } ``` ### Compute (walproposer) changes Basic rule is that joint configuration requires votes from majorities in both `sk_set` and `new_sk_set`. Compute receives list of safekeepers to connect to from the control plane as currently and tries to communicate with all of them. However, the list does not define consensus members. Instead, on start walproposer tracks highest configuration it receives from `AcceptorGreeting`s.
Once it assembles greetings from majority of `sk_set` and majority of `new_sk_set` (if it is present), it establishes this configuration as its own and moves to voting. It should stop talking to safekeepers not listed in the configuration at this point, though it is not unsafe to continue doing so. To be elected it must receive votes from both majorities if `new_sk_set` is present. Similarly, to commit WAL it must receive flush acknowledge from both majorities. If walproposer hears from safekeeper configuration higher than his own (i.e. refusal to accept due to configuration change) it simply restarts. ### Change algorithm The following algorithm can be executed anywhere having access to configuration storage and safekeepers. It is safe to interrupt / restart it and run multiple instances of it concurrently, though likely one of them won't make progress then. It accepts `desired_set: Vec` as input. Algorithm will refuse to make the change if it encounters previous interrupted change attempt, but in this case it will try to finish it. It will eventually converge if old majority, new majority and configuration storage are reachable. 1) Fetch current timeline configuration from the configuration storage. 2) If it is already joint one and `new_set` is different from `desired_set` refuse to change. However, assign join conf to (in memory) var `joint_conf` and proceed to step 4 to finish the ongoing change. 3) Else, create joint `joint_conf: Configuration`: increment current conf number `n` and put `desired_set` to `new_sk_set`. Persist it in the configuration storage by doing CAS on the current generation: change happens only if current configuration number is still `n`. Apart from guaranteeing uniqueness of configurations, CAS linearizes them, ensuring that new configuration is created only following the previous one when we know that the transition is safe. Failed CAS aborts the procedure. 
4) Call `PUT` `configuration` on safekeepers from the current set, delivering them `joint_conf`. Collecting responses from majority is required to proceed. If any response returned generation higher than `joint_conf.generation`, abort (another switch raced us). Otherwise, choose max `<last_log_term, flush_lsn>` among responses and establish it as (in memory) `sync_position`. Also choose max `term` and establish it as (in memory) `sync_term`. We can't finish the switch until majority of the new set catches up to this `sync_position` because data before it could be committed without ack from the new set. Similarly, we'll bump term on new majority to `sync_term` so that two computes with the same term are never elected. 4) Initialize timeline on safekeeper(s) from `new_sk_set` where it doesn't exist yet by doing `pull_timeline` from the majority of the current set. Doing that on majority of `new_sk_set` is enough to proceed, but it is reasonable to ensure that all `new_sk_set` members are initialized -- if some of them are down why are we migrating there? 5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set. Success on majority is enough. 6) Repeatedly call `PUT` `configuration` on safekeepers from the new set, delivering them `joint_conf` and collecting their positions. This will switch them to the `joint_conf` which generally won't be needed because `pull_timeline` already includes it and plus additionally would be broadcast by compute. More importantly, we may proceed to the next step only when `<last_log_term, flush_lsn>` on the majority of the new set reached `sync_position`. Similarly, on the happy path no waiting is needed because `pull_timeline` already includes it. However, we should double check to be safe. For example, timeline could have been created earlier e.g. manually or after try-to-migrate, abort, try-to-migrate-again sequence. 7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new safekeeper set as `sk_set` and None `new_sk_set`.
Write it to configuration storage under one more CAS. 8) Call `PUT` `configuration` on safekeepers from the new set, delivering them `new_conf`. It is enough to deliver it to the majority of the new set; the rest can be updated by compute. I haven't put huge effort to make the description above very precise, because it is natural language prone to interpretations anyway. Instead I'd like to make TLA+ spec of it. Description above focuses on safety. To make the flow practical and live, here are a few more considerations. 1) It makes sense to ping new set to ensure we are migrating to live node(s) before step 3. 2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed it is safe to rollback to the old conf with one more CAS. 3) On step 4 timeline might be already created on members of the new set for various reasons; the simplest is the procedure restart. There are more complicated scenarios like mentioned in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving generations, so seems simpler to treat existing timeline as success. However, this also has a disadvantage: you might imagine a surpassingly unlikely schedule where condition in the step 5 is never reached until compute is (re)awakened to synchronize new member(s). I don't think we'll observe this in practice, but can add waking up compute if needed. 4) In the end timeline should be locally deleted on the safekeeper(s) which are in the old set but not in the new one, unless they are unreachable. To be safe this also should be done under generation number (deletion proceeds only if current configuration is <= than one in request and safekeeper is not member of it). 5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`, jump to step 7, using it as `new_conf`. ## Implementation The procedure ought to be driven from somewhere.
Obvious candidates are control plane and storage_controller; and as each of them already has db we don't want yet another storage. I propose to manage safekeepers in storage_controller because 1) since it is in rust it simplifies simulation testing (more on this below) 2) it already manages pageservers. This assumes that migration will be fully usable only after we migrate all tenants/timelines to storage_controller. It is discussible whether we want also to manage pageserver attachments for all of these, but likely we do. This requires us to define storcon <-> cplane interface and changes. ### storage_controller <-> control plane interface and changes First of all, control plane should [change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829) storing safekeepers per timeline instead of per tenant because we can't migrate tenants atomically. The important question is how updated configuration is delivered from storage_controller to control plane to provide it to computes. As always, there are two options, pull and push. Let's do it the same push as with pageserver `/notify-attach` because 1) it keeps storage_controller out of critical compute start path 2) uniformity. It makes storage_controller responsible for retrying notifying control plane until it succeeds. It is not needed for the control plane to fully know the `Configuration`. It is enough for it only to be aware of the list of safekeepers in the latest configuration to supply it to compute, plus associated generation number to protect from stale update requests and to also pass it to compute. So, cplane `/notify-safekeepers` for the timeline can accept JSON like ``` { tenant_id: String, timeline_id: String, generation: u32, safekeepers: Vec<SafekeeperId>, } ``` where `SafekeeperId` is ``` { node_id: u64, host: String } ``` In principle `host` is redundant, but may be useful for observability.
The request updates list of safekeepers in the db if the provided conf generation is higher (the cplane db should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it should update db which makes the call successful, and then try to schedule `apply_config` if possible, it is ok if not. storage_controller should rate limit calling the endpoint, but likely this won't be needed, as migration throughput is limited by `pull_timeline`. Timeline (branch) creation in cplane should call storage_controller POST `tenant/:tenant_id/timeline` like it currently does for sharded tenants. Response should be augmented with `safekeepers_generation` and `safekeepers` fields like described in `/notify-safekeepers` above. Initially (currently) these fields may be absent; in this case cplane chooses safekeepers on its own like it currently does. The call should be retried until it succeeds. Timeline deletion and tenant deletion in cplane should call appropriate storage_controller endpoints like it currently does for sharded tenants. The calls should be retried until they succeed. When compute receives safekeeper list from control plane it needs to know the generation to check whether it should be updated (note that compute may get safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers` GUC is just a comma separates list of `host:port`. Let's prefix it with `g#:` to this end, so it will look like ``` g#42:safekeeper-0.eu-central-1.aws.neon.tech:6401,safekeeper-2.eu-central-1.aws.neon.tech:6401,safekeeper-1.eu-central-1.aws.neon.tech:6401 ``` To summarize, list of cplane changes: - per tenant -> per timeline safekeepers management and addition of int `safekeeper_generation` field. - `/notify-safekeepers` endpoint. 
- Branch creation call may return list of safekeepers and when it is present cplane should adopt it instead of choosing on its own like it does currently. - `neon.safekeepers` GUC should be prefixed with `g#:`. ### storage_controller implementation If desired, we may continue using current 'load everything on startup and keep in memory' approach: single timeline shouldn't take more than 100 bytes (it's 16 byte tenant_id, 16 byte timeline_id, int generation, vec of ~3 safekeeper ids plus some flags), so 10^6 of timelines shouldn't take more than 100MB. Similar to pageserver attachment Intents storage_controller would have in-memory `MigrationRequest` (or its absence) for each timeline and pool of tasks trying to make these requests reality; this ensures one instance of storage_controller won't do several migrations on the same timeline concurrently. In the first version it is simpler to have more manual control and no retries, i.e. migration failure removes the request. Later we can build retries and automatic scheduling/migration around. `MigrationRequest` is ``` enum MigrationRequest { To(Vec<NodeId>), FinishPending, } ``` `FinishPending` requests to run the procedure to ensure state is clean: current configuration is not joint and the majority of safekeepers are aware of it, but do not attempt to migrate anywhere. If the current configuration fetched on step 1 is not joint it jumps to step 7. It should be run at startup for all timelines (but similarly, in the first version it is ok to trigger it manually). #### Schema `safekeepers` table mirroring current `nodes` should be added, except that for `scheduling_policy`: it is enough to have at least in the beginning only 3 fields: 1) `active` 2) `paused` (initially means only not assign new tlis there) 3) `decommissioned` (node is removed). `timelines` table: ``` table!
{ // timeline_id is primary key timelines (tenant_id, timeline_id) { timeline_id -> Varchar, tenant_id -> Varchar, start_lsn -> pg_lsn, generation -> Int4, sk_set -> Array, // list of safekeeper ids new_sk_set -> Nullable>, // list of safekeeper ids, null if not joint conf cplane_notified_generation -> Int4, sk_set_notified_generation -> Int4, // the generation a quorum of sk_set knows about deleted_at -> Nullable, } } ``` `start_lsn` is needed to create timeline on safekeepers properly, see below. We might also want to add ancestor_timeline_id to preserve the hierarchy, but for this RFC it is not needed. `cplane_notified_generation` and `sk_set_notified_generation` fields are used to track the last stage of the algorithm, when we need to notify safekeeper set and cplane with the final configuration after it's already committed to DB. The timeline is up-to-date (no migration in progress) if `new_sk_set` is null and `*_notified_generation` fields are up to date with `generation`. It's possible to replace `*_notified_generation` with one boolean field `migration_completed`, but for better observability it's nice to have them separately. #### API Node management is similar to pageserver: 1) POST `/control/v1/safekeeper` inserts safekeeper. 2) GET `/control/v1/safekeeper` lists safekeepers. 3) GET `/control/v1/safekeeper/:node_id` gets safekeeper. 4) PUT `/control/v1/safekeeper/:node_id/scheduling_policy` changes status to e.g. `offline` or `decommissioned`. Initially it is simpler not to schedule any migrations here. Safekeeper deploy scripts should register safekeeper at storage_controller as they currently do with cplane, under the same id. Timeline creation/deletion will work through already existing POST and DELETE `tenant/:tenant_id/timeline`. Cplane is expected to retry both until they succeed. See next section on the implementation details. We don't want to block timeline creation/deletion when one safekeeper is down.
Currently this is crutched by compute implicitly creating timeline on any safekeeper it is connected to. This creates ugly timeline state on safekeeper when timeline is created, but start LSN is not defined yet. Next section describes dealing with this. Tenant deletion repeats timeline deletion for all timelines. Migration API: the first version is the simplest and the most imperative: 1) PUT `/control/v1/safekeepers/migrate` schedules `MigrationRequest`s to move all timelines from one safekeeper to another. It accepts json ``` { "src_sk": NodeId, "dst_sk": NodeId, "limit": Optional, } ``` Returns list of scheduled requests. 2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest` to move single timeline to given set of safekeepers: ``` struct TimelineSafekeeperMigrateRequest { "new_sk_set": Vec, } ``` In the first version the handler migrates the timeline to `new_sk_set` synchronously. Should be retried until success. In the future we might change it to asynchronous API and return scheduled request. Similar call should be added for the tenant. It would be great to have some way of subscribing to the results (apart from looking at logs/metrics). 3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return current in memory state of the timeline and pending `MigrationRequest`, if any. 4) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate_abort` tries to abort the migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS (incrementing generation as always). #### API implementation and reconciliation For timeline creation/deletion we want to preserve the basic assumption that unreachable minority (1 sk of 3) doesn't block their completion, but eventually we want to finish creation/deletion on nodes which missed it (unless they are removed). 
Similarly for migration; it may and should finish even though excluded members missed their exclusion. And of course e.g. such pending exclusion on node C after migration ABC -> ABD must not prevent next migration ABD -> ABE. As another example, if some node missed timeline creation it clearly must not block migration from it. Hence it is natural to have per safekeeper background reconciler which retries these ops until they succeed. There are 3 possible operation types, and the type is defined by timeline state (membership configuration and whether it is deleted) and safekeeper id: we may need to create timeline on sk (node added), locally delete it (node excluded, somewhat similar to detach) or globally delete it (timeline is deleted). Next, on storage controller restart in principle these pending operations can be figured out by comparing safekeepers state against storcon state. But it seems better to me to materialize them in the database; it is not expensive, avoids these startup scans which themselves can fail etc and makes it very easy to see outstanding work directly at the source of truth -- the db. So we can add table `safekeeper_timeline_pending_ops` ``` table! { // timeline_id, sk_id is primary key safekeeper_timeline_pending_ops (sk_id, tenant_id, timeline_id) { sk_id -> int8, tenant_id -> Varchar, timeline_id -> Varchar, generation -> Int4, op_type -> Varchar, } } ``` We load all pending ops from the table on startup into the memory. The table is needed only to preserve the state between restarts. `op_type` can be `include` (seed from peers and ensure generation is up to date), `exclude` (remove locally) and `delete`. Field is actually not strictly needed as it can be computed from current configuration, but gives more explicit observability. `generation` is necessary there because after op is done reconciler must remove it and not remove another row with higher gen which in theory might appear. 
Any insert of row should overwrite (remove) all rows with the same sk and timeline id but lower `generation` as next op makes previous obsolete. Insertion of `op_type` `delete` overwrites all rows. About `exclude`: rather than adding explicit safekeeper http endpoint, it is reasonable to reuse membership switch endpoint: if safekeeper is not member of the configuration it locally removes the timeline on the switch. In this case 404 should also be considered an 'ok' answer by the caller. So, main loop of per sk reconcile reads `safekeeper_timeline_pending_ops` joined with timeline configuration to get current conf (with generation `n`) for the safekeeper and does the jobs, infinitely retrying failures: 1) If node is member (`include`): - Check if timeline exists on it, if not, call pull_timeline on it from other members - Call switch configuration to the current 2) If node is not member (`exclude`): - Call switch configuration to the current, 404 is ok. 3) If timeline is deleted (`delete`), call delete. In cases 1 and 2 remove `safekeeper_timeline_pending_ops` for the sk and timeline with generation <= `n` if `op_type` is not `delete`. In case 3 also remove `safekeeper_timeline_pending_ops` entry + remove `timelines` entry if there is nothing left in `safekeeper_timeline_pending_ops` for the timeline. Let's consider in details how APIs can be implemented from this angle. Timeline creation. It is assumed that cplane retries it until success, so all actions must be idempotent. Now, a tricky point here is timeline start LSN. For the initial (tenant creation) call cplane doesn't know it. However, setting start_lsn on safekeepers during creation is a good thing -- it provides a guarantee that walproposer can always find a common point in WAL histories of safekeeper and its own, and so absence of it would be a clear sign of corruption. The following sequence works: 1) Create timeline (or observe that it exists) on pageserver, figuring out last_record_lsn in response. 
2) Choose safekeepers and insert (ON CONFLICT DO NOTHING) timeline row into the db. Note that last_record_lsn returned on the previous step is movable as it changes once ingestion starts, insert must not overwrite it (as well as other fields like membership conf). On the contrary, start_lsn used in the next step must be set to the value in the db. cplane_notified_generation can be set to 1 (initial generation) in insert to avoid notifying cplane about initial conf as cplane will receive it in timeline creation request anyway. 3) Issue timeline creation calls to at least majority of safekeepers. Using majority here is not necessary but handy because it guarantees that any live majority will have at least one sk with created timeline and so reconciliation task can use pull_timeline shared with migration instead of create timeline special init case. OFC if timeline is already exists call is ignored. 4) For minority of safekeepers which could have missed creation insert entries to `safekeeper_timeline_pending_ops`. We won't miss this insertion because response to cplane is sent only after it has happened, and cplane retries the call until 200 response. There is a small question how request handler (timeline creation in this case) would interact with per sk reconciler. In the current implementation we first persist the request in the DB, and then send an in-memory request to each safekeeper reconciler to process it. For pg version / wal segment size: while we may persist them in `timelines` table, it is not necessary as initial creation at step 3 can take them from pageserver or cplane creation call and later pull_timeline will carry them around. Timeline migration. 1) CAS to the db to create joint conf. Since this moment the migration is considered to be "in progress". We can detect all "in-progress" migrations looking into the database. 
2) Do steps 4-6 from the algorithm, including `pull_timeline` onto `new_sk_set`, update membership configuration on all safekeepers, notify cplane, etc. All operations are idempotent, so we don't need to persist anything in the database at this stage. If any errors occur, it's safe to retry or abort the migration. 3) Once it becomes possible per alg description above, get out of joint conf with another CAS. Also should insert `exclude` entries into `safekeeper_timeline_pending_ops` in the same DB transaction. Adding `exclude` entries atomically is necessary because after CAS we don't have the list of excluded safekeepers in the `timelines` table anymore, but we need to have them persisted somewhere in case the migration is interrupted right after the CAS. 4) Finish the migration. The final membership configuration is committed to the DB at this stage. So, the migration can not be aborted anymore. But it can still be retried if the migration fails past stage 3. To finish the migration we need to send the new membership configuration to a new quorum of safekeepers, notify cplane with the new safekeeper list and schedule the `exclude` requests to in-memory queue for safekeeper reconciler. If the algorithm is retried, it's possible that we have already committed `exclude` requests to DB, but didn't send them to the in-memory queue. In this case we need to read them from `safekeeper_timeline_pending_ops` because it's the only place where they are persistent. The fields `sk_set_notified_generation` and `cplane_notified_generation` are updated after each step. The migration is considered fully completed when they match the `generation` field. In practice, we can report "success" after stage 3 and do the "finish" step in per-timeline reconciler (if we implement it). But it's wise to at least try to finish them synchronously, so the timeline is always in a "good state" and doesn't require an old quorum to commit WAL after the migration reported "success".
Timeline deletion: just set `deleted_at` on the timeline row and insert `safekeeper_timeline_pending_ops` entries in the same xact, the rest is done by per sk reconcilers. When node is removed (set to `decommissioned`), `safekeeper_timeline_pending_ops` for it must be cleared in the same transaction. #### Dealing with multiple instances of storage_controller Operations described above executed concurrently might create some errors but do not prevent progress, so while we normally don't want to run multiple instances of storage_controller it is fine to have it temporarily, e.g. during redeploy. To harden against some controller instance creating some work in `safekeeper_timeline_pending_ops` and then disappearing without anyone picking up the job, per sk reconcilers apart from explicit wakeups should scan for work periodically. It is possible to remove that though if all db updates are protected with leadership token/term -- then such scans are needed only after leadership is acquired. Any interactions with db update in-memory controller state, e.g. if migration request failed because different one is in progress, controller remembers that and tries to finish it. ## Testing `neon_local` should be switched to use storage_controller, playing role of control plane. There should be the following layers of tests: 1) Model checked TLA+ spec specifies the algorithm and verifies its basic safety. 2) To cover real code and at the same time test many schedules we should have simulation tests. For that, configuration storage, storage_controller <-> safekeeper communication and pull_timeline need to be mocked and main switch procedure wrapped as a node (thread) in simulation tests, using these mocks. Test would inject migrations like it currently injects safekeeper/walproposer restarts. Main assert is the same -- committed WAL must not be lost. 3) Since simulation testing injects at relatively high level points (not syscalls), it omits some code, in particular `pull_timeline`.
Thus it is better to have basic tests covering whole system as well. Extended version of `test_restarts_under_load` would do: start background load and do migration under it, then restart endpoint and check that no reported commits had been lost. I'd also add one more creating classic network split scenario, with one compute talking to AC and another to BD while migration from nodes ABC to ABD happens. 4) Simple e2e test should ensure that full flow including cplane notification works. ## Order of implementation and rollout Note that - Control plane parts and integration with it is fully independent from everything else (tests would use simulation and neon_local). - It is reasonable to make compute <-> safekeepers protocol change independent of enabling generations. - There is a lot of infra work making storage_controller aware of timelines and safekeepers and its impl/rollout should be separate from migration itself. - Initially walproposer can just stop working while it observes joint configuration. Such window would be typically very short anyway. - Obviously we want to test the whole thing thoroughly on staging and only then gradually enable in prod. Let's have the following implementation bits for gradual rollout: - compute gets `neon.safekeepers_proto_version` flag. Initially both compute and safekeepers will be able to talk both versions so that we can delay force restart of them and for simplicity of rollback in case it is needed. - storcon gets `-set-safekeepers` config option disabled by default. Timeline creation request chooses safekeepers (and returns them in response to cplane) only when it is set to true. - control_plane [see above](storage_controller-<->-control-plane interface-and-changes) prefixes `neon.safekeepers` GUC with generation number. When it is 0 (or prefix not present at all), walproposer behaves as currently, committing on the provided safekeeper list -- generations are disabled. If it is non 0 it follows this RFC rules. 
- We provide a script for manual migration to storage controller. It selects timeline(s) from control plane (specified or all of them) db and calls special import endpoint on storage controller which is very similar to timeline creation: it inserts into the db, sets configuration to initial on the safekeepers, calls cplane `notify-safekeepers`. Then the rollout for a region would be: - Current situation: safekeepers are chosen by control_plane. - We manually migrate some timelines, test moving them around. - Then we enable `--set-safekeepers` so that all new timelines are on storage controller. - Finally migrate all existing timelines using the script (no compute should be speaking old proto version at this point). Until all timelines are managed by storcon we'd need to use current ad hoc script to migrate if needed. To keep state clean, all storage controller managed timelines must be migrated before that, or controller db and configurations state of safekeepers dropped manually. Very rough implementation order: - Add concept of configurations to safekeepers (including control file), implement v3 protocol. - Implement walproposer changes, including protocol. - Implement storconn part. Use it in neon_local (and pytest). - Make cplane store safekeepers per timeline instead of per tenant. - Implement cplane/storcon integration. Route branch creation/deletion through storcon. Then we can test migration of new branches. - Finally import existing branches. Then we can drop cplane safekeeper selection code. Gradually enable configurations at computes and safekeepers. Before that, all computes must talk only v3 protocol version. ## Integration with evicted timelines Currently, `pull_timeline` doesn't work correctly with evicted timelines because copy would point to original partial file. To fix let's just do s3 copy of the file. It is a bit stupid as generally unnecessary work, but it makes sense to implement proper migration before doing smarter timeline archival. 
[Issue](https://github.com/neondatabase/neon/issues/8542) ## Possible optimizations Steps above suggest walproposer restart (with re-election) and thus reconnection to safekeepers. Since by bumping term on new majority we ensure that leader terms are unique even across generation switches it is possible to preserve connections. However, it is more complicated, reconnection is very fast and it is much more important to avoid compute restart than millisecond order of write stall. Multiple joint consensus: algorithm above rejects attempt to change membership while another attempt is in progress. It is possible to overlay them and AFAIK Aurora does this but similarly I don't think this is needed. ## Misc We should use Compute <-> safekeeper protocol change to include other (long yearned) modifications: - send data in network order without putting whole structs to be arch independent - remove term_start_lsn from AppendRequest - add horizon to TermHistory - add to ProposerGreeting number of connection from this wp to sk ================================================ FILE: docs/rfcs/035-timeline-archive.md ================================================ # Timeline Archival ## Summary This RFC describes a mechanism for pageservers to eliminate local storage + compute work for timelines which are not in use, in response to external API calls to "archive" a timeline. The archived state roughly corresponds to fully offloading a timeline to object storage, such that its cost is purely the cost of that object storage. ## Motivation Archived timelines serve multiple purposes: - Act as a 'snapshot' for workloads that would like to retain restorable copies of their database from longer ago than their PITR window. - Enable users to create huge numbers of branches (e.g. one per github PR) without having to diligently clean them up later to avoid overloading the pageserver (currently we support up to ~500 branches per tenant). 
### Prior art Most storage and database systems have some form of snapshot, which can be implemented several ways: 1. full copies of data (e.g. an EBS snapshot to S3) 2. shallow snapshots which are CoW relative to the original version of the data, e.g. on a typical NFS appliance, or a filesystem like CephFS. 3. a series of snapshots which are CoW or de-duplicated relative to one another. Today's Neon branches are approximately like `2.`, although due to implementation details branches often end up storing much more data than they really need, as parent branches assume that all data at the branch point is needed. The layers pinned in the parent branch may have a much larger size than the physical size of a compressed image layer representing the data at the branch point. ## Requirements - Enter & exit the archived state in response to external admin API calls - API calls to modify the archived state are atomic and durable - An archived timeline should eventually (once out of PITR window) use an efficient compressed representation, and avoid retaining arbitrarily large data in its parent branch. - Remote object GETs during tenant start may be O(N) with the number of _active_ branches, but must not scale with the number of _archived_ branches. - Background I/O for archived branches should only be done a limited number of times to evolve them to a long-term-efficient state (e.g. rewriting to image layers). There should be no ongoing "housekeeping" overhead for archived branches, including operations related to calculating sizes for billing. - The pageserver should put no load on the safekeeper for archived branches. - Performance of un-archiving a branch must make good use of S3/disk bandwidth to restore the branch to a performant state in a short time (linear with the branch's logical size) ## Non Goals - Archived branches are not a literal `fullbackup` postgres snapshot: they are still stored in Neon's internal format. 
- Compute cold starts after activating an archived branch will not have comparable performance to cold starts on an active branch. - Archived branches will not use any new/additional compression or de-duplication beyond what is already implemented for image layers (zstd per page). - The pageserver will not "auto start" archived branches in response to page_service API requests: they are only activated explicitly via the HTTP API. - We will not implement a total offload of archived timelines from safekeepers: their control file (small) will remain on local disk, although existing eviction mechanisms will remove any segments from local disk. - We will not expose any prometheus metrics for archived timelines, or make them visible in any detailed HTTP APIs other than the specific API for listing archived timelines. - A parent branch may not be archived unless all its children are. ## Impacted Components pageserver, storage controller ## Terminology **Archived**: a branch is _archived_ when an HTTP API request to archive it has succeeded: the caller may assume that this branch is now very cheap to store, although this may not be physically so until the branch proceeds to the offloaded state. **Active** branches are branches which are available for use by page_service clients, and have a relatively high cost due to consuming local storage. **Offloaded** branches are a subset of _archived_ branches, which have had their local state removed such that they now consume minimal runtime resources and have a cost similar to the cost of object storage. **Activate** (verb): transition from Archived to Active **Archive** (verb): transition from Active to Archived **Offload** (verb): transition from Archived to Offloaded **Offload manifest**: an object stored in S3 that describes timelines which pageservers do not load. **Warm up** (verb): operation done on an active branch, by downloading its active layers. 
Once a branch is warmed up, good performance will be available to page_service clients. ## Implementation ### High level flow We may think of a timeline which is archived and then activated as proceeding through a series of states: ```mermaid stateDiagram [*] --> Active(warm) Active(warm) --> Archived Archived --> Offloaded Archived --> Active(warm) Offloaded --> Active(cold) Active(cold) --> Active(warm) ``` Note that the transition from Archived to Active(warm) is expected to be fairly rare: the most common lifecycles of branches will be: - Very frequent: Short lived branches: Active -> Deleted - Frequent: Long-lived branches: Active -> Archived -> Offloaded -> Deleted - Rare: Branches used to restore old state: Active ->Archived -> Offloaded -> Active These states are _not_ all stored as a single physical state on the timeline, but rather represent the combination of: - the timeline's lifecycle state: active or archived, stored in the timeline's index - its offload state: whether pageserver has chosen to drop local storage of the timeline and write it into the manifest of offloaded timelines. - cache state (whether it's warm or cold). ### Storage format changes There are two storage format changes: 1. `index_part.json` gets a new attribute `state` that describes whether the timeline is to be considered active or archived. 2. A new tenant-level _manifest_ object `tenant_manifest-v1.json` describes which timelines a tenant does not need to load at startup (and is available for storing other small, rarely changing tenant-wide attributes in future) The manifest object will have a format like this: ``` { "offload_timelines": [ { "timeline_id": ... "last_record_lsn": ... "last_record_lsn_time": ... "pitr_interval": ... "last_gc_lsn": ... # equal to last_record_lsn if this branch has no history (i.e. a snapshot) "logical_size": ... # The size at last_record_lsn "physical_size" ... "parent": Option<{ "timeline_id"... "lsn"... 
# Branch point LSN on the parent "requires_data": bool # True if this branch depends on layers in its parent, identify it here }> } ] } ``` The information about a timeline in its offload state is intentionally minimal: just enough to decide: - Whether it requires [archive optimization](#archive-branch-optimization) by rewriting as a set of image layers: we may infer this by checking if now > last_record_lsn_time - pitr_interval, and pitr_lsn < last_record_lsn. - Whether a parent branch should include this offloaded branch in its GC inputs to avoid removing layers that the archived branch depends on - Whether requests to delete this `timeline_id` should be executed (i.e. if a deletion request is received for a timeline_id that isn't in the set of live `Timelines` or in the manifest, then we don't need to go to S3 for the deletion). - How much archived space to report in consumption metrics The contents of the manifest's offload list will also be stored as an attribute of `Tenant`, such that the total set of timelines may be found by the union of `Tenant::timelines` (non-offloaded timelines) and `Tenant::offloaded` (offloaded timelines). For split-brain protection, the manifest object will be written with a generation suffix, in the same way as index_part objects are (see [generation numbers RFC](025-generation-numbers.md)). This will add some complexity, but give us total safety against two pageservers with the same tenant attached fighting over the object. Existing code for finding the latest generation and for cleaning up old generations (in the scrubber) will be generalized to cover the manifest file. ### API & Timeline state Timelines will store a lifecycle state (enum of Active or Archived) in their IndexPart. This will be controlled by a new per-timeline `configure` endpoint. This is intentionally generic naming, which may be used in future to control other per-timeline attributes (e.g. in future we may make PITR interval a per-timeline configuration).
`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configure` ``` { 'state': 'active|archive' } ``` When archiving a timeline, this API will complete as soon as the timeline's state has been set in index_part, and that index has been uploaded. When activating a timeline, this API will complete as soon as the timeline's state has been set in index_part, **and** the `Timeline` object has been instantiated and activated. This will require reading the timeline's index, but not any data: it should be about as fast as a couple of small S3 requests. The API will be available with an identical path via the storage controller: calling this on a sharded tenant will simply map the API call to all the shards. Archived timelines may never have descendent timelines which are active. This will be enforced at the API level, such that activating a timeline requires that all its ancestors are active, and archiving a timeline requires that all its descendents are archived. It is the caller's responsibility to walk the hierarchy of timelines in the proper order if they would like to archive whole trees of branches. Because archived timelines will be excluded from the usual timeline listing APIs, a new API specifically for archived timelines will be added: this is for use in support/debug: ``` GET /v1/tenants/{tenant_id}/archived_timelines { ...same per-timeline content as the tenant manifest... } ``` ### Tenant attach changes Currently, during Tenant::spawn we list all the timelines in the S3 bucket, and then for each timeline we load their index_part.json. To avoid the number of GETs scaling linearly with the number of archived timelines, we must have a single object that tells us which timelines do not need to be loaded. The number of ListObjects requests while listing timelines will still scale O(N), but this is less problematic because each request covers 1000 timelines. This is **not** literally the same as the set of timelines which have state=archived.
Rather, it is the set of timelines which have been offloaded in the background after their state was set to archived. We may simply skip loading these timelines: there will be no special state of `Timeline`, they just won't exist from the perspective of an active `Tenant` apart from in deletion: timeline deletion will need to check for offloaded timelines as well as active timelines, to avoid wrongly returning 404 on trying to delete an offloaded timeline. ### Warm-up API `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=1234` This API will be similar to the existing `download_remote_layers` API, but smarter: - It will not download _all_ remote layers, just the visible set (i.e. layers needed for a read) - It will download layers in the visible set until reaching `wait_ms`, then return a struct describing progress of downloads, so that the caller can poll. The _visible set_ mentioned above will be calculated by the pageserver in the background, by taking the set of readable LSNs (i.e. branch points and heads of branches), and walking the layer map to work out which layers can possibly be read from these LSNs. This concept of layer visibility is more generally useful for cache eviction and heatmaps, as well as in this specific case of warming up a timeline. The caller does not have to wait for the warm up API, or call it at all. But it is strongly advised to call it, because otherwise populating local contents for a timeline can take a long time when waiting for SQL queries to coincidentally hit all the layers, and during that time query latency remains quite volatile. ### Background work Archived branches are not subject to normal compaction. Instead, when the compaction loop encounters an archived branch, it will consider rewriting the branch to just image layers if the branch has no history ([archive branch optimization](#archive-branch-optimization)), or offloading the timeline from local disk if its state permits that. 
Additionally, the tenant compaction task will walk the state of already offloaded timelines to consider optimizing their storage, e.g. if a timeline had some history when offloaded, but since then its PITR has elapsed and it can now be rewritten to image layers. #### Archive branch offload Recall that when we archive a timeline via the HTTP API, this only sets a state: it doesn't do any actual work. This work is done in the background compaction loop. It makes sense to tag this work on to the compaction loop, because it is spiritually aligned: offloading data for archived branches improves storage efficiency. The condition for offload is simple: - a `Timeline` object exists with state `Archived` - the timeline does not have any non-offloaded children. Regarding the condition that children must be offloaded, this will always be eventually true, because we enforce at the API level that children of archived timelines must themselves be archived, and all archived timelines will eventually be offloaded. Offloading a timeline is simple: - Read the timeline's attributes that we will store in its offloaded state (especially its logical size) - Call `shutdown()` on the timeline and remove it from the `Tenant` (as if we were about to delete it) - Erase all the timeline's content from local storage (`remove_dir_all` on its path) - Write the tenant manifest to S3 to prevent this timeline being loaded on next start. #### Archive branch optimization (flattening) When we offloaded a branch, it might have had some history that prevented rewriting it to a single point in time set of image layers. For example, a branch might have several days of writes and a 7 day PITR: when we archive it, it still has those days of history. 
Once the PITR has expired, we have an opportunity to reduce the physical footprint of the branch by: - Writing compressed image layers within the archived branch, as these are more efficient as a way of storing a point in time compared with delta layers - Updating the branch's offload metadata to indicate that this branch no longer depends on its ancestor for data, i.e. the ancestor is free to GC layer files at+below the branch point Fully compacting an archived branch into image layers at a single LSN may be thought of as *flattening* the branch, such that it is now a one-dimensional keyspace rather than a two-dimensional key/lsn space. It becomes a true snapshot at that LSN. It is not always more efficient to flatten a branch than to keep some extra history on the parent: this is described in more detail in [optimizations](#delaying-storage-optimization-if-retaining-parent-layers-is-cheaper) Archive branch optimization should be done _before_ background offloads during compaction, because there may be timelines which are ready to be offloaded but also would benefit from the optimization step before being offloaded. For example, a branch which has already fallen out of PITR window and has no history of its own may be immediately re-written as a series of image layers before being offloaded. ### Consumption metrics Archived timelines and offloaded timelines will be excluded from the synthetic size calculation, in anticipation that billing structures based on consumption metrics are highly likely to apply different $/GB rates to archived vs. ordinary content. Archived and offloaded timelines' logical size will be reported under the existing `timeline_logical_size` variant of `MetricsKey`: receivers are then free to bill on this metric as they please.
### Secondary locations Archived timelines (including offloaded timelines) will be excluded from heatmaps, and thereby when a timeline is archived, after the next cycle of heatmap upload & secondary download, its contents will be dropped from secondary locations. ### Sharding Archiving or activating a timeline will be done symmetrically across all shards in a tenant, in the same way that timeline creation and deletion is done. There are no special rules about ordering: the storage controller may dispatch concurrent calls to all shards when archiving or activating a timeline. Since consumption metrics are only transmitted from shard zero, the state of archival on this shard will be authoritative for consumption metrics. ## Error cases ### Errors in sharded tenants If one shard in a tenant fails an operation but others succeed, the tenant may end up in a mixed state, where a timeline is archived on some shards but not on others. We will not bother implementing a rollback mechanism for this: errors in archiving/activating a timeline are either transient (e.g. S3 unavailable, shutting down), or the fault of the caller (NotFound, BadRequest). In the transient case callers are expected to retry until success, or to make appropriate API calls to clear up their mistake. We rely on this good behavior of callers to eventually get timelines into a consistent state across all shards. If callers do leave a timeline in an inconsistent state across shards, this doesn't break anything, it's just "weird". This is similar to the status quo for timeline creation and deletion: callers are expected to retry these operations until they succeed. ### Archiving/activating Archiving/activating a timeline can fail in a limited number of ways: 1. I/O error storing/reading the timeline's updated index - These errors are always retryable: a fundamental design assumption of the pageserver is that remote storage errors are always transient. 2. 
NotFound if the timeline doesn't exist - Callers of the API are expected to avoid calling deletion and archival APIs concurrently. - The storage controller has runtime locking to prevent races such as deleting a timeline while archiving it. 3. BadRequest if the rules around ancestors/descendents of archived timelines would be violated - Callers are expected to do their own checks to avoid hitting this case. If they make a mistake and encounter this error, they should give up. ### Offloading Offloading can only fail if remote storage is unavailable, which would prevent us from writing the tenant manifest. In such error cases, we give up in the expectation that offloading will be tried again at the next iteration of the compaction loop. ### Archive branch optimization Optimization is a special form of compaction, so can encounter all the same errors as regular compaction can: it should return Result<(), CompactionError>, and as with compaction it will be retried on the next iteration of the compaction loop. ## Optimizations ### Delaying storage optimization if retaining parent layers is cheaper Optimizing archived branches to image layers and thereby enabling parent branch GC to progress is a safe default: archived branches cannot over-fill a pageserver's local disk, and once they are offloaded to S3 they're totally safe, inert things. However, in some cases it can be advantageous to retain extra history on their parent branch rather than flattening the archived branch. For example, if a 1TB parent branch is rather slow-changing (1GB of data per day), and archive branches are being created nightly, then writing out full 1TB image layers for each nightly branch is inefficient compared with just keeping more history on the main branch. 
Getting this right requires consideration of: - Compaction: if keeping more history on the main branch is going to prompt the main branch's compaction to write out extra image layers, then it might make more sense to just write out the image layers on the archived branch. - Metadata bloat: keeping extra history on a parent branch doesn't just cost GB of storage, it makes the layer map (and index_part) bigger. There are practical limits beyond which writing an indefinitely large layer map can cause problems elsewhere. This optimization can probably be implemented quite cheaply with some basic heuristics like: - don't bother doing optimization on an archive branch if the LSN distance between its branch point and the end of the PITR window is <5% of the logical size of the archive branch. - ...but don't keep more history on the main branch than double the PITR ### Creating a timeline in archived state (a snapshot) Sometimes, one might want to create a branch with no history, which will not be written to before it is archived. This is a snapshot, although we do not require a special snapshot API, since a snapshot can be represented as a timeline with no history. This can be accomplished by simply creating a timeline and then immediately archiving it, but that is somewhat wasteful: this timeline will spin up various tasks and open a connection to the storage broker to try and ingest WAL, before being shut down in the subsequent archival call. To explicitly support this common special case, we may add a parameter to the timeline creation API which creates a timeline directly into the archived state. Such a timeline creation will do exactly two I/Os at creation time: - write the index_part object to record the timeline's existence - when the timeline is offloaded in the next iteration of the compaction loop (~20s later), write the tenant manifest.
Later, when the timeline falls off the end of the PITR interval, the usual offload logic will wake up the 'snapshot' branch and write out image layers. ## Future Work ### Enabling `fullbackup` dumps from archive branches It would be useful to be able to export an archive branch to another system, or for use in a local postgres database. This could be implemented as a general capability for all branches, in which case it would "just work" for archive branches by activating them. However, downloading all the layers in a branch just to generate a fullbackup is a bit inefficient: we could implement a special case for flattened archived branches which streams image layers from S3 and outputs the fullbackup stream without writing the layers out to disk. Implementing `fullbackup` is a bit more complicated than this because of sharding, but solving that problem is unrelated to the topic of archived branches (it probably involves having each shard write out a fullbackup stream to S3 in an intermediate format and, then having one node stitch them together). ### Tagging layers from archived branches When we know a layer is an image layer written for an archived branch that has fallen off the PITR window, we may add tags to the S3 objects to enable writing lifecycle policies that transition such layers to even cheaper storage. This could be done for all archived layers, or it could be driven by the archival API, to give the pageserver external hints on which branches are likely to be reactivated, and which branches are good candidates for tagging for low performance storage. Tagging+lifecycles is just one mechanism: one might also directly use S3 storage classes. Other clouds' object stores have similar mechanisms. ### Storing sequences of archive branches as deltas When archived branches are used as scheduled snapshots, we could store them even more efficiently by encoding them as deltas relative to each other (i.e. 
for nightly snapshots, when we do the storage optimization for Tuesday's snapshot, we would read Monday's snapshot and store only the modified pages). This is the kind of encoding that many backup storage systems use. The utility of this depends a lot on the churn rate of the data, and the cost of doing the delta encoding vs. just writing out a simple stream of the entire database. For smaller databases, writing out a full copy is pretty trivial (e.g. writing a compressed copy of a 10GiB database to S3 can take under 10 seconds, so the complexity tradeoff of diff-encoding it is dubious). One does not necessarily have to read back the previous snapshot in order to encode the next one: if the pageserver knows about the schedule, it can intentionally retain extra history on the main branch so that we can say: "A branch exists from Monday night. I have Monday night's data still active in the main branch, so now I can read at the Monday LSN and the Tuesday LSN, calculate the delta, and store it as Tuesday's delta snapshot". Clearly this all requires careful housekeeping to retain the relationship between branches that depend on each other: perhaps this would be done by making the archive branches have child/parent relationships with each other, or perhaps we would permit them to remain children of their original parent, but additionally have a relationship with the snapshot they're encoded relative to. Activating a branch that is diff-encoded may require activating several earlier branches too, so figuring out how frequently to write a full copy is important. This is essentially a zoomed-out version of what we do with delta layers and image layers within a timeline, except each "layer" is a whole timeline. ## FAQ/Alternatives ### Store all timelines in the tenant manifest Rather than special-casing offloaded timelines in the offload manifest, we could store a total manifest of all timelines, eliminating the need for the pageserver to list timelines in S3 on startup.
That would be a more invasive change (requiring hooking into timeline creation), and would generate much more I/O to this manifest for tenants that had many branches _and_ frequent create/delete cycles for short lived branches. Restricting the manifest to offloaded timelines means that we only have to cope with the rate at which long-lived timelines are archived, rather than the rate at which short lived timelines are created & destroyed. ### Automatically archiving/activating timelines without external API calls We could implement TTL driven offload of timelines, waking them up when a page request arrives. This has downsides: - Opacity: if we do TTL-driven offload inside the pageserver, then the end user doesn't know which of their branches are in this state, and might get a surprise when they try to use such a branch. - Price fluctuation: if the archival of a branch is used in end user pricing, then users prefer clarity & consistency. Ideally a branch's storage should cost the same from the moment it is created, rather than having a usage-dependent storage price. - Complexity: enabling the page service to call up into the Tenant to activate a timeline would be awkward, compared with an external entry point. ### Make offloaded a state of Timeline To reduce the operator-facing complexity of having some timeline APIs that only return non-offloaded timelines, we could build the offloaded state into the Timeline type. `timeline.rs` is already one of the most egregiously long source files in the tree, so this is rejected on the basis that we need to avoid making that complexity worse. ================================================ FILE: docs/rfcs/036-physical-replication.md ================================================ # Physical Replication This RFC is a bit special in that we have already implemented physical replication a long time ago.
However, we never properly wrote down all the decisions and assumptions, and in the last months when more users have started to use the feature, numerous issues have surfaced. This RFC documents the design decisions that have been made. ## Summary PostgreSQL has a feature called streaming replication, where a replica streams WAL from the primary and continuously applies it. It is also known as "physical replication", to distinguish it from logical replication. In PostgreSQL, a replica is initialized by taking a physical backup of the primary. In Neon, the replica is initialized from a slim "base backup" from the pageserver, just like a primary, and the primary and the replicas connect to the same pageserver, sharing the storage. There are two kinds of read-only replicas in Neon: - replicas that follow the primary, and - "static" replicas that are pinned at a particular LSN. A static replica is useful e.g. for performing time-travel queries and running one-off slow queries without affecting the primary. A replica that follows the primary can be used e.g. to scale out read-only workloads. ## Motivation Read-only replicas allow offloading read-only queries. It's useful for isolation, if you want to make sure that read-only queries don't affect the primary, and it's also an easy way to provide guaranteed read-only access to an application, without having to mess with access controls. ## Non Goals (if relevant) This RFC is all about WAL-based *physical* replication. Logical replication is a different feature. Neon also has the capability to launch "static" read-only nodes which do not follow the primary, but are pinned to a particular LSN. They can be used for long-running one-off queries, or for Point-in-time queries. They work similarly to read replicas that follow the primary, but some things are simpler: there are no concerns about cache invalidation when the data changes on the primary, or worrying about transactions that are in-progress on the primary. 
## Impacted components (e.g. pageserver, safekeeper, console, etc) - Control plane launches the replica - Replica Postgres instance connects to the safekeepers, to stream the WAL - The primary does not know about the standby, except for the hot standby feedback - The primary and replicas all connect to the same pageservers # Context Some useful things to know about hot standby and replicas in PostgreSQL. ## PostgreSQL startup sequence "Running" and "start up" terms are a little imprecise. PostgreSQL replica startup goes through several stages: 1. First, the process is started up, and various initialization steps are performed, like initializing shared memory. If you try to connect to the server in this stage, you get an error: ERROR: the database system is starting up. This stage happens very quickly, no 2. Then the server reads the checkpoint record from the WAL and starts the WAL replay starting from the checkpoint. This works differently in Neon: we start the WAL replay at the basebackup LSN, not from a checkpoint! If you connect to the server in this state, you get an error: ERROR: the database system is not yet accepting connections. We proceed to the next stage, when the WAL replay sees a running-xacts record. Or in Neon, the "CLOG scanning" mechanism can allow us to move directly to next stage, with all the caveats listed in this RFC. 3. When the running-xacts information is established, the server starts to accept connections normally. From PostgreSQL's point of view, the server is already running in stage 2, even though it's not accepting connections yet. Our `compute_ctl` does not consider it as running until stage 3. If the transition from stage 2 to 3 doesn't happen fast enough, the control plane will mark the start operation as failed.
## Decisions, Issues ### Cache invalidation in replica When a read replica follows the primary in PostgreSQL, it needs to stream all the WAL from the primary and apply all the records, to keep the local copy of the data consistent with the primary. In Neon, the replica can fetch the updated page versions from the pageserver, so it's not necessary to apply all the WAL. However, it needs to ensure that any pages that are currently in the Postgres buffer cache, or the Local File Cache, are either updated, or thrown away so that the next read of the page will fetch the latest version. We choose to apply the WAL records for pages that are already in the buffer cache, and skip records for other pages. Somewhat arbitrarily, we also apply records affecting catalog relations, fetching the old page version from the pageserver if necessary first. See `neon_redo_read_buffer_filter()` function. The replica wouldn't necessarily need to see all the WAL records, only the records that apply to cached pages. For simplicity, we do stream all the WAL to the replica, and the replica simply ignores WAL records that require no action. Like in PostgreSQL, the read replica maintains a "replay LSN", which is the LSN up to which the replica has received and replayed the WAL. The replica can lag behind the primary, if it cannot quite keep up with the primary, or if a long-running query conflicts with changes that are about to be applied, or even intentionally if the user wishes to see delayed data (see recovery_min_apply_delay). It's important that the replica sees a consistent view of the whole cluster at the replay LSN, when it's lagging behind. In Neon, the replica connects to a safekeeper to get the WAL stream. That means that the safekeepers must be able to regurgitate the original WAL as far back as the replay LSN of any running read replica. (A static read-only node that does not follow the primary does not require a WAL stream however). 
The primary does not need to be running, and when it is, the replicas don't incur any extra overhead to the primary (see hot standby feedback though). ### In-progress transactions In PostgreSQL, when a hot standby server starts up, it cannot immediately open up for queries (see [PostgreSQL startup sequence]). It first needs to establish a complete list of in-progress transactions, including subtransactions, that are running at the primary, at the current replay LSN. Normally that happens quickly, when the replica sees a "running-xacts" WAL record, because the primary writes a running-xacts WAL record at every checkpoint, and in PostgreSQL the replica always starts the WAL replay from a checkpoint REDO point. (A shutdown checkpoint WAL record also implies that all the non-prepared transactions have ended.) If there are a lot of subtransactions in progress, however, the standby might need to wait for old transactions to complete before it can open up for queries. In Neon that problem is worse: a replica can start at any LSN, so there's no guarantee that it will see a running-xacts record any time soon. In particular, if the primary is not running when the replica is started, it might never see a running-xacts record. To make things worse, we initially missed this issue, and always started accepting queries at replica startup, even if it didn't have the transaction information. That could lead to incorrect query results and data corruption later. However, as we fixed that, we introduced a new problem compared to what we had before: previously the replica would always start up, but after fixing that bug, it might not. In a superficial way, the old behavior was better (but could lead to serious issues later!). That made fixing that bug was very hard, because as we fixed it, we made things (superficially) worse for others. 
See https://github.com/neondatabase/neon/pull/7288 which fixed the bug, and follow-up PRs https://github.com/neondatabase/neon/pull/8323 and https://github.com/neondatabase/neon/pull/8484 to try to claw back the cases that started to cause trouble as a result of fixing it. As of this writing, there are still cases where a replica might not immediately start up, causing the control plane operation to fail; the remaining issues are tracked in https://github.com/neondatabase/neon/issues/6211. One long-term fix for this is to switch to using so-called CSN snapshots in read replica. That would make it unnecessary to have the full in-progress transaction list in the replica at startup time. See https://commitfest.postgresql.org/48/4912/ for a work-in-progress patch to upstream to implement that. Another thing we could do is to teach the control plane about that distinction between "starting up" and "running but haven't received running-xacts information yet", so that we could keep the replica waiting longer in that stage, and also give any client connections the same `ERROR: the database system is not yet accepting connections` error that you get in standalone PostgreSQL in that state. ### Recovery conflicts and Hot standby feedback It's possible that a tuple version is vacuumed away in the primary, even though it is still needed by a running transaction in the replica. This is called a "recovery conflict", and PostgreSQL provides various options for dealing with it. By default, the WAL replay will wait up to 30 s for the conflicting query to finish. After that, it will kill the running query, so that the WAL replay can proceed. Another way to avoid the situation is to enable the [`hot_standby_feedback`](https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-HOT-STANDBY-FEEDBACK) option. When it is enabled, the primary will refrain from vacuuming tuples that are still needed in the replica. 
That means potentially bloating the primary, which violates the usual rule that read replicas don't affect the operations on the primary, which is why it's off by default. We leave it to users to decide if they want to turn it on, same as PostgreSQL. Neon supports `hot_standby_feedback` by passing the feedback messages from the replica to the safekeepers, and from safekeepers to the primary. ### Relationship of settings between primary and replica In order to enter hot standby mode, some configuration options need to be set to the same or larger values in the standby, compared to the primary. See [explanation in the PostgreSQL docs](https://www.postgresql.org/docs/current/hot-standby.html#HOT-STANDBY-ADMIN) In Neon, we have this problem too. To prevent customers from hitting it, the control plane automatically adjusts the settings of a replica, so that they match or exceed the primary's settings (see https://github.com/neondatabase/cloud/issues/14903). However, you can still hit the issue if the primary is restarted with larger settings, while the replica is running. ### Interaction with Pageserver GC The read replica can lag behind the primary. If there are recovery conflicts or the replica cannot keep up for some reason, the lag can in principle grow indefinitely. The replica will issue all GetPage requests to the pageservers at the current replay LSN, and needs to see the old page versions. If the retention period in the pageserver is set to be small, it may have already garbage collected away the old page versions. That will cause read errors in the compute, and can mean that the replica cannot make progress with the replication anymore. There is a mechanism for replica to pass information about its replay LSN to the pageserver, so that the pageserver refrains from GC'ing data that is still needed by the standby. It's called 'standby_horizon' in the pageserver code, see https://github.com/neondatabase/neon/pull/7368. 
A separate "lease" mechanism also is in the works, where the replica could hold a lease on the old LSN, preventing the pageserver from advancing the GC horizon past that point. The difference is that the standby_horizon mechanism relies on a feedback message from replica to safekeeper, while the lease API is exposed directly from the pageserver. A static read-only node is not connected to safekeepers, so it cannot use the standby_horizon mechanism. ### Synchronous replication We haven't put any effort into synchronous replication yet. PostgreSQL provides multiple levels of synchronicity. In the weaker levels, a transaction is not acknowledged as committed to the client in the primary until the WAL has been streamed to a replica or flushed to disk there. Those modes don't make sense in Neon, because the safekeepers handle durability. `synchronous_commit=remote_apply` mode would make sense. In that mode, the commit is not acknowledged to the client until it has been replayed in the replica. That ensures that after commit, you can see the commit in the replica too (aka. read-your-write consistency). ================================================ FILE: docs/rfcs/037-storage-controller-restarts.md ================================================ # Rolling Storage Controller Restarts ## Summary This RFC describes the issues around the current storage controller restart procedure and describes an implementation which reduces downtime to a few milliseconds on the happy path. ## Motivation Storage controller upgrades (restarts, more generally) can cause multi-second availability gaps. While the storage controller does not sit on the main data path, it's generally not acceptable to block management requests for extended periods of time (e.g. https://github.com/neondatabase/neon/issues/8034). 
### Current Implementation The storage controller runs in a Kubernetes Deployment configured for one replica and strategy set to [Recreate](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#recreate-deployment). In non-Kubernetes terms, during an upgrade, the currently running storage controller is stopped and, only after, a new instance is created. At start-up, the storage controller calls into all the pageservers it manages (retrieved from DB) to learn the latest locations of all tenant shards present on them. This is usually fast, but can push into tens of seconds under unfavourable circumstances: pageservers are heavily loaded or unavailable. ## Prior Art There are probably as many ways of handling restarts gracefully as there are distributed systems. Some examples include: * Active/Standby architectures: Two or more instances of the same service run, but traffic is only routed to one of them. For fail-over, traffic is routed to one of the standbys (which becomes active). * Consensus Algorithms (Raft, Paxos and friends): The part of consensus we care about here is leader election: peers communicate to each other and use a voting scheme that ensures the existence of a single leader (e.g. Raft epochs). ## Requirements * Reduce storage controller unavailability during upgrades to milliseconds * Minimize the interval in which it's possible for more than one storage controller to issue reconciles. * Have one uniform implementation for restarts and upgrades * Fit in with the current Kubernetes deployment scheme ## Non Goals * Implement our own consensus algorithm from scratch * Completely eliminate storage controller downtime. Instead we aim to reduce it to the point where it looks like a transient error to the control plane ## Impacted Components * storage controller * deployment orchestration (i.e. 
Ansible) * helm charts ## Terminology * Observed State: in-memory mapping between tenant shards and their current pageserver locations - currently built up at start-up by querying pageservers * Deployment: Kubernetes [primitive](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) that models a set of replicas ## Implementation ### High Level Flow At a very high level the proposed idea is to start a new storage controller instance while the previous one is still running and cut-over to it when it becomes ready. The new instance should coordinate with the existing one and transition responsibility gracefully. While the controller has built in safety against split-brain situations (via generation numbers), we'd like to avoid such scenarios since they can lead to availability issues for tenants that underwent changes while two controllers were operating at the same time and require operator intervention to remedy. ### Kubernetes Deployment Configuration On the Kubernetes configuration side, the proposal is to update the storage controller `Deployment` to use `spec.strategy.type = RollingUpdate`, `spec.strategy.rollingUpdate.maxSurge=1` and `spec.strategy.rollingUpdate.maxUnavailable=0`. Under the hood, Kubernetes creates a new replica set and adds one pod to it (`maxSurge=1`). The old replica set does not scale down until the new replica set has one replica in the ready state (`maxUnavailable=0`). The various possible failure scenarios are investigated in the [Handling Failures](#handling-failures) section. ### Storage Controller Start-Up This section describes the primitives required on the storage controller side and the flow of the happy path. #### Database Table For Leader Synchronization A new table should be added to the storage controller database for leader synchronization during startup. This table will always contain at most one row. 
The proposed name for the table is `leader` and the schema contains two elements: * `hostname`: represents the hostname for the current storage controller leader - should be addressable from other pods in the deployment * `start_timestamp`: holds the start timestamp for the current storage controller leader (UTC timezone) - only required for failure case handling: see [Previous Leader Crashes Before New Leader Readiness](#previous-leader-crashes-before-new-leader-readiness) Storage controllers will read the leader row at start-up and then update it to mark themselves as the leader at the end of the start-up sequence. We want compare-and-exchange semantics for the update: avoid the situation where two concurrent updates succeed and overwrite each other. The default Postgres isolation level is `READ COMMITTED`, which isn't strict enough here. This update transaction should use at least `REPEATABLE READ` isolation level in order to [prevent lost updates](https://www.interdb.jp/pg/pgsql05/08.html). Currently, the storage controller uses the stricter `SERIALIZABLE` isolation level for all transactions. This more than suits our needs here. ``` START TRANSACTION ISOLATION LEVEL REPEATABLE READ; UPDATE leader SET hostname=<new_hostname>, start_timestamp=<new_start_ts> WHERE hostname=<old_hostname> AND start_timestamp=<old_start_ts>; ``` If the transaction fails or if no rows have been updated, then the compare-and-exchange is regarded as a failure. #### Step Down API A new HTTP endpoint should be added to the storage controller: `POST /control/v1/step_down`. Upon receiving this request the leader cancels any pending reconciles and goes into a mode where it replies with 503 to all other APIs and does not issue any location configurations to its pageservers. The successful HTTP response will return a serialized snapshot of the observed state. 
If other step down requests come in after the initial one, the request is handled and the observed state is returned (required for failure scenario handling - see [Handling Failures](#handling-failures)). #### Graceful Restart Happy Path At start-up, the first thing the storage controller does is retrieve the sole row from the new `leader` table. If such an entry exists, send a `/step_down` PUT API call to the current leader. This should be retried a few times with a short backoff (see [1]). The aspiring leader loads the observed state into memory and the start-up sequence proceeds as usual, but *without* querying the pageservers in order to build up the observed state. Before doing any reconciliations or persistence change, update the `leader` database table as described in the [Database Table For Leader Synchronization](#database-table-for-leader-synchronization) section. If this step fails, the storage controller process exits. Note that no row will exist in the `leader` table for the first graceful restart. In that case, force update the `leader` table (without the WHERE clause) and proceed with the pre-existing start-up procedure (i.e. build observed state by querying pageservers). Summary of proposed new start-up sequence: 1. Call `/step_down` 2. Perform any pending database migrations 3. Load state from database 4. Load observed state returned in step (1) into memory 5. Do initial heartbeat round (may be moved after 5) 7. Mark self as leader by updating the database 8. Reschedule and reconcile everything Some things to note from the steps above: * The storage controller makes no changes to the cluster state before step (5) (i.e. no location config calls to the pageserver and no compute notifications) * Ask the current leader to step down before loading state from database so we don't get a lost update if the transactions overlap. * Before loading the observed state at step (4), cross-validate against the database. 
If validation fails, fall back to asking the pageservers about their current locations. * Database migrations should only run **after** the previous instance steps down (or the step down times out). [1] The API call might fail because there's no storage controller running (i.e. [restart](#storage-controller-crash-or-restart)), so we don't want to extend the unavailability period by much. We still want to retry since that's not the common case. ### Handling Failures #### Storage Controller Crash Or Restart The storage controller may crash or be restarted outside of roll-outs. When a new pod is created, its call to `/step_down` will fail since the previous leader is no longer reachable. In this case perform the pre-existing start-up procedure and update the leader table (with the WHERE clause). If the update fails, the storage controller exits and consistency is maintained. #### Previous Leader Crashes Before New Leader Readiness When the previous leader (P1) crashes before the new leader (P2) passes the readiness check, Kubernetes will reconcile the old replica set and create a new pod for it (P1'). The `/step_down` API call will fail for P1' (see [2]). Now we have two cases to consider: * P2 updates the `leader` table first: The database update from P1' will fail and P1' will exit, or be terminated by Kubernetes depending on timings. * P1' updates the `leader` table first: The `hostname` field of the `leader` row stays the same, but the `start_timestamp` field changes. The database update from P2 will fail (since `start_timestamp` does not match). P2 will exit and Kubernetes will create a new replacement pod for it (P2'). Now the entire dance starts again, but with P1' as the leader and P2' as the incumbent. [2] P1 and P1' may (more likely than not) be the same pod and have the same hostname. The implementation should avoid this self reference and fail the API call at the client if the persisted hostname matches the current one. 
#### Previous Leader Crashes After New Leader Readiness The deployment's replica sets already satisfy the deployment's replica count requirements and the Kubernetes deployment rollout will just clean up the dead pod. #### New Leader Crashes Before Passing Readiness Check The deployment controller scales up the new replica sets by creating a new pod. The entire procedure is repeated with the new pod. #### Network Partition Between New Pod and Previous Leader This feels very unlikely, but should be considered in any case. P2 (the new aspiring leader) fails the `/step_down` API call into P1 (the current leader). P2 proceeds with the pre-existing startup procedure and updates the `leader` table. Kubernetes will terminate P1, but there may be a brief period where both storage controllers can drive reconciles. ### Dealing With Split Brain Scenarios As we've seen in the previous section, we can end up with two storage controllers running at the same time. The split brain duration is not bounded since the Kubernetes controller might become partitioned from the pods (unlikely though). While these scenarios are not fatal, they can cause tenant unavailability, so we'd like to reduce the chances of this happening. The rest of this section sketches some safety measures. It's likely overkill to implement all of them however. ### Ensure Leadership Before Producing Side Effects The storage controller has two types of side effects: location config requests into pageservers and compute notifications into the control plane. Before issuing either, the storage controller could check that it is indeed still the leader by querying the database. Side effects might still be applied if they race with the database update, but the situation will eventually be detected. The storage controller process should terminate in these cases. ### Leadership Lease Up until now, the leadership defined by this RFC is static. 
In order to bound the length of the split brain scenario, we could require the leadership to be renewed periodically. Two new columns would be added to the `leader` table: 1. `last_renewed` - timestamp indicating when the lease was last renewed 2. `lease_duration` - duration indicating the amount of time after which the lease expires The leader periodically attempts to renew the lease by checking that it is in fact still the legitimate leader and updating `last_renewed` in the same transaction. If the update fails, the process exits. New storage controller instances wishing to become leaders must wait for the current lease to expire before acquiring leadership if they have not successfully received a response to the `/step_down` request. ### Notify Pageserver Of Storage Controller Term Each time that leadership changes, we can bump a `term` integer column in the `leader` table. This term uniquely identifies a leader. Location config requests and re-attach responses can include this term. On the pageserver side, keep the latest term in memory and refuse anything which contains a stale term (i.e. smaller than the current one). ### Observability * The storage controller should expose a metric which describes its state (`Active | WarmingUp | SteppedDown`). Per region alerts should be added on this metric which trigger when: + no storage controller has been in the `Active` state for an extended period of time + more than one storage controller is in the `Active` state * An alert that periodically verifies that the `leader` table is in sync with the metric above would be very useful. We'd have to expose the storage controller read only database to Grafana (perhaps it is already done). ## Alternatives ### Kubernetes Leases Kubernetes has a [lease primitive](https://kubernetes.io/docs/concepts/architecture/leases/) which can be used to implement leader election. Only one instance may hold a lease at any given time. 
This lease needs to be periodically renewed and has an expiration period. In our case, it would work something like this: * `/step_down` deletes the lease or stops it from renewing * lease acquisition becomes part of the start-up procedure The kubert crate implements a [lightweight lease API](https://docs.rs/kubert/latest/kubert/lease/struct.LeaseManager.html), but it's still not exactly trivial to implement. This approach has the benefit of baked in observability (`kubectl describe lease`), but: * We offload the responsibility to Kubernetes which makes it harder to debug when things go wrong. * More code surface than the simple "row in database" approach. Also, most of this code would be in a dependency not subject to code review, etc. * Hard to test. Our testing infra does not run the storage controller in Kubernetes and changing it to do so is not simple and complicates the test set-up. To my mind, the "row in database" approach is straightforward enough that we don't have to offload this to something external. ================================================ FILE: docs/rfcs/038-aux-file-v2.md ================================================ # AUX file v2 ## Summary This is a retrospective RFC describing a new storage strategy for AUX files. ## Motivation The original aux file storage strategy stores everything in a single `AUX_FILES_KEY`. Every time the compute node streams a `neon-file` record to the pageserver, it will update the aux file hash map, and then write the serialized hash map into the key. This creates serious space bloat. There was a fix to log delta records (i.e., update a key in the hash map) to the aux file key. In this way, the pageserver only stores the deltas at each of the LSNs. However, this improved v1 storage strategy still requires us to store everything in an aux file cache in memory, because we cannot fetch a single key (or file) from the compound `AUX_FILES_KEY`. 
### Prior art For storing a large number of small files, we can use a key-value store where the key is the filename and the value is the file content. ## Requirements - No space bloat, fixed space amplification. - No write bloat, fixed write amplification. ## Impacted Components pageserver ## Sparse Keyspace In pageserver, we had assumed the keyspaces are always contiguous. For example, if the keyspace 0x0000-0xFFFF exists in the pageserver, every single key in the key range would exist in the storage. Based on the prior assumption, there is code that traverses the keyspace by iterating every single key. ```rust loop { // do something key = key.next(); } ``` If a keyspace is very large, for example, containing `2^64` keys, this loop will take infinite time to run. Therefore, we introduce the concept of sparse keyspace in this RFC. For a sparse keyspace, not every key would exist in the key range. Developers should not attempt to iterate every single key in the keyspace. Instead, they should fetch all the layer files in the key range, and then do a merge of them. In aux file v2, we store aux files within the sparse keyspace of the prefix `AUX_KEY_PREFIX`. ## AUX v2 Keyspace and Key Mapping Pageserver uses fixed-size keys. The key is 128b. In order to store files of arbitrary filenames into the keyspace, we assign a predetermined prefix based on the directory storing the aux file, and use the FNV hash of the filename for the remaining bits of the key. The encoding scheme is defined in `encode_aux_file_key`. For example, `pg_logical/mappings/test1` will be encoded as: ``` 62 0000 01 01 7F8B83D94F7081693471ABF91C ^ aux prefix ^ assigned prefix of pg_logical/ ^ assigned prefix of mappings/ ^ 13B FNV hash of test1 ^ not used due to key representation ``` The prefixes of the directories should be assigned every time we add a new type of aux file into the storage within `aux_file.rs`. For all directories without an assigned prefix, it will be put into the `0xFFFF` keyspace. 
Note that inside pageserver, there are two representations of the keys: the 18B full key representation and the 16B compact key representation. For the 18B representation, some fields have restricted ranges of values. Therefore, the aux keys only use the 16B compact portion of the full key. It is possible that two files get mapped to the same key due to hash collision. Therefore, the value of each aux key is an array that contains all filenames and file content that should be stored in this key. We use `Value::Image` to store the aux keys. Therefore, page reconstruction works in the same way as before, and we do not need additional code to support reconstructing the value. We simply get the latest image from the storage. ## Inbound Logical Replication Key Mapping For inbound logical replication, Postgres needs the `replorigin_checkpoint` file to store the data. This file is not directly stored in the pageserver using the aux v2 mechanism. It is constructed while generating the basebackup by scanning the `REPL_ORIGIN_KEY_PREFIX` keyspace. ## Sparse Keyspace Read Path There are two places we need to read the aux files from the pageserver: * On the write path, when the compute node adds an aux file to the pageserver, we will retrieve the key from the storage, append the file to the hashed key, and write it back. The current `get` API already supports that. * We use the vectored get API to retrieve all aux files while generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API used to always attempt to retrieve every single key within the requested key range, and therefore, we modified it in a way that keys within `NON_INHERITED_SPARSE_RANGE` will not trigger missing key error. Furthermore, as aux file reads usually need all layer files intersecting with that key range within the branch and cover a big keyspace, it incurs large overhead for tracking keyspaces that have not been read. 
Therefore, for sparse keyspaces, we [do not track](https://github.com/neondatabase/neon/pull/9631) `unmapped_keyspace`. ## Compaction and Image Layer Generation With the addition of sparse keyspaces, we also modified the compaction code to accommodate the fact that sparse keyspaces do not have every single key stored in the storage. * L0 compaction: we modified the hole computation code so that it can handle sparse keyspaces when computing holes. * Image layer creation: instead of calling `key.next()` and getting/reconstructing images for every single key, we use the vectored get API to scan all keys in the keyspace at a given LSN. Image layers are only created if there are too many delta layers between the latest LSN and the last image layer we generated for sparse keyspaces. The created image layer always covers the full aux key range for now, and could be optimized later. ## Migration We decided not to make the new aux storage strategy (v2) compatible with the original one (v1). One feasible way of doing a seamless migration is to store new data in aux v2 while old data in aux v1, but this complicates file deletions. We want all users to start with a clean state with no aux files in the storage, and therefore, we need to do manual migrations for users using aux v1 by using the [migration script](https://github.com/neondatabase/aux_v2_migration). During the period of migration, we store the aux policy in the `index_part.json` file. When a tenant is attached with no policy set, the pageserver will scan the aux file keyspaces to identify the current aux policy being used (v1 or v2). If a timeline has aux v1 files stored, it will use aux file policy v1 unless we do a manual migration for them. Otherwise, the default aux file policy for new timelines is aux v2. Users enrolled in logical replication before we set aux v2 as default use aux v1 policy. 
Users who tried setting up inbound replication (which was not supported at that time) may also create some file entries in aux v1 store, even if they did not enroll in the logical replication testing program. The code for aux v2 migration is in https://github.com/neondatabase/aux_v2_migration. The toolkit scans all projects with logical replication enabled. For all these projects, it puts the computes into maintenance mode (suspends all of them), calls the migration API to switch the aux file policy on the pageserver (which drops all replication states), and restarts all the computes. ================================================ FILE: docs/rfcs/038-independent-compute-release.md ================================================ # Independent compute release Created at: 2024-08-30. Author: Alexey Kondratov (@ololobus) ## Summary This document proposes an approach to a fully independent compute release flow. It attempts to cover the following features: - Process is automated as much as possible to minimize human errors. - Compute<->storage protocol compatibility is ensured. - A transparent release history is available with an easy rollback strategy. - Although not in the scope of this document, there is a viable way to extend the proposed release flow to achieve the canary and/or blue-green deployment strategies. ## Motivation Previously, the compute release was tightly coupled to the storage release. This meant that once some storage nodes got restarted with a newer version, all new compute starts using these nodes automatically got a new version. Thus, two releases happen in parallel, which increases the blast radius and makes ownership fuzzy. Now, we practice a manual v0 independent compute release flow -- after getting a new compute release image and tag, we pin it region by region using Admin UI. It's better, but it still has its own flaws: 1. It's a simple but fairly manual process, as you need to click through a few pages. 2. 
It's prone to human errors, e.g., you could mistype or copy the wrong compute tag. 3. We now require an additional approval in the Admin UI, which partially solves the 2., but also makes the whole process pretty annoying, as you constantly need to go back and forth between two people. ## Non-goals It's not the goal of this document to propose a design for some general-purpose release tool like Helm. The document considers how the current compute fleet is orchestrated at Neon. Even if we later decide to split the control plane further (e.g., introduce a separate compute controller), the proposed release process shouldn't change much, i.e., the releases table and API will reside in one of the parts. Achieving the canary and/or blue-green deploy strategies is out of the scope of this document. They were kept in mind, though, so it's expected that the proposed approach will lay down the foundation for implementing them in future iterations. ## Impacted components Compute, control plane, CI, observability (some Grafana dashboards may require changes). ## Prior art One of the very close examples is how Helm tracks [releases history](https://helm.sh/docs/helm/helm_history/). In the code: - [Release](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/release.go#L20-L43) - [Release info](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/info.go#L24-L40) - [Release status](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/status.go#L18-L42) TL;DR it has several important attributes: - Revision -- unique release ID/primary key. It is not the same as the application version, because the same version can be deployed several times, e.g., after a newer version rollback. - App version -- version of the application chart/code. - Config -- set of overrides to the default config of the application. - Status -- current status of the release in the history. 
- Timestamps -- tracks when a release was created and deployed. ## Proposed implementation ### Separate release branch We will use a separate release branch, `release-compute`, to have a clean history for releases and commits. In order to avoid confusion with storage releases, we will use a different prefix for compute [git release tags](https://github.com/neondatabase/neon/releases) -- `release-compute-XXXX`. We will use the same tag for Docker images as well. The `neondatabase/compute-node-v16:release-compute-XXXX` looks longer and a bit redundant, but it's better to have image and git tags in sync. Currently, control plane relies on the numeric compute and storage release versions to decide on compute->storage compatibility. Once we implement this proposal, we should drop this code as release numbers will be completely independent. The only constraint we want is that it must monotonically increase within the same release branch. ### Compute config/settings manifest We will create a new sub-directory `compute` and file `compute/manifest.yaml` with a structure: ```yaml pg_settings: # Common settings for primaries and secondaries of all versions. common: wal_log_hints: "off" max_wal_size: "1024" per_version: 14: # Common settings for both replica and primary of version PG 14 common: shared_preload_libraries: "neon,pg_stat_statements,extension_x" 15: common: shared_preload_libraries: "neon,pg_stat_statements,extension_x" # Settings that should be applied only to replica: # Available only starting Postgres 15th recovery_prefetch: "off" # ... 17: common: # For example, if third-party `extension_x` is not yet available for PG 17 shared_preload_libraries: "neon,pg_stat_statements" replica: recovery_prefetch: "off" ``` **N.B.** Setting value should be a string with `on|off` for booleans and a number (as a string) without units for all numeric settings. That's how the control plane currently operates. 
The priority of settings will be (a higher number is a higher priority): 1. Any static and hard-coded settings in the control plane 2. `pg_settings->common` 3. Per-version `common` 4. Per-version `replica` 5. Any per-user/project/endpoint overrides in the control plane 6. Any dynamic setting calculated based on the compute size **N.B.** For simplicity, we do not do any custom logic for `shared_preload_libraries`, so it's completely overridden if specified on some level. Make sure that you include all necessary extensions in it when you do any overrides. **N.B.** There is a tricky question about what to do with custom compute image pinning we sometimes do for particular projects and customers. That's usually some ad-hoc work and images are based on the latest compute image, so it's relatively safe to assume that we could use settings from the latest compute release. If for some reason that's not true, and further overrides are needed, it's also possible to do on the project level together with pinning the image, so it's on-call/engineer/support responsibility to ensure that compute starts with the specified custom image. The only real risk is that compute image will get stale and settings from new releases will drift away, so eventually it will get something incompatible, but i) this is some operational issue, as we do not want stale images anyway, and ii) base settings receive something really new so rarely that the chance of this happening is very low. If we want to solve it completely, then together with pinning the image we could also pin the matching release revision in the control plane. The compute team will own the content of `compute/manifest.yaml`. ### Control plane: releases table In order to store information about releases, the control plane will use a table `compute_releases` with the following schema: ```sql CREATE TABLE compute_releases ( -- Unique release ID -- N.B. 
Revision won't be synchronized across all regions, because all control planes are technically independent -- services. We have the same situation with Helm releases as well because they could be deployed and rolled back -- independently in different clusters. revision BIGSERIAL PRIMARY KEY, -- Numeric version of the compute image, e.g. 9057 version BIGINT NOT NULL, -- Compute image tag, e.g. `release-9057` tag TEXT NOT NULL, -- Current release status. Currently, it will be a simple enum -- * `deployed` -- release is deployed and used for new compute starts. -- Exactly one release can have this status at a time. -- * `superseded` -- release has been replaced by a newer one. -- But we can always extend it in the future when we need more statuses -- for more complex deployment strategies. status TEXT NOT NULL, -- Any additional metadata for compute in the corresponding release manifest JSONB NOT NULL, -- Timestamp when release record was created in the control plane database created_at TIMESTAMP NOT NULL DEFAULT now(), -- Timestamp when release deployment was finished deployed_at TIMESTAMP ); ``` We keep track of the old releases not only for the sake of audit, but also because we usually have ~30% of old computes started using the image from one of the previous releases. Yet, when users want to reconfigure them without restarting, the control plane needs to know what settings are applicable to them, so we also need information about the previous releases that are readily available. There could be some other auxiliary info needed as well: supported extensions, compute flags, etc. **N.B.** Here, we can end up in an ambiguous situation when the same compute image is deployed twice, e.g., it was deployed once, then rolled back, and then deployed again, potentially with a different manifest. Yet, we could've started some computes with the first deployment and some with the second.
Thus, when we need to look up the manifest for the compute by its image tag, we will see two records in the table with the same tag, but different revision numbers. We can assume that this could happen only in case of rollbacks, so we can just take the latest revision for the given tag. ### Control plane: management API The control plane will implement new API methods to manage releases: 1. `POST /management/api/v2/compute_releases` to create a new release. With payload ```json { "version": 9057, "tag": "release-9057", "manifest": {} } ``` and response ```json { "revision": 53, "version": 9057, "tag": "release-9057", "status": "deployed", "manifest": {}, "created_at": "2024-08-15T15:52:01.0000Z", "deployed_at": "2024-08-15T15:52:01.0000Z", } ``` Here, we can actually mix-in custom (remote) extensions metadata into the `manifest`, so that the control plane will get information about all available extensions not bundled into compute image. The corresponding workflow in `neondatabase/build-custom-extensions` should produce it as an artifact and make it accessible to the workflow in the `neondatabase/infra`. See the complete release flow below. Doing that, we put a constraint that new custom extension requires new compute release, which is good for the safety, but is not exactly what we want operational-wise (we want to be able to deploy new extensions without new images). Yet, it can be solved incrementally: v0 -- do not do anything with extensions at all; v1 -- put them into the same manifest; v2 -- make them separate entities with their own lifecycle. **N.B.** This method is intended to be used in CI workflows, and CI/network can be flaky. It's reasonable to assume that we could retry the request several times, even though it's already succeeded. Although it's not a big deal to create several identical releases one-by-one, it's better to avoid it, so the control plane should check if the latest release is identical and just return `304 Not Modified` in this case. 2. 
`POST /management/api/v2/compute_releases/rollback` to rollback to any previously deployed release. With payload including the revision of the release to rollback to: ```json { "revision": 52 } ``` Rollback marks the current release as `superseded` and creates a new release with all the same data as the requested revision, but with a new revision number. This rollback API is not strictly needed, as we can just use `infra` repo workflow to deploy any available tag. It's still nice to have for on-call and any urgent matters, for example, if we need to rollback and GitHub is down. It's much easier to specify only the revision number vs. crafting all the necessary data for the new release payload. ### Compute->storage compatibility tests In order to safely release new compute versions independently from storage, we need to ensure that the currently deployed storage is compatible with the new compute version. Currently, we maintain backward compatibility in storage, but newer computes may require a newer storage version. Remote end-to-end (e2e) tests [already accept](https://github.com/neondatabase/cloud/blob/e3468d433e0d73d02b7d7e738d027f509b522408/.github/workflows/testing.yml#L43-L48) `storage_image_tag` and `compute_image_tag` as separate inputs. That means that we could reuse e2e tests to ensure compatibility between storage and compute: 1. Pick the latest storage release tag and use it as `storage_image_tag`. 2. Pick a new compute tag built in the current compute release PR and use it as `compute_image_tag`. Here, we should use a temporary ECR image tag, because the final tag will be known only after the release PR is merged. 3. Trigger e2e tests as usual. 
### Release flow ```mermaid sequenceDiagram actor oncall as Compute on-call person participant neon as neondatabase/neon box private participant cloud as neondatabase/cloud participant exts as neondatabase/build-custom-extensions participant infra as neondatabase/infra end box cloud participant preprod as Pre-prod control plane participant prod as Production control plane participant k8s as Compute k8s end oncall ->> neon: Open release PR into release-compute activate neon neon ->> cloud: CI: trigger e2e compatibility tests activate cloud cloud -->> neon: CI: e2e tests pass deactivate cloud neon ->> neon: CI: pass PR checks, get approvals deactivate neon oncall ->> neon: Merge release PR into release-compute activate neon neon ->> neon: CI: pass checks, build and push images neon ->> exts: CI: trigger extensions build activate exts exts -->> neon: CI: extensions are ready deactivate exts neon ->> neon: CI: create release tag neon ->> infra: Trigger release workflow using the produced tag deactivate neon activate infra infra ->> infra: CI: pass checks infra ->> preprod: Release new compute image to pre-prod automatically
POST /management/api/v2/compute_releases activate preprod preprod -->> infra: 200 OK deactivate preprod infra ->> infra: CI: wait for per-region production deploy approvals oncall ->> infra: CI: approve deploys region by region infra ->> k8s: Prewarm new compute image infra ->> prod: POST /management/api/v2/compute_releases activate prod prod -->> infra: 200 OK deactivate prod deactivate infra ``` ## Further work As briefly mentioned in other sections, eventually, we would like to use more complex deployment strategies. For example, we can pass a fraction of the total compute starts that should use the new release. Then we can mark the release as `partial` or `canary` and monitor its performance. If everything is fine, we can promote it to `deployed` status. If not, we can roll back to the previous one. ## Alternatives In theory, we can try using Helm as-is: 1. Write a compute Helm chart. That will actually have only some config map, which the control plane can access and read. N.B. We could reuse the control plane chart as well, but then it's not a fully independent release again and even more fuzzy. 2. The control plane will read it and start using the new compute version for new starts. Drawbacks: 1. Helm releases work best if the workload is controlled by the Helm chart itself. Then you can have different deployment strategies like rolling update or canary or blue/green deployments. At Neon, the compute starts are controlled by control plane, so it makes it much more tricky. 2. Releases visibility will suffer, i.e. instead of a nice table in the control plane and Admin UI, we would need to use `helm` cli and/or K8s UIs like K8sLens. 3. We do not restart all computes shortly after the new version release. This means that for some features and compatibility purpose (see above) control plane may need some auxiliary info from the previous releases. 
================================================ FILE: docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md ================================================ # Memo: Endpoint Persistent Unlogged Files Storage Created on 2024-11-05 Implemented on N/A ## Summary A design for a storage system that allows storage of files required to make Neon's Endpoints have a better experience at or after a reboot. ## Motivation Several systems inside PostgreSQL (and Neon) need some persistent storage for optimal workings across reboots and restarts, but still work without. Examples are the query-level statistics files of `pg_stat_statements` in `pg_stat/pg_stat_statements.stat`, and `pg_prewarm`'s `autoprewarm.blocks`. We need a storage system that can store and manage these files for each Endpoint, without necessarily granting users access to an unlimited storage device. ## Goals - Store known files for Endpoints with reasonable persistence. _Data loss in this service, while annoying and bad for UX, won't lose any customer's data._ ## Non Goals (if relevant) - This storage system does not need branching, file versioning, or other such features. The files are as ephemeral to the timeline of the data as the Endpoints that host the data. - This storage system does not need to store _all_ user files, only 'known' user files. - This storage system does not need to be hosted fully inside Computes. _Instead, this will be a separate component similar to Pageserver, SafeKeeper, the S3 proxy used for dynamically loaded extensions, etc._ ## Impacted components - Compute needs new code to load and store these files in its lifetime. - Control Plane needs to consider this new storage system when signalling the deletion of an Endpoint, Timeline, or Tenant. - Control Plane needs to consider this new storage system when it resets or re-assigns an endpoint's timeline/branch state. A new service is created: the Endpoint Persistent Unlogged Files Storage service. 
This could be integrated in e.g. Pageserver or Control Plane, or a separately hosted service. ## Proposed implementation Endpoint-related data files are managed by a newly designed service (which optionally is integrated in an existing service like Pageserver or Control Plane), which stores data directly into S3 or any blob storage of choice. Upon deletion of the Endpoint, or reassignment of the endpoint to a different branch, this ephemeral data is dropped: the data stored may not match the state of the branch's data after reassignment, and on endpoint deletion the data won't have any use to the user. Compute gets credentials (JWT token with Tenant, Timeline & Endpoint claims) which it can use to authenticate to this new service and retrieve and store data associated with this endpoint. This limited scope reduces leaks of data across endpoints and timeline resets, and limits the ability of endpoints to mess with other endpoints' data. The path of this endpoint data in S3 is initially as follows: s3://<regional-epufs-bucket>/ tenants/ <tenant-id>/ timelines/ <timeline-id>/ endpoints/ <endpoint-id>/ pgdata/ <file-path-in-pgdata> For other blob storages an equivalent or similar path can be constructed. ### Reliability, failure modes and corner cases (if relevant) Reliability is important, but not critical to the workings of Neon. The data stored in this service will, when lost, reduce performance, but won't be a cause of permanent data loss - only operational metadata is stored. Most, if not all, blob storage services have sufficiently high persistence guarantees to cater to our need for persistence and uptime. The only concern with blob storages is that the access latency is generally higher than local disk, but for the object types stored (cache state, ...) I don't think this will be much of an issue. ### Interaction/Sequence diagram (if relevant) In these diagrams you can replace S3 with any persistent storage device of choice, but S3 is chosen as representative name: The well-known and short name of AWS' blob storage.
Azure Blob Storage should work too, but it has a much longer name making it less practical for the diagrams. Write data: ```http POST /tenants//timelines//endpoints//pgdata/ Host: epufs.svc.neon.local <<< 200 OK { "version": "", # opaque file version token, changes when the file contents change "size": , } ``` ```mermaid sequenceDiagram autonumber participant co as Compute participant ep as EPUFS participant s3 as Blob Storage co-->ep: Connect with credentials co->>+ep: Store Unlogged Persistent File opt is authenticated ep->>s3: Write UPF to S3 end ep->>-co: OK / Failure / Auth Failure co-->ep: Cancel connection ``` Read data: (optional with cache-relevant request parameters, e.g. If-Modified-Since) ```http GET /tenants//timelines//endpoints//pgdata/ Host: epufs.svc.neon.local <<< 200 OK ``` ```mermaid sequenceDiagram autonumber participant co as Compute participant ep as EPUFS participant s3 as Blob Storage co->>+ep: Read Unlogged Persistent File opt is authenticated ep->>+s3: Request UPF from storage s3->>-ep: Receive UPF from storage end ep->>-co: OK(response) / Failure(storage, auth, ...) 
``` Compute Startup: ```mermaid sequenceDiagram autonumber participant co as Compute participant ps as Pageserver participant ep as EPUFS participant es as Extension server note over co: Bind endpoint ep-xxx par Get basebackup co->>+ps: Request basebackup @ LSN ps-)ps: Construct basebackup ps->>-co: Receive basebackup TAR @ LSN and Get startup-critical Unlogged Persistent Files co->>+ep: Get all UPFs of endpoint ep-xxx ep-)ep: Retrieve and gather all UPFs ep->>-co: TAR of UPFs and Get startup-critical extensions loop For every startup-critical extension co->>es: Get critical extension es->>co: Receive critical extension end end note over co: Start compute ``` CPlane ops: ```http DELETE /tenants//timelines//endpoints/ Host: epufs.svc.neon.local <<< 200 OK { "tenant": "", "timeline": "", "endpoint": "", "deleted": { "files": , "bytes": , }, } ``` ```http DELETE /tenants//timelines/ Host: epufs.svc.neon.local <<< 200 OK { "tenant": "", "timeline": "", "deleted": { "files": , "bytes": , }, } ``` ```http DELETE /tenants/ Host: epufs.svc.neon.local <<< 200 OK { "tenant": "", "deleted": { "files": , "bytes": , }, } ``` ```mermaid sequenceDiagram autonumber participant cp as Control Plane participant ep as EPUFS participant s3 as Blob Storage alt Tenant deleted cp-)ep: Tenant deleted loop For every object associated with removed tenant ep->>s3: Remove data of deleted tenant from Storage end opt ep-)cp: Tenant cleanup complete end alt Timeline deleted cp-)ep: Timeline deleted loop For every object associated with removed timeline ep->>s3: Remove data of deleted timeline from Storage end opt ep-)cp: Timeline cleanup complete end else Endpoint reassigned or removed cp->>+ep: Endpoint reassigned loop For every object associated with reassigned/removed endpoint ep->>s3: Remove data from Storage end ep->>-cp: Cleanup complete end ``` ### Scalability (if relevant) Provisionally: As this service is going to be part of compute startup, this service should be able to quickly respond 
to all requests. Therefore this service is deployed to every AZ we host Computes in, and Computes communicate (generally) only to the EPUFS endpoint of the AZ they're hosted in. Local caching of frequently restarted endpoints' data or metadata may be needed for best performance. However, due to the regional nature of stored data but zonal nature of the service deployment, we should be careful when we implement any local caching, as it is possible that computes in AZ 1 will update data originally written and thus cached by AZ 2. Cache version tests and invalidation is therefore required if we want to roll out caching to this service, which is too broad a scope for an MVC. This is why caching is left out of scope for this RFC, and should be considered separately after this RFC is implemented. ### Security implications (if relevant) This service must be able to authenticate users at least by Tenant ID, Timeline ID and Endpoint ID. This will use the existing JWT infrastructure of Compute, which will be upgraded to the extent needed to support Timeline- and Endpoint-based claims. The service requires unlimited access to (a prefix of) a blob storage bucket, and thus must be hosted outside the Compute VM sandbox. A service that generates pre-signed request URLs for Compute to download the data from that URL is likely problematic, too: Compute would be able to write unlimited data to the bucket, or exfiltrate this signed URL to get read/write access to specific objects in this bucket, which would still effectively give users access to the S3 bucket (but with improved access logging). There may be a use case for transferring data associated with one endpoint to another endpoint (e.g. to make one endpoint warm its caches with the state of another endpoint), but that's not currently in scope, and specific needs may be solved through out-of-line communication of data or pre-signed URLs. 
### Unresolved questions (if relevant) Caching of files is not in the implementation scope of the document, but should at some future point be considered to maximize performance. ## Alternative implementation (if relevant) Several ideas have come up to solve this issue: ### Use AUXfile One prevalent idea was to WAL-log the files using our AUXfile mechanism. Benefits: + We already have this storage mechanism Demerits: - It isn't available on read replicas - Additional WAL will be consumed during shutdown and after the shutdown checkpoint, which needs PG modifications to work without panics. - It increases the data we need to manage in our versioned storage, thus causing higher storage costs with higher retention due to duplication at the storage layer. ### Sign URLs for read/write operations, instead of proxying them Benefits: + The service can be implemented with a much reduced IO budget Demerits: - Users could get access to these signed credentials - Not all blob storage services may implement URL signing ### Give endpoints each their own directly accessed block volume Benefits: + Easier to integrate for PostgreSQL Demerits: - Little control on data size and contents - Potentially problematic as we'd need to store data all across the pgdata directory. - EBS is not a good candidate - Attaches in 10s of seconds, if not more; i.e. too cold to start - Shared EBS volumes are a no-go, as you'd have to schedule the endpoint with users of the same EBS volumes, which can't work with VM migration - EBS storage costs are very high (>80$/kilotenant when using a volume/tenant) - EBS volumes can't be mounted across AZ boundaries - Bucket per endpoint is unfeasible - S3 buckets are priced at $20/month per 1k, which we could better spend on developers. - Allocating service accounts takes time (100s of ms), and service accounts are a limited resource, too; so they're not a good candidate to allocate on a per-endpoint basis. 
- Giving credentials limited to prefix has similar issues as the pre-signed URL approach. - Bucket DNS lookup will fill DNS caches and put pressure on DNS lookup much more than our current systems would. - Volumes bound by hypervisor are unlikely - This requires significant investment and increased software on the hypervisor. - It is unclear if we can attach volumes after boot, i.e. for pooled instances. ### Put the files into a table Benefits: + Mostly already available in PostgreSQL Demerits: - Uses WAL - Can't be used after shutdown checkpoint - Needs a RW endpoint, and table & catalog access to write to this data - Gets hit with DB size limitations - Depending on user access: - Inaccessible: The user doesn't have control over database size caused by these systems. - Accessible: The user can corrupt these files and cause the system to crash while user-corrupted files are present, thus increasing on-call overhead. ## Definition of Done (if relevant) This project is done if we have: - One S3 bucket equivalent per region, which stores this per-endpoint data. - A new service endpoint in at least every AZ, which indirectly grants endpoints access to the data stored for these endpoints in these buckets. - Compute writes & reads temp-data at shutdown and startup, respectively, for at least the pg_prewarm or lfc_prewarm state files. - Cleanup of endpoint data is triggered when the endpoint is deleted or is detached from its current timeline. ================================================ FILE: docs/rfcs/040-profiling.md ================================================ # CPU and Memory Profiling Created 2025-01-12 by Erik Grinaker. See also [internal user guide](https://www.notion.so/neondatabase/Storage-CPU-Memory-Profiling-14bf189e004780228ec7d04442742324?pvs=4). ## Summary This document proposes a standard cross-team pattern for CPU and memory profiling across applications and languages, using the [pprof](https://github.com/google/pprof) profile format.
It enables both ad hoc profiles via HTTP endpoints, and continuous profiling across the fleet via [Grafana Cloud Profiles](https://grafana.com/docs/grafana-cloud/monitor-applications/profiles/). Continuous profiling incurs an overhead of about 0.1% CPU usage and 3% slower heap allocations. ## Motivation CPU and memory profiles are crucial observability tools for understanding performance issues, resource exhaustion, and resource costs. They allow answering questions like: * Why is this process using 100% CPU? * How do I make this go faster? * Why did this process run out of memory? * Why are we paying for all these CPU cores and memory chips? Go has [first-class support](https://pkg.go.dev/net/http/pprof) for profiling included in its standard library, using the [pprof](https://github.com/google/pprof) profile format and associated tooling. This is not the case for Rust and C, where obtaining profiles can be rather cumbersome. It requires installing and running additional tools like `perf` as root on production nodes, with analysis tools that can be hard to use and often don't give good results. This is not only annoying, but can also significantly affect the resolution time of production incidents. This proposal will: * Provide CPU and heap profiles in pprof format via HTTP API. * Record continuous profiles in Grafana for aggregate historical analysis. * Make it easy for anyone to see a flamegraph in less than one minute. * Be reasonably consistent across teams and services (Rust, Go, C). ## Non Goals (For Now) * [Additional profile types](https://grafana.com/docs/pyroscope/next/configure-client/profile-types/) like mutexes, locks, goroutines, etc. * [Runtime trace integration](https://grafana.com/docs/pyroscope/next/configure-client/trace-span-profiles/). * [Profile-guided optimization](https://en.wikipedia.org/wiki/Profile-guided_optimization). ## Using Profiles Ready-to-use profiles can be obtained using e.g. `curl`. 
For Rust services: ``` $ curl localhost:9898/profile/cpu >profile.pb.gz ``` pprof profiles can be explored using the [`pprof`](https://github.com/google/pprof) web UI, which provides flamegraphs, call graphs, plain text listings, and more: ``` $ pprof -http :6060 ``` Some endpoints (e.g. Rust-based ones) can also generate flamegraph SVGs directly: ``` $ curl localhost:9898/profile/cpu?format=svg >profile.svg $ open profile.svg ``` Continuous profiles are available in Grafana under Explore → Profiles → Explore Profiles (currently only in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer)). ## API Requirements * HTTP endpoints that return a profile in pprof format (with symbols). * CPU: records a profile over the request time interval (`seconds` query parameter). * Memory: returns the current in-use heap allocations. * Unauthenticated, as it should not expose user data or pose a denial-of-service risk. * Default sample frequency should not impact service (maximum 5% CPU overhead). * Linux-compatibility. Nice to have: * Return flamegraph SVG directly from the HTTP endpoint if requested. * Configurable sample frequency for CPU profiles. * Historical heap allocations, by count and bytes. * macOS-compatibility. ## Rust Profiling [`libs/utils/src/http/endpoint.rs`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs) contains ready-to-use HTTP endpoints for CPU and memory profiling: [`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338) and [`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416).
### CPU CPU profiles are provided by [pprof-rs](https://github.com/tikv/pprof-rs) via [`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338). Expose it unauthenticated at `/profile/cpu`. Parameters: * `format`: profile output format (`pprof` or `svg`; default `pprof`). * `seconds`: duration to collect profile over, in seconds (default `5`). * `frequency`: how often to sample thread stacks, in Hz (default `99`). * `force`: if `true`, cancel a running profile and start a new one (default `false`). Works on Linux and macOS. ### Memory Use the jemalloc allocator via [`tikv-jemallocator`](https://github.com/tikv/jemallocator), and enable profiling with samples every 2 MB allocated: ```rust #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[allow(non_upper_case_globals)] #[export_name = "malloc_conf"] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; ``` pprof profiles are generated by [`jemalloc-pprof`](https://github.com/polarsignals/rust-jemalloc-pprof) via [`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416). Expose it unauthenticated at `/profile/heap`. Parameters: * `format`: profile output format (`pprof`, `svg`, or `jemalloc`; default `pprof`). Works on Linux only, due to [jemalloc limitations](https://github.com/jemalloc/jemalloc/issues/26). ## Go Profiling The Go standard library includes pprof profiling via HTTP API in [`net/http/pprof`](https://pkg.go.dev/net/http/pprof). Expose it unauthenticated at `/debug/pprof`. Works on Linux and macOS. ### CPU Via `/debug/pprof/profile`. Parameters: * `debug`: profile output format (`0` is pprof, `1` or above is plaintext; default `0`). * `seconds`: duration to collect profile over, in seconds (default `30`). 
Does not support a frequency parameter (see [#57488](https://github.com/golang/go/issues/57488)), and defaults to 100 Hz. A lower frequency can be hardcoded via `SetCPUProfileRate`, but the default is likely ok (estimated 1% overhead). ### Memory Via `/debug/pprof/heap`. Parameters: * `seconds`: take a delta profile over the given duration, in seconds (default `0`). * `gc`: if `1`, garbage collect before taking profile. ## C Profiling [gperftools](https://github.com/gperftools/gperftools) provides in-process CPU and heap profiling with pprof output. However, continuous profiling of PostgreSQL is expensive (many computes), and has limited value since we don't own the internals anyway. Ad hoc profiling might still be useful, but the compute team considers existing tooling sufficient, so this is not a priority at the moment. ## Grafana Continuous Profiling [Grafana Alloy](https://grafana.com/docs/alloy/latest/) continually scrapes CPU and memory profiles across the fleet, and archives them as time series. This can be used to analyze resource usage over time, either in aggregate or zoomed in to specific events and nodes. Profiles are retained for 30 days. Profile ingestion volume for CPU+heap at 60-second intervals is about 0.5 GB/node/day, or about $0.25/node/day = $7.5/node/month ($0.50/GB). It is currently enabled in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer) for Pageserver and Safekeeper. ### Scraping * CPU profiling: 59 seconds at 19 Hz every 60 seconds. * Heap profiling: heap snapshot with 2 MB frequency every 60 seconds. There are two main approaches that can be taken for CPU profiles: * Continuous low-frequency profiles (e.g. 19 Hz for 60 seconds every 60 seconds). * Occasional high-frequency profiles (e.g. 99 Hz for 5 seconds every 60 seconds). We choose continuous low-frequency profiles where possible. This has a fixed low overhead, instead of a spiky high overhead. 
It likely also gives a more representative view of resource usage. However, a 19 Hz rate gives a minimum resolution of 52.6 ms per sample, which may be larger than the actual runtime of small functions. Note that Go does not support a frequency parameter, so we must use a fixed frequency for all profiles via `SetCPUProfileRate()` (default 100 Hz). Only one CPU profile can be taken at a time. With continuous profiling, one will always be running. To allow also taking an ad hoc CPU profile, the Rust endpoint supports a `force` query parameter to cancel a running profile and start a new one. ### Overhead With Rust: * CPU profiles at 19 Hz frequency: 0.1% overhead. * Heap profiles at 2 MB frequency: 3% allocation overhead. * Profile call/encoding/symbolization: 20 ms every 60 seconds, or 0.03% of 1 CPU (for Pageserver). * Profile symbolization caches: 125 MB memory, or 0.4% of 32 GB (for Pageserver). Benchmarks with pprof-rs showed that the CPU time for taking a stack trace of a 40-frame stack was 11 µs using the `frame-pointer` feature, and 1.4 µs using `libunwind` with DWARF. `libunwind` saw frequent seg faults, so we use `frame-pointer` and build binaries with frame pointers (negligible overhead). CPU profiles work by installing an `ITIMER_PROF` for the process, which triggers a `SIGPROF` signal after a given amount of cumulative CPU time across all CPUs. The signal handler will run for one of the currently executing threads and take a stack trace. Thus, a 19 Hz profile will take 1 stack trace every 52.6 ms CPU time -- assuming 11 µs for a stack trace, this is 0.02% overhead, but likely 0.1% in practice (given e.g. context switches). Heap profiles work by probabilistically taking a stack trace on allocations, adjusted for the allocation size. A 1 MB allocation takes about 15 µs in benchmarks, and a stack trace about 1 µs, so we can estimate that a 2 MB sampling frequency has about 3% allocation overhead -- this is consistent with benchmarks. 
This is significantly larger than CPU profiles, but mitigated by the fact that performance-sensitive code will avoid allocations as far as possible. Profile symbolization uses in-memory caches for symbol lookups. These take about 125 MB for Pageserver. ## Alternatives Considered * eBPF profiles. * Don't require instrumenting the binary. * Use fewer resources. * Can profile in kernel space too. * Supported by Grafana. * Less information about stack frames and spans. * Limited tooling for local analysis. * Does not support heap profiles. * Does not work on macOS. * [Polar Signals](https://www.polarsignals.com) instead of Grafana. * We already use Grafana for everything else. Appears good enough. ================================================ FILE: docs/rfcs/041-rel-sparse-keyspace.md ================================================ # Sparse Keyspace for Relation Directories ## Summary This is an RFC describing a new storage strategy for storing relation directories. ## Motivation Postgres maintains a directory structure for databases and relations. In Neon, we store this information by serializing the directory data in a single key (see `pgdatadir_mapping.rs`). ```rust // DbDir: // 00 00000000 00000000 00000000 00 00000000 // RelDir: // 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0) ``` We have a dedicated structure on the ingestion path to serialize the relation directory into this single key. ```rust #[derive(Debug, Serialize, Deserialize, Default)] pub(crate) struct RelDirectory { // Set of relations that exist. (relfilenode, forknum) // // TODO: Store it as a btree or radix tree or something else that spans multiple // key-value pairs, if you have a lot of relations pub(crate) rels: HashSet<(Oid, u8)>, } ``` The current codebase has the following three access patterns for the relation directory. 1. Check if a relation exists. 2. List all relations. 3. Create/drop a relation.
For (1), we currently have to get the reldir key, deserialize it, and check whether the relation exists in the hash set. For (2), we get the reldir key and the hash set. For (3), we need first to get and deserialize the key, add the new relation record to the hash set, and then serialize it and write it back. If we have 100k relations in a database, we would have a 100k-large hash set. Then, every relation create and drop would have to deserialize and serialize this 100k-large hash set. This makes the relation create/drop process quadratic. When we check if a relation exists in the ingestion path, we would have to deserialize this super big 100k-large key before checking if a single relation exists. In this RFC, we will propose a new way to store the reldir data in the sparse keyspace and propose how to seamlessly migrate users to use the new keyspace. The PoC patch is implemented in [PR10316](https://github.com/neondatabase/neon/pull/10316). ## Key Mapping We will use the recently introduced sparse keyspace to store actual data. Sparse keyspace was proposed in [038-aux-file-v2.md](038-aux-file-v2.md). The original reldir has one single value of `HashSet<(Oid, u8)>` for each of the databases (identified as `spcnode, dbnode`). We encode the `Oid` (`relnode, forknum`) into the key. ```plain (REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> deleted (REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> exists ``` Assume all reldir data are stored in this new keyspace; the 3 reldir operations we mentioned before can be implemented as follows. 1. Check if a relation exists: check if the key maps to "exists". 2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix`. Extract relnode and forknum from the key. 3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation. The delete tombstone will be removed during image layer generation upon compaction.
Note that "exists" and "deleted" will be encoded as a single byte as two variants of an enum. The mapping is implemented as `rel_tag_sparse_key` in the PoC patch. ## Changes to Sparse Keyspace Previously, we only used sparse keyspaces for the aux files, which did not carry over when branching. The reldir information needs to be preserved from the parent branch to the child branch. Therefore, the read path needs to be updated accordingly to accommodate such "inherited sparse keys". This is done in [PR#10313](https://github.com/neondatabase/neon/pull/10313). ## Coexistence of the Old and New Keyspaces Migrating to the new keyspace will be done gradually: when we flip a config item to enable the new reldir keyspace, the ingestion path will start to write to the new keyspace and the old reldir data will be kept in the old one. The read path needs to combine the data from both keyspaces. Theoretically, we could do a rewrite at the startup time that scans all relation directories and copies that data into the new keyspace. However, this could take a long time, especially if we have thousands of tenants doing the migration process simultaneously after the pageserver restarts. Therefore, we propose the coexistence strategy so that the migration can happen seamlessly and imposes no potential downtime for the user. With the coexistence assumption, the 3 reldir operations will be implemented as follows: 1. Check if a relation exists - Check the new keyspace if the key maps to any value. If it maps to "exists" or "deleted", directly return it to the user. - Otherwise, deserialize the old reldir key and get the result. 2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix` and deserialize the old reldir key. Combine them to obtain the final result. 3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation into the new keyspace. 
- We assume no overwrite of relations will happen (i.e., the user won't create a relation at the same Oid). This will be implemented as a runtime check. - For relation creation, we add `sparse_reldir_tableX -> exists` to the keyspace. - For relation drop, we first check if the relation is recorded in the old keyspace. If yes, we deserialize the old reldir key, remove the relation, and then write it back. Otherwise, we put `sparse_reldir_tableX -> deleted` to the keyspace. - The delete tombstone will be removed during image layer generation upon compaction. This process ensures that the transition will not introduce any downtime and all new updates are written to the new keyspace. The total amount of data in the storage would be `O(relations_modifications)` and we can guarantee `O(current_relations)` after compaction. There could be some relations that exist in the old reldir key for a long time. Refer to the "Full Migration" section on how to deal with them. Plus, for relation modifications, it will have `O(old_relations)` complexity until we do the full migration, which gives us `O(1)` complexity after fully opting in to the sparse keyspace. The process also implies that a relation will only exist either in the old reldir key or in the new sparse keyspace. It is not possible for a table to be recorded in the old reldir key while later having a delete tombstone for it in the sparse keyspace at any LSN. We will introduce a config item and an index_part record to record the current status of the migration process. - Config item `enable_reldir_v2`: controls whether the ingestion path writes the reldir info into the new keyspace. - `index_part.json` field `reldir_v2_status`: whether the timeline has written any key into the new reldir keyspace. If `enable_reldir_v2` is set to `true` and the timeline ingests the first key into the new reldir keyspace, it will update `index_part.json` to set `reldir_v2_status` to `Status::Migrating`.
Even if `enable_reldir_v2` gets flipped back to `false` (i.e., when the pageserver restarts and such config isn't persisted), the read/write path will still read/write to the new keyspace to avoid data inconsistency. This also indicates that the migration is one-way only: once v2 is enabled, the user cannot go back to v1. ## Next Steps ### Full Migration This won't be implemented in the project's first phase but might be implemented in the future. Having both v1 and v2 existing in the system would force us to keep the code to deserialize the old reldir key forever. To entirely deprecate this code path, we must ensure the timeline has no old reldir data. We can trigger a special image layer generation process at the gc-horizon. The generated image layers will cover several keyspaces: the old reldir key in each of the databases, and the new reldir sparse keyspace. It will remove the old reldir key while copying them into the corresponding keys in the sparse keyspace in the resulting image. This special process happens in the background during compaction. For example, assume this special process is triggered at LSN 0/180. The `create_image_layers` process discovers the following keys at this LSN. ```plain db1/reldir_key -> (table 1, table 2, table 3) ...db1 rel keys db2/reldir_key -> (table 4, table 5, table 6) ...db2 rel keys sparse_reldir_db2_table7 -> exists sparse_reldir_db1_table8 -> deleted ``` It will generate the following keys: ```plain db1/reldir_key -> () # we have to keep the key because it is part of `collect_keyspace`. 
...db1 rel keys db2/reldir_key -> () ...db2 rel keys -- start image layer for the sparse keyspace at sparse_reldir_prefix at LSN 0/180 sparse_reldir_db1_table1 -> exists sparse_reldir_db1_table2 -> exists sparse_reldir_db1_table3 -> exists sparse_reldir_db2_table4 -> exists sparse_reldir_db2_table5 -> exists sparse_reldir_db2_table6 -> exists sparse_reldir_db2_table7 -> exists -- end image layer for the sparse keyspace at sparse_reldir_prefix+1 # The `sparse_reldir_db1_table8` key gets dropped as part of the image layer generation code for the sparse keyspace. # Note that the read path will stop reading if a key is not found in the image layer covering the key range so there # are no correctness issues. ``` We must verify that no pending modifications to the old reldir exist in the delta/image layers above the gc-horizon before we start this process (we can do a vectored read to get the full key history of the old reldir key and ensure there are no more images above the gc-horizon). Otherwise, it will violate the property that "a relation will only exist either in the old reldir key or in the new sparse keyspace". After we run this migration process, we can mark `reldir_v2_status` in the `index_part.json` to `Status::Migrated`, and the read path won't need to read from the old reldir anymore. Once the status is set to `Migrated`, we don't need to add the key into `collect_keyspace` and therefore all of them will be removed from all future image layers. The migration process can be proactively triggered across all attached/detached tenants to help us fully remove the old reldir code. ### Consolidate Relation Size Keys We have relsize at the end of all relation nodes. ```plain // RelSize: // 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF ``` This means that computing logical size requires us to do several single-key gets across the keyspace, potentially requiring downloading many layer files.
We could consolidate them into a single keyspace, improving logical size calculation performance. ### Migrate DBDir Keys We assume the number of databases created by the users will be small, and therefore, the current way of storing the database directory would be acceptable. In the future, we could also migrate DBDir keys into the sparse keyspace to support large amount of databases. ================================================ FILE: docs/rfcs/041-sharded-ingest.md ================================================ # Created on Aug 2024 Implemented on Jan 2025 ## Summary Data in large tenants is split up between multiple pageservers according to key hashes, as introduced in the [sharding RFC](031-sharding-static.md) and [shard splitting RFC](032-shard-splitting.md). Whereas currently we send all WAL to all pageserver shards, and each shard filters out the data that it needs, in this RFC we add a mechanism to filter the WAL on the safekeeper, so that each shard receives only the data it needs. This will place some extra CPU load on the safekeepers, in exchange for reducing the network bandwidth for ingesting WAL back to scaling as O(1) with shard count, rather than O(N_shards). ## Motivation 1. Large databases require higher shard counts. Whereas currently we run with up to 8 shards for tenants with a few TB of storage, the next order of magnitude capacity increase will require tens of shards, such that sending all WAL to all shards is impractical in terms of bandwidth. 2. For contemporary database sizes (~2TB), the pageserver is the bottleneck for ingest: since each shard has to decode and process the whole WAL, sharding doesn't fully relieve this bottleneck. To achieve significantly higher ingest speeds, we need to filter the WAL earlier so that each pageserver only has to process relevant parts. 
## Non Goals (if relevant) We do not seek to introduce multiple WALs per timeline, or to share the work of handling a timeline's WAL across safekeepers (beyond simple 3x replication). This RFC may be thought of as an incremental move of the ingestion bottleneck up the stack: instead of high write rates bottlenecking on the pageserver, they will bottleneck on the safekeeper. ## Impacted components (e.g. pageserver, safekeeper, console, etc) Safekeeper, pageserver. There will be no control plane or storage controller coordination needed, as pageservers will directly indicate their sharding parameters to the safekeeper when subscribing for WAL. ## Proposed implementation Terminology: - "Data pages" refers to postgres relation blocks, and SLRU blocks. - "Metadata pages" refers to everything else the pageserver stores, such as relation sizes and directories of relations. ### Phase 1: Refactor ingest Currently, pageserver ingest code is structured approximately as follows: 1. `handle_walreceiver_connection` reads a stream of binary WAL records off a network socket 2. `WalIngest::ingest_record` to translate the record into a series of page-level modifications 3. `DatadirModification` accumulates page updates from several `ingest_record` calls, and when its `commit()` method is called, flushes these into a Timeline's open `InMemoryLayer`. This process currently assumes access to a pageserver `Timeline` throughout `ingest_record` and from `DatadirModification`, which is used to do read-modify-write cycles on metadata pages such as relation sizes and the master DBDIR page. It also assumes that records are ingested strictly one after the other: they cannot be ingested in parallel because each record assumes that earlier records' changes have already been applied to `Timeline`. This code will be refactored to disentangle the simple, fast decode of relation page writes from the more complex logic for updating internal metadata. 
An intermediate representation called `InterpretedWalRecords` will be introduced. This is similar to the internal state of a `DatadirModification`, but does not require access to a Timeline. Instead of storing metadata updates as materialized writes to pages, it will accumulate these as abstract operations, for example, rather than including a write to a relation size key, this structure will include an operation that indicates "Update relation _foo_'s size to the max of its current value and _bar_", such that these may be applied later to a real Timeline. The `DatadirModification` will be aware of the `EphemeralFile` format, so that as it accumulates simple page writes of relation blocks, it can write them directly into a buffer in the serialized format. This will avoid the need to later deserialize/reserialize this data when passing the structure between safekeeper and pageserver. The new pipeline will be: 1. `handle_walreceiver_connection` reads a stream of binary WAL records off a network socket 2. An `InterpretedWalRecords` is generated from the incoming WAL records. This does not require a reference to a Timeline. 3. The logic that is currently spread between `WalIngest` and `DatadirModification` for updating metadata will be refactored to consume the metadata operations from the `InterpretedWalRecords` and turn them into literal writes to metadata pages. This part must be done sequentially. 4. The resulting buffer of metadata page writes is combined with the buffer of relation block writes, and written into the `InMemoryLayer`. Implemented in: 1. https://github.com/neondatabase/neon/pull/9472 2. https://github.com/neondatabase/neon/pull/9504 3.
https://github.com/neondatabase/neon/pull/9524 ### Phase 2: Decode & filter on safekeeper In the previous phase, the ingest code was modified to be able to do most of its work without access to a Timeline: this first stage of ingest simply converts a series of binary WAL records into a buffer of relation/SLRU page writes, and a buffer of abstract metadata writes. The modified ingest code may be transplanted from pageserver to safekeeper (probably via a shared crate). The safekeeper->pageserver network protocol is modified to: - in subscription requests, send the `ShardIdentity` from the pageserver to the safekeeper - in responses, transmit an `InterpretedWalRecords` instead of a raw `WalRecord`. - use the `ShardIdentity` to filter the `ProcessedWalIngest` to relevant content for the subscribing shard before transmitting it. The overall behavior of the pageserver->safekeeper interaction remains the same, in terms of consistent LSN feedback, and connection management. Only the payload of the subscriptions changes, to express an LSN range of WAL as a filtered `ProcessedWalIngest` instead of the raw data. The ingest code on the pageserver can now skip the part where it does the first phase of processing, as it will receive pre-processed, compressed data off the wire. Note that `InterpretedWalRecords` batches multiple `InterpretedWalRecord`(s) in the same network message. Safekeeper reads WAL in chunks of 16 blocks and then decodes as many Postgres WAL records as possible. Each Postgres WAL record maps to one `InterpretedWalRecord` for potentially multiple shards. Hence, the size of the batch is given by the number of Postgres WAL records that fit in 16 blocks. The protocol needs to support evolution. Protobuf was chosen here with the view that, in the future, we may migrate it to gRPC altogether. Implemented in: 1. https://github.com/neondatabase/neon/pull/9746 2.
https://github.com/neondatabase/neon/pull/9821 ### Phase 3: Fan out interpreted WAL In the previous phase, the initial processing of WAL was moved to the safekeeper, but it is still done once for each shard: this will generate O(N_shards) CPU work on the safekeeper (especially when considering converting to Protobuf format and compression). To avoid this, we fan-out WAL from one (tenant, timeline, shard) to all other shards subscribed on the same safekeeper. Under normal operation, the WAL will be read from disk, decoded and interpreted _only_ once per (safekeeper, timeline). When the first shard of a sharded timeline subscribes to a given safekeeper a task is spawned for the WAL reader (`InterpretedWalReader`). This task reads WAL, decodes, interprets it and sends it to the sender (`InterpretedWalSender`). The sender is a future that is polled from the connection task. When further shards subscribe on the safekeeper they will attach themselves to the existing WAL reader. There's two cases to consider: 1. The shard's requested `start_lsn` is ahead of the current position of the WAL reader. In this case, the shard will start receiving data when the reader reaches that LSN. The intuition here is that there's little to gain by letting shards "front-run" since compute backpressure is based on the laggard LSN. 2. The shard's requested `start_lsn` is below the current position of the WAL reader. In this case, the WAL reader gets reset to this requested position (same intuition). Special care is taken such that advanced shards do not receive interpreted WAL records below their current position. The approach above implies that there is at most one WAL reader per (tenant, timeline) on a given safekeeper at any point in time. If this turns out to be operationally problematic, there's a trick we can deploy: `--max-delta-for-fanout` is an optional safekeeper argument that controls the max absolute delta between a new shard and the current WAL position of the WAL reader. 
If the absolute delta is above that value, a new reader is spawned. Note that there's currently no concurrency control on the number of WAL readers, so it's recommended to use large values to avoid pushing CPU utilisation too high. Unsharded tenants do not spawn a separate task for the interpreted WAL reader since there's no benefit to it. Instead they poll the reader and sender concurrently from the connection task. Shard splits are interesting here because they are the only case when the same shard might have two subscriptions at the same time. This is handled by giving readers a unique identifier. Both shards will receive the same data while respecting their requested start position. Implemented in: 1. https://github.com/neondatabase/neon/pull/10190 ## Deployment Each phase shall be deployed independently. Special care should be taken around protocol changes. ## Observability Tips * The safekeeper logs the protocol requested by the pageserver along with the pageserver ID, tenant, timeline and shard: `starting streaming from`. * There are metrics for the number of wal readers: * `safekeeper_wal_readers{kind="task", target=~"pageserver.*"}` gives the number of wal reader tasks for each SK * `safekeeper_wal_readers{kind="future", target=~"pageserver.*"}` gives the number of wal readers polled inline by each SK * `safekeeper_interpreted_wal_reader_tasks` gives the number of wal reader tasks per tenant, timeline * Interesting log lines for the fan-out reader: * `Spawning interpreted`: first shard creates the interpreted wal reader * `Fanning out`: a subsequent shard attaches itself to an interpreted wal reader * `Aborting interpreted`: all senders have finished and the reader task is being aborted ## Future Optimizations This section describes some improvement areas which may be revisited in the future.
### Buffering of Interpreted WAL The interpreted WAL reader may buffer interpreted WAL records in user space to help with serving subscriptions that are lagging behind the current position of the reader. Counterpoints: * Safekeepers serve many thousands of timelines and allocating a buffer for each might be wasteful, especially given that it would go unused on the happy path. * WAL is buffered in the kernel page cache. Usually we'd only pay the CPU cost of decoding and interpreting. ### Tweaking the Pageserver Safekeeper Selection Algorithm We could make the pageserver aware of which safekeepers already host shards for the timeline along with their current WAL positions. The pageserver should then prefer safekeepers that are in the same AZ _and_ already have a shard with a position close to the desired start position. We currently run one safekeeper per AZ, so the point is moot until that changes. ### Pipelining first ingest phase The first ingest phase is a stateless transformation of a binary WAL record into a pre-processed output per shard. To put multiple CPUs to work, we may pipeline this processing up to some defined buffer depth. ## Alternatives considered ### Give safekeepers enough state to fully decode WAL In this RFC, we only do the first phase of ingest on the safekeeper, because this is the phase that is stateless. Subsequent changes then happen on the pageserver, with access to the `Timeline` state. We could do more work on the safekeeper if we transmitted metadata state to the safekeeper when subscribing to the WAL: for example, by telling the safekeeper all the relation sizes, so that it could then generate all the metadata writes for relation sizes. We avoid doing this for several reasons: 1. Complexity: it's a more invasive protocol change 2. Decoupling: having the safekeeper understand the `ProcessedWalIngest` already somewhat infects it with knowledge of the pageserver, but this is mainly an abstract structure that describes postgres writes.
However, if we taught the safekeeper about the exact way that pageserver deals with metadata keys, this would be a much tighter coupling. 3. Load: once the WAL has been processed to the point that it can be split between shards, it is preferable to share out work on the remaining shards rather than adding extra CPU load to the safekeeper. ### Do pre-processing on the compute instead of the safekeeper Since our first stage of ingest is stateless, it could be done at any stage in the pipeline, all the way up to the compute. We choose not to do this, because it is useful for the safekeeper to store the raw WAL rather than just the preprocessed WAL: - The safekeeper still needs to be able to serve raw WAL back to postgres for e.g. physical replication - It simplifies our paxos implementation to have the offset in the write log be literally the same as the LSN - Raw WAL must have a stable protocol since we might have to re-ingest it at arbitrary points in the future. Storing raw WAL gives us more flexibility to evolve the pageserver-safekeeper protocol. ### Do wal pre-processing on shard 0 or a separate service, send it to other shards from there If we wanted to keep the safekeepers as entirely pure stores of raw WAL bytes, then we could do the initial decode and shard-splitting in some other location: - Shard 0 could subscribe to the full WAL and then send writes to other shards - A new intermediate service between the safekeeper and pageserver could do the splitting. So why not? - Extra network hop from shard 0 to the final destination shard - Clearly there is more infrastructure involved here compared with doing it inline on the safekeeper. - Safekeepers already have very light CPU load: typical cloud instance shapes with appropriate disks for the safekeepers effectively have "free" CPU resources.
- Doing extra work on shard 0 would complicate scheduling of shards on pageservers, because shard 0 would have significantly higher CPU load under write workloads than other shards. ================================================ FILE: docs/rfcs/043-bottom-most-gc-compaction.md ================================================ # Bottommost Garbage-Collection Compaction ## Summary The goal of this doc is to propose a way to reliably collect garbage below the GC horizon. This process is called bottom-most garbage-collect-compaction, and is part of the broader legacy-enhanced compaction that we plan to implement in the future. ## Motivation The current GC algorithm waits until a key region is covered by image layers before collecting the garbage in that region. Relying on image layer generation to generate covering images is not reliable. There is prior work that feeds information from the GC algorithm back into the image generation process to accelerate garbage collection, but it slows down the system and creates write amplification. # Basic Idea ![](images/036-bottom-most-gc-compaction/01-basic-idea.svg) The idea of bottom-most compaction is simple: we rewrite all layers that are below or intersect with the GC horizon to produce a flat level of image layers at the GC horizon and deltas above the GC horizon. In this process, - All images and deltas ≤ GC horizon LSN will be dropped. This process collects garbage. - We produce images for all keys involved in the compaction process at the GC horizon. Therefore, it can precisely collect all garbage below the horizon, and reduce the space amplification, e.g., in the staircase pattern (test_gc_feedback). ![The staircase pattern in test_gc_feedback in the original compaction algorithm. The goal is to collect garbage below the red horizontal line.](images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png) The staircase pattern in test_gc_feedback in the original compaction algorithm.
The goal is to collect garbage below the red horizontal line. # Branches With branches, the bottom-most compaction should retain a snapshot of the keyspace at the `retain_lsn` so that the child branch can access data at the branch point. This requires some modifications to the basic bottom-most compaction algorithm that we sketched above. ![](images/036-bottom-most-gc-compaction/03-retain-lsn.svg) ## Single Timeline w/ Snapshots: handle `retain_lsn` First let’s look into the case where we create branches over the main branch but don’t write any data to them (aka “snapshots”). The bottom-most compaction algorithm collects all deltas and images of a key and can make decisions on what data to retain. Given that we have a single key’s history as below: ``` LSN 0x10 -> A LSN 0x20 -> append B retain_lsn: 0x20 LSN 0x30 -> append C LSN 0x40 -> append D retain_lsn: 0x40 LSN 0x50 -> append E GC horizon: 0x50 LSN 0x60 -> append F ``` The algorithm will produce: ``` LSN 0x20 -> AB (drop all history below the earliest retain_lsn) LSN 0x40 -> ABCD (assume the cost of replaying 2 deltas is higher than storing the full image, we generate an image here) LSN 0x50 -> append E (replay one delta is cheap) LSN 0x60 -> append F (keep everything as-is above the GC horizon) ``` ![](images/036-bottom-most-gc-compaction/05-btmgc-parent.svg) What happens is that we balance the space taken by each retain_lsn and the cost of replaying deltas during the bottom-most compaction process. This is controlled by a threshold. If `count(deltas) < $threshold`, the deltas will be retained. Otherwise, an image will be generated and the deltas will be dropped. In the example above, the `$threshold` is 2. ## Child Branches with data: pull + partial images In the previous section we have shown how bottom-most compaction respects `retain_lsn` so that all data that was readable at branch creation remains readable. But branches can have data on their own, and that data can fall out of the branch’s PITR window. 
So, this section explains how we deal with that. We will run the same bottom-most compaction for these branches, to ensure the space amplification on the child branch is reasonable. ``` branch_lsn: 0x20 LSN 0x30 -> append P LSN 0x40 -> append Q LSN 0x50 -> append R GC horizon: 0x50 LSN 0x60 -> append S ``` Note that bottom-most compaction happens on a per-timeline basis. When it processes this key, it only reads the history from LSN 0x30 without a base image. Therefore, on child branches, the bottom-most compaction process will make image creation decisions based on the same `count(deltas) < $threshold` criteria, and if it decides to create an image, the base image will be retrieved from the ancestor branch. ``` branch_lsn: 0x20 LSN 0x50 -> ABPQR (we pull the image at LSN 0x20 from the ancestor branch to get AB, and then apply append PQ to the page; we replace the record at 0x40 with an image and drop the delta) GC horizon: 0x50 LSN 0x60 -> append S ``` ![](images/036-bottom-most-gc-compaction/06-btmgc-child.svg) Note that for child branches, we do not create image layers for the images when bottom-most compaction runs. Instead, we drop the 0x30/0x40/0x50 delta records and directly place the image ABPQR@0x50 into the delta layer, which serves as a sparse image layer. For child branches, if we create image layers, we will need to put all keys in the range into the image layer. This causes space bloat and slow compactions. In this proposal, the compaction process will only compact and process keys modified inside the child branch. # Result Bottom-most compaction ensures all garbage under the GC horizon gets collected right away (compared with “eventually” in the current algorithm). Meanwhile, it generates images at each of the retain_lsn to ensure branch reads are fast. As we make per-key decisions on whether to generate an image or not, the theoretical lower bound of the storage space we need to retain for a branch is lower than before. 
Before: min(sum(logs for each key), sum(image for each key)), for each partition — we always generate image layers on a key range After: sum(min(logs for each key, image for each key)) # Compaction Trigger The bottom-most compaction can be automatically triggered. The goal of the trigger is that it should ensure a constant factor for write amplification. Say that the user write 1GB of WAL into the system, we should write 1GB x C data to S3. The legacy compaction algorithm does not have such a constant factor C. The data we write to S3 is quadratic to the logical size of the database (see [A Theoretical View of Neon Storage](https://www.notion.so/A-Theoretical-View-of-Neon-Storage-8d7ad7555b0c41b2a3597fa780911194?pvs=21)). We propose the following compaction trigger that generates a constant write amplification factor. Write amplification >= total writes to S3 / total user writes. We only analyze the write amplification caused by the bottom-most GC-compaction process, ignoring the legacy create image layers amplification. Given that we have ***X*** bytes of the delta layers above the GC horizon, ***A*** bytes of the delta layers intersecting with the GC horizon, ***B*** bytes of the delta layers below the GC horizon, and ***C*** bytes of the image layers below the GC horizon. The legacy GC + compaction loop will always keep ***A*** unchanged, reduce ***B and C*** when there are image layers covering the key range. This yields 0 write amplification (only file deletions) and extra ***B*** bytes of space. ![](images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg) The bottom-most compaction proposed here will split ***A*** into deltas above the GC horizon and below the GC horizon. Everything below the GC horizon will be image layers after the compaction (not considering branches). Therefore, this yields ***A+C*** extra write traffic each iteration, plus 0 extra space. 
![](images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg) Also considering read amplification (below the GC horizon). When a read request reaches the GC horizon, the read amplification will be (A+B+C)/C=1+(A+B)/C. Reducing ***A*** and ***B*** can help reduce the read amplification below the GC horizon. The metrics-based trigger will wait until a point that space amplification is not that large and write amplification is not that large before the compaction gets triggered. The trigger is defined as **(A+B)/C ≥ 1 (or some other ratio)**. To reason about this trigger, consider the two cases: **Data Ingestion** User keeps ingesting data into the database, which indicates that WAL size roughly equals to the database logical size. The compaction gets triggered only when the newly-written WAL roughly equals to the current bottom-most image size (=X). Therefore, it’s triggered when the database size gets doubled. This is a reasonable amount of work. Write amplification is 2X/X=1 for the X amount of data written. ![](images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg) **Updates/Deletion** In this case, WAL size will be larger than the database logical size ***D***. The compaction gets triggered for every ***D*** bytes of WAL written. Therefore, for every ***D*** bytes of WAL, we rewrite the bottom-most layer, which produces an extra ***D*** bytes of write amplification. This incurs exactly 2x write amplification (by the write of D), 1.5x write amplification (if we count from the start of the process) and no space amplification. ![](images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg) Note that here I try to reason that write amplification is a constant (i.e., the data we write to S3 is proportional to the data the user write). The main problem with the current legacy compaction algorithm is that write amplification is proportional to the database size. 
The next step is to optimize the write amplification above the GC horizon (i.e., change the image creation criteria, top-most compaction, or introduce tiered compaction), to ensure the write amplification of the whole system is a constant factor. 20GB layers → +20GB layers → delete 20GB, need 40GB temporary space # Sub-Compactions The gc-compaction algorithm may take a long time and we need to split the job into multiple sub-compaction jobs. ![](images/036-bottom-most-gc-compaction/13-job-split.svg) As in the figure, the auto-trigger schedules a compaction job covering the full keyspace below a specific LSN. In the case that we cannot finish compacting it in one run in a reasonable amount of time, the algorithm will vertically split it into multiple jobs (in this case, 5). Each gc-compaction job will create one level of delta layers and one flat level of image layers for each LSN. Those layers will be automatically split based on size, which means that if the sub-compaction job produces 1GB of deltas, it will produce 4 * 256MB delta layers. Layers that are not fully contained within the sub-compaction job rectangles will be rewritten to only contain the keys outside of the key range. # Implementation The main implementation of gc-compaction is in `compaction.rs`. * `compact_with_gc`: The main loop of gc-compaction. It takes a rectangle range of the layer map and compacts that specific range. It selects layers intersecting with the rectangle, downloads the layers, creates the k-merge iterator to read those layers in the key-lsn order, and decides which keys to keep or insert a reconstructed page. The process is the basic unit of a gc-compaction and is not interruptible. If the process gets preempted by L0 compaction, it has to be restarted from scratch.
For layers that overlap with the rectangle but are not fully inside it, the main loop will also rewrite them so that the new layer (or two layers if both left and right ends are outside of the rectangle) has the same LSN range as the original one but only contains the keys outside of the compaction range. * `gc_compaction_split_jobs`: Splits a big gc-compaction job into sub-compactions based on heuristics in the layer map. The function looks at the layer map and splits the compaction job based on the size of the layers so that each compaction job only pulls ~4GB of layer files. * `generate_key_retention` and `KeyHistoryRetention`: Implements the algorithm described in the "basic idea" and "branch" chapters of this RFC. It takes a vector of history of a key (key-lsn-value) and decides which LSNs of the key to retain. If there are too many deltas between two retain_lsns, it will reconstruct the page and insert an image into the compaction result. Also, we implement `KeyHistoryRetention::verify` to ensure the generated result is not corrupted -- all retain_lsns and all LSNs above the gc-horizon should be accessible. * `GcCompactionQueue`: the automatic trigger implementation for gc-compaction. `GcCompactionQueue::iteration` is called at the end of the tenant compaction loop. It will then call `trigger_auto_compaction` to decide whether to trigger a gc-compaction job for this tenant. If yes, the compaction-job will be added to the compaction queue, and the queue will be slowly drained once there are no other compaction jobs running. gc-compaction has the lowest priority. If a sub-compaction job is not successful or gets preempted by L0 compaction (see limitations for reasons why a compaction job would fail), it will _not_ be retried. * Changes to `index_part.json`: we added a `last_completed_lsn` field to the index part for the auto-trigger to decide when to trigger a compaction. * Changes to the read path: when gc-compaction updates the layer map, all reads need to wait.
See `gc_compaction_layer_update_lock` and comments in the code path for more information. Gc-compaction can also be scheduled over the HTTP API. Example: ``` curl 'localhost:9898/v1/tenant/:tenant_id/timeline/:timeline_id/compact?enhanced_gc_bottom_most_compaction=true&dry_run=true' -X PUT -H "Content-Type: application/json" -d '{"scheduled": true, "compact_key_range": { "start": "000000067F0000A0000002A1CF0100000000", "end": "000000067F0000A0000002A1D70100000000" } }' ``` The `dry_run` mode can be specified in the query string so that the compaction will go through all layers to estimate how much space can be saved without writing the compaction result into the layer map. The auto-trigger is controlled by tenant-level flag `gc_compaction_enabled`. If this is set to false, no gc-compaction will be automatically scheduled on this tenant (but manual trigger still works). # Next Steps There are still some limitations of gc-compaction itself that needs to be resolved and tested, - gc-compaction is currently only automatically triggered on root branches. We have not tested gc-compaction on child branches in staging. - gc-compaction will skip aux key regions because of the possible conflict with the assumption of aux file tombstones. - gc-compaction does not consider keyspaces at retain_lsns and only look at keys in the layers. This also causes us giving up some sub-compaction jobs because a key might have part of its history available due to traditional GC removing part of the history. - We limit gc-compaction to run over shards <= 150GB to avoid gc-compaction taking too much time blocking other compaction jobs. The sub-compaction split algorithm needs to be improved to be able to split vertically and horizontally. Also, we need to move the download layer process out of the compaction loop so that we don't block other compaction jobs for too long. - The compaction trigger always schedules gc-compaction from the lowest LSN to the gc-horizon. 
Currently we do not schedule compaction jobs that only selects layers in the middle. Allowing this could potentially reduce the number of layers read/write throughout the process. - gc-compaction will give up if there are too many layers to rewrite or if there are not enough disk space for the compaction. - gc-compaction sometimes fails with "no key produced during compaction", which means that all existing keys within the compaction range can be collected; but we don't have a way to write this information back to the layer map -- we cannot generate an empty image layer. - We limit the maximum size of deltas for a single key to 512MB. If above this size, gc-compaction will give up. This can be resolved by changing `generate_key_retention` to be a stream instead of requiring to collect all the key history. In the future, - Top-most compaction: ensure we always have an image coverage for the latest data (or near the latest data), so that reads will be fast at the latest LSN. - Tiered compaction on deltas: ensure read from any LSN is fast. - Per-timeline compaction → tenant-wide compaction? ================================================ FILE: docs/rfcs/044-feature-flag.md ================================================ # Storage Feature Flags In this RFC, we will describe how we will implement per-tenant feature flags. ## PostHog as Feature Flag Service Before we start, let's talk about how current feature flag services work. PostHog is the feature flag service we are currently using across multiple user-facing components in the company. PostHog has two modes of operation: HTTP evaluation and server-side local evaluation. Let's assume we have a storage feature flag called gc-compaction and we want to roll it out to scale-tier users with resident size >= 10GB and <= 100GB. ### Define User Profiles The first step is to synchronize our user profiles to the PostHog service. We can simply assume that each tenant is a user in PostHog. 
Each user profile has some properties associated with it. In our case, it will be: plan type (free, scale, enterprise, etc); resident size (in bytes); primary pageserver (string); region (string). ### Define Feature Flags We would create a feature flag called gc-compaction in PostHog with 4 variants: disabled, stage-1, stage-2, fully-enabled. We will flip the feature flags from disabled to fully-enabled stage by stage for some percentage of our users. ### Option 1: HTTP Evaluation Mode When using PostHog's HTTP evaluation mode, the client will make a request to the PostHog service, asking for the value of a feature flag for a specific user. * Control plane will report the plan type to PostHog each time it attaches a tenant to the storcon or when the user upgrades/downgrades. It calls the PostHog profile API to associate tenant ID with the plan type. Assume we have X active tenants and such attach or plan change event happens each week, that would be 4X profile update requests per month. * Pageservers will report the resident size and the primary pageserver to the PostHog service. Assume we report resident size every 24 hours, that would be 30X requests per month. * Each tenant will request the state of the feature flag every 1 hour, that's 720X requests per month. * The Rust client would be easy to implement as we only need to call the `/decide` API on PostHog. Using the HTTP evaluation mode we will issue 754X requests a month. ### Option 2: Local Evaluation Mode When using PostHog's local evaluation mode, the client (usually the server in a browser/server architecture) will poll the feature flag configuration every 30s (default in the Python client) from PostHog. Such configuration contains data like:
Example JSON response from the PostHog local evaluation API ``` [ { "id": 1, "name": "Beta Feature", "key": "person-flag", "is_simple_flag": True, "active": True, "filters": { "groups": [ { "properties": [ { "key": "location", "operator": "exact", "value": ["Straße"], "type": "person", } ], "rollout_percentage": 100, }, { "properties": [ { "key": "star", "operator": "exact", "value": ["ſun"], "type": "person", } ], "rollout_percentage": 100, }, ], }, } ] ```
Note that the API only contains information like "under what condition => rollout percentage". The user is responsible for providing the properties required to the client for local evaluation, and the PostHog service (web UI) cannot know if a feature is enabled for the tenant or not until the client uses the `capture` API to report the result back. To control the rollout percentage, the user ID gets mapped to a float number in `[0, 1)` on a consistent hash ring. All values <= the percentage will get the feature enabled or set to the desired value. To use the local evaluation mode, the system needs: * Assume each pageserver will poll PostHog for the local evaluation JSON every 5 minutes (instead of the 30s default as it's too frequent). That's 8640Y per month, Y is the number of pageservers. Local evaluation requests cost 10x more than the normal decide request, so that's 86400Y request units to bill. * Storcon needs to store the plan type in the database and pass that information to the pageserver when attaching the tenant. * Storcon also needs to update PostHog with the active tenants, for example, when the tenant gets detached/attached. Assume each active tenant gets detached/attached every week, that would be 4X requests per month. * We do not need to update bill type or resident size to PostHog as all these are evaluated locally. * After each local evaluation of the feature flag, we need to call PostHog's capture event API to update the result of the evaluation that the feature is enabled. We can do this when the flag gets changed compared with the last cached state in memory. That would be at least 4X (assume we do deployment every week so the cache gets cleared) and maybe an additional multiplier of 10, assuming we have 10 active features. In this case, we will issue 86400Y + 40X requests per month.
Assume X = 1,000,000 and Y = 100, | | HTTP Evaluation | Local Evaluation | |---|---|---| | Latency of propagating the conditions/properties for feature flag | 24 hours | available locally | | Latency of applying the feature flag | 1 hour | 5 minutes | | Can properties be reported from different services | Yes | No | | Do we need to sync billing info etc to pageserver | No | Yes | | Cost | 75400$ / month | 4864$ / month | # Our Solution We will use PostHog _only_ as a UI to configure the feature flags. Whether a feature is enabled or not can only be queried through storcon/pageserver instead of using the PostHog UI. (We could report it back to PostHog via `capture_event` but it costs $$$.) This allows us to ramp up the feature flag functionality fast at first. At the same time, it would also give us the option to migrate to our own solution once we want to have more properties and more complex evaluation rules in our system. * We will create several fake users (tenants) in PostHog that contains all the properties we will use for evaluating a feature flag (i.e., resident size, billing type, pageserver id, etc.) * We will use PostHog's local evaluation API to poll the configuration of the feature flags and evaluate them locally on each of the pageservers. * The evaluation result will not be reported back to PostHog. * Storcon needs to pull some information from cplane database. * To know if a feature is currently enabled or not, we need to call the storcon/pageserver API; and we won't be able to know if a feature has been enabled on a tenant before easily: we need to look at the Grafana logs. We only need to pay for the 86400Y local evaluation requests (that would be setting X=0 in solution 2 => $864/month, and even less if we proxy it through storcon). ## Implementation * Pageserver: implement a PostHog local evaluation client. The client will be shared across all tenants on the pageserver with a single API: `evaluate(tenant_id, feature_flag, properties) -> json`.
* Storcon: if we need plan type as the evaluation condition, pull it from cplane database. * Storcon/Pageserver: implement an HTTP API `:tenant_id/feature/:feature` to retrieve the current feature flag status. * Storcon/Pageserver: a loop to update the feature flag spec on both storcon and pageserver. Pageserver loop will only be activated if storcon does not push the specs to the pageserver. ## Difference from Tenant Config * Feature flags can be modified by percentage, and the default config for each feature flag can be modified in UI without going through the release process. * Feature flags are more flexible and won't be persisted anywhere and will be passed as plain JSON over the wire so that do not need to handle backward/forward compatibility as in tenant config. * The expectation of tenant config is that once we add a flag we cannot remove it (or it will be hard to remove), but feature flags are more flexible. # Final Implementation * We added a new crate `posthog_lite_client` that supports local feature evaluations. * We set up two projects "Storage (staging)" and "Storage (production)" in the PostHog console. * Each pageserver reports 10 fake tenants to PostHog so that we can get all combinations of regions (and other properties) in the PostHog UI. * Supported properties: AZ, neon_region, pageserver, tenant_id. * You may use "Pageserver Feature Flags" dashboard to see the evaluation status. * The feature flag spec is polled on storcon every 30s (in each of the region) and storcon will propagate the spec to the pageservers. * The pageserver housekeeping loop updates the tenant-specific properties (e.g., remote size) for evaluation. Each tenant has a `feature_resolver` object. 
After you add a feature flag in the PostHog console, you can retrieve it with: ```rust // Boolean flag self .feature_resolver .evaluate_boolean("flag") .is_ok() // Multivariate flag self .feature_resolver .evaluate_multivariate("gc-compaction-strategy") .ok(); ``` The user needs to handle the case where the evaluation result is an error. This can occur in a variety of cases: * During the pageserver start, the feature flag spec has not been retrieved. * No condition group is matched. * The feature flag spec contains an operand/operation not supported by the lite PostHog library. For boolean flags, the return value is `Result<(), Error>`. `Ok(())` means the flag is evaluated to true. Otherwise, there is either an error in evaluation or it does not match any groups. For multivariate flags, the return value is `Result<String, Error>`. `Ok(variant)` indicates the flag is evaluated to a variant. Otherwise, there is either an error in evaluation or it does not match any groups. The evaluation logic is documented in the PostHog lite library. It compares the consistent hash of a flag key + tenant_id with the rollout percentage and determines which tenant to roll out a specific feature. Users can use the feature flag evaluation API to get the flag evaluation result of a specific tenant for debugging purposes. ``` curl "http://localhost:9898/v1/tenant/:tenant_id/feature_flag?flag=:key&as=multivariate/boolean" ``` By default, the storcon pushes the feature flag specs to the pageservers every 30 seconds, which means that a change in feature flag in the PostHog UI will propagate to the pageservers within 30 seconds. # Future Works * Support dynamic tenant properties like logical size as the evaluation condition. * Support properties like `plan_type` (needs cplane to pass it down). * Report feature flag evaluation result back to PostHog (if the cost is okay). * Fast feature flag evaluation cache on critical paths (e.g., cache a feature flag result in `AtomicBool` and use it on the read path).
================================================ FILE: docs/rfcs/2025-02-14-storage-controller.md ================================================ ## Summary This is a retrospective RFC to document the design of the `storage-controller` service. This service manages the physical mapping of Tenants and Timelines to Pageservers and Safekeepers. It acts as the API for "storage" as an abstract concept: enabling other parts of the system to reason about things like creating/deleting tenants and timelines without having to understand exactly which pageserver and safekeeper to communicate, or any subtle rules about how to orchestrate these things. The storage controller was implemented in the first half of 2024 as an essential part of storage sharding, especially [shard splitting](032-shard-splitting.md). It initially managed only pageservers, but was extended in 2025 to also manage safekeepers. In some places you may see unqualified references to 'nodes' -- those are pageservers. ## Design Choices ### Durability We rely on an external postgres for all durable state. No local storage is used. We avoid any unnecessary I/O to durable storage. For example: - most tracking of in-flight changes to the system is done in-memory rather than recording progress/steps in a database - When migrating tenant shards between pageservers we only touch the database to increment generation numbers, we do not persist the total state of a tenant shard. Being frugal with database I/O has two benefits: - It avoids the database becoming a practical scaling bottleneck (we expect in-memory scale issues to be hit before we hit e.g. transactions-per-second issues) - It reduces cost when using a cloud database service to run the controller's postgres database. The trade-off is that there is a "bootstrapping" problem: a controller can't be deployed in isolation, one must first have some existing database system.
In practice, we expect that Neon is deployed in one of the following ways: - into a cloud which has a postgres service that can be used to run the controller - into a mature on-prem environment that has existing facilities for running databases - into a test/dev environment where a simple one-node vanilla postgres installation is sufficient ### Consensus The controller does _not_ implement any strong consensus mechanism of its own. Instead: - Where strong consistency is required (for example, for pageserver generation numbers), this responsibility is delegated to a transaction in our postgres database. - Highly available deploys are done using a simple in-database record of what controller instances are available, distinguished by timestamps, rather than having controllers directly negotiate a leader. Avoiding strong consensus among controller processes is a cost saving (we avoid running three controllers all the time), and simplifies implementation (we do not have to phrase all configuration changes as e.g raft transactions). The trade-off is that under some circumstances a controller with partial network isolation can cause availability issues in the cluster, by making changes to pageserver state that might disagree with what the "true" active controller is trying to do. The impact of this is bounded by our `controllers` database table, that enables a rogue node to eventually realise that it is not the leader and step down. If a rogue node can't reach the database, then it implicitly stops making progress. A rogue controller cannot durably damage the system because pageserver data and safekeeper configs are protected by generation numbers that are only updated via postgres transactions (i.e. no controller "trusts itself" to independently make decisions about generations). ### Scale We design for high but not unlimited scale. 
The memory footprint of each tenant shard is small (~8kB), so it is realistic to scale up to a million attached shards on a server with modest resources. Tenants in a detached state (i.e. not active on pageservers) do not need to be managed by storage controller, and can be relegated from memory to the database. Typically, a tenant shard is updated about once a week, when we do a deploy. During deploys, we relocate a few thousand tenants from each pageserver while it is restarted, so it is extremely rare for the controller to have to do O(N) work (on all shards at once). There are places where we do O(N) work: - On normal startup, when loading from the database into memory - On unclean startup (with no handover of observed state from a previous controller), where we will scan all shards on all pageservers. It is important that these locations are written efficiently. At high scale we should still expect runtimes of the order tens of seconds to complete a storage controller start. When the practical scale limit of a single storage controller is reached, just deploy another one with its own pageservers & safekeepers: each controller+its storage servers should be thought of as a logical cluster or "cell" of storage. # High Level Design The storage controller is an in-memory system (i.e. state for all attached tenants is held in memory _as well as_ being represented in durable postgres storage). ## Infrastructure The storage controller is an async rust binary using tokio. The storage controller is built around the `Service` type. This implements all the entry points for the outside world's interaction with the controller (HTTP handlers are mostly thin wrappers of service functions), and holds most in-memory state (e.g. the list of tenant shards). The state is held in a `ServiceInner` wrapped in a RwLock. 
This monolithic lock is used to simplify reasoning about code that mutates state: each function that takes a write lock may be thought of as a serializable transaction on the in-memory state. This lock is clearly a bottleneck, but nevertheless is scalable to managing millions of tenants. Persistent state is held in a postgres database, and we use the `diesel` crate to provide database client functionality. All database access is wrapped in the `Persistence` type -- this makes it easy to understand which code is touching the database. The database is only used when necessary, i.e. for state that cannot be recovered another way. For example, we do not store the secondary pageserver locations of tenant shards in the database, rather we learn these at startup from running pageservers, and/or make scheduling decisions to fill in the gaps. This adds some complexity, but massively reduces the load on the database, and enables running the storage controller with a very cheap postgres instance. ## Pageserver tenant scheduling & reconciliation ### Intent & observed state Each tenant shard is represented by type `TenantShard`, which has an 'intent' and 'observed' state. Setting the intent state is called _scheduling_, and doing remote I/O to make observed state match intent state is called _reconciliation_. The `Scheduler` type is responsible for making choices about the intent state, such as choosing a pageserver for a new tenant shard, or assigning a replacement pageserver when the original one fails. The observed state is updated after tenant reconciliation (see below), and has the concept of a `None` state for a pageserver, indicating unknown state. This is used to ensure that we can safely clean up after we start but do not finish a remote call to a pageserver, or if a pageserver restarts and we are uncertain of its state. ### Tenant Reconciliation The `Reconciler` type is responsible for updating pageservers to achieve the intent state. 
It is instantiated when `Service` determines that a shard requires reconciliation, and owned by a background tokio task that runs it to completion. Reconciler does not have access to the `Service` state: it is populated with a snapshot of relevant information when constructed, and submits its results to a channel that `Service` consumes to update the tenant shard's observed state. The Reconciler does have access to the database, but only uses it for a single purpose: updating shards' generation numbers immediately before attaching them to a pageserver. Operations that change a tenant's scheduling will spawn a reconciler if necessary, and there is also a background loop which checks every shard for the need to reconcile -- this background loop ensures eventual progress if some earlier reconciliations failed for some reason. The reconciler has a general purpose code path which will attach/detach from pageservers as necessary, and a special case path for live migrations. The live migration case is more common in practice, and is taken whenever the current observed state indicates that we have a healthy attached location to migrate from. This implements live migration as described in the earlier [live migration RFC](028-pageserver-migration.md). ### Scheduling optimisation During the periodic background reconciliation loop, the controller also performs _scheduling optimization_. This is the process of looking for shards that are in sub-optimal locations, and moving them. Typically, this means: - Shards attached outside their preferred AZ (e.g. after a node failure), to migrate them back to their preferred AZ - Shards attached on the same pageserver as some other shards in the same tenant, to migrate them elsewhere (e.g. after a shard split) Scheduling optimisation is a multi-step process to ensure graceful cutovers, e.g. by creating new secondary location, waiting for it to warm up, then cutting over.
This is not done as an explicit queue of operations, but rather by iteratively calling the optimisation function, which will recognise each intervening state as something that can generate the next optimisation. ### Pageserver heartbeats and failure The `Heartbeater` type is responsible for detecting when a pageserver becomes unavailable. This is fed back into `Service` for action: when a pageserver is marked unavailable, tenant shards on that pageserver are rescheduled and Reconcilers are spawned to cut them over to their new location. ## Pageserver timeline CRUD operations By CRUD operations, we mean creating and deleting timelines. The authoritative storage for which timelines exist on the pageserver is in S3, and is governed by the pageserver's system of generation numbers. Because a shard can be attached to multiple pageservers concurrently, we need to handle this when doing timeline CRUD operations: - A timeline operation is only persistent if _after_ the ack from a pageserver, that pageserver's generation is still the latest. - For deletions in particular, they are only persistent if _all_ attached locations have acked the deletion operation, since if only the latest one has acked then the timeline could still return from the dead if some old-generation attachment writes an index for it. ## Zero-downtime controller deployments When two storage controllers run at the same time, they coordinate via the database to establish one leader, and the other controller may proxy requests to this leader See [Storage controller restarts RFC](037-storage-controller-restarts.md). Note that this is not a strong consensus mechanism: the controller must also survive split-brain situations. This is respected by code that e.g. increments version numbers, which uses database transactions that check the expected value before modifying it. A split-brain situation can impact availability (e.g. 
if two controllers are fighting over where to attach a shard), but it should never impact durability and data integrity. ## Graceful drain & fill of pageservers during deploys The storage controller has functionality for draining + filling pageservers while deploying new pageserver binaries, so that clients are not actively using a pageserver while it restarts. See [Graceful restarts RFC](033-storage-controller-drain-and-fill.md) ## Safekeeper timeline scheduling This is currently under development, see [Safekeeper dynamic membership change RFC](035-safekeeper-dynamic-membership-change.md). ================================================ FILE: docs/rfcs/2025-03-17-compute-prewarm.md ================================================ # Compute rolling restart with prewarm Created on 2025-03-17 Implemented on _TBD_ Author: Alexey Kondratov (@ololobus) ## Summary This RFC describes an approach to reduce performance degradation due to missing caches after compute node restart, i.e.: 1. Rolling restart of the running instance via 'warm' replica. 2. Auto-prewarm compute caches after unplanned restart or scale-to-zero. ## Motivation Neon currently implements several features that guarantee high uptime of compute nodes: 1. Storage high-availability (HA), i.e. each tenant shard has a secondary pageserver location, so we can quickly switch over compute to it in case of primary pageserver failure. 2. Fast compute provisioning, i.e. we have a fleet of pre-created empty computes, that are ready to serve workload, so restarting unresponsive compute is very fast. 3. Preemptive NeonVM compute provisioning in case of k8s node unavailability. This helps us to be well-within the uptime SLO of 99.95% most of the time. Problems begin when we go up to multi-TB workloads and 32-64 CU computes. During restart, compute loses all caches: LFC, shared buffers, file system cache. 
Depending on the workload, it can take a lot of time to warm up the caches, so that performance could be degraded and might be even unacceptable for certain workloads. The latter means that although current approach works well for small to medium workloads, we still have to do some additional work to avoid performance degradation after restart of large instances. ## Non Goals - Details of the persistence storage for prewarm data are out of scope, there is a separate RFC for that: . - Complete compute/Postgres HA setup and flow. Although it was originally in scope of this RFC, during preliminary research it appeared to be a rabbit hole, so it's worth of a separate RFC. - Low-level implementation details for Postgres replica-to-primary promotion. There are a lot of things to think and care about: how to start walproposer, [logical replication failover](https://www.postgresql.org/docs/current/logical-replication-failover.html), and so on, but it's worth of at least a separate one-pager design document if not RFC. ## Impacted components Postgres, compute_ctl, Control plane, Endpoint storage for unlogged storage of compute files. For the latter, we will need to implement a uniform abstraction layer on top of S3, ABS, etc., but S3 is used in text interchangeably with 'endpoint storage' for simplicity. ## Proposed implementation ### compute_ctl spec changes and auto-prewarm We are going to extend the current compute spec with the following attributes ```rust struct ComputeSpec { /// [All existing attributes] ... /// Whether to do auto-prewarm at start or not. /// Default to `false`. pub lfc_auto_prewarm: bool /// Interval in seconds between automatic dumps of /// LFC state into S3. Default `None`, which means 'off'. pub lfc_dump_interval_sec: Option } ``` When `lfc_dump_interval_sec` is set to `N`, `compute_ctl` will periodically dump the LFC state and store it in S3, so that it could be used either for auto-prewarm after restart or by replica during the rolling restart. 
For enabling periodic dumping, we should consider the following value `lfc_dump_interval_sec=300` (5 minutes), same as in the upstream's `pg_prewarm.autoprewarm_interval`. When `lfc_auto_prewarm` is set to `true`, `compute_ctl` will start prewarming the LFC upon restart if any of the previous states is present in S3. ### compute_ctl API 1. `POST /store_lfc_state` -- dump LFC state using Postgres SQL interface and store result in S3. This has to be a blocking call, i.e. it will return only after the state is stored in S3. If there is any concurrent request in progress, we should return `429 Too Many Requests`, and let the caller retry. 2. `GET /dump_lfc_state` -- dump LFC state using Postgres SQL interface and return it as is in text format suitable for the future restore/prewarm. This API is not strictly needed at the end state, but could be useful for a faster prototyping of a complete rolling restart flow with prewarm, as it doesn't require persistent storage for LFC state. 3. `POST /restore_lfc_state` -- restore/prewarm LFC state with request ```yaml RestoreLFCStateRequest: oneOf: - type: object required: - lfc_state properties: lfc_state: type: string description: Raw LFC content dumped with GET `/dump_lfc_state` - type: object required: - lfc_cache_key properties: lfc_cache_key: type: string description: | endpoint_id of the source endpoint on the same branch to use as a 'donor' for LFC content. Compute will look up LFC content dump in S3 using this key and do prewarm. ``` where `lfc_state` and `lfc_cache_key` are mutually exclusive. The actual prewarming will happen asynchronously, so the caller needs to check the prewarm status using the compute's standard `GET /status` API. 4. `GET /status` -- extend existing API with following attributes ```rust struct ComputeStatusResponse { // [All existing attributes] ... pub prewarm_state: PrewarmState } /// Compute prewarm state.
Will be stored in the shared Compute state /// in compute_ctl struct PrewarmState { pub status: PrewarmStatus /// Total number of pages to prewarm pub pages_total: i64 /// Number of pages prewarmed so far pub pages_processed: i64 /// Optional prewarm error pub error: Option } pub enum PrewarmStatus { /// Prewarming was never requested on this compute Off, /// Prewarming was requested, but not started yet Pending, /// Prewarming is in progress. The caller should follow /// `PrewarmState::pages_processed` and `PrewarmState::pages_total`. InProgress, /// Prewarming has been successfully completed Completed, /// Prewarming failed. The caller should look at /// `PrewarmState::error` for the reason. Failed, /// It is intended to be used by auto-prewarm if none of /// the previous LFC states is available in S3. /// This is a distinct state from the `Failed` because /// technically it's not a failure and could happen if /// compute was restarted before it dumped anything into S3, /// or just after the initial rollout of the feature. Skipped, } ``` 5. `POST /promote` -- this is a **blocking** API call to promote compute replica into primary. This API should be very similar to the existing `POST /configure` API, i.e. accept the spec (primary spec, because originally compute was started as replica). It's a distinct API method because semantics and response codes are different: - If promotion is done successfully, it will return `200 OK`. - If compute is already primary, the call will be no-op and `compute_ctl` will return `412 Precondition Failed`. - If, for some reason, second request reaches compute that is in progress of promotion, it will respond with `429 Too Many Requests`. - If compute hit any permanent failure during promotion `500 Internal Server Error` will be returned.
### Control plane operations The complete flow will be present as a sequence diagram in the next section, but here we just want to list some important steps that have to be done by control plane during the rolling restart via warm replica, but without much of low-level implementation details. 1. Register the 'intent' of the instance restart, but not yet interrupt any workload at primary and also accept new connections. This may require some endpoint state machine changes, e.g. introduction of the `pending_restart` state. Being in this state also **mustn't prevent any other operations except restart**: suspend, live-reconfiguration (e.g. due to notify-attach call from the storage controller), deletion. 2. Start new replica compute on the same timeline and start prewarming it. This process may take quite a while, so the same concurrency considerations as in 1. should be applied here as well. 3. When warm replica is ready, control plane should: 3.1. Terminate the primary compute. Starting from here, **this is a critical section**, if anything goes off, the only option is to start the primary normally and proceed with auto-prewarm. 3.2. Send cache invalidation message to all proxies, notifying them that all new connections should request and wait for the new connection details. At this stage, proxy has to also drop any existing connections to the old primary, so they didn't do stale reads. 3.3. Attach warm replica compute to the primary endpoint inside control plane metadata database. 3.4. Promote replica to primary. 3.5. When everything is done, finalize the endpoint state to be just `active`. ### Complete rolling restart flow ```mermaid sequenceDiagram autonumber participant proxy as Neon proxy participant cplane as Control plane participant primary as Compute (primary) box Compute (replica) participant ctl as compute_ctl participant pg as Postgres end box Endpoint unlogged storage participant s3proxy as Endpoint storage service participant s3 as S3/ABS/etc. 
end cplane ->> primary: POST /store_lfc_state primary -->> cplane: 200 OK cplane ->> ctl: POST /restore_lfc_state activate ctl ctl -->> cplane: 202 Accepted activate cplane cplane ->> ctl: GET /status: poll prewarm status ctl ->> s3proxy: GET /read_file s3proxy ->> s3: read file s3 -->> s3proxy: file content s3proxy -->> ctl: 200 OK: file content proxy ->> cplane: GET /proxy_wake_compute cplane -->> proxy: 200 OK: old primary conninfo ctl ->> pg: prewarm LFC activate pg pg -->> ctl: prewarm is completed deactivate pg ctl -->> cplane: 200 OK: prewarm is completed deactivate ctl deactivate cplane cplane -->> cplane: reassign replica compute to endpoint,
start terminating the old primary compute activate cplane cplane ->> proxy: invalidate caches proxy ->> cplane: GET /proxy_wake_compute cplane -x primary: POST /terminate primary -->> cplane: 200 OK note over primary: old primary
compute terminated cplane ->> ctl: POST /promote activate ctl ctl ->> pg: pg_ctl promote activate pg pg -->> ctl: done deactivate pg ctl -->> cplane: 200 OK deactivate ctl cplane -->> cplane: finalize operation cplane -->> proxy: 200 OK: new primary conninfo deactivate cplane ``` ### Network bandwidth and prewarm speed It's currently known that pageserver can sustain about 3000 RPS per shard for a few running computes. Large tenants are usually split into 8 shards, so the final formula may look like this: ```text 8 shards * 3000 RPS * 8 KB =~ 190 MB/s ``` so depending on the LFC size, prewarming will take at least: - ~5s for 1 GB - ~50s for 10 GB - ~9m for 100 GB - \>1h for 1 TB In total, one pageserver is normally capped by 30k RPS, so it obviously can't sustain many computes doing prewarm at the same time. Later, we may need an additional mechanism for computes to throttle the prewarming requests gracefully. ### Reliability, failure modes and corner cases We consider the following failures while implementing this RFC: 1. Compute got interrupted/crashed/restarted during prewarm. The caller -- control plane -- should detect that and start prewarm from the beginning. 2. Control plane promotion request timed out or hit network issues. If it never reached the compute, control plane should just repeat it. If it did reach the compute, then during retry control plane can hit `429` as previous request triggered the promotion already. In this case, control plane needs to retry until either `200` or permanent error `500` is returned. 3. Compute got interrupted/crashed/restarted during promotion. At restart it will ask for a spec from control plane, and its content should signal compute to start as **primary**, so it's expected that control plane will continue polling for certain period of time and will discover that compute is ready to accept connections if restart is fast enough. 4. Any other unexpected failure or timeout during prewarming.
This **failure mustn't be fatal**, control plane has to report failure, terminate replica and keep primary running. 5. Any other unexpected failure or timeout during promotion. Unfortunately, at this moment we already have the primary node stopped, so the only option is to start primary again and proceed with auto-prewarm. 6. Any unexpected failure during auto-prewarm. This **failure mustn't be fatal**, `compute_ctl` has to report the failure, but must not crash the compute. 7. Control plane failed to confirm that old primary has terminated. This can happen, especially in the future HA setup. In this case, control plane has to ensure that it sent VM deletion and pod termination requests to k8s, so long-term we do not have two running primaries on the same timeline. ### Security implications There are two security implications to consider: 1. Access to `compute_ctl` API. It has to be accessible from the outside of compute, so all new API methods have to be exposed on the **external** HTTP port and **must** be authenticated with JWT. 2. Read/write only your own LFC state data in S3. Although it's not really a security concern, since LFC state is just a mapping of blocks present in LFC at certain moment in time; it still has to be highly restricted, so that i) only computes on the same timeline can read S3 state; ii) each compute can only write to the path that contains its `endpoint_id`. Both of these must be validated by Endpoint storage service using the JWT token provided by `compute_ctl`. ### Unresolved questions #### Billing, metrics and monitoring Currently, we only label computes with `endpoint_id` after attaching them to the endpoint. In this proposal, this means that temporary replica will remain unlabelled until it's promoted to primary. We can also hide it from users in the control plane API, but what to do with billing and monitoring is still unclear.
We can probably mark it as 'billable' and tag with `project_id`, so it will be billed, but not interfere in any way with the current primary monitoring. Another thing to consider is how logs and metrics export will switch to the new compute. It's expected that OpenTelemetry collector will auto-discover the new compute and start scraping metrics from it. #### Auto-prewarm It's still an open question whether we need auto-prewarm at all. The author's gut-feeling is that yes, we need it, but might be not for all workloads, so it could end up exposed as a user-controllable knob on the endpoint. There are two arguments for that: 1. Auto-prewarm exists in upstream's `pg_prewarm`, _probably for a reason_. 2. There could still be 2 flows when we cannot perform the rolling restart via the warm replica: i) any failure or interruption during promotion; ii) wake up after scale-to-zero. The latter might be challenged as well, i.e. one can argue that auto-prewarm may and will compete with user-workload for storage resources. This is correct, but it might as well reduce the time to get warm LFC and good performance. #### Low-level details of the replica promotion There are many things to consider here, but three items just off the top of my head: 1. How to properly start the `walproposer` inside Postgres. 2. What to do with logical replication. Currently, we do not include logical replication slots inside basebackup, because nobody advances them at replica, so they just prevent the WAL deletion. Yet, we do need to have them at primary after promotion. Starting with Postgres 17, there is a new feature called [logical replication failover](https://www.postgresql.org/docs/current/logical-replication-failover.html) and `synchronized_standby_slots` setting, but we need a plan for the older versions. Should we request a new basebackup during promotion? 3. How do we guarantee that replica will receive all the latest WAL from safekeepers?
Do some 'shallow' version of sync safekeepers without data copying? Or just a standard version of sync safekeepers? ## Alternative implementation The proposal already assumes one of the alternatives -- do not have any persistent storage for LFC state. This is possible to implement faster with the proposed API, but it means that we do not implement auto-prewarm yet. ## Definition of Done At the end of implementing this RFC we should have two high-level settings that enable: 1. Auto-prewarm of user computes upon restart. 2. Perform primary compute restart via the warm replica promotion. It also has to be decided what's the criteria for enabling one or both of these flows for certain clients. ================================================ FILE: docs/rfcs/2025-04-30-direct-io-for-pageserver.md ================================================ # Direct IO For Pageserver Date: Apr 30, 2025 ## Summary This document is a retroactive RFC. It - provides some background on what direct IO is, - motivates why Pageserver should be using it for its IO, and - describes how we changed Pageserver to use it. The [initial proposal](https://github.com/neondatabase/neon/pull/8240) that kicked off the work can be found in this closed GitHub PR. People primarily involved in this project were: - Yuchen Liang - Vlad Lazar - Christian Schwarz ## Timeline For posterity, here is the rough timeline of the development work that got us to where we are today. 
- Jan 2024: [integrate `tokio-epoll-uring`](https://github.com/neondatabase/neon/pull/5824) along with owned buffers API - March 2024: `tokio-epoll-uring` enabled in all regions in buffered IO mode - Feb 2024 to June 2024: PS PageCache Bypass For Data Blocks - Feb 2024: [Vectored Get Implementation](https://github.com/neondatabase/neon/pull/6576) bypasses delta & image layer blocks for page requests - Apr to June 2024: [Epic: bypass PageCache for user data blocks](https://github.com/neondatabase/neon/issues/7386) addresses remaining users - Aug to Nov 2024: direct IO: first code; preliminaries; read path coding; BufferedWriter; benchmarks show perf regressions too high, no-go. - Nov 2024 to Jan 2025: address perf regressions by developing page_service pipelining (aka batching) and concurrent IO ([Epic](https://github.com/neondatabase/neon/issues/9376)) - Feb to March 2025: rollout batching, then concurrent+direct IO => read path and InMemoryLayer is now direct IO - Apr 2025: develop & roll out direct IO for the write path ## Background: Terminology & Glossary **kernel page cache**: the Linux kernel's page cache is a write-back cache for filesystem contents. The cached unit is memory-page-sized & aligned chunks of the files that are being cached (typically 4k). The cache lives in kernel memory and is not directly accessible through userspace. **Buffered IO**: an application's read/write system calls go through the kernel page cache. For example, a 10 byte sized read or write to offset 5000 in a file will load the file contents at offset `[4096,8192)` into a free page in the kernel page cache. If necessary, it will evict a page to make room (cf eviction). Then, the kernel performs a memory-to-memory copy of 10 bytes from/to the offset `904` (`5000 = 4096 + 904`) within the cached page. If it's a write, the kernel keeps track of the fact that the page is now "dirty" in some ancillary structure.
**Writeback**: a buffered read/write syscall returns after the memory-to-memory copy. The modifications made by e.g. write system calls are not even *issued* to disk, let alone durable. Instead, the kernel asynchronously writes back dirtied pages based on a variety of conditions. For us, the most relevant ones are a) explicit request by userspace (`fsync`) and b) memory pressure. **Memory pressure**: the kernel page cache is a best effort service and a user of spare memory capacity. If there is no free memory, the kernel page allocator will take pages used by page cache to satisfy allocations. Before reusing a page like that, the page has to be written back (writeback, see above). The far-reaching consequence of this is that **any allocation of anonymous memory can do IO** if the only way to get that memory is by eviction & re-using a dirty page cache page. Notably, this includes a simple `malloc` in userspace, because eventually that boils down to `mmap(..., MAP_ANON, ...)`. I refer to this effect as the "malloc latency backscatter" caused by buffered IO. **Direct IO** allows application's read/write system calls to bypass the kernel page cache. The filesystem is still involved because it is ultimately in charge of mapping the concept of files & offsets within them to sectors on block devices. Typically, the filesystem poses size and alignment requirements for memory buffers and file offsets (statx `Dio_mem_align` / `Dio_offset_align`), see [this gist](https://gist.github.com/problame/1c35cac41b7cd617779f8aae50f97155). The IO operations will fail at runtime with EINVAL if the alignment requirements are not met. **"buffered" vs "direct"**: the central distinction between buffered and direct IO is about who allocates and fills the IO buffers, and who controls when exactly the IOs are issued. In buffered IO, it's the syscall handlers, kernel page cache, and memory management subsystems (cf "writeback"). In direct IO, all of it is done by the application. 
It takes more effort by the application to program with direct instead of buffered IO. The return is precise control over and a clear distinction between consumption/modification of memory vs disk. **Pageserver PageCache**: Pageserver has an additional `PageCache` (referred to as PS PageCache from here on, as opposed to "kernel page cache"). Its caching unit is 8KiB blocks of the layer files written by Pageserver. A miss in PageCache is filled by reading from the filesystem, through the `VirtualFile` abstraction layer. The default size is tiny (64MiB), very much like Postgres's `shared_buffers`. We ran production at 128MiB for a long time but gradually moved it up to 2GiB over the past ~year. **VirtualFile** is Pageserver's abstraction for file IO, very similar to the facility in Postgres that bears the same name. Its historical purpose appears to be working around open file descriptor limitations, which is practically irrelevant on Linux. However, the facility in Pageserver is useful as an intermediary layer for metrics and abstracts over the different kinds of IO engines that Pageserver supports (`std-fs` vs `tokio-epoll-uring`). ## Background: History Of Caching In Pageserver For multiple years, Pageserver's `PageCache` was on the path of all read _and write_ IO. It performed write-back to the kernel using buffered IO. We converted it into a read-only cache of immutable data in [PR 4994](https://github.com/neondatabase/neon/pull/4994). The introduction of `tokio-epoll-uring` required converting the code base to use owned IO buffers. The `PageCache` pages are usable as owned IO buffers. We then started bypassing PageCache for user data blocks. Data blocks are the 8k blocks of data in layer files that hold the multiple `Value`s, as opposed to the disk btree index blocks that tell us which values exist in a file at what offsets. The disk btree embedded in delta & image layers remains `PageCache`'d.
Epics for that work were: - Vectored `Timeline::get` (cf RFC 30) skipped delta and image layer data block `PageCache`ing outright. - Epic https://github.com/neondatabase/neon/issues/7386 took care of the remaining users for data blocks: - Materialized page cache (cached materialized pages; shown to be ~0% hit rate in practice) - InMemoryLayer - Compaction The outcome of the above: 1. All data blocks are always read through the `VirtualFile` APIs, hitting the kernel buffered read path (=> kernel page cache). 2. Indirect blocks (=disk btree blocks) would be cached in the PS `PageCache`. In production we size the PS `PageCache` to be 2GiB. This drives the hit rate up to ~99.95% and the eviction rate / replacement rates down to less than 200/second on a 1-minute average, on the busiest machines. High baseline replacement rates are treated as a signal of resource exhaustion (page cache insufficient to host working set of the PS). The response to this is to migrate tenants away, or increase PS `PageCache` size. It is currently manual but could be automated, e.g., in Storage Controller. In the future, we may eliminate the `PageCache` even for indirect blocks. For example with an LRU cache that has as unit the entire disk btree content instead of individual blocks. ## High-Level Design So, before work on this project started, all data block reads and the entire write path of Pageserver were using kernel-buffered IO, i.e., the kernel page cache. We now want to get the kernel page cache out of the picture by using direct IO for all interaction with the filesystem. This achieves the following system properties: **Predictable VirtualFile latencies** * With buffered IO, reads are sometimes fast, sometimes slow, depending on kernel page cache hit/miss. * With buffered IO, appends when writing out new layer files during ingest or compaction are sometimes fast, sometimes slow because of write-back backpressure.
* With buffered IO, the "malloc backscatter" phenomenon pointed out in the Glossary section is not something we actively observe. But we do have occasional spikes in Dirty memory amount and Memory PSI graphs, so it may already be affecting to some degree. * By switching to direct IO, above operations will have the (predictable) device latency -- always. Reads and appends always go to disk. And malloc will not have to write back dirty data. **Explicitness & Tangibility of resource usage** * In a multi-tenant system, it is generally desirable and valuable to be *explicit* about the main resources we use for each tenant. * By using direct IO, we become explicit about the resources *disk IOPs* and *memory capacity* in a way that was previously being conflated through the kernel page cache, outside our immediate control. * We will be able to build per-tenant observability of resource usage ("what tenant is causing the actual IOs that are sent to the disk?"). * We will be able to build accounting & QoS by implementing an IO scheduler that is tenant aware. The kernel is not tenant-aware and can't do that. **CPU Efficiency** * The involvement of the kernel page cache means one additional memory-to-memory copy on read and write path. * Direct IO will eliminate that memory-to-memory copy, if we can make the userspace buffers used for the IO calls satisfy direct IO alignment requirements. The **trade-off** is that we no longer get the theoretical benefits of the kernel page cache. These are: - read latency improvements for repeat reads of the same data ("locality of reference") - asterisk: only if that state is still cache-resident by time of next access - write throughput by having kernel page cache batch small VFS writes into bigger disk writes - asterisk: only if memory pressure is low enough that the kernel can afford to delay writeback We are **happy to make this trade-off**: - Because of the advantages listed above. 
- Because we empirically have enough DRAM on Pageservers to serve metadata (=index blocks) from PS PageCache. (At just 2GiB PS PageCache size, we average a 99.95% hit rate). So, the latency of going to disk is only for data block reads, not the index traversal. - Because **the kernel page cache is ineffective** at high tenant density anyway (#tenants/pageserver instance). And because dense packing of tenants will always be desirable to drive COGS down, we should design the system for it. (See the appendix for a more detailed explanation why this is). - So, we accept that some reads that used to be fast by circumstance will have higher but **predictable** latency than before. ### Desired End State The desired end state of the project is as follows, and with some asterisks, we have achieved it. All IOs of the Pageserver data path use direct IO, thereby bypassing the kernel page cache. In particular, the "data path" includes - the wal ingest path - compaction - anything on the `Timeline::get` / `Timeline::get_vectored` path. The production Pageserver config is tuned such that virtually all non-data blocks are cached in the PS PageCache. Hit rate target is 99.95%. There are no regressions to ingest latency. The total "wait-for-disk time" contribution to random getpage request latency is `O(1 read IOP latency)`. We accomplish that by having a near 100% PS PageCache hit rate so that layer index traversal effectively never needs to wait for IO. Thereby, it can issue all the data block reads as it traverses the index, and only wait at the end of it (concurrent IO). The amortized "wait-for-disk time" contribution of this direct IO proposal to a series of sequential getpage requests is `1/32 * read IOP latency` for each getpage request. We accomplish this by server-side batching of up to 32 reads into a single `Timeline::get_vectored` call. (This is an ideal world where our batches are full - that's not the case in prod today because of lack of queue depth).
## Design & Implementation ### Prerequisites A lot of prerequisite work had to happen to enable use of direct IO. To meet the "wait-for-disk time" requirements from the DoD, we implement for the read path: - page_service level server-side batching (config field `page_service_pipelining`) - concurrent IO (config field `get_vectored_concurrent_io`) The work for both of these was tracked [in the epic](https://github.com/neondatabase/neon/issues/9376). Server-side batching will likely be obsoleted by the [#proj-compute-communicator](https://github.com/neondatabase/neon/pull/10799). The Concurrent IO work is described in retroactive RFC `2025-04-30-pageserver-concurrent-io-on-read-path.md`. The implementation is relatively brittle and needs further investment, see the `Future Work` section in that RFC. For the write path, and especially WAL ingest, we need to hide write latency. We accomplish this by implementing a `BufferedWriter` type that does double-buffering: flushes of the filled buffer happen in a sidecar tokio task while new writes fill a new buffer. We refactor InMemoryLayer as well as BlobWriter (=> delta and image layer writers) to use this new `BufferedWriter`. The most comprehensive write-up of this work is in [the PR description](https://github.com/neondatabase/neon/pull/11558). ### Ensuring Adherence to Alignment Requirements Direct IO puts requirements on - memory buffer alignment - io size (=memory buffer size) - file offset alignment The requirements are specific to a combination of filesystem/block-device/architecture(hardware page size!). In Neon production environments we currently use ext4 with Linux 6.1.X on AWS and Azure storage-optimized instances (locally attached NVMe). Instead of dynamic discovery using `statx`, we statically hard-code 512 bytes as the buffer/offset alignment and size-multiple.
We made this decision because: - a) it is compatible with all the environments we need to run in - b) our primary workload can be small-random-read-heavy (we do merge adjacent reads if possible, but the worst case is that all `Value`s that need to be read are far apart) - c) 512-byte tail latency on the production instance types is much better than 4k (p99.9: 3x lower, p99.99 5x lower). - d) hard-coding at compile-time allows us to use the Rust type system to enforce the use of only aligned IO buffers, eliminating a source of runtime errors typically associated with direct IO. This was [discussed here](https://neondb.slack.com/archives/C07BZ38E6SD/p1725036790965549?thread_ts=1725026845.455259&cid=C07BZ38E6SD). The new `IoBufAligned` / `IoBufAlignedMut` marker traits indicate that a given buffer meets memory alignment requirements. All `VirtualFile` APIs and several software layers built on top of them only accept buffers that implement those traits. Implementors of the marker traits are: - `IoBuffer` / `IoBufferMut`: used for most reads and writes - `PageWriteGuardBuf`: for filling PS PageCache pages (index blocks!) The alignment requirement is infectious; it permeates bottom-up throughout the code base. We stop the infection at roughly the same layers in the code base where we stopped permeating the use of owned-buffers-style API for tokio-epoll-uring. The way the stopping works is by introducing a memory-to-memory copy from/to some unaligned memory location on the stack/current/heap. The places where we currently stop permeating are sort of arbitrary. For example, it would probably make sense to replace more usage of `Bytes` that we know holds 8k pages with 8k-sized `IoBuffer`s.
The `IoBufAligned` / `IoBufAlignedMut` types do not protect us from the following types of runtime errors: - non-adherence to file offset alignment requirements - non-adherence to io size requirements The following higher-level constructs ensure we meet the requirements: - read path: the `ChunkedVectoredReadBuilder` and `mod vectored_dio_read` ensure reads happen at aligned offsets and in appropriate size multiples. - write path: `BufferedWriter` only writes in multiples of the capacity, at offsets that are `start_offset+N*capacity`; see its doc comment. Note that these types are used always, regardless of whether direct IO is enabled or not. There are some cases where this adds unnecessary overhead to buffered IO (e.g. all memcpy's inflated to multiples of 512). But we could not identify meaningful impact in practice when we shipped these changes while we were still using buffered IO. ### Configuration / Feature Flagging In the previous section we described how all users of VirtualFile were changed to always adhere to direct IO alignment and size-multiple requirements. To actually enable direct IO, all we need to do is set the `O_DIRECT` flag in `open` syscalls / io_uring operations. We set `O_DIRECT` based on: - the VirtualFile API used to create/open the VirtualFile instance - the `virtual_file_io_mode` configuration flag - the OpenOptions `read` and/or `write` flags. The VirtualFile APIs suffixed with `_v2` are the only ones that _may_ open with `O_DIRECT` depending on the other two factors in above list. Other APIs never use `O_DIRECT`. (The name is bad and should really be `_maybe_direct_io`.) The reason for having new APIs is because all code used VirtualFile but implementation and rollout happened in consecutive phases (read path, InMemoryLayer, write path). At the VirtualFile level, context on whether an instance of VirtualFile is on read path, InMemoryLayer, or write path is not available. 
The `_v2` APIs then make the decision to set `O_DIRECT` based on the `virtual_file_io_mode` flag and the OpenOptions `read`/`write` flags. The result is the following runtime behavior: |what|OpenOptions|`v_f_io_mode`
=`buffered`|`v_f_io_mode`
=`direct`|`v_f_io_mode`
=`direct-rw`| |-|-|-|-|-| |`DeltaLayerInner`|read|()|O_DIRECT|O_DIRECT| |`ImageLayerInner`|read|()|O_DIRECT|O_DIRECT| |`InMemoryLayer`|read + write|()|()*|O_DIRECT| |`DeltaLayerWriter`| write | () | () | O_DIRECT | |`ImageLayerWriter`| write | () | () | O_DIRECT | |`download_layer_file`|write |()|()|O_DIRECT| The `InMemoryLayer` is marked with `*` because there was a period when it *did* use O_DIRECT under `=direct`. That period was when we implemented and shipped the first version of `BufferedWriter`. We used it in `InMemoryLayer` and `download_layer_file` but it was only sensitive to `v_f_io_mode` in `InMemoryLayer`. The introduction of `=direct-rw`, and the switch of the remaining write path to `BufferedWriter`, happened later, in https://github.com/neondatabase/neon/pull/11558. Note that this way of feature flagging inside VirtualFile makes it less and less a general purpose POSIX file access abstraction. For example, with `=direct-rw` enabled, it is no longer possible to open a `VirtualFile` without `O_DIRECT`. It'll always be set. ## Correctness Validation The correctness risks with this project were: - Memory safety issues in the `IoBuffer` / `IoBufferMut` implementation. These types expose an API that is largely identical to that of the `bytes` crate and/or Vec. - Runtime errors (=> downtime / unavailability) because of non-adherence to alignment/size-multiple requirements, resulting in EINVAL on the read path. We sadly do not have infrastructure to run pageserver under `cargo miri`. So for memory safety issues, we relied on careful peer review. We do assert the production-like alignment requirements in testing builds. However, these asserts were added retroactively. The actual validation before rollout happened in staging and pre-prod. We eventually enabled `=direct`/`=direct-rw` for Rust unit tests and the regression test suite. 
I cannot recall a single instance of staging/pre-prod/production errors caused by non-adherence to alignment/size-multiple requirements. Evidently developer testing was good enough. ## Performance Validation The read path went through a lot of iterations of benchmarking in staging and pre-prod. The benchmarks in those environments demonstrated performance regressions early in the implementation. It was actually this performance testing that made us implement batching and concurrent IO to avoid unacceptable regressions. The write path was much quicker to validate because `bench_ingest` covered all of the (less numerous) access patterns. ## Future Work There is minor and major follow-up work that can be considered in the future. Check the (soon-to-be-closed) Epic https://github.com/neondatabase/neon/issues/8130's "Follow-Ups" section for a current list. Read Path: - PS PageCache hit rate is crucial to unlock concurrent IO and reasonable latency for random reads generally. Instead of reactively sizing PS PageCache, we should estimate the required PS PageCache size and potentially also use that to drive placement decisions of shards from StorageController https://github.com/neondatabase/neon/issues/9288 - ... unless we get rid of PS PageCache entirely and cache the index block in a more specialized cache. But even then, an estimation of the working set would be helpful to figure out caching strategy. Write Path: - BlobWriter and its users could switch back to a borrowed API https://github.com/neondatabase/neon/issues/10129 - ... 
unless we want to implement bypass mode for large writes https://github.com/neondatabase/neon/issues/10101 - The `TempVirtualFile` introduced as part of this project could internalize more of the common usage pattern: https://github.com/neondatabase/neon/issues/11692 - Reduce conditional compilation around `virtual_file_io_mode`: https://github.com/neondatabase/neon/issues/11676 Both: - A performance simulation mode that pads VirtualFile op latencies to typical NVMe latencies, even if the underlying storage is faster. This would avoid misleadingly good performance on developer systems and in benchmarks on systems that are less busy than production hosts. However, padding latencies at microsecond scale is non-trivial. Misc: - We should finish trimming VirtualFile's scope to be truly limited to core data path read & write. Abstractions for reading & writing pageserver config, location config, heatmaps, etc, should use APIs in a different package (`VirtualFile::crashsafe_overwrite` and `VirtualFile::read_to_string` are good entrypoints for cleanup.) https://github.com/neondatabase/neon/issues/11809 # Appendix ## Why Kernel Page Cache Is Ineffective At Tenant High Density In the Motivation section, we stated: > - **The kernel page cache is ineffective** at high tenant density anyway (#tenants/pageserver instance). The reason is that the Pageserver workload sent from Computes is whatever misses the Compute's cache(s). That's either sequential scans or random reads. A random read workload simply causes cache thrashing because a packed Pageserver NVMe drive (`im4gn.2xlarge`) has ~100x more capacity than DRAM available. It is a complete waste to have the kernel page cache cache data blocks in this case. Sequential read workloads *can* benefit iff those pages have been updated recently (=no image layer yet) and together in time/LSN space. In such cases, the WAL records of those updates likely sit on the same delta layer block.
When Compute does a sequential scan, it sends a series of single-page requests for these individual pages. When Pageserver processes the second request in such a series, it goes to the same delta layer block and has a kernel page cache hit. This dependence on kernel page cache for sequential scan performance is significant, but the solution is at a higher level than generic data block caching. We can either add a small per-connection LRU cache for such delta layer blocks. Or we can merge those sequential requests into a larger vectored get request, which is designed to never read a block twice. This amortizes the read latency for our delta layer block across the vectored get batch size (which currently is up to 32). There are Pageserver-internal workloads that do sequential access (compaction, image layer generation), but these 1. are not latency-critical and can do batched access outside of the `page_service` protocol constraints (image layer generation) 2. don't actually need to reconstruct images and therefore can use totally different access methods (=> compaction can use k-way merge iterators with their own internal buffering / prefetching). ================================================ FILE: docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md ================================================ # Concurrent IO for Pageserver Read Path Date: May 6, 2025 ## Summary This document is a retroactive RFC on the Pageserver Concurrent IO work that happened in late 2024 / early 2025. The gist of it is that Pageserver's `Timeline::get_vectored` now _issues_ the data block read operations against layer files _as it traverses the layer map_ and only _waits_ once, for all of them, after traversal is complete. Assuming good PS PageCache hit rates on the index blocks during traversal, this drives the "wait-for-disk" time contribution down from `random_read_io_latency * O(number_of_values)` to `random_read_io_latency * O(1 + traversal)`.
The motivation for why this work had to happen when it happened was the switch of Pageserver to - not cache user data blocks in PS PageCache and - switch to use direct IO. More context on this is given in the complementary RFC `./rfcs/2025-04-30-direct-io-for-pageserver.md`. ### Refs - Epic: https://github.com/neondatabase/neon/issues/9378 - Prototyping happened during the Lisbon 2024 Offsite hackathon: https://github.com/neondatabase/neon/pull/9002 - Main implementation PR with good description: https://github.com/neondatabase/neon/issues/9378 Design and implementation by: - Vlad Lazar - Christian Schwarz ## Background & Motivation The Pageserver read path (`Timeline::get_vectored`) consists of two high-level steps: - Retrieve the delta and image `Value`s required to reconstruct the requested Page@LSN (`Timeline::get_values_reconstruct_data`). - Pass these values to walredo to reconstruct the page images. The read path used to be single-key but has been made multi-key some time ago. ([Internal tech talk by Vlad](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link)) However, for simplicity, most of this doc will explain things in terms of a single key being requested. The `Value` retrieval step above can be broken down into the following functions: - **Traversal** of the layer map to figure out which `Value`s from which layer files are required for the page reconstruction. - **Read IO Planning**: planning of the read IOs that need to be issued to the layer files / filesystem / disk. The main job here is to coalesce the small value reads into larger filesystem-level read operations. This layer also takes care of direct IO alignment and size-multiple requirements (cf the RFC for details.) Check `struct VectoredReadPlanner` and `mod vectored_dio_read` for how it's done. - **Perform the read IO** using `tokio-epoll-uring`. Before this project, above functions were sequentially interleaved, meaning: 1. we would advance traversal, ... 2.
discover, that we need to read a value, ... 3. read it from disk using `tokio-epoll-uring`, ... 4. goto 1 unless we're done. This meant that if N `Value`s need to be read to reconstruct a page, the time we spend waiting for disk will be `random_read_io_latency * O(number_of_values)`. ## Design The **traversal** and **read IO Planning** jobs still happen sequentially, layer by layer, as before. But instead of performing the read IOs inline, we submit the IOs to a concurrent tokio task for execution. After the last read from the last layer is submitted, we wait for the IOs to complete. Assuming the filesystem / disk is able to actually process the submitted IOs without queuing, we arrive at _time spent waiting for disk_ ~ `random_read_io_latency * O(1 + traversal)`. Note this whole RFC is concerned with the steady state where all layer files required for reconstruction are resident on local NVMe. Traversal will stall on on-demand layer download if a layer is not yet resident. It cannot proceed without the layer being resident because its next step depends on the contents of the layer index. ### Avoiding Waiting For IO During Traversal The `traversal` component in above time-spent-waiting-for-disk estimation is dominant and needs to be minimized. Before this project, traversal needed to perform IOs for the following: 1. The time we are waiting on PS PageCache to page in the visited layers' disk btree index blocks. 2. When visiting a delta layer, reading the data block that contains a `Value` for a requested key, to determine whether the `Value::will_init` the page and therefore traversal can stop for this key. The solution for (1) is to raise the PS PageCache size such that the hit rate is practically 100%. (Check out the `Background: History Of Caching In Pageserver` section in the RFC on Direct IO for more details.)
The solution for (2) is to source `will_init` from the disk btree index keys, which fortunately already encode this bit of information since the introduction of the current storage/layer format. ### Concurrent IOs, Submission & Completion To separate IO submission from waiting for its completion, we introduce the notion of an `IoConcurrency` struct through which IOs are issued. An IO is an opaque future that - captures the `tx` side of a `oneshot` channel - performs the read IO by calling `VirtualFile::read_exact_at().await` - sends the result into the `tx` Issuing an IO means `Box`ing the future above and handing that `Box` over to the `IoConcurrency` struct. The traversal code that submits the IO stores the corresponding `oneshot::Receiver` in the `VectoredValueReconstructState`, in the place where we previously stored the sequentially read `img` and `records` fields. When we're done with traversal, we wait for all submitted IOs: for each key, there is a future that awaits all the `oneshot::Receiver`s for that key, and then calls into walredo to reconstruct the page image. Walredo is now invoked concurrently for each value instead of sequentially. Walredo itself remains unchanged. The spawned IO futures are driven to completion by a sidecar tokio task that is separate from the task that performs all the layer visiting and spawning of IOs. That task receives the IO futures via an unbounded mpsc channel and drives them to completion inside a `FuturesUnordered`. ### Error handling, Panics, Cancellation-Safety There are two error classes during reconstruct data retrieval: * traversal errors: index lookup, move to next layer, and the like * value read IO errors A traversal error fails the entire `get_vectored` request, as before this PR. A value read error only fails reconstruction of that value.
Panics and dropping of the `get_vectored` future before it completes leave the sidecar task running and do not cancel submitted IOs (see next section for details on sidecar task lifecycle). All of this is safe, but, today's preference in the team is to close out all resource usage explicitly if possible, rather than cancelling + forgetting about it on drop. So, there is a warning if we drop a `VectoredValueReconstructState`/`ValuesReconstructState` that still has uncompleted IOs. ### Sidecar Task Lifecycle The sidecar tokio task is spawned as part of the `IoConcurrency::spawn_from_conf` struct. The `IoConcurrency` object acts as a handle through which IO futures are submitted. The spawned tokio task holds the `Timeline::gate` open. It is _not_ sensitive to `Timeline::cancel`, but instead to the `IoConcurrency` object being dropped. Once the `IoConcurrency` struct is dropped, no new IO futures can come in but already submitted IO futures will be driven to completion regardless. We _could_ safely stop polling these futures because `tokio-epoll-uring` op futures are cancel-safe. But the underlying kernel and hardware resources are not magically freed up by that. So, again, in the interest of closing out all outstanding resource usage, we make timeline shutdown wait for sidecar tasks and their IOs to complete. Under normal conditions, this should be in the low hundreds of microseconds. It is advisable to make the `IoConcurrency` as long-lived as possible to minimize the amount of tokio task churn (=> lower pressure on tokio). Generally this means creating it "high up" in the call stack. The pain with this is that the `IoConcurrency` reference needs to be propagated "down" to the (short-lived) functions/scope where we issue the IOs. We would like to use `RequestContext` for this propagation in the future (issue [here](https://github.com/neondatabase/neon/issues/10460)). For now, we just add another argument to the relevant code paths.
### Feature Gating The `IoConcurrency` is an `enum` with two variants: `Sequential` and `SidecarTask`. The behavior from before this project is available through `IoConcurrency::Sequential`, which awaits the IO futures in place, without "spawning" or "submitting" them anywhere. The `get_vectored_concurrent_io` pageserver config variable determines the runtime value, **except** for the places that use `IoConcurrency::sequential` to get an `IoConcurrency` object. ### Alternatives Explored & Caveats Encountered A few words on the rationale behind having a sidecar *task* and what alternatives were considered but abandoned. #### Why We Need A Sidecar *Task* / Why Just `FuturesUnordered` Doesn't Work We explored to not have a sidecar task, and instead have a `FuturesUnordered` per `Timeline::get_vectored`. We would queue all IO futures in it and poll it for the first time after traversal is complete (i.e., at `collect_pending_ios`). The obvious disadvantage, but not showstopper, is that we wouldn't be submitting IOs until traversal is complete. The showstopper however, is that deadlocks happen if we don't drive the IO futures to completion independently of the traversal task. The reason is that both the IO futures and the traversal task may hold _some_, _and_ try to acquire _more_, shared limited resources. For example, both the traversal task and IO future may try to acquire * a `VirtualFile` file descriptor cache slot async mutex (observed during impl) * a `tokio-epoll-uring` submission slot (observed during impl) * a `PageCache` slot (currently this is not the case but we may move more code into the IO futures in the future) #### Why We Don't Do `tokio::task`-per-IO-future Another option is to spawn a short-lived `tokio::task` for each IO future. We implemented and benchmarked it during development, but found little throughput improvement and moderate mean & tail latency degradation. Concerns about pressure on the tokio scheduler led us to abandon this variant.
## Future Work In addition to what is listed here, also check the "Punted" list in the epic: https://github.com/neondatabase/neon/issues/9378 ### Enable `Timeline::get` The only major code path that still uses `IoConcurrency::sequential` is `Timeline::get`. The impact is that roughly the following parts of pageserver do not benefit yet: - parts of basebackup - reads performed by the ingest path - most internal operations that read metadata keys (e.g. `collect_keyspace`!) The solution is to propagate `IoConcurrency` via `RequestContext`:https://github.com/neondatabase/neon/issues/10460 The tricky part is to figure out at which level of the code the `IoConcurrency` is spawned (and added to the RequestContext). Also, propagation via `RequestContext` makes it harder to tell during development whether a given piece of code uses concurrent vs sequential mode: one has to recursively walk up the call tree to find the place that puts the `IoConcurrency` into the `RequestContext`. We'd have to use `::Sequential` as the conservative default value in a fresh `RequestContext`, and add some observability to weed out places that fail to enrich with a properly spawned `IoConcurrency::spawn_from_conf`. ### Concurrent On-Demand Downloads enabled by Detached Indices As stated earlier, traversal stalls on on-demand download because its next step depends on the contents of the layer index. Once we have separated indices from data blocks (=> https://github.com/neondatabase/neon/issues/11695) we will only need to stall if the index is not resident. The download of the data blocks can happen concurrently or in the background. For example: - Move the `Layer::get_or_maybe_download().await` inside the IO futures. This goes in the opposite direction of the next "future work" item below, but it's easy to do. - Serve the IO future directly from object storage and dispatch the layer download to some other actor, e.g., an actor that is responsible for both downloads & eviction.
### New `tokio-epoll-uring` API That Separates Submission & Wait-For-Completion Instead of `$op().await` style API, it would be useful to have a different `tokio-epoll-uring` API that separates enqueuing (without necessarily `io_uring_enter`ing the kernel each time), submission, and then wait for completion. The `$op().await` API is too opaque, so we _have_ to stuff it into a `FuturesUnordered`. A split API as sketched above would allow traversal to ensure an IO operation is enqueued to the kernel/disk (and get back-pressure iff the io_uring squeue is full), while avoiding spending of CPU cycles on processing of completions while we're still traversing. The idea gets muddied by the fact that we may self-deadlock if we submit too much without completing. So, the submission part of the split API needs to process completions if squeue is full. In any way, this split API is a precondition for the bigger issue with the design presented here, which we discuss in the next section. ### Opaque Futures Are Brittle The use of opaque futures to represent submitted IOs is a clever hack to minimize changes & allow for near-perfect feature-gating. However, we take on **brittleness** because callers must guarantee that the submitted futures are independent. By our experience, it is non-trivial to identify or rule out the interdependencies. See the lengthy doc comment on the `IoConcurrency::spawn_io` method for more details. The better interface and proper subsystem boundary is a _descriptive_ struct of what needs to be done ("read this range from this VirtualFile into this buffer") and get back a means to wait for completion. The subsystem can thereby reason by its own how operations may be related; unlike today, where the submitted opaque future can do just about anything.
================================================ FILE: docs/rfcs/2025-07-07-node-deletion-api-improvement.md ================================================ # Node deletion API improvement Created on 2025-07-07 Implemented on _TBD_ ## Summary This RFC describes improvements to the storage controller API for gracefully deleting pageserver nodes. ## Motivation The basic node deletion API introduced in [#8226](https://github.com/neondatabase/neon/issues/8333) has several limitations: - Deleted nodes can re-add themselves if they restart (e.g., a flaky node that keeps restarting and we cannot reach via SSH to stop the pageserver). This issue has been resolved by tombstone mechanism in [#12036](https://github.com/neondatabase/neon/issues/12036) - Process of node deletion is not graceful, i.e. it just imitates a node failure In this context, "graceful" node deletion means that users do not experience any disruption or negative effects, provided the system remains in a healthy state (i.e., the remaining pageservers can handle the workload and all requirements are met). To achieve this, the system must perform live migration of all tenant shards from the node being deleted while the node is still running and continue processing all incoming requests. The node is removed only after all tenant shards have been safely migrated. Although live migrations can be achieved with the drain functionality, it leads to incorrect shard placement, such as not matching availability zones. This results in unnecessary work to optimize the placement that was just recently performed. If we delete a node before its tenant shards are fully moved, the new node won't have all the needed data (e.g. heatmaps) ready. This means user requests to the new node will be much slower at first. If there are many tenant shards, this slowdown affects a huge amount of users. Graceful node deletion is more complicated and can introduce new issues. 
It takes longer because live migration of each tenant shard can last several minutes. Using non-blocking accessors may also cause deletion to wait if other processes are holding inner state lock. It also gets trickier because we need to handle other requests, like drain and fill, at the same time. ## Impacted components (e.g. pageserver, safekeeper, console, etc) - storage controller - pageserver (indirectly) ## Proposed implementation ### Tombstones To resolve the problem of deleted nodes re-adding themselves, a tombstone mechanism was introduced as part of the node stored information. Each node has a separate `NodeLifecycle` field with two possible states: `Active` and `Deleted`. When node deletion completes, the database row is not deleted but instead has its `NodeLifecycle` column switched to `Deleted`. Nodes with `Deleted` lifecycle are treated as if the row is absent for most handlers, with several exceptions: reattach and register functionality must be aware of tombstones. Additionally, new debug handlers are available for listing and deleting tombstones via the `/debug/v1/tombstone` path. ### Gracefulness The problem of making node deletion graceful is complex and involves several challenges: - **Cancellable**: The operation must be cancellable to allow administrators to abort the process if needed, e.g. if run by mistake. - **Non-blocking**: We don't want to block deployment operations like draining/filling on the node deletion process. We need clear policies for handling concurrent operations: what happens when a drain/fill request arrives while deletion is in progress, and what happens when a delete request arrives while drain/fill is in progress. - **Persistent**: If the storage controller restarts during this long-running operation, we must preserve progress and automatically resume the deletion process after the storage controller restarts. 
- **Migrated correctly**: We cannot simply use the existing drain mechanism for nodes scheduled for deletion, as this would move shards to irrelevant locations. The drain process expects the node to return, so it only moves shards to backup locations, not to their preferred AZs. It also leaves secondary locations unmoved. This could result in unnecessary load on the storage controller and inefficient resource utilization. - **Force option**: Administrators need the ability to force immediate, non-graceful deletion when time constraints or emergency situations require it, bypassing the normal graceful migration process. See below for a detailed breakdown of the proposed changes and mechanisms. #### Node lifecycle New `NodeLifecycle` enum and a matching database field with these values: - `Active`: The normal state. All operations are allowed. - `ScheduledForDeletion`: The node is marked to be deleted soon. Deletion may be in progress or will happen later, but the node will eventually be removed. All operations are allowed. - `Deleted`: The node is fully deleted. No operations are allowed, and the node cannot be brought back. The only action left is to remove its record from the database. Any attempt to register a node in this state will fail. This state persists across storage controller restarts. **State transition** ``` +--------------------+ +---| Active |<---------------------+ | +--------------------+ | | ^ | | start_node_delete | cancel_node_delete | v | | +----------------------------------+ | | ScheduledForDeletion | | +----------------------------------+ | | | | node_register | | | | delete_node (at the finish) | | | v | +---------+ tombstone_delete +----------+ | Deleted |-------------------------------->| no row | +---------+ +----------+ ``` #### NodeSchedulingPolicy::Deleting A `Deleting` variant to the `NodeSchedulingPolicy` enum. This means the deletion function is running for the node right now. Only one node can have the `Deleting` policy at a time. 
The `NodeSchedulingPolicy::Deleting` state is persisted in the database. However, after a storage controller restart, any node previously marked as `Deleting` will have its scheduling policy reset to `Pause`. The policy will only transition back to `Deleting` when the deletion operation is actively started again, as triggered by the node's `NodeLifecycle::ScheduledForDeletion` state. `NodeSchedulingPolicy` transition details: 1. When `node_delete` begins, set the policy to `NodeSchedulingPolicy::Deleting`. 2. If `node_delete` is cancelled (for example, due to a concurrent drain operation), revert the policy to its previous value. The policy is persisted in storcon DB. 3. After `node_delete` completes, the final value of the scheduling policy is irrelevant, since `NodeLifecycle::Deleted` prevents any further access to this field. The deletion process cannot be initiated for nodes currently undergoing deployment-related operations (`Draining`, `Filling`, or `PauseForRestart` policies). Deletion will only be triggered once the node transitions to either the `Active` or `Pause` state. #### OperationTracker A replacement for `Option ongoing_operation`, the `OperationTracker` is a dedicated service state object responsible for managing all long-running node operations (drain, fill, delete) with robust concurrency control. Key responsibilities: - Orchestrates the execution of operations - Supports cancellation of currently running operations - Enforces operation constraints, e.g. allowing only single drain/fill operation at a time - Persists deletion state, enabling recovery of pending deletions across restarts - Ensures thread safety across concurrent requests #### Attached tenant shard processing When deleting a node, handle each attached tenant shard as follows: 1. Pick the best node to become the new attached (the candidate). 2. If the candidate already has this shard as a secondary: - Create a new secondary for the shard on another suitable node. 
Otherwise: - Create a secondary for the shard on the candidate node. 3. Wait until all secondaries are ready and pre-warmed. 4. Promote the candidate's secondary to attached. 5. Remove the secondary from the node being deleted. This process safely moves all attached shards before deleting the node. #### Secondary tenant shard processing When deleting a node, handle each secondary tenant shard as follows: 1. Choose the best node to become the new secondary. 2. Create a secondary for the shard on that node. 3. Wait until the new secondary is ready. 4. Remove the secondary from the node being deleted. This ensures all secondary shards are safely moved before deleting the node. ### Reliability, failure modes and corner cases In case of a storage controller failure and following restart, the system behavior depends on the `NodeLifecycle` state: - If `NodeLifecycle` is `Active`: No action is taken for this node. - If `NodeLifecycle` is `Deleted`: The node will not be re-added. - If `NodeLifecycle` is `ScheduledForDeletion`: A deletion background task will be launched for this node. In case of a pageserver node failure during deletion, the behavior depends on the `force` flag: - If `force` is set: The node deletion will proceed regardless of the node's availability. - If `force` is not set: The deletion will be retried a limited number of times. If the node remains unavailable, the deletion process will pause and automatically resume when the node becomes healthy again. ### Operations concurrency The following sections describe the behavior when different types of requests arrive at the storage controller and how they interact with ongoing operations. #### Delete request Handler: `PUT /control/v1/node/:node_id/delete` 1. If node lifecycle is `NodeLifecycle::ScheduledForDeletion`: - Return `200 OK`: there is already an ongoing deletion request for this node 2. Update & persist lifecycle to `NodeLifecycle::ScheduledForDeletion` 3. Persist current scheduling policy 4. 
If there is no active operation (drain/fill/delete): - Run deletion process for this node #### Cancel delete request Handler: `DELETE /control/v1/node/:node_id/delete` 1. If node lifecycle is not `NodeLifecycle::ScheduledForDeletion`: - Return `404 Not Found`: there is no current deletion request for this node 2. If the active operation is deleting this node, cancel it 3. Update & persist lifecycle to `NodeLifecycle::Active` 4. Restore the last scheduling policy from persistence #### Drain/fill request 1. If there are already ongoing drain/fill processes: - Return `409 Conflict`: queueing of drain/fill processes is not supported 2. If there is an ongoing delete process: - Cancel it and wait until it is cancelled 3. Run the drain/fill process 4. After the drain/fill process is cancelled or finished: - Try to find another candidate to delete and run the deletion process for that node #### Drain/fill cancel request 1. If the active operation is not the related process: - Return `400 Bad Request`: cancellation request is incorrect, operations are not the same 2. Cancel the active operation 3. 
Try to find another candidate to delete and run the deletion process for that node ## Definition of Done - [x] Fix flaky node scenario and introduce related debug handlers - [ ] Node deletion intent is persistent - a node will be eventually deleted after a deletion request regardless of draining/filling requests and restarts - [ ] Node deletion can be graceful - deletion completes only after moving all tenant shards to recommended locations - [ ] Deploying does not break due to long deletions - drain/fill operations override deletion process and deletion resumes after drain/fill completes - [ ] `force` flag is implemented and provides fast, failure-tolerant node removal (e.g., when a pageserver node does not respond) - [ ] Legacy delete handler code is removed from storage_controller, test_runner, and storcon_cli ================================================ FILE: docs/rfcs/README.md ================================================ # Neon RFCs ## Overview This directory contains Request for Comments documents, or RFCs, for features or concepts that have been proposed. Alternative names: technical design doc, ERD, one-pager To make a new proposal, create a new text file in this directory and open a Pull Request with it. That gives others a chance and a forum to comment and discuss the design. When a feature is implemented and the code changes are committed, also include the corresponding RFC in this directory. Some of the RFCs in this directory have been implemented in some form or another, while others are on the roadmap, while still others are just obsolete and forgotten about. So read them with a grain of salt, but hopefully even the ones that don't reflect reality give useful context information. ## What We use Tech Design RFC’s to summarize what we are planning to implement in our system. These RFCs should be created for large or not obvious technical tasks, e.g. 
changes of the architecture or bigger tasks that could take over a week, changes that touch multiple components or their interaction. RFCs should fit into a couple of pages, but could be longer on occasion. ## Why We’re using RFCs to enable early review and collaboration, reduce uncertainties, risk and save time during the implementation phase that follows the Tech Design RFC. Tech Design RFCs also aim to avoid bus factor and are an additional measure to keep more peers up to date & familiar with our design and architecture. This is a crucial part for ensuring collaboration across timezones and setting up for success a distributed team that works on complex topics. ## Prior art - Rust: [https://github.com/rust-lang/rfcs/blob/master/0000-template.md](https://github.com/rust-lang/rfcs/blob/master/0000-template.md) - React.js: [https://github.com/reactjs/rfcs/blob/main/0000-template.md](https://github.com/reactjs/rfcs/blob/main/0000-template.md) - Google fuchsia: [https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE](https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE) - Apache: [https://cwiki.apache.org/confluence/display/GEODE/RFC+Template](https://cwiki.apache.org/confluence/display/GEODE/RFC+Template) / [https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process](https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process) ## How RFC lifecycle: - Should be submitted in a pull request with and full RFC text in a committed markdown file and copy of the Summary and Motivation sections also included in the PR body. - RFC should be published for review before most of the actual code is written. This isn’t a strict rule, don’t hesitate to experiment and build a POC in parallel with writing an RFC. - Add labels to the PR in the same manner as you do Issues. Example TBD - Request the review from your peers. Reviewing the RFCs from your peers is a priority, same as reviewing the actual code. 
- The Tech Design RFC should evolve based on the feedback received and further during the development phase if problems are discovered with the taken approach - RFCs stop evolving once the consensus is found or the proposal is implemented and merged. - RFCs are not intended as a documentation that’s kept up to date **after** the implementation is finished. Do not update the Tech Design RFC when merged functionality evolves later on. In such situation a new RFC may be appropriate. ### RFC template Use template with `YYYY-MM-DD-copy-me.md` as a starting point. Timestamp prefix helps to avoid awkward 'id' collisions. ```sh cp docs/rfcs/YYYY-MM-DD-copy-me.md docs/rfcs/$(date +"%Y-%m-%d")-.md ``` Note, a lot of the sections are marked as ‘if relevant’. They are included into the template as a reminder and to help inspiration. ================================================ FILE: docs/rfcs/YYYY-MM-DD-copy-me.md ================================================ # Name Created on YYYY-MM-DD Implemented on _TBD_ ## Summary ## Motivation ## Non Goals (if relevant) ## Impacted components (e.g. pageserver, safekeeper, console, etc) ## Proposed implementation ### Reliability, failure modes and corner cases (if relevant) ### Interaction/Sequence diagram (if relevant) ### Scalability (if relevant) ### Security implications (if relevant) ### Unresolved questions (if relevant) ## Alternative implementation (if relevant) ## Pros/cons of proposed approaches (if relevant) ## Definition of Done (if relevant) ================================================ FILE: docs/safekeeper-protocol.md ================================================ # WAL proposer-safekeeper communication consensus protocol. ## General requirements and architecture There is single stateless master and several safekeepers. Number of safekeepers is determined by redundancy level. To minimize number of changes in Postgres core, we are using standard streaming replication from master (through WAL sender). 
This replication stream is initiated by the WAL proposer process that runs in the PostgreSQL server, which broadcasts the WAL generated by PostgreSQL to safekeepers. To provide durability we use synchronous replication at master (response to the commit statement is sent to the client only when acknowledged by WAL receiver). WAL proposer sends this acknowledgment only when LSN of commit record is confirmed by quorum of safekeepers. WAL proposer tries to establish connections with safekeepers. At any moment of time each safekeeper can serve exactly one proposer, but it can accept new connections. Any of safekeepers can be used as WAL server, producing replication stream. So both `Pagers` and `Replicas` (read-only computation nodes) can connect to safekeeper to receive WAL stream. A safekeeper streams WAL until it reaches min(`commitLSN`,`flushLSN`). Then replication is suspended until new data arrives from master. ## Handshake The goal of handshake is to collect quorum (to be able to perform recovery) and avoid split-brains caused by simultaneous presence of old and new master. Procedure of handshake consists of the following steps: 1. Broadcast information about server to all safekeepers (wal segment size, system_id,...) 2. Receive responses with information about safekeepers. 3. Once quorum of handshake responses are received, propose new `NodeId(max(term)+1, server.uuid)` to all of them. 4. On receiving proposed nodeId, safekeeper compares it with locally stored nodeId and, if it is greater or equal, accepts proposed nodeId and persists this choice in the local control file. 5. If quorum of safekeepers approve proposed nodeId, then server assumes that handshake is successfully completed and switches to recovery stage. ## Recovery Proposer computes max(`restartLSN`) and max(`flushLSN`) from quorum of attached safekeepers. `RestartLSN` - is position in WAL which is known to be delivered to all safekeepers.
In other words: `restartLSN` can be also considered as cut-off horizon (all preceding WAL segments can be removed). `FlushLSN` is position flushed by safekeeper to the local persistent storage. If max(`restartLSN`) != max(`flushLSN`), then recovery has to be performed. Proposer creates replication channel with most advanced safekeeper (safekeeper with the largest `flushLSN`). Then it downloads all WAL messages between max(`restartLSN`)..max(`flushLSN`). Messages are inserted in L1-list (ordered by LSN). Then we locate position of each safekeeper in this list according to their `flushLSN`s. Safekeepers that are not yet connected (out of quorum) should start from the beginning of the list (corresponding to `restartLSN`). We need to choose max(`flushLSN`) because voting quorum may be different from the quorum that committed the last message. So we do not know whether the record with max(`flushLSN`) was committed by quorum or not. So we have to consider it committed to avoid loss of committed data. Calculated max(`flushLSN`) is called `VCL` (Volume Complete LSN). Since it is chosen among the quorum, there may be some other offline safekeeper with larger `VCL`. Once it becomes online, we need to overwrite its WAL beyond `VCL`. To support it, each safekeeper maintains `epoch` number. `Epoch` plays almost the same role as `term`, but algorithm of `epoch` bumping is different. `VCL` and new epoch are received by safekeeper from proposer during voting. But safekeeper doesn't switch to new epoch immediately after voting. Instead, the safekeeper waits until a record with LSN > Max(`flushLSN`,`VCL`) is received. It means that we restore all records from old generation and switch to new generation. When proposer calculates max(`FlushLSN`), it first compares `Epoch`. So actually we compare (`Epoch`,`FlushLSN`) pairs. Let's look at the examples. Consider that we have three safekeepers: S1, S2, S3. Si(N) means that i-th safekeeper has epoch=N. Ri(x) - WAL record for resource X with LSN=i.
Assume that we have the following state: ``` S1(1): R1(a) S2(1): R1(a),R2(b) S3(1): R1(a),R2(b),R3(c),R4(d) - offline ``` Proposer chooses quorum (S1,S2). VCL for them is 2. We download S2 to proposer and schedule its write to S1. After receiving record R5 the picture can be: ``` S1(2): R1(a),R2(b),R3(e) S2(2): R1(a),R2(b),R3(e) S3(1): R1(a),R2(b),R3(c),R4(d) - offline ``` Now if server is crashed or restarted, we perform new voting and it doesn't matter which quorum we choose: (S1,S2), (S2,S3)... in any case VCL=3, because S3 has smaller epoch. R3(c) will be overwritten with R3(e): ``` S1(3): R1(a),R2(b),R3(e) S2(3): R1(a),R2(b),R3(e) S3(1): R1(a),R2(b),R3(e),R4(d) ``` Epoch of S3 will be adjusted once it overwrites R4: ``` S1(3): R1(a),R2(b),R3(e),R4(f) S2(3): R1(a),R2(b),R3(e),R4(f) S3(3): R1(a),R2(b),R3(e),R4(f) ``` Crash can happen before epoch was bumped. Let's return back to the initial position: ``` S1(1): R1(a) S2(1): R1(a),R2(b) S3(1): R1(a),R2(b),R3(c),R4(d) - offline ``` Assume that we start recovery: ``` S1(1): R1(a),R2(b) S2(1): R1(a),R2(b) S3(1): R1(a),R2(b),R3(c),R4(d) - offline ``` and then crash happens. During voting we choose quorum (S2,S3). Now they belong to the same epoch and S3 is most advanced among them. So VCL is set to 4 and we recover S1 and S2 from S3: ``` S1(1): R1(a),R2(b),R3(c),R4(d) S2(1): R1(a),R2(b),R3(c),R4(d) S3(1): R1(a),R2(b),R3(c),R4(d) ``` ## Main loop Once recovery is completed, proposer switches to normal processing loop: it receives WAL stream from Postgres and appends WAL messages to the list. At the same time it tries to push messages to safekeepers. Each safekeeper is associated with some element in message list and once it acknowledged receiving of the message, position is moved forward. Each queue element contains acknowledgment mask, whose bits correspond to safekeepers.
Once all safekeepers acknowledged receiving of this message (by setting the corresponding bit), then element can be removed from queue and `restartLSN` is advanced forward. Proposer maintains `restartLSN` and `commitLSN` based on the responses received from safekeepers. `RestartLSN` equals to the LSN of head message in the list. `CommitLSN` is `flushLSN[nSafekeepers-Quorum]` element in ordered array with `flushLSN`s of safekeepers. `CommitLSN` and `RestartLSN` are included in requests sent from proposer to safekeepers and stored in safekeepers control file. To avoid overhead of extra fsync, this control file is not fsynced on each request. Flushing this file is performed periodically, which means that `restartLSN`/`commitLSN` stored by safekeeper may be slightly outdated. It is not critical because it may only cause redundant processing of some WAL record. And `FlushLSN` is recalculated after node restart by scanning local WAL files. ## Fault tolerance If the WAL proposer process loses connection to safekeeper it tries to reestablish this connection using the same nodeId. Restart of PostgreSQL initiates new round of voting and switching to a new epoch. ## Limitations Right now message queue is maintained in main memory and is not spilled to the disk. It can cause memory overflow in case of presence of lagging safekeepers. It is assumed that in case of losing local data by some safekeepers, it should be recovered using some external mechanism. ## Glossary * `CommitLSN`: position in WAL confirmed by quorum safekeepers. * `RestartLSN`: position in WAL confirmed by all safekeepers. * `FlushLSN`: part of WAL persisted to the disk by safekeeper. * `NodeID`: pair (term,UUID) * `Pager`: Neon component restoring pages from WAL stream * `Replica`: read-only computation node * `VCL`: the largest LSN for which we can guarantee availability of all prior records.
## Algorithm ```python process WalProposer(safekeepers,server,curr_epoch,restart_lsn=0,message_queue={},feedbacks={}) function do_recovery(epoch,restart_lsn,VCL) leader = i:safekeepers[i].state.epoch=epoch and safekeepers[i].state.flushLsn=VCL wal_stream = safekeepers[leader].start_replication(restart_lsn,VCL) do message = wal_stream.read() message_queue.append(message) while message.startPos < VCL for i in 1..safekeepers.size() for message in message_queue if message.endLsn < safekeepers[i].state.flushLsn message.delivered += i else send_message(i, message) break end function function send_message(i,msg) msg.restartLsn = restart_lsn msg.commitLsn = get_commit_lsn() safekeepers[i].send(msg, response_handler) end function function do_broadcast(message) for i in 1..safekeepers.size() if not safekeepers[i].sending() send_message(i, message) end function function get_commit_lsn() sorted_feedbacks = feedbacks.sort() return sorted_feedbacks[safekeepers.size() - quorum] end function function response_handler(i,message,response) feedbacks[i] = if response.epoch=curr_epoch then response.flushLsn else VCL server.write(get_commit_lsn()) message.delivered += i next_message = message_queue.next(message) if next_message send_message(i, next_message) while message_queue.head.delivered.size() = safekeepers.size() if restart_lsn < message_queue.head.beginLsn restart_lsn = message_queue.head.endLsn message_queue.pop_head() end function server_info = server.read() safekeepers.write(server_info) safekeepers.state = safekeepers.read() next_term = max(safekeepers.state.nodeId.term)+1 restart_lsn = max(safekeepers.state.restartLsn) epoch,VCL = max(safekeepers.state.epoch,safekeepers.state.flushLsn) curr_epoch = epoch + 1 proposal = Proposal(NodeId(next_term,server.id),curr_epoch,VCL) safekeepers.send(proposal) responses = safekeepers.read() if any responses.is_rejected() exit() for i in 1..safekeepers.size() feedbacks[i].flushLsn = if epoch=safekeepers[i].state.epoch then 
safekeepers[i].state.flushLsn else restart_lsn if restart_lsn != VCL do_recovery(epoch,restart_lsn,VCL) wal_stream = server.start_replication(VCL) for ever message = wal_stream.read() message_queue.append(message) do_broadcast(message) end process process safekeeper(gateway,state) function handshake() proposer = gateway.accept() server_info = proposer.read() proposer.write(state) proposal = proposer.read() if proposal.nodeId < state.nodeId proposer.write(rejected) return null else state.nodeId = proposal.nodeId state.proposed_epoch = proposal.epoch state.VCL = proposal.VCL write_control_file(state) proposer.write(accepted) return proposer end function state = read_control_file() state.flushLsn = locate_end_of_wal() for ever proposer = handshake() if not proposer continue for ever req = proposer.read() if req.nodeId != state.nodeId break save_wal_file(req.data) state.restartLsn = req.restartLsn if state.epoch < state.proposed_epoch and req.endPos > max(state.flushLsn,state.VCL) state.epoch = state.proposed_epoch if req.endPos > state.flushLsn state.flushLsn = req.endPos save_control_file(state) resp = Response(state.epoch,req.endPos) proposer.write(resp) notify_wal_sender(Min(req.commitLsn,req.endPos)) end process ``` ================================================ FILE: docs/separation-compute-storage.md ================================================ # Separation of Compute and Storage TODO: - Read path - Write path - Durability model - API auth ================================================ FILE: docs/settings.md ================================================ ## Pageserver Pageserver is mainly configured via a `pageserver.toml` config file. If there's no such file during `init` phase of the server, it creates the file itself. Without 'init', the file is read. 
There's a possibility to pass an arbitrary config value to the pageserver binary as an argument: such values override the values in the config file, if any are specified for the same key and get into the final config during init phase. ### Config example ```toml # Initial configuration file created by 'pageserver --init' listen_pg_addr = '127.0.0.1:64000' listen_http_addr = '127.0.0.1:9898' checkpoint_distance = '268435456' # in bytes checkpoint_timeout = '10m' gc_period = '1 hour' gc_horizon = '67108864' max_file_descriptors = '100' # initial superuser role name to use when creating a new tenant initial_superuser_name = 'cloud_admin' broker_endpoint = 'http://127.0.0.1:50051' # [remote_storage] ``` The config above shows default values for all basic pageserver settings, besides `broker_endpoint`: that one has to be set by the user, see the corresponding section below. Pageserver uses default values for all files that are missing in the config, so it's not a hard error to leave the config blank. Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start. Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#table) in TOML specification and - either has to be placed in the config after the table-less values such as `initial_superuser_name = 'cloud_admin'` - or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}` ### Config values All values can be passed as an argument to the pageserver binary, using the `-c` parameter and specified as a valid TOML string. All tables should be passed in the inline form. Example: `${PAGESERVER_BIN} -c "checkpoint_timeout = '10 m'" -c "remote_storage={local_path='/some/local/path/'}"` Note that TOML distinguishes between strings and integers, the former require single or double quotes around them. 
#### broker_endpoint A storage broker endpoint to connect and pull the information from. Default is `'http://127.0.0.1:50051'`. #### checkpoint_distance `checkpoint_distance` is the amount of incoming WAL that is held in the open layer, before it's flushed to local disk. It puts an upper bound on how much WAL needs to be re-processed after a pageserver crash. It is a soft limit, the pageserver can momentarily go above it, but it will trigger a checkpoint operation to get it back below the limit. `checkpoint_distance` also determines how much WAL needs to be kept durable in the safekeeper. The safekeeper must have capacity to hold this much WAL, with some headroom, otherwise you can get stuck in a situation where the safekeeper is full and stops accepting new WAL, but the pageserver is not flushing out and releasing the space in the safekeeper because it hasn't reached checkpoint_distance yet. `checkpoint_distance` also controls how often the WAL is uploaded to S3. The unit is # of bytes. #### checkpoint_timeout Apart from `checkpoint_distance`, open layer flushing is also triggered `checkpoint_timeout` after the last flush. This makes WAL eventually uploaded to s3 when activity is stopped. The default is 10m. #### compaction_period Every `compaction_period` seconds, the page server checks if maintenance operations, like compaction, are needed on the layer files. Default is 1 s, which should be fine. #### compaction_target_size File sizes for L0 delta and L1 image layers. Default is 128MB. #### gc_horizon `gc_horizon` determines how much history is retained, to allow branching and read replicas at an older point in time. The unit is # of bytes of WAL. Page versions older than this are garbage collected away. #### gc_period Interval at which garbage collection is triggered. Default is 1 hour. #### image_creation_threshold L0 delta layer threshold for L1 image layer creation. Default is 3. #### pitr_interval WAL retention duration for PITR branching.
Default is 7 days. #### walreceiver_connect_timeout Time to wait to establish the wal receiver connection before failing. #### lagging_wal_timeout Time the pageserver did not get any WAL updates from safekeeper (if any). Avoids lagging pageserver preemptively by forcing to switch it from stalled connections. #### max_lsn_wal_lag Difference between Lsn values of the latest available WAL on safekeepers: if currently connected safekeeper starts to lag too long and too much, it gets swapped to the different one. #### initial_superuser_name Name of the initial superuser role, passed to initdb when a new tenant is initialized. It doesn't affect anything after initialization. Note: The default is 'cloud_admin', and the console depends on that, so if you change it, bad things will happen. #### page_cache_size Size of the page cache. Unit is number of 8 kB blocks. The default is 8192, which means 64 MB. #### max_file_descriptors Max number of file descriptors to hold open concurrently for accessing layer files. This should be kept well below the process/container/OS limit (see `ulimit -n`), as the pageserver also needs file descriptors for other files and for sockets for incoming connections. #### pg_distrib_dir A directory with Postgres installation to use during pageserver activities. Since pageserver supports several postgres versions, `pg_distrib_dir` contains a subdirectory for each version with naming convention `v{PG_MAJOR_VERSION}/`. Inside that dir, a `bin/postgres` binary should be present. The default distrib dir is `./pg_install/`. #### workdir (-D) A directory in the file system, where pageserver will store its files. The default is `./.neon/`. This parameter has a special CLI alias (`-D`) and can not be overridden with regular `-c` way. ##### Remote storage There's a way to automatically back up and restore some of the pageserver's data from working dir to the remote storage.
The backup system is disabled by default and can be enabled for either of the currently available storages: ###### Local FS storage Pageserver can back up and restore some of its workdir contents to another directory. For that, only a path to that directory needs to be specified as a parameter: ```toml [remote_storage] local_path = '/some/local/path/' ``` ###### S3 storage Pageserver can back up and restore some of its workdir contents to S3. Full set of S3 credentials is needed for that as parameters. Configuration example: ```toml [remote_storage] # Name of the bucket to connect to bucket_name = 'some-sample-bucket' # Name of the region where the bucket is located at bucket_region = 'eu-north-1' # A "subfolder" in the bucket, to use the same bucket separately by multiple pageservers at once. # Optional, pageserver uses entire bucket if the prefix is not specified. prefix_in_bucket = '/some/prefix/' # S3 API query limit to avoid getting errors/throttling from AWS. concurrency_limit = 100 ``` If no IAM bucket access is used during the remote storage usage, use the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables to set the access credentials. ###### General remote storage configuration Pageserver allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used. No default values are used for the remote storage configuration parameters. Besides, there are parameters common for all types of remote storage that can be configured, those have defaults: ```toml [remote_storage] # Max number of concurrent timeline synchronized (layers uploaded or downloaded) with the remote storage at the same time. max_concurrent_syncs = 50 # Max number of errors a single task can have before it's considered failed and not attempted to run anymore. 
max_sync_errors = 10 ``` ## safekeeper TODO ================================================ FILE: docs/sourcetree.md ================================================ ## Source tree layout Below you will find a brief overview of each subdir in the source tree in alphabetical order. `storage_broker`: Neon storage broker, providing messaging between safekeepers and pageservers. [storage_broker.md](./storage_broker.md) `storage_controller`: Neon storage controller, manages a cluster of pageservers and exposes an API that enables managing a many-sharded tenant as a single entity. `/control_plane`: Local control plane. Functions to start, configure and stop pageserver and postgres instances running as a local processes. Intended to be used in integration tests and in CLI tools for local installations. `/docs`: Documentation of the Neon features and concepts. Now it is mostly dev documentation. `/pageserver`: Neon storage service. The pageserver has a few different duties: - Store and manage the data. - Generate a tarball with files needed to bootstrap ComputeNode. - Respond to GetPage@LSN requests from the Compute Nodes. - Receive WAL from the WAL service and decode it. - Replay WAL that's applicable to the chunks that the Page Server maintains For more detailed info, see [pageserver-services.md](./pageserver-services.md) `/proxy`: Postgres protocol proxy/router. This service listens psql port, can check auth via external service and create new databases and accounts (control plane API in our case). `/test_runner`: Integration tests, written in Python using the `pytest` framework. `/vendor/postgres-v14` and `/vendor/postgres-v15`: PostgreSQL source tree per version, with the modifications needed for Neon. `/pgxn/neon`: PostgreSQL extension that implements storage manager API and network communications with remote page server. `/pgxn/neon_test_utils`: PostgreSQL extension that contains functions needed for testing and debugging. 
`/pgxn/neon_walredo`: Library to run Postgres as a "WAL redo process" in the pageserver. `/safekeeper`: The neon WAL service that receives WAL from a primary compute nodes and streams it to the pageserver. It acts as a holding area and redistribution center for recently generated WAL. For more detailed info, see [walservice.md](./walservice.md) `/workspace_hack`: The workspace_hack crate exists only to pin down some dependencies. We use [cargo-hakari](https://crates.io/crates/cargo-hakari) for automation. `/libs`: Unites granular neon helper crates under the hood. `/libs/postgres_ffi`: Utility functions for interacting with PostgreSQL file formats. Misc constants, copied from PostgreSQL headers. `/libs/utils`: Generic helpers that are shared between other crates in this repository. A subject for future modularization. `/libs/metrics`: Helpers for exposing Prometheus metrics from the server. ### Adding dependencies When you add a Cargo dependency, you should update hakari manifest by running commands below and committing the updated `Cargo.lock` and `workspace_hack/`. There may be no changes, that's fine. ```bash cargo hakari generate cargo hakari manage-deps ``` If you don't have hakari installed (`error: no such subcommand: hakari`), install it by running `cargo install cargo-hakari`. ### Checking Rust 3rd-parties [Cargo deny](https://embarkstudios.github.io/cargo-deny/index.html) is a cargo plugin that lets us lint project's dependency graph to ensure all dependencies conform to requirements. It detects security issues, matches licenses, and ensures crates only come from trusted sources. ```bash cargo deny check ``` ## Using Python Note that Debian/Ubuntu Python packages are stale, as it commonly happens, so manual installation of dependencies is not recommended. A single virtual environment with all dependencies is described in the single `Pipfile`. ### Prerequisites - Install Python 3.11 (the minimal supported version) or greater. 
- Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesn't work as expected. - If you have some trouble with another version, you can resolve it by installing Python 3.11 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.: ```bash # In Ubuntu sudo add-apt-repository ppa:deadsnakes/ppa sudo apt update sudo apt install python3.11 ``` - Install `poetry` - The exact version of `poetry` is not important; see the installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation). - Install dependencies via `./scripts/pysync`. - Note that CI uses a specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile)), so if you have a different version, some linting tools can yield different results locally vs in the CI. - You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.11`. This may also disable the `The currently activated Python version X.Y.Z is not supported by the project` warning. Run `poetry shell` to activate the virtual environment. Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`. ### Obligatory checks We force code formatting via `ruff`, and type hints via `mypy`. Run the following commands in the repository's root (next to `pyproject.toml`): ```bash poetry run ruff format . # All code is reformatted poetry run ruff check . # Python linter poetry run mypy . # Ensure there are no typing errors ``` **WARNING**: do not run `mypy` from a directory other than the root of the repository. Otherwise it will not find its configuration. Also consider: * Running `pycodestyle` (or a linter of your choice) and fixing possible defects, if any. * Adding more type hints to your code to avoid `Any`.
### Changing dependencies To add a new package or change an existing one, you can use `poetry add` or `poetry update` or edit `pyproject.toml` manually. Do not forget to run `poetry lock` in the latter case. More details are available in poetry's [documentation](https://python-poetry.org/docs/). ## Configuring IDEs Neon consists of three projects in different languages which use different project models. * A bunch of Rust crates, all available from the root `Cargo.toml`. * Integration tests in Python in the `test_runner` directory. Some stand-alone Python scripts exist as well. * Postgres and our Postgres extensions in C built with Makefiles under `vendor/postgres` and `pgxn`. ### CLion You can use CLion with the [Rust plugin](https://plugins.jetbrains.com/plugin/8182-rust) to develop Neon. It should pick up Rust and Python projects whenever you open Neon's repository as a project. We have not tried setting up a debugger, though. C code requires some extra care, as it's built via Make, not CMake. Some of our developers have successfully used [compilation database](https://www.jetbrains.com/help/clion/compilation-database.html#compdb_generate) for CLion. It is a JSON file which lists all C source files and corresponding compilation keys. CLion can use it instead of `CMakeLists.txt`. To set up a project with a compilation database: 1. Clone the Neon repository and install all dependencies, including Python. Do not open it with CLion just yet. 2. Run the following commands in the repository's root: ```bash # Install a `compiledb` tool which can parse make's output and generate the compilation database. poetry add -D compiledb # Clean the build tree so we can rebuild from scratch.
# Unfortunately, our and Postgres Makefiles do not work well with either --dry-run or --assume-new, # so we don't know a way to generate the compilation database without recompiling everything, # see https://github.com/neondatabase/neon/issues/2378#issuecomment-1241421325 make distclean # Rebuild the Postgres parts from scratch and save the compilation commands to the compilation database. # You can alter the -j parameter to your liking. # Note that we only build for a specific version of Postgres. The extension code is shared, but headers are # different, so we set up CLion to only use a specific version of the headers. make -j$(nproc) --print-directory postgres-v15 neon-pg-ext-v15 | poetry run compiledb --verbose --no-build # Uninstall the tool poetry remove -D compiledb # Make sure the compile_commands.json file is not committed. echo /compile_commands.json >>.git/info/exclude ``` 3. Open CLion, click "Open File or Project" and choose the generated `compile_commands.json` file to be opened "as a project". You cannot add a compilation database into an existing CLion project, you have to create a new one. _Do not_ open the directory as a project, open the file. 4. The newly created project should start indexing Postgres source code in C, as well as the C standard library. You may have to [configure the C compiler for the compilation database](https://www.jetbrains.com/help/clion/compilation-database.html#compdb_toolchain). 5. Open the `Cargo.toml` file in an editor in the same project. CLion should pick up the hint and start indexing Rust code. 6. Now you have a CLion project which knows about C files, Rust files. It should pick up Python files automatically as well. 7. Set up correct code indentation in CLion's settings: Editor > Code Style > C/C++, choose the "Project" scheme on the top, and tick the "Use tab character" on the "Tabs and Indents" tab. Ensure that "Tab size" is 4. 
You can also enable Cargo Clippy diagnostics and enable Rustfmt instead of built-in code formatter. Whenever you change the layout of C files, you may need to regenerate the compilation database. No need to re-create the CLion project, changes should be picked up automatically. Known issues (fixes and suggestions are welcome): * Test results may be hard to read in CLion, both for unit tests in Rust and integration tests in Python. Use command line to run them instead. * CLion does not support non-local Python interpreters, unlike PyCharm. E.g. if you use WSL, CLion does not see `poetry` and installed dependencies. Python support is limited. * Cargo Clippy diagnostics in CLion may take a lot of resources. * `poetry add -D` updates some packages and changes `poetry.lock` drastically even when followed by `poetry remove -D`. Feel free to `git checkout poetry.lock` and `./scripts/pysync` to revert these changes. ================================================ FILE: docs/storage_broker.md ================================================ # Storage broker Storage broker targets two issues: - Allowing safekeepers and pageservers to learn which nodes also hold their timelines, and timeline statuses there. - Avoiding O(n^2) connections between storage nodes while doing so. This is used - By pageservers to determine the most advanced and alive safekeeper to pull WAL from. - By safekeepers to synchronize on the timeline: advance `remote_consistent_lsn`, `backup_lsn`, choose who offloads WAL to s3. Technically, it is a simple stateless pub-sub message broker based on tonic (grpc) making multiplexing easy. Since it is stateless, fault tolerance can be provided by k8s; there is no built-in replication support, though it is not hard to add. Currently, the only message is `SafekeeperTimelineInfo`. Each safekeeper, for each active timeline, once in a while pushes timeline status to the broker. Other nodes subscribe and receive this info, using it per above.
Broker serves /metrics on the same port as grpc service. grpcurl can be used to check which values are currently being pushed: ``` grpcurl -proto broker/proto/broker.proto -d '{"all":{}}' -plaintext localhost:50051 storage_broker.BrokerService/SubscribeSafekeeperInfo ``` ================================================ FILE: docs/storage_controller.md ================================================ # Storage Controller ## Concepts The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller, which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations). It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding the underlying details of how data is spread across multiple nodes. The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent. ## APIs The storage controller’s HTTP server implements four logically separate APIs: - `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that’s where clients expect to find it on a pageserver. - `/control/v1/...` path is the storage controller’s API, which enables operations such as registering and managing pageservers, or executing shard splits.
- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system. - `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers to ensure data safety with generation numbers. The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers’ APIs). See the `http.rs` file in the source for where the HTTP APIs are implemented. ## Database The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and rebuilt on startup. The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why. The `diesel` crate is used for defining models & migrations. Running a local cluster with `cargo neon` automatically starts a vanilla postgres process to host the storage controller’s database. ### Diesel tip: migrations If you need to modify the database schema, here’s how to create a migration: - Install the diesel CLI with `cargo install diesel_cli` - Use `diesel migration generate <name>` to create a new migration - Populate the SQL files in the `migrations/` subdirectory - Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/storage_controller` - Commit the migration files and the changes to schema.rs - If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again. - The migrations are built into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed. ## storcon_cli The `storcon_cli` tool enables interactive management of the storage controller. This is usually only necessary for debugging, but may also be used to manage nodes (e.g. marking a node as offline). `storcon_cli --help` includes details on commands. # Deploying This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as part of a self-hosted system. _General note: since the default `neon_local` environment includes a storage controller, this is a useful reference when figuring out deployment._ ## Database It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver. The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte. Set the URL to the database using the `--database-url` CLI option. There is no need to run migrations manually: the storage controller automatically applies migrations when it starts up. ## Configure pageservers to use the storage controller 1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file.
The API setting should point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters. 2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself with the storage controller when it starts up. See the example below for the format of this file. ### Example `metadata.json` ``` {"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000} ``` - `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever postgres runs. - `http_port` and `http_host` refer to the pageserver's HTTP api, this must be accessible from where the storage controller runs. ## Handle compute notifications. The storage controller independently moves tenant attachments between pageservers in response to changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver location changes. The hook is configured using the storage controller's `--control-plane-url` CLI option, from which the hook URL is computed. Currently, there are two hooks, each computed by appending the name to the provided control plane URL prefix: - `notify-attach`, called whenever attachment for pageservers changes - `notify-safekeepers`, called whenever attachment for safekeepers changes If the hooks require JWT auth, the token may be provided with `--control-plane-jwt-token`. The hooks will be invoked with a `PUT` request. In the Neon cloud service, these hooks are implemented by Neon's internal cloud control plane. In `neon_local` systems, the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling the compute hook. When implementing an on-premise Neon deployment, you must implement a service that handles the compute hooks.
This is not complicated. ### `notify-attach` body The `notify-attach` request body follows the format of the `ComputeHookNotifyRequest` structure, provided below for convenience. ``` struct ComputeHookNotifyRequestShard { node_id: NodeId, shard_number: ShardNumber, } struct ComputeHookNotifyRequest { tenant_id: TenantId, stripe_size: Option<ShardStripeSize>, shards: Vec<ComputeHookNotifyRequestShard>, } ``` When a notification is received: 1. Modify postgres configuration for this tenant: - set `neon.pageserver_connstring` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The shards identified by `NodeId` must be converted to the address+port of the node. - if stripe_size is not None, set `neon.shard_stripe_size` to this value 2. Send SIGHUP to postgres to reload configuration 3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller will retry the notification until it succeeds. Example body: ``` { "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc", "stripe_size": 2048, "shards": [ {"node_id": 344, "shard_number": 0}, {"node_id": 722, "shard_number": 1} ] } ``` ### `notify-safekeepers` body The `notify-safekeepers` request body follows the format of the `SafekeepersNotifyRequest` structure, provided below for convenience. ``` pub struct SafekeeperInfo { pub id: NodeId, pub hostname: String, } pub struct SafekeepersNotifyRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub generation: u32, pub safekeepers: Vec<SafekeeperInfo>, } ``` When a notification is received: 1. Modify postgres configuration for this tenant: - set `neon.safekeeper_connstrings` to an array of postgres connection strings to safekeepers according to the `safekeepers` list. The safekeepers identified by `NodeId` must be converted to the address+port of the respective safekeeper. The hostname is provided for debugging purposes, so we reserve the right to change how we pass it.
- set `neon.safekeepers_generation` to the provided `generation` value. 2. Send SIGHUP to postgres to reload configuration 3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller will retry the notification until it succeeds. ================================================ FILE: docs/synthetic-size.md ================================================ # Synthetic size Neon storage has copy-on-write branching, which makes it difficult to answer the question "how large is my database"? To give one reasonable answer, we calculate _synthetic size_ for a project. The calculation is called "synthetic", because it is based purely on the user-visible logical size, which is the size that you would see on a standalone PostgreSQL installation, and the amount of WAL, which is also the same as what you'd see on a standalone PostgreSQL, for the same set of updates. The synthetic size does *not* depend on the actual physical size consumed in the storage, or implementation details of the Neon storage like garbage collection, compaction and compression. There is a strong *correlation* between the physical size and the synthetic size, but the synthetic size is designed to be independent of the implementation details, so that any improvements we make in the storage system simply reduce our COGS. And vice versa: any bugs or bad implementation where we keep more data than we would need to, do not change the synthetic size or incur any costs to the user. The synthetic size is calculated for the whole project. It is not straightforward to attribute size to individual branches. See [What is the size of an individual branch?](#what-is-the-size-of-an-individual-branch) for a discussion of those difficulties. The synthetic size is designed to: - Take into account the copy-on-write nature of the storage. For example, if you create a branch, it doesn't immediately add anything to the synthetic size.
It starts to affect the synthetic size only as it diverges from the parent branch. - Be independent of any implementation details of the storage, like garbage collection, remote storage, or compression. ## Terms & assumptions - logical size is the size of a branch *at a given point in time*. It's the total size of all tables in all databases, as you see with "\l+" in psql for example, plus the Postgres SLRUs and some small amount of metadata. Note that currently, Neon does not include the SLRUs and metadata in the logical size. Refer to the comment in [`get_current_logical_size_non_incremental()`](/pageserver/src/pgdatadir_mapping.rs#L813-L814). - a "point in time" is defined as an LSN value. You can convert a timestamp to an LSN, but the storage internally works with LSNs. - PITR horizon can be set per-branch. - PITR horizon can be set as a time interval, e.g. 5 days or hours, or as amount of WAL, in bytes. If it's given as a time interval, it's converted to an LSN for the calculation. - PITR horizon can be set to 0, if you don't want to retain any history. ## Calculation Inputs to the calculation are: - logical size of the database at different points in time, - amount of WAL generated, and - the PITR horizon settings The synthetic size is based on an idealistic model of the storage system, where we pretend that the storage consists of two things: - snapshots, containing a full snapshot of the database, at a given point in time, and - WAL. In the simple case that the project contains just one branch (main), and a fixed PITR horizon, the synthetic size is the sum of: - the logical size of the branch *at the beginning of the PITR horizon*, i.e. at the oldest point that you can still recover to, and - the size of the WAL covering the PITR horizon. The snapshot allows you to recover to the beginning of the PITR horizon, and the WAL allows you to recover from that point to any point within the horizon. 
``` WAL -----------------------#########> ^ snapshot Legend: ##### PITR horizon. This is the region that you can still access with Point-in-time query and you can still create branches from. ----- history that has fallen out of the PITR horizon, and can no longer be accessed ``` NOTE: This is not how the storage system actually works! The actual implementation is also based on snapshots and WAL, but the snapshots are taken for individual database pages and ranges of pages rather than the whole database, and it is much more complicated. This model is a reasonable approximation, however, to make the synthetic size a useful proxy for the actual storage consumption. ## Example: Data is INSERTed For example, let's assume that your database contained 10 GB of data at the beginning of the PITR horizon, and you have since then inserted 5 GB of additional data into it. The additional insertions of 5 GB of data consume roughly 5 GB of WAL. In that case, the synthetic size is: > 10 GB (snapshot) + 5 GB (WAL) = 15 GB If you now set the PITR horizon on the project to 0, so that no historical data is retained, then the beginning of the PITR horizon would be at the end of the branch, so the size of the snapshot would be calculated at the end of the branch, after the insertions. Then the synthetic size is: > 15 GB (snapshot) + 0 GB (WAL) = 15 GB. In this case, the synthetic size is the same, regardless of the PITR horizon, because all the history consists of inserts. The newly inserted data takes up the same amount of space, whether it's stored as part of the logical snapshot, or as WAL. (*) (*) This is a rough approximation. In reality, the WAL contains headers and other overhead, and on the other hand, the logical snapshot includes empty space on pages, so the size of insertions in WAL can be smaller or greater than the size of the final table after the insertions. But in most cases, it's in the same ballpark.
## Example: Data is DELETEd Let's look at another example: Let's start again with a database that contains 10 GB of data. Then, you DELETE 5 GB of the data, and run VACUUM to free up the space, so that the logical size of the database is now only 5 GB. Let's assume that the WAL for the deletions and the vacuum take up 100 MB of space. In that case, the synthetic size of the project is: > 10 GB (snapshot) + 100 MB (WAL) = 10.1 GB This is much larger than the logical size of the database after the deletions (5 GB). That's because the system still needs to retain the deleted data, because it's still accessible to queries and branching in the PITR window. If you now set the PITR horizon to 0 or just wait for time to pass so that the data falls out of the PITR horizon, making the deleted data inaccessible, the synthetic size shrinks: > 5 GB (snapshot) + 0 GB (WAL) = 5 GB # Branching Things get more complicated with branching. Branches in Neon are copy-on-write, which is also reflected in the synthetic size. When you create a branch, it doesn't immediately change the synthetic size at all. The branch point is within the PITR horizon, and all the data needed to recover to that point in time needs to be retained anyway. However, if you make modifications on the branch, the system needs to keep the WAL of those modifications. The WAL is included in the synthetic size. ## Example: branch and INSERT Let's assume that you again start with a 10 GB database. On the main branch, you insert 2 GB of data. 
Then you create a branch at that point, and insert another 3 GB of data on the main branch, and 1 GB of data on the child branch ``` child +#####> | | WAL main ---------###############> ^ snapshot ``` In this case, the synthetic size consists of: - the snapshot at the beginning of the PITR horizon (10 GB) - the WAL on the main branch (2 GB + 3 GB = 5 GB) - the WAL on the child branch (1 GB) Total: 16 GB # Diverging branches If there is only a small amount of changes in the database on the different branches, as in the previous example, the synthetic size consists of a snapshot before the branch point, containing all the shared data, and the WAL on both branches. However, if the branches diverge a lot, it is more efficient to store a separate snapshot of branches. ## Example: diverging branches You start with a 10 GB database. You insert 5 GB of data on the main branch. Then you create a branch, and immediately delete all the data on the child branch and insert 5 GB of new data to it. Then you do the same on the main branch. Let's assume that the PITR horizon requires keeping the last 1 GB of WAL on the both branches. ``` snapshot v WAL child +---------##############> | | main -------------+---------##############> ^ WAL snapshot ``` In this case, the synthetic size consists of: - snapshot at the beginning of the PITR horizon on the main branch (4 GB) - WAL on the main branch (1 GB) - snapshot at the beginning of the PITR horizon on the child branch (4 GB) - last 1 GB of WAL on the child branch (1 GB) Total: 10 GB The alternative way to store this would be to take only one snapshot at the beginning of branch point, and keep all the WAL on both branches. However, the size with that method would be larger, as it would require one 10 GB snapshot, and 5 GB + 5 GB of WAL. It depends on the amount of changes (WAL) on both branches, and the logical size at the branch point, which method would result in a smaller synthetic size. 
On each branch point, the system performs the calculation with both methods, and uses the method that is cheaper, i.e. the one that results in a smaller synthetic size. One way to think about this is that when you create a branch, it starts out as a thin branch that only stores the WAL since the branch point. As you modify it, and the amount of WAL grows, at some point it becomes cheaper to store a completely new snapshot of the branch and truncate the WAL. # What is the size of an individual branch? Synthetic size is calculated for the whole project, and includes all branches. There is no such thing as the size of a branch, because it is not straightforward to attribute the parts of size to individual branches. ## Example: attributing size to branches (copied from https://github.com/neondatabase/neon/pull/2884#discussion_r1029365278) Imagine that you create two branches, A and B, at the same point from main branch, and do a couple of small updates on both branches. Then six months pass, and during those six months the data on the main branch churns over completely multiple times. The retention period is, say 1 month. ``` +------> A / --------------------*-------------------------------> main \ +--------> B ``` In that situation, the synthetic tenant size would be calculated based on a "logical snapshot" at the branch point, that is, the logical size of the database at that point. Plus the WAL on branches A and B. Let's say that the snapshot size is 10 GB, and the WAL is 1 MB on both branches A and B. So the total synthetic storage size is 10002 MB. (Let's ignore the main branch for now, that would be just added to the sum) How would you break that down per branch? I can think of three different ways to do it, and all of them have their own problems: ### Subtraction method For each branch, calculate how much smaller the total synthetic size would be, if that branch didn't exist. In other words, how much would you save if you dropped the branch. 
With this method, the size of branches A and B is 1 MB. With this method, the 10 GB shared logical snapshot is not included for A nor B. So the size of all branches is not equal to the total synthetic size of the tenant. If you drop branch A, you save 1 MB as you'd expect, but also the size of B suddenly jumps from 1 MB to 10001 MB, which might feel surprising. ### Division method Divide the common parts evenly across all branches that need them. With this method, the size of branches A and B would be 5001 MB. With this method, the sum of all branches adds up to the total synthetic size. But it's surprising in other ways: if you drop branch A, you might think that you save 5001 MB, but in reality you only save 1 MB, and the size of branch B suddenly grows from 5001 to 10001 MB. ### Addition method For each branch, include all the snapshots and WAL that it depends on, even if some of them are shared by other branches. With this method, the size of branches A and B would be 10001 MB. The surprise with this method is that the sum of all the branches is larger than the total synthetic size. And if you drop branch A, the total synthetic size doesn't fall by 10001 MB as you might think. # Alternatives A sort of cop-out method would be to show the whole tree of branches graphically, and for each section of WAL or logical snapshot, display the size of that section. You can then see which branches depend on which sections, which sections are shared etc. That would be good to have in the UI anyway. Or perhaps calculate per-branch numbers using the subtraction method, and in addition to that, one more number for "shared size" that includes all the data that is needed by more than one branch. ## Which is the right method? The bottom line is that it's not straightforward to attribute the synthetic size to individual branches. There are things we can do, and all of those methods are pretty straightforward to implement, but they all have their own problems. 
What makes sense depends a lot on what you want to do with the number, what question you are trying to answer. ================================================ FILE: docs/tools.md ================================================ # Useful development tools This readme contains some hints on how to set up some optional development tools. ## ccls [ccls](https://github.com/MaskRay/ccls) is a c/c++ language server. It requires some setup to work well. There are different ways to do it but here's what works for me: 1. Make a common parent directory for all your common neon projects. (for example, `~/src/neondatabase/`) 2. Go to `vendor/postgres-v15` 3. Run `make clean && ./configure` 4. Install [bear](https://github.com/rizsotto/Bear), and run `bear -- make -j4` 5. Copy the generated `compile_commands.json` to `~/src/neondatabase` (or equivalent) 6. Run `touch ~/src/neondatabase/.ccls-root` this will make the `compile_commands.json` file discoverable in all subdirectories With this setup you will get decent lsp mileage inside the postgres repo, and also any postgres extensions that you put in `~/src/neondatabase/`, like `pg_embedding`, or inside `~/src/neondatabase/neon/pgxn` as well. Some additional tips for various IDEs: ### Emacs To improve performance: `(setq lsp-lens-enable nil)` ================================================ FILE: docs/updating-postgres.md ================================================ # Updating Postgres ## Minor Versions When upgrading to a new minor version of Postgres, please follow these steps: _Example: 15.4 is the new minor version to upgrade to from 15.3._ 1. Clone the Neon Postgres repository if you have not done so already. ```shell git clone git@github.com:neondatabase/postgres.git ``` 1. Add the Postgres upstream remote. ```shell git remote add upstream https://git.postgresql.org/git/postgresql.git ``` 1. Create a new branch based on the stable branch you are updating. ```shell git checkout -b my-branch-15 REL_15_STABLE_neon ``` 1. 
Find the upstream release tags you're looking for. They are of the form `REL_X_Y`.

1. Merge the upstream tag into the branch you created and resolve any conflicts.

    ```shell
    git fetch upstream REL_15_4
    git merge REL_15_4
    ```

    In the commit message of the merge commit, mention if there were any non-trivial conflicts or other issues.

1. Run the Postgres test suite to make sure our commits have not affected Postgres in a negative way.

    ```shell
    make check
    # OR
    meson test -C builddir
    ```

1. Push your branch to the Neon Postgres repository.

    ```shell
    git push origin my-branch-15
    ```

1. Clone the Neon repository if you have not done so already.

    ```shell
    git clone git@github.com:neondatabase/neon.git
    ```

1. Create a new branch.

1. Change the `revisions.json` file to point at the HEAD of your Postgres branch.

1. Update the Git submodule.

    ```shell
    git submodule set-branch --branch my-branch-15 vendor/postgres-v15
    git submodule update --remote vendor/postgres-v15
    ```

1. Run the Neon test suite to make sure that Neon is still good to go on this minor Postgres release.

    ```shell
    ./scripts/poetry -k pg15
    ```

1. Commit your changes.

1. Create a pull request, and wait for CI to go green.

1. Push the Postgres branches with the merge commits into the Neon Postgres repository.

    ```shell
    git push origin my-branch-15:REL_15_STABLE_neon
    ```

1. Update your Neon PR to point at the branches.

    ```shell
    git submodule set-branch --branch REL_15_STABLE_neon vendor/postgres-v15
    git commit --amend --no-edit
    git push --force origin
    ```

1. Merge the pull request after getting approval(s) and CI completion.

================================================
FILE: docs/walservice.md
================================================
# WAL service

The neon WAL service acts as a holding area and redistribution center for recently generated WAL. The primary Postgres server streams the WAL to the WAL safekeeper, and treats it like a (synchronous) replica.
A replication slot is used in the primary to prevent the primary from discarding WAL that hasn't been streamed to the WAL service yet.

```
+--------------+              +------------------+
|              |     WAL      |                  |
| Compute node |  ----------> |   WAL Service    |
|              |              |                  |
+--------------+              +------------------+
                                     |
                                     |
                                     | WAL
                                     |
                                     |
                                     V
                              +--------------+
                              |              |
                              | Pageservers  |
                              |              |
                              +--------------+
```

The WAL service consists of multiple WAL safekeepers that all store a copy of the WAL. A WAL record is considered durable when the majority of safekeepers have received and stored the WAL to local disk. A consensus algorithm based on Paxos is used to manage the quorum.

```
+-------------------------------------------+
|                WAL Service                |
|                                           |
|                                           |
|              +------------+               |
|              | safekeeper |               |
|              +------------+               |
|                                           |
|              +------------+               |
|              | safekeeper |               |
|              +------------+               |
|                                           |
|              +------------+               |
|              | safekeeper |               |
|              +------------+               |
|                                           |
+-------------------------------------------+
```

The primary connects to the WAL safekeepers, so it works in a "push" fashion. That's different from how streaming replication usually works, where the replica initiates the connection. To do that, there is a component called the "WAL proposer". The WAL proposer is a background worker that runs in the primary Postgres server. It connects to the WAL safekeeper, and sends all the WAL. (PostgreSQL's `archive_command` works in the "push" style, but it operates on a WAL segment granularity. If PostgreSQL had a push style API for streaming, the WAL proposer could be implemented using it.)

The Page Server connects to the WAL safekeeper, using the same streaming replication protocol that's used between Postgres primary and standby. You can also connect the Page Server directly to a primary PostgreSQL node for testing.
In a production installation, there are multiple WAL safekeepers running on different nodes, and there is a quorum mechanism using the Paxos algorithm to ensure that a piece of WAL is considered as durable only after it has been flushed to disk on more than half of the WAL safekeepers. The Paxos and crash recovery algorithm ensures that only one primary node can be actively streaming WAL to the quorum of safekeepers.

See [this section](safekeeper-protocol.md) for a more detailed description of the consensus protocol. `spec/` contains a TLA+ specification of it.

# Q&A

Q: Why have a separate service instead of connecting Page Server directly to a primary PostgreSQL node?

A: Page Server is a single server which can be lost. As our primary fault-tolerant storage is S3, we do not want to wait for it before committing a transaction. The WAL service acts as a temporary fault-tolerant storage for recent data before it gets to the Page Server and then finally to S3. Whenever WALs and pages are committed to S3, WAL's storage can be trimmed.

Q: What if the compute node evicts a page, needs it back, but the page is yet to reach the Page Server?

A: If the compute node has evicted a page, changes to it have been WAL-logged (that's why it is called Write Ahead logging; there are some exceptions like index builds, but these are exceptions). These WAL records will eventually reach the Page Server. The Page Server notes that the compute node requests pages with a very recent LSN and will not respond to the compute node until a corresponding WAL is received from WAL safekeepers.

Q: How long may Page Server wait for?

A: Not too long, hopefully. If a page is evicted, it probably was not used for a while, so the WAL service has had enough time to push changes to the Page Server. To limit the lag, tune backpressure using `max_replication_*_lag` settings.

Q: How do WAL safekeepers communicate with each other?
A: They may only send each other messages via the compute node, they never communicate directly with each other. Q: Why have a consensus algorithm if there is only a single compute node? A: Actually there may be moments with multiple PostgreSQL nodes running at the same time. E.g. we are bringing one up and one down. We would like to avoid simultaneous writes from different nodes, so there should be a consensus on who is the primary node. # Terminology WAL service - The service as whole that ensures that WAL is stored durably. WAL safekeeper - One node that participates in the quorum. All the safekeepers together form the WAL service. WAL acceptor, WAL proposer - In the context of the consensus algorithm, the Postgres compute node is also known as the WAL proposer, and the safekeeper is also known as the acceptor. Those are the standard terms in the Paxos algorithm. ================================================ FILE: endpoint_storage/Cargo.toml ================================================ [package] name = "endpoint_storage" version = "0.0.1" edition.workspace = true license.workspace = true [dependencies] anyhow.workspace = true axum-extra.workspace = true axum.workspace = true camino.workspace = true clap.workspace = true futures.workspace = true jsonwebtoken.workspace = true prometheus.workspace = true remote_storage.workspace = true serde.workspace = true serde_json.workspace = true tokio-util.workspace = true tokio.workspace = true tracing.workspace = true utils = { path = "../libs/utils", default-features = false } workspace_hack.workspace = true [dev-dependencies] camino-tempfile.workspace = true http-body-util.workspace = true itertools.workspace = true rand.workspace = true test-log.workspace = true tower.workspace = true ================================================ FILE: endpoint_storage/src/app.rs ================================================ use anyhow::anyhow; use axum::body::{Body, Bytes}; use axum::response::{IntoResponse, Response}; use 
axum::{Router, http::StatusCode}; use endpoint_storage::{PrefixS3Path, S3Path, Storage, bad_request, internal_error, not_found, ok}; use remote_storage::TimeoutOrCancel; use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, RemotePath}; use std::{sync::Arc, time::SystemTime, time::UNIX_EPOCH}; use tokio_util::sync::CancellationToken; use tracing::{error, info}; use utils::backoff::retry; pub fn app(state: Arc) -> Router<()> { use axum::routing::{delete as _delete, get as _get}; let delete_prefix = _delete(delete_prefix); // NB: On any changes do not forget to update the OpenAPI spec // in /endpoint_storage/src/openapi_spec.yml. Router::new() .route( "/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}", _get(get).put(set).delete(delete), ) .route( "/{tenant_id}/{timeline_id}/{endpoint_id}", delete_prefix.clone(), ) .route("/{tenant_id}/{timeline_id}", delete_prefix.clone()) .route("/{tenant_id}", delete_prefix) .route("/metrics", _get(metrics)) .route("/status", _get(async || StatusCode::OK.into_response())) .with_state(state) } type Result = anyhow::Result; type State = axum::extract::State>; const CONTENT_TYPE: &str = "content-type"; const APPLICATION_OCTET_STREAM: &str = "application/octet-stream"; const WARN_THRESHOLD: u32 = 3; const MAX_RETRIES: u32 = 10; async fn metrics() -> Result { prometheus::TextEncoder::new() .encode_to_string(&prometheus::gather()) .map(|s| s.into_response()) .map_err(|e| internal_error(e, "/metrics", "collecting metrics")) } async fn get(S3Path { path }: S3Path, state: State) -> Result { info!(%path, "downloading"); let download_err = |err| { if let DownloadError::NotFound = err { info!(%path, %err, "downloading"); // 404 is not an issue of _this_ service return not_found(&path); } internal_error(err, &path, "downloading") }; let cancel = state.cancel.clone(); let opts = &DownloadOpts::default(); let stream = retry( async || state.storage.download(&path, opts, &cancel).await, DownloadError::is_permanent, WARN_THRESHOLD, 
MAX_RETRIES,
        "downloading",
        &cancel,
    )
    .await
    .unwrap_or(Err(DownloadError::Cancelled))
    .map_err(download_err)?
    .download_stream;

    Response::builder()
        .status(StatusCode::OK)
        .header(CONTENT_TYPE, APPLICATION_OCTET_STREAM)
        .body(Body::from_stream(stream))
        .map_err(|e| internal_error(e, path, "reading response"))
}

// Best solution for files is multipart upload, but remote_storage doesn't support it,
// so we can either read Bytes in memory and push at once or forward BodyDataStream to
// remote_storage. The latter may seem more performant, but BodyDataStream doesn't have a
// guaranteed size() which may produce issues while uploading to s3.
// So, currently we're going with an in-memory copy plus a boundary to prevent uploading
// very large files.

/// Handler for `PUT /{tenant_id}/{timeline_id}/{endpoint_id}/{*path}`: stores the request
/// body at `path`.
///
/// * A body longer than `state.max_upload_file_limit` is rejected up front via
///   `bad_request` and nothing is uploaded.
/// * The upload goes through `retry` with `WARN_THRESHOLD` / `MAX_RETRIES`, passing
///   `TimeoutOrCancel::caused_by_cancel` as the permanent-error predicate.
///   NOTE(review): exact back-off semantics live in `utils::backoff::retry` — confirm there.
/// * A `None` from `retry` is mapped to an "uploading cancelled" error.
/// * Any resulting error is reported through `internal_error`.
async fn set(S3Path { path }: S3Path, state: State, bytes: Bytes) -> Result {
    info!(%path, "uploading");

    let request_len = bytes.len();
    let max_len = state.max_upload_file_limit;
    if request_len > max_len {
        return Err(bad_request(
            anyhow!("File size {request_len} exceeds max {max_len}"),
            "uploading",
        ));
    }

    let cancel = state.cancel.clone();
    // `bytes` is cloned per attempt so that every retry can send the full body again.
    let fun = async || {
        let stream = bytes_to_stream(bytes.clone());
        state
            .storage
            .upload(stream, request_len, &path, None, &cancel)
            .await
    };
    retry(
        fun,
        TimeoutOrCancel::caused_by_cancel,
        WARN_THRESHOLD,
        MAX_RETRIES,
        "uploading",
        &cancel,
    )
    .await
    .unwrap_or(Err(anyhow!("uploading cancelled")))
    // This failure comes from the upload, so describe it as such (the description used
    // to be "reading response", copied from `get`).
    .map_err(|e| internal_error(e, path, "uploading"))?;
    Ok(ok())
}

/// Handler for `DELETE /{tenant_id}/{timeline_id}/{endpoint_id}/{*path}`: removes the
/// object at `path`.
///
/// The storage call goes through `retry` with the same thresholds and permanent-error
/// predicate as `set`; a `None` from `retry` is mapped to a "deleting cancelled" error,
/// and any resulting error is reported through `internal_error`.
async fn delete(S3Path { path }: S3Path, state: State) -> Result {
    info!(%path, "deleting");
    let cancel = state.cancel.clone();
    retry(
        async || state.storage.delete(&path, &cancel).await,
        TimeoutOrCancel::caused_by_cancel,
        WARN_THRESHOLD,
        MAX_RETRIES,
        "deleting",
        &cancel,
    )
    .await
    .unwrap_or(Err(anyhow!("deleting cancelled")))
    .map_err(|e| internal_error(e, path, "deleting"))?;
    Ok(ok())
}

async fn delete_prefix(PrefixS3Path { path }: PrefixS3Path, state: State) -> Result {
    info!(%path, "deleting prefix");
    let cancel =
state.cancel.clone(); retry( async || state.storage.delete_prefix(&path, &cancel).await, TimeoutOrCancel::caused_by_cancel, WARN_THRESHOLD, MAX_RETRIES, "deleting prefix", &cancel, ) .await .unwrap_or(Err(anyhow!("deleting prefix cancelled"))) .map_err(|e| internal_error(e, path, "deleting prefix"))?; Ok(ok()) } pub async fn check_storage_permissions( client: &GenericRemoteStorage, cancel: CancellationToken, ) -> anyhow::Result<()> { info!("storage permissions check"); // as_nanos() as multiple instances proxying same bucket may be started at once let now = SystemTime::now() .duration_since(UNIX_EPOCH)? .as_nanos() .to_string(); let path = RemotePath::from_string(&format!("write_access_{now}"))?; info!(%path, "uploading"); let body = now.to_string(); let stream = bytes_to_stream(Bytes::from(body.clone())); client .upload(stream, body.len(), &path, None, &cancel) .await?; use tokio::io::AsyncReadExt; info!(%path, "downloading"); let download_opts = DownloadOpts { kind: remote_storage::DownloadKind::Small, ..Default::default() }; let mut body_read_buf = Vec::new(); let stream = client .download(&path, &download_opts, &cancel) .await? 
.download_stream; tokio_util::io::StreamReader::new(stream) .read_to_end(&mut body_read_buf) .await?; let body_read = String::from_utf8(body_read_buf)?; if body != body_read { error!(%body, %body_read, "File contents do not match"); anyhow::bail!("Read back file doesn't match original") } info!(%path, "removing"); client.delete(&path, &cancel).await } fn bytes_to_stream(bytes: Bytes) -> impl futures::Stream> { futures::stream::once(futures::future::ready(Ok(bytes))) } #[cfg(test)] mod tests { use super::*; use axum::{body::Body, extract::Request, response::Response}; use http_body_util::BodyExt; use itertools::iproduct; use std::env::var; use std::sync::Arc; use std::time::Duration; use test_log::test as testlog; use tower::{Service, util::ServiceExt}; use utils::id::{TenantId, TimelineId}; // see libs/remote_storage/tests/test_real_s3.rs const REAL_S3_ENV: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; const REAL_S3_BUCKET: &str = "REMOTE_STORAGE_S3_BUCKET"; const REAL_S3_REGION: &str = "REMOTE_STORAGE_S3_REGION"; async fn proxy() -> (Storage, Option) { let cancel = CancellationToken::new(); let (dir, storage) = if var(REAL_S3_ENV).is_err() { // tests execute in parallel and we need a new directory for each of them let dir = camino_tempfile::tempdir().unwrap(); let fs = remote_storage::LocalFs::new(dir.path().into(), Duration::from_secs(5)).unwrap(); (Some(dir), GenericRemoteStorage::LocalFs(fs)) } else { // test_real_s3::create_s3_client is hard to reference, reimplementing here let millis = SystemTime::now() .duration_since(UNIX_EPOCH) .unwrap() .as_millis(); use rand::Rng; let random = rand::rng().random::(); let s3_config = remote_storage::S3Config { bucket_name: var(REAL_S3_BUCKET).unwrap(), bucket_region: var(REAL_S3_REGION).unwrap(), prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")), endpoint: None, concurrency_limit: std::num::NonZeroUsize::new(100).unwrap(), max_keys_per_list_response: None, upload_storage_class: None, }; let bucket = 
remote_storage::S3Bucket::new(&s3_config, Duration::from_secs(1)) .await .unwrap(); (None, GenericRemoteStorage::AwsS3(Arc::new(bucket))) }; let proxy = Storage { auth: endpoint_storage::JwtAuth::new(TEST_PUB_KEY_ED25519).unwrap(), storage, cancel: cancel.clone(), max_upload_file_limit: usize::MAX, }; check_storage_permissions(&proxy.storage, cancel) .await .unwrap(); (proxy, dir) } // see libs/utils/src/auth.rs const TEST_PUB_KEY_ED25519: &[u8] = b" -----BEGIN PUBLIC KEY----- MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w= -----END PUBLIC KEY----- "; const TEST_PRIV_KEY_ED25519: &[u8] = br#" -----BEGIN PRIVATE KEY----- MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH -----END PRIVATE KEY----- "#; async fn request(req: Request) -> Response { let (proxy, _) = proxy().await; app(Arc::new(proxy)) .into_service() .oneshot(req) .await .unwrap() } #[testlog(tokio::test)] async fn status() { let res = Request::builder() .uri("/status") .body(Body::empty()) .map(request) .unwrap() .await; assert_eq!(res.status(), StatusCode::OK); } fn routes() -> impl Iterator { iproduct!( vec!["/1", "/1/2", "/1/2/3", "/1/2/3/4"], vec!["GET", "PUT", "DELETE"] ) } #[testlog(tokio::test)] async fn no_token() { for (uri, method) in routes() { info!(%uri, %method); let res = Request::builder() .uri(uri) .method(method) .body(Body::empty()) .map(request) .unwrap() .await; assert!(matches!( res.status(), StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST )); } } #[testlog(tokio::test)] async fn invalid_token() { for (uri, method) in routes() { info!(%uri, %method); let status = Request::builder() .uri(uri) .header("Authorization", "Bearer 123") .method(method) .body(Body::empty()) .map(request) .unwrap() .await; assert!(matches!( status.status(), StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST )); } } const TENANT_ID: TenantId = TenantId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]); const TIMELINE_ID: TimelineId = 
TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]); const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg"; fn token() -> String { let claims = endpoint_storage::claims::EndpointStorageClaims { tenant_id: TENANT_ID, timeline_id: TIMELINE_ID, endpoint_id: ENDPOINT_ID.into(), exp: u64::MAX, }; let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap(); let header = jsonwebtoken::Header::new(endpoint_storage::VALIDATION_ALGO); jsonwebtoken::encode(&header, &claims, &key).unwrap() } #[testlog(tokio::test)] async fn unauthorized() { let (proxy, _) = proxy().await; let mut app = app(Arc::new(proxy)).into_service(); let token = token(); let args = itertools::iproduct!( vec![TENANT_ID.to_string(), TenantId::generate().to_string()], vec![TIMELINE_ID.to_string(), TimelineId::generate().to_string()], vec![ENDPOINT_ID, "ep-ololo"] ) // first one is fully valid path, second path is valid for GET as // read paths may have different endpoint if tenant and timeline matches // (needed for prewarming RO->RW replica) .skip(2); for ((uri, method), (tenant, timeline, endpoint)) in iproduct!(routes(), args) { info!(%uri, %method, %tenant, %timeline, %endpoint); let request = Request::builder() .uri(format!("/{tenant}/{timeline}/{endpoint}/sub/path/key")) .method(method) .header("Authorization", format!("Bearer {token}")) .body(Body::empty()) .unwrap(); let status = ServiceExt::ready(&mut app) .await .unwrap() .call(request) .await .unwrap() .status(); assert_eq!(status, StatusCode::UNAUTHORIZED); } } #[testlog(tokio::test)] async fn method_not_allowed() { let token = token(); let iter = iproduct!(vec!["", "/.."], vec!["GET", "PUT"]); for (key, method) in iter { let status = Request::builder() .uri(format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}{key}")) .method(method) .header("Authorization", format!("Bearer {token}")) .body(Body::empty()) .map(request) .unwrap() .await .status(); assert!(matches!( status, StatusCode::BAD_REQUEST | 
StatusCode::METHOD_NOT_ALLOWED )); } } async fn requests_chain( chain: impl Iterator, token: impl Fn(&str) -> String, ) { let (proxy, _) = proxy().await; let mut app = app(Arc::new(proxy)).into_service(); for (uri, method, body, expected_status, compare_body) in chain { info!(%uri, %method, %body, %expected_status); let bearer = format!("Bearer {}", token(&uri)); let request = Request::builder() .uri(uri) .method(method) .header("Authorization", &bearer) .body(Body::from(body)) .unwrap(); let response = ServiceExt::ready(&mut app) .await .unwrap() .call(request) .await .unwrap(); assert_eq!(response.status(), expected_status); if !compare_body { continue; } let read_body = response.into_body().collect().await.unwrap().to_bytes(); assert_eq!(body, read_body); } } #[testlog(tokio::test)] async fn metrics() { let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key"); let req = vec![ (uri.clone(), "PUT", "body", StatusCode::OK, false), (uri.clone(), "DELETE", "", StatusCode::OK, false), ]; requests_chain(req.into_iter(), |_| token()).await; let res = Request::builder() .uri("/metrics") .body(Body::empty()) .map(request) .unwrap() .await; assert_eq!(res.status(), StatusCode::OK); let body = res.into_body().collect().await.unwrap().to_bytes(); let body = String::from_utf8_lossy(&body); tracing::debug!(%body); // Storage metrics are not gathered for LocalFs if var(REAL_S3_ENV).is_ok() { assert!(body.contains("remote_storage_s3_deleted_objects_total")); } #[cfg(target_os = "linux")] assert!(body.contains("process_threads")); } #[testlog(tokio::test)] async fn insert_retrieve_remove() { let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key"); let chain = vec![ (uri.clone(), "GET", "", StatusCode::NOT_FOUND, false), (uri.clone(), "PUT", "пыщьпыщь", StatusCode::OK, false), (uri.clone(), "GET", "пыщьпыщь", StatusCode::OK, true), (uri.clone(), "DELETE", "", StatusCode::OK, false), (uri, "GET", "", StatusCode::NOT_FOUND, false), ]; 
requests_chain(chain.into_iter(), |_| token()).await; } #[testlog(tokio::test)] async fn read_other_endpoint_data() { let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/other_endpoint/key"); let chain = vec![ (uri.clone(), "GET", "", StatusCode::NOT_FOUND, false), (uri.clone(), "PUT", "", StatusCode::UNAUTHORIZED, false), ]; requests_chain(chain.into_iter(), |_| token()).await; } fn delete_prefix_token(uri: &str) -> String { let parts = uri.split("/").collect::>(); let claims = endpoint_storage::claims::DeletePrefixClaims { tenant_id: parts.get(1).map(|c| c.parse().unwrap()).unwrap(), timeline_id: parts.get(2).map(|c| c.parse().unwrap()), endpoint_id: parts.get(3).map(ToString::to_string), exp: u64::MAX, }; let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap(); let header = jsonwebtoken::Header::new(endpoint_storage::VALIDATION_ALGO); jsonwebtoken::encode(&header, &claims, &key).unwrap() } // Can't use single digit numbers as they won't be validated as TimelineId and EndpointId #[testlog(tokio::test)] async fn delete_prefix() { let tenant_id = TenantId::from_array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]).to_string(); let t2 = TimelineId::from_array([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); let t3 = TimelineId::from_array([3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); let t4 = TimelineId::from_array([4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); let f = |timeline, path| format!("/{tenant_id}/{timeline}{path}"); // Why extra slash in string literals? 
Axum is weird with URIs: // /1/2 and 1/2/ match different routes, thus first yields OK and second NOT_FOUND // as it matches /tenant/timeline/endpoint, see https://stackoverflow.com/a/75355932 // Removing the trailing slash is surprisingly hard: // * Add tower dependency with NormalizePath layer // * wrap Router<()> in this layer https://github.com/tokio-rs/axum/discussions/2377 // * Rewrite make_service() -> into_make_service() // * Rewrite oneshot() (not available for NormalizePath) // I didn't manage to get it working correctly let chain = vec![ // create 1/2/3/4, 1/2/3/5, delete prefix 1/2/3 -> empty (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), // we can override file contents (f(t2, "/3/5"), "PUT", "", StatusCode::OK, false), (f(t2, "/3"), "DELETE", "", StatusCode::OK, false), (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false), (f(t2, "/3/5"), "GET", "", StatusCode::NOT_FOUND, false), // create 1/2/3/4, 1/2/5/6, delete prefix 1/2/3 -> 1/2/5/6 (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), (f(t2, "/5/6"), "PUT", "", StatusCode::OK, false), (f(t2, "/3"), "DELETE", "", StatusCode::OK, false), (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false), (f(t2, "/5/6"), "GET", "", StatusCode::OK, false), // create 1/2/3/4, 1/2/7/8, delete prefix 1/2 -> empty (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), (f(t2, "/7/8"), "PUT", "", StatusCode::OK, false), (f(t2, ""), "DELETE", "", StatusCode::OK, false), (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false), (f(t2, "/7/8"), "GET", "", StatusCode::NOT_FOUND, false), // create 1/2/3/4, 1/2/5/6, 1/3/8/9, delete prefix 1/2/3 -> 1/2/5/6, 1/3/8/9 (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), (f(t2, "/5/6"), "PUT", "", StatusCode::OK, false), (f(t3, "/8/9"), "PUT", "", StatusCode::OK, false), (f(t2, "/3"), "DELETE", "", StatusCode::OK, false), (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false), (f(t2, "/5/6"), "GET", "",
StatusCode::OK, false), (f(t3, "/8/9"), "GET", "", StatusCode::OK, false), // create 1/4/5/6, delete prefix 1/2 -> 1/3/8/9, 1/4/5/6 (f(t4, "/5/6"), "PUT", "", StatusCode::OK, false), (f(t2, ""), "DELETE", "", StatusCode::OK, false), (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false), (f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false), (f(t3, "/8/9"), "GET", "", StatusCode::OK, false), (f(t4, "/5/6"), "GET", "", StatusCode::OK, false), // delete prefix 1 -> empty (format!("/{tenant_id}"), "DELETE", "", StatusCode::OK, false), (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false), (f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false), (f(t3, "/8/9"), "GET", "", StatusCode::NOT_FOUND, false), (f(t4, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false), ]; requests_chain(chain.into_iter(), delete_prefix_token).await; } } ================================================ FILE: endpoint_storage/src/claims.rs ================================================ use serde::{Deserialize, Serialize}; use std::fmt::Display; use utils::id::{EndpointId, TenantId, TimelineId}; /// Claims to add, remove, or retrieve endpoint data. Used by compute_ctl #[derive(Deserialize, Serialize, PartialEq)] pub struct EndpointStorageClaims { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub endpoint_id: EndpointId, pub exp: u64, } /// Claims to remove tenant, timeline, or endpoint data. 
Used by control plane #[derive(Deserialize, Serialize, PartialEq)] pub struct DeletePrefixClaims { pub tenant_id: TenantId, /// None when tenant is deleted (endpoint_id is also None in this case) pub timeline_id: Option, /// None when timeline is deleted pub endpoint_id: Option, pub exp: u64, } impl Display for EndpointStorageClaims { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, "EndpointClaims(tenant_id={} timeline_id={} endpoint_id={} exp={})", self.tenant_id, self.timeline_id, self.endpoint_id, self.exp ) } } impl Display for DeletePrefixClaims { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, "DeletePrefixClaims(tenant_id={} timeline_id={} endpoint_id={}, exp={})", self.tenant_id, self.timeline_id .as_ref() .map(ToString::to_string) .unwrap_or("".to_string()), self.endpoint_id .as_ref() .map(ToString::to_string) .unwrap_or("".to_string()), self.exp ) } } ================================================ FILE: endpoint_storage/src/lib.rs ================================================ pub mod claims; use crate::claims::{DeletePrefixClaims, EndpointStorageClaims}; use anyhow::Result; use axum::extract::{FromRequestParts, Path}; use axum::response::{IntoResponse, Response}; use axum::{RequestPartsExt, http::StatusCode, http::request::Parts}; use axum_extra::TypedHeader; use axum_extra::headers::{Authorization, authorization::Bearer}; use camino::Utf8PathBuf; use jsonwebtoken::{DecodingKey, Validation}; use remote_storage::{GenericRemoteStorage, RemotePath}; use serde::{Deserialize, Serialize}; use std::fmt::Display; use std::result::Result as StdResult; use std::sync::Arc; use tokio_util::sync::CancellationToken; use tracing::{debug, error}; use utils::id::{EndpointId, TenantId, TimelineId}; // simplified version of utils::auth::JwtAuth pub struct JwtAuth { decoding_key: DecodingKey, validation: Validation, } pub const VALIDATION_ALGO: jsonwebtoken::Algorithm = jsonwebtoken::Algorithm::EdDSA; impl 
JwtAuth { pub fn new(key: &[u8]) -> Result { Ok(Self { decoding_key: DecodingKey::from_ed_pem(key)?, validation: Validation::new(VALIDATION_ALGO), }) } pub fn decode(&self, token: &str) -> Result { Ok(jsonwebtoken::decode(token, &self.decoding_key, &self.validation).map(|t| t.claims)?) } } fn normalize_key(key: &str) -> StdResult { let key = clean_utf8(&Utf8PathBuf::from(key)); if key.starts_with("..") || key == "." || key == "/" { return Err(format!("invalid key {key}")); } match key.strip_prefix("/").map(Utf8PathBuf::from) { Ok(p) => Ok(p), _ => Ok(key), } } // Copied from path_clean crate with PathBuf->Utf8PathBuf fn clean_utf8(path: &camino::Utf8Path) -> Utf8PathBuf { use camino::Utf8Component as Comp; let mut out = Vec::new(); for comp in path.components() { match comp { Comp::CurDir => (), Comp::ParentDir => match out.last() { Some(Comp::RootDir) => (), Some(Comp::Normal(_)) => { out.pop(); } None | Some(Comp::CurDir) | Some(Comp::ParentDir) | Some(Comp::Prefix(_)) => { out.push(comp) } }, comp => out.push(comp), } } if !out.is_empty() { out.iter().collect() } else { Utf8PathBuf::from(".") } } pub struct Storage { pub auth: JwtAuth, pub storage: GenericRemoteStorage, pub cancel: CancellationToken, pub max_upload_file_limit: usize, } #[derive(Deserialize, Serialize)] struct KeyRequest { tenant_id: TenantId, timeline_id: TimelineId, endpoint_id: EndpointId, path: String, } #[derive(Deserialize, Serialize, PartialEq)] struct PrefixKeyRequest { tenant_id: TenantId, timeline_id: Option, endpoint_id: Option, } #[derive(Debug, PartialEq)] pub struct S3Path { pub path: RemotePath, } impl TryFrom<&KeyRequest> for S3Path { type Error = String; fn try_from(req: &KeyRequest) -> StdResult { let KeyRequest { tenant_id, timeline_id, endpoint_id, path, } = &req; let prefix = format!("{tenant_id}/{timeline_id}/{endpoint_id}",); let path = Utf8PathBuf::from(prefix).join(normalize_key(path)?); let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already 
relative Ok(S3Path { path }) } } fn unauthorized(route: impl Display, claims: impl Display) -> Response { debug!(%route, %claims, "route doesn't match claims"); StatusCode::UNAUTHORIZED.into_response() } pub fn bad_request(err: impl Display, desc: &'static str) -> Response { debug!(%err, desc); (StatusCode::BAD_REQUEST, err.to_string()).into_response() } pub fn ok() -> Response { StatusCode::OK.into_response() } pub fn internal_error(err: impl Display, path: impl Display, desc: &'static str) -> Response { error!(%err, %path, desc); StatusCode::INTERNAL_SERVER_ERROR.into_response() } pub fn not_found(key: impl ToString) -> Response { (StatusCode::NOT_FOUND, key.to_string()).into_response() } impl FromRequestParts> for S3Path { type Rejection = Response; async fn from_request_parts( parts: &mut Parts, state: &Arc, ) -> Result { let Path(path): Path = parts .extract() .await .map_err(|e| bad_request(e, "invalid route"))?; let TypedHeader(Authorization(bearer)) = parts .extract::>>() .await .map_err(|e| bad_request(e, "invalid token"))?; let claims: EndpointStorageClaims = state .auth .decode(bearer.token()) .map_err(|e| bad_request(e, "decoding token"))?; // Read paths may have different endpoint ids. For readonly -> readwrite replica // prewarming, endpoint must read other endpoint's data. 
let endpoint_id = if parts.method == axum::http::Method::GET { claims.endpoint_id.clone() } else { path.endpoint_id.clone() }; let route = EndpointStorageClaims { tenant_id: path.tenant_id, timeline_id: path.timeline_id, endpoint_id, exp: claims.exp, }; if route != claims { return Err(unauthorized(route, claims)); } (&path) .try_into() .map_err(|e| bad_request(e, "invalid route")) } } #[derive(Debug, PartialEq)] pub struct PrefixS3Path { pub path: RemotePath, } impl From<&DeletePrefixClaims> for PrefixS3Path { fn from(path: &DeletePrefixClaims) -> Self { let timeline_id = path .timeline_id .as_ref() .map(ToString::to_string) .unwrap_or("".to_string()); let endpoint_id = path .endpoint_id .as_ref() .map(ToString::to_string) .unwrap_or("".to_string()); let path = Utf8PathBuf::from(path.tenant_id.to_string()) .join(timeline_id) .join(endpoint_id); let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already relative PrefixS3Path { path } } } impl FromRequestParts> for PrefixS3Path { type Rejection = Response; async fn from_request_parts( parts: &mut Parts, state: &Arc, ) -> Result { let Path(path) = parts .extract::>() .await .map_err(|e| bad_request(e, "invalid route"))?; let TypedHeader(Authorization(bearer)) = parts .extract::>>() .await .map_err(|e| bad_request(e, "invalid token"))?; let claims: DeletePrefixClaims = state .auth .decode(bearer.token()) .map_err(|e| bad_request(e, "invalid token"))?; let route = DeletePrefixClaims { tenant_id: path.tenant_id, timeline_id: path.timeline_id, endpoint_id: path.endpoint_id, exp: claims.exp, }; if route != claims { return Err(unauthorized(route, claims)); } Ok((&route).into()) } } #[cfg(test)] mod tests { use super::*; #[test] fn normalize_key() { let f = super::normalize_key; assert_eq!(f("hello/world/..").unwrap(), Utf8PathBuf::from("hello")); assert_eq!( f("ololo/1/../../not_ololo").unwrap(), Utf8PathBuf::from("not_ololo") ); assert!(f("ololo/1/../../../").is_err()); assert!(f(".").is_err()); 
assert!(f("../").is_err()); assert!(f("").is_err()); assert_eq!(f("/1/2/3").unwrap(), Utf8PathBuf::from("1/2/3")); assert!(f("/1/2/3/../../../").is_err()); assert!(f("/1/2/3/../../../../").is_err()); } const TENANT_ID: TenantId = TenantId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]); const TIMELINE_ID: TimelineId = TimelineId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]); const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg"; #[test] fn s3_path() { let auth = EndpointStorageClaims { tenant_id: TENANT_ID, timeline_id: TIMELINE_ID, endpoint_id: ENDPOINT_ID.into(), exp: u64::MAX, }; let s3_path = |key| { let path = &format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/{key}"); let path = RemotePath::from_string(path).unwrap(); S3Path { path } }; let path = "cache_key".to_string(); let mut key_path = KeyRequest { path, tenant_id: auth.tenant_id, timeline_id: auth.timeline_id, endpoint_id: auth.endpoint_id, }; assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path)); key_path.path = "we/can/have/nested/paths".to_string(); assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path)); key_path.path = "../error/hello/../".to_string(); assert!(S3Path::try_from(&key_path).is_err()); } #[test] fn prefix_s3_path() { let mut path = DeletePrefixClaims { tenant_id: TENANT_ID, timeline_id: None, endpoint_id: None, exp: 0, }; let prefix_path = |s: String| RemotePath::from_string(&s).unwrap(); assert_eq!( PrefixS3Path::from(&path).path, prefix_path(format!("{TENANT_ID}")) ); path.timeline_id = Some(TIMELINE_ID); assert_eq!( PrefixS3Path::from(&path).path, prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}")) ); path.endpoint_id = Some(ENDPOINT_ID.into()); assert_eq!( PrefixS3Path::from(&path).path, prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}")) ); } } ================================================ FILE: endpoint_storage/src/main.rs ================================================ //! 
`endpoint_storage` is a service which provides API for uploading and downloading //! files. It is used by compute and control plane for accessing LFC prewarm data. //! This service is deployed either as a separate component or as part of compute image //! for large computes. mod app; use anyhow::Context; use clap::Parser; use std::net::{IpAddr, Ipv4Addr, SocketAddr}; use tracing::info; use utils::logging; //see set() const fn max_upload_file_limit() -> usize { 100 * 1024 * 1024 } const fn listen() -> SocketAddr { SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), 51243) } #[derive(Parser)] struct Args { #[arg(exclusive = true)] config_file: Option, #[arg(long, default_value = "false", requires = "config")] /// to allow testing k8s helm chart where we don't have s3 credentials no_s3_check_on_startup: bool, #[arg(long, value_name = "FILE")] /// inline config mode for k8s helm chart config: Option, } #[derive(serde::Deserialize)] struct Config { #[serde(default = "listen")] listen: std::net::SocketAddr, pemfile: camino::Utf8PathBuf, #[serde(flatten)] storage_kind: remote_storage::TypedRemoteStorageKind, #[serde(default = "max_upload_file_limit")] max_upload_file_limit: usize, } #[tokio::main] async fn main() -> anyhow::Result<()> { logging::init( logging::LogFormat::Plain, logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, logging::Output::Stdout, )?; let args = Args::parse(); let config: Config = if let Some(config_path) = args.config_file { info!("Reading config from {config_path}"); let config = std::fs::read_to_string(config_path)?; serde_json::from_str(&config).context("parsing config")? } else if let Some(config) = args.config { info!("Reading inline config"); serde_json::from_str(&config).context("parsing config")? 
} else { anyhow::bail!("Supply either config file path or --config=inline-config"); }; info!("Reading pemfile from {}", config.pemfile.clone()); let pemfile = std::fs::read(config.pemfile.clone())?; info!("Loading public key from {}", config.pemfile.clone()); let auth = endpoint_storage::JwtAuth::new(&pemfile)?; let listener = tokio::net::TcpListener::bind(config.listen).await.unwrap(); info!("listening on {}", listener.local_addr().unwrap()); let storage = remote_storage::GenericRemoteStorage::from_storage_kind(config.storage_kind).await?; let cancel = tokio_util::sync::CancellationToken::new(); if !args.no_s3_check_on_startup { app::check_storage_permissions(&storage, cancel.clone()).await?; } let proxy = std::sync::Arc::new(endpoint_storage::Storage { auth, storage, cancel: cancel.clone(), max_upload_file_limit: config.max_upload_file_limit, }); tokio::spawn(utils::signals::signal_handler(cancel.clone())); axum::serve(listener, app::app(proxy)) .with_graceful_shutdown(async move { cancel.cancelled().await }) .await?; Ok(()) } ================================================ FILE: endpoint_storage/src/openapi_spec.yml ================================================ openapi: "3.0.2" info: title: Endpoint Storage API description: Endpoint Storage API version: "1.0" license: name: "Apache" url: https://github.com/neondatabase/neon/blob/main/LICENSE servers: - url: "" paths: /status: description: Healthcheck endpoint get: description: Healthcheck security: [] responses: "200": description: OK /{tenant_id}/{timeline_id}/{endpoint_id}/{key}: parameters: - name: tenant_id in: path required: true schema: type: string - name: timeline_id in: path required: true schema: type: string - name: endpoint_id in: path required: true schema: type: string - name: key in: path required: true schema: type: string get: description: Get file from blob storage responses: "200": description: "File stream from blob storage" content: application/octet-stream: schema: type: string format: 
binary "400": description: File was not found "403": description: JWT does not authorize request to this route put: description: Insert file into blob storage. If file exists, override it requestBody: content: application/octet-stream: schema: type: string format: binary responses: "200": description: File was inserted successfully "403": description: JWT does not authorize request to this route delete: description: Delete file from blob storage responses: "200": description: File was successfully deleted or not found "403": description: JWT does not authorize request to this route /{tenant_id}/{timeline_id}/{endpoint_id}: parameters: - name: tenant_id in: path required: true schema: type: string - name: timeline_id in: path required: true schema: type: string - name: endpoint_id in: path required: true schema: type: string delete: description: Delete endpoint data from blob storage responses: "200": description: Endpoint data was deleted "403": description: JWT does not authorize request to this route /{tenant_id}/{timeline_id}: parameters: - name: tenant_id in: path required: true schema: type: string - name: timeline_id in: path required: true schema: type: string delete: description: Delete timeline data from blob storage responses: "200": description: Timeline data was deleted "403": description: JWT does not authorize request to this route /{tenant_id}: parameters: - name: tenant_id in: path required: true schema: type: string delete: description: Delete tenant data from blob storage responses: "200": description: Tenant data was deleted "403": description: JWT does not authorize request to this route components: securitySchemes: JWT: type: http scheme: bearer bearerFormat: JWT security: - JWT: [] ================================================ FILE: libs/compute_api/Cargo.toml ================================================ [package] name = "compute_api" version = "0.1.0" edition = "2024" license.workspace = true [dependencies] anyhow.workspace = true 
chrono.workspace = true
indexmap.workspace = true
jsonwebtoken.workspace = true
serde.workspace = true
serde_json.workspace = true
regex.workspace = true
url.workspace = true
utils = { path = "../utils" }
remote_storage = { version = "0.1", path = "../remote_storage/" }

================================================
FILE: libs/compute_api/src/lib.rs
================================================
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod privilege;
pub mod requests;
pub mod responses;
pub mod spec;

================================================
FILE: libs/compute_api/src/privilege.rs
================================================
/// A named privilege.
///
/// Serialized and deserialized in upper case (serde `rename_all = "UPPERCASE"`),
/// i.e. with the same spelling that [`Privilege::as_str`] returns.
// NOTE(review): the variant names look like the PostgreSQL `GRANT` privilege
// keywords -- confirm against the callers that build SQL from them.
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
#[serde(rename_all = "UPPERCASE")]
pub enum Privilege {
    Select,
    Insert,
    Update,
    Delete,
    Truncate,
    References,
    Trigger,
    Usage,
    Create,
    Connect,
    Temporary,
    Execute,
}

impl Privilege {
    /// Returns the upper-case name of the privilege as a static string.
    ///
    /// Every arm returns the variant name in upper case, so the result matches
    /// the serde representation of the same variant.
    pub fn as_str(&self) -> &'static str {
        match self {
            Privilege::Select => "SELECT",
            Privilege::Insert => "INSERT",
            Privilege::Update => "UPDATE",
            Privilege::Delete => "DELETE",
            Privilege::Truncate => "TRUNCATE",
            Privilege::References => "REFERENCES",
            Privilege::Trigger => "TRIGGER",
            Privilege::Usage => "USAGE",
            Privilege::Create => "CREATE",
            Privilege::Connect => "CONNECT",
            Privilege::Temporary => "TEMPORARY",
            Privilege::Execute => "EXECUTE",
        }
    }
}

================================================
FILE: libs/compute_api/src/requests.rs
================================================
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
use std::str::FromStr;

use serde::{Deserialize, Serialize};

use crate::privilege::Privilege;
use crate::responses::ComputeCtlConfig;
use crate::spec::{ComputeSpec, ExtVersion, PgIdent};

/// The value to place in the [`ComputeClaims::audience`] claim.
pub static COMPUTE_AUDIENCE: &str = "compute";

/// Available scopes for a compute's JWT.
#[derive(Copy, Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] #[serde(rename_all = "snake_case")] pub enum ComputeClaimsScope { /// An admin-scoped token allows access to all of `compute_ctl`'s authorized /// facilities. #[serde(rename = "compute_ctl:admin")] Admin, } impl FromStr for ComputeClaimsScope { type Err = anyhow::Error; fn from_str(s: &str) -> Result { match s { "compute_ctl:admin" => Ok(ComputeClaimsScope::Admin), _ => Err(anyhow::anyhow!("invalid compute claims scope \"{s}\"")), } } } /// When making requests to the `compute_ctl` external HTTP server, the client /// must specify a set of claims in `Authorization` header JWTs such that /// `compute_ctl` can authorize the request. #[derive(Clone, Debug, Deserialize, Serialize)] #[serde(rename = "snake_case")] pub struct ComputeClaims { /// The compute ID that will validate the token. The only case in which this /// can be [`None`] is if [`Self::scope`] is /// [`ComputeClaimsScope::Admin`]. pub compute_id: Option, /// The scope of what the token authorizes. pub scope: Option, /// The recipient the token is intended for. /// /// See [RFC 7519](https://www.rfc-editor.org/rfc/rfc7519#section-4.1.3) for /// more information. /// /// TODO: Remove the [`Option`] wrapper when control plane learns to send /// the claim. #[serde(rename = "aud")] pub audience: Option>, } /// Request of the /configure API /// /// We now pass only `spec` in the configuration request, but later we can /// extend it and something like `restart: bool` or something else. So put /// `spec` into a struct initially to be more flexible in the future. 
#[derive(Debug, Deserialize, Serialize)] pub struct ConfigurationRequest { pub spec: ComputeSpec, pub compute_ctl_config: ComputeCtlConfig, } #[derive(Deserialize, Debug)] pub struct ExtensionInstallRequest { pub extension: PgIdent, pub database: PgIdent, pub version: ExtVersion, } #[derive(Deserialize, Debug)] pub struct SetRoleGrantsRequest { pub database: PgIdent, pub schema: PgIdent, pub privileges: Vec, pub role: PgIdent, } #[cfg(test)] mod test { use std::str::FromStr; use crate::requests::ComputeClaimsScope; /// Confirm that whether we parse the scope by string or through serde, the /// same values parse to the same enum variant. #[test] fn compute_request_scopes() { const ADMIN_SCOPE: &str = "compute_ctl:admin"; let from_serde: ComputeClaimsScope = serde_json::from_str(&format!("\"{ADMIN_SCOPE}\"")).unwrap(); let from_str = ComputeClaimsScope::from_str(ADMIN_SCOPE).unwrap(); assert_eq!(from_serde, from_str); } } ================================================ FILE: libs/compute_api/src/responses.rs ================================================ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. use chrono::{DateTime, Utc}; use jsonwebtoken::jwk::JwkSet; use serde::{Deserialize, Serialize, Serializer}; use std::fmt::Display; use crate::privilege::Privilege; use crate::spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role}; #[derive(Serialize, Debug, Deserialize)] pub struct GenericAPIError { pub error: String, } /// All configuration parameters necessary for a compute. When /// [`ComputeConfig::spec`] is provided, it means that the compute is attached /// to a tenant. [`ComputeConfig::compute_ctl_config`] will always be provided /// and contains parameters necessary for operating `compute_ctl` independently /// of whether a tenant is attached to the compute or not. /// /// This also happens to be the body of `compute_ctl`'s /configure request. 
#[derive(Debug, Deserialize, Serialize)]
pub struct ComputeConfig {
    /// The compute spec
    // NOTE(review): the type argument of this `Option` is missing in this text.
    // The `From` impl below fills the field from
    // `ControlPlaneConfigResponse::spec`.
    pub spec: Option,

    /// The compute_ctl configuration
    #[allow(dead_code)]
    pub compute_ctl_config: ComputeCtlConfig,
}

// NOTE(review): the type argument of `From` is missing in this text; the
// signature of `from` takes a `ControlPlaneConfigResponse`.
impl From for ComputeConfig {
    /// Keeps `spec` and `compute_ctl_config` from the control-plane response;
    /// its `status` field is not carried over.
    fn from(value: ControlPlaneConfigResponse) -> Self {
        Self {
            spec: value.spec,
            compute_ctl_config: value.compute_ctl_config,
        }
    }
}

/// Response naming an extension and its version.
#[derive(Debug, Clone, Serialize)]
pub struct ExtensionInstallResponse {
    pub extension: PgIdent,
    pub version: ExtVersion,
}

/// Status of the LFC prewarm process. The same state machine is reused for
/// both autoprewarm (prewarm after compute/Postgres start using the previously
/// stored LFC state) and explicit prewarming via API.
#[derive(Serialize, Default, Debug, Clone)]
#[serde(tag = "status", rename_all = "snake_case")]
pub enum LfcPrewarmState {
    /// Default value when compute boots up.
    #[default]
    NotPrewarmed,
    /// Prewarming thread is active and loading pages into LFC.
    Prewarming,
    /// We found requested LFC state in the endpoint storage and
    /// completed prewarming successfully.
    Completed {
        total: i32,
        prewarmed: i32,
        skipped: i32,
        state_download_time_ms: u32,
        uncompress_time_ms: u32,
        prewarm_time_ms: u32,
    },
    /// Unexpected error happened during prewarming. Note, `Not Found 404`
    /// response from the endpoint storage is explicitly excluded here
    /// because it can normally happen on the first compute start,
    /// since LFC state is not available yet.
    Failed { error: String },
    /// We tried to fetch the corresponding LFC state from the endpoint storage,
    /// but received `Not Found 404`. This should normally happen only during the
    /// first endpoint start after creation with `autoprewarm: true`.
    /// This may also happen if LFC is turned off or not initialized
    ///
    /// During the orchestrated prewarm via API, when a caller explicitly
    /// provides the LFC state key to prewarm from, it's the caller responsibility
    /// to handle this status as an error state in this case.
    Skipped,
    /// LFC prewarm was cancelled. Some pages in LFC cache may be prewarmed if query
    /// has started working before cancellation
    Cancelled,
}

impl Display for LfcPrewarmState {
    /// Writes the variant name. `Completed` omits its counters, and `Failed`
    /// is rendered as `Error(<message>)` rather than by its variant name.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
            LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
            LfcPrewarmState::Completed { .. } => f.write_str("Completed"),
            LfcPrewarmState::Skipped => f.write_str("Skipped"),
            LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
            LfcPrewarmState::Cancelled => f.write_str("Cancelled"),
        }
    }
}

/// Status of the LFC state offload. Serialized with the variant name (in
/// snake_case) under a `status` tag.
#[derive(Serialize, Default, Debug, Clone)]
#[serde(tag = "status", rename_all = "snake_case")]
pub enum LfcOffloadState {
    #[default]
    NotOffloaded,
    Offloading,
    Completed {
        state_query_time_ms: u32,
        compress_time_ms: u32,
        state_upload_time_ms: u32,
    },
    Failed {
        error: String,
    },
    /// LFC state was empty so it wasn't offloaded
    Skipped,
}

/// Status of a promotion. Serialized with the variant name (in snake_case)
/// under a `status` tag.
#[derive(Serialize, Debug, Clone)]
#[serde(tag = "status", rename_all = "snake_case")]
pub enum PromoteState {
    NotPromoted,
    Completed {
        lsn_wait_time_ms: u32,
        pg_promote_time_ms: u32,
        reconfigure_time_ms: u32,
    },
    Failed {
        error: String,
    },
}

/// Input for a promotion: the compute spec plus a WAL flush LSN.
#[derive(Deserialize, Default, Debug)]
#[serde(rename_all = "snake_case")]
pub struct PromoteConfig {
    pub spec: ComputeSpec,
    pub wal_flush_lsn: utils::lsn::Lsn,
}

/// Response of the /status API
// NOTE(review): the type arguments of `DateTime` and of the `Option`s below are
// missing in this text.
#[derive(Serialize, Debug, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct ComputeStatusResponse {
    pub start_time: DateTime,
    pub tenant: Option,
    pub timeline: Option,
    pub status: ComputeStatus,
    // Serialized through `rfc3339_serialize`: an RFC 3339 string when set.
    #[serde(serialize_with = "rfc3339_serialize")]
    pub last_active: Option>,
    pub error: Option,
}

#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq, Default)]
#[serde(rename_all = "snake_case")]
pub enum TerminateMode {
    #[default]
    /// wait 30s till returning from /terminate to allow control plane to get the error
    Fast,
    /// return from /terminate immediately as soon as all components are terminated
    Immediate,
}

// NOTE(review): the type argument of `From` is missing in this text; the
// signature of `from` takes a `TerminateMode`.
impl From for ComputeStatus {
    /// Maps a termination mode to the matching "termination pending" status.
    fn from(mode: TerminateMode) -> Self {
        match mode {
            TerminateMode::Fast => ComputeStatus::TerminationPendingFast,
            TerminateMode::Immediate => ComputeStatus::TerminationPendingImmediate,
        }
    }
}

#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ComputeStatus {
    // Spec wasn't provided at start, waiting for it to be
    // provided by control-plane.
    Empty,
    // Compute configuration was requested.
    ConfigurationPending,
    // Compute node has spec and initial startup and
    // configuration is in progress.
    Init,
    // Compute is configured and running.
    Running,
    // New spec is being applied.
    Configuration,
    // Either startup or configuration failed,
    // compute will exit soon or is waiting for
    // control-plane to terminate it.
    Failed,
    // Termination requested
    TerminationPendingFast,
    // Termination requested, without waiting 30s before returning from /terminate
    TerminationPendingImmediate,
    // Terminated Postgres
    Terminated,
    // A spec refresh is being requested
    RefreshConfigurationPending,
    // A spec refresh is being applied. We cannot refresh configuration again until the current
    // refresh is done, i.e., signal_refresh_configuration() will return 500 error.
    RefreshConfiguration,
}

#[derive(Deserialize, Serialize)]
pub struct TerminateResponse {
    // NOTE(review): the type argument of this `Option` is missing in this text.
    pub lsn: Option,
}

impl Display for ComputeStatus {
    /// Writes the status in lower-case, hyphen-separated form.
    ///
    /// Note that this differs from the serde representation, which uses
    /// snake_case (underscores) for the same variants.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ComputeStatus::Empty => f.write_str("empty"),
            ComputeStatus::ConfigurationPending => f.write_str("configuration-pending"),
            ComputeStatus::RefreshConfiguration => f.write_str("refresh-configuration"),
            ComputeStatus::RefreshConfigurationPending => {
                f.write_str("refresh-configuration-pending")
            }
            ComputeStatus::Init => f.write_str("init"),
            ComputeStatus::Running => f.write_str("running"),
            ComputeStatus::Configuration => f.write_str("configuration"),
            ComputeStatus::Failed => f.write_str("failed"),
            ComputeStatus::TerminationPendingFast => f.write_str("termination-pending-fast"),
            ComputeStatus::TerminationPendingImmediate => {
                f.write_str("termination-pending-immediate")
            }
            ComputeStatus::Terminated => f.write_str("terminated"),
        }
    }
}

/// Serde `serialize_with` helper for an optional timestamp.
///
/// A present value is serialized as the string produced by `to_rfc3339()`; an
/// absent value is serialized through the serializer's `serialize_none`.
// NOTE(review): the generic parameter list and the type arguments of `Option`
// and `Result` are missing in this text.
pub fn rfc3339_serialize(x: &Option>, s: S) -> Result
where
    S: Serializer,
{
    if let Some(x) = x {
        x.to_rfc3339().serialize(s)
    } else {
        s.serialize_none()
    }
}

/// Response of the /metrics.json API
#[derive(Clone, Debug, Default, Serialize)]
pub struct ComputeMetrics {
    /// Time spent waiting in pool
    pub wait_for_spec_ms: u64,

    /// Time spent checking if safekeepers are synced
    pub sync_sk_check_ms: u64,

    /// Time spent syncing safekeepers (walproposer.c).
    /// In most cases this should be zero.
    pub sync_safekeepers_ms: u64,

    /// Time it took to establish a pg connection to the pageserver.
    /// This is two roundtrips, so it's a good proxy for compute-pageserver
    /// latency. The latency is usually 0.2ms, but it's not safe to assume
    /// that.
    pub pageserver_connect_micros: u64,

    /// Time to get basebackup from pageserver and write it to disk.
    pub basebackup_ms: u64,

    /// Compressed size of basebackup received.
    pub basebackup_bytes: u64,

    /// Time spent starting postgres. This includes initialization of shared
    /// buffers, preloading extensions, and other pg operations.
    pub start_postgres_ms: u64,

    /// Time spent applying pg catalog updates that were made in the console
    /// UI. This should be 0 when startup time matters, since cplane tries
    /// to do these updates eagerly, and passes the skip_pg_catalog_updates
    /// when it's safe to skip this step.
    pub config_ms: u64,

    /// Total time, from when we receive the spec to when we're ready to take
    /// pg connections.
    pub total_startup_ms: u64,
    pub load_ext_ms: u64,
    pub num_ext_downloaded: u64,
    pub largest_ext_size: u64, // these are measured in bytes
    pub total_ext_download_size: u64,
}

// NOTE(review): the element types of both `Vec`s are missing in this text.
#[derive(Clone, Debug, Default, Serialize)]
pub struct CatalogObjects {
    pub roles: Vec,
    pub databases: Vec,
}

#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct ComputeCtlConfig {
    /// Set of JSON web keys that the compute can use to authenticate
    /// communication from the control plane.
    pub jwks: JwkSet,
    // NOTE(review): the type argument of this `Option` is missing in this text.
    pub tls: Option,
}

impl Default for ComputeCtlConfig {
    /// The default configuration has an empty key set and no TLS settings.
    fn default() -> Self {
        Self {
            jwks: JwkSet {
                keys: Vec::default(),
            },
            tls: None,
        }
    }
}

/// Pair of paths: one for the TLS key and one for the certificate.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct TlsConfig {
    pub key_path: String,
    pub cert_path: String,
}

/// Response of the `/computes/{compute_id}/spec` control-plane API.
#[derive(Deserialize, Debug)]
pub struct ControlPlaneConfigResponse {
    // NOTE(review): the type argument of this `Option` is missing in this text.
    pub spec: Option,
    pub status: ControlPlaneComputeStatus,
    pub compute_ctl_config: ComputeCtlConfig,
}

#[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ControlPlaneComputeStatus {
    // Compute is known to control-plane, but it's not
    // yet attached to any timeline / endpoint.
    Empty,
    // Compute is attached to some timeline / endpoint and
    // should be able to start with provided spec.
    Attached,
}

#[derive(Clone, Debug, Default, Serialize)]
pub struct InstalledExtension {
    pub extname: String,
    pub version: String,
    pub n_databases: u32, // Number of databases using this extension
    pub owned_by_superuser: String,
}

#[derive(Clone, Debug, Default, Serialize)]
pub struct InstalledExtensions {
    // NOTE(review): the element type of this `Vec` is missing in this text.
    pub extensions: Vec,
}

#[derive(Clone, Debug, Default, Serialize)]
pub struct ExtensionInstallResult {
    pub extension: PgIdent,
    pub version: ExtVersion,
}

#[derive(Clone, Debug, Default, Serialize)]
pub struct SetRoleGrantsResponse {
    pub database: PgIdent,
    pub schema: PgIdent,
    // NOTE(review): the element type of this `Vec` is missing in this text.
    pub privileges: Vec,
    pub role: PgIdent,
}

================================================
FILE: libs/compute_api/src/spec.rs
================================================
//! The ComputeSpec contains all the information needed to start up
//! the right version of PostgreSQL, and connect it to the storage nodes.
//! It can be passed as part of the `config.json`, or the control plane can
//! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or
//! compute_ctl can fetch it by calling the control plane's API.
use std::collections::HashMap;
use std::fmt::Display;

use anyhow::anyhow;
use indexmap::IndexMap;
use regex::Regex;
use remote_storage::RemotePath;
use serde::{Deserialize, Serialize};
use url::Url;
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize};

use crate::responses::TlsConfig;

/// String type alias representing Postgres identifier and
/// intended to be used for DB / role names.
pub type PgIdent = String;

/// String type alias representing Postgres extension version
pub type ExtVersion = String;

// Serde default for `ComputeSpec::reconfigure_concurrency`.
fn default_reconfigure_concurrency() -> usize {
    1
}

/// Cluster spec or configuration represented as an optional number of
/// delta operations + final cluster state description.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct ComputeSpec {
    // NOTE(review): throughout this struct the type arguments of `Option` and
    // `Vec` fields are missing in this text; they are left exactly as found.
    pub format_version: f32,

    // The control plane also includes a 'timestamp' field in the JSON document,
    // but we don't use it for anything. Serde will ignore missing fields when
    // deserializing it.
    pub operation_uuid: Option,

    /// Compute features to enable. These feature flags are provided, when we
    /// know all the details about client's compute, so they cannot be used
    /// to change `Empty` compute behavior.
    #[serde(default)]
    pub features: Vec,

    /// If compute_ctl was passed `--resize-swap-on-bind`, a value of `Some(_)` instructs
    /// compute_ctl to `/neonvm/bin/resize-swap` with the given size, when the spec is first
    /// received.
    ///
    /// Both this field and `--resize-swap-on-bind` are required, so that the control plane's
    /// spec generation doesn't need to be aware of the actual compute it's running on, while
    /// guaranteeing gradual rollout of swap. Otherwise, without `--resize-swap-on-bind`, we could
    /// end up trying to resize swap in VMs without it -- or end up *not* resizing swap, thus
    /// giving every VM much more swap than it should have (32GiB).
    ///
    /// Eventually we may remove `--resize-swap-on-bind` and exclusively use `swap_size_bytes` for
    /// enabling the swap resizing behavior once rollout is complete.
    ///
    /// See neondatabase/cloud#12047 for more.
    #[serde(default)]
    pub swap_size_bytes: Option,

    /// If compute_ctl was passed `--set-disk-quota-for-fs`, a value of `Some(_)` instructs
    /// compute_ctl to run `/neonvm/bin/set-disk-quota` with the given size and fs, when the
    /// spec is first received.
    ///
    /// Both this field and `--set-disk-quota-for-fs` are required, so that the control plane's
    /// spec generation doesn't need to be aware of the actual compute it's running on, while
    /// guaranteeing gradual rollout of disk quota.
    #[serde(default)]
    pub disk_quota_bytes: Option,

    /// Disables the vm-monitor behavior that resizes LFC on upscale/downscale, instead relying on
    /// the initial size of LFC.
    ///
    /// This is intended for use when the LFC size is being overridden from the default but
    /// autoscaling is still enabled, and we don't want the vm-monitor to interfere with the custom
    /// LFC sizing.
    #[serde(default)]
    pub disable_lfc_resizing: Option,

    /// Expected cluster state at the end of transition process.
    pub cluster: Cluster,
    pub delta_operations: Option>,

    /// An optional hint that can be passed to speed up startup time if we know
    /// that no pg catalog mutations (like role creation, database creation,
    /// extension creation) need to be done on the actual database to start.
    #[serde(default)] // Default false
    pub skip_pg_catalog_updates: bool,

    // Information needed to connect to the storage layer.
    //
    // `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed.
    //
    // Depending on `mode`, this can be a primary read-write node, a read-only
    // replica, or a read-only node pinned at an older LSN.
    // `safekeeper_connstrings` must be set for a primary.
    //
    // For backwards compatibility, the control plane may leave out all of
    // these, and instead set the "neon.tenant_id", "neon.timeline_id",
    // etc. GUCs in cluster.settings. TODO: Once the control plane has been
    // updated to fill these fields, we can make these non optional.
    pub tenant_id: Option,
    pub timeline_id: Option,

    /// Pageserver information can be passed in three different ways:
    /// 1. Here in `pageserver_connection_info`
    /// 2. In the `pageserver_connstring` field.
    /// 3. in `cluster.settings`.
    ///
    /// The goal is to use method 1. everywhere. But for backwards-compatibility with old
    /// versions of the control plane, `compute_ctl` will check 2. and 3. if the
    /// `pageserver_connection_info` field is missing.
    ///
    /// If both `pageserver_connection_info` and `pageserver_connstring`+`shard_stripe_size` are
    /// given, they must contain the same information.
    pub pageserver_connection_info: Option,

    // Legacy field: method 2 in the list on `pageserver_connection_info`.
    pub pageserver_connstring: Option,

    /// Stripe size for pageserver sharding, in pages. This is set together with the legacy
    /// `pageserver_connstring` field. When the modern `pageserver_connection_info` field is used,
    /// the stripe size is stored in `pageserver_connection_info.stripe_size` instead.
    pub shard_stripe_size: Option,

    // More neon ids that we expose to the compute_ctl
    // and to postgres as neon extension GUCs.
    pub project_id: Option,
    pub branch_id: Option,
    pub endpoint_id: Option,

    /// Safekeeper membership config generation. It is put in
    /// neon.safekeepers GUC and serves two purposes:
    /// 1) Non zero value forces walproposer to use membership configurations.
    /// 2) If walproposer wants to update list of safekeepers to connect to
    ///    taking them from some safekeeper mconf, it should check what value
    ///    is newer by comparing the generation.
    ///
    /// Note: it could be SafekeeperGeneration, but this needs linking
    /// compute_ctl with postgres_ffi.
    #[serde(default)]
    pub safekeepers_generation: Option,
    #[serde(default)]
    pub safekeeper_connstrings: Vec,

    #[serde(default)]
    pub mode: ComputeMode,

    /// If set, 'storage_auth_token' is used as the password to authenticate to
    /// the pageserver and safekeepers.
    pub storage_auth_token: Option,

    // information about available remote extensions
    pub remote_extensions: Option,

    pub pgbouncer_settings: Option>,

    /// Local Proxy configuration used for JWT authentication
    #[serde(default)]
    pub local_proxy_config: Option,

    /// Number of concurrent connections during the parallel RunInEachDatabase
    /// phase of the apply config process.
    ///
    /// We need a higher concurrency during reconfiguration in case of many DBs,
    /// but instance is already running and used by client. We can easily get out of
    /// `max_connections` limit, and the current code won't handle that.
    ///
    /// Default is 1, but also allow control plane to override this value for specific
    /// projects. It's also recommended to bump `superuser_reserved_connections` +=
    /// `reconfigure_concurrency` for such projects to ensure that we always have
    /// enough spare connections for reconfiguration process to succeed.
    #[serde(default = "default_reconfigure_concurrency")]
    pub reconfigure_concurrency: usize,

    /// If set to true, the compute_ctl will drop all subscriptions before starting the
    /// compute. This is needed when we start an endpoint on a branch, so that child
    /// would not compete with parent branch subscriptions
    /// over the same replication content from publisher.
    #[serde(default)] // Default false
    pub drop_subscriptions_before_start: bool,

    /// Log level for compute audit logging
    #[serde(default)]
    pub audit_log_level: ComputeAudit,

    /// Hostname and the port of the otel collector. Leave empty to disable Postgres logs forwarding.
    /// Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:10514
    pub logs_export_host: Option,

    /// Address of endpoint storage service
    pub endpoint_storage_addr: Option,
    /// JWT for authorizing requests to endpoint storage service
    pub endpoint_storage_token: Option,

    #[serde(default)]
    /// Download LFC state from endpoint storage and pass it to Postgres on compute startup
    pub autoprewarm: bool,

    #[serde(default)]
    /// Upload LFC state to endpoint storage periodically. Default value (None) means "don't upload"
    pub offload_lfc_interval_seconds: Option,

    /// Suspend timeout in seconds.
    ///
    /// We use this value to derive other values, such as the installed extensions metric.
    // This field has no `#[serde(default)]` attribute, unlike most of its
    // neighbours.
    pub suspend_timeout_seconds: i64,

    // Databricks specific options for compute instance.
    pub databricks_settings: Option,
}

/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)] #[serde(rename_all = "snake_case")] pub enum ComputeFeature { // XXX: Add more feature flags here. /// Enable the experimental activity monitor logic, which uses `pg_stat_database` to /// track short-lived connections as user activity. ActivityMonitorExperimental, /// Enable TLS functionality. TlsExperimental, /// This is a special feature flag that is used to represent unknown feature flags. /// Basically all unknown to enum flags are represented as this one. See unit test /// `parse_unknown_features()` for more details. #[serde(other)] UnknownFeature, } #[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)] pub struct PageserverConnectionInfo { /// NB: 0 for unsharded tenants, 1 for sharded tenants with 1 shard, following storage pub shard_count: ShardCount, /// INVARIANT: null if shard_count is 0, otherwise non-null and immutable pub stripe_size: Option, pub shards: HashMap, /// If the compute supports both protocols, this indicates which one it should use. The compute /// may use other available protocols too, if it doesn't support the preferred one. The URL's /// for the protocol specified here must be present for all shards, i.e. do not mark a protocol /// as preferred if it cannot actually be used with all the pageservers. #[serde(default)] pub prefer_protocol: PageserverProtocol, } /// Extract PageserverConnectionInfo from a comma-separated list of libpq connection strings. /// /// This is used for backwards-compatibility, to parse the legacy /// [ComputeSpec::pageserver_connstring] field, or the 'neon.pageserver_connstring' GUC. Nowadays, /// the 'pageserver_connection_info' field should be used instead. 
// Builds the connection info from a comma-separated list of connection strings.
//
// Every comma-separated piece becomes one shard with exactly one pageserver that has
// only a `libpq_url` (no id, no gRPC URL). The preferred protocol is always libpq.
//
// Errors:
// - more than one piece but `stripe_size` is `None`;
// - the number of pieces does not convert into the integer wrapped by `ShardCount`.
pub fn from_connstr(
    connstr: &str,
    stripe_size: Option,
) -> Result {
    let shard_infos: Vec<_> = connstr
        .split(',')
        .map(|connstr| PageserverShardInfo {
            pageservers: vec![PageserverShardConnectionInfo {
                id: None,
                libpq_url: Some(connstr.to_string()),
                grpc_url: None,
            }],
        })
        .collect();

    match shard_infos.len() {
        // NOTE(review): `str::split` yields at least one item even for an empty input,
        // so this arm looks unreachable; an empty `connstr` appears to fall into the
        // `1` arm below with an empty `libpq_url`. Confirm, and consider rejecting
        // empty input (and empty pieces) explicitly.
        0 => anyhow::bail!("empty connection string"),
        1 => {
            // We assume that if there's only connection string, it means "unsharded",
            // rather than a sharded system with just a single shard. The latter is
            // possible in principle, but we never do it.
            let shard_count = ShardCount::unsharded();
            let only_shard = shard_infos.first().unwrap().clone();
            let shards = vec![(ShardIndex::unsharded(), only_shard)];
            Ok(PageserverConnectionInfo {
                shard_count,
                // The caller's `stripe_size` is deliberately dropped for the unsharded case.
                stripe_size: None,
                shards: shards.into_iter().collect(),
                prefer_protocol: PageserverProtocol::Libpq,
            })
        }
        n => {
            if stripe_size.is_none() {
                anyhow::bail!("{n} shards but no stripe_size");
            }
            // Fails (via `?`) when `n` does not fit the integer inside `ShardCount`.
            let shard_count = ShardCount(n.try_into()?);
            // Shards are numbered by their position in the input list.
            let shards = shard_infos
                .into_iter()
                .enumerate()
                .map(|(idx, shard_info)| {
                    (
                        ShardIndex {
                            shard_count,
                            shard_number: ShardNumber(
                                // presumably cannot fire: idx < n and n already converted
                                // above — verify both wrap the same integer type
                                idx.try_into().expect("shard number fits in u8"),
                            ),
                        },
                        shard_info,
                    )
                })
                .collect();
            Ok(PageserverConnectionInfo {
                shard_count,
                stripe_size,
                shards,
                prefer_protocol: PageserverProtocol::Libpq,
            })
        }
    }
}
(As of this writing, we never have more than // one pageserver per shard anyway, but that will change in the future.) let pageserver = shard .pageservers .first() .ok_or(anyhow::anyhow!("must have at least one pageserver"))?; let result = match protocol { PageserverProtocol::Grpc => pageserver .grpc_url .as_ref() .ok_or(anyhow::anyhow!("no grpc_url for shard {shard_index}"))?, PageserverProtocol::Libpq => pageserver .libpq_url .as_ref() .ok_or(anyhow::anyhow!("no libpq_url for shard {shard_index}"))?, }; Ok(result) } } #[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)] pub struct PageserverShardInfo { pub pageservers: Vec, } #[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)] pub struct PageserverShardConnectionInfo { pub id: Option, pub libpq_url: Option, pub grpc_url: Option, } #[derive(Clone, Debug, Default, Deserialize, Serialize)] pub struct RemoteExtSpec { pub public_extensions: Option>, pub custom_extensions: Option>, pub library_index: HashMap, pub extension_data: HashMap, } #[derive(Clone, Debug, Serialize, Deserialize)] pub struct ExtensionData { pub control_data: HashMap, pub archive_path: String, } impl RemoteExtSpec { pub fn get_ext( &self, ext_name: &str, is_library: bool, build_tag: &str, pg_major_version: &str, ) -> anyhow::Result<(String, RemotePath)> { let mut real_ext_name = ext_name; if is_library { // sometimes library names might have a suffix like // library.so or library.so.3. We strip this off // because library_index is based on the name without the file extension let strip_lib_suffix = Regex::new(r"\.so.*").unwrap(); let lib_raw_name = strip_lib_suffix.replace(real_ext_name, "").to_string(); real_ext_name = self .library_index .get(&lib_raw_name) .ok_or(anyhow::anyhow!("library {} is not found", lib_raw_name))?; } // Check if extension is present in public or custom. // If not, then it is not allowed to be used by this compute. 
if !self .public_extensions .as_ref() .is_some_and(|exts| exts.iter().any(|e| e == real_ext_name)) && !self .custom_extensions .as_ref() .is_some_and(|exts| exts.iter().any(|e| e == real_ext_name)) { return Err(anyhow::anyhow!("extension {} is not found", real_ext_name)); } match self.extension_data.get(real_ext_name) { Some(_ext_data) => Ok(( real_ext_name.to_string(), Self::build_remote_path(build_tag, pg_major_version, real_ext_name)?, )), None => Err(anyhow::anyhow!( "real_ext_name {} is not found", real_ext_name )), } } /// Get the architecture-specific portion of the remote extension path. We /// use the Go naming convention due to Kubernetes. fn get_arch() -> &'static str { match std::env::consts::ARCH { "x86_64" => "amd64", "aarch64" => "arm64", arch => arch, } } /// Build a [`RemotePath`] for an extension. fn build_remote_path( build_tag: &str, pg_major_version: &str, ext_name: &str, ) -> anyhow::Result { let arch = Self::get_arch(); // Construct the path to the extension archive // BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst // // Keep it in sync with path generation in // https://github.com/neondatabase/build-custom-extensions/tree/main RemotePath::from_string(&format!( "{build_tag}/{arch}/{pg_major_version}/extensions/{ext_name}.tar.zst" )) } } #[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] pub enum ComputeMode { /// A read-write node #[default] Primary, /// A read-only node, pinned at a particular LSN Static(Lsn), /// A read-only node that follows the tip of the branch in hot standby mode /// /// Future versions may want to distinguish between replicas with hot standby /// feedback and other kinds of replication configurations. Replica, } impl ComputeMode { /// Convert the compute mode to a string that can be used to identify the type of compute, /// which means that if it's a static compute, the LSN will not be included. 
/// Log level for audit logging
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
pub enum ComputeAudit {
    /// Audit logging is off. This is the default level.
    #[default]
    Disabled,
    /// Deprecated, use `Base` instead.
    Log,
    /// (pgaudit.log = 'ddl', pgaudit.log_parameter='off')
    /// logged to the standard postgresql log stream
    Base,
    /// Deprecated, use `Full` or `Extended` instead.
    Hipaa,
    /// (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off')
    /// logged to separate files collected by rsyslog
    /// into dedicated log storage with strict access
    Extended,
    /// (pgaudit.log='all', pgaudit.log_parameter='on'),
    /// logged to separate files collected by rsyslog
    /// into dedicated log storage with strict access.
    Full,
}
/// Postgres compute TLS settings.
///
/// All three paths are required: none of the fields has a serde default.
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)]
pub struct PgComputeTlsSettings {
    /// Absolute path to the certificate file for server-side TLS.
    pub cert_file: String,
    /// Absolute path to the private key file for server-side TLS.
    pub key_file: String,
    /// Absolute path to the certificate authority file for verifying client certificates.
    pub ca_file: String,
}
pub databricks_pg_ident: String, // Hostname portion of the Databricks workspace URL of the endpoint, or empty string if not known. // A valid hostname is required for the compute instance to support PAT logins. pub databricks_workspace_host: String, } /// Optional collection of `GenericOption`'s. Type alias allows us to /// declare a `trait` on it. pub type GenericOptions = Option>; /// Configured the local_proxy application with the relevant JWKS and roles it should /// use for authorizing connect requests using JWT. #[derive(Clone, Debug, Deserialize, Serialize)] pub struct LocalProxySpec { #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub jwks: Option>, #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub tls: Option, } #[derive(Clone, Debug, Deserialize, Serialize)] pub struct JwksSettings { pub id: String, pub role_names: Vec, pub jwks_url: String, pub provider_name: String, pub jwt_audience: Option, } /// Protocol used to connect to a Pageserver. #[derive(Clone, Copy, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] pub enum PageserverProtocol { /// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme. #[default] #[serde(rename = "libpq")] Libpq, /// A newer, gRPC-based protocol. Uses grpc:// scheme. #[serde(rename = "grpc")] Grpc, } impl PageserverProtocol { /// Parses the protocol from a connstring scheme. Defaults to Libpq if no scheme is given. /// Errors if the connstring is an invalid URL. 
pub fn from_connstring(connstring: &str) -> anyhow::Result { let scheme = match Url::parse(connstring) { Ok(url) => url.scheme().to_lowercase(), Err(url::ParseError::RelativeUrlWithoutBase) => return Ok(Self::default()), Err(err) => return Err(anyhow!("invalid connstring URL: {err}")), }; match scheme.as_str() { "postgresql" | "postgres" => Ok(Self::Libpq), "grpc" => Ok(Self::Grpc), scheme => Err(anyhow!("invalid protocol scheme: {scheme}")), } } /// Returns the URL scheme for the protocol, for use in connstrings. pub fn scheme(&self) -> &'static str { match self { Self::Libpq => "postgresql", Self::Grpc => "grpc", } } } impl Display for PageserverProtocol { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.write_str(self.scheme()) } } #[cfg(test)] mod tests { use std::fs::File; use super::*; #[test] fn allow_installing_remote_extensions() { let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ "public_extensions": null, "custom_extensions": null, "library_index": {}, "extension_data": {}, })) .unwrap(); rspec .get_ext("ext", false, "latest", "v17") .expect_err("Extension should not be found"); let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ "public_extensions": [], "custom_extensions": null, "library_index": {}, "extension_data": {}, })) .unwrap(); rspec .get_ext("ext", false, "latest", "v17") .expect_err("Extension should not be found"); let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ "public_extensions": [], "custom_extensions": [], "library_index": { "ext": "ext" }, "extension_data": { "ext": { "control_data": { "ext.control": "" }, "archive_path": "" } }, })) .unwrap(); rspec .get_ext("ext", false, "latest", "v17") .expect_err("Extension should not be found"); let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ "public_extensions": [], "custom_extensions": ["ext"], "library_index": { "ext": "ext" }, "extension_data": { "ext": { "control_data": { "ext.control": "" 
}, "archive_path": "" } }, })) .unwrap(); rspec .get_ext("ext", false, "latest", "v17") .expect("Extension should be found"); let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ "public_extensions": ["ext"], "custom_extensions": [], "library_index": { "extlib": "ext", }, "extension_data": { "ext": { "control_data": { "ext.control": "" }, "archive_path": "" } }, })) .unwrap(); rspec .get_ext("ext", false, "latest", "v17") .expect("Extension should be found"); // test library index for the case when library name // doesn't match the extension name rspec .get_ext("extlib", true, "latest", "v17") .expect("Library should be found"); } #[test] fn remote_extension_path() { let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ "public_extensions": ["ext"], "custom_extensions": [], "library_index": { "extlib": "ext", }, "extension_data": { "ext": { "control_data": { "ext.control": "" }, "archive_path": "" } }, })) .unwrap(); let (_ext_name, ext_path) = rspec .get_ext("ext", false, "latest", "v17") .expect("Extension should be found"); // Starting with a forward slash would have consequences for the // Url::join() that occurs when downloading a remote extension. assert!(!ext_path.to_string().starts_with("/")); assert_eq!( ext_path, RemoteExtSpec::build_remote_path("latest", "v17", "ext").unwrap() ); } #[test] fn parse_spec_file() { let file = File::open("tests/cluster_spec.json").unwrap(); let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); // Features list defaults to empty vector. assert!(spec.features.is_empty()); // Reconfigure concurrency defaults to 1. 
#[test]
fn parse_unknown_features() {
    // Feature flags this build has never heard of must parse without errors.
    let spec_file = File::open("tests/cluster_spec.json").unwrap();
    let mut doc: serde_json::Value = serde_json::from_reader(spec_file).unwrap();

    // Inject two flag names that match no `ComputeFeature` variant.
    doc.as_object_mut().unwrap().insert(
        "features".into(),
        vec!["foo_bar_feature", "baz_feature"].into(),
    );

    let spec: ComputeSpec = serde_json::from_value(doc).unwrap();

    // Both flags survive, each collapsed into the catch-all variant.
    assert!(spec.features.len() == 2);
    assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
    assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
}
let features = vec!["activity_monitor_experimental"]; ob.insert("features".into(), features.into()); let spec: ComputeSpec = serde_json::from_value(json).unwrap(); assert_eq!( spec.features, vec![ComputeFeature::ActivityMonitorExperimental] ); } } ================================================ FILE: libs/compute_api/tests/cluster_spec.json ================================================ { "format_version": 1.0, "timestamp": "2021-05-23T18:25:43.511Z", "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b", "suspend_timeout_seconds": 3600, "cluster": { "cluster_id": "test-cluster-42", "name": "Zenith Test", "state": "restarted", "roles": [ { "name": "postgres", "encrypted_password": "6b1d16b78004bbd51fa06af9eda75972", "options": null }, { "name": "alexk", "encrypted_password": null, "options": null }, { "name": "zenith \"new\"", "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972", "options": null }, { "name": "zen", "encrypted_password": "9b1d16b78004bbd51fa06af9eda75972" }, { "name": "\"name\";\\n select 1;", "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972" }, { "name": "MyRole", "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972" } ], "databases": [ { "name": "DB2", "owner": "alexk", "options": [ { "name": "LC_COLLATE", "value": "C", "vartype": "string" }, { "name": "LC_CTYPE", "value": "C", "vartype": "string" }, { "name": "TEMPLATE", "value": "template0", "vartype": "enum" } ] }, { "name": "zenith", "owner": "MyRole" }, { "name": "zen", "owner": "zen" } ], "settings": [ { "name": "fsync", "value": "off", "vartype": "bool" }, { "name": "wal_level", "value": "logical", "vartype": "enum" }, { "name": "hot_standby", "value": "on", "vartype": "bool" }, { "name": "autoprewarm", "value": "off", "vartype": "bool" }, { "name": "offload_lfc_interval_seconds", "value": "20", "vartype": "integer" }, { "name": "neon.safekeepers", "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501", "vartype": "string" }, { "name": "wal_log_hints", "value": "on", 
"vartype": "bool" }, { "name": "log_connections", "value": "on", "vartype": "bool" }, { "name": "shared_buffers", "value": "32768", "vartype": "integer" }, { "name": "port", "value": "55432", "vartype": "integer" }, { "name": "max_connections", "value": "100", "vartype": "integer" }, { "name": "max_wal_senders", "value": "10", "vartype": "integer" }, { "name": "listen_addresses", "value": "0.0.0.0", "vartype": "string" }, { "name": "wal_sender_timeout", "value": "0", "vartype": "integer" }, { "name": "password_encryption", "value": "md5", "vartype": "enum" }, { "name": "maintenance_work_mem", "value": "65536", "vartype": "integer" }, { "name": "max_parallel_workers", "value": "8", "vartype": "integer" }, { "name": "max_worker_processes", "value": "8", "vartype": "integer" }, { "name": "neon.tenant_id", "value": "b0554b632bd4d547a63b86c3630317e8", "vartype": "string" }, { "name": "max_replication_slots", "value": "10", "vartype": "integer" }, { "name": "neon.timeline_id", "value": "2414a61ffc94e428f14b5758fe308e13", "vartype": "string" }, { "name": "shared_preload_libraries", "value": "neon", "vartype": "string" }, { "name": "synchronous_standby_names", "value": "walproposer", "vartype": "string" }, { "name": "neon.pageserver_connstring", "value": "host=127.0.0.1 port=6400", "vartype": "string" }, { "name": "test.escaping", "value": "here's a backslash \\ and a quote ' and a double-quote \" hooray", "vartype": "string" } ] }, "delta_operations": [ { "action": "delete_db", "name": "zenith_test" }, { "action": "rename_db", "name": "DB", "new_name": "DB2" }, { "action": "delete_role", "name": "zenith2" }, { "action": "rename_role", "name": "zenith new", "new_name": "zenith \"new\"" } ], "remote_extensions": { "library_index": { "postgis-3": "postgis", "libpgrouting-3.4": "postgis", "postgis_raster-3": "postgis", "postgis_sfcgal-3": "postgis", "postgis_topology-3": "postgis", "address_standardizer-3": "postgis" }, "extension_data": { "postgis": { "archive_path": 
"5834329303/v15/extensions/postgis.tar.zst", "control_data": { "postgis.control": "# postgis extension\ncomment = ''PostGIS geometry and geography spatial types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis-3''\nrelocatable = false\ntrusted = true\n", "pgrouting.control": "# pgRouting Extension\ncomment = ''pgRouting Extension''\ndefault_version = ''3.4.2''\nmodule_pathname = ''$libdir/libpgrouting-3.4''\nrelocatable = true\nrequires = ''plpgsql''\nrequires = ''postgis''\ntrusted = true\n", "postgis_raster.control": "# postgis_raster extension\ncomment = ''PostGIS raster types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis_raster-3''\nrelocatable = false\nrequires = postgis\ntrusted = true\n", "postgis_sfcgal.control": "# postgis topology extension\ncomment = ''PostGIS SFCGAL functions''\ndefault_version = ''3.3.2''\nrelocatable = true\nrequires = postgis\ntrusted = true\n", "postgis_topology.control": "# postgis topology extension\ncomment = ''PostGIS topology spatial types and functions''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = topology\nrequires = postgis\ntrusted = true\n", "address_standardizer.control": "# address_standardizer extension\ncomment = ''Used to parse an address into constituent elements. 
Generally used to support geocoding address normalization step.''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n", "postgis_tiger_geocoder.control": "# postgis tiger geocoder extension\ncomment = ''PostGIS tiger geocoder and reverse geocoder''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = tiger\nrequires = ''postgis,fuzzystrmatch''\nsuperuser= false\ntrusted = true\n", "address_standardizer_data_us.control": "# address standardizer us dataset\ncomment = ''Address Standardizer US dataset example''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n" } } }, "custom_extensions": [ ], "public_extensions": [ "postgis" ] }, "pgbouncer_settings": { "default_pool_size": "42", "pool_mode": "session" } } ================================================ FILE: libs/consumption_metrics/Cargo.toml ================================================ [package] name = "consumption_metrics" version = "0.1.0" edition = "2024" license = "Apache-2.0" [dependencies] chrono = { workspace = true, features = ["serde"] } rand.workspace = true serde.workspace = true ================================================ FILE: libs/consumption_metrics/src/lib.rs ================================================ //! 
Shared code for consumption metrics collection
pub struct IdempotencyKey<'a> {
    /// Creation timestamp; `generate` fills it with `Utc::now()`. It is the first
    /// component of the rendered key.
    now: chrono::DateTime,
    /// Caller-supplied node identifier, borrowed for the lifetime of the key.
    node_id: &'a str,
    /// Disambiguating number. `generate` draws it from `0..=9999`, and `Display`
    /// renders it zero-padded to four digits.
    nonce: u16,
}
The original purpose of this library is to test the walproposer and safekeeper implementations working together, in scenarios close to a real-world environment. This simulator is deterministic and can inject networking failures without waiting minutes of wall-clock time for a timeout to trigger, which makes it easier to find bugs in our consensus implementation than with integration tests.
// Blocking receive: returns the message at the front of the queue, parking the calling
// simulated thread until one is available.
fn recv(&self) -> T {
    // interrupt the receiver to prevent consuming everything at once
    // (presumably `yield_me(0)` hands control back to the executor and asks to be
    // resumed on the next step — confirm against `executor::yield_me`)
    executor::yield_me(0);

    let mut queue = self.queue.lock();
    // Fast path: a message is already waiting.
    if let Some(t) = queue.pop_front() {
        return t;
    }
    loop {
        // Register for a wakeup BEFORE re-checking the queue, while the lock is still
        // held. `send` pushes under the same lock and then wakes all waiters, so a
        // message can not slip in unnoticed between the check and the park.
        self.waker.wake_me_later();
        if let Some(t) = queue.pop_front() {
            return t;
        }
        // Release the queue lock only for the duration of the park so that senders can
        // push; the guard is locked again when the closure returns.
        MutexGuard::unlocked(&mut queue, || {
            // A negative value schedules no wakeup of its own; we rely on the
            // registration made above.
            executor::yield_me(-1);
        });
    }
}
/// Spawn a new thread and register it in the runtime.
///
/// The new OS thread does not start running `f` right away: it first publishes its
/// context and parks itself, and `f` only starts once the runtime steps the thread.
/// Returns an [`ExternalHandle`] sharing the thread's context.
///
/// If `f` panics and panics are not allowed for this thread, the whole process exits
/// with code 1.
pub fn spawn(&mut self, f: F) -> ExternalHandle
where
    F: FnOnce() + Send + 'static,
{
    // One-shot channel used by the new thread to hand its context back to us.
    let (tx, rx) = mpsc::channel();

    let clock = self.clock.clone();
    // Thread ids are handed out sequentially, starting from 0.
    let tid = self.thread_counter.fetch_add(1, Ordering::SeqCst);
    debug!("spawning thread-{}", tid);

    let join = std::thread::spawn(move || {
        let _guard = tracing::info_span!("", tid).entered();

        // Catch panics so that the thread can still be marked as finished below.
        let res = std::panic::catch_unwind(AssertUnwindSafe(|| {
            // presumably `with_thread_context` exposes this thread's own context —
            // its definition is outside this view
            with_thread_context(|ctx| {
                // The clock may be installed only once per context.
                assert!(ctx.clock.set(clock).is_ok());
                ctx.id.store(tid, Ordering::SeqCst);
                tx.send(ctx.clone()).expect("failed to send thread context");
                // suspend thread to put it to `threads` in sleeping state
                ctx.yield_me(0);
            });

            // start user-provided function
            f();
        }));
        debug!("thread finished");

        if let Err(e) = res {
            with_thread_context(|ctx| {
                // `allow_panic` starts out false and is set by `crash_stop`; an
                // unexpected panic takes the whole process down.
                if !ctx.allow_panic.load(std::sync::atomic::Ordering::SeqCst) {
                    error!("thread panicked, terminating the process: {:?}", e);
                    std::process::exit(1);
                }

                debug!("thread panicked: {:?}", e);
                let mut result = ctx.result.lock();
                // -1 is the initial "no result yet" code; never overwrite a result
                // that has already been recorded.
                if result.0 == -1 {
                    *result = (256, format!("thread panicked: {e:?}"));
                }
            });
        }

        // Runs on both the normal and the panic path.
        with_thread_context(|ctx| {
            ctx.finish_me();
        });
    });

    // Blocks until the new thread has published its context.
    let ctx = rx.recv().expect("failed to receive thread context");
    // `ThreadHandle::new` waits until the thread has parked itself in `Sleep`.
    let handle = ThreadHandle::new(ctx.clone(), join);

    self.threads.push(handle);

    ExternalHandle { ctx }
}
let mut ran = false; self.threads.retain(|thread: &ThreadHandle| { let res = thread.ctx.wakeup.compare_exchange( PENDING_WAKEUP, NO_WAKEUP, Ordering::SeqCst, Ordering::SeqCst, ); if res.is_err() { // thread has no pending wakeups, leaving as is return true; } ran = true; trace!("entering thread-{}", thread.ctx.tid()); let status = thread.step(); self.step_counter += 1; trace!( "out of thread-{} with status {:?}", thread.ctx.tid(), status ); if status == Status::Sleep { true } else { trace!("thread has finished"); // removing the thread from the list false } }); if !ran { trace!("no threads were run, stepping clock"); if let Some(ctx_to_wake) = self.clock.step() { trace!("waking up thread-{}", ctx_to_wake.tid()); ctx_to_wake.inc_wake(); } else { return false; } } true } /// Kill all threads. This is done by setting a flag in each thread context and waking it up. pub fn crash_all_threads(&mut self) { for thread in self.threads.iter() { thread.ctx.crash_stop(); } // all threads should be finished after a few steps while !self.threads.is_empty() { self.step(); } } } impl Drop for Runtime { fn drop(&mut self) { debug!("dropping the runtime"); self.crash_all_threads(); } } #[derive(Clone)] pub struct ExternalHandle { ctx: Arc, } impl ExternalHandle { /// Returns true if thread has finished execution. pub fn is_finished(&self) -> bool { let status = self.ctx.mutex.lock(); *status == Status::Finished } /// Returns exitcode and message, which is available after thread has finished execution. pub fn result(&self) -> (i32, String) { let result = self.ctx.result.lock(); result.clone() } /// Returns thread id. pub fn id(&self) -> u32 { self.ctx.id.load(Ordering::SeqCst) } /// Sets a flag to crash thread on the next wakeup. pub fn crash_stop(&self) { self.ctx.crash_stop(); } } struct ThreadHandle { ctx: Arc, _join: JoinHandle<()>, } impl ThreadHandle { /// Create a new [`ThreadHandle`] and wait until thread will enter [`Status::Sleep`] state. 
fn new(ctx: Arc, join: JoinHandle<()>) -> Self { let mut status = ctx.mutex.lock(); // wait until thread will go into the first yield while *status != Status::Sleep { ctx.condvar.wait(&mut status); } drop(status); Self { ctx, _join: join } } /// Allows thread to execute one step of its execution. /// Returns [`Status`] of the thread after the step. fn step(&self) -> Status { let mut status = self.ctx.mutex.lock(); assert!(matches!(*status, Status::Sleep)); *status = Status::Running; self.ctx.condvar.notify_all(); while *status == Status::Running { self.ctx.condvar.wait(&mut status); } *status } } #[derive(Clone, Copy, Debug, PartialEq, Eq)] enum Status { /// Thread is running. Running, /// Waiting for event to complete, will be resumed by the executor step, once wakeup flag is set. Sleep, /// Thread finished execution. Finished, } const NO_WAKEUP: u8 = 0; const PENDING_WAKEUP: u8 = 1; pub struct ThreadContext { id: AtomicU32, // used to block thread until it is woken up mutex: parking_lot::Mutex, condvar: parking_lot::Condvar, // used as a flag to indicate runtime that thread is ready to be woken up wakeup: AtomicU8, clock: OnceLock>, // execution result, set by exit() call result: parking_lot::Mutex<(i32, String)>, // determines if process should be killed on receiving panic allow_panic: AtomicBool, // acts as a signal that thread should crash itself on the next wakeup crash_request: AtomicBool, } impl ThreadContext { pub(crate) fn new() -> Self { Self { id: AtomicU32::new(0), mutex: parking_lot::Mutex::new(Status::Running), condvar: parking_lot::Condvar::new(), wakeup: AtomicU8::new(NO_WAKEUP), clock: OnceLock::new(), result: parking_lot::Mutex::new((-1, String::new())), allow_panic: AtomicBool::new(false), crash_request: AtomicBool::new(false), } } } // Functions for executor to control thread execution. impl ThreadContext { /// Set atomic flag to indicate that thread is ready to be woken up. 
fn inc_wake(&self) { self.wakeup.store(PENDING_WAKEUP, Ordering::SeqCst); } /// Internal function used for event queues. pub(crate) fn schedule_wakeup(self: &Arc, after_ms: u64) { self.clock .get() .unwrap() .schedule_wakeup(after_ms, self.clone()); } fn tid(&self) -> u32 { self.id.load(Ordering::SeqCst) } fn crash_stop(&self) { let status = self.mutex.lock(); if *status == Status::Finished { debug!( "trying to crash thread-{}, which is already finished", self.tid() ); return; } assert!(matches!(*status, Status::Sleep)); drop(status); self.allow_panic.store(true, Ordering::SeqCst); self.crash_request.store(true, Ordering::SeqCst); // set a wakeup self.inc_wake(); // it will panic on the next wakeup } } // Internal functions. impl ThreadContext { /// Blocks thread until it's woken up by the executor. If `after_ms` is 0, is will be /// woken on the next step. If `after_ms` > 0, wakeup is scheduled after that time. /// Otherwise wakeup is not scheduled inside `yield_me`, and should be arranged before /// calling this function. fn yield_me(self: &Arc, after_ms: i64) { let mut status = self.mutex.lock(); assert!(matches!(*status, Status::Running)); match after_ms.cmp(&0) { std::cmp::Ordering::Less => { // block until something wakes us up } std::cmp::Ordering::Equal => { // tell executor that we are ready to be woken up self.inc_wake(); } std::cmp::Ordering::Greater => { // schedule wakeup self.clock .get() .unwrap() .schedule_wakeup(after_ms as u64, self.clone()); } } *status = Status::Sleep; self.condvar.notify_all(); // wait until executor wakes us up while *status != Status::Running { self.condvar.wait(&mut status); } if self.crash_request.load(Ordering::SeqCst) { panic!("crashed by request"); } } /// Called only once, exactly before thread finishes execution. 
fn finish_me(&self) {
        let mut status = self.mutex.lock();
        assert!(matches!(*status, Status::Running));

        *status = Status::Finished;
        {
            let mut result = self.result.lock();
            // -1 means no result was recorded by `exit()` or the panic handler.
            if result.0 == -1 {
                *result = (0, "finished normally".to_owned());
            }
        }
        self.condvar.notify_all();
    }
}

/// Invokes the given closure with a reference to the current thread [`ThreadContext`].
#[inline(always)]
fn with_thread_context<T>(f: impl FnOnce(&Arc<ThreadContext>) -> T) -> T {
    thread_local!(static THREAD_DATA: Arc<ThreadContext> = Arc::new(ThreadContext::new()));
    THREAD_DATA.with(f)
}

/// Waker is used to wake up threads that are blocked on condition.
/// It keeps track of contexts [`Arc<ThreadContext>`] and can increment the counter
/// of several contexts to send a notification.
pub struct Waker {
    // contexts that are waiting for a notification
    contexts: parking_lot::Mutex<smallvec::SmallVec<[Arc<ThreadContext>; 8]>>,
}

impl Default for Waker {
    fn default() -> Self {
        Self::new()
    }
}

impl Waker {
    pub fn new() -> Self {
        Self {
            contexts: parking_lot::Mutex::new(smallvec::SmallVec::new()),
        }
    }

    /// Subscribe current thread to receive a wake notification later.
    pub fn wake_me_later(&self) {
        with_thread_context(|ctx| {
            self.contexts.lock().push(ctx.clone());
        });
    }

    /// Wake up all threads that are waiting for a notification and clear the list.
    pub fn wake_all(&self) {
        let mut v = self.contexts.lock();
        for ctx in v.iter() {
            ctx.inc_wake();
        }
        v.clear();
    }
}

/// See [`ThreadContext::yield_me`].
pub fn yield_me(after_ms: i64) {
    with_thread_context(|ctx| ctx.yield_me(after_ms))
}

/// Get current time.
pub fn now() -> u64 {
    with_thread_context(|ctx| ctx.clock.get().unwrap().now())
}

/// Record `(code, msg)` as the current thread's result and terminate the
/// thread by panicking; `allow_panic` is set first so the panic is allowed.
pub fn exit(code: i32, msg: String) -> ! {
    with_thread_context(|ctx| {
        ctx.allow_panic.store(true, Ordering::SeqCst);
        let mut result = ctx.result.lock();
        *result = (code, msg);
        panic!("exit");
    })
}

pub(crate) fn get_thread_ctx() -> Arc<ThreadContext> {
    with_thread_context(|ctx| ctx.clone())
}

/// Trait for polling channels until they have something.
pub trait PollSome {
    /// Schedule wakeup for message arrival.
fn wake_me(&self);

    /// Check if channel has a ready message.
    fn has_some(&self) -> bool;
}

/// Blocks current thread until one of the channels has a ready message. Returns
/// index of the channel that has a message. If timeout is reached, returns None.
///
/// Negative timeout means block forever. Zero timeout means check channels and return
/// immediately. Positive timeout means block until timeout is reached.
pub fn epoll_chans(chans: &[Box<dyn PollSome>], timeout: i64) -> Option<usize> {
    // The deadline is unused when blocking forever (timeout < 0).
    let deadline = if timeout < 0 {
        0
    } else {
        now() + timeout as u64
    };

    loop {
        // Subscribe before checking, then check every channel.
        for chan in chans {
            chan.wake_me()
        }

        for (i, chan) in chans.iter().enumerate() {
            if chan.has_some() {
                return Some(i);
            }
        }

        if timeout < 0 {
            // block until wakeup
            yield_me(-1);
        } else {
            let current_time = now();
            if current_time >= deadline {
                return None;
            }

            yield_me((deadline - current_time) as i64);
        }
    }
}

================================================
FILE: libs/desim/src/lib.rs
================================================
pub mod chan;
pub mod executor;
pub mod network;
pub mod node_os;
pub mod options;
pub mod proto;
pub mod time;
pub mod world;

================================================
FILE: libs/desim/src/network.rs
================================================
use std::cmp::Ordering;
use std::collections::{BinaryHeap, VecDeque};
use std::fmt::{self, Debug};
use std::ops::DerefMut;
use std::sync::{Arc, mpsc};

use parking_lot::lock_api::{MappedMutexGuard, MutexGuard};
use parking_lot::{Mutex, RawMutex};
use rand::rngs::StdRng;
use tracing::debug;

use super::chan::Chan;
use super::proto::AnyMessage;
use crate::executor::{self, ThreadContext};
use crate::options::NetworkOptions;
use crate::proto::{NetEvent, NodeEvent};

pub struct NetworkTask {
    options: Arc<NetworkOptions>,
    connections: Mutex<Vec<VirtualConnection>>,
    /// min-heap of connections having something to deliver.
    events: Mutex<BinaryHeap<Event>>,
    task_context: Arc<ThreadContext>,
}

impl NetworkTask {
    /// Create the network task on the current thread, send it through `tx`
    /// and enter its processing loop.
    pub fn start_new(options: Arc<NetworkOptions>, tx: mpsc::Sender<Arc<NetworkTask>>) {
        let ctx = executor::get_thread_ctx();
        let task = Arc::new(Self {
            options,
            connections: Mutex::new(Vec::new()),
            events: Mutex::new(BinaryHeap::new()),
            task_context: ctx,
        });

        // send the task upstream
        tx.send(task.clone()).unwrap();

        // start the task
        task.start();
    }

    /// Register a new virtual connection whose Accept is delivered to
    /// `dst_accept`, and return the client end of it.
    pub fn start_new_connection(self: &Arc<Self>, rng: StdRng, dst_accept: Chan<NodeEvent>) -> TCP {
        let now = executor::now();
        let connection_id = self.connections.lock().len();

        let vc = VirtualConnection {
            connection_id,
            dst_accept,
            dst_sockets: [Chan::new(), Chan::new()],
            state: Mutex::new(ConnectionState {
                buffers: [NetworkBuffer::new(None), NetworkBuffer::new(Some(now))],
                rng,
            }),
        };
        vc.schedule_timeout(self);
        vc.send_connect(self);

        let recv_chan = vc.dst_sockets[0].clone();
        self.connections.lock().push(vc);

        TCP {
            net: self.clone(),
            conn_id: connection_id,
            dir: 0,
            recv_chan,
        }
    }
}

// private functions
impl NetworkTask {
    /// Schedule to wakeup network task (self) `after_ms` later to deliver
    /// messages of connection `id`.
    fn schedule(&self, id: usize, after_ms: u64) {
        self.events.lock().push(Event {
            time: executor::now() + after_ms,
            conn_id: id,
        });
        self.task_context.schedule_wakeup(after_ms);
    }

    /// Get locked connection `id`.
    fn get(&self, id: usize) -> MappedMutexGuard<'_, RawMutex, VirtualConnection> {
        MutexGuard::map(self.connections.lock(), |connections| {
            connections.get_mut(id).unwrap()
        })
    }

    /// Replace the contents of `vec` with all events whose time is `<= now`.
    fn collect_pending_events(&self, now: u64, vec: &mut Vec<Event>) {
        vec.clear();
        let mut events = self.events.lock();
        while let Some(event) = events.peek() {
            if event.time > now {
                break;
            }
            let event = events.pop().unwrap();
            vec.push(event);
        }
    }

    /// Main loop of the network task; never returns.
    fn start(self: &Arc<Self>) {
        debug!("started network task");

        let mut events = Vec::new();
        loop {
            let now = executor::now();
            self.collect_pending_events(now, &mut events);

            for event in events.drain(..) {
                let conn = self.get(event.conn_id);
                conn.process(self);
            }

            // block until wakeup
            executor::yield_me(-1);
        }
    }
}

// 0 - from node(0) to node(1)
// 1 - from node(1) to node(0)
type MessageDirection = u8;

fn sender_str(dir: MessageDirection) -> &'static str {
    match dir {
        0 => "client",
        1 => "server",
        _ => unreachable!(),
    }
}

fn receiver_str(dir: MessageDirection) -> &'static str {
    match dir {
        0 => "server",
        1 => "client",
        _ => unreachable!(),
    }
}

/// Virtual connection between two nodes.
/// Node 0 is the creator of the connection (client),
/// and node 1 is the acceptor (server).
struct VirtualConnection {
    connection_id: usize,
    /// one-off chan, used to deliver Accept message to dst
    dst_accept: Chan<NodeEvent>,
    /// message sinks
    dst_sockets: [Chan<NetEvent>; 2],
    state: Mutex<ConnectionState>,
}

struct ConnectionState {
    buffers: [NetworkBuffer; 2],
    rng: StdRng,
}

impl VirtualConnection {
    /// Notify the future about the possible timeout.
    fn schedule_timeout(&self, net: &NetworkTask) {
        if let Some(timeout) = net.options.keepalive_timeout {
            net.schedule(self.connection_id, timeout);
        }
    }

    /// Send the handshake (Accept) to the server.
    fn send_connect(&self, net: &NetworkTask) {
        let now = executor::now();
        let mut state = self.state.lock();
        let delay = net.options.connect_delay.delay(&mut state.rng);
        let buffer = &mut state.buffers[0];
        assert!(buffer.buf.is_empty());
        assert!(!buffer.recv_closed);
        assert!(!buffer.send_closed);
        assert!(buffer.last_recv.is_none());

        let delay = if let Some(ms) = delay {
            ms
        } else {
            debug!("NET: TCP #{} dropped connect", self.connection_id);
            buffer.send_closed = true;
            return;
        };

        // Send a message into the future.
        buffer
            .buf
            .push_back((now + delay, AnyMessage::InternalConnect));
        net.schedule(self.connection_id, delay);
    }

    /// Transmit some of the messages from the buffer to the nodes.
fn process(&self, net: &Arc) { let now = executor::now(); let mut state = self.state.lock(); for direction in 0..2 { self.process_direction( net, state.deref_mut(), now, direction as MessageDirection, &self.dst_sockets[direction ^ 1], ); } // Close the one side of the connection by timeout if the node // has not received any messages for a long time. if let Some(timeout) = net.options.keepalive_timeout { let mut to_close = [false, false]; for direction in 0..2 { let buffer = &mut state.buffers[direction]; if buffer.recv_closed { continue; } if let Some(last_recv) = buffer.last_recv { if now - last_recv >= timeout { debug!( "NET: connection {} timed out at {}", self.connection_id, receiver_str(direction as MessageDirection) ); let node_idx = direction ^ 1; to_close[node_idx] = true; } } } drop(state); for (node_idx, should_close) in to_close.iter().enumerate() { if *should_close { self.close(node_idx); } } } } /// Process messages in the buffer in the given direction. fn process_direction( &self, net: &Arc, state: &mut ConnectionState, now: u64, direction: MessageDirection, to_socket: &Chan, ) { let buffer = &mut state.buffers[direction as usize]; if buffer.recv_closed { assert!(buffer.buf.is_empty()); } while !buffer.buf.is_empty() && buffer.buf.front().unwrap().0 <= now { let msg = buffer.buf.pop_front().unwrap().1; buffer.last_recv = Some(now); self.schedule_timeout(net); if let AnyMessage::InternalConnect = msg { // TODO: assert to_socket is the server let server_to_client = TCP { net: net.clone(), conn_id: self.connection_id, dir: direction ^ 1, recv_chan: to_socket.clone(), }; // special case, we need to deliver new connection to a separate channel self.dst_accept.send(NodeEvent::Accept(server_to_client)); } else { to_socket.send(NetEvent::Message(msg)); } } } /// Try to send a message to the buffer, optionally dropping it and /// determining delivery timestamp. 
fn send(&self, net: &NetworkTask, direction: MessageDirection, msg: AnyMessage) { let now = executor::now(); let mut state = self.state.lock(); let (delay, close) = if let Some(ms) = net.options.send_delay.delay(&mut state.rng) { (ms, false) } else { (0, true) }; let buffer = &mut state.buffers[direction as usize]; if buffer.send_closed { debug!( "NET: TCP #{} dropped message {:?} (broken pipe)", self.connection_id, msg ); return; } if close { debug!( "NET: TCP #{} dropped message {:?} (pipe just broke)", self.connection_id, msg ); buffer.send_closed = true; return; } if buffer.recv_closed { debug!( "NET: TCP #{} dropped message {:?} (recv closed)", self.connection_id, msg ); return; } // Send a message into the future. buffer.buf.push_back((now + delay, msg)); net.schedule(self.connection_id, delay); } /// Close the connection. Only one side of the connection will be closed, /// and no further messages will be delivered. The other side will not be notified. fn close(&self, node_idx: usize) { let mut state = self.state.lock(); let recv_buffer = &mut state.buffers[1 ^ node_idx]; if recv_buffer.recv_closed { debug!( "NET: TCP #{} closed twice at {}", self.connection_id, sender_str(node_idx as MessageDirection), ); return; } debug!( "NET: TCP #{} closed at {}", self.connection_id, sender_str(node_idx as MessageDirection), ); recv_buffer.recv_closed = true; for msg in recv_buffer.buf.drain(..) { debug!( "NET: TCP #{} dropped message {:?} (closed)", self.connection_id, msg ); } let send_buffer = &mut state.buffers[node_idx]; send_buffer.send_closed = true; drop(state); // TODO: notify the other side? self.dst_sockets[node_idx].send(NetEvent::Closed); } } struct NetworkBuffer { /// Messages paired with time of delivery buf: VecDeque<(u64, AnyMessage)>, /// True if the connection is closed on the receiving side, /// i.e. no more messages from the buffer will be delivered. recv_closed: bool, /// True if the connection is closed on the sending side, /// i.e. 
no more messages will be added to the buffer. send_closed: bool, /// Last time a message was delivered from the buffer. /// If None, it means that the server is the receiver and /// it has not yet aware of this connection (i.e. has not /// received the Accept). last_recv: Option, } impl NetworkBuffer { fn new(last_recv: Option) -> Self { Self { buf: VecDeque::new(), recv_closed: false, send_closed: false, last_recv, } } } /// Single end of a bidirectional network stream without reordering (TCP-like). /// Reads are implemented using channels, writes go to the buffer inside VirtualConnection. pub struct TCP { net: Arc, conn_id: usize, dir: MessageDirection, recv_chan: Chan, } impl Debug for TCP { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "TCP #{} ({})", self.conn_id, sender_str(self.dir),) } } impl TCP { /// Send a message to the other side. It's guaranteed that it will not arrive /// before the arrival of all messages sent earlier. pub fn send(&self, msg: AnyMessage) { let conn = self.net.get(self.conn_id); conn.send(&self.net, self.dir, msg); } /// Get a channel to receive incoming messages. pub fn recv_chan(&self) -> Chan { self.recv_chan.clone() } pub fn connection_id(&self) -> usize { self.conn_id } pub fn close(&self) { let conn = self.net.get(self.conn_id); conn.close(self.dir as usize); } } struct Event { time: u64, conn_id: usize, } // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here // to get that. 
impl PartialOrd for Event {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for Event {
    fn cmp(&self, other: &Self) -> Ordering {
        // `other` comes first on purpose: this reverses the natural order.
        (other.time, other.conn_id).cmp(&(self.time, self.conn_id))
    }
}

impl PartialEq for Event {
    fn eq(&self, other: &Self) -> bool {
        (other.time, other.conn_id) == (self.time, self.conn_id)
    }
}

impl Eq for Event {}

================================================
FILE: libs/desim/src/node_os.rs
================================================
use std::sync::Arc;

use rand::Rng;

use super::chan::Chan;
use super::network::TCP;
use super::world::{Node, NodeId, World};
use crate::proto::NodeEvent;

/// Abstraction with all functions (aka syscalls) available to the node.
#[derive(Clone)]
pub struct NodeOs {
    world: Arc<World>,
    internal: Arc<Node>,
}

impl NodeOs {
    pub fn new(world: Arc<World>, internal: Arc<Node>) -> NodeOs {
        NodeOs { world, internal }
    }

    /// Get the node id.
    pub fn id(&self) -> NodeId {
        self.internal.id
    }

    /// Opens a bidirectional connection with the other node. Always successful.
    pub fn open_tcp(&self, dst: NodeId) -> TCP {
        self.world.open_tcp(dst)
    }

    /// Returns a channel to receive node events (socket Accept and internal messages).
    pub fn node_events(&self) -> Chan<NodeEvent> {
        self.internal.node_events()
    }

    /// Get current time.
    pub fn now(&self) -> u64 {
        self.world.now()
    }

    /// Generate a random number in range [0, max).
    pub fn random(&self, max: u64) -> u64 {
        self.internal.rng.lock().random_range(0..max)
    }

    /// Append a new event to the world event log.
    pub fn log_event(&self, data: String) {
        self.internal.log_event(data)
    }
}

================================================
FILE: libs/desim/src/options.rs
================================================
use rand::Rng;
use rand::rngs::StdRng;

/// Describes random delays and failures. Delay will be uniformly distributed in [min, max].
/// Connection failure will occur with the probability fail_prob.
#[derive(Clone, Debug)] pub struct Delay { pub min: u64, pub max: u64, pub fail_prob: f64, // [0; 1] } impl Delay { /// Create a struct with no delay, no failures. pub fn empty() -> Delay { Delay { min: 0, max: 0, fail_prob: 0.0, } } /// Create a struct with a fixed delay. pub fn fixed(ms: u64) -> Delay { Delay { min: ms, max: ms, fail_prob: 0.0, } } /// Generate a random delay in range [min, max]. Return None if the /// message should be dropped. pub fn delay(&self, rng: &mut StdRng) -> Option { if rng.random_bool(self.fail_prob) { return None; } Some(rng.random_range(self.min..=self.max)) } } /// Describes network settings. All network packets will be subjected to the same delays and failures. #[derive(Clone, Debug)] pub struct NetworkOptions { /// Connection will be automatically closed after this timeout if no data is received. pub keepalive_timeout: Option, /// New connections will be delayed by this amount of time. pub connect_delay: Delay, /// Each message will be delayed by this amount of time. pub send_delay: Delay, } ================================================ FILE: libs/desim/src/proto.rs ================================================ use std::fmt::Debug; use bytes::Bytes; use utils::lsn::Lsn; use crate::network::TCP; use crate::world::NodeId; /// Internal node events. #[derive(Debug)] pub enum NodeEvent { Accept(TCP), Internal(AnyMessage), } /// Events that are coming from a network socket. #[derive(Clone, Debug)] pub enum NetEvent { Message(AnyMessage), Closed, } /// Custom events generated throughout the simulation. Can be used by the test to verify the correctness. #[derive(Debug)] pub struct SimEvent { pub time: u64, pub node: NodeId, pub data: String, } /// Umbrella type for all possible flavours of messages. These events can be sent over network /// or to an internal node events channel. #[derive(Clone)] pub enum AnyMessage { /// Not used, empty placeholder. None, /// Used internally for notifying node about new incoming connection. 
InternalConnect, Just32(u32), ReplCell(ReplCell), Bytes(Bytes), LSN(u64), } impl Debug for AnyMessage { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { AnyMessage::None => write!(f, "None"), AnyMessage::InternalConnect => write!(f, "InternalConnect"), AnyMessage::Just32(v) => write!(f, "Just32({v})"), AnyMessage::ReplCell(v) => write!(f, "ReplCell({v:?})"), AnyMessage::Bytes(v) => write!(f, "Bytes({})", hex::encode(v)), AnyMessage::LSN(v) => write!(f, "LSN({})", Lsn(*v)), } } } /// Used in reliable_copy_test.rs #[derive(Clone, Debug)] pub struct ReplCell { pub value: u32, pub client_id: u32, pub seqno: u32, } ================================================ FILE: libs/desim/src/time.rs ================================================ use std::cmp::Ordering; use std::collections::BinaryHeap; use std::ops::DerefMut; use std::sync::Arc; use std::sync::atomic::{AtomicU32, AtomicU64}; use parking_lot::Mutex; use tracing::trace; use crate::executor::ThreadContext; /// Holds current time and all pending wakeup events. pub struct Timing { /// Current world's time. current_time: AtomicU64, /// Pending timers. queue: Mutex>, /// Global nonce. Makes picking events from binary heap queue deterministic /// by appending a number to events with the same timestamp. nonce: AtomicU32, /// Used to schedule fake events. fake_context: Arc, } impl Default for Timing { fn default() -> Self { Self::new() } } impl Timing { /// Create a new empty clock with time set to 0. pub fn new() -> Timing { Timing { current_time: AtomicU64::new(0), queue: Mutex::new(BinaryHeap::new()), nonce: AtomicU32::new(0), fake_context: Arc::new(ThreadContext::new()), } } /// Return the current world's time. pub fn now(&self) -> u64 { self.current_time.load(std::sync::atomic::Ordering::SeqCst) } /// Tick-tock the global clock. Return the event ready to be processed /// or move the clock forward and then return the event. 
pub(crate) fn step(&self) -> Option> { let mut queue = self.queue.lock(); if queue.is_empty() { // no future events return None; } if !self.is_event_ready(queue.deref_mut()) { let next_time = queue.peek().unwrap().time; self.current_time .store(next_time, std::sync::atomic::Ordering::SeqCst); trace!("rewind time to {}", next_time); assert!(self.is_event_ready(queue.deref_mut())); } Some(queue.pop().unwrap().wake_context) } /// Append an event to the queue, to wakeup the thread in `ms` milliseconds. pub(crate) fn schedule_wakeup(&self, ms: u64, wake_context: Arc) { self.nonce.fetch_add(1, std::sync::atomic::Ordering::SeqCst); let nonce = self.nonce.load(std::sync::atomic::Ordering::SeqCst); self.queue.lock().push(Pending { time: self.now() + ms, nonce, wake_context, }) } /// Append a fake event to the queue, to prevent clocks from skipping this time. pub fn schedule_fake(&self, ms: u64) { self.queue.lock().push(Pending { time: self.now() + ms, nonce: 0, wake_context: self.fake_context.clone(), }); } /// Return true if there is a ready event. fn is_event_ready(&self, queue: &mut BinaryHeap) -> bool { queue.peek().is_some_and(|x| x.time <= self.now()) } /// Clear all pending events. pub(crate) fn clear(&self) { self.queue.lock().clear(); } } struct Pending { time: u64, nonce: u32, wake_context: Arc, } // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here // to get that. 
impl PartialOrd for Pending { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl Ord for Pending { fn cmp(&self, other: &Self) -> Ordering { (other.time, other.nonce).cmp(&(self.time, self.nonce)) } } impl PartialEq for Pending { fn eq(&self, other: &Self) -> bool { (other.time, other.nonce) == (self.time, self.nonce) } } impl Eq for Pending {} ================================================ FILE: libs/desim/src/world.rs ================================================ use std::ops::DerefMut; use std::sync::{Arc, mpsc}; use parking_lot::Mutex; use rand::SeedableRng; use rand::rngs::StdRng; use super::chan::Chan; use super::network::TCP; use super::node_os::NodeOs; use crate::executor::{ExternalHandle, Runtime}; use crate::network::NetworkTask; use crate::options::NetworkOptions; use crate::proto::{NodeEvent, SimEvent}; use crate::time::Timing; pub type NodeId = u32; /// World contains simulation state. pub struct World { nodes: Mutex>>, /// Random number generator. rng: Mutex, /// Internal event log. events: Mutex>, /// Separate task that processes all network messages. network_task: Arc, /// Runtime for running threads and moving time. runtime: Mutex, /// To get current time. 
timing: Arc, } impl World { pub fn new(seed: u64, options: Arc) -> World { let timing = Arc::new(Timing::new()); let mut runtime = Runtime::new(timing.clone()); let (tx, rx) = mpsc::channel(); runtime.spawn(move || { // create and start network background thread, and send it back via the channel NetworkTask::start_new(options, tx) }); // wait for the network task to start while runtime.step() {} let network_task = rx.recv().unwrap(); World { nodes: Mutex::new(Vec::new()), rng: Mutex::new(StdRng::seed_from_u64(seed)), events: Mutex::new(Vec::new()), network_task, runtime: Mutex::new(runtime), timing, } } pub fn step(&self) -> bool { self.runtime.lock().step() } pub fn get_thread_step_count(&self) -> u64 { self.runtime.lock().step_counter } /// Create a new random number generator. pub fn new_rng(&self) -> StdRng { let mut rng = self.rng.lock(); StdRng::from_rng(rng.deref_mut()) } /// Create a new node. pub fn new_node(self: &Arc) -> Arc { let mut nodes = self.nodes.lock(); let id = nodes.len() as NodeId; let node = Arc::new(Node::new(id, self.clone(), self.new_rng())); nodes.push(node.clone()); node } /// Get an internal node state by id. fn get_node(&self, id: NodeId) -> Option> { let nodes = self.nodes.lock(); let num = id as usize; if num < nodes.len() { Some(nodes[num].clone()) } else { None } } pub fn stop_all(&self) { self.runtime.lock().crash_all_threads(); } /// Returns a writable end of a TCP connection, to send src->dst messages. pub fn open_tcp(self: &Arc, dst: NodeId) -> TCP { // TODO: replace unwrap() with /dev/null socket. let dst = self.get_node(dst).unwrap(); let dst_accept = dst.node_events.lock().clone(); let rng = self.new_rng(); self.network_task.start_new_connection(rng, dst_accept) } /// Get current time. pub fn now(&self) -> u64 { self.timing.now() } /// Get a copy of the internal clock. 
pub fn clock(&self) -> Arc<Timing> {
        self.timing.clone()
    }

    /// Append an event for `node`, stamped with the current time, to the event log.
    pub fn add_event(&self, node: NodeId, data: String) {
        let time = self.now();
        self.events.lock().push(SimEvent { time, node, data });
    }

    /// Return all logged events, leaving the log empty.
    pub fn take_events(&self) -> Vec<SimEvent> {
        std::mem::take(&mut *self.events.lock())
    }

    /// Stop all threads, then drop pending timers and all nodes.
    pub fn deallocate(&self) {
        self.stop_all();
        self.timing.clear();
        self.nodes.lock().clear();
    }
}

/// Internal node state.
pub struct Node {
    pub id: NodeId,
    node_events: Mutex<Chan<NodeEvent>>,
    world: Arc<World>,
    pub(crate) rng: Mutex<StdRng>,
}

impl Node {
    pub fn new(id: NodeId, world: Arc<World>, rng: StdRng) -> Node {
        Node {
            id,
            node_events: Mutex::new(Chan::new()),
            world,
            rng: Mutex::new(rng),
        }
    }

    /// Spawn a new thread with this node context.
    pub fn launch(self: &Arc<Self>, f: impl FnOnce(NodeOs) + Send + 'static) -> ExternalHandle {
        let node = self.clone();
        let world = self.world.clone();
        self.world.runtime.lock().spawn(move || {
            f(NodeOs::new(world, node.clone()));
        })
    }

    /// Returns a channel to receive Accepts and internal messages.
    pub fn node_events(&self) -> Chan<NodeEvent> {
        self.node_events.lock().clone()
    }

    /// This will drop all in-flight Accept messages.
    pub fn replug_node_events(&self, chan: Chan<NodeEvent>) {
        *self.node_events.lock() = chan;
    }

    /// Append event to the world's log.
    pub fn log_event(&self, data: String) {
        self.world.add_event(self.id, data)
    }
}

================================================
FILE: libs/desim/tests/reliable_copy_test.rs
================================================
//! Simple test to verify that simulator is working.
#[cfg(test)]
mod reliable_copy_test {
    use std::sync::Arc;

    use anyhow::Result;
    use desim::executor::{self, PollSome};
    use desim::node_os::NodeOs;
    use desim::options::{Delay, NetworkOptions};
    use desim::proto::{AnyMessage, NetEvent, NodeEvent, ReplCell};
    use desim::world::{NodeId, World};
    use parking_lot::Mutex;
    use tracing::info;

    /// Disk storage trait and implementation.
pub trait Storage { fn flush_pos(&self) -> u32; fn flush(&mut self) -> Result<()>; fn write(&mut self, t: T); } #[derive(Clone)] pub struct SharedStorage { pub state: Arc>>, } impl SharedStorage { pub fn new() -> Self { Self { state: Arc::new(Mutex::new(InMemoryStorage::new())), } } } impl Storage for SharedStorage { fn flush_pos(&self) -> u32 { self.state.lock().flush_pos } fn flush(&mut self) -> Result<()> { executor::yield_me(0); self.state.lock().flush() } fn write(&mut self, t: T) { executor::yield_me(0); self.state.lock().write(t); } } pub struct InMemoryStorage { pub data: Vec, pub flush_pos: u32, } impl InMemoryStorage { pub fn new() -> Self { Self { data: Vec::new(), flush_pos: 0, } } pub fn flush(&mut self) -> Result<()> { self.flush_pos = self.data.len() as u32; Ok(()) } pub fn write(&mut self, t: T) { self.data.push(t); } } /// Server implementation. pub fn run_server(os: NodeOs, mut storage: Box>) { info!("started server"); let node_events = os.node_events(); let mut epoll_vec: Vec> = vec![Box::new(node_events.clone())]; let mut sockets = vec![]; loop { let index = executor::epoll_chans(&epoll_vec, -1).unwrap(); if index == 0 { let node_event = node_events.must_recv(); info!("got node event: {:?}", node_event); if let NodeEvent::Accept(tcp) = node_event { tcp.send(AnyMessage::Just32(storage.flush_pos())); epoll_vec.push(Box::new(tcp.recv_chan())); sockets.push(tcp); } continue; } let recv_chan = sockets[index - 1].recv_chan(); let socket = &sockets[index - 1]; let event = recv_chan.must_recv(); info!("got event: {:?}", event); if let NetEvent::Message(AnyMessage::ReplCell(cell)) = event { if cell.seqno != storage.flush_pos() { info!("got out of order data: {:?}", cell); continue; } storage.write(cell.value); storage.flush().unwrap(); socket.send(AnyMessage::Just32(storage.flush_pos())); } } } /// Client copies all data from array to the remote node. 
pub fn run_client(os: NodeOs, data: &[ReplCell], dst: NodeId) { info!("started client"); let mut delivered = 0; let mut sock = os.open_tcp(dst); let mut recv_chan = sock.recv_chan(); while delivered < data.len() { let num = &data[delivered]; info!("sending data: {:?}", num.clone()); sock.send(AnyMessage::ReplCell(num.clone())); // loop { let event = recv_chan.recv(); match event { NetEvent::Message(AnyMessage::Just32(flush_pos)) => { if flush_pos == 1 + delivered as u32 { delivered += 1; } } NetEvent::Closed => { info!("connection closed, reestablishing"); sock = os.open_tcp(dst); recv_chan = sock.recv_chan(); } _ => {} } // } } let sock = os.open_tcp(dst); for num in data { info!("sending data: {:?}", num.clone()); sock.send(AnyMessage::ReplCell(num.clone())); } info!("sent all data and finished client"); } /// Run test simulations. #[test] fn sim_example_reliable_copy() { utils::logging::init( utils::logging::LogFormat::Test, utils::logging::TracingErrorLayerEnablement::Disabled, utils::logging::Output::Stdout, ) .expect("logging init failed"); let delay = Delay { min: 1, max: 60, fail_prob: 0.4, }; let network = NetworkOptions { keepalive_timeout: Some(50), connect_delay: delay.clone(), send_delay: delay.clone(), }; for seed in 0..20 { let u32_data: [u32; 5] = [1, 2, 3, 4, 5]; let data = u32_to_cells(&u32_data, 1); let world = Arc::new(World::new(seed, Arc::new(network.clone()))); start_simulation(Options { world, time_limit: 1_000_000, client_fn: Box::new(move |os, server_id| run_client(os, &data, server_id)), u32_data, }); } } pub struct Options { pub world: Arc, pub time_limit: u64, pub u32_data: [u32; 5], pub client_fn: Box, } pub fn start_simulation(options: Options) { let world = options.world; let client_node = world.new_node(); let server_node = world.new_node(); let server_id = server_node.id; // start the client thread client_node.launch(move |os| { let client_fn = options.client_fn; client_fn(os, server_id); }); // start the server thread let 
shared_storage = SharedStorage::new(); let server_storage = shared_storage.clone(); server_node.launch(move |os| run_server(os, Box::new(server_storage))); while world.step() && world.now() < options.time_limit {} let disk_data = shared_storage.state.lock().data.clone(); assert!(verify_data(&disk_data, &options.u32_data[..])); } pub fn u32_to_cells(data: &[u32], client_id: u32) -> Vec { let mut res = Vec::new(); for (i, _) in data.iter().enumerate() { res.push(ReplCell { client_id, seqno: i as u32, value: data[i], }); } res } fn verify_data(disk_data: &[u32], data: &[u32]) -> bool { if disk_data.len() != data.len() { return false; } for i in 0..data.len() { if disk_data[i] != data[i] { return false; } } true } } ================================================ FILE: libs/http-utils/Cargo.toml ================================================ [package] name = "http-utils" version = "0.1.0" edition.workspace = true license.workspace = true [dependencies] anyhow.workspace = true arc-swap.workspace = true bytes.workspace = true camino.workspace = true fail.workspace = true futures.workspace = true hyper0.workspace = true itertools.workspace = true jemalloc_pprof.workspace = true jsonwebtoken.workspace = true once_cell.workspace = true pprof.workspace = true regex.workspace = true routerify.workspace = true rustls-pemfile.workspace = true rustls.workspace = true serde_json.workspace = true serde_path_to_error.workspace = true serde.workspace = true thiserror.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true tokio.workspace = true tracing.workspace = true url.workspace = true uuid.workspace = true x509-cert.workspace = true # to use tokio channels as streams, this is faster to compile than async_stream # why is it only here? no other crate should use it, streams are rarely needed. 
tokio-stream = { version = "0.1.14" } metrics.workspace = true utils.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } ================================================ FILE: libs/http-utils/src/endpoint.rs ================================================ use std::future::Future; use std::io::Write as _; use std::str::FromStr; use std::time::Duration; use anyhow::{Context, anyhow}; use bytes::{Bytes, BytesMut}; use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName}; use hyper::http::HeaderValue; use hyper::{Body, Method, Request, Response}; use jsonwebtoken::TokenData; use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter}; use once_cell::sync::Lazy; use pprof::ProfilerGuardBuilder; use pprof::protos::Message as _; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; use tokio::sync::{Mutex, Notify, mpsc}; use tokio_stream::wrappers::ReceiverStream; use tokio_util::io::ReaderStream; use tracing::{Instrument, debug, info, info_span, warn}; use utils::auth::{AuthError, Claims, SwappableJwtAuth}; use utils::metrics_collector::{METRICS_COLLECTOR, METRICS_STALE_MILLIS}; use crate::error::{ApiError, api_error_handler, route_error_handler}; use crate::request::{get_query_param, parse_query_param}; static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { register_int_counter!( "libmetrics_metric_handler_requests_total", "Number of metric requests made" ) .expect("failed to define a metric") }); static X_REQUEST_ID_HEADER_STR: &str = "x-request-id"; static X_REQUEST_ID_HEADER: HeaderName = HeaderName::from_static(X_REQUEST_ID_HEADER_STR); #[derive(Debug, Default, Clone)] struct RequestId(String); /// Adds a tracing info_span! instrumentation around the handler events, /// logs the request start and end events for non-GET requests and non-200 responses. 
/// /// Usage: Replace `my_handler` with `|r| request_span(r, my_handler)` /// /// Use this to distinguish between logs of different HTTP requests: every request handler wrapped /// with this will get request info logged in the wrapping span, including the unique request ID. /// /// This also handles errors, logging them and converting them to an HTTP error response. /// /// NB: If the client disconnects, Hyper will drop the Future, without polling it to /// completion. In other words, the handler must be async cancellation safe! request_span /// prints a warning to the log when that happens, so that you have some trace of it in /// the log. /// /// /// There could be other ways to implement similar functionality: /// /// * procmacros placed on top of all handler methods /// With all the drawbacks of procmacros, brings no difference implementation-wise, /// and little code reduction compared to the existing approach. /// /// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic, /// implemented for [`RouterBuilder`]. /// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later. /// /// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped /// later, in a post-response middleware. /// Due to suspendable nature of the futures, would give contradictory results which is exactly the opposite of what `tracing-futures` /// tries to achieve with its `.instrument` used in the current approach. /// /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
pub async fn request_span(request: Request, handler: H) -> R::Output where R: Future, ApiError>> + Send + 'static, H: FnOnce(Request) -> R + Send + Sync + 'static, { let request_id = request.context::().unwrap_or_default().0; let method = request.method(); let path = request.uri().path(); let request_span = info_span!("request", %method, %path, %request_id); let log_quietly = method == Method::GET; async move { let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding(); if log_quietly { debug!("Handling request"); } else { info!("Handling request"); } // No special handling for panics here. There's a `tracing_panic_hook` from another // module to do that globally. let res = handler(request).await; cancellation_guard.disarm(); // Log the result if needed. // // We also convert any errors into an Ok response with HTTP error code here. // `make_router` sets a last-resort error handler that would do the same, but // we prefer to do it here, before we exit the request span, so that the error // is still logged with the span. // // (Because we convert errors to Ok response, we never actually return an error, // and we could declare the function to return the never type (`!`). However, // using `routerify::RouterBuilder` requires a proper error type.) match res { Ok(response) => { let response_status = response.status(); if log_quietly && response_status.is_success() { debug!("Request handled, status: {response_status}"); } else { info!("Request handled, status: {response_status}"); } Ok(response) } Err(err) => Ok(api_error_handler(err)), } } .instrument(request_span) .await } /// Drop guard to WARN in case the request was dropped before completion. struct RequestCancelled { warn: Option, } impl RequestCancelled { /// Create the drop guard using the [`tracing::Span::current`] as the span. 
fn warn_when_dropped_without_responding() -> Self { RequestCancelled { warn: Some(tracing::Span::current()), } } /// Consume the drop guard without logging anything. fn disarm(mut self) { self.warn = None; } } impl Drop for RequestCancelled { fn drop(&mut self) { if std::thread::panicking() { // we are unwinding due to panicking, assume we are not dropped for cancellation } else if let Some(span) = self.warn.take() { // the span has all of the info already, but the outer `.instrument(span)` has already // been dropped, so we need to manually re-enter it for this message. // // this is what the instrument would do before polling so it is fine. let _g = span.entered(); warn!("request was dropped before completing"); } } } /// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks. pub struct ChannelWriter { buffer: BytesMut, pub tx: mpsc::Sender>, written: usize, /// Time spent waiting for the channel to make progress. It is not the same as time to upload a /// buffer because we cannot know anything about that, but this should allow us to understand /// the actual time taken without the time spent `std::thread::park`ed. wait_time: std::time::Duration, } impl ChannelWriter { pub fn new(buf_len: usize, tx: mpsc::Sender>) -> Self { assert_ne!(buf_len, 0); ChannelWriter { // split about half off the buffer from the start, because we flush depending on // capacity. first flush will come sooner than without this, but now resizes will // have better chance of picking up the "other" half. not guaranteed of course. 
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2), tx, written: 0, wait_time: std::time::Duration::ZERO, } } pub fn flush0(&mut self) -> std::io::Result { let n = self.buffer.len(); if n == 0 { return Ok(0); } tracing::trace!(n, "flushing"); let ready = self.buffer.split().freeze(); let wait_started_at = std::time::Instant::now(); // not ideal to call from blocking code to block_on, but we are sure that this // operation does not spawn_blocking other tasks let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async { self.tx.send(Ok(ready)).await.map_err(|_| ())?; // throttle sending to allow reuse of our buffer in `write`. self.tx.reserve().await.map_err(|_| ())?; // now the response task has picked up the buffer and hopefully started // sending it to the client. Ok(()) }); self.wait_time += wait_started_at.elapsed(); if res.is_err() { return Err(std::io::ErrorKind::BrokenPipe.into()); } self.written += n; Ok(n) } pub fn flushed_bytes(&self) -> usize { self.written } pub fn wait_time(&self) -> std::time::Duration { self.wait_time } } impl std::io::Write for ChannelWriter { fn write(&mut self, mut buf: &[u8]) -> std::io::Result { let remaining = self.buffer.capacity() - self.buffer.len(); let out_of_space = remaining < buf.len(); let original_len = buf.len(); if out_of_space { let can_still_fit = buf.len() - remaining; self.buffer.extend_from_slice(&buf[..can_still_fit]); buf = &buf[can_still_fit..]; self.flush0()?; } // assume that this will often under normal operation just move the pointer back to the // beginning of allocation, because previous split off parts are already sent and // dropped. 
self.buffer.extend_from_slice(buf); Ok(original_len) } fn flush(&mut self) -> std::io::Result<()> { self.flush0().map(|_| ()) } } pub async fn prometheus_metrics_handler( req: Request, force_metric_collection_on_scrape: bool, ) -> Result, ApiError> { SERVE_METRICS_COUNT.inc(); // HADRON let requested_use_latest = parse_query_param(&req, "use_latest")?; let use_latest = match requested_use_latest { None => force_metric_collection_on_scrape, Some(true) => true, Some(false) => { if force_metric_collection_on_scrape { // We don't cache in this case true } else { false } } }; let started_at = std::time::Instant::now(); let (tx, rx) = mpsc::channel(1); let body = Body::wrap_stream(ReceiverStream::new(rx)); let mut writer = ChannelWriter::new(128 * 1024, tx); let encoder = TextEncoder::new(); let response = Response::builder() .status(200) .header(CONTENT_TYPE, encoder.format_type()) .body(body) .unwrap(); let span = info_span!("blocking"); tokio::task::spawn_blocking(move || { // there are situations where we lose scraped metrics under load, try to gather some clues // since all nodes are queried this, keep the message count low. let spawned_at = std::time::Instant::now(); let _span = span.entered(); // HADRON let collected = if use_latest { // Skip caching the results if we always force metric collection on scrape. METRICS_COLLECTOR.run_once(!force_metric_collection_on_scrape) } else { METRICS_COLLECTOR.last_collected() }; let gathered_at = std::time::Instant::now(); let res = encoder .encode(&collected.metrics, &mut writer) .and_then(|_| writer.flush().map_err(|e| e.into())); // this instant is not when we finally got the full response sent, sending is done by hyper // in another task. 
let encoded_at = std::time::Instant::now(); let spawned_in = spawned_at - started_at; let collected_in = gathered_at - spawned_at; // remove the wait time here in case the tcp connection was clogged let encoded_in = encoded_at - gathered_at - writer.wait_time(); let total = encoded_at - started_at; // HADRON let staleness_ms = (encoded_at - collected.collected_at).as_millis(); METRICS_STALE_MILLIS.set(staleness_ms as i64); match res { Ok(()) => { tracing::info!( bytes = writer.flushed_bytes(), total_ms = total.as_millis(), spawning_ms = spawned_in.as_millis(), collection_ms = collected_in.as_millis(), encoding_ms = encoded_in.as_millis(), stalenss_ms = staleness_ms, "responded /metrics" ); } Err(e) => { // there is a chance that this error is not the BrokenPipe we generate in the writer // for "closed connection", but it is highly unlikely. tracing::warn!( after_bytes = writer.flushed_bytes(), total_ms = total.as_millis(), spawning_ms = spawned_in.as_millis(), collection_ms = collected_in.as_millis(), encoding_ms = encoded_in.as_millis(), "failed to write out /metrics response: {e:?}" ); // semantics of this error are quite... unclear. we want to error the stream out to // abort the response to somehow notify the client that we failed. // // though, most likely the reason for failure is that the receiver is already gone. drop( writer .tx .blocking_send(Err(std::io::ErrorKind::BrokenPipe.into())), ); } } }); Ok(response) } /// Generates CPU profiles. pub async fn profile_cpu_handler(req: Request) -> Result, ApiError> { enum Format { Pprof, Svg, } // Parameters. let format = match get_query_param(&req, "format")?.as_deref() { None => Format::Pprof, Some("pprof") => Format::Pprof, Some("svg") => Format::Svg, Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), }; let seconds = match parse_query_param(&req, "seconds")? 
{ None => 5, Some(seconds @ 1..=60) => seconds, Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-60 secs"))), }; let frequency_hz = match parse_query_param(&req, "frequency")? { None => 99, Some(1001..) => return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))), Some(frequency) => frequency, }; let force: bool = parse_query_param(&req, "force")?.unwrap_or_default(); // Take the profile. static PROFILE_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); static PROFILE_CANCEL: Lazy = Lazy::new(Notify::new); let report = { // Only allow one profiler at a time. If force is true, cancel a running profile (e.g. a // Grafana continuous profile). We use a try_lock() loop when cancelling instead of waiting // for a lock(), to avoid races where the notify isn't currently awaited. let _lock = loop { match PROFILE_LOCK.try_lock() { Ok(lock) => break lock, Err(_) if force => PROFILE_CANCEL.notify_waiters(), Err(_) => { return Err(ApiError::Conflict( "profiler already running (use ?force=true to cancel it)".into(), )); } } tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait }; let guard = ProfilerGuardBuilder::default() .frequency(frequency_hz) .blocklist(&["libc", "libgcc", "pthread", "vdso"]) .build() .map_err(|err| ApiError::InternalServerError(err.into()))?; tokio::select! { _ = tokio::time::sleep(Duration::from_secs(seconds)) => {}, _ = PROFILE_CANCEL.notified() => {}, }; guard .report() .build() .map_err(|err| ApiError::InternalServerError(err.into()))? }; // Return the report in the requested format. match format { Format::Pprof => { let body = report .pprof() .map_err(|err| ApiError::InternalServerError(err.into()))? 
.encode_to_vec(); Response::builder() .status(200) .header(CONTENT_TYPE, "application/octet-stream") .header(CONTENT_DISPOSITION, "attachment; filename=\"profile.pb\"") .body(Body::from(body)) .map_err(|err| ApiError::InternalServerError(err.into())) } Format::Svg => { let mut body = Vec::new(); report .flamegraph(&mut body) .map_err(|err| ApiError::InternalServerError(err.into()))?; Response::builder() .status(200) .header(CONTENT_TYPE, "image/svg+xml") .body(Body::from(body)) .map_err(|err| ApiError::InternalServerError(err.into())) } } } /// Generates heap profiles. /// /// This only works with jemalloc on Linux. pub async fn profile_heap_handler(req: Request) -> Result, ApiError> { enum Format { Jemalloc, Pprof, Svg, } // Parameters. let format = match get_query_param(&req, "format")?.as_deref() { None => Format::Pprof, Some("jemalloc") => Format::Jemalloc, Some("pprof") => Format::Pprof, Some("svg") => Format::Svg, Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), }; // Obtain profiler handle. let mut prof_ctl = jemalloc_pprof::PROF_CTL .as_ref() .ok_or(ApiError::InternalServerError(anyhow!( "heap profiling not enabled" )))? .lock() .await; if !prof_ctl.activated() { return Err(ApiError::InternalServerError(anyhow!( "heap profiling not enabled" ))); } // Take and return the profile. match format { Format::Jemalloc => { // NB: file is an open handle to a tempfile that's already deleted. let file = tokio::task::spawn_blocking(move || prof_ctl.dump()) .await .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? 
.map_err(ApiError::InternalServerError)?;
            let stream = ReaderStream::new(tokio::fs::File::from_std(file));
            Response::builder()
                .status(200)
                .header(CONTENT_TYPE, "application/octet-stream")
                .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.dump\"")
                .body(Body::wrap_stream(stream))
                .map_err(|err| ApiError::InternalServerError(err.into()))
        }

        Format::Pprof => {
            let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof())
                .await
                .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
                .map_err(ApiError::InternalServerError)?;
            Response::builder()
                .status(200)
                .header(CONTENT_TYPE, "application/octet-stream")
                .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb.gz\"")
                .body(Body::from(data))
                .map_err(|err| ApiError::InternalServerError(err.into()))
        }

        Format::Svg => {
            let svg = tokio::task::spawn_blocking(move || prof_ctl.dump_flamegraph())
                .await
                .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
                .map_err(ApiError::InternalServerError)?;
            Response::builder()
                .status(200)
                .header(CONTENT_TYPE, "image/svg+xml")
                .body(Body::from(svg))
                .map_err(|err| ApiError::InternalServerError(err.into()))
        }
    }
}

/// Pre-request middleware that stores a [`RequestId`] in the request context.
///
/// The id is taken from the incoming `x-request-id` header when that header is present
/// and readable as a string; otherwise a fresh UUID v4 is generated.
pub fn add_request_id_middleware() -> Middleware {
    Middleware::pre(move |req| async move {
        // The header is client-controlled and `HeaderValue::to_str` fails on bytes outside
        // visible ASCII. Treat an unreadable value like a missing one instead of panicking
        // inside the request path.
        let request_id = req
            .headers()
            .get(&X_REQUEST_ID_HEADER)
            .and_then(|value| value.to_str().ok())
            .map(str::to_owned)
            .unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
        req.set_context(RequestId(request_id));

        Ok(req)
    })
}

/// Post-response hook: copies the request id stored in the request context into the
/// response's `x-request-id` header. Nothing is added when no id is stored or when the
/// id cannot be turned into a header value.
async fn add_request_id_header_to_response(
    mut res: Response,
    req_info: RequestInfo,
) -> Result, ApiError> {
    if let Some(request_id) = req_info.context::()
        && let Ok(request_header_value) = HeaderValue::from_str(&request_id.0)
    {
        res.headers_mut()
            .insert(&X_REQUEST_ID_HEADER, request_header_value);
    };

    Ok(res)
}

pub fn make_router() -> RouterBuilder {
    Router::builder()
        .middleware(add_request_id_middleware())
        .middleware(Middleware::post_with_info(
add_request_id_header_to_response, )) .err_handler(route_error_handler) } pub fn attach_openapi_ui( router_builder: RouterBuilder, spec: &'static [u8], spec_mount_path: &'static str, ui_mount_path: &'static str, ) -> RouterBuilder { router_builder .get(spec_mount_path, move |r| request_span(r, move |_| async move { Ok(Response::builder().body(Body::from(spec)).unwrap()) }) ) .get(ui_mount_path, move |r| request_span(r, move |_| async move { Ok(Response::builder().body(Body::from(format!(r#" rweb
"#))).unwrap()) }) ) } fn parse_token(header_value: &str) -> Result<&str, ApiError> { // header must be in form Bearer let (prefix, token) = header_value .split_once(' ') .ok_or_else(|| ApiError::Unauthorized("malformed authorization header".to_string()))?; if prefix != "Bearer" { return Err(ApiError::Unauthorized( "malformed authorization header".to_string(), )); } Ok(token) } pub fn auth_middleware( provide_auth: fn(&Request) -> Option<&SwappableJwtAuth>, ) -> Middleware { Middleware::pre(move |req| async move { if let Some(auth) = provide_auth(&req) { match req.headers().get(AUTHORIZATION) { Some(value) => { let header_value = value.to_str().map_err(|_| { ApiError::Unauthorized("malformed authorization header".to_string()) })?; let token = parse_token(header_value)?; let data: TokenData = auth.decode(token).map_err(|err| { warn!("Authentication error: {err}"); // Rely on From for ApiError impl err })?; req.set_context(data.claims); } None => { return Err(ApiError::Unauthorized( "missing authorization header".to_string(), )); } } } Ok(req) }) } pub fn add_response_header_middleware( header: &str, value: &str, ) -> anyhow::Result> where B: hyper::body::HttpBody + Send + Sync + 'static, { let name = HeaderName::from_str(header).with_context(|| format!("invalid header name: {header}"))?; let value = HeaderValue::from_str(value).with_context(|| format!("invalid header value: {value}"))?; Ok(Middleware::post_with_info( move |mut response, request_info| { let name = name.clone(); let value = value.clone(); async move { let headers = response.headers_mut(); if headers.contains_key(&name) { warn!( "{} response already contains header {:?}", request_info.uri(), &name, ); } else { headers.insert(name, value); } Ok(response) } }, )) } pub fn check_permission_with( req: &Request, check_permission: impl Fn(&Claims) -> Result<(), AuthError>, ) -> Result<(), ApiError> { match req.context::() { Some(claims) => Ok(check_permission(&claims) .map_err(|_err| ApiError::Forbidden("JWT 
authentication error".to_string()))?), None => Ok(()), // claims is None because auth is disabled } } #[cfg(test)] mod tests { use std::future::poll_fn; use std::net::{IpAddr, SocketAddr}; use hyper::service::Service; use routerify::RequestServiceBuilder; use super::*; #[tokio::test] async fn test_request_id_returned() { let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap(); let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80); let mut service = builder.build(remote_addr); if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await { panic!("request service is not ready: {e:?}"); } let mut req: Request = Request::default(); req.headers_mut() .append(&X_REQUEST_ID_HEADER, HeaderValue::from_str("42").unwrap()); let resp: Response = service.call(req).await.unwrap(); let header_val = resp.headers().get(&X_REQUEST_ID_HEADER).unwrap(); assert!(header_val == "42", "response header mismatch"); } #[tokio::test] async fn test_request_id_empty() { let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap(); let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80); let mut service = builder.build(remote_addr); if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await { panic!("request service is not ready: {e:?}"); } let req: Request = Request::default(); let resp: Response = service.call(req).await.unwrap(); let header_val = resp.headers().get(&X_REQUEST_ID_HEADER); assert_ne!(header_val, None, "response header should NOT be empty"); } } ================================================ FILE: libs/http-utils/src/error.rs ================================================ use std::borrow::Cow; use std::error::Error as StdError; use hyper::{Body, Response, StatusCode, header}; use serde::{Deserialize, Serialize}; use thiserror::Error; use tracing::{error, info, warn}; use utils::auth::AuthError; #[derive(Debug, Error)] pub enum ApiError { #[error("Bad request: {0:#?}")] 
BadRequest(anyhow::Error), #[error("Forbidden: {0}")] Forbidden(String), #[error("Unauthorized: {0}")] Unauthorized(String), #[error("NotFound: {0}")] NotFound(Box), #[error("Conflict: {0}")] Conflict(String), #[error("Precondition failed: {0}")] PreconditionFailed(Box), #[error("Resource temporarily unavailable: {0}")] ResourceUnavailable(Cow<'static, str>), #[error("Too many requests: {0}")] TooManyRequests(Cow<'static, str>), #[error("Shutting down")] ShuttingDown, #[error("Timeout")] Timeout(Cow<'static, str>), #[error("Request cancelled")] Cancelled, #[error(transparent)] InternalServerError(anyhow::Error), } impl ApiError { pub fn into_response(self) -> Response { match self { ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( format!("{err:#?}"), // use debug printing so that we give the cause StatusCode::BAD_REQUEST, ), ApiError::Forbidden(_) => { HttpErrorBody::response_from_msg_and_status(self.to_string(), StatusCode::FORBIDDEN) } ApiError::Unauthorized(_) => HttpErrorBody::response_from_msg_and_status( self.to_string(), StatusCode::UNAUTHORIZED, ), ApiError::NotFound(_) => { HttpErrorBody::response_from_msg_and_status(self.to_string(), StatusCode::NOT_FOUND) } ApiError::Conflict(_) => { HttpErrorBody::response_from_msg_and_status(self.to_string(), StatusCode::CONFLICT) } ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status( self.to_string(), StatusCode::PRECONDITION_FAILED, ), ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status( "Shutting down".to_string(), StatusCode::SERVICE_UNAVAILABLE, ), ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::SERVICE_UNAVAILABLE, ), ApiError::TooManyRequests(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::TOO_MANY_REQUESTS, ), ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::REQUEST_TIMEOUT, ), 
ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status( self.to_string(), StatusCode::INTERNAL_SERVER_ERROR, ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( format!("{err:#}"), // use alternative formatting so that we give the cause without backtrace StatusCode::INTERNAL_SERVER_ERROR, ), } } } impl From for ApiError { fn from(_value: AuthError) -> Self { // Don't pass on the value of the AuthError as a precautionary measure. // Being intentionally vague in public error communication hurts debugability // but it is more secure. ApiError::Forbidden("JWT authentication error".to_string()) } } #[derive(Serialize, Deserialize)] pub struct HttpErrorBody { pub msg: String, } impl HttpErrorBody { pub fn from_msg(msg: String) -> Self { HttpErrorBody { msg } } pub fn response_from_msg_and_status(msg: String, status: StatusCode) -> Response { HttpErrorBody { msg }.to_response(status) } pub fn to_response(&self, status: StatusCode) -> Response { Response::builder() .status(status) .header(header::CONTENT_TYPE, "application/json") // we do not have nested maps with non string keys so serialization shouldn't fail .body(Body::from(serde_json::to_string(self).unwrap())) .unwrap() } } pub async fn route_error_handler(err: routerify::RouteError) -> Response { match err.downcast::() { Ok(api_error) => api_error_handler(*api_error), Err(other_error) => { // We expect all the request handlers to return an ApiError, so this should // not be reached. But just in case. 
error!("Error processing HTTP request: {other_error:?}"); HttpErrorBody::response_from_msg_and_status( other_error.to_string(), StatusCode::INTERNAL_SERVER_ERROR, ) } } } pub fn api_error_handler(api_error: ApiError) -> Response { // Print a stack trace for Internal Server errors match api_error { ApiError::Forbidden(_) | ApiError::Unauthorized(_) => { warn!("Error processing HTTP request: {api_error:#}") } ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"), ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"), ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"), ApiError::ShuttingDown => info!("Shut down while processing HTTP request"), ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"), ApiError::Cancelled => info!("Request cancelled while processing HTTP request"), _ => info!("Error processing HTTP request: {api_error:#}"), } api_error.into_response() } ================================================ FILE: libs/http-utils/src/failpoints.rs ================================================ use hyper::{Body, Request, Response, StatusCode}; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; use utils::failpoint_support::apply_failpoint; use crate::error::ApiError; use crate::json::{json_request, json_response}; pub type ConfigureFailpointsRequest = Vec; /// Information for configuring a single fail point #[derive(Debug, Serialize, Deserialize)] pub struct FailpointConfig { /// Name of the fail point pub name: String, /// List of actions to take, using the format described in `fail::cfg` /// /// We also support `actions = "exit"` to cause the fail point to immediately exit. pub actions: String, } /// Configure failpoints through http. 
///
/// The request body is parsed as a [`ConfigureFailpointsRequest`] and the entries are
/// applied one by one with `apply_failpoint`.
///
/// Returns `ApiError::BadRequest` when the binary has no failpoint support, when the body
/// cannot be parsed, or when an entry cannot be applied. On success the response is built
/// by `json_response(StatusCode::OK, ())`.
pub async fn failpoints_handler(
    mut request: Request,
    _cancel: CancellationToken,
) -> Result, ApiError> {
    if !fail::has_failpoints() {
        return Err(ApiError::BadRequest(anyhow::anyhow!(
            "Cannot manage failpoints because neon was compiled without failpoints support"
        )));
    }

    let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
    for fp in failpoints {
        tracing::info!("cfg failpoint: {} {}", fp.name, fp.actions);

        // We recognize one extra "action" that's not natively recognized
        // by the failpoints crate: exit, to immediately kill the process
        let cfg_result = apply_failpoint(&fp.name, &fp.actions);

        // The first failing entry aborts the request; entries applied before it are not
        // rolled back.
        if let Err(err_msg) = cfg_result {
            return Err(ApiError::BadRequest(anyhow::anyhow!(
                "Failed to configure failpoints: {err_msg}"
            )));
        }
    }

    json_response(StatusCode::OK, ())
}

================================================
FILE: libs/http-utils/src/json.rs
================================================
use anyhow::Context;
use bytes::Buf;
use hyper::{Body, Request, Response, StatusCode, header};
use serde::{Deserialize, Serialize};

use super::error::ApiError;

/// Parse a json request body and deserialize it to the type `T`.
///
/// The whole body is collected first. An empty body, a body that cannot be read and a
/// body that fails to deserialize are all reported as `ApiError::BadRequest`.
pub async fn json_request Deserialize<'de>>(
    request: &mut Request,
) -> Result {
    let body = hyper::body::aggregate(request.body_mut())
        .await
        .context("Failed to read request body")
        .map_err(ApiError::BadRequest)?;

    // Unlike `json_request_maybe`, a body is mandatory here.
    if body.remaining() == 0 {
        return Err(ApiError::BadRequest(anyhow::anyhow!(
            "missing request body"
        )));
    }

    let mut deser = serde_json::de::Deserializer::from_reader(body.reader());

    serde_path_to_error::deserialize(&mut deser)
        // intentionally stringify because the debug version is not helpful in python logs
        .map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}"))
        .map_err(ApiError::BadRequest)
}

/// Parse a json request body and deserialize it to the type `T`. If the body is empty, return `T::default`.
///
/// A body that cannot be read or fails to deserialize is reported as
/// `ApiError::BadRequest`.
pub async fn json_request_maybe Deserialize<'de> + Default>(
    request: &mut Request,
) -> Result {
    let body = hyper::body::aggregate(request.body_mut())
        .await
        .context("Failed to read request body")
        .map_err(ApiError::BadRequest)?;

    // An absent body is not an error here: fall back to the type's default value.
    if body.remaining() == 0 {
        return Ok(T::default());
    }

    let mut deser = serde_json::de::Deserializer::from_reader(body.reader());

    serde_path_to_error::deserialize(&mut deser)
        // intentionally stringify because the debug version is not helpful in python logs
        .map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}"))
        .map_err(ApiError::BadRequest)
}

/// Serializes `data` to JSON and wraps it in a response with the given `status` and an
/// `application/json` content type.
///
/// Serialization and response-building failures are returned as
/// `ApiError::InternalServerError`.
pub fn json_response(
    status: StatusCode,
    data: T,
) -> Result, ApiError> {
    let json = serde_json::to_string(&data)
        .context("Failed to serialize JSON response")
        .map_err(ApiError::InternalServerError)?;
    let response = Response::builder()
        .status(status)
        .header(header::CONTENT_TYPE, "application/json")
        .body(Body::from(json))
        .map_err(|e| ApiError::InternalServerError(e.into()))?;
    Ok(response)
}

================================================
FILE: libs/http-utils/src/lib.rs
================================================
pub mod endpoint;
pub mod error;
pub mod failpoints;
pub mod json;
pub mod request;
pub mod server;
pub mod tls_certs;

// The dependency named `hyper0` is made available under the name `hyper` in this crate.
extern crate hyper0 as hyper;

/// Current fast way to apply simple http routing in various Neon binaries.
/// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed.
pub use routerify::{RequestServiceBuilder, RouterBuilder, RouterService, ext::RequestExt}; ================================================ FILE: libs/http-utils/src/request.rs ================================================ use core::fmt; use std::borrow::Cow; use std::str::FromStr; use anyhow::anyhow; use hyper::body::HttpBody; use hyper::{Body, Request}; use routerify::ext::RequestExt; use super::error::ApiError; pub fn get_request_param<'a>( request: &'a Request, param_name: &str, ) -> Result<&'a str, ApiError> { match request.param(param_name) { Some(arg) => Ok(arg), None => Err(ApiError::BadRequest(anyhow!( "no {param_name} specified in path param", ))), } } pub fn parse_request_param( request: &Request, param_name: &str, ) -> Result { match get_request_param(request, param_name)?.parse() { Ok(v) => Ok(v), Err(_) => Err(ApiError::BadRequest(anyhow!( "failed to parse {param_name}", ))), } } pub fn get_query_param<'a>( request: &'a Request, param_name: &str, ) -> Result>, ApiError> { let query = match request.uri().query() { Some(q) => q, None => return Ok(None), }; let values = url::form_urlencoded::parse(query.as_bytes()) .filter_map(|(k, v)| if k == param_name { Some(v) } else { None }) // we call .next() twice below. If it's None the first time, .fuse() ensures it's None afterwards .fuse(); // Work around an issue with Alloy's pyroscope scrape where the "seconds" // parameter is added several times. https://github.com/grafana/alloy/issues/3026 // TODO: revert after Alloy is fixed. let value1 = values .map(Ok) .reduce(|acc, i| { match acc { Err(_) => acc, // It's okay to have duplicates as along as they have the same value. 
Ok(ref a) if a == &i.unwrap() => acc, _ => Err(ApiError::BadRequest(anyhow!( "param {param_name} specified more than once" ))), } }) .transpose()?; // if values.next().is_some() { // return Err(ApiError::BadRequest(anyhow!( // "param {param_name} specified more than once" // ))); // } Ok(value1) } pub fn must_get_query_param<'a>( request: &'a Request, param_name: &str, ) -> Result, ApiError> { get_query_param(request, param_name)?.ok_or_else(|| { ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters")) }) } pub fn parse_query_param>( request: &Request, param_name: &str, ) -> Result, ApiError> { get_query_param(request, param_name)? .map(|v| { v.parse().map_err(|e| { ApiError::BadRequest(anyhow!("cannot parse query param {param_name}: {e}")) }) }) .transpose() } pub fn must_parse_query_param>( request: &Request, param_name: &str, ) -> Result { parse_query_param(request, param_name)?.ok_or_else(|| { ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters")) }) } pub async fn ensure_no_body(request: &mut Request) -> Result<(), ApiError> { match request.body_mut().data().await { Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))), None => Ok(()), } } #[cfg(test)] mod tests { use super::*; #[test] fn test_get_query_param_duplicate() { let req = Request::builder() .uri("http://localhost:12345/testuri?testparam=1") .body(hyper::Body::empty()) .unwrap(); let value = get_query_param(&req, "testparam").unwrap(); assert_eq!(value.unwrap(), "1"); let req = Request::builder() .uri("http://localhost:12345/testuri?testparam=1&testparam=1") .body(hyper::Body::empty()) .unwrap(); let value = get_query_param(&req, "testparam").unwrap(); assert_eq!(value.unwrap(), "1"); let req = Request::builder() .uri("http://localhost:12345/testuri") .body(hyper::Body::empty()) .unwrap(); let value = get_query_param(&req, "testparam").unwrap(); assert!(value.is_none()); let req = Request::builder() 
.uri("http://localhost:12345/testuri?testparam=1&testparam=2&testparam=3") .body(hyper::Body::empty()) .unwrap(); let value = get_query_param(&req, "testparam"); assert!(value.is_err()); } } ================================================ FILE: libs/http-utils/src/server.rs ================================================ use std::{error::Error, sync::Arc}; use futures::StreamExt; use futures::stream::FuturesUnordered; use hyper0::Body; use hyper0::server::conn::Http; use metrics::{IntCounterVec, register_int_counter_vec}; use once_cell::sync::Lazy; use routerify::{RequestService, RequestServiceBuilder}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_rustls::TlsAcceptor; use tokio_util::sync::CancellationToken; use tracing::{error, info}; use crate::error::ApiError; /// A simple HTTP server over hyper library. /// You may want to use it instead of [`hyper0::server::Server`] because: /// 1. hyper0's Server was removed from hyper v1. /// It's recommended to replace hyepr0's Server with a manual loop, which is done here. /// 2. hyper0's Server doesn't support TLS out of the box, and there is no way /// to support it efficiently with the Accept trait that hyper0's Server uses. /// That's one of the reasons why it was removed from v1. 
/// pub struct Server { request_service: Arc>, listener: tokio::net::TcpListener, tls_acceptor: Option, } static CONNECTION_STARTED_COUNT: Lazy = Lazy::new(|| { register_int_counter_vec!( "http_server_connection_started_total", "Number of established http/https connections", &["scheme"] ) .expect("failed to define a metric") }); static CONNECTION_ERROR_COUNT: Lazy = Lazy::new(|| { register_int_counter_vec!( "http_server_connection_errors_total", "Number of occured connection errors by type", &["type"] ) .expect("failed to define a metric") }); impl Server { pub fn new( request_service: Arc>, listener: std::net::TcpListener, tls_acceptor: Option, ) -> anyhow::Result { // Note: caller of from_std is responsible for setting nonblocking mode. listener.set_nonblocking(true)?; let listener = tokio::net::TcpListener::from_std(listener)?; Ok(Self { request_service, listener, tls_acceptor, }) } pub async fn serve(self, cancel: CancellationToken) -> anyhow::Result<()> { fn suppress_io_error(err: &std::io::Error) -> bool { use std::io::ErrorKind::*; matches!(err.kind(), ConnectionReset | ConnectionAborted | BrokenPipe) } fn suppress_hyper_error(err: &hyper0::Error) -> bool { if err.is_incomplete_message() || err.is_closed() || err.is_timeout() { return true; } if let Some(inner) = err.source() && let Some(io) = inner.downcast_ref::() { return suppress_io_error(io); } false } let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]); let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]); let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]); let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]); let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]); let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]); let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]); let mut connections = FuturesUnordered::new(); loop { tokio::select! 
{ stream = self.listener.accept() => { let (tcp_stream, remote_addr) = match stream { Ok(stream) => stream, Err(err) => { tcp_error_cnt.inc(); if !suppress_io_error(&err) { info!("Failed to accept TCP connection: {err:#}"); } continue; } }; let service = self.request_service.build(remote_addr); let tls_acceptor = self.tls_acceptor.clone(); let cancel = cancel.clone(); let tls_error_cnt = tls_error_cnt.clone(); let http_error_cnt = http_error_cnt.clone(); let https_error_cnt = https_error_cnt.clone(); let http_connection_cnt = http_connection_cnt.clone(); let https_connection_cnt = https_connection_cnt.clone(); connections.push(tokio::spawn( async move { match tls_acceptor { Some(tls_acceptor) => { // Handle HTTPS connection. https_connection_cnt.inc(); let tls_stream = tokio::select! { tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream, _ = cancel.cancelled() => return, }; let tls_stream = match tls_stream { Ok(tls_stream) => tls_stream, Err(err) => { tls_error_cnt.inc(); if !suppress_io_error(&err) { info!(%remote_addr, "Failed to accept TLS connection: {err:#}"); } return; } }; if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await { https_error_cnt.inc(); if !suppress_hyper_error(&err) { info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}"); } } } None => { // Handle HTTP connection. http_connection_cnt.inc(); if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await { http_error_cnt.inc(); if !suppress_hyper_error(&err) { info!(%remote_addr, "Failed to serve HTTP connection: {err:#}"); } } } }; })); } Some(conn) = connections.next() => { if let Err(err) = conn { panic_error_cnt.inc(); error!("Connection panicked: {err:#}"); } } _ = cancel.cancelled() => { // Wait for graceful shutdown of all connections. 
while let Some(conn) = connections.next().await { if let Err(err) = conn { panic_error_cnt.inc(); error!("Connection panicked: {err:#}"); } } break; } } } Ok(()) } /// Serves HTTP connection with graceful shutdown. async fn serve_connection( io: I, service: RequestService, cancel: CancellationToken, ) -> Result<(), hyper0::Error> where I: AsyncRead + AsyncWrite + Unpin + Send + 'static, { let mut conn = Http::new().serve_connection(io, service).with_upgrades(); tokio::select! { res = &mut conn => res, _ = cancel.cancelled() => { Pin::new(&mut conn).graceful_shutdown(); // Note: connection should still be awaited for graceful shutdown to complete. conn.await } } } } ================================================ FILE: libs/http-utils/src/tls_certs.rs ================================================ use std::{sync::Arc, time::Duration}; use anyhow::Context; use arc_swap::ArcSwap; use camino::Utf8Path; use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec}; use once_cell::sync::Lazy; use rustls::{ pki_types::{CertificateDer, PrivateKeyDer, UnixTime}, server::{ClientHello, ResolvesServerCert}, sign::CertifiedKey, }; use x509_cert::der::Reader; pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result>> { let cert_data = tokio::fs::read(filename) .await .context(format!("failed reading certificate file {filename:?}"))?; let mut reader = std::io::Cursor::new(&cert_data); let cert_chain = rustls_pemfile::certs(&mut reader) .collect::, _>>() .context(format!("failed parsing certificate from file {filename:?}"))?; Ok(cert_chain) } pub async fn load_private_key(filename: &Utf8Path) -> anyhow::Result> { let key_data = tokio::fs::read(filename) .await .context(format!("failed reading private key file {filename:?}"))?; let mut reader = std::io::Cursor::new(&key_data); let key = rustls_pemfile::private_key(&mut reader) .context(format!("failed parsing private key from file {filename:?}"))?; key.ok_or(anyhow::anyhow!( "no private 
key found in {}", filename.as_str(), )) } pub async fn load_certified_key( key_filename: &Utf8Path, cert_filename: &Utf8Path, ) -> anyhow::Result { let cert_chain = load_cert_chain(cert_filename).await?; let key = load_private_key(key_filename).await?; let key = rustls::crypto::ring::default_provider() .key_provider .load_private_key(key)?; let certified_key = CertifiedKey::new(cert_chain, key); certified_key.keys_match()?; Ok(certified_key) } /// rustls's CertifiedKey with extra parsed fields used for metrics. struct ParsedCertifiedKey { certified_key: CertifiedKey, expiration_time: UnixTime, } /// Parse expiration time from an X509 certificate. fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result { let parsed_cert = x509_cert::der::SliceReader::new(cert) .context("Failed to parse cerficiate")? .decode::() .context("Failed to parse cerficiate")?; Ok(UnixTime::since_unix_epoch( parsed_cert .tbs_certificate .validity .not_after .to_unix_duration(), )) } async fn load_and_parse_certified_key( key_filename: &Utf8Path, cert_filename: &Utf8Path, ) -> anyhow::Result { let certified_key = load_certified_key(key_filename, cert_filename).await?; let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?; Ok(ParsedCertifiedKey { certified_key, expiration_time, }) } static CERT_EXPIRATION_TIME: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "tls_certs_expiration_time_seconds", "Expiration time of the loaded certificate since unix epoch in seconds", &["resolver_name"] ) .expect("failed to define a metric") }); static CERT_RELOAD_STARTED_COUNTER: Lazy = Lazy::new(|| { register_int_counter_vec!( "tls_certs_reload_started_total", "Number of certificate reload loop iterations started", &["resolver_name"] ) .expect("failed to define a metric") }); static CERT_RELOAD_UPDATED_COUNTER: Lazy = Lazy::new(|| { register_int_counter_vec!( "tls_certs_reload_updated_total", "Number of times the certificate was updated to the new one", &["resolver_name"] 
) .expect("failed to define a metric") }); static CERT_RELOAD_FAILED_COUNTER: Lazy = Lazy::new(|| { register_int_counter_vec!( "tls_certs_reload_failed_total", "Number of times the certificate reload failed", &["resolver_name"] ) .expect("failed to define a metric") }); /// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from /// the disk periodically. #[derive(Debug)] pub struct ReloadingCertificateResolver { certified_key: ArcSwap, } impl ReloadingCertificateResolver { /// Creates a new Resolver by loading certificate and private key from FS and /// creating tokio::task to reload them with provided reload_period. /// resolver_name is used as metric's label. pub async fn new( resolver_name: &str, key_filename: &Utf8Path, cert_filename: &Utf8Path, reload_period: Duration, ) -> anyhow::Result> { // Create metrics for current resolver. let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]); let cert_reload_started_counter = CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]); let cert_reload_updated_counter = CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]); let cert_reload_failed_counter = CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]); let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?; let this = Arc::new(Self { certified_key: ArcSwap::from_pointee(parsed_key.certified_key), }); cert_expiration_time.set(parsed_key.expiration_time.as_secs()); tokio::spawn({ let weak_this = Arc::downgrade(&this); let key_filename = key_filename.to_owned(); let cert_filename = cert_filename.to_owned(); async move { let start = tokio::time::Instant::now() + reload_period; let mut interval = tokio::time::interval_at(start, reload_period); let mut last_reload_failed = false; loop { interval.tick().await; let this = match weak_this.upgrade() { Some(this) => this, None => break, // Resolver has been destroyed, exit. 
}; cert_reload_started_counter.inc(); match load_and_parse_certified_key(&key_filename, &cert_filename).await { Ok(parsed_key) => { if parsed_key.certified_key.cert == this.certified_key.load().cert { tracing::debug!("Certificate has not changed since last reloading"); } else { tracing::info!("Certificate has been reloaded"); this.certified_key.store(Arc::new(parsed_key.certified_key)); cert_expiration_time.set(parsed_key.expiration_time.as_secs()); cert_reload_updated_counter.inc(); } last_reload_failed = false; } Err(err) => { cert_reload_failed_counter.inc(); // Note: Reloading certs may fail if it conflicts with the script updating // the files at the same time. Warn only if the error is persistent. if last_reload_failed { tracing::warn!("Error reloading certificate: {err:#}"); } else { tracing::info!("Error reloading certificate: {err:#}"); } last_reload_failed = true; } } } } }); Ok(this) } } impl ResolvesServerCert for ReloadingCertificateResolver { fn resolve(&self, _client_hello: ClientHello<'_>) -> Option> { Some(self.certified_key.load_full()) } } ================================================ FILE: libs/metrics/Cargo.toml ================================================ [package] name = "metrics" version = "0.1.0" edition.workspace = true license.workspace = true [dependencies] prometheus.workspace = true libc.workspace = true once_cell.workspace = true chrono.workspace = true twox-hash.workspace = true measured.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true measured-process.workspace = true [dev-dependencies] rand.workspace = true rand_distr = "0.5" ================================================ FILE: libs/metrics/src/hll.rs ================================================ //! HyperLogLog is an algorithm for the count-distinct problem, //! approximating the number of distinct elements in a multiset. //! Calculating the exact cardinality of the distinct elements //! 
of a multiset requires an amount of memory proportional to
//! the cardinality, which is impractical for very large data sets.
//! Probabilistic cardinality estimators, such as the HyperLogLog algorithm,
//! use significantly less memory than this, but can only approximate the cardinality.

use std::hash::{BuildHasher, BuildHasherDefault, Hash};
use std::sync::atomic::AtomicU8;

use measured::LabelGroup;
use measured::label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor};
use measured::metric::counter::CounterState;
use measured::metric::name::MetricNameEncoder;
use measured::metric::{Metric, MetricType, MetricVec};
use measured::text::TextEncoder;
use twox_hash::xxh3;

/// Create an [`HyperLogLogVec`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_hll_vec {
    ($N:literal, $OPTS:expr, $LABELS_NAMES:expr $(,)?) => {{
        let hll_vec = $crate::HyperLogLogVec::<$N>::new($OPTS, $LABELS_NAMES).unwrap();
        $crate::register(Box::new(hll_vec.clone())).map(|_| hll_vec)
    }};

    ($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{
        $crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
    }};
}

/// Create an [`HyperLogLog`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_hll {
    ($N:literal, $OPTS:expr $(,)?) => {{
        let hll = $crate::HyperLogLog::<$N>::with_opts($OPTS).unwrap();
        $crate::register(Box::new(hll.clone())).map(|_| hll)
    }};

    ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
        $crate::register_hll!($N, $crate::opts!($NAME, $HELP))
    }};
}

/// HLL is a probabilistic cardinality measure.
///
/// How to use this time-series for a metric name `my_metrics_total_hll`:
///
/// ```promql
/// # harmonic mean
/// 1 / (
///     sum (
///         2 ^ -(
///             # HLL merge operation
///             max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
///         )
///     ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// If you want an estimate over time, you can use the following query:
///
/// ```promql
/// # harmonic mean
/// 1 / (
///     sum (
///         2 ^ -(
///             # HLL merge operation
///             max (
///                 max_over_time(my_metrics_total_hll{}[$__rate_interval])
///             ) by (hll_shard, other_labels...)
///         )
///     ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// In the case of low cardinality, you might want to use the linear counting approximation:
///
/// ```promql
/// # LinearCounting(m, V) = m log (m / V)
/// shards_count * ln(shards_count /
///     # calculate V = how many shards contain a 0
///     count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
/// )
/// ```
///
/// See for estimates on alpha
pub type HyperLogLogVec = MetricVec, L>;
pub type HyperLogLog = Metric>;

/// The `N` per-shard registers of one HyperLogLog sketch.
pub struct HyperLogLogState {
    shards: [AtomicU8; N],
}

impl Default for HyperLogLogState {
    fn default() -> Self {
        // A const item is needed so the non-`Copy` atomic can be used in an
        // array-repeat expression.
        #[allow(clippy::declare_interior_mutable_const)]
        const ZERO: AtomicU8 = AtomicU8::new(0);
        Self { shards: [ZERO; N] }
    }
}

impl MetricType for HyperLogLogState {
    type Metadata = ();
}

impl HyperLogLogState {
    /// Hash `item` and record the hash in the sketch.
    pub fn measure(&self, item: &(impl Hash + ?Sized)) {
        // changing the hasher will break compatibility with previous measurements.
        self.record(BuildHasherDefault::::default().hash_one(item));
    }

    /// Fold one 64-bit hash into the shard registers.
    fn record(&self, hash: u64) {
        // Number of hash bits used to pick a shard.
        // NOTE(review): the mask below presumably requires N to be a power of two — confirm.
        let p = N.ilog2() as u8;
        // Shard index: the low `p` bits of the hash.
        let j = hash & (N as u64 - 1);
        // 1-based position of the first set bit among the remaining `64 - p` bits.
        // The shift itself introduces `p` leading zeros, hence the `- p`.
        let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
        // Each shard keeps the largest `rho` seen so far.
        self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
    }

    /// Read every shard register and reset it to zero.
    fn take_sample(&self) -> [u8; N] {
        self.shards.each_ref().map(|x| {
            // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.

            // This seems like it would be a race condition,
            // but HLL is not impacted by a write in one shard happening in between.
            // This is because in PromQL we will be implementing a harmonic mean of all buckets.
            // we will also merge samples in a time series using `max by (hll_shard)`.

            // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
            // this would mean that a dev port-forwarding the metrics url won't break the sampling.
            x.swap(0, std::sync::atomic::Ordering::Relaxed)
        })
    }
}

impl measured::metric::MetricEncoding> for HyperLogLogState {
    // The series is exposed with the gauge metric type.
    fn write_type(
        name: impl MetricNameEncoder,
        enc: &mut TextEncoder,
    ) -> Result<(), std::io::Error> {
        enc.write_type(&name, measured::text::MetricType::Gauge)
    }

    // Emits one sample per shard, adding an `hll_shard` label with the shard
    // index. Note that this goes through `take_sample`, so collecting resets
    // the registers.
    fn collect_into(
        &self,
        _: &(),
        labels: impl LabelGroup,
        name: impl MetricNameEncoder,
        enc: &mut TextEncoder,
    ) -> Result<(), std::io::Error> {
        // Label value wrapper that writes itself as an integer.
        struct I64(i64);
        impl LabelValue for I64 {
            fn visit(&self, v: V) -> V::Output {
                v.write_int(self.0)
            }
        }

        struct HllShardLabel {
            hll_shard: i64,
        }

        impl LabelGroup for HllShardLabel {
            fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
                const LE: &LabelName = LabelName::from_str("hll_shard");
                v.write_value(LE, &I64(self.hll_shard));
            }
        }

        self.take_sample()
            .into_iter()
            .enumerate()
            .try_for_each(|(hll_shard, val)| {
                CounterState::new(val as u64).collect_into(
                    &(),
                    labels.by_ref().compose_with(HllShardLabel {
                        hll_shard: hll_shard as i64,
                    }),
                    name.by_ref(),
                    enc,
                )
            })
    }
}

#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use measured::FixedCardinalityLabel;
    use measured::label::StaticLabelSet;
    use rand::rngs::StdRng;
    use rand::{Rng, SeedableRng};
    use rand_distr::{Distribution, Zipf};

    use crate::HyperLogLogVec;

    #[derive(FixedCardinalityLabel, Clone, Copy)]
    #[label(singleton = "x")]
    enum Label {
        A,
        B,
    }

    // Take (and thereby reset) the samples for both label values.
    fn collect(hll: &HyperLogLogVec, 32>) -> ([u8; 32], [u8; 32]) {
        // cannot go through the `hll.collect_family_into` interface yet...
        // need to see if I can fix the conflicting impls problem in measured.
        (
            hll.get_metric(hll.with_labels(Label::A)).take_sample(),
            hll.get_metric(hll.with_labels(Label::B)).take_sample(),
        )
    }

    // Merge the samples with a per-shard max, then apply the harmonic-mean
    // estimate with alpha = 0.697 and 32 shards.
    fn get_cardinality(samples: &[[u8; 32]]) -> f64 {
        let mut buckets = [0.0; 32];
        for &sample in samples {
            for (i, m) in sample.into_iter().enumerate() {
                buckets[i] = f64::max(buckets[i], m as f64);
            }
        }

        buckets
            .into_iter()
            .map(|f| 2.0f64.powf(-f))
            .sum::()
            .recip()
            * 0.697
            * 32.0
            * 32.0
    }

    // Feeds `n` samples of `dist` into each of the two labels using a fixed
    // seed, and returns the exact distinct counts `[A ∪ B, A, B]` next to the
    // corresponding HLL estimates.
    fn test_cardinality(n: usize, dist: impl Distribution) -> ([usize; 3], [f64; 3]) {
        let hll = HyperLogLogVec::, 32>::new();

        let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
        let mut set_a = HashSet::new();
        let mut set_b = HashSet::new();

        for x in iter.by_ref().take(n) {
            set_a.insert(x.to_bits());
            hll.get_metric(hll.with_labels(Label::A))
                .measure(&x.to_bits());
        }
        for x in iter.by_ref().take(n) {
            set_b.insert(x.to_bits());
            hll.get_metric(hll.with_labels(Label::B))
                .measure(&x.to_bits());
        }
        let merge = &set_a | &set_b;

        let (a, b) = collect(&hll);
        let len = get_cardinality(&[a, b]);
        let len_a = get_cardinality(&[a]);
        let len_b = get_cardinality(&[b]);

        ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
    }

    // The expected values below are pinned to the fixed RNG seed and hasher.

    #[test]
    fn test_cardinality_small() {
        let (actual, estimate) = test_cardinality(100, Zipf::new(100.0, 1.2f64).unwrap());

        assert_eq!(actual, [46, 30, 32]);
        assert!(51.3 < estimate[0] && estimate[0] < 51.4);
        assert!(44.0 < estimate[1] && estimate[1] < 44.1);
        assert!(39.0 < estimate[2] && estimate[2] < 39.1);
    }

    #[test]
    fn test_cardinality_medium() {
        let (actual, estimate) = test_cardinality(10000, Zipf::new(10000.0, 1.2f64).unwrap());

        assert_eq!(actual, [2529, 1618, 1629]);
        assert!(2309.1 < estimate[0] && estimate[0] < 2309.2);
        assert!(1566.6 < estimate[1] && estimate[1] < 1566.7);
        assert!(1629.5 < estimate[2] && estimate[2] < 1629.6);
    }

    #[test]
    fn test_cardinality_large() {
        let (actual, estimate) =
            test_cardinality(1_000_000, Zipf::new(1_000_000.0, 1.2f64).unwrap());

        assert_eq!(actual, [129077, 79579, 79630]);
        assert!(126067.2 < estimate[0] && estimate[0] < 126067.3);
        assert!(83076.8 < estimate[1] && estimate[1] < 83076.9);
        assert!(64251.2 < estimate[2] && estimate[2] < 64251.3);
    }

    #[test]
    fn test_cardinality_small2() {
        let (actual, estimate) = test_cardinality(100, Zipf::new(200.0, 0.8f64).unwrap());

        assert_eq!(actual, [92, 58, 60]);
        assert!(116.1 < estimate[0] && estimate[0] < 116.2);
        assert!(81.7 < estimate[1] && estimate[1] < 81.8);
        assert!(69.3 < estimate[2] && estimate[2] < 69.4);
    }

    #[test]
    fn test_cardinality_medium2() {
        let (actual, estimate) = test_cardinality(10000, Zipf::new(20000.0, 0.8f64).unwrap());

        assert_eq!(actual, [8201, 5131, 5051]);
        assert!(6846.4 < estimate[0] && estimate[0] < 6846.5);
        assert!(5239.1 < estimate[1] && estimate[1] < 5239.2);
        assert!(4292.8 < estimate[2] && estimate[2] < 4292.9);
    }

    #[test]
    fn test_cardinality_large2() {
        let (actual, estimate) =
            test_cardinality(1_000_000, Zipf::new(2_000_000.0, 0.8f64).unwrap());

        assert_eq!(actual, [777847, 482069, 482246]);
        assert!(699437.4 < estimate[0] && estimate[0] < 699437.5);
        assert!(374948.9 < estimate[1] && estimate[1] < 374949.0);
        assert!(434609.7 < estimate[2] && estimate[2] < 434609.8);
    }
}

================================================
FILE: libs/metrics/src/launch_timestamp.rs
================================================
//! A timestamp captured at process startup to identify restarts of the process, e.g., in logs and metrics.
use std::fmt::Display;

use chrono::Utc;

use super::register_uint_gauge;

/// Wall-clock time captured by [`LaunchTimestamp::generate`].
pub struct LaunchTimestamp(chrono::DateTime);

impl LaunchTimestamp {
    /// Capture the current UTC time.
    pub fn generate() -> Self {
        LaunchTimestamp(Utc::now())
    }
}

impl Display for LaunchTimestamp {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.0)
    }
}

/// Register the `libmetrics_launch_timestamp` gauge and set it to `launch_ts`
/// in milliseconds since the unix epoch.
///
/// Panics if the timestamp is before the epoch or if registering the gauge
/// fails.
pub fn set_launch_timestamp_metric(launch_ts: &'static LaunchTimestamp) {
    let millis_since_epoch: u64 = launch_ts
        .0
        .timestamp_millis()
        .try_into()
        .expect("we're after the epoch, this should be positive");
    let metric = register_uint_gauge!(
        "libmetrics_launch_timestamp",
        "Timestamp (millis since epoch) at which the process launched."
    )
    .unwrap();
    metric.set(millis_since_epoch);
}

================================================
FILE: libs/metrics/src/lib.rs
================================================
//! We re-export those from prometheus crate to
//! make sure that we use the same dep version everywhere.
//! Otherwise, we might not see all metrics registered via
//! a default registry.
#![deny(clippy::undocumented_unsafe_blocks)]

use std::sync::RwLock;

use measured::label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels};
use measured::metric::counter::CounterState;
use measured::metric::gauge::GaugeState;
use measured::metric::group::Encoding;
use measured::metric::name::{MetricName, MetricNameEncoder};
use measured::metric::{MetricEncoding, MetricFamilyEncoding, MetricType};
use measured::{FixedCardinalityLabel, LabelGroup, MetricGroup};
use once_cell::sync::Lazy;
use prometheus::Registry;
use prometheus::core::{
    Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
};
pub use prometheus::local::LocalHistogram;
pub use prometheus::{
    Counter, CounterVec, Encoder, Error, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter,
    IntCounterVec, IntGauge, IntGaugeVec, TextEncoder, core, default_registry, exponential_buckets,
    linear_buckets, opts, proto, register, register_counter_vec, register_gauge,
    register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter,
    register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
};

pub mod launch_timestamp;
mod wrappers;
pub use prometheus;
pub use wrappers::{CountedReader, CountedWriter};
mod hll;
pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec};
#[cfg(target_os = "linux")]
pub mod more_process_metrics;

pub type UIntGauge = GenericGauge;
pub type UIntGaugeVec = GenericGaugeVec;

/// Create a [`UIntGaugeVec`] and register it in the default registry.
///
/// The type is referenced through `$crate` so that callers do not need to
/// import `UIntGaugeVec` themselves for the expansion to compile.
#[macro_export]
macro_rules! register_uint_gauge_vec {
    ($NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{
        let gauge_vec =
            $crate::UIntGaugeVec::new($crate::opts!($NAME, $HELP), $LABELS_NAMES).unwrap();
        $crate::register(Box::new(gauge_vec.clone())).map(|_| gauge_vec)
    }};
}

/// Create a [`UIntGauge`] and register it in the default registry.
#[macro_export]
macro_rules! register_uint_gauge {
    ($NAME:expr, $HELP:expr $(,)?) => {{
        let gauge = $crate::UIntGauge::new($NAME, $HELP).unwrap();
        $crate::register(Box::new(gauge.clone())).map(|_| gauge)
    }};
}

/// Special internal registry, to collect metrics independently from the default registry.
/// Was introduced to fix deadlock with lazy registration of metrics in the default registry.
static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new);

/// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
///
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
/// while holding the lock.
pub fn register_internal(c: Box) -> prometheus::Result<()> {
    INTERNAL_REGISTRY.register(c)
}

/// Gathers all Prometheus metrics and records the I/O stats just before that.
///
/// Metrics gathering is a relatively simple and standalone operation, so
/// it might be fine to do it this way to keep things simple.
pub fn gather() -> Vec {
    update_rusage_metrics();
    // Default-registry metrics first, followed by the internal registry's.
    let mut mfs = prometheus::gather();
    let mut internal_mfs = INTERNAL_REGISTRY.gather();
    mfs.append(&mut internal_mfs);
    mfs
}

static DISK_IO_BYTES: Lazy = Lazy::new(|| {
    register_int_gauge_vec!(
        "libmetrics_disk_io_bytes_total",
        "Bytes written and read from disk, grouped by the operation (read|write)",
        &["io_operation"]
    )
    .expect("Failed to register disk i/o bytes int gauge vec")
});

static MAXRSS_KB: Lazy = Lazy::new(|| {
    register_int_gauge!(
        "libmetrics_maxrss_kb",
        "Memory usage (Maximum Resident Set Size)"
    )
    .expect("Failed to register maxrss_kb int gauge")
});

/// Most common fsync latency is 50 µs - 100 µs, but it can be much higher,
/// especially during many concurrent disk operations.
pub const DISK_FSYNC_SECONDS_BUCKETS: &[f64] =
    &[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0];

/// Constructs histogram buckets that are powers of two starting at 1 (i.e. 2^0), covering the end
/// points. For example, passing start=5,end=20 yields 4,8,16,32 as does start=4,end=32.
pub fn pow2_buckets(start: usize, end: usize) -> Vec { assert_ne!(start, 0); assert!(start <= end); let start = match start.checked_next_power_of_two() { Some(n) if n == start => n, // start already power of two Some(n) => n >> 1, // power of two below start None => panic!("start too large"), }; let end = end.checked_next_power_of_two().expect("end too large"); std::iter::successors(Some(start), |n| n.checked_mul(2)) .take_while(|n| n <= &end) .map(|n| n as f64) .collect() } pub struct InfoMetric { label: RwLock, metric: M, } impl InfoMetric { pub fn new(label: L) -> Self { Self::with_metric(label, GaugeState::new(1)) } } impl Default for InfoMetric { fn default() -> Self { InfoMetric::new(L::default()) } } impl> InfoMetric { pub fn with_metric(label: L, metric: M) -> Self { Self { label: RwLock::new(label), metric, } } pub fn set_label(&self, label: L) { *self.label.write().unwrap() = label; } } impl MetricFamilyEncoding for InfoMetric where L: LabelGroup, M: MetricEncoding, E: Encoding, { fn collect_family_into( &self, name: impl measured::metric::name::MetricNameEncoder, enc: &mut E, ) -> Result<(), E::Err> { M::write_type(&name, enc)?; self.metric .collect_into(&(), &*self.label.read().unwrap(), name, enc) } } pub struct BuildInfo { pub revision: &'static str, pub build_tag: &'static str, } impl LabelGroup for BuildInfo { fn visit_values(&self, v: &mut impl LabelGroupVisitor) { const REVISION: &LabelName = LabelName::from_str("revision"); v.write_value(REVISION, &self.revision); const BUILD_TAG: &LabelName = LabelName::from_str("build_tag"); v.write_value(BUILD_TAG, &self.build_tag); } } #[derive(MetricGroup)] #[metric(new(build_info: BuildInfo))] pub struct NeonMetrics { #[cfg(target_os = "linux")] #[metric(namespace = "process")] #[metric(init = measured_process::ProcessCollector::for_self())] process: measured_process::ProcessCollector, #[metric(namespace = "libmetrics")] #[metric(init = LibMetrics::new(build_info))] libmetrics: LibMetrics, } 
#[derive(MetricGroup)] #[metric(new(build_info: BuildInfo))] pub struct LibMetrics { #[metric(init = InfoMetric::new(build_info))] build_info: InfoMetric, #[metric(flatten)] rusage: Rusage, serve_count: CollectionCounter, } fn write_gauge( x: i64, labels: impl LabelGroup, name: impl MetricNameEncoder, enc: &mut Enc, ) -> Result<(), Enc::Err> where GaugeState: MetricEncoding, { GaugeState::new(x).collect_into(&(), labels, name, enc) } #[derive(Default)] struct Rusage; #[derive(FixedCardinalityLabel, Clone, Copy)] #[label(singleton = "io_operation")] enum IoOp { Read, Write, } impl MetricGroup for Rusage where GaugeState: MetricEncoding, { fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total"); const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb"); let ru = get_rusage_stats(); enc.write_help( DISK_IO, "Bytes written and read from disk, grouped by the operation (read|write)", )?; GaugeState::write_type(DISK_IO, enc)?; write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?; write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?; enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?; GaugeState::write_type(MAXRSS, enc)?; write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?; Ok(()) } } #[derive(Default)] struct CollectionCounter(CounterState); impl MetricFamilyEncoding for CollectionCounter where CounterState: MetricEncoding, { fn collect_family_into( &self, name: impl measured::metric::name::MetricNameEncoder, enc: &mut T, ) -> Result<(), T::Err> { self.0.inc(); enc.write_help(&name, "Number of metric requests made")?; self.0.collect_into(&(), NoLabels, name, enc) } } pub fn set_build_info_metric(revision: &str, build_tag: &str) { let metric = register_int_gauge_vec!( "libmetrics_build_info", "Build/version information", &["revision", "build_tag"] ) .expect("Failed to register build info metric"); metric.with_label_values(&[revision, 
build_tag]).set(1); } const BYTES_IN_BLOCK: i64 = 512; // Records I/O stats in a "cross-platform" way. // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats. // An alternative is to read procfs (`/proc/[pid]/io`) which does not work under macOS at all, hence abandoned. // // Uses https://www.freebsd.org/cgi/man.cgi?query=getrusage to retrieve the number of block operations // performed by the process. // We know the size of the block, so we can determine the I/O bytes out of it. // The value might be not 100% exact, but should be fine for Prometheus metrics in this case. fn update_rusage_metrics() { let rusage_stats = get_rusage_stats(); DISK_IO_BYTES .with_label_values(&["read"]) .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK); DISK_IO_BYTES .with_label_values(&["write"]) .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK); // On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669 #[cfg(target_os = "macos")] { MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024); } #[cfg(not(target_os = "macos"))] { MAXRSS_KB.set(rusage_stats.ru_maxrss); } } fn get_rusage_stats() -> libc::rusage { let mut rusage = std::mem::MaybeUninit::uninit(); // SAFETY: kernel will initialize the struct for us unsafe { let ret = libc::getrusage(libc::RUSAGE_SELF, rusage.as_mut_ptr()); assert!(ret == 0, "getrusage failed: bad args"); rusage.assume_init() } } /// Create an [`IntCounterPairVec`] and registers to default registry. #[macro_export(local_inner_macros)] macro_rules! register_int_counter_pair_vec { ($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr, $LABELS_NAMES:expr $(,)?) 
=> {{ match ( $crate::register_int_counter_vec!($NAME1, $HELP1, $LABELS_NAMES), $crate::register_int_counter_vec!($NAME2, $HELP2, $LABELS_NAMES), ) { (Ok(inc), Ok(dec)) => Ok($crate::IntCounterPairVec::new(inc, dec)), (Err(e), _) | (_, Err(e)) => Err(e), } }}; } /// Create an [`IntCounterPair`] and registers to default registry. #[macro_export(local_inner_macros)] macro_rules! register_int_counter_pair { ($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr $(,)?) => {{ match ( $crate::register_int_counter!($NAME1, $HELP1), $crate::register_int_counter!($NAME2, $HELP2), ) { (Ok(inc), Ok(dec)) => Ok($crate::IntCounterPair::new(inc, dec)), (Err(e), _) | (_, Err(e)) => Err(e), } }}; } /// A Pair of [`GenericCounterVec`]s. Like an [`GenericGaugeVec`] but will always observe changes pub struct GenericCounterPairVec { inc: GenericCounterVec

, dec: GenericCounterVec

, } /// A Pair of [`GenericCounter`]s. Like an [`GenericGauge`] but will always observe changes pub struct GenericCounterPair { inc: GenericCounter

, dec: GenericCounter

, } impl GenericCounterPairVec

{ pub fn new(inc: GenericCounterVec

, dec: GenericCounterVec

) -> Self { Self { inc, dec } } /// `get_metric_with_label_values` returns the [`GenericCounterPair

`] for the given slice /// of label values (same order as the VariableLabels in Desc). If that combination of /// label values is accessed for the first time, a new [`GenericCounterPair

`] is created. /// /// An error is returned if the number of label values is not the same as the /// number of VariableLabels in Desc. pub fn get_metric_with_label_values( &self, vals: &[&str], ) -> prometheus::Result> { Ok(GenericCounterPair { inc: self.inc.get_metric_with_label_values(vals)?, dec: self.dec.get_metric_with_label_values(vals)?, }) } /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error /// occurs. pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair

{ self.get_metric_with_label_values(vals).unwrap() } pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) { res[0] = self.inc.remove_label_values(vals); res[1] = self.dec.remove_label_values(vals); } } impl GenericCounterPair

{ pub fn new(inc: GenericCounter

, dec: GenericCounter

) -> Self { Self { inc, dec } } /// Increment the gauge by 1, returning a guard that decrements by 1 on drop. pub fn guard(&self) -> GenericCounterPairGuard

{ self.inc.inc(); GenericCounterPairGuard(self.dec.clone()) } /// Increment the gauge by n, returning a guard that decrements by n on drop. pub fn guard_by(&self, n: P::T) -> GenericCounterPairGuardBy

{ self.inc.inc_by(n); GenericCounterPairGuardBy(self.dec.clone(), n) } /// Increase the gauge by 1. #[inline] pub fn inc(&self) { self.inc.inc(); } /// Decrease the gauge by 1. #[inline] pub fn dec(&self) { self.dec.inc(); } /// Add the given value to the gauge. (The value can be /// negative, resulting in a decrement of the gauge.) #[inline] pub fn inc_by(&self, v: P::T) { self.inc.inc_by(v); } /// Subtract the given value from the gauge. (The value can be /// negative, resulting in an increment of the gauge.) #[inline] pub fn dec_by(&self, v: P::T) { self.dec.inc_by(v); } } impl Clone for GenericCounterPair

{ fn clone(&self) -> Self { Self { inc: self.inc.clone(), dec: self.dec.clone(), } } } /// Guard returned by [`GenericCounterPair::guard`] pub struct GenericCounterPairGuard(GenericCounter

); impl Drop for GenericCounterPairGuard

{ fn drop(&mut self) { self.0.inc(); } } /// Guard returned by [`GenericCounterPair::guard_by`] pub struct GenericCounterPairGuardBy(GenericCounter

, P::T); impl Drop for GenericCounterPairGuardBy

{ fn drop(&mut self) { self.0.inc_by(self.1); } } /// A Pair of [`IntCounterVec`]s. Like an [`IntGaugeVec`] but will always observe changes pub type IntCounterPairVec = GenericCounterPairVec; /// A Pair of [`IntCounter`]s. Like an [`IntGauge`] but will always observe changes pub type IntCounterPair = GenericCounterPair; /// A guard for [`IntCounterPair`] that will decrement the gauge on drop pub type IntCounterPairGuard = GenericCounterPairGuard; pub trait CounterPairAssoc { const INC_NAME: &'static MetricName; const DEC_NAME: &'static MetricName; const INC_HELP: &'static str; const DEC_HELP: &'static str; type LabelGroupSet: LabelGroupSet; } pub struct CounterPairVec { vec: measured::metric::MetricVec, } impl Default for CounterPairVec where A::LabelGroupSet: Default, { fn default() -> Self { Self { vec: Default::default(), } } } impl CounterPairVec { pub fn guard( &self, labels: ::Group<'_>, ) -> MeasuredCounterPairGuard<'_, A> { let id = self.vec.with_labels(labels); self.vec.get_metric(id).inc.inc(); MeasuredCounterPairGuard { vec: &self.vec, id } } pub fn inc(&self, labels: ::Group<'_>) { let id = self.vec.with_labels(labels); self.vec.get_metric(id).inc.inc(); } pub fn dec(&self, labels: ::Group<'_>) { let id = self.vec.with_labels(labels); self.vec.get_metric(id).dec.inc(); } pub fn remove_metric( &self, labels: ::Group<'_>, ) -> Option { let id = self.vec.with_labels(labels); self.vec.remove_metric(id) } pub fn sample(&self, labels: ::Group<'_>) -> u64 { let id = self.vec.with_labels(labels); let metric = self.vec.get_metric(id); let inc = metric.inc.count.load(std::sync::atomic::Ordering::Relaxed); let dec = metric.dec.count.load(std::sync::atomic::Ordering::Relaxed); inc.saturating_sub(dec) } } impl ::measured::metric::group::MetricGroup for CounterPairVec where T: ::measured::metric::group::Encoding, A: CounterPairAssoc, ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding, { fn collect_group_into(&self, enc: &mut T) -> 
Result<(), T::Err> { // write decrement first to avoid a race condition where inc - dec < 0 T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?; self.vec .collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?; T::write_help(enc, A::INC_NAME, A::INC_HELP)?; self.vec .collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?; Ok(()) } } #[derive(MetricGroup, Default)] pub struct MeasuredCounterPairState { pub inc: CounterState, pub dec: CounterState, } impl measured::metric::MetricType for MeasuredCounterPairState { type Metadata = (); } pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> { vec: &'a measured::metric::MetricVec, id: measured::metric::LabelId, } impl Drop for MeasuredCounterPairGuard<'_, A> { fn drop(&mut self) { self.vec.get_metric(self.id).dec.inc(); } } /// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder. struct Inc(T); /// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder. 
struct Dec(T); impl Encoding for Inc { type Err = T::Err; fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { self.0.write_help(name, help) } } impl MetricEncoding> for MeasuredCounterPairState where CounterState: MetricEncoding, { fn write_type(name: impl MetricNameEncoder, enc: &mut Inc) -> Result<(), T::Err> { CounterState::write_type(name, &mut enc.0) } fn collect_into( &self, metadata: &(), labels: impl LabelGroup, name: impl MetricNameEncoder, enc: &mut Inc, ) -> Result<(), T::Err> { self.inc.collect_into(metadata, labels, name, &mut enc.0) } } impl Encoding for Dec { type Err = T::Err; fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { self.0.write_help(name, help) } } /// Write the dec counter to the encoder impl MetricEncoding> for MeasuredCounterPairState where CounterState: MetricEncoding, { fn write_type(name: impl MetricNameEncoder, enc: &mut Dec) -> Result<(), T::Err> { CounterState::write_type(name, &mut enc.0) } fn collect_into( &self, metadata: &(), labels: impl LabelGroup, name: impl MetricNameEncoder, enc: &mut Dec, ) -> Result<(), T::Err> { self.dec.collect_into(metadata, labels, name, &mut enc.0) } } #[cfg(test)] mod tests { use super::*; const POW2_BUCKETS_MAX: usize = 1 << (usize::BITS - 1); #[test] fn pow2_buckets_cases() { assert_eq!(pow2_buckets(1, 1), vec![1.0]); assert_eq!(pow2_buckets(1, 2), vec![1.0, 2.0]); assert_eq!(pow2_buckets(1, 3), vec![1.0, 2.0, 4.0]); assert_eq!(pow2_buckets(1, 4), vec![1.0, 2.0, 4.0]); assert_eq!(pow2_buckets(1, 5), vec![1.0, 2.0, 4.0, 8.0]); assert_eq!(pow2_buckets(1, 6), vec![1.0, 2.0, 4.0, 8.0]); assert_eq!(pow2_buckets(1, 7), vec![1.0, 2.0, 4.0, 8.0]); assert_eq!(pow2_buckets(1, 8), vec![1.0, 2.0, 4.0, 8.0]); assert_eq!( pow2_buckets(1, 200), vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0] ); assert_eq!(pow2_buckets(1, 8), vec![1.0, 2.0, 4.0, 8.0]); assert_eq!(pow2_buckets(2, 8), vec![2.0, 4.0, 8.0]); 
assert_eq!(pow2_buckets(3, 8), vec![2.0, 4.0, 8.0]); assert_eq!(pow2_buckets(4, 8), vec![4.0, 8.0]); assert_eq!(pow2_buckets(5, 8), vec![4.0, 8.0]); assert_eq!(pow2_buckets(6, 8), vec![4.0, 8.0]); assert_eq!(pow2_buckets(7, 8), vec![4.0, 8.0]); assert_eq!(pow2_buckets(8, 8), vec![8.0]); assert_eq!(pow2_buckets(20, 200), vec![16.0, 32.0, 64.0, 128.0, 256.0]); // Largest valid values. assert_eq!( pow2_buckets(1, POW2_BUCKETS_MAX).len(), usize::BITS as usize ); assert_eq!(pow2_buckets(POW2_BUCKETS_MAX, POW2_BUCKETS_MAX).len(), 1); } #[test] #[should_panic] fn pow2_buckets_zero_start() { pow2_buckets(0, 1); } #[test] #[should_panic] fn pow2_buckets_end_lt_start() { pow2_buckets(2, 1); } #[test] #[should_panic] fn pow2_buckets_end_overflow_min() { pow2_buckets(1, POW2_BUCKETS_MAX + 1); } #[test] #[should_panic] fn pow2_buckets_end_overflow_max() { pow2_buckets(1, usize::MAX); } } ================================================ FILE: libs/metrics/src/more_process_metrics.rs ================================================ //! process metrics that the [`::prometheus`] crate doesn't provide. // This module has heavy inspiration from the prometheus crate's `process_collector.rs`. use once_cell::sync::Lazy; use prometheus::Gauge; use crate::UIntGauge; pub struct Collector { descs: Vec, vmlck: crate::UIntGauge, cpu_seconds_highres: Gauge, } const NMETRICS: usize = 2; static CLK_TCK_F64: Lazy = Lazy::new(|| { // SAFETY: libc::sysconf is safe, it merely returns a value. 
let long = unsafe { libc::sysconf(libc::_SC_CLK_TCK) }; if long == -1 { panic!("sysconf(_SC_CLK_TCK) failed"); } let convertible_to_f64: i32 = i32::try_from(long).expect("sysconf(_SC_CLK_TCK) is larger than i32"); convertible_to_f64 as f64 }); impl prometheus::core::Collector for Collector { fn desc(&self) -> Vec<&prometheus::core::Desc> { self.descs.iter().collect() } fn collect(&self) -> Vec { let Ok(myself) = procfs::process::Process::myself() else { return vec![]; }; let mut mfs = Vec::with_capacity(NMETRICS); if let Ok(status) = myself.status() { if let Some(vmlck) = status.vmlck { self.vmlck.set(vmlck); mfs.extend(self.vmlck.collect()) } } if let Ok(stat) = myself.stat() { let cpu_seconds = stat.utime + stat.stime; self.cpu_seconds_highres .set(cpu_seconds as f64 / *CLK_TCK_F64); mfs.extend(self.cpu_seconds_highres.collect()); } mfs } } impl Collector { pub fn new() -> Self { let mut descs = Vec::new(); let vmlck = UIntGauge::new("libmetrics_process_status_vmlck", "/proc/self/status vmlck").unwrap(); descs.extend( prometheus::core::Collector::desc(&vmlck) .into_iter() .cloned(), ); let cpu_seconds_highres = Gauge::new( "libmetrics_process_cpu_seconds_highres", "Total user and system CPU time spent in seconds.\ Sub-second resolution, hence better than `process_cpu_seconds_total`.", ) .unwrap(); descs.extend( prometheus::core::Collector::desc(&cpu_seconds_highres) .into_iter() .cloned(), ); Self { descs, vmlck, cpu_seconds_highres, } } } impl Default for Collector { fn default() -> Self { Self::new() } } ================================================ FILE: libs/metrics/src/wrappers.rs ================================================ use std::io::{Read, Result, Write}; /// A wrapper for an object implementing [Read] /// which allows a closure to observe the amount of bytes read. /// This is useful in conjunction with metrics (e.g. [IntCounter](crate::IntCounter)). 
/// /// Example: /// /// ``` /// # use std::io::{Result, Read}; /// # use metrics::{register_int_counter, IntCounter}; /// # use metrics::CountedReader; /// # use once_cell::sync::Lazy; /// # /// # static INT_COUNTER: Lazy = Lazy::new( || { register_int_counter!( /// # "int_counter", /// # "let's count something!" /// # ).unwrap() /// # }); /// # /// fn do_some_reads(stream: impl Read, count: usize) -> Result> { /// let mut reader = CountedReader::new(stream, |cnt| { /// // bump a counter each time we do a read /// INT_COUNTER.inc_by(cnt as u64); /// }); /// /// let mut proto_header = [0; 8]; /// reader.read_exact(&mut proto_header)?; /// assert!(&proto_header == b"deadbeef"); /// /// let mut payload = vec![0; count]; /// reader.read_exact(&mut payload)?; /// Ok(payload) /// } /// ``` /// /// NB: rapid concurrent bumping of an atomic counter might incur /// a performance penalty. Please make sure to amortize the amount /// of atomic operations by either using [BufReader](std::io::BufReader) /// or choosing a non-atomic (thread local) counter. pub struct CountedReader<'a, T> { reader: T, update_counter: Box, } impl<'a, T> CountedReader<'a, T> { pub fn new(reader: T, update_counter: impl FnMut(usize) + Sync + Send + 'a) -> Self { Self { reader, update_counter: Box::new(update_counter), } } /// Get an immutable reference to the underlying [Read] implementor pub fn inner(&self) -> &T { &self.reader } /// Get a mutable reference to the underlying [Read] implementor pub fn inner_mut(&mut self) -> &mut T { &mut self.reader } /// Consume the wrapper and return the underlying [Read] implementor pub fn into_inner(self) -> T { self.reader } } impl Read for CountedReader<'_, T> { fn read(&mut self, buf: &mut [u8]) -> Result { let count = self.reader.read(buf)?; (self.update_counter)(count); Ok(count) } } /// A wrapper for an object implementing [Write] /// which allows a closure to observe the amount of bytes written. /// This is useful in conjunction with metrics (e.g. 
[IntCounter](crate::IntCounter)). /// /// Example: /// /// ``` /// # use std::io::{Result, Write}; /// # use metrics::{register_int_counter, IntCounter}; /// # use metrics::CountedWriter; /// # use once_cell::sync::Lazy; /// # /// # static INT_COUNTER: Lazy = Lazy::new( || { register_int_counter!( /// # "int_counter", /// # "let's count something!" /// # ).unwrap() /// # }); /// # /// fn do_some_writes(stream: impl Write, payload: &[u8]) -> Result<()> { /// let mut writer = CountedWriter::new(stream, |cnt| { /// // bump a counter each time we do a write /// INT_COUNTER.inc_by(cnt as u64); /// }); /// /// let proto_header = b"deadbeef"; /// writer.write_all(proto_header)?; /// writer.write_all(payload) /// } /// ``` /// /// NB: rapid concurrent bumping of an atomic counter might incur /// a performance penalty. Please make sure to amortize the amount /// of atomic operations by either using [BufWriter](std::io::BufWriter) /// or choosing a non-atomic (thread local) counter. pub struct CountedWriter<'a, T> { writer: T, update_counter: Box, } impl<'a, T> CountedWriter<'a, T> { pub fn new(writer: T, update_counter: impl FnMut(usize) + Sync + Send + 'a) -> Self { Self { writer, update_counter: Box::new(update_counter), } } /// Get an immutable reference to the underlying [Write] implementor pub fn inner(&self) -> &T { &self.writer } /// Get a mutable reference to the underlying [Write] implementor pub fn inner_mut(&mut self) -> &mut T { &mut self.writer } /// Consume the wrapper and return the underlying [Write] implementor pub fn into_inner(self) -> T { self.writer } } impl Write for CountedWriter<'_, T> { fn write(&mut self, buf: &[u8]) -> Result { let count = self.writer.write(buf)?; (self.update_counter)(count); Ok(count) } fn flush(&mut self) -> Result<()> { self.writer.flush() } } #[cfg(test)] mod tests { use super::*; #[test] fn test_counted_reader() { let stream = [0; 16]; let mut total = 0; let mut reader = CountedReader::new(stream.as_ref(), |cnt| { total += 
cnt; }); let mut buffer = [0; 8]; reader.read_exact(&mut buffer).unwrap(); reader.read_exact(&mut buffer).unwrap(); drop(reader); assert_eq!(total, stream.len()); } #[test] fn test_counted_writer() { let mut stream = [0; 16]; let mut total = 0; let mut writer = CountedWriter::new(stream.as_mut(), |cnt| { total += cnt; }); let buffer = [0; 8]; writer.write_all(&buffer).unwrap(); writer.write_all(&buffer).unwrap(); drop(writer); assert_eq!(total, stream.len()); } // This mimics the constraints of std::thread::spawn fn assert_send_sync(_x: impl Sync + Send + 'static) {} #[test] fn test_send_sync_counted_reader() { let stream: &[u8] = &[]; let mut reader = CountedReader::new(stream, |_| {}); assert_send_sync(move || { reader.read_exact(&mut []).unwrap(); }); } #[test] fn test_send_sync_counted_writer() { let stream = Vec::::new(); let mut writer = CountedWriter::new(stream, |_| {}); assert_send_sync(move || { writer.write_all(&[]).unwrap(); }); } } ================================================ FILE: libs/neon-shmem/Cargo.toml ================================================ [package] name = "neon-shmem" version = "0.1.0" edition.workspace = true license.workspace = true [dependencies] thiserror.workspace = true nix.workspace=true workspace_hack = { version = "0.1", path = "../../workspace_hack" } libc.workspace = true lock_api.workspace = true rustc-hash.workspace = true [target.'cfg(target_os = "macos")'.dependencies] tempfile = "3.14.0" [dev-dependencies] rand.workspace = true rand_distr = "0.5.1" ================================================ FILE: libs/neon-shmem/src/hash/core.rs ================================================ //! Simple hash table with chaining. use std::hash::Hash; use std::mem::MaybeUninit; use crate::hash::entry::*; /// Invalid position within the map (either within the dictionary or bucket array). pub(crate) const INVALID_POS: u32 = u32::MAX; /// Fundamental storage unit within the hash table. Either empty or contains a key-value pair. 
/// Always part of a chain of some kind (either a freelist if empty or a hash chain if full). pub(crate) struct Bucket { /// Index of next bucket in the chain. pub(crate) next: u32, /// Key-value pair contained within bucket. pub(crate) inner: Option<(K, V)>, } /// Core hash table implementation. pub(crate) struct CoreHashMap<'a, K, V> { /// Dictionary used to map hashes to bucket indices. pub(crate) dictionary: &'a mut [u32], /// Buckets containing key-value pairs. pub(crate) buckets: &'a mut [Bucket], /// Head of the freelist. pub(crate) free_head: u32, /// Maximum index of a bucket allowed to be allocated. [`INVALID_POS`] if no limit. pub(crate) alloc_limit: u32, /// The number of currently occupied buckets. pub(crate) buckets_in_use: u32, } /// Error for when there are no empty buckets left but one is needed. #[derive(Debug, PartialEq)] pub struct FullError; impl<'a, K: Clone + Hash + Eq, V> CoreHashMap<'a, K, V> { const FILL_FACTOR: f32 = 0.60; /// Estimate the size of data contained within the hash map. 
pub fn estimate_size(num_buckets: u32) -> usize { let mut size = 0; // buckets size += size_of::>() * num_buckets as usize; // dictionary size += (f32::ceil((size_of::() * num_buckets as usize) as f32 / Self::FILL_FACTOR)) as usize; size } pub fn new( buckets: &'a mut [MaybeUninit>], dictionary: &'a mut [MaybeUninit], ) -> Self { // Initialize the buckets for i in 0..buckets.len() { buckets[i].write(Bucket { next: if i < buckets.len() - 1 { i as u32 + 1 } else { INVALID_POS }, inner: None, }); } // Initialize the dictionary for e in dictionary.iter_mut() { e.write(INVALID_POS); } // TODO: use std::slice::assume_init_mut() once it stabilizes let buckets = unsafe { std::slice::from_raw_parts_mut(buckets.as_mut_ptr().cast(), buckets.len()) }; let dictionary = unsafe { std::slice::from_raw_parts_mut(dictionary.as_mut_ptr().cast(), dictionary.len()) }; Self { dictionary, buckets, free_head: 0, buckets_in_use: 0, alloc_limit: INVALID_POS, } } /// Get the value associated with a key (if it exists) given its hash. pub fn get_with_hash(&self, key: &K, hash: u64) -> Option<&V> { let mut next = self.dictionary[hash as usize % self.dictionary.len()]; loop { if next == INVALID_POS { return None; } let bucket = &self.buckets[next as usize]; let (bucket_key, bucket_value) = bucket.inner.as_ref().expect("entry is in use"); if bucket_key == key { return Some(bucket_value); } next = bucket.next; } } /// Get number of buckets in map. pub fn get_num_buckets(&self) -> usize { self.buckets.len() } /// Clears all entries from the hashmap. /// /// Does not reset any allocation limits, but does clear any entries beyond them. 
pub fn clear(&mut self) {
    // Empty every bucket and chain it to its successor; the last bucket terminates
    // the freelist.
    let last = self.buckets.len().saturating_sub(1);
    for (i, bucket) in self.buckets.iter_mut().enumerate() {
        *bucket = Bucket {
            next: if i < last { i as u32 + 1 } else { INVALID_POS },
            inner: None,
        };
    }
    self.dictionary.fill(INVALID_POS);

    // With zero buckets there is no bucket 0 to hand out. Pointing the freelist at it
    // would make the next `alloc_bucket` index out of bounds instead of returning
    // `FullError`.
    self.free_head = if self.buckets.is_empty() {
        INVALID_POS
    } else {
        0
    };
    self.buckets_in_use = 0;
}

/// Find the position of an unused bucket via the freelist and initialize it.
///
/// Free buckets at or above `alloc_limit` are skipped and stay on the freelist.
///
/// # Errors
/// Returns [`FullError`] if the freelist holds no bucket below `alloc_limit`.
pub(crate) fn alloc_bucket(&mut self, key: K, value: V) -> Result<u32, FullError> {
    // Find the first bucket we're *allowed* to use, remembering its predecessor.
    let mut prev = PrevPos::First(self.free_head);
    let mut pos = self.free_head;
    while pos != INVALID_POS && pos >= self.alloc_limit {
        prev = PrevPos::Chained(pos);
        pos = self.buckets[pos as usize].next;
    }
    if pos == INVALID_POS {
        return Err(FullError);
    }

    // Repair the freelist by unlinking the chosen bucket.
    let next_pos = self.buckets[pos as usize].next;
    match prev {
        PrevPos::First(_) => self.free_head = next_pos,
        PrevPos::Chained(p) => {
            if p != INVALID_POS {
                self.buckets[p as usize].next = next_pos;
            }
        }
        _ => unreachable!(),
    }

    // Initialize the bucket.
    self.buckets_in_use += 1;
    let bucket = &mut self.buckets[pos as usize];
    bucket.next = INVALID_POS;
    bucket.inner = Some((key, value));
    Ok(pos)
}
}

================================================
FILE: libs/neon-shmem/src/hash/entry.rs
================================================
//! Equivalent of [`std::collections::hash_map::Entry`] for this hashmap.

use crate::hash::core::{CoreHashMap, FullError, INVALID_POS};
use crate::sync::{RwLockWriteGuard, ValueWriteGuard};

use std::hash::Hash;
use std::mem;

/// A view into a single map slot, either holding a key already or not.
pub enum Entry<'a, 'b, K, V> {
    Occupied(OccupiedEntry<'a, 'b, K, V>),
    Vacant(VacantEntry<'a, 'b, K, V>),
}

/// Enum representing the previous position within a chain.
#[derive(Clone, Copy)]
pub(crate) enum PrevPos {
    /// Starting index within the dictionary.
    First(u32),
    /// Regular index within the buckets.
    Chained(u32),
    /// Unknown - e.g. the associated entry was retrieved by index instead of chain.
    /// Carries the key's hash so the chain can be located again.
    Unknown(u64),
}

pub struct OccupiedEntry<'a, 'b, K, V> {
    /// Mutable reference to the map containing this entry.
    pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
    /// The key of the occupied entry
    pub(crate) _key: K,
    /// The index of the previous entry in the chain.
    pub(crate) prev_pos: PrevPos,
    /// The position of the bucket in the [`CoreHashMap`] bucket array.
    pub(crate) bucket_pos: u32,
}

impl<K, V> OccupiedEntry<'_, '_, K, V> {
    /// Borrow the value stored in this entry.
    pub fn get(&self) -> &V {
        let (_, value) = self.map.buckets[self.bucket_pos as usize]
            .inner
            .as_ref()
            .unwrap();
        value
    }

    /// Mutably borrow the value stored in this entry.
    pub fn get_mut(&mut self) -> &mut V {
        let (_, value) = self.map.buckets[self.bucket_pos as usize]
            .inner
            .as_mut()
            .unwrap();
        value
    }

    /// Inserts a value into the entry, replacing (and returning) the existing value.
    pub fn insert(&mut self, value: V) -> V {
        // `inner` must be `Some` for an `OccupiedEntry`.
        let slot = self.map.buckets[self.bucket_pos as usize]
            .inner
            .as_mut()
            .unwrap();
        mem::replace(&mut slot.1, value)
    }

    /// Removes the entry from the hash map, returning the value originally stored within it.
    ///
    /// This may result in multiple bucket accesses if the entry was obtained by index as the
    /// previous chain entry needs to be discovered in this case.
    pub fn remove(mut self) -> V {
        let pos = self.bucket_pos;

        // If this bucket was queried by index, go ahead and follow its chain from the start.
        let prev = match self.prev_pos {
            PrevPos::Unknown(hash) => {
                let dict_idx = hash as usize % self.map.dictionary.len();
                let mut prev = PrevPos::First(dict_idx as u32);
                let mut curr = self.map.dictionary[dict_idx];
                while curr != pos {
                    assert!(curr != INVALID_POS);
                    prev = PrevPos::Chained(curr);
                    curr = self.map.buckets[curr as usize].next;
                }
                prev
            }
            known => known,
        };

        // Unlink it from the chain.
        let next = self.map.buckets[pos as usize].next;
        match prev {
            PrevPos::First(dict_pos) => self.map.dictionary[dict_pos as usize] = next,
            PrevPos::Chained(prev_bucket) => self.map.buckets[prev_bucket as usize].next = next,
            _ => unreachable!(),
        }

        // And push it onto the head of the freelist.
        let free = self.map.free_head;
        let bucket = &mut self.map.buckets[pos as usize];
        let old_value = bucket.inner.take();
        bucket.next = free;
        self.map.free_head = pos;
        self.map.buckets_in_use -= 1;

        // `inner` was `Some` because this is an `OccupiedEntry`.
        old_value.unwrap().1
    }
}

/// An abstract view into a vacant entry within the map.
pub struct VacantEntry<'a, 'b, K, V> {
    /// Mutable reference to the map containing this entry.
    pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
    /// The key to be inserted into this entry.
    pub(crate) key: K,
    /// The position within the dictionary corresponding to the key's hash.
    pub(crate) dict_pos: u32,
}

impl<'b, K: Clone + Hash + Eq, V> VacantEntry<'_, 'b, K, V> {
    /// Insert a value into the vacant entry, finding and populating an empty bucket in the process.
    ///
    /// # Errors
    /// Will return [`FullError`] if there are no unoccupied buckets in the map.
pub fn insert(mut self, value: V) -> Result<ValueWriteGuard<'b, V>, FullError> {
        let dict_pos = self.dict_pos as usize;
        let pos = self.map.alloc_bucket(self.key, value)?;

        // Push the new bucket onto the front of the chain for this dictionary slot.
        let chain_head = self.map.dictionary[dict_pos];
        self.map.buckets[pos as usize].next = chain_head;
        self.map.dictionary[dict_pos] = pos;

        // Narrow the write guard down to the freshly inserted value.
        Ok(RwLockWriteGuard::map(self.map, |m| {
            &mut m.buckets[pos as usize].inner.as_mut().unwrap().1
        }))
    }
}

================================================
FILE: libs/neon-shmem/src/hash/tests.rs
================================================
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::fmt::Debug;
use std::mem::MaybeUninit;

use crate::hash::Entry;
use crate::hash::HashMapAccess;
use crate::hash::HashMapInit;
use crate::hash::core::FullError;

use rand::seq::SliceRandom;
use rand::{Rng, RngCore};
use rand_distr::Zipf;

const TEST_KEY_LEN: usize = 16;

/// Fixed-width key used by all tests; converts to and from `u128` in big-endian order.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
struct TestKey([u8; TEST_KEY_LEN]);

impl From<&TestKey> for u128 {
    fn from(val: &TestKey) -> u128 {
        u128::from_be_bytes(val.0)
    }
}

impl From<u128> for TestKey {
    fn from(val: u128) -> TestKey {
        TestKey(val.to_be_bytes())
    }
}

impl<'a> From<&'a [u8]> for TestKey {
    fn from(bytes: &'a [u8]) -> TestKey {
        TestKey(bytes.try_into().unwrap())
    }
}

/// Insert every key with its slice index as the value, then check that every key reads
/// back that index.
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
    let w = HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, "test_inserts")
        .attach_writer();

    for (idx, k) in keys.iter().enumerate() {
        match w.entry((*k).into()) {
            Entry::Occupied(mut e) => {
                e.insert(idx);
            }
            Entry::Vacant(e) => {
                let res = e.insert(idx);
                assert!(res.is_ok());
            }
        };
    }

    for (idx, k) in keys.iter().enumerate() {
        let x = w.get(&(*k).into());
        let value = x.as_deref().copied();
        assert_eq!(value, Some(idx));
    }
}

#[test]
fn dense() {
    // A handful of keys, including one that differs in a higher byte.
    let keys: &[u128] = &[0, 1, 2, 3, 256];
    test_inserts(keys);

    // Dense keys
    let mut keys: Vec<u128> = (0..10000).collect();
    test_inserts(&keys);

    // Do the same in random orders
    for _ in 1..10 {
        keys.shuffle(&mut rand::rng());
        test_inserts(&keys);
    }
}

#[test]
fn sparse() {
    // sparse keys
    let mut keys: Vec<TestKey> = Vec::new();
    let mut used_keys = HashSet::new();
    for _ in 0..10000 {
        loop {
            let key = rand::random::<u128>();
            if used_keys.contains(&key) {
                continue;
            }
            used_keys.insert(key);
            keys.push(key.into());
            break;
        }
    }
    test_inserts(&keys);
}

/// One operation on a key: `Some(v)` inserts/updates to `v`, `None` removes the key.
#[derive(Clone, Debug)]
struct TestOp(TestKey, Option<usize>);

/// Apply `op` to both the shadow `BTreeMap` and the hash map and check that both report
/// the same previous value.
fn apply_op(
    op: &TestOp,
    map: &mut HashMapAccess<TestKey, usize>,
    shadow: &mut BTreeMap<TestKey, usize>,
) {
    // apply the change to the shadow tree first
    let shadow_existing = if let Some(v) = op.1 {
        shadow.insert(op.0, v)
    } else {
        shadow.remove(&op.0)
    };

    let entry = map.entry(op.0);
    let hash_existing = match op.1 {
        Some(new) => match entry {
            Entry::Occupied(mut e) => Some(e.insert(new)),
            Entry::Vacant(e) => {
                _ = e.insert(new).unwrap();
                None
            }
        },
        None => match entry {
            Entry::Occupied(e) => Some(e.remove()),
            Entry::Vacant(_) => None,
        },
    };

    assert_eq!(shadow_existing, hash_existing);
}

/// Run `num_ops` random operations on keys drawn from `0..size`.
///
/// Each operation is an insert/update with probability `insert_prob` and a removal
/// otherwise.
fn do_random_ops(
    num_ops: usize,
    size: u32,
    insert_prob: f64,
    writer: &mut HashMapAccess<TestKey, usize>,
    shadow: &mut BTreeMap<TestKey, usize>,
    rng: &mut rand::rngs::ThreadRng,
) {
    for i in 0..num_ops {
        let key: TestKey = ((rng.next_u32() % size) as u128).into();
        let op = TestOp(
            key,
            if rng.random_bool(insert_prob) {
                Some(i)
            } else {
                None
            },
        );
        apply_op(&op, writer, shadow);
    }
}

/// Remove the `num_ops` smallest keys of the shadow map from both maps.
fn do_deletes(
    num_ops: usize,
    writer: &mut HashMapAccess<TestKey, usize>,
    shadow: &mut BTreeMap<TestKey, usize>,
) {
    for _ in 0..num_ops {
        let (k, _) = shadow.pop_first().unwrap();
        writer.remove(&k);
    }
}

/// Shrink the map from `from` to `to` buckets, evicting whatever lives in `to..from`.
fn do_shrink(
    writer: &mut HashMapAccess<TestKey, usize>,
    shadow: &mut BTreeMap<TestKey, usize>,
    from: u32,
    to: u32,
) {
    assert!(writer.shrink_goal().is_none());
    writer.begin_shrink(to);
    assert_eq!(writer.shrink_goal(), Some(to as usize));

    for i in to..from {
        if let Some(entry) = writer.entry_at_bucket(i as usize) {
            shadow.remove(&entry._key);
            entry.remove();
        }
    }

    let old_usage = writer.get_num_buckets_in_use();
    writer.finish_shrink().unwrap();
    assert!(writer.shrink_goal().is_none());
    assert_eq!(writer.get_num_buckets_in_use(), old_usage);
}

#[test]
fn random_ops() {
    let mut writer =
        HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, "test_random")
            .attach_writer();
    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();

    let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
    let mut rng = rand::rng();
    for i in 0..100000 {
        let key: TestKey = (rng.sample(distribution) as u128).into();
        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
        apply_op(&op, &mut writer, &mut shadow);
    }
}

#[test]
fn test_shuffle() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, "test_shuf")
        .attach_writer();
    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
    writer.shuffle();
    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
}

#[test]
fn test_grow() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 2000, "test_grow")
        .attach_writer();
    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
    let old_usage = writer.get_num_buckets_in_use();
    writer.grow(1500).unwrap();
    assert_eq!(writer.get_num_buckets_in_use(), old_usage);
    assert_eq!(writer.get_num_buckets(), 1500);
    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
}

#[test]
fn test_clear() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_clear")
        .attach_writer();
    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
    writer.clear();
    assert_eq!(writer.get_num_buckets_in_use(), 0);
    assert_eq!(writer.get_num_buckets(), 1500);
    while let Some((key, _)) = shadow.pop_first() {
        assert!(writer.get(&key).is_none());
    }

    do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
    // Fill the remaining buckets with keys outside the random range.
    for i in 0..(1500 - writer.get_num_buckets_in_use()) {
        writer.insert((1500 + i as u128).into(), 0).unwrap();
    }
    assert_eq!(writer.insert(5000.into(), 0), Err(FullError {}));
    writer.clear();
    assert!(writer.insert(5000.into(), 0).is_ok());
}

#[test]
fn test_idx_remove() {
    let mut writer =
        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_idx_remove")
            .attach_writer();
    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
    for _ in 0..100 {
        let idx = (rng.next_u32() % 1500) as usize;
        if let Some(e) = writer.entry_at_bucket(idx) {
            shadow.remove(&e._key);
            e.remove();
        }
    }
    while let Some((key, val)) = shadow.pop_first() {
        assert_eq!(*writer.get(&key).unwrap(), val);
    }
}

#[test]
fn test_idx_get() {
    let mut writer =
        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_idx_get")
            .attach_writer();
    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
    for _ in 0..100 {
        let idx = (rng.next_u32() % 1500) as usize;
        if let Some(pair) = writer.get_at_bucket(idx) {
            let v: *const usize = &pair.1;
            assert_eq!(writer.get_bucket_for_value(v), idx);
        }
    }
}

#[test]
fn test_shrink() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink")
        .attach_writer();
    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
    do_shrink(&mut writer, &mut shadow, 1500, 1000);
    assert_eq!(writer.get_num_buckets(), 1000);
    do_deletes(500, &mut writer, &mut shadow);
    do_random_ops(10000, 500, 0.75, &mut writer, &mut shadow, &mut rng);
    assert!(writer.get_num_buckets_in_use() <= 1000);
}

#[test]
fn test_shrink_grow_seq() {
    let mut writer =
        HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 20000, "test_grow_seq")
            .attach_writer();
    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    do_random_ops(500, 1000, 0.1, &mut writer, &mut shadow, &mut rng);
    eprintln!("Shrinking to 750");
    do_shrink(&mut writer, &mut shadow, 1000, 750);
    do_random_ops(200, 1000, 0.5, &mut writer, &mut shadow, &mut rng);
    eprintln!("Growing to 1500");
    writer.grow(1500).unwrap();
    do_random_ops(600, 1500, 0.1, &mut writer, &mut shadow, &mut rng);
    eprintln!("Shrinking to 200");
    while shadow.len() > 100 {
        do_deletes(1, &mut writer, &mut shadow);
    }
    do_shrink(&mut writer, &mut shadow, 1500, 200);
    do_random_ops(50, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
    eprintln!("Growing to 10k");
    writer.grow(10000).unwrap();
    do_random_ops(10000, 5000, 0.25, &mut writer, &mut shadow, &mut rng);
}

#[test]
fn test_bucket_ops() {
    let writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, "test_bucket_ops")
        .attach_writer();

    match writer.entry(1.into()) {
        Entry::Occupied(mut e) => {
            e.insert(2);
        }
        Entry::Vacant(e) => {
            _ = e.insert(2).unwrap();
        }
    }
    assert_eq!(writer.get_num_buckets_in_use(), 1);
    assert_eq!(writer.get_num_buckets(), 1000);
    assert_eq!(*writer.get(&1.into()).unwrap(), 2);

    let pos = match writer.entry(1.into()) {
        Entry::Occupied(e) => {
            assert_eq!(e._key, 1.into());
            e.bucket_pos as usize
        }
        Entry::Vacant(_) => {
            panic!("Insert didn't affect entry");
        }
    };
    assert_eq!(writer.entry_at_bucket(pos).unwrap()._key, 1.into());
    assert_eq!(*writer.get_at_bucket(pos).unwrap(), (1.into(), 2));
    {
        let ptr: *const usize = &*writer.get(&1.into()).unwrap();
        assert_eq!(writer.get_bucket_for_value(ptr), pos);
    }

    writer.remove(&1.into());
    assert!(writer.get(&1.into()).is_none());
}

#[test]
fn test_shrink_zero() {
    let mut writer =
        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink_zero")
            .attach_writer();

    writer.begin_shrink(0);
    for i in 0..1500 {
        writer.entry_at_bucket(i).map(|x| x.remove());
    }
    writer.finish_shrink().unwrap();
    assert_eq!(writer.get_num_buckets_in_use(), 0);

    let entry = writer.entry(1.into());
    if let Entry::Vacant(v) = entry {
        assert!(v.insert(2).is_err());
    } else {
        panic!("Somehow got non-vacant entry in empty map.")
    }

    writer.grow(50).unwrap();
    let entry = writer.entry(1.into());
    if let Entry::Vacant(v) = entry {
        assert!(v.insert(2).is_ok());
    } else {
        panic!("Somehow got non-vacant entry in empty map.")
    }
    assert_eq!(writer.get_num_buckets_in_use(), 1);
}

#[test]
#[should_panic]
fn test_grow_oom() {
    let writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_grow_oom")
        .attach_writer();
    writer.grow(20000).unwrap();
}

#[test]
#[should_panic]
fn test_shrink_bigger() {
    let mut writer =
        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_bigger")
            .attach_writer();
    writer.begin_shrink(2000);
}

#[test]
#[should_panic]
fn test_shrink_early_finish() {
    let writer =
        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_early_finish")
            .attach_writer();
    writer.finish_shrink().unwrap();
}

#[test]
#[should_panic]
fn test_shrink_fixed_size() {
    let mut area = [MaybeUninit::uninit(); 10000];
    let init_struct = HashMapInit::<TestKey, usize>::with_fixed(3, &mut area);
    let mut writer = init_struct.attach_writer();
    writer.begin_shrink(1);
}

================================================
FILE: libs/neon-shmem/src/hash.rs
================================================
//! Resizable hash table implementation on top of byte-level storage (either a [`ShmemHandle`] or a fixed byte array).
//!
//! This hash table has two major components: the bucket array and the dictionary. Each bucket within the
//! bucket array contains a `Option<(K, V)>` and an index of another bucket. In this way there is both an
//! implicit freelist within the bucket array (`None` buckets point to other `None` entries) and various hash
//! chains within the bucket array (a Some bucket will point to other Some buckets that had the same hash).
//!
//! Buckets are never moved unless they are within a region that is being shrunk, and so the actual hash-
//! dependent component is done with the dictionary. When a new key is inserted into the map, a position
//! within the dictionary is decided based on its hash, the data is inserted into an empty bucket based
//! off of the freelist, and then the index of said bucket is placed in the dictionary.
//!
//! This map is resizable (if initialized on top of a [`ShmemHandle`]). Both growing and shrinking happen
//! in-place and are at a high level achieved by expanding/reducing the bucket array and rebuilding the
//! dictionary by rehashing all keys.
//!
//! Concurrency is managed very simply: the entire map is guarded by one shared-memory RwLock.

use std::hash::{BuildHasher, Hash};
use std::mem::MaybeUninit;

use crate::shmem::ShmemHandle;
use crate::{shmem, sync::*};

mod core;
pub mod entry;

#[cfg(test)]
mod tests;

use core::{Bucket, CoreHashMap, INVALID_POS};
use entry::{Entry, OccupiedEntry, PrevPos, VacantEntry};

use thiserror::Error;

/// Error type for a hashmap shrink operation.
#[derive(Error, Debug)]
pub enum HashMapShrinkError {
    /// There was an error encountered while resizing the memory area.
    #[error("shmem resize failed: {0}")]
    ResizeError(shmem::Error),
    /// Occupied entries in to-be-shrunk space were encountered beginning at the given index.
    #[error("occupied entry in deallocated space found at {0}")]
    RemainingEntries(usize),
}

/// This represents a hash table that (possibly) lives in shared memory.
/// If a new process is launched with fork(), the child process inherits
/// this struct.
#[must_use]
pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
    shmem_handle: Option<ShmemHandle>,
    shared_ptr: *mut HashMapShared<'a, K, V>,
    shared_size: usize,
    hasher: S,
    num_buckets: u32,
}

/// This is a per-process handle to a hash table that (possibly) lives in shared memory.
/// If a child process is launched with fork(), the child process should
/// get its own HashMapAccess by calling HashMapInit::attach_writer/reader().
///
/// XXX: We're not making use of it at the moment, but this struct could
/// hold process-local information in the future.
pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
    shmem_handle: Option<ShmemHandle>,
    shared_ptr: *mut HashMapShared<'a, K, V>,
    hasher: S,
}

// NOTE(review): the bounds on these impls were lost in extraction and are reconstructed;
// confirm against the original file.
unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {}
unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {}

impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
    /// Change the 'hasher' used by the hash table.
    ///
    /// NOTE: This must be called right after creating the hash table,
    /// before inserting any entries and before calling attach_writer/reader.
    /// Otherwise different accessors could be using different hash function,
    /// with confusing results.
    pub fn with_hasher<T: BuildHasher>(self, hasher: T) -> HashMapInit<'a, K, V, T> {
        let HashMapInit {
            shmem_handle,
            shared_ptr,
            shared_size,
            num_buckets,
            hasher: _,
        } = self;
        HashMapInit {
            hasher,
            shmem_handle,
            shared_ptr,
            shared_size,
            num_buckets,
        }
    }

    /// Loosely (over)estimate the size needed to store a hash table with `num_buckets` buckets.
    pub fn estimate_size(num_buckets: u32) -> usize {
        // add some margin to cover alignment etc.
        const MARGIN: usize = 1000;
        CoreHashMap::<K, V>::estimate_size(num_buckets)
            + size_of::<HashMapShared<K, V>>()
            + MARGIN
    }

    /// Lay out a new, empty hash table in `area_size` bytes starting at `area_ptr`.
    ///
    /// The area is carved up in order: lock, `HashMapShared`, `num_buckets` buckets, and
    /// whatever remains becomes the dictionary.
    ///
    /// # Panics
    /// Panics if no room is left for at least one dictionary slot.
    fn new(
        num_buckets: u32,
        shmem_handle: Option<ShmemHandle>,
        area_ptr: *mut u8,
        area_size: usize,
        hasher: S,
    ) -> Self {
        let mut ptr: *mut u8 = area_ptr;
        let end_ptr: *mut u8 = unsafe { ptr.add(area_size) };

        // carve out area for the One Big Lock (TM) and the HashMapShared.
        ptr = unsafe { ptr.add(ptr.align_offset(align_of::<libc::pthread_rwlock_t>())) };
        let raw_lock_ptr = ptr;
        ptr = unsafe { ptr.add(size_of::<libc::pthread_rwlock_t>()) };
        ptr = unsafe { ptr.add(ptr.align_offset(align_of::<HashMapShared<K, V>>())) };
        let shared_ptr: *mut HashMapShared<K, V> = ptr.cast();
        ptr = unsafe { ptr.add(size_of::<HashMapShared<K, V>>()) };

        // carve out the buckets
        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<core::Bucket<K, V>>())) };
        let buckets_ptr = ptr;
        ptr = unsafe { ptr.add(size_of::<core::Bucket<K, V>>() * num_buckets as usize) };

        // use remaining space for the dictionary
        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<u32>())) };
        assert!(ptr.addr() < end_ptr.addr());
        let dictionary_ptr = ptr;
        let dictionary_size = unsafe { end_ptr.byte_offset_from(ptr) / size_of::<u32>() as isize };
        assert!(dictionary_size > 0);

        let buckets =
            unsafe { std::slice::from_raw_parts_mut(buckets_ptr.cast(), num_buckets as usize) };
        let dictionary = unsafe {
            std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size as usize)
        };

        let hashmap = CoreHashMap::new(buckets, dictionary);
        unsafe {
            let lock = RwLock::from_raw(PthreadRwLock::new(raw_lock_ptr.cast()), hashmap);
            std::ptr::write(shared_ptr, lock);
        }

        Self {
            num_buckets,
            shmem_handle,
            shared_ptr,
            shared_size: area_size,
            hasher,
        }
    }

    /// Attach to a hash table for writing.
    pub fn attach_writer(self) -> HashMapAccess<'a, K, V, S> {
        let HashMapInit {
            shmem_handle,
            shared_ptr,
            hasher,
            ..
        } = self;
        HashMapAccess {
            shmem_handle,
            shared_ptr,
            hasher,
        }
    }

    /// Initialize a table for reading. Currently identical to [`HashMapInit::attach_writer`].
    ///
    /// This is a holdover from a previous implementation and is being kept around for
    /// backwards compatibility reasons.
    pub fn attach_reader(self) -> HashMapAccess<'a, K, V, S> {
        self.attach_writer()
    }
}

/// Hash table data that is actually stored in the shared memory area.
///
/// NOTE: We carve out the parts from a contiguous chunk. Growing and shrinking the hash table
/// relies on the memory layout! The data structures are laid out in the contiguous shared memory
/// area as follows:
///
/// [`libc::pthread_rwlock_t`]
/// [`HashMapShared`]
/// buckets
/// dictionary
///
/// In between the above parts, there can be padding bytes to align the parts correctly.
type HashMapShared<'a, K, V> = RwLock<CoreHashMap<'a, K, V>>;

impl<'a, K, V> HashMapInit<'a, K, V, rustc_hash::FxBuildHasher>
where
    K: Clone + Hash + Eq,
{
    /// Place the hash table within a user-supplied fixed memory area.
    pub fn with_fixed(num_buckets: u32, area: &'a mut [MaybeUninit<u8>]) -> Self {
        let area_size = area.len();
        let area_ptr: *mut u8 = area.as_mut_ptr().cast();
        Self::new(
            num_buckets,
            None,
            area_ptr,
            area_size,
            rustc_hash::FxBuildHasher,
        )
    }

    /// Place a new hash map in the given shared memory area
    ///
    /// # Panics
    /// Will panic on failure to resize area to expected map size.
    pub fn with_shmem(num_buckets: u32, shmem: ShmemHandle) -> Self {
        let size = Self::estimate_size(num_buckets);
        shmem
            .set_size(size)
            .expect("could not resize shared memory area");
        let ptr = shmem.data_ptr.as_ptr().cast();
        Self::new(
            num_buckets,
            Some(shmem),
            ptr,
            size,
            rustc_hash::FxBuildHasher,
        )
    }

    /// Make a resizable hash map within a new shared memory area with the given name.
    ///
    /// # Panics
    /// Panics if the shared memory area cannot be created.
    pub fn new_resizeable_named(num_buckets: u32, max_buckets: u32, name: &str) -> Self {
        let size = Self::estimate_size(num_buckets);
        let max_size = Self::estimate_size(max_buckets);
        let shmem =
            ShmemHandle::new(name, size, max_size).expect("failed to make shared memory area");
        let ptr = shmem.data_ptr.as_ptr().cast();
        Self::new(
            num_buckets,
            Some(shmem),
            ptr,
            size,
            rustc_hash::FxBuildHasher,
        )
    }

    /// Make a resizable hash map within a new anonymous shared memory area.
pub fn new_resizeable(num_buckets: u32, max_buckets: u32) -> Self {
        use std::sync::atomic::{AtomicUsize, Ordering};

        // Process-wide counter so that every anonymous map gets a distinct area name.
        static COUNTER: AtomicUsize = AtomicUsize::new(0);

        let val = COUNTER.fetch_add(1, Ordering::Relaxed);
        let name = format!("neon_shmem_hmap{val}");
        Self::new_resizeable_named(num_buckets, max_buckets, &name)
    }
}

impl<'a, K, V, S: BuildHasher> HashMapAccess<'a, K, V, S>
where
    K: Clone + Hash + Eq,
{
    /// Hash a key using the map's hasher.
    #[inline]
    fn get_hash_value(&self, key: &K) -> u64 {
        self.hasher.hash_one(key)
    }

    /// Take the write lock and look `key` up in the chain selected by `hash`.
    ///
    /// Returns an occupied entry (remembering the chain predecessor) if the key is found,
    /// or a vacant entry for the key's dictionary slot otherwise. The returned entry owns
    /// the write guard.
    fn entry_with_hash(&self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
        let dict_pos = hash as usize % map.dictionary.len();

        // Walk the chain; an empty chain (`INVALID_POS` in the dictionary) skips the loop.
        let mut prev_pos = PrevPos::First(dict_pos as u32);
        let mut cur = map.dictionary[dict_pos];
        while cur != INVALID_POS {
            let bucket = &map.buckets[cur as usize];
            let (bucket_key, _) = bucket.inner.as_ref().expect("entry is in use");
            if *bucket_key == key {
                // found existing entry
                return Entry::Occupied(OccupiedEntry {
                    map,
                    _key: key,
                    prev_pos,
                    bucket_pos: cur,
                });
            }
            prev_pos = PrevPos::Chained(cur);
            cur = bucket.next;
        }

        // No existing entry
        Entry::Vacant(VacantEntry {
            map,
            key,
            dict_pos: dict_pos as u32,
        })
    }

    /// Get a reference to the corresponding value for a key.
    ///
    /// Takes the read lock; the returned guard keeps it held.
    // NOTE(review): the return type was lost in extraction; `ValueReadGuard` is assumed to
    // be the mapped read guard exported by `crate::sync`.
    pub fn get<'e>(&'e self, key: &K) -> Option<ValueReadGuard<'e, V>> {
        let hash = self.get_hash_value(key);
        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
        RwLockReadGuard::try_map(map, |m| m.get_with_hash(key, hash)).ok()
    }

    /// Get a reference to the entry containing a key.
    ///
    /// NB: This takes a write lock as there's no way to distinguish whether the intention
    /// is to use the entry for reading or for writing in advance.
pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> { let hash = self.get_hash_value(&key); self.entry_with_hash(key, hash) } /// Remove a key given its hash. Returns the associated value if it existed. pub fn remove(&self, key: &K) -> Option { let hash = self.get_hash_value(key); match self.entry_with_hash(key.clone(), hash) { Entry::Occupied(e) => Some(e.remove()), Entry::Vacant(_) => None, } } /// Insert/update a key. Returns the previous associated value if it existed. /// /// # Errors /// Will return [`core::FullError`] if there is no more space left in the map. pub fn insert(&self, key: K, value: V) -> Result, core::FullError> { let hash = self.get_hash_value(&key); match self.entry_with_hash(key.clone(), hash) { Entry::Occupied(mut e) => Ok(Some(e.insert(value))), Entry::Vacant(e) => { _ = e.insert(value)?; Ok(None) } } } /// Optionally return the entry for a bucket at a given index if it exists. /// /// Has more overhead than one would intuitively expect: performs both a clone of the key /// due to the [`OccupiedEntry`] type owning the key and also a hash of the key in order /// to enable repairing the hash chain if the entry is removed. pub fn entry_at_bucket(&self, pos: usize) -> Option> { let map = unsafe { self.shared_ptr.as_mut() }.unwrap().write(); if pos >= map.buckets.len() { return None; } let entry = map.buckets[pos].inner.as_ref(); match entry { Some((key, _)) => Some(OccupiedEntry { _key: key.clone(), bucket_pos: pos as u32, prev_pos: entry::PrevPos::Unknown(self.get_hash_value(key)), map, }), _ => None, } } /// Returns the number of buckets in the table. pub fn get_num_buckets(&self) -> usize { let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read(); map.get_num_buckets() } /// Return the key and value stored in bucket with given index. This can be used to /// iterate through the hash map. // TODO: An Iterator might be nicer. 
The communicator's clock algorithm needs to // _slowly_ iterate through all buckets with its clock hand, without holding a lock. // If we switch to an Iterator, it must not hold the lock. pub fn get_at_bucket(&self, pos: usize) -> Option> { let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read(); if pos >= map.buckets.len() { return None; } RwLockReadGuard::try_map(map, |m| m.buckets[pos].inner.as_ref()).ok() } /// Returns the index of the bucket a given value corresponds to. pub fn get_bucket_for_value(&self, val_ptr: *const V) -> usize { let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read(); let origin = map.buckets.as_ptr(); let idx = (val_ptr as usize - origin as usize) / size_of::>(); assert!(idx < map.buckets.len()); idx } /// Returns the number of occupied buckets in the table. pub fn get_num_buckets_in_use(&self) -> usize { let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read(); map.buckets_in_use as usize } /// Clears all entries in a table. Does not reset any shrinking operations. pub fn clear(&self) { let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write(); map.clear(); } /// Perform an in-place rehash of some region (0..`rehash_buckets`) of the table and reset /// the `buckets` and `dictionary` slices to be as long as `num_buckets`. Resets the freelist /// in the process. 
fn rehash_dict( &self, inner: &mut CoreHashMap<'a, K, V>, buckets_ptr: *mut core::Bucket, end_ptr: *mut u8, num_buckets: u32, rehash_buckets: u32, ) { inner.free_head = INVALID_POS; let buckets; let dictionary; unsafe { let buckets_end_ptr = buckets_ptr.add(num_buckets as usize); let dictionary_ptr: *mut u32 = buckets_end_ptr .byte_add(buckets_end_ptr.align_offset(align_of::())) .cast(); let dictionary_size: usize = end_ptr.byte_offset_from(buckets_end_ptr) as usize / size_of::(); buckets = std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize); dictionary = std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size); } for e in dictionary.iter_mut() { *e = INVALID_POS; } for (i, bucket) in buckets.iter_mut().enumerate().take(rehash_buckets as usize) { if bucket.inner.is_none() { bucket.next = inner.free_head; inner.free_head = i as u32; continue; } let hash = self.hasher.hash_one(&bucket.inner.as_ref().unwrap().0); let pos: usize = (hash % dictionary.len() as u64) as usize; bucket.next = dictionary[pos]; dictionary[pos] = i as u32; } inner.dictionary = dictionary; inner.buckets = buckets; } /// Rehash the map without growing or shrinking. pub fn shuffle(&self) { let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write(); let num_buckets = map.get_num_buckets() as u32; let size_bytes = HashMapInit::::estimate_size(num_buckets); let end_ptr: *mut u8 = unsafe { self.shared_ptr.byte_add(size_bytes).cast() }; let buckets_ptr = map.buckets.as_mut_ptr(); self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, num_buckets); } /// Grow the number of buckets within the table. /// /// 1. Grows the underlying shared memory area /// 2. Initializes new buckets and overwrites the current dictionary /// 3. Rehashes the dictionary /// /// # Panics /// Panics if called on a map initialized with [`HashMapInit::with_fixed`]. /// /// # Errors /// Returns an [`shmem::Error`] if any errors occur resizing the memory region. 
pub fn grow(&self, num_buckets: u32) -> Result<(), shmem::Error> { let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write(); let old_num_buckets = map.buckets.len() as u32; assert!( num_buckets >= old_num_buckets, "grow called with a smaller number of buckets" ); if num_buckets == old_num_buckets { return Ok(()); } let shmem_handle = self .shmem_handle .as_ref() .expect("grow called on a fixed-size hash table"); let size_bytes = HashMapInit::::estimate_size(num_buckets); shmem_handle.set_size(size_bytes)?; let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) }; // Initialize new buckets. The new buckets are linked to the free list. // NB: This overwrites the dictionary! let buckets_ptr = map.buckets.as_mut_ptr(); unsafe { for i in old_num_buckets..num_buckets { let bucket = buckets_ptr.add(i as usize); bucket.write(core::Bucket { next: if i < num_buckets - 1 { i + 1 } else { map.free_head }, inner: None, }); } } self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, old_num_buckets); map.free_head = old_num_buckets; Ok(()) } /// Begin a shrink, limiting all new allocations to be in buckets with index below `num_buckets`. /// /// # Panics /// Panics if called on a map initialized with [`HashMapInit::with_fixed`] or if `num_buckets` is /// greater than the number of buckets in the map. pub fn begin_shrink(&mut self, num_buckets: u32) { let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write(); assert!( num_buckets <= map.get_num_buckets() as u32, "shrink called with a larger number of buckets" ); _ = self .shmem_handle .as_ref() .expect("shrink called on a fixed-size hash table"); map.alloc_limit = num_buckets; } /// If a shrink operation is underway, returns the target size of the map. Otherwise, returns None. 
pub fn shrink_goal(&self) -> Option { let map = unsafe { self.shared_ptr.as_mut() }.unwrap().read(); let goal = map.alloc_limit; if goal == INVALID_POS { None } else { Some(goal as usize) } } /// Complete a shrink after caller has evicted entries, removing the unused buckets and rehashing. /// /// # Panics /// The following cases result in a panic: /// - Calling this function on a map initialized with [`HashMapInit::with_fixed`]. /// - Calling this function on a map when no shrink operation is in progress. pub fn finish_shrink(&self) -> Result<(), HashMapShrinkError> { let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write(); assert!( map.alloc_limit != INVALID_POS, "called finish_shrink when no shrink is in progress" ); let num_buckets = map.alloc_limit; if map.get_num_buckets() == num_buckets as usize { return Ok(()); } assert!( map.buckets_in_use <= num_buckets, "called finish_shrink before enough entries were removed" ); for i in (num_buckets as usize)..map.buckets.len() { if map.buckets[i].inner.is_some() { return Err(HashMapShrinkError::RemainingEntries(i)); } } let shmem_handle = self .shmem_handle .as_ref() .expect("shrink called on a fixed-size hash table"); let size_bytes = HashMapInit::::estimate_size(num_buckets); if let Err(e) = shmem_handle.set_size(size_bytes) { return Err(HashMapShrinkError::ResizeError(e)); } let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) }; let buckets_ptr = map.buckets.as_mut_ptr(); self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, num_buckets); map.alloc_limit = INVALID_POS; Ok(()) } } ================================================ FILE: libs/neon-shmem/src/lib.rs ================================================ pub mod hash; pub mod shmem; pub mod sync; ================================================ FILE: libs/neon-shmem/src/shmem.rs ================================================ //! 
Dynamically resizable contiguous chunk of shared memory use std::num::NonZeroUsize; use std::os::fd::{AsFd, BorrowedFd, OwnedFd}; use std::ptr::NonNull; use std::sync::atomic::{AtomicUsize, Ordering}; use nix::errno::Errno; use nix::sys::mman::MapFlags; use nix::sys::mman::ProtFlags; use nix::sys::mman::mmap as nix_mmap; use nix::sys::mman::munmap as nix_munmap; use nix::unistd::ftruncate as nix_ftruncate; /// `ShmemHandle` represents a shared memory area that can be shared by processes over `fork()`. /// Unlike shared memory allocated by Postgres, this area is resizable, up to `max_size` that's /// specified at creation. /// /// The area is backed by an anonymous file created with `memfd_create()`. The full address space for /// `max_size` is reserved up-front with `mmap()`, but whenever you call [`ShmemHandle::set_size`], /// the underlying file is resized. Do not access the area beyond the current size. Currently, that /// will cause the file to be expanded, but we might use `mprotect()` etc. to enforce that in the /// future. pub struct ShmemHandle { /// memfd file descriptor fd: OwnedFd, max_size: usize, // Pointer to the beginning of the shared memory area. The header is stored there. shared_ptr: NonNull, // Pointer to the beginning of the user data pub data_ptr: NonNull, } /// This is stored at the beginning in the shared memory area. struct SharedStruct { max_size: usize, /// Current size of the backing file. The high-order bit is used for the [`RESIZE_IN_PROGRESS`] flag. current_size: AtomicUsize, } const RESIZE_IN_PROGRESS: usize = 1 << 63; const HEADER_SIZE: usize = std::mem::size_of::(); /// Error type returned by the [`ShmemHandle`] functions. #[derive(thiserror::Error, Debug)] #[error("{msg}: {errno}")] pub struct Error { pub msg: String, pub errno: Errno, } impl Error { fn new(msg: &str, errno: Errno) -> Self { Self { msg: msg.to_string(), errno, } } } impl ShmemHandle { /// Create a new shared memory area. 
To communicate between processes, the processes need to be /// `fork()`'d after calling this, so that the `ShmemHandle` is inherited by all processes. /// /// If the `ShmemHandle` is dropped, the memory is unmapped from the current process. Other /// processes can continue using it, however. pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result { // create the backing anonymous file. let fd = create_backing_file(name)?; Self::new_with_fd(fd, initial_size, max_size) } fn new_with_fd(fd: OwnedFd, initial_size: usize, max_size: usize) -> Result { // We reserve the high-order bit for the `RESIZE_IN_PROGRESS` flag, and the actual size // is a little larger than this because of the SharedStruct header. Make the upper limit // somewhat smaller than that, because with anything close to that, you'll run out of // memory anyway. assert!(max_size < 1 << 48, "max size {max_size} too large"); assert!( initial_size <= max_size, "initial size {initial_size} larger than max size {max_size}" ); // The actual initial / max size is the one given by the caller, plus the size of // 'SharedStruct'. 
let initial_size = HEADER_SIZE + initial_size; let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap(); // Reserve address space for it with mmap // // TODO: Use MAP_HUGETLB if possible let start_ptr = unsafe { nix_mmap( None, max_size, ProtFlags::PROT_READ | ProtFlags::PROT_WRITE, MapFlags::MAP_SHARED, &fd, 0, ) } .map_err(|e| Error::new("mmap failed", e))?; // Reserve space for the initial size enlarge_file(fd.as_fd(), initial_size as u64)?; // Initialize the header let shared: NonNull = start_ptr.cast(); unsafe { shared.write(SharedStruct { max_size: max_size.into(), current_size: AtomicUsize::new(initial_size), }); } // The user data begins after the header let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) }; Ok(Self { fd, max_size: max_size.into(), shared_ptr: shared, data_ptr, }) } // return reference to the header fn shared(&self) -> &SharedStruct { unsafe { self.shared_ptr.as_ref() } } /// Resize the shared memory area. `new_size` must not be larger than the `max_size` specified /// when creating the area. /// /// This may only be called from one process/thread concurrently. We detect that case /// and return an [`shmem::Error`](Error). pub fn set_size(&self, new_size: usize) -> Result<(), Error> { let new_size = new_size + HEADER_SIZE; let shared = self.shared(); assert!( new_size <= self.max_size, "new size ({new_size}) is greater than max size ({})", self.max_size ); assert_eq!(self.max_size, shared.max_size); // Lock the area by setting the bit in `current_size` // // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory // and the `posix_fallocate`/`ftruncate` call is surely a synchronization point anyway. But // since this is not performance-critical, better safe than sorry. 
let mut old_size = shared.current_size.load(Ordering::Acquire); loop { if (old_size & RESIZE_IN_PROGRESS) != 0 { return Err(Error::new( "concurrent resize detected", Errno::UnknownErrno, )); } match shared.current_size.compare_exchange( old_size, new_size, Ordering::Acquire, Ordering::Relaxed, ) { Ok(_) => break, Err(x) => old_size = x, } } // Ok, we got the lock. // // NB: If anything goes wrong, we *must* clear the bit! let result = { use std::cmp::Ordering::{Equal, Greater, Less}; match new_size.cmp(&old_size) { Less => nix_ftruncate(&self.fd, new_size as i64) .map_err(|e| Error::new("could not shrink shmem segment, ftruncate failed", e)), Equal => Ok(()), Greater => enlarge_file(self.fd.as_fd(), new_size as u64), } }; // Unlock shared.current_size.store( if result.is_ok() { new_size } else { old_size }, Ordering::Release, ); result } /// Returns the current user-visible size of the shared memory segment. /// /// NOTE: a concurrent [`ShmemHandle::set_size()`] call can change the size at any time. /// It is the caller's responsibility not to access the area beyond the current size. pub fn current_size(&self) -> usize { let total_current_size = self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS; total_current_size - HEADER_SIZE } } impl Drop for ShmemHandle { fn drop(&mut self) { // SAFETY: The pointer was obtained from mmap() with the given size. // We unmap the entire region. let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) }; // The fd is dropped automatically by OwnedFd. } } /// Create a "backing file" for the shared memory area. On Linux, use `memfd_create()`, to create an /// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for /// development and testing, but in production we want the file to stay in memory. /// /// Disable unused variables warnings because `name` is unused in the macos path. 
#[allow(unused_variables)]
fn create_backing_file(name: &str) -> Result {
    // Everywhere except macos: anonymous in-memory file, created with no extra memfd flags.
    #[cfg(not(target_os = "macos"))]
    {
        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
            .map_err(|e| Error::new("memfd_create failed", e))
    }
    // macos: use an unnamed temporary file instead.
    #[cfg(target_os = "macos")]
    {
        let file = tempfile::tempfile().map_err(|e| {
            Error::new(
                "could not create temporary file to back shmem area",
                // Translate the I/O error into an errno; 0 is used when the error carries no
                // raw OS error code.
                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
            )
        })?;
        Ok(OwnedFd::from(file))
    }
}

/// Grow the backing file behind `fd` so that it covers `size` bytes.
///
/// Returns an [`Error`] wrapping the errno if the underlying system call fails.
fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
    // we don't get a segfault later when trying to actually use it.
    #[cfg(not(target_os = "macos"))]
    {
        nix::fcntl::posix_fallocate(fd, 0, size as i64)
            .map_err(|e| Error::new("could not grow shmem segment, posix_fallocate failed", e))
    }
    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'ftruncate'.
    // NOTE(review): presumably this only changes the file length and does not reserve the
    // space up front the way posix_fallocate does; confirm that is acceptable for macos.
    #[cfg(target_os = "macos")]
    {
        nix::unistd::ftruncate(fd, size as i64)
            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed", e))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    use nix::unistd::ForkResult;
    use std::ops::Range;

    /// check that all bytes in given range have the expected value.
fn assert_range(ptr: *const u8, expected: u8, range: Range) { for i in range { let b = unsafe { *(ptr.add(i)) }; assert_eq!(expected, b, "unexpected byte at offset {i}"); } } /// Write 'b' to all bytes in the given range fn write_range(ptr: *mut u8, b: u8, range: Range) { unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) }; } // simple single-process test of growing and shrinking #[test] fn test_shmem_resize() -> Result<(), Error> { let max_size = 1024 * 1024; let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?; assert_eq!(init_struct.current_size(), 0); // Initial grow let size1 = 10000; init_struct.set_size(size1).unwrap(); assert_eq!(init_struct.current_size(), size1); // Write some data let data_ptr = init_struct.data_ptr.as_ptr(); write_range(data_ptr, 0xAA, 0..size1); assert_range(data_ptr, 0xAA, 0..size1); // Shrink let size2 = 5000; init_struct.set_size(size2).unwrap(); assert_eq!(init_struct.current_size(), size2); // Grow again let size3 = 20000; init_struct.set_size(size3).unwrap(); assert_eq!(init_struct.current_size(), size3); // Try to read it. The area that was shrunk and grown again should read as all zeros now assert_range(data_ptr, 0xAA, 0..5000); assert_range(data_ptr, 0, 5000..size1); // Try to grow beyond max_size //let size4 = max_size + 1; //assert!(init_struct.set_size(size4).is_err()); // Dropping init_struct should unmap the memory drop(init_struct); Ok(()) } /// This is used in tests to coordinate between test processes. It's like `std::sync::Barrier`, /// but is stored in the shared memory area and works across processes. It's implemented by /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes. 
struct SimpleBarrier { num_procs: usize, count: AtomicUsize, } impl SimpleBarrier { unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) { unsafe { *ptr = SimpleBarrier { num_procs, count: AtomicUsize::new(0), } } } pub fn wait(&self) { let old = self.count.fetch_add(1, Ordering::Relaxed); let generation = old / self.num_procs; let mut current = old + 1; while current < (generation + 1) * self.num_procs { std::thread::sleep(std::time::Duration::from_millis(10)); current = self.count.load(Ordering::Relaxed); } } } #[test] fn test_multi_process() { // Initialize let max_size = 1_000_000_000_000; let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap(); let ptr = init_struct.data_ptr.as_ptr(); // Store the SimpleBarrier in the first 1k of the area. init_struct.set_size(10000).unwrap(); let barrier_ptr: *mut SimpleBarrier = unsafe { ptr.add(ptr.align_offset(std::mem::align_of::())) .cast() }; unsafe { SimpleBarrier::init(barrier_ptr, 2) }; let barrier = unsafe { barrier_ptr.as_ref().unwrap() }; // Fork another test process. The code after this runs in both processes concurrently. let fork_result = unsafe { nix::unistd::fork().unwrap() }; // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000 if fork_result.is_parent() { write_range(ptr, 0xAA, 1000..2000); } else { write_range(ptr, 0xBB, 2000..3000); } barrier.wait(); // Verify the contents. (in both processes) assert_range(ptr, 0xAA, 1000..2000); assert_range(ptr, 0xBB, 2000..3000); // Grow, from the child this time let size = 10_000_000; if !fork_result.is_parent() { init_struct.set_size(size).unwrap(); } barrier.wait(); // make some writes at the end if fork_result.is_parent() { write_range(ptr, 0xAA, (size - 10)..size); } else { write_range(ptr, 0xBB, (size - 20)..(size - 10)); } barrier.wait(); // Verify the contents. 
(This runs in both processes) assert_range(ptr, 0, (size - 1000)..(size - 20)); assert_range(ptr, 0xBB, (size - 20)..(size - 10)); assert_range(ptr, 0xAA, (size - 10)..size); if let ForkResult::Parent { child } = fork_result { nix::sys::wait::waitpid(child, None).unwrap(); } } } ================================================ FILE: libs/neon-shmem/src/sync.rs ================================================ //! Simple utilities akin to what's in [`std::sync`] but designed to work with shared memory. use std::mem::MaybeUninit; use std::ptr::NonNull; use nix::errno::Errno; pub type RwLock = lock_api::RwLock; pub type RwLockReadGuard<'a, T> = lock_api::RwLockReadGuard<'a, PthreadRwLock, T>; pub type RwLockWriteGuard<'a, T> = lock_api::RwLockWriteGuard<'a, PthreadRwLock, T>; pub type ValueReadGuard<'a, T> = lock_api::MappedRwLockReadGuard<'a, PthreadRwLock, T>; pub type ValueWriteGuard<'a, T> = lock_api::MappedRwLockWriteGuard<'a, PthreadRwLock, T>; /// Shared memory read-write lock. pub struct PthreadRwLock(Option>); /// Simple macro that calls a function in the libc namespace and panics if return value is nonzero. macro_rules! libc_checked { ($fn_name:ident ( $($arg:expr),* )) => {{ let res = libc::$fn_name($($arg),*); if res != 0 { panic!("{} failed with {}", stringify!($fn_name), Errno::from_raw(res)); } }}; } impl PthreadRwLock { /// Creates a new `PthreadRwLock` on top of a pointer to a pthread rwlock. /// /// # Safety /// `lock` must be non-null. Every unsafe operation will panic in the event of an error. 
pub unsafe fn new(lock: *mut libc::pthread_rwlock_t) -> Self { unsafe { let mut attrs = MaybeUninit::uninit(); libc_checked!(pthread_rwlockattr_init(attrs.as_mut_ptr())); libc_checked!(pthread_rwlockattr_setpshared( attrs.as_mut_ptr(), libc::PTHREAD_PROCESS_SHARED )); libc_checked!(pthread_rwlock_init(lock, attrs.as_mut_ptr())); // Safety: POSIX specifies that "any function affecting the attributes // object (including destruction) shall not affect any previously // initialized read-write locks". libc_checked!(pthread_rwlockattr_destroy(attrs.as_mut_ptr())); Self(Some(NonNull::new_unchecked(lock))) } } fn inner(&self) -> NonNull { match self.0 { None => { panic!("PthreadRwLock constructed badly - something likely used RawRwLock::INIT") } Some(x) => x, } } } unsafe impl lock_api::RawRwLock for PthreadRwLock { type GuardMarker = lock_api::GuardSend; const INIT: Self = Self(None); fn try_lock_shared(&self) -> bool { unsafe { let res = libc::pthread_rwlock_tryrdlock(self.inner().as_ptr()); match res { 0 => true, libc::EAGAIN => false, _ => panic!( "pthread_rwlock_tryrdlock failed with {}", Errno::from_raw(res) ), } } } fn try_lock_exclusive(&self) -> bool { unsafe { let res = libc::pthread_rwlock_trywrlock(self.inner().as_ptr()); match res { 0 => true, libc::EAGAIN => false, _ => panic!("try_wrlock failed with {}", Errno::from_raw(res)), } } } fn lock_shared(&self) { unsafe { libc_checked!(pthread_rwlock_rdlock(self.inner().as_ptr())); } } fn lock_exclusive(&self) { unsafe { libc_checked!(pthread_rwlock_wrlock(self.inner().as_ptr())); } } unsafe fn unlock_exclusive(&self) { unsafe { libc_checked!(pthread_rwlock_unlock(self.inner().as_ptr())); } } unsafe fn unlock_shared(&self) { unsafe { libc_checked!(pthread_rwlock_unlock(self.inner().as_ptr())); } } } ================================================ FILE: libs/pageserver_api/Cargo.toml ================================================ [package] name = "pageserver_api" version = "0.1.0" edition = "2024" 
license.workspace = true [features] default = ["io-align-512"] # See pageserver/Cargo.toml testing = ["dep:nix"] # Direct IO alignment options (mutually exclusive) io-align-512 = [] io-align-4k = [] [dependencies] serde.workspace = true serde_with.workspace = true serde_json.workspace = true const_format.workspace = true anyhow.workspace = true bytes.workspace = true byteorder.workspace = true utils.workspace = true postgres_ffi_types.workspace = true postgres_versioninfo.workspace = true posthog_client_lite.workspace = true enum-map.workspace = true strum.workspace = true strum_macros.workspace = true hex.workspace = true humantime.workspace = true thiserror.workspace = true humantime-serde.workspace = true chrono = { workspace = true, features = ["serde"] } itertools.workspace = true storage_broker.workspace = true camino = { workspace = true, features = ["serde1"] } remote_storage.workspace = true postgres_backend.workspace = true nix = { workspace = true, optional = true } reqwest.workspace = true rand.workspace = true tracing.workspace = true tracing-utils.workspace = true once_cell.workspace = true [dev-dependencies] bincode.workspace = true rand.workspace = true ================================================ FILE: libs/pageserver_api/src/config/tests.rs ================================================ use super::*; #[test] fn test_node_metadata_v1_backward_compatibilty() { let v1 = serde_json::to_vec(&serde_json::json!({ "host": "localhost", "port": 23, "http_host": "localhost", "http_port": 42, })); assert_eq!( serde_json::from_slice::(&v1.unwrap()).unwrap(), NodeMetadata { postgres_host: "localhost".to_string(), postgres_port: 23, grpc_host: None, grpc_port: None, http_host: "localhost".to_string(), http_port: 42, https_port: None, other: HashMap::new(), } ) } #[test] fn test_node_metadata_v2_backward_compatibilty() { let v2 = serde_json::to_vec(&serde_json::json!({ "host": "localhost", "port": 23, "http_host": "localhost", "http_port": 42, "https_port": 
123, })); assert_eq!( serde_json::from_slice::(&v2.unwrap()).unwrap(), NodeMetadata { postgres_host: "localhost".to_string(), postgres_port: 23, grpc_host: None, grpc_port: None, http_host: "localhost".to_string(), http_port: 42, https_port: Some(123), other: HashMap::new(), } ) } #[test] fn test_node_metadata_v3_backward_compatibilty() { let v3 = serde_json::to_vec(&serde_json::json!({ "host": "localhost", "port": 23, "grpc_host": "localhost", "grpc_port": 51, "http_host": "localhost", "http_port": 42, "https_port": 123, })); assert_eq!( serde_json::from_slice::(&v3.unwrap()).unwrap(), NodeMetadata { postgres_host: "localhost".to_string(), postgres_port: 23, grpc_host: Some("localhost".to_string()), grpc_port: Some(51), http_host: "localhost".to_string(), http_port: 42, https_port: Some(123), other: HashMap::new(), } ) } ================================================ FILE: libs/pageserver_api/src/config.rs ================================================ use camino::Utf8PathBuf; #[cfg(test)] mod tests; use const_format::formatcp; use posthog_client_lite::PostHogClientConfig; use utils::serde_percent::Percent; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); // TODO: gRPC is disabled by default for now, but the port is used in neon_local. pub const DEFAULT_GRPC_LISTEN_PORT: u16 = 51051; // storage-broker already uses 50051 use std::collections::HashMap; use std::fmt::Display; use std::num::{NonZeroU64, NonZeroUsize}; use std::str::FromStr; use std::time::Duration; use postgres_backend::AuthType; use remote_storage::RemoteStorageConfig; use serde_with::serde_as; use utils::logging::LogFormat; use crate::models::{ImageCompressionAlgorithm, LsnLease}; // Certain metadata (e.g. 
externally-addressable name, AZ) is delivered // as a separate structure. This information is not needed by the pageserver // itself, it is only used for registering the pageserver with the control // plane and/or storage controller. #[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)] pub struct NodeMetadata { #[serde(rename = "host")] pub postgres_host: String, #[serde(rename = "port")] pub postgres_port: u16, pub grpc_host: Option, pub grpc_port: Option, pub http_host: String, pub http_port: u16, pub https_port: Option, // Deployment tools may write fields to the metadata file beyond what we // use in this type: this type intentionally only names fields that require. #[serde(flatten)] pub other: HashMap, } impl Display for NodeMetadata { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, "postgresql://{}:{} ", self.postgres_host, self.postgres_port )?; if let Some(grpc_host) = &self.grpc_host { let grpc_port = self.grpc_port.unwrap_or_default(); write!(f, "grpc://{grpc_host}:{grpc_port} ")?; } write!(f, "http://{}:{} ", self.http_host, self.http_port)?; write!(f, "other:{:?}", self.other)?; Ok(()) } } /// PostHog integration config. This is used in pageserver, storcon, and neon_local. /// Ensure backward compatibility when adding new fields. 
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct PostHogConfig { /// PostHog project ID #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub project_id: Option, /// Server-side (private) API key #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub server_api_key: Option, /// Client-side (public) API key #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub client_api_key: Option, /// Private API URL #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub private_api_url: Option, /// Public API URL #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub public_api_url: Option, /// Refresh interval for the feature flag spec. /// The storcon will push the feature flag spec to the pageserver. If the pageserver does not receive /// the spec for `refresh_interval`, it will fetch the spec from the PostHog API. #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] pub refresh_interval: Option, } impl PostHogConfig { pub fn try_into_posthog_config(self) -> Result { let Some(project_id) = self.project_id else { return Err("project_id is required"); }; let Some(server_api_key) = self.server_api_key else { return Err("server_api_key is required"); }; let Some(client_api_key) = self.client_api_key else { return Err("client_api_key is required"); }; let Some(private_api_url) = self.private_api_url else { return Err("private_api_url is required"); }; let Some(public_api_url) = self.public_api_url else { return Err("public_api_url is required"); }; Ok(PostHogClientConfig { project_id, server_api_key, client_api_key, private_api_url, public_api_url, }) } } /// `pageserver.toml` /// /// We use serde derive with `#[serde(default)]` to generate a deserializer /// that fills in the default values for each config field. 
/// /// If there cannot be a static default value because we need to make runtime /// checks to determine the default, make it an `Option` (which defaults to None). /// The runtime check should be done in the consuming crate, i.e., `pageserver`. /// /// Unknown fields are silently ignored during deserialization. /// The alternative, which we used in the past, was to set `deny_unknown_fields`, /// which fails deserialization, and hence pageserver startup, if there is an unknown field. /// The reason we don't do that anymore is that it complicates /// usage of config fields for feature flagging, which we commonly do for /// region-by-region rollouts. /// The complications mainly arise because the `pageserver.toml` contents on a /// prod server have a separate lifecycle from the pageserver binary. /// For instance, `pageserver.toml` contents today are defined in the internal /// infra repo, and thus introducing a new config field to pageserver and /// rolling it out to prod servers are separate commits in separate repos /// that can't be made or rolled back atomically. /// Rollbacks in particular pose a risk with deny_unknown_fields because /// the old pageserver binary may reject a new config field, resulting in /// an outage unless the person doing the pageserver rollback remembers /// to also revert the commit that added the config field into the /// `pageserver.toml` templates in the internal infra repo. /// (A pre-deploy config check would eliminate this risk during rollbacks, /// cf [here](https://github.com/neondatabase/cloud/issues/24349).) /// In addition to this compatibility problem during emergency rollbacks, /// deny_unknown_fields adds further complications when decommissioning a feature /// flag: with deny_unknown_fields, we can't remove a flag from the [`ConfigToml`] /// until all prod servers' `pageserver.toml` files have been updated to a version /// that doesn't specify the flag. Otherwise new software would fail to start up.
/// This adds the requirement for an intermediate step where the new config field /// is accepted but ignored, prolonging the decommissioning process by an entire /// release cycle. /// By contrast, with unknown fields silently ignored, decommissioning a feature /// flag is a one-step process: we can skip the intermediate step and directly /// remove the field from the [`ConfigToml`]. We leave the field in the /// `pageserver.toml` files on prod servers until we reach certainty that we /// will not roll back to old software whose behavior was dependent on config. /// Then we can remove the field from the templates in the internal infra repo. /// This process is [documented internally]( /// https://docs.neon.build/storage/pageserver_configuration.html). /// /// Note that the above relaxed compatibility for the config format does NOT APPLY /// TO THE STORAGE FORMAT. As general guidance, when introducing storage format /// changes, ensure that the potential rollback target version will be compatible /// with the new format. This must hold regardless of what flags are set in the `pageserver.toml`: /// any format version that exists in an environment must be compatible with the software that runs there. /// Use a pageserver.toml flag only to gate whether software _writes_ the new format.
/// For more compatibility considerations, refer to [internal docs]( /// https://docs.neon.build/storage/compat.html?highlight=compat#format-versions--compatibility) #[serde_as] #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] #[serde(default)] pub struct ConfigToml { // types mapped 1:1 into the runtime PageServerConfig type pub listen_pg_addr: String, pub listen_http_addr: String, pub listen_https_addr: Option, pub listen_grpc_addr: Option, pub ssl_key_file: Utf8PathBuf, pub ssl_cert_file: Utf8PathBuf, #[serde(with = "humantime_serde")] pub ssl_cert_reload_period: Duration, pub ssl_ca_file: Option, pub availability_zone: Option, #[serde(with = "humantime_serde")] pub wait_lsn_timeout: Duration, #[serde(with = "humantime_serde")] pub wal_redo_timeout: Duration, pub superuser: String, pub locale: String, pub page_cache_size: usize, pub max_file_descriptors: usize, pub pg_distrib_dir: Option, #[serde_as(as = "serde_with::DisplayFromStr")] pub http_auth_type: AuthType, #[serde_as(as = "serde_with::DisplayFromStr")] pub pg_auth_type: AuthType, pub grpc_auth_type: AuthType, pub auth_validation_public_key_path: Option, pub remote_storage: Option, pub tenant_config: TenantConfigToml, #[serde_as(as = "serde_with::DisplayFromStr")] pub broker_endpoint: storage_broker::Uri, #[serde(with = "humantime_serde")] pub broker_keepalive_interval: Duration, #[serde_as(as = "serde_with::DisplayFromStr")] pub log_format: LogFormat, pub concurrent_tenant_warmup: NonZeroUsize, pub concurrent_tenant_size_logical_size_queries: NonZeroUsize, #[serde(with = "humantime_serde")] pub metric_collection_interval: Duration, pub metric_collection_endpoint: Option, pub metric_collection_bucket: Option, #[serde(with = "humantime_serde")] pub synthetic_size_calculation_interval: Duration, pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig, pub test_remote_failures: u64, pub test_remote_failures_probability: u64, pub ondemand_download_behavior_treat_error_as_warn: bool, 
#[serde(with = "humantime_serde")] pub background_task_maximum_delay: Duration, pub control_plane_api: Option, pub control_plane_api_token: Option, pub control_plane_emergency_mode: bool, /// Unstable feature: subject to change or removal without notice. /// See . pub import_pgdata_upcall_api: Option, /// Unstable feature: subject to change or removal without notice. /// See . pub import_pgdata_upcall_api_token: Option, /// Unstable feature: subject to change or removal without notice. /// See . pub import_pgdata_aws_endpoint_url: Option, pub heatmap_upload_concurrency: usize, pub secondary_download_concurrency: usize, pub virtual_file_io_engine: Option, pub ingest_batch_size: u64, pub max_vectored_read_bytes: MaxVectoredReadBytes, pub max_get_vectored_keys: MaxGetVectoredKeys, pub image_compression: ImageCompressionAlgorithm, pub timeline_offloading: bool, pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, pub virtual_file_io_mode: Option, #[serde(skip_serializing_if = "Option::is_none")] pub no_sync: Option, pub page_service_pipelining: PageServicePipeliningConfig, pub get_vectored_concurrent_io: GetVectoredConcurrentIo, pub enable_read_path_debugging: Option, #[serde(skip_serializing_if = "Option::is_none")] pub validate_wal_contiguity: Option, #[serde(skip_serializing_if = "Option::is_none")] pub load_previous_heatmap: Option, #[serde(skip_serializing_if = "Option::is_none")] pub generate_unarchival_heatmap: Option, pub tracing: Option, pub enable_tls_page_service_api: bool, pub dev_mode: bool, #[serde(skip_serializing_if = "Option::is_none")] pub posthog_config: Option, pub timeline_import_config: TimelineImportConfig, #[serde(skip_serializing_if = "Option::is_none")] pub basebackup_cache_config: Option, #[serde(skip_serializing_if = "Option::is_none")] pub image_layer_generation_large_timeline_threshold: Option, pub force_metric_collection_on_scrape: bool, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] 
#[serde(default)] pub struct DiskUsageEvictionTaskConfig { pub max_usage_pct: utils::serde_percent::Percent, pub min_avail_bytes: u64, #[serde(with = "humantime_serde")] pub period: Duration, #[cfg(feature = "testing")] pub mock_statvfs: Option, /// Select sorting for evicted layers #[serde(default)] pub eviction_order: EvictionOrder, pub enabled: bool, } impl Default for DiskUsageEvictionTaskConfig { fn default() -> Self { Self { max_usage_pct: Percent::new(80).unwrap(), min_avail_bytes: 2_000_000_000, period: Duration::from_secs(60), #[cfg(feature = "testing")] mock_statvfs: None, eviction_order: EvictionOrder::default(), enabled: true, } } } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(tag = "mode", rename_all = "kebab-case")] pub enum PageServicePipeliningConfig { Serial, Pipelined(PageServicePipeliningConfigPipelined), } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct PageServicePipeliningConfigPipelined { /// Failed config parsing and validation if larger than `max_get_vectored_keys`. pub max_batch_size: NonZeroUsize, pub execution: PageServiceProtocolPipelinedExecutionStrategy, // The default below is such that new versions of the software can start // with the old configuration. #[serde(default)] pub batching: PageServiceProtocolPipelinedBatchingStrategy, } #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(rename_all = "kebab-case")] pub enum PageServiceProtocolPipelinedExecutionStrategy { ConcurrentFutures, Tasks, } #[derive(Default, Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(rename_all = "kebab-case")] pub enum PageServiceProtocolPipelinedBatchingStrategy { /// All get page requests in a batch will be at the same LSN #[default] UniformLsn, /// Get page requests in a batch may be at different LSN /// /// One key cannot be present more than once at different LSNs in /// the same batch. 
ScatteredLsn, } #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(tag = "mode", rename_all = "kebab-case")] pub enum GetVectoredConcurrentIo { /// The read path is fully sequential: layers are visited /// one after the other and IOs are issued and waited upon /// from the same task that traverses the layers. Sequential, /// The read path still traverses layers sequentially, and /// index blocks will be read into the PS PageCache from /// that task, with waiting. /// But data IOs are dispatched and waited upon from a sidecar /// task so that the traversing task can continue to traverse /// layers while the IOs are in flight. /// If the PS PageCache miss rate is low, this improves /// throughput dramatically. SidecarTask, } #[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct Ratio { pub numerator: usize, pub denominator: usize, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct OtelExporterConfig { pub endpoint: String, pub protocol: OtelExporterProtocol, #[serde(with = "humantime_serde")] pub timeout: Duration, } #[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(rename_all = "kebab-case")] pub enum OtelExporterProtocol { Grpc, HttpBinary, HttpJson, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct Tracing { pub sampling_ratio: Ratio, pub export_config: OtelExporterConfig, } impl From<&OtelExporterConfig> for tracing_utils::ExportConfig { fn from(val: &OtelExporterConfig) -> Self { tracing_utils::ExportConfig { endpoint: Some(val.endpoint.clone()), protocol: val.protocol.into(), timeout: Some(val.timeout), } } } impl From for tracing_utils::Protocol { fn from(val: OtelExporterProtocol) -> Self { match val { OtelExporterProtocol::Grpc => tracing_utils::Protocol::Grpc, OtelExporterProtocol::HttpJson => tracing_utils::Protocol::HttpJson, OtelExporterProtocol::HttpBinary 
=> tracing_utils::Protocol::HttpBinary, } } } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct TimelineImportConfig { pub import_job_concurrency: NonZeroUsize, pub import_job_soft_size_limit: NonZeroUsize, pub import_job_checkpoint_threshold: NonZeroUsize, /// Max size of the remote storage partial read done by any job pub import_job_max_byte_range_size: NonZeroUsize, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(default)] pub struct BasebackupCacheConfig { #[serde(with = "humantime_serde")] pub cleanup_period: Duration, /// Maximum total size of basebackup cache entries on disk in bytes. /// The cache may slightly exceed this limit because we do not know /// the exact size of the cache entry untill it's written to disk. pub max_total_size_bytes: u64, // TODO(diko): support max_entry_size_bytes. // pub max_entry_size_bytes: u64, pub max_size_entries: usize, /// Size of the channel used to send prepare requests to the basebackup cache worker. /// If exceeded, new prepare requests will be dropped. 
pub prepare_channel_size: usize, } impl Default for BasebackupCacheConfig { fn default() -> Self { Self { cleanup_period: Duration::from_secs(60), max_total_size_bytes: 1024 * 1024 * 1024, // 1 GiB // max_entry_size_bytes: 16 * 1024 * 1024, // 16 MiB max_size_entries: 10000, prepare_channel_size: 100, } } } pub mod statvfs { pub mod mock { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(tag = "type")] pub enum Behavior { Success { blocksize: u64, total_blocks: u64, name_filter: Option, }, #[cfg(feature = "testing")] Failure { mocked_error: MockedError }, } #[cfg(feature = "testing")] #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[allow(clippy::upper_case_acronyms)] pub enum MockedError { EIO, } #[cfg(feature = "testing")] impl From for nix::Error { fn from(e: MockedError) -> Self { match e { MockedError::EIO => nix::Error::EIO, } } } } } #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(tag = "type", content = "args")] pub enum EvictionOrder { RelativeAccessed { highest_layer_count_loses_first: bool, }, } impl Default for EvictionOrder { fn default() -> Self { Self::RelativeAccessed { highest_layer_count_loses_first: true, } } } #[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(transparent)] pub struct MaxVectoredReadBytes(pub NonZeroUsize); #[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(transparent)] pub struct MaxGetVectoredKeys(NonZeroUsize); impl MaxGetVectoredKeys { pub fn get(&self) -> usize { self.0.get() } } /// Tenant-level configuration values, used for various purposes. #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(default)] pub struct TenantConfigToml { // Flush out an inmemory layer, if it's holding WAL older than this // This puts a backstop on how much WAL needs to be re-digested if the // page server crashes. 
// This parameter actually determines L0 layer file size. pub checkpoint_distance: u64, // The in-memory layer is also flushed at least once in checkpoint_timeout to // eventually upload WAL after activity is stopped. #[serde(with = "humantime_serde")] pub checkpoint_timeout: Duration, // Target file size, when creating image and delta layers. // This parameter determines L1 layer file size. pub compaction_target_size: u64, // How often to check if there's compaction work to be done. // Duration::ZERO means automatic compaction is disabled. #[serde(with = "humantime_serde")] pub compaction_period: Duration, /// Level0 delta layer threshold for compaction. pub compaction_threshold: usize, /// Controls the amount of L0 included in a single compaction iteration. /// The unit is `checkpoint_distance`, i.e., a size. /// We add L0s to the set of layers to compact until their cumulative /// size exceeds `compaction_upper_limit * checkpoint_distance`. pub compaction_upper_limit: usize, pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, /// If true, enable shard ancestor compaction (enabled by default). pub compaction_shard_ancestor: bool, /// If true, compact down L0 across all tenant timelines before doing regular compaction. L0 /// compaction must be responsive to avoid read amp during heavy ingestion. Defaults to true. pub compaction_l0_first: bool, /// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only /// has an effect if `compaction_l0_first` is true. Defaults to true. pub compaction_l0_semaphore: bool, /// Level0 delta layer threshold at which to delay layer flushes such that they take 2x as long, /// and block on layer flushes during ephemeral layer rolls, for compaction backpressure. This /// helps compaction keep up with WAL ingestion, and avoids read amplification blowing up. /// Should be >compaction_threshold. 0 to disable. Defaults to 3x compaction_threshold. 
pub l0_flush_delay_threshold: Option, /// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold /// to avoid deadlock. 0 to disable. Disabled by default. pub l0_flush_stall_threshold: Option, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is number of bytes of WAL. // Page versions older than this are garbage collected away. pub gc_horizon: u64, // Interval at which garbage collection is triggered. // Duration::ZERO means automatic GC is disabled. #[serde(with = "humantime_serde")] pub gc_period: Duration, // Delta layer churn threshold to create L1 image layers. pub image_creation_threshold: usize, // HADRON // When the timeout is reached, PageServer will (1) force compact any remaining L0 deltas and // (2) create image layers if there are any L1 deltas. #[serde(with = "humantime_serde")] pub image_layer_force_creation_period: Option, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is time. // Page versions older than this are garbage collected away. #[serde(with = "humantime_serde")] pub pitr_interval: Duration, /// Maximum amount of time to wait while opening a connection to receive WAL, before erroring. #[serde(with = "humantime_serde")] pub walreceiver_connect_timeout: Duration, /// Considers safekeepers stalled after no WAL updates were received longer than this threshold. /// A stalled safekeeper will be changed to a newer one when it appears. #[serde(with = "humantime_serde")] pub lagging_wal_timeout: Duration, /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold. /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, /// to avoid eager reconnects. 
pub max_lsn_wal_lag: NonZeroU64, pub eviction_policy: crate::models::EvictionPolicy, pub min_resident_size_override: Option, // See the corresponding metric's help string. #[serde(with = "humantime_serde")] pub evictions_low_residence_duration_metric_threshold: Duration, /// If non-zero, the period between uploads of a heatmap from attached tenants. This /// may be disabled if a Tenant will not have secondary locations: only secondary /// locations will use the heatmap uploaded by attached locations. #[serde(with = "humantime_serde")] pub heatmap_period: Duration, /// If true then SLRU segments are downloaded on demand, if false SLRU segments are included in basebackup pub lazy_slru_download: bool, pub timeline_get_throttle: crate::models::ThrottleConfig, // How much WAL must be ingested before checking again whether a new image layer is required. // Expressed in multiples of checkpoint distance. pub image_layer_creation_check_threshold: u8, // How many multiples of L0 `compaction_threshold` will preempt image layer creation and do L0 compaction. // Set to 0 to disable preemption. pub image_creation_preempt_threshold: usize, /// The length for an explicit LSN lease request. /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. #[serde(with = "humantime_serde")] pub lsn_lease_length: Duration, /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request. /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. #[serde(with = "humantime_serde")] pub lsn_lease_length_for_ts: Duration, /// Enable auto-offloading of timelines. /// (either this flag or the pageserver-global one need to be set) pub timeline_offloading: bool, /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into /// `index_part.json`, and it cannot be reversed. 
pub rel_size_v2_enabled: bool, // gc-compaction related configs /// Enable automatic gc-compaction trigger on this tenant. pub gc_compaction_enabled: bool, /// Enable verification of gc-compaction results. pub gc_compaction_verification: bool, /// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold, /// gc-compaction will be triggered. pub gc_compaction_initial_threshold_kb: u64, /// The ratio that triggers the auto gc-compaction. If (the total size of layers between L2 LSN and gc-horizon) / (size below the L2 LSN) /// is above this ratio, gc-compaction will be triggered. pub gc_compaction_ratio_percent: u64, /// Tenant level performance sampling ratio override. Controls the ratio of get page requests /// that will get perf sampling for the tenant. pub sampling_ratio: Option, /// Capacity of relsize snapshot cache (used by replicas). pub relsize_snapshot_cache_capacity: usize, /// Enable preparing basebackup on XLOG_CHECKPOINT_SHUTDOWN and using it in basebackup requests. // FIXME: Remove skip_serializing_if when the feature is stable. 
#[serde(skip_serializing_if = "std::ops::Not::not")] pub basebackup_cache_enabled: bool, } pub mod defaults { pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; use crate::models::ImageCompressionAlgorithm; pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; pub const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") { "C" } else { "C.UTF-8" }; pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; pub const DEFAULT_LOG_FORMAT: &str = "plain"; pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8; pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = 1; pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8; pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1; pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; /// Soft limit for the maximum size of a vectored read. /// /// This is determined by the largest NeonWalRecord that can exist (minus dbdir and reldir keys /// which are bounded by the blob io limits only). As of this writing, that is a `NeonWalRecord::ClogSetCommitted` record, /// with 32k xids. That's the max number of XIDS on a single CLOG page. The size of such a record /// is `sizeof(TransactionId) * 32768 + (some fixed overhead from 'timestamp', the Vec length and whatever extra serde serialization adds)`. /// That is, slightly above 128 kB. 
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 130 * 1024; // 130 KiB pub const DEFAULT_MAX_GET_VECTORED_KEYS: usize = 32; pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = ImageCompressionAlgorithm::Zstd { level: Some(1) }; pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; #[cfg(feature = "io-align-4k")] pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 4096; #[cfg(all(feature = "io-align-512", not(feature = "io-align-4k")))] pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; #[cfg(not(any(feature = "io-align-512", feature = "io-align-4k")))] pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; pub const DEFAULT_SSL_KEY_FILE: &str = "server.key"; pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt"; } impl Default for ConfigToml { fn default() -> Self { use defaults::*; Self { listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()), listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()), listen_https_addr: (None), listen_grpc_addr: None, // TODO: default to 127.0.0.1:51051 ssl_key_file: Utf8PathBuf::from(DEFAULT_SSL_KEY_FILE), ssl_cert_file: Utf8PathBuf::from(DEFAULT_SSL_CERT_FILE), ssl_cert_reload_period: Duration::from_secs(60), ssl_ca_file: None, availability_zone: (None), wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) .expect("cannot parse default wait lsn timeout")), wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) .expect("cannot parse default wal redo timeout")), superuser: (DEFAULT_SUPERUSER.to_string()), locale: DEFAULT_LOCALE.to_string(), page_cache_size: (DEFAULT_PAGE_CACHE_SIZE), max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS), pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formerly, this was std::env::current_dir() http_auth_type: (AuthType::Trust), pg_auth_type: (AuthType::Trust), grpc_auth_type: (AuthType::Trust), auth_validation_public_key_path: (None), remote_storage: None, broker_endpoint: (storage_broker::DEFAULT_ENDPOINT .parse() .expect("failed to 
parse default broker endpoint")), broker_keepalive_interval: (humantime::parse_duration( storage_broker::DEFAULT_KEEPALIVE_INTERVAL, ) .expect("cannot parse default keepalive interval")), log_format: (LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) .expect("Invalid default constant")), concurrent_tenant_size_logical_size_queries: NonZeroUsize::new( DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES, ) .unwrap(), metric_collection_interval: (humantime::parse_duration( DEFAULT_METRIC_COLLECTION_INTERVAL, ) .expect("cannot parse default metric collection interval")), synthetic_size_calculation_interval: (humantime::parse_duration( DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL, ) .expect("cannot parse default synthetic size calculation interval")), metric_collection_endpoint: (DEFAULT_METRIC_COLLECTION_ENDPOINT), metric_collection_bucket: (None), disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(), test_remote_failures: (0), test_remote_failures_probability: (100), ondemand_download_behavior_treat_error_as_warn: (false), background_task_maximum_delay: (humantime::parse_duration( DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, ) .unwrap()), control_plane_api: (None), control_plane_api_token: (None), control_plane_emergency_mode: (false), import_pgdata_upcall_api: (None), import_pgdata_upcall_api_token: (None), import_pgdata_aws_endpoint_url: (None), heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), ingest_batch_size: (DEFAULT_INGEST_BATCH_SIZE), virtual_file_io_engine: None, max_vectored_read_bytes: (MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), max_get_vectored_keys: (MaxGetVectoredKeys( NonZeroUsize::new(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap(), )), image_compression: (DEFAULT_IMAGE_COMPRESSION), timeline_offloading: true, ephemeral_bytes_per_memory_kb: 
(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, virtual_file_io_mode: None, tenant_config: TenantConfigToml::default(), no_sync: None, page_service_pipelining: PageServicePipeliningConfig::Pipelined( PageServicePipeliningConfigPipelined { max_batch_size: NonZeroUsize::new(32).unwrap(), execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures, batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn, }, ), get_vectored_concurrent_io: GetVectoredConcurrentIo::SidecarTask, enable_read_path_debugging: if cfg!(feature = "testing") { Some(true) } else { None }, validate_wal_contiguity: None, load_previous_heatmap: None, generate_unarchival_heatmap: None, tracing: None, enable_tls_page_service_api: false, dev_mode: false, timeline_import_config: TimelineImportConfig { import_job_concurrency: NonZeroUsize::new(32).unwrap(), import_job_soft_size_limit: NonZeroUsize::new(256 * 1024 * 1024).unwrap(), import_job_checkpoint_threshold: NonZeroUsize::new(32).unwrap(), import_job_max_byte_range_size: NonZeroUsize::new(4 * 1024 * 1024).unwrap(), }, basebackup_cache_config: None, posthog_config: None, image_layer_generation_large_timeline_threshold: Some(2 * 1024 * 1024 * 1024), force_metric_collection_on_scrape: true, } } } pub mod tenant_conf_defaults { // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB // would be more appropriate. But a low value forces the code to be exercised more, // which is good for now to trigger bugs. // This parameter actually determines L0 layer file size. pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m"; // FIXME the below configs are only used by legacy algorithm. The new algorithm // has different parameters. // Target file size, when creating image and delta layers. // This parameter determines L1 layer file size. 
pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; pub const DEFAULT_COMPACTION_SHARD_ANCESTOR: bool = true; // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's // 3/4*8=6 on most of our pageservers. Compacting 10 layers requires a maximum of // DEFAULT_CHECKPOINT_DISTANCE*10 memory, that's 2560MB. So with this config, we can get a maximum peak // compaction usage of 15360MB. pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 10; // Enable L0 compaction pass and semaphore by default. L0 compaction must be responsive to avoid // read amp. pub const DEFAULT_COMPACTION_L0_FIRST: bool = true; pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true; pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = crate::models::CompactionAlgorithm::Legacy; pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. // If there's a need to decrease this value, first make sure that GC // doesn't hold a layer map write lock for non-trivial operations. // Relevant: https://github.com/neondatabase/neon/issues/3394 pub const DEFAULT_GC_PERIOD: &str = "1 hr"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; // Currently, any value other than 0 will trigger image layer creation preemption immediately with L0 backpressure // without looking at the exact number of L0 layers. // It was expected to have the following behavior: // > If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image // > layer creation will end immediately. Set to 0 to disable. 
pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 3; pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; // The default limit on WAL lag should be set to avoid causing disconnects under high throughput // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for // throughputs up to 1GiB/s per timeline. pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024; pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; // By default ingest enough WAL for two new L0 layers before checking if new // image layers should be created. pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; pub const DEFAULT_GC_COMPACTION_ENABLED: bool = true; pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true; pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100; pub const DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY: usize = 1000; } impl Default for TenantConfigToml { fn default() -> Self { use tenant_conf_defaults::*; Self { checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT) .expect("cannot parse default checkpoint timeout"), compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE, compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) .expect("cannot parse default compaction period"), compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, compaction_upper_limit: DEFAULT_COMPACTION_UPPER_LIMIT, compaction_algorithm: crate::models::CompactionAlgorithmSettings { kind: DEFAULT_COMPACTION_ALGORITHM, }, compaction_shard_ancestor: DEFAULT_COMPACTION_SHARD_ANCESTOR, compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST, compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE, 
l0_flush_delay_threshold: None, l0_flush_stall_threshold: None, gc_horizon: DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, image_layer_force_creation_period: None, pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) .expect("cannot parse default PITR interval"), walreceiver_connect_timeout: humantime::parse_duration( DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, ) .expect("cannot parse default walreceiver connect timeout"), lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT) .expect("cannot parse default walreceiver lagging wal timeout"), max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) .expect("cannot parse default max walreceiver Lsn wal lag"), eviction_policy: crate::models::EvictionPolicy::NoEviction, min_resident_size_override: None, evictions_low_residence_duration_metric_threshold: humantime::parse_duration( DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, ) .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), heatmap_period: Duration::ZERO, lazy_slru_download: false, timeline_get_throttle: crate::models::ThrottleConfig::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD, lsn_lease_length: LsnLease::DEFAULT_LENGTH, lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, timeline_offloading: true, rel_size_v2_enabled: false, gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED, gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION, gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB, gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT, sampling_ratio: None, relsize_snapshot_cache_capacity: DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY, basebackup_cache_enabled: false, } 
} } ================================================ FILE: libs/pageserver_api/src/controller_api.rs ================================================ use std::collections::{HashMap, HashSet}; use std::fmt::Display; use std::net::IpAddr; use std::str::FromStr; use std::time::{Duration, Instant}; /// Request/response types for the storage controller /// API (`/control/v1` prefix). Implemented by the server /// in [`storage_controller::http`] use serde::{Deserialize, Serialize}; use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; use crate::models::{PageserverUtilization, ShardParameters, TenantConfig, TimelineInfo}; use crate::shard::{ShardStripeSize, TenantShardId}; #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantCreateRequest { pub new_tenant_id: TenantShardId, #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub generation: Option, // If omitted, create a single shard with TenantShardId::unsharded() #[serde(default)] #[serde(skip_serializing_if = "ShardParameters::is_unsharded")] pub shard_parameters: ShardParameters, #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub placement_policy: Option, #[serde(flatten)] pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it } #[derive(Serialize, Deserialize)] pub struct TenantCreateResponseShard { pub shard_id: TenantShardId, pub node_id: NodeId, pub generation: u32, } #[derive(Serialize, Deserialize)] pub struct TenantCreateResponse { pub shards: Vec, } #[derive(Serialize, Deserialize, Debug, Clone)] pub struct NodeRegisterRequest { pub node_id: NodeId, pub listen_pg_addr: String, pub listen_pg_port: u16, pub listen_grpc_addr: Option, pub listen_grpc_port: Option, pub listen_http_addr: String, pub listen_http_port: u16, pub listen_https_port: Option, pub availability_zone_id: AvailabilityZone, // Reachable IP address of the PS/SK registering, if known. 
// Hadron Cluster Coordinator will update the DNS record of the registering node // with this IP address. pub node_ip_addr: Option, } #[derive(Serialize, Deserialize)] pub struct NodeConfigureRequest { pub node_id: NodeId, pub availability: Option, pub scheduling: Option, } #[derive(Serialize, Deserialize)] pub struct TenantPolicyRequest { pub placement: Option, pub scheduling: Option, } #[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug, PartialOrd, Ord)] pub struct AvailabilityZone(pub String); impl Display for AvailabilityZone { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } } #[derive(Serialize, Deserialize)] pub struct ShardsPreferredAzsRequest { #[serde(flatten)] pub preferred_az_ids: HashMap>, } #[derive(Serialize, Deserialize)] pub struct ShardsPreferredAzsResponse { pub updated: Vec, } #[derive(Serialize, Deserialize, Debug)] pub struct TenantLocateResponseShard { pub shard_id: TenantShardId, pub node_id: NodeId, pub listen_pg_addr: String, pub listen_pg_port: u16, pub listen_grpc_addr: Option, pub listen_grpc_port: Option, pub listen_http_addr: String, pub listen_http_port: u16, pub listen_https_port: Option, } #[derive(Serialize, Deserialize)] pub struct TenantLocateResponse { pub shards: Vec, pub shard_params: ShardParameters, } #[derive(Serialize, Deserialize, Debug)] pub struct TenantDescribeResponse { pub tenant_id: TenantId, pub shards: Vec, pub stripe_size: ShardStripeSize, pub policy: PlacementPolicy, pub config: TenantConfig, } #[derive(Serialize, Deserialize, Debug)] pub struct TenantTimelineDescribeResponse { pub shards: Vec, #[serde(skip_serializing_if = "Option::is_none")] pub image_consistent_lsn: Option, } #[derive(Serialize, Deserialize, Debug)] pub struct NodeShardResponse { pub node_id: NodeId, pub shards: Vec, } #[derive(Serialize, Deserialize, Debug)] pub struct NodeShard { pub tenant_shard_id: TenantShardId, /// Whether the shard is observed secondary on a specific node. 
True = yes, False = no, None = not on this node. pub is_observed_secondary: Option, /// Whether the shard is intended to be a secondary on a specific node. True = yes, False = no, None = not on this node. pub is_intended_secondary: Option, } #[derive(Serialize, Deserialize)] pub struct NodeDescribeResponse { pub id: NodeId, pub availability: NodeAvailabilityWrapper, pub scheduling: NodeSchedulingPolicy, pub availability_zone_id: String, pub listen_http_addr: String, pub listen_http_port: u16, pub listen_https_port: Option, pub listen_pg_addr: String, pub listen_pg_port: u16, pub listen_grpc_addr: Option, pub listen_grpc_port: Option, } #[derive(Serialize, Deserialize, Debug)] pub struct TenantDescribeResponseShard { pub tenant_shard_id: TenantShardId, pub node_attached: Option, pub node_secondary: Vec, pub last_error: String, /// A task is currently running to reconcile this tenant's intent state with the state on pageservers pub is_reconciling: bool, /// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending. pub is_pending_compute_notification: bool, /// A shard split is currently underway pub is_splitting: bool, /// A timeline is being imported into this tenant pub is_importing: bool, pub scheduling_policy: ShardSchedulingPolicy, pub preferred_az_id: Option, } /// Migration request for a given tenant shard to a given node. /// /// Explicitly migrating a particular shard is a low level operation /// TODO: higher level "Reschedule tenant" operation where the request /// specifies some constraints, e.g. asking it to get off particular node(s) #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateRequest { pub node_id: NodeId, /// Optionally, callers may specify the node they are migrating _from_, and the server will /// reject the request if the shard is no longer attached there: this enables writing safer /// clients that don't risk fighting with some other movement of the shard. 
#[serde(default)] pub origin_node_id: Option, #[serde(default)] pub migration_config: MigrationConfig, } #[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] pub struct MigrationConfig { /// If true, the migration will be executed even if it is to a location with a sub-optimal scheduling /// score: this is usually not what you want, and if you use this then you'll also need to set the /// tenant's scheduling policy to Essential or Pause to avoid the optimiser reverting your migration. /// /// Default: false #[serde(default)] pub override_scheduler: bool, /// If true, the migration will be done gracefully by creating a secondary location first and /// waiting for it to warm up before cutting over. If false, if there is no existing secondary /// location at the destination, the tenant will be migrated immediately. If the tenant's data /// can't be downloaded within [`Self::secondary_warmup_timeout`], then the migration will go /// ahead but run with a cold cache that can severely reduce performance until it warms up. /// /// When doing a graceful migration, the migration API returns as soon as it is started. /// /// Default: true #[serde(default = "default_prewarm")] pub prewarm: bool, /// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait /// overall for secondary warmup before cutting over #[serde(default)] #[serde(with = "humantime_serde")] pub secondary_warmup_timeout: Option, /// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait /// within each secondary download poll call to pageserver. 
#[serde(default)] #[serde(with = "humantime_serde")] pub secondary_download_request_timeout: Option, } fn default_prewarm() -> bool { true } impl Default for MigrationConfig { fn default() -> Self { Self { override_scheduler: false, prewarm: default_prewarm(), secondary_warmup_timeout: None, secondary_download_request_timeout: None, } } } #[derive(Serialize, Clone, Debug)] #[serde(into = "NodeAvailabilityWrapper")] pub enum NodeAvailability { // Normal, happy state Active(PageserverUtilization), // Node is warming up, but we expect it to become available soon. Covers // the time span between the re-attach response being composed on the storage controller // and the first successful heartbeat after the processing of the re-attach response // finishes on the pageserver. WarmingUp(Instant), // Offline: Tenants shouldn't try to attach here, but they may assume that their // secondary locations on this node still exist. Newly added nodes are in this // state until we successfully contact them. Offline, } // Equality compares the variant only: the `Active` utilization and the `WarmingUp` instant are ignored. impl PartialEq for NodeAvailability { fn eq(&self, other: &Self) -> bool { use NodeAvailability::*; matches!( (self, other), (Active(_), Active(_)) | (Offline, Offline) | (WarmingUp(_), WarmingUp(_)) ) } } impl Eq for NodeAvailability {} // This wrapper provides serde functionality and it should only be used to // communicate with external callers which don't know or care about the // utilisation score of the pageserver it is targeting. #[derive(Serialize, Deserialize, Clone, Copy, Debug)] pub enum NodeAvailabilityWrapper { Active, WarmingUp, Offline, } impl From for NodeAvailability { fn from(val: NodeAvailabilityWrapper) -> Self { match val { // Assume the worst utilisation score to begin with. It will later be updated by // the heartbeats. 
NodeAvailabilityWrapper::Active => { NodeAvailability::Active(PageserverUtilization::full()) } NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()), NodeAvailabilityWrapper::Offline => NodeAvailability::Offline, } } } impl From for NodeAvailabilityWrapper { fn from(val: NodeAvailability) -> Self { match val { NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active, NodeAvailability::WarmingUp(_) => NodeAvailabilityWrapper::WarmingUp, NodeAvailability::Offline => NodeAvailabilityWrapper::Offline, } } } /// Scheduling policy enables us to selectively disable some automatic actions that the /// controller performs on a tenant shard. This is only set to a non-default value by /// human intervention, and it is reset to the default value (Active) when the tenant's /// placement policy is modified away from Attached. /// /// The typical use of a non-Active scheduling policy is one of: /// - Pinning a shard to a node (i.e. migrating it there & setting a non-Active scheduling policy) /// - Working around a bug (e.g. if something is flapping and we need to stop it until the bug is fixed) /// /// If you're not sure which policy to use to pin a shard to its current location, you probably /// want Pause. #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] pub enum ShardSchedulingPolicy { // Normal mode: the tenant's scheduled locations may be updated at will, including // for non-essential optimization. Active, // Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy. // For example, this still permits a node's attachment location to change to a secondary in // response to a node failure, or to assign a new secondary if a node was removed. Essential, // No scheduling: leave the shard running wherever it currently is. Even if the shard is // unavailable, it will not be rescheduled to another node. Pause, // No reconciling: we will make no location_conf API calls to pageservers at all. 
If the // shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over. Stop, } impl Default for ShardSchedulingPolicy { fn default() -> Self { Self::Active } } #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] pub enum NodeLifecycle { Active, Deleted, } impl FromStr for NodeLifecycle { type Err = anyhow::Error; fn from_str(s: &str) -> Result { match s { "active" => Ok(Self::Active), "deleted" => Ok(Self::Deleted), _ => Err(anyhow::anyhow!("Unknown node lifecycle '{s}'")), } } } impl From for String { fn from(value: NodeLifecycle) -> String { use NodeLifecycle::*; match value { Active => "active", Deleted => "deleted", } .to_string() } } #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] pub enum NodeSchedulingPolicy { Active, Filling, Pause, PauseForRestart, Draining, Deleting, } impl FromStr for NodeSchedulingPolicy { type Err = anyhow::Error; fn from_str(s: &str) -> Result { match s { "active" => Ok(Self::Active), "filling" => Ok(Self::Filling), "pause" => Ok(Self::Pause), "pause_for_restart" => Ok(Self::PauseForRestart), "draining" => Ok(Self::Draining), "deleting" => Ok(Self::Deleting), _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")), } } } impl From for String { fn from(value: NodeSchedulingPolicy) -> String { use NodeSchedulingPolicy::*; match value { Active => "active", Filling => "filling", Pause => "pause", PauseForRestart => "pause_for_restart", Draining => "draining", Deleting => "deleting", } .to_string() } } #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] pub enum SkSchedulingPolicy { Active, Activating, Pause, Decomissioned, } impl FromStr for SkSchedulingPolicy { type Err = anyhow::Error; fn from_str(s: &str) -> Result { Ok(match s { "active" => Self::Active, "activating" => Self::Activating, "pause" => Self::Pause, "decomissioned" => Self::Decomissioned, _ => { return Err(anyhow::anyhow!( "Unknown scheduling policy '{s}', try 
active,pause,decomissioned" )); } }) } } impl From for String { fn from(value: SkSchedulingPolicy) -> String { use SkSchedulingPolicy::*; match value { Active => "active", Activating => "activating", Pause => "pause", Decomissioned => "decomissioned", } .to_string() } } /// Controls how tenant shards are mapped to locations on pageservers, e.g. whether /// to create secondary locations. #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] pub enum PlacementPolicy { /// Normal live state: one attached pageserver and zero or more secondaries. Attached(usize), /// Create one secondary mode locations. This is useful when onboarding /// a tenant, or for an idle tenant that we might want to bring online quickly. Secondary, /// Do not attach to any pageservers. This is appropriate for tenants that /// have been idle for a long time, where we do not mind some delay in making /// them available in future. Detached, } impl PlacementPolicy { pub fn want_secondaries(&self) -> usize { match self { PlacementPolicy::Attached(secondary_count) => *secondary_count, PlacementPolicy::Secondary => 1, PlacementPolicy::Detached => 0, } } } #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateResponse {} /// Metadata health record posted from scrubber. 
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthRecord {
    pub tenant_shard_id: TenantShardId,
    pub healthy: bool,
    // NOTE(review): this type, and the bare `HashSet` / `Vec` / `Option` fields further down,
    // have no generic arguments; they look lost in extraction -- confirm against upstream.
    pub last_scrubbed_at: chrono::DateTime,
}

/// Request body carrying two sets of tenant shards: those found healthy and those found unhealthy.
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthUpdateRequest {
    pub healthy_tenant_shards: HashSet,
    pub unhealthy_tenant_shards: HashSet,
}

/// Empty response body.
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthUpdateResponse {}

#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListUnhealthyResponse {
    pub unhealthy_tenant_shards: Vec,
}

#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListOutdatedRequest {
    // (De)serialized through `humantime_serde` rather than serde's default `Duration` encoding.
    #[serde(with = "humantime_serde")]
    pub not_scrubbed_for: Duration,
}

#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListOutdatedResponse {
    pub health_records: Vec,
}

/// Publicly exposed safekeeper description
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeeperDescribeResponse {
    pub id: NodeId,
    pub region_id: String,
    /// 1 is special, it means just created (not currently posted to storcon).
    /// Zero or negative is not really expected.
    /// Otherwise the number from `release-$(number_of_commits_on_branch)` tag.
    pub version: i64,
    pub host: String,
    pub port: i32,
    pub http_port: i32,
    pub https_port: Option,
    pub availability_zone_id: String,
    pub scheduling_policy: SkSchedulingPolicy,
}

/// A safekeeper peer: its node id plus HTTP address and port.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct TimelineSafekeeperPeer {
    pub node_id: NodeId,
    pub listen_http_addr: String,
    pub http_port: i32,
}

#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct SCSafekeeperTimeline {
    // SC does not know the tenant id.
pub timeline_id: TimelineId,
// NOTE(review): the bare `Vec` fields in this chunk have no element type; the generic
// arguments look lost in extraction -- confirm against upstream.
pub peers: Vec,
}

#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct SCSafekeeperTimelinesResponse {
    pub timelines: Vec,
    pub safekeeper_peers: Vec,
}

/// Like `SCSafekeeperTimeline`, but additionally carries the tenant id.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct SafekeeperTimeline {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub peers: Vec,
}

#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct SafekeeperTimelinesResponse {
    pub timelines: Vec,
    pub safekeeper_peers: Vec,
}

/// Request body carrying the scheduling policy to set on a safekeeper.
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeeperSchedulingPolicyRequest {
    pub scheduling_policy: SkSchedulingPolicy,
}

/// Import request for safekeeper timelines.
#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineImportRequest {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub start_lsn: Lsn,
    pub sk_set: Vec,
    pub force_upsert: bool,
}

#[derive(serde::Serialize, serde::Deserialize, Clone)]
pub struct TimelineSafekeeperMigrateRequest {
    pub new_sk_set: Vec,
}

#[cfg(test)]
mod test {
    use serde_json;

    use super::*;

    /// Check stability of PlacementPolicy's serialization
    #[test]
    fn placement_policy_encoding() -> anyhow::Result<()> {
        // NOTE(review): the turbofish on `from_str::` / `from_value::` below is empty; the
        // target type looks lost in extraction -- confirm against upstream.
        let v = PlacementPolicy::Attached(1);
        let encoded = serde_json::to_string(&v)?;
        assert_eq!(encoded, "{\"Attached\":1}");
        assert_eq!(serde_json::from_str::(&encoded)?, v);

        let v = PlacementPolicy::Detached;
        let encoded = serde_json::to_string(&v)?;
        assert_eq!(encoded, "\"Detached\"");
        assert_eq!(serde_json::from_str::(&encoded)?, v);
        Ok(())
    }

    /// Deserializing a request that contains an unrecognized field must fail, and the error
    /// must name that field.
    #[test]
    fn test_reject_unknown_field() {
        let id = TenantId::generate();
        let create_request = serde_json::json!({
            "new_tenant_id": id.to_string(),
            "unknown_field": "unknown_value".to_string(),
        });
        let err = serde_json::from_value::(create_request).unwrap_err();
        assert!(
            err.to_string().contains("unknown field `unknown_field`"),
            "expect unknown field `unknown_field` error, got: {err}"
        );
    }

    /// Check that a minimal migrate request with no config results in the expected default settings
    #[test]
    fn
test_migrate_request_decode_defaults() { let json = r#"{ "node_id": 123 }"#; let request: TenantShardMigrateRequest = serde_json::from_str(json).unwrap(); assert_eq!(request.node_id, NodeId(123)); assert_eq!(request.origin_node_id, None); assert!(!request.migration_config.override_scheduler); assert!(request.migration_config.prewarm); assert_eq!(request.migration_config.secondary_warmup_timeout, None); assert_eq!( request.migration_config.secondary_download_request_timeout, None ); } /// Check that a partially specified migration config results in the expected default settings #[test] fn test_migration_config_decode_defaults() { // Specify just one field of the config let json = r#"{ }"#; let config: MigrationConfig = serde_json::from_str(json).unwrap(); // Check each field's expected default value assert!(!config.override_scheduler); assert!(config.prewarm); assert_eq!(config.secondary_warmup_timeout, None); assert_eq!(config.secondary_download_request_timeout, None); assert_eq!(config.secondary_warmup_timeout, None); // Consistency check that the Default impl agrees with our serde defaults assert_eq!(MigrationConfig::default(), config); } } ================================================ FILE: libs/pageserver_api/src/key.rs ================================================ use std::fmt; use std::ops::Range; use anyhow::{Result, bail}; use byteorder::{BE, ByteOrder}; use bytes::Bytes; use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi_types::{Oid, RepOriginId}; use serde::{Deserialize, Serialize}; use utils::const_assert; use crate::reltag::{BlockNumber, RelTag, SlruKind}; /// Key used in the Repository kv-store. /// /// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs /// for what we actually store in these fields. 
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
pub struct Key {
    pub field1: u8,
    pub field2: u32,
    pub field3: u32,
    pub field4: u32,
    pub field5: u8,
    pub field6: u32,
}

/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
/// a struct of fields.
#[derive(
    Clone, Copy, Default, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug,
)]
pub struct CompactKey(i128);

/// The storage key size.
pub const KEY_SIZE: usize = 18;

/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized.
/// See [`Key::to_i128`] for more information on the encoding.
pub const METADATA_KEY_SIZE: usize = 16;

/// The first key prefix of the metadata key range. A key is a metadata key when its first byte
/// is >= this value and < [`METADATA_KEY_END_PREFIX`].
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
/// The exclusive upper bound on the first byte of a metadata key.
pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;

/// The (reserved) key prefix of relation sizes.
pub const RELATION_SIZE_PREFIX: u8 = 0x61;

/// The key prefix of AUX file keys.
pub const AUX_KEY_PREFIX: u8 = 0x62;

/// The key prefix of ReplOrigin keys.
pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63;

/// The key prefix of db directory keys.
pub const DB_DIR_KEY_PREFIX: u8 = 0x64;

/// The key prefix of rel directory keys.
pub const REL_DIR_KEY_PREFIX: u8 = 0x65;

/// Whether a rel directory entry marks a relation as present or removed.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub enum RelDirExists {
    Exists,
    Removed,
}

/// Error type with no payload; its `Display` text is always "invalid marker".
#[derive(Debug)]
pub struct DecodeError;

impl fmt::Display for DecodeError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "invalid marker")
    }
}

impl std::error::Error for DecodeError {}

impl RelDirExists {
    /// The value of the rel directory keys that indicates the existence of a relation.
const REL_EXISTS_MARKER: Bytes = Bytes::from_static(b"r"); pub fn encode(&self) -> Bytes { match self { Self::Exists => Self::REL_EXISTS_MARKER.clone(), Self::Removed => SPARSE_TOMBSTONE_MARKER.clone(), } } pub fn decode_option(data: Option>) -> Result { match data { Some(marker) if marker.as_ref() == Self::REL_EXISTS_MARKER => Ok(Self::Exists), // Any other marker is invalid Some(_) => Err(DecodeError), None => Ok(Self::Removed), } } pub fn decode(data: impl AsRef<[u8]>) -> Result { let data = data.as_ref(); if data == Self::REL_EXISTS_MARKER { Ok(Self::Exists) } else if data == SPARSE_TOMBSTONE_MARKER { Ok(Self::Removed) } else { Err(DecodeError) } } } /// A tombstone in the sparse keyspace, which is an empty buffer. pub const SPARSE_TOMBSTONE_MARKER: Bytes = Bytes::from_static(b""); /// Check if the key falls in the range of metadata keys. pub const fn is_metadata_key_slice(key: &[u8]) -> bool { key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX } impl Key { /// Check if the key falls in the range of metadata keys. pub const fn is_metadata_key(&self) -> bool { self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX } /// Encode a metadata key to a storage key. pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self { assert!(is_metadata_key_slice(key), "key not in metadata key range"); // Metadata key space ends at 0x7F so it's fine to directly convert it to i128. Self::from_i128(i128::from_be_bytes(*key)) } /// Encode a metadata key to a storage key. pub fn from_metadata_key(key: &[u8]) -> Self { Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key")) } /// Get the range of metadata keys. pub const fn metadata_key_range() -> Range { Key { field1: METADATA_KEY_BEGIN_PREFIX, field2: 0, field3: 0, field4: 0, field5: 0, field6: 0, }..Key { field1: METADATA_KEY_END_PREFIX, field2: 0, field3: 0, field4: 0, field5: 0, field6: 0, } } /// Get the range of aux keys. 
// NOTE(review): the `Range` return types in this chunk have no generic argument; it looks
// lost in extraction (the bodies build `Key..Key`) -- confirm against upstream.
pub fn metadata_aux_key_range() -> Range {
    Key {
        field1: AUX_KEY_PREFIX,
        field2: 0,
        field3: 0,
        field4: 0,
        field5: 0,
        field6: 0,
    }..Key {
        field1: AUX_KEY_PREFIX + 1,
        field2: 0,
        field3: 0,
        field4: 0,
        field5: 0,
        field6: 0,
    }
}

/// Get the range of keys whose first byte is `REL_DIR_KEY_PREFIX`.
pub fn rel_dir_sparse_key_range() -> Range {
    Key {
        field1: REL_DIR_KEY_PREFIX,
        field2: 0,
        field3: 0,
        field4: 0,
        field5: 0,
        field6: 0,
    }..Key {
        field1: REL_DIR_KEY_PREFIX + 1,
        field2: 0,
        field3: 0,
        field4: 0,
        field5: 0,
        field6: 0,
    }
}

/// This function checks more extensively what keys we can take on the write path.
/// If a key beginning with 00 does not have a global/default tablespace OID, it
/// will be rejected on the write path.
#[allow(dead_code)]
pub fn is_valid_key_on_write_path_strong(&self) -> bool {
    use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
    if !self.is_i128_representable() {
        return false;
    }
    // For keys with field1 == 0, field2 must be the global tablespace, the default
    // tablespace, or 0.
    if self.field1 == 0
        && !(self.field2 == GLOBALTABLESPACE_OID
            || self.field2 == DEFAULTTABLESPACE_OID
            || self.field2 == 0)
    {
        return false; // User defined tablespaces are not supported
    }
    true
}

/// This is a weaker version of `is_valid_key_on_write_path_strong` that simply
/// checks if the key is i128 representable. Note that some keys can be successfully
/// ingested into the pageserver, but will cause errors on generating basebackup.
pub fn is_valid_key_on_write_path(&self) -> bool {
    self.is_i128_representable()
}

/// True when `field2` is at most 0xFFFF or is one of the two special values
/// 0xFFFFFFFF / 0x22222222. `to_i128` asserts this before encoding.
pub fn is_i128_representable(&self) -> bool {
    self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222
}

/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
/// As long as Neon does not support tablespace (because of lack of access to local file system),
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
///
/// Bit layout of the result, high to low: 7 bits of field1 (from bit 120), the low 16 bits
/// of field2 (from bit 104), field3 (from bit 72), field4 (from bit 40), field5 (from bit 32),
/// field6 (from bit 0). Panics if the key is not `is_i128_representable`.
pub fn to_i128(&self) -> i128 {
    assert!(self.is_i128_representable(), "invalid key: {self}");
    (((self.field1 & 0x7F) as i128) << 120)
        | (((self.field2 & 0xFFFF) as i128) << 104)
        | ((self.field3 as i128) << 72)
        | ((self.field4 as i128) << 40)
        | ((self.field5 as i128) << 32)
        | self.field6 as i128
}

/// Inverse of the bit layout used by `to_i128`.
///
/// Only 16 bits of field2 are recovered, so the wide field2 values that `to_i128` accepts
/// (0xFFFFFFFF, 0x22222222) come back as their low 16 bits.
pub const fn from_i128(x: i128) -> Self {
    Key {
        field1: ((x >> 120) & 0x7F) as u8,
        field2: ((x >> 104) & 0xFFFF) as u32,
        field3: (x >> 72) as u32,
        field4: (x >> 40) as u32,
        field5: (x >> 32) as u8,
        field6: x as u32,
    }
}

/// Convert to the i128-backed compact form (same preconditions as `to_i128`).
pub fn to_compact(&self) -> CompactKey {
    CompactKey(self.to_i128())
}

/// Convert back from the compact form (see `from_i128`).
pub fn from_compact(k: CompactKey) -> Self {
    Self::from_i128(k.0)
}

/// The key immediately following this one, i.e. `self.add(1)`.
pub const fn next(&self) -> Key {
    self.add(1)
}

/// Add `x` to the key, treating the six fields as one big-endian number: `x` is added to
/// field6 and any carry is propagated one field at a time up to field1.
/// Panics (via `assert!`) if the carry runs out of field1.
pub const fn add(&self, x: u32) -> Key {
    let mut key = *self;

    let r = key.field6.overflowing_add(x);
    key.field6 = r.0;
    // `r.1` is the carry-out flag; each nested level only runs when the level below overflowed.
    if r.1 {
        let r = key.field5.overflowing_add(1);
        key.field5 = r.0;
        if r.1 {
            let r = key.field4.overflowing_add(1);
            key.field4 = r.0;
            if r.1 {
                let r = key.field3.overflowing_add(1);
                key.field3 = r.0;
                if r.1 {
                    let r = key.field2.overflowing_add(1);
                    key.field2 = r.0;
                    if r.1 {
                        let r = key.field1.overflowing_add(1);
                        key.field1 = r.0;
                        assert!(!r.1);
                    }
                }
            }
        }
    }
    key
}

/// Convert a 18B slice to a key. This function should not be used for 16B metadata keys because `field2` is handled differently.
/// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys). There are some restrictions on `field2`,
/// and therefore not all 18B slices are valid page server keys.
pub fn from_slice(b: &[u8]) -> Self { Key { field1: b[0], field2: u32::from_be_bytes(b[1..5].try_into().unwrap()), field3: u32::from_be_bytes(b[5..9].try_into().unwrap()), field4: u32::from_be_bytes(b[9..13].try_into().unwrap()), field5: b[13], field6: u32::from_be_bytes(b[14..18].try_into().unwrap()), } } /// Convert a key to a 18B slice. This function should not be used for getting a 16B metadata key because `field2` is handled differently. /// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys). pub fn write_to_byte_slice(&self, buf: &mut [u8]) { buf[0] = self.field1; BE::write_u32(&mut buf[1..5], self.field2); BE::write_u32(&mut buf[5..9], self.field3); BE::write_u32(&mut buf[9..13], self.field4); buf[13] = self.field5; BE::write_u32(&mut buf[14..18], self.field6); } } impl CompactKey { pub fn raw(&self) -> i128 { self.0 } } impl From for CompactKey { fn from(value: i128) -> Self { Self(value) } } impl fmt::Display for Key { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "{:02X}{:08X}{:08X}{:08X}{:02X}{:08X}", self.field1, self.field2, self.field3, self.field4, self.field5, self.field6 ) } } impl fmt::Display for CompactKey { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let k = Key::from_compact(*self); k.fmt(f) } } impl Key { pub const MIN: Key = Key { field1: u8::MIN, field2: u32::MIN, field3: u32::MIN, field4: u32::MIN, field5: u8::MIN, field6: u32::MIN, }; pub const MAX: Key = Key { field1: u8::MAX, field2: u32::MAX, field3: u32::MAX, field4: u32::MAX, field5: u8::MAX, field6: u32::MAX, }; pub fn from_hex(s: &str) -> Result { if s.len() != 36 { bail!("parse error"); } Ok(Key { field1: u8::from_str_radix(&s[0..2], 16)?, field2: u32::from_str_radix(&s[2..10], 16)?, field3: u32::from_str_radix(&s[10..18], 16)?, field4: u32::from_str_radix(&s[18..26], 16)?, field5: u8::from_str_radix(&s[26..28], 16)?, field6: u32::from_str_radix(&s[28..36], 16)?, }) } } // Layout of the Key address space // // The 
// Key struct, used to address the underlying key-value store, consists of
// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
// all the data and metadata keys into those 18 bytes.
//
// Principles for the mapping:
//
// - Things that are often accessed or modified together, should be close to
//   each other in the key space. For example, if a relation is extended by one
//   block, we create a new key-value pair for the block data, and update the
//   relation size entry. Because of that, the RelSize key comes after all the
//   RelBlocks of a relation: the RelSize and the last RelBlock are always next
//   to each other.
//
// The key space is divided into four major sections, identified by the first
// byte, and they form a hierarchy:
//
// 00 Relation data and metadata
//
//   DbDir    () -> (dbnode, spcnode)
//   Filenodemap
//   RelDir   -> relnode forknum
//       RelBlocks
//       RelSize
//
// 01 SLRUs
//
//   SlruDir  kind
//   SlruSegBlocks segno
//   SlruSegSize
//
// 02 pg_twophase
//
// 03 misc
//    Controlfile
//    checkpoint
//    pg_version
//
// 04 aux files
//
// Below is a full list of the keyspace allocation:
//
// DbDir:
// 00 00000000 00000000 00000000 00   00000000
//
// Filenodemap:
// 00 SPCNODE  DBNODE   00000000 00   00000000
//
// RelDir:
// 00 SPCNODE  DBNODE   00000000 00   00000001 (Postgres never uses relfilenode 0)
//
// RelBlock:
// 00 SPCNODE  DBNODE   RELNODE  FORK BLKNUM
//
// RelSize:
// 00 SPCNODE  DBNODE   RELNODE  FORK FFFFFFFF
//
// SlruDir:
// 01 kind     00000000 00000000 00   00000000
//
// SlruSegBlock:
// 01 kind     00000001 SEGNO    00   BLKNUM
//
// SlruSegSize:
// 01 kind     00000001 SEGNO    00   FFFFFFFF
//
// TwoPhaseDir:
// 02 00000000 00000000 00000000 00   00000000
//
// TwoPhaseFile:
//
// 02 00000000 00000000 00XXXXXX XX   XXXXXXXX
//
//                        \______XID_________/
//
// The 64-bit XID is stored a little awkwardly in field6, field5 and
// field4.
// PostgreSQL v16 and below only stored a 32-bit XID, which
// fit completely in field6, but starting with PostgreSQL v17, a full
// 64-bit XID is used. Most pageserver code that accesses
// TwoPhaseFiles now deals with 64-bit XIDs even on v16, the high bits
// are just unused.
//
// ControlFile:
// 03 00000000 00000000 00000000 00   00000000
//
// Checkpoint:
// 03 00000000 00000000 00000000 00   00000001
//
// AuxFiles:
// 03 00000000 00000000 00000000 00   00000002
//

//-- Section 01: relation data and metadata

// NOTE(review): the `Range` return types in this chunk have no generic argument; it looks
// lost in extraction (the bodies build `Key..Key`) -- confirm against upstream.

/// The all-zero key.
pub const DBDIR_KEY: Key = Key {
    field1: 0x00,
    field2: 0,
    field3: 0,
    field4: 0,
    field5: 0,
    field6: 0,
};

/// Range covering the 0x00-prefixed keys of one (spcnode, dbnode) pair; the end key itself
/// (all of field4..field6 at their maximum) is excluded.
#[inline(always)]
pub fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range {
    Key {
        field1: 0x00,
        field2: spcnode,
        field3: dbnode,
        field4: 0,
        field5: 0,
        field6: 0,
    }..Key {
        field1: 0x00,
        field2: spcnode,
        field3: dbnode,
        field4: 0xffffffff,
        field5: 0xff,
        field6: 0xffffffff,
    }
}

/// Key with field4..field6 all zero for the given (spcnode, dbnode).
#[inline(always)]
pub fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
    Key {
        field1: 0x00,
        field2: spcnode,
        field3: dbnode,
        field4: 0,
        field5: 0,
        field6: 0,
    }
}

/// Same as `relmap_file_key` but with field6 == 1.
#[inline(always)]
pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
    Key {
        field1: 0x00,
        field2: spcnode,
        field3: dbnode,
        field4: 0,
        field5: 0,
        field6: 1,
    }
}

/// Key under `REL_DIR_KEY_PREFIX` for one (spcnode, dbnode, relnode, forknum); field6 is always 1.
#[inline(always)]
pub fn rel_tag_sparse_key(spcnode: Oid, dbnode: Oid, relnode: Oid, forknum: u8) -> Key {
    Key {
        field1: REL_DIR_KEY_PREFIX,
        field2: spcnode,
        field3: dbnode,
        field4: relnode,
        field5: forknum,
        field6: 1,
    }
}

/// Range of `rel_tag_sparse_key` keys for one (spcnode, dbnode).
pub fn rel_tag_sparse_key_range(spcnode: Oid, dbnode: Oid) -> Range {
    Key {
        field1: REL_DIR_KEY_PREFIX,
        field2: spcnode,
        field3: dbnode,
        field4: 0,
        field5: 0,
        field6: 0,
    }..Key {
        field1: REL_DIR_KEY_PREFIX,
        field2: spcnode,
        field3: dbnode,
        field4: u32::MAX,
        field5: u8::MAX,
        field6: u32::MAX,
    } // it's fine to exclude the last key b/c we only use field6 == 1
}

/// Key of block `blknum` of relation `rel`.
#[inline(always)]
pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
    Key {
        field1: 0x00,
        field2: rel.spcnode,
        field3: rel.dbnode,
        field4: rel.relnode,
        field5:
rel.forknum,
        field6: blknum,
    }
}

/// Key of the size entry of relation `rel`: same as its block keys but with field6 == 0xffff_ffff.
#[inline(always)]
pub fn rel_size_to_key(rel: RelTag) -> Key {
    Key {
        field1: 0x00,
        field2: rel.spcnode,
        field3: rel.dbnode,
        field4: rel.relnode,
        field5: rel.forknum,
        field6: 0xffff_ffff,
    }
}

impl Key {
    /// True for 0x00-prefixed keys whose field6 is `u32::MAX` (the shape `rel_size_to_key` produces).
    #[inline(always)]
    pub fn is_rel_size_key(&self) -> bool {
        self.field1 == 0 && self.field6 == u32::MAX
    }
}

/// Range from block 0 of `rel` up to (excluding) block 0 of the next fork number, so it
/// includes the size key of `rel`.
// NOTE(review): `rel.forknum + 1` would overflow for forknum == 255 -- presumably fork
// numbers are always small; confirm.
// NOTE(review): the `Range` / `Option>` return types in this chunk have lost their generic
// arguments in extraction -- confirm against upstream.
#[inline(always)]
pub fn rel_key_range(rel: RelTag) -> Range {
    Key {
        field1: 0x00,
        field2: rel.spcnode,
        field3: rel.dbnode,
        field4: rel.relnode,
        field5: rel.forknum,
        field6: 0,
    }..Key {
        field1: 0x00,
        field2: rel.spcnode,
        field3: rel.dbnode,
        field4: rel.relnode,
        field5: rel.forknum + 1,
        field6: 0,
    }
}

//-- Section 02: SLRUs

/// Directory key of an SLRU kind: prefix 0x01, the kind in field2, everything else zero.
#[inline(always)]
pub fn slru_dir_to_key(kind: SlruKind) -> Key {
    Key {
        field1: 0x01,
        field2: match kind {
            SlruKind::Clog => 0x00,
            SlruKind::MultiXactMembers => 0x01,
            SlruKind::MultiXactOffsets => 0x02,
        },
        field3: 0,
        field4: 0,
        field5: 0,
        field6: 0,
    }
}

/// Inverse of `slru_dir_to_key`. Returns `None` when the key does not have the SLRU directory
/// shape, and `Some(Err(field2))` when it does but field2 is not a known kind.
#[inline(always)]
pub fn slru_dir_kind(key: &Key) -> Option> {
    if key.field1 == 0x01
        && key.field3 == 0
        && key.field4 == 0
        && key.field5 == 0
        && key.field6 == 0
    {
        match key.field2 {
            0 => Some(Ok(SlruKind::Clog)),
            1 => Some(Ok(SlruKind::MultiXactMembers)),
            2 => Some(Ok(SlruKind::MultiXactOffsets)),
            x => Some(Err(x)),
        }
    } else {
        None
    }
}

/// Key of block `blknum` in segment `segno` of an SLRU kind (field3 is fixed to 1).
#[inline(always)]
pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
    Key {
        field1: 0x01,
        field2: match kind {
            SlruKind::Clog => 0x00,
            SlruKind::MultiXactMembers => 0x01,
            SlruKind::MultiXactOffsets => 0x02,
        },
        field3: 1,
        field4: segno,
        field5: 0,
        field6: blknum,
    }
}

/// Key of the size entry of segment `segno`: same as its block keys but with field6 == 0xffff_ffff.
#[inline(always)]
pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
    Key {
        field1: 0x01,
        field2: match kind {
            SlruKind::Clog => 0x00,
            SlruKind::MultiXactMembers => 0x01,
            SlruKind::MultiXactOffsets => 0x02,
        },
        field3: 1,
        field4: segno,
        field5: 0,
        field6: 0xffff_ffff,
    }
}

impl Key {
    /// True for keys of the shape produced by `slru_segment_size_to_key`.
    pub fn is_slru_segment_size_key(&self) -> bool {
        self.field1 == 0x01
            && self.field2 < 0x03
            && self.field3 == 0x01
            && self.field5 == 0
            &&
self.field6 == u32::MAX
    }

    /// True when `slru_dir_kind` recognises the key's shape (even if the kind is unknown).
    pub fn is_slru_dir_key(&self) -> bool {
        slru_dir_kind(self).is_some()
    }
}

/// Range from block 0 of segment `segno` up to (excluding) the key with field5 == 1, so it
/// includes the segment's size key.
// NOTE(review): the `Range` return types in this chunk have no generic argument; it looks
// lost in extraction (the bodies build `Key..Key`) -- confirm against upstream.
#[inline(always)]
pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range {
    let field2 = match kind {
        SlruKind::Clog => 0x00,
        SlruKind::MultiXactMembers => 0x01,
        SlruKind::MultiXactOffsets => 0x02,
    };

    Key {
        field1: 0x01,
        field2,
        field3: 1,
        field4: segno,
        field5: 0,
        field6: 0,
    }..Key {
        field1: 0x01,
        field2,
        field3: 1,
        field4: segno,
        field5: 1,
        field6: 0,
    }
}

//-- Section 03: pg_twophase

pub const TWOPHASEDIR_KEY: Key = Key {
    field1: 0x02,
    field2: 0,
    field3: 0,
    field4: 0,
    field5: 0,
    field6: 0,
};

/// Key of the two-phase file for `xid`. The 64-bit XID is split across the low fields:
/// bits 40..63 into field4, bits 32..39 into field5, bits 0..31 into field6.
#[inline(always)]
pub fn twophase_file_key(xid: u64) -> Key {
    Key {
        field1: 0x02,
        field2: 0,
        field3: 0,
        field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32,
        field5: ((xid & 0x000000FF00000000) >> 32) as u8,
        field6: (xid & 0x00000000FFFFFFFF) as u32,
    }
}

/// Single-key range for `xid`: from its key up to (excluding) the key of `xid + 1`.
/// If `xid + 1` wraps around, the carry is recorded in field3 of the end key so that the
/// end still sorts after the start.
#[inline(always)]
pub fn twophase_key_range(xid: u64) -> Range {
    // 64-bit XIDs really should not overflow
    let (next_xid, overflowed) = xid.overflowing_add(1);

    Key {
        field1: 0x02,
        field2: 0,
        field3: 0,
        field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32,
        field5: ((xid & 0x000000FF00000000) >> 32) as u8,
        field6: (xid & 0x00000000FFFFFFFF) as u32,
    }..Key {
        field1: 0x02,
        field2: 0,
        field3: u32::from(overflowed),
        field4: ((next_xid & 0xFFFFFF0000000000) >> 40) as u32,
        field5: ((next_xid & 0x000000FF00000000) >> 32) as u8,
        field6: (next_xid & 0x00000000FFFFFFFF) as u32,
    }
}

//-- Section 03: Control file
// The three constants below share prefix 0x03 and differ only in field6 (0, 1, 2).
pub const CONTROLFILE_KEY: Key = Key {
    field1: 0x03,
    field2: 0,
    field3: 0,
    field4: 0,
    field5: 0,
    field6: 0,
};

pub const CHECKPOINT_KEY: Key = Key {
    field1: 0x03,
    field2: 0,
    field3: 0,
    field4: 0,
    field5: 0,
    field6: 1,
};

pub const AUX_FILES_KEY: Key = Key {
    field1: 0x03,
    field2: 0,
    field3: 0,
    field4: 0,
    field5: 0,
    field6: 2,
};

/// Key for a replication origin: `REPL_ORIGIN_KEY_PREFIX` with the origin id in field6.
#[inline(always)]
pub fn repl_origin_key(origin_id: RepOriginId) -> Key {
    Key {
        field1: REPL_ORIGIN_KEY_PREFIX,
        field2: 0,
        field3: 0,
        field4: 0,
        field5: 0,
        field6: origin_id as u32,
    }
}

/// Get the
/// range of replorigin keys.
// The end bound has field6 == 0x10000, i.e. the range covers field6 values 0..=0xFFFF.
// NOTE(review): the `Range` types in this chunk have no generic argument; it looks lost in
// extraction (the values are `Key..Key`) -- confirm against upstream.
pub fn repl_origin_key_range() -> Range {
    Key {
        field1: REPL_ORIGIN_KEY_PREFIX,
        field2: 0,
        field3: 0,
        field4: 0,
        field5: 0,
        field6: 0,
    }..Key {
        field1: REPL_ORIGIN_KEY_PREFIX,
        field2: 0,
        field3: 0,
        field4: 0,
        field5: 0,
        field6: 0x10000,
    }
}

// Reverse mappings for a few Keys.
// These are needed by WAL redo manager.

/// Non inherited range for vectored get.
// Contains exactly one key: AUX_FILES_KEY.
pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next();
/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
pub const SPARSE_RANGE: Range = Key::metadata_key_range();

impl Key {
    // AUX_FILES currently stores only data for logical replication (slots etc), and
    // we don't preserve these on a branch because safekeepers can't follow timeline
    // switch (and generally it likely should be optional), so ignore these.
    //
    // Sparse (metadata-range) keys are classified by `is_inherited_sparse_key`; every other
    // key is inherited unless it falls inside NON_INHERITED_RANGE.
    #[inline(always)]
    pub fn is_inherited_key(self) -> bool {
        if self.is_sparse() {
            self.is_inherited_sparse_key()
        } else {
            !NON_INHERITED_RANGE.contains(&self)
        }
    }

    /// True when field1 lies in `[METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX)`.
    #[inline(always)]
    pub fn is_sparse(self) -> bool {
        self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX
    }

    /// Check if the key belongs to the inherited keyspace.
fn is_inherited_sparse_key(self) -> bool {
    // Only meaningful for sparse keys; of those, only the relation-size prefix is inherited.
    debug_assert!(self.is_sparse());
    self.field1 == RELATION_SIZE_PREFIX
}

/// Range spanning the AUX and ReplOrigin prefixes (from `AUX_KEY_PREFIX` up to, but excluding,
/// `REPL_ORIGIN_KEY_PREFIX + 1`).
// NOTE(review): the `Range` return type has no generic argument; it looks lost in
// extraction (the body builds `Key..Key`) -- confirm against upstream.
pub const fn sparse_non_inherited_keyspace() -> Range {
    // The two keys are adjacent; if we will have non-adjacent keys in the future, we should return a keyspace
    const_assert!(AUX_KEY_PREFIX + 1 == REPL_ORIGIN_KEY_PREFIX);
    Key {
        field1: AUX_KEY_PREFIX,
        field2: 0,
        field3: 0,
        field4: 0,
        field5: 0,
        field6: 0,
    }..Key {
        field1: REPL_ORIGIN_KEY_PREFIX + 1,
        field2: 0,
        field3: 0,
        field4: 0,
        field5: 0,
        field6: 0,
    }
}

/// True for 0x00-prefixed keys with a non-zero field4, field5 == `FSM_FORKNUM`, and a field6
/// other than the 0xffffffff size marker.
#[inline(always)]
pub fn is_rel_fsm_block_key(self) -> bool {
    self.field1 == 0x00
        && self.field4 != 0
        && self.field5 == FSM_FORKNUM
        && self.field6 != 0xffffffff
}

/// Same as `is_rel_fsm_block_key`, but for `VISIBILITYMAP_FORKNUM`.
#[inline(always)]
pub fn is_rel_vm_block_key(self) -> bool {
    self.field1 == 0x00
        && self.field4 != 0
        && self.field5 == VISIBILITYMAP_FORKNUM
        && self.field6 != 0xffffffff
}

/// Split a 0x01-prefixed key into (kind, segno, blknum) taken from field2, field4 and field6.
/// Errors if the prefix is not 0x01 or field2 is not a known SLRU kind; field3 and field5
/// are not checked here.
#[inline(always)]
pub fn to_slru_block(self) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
    Ok(match self.field1 {
        0x01 => {
            let kind = match self.field2 {
                0x00 => SlruKind::Clog,
                0x01 => SlruKind::MultiXactMembers,
                0x02 => SlruKind::MultiXactOffsets,
                _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", self.field2),
            };
            let segno = self.field4;
            let blknum = self.field6;

            (kind, segno, blknum)
        }
        _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
    })
}

#[inline(always)]
pub fn is_slru_block_key(self) -> bool {
    self.field1 == 0x01                // SLRU-related
        && self.field3 == 0x00000001   // but not SlruDir
        && self.field6 != 0xffffffff // and not SlruSegSize
}

/// True for 0x00-prefixed keys with a non-zero field4 and a field6 other than the
/// 0xffffffff size marker.
#[inline(always)]
pub fn is_rel_block_key(&self) -> bool {
    self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff
}

/// `is_rel_block_key` and field4 equals `rel`.
#[inline(always)]
pub fn is_rel_block_of_rel(&self, rel: Oid) -> bool {
    self.is_rel_block_key() && self.field4 == rel
}

/// True for keys of the shape produced by `rel_dir_to_key`, additionally requiring that
/// field2 and field3 are both non-zero.
#[inline(always)]
pub fn is_rel_dir_key(&self) -> bool {
    self.field1 == 0x00
        && self.field2 != 0
        && self.field3 != 0
        && self.field4 == 0
        && self.field5 == 0
        && self.field6 == 1
}

#[inline(always)]
pub fn
is_aux_file_key(&self) -> bool {
    self.field1 == AUX_KEY_PREFIX
}

/// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
///
/// Only field1 is checked: any 0x00-prefixed key is split into a `RelTag` (field2..field5)
/// and a block number (field6); other prefixes yield `ToRelBlockError`.
#[inline(always)]
pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> {
    Ok(match self.field1 {
        0x00 => (
            RelTag {
                spcnode: self.field2,
                dbnode: self.field3,
                relnode: self.field4,
                forknum: self.field5,
            },
            self.field6,
        ),
        _ => return Err(ToRelBlockError(self.field1)),
    })
}
}

impl std::str::FromStr for Key {
    type Err = anyhow::Error;

    // Delegates to `Key::from_hex`.
    // NOTE(review): `std::result::Result` below has no generic arguments; they look lost in
    // extraction -- confirm against upstream.
    fn from_str(s: &str) -> std::result::Result {
        Self::from_hex(s)
    }
}

/// Error returned by `Key::to_rel_block`; carries the offending `field1` value.
#[derive(Debug)]
pub struct ToRelBlockError(u8);

impl fmt::Display for ToRelBlockError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "unexpected value kind 0x{:02x}", self.0)
    }
}

impl std::error::Error for ToRelBlockError {}

#[cfg(test)]
mod tests {
    use std::str::FromStr;

    use rand::{Rng, SeedableRng};

    use super::AUX_KEY_PREFIX;
    use crate::key::{Key, is_metadata_key_slice};

    /// A key formatted with `Display` must parse back to the same key.
    #[test]
    fn display_fromstr_bijection() {
        // Fixed seed keeps the test deterministic.
        let mut rng = rand::rngs::StdRng::seed_from_u64(42);

        let key = Key {
            field1: rng.random(),
            field2: rng.random(),
            field3: rng.random(),
            field4: rng.random(),
            field5: rng.random(),
            field6: rng.random(),
        };

        assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
    }

    /// A 16-byte metadata key must survive the round trip through `Key` and `to_i128`.
    #[test]
    fn test_metadata_keys() {
        let mut metadata_key = vec![AUX_KEY_PREFIX];
        metadata_key.extend_from_slice(&[0xFF; 15]);
        let encoded_key = Key::from_metadata_key(&metadata_key);
        let output_key = encoded_key.to_i128().to_be_bytes();
        assert_eq!(metadata_key, output_key);
        assert!(encoded_key.is_metadata_key());
        assert!(is_metadata_key_slice(&metadata_key));
    }

    #[test]
    fn test_possible_largest_key() {
        Key::from_i128(0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF);
        // TODO: put this key into the system and see if anything breaks.
    }
}

================================================
FILE: libs/pageserver_api/src/keyspace.rs
================================================
use std::ops::Range;

use itertools::Itertools;

use crate::key::Key;
use crate::shard::{ShardCount, ShardIdentity};

///
/// Represents a set of Keys, in a compact form.
///
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct KeySpace {
    /// Contiguous ranges of keys that belong to the key space. In key order,
    /// and with no overlap.
    // NOTE(review): `Vec>` here, and the bare `Range` types further down, have lost their
    // generic arguments in extraction -- confirm against upstream.
    pub ranges: Vec>,
}

impl std::fmt::Display for KeySpace {
    /// Formats as `[start..end,start..end,]` (note the trailing comma after each range).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "[")?;
        for range in &self.ranges {
            write!(f, "{}..{},", range.start, range.end)?;
        }
        write!(f, "]")
    }
}

/// A wrapper type for sparse keyspaces.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct SparseKeySpace(pub KeySpace);

/// Represents a contiguous half-open range of the keyspace, masked according to a particular
/// ShardNumber's stripes: within this range of keys, only some "belong" to the current
/// shard.
///
/// When we iterate over keys within this object, we will skip any keys that don't belong
/// to this shard.
///
/// The start + end keys may not belong to the shard: these specify where layer files should
/// start + end, but we will never actually read/write those keys.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct ShardedRange<'a> {
    pub shard_identity: &'a ShardIdentity,
    pub range: Range,
}

// Calculate the size of a range within the blocks of the same relation, or spanning only the
// top page in the previous relation's space.
//
// When the range starts on a 0xffffffff (logical size) key, the length is that one key plus
// the `end.field6` keys that follow it; otherwise it is the plain difference of field6.
pub fn contiguous_range_len(range: &Range) -> u32 {
    debug_assert!(is_contiguous_range(range));
    if range.start.field6 == 0xffffffff {
        range.end.field6 + 1
    } else {
        range.end.field6 - range.start.field6
    }
}

/// Return true if this key range includes only keys in the same relation's data blocks, or
/// just spanning one relation and the logical size (0xffffffff) block of the relation before it.
///
/// Contiguous in this context means we know the keys are in use _somewhere_, but it might not
/// be on our shard. Later in ShardedRange we do the extra work to figure out how much
/// of a given contiguous range is present on one shard.
///
/// This matters, because:
/// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse.
/// - Within such ranges, we may calculate distances using simple subtraction of field6.
pub fn is_contiguous_range(range: &Range<Key>) -> bool {
    range.start.field1 == range.end.field1
        && range.start.field2 == range.end.field2
        && range.start.field3 == range.end.field3
        && range.start.field4 == range.end.field4
        && (range.start.field5 == range.end.field5
            || (range.start.field6 == 0xffffffff
                && range.start.field5 + 1 == range.end.field5))
}

impl<'a> ShardedRange<'a> {
    /// Wrap `range` so that size calculations on it respect `shard_identity`'s striping.
    pub fn new(range: Range<Key>, shard_identity: &'a ShardIdentity) -> Self {
        Self {
            shard_identity,
            range,
        }
    }

    /// Break up this range into chunks, each of which has at least one local key in it if the
    /// total range has at least one local key.
    ///
    /// Each returned tuple is `(shard-local block count, key range)`. A range that is not
    /// contiguous (see `is_contiguous_range`) is returned as one fragment of size `u32::MAX`.
    /// A `target_nblocks` of zero is treated as one.
    pub fn fragment(self, target_nblocks: u32) -> Vec<(u32, Range<Key>)> {
        // Optimization for single-key case (e.g. logical size keys)
        if self.range.end == self.range.start.add(1) {
            return vec![(
                if self.shard_identity.is_key_disposable(&self.range.start) {
                    0
                } else {
                    1
                },
                self.range,
            )];
        }

        if !is_contiguous_range(&self.range) {
            // Ranges that span relations are not fragmented. We only get these ranges as a result
            // of operations that act on existing layers, so we trust that the existing range is
            // reasonably small.
            return vec![(u32::MAX, self.range)];
        }

        // With a target of zero, a shard-local cursor would advance by min(_, 0) == 0 blocks
        // on every iteration below: the loop would never terminate and `fragments` would grow
        // without bound. Clamp to the smallest target that still makes progress.
        let target_nblocks = std::cmp::max(target_nblocks, 1);

        let mut fragments: Vec<(u32, Range<Key>)> = Vec::new();

        let mut cursor = self.range.start;
        while cursor < self.range.end {
            let advance_by = self.distance_to_next_boundary(cursor);
            let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor);

            // If the previous fragment is undersized, then we seek to consume enough
            // blocks to complete it.
            let (want_blocks, merge_last_fragment) = match fragments.last_mut() {
                Some(frag) if frag.0 < target_nblocks => (target_nblocks - frag.0, Some(frag)),
                Some(frag) => {
                    // Prev block is complete, want the full number.
                    (
                        target_nblocks,
                        if is_fragment_disposable {
                            // If this current range will be empty (not shard-local data), we will merge into previous
                            Some(frag)
                        } else {
                            None
                        },
                    )
                }
                None => {
                    // First iteration, want the full number
                    (target_nblocks, None)
                }
            };

            // Disposable stretches carry no local blocks, so they are consumed whole; local
            // stretches are capped at what the current fragment still wants.
            let advance_by = if is_fragment_disposable {
                advance_by
            } else {
                std::cmp::min(advance_by, want_blocks)
            };

            let next_cursor = cursor.add(advance_by);

            let this_frag = (
                if is_fragment_disposable {
                    0
                } else {
                    advance_by
                },
                cursor..next_cursor,
            );
            cursor = next_cursor;

            if let Some(last_fragment) = merge_last_fragment {
                // Previous fragment was short or this one is empty, merge into it
                last_fragment.0 += this_frag.0;
                last_fragment.1.end = this_frag.1.end;
            } else {
                fragments.push(this_frag);
            }
        }

        fragments
    }

    /// Estimate the physical pages that are within this range, on this shard. This returns
    /// u32::MAX if the range spans relations: this return value should be interpreted as "large".
    pub fn page_count(&self) -> u32 {
        // Special cases for single keys like logical sizes
        if self.range.end == self.range.start.add(1) {
            return if self.shard_identity.is_key_disposable(&self.range.start) {
                0
            } else {
                1
            };
        }

        // We can only do an authentic calculation of contiguous key ranges
        if !is_contiguous_range(&self.range) {
            return u32::MAX;
        }

        // Special case for single sharded tenants: our logical and physical sizes are the same
        if self.shard_identity.count < ShardCount::new(2) {
            return contiguous_range_len(&self.range);
        }

        // Normal path: step through stripes and part-stripes in the range, evaluate whether each one belongs
        // to Self, and add the stripe's block count to our total if so.
        let mut result: u64 = 0;
        let mut cursor = self.range.start;
        while cursor < self.range.end {
            // Count up to the next stripe_size boundary or end of range
            let advance_by = self.distance_to_next_boundary(cursor);

            // If the blocks in this stripe belong to us, add them to our count
            if !self.shard_identity.is_key_disposable(&cursor) {
                result += advance_by as u64;
            }

            cursor = cursor.add(advance_by);
        }

        // Saturate rather than wrap if the total does not fit in the return type.
        if result > u32::MAX as u64 {
            u32::MAX
        } else {
            result as u32
        }
    }

    /// Advance the cursor to the next potential fragment boundary: this is either
    /// a stripe boundary, or the end of the range.
    fn distance_to_next_boundary(&self, cursor: Key) -> u32 {
        let distance_to_range_end = contiguous_range_len(&(cursor..self.range.end));

        if self.shard_identity.count < ShardCount::new(2) {
            // Optimization: don't bother stepping through stripes if the tenant isn't sharded.
            return distance_to_range_end;
        }

        if cursor.field6 == 0xffffffff {
            // We are wrapping from one relation's logical size to the next relation's first data block
            return 1;
        }

        let stripe_index = cursor.field6 / self.shard_identity.stripe_size.0;
        let stripe_remainder = self.shard_identity.stripe_size.0
            - (cursor.field6 - stripe_index * self.shard_identity.stripe_size.0);

        if cfg!(debug_assertions) {
            // We should never overflow field5 and field6 -- our callers check this earlier
            // and would have returned their u32::MAX cases if the input range violated this.
            let next_cursor = cursor.add(stripe_remainder);
            debug_assert!(
                next_cursor.field1 == cursor.field1
                    && next_cursor.field2 == cursor.field2
                    && next_cursor.field3 == cursor.field3
                    && next_cursor.field4 == cursor.field4
                    && next_cursor.field5 == cursor.field5
            )
        }

        std::cmp::min(stripe_remainder, distance_to_range_end)
    }

    /// Whereas `page_count` estimates the number of pages physically in this range on this shard,
    /// this function simply calculates the number of pages in the space, without accounting for those
    /// pages that would not actually be stored on this node.
///
    /// Don't use this function in code that works with physical entities like layer files.
    pub fn raw_size(range: &Range<Key>) -> u32 {
        if is_contiguous_range(range) {
            contiguous_range_len(range)
        } else {
            u32::MAX
        }
    }
}

impl KeySpace {
    /// Create a key space with a single range.
    pub fn single(key_range: Range<Key>) -> Self {
        Self {
            ranges: vec![key_range],
        }
    }

    /// Partition a key space into chunks of roughly 'target_size' bytes
    /// in each partition.
    ///
    /// Sizes are counted in blocks of `block_size` bytes that are local to `shard_identity`.
    /// Panics if `block_size` is zero (integer division).
    pub fn partition(
        &self,
        shard_identity: &ShardIdentity,
        target_size: u64,
        block_size: u64,
    ) -> KeyPartitioning {
        let target_nblocks = (target_size / block_size) as u32;
        let mut parts = Vec::new();
        let mut current_part = Vec::new();
        let mut current_part_size: usize = 0;
        for range in &self.ranges {
            // While doing partitioning, wrap the range in ShardedRange so that our size calculations
            // will respect shard striping rather than assuming all keys within a range are present.
            let range = ShardedRange::new(range.clone(), shard_identity);

            // Chunk up the range into parts that each contain up to target_size local blocks
            for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) {
                // If appending the next contiguous range in the keyspace to the current
                // partition would cause it to be too large, and our current partition
                // covers at least one block that is physically present in this shard,
                // then start a new partition
                if current_part_size + frag_on_shard_size as usize > target_nblocks as usize
                    && current_part_size > 0
                {
                    parts.push(KeySpace {
                        ranges: current_part,
                    });
                    current_part = Vec::new();
                    current_part_size = 0;
                }
                current_part.push(frag_range.start..frag_range.end);
                current_part_size += frag_on_shard_size as usize;
            }
        }

        // add last partition that wasn't full yet.
        if !current_part.is_empty() {
            parts.push(KeySpace {
                ranges: current_part,
            });
        }

        KeyPartitioning { parts }
    }

    /// True when the raw (pre-sharding) size of the key space is zero.
    pub fn is_empty(&self) -> bool {
        self.total_raw_size() == 0
    }

    /// Merge another keyspace into the current one.
    /// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`.
    pub fn merge(&mut self, other: &KeySpace) {
        // Both inputs are sorted by start, so a sorted merge keeps the combined stream ordered.
        let all_ranges = self
            .ranges
            .iter()
            .merge_by(other.ranges.iter(), |lhs, rhs| lhs.start < rhs.start);

        let mut accum = KeySpaceAccum::new();
        let mut prev: Option<&Range<Key>> = None;
        for range in all_ranges {
            if let Some(prev) = prev {
                let overlap =
                    std::cmp::max(range.start, prev.start) < std::cmp::min(range.end, prev.end);
                assert!(
                    !overlap,
                    "Attempt to merge ovelapping keyspaces: {prev:?} overlaps {range:?}"
                );
            }

            accum.add_range(range.clone());
            prev = Some(range);
        }

        self.ranges = accum.to_keyspace().ranges;
    }

    /// Remove all keys in `other` from `self`.
    /// This can involve splitting or removing of existing ranges.
    /// Returns the removed keyspace
    pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace {
        let (self_start, self_end) = match (self.start(), self.end()) {
            (Some(start), Some(end)) => (start, end),
            _ => {
                // self is empty
                return KeySpace::default();
            }
        };

        // Key spaces are sorted by definition, so skip ahead to the first
        // potentially intersecting range. Similarly, ignore ranges that start
        // after the current keyspace ends.
        let other_ranges = other
            .ranges
            .iter()
            .skip_while(|range| self_start >= range.end)
            .take_while(|range| self_end > range.start);

        let mut removed_accum = KeySpaceRandomAccum::new();
        for range in other_ranges {
            // Each pass handles one overlapping range of `self`; keep going until `range`
            // no longer overlaps anything.
            while let Some(overlap_at) = self.overlaps_at(range) {
                let overlapped = self.ranges[overlap_at].clone();

                if overlapped.start < range.start && overlapped.end <= range.end {
                    // Higher part of the range is completely overlapped.
                    removed_accum.add_range(range.start..self.ranges[overlap_at].end);
                    self.ranges[overlap_at].end = range.start;
                }
                if overlapped.start >= range.start && overlapped.end > range.end {
                    // Lower part of the range is completely overlapped.
                    removed_accum.add_range(self.ranges[overlap_at].start..range.end);
                    self.ranges[overlap_at].start = range.end;
                }
                if overlapped.start < range.start && overlapped.end > range.end {
                    // Middle part of the range is overlapped.
                    removed_accum.add_range(range.clone());
                    self.ranges[overlap_at].end = range.start;
                    self.ranges
                        .insert(overlap_at + 1, range.end..overlapped.end);
                }
                if overlapped.start >= range.start && overlapped.end <= range.end {
                    // Whole range is overlapped
                    removed_accum.add_range(self.ranges[overlap_at].clone());
                    self.ranges.remove(overlap_at);
                }
            }
        }

        removed_accum.to_keyspace()
    }

    /// Start of the first range, or `None` if there are no ranges.
    pub fn start(&self) -> Option<Key> {
        self.ranges.first().map(|range| range.start)
    }

    /// End of the last range, or `None` if there are no ranges.
    pub fn end(&self) -> Option<Key> {
        self.ranges.last().map(|range| range.end)
    }

    /// The size of the keyspace in pages, before accounting for sharding
    pub fn total_raw_size(&self) -> usize {
        self.ranges
            .iter()
            .map(|range| ShardedRange::raw_size(range) as usize)
            .sum()
    }

    /// Index of the last range in `self.ranges` that starts before `range.end` and ends after
    /// `range.start`, or `None` if there is no such range.
    fn overlaps_at(&self, range: &Range<Key>) -> Option<usize> {
        // The search yields the position of the first range starting at or after `range.end`;
        // the only candidate that can still overlap is the one just before that position.
        match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
            Ok(0) => None,
            Err(0) => None,
            Ok(index) if self.ranges[index - 1].end > range.start => Some(index - 1),
            Err(index) if self.ranges[index - 1].end > range.start => Some(index - 1),
            _ => None,
        }
    }

    ///
    /// Check if key space contains overlapping range
    ///
    pub fn overlaps(&self, range: &Range<Key>) -> bool {
        self.overlaps_at(range).is_some()
    }

    /// Check if the keyspace contains a key
    pub fn contains(&self, key: &Key) -> bool {
        self.overlaps(&(*key..key.next()))
    }
}

///
/// Represents a partitioning of the key space.
///
/// The only kind of partitioning we do is to partition the key space into
/// partitions that are roughly equal in physical size (see KeySpace::partition).
/// But this data structure could represent any partitioning.
///
#[derive(Clone, Debug, Default)]
pub struct KeyPartitioning {
    pub parts: Vec<KeySpace>,
}

/// Represents a partitioning of the sparse key space.
#[derive(Clone, Debug, Default)]
pub struct SparseKeyPartitioning {
    pub parts: Vec<SparseKeySpace>,
}

impl KeyPartitioning {
    /// Create a partitioning with no parts.
    pub fn new() -> Self {
        KeyPartitioning { parts: Vec::new() }
    }

    /// Convert a key partitioning to a sparse partition.
    pub fn into_sparse(self) -> SparseKeyPartitioning {
        SparseKeyPartitioning {
            parts: self.parts.into_iter().map(SparseKeySpace).collect(),
        }
    }
}

impl SparseKeyPartitioning {
    /// Note: use this function with caution. Attempt to handle a sparse keyspace in the same way as a dense keyspace will
    /// cause long/dead loops.
    pub fn into_dense(self) -> KeyPartitioning {
        KeyPartitioning {
            parts: self.parts.into_iter().map(|x| x.0).collect(),
        }
    }
}

///
/// A helper object, to collect a set of keys and key ranges into a KeySpace
/// object. This takes care of merging adjacent keys and key ranges into
/// contiguous ranges.
///
#[derive(Clone, Debug, Default)]
pub struct KeySpaceAccum {
    // The range currently being extended; flushed into `ranges` when a gap is seen.
    accum: Option<Range<Key>>,

    // Completed ranges, in the order they were added.
    ranges: Vec<Range<Key>>,
    // Running sum of `ShardedRange::raw_size` over every range added.
    size: u64,
}

impl KeySpaceAccum {
    /// Create an empty accumulator.
    pub fn new() -> Self {
        Self {
            accum: None,
            ranges: Vec::new(),
            size: 0,
        }
    }

    /// Add a single key; equivalent to adding the one-key range that contains it.
    #[inline(always)]
    pub fn add_key(&mut self, key: Key) {
        self.add_range(singleton_range(key))
    }

    /// Add a range. It must start at or after the end of the previously added range
    /// (asserted); a range that starts exactly there extends the previous one.
    #[inline(always)]
    pub fn add_range(&mut self, range: Range<Key>) {
        self.size += ShardedRange::raw_size(&range) as u64;

        match self.accum.as_mut() {
            Some(accum) => {
                if range.start == accum.end {
                    accum.end = range.end;
                } else {
                    // TODO: to efficiently support small sharding stripe sizes, we should avoid starting
                    // a new range here if the skipped region was all keys that don't belong on this shard.
// (https://github.com/neondatabase/neon/issues/6247)
                    assert!(range.start > accum.end);
                    self.ranges.push(accum.clone());
                    *accum = range;
                }
            }
            None => self.accum = Some(range),
        }
    }

    /// Finish accumulating and return the collected ranges as a KeySpace.
    pub fn to_keyspace(mut self) -> KeySpace {
        // Flush the range that was still being extended.
        if let Some(accum) = self.accum.take() {
            self.ranges.push(accum);
        }
        KeySpace {
            ranges: self.ranges,
        }
    }

    /// Return everything collected so far as a KeySpace and reset `self` to empty.
    pub fn consume_keyspace(&mut self) -> KeySpace {
        std::mem::take(self).to_keyspace()
    }

    // The total number of keys in this object, ignoring any sharding effects that might cause some of
    // the keys to be omitted in storage on this shard.
    pub fn raw_size(&self) -> u64 {
        self.size
    }
}

///
/// A helper object, to collect a set of keys and key ranges into a KeySpace
/// object. Key ranges may be inserted in any order and can overlap.
///
#[derive(Clone, Debug, Default)]
pub struct KeySpaceRandomAccum {
    ranges: Vec<Range<Key>>,
}

impl KeySpaceRandomAccum {
    /// Create an empty accumulator.
    pub fn new() -> Self {
        Self { ranges: Vec::new() }
    }

    /// Add a single key; equivalent to adding the one-key range that contains it.
    pub fn add_key(&mut self, key: Key) {
        self.add_range(singleton_range(key))
    }

    /// Add a range. No ordering or overlap checks happen here; ranges are normalized
    /// in `to_keyspace`.
    pub fn add_range(&mut self, range: Range<Key>) {
        self.ranges.push(range);
    }

    /// Add every range of `keyspace`.
    pub fn add_keyspace(&mut self, keyspace: KeySpace) {
        for range in keyspace.ranges {
            self.add_range(range);
        }
    }

    /// Sort the collected ranges by start and coalesce overlapping or adjacent ones.
    pub fn to_keyspace(mut self) -> KeySpace {
        let mut ranges = Vec::new();
        if !self.ranges.is_empty() {
            self.ranges.sort_by_key(|r| r.start);
            // `start..end` is the merged range currently being grown.
            let mut start = self.ranges.first().unwrap().start;
            let mut end = self.ranges.first().unwrap().end;
            for r in self.ranges {
                assert!(r.start >= start);
                if r.start > end {
                    // Gap before `r`: emit the merged range and start a new one.
                    ranges.push(start..end);
                    start = r.start;
                    end = r.end;
                } else if r.end > end {
                    end = r.end;
                }
            }
            ranges.push(start..end);
        }
        KeySpace { ranges }
    }

    /// Return everything collected so far as a KeySpace and reset `self` to empty.
    pub fn consume_keyspace(&mut self) -> KeySpace {
        // Same shape as KeySpaceAccum::consume_keyspace: Default is an empty accumulator.
        std::mem::take(self).to_keyspace()
    }
}

/// The half-open range that contains exactly `key`.
pub fn singleton_range(key: Key) -> Range<Key> {
    key..key.next()
}

#[cfg(test)]
mod tests {
    use std::fmt::Write;

    use rand::{RngCore, SeedableRng};

    use super::*;
    use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber, ShardStripeSize};

    // Helper function to create a key range.
    //
    // Make the tests below less verbose.
    fn kr(irange: Range<i128>) -> Range<Key> {
        Key::from_i128(irange.start)..Key::from_i128(irange.end)
    }

    #[allow(dead_code)]
    fn dump_keyspace(ks: &KeySpace) {
        for r in ks.ranges.iter() {
            println!(" {}..{}", r.start.to_i128(), r.end.to_i128());
        }
    }

    // Panic with a readable dump of both sides if `actual` does not hold exactly `expected`.
    fn assert_ks_eq(actual: &KeySpace, expected: Vec<Range<Key>>) {
        if actual.ranges != expected {
            let mut msg = String::new();

            writeln!(msg, "expected:").unwrap();
            for r in &expected {
                writeln!(msg, " {}..{}", r.start.to_i128(), r.end.to_i128()).unwrap();
            }
            writeln!(msg, "got:").unwrap();
            for r in &actual.ranges {
                writeln!(msg, " {}..{}", r.start.to_i128(), r.end.to_i128()).unwrap();
            }
            panic!("{}", msg);
        }
    }

    #[test]
    fn keyspace_consume() {
        let ranges = vec![kr(0..10), kr(20..35), kr(40..45)];

        let mut accum = KeySpaceAccum::new();
        for range in &ranges {
            accum.add_range(range.clone());
        }

        let expected_size: u64 = ranges
            .iter()
            .map(|r| ShardedRange::raw_size(r) as u64)
            .sum();
        assert_eq!(accum.raw_size(), expected_size);

        assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
        assert_eq!(accum.raw_size(), 0);

        assert_ks_eq(&accum.consume_keyspace(), vec![]);
        assert_eq!(accum.raw_size(), 0);

        for range in &ranges {
            accum.add_range(range.clone());
        }
        assert_ks_eq(&accum.to_keyspace(), ranges);
    }

    #[test]
    fn keyspace_add_range() {
        // two separate ranges
        //
        // #####
        // #####
        let mut ks = KeySpaceRandomAccum::default();
        ks.add_range(kr(0..10));
        ks.add_range(kr(20..30));
        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..10), kr(20..30)]);

        // two separate ranges, added in reverse order
        //
        // #####
        // #####
        let mut ks = KeySpaceRandomAccum::default();
        ks.add_range(kr(20..30));
        ks.add_range(kr(0..10));
        // This scenario previously had no assertion and leaked its ranges into the next one.
        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..10), kr(20..30)]);

        // add range that is adjacent to the end of an existing range
        //
        // #####
        // #####
        let mut ks = KeySpaceRandomAccum::default();
        ks.add_range(kr(0..10));
        ks.add_range(kr(10..30));
        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);

        // add range that is adjacent to the start of an existing range
        //
        // #####
        // #####
        let mut ks = KeySpaceRandomAccum::default();
        ks.add_range(kr(10..30));
        ks.add_range(kr(0..10));
        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);

        // add range that overlaps with the end of an existing range
        //
        // #####
        // #####
        let mut ks = KeySpaceRandomAccum::default();
        ks.add_range(kr(0..10));
        ks.add_range(kr(5..30));
        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);

        // add range that overlaps with the start of an existing range
        //
        // #####
        // #####
        let mut ks = KeySpaceRandomAccum::default();
        ks.add_range(kr(5..30));
        ks.add_range(kr(0..10));
        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);

        // add range that is fully covered by an existing range
        //
        // #########
        // #####
        let mut ks = KeySpaceRandomAccum::default();
        ks.add_range(kr(0..30));
        ks.add_range(kr(10..20));
        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);

        // add range that extends an existing range from both ends
        //
        // #####
        // #########
        let mut ks = KeySpaceRandomAccum::default();
        ks.add_range(kr(10..20));
        ks.add_range(kr(0..30));
        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);

        // add a range that overlaps with two existing ranges, joining them
        //
        // ##### #####
        // #######
        let mut ks = KeySpaceRandomAccum::default();
        ks.add_range(kr(0..10));
        ks.add_range(kr(20..30));
        ks.add_range(kr(5..25));
        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
    }

    #[test]
    fn keyspace_overlaps() {
        let mut ks = KeySpaceRandomAccum::default();
        ks.add_range(kr(10..20));
        ks.add_range(kr(30..40));
        let ks = ks.to_keyspace();

        // ##### #####
        // xxxx
        assert!(!ks.overlaps(&kr(0..5)));

        // ##### #####
        // xxxx
        assert!(!ks.overlaps(&kr(5..9)));

        // ##### #####
        // xxxx
        assert!(!ks.overlaps(&kr(5..10)));

        // ##### #####
        // xxxx
        assert!(ks.overlaps(&kr(5..11)));

        // ##### #####
        // xxxx
        assert!(ks.overlaps(&kr(10..15)));

        // ##### #####
        // xxxx
        assert!(ks.overlaps(&kr(15..20)));

        // ##### #####
        // xxxx
        assert!(ks.overlaps(&kr(15..25)));

        // ##### #####
        // xxxx
        assert!(!ks.overlaps(&kr(22..28)));

        // ##### #####
        // xxxx
assert!(!ks.overlaps(&kr(25..30))); // ##### ##### // xxxx assert!(ks.overlaps(&kr(35..35))); // ##### ##### // xxxx assert!(!ks.overlaps(&kr(40..45))); // ##### ##### // xxxx assert!(!ks.overlaps(&kr(45..50))); // ##### ##### // xxxxxxxxxxx assert!(ks.overlaps(&kr(0..30))); // XXXXX This fails currently! } #[test] fn test_remove_full_overlapps() { let mut key_space1 = KeySpace { ranges: vec![ Key::from_i128(1)..Key::from_i128(4), Key::from_i128(5)..Key::from_i128(8), Key::from_i128(10)..Key::from_i128(12), ], }; let key_space2 = KeySpace { ranges: vec![ Key::from_i128(2)..Key::from_i128(3), Key::from_i128(6)..Key::from_i128(7), Key::from_i128(11)..Key::from_i128(13), ], }; let removed = key_space1.remove_overlapping_with(&key_space2); let removed_expected = KeySpace { ranges: vec![ Key::from_i128(2)..Key::from_i128(3), Key::from_i128(6)..Key::from_i128(7), Key::from_i128(11)..Key::from_i128(12), ], }; assert_eq!(removed, removed_expected); assert_eq!( key_space1.ranges, vec![ Key::from_i128(1)..Key::from_i128(2), Key::from_i128(3)..Key::from_i128(4), Key::from_i128(5)..Key::from_i128(6), Key::from_i128(7)..Key::from_i128(8), Key::from_i128(10)..Key::from_i128(11) ] ); } #[test] fn test_remove_partial_overlaps() { // Test partial ovelaps let mut key_space1 = KeySpace { ranges: vec![ Key::from_i128(1)..Key::from_i128(5), Key::from_i128(7)..Key::from_i128(10), Key::from_i128(12)..Key::from_i128(15), ], }; let key_space2 = KeySpace { ranges: vec![ Key::from_i128(3)..Key::from_i128(6), Key::from_i128(8)..Key::from_i128(11), Key::from_i128(14)..Key::from_i128(17), ], }; let removed = key_space1.remove_overlapping_with(&key_space2); let removed_expected = KeySpace { ranges: vec![ Key::from_i128(3)..Key::from_i128(5), Key::from_i128(8)..Key::from_i128(10), Key::from_i128(14)..Key::from_i128(15), ], }; assert_eq!(removed, removed_expected); assert_eq!( key_space1.ranges, vec![ Key::from_i128(1)..Key::from_i128(3), Key::from_i128(7)..Key::from_i128(8), 
Key::from_i128(12)..Key::from_i128(14), ] ); } #[test] fn test_remove_no_overlaps() { let mut key_space1 = KeySpace { ranges: vec![ Key::from_i128(1)..Key::from_i128(5), Key::from_i128(7)..Key::from_i128(10), Key::from_i128(12)..Key::from_i128(15), ], }; let key_space2 = KeySpace { ranges: vec![ Key::from_i128(6)..Key::from_i128(7), Key::from_i128(11)..Key::from_i128(12), Key::from_i128(15)..Key::from_i128(17), ], }; let removed = key_space1.remove_overlapping_with(&key_space2); let removed_expected = KeySpace::default(); assert_eq!(removed, removed_expected); assert_eq!( key_space1.ranges, vec![ Key::from_i128(1)..Key::from_i128(5), Key::from_i128(7)..Key::from_i128(10), Key::from_i128(12)..Key::from_i128(15), ] ); } #[test] fn test_remove_one_range_overlaps_multiple() { let mut key_space1 = KeySpace { ranges: vec![ Key::from_i128(1)..Key::from_i128(3), Key::from_i128(3)..Key::from_i128(6), Key::from_i128(6)..Key::from_i128(10), Key::from_i128(12)..Key::from_i128(15), Key::from_i128(17)..Key::from_i128(20), Key::from_i128(20)..Key::from_i128(30), Key::from_i128(30)..Key::from_i128(40), ], }; let key_space2 = KeySpace { ranges: vec![Key::from_i128(9)..Key::from_i128(19)], }; let removed = key_space1.remove_overlapping_with(&key_space2); let removed_expected = KeySpace { ranges: vec![ Key::from_i128(9)..Key::from_i128(10), Key::from_i128(12)..Key::from_i128(15), Key::from_i128(17)..Key::from_i128(19), ], }; assert_eq!(removed, removed_expected); assert_eq!( key_space1.ranges, vec![ Key::from_i128(1)..Key::from_i128(3), Key::from_i128(3)..Key::from_i128(6), Key::from_i128(6)..Key::from_i128(9), Key::from_i128(19)..Key::from_i128(20), Key::from_i128(20)..Key::from_i128(30), Key::from_i128(30)..Key::from_i128(40), ] ); } #[test] fn sharded_range_relation_gap() { let shard_identity = ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap(); let range = ShardedRange::new( Range { start: 
Key::from_hex("000000067F00000005000040100300000000").unwrap(), end: Key::from_hex("000000067F00000005000040130000004000").unwrap(), }, &shard_identity, ); // Key range spans relations, expect MAX assert_eq!(range.page_count(), u32::MAX); } #[test] fn shard_identity_keyspaces_single_key() { let shard_identity = ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap(); let range = ShardedRange::new( Range { start: Key::from_hex("000000067f000000010000007000ffffffff").unwrap(), end: Key::from_hex("000000067f00000001000000700100000000").unwrap(), }, &shard_identity, ); // Single-key range on logical size key assert_eq!(range.page_count(), 1); } /// Test the helper that we use to identify ranges which go outside the data blocks of a single relation #[test] fn contiguous_range_check() { assert!(!is_contiguous_range( &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap() ..Key::from_hex("000000067f00000001000004df0100000003").unwrap()) ),); // The ranges goes all the way up to the 0xffffffff, including it: this is // not considered a rel block range because 0xffffffff stores logical sizes, // not blocks. 
assert!(!is_contiguous_range(
            &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
                ..Key::from_hex("000000067f00000001000004df0100000000").unwrap())
        ),);

        // Keys within the normal data region of a relation
        assert!(is_contiguous_range(
            &(Key::from_hex("000000067f00000001000004df0000000000").unwrap()
                ..Key::from_hex("000000067f00000001000004df0000000080").unwrap())
        ),);

        // The logical size key of one forkno, then some blocks in the next
        assert!(is_contiguous_range(
            &(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap()
                ..Key::from_hex("000000067f00000001000004df0100000080").unwrap())
        ),);
    }

    #[test]
    fn shard_identity_keyspaces_forkno_gap() {
        let shard_identity =
            ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();

        let range = ShardedRange::new(
            Range {
                start: Key::from_hex("000000067f00000001000004df00fffffffe").unwrap(),
                end: Key::from_hex("000000067f00000001000004df0100000003").unwrap(),
            },
            &shard_identity,
        );

        // Range spanning the end of one forkno and the start of the next: we do not attempt to
        // calculate a valid size, because we have no way to know if they keys between start
        // and end are actually in use.
        assert_eq!(range.page_count(), u32::MAX);
    }

    #[test]
    fn shard_identity_keyspaces_one_relation() {
        for shard_number in 0..4 {
            let shard_identity = ShardIdentity::new(
                ShardNumber(shard_number),
                ShardCount::new(4),
                DEFAULT_STRIPE_SIZE,
            )
            .unwrap();

            let range = ShardedRange::new(
                Range {
                    start: Key::from_hex("000000067f00000001000000ae0000000000").unwrap(),
                    end: Key::from_hex("000000067f00000001000000ae0000000001").unwrap(),
                },
                &shard_identity,
            );

            // Very simple case: range covering block zero of one relation, where that block maps to shard zero
            if shard_number == 0 {
                assert_eq!(range.page_count(), 1);
            } else {
                // Other shards should perceive the range's size as zero
                assert_eq!(range.page_count(), 0);
            }
        }
    }

    /// Test helper: construct a ShardedRange and call fragment() on it, returning
    /// the total page count in the range and the fragments.
    fn do_fragment(
        range_start: Key,
        range_end: Key,
        shard_identity: &ShardIdentity,
        target_nblocks: u32,
    ) -> (u32, Vec<(u32, Range<Key>)>) {
        let range = ShardedRange::new(
            Range {
                start: range_start,
                end: range_end,
            },
            shard_identity,
        );

        let page_count = range.page_count();
        let fragments = range.fragment(target_nblocks);

        // Invariant: we always get at least one fragment
        assert!(!fragments.is_empty());

        // Invariant: the first/last fragment start/end should equal the input start/end
        assert_eq!(fragments.first().unwrap().1.start, range_start);
        assert_eq!(fragments.last().unwrap().1.end, range_end);

        if page_count > 0 {
            // Invariant: every fragment must contain at least one shard-local page, if the
            // total range contains at least one shard-local page
            let all_nonzero = fragments.iter().all(|f| f.0 > 0);
            if !all_nonzero {
                eprintln!("Found a zero-length fragment: {fragments:?}");
            }
            assert!(all_nonzero);
        } else {
            // A range with no shard-local pages should always be returned as a single fragment
            assert_eq!(fragments, vec![(0, range_start..range_end)]);
        }

        // Invariant: fragments must be ordered and non-overlapping
        let mut last: Option<Range<Key>> = None;
        for frag in &fragments {
            if let Some(last) = last {
                assert!(frag.1.start >= last.end);
                assert!(frag.1.start > last.start);
            }
            last = Some(frag.1.clone())
        }

        // Invariant: fragments respect target_nblocks
        for frag in &fragments {
            assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks);
        }

        (page_count, fragments)
    }

    /// Really simple tests for fragment(), on a range that just contains a single stripe
    /// for a single tenant.
    #[test]
    fn sharded_range_fragment_simple() {
        const SHARD_COUNT: u8 = 4;
        const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;

        let shard_identity = ShardIdentity::new(
            ShardNumber(0),
            ShardCount::new(SHARD_COUNT),
            ShardStripeSize(STRIPE_SIZE),
        )
        .unwrap();

        // A range which we happen to know covers exactly one stripe which belongs to this shard
        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
        let mut input_end = input_start;
        input_end.field6 += STRIPE_SIZE; // field6 is block number

        // Ask for stripe_size blocks, we get the whole stripe
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE),
            (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
        );

        // Ask for more, we still get the whole stripe
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 10 * STRIPE_SIZE),
            (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
        );

        // Ask for target_nblocks of half the stripe size, we get two halves
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE / 2),
            (
                STRIPE_SIZE,
                vec![
                    (
                        STRIPE_SIZE / 2,
                        input_start..input_start.add(STRIPE_SIZE / 2)
                    ),
                    (STRIPE_SIZE / 2, input_start.add(STRIPE_SIZE / 2)..input_end)
                ]
            )
        );
    }

    #[test]
    fn sharded_range_fragment_multi_stripe() {
        const SHARD_COUNT: u8 = 4;
        const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
        const RANGE_SIZE: u32 = SHARD_COUNT as u32 * STRIPE_SIZE;
        let shard_identity = ShardIdentity::new(
            ShardNumber(0),
            ShardCount::new(SHARD_COUNT),
            ShardStripeSize(STRIPE_SIZE),
        )
        .unwrap();

        // A range which covers multiple stripes, exactly one of which belongs to the current shard.
        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
        let mut input_end = input_start;
        input_end.field6 += RANGE_SIZE; // field6 is block number

        // Ask for all the blocks, get a fragment that covers the whole range but reports
        // its size to be just the blocks belonging to our shard.
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, RANGE_SIZE),
            (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
        );

        // Ask for a sub-stripe quantity that results in 3 fragments.
        let limit = STRIPE_SIZE / 3 + 1;
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, limit),
            (
                STRIPE_SIZE,
                vec![
                    (limit, input_start..input_start.add(limit)),
                    (limit, input_start.add(limit)..input_start.add(2 * limit)),
                    (
                        STRIPE_SIZE - 2 * limit,
                        input_start.add(2 * limit)..input_end
                    ),
                ]
            )
        );

        // Try on a range that starts slightly after our owned stripe
        assert_eq!(
            do_fragment(input_start.add(1), input_end, &shard_identity, RANGE_SIZE),
            (
                STRIPE_SIZE - 1,
                vec![(STRIPE_SIZE - 1, input_start.add(1)..input_end)]
            )
        );
    }

    /// Test our calculations work correctly when we start a range from the logical size key of
    /// a previous relation.
/// Fragments a range that starts at a logical-size key and checks the sizes
/// reported for shard 0 and shard 1 of a 4-shard tenant.
#[test]
fn sharded_range_fragment_starting_from_logical_size() {
    const SHARD_COUNT: u8 = 4;
    const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
    // One full stripe per shard.
    const RANGE_SIZE: u32 = SHARD_COUNT as u32 * STRIPE_SIZE;

    let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap();
    let mut input_end = Key::from_hex("000000067f00000001000000ae0100000000").unwrap();
    input_end.field6 += RANGE_SIZE; // field6 is block number

    // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too
    let shard_identity = ShardIdentity::new(
        ShardNumber(0),
        ShardCount::new(SHARD_COUNT),
        ShardStripeSize(STRIPE_SIZE),
    )
    .unwrap();
    assert_eq!(
        do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE),
        (
            STRIPE_SIZE + 1,
            vec![(STRIPE_SIZE + 1, input_start..input_end)]
        )
    );

    // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards
    // store all logical sizes)
    let shard_identity = ShardIdentity::new(
        ShardNumber(1),
        ShardCount::new(SHARD_COUNT),
        ShardStripeSize(STRIPE_SIZE),
    )
    .unwrap();
    assert_eq!(
        do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE),
        (1, vec![(1, input_start..input_end)])
    );
}

/// Test that ShardedRange behaves properly when used on un-sharded data
#[test]
fn sharded_range_fragment_unsharded() {
    let shard_identity = ShardIdentity::unsharded();

    let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
    let input_end = Key::from_hex("000000067f00000001000000ae0000010000").unwrap();
    // 0x10000 keys with a target of 0x8000: expect exactly two equal fragments.
    assert_eq!(
        do_fragment(input_start, input_end, &shard_identity, 0x8000),
        (
            0x10000,
            vec![
                (0x8000, input_start..input_start.add(0x8000)),
                (0x8000, input_start.add(0x8000)..input_start.add(0x10000))
            ]
        )
    );
}

/// A range whose start and end lie in different relations is returned as a
/// single fragment of size `u32::MAX`, for unsharded and sharded identities alike.
#[test]
fn sharded_range_fragment_cross_relation() {
    let shard_identity = ShardIdentity::unsharded();

    // A range that spans relations: expect fragmentation to give up and return a u32::MAX size
    let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
    let input_end = Key::from_hex("000000068f00000001000000ae0000010000").unwrap();
    assert_eq!(
        do_fragment(input_start, input_end, &shard_identity, 0x8000),
        (u32::MAX, vec![(u32::MAX, input_start..input_end),])
    );

    // Same, but using a sharded identity
    let shard_identity =
        ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
    assert_eq!(
        do_fragment(input_start, input_end, &shard_identity, 0x8000),
        (u32::MAX, vec![(u32::MAX, input_start..input_end),])
    );
}

/// A short range fragmented with a target smaller than the range: full-size
/// fragments followed by one remainder fragment.
#[test]
fn sharded_range_fragment_tiny_nblocks() {
    let shard_identity = ShardIdentity::unsharded();

    // A 0x38-key range within a single relation, fragmented with a target of 16:
    // expect three fragments of 16 and a final fragment holding the remaining 8.
    let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap();
    let input_end = Key::from_hex("000000067F00000001000004E10000000038").unwrap();
    assert_eq!(
        do_fragment(input_start, input_end, &shard_identity, 16),
        (
            0x38,
            vec![
                (16, input_start..input_start.add(16)),
                (16, input_start.add(16)..input_start.add(32)),
                (16, input_start.add(32)..input_start.add(48)),
                (8, input_start.add(48)..input_end),
            ]
        )
    );
}

/// Randomised (but seeded, hence reproducible) check of `do_fragment` across
/// shard identities, targets, offsets and range lengths.
#[test]
fn sharded_range_fragment_fuzz() {
    // Use a fixed seed: we don't want to explicitly pick values, but we do want
    // the test to be reproducible.
    let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef);

    // NOTE: the order of `prng` calls below determines the generated cases;
    // reordering them changes what the test covers.
    for _i in 0..1000 {
        let shard_identity = if prng.next_u32() % 2 == 0 {
            ShardIdentity::unsharded()
        } else {
            // 1..=127 shards, with a shard number strictly below the count.
            let shard_count = prng.next_u32() % 127 + 1;
            ShardIdentity::new(
                ShardNumber((prng.next_u32() % shard_count) as u8),
                ShardCount::new(shard_count as u8),
                DEFAULT_STRIPE_SIZE,
            )
            .unwrap()
        };

        // Fragment size target in 1..=65536.
        let target_nblocks = prng.next_u32() % 65536 + 1;

        let start_offset = prng.next_u32() % 16384;

        // Range length in keys, 1..=8192: the range is never empty.
        let range_size = prng.next_u32() % 8192 + 1;

        // A range that starts at a random offset from a fixed base key and
        // extends `range_size` keys from there.
        let input_start = Key::from_hex("000000067F00000001000004E10000000000")
            .unwrap()
            .add(start_offset);
        let input_end = input_start.add(range_size);

        // This test's main success conditions are the invariants baked into do_fragment
        let (_total_size, fragments) =
            do_fragment(input_start, input_end, &shard_identity, target_nblocks);

        // Pick a random key within the range and check it appears in the output
        let example_key = input_start.add(prng.next_u32() % range_size);

        // Panic on unwrap if it isn't found
        let example_key_frag = fragments
            .iter()
            .find(|f| f.1.contains(&example_key))
            .unwrap();

        // Check that the fragment containing our random key has a nonzero size if
        // that key is shard-local
        let example_key_local = !shard_identity.is_key_disposable(&example_key);
        if example_key_local {
            assert!(example_key_frag.0 > 0);
        }
    }
}
} // closes the enclosing test module, which is opened before this chunk

================================================
FILE: libs/pageserver_api/src/lib.rs
================================================
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]

pub mod controller_api;
pub mod key;
pub mod keyspace;
pub mod models;
pub mod pagestream_api;
pub mod reltag;
pub mod shard;
/// Public API types
pub mod upcall_api;

pub mod config;

================================================
FILE: libs/pageserver_api/src/models/detach_ancestor.rs
================================================
use std::collections::HashSet;

use utils::id::TimelineId;

/// Response body of a detach-ancestor operation: the timelines that were reparented.
#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct AncestorDetached {
    pub reparented_timelines: HashSet<TimelineId>,
}

================================================
FILE: libs/pageserver_api/src/models/partitioning.rs
================================================
use utils::lsn::Lsn;

use crate::keyspace::SparseKeySpace;

/// A dense and a sparse key space, captured at a given LSN.
///
/// Serialized as a map with exactly three entries:
/// - `keys`: sequence of `[start, end]` pairs, each key rendered via `Display`
/// - `sparse_keys`: same shape as `keys`
/// - `at_lsn`: the LSN rendered via `Display`
#[derive(Debug, PartialEq, Eq)]
pub struct Partitioning {
    pub keys: crate::keyspace::KeySpace,
    pub sparse_keys: crate::keyspace::SparseKeySpace,

    pub at_lsn: Lsn,
}

impl serde::Serialize for Partitioning {
    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        /// Serializes a key space as a sequence of [`KeyRange`]s.
        pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace);

        impl serde::Serialize for KeySpace<'_> {
            fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
            where
                S: serde::Serializer,
            {
                use serde::ser::SerializeSeq;
                let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
                for kr in &self.0.ranges {
                    seq.serialize_element(&KeyRange(kr))?;
                }
                seq.end()
            }
        }

        use serde::ser::SerializeMap;
        // The length hint must match the number of entries written below.
        // Self-describing formats such as JSON ignore it, but length-prefixed
        // formats rely on it; it used to say 2 while three entries were emitted.
        let mut map = serializer.serialize_map(Some(3))?;
        map.serialize_entry("keys", &KeySpace(&self.keys))?;
        map.serialize_entry("sparse_keys", &KeySpace(&self.sparse_keys.0))?;
        map.serialize_entry("at_lsn", &WithDisplay(&self.at_lsn))?;
        map.end()
    }
}

/// Serializes the wrapped value as the string produced by its `Display` impl.
pub struct WithDisplay<'a, T>(&'a T);

impl<T: std::fmt::Display> serde::Serialize for WithDisplay<'_, T> {
    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        serializer.collect_str(&self.0)
    }
}

/// Serializes a key range as a 2-tuple `(start, end)` of `Display`-formatted keys.
pub struct KeyRange<'a>(&'a std::ops::Range<crate::key::Key>);

impl serde::Serialize for KeyRange<'_> {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::SerializeTuple;
        let mut t = serializer.serialize_tuple(2)?;
        t.serialize_element(&WithDisplay(&self.0.start))?;
        t.serialize_element(&WithDisplay(&self.0.end))?;
        t.end()
    }
}

impl<'a> serde::Deserialize<'a> for Partitioning {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'a>,
    {
        /// Deserializes a sequence of `[start, end]` string pairs into a key space.
        pub struct KeySpace(crate::keyspace::KeySpace);

        impl<'de> serde::Deserialize<'de> for KeySpace {
            fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
            where
                D: serde::Deserializer<'de>,
            {
                // Keys are parsed from their string form (the inverse of `WithDisplay`).
                #[serde_with::serde_as]
                #[derive(serde::Deserialize)]
                #[serde(transparent)]
                struct Key(#[serde_as(as = "serde_with::DisplayFromStr")] crate::key::Key);

                #[serde_with::serde_as]
                #[derive(serde::Deserialize)]
                struct Range(Key, Key);

                let ranges: Vec<Range> = serde::Deserialize::deserialize(deserializer)?;
                Ok(Self(crate::keyspace::KeySpace {
                    ranges: ranges
                        .into_iter()
                        .map(|Range(start, end)| (start.0..end.0))
                        .collect(),
                }))
            }
        }

        /// Mirror of the serialized map layout.
        #[serde_with::serde_as]
        #[derive(serde::Deserialize)]
        struct De {
            keys: KeySpace,
            sparse_keys: KeySpace,
            #[serde_as(as = "serde_with::DisplayFromStr")]
            at_lsn: Lsn,
        }

        let de: De = serde::Deserialize::deserialize(deserializer)?;
        Ok(Self {
            at_lsn: de.at_lsn,
            keys: de.keys.0,
            sparse_keys: SparseKeySpace(de.sparse_keys.0),
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Deserialize a reference document, serialize it again, and compare as JSON values.
    #[test]
    fn test_serialization_roundtrip() {
        let reference = r#"
        {
            "keys": [
              [
                "000000000000000000000000000000000000",
                "000000000000000000000000000000000001"
              ],
              [
                "000000067F00000001000000000000000000",
                "000000067F00000001000000000000000002"
              ],
              [
                "030000000000000000000000000000000000",
                "030000000000000000000000000000000003"
              ]
            ],
            "sparse_keys": [
              [
                "620000000000000000000000000000000000",
                "620000000000000000000000000000000003"
              ]
            ],
            "at_lsn": "0/2240160"
        }
        "#;

        let de: Partitioning = serde_json::from_str(reference).unwrap();

        let ser = serde_json::to_string(&de).unwrap();

        let ser_de: serde_json::Value = serde_json::from_str(&ser).unwrap();

        assert_eq!(
            ser_de,
            serde_json::from_str::<'_, serde_json::Value>(reference).unwrap()
        );
    }
}

================================================
FILE: libs/pageserver_api/src/models/utilization.rs
================================================
use
std::time::SystemTime; use utils::serde_percent::Percent; use utils::serde_system_time; /// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// the next tenant. /// /// See and maintain pageserver openapi spec for `/v1/utilization_score` as the truth. /// /// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might /// not handle full u64 values properly. #[derive(serde::Serialize, serde::Deserialize, Debug, Clone)] pub struct PageserverUtilization { /// Used disk space (physical, ground truth from statfs()) #[serde(serialize_with = "ser_saturating_u63")] pub disk_usage_bytes: u64, /// Free disk space #[serde(serialize_with = "ser_saturating_u63")] pub free_space_bytes: u64, /// Wanted disk space, based on the tenant shards currently present on this pageserver: this /// is like disk_usage_bytes, but it is stable and does not change with the cache state of /// tenants, whereas disk_usage_bytes may reach the disk eviction `max_usage_pct` and stay /// there, or may be unrealistically low if the pageserver has attached tenants which haven't /// downloaded layers yet. #[serde(serialize_with = "ser_saturating_u63", default)] pub disk_wanted_bytes: u64, // What proportion of total disk space will this pageserver use before it starts evicting data? #[serde(default = "unity_percent")] pub disk_usable_pct: Percent, // How many shards are currently on this node? #[serde(default)] pub shard_count: u32, // How many shards should this node be able to handle at most? #[serde(default)] pub max_shard_count: u32, /// Cached result of [`Self::score`] pub utilization_score: Option, /// When was this snapshot captured, pageserver local time. /// /// Use millis to give confidence that the value is regenerated often enough. 
pub captured_at: serde_system_time::SystemTime, } fn unity_percent() -> Percent { Percent::new(0).unwrap() } pub type RawScore = u64; impl PageserverUtilization { const UTILIZATION_FULL: u64 = 1000000; /// Calculate a utilization score. The result is to be inrepreted as a fraction of /// Self::UTILIZATION_FULL. /// /// Lower values are more affine to scheduling more work on this node. /// - UTILIZATION_FULL represents an ideal node which is fully utilized but should not receive any more work. /// - 0.0 represents an empty node. /// - Negative values are forbidden /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to /// layer eviction. pub fn score(&self) -> RawScore { let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes) * self.disk_usable_pct.get() as u64) / 100; let disk_utilization_score = self.disk_wanted_bytes * Self::UTILIZATION_FULL / disk_usable_capacity; let shard_utilization_score = self.shard_count as u64 * Self::UTILIZATION_FULL / self.max_shard_count as u64; std::cmp::max(disk_utilization_score, shard_utilization_score) } pub fn cached_score(&mut self) -> RawScore { match self.utilization_score { None => { let s = self.score(); self.utilization_score = Some(s); s } Some(s) => s, } } /// If a node is currently hosting more work than it can comfortably handle. This does not indicate that /// it will fail, but it is a strong signal that more work should not be added unless there is no alternative. /// /// When a node is overloaded, we may override soft affinity preferences and do things like scheduling /// into a node in a less desirable AZ, if all the nodes in the preferred AZ are overloaded. pub fn is_overloaded(score: RawScore) -> bool { // Why the factor of two? This is unscientific but reflects behavior of real systems: // - In terms of shard counts, a node's preferred max count is a soft limit intended to keep // startup and housekeeping jobs nice and responsive. 
We can go to double this limit if needed // until some more nodes are deployed. // - In terms of disk space, the node's utilization heuristic assumes every tenant needs to // hold its biggest timeline fully on disk, which is tends to be an over estimate when // some tenants are very idle and have dropped layers from disk. In practice going up to // double is generally better than giving up and scheduling in a sub-optimal AZ. score >= 2 * Self::UTILIZATION_FULL } pub fn adjust_shard_count_max(&mut self, shard_count: u32) { if self.shard_count < shard_count { self.shard_count = shard_count; // Dirty cache: this will be calculated next time someone retrives the score self.utilization_score = None; } } /// A utilization structure that has a full utilization score: use this as a placeholder when /// you need a utilization but don't have real values yet. pub fn full() -> Self { Self { disk_usage_bytes: 1, free_space_bytes: 0, disk_wanted_bytes: 1, disk_usable_pct: Percent::new(100).unwrap(), shard_count: 1, max_shard_count: 1, utilization_score: Some(Self::UTILIZATION_FULL), captured_at: serde_system_time::SystemTime(SystemTime::now()), } } } /// Test helper pub mod test_utilization { use std::time::SystemTime; use utils::serde_percent::Percent; use utils::serde_system_time::{self}; use super::PageserverUtilization; // Parameters of the imaginary node used for test utilization instances const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024; const TEST_SHARDS_MAX: u32 = 1000; /// Unit test helper. Unconditionally compiled because cfg(test) doesn't carry across crates. Do /// not abuse this function from non-test code. /// /// Emulates a node with a 1000 shard limit and a 1TB disk. 
pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization { PageserverUtilization { disk_usage_bytes: disk_wanted_bytes, free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE), disk_wanted_bytes, disk_usable_pct: Percent::new(100).unwrap(), shard_count, max_shard_count: TEST_SHARDS_MAX, utilization_score: None, captured_at: serde_system_time::SystemTime(SystemTime::now()), } } } /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. /// /// Instead of newtype, use this because a newtype would get require handling deserializing values /// with the highest bit set which is properly parsed by serde formats, but would create a /// conundrum on how to handle and again serialize such values at type level. It will be a few /// years until we can use more than `i64::MAX` bytes on a disk. fn ser_saturating_u63(value: &u64, serializer: S) -> Result { const MAX_FORMAT_INT64: u64 = i64::MAX as u64; let value = (*value).min(MAX_FORMAT_INT64); serializer.serialize_u64(value) } #[cfg(test)] mod tests { use std::time::Duration; use super::*; #[test] fn u64_max_is_serialized_as_u63_max() { let doc = PageserverUtilization { disk_usage_bytes: u64::MAX, free_space_bytes: 0, disk_wanted_bytes: u64::MAX, utilization_score: Some(13), disk_usable_pct: Percent::new(90).unwrap(), shard_count: 100, max_shard_count: 200, captured_at: serde_system_time::SystemTime( std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), ), }; let s = serde_json::to_string(&doc).unwrap(); let expected = "{\"disk_usage_bytes\":9223372036854775807,\"free_space_bytes\":0,\"disk_wanted_bytes\":9223372036854775807,\"disk_usable_pct\":90,\"shard_count\":100,\"max_shard_count\":200,\"utilization_score\":13,\"captured_at\":\"2024-02-21T10:02:59.000Z\"}"; assert_eq!(s, expected); } } ================================================ FILE: libs/pageserver_api/src/models.rs 
================================================
pub mod detach_ancestor;
pub mod partitioning;
pub mod utilization;

use core::ops::Range;
use std::collections::HashMap;
use std::fmt::Display;
use std::num::{NonZeroU32, NonZeroU64, NonZeroUsize};
use std::str::FromStr;
use std::time::{Duration, SystemTime};

#[cfg(feature = "testing")]
use camino::Utf8PathBuf;
use postgres_versioninfo::PgMajorVersion;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_with::serde_as;
pub use utilization::PageserverUtilization;
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::{completion, serde_system_time};

use crate::config::Ratio;
use crate::key::{CompactKey, Key};
use crate::shard::{
    DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardStripeSize, TenantShardId,
};

/// The state of a tenant in this pageserver.
///
/// ```mermaid
/// stateDiagram-v2
///
///     [*] --> Attaching: spawn_attach()
///
///     Attaching --> Activating: activate()
///     Activating --> Active: infallible
///
///     Attaching --> Broken: attach() failure
///
///     Active --> Stopping: set_stopping(), part of shutdown & detach
///     Stopping --> Broken: late error in remove_tenant_from_memory
///
///     Broken --> [*]: ignore / detach / shutdown
///     Stopping --> [*]: remove_from_memory complete
///
///     Active --> Broken: cfg(testing)-only tenant break point
/// ```
#[derive(
    Clone,
    PartialEq,
    Eq,
    serde::Serialize,
    serde::Deserialize,
    strum_macros::Display,
    strum_macros::VariantNames,
    strum_macros::AsRefStr,
    strum_macros::IntoStaticStr,
)]
// Adjacently tagged on the wire: the variant name goes in "slug", its payload in "data".
#[serde(tag = "slug", content = "data")]
pub enum TenantState {
    /// This tenant is being attached to the pageserver.
    ///
    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
    Attaching,
    /// The tenant is transitioning from Loading/Attaching to Active.
    ///
    /// While in this state, the individual timelines are being activated.
    ///
    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
    Activating(ActivatingFrom),
    /// The tenant has finished activating and is open for business.
    ///
    /// Transitions out of this state are possible through `set_stopping()` and `set_broken()`.
    Active,
    /// The tenant is recognized by pageserver, but it is being detached or the
    /// system is being shut down.
    ///
    /// Transitions out of this state are possible through `set_broken()`.
    Stopping {
        /// The barrier can be used to wait for shutdown to complete. The first caller to set
        /// Some(Barrier) is responsible for driving shutdown to completion. Subsequent callers
        /// will wait for the first caller's existing barrier.
        ///
        /// None is set when an attach is cancelled, to signal to shutdown that the attach has in
        /// fact cancelled:
        ///
        /// 1. `shutdown` sees `TenantState::Attaching`, and cancels the tenant.
        /// 2. `attach` sets `TenantState::Stopping(None)` and exits.
        /// 3. `set_stopping` waits for `TenantState::Stopping(None)` and sets
        ///    `TenantState::Stopping(Some)` to claim the barrier as the shutdown owner.
        //
        // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
        // otherwise it will not be skipped during deserialization
        #[serde(skip)]
        progress: Option,
    },
    /// The tenant is recognized by the pageserver, but can no longer be used for
    /// any operations.
    ///
    /// If the tenant fails to load or attach, it will transition to this state
    /// and it is guaranteed that no background tasks are running in its name.
    ///
    /// The other way to transition into this state is from `Stopping` state
    /// through `set_broken()` called from `remove_tenant_from_memory()`. That happens
    /// if the cleanup future executed by `remove_tenant_from_memory()` fails.
    Broken { reason: String, backtrace: String },
}

impl TenantState {
    /// Map the tenant state onto the coarser attachment status:
    /// `Attaching`, `Activating` and `Stopping` yield `Maybe`, `Active` yields
    /// `Attached`, and `Broken` yields `Failed` carrying a copy of the reason.
    pub fn attachment_status(&self) -> TenantAttachmentStatus {
        use TenantAttachmentStatus::*;

        // Below TenantState::Activating is used as "transient" or "transparent" state for
        // attachment_status determining.
        match self {
            // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
            // So, technically, we can return Attached here.
            // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
            // But, our attach task might still be fetching the remote timelines, etc.
            // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
            Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
            // We only reach Active after successful load / attach.
            // So, call attachment status Attached.
            Self::Active => Attached,
            // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
            // However, it also becomes Broken if the regular load fails.
            // From Console's perspective there's no practical difference
            // because attachment_status is polled by console only during attach operation execution.
            Self::Broken { reason, .. } => Failed {
                reason: reason.to_owned(),
            },
            // Why is Stopping a Maybe case? Because, during pageserver shutdown,
            // we set the Stopping state irrespective of whether the tenant
            // has finished attaching or not.
            Self::Stopping { .. } => Maybe,
        }
    }

    /// Build a `Broken` state from `reason`, attaching a backtrace that is
    /// force-captured at the point of this call.
    pub fn broken_from_reason(reason: String) -> Self {
        let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture());
        Self::Broken {
            reason,
            backtrace: backtrace_str,
        }
    }
}

// Debug output is the Display output, except for `Broken` with a non-empty
// reason, where the reason and the backtrace are printed as well.
impl std::fmt::Debug for TenantState {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Broken { reason, backtrace } if !reason.is_empty() => {
                write!(f, "Broken due to: {reason}. Backtrace:\n{backtrace}")
            }
            _ => write!(f, "{self}"),
        }
    }
}

/// A temporary lease to a specific lsn inside a timeline.
/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
#[serde_as]
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LsnLease {
    // Serialized as an RFC 3339 timestamp with millisecond precision.
    #[serde_as(as = "SystemTimeAsRfc3339Millis")]
    pub valid_until: SystemTime,
}

// Conversion used by `LsnLease::valid_until`: formats with
// `humantime::format_rfc3339_millis`, parses with `humantime::parse_rfc3339`.
serde_with::serde_conv!(
    SystemTimeAsRfc3339Millis,
    SystemTime,
    |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
    |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
);

impl LsnLease {
    /// The default length for an explicit LSN lease request (10 minutes).
    pub const DEFAULT_LENGTH: Duration = Duration::from_secs(10 * 60);

    /// The default length for an implicit LSN lease granted during
    /// `get_lsn_by_timestamp` request (1 minute).
    pub const DEFAULT_LENGTH_FOR_TS: Duration = Duration::from_secs(60);

    /// Checks whether the lease is expired.
    ///
    /// The comparison is strict: a lease is still valid at exactly `valid_until`.
    pub fn is_expired(&self, now: &SystemTime) -> bool {
        now > &self.valid_until
    }
}

/// Controls the detach ancestor behavior.
/// - When set to `NoAncestorAndReparent`, we will only detach a branch if its ancestor is a root branch. It will automatically reparent any children of the ancestor before and at the branch point.
/// - When set to `MultiLevelAndNoReparent`, we will detach a branch from multiple levels of ancestors, and no reparenting will happen at all.
#[derive(Debug, Clone, Copy, Default)]
pub enum DetachBehavior {
    #[default]
    NoAncestorAndReparent,
    MultiLevelAndNoReparent,
}

// Accepts the canonical snake_case names plus the aliases "v1" and "v2".
impl std::str::FromStr for DetachBehavior {
    type Err = &'static str;

    fn from_str(s: &str) -> Result {
        match s {
            "no_ancestor_and_reparent" => Ok(DetachBehavior::NoAncestorAndReparent),
            "multi_level_and_no_reparent" => Ok(DetachBehavior::MultiLevelAndNoReparent),
            "v1" => Ok(DetachBehavior::NoAncestorAndReparent),
            "v2" => Ok(DetachBehavior::MultiLevelAndNoReparent),
            _ => Err("cannot parse detach behavior"),
        }
    }
}

// Always prints the canonical snake_case name (never the "v1"/"v2" alias), so
// the output parses back through `FromStr` to the same variant.
impl std::fmt::Display for DetachBehavior {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            DetachBehavior::NoAncestorAndReparent => write!(f, "no_ancestor_and_reparent"),
            DetachBehavior::MultiLevelAndNoReparent => write!(f, "multi_level_and_no_reparent"),
        }
    }
}

/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
///
/// XXX: We used to have more variants here, but now it's just one, which makes this rather
/// useless. Remove, once we've checked that there's no client code left that looks at this.
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum ActivatingFrom {
    /// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`]
    Attaching,
}

/// A state of a timeline in pageserver's memory.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum TimelineState {
    /// The timeline is recognized by the pageserver but is not yet operational.
    /// In particular, the walreceiver connection loop is not running for this timeline.
    /// It will eventually transition to state Active or Broken.
    Loading,
    /// The timeline is fully operational.
    /// It can be queried, and the walreceiver connection loop is running.
    Active,
    /// The timeline was previously Loading or Active but is shutting down.
    /// It cannot transition back into any other state.
    Stopping,
    /// The timeline is broken and not operational (previous states: Loading or Active).
    Broken { reason: String, backtrace: String },
}

/// Serializable `start..end` LSN range.
#[serde_with::serde_as]
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
pub struct CompactLsnRange {
    pub start: Lsn,
    pub end: Lsn,
}

/// Serializable `start..end` key range; both keys travel as their string form.
#[serde_with::serde_as]
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
pub struct CompactKeyRange {
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub start: Key,
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub end: Key,
}

// Conversions between the serializable range structs and `Range`, in both
// directions; `start`/`end` are copied across unchanged.
impl From> for CompactLsnRange {
    fn from(range: Range) -> Self {
        Self {
            start: range.start,
            end: range.end,
        }
    }
}

impl From> for CompactKeyRange {
    fn from(range: Range) -> Self {
        Self {
            start: range.start,
            end: range.end,
        }
    }
}

impl From for Range {
    fn from(range: CompactLsnRange) -> Self {
        range.start..range.end
    }
}

impl From for Range {
    fn from(range: CompactKeyRange) -> Self {
        range.start..range.end
    }
}

impl CompactLsnRange {
    /// Range starting at `lsn` and ending at `Lsn::MAX`.
    pub fn above(lsn: Lsn) -> Self {
        Self {
            start: lsn,
            end: Lsn::MAX,
        }
    }
}

/// Serialize-only description of a compaction job.
#[derive(Debug, Clone, Serialize)]
pub struct CompactInfoResponse {
    pub compact_key_range: Option,
    pub compact_lsn_range: Option,
    pub sub_compaction: bool,
    pub running: bool,
    pub job_id: usize,
}

#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineCreateRequest {
    pub new_timeline_id: TimelineId,
    // Flattened: the mode's fields sit next to `new_timeline_id` in the same object.
    #[serde(flatten)]
    pub mode: TimelineCreateRequestMode,
}

impl TimelineCreateRequest {
    /// Short static label for the request mode: "branch", "import" or "bootstrap".
    pub fn mode_tag(&self) -> &'static str {
        match &self.mode {
            TimelineCreateRequestMode::Branch { .. } => "branch",
            TimelineCreateRequestMode::ImportPgdata { .. } => "import",
            TimelineCreateRequestMode::Bootstrap { .. } => "bootstrap",
        }
    }

    /// True when the request mode is `ImportPgdata`.
    pub fn is_import(&self) -> bool {
        matches!(self.mode, TimelineCreateRequestMode::ImportPgdata { .. })
    }
}

#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub enum ShardImportStatus {
    InProgress(Option),
    Done,
    Error(String),
}

// Versioned wrapper around the progress payload; only V1 exists here.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub enum ShardImportProgress {
    V1(ShardImportProgressV1),
}

#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub struct ShardImportProgressV1 {
    /// Total number of jobs in the import plan
    pub jobs: usize,
    /// Number of jobs completed
    pub completed: usize,
    /// Hash of the plan
    pub import_plan_hash: u64,
    /// Soft limit for the job size
    /// This needs to remain constant throughout the import
    pub job_soft_size_limit: usize,
}

impl ShardImportStatus {
    /// True for `Done` and `Error`; false while `InProgress`.
    pub fn is_terminal(&self) -> bool {
        match self {
            ShardImportStatus::InProgress(_) => false,
            ShardImportStatus::Done | ShardImportStatus::Error(_) => true,
        }
    }
}

/// Storage controller specific extensions to [`TimelineInfo`].
#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineCreateResponseStorcon {
    #[serde(flatten)]
    pub timeline_info: TimelineInfo,

    pub safekeepers: Option,
}

/// Safekeepers as returned in timeline creation request to storcon or pushed to
/// cplane in the post migration hook.
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeepersInfo {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub generation: u32,
    pub safekeepers: Vec,
}

#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct SafekeeperInfo {
    pub id: NodeId,
    pub hostname: String,
}

// Untagged: the variant is chosen by which fields are present, trying the
// variants in declaration order. Do not reorder them (see the NB on Bootstrap).
#[derive(Serialize, Deserialize, Clone)]
#[serde(untagged)]
pub enum TimelineCreateRequestMode {
    Branch {
        ancestor_timeline_id: TimelineId,
        #[serde(default)]
        ancestor_start_lsn: Option,
        // TODO: cplane sets this, but, the branching code always
        // inherits the ancestor's pg_version. Earlier code wasn't
        // using a flattened enum, so, it was an accepted field, and
        // we continue to accept it by having it here.
        pg_version: Option,
        // Omitted from the output when false.
        #[serde(default, skip_serializing_if = "std::ops::Not::not")]
        read_only: bool,
    },
    ImportPgdata {
        import_pgdata: TimelineCreateRequestModeImportPgdata,
    },
    // NB: Bootstrap is all-optional, and thus the serde(untagged) will cause serde to stop at Bootstrap.
    // (serde picks the first matching enum variant, in declaration order).
    Bootstrap {
        #[serde(default)]
        existing_initdb_timeline_id: Option,
        pg_version: Option,
    },
}

#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineCreateRequestModeImportPgdata {
    pub location: ImportPgdataLocation,
    pub idempotency_key: ImportPgdataIdempotencyKey,
}

#[derive(Serialize, Deserialize, Clone, Debug)]
pub enum ImportPgdataLocation {
    // Only available when built with the "testing" feature.
    #[cfg(feature = "testing")]
    LocalFs { path: Utf8PathBuf },
    AwsS3 {
        region: String,
        bucket: String,
        /// A better name for this would be `prefix`; changing requires coordination with cplane.
        /// See .
        key: String,
    },
}

// Transparent: serialized as the bare string, without a wrapper.
#[derive(Serialize, Deserialize, Clone)]
#[serde(transparent)]
pub struct ImportPgdataIdempotencyKey(pub String);

impl ImportPgdataIdempotencyKey {
    /// Generate a key made of 20 random alphanumeric characters.
    pub fn random() -> Self {
        use rand::Rng;
        use rand::distr::Alphanumeric;
        Self(
            rand::rng()
                .sample_iter(&Alphanumeric)
                .take(20)
                .map(char::from)
                .collect(),
        )
    }
}

#[derive(Serialize, Deserialize, Clone)]
pub struct LsnLeaseRequest {
    pub lsn: Lsn,
}

#[derive(Serialize, Deserialize)]
pub struct TenantShardSplitRequest {
    pub new_shard_count: u8,

    // A tenant's stripe size is only meaningful the first time their shard count goes
    // above 1: therefore during a split from 1->N shards, we may modify the stripe size.
    //
    // If this is set while the stripe count is being increased from an already >1 value,
    // then the request will fail with 400.
    pub new_stripe_size: Option,
}

#[derive(Serialize, Deserialize)]
pub struct TenantShardSplitResponse {
    pub new_shards: Vec,
}

/// Parameters that apply to all shards in a tenant.  Used during tenant creation.
#[derive(Clone, Copy, Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct ShardParameters {
    pub count: ShardCount,
    pub stripe_size: ShardStripeSize,
}

impl ShardParameters {
    /// Delegates to `ShardCount::is_unsharded` on `count`.
    pub fn is_unsharded(&self) -> bool {
        self.count.is_unsharded()
    }
}

impl Default for ShardParameters {
    /// Shard count 0 with the default stripe size.
    // NOTE(review): count 0 presumably denotes an unsharded tenant — confirm against `ShardCount`.
    fn default() -> Self {
        Self {
            count: ShardCount::new(0),
            stripe_size: DEFAULT_STRIPE_SIZE,
        }
    }
}

// Copies `count` and `stripe_size` out of the identity.
impl From for ShardParameters {
    fn from(identity: ShardIdentity) -> Self {
        Self {
            count: identity.count,
            stripe_size: identity.stripe_size,
        }
    }
}

// Three-way patch for one optional setting:
// - `Upsert(v)`: set the value to `v`
// - `Remove`: clear the value
// - `Noop`: leave it unchanged (the `Default`)
#[derive(Debug, Default, Clone, Eq, PartialEq)]
pub enum FieldPatch {
    Upsert(T),
    Remove,
    #[default]
    Noop,
}

impl FieldPatch {
    // Used as the `skip_serializing_if` predicate on every `TenantConfigPatch` field.
    fn is_noop(&self) -> bool {
        matches!(self, FieldPatch::Noop)
    }

    /// Apply the patch to `target`: `Upsert` stores the value, `Remove` clears
    /// it, and `Noop` leaves `target` untouched.
    pub fn apply(self, target: &mut Option) {
        match self {
            Self::Upsert(v) => *target = Some(v),
            Self::Remove => *target = None,
            Self::Noop => {}
        }
    }

    /// Transform the payload of `Upsert` with the fallible `map`, propagating
    /// its error; `Remove` and `Noop` pass through without calling `map`.
    pub fn map Result>(self, map: F) -> Result, E> {
        match self {
            Self::Upsert(v) => Ok(FieldPatch::::Upsert(map(v)?)),
            Self::Remove => Ok(FieldPatch::::Remove),
            Self::Noop => Ok(FieldPatch::::Noop),
        }
    }
}

// Deserializes through `Option`: a `None` becomes `Remove`, a value becomes
// `Upsert`. `Noop` is never produced here; it only arises from `Default`,
// i.e. when the field is absent altogether.
impl<'de, T: Deserialize<'de>> Deserialize<'de> for FieldPatch {
    fn deserialize(deserializer: D) -> Result
    where
        D: Deserializer<'de>,
    {
        Option::deserialize(deserializer).map(|opt| match opt {
            None => FieldPatch::Remove,
            Some(val) => FieldPatch::Upsert(val),
        })
    }
}

// Mirror image of the deserializer: `Upsert` is written as `Some`, `Remove` as
// `None`. `Noop` has no representation and panics if it gets here; containers
// must skip it with `skip_serializing_if = "FieldPatch::is_noop"`.
impl Serialize for FieldPatch {
    fn serialize(&self, serializer: S) -> Result
    where
        S: Serializer,
    {
        match self {
            FieldPatch::Upsert(val) => serializer.serialize_some(val),
            FieldPatch::Remove => serializer.serialize_none(),
            FieldPatch::Noop => unreachable!(),
        }
    }
}

// Patch for a tenant config, one `FieldPatch` per setting. With
// `#[serde(default)]`, fields missing from the input deserialize to `Noop`, and
// `Noop` fields are omitted from the output.
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
#[serde(default)]
pub struct TenantConfigPatch {
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub checkpoint_distance: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub checkpoint_timeout: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_target_size: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_period: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_threshold: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_upper_limit: FieldPatch,
    // defer parsing compaction_algorithm, like eviction_policy
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_algorithm: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_shard_ancestor: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_l0_first: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_l0_semaphore: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub l0_flush_delay_threshold: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub l0_flush_stall_threshold: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_horizon: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_period: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub image_creation_threshold: FieldPatch,
    // HADRON
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub image_layer_force_creation_period: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub pitr_interval: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub walreceiver_connect_timeout: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub lagging_wal_timeout: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub max_lsn_wal_lag: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub eviction_policy: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub min_resident_size_override: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub evictions_low_residence_duration_metric_threshold: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub heatmap_period: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub lazy_slru_download: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub timeline_get_throttle: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub image_layer_creation_check_threshold: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub image_creation_preempt_threshold: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub lsn_lease_length: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub lsn_lease_length_for_ts: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub timeline_offloading: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub rel_size_v2_enabled: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_compaction_enabled: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_compaction_verification: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_compaction_initial_threshold_kb: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_compaction_ratio_percent: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub sampling_ratio: FieldPatch>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub relsize_snapshot_cache_capacity: FieldPatch,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub basebackup_cache_enabled: FieldPatch,
}

/// Like [`crate::config::TenantConfigToml`], but preserves the information
/// about which parameters are set and which are not.
///
/// Used in many places, including durably stored ones.
///
/// Every field is optional. A `None` field is left out when serializing
/// (`skip_serializing_if = "Option::is_none"`), and `#[serde(default)]` turns a
/// field that is missing from the input into `None`. Fields annotated with
/// `#[serde(with = "humantime_serde")]` are (de)serialized through that module.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(default)] // this maps omitted fields in deserialization to None
pub struct TenantConfig {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub checkpoint_distance: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    pub checkpoint_timeout: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub compaction_target_size: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    pub compaction_period: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub compaction_threshold: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub compaction_upper_limit: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub compaction_algorithm: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub compaction_shard_ancestor: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub compaction_l0_first: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub compaction_l0_semaphore: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub l0_flush_delay_threshold: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub l0_flush_stall_threshold: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub gc_horizon: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    pub gc_period: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_creation_threshold: Option,

    // HADRON
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    pub image_layer_force_creation_period: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    pub pitr_interval: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    pub walreceiver_connect_timeout: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    pub lagging_wal_timeout: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_lsn_wal_lag: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub eviction_policy: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub min_resident_size_override: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    pub evictions_low_residence_duration_metric_threshold: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    pub heatmap_period: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub lazy_slru_download: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub timeline_get_throttle: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_layer_creation_check_threshold: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_creation_preempt_threshold: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    pub lsn_lease_length: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    pub lsn_lease_length_for_ts: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub timeline_offloading: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub rel_size_v2_enabled: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub gc_compaction_enabled: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub gc_compaction_verification: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub gc_compaction_initial_threshold_kb: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub gc_compaction_ratio_percent: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub sampling_ratio: Option>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub relsize_snapshot_cache_capacity: Option,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub basebackup_cache_enabled: Option,
}

impl TenantConfig {
    /// Applies `patch` on top of this config and returns the patched config.
    ///
    /// `self` is consumed and destructured into its fields, each field of
    /// `patch` is applied to the matching field, and the config is then
    /// reassembled. Patch values for the `humantime_serde` duration fields are
    /// first converted with `humantime::parse_duration`; if one of them does
    /// not parse, the error is returned through `?` and no config is produced.
    pub fn apply_patch(
        self,
        patch:
TenantConfigPatch,
    ) -> Result {
        // The pattern names every field and has no `..`, so a field added to
        // `TenantConfig` but not handled here fails to compile.
        let Self {
            mut checkpoint_distance,
            mut checkpoint_timeout,
            mut compaction_target_size,
            mut compaction_period,
            mut compaction_threshold,
            mut compaction_upper_limit,
            mut compaction_algorithm,
            mut compaction_shard_ancestor,
            mut compaction_l0_first,
            mut compaction_l0_semaphore,
            mut l0_flush_delay_threshold,
            mut l0_flush_stall_threshold,
            mut gc_horizon,
            mut gc_period,
            mut image_creation_threshold,
            mut image_layer_force_creation_period,
            mut pitr_interval,
            mut walreceiver_connect_timeout,
            mut lagging_wal_timeout,
            mut max_lsn_wal_lag,
            mut eviction_policy,
            mut min_resident_size_override,
            mut evictions_low_residence_duration_metric_threshold,
            mut heatmap_period,
            mut lazy_slru_download,
            mut timeline_get_throttle,
            mut image_layer_creation_check_threshold,
            mut image_creation_preempt_threshold,
            mut lsn_lease_length,
            mut lsn_lease_length_for_ts,
            mut timeline_offloading,
            mut rel_size_v2_enabled,
            mut gc_compaction_enabled,
            mut gc_compaction_verification,
            mut gc_compaction_initial_threshold_kb,
            mut gc_compaction_ratio_percent,
            mut sampling_ratio,
            mut relsize_snapshot_cache_capacity,
            mut basebackup_cache_enabled,
        } = self;

        // One patch application per field, in declaration order. Duration
        // fields go through `humantime::parse_duration` first, and `?` stops at
        // the first value that does not parse.
        patch.checkpoint_distance.apply(&mut checkpoint_distance);
        patch
            .checkpoint_timeout
            .map(|v| humantime::parse_duration(&v))?
            .apply(&mut checkpoint_timeout);
        patch
            .compaction_target_size
            .apply(&mut compaction_target_size);
        patch
            .compaction_period
            .map(|v| humantime::parse_duration(&v))?
            .apply(&mut compaction_period);
        patch.compaction_threshold.apply(&mut compaction_threshold);
        patch
            .compaction_upper_limit
            .apply(&mut compaction_upper_limit);
        patch.compaction_algorithm.apply(&mut compaction_algorithm);
        patch
            .compaction_shard_ancestor
            .apply(&mut compaction_shard_ancestor);
        patch.compaction_l0_first.apply(&mut compaction_l0_first);
        patch
            .compaction_l0_semaphore
            .apply(&mut compaction_l0_semaphore);
        patch
            .l0_flush_delay_threshold
            .apply(&mut l0_flush_delay_threshold);
        patch
            .l0_flush_stall_threshold
            .apply(&mut l0_flush_stall_threshold);
        patch.gc_horizon.apply(&mut gc_horizon);
        patch
            .gc_period
            .map(|v| humantime::parse_duration(&v))?
            .apply(&mut gc_period);
        patch
            .image_creation_threshold
            .apply(&mut image_creation_threshold);
        // HADRON
        patch
            .image_layer_force_creation_period
            .map(|v| humantime::parse_duration(&v))?
            .apply(&mut image_layer_force_creation_period);
        patch
            .pitr_interval
            .map(|v| humantime::parse_duration(&v))?
            .apply(&mut pitr_interval);
        patch
            .walreceiver_connect_timeout
            .map(|v| humantime::parse_duration(&v))?
            .apply(&mut walreceiver_connect_timeout);
        patch
            .lagging_wal_timeout
            .map(|v| humantime::parse_duration(&v))?
            .apply(&mut lagging_wal_timeout);
        patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag);
        patch.eviction_policy.apply(&mut eviction_policy);
        patch
            .min_resident_size_override
            .apply(&mut min_resident_size_override);
        patch
            .evictions_low_residence_duration_metric_threshold
            .map(|v| humantime::parse_duration(&v))?
            .apply(&mut evictions_low_residence_duration_metric_threshold);
        patch
            .heatmap_period
            .map(|v| humantime::parse_duration(&v))?
            .apply(&mut heatmap_period);
        patch.lazy_slru_download.apply(&mut lazy_slru_download);
        patch
            .timeline_get_throttle
            .apply(&mut timeline_get_throttle);
        patch
            .image_layer_creation_check_threshold
            .apply(&mut image_layer_creation_check_threshold);
        patch
            .image_creation_preempt_threshold
            .apply(&mut image_creation_preempt_threshold);
        patch
            .lsn_lease_length
            .map(|v| humantime::parse_duration(&v))?
            .apply(&mut lsn_lease_length);
        patch
            .lsn_lease_length_for_ts
            .map(|v| humantime::parse_duration(&v))?
            .apply(&mut lsn_lease_length_for_ts);
        patch.timeline_offloading.apply(&mut timeline_offloading);
        patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled);
        patch
            .gc_compaction_enabled
            .apply(&mut gc_compaction_enabled);
        patch
            .gc_compaction_verification
            .apply(&mut gc_compaction_verification);
        patch
            .gc_compaction_initial_threshold_kb
            .apply(&mut gc_compaction_initial_threshold_kb);
        patch
            .gc_compaction_ratio_percent
            .apply(&mut gc_compaction_ratio_percent);
        patch.sampling_ratio.apply(&mut sampling_ratio);
        patch
            .relsize_snapshot_cache_capacity
            .apply(&mut relsize_snapshot_cache_capacity);
        patch
            .basebackup_cache_enabled
            .apply(&mut basebackup_cache_enabled);

        // Reassemble the config from the (possibly updated) locals.
        Ok(Self {
            checkpoint_distance,
            checkpoint_timeout,
            compaction_target_size,
            compaction_period,
            compaction_threshold,
            compaction_upper_limit,
            compaction_algorithm,
            compaction_shard_ancestor,
            compaction_l0_first,
            compaction_l0_semaphore,
            l0_flush_delay_threshold,
            l0_flush_stall_threshold,
            gc_horizon,
            gc_period,
            image_creation_threshold,
            image_layer_force_creation_period,
            pitr_interval,
            walreceiver_connect_timeout,
            lagging_wal_timeout,
            max_lsn_wal_lag,
            eviction_policy,
            min_resident_size_override,
            evictions_low_residence_duration_metric_threshold,
            heatmap_period,
            lazy_slru_download,
            timeline_get_throttle,
            image_layer_creation_check_threshold,
            image_creation_preempt_threshold,
            lsn_lease_length,
            lsn_lease_length_for_ts,
            timeline_offloading,
            rel_size_v2_enabled,
            gc_compaction_enabled,
            gc_compaction_verification,
gc_compaction_initial_threshold_kb,
            gc_compaction_ratio_percent,
            sampling_ratio,
            relsize_snapshot_cache_capacity,
            basebackup_cache_enabled,
        })
    }

    /// Combines this config with `global_conf` into a
    /// `crate::config::TenantConfigToml`.
    ///
    /// For every field, a value set in `self` wins and the corresponding
    /// `global_conf` value is the fallback (`unwrap_or`). The fields combined
    /// with `Option::or` instead (`l0_flush_delay_threshold`,
    /// `l0_flush_stall_threshold`, `image_layer_force_creation_period`,
    /// `min_resident_size_override`) remain optional in the result.
    pub fn merge(
        &self,
        global_conf: crate::config::TenantConfigToml,
    ) -> crate::config::TenantConfigToml {
        crate::config::TenantConfigToml {
            checkpoint_distance: self
                .checkpoint_distance
                .unwrap_or(global_conf.checkpoint_distance),
            checkpoint_timeout: self
                .checkpoint_timeout
                .unwrap_or(global_conf.checkpoint_timeout),
            compaction_target_size: self
                .compaction_target_size
                .unwrap_or(global_conf.compaction_target_size),
            compaction_period: self
                .compaction_period
                .unwrap_or(global_conf.compaction_period),
            compaction_threshold: self
                .compaction_threshold
                .unwrap_or(global_conf.compaction_threshold),
            compaction_upper_limit: self
                .compaction_upper_limit
                .unwrap_or(global_conf.compaction_upper_limit),
            // `self` is only borrowed, so the chosen value is taken by
            // reference and cloned.
            compaction_algorithm: self
                .compaction_algorithm
                .as_ref()
                .unwrap_or(&global_conf.compaction_algorithm)
                .clone(),
            compaction_shard_ancestor: self
                .compaction_shard_ancestor
                .unwrap_or(global_conf.compaction_shard_ancestor),
            compaction_l0_first: self
                .compaction_l0_first
                .unwrap_or(global_conf.compaction_l0_first),
            compaction_l0_semaphore: self
                .compaction_l0_semaphore
                .unwrap_or(global_conf.compaction_l0_semaphore),
            l0_flush_delay_threshold: self
                .l0_flush_delay_threshold
                .or(global_conf.l0_flush_delay_threshold),
            l0_flush_stall_threshold: self
                .l0_flush_stall_threshold
                .or(global_conf.l0_flush_stall_threshold),
            gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
            gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
            image_creation_threshold: self
                .image_creation_threshold
                .unwrap_or(global_conf.image_creation_threshold),
            image_layer_force_creation_period: self
                .image_layer_force_creation_period
                .or(global_conf.image_layer_force_creation_period),
            pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval),
            walreceiver_connect_timeout: self
                .walreceiver_connect_timeout
                .unwrap_or(global_conf.walreceiver_connect_timeout),
            lagging_wal_timeout: self
                .lagging_wal_timeout
                .unwrap_or(global_conf.lagging_wal_timeout),
            max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
            eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy),
            min_resident_size_override: self
                .min_resident_size_override
                .or(global_conf.min_resident_size_override),
            evictions_low_residence_duration_metric_threshold: self
                .evictions_low_residence_duration_metric_threshold
                .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
            heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period),
            lazy_slru_download: self
                .lazy_slru_download
                .unwrap_or(global_conf.lazy_slru_download),
            // Cloned because `self` is only borrowed.
            timeline_get_throttle: self
                .timeline_get_throttle
                .clone()
                .unwrap_or(global_conf.timeline_get_throttle),
            image_layer_creation_check_threshold: self
                .image_layer_creation_check_threshold
                .unwrap_or(global_conf.image_layer_creation_check_threshold),
            image_creation_preempt_threshold: self
                .image_creation_preempt_threshold
                .unwrap_or(global_conf.image_creation_preempt_threshold),
            lsn_lease_length: self
                .lsn_lease_length
                .unwrap_or(global_conf.lsn_lease_length),
            lsn_lease_length_for_ts: self
                .lsn_lease_length_for_ts
                .unwrap_or(global_conf.lsn_lease_length_for_ts),
            timeline_offloading: self
                .timeline_offloading
                .unwrap_or(global_conf.timeline_offloading),
            rel_size_v2_enabled: self
                .rel_size_v2_enabled
                .unwrap_or(global_conf.rel_size_v2_enabled),
            gc_compaction_enabled: self
                .gc_compaction_enabled
                .unwrap_or(global_conf.gc_compaction_enabled),
            gc_compaction_verification: self
                .gc_compaction_verification
                .unwrap_or(global_conf.gc_compaction_verification),
            gc_compaction_initial_threshold_kb: self
                .gc_compaction_initial_threshold_kb
                .unwrap_or(global_conf.gc_compaction_initial_threshold_kb),
            gc_compaction_ratio_percent: self
                .gc_compaction_ratio_percent
                .unwrap_or(global_conf.gc_compaction_ratio_percent),
            sampling_ratio: self.sampling_ratio.unwrap_or(global_conf.sampling_ratio),
            relsize_snapshot_cache_capacity: self
                .relsize_snapshot_cache_capacity
                .unwrap_or(global_conf.relsize_snapshot_cache_capacity),
            basebackup_cache_enabled: self
                .basebackup_cache_enabled
                .unwrap_or(global_conf.basebackup_cache_enabled),
        }
    }
}

/// The policy for the aux file storage.
///
/// It can be switched through `switch_aux_file_policy` tenant config.
/// When the first aux file is written, the policy will be persisted in the
/// `index_part.json` file and has a limited migration path.
///
/// Currently, we only allow the following migration path:
///
/// Unset -> V1
///       -> V2
///       -> CrossValidation -> V2
///
/// The string form (used by `Display`/`FromStr` and, through them, by serde)
/// is kebab-case, and every variant is parsed ASCII-case-insensitively.
#[derive(
    Eq,
    PartialEq,
    Debug,
    Copy,
    Clone,
    strum_macros::EnumString,
    strum_macros::Display,
    serde_with::DeserializeFromStr,
    serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
pub enum AuxFilePolicy {
    /// V1 aux file policy: store everything in AUX_FILE_KEY
    #[strum(ascii_case_insensitive)]
    V1,
    /// V2 aux file policy: store in the AUX_FILE keyspace
    #[strum(ascii_case_insensitive)]
    V2,
    /// Cross validation runs both formats on the write path and does validation
    /// on the read path.
#[strum(ascii_case_insensitive)]
    CrossValidation,
}

/// Eviction policy of a tenant.
///
/// Serialized as an internally tagged enum: the variant name is stored under
/// the `kind` key.
///
/// NOTE(review): `OnlyImitiate` (sic) is part of the serialized form and of
/// `discriminant_str`, so correcting the spelling would change both.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum EvictionPolicy {
    NoEviction,
    LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
    OnlyImitiate(EvictionPolicyLayerAccessThreshold),
}

impl EvictionPolicy {
    /// Returns the variant name as a static string, ignoring any payload.
    pub fn discriminant_str(&self) -> &'static str {
        match self {
            EvictionPolicy::NoEviction => "NoEviction",
            EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
            EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate",
        }
    }
}

/// Compaction algorithm selector.
///
/// The string form is kebab-case (strum) and serde goes through
/// `Display`/`FromStr`.
#[derive(
    Eq,
    PartialEq,
    Debug,
    Copy,
    Clone,
    strum_macros::EnumString,
    strum_macros::Display,
    serde_with::DeserializeFromStr,
    serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
pub enum CompactionAlgorithm {
    Legacy,
    Tiered,
}

/// Image compression setting.
///
/// Serde goes through the hand-written `FromStr` and `Display` impls of this
/// type.
#[derive(
    Debug,
    Clone,
    Copy,
    PartialEq,
    Eq,
    serde_with::DeserializeFromStr,
    serde_with::SerializeDisplay,
)]
pub enum ImageCompressionAlgorithm {
    // Disabled for writes, support decompressing during read path
    Disabled,
    /// Zstandard compression. Level 0 and None mean the same (default level). Levels can be negative as well.
    /// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html).
    Zstd {
        level: Option,
    },
}

/// Parses `disabled`, `zstd`, or `zstd(<level>)`, where `<level>` must parse
/// as an `i8`.
///
/// The input is split on `(` and `)`. Only the first piece (the name) and,
/// for `zstd`, the second piece (the level) are looked at; anything after
/// them is not validated.
impl FromStr for ImageCompressionAlgorithm {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result {
        let mut components = s.split(['(', ')']);
        // `split` yields at least one (possibly empty) piece, so the
        // "empty string" error is not expected to trigger; an empty input is
        // rejected by the `invalid specifier` arm below instead.
        let first = components
            .next()
            .ok_or_else(|| anyhow::anyhow!("empty string"))?;
        match first {
            "disabled" => Ok(ImageCompressionAlgorithm::Disabled),
            "zstd" => {
                // No parenthesised part means no explicit level.
                let level = if let Some(v) = components.next() {
                    let v: i8 = v.parse()?;
                    Some(v)
                } else {
                    None
                };

                Ok(ImageCompressionAlgorithm::Zstd { level })
            }
            _ => anyhow::bail!("invalid specifier '{first}'"),
        }
    }
}

/// Formats as `disabled`, `zstd`, or `zstd(<level>)`, i.e. the syntax that
/// `from_str` accepts.
impl Display for ImageCompressionAlgorithm {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ImageCompressionAlgorithm::Disabled => write!(f, "disabled"),
            ImageCompressionAlgorithm::Zstd { level } => {
                if let Some(level) = level {
                    write!(f, "zstd({level})")
                } else {
                    write!(f, "zstd")
                }
            }
        }
    }
}

/// Wrapper struct holding the selected [`CompactionAlgorithm`] in `kind`.
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
pub struct CompactionAlgorithmSettings {
    pub kind: CompactionAlgorithm,
}

/// L0 flush configuration.
///
/// Internally tagged under the `mode` key with kebab-case variant names; the
/// fields of `Direct` use snake_case.
#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
#[serde(tag = "mode", rename_all = "kebab-case")]
pub enum L0FlushConfig {
    #[serde(rename_all = "snake_case")]
    Direct { max_concurrency: NonZeroUsize },
}

/// Payload of the threshold-based [`EvictionPolicy`] variants.
///
/// Both durations are (de)serialized through `humantime_serde`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct EvictionPolicyLayerAccessThreshold {
    #[serde(with = "humantime_serde")]
    pub period: Duration,
    #[serde(with = "humantime_serde")]
    pub threshold: Duration,
}

/// Throttle configuration.
///
/// No `deny_unknown_fields` is set, so unknown keys in the input are ignored
/// on deserialization.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct ThrottleConfig {
    /// See [`ThrottleConfigTaskKinds`] for why we do the serde `rename`.
    #[serde(rename = "task_kinds")]
    pub enabled: ThrottleConfigTaskKinds,
    pub initial: u32,
    #[serde(with = "humantime_serde")]
    pub refill_interval: Duration,
    pub refill_amount: NonZeroU32,
    pub max: u32,
}

/// Before
/// the throttle was a per `Timeline::get`/`Timeline::get_vectored` call.
/// The `task_kinds` field controlled which Pageserver "Task Kind"s
/// were subject to the throttle.
///
/// After that PR, the throttle is applied at pagestream request level
/// and the `task_kinds` field does not apply since the only task kind
/// that is subject to the throttle is that of the page service.
///
/// However, we don't want to make a breaking config change right now
/// because it means we have to migrate all the tenant configs.
/// This will be done in a future PR.
///
/// In the meantime, we use emptiness / non-emptiness of the `task_kinds`
/// field to determine if the throttle is enabled or not.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
#[serde(transparent)]
pub struct ThrottleConfigTaskKinds(Vec);

impl ThrottleConfigTaskKinds {
    /// Returns the empty list, which [`Self::is_enabled`] reports as disabled.
    pub fn disabled() -> Self {
        Self(vec![])
    }

    /// Returns `true` when the wrapped list is non-empty.
    pub fn is_enabled(&self) -> bool {
        !self.0.is_empty()
    }
}

impl ThrottleConfig {
    /// Returns a config whose `enabled` list is empty, i.e. a disabled throttle.
    pub fn disabled() -> Self {
        Self {
            enabled: ThrottleConfigTaskKinds::disabled(),
            // other values don't matter with empty `task_kinds`.
            initial: 0,
            refill_interval: Duration::from_millis(1),
            refill_amount: NonZeroU32::new(1).unwrap(),
            max: 1,
        }
    }

    /// The requests per second allowed by the given config.
    ///
    /// Computed as `refill_amount / refill_interval` (in seconds). A zero
    /// `refill_interval` is not rejected here; the floating-point division
    /// then yields infinity.
    pub fn steady_rps(&self) -> f64 {
        (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64())
    }
}

#[cfg(test)]
mod throttle_config_tests {
    use super::*;

    // The `disabled()` constructor must not count as enabled.
    #[test]
    fn test_disabled_is_disabled() {
        let config = ThrottleConfig::disabled();
        assert!(!config.enabled.is_enabled());
    }

    // A JSON document with a non-empty `task_kinds` list and an extra key
    // (`fair`) that `ThrottleConfig` does not declare must still deserialize,
    // and must count as enabled.
    #[test]
    fn test_enabled_backwards_compat() {
        let input = serde_json::json!({
            "task_kinds": ["PageRequestHandler"],
            "initial": 40000,
            "refill_interval": "50ms",
            "refill_amount": 1000,
            "max": 40000,
            "fair": true
        });
        let config: ThrottleConfig = serde_json::from_value(input).unwrap();
        assert!(config.enabled.is_enabled());
    }
}

/// A flattened analog of a `pageserver::tenant::LocationMode`, which
/// lists out all possible states (and the virtual "Detached" state)
/// in a flat form rather than using rust-style enums.
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
pub enum LocationConfigMode {
    AttachedSingle,
    AttachedMulti,
    AttachedStale,
    Secondary,
    Detached,
}

/// Configuration carried in [`LocationConfig::secondary_conf`].
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct LocationConfigSecondary {
    pub warm: bool,
}

/// An alternative representation of `pageserver::tenant::LocationConf`,
/// for use in external-facing APIs.
///
/// Fields marked `#[serde(default)]` may be omitted from the input and then
/// take their type's default value.
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct LocationConfig {
    pub mode: LocationConfigMode,
    /// If attaching, in what generation?
    #[serde(default)]
    pub generation: Option,

    // If requesting mode `Secondary`, configuration for that.
    #[serde(default)]
    pub secondary_conf: Option,

    // Shard parameters: if shard_count is nonzero, then other shard_* fields
    // must be set accurately.
    #[serde(default)]
    pub shard_number: u8,
    #[serde(default)]
    pub shard_count: u8,
    #[serde(default)]
    pub shard_stripe_size: u32,

    // This configuration only affects attached mode, but should be provided irrespective
    // of the mode, as a secondary location might transition on startup if the response
    // to the `/re-attach` control plane API requests it.
pub tenant_conf: TenantConfig,
}

/// Response body listing tenant shards together with an optional per-shard
/// value (see `tenant_shards`).
#[derive(Serialize, Deserialize)]
pub struct LocationConfigListResponse {
    pub tenant_shards: Vec<(TenantShardId, Option)>,
}

/// Serialize-only response carrying a node id.
#[derive(Serialize)]
pub struct StatusResponse {
    pub id: NodeId,
}

/// Request wrapper around a flattened [`LocationConfig`].
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigRequest {
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
}

/// Time-travel request body; unknown fields are rejected.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantTimeTravelRequest {
    pub shard_counts: Vec,
}

/// Pairs a tenant shard with a node id; unknown fields are rejected.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantShardLocation {
    pub shard_id: TenantShardId,
    pub node_id: NodeId,
}

/// Response listing shards plus an optional stripe size; unknown fields are
/// rejected.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigResponse {
    pub shards: Vec,
    // If the shards' ShardCount count is >1, stripe_size will be set.
    pub stripe_size: Option,
}

/// A tenant id plus a [`TenantConfig`] whose fields are flattened into the
/// same object.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantConfigRequest {
    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
}

/// Dereferences to the wrapped [`TenantConfig`], so its fields can be read
/// directly through a `TenantConfigRequest`.
impl std::ops::Deref for TenantConfigRequest {
    type Target = TenantConfig;

    fn deref(&self) -> &Self::Target {
        &self.config
    }
}

impl TenantConfigRequest {
    /// Creates a request for `tenant_id` carrying `TenantConfig::default()`.
    pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
        let config = TenantConfig::default();
        TenantConfigRequest { tenant_id, config }
    }
}

/// A tenant id plus a `TenantConfigPatch` whose fields are flattened into
/// the same object.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantConfigPatchRequest {
    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: TenantConfigPatch, // as we have a flattened field, we should reject all unknown fields in it
}

/// Wait-for-LSN request: a flattened `timelines` map next to a `timeout`.
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantWaitLsnRequest {
    #[serde(flatten)]
    pub timelines: HashMap,
    pub timeout: Duration,
}

/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
///
/// Adjacently tagged: the snake_case variant name is stored under `slug` and
/// any variant payload under `data`.
#[derive(Serialize, Deserialize, Clone)]
#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
pub enum TenantAttachmentStatus {
    Maybe,
    Attached,
    Failed { reason: String },
}

/// Per-tenant-shard summary.
#[derive(Serialize, Deserialize, Clone)]
pub struct TenantInfo {
    pub id: TenantShardId,
    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint

    pub attachment_status: TenantAttachmentStatus,
    pub generation: u32,

    /// Opaque explanation if gc is being blocked.
    ///
    /// Only looked up for the individual tenant detail, not the listing.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gc_blocking: Option,
}

/// [`TenantInfo`] (flattened into the same object) plus walredo status and
/// timelines.
#[derive(Serialize, Deserialize, Clone)]
pub struct TenantDetails {
    #[serde(flatten)]
    pub tenant_info: TenantInfo,

    pub walredo: Option,

    pub timelines: Vec,
}

/// Archival state of a timeline.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)]
pub enum TimelineArchivalState {
    Archived,
    Unarchived,
}

/// Visibility state of a timeline.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
pub enum TimelineVisibilityState {
    Visible,
    Invisible,
}

/// Request body carrying the desired [`TimelineArchivalState`].
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct TimelineArchivalConfigRequest {
    pub state: TimelineArchivalState,
}

/// Request body for patching index-part values; `force_index_update`
/// defaults to `false` when omitted.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct TimelinePatchIndexPartRequest {
    pub rel_size_migration: Option,
    pub rel_size_migrated_at: Option,
    pub gc_compaction_last_completed_lsn: Option,
    pub applied_gc_cutoff_lsn: Option,
    #[serde(default)]
    pub force_index_update: bool,
}

/// Two lists side by side: `timelines` and `offloaded`.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelinesInfoAndOffloaded {
    pub timelines: Vec,
    pub offloaded: Vec,
}

/// Analog of [`TimelineInfo`] for offloaded timelines.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct OffloadedTimelineInfo {
    pub tenant_id: TenantShardId,
    pub timeline_id: TimelineId,
    /// Whether the timeline has a parent it has been branched off from or not
    pub ancestor_timeline_id: Option,
    /// Whether to retain the branch lsn at the ancestor or not
    pub ancestor_retain_lsn: Option,
    /// The time point when the timeline was archived
    pub archived_at: chrono::DateTime,
}

/// State of the rel_size format migration. Variants serialize in camelCase.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum RelSizeMigration {
    /// The tenant is using the old rel_size format.
    /// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
    /// `None` is the same as `Some(RelSizeMigration::Legacy)`.
    Legacy,
    /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
    /// persisted in the storage. The read path will read both formats and validate them.
    Migrating,
    /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
    /// in the storage, and the read path will not read the old format.
    Migrated,
}

/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
    pub tenant_id: TenantShardId,
    pub timeline_id: TimelineId,

    pub ancestor_timeline_id: Option,
    pub ancestor_lsn: Option,
    pub last_record_lsn: Lsn,
    pub prev_record_lsn: Option,

    /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients.
    /// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead,
    /// as it is easier to reason about.
    #[serde(default)]
    pub applied_gc_cutoff_lsn: Lsn,

    /// The upper bound of data which is either already GC'ed, or eligible to be GC'ed at any time based on PITR interval.
    /// This LSN represents the "end of history" for this timeline, and callers should use it to figure out the oldest
    /// LSN at which it is legal to create a branch or ephemeral endpoint.
    ///
    /// Note that holders of valid LSN leases may be able to create branches and read pages earlier
    /// than this LSN, but new leases may not be taken out earlier than this LSN.
    #[serde(default)]
    pub min_readable_lsn: Lsn,

    pub disk_consistent_lsn: Lsn,

    /// The LSN that we have successfully uploaded to remote storage
    pub remote_consistent_lsn: Lsn,

    /// The LSN that we are advertising to safekeepers
    pub remote_consistent_lsn_visible: Lsn,

    /// The LSN from the start of the root timeline (never changes)
    pub initdb_lsn: Lsn,

    pub current_logical_size: u64,
    pub current_logical_size_is_accurate: bool,

    pub directory_entries_counts: Vec,

    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option, // is None when timeline is Unloaded
    pub current_logical_size_non_incremental: Option,

    /// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes
    /// beyond the branch's branch point, we only count up to the branch point.
    pub pitr_history_size: u64,

    /// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any
    /// ancestor data used by this branch would have been retained anyway). If this is false, then
    /// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would
    /// otherwise be able to GC.
    pub within_ancestor_pitr: bool,

    pub timeline_dir_layer_file_size_sum: Option,

    pub wal_source_connstr: Option,
    pub last_received_msg_lsn: Option,
    /// the timestamp (in microseconds) of the last received message
    pub last_received_msg_ts: Option,
    pub pg_version: PgMajorVersion,

    pub state: TimelineState,

    pub walreceiver_status: String,

    // ALWAYS add new fields at the end of the struct with `Option` to ensure forward/backward compatibility.
    // Backward compatibility: you will get a JSON not containing the newly-added field.
    // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
    // not deny unknown fields by default so it's safe to set the field to some value, though it won't be
    // read.
    /// Whether the timeline is archived.
    pub is_archived: Option,

    /// The status of the rel_size migration.
    pub rel_size_migration: Option,
    pub rel_size_migrated_at: Option,

    /// Whether the timeline is invisible in synthetic size calculations.
    pub is_invisible: Option,
    // HADRON: the largest LSN below which all page updates have been included in the image layers.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_consistent_lsn: Option,
}

/// Layer map listing, split into in-memory and historic layers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerMapInfo {
    pub in_memory_layers: Vec,
    pub historic_layers: Vec,
}

/// The residence status of a layer
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum LayerResidenceStatus {
    /// Residence status for a layer file that exists locally.
    /// It may also exist on the remote, we don't care here.
    Resident,
    /// Residence status for a layer file that only exists on the remote.
Evicted,
}

/// Access statistics of a layer.
///
/// Both timestamps are (de)serialized via `serde_with::TimestampMilliSeconds`.
#[serde_as]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerAccessStats {
    #[serde_as(as = "serde_with::TimestampMilliSeconds")]
    pub access_time: SystemTime,

    #[serde_as(as = "serde_with::TimestampMilliSeconds")]
    pub residence_time: SystemTime,

    pub visible: bool,
}

/// Description of an in-memory layer; internally tagged under `kind`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum InMemoryLayerInfo {
    Open { lsn_start: Lsn },
    Frozen { lsn_start: Lsn, lsn_end: Lsn },
}

/// Description of a historic layer; internally tagged under `kind`.
///
/// Both variants carry `layer_file_name`, `layer_file_size`, `lsn_start`,
/// `remote` and `access_stats`; `Delta` additionally has `lsn_end` and `l0`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum HistoricLayerInfo {
    Delta {
        layer_file_name: String,
        layer_file_size: u64,

        lsn_start: Lsn,
        lsn_end: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,

        l0: bool,
    },
    Image {
        layer_file_name: String,
        layer_file_size: u64,

        lsn_start: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
    },
}

/// Accessors for the fields shared by both variants.
impl HistoricLayerInfo {
    /// Returns the `layer_file_name` of either variant.
    pub fn layer_file_name(&self) -> &str {
        match self {
            HistoricLayerInfo::Delta {
                layer_file_name, ..
            } => layer_file_name,
            HistoricLayerInfo::Image {
                layer_file_name, ..
            } => layer_file_name,
        }
    }

    /// Returns the `remote` flag of either variant.
    pub fn is_remote(&self) -> bool {
        match self {
            HistoricLayerInfo::Delta { remote, .. } => *remote,
            HistoricLayerInfo::Image { remote, .. } => *remote,
        }
    }

    /// Overwrites the `remote` flag of either variant with `value`.
    pub fn set_remote(&mut self, value: bool) {
        // Borrow the flag mutably from whichever variant this is.
        let field = match self {
            HistoricLayerInfo::Delta { remote, .. } => remote,
            HistoricLayerInfo::Image { remote, .. } => remote,
        };
        *field = value;
    }

    /// Returns the `layer_file_size` of either variant.
    pub fn layer_file_size(&self) -> u64 {
        match self {
            HistoricLayerInfo::Delta {
                layer_file_size, ..
            } => *layer_file_size,
            HistoricLayerInfo::Image {
                layer_file_size, ..
            } => *layer_file_size,
        }
    }
}

/// Request body carrying the maximum number of concurrent downloads.
#[derive(Debug, Serialize, Deserialize)]
pub struct DownloadRemoteLayersTaskSpawnRequest {
    pub max_concurrent_downloads: NonZeroUsize,
}

/// Request body carrying a map of aux files.
#[derive(Debug, Serialize, Deserialize)]
pub struct IngestAuxFilesRequest {
    pub aux_files: HashMap,
}

/// Request body carrying an LSN.
#[derive(Debug, Serialize, Deserialize)]
pub struct ListAuxFilesRequest {
    pub lsn: Lsn,
}

/// Status of a remote-layer download task: its state and layer counters.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct DownloadRemoteLayersTaskInfo {
    pub task_id: String,
    pub state: DownloadRemoteLayersTaskState,
    pub total_layer_count: u64,         // stable once `completed`
    pub successful_download_count: u64, // stable once `completed`
    pub failed_download_count: u64,     // stable once `completed`
}

/// State of a remote-layer download task.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub enum DownloadRemoteLayersTaskState {
    Running,
    Completed,
    ShutDown,
}

/// GC request body with an optional `gc_horizon`.
#[derive(Debug, Serialize, Deserialize)]
pub struct TimelineGcRequest {
    pub gc_horizon: Option,
}

/// Status of a walredo process, identified by its pid.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerProcessStatus {
    pub pid: u32,
}

/// Status of a walredo manager: optional last redo time and optional process
/// status.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerStatus {
    pub last_redo_at: Option>,
    pub process: Option,
}

/// The progress of a secondary tenant.
///
/// It is mostly useful when doing a long running download: e.g. initiating
/// a download job, timing out while waiting for it to run, and then inspecting this status to understand
/// what's happening.
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
pub struct SecondaryProgress {
    /// The remote storage LastModified time of the heatmap object we last downloaded.
pub heatmap_mtime: Option,

    /// The number of layers currently on-disk
    pub layers_downloaded: usize,

    /// The number of layers in the most recently seen heatmap
    pub layers_total: usize,

    /// The number of layer bytes currently on-disk
    pub bytes_downloaded: u64,

    /// The number of layer bytes in the most recently seen heatmap
    pub bytes_total: u64,
}

// NOTE(review): the `Option` / `Vec` fields below have no type arguments in this copy of the
// file; they appear to have been lost during extraction and are left untouched.

/// One shard entry of a remote-storage tenant scan.
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantScanRemoteStorageShard {
    pub tenant_shard_id: TenantShardId,
    pub generation: Option,
    pub stripe_size: Option,
}

/// Result of a remote-storage tenant scan: the list of shards found.
#[derive(Serialize, Deserialize, Debug, Default)]
pub struct TenantScanRemoteStorageResponse {
    pub shards: Vec,
}

/// Ordering metric for top-tenant queries; serialized in `snake_case`.
#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(rename_all = "snake_case")]
pub enum TenantSorting {
    /// Total size of layers on local disk for all timelines in a shard.
    ResidentSize,
    /// The logical size of the largest timeline within a _tenant_ (not shard). Only tracked on
    /// shard 0, contains the sum across all shards.
    MaxLogicalSize,
    /// The logical size of the largest timeline within a _tenant_ (not shard), divided by number of
    /// shards. Only tracked on shard 0, and estimates the per-shard logical size.
    MaxLogicalSizePerShard,
}

/// The default ordering is by resident size.
impl Default for TenantSorting {
    fn default() -> Self {
        Self::ResidentSize
    }
}

#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct TopTenantShardsRequest {
    // How would you like to sort the tenants?
    pub order_by: TenantSorting,

    // How many results?
    pub limit: usize,

    // Omit tenants with more than this many shards (e.g. if this is the max number of shards
    // that the caller would ever split to)
    pub where_shards_lt: Option,

    // Omit tenants where the ordering metric is less than this (this is an optimization to
    // let us quickly exclude numerous tiny shards)
    pub where_gt: Option,
}

#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
pub struct TopTenantShardItem {
    pub id: TenantShardId,

    /// Total size of layers on local disk for all timelines in this shard.
    pub resident_size: u64,

    /// Total size of layers in remote storage for all timelines in this shard.
    pub physical_size: u64,

    /// The largest logical size of a timeline within this _tenant_ (not shard). This is only
    /// tracked on shard 0, and contains the sum of the logical size across all shards.
    pub max_logical_size: u64,

    /// The largest logical size of a timeline within this _tenant_ (not shard) divided by number of
    /// shards. This is only tracked on shard 0, and is only an estimate as we divide it evenly by
    /// shard count, rounded up.
    pub max_logical_size_per_shard: u64,
}

#[derive(Serialize, Deserialize, Debug, Default)]
pub struct TopTenantShardsResponse {
    pub shards: Vec,
}

pub mod virtual_file {
    /// IO engine selection; parsed from and displayed as kebab-case strings via strum, and
    /// (de)serialized through those string forms.
    #[derive(
        Copy,
        Clone,
        PartialEq,
        Eq,
        Hash,
        strum_macros::EnumString,
        strum_macros::Display,
        serde_with::DeserializeFromStr,
        serde_with::SerializeDisplay,
        Debug,
    )]
    #[strum(serialize_all = "kebab-case")]
    pub enum IoEngineKind {
        StdFs,
        // Only compiled on Linux targets.
        #[cfg(target_os = "linux")]
        TokioEpollUring,
    }

    /// Direct IO modes for a pageserver.
    #[derive(
        Copy,
        Clone,
        PartialEq,
        Eq,
        Hash,
        strum_macros::EnumString,
        strum_macros::EnumIter,
        strum_macros::Display,
        serde_with::DeserializeFromStr,
        serde_with::SerializeDisplay,
        Debug,
    )]
    #[strum(serialize_all = "kebab-case")]
    #[repr(u8)]
    pub enum IoMode {
        /// Uses buffered IO.
        Buffered,
        /// Uses direct IO for reads only.
        Direct,
        /// Use direct IO for reads and writes.
DirectRw,
    }

    impl IoMode {
        /// The preferred IO mode: direct IO for both reads and writes.
        pub fn preferred() -> Self {
            IoMode::DirectRw
        }
    }

    // NOTE(review): the type arguments of `TryFrom` / `Result` below (and of a few `Option`s and
    // turbofish calls further down) appear to have been lost during extraction.
    /// Maps a raw `u8` discriminant back to an [`IoMode`]; an unknown value is returned
    /// unchanged as the error.
    impl TryFrom for IoMode {
        type Error = u8;

        fn try_from(value: u8) -> Result {
            Ok(match value {
                v if v == (IoMode::Buffered as u8) => IoMode::Buffered,
                v if v == (IoMode::Direct as u8) => IoMode::Direct,
                v if v == (IoMode::DirectRw as u8) => IoMode::DirectRw,
                x => return Err(x),
            })
        }
    }
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanDisposableKeysResponse {
    pub disposable_count: usize,
    pub not_disposable_count: usize,
}

// This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
// that require pageserver-internal types.  It is sufficient to get the total size.
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantHistorySize {
    pub id: TenantId,
    /// Size is a mixture of WAL and logical size, so the unit is bytes.
    ///
    /// Will be none if `?inputs_only=true` was given.
    pub size: Option,
}

#[derive(Debug, Serialize, Deserialize)]
pub struct PageTraceEvent {
    pub key: CompactKey,
    pub effective_lsn: Lsn,
    pub time: SystemTime,
}

/// Default event: default key and LSN, timestamped at the Unix epoch.
impl Default for PageTraceEvent {
    fn default() -> Self {
        Self {
            key: Default::default(),
            effective_lsn: Default::default(),
            time: std::time::UNIX_EPOCH,
        }
    }
}

#[cfg(test)]
mod tests {
    use std::str::FromStr;

    use serde_json::json;

    use super::*;

    #[test]
    fn test_tenantinfo_serde() {
        // Test serialization/deserialization of TenantInfo
        let original_active = TenantInfo {
            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Active,
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
            gc_blocking: None,
        };
        // `gc_blocking: None` is expected to be absent from the JSON.
        let expected_active = json!({
            "id": original_active.id.to_string(),
            "state": {
                "slug": "Active",
            },
            "current_physical_size": 42,
            "attachment_status": {
                "slug":"attached",
            },
            "generation" : 1
        });

        let original_broken = TenantInfo {
            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Broken {
                reason: "reason".into(),
                backtrace: "backtrace info".into(),
            },
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
            gc_blocking: None,
        };
        let expected_broken = json!({
            "id": original_broken.id.to_string(),
            "state": {
                "slug": "Broken",
                "data": {
                    "backtrace": "backtrace info",
                    "reason": "reason",
                }
            },
            "current_physical_size": 42,
            "attachment_status": {
                "slug":"attached",
            },
            "generation" : 1
        });

        assert_eq!(
            serde_json::to_value(&original_active).unwrap(),
            expected_active
        );

        assert_eq!(
            serde_json::to_value(&original_broken).unwrap(),
            expected_broken
        );
        // The Debug rendering of a broken state must include both the reason and the backtrace.
        assert!(format!("{:?}", &original_broken.state).contains("reason"));
        assert!(format!("{:?}", &original_broken.state).contains("backtrace info"));
    }

    #[test]
    fn test_reject_unknown_field() {
        let id = TenantId::generate();
        let config_request = json!({
            "tenant_id": id.to_string(),
            "unknown_field": "unknown_value".to_string(),
        });
        let err = serde_json::from_value::(config_request).unwrap_err();
        assert!(
            err.to_string().contains("unknown field `unknown_field`"),
            "expect unknown field `unknown_field` error, got: {err}"
        );
    }

    #[test]
    fn tenantstatus_activating_serde() {
        let states = [TenantState::Activating(ActivatingFrom::Attaching)];
        let expected = "[{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";

        let actual = serde_json::to_string(&states).unwrap();

        assert_eq!(actual, expected);

        // Round-trip: the serialized form must parse back to the same states.
        let parsed = serde_json::from_str::>(&actual).unwrap();

        assert_eq!(states.as_slice(), &parsed);
    }

    #[test]
    fn tenantstatus_activating_strum() {
        // tests added, because we use these for metrics
        let examples = [
            (line!(), TenantState::Attaching, "Attaching"),
            (
                line!(),
                TenantState::Activating(ActivatingFrom::Attaching),
                "Activating",
            ),
            (line!(), TenantState::Active, "Active"),
            (
                line!(),
                TenantState::Stopping { progress: None },
                "Stopping",
            ),
            (
                line!(),
                TenantState::Stopping {
                    progress: Some(completion::Barrier::default()),
                },
                "Stopping",
            ),
            (
                line!(),
                TenantState::Broken {
                    reason: "Example".into(),
                    backtrace: "Looooong backtrace".into(),
                },
                "Broken",
            ),
        ];

        // `line!()` is recorded per example so a failure points at the offending entry.
        for (line, rendered, expected) in examples {
            let actual: &'static str = rendered.into();
            assert_eq!(actual, expected, "example on {line}");
        }
    }

    #[test]
    fn test_image_compression_algorithm_parsing() {
        use ImageCompressionAlgorithm::*;
        let cases = [
            ("disabled", Disabled),
            ("zstd", Zstd { level: None }),
            ("zstd(18)", Zstd { level: Some(18) }),
            ("zstd(-3)", Zstd { level: Some(-3) }),
        ];

        for (display, expected) in cases {
            assert_eq!(
                ImageCompressionAlgorithm::from_str(display).unwrap(),
                expected,
                "parsing works"
            );
            assert_eq!(format!("{expected}"), display, "Display FromStr roundtrip");

            let ser = serde_json::to_string(&expected).expect("serialization");
            assert_eq!(
                serde_json::from_str::(&ser).unwrap(),
                expected,
                "serde roundtrip"
            );

            assert_eq!(
                serde_json::Value::String(display.to_string()),
                serde_json::to_value(expected).unwrap(),
                "Display is the serde serialization"
            );
        }
    }

    #[test]
    fn test_tenant_config_patch_request_serde() {
        let patch_request = TenantConfigPatchRequest {
            tenant_id: TenantId::from_str("17c6d121946a61e5ab0fe5a2fd4d8215").unwrap(),
            config: TenantConfigPatch {
                checkpoint_distance: FieldPatch::Upsert(42),
                gc_horizon: FieldPatch::Remove,
                compaction_threshold: FieldPatch::Noop,
                ..TenantConfigPatch::default()
            },
        };

        let json = serde_json::to_string(&patch_request).unwrap();

        // Upsert serializes as the value, Remove as `null`, and Noop fields are omitted.
        let expected = r#"{"tenant_id":"17c6d121946a61e5ab0fe5a2fd4d8215","checkpoint_distance":42,"gc_horizon":null}"#;
        assert_eq!(json, expected);

        let decoded: TenantConfigPatchRequest = serde_json::from_str(&json).unwrap();
        assert_eq!(decoded.tenant_id, patch_request.tenant_id);
        assert_eq!(decoded.config, patch_request.config);

        // Now apply the patch to a config to demonstrate semantics

        let base = TenantConfig {
            checkpoint_distance: Some(28),
            gc_horizon: Some(100),
            compaction_target_size: Some(1024),
            ..Default::default()
        };

        let expected = TenantConfig {
            checkpoint_distance: Some(42),
            gc_horizon: None,
            ..base.clone()
        };

        let patched = base.apply_patch(decoded.config).unwrap();
assert_eq!(patched, expected);
    }
}

================================================
FILE: libs/pageserver_api/src/pagestream_api.rs
================================================
//! Rust definitions of the libpq-based pagestream API
//!
//! See also the C implementation of the same API in pgxn/neon/pagestore_client.h

use std::io::{BufRead, Read};

use crate::reltag::RelTag;

use byteorder::{BigEndian, ReadBytesExt};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use utils::lsn::Lsn;

/// Block size.
///
/// XXX: We assume 8k block size in the SLRU fetch API. It's not great to hardcode
/// that in the protocol, because Postgres supports different block sizes as a compile
/// time option.
const BLCKSZ: usize = 8192;

/// Requests sent from the compute to the pageserver.
// Wrapped in libpq CopyData
#[derive(PartialEq, Eq, Debug)]
pub enum PagestreamFeMessage {
    Exists(PagestreamExistsRequest),
    Nblocks(PagestreamNblocksRequest),
    GetPage(PagestreamGetPageRequest),
    DbSize(PagestreamDbSizeRequest),
    GetSlruSegment(PagestreamGetSlruSegmentRequest),
    #[cfg(feature = "testing")]
    Test(PagestreamTestRequest),
}

/// Responses sent from the pageserver to the compute.
// Wrapped in libpq CopyData
#[derive(Debug, strum_macros::EnumProperty)]
pub enum PagestreamBeMessage {
    Exists(PagestreamExistsResponse),
    Nblocks(PagestreamNblocksResponse),
    GetPage(PagestreamGetPageResponse),
    Error(PagestreamErrorResponse),
    DbSize(PagestreamDbSizeResponse),
    GetSlruSegment(PagestreamGetSlruSegmentResponse),
    #[cfg(feature = "testing")]
    Test(PagestreamTestResponse),
}

// Keep in sync with `pagestore_client.h`
#[repr(u8)]
enum PagestreamFeMessageTag {
    Exists = 0,
    Nblocks = 1,
    GetPage = 2,
    DbSize = 3,
    GetSlruSegment = 4,
    /* future tags above this line */
    /// For testing purposes, not available in production.
    #[cfg(feature = "testing")]
    Test = 99,
}

// Keep in sync with `pagestore_client.h`
#[repr(u8)]
enum PagestreamBeMessageTag {
    Exists = 100,
    Nblocks = 101,
    GetPage = 102,
    Error = 103,
    DbSize = 104,
    GetSlruSegment = 105,
    /* future tags above this line */
    /// For testing purposes, not available in production.
    #[cfg(feature = "testing")]
    Test = 199,
}

/// Decodes a request tag byte; an unknown byte is handed back unchanged as the error.
impl TryFrom<u8> for PagestreamFeMessageTag {
    type Error = u8;

    fn try_from(value: u8) -> Result<Self, Self::Error> {
        match value {
            0 => Ok(PagestreamFeMessageTag::Exists),
            1 => Ok(PagestreamFeMessageTag::Nblocks),
            2 => Ok(PagestreamFeMessageTag::GetPage),
            3 => Ok(PagestreamFeMessageTag::DbSize),
            4 => Ok(PagestreamFeMessageTag::GetSlruSegment),
            #[cfg(feature = "testing")]
            99 => Ok(PagestreamFeMessageTag::Test),
            _ => Err(value),
        }
    }
}

/// Decodes a response tag byte; an unknown byte is handed back unchanged as the error.
impl TryFrom<u8> for PagestreamBeMessageTag {
    type Error = u8;

    fn try_from(value: u8) -> Result<Self, Self::Error> {
        match value {
            100 => Ok(PagestreamBeMessageTag::Exists),
            101 => Ok(PagestreamBeMessageTag::Nblocks),
            102 => Ok(PagestreamBeMessageTag::GetPage),
            103 => Ok(PagestreamBeMessageTag::Error),
            104 => Ok(PagestreamBeMessageTag::DbSize),
            105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
            #[cfg(feature = "testing")]
            199 => Ok(PagestreamBeMessageTag::Test),
            _ => Err(value),
        }
    }
}

// A GetPage request contains two LSN values:
//
// request_lsn: Get the page version at this point in time.  Lsn::Max is a special value that means
// "get the latest version present". It's used by the primary server, which knows that no one else
// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
// Lsn::Max. Standby servers use the current replay LSN as the request LSN.
//
// not_modified_since: Hint to the pageserver that the client knows that the page has not been
// modified between 'not_modified_since' and the request LSN. It's always correct to set
// 'not_modified_since' equal to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
// request without waiting for 'request_lsn' to arrive.
//
// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag.
// The V1 interface was sufficient for the primary; the 'lsn' was equivalent to the
// 'not_modified_since' value, and 'latest' was set to true. The V2 interface was added because
// there was no correct way for a standby to request a page at a particular non-latest LSN, and
// also include the 'not_modified_since' hint. That led to an awkward choice of either using an old
// LSN in the request, if the standby knows that the page hasn't been modified since, and risk
// getting an error if that LSN has fallen behind the GC horizon, or requesting the current replay
// LSN, which could require the pageserver to wait unnecessarily for the WAL to arrive up to that
// point. The new V2 interface allows sending both LSNs, and lets the pageserver do the right
// thing. There was no difference in the responses between V1 and V2.
//
// Version 3 of the protocol adds a request ID to all requests. The request ID is also included in
// the response, together with the other fields of the request, which makes it possible to verify
// that a response belongs to our request. We copy the request fields into the response to make
// this check more reliable: the request ID is formed from the process ID and a local counter, so
// in principle there can be duplicate request IDs if a process ID is reused.
// #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub enum PagestreamProtocolVersion { V2, V3, } pub type RequestId = u64; #[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] pub struct PagestreamRequest { pub reqid: RequestId, pub request_lsn: Lsn, pub not_modified_since: Lsn, } #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamExistsRequest { pub hdr: PagestreamRequest, pub rel: RelTag, } #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamNblocksRequest { pub hdr: PagestreamRequest, pub rel: RelTag, } #[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] pub struct PagestreamGetPageRequest { pub hdr: PagestreamRequest, pub rel: RelTag, pub blkno: u32, } #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamDbSizeRequest { pub hdr: PagestreamRequest, pub dbnode: u32, } #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamGetSlruSegmentRequest { pub hdr: PagestreamRequest, pub kind: u8, pub segno: u32, } #[derive(Debug)] pub struct PagestreamExistsResponse { pub req: PagestreamExistsRequest, pub exists: bool, } #[derive(Debug)] pub struct PagestreamNblocksResponse { pub req: PagestreamNblocksRequest, pub n_blocks: u32, } #[derive(Debug)] pub struct PagestreamGetPageResponse { pub req: PagestreamGetPageRequest, pub page: Bytes, } #[derive(Debug)] pub struct PagestreamGetSlruSegmentResponse { pub req: PagestreamGetSlruSegmentRequest, pub segment: Bytes, } #[derive(Debug)] pub struct PagestreamErrorResponse { pub req: PagestreamRequest, pub message: String, } #[derive(Debug)] pub struct PagestreamDbSizeResponse { pub req: PagestreamDbSizeRequest, pub db_size: i64, } #[cfg(feature = "testing")] #[derive(Debug, PartialEq, Eq, Clone)] pub struct PagestreamTestRequest { pub hdr: PagestreamRequest, pub batch_key: u64, pub message: String, } #[cfg(feature = "testing")] #[derive(Debug)] pub struct PagestreamTestResponse { pub req: PagestreamTestRequest, } impl PagestreamFeMessage { /// Serialize a compute -> pageserver 
message. This is currently only used in testing /// tools. Always uses protocol version 3. pub fn serialize(&self) -> Bytes { let mut bytes = BytesMut::new(); match self { Self::Exists(req) => { bytes.put_u8(PagestreamFeMessageTag::Exists as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); bytes.put_u8(req.rel.forknum); } Self::Nblocks(req) => { bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); bytes.put_u8(req.rel.forknum); } Self::GetPage(req) => { bytes.put_u8(PagestreamFeMessageTag::GetPage as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); bytes.put_u8(req.rel.forknum); bytes.put_u32(req.blkno); } Self::DbSize(req) => { bytes.put_u8(PagestreamFeMessageTag::DbSize as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u32(req.dbnode); } Self::GetSlruSegment(req) => { bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u8(req.kind); bytes.put_u32(req.segno); } #[cfg(feature = "testing")] Self::Test(req) => { bytes.put_u8(PagestreamFeMessageTag::Test as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u64(req.batch_key); let message = req.message.as_bytes(); bytes.put_u64(message.len() as u64); bytes.put_slice(message); } } bytes.into() } pub fn parse( 
body: &mut R, protocol_version: PagestreamProtocolVersion, ) -> anyhow::Result { // these correspond to the NeonMessageTag enum in pagestore_client.h // // TODO: consider using protobuf or serde bincode for less error prone // serialization. let msg_tag = body.read_u8()?; let (reqid, request_lsn, not_modified_since) = match protocol_version { PagestreamProtocolVersion::V2 => ( 0, Lsn::from(body.read_u64::()?), Lsn::from(body.read_u64::()?), ), PagestreamProtocolVersion::V3 => ( body.read_u64::()?, Lsn::from(body.read_u64::()?), Lsn::from(body.read_u64::()?), ), }; match PagestreamFeMessageTag::try_from(msg_tag) .map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? { PagestreamFeMessageTag::Exists => { Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { hdr: PagestreamRequest { reqid, request_lsn, not_modified_since, }, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, relnode: body.read_u32::()?, forknum: body.read_u8()?, }, })) } PagestreamFeMessageTag::Nblocks => { Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { hdr: PagestreamRequest { reqid, request_lsn, not_modified_since, }, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, relnode: body.read_u32::()?, forknum: body.read_u8()?, }, })) } PagestreamFeMessageTag::GetPage => { Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { hdr: PagestreamRequest { reqid, request_lsn, not_modified_since, }, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, relnode: body.read_u32::()?, forknum: body.read_u8()?, }, blkno: body.read_u32::()?, })) } PagestreamFeMessageTag::DbSize => { Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { hdr: PagestreamRequest { reqid, request_lsn, not_modified_since, }, dbnode: body.read_u32::()?, })) } PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment( PagestreamGetSlruSegmentRequest { hdr: PagestreamRequest { reqid, request_lsn, not_modified_since, }, kind: 
body.read_u8()?, segno: body.read_u32::()?, }, )), #[cfg(feature = "testing")] PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest { hdr: PagestreamRequest { reqid, request_lsn, not_modified_since, }, batch_key: body.read_u64::()?, message: { let len = body.read_u64::()?; let mut buf = vec![0; len as usize]; body.read_exact(&mut buf)?; String::from_utf8(buf)? }, })), } } } impl PagestreamBeMessage { pub fn serialize(&self, protocol_version: PagestreamProtocolVersion) -> Bytes { let mut bytes = BytesMut::new(); use PagestreamBeMessageTag as Tag; match protocol_version { PagestreamProtocolVersion::V2 => { match self { Self::Exists(resp) => { bytes.put_u8(Tag::Exists as u8); bytes.put_u8(resp.exists as u8); } Self::Nblocks(resp) => { bytes.put_u8(Tag::Nblocks as u8); bytes.put_u32(resp.n_blocks); } Self::GetPage(resp) => { bytes.put_u8(Tag::GetPage as u8); bytes.put(&resp.page[..]) } Self::Error(resp) => { bytes.put_u8(Tag::Error as u8); bytes.put(resp.message.as_bytes()); bytes.put_u8(0); // null terminator } Self::DbSize(resp) => { bytes.put_u8(Tag::DbSize as u8); bytes.put_i64(resp.db_size); } Self::GetSlruSegment(resp) => { bytes.put_u8(Tag::GetSlruSegment as u8); bytes.put_u32((resp.segment.len() / BLCKSZ) as u32); bytes.put(&resp.segment[..]); } #[cfg(feature = "testing")] Self::Test(resp) => { bytes.put_u8(Tag::Test as u8); bytes.put_u64(resp.req.batch_key); let message = resp.req.message.as_bytes(); bytes.put_u64(message.len() as u64); bytes.put_slice(message); } } } PagestreamProtocolVersion::V3 => { match self { Self::Exists(resp) => { bytes.put_u8(Tag::Exists as u8); bytes.put_u64(resp.req.hdr.reqid); bytes.put_u64(resp.req.hdr.request_lsn.0); bytes.put_u64(resp.req.hdr.not_modified_since.0); bytes.put_u32(resp.req.rel.spcnode); bytes.put_u32(resp.req.rel.dbnode); bytes.put_u32(resp.req.rel.relnode); bytes.put_u8(resp.req.rel.forknum); bytes.put_u8(resp.exists as u8); } Self::Nblocks(resp) => { bytes.put_u8(Tag::Nblocks as 
u8); bytes.put_u64(resp.req.hdr.reqid); bytes.put_u64(resp.req.hdr.request_lsn.0); bytes.put_u64(resp.req.hdr.not_modified_since.0); bytes.put_u32(resp.req.rel.spcnode); bytes.put_u32(resp.req.rel.dbnode); bytes.put_u32(resp.req.rel.relnode); bytes.put_u8(resp.req.rel.forknum); bytes.put_u32(resp.n_blocks); } Self::GetPage(resp) => { bytes.put_u8(Tag::GetPage as u8); bytes.put_u64(resp.req.hdr.reqid); bytes.put_u64(resp.req.hdr.request_lsn.0); bytes.put_u64(resp.req.hdr.not_modified_since.0); bytes.put_u32(resp.req.rel.spcnode); bytes.put_u32(resp.req.rel.dbnode); bytes.put_u32(resp.req.rel.relnode); bytes.put_u8(resp.req.rel.forknum); bytes.put_u32(resp.req.blkno); bytes.put(&resp.page[..]) } Self::Error(resp) => { bytes.put_u8(Tag::Error as u8); bytes.put_u64(resp.req.reqid); bytes.put_u64(resp.req.request_lsn.0); bytes.put_u64(resp.req.not_modified_since.0); bytes.put(resp.message.as_bytes()); bytes.put_u8(0); // null terminator } Self::DbSize(resp) => { bytes.put_u8(Tag::DbSize as u8); bytes.put_u64(resp.req.hdr.reqid); bytes.put_u64(resp.req.hdr.request_lsn.0); bytes.put_u64(resp.req.hdr.not_modified_since.0); bytes.put_u32(resp.req.dbnode); bytes.put_i64(resp.db_size); } Self::GetSlruSegment(resp) => { bytes.put_u8(Tag::GetSlruSegment as u8); bytes.put_u64(resp.req.hdr.reqid); bytes.put_u64(resp.req.hdr.request_lsn.0); bytes.put_u64(resp.req.hdr.not_modified_since.0); bytes.put_u8(resp.req.kind); bytes.put_u32(resp.req.segno); bytes.put_u32((resp.segment.len() / BLCKSZ) as u32); bytes.put(&resp.segment[..]); } #[cfg(feature = "testing")] Self::Test(resp) => { bytes.put_u8(Tag::Test as u8); bytes.put_u64(resp.req.hdr.reqid); bytes.put_u64(resp.req.hdr.request_lsn.0); bytes.put_u64(resp.req.hdr.not_modified_since.0); bytes.put_u64(resp.req.batch_key); let message = resp.req.message.as_bytes(); bytes.put_u64(message.len() as u64); bytes.put_slice(message); } } } } bytes.into() } pub fn deserialize(buf: Bytes) -> anyhow::Result { let mut buf = buf.reader(); let 
msg_tag = buf.read_u8()?; use PagestreamBeMessageTag as Tag; let ok = match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? { Tag::Exists => { let reqid = buf.read_u64::()?; let request_lsn = Lsn(buf.read_u64::()?); let not_modified_since = Lsn(buf.read_u64::()?); let rel = RelTag { spcnode: buf.read_u32::()?, dbnode: buf.read_u32::()?, relnode: buf.read_u32::()?, forknum: buf.read_u8()?, }; let exists = buf.read_u8()? != 0; Self::Exists(PagestreamExistsResponse { req: PagestreamExistsRequest { hdr: PagestreamRequest { reqid, request_lsn, not_modified_since, }, rel, }, exists, }) } Tag::Nblocks => { let reqid = buf.read_u64::()?; let request_lsn = Lsn(buf.read_u64::()?); let not_modified_since = Lsn(buf.read_u64::()?); let rel = RelTag { spcnode: buf.read_u32::()?, dbnode: buf.read_u32::()?, relnode: buf.read_u32::()?, forknum: buf.read_u8()?, }; let n_blocks = buf.read_u32::()?; Self::Nblocks(PagestreamNblocksResponse { req: PagestreamNblocksRequest { hdr: PagestreamRequest { reqid, request_lsn, not_modified_since, }, rel, }, n_blocks, }) } Tag::GetPage => { let reqid = buf.read_u64::()?; let request_lsn = Lsn(buf.read_u64::()?); let not_modified_since = Lsn(buf.read_u64::()?); let rel = RelTag { spcnode: buf.read_u32::()?, dbnode: buf.read_u32::()?, relnode: buf.read_u32::()?, forknum: buf.read_u8()?, }; let blkno = buf.read_u32::()?; let mut page = vec![0; 8192]; // TODO: use MaybeUninit buf.read_exact(&mut page)?; Self::GetPage(PagestreamGetPageResponse { req: PagestreamGetPageRequest { hdr: PagestreamRequest { reqid, request_lsn, not_modified_since, }, rel, blkno, }, page: page.into(), }) } Tag::Error => { let reqid = buf.read_u64::()?; let request_lsn = Lsn(buf.read_u64::()?); let not_modified_since = Lsn(buf.read_u64::()?); let mut msg = Vec::new(); buf.read_until(0, &mut msg)?; let cstring = std::ffi::CString::from_vec_with_nul(msg)?; let rust_str = cstring.to_str()?; Self::Error(PagestreamErrorResponse { req: 
PagestreamRequest { reqid, request_lsn, not_modified_since, }, message: rust_str.to_owned(), }) } Tag::DbSize => { let reqid = buf.read_u64::()?; let request_lsn = Lsn(buf.read_u64::()?); let not_modified_since = Lsn(buf.read_u64::()?); let dbnode = buf.read_u32::()?; let db_size = buf.read_i64::()?; Self::DbSize(PagestreamDbSizeResponse { req: PagestreamDbSizeRequest { hdr: PagestreamRequest { reqid, request_lsn, not_modified_since, }, dbnode, }, db_size, }) } Tag::GetSlruSegment => { let reqid = buf.read_u64::()?; let request_lsn = Lsn(buf.read_u64::()?); let not_modified_since = Lsn(buf.read_u64::()?); let kind = buf.read_u8()?; let segno = buf.read_u32::()?; let n_blocks = buf.read_u32::()?; let mut segment = vec![0; n_blocks as usize * BLCKSZ]; buf.read_exact(&mut segment)?; Self::GetSlruSegment(PagestreamGetSlruSegmentResponse { req: PagestreamGetSlruSegmentRequest { hdr: PagestreamRequest { reqid, request_lsn, not_modified_since, }, kind, segno, }, segment: segment.into(), }) } #[cfg(feature = "testing")] Tag::Test => { let reqid = buf.read_u64::()?; let request_lsn = Lsn(buf.read_u64::()?); let not_modified_since = Lsn(buf.read_u64::()?); let batch_key = buf.read_u64::()?; let len = buf.read_u64::()?; let mut msg = vec![0; len as usize]; buf.read_exact(&mut msg)?; let message = String::from_utf8(msg)?; Self::Test(PagestreamTestResponse { req: PagestreamTestRequest { hdr: PagestreamRequest { reqid, request_lsn, not_modified_since, }, batch_key, message, }, }) } }; let remaining = buf.into_inner(); if !remaining.is_empty() { anyhow::bail!( "remaining bytes in msg with tag={msg_tag}: {}", remaining.len() ); } Ok(ok) } pub fn kind(&self) -> &'static str { match self { Self::Exists(_) => "Exists", Self::Nblocks(_) => "Nblocks", Self::GetPage(_) => "GetPage", Self::Error(_) => "Error", Self::DbSize(_) => "DbSize", Self::GetSlruSegment(_) => "GetSlruSegment", #[cfg(feature = "testing")] Self::Test(_) => "Test", } } } #[cfg(test)] mod tests { use super::*; #[test] 
fn test_pagestream() { // Test serialization/deserialization of PagestreamFeMessage let messages = vec![ PagestreamFeMessage::Exists(PagestreamExistsRequest { hdr: PagestreamRequest { reqid: 0, request_lsn: Lsn(4), not_modified_since: Lsn(3), }, rel: RelTag { forknum: 1, spcnode: 2, dbnode: 3, relnode: 4, }, }), PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { hdr: PagestreamRequest { reqid: 0, request_lsn: Lsn(4), not_modified_since: Lsn(4), }, rel: RelTag { forknum: 1, spcnode: 2, dbnode: 3, relnode: 4, }, }), PagestreamFeMessage::GetPage(PagestreamGetPageRequest { hdr: PagestreamRequest { reqid: 0, request_lsn: Lsn(4), not_modified_since: Lsn(3), }, rel: RelTag { forknum: 1, spcnode: 2, dbnode: 3, relnode: 4, }, blkno: 7, }), PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { hdr: PagestreamRequest { reqid: 0, request_lsn: Lsn(4), not_modified_since: Lsn(3), }, dbnode: 7, }), ]; for msg in messages { let bytes = msg.serialize(); let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V3) .unwrap(); assert!(msg == reconstructed); } } } ================================================ FILE: libs/pageserver_api/src/reltag.rs ================================================ use std::cmp::Ordering; use std::fmt; use postgres_ffi_types::Oid; use postgres_ffi_types::constants::GLOBALTABLESPACE_OID; use postgres_ffi_types::forknum::{MAIN_FORKNUM, forkname_to_number, forknumber_to_name}; use serde::{Deserialize, Serialize}; /// /// Relation data file segment id throughout the Postgres cluster. /// /// Every data file in Postgres is uniquely identified by 4 numbers: /// - relation id / node (`relnode`) /// - database id (`dbnode`) /// - tablespace id (`spcnode`), in short this is a unique id of a separate /// directory to store data files. /// - forknumber (`forknum`) is used to split different kinds of data of the same relation /// between some set of files (`relnode`, `relnode_fsm`, `relnode_vm`). 
/// /// In native Postgres code `RelFileNode` structure and individual `ForkNumber` value /// are used for the same purpose. /// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57). /// // FIXME: should move 'forknum' as last field to keep this consistent with Postgres. // Then we could replace the custom Ord and PartialOrd implementations below with // deriving them. This will require changes in walredoproc.c. #[derive(Debug, Default, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)] pub struct RelTag { pub forknum: u8, pub spcnode: Oid, pub dbnode: Oid, pub relnode: Oid, } /// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type. pub type BlockNumber = u32; impl PartialOrd for RelTag { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl Ord for RelTag { fn cmp(&self, other: &Self) -> Ordering { // Custom ordering where we put forknum to the end of the list let other_tup = (other.spcnode, other.dbnode, other.relnode, other.forknum); (self.spcnode, self.dbnode, self.relnode, self.forknum).cmp(&other_tup) } } /// Display RelTag in the same format that's used in most PostgreSQL debug messages: /// /// ```text /// //[_fsm|_vm|_init] /// ``` impl fmt::Display for RelTag { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if let Some(forkname) = forknumber_to_name(self.forknum) { write!( f, "{}/{}/{}_{}", self.spcnode, self.dbnode, self.relnode, forkname ) } else { write!(f, "{}/{}/{}", self.spcnode, self.dbnode, self.relnode) } } } #[derive(Debug, thiserror::Error)] pub enum ParseRelTagError { #[error("invalid forknum")] InvalidForknum(#[source] std::num::ParseIntError), #[error("missing triplet member {}", .0)] MissingTripletMember(usize), #[error("invalid triplet member {}", .0)] InvalidTripletMember(usize, #[source] std::num::ParseIntError), } impl std::str::FromStr for RelTag { type Err = 
ParseRelTagError; fn from_str(s: &str) -> Result { use ParseRelTagError::*; // FIXME: in postgres logs this separator is dot // Example: // could not read block 2 in rel 1663/208101/2620.1 from page server at lsn 0/2431E6F0 // with a regex we could get this more painlessly let (triplet, forknum) = match s.split_once('_').or_else(|| s.split_once('.')) { Some((t, f)) => { let forknum = forkname_to_number(Some(f)); let forknum = if let Ok(f) = forknum { f } else { f.parse::().map_err(InvalidForknum)? }; (t, Some(forknum)) } None => (s, None), }; let mut split = triplet .splitn(3, '/') .enumerate() .map(|(i, s)| s.parse::().map_err(|e| InvalidTripletMember(i, e))); let spcnode = split.next().ok_or(MissingTripletMember(0))??; let dbnode = split.next().ok_or(MissingTripletMember(1))??; let relnode = split.next().ok_or(MissingTripletMember(2))??; Ok(RelTag { spcnode, forknum: forknum.unwrap_or(MAIN_FORKNUM), dbnode, relnode, }) } } impl RelTag { pub fn to_segfile_name(&self, segno: u32) -> String { let mut name = if self.spcnode == GLOBALTABLESPACE_OID { "global/".to_string() } else { format!("base/{}/", self.dbnode) }; name += &self.relnode.to_string(); if let Some(fork_name) = forknumber_to_name(self.forknum) { name += "_"; name += fork_name; } if segno != 0 { name += "."; name += &segno.to_string(); } name } pub fn with_forknum(&self, forknum: u8) -> Self { RelTag { forknum, spcnode: self.spcnode, dbnode: self.dbnode, relnode: self.relnode, } } } /// /// Non-relation transaction status files (clog (a.k.a. pg_xact) and /// pg_multixact) in Postgres are handled by SLRU (Simple LRU) buffer, /// hence the name. /// /// These files are global for a postgres instance. /// /// These files are divided into segments, which are divided into /// pages of the same BLCKSZ as used for relation files. 
///
#[derive(
    Debug,
    Clone,
    Copy,
    Hash,
    Serialize,
    Deserialize,
    PartialEq,
    Eq,
    PartialOrd,
    Ord,
    strum_macros::EnumIter,
    strum_macros::FromRepr,
    enum_map::Enum,
)]
#[repr(u8)]
pub enum SlruKind {
    /// Displayed as `pg_xact`.
    Clog = 0,
    /// Displayed as `pg_multixact/members`.
    MultiXactMembers,
    /// Displayed as `pg_multixact/offsets`.
    MultiXactOffsets,
}

/// Formats the kind as the directory name used for it (see the variant docs).
impl fmt::Display for SlruKind {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Clog => write!(f, "pg_xact"),
            Self::MultiXactMembers => write!(f, "pg_multixact/members"),
            Self::MultiXactOffsets => write!(f, "pg_multixact/offsets"),
        }
    }
}

================================================
FILE: libs/pageserver_api/src/shard.rs
================================================
//! See docs/rfcs/031-sharding-static.md for an overview of sharding.
//!
//! This module contains a variety of types used to represent the concept of sharding
//! a Neon tenant across multiple physical shards. Since there are quite a few of these,
//! we provide a summary here.
//!
//! Types used to describe shards:
//! - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
//!   which identifies a tenant which is not shard-aware. This means its storage paths do not include
//!   a shard suffix.
//! - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
//! - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
//!   without the tenant ID. This is useful for things that are implicitly scoped to a particular
//!   tenant, such as layer files.
//! - [`ShardIdentity`] is the full description of a particular shard's parameters, in sufficient
//!   detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
//! - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
//!   four hex digits. An unsharded tenant is `0000`.
//! - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
//!
//! Types used to describe the parameters for data distribution in a sharded tenant:
//! - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
//!   multiple shards. Its value is given in 8kiB pages.
//! - [`ShardLayout`] describes the data distribution scheme. At time of writing usable
//!   identities always carry `LAYOUT_V1` (value 1), with value 255 reserved to mark a broken
//!   identity: this is provided for future upgrades that might introduce different
//!   data distribution schemes.
//!
//! Examples:
//! - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
//! - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
//!   and their slugs are 0004, 0104, 0204, and 0304.

use std::hash::{Hash, Hasher};

#[doc(inline)]
pub use ::utils::shard::*;
use postgres_ffi_types::forknum::INIT_FORKNUM;
use serde::{Deserialize, Serialize};
use utils::critical;

use crate::key::Key;
use crate::models::ShardParameters;

/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
/// and to check whether that [`ShardNumber`] is the same as the current shard.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
    /// Zero-based index of this shard within the tenant.
    pub number: ShardNumber,
    /// Number of shards in the tenant; 0 is the legacy "unsharded" marker.
    pub count: ShardCount,
    /// Length of a stripe, in pages.
    pub stripe_size: ShardStripeSize,
    // Private: only set through the constructors in this module.
    layout: ShardLayout,
}

/// Hash implementation
///
/// The stripe size cannot change dynamically, so it can be ignored for efficiency reasons.
impl Hash for ShardIdentity { fn hash(&self, state: &mut H) { let ShardIdentity { number, count, stripe_size: _, layout: _, } = self; number.0.hash(state); count.0.hash(state); } } /// Layout version: for future upgrades where we might change how the key->shard mapping works #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)] pub struct ShardLayout(u8); const LAYOUT_V1: ShardLayout = ShardLayout(1); /// ShardIdentity uses a magic layout value to indicate if it is unusable const LAYOUT_BROKEN: ShardLayout = ShardLayout(255); /// The default stripe size in pages. 16 MiB divided by 8 kiB page size. /// /// A lower stripe size distributes ingest load better across shards, but reduces IO amortization. /// 16 MiB appears to be a reasonable balance: . pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(16 * 1024 / 8); #[derive(thiserror::Error, Debug, PartialEq, Eq)] pub enum ShardConfigError { #[error("Invalid shard count")] InvalidCount, #[error("Invalid shard number")] InvalidNumber, #[error("Invalid stripe size")] InvalidStripeSize, } impl ShardIdentity { /// An identity with number=0 count=0 is a "none" identity, which represents legacy /// tenants. Modern single-shard tenants should not use this: they should /// have number=0 count=1. pub const fn unsharded() -> Self { Self { number: ShardNumber(0), count: ShardCount(0), layout: LAYOUT_V1, stripe_size: DEFAULT_STRIPE_SIZE, } } /// An unsharded identity with the given stripe size (if non-zero). This is typically used to /// carry over a stripe size for an unsharded tenant from persistent storage. pub fn unsharded_with_stripe_size(stripe_size: ShardStripeSize) -> Self { let mut shard_identity = Self::unsharded(); if stripe_size.0 > 0 { shard_identity.stripe_size = stripe_size; } shard_identity } /// A broken instance of this type is only used for `TenantState::Broken` tenants, /// which are constructed in code paths that don't have access to proper configuration. 
/// /// A ShardIdentity in this state may not be used for anything, and should not be persisted. /// Enforcement is via assertions, to avoid making our interface fallible for this /// edge case: it is the Tenant's responsibility to avoid trying to do any I/O when in a broken /// state, and by extension to avoid trying to do any page->shard resolution. pub fn broken(number: ShardNumber, count: ShardCount) -> Self { Self { number, count, layout: LAYOUT_BROKEN, stripe_size: DEFAULT_STRIPE_SIZE, } } /// The "unsharded" value is distinct from simply having a single shard: it represents /// a tenant which is not shard-aware at all, and whose storage paths will not include /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.number == ShardNumber(0) && self.count == ShardCount(0) } /// Count must be nonzero, and number must be < count. To construct /// the legacy case (count==0), use Self::unsharded instead. pub fn new( number: ShardNumber, count: ShardCount, stripe_size: ShardStripeSize, ) -> Result { if count.0 == 0 { Err(ShardConfigError::InvalidCount) } else if number.0 > count.0 - 1 { Err(ShardConfigError::InvalidNumber) } else if stripe_size.0 == 0 { Err(ShardConfigError::InvalidStripeSize) } else { Ok(Self { number, count, layout: LAYOUT_V1, stripe_size, }) } } /// For use when creating ShardIdentity instances for new shards, where a creation request /// specifies the ShardParameters that apply to all shards. pub fn from_params(number: ShardNumber, params: ShardParameters) -> Self { Self { number, count: params.count, layout: LAYOUT_V1, stripe_size: params.stripe_size, } } /// Asserts that the given shard identities are equal. Changes to shard parameters will likely /// result in data corruption. pub fn assert_equal(&self, other: ShardIdentity) { if self != &other { // TODO: for now, we're conservative and just log errors in production. 
Turn this into a // real assertion when we're confident it doesn't misfire, and also reject requests that // attempt to change it with an error response. critical!("shard identity mismatch: {self:?} != {other:?}"); } } fn is_broken(&self) -> bool { self.layout == LAYOUT_BROKEN } pub fn get_shard_number(&self, key: &Key) -> ShardNumber { assert!(!self.is_broken()); key_to_shard_number(self.count, self.stripe_size, key) } /// Return true if the key is stored only on this shard. This does not include /// global keys, see is_key_global(). /// /// Shards must ingest _at least_ keys which return true from this check. pub fn is_key_local(&self, key: &Key) -> bool { assert!(!self.is_broken()); if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) { true } else { key_to_shard_number(self.count, self.stripe_size, key) == self.number } } /// Return true if the key should be stored on all shards, not just one. pub fn is_key_global(&self, key: &Key) -> bool { if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() || key.is_slru_dir_key() { // Special keys that are only stored on shard 0 false } else if key.is_rel_block_key() { // Ordinary relation blocks are distributed across shards false } else if key.is_rel_size_key() { // All shards maintain rel size keys (although only shard 0 is responsible for // keeping it strictly accurate, other shards just reflect the highest block they've ingested) true } else { // For everything else, we assume it must be kept everywhere, because ingest code // might assume this -- this covers functionality where the ingest code has // not (yet) been made fully shard aware. true } } /// Return true if the key should be discarded if found in this shard's /// data store, e.g. during compaction after a split. /// /// Shards _may_ drop keys which return false here, but are not obliged to. 
pub fn is_key_disposable(&self, key: &Key) -> bool { if self.count < ShardCount(2) { // Fast path: unsharded tenant doesn't dispose of anything return false; } if self.is_key_global(key) { false } else { !self.is_key_local(key) } } /// Obtains the shard number and count combined into a `ShardIndex`. pub fn shard_index(&self) -> ShardIndex { ShardIndex { shard_count: self.count, shard_number: self.number, } } pub fn shard_slug(&self) -> String { if self.count > ShardCount(0) { format!("-{:02x}{:02x}", self.number.0, self.count.0) } else { String::new() } } /// Convenience for checking if this identity is the 0th shard in a tenant, /// for special cases on shard 0 such as ingesting relation sizes. pub fn is_shard_zero(&self) -> bool { self.number == ShardNumber(0) } } /// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys /// in order to be able to serve basebackup requests without peer communication). fn key_is_shard0(key: &Key) -> bool { // To decide what to shard out to shards >0, we apply a simple rule that only // relation pages are distributed to shards other than shard zero. Everything else gets // stored on shard 0. This guarantees that shard 0 can independently serve basebackup // requests, and any request other than those for particular blocks in relations. // // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0 // because they must be included in basebackups. 
let is_initfork = key.field5 == INIT_FORKNUM; !key.is_rel_block_key() || is_initfork } /// Provide the same result as the function in postgres `hashfn.h` with the same name fn murmurhash32(mut h: u32) -> u32 { h ^= h >> 16; h = h.wrapping_mul(0x85ebca6b); h ^= h >> 13; h = h.wrapping_mul(0xc2b2ae35); h ^= h >> 16; h } /// Provide the same result as the function in postgres `hashfn.h` with the same name fn hash_combine(mut a: u32, mut b: u32) -> u32 { b = b.wrapping_add(0x9e3779b9); b = b.wrapping_add(a << 6); b = b.wrapping_add(a >> 2); a ^= b; a } /// Where a Key is to be distributed across shards, select the shard. This function /// does not account for keys that should be broadcast across shards. /// /// The hashing in this function must exactly match what we do in postgres smgr /// code. The resulting distribution of pages is intended to preserve locality within /// `stripe_size` ranges of contiguous block numbers in the same relation, while otherwise /// distributing data pseudo-randomly. /// /// The mapping of key to shard is not stable across changes to ShardCount: this is intentional /// and will be handled at higher levels when shards are split. pub fn key_to_shard_number( count: ShardCount, stripe_size: ShardStripeSize, key: &Key, ) -> ShardNumber { // Fast path for un-sharded tenants or broadcast keys if count < ShardCount(2) || key_is_shard0(key) { return ShardNumber(0); } // relNode let mut hash = murmurhash32(key.field4); // blockNum/stripe size hash = hash_combine(hash, murmurhash32(key.field6 / stripe_size.0)); ShardNumber((hash % count.0 as u32) as u8) } /// For debugging, while not exposing the internals. 
#[derive(Debug)] #[allow(unused)] // used by debug formatting by pagectl struct KeyShardingInfo { shard0: bool, shard_number: ShardNumber, } pub fn describe( key: &Key, shard_count: ShardCount, stripe_size: ShardStripeSize, ) -> impl std::fmt::Debug { KeyShardingInfo { shard0: key_is_shard0(key), shard_number: key_to_shard_number(shard_count, stripe_size, key), } } #[cfg(test)] mod tests { use std::str::FromStr; use utils::Hex; use utils::id::TenantId; use super::*; const EXAMPLE_TENANT_ID: &str = "1f359dd625e519a1a4e8d7509690f6fc"; #[test] fn tenant_shard_id_string() -> Result<(), hex::FromHexError> { let example = TenantShardId { tenant_id: TenantId::from_str(EXAMPLE_TENANT_ID).unwrap(), shard_count: ShardCount(10), shard_number: ShardNumber(7), }; let encoded = format!("{example}"); let expected = format!("{EXAMPLE_TENANT_ID}-070a"); assert_eq!(&encoded, &expected); let decoded = TenantShardId::from_str(&encoded)?; assert_eq!(example, decoded); Ok(()) } #[test] fn tenant_shard_id_binary() -> Result<(), hex::FromHexError> { let example = TenantShardId { tenant_id: TenantId::from_str(EXAMPLE_TENANT_ID).unwrap(), shard_count: ShardCount(10), shard_number: ShardNumber(7), }; let encoded = bincode::serialize(&example).unwrap(); let expected: [u8; 18] = [ 0x1f, 0x35, 0x9d, 0xd6, 0x25, 0xe5, 0x19, 0xa1, 0xa4, 0xe8, 0xd7, 0x50, 0x96, 0x90, 0xf6, 0xfc, 0x07, 0x0a, ]; assert_eq!(Hex(&encoded), Hex(&expected)); let decoded = bincode::deserialize(&encoded).unwrap(); assert_eq!(example, decoded); Ok(()) } #[test] fn tenant_shard_id_backward_compat() -> Result<(), hex::FromHexError> { // Test that TenantShardId can decode a TenantId in human // readable form let example = TenantId::from_str(EXAMPLE_TENANT_ID).unwrap(); let encoded = format!("{example}"); assert_eq!(&encoded, EXAMPLE_TENANT_ID); let decoded = TenantShardId::from_str(&encoded)?; assert_eq!(example, decoded.tenant_id); assert_eq!(decoded.shard_count, ShardCount(0)); assert_eq!(decoded.shard_number, 
ShardNumber(0)); Ok(()) } #[test] fn tenant_shard_id_forward_compat() -> Result<(), hex::FromHexError> { // Test that a legacy TenantShardId encodes into a form that // can be decoded as TenantId let example_tenant_id = TenantId::from_str(EXAMPLE_TENANT_ID).unwrap(); let example = TenantShardId::unsharded(example_tenant_id); let encoded = format!("{example}"); assert_eq!(&encoded, EXAMPLE_TENANT_ID); let decoded = TenantId::from_str(&encoded)?; assert_eq!(example_tenant_id, decoded); Ok(()) } #[test] fn tenant_shard_id_legacy_binary() -> Result<(), hex::FromHexError> { // Unlike in human readable encoding, binary encoding does not // do any special handling of legacy unsharded TenantIds: this test // is equivalent to the main test for binary encoding, just verifying // that the same behavior applies when we have used `unsharded()` to // construct a TenantShardId. let example = TenantShardId::unsharded(TenantId::from_str(EXAMPLE_TENANT_ID).unwrap()); let encoded = bincode::serialize(&example).unwrap(); let expected: [u8; 18] = [ 0x1f, 0x35, 0x9d, 0xd6, 0x25, 0xe5, 0x19, 0xa1, 0xa4, 0xe8, 0xd7, 0x50, 0x96, 0x90, 0xf6, 0xfc, 0x00, 0x00, ]; assert_eq!(Hex(&encoded), Hex(&expected)); let decoded = bincode::deserialize::(&encoded).unwrap(); assert_eq!(example, decoded); Ok(()) } #[test] fn shard_identity_validation() -> Result<(), ShardConfigError> { // Happy cases ShardIdentity::new(ShardNumber(0), ShardCount(1), DEFAULT_STRIPE_SIZE)?; ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(1))?; ShardIdentity::new(ShardNumber(254), ShardCount(255), ShardStripeSize(1))?; assert_eq!( ShardIdentity::new(ShardNumber(0), ShardCount(0), DEFAULT_STRIPE_SIZE), Err(ShardConfigError::InvalidCount) ); assert_eq!( ShardIdentity::new(ShardNumber(10), ShardCount(10), DEFAULT_STRIPE_SIZE), Err(ShardConfigError::InvalidNumber) ); assert_eq!( ShardIdentity::new(ShardNumber(11), ShardCount(10), DEFAULT_STRIPE_SIZE), Err(ShardConfigError::InvalidNumber) ); assert_eq!( 
ShardIdentity::new(ShardNumber(255), ShardCount(255), DEFAULT_STRIPE_SIZE), Err(ShardConfigError::InvalidNumber) ); assert_eq!( ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(0)), Err(ShardConfigError::InvalidStripeSize) ); Ok(()) } #[test] fn shard_index_human_encoding() -> Result<(), hex::FromHexError> { let example = ShardIndex { shard_number: ShardNumber(13), shard_count: ShardCount(17), }; let expected: String = "0d11".to_string(); let encoded = format!("{example}"); assert_eq!(&encoded, &expected); let decoded = ShardIndex::from_str(&encoded)?; assert_eq!(example, decoded); Ok(()) } #[test] fn shard_index_binary_encoding() -> Result<(), hex::FromHexError> { let example = ShardIndex { shard_number: ShardNumber(13), shard_count: ShardCount(17), }; let expected: [u8; 2] = [0x0d, 0x11]; let encoded = bincode::serialize(&example).unwrap(); assert_eq!(Hex(&encoded), Hex(&expected)); let decoded = bincode::deserialize(&encoded).unwrap(); assert_eq!(example, decoded); Ok(()) } // These are only smoke tests to spot check that our implementation doesn't // deviate from a few examples values: not aiming to validate the overall // hashing algorithm. 
#[test] fn murmur_hash() { assert_eq!(murmurhash32(0), 0); assert_eq!(hash_combine(0xb1ff3b40, 0), 0xfb7923c9); } #[test] fn shard_mapping() { let key = Key { field1: 0x00, field2: 0x67f, field3: 0x5, field4: 0x400c, field5: 0x00, field6: 0x7d06, }; let shard = key_to_shard_number(ShardCount(10), ShardStripeSize(32768), &key); assert_eq!(shard, ShardNumber(8)); } #[test] fn shard_id_split() { let tenant_id = TenantId::generate(); let parent = TenantShardId::unsharded(tenant_id); // Unsharded into 2 assert_eq!( parent.split(ShardCount(2)), vec![ TenantShardId { tenant_id, shard_count: ShardCount(2), shard_number: ShardNumber(0) }, TenantShardId { tenant_id, shard_count: ShardCount(2), shard_number: ShardNumber(1) } ] ); // Unsharded into 4 assert_eq!( parent.split(ShardCount(4)), vec![ TenantShardId { tenant_id, shard_count: ShardCount(4), shard_number: ShardNumber(0) }, TenantShardId { tenant_id, shard_count: ShardCount(4), shard_number: ShardNumber(1) }, TenantShardId { tenant_id, shard_count: ShardCount(4), shard_number: ShardNumber(2) }, TenantShardId { tenant_id, shard_count: ShardCount(4), shard_number: ShardNumber(3) } ] ); // count=1 into 2 (check this works the same as unsharded.) 
let parent = TenantShardId { tenant_id, shard_count: ShardCount(1), shard_number: ShardNumber(0), }; assert_eq!( parent.split(ShardCount(2)), vec![ TenantShardId { tenant_id, shard_count: ShardCount(2), shard_number: ShardNumber(0) }, TenantShardId { tenant_id, shard_count: ShardCount(2), shard_number: ShardNumber(1) } ] ); // count=2 into count=8 let parent = TenantShardId { tenant_id, shard_count: ShardCount(2), shard_number: ShardNumber(1), }; assert_eq!( parent.split(ShardCount(8)), vec![ TenantShardId { tenant_id, shard_count: ShardCount(8), shard_number: ShardNumber(1) }, TenantShardId { tenant_id, shard_count: ShardCount(8), shard_number: ShardNumber(3) }, TenantShardId { tenant_id, shard_count: ShardCount(8), shard_number: ShardNumber(5) }, TenantShardId { tenant_id, shard_count: ShardCount(8), shard_number: ShardNumber(7) }, ] ); } } ================================================ FILE: libs/pageserver_api/src/upcall_api.rs ================================================ //! Types in this file are for pageserver's upward-facing API calls to the storage controller, //! required for acquiring and validating tenant generation numbers. //! //! See docs/rfcs/025-generation-numbers.md use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use crate::controller_api::NodeRegisterRequest; use crate::models::{LocationConfigMode, ShardImportStatus}; use crate::shard::{ShardStripeSize, TenantShardId}; /// Upcall message sent by the pageserver to the configured `control_plane_api` on /// startup. #[derive(Serialize, Deserialize)] pub struct ReAttachRequest { pub node_id: NodeId, /// Optional inline self-registration: this is useful with the storage controller, /// if the node already has a node_id set. #[serde(skip_serializing_if = "Option::is_none", default)] pub register: Option, /// Hadron: Optional flag to indicate whether the node is starting with an empty local disk. 
/// Will be set to true if the node couldn't find any local tenant data on startup, could be /// due to the node starting for the first time or due to a local SSD failure/disk wipe event. /// The flag may be used by the storage controller to update its observed state of the world /// to make sure that it sends explicit location_config calls to the node following the /// re-attach request. pub empty_local_disk: Option, } #[derive(Serialize, Deserialize, Debug)] pub struct ReAttachResponseTenant { pub id: TenantShardId, /// Mandatory if LocationConfigMode is None or set to an Attached* mode pub r#gen: Option, pub mode: LocationConfigMode, pub stripe_size: ShardStripeSize, } #[derive(Serialize, Deserialize)] pub struct ReAttachResponse { pub tenants: Vec, } #[derive(Serialize, Deserialize)] pub struct ValidateRequestTenant { pub id: TenantShardId, pub r#gen: u32, } #[derive(Serialize, Deserialize)] pub struct ValidateRequest { pub tenants: Vec, } #[derive(Serialize, Deserialize)] pub struct ValidateResponse { pub tenants: Vec, } #[derive(Serialize, Deserialize)] pub struct ValidateResponseTenant { pub id: TenantShardId, pub valid: bool, } #[derive(Serialize, Deserialize)] pub struct TimelineImportStatusRequest { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, pub generation: Generation, } #[derive(Serialize, Deserialize)] pub struct PutTimelineImportStatusRequest { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, pub status: ShardImportStatus, pub generation: Generation, } ================================================ FILE: libs/postgres_backend/Cargo.toml ================================================ [package] name = "postgres_backend" version = "0.1.0" edition.workspace = true license.workspace = true [dependencies] anyhow.workspace = true bytes.workspace = true rustls.workspace = true serde.workspace = true thiserror.workspace = true tokio.workspace = true tokio-util.workspace = true tokio-rustls.workspace = true 
tracing.workspace = true pq_proto.workspace = true [dev-dependencies] once_cell.workspace = true rustls-pemfile.workspace = true tokio-postgres.workspace = true tokio-postgres-rustls.workspace = true ================================================ FILE: libs/postgres_backend/src/lib.rs ================================================ //! Server-side asynchronous Postgres connection, as limited as we need. //! To use, create PostgresBackend and run() it, passing the Handler //! implementation determining how to process the queries. Currently its API //! is rather narrow, but we can extend it once required. #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] use std::future::Future; use std::net::SocketAddr; use std::os::fd::{AsRawFd, RawFd}; use std::pin::Pin; use std::str::FromStr; use std::sync::Arc; use std::task::{Poll, ready}; use std::{fmt, io}; use anyhow::Context; use bytes::Bytes; use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter}; use pq_proto::{ BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_ADMIN_SHUTDOWN, SQLSTATE_INTERNAL_ERROR, SQLSTATE_SUCCESSFUL_COMPLETION, }; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_rustls::TlsAcceptor; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn}; /// An error, occurred during query processing: /// either during the connection ([`ConnectionError`]) or before/after it. #[derive(thiserror::Error, Debug)] pub enum QueryError { /// The connection was lost while processing the query. 
#[error(transparent)]
    Disconnected(#[from] ConnectionError),
    /// We were instructed to shutdown while processing the query
    #[error("Shutting down")]
    Shutdown,
    /// Query handler indicated that client should reconnect
    #[error("Server requested reconnect")]
    Reconnect,
    /// Query named an entity that was not found
    #[error("Not found: {0}")]
    NotFound(std::borrow::Cow<'static, str>),
    /// Authentication failure
    #[error("Unauthorized: {0}")]
    Unauthorized(std::borrow::Cow<'static, str>),
    #[error("Simulated Connection Error")]
    SimulatedConnectionError,
    /// Some other error
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

// NOTE(review): generic arguments appear to have been stripped from this listing (here and
// in the `Handler` trait below, e.g. `impl Future>`); restore them from the upstream file
// before compiling.
/// Wraps an I/O error as `Disconnected(ConnectionError::Io(..))`.
impl From for QueryError {
    fn from(e: io::Error) -> Self {
        Self::Disconnected(ConnectionError::Io(e))
    }
}

impl QueryError {
    /// Returns the five-byte SQLSTATE code reported to the client for this error.
    pub fn pg_error_code(&self) -> &'static [u8; 5] {
        match self {
            Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure
            Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
            // Not-found and unauthorized currently share the internal-error code.
            Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR,
            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
        }
    }
}

/// Returns true if the given error is a normal consequence of a network issue,
/// or the client closing the connection.
///
/// These errors can happen during normal operations,
/// and don't indicate a bug in our code.
pub fn is_expected_io_error(e: &io::Error) -> bool {
    use io::ErrorKind::*;
    matches!(
        e.kind(),
        HostUnreachable
            | NetworkUnreachable
            | BrokenPipe
            | ConnectionRefused
            | ConnectionAborted
            | ConnectionReset
            | TimedOut,
    )
}

pub trait Handler {
    /// Handle single query.
    /// postgres_backend will issue ReadyForQuery after calling this (this
    /// might be not what we want after CopyData streaming, but currently we don't
    /// care). It will also flush out the output buffer.
    fn process_query(
        &mut self,
        pgb: &mut PostgresBackend,
        query_string: &str,
    ) -> impl Future>;

    /// Called on startup packet receipt; allows the handler to process params.
    ///
    /// The default implementation accepts the packet and does nothing.
    ///
    /// NOTE(review): this comment used to say that returning `Ok(false)` makes
    /// postgres_backend skip auth (needed for new-user creation in the proxy code, and
    /// described as a hacky, ad-hoc solution that could be replaced by letting
    /// implementations override the whole init logic). The signature below returns
    /// `Result<(), QueryError>`, so `Ok(false)` cannot be returned; confirm whether a
    /// skip-auth path still exists.
    fn startup(
        &mut self,
        _pgb: &mut PostgresBackend,
        _sm: &FeStartupPacket,
    ) -> Result<(), QueryError> {
        Ok(())
    }

    /// Check auth jwt
    ///
    /// The default implementation rejects every token with a "JWT auth failed" error.
    fn check_auth_jwt(
        &mut self,
        _pgb: &mut PostgresBackend,
        _jwt_response: &[u8],
    ) -> Result<(), QueryError> {
        Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
    }
}

/// PostgresBackend protocol state.
/// XXX: The order of the constructors matters: `PartialOrd` is derived, so variants compare
/// in declaration order.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)]
pub enum ProtoState {
    /// Nothing happened yet.
    Initialization,
    /// Encryption handshake is done; waiting for encrypted Startup message.
    Encrypted,
    /// Waiting for password (auth token).
    Authentication,
    /// Performed handshake and auth, ReadyForQuery is issued.
    Established,
    Closed,
}

#[derive(Clone, Copy)]
pub enum ProcessMsgResult {
    Continue,
    Break,
}

/// Either plain TCP stream or encrypted one, implementing AsyncRead + AsyncWrite.
// NOTE(review): generic arguments appear to have been stripped from this listing (e.g.
// `Box>`, `Poll>`, `Framed>`); restore them from the upstream file before compiling.
pub enum MaybeTlsStream {
    Unencrypted(IO),
    Tls(Box>),
}

/// Forwards every write-side poll to whichever variant is active.
impl AsyncWrite for MaybeTlsStream {
    fn poll_write(
        self: Pin<&mut Self>,
        cx: &mut std::task::Context<'_>,
        buf: &[u8],
    ) -> Poll> {
        match self.get_mut() {
            Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf),
            Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf),
        }
    }
    fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll> {
        match self.get_mut() {
            Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx),
            Self::Tls(stream) => Pin::new(stream).poll_flush(cx),
        }
    }
    fn poll_shutdown(
        self: Pin<&mut Self>,
        cx: &mut std::task::Context<'_>,
    ) -> Poll> {
        match self.get_mut() {
            Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx),
            Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx),
        }
    }
}

/// Forwards reads to whichever variant is active.
impl AsyncRead for MaybeTlsStream {
    fn poll_read(
        self: Pin<&mut Self>,
        cx: &mut std::task::Context<'_>,
        buf: &mut tokio::io::ReadBuf<'_>,
    ) -> Poll> {
        match self.get_mut() {
            Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf),
            Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf),
        }
    }
}

#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
pub enum AuthType {
    Trust,
    // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT
    NeonJWT,
}

/// Parses the exact, case-sensitive names `Trust` and `NeonJWT`; anything else is an error.
impl FromStr for AuthType {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result {
        match s {
            "Trust" => Ok(Self::Trust),
            "NeonJWT" => Ok(Self::NeonJWT),
            _ => anyhow::bail!("invalid value \"{s}\" for auth type"),
        }
    }
}

/// Writes the same names that `FromStr` accepts.
impl fmt::Display for AuthType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(match self {
            AuthType::Trust => "Trust",
            AuthType::NeonJWT => "NeonJWT",
        })
    }
}

/// Either full duplex Framed or write only half; the latter is left in
/// PostgresBackend after call to `split`. In principle we could always store a
/// pair of split handles, but that would force us to pay the splitting price
/// (Arc and kinda mutex inside polling) for all uses (e.g. pageserver).
enum MaybeWriteOnly {
    Full(Framed>),
    WriteOnly(FramedWriter>),
    Broken, // temporary value palmed off during the split
}

impl MaybeWriteOnly {
    /// Reads a startup message from the full-duplex stream.
    ///
    /// Returns an I/O error on a write-only half and panics on `Broken`.
    async fn read_startup_message(&mut self) -> Result, ConnectionError> {
        match self {
            MaybeWriteOnly::Full(framed) => framed.read_startup_message().await,
            MaybeWriteOnly::WriteOnly(_) => {
                Err(io::Error::other("reading from write only half").into())
            }
            MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
        }
    }

    /// Reads a regular message from the full-duplex stream.
    ///
    /// Returns an I/O error on a write-only half and panics on `Broken`.
    async fn read_message(&mut self) -> Result, ConnectionError> {
        match self {
            MaybeWriteOnly::Full(framed) => framed.read_message().await,
            MaybeWriteOnly::WriteOnly(_) => {
                Err(io::Error::other("reading from write only half").into())
            }
            MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
        }
    }

    /// Queues `msg` on either kind of stream without flushing. Panics on `Broken`.
    fn write_message_noflush(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> {
        match self {
            MaybeWriteOnly::Full(framed) => framed.write_message(msg),
            MaybeWriteOnly::WriteOnly(framed_writer) => framed_writer.write_message_noflush(msg),
            MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
        }
    }

    /// Flushes whichever stream is held. Panics on `Broken`.
    async fn flush(&mut self) -> io::Result<()> {
        match self {
            MaybeWriteOnly::Full(framed) => framed.flush().await,
            MaybeWriteOnly::WriteOnly(framed_writer) => framed_writer.flush().await,
            MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
        }
    }

    /// Cancellation safe as long as the underlying IO is cancellation safe.
async fn shutdown(&mut self) -> io::Result<()> { match self { MaybeWriteOnly::Full(framed) => framed.shutdown().await, MaybeWriteOnly::WriteOnly(framed_writer) => framed_writer.shutdown().await, MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"), } } } pub struct PostgresBackend { pub socket_fd: RawFd, framed: MaybeWriteOnly, pub state: ProtoState, auth_type: AuthType, peer_addr: SocketAddr, pub tls_config: Option>, } pub type PostgresBackendTCP = PostgresBackend; /// Cast a byte slice to a string slice, dropping one trailing null terminator if /// present. Fails if the remaining bytes are not valid UTF-8. fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> { let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); std::str::from_utf8(without_null).map_err(|e| e.into()) } impl PostgresBackend { pub fn new( socket: tokio::net::TcpStream, auth_type: AuthType, tls_config: Option>, ) -> io::Result { let peer_addr = socket.peer_addr()?; let socket_fd = socket.as_raw_fd(); let stream = MaybeTlsStream::Unencrypted(socket); Ok(Self { socket_fd, framed: MaybeWriteOnly::Full(Framed::new(stream)), state: ProtoState::Initialization, auth_type, tls_config, peer_addr, }) } } impl PostgresBackend { pub fn new_from_io( socket_fd: RawFd, socket: IO, peer_addr: SocketAddr, auth_type: AuthType, tls_config: Option>, ) -> io::Result { let stream = MaybeTlsStream::Unencrypted(socket); Ok(Self { socket_fd, framed: MaybeWriteOnly::Full(Framed::new(stream)), state: ProtoState::Initialization, auth_type, tls_config, peer_addr, }) } pub fn get_peer_addr(&self) -> &SocketAddr { &self.peer_addr } /// Read full message or return None if connection is cleanly closed with no /// unprocessed data.
pub async fn read_message(&mut self) -> Result, ConnectionError> { if let ProtoState::Closed = self.state { Ok(None) } else { match self.framed.read_message().await { Ok(m) => { trace!("read msg {:?}", m); Ok(m) } Err(e) => { // remember not to try to read anymore: later calls return Ok(None) self.state = ProtoState::Closed; Err(e) } } } } /// Write message into internal output buffer, doesn't flush it. Technically /// error type can be only ProtocolError here (if, unlikely, serialization /// fails), but callers typically wrap it anyway. pub fn write_message_noflush( &mut self, message: &BeMessage<'_>, ) -> Result<&mut Self, ConnectionError> { self.framed.write_message_noflush(message)?; trace!("wrote msg {:?}", message); Ok(self) } /// Flush output buffer into the socket. pub async fn flush(&mut self) -> io::Result<()> { self.framed.flush().await } /// Polling version of `flush()`, saves the caller the need to pin. pub fn poll_flush( &mut self, cx: &mut std::task::Context<'_>, ) -> Poll> { let flush_fut = std::pin::pin!(self.flush()); flush_fut.poll(cx) } /// Write message into internal output buffer and flush it to the stream. pub async fn write_message( &mut self, message: &BeMessage<'_>, ) -> Result<&mut Self, ConnectionError> { self.write_message_noflush(message)?; self.flush().await?; Ok(self) } /// Returns an AsyncWrite implementation that wraps all the data written /// to it in CopyData messages, and writes them to the connection /// /// The caller is responsible for sending CopyOutResponse and CopyDone messages. pub fn copyout_writer(&mut self) -> CopyDataWriter { CopyDataWriter { pgb: self } } /// Wrapper for run_message_loop() that shuts down socket when we are done. /// Shutdown, Reconnect and Disconnected results are logged and turned into Ok(()). pub async fn run( mut self, handler: &mut impl Handler, cancel: &CancellationToken, ) -> Result<(), QueryError> { let ret = self.run_message_loop(handler, cancel).await; tokio::select! { _ = cancel.cancelled() => { // do nothing; we most likely got already stopped by shutdown and will log it next.
} _ = self.framed.shutdown() => { // socket might be already closed, e.g. if previously received error, // so ignore result. }, } match ret { Ok(()) => Ok(()), Err(QueryError::Shutdown) => { info!("Stopped due to shutdown"); Ok(()) } Err(QueryError::Reconnect) => { // Dropping out of this loop implicitly disconnects info!("Stopped due to handler reconnect request"); Ok(()) } Err(QueryError::Disconnected(e)) => { info!("Disconnected ({e:#})"); // Disconnection is not an error: we just use it that way internally to drop // out of loops. Ok(()) } e => e, } } async fn run_message_loop( &mut self, handler: &mut impl Handler, cancel: &CancellationToken, ) -> Result<(), QueryError> { trace!("postgres backend to {:?} started", self.peer_addr); tokio::select!( biased; _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received during handshake"); return Err(QueryError::Shutdown) }, handshake_r = self.handshake(handler) => { handshake_r?; } ); // Authentication completed let mut query_string = Bytes::new(); while let Some(msg) = tokio::select!( biased; _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received in run_message_loop"); return Err(QueryError::Shutdown) }, msg = self.read_message() => { msg }, )? { trace!("got message {:?}", msg); let result = self.process_message(handler, msg, &mut query_string).await; tokio::select!( biased; _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received during response flush"); // If we exited process_message with a shutdown error, there may be // some valid response content in our transmit buffer: permit sending // this within a short timeout. This is a best effort thing so we don't // care about the result. tokio::time::timeout(std::time::Duration::from_millis(500), self.flush()).await.ok(); return Err(QueryError::Shutdown) }, flush_r = self.flush() => { flush_r?; } ); match result?
{ ProcessMsgResult::Continue => { continue; } ProcessMsgResult::Break => break, } } trace!("postgres backend to {:?} exited", self.peer_addr); Ok(()) } /// Try to upgrade MaybeTlsStream into actual TLS one, performing handshake. async fn tls_upgrade( src: MaybeTlsStream, tls_config: Arc, ) -> anyhow::Result> { match src { MaybeTlsStream::Unencrypted(s) => { let acceptor = TlsAcceptor::from(tls_config); let tls_stream = acceptor.accept(s).await?; Ok(MaybeTlsStream::Tls(Box::new(tls_stream))) } MaybeTlsStream::Tls(_) => { anyhow::bail!("TLS already started"); } } } async fn start_tls(&mut self) -> anyhow::Result<()> { // temporarily replace stream with fake to cook TLS one, Indiana Jones style. // NB: if this returns an error, self.framed is left as Broken. match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) { MaybeWriteOnly::Full(framed) => { let tls_config = self .tls_config .as_ref() .context("start_tls called without conf")? .clone(); let tls_framed = framed .map_stream(|s| PostgresBackend::tls_upgrade(s, tls_config)) .await?; // push back ready TLS stream self.framed = MaybeWriteOnly::Full(tls_framed); Ok(()) } MaybeWriteOnly::WriteOnly(_) => { anyhow::bail!("TLS upgrade attempt in split state") } MaybeWriteOnly::Broken => panic!("TLS upgrade on framed in invalid state"), } } /// Split off owned read part from which messages can be read in different /// task/thread. pub fn split(&mut self) -> anyhow::Result> { // temporarily replace stream with fake to cook split one, Indiana Jones style. // NB: if this returns an error, self.framed is left as Broken. match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) { MaybeWriteOnly::Full(framed) => { let (reader, writer) = framed.split(); self.framed = MaybeWriteOnly::WriteOnly(writer); Ok(PostgresBackendReader { reader, closed: false, }) } MaybeWriteOnly::WriteOnly(_) => { anyhow::bail!("PostgresBackend is already split") } MaybeWriteOnly::Broken => panic!("split on framed in invalid state"), } } /// Join read part back.
pub fn unsplit(&mut self, reader: PostgresBackendReader) -> anyhow::Result<()> { // temporarily replace stream with fake to cook joined one, Indiana Jones style. // NB: if this returns an error, self.framed is left as Broken. match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) { MaybeWriteOnly::Full(_) => { anyhow::bail!("PostgresBackend is not split") } MaybeWriteOnly::WriteOnly(writer) => { let joined = Framed::unsplit(reader.reader, writer); self.framed = MaybeWriteOnly::Full(joined); // if reader encountered connection error, do not attempt reading anymore if reader.closed { self.state = ProtoState::Closed; } Ok(()) } MaybeWriteOnly::Broken => panic!("unsplit on framed in invalid state"), } } /// Perform handshake with the client, transitioning to Established. /// In case of EOF during handshake logs this, sets state to Closed and returns /// Err(QueryError::Disconnected(..)). async fn handshake(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> { while self.state < ProtoState::Authentication { match self.framed.read_startup_message().await? { Some(msg) => { self.process_startup_message(handler, msg).await?; } None => { trace!( "postgres backend to {:?} received EOF during handshake", self.peer_addr ); self.state = ProtoState::Closed; return Err(QueryError::Disconnected(ConnectionError::Protocol( ProtocolError::Protocol("EOF during handshake".to_string()), ))); } } } // Perform auth, if needed. if self.state == ProtoState::Authentication { match self.framed.read_message().await? { Some(FeMessage::PasswordMessage(m)) => { assert!(self.auth_type == AuthType::NeonJWT); let (_, jwt_response) = m.split_last().context("protocol violation")?; if let Err(e) = handler.check_auth_jwt(self, jwt_response) { self.write_message_noflush(&BeMessage::ErrorResponse( &short_error(&e), Some(e.pg_error_code()), ))?; return Err(e); } self.write_message_noflush(&BeMessage::AuthenticationOk)? .write_message_noflush(&BeMessage::CLIENT_ENCODING)?
.write_message(&BeMessage::ReadyForQuery) .await?; self.state = ProtoState::Established; } Some(m) => { return Err(QueryError::Other(anyhow::anyhow!( "Unexpected message {:?} while waiting for handshake", m ))); } None => { trace!( "postgres backend to {:?} received EOF during auth", self.peer_addr ); self.state = ProtoState::Closed; return Err(QueryError::Disconnected(ConnectionError::Protocol( ProtocolError::Protocol("EOF during auth".to_string()), ))); } } } Ok(()) } /// Process startup packet: /// - transition to Established if auth type is trust /// - transition to Authentication if auth type is NeonJWT. /// - or perform TLS handshake -- then need to call this again to receive /// actual startup packet. /// - a GSS encryption request is answered with EncryptionResponse(false). /// - a StartupMessage on a non-encrypted connection is rejected with an error /// when tls_config is set. async fn process_startup_message( &mut self, handler: &mut impl Handler, msg: FeStartupPacket, ) -> Result<(), QueryError> { assert!(self.state < ProtoState::Authentication); let have_tls = self.tls_config.is_some(); match msg { FeStartupPacket::SslRequest { direct } => { debug!("SSL requested"); if !direct { self.write_message(&BeMessage::EncryptionResponse(have_tls)) .await?; } else if !have_tls { return Err(QueryError::Other(anyhow::anyhow!( "direct SSL negotiation but no TLS support" ))); } if have_tls { self.start_tls().await?; self.state = ProtoState::Encrypted; } } FeStartupPacket::GssEncRequest => { debug!("GSS requested"); self.write_message(&BeMessage::EncryptionResponse(false)) .await?; } FeStartupPacket::StartupMessage { .. } => { if have_tls && !matches!(self.state, ProtoState::Encrypted) { self.write_message(&BeMessage::ErrorResponse("must connect with TLS", None)) .await?; return Err(QueryError::Other(anyhow::anyhow!( "client did not connect with TLS" ))); } // NB: startup() may change self.auth_type -- we are using that in proxy code // to bypass auth for new users. handler.startup(self, &msg)?; match self.auth_type { AuthType::Trust => { self.write_message_noflush(&BeMessage::AuthenticationOk)?
.write_message_noflush(&BeMessage::CLIENT_ENCODING)? .write_message_noflush(&BeMessage::INTEGER_DATETIMES)? // The async python driver requires a valid server_version .write_message_noflush(&BeMessage::server_version("14.1"))? .write_message(&BeMessage::ReadyForQuery) .await?; self.state = ProtoState::Established; } AuthType::NeonJWT => { self.write_message(&BeMessage::AuthenticationCleartextPassword) .await?; self.state = ProtoState::Authentication; } } } FeStartupPacket::CancelRequest { .. } => { return Err(QueryError::Other(anyhow::anyhow!( "Unexpected CancelRequest message during handshake" ))); } } Ok(()) } // Proto looks like this: // FeMessage::Query("pagestream_v2{FeMessage::CopyData(PagestreamFeMessage::GetPage(..))}") async fn process_message( &mut self, handler: &mut impl Handler, msg: FeMessage, unnamed_query_string: &mut Bytes, ) -> Result { // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth // TODO: change that to proper top-level match of protocol state with separate message handling for each state assert!(self.state == ProtoState::Established); match msg { FeMessage::Query(body) => { // remove null terminator let query_string = cstr_to_str(&body)?; trace!("got query {query_string:?}"); if let Err(e) = handler.process_query(self, query_string).await { match e { err @ QueryError::Shutdown => { // Notify postgres of the connection shutdown at the libpq // protocol level. This avoids postgres having to tell apart // an idle connection from a stale one, which is bug prone.
let shutdown_error = short_error(&err); self.write_message_noflush(&BeMessage::ErrorResponse( &shutdown_error, Some(err.pg_error_code()), ))?; return Ok(ProcessMsgResult::Break); } QueryError::SimulatedConnectionError => { return Err(QueryError::SimulatedConnectionError); } err @ QueryError::Reconnect => { // Instruct the client to reconnect, stop processing messages // from this libpq connection and, finally, disconnect from the // server side (returning an Err achieves the latter). // // Note the flushing is done by the caller. let reconnect_error = short_error(&err); self.write_message_noflush(&BeMessage::ErrorResponse( &reconnect_error, Some(err.pg_error_code()), ))?; return Err(err); } e => { log_query_error(query_string, &e); let short_error = short_error(&e); self.write_message_noflush(&BeMessage::ErrorResponse( &short_error, Some(e.pg_error_code()), ))?; } } } self.write_message_noflush(&BeMessage::ReadyForQuery)?; } FeMessage::Parse(m) => { *unnamed_query_string = m.query_string; self.write_message_noflush(&BeMessage::ParseComplete)?; } FeMessage::Describe(_) => { self.write_message_noflush(&BeMessage::ParameterDescription)? .write_message_noflush(&BeMessage::NoData)?; } FeMessage::Bind(_) => { self.write_message_noflush(&BeMessage::BindComplete)?; } FeMessage::Close(_) => { self.write_message_noflush(&BeMessage::CloseComplete)?; } FeMessage::Execute(_) => { let query_string = cstr_to_str(unnamed_query_string)?; trace!("got execute {query_string:?}"); if let Err(e) = handler.process_query(self, query_string).await { log_query_error(query_string, &e); self.write_message_noflush(&BeMessage::ErrorResponse( &e.to_string(), Some(e.pg_error_code()), ))?; } // NOTE there is no ReadyForQuery message. This handler is used // for basebackup and it uses CopyOut which doesn't require // ReadyForQuery message and backend just switches back to // processing mode after sending CopyDone or ErrorResponse.
} FeMessage::Sync => { self.write_message_noflush(&BeMessage::ReadyForQuery)?; } FeMessage::Terminate => { return Ok(ProcessMsgResult::Break); } // We prefer explicit pattern matching to wildcards, because // this helps us spot the places where new variants are missing FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail | FeMessage::PasswordMessage(_) => { return Err(QueryError::Other(anyhow::anyhow!( "unexpected message type: {msg:?}", ))); } } Ok(ProcessMsgResult::Continue) } /// - Log as info/error result of handling COPY stream and send back /// ErrorResponse if that makes sense. /// - Reply with CopyDone if the stream ended with CopyDone. /// - Then always shut down the stream and set state to Closed, because we /// don't handle exiting from COPY stream normally. pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) { use CopyStreamHandlerEnd::*; let expected_end = match &end { ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF | Cancelled => true, // The timeline doesn't exist and we have been requested to not auto-create it. // Compute requests for timelines that haven't been created yet // might reach us before the storcon request to create those timelines. TimelineNoCreate => true, CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error)) if is_expected_io_error(io_error) => { true } _ => false, }; if expected_end { info!("terminated: {:#}", end); } else { error!("terminated: {:?}", end); } // Note: no current usages ever send this if let CopyDone = &end { if let Err(e) = self.write_message(&BeMessage::CopyDone).await { error!("failed to send CopyDone: {}", e); } } let err_to_send_and_errcode = match &end { ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)), Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)), // Note: CopyFail in duplex copy is somewhat unexpected (at least to // PG walsender; evidently and per my docs reading client should // finish it with CopyDone).
It is not a problem to recover from it // finishing the stream in both directions like we do, but note that // sync rust-postgres client (which we don't use anymore) hangs if // socket is not closed here. // https://github.com/sfackler/rust-postgres/issues/755 // https://github.com/neondatabase/neon/issues/935 // // Currently, the version of tokio_postgres replication patch we use // sends this when it closes the stream (e.g. pageserver decided to // switch conn to another safekeeper and client gets dropped). // Moreover, seems like 'connection' task errors with 'unexpected // message from server' when it receives ErrorResponse (anything but // CopyData/CopyDone) back. CopyFail => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)), // When cancelled, send no response: we must not risk blocking on sending that response Cancelled => None, _ => None, }; if let Some((err, errcode)) = err_to_send_and_errcode { if let Err(ee) = self .write_message(&BeMessage::ErrorResponse(&err, Some(errcode))) .await { error!("failed to send ErrorResponse: {}", ee); } } // Proper COPY stream finishing to continue using the connection is not // implemented at the server side (we don't need it so far). To prevent // further usages of the connection, close it. self.framed.shutdown().await.ok(); self.state = ProtoState::Closed; } } pub struct PostgresBackendReader { reader: FramedReader>, closed: bool, // true once read_message() has returned an error } impl PostgresBackendReader { /// Read full message or return None if connection is cleanly closed with no /// unprocessed data. On a read error, `closed` is set before the error is /// returned. pub async fn read_message(&mut self) -> Result, ConnectionError> { match self.reader.read_message().await { Ok(m) => { trace!("read msg {:?}", m); Ok(m) } Err(e) => { self.closed = true; Err(e) } } } /// Get CopyData contents of the next message in COPY stream or error /// closing it.
The error type is wider than actual errors which can happen /// here -- it includes 'Other' and 'ServerInitiated', but that's ok for /// current callers. pub async fn read_copy_message(&mut self) -> Result { match self.read_message().await? { Some(msg) => match msg { FeMessage::CopyData(m) => Ok(m), FeMessage::CopyDone => Err(CopyStreamHandlerEnd::CopyDone), FeMessage::CopyFail => Err(CopyStreamHandlerEnd::CopyFail), FeMessage::Terminate => Err(CopyStreamHandlerEnd::Terminate), _ => Err(CopyStreamHandlerEnd::from(ConnectionError::Protocol( ProtocolError::Protocol(format!("unexpected message in COPY stream {msg:?}")), ))), }, None => Err(CopyStreamHandlerEnd::EOF), } } } /// /// An AsyncWrite implementation that wraps all data written to it in CopyData /// messages. /// pub struct CopyDataWriter<'a, IO> { pgb: &'a mut PostgresBackend, } impl AsyncWrite for CopyDataWriter<'_, IO> { fn poll_write( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, buf: &[u8], ) -> Poll> { let this = self.get_mut(); // It's not strictly required to flush between each message, but makes it easier // to view in wireshark, and usually the messages that the callers write are // decently-sized anyway. if let Err(err) = ready!(this.pgb.poll_flush(cx)) { return Poll::Ready(Err(err)); } // CopyData // XXX: if the input is large, we should split it into multiple messages. // Not sure what the threshold should be, but the ultimate hard limit is that // the length cannot exceed u32. this.pgb .write_message_noflush(&BeMessage::CopyData(buf)) // write_message_noflush only writes to the buffer, so it can fail iff the // message is invalid, but CopyData can't be invalid.
.map_err(|_| io::Error::other("failed to serialize CopyData"))?; Poll::Ready(Ok(buf.len())) } fn poll_flush( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, ) -> Poll> { let this = self.get_mut(); this.pgb.poll_flush(cx) } /* Shutdown only flushes buffered output via poll_flush; nothing else is closed here. */ fn poll_shutdown( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, ) -> Poll> { let this = self.get_mut(); this.pgb.poll_flush(cx) } } /* Terse text for an error; the payloads of NotFound and Unauthorized are not included in the returned string. */ pub fn short_error(e: &QueryError) -> String { match e { QueryError::Disconnected(connection_error) => connection_error.to_string(), QueryError::Reconnect => "reconnect".to_string(), QueryError::Shutdown => "shutdown".to_string(), QueryError::NotFound(_) => "not found".to_string(), QueryError::Unauthorized(_e) => "JWT authentication error".to_string(), QueryError::SimulatedConnectionError => "simulated connection error".to_string(), QueryError::Other(e) => format!("{e:#}"), } } fn log_query_error(query: &str, e: &QueryError) { // If you want to change the log level of a specific error, also re-categorize it in `BasebackupQueryTimeOngoingRecording`.
match e { QueryError::Disconnected(ConnectionError::Io(io_error)) => { if is_expected_io_error(io_error) { info!("query handler for '{query}' failed with expected io error: {io_error}"); } else { error!("query handler for '{query}' failed with io error: {io_error}"); } } QueryError::Disconnected(other_connection_error) => { error!( "query handler for '{query}' failed with connection error: {other_connection_error:?}" ) } QueryError::SimulatedConnectionError => { error!("query handler for query '{query}' failed due to a simulated connection error") } QueryError::Reconnect => { info!("query handler for '{query}' requested client to reconnect") } QueryError::Shutdown => { info!("query handler for '{query}' cancelled during tenant shutdown") } QueryError::NotFound(reason) => { info!("query handler for '{query}' entity not found: {reason}") } QueryError::Unauthorized(e) => { warn!("query handler for '{query}' failed with authentication error: {e}"); } QueryError::Other(e) => { error!("query handler for '{query}' failed: {e:?}"); } } } /// Something finishing handling of COPY stream, see handle_copy_stream_end. /// This is not always a real error, but it allows using `?` and thiserror impls. #[derive(thiserror::Error, Debug)] pub enum CopyStreamHandlerEnd { /// Handler initiates the end of streaming.
#[error("{0}")] ServerInitiated(String), #[error("received CopyDone")] CopyDone, #[error("received CopyFail")] CopyFail, #[error("received Terminate")] Terminate, #[error("EOF on COPY stream")] EOF, #[error("timeline not found, and allow_timeline_creation is false")] TimelineNoCreate, /// The connection was lost #[error("connection error: {0}")] Disconnected(#[from] ConnectionError), #[error("Shutdown")] Cancelled, /// Some other error #[error(transparent)] Other(#[from] anyhow::Error), } ================================================ FILE: libs/postgres_backend/tests/cert.pem ================================================ -----BEGIN CERTIFICATE----- MIIDbjCCAlagAwIBAgIUGHJukXa1bQathgBHC40+A18BsnYwDQYJKoZIhvcNAQEL BQAwYzELMAkGA1UEBhMCVVMxEzARBgNVBAgMCkNhbGlmb3JuaWExFjAUBgNVBAcM DVNhbiBGcmFuY2lzY28xEzARBgNVBAoMCk15IENvbXBhbnkxEjAQBgNVBAMMCWxv Y2FsaG9zdDAgFw0yMTA4MTMxODQyMjBaGA8yMTIxMDcyMDE4NDIyMFowYzELMAkG A1UEBhMCVVMxEzARBgNVBAgMCkNhbGlmb3JuaWExFjAUBgNVBAcMDVNhbiBGcmFu Y2lzY28xEzARBgNVBAoMCk15IENvbXBhbnkxEjAQBgNVBAMMCWxvY2FsaG9zdDCC ASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAOI9S+nh8ABMp5jpb7WWfAYr tGJ4C7gi9IPTVIRxSSrt5KglEysrOiKlhan1Ut2e8CCudztdXtCvT8/goJWlmxpF IQkErlCsOdGHeEJ0EZxoU1fMkBAQVf6Rb1JE9ladG2+D1e7yvxmMqfPVuU8lj+kN nESP+I3ESNCtuqgtfcErxu3TuhSzV2slSi5lrYQCwERgCevl6LUNd2mEaYdS4mmJ 4RZqc2C4y7JO5wSDjga8GIBHJVo70HRVsvX7eE8r6tMP2HyGyonBitBKAc2QEQIv cLCuMOTtTBlYcMvTmJEOHFKwIJXm0XmQfAWeKFfyK7493fB4Gu+8Dc1xC+IHaTEC AwEAAaMYMBYwFAYDVR0RBA0wC4IJbG9jYWxob3N0MA0GCSqGSIb3DQEBCwUAA4IB AQBjY+g3eF8m8lEWz+QgKp88MhTdtJTsEsSz0GAi58SnEkuyxVOHjKEyjGKJWTtT ICgmEzC85uaS7VBdftoYNmsbvNewGiisDGQRWCjOGM7lTaA4FQPADguexMvXh/nO 9PQoTxtp7qwvGWO2mED6LWU6bjT3cL+XgrOwT9sticRTl6/BXV8wAmyxT0DkQ3nJ zbRuTP/G2kE0bRK++67kK0ovopRkX6Dl6di1EFlkAnPBC2d8tdcNTXYhkxZk4O0q GUolwiuWz/dtD3tZ2bx3vqzT7uIFHS4XP6Q3SRNWFTGhuvAc7DPvCZBqxy6odeyQ VxBgJtq+pNjYYkeaSQVQ+UMU -----END CERTIFICATE----- ================================================ FILE: libs/postgres_backend/tests/key.pem 
================================================ -----BEGIN RSA PRIVATE KEY----- MIIEpAIBAAKCAQEA4j1L6eHwAEynmOlvtZZ8Biu0YngLuCL0g9NUhHFJKu3kqCUT Kys6IqWFqfVS3Z7wIK53O11e0K9Pz+CglaWbGkUhCQSuUKw50Yd4QnQRnGhTV8yQ EBBV/pFvUkT2Vp0bb4PV7vK/GYyp89W5TyWP6Q2cRI/4jcRI0K26qC19wSvG7dO6 FLNXayVKLmWthALARGAJ6+XotQ13aYRph1LiaYnhFmpzYLjLsk7nBIOOBrwYgEcl WjvQdFWy9ft4Tyvq0w/YfIbKicGK0EoBzZARAi9wsK4w5O1MGVhwy9OYkQ4cUrAg lebReZB8BZ4oV/Irvj3d8Hga77wNzXEL4gdpMQIDAQABAoIBAQClKycO+zpinZQG GPbLVa/6OVIaSZYUusBUtaaQgrxuMPusnlSeQZLR1JH/APGchvq8gWLe3k3ogPT9 yPq0BhF0Xl+928L/dp1HkWWE7oQk8i1Wfiv27lY54iepoltN5KkxAsjfCC3oEz/I mpINbFjiRmN90rYdmd2nLA6H1Z5ntZQm5AcTo3OJZlTVN9eH9TV8f0AQRQgUJsL9 75agSmj7euqZOqvvwfpsYzaZEhzMSG2QIcS3WglInbHy8c6ikZSm36J36wgsatMz CBZ6pMNtonRSKvAECQhBGEA73evtnGbLH0EY9KouN4KSHEHob89dGVeeXozksf9x QUE1/yOhAoGBAP818f7vIH6Z3QwWgTMwQsPBW+wNOIbTZrbZaihnz2K9XMu39TV6 DWQHMsOlvg2QURZGwqB3jFn4wqZHmt7XYwk553E60kIw4hDvgpkkqmXVwK3kZASQ RRUax3hZ1gCWxpXlRZ1SvHNXjN9KEFwqQbR33XcxzC3TpSp0KYghT9jFAoGBAOLw agejqSF+f/5W1QhEKlM+tSlluo2sn5kKVkM4nNezFukb3pu5oScFjoQQGsoaz5aU kLlxW5h/aSxquhgcuo6I4Ux5dcgNm4QeonCCp+Qycn7tzyoJFL4odT9vYPQa5O9E hD9aSqhBBD1IIOS2T3vcW6VxibKZx1CRMDdRz119AoGALflr1L8DHYteNLVBJRWG kXkdtBJVooQmtr3Hz+uTgngWZWSIOc/45ZIeZPxQlmTvFpI8sWeX0wVrG0U+8vHe F2Vk+hLcmavwrZhX8HqYb6vn/+tq0R+kMj8Wu+mDEawXrh0VQ1gKNsUIzZisBc5e 88G8FaLU41SDJniymqFVnvkCgYEA1ou/UfWRwg6b5tIkmKoI8aZJExgPpDzcrYyu POLatLmlIUCt1b9K8V85evTWvtdWBd/yar8WfzeFMO69fGo8nOAfT3NMvJLQwblM jN2Y6A4hXIpq3iyzpYsOPaiImn6KjQHTnSk5h5Pf9CeqoU8SGeEb629JZMYpPqvk T4hSaOkCgYBPaf51oSAstqdj0vxrsFS3EN3D8Fk0xQWt9Ss3ZGFAlTaEq5xoIk4k YfKVDv1S6/vlzbheIIzQ2lzVvG4AW+drQLsmEx5iMKvbNtFAur9kwUFU202Q2dki ZQJ/JvjnPYFKxy+SVlLJ1h9RD9E3dgL/Ai7OUfbmX771vN0IQF7Z6Q== -----END RSA PRIVATE KEY----- ================================================ FILE: libs/postgres_backend/tests/simple_select.rs ================================================ use std::io::Cursor; use std::sync::Arc; /// Test postgres_backend_async with tokio_postgres use once_cell::sync::Lazy; 
use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; use pq_proto::{BeMessage, RowDescriptor}; use rustls::crypto::ring; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio_postgres::config::SslMode; use tokio_postgres::tls::MakeTlsConnect; use tokio_postgres::{Config, NoTls, SimpleQueryMessage}; use tokio_postgres_rustls::MakeRustlsConnect; use tokio_util::sync::CancellationToken; // generate client, server test streams async fn make_tcp_pair() -> (TcpStream, TcpStream) { let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); let addr = listener.local_addr().unwrap(); let client_stream = TcpStream::connect(addr).await.unwrap(); let (server_stream, _) = listener.accept().await.unwrap(); (client_stream, server_stream) } struct TestHandler {} impl Handler for TestHandler { // return single col 'hey' for any query async fn process_query( &mut self, pgb: &mut PostgresBackend, _query_string: &str, ) -> Result<(), QueryError> { pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( b"hey", )]))? .write_message_noflush(&BeMessage::DataRow(&[Some("hey".as_bytes())]))? .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; Ok(()) } } // test that basic select works #[tokio::test] async fn simple_select() { let (client_sock, server_sock) = make_tcp_pair().await; // create and run pgbackend let pgbackend = PostgresBackend::new(server_sock, AuthType::Trust, None).expect("pgbackend creation"); tokio::spawn(async move { let mut handler = TestHandler {}; pgbackend.run(&mut handler, &CancellationToken::new()).await }); let conf = Config::new(); let (client, connection) = conf.connect_raw(client_sock, NoTls).await.expect("connect"); // The connection object performs the actual communication with the database, // so spawn it off to run on its own. 
tokio::spawn(async move { if let Err(e) = connection.await { eprintln!("connection error: {e}"); } }); let first_val = &(client.simple_query("SELECT 42;").await.expect("select"))[0]; if let SimpleQueryMessage::Row(row) = first_val { let first_col = row.get(0).expect("first column"); assert_eq!(first_col, "hey"); } else { panic!("expected SimpleQueryMessage::Row"); } } static KEY: Lazy> = Lazy::new(|| { let mut cursor = Cursor::new(include_bytes!("key.pem")); let key = rustls_pemfile::rsa_private_keys(&mut cursor) .next() .unwrap() .unwrap(); rustls::pki_types::PrivateKeyDer::Pkcs1(key) }); static CERT: Lazy> = Lazy::new(|| { let mut cursor = Cursor::new(include_bytes!("cert.pem")); rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap() }); // test that basic select with ssl works #[tokio::test] async fn simple_select_ssl() { let (client_sock, server_sock) = make_tcp_pair().await; let server_cfg = rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() .expect("aws_lc_rs should support the default protocol versions") .with_no_client_auth() .with_single_cert(vec![CERT.clone()], KEY.clone_key()) .unwrap(); let tls_config = Some(Arc::new(server_cfg)); let pgbackend = PostgresBackend::new(server_sock, AuthType::Trust, tls_config).expect("pgbackend creation"); tokio::spawn(async move { let mut handler = TestHandler {}; pgbackend.run(&mut handler, &CancellationToken::new()).await }); let client_cfg = rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() .expect("aws_lc_rs should support the default protocol versions") .with_root_certificates({ let mut store = rustls::RootCertStore::empty(); store.add(CERT.clone()).unwrap(); store }) .with_no_client_auth(); let mut make_tls_connect = tokio_postgres_rustls::MakeRustlsConnect::new(client_cfg); let tls_connect = >::make_tls_connect( &mut make_tls_connect, "localhost", ) 
.expect("make_tls_connect"); let mut conf = Config::new(); conf.ssl_mode(SslMode::Require); let (client, connection) = conf .connect_raw(client_sock, tls_connect) .await .expect("connect"); // The connection object performs the actual communication with the database, // so spawn it off to run on its own. tokio::spawn(async move { if let Err(e) = connection.await { eprintln!("connection error: {e}"); } }); let first_val = &(client.simple_query("SELECT 42;").await.expect("select"))[0]; if let SimpleQueryMessage::Row(row) = first_val { let first_col = row.get(0).expect("first column"); assert_eq!(first_col, "hey"); } else { panic!("expected SimpleQueryMessage::Row"); } } ================================================ FILE: libs/postgres_connection/Cargo.toml ================================================ [package] name = "postgres_connection" version = "0.1.0" edition.workspace = true license.workspace = true [dependencies] anyhow.workspace = true itertools.workspace = true tokio-postgres.workspace = true url.workspace = true [dev-dependencies] once_cell.workspace = true ================================================ FILE: libs/postgres_connection/src/lib.rs ================================================ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] use std::borrow::Cow; use std::fmt; use anyhow::{Context, bail}; use itertools::Itertools; use url::Host; /// Parses a string of format either `host:port` or `host` into a corresponding pair. /// /// The `host` part should be a correct `url::Host`, while `port` (if present) should be /// a valid decimal u16 of digits only. pub fn parse_host_port>(host_port: S) -> Result<(Host, Option), anyhow::Error> { let (host, port) = match host_port.as_ref().rsplit_once(':') { Some((host, port)) => ( host, // +80 is a valid u16, but not a valid port if port.chars().all(|c| c.is_ascii_digit()) { Some(port.parse::().context("Unable to parse port")?) 
} else { bail!("Port contains a non-ascii-digit") }, ), None => (host_port.as_ref(), None), // No colons, no port specified }; let host = Host::parse(host).context("Unable to parse host")?; Ok((host, port)) } #[cfg(test)] mod tests_parse_host_port { use url::Host; use crate::parse_host_port; #[test] fn test_normal() { let (host, port) = parse_host_port("hello:123").unwrap(); assert_eq!(host, Host::Domain("hello".to_owned())); assert_eq!(port, Some(123)); } #[test] fn test_no_port() { let (host, port) = parse_host_port("hello").unwrap(); assert_eq!(host, Host::Domain("hello".to_owned())); assert_eq!(port, None); } #[test] fn test_ipv6() { let (host, port) = parse_host_port("[::1]:123").unwrap(); assert_eq!(host, Host::::Ipv6(std::net::Ipv6Addr::LOCALHOST)); assert_eq!(port, Some(123)); } #[test] fn test_invalid_host() { assert!(parse_host_port("hello world").is_err()); } #[test] fn test_invalid_port() { assert!(parse_host_port("hello:+80").is_err()); } } #[derive(Clone)] pub struct PgConnectionConfig { host: Host, port: u16, password: Option, options: Vec, } /// A simplified PostgreSQL connection configuration. Supports only a subset of possible /// settings for simplicity. A password getter or `to_connection_string` methods are not /// added by design to avoid accidentally leaking password through logging, command line /// arguments to a child process, or likewise. impl PgConnectionConfig { pub fn new_host_port(host: Host, port: u16) -> Self { PgConnectionConfig { host, port, password: None, options: vec![], } } pub fn host(&self) -> &Host { &self.host } pub fn port(&self) -> u16 { self.port } pub fn set_host(mut self, h: Host) -> Self { self.host = h; self } pub fn set_port(mut self, p: u16) -> Self { self.port = p; self } pub fn set_password(mut self, s: Option) -> Self { self.password = s; self } pub fn extend_options, S: Into>(mut self, i: I) -> Self { self.options.extend(i.into_iter().map(|s| s.into())); self } /// Return a `:` string. 
pub fn raw_address(&self) -> String { format!("{}:{}", self.host(), self.port()) } /// Build a client library-specific connection configuration. /// Used for testing and when we need to add some obscure configuration /// elements at the last moment. pub fn to_tokio_postgres_config(&self) -> tokio_postgres::Config { // Use `tokio_postgres::Config` instead of `postgres::Config` because // the former supports more options to fiddle with later. let mut config = tokio_postgres::Config::new(); config.host(&self.host().to_string()).port(self.port); if let Some(password) = &self.password { config.password(password); } if !self.options.is_empty() { // These options are command-line options and should be escaped before being passed // as an 'options' connection string parameter, see // https://www.postgresql.org/docs/15/libpq-connect.html#LIBPQ-CONNECT-OPTIONS // // They will be space-separated, so each space inside an option should be escaped, // and all backslashes should be escaped before that. Although we don't expect options // with spaces at the moment, they're supported by PostgreSQL. Hence we support them // in this typesafe interface. // // We use `Cow` to avoid allocations in the best case (no escaping). A fully imperative // solution would require 1-2 allocations in the worst case as well, but it's harder to // implement and this function is hardly a bottleneck. The function is only called around // establishing a new connection. #[allow(unstable_name_collisions)] config.options( &self .options .iter() .map(|s| { if s.contains(['\\', ' ']) { Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ ")) } else { Cow::Borrowed(s.as_str()) } }) .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized .collect::(), ); } config } /// Connect using postgres protocol with TLS disabled. 
pub async fn connect_no_tls( &self, ) -> Result< ( tokio_postgres::Client, tokio_postgres::Connection, ), tokio_postgres::Error, > { self.to_tokio_postgres_config() .connect(tokio_postgres::NoTls) .await } } impl fmt::Display for PgConnectionConfig { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // The password is intentionally hidden and not part of this display string. write!(f, "postgresql://{}:{}", self.host, self.port) } } impl fmt::Debug for PgConnectionConfig { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")` // so even if the password is `REDACTED-STRING` (quite unlikely) there is no confusion. // Hence `format_args!()`, it returns a "safe" string which is not escaped by `Debug`. f.debug_struct("PgConnectionConfig") .field("host", &self.host) .field("port", &self.port) .field( "password", &self .password .as_ref() .map(|_| format_args!("REDACTED-STRING")), ) .finish() } } #[cfg(test)] mod tests_pg_connection_config { use once_cell::sync::Lazy; use url::Host; use crate::PgConnectionConfig; static STUB_HOST: Lazy = Lazy::new(|| Host::Domain("stub.host.example".to_owned())); #[test] fn test_no_password() { let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123); assert_eq!(cfg.host(), &*STUB_HOST); assert_eq!(cfg.port(), 123); assert_eq!(cfg.raw_address(), "stub.host.example:123"); assert_eq!( format!("{cfg:?}"), "PgConnectionConfig { host: Domain(\"stub.host.example\"), port: 123, password: None }" ); } #[test] fn test_ipv6() { // May be a special case because hostname contains a colon. 
let cfg = PgConnectionConfig::new_host_port(Host::parse("[::1]").unwrap(), 123); assert_eq!( cfg.host(), &Host::::Ipv6(std::net::Ipv6Addr::LOCALHOST) ); assert_eq!(cfg.port(), 123); assert_eq!(cfg.raw_address(), "[::1]:123"); assert_eq!( format!("{cfg:?}"), "PgConnectionConfig { host: Ipv6(::1), port: 123, password: None }" ); } #[test] fn test_with_password() { let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123) .set_password(Some("password".to_owned())); assert_eq!(cfg.host(), &*STUB_HOST); assert_eq!(cfg.port(), 123); assert_eq!(cfg.raw_address(), "stub.host.example:123"); assert_eq!( format!("{cfg:?}"), "PgConnectionConfig { host: Domain(\"stub.host.example\"), port: 123, password: Some(REDACTED-STRING) }" ); } #[test] fn test_with_options() { let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123).extend_options([ "hello", "world", "with space", "and \\ backslashes", ]); assert_eq!(cfg.host(), &*STUB_HOST); assert_eq!(cfg.port(), 123); assert_eq!(cfg.raw_address(), "stub.host.example:123"); assert_eq!( cfg.to_tokio_postgres_config().get_options(), Some("hello world with\\ space and\\ \\\\\\ backslashes") ); } } ================================================ FILE: libs/postgres_ffi/Cargo.toml ================================================ [package] name = "postgres_ffi" version = "0.1.0" edition.workspace = true license.workspace = true [dependencies] regex.workspace = true bytes.workspace = true anyhow.workspace = true crc32c.workspace = true once_cell.workspace = true pprof.workspace = true thiserror.workspace = true serde.workspace = true postgres_ffi_types.workspace = true utils.workspace = true tracing.workspace = true postgres_versioninfo.workspace = true [dev-dependencies] criterion.workspace = true env_logger.workspace = true postgres.workspace = true [build-dependencies] anyhow.workspace = true bindgen.workspace = true [[bench]] name = "waldecoder" harness = false ================================================ FILE: 
libs/postgres_ffi/README.md ================================================ This module contains utilities for working with PostgreSQL file formats. It's a collection of structs that are auto-generated from the PostgreSQL header files using bindgen, and Rust functions to read and manipulate them. There are also a bunch of constants in `pg_constants.rs` that are copied from various PostgreSQL headers, rather than auto-generated. They mostly should be auto-generated too, but that's a TODO. The PostgreSQL on-disk file format is not portable across different CPU architectures and operating systems. It is also subject to change in each major PostgreSQL version. Currently, this module supports PostgreSQL v14, v15, v16 and v17: bindings and code that depends on them are version-specific. This code is organized in modules `postgres_ffi::v14`, `postgres_ffi::v15`, `postgres_ffi::v16` and `postgres_ffi::v17`. Version independent code is explicitly exported into shared `postgres_ffi`. TODO: Currently, there is also some code that deals with WAL records in pageserver/src/waldecoder.rs. That should be moved into this module. The rest of the codebase should not have intimate knowledge of PostgreSQL file formats or WAL layout, that knowledge should be encapsulated in this module. ================================================ FILE: libs/postgres_ffi/benches/README.md ================================================ ## Benchmarks To run benchmarks: ```sh # All benchmarks. cargo bench --package postgres_ffi # Specific file. cargo bench --package postgres_ffi --bench waldecoder # Specific benchmark. cargo bench --package postgres_ffi --bench waldecoder complete_record/size=1024 # List available benchmarks. cargo bench --package postgres_ffi --benches -- --list # Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. # Output in target/criterion/*/profile/flamegraph.svg.
cargo bench --package postgres_ffi --bench waldecoder complete_record/size=1024 -- --profile-time 10 ``` Additional charts and statistics are available in `target/criterion/report/index.html`. Benchmarks are automatically compared against the previous run. To compare against other runs, see `--baseline` and `--save-baseline`. ================================================ FILE: libs/postgres_ffi/benches/waldecoder.rs ================================================ use std::ffi::CStr; use criterion::{Bencher, Criterion, criterion_group, criterion_main}; use postgres_ffi::v17::wal_generator::LogicalMessageGenerator; use postgres_ffi::v17::waldecoder_handler::WalStreamDecoderHandler; use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_versioninfo::PgMajorVersion; use pprof::criterion::{Output, PProfProfiler}; use utils::lsn::Lsn; const KB: usize = 1024; // Register benchmarks with Criterion. criterion_group!( name = benches; config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); targets = bench_complete_record, ); criterion_main!(benches); /// Benchmarks WalStreamDecoder::complete_record() for a logical message of varying size. fn bench_complete_record(c: &mut Criterion) { let mut g = c.benchmark_group("complete_record"); for size in [64, KB, 8 * KB, 128 * KB] { // Kind of weird to change the group throughput per benchmark, but it's the only way // to vary it per benchmark. It works. 
g.throughput(criterion::Throughput::Bytes(size as u64)); g.bench_function(format!("size={size}"), |b| run_bench(b, size).unwrap()); } fn run_bench(b: &mut Bencher, size: usize) -> anyhow::Result<()> { const PREFIX: &CStr = c""; let value_size = LogicalMessageGenerator::make_value_size(size, PREFIX); let value = vec![1; value_size]; let mut decoder = WalStreamDecoder::new(Lsn(0), PgMajorVersion::PG17); let msg = LogicalMessageGenerator::new(PREFIX, &value) .next() .unwrap() .encode(Lsn(0)); assert_eq!(msg.len(), size); b.iter(|| { let msg = msg.clone(); // Bytes::clone() is cheap decoder.complete_record(msg).unwrap(); }); Ok(()) } } ================================================ FILE: libs/postgres_ffi/bindgen_deps.h ================================================ /* * This header file is the input to bindgen. It includes all the * PostgreSQL headers that we need to auto-generate Rust structs * from. If you need to expose a new struct to Rust code, add the * header here, and whitelist the struct in the build.rs file. */ #include "c.h" #include "catalog/pg_control.h" #include "access/xlog_internal.h" #include "storage/block.h" #include "storage/bufpage.h" #include "storage/off.h" #include "access/multixact.h" ================================================ FILE: libs/postgres_ffi/build.rs ================================================ extern crate bindgen; use std::env; use std::path::PathBuf; use std::process::Command; use anyhow::{Context, anyhow}; use bindgen::callbacks::{DeriveInfo, ParseCallbacks}; #[derive(Debug)] struct PostgresFfiCallbacks; impl ParseCallbacks for PostgresFfiCallbacks { fn include_file(&self, filename: &str) { // This does the equivalent of passing bindgen::CargoCallbacks // to the builder .parse_callbacks() method. let cargo_callbacks = bindgen::CargoCallbacks::new(); cargo_callbacks.include_file(filename) } // Add any custom #[derive] attributes to the data structures that bindgen // creates. 
fn add_derives(&self, derive_info: &DeriveInfo) -> Vec { // This is the list of data structures that we want to serialize/deserialize. let serde_list = [ "XLogRecord", "XLogPageHeaderData", "XLogLongPageHeaderData", "CheckPoint", "FullTransactionId", "ControlFileData", ]; if serde_list.contains(&derive_info.name) { vec![ "Default".into(), // Default allows us to easily fill the padding fields with 0. "Serialize".into(), "Deserialize".into(), ] } else { vec![] } } } fn main() -> anyhow::Result<()> { // Tell cargo to invalidate the built crate whenever the wrapper changes println!("cargo:rerun-if-changed=bindgen_deps.h"); // Finding the location of C headers for the Postgres server: // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/pg_install` // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/pg_install/{PG_MAJORVERSION}/include/postgresql/server` let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") { postgres_install_dir.into() } else { PathBuf::from("pg_install") }; for pg_version in &["v14", "v15", "v16", "v17"] { let mut pg_install_dir_versioned = pg_install_dir.join(pg_version); if pg_install_dir_versioned.is_relative() { let cwd = env::current_dir().context("Failed to get current_dir")?; pg_install_dir_versioned = cwd.join("..").join("..").join(pg_install_dir_versioned); } let pg_config_bin = pg_install_dir_versioned.join("bin").join("pg_config"); let inc_server_path: String = if pg_config_bin.exists() { let output = Command::new(pg_config_bin) .arg("--includedir-server") .output() .context("failed to execute `pg_config --includedir-server`")?; if !output.status.success() { panic!("`pg_config --includedir-server` failed") } String::from_utf8(output.stdout) .context("pg_config output is not UTF-8")? 
.trim_end() .into() } else { let server_path = pg_install_dir_versioned .join("include") .join("postgresql") .join("server") .into_os_string(); server_path .into_string() .map_err(|s| anyhow!("Bad postgres server path {s:?}"))? }; // The bindgen::Builder is the main entry point // to bindgen, and lets you build up options for // the resulting bindings. let bindings = bindgen::Builder::default() // // All the needed PostgreSQL headers are included from 'bindgen_deps.h' // .header("bindgen_deps.h") // // Tell cargo to invalidate the built crate whenever any of the // included header files changed. // .parse_callbacks(Box::new(PostgresFfiCallbacks)) // // These are the types and constants that we want to generate bindings for // .allowlist_type("BlockNumber") .allowlist_type("OffsetNumber") .allowlist_type("XLogRecPtr") .allowlist_type("XLogSegNo") .allowlist_type("TimeLineID") .allowlist_type("MultiXactId") .allowlist_type("MultiXactOffset") .allowlist_type("MultiXactStatus") .allowlist_type("ControlFileData") .allowlist_type("CheckPoint") .allowlist_type("FullTransactionId") .allowlist_type("XLogRecord") .allowlist_type("XLogPageHeaderData") .allowlist_type("XLogLongPageHeaderData") .allowlist_var("XLOG_PAGE_MAGIC") .allowlist_var("PG_MAJORVERSION_NUM") .allowlist_var("PG_CONTROL_FILE_SIZE") .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC") .allowlist_type("PageHeaderData") .allowlist_type("DBState") .allowlist_type("RelMapFile") .allowlist_type("RepOriginId") // Because structs are used for serialization, tell bindgen to emit // explicit padding fields. .explicit_padding(true) // .clang_arg(format!("-I{inc_server_path}")) // // Finish the builder and generate the bindings. // .generate() .context("Unable to generate bindings")?; // Write the bindings to the $OUT_DIR/bindings_$pg_version.rs file. let out_path: PathBuf = env::var("OUT_DIR") .context("Couldn't read OUT_DIR environment variable var")? 
.into(); let filename = format!("bindings_{pg_version}.rs"); bindings .write_to_file(out_path.join(filename)) .context("Couldn't write bindings")?; } Ok(()) } ================================================ FILE: libs/postgres_ffi/samples/pg_hba.conf ================================================ # PostgreSQL Client Authentication Configuration File # =================================================== # # Refer to the "Client Authentication" section in the PostgreSQL # documentation for a complete description of this file. A short # synopsis follows. # # This file controls: which hosts are allowed to connect, how clients # are authenticated, which PostgreSQL user names they can use, which # databases they can access. Records take one of these forms: # # local DATABASE USER METHOD [OPTIONS] # host DATABASE USER ADDRESS METHOD [OPTIONS] # hostssl DATABASE USER ADDRESS METHOD [OPTIONS] # hostnossl DATABASE USER ADDRESS METHOD [OPTIONS] # hostgssenc DATABASE USER ADDRESS METHOD [OPTIONS] # hostnogssenc DATABASE USER ADDRESS METHOD [OPTIONS] # # (The uppercase items must be replaced by actual values.) # # The first field is the connection type: # - "local" is a Unix-domain socket # - "host" is a TCP/IP socket (encrypted or not) # - "hostssl" is a TCP/IP socket that is SSL-encrypted # - "hostnossl" is a TCP/IP socket that is not SSL-encrypted # - "hostgssenc" is a TCP/IP socket that is GSSAPI-encrypted # - "hostnogssenc" is a TCP/IP socket that is not GSSAPI-encrypted # # DATABASE can be "all", "sameuser", "samerole", "replication", a # database name, or a comma-separated list thereof. The "all" # keyword does not match "replication". Access to replication # must be enabled in a separate record (see example below). # # USER can be "all", a user name, a group name prefixed with "+", or a # comma-separated list thereof. In both the DATABASE and USER fields # you can also write a file name prefixed with "@" to include names # from a separate file. 
# # ADDRESS specifies the set of hosts the record matches. It can be a # host name, or it is made up of an IP address and a CIDR mask that is # an integer (between 0 and 32 (IPv4) or 128 (IPv6) inclusive) that # specifies the number of significant bits in the mask. A host name # that starts with a dot (.) matches a suffix of the actual host name. # Alternatively, you can write an IP address and netmask in separate # columns to specify the set of hosts. Instead of a CIDR-address, you # can write "samehost" to match any of the server's own IP addresses, # or "samenet" to match any address in any subnet that the server is # directly connected to. # # METHOD can be "trust", "reject", "md5", "password", "scram-sha-256", # "gss", "sspi", "ident", "peer", "pam", "ldap", "radius" or "cert". # Note that "password" sends passwords in clear text; "md5" or # "scram-sha-256" are preferred since they send encrypted passwords. # # OPTIONS are a set of options for the authentication in the format # NAME=VALUE. The available options depend on the different # authentication methods -- refer to the "Client Authentication" # section in the documentation for a list of which options are # available for which authentication methods. # # Database and user names containing spaces, commas, quotes and other # special characters must be quoted. Quoting one of the keywords # "all", "sameuser", "samerole" or "replication" makes the name lose # its special character, and just match a database or username with # that name. # # This file is read on server startup and when the server receives a # SIGHUP signal. If you edit the file on a running system, you have to # SIGHUP the server for the changes to take effect, run "pg_ctl reload", # or execute "SELECT pg_reload_conf()". # # Put your actual configuration here # ---------------------------------- # # If you want to allow non-local connections, you need to add more # "host" records. 
In that case you will also need to make PostgreSQL # listen on a non-local interface via the listen_addresses # configuration parameter, or via the -i or -h command line switches. # CAUTION: Configuring the system for local "trust" authentication # allows any local user to connect as any PostgreSQL user, including # the database superuser. If you do not trust all your local users, # use another authentication method. # TYPE DATABASE USER ADDRESS METHOD # "local" is for Unix domain socket connections only local all all trust # IPv4 local connections: host all all 127.0.0.1/32 trust # IPv6 local connections: host all all ::1/128 trust # Allow replication connections from localhost, by a user with the # replication privilege. local replication all trust host replication all 127.0.0.1/32 trust host replication all ::1/128 trust ================================================ FILE: libs/postgres_ffi/src/controlfile_utils.rs ================================================ //! //! Utilities for reading and writing the PostgreSQL control file. //! //! The PostgreSQL control file is one of the first things that the PostgreSQL //! server reads when it starts up. It indicates whether the server was shut //! down cleanly, or if it crashed or was restored from online backup so that //! WAL recovery needs to be performed. It also contains a copy of the latest //! checkpoint record and its location in the WAL. //! //! The control file also contains fields for detecting whether the //! data directory is compatible with a postgres binary. That includes //! a version number, configuration options that can be set at //! compilation time like the block size, and the platform's alignment //! and endianness information. (The PostgreSQL on-disk file format is //! not portable across platforms.) //! //! The control file is stored in the PostgreSQL data directory, as //! `global/pg_control`. The data stored in it is designed to be smaller than //!
512 bytes, on the assumption that it can be updated atomically. The actual //! file is larger, 8192 bytes, but the rest of it is just filled with zeros. //! //! See src/include/catalog/pg_control.h in the PostgreSQL sources for more //! information. You can use PostgreSQL's pg_controldata utility to view its //! contents. //! use super::bindings::{ControlFileData, PG_CONTROL_FILE_SIZE}; use anyhow::{bail, Result}; use bytes::{Bytes, BytesMut}; /// Equivalent to sizeof(ControlFileData) in C const SIZEOF_CONTROLDATA: usize = size_of::(); impl ControlFileData { /// Compute the offset of the `crc` field within the `ControlFileData` struct. /// Equivalent to offsetof(ControlFileData, crc) in C. const fn pg_control_crc_offset() -> usize { std::mem::offset_of!(ControlFileData, crc) } /// /// Interpret a slice of bytes as a Postgres control file. /// pub fn decode(buf: &[u8]) -> Result { use utils::bin_ser::LeSer; // Check that the slice has the expected size. The control file is // padded with zeros up to a 512 byte sector size, so accept a // larger size too, so that the caller can just the whole file // contents without knowing the exact size of the struct. if buf.len() < SIZEOF_CONTROLDATA { bail!("control file is too short"); } // Compute the expected CRC of the content. let OFFSETOF_CRC = Self::pg_control_crc_offset(); let expectedcrc = crc32c::crc32c(&buf[0..OFFSETOF_CRC]); // Use serde to deserialize the input as a ControlFileData struct. let controlfile = ControlFileData::des_prefix(buf)?; // Check the CRC if expectedcrc != controlfile.crc { bail!( "invalid CRC in control file: expected {:08X}, was {:08X}", expectedcrc, controlfile.crc ); } Ok(controlfile) } /// /// Convert a struct representing a Postgres control file into raw bytes. /// /// The CRC is recomputed to match the contents of the fields. pub fn encode(&self) -> Bytes { use utils::bin_ser::LeSer; // Serialize into a new buffer. 
let b = self.ser().unwrap(); // Recompute the CRC let OFFSETOF_CRC = Self::pg_control_crc_offset(); let newcrc = crc32c::crc32c(&b[0..OFFSETOF_CRC]); let mut buf = BytesMut::with_capacity(PG_CONTROL_FILE_SIZE as usize); buf.extend_from_slice(&b[0..OFFSETOF_CRC]); buf.extend_from_slice(&newcrc.to_ne_bytes()); // Fill the rest of the control file with zeros. buf.resize(PG_CONTROL_FILE_SIZE as usize, 0); buf.into() } } ================================================ FILE: libs/postgres_ffi/src/lib.rs ================================================ #![allow(non_upper_case_globals)] #![allow(non_camel_case_types)] #![allow(non_snake_case)] // bindgen creates some unsafe code with no doc comments. #![allow(clippy::missing_safety_doc)] // noted at 1.63 that in many cases there's u32 -> u32 transmutes in bindgen code. #![allow(clippy::useless_transmute)] // modules included with the postgres_ffi macro depend on the types of the specific version's // types, and trigger a too eager lint. #![allow(clippy::duplicate_mod)] #![deny(clippy::undocumented_unsafe_blocks)] use bytes::Bytes; use utils::bin_ser::SerializeError; use utils::lsn::Lsn; pub use postgres_versioninfo::PgMajorVersion; macro_rules! 
postgres_ffi { ($version:ident) => { #[path = "."] pub mod $version { pub mod bindings { // bindgen generates bindings for a lot of stuff we don't need #![allow(dead_code)] #![allow(unsafe_op_in_unsafe_fn)] #![allow(clippy::undocumented_unsafe_blocks)] #![allow(clippy::ptr_offset_with_cast)] use serde::{Deserialize, Serialize}; include!(concat!( env!("OUT_DIR"), "/bindings_", stringify!($version), ".rs" )); include!(concat!("pg_constants_", stringify!($version), ".rs")); } pub mod controlfile_utils; pub mod nonrelfile_utils; pub mod wal_craft_test_export; pub mod wal_generator; pub mod waldecoder_handler; pub mod xlog_utils; pub const PG_MAJORVERSION: &str = stringify!($version); // Re-export some symbols from bindings pub use bindings::{CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, XLogRecord}; pub const ZERO_CHECKPOINT: bytes::Bytes = bytes::Bytes::from_static(&[0u8; xlog_utils::SIZEOF_CHECKPOINT]); } }; } #[macro_export] macro_rules! for_all_postgres_versions { ($macro:tt) => { $macro!(v14); $macro!(v15); $macro!(v16); $macro!(v17); }; } for_all_postgres_versions! { postgres_ffi } /// dispatch_pgversion /// /// Run a code block in a context where the postgres_ffi bindings for a /// specific (supported) PostgreSQL version are `use`-ed in scope under the pgv /// identifier. /// If the provided pg_version is not supported, we panic!(), unless the /// optional third argument was provided (in which case that code will provide /// the default handling instead). /// /// Use like /// /// dispatch_pgversion!(my_pgversion, { pgv::constants::XLOG_DBASE_CREATE }) /// dispatch_pgversion!(my_pgversion, pgv::constants::XLOG_DBASE_CREATE) /// /// Other uses are for macro-internal purposes only and strictly unsupported. /// #[macro_export] macro_rules! 
dispatch_pgversion { ($version:expr, $code:expr) => { dispatch_pgversion!($version, $code, panic!("Unknown PostgreSQL version {}", $version)) }; ($version:expr, $code:expr, $invalid_pgver_handling:expr) => { dispatch_pgversion!( $version => $code, default = $invalid_pgver_handling, pgversions = [ $crate::PgMajorVersion::PG14 => v14, $crate::PgMajorVersion::PG15 => v15, $crate::PgMajorVersion::PG16 => v16, $crate::PgMajorVersion::PG17 => v17, ] ) }; ($pgversion:expr => $code:expr, default = $default:expr, pgversions = [$($sv:pat => $vsv:ident),+ $(,)?]) => { match ($pgversion.clone().into()) { $($sv => { use $crate::$vsv as pgv; $code },)+ #[allow(unreachable_patterns)] _ => { $default } } }; } #[macro_export] macro_rules! enum_pgversion_dispatch { ($name:expr, $typ:ident, $bind:ident, $code:block) => { enum_pgversion_dispatch!( name = $name, bind = $bind, typ = $typ, code = $code, pgversions = [ V14 : v14, V15 : v15, V16 : v16, V17 : v17, ] ) }; (name = $name:expr, bind = $bind:ident, typ = $typ:ident, code = $code:block, pgversions = [$($variant:ident : $md:ident),+ $(,)?]) => { match $name { $( self::$typ::$variant($bind) => { use $crate::$md as pgv; $code } ),+, } }; } #[macro_export] macro_rules! 
enum_pgversion { {$name:ident, pgv :: $t:ident} => { enum_pgversion!{ name = $name, typ = $t, pgversions = [ V14 : v14, V15 : v15, V16 : v16, V17 : v17, ] } }; {$name:ident, pgv :: $p:ident :: $t:ident} => { enum_pgversion!{ name = $name, path = $p, typ = $t, pgversions = [ V14 : v14, V15 : v15, V16 : v16, V17 : v17, ] } }; {name = $name:ident, typ = $t:ident, pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => { pub enum $name { $($variant ( $crate::$md::$t )),+ } impl self::$name { pub fn pg_version(&self) -> PgMajorVersion { enum_pgversion_dispatch!(self, $name, _ign, { pgv::bindings::MY_PGVERSION }) } } $( impl Into for $crate::$md::$t { fn into(self) -> self::$name { self::$name::$variant (self) } } )+ }; {name = $name:ident, path = $p:ident, $(typ = $t:ident,)? pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => { pub enum $name { $($variant $(($crate::$md::$p::$t))?),+ } impl $name { pub fn pg_version(&self) -> PgMajorVersion { enum_pgversion_dispatch!(self, $name, _ign, { pgv::bindings::MY_PGVERSION }) } } $( impl Into<$name> for $crate::$md::$p::$t { fn into(self) -> $name { $name::$variant (self) } } )+ }; } pub mod pg_constants; pub mod relfile_utils; pub mod walrecord; // Export some widely used datatypes that are unlikely to change across Postgres versions pub use v14::bindings::{ BlockNumber, CheckPoint, ControlFileData, MultiXactId, OffsetNumber, Oid, PageHeaderData, RepOriginId, TimeLineID, TransactionId, XLogRecPtr, XLogRecord, XLogSegNo, uint32, uint64, }; // Likewise for these, although the assumption that these don't change is a little more iffy. pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; pub use v14::xlog_utils::{ XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, }; // from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and // --with-segsize=SEGSIZE, but assume the defaults for now. 
pub const BLCKSZ: u16 = 8192; pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32); pub const XLOG_BLCKSZ: usize = 8192; pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024; pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; // Export some version independent functions that are used outside of this mod pub use v14::bindings::DBState_DB_SHUTDOWNED; pub use v14::xlog_utils::{ XLogFileName, encode_logical_message, get_current_timestamp, to_pg_timestamp, try_from_pg_timestamp, }; pub fn bkpimage_is_compressed(bimg_info: u8, version: PgMajorVersion) -> bool { dispatch_pgversion!(version, pgv::bindings::bkpimg_is_compressed(bimg_info)) } pub fn generate_wal_segment( segno: u64, system_id: u64, pg_version: PgMajorVersion, lsn: Lsn, ) -> Result { assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE)); dispatch_pgversion!( pg_version, pgv::xlog_utils::generate_wal_segment(segno, system_id, lsn) ) } pub fn generate_pg_control( pg_control_bytes: &[u8], checkpoint_bytes: &[u8], lsn: Lsn, pg_version: PgMajorVersion, ) -> anyhow::Result<(Bytes, u64, bool)> { dispatch_pgversion!( pg_version, pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), anyhow::bail!("Unknown version {}", pg_version) ) } // PG timeline is always 1, changing it doesn't have any useful meaning in Neon. // // NOTE: this is not to be confused with Neon timelines; different concept! // // It's a shaky assumption, that it's always 1. We might import a // PostgreSQL data directory that has gone through timeline bumps, // for example. FIXME later. pub const PG_TLI: u32 = 1; // See TransactionIdIsNormal in transam.h pub const fn transaction_id_is_normal(id: TransactionId) -> bool { id > pg_constants::FIRST_NORMAL_TRANSACTION_ID } // See TransactionIdPrecedes in transam.c pub const fn transaction_id_precedes(id1: TransactionId, id2: TransactionId) -> bool { /* * If either ID is a permanent XID then we can just do unsigned * comparison. 
If both are normal, do a modulo-2^32 comparison. */ if !(transaction_id_is_normal(id1)) || !transaction_id_is_normal(id2) { return id1 < id2; } let diff = id1.wrapping_sub(id2) as i32; diff < 0 } // Check if page is not yet initialized (port of Postgres PageIsInit() macro) pub fn page_is_new(pg: &[u8]) -> bool { pg[14] == 0 && pg[15] == 0 // pg_upper == 0 } // ExtractLSN from page header pub fn page_get_lsn(pg: &[u8]) -> Lsn { Lsn( ((u32::from_le_bytes(pg[0..4].try_into().unwrap()) as u64) << 32) | u32::from_le_bytes(pg[4..8].try_into().unwrap()) as u64, ) } pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) { pg[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes()); pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes()); } // This is port of function with the same name from freespace.c. // The only difference is that it does not have "level" parameter because XLogRecordPageWithFreeSpace // always call it with level=FSM_BOTTOM_LEVEL pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber { let mut leafno = addr; const FSM_TREE_DEPTH: u32 = if pg_constants::SLOTS_PER_FSM_PAGE >= 1626 { 3 } else { 4 }; /* Count upper level nodes required to address the leaf page */ let mut pages: BlockNumber = 0; for _l in 0..FSM_TREE_DEPTH { pages += leafno + 1; leafno /= pg_constants::SLOTS_PER_FSM_PAGE; } /* Turn the page count into 0-based block number */ pages - 1 } pub mod waldecoder { use std::num::NonZeroU32; use crate::PgMajorVersion; use bytes::{Buf, Bytes, BytesMut}; use thiserror::Error; use utils::lsn::Lsn; pub enum State { WaitingForRecord, ReassemblingRecord { recordbuf: BytesMut, contlen: NonZeroU32, }, SkippingEverything { skip_until_lsn: Lsn, }, } pub struct WalStreamDecoder { pub lsn: Lsn, pub pg_version: PgMajorVersion, pub inputbuf: BytesMut, pub state: State, } #[derive(Error, Debug, Clone)] #[error("{msg} at {lsn}")] pub struct WalDecodeError { pub msg: String, pub lsn: Lsn, } impl WalStreamDecoder { pub fn new(lsn: Lsn, pg_version: PgMajorVersion) 
-> WalStreamDecoder { WalStreamDecoder { lsn, pg_version, inputbuf: BytesMut::new(), state: State::WaitingForRecord, } } // The latest LSN position fed to the decoder. pub fn available(&self) -> Lsn { self.lsn + self.inputbuf.remaining() as u64 } /// Returns the LSN up to which the WAL decoder has processed. /// /// If [`Self::poll_decode`] returned a record, then this will return /// the end LSN of said record. pub fn lsn(&self) -> Lsn { self.lsn } pub fn feed_bytes(&mut self, buf: &[u8]) { self.inputbuf.extend_from_slice(buf); } pub fn poll_decode(&mut self) -> Result, WalDecodeError> { dispatch_pgversion!( self.pg_version, { use pgv::waldecoder_handler::WalStreamDecoderHandler; self.poll_decode_internal() }, Err(WalDecodeError { msg: format!("Unknown version {}", self.pg_version), lsn: self.lsn, }) ) } } } ================================================ FILE: libs/postgres_ffi/src/nonrelfile_utils.rs ================================================ //! //! Common utilities for dealing with PostgreSQL non-relation files. //! 
use crate::pg_constants; use crate::transaction_id_precedes; use bytes::BytesMut; use super::bindings::MultiXactId; pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) { tracing::trace!( "handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort, 3-sub_commit)", status ); let byteno: usize = ((xid % pg_constants::CLOG_XACTS_PER_PAGE) / pg_constants::CLOG_XACTS_PER_BYTE) as usize; let bshift: u8 = ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8; page[byteno] = (page[byteno] & !(pg_constants::CLOG_XACT_BITMASK << bshift)) | (status << bshift); } pub fn transaction_id_get_status(xid: u32, page: &[u8]) -> u8 { let byteno: usize = ((xid % pg_constants::CLOG_XACTS_PER_PAGE) / pg_constants::CLOG_XACTS_PER_BYTE) as usize; let bshift: u8 = ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8; (page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK } // See CLOGPagePrecedes in clog.c pub const fn clogpage_precedes(page1: u32, page2: u32) -> bool { let mut xid1 = page1 * pg_constants::CLOG_XACTS_PER_PAGE; xid1 += pg_constants::FIRST_NORMAL_TRANSACTION_ID + 1; let mut xid2 = page2 * pg_constants::CLOG_XACTS_PER_PAGE; xid2 += pg_constants::FIRST_NORMAL_TRANSACTION_ID + 1; transaction_id_precedes(xid1, xid2) && transaction_id_precedes(xid1, xid2 + pg_constants::CLOG_XACTS_PER_PAGE - 1) } // See SlruMayDeleteSegment() in slru.c pub fn slru_may_delete_clogsegment(segpage: u32, cutoff_page: u32) -> bool { let seg_last_page = segpage + pg_constants::SLRU_PAGES_PER_SEGMENT - 1; assert_eq!(segpage % pg_constants::SLRU_PAGES_PER_SEGMENT, 0); clogpage_precedes(segpage, cutoff_page) && clogpage_precedes(seg_last_page, cutoff_page) } // Multixact utils pub fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize { ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) % pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE as u32 * pg_constants::MULTIXACT_MEMBERGROUP_SIZE as u32) 
as usize } pub fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 { (xid as u16) % pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP * pg_constants::MXACT_MEMBER_BITS_PER_XACT } /* Location (byte offset within page) of TransactionId of given member */ pub fn mx_offset_to_member_offset(xid: MultiXactId) -> usize { mx_offset_to_flags_offset(xid) + (pg_constants::MULTIXACT_FLAGBYTES_PER_GROUP + (xid as u16 % pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP) * 4) as usize } fn mx_offset_to_member_page(xid: u32) -> u32 { xid / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32 } pub fn mx_offset_to_member_segment(xid: u32) -> i32 { (mx_offset_to_member_page(xid) / pg_constants::SLRU_PAGES_PER_SEGMENT) as i32 } #[cfg(test)] mod tests { use super::*; #[test] fn test_multixid_calc() { // Check that the mx_offset_* functions produce the same values as the // corresponding PostgreSQL C macros (MXOffsetTo*). These test values // were generated by calling the PostgreSQL macros with a little C // program. 
assert_eq!(mx_offset_to_member_segment(0), 0); assert_eq!(mx_offset_to_member_page(0), 0); assert_eq!(mx_offset_to_flags_offset(0), 0); assert_eq!(mx_offset_to_flags_bitshift(0), 0); assert_eq!(mx_offset_to_member_offset(0), 4); assert_eq!(mx_offset_to_member_segment(1), 0); assert_eq!(mx_offset_to_member_page(1), 0); assert_eq!(mx_offset_to_flags_offset(1), 0); assert_eq!(mx_offset_to_flags_bitshift(1), 8); assert_eq!(mx_offset_to_member_offset(1), 8); assert_eq!(mx_offset_to_member_segment(123456789), 2358); assert_eq!(mx_offset_to_member_page(123456789), 75462); assert_eq!(mx_offset_to_flags_offset(123456789), 4780); assert_eq!(mx_offset_to_flags_bitshift(123456789), 8); assert_eq!(mx_offset_to_member_offset(123456789), 4788); assert_eq!(mx_offset_to_member_segment(u32::MAX - 1), 82040); assert_eq!(mx_offset_to_member_page(u32::MAX - 1), 2625285); assert_eq!(mx_offset_to_flags_offset(u32::MAX - 1), 5160); assert_eq!(mx_offset_to_flags_bitshift(u32::MAX - 1), 16); assert_eq!(mx_offset_to_member_offset(u32::MAX - 1), 5172); assert_eq!(mx_offset_to_member_segment(u32::MAX), 82040); assert_eq!(mx_offset_to_member_page(u32::MAX), 2625285); assert_eq!(mx_offset_to_flags_offset(u32::MAX), 5160); assert_eq!(mx_offset_to_flags_bitshift(u32::MAX), 24); assert_eq!(mx_offset_to_member_offset(u32::MAX), 5176); } } ================================================ FILE: libs/postgres_ffi/src/pg_constants.rs ================================================ //! //! Misc constants, copied from PostgreSQL headers. //! //! Only place version-independent constants here. //! //! TODO: These probably should be auto-generated using bindgen, //! rather than copied by hand. Although on the other hand, it's nice //! to have them all here in one place, and have the ability to add //! comments on them. //! use crate::{BLCKSZ, PageHeaderData}; // Note: There are a few more widely-used constants in the postgres_ffi_types::constants crate. 
// From storage_xlog.h pub const XLOG_SMGR_CREATE: u8 = 0x10; pub const XLOG_SMGR_TRUNCATE: u8 = 0x20; pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001; pub const SMGR_TRUNCATE_VM: u32 = 0x0002; pub const SMGR_TRUNCATE_FSM: u32 = 0x0004; // // From bufpage.h // // Assumes 8 byte alignment const SIZEOF_PAGE_HEADER_DATA: usize = size_of::(); pub const MAXALIGN_SIZE_OF_PAGE_HEADER_DATA: usize = (SIZEOF_PAGE_HEADER_DATA + 7) & !7; // // constants from clog.h // pub const CLOG_XACTS_PER_BYTE: u32 = 4; pub const CLOG_XACTS_PER_PAGE: u32 = BLCKSZ as u32 * CLOG_XACTS_PER_BYTE; pub const CLOG_BITS_PER_XACT: u8 = 2; pub const CLOG_XACT_BITMASK: u8 = (1 << CLOG_BITS_PER_XACT) - 1; pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01; pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02; pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03; pub const CLOG_ZEROPAGE: u8 = 0x00; pub const CLOG_TRUNCATE: u8 = 0x10; // // Constants from visibilitymap.h, visibilitymapdefs.h and visibilitymap.c // pub const SIZE_OF_PAGE_HEADER: u16 = 24; pub const BITS_PER_BYTE: u16 = 8; pub const HEAPBLOCKS_PER_PAGE: u32 = (BLCKSZ - SIZE_OF_PAGE_HEADER) as u32 * 8 / BITS_PER_HEAPBLOCK as u32; pub const HEAPBLOCKS_PER_BYTE: u16 = BITS_PER_BYTE / BITS_PER_HEAPBLOCK; pub const fn HEAPBLK_TO_MAPBLOCK(x: u32) -> u32 { x / HEAPBLOCKS_PER_PAGE } pub const fn HEAPBLK_TO_MAPBYTE(x: u32) -> u32 { (x % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE as u32 } pub const fn HEAPBLK_TO_OFFSET(x: u32) -> u32 { (x % HEAPBLOCKS_PER_BYTE as u32) * BITS_PER_HEAPBLOCK as u32 } pub const BITS_PER_HEAPBLOCK: u16 = 2; pub const VISIBILITYMAP_ALL_VISIBLE: u8 = 0x01; pub const VISIBILITYMAP_ALL_FROZEN: u8 = 0x02; pub const VISIBILITYMAP_VALID_BITS: u8 = 0x03; // From xact.h pub const XLOG_XACT_COMMIT: u8 = 0x00; pub const XLOG_XACT_PREPARE: u8 = 0x10; pub const XLOG_XACT_ABORT: u8 = 0x20; pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30; pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40; // From standbydefs.h pub const XLOG_RUNNING_XACTS: u8 = 
0x10; // From srlu.h pub const SLRU_PAGES_PER_SEGMENT: u32 = 32; pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usize; /* mask for filtering opcodes out of xl_info */ pub const XLOG_XACT_OPMASK: u8 = 0x70; pub const XLOG_HEAP_OPMASK: u8 = 0x70; /* does this record have a 'xinfo' field or not */ pub const XLOG_XACT_HAS_INFO: u8 = 0x80; /* * The following flags, stored in xinfo, determine which information is * contained in commit/abort records. */ pub const XACT_XINFO_HAS_DBINFO: u32 = 1u32 << 0; pub const XACT_XINFO_HAS_SUBXACTS: u32 = 1u32 << 1; pub const XACT_XINFO_HAS_RELFILENODES: u32 = 1u32 << 2; pub const XACT_XINFO_HAS_INVALS: u32 = 1u32 << 3; pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4; pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5; // pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6; // pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7; // From pg_control.h and rmgrlist.h pub const XLOG_NEXTOID: u8 = 0x30; pub const XLOG_SWITCH: u8 = 0x40; pub const XLOG_FPI_FOR_HINT: u8 = 0xA0; pub const XLOG_FPI: u8 = 0xB0; // From multixact.h pub const FIRST_MULTIXACT_ID: u32 = 1; pub const MAX_MULTIXACT_ID: u32 = 0xFFFFFFFF; pub const MAX_MULTIXACT_OFFSET: u32 = 0xFFFFFFFF; pub const XLOG_MULTIXACT_ZERO_OFF_PAGE: u8 = 0x00; pub const XLOG_MULTIXACT_ZERO_MEM_PAGE: u8 = 0x10; pub const XLOG_MULTIXACT_CREATE_ID: u8 = 0x20; pub const XLOG_MULTIXACT_TRUNCATE_ID: u8 = 0x30; pub const MULTIXACT_OFFSETS_PER_PAGE: u16 = BLCKSZ / 4; pub const MXACT_MEMBER_BITS_PER_XACT: u16 = 8; pub const MXACT_MEMBER_FLAGS_PER_BYTE: u16 = 1; pub const MULTIXACT_FLAGBYTES_PER_GROUP: u16 = 4; pub const MULTIXACT_MEMBERS_PER_MEMBERGROUP: u16 = MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE; /* size in bytes of a complete group */ pub const MULTIXACT_MEMBERGROUP_SIZE: u16 = 4 * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP; pub const MULTIXACT_MEMBERGROUPS_PER_PAGE: u16 = BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE; pub const 
MULTIXACT_MEMBERS_PER_PAGE: u16 = MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP; // From heapam_xlog.h pub const XLOG_HEAP_INSERT: u8 = 0x00; pub const XLOG_HEAP_DELETE: u8 = 0x10; pub const XLOG_HEAP_UPDATE: u8 = 0x20; pub const XLOG_HEAP_HOT_UPDATE: u8 = 0x40; pub const XLOG_HEAP_LOCK: u8 = 0x60; pub const XLOG_HEAP_INIT_PAGE: u8 = 0x80; pub const XLOG_HEAP2_VISIBLE: u8 = 0x40; pub const XLOG_HEAP2_MULTI_INSERT: u8 = 0x50; pub const XLOG_HEAP2_LOCK_UPDATED: u8 = 0x60; pub const XLH_LOCK_ALL_FROZEN_CLEARED: u8 = 0x01; pub const XLH_INSERT_ALL_FROZEN_SET: u8 = (1 << 5) as u8; pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8; pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; // From heapam_xlog.h pub const XLOG_HEAP2_REWRITE: u8 = 0x00; // From replication/message.h pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00; // From rmgrlist.h pub const RM_XLOG_ID: u8 = 0; pub const RM_XACT_ID: u8 = 1; pub const RM_SMGR_ID: u8 = 2; pub const RM_CLOG_ID: u8 = 3; pub const RM_DBASE_ID: u8 = 4; pub const RM_TBLSPC_ID: u8 = 5; pub const RM_MULTIXACT_ID: u8 = 6; pub const RM_RELMAP_ID: u8 = 7; pub const RM_STANDBY_ID: u8 = 8; pub const RM_HEAP2_ID: u8 = 9; pub const RM_HEAP_ID: u8 = 10; pub const RM_REPLORIGIN_ID: u8 = 19; pub const RM_LOGICALMSG_ID: u8 = 21; // from neon_rmgr.h pub const RM_NEON_ID: u8 = 134; pub const XLOG_NEON_HEAP_INIT_PAGE: u8 = 0x80; pub const XLOG_NEON_HEAP_INSERT: u8 = 0x00; pub const XLOG_NEON_HEAP_DELETE: u8 = 0x10; pub const XLOG_NEON_HEAP_UPDATE: u8 = 0x20; pub const XLOG_NEON_HEAP_HOT_UPDATE: u8 = 0x30; pub const XLOG_NEON_HEAP_LOCK: u8 = 0x40; pub const XLOG_NEON_HEAP_MULTI_INSERT: u8 = 0x50; pub const XLOG_NEON_HEAP_VISIBLE: u8 = 0x40; // from xlogreader.h pub const XLR_INFO_MASK: u8 = 0x0F; pub const XLR_RMGR_INFO_MASK: u8 = 0xF0; pub const XLOG_TBLSPC_CREATE: u8 = 0x00; pub 
const XLOG_TBLSPC_DROP: u8 = 0x10; // // from xlogrecord.h // pub const XLR_MAX_BLOCK_ID: u8 = 32; pub const XLR_BLOCK_ID_DATA_SHORT: u8 = 255; pub const XLR_BLOCK_ID_DATA_LONG: u8 = 254; pub const XLR_BLOCK_ID_ORIGIN: u8 = 253; pub const XLR_BLOCK_ID_TOPLEVEL_XID: u8 = 252; pub const BKPBLOCK_FORK_MASK: u8 = 0x0F; pub const _BKPBLOCK_FLAG_MASK: u8 = 0xF0; pub const BKPBLOCK_HAS_IMAGE: u8 = 0x10; /* block data is an XLogRecordBlockImage */ pub const BKPBLOCK_HAS_DATA: u8 = 0x20; pub const BKPBLOCK_WILL_INIT: u8 = 0x40; /* redo will re-init the page */ pub const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous */ /* Information stored in bimg_info */ pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */ /* From transam.h */ pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3; pub const INVALID_TRANSACTION_ID: u32 = 0; /* pg_control.h */ pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00; pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; pub const XLOG_PARAMETER_CHANGE: u8 = 0x60; pub const XLOG_END_OF_RECOVERY: u8 = 0x90; /* From xlog.h */ pub const XLOG_REPLORIGIN_SET: u8 = 0x00; pub const XLOG_REPLORIGIN_DROP: u8 = 0x10; /* xlog_internal.h */ pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; pub const XLP_LONG_HEADER: u16 = 0x0002; /* From replication/slot.h */ pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4 /* offset of `slotdata` in ReplicationSlotOnDisk */ + 64 /* NameData */ + 4*4; /* From fsm_internals.h */ const FSM_NODES_PER_PAGE: usize = BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA - 4; const FSM_NON_LEAF_NODES_PER_PAGE: usize = BLCKSZ as usize / 2 - 1; const FSM_LEAF_NODES_PER_PAGE: usize = FSM_NODES_PER_PAGE - FSM_NON_LEAF_NODES_PER_PAGE; pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32; /* From visibilitymap.c */ pub const VM_MAPSIZE: usize = BLCKSZ as usize - MAXALIGN_SIZE_OF_PAGE_HEADER_DATA; pub const VM_BITS_PER_HEAPBLOCK: usize = 2; pub const VM_HEAPBLOCKS_PER_BYTE: usize = 8 / VM_BITS_PER_HEAPBLOCK; 
pub const VM_HEAPBLOCKS_PER_PAGE: usize = VM_MAPSIZE * VM_HEAPBLOCKS_PER_BYTE;

/* From origin.c */
pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE;

// Don't include postgresql.conf as it is inconvenient on node start:
// we need postgresql.conf before basebackup to synchronize safekeepers
// so no point in overwriting it during backup restore. Rest of the files
// here are not needed before backup so it is okay to edit them after.
pub const PGDATA_SPECIAL_FILES: [&str; 3] =
    ["pg_hba.conf", "pg_ident.conf", "postgresql.auto.conf"];

// Contents of the sample pg_hba.conf, embedded at compile time.
pub static PG_HBA: &str = include_str!("../samples/pg_hba.conf");

================================================
FILE: libs/postgres_ffi/src/pg_constants_v14.rs
================================================
use crate::PgMajorVersion;

pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG14;

pub const XLOG_DBASE_CREATE: u8 = 0x00;
pub const XLOG_DBASE_DROP: u8 = 0x10;

pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */

// List of subdirectories inside pgdata.
// Copied from src/bin/initdb/initdb.c
pub const PGDATA_SUBDIRS: [&str; 22] = [
    "global",
    "pg_wal/archive_status",
    "pg_commit_ts",
    "pg_dynshmem",
    "pg_notify",
    "pg_serial",
    "pg_snapshots",
    "pg_subtrans",
    "pg_twophase",
    "pg_multixact",
    "pg_multixact/members",
    "pg_multixact/offsets",
    "base",
    "base/1",
    "pg_replslot",
    "pg_tblspc",
    "pg_stat",
    "pg_stat_tmp",
    "pg_xact",
    "pg_logical",
    "pg_logical/snapshots",
    "pg_logical/mappings",
];

/// Returns true if the `BKPIMAGE_IS_COMPRESSED` bit is set in `bimg_info`.
pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
    (bimg_info & BKPIMAGE_IS_COMPRESSED) != 0
}

================================================
FILE: libs/postgres_ffi/src/pg_constants_v15.rs
================================================
use crate::PgMajorVersion;

pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG15;

pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;

pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
pub const XLOG_DBASE_DROP: u8 = 0x20;

pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */

pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */

// Same list of pgdata subdirectories as v14.
pub use super::super::v14::bindings::PGDATA_SUBDIRS;

/// Returns true if any of the `BKPIMAGE_COMPRESS_*` bits is set in `bimg_info`.
pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
    const ANY_COMPRESS_FLAG: u8 =
        BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;

    (bimg_info & ANY_COMPRESS_FLAG) != 0
}

================================================
FILE: libs/postgres_ffi/src/pg_constants_v16.rs
================================================
use crate::PgMajorVersion;

pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG16;

pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;

pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
pub const XLOG_DBASE_DROP: u8 = 0x20;

pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */

pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */

// Same list of pgdata subdirectories as v14.
pub use super::super::v14::bindings::PGDATA_SUBDIRS;

/// Returns true if any of the `BKPIMAGE_COMPRESS_*` bits is set in `bimg_info`.
pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
    const ANY_COMPRESS_FLAG: u8 =
        BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;

    (bimg_info & ANY_COMPRESS_FLAG) != 0
}

================================================
FILE: libs/postgres_ffi/src/pg_constants_v17.rs
================================================
use crate::PgMajorVersion;

pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG17;

pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;

pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
pub const XLOG_DBASE_DROP: u8 = 0x20;

pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */

pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */

// List of subdirectories inside pgdata.
// Copied from src/bin/initdb/initdb.c
// (the v14 list plus "pg_wal/summaries")
pub const PGDATA_SUBDIRS: [&str; 23] = [
    "global",
    "pg_wal/archive_status",
    "pg_wal/summaries",
    "pg_commit_ts",
    "pg_dynshmem",
    "pg_notify",
    "pg_serial",
    "pg_snapshots",
    "pg_subtrans",
    "pg_twophase",
    "pg_multixact",
    "pg_multixact/members",
    "pg_multixact/offsets",
    "base",
    "base/1",
    "pg_replslot",
    "pg_tblspc",
    "pg_stat",
    "pg_stat_tmp",
    "pg_xact",
    "pg_logical",
    "pg_logical/snapshots",
    "pg_logical/mappings",
];

/// Returns true if any of the `BKPIMAGE_COMPRESS_*` bits is set in `bimg_info`.
pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
    const ANY_COMPRESS_FLAG: u8 =
        BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;

    (bimg_info & ANY_COMPRESS_FLAG) != 0
}

pub const XLOG_HEAP2_PRUNE_ON_ACCESS: u8 = 0x10;
pub const XLOG_HEAP2_PRUNE_VACUUM_SCAN: u8 = 0x20;
pub const XLOG_HEAP2_PRUNE_VACUUM_CLEANUP: u8 = 0x30;

pub const XLOG_OVERWRITE_CONTRECORD: u8 = 0xD0;
pub const XLOG_CHECKPOINT_REDO: u8 = 0xE0;

================================================
FILE: libs/postgres_ffi/src/relfile_utils.rs
================================================
//!
//! Common utilities for dealing with PostgreSQL relation files.
//!
use once_cell::sync::OnceCell;
use regex::Regex;

use postgres_ffi_types::forknum::*;

/// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
///
/// Formats:
///
/// ```text
/// <relnode>
/// <relnode>_<forkname>
/// <relnode>.<segno>
/// <relnode>_<forkname>.<segno>
/// ```
///
/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
/// pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> { static RELFILE_RE: OnceCell = OnceCell::new(); RELFILE_RE.get_or_init(|| { Regex::new(r"^(?P\d+)(_(?P[a-z]+))?(\.(?P\d+))?$").unwrap() }); let caps = RELFILE_RE .get() .unwrap() .captures(fname) .ok_or(FilePathError::InvalidFileName)?; let relnode_str = caps.name("relnode").unwrap().as_str(); let relnode = relnode_str .parse::() .map_err(|_e| FilePathError::InvalidFileName)?; let forkname = caps.name("forkname").map(|f| f.as_str()); let forknum = forkname_to_number(forkname)?; let segno_match = caps.name("segno"); let segno = if segno_match.is_none() { 0 } else { segno_match .unwrap() .as_str() .parse::() .map_err(|_e| FilePathError::InvalidFileName)? }; Ok((relnode, forknum, segno)) } #[cfg(test)] mod tests { use super::*; #[test] fn test_parse_valid_relfilenames() { assert_eq!(parse_relfilename("1234"), Ok((1234, 0, 0))); assert_eq!(parse_relfilename("1234_fsm"), Ok((1234, 1, 0))); assert_eq!(parse_relfilename("1234_vm"), Ok((1234, 2, 0))); assert_eq!(parse_relfilename("1234_init"), Ok((1234, 3, 0))); assert_eq!(parse_relfilename("1234.12"), Ok((1234, 0, 12))); assert_eq!(parse_relfilename("1234_fsm.12"), Ok((1234, 1, 12))); assert_eq!(parse_relfilename("1234_vm.12"), Ok((1234, 2, 12))); assert_eq!(parse_relfilename("1234_init.12"), Ok((1234, 3, 12))); // relfilenode is unsigned, so it can go up to 2^32-1 assert_eq!(parse_relfilename("3147483648"), Ok((3147483648, 0, 0))); } #[test] fn test_parse_invalid_relfilenames() { assert_eq!( parse_relfilename("foo"), Err(FilePathError::InvalidFileName) ); assert_eq!( parse_relfilename("1.2.3"), Err(FilePathError::InvalidFileName) ); assert_eq!( parse_relfilename("1234_invalid"), Err(FilePathError::InvalidForkName) ); assert_eq!( parse_relfilename("1234_"), Err(FilePathError::InvalidFileName) ); // too large for u32 assert_eq!( parse_relfilename("12345678901"), Err(FilePathError::InvalidFileName) ); assert_eq!( parse_relfilename("-1234"), 
Err(FilePathError::InvalidFileName) ); } #[test] fn test_parse_weird_relfilenames() { // we accept 0 for the relfilenode, but PostgreSQL should never do that. assert_eq!(parse_relfilename("0"), Ok((0, 0, 0))); // PostgreSQL has a limit of 2^32-2 blocks in a table. With 8k block size and // 1 GB segments, the max segment number is 32767. But we accept larger values // currently. assert_eq!(parse_relfilename("1.123456"), Ok((1, 0, 123456))); } } ================================================ FILE: libs/postgres_ffi/src/wal_craft_test_export.rs ================================================ //! This module is for WAL craft to test with postgres_ffi. Should not import any thing in normal usage. pub use super::PG_MAJORVERSION; pub use super::xlog_utils::*; pub use super::bindings::*; pub use crate::WAL_SEGMENT_SIZE; ================================================ FILE: libs/postgres_ffi/src/wal_generator.rs ================================================ use std::ffi::{CStr, CString}; use bytes::{Bytes, BytesMut}; use crc32c::crc32c_append; use utils::lsn::Lsn; use super::bindings::{RmgrId, XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC}; use super::xlog_utils::{ XlLogicalMessage, XLOG_RECORD_CRC_OFFS, XLOG_SIZE_OF_XLOG_RECORD, XLP_BKP_REMOVABLE, XLP_FIRST_IS_CONTRECORD, }; use super::XLogRecord; use crate::pg_constants::{ RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, XLP_LONG_HEADER, XLR_BLOCK_ID_DATA_LONG, XLR_BLOCK_ID_DATA_SHORT, }; use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; /// A WAL record payload. Will be prefixed by an XLogRecord header when encoded. pub struct Record { pub rmid: RmgrId, pub info: u8, pub data: Bytes, } impl Record { /// Encodes the WAL record including an XLogRecord header. prev_lsn is the start position of /// the previous record in the WAL -- this is ignored by the Safekeeper, but not Postgres. pub fn encode(&self, prev_lsn: Lsn) -> Bytes { // Prefix data with block ID and length. 
let data_header = Bytes::from(match self.data.len() { 0 => vec![], 1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, self.data.len() as u8], 256.. => { let len_bytes = (self.data.len() as u32).to_le_bytes(); [&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat() } }); // Construct the WAL record header. let mut header = XLogRecord { xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + self.data.len()) as u32, xl_xid: 0, xl_prev: prev_lsn.into(), xl_info: self.info, xl_rmid: self.rmid, __bindgen_padding_0: [0; 2], xl_crc: 0, // see below }; // Compute the CRC checksum for the data, and the header up to the CRC field. let mut crc = 0; crc = crc32c_append(crc, &data_header); crc = crc32c_append(crc, &self.data); crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]); header.xl_crc = crc; // Encode the final header and record. let header = header.encode().unwrap(); [header, data_header, self.data.clone()].concat().into() } } /// Generates WAL record payloads. /// /// TODO: currently only provides LogicalMessageGenerator for trivial noop messages. Add a generator /// that creates a table and inserts rows. pub trait RecordGenerator: Iterator {} impl> RecordGenerator for I {} /// Generates binary WAL for use in tests and benchmarks. The provided record generator constructs /// the WAL records. It is used as an iterator which yields encoded bytes for a single WAL record, /// including internal page headers if it spans pages. Concatenating the bytes will yield a /// complete, well-formed WAL, which can be chunked at segment boundaries if desired. Not optimized /// for performance. /// /// The WAL format is version-dependant (see e.g. `XLOG_PAGE_MAGIC`), so make sure to import this /// for the appropriate Postgres version (e.g. `postgres_ffi::v17::wal_generator::WalGenerator`). /// /// A WAL is split into 16 MB segments. Each segment is split into 8 KB pages, with headers. /// Records are arbitrary length, 8-byte aligned, and may span pages. 
/// Generates binary WAL from the payloads produced by `record_generator`, tracking the
/// LSN at which the next record will be appended.
///
/// Records may span pages; `insert_pages` injects a page header at every page boundary
/// and `pad_record` pads each record to an 8-byte boundary. The layout is e.g.:
///
/// |        Segment 1         |        Segment 2         |        Segment 3         |
/// | Page 1 | Page 2 | Page 3 | Page 4 | Page 5 | Page 6 | Page 7 | Page 8 | Page 9 |
/// | R1 |   R2  |R3|  R4  | R5    |  R6                     |  R7  | R8  |
//
// NOTE(review): `R` is used below but no generic parameter list is visible on the struct
// or impl header -- presumably `R: RecordGenerator`; confirm against the upstream file.
#[derive(Default)]
pub struct WalGenerator {
    /// Generates record payloads for the WAL.
    pub record_generator: R,
    /// Current LSN to append the next record at.
    ///
    /// Callers can modify this (and prev_lsn) to restart generation at a different LSN, but should
    /// ensure that the LSN is on a valid record boundary (i.e. we can't start appending in the
    /// middle on an existing record or header, or beyond the end of the existing WAL).
    pub lsn: Lsn,
    /// The starting LSN of the previous record. Used in WAL record headers. The Safekeeper doesn't
    /// care about this, unlike Postgres, but we include it for completeness.
    pub prev_lsn: Lsn,
}

impl WalGenerator {
    // Hardcode the sys and timeline ID. We can make them configurable if we care about them.
    const SYS_ID: u64 = 0;
    const TIMELINE_ID: u32 = 1;

    /// Creates a new WAL generator with the given record generator.
    ///
    /// Both `lsn` and `prev_lsn` start out as `start_lsn`.
    pub fn new(record_generator: R, start_lsn: Lsn) -> WalGenerator {
        Self {
            record_generator,
            lsn: start_lsn,
            prev_lsn: start_lsn,
        }
    }

    /// Appends a record with an arbitrary payload at the current LSN, then increments the LSN.
    /// Returns the WAL bytes for the record, including page headers and padding, and the start LSN.
    fn append_record(&mut self, record: Record) -> (Lsn, Bytes) {
        // Encode, then add page headers, then pad: padding is computed from the length
        // that already includes any injected page headers.
        let record = record.encode(self.prev_lsn);
        let record = Self::insert_pages(record, self.lsn);
        let record = Self::pad_record(record, self.lsn);
        let lsn = self.lsn;
        // The record just appended becomes the "previous record" of the next one.
        self.prev_lsn = self.lsn;
        self.lsn += record.len() as u64;
        (lsn, record)
    }

    /// Inserts page headers on 8KB page boundaries. Takes the current LSN position where the record
    /// is to be appended.
    ///
    /// Returns the record bytes interleaved with the page headers; `lsn` is a local copy
    /// that is only used to track the position while splitting.
    fn insert_pages(record: Bytes, mut lsn: Lsn) -> Bytes {
        // Fast path: record fits in current page, and the page already has a header.
        if lsn.remaining_in_block() as usize >= record.len() && lsn.block_offset() > 0 {
            return record;
        }

        let mut pages = BytesMut::new();
        let mut remaining = record.clone(); // Bytes::clone() is cheap

        while !remaining.is_empty() {
            // At new page boundary, inject page header.
            if lsn.block_offset() == 0 {
                let mut page_header = XLogPageHeaderData {
                    xlp_magic: XLOG_PAGE_MAGIC as u16,
                    xlp_info: XLP_BKP_REMOVABLE,
                    xlp_tli: Self::TIMELINE_ID,
                    xlp_pageaddr: lsn.0,
                    xlp_rem_len: 0,
                    __bindgen_padding_0: [0; 4],
                };
                // If the record was split across page boundaries, mark as continuation.
                // `remaining` is shorter than `record` only once a chunk has been emitted.
                if remaining.len() < record.len() {
                    page_header.xlp_rem_len = remaining.len() as u32;
                    page_header.xlp_info |= XLP_FIRST_IS_CONTRECORD;
                }
                // At start of segment, use a long page header.
                let page_header = if lsn.segment_offset(WAL_SEGMENT_SIZE) == 0 {
                    page_header.xlp_info |= XLP_LONG_HEADER;
                    XLogLongPageHeaderData {
                        std: page_header,
                        xlp_sysid: Self::SYS_ID,
                        xlp_seg_size: WAL_SEGMENT_SIZE as u32,
                        xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
                    }
                    .encode()
                    .unwrap()
                } else {
                    page_header.encode().unwrap()
                };
                pages.extend_from_slice(&page_header);
                // The header occupies WAL space too, so it advances the position.
                lsn += page_header.len() as u64;
            }

            // Append the record up to the next page boundary, if any.
            let page_free = lsn.remaining_in_block() as usize;
            let chunk = remaining.split_to(std::cmp::min(page_free, remaining.len()));
            pages.extend_from_slice(&chunk);
            lsn += chunk.len() as u64;
        }
        pages.freeze()
    }

    /// Records must be 8-byte aligned. Take an encoded record (including any injected page
    /// boundaries), starting at the given LSN, and add any necessary padding at the end.
    fn pad_record(record: Bytes, mut lsn: Lsn) -> Bytes {
        // Move to the end of the record; the padding is whatever that position needs.
        lsn += record.len() as u64;
        let padding = lsn.calc_padding(8u64) as usize;
        if padding == 0 {
            return record;
        }
        // Padding bytes are zeros.
        [record, Bytes::from(vec![0; padding])].concat().into()
    }
}
impl Iterator for WalGenerator { type Item = (Lsn, Bytes); fn next(&mut self) -> Option { let record = self.record_generator.next()?; Some(self.append_record(record)) } } /// Generates logical message records (effectively noops) with a fixed message. pub struct LogicalMessageGenerator { prefix: CString, message: Vec, } impl LogicalMessageGenerator { const DB_ID: u32 = 0; // hardcoded for now const RM_ID: RmgrId = RM_LOGICALMSG_ID; const INFO: u8 = XLOG_LOGICAL_MESSAGE; /// Creates a new LogicalMessageGenerator. pub fn new(prefix: &CStr, message: &[u8]) -> Self { Self { prefix: prefix.to_owned(), message: message.to_owned(), } } /// Encodes a logical message. fn encode(prefix: &CStr, message: &[u8]) -> Bytes { let prefix = prefix.to_bytes_with_nul(); let header = XlLogicalMessage { db_id: Self::DB_ID, transactional: 0, prefix_size: prefix.len() as u64, message_size: message.len() as u64, }; [&header.encode(), prefix, message].concat().into() } /// Computes how large a value must be to get a record of the given size. Convenience method to /// construct records of pre-determined size. Panics if the record size is too small. pub fn make_value_size(record_size: usize, prefix: &CStr) -> usize { let xlog_header_size = XLOG_SIZE_OF_XLOG_RECORD; let lm_header_size = size_of::(); let prefix_size = prefix.to_bytes_with_nul().len(); let data_header_size = match record_size - xlog_header_size - 2 { 0..=255 => 2, 256..=258 => panic!("impossible record_size {record_size}"), 259.. => 5, }; record_size .checked_sub(xlog_header_size + lm_header_size + prefix_size + data_header_size) .expect("record_size too small") } } impl Iterator for LogicalMessageGenerator { type Item = Record; fn next(&mut self) -> Option { Some(Record { rmid: Self::RM_ID, info: Self::INFO, data: Self::encode(&self.prefix, &self.message), }) } } impl WalGenerator { /// Convenience method for appending a WAL record with an arbitrary logical message at the /// current WAL LSN position. 
Returns the start LSN and resulting WAL bytes. pub fn append_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> (Lsn, Bytes) { let record = Record { rmid: LogicalMessageGenerator::RM_ID, info: LogicalMessageGenerator::INFO, data: LogicalMessageGenerator::encode(prefix, message), }; self.append_record(record) } } ================================================ FILE: libs/postgres_ffi/src/waldecoder_handler.rs ================================================ //! //! Basic WAL stream decoding. //! //! This understands the WAL page and record format, enough to figure out where the WAL record //! boundaries are, and to reassemble WAL records that cross page boundaries. //! //! This functionality is needed by both the pageserver and the safekeepers. The pageserver needs //! to look deeper into the WAL records to also understand which blocks they modify, the code //! for that is in pageserver/src/walrecord.rs //! use super::super::waldecoder::{State, WalDecodeError, WalStreamDecoder}; use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC}; use super::xlog_utils::*; use crate::WAL_SEGMENT_SIZE; use bytes::{Buf, BufMut, Bytes, BytesMut}; use crc32c::*; use std::cmp::min; use std::num::NonZeroU32; use utils::lsn::Lsn; pub trait WalStreamDecoderHandler { fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError>; fn poll_decode_internal(&mut self) -> Result, WalDecodeError>; fn complete_record(&mut self, recordbuf: Bytes) -> Result<(Lsn, Bytes), WalDecodeError>; } // // This is a trick to support several postgres versions simultaneously. // // Page decoding code depends on postgres bindings, so it is compiled for each version. // Thus WalStreamDecoder implements several WalStreamDecoderHandler traits. // WalStreamDecoder poll_decode() method dispatches to the right handler based on the postgres version. // Other methods are internal and are not dispatched. 
// It is similar to having several impl blocks for the same struct,
// but the impls here are in different modules, so need to use a trait.
//
impl WalStreamDecoderHandler for WalStreamDecoder {
    /// Checks a page header against the decoder's current position and state.
    ///
    /// Verifies the magic number, that `xlp_pageaddr` equals the current LSN, and that the
    /// continuation flag / `xlp_rem_len` agree with whether a record is being reassembled.
    /// Any failure is returned as a `WalDecodeError` carrying the current LSN.
    ///
    /// Panics if called in the `SkippingEverything` state.
    fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError> {
        // The closure produces plain String errors; they are wrapped with the LSN below.
        let validate_impl = || {
            if hdr.xlp_magic != XLOG_PAGE_MAGIC as u16 {
                return Err(format!(
                    "invalid xlog page header: xlp_magic={}, expected {}",
                    hdr.xlp_magic, XLOG_PAGE_MAGIC
                ));
            }
            if hdr.xlp_pageaddr != self.lsn.0 {
                return Err(format!(
                    "invalid xlog page header: xlp_pageaddr={}, expected {}",
                    hdr.xlp_pageaddr, self.lsn
                ));
            }
            match self.state {
                // Not in the middle of a record: the page must not start with a continuation.
                State::WaitingForRecord => {
                    if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD != 0 {
                        return Err(
                            "invalid xlog page header: unexpected XLP_FIRST_IS_CONTRECORD".into(),
                        );
                    }
                    if hdr.xlp_rem_len != 0 {
                        return Err(format!(
                            "invalid xlog page header: xlp_rem_len={}, but it's not a contrecord",
                            hdr.xlp_rem_len
                        ));
                    }
                }
                // Mid-record: the page must continue it with exactly the bytes still missing.
                State::ReassemblingRecord { contlen, .. } => {
                    if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD == 0 {
                        return Err(
                            "invalid xlog page header: XLP_FIRST_IS_CONTRECORD expected, not found"
                                .into(),
                        );
                    }
                    if hdr.xlp_rem_len != contlen.get() {
                        return Err(format!(
                            "invalid xlog page header: xlp_rem_len={}, expected {}",
                            hdr.xlp_rem_len,
                            contlen.get()
                        ));
                    }
                }
                State::SkippingEverything { .. } => {
                    panic!("Should not be validating page header in the SkippingEverything state");
                }
            };
            Ok(())
        };
        validate_impl().map_err(|msg| WalDecodeError { msg, lsn: self.lsn })
    }

    /// Attempt to decode another WAL record from the input that has been fed to the
    /// decoder so far.
    ///
    /// Returns one of the following:
    ///     Ok((Lsn, Bytes)): a tuple containing the LSN of next record, and the record itself
    ///     Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function
    ///     Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid.
    ///
    // NOTE(review): the return type below reads `Result, WalDecodeError>` in the visible
    // text; the body returns `Ok(None)` / `Ok(Some((Lsn, Bytes)))`, so an
    // `Option<(Lsn, Bytes)>` type argument appears to be missing -- confirm upstream.
    fn poll_decode_internal(&mut self) -> Result, WalDecodeError> {
        // Run state machine that validates page headers, and reassembles records
        // that cross page boundaries.
        loop {
            // parse and verify page boundaries as we go
            // However, we may have to skip some page headers if we're processing the XLOG_SWITCH record or skipping padding for whatever reason.
            match self.state {
                State::WaitingForRecord | State::ReassemblingRecord { .. } => {
                    if self.lsn.segment_offset(WAL_SEGMENT_SIZE) == 0 {
                        // parse long header

                        // Wait until the whole header is buffered before consuming any of it.
                        if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
                            return Ok(None);
                        }

                        let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err(
                            |e| WalDecodeError {
                                msg: format!("long header deserialization failed {e}"),
                                lsn: self.lsn,
                            },
                        )?;

                        // Validate before advancing: the check compares against self.lsn.
                        self.validate_page_header(&hdr.std)?;

                        self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
                    } else if self.lsn.block_offset() == 0 {
                        // Page boundary inside a segment: short header.
                        if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD {
                            return Ok(None);
                        }

                        let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
                            WalDecodeError {
                                msg: format!("header deserialization failed {e}"),
                                lsn: self.lsn,
                            }
                        })?;

                        self.validate_page_header(&hdr)?;

                        self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
                    }
                }
                // While skipping, page headers are consumed as part of the skipped bytes.
                State::SkippingEverything { .. } => {}
            }
            // now read page contents
            match &mut self.state {
                State::WaitingForRecord => {
                    // need to have at least the xl_tot_len field
                    if self.inputbuf.remaining() < 4 {
                        return Ok(None);
                    }

                    // peek xl_tot_len at the beginning of the record.
                    // FIXME: assumes little-endian
                    let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
                    if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
                        return Err(WalDecodeError {
                            msg: format!("invalid xl_tot_len {xl_tot_len}"),
                            lsn: self.lsn,
                        });
                    }
                    // Fast path for the common case that the whole record fits on the page.
                    let pageleft = self.lsn.remaining_in_block() as u32;
                    if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
                        self.lsn += xl_tot_len as u64;
                        let recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
                        return Ok(Some(self.complete_record(recordbuf)?));
                    } else {
                        // Need to assemble the record from pieces. Remember the size of the
                        // record, and loop back. On next iterations, we will reach the branch
                        // below, and copy the part of the record that was on this or next page(s)
                        // to 'recordbuf'.  Subsequent iterations will skip page headers, and
                        // append the continuations from the next pages to 'recordbuf'.
                        //
                        // The unwrap cannot fail: xl_tot_len was checked above to be at
                        // least XLOG_SIZE_OF_XLOG_RECORD.
                        self.state = State::ReassemblingRecord {
                            recordbuf: BytesMut::with_capacity(xl_tot_len as usize),
                            contlen: NonZeroU32::new(xl_tot_len).unwrap(),
                        }
                    }
                }
                State::ReassemblingRecord { recordbuf, contlen } => {
                    // we're continuing a record, possibly from previous page.
                    let pageleft = self.lsn.remaining_in_block() as u32;

                    // read the rest of the record, or as much as fits on this page.
                    let n = min(contlen.get(), pageleft) as usize;

                    if self.inputbuf.remaining() < n {
                        return Ok(None);
                    }

                    recordbuf.put(self.inputbuf.split_to(n));
                    self.lsn += n as u64;
                    // A zero remainder means the record is done; otherwise keep the new count.
                    *contlen = match NonZeroU32::new(contlen.get() - n as u32) {
                        Some(x) => x,
                        None => {
                            // The record is now complete.
                            let recordbuf = std::mem::replace(recordbuf, BytesMut::new()).freeze();
                            return Ok(Some(self.complete_record(recordbuf)?));
                        }
                    }
                }
                State::SkippingEverything { skip_until_lsn } => {
                    assert!(*skip_until_lsn >= self.lsn);
                    let n = skip_until_lsn.0 - self.lsn.0;
                    // All skipped bytes must be buffered; they are dropped in one step.
                    if self.inputbuf.remaining() < n as usize {
                        return Ok(None);
                    }
                    self.inputbuf.advance(n as usize);
                    self.lsn += n;
                    self.state = State::WaitingForRecord;
                }
            }
        }
    }

    /// Finishes a fully assembled record: verifies its CRC, works out the LSN of the next
    /// record, and switches the decoder to `SkippingEverything` up to that LSN.
    ///
    /// Returns the next record's LSN together with the record bytes, or a `WalDecodeError`
    /// if the header cannot be deserialized or the CRC does not match.
    fn complete_record(&mut self, recordbuf: Bytes) -> Result<(Lsn, Bytes), WalDecodeError> {
        // We now have a record in the 'recordbuf' local variable.
        let xlogrec =
            XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]).map_err(|e| {
                WalDecodeError {
                    msg: format!("xlog record deserialization failed {e}"),
                    lsn: self.lsn,
                }
            })?;

        // The CRC covers everything after the 4-byte CRC field first, then the header
        // bytes before it; the CRC field itself is excluded.
        let mut crc = 0;
        crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
        crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]);
        if crc != xlogrec.xl_crc {
            return Err(WalDecodeError {
                msg: "WAL record crc mismatch".into(),
                lsn: self.lsn,
            });
        }

        // XLOG_SWITCH records are special. If we see one, we need to skip
        // to the next WAL segment.
        let next_lsn = if xlogrec.is_xlog_switch_record() {
            tracing::trace!("saw xlog switch record at {}", self.lsn);
            self.lsn + self.lsn.calc_padding(WAL_SEGMENT_SIZE as u64)
        } else {
            // Pad to an 8-byte boundary
            self.lsn.align()
        };
        self.state = State::SkippingEverything {
            skip_until_lsn: next_lsn,
        };

        // We should return LSN of the next record, not the last byte of this record or
        // the byte immediately after. Note that this handles both XLOG_SWITCH and usual
        // records, the former "spans" until the next WAL segment (see test_xlog_switch).
        Ok((next_lsn, recordbuf))
    }
}
TODO: Generate separate types for each supported PG version use bytes::{Buf, Bytes}; use postgres_ffi_types::TimestampTz; use serde::{Deserialize, Serialize}; use utils::bin_ser::DeserializeError; use utils::lsn::Lsn; use crate::{ BLCKSZ, BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, PgMajorVersion, RepOriginId, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants, }; #[repr(C)] #[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlMultiXactCreate { pub mid: MultiXactId, /* new MultiXact's ID */ pub moff: MultiXactOffset, /* its starting offset in members file */ pub nmembers: u32, /* number of member XIDs */ pub members: Vec, } impl XlMultiXactCreate { pub fn decode(buf: &mut Bytes) -> XlMultiXactCreate { let mid = buf.get_u32_le(); let moff = buf.get_u32_le(); let nmembers = buf.get_u32_le(); let mut members = Vec::new(); for _ in 0..nmembers { members.push(MultiXactMember::decode(buf)); } XlMultiXactCreate { mid, moff, nmembers, members, } } } #[repr(C)] #[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlMultiXactTruncate { pub oldest_multi_db: Oid, /* to-be-truncated range of multixact offsets */ pub start_trunc_off: MultiXactId, /* just for completeness' sake */ pub end_trunc_off: MultiXactId, /* to-be-truncated range of multixact members */ pub start_trunc_memb: MultiXactOffset, pub end_trunc_memb: MultiXactOffset, } impl XlMultiXactTruncate { pub fn decode(buf: &mut Bytes) -> XlMultiXactTruncate { XlMultiXactTruncate { oldest_multi_db: buf.get_u32_le(), start_trunc_off: buf.get_u32_le(), end_trunc_off: buf.get_u32_le(), start_trunc_memb: buf.get_u32_le(), end_trunc_memb: buf.get_u32_le(), } } } #[repr(C)] #[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlRelmapUpdate { pub dbid: Oid, /* database ID, or 0 for shared map */ pub tsid: Oid, /* database's tablespace, or pg_global */ pub nbytes: i32, /* size of relmap data */ } impl XlRelmapUpdate { pub fn decode(buf: &mut Bytes) -> XlRelmapUpdate { 
XlRelmapUpdate { dbid: buf.get_u32_le(), tsid: buf.get_u32_le(), nbytes: buf.get_i32_le(), } } } #[repr(C)] #[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlReploriginDrop { pub node_id: RepOriginId, } impl XlReploriginDrop { pub fn decode(buf: &mut Bytes) -> XlReploriginDrop { XlReploriginDrop { node_id: buf.get_u16_le(), } } } #[repr(C)] #[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlReploriginSet { pub remote_lsn: Lsn, pub node_id: RepOriginId, } impl XlReploriginSet { pub fn decode(buf: &mut Bytes) -> XlReploriginSet { XlReploriginSet { remote_lsn: Lsn(buf.get_u64_le()), node_id: buf.get_u16_le(), } } } #[repr(C)] #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct RelFileNode { pub spcnode: Oid, /* tablespace */ pub dbnode: Oid, /* database */ pub relnode: Oid, /* relation */ } #[repr(C)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct MultiXactMember { pub xid: TransactionId, pub status: MultiXactStatus, } impl MultiXactMember { pub fn decode(buf: &mut Bytes) -> MultiXactMember { MultiXactMember { xid: buf.get_u32_le(), status: buf.get_u32_le(), } } } /// DecodedBkpBlock represents per-page data contained in a WAL record. #[derive(Default)] pub struct DecodedBkpBlock { /* Is this block ref in use? */ //in_use: bool, /* Identify the block this refers to */ pub rnode_spcnode: u32, pub rnode_dbnode: u32, pub rnode_relnode: u32, // Note that we have a few special forknum values for non-rel files. 
/// Result of `decode_wal_record`: selected header fields of a WAL record plus the
/// per-block references found in it. The caller supplies the struct and the decoder
/// fills it in.
#[derive(Default)]
pub struct DecodedWALRecord {
    // Copied from the record's XLogRecord header.
    pub xl_xid: TransactionId,
    pub xl_info: u8,
    pub xl_rmid: u8,
    pub record: Bytes, // raw XLogRecord
    // One entry per block header found in the record, in the order they appear.
    // NOTE(review): the element type is not visible here (`Vec,`); the decoder pushes
    // `DecodedBkpBlock` values, so presumably `Vec<DecodedBkpBlock>` -- confirm upstream.
    pub blocks: Vec,
    // Byte offset of the main data within `record`, computed by the decoder as
    // xl_tot_len minus the main data length.
    pub main_data_offset: usize,
    // Value read from an XLR_BLOCK_ID_ORIGIN header; 0 when the record has none.
    pub origin_id: u16,
}
pub fn is_dbase_create_copy(&self, pg_version: PgMajorVersion) -> bool { if self.xl_rmid == pg_constants::RM_DBASE_ID { let info = self.xl_info & pg_constants::XLR_RMGR_INFO_MASK; match pg_version { PgMajorVersion::PG14 => { // Postgres 14 database creations are always the legacy kind info == crate::v14::bindings::XLOG_DBASE_CREATE } PgMajorVersion::PG15 => info == crate::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, PgMajorVersion::PG16 => info == crate::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, PgMajorVersion::PG17 => info == crate::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY, } } else { false } } } /// Main routine to decode a WAL record and figure out which blocks are modified // // See xlogrecord.h for details // The overall layout of an XLOG record is: // Fixed-size header (XLogRecord struct) // XLogRecordBlockHeader struct // If pg_constants::BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows // If pg_constants::BKPIMAGE_HAS_HOLE and pg_constants::BKPIMAGE_IS_COMPRESSED, an // XLogRecordBlockCompressHeader struct follows. // If pg_constants::BKPBLOCK_SAME_REL is not set, a RelFileNode follows // BlockNumber follows // XLogRecordBlockHeader struct // ... // XLogRecordDataHeader[Short|Long] struct // block data // block data // ... // main data // // // For performance reasons, the caller provides the DecodedWALRecord struct and the function just fills it in. // It would be more natural for this function to return a DecodedWALRecord as return value, // but reusing the caller-supplied struct avoids an allocation. // This code is in the hot path for digesting incoming WAL, and is very performance sensitive. // pub fn decode_wal_record( record: Bytes, decoded: &mut DecodedWALRecord, pg_version: PgMajorVersion, ) -> anyhow::Result<()> { let mut rnode_spcnode: u32 = 0; let mut rnode_dbnode: u32 = 0; let mut rnode_relnode: u32 = 0; let mut got_rnode = false; let mut origin_id: u16 = 0; let mut buf = record.clone(); // 1. 
Parse XLogRecord struct // FIXME: assume little-endian here let xlogrec = XLogRecord::from_bytes(&mut buf)?; tracing::trace!( "decode_wal_record xl_rmid = {} xl_info = {}", xlogrec.xl_rmid, xlogrec.xl_info ); let remaining: usize = xlogrec.xl_tot_len as usize - XLOG_SIZE_OF_XLOG_RECORD; if buf.remaining() != remaining { //TODO error } let mut max_block_id = 0; let mut blocks_total_len: u32 = 0; let mut main_data_len = 0; let mut datatotal: u32 = 0; decoded.blocks.clear(); // 2. Decode the headers. // XLogRecordBlockHeaders if any, // XLogRecordDataHeader[Short|Long] while buf.remaining() > datatotal as usize { let block_id = buf.get_u8(); match block_id { pg_constants::XLR_BLOCK_ID_DATA_SHORT => { /* XLogRecordDataHeaderShort */ main_data_len = buf.get_u8() as u32; datatotal += main_data_len; } pg_constants::XLR_BLOCK_ID_DATA_LONG => { /* XLogRecordDataHeaderLong */ main_data_len = buf.get_u32_le(); datatotal += main_data_len; } pg_constants::XLR_BLOCK_ID_ORIGIN => { // RepOriginId is uint16 origin_id = buf.get_u16_le(); } pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => { // TransactionId is uint32 buf.advance(4); } 0..=pg_constants::XLR_MAX_BLOCK_ID => { /* XLogRecordBlockHeader */ let mut blk = DecodedBkpBlock::new(); if block_id <= max_block_id { // TODO //report_invalid_record(state, // "out-of-order block_id %u at %X/%X", // block_id, // (uint32) (state->ReadRecPtr >> 32), // (uint32) state->ReadRecPtr); // goto err; } max_block_id = block_id; let fork_flags: u8 = buf.get_u8(); blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK; blk.flags = fork_flags; blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0; blk.has_data = (fork_flags & pg_constants::BKPBLOCK_HAS_DATA) != 0; blk.will_init = (fork_flags & pg_constants::BKPBLOCK_WILL_INIT) != 0; blk.data_len = buf.get_u16_le(); /* TODO cross-check that the HAS_DATA flag is set iff data_length > 0 */ datatotal += blk.data_len as u32; blocks_total_len += blk.data_len as u32; if blk.has_image { 
blk.bimg_len = buf.get_u16_le(); blk.hole_offset = buf.get_u16_le(); blk.bimg_info = buf.get_u8(); blk.apply_image = dispatch_pgversion!( pg_version, (blk.bimg_info & pgv::bindings::BKPIMAGE_APPLY) != 0 ); let blk_img_is_compressed = crate::bkpimage_is_compressed(blk.bimg_info, pg_version); if blk_img_is_compressed { tracing::debug!("compressed block image , pg_version = {}", pg_version); } if blk_img_is_compressed { if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 { blk.hole_length = buf.get_u16_le(); } else { blk.hole_length = 0; } } else { blk.hole_length = BLCKSZ - blk.bimg_len; } datatotal += blk.bimg_len as u32; blocks_total_len += blk.bimg_len as u32; /* * cross-check that hole_offset > 0, hole_length > 0 and * bimg_len < BLCKSZ if the HAS_HOLE flag is set. */ if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 && (blk.hole_offset == 0 || blk.hole_length == 0 || blk.bimg_len == BLCKSZ) { // TODO /* report_invalid_record(state, "pg_constants::BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", (unsigned int) blk->hole_offset, (unsigned int) blk->hole_length, (unsigned int) blk->bimg_len, (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); goto err; */ } /* * cross-check that hole_offset == 0 and hole_length == 0 if * the HAS_HOLE flag is not set. */ if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 && (blk.hole_offset != 0 || blk.hole_length != 0) { // TODO /* report_invalid_record(state, "pg_constants::BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X", (unsigned int) blk->hole_offset, (unsigned int) blk->hole_length, (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); goto err; */ } /* * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED * flag is set. 
*/ if !blk_img_is_compressed && blk.bimg_len == BLCKSZ { // TODO /* report_invalid_record(state, "pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X", (unsigned int) blk->bimg_len, (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); goto err; */ } /* * cross-check that bimg_len = BLCKSZ if neither HAS_HOLE nor * IS_COMPRESSED flag is set. */ if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 && !blk_img_is_compressed && blk.bimg_len != BLCKSZ { // TODO /* report_invalid_record(state, "neither pg_constants::BKPIMAGE_HAS_HOLE nor pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X", (unsigned int) blk->data_len, (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); goto err; */ } } if fork_flags & pg_constants::BKPBLOCK_SAME_REL == 0 { rnode_spcnode = buf.get_u32_le(); rnode_dbnode = buf.get_u32_le(); rnode_relnode = buf.get_u32_le(); got_rnode = true; } else if !got_rnode { // TODO /* report_invalid_record(state, "pg_constants::BKPBLOCK_SAME_REL set but no previous rel at %X/%X", (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); goto err; */ } blk.rnode_spcnode = rnode_spcnode; blk.rnode_dbnode = rnode_dbnode; blk.rnode_relnode = rnode_relnode; blk.blkno = buf.get_u32_le(); tracing::trace!( "this record affects {}/{}/{} blk {}", rnode_spcnode, rnode_dbnode, rnode_relnode, blk.blkno ); decoded.blocks.push(blk); } _ => { // TODO: invalid block_id } } } // 3. Decode blocks. let mut ptr = record.len() - buf.remaining(); for blk in decoded.blocks.iter_mut() { if blk.has_image { blk.bimg_offset = ptr as u32; ptr += blk.bimg_len as usize; } if blk.has_data { ptr += blk.data_len as usize; } } // We don't need them, so just skip blocks_total_len bytes buf.advance(blocks_total_len as usize); assert_eq!(ptr, record.len() - buf.remaining()); let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize; // 4. 
// Decode main_data
    if main_data_len > 0 {
        assert_eq!(buf.remaining(), main_data_len as usize);
    }

    decoded.xl_xid = xlogrec.xl_xid;
    decoded.xl_info = xlogrec.xl_info;
    decoded.xl_rmid = xlogrec.xl_rmid;
    decoded.record = record;
    decoded.origin_id = origin_id;
    decoded.main_data_offset = main_data_offset;

    Ok(())
}

/// Heap and parameter-change record headers for the `v14` bindings.
///
/// Every `decode` reads its fields from the front of `buf` in declaration
/// order, little-endian, and advances `buf` past them.
pub mod v14 {
    use bytes::{Buf, Bytes};

    use crate::{OffsetNumber, TransactionId};

    /// Heap insert header: 3 bytes (offset number, flags).
    #[repr(C)]
    #[derive(Debug)]
    pub struct XlHeapInsert {
        pub offnum: OffsetNumber,
        pub flags: u8,
    }

    impl XlHeapInsert {
        pub fn decode(buf: &mut Bytes) -> XlHeapInsert {
            XlHeapInsert {
                offnum: buf.get_u16_le(),
                flags: buf.get_u8(),
            }
        }
    }

    /// Heap multi-insert header: 4 bytes (flags, one padding byte, tuple count).
    #[repr(C)]
    #[derive(Debug)]
    pub struct XlHeapMultiInsert {
        pub flags: u8,
        pub _padding: u8,
        pub ntuples: u16,
    }

    impl XlHeapMultiInsert {
        pub fn decode(buf: &mut Bytes) -> XlHeapMultiInsert {
            XlHeapMultiInsert {
                flags: buf.get_u8(),
                _padding: buf.get_u8(),
                ntuples: buf.get_u16_le(),
            }
        }
    }

    /// Heap delete header: 14 bytes, including 2 padding bytes after `offnum`.
    #[repr(C)]
    #[derive(Debug)]
    pub struct XlHeapDelete {
        pub xmax: TransactionId,
        pub offnum: OffsetNumber,
        pub _padding: u16,
        pub t_cid: u32,
        pub infobits_set: u8,
        pub flags: u8,
    }

    impl XlHeapDelete {
        pub fn decode(buf: &mut Bytes) -> XlHeapDelete {
            XlHeapDelete {
                xmax: buf.get_u32_le(),
                offnum: buf.get_u16_le(),
                _padding: buf.get_u16_le(),
                t_cid: buf.get_u32_le(),
                infobits_set: buf.get_u8(),
                flags: buf.get_u8(),
            }
        }
    }

    /// Heap update header: 18 bytes describing the old and the new tuple.
    #[repr(C)]
    #[derive(Debug)]
    pub struct XlHeapUpdate {
        pub old_xmax: TransactionId,
        pub old_offnum: OffsetNumber,
        pub old_infobits_set: u8,
        pub flags: u8,
        pub t_cid: u32,
        pub new_xmax: TransactionId,
        pub new_offnum: OffsetNumber,
    }

    impl XlHeapUpdate {
        pub fn decode(buf: &mut Bytes) -> XlHeapUpdate {
            XlHeapUpdate {
                old_xmax: buf.get_u32_le(),
                old_offnum: buf.get_u16_le(),
                old_infobits_set: buf.get_u8(),
                flags: buf.get_u8(),
                t_cid: buf.get_u32_le(),
                new_xmax: buf.get_u32_le(),
                new_offnum: buf.get_u16_le(),
            }
        }
    }

    /// Heap lock header: 14 bytes, including 2 padding bytes after `offnum`.
    #[repr(C)]
    #[derive(Debug)]
    pub struct XlHeapLock {
        pub locking_xid: TransactionId,
        pub offnum: OffsetNumber,
        pub _padding: u16,
        pub t_cid: u32,
        pub infobits_set: u8,
        pub flags: u8,
    }

    impl XlHeapLock {
        pub fn decode(buf: &mut Bytes) -> XlHeapLock {
            XlHeapLock {
                locking_xid: buf.get_u32_le(),
                offnum: buf.get_u16_le(),
                _padding: buf.get_u16_le(),
                t_cid: buf.get_u32_le(),
                infobits_set: buf.get_u8(),
                flags: buf.get_u8(),
            }
        }
    }

    /// Heap lock-updated header: 8 bytes.
    #[repr(C)]
    #[derive(Debug)]
    pub struct XlHeapLockUpdated {
        pub xmax: TransactionId,
        pub offnum: OffsetNumber,
        pub infobits_set: u8,
        pub flags: u8,
    }

    impl XlHeapLockUpdated {
        pub fn decode(buf: &mut Bytes) -> XlHeapLockUpdated {
            XlHeapLockUpdated {
                xmax: buf.get_u32_le(),
                offnum: buf.get_u16_le(),
                infobits_set: buf.get_u8(),
                flags: buf.get_u8(),
            }
        }
    }

    /// Parameter-change record: six `i32` settings, two one-byte booleans and
    /// two trailing padding bytes (28 bytes in total).
    #[repr(C)]
    #[derive(Debug)]
    pub struct XlParameterChange {
        pub max_connections: i32,
        pub max_worker_processes: i32,
        pub max_wal_senders: i32,
        pub max_prepared_xacts: i32,
        pub max_locks_per_xact: i32,
        pub wal_level: i32,
        pub wal_log_hints: bool,
        pub track_commit_timestamp: bool,
        pub _padding: [u8; 2],
    }

    impl XlParameterChange {
        pub fn decode(buf: &mut Bytes) -> XlParameterChange {
            XlParameterChange {
                max_connections: buf.get_i32_le(),
                max_worker_processes: buf.get_i32_le(),
                max_wal_senders: buf.get_i32_le(),
                max_prepared_xacts: buf.get_i32_le(),
                max_locks_per_xact: buf.get_i32_le(),
                wal_level: buf.get_i32_le(),
                // Any non-zero byte counts as `true`.
                wal_log_hints: buf.get_u8() != 0,
                track_commit_timestamp: buf.get_u8() != 0,
                _padding: [buf.get_u8(), buf.get_u8()],
            }
        }
    }
}

/// `v15` reuses every `v14` record layout unchanged.
pub mod v15 {
    pub use super::v14::{
        XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapLockUpdated, XlHeapMultiInsert, XlHeapUpdate,
        XlParameterChange,
    };
}

/// `v16` layouts: delete/update/lock headers without `t_cid` and padding,
/// plus the Neon resource-manager records that carry `t_cid`.
pub mod v16 {
    use bytes::{Buf, Bytes};

    pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert, XlParameterChange};
    use crate::{OffsetNumber, TransactionId};

    /// Heap delete header: 8 bytes.
    // `repr(C)` / `Debug` added for consistency with every sibling record type.
    #[repr(C)]
    #[derive(Debug)]
    pub struct XlHeapDelete {
        pub xmax: TransactionId,
        pub offnum: OffsetNumber,
        pub infobits_set: u8,
        pub flags: u8,
    }

    impl XlHeapDelete {
        pub fn decode(buf: &mut Bytes) -> XlHeapDelete {
            XlHeapDelete {
                xmax: buf.get_u32_le(),
                offnum: buf.get_u16_le(),
                infobits_set: buf.get_u8(),
                flags: buf.get_u8(),
            }
        }
    }

    /// Heap update header: 14 bytes.
    #[repr(C)]
    #[derive(Debug)]
    pub struct XlHeapUpdate {
        pub old_xmax: TransactionId,
        pub old_offnum: OffsetNumber,
        pub old_infobits_set: u8,
        pub flags: u8,
        pub new_xmax: TransactionId,
        pub new_offnum: OffsetNumber,
    }

    impl XlHeapUpdate {
        pub fn decode(buf: &mut Bytes) -> XlHeapUpdate {
            XlHeapUpdate {
                old_xmax: buf.get_u32_le(),
                old_offnum: buf.get_u16_le(),
                old_infobits_set: buf.get_u8(),
                flags: buf.get_u8(),
                new_xmax: buf.get_u32_le(),
                new_offnum: buf.get_u16_le(),
            }
        }
    }

    /// Heap lock header: 8 bytes.
    #[repr(C)]
    #[derive(Debug)]
    pub struct XlHeapLock {
        pub locking_xid: TransactionId,
        pub offnum: OffsetNumber,
        pub infobits_set: u8,
        pub flags: u8,
    }

    impl XlHeapLock {
        pub fn decode(buf: &mut Bytes) -> XlHeapLock {
            XlHeapLock {
                locking_xid: buf.get_u32_le(),
                offnum: buf.get_u16_le(),
                infobits_set: buf.get_u8(),
                flags: buf.get_u8(),
            }
        }
    }

    /* Since PG16, we have the Neon RMGR (RM_NEON_ID) to manage Neon-flavored WAL. */
    pub mod rm_neon {
        use bytes::{Buf, Bytes};

        use crate::{OffsetNumber, TransactionId};

        /// Neon heap insert header: 3 bytes.
        #[repr(C)]
        #[derive(Debug)]
        pub struct XlNeonHeapInsert {
            pub offnum: OffsetNumber,
            pub flags: u8,
        }

        impl XlNeonHeapInsert {
            pub fn decode(buf: &mut Bytes) -> XlNeonHeapInsert {
                XlNeonHeapInsert {
                    offnum: buf.get_u16_le(),
                    flags: buf.get_u8(),
                }
            }
        }

        /// Neon heap multi-insert header: 8 bytes, ending with `t_cid`.
        #[repr(C)]
        #[derive(Debug)]
        pub struct XlNeonHeapMultiInsert {
            pub flags: u8,
            pub _padding: u8,
            pub ntuples: u16,
            pub t_cid: u32,
        }

        impl XlNeonHeapMultiInsert {
            pub fn decode(buf: &mut Bytes) -> XlNeonHeapMultiInsert {
                XlNeonHeapMultiInsert {
                    flags: buf.get_u8(),
                    _padding: buf.get_u8(),
                    ntuples: buf.get_u16_le(),
                    t_cid: buf.get_u32_le(),
                }
            }
        }

        /// Neon heap delete header: 12 bytes, ending with `t_cid`.
        #[repr(C)]
        #[derive(Debug)]
        pub struct XlNeonHeapDelete {
            pub xmax: TransactionId,
            pub offnum: OffsetNumber,
            pub infobits_set: u8,
            pub flags: u8,
            pub t_cid: u32,
        }

        impl XlNeonHeapDelete {
            pub fn decode(buf: &mut Bytes) -> XlNeonHeapDelete {
                XlNeonHeapDelete {
                    xmax: buf.get_u32_le(),
                    offnum: buf.get_u16_le(),
                    infobits_set: buf.get_u8(),
                    flags: buf.get_u8(),
                    t_cid: buf.get_u32_le(),
                }
            }
        }

        /// Neon heap update header: 18 bytes.
        #[repr(C)]
        #[derive(Debug)]
        pub struct XlNeonHeapUpdate {
            pub old_xmax: TransactionId,
            pub old_offnum: OffsetNumber,
            pub old_infobits_set: u8,
            pub flags: u8,
            pub t_cid: u32,
            pub new_xmax: TransactionId,
            pub new_offnum: OffsetNumber,
        }

        impl XlNeonHeapUpdate {
            pub fn decode(buf: &mut Bytes) -> XlNeonHeapUpdate {
                XlNeonHeapUpdate {
                    old_xmax: buf.get_u32_le(),
                    old_offnum: buf.get_u16_le(),
                    old_infobits_set: buf.get_u8(),
                    flags: buf.get_u8(),
                    // Little-endian like every other field; the plain
                    // `get_u32()` used here before reads big-endian and
                    // yielded a byte-swapped command id.
                    t_cid: buf.get_u32_le(),
                    new_xmax: buf.get_u32_le(),
                    new_offnum: buf.get_u16_le(),
                }
            }
        }

        /// Neon heap lock header: 12 bytes; `t_cid` follows `locking_xid`.
        #[repr(C)]
        #[derive(Debug)]
        pub struct XlNeonHeapLock {
            pub locking_xid: TransactionId,
            pub t_cid: u32,
            pub offnum: OffsetNumber,
            pub infobits_set: u8,
            pub flags: u8,
        }

        impl XlNeonHeapLock {
            pub fn decode(buf: &mut Bytes) -> XlNeonHeapLock {
                XlNeonHeapLock {
                    locking_xid: buf.get_u32_le(),
                    t_cid: buf.get_u32_le(),
                    offnum: buf.get_u16_le(),
                    infobits_set: buf.get_u8(),
                    flags: buf.get_u8(),
                }
            }
        }
    }
}

/// `v17` re-exports the `v14`/`v16` layouts and adds `XlEndOfRecovery`.
pub mod v17 {
    use bytes::{Buf, Bytes};

    pub use super::v14::XlHeapLockUpdated;
    pub use super::v16::{
        XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange,
        rm_neon,
    };
    pub use crate::TimeLineID;
    pub use postgres_ffi_types::TimestampTz;

    /// End-of-recovery record: 20 bytes (timestamp, two timeline ids, WAL level).
    #[repr(C)]
    #[derive(Debug)]
    pub struct XlEndOfRecovery {
        pub end_time: TimestampTz,
        pub this_time_line_id: TimeLineID,
        pub prev_time_line_id: TimeLineID,
        pub wal_level: i32,
    }

    impl XlEndOfRecovery {
        pub fn decode(buf: &mut Bytes) -> XlEndOfRecovery {
            XlEndOfRecovery {
                end_time: buf.get_i64_le(),
                this_time_line_id: buf.get_u32_le(),
                prev_time_line_id: buf.get_u32_le(),
                wal_level: buf.get_i32_le(),
            }
        }
    }
}

/// Relation-creation record: the relation's file node plus the fork number.
#[repr(C)]
#[derive(Debug)]
pub struct XlSmgrCreate {
    pub rnode: RelFileNode,
    // FIXME: This is ForkNumber in storage_xlog.h. That's an enum. Does it have
    // well-defined size?
    pub forknum: u8,
}

impl XlSmgrCreate {
    /// Reads 16 bytes; the fork number is read as a 4-byte value and
    /// truncated to its low byte.
    pub fn decode(buf: &mut Bytes) -> XlSmgrCreate {
        XlSmgrCreate {
            rnode: RelFileNode {
                spcnode: buf.get_u32_le(), /* tablespace */
                dbnode: buf.get_u32_le(),  /* database */
                relnode: buf.get_u32_le(), /* relation */
            },
            forknum: buf.get_u32_le() as u8,
        }
    }
}

/// Relation-truncation record: new block count, relation file node, flags.
#[repr(C)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct XlSmgrTruncate {
    pub blkno: BlockNumber,
    pub rnode: RelFileNode,
    pub flags: u32,
}

impl XlSmgrTruncate {
    /// Reads 20 bytes, little-endian, in declaration order.
    pub fn decode(buf: &mut Bytes) -> XlSmgrTruncate {
        XlSmgrTruncate {
            blkno: buf.get_u32_le(),
            rnode: RelFileNode {
                spcnode: buf.get_u32_le(), /* tablespace */
                dbnode: buf.get_u32_le(),  /* database */
                relnode: buf.get_u32_le(), /* relation */
            },
            flags: buf.get_u32_le(),
        }
    }
}

/// Create-database record: target and source database/tablespace OIDs.
#[repr(C)]
#[derive(Debug)]
pub struct XlCreateDatabase {
    pub db_id: Oid,
    pub tablespace_id: Oid,
    pub src_db_id: Oid,
    pub src_tablespace_id: Oid,
}

impl XlCreateDatabase {
    /// Reads 16 bytes, little-endian, in declaration order.
    pub fn decode(buf: &mut Bytes) -> XlCreateDatabase {
        XlCreateDatabase {
            db_id: buf.get_u32_le(),
            tablespace_id: buf.get_u32_le(),
            src_db_id: buf.get_u32_le(),
            src_tablespace_id: buf.get_u32_le(),
        }
    }
}

/// Drop-database record: database OID followed by a counted list of
/// tablespace OIDs.
#[repr(C)]
#[derive(Debug)]
pub struct XlDropDatabase {
    pub db_id: Oid,
    pub n_tablespaces: Oid, /* number of tablespace IDs */
    pub tablespace_ids: Vec<Oid>,
}

impl XlDropDatabase {
    /// Reads the 8-byte fixed part, then `n_tablespaces` 4-byte OIDs.
    pub fn decode(buf: &mut Bytes) -> XlDropDatabase {
        let mut rec = XlDropDatabase {
            db_id: buf.get_u32_le(),
            n_tablespaces: buf.get_u32_le(),
            tablespace_ids: Vec::<Oid>::new(),
        };

        for _i in 0..rec.n_tablespaces {
            let id = buf.get_u32_le();
            rec.tablespace_ids.push(id);
        }

        rec
    }
}

///
/// Note: Parsing some fields is missing, because they're not needed.
///
/// This is similar to the xl_xact_parsed_commit and
/// xl_xact_parsed_abort structs in PostgreSQL, but we use the same
/// struct for commits and aborts.
/// #[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlXactParsedRecord { pub xid: TransactionId, pub info: u8, pub xact_time: TimestampTz, pub xinfo: u32, pub db_id: Oid, /* MyDatabaseId */ pub ts_id: Oid, /* MyDatabaseTableSpace */ pub subxacts: Vec, pub xnodes: Vec, pub origin_lsn: Lsn, } impl XlXactParsedRecord { /// Decode a XLOG_XACT_COMMIT/ABORT/COMMIT_PREPARED/ABORT_PREPARED /// record. This should agree with the ParseCommitRecord and ParseAbortRecord /// functions in PostgreSQL (in src/backend/access/rmgr/xactdesc.c) pub fn decode(buf: &mut Bytes, mut xid: TransactionId, xl_info: u8) -> XlXactParsedRecord { let info = xl_info & pg_constants::XLOG_XACT_OPMASK; // The record starts with time of commit/abort let xact_time = buf.get_i64_le(); let xinfo = if xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 { buf.get_u32_le() } else { 0 }; let db_id; let ts_id; if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 { db_id = buf.get_u32_le(); ts_id = buf.get_u32_le(); } else { db_id = 0; ts_id = 0; } let mut subxacts = Vec::::new(); if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 { let nsubxacts = buf.get_i32_le(); for _i in 0..nsubxacts { let subxact = buf.get_u32_le(); subxacts.push(subxact); } } let mut xnodes = Vec::::new(); if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 { let nrels = buf.get_i32_le(); for _i in 0..nrels { let spcnode = buf.get_u32_le(); let dbnode = buf.get_u32_le(); let relnode = buf.get_u32_le(); tracing::trace!( "XLOG_XACT_COMMIT relfilenode {}/{}/{}", spcnode, dbnode, relnode ); xnodes.push(RelFileNode { spcnode, dbnode, relnode, }); } } if xinfo & crate::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 { let nitems = buf.get_i32_le(); tracing::debug!( "XLOG_XACT_COMMIT-XACT_XINFO_HAS_DROPPED_STAT nitems {}", nitems ); let sizeof_xl_xact_stats_item = 12; buf.advance((nitems * sizeof_xl_xact_stats_item).try_into().unwrap()); } if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 { let nmsgs = buf.get_i32_le(); let 
sizeof_shared_invalidation_message = 16; buf.advance( (nmsgs * sizeof_shared_invalidation_message) .try_into() .unwrap(), ); } if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 { xid = buf.get_u32_le(); tracing::debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid); } let origin_lsn = if xinfo & pg_constants::XACT_XINFO_HAS_ORIGIN != 0 { Lsn(buf.get_u64_le()) } else { Lsn::INVALID }; XlXactParsedRecord { xid, info, xact_time, xinfo, db_id, ts_id, subxacts, xnodes, origin_lsn, } } } #[repr(C)] #[derive(Debug)] pub struct XlClogTruncate { pub pageno: u32, pub oldest_xid: TransactionId, pub oldest_xid_db: Oid, } impl XlClogTruncate { pub fn decode(buf: &mut Bytes, pg_version: PgMajorVersion) -> XlClogTruncate { XlClogTruncate { pageno: if pg_version < PgMajorVersion::PG17 { buf.get_u32_le() } else { buf.get_u64_le() as u32 }, oldest_xid: buf.get_u32_le(), oldest_xid_db: buf.get_u32_le(), } } } #[repr(C)] #[derive(Debug)] pub struct XlLogicalMessage { pub db_id: Oid, pub transactional: bool, pub prefix_size: usize, pub message_size: usize, } impl XlLogicalMessage { pub fn decode(buf: &mut Bytes) -> XlLogicalMessage { XlLogicalMessage { db_id: buf.get_u32_le(), transactional: buf.get_u32_le() != 0, // 4-bytes alignment prefix_size: buf.get_u64_le() as usize, message_size: buf.get_u64_le() as usize, } } } #[repr(C)] #[derive(Debug)] pub struct XlRunningXacts { pub xcnt: u32, pub subxcnt: u32, pub subxid_overflow: bool, pub next_xid: TransactionId, pub oldest_running_xid: TransactionId, pub latest_completed_xid: TransactionId, pub xids: Vec, } impl XlRunningXacts { pub fn decode(buf: &mut Bytes) -> XlRunningXacts { let xcnt = buf.get_u32_le(); let subxcnt = buf.get_u32_le(); let subxid_overflow = buf.get_u32_le() != 0; let next_xid = buf.get_u32_le(); let oldest_running_xid = buf.get_u32_le(); let latest_completed_xid = buf.get_u32_le(); let mut xids = Vec::new(); for _ in 0..(xcnt + subxcnt) { xids.push(buf.get_u32_le()); } XlRunningXacts { xcnt, subxcnt, 
subxid_overflow, next_xid, oldest_running_xid, latest_completed_xid, xids, } } } pub fn describe_postgres_wal_record(record: &Bytes) -> Result { // TODO: It would be nice to use the PostgreSQL rmgrdesc infrastructure for this. // Maybe use the postgres wal redo process, the same used for replaying WAL records? // Or could we compile the rmgrdesc routines into the dump_layer_file() binary directly, // without worrying about security? // // But for now, we have a hand-written code for a few common WAL record types here. let mut buf = record.clone(); // 1. Parse XLogRecord struct // FIXME: assume little-endian here let xlogrec = XLogRecord::from_bytes(&mut buf)?; let unknown_str: String; let result: &str = match xlogrec.xl_rmid { pg_constants::RM_HEAP2_ID => { let info = xlogrec.xl_info & pg_constants::XLOG_HEAP_OPMASK; match info { pg_constants::XLOG_HEAP2_MULTI_INSERT => "HEAP2 MULTI_INSERT", pg_constants::XLOG_HEAP2_VISIBLE => "HEAP2 VISIBLE", _ => { unknown_str = format!("HEAP2 UNKNOWN_0x{info:02x}"); &unknown_str } } } pg_constants::RM_HEAP_ID => { let info = xlogrec.xl_info & pg_constants::XLOG_HEAP_OPMASK; match info { pg_constants::XLOG_HEAP_INSERT => "HEAP INSERT", pg_constants::XLOG_HEAP_DELETE => "HEAP DELETE", pg_constants::XLOG_HEAP_UPDATE => "HEAP UPDATE", pg_constants::XLOG_HEAP_HOT_UPDATE => "HEAP HOT_UPDATE", _ => { unknown_str = format!("HEAP2 UNKNOWN_0x{info:02x}"); &unknown_str } } } pg_constants::RM_XLOG_ID => { let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK; match info { pg_constants::XLOG_FPI => "XLOG FPI", pg_constants::XLOG_FPI_FOR_HINT => "XLOG FPI_FOR_HINT", _ => { unknown_str = format!("XLOG UNKNOWN_0x{info:02x}"); &unknown_str } } } rmid => { let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK; unknown_str = format!("UNKNOWN_RM_{rmid} INFO_0x{info:02x}"); &unknown_str } }; Ok(String::from(result)) } ================================================ FILE: libs/postgres_ffi/src/xlog_utils.rs 
================================================
//
// This file contains common utilities for dealing with PostgreSQL WAL files and
// LSNs.
//
// Many of these functions have been copied from PostgreSQL, and rewritten in
// Rust. That's why they don't follow the usual Rust naming conventions, they
// have been named the same as the corresponding PostgreSQL functions instead.
//

use super::super::waldecoder::WalStreamDecoder;
use super::bindings::{
    CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID,
    XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr,
    XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
    MY_PGVERSION
};
use postgres_ffi_types::TimestampTz;
use super::wal_generator::LogicalMessageGenerator;
use crate::pg_constants;
use crate::PG_TLI;
use crate::{uint32, uint64, Oid};
use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};

use bytes::BytesMut;
use bytes::{Buf, Bytes};

use serde::Serialize;
use std::ffi::{CString, OsStr};
use std::fs::File;
use std::io::prelude::*;
use std::io::ErrorKind;
use std::io::SeekFrom;
use std::path::Path;
use std::time::SystemTime;
use utils::bin_ser::DeserializeError;
use utils::bin_ser::SerializeError;

use utils::lsn::Lsn;

/// Length of a WAL segment file name: three 8-digit hexadecimal fields.
pub const XLOG_FNAME_LEN: usize = 24;
pub const XLP_BKP_REMOVABLE: u16 = 0x0004;
pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
/// Sum of the sizes of the page-header fields preceding `xlp_rem_len`
/// (2 + 2 + 4 + 8 bytes).
pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8;
/// Sum of the sizes of the record-header fields preceding the CRC
/// (4 + 4 + 8 + 1 + 1 + 2 bytes).
pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;

// In-memory sizes of the header structs imported from `bindings`.
pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = size_of::<XLogPageHeaderData>();
pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = size_of::<XLogLongPageHeaderData>();
pub const XLOG_SIZE_OF_XLOG_RECORD: usize = size_of::<XLogRecord>();
#[allow(clippy::identity_op)]
pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;

/// Interval of checkpointing metadata file. We should store metadata file to enforce
/// predicate that checkpoint.nextXid is larger than any XID in WAL.
/// But flushing checkpoint file for each transaction seems to be too expensive, /// so XID_CHECKPOINT_INTERVAL is used to forward align nextXid and so perform /// metadata checkpoint only once per XID_CHECKPOINT_INTERVAL transactions. /// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE /// in order to let CLOG_TRUNCATE mechanism correctly extend CLOG. const XID_CHECKPOINT_INTERVAL: u32 = 1024; pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo { (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo } pub fn XLogSegNoOffsetToRecPtr( segno: XLogSegNo, offset: u32, wal_segsz_bytes: usize, ) -> XLogRecPtr { segno * (wal_segsz_bytes as u64) + (offset as u64) } pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String { format!( "{:>08X}{:>08X}{:>08X}", tli, logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes), logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes) ) } pub fn XLogFromFileName( fname: &OsStr, wal_seg_size: usize, ) -> anyhow::Result<(XLogSegNo, TimeLineID)> { if let Some(fname_str) = fname.to_str() { let tli = u32::from_str_radix(&fname_str[0..8], 16)?; let log = u32::from_str_radix(&fname_str[8..16], 16)? as XLogSegNo; let seg = u32::from_str_radix(&fname_str[16..24], 16)? 
as XLogSegNo; Ok((log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)) } else { anyhow::bail!("non-ut8 filename: {:?}", fname); } } pub fn IsXLogFileName(fname: &OsStr) -> bool { if let Some(fname) = fname.to_str() { fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit()) } else { false } } pub fn IsPartialXLogFileName(fname: &OsStr) -> bool { if let Some(fname) = fname.to_str() { fname.ends_with(".partial") && IsXLogFileName(OsStr::new(&fname[0..fname.len() - 8])) } else { false } } /// If LSN points to the beginning of the page, then shift it to first record, /// otherwise align on 8-bytes boundary (required for WAL records) pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn { if lsn.0 % XLOG_BLCKSZ as u64 == 0 { let hdr_size = if lsn.0 % seg_sz as u64 == 0 { XLOG_SIZE_OF_XLOG_LONG_PHD } else { XLOG_SIZE_OF_XLOG_SHORT_PHD }; lsn + hdr_size as u64 } else { lsn.align() } } /// Generate a pg_control file, for a basebackup for starting up Postgres at the given LSN /// /// 'pg_control_bytes' and 'checkpoint_bytes' are the contents of those keys persisted in /// the pageserver. They use the same format as the PostgreSQL control file and the /// checkpoint record, but see walingest.rs for how exactly they are kept up to date. /// 'lsn' is the LSN at which we're starting up. /// /// Returns: /// - pg_control file contents /// - system_identifier, extracted from the persisted information /// - true, if we're starting up from a "clean shutdown", i.e. 
if there was a shutdown /// checkpoint at the given LSN pub fn generate_pg_control( pg_control_bytes: &[u8], checkpoint_bytes: &[u8], lsn: Lsn, ) -> anyhow::Result<(Bytes, u64, bool)> { let mut pg_control = ControlFileData::decode(pg_control_bytes)?; let mut checkpoint = CheckPoint::decode(checkpoint_bytes)?; // Generate new pg_control needed for bootstrap // // NB: In the checkpoint struct that we persist in the pageserver, we have a different // convention for the 'redo' field than in PostgreSQL: On a shutdown checkpoint, // 'redo' points the *end* of the checkpoint WAL record. On PostgreSQL, it points to // the beginning. Furthermore, on an online checkpoint, 'redo' is set to 0. // // We didn't always have this convention however, and old persisted records will have // old REDO values that point to some old LSN. // // The upshot is that if 'redo' is equal to the "current" LSN, there was a shutdown // checkpoint record at that point in WAL, with no new WAL records after it. That case // can be treated as starting from a clean shutdown. All other cases are treated as // non-clean shutdown. In Neon, we don't do WAL replay at startup in either case, so // that distinction doesn't matter very much. As of this writing, it only affects // whether the persisted pg_stats information can be used or not. // // In the Checkpoint struct in the returned pg_control file, the redo pointer is // always set to the LSN we're starting at, to hint that no WAL replay is required. // (There's some neon-specific code in Postgres startup to make that work, though. // Just setting the redo pointer is not sufficient.) let was_shutdown = Lsn(checkpoint.redo) == lsn; checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0; // We use DBState_DB_SHUTDOWNED even if it was not a clean shutdown. The // neon-specific code at postgres startup ignores the state stored in the control // file, similar to archive recovery in standalone PostgreSQL. 
Similarly, the // checkPoint pointer is ignored, so just set it to 0. pg_control.checkPoint = 0; pg_control.checkPointCopy = checkpoint; pg_control.state = DBState_DB_SHUTDOWNED; Ok((pg_control.encode(), pg_control.system_identifier, was_shutdown)) } pub fn get_current_timestamp() -> TimestampTz { to_pg_timestamp(SystemTime::now()) } // Module to reduce the scope of the constants mod timestamp_conversions { use std::time::Duration; use anyhow::Context; use super::*; const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1) const POSTGRES_EPOCH_JDATE: u64 = 2451545; // == date2j(2000, 1, 1) const SECS_PER_DAY: u64 = 86400; const USECS_PER_SEC: u64 = 1000000; const SECS_DIFF_UNIX_TO_POSTGRES_EPOCH: u64 = (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY; pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz { match time.duration_since(SystemTime::UNIX_EPOCH) { Ok(n) => { ((n.as_secs() - SECS_DIFF_UNIX_TO_POSTGRES_EPOCH) * USECS_PER_SEC + n.subsec_micros() as u64) as i64 } Err(_) => panic!("SystemTime before UNIX EPOCH!"), } } pub fn try_from_pg_timestamp(time: TimestampTz) -> anyhow::Result { let time: u64 = time .try_into() .context("timestamp before millenium (postgres epoch)")?; let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC; SystemTime::UNIX_EPOCH .checked_add(Duration::from_micros(since_unix_epoch)) .context("SystemTime overflow") } } pub use timestamp_conversions::{to_pg_timestamp, try_from_pg_timestamp}; // Returns (aligned) end_lsn of the last record in data_dir with WAL segments. // start_lsn must point to some previously known record boundary (beginning of // the next record). If no valid record after is found, start_lsn is returned // back. pub fn find_end_of_wal( data_dir: &Path, wal_seg_size: usize, start_lsn: Lsn, // start reading WAL at this point; must point at record start_lsn. 
) -> anyhow::Result { let mut result = start_lsn; let mut curr_lsn = start_lsn; let mut buf = [0u8; XLOG_BLCKSZ]; let pg_version = MY_PGVERSION; tracing::debug!("find_end_of_wal PG_VERSION: {}", pg_version); let mut decoder = WalStreamDecoder::new(start_lsn, pg_version); // loop over segments loop { let segno = curr_lsn.segment_number(wal_seg_size); let seg_file_name = XLogFileName(PG_TLI, segno, wal_seg_size); let seg_file_path = data_dir.join(seg_file_name); match open_wal_segment(&seg_file_path)? { None => { // no more segments tracing::debug!( "find_end_of_wal reached end at {:?}, segment {:?} doesn't exist", result, seg_file_path ); return Ok(result); } Some(mut segment) => { let seg_offs = curr_lsn.segment_offset(wal_seg_size); segment.seek(SeekFrom::Start(seg_offs as u64))?; // loop inside segment while curr_lsn.segment_number(wal_seg_size) == segno { let bytes_read = segment.read(&mut buf)?; if bytes_read == 0 { tracing::debug!( "find_end_of_wal reached end at {:?}, EOF in segment {:?} at offset {}", result, seg_file_path, curr_lsn.segment_offset(wal_seg_size) ); return Ok(result); } curr_lsn += bytes_read as u64; decoder.feed_bytes(&buf[0..bytes_read]); // advance result past all completely read records loop { match decoder.poll_decode() { Ok(Some(record)) => result = record.0, Err(e) => { tracing::debug!( "find_end_of_wal reached end at {:?}, decode error: {:?}", result, e ); return Ok(result); } Ok(None) => break, // need more data } } } } } } } // Open .partial or full WAL segment file, if present. 
fn open_wal_segment(seg_file_path: &Path) -> anyhow::Result> { let mut partial_path = seg_file_path.to_owned(); partial_path.set_extension("partial"); match File::open(partial_path) { Ok(file) => Ok(Some(file)), Err(e) => match e.kind() { ErrorKind::NotFound => { // .partial not found, try full match File::open(seg_file_path) { Ok(file) => Ok(Some(file)), Err(e) => match e.kind() { ErrorKind::NotFound => Ok(None), _ => Err(e.into()), }, } } _ => Err(e.into()), }, } } impl XLogRecord { pub fn from_slice(buf: &[u8]) -> Result { use utils::bin_ser::LeSer; XLogRecord::des(buf) } pub fn from_bytes(buf: &mut B) -> Result { use utils::bin_ser::LeSer; XLogRecord::des_from(&mut buf.reader()) } pub fn encode(&self) -> Result { use utils::bin_ser::LeSer; Ok(self.ser()?.into()) } // Is this record an XLOG_SWITCH record? They need some special processing, pub fn is_xlog_switch_record(&self) -> bool { self.xl_info == pg_constants::XLOG_SWITCH && self.xl_rmid == pg_constants::RM_XLOG_ID } } impl XLogPageHeaderData { pub fn from_bytes(buf: &mut B) -> Result { use utils::bin_ser::LeSer; XLogPageHeaderData::des_from(&mut buf.reader()) } pub fn encode(&self) -> Result { use utils::bin_ser::LeSer; self.ser().map(|b| b.into()) } } impl XLogLongPageHeaderData { pub fn from_bytes(buf: &mut B) -> Result { use utils::bin_ser::LeSer; XLogLongPageHeaderData::des_from(&mut buf.reader()) } pub fn encode(&self) -> Result { use utils::bin_ser::LeSer; self.ser().map(|b| b.into()) } } pub const SIZEOF_CHECKPOINT: usize = size_of::(); impl CheckPoint { pub fn encode(&self) -> Result { use utils::bin_ser::LeSer; Ok(self.ser()?.into()) } pub fn decode(buf: &[u8]) -> Result { use utils::bin_ser::LeSer; CheckPoint::des(buf) } /// Update next XID based on provided new_xid and stored epoch. /// Next XID should be greater than new_xid. This handles 32-bit /// XID wraparound correctly. /// /// Returns 'true' if the XID was updated. 
pub fn update_next_xid(&mut self, xid: u32) -> bool { // nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround. let mut new_xid = std::cmp::max( xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID, ); // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL. // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE new_xid = new_xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1); let full_xid = self.nextXid.value; let old_xid = full_xid as u32; if new_xid.wrapping_sub(old_xid) as i32 > 0 { let mut epoch = full_xid >> 32; if new_xid < old_xid { // wrap-around epoch += 1; } let nextXid = (epoch << 32) | new_xid as u64; if nextXid != self.nextXid.value { self.nextXid = FullTransactionId { value: nextXid }; return true; } } false } /// Advance next multi-XID/offset to those given in arguments. /// /// It's important that this handles wraparound correctly. This should match the /// MultiXactAdvanceNextMXact() logic in PostgreSQL's xlog_redo() function. /// /// Returns 'true' if the Checkpoint was updated. pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool { let mut modified = false; if multi_xid.wrapping_sub(self.nextMulti) as i32 > 0 { self.nextMulti = multi_xid; modified = true; } if multi_offset.wrapping_sub(self.nextMultiOffset) as i32 > 0 { self.nextMultiOffset = multi_offset; modified = true; } modified } } /// Generate new, empty WAL segment, with correct block headers at the first /// page of the segment and the page that contains the given LSN. /// We need this segment to start compute node. 
/// Build an in-memory WAL segment of `WAL_SEGMENT_SIZE` bytes that contains
/// only page headers; everything else is zero.
///
/// A long page header is always written at the start of the segment. When
/// `lsn` lies beyond the first page, a short page header is additionally
/// written at the start of the page that contains `lsn`. If `lsn` is not at
/// the very start of its page, that page's header is flagged
/// `XLP_FIRST_IS_CONTRECORD` and `xlp_rem_len` is set so that the pretend
/// continuation record ends exactly where `lsn` points.
///
/// Returns the frozen buffer, or the error produced while encoding a header.
pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result {
    let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE);

    let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);

    let page_off = lsn.block_offset();
    let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE);

    // True when `lsn` falls inside the first page of the segment, i.e. the long
    // header is the only page header that has to be produced.
    let first_page_only = seg_off < XLOG_BLCKSZ;
    // If the first record starts in the middle of the page, pretend in the page
    // header that there is a fake record which ends where the first real record
    // starts. This makes pg_waldump etc happy.
    let (shdr_rem_len, infoflags) = if first_page_only && seg_off > 0 {
        assert!(seg_off >= XLOG_SIZE_OF_XLOG_LONG_PHD);
        // xlp_rem_len doesn't include page header, hence the subtraction.
        (
            seg_off - XLOG_SIZE_OF_XLOG_LONG_PHD,
            pg_constants::XLP_FIRST_IS_CONTRECORD,
        )
    } else {
        (0, 0)
    };

    let hdr = XLogLongPageHeaderData {
        std: {
            XLogPageHeaderData {
                xlp_magic: XLOG_PAGE_MAGIC as u16,
                xlp_info: pg_constants::XLP_LONG_HEADER | infoflags,
                xlp_tli: PG_TLI,
                xlp_pageaddr: pageaddr,
                xlp_rem_len: shdr_rem_len as u32,
                ..Default::default() // Put 0 in padding fields.
            }
        },
        xlp_sysid: system_id,
        xlp_seg_size: WAL_SEGMENT_SIZE as u32,
        xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
    };

    let hdr_bytes = hdr.encode()?;
    seg_buf.extend_from_slice(&hdr_bytes);

    // Zero out the rest of the file.
    seg_buf.resize(WAL_SEGMENT_SIZE, 0);

    if !first_page_only {
        // `lsn` is on a later page: that page needs its own (short) header.
        let block_offset = lsn.page_offset_in_segment(WAL_SEGMENT_SIZE) as usize;
        // see comments above about XLP_FIRST_IS_CONTRECORD and xlp_rem_len.
        let (xlp_rem_len, xlp_info) = if page_off > 0 {
            assert!(page_off >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64);
            (
                (page_off - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64) as u32,
                pg_constants::XLP_FIRST_IS_CONTRECORD,
            )
        } else {
            (0, 0)
        };
        let header = XLogPageHeaderData {
            xlp_magic: XLOG_PAGE_MAGIC as u16,
            xlp_info,
            xlp_tli: PG_TLI,
            xlp_pageaddr: lsn.page_lsn().0,
            xlp_rem_len,
            ..Default::default() // Put 0 in padding fields.
        };
        let hdr_bytes = header.encode()?;

        // The short header must fit inside the segment and must not overwrite
        // the long header at offset 0.
        debug_assert!(seg_buf.len() > block_offset + hdr_bytes.len());
        debug_assert_ne!(block_offset, 0);

        seg_buf[block_offset..block_offset + hdr_bytes.len()].copy_from_slice(&hdr_bytes[..]);
    }

    Ok(seg_buf.freeze())
}

/// Fixed-size header of a logical message record, laid out with C
/// representation so it can be serialized field by field.
#[repr(C)]
#[derive(Serialize)]
pub struct XlLogicalMessage {
    pub db_id: Oid,
    pub transactional: uint32, // bool, takes 4 bytes due to alignment in C structures
    pub prefix_size: uint64,
    pub message_size: uint64,
}

impl XlLogicalMessage {
    /// Serialize the header with the little-endian `LeSer` serializer.
    /// Panics if serialization fails.
    pub fn encode(&self) -> Bytes {
        use utils::bin_ser::LeSer;
        self.ser().unwrap().into()
    }
}

/// Create new WAL record for non-transactional logical message.
/// Used for creating artificial WAL for tests, as LogicalMessage
/// record is basically no-op.
pub fn encode_logical_message(prefix: &str, message: &str) -> Bytes {
    // This function can take untrusted input, so discard any NUL bytes in the prefix string.
    let prefix = CString::new(prefix.replace('\0', "")).expect("no NULs");
    let message = message.as_bytes();
    // Take the first record the generator yields and encode it at LSN 0.
    LogicalMessageGenerator::new(&prefix, message)
        .next()
        .unwrap()
        .encode(Lsn(0))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Timestamps must survive a round trip through the Postgres
    /// representation, in both directions, at microsecond granularity.
    #[test]
    fn test_ts_conversion() {
        let now = SystemTime::now();
        let round_trip = try_from_pg_timestamp(to_pg_timestamp(now)).unwrap();

        let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap();
        let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap();
        assert_eq!(now_since.as_micros(), round_trip_since.as_micros());

        let now_pg = get_current_timestamp();
        let round_trip_pg = to_pg_timestamp(try_from_pg_timestamp(now_pg).unwrap());
        assert_eq!(now_pg, round_trip_pg);
    }

    // If you need to craft WAL and write tests for this module, put it at wal_craft crate.
}

================================================
FILE: libs/postgres_ffi/wal_craft/Cargo.toml
================================================
[package]
name = "wal_craft"
version = "0.1.0"
edition.workspace = true
license.workspace = true

[dependencies]
anyhow.workspace = true
clap.workspace = true
env_logger.workspace = true
log.workspace = true
postgres.workspace = true
postgres_ffi.workspace = true
camino-tempfile.workspace = true

[dev-dependencies]
regex.workspace = true
utils.workspace = true

================================================
FILE: libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
================================================
use std::path::PathBuf;
use std::str::FromStr;

use anyhow::*;
use clap::{Arg, ArgMatches, Command, value_parser};
use postgres::Client;
use postgres_ffi::PgMajorVersion;
use wal_craft::*;

/// Command-line entry point: parse the arguments built by `cli()` and run the
/// selected subcommand.
fn main() -> Result<()> {
    env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("wal_craft=info"))
        .init();
    let arg_matches = cli().get_matches();

    // Runs the crafter selected by the `type` argument against `client`, then
    // prints the intermediate LSNs it returned and the current insert LSN.
    let wal_craft = |arg_matches: &ArgMatches, client: &mut Client| {
        let intermediate_lsns = match arg_matches
            .get_one::("type")
            .map(|s| s.as_str())
            .context("'type' is required")?
        {
            Simple::NAME => Simple::craft(client)?,
            LastWalRecordXlogSwitch::NAME => LastWalRecordXlogSwitch::craft(client)?,
            LastWalRecordXlogSwitchEndsOnPageBoundary::NAME => {
                LastWalRecordXlogSwitchEndsOnPageBoundary::craft(client)?
            }
            WalRecordCrossingSegmentFollowedBySmallOne::NAME => {
                WalRecordCrossingSegmentFollowedBySmallOne::craft(client)?
            }
            LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
            a => panic!("Unknown --type argument: {a}"),
        };
        let end_of_wal_lsn = client.pg_current_wal_insert_lsn()?;
        for lsn in intermediate_lsns {
            println!("intermediate_lsn = {lsn}");
        }
        println!("end_of_wal = {end_of_wal_lsn}");
        Ok(())
    };

    match arg_matches.subcommand() {
        None => panic!("No subcommand provided"),
        Some(("print-postgres-config", _)) => {
            for cfg in REQUIRED_POSTGRES_CONFIG.iter() {
                println!("{cfg}");
            }
            Ok(())
        }

        // Create a fresh data directory, start a server on it, craft WAL and
        // kill the server again.
        Some(("with-initdb", arg_matches)) => {
            let cfg = Conf {
                pg_version: *arg_matches
                    .get_one::("pg-version")
                    .context("'pg-version' is required")?,
                pg_distrib_dir: arg_matches
                    .get_one::("pg-distrib-dir")
                    .context("'pg-distrib-dir' is required")?
                    .to_owned(),
                datadir: arg_matches
                    .get_one::("datadir")
                    .context("'datadir' is required")?
                    .to_owned(),
            };
            cfg.initdb()?;
            let srv = cfg.start_server()?;
            wal_craft(arg_matches, &mut srv.connect_with_timeout()?)?;
            srv.kill();
            Ok(())
        }
        // Craft WAL on an already running server reached via a connection string.
        Some(("in-existing", arg_matches)) => wal_craft(
            arg_matches,
            &mut postgres::Config::from_str(
                arg_matches
                    .get_one::("connection")
                    .context("'connection' is required")?,
            )
            .context(
                "'connection' argument value could not be parsed as a postgres connection string",
            )?
            .connect(postgres::NoTls)?,
        ),
        Some(_) => panic!("Unknown subcommand"),
    }
}

/// Build the clap command definition: the three subcommands handled by `main`
/// and their arguments.
fn cli() -> Command {
    // Shared by `with-initdb` and `in-existing`; only names of known crafters
    // are accepted.
    let type_arg = &Arg::new("type")
        .help("Type of WAL to craft")
        .value_parser([
            Simple::NAME,
            LastWalRecordXlogSwitch::NAME,
            LastWalRecordXlogSwitchEndsOnPageBoundary::NAME,
            WalRecordCrossingSegmentFollowedBySmallOne::NAME,
            LastWalRecordCrossingSegment::NAME,
        ])
        .required(true);

    Command::new("Postgres WAL crafter")
        .about("Crafts Postgres databases with specific WAL properties")
        .subcommand(
            Command::new("print-postgres-config")
                .about("Print the configuration required for PostgreSQL server before running this script")
        )
        .subcommand(
            Command::new("with-initdb")
                .about("Craft WAL in a new data directory first initialized with initdb")
                .arg(type_arg)
                .arg(
                    Arg::new("datadir")
                        .help("Data directory for the Postgres server")
                        .value_parser(value_parser!(PathBuf))
                        .required(true)
                )
                .arg(
                    Arg::new("pg-distrib-dir")
                        .long("pg-distrib-dir")
                        .value_parser(value_parser!(PathBuf))
                        .help("Directory with Postgres distributions (bin and lib directories, e.g. pg_install containing subpath `v14/bin/postgresql`)")
                        .default_value("/usr/local")
                )
                .arg(
                    Arg::new("pg-version")
                        .long("pg-version")
                        .help("Postgres version to use for the initial tenant")
                        .value_parser(value_parser!(u32))
                        .required(true)
                )
        )
        .subcommand(
            Command::new("in-existing")
                .about("Craft WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.")
                .arg(type_arg)
                .arg(
                    Arg::new("connection")
                        .help("Connection string to the Postgres database to populate")
                        .required(true)
                )
        )
}

/// Let clap validate the command definition.
#[test]
fn verify_cli() {
    cli().debug_assert();
}

================================================
FILE: libs/postgres_ffi/wal_craft/src/lib.rs
================================================
use std::ffi::OsStr;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::{Duration, Instant};

use anyhow::{bail, ensure};
use camino_tempfile::{Utf8TempDir, tempdir};
use log::*;
use postgres::Client;
use postgres::types::PgLsn;
use postgres_ffi::{
    PgMajorVersion, WAL_SEGMENT_SIZE, XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_LONG_PHD,
    XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
};

// Declares one module per Postgres version. Each re-exports that version's
// `wal_craft_test_export` items and, under `cfg(test)`, includes the shared
// `xlog_utils_test` module so the same tests run against every version.
macro_rules! xlog_utils_test {
    ($version:ident) => {
        #[path = "."]
        mod $version {
            #[allow(unused_imports)]
            pub use postgres_ffi::$version::wal_craft_test_export::*;
            #[allow(clippy::duplicate_mod)]
            #[cfg(test)]
            mod xlog_utils_test;
        }
    };
}

postgres_ffi::for_all_postgres_versions! { xlog_utils_test }

/// Where to find the Postgres binaries and which data directory to use.
pub struct Conf {
    pub pg_version: PgMajorVersion,
    pub pg_distrib_dir: PathBuf,
    pub datadir: PathBuf,
}

/// A Postgres server process spawned by `Conf::start_server`, together with
/// the temporary Unix socket directory it listens in and the client
/// configuration needed to connect to it.
pub struct PostgresServer {
    process: std::process::Child,
    _unix_socket_dir: Utf8TempDir,
    client_config: postgres::Config,
}

/// Settings passed to the server with `-c` by `start_server`; some of them are
/// re-checked by `ensure_server_config`.
pub static REQUIRED_POSTGRES_CONFIG: [&str; 4] = [
    "wal_keep_size=50MB",            // Ensure old WAL is not removed
    "shared_preload_libraries=neon", // can only be loaded at startup
    // Disable background processes as much as possible
    "wal_writer_delay=10s",
    "autovacuum=off",
];

impl Conf {
    /// Version-specific installation directory: `pg_distrib_dir` joined with
    /// the version's `v_str()`.
    pub fn pg_distrib_dir(&self) -> anyhow::Result {
        let path = self.pg_distrib_dir.clone();

        Ok(path.join(self.pg_version.v_str()))
    }

    /// `bin` subdirectory of the version-specific installation directory.
    fn pg_bin_dir(&self) -> anyhow::Result {
        Ok(self.pg_distrib_dir()?.join("bin"))
    }

    /// `lib` subdirectory of the version-specific installation directory.
    fn pg_lib_dir(&self) -> anyhow::Result {
        Ok(self.pg_distrib_dir()?.join("lib"))
    }

    /// `pg_wal` subdirectory of the data directory.
    pub fn wal_dir(&self) -> PathBuf {
        self.datadir.join("pg_wal")
    }

    /// Prepare a `Command` for the binary `command` from the Postgres `bin`
    /// directory. Fails if the binary does not exist.
    ///
    /// The environment is cleared; only the library search paths and the
    /// sanitizer options are set.
    fn new_pg_command(&self, command: impl AsRef) -> anyhow::Result {
        let path = self.pg_bin_dir()?.join(command);
        ensure!(path.exists(), "Command {:?} does not exist", path);
        let mut cmd = Command::new(path);
        cmd.env_clear()
            .env("LD_LIBRARY_PATH", self.pg_lib_dir()?)
            .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?)
            .env(
                "ASAN_OPTIONS",
                std::env::var("ASAN_OPTIONS").unwrap_or_default(),
            )
            .env(
                "UBSAN_OPTIONS",
                std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
            );
        Ok(cmd)
    }

    /// Run `initdb` for `datadir` with superuser "postgres", creating the
    /// parent directory first. Fails, including initdb's stdout and stderr in
    /// the error, if initdb exits unsuccessfully.
    pub fn initdb(&self) -> anyhow::Result<()> {
        if let Some(parent) = self.datadir.parent() {
            info!("Pre-creating parent directory {:?}", parent);
            // Tests may be run concurrently and there may be a race to create `test_output/`.
            // std::fs::create_dir_all is guaranteed to have no races with another thread creating directories.
            std::fs::create_dir_all(parent)?;
        }
        info!(
            "Running initdb in {:?} with user \"postgres\"",
            self.datadir
        );
        let output = self
            .new_pg_command("initdb")?
            .arg("--pgdata")
            .arg(&self.datadir)
            .args(["--username", "postgres", "--no-instructions", "--no-sync"])
            .output()?;
        debug!("initdb output: {:?}", output);
        ensure!(
            output.status.success(),
            "initdb failed, stdout and stderr follow:\n{}{}",
            String::from_utf8_lossy(&output.stdout),
            String::from_utf8_lossy(&output.stderr),
        );
        Ok(())
    }

    /// Spawn `postgres` on `datadir`, listening only on a Unix socket in a
    /// fresh temporary directory, with `REQUIRED_POSTGRES_CONFIG` applied.
    ///
    /// This returns right after spawning; use
    /// `PostgresServer::connect_with_timeout` to wait for the server to accept
    /// connections.
    pub fn start_server(&self) -> anyhow::Result {
        info!("Starting Postgres server in {:?}", self.datadir);
        let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols)
        let unix_socket_dir_path = unix_socket_dir.path().to_owned();
        let server_process = self
            .new_pg_command("postgres")?
            .args(["-c", "listen_addresses="])
            .arg("-k")
            .arg(&unix_socket_dir_path)
            .arg("-D")
            .arg(&self.datadir)
            .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg]))
            .spawn()?;
        let server = PostgresServer {
            process: server_process,
            _unix_socket_dir: unix_socket_dir,
            client_config: {
                let mut c = postgres::Config::new();
                c.host_path(&unix_socket_dir_path);
                c.user("postgres");
                c.connect_timeout(Duration::from_millis(10000));
                c
            },
        };
        Ok(server)
    }

    /// Run `pg_waldump` with the two segment names joined onto `datadir` as
    /// its arguments and return the captured process output. The exit status
    /// is not checked here.
    pub fn pg_waldump(
        &self,
        first_segment_name: &OsStr,
        last_segment_name: &OsStr,
    ) -> anyhow::Result {
        let first_segment_file = self.datadir.join(first_segment_name);
        let last_segment_file = self.datadir.join(last_segment_name);
        info!(
            "Running pg_waldump for {} .. {}",
            first_segment_file.display(),
            last_segment_file.display()
        );
        let output = self
            .new_pg_command("pg_waldump")?
            .args([&first_segment_file, &last_segment_file])
            .output()?;
        debug!("waldump output: {:?}", output);
        Ok(output)
    }
}

impl PostgresServer {
    /// Try to connect every 100 ms until the configured connect timeout has
    /// elapsed; fail with "Connection timed out" after that.
    pub fn connect_with_timeout(&self) -> anyhow::Result {
        // `start_server` always sets a connect timeout, so the unwrap holds for
        // servers created there.
        let retry_until = Instant::now() + *self.client_config.get_connect_timeout().unwrap();
        while Instant::now() < retry_until {
            if let Ok(client) = self.client_config.connect(postgres::NoTls) {
                return Ok(client);
            }
            std::thread::sleep(Duration::from_millis(100));
        }
        bail!("Connection timed out");
    }

    /// Kill the server process and wait for it to exit. Panics if either step
    /// fails.
    pub fn kill(mut self) {
        self.process.kill().unwrap();
        self.process.wait().unwrap();
    }
}

impl Drop for PostgresServer {
    /// Kill the server process if it has not exited yet, ignoring any error
    /// from the kill. The process is not waited for afterwards.
    fn drop(&mut self) {
        match self.process.try_wait() {
            Ok(Some(_)) => return,
            Ok(None) => {
                warn!("Server was not terminated, will be killed");
            }
            Err(e) => {
                error!("Unable to get status of the server: {}, will be killed", e);
            }
        }
        let _ = self.process.kill();
    }
}

/// Convenience queries for the current WAL positions of the connected server.
pub trait PostgresClientExt: postgres::GenericClient {
    /// Result of `SELECT pg_current_wal_insert_lsn()`.
    fn pg_current_wal_insert_lsn(&mut self) -> anyhow::Result {
        Ok(self
            .query_one("SELECT pg_current_wal_insert_lsn()", &[])?
            .get(0))
    }
    /// Result of `SELECT pg_current_wal_flush_lsn()`.
    fn pg_current_wal_flush_lsn(&mut self) -> anyhow::Result {
        Ok(self
            .query_one("SELECT pg_current_wal_flush_lsn()", &[])?
            .get(0))
    }
}

impl PostgresClientExt for C {}

/// Create the `neon_test_utils` extension if needed and verify that the server
/// runs with the settings the crafters rely on: `wal_keep_size`,
/// `wal_writer_delay`, `autovacuum`, and a `wal_segment_size` equal to
/// `WAL_SEGMENT_SIZE` bytes.
pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow::Result<()> {
    client.execute("create extension if not exists neon_test_utils", &[])?;

    let wal_keep_size: String = client.query_one("SHOW wal_keep_size", &[])?.get(0);
    ensure!(wal_keep_size == "50MB");
    let wal_writer_delay: String = client.query_one("SHOW wal_writer_delay", &[])?.get(0);
    ensure!(wal_writer_delay == "10s");
    let autovacuum: String = client.query_one("SHOW autovacuum", &[])?.get(0);
    ensure!(autovacuum == "off");

    let wal_segment_size = client.query_one(
        "select cast(setting as bigint) as setting, unit \
         from pg_settings where name = 'wal_segment_size'",
        &[],
    )?;
    ensure!(
        wal_segment_size.get::<_, String>("unit") == "B",
        "Unexpected wal_segment_size unit"
    );
    ensure!(
        wal_segment_size.get::<_, i64>("setting") == WAL_SEGMENT_SIZE as i64,
        "Unexpected wal_segment_size in bytes"
    );

    Ok(())
}

/// A named recipe for producing WAL with a particular shape.
pub trait Crafter {
    /// Name under which the crafter is selectable on the command line.
    const NAME: &'static str;

    /// Generates WAL using the client `client`. Returns a vector of some valid
    /// "interesting" intermediate LSNs which one may start reading from.
    /// test_end_of_wal uses this to check various starting points.
    ///
    /// Note that postgres is generally keen about writing some WAL. While we
    /// try to disable it (autovacuum, big wal_writer_delay, etc) it is always
    /// possible, e.g. xl_running_xacts are dumped each 15s. So checks about
    /// stable WAL end would be flaky unless postgres is shut down. For this
    /// reason returning potential end of WAL here is pointless. Most of the
    /// time this doesn't happen though, so it is reasonable to create needed
    /// WAL structure and immediately kill postgres like test_end_of_wal does.
    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result>;
}

/// Wraps some WAL craft function, providing current LSN to it before the
/// insertion and flushing WAL afterwards. Also pushes initial LSN to the
/// result.
fn craft_internal( client: &mut C, f: impl Fn(&mut C, PgLsn) -> anyhow::Result>, ) -> anyhow::Result> { ensure_server_config(client)?; let initial_lsn = client.pg_current_wal_insert_lsn()?; info!("LSN initial = {}", initial_lsn); let mut intermediate_lsns = f(client, initial_lsn)?; if !intermediate_lsns.starts_with(&[initial_lsn]) { intermediate_lsns.insert(0, initial_lsn); } // Some records may be not flushed, e.g. non-transactional logical messages. Flush now. // // If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn // returns the position just after the page header on the next page. That's where the next // record will be inserted. But the page header hasn't actually been written to the WAL // yet, and if you try to flush it, you get a "request to flush past end of generated WAL" // error. Because of that, if the insert location is just after a page header, back off to // previous page boundary. let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?); if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 { lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64; } else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 { lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64; } client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?; Ok(intermediate_lsns) } pub struct Simple; impl Crafter for Simple { const NAME: &'static str = "simple"; fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { craft_internal(client, |client, _| { client.execute("CREATE table t(x int)", &[])?; Ok(Vec::new()) }) } } pub struct LastWalRecordXlogSwitch; impl Crafter for LastWalRecordXlogSwitch { const NAME: &'static str = "last_wal_record_xlog_switch"; fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { // Do not use craft_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. 
ensure_server_config(client)?; client.execute("CREATE table t(x int)", &[])?; let before_xlog_switch = client.pg_current_wal_insert_lsn()?; // pg_switch_wal returns end of last record of the switched segment, // i.e. end of SWITCH itself. let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); let before_xlog_switch_u64 = u64::from(before_xlog_switch); let next_segment = PgLsn::from( before_xlog_switch_u64 - (before_xlog_switch_u64 % WAL_SEGMENT_SIZE as u64) + WAL_SEGMENT_SIZE as u64, ); ensure!( xlog_switch_record_end <= next_segment, "XLOG_SWITCH record ended after the expected segment boundary: {} > {}", xlog_switch_record_end, next_segment ); Ok(vec![before_xlog_switch, xlog_switch_record_end]) } } pub struct LastWalRecordXlogSwitchEndsOnPageBoundary; /// Craft xlog SWITCH record ending at page boundary. impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary"; fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { // Do not use generate_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. ensure_server_config(client)?; client.execute("CREATE table t(x int)", &[])?; // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. We // will use carefully-sized logical messages to advance WAL insert location such // that there is just enough space on the page for the XLOG_SWITCH record. loop { // We start with measuring how much WAL it takes for one logical message, // considering all alignments and headers. let before_lsn = client.pg_current_wal_insert_lsn()?; client.execute( "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))", &[], )?; let after_lsn = client.pg_current_wal_insert_lsn()?; // Did the record cross a page boundary? If it did, start over. 
Crossing a // page boundary adds to the apparent size of the record because of the page // header, which throws off the calculation. if u64::from(before_lsn) / XLOG_BLCKSZ as u64 != u64::from(after_lsn) / XLOG_BLCKSZ as u64 { continue; } // base_size is the size of a logical message without the payload let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10; // Is there enough space on the page for another logical message and an // XLOG_SWITCH? If not, start over. let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64; if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 { continue; } // We will write another logical message, such that after the logical message // record, there will be space for exactly one XLOG_SWITCH. How large should // the logical message's payload be? An XLOG_SWITCH record has no data => its // size is exactly XLOG_SIZE_OF_XLOG_RECORD. let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64; client.execute( "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", &[&(repeats as i32)], )?; info!( "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", client.pg_current_wal_insert_lsn()?, XLOG_SIZE_OF_XLOG_RECORD ); // Emit the XLOG_SWITCH let before_xlog_switch = client.pg_current_wal_insert_lsn()?; let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ != XLOG_SIZE_OF_XLOG_SHORT_PHD { warn!( "XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating", xlog_switch_record_end, u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ ); continue; } return Ok(vec![before_xlog_switch, xlog_switch_record_end]); } } } /// Write ~16MB logical message; it should cross WAL segment. 
fn craft_seg_size_logical_message( client: &mut impl postgres::GenericClient, transactional: bool, ) -> anyhow::Result> { craft_internal(client, |client, initial_lsn| { ensure!( initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024), "Initial LSN is too far in the future" ); let message_lsn: PgLsn = client .query_one( "select pg_logical_emit_message($1, 'big-16mb-msg', \ concat(repeat('abcd', 16 * 256 * 1024), 'end')) as message_lsn", &[&transactional], )? .get("message_lsn"); ensure!( message_lsn > PgLsn::from(0x0200_0000 + 4 * 8192), "Logical message did not cross the segment boundary" ); ensure!( message_lsn < PgLsn::from(0x0400_0000), "Logical message crossed two segments" ); Ok(vec![message_lsn]) }) } pub struct WalRecordCrossingSegmentFollowedBySmallOne; impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one"; fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { // Transactional message crossing WAL segment will be followed by small // commit record. craft_seg_size_logical_message(client, true) } } pub struct LastWalRecordCrossingSegment; impl Crafter for LastWalRecordCrossingSegment { const NAME: &'static str = "last_wal_record_crossing_segment"; fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { craft_seg_size_logical_message(client, false) } } ================================================ FILE: libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs ================================================ //! Tests for postgres_ffi xlog_utils module. Put it here to break cyclic dependency. 
use super::*; use crate::{error, info}; use regex::Regex; use std::cmp::min; use std::ffi::OsStr; use std::fs::{self, File}; use std::io::Write; use std::{env, str::FromStr}; use utils::const_assert; use utils::lsn::Lsn; fn init_logging() { let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(format!( "crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace" ))) .is_test(true) .try_init(); } /// Test that find_end_of_wal returns the same results as pg_dump on various /// WALs created by Crafter. fn test_end_of_wal(test_name: &str) { use crate::*; let pg_version = MY_PGVERSION; // Craft some WAL let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("..") .join("..") .join(".."); let cfg = Conf { pg_version, pg_distrib_dir: top_path.join("pg_install"), datadir: top_path.join(format!("test_output/{test_name}-{PG_MAJORVERSION}")), }; if cfg.datadir.exists() { fs::remove_dir_all(&cfg.datadir).unwrap(); } cfg.initdb().unwrap(); let srv = cfg.start_server().unwrap(); let intermediate_lsns = C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap(); let intermediate_lsns: Vec = intermediate_lsns .iter() .map(|&lsn| u64::from(lsn).into()) .collect(); // Kill postgres. Note that it might have inserted to WAL something after // 'craft' did its job. srv.kill(); // Check find_end_of_wal on the initial WAL let last_segment = cfg .wal_dir() .read_dir() .unwrap() .map(|f| f.unwrap().file_name()) .filter(|fname| IsXLogFileName(fname)) .max() .unwrap(); let expected_end_of_wal = find_pg_waldump_end_of_wal(&cfg, &last_segment); for start_lsn in intermediate_lsns .iter() .chain(std::iter::once(&expected_end_of_wal)) { // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`. // We assume that `start_lsn` is non-decreasing. 
info!( "Checking with start_lsn={}, erasing WAL before it", start_lsn ); for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() { let fname = file.file_name(); if !IsXLogFileName(&fname) { continue; } let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE).unwrap(); let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); if seg_start_lsn > u64::from(*start_lsn) { continue; } let mut f = File::options().write(true).open(file.path()).unwrap(); static ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE]; f.write_all( &ZEROS[0..min( WAL_SEGMENT_SIZE, (u64::from(*start_lsn) - seg_start_lsn) as usize, )], ) .unwrap(); } check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal); } } fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &OsStr) -> Lsn { // Get the actual end of WAL by pg_waldump let waldump_output = cfg .pg_waldump(OsStr::new("000000010000000000000001"), last_segment) .unwrap() .stderr; let waldump_output = std::str::from_utf8(&waldump_output).unwrap(); let caps = match Regex::new(r"invalid record length at (.+):") .unwrap() .captures(waldump_output) { Some(caps) => caps, None => { error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output); panic!(); } }; let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap(); info!("waldump erred on {}", waldump_wal_end); waldump_wal_end } fn check_end_of_wal( cfg: &crate::Conf, last_segment: &OsStr, start_lsn: Lsn, expected_end_of_wal: Lsn, ) { // Check end_of_wal on non-partial WAL segment (we treat it as fully populated) // let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap(); // info!( // "find_end_of_wal returned wal_end={} with non-partial WAL segment", // wal_end // ); // assert_eq!(wal_end, expected_end_of_wal_non_partial); // Rename file to partial to actually find last valid lsn, then rename it back. 
fs::rename( cfg.wal_dir().join(last_segment), cfg.wal_dir() .join(format!("{}.partial", last_segment.to_str().unwrap())), ) .unwrap(); let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap(); info!( "find_end_of_wal returned wal_end={} with partial WAL segment", wal_end ); assert_eq!(wal_end, expected_end_of_wal); fs::rename( cfg.wal_dir() .join(format!("{}.partial", last_segment.to_str().unwrap())), cfg.wal_dir().join(last_segment), ) .unwrap(); } const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024); #[test] pub fn test_find_end_of_wal_simple() { init_logging(); test_end_of_wal::("test_find_end_of_wal_simple"); } #[test] pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() { init_logging(); test_end_of_wal::( "test_find_end_of_wal_crossing_segment_followed_by_small_one", ); } #[test] pub fn test_find_end_of_wal_last_crossing_segment() { init_logging(); test_end_of_wal::( "test_find_end_of_wal_last_crossing_segment", ); } /// Check the math in update_next_xid /// /// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL, /// currently 1024. #[test] pub fn test_update_next_xid() { let checkpoint_buf = [0u8; size_of::()]; let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); checkpoint.nextXid = FullTransactionId { value: 10 }; assert_eq!(checkpoint.nextXid.value, 10); // The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL // boundary checkpoint.update_next_xid(100); assert_eq!(checkpoint.nextXid.value, 1024); // No change checkpoint.update_next_xid(500); assert_eq!(checkpoint.nextXid.value, 1024); checkpoint.update_next_xid(1023); assert_eq!(checkpoint.nextXid.value, 1024); // The function returns the *next* XID, given the highest XID seen so // far. So when we pass 1024, the nextXid gets bumped up to the next // XID_CHECKPOINT_INTERVAL boundary. 
checkpoint.update_next_xid(1024); assert_eq!(checkpoint.nextXid.value, 2048); } #[test] pub fn test_update_next_multixid() { let checkpoint_buf = [0u8; size_of::()]; let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); // simple case checkpoint.nextMulti = 20; checkpoint.nextMultiOffset = 20; checkpoint.update_next_multixid(1000, 2000); assert_eq!(checkpoint.nextMulti, 1000); assert_eq!(checkpoint.nextMultiOffset, 2000); // No change checkpoint.update_next_multixid(500, 900); assert_eq!(checkpoint.nextMulti, 1000); assert_eq!(checkpoint.nextMultiOffset, 2000); // Close to wraparound, but not wrapped around yet checkpoint.nextMulti = 0xffff0000; checkpoint.nextMultiOffset = 0xfffe0000; checkpoint.update_next_multixid(0xffff00ff, 0xfffe00ff); assert_eq!(checkpoint.nextMulti, 0xffff00ff); assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff); // Wraparound checkpoint.update_next_multixid(1, 900); assert_eq!(checkpoint.nextMulti, 1); assert_eq!(checkpoint.nextMultiOffset, 900); // Wraparound nextMulti to 0. // // It's a bit surprising that nextMulti can be 0, because that's a special value // (InvalidMultiXactId). However, that's how Postgres does it at multi-xid wraparound: // nextMulti wraps around to 0, but then when the next multi-xid is assigned, it skips // the 0 and the next multi-xid actually assigned is 1. 
checkpoint.nextMulti = 0xffff0000; checkpoint.nextMultiOffset = 0xfffe0000; checkpoint.update_next_multixid(0, 0xfffe00ff); assert_eq!(checkpoint.nextMulti, 0); assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff); // Wraparound nextMultiOffset to 0 checkpoint.update_next_multixid(0, 0); assert_eq!(checkpoint.nextMulti, 0); assert_eq!(checkpoint.nextMultiOffset, 0); } #[test] pub fn test_encode_logical_message() { let expected = [ 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, ]; let actual = encode_logical_message("prefix", "message"); assert_eq!(expected, actual[..]); } ================================================ FILE: libs/postgres_ffi_types/Cargo.toml ================================================ [package] name = "postgres_ffi_types" version = "0.1.0" edition.workspace = true license.workspace = true [dependencies] thiserror.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } [dev-dependencies] ================================================ FILE: libs/postgres_ffi_types/src/constants.rs ================================================ //! Misc constants, copied from PostgreSQL headers. //! //! Any constants included here must be the same in all PostgreSQL versions and unlikely to change //! in the future either! 
// From pg_tablespace_d.h pub const DEFAULTTABLESPACE_OID: u32 = 1663; pub const GLOBALTABLESPACE_OID: u32 = 1664; ================================================ FILE: libs/postgres_ffi_types/src/forknum.rs ================================================ // Fork numbers, from relpath.h pub const MAIN_FORKNUM: u8 = 0; pub const FSM_FORKNUM: u8 = 1; pub const VISIBILITYMAP_FORKNUM: u8 = 2; pub const INIT_FORKNUM: u8 = 3; #[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] pub enum FilePathError { #[error("invalid relation fork name")] InvalidForkName, #[error("invalid relation data file name")] InvalidFileName, } /// Convert Postgres relation file's fork suffix to fork number. pub fn forkname_to_number(forkname: Option<&str>) -> Result { match forkname { // "main" is not in filenames, it's implicit if the fork name is not present None => Ok(MAIN_FORKNUM), Some("fsm") => Ok(FSM_FORKNUM), Some("vm") => Ok(VISIBILITYMAP_FORKNUM), Some("init") => Ok(INIT_FORKNUM), Some(_) => Err(FilePathError::InvalidForkName), } } /// Convert Postgres fork number to the right suffix of the relation data file. pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> { match forknum { MAIN_FORKNUM => None, FSM_FORKNUM => Some("fsm"), VISIBILITYMAP_FORKNUM => Some("vm"), INIT_FORKNUM => Some("init"), _ => Some("UNKNOWN FORKNUM"), } } ================================================ FILE: libs/postgres_ffi_types/src/lib.rs ================================================ //! This package contains some PostgreSQL constants and datatypes that are the same in all versions //! of PostgreSQL and unlikely to change in the future either. These could be derived from the //! PostgreSQL headers with 'bindgen', but in order to avoid proliferating the dependency to bindgen //! and the PostgreSQL C headers to all services, we prefer to have this small stand-alone crate for //! them instead. //! //! Be mindful in what you add here, as these types are deeply ingrained in the APIs. 
pub mod constants; pub mod forknum; pub type Oid = u32; pub type RepOriginId = u16; pub type TimestampTz = i64; ================================================ FILE: libs/postgres_initdb/Cargo.toml ================================================ [package] name = "postgres_initdb" version = "0.1.0" edition.workspace = true license.workspace = true [dependencies] anyhow.workspace = true tokio.workspace = true camino.workspace = true thiserror.workspace = true postgres_versioninfo.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } ================================================ FILE: libs/postgres_initdb/src/lib.rs ================================================ //! The canonical way we run `initdb` in Neon. //! //! initdb has implicit defaults that are dependent on the environment, e.g., locales & collations. //! //! This module's job is to eliminate the environment-dependence as much as possible. use std::fmt; use camino::Utf8Path; use postgres_versioninfo::PgMajorVersion; pub struct RunInitdbArgs<'a> { pub superuser: &'a str, pub locale: &'a str, pub initdb_bin: &'a Utf8Path, pub pg_version: PgMajorVersion, pub library_search_path: &'a Utf8Path, pub pgdata: &'a Utf8Path, } #[derive(thiserror::Error, Debug)] pub enum Error { Spawn(std::io::Error), Failed { status: std::process::ExitStatus, stderr: Vec, }, WaitOutput(std::io::Error), Other(anyhow::Error), } impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Error::Spawn(e) => write!(f, "Error spawning command: {e:?}"), Error::Failed { status, stderr } => write!( f, "Command failed with status {:?}: {}", status, String::from_utf8_lossy(stderr) ), Error::WaitOutput(e) => write!(f, "Error waiting for command output: {e:?}"), Error::Other(e) => write!(f, "Error: {e:?}"), } } } pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> { let RunInitdbArgs { superuser, locale, initdb_bin: initdb_bin_path, pg_version, 
library_search_path, pgdata, } = args; let mut initdb_command = tokio::process::Command::new(initdb_bin_path); initdb_command .args(["--pgdata", pgdata.as_ref()]) .args(["--username", superuser]) .args(["--encoding", "utf8"]) .args(["--locale", locale]) .arg("--no-instructions") .arg("--no-sync") .env_clear() .env("LD_LIBRARY_PATH", library_search_path) .env("DYLD_LIBRARY_PATH", library_search_path) .env( "ASAN_OPTIONS", std::env::var("ASAN_OPTIONS").unwrap_or_default(), ) .env( "UBSAN_OPTIONS", std::env::var("UBSAN_OPTIONS").unwrap_or_default(), ) .stdin(std::process::Stdio::null()) // stdout invocation produces the same output every time, we don't need it .stdout(std::process::Stdio::null()) // we would be interested in the stderr output, if there was any .stderr(std::process::Stdio::piped()); // Before version 14, only the libc provide was available. if pg_version > PgMajorVersion::PG14 { // Version 17 brought with it a builtin locale provider which only provides // C and C.UTF-8. While being safer for collation purposes since it is // guaranteed to be consistent throughout a major release, it is also more // performant. let locale_provider = if pg_version >= PgMajorVersion::PG17 { "builtin" } else { "libc" }; initdb_command.args(["--locale-provider", locale_provider]); } let initdb_proc = initdb_command.spawn().map_err(Error::Spawn)?; // Ideally we'd select here with the cancellation token, but the problem is that // we can't safely terminate initdb: it launches processes of its own, and killing // initdb doesn't kill them. After we return from this function, we want the target // directory to be able to be cleaned up. 
// See https://github.com/neondatabase/neon/issues/6385 let initdb_output = initdb_proc .wait_with_output() .await .map_err(Error::WaitOutput)?; if !initdb_output.status.success() { return Err(Error::Failed { status: initdb_output.status, stderr: initdb_output.stderr, }); } Ok(()) } ================================================ FILE: libs/postgres_versioninfo/Cargo.toml ================================================ [package] name = "postgres_versioninfo" version = "0.1.0" edition = "2024" license.workspace = true [dependencies] anyhow.workspace = true thiserror.workspace = true serde.workspace = true serde_repr.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } ================================================ FILE: libs/postgres_versioninfo/src/lib.rs ================================================ use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde_repr::{Deserialize_repr, Serialize_repr}; use std::fmt::{Display, Formatter}; use std::str::FromStr; /// An enum with one variant for each major version of PostgreSQL that we support. /// #[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Deserialize_repr, Serialize_repr)] #[repr(u32)] pub enum PgMajorVersion { PG14 = 14, PG15 = 15, PG16 = 16, PG17 = 17, // !!! 
When you add a new PgMajorVersion, don't forget to update PgMajorVersion::ALL } /// A full PostgreSQL version ID, in MMmmbb numerical format (Major/minor/bugfix) #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] #[repr(transparent)] pub struct PgVersionId(u32); impl PgVersionId { pub const UNKNOWN: PgVersionId = PgVersionId(0); pub fn from_full_pg_version(version: u32) -> PgVersionId { match version { 0 => PgVersionId(version), // unknown version 140000..180000 => PgVersionId(version), _ => panic!("Invalid full PostgreSQL version ID {version}"), } } } impl Display for PgVersionId { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { u32::fmt(&self.0, f) } } impl Serialize for PgVersionId { fn serialize(&self, serializer: S) -> Result where S: Serializer, { u32::serialize(&self.0, serializer) } } impl<'de> Deserialize<'de> for PgVersionId { fn deserialize(deserializer: D) -> Result where D: Deserializer<'de>, { u32::deserialize(deserializer).map(PgVersionId) } fn deserialize_in_place(deserializer: D, place: &mut Self) -> Result<(), D::Error> where D: Deserializer<'de>, { u32::deserialize_in_place(deserializer, &mut place.0) } } impl PgMajorVersion { /// Get the numerical representation of the represented Major Version pub const fn major_version_num(&self) -> u32 { match self { PgMajorVersion::PG14 => 14, PgMajorVersion::PG15 => 15, PgMajorVersion::PG16 => 16, PgMajorVersion::PG17 => 17, } } /// Get the contents of this version's PG_VERSION file. /// /// The PG_VERSION file is used to determine the PostgreSQL version that currently /// owns the data in a PostgreSQL data directory. pub fn versionfile_string(&self) -> &'static str { match self { PgMajorVersion::PG14 => "14", PgMajorVersion::PG15 => "15", PgMajorVersion::PG16 => "16\x0A", PgMajorVersion::PG17 => "17\x0A", } } /// Get the v{version} string of this major PostgreSQL version. /// /// Because this was hand-coded in various places, this was moved into a shared /// implementation. 
pub fn v_str(&self) -> String { match self { PgMajorVersion::PG14 => "v14", PgMajorVersion::PG15 => "v15", PgMajorVersion::PG16 => "v16", PgMajorVersion::PG17 => "v17", } .to_string() } /// All currently supported major versions of PostgreSQL. pub const ALL: &'static [PgMajorVersion] = &[ PgMajorVersion::PG14, PgMajorVersion::PG15, PgMajorVersion::PG16, PgMajorVersion::PG17, ]; } impl Display for PgMajorVersion { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.write_str(match self { PgMajorVersion::PG14 => "PgMajorVersion::PG14", PgMajorVersion::PG15 => "PgMajorVersion::PG15", PgMajorVersion::PG16 => "PgMajorVersion::PG16", PgMajorVersion::PG17 => "PgMajorVersion::PG17", }) } } #[derive(Debug, thiserror::Error)] #[allow(dead_code)] pub struct InvalidPgVersion(u32); impl Display for InvalidPgVersion { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "InvalidPgVersion({})", self.0) } } impl TryFrom for PgMajorVersion { type Error = InvalidPgVersion; fn try_from(value: PgVersionId) -> Result { Ok(match value.0 / 10000 { 14 => PgMajorVersion::PG14, 15 => PgMajorVersion::PG15, 16 => PgMajorVersion::PG16, 17 => PgMajorVersion::PG17, _ => return Err(InvalidPgVersion(value.0)), }) } } impl From for PgVersionId { fn from(value: PgMajorVersion) -> Self { PgVersionId((value as u32) * 10000) } } #[derive(Debug, PartialEq, Eq, thiserror::Error)] pub struct PgMajorVersionParseError(String); impl Display for PgMajorVersionParseError { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "PgMajorVersionParseError({})", self.0) } } impl FromStr for PgMajorVersion { type Err = PgMajorVersionParseError; fn from_str(s: &str) -> Result { Ok(match s { "14" => PgMajorVersion::PG14, "15" => PgMajorVersion::PG15, "16" => PgMajorVersion::PG16, "17" => PgMajorVersion::PG17, _ => return Err(PgMajorVersionParseError(s.to_string())), }) } } ================================================ FILE: libs/posthog_client_lite/Cargo.toml 
================================================ [package] name = "posthog_client_lite" version = "0.1.0" edition = "2024" license.workspace = true [dependencies] anyhow.workspace = true arc-swap.workspace = true reqwest.workspace = true serde_json.workspace = true serde.workspace = true sha2.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } tokio-util.workspace = true tracing-utils.workspace = true tracing.workspace = true workspace_hack.workspace = true ================================================ FILE: libs/posthog_client_lite/src/background_loop.rs ================================================ //! A background loop that fetches feature flags from PostHog and updates the feature store. use std::{ sync::Arc, time::{Duration, SystemTime}, }; use arc_swap::ArcSwap; use tokio_util::sync::CancellationToken; use tracing::{Instrument, info_span}; use crate::{ CaptureEvent, FeatureStore, LocalEvaluationResponse, PostHogClient, PostHogClientConfig, }; /// A background loop that fetches feature flags from PostHog and updates the feature store. pub struct FeatureResolverBackgroundLoop { posthog_client: PostHogClient, feature_store: ArcSwap<(SystemTime, Arc)>, cancel: CancellationToken, } impl FeatureResolverBackgroundLoop { pub fn new(config: PostHogClientConfig, shutdown_pageserver: CancellationToken) -> Self { Self { posthog_client: PostHogClient::new(config), feature_store: ArcSwap::new(Arc::new(( SystemTime::UNIX_EPOCH, Arc::new(FeatureStore::new()), ))), cancel: shutdown_pageserver, } } /// Update the feature store with a new feature flag spec bypassing the normal refresh loop. 
pub fn update(&self, spec: String) -> anyhow::Result<()> { let resp: LocalEvaluationResponse = serde_json::from_str(&spec)?; self.update_feature_store_nofail(resp, "http_propagate"); Ok(()) } fn update_feature_store_nofail(&self, resp: LocalEvaluationResponse, source: &'static str) { let project_id = self.posthog_client.config.project_id.parse::().ok(); match FeatureStore::new_with_flags(resp.flags, project_id) { Ok(feature_store) => { self.feature_store .store(Arc::new((SystemTime::now(), Arc::new(feature_store)))); tracing::info!("Feature flag updated from {}", source); } Err(e) => { tracing::warn!("Cannot process feature flag spec from {}: {}", source, e); } } } pub fn spawn( self: Arc, handle: &tokio::runtime::Handle, refresh_period: Duration, fake_tenants: Vec, ) { let this = self.clone(); let cancel = self.cancel.clone(); // Main loop of updating the feature flags. handle.spawn( async move { tracing::info!( "Starting PostHog feature resolver with refresh period: {:?}", refresh_period ); let mut ticker = tokio::time::interval(refresh_period); ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); loop { tokio::select! { _ = ticker.tick() => {} _ = cancel.cancelled() => break } { let last_update = this.feature_store.load().0; if let Ok(elapsed) = last_update.elapsed() { if elapsed < refresh_period { tracing::debug!( "Skipping feature flag refresh because it's too soon" ); continue; } } } let resp = match this .posthog_client .get_feature_flags_local_evaluation() .await { Ok(resp) => resp, Err(e) => { tracing::warn!("Cannot get feature flags: {}", e); continue; } }; this.update_feature_store_nofail(resp, "refresh_loop"); } tracing::info!("PostHog feature resolver stopped"); } .instrument(info_span!("posthog_feature_resolver")), ); // Report fake tenants to PostHog so that we have the combination of all the properties in the UI. // Do one report per pageserver restart. 
let this = self.clone(); handle.spawn( async move { tracing::info!("Starting PostHog feature reporter"); for tenant in &fake_tenants { tracing::info!("Reporting fake tenant: {:?}", tenant); } if let Err(e) = this.posthog_client.capture_event_batch(&fake_tenants).await { tracing::warn!("Cannot report fake tenants: {}", e); } } .instrument(info_span!("posthog_feature_reporter")), ); } pub fn feature_store(&self) -> Arc { self.feature_store.load().1.clone() } } ================================================ FILE: libs/posthog_client_lite/src/lib.rs ================================================ //! A lite version of the PostHog client that only supports local evaluation of feature flags. mod background_loop; pub use background_loop::FeatureResolverBackgroundLoop; use std::collections::HashMap; use serde::{Deserialize, Serialize}; use serde_json::json; use sha2::Digest; #[derive(Debug, thiserror::Error)] pub enum PostHogEvaluationError { /// The feature flag is not available, for example, because the local evaluation data is not populated yet. #[error("Feature flag not available: {0}")] NotAvailable(String), #[error("No condition group is matched")] NoConditionGroupMatched, /// Real errors, e.g., the rollout percentage does not add up to 100. 
#[error("Failed to evaluate feature flag: {0}")]
    Internal(String),
}

impl PostHogEvaluationError {
    /// A short, stable identifier for the error variant (the message payload is dropped).
    pub fn as_variant_str(&self) -> &'static str {
        match self {
            PostHogEvaluationError::NotAvailable(_) => "not_available",
            PostHogEvaluationError::NoConditionGroupMatched => "no_condition_group_matched",
            PostHogEvaluationError::Internal(_) => "internal",
        }
    }
}

/// The part of the local-evaluation response that this crate reads.
#[derive(Deserialize)]
pub struct LocalEvaluationResponse {
    pub flags: Vec<LocalEvaluationFlag>,
}

#[derive(Deserialize)]
pub struct LocalEvaluationFlag {
    #[allow(dead_code)]
    id: u64,
    /// Compared against the configured project id in [`FeatureStore::set_flags`].
    team_id: u64,
    /// The name under which the flag is looked up.
    key: String,
    filters: LocalEvaluationFlagFilters,
    /// Inactive flags evaluate to [`PostHogEvaluationError::NotAvailable`].
    active: bool,
}

#[derive(Deserialize)]
pub struct LocalEvaluationFlagFilters {
    /// Condition groups, evaluated in order.
    groups: Vec<LocalEvaluationFlagFilterGroup>,
    /// Present for multivariate flags, absent for boolean flags.
    multivariate: Option<LocalEvaluationFlagMultivariate>,
}

#[derive(Deserialize)]
pub struct LocalEvaluationFlagFilterGroup {
    /// Variant override: when set, a user matching this group gets exactly this variant.
    variant: Option<String>,
    /// Conditions that must all hold for the group to match.
    properties: Option<Vec<LocalEvaluationFlagFilterProperty>>,
    rollout_percentage: i64,
}

#[derive(Deserialize)]
pub struct LocalEvaluationFlagFilterProperty {
    key: String,
    value: PostHogFlagFilterPropertyValue,
    /// One of the operators understood by `FeatureStore::evaluate_condition`
    /// (`exact`, `lt`, `gt`).
    operator: String,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(untagged)]
pub enum PostHogFlagFilterPropertyValue {
    String(String),
    Number(f64),
    Boolean(bool),
    List(Vec<String>),
}

#[derive(Deserialize)]
pub struct LocalEvaluationFlagMultivariate {
    variants: Vec<LocalEvaluationFlagMultivariateVariant>,
}

#[derive(Deserialize)]
pub struct LocalEvaluationFlagMultivariateVariant {
    key: String,
    rollout_percentage: i64,
}

/// The set of feature flags known locally, keyed by flag key.
pub struct FeatureStore {
    flags: HashMap<String, LocalEvaluationFlag>,
}

impl Default for FeatureStore {
    fn default() -> Self {
        Self::new()
    }
}

enum GroupEvaluationResult {
    MatchedAndOverride(String),
    MatchedAndEvaluate,
    Unmatched,
}

impl FeatureStore {
    /// Create an empty store; every evaluation against it yields `NotAvailable`.
    pub fn new() -> Self {
        Self {
            flags: HashMap::new(),
        }
    }

    /// Create a store from the given flags; see [`Self::set_flags`] for the checks applied.
    pub fn new_with_flags(
        flags: Vec<LocalEvaluationFlag>,
        project_id: Option<u64>,
    ) -> Result<Self, &'static str> {
        let mut store = Self::new();
        store.set_flags(flags, project_id)?;
        Ok(store)
    }

    /// Replace the contents of the store with `flags`.
    ///
    /// When `project_id` is given, every flag must belong to that project (`team_id`).
    /// Otherwise an error is returned and the store is left exactly as it was, so a spec
    /// from a wrong project can never end up partially installed. If two flags share a key
    /// the later one wins.
    pub fn set_flags(
        &mut self,
        flags: Vec<LocalEvaluationFlag>,
        project_id: Option<u64>,
    ) -> Result<(), &'static str> {
        // Validate the whole spec before touching `self.flags`.
        if let Some(project_id) = project_id {
            if flags.iter().any(|flag| flag.team_id != project_id) {
                return Err(
                    "Retrieved a spec with different project id, wrong config? Discarding the feature flags.",
                );
            }
        }
        self.flags = flags
            .into_iter()
            .map(|flag| (flag.key.clone(), flag))
            .collect();
        Ok(())
    }

    /// Generate a consistent hash for a user ID (e.g., tenant ID).
    ///
    /// The implementation is different from PostHog SDK. In PostHog SDK, it is sha1 of `user_id.distinct_id.salt`.
    /// However, as we do not upload all of our tenant IDs to PostHog, we do not have the PostHog distinct_id for a
    /// tenant. Therefore, the way we compute it is sha256 of `user_id.feature_id.salt`.
    ///
    /// The first 8 bytes of the digest are read as a little-endian `u64` and scaled by
    /// `u64::MAX`, so the result lies in `[0, 1]`.
    fn consistent_hash(user_id: &str, flag_key: &str, salt: &str) -> f64 {
        let mut hasher = sha2::Sha256::new();
        hasher.update(user_id);
        hasher.update(".");
        hasher.update(flag_key);
        hasher.update(".");
        hasher.update(salt);
        let hash = hasher.finalize();
        let hash_int = u64::from_le_bytes(hash[..8].try_into().unwrap());
        hash_int as f64 / u64::MAX as f64
    }

    /// Evaluate a condition. Returns an error if the condition cannot be evaluated due to parsing error or missing
    /// property.
    ///
    /// `provided` is the value supplied by the caller (left side), `requested` the value from
    /// the flag spec (right side). Supported operators:
    ///
    /// * `exact`: `provided` must be a string and `requested` a list; true if the list
    ///   contains the string.
    /// * `lt` / `gt`: `requested` must be a string that parses as a number; `provided` may be
    ///   a number or a string that parses as one.
    fn evaluate_condition(
        &self,
        operator: &str,
        provided: &PostHogFlagFilterPropertyValue,
        requested: &PostHogFlagFilterPropertyValue,
    ) -> Result<bool, PostHogEvaluationError> {
        match operator {
            "exact" => {
                let PostHogFlagFilterPropertyValue::String(provided) = provided else {
                    // Left should be a string
                    return Err(PostHogEvaluationError::Internal(format!(
                        "The left side of the condition is not a string: {provided:?}"
                    )));
                };
                let PostHogFlagFilterPropertyValue::List(requested) = requested else {
                    // Right should be a list of string
                    return Err(PostHogEvaluationError::Internal(format!(
                        "The right side of the condition is not a list: {requested:?}"
                    )));
                };
                Ok(requested.contains(provided))
            }
            "lt" | "gt" => {
                let PostHogFlagFilterPropertyValue::String(requested) = requested else {
                    // Right should be a string
                    return Err(PostHogEvaluationError::Internal(format!(
                        "The right side of the condition is not a string: {requested:?}"
                    )));
                };
                let Ok(requested) = requested.parse::<f64>() else {
                    return Err(PostHogEvaluationError::Internal(format!(
                        "Can not parse the right side of the condition as a number: {requested:?}"
                    )));
                };
                // Left can either be a number or a string
                let provided = match provided {
                    PostHogFlagFilterPropertyValue::Number(provided) => *provided,
                    PostHogFlagFilterPropertyValue::String(provided) => {
                        let Ok(provided) = provided.parse::<f64>() else {
                            return Err(PostHogEvaluationError::Internal(format!(
                                "Can not parse the left side of the condition as a number: {provided:?}"
                            )));
                        };
                        provided
                    }
                    _ => {
                        return Err(PostHogEvaluationError::Internal(format!(
                            "The left side of the condition is not a number or a string: {provided:?}"
                        )));
                    }
                };
                match operator {
                    "lt" => Ok(provided < requested),
                    "gt" => Ok(provided > requested),
                    op => Err(PostHogEvaluationError::Internal(format!(
                        "Unsupported operator: {op}"
                    ))),
                }
            }
            _ => Err(PostHogEvaluationError::Internal(format!(
                "Unsupported operator: {operator}"
            ))),
        }
    }

    /// Evaluate a percentage: true if the mapped user ID (in `[0, 1]`) is at most
    /// `percentage / 100`.
    fn evaluate_percentage(&self, mapped_user_id: f64, percentage: i64) -> bool {
        mapped_user_id <= percentage as f64 / 100.0
    }

    /// Evaluate a filter group for a feature flag. Returns an error if there are errors during the evaluation.
    ///
    /// Return values:
    /// Ok(GroupEvaluationResult::MatchedAndOverride(variant)): matched and evaluated to this value
    /// Ok(GroupEvaluationResult::MatchedAndEvaluate): condition matched but no variant override, use the global rollout percentage
    /// Ok(GroupEvaluationResult::Unmatched): condition unmatched
    fn evaluate_group(
        &self,
        group: &LocalEvaluationFlagFilterGroup,
        hash_on_group_rollout_percentage: f64,
        provided_properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
    ) -> Result<GroupEvaluationResult, PostHogEvaluationError> {
        if let Some(ref properties) = group.properties {
            for property in properties {
                if let Some(value) = provided_properties.get(&property.key) {
                    // The user provided the property value
                    if !self.evaluate_condition(
                        property.operator.as_ref(),
                        value,
                        &property.value,
                    )? {
                        return Ok(GroupEvaluationResult::Unmatched);
                    }
                } else {
                    // We cannot evaluate, the property is not available
                    return Err(PostHogEvaluationError::NotAvailable(format!(
                        "The required property in the condition is not available: {}",
                        property.key
                    )));
                }
            }
        }

        // The group has no condition matchers or we matched the properties
        if self.evaluate_percentage(hash_on_group_rollout_percentage, group.rollout_percentage) {
            if let Some(ref variant_override) = group.variant {
                Ok(GroupEvaluationResult::MatchedAndOverride(
                    variant_override.clone(),
                ))
            } else {
                Ok(GroupEvaluationResult::MatchedAndEvaluate)
            }
        } else {
            Ok(GroupEvaluationResult::Unmatched)
        }
    }

    /// Evaluate a multivariate feature flag. Returns an error if the flag is not available or if there are errors
    /// during the evaluation.
    ///
    /// The parsing logic is as follows:
    ///
    /// * Match each filter group.
    ///   - If a group is matched, it will first determine whether the user is in the range of the group's rollout
    ///     percentage. We will generate a consistent hash for the user ID on the group rollout percentage. This hash
    ///     is shared across all groups.
    ///   - If the hash falls within the group's rollout percentage, return the variant if it's overridden, or
    ///   - Evaluate the variant using the global config and the global rollout percentage.
    /// * Otherwise, continue with the next group until all groups are evaluated and no group is within the
    ///   rollout percentage.
    /// * If there are no matching groups, return an error.
    ///
    /// Example: we have a multivariate flag with 3 variants with the configured global rollout percentages:
    /// A (10%), B (20%), C (70%).
    /// There is a single group with a condition that has a rollout percentage of 10% and it does not have a variant override.
    /// Then, we will have 1% of the users evaluated to A, 2% to B, and 7% to C.
    ///
    /// Error handling: the caller should inspect the error and decide the behavior when a feature flag
    /// cannot be evaluated (i.e., default to false if it cannot be resolved). The error should *not* be
    /// propagated beyond where the feature flag gets resolved.
    pub fn evaluate_multivariate(
        &self,
        flag_key: &str,
        user_id: &str,
        properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
    ) -> Result<String, PostHogEvaluationError> {
        let hash_on_global_rollout_percentage =
            Self::consistent_hash(user_id, flag_key, "multivariate");
        let hash_on_group_rollout_percentage =
            Self::consistent_hash(user_id, flag_key, "within_group");
        self.evaluate_multivariate_inner(
            flag_key,
            hash_on_global_rollout_percentage,
            hash_on_group_rollout_percentage,
            properties,
        )
    }

    /// Evaluate a boolean feature flag. Returns an error if the flag is not available or if there are errors
    /// during the evaluation.
    ///
    /// The parsing logic is as follows:
    ///
    /// * Generate a consistent hash for the tenant-feature.
    /// * Match each filter group.
    ///   - If a group is matched, it will first determine whether the user is in the range of the rollout
    ///     percentage.
    ///   - If the hash falls within the group's rollout percentage, return true.
    /// * Otherwise, continue with the next group until all groups are evaluated and no group is within the
    ///   rollout percentage.
    /// * If there are no matching groups, return an error.
    ///
    /// Returns `Ok(())` if the feature flag evaluates to true. In the future, it will return a payload.
    ///
    /// Error handling: the caller should inspect the error and decide the behavior when a feature flag
    /// cannot be evaluated (i.e., default to false if it cannot be resolved). The error should *not* be
    /// propagated beyond where the feature flag gets resolved.
    pub fn evaluate_boolean(
        &self,
        flag_key: &str,
        user_id: &str,
        properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
    ) -> Result<(), PostHogEvaluationError> {
        let hash_on_global_rollout_percentage = Self::consistent_hash(user_id, flag_key, "boolean");
        self.evaluate_boolean_inner(flag_key, hash_on_global_rollout_percentage, properties)
    }

    /// Evaluate a multivariate feature flag. Note that we directly take the mapped user ID
    /// (a consistent hash ranging from 0 to 1) so that it is easier to use it in the tests
    /// and avoid duplicate computations.
    ///
    /// Use a different consistent hash for evaluating the group rollout percentage.
    /// The behavior: if the condition is set to rolling out to 10% of the users, and
    /// we set the variant A to 20% in the global config, then 2% of the total users will
    /// be evaluated to variant A.
    ///
    /// Note that the hash to determine group rollout percentage is shared across all groups. So if we have two
    /// exactly-the-same conditions with 10% and 20% rollout percentage respectively, a total of 20% of the users
    /// will be evaluated (versus 30% if group evaluation is done independently).
    pub(crate) fn evaluate_multivariate_inner(
        &self,
        flag_key: &str,
        hash_on_global_rollout_percentage: f64,
        hash_on_group_rollout_percentage: f64,
        properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
    ) -> Result<String, PostHogEvaluationError> {
        if let Some(flag_config) = self.flags.get(flag_key) {
            if !flag_config.active {
                return Err(PostHogEvaluationError::NotAvailable(format!(
                    "The feature flag is not active: {flag_key}"
                )));
            }
            let Some(ref multivariate) = flag_config.filters.multivariate else {
                return Err(PostHogEvaluationError::Internal(format!(
                    "No multivariate available, should use evaluate_boolean?: {flag_key}"
                )));
            };
            // TODO: sort the groups so that variant overrides always get evaluated first and it follows the PostHog
            // Python SDK behavior; for now we do not configure conditions without variant overrides in Neon so it
            // does not matter.
            for group in &flag_config.filters.groups {
                match self.evaluate_group(group, hash_on_group_rollout_percentage, properties)? {
                    GroupEvaluationResult::MatchedAndOverride(variant) => return Ok(variant),
                    GroupEvaluationResult::MatchedAndEvaluate => {
                        // Walk the cumulative rollout percentages of the variants and pick
                        // the first one whose upper bound covers the hash.
                        let mut percentage = 0;
                        for variant in &multivariate.variants {
                            percentage += variant.rollout_percentage;
                            if self
                                .evaluate_percentage(hash_on_global_rollout_percentage, percentage)
                            {
                                return Ok(variant.key.clone());
                            }
                        }
                        // This should not happen because the rollout percentage always adds up to 100, but just in case that PostHog
                        // returned invalid spec, we return an error.
                        return Err(PostHogEvaluationError::Internal(format!(
                            "Rollout percentage does not add up to 100: {flag_key}"
                        )));
                    }
                    GroupEvaluationResult::Unmatched => continue,
                }
            }
            // If no group is matched, the feature is not available, and up to the caller to decide what to do.
            Err(PostHogEvaluationError::NoConditionGroupMatched)
        } else {
            // The feature flag is not available yet
            Err(PostHogEvaluationError::NotAvailable(format!(
                "Not found in the local evaluation spec: {flag_key}"
            )))
        }
    }

    /// Evaluate a boolean feature flag. Note that we directly take the mapped user ID
    /// (a consistent hash ranging from 0 to 1) so that it is easier to use it in the tests
    /// and avoid duplicate computations.
    ///
    /// Returns `Ok(())` as soon as a group matches both its conditions and its rollout
    /// percentage. A matching group that carries a variant override is reported as an
    /// internal error, as is a flag that has a multivariate config.
    ///
    /// Note that the same hash is used for the rollout percentage of every group. So if we have two
    /// exactly-the-same conditions with 10% and 20% rollout percentage respectively, a total of 20% of the users
    /// will be evaluated (versus 30% if group evaluation is done independently).
pub(crate) fn evaluate_boolean_inner( &self, flag_key: &str, hash_on_global_rollout_percentage: f64, properties: &HashMap, ) -> Result<(), PostHogEvaluationError> { if let Some(flag_config) = self.flags.get(flag_key) { if !flag_config.active { return Err(PostHogEvaluationError::NotAvailable(format!( "The feature flag is not active: {flag_key}" ))); } if flag_config.filters.multivariate.is_some() { return Err(PostHogEvaluationError::Internal(format!( "This looks like a multivariate flag, should use evaluate_multivariate?: {flag_key}" ))); }; // TODO: sort the groups so that variant overrides always get evaluated first and it follows the PostHog // Python SDK behavior; for now we do not configure conditions without variant overrides in Neon so it // does not matter. for group in &flag_config.filters.groups { match self.evaluate_group(group, hash_on_global_rollout_percentage, properties)? { GroupEvaluationResult::MatchedAndOverride(_) => { return Err(PostHogEvaluationError::Internal(format!( "Boolean flag cannot have overrides: {flag_key}" ))); } GroupEvaluationResult::MatchedAndEvaluate => { return Ok(()); } GroupEvaluationResult::Unmatched => continue, } } // If no group is matched, the feature is not available, and up to the caller to decide what to do. Err(PostHogEvaluationError::NoConditionGroupMatched) } else { // The feature flag is not available yet Err(PostHogEvaluationError::NotAvailable(format!( "Not found in the local evaluation spec: {flag_key}" ))) } } /// Infer whether a feature flag is a boolean flag by checking if it has a multivariate filter. pub fn is_feature_flag_boolean(&self, flag_key: &str) -> Result { if let Some(flag_config) = self.flags.get(flag_key) { Ok(flag_config.filters.multivariate.is_none()) } else { Err(PostHogEvaluationError::NotAvailable(format!( "Not found in the local evaluation spec: {flag_key}" ))) } } } pub struct PostHogClientConfig { /// The server API key. pub server_api_key: String, /// The client API key. 
pub client_api_key: String,
    /// The project ID.
    pub project_id: String,
    /// The private API URL.
    pub private_api_url: String,
    /// The public API URL.
    pub public_api_url: String,
}

/// A lite PostHog client.
///
/// At the point of writing this code, PostHog does not have a functional Rust client with feature flag support.
/// This is a lite version that only supports local evaluation of feature flags and only supports those JSON specs
/// that will be used within Neon.
///
/// PostHog is designed as a browser-server system: the browser (client) side uses the client key and is exposed
/// to the end users; the server side uses a server key and is not exposed to the end users. The client and the
/// server has different API keys and provide a different set of APIs. In Neon, we only have the server (that is
/// pageserver), and it will use both the client API and the server API. So we need to store two API keys within
/// our PostHog client.
///
/// The server API is used to fetch the feature flag specs. The client API is used to capture events in case we
/// want to report the feature flag usage back to PostHog. The current plan is to use PostHog only as an UI to
/// configure feature flags so it is very likely that the client API will not be used.
pub struct PostHogClient {
    /// The config.
    config: PostHogClientConfig,
    /// The HTTP client.
    client: reqwest::Client,
}

/// One event as sent to the PostHog batch endpoint by [`PostHogClient::capture_event_batch`].
#[derive(Serialize, Debug)]
pub struct CaptureEvent {
    pub event: String,
    pub distinct_id: String,
    pub properties: serde_json::Value,
}

impl PostHogClient {
    /// Create a client with a default `reqwest::Client`. No request is made here.
    pub fn new(config: PostHogClientConfig) -> Self {
        let client = reqwest::Client::new();
        Self { config, client }
    }

    /// Create a client that talks to the US endpoints (`us.posthog.com` for the private API
    /// and `us.i.posthog.com` for the public API).
    pub fn new_with_us_region(
        server_api_key: String,
        client_api_key: String,
        project_id: String,
    ) -> Self {
        Self::new(PostHogClientConfig {
            server_api_key,
            client_api_key,
            project_id,
            private_api_url: "https://us.posthog.com".to_string(),
            public_api_url: "https://us.i.posthog.com".to_string(),
        })
    }

    /// Check if the server API key is a feature flag secure API key. This key can only be
    /// used to fetch the feature flag specs and can only be used on a undocumented API
    /// endpoint.
    fn is_feature_flag_secure_api_key(&self) -> bool {
        self.config.server_api_key.starts_with("phs_")
    }

    /// Get the raw JSON spec, same as `get_feature_flags_local_evaluation` but without parsing.
    ///
    /// Fails if the request cannot be sent, the body cannot be read, or the response status is
    /// not a success. In the last case the error contains the status and the body.
    pub async fn get_feature_flags_local_evaluation_raw(&self) -> anyhow::Result<String> {
        // BASE_URL/api/projects/:project_id/feature_flags/local_evaluation
        // with bearer token of self.server_api_key
        // OR
        // BASE_URL/api/feature_flag/local_evaluation/
        // with bearer token of feature flag specific self.server_api_key
        let url = if self.is_feature_flag_secure_api_key() {
            // The new feature local evaluation secure API token
            format!(
                "{}/api/feature_flag/local_evaluation",
                self.config.private_api_url
            )
        } else {
            // The old personal API token
            format!(
                "{}/api/projects/{}/feature_flags/local_evaluation",
                self.config.private_api_url, self.config.project_id
            )
        };
        let response = self
            .client
            .get(url)
            .bearer_auth(&self.config.server_api_key)
            .send()
            .await?;
        let status = response.status();
        let body = response.text().await?;
        if !status.is_success() {
            return Err(anyhow::anyhow!(
                "Failed to get feature flags: {}, {}",
                status,
                body
            ));
        }
        Ok(body)
    }

    /// Fetch the feature flag specs from the server.
    ///
    /// This is unfortunately an undocumented API.
    ///
    /// The handling logic in [`FeatureStore`] mostly follows the Python API implementation.
    /// See `_compute_flag_locally` in the PostHog Python SDK.
    pub async fn get_feature_flags_local_evaluation(
        &self,
    ) -> Result<LocalEvaluationResponse, anyhow::Error> {
        let raw = self.get_feature_flags_local_evaluation_raw().await?;
        Ok(serde_json::from_str(&raw)?)
    }

    /// Capture an event. This will only be used to report the feature flag usage back to PostHog, though
    /// it also supports a lot of other functionalities.
    ///
    /// Fails if the request cannot be sent, the body cannot be read, or the response status is
    /// not a success.
    pub async fn capture_event(
        &self,
        event: &str,
        distinct_id: &str,
        properties: &serde_json::Value,
    ) -> anyhow::Result<()> {
        // PUBLIC_URL/capture/
        let url = format!("{}/capture/", self.config.public_api_url);
        // NOTE(review): the body is JSON but no Content-Type header is set — confirm that
        // the endpoint does not require one.
        let response = self
            .client
            .post(url)
            .body(serde_json::to_string(&json!({
                "api_key": self.config.client_api_key,
                "distinct_id": distinct_id,
                "event": event,
                "properties": properties,
            }))?)
            .send()
            .await?;
        let status = response.status();
        let body = response.text().await?;
        if !status.is_success() {
            return Err(anyhow::anyhow!(
                "Failed to capture events: {}, {}",
                status,
                body
            ));
        }
        Ok(())
    }

    /// Capture several events with a single request to the batch endpoint.
    ///
    /// Fails under the same conditions as [`Self::capture_event`].
    pub async fn capture_event_batch(&self, events: &[CaptureEvent]) -> anyhow::Result<()> {
        // PUBLIC_URL/batch/
        let url = format!("{}/batch/", self.config.public_api_url);
        let response = self
            .client
            .post(url)
            .body(serde_json::to_string(&json!({
                "api_key": self.config.client_api_key,
                "batch": events,
            }))?)
            .send()
            .await?;
        let status = response.status();
        let body = response.text().await?;
        if !status.is_success() {
            return Err(anyhow::anyhow!(
                "Failed to capture events: {}, {}",
                status,
                body
            ));
        }
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn data() -> &'static str {
        r#"{ "flags": [ { "id": 141807, "team_id": 152860, "name": "", "key": "image-compaction-boundary", "filters": { "groups": [ { "variant": null, "properties": [ { "key": "plan_type", "type": "person", "value": [ "free" ], "operator": "exact" } ], "rollout_percentage": 40 }, { "variant": null, "properties": [], "rollout_percentage": 10 } ], "payloads": {}, "multivariate": null }, "deleted": false, "active": true, "ensure_experience_continuity": false, "has_encrypted_payloads": false, "version": 1 }, { "id": 135586, "team_id": 152860, "name": "", "key": "boolean-flag", "filters": { "groups": [ { "variant": null, "properties": [ { "key": "plan_type", "type": "person", "value": [ "free" ], "operator": "exact" } ], "rollout_percentage": 47 } ], "payloads": {}, "multivariate": null }, "deleted": false, "active": true, "ensure_experience_continuity": false, "has_encrypted_payloads": false, "version": 1 }, { "id": 132794, "team_id": 152860, "name": "", "key": "gc-compaction", "filters": { "groups": [ { "variant": "enabled-stage-2", "properties": [ { "key": "plan_type", "type": "person", "value": [ "free" ], "operator": "exact" }, { "key": "pageserver_remote_size", "type": "person", "value": "10000000", "operator": "lt" } ], "rollout_percentage": 50 }, { "properties": [ { "key": "plan_type", "type": "person", "value": [ "free" ], "operator": "exact" }, { "key": "pageserver_remote_size", "type": "person", "value": "10000000", "operator": "lt" } ], "rollout_percentage": 80 } ], "payloads": {}, "multivariate": { "variants": [ { "key": "disabled", "name": "", "rollout_percentage": 90 }, { "key": "enabled-stage-1", "name": "", "rollout_percentage": 10 }, { "key": "enabled-stage-2", "name": "", "rollout_percentage":
0 }, { "key": "enabled-stage-3", "name": "", "rollout_percentage": 0 }, { "key": "enabled", "name": "", "rollout_percentage": 0 } ] } }, "deleted": false, "active": true, "ensure_experience_continuity": false, "has_encrypted_payloads": false, "version": 7 } ], "group_type_mapping": {}, "cohorts": {} }"# } #[test] fn parse_local_evaluation() { let data = data(); let _: LocalEvaluationResponse = serde_json::from_str(data).unwrap(); } #[test] fn evaluate_multivariate() { let mut store = FeatureStore::new(); let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap(); store.set_flags(response.flags, None).unwrap(); // This lacks the required properties and cannot be evaluated. let variant = store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.40, &HashMap::new()); assert!(matches!( variant, Err(PostHogEvaluationError::NotAvailable(_)) ),); let properties_unmatched = HashMap::from([ ( "plan_type".to_string(), PostHogFlagFilterPropertyValue::String("paid".to_string()), ), ( "pageserver_remote_size".to_string(), PostHogFlagFilterPropertyValue::Number(1000.0), ), ]); // This does not match any group so there will be an error. let variant = store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.40, &properties_unmatched); assert!(matches!( variant, Err(PostHogEvaluationError::NoConditionGroupMatched) ),); let variant = store.evaluate_multivariate_inner("gc-compaction", 0.80, 0.80, &properties_unmatched); assert!(matches!( variant, Err(PostHogEvaluationError::NoConditionGroupMatched) ),); let properties = HashMap::from([ ( "plan_type".to_string(), PostHogFlagFilterPropertyValue::String("free".to_string()), ), ( "pageserver_remote_size".to_string(), PostHogFlagFilterPropertyValue::Number(1000.0), ), ]); // It matches the first group as 0.10 <= 0.50 and the properties are matched. Then it gets evaluated to the variant override. 
let variant = store.evaluate_multivariate_inner("gc-compaction", 0.10, 0.10, &properties); assert_eq!(variant.unwrap(), "enabled-stage-2".to_string()); // It matches the second group as 0.50 <= 0.60 <= 0.80 and the properties are matched. Then it gets evaluated using the global percentage. let variant = store.evaluate_multivariate_inner("gc-compaction", 0.99, 0.60, &properties); assert_eq!(variant.unwrap(), "enabled-stage-1".to_string()); let variant = store.evaluate_multivariate_inner("gc-compaction", 0.80, 0.60, &properties); assert_eq!(variant.unwrap(), "disabled".to_string()); // It matches the group conditions but not the group rollout percentage. let variant = store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.90, &properties); assert!(matches!( variant, Err(PostHogEvaluationError::NoConditionGroupMatched) ),); } #[test] fn evaluate_boolean_1() { // The `boolean-flag` feature flag only has one group that matches on the free user. let mut store = FeatureStore::new(); let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap(); store.set_flags(response.flags, None).unwrap(); // This lacks the required properties and cannot be evaluated. let variant = store.evaluate_boolean_inner("boolean-flag", 1.00, &HashMap::new()); assert!(matches!( variant, Err(PostHogEvaluationError::NotAvailable(_)) ),); let properties_unmatched = HashMap::from([ ( "plan_type".to_string(), PostHogFlagFilterPropertyValue::String("paid".to_string()), ), ( "pageserver_remote_size".to_string(), PostHogFlagFilterPropertyValue::Number(1000.0), ), ]); // This does not match any group so there will be an error. 
let variant = store.evaluate_boolean_inner("boolean-flag", 1.00, &properties_unmatched); assert!(matches!( variant, Err(PostHogEvaluationError::NoConditionGroupMatched) ),); let properties = HashMap::from([ ( "plan_type".to_string(), PostHogFlagFilterPropertyValue::String("free".to_string()), ), ( "pageserver_remote_size".to_string(), PostHogFlagFilterPropertyValue::Number(1000.0), ), ]); // It matches the first group as 0.10 <= 0.50 and the properties are matched. Then it gets evaluated to the variant override. let variant = store.evaluate_boolean_inner("boolean-flag", 0.10, &properties); assert!(variant.is_ok()); // It matches the group conditions but not the group rollout percentage. let variant = store.evaluate_boolean_inner("boolean-flag", 1.00, &properties); assert!(matches!( variant, Err(PostHogEvaluationError::NoConditionGroupMatched) ),); } #[test] fn evaluate_boolean_2() { // The `image-compaction-boundary` feature flag has one group that matches on the free user and a group that matches on all users. let mut store = FeatureStore::new(); let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap(); store.set_flags(response.flags, None).unwrap(); // This lacks the required properties and cannot be evaluated. let variant = store.evaluate_boolean_inner("image-compaction-boundary", 1.00, &HashMap::new()); assert!(matches!( variant, Err(PostHogEvaluationError::NotAvailable(_)) ),); let properties_unmatched = HashMap::from([ ( "plan_type".to_string(), PostHogFlagFilterPropertyValue::String("paid".to_string()), ), ( "pageserver_remote_size".to_string(), PostHogFlagFilterPropertyValue::Number(1000.0), ), ]); // This does not match the filtered group but the all user group. 
let variant = store.evaluate_boolean_inner("image-compaction-boundary", 1.00, &properties_unmatched); assert!(matches!( variant, Err(PostHogEvaluationError::NoConditionGroupMatched) ),); let variant = store.evaluate_boolean_inner("image-compaction-boundary", 0.05, &properties_unmatched); assert!(variant.is_ok()); let properties = HashMap::from([ ( "plan_type".to_string(), PostHogFlagFilterPropertyValue::String("free".to_string()), ), ( "pageserver_remote_size".to_string(), PostHogFlagFilterPropertyValue::Number(1000.0), ), ]); // It matches the first group as 0.30 <= 0.40 and the properties are matched. Then it gets evaluated to the variant override. let variant = store.evaluate_boolean_inner("image-compaction-boundary", 0.30, &properties); assert!(variant.is_ok()); // It matches the group conditions but not the group rollout percentage. let variant = store.evaluate_boolean_inner("image-compaction-boundary", 1.00, &properties); assert!(matches!( variant, Err(PostHogEvaluationError::NoConditionGroupMatched) ),); // It matches the second "all" group conditions. let variant = store.evaluate_boolean_inner("image-compaction-boundary", 0.09, &properties); assert!(variant.is_ok()); } } ================================================ FILE: libs/pq_proto/Cargo.toml ================================================ [package] name = "pq_proto" version = "0.1.0" edition.workspace = true license.workspace = true [dependencies] bytes.workspace = true byteorder.workspace = true itertools.workspace = true postgres-protocol.workspace = true rand.workspace = true tokio = { workspace = true, features = ["io-util"] } thiserror.workspace = true serde.workspace = true ================================================ FILE: libs/pq_proto/src/framed.rs ================================================ //! Provides `Framed` -- writing/flushing and reading Postgres messages to/from //! the async stream based on (and buffered with) BytesMut. All functions are //! cancellation safe. //! //! 
It is similar to what tokio_util::codec::Framed with appropriate codec //! provides, but `FramedReader` and `FramedWriter` read/write parts can be used //! separately without using split from futures::stream::StreamExt (which //! allocates a [Box] in polling internally). tokio::io::split is used for splitting //! instead. Plus we customize error messages more than a single type for all io //! calls. //! //! [Box]: https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107 use std::future::Future; use std::io::{self, ErrorKind}; use bytes::{Buf, BytesMut}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, ReadHalf, WriteHalf}; use crate::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; const INITIAL_CAPACITY: usize = 8 * 1024; /// Error on postgres connection: either IO (physical transport error) or /// protocol violation. #[derive(thiserror::Error, Debug)] pub enum ConnectionError { #[error(transparent)] Io(#[from] io::Error), #[error(transparent)] Protocol(#[from] ProtocolError), } impl ConnectionError { /// Proxy stream.rs uses only io::Error; provide it. pub fn into_io_error(self) -> io::Error { match self { ConnectionError::Io(io) => io, ConnectionError::Protocol(pe) => io::Error::other(pe.to_string()), } } } /// Wraps async io `stream`, providing messages to write/flush + read Postgres /// messages. pub struct Framed { pub stream: S, pub read_buf: BytesMut, pub write_buf: BytesMut, } impl Framed { pub fn new(stream: S) -> Self { Self { stream, read_buf: BytesMut::with_capacity(INITIAL_CAPACITY), write_buf: BytesMut::with_capacity(INITIAL_CAPACITY), } } /// Get a shared reference to the underlying stream. pub fn get_ref(&self) -> &S { &self.stream } /// Deconstruct into the underlying stream and read buffer. pub fn into_inner(self) -> (S, BytesMut) { (self.stream, self.read_buf) } /// Return new Framed with stream type transformed by async f, for TLS /// upgrade. 
pub async fn map_stream(self, f: F) -> Result, E> where F: FnOnce(S) -> Fut, Fut: Future>, { let stream = f(self.stream).await?; Ok(Framed { stream, read_buf: self.read_buf, write_buf: self.write_buf, }) } } impl Framed { pub async fn read_startup_message( &mut self, ) -> Result, ConnectionError> { read_message(&mut self.stream, &mut self.read_buf, FeStartupPacket::parse).await } pub async fn read_message(&mut self) -> Result, ConnectionError> { read_message(&mut self.stream, &mut self.read_buf, FeMessage::parse).await } } impl Framed { /// Write next message to the output buffer; doesn't flush. pub fn write_message(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> { BeMessage::write(&mut self.write_buf, msg) } /// Flush out the buffer. This function is cancellation safe: it can be /// interrupted and flushing will be continued in the next call. pub async fn flush(&mut self) -> Result<(), io::Error> { flush(&mut self.stream, &mut self.write_buf).await } /// Flush out the buffer and shutdown the stream. pub async fn shutdown(&mut self) -> Result<(), io::Error> { shutdown(&mut self.stream, &mut self.write_buf).await } } impl Framed { /// Split into owned read and write parts. Beware of potential issues with /// using halves in different tasks on TLS stream: /// pub fn split(self) -> (FramedReader, FramedWriter) { let (read_half, write_half) = tokio::io::split(self.stream); let reader = FramedReader { stream: read_half, read_buf: self.read_buf, }; let writer = FramedWriter { stream: write_half, write_buf: self.write_buf, }; (reader, writer) } /// Join read and write parts back. pub fn unsplit(reader: FramedReader, writer: FramedWriter) -> Self { Self { stream: reader.stream.unsplit(writer.stream), read_buf: reader.read_buf, write_buf: writer.write_buf, } } } /// Read-only version of `Framed`. 
pub struct FramedReader { stream: ReadHalf, read_buf: BytesMut, } impl FramedReader { pub async fn read_message(&mut self) -> Result, ConnectionError> { read_message(&mut self.stream, &mut self.read_buf, FeMessage::parse).await } } /// Write-only version of `Framed`. pub struct FramedWriter { stream: WriteHalf, write_buf: BytesMut, } impl FramedWriter { /// Write next message to the output buffer; doesn't flush. pub fn write_message_noflush(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> { BeMessage::write(&mut self.write_buf, msg) } /// Flush out the buffer. This function is cancellation safe: it can be /// interrupted and flushing will be continued in the next call. pub async fn flush(&mut self) -> Result<(), io::Error> { flush(&mut self.stream, &mut self.write_buf).await } /// Flush out the buffer and shutdown the stream. pub async fn shutdown(&mut self) -> Result<(), io::Error> { shutdown(&mut self.stream, &mut self.write_buf).await } } /// Read next message from the stream. Returns Ok(None), if EOF happened and we /// don't have remaining data in the buffer. This function is cancellation safe: /// you can drop future which is not yet complete and finalize reading message /// with the next call. /// /// Parametrized to allow reading startup or usual message, having different /// format. async fn read_message( stream: &mut S, read_buf: &mut BytesMut, parse: P, ) -> Result, ConnectionError> where P: Fn(&mut BytesMut) -> Result, ProtocolError>, { loop { if let Some(msg) = parse(read_buf)? { return Ok(Some(msg)); } // If we can't build a frame yet, try to read more data and try again. // Make sure we've got room for at least one byte to read to ensure // that we don't get a spurious 0 that looks like EOF. read_buf.reserve(1); if stream.read_buf(read_buf).await? 
== 0 { if read_buf.has_remaining() { return Err(io::Error::new( ErrorKind::UnexpectedEof, "EOF with unprocessed data in the buffer", ) .into()); } else { return Ok(None); // clean EOF } } } } /// Cancellation safe as long as the AsyncWrite is cancellation safe. async fn flush( stream: &mut S, write_buf: &mut BytesMut, ) -> Result<(), io::Error> { while write_buf.has_remaining() { let bytes_written = stream.write_buf(write_buf).await?; if bytes_written == 0 { return Err(io::Error::new( ErrorKind::WriteZero, "failed to write message", )); } } stream.flush().await } /// Cancellation safe as long as the AsyncWrite is cancellation safe. async fn shutdown( stream: &mut S, write_buf: &mut BytesMut, ) -> Result<(), io::Error> { flush(stream, write_buf).await?; stream.shutdown().await } ================================================ FILE: libs/pq_proto/src/lib.rs ================================================ //! Postgres protocol messages serialization-deserialization. See //! //! on message formats. #![deny(clippy::undocumented_unsafe_blocks)] pub mod framed; use std::borrow::Cow; use std::{fmt, io, str}; use byteorder::{BigEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use itertools::Itertools; // re-export for use in utils pageserver_feedback.rs pub use postgres_protocol::PG_EPOCH; use serde::{Deserialize, Serialize}; pub type Oid = u32; pub type SystemId = u64; pub const INT8_OID: Oid = 20; pub const INT4_OID: Oid = 23; pub const TEXT_OID: Oid = 25; #[derive(Debug)] pub enum FeMessage { // Simple query. Query(Bytes), // Extended query protocol. 
Parse(FeParseMessage),
    Describe(FeDescribeMessage),
    Bind(FeBindMessage),
    Execute(FeExecuteMessage),
    Close(FeCloseMessage),
    Sync,
    Terminate,
    CopyData(Bytes),
    CopyDone,
    CopyFail,
    PasswordMessage(Bytes),
}

/// Protocol version packed into a single `u32`: the major number occupies the
/// upper 16 bits and the minor number the lower 16 bits.
#[derive(Clone, Copy, PartialEq, PartialOrd)]
pub struct ProtocolVersion(u32);

impl ProtocolVersion {
    /// Pack `major` and `minor` into one version word.
    pub const fn new(major: u16, minor: u16) -> Self {
        let upper = (major as u32) << 16;
        let lower = minor as u32;
        Self(upper | lower)
    }

    /// Lower 16 bits of the packed value.
    pub const fn minor(self) -> u16 {
        self.0 as u16
    }

    /// Upper 16 bits of the packed value.
    pub const fn major(self) -> u16 {
        (self.0 >> 16) as u16
    }
}

impl fmt::Debug for ProtocolVersion {
    /// Formats the version as a two-element list: major first, then minor.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let (major, minor) = (self.major(), self.minor());
        let mut list = f.debug_list();
        list.entry(&major);
        list.entry(&minor);
        list.finish()
    }
}

/// The kinds of packet a client may send before the regular message flow.
#[derive(Debug)]
pub enum FeStartupPacket {
    CancelRequest(CancelKeyData),
    SslRequest {
        direct: bool,
    },
    GssEncRequest,
    StartupMessage {
        version: ProtocolVersion,
        params: StartupMessageParams,
    },
}

/// Accumulates startup parameters as consecutive NUL-terminated strings.
#[derive(Debug, Clone, Default)]
pub struct StartupMessageParamsBuilder {
    params: BytesMut,
}

impl StartupMessageParamsBuilder {
    /// Append a `name`/`value` pair, each followed by a NUL terminator.
    /// name and value must not contain a \0 byte
    pub fn insert(&mut self, name: &str, value: &str) {
        for part in [name, value] {
            self.params.put(part.as_bytes());
            self.params.put_u8(0);
        }
    }

    /// Finish building and hand out the immutable parameter buffer.
    pub fn freeze(self) -> StartupMessageParams {
        let params = self.params.freeze();
        StartupMessageParams { params }
    }
}

/// Startup parameters stored as alternating NUL-terminated names and values.
#[derive(Debug, Clone, Default)]
pub struct StartupMessageParams {
    pub params: Bytes,
}

impl StartupMessageParams {
    /// Get parameter's value by its name (first matching pair wins).
    pub fn get(&self, name: &str) -> Option<&str> {
        self.iter()
            .find(|(key, _)| *key == name)
            .map(|(_, value)| value)
    }

    /// Split command-line options according to PostgreSQL's logic,
    /// taking into account all escape sequences but leaving them as-is.
    /// [`None`] means that there's no `options` in [`Self`].
pub fn options_raw(&self) -> Option> { self.get("options").map(Self::parse_options_raw) } /// Split command-line options according to PostgreSQL's logic, /// applying all escape sequences (using owned strings as needed). /// [`None`] means that there's no `options` in [`Self`]. pub fn options_escaped(&self) -> Option>> { self.get("options").map(Self::parse_options_escaped) } /// Split command-line options according to PostgreSQL's logic, /// taking into account all escape sequences but leaving them as-is. pub fn parse_options_raw(input: &str) -> impl Iterator { // See `postgres: pg_split_opts`. let mut last_was_escape = false; input .split(move |c: char| { // We split by non-escaped whitespace symbols. let should_split = c.is_ascii_whitespace() && !last_was_escape; last_was_escape = c == '\\' && !last_was_escape; should_split }) .filter(|s| !s.is_empty()) } /// Split command-line options according to PostgreSQL's logic, /// applying all escape sequences (using owned strings as needed). pub fn parse_options_escaped(input: &str) -> impl Iterator> { // See `postgres: pg_split_opts`. Self::parse_options_raw(input).map(|s| { let mut preserve_next_escape = false; let escape = |c| { // We should remove '\\' unless it's preceded by '\\'. let should_remove = c == '\\' && !preserve_next_escape; preserve_next_escape = should_remove; should_remove }; match s.contains('\\') { true => Cow::Owned(s.replace(escape, "")), false => Cow::Borrowed(s), } }) } /// Iterate through key-value pairs in an arbitrary order. pub fn iter(&self) -> impl Iterator { let params = std::str::from_utf8(&self.params).expect("should be validated as utf8 already"); params.split_terminator('\0').tuples() } // This function is mostly useful in tests. 
#[doc(hidden)]
pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self {
    let mut b = StartupMessageParamsBuilder::default();
    for (k, v) in pairs {
        b.insert(k, v)
    }
    b.freeze()
}
}

/// Backend PID / secret key pair identifying a session for query cancellation.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
pub struct CancelKeyData {
    pub backend_pid: i32,
    pub cancel_key: i32,
}

/// Split a 64-bit id into a [`CancelKeyData`]: the upper 32 bits become
/// `backend_pid`, the lower 32 bits become `cancel_key`.
pub fn id_to_cancel_key(id: u64) -> CancelKeyData {
    CancelKeyData {
        backend_pid: (id >> 32) as i32,
        cancel_key: (id & 0xffffffff) as i32,
    }
}

impl fmt::Display for CancelKeyData {
    /// Prints the two halves re-packed into one 64-bit value, in hex.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The sign-extension bits of `backend_pid` are shifted out, and
        // `cancel_key` is masked, so the halves never overlap.
        let hi = (self.backend_pid as u64) << 32;
        let lo = (self.cancel_key as u64) & 0xffffffff;
        let id = hi | lo;

        // This format is more compact and might work better for logs.
        f.debug_tuple("CancelKeyData")
            .field(&format_args!("{id:x}"))
            .finish()
    }
}

use rand::distr::{Distribution, StandardUniform};

/// Allows drawing a random [`CancelKeyData`] from the standard distribution;
/// both fields are sampled independently.
impl Distribution<CancelKeyData> for StandardUniform {
    fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> CancelKeyData {
        CancelKeyData {
            backend_pid: rng.random(),
            cancel_key: rng.random(),
        }
    }
}

// We only support the simple case of Parse on unnamed prepared statement and
// no params
#[derive(Debug)]
pub struct FeParseMessage {
    pub query_string: Bytes,
}

#[derive(Debug)]
pub struct FeDescribeMessage {
    pub kind: u8, // 'S' to describe a prepared statement; or 'P' to describe a portal.
                  // we only support unnamed prepared stmt or portal
}

// we only support unnamed prepared stmt and portal
#[derive(Debug)]
pub struct FeBindMessage;

// we only support unnamed prepared stmt or portal
#[derive(Debug)]
pub struct FeExecuteMessage {
    /// max # of rows
    pub maxrows: i32,
}

// we only support unnamed prepared stmt and portal
#[derive(Debug)]
pub struct FeCloseMessage;

/// An error occurred while parsing or serializing raw stream into Postgres
/// messages.
#[derive(thiserror::Error, Debug)]
pub enum ProtocolError {
    /// Invalid packet was received from the client (e.g. unexpected message
    /// type or broken len).
#[error("Protocol error: {0}")]
    Protocol(String),
    /// Failed to parse or, (unlikely), serialize a protocol message.
    #[error("Message parse error: {0}")]
    BadMessage(String),
}

impl ProtocolError {
    /// Proxy stream.rs uses only io::Error; provide it.
    pub fn into_io_error(self) -> io::Error {
        io::Error::other(self.to_string())
    }
}

impl FeMessage {
    /// Read and parse one message from the `buf` input buffer. If there is at
    /// least one valid message, returns it, advancing `buf`; redundant copies
    /// are avoided, as thanks to `bytes` crate ptrs in parsed message point
    /// directly into the `buf` (processed data is garbage collected after
    /// parsed message is dropped).
    ///
    /// Returns `Ok(None)` if `buf` doesn't contain enough data for a single
    /// message. For efficiency, tries to reserve large enough space in `buf`
    /// for the next message in this case to save the repeated calls.
    ///
    /// Returns a [`ProtocolError`] if the message is malformed: a length
    /// field below 4, an unknown message tag, or a body rejected by the
    /// per-message parser.
    //
    // Inspired by rust-postgres Message::parse.
    pub fn parse(buf: &mut BytesMut) -> Result<Option<FeMessage>, ProtocolError> {
        // Every message contains message type byte and 4 bytes len; can't do
        // much without them.
        if buf.len() < 5 {
            let to_read = 5 - buf.len();
            buf.reserve(to_read);
            return Ok(None);
        }

        // We shouldn't advance `buf` as probably full message is not there yet,
        // so can't directly use Bytes::get_u32 etc.
        let tag = buf[0];
        // Cannot fail: the slice is exactly 4 bytes long.
        let len = (&buf[1..5]).read_u32::<BigEndian>().unwrap();
        if len < 4 {
            return Err(ProtocolError::Protocol(format!(
                "invalid message length {len}"
            )));
        }

        // length field includes itself, but not message type.
        let total_len = len as usize + 1;
        if buf.len() < total_len {
            // Don't have full message yet.
            let to_read = total_len - buf.len();
            buf.reserve(to_read);
            return Ok(None);
        }

        // got the message, advance buffer
        let mut msg = buf.split_to(total_len).freeze();
        msg.advance(5); // consume message type and len

        match tag {
            b'Q' => Ok(Some(FeMessage::Query(msg))),
            b'P' => Ok(Some(FeParseMessage::parse(msg)?)),
            b'D' => Ok(Some(FeDescribeMessage::parse(msg)?)),
            b'E' => Ok(Some(FeExecuteMessage::parse(msg)?)),
            b'B' => Ok(Some(FeBindMessage::parse(msg)?)),
            b'C' => Ok(Some(FeCloseMessage::parse(msg)?)),
            b'S' => Ok(Some(FeMessage::Sync)),
            b'X' => Ok(Some(FeMessage::Terminate)),
            b'd' => Ok(Some(FeMessage::CopyData(msg))),
            b'c' => Ok(Some(FeMessage::CopyDone)),
            b'f' => Ok(Some(FeMessage::CopyFail)),
            b'p' => Ok(Some(FeMessage::PasswordMessage(msg))),
            tag => Err(ProtocolError::Protocol(format!(
                "unknown message tag: {tag},'{msg:?}'"
            ))),
        }
    }
}

impl FeStartupPacket {
    /// Read and parse startup message from the `buf` input buffer. It is
    /// different from [`FeMessage::parse`] because startup messages don't have
    /// message type byte; otherwise, its comments apply.
pub fn parse(buf: &mut BytesMut) -> Result, ProtocolError> { /// const MAX_STARTUP_PACKET_LENGTH: usize = 10000; const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234; /// const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678); /// const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679); /// const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680); // // First byte indicates standard SSL handshake message // (It can't be a Postgres startup length because in network byte order // that would be a startup packet hundreds of megabytes long) if buf.first() == Some(&0x16) { return Ok(Some(FeStartupPacket::SslRequest { direct: true })); } // need at least 4 bytes with packet len if buf.len() < 4 { let to_read = 4 - buf.len(); buf.reserve(to_read); return Ok(None); } // We shouldn't advance `buf` as probably full message is not there yet, // so can't directly use Bytes::get_u32 etc. let len = (&buf[0..4]).read_u32::().unwrap() as usize; // The proposed replacement is `!(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len)` // which is less readable #[allow(clippy::manual_range_contains)] if len < 8 || len > MAX_STARTUP_PACKET_LENGTH { return Err(ProtocolError::Protocol(format!( "invalid startup packet message length {len}" ))); } if buf.len() < len { // Don't have full message yet. let to_read = len - buf.len(); buf.reserve(to_read); return Ok(None); } // got the message, advance buffer let mut msg = buf.split_to(len).freeze(); msg.advance(4); // consume len let request_code = ProtocolVersion(msg.get_u32()); // StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code. 
let message = match request_code { CANCEL_REQUEST_CODE => { if msg.remaining() != 8 { return Err(ProtocolError::BadMessage( "CancelRequest message is malformed, backend PID / secret key missing" .to_owned(), )); } FeStartupPacket::CancelRequest(CancelKeyData { backend_pid: msg.get_i32(), cancel_key: msg.get_i32(), }) } NEGOTIATE_SSL_CODE => { // Requested upgrade to SSL (aka TLS) FeStartupPacket::SslRequest { direct: false } } NEGOTIATE_GSS_CODE => { // Requested upgrade to GSSAPI FeStartupPacket::GssEncRequest } version if version.major() == RESERVED_INVALID_MAJOR_VERSION => { return Err(ProtocolError::Protocol(format!( "Unrecognized request code {}", version.minor() ))); } // TODO bail if protocol major_version is not 3? version => { // StartupMessage let s = str::from_utf8(&msg).map_err(|_e| { ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned()) })?; let s = s.strip_suffix('\0').ok_or_else(|| { ProtocolError::Protocol( "StartupMessage params: missing null terminator".to_string(), ) })?; FeStartupPacket::StartupMessage { version, params: StartupMessageParams { params: msg.slice_ref(s.as_bytes()), }, } } }; Ok(Some(message)) } } impl FeParseMessage { fn parse(mut buf: Bytes) -> Result { // FIXME: the rust-postgres driver uses a named prepared statement // for copy_out(). We're not prepared to handle that correctly. For // now, just ignore the statement name, assuming that the client never // uses more than one prepared statement at a time. 
let _pstmt_name = read_cstr(&mut buf)?;
let query_string = read_cstr(&mut buf)?;
if buf.remaining() < 2 {
    return Err(ProtocolError::BadMessage(
        "Parse message is malformed, nparams missing".to_string(),
    ));
}
let nparams = buf.get_i16();

if nparams != 0 {
    return Err(ProtocolError::BadMessage(
        "query params not implemented".to_string(),
    ));
}

Ok(FeMessage::Parse(FeParseMessage { query_string }))
}
}

impl FeDescribeMessage {
    /// Parse the body of a Describe message (type byte and length already
    /// consumed). Only the prepared-statement kind (`'S'`) is accepted.
    fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
        // `get_u8` panics on an empty buffer, and the body comes straight
        // from the client, so check before reading.
        if !buf.has_remaining() {
            return Err(ProtocolError::BadMessage(
                "Describe message is malformed, kind missing".to_string(),
            ));
        }
        let kind = buf.get_u8();
        let _pstmt_name = read_cstr(&mut buf)?;

        // FIXME: see FeParseMessage::parse
        if kind != b'S' {
            return Err(ProtocolError::BadMessage(
                "only prepared statemement Describe is implemented".to_string(),
            ));
        }

        Ok(FeMessage::Describe(FeDescribeMessage { kind }))
    }
}

impl FeExecuteMessage {
    /// Parse the body of an Execute message. Only the unnamed portal with no
    /// row limit (`maxrows == 0`) is accepted.
    fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
        let portal_name = read_cstr(&mut buf)?;
        if buf.remaining() < 4 {
            return Err(ProtocolError::BadMessage(
                "FeExecuteMessage message is malformed, maxrows missing".to_string(),
            ));
        }
        let maxrows = buf.get_i32();

        if !portal_name.is_empty() {
            return Err(ProtocolError::BadMessage(
                "named portals not implemented".to_string(),
            ));
        }
        if maxrows != 0 {
            return Err(ProtocolError::BadMessage(
                "row limit in Execute message not implemented".to_string(),
            ));
        }

        Ok(FeMessage::Execute(FeExecuteMessage { maxrows }))
    }
}

impl FeBindMessage {
    /// Parse the body of a Bind message. Only the unnamed portal is accepted;
    /// everything after the two names is ignored.
    fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
        let portal_name = read_cstr(&mut buf)?;
        let _pstmt_name = read_cstr(&mut buf)?;

        // FIXME: see FeParseMessage::parse
        if !portal_name.is_empty() {
            return Err(ProtocolError::BadMessage(
                "named portals not implemented".to_string(),
            ));
        }

        Ok(FeMessage::Bind(FeBindMessage))
    }
}

impl FeCloseMessage {
    /// Parse the body of a Close message. The kind byte and the name are
    /// validated for presence but otherwise ignored.
    fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
        // `get_u8` panics on an empty buffer, and the body comes straight
        // from the client, so check before reading.
        if !buf.has_remaining() {
            return Err(ProtocolError::BadMessage(
                "Close message is malformed, kind missing".to_string(),
            ));
        }
        let _kind = buf.get_u8();
        let _pstmt_or_portal_name = read_cstr(&mut buf)?;

        // FIXME: we do nothing with Close
        Ok(FeMessage::Close(FeCloseMessage))
    }
}

// Backend

#[derive(Debug)]
pub enum BeMessage<'a> {
    AuthenticationOk,
    AuthenticationMD5Password([u8; 4]),
AuthenticationSasl(BeAuthenticationSaslMessage<'a>),
    AuthenticationCleartextPassword,
    BackendKeyData(CancelKeyData),
    BindComplete,
    CommandComplete(&'a [u8]),
    CopyData(&'a [u8]),
    CopyDone,
    CopyFail,
    CopyInResponse,
    CopyOutResponse,
    CopyBothResponse,
    CloseComplete,
    // None means column is NULL
    DataRow(&'a [Option<&'a [u8]>]),
    // None errcode means internal_error will be sent.
    ErrorResponse(&'a str, Option<&'a [u8; 5]>),
    /// Single byte - used in response to SSLRequest/GSSENCRequest.
    EncryptionResponse(bool),
    NoData,
    ParameterDescription,
    ParameterStatus {
        name: &'a [u8],
        value: &'a [u8],
    },
    ParseComplete,
    ReadyForQuery,
    RowDescription(&'a [RowDescriptor<'a>]),
    XLogData(XLogDataBody<'a>),
    NoticeResponse(&'a str),
    NegotiateProtocolVersion {
        version: ProtocolVersion,
        options: &'a [&'a str],
    },
    KeepAlive(WalSndKeepAlive),
    /// Batch of interpreted, shard filtered WAL records,
    /// ready for the pageserver to ingest
    InterpretedWalRecords(InterpretedWalRecordsBody<'a>),
    /// Pre-built message: a type byte plus a body that is written out as-is
    /// (only the length prefix is added during serialization).
    Raw(u8, &'a [u8]),
}

/// Common shorthands.
impl<'a> BeMessage<'a> {
    /// A [`BeMessage::ParameterStatus`] holding the client encoding, i.e. UTF-8.
    /// This is a sensible default, given that:
    /// * rust strings only support this encoding out of the box.
    /// * tokio-postgres, postgres-jdbc (and probably more) mandate it.
    ///
    /// TODO: do we need to report `server_encoding` as well?
    pub const CLIENT_ENCODING: Self = Self::ParameterStatus {
        name: b"client_encoding",
        value: b"UTF8",
    };

    /// A [`BeMessage::ParameterStatus`] reporting `integer_datetimes` as `on`.
    pub const INTEGER_DATETIMES: Self = Self::ParameterStatus {
        name: b"integer_datetimes",
        value: b"on",
    };

    /// Build a [`BeMessage::ParameterStatus`] holding the server version.
    pub fn server_version(version: &'a str) -> Self {
        Self::ParameterStatus {
            name: b"server_version",
            value: version.as_bytes(),
        }
    }
}

/// Payload of [`BeMessage::AuthenticationSasl`]; each variant is serialized
/// with its own authentication sub-code.
#[derive(Debug)]
pub enum BeAuthenticationSaslMessage<'a> {
    // Sub-code 10: list of offered SASL mechanism names.
    Methods(&'a [&'a str]),
    // Sub-code 11: SASL continuation data.
    Continue(&'a [u8]),
    // Sub-code 12: final SASL data.
    Final(&'a [u8]),
}

// NOTE(review): no use of this enum is visible in this part of the file;
// presumably a typed alternative to `BeMessage::ParameterStatus` -- confirm.
#[derive(Debug)]
pub enum BeParameterStatusMessage<'a> {
    Encoding(&'a str),
    ServerVersion(&'a str),
}

// One row description in RowDescription packet.
#[derive(Debug)]
pub struct RowDescriptor<'a> {
    pub name: &'a [u8],
    pub tableoid: Oid,
    pub attnum: i16,
    pub typoid: Oid,
    pub typlen: i16,
    pub typmod: i32,
    pub formatcode: i16,
}

impl Default for RowDescriptor<'_> {
    /// An all-zero descriptor with an empty column name.
    fn default() -> RowDescriptor<'static> {
        RowDescriptor {
            name: b"",
            tableoid: 0,
            attnum: 0,
            typoid: 0,
            typlen: 0,
            typmod: 0,
            formatcode: 0,
        }
    }
}

impl RowDescriptor<'_> {
    /// Convenience function to create a RowDescriptor message for an int8 column
    pub const fn int8_col(name: &[u8]) -> RowDescriptor {
        RowDescriptor {
            name,
            tableoid: 0,
            attnum: 0,
            typoid: INT8_OID,
            typlen: 8,
            typmod: 0,
            formatcode: 0,
        }
    }

    /// Convenience function to create a RowDescriptor for a text column
    /// (`TEXT_OID`, with `typlen` of -1).
    pub const fn text_col(name: &[u8]) -> RowDescriptor {
        RowDescriptor {
            name,
            tableoid: 0,
            attnum: 0,
            typoid: TEXT_OID,
            typlen: -1,
            typmod: 0,
            formatcode: 0,
        }
    }
}

/// Payload of [`BeMessage::XLogData`]: a chunk of WAL plus its position.
#[derive(Debug)]
pub struct XLogDataBody<'a> {
    pub wal_start: u64,
    pub wal_end: u64, // current end of WAL on the server
    pub timestamp: i64,
    pub data: &'a [u8],
}

/// Payload of [`BeMessage::KeepAlive`].
#[derive(Debug)]
pub struct WalSndKeepAlive {
    pub wal_end: u64, // current end of WAL on the server
    pub timestamp: i64,
    pub request_reply: bool,
}

/// Batch of interpreted WAL records used in the interpreted
/// safekeeper to pageserver protocol.
///
/// Note that the pageserver uses the RawInterpretedWalRecordsBody
/// counterpart of this from the neondatabase/rust-postgres repo.
/// If you're changing this struct, you likely need to change its
/// twin as well.
#[derive(Debug)]
pub struct InterpretedWalRecordsBody<'a> {
    /// End of raw WAL in [`Self::data`]
    pub streaming_lsn: u64,
    /// Current end of WAL on the server
    pub commit_lsn: u64,
    pub data: &'a [u8],
}

/// A data row with a single column containing `hello world`.
pub static HELLO_WORLD_ROW: BeMessage = BeMessage::DataRow(&[Some(b"hello world")]);

// single text column
pub static SINGLE_COL_ROWDESC: BeMessage = BeMessage::RowDescription(&[RowDescriptor {
    name: b"data",
    tableoid: 0,
    attnum: 0,
    typoid: TEXT_OID,
    typlen: -1,
    typmod: 0,
    formatcode: 0,
}]);

/// Call f() to write body of the message and prepend it with 4-byte len as
/// prescribed by the protocol.
///
/// The length covers the 4 length bytes themselves plus everything `f`
/// appended. Whatever `f` returns is passed through to the caller.
///
/// Panics if the resulting length does not fit into an `i32`.
fn write_body<R>(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R {
    let base = buf.len();
    // Placeholder for the length, patched once the body size is known.
    buf.extend_from_slice(&[0; 4]);

    let res = f(buf);

    let size = i32::try_from(buf.len() - base).expect("message too big to transmit");
    (&mut buf[base..]).put_slice(&size.to_be_bytes());

    res
}

/// Safe write of s into buf as cstring (String in the protocol).
///
/// Fails with [`ProtocolError::BadMessage`] if `s` contains a zero byte;
/// nothing is written in that case.
fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), ProtocolError> {
    let bytes = s.as_ref();
    if bytes.contains(&0) {
        return Err(ProtocolError::BadMessage(
            "string contains embedded null".to_owned(),
        ));
    }
    buf.put_slice(bytes);
    buf.put_u8(0);
    Ok(())
}

/// Read cstring from buf, advancing it.
///
/// Returns the bytes before the first zero byte (terminator excluded) and
/// leaves `buf` positioned just after the terminator. Fails with
/// [`ProtocolError::BadMessage`] if `buf` contains no zero byte.
pub fn read_cstr(buf: &mut Bytes) -> Result<Bytes, ProtocolError> {
    let pos = buf
        .iter()
        .position(|x| *x == 0)
        .ok_or_else(|| ProtocolError::BadMessage("missing cstring terminator".to_owned()))?;
    let result = buf.split_to(pos);
    buf.advance(1); // drop the null terminator
    Ok(result)
}

pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
pub const SQLSTATE_ADMIN_SHUTDOWN: &[u8; 5] = b"57P01";
pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000";

impl BeMessage<'_> {
    /// Serialize `message` to the given `buf`.
    /// Apart from smart memory management, BytesMut is good here as msg len
    /// precedes its body and it is handy to write it down first and then fill
    /// the length.
With Write we would have to either calc it manually or have /// one more buffer. pub fn write(buf: &mut BytesMut, message: &BeMessage) -> Result<(), ProtocolError> { match message { BeMessage::Raw(code, data) => { buf.put_u8(*code); write_body(buf, |b| b.put_slice(data)) } BeMessage::AuthenticationOk => { buf.put_u8(b'R'); write_body(buf, |buf| { buf.put_i32(0); // Specifies that the authentication was successful. }); } BeMessage::AuthenticationCleartextPassword => { buf.put_u8(b'R'); write_body(buf, |buf| { buf.put_i32(3); // Specifies that clear text password is required. }); } BeMessage::AuthenticationMD5Password(salt) => { buf.put_u8(b'R'); write_body(buf, |buf| { buf.put_i32(5); // Specifies that an MD5-encrypted password is required. buf.put_slice(&salt[..]); }); } BeMessage::AuthenticationSasl(msg) => { buf.put_u8(b'R'); write_body(buf, |buf| { use BeAuthenticationSaslMessage::*; match msg { Methods(methods) => { buf.put_i32(10); // Specifies that SASL auth method is used. for method in methods.iter() { write_cstr(method, buf)?; } buf.put_u8(0); // zero terminator for the list } Continue(extra) => { buf.put_i32(11); // Continue SASL auth. buf.put_slice(extra); } Final(extra) => { buf.put_i32(12); // Send final SASL message. 
buf.put_slice(extra); } } Ok(()) })?; } BeMessage::BackendKeyData(key_data) => { buf.put_u8(b'K'); write_body(buf, |buf| { buf.put_i32(key_data.backend_pid); buf.put_i32(key_data.cancel_key); }); } BeMessage::BindComplete => { buf.put_u8(b'2'); write_body(buf, |_| {}); } BeMessage::CloseComplete => { buf.put_u8(b'3'); write_body(buf, |_| {}); } BeMessage::CommandComplete(cmd) => { buf.put_u8(b'C'); write_body(buf, |buf| write_cstr(cmd, buf))?; } BeMessage::CopyData(data) => { buf.put_u8(b'd'); write_body(buf, |buf| { buf.put_slice(data); }); } BeMessage::CopyDone => { buf.put_u8(b'c'); write_body(buf, |_| {}); } BeMessage::CopyFail => { buf.put_u8(b'f'); write_body(buf, |_| {}); } BeMessage::CopyInResponse => { buf.put_u8(b'G'); write_body(buf, |buf| { buf.put_u8(1); // copy_is_binary buf.put_i16(0); // numAttributes }); } BeMessage::CopyOutResponse => { buf.put_u8(b'H'); write_body(buf, |buf| { buf.put_u8(0); // copy_is_binary buf.put_i16(0); // numAttributes }); } BeMessage::CopyBothResponse => { buf.put_u8(b'W'); write_body(buf, |buf| { // doesn't matter, used only for replication buf.put_u8(0); // copy_is_binary buf.put_i16(0); // numAttributes }); } BeMessage::DataRow(vals) => { buf.put_u8(b'D'); write_body(buf, |buf| { buf.put_u16(vals.len() as u16); // num of cols for val_opt in vals.iter() { if let Some(val) = val_opt { buf.put_u32(val.len() as u32); buf.put_slice(val); } else { buf.put_i32(-1); } } }); } // ErrorResponse is a zero-terminated array of zero-terminated fields. // First byte of each field represents type of this field. Set just enough fields // to satisfy rust-postgres client: 'S' -- severity, 'C' -- error, 'M' -- error // message text. 
BeMessage::ErrorResponse(error_msg, pg_error_code) => { // 'E' signalizes ErrorResponse messages buf.put_u8(b'E'); write_body(buf, |buf| { buf.put_u8(b'S'); // severity buf.put_slice(b"ERROR\0"); buf.put_u8(b'C'); // SQLSTATE error code buf.put_slice(&terminate_code( pg_error_code.unwrap_or(SQLSTATE_INTERNAL_ERROR), )); buf.put_u8(b'M'); // the message write_cstr(error_msg, buf)?; buf.put_u8(0); // terminator Ok(()) })?; } // NoticeResponse has the same format as ErrorResponse. From doc: "The frontend should display the // message but continue listening for ReadyForQuery or ErrorResponse" BeMessage::NoticeResponse(error_msg) => { // For all the errors set Severity to Error and error code to // 'internal error'. // 'N' signalizes NoticeResponse messages buf.put_u8(b'N'); write_body(buf, |buf| { buf.put_u8(b'S'); // severity buf.put_slice(b"NOTICE\0"); buf.put_u8(b'C'); // SQLSTATE error code buf.put_slice(&terminate_code(SQLSTATE_INTERNAL_ERROR)); buf.put_u8(b'M'); // the message write_cstr(error_msg.as_bytes(), buf)?; buf.put_u8(0); // terminator Ok(()) })?; } BeMessage::NoData => { buf.put_u8(b'n'); write_body(buf, |_| {}); } BeMessage::EncryptionResponse(should_negotiate) => { let response = if *should_negotiate { b'S' } else { b'N' }; buf.put_u8(response); } BeMessage::ParameterStatus { name, value } => { buf.put_u8(b'S'); write_body(buf, |buf| { write_cstr(name, buf)?; write_cstr(value, buf) })?; } BeMessage::ParameterDescription => { buf.put_u8(b't'); write_body(buf, |buf| { // we don't support params, so always 0 buf.put_i16(0); }); } BeMessage::ParseComplete => { buf.put_u8(b'1'); write_body(buf, |_| {}); } BeMessage::ReadyForQuery => { buf.put_u8(b'Z'); write_body(buf, |buf| { buf.put_u8(b'I'); }); } BeMessage::RowDescription(rows) => { buf.put_u8(b'T'); write_body(buf, |buf| { buf.put_i16(rows.len() as i16); // # of fields for row in rows.iter() { write_cstr(row.name, buf)?; buf.put_i32(0); /* table oid */ buf.put_i16(0); /* attnum */ 
buf.put_u32(row.typoid); buf.put_i16(row.typlen); buf.put_i32(-1); /* typmod */ buf.put_i16(0); /* format code */ } Ok(()) })?; } BeMessage::XLogData(body) => { buf.put_u8(b'd'); write_body(buf, |buf| { buf.put_u8(b'w'); buf.put_u64(body.wal_start); buf.put_u64(body.wal_end); buf.put_i64(body.timestamp); buf.put_slice(body.data); }); } BeMessage::KeepAlive(req) => { buf.put_u8(b'd'); write_body(buf, |buf| { buf.put_u8(b'k'); buf.put_u64(req.wal_end); buf.put_i64(req.timestamp); buf.put_u8(u8::from(req.request_reply)); }); } BeMessage::NegotiateProtocolVersion { version, options } => { buf.put_u8(b'v'); write_body(buf, |buf| { buf.put_u32(version.0); buf.put_u32(options.len() as u32); for option in options.iter() { write_cstr(option, buf)?; } Ok(()) })? } BeMessage::InterpretedWalRecords(rec) => { // We use the COPY_DATA_TAG for our custom message // since this tag is interpreted as raw bytes. buf.put_u8(b'd'); write_body(buf, |buf| { buf.put_u8(b'0'); // matches INTERPRETED_WAL_RECORD_TAG in postgres-protocol // dependency buf.put_u64(rec.streaming_lsn); buf.put_u64(rec.commit_lsn); buf.put_slice(rec.data); }); } } Ok(()) } } fn terminate_code(code: &[u8; 5]) -> [u8; 6] { let mut terminated = [0; 6]; for (i, &elem) in code.iter().enumerate() { terminated[i] = elem; } terminated } #[cfg(test)] mod tests { use super::*; #[test] fn test_startup_message_params_options_escaped() { fn split_options(params: &StartupMessageParams) -> Vec> { params .options_escaped() .expect("options are None") .collect() } let make_params = |options| StartupMessageParams::new([("options", options)]); let params = StartupMessageParams::new([]); assert!(params.options_escaped().is_none()); let params = make_params(""); assert!(split_options(¶ms).is_empty()); let params = make_params("foo"); assert_eq!(split_options(¶ms), ["foo"]); let params = make_params(" foo bar "); assert_eq!(split_options(¶ms), ["foo", "bar"]); let params = make_params("foo\\ bar \\ \\\\ baz\\ lol"); 
assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]); } #[test] fn parse_fe_startup_packet_regression() { let data = [0, 0, 0, 7, 0, 0, 0, 0]; FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err(); } #[test] fn cancel_key_data() { let key = CancelKeyData { backend_pid: -1817212860, cancel_key: -1183897012, }; assert_eq!(format!("{key}"), "CancelKeyData(93af8844b96f2a4c)"); } } ================================================ FILE: libs/proxy/README.md ================================================ This directory contains libraries that are specific for proxy. Currently, it contains a significant fork/refactoring of rust-postgres that no longer reflects the API of the original library. Since it was so significant, it made sense to upgrade it to its own set of libraries. Proxy needs unique access to the protocol, which explains why such heavy modifications were necessary. ================================================ FILE: libs/proxy/json/Cargo.toml ================================================ [package] name = "json" version = "0.1.0" edition.workspace = true license.workspace = true [dependencies] ryu = "1" itoa = "1" [dev-dependencies] futures = "0.3" ================================================ FILE: libs/proxy/json/src/lib.rs ================================================ //! A JSON serialization lib, designed for more flexibility than `serde_json` offers. //! //! Features: //! //! ## Dynamic construction //! //! Sometimes you have dynamic values you want to serialize, that are not already in a serde-aware model like a struct or a Vec etc. //! To achieve this with serde, you need to implement a lot of different traits on a lot of different new-types. //! Because of this, it's often easier to give in and pull all the data into a serde-aware model (`serde_json::Value` or some intermediate struct), //! but that is often not very efficient. //! //!
This crate allows full control over the JSON encoding without needing to implement any extra traits. Just call the //! relevant functions, and it will guarantee a correctly encoded JSON value. //! //! ## Async construction //! //! Similar to the above, sometimes the values arrive asynchronously. Often collecting those values in memory //! is more expensive than writing them as JSON, since the overheads of `Vec` and `String` are much higher, however //! there are exceptions. //! //! Serializing to JSON all in one go is also more CPU intensive and can cause lag spikes, //! whereas serializing values incrementally spreads out the CPU load and reduces lag. //! //! ## Examples //! //! To represent the following JSON as a compact string //! //! ```json //! { //! "results": { //! "rows": [ //! { //! "id": 1, //! "value": null //! }, //! { //! "id": 2, //! "value": "hello" //! } //! ] //! } //! } //! ``` //! //! We can use the following code: //! //! ``` //! // create the outer object //! let s = json::value_to_string!(|v| json::value_as_object!(|v| { //! // create an entry with key "results" and start an object value associated with it. //! let results = v.key("results"); //! json::value_as_object!(|results| { //! // create an entry with key "rows" and start a list value associated with it. //! let rows = results.key("rows"); //! json::value_as_list!(|rows| { //! // create a list entry and start an object value associated with it. //! let row = rows.entry(); //! json::value_as_object!(|row| { //! // add entry "id": 1 //! row.entry("id", 1); //! // add entry "value": null //! row.entry("value", json::Null); //! }); //! //! // create a list entry and start an object value associated with it. //! let row = rows.entry(); //! json::value_as_object!(|row| { //! // add entry "id": 2 //! row.entry("id", 2); //! // add entry "value": "hello" //! row.entry("value", "hello"); //! }); //! }); //! }); //! })); //! //!
assert_eq!(s, r#"{"results":{"rows":[{"id":1,"value":null},{"id":2,"value":"hello"}]}}"#); //! ``` mod macros; mod str; mod value; pub use value::{Null, ValueEncoder}; #[must_use] /// Serialize a single json value. pub struct ValueSer<'buf> { buf: &'buf mut Vec, start: usize, } impl<'buf> ValueSer<'buf> { /// Create a new json value serializer. pub fn new(buf: &'buf mut Vec) -> Self { Self { buf, start: 0 } } /// Borrow the underlying buffer pub fn as_buffer(&self) -> &[u8] { self.buf } #[inline] pub fn value(self, e: impl ValueEncoder) { e.encode(self); } /// Write raw bytes to the buf. This must be already JSON encoded. #[inline] pub fn write_raw_json(self, data: &[u8]) { self.buf.extend_from_slice(data); self.finish(); } /// Start a new object serializer. #[inline] pub fn object(self) -> ObjectSer<'buf> { ObjectSer::new(self) } /// Start a new list serializer. #[inline] pub fn list(self) -> ListSer<'buf> { ListSer::new(self) } /// Finish the value ser. #[inline] fn finish(self) { // don't trigger the drop handler which triggers a rollback. // this won't cause memory leaks because `ValueSet` owns no allocations. std::mem::forget(self); } } impl Drop for ValueSer<'_> { fn drop(&mut self) { self.buf.truncate(self.start); } } #[must_use] /// Serialize a json object. pub struct ObjectSer<'buf> { value: ValueSer<'buf>, start: usize, } impl<'buf> ObjectSer<'buf> { /// Start a new object serializer. #[inline] pub fn new(value: ValueSer<'buf>) -> Self { value.buf.push(b'{'); let start = value.buf.len(); Self { value, start } } /// Borrow the underlying buffer pub fn as_buffer(&self) -> &[u8] { self.value.as_buffer() } /// Start a new object entry with the given string key, returning a [`ValueSer`] for the associated value. #[inline] pub fn key(&mut self, key: impl KeyEncoder) -> ValueSer<'_> { key.write_key(self) } /// Write an entry (key-value pair) to the object. 
#[inline] pub fn entry(&mut self, key: impl KeyEncoder, val: impl ValueEncoder) { self.key(key).value(val); } #[inline] fn entry_inner(&mut self, f: impl FnOnce(&mut Vec)) -> ValueSer<'_> { // track before the separator so we the value is rolled back it also removes the separator. let start = self.value.buf.len(); // push separator if necessary if self.value.buf.len() > self.start { self.value.buf.push(b','); } // push key f(self.value.buf); // push value separator self.value.buf.push(b':'); // return value writer. ValueSer { buf: self.value.buf, start, } } /// Reset the buffer back to before this object was started. #[inline] pub fn rollback(self) -> ValueSer<'buf> { // Do not fully reset the value, only reset it to before the `{`. // This ensures any `,` before this value are not clobbered. self.value.buf.truncate(self.start - 1); self.value } /// Finish the object ser. #[inline] pub fn finish(self) { self.value.buf.push(b'}'); self.value.finish(); } } pub trait KeyEncoder { fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a>; } #[must_use] /// Serialize a json object. pub struct ListSer<'buf> { value: ValueSer<'buf>, start: usize, } impl<'buf> ListSer<'buf> { /// Start a new list serializer. #[inline] pub fn new(value: ValueSer<'buf>) -> Self { value.buf.push(b'['); let start = value.buf.len(); Self { value, start } } /// Borrow the underlying buffer pub fn as_buffer(&self) -> &[u8] { self.value.as_buffer() } /// Write an value to the list. #[inline] pub fn push(&mut self, val: impl ValueEncoder) { self.entry().value(val); } /// Start a new value entry in this list. #[inline] pub fn entry(&mut self) -> ValueSer<'_> { // track before the separator so we the value is rolled back it also removes the separator. let start = self.value.buf.len(); // push separator if necessary if self.value.buf.len() > self.start { self.value.buf.push(b','); } // return value writer. 
ValueSer { buf: self.value.buf, start, } } /// Reset the buffer back to before this object was started. #[inline] pub fn rollback(self) -> ValueSer<'buf> { // Do not fully reset the value, only reset it to before the `[`. // This ensures any `,` before this value are not clobbered. self.value.buf.truncate(self.start - 1); self.value } /// Finish the object ser. #[inline] pub fn finish(self) { self.value.buf.push(b']'); self.value.finish(); } } #[cfg(test)] mod tests { use crate::{Null, ValueSer}; #[test] fn object() { let mut buf = vec![]; let mut object = ValueSer::new(&mut buf).object(); object.entry("foo", "bar"); object.entry("baz", Null); object.finish(); assert_eq!(buf, br#"{"foo":"bar","baz":null}"#); } #[test] fn list() { let mut buf = vec![]; let mut list = ValueSer::new(&mut buf).list(); list.entry().value("bar"); list.entry().value(Null); list.finish(); assert_eq!(buf, br#"["bar",null]"#); } #[test] fn object_macro() { let res = crate::value_to_string!(|obj| { crate::value_as_object!(|obj| { obj.entry("foo", "bar"); obj.entry("baz", Null); }) }); assert_eq!(res, r#"{"foo":"bar","baz":null}"#); } #[test] fn list_macro() { let res = crate::value_to_string!(|list| { crate::value_as_list!(|list| { list.entry().value("bar"); list.entry().value(Null); }) }); assert_eq!(res, r#"["bar",null]"#); } #[test] fn rollback_on_drop() { let res = crate::value_to_string!(|list| { crate::value_as_list!(|list| { list.entry().value("bar"); 'cancel: { let nested_list = list.entry(); crate::value_as_list!(|nested_list| { nested_list.entry().value(1); assert_eq!(nested_list.as_buffer(), br#"["bar",[1"#); if true { break 'cancel; } }) } assert_eq!(list.as_buffer(), br#"["bar""#); list.entry().value(Null); }) }); assert_eq!(res, r#"["bar",null]"#); } #[test] fn rollback_object() { let res = crate::value_to_string!(|obj| { crate::value_as_object!(|obj| { let entry = obj.key("1"); entry.value(1_i32); let entry = obj.key("2"); let entry = { let mut nested_obj = entry.object(); 
nested_obj.entry("foo", "bar"); nested_obj.rollback() }; entry.value(2_i32); }) }); assert_eq!(res, r#"{"1":1,"2":2}"#); } #[test] fn rollback_list() { let res = crate::value_to_string!(|list| { crate::value_as_list!(|list| { let entry = list.entry(); entry.value(1_i32); let entry = list.entry(); let entry = { let mut nested_list = entry.list(); nested_list.push("foo"); nested_list.rollback() }; entry.value(2_i32); }) }); assert_eq!(res, r#"[1,2]"#); } #[test] fn string_escaping() { let mut buf = vec![]; let mut object = ValueSer::new(&mut buf).object(); let key = "hello"; let value = "\n world"; object.entry(format_args!("{key:?}"), value); object.finish(); assert_eq!(buf, br#"{"\"hello\"":"\n world"}"#); } } ================================================ FILE: libs/proxy/json/src/macros.rs ================================================ //! # Examples //! //! ``` //! use futures::{StreamExt, TryStream, TryStreamExt}; //! //! async fn stream_to_json_list(mut s: S) -> Result //! where //! S: TryStream + Unpin, //! T: json::ValueEncoder //! { //! Ok(json::value_to_string!(|val| json::value_as_list!(|val| { //! // note how we can use `.await` and `?` in here. //! while let Some(value) = s.try_next().await? { //! val.push(value); //! } //! }))) //! } //! //! let stream = futures::stream::iter([1, 2, 3]).map(Ok::); //! let json_string = futures::executor::block_on(stream_to_json_list(stream)).unwrap(); //! assert_eq!(json_string, "[1,2,3]"); //! ``` /// A helper to create a new JSON vec. /// /// Implemented as a macro to preserve all control flow. #[macro_export] macro_rules! value_to_vec { (|$val:ident| $body:expr) => {{ let mut buf = vec![]; let $val = $crate::ValueSer::new(&mut buf); let _: () = $body; buf }}; } /// A helper to create a new JSON string. /// /// Implemented as a macro to preserve all control flow. #[macro_export] macro_rules! 
value_to_string { (|$val:ident| $body:expr) => {{ ::std::string::String::from_utf8($crate::value_to_vec!(|$val| $body)) .expect("json should be valid utf8") }}; } /// A helper that ensures the [`ObjectSer::finish`](crate::ObjectSer::finish) method is called on completion. /// /// Consumes `$val` and assigns it as an [`ObjectSer`](crate::ObjectSer) serializer. /// The serializer is only 'finished' if the body completes. /// The serializer is rolled back if `break`/`return` escapes the body. /// /// Implemented as a macro to preserve all control flow. #[macro_export] macro_rules! value_as_object { (|$val:ident| $body:expr) => {{ let mut obj = $crate::ObjectSer::new($val); let $val = &mut obj; let res = $body; obj.finish(); res }}; } /// A helper that ensures the [`ListSer::finish`](crate::ListSer::finish) method is called on completion. /// /// Consumes `$val` and assigns it as an [`ListSer`](crate::ListSer) serializer. /// The serializer is only 'finished' if the body completes. /// The serializer is rolled back if `break`/`return` escapes the body. /// /// Implemented as a macro to preserve all control flow. #[macro_export] macro_rules! value_as_list { (|$val:ident| $body:expr) => {{ let mut list = $crate::ListSer::new($val); let $val = &mut list; let res = $body; list.finish(); res }}; } ================================================ FILE: libs/proxy/json/src/str.rs ================================================ //! Helpers for serializing escaped strings. //! //! ## License //! //! //! //! Licensed by David Tolnay under MIT or Apache-2.0. //! //! With modifications by Conrad Ludgate on behalf of Databricks. use std::fmt::{self, Write}; /// Represents a character escape code in a type-safe manner. 
pub enum CharEscape { /// An escaped quote `"` Quote, /// An escaped reverse solidus `\` ReverseSolidus, // /// An escaped solidus `/` // Solidus, /// An escaped backspace character (usually escaped as `\b`) Backspace, /// An escaped form feed character (usually escaped as `\f`) FormFeed, /// An escaped line feed character (usually escaped as `\n`) LineFeed, /// An escaped carriage return character (usually escaped as `\r`) CarriageReturn, /// An escaped tab character (usually escaped as `\t`) Tab, /// An escaped ASCII plane control character (usually escaped as /// `\u00XX` where `XX` are two hex characters) AsciiControl(u8), } impl CharEscape { #[inline] fn from_escape_table(escape: u8, byte: u8) -> CharEscape { match escape { self::BB => CharEscape::Backspace, self::TT => CharEscape::Tab, self::NN => CharEscape::LineFeed, self::FF => CharEscape::FormFeed, self::RR => CharEscape::CarriageReturn, self::QU => CharEscape::Quote, self::BS => CharEscape::ReverseSolidus, self::UU => CharEscape::AsciiControl(byte), _ => unreachable!(), } } } pub(crate) fn format_escaped_str(writer: &mut Vec, value: &str) { writer.reserve(2 + value.len()); writer.push(b'"'); let rest = format_escaped_str_contents(writer, value); writer.extend_from_slice(rest); writer.push(b'"'); } pub(crate) fn format_escaped_fmt(writer: &mut Vec, args: fmt::Arguments) { writer.push(b'"'); Collect { buf: writer } .write_fmt(args) .expect("formatting should not error"); writer.push(b'"'); } struct Collect<'buf> { buf: &'buf mut Vec, } impl fmt::Write for Collect<'_> { fn write_str(&mut self, s: &str) -> fmt::Result { let last = format_escaped_str_contents(self.buf, s); self.buf.extend(last); Ok(()) } } // writes any escape sequences, and returns the suffix still needed to be written. 
fn format_escaped_str_contents<'a>(writer: &mut Vec, value: &'a str) -> &'a [u8] { let bytes = value.as_bytes(); let mut start = 0; for (i, &byte) in bytes.iter().enumerate() { let escape = ESCAPE[byte as usize]; if escape == 0 { continue; } writer.extend_from_slice(&bytes[start..i]); let char_escape = CharEscape::from_escape_table(escape, byte); write_char_escape(writer, char_escape); start = i + 1; } &bytes[start..] } const BB: u8 = b'b'; // \x08 const TT: u8 = b't'; // \x09 const NN: u8 = b'n'; // \x0A const FF: u8 = b'f'; // \x0C const RR: u8 = b'r'; // \x0D const QU: u8 = b'"'; // \x22 const BS: u8 = b'\\'; // \x5C const UU: u8 = b'u'; // \x00...\x1F except the ones above const __: u8 = 0; // Lookup table of escape sequences. A value of b'x' at index i means that byte // i is escaped as "\x" in JSON. A value of 0 means that byte i is not escaped. static ESCAPE: [u8; 256] = [ // 1 2 3 4 5 6 7 8 9 A B C D E F UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0 UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1 __, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4 __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E __, __, __, __, __, __, __, __, __, 
__, __, __, __, __, __, __, // F ]; fn write_char_escape(writer: &mut Vec, char_escape: CharEscape) { let s = match char_escape { CharEscape::Quote => b"\\\"", CharEscape::ReverseSolidus => b"\\\\", // CharEscape::Solidus => b"\\/", CharEscape::Backspace => b"\\b", CharEscape::FormFeed => b"\\f", CharEscape::LineFeed => b"\\n", CharEscape::CarriageReturn => b"\\r", CharEscape::Tab => b"\\t", CharEscape::AsciiControl(byte) => { static HEX_DIGITS: [u8; 16] = *b"0123456789abcdef"; let bytes = &[ b'\\', b'u', b'0', b'0', HEX_DIGITS[(byte >> 4) as usize], HEX_DIGITS[(byte & 0xF) as usize], ]; return writer.extend_from_slice(bytes); } }; writer.extend_from_slice(s); } ================================================ FILE: libs/proxy/json/src/value.rs ================================================ use core::fmt; use std::collections::{BTreeMap, HashMap}; use crate::str::{format_escaped_fmt, format_escaped_str}; use crate::{KeyEncoder, ObjectSer, ValueSer, value_as_list, value_as_object}; /// Write a value to the underlying json representation. pub trait ValueEncoder { fn encode(self, v: ValueSer<'_>); } pub(crate) fn write_int(x: impl itoa::Integer, b: &mut Vec) { b.extend_from_slice(itoa::Buffer::new().format(x).as_bytes()); } pub(crate) fn write_float(x: impl ryu::Float, b: &mut Vec) { b.extend_from_slice(ryu::Buffer::new().format(x).as_bytes()); } impl ValueEncoder for &T { #[inline] fn encode(self, v: ValueSer<'_>) { T::encode(*self, v); } } impl ValueEncoder for &str { #[inline] fn encode(self, v: ValueSer<'_>) { format_escaped_str(v.buf, self); v.finish(); } } impl ValueEncoder for fmt::Arguments<'_> { #[inline] fn encode(self, v: ValueSer<'_>) { if let Some(s) = self.as_str() { format_escaped_str(v.buf, s); } else { format_escaped_fmt(v.buf, self); } v.finish(); } } macro_rules! 
int { [$($t:ty),*] => { $( impl ValueEncoder for $t { #[inline] fn encode(self, v: ValueSer<'_>) { write_int(self, v.buf); v.finish(); } } )* }; } int![u8, u16, u32, u64, usize, u128]; int![i8, i16, i32, i64, isize, i128]; macro_rules! float { [$($t:ty),*] => { $( impl ValueEncoder for $t { #[inline] fn encode(self, v: ValueSer<'_>) { write_float(self, v.buf); v.finish(); } } )* }; } float![f32, f64]; impl ValueEncoder for bool { #[inline] fn encode(self, v: ValueSer<'_>) { v.write_raw_json(if self { b"true" } else { b"false" }); } } impl ValueEncoder for Option { #[inline] fn encode(self, v: ValueSer<'_>) { match self { Some(value) => value.encode(v), None => Null.encode(v), } } } impl KeyEncoder for &str { #[inline] fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a> { let obj = &mut *obj; obj.entry_inner(|b| format_escaped_str(b, self)) } } impl KeyEncoder for fmt::Arguments<'_> { #[inline] fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a> { if let Some(key) = self.as_str() { obj.entry_inner(|b| format_escaped_str(b, key)) } else { obj.entry_inner(|b| format_escaped_fmt(b, self)) } } } /// Represents the JSON null value. 
pub struct Null;

impl ValueEncoder for Null {
    #[inline]
    fn encode(self, v: ValueSer<'_>) {
        v.write_raw_json(b"null");
    }
}

// Encodes an owned vector as a JSON list, one entry per element.
impl<T: ValueEncoder> ValueEncoder for Vec<T> {
    #[inline]
    fn encode(self, v: ValueSer<'_>) {
        value_as_list!(|v| {
            for t in self {
                v.entry().value(t);
            }
        });
    }
}

// Encodes a borrowed slice as a JSON list; each element is encoded through
// the `&T` implementation.
impl<T: Copy + ValueEncoder> ValueEncoder for &[T] {
    #[inline]
    fn encode(self, v: ValueSer<'_>) {
        value_as_list!(|v| {
            for t in self {
                v.entry().value(t);
            }
        });
    }
}

// Encodes a map as a JSON object; entries are written in the map's own
// iteration order.
impl<K: KeyEncoder, V: ValueEncoder, S> ValueEncoder for HashMap<K, V, S> {
    #[inline]
    fn encode(self, o: ValueSer<'_>) {
        value_as_object!(|o| {
            for (k, v) in self {
                o.entry(k, v);
            }
        });
    }
}

// Encodes a map as a JSON object; entries are written in the map's own
// iteration order.
impl<K: KeyEncoder, V: ValueEncoder> ValueEncoder for BTreeMap<K, V> {
    #[inline]
    fn encode(self, o: ValueSer<'_>) {
        value_as_object!(|o| {
            for (k, v) in self {
                o.entry(k, v);
            }
        });
    }
}

================================================
FILE: libs/proxy/postgres-protocol2/Cargo.toml
================================================
[package]
name = "postgres-protocol2"
version = "0.1.0"
edition = "2024"
license = "MIT/Apache-2.0"

[dependencies]
base64.workspace = true
byteorder.workspace = true
bytes.workspace = true
fallible-iterator.workspace = true
hmac.workspace = true
memchr = "2.0"
rand.workspace = true
sha2.workspace = true
stringprep = "0.1"
tokio = { workspace = true, features = ["rt"] }

[dev-dependencies]
tokio = { workspace = true, features = ["full"] }

================================================
FILE: libs/proxy/postgres-protocol2/src/authentication/mod.rs
================================================
//! Authentication protocol support.

pub mod sasl;

================================================
FILE: libs/proxy/postgres-protocol2/src/authentication/sasl.rs
================================================
//! SASL-based authentication support.
use std::fmt::Write; use std::{io, iter, mem, str}; use base64::Engine as _; use base64::prelude::BASE64_STANDARD; use hmac::{Hmac, Mac}; use rand::{self, Rng}; use sha2::digest::FixedOutput; use sha2::{Digest, Sha256}; use tokio::task::yield_now; const NONCE_LENGTH: usize = 24; /// The identifier of the SCRAM-SHA-256 SASL authentication mechanism. pub const SCRAM_SHA_256: &str = "SCRAM-SHA-256"; /// The identifier of the SCRAM-SHA-256-PLUS SASL authentication mechanism. pub const SCRAM_SHA_256_PLUS: &str = "SCRAM-SHA-256-PLUS"; // since postgres passwords are not required to exclude saslprep-prohibited // characters or even be valid UTF8, we run saslprep if possible and otherwise // return the raw password. fn normalize(pass: &[u8]) -> Vec { let pass = match str::from_utf8(pass) { Ok(pass) => pass, Err(_) => return pass.to_vec(), }; match stringprep::saslprep(pass) { Ok(pass) => pass.into_owned().into_bytes(), Err(_) => pass.as_bytes().to_vec(), } } pub(crate) async fn hi(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] { let mut hmac = Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); hmac.update(salt); hmac.update(&[0, 0, 0, 1]); let mut prev = hmac.finalize().into_bytes(); let mut hi = prev; for i in 1..iterations { let mut hmac = Hmac::::new_from_slice(str).expect("already checked above"); hmac.update(&prev); prev = hmac.finalize().into_bytes(); for (hi, prev) in hi.iter_mut().zip(prev) { *hi ^= prev; } // yield every ~250us // hopefully reduces tail latencies if i.is_multiple_of(1024) { yield_now().await } } hi.into() } enum ChannelBindingInner { Unrequested, Unsupported, TlsServerEndPoint(Vec), } /// The channel binding configuration for a SCRAM authentication exchange. pub struct ChannelBinding(ChannelBindingInner); impl ChannelBinding { /// The server did not request channel binding. 
pub fn unrequested() -> ChannelBinding { ChannelBinding(ChannelBindingInner::Unrequested) } /// The server requested channel binding but the client is unable to provide it. pub fn unsupported() -> ChannelBinding { ChannelBinding(ChannelBindingInner::Unsupported) } /// The server requested channel binding and the client will use the `tls-server-end-point` /// method. pub fn tls_server_end_point(signature: Vec) -> ChannelBinding { ChannelBinding(ChannelBindingInner::TlsServerEndPoint(signature)) } fn gs2_header(&self) -> &'static str { match self.0 { ChannelBindingInner::Unrequested => "y,,", ChannelBindingInner::Unsupported => "n,,", ChannelBindingInner::TlsServerEndPoint(_) => "p=tls-server-end-point,,", } } fn cbind_data(&self) -> &[u8] { match self.0 { ChannelBindingInner::Unrequested | ChannelBindingInner::Unsupported => &[], ChannelBindingInner::TlsServerEndPoint(ref buf) => buf, } } } /// A pair of keys for the SCRAM-SHA-256 mechanism. /// See for details. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct ScramKeys { /// Used by server to authenticate client. pub client_key: [u8; N], /// Used by client to verify server's signature. pub server_key: [u8; N], } /// Password or keys which were derived from it. enum Credentials { /// A regular password as a vector of bytes. Password(Vec), /// A precomputed pair of keys. Keys(ScramKeys), } enum State { Update { nonce: String, password: Credentials<32>, channel_binding: ChannelBinding, }, Finish { server_key: [u8; 32], auth_message: String, }, Done, } /// A type which handles the client side of the SCRAM-SHA-256/SCRAM-SHA-256-PLUS authentication /// process. /// /// During the authentication process, if the backend sends an `AuthenticationSASL` message which /// includes `SCRAM-SHA-256` as an authentication mechanism, this type can be used. 
/// /// After a `ScramSha256` is constructed, the buffer returned by the `message()` method should be /// sent to the backend in a `SASLInitialResponse` message along with the mechanism name. /// /// The server will reply with an `AuthenticationSASLContinue` message. Its contents should be /// passed to the `update()` method, after which the buffer returned by the `message()` method /// should be sent to the backend in a `SASLResponse` message. /// /// The server will reply with an `AuthenticationSASLFinal` message. Its contents should be passed /// to the `finish()` method, after which the authentication process is complete. pub struct ScramSha256 { message: String, state: State, } fn nonce() -> String { // rand 0.5's ThreadRng is cryptographically secure let mut rng = rand::rng(); (0..NONCE_LENGTH) .map(|_| { let mut v = rng.random_range(0x21u8..0x7e); if v == 0x2c { v = 0x7e } v as char }) .collect() } impl ScramSha256 { /// Constructs a new instance which will use the provided password for authentication. pub fn new(password: &[u8], channel_binding: ChannelBinding) -> ScramSha256 { let password = Credentials::Password(normalize(password)); ScramSha256::new_inner(password, channel_binding, nonce()) } /// Constructs a new instance which will use the provided key pair for authentication. pub fn new_with_keys(keys: ScramKeys<32>, channel_binding: ChannelBinding) -> ScramSha256 { let password = Credentials::Keys(keys); ScramSha256::new_inner(password, channel_binding, nonce()) } fn new_inner( password: Credentials<32>, channel_binding: ChannelBinding, nonce: String, ) -> ScramSha256 { ScramSha256 { message: format!("{}n=,r={}", channel_binding.gs2_header(), nonce), state: State::Update { nonce, password, channel_binding, }, } } /// Returns the message which should be sent to the backend in an `SASLResponse` message. 
pub fn message(&self) -> &[u8] { if let State::Done = self.state { panic!("invalid SCRAM state"); } self.message.as_bytes() } /// Updates the state machine with the response from the backend. /// /// This should be called when an `AuthenticationSASLContinue` message is received. pub async fn update(&mut self, message: &[u8]) -> io::Result<()> { let (client_nonce, password, channel_binding) = match mem::replace(&mut self.state, State::Done) { State::Update { nonce, password, channel_binding, } => (nonce, password, channel_binding), _ => return Err(io::Error::other("invalid SCRAM state")), }; let message = str::from_utf8(message).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; let parsed = Parser::new(message).server_first_message()?; if !parsed.nonce.starts_with(&client_nonce) { return Err(io::Error::new(io::ErrorKind::InvalidInput, "invalid nonce")); } let (client_key, server_key) = match password { Credentials::Password(password) => { let salt = match BASE64_STANDARD.decode(parsed.salt) { Ok(salt) => salt, Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)), }; let salted_password = hi(&password, &salt, parsed.iteration_count).await; let make_key = |name| { let mut hmac = Hmac::::new_from_slice(&salted_password) .expect("HMAC is able to accept all key sizes"); hmac.update(name); let mut key = [0u8; 32]; key.copy_from_slice(hmac.finalize().into_bytes().as_slice()); key }; (make_key(b"Client Key"), make_key(b"Server Key")) } Credentials::Keys(keys) => (keys.client_key, keys.server_key), }; let mut hash = Sha256::default(); hash.update(client_key); let stored_key = hash.finalize_fixed(); let mut cbind_input = vec![]; cbind_input.extend(channel_binding.gs2_header().as_bytes()); cbind_input.extend(channel_binding.cbind_data()); let cbind_input = BASE64_STANDARD.encode(&cbind_input); self.message.clear(); write!(&mut self.message, "c={},r={}", cbind_input, parsed.nonce).unwrap(); let auth_message = format!("n=,r={},{},{}", client_nonce, 
message, self.message); let mut hmac = Hmac::::new_from_slice(&stored_key) .expect("HMAC is able to accept all key sizes"); hmac.update(auth_message.as_bytes()); let client_signature = hmac.finalize().into_bytes(); let mut client_proof = client_key; for (proof, signature) in client_proof.iter_mut().zip(client_signature) { *proof ^= signature; } write!( &mut self.message, ",p={}", BASE64_STANDARD.encode(client_proof) ) .unwrap(); self.state = State::Finish { server_key, auth_message, }; Ok(()) } /// Finalizes the authentication process. /// /// This should be called when the backend sends an `AuthenticationSASLFinal` message. /// Authentication has only succeeded if this method returns `Ok(())`. pub fn finish(&mut self, message: &[u8]) -> io::Result<()> { let (server_key, auth_message) = match mem::replace(&mut self.state, State::Done) { State::Finish { server_key, auth_message, } => (server_key, auth_message), _ => return Err(io::Error::other("invalid SCRAM state")), }; let message = str::from_utf8(message).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; let parsed = Parser::new(message).server_final_message()?; let verifier = match parsed { ServerFinalMessage::Error(e) => { return Err(io::Error::other(format!("SCRAM error: {e}"))); } ServerFinalMessage::Verifier(verifier) => verifier, }; let verifier = match BASE64_STANDARD.decode(verifier) { Ok(verifier) => verifier, Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)), }; let mut hmac = Hmac::::new_from_slice(&server_key) .expect("HMAC is able to accept all key sizes"); hmac.update(auth_message.as_bytes()); hmac.verify_slice(&verifier) .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "SCRAM verification error")) } } struct Parser<'a> { s: &'a str, it: iter::Peekable>, } impl<'a> Parser<'a> { fn new(s: &'a str) -> Parser<'a> { Parser { s, it: s.char_indices().peekable(), } } fn eat(&mut self, target: char) -> io::Result<()> { match self.it.next() { Some((_, c)) if c == target 
=> Ok(()),
            Some((i, c)) => {
                // Both quoted values are closed with a backtick (the closing one after `{c}`
                // was previously missing).
                let m = format!(
                    "unexpected character at byte {i}: expected `{target}` but got `{c}`"
                );
                Err(io::Error::new(io::ErrorKind::InvalidInput, m))
            }
            None => Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "unexpected EOF",
            )),
        }
    }

    /// Consumes characters while `f` returns `true` and returns the consumed slice of the
    /// input. Returns an empty string when the input is already exhausted or the first
    /// character is rejected.
    fn take_while<F>(&mut self, f: F) -> io::Result<&'a str>
    where
        F: Fn(char) -> bool,
    {
        let start = match self.it.peek() {
            Some(&(i, _)) => i,
            None => return Ok(""),
        };

        loop {
            match self.it.peek() {
                Some(&(_, c)) if f(c) => {
                    self.it.next();
                }
                Some(&(i, _)) => return Ok(&self.s[start..i]),
                None => return Ok(&self.s[start..]),
            }
        }
    }

    /// Consumes a run of printable ASCII characters (`0x21..=0x7e`) excluding the comma.
    fn printable(&mut self) -> io::Result<&'a str> {
        self.take_while(|c| matches!(c, '\x21'..='\x2b' | '\x2d'..='\x7e'))
    }

    /// Parses `r=<printable>` and returns the nonce text.
    fn nonce(&mut self) -> io::Result<&'a str> {
        self.eat('r')?;
        self.eat('=')?;
        self.printable()
    }

    /// Consumes a run of base64 alphabet characters (including `=` padding).
    fn base64(&mut self) -> io::Result<&'a str> {
        self.take_while(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '/' | '+' | '='))
    }

    /// Parses `s=<base64>` and returns the still-encoded salt.
    fn salt(&mut self) -> io::Result<&'a str> {
        self.eat('s')?;
        self.eat('=')?;
        self.base64()
    }

    /// Consumes a run of ASCII digits and parses it as a `u32`.
    ///
    /// Fails with `InvalidInput` when no digits are present or the value does not fit.
    fn posit_number(&mut self) -> io::Result<u32> {
        let n = self.take_while(|c| c.is_ascii_digit())?;
        n.parse()
            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))
    }

    /// Parses `i=<number>` and returns the iteration count.
    fn iteration_count(&mut self) -> io::Result<u32> {
        self.eat('i')?;
        self.eat('=')?;
        self.posit_number()
    }

    /// Succeeds only when the whole input has been consumed.
    fn eof(&mut self) -> io::Result<()> {
        match self.it.peek() {
            Some(&(i, _)) => Err(io::Error::new(
                io::ErrorKind::InvalidInput,
                format!("unexpected trailing data at byte {i}"),
            )),
            None => Ok(()),
        }
    }

    /// Parses a complete server-first message of the form `r=<nonce>,s=<salt>,i=<count>`.
    fn server_first_message(&mut self) -> io::Result<ServerFirstMessage<'a>> {
        let nonce = self.nonce()?;
        self.eat(',')?;
        let salt = self.salt()?;
        self.eat(',')?;
        let iteration_count = self.iteration_count()?;
        self.eof()?;

        Ok(ServerFirstMessage {
            nonce,
            salt,
            iteration_count,
        })
    }

    /// Consumes an attribute value: everything up to (not including) the next NUL, `=` or `,`.
    ///
    /// The predicate must be negated: NUL, `=` and `,` are the characters that *terminate* a
    /// value. Accepting only those characters made every `e=<error>` value parse as empty and
    /// the subsequent `eof()` check fail, hiding the server's error text from the caller.
    fn value(&mut self) -> io::Result<&'a str> {
        self.take_while(|c| !matches!(c, '\0' | '=' | ','))
    }

    /// Parses an optional `e=<value>` attribute; returns `Ok(None)` without consuming anything
    /// when the next character is not `e`.
    fn server_error(&mut self) -> io::Result<Option<&'a str>> {
        match self.it.peek() {
            Some(&(_, 'e')) => {}
            _ => return Ok(None),
        }

        self.eat('e')?;
        self.eat('=')?;
self.value().map(Some) } fn verifier(&mut self) -> io::Result<&'a str> { self.eat('v')?; self.eat('=')?; self.base64() } fn server_final_message(&mut self) -> io::Result> { let message = match self.server_error()? { Some(error) => ServerFinalMessage::Error(error), None => ServerFinalMessage::Verifier(self.verifier()?), }; self.eof()?; Ok(message) } } struct ServerFirstMessage<'a> { nonce: &'a str, salt: &'a str, iteration_count: u32, } enum ServerFinalMessage<'a> { Error(&'a str), Verifier(&'a str), } #[cfg(test)] mod test { use super::*; #[test] fn parse_server_first_message() { let message = "r=fyko+d2lbbFgONRv9qkxdawL3rfcNHYJY1ZVvWVs7j,s=QSXCR+Q6sek8bf92,i=4096"; let message = Parser::new(message).server_first_message().unwrap(); assert_eq!(message.nonce, "fyko+d2lbbFgONRv9qkxdawL3rfcNHYJY1ZVvWVs7j"); assert_eq!(message.salt, "QSXCR+Q6sek8bf92"); assert_eq!(message.iteration_count, 4096); } // recorded auth exchange from psql #[tokio::test] async fn exchange() { let password = "foobar"; let nonce = "9IZ2O01zb9IgiIZ1WJ/zgpJB"; let client_first = "n,,n=,r=9IZ2O01zb9IgiIZ1WJ/zgpJB"; let server_first = "r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,s=fs3IXBy7U7+IvVjZ,i\ =4096"; let client_final = "c=biws,r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,p=AmNKosjJzS3\ 1NTlQYNs5BTeQjdHdk7lOflDo5re2an8="; let server_final = "v=U+ppxD5XUKtradnv8e2MkeupiA8FU87Sg8CXzXHDAzw="; let mut scram = ScramSha256::new_inner( Credentials::Password(normalize(password.as_bytes())), ChannelBinding::unsupported(), nonce.to_string(), ); assert_eq!(str::from_utf8(scram.message()).unwrap(), client_first); scram.update(server_first.as_bytes()).await.unwrap(); assert_eq!(str::from_utf8(scram.message()).unwrap(), client_final); scram.finish(server_final.as_bytes()).unwrap(); } } ================================================ FILE: libs/proxy/postgres-protocol2/src/escape/mod.rs ================================================ //! 
Provides functions for escaping literals and identifiers for use
//! in SQL queries.
//!
//! Prefer parameterized queries where possible. Do not escape
//! parameters in a parameterized query.

#[cfg(test)]
mod test;

/// Escape a literal and surround the result with single quotes. Not
/// recommended in most cases.
///
/// When the input contains a backslash the result takes the form
/// ` E'...'` (with a leading space), which makes it safe to use
/// regardless of the setting of standard_conforming_strings.
pub fn escape_literal(input: &str) -> String {
    escape_internal(input, false)
}

/// Escape an identifier and surround the result with double quotes.
pub fn escape_identifier(input: &str) -> String {
    escape_internal(input, true)
}

// Rust port of PostgreSQL libpq's PQescapeInternal(). No connection is
// needed because a `&str` is already known to be valid UTF-8.
//
// With `as_ident == true` the input is escaped as an identifier
// (delimited by `"`), otherwise as a literal (delimited by `'`). Every
// occurrence of the delimiter is doubled; for literals every backslash is
// doubled as well. The result is returned as a newly allocated String.
fn escape_internal(input: &str, as_ident: bool) -> String {
    let delimiter = if as_ident { '"' } else { '\'' };

    // Count the characters that have to be doubled.
    let quote_count = input.chars().filter(|&c| c == delimiter).count();
    let backslash_count = input.chars().filter(|&c| c == '\\').count();

    // A literal containing backslashes is emitted with the escape string
    // syntax so that it is correct under either value of
    // standard_conforming_strings.
    let use_escape_syntax = !as_ident && backslash_count > 0;

    // Pre-size the output: input, doubled quotes, plus three spare bytes
    // (same sizing as the C original: two quotes and a NUL).
    let mut capacity = input.len() + quote_count + 3;
    if use_escape_syntax {
        capacity += backslash_count + 2;
    }
    let mut escaped = String::with_capacity(capacity);

    // The leading space guards against the result being interpolated
    // immediately after an identifier.
    if use_escape_syntax {
        escaped.push(' ');
        escaped.push('E');
    }

    // Opening quote.
    escaped.push(delimiter);

    if quote_count > 0 || use_escape_syntax {
        // Slow path: rescan and double each character that needs it.
        for c in input.chars() {
            let must_double = c == delimiter || (!as_ident && c == '\\');
            if must_double {
                escaped.push(c);
            }
            escaped.push(c);
        }
    } else {
        // Fast path: nothing needs doubling, copy the input verbatim.
        escaped.push_str(input);
    }

    // Closing quote.
    escaped.push(delimiter);
    escaped
}

================================================
FILE: libs/proxy/postgres-protocol2/src/escape/test.rs
================================================
use crate::escape::{escape_identifier, escape_literal};

#[test]
fn test_escape_idenifier() {
    assert_eq!(escape_identifier("foo"), String::from("\"foo\""));
    assert_eq!(escape_identifier("f\\oo"), String::from("\"f\\oo\""));
    assert_eq!(escape_identifier("f'oo"), String::from("\"f'oo\""));
    assert_eq!(escape_identifier("f\"oo"), String::from("\"f\"\"oo\""));
}

#[test]
fn test_escape_literal() {
    assert_eq!(escape_literal("foo"), String::from("'foo'"));
    assert_eq!(escape_literal("f\\oo"), String::from(" E'f\\\\oo'"));
    assert_eq!(escape_literal("f'oo"), String::from("'f''oo'"));
    assert_eq!(escape_literal("f\"oo"), String::from("'f\"oo'"));
}

================================================
FILE: libs/proxy/postgres-protocol2/src/lib.rs
================================================
//! Low level Postgres protocol APIs.
//!
//! This crate implements the low level components of Postgres's communication
//!
protocol, including message and value serialization and deserialization. //! It is designed to be used as a building block by higher level APIs such as //! `rust-postgres`, and should not typically be used directly. //! //! # Note //! //! This library assumes that the `client_encoding` backend parameter has been //! set to `UTF8`. It will most likely not behave properly if that is not the case. #![warn(missing_docs, clippy::all)] use std::io; use byteorder::{BigEndian, ByteOrder}; use bytes::{BufMut, BytesMut}; pub mod authentication; pub mod escape; pub mod message; pub mod password; pub mod types; /// A Postgres OID. pub type Oid = u32; /// A Postgres Log Sequence Number (LSN). pub type Lsn = u64; /// An enum indicating if a value is `NULL` or not. pub enum IsNull { /// The value is `NULL`. Yes, /// The value is not `NULL`. No, } fn write_nullable(serializer: F, buf: &mut BytesMut) -> Result<(), E> where F: FnOnce(&mut BytesMut) -> Result, E: From, { let base = buf.len(); buf.put_i32(0); let size = match serializer(buf)? { IsNull::No => i32::from_usize(buf.len() - base - 4)?, IsNull::Yes => -1, }; BigEndian::write_i32(&mut buf[base..], size); Ok(()) } trait FromUsize: Sized { fn from_usize(x: usize) -> Result; } macro_rules! 
from_usize { ($t:ty) => { impl FromUsize for $t { #[inline] fn from_usize(x: usize) -> io::Result<$t> { if x > <$t>::MAX as usize { Err(io::Error::new( io::ErrorKind::InvalidInput, "value too large to transmit", )) } else { Ok(x as $t) } } } }; } from_usize!(i16); from_usize!(i32); ================================================ FILE: libs/proxy/postgres-protocol2/src/message/backend.rs ================================================ #![allow(missing_docs)] use std::io::{self, Read}; use std::ops::Range; use std::{cmp, str}; use byteorder::{BigEndian, ByteOrder, ReadBytesExt}; use bytes::{Bytes, BytesMut}; use fallible_iterator::FallibleIterator; use memchr::memchr; use crate::Oid; // top-level message tags const PARSE_COMPLETE_TAG: u8 = b'1'; const BIND_COMPLETE_TAG: u8 = b'2'; const CLOSE_COMPLETE_TAG: u8 = b'3'; pub const NOTIFICATION_RESPONSE_TAG: u8 = b'A'; const COPY_DONE_TAG: u8 = b'c'; const COMMAND_COMPLETE_TAG: u8 = b'C'; const COPY_DATA_TAG: u8 = b'd'; const DATA_ROW_TAG: u8 = b'D'; const ERROR_RESPONSE_TAG: u8 = b'E'; const COPY_IN_RESPONSE_TAG: u8 = b'G'; const COPY_OUT_RESPONSE_TAG: u8 = b'H'; const COPY_BOTH_RESPONSE_TAG: u8 = b'W'; const EMPTY_QUERY_RESPONSE_TAG: u8 = b'I'; const BACKEND_KEY_DATA_TAG: u8 = b'K'; pub const NO_DATA_TAG: u8 = b'n'; pub const NOTICE_RESPONSE_TAG: u8 = b'N'; const AUTHENTICATION_TAG: u8 = b'R'; const PORTAL_SUSPENDED_TAG: u8 = b's'; pub const PARAMETER_STATUS_TAG: u8 = b'S'; const PARAMETER_DESCRIPTION_TAG: u8 = b't'; const ROW_DESCRIPTION_TAG: u8 = b'T'; pub const READY_FOR_QUERY_TAG: u8 = b'Z'; #[derive(Debug, Copy, Clone)] pub struct Header { tag: u8, len: i32, } #[allow(clippy::len_without_is_empty)] impl Header { #[inline] pub fn parse(buf: &[u8]) -> io::Result> { if buf.len() < 5 { return Ok(None); } let tag = buf[0]; let len = BigEndian::read_i32(&buf[1..]); if len < 4 { return Err(io::Error::new( io::ErrorKind::InvalidData, "invalid message length: header length < 4", )); } Ok(Some(Header { tag, len })) } 
#[inline] pub fn tag(self) -> u8 { self.tag } #[inline] pub fn len(self) -> i32 { self.len } } /// An enum representing Postgres backend messages. pub enum Message { AuthenticationCleartextPassword, AuthenticationGss, AuthenticationKerberosV5, AuthenticationMd5Password, AuthenticationOk, AuthenticationScmCredential, AuthenticationSspi, AuthenticationGssContinue, AuthenticationSasl(AuthenticationSaslBody), AuthenticationSaslContinue(AuthenticationSaslContinueBody), AuthenticationSaslFinal(AuthenticationSaslFinalBody), BackendKeyData(BackendKeyDataBody), BindComplete, CloseComplete, CommandComplete(CommandCompleteBody), CopyData, CopyDone, CopyInResponse, CopyOutResponse, CopyBothResponse, DataRow(DataRowBody), EmptyQueryResponse, ErrorResponse(ErrorResponseBody), NoData, NoticeResponse(NoticeResponseBody), NotificationResponse(NotificationResponseBody), ParameterDescription(ParameterDescriptionBody), ParameterStatus(ParameterStatusBody), ParseComplete, PortalSuspended, ReadyForQuery(ReadyForQueryBody), RowDescription(RowDescriptionBody), } impl Message { #[inline] pub fn parse(buf: &mut BytesMut) -> io::Result> { if buf.len() < 5 { let to_read = 5 - buf.len(); buf.reserve(to_read); return Ok(None); } let tag = buf[0]; let len = (&buf[1..5]).read_u32::().unwrap(); if len < 4 { return Err(io::Error::new( io::ErrorKind::InvalidInput, "invalid message length: parsing u32", )); } let total_len = len as usize + 1; if buf.len() < total_len { let to_read = total_len - buf.len(); buf.reserve(to_read); return Ok(None); } let mut buf = Buffer { bytes: buf.split_to(total_len).freeze(), idx: 5, }; let message = match tag { PARSE_COMPLETE_TAG => Message::ParseComplete, BIND_COMPLETE_TAG => Message::BindComplete, CLOSE_COMPLETE_TAG => Message::CloseComplete, NOTIFICATION_RESPONSE_TAG => Message::NotificationResponse(NotificationResponseBody {}), COPY_DONE_TAG => Message::CopyDone, COMMAND_COMPLETE_TAG => { let tag = buf.read_cstr()?; Message::CommandComplete(CommandCompleteBody { 
tag }) } COPY_DATA_TAG => Message::CopyData, DATA_ROW_TAG => { let len = buf.read_u16::()?; let storage = buf.read_all(); Message::DataRow(DataRowBody { storage, len }) } ERROR_RESPONSE_TAG => { let storage = buf.read_all(); Message::ErrorResponse(ErrorResponseBody { storage }) } COPY_IN_RESPONSE_TAG => Message::CopyInResponse, COPY_OUT_RESPONSE_TAG => Message::CopyOutResponse, COPY_BOTH_RESPONSE_TAG => Message::CopyBothResponse, EMPTY_QUERY_RESPONSE_TAG => Message::EmptyQueryResponse, BACKEND_KEY_DATA_TAG => { let process_id = buf.read_i32::()?; let secret_key = buf.read_i32::()?; Message::BackendKeyData(BackendKeyDataBody { process_id, secret_key, }) } NO_DATA_TAG => Message::NoData, NOTICE_RESPONSE_TAG => { let storage = buf.read_all(); Message::NoticeResponse(NoticeResponseBody { storage }) } AUTHENTICATION_TAG => match buf.read_i32::()? { 0 => Message::AuthenticationOk, 2 => Message::AuthenticationKerberosV5, 3 => Message::AuthenticationCleartextPassword, 5 => Message::AuthenticationMd5Password, 6 => Message::AuthenticationScmCredential, 7 => Message::AuthenticationGss, 8 => Message::AuthenticationGssContinue, 9 => Message::AuthenticationSspi, 10 => { let storage = buf.read_all(); Message::AuthenticationSasl(AuthenticationSaslBody(storage)) } 11 => { let storage = buf.read_all(); Message::AuthenticationSaslContinue(AuthenticationSaslContinueBody(storage)) } 12 => { let storage = buf.read_all(); Message::AuthenticationSaslFinal(AuthenticationSaslFinalBody(storage)) } tag => { return Err(io::Error::new( io::ErrorKind::InvalidInput, format!("unknown authentication tag `{tag}`"), )); } }, PORTAL_SUSPENDED_TAG => Message::PortalSuspended, PARAMETER_STATUS_TAG => { let name = buf.read_cstr()?; let value = buf.read_cstr()?; Message::ParameterStatus(ParameterStatusBody { name, value }) } PARAMETER_DESCRIPTION_TAG => { let len = buf.read_u16::()?; let storage = buf.read_all(); Message::ParameterDescription(ParameterDescriptionBody { storage, len }) } 
ROW_DESCRIPTION_TAG => { let len = buf.read_u16::()?; let storage = buf.read_all(); Message::RowDescription(RowDescriptionBody { storage, len }) } READY_FOR_QUERY_TAG => { let status = buf.read_u8()?; Message::ReadyForQuery(ReadyForQueryBody { status }) } tag => { return Err(io::Error::new( io::ErrorKind::InvalidInput, format!("unknown message tag `{tag}`"), )); } }; if !buf.is_empty() { return Err(io::Error::new( io::ErrorKind::InvalidInput, "invalid message length: expected buffer to be empty", )); } Ok(Some(message)) } } struct Buffer { bytes: Bytes, idx: usize, } impl Buffer { #[inline] fn slice(&self) -> &[u8] { &self.bytes[self.idx..] } #[inline] fn is_empty(&self) -> bool { self.slice().is_empty() } #[inline] fn read_cstr(&mut self) -> io::Result { match memchr(0, self.slice()) { Some(pos) => { let start = self.idx; let end = start + pos; let cstr = self.bytes.slice(start..end); self.idx = end + 1; Ok(cstr) } None => Err(io::Error::new( io::ErrorKind::UnexpectedEof, "unexpected EOF", )), } } #[inline] fn read_all(&mut self) -> Bytes { let buf = self.bytes.slice(self.idx..); self.idx = self.bytes.len(); buf } } impl Read for Buffer { #[inline] fn read(&mut self, buf: &mut [u8]) -> io::Result { let len = { let slice = self.slice(); let len = cmp::min(slice.len(), buf.len()); buf[..len].copy_from_slice(&slice[..len]); len }; self.idx += len; Ok(len) } } pub struct AuthenticationMd5PasswordBody { salt: [u8; 4], } impl AuthenticationMd5PasswordBody { #[inline] pub fn salt(&self) -> [u8; 4] { self.salt } } pub struct AuthenticationSaslBody(Bytes); impl AuthenticationSaslBody { #[inline] pub fn mechanisms(&self) -> SaslMechanisms<'_> { SaslMechanisms(&self.0) } } pub struct SaslMechanisms<'a>(&'a [u8]); impl<'a> FallibleIterator for SaslMechanisms<'a> { type Item = &'a str; type Error = io::Error; #[inline] fn next(&mut self) -> io::Result> { let value_end = find_null(self.0, 0)?; if value_end == 0 { if self.0.len() != 1 { return Err(io::Error::new( 
io::ErrorKind::InvalidData, "invalid message length: expected to be at end of iterator for sasl", )); } Ok(None) } else { let value = get_str(&self.0[..value_end])?; self.0 = &self.0[value_end + 1..]; Ok(Some(value)) } } } pub struct AuthenticationSaslContinueBody(Bytes); impl AuthenticationSaslContinueBody { #[inline] pub fn data(&self) -> &[u8] { &self.0 } } pub struct AuthenticationSaslFinalBody(Bytes); impl AuthenticationSaslFinalBody { #[inline] pub fn data(&self) -> &[u8] { &self.0 } } pub struct BackendKeyDataBody { process_id: i32, secret_key: i32, } impl BackendKeyDataBody { #[inline] pub fn process_id(&self) -> i32 { self.process_id } #[inline] pub fn secret_key(&self) -> i32 { self.secret_key } } pub struct CommandCompleteBody { tag: Bytes, } impl CommandCompleteBody { #[inline] pub fn tag(&self) -> io::Result<&str> { get_str(&self.tag) } } #[derive(Debug)] pub struct DataRowBody { storage: Bytes, len: u16, } impl DataRowBody { #[inline] pub fn ranges(&self) -> DataRowRanges<'_> { DataRowRanges { buf: &self.storage, len: self.storage.len(), remaining: self.len, } } #[inline] pub fn buffer(&self) -> &[u8] { &self.storage } } pub struct DataRowRanges<'a> { buf: &'a [u8], len: usize, remaining: u16, } impl FallibleIterator for DataRowRanges<'_> { type Item = Option>; type Error = io::Error; #[inline] fn next(&mut self) -> io::Result>>> { if self.remaining == 0 { if self.buf.is_empty() { return Ok(None); } else { return Err(io::Error::new( io::ErrorKind::InvalidInput, "invalid message length: datarowrange is not empty", )); } } self.remaining -= 1; let len = self.buf.read_i32::()?; if len < 0 { Ok(Some(None)) } else { let len = len as usize; if self.buf.len() < len { return Err(io::Error::new( io::ErrorKind::UnexpectedEof, "unexpected EOF", )); } let base = self.len - self.buf.len(); self.buf = &self.buf[len..]; Ok(Some(Some(base..base + len))) } } #[inline] fn size_hint(&self) -> (usize, Option) { let len = self.remaining as usize; (len, Some(len)) } } pub 
struct ErrorResponseBody { storage: Bytes, } impl ErrorResponseBody { #[inline] pub fn fields(&self) -> ErrorFields<'_> { ErrorFields { buf: &self.storage } } } pub struct ErrorFields<'a> { buf: &'a [u8], } impl<'a> FallibleIterator for ErrorFields<'a> { type Item = ErrorField<'a>; type Error = io::Error; #[inline] fn next(&mut self) -> io::Result>> { let type_ = self.buf.read_u8()?; if type_ == 0 { if self.buf.is_empty() { return Ok(None); } else { return Err(io::Error::new( io::ErrorKind::InvalidInput, "invalid message length: error fields is not drained", )); } } let value_end = find_null(self.buf, 0)?; let value = get_str(&self.buf[..value_end])?; self.buf = &self.buf[value_end + 1..]; Ok(Some(ErrorField { type_, value })) } } pub struct ErrorField<'a> { type_: u8, value: &'a str, } impl ErrorField<'_> { #[inline] pub fn type_(&self) -> u8 { self.type_ } #[inline] pub fn value(&self) -> &str { self.value } } pub struct NoticeResponseBody { storage: Bytes, } impl NoticeResponseBody { #[inline] pub fn fields(&self) -> ErrorFields<'_> { ErrorFields { buf: &self.storage } } pub fn as_bytes(&self) -> &[u8] { &self.storage } } pub struct NotificationResponseBody {} pub struct ParameterDescriptionBody { storage: Bytes, len: u16, } impl ParameterDescriptionBody { #[inline] pub fn parameters(&self) -> Parameters<'_> { Parameters { buf: &self.storage, remaining: self.len, } } } pub struct Parameters<'a> { buf: &'a [u8], remaining: u16, } impl FallibleIterator for Parameters<'_> { type Item = Oid; type Error = io::Error; #[inline] fn next(&mut self) -> io::Result> { if self.remaining == 0 { if self.buf.is_empty() { return Ok(None); } else { return Err(io::Error::new( io::ErrorKind::InvalidInput, "invalid message length: parameters is not drained", )); } } self.remaining -= 1; self.buf.read_u32::().map(Some) } #[inline] fn size_hint(&self) -> (usize, Option) { let len = self.remaining as usize; (len, Some(len)) } } pub struct ParameterStatusBody { name: Bytes, value: 
Bytes, } impl ParameterStatusBody { #[inline] pub fn name(&self) -> io::Result<&str> { get_str(&self.name) } #[inline] pub fn value(&self) -> io::Result<&str> { get_str(&self.value) } } pub struct ReadyForQueryBody { status: u8, } impl ReadyForQueryBody { #[inline] pub fn status(&self) -> u8 { self.status } } pub struct RowDescriptionBody { storage: Bytes, len: u16, } impl RowDescriptionBody { #[inline] pub fn fields(&self) -> Fields<'_> { Fields { buf: &self.storage, remaining: self.len, } } } pub struct Fields<'a> { buf: &'a [u8], remaining: u16, } impl<'a> FallibleIterator for Fields<'a> { type Item = Field<'a>; type Error = io::Error; #[inline] fn next(&mut self) -> io::Result>> { if self.remaining == 0 { if self.buf.is_empty() { return Ok(None); } else { return Err(io::Error::new( io::ErrorKind::InvalidInput, "invalid message length: field is not drained", )); } } self.remaining -= 1; let name_end = find_null(self.buf, 0)?; let name = get_str(&self.buf[..name_end])?; self.buf = &self.buf[name_end + 1..]; let table_oid = self.buf.read_u32::()?; let column_id = self.buf.read_i16::()?; let type_oid = self.buf.read_u32::()?; let type_size = self.buf.read_i16::()?; let type_modifier = self.buf.read_i32::()?; let format = self.buf.read_i16::()?; Ok(Some(Field { name, table_oid, column_id, type_oid, type_size, type_modifier, format, })) } } pub struct Field<'a> { name: &'a str, table_oid: Oid, column_id: i16, type_oid: Oid, type_size: i16, type_modifier: i32, format: i16, } impl<'a> Field<'a> { #[inline] pub fn name(&self) -> &'a str { self.name } #[inline] pub fn table_oid(&self) -> Oid { self.table_oid } #[inline] pub fn column_id(&self) -> i16 { self.column_id } #[inline] pub fn type_oid(&self) -> Oid { self.type_oid } #[inline] pub fn type_size(&self) -> i16 { self.type_size } #[inline] pub fn type_modifier(&self) -> i32 { self.type_modifier } #[inline] pub fn format(&self) -> i16 { self.format } } #[inline] fn find_null(buf: &[u8], start: usize) -> io::Result { 
match memchr(0, &buf[start..]) { Some(pos) => Ok(pos + start), None => Err(io::Error::new( io::ErrorKind::UnexpectedEof, "unexpected EOF", )), } } #[inline] fn get_str(buf: &[u8]) -> io::Result<&str> { str::from_utf8(buf).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e)) } ================================================ FILE: libs/proxy/postgres-protocol2/src/message/frontend.rs ================================================ //! Frontend message serialization. #![allow(missing_docs)] use std::error::Error; use std::{io, marker}; use byteorder::{BigEndian, ByteOrder}; use bytes::{Buf, BufMut, BytesMut}; use crate::{FromUsize, IsNull, Oid, write_nullable}; #[inline] fn write_body(buf: &mut BytesMut, f: F) -> Result<(), E> where F: FnOnce(&mut BytesMut) -> Result<(), E>, E: From, { let base = buf.len(); buf.extend_from_slice(&[0; 4]); f(buf)?; let size = i32::from_usize(buf.len() - base)?; BigEndian::write_i32(&mut buf[base..], size); Ok(()) } #[derive(Debug)] pub enum BindError { Conversion(Box), Serialization(io::Error), } impl From> for BindError { #[inline] fn from(e: Box) -> BindError { BindError::Conversion(e) } } impl From for BindError { #[inline] fn from(e: io::Error) -> BindError { BindError::Serialization(e) } } #[inline] pub fn bind( portal: &str, statement: &str, formats: I, values: J, mut serializer: F, result_formats: K, buf: &mut BytesMut, ) -> Result<(), BindError> where I: IntoIterator, J: IntoIterator, F: FnMut(T, &mut BytesMut) -> Result>, K: IntoIterator, { buf.put_u8(b'B'); write_body(buf, |buf| { write_cstr(portal.as_bytes(), buf)?; write_cstr(statement.as_bytes(), buf)?; write_counted( formats, |f, buf| { buf.put_i16(f); Ok::<_, io::Error>(()) }, buf, )?; write_counted( values, |v, buf| write_nullable(|buf| serializer(v, buf), buf), buf, )?; write_counted( result_formats, |f, buf| { buf.put_i16(f); Ok::<_, io::Error>(()) }, buf, )?; Ok(()) }) } #[inline] fn write_counted(items: I, mut serializer: F, buf: &mut BytesMut) -> 
Result<(), E> where I: IntoIterator, F: FnMut(T, &mut BytesMut) -> Result<(), E>, E: From, { let base = buf.len(); buf.extend_from_slice(&[0; 2]); let mut count = 0; for item in items { serializer(item, buf)?; count += 1; } let count = i16::from_usize(count)?; BigEndian::write_i16(&mut buf[base..], count); Ok(()) } #[inline] pub fn cancel_request(process_id: i32, secret_key: i32, buf: &mut BytesMut) { write_body(buf, |buf| { buf.put_i32(80_877_102); buf.put_i32(process_id); buf.put_i32(secret_key); Ok::<_, io::Error>(()) }) .unwrap(); } #[inline] pub fn close(variant: u8, name: &str, buf: &mut BytesMut) -> io::Result<()> { buf.put_u8(b'C'); write_body(buf, |buf| { buf.put_u8(variant); write_cstr(name.as_bytes(), buf) }) } pub struct CopyData { buf: T, len: i32, } impl CopyData where T: Buf, { pub fn new(buf: T) -> io::Result> { let len = buf .remaining() .checked_add(4) .and_then(|l| i32::try_from(l).ok()) .ok_or_else(|| { io::Error::new(io::ErrorKind::InvalidInput, "message length overflow") })?; Ok(CopyData { buf, len }) } pub fn write(self, out: &mut BytesMut) { out.put_u8(b'd'); out.put_i32(self.len); out.put(self.buf); } } #[inline] pub fn copy_done(buf: &mut BytesMut) { buf.put_u8(b'c'); write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); } #[inline] pub fn copy_fail(message: &str, buf: &mut BytesMut) -> io::Result<()> { buf.put_u8(b'f'); write_body(buf, |buf| write_cstr(message.as_bytes(), buf)) } #[inline] pub fn describe(variant: u8, name: &str, buf: &mut BytesMut) -> io::Result<()> { buf.put_u8(b'D'); write_body(buf, |buf| { buf.put_u8(variant); write_cstr(name.as_bytes(), buf) }) } #[inline] pub fn execute(portal: &str, max_rows: i32, buf: &mut BytesMut) -> io::Result<()> { buf.put_u8(b'E'); write_body(buf, |buf| { write_cstr(portal.as_bytes(), buf)?; buf.put_i32(max_rows); Ok(()) }) } #[inline] pub fn parse(name: &str, query: &str, param_types: I, buf: &mut BytesMut) -> io::Result<()> where I: IntoIterator, { buf.put_u8(b'P'); write_body(buf, |buf| { 
write_cstr(name.as_bytes(), buf)?; write_cstr(query.as_bytes(), buf)?; write_counted( param_types, |t, buf| { buf.put_u32(t); Ok::<_, io::Error>(()) }, buf, )?; Ok(()) }) } #[inline] pub fn password_message(password: &[u8], buf: &mut BytesMut) -> io::Result<()> { buf.put_u8(b'p'); write_body(buf, |buf| write_cstr(password, buf)) } #[inline] pub fn query(query: &str, buf: &mut BytesMut) -> io::Result<()> { buf.put_u8(b'Q'); write_body(buf, |buf| write_cstr(query.as_bytes(), buf)) } #[inline] pub fn sasl_initial_response(mechanism: &str, data: &[u8], buf: &mut BytesMut) -> io::Result<()> { buf.put_u8(b'p'); write_body(buf, |buf| { write_cstr(mechanism.as_bytes(), buf)?; let len = i32::from_usize(data.len())?; buf.put_i32(len); buf.put_slice(data); Ok(()) }) } #[inline] pub fn sasl_response(data: &[u8], buf: &mut BytesMut) -> io::Result<()> { buf.put_u8(b'p'); write_body(buf, |buf| { buf.put_slice(data); Ok(()) }) } #[inline] pub fn ssl_request(buf: &mut BytesMut) { write_body(buf, |buf| { buf.put_i32(80_877_103); Ok::<_, io::Error>(()) }) .unwrap(); } #[inline] pub fn startup_message(parameters: &StartupMessageParams, buf: &mut BytesMut) -> io::Result<()> { write_body(buf, |buf| { // postgres protocol version 3.0(196608) in bigger-endian buf.put_i32(0x00_03_00_00); buf.put_slice(¶meters.params); buf.put_u8(0); Ok(()) }) } #[derive(Debug, Clone, Default, PartialEq, Eq)] pub struct StartupMessageParams { pub params: BytesMut, } impl StartupMessageParams { /// Set parameter's value by its name. 
pub fn insert(&mut self, name: &str, value: &str) { if name.contains('\0') || value.contains('\0') { panic!("startup parameter name or value contained a null") } self.params.put_slice(name.as_bytes()); self.params.put_u8(0); self.params.put_slice(value.as_bytes()); self.params.put_u8(0); } } #[inline] pub fn sync(buf: &mut BytesMut) { buf.put_u8(b'S'); write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); } #[inline] pub fn flush(buf: &mut BytesMut) { buf.put_u8(b'H'); write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); } #[inline] pub fn terminate(buf: &mut BytesMut) { buf.put_u8(b'X'); write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); } #[inline] fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> { if s.contains(&0) { return Err(io::Error::new( io::ErrorKind::InvalidInput, "string contains embedded null", )); } buf.put_slice(s); buf.put_u8(0); Ok(()) } ================================================ FILE: libs/proxy/postgres-protocol2/src/message/mod.rs ================================================ //! Postgres message protocol support. //! //! See [Postgres's documentation][docs] for more information on message flow. //! //! [docs]: https://www.postgresql.org/docs/9.5/static/protocol-flow.html pub mod backend; pub mod frontend; ================================================ FILE: libs/proxy/postgres-protocol2/src/password/mod.rs ================================================ //! Functions to encrypt a password in the client. //! //! This is intended to be used by client applications that wish to //! send commands like `ALTER USER joe PASSWORD 'pwd'`. The password //! need not be sent in cleartext if it is encrypted on the client //! side. This is good because it ensures the cleartext password won't //! end up in logs pg_stat displays, etc. 
use base64::Engine as _; use base64::prelude::BASE64_STANDARD; use hmac::{Hmac, Mac}; use rand::RngCore; use sha2::digest::FixedOutput; use sha2::{Digest, Sha256}; use crate::authentication::sasl; #[cfg(test)] mod test; const SCRAM_DEFAULT_ITERATIONS: u32 = 4096; const SCRAM_DEFAULT_SALT_LEN: usize = 16; /// Hash password using SCRAM-SHA-256 with a randomly-generated /// salt. /// /// The client may assume the returned string doesn't contain any /// special characters that would require escaping in an SQL command. pub async fn scram_sha_256(password: &[u8]) -> String { let mut salt: [u8; SCRAM_DEFAULT_SALT_LEN] = [0; SCRAM_DEFAULT_SALT_LEN]; let mut rng = rand::rng(); rng.fill_bytes(&mut salt); scram_sha_256_salt(password, salt).await } // Internal implementation of scram_sha_256 with a caller-provided // salt. This is useful for testing. pub(crate) async fn scram_sha_256_salt( password: &[u8], salt: [u8; SCRAM_DEFAULT_SALT_LEN], ) -> String { // Prepare the password, per [RFC // 4013](https://tools.ietf.org/html/rfc4013), if possible. // // Postgres treats passwords as byte strings (without embedded NUL // bytes), but SASL expects passwords to be valid UTF-8. // // Follow the behavior of libpq's PQencryptPasswordConn(), and // also the backend. If the password is not valid UTF-8, or if it // contains prohibited characters (such as non-ASCII whitespace), // just skip the SASLprep step and use the original byte // sequence. 
let prepared: Vec = match std::str::from_utf8(password) { Ok(password_str) => { match stringprep::saslprep(password_str) { Ok(p) => p.into_owned().into_bytes(), // contains invalid characters; skip saslprep Err(_) => Vec::from(password), } } // not valid UTF-8; skip saslprep Err(_) => Vec::from(password), }; // salt password let salted_password = sasl::hi(&prepared, &salt, SCRAM_DEFAULT_ITERATIONS).await; // client key let mut hmac = Hmac::::new_from_slice(&salted_password) .expect("HMAC is able to accept all key sizes"); hmac.update(b"Client Key"); let client_key = hmac.finalize().into_bytes(); // stored key let mut hash = Sha256::default(); hash.update(client_key.as_slice()); let stored_key = hash.finalize_fixed(); // server key let mut hmac = Hmac::::new_from_slice(&salted_password) .expect("HMAC is able to accept all key sizes"); hmac.update(b"Server Key"); let server_key = hmac.finalize().into_bytes(); format!( "SCRAM-SHA-256${}:{}${}:{}", SCRAM_DEFAULT_ITERATIONS, BASE64_STANDARD.encode(salt), BASE64_STANDARD.encode(stored_key), BASE64_STANDARD.encode(server_key) ) } ================================================ FILE: libs/proxy/postgres-protocol2/src/password/test.rs ================================================ use crate::password; #[tokio::test] async fn test_encrypt_scram_sha_256() { // Specify the salt to make the test deterministic. Any bytes will do. let salt: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; assert_eq!( password::scram_sha_256_salt(b"secret", salt).await, "SCRAM-SHA-256$4096:AQIDBAUGBwgJCgsMDQ4PEA==$8rrDg00OqaiWXJ7p+sCgHEIaBSHY89ZJl3mfIsf32oY=:05L1f+yZbiN8O0AnO40Og85NNRhvzTS57naKRWCcsIA=" ); } ================================================ FILE: libs/proxy/postgres-protocol2/src/types/mod.rs ================================================ //! Conversions to and from Postgres's binary format for various types. 
use std::boxed::Box as StdBox;
use std::error::Error;
use std::str;

use byteorder::{BigEndian, ReadBytesExt};
use bytes::{BufMut, BytesMut};
use fallible_iterator::FallibleIterator;

use crate::Oid;

#[cfg(test)]
mod test;

/// Serializes a `TEXT`, `VARCHAR`, `CHAR(n)`, `NAME`, or `CITEXT` value.
///
/// The string's UTF-8 bytes are appended to `buf` unchanged.
#[inline]
pub fn text_to_sql(v: &str, buf: &mut BytesMut) {
    buf.put_slice(v.as_bytes());
}

/// Deserializes a `TEXT`, `VARCHAR`, `CHAR(n)`, `NAME`, or `CITEXT` value.
///
/// Returns an error if `buf` is not valid UTF-8.
#[inline]
pub fn text_from_sql(buf: &[u8]) -> Result<&str, StdBox<dyn Error + Sync + Send>> {
    Ok(str::from_utf8(buf)?)
}

/// Deserializes a `"char"` value.
///
/// Returns an error unless `buf` is exactly one byte long.
#[inline]
pub fn char_from_sql(mut buf: &[u8]) -> Result<i8, StdBox<dyn Error + Sync + Send>> {
    let v = buf.read_i8()?;
    // Reading advanced the slice; anything left over is trailing garbage.
    if !buf.is_empty() {
        return Err("invalid buffer size".into());
    }
    Ok(v)
}

/// Serializes an `OID` value.
#[inline]
pub fn oid_to_sql(v: Oid, buf: &mut BytesMut) {
    buf.put_u32(v);
}

/// Deserializes an `OID` value.
///
/// Returns an error unless `buf` is exactly four bytes long.
#[inline]
pub fn oid_from_sql(mut buf: &[u8]) -> Result<Oid, StdBox<dyn Error + Sync + Send>> {
    let v = buf.read_u32::<BigEndian>()?;
    // Reading advanced the slice; anything left over is trailing garbage.
    if !buf.is_empty() {
        return Err("invalid buffer size".into());
    }
    Ok(v)
}

/// A fallible iterator over `HSTORE` entries.
pub struct HstoreEntries<'a> { remaining: i32, buf: &'a [u8], } impl<'a> FallibleIterator for HstoreEntries<'a> { type Item = (&'a str, Option<&'a str>); type Error = StdBox; #[inline] #[allow(clippy::type_complexity)] fn next( &mut self, ) -> Result)>, StdBox> { if self.remaining == 0 { if !self.buf.is_empty() { return Err("invalid buffer size".into()); } return Ok(None); } self.remaining -= 1; let key_len = self.buf.read_i32::()?; if key_len < 0 { return Err("invalid key length".into()); } let (key, buf) = self.buf.split_at(key_len as usize); let key = str::from_utf8(key)?; self.buf = buf; let value_len = self.buf.read_i32::()?; let value = if value_len < 0 { None } else { let (value, buf) = self.buf.split_at(value_len as usize); let value = str::from_utf8(value)?; self.buf = buf; Some(value) }; Ok(Some((key, value))) } #[inline] fn size_hint(&self) -> (usize, Option) { let len = self.remaining as usize; (len, Some(len)) } } /// Deserializes an array value. #[inline] pub fn array_from_sql(mut buf: &[u8]) -> Result, StdBox> { let dimensions = buf.read_i32::()?; if dimensions < 0 { return Err("invalid dimension count".into()); } let mut r = buf; let mut elements = 1i32; for _ in 0..dimensions { let len = r.read_i32::()?; if len < 0 { return Err("invalid dimension size".into()); } let _lower_bound = r.read_i32::()?; elements = match elements.checked_mul(len) { Some(elements) => elements, None => return Err("too many array elements".into()), }; } if dimensions == 0 { elements = 0; } Ok(Array { dimensions, elements, buf, }) } /// A Postgres array. pub struct Array<'a> { dimensions: i32, elements: i32, buf: &'a [u8], } impl<'a> Array<'a> { /// Returns an iterator over the dimensions of the array. #[inline] pub fn dimensions(&self) -> ArrayDimensions<'a> { ArrayDimensions(&self.buf[..self.dimensions as usize * 8]) } /// Returns an iterator over the values of the array. 
#[inline] pub fn values(&self) -> ArrayValues<'a> { ArrayValues { remaining: self.elements, buf: &self.buf[self.dimensions as usize * 8..], } } } /// An iterator over the dimensions of an array. pub struct ArrayDimensions<'a>(&'a [u8]); impl FallibleIterator for ArrayDimensions<'_> { type Item = ArrayDimension; type Error = StdBox; #[inline] fn next(&mut self) -> Result, StdBox> { if self.0.is_empty() { return Ok(None); } let len = self.0.read_i32::()?; let lower_bound = self.0.read_i32::()?; Ok(Some(ArrayDimension { len, lower_bound })) } #[inline] fn size_hint(&self) -> (usize, Option) { let len = self.0.len() / 8; (len, Some(len)) } } /// Information about a dimension of an array. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct ArrayDimension { /// The length of this dimension. pub len: i32, /// The base value used to index into this dimension. pub lower_bound: i32, } /// An iterator over the values of an array, in row-major order. pub struct ArrayValues<'a> { remaining: i32, buf: &'a [u8], } impl<'a> FallibleIterator for ArrayValues<'a> { type Item = Option<&'a [u8]>; type Error = StdBox; #[inline] fn next(&mut self) -> Result>, StdBox> { if self.remaining == 0 { if !self.buf.is_empty() { return Err("invalid message length: arrayvalue not drained".into()); } return Ok(None); } self.remaining -= 1; let len = self.buf.read_i32::()?; let val = if len < 0 { None } else { if self.buf.len() < len as usize { return Err("invalid value length".into()); } let (val, buf) = self.buf.split_at(len as usize); self.buf = buf; Some(val) }; Ok(Some(val)) } fn size_hint(&self) -> (usize, Option) { let len = self.remaining as usize; (len, Some(len)) } } /// Serializes a Postgres ltree string #[inline] pub fn ltree_to_sql(v: &str, buf: &mut BytesMut) { // A version number is prepended to an ltree string per spec buf.put_u8(1); // Append the rest of the query buf.put_slice(v.as_bytes()); } /// Deserialize a Postgres ltree string #[inline] pub fn ltree_from_sql(buf: &[u8]) -> 
Result<&str, StdBox> { match buf { // Remove the version number from the front of the ltree per spec [1u8, rest @ ..] => Ok(str::from_utf8(rest)?), _ => Err("ltree version 1 only supported".into()), } } /// Serializes a Postgres lquery string #[inline] pub fn lquery_to_sql(v: &str, buf: &mut BytesMut) { // A version number is prepended to an lquery string per spec buf.put_u8(1); // Append the rest of the query buf.put_slice(v.as_bytes()); } /// Deserialize a Postgres lquery string #[inline] pub fn lquery_from_sql(buf: &[u8]) -> Result<&str, StdBox> { match buf { // Remove the version number from the front of the lquery per spec [1u8, rest @ ..] => Ok(str::from_utf8(rest)?), _ => Err("lquery version 1 only supported".into()), } } /// Serializes a Postgres ltxtquery string #[inline] pub fn ltxtquery_to_sql(v: &str, buf: &mut BytesMut) { // A version number is prepended to an ltxtquery string per spec buf.put_u8(1); // Append the rest of the query buf.put_slice(v.as_bytes()); } /// Deserialize a Postgres ltxtquery string #[inline] pub fn ltxtquery_from_sql(buf: &[u8]) -> Result<&str, StdBox> { match buf { // Remove the version number from the front of the ltxtquery per spec [1u8, rest @ ..] 
=> Ok(str::from_utf8(rest)?), _ => Err("ltxtquery version 1 only supported".into()), } } ================================================ FILE: libs/proxy/postgres-protocol2/src/types/test.rs ================================================ use bytes::{Buf, BytesMut}; use super::*; #[test] fn ltree_sql() { let mut query = vec![1u8]; query.extend_from_slice("A.B.C".as_bytes()); let mut buf = BytesMut::new(); ltree_to_sql("A.B.C", &mut buf); assert_eq!(query.as_slice(), buf.chunk()); } #[test] fn ltree_str() { let mut query = vec![1u8]; query.extend_from_slice("A.B.C".as_bytes()); assert!(ltree_from_sql(query.as_slice()).is_ok()) } #[test] fn ltree_wrong_version() { let mut query = vec![2u8]; query.extend_from_slice("A.B.C".as_bytes()); assert!(ltree_from_sql(query.as_slice()).is_err()) } #[test] fn lquery_sql() { let mut query = vec![1u8]; query.extend_from_slice("A.B.C".as_bytes()); let mut buf = BytesMut::new(); lquery_to_sql("A.B.C", &mut buf); assert_eq!(query.as_slice(), buf.chunk()); } #[test] fn lquery_str() { let mut query = vec![1u8]; query.extend_from_slice("A.B.C".as_bytes()); assert!(lquery_from_sql(query.as_slice()).is_ok()) } #[test] fn lquery_wrong_version() { let mut query = vec![2u8]; query.extend_from_slice("A.B.C".as_bytes()); assert!(lquery_from_sql(query.as_slice()).is_err()) } #[test] fn ltxtquery_sql() { let mut query = vec![1u8]; query.extend_from_slice("a & b*".as_bytes()); let mut buf = BytesMut::new(); ltree_to_sql("a & b*", &mut buf); assert_eq!(query.as_slice(), buf.chunk()); } #[test] fn ltxtquery_str() { let mut query = vec![1u8]; query.extend_from_slice("a & b*".as_bytes()); assert!(ltree_from_sql(query.as_slice()).is_ok()) } #[test] fn ltxtquery_wrong_version() { let mut query = vec![2u8]; query.extend_from_slice("a & b*".as_bytes()); assert!(ltree_from_sql(query.as_slice()).is_err()) } ================================================ FILE: libs/proxy/postgres-types2/Cargo.toml ================================================ 
[package] name = "postgres-types2" version = "0.1.0" edition = "2024" license = "MIT/Apache-2.0" [dependencies] bytes.workspace = true fallible-iterator.workspace = true postgres-protocol2 = { path = "../postgres-protocol2" } ================================================ FILE: libs/proxy/postgres-types2/src/lib.rs ================================================ //! Conversions to and from Postgres types. //! //! This crate is used by the `tokio-postgres` and `postgres` crates. You normally don't need to depend directly on it //! unless you want to define your own `ToSql` or `FromSql` definitions. #![warn(clippy::all, missing_docs)] use std::any::type_name; use std::error::Error; use std::fmt; use std::sync::Arc; use fallible_iterator::FallibleIterator; #[doc(inline)] pub use postgres_protocol2::Oid; use postgres_protocol2::types; use crate::type_gen::{Inner, Other}; /// Generates a simple implementation of `ToSql::accepts` which accepts the /// types passed to it. macro_rules! accepts { ($($expected:ident),+) => ( fn accepts(ty: &$crate::Type) -> bool { matches!(*ty, $($crate::Type::$expected)|+) } ) } // mod pg_lsn; #[doc(hidden)] pub mod private; // mod special; mod type_gen; /// A Postgres type. #[derive(PartialEq, Eq, Clone, Hash)] pub struct Type(Inner); impl fmt::Debug for Type { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Debug::fmt(&self.0, fmt) } } impl fmt::Display for Type { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match self.schema() { "public" | "pg_catalog" => {} schema => write!(fmt, "{schema}.")?, } fmt.write_str(self.name()) } } impl Type { /// Creates a new `Type`. pub fn new(name: String, oid: Oid, kind: Kind, schema: String) -> Type { Type(Inner::Other(Arc::new(Other { name, oid, kind, schema, }))) } /// Returns the `Type` corresponding to the provided `Oid` if it /// corresponds to a built-in type. 
pub fn from_oid(oid: Oid) -> Option { Inner::from_oid(oid).map(Type) } /// Returns the OID of the `Type`. pub fn oid(&self) -> Oid { self.0.oid() } /// Returns the kind of this type. pub fn kind(&self) -> &Kind { self.0.kind() } /// Returns the schema of this type. pub fn schema(&self) -> &str { match self.0 { Inner::Other(ref u) => &u.schema, _ => "pg_catalog", } } /// Returns the name of this type. pub fn name(&self) -> &str { self.0.name() } } /// Represents the kind of a Postgres type. #[derive(Debug, Clone, PartialEq, Eq, Hash)] #[non_exhaustive] pub enum Kind { /// A simple type like `VARCHAR` or `INTEGER`. Simple, /// An enumerated type. Enum, /// A pseudo-type. Pseudo, /// An array type along with the type of its elements. Array(Type), /// A range type along with the type of its elements. Range(Oid), /// A multirange type along with the type of its elements. Multirange(Type), /// A domain type along with its underlying type. Domain(Oid), /// A composite type. Composite(Oid), } /// Information about a field of a composite type. #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Field { name: String, type_: Type, } impl Field { /// Creates a new `Field`. pub fn new(name: String, type_: Type) -> Field { Field { name, type_ } } /// Returns the name of the field. pub fn name(&self) -> &str { &self.name } /// Returns the type of the field. pub fn type_(&self) -> &Type { &self.type_ } } /// An error indicating that a `NULL` Postgres value was passed to a `FromSql` /// implementation that does not support `NULL` values. #[derive(Debug, Clone, Copy)] pub struct WasNull; impl fmt::Display for WasNull { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { fmt.write_str("a Postgres value was `NULL`") } } impl Error for WasNull {} /// An error indicating that a conversion was attempted between incompatible /// Rust and Postgres types. 
#[derive(Debug)] pub struct WrongType { postgres: Type, rust: &'static str, } impl fmt::Display for WrongType { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { write!( fmt, "cannot convert between the Rust type `{}` and the Postgres type `{}`", self.rust, self.postgres, ) } } impl Error for WrongType {} impl WrongType { /// Creates a new `WrongType` error. pub fn new(ty: Type) -> WrongType { WrongType { postgres: ty, rust: type_name::(), } } } /// An error indicating that a as_text conversion was attempted on a binary /// result. #[derive(Debug)] pub struct WrongFormat {} impl Error for WrongFormat {} impl fmt::Display for WrongFormat { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { write!( fmt, "cannot read column as text while it is in binary format" ) } } /// A trait for types that can be created from a Postgres value. pub trait FromSql<'a>: Sized { /// Creates a new value of this type from a buffer of data of the specified /// Postgres `Type` in its binary format. /// /// The caller of this method is responsible for ensuring that this type /// is compatible with the Postgres `Type`. fn from_sql(ty: &Type, raw: &'a [u8]) -> Result>; /// Creates a new value of this type from a `NULL` SQL value. /// /// The caller of this method is responsible for ensuring that this type /// is compatible with the Postgres `Type`. /// /// The default implementation returns `Err(Box::new(WasNull))`. #[allow(unused_variables)] fn from_sql_null(ty: &Type) -> Result> { Err(Box::new(WasNull)) } /// A convenience function that delegates to `from_sql` and `from_sql_null` depending on the /// value of `raw`. fn from_sql_nullable( ty: &Type, raw: Option<&'a [u8]>, ) -> Result> { match raw { Some(raw) => Self::from_sql(ty, raw), None => Self::from_sql_null(ty), } } /// Determines if a value of this type can be created from the specified /// Postgres `Type`. 
fn accepts(ty: &Type) -> bool; } /// A trait for types which can be created from a Postgres value without borrowing any data. /// /// This is primarily useful for trait bounds on functions. pub trait FromSqlOwned: for<'a> FromSql<'a> {} impl FromSqlOwned for T where T: for<'a> FromSql<'a> {} impl<'a, T: FromSql<'a>> FromSql<'a> for Option { fn from_sql(ty: &Type, raw: &'a [u8]) -> Result, Box> { ::from_sql(ty, raw).map(Some) } fn from_sql_null(_: &Type) -> Result, Box> { Ok(None) } fn accepts(ty: &Type) -> bool { ::accepts(ty) } } impl<'a, T: FromSql<'a>> FromSql<'a> for Vec { fn from_sql(ty: &Type, raw: &'a [u8]) -> Result, Box> { let member_type = match *ty.kind() { Kind::Array(ref member) => member, _ => panic!("expected array type"), }; let array = types::array_from_sql(raw)?; if array.dimensions().count()? > 1 { return Err("array contains too many dimensions".into()); } array .values() .map(|v| T::from_sql_nullable(member_type, v)) .collect() } fn accepts(ty: &Type) -> bool { match *ty.kind() { Kind::Array(ref inner) => T::accepts(inner), _ => false, } } } impl<'a> FromSql<'a> for String { fn from_sql(ty: &Type, raw: &'a [u8]) -> Result> { <&str as FromSql>::from_sql(ty, raw).map(ToString::to_string) } fn accepts(ty: &Type) -> bool { <&str as FromSql>::accepts(ty) } } impl<'a> FromSql<'a> for &'a str { fn from_sql(ty: &Type, raw: &'a [u8]) -> Result<&'a str, Box> { match *ty { ref ty if ty.name() == "ltree" => types::ltree_from_sql(raw), ref ty if ty.name() == "lquery" => types::lquery_from_sql(raw), ref ty if ty.name() == "ltxtquery" => types::ltxtquery_from_sql(raw), _ => types::text_from_sql(raw), } } fn accepts(ty: &Type) -> bool { match *ty { Type::VARCHAR | Type::TEXT | Type::BPCHAR | Type::NAME | Type::UNKNOWN => true, ref ty if (ty.name() == "citext" || ty.name() == "ltree" || ty.name() == "lquery" || ty.name() == "ltxtquery") => { true } _ => false, } } } macro_rules! 
simple_from { ($t:ty, $f:ident, $($expected:ident),+) => { impl<'a> FromSql<'a> for $t { fn from_sql(_: &Type, raw: &'a [u8]) -> Result<$t, Box> { types::$f(raw) } accepts!($($expected),+); } } } simple_from!(i8, char_from_sql, CHAR); simple_from!(u32, oid_from_sql, OID); /// An enum representing the nullability of a Postgres value. pub enum IsNull { /// The value is NULL. Yes, /// The value is not NULL. No, } /// Supported Postgres message format types /// /// Using Text format in a message assumes a Postgres `SERVER_ENCODING` of `UTF8` #[derive(Clone, Copy, Debug, PartialEq)] pub enum Format { /// Text format (UTF-8) Text, /// Compact, typed binary format Binary, } ================================================ FILE: libs/proxy/postgres-types2/src/private.rs ================================================ use std::error::Error; pub use bytes::BytesMut; use crate::{FromSql, Type}; pub fn read_be_i32(buf: &mut &[u8]) -> Result> { if buf.len() < 4 { return Err("invalid buffer size".into()); } let mut bytes = [0; 4]; bytes.copy_from_slice(&buf[..4]); *buf = &buf[4..]; Ok(i32::from_be_bytes(bytes)) } pub fn read_value<'a, T>( type_: &Type, buf: &mut &'a [u8], ) -> Result> where T: FromSql<'a>, { let len = read_be_i32(buf)?; let value = if len < 0 { None } else { if len as usize > buf.len() { return Err("invalid buffer size".into()); } let (head, tail) = buf.split_at(len as usize); *buf = tail; Some(head) }; T::from_sql_nullable(type_, value) } ================================================ FILE: libs/proxy/postgres-types2/src/type_gen.rs ================================================ // Autogenerated file - DO NOT EDIT use std::sync::Arc; use crate::{Kind, Oid, Type}; #[derive(PartialEq, Eq, Debug, Hash)] pub struct Other { pub name: String, pub oid: Oid, pub kind: Kind, pub schema: String, } #[derive(PartialEq, Eq, Clone, Debug, Hash)] pub enum Inner { Bool, Bytea, Char, Name, Int8, Int2, Int2Vector, Int4, Regproc, Text, Oid, Tid, Xid, Cid, OidVector, 
PgDdlCommand, Json, Xml, XmlArray, PgNodeTree, JsonArray, TableAmHandler, Xid8Array, IndexAmHandler, Point, Lseg, Path, Box, Polygon, Line, LineArray, Cidr, CidrArray, Float4, Float8, Unknown, Circle, CircleArray, Macaddr8, Macaddr8Array, Money, MoneyArray, Macaddr, Inet, BoolArray, ByteaArray, CharArray, NameArray, Int2Array, Int2VectorArray, Int4Array, RegprocArray, TextArray, TidArray, XidArray, CidArray, OidVectorArray, BpcharArray, VarcharArray, Int8Array, PointArray, LsegArray, PathArray, BoxArray, Float4Array, Float8Array, PolygonArray, OidArray, Aclitem, AclitemArray, MacaddrArray, InetArray, Bpchar, Varchar, Date, Time, Timestamp, TimestampArray, DateArray, TimeArray, Timestamptz, TimestamptzArray, Interval, IntervalArray, NumericArray, CstringArray, Timetz, TimetzArray, Bit, BitArray, Varbit, VarbitArray, Numeric, Refcursor, RefcursorArray, Regprocedure, Regoper, Regoperator, Regclass, Regtype, RegprocedureArray, RegoperArray, RegoperatorArray, RegclassArray, RegtypeArray, Record, Cstring, Any, Anyarray, Void, Trigger, LanguageHandler, Internal, Anyelement, RecordArray, Anynonarray, TxidSnapshotArray, Uuid, UuidArray, TxidSnapshot, FdwHandler, PgLsn, PgLsnArray, TsmHandler, PgNdistinct, PgDependencies, Anyenum, TsVector, Tsquery, GtsVector, TsVectorArray, GtsVectorArray, TsqueryArray, Regconfig, RegconfigArray, Regdictionary, RegdictionaryArray, Jsonb, JsonbArray, AnyRange, EventTrigger, Int4Range, Int4RangeArray, NumRange, NumRangeArray, TsRange, TsRangeArray, TstzRange, TstzRangeArray, DateRange, DateRangeArray, Int8Range, Int8RangeArray, Jsonpath, JsonpathArray, Regnamespace, RegnamespaceArray, Regrole, RegroleArray, Regcollation, RegcollationArray, Int4multiRange, NummultiRange, TsmultiRange, TstzmultiRange, DatemultiRange, Int8multiRange, AnymultiRange, AnycompatiblemultiRange, PgBrinBloomSummary, PgBrinMinmaxMultiSummary, PgMcvList, PgSnapshot, PgSnapshotArray, Xid8, Anycompatible, Anycompatiblearray, Anycompatiblenonarray, AnycompatibleRange, 
Int4multiRangeArray, NummultiRangeArray, TsmultiRangeArray, TstzmultiRangeArray, DatemultiRangeArray, Int8multiRangeArray, Other(Arc), } impl Inner { pub fn from_oid(oid: Oid) -> Option { match oid { 16 => Some(Inner::Bool), 17 => Some(Inner::Bytea), 18 => Some(Inner::Char), 19 => Some(Inner::Name), 20 => Some(Inner::Int8), 21 => Some(Inner::Int2), 22 => Some(Inner::Int2Vector), 23 => Some(Inner::Int4), 24 => Some(Inner::Regproc), 25 => Some(Inner::Text), 26 => Some(Inner::Oid), 27 => Some(Inner::Tid), 28 => Some(Inner::Xid), 29 => Some(Inner::Cid), 30 => Some(Inner::OidVector), 32 => Some(Inner::PgDdlCommand), 114 => Some(Inner::Json), 142 => Some(Inner::Xml), 143 => Some(Inner::XmlArray), 194 => Some(Inner::PgNodeTree), 199 => Some(Inner::JsonArray), 269 => Some(Inner::TableAmHandler), 271 => Some(Inner::Xid8Array), 325 => Some(Inner::IndexAmHandler), 600 => Some(Inner::Point), 601 => Some(Inner::Lseg), 602 => Some(Inner::Path), 603 => Some(Inner::Box), 604 => Some(Inner::Polygon), 628 => Some(Inner::Line), 629 => Some(Inner::LineArray), 650 => Some(Inner::Cidr), 651 => Some(Inner::CidrArray), 700 => Some(Inner::Float4), 701 => Some(Inner::Float8), 705 => Some(Inner::Unknown), 718 => Some(Inner::Circle), 719 => Some(Inner::CircleArray), 774 => Some(Inner::Macaddr8), 775 => Some(Inner::Macaddr8Array), 790 => Some(Inner::Money), 791 => Some(Inner::MoneyArray), 829 => Some(Inner::Macaddr), 869 => Some(Inner::Inet), 1000 => Some(Inner::BoolArray), 1001 => Some(Inner::ByteaArray), 1002 => Some(Inner::CharArray), 1003 => Some(Inner::NameArray), 1005 => Some(Inner::Int2Array), 1006 => Some(Inner::Int2VectorArray), 1007 => Some(Inner::Int4Array), 1008 => Some(Inner::RegprocArray), 1009 => Some(Inner::TextArray), 1010 => Some(Inner::TidArray), 1011 => Some(Inner::XidArray), 1012 => Some(Inner::CidArray), 1013 => Some(Inner::OidVectorArray), 1014 => Some(Inner::BpcharArray), 1015 => Some(Inner::VarcharArray), 1016 => Some(Inner::Int8Array), 1017 => Some(Inner::PointArray), 
1018 => Some(Inner::LsegArray), 1019 => Some(Inner::PathArray), 1020 => Some(Inner::BoxArray), 1021 => Some(Inner::Float4Array), 1022 => Some(Inner::Float8Array), 1027 => Some(Inner::PolygonArray), 1028 => Some(Inner::OidArray), 1033 => Some(Inner::Aclitem), 1034 => Some(Inner::AclitemArray), 1040 => Some(Inner::MacaddrArray), 1041 => Some(Inner::InetArray), 1042 => Some(Inner::Bpchar), 1043 => Some(Inner::Varchar), 1082 => Some(Inner::Date), 1083 => Some(Inner::Time), 1114 => Some(Inner::Timestamp), 1115 => Some(Inner::TimestampArray), 1182 => Some(Inner::DateArray), 1183 => Some(Inner::TimeArray), 1184 => Some(Inner::Timestamptz), 1185 => Some(Inner::TimestamptzArray), 1186 => Some(Inner::Interval), 1187 => Some(Inner::IntervalArray), 1231 => Some(Inner::NumericArray), 1263 => Some(Inner::CstringArray), 1266 => Some(Inner::Timetz), 1270 => Some(Inner::TimetzArray), 1560 => Some(Inner::Bit), 1561 => Some(Inner::BitArray), 1562 => Some(Inner::Varbit), 1563 => Some(Inner::VarbitArray), 1700 => Some(Inner::Numeric), 1790 => Some(Inner::Refcursor), 2201 => Some(Inner::RefcursorArray), 2202 => Some(Inner::Regprocedure), 2203 => Some(Inner::Regoper), 2204 => Some(Inner::Regoperator), 2205 => Some(Inner::Regclass), 2206 => Some(Inner::Regtype), 2207 => Some(Inner::RegprocedureArray), 2208 => Some(Inner::RegoperArray), 2209 => Some(Inner::RegoperatorArray), 2210 => Some(Inner::RegclassArray), 2211 => Some(Inner::RegtypeArray), 2249 => Some(Inner::Record), 2275 => Some(Inner::Cstring), 2276 => Some(Inner::Any), 2277 => Some(Inner::Anyarray), 2278 => Some(Inner::Void), 2279 => Some(Inner::Trigger), 2280 => Some(Inner::LanguageHandler), 2281 => Some(Inner::Internal), 2283 => Some(Inner::Anyelement), 2287 => Some(Inner::RecordArray), 2776 => Some(Inner::Anynonarray), 2949 => Some(Inner::TxidSnapshotArray), 2950 => Some(Inner::Uuid), 2951 => Some(Inner::UuidArray), 2970 => Some(Inner::TxidSnapshot), 3115 => Some(Inner::FdwHandler), 3220 => Some(Inner::PgLsn), 3221 => 
Some(Inner::PgLsnArray), 3310 => Some(Inner::TsmHandler), 3361 => Some(Inner::PgNdistinct), 3402 => Some(Inner::PgDependencies), 3500 => Some(Inner::Anyenum), 3614 => Some(Inner::TsVector), 3615 => Some(Inner::Tsquery), 3642 => Some(Inner::GtsVector), 3643 => Some(Inner::TsVectorArray), 3644 => Some(Inner::GtsVectorArray), 3645 => Some(Inner::TsqueryArray), 3734 => Some(Inner::Regconfig), 3735 => Some(Inner::RegconfigArray), 3769 => Some(Inner::Regdictionary), 3770 => Some(Inner::RegdictionaryArray), 3802 => Some(Inner::Jsonb), 3807 => Some(Inner::JsonbArray), 3831 => Some(Inner::AnyRange), 3838 => Some(Inner::EventTrigger), 3904 => Some(Inner::Int4Range), 3905 => Some(Inner::Int4RangeArray), 3906 => Some(Inner::NumRange), 3907 => Some(Inner::NumRangeArray), 3908 => Some(Inner::TsRange), 3909 => Some(Inner::TsRangeArray), 3910 => Some(Inner::TstzRange), 3911 => Some(Inner::TstzRangeArray), 3912 => Some(Inner::DateRange), 3913 => Some(Inner::DateRangeArray), 3926 => Some(Inner::Int8Range), 3927 => Some(Inner::Int8RangeArray), 4072 => Some(Inner::Jsonpath), 4073 => Some(Inner::JsonpathArray), 4089 => Some(Inner::Regnamespace), 4090 => Some(Inner::RegnamespaceArray), 4096 => Some(Inner::Regrole), 4097 => Some(Inner::RegroleArray), 4191 => Some(Inner::Regcollation), 4192 => Some(Inner::RegcollationArray), 4451 => Some(Inner::Int4multiRange), 4532 => Some(Inner::NummultiRange), 4533 => Some(Inner::TsmultiRange), 4534 => Some(Inner::TstzmultiRange), 4535 => Some(Inner::DatemultiRange), 4536 => Some(Inner::Int8multiRange), 4537 => Some(Inner::AnymultiRange), 4538 => Some(Inner::AnycompatiblemultiRange), 4600 => Some(Inner::PgBrinBloomSummary), 4601 => Some(Inner::PgBrinMinmaxMultiSummary), 5017 => Some(Inner::PgMcvList), 5038 => Some(Inner::PgSnapshot), 5039 => Some(Inner::PgSnapshotArray), 5069 => Some(Inner::Xid8), 5077 => Some(Inner::Anycompatible), 5078 => Some(Inner::Anycompatiblearray), 5079 => Some(Inner::Anycompatiblenonarray), 5080 => 
Some(Inner::AnycompatibleRange),
            6150 => Some(Inner::Int4multiRangeArray),
            6151 => Some(Inner::NummultiRangeArray),
            6152 => Some(Inner::TsmultiRangeArray),
            6153 => Some(Inner::TstzmultiRangeArray),
            6155 => Some(Inner::DatemultiRangeArray),
            6157 => Some(Inner::Int8multiRangeArray),
            _ => None,
        }
    }

    /// Returns the fixed OID associated with a built-in variant.
    ///
    /// This is a `const fn`, so it can be evaluated at compile time (the
    /// range arms of `kind` call it inside `const { .. }` blocks).
    ///
    /// The `Other` variant has no compile-time OID: this function yields
    /// `u32::MAX` for it instead of reading its payload. Use `oid` when the
    /// OID carried by an `Other` value is needed.
    pub const fn const_oid(&self) -> Oid {
        match *self {
            Inner::Bool => 16,
            Inner::Bytea => 17,
            Inner::Char => 18,
            Inner::Name => 19,
            Inner::Int8 => 20,
            Inner::Int2 => 21,
            Inner::Int2Vector => 22,
            Inner::Int4 => 23,
            Inner::Regproc => 24,
            Inner::Text => 25,
            Inner::Oid => 26,
            Inner::Tid => 27,
            Inner::Xid => 28,
            Inner::Cid => 29,
            Inner::OidVector => 30,
            Inner::PgDdlCommand => 32,
            Inner::Json => 114,
            Inner::Xml => 142,
            Inner::XmlArray => 143,
            Inner::PgNodeTree => 194,
            Inner::JsonArray => 199,
            Inner::TableAmHandler => 269,
            Inner::Xid8Array => 271,
            Inner::IndexAmHandler => 325,
            Inner::Point => 600,
            Inner::Lseg => 601,
            Inner::Path => 602,
            Inner::Box => 603,
            Inner::Polygon => 604,
            Inner::Line => 628,
            Inner::LineArray => 629,
            Inner::Cidr => 650,
            Inner::CidrArray => 651,
            Inner::Float4 => 700,
            Inner::Float8 => 701,
            Inner::Unknown => 705,
            Inner::Circle => 718,
            Inner::CircleArray => 719,
            Inner::Macaddr8 => 774,
            Inner::Macaddr8Array => 775,
            Inner::Money => 790,
            Inner::MoneyArray => 791,
            Inner::Macaddr => 829,
            Inner::Inet => 869,
            Inner::BoolArray => 1000,
            Inner::ByteaArray => 1001,
            Inner::CharArray => 1002,
            Inner::NameArray => 1003,
            Inner::Int2Array => 1005,
            Inner::Int2VectorArray => 1006,
            Inner::Int4Array => 1007,
            Inner::RegprocArray => 1008,
            Inner::TextArray => 1009,
            Inner::TidArray => 1010,
            Inner::XidArray => 1011,
            Inner::CidArray => 1012,
            Inner::OidVectorArray => 1013,
            Inner::BpcharArray => 1014,
            Inner::VarcharArray => 1015,
            Inner::Int8Array => 1016,
            Inner::PointArray => 1017,
            Inner::LsegArray => 1018,
            Inner::PathArray => 1019,
            Inner::BoxArray => 1020,
            Inner::Float4Array => 1021,
            Inner::Float8Array => 1022,
            Inner::PolygonArray => 1027,
            Inner::OidArray => 1028,
            Inner::Aclitem => 1033,
Inner::AclitemArray => 1034, Inner::MacaddrArray => 1040, Inner::InetArray => 1041, Inner::Bpchar => 1042, Inner::Varchar => 1043, Inner::Date => 1082, Inner::Time => 1083, Inner::Timestamp => 1114, Inner::TimestampArray => 1115, Inner::DateArray => 1182, Inner::TimeArray => 1183, Inner::Timestamptz => 1184, Inner::TimestamptzArray => 1185, Inner::Interval => 1186, Inner::IntervalArray => 1187, Inner::NumericArray => 1231, Inner::CstringArray => 1263, Inner::Timetz => 1266, Inner::TimetzArray => 1270, Inner::Bit => 1560, Inner::BitArray => 1561, Inner::Varbit => 1562, Inner::VarbitArray => 1563, Inner::Numeric => 1700, Inner::Refcursor => 1790, Inner::RefcursorArray => 2201, Inner::Regprocedure => 2202, Inner::Regoper => 2203, Inner::Regoperator => 2204, Inner::Regclass => 2205, Inner::Regtype => 2206, Inner::RegprocedureArray => 2207, Inner::RegoperArray => 2208, Inner::RegoperatorArray => 2209, Inner::RegclassArray => 2210, Inner::RegtypeArray => 2211, Inner::Record => 2249, Inner::Cstring => 2275, Inner::Any => 2276, Inner::Anyarray => 2277, Inner::Void => 2278, Inner::Trigger => 2279, Inner::LanguageHandler => 2280, Inner::Internal => 2281, Inner::Anyelement => 2283, Inner::RecordArray => 2287, Inner::Anynonarray => 2776, Inner::TxidSnapshotArray => 2949, Inner::Uuid => 2950, Inner::UuidArray => 2951, Inner::TxidSnapshot => 2970, Inner::FdwHandler => 3115, Inner::PgLsn => 3220, Inner::PgLsnArray => 3221, Inner::TsmHandler => 3310, Inner::PgNdistinct => 3361, Inner::PgDependencies => 3402, Inner::Anyenum => 3500, Inner::TsVector => 3614, Inner::Tsquery => 3615, Inner::GtsVector => 3642, Inner::TsVectorArray => 3643, Inner::GtsVectorArray => 3644, Inner::TsqueryArray => 3645, Inner::Regconfig => 3734, Inner::RegconfigArray => 3735, Inner::Regdictionary => 3769, Inner::RegdictionaryArray => 3770, Inner::Jsonb => 3802, Inner::JsonbArray => 3807, Inner::AnyRange => 3831, Inner::EventTrigger => 3838, Inner::Int4Range => 3904, Inner::Int4RangeArray => 3905, 
Inner::NumRange => 3906,
            Inner::NumRangeArray => 3907,
            Inner::TsRange => 3908,
            Inner::TsRangeArray => 3909,
            Inner::TstzRange => 3910,
            Inner::TstzRangeArray => 3911,
            Inner::DateRange => 3912,
            Inner::DateRangeArray => 3913,
            Inner::Int8Range => 3926,
            Inner::Int8RangeArray => 3927,
            Inner::Jsonpath => 4072,
            Inner::JsonpathArray => 4073,
            Inner::Regnamespace => 4089,
            Inner::RegnamespaceArray => 4090,
            Inner::Regrole => 4096,
            Inner::RegroleArray => 4097,
            Inner::Regcollation => 4191,
            Inner::RegcollationArray => 4192,
            Inner::Int4multiRange => 4451,
            Inner::NummultiRange => 4532,
            Inner::TsmultiRange => 4533,
            Inner::TstzmultiRange => 4534,
            Inner::DatemultiRange => 4535,
            Inner::Int8multiRange => 4536,
            Inner::AnymultiRange => 4537,
            Inner::AnycompatiblemultiRange => 4538,
            Inner::PgBrinBloomSummary => 4600,
            Inner::PgBrinMinmaxMultiSummary => 4601,
            Inner::PgMcvList => 5017,
            Inner::PgSnapshot => 5038,
            Inner::PgSnapshotArray => 5039,
            Inner::Xid8 => 5069,
            Inner::Anycompatible => 5077,
            Inner::Anycompatiblearray => 5078,
            Inner::Anycompatiblenonarray => 5079,
            Inner::AnycompatibleRange => 5080,
            Inner::Int4multiRangeArray => 6150,
            Inner::NummultiRangeArray => 6151,
            Inner::TsmultiRangeArray => 6152,
            Inner::TstzmultiRangeArray => 6153,
            Inner::DatemultiRangeArray => 6155,
            Inner::Int8multiRangeArray => 6157,
            Inner::Other(_) => u32::MAX,
        }
    }

    /// Returns the OID of this type.
    ///
    /// An `Other` value reports the `oid` stored in its payload; every other
    /// variant defers to the compile-time table in `const_oid`.
    pub fn oid(&self) -> Oid {
        if let Inner::Other(custom) = self {
            custom.oid
        } else {
            self.const_oid()
        }
    }

    /// Returns a reference to the `Kind` describing this type.
    ///
    /// Built-in variants map to a fixed `Kind` value; an `Other` value
    /// returns a reference to the `kind` stored in its payload.
    pub fn kind(&self) -> &Kind {
        match *self {
            Inner::Bool => &Kind::Simple,
            Inner::Bytea => &Kind::Simple,
            Inner::Char => &Kind::Simple,
            Inner::Name => &Kind::Simple,
            Inner::Int8 => &Kind::Simple,
            Inner::Int2 => &Kind::Simple,
            Inner::Int2Vector => &Kind::Array(Type(Inner::Int2)),
            Inner::Int4 => &Kind::Simple,
            Inner::Regproc => &Kind::Simple,
            Inner::Text => &Kind::Simple,
            Inner::Oid => &Kind::Simple,
            Inner::Tid => &Kind::Simple,
            Inner::Xid => &Kind::Simple,
            Inner::Cid => &Kind::Simple,
            Inner::OidVector => &Kind::Array(Type(Inner::Oid)),
            Inner::PgDdlCommand =>
&Kind::Pseudo, Inner::Json => &Kind::Simple, Inner::Xml => &Kind::Simple, Inner::XmlArray => &Kind::Array(Type(Inner::Xml)), Inner::PgNodeTree => &Kind::Simple, Inner::JsonArray => &Kind::Array(Type(Inner::Json)), Inner::TableAmHandler => &Kind::Pseudo, Inner::Xid8Array => &Kind::Array(Type(Inner::Xid8)), Inner::IndexAmHandler => &Kind::Pseudo, Inner::Point => &Kind::Simple, Inner::Lseg => &Kind::Simple, Inner::Path => &Kind::Simple, Inner::Box => &Kind::Simple, Inner::Polygon => &Kind::Simple, Inner::Line => &Kind::Simple, Inner::LineArray => &Kind::Array(Type(Inner::Line)), Inner::Cidr => &Kind::Simple, Inner::CidrArray => &Kind::Array(Type(Inner::Cidr)), Inner::Float4 => &Kind::Simple, Inner::Float8 => &Kind::Simple, Inner::Unknown => &Kind::Simple, Inner::Circle => &Kind::Simple, Inner::CircleArray => &Kind::Array(Type(Inner::Circle)), Inner::Macaddr8 => &Kind::Simple, Inner::Macaddr8Array => &Kind::Array(Type(Inner::Macaddr8)), Inner::Money => &Kind::Simple, Inner::MoneyArray => &Kind::Array(Type(Inner::Money)), Inner::Macaddr => &Kind::Simple, Inner::Inet => &Kind::Simple, Inner::BoolArray => &Kind::Array(Type(Inner::Bool)), Inner::ByteaArray => &Kind::Array(Type(Inner::Bytea)), Inner::CharArray => &Kind::Array(Type(Inner::Char)), Inner::NameArray => &Kind::Array(Type(Inner::Name)), Inner::Int2Array => &Kind::Array(Type(Inner::Int2)), Inner::Int2VectorArray => &Kind::Array(Type(Inner::Int2Vector)), Inner::Int4Array => &Kind::Array(Type(Inner::Int4)), Inner::RegprocArray => &Kind::Array(Type(Inner::Regproc)), Inner::TextArray => &Kind::Array(Type(Inner::Text)), Inner::TidArray => &Kind::Array(Type(Inner::Tid)), Inner::XidArray => &Kind::Array(Type(Inner::Xid)), Inner::CidArray => &Kind::Array(Type(Inner::Cid)), Inner::OidVectorArray => &Kind::Array(Type(Inner::OidVector)), Inner::BpcharArray => &Kind::Array(Type(Inner::Bpchar)), Inner::VarcharArray => &Kind::Array(Type(Inner::Varchar)), Inner::Int8Array => &Kind::Array(Type(Inner::Int8)), Inner::PointArray => 
&Kind::Array(Type(Inner::Point)), Inner::LsegArray => &Kind::Array(Type(Inner::Lseg)), Inner::PathArray => &Kind::Array(Type(Inner::Path)), Inner::BoxArray => &Kind::Array(Type(Inner::Box)), Inner::Float4Array => &Kind::Array(Type(Inner::Float4)), Inner::Float8Array => &Kind::Array(Type(Inner::Float8)), Inner::PolygonArray => &Kind::Array(Type(Inner::Polygon)), Inner::OidArray => &Kind::Array(Type(Inner::Oid)), Inner::Aclitem => &Kind::Simple, Inner::AclitemArray => &Kind::Array(Type(Inner::Aclitem)), Inner::MacaddrArray => &Kind::Array(Type(Inner::Macaddr)), Inner::InetArray => &Kind::Array(Type(Inner::Inet)), Inner::Bpchar => &Kind::Simple, Inner::Varchar => &Kind::Simple, Inner::Date => &Kind::Simple, Inner::Time => &Kind::Simple, Inner::Timestamp => &Kind::Simple, Inner::TimestampArray => &Kind::Array(Type(Inner::Timestamp)), Inner::DateArray => &Kind::Array(Type(Inner::Date)), Inner::TimeArray => &Kind::Array(Type(Inner::Time)), Inner::Timestamptz => &Kind::Simple, Inner::TimestamptzArray => &Kind::Array(Type(Inner::Timestamptz)), Inner::Interval => &Kind::Simple, Inner::IntervalArray => &Kind::Array(Type(Inner::Interval)), Inner::NumericArray => &Kind::Array(Type(Inner::Numeric)), Inner::CstringArray => &Kind::Array(Type(Inner::Cstring)), Inner::Timetz => &Kind::Simple, Inner::TimetzArray => &Kind::Array(Type(Inner::Timetz)), Inner::Bit => &Kind::Simple, Inner::BitArray => &Kind::Array(Type(Inner::Bit)), Inner::Varbit => &Kind::Simple, Inner::VarbitArray => &Kind::Array(Type(Inner::Varbit)), Inner::Numeric => &Kind::Simple, Inner::Refcursor => &Kind::Simple, Inner::RefcursorArray => &Kind::Array(Type(Inner::Refcursor)), Inner::Regprocedure => &Kind::Simple, Inner::Regoper => &Kind::Simple, Inner::Regoperator => &Kind::Simple, Inner::Regclass => &Kind::Simple, Inner::Regtype => &Kind::Simple, Inner::RegprocedureArray => &Kind::Array(Type(Inner::Regprocedure)), Inner::RegoperArray => &Kind::Array(Type(Inner::Regoper)), Inner::RegoperatorArray => 
&Kind::Array(Type(Inner::Regoperator)), Inner::RegclassArray => &Kind::Array(Type(Inner::Regclass)), Inner::RegtypeArray => &Kind::Array(Type(Inner::Regtype)), Inner::Record => &Kind::Pseudo, Inner::Cstring => &Kind::Pseudo, Inner::Any => &Kind::Pseudo, Inner::Anyarray => &Kind::Pseudo, Inner::Void => &Kind::Pseudo, Inner::Trigger => &Kind::Pseudo, Inner::LanguageHandler => &Kind::Pseudo, Inner::Internal => &Kind::Pseudo, Inner::Anyelement => &Kind::Pseudo, Inner::RecordArray => &Kind::Pseudo, Inner::Anynonarray => &Kind::Pseudo, Inner::TxidSnapshotArray => &Kind::Array(Type(Inner::TxidSnapshot)), Inner::Uuid => &Kind::Simple, Inner::UuidArray => &Kind::Array(Type(Inner::Uuid)), Inner::TxidSnapshot => &Kind::Simple, Inner::FdwHandler => &Kind::Pseudo, Inner::PgLsn => &Kind::Simple, Inner::PgLsnArray => &Kind::Array(Type(Inner::PgLsn)), Inner::TsmHandler => &Kind::Pseudo, Inner::PgNdistinct => &Kind::Simple, Inner::PgDependencies => &Kind::Simple, Inner::Anyenum => &Kind::Pseudo, Inner::TsVector => &Kind::Simple, Inner::Tsquery => &Kind::Simple, Inner::GtsVector => &Kind::Simple, Inner::TsVectorArray => &Kind::Array(Type(Inner::TsVector)), Inner::GtsVectorArray => &Kind::Array(Type(Inner::GtsVector)), Inner::TsqueryArray => &Kind::Array(Type(Inner::Tsquery)), Inner::Regconfig => &Kind::Simple, Inner::RegconfigArray => &Kind::Array(Type(Inner::Regconfig)), Inner::Regdictionary => &Kind::Simple, Inner::RegdictionaryArray => &Kind::Array(Type(Inner::Regdictionary)), Inner::Jsonb => &Kind::Simple, Inner::JsonbArray => &Kind::Array(Type(Inner::Jsonb)), Inner::AnyRange => &Kind::Pseudo, Inner::EventTrigger => &Kind::Pseudo, Inner::Int4Range => &const { Kind::Range(Inner::Int4.const_oid()) }, Inner::Int4RangeArray => &Kind::Array(Type(Inner::Int4Range)), Inner::NumRange => &const { Kind::Range(Inner::Numeric.const_oid()) }, Inner::NumRangeArray => &Kind::Array(Type(Inner::NumRange)), Inner::TsRange => &const { Kind::Range(Inner::Timestamp.const_oid()) }, 
Inner::TsRangeArray => &Kind::Array(Type(Inner::TsRange)), Inner::TstzRange => &const { Kind::Range(Inner::Timestamptz.const_oid()) }, Inner::TstzRangeArray => &Kind::Array(Type(Inner::TstzRange)), Inner::DateRange => &const { Kind::Range(Inner::Date.const_oid()) }, Inner::DateRangeArray => &Kind::Array(Type(Inner::DateRange)), Inner::Int8Range => &const { Kind::Range(Inner::Int8.const_oid()) }, Inner::Int8RangeArray => &Kind::Array(Type(Inner::Int8Range)), Inner::Jsonpath => &Kind::Simple, Inner::JsonpathArray => &Kind::Array(Type(Inner::Jsonpath)), Inner::Regnamespace => &Kind::Simple, Inner::RegnamespaceArray => &Kind::Array(Type(Inner::Regnamespace)), Inner::Regrole => &Kind::Simple, Inner::RegroleArray => &Kind::Array(Type(Inner::Regrole)), Inner::Regcollation => &Kind::Simple, Inner::RegcollationArray => &Kind::Array(Type(Inner::Regcollation)), Inner::Int4multiRange => &Kind::Multirange(Type(Inner::Int4)), Inner::NummultiRange => &Kind::Multirange(Type(Inner::Numeric)), Inner::TsmultiRange => &Kind::Multirange(Type(Inner::Timestamp)), Inner::TstzmultiRange => &Kind::Multirange(Type(Inner::Timestamptz)), Inner::DatemultiRange => &Kind::Multirange(Type(Inner::Date)), Inner::Int8multiRange => &Kind::Multirange(Type(Inner::Int8)), Inner::AnymultiRange => &Kind::Pseudo, Inner::AnycompatiblemultiRange => &Kind::Pseudo, Inner::PgBrinBloomSummary => &Kind::Simple, Inner::PgBrinMinmaxMultiSummary => &Kind::Simple, Inner::PgMcvList => &Kind::Simple, Inner::PgSnapshot => &Kind::Simple, Inner::PgSnapshotArray => &Kind::Array(Type(Inner::PgSnapshot)), Inner::Xid8 => &Kind::Simple, Inner::Anycompatible => &Kind::Pseudo, Inner::Anycompatiblearray => &Kind::Pseudo, Inner::Anycompatiblenonarray => &Kind::Pseudo, Inner::AnycompatibleRange => &Kind::Pseudo, Inner::Int4multiRangeArray => &Kind::Array(Type(Inner::Int4multiRange)), Inner::NummultiRangeArray => &Kind::Array(Type(Inner::NummultiRange)), Inner::TsmultiRangeArray => &Kind::Array(Type(Inner::TsmultiRange)), 
Inner::TstzmultiRangeArray => &Kind::Array(Type(Inner::TstzmultiRange)),
            Inner::DatemultiRangeArray => &Kind::Array(Type(Inner::DatemultiRange)),
            Inner::Int8multiRangeArray => &Kind::Array(Type(Inner::Int8multiRange)),
            Inner::Other(ref u) => &u.kind,
        }
    }

    /// Returns the name of this type as a string slice.
    ///
    /// Built-in variants map to a fixed lowercase string literal; the
    /// `...Array` variants listed here use the element type's name prefixed
    /// with `_` (for example `_xml`, `_json`). An `Other` value returns the
    /// `name` stored in its payload.
    pub fn name(&self) -> &str {
        match *self {
            Inner::Bool => "bool",
            Inner::Bytea => "bytea",
            Inner::Char => "char",
            Inner::Name => "name",
            Inner::Int8 => "int8",
            Inner::Int2 => "int2",
            Inner::Int2Vector => "int2vector",
            Inner::Int4 => "int4",
            Inner::Regproc => "regproc",
            Inner::Text => "text",
            Inner::Oid => "oid",
            Inner::Tid => "tid",
            Inner::Xid => "xid",
            Inner::Cid => "cid",
            Inner::OidVector => "oidvector",
            Inner::PgDdlCommand => "pg_ddl_command",
            Inner::Json => "json",
            Inner::Xml => "xml",
            Inner::XmlArray => "_xml",
            Inner::PgNodeTree => "pg_node_tree",
            Inner::JsonArray => "_json",
            Inner::TableAmHandler => "table_am_handler",
            Inner::Xid8Array => "_xid8",
            Inner::IndexAmHandler => "index_am_handler",
            Inner::Point => "point",
            Inner::Lseg => "lseg",
            Inner::Path => "path",
            Inner::Box => "box",
            Inner::Polygon => "polygon",
            Inner::Line => "line",
            Inner::LineArray => "_line",
            Inner::Cidr => "cidr",
            Inner::CidrArray => "_cidr",
            Inner::Float4 => "float4",
            Inner::Float8 => "float8",
            Inner::Unknown => "unknown",
            Inner::Circle => "circle",
            Inner::CircleArray => "_circle",
            Inner::Macaddr8 => "macaddr8",
            Inner::Macaddr8Array => "_macaddr8",
            Inner::Money => "money",
            Inner::MoneyArray => "_money",
            Inner::Macaddr => "macaddr",
            Inner::Inet => "inet",
            Inner::BoolArray => "_bool",
            Inner::ByteaArray => "_bytea",
            Inner::CharArray => "_char",
            Inner::NameArray => "_name",
            Inner::Int2Array => "_int2",
            Inner::Int2VectorArray => "_int2vector",
            Inner::Int4Array => "_int4",
            Inner::RegprocArray => "_regproc",
            Inner::TextArray => "_text",
            Inner::TidArray => "_tid",
            Inner::XidArray => "_xid",
            Inner::CidArray => "_cid",
            Inner::OidVectorArray => "_oidvector",
            Inner::BpcharArray => "_bpchar",
            Inner::VarcharArray => "_varchar",
            Inner::Int8Array =>
"_int8", Inner::PointArray => "_point", Inner::LsegArray => "_lseg", Inner::PathArray => "_path", Inner::BoxArray => "_box", Inner::Float4Array => "_float4", Inner::Float8Array => "_float8", Inner::PolygonArray => "_polygon", Inner::OidArray => "_oid", Inner::Aclitem => "aclitem", Inner::AclitemArray => "_aclitem", Inner::MacaddrArray => "_macaddr", Inner::InetArray => "_inet", Inner::Bpchar => "bpchar", Inner::Varchar => "varchar", Inner::Date => "date", Inner::Time => "time", Inner::Timestamp => "timestamp", Inner::TimestampArray => "_timestamp", Inner::DateArray => "_date", Inner::TimeArray => "_time", Inner::Timestamptz => "timestamptz", Inner::TimestamptzArray => "_timestamptz", Inner::Interval => "interval", Inner::IntervalArray => "_interval", Inner::NumericArray => "_numeric", Inner::CstringArray => "_cstring", Inner::Timetz => "timetz", Inner::TimetzArray => "_timetz", Inner::Bit => "bit", Inner::BitArray => "_bit", Inner::Varbit => "varbit", Inner::VarbitArray => "_varbit", Inner::Numeric => "numeric", Inner::Refcursor => "refcursor", Inner::RefcursorArray => "_refcursor", Inner::Regprocedure => "regprocedure", Inner::Regoper => "regoper", Inner::Regoperator => "regoperator", Inner::Regclass => "regclass", Inner::Regtype => "regtype", Inner::RegprocedureArray => "_regprocedure", Inner::RegoperArray => "_regoper", Inner::RegoperatorArray => "_regoperator", Inner::RegclassArray => "_regclass", Inner::RegtypeArray => "_regtype", Inner::Record => "record", Inner::Cstring => "cstring", Inner::Any => "any", Inner::Anyarray => "anyarray", Inner::Void => "void", Inner::Trigger => "trigger", Inner::LanguageHandler => "language_handler", Inner::Internal => "internal", Inner::Anyelement => "anyelement", Inner::RecordArray => "_record", Inner::Anynonarray => "anynonarray", Inner::TxidSnapshotArray => "_txid_snapshot", Inner::Uuid => "uuid", Inner::UuidArray => "_uuid", Inner::TxidSnapshot => "txid_snapshot", Inner::FdwHandler => "fdw_handler", Inner::PgLsn => 
"pg_lsn", Inner::PgLsnArray => "_pg_lsn", Inner::TsmHandler => "tsm_handler", Inner::PgNdistinct => "pg_ndistinct", Inner::PgDependencies => "pg_dependencies", Inner::Anyenum => "anyenum", Inner::TsVector => "tsvector", Inner::Tsquery => "tsquery", Inner::GtsVector => "gtsvector", Inner::TsVectorArray => "_tsvector", Inner::GtsVectorArray => "_gtsvector", Inner::TsqueryArray => "_tsquery", Inner::Regconfig => "regconfig", Inner::RegconfigArray => "_regconfig", Inner::Regdictionary => "regdictionary", Inner::RegdictionaryArray => "_regdictionary", Inner::Jsonb => "jsonb", Inner::JsonbArray => "_jsonb", Inner::AnyRange => "anyrange", Inner::EventTrigger => "event_trigger", Inner::Int4Range => "int4range", Inner::Int4RangeArray => "_int4range", Inner::NumRange => "numrange", Inner::NumRangeArray => "_numrange", Inner::TsRange => "tsrange", Inner::TsRangeArray => "_tsrange", Inner::TstzRange => "tstzrange", Inner::TstzRangeArray => "_tstzrange", Inner::DateRange => "daterange", Inner::DateRangeArray => "_daterange", Inner::Int8Range => "int8range", Inner::Int8RangeArray => "_int8range", Inner::Jsonpath => "jsonpath", Inner::JsonpathArray => "_jsonpath", Inner::Regnamespace => "regnamespace", Inner::RegnamespaceArray => "_regnamespace", Inner::Regrole => "regrole", Inner::RegroleArray => "_regrole", Inner::Regcollation => "regcollation", Inner::RegcollationArray => "_regcollation", Inner::Int4multiRange => "int4multirange", Inner::NummultiRange => "nummultirange", Inner::TsmultiRange => "tsmultirange", Inner::TstzmultiRange => "tstzmultirange", Inner::DatemultiRange => "datemultirange", Inner::Int8multiRange => "int8multirange", Inner::AnymultiRange => "anymultirange", Inner::AnycompatiblemultiRange => "anycompatiblemultirange", Inner::PgBrinBloomSummary => "pg_brin_bloom_summary", Inner::PgBrinMinmaxMultiSummary => "pg_brin_minmax_multi_summary", Inner::PgMcvList => "pg_mcv_list", Inner::PgSnapshot => "pg_snapshot", Inner::PgSnapshotArray => "_pg_snapshot", Inner::Xid8 
=> "xid8",
            Inner::Anycompatible => "anycompatible",
            Inner::Anycompatiblearray => "anycompatiblearray",
            Inner::Anycompatiblenonarray => "anycompatiblenonarray",
            Inner::AnycompatibleRange => "anycompatiblerange",
            Inner::Int4multiRangeArray => "_int4multirange",
            Inner::NummultiRangeArray => "_nummultirange",
            Inner::TsmultiRangeArray => "_tsmultirange",
            Inner::TstzmultiRangeArray => "_tstzmultirange",
            Inner::DatemultiRangeArray => "_datemultirange",
            Inner::Int8multiRangeArray => "_int8multirange",
            Inner::Other(ref u) => &u.name,
        }
    }
}

// Associated constants exposing each built-in `Inner` variant as a ready-made
// `Type` value. Every constant simply wraps the matching variant; the doc
// comment on each one gives the PostgreSQL type name and a short description.
impl Type {
    /// BOOL - boolean, 'true'/'false'
    pub const BOOL: Type = Type(Inner::Bool);

    /// BYTEA - variable-length string, binary values escaped
    pub const BYTEA: Type = Type(Inner::Bytea);

    /// CHAR - single character
    pub const CHAR: Type = Type(Inner::Char);

    /// NAME - 63-byte type for storing system identifiers
    pub const NAME: Type = Type(Inner::Name);

    /// INT8 - ~18 digit integer, 8-byte storage
    pub const INT8: Type = Type(Inner::Int8);

    /// INT2 - -32 thousand to 32 thousand, 2-byte storage
    pub const INT2: Type = Type(Inner::Int2);

    /// INT2VECTOR - array of int2, used in system tables
    pub const INT2_VECTOR: Type = Type(Inner::Int2Vector);

    /// INT4 - -2 billion to 2 billion integer, 4-byte storage
    pub const INT4: Type = Type(Inner::Int4);

    /// REGPROC - registered procedure
    pub const REGPROC: Type = Type(Inner::Regproc);

    /// TEXT - variable-length string, no limit specified
    pub const TEXT: Type = Type(Inner::Text);

    /// OID - object identifier(oid), maximum 4 billion
    pub const OID: Type = Type(Inner::Oid);

    /// TID - (block, offset), physical location of tuple
    pub const TID: Type = Type(Inner::Tid);

    /// XID - transaction id
    pub const XID: Type = Type(Inner::Xid);

    /// CID - command identifier type, sequence in transaction id
    pub const CID: Type = Type(Inner::Cid);

    /// OIDVECTOR - array of oids, used in system tables
    pub const OID_VECTOR: Type = Type(Inner::OidVector);

    /// PG_DDL_COMMAND - internal type for passing CollectedCommand
    pub
const PG_DDL_COMMAND: Type = Type(Inner::PgDdlCommand); /// JSON - JSON stored as text pub const JSON: Type = Type(Inner::Json); /// XML - XML content pub const XML: Type = Type(Inner::Xml); /// XML[] pub const XML_ARRAY: Type = Type(Inner::XmlArray); /// PG_NODE_TREE - string representing an internal node tree pub const PG_NODE_TREE: Type = Type(Inner::PgNodeTree); /// JSON[] pub const JSON_ARRAY: Type = Type(Inner::JsonArray); /// TABLE_AM_HANDLER pub const TABLE_AM_HANDLER: Type = Type(Inner::TableAmHandler); /// XID8[] pub const XID8_ARRAY: Type = Type(Inner::Xid8Array); /// INDEX_AM_HANDLER - pseudo-type for the result of an index AM handler function pub const INDEX_AM_HANDLER: Type = Type(Inner::IndexAmHandler); /// POINT - geometric point '(x, y)' pub const POINT: Type = Type(Inner::Point); /// LSEG - geometric line segment '(pt1,pt2)' pub const LSEG: Type = Type(Inner::Lseg); /// PATH - geometric path '(pt1,...)' pub const PATH: Type = Type(Inner::Path); /// BOX - geometric box '(lower left,upper right)' pub const BOX: Type = Type(Inner::Box); /// POLYGON - geometric polygon '(pt1,...)' pub const POLYGON: Type = Type(Inner::Polygon); /// LINE - geometric line pub const LINE: Type = Type(Inner::Line); /// LINE[] pub const LINE_ARRAY: Type = Type(Inner::LineArray); /// CIDR - network IP address/netmask, network address pub const CIDR: Type = Type(Inner::Cidr); /// CIDR[] pub const CIDR_ARRAY: Type = Type(Inner::CidrArray); /// FLOAT4 - single-precision floating point number, 4-byte storage pub const FLOAT4: Type = Type(Inner::Float4); /// FLOAT8 - double-precision floating point number, 8-byte storage pub const FLOAT8: Type = Type(Inner::Float8); /// UNKNOWN - pseudo-type representing an undetermined type pub const UNKNOWN: Type = Type(Inner::Unknown); /// CIRCLE - geometric circle '(center,radius)' pub const CIRCLE: Type = Type(Inner::Circle); /// CIRCLE[] pub const CIRCLE_ARRAY: Type = Type(Inner::CircleArray); /// MACADDR8 - XX:XX:XX:XX:XX:XX:XX:XX, MAC 
address pub const MACADDR8: Type = Type(Inner::Macaddr8); /// MACADDR8[] pub const MACADDR8_ARRAY: Type = Type(Inner::Macaddr8Array); /// MONEY - monetary amounts, $d,ddd.cc pub const MONEY: Type = Type(Inner::Money); /// MONEY[] pub const MONEY_ARRAY: Type = Type(Inner::MoneyArray); /// MACADDR - XX:XX:XX:XX:XX:XX, MAC address pub const MACADDR: Type = Type(Inner::Macaddr); /// INET - IP address/netmask, host address, netmask optional pub const INET: Type = Type(Inner::Inet); /// BOOL[] pub const BOOL_ARRAY: Type = Type(Inner::BoolArray); /// BYTEA[] pub const BYTEA_ARRAY: Type = Type(Inner::ByteaArray); /// CHAR[] pub const CHAR_ARRAY: Type = Type(Inner::CharArray); /// NAME[] pub const NAME_ARRAY: Type = Type(Inner::NameArray); /// INT2[] pub const INT2_ARRAY: Type = Type(Inner::Int2Array); /// INT2VECTOR[] pub const INT2_VECTOR_ARRAY: Type = Type(Inner::Int2VectorArray); /// INT4[] pub const INT4_ARRAY: Type = Type(Inner::Int4Array); /// REGPROC[] pub const REGPROC_ARRAY: Type = Type(Inner::RegprocArray); /// TEXT[] pub const TEXT_ARRAY: Type = Type(Inner::TextArray); /// TID[] pub const TID_ARRAY: Type = Type(Inner::TidArray); /// XID[] pub const XID_ARRAY: Type = Type(Inner::XidArray); /// CID[] pub const CID_ARRAY: Type = Type(Inner::CidArray); /// OIDVECTOR[] pub const OID_VECTOR_ARRAY: Type = Type(Inner::OidVectorArray); /// BPCHAR[] pub const BPCHAR_ARRAY: Type = Type(Inner::BpcharArray); /// VARCHAR[] pub const VARCHAR_ARRAY: Type = Type(Inner::VarcharArray); /// INT8[] pub const INT8_ARRAY: Type = Type(Inner::Int8Array); /// POINT[] pub const POINT_ARRAY: Type = Type(Inner::PointArray); /// LSEG[] pub const LSEG_ARRAY: Type = Type(Inner::LsegArray); /// PATH[] pub const PATH_ARRAY: Type = Type(Inner::PathArray); /// BOX[] pub const BOX_ARRAY: Type = Type(Inner::BoxArray); /// FLOAT4[] pub const FLOAT4_ARRAY: Type = Type(Inner::Float4Array); /// FLOAT8[] pub const FLOAT8_ARRAY: Type = Type(Inner::Float8Array); /// POLYGON[] pub const POLYGON_ARRAY: Type 
= Type(Inner::PolygonArray); /// OID[] pub const OID_ARRAY: Type = Type(Inner::OidArray); /// ACLITEM - access control list pub const ACLITEM: Type = Type(Inner::Aclitem); /// ACLITEM[] pub const ACLITEM_ARRAY: Type = Type(Inner::AclitemArray); /// MACADDR[] pub const MACADDR_ARRAY: Type = Type(Inner::MacaddrArray); /// INET[] pub const INET_ARRAY: Type = Type(Inner::InetArray); /// BPCHAR - char(length), blank-padded string, fixed storage length pub const BPCHAR: Type = Type(Inner::Bpchar); /// VARCHAR - varchar(length), non-blank-padded string, variable storage length pub const VARCHAR: Type = Type(Inner::Varchar); /// DATE - date pub const DATE: Type = Type(Inner::Date); /// TIME - time of day pub const TIME: Type = Type(Inner::Time); /// TIMESTAMP - date and time pub const TIMESTAMP: Type = Type(Inner::Timestamp); /// TIMESTAMP[] pub const TIMESTAMP_ARRAY: Type = Type(Inner::TimestampArray); /// DATE[] pub const DATE_ARRAY: Type = Type(Inner::DateArray); /// TIME[] pub const TIME_ARRAY: Type = Type(Inner::TimeArray); /// TIMESTAMPTZ - date and time with time zone pub const TIMESTAMPTZ: Type = Type(Inner::Timestamptz); /// TIMESTAMPTZ[] pub const TIMESTAMPTZ_ARRAY: Type = Type(Inner::TimestamptzArray); /// INTERVAL - @ <number> <units>, time interval pub const INTERVAL: Type = Type(Inner::Interval); /// INTERVAL[] pub const INTERVAL_ARRAY: Type = Type(Inner::IntervalArray); /// NUMERIC[] pub const NUMERIC_ARRAY: Type = Type(Inner::NumericArray); /// CSTRING[] pub const CSTRING_ARRAY: Type = Type(Inner::CstringArray); /// TIMETZ - time of day with time zone pub const TIMETZ: Type = Type(Inner::Timetz); /// TIMETZ[] pub const TIMETZ_ARRAY: Type = Type(Inner::TimetzArray); /// BIT - fixed-length bit string pub const BIT: Type = Type(Inner::Bit); /// BIT[] pub const BIT_ARRAY: Type = Type(Inner::BitArray); /// VARBIT - variable-length bit string pub const VARBIT: Type = Type(Inner::Varbit); /// VARBIT[] pub const VARBIT_ARRAY: Type = Type(Inner::VarbitArray); /// 
NUMERIC - numeric(precision, decimal), arbitrary precision number pub const NUMERIC: Type = Type(Inner::Numeric); /// REFCURSOR - reference to cursor (portal name) pub const REFCURSOR: Type = Type(Inner::Refcursor); /// REFCURSOR[] pub const REFCURSOR_ARRAY: Type = Type(Inner::RefcursorArray); /// REGPROCEDURE - registered procedure (with args) pub const REGPROCEDURE: Type = Type(Inner::Regprocedure); /// REGOPER - registered operator pub const REGOPER: Type = Type(Inner::Regoper); /// REGOPERATOR - registered operator (with args) pub const REGOPERATOR: Type = Type(Inner::Regoperator); /// REGCLASS - registered class pub const REGCLASS: Type = Type(Inner::Regclass); /// REGTYPE - registered type pub const REGTYPE: Type = Type(Inner::Regtype); /// REGPROCEDURE[] pub const REGPROCEDURE_ARRAY: Type = Type(Inner::RegprocedureArray); /// REGOPER[] pub const REGOPER_ARRAY: Type = Type(Inner::RegoperArray); /// REGOPERATOR[] pub const REGOPERATOR_ARRAY: Type = Type(Inner::RegoperatorArray); /// REGCLASS[] pub const REGCLASS_ARRAY: Type = Type(Inner::RegclassArray); /// REGTYPE[] pub const REGTYPE_ARRAY: Type = Type(Inner::RegtypeArray); /// RECORD - pseudo-type representing any composite type pub const RECORD: Type = Type(Inner::Record); /// CSTRING - C-style string pub const CSTRING: Type = Type(Inner::Cstring); /// ANY - pseudo-type representing any type pub const ANY: Type = Type(Inner::Any); /// ANYARRAY - pseudo-type representing a polymorphic array type pub const ANYARRAY: Type = Type(Inner::Anyarray); /// VOID - pseudo-type for the result of a function with no real result pub const VOID: Type = Type(Inner::Void); /// TRIGGER - pseudo-type for the result of a trigger function pub const TRIGGER: Type = Type(Inner::Trigger); /// LANGUAGE_HANDLER - pseudo-type for the result of a language handler function pub const LANGUAGE_HANDLER: Type = Type(Inner::LanguageHandler); /// INTERNAL - pseudo-type representing an internal data structure pub const INTERNAL: Type = 
Type(Inner::Internal); /// ANYELEMENT - pseudo-type representing a polymorphic base type pub const ANYELEMENT: Type = Type(Inner::Anyelement); /// RECORD[] pub const RECORD_ARRAY: Type = Type(Inner::RecordArray); /// ANYNONARRAY - pseudo-type representing a polymorphic base type that is not an array pub const ANYNONARRAY: Type = Type(Inner::Anynonarray); /// TXID_SNAPSHOT[] pub const TXID_SNAPSHOT_ARRAY: Type = Type(Inner::TxidSnapshotArray); /// UUID - UUID datatype pub const UUID: Type = Type(Inner::Uuid); /// UUID[] pub const UUID_ARRAY: Type = Type(Inner::UuidArray); /// TXID_SNAPSHOT - txid snapshot pub const TXID_SNAPSHOT: Type = Type(Inner::TxidSnapshot); /// FDW_HANDLER - pseudo-type for the result of an FDW handler function pub const FDW_HANDLER: Type = Type(Inner::FdwHandler); /// PG_LSN - PostgreSQL LSN datatype pub const PG_LSN: Type = Type(Inner::PgLsn); /// PG_LSN[] pub const PG_LSN_ARRAY: Type = Type(Inner::PgLsnArray); /// TSM_HANDLER - pseudo-type for the result of a tablesample method function pub const TSM_HANDLER: Type = Type(Inner::TsmHandler); /// PG_NDISTINCT - multivariate ndistinct coefficients pub const PG_NDISTINCT: Type = Type(Inner::PgNdistinct); /// PG_DEPENDENCIES - multivariate dependencies pub const PG_DEPENDENCIES: Type = Type(Inner::PgDependencies); /// ANYENUM - pseudo-type representing a polymorphic base type that is an enum pub const ANYENUM: Type = Type(Inner::Anyenum); /// TSVECTOR - text representation for text search pub const TS_VECTOR: Type = Type(Inner::TsVector); /// TSQUERY - query representation for text search pub const TSQUERY: Type = Type(Inner::Tsquery); /// GTSVECTOR - GiST index internal text representation for text search pub const GTS_VECTOR: Type = Type(Inner::GtsVector); /// TSVECTOR[] pub const TS_VECTOR_ARRAY: Type = Type(Inner::TsVectorArray); /// GTSVECTOR[] pub const GTS_VECTOR_ARRAY: Type = Type(Inner::GtsVectorArray); /// TSQUERY[] pub const TSQUERY_ARRAY: Type = Type(Inner::TsqueryArray); /// 
REGCONFIG - registered text search configuration pub const REGCONFIG: Type = Type(Inner::Regconfig); /// REGCONFIG[] pub const REGCONFIG_ARRAY: Type = Type(Inner::RegconfigArray); /// REGDICTIONARY - registered text search dictionary pub const REGDICTIONARY: Type = Type(Inner::Regdictionary); /// REGDICTIONARY[] pub const REGDICTIONARY_ARRAY: Type = Type(Inner::RegdictionaryArray); /// JSONB - Binary JSON pub const JSONB: Type = Type(Inner::Jsonb); /// JSONB[] pub const JSONB_ARRAY: Type = Type(Inner::JsonbArray); /// ANYRANGE - pseudo-type representing a range over a polymorphic base type pub const ANY_RANGE: Type = Type(Inner::AnyRange); /// EVENT_TRIGGER - pseudo-type for the result of an event trigger function pub const EVENT_TRIGGER: Type = Type(Inner::EventTrigger); /// INT4RANGE - range of integers pub const INT4_RANGE: Type = Type(Inner::Int4Range); /// INT4RANGE[] pub const INT4_RANGE_ARRAY: Type = Type(Inner::Int4RangeArray); /// NUMRANGE - range of numerics pub const NUM_RANGE: Type = Type(Inner::NumRange); /// NUMRANGE[] pub const NUM_RANGE_ARRAY: Type = Type(Inner::NumRangeArray); /// TSRANGE - range of timestamps without time zone pub const TS_RANGE: Type = Type(Inner::TsRange); /// TSRANGE[] pub const TS_RANGE_ARRAY: Type = Type(Inner::TsRangeArray); /// TSTZRANGE - range of timestamps with time zone pub const TSTZ_RANGE: Type = Type(Inner::TstzRange); /// TSTZRANGE[] pub const TSTZ_RANGE_ARRAY: Type = Type(Inner::TstzRangeArray); /// DATERANGE - range of dates pub const DATE_RANGE: Type = Type(Inner::DateRange); /// DATERANGE[] pub const DATE_RANGE_ARRAY: Type = Type(Inner::DateRangeArray); /// INT8RANGE - range of bigints pub const INT8_RANGE: Type = Type(Inner::Int8Range); /// INT8RANGE[] pub const INT8_RANGE_ARRAY: Type = Type(Inner::Int8RangeArray); /// JSONPATH - JSON path pub const JSONPATH: Type = Type(Inner::Jsonpath); /// JSONPATH[] pub const JSONPATH_ARRAY: Type = Type(Inner::JsonpathArray); /// REGNAMESPACE - registered namespace pub 
const REGNAMESPACE: Type = Type(Inner::Regnamespace); /// REGNAMESPACE[] pub const REGNAMESPACE_ARRAY: Type = Type(Inner::RegnamespaceArray); /// REGROLE - registered role pub const REGROLE: Type = Type(Inner::Regrole); /// REGROLE[] pub const REGROLE_ARRAY: Type = Type(Inner::RegroleArray); /// REGCOLLATION - registered collation pub const REGCOLLATION: Type = Type(Inner::Regcollation); /// REGCOLLATION[] pub const REGCOLLATION_ARRAY: Type = Type(Inner::RegcollationArray); /// INT4MULTIRANGE - multirange of integers pub const INT4MULTI_RANGE: Type = Type(Inner::Int4multiRange); /// NUMMULTIRANGE - multirange of numerics pub const NUMMULTI_RANGE: Type = Type(Inner::NummultiRange); /// TSMULTIRANGE - multirange of timestamps without time zone pub const TSMULTI_RANGE: Type = Type(Inner::TsmultiRange); /// TSTZMULTIRANGE - multirange of timestamps with time zone pub const TSTZMULTI_RANGE: Type = Type(Inner::TstzmultiRange); /// DATEMULTIRANGE - multirange of dates pub const DATEMULTI_RANGE: Type = Type(Inner::DatemultiRange); /// INT8MULTIRANGE - multirange of bigints pub const INT8MULTI_RANGE: Type = Type(Inner::Int8multiRange); /// ANYMULTIRANGE - pseudo-type representing a polymorphic base type that is a multirange pub const ANYMULTI_RANGE: Type = Type(Inner::AnymultiRange); /// ANYCOMPATIBLEMULTIRANGE - pseudo-type representing a multirange over a polymorphic common type pub const ANYCOMPATIBLEMULTI_RANGE: Type = Type(Inner::AnycompatiblemultiRange); /// PG_BRIN_BLOOM_SUMMARY - BRIN bloom summary pub const PG_BRIN_BLOOM_SUMMARY: Type = Type(Inner::PgBrinBloomSummary); /// PG_BRIN_MINMAX_MULTI_SUMMARY - BRIN minmax-multi summary pub const PG_BRIN_MINMAX_MULTI_SUMMARY: Type = Type(Inner::PgBrinMinmaxMultiSummary); /// PG_MCV_LIST - multivariate MCV list pub const PG_MCV_LIST: Type = Type(Inner::PgMcvList); /// PG_SNAPSHOT - snapshot pub const PG_SNAPSHOT: Type = Type(Inner::PgSnapshot); /// PG_SNAPSHOT[] pub const PG_SNAPSHOT_ARRAY: Type = 
Type(Inner::PgSnapshotArray); /// XID8 - full transaction id pub const XID8: Type = Type(Inner::Xid8); /// ANYCOMPATIBLE - pseudo-type representing a polymorphic common type pub const ANYCOMPATIBLE: Type = Type(Inner::Anycompatible); /// ANYCOMPATIBLEARRAY - pseudo-type representing an array of polymorphic common type elements pub const ANYCOMPATIBLEARRAY: Type = Type(Inner::Anycompatiblearray); /// ANYCOMPATIBLENONARRAY - pseudo-type representing a polymorphic common type that is not an array pub const ANYCOMPATIBLENONARRAY: Type = Type(Inner::Anycompatiblenonarray); /// ANYCOMPATIBLERANGE - pseudo-type representing a range over a polymorphic common type pub const ANYCOMPATIBLE_RANGE: Type = Type(Inner::AnycompatibleRange); /// INT4MULTIRANGE[] pub const INT4MULTI_RANGE_ARRAY: Type = Type(Inner::Int4multiRangeArray); /// NUMMULTIRANGE[] pub const NUMMULTI_RANGE_ARRAY: Type = Type(Inner::NummultiRangeArray); /// TSMULTIRANGE[] pub const TSMULTI_RANGE_ARRAY: Type = Type(Inner::TsmultiRangeArray); /// TSTZMULTIRANGE[] pub const TSTZMULTI_RANGE_ARRAY: Type = Type(Inner::TstzmultiRangeArray); /// DATEMULTIRANGE[] pub const DATEMULTI_RANGE_ARRAY: Type = Type(Inner::DatemultiRangeArray); /// INT8MULTIRANGE[] pub const INT8MULTI_RANGE_ARRAY: Type = Type(Inner::Int8multiRangeArray); } ================================================ FILE: libs/proxy/subzero_core/.gitignore ================================================ target Cargo.lock ================================================ FILE: libs/proxy/subzero_core/Cargo.toml ================================================ # This is a stub for the subzero-core crate. [package] name = "subzero-core" version = "3.0.1" edition = "2024" publish = false # "private"! [features] default = [] postgresql = [] [dependencies] ================================================ FILE: libs/proxy/subzero_core/src/lib.rs ================================================ // This is a stub for the subzero-core crate. 
================================================ FILE: libs/proxy/tokio-postgres2/Cargo.toml ================================================ [package] name = "tokio-postgres2" version = "0.1.0" edition = "2024" license = "MIT/Apache-2.0" [dependencies] bytes.workspace = true fallible-iterator.workspace = true futures-util = { workspace = true, features = ["sink"] } tracing.workspace = true parking_lot.workspace = true pin-project-lite.workspace = true postgres-protocol2 = { path = "../postgres-protocol2" } postgres-types2 = { path = "../postgres-types2" } tokio = { workspace = true, features = ["io-util", "time", "net"] } tokio-util = { workspace = true, features = ["codec"] } serde = { workspace = true, features = ["derive"] } ================================================ FILE: libs/proxy/tokio-postgres2/src/cancel_query.rs ================================================ use tokio::net::TcpStream; use crate::client::SocketConfig; use crate::config::{Host, SslMode}; use crate::tls::MakeTlsConnect; use crate::{Error, cancel_query_raw, connect_socket}; pub(crate) async fn cancel_query( config: SocketConfig, ssl_mode: SslMode, tls: T, process_id: i32, secret_key: i32, ) -> Result<(), Error> where T: MakeTlsConnect, { let hostname = match &config.host { Host::Tcp(host) => &**host, }; let tls = tls .make_tls_connect(hostname) .map_err(|e| Error::tls(e.into()))?; let socket = connect_socket::connect_socket( config.host_addr, &config.host, config.port, config.connect_timeout, ) .await?; cancel_query_raw::cancel_query_raw(socket, ssl_mode, tls, process_id, secret_key).await } ================================================ FILE: libs/proxy/tokio-postgres2/src/cancel_query_raw.rs ================================================ use bytes::BytesMut; use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use crate::config::SslMode; use crate::tls::TlsConnect; use crate::{Error, connect_tls}; pub async fn cancel_query_raw( 
stream: S, mode: SslMode, tls: T, process_id: i32, secret_key: i32, ) -> Result<(), Error> where S: AsyncRead + AsyncWrite + Unpin, T: TlsConnect, { let mut stream = connect_tls::connect_tls(stream, mode, tls).await?; let mut buf = BytesMut::new(); frontend::cancel_request(process_id, secret_key, &mut buf); stream.write_all(&buf).await.map_err(Error::io)?; stream.flush().await.map_err(Error::io)?; stream.shutdown().await.map_err(Error::io)?; Ok(()) } ================================================ FILE: libs/proxy/tokio-postgres2/src/cancel_token.rs ================================================ use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpStream; use crate::client::SocketConfig; use crate::config::SslMode; use crate::tls::{MakeTlsConnect, TlsConnect}; use crate::{Error, cancel_query, cancel_query_raw}; /// A cancellation token that allows easy cancellation of a query. #[derive(Clone)] pub struct CancelToken { pub socket_config: SocketConfig, pub raw: RawCancelToken, } /// A raw cancellation token that allows cancellation of a query, given a fresh connection to postgres. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct RawCancelToken { pub ssl_mode: SslMode, pub process_id: i32, pub secret_key: i32, } impl CancelToken { /// Attempts to cancel the in-progress query on the connection associated /// with this `CancelToken`. /// /// The server provides no information about whether a cancellation attempt was successful or not. An error will /// only be returned if the client was unable to connect to the database. /// /// Cancellation is inherently racy. There is no guarantee that the /// cancellation request will reach the server before the query terminates /// normally, or that the connection associated with this token is still /// active. /// /// Requires the `runtime` Cargo feature (enabled by default). 
pub async fn cancel_query(&self, tls: T) -> Result<(), Error> where T: MakeTlsConnect, { cancel_query::cancel_query( self.socket_config.clone(), self.raw.ssl_mode, tls, self.raw.process_id, self.raw.secret_key, ) .await } } impl RawCancelToken { /// Like `cancel_query`, but uses a stream which is already connected to the server rather than opening a new /// connection itself. pub async fn cancel_query_raw(&self, stream: S, tls: T) -> Result<(), Error> where S: AsyncRead + AsyncWrite + Unpin, T: TlsConnect, { cancel_query_raw::cancel_query_raw( stream, self.ssl_mode, tls, self.process_id, self.secret_key, ) .await } } ================================================ FILE: libs/proxy/tokio-postgres2/src/client.rs ================================================ use std::collections::HashMap; use std::fmt; use std::net::IpAddr; use std::task::{Context, Poll}; use std::time::Duration; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use futures_util::{TryStreamExt, future, ready}; use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; use serde::{Deserialize, Serialize}; use tokio::sync::mpsc; use crate::cancel_token::RawCancelToken; use crate::codec::{BackendMessages, FrontendMessage, RecordNotices}; use crate::config::{Host, SslMode}; use crate::connection::gc_bytesmut; use crate::query::RowStream; use crate::simple_query::SimpleQueryStream; use crate::types::{Oid, Type}; use crate::{ CancelToken, Error, ReadyForQueryStatus, SimpleQueryMessage, Transaction, TransactionBuilder, query, simple_query, }; pub struct Responses { /// new messages from conn receiver: mpsc::Receiver, /// current batch of messages cur: BackendMessages, /// number of total queries sent. waiting: usize, /// number of ReadyForQuery messages received. received: usize, } impl Responses { pub fn poll_next(&mut self, cx: &mut Context<'_>) -> Poll> { loop { // get the next saved message if let Some(message) = self.cur.next().map_err(Error::parse)? 
{ let received = self.received; // increase the query head if this is the last message. if let Message::ReadyForQuery(_) = message { self.received += 1; } // check if the client has skipped this query. if received + 1 < self.waiting { // grab the next message. continue; } // convenience: turn the error message into a proper error. let res = match message { Message::ErrorResponse(body) => Err(Error::db(body)), message => Ok(message), }; return Poll::Ready(res); } // get the next batch of messages. match ready!(self.receiver.poll_recv(cx)) { Some(messages) => self.cur = messages, None => return Poll::Ready(Err(Error::closed())), } } } /// Awaits the next message by driving `poll_next` to completion. pub async fn next(&mut self) -> Result { future::poll_fn(|cx| self.poll_next(cx)).await } } /// A cache of type info and prepared statements for fetching type info /// (corresponding to the queries in the [crate::prepare] module). #[derive(Default)] pub(crate) struct CachedTypeInfo { /// Cache of types already looked up. pub(crate) types: HashMap, } pub struct InnerClient { sender: mpsc::UnboundedSender, responses: Responses, /// A buffer to use when writing out postgres commands. buffer: BytesMut, } impl InnerClient { /// Bumps the `waiting` counter for a new query and returns a `PartialQuery` guard borrowing this client. pub fn start(&mut self) -> Result, Error> { self.responses.waiting += 1; Ok(PartialQuery(Some(self))) } /// Bumps the `waiting` counter, encodes `query` with `frontend::query` and sends it as a raw message. pub fn send_simple_query(&mut self, query: &str) -> Result<&mut Responses, Error> { self.responses.waiting += 1; self.buffer.clear(); // simple queries do not need sync. 
frontend::query(query, &mut self.buffer).map_err(Error::encode)?; let buf = self.buffer.split(); self.send_message(FrontendMessage::Raw(buf)) } fn send_message(&mut self, messages: FrontendMessage) -> Result<&mut Responses, Error> { self.sender.send(messages).map_err(|_| Error::closed())?; Ok(&mut self.responses) } } pub struct PartialQuery<'a>(Option<&'a mut InnerClient>); impl Drop for PartialQuery<'_> { fn drop(&mut self) { if let Some(client) = self.0.take() { client.buffer.clear(); frontend::sync(&mut client.buffer); let buf = client.buffer.split(); let _ = client.send_message(FrontendMessage::Raw(buf)); } } } impl<'a> PartialQuery<'a> { pub fn send_with_flush(&mut self, f: F) -> Result<&mut Responses, Error> where F: FnOnce(&mut BytesMut) -> Result<(), Error>, { let client = self.0.as_deref_mut().unwrap(); client.buffer.clear(); f(&mut client.buffer)?; frontend::flush(&mut client.buffer); let buf = client.buffer.split(); client.send_message(FrontendMessage::Raw(buf)) } pub fn send_with_sync(mut self, f: F) -> Result<&'a mut Responses, Error> where F: FnOnce(&mut BytesMut) -> Result<(), Error>, { let client = self.0.as_deref_mut().unwrap(); client.buffer.clear(); f(&mut client.buffer)?; frontend::sync(&mut client.buffer); let buf = client.buffer.split(); let _ = client.send_message(FrontendMessage::Raw(buf)); Ok(&mut self.0.take().unwrap().responses) } } #[derive(Clone, Serialize, Deserialize)] pub struct SocketConfig { pub host_addr: Option, pub host: Host, pub port: u16, pub connect_timeout: Option, } /// An asynchronous PostgreSQL client. /// /// The client is one half of what is returned when a connection is established. Users interact with the database /// through this client object. 
pub struct Client { inner: InnerClient, cached_typeinfo: CachedTypeInfo, socket_config: SocketConfig, ssl_mode: SslMode, process_id: i32, secret_key: i32, } impl Client { pub(crate) fn new( sender: mpsc::UnboundedSender, receiver: mpsc::Receiver, socket_config: SocketConfig, ssl_mode: SslMode, process_id: i32, secret_key: i32, write_buf: BytesMut, ) -> Client { Client { inner: InnerClient { sender, responses: Responses { receiver, cur: BackendMessages::empty(), waiting: 0, received: 0, }, buffer: write_buf, }, cached_typeinfo: Default::default(), socket_config, ssl_mode, process_id, secret_key, } } /// Returns process_id. pub fn get_process_id(&self) -> i32 { self.process_id } pub(crate) fn inner_mut(&mut self) -> &mut InnerClient { &mut self.inner } pub fn record_notices(&mut self, limit: usize) -> mpsc::UnboundedReceiver> { let (tx, rx) = mpsc::unbounded_channel(); let notices = RecordNotices { sender: tx, limit }; self.inner .sender .send(FrontendMessage::RecordNotices(notices)) .ok(); rx } /// Pass text directly to the Postgres backend to allow it to sort out typing itself and /// to save a roundtrip pub async fn query_raw_txt( &mut self, statement: &str, params: I, ) -> Result, Error> where S: AsRef, I: IntoIterator>, I::IntoIter: ExactSizeIterator, { query::query_txt( &mut self.inner, &mut self.cached_typeinfo, statement, params, ) .await } /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows. /// /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that /// point. The simple query protocol returns the values in rows as strings rather than in their binary encodings, /// so the associated row type doesn't work with the `FromSql` trait. Rather than simply returning a list of the /// rows, this method returns a list of an enum which indicates either the completion of one of the commands, /// or a row of data. 
This preserves the framing between the separate statements in the request. /// /// # Warning /// /// Prepared statements should be used for any query which contains user-specified data, as they provide the /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass /// them to this method! pub async fn simple_query(&mut self, query: &str) -> Result, Error> { self.simple_query_raw(query).await?.try_collect().await } pub(crate) async fn simple_query_raw( &mut self, query: &str, ) -> Result, Error> { simple_query::simple_query(self.inner_mut(), query).await } /// Executes a sequence of SQL statements using the simple query protocol. /// /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that /// point. This is intended for use when, for example, initializing a database schema. /// /// # Warning /// /// Prepared statements should be used for any query which contains user-specified data, as they provide the /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass /// them to this method! pub async fn batch_execute(&mut self, query: &str) -> Result { simple_query::batch_execute(self.inner_mut(), query).await } /// Similar to `discard_all`, but it does not clear any query plans /// /// This runs in the background, so it can be executed without `await`ing. pub fn reset_session_background(&mut self) -> Result<(), Error> { // "ROLLBACK": aborts any transaction that was left open // "CLOSE ALL": closes any cursors // "SET SESSION AUTHORIZATION DEFAULT": resets the current_user back to the session_user // "RESET ALL": resets any GUCs back to their session defaults. 
// "DEALLOCATE ALL": deallocates any prepared statements // "UNLISTEN *": stops listening on all channels // "SELECT pg_advisory_unlock_all();": unlocks all advisory locks // "DISCARD TEMP;": drops all temporary tables // "DISCARD SEQUENCES;": deallocates all cached sequence state let _responses = self.inner_mut().send_simple_query( "ROLLBACK; CLOSE ALL; SET SESSION AUTHORIZATION DEFAULT; RESET ALL; DEALLOCATE ALL; UNLISTEN *; SELECT pg_advisory_unlock_all(); DISCARD TEMP; DISCARD SEQUENCES;", )?; // Clean up memory usage. gc_bytesmut(&mut self.inner_mut().buffer); Ok(()) } /// Begins a new database transaction. /// /// The transaction will roll back by default - use the `commit` method to commit it. pub async fn transaction(&mut self) -> Result, Error> { struct RollbackIfNotDone<'me> { client: &'me mut Client, done: bool, } impl Drop for RollbackIfNotDone<'_> { fn drop(&mut self) { if self.done { return; } let _ = self.client.inner.send_simple_query("ROLLBACK"); } } // This is done because the `Future` created by this method can be dropped after // `RequestMessages` is synchronously sent to the `Connection` by // `batch_execute()`, but before `Responses` is asynchronously polled to // completion. In that case `Transaction` won't be created and thus // won't be rolled back. { let mut cleaner = RollbackIfNotDone { client: self, done: false, }; cleaner.client.batch_execute("BEGIN").await?; cleaner.done = true; } Ok(Transaction::new(self)) } /// Returns a builder for a transaction with custom settings. /// /// Unlike the `transaction` method, the builder can be used to control the transaction's isolation level and other /// attributes. pub fn build_transaction(&mut self) -> TransactionBuilder<'_> { TransactionBuilder::new(self) } /// Constructs a cancellation token that can later be used to request cancellation of a query running on the /// connection associated with this client. 
pub fn cancel_token(&self) -> CancelToken { CancelToken { socket_config: self.socket_config.clone(), raw: RawCancelToken { ssl_mode: self.ssl_mode, process_id: self.process_id, secret_key: self.secret_key, }, } } /// Determines if the connection to the server has already closed. /// /// In that case, all future queries will fail. pub fn is_closed(&self) -> bool { self.inner.sender.is_closed() } } impl fmt::Debug for Client { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Client").finish() } } ================================================ FILE: libs/proxy/tokio-postgres2/src/codec.rs ================================================ use std::io; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use postgres_protocol2::message::backend; use tokio::sync::mpsc::UnboundedSender; use tokio_util::codec::{Decoder, Encoder}; pub enum FrontendMessage { Raw(BytesMut), RecordNotices(RecordNotices), } pub struct RecordNotices { pub sender: UnboundedSender>, pub limit: usize, } pub enum BackendMessage { Normal { messages: BackendMessages, ready: bool, }, Async(backend::Message), } pub struct BackendMessages(BytesMut); impl BackendMessages { pub fn empty() -> BackendMessages { BackendMessages(BytesMut::new()) } } impl FallibleIterator for BackendMessages { type Item = backend::Message; type Error = io::Error; fn next(&mut self) -> io::Result> { backend::Message::parse(&mut self.0) } } pub struct PostgresCodec; impl Encoder for PostgresCodec { type Error = io::Error; fn encode(&mut self, item: BytesMut, dst: &mut BytesMut) -> io::Result<()> { dst.unsplit(item); Ok(()) } } impl Decoder for PostgresCodec { type Item = BackendMessage; type Error = io::Error; fn decode(&mut self, src: &mut BytesMut) -> Result, io::Error> { let mut idx = 0; let mut ready = false; while let Some(header) = backend::Header::parse(&src[idx..])? 
{ let len = header.len() as usize + 1; if src[idx..].len() < len { break; } match header.tag() { backend::NOTICE_RESPONSE_TAG | backend::NOTIFICATION_RESPONSE_TAG | backend::PARAMETER_STATUS_TAG => { if idx == 0 { let message = backend::Message::parse(src)?.unwrap(); return Ok(Some(BackendMessage::Async(message))); } else { break; } } _ => {} } idx += len; if header.tag() == backend::READY_FOR_QUERY_TAG { ready = true; break; } } if idx == 0 { Ok(None) } else { Ok(Some(BackendMessage::Normal { messages: BackendMessages(src.split_to(idx)), ready, })) } } } ================================================ FILE: libs/proxy/tokio-postgres2/src/config.rs ================================================ //! Connection configuration. use std::net::IpAddr; use std::time::Duration; use std::{fmt, str}; pub use postgres_protocol2::authentication::sasl::ScramKeys; use postgres_protocol2::message::frontend::StartupMessageParams; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpStream; use crate::connect::connect; use crate::connect_raw::{self, StartupStream}; use crate::connect_tls::connect_tls; use crate::tls::{MakeTlsConnect, TlsConnect, TlsStream}; use crate::{Client, Connection, Error}; /// TLS configuration. #[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum SslMode { /// Do not use TLS. Disable, /// Attempt to connect with TLS but allow sessions without. Prefer, /// Require the use of TLS. Require, } /// Channel binding configuration. #[derive(Debug, Copy, Clone, PartialEq, Eq)] #[non_exhaustive] pub enum ChannelBinding { /// Do not use channel binding. Disable, /// Attempt to use channel binding but allow sessions without. Prefer, /// Require the use of channel binding. Require, } /// Replication mode configuration. #[derive(Debug, Copy, Clone, PartialEq, Eq)] #[non_exhaustive] pub enum ReplicationMode { /// Physical replication. Physical, /// Logical replication. 
Logical, } /// A host specification. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum Host { /// A TCP hostname. Tcp(String), } /// Precomputed keys which may override password during auth. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum AuthKeys { /// A `ClientKey` & `ServerKey` pair for `SCRAM-SHA-256`. ScramSha256(ScramKeys<32>), } /// Connection configuration. #[derive(Clone, PartialEq, Eq)] pub struct Config { pub(crate) host_addr: Option, pub(crate) host: Host, pub(crate) port: u16, pub(crate) password: Option>, pub(crate) auth_keys: Option>, pub(crate) ssl_mode: SslMode, pub(crate) connect_timeout: Option, pub(crate) channel_binding: ChannelBinding, pub(crate) server_params: StartupMessageParams, /// Set to `true` by `set_param` once the `database` parameter has been inserted. database: bool, /// Set to `true` by `set_param` once the `user` parameter has been inserted. username: bool, } impl Config { /// Creates a new configuration. pub fn new(host: String, port: u16) -> Config { Config { host_addr: None, host: Host::Tcp(host), port, password: None, auth_keys: None, ssl_mode: SslMode::Prefer, connect_timeout: None, channel_binding: ChannelBinding::Prefer, server_params: StartupMessageParams::default(), database: false, username: false, } } /// Sets the user to authenticate with. /// /// Required. pub fn user(&mut self, user: &str) -> &mut Config { self.set_param("user", user) } /// Returns `true` if a user has been configured, i.e. the `user` parameter /// was set through the `user` method or `set_param`. pub fn user_is_set(&self) -> bool { self.username } /// Sets the password to authenticate with. pub fn password(&mut self, password: T) -> &mut Config where T: AsRef<[u8]>, { self.password = Some(password.as_ref().to_vec()); self } /// Gets the password to authenticate with, if one has been configured with /// the `password` method. pub fn get_password(&self) -> Option<&[u8]> { self.password.as_deref() } /// Sets precomputed protocol-specific keys to authenticate with. /// When set, this option will override `password`. /// See [`AuthKeys`] for more information. 
pub fn auth_keys(&mut self, keys: AuthKeys) -> &mut Config { self.auth_keys = Some(Box::new(keys)); self } /// Gets the precomputed protocol-specific keys to authenticate with, /// if any have been configured with the `auth_keys` method. pub fn get_auth_keys(&self) -> Option { self.auth_keys.as_deref().copied() } /// Sets the name of the database to connect to. /// /// Defaults to the user. pub fn dbname(&mut self, dbname: &str) -> &mut Config { self.set_param("database", dbname) } /// Returns `true` if a database name has been configured, i.e. the `database` /// parameter was set through the `dbname` method or `set_param`. pub fn db_is_set(&self) -> bool { self.database } /// Inserts a startup parameter into `server_params`, recording whether /// `database` or `user` has been set along the way. pub fn set_param(&mut self, name: &str, value: &str) -> &mut Config { if name == "database" { self.database = true; } else if name == "user" { self.username = true; } self.server_params.insert(name, value); self } /// Sets the optional IP address stored alongside the host name. pub fn set_host_addr(&mut self, addr: IpAddr) -> &mut Config { self.host_addr = Some(addr); self } /// Gets the IP address set with `set_host_addr`, if any. pub fn get_host_addr(&self) -> Option { self.host_addr } /// Sets the SSL configuration. /// /// Defaults to `prefer`. pub fn ssl_mode(&mut self, ssl_mode: SslMode) -> &mut Config { self.ssl_mode = ssl_mode; self } /// Gets the SSL configuration. pub fn get_ssl_mode(&self) -> SslMode { self.ssl_mode } /// Gets the configured host. pub fn get_host(&self) -> &Host { &self.host } /// Gets the configured port. pub fn get_port(&self) -> u16 { self.port } /// Sets the timeout applied to socket-level connection attempts. /// /// Note that hostnames can resolve to multiple IP addresses, and this timeout will apply to each address of each /// host separately. Defaults to no limit. pub fn connect_timeout(&mut self, connect_timeout: Duration) -> &mut Config { self.connect_timeout = Some(connect_timeout); self } /// Gets the connection timeout, if one has been set with the /// `connect_timeout` method. 
pub fn get_connect_timeout(&self) -> Option<&Duration> { self.connect_timeout.as_ref() } /// Sets the channel binding behavior. /// /// Defaults to `prefer`. pub fn channel_binding(&mut self, channel_binding: ChannelBinding) -> &mut Config { self.channel_binding = channel_binding; self } /// Gets the channel binding behavior. pub fn get_channel_binding(&self) -> ChannelBinding { self.channel_binding } /// Opens a connection to a PostgreSQL database. /// /// Requires the `runtime` Cargo feature (enabled by default). pub async fn connect( &self, tls: &T, ) -> Result<(Client, Connection), Error> where T: MakeTlsConnect, { connect(tls, self).await } pub async fn tls_and_authenticate( &self, stream: S, tls: T, ) -> Result, Error> where S: AsyncRead + AsyncWrite + Unpin, T: TlsConnect, { let stream = connect_tls(stream, self.ssl_mode, tls).await?; let mut stream = StartupStream::new(stream); connect_raw::authenticate(&mut stream, self).await?; Ok(stream) } pub fn authenticate( &self, stream: &mut StartupStream, ) -> impl Future> where S: AsyncRead + AsyncWrite + Unpin, T: TlsStream + Unpin, { connect_raw::authenticate(stream, self) } } // Omit password from debug output impl fmt::Debug for Config { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { struct Redaction {} impl fmt::Debug for Redaction { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "_") } } f.debug_struct("Config") .field("password", &self.password.as_ref().map(|_| Redaction {})) .field("ssl_mode", &self.ssl_mode) .field("host", &self.host) .field("port", &self.port) .field("connect_timeout", &self.connect_timeout) .field("channel_binding", &self.channel_binding) .field("server_params", &self.server_params) .finish() } } ================================================ FILE: libs/proxy/tokio-postgres2/src/connect.rs ================================================ use std::net::IpAddr; use futures_util::TryStreamExt; use postgres_protocol2::message::backend::Message; use 
tokio::io::{AsyncRead, AsyncWrite};
use tokio::net::TcpStream;
use tokio::sync::mpsc;

use crate::client::SocketConfig;
use crate::config::{Host, SslMode};
use crate::connect_raw::StartupStream;
use crate::connect_socket::connect_socket;
use crate::tls::{MakeTlsConnect, TlsConnect};
use crate::{Client, Config, Connection, Error};

/// Opens a connection described by `config`.
///
/// Builds the TLS connector for the configured host name with
/// `tls.make_tls_connect` (failures are wrapped with `Error::tls`) and then
/// delegates to `connect_once`.
pub async fn connect(
    tls: &T,
    config: &Config,
) -> Result<(Client, Connection), Error>
where
    T: MakeTlsConnect,
{
    let hostname = match &config.host {
        Host::Tcp(host) => host.as_str(),
    };

    let tls = tls
        .make_tls_connect(hostname)
        .map_err(|e| Error::tls(e.into()))?;

    // `connect_once` already yields this function's `Result`, so return it
    // directly instead of re-wrapping `Ok`/`Err` arm by arm.
    connect_once(config.host_addr, &config.host, config.port, tls, config).await
}

/// Connects a socket with `connect_socket`, runs `Config::tls_and_authenticate`
/// on it and hands the resulting stream to `managed`.
async fn connect_once(
    host_addr: Option,
    host: &Host,
    port: u16,
    tls: T,
    config: &Config,
) -> Result<(Client, Connection), Error>
where
    T: TlsConnect,
{
    let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?;
    let stream = config.tls_and_authenticate(socket, tls).await?;
    managed(
        stream,
        host_addr,
        host.clone(),
        port,
        config.ssl_mode,
        config.connect_timeout,
    )
    .await
}

/// Waits for the backend to report ready, then builds the `Client` /
/// `Connection` pair on top of `stream`.
///
/// The connection parameters are stored in the client's `SocketConfig`.
pub async fn managed(
    mut stream: StartupStream,
    host_addr: Option,
    host: Host,
    port: u16,
    ssl_mode: SslMode,
    connect_timeout: Option,
) -> Result<(Client, Connection), Error>
where
    TlsStream: AsyncRead + AsyncWrite + Unpin,
{
    let (process_id, secret_key) = wait_until_ready(&mut stream).await?;

    let socket_config = SocketConfig {
        host_addr,
        host,
        port,
        connect_timeout,
    };

    let mut stream = stream.into_framed();
    // Hand the startup stream's write buffer over to the client.
    let write_buf = std::mem::take(stream.write_buffer_mut());

    // client -> connection is unbounded; connection -> client is bounded to 4.
    let (client_tx, conn_rx) = mpsc::unbounded_channel();
    let (conn_tx, client_rx) = mpsc::channel(4);
    let client = Client::new(
        client_tx,
        client_rx,
        socket_config,
        ssl_mode,
        process_id,
        secret_key,
        write_buf,
    );

    let connection = Connection::new(stream, conn_tx, conn_rx);

    Ok((client, connection))
}

async fn wait_until_ready(stream: &mut StartupStream) -> Result<(i32,
i32), Error> where S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin, { let mut process_id = 0; let mut secret_key = 0; loop { match stream.try_next().await.map_err(Error::io)? { Some(Message::BackendKeyData(body)) => { process_id = body.process_id(); secret_key = body.secret_key(); } // These values are currently not used by `Client`/`Connection`. Ignore them. Some(Message::ParameterStatus(_)) | Some(Message::NoticeResponse(_)) => {} Some(Message::ReadyForQuery(_)) => return Ok((process_id, secret_key)), Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), Some(_) => return Err(Error::unexpected_message()), None => return Err(Error::closed()), } } } ================================================ FILE: libs/proxy/tokio-postgres2/src/connect_raw.rs ================================================ use std::io; use std::pin::Pin; use std::task::{Context, Poll, ready}; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use futures_util::{SinkExt, Stream, TryStreamExt}; use postgres_protocol2::authentication::sasl; use postgres_protocol2::authentication::sasl::ScramSha256; use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message}; use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_util::codec::{Framed, FramedParts}; use crate::Error; use crate::codec::PostgresCodec; use crate::config::{self, AuthKeys, Config}; use crate::connection::{GC_THRESHOLD, INITIAL_CAPACITY}; use crate::maybe_tls_stream::MaybeTlsStream; use crate::tls::TlsStream; pub struct StartupStream { inner: Framed, PostgresCodec>, read_buf: BytesMut, } impl Stream for StartupStream where S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin, { type Item = io::Result; fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { // We don't use `self.inner.poll_next()` as that might over-read into the read buffer. // read 1 byte tag, 4 bytes length. 
let header = ready!(self.as_mut().poll_fill_buf_exact(cx, 5)?); let len = u32::from_be_bytes(header[1..5].try_into().unwrap()); if len < 4 { return Poll::Ready(Some(Err(std::io::Error::other( "postgres message too small", )))); } if len >= 65536 { return Poll::Ready(Some(Err(std::io::Error::other( "postgres message too large", )))); } // the tag is an additional byte. let _message = ready!(self.as_mut().poll_fill_buf_exact(cx, len as usize + 1)?); // Message::parse will remove the all the bytes from the buffer. Poll::Ready(Message::parse(&mut self.read_buf).transpose()) } } impl StartupStream where S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin, { /// Fill the buffer until it's the exact length provided. No additional data will be read from the socket. /// /// If the current buffer length is greater, nothing happens. fn poll_fill_buf_exact( self: Pin<&mut Self>, cx: &mut Context<'_>, len: usize, ) -> Poll> { let this = self.get_mut(); let mut stream = Pin::new(this.inner.get_mut()); let mut n = this.read_buf.len(); while n < len { this.read_buf.resize(len, 0); let mut buf = ReadBuf::new(&mut this.read_buf[..]); buf.set_filled(n); if stream.as_mut().poll_read(cx, &mut buf)?.is_pending() { this.read_buf.truncate(n); return Poll::Pending; } if buf.filled().len() == n { return Poll::Ready(Err(std::io::Error::new( std::io::ErrorKind::UnexpectedEof, "early eof", ))); } n = buf.filled().len(); this.read_buf.truncate(n); } Poll::Ready(Ok(&this.read_buf[..len])) } pub fn into_framed(mut self) -> Framed, PostgresCodec> { *self.inner.read_buffer_mut() = self.read_buf; self.inner } pub fn new(io: MaybeTlsStream) -> Self { let mut parts = FramedParts::new(io, PostgresCodec); parts.write_buf = BytesMut::with_capacity(INITIAL_CAPACITY); let mut inner = Framed::from_parts(parts); // This is the default already, but nice to be explicit. // We divide by two because writes will overshoot the boundary. 
// We don't want constant overshoots to cause us to constantly re-shrink the buffer. inner.set_backpressure_boundary(GC_THRESHOLD / 2); Self { inner, read_buf: BytesMut::with_capacity(INITIAL_CAPACITY), } } } pub(crate) async fn authenticate( stream: &mut StartupStream, config: &Config, ) -> Result<(), Error> where S: AsyncRead + AsyncWrite + Unpin, T: TlsStream + Unpin, { frontend::startup_message(&config.server_params, stream.inner.write_buffer_mut()) .map_err(Error::encode)?; stream.inner.flush().await.map_err(Error::io)?; match stream.try_next().await.map_err(Error::io)? { Some(Message::AuthenticationOk) => { can_skip_channel_binding(config)?; return Ok(()); } Some(Message::AuthenticationCleartextPassword) => { can_skip_channel_binding(config)?; let pass = config .password .as_ref() .ok_or_else(|| Error::config("password missing".into()))?; frontend::password_message(pass, stream.inner.write_buffer_mut()) .map_err(Error::encode)?; } Some(Message::AuthenticationSasl(body)) => { authenticate_sasl(stream, body, config).await?; } Some(Message::AuthenticationMd5Password) | Some(Message::AuthenticationKerberosV5) | Some(Message::AuthenticationScmCredential) | Some(Message::AuthenticationGss) | Some(Message::AuthenticationSspi) => { return Err(Error::authentication( "unsupported authentication method".into(), )); } Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), Some(_) => return Err(Error::unexpected_message()), None => return Err(Error::closed()), } stream.inner.flush().await.map_err(Error::io)?; match stream.try_next().await.map_err(Error::io)? 
{ Some(Message::AuthenticationOk) => Ok(()), Some(Message::ErrorResponse(body)) => Err(Error::db(body)), Some(_) => Err(Error::unexpected_message()), None => Err(Error::closed()), } } fn can_skip_channel_binding(config: &Config) -> Result<(), Error> { match config.channel_binding { config::ChannelBinding::Disable | config::ChannelBinding::Prefer => Ok(()), config::ChannelBinding::Require => Err(Error::authentication( "server did not use channel binding".into(), )), } } async fn authenticate_sasl( stream: &mut StartupStream, body: AuthenticationSaslBody, config: &Config, ) -> Result<(), Error> where S: AsyncRead + AsyncWrite + Unpin, T: TlsStream + Unpin, { let mut has_scram = false; let mut has_scram_plus = false; let mut mechanisms = body.mechanisms(); while let Some(mechanism) = mechanisms.next().map_err(Error::parse)? { match mechanism { sasl::SCRAM_SHA_256 => has_scram = true, sasl::SCRAM_SHA_256_PLUS => has_scram_plus = true, _ => {} } } let channel_binding = stream .inner .get_ref() .channel_binding() .tls_server_end_point .filter(|_| config.channel_binding != config::ChannelBinding::Disable) .map(sasl::ChannelBinding::tls_server_end_point); let (channel_binding, mechanism) = if has_scram_plus { match channel_binding { Some(channel_binding) => (channel_binding, sasl::SCRAM_SHA_256_PLUS), None => (sasl::ChannelBinding::unsupported(), sasl::SCRAM_SHA_256), } } else if has_scram { match channel_binding { Some(_) => (sasl::ChannelBinding::unrequested(), sasl::SCRAM_SHA_256), None => (sasl::ChannelBinding::unsupported(), sasl::SCRAM_SHA_256), } } else { return Err(Error::authentication("unsupported SASL mechanism".into())); }; if mechanism != sasl::SCRAM_SHA_256_PLUS { can_skip_channel_binding(config)?; } let mut scram = if let Some(AuthKeys::ScramSha256(keys)) = config.get_auth_keys() { ScramSha256::new_with_keys(keys, channel_binding) } else if let Some(password) = config.get_password() { ScramSha256::new(password, channel_binding) } else { return 
Err(Error::config("password or auth keys missing".into())); }; frontend::sasl_initial_response(mechanism, scram.message(), stream.inner.write_buffer_mut()) .map_err(Error::encode)?; stream.inner.flush().await.map_err(Error::io)?; let body = match stream.try_next().await.map_err(Error::io)? { Some(Message::AuthenticationSaslContinue(body)) => body, Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), Some(_) => return Err(Error::unexpected_message()), None => return Err(Error::closed()), }; scram .update(body.data()) .await .map_err(|e| Error::authentication(e.into()))?; frontend::sasl_response(scram.message(), stream.inner.write_buffer_mut()) .map_err(Error::encode)?; stream.inner.flush().await.map_err(Error::io)?; let body = match stream.try_next().await.map_err(Error::io)? { Some(Message::AuthenticationSaslFinal(body)) => body, Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), Some(_) => return Err(Error::unexpected_message()), None => return Err(Error::closed()), }; scram .finish(body.data()) .map_err(|e| Error::authentication(e.into()))?; Ok(()) } ================================================ FILE: libs/proxy/tokio-postgres2/src/connect_socket.rs ================================================ use std::future::Future; use std::io; use std::net::{IpAddr, SocketAddr}; use std::time::Duration; use tokio::net::{self, TcpStream}; use tokio::time; use crate::Error; use crate::config::Host; pub(crate) async fn connect_socket( host_addr: Option, host: &Host, port: u16, connect_timeout: Option, ) -> Result { match host { Host::Tcp(host) => { let addrs = match host_addr { Some(addr) => vec![SocketAddr::new(addr, port)], None => net::lookup_host((&**host, port)) .await .map_err(Error::connect)? 
.collect(), }; let mut last_err = None; for addr in addrs { let stream = match connect_with_timeout(TcpStream::connect(addr), connect_timeout).await { Ok(stream) => stream, Err(e) => { last_err = Some(e); continue; } }; stream.set_nodelay(true).map_err(Error::connect)?; return Ok(stream); } Err(last_err.unwrap_or_else(|| { Error::connect(io::Error::new( io::ErrorKind::InvalidInput, "could not resolve any addresses", )) })) } } } async fn connect_with_timeout(connect: F, timeout: Option) -> Result where F: Future>, { match timeout { Some(timeout) => match time::timeout(timeout, connect).await { Ok(Ok(socket)) => Ok(socket), Ok(Err(e)) => Err(Error::connect(e)), Err(_) => Err(Error::connect(io::Error::new( io::ErrorKind::TimedOut, "connection timed out", ))), }, None => match connect.await { Ok(socket) => Ok(socket), Err(e) => Err(Error::connect(e)), }, } } ================================================ FILE: libs/proxy/tokio-postgres2/src/connect_tls.rs ================================================ use bytes::BytesMut; use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; use crate::Error; use crate::config::SslMode; use crate::maybe_tls_stream::MaybeTlsStream; use crate::tls::TlsConnect; use crate::tls::private::ForcePrivateApi; pub async fn connect_tls( mut stream: S, mode: SslMode, tls: T, ) -> Result, Error> where S: AsyncRead + AsyncWrite + Unpin, T: TlsConnect, { match mode { SslMode::Disable => return Ok(MaybeTlsStream::Raw(stream)), SslMode::Prefer if !tls.can_connect(ForcePrivateApi) => { return Ok(MaybeTlsStream::Raw(stream)); } SslMode::Prefer | SslMode::Require => {} } let mut buf = BytesMut::new(); frontend::ssl_request(&mut buf); stream.write_all(&buf).await.map_err(Error::io)?; let mut buf = [0]; stream.read_exact(&mut buf).await.map_err(Error::io)?; if buf[0] != b'S' { if SslMode::Require == mode { return Err(Error::tls("server does not support TLS".into())); } else { return 
Ok(MaybeTlsStream::Raw(stream)); } } let stream = tls .connect(stream) .await .map_err(|e| Error::tls(e.into()))?; Ok(MaybeTlsStream::Tls(stream)) } ================================================ FILE: libs/proxy/tokio-postgres2/src/connection.rs ================================================ use std::future::Future; use std::pin::Pin; use std::task::{Context, Poll}; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use futures_util::{Sink, StreamExt, ready}; use postgres_protocol2::message::backend::{Message, NoticeResponseBody}; use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc; use tokio_util::codec::Framed; use tokio_util::sync::PollSender; use tracing::trace; use crate::Error; use crate::codec::{ BackendMessage, BackendMessages, FrontendMessage, PostgresCodec, RecordNotices, }; use crate::maybe_tls_stream::MaybeTlsStream; #[derive(PartialEq, Debug)] enum State { Active, Closing, } /// A connection to a PostgreSQL database. /// /// This is one half of what is returned when a new connection is established. It performs the actual IO with the /// server, and should generally be spawned off onto an executor to run in the background. /// /// `Connection` implements `Future`, and only resolves when the connection is closed, either because a fatal error has /// occurred, or because its associated `Client` has dropped and all outstanding work has completed. #[must_use = "futures do nothing unless polled"] pub struct Connection { stream: Framed, PostgresCodec>, sender: PollSender, receiver: mpsc::UnboundedReceiver, notices: Option, pending_response: Option, state: State, } pub const INITIAL_CAPACITY: usize = 2 * 1024; pub const GC_THRESHOLD: usize = 16 * 1024; /// Gargabe collect the [`BytesMut`] if it has too much spare capacity. pub fn gc_bytesmut(buf: &mut BytesMut) { // We use a different mode to shrink the buf when above the threshold. 
// When above the threshold, we only re-allocate when the buf has 2x spare capacity. let reclaim = GC_THRESHOLD.checked_sub(buf.len()).unwrap_or(buf.len()); // `try_reclaim` tries to get the capacity from any shared `BytesMut`s, // before then comparing the length against the capacity. if buf.try_reclaim(reclaim) { let capacity = usize::max(buf.len(), INITIAL_CAPACITY); // Allocate a new `BytesMut` so that we deallocate the old version. let mut new = BytesMut::with_capacity(capacity); new.extend_from_slice(buf); *buf = new; } } pub enum Never {} impl Connection where S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin, { pub(crate) fn new( stream: Framed, PostgresCodec>, sender: mpsc::Sender, receiver: mpsc::UnboundedReceiver, ) -> Connection { Connection { stream, sender: PollSender::new(sender), receiver, notices: None, pending_response: None, state: State::Active, } } /// Read and process messages from the connection to postgres. /// client <- postgres fn poll_read(&mut self, cx: &mut Context<'_>) -> Poll> { loop { let messages = match self.pending_response.take() { Some(messages) => messages, None => { let message = match self.stream.poll_next_unpin(cx) { Poll::Pending => return Poll::Pending, Poll::Ready(None) => return Poll::Ready(Err(Error::closed())), Poll::Ready(Some(Err(e))) => return Poll::Ready(Err(Error::io(e))), Poll::Ready(Some(Ok(message))) => message, }; match message { BackendMessage::Async(Message::NoticeResponse(body)) => { self.handle_notice(body)?; continue; } BackendMessage::Async(_) => continue, BackendMessage::Normal { messages, ready } => { // if we read a ReadyForQuery from postgres, let's try GC the read buffer. 
if ready { gc_bytesmut(self.stream.read_buffer_mut()); } messages } } } }; match self.sender.poll_reserve(cx) { Poll::Ready(Ok(())) => { let _ = self.sender.send_item(messages); } Poll::Ready(Err(_)) => { return Poll::Ready(Err(Error::closed())); } Poll::Pending => { self.pending_response = Some(messages); trace!("poll_read: waiting on sender"); return Poll::Pending; } } } } fn handle_notice(&mut self, body: NoticeResponseBody) -> Result<(), Error> { let Some(notices) = &mut self.notices else { return Ok(()); }; let mut fields = body.fields(); while let Some(field) = fields.next().map_err(Error::parse)? { // loop until we find the message field if field.type_() == b'M' { // if the message field is within the limit, send it. if let Some(new_limit) = notices.limit.checked_sub(field.value().len()) { match notices.sender.send(field.value().into()) { // set the new limit. Ok(()) => notices.limit = new_limit, // closed. Err(_) => self.notices = None, } } break; } } Ok(()) } /// Fetch the next client request and enqueue the response sender. fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll> { if self.receiver.is_closed() { return Poll::Ready(None); } match self.receiver.poll_recv(cx) { Poll::Ready(Some(request)) => { trace!("polled new request"); Poll::Ready(Some(request)) } Poll::Ready(None) => Poll::Ready(None), Poll::Pending => Poll::Pending, } } /// Process client requests and write them to the postgres connection, flushing if necessary. /// client -> postgres fn poll_write(&mut self, cx: &mut Context<'_>) -> Poll> { loop { if Pin::new(&mut self.stream) .poll_ready(cx) .map_err(Error::io)? .is_pending() { trace!("poll_write: waiting on socket"); // poll_ready is self-flushing. 
return Poll::Pending; } match self.poll_request(cx) { // send the message to postgres Poll::Ready(Some(FrontendMessage::Raw(request))) => { Pin::new(&mut self.stream) .start_send(request) .map_err(Error::io)?; } Poll::Ready(Some(FrontendMessage::RecordNotices(notices))) => { self.notices = Some(notices) } // No more messages from the client, and no more responses to wait for. // Send a terminate message to postgres Poll::Ready(None) => { trace!("poll_write: at eof, terminating"); frontend::terminate(self.stream.write_buffer_mut()); trace!("poll_write: sent eof, closing"); trace!("poll_write: done"); return Poll::Ready(Ok(())); } // Still waiting for a message from the client. Poll::Pending => { trace!("poll_write: waiting on request"); ready!(self.poll_flush(cx))?; return Poll::Pending; } } } } fn poll_flush(&mut self, cx: &mut Context<'_>) -> Poll> { match Pin::new(&mut self.stream) .poll_flush(cx) .map_err(Error::io)? { Poll::Ready(()) => { trace!("poll_flush: flushed"); // Since our codec prefers to share the buffer with the `Client`, // if we don't release our share, then the `Client` would have to re-alloc // the buffer when they next use it. debug_assert!(self.stream.write_buffer().is_empty()); *self.stream.write_buffer_mut() = BytesMut::new(); Poll::Ready(Ok(())) } Poll::Pending => { trace!("poll_flush: waiting on socket"); Poll::Pending } } } fn poll_shutdown(&mut self, cx: &mut Context<'_>) -> Poll> { match Pin::new(&mut self.stream) .poll_close(cx) .map_err(Error::io)? { Poll::Ready(()) => { trace!("poll_shutdown: complete"); Poll::Ready(Ok(())) } Poll::Pending => { trace!("poll_shutdown: waiting on socket"); Poll::Pending } } } fn poll_message(&mut self, cx: &mut Context<'_>) -> Poll>> { if self.state != State::Closing { // if the state is still active, try read from and write to postgres. let Poll::Pending = self.poll_read(cx)?; if self.poll_write(cx)?.is_ready() { self.state = State::Closing; } // poll_read returned Pending. 
// poll_write returned Pending or Ready(()). // if poll_write returned Ready(()), then we are waiting to read more data from postgres. if self.state != State::Closing { return Poll::Pending; } } match self.poll_shutdown(cx) { Poll::Ready(Ok(())) => Poll::Ready(None), Poll::Ready(Err(e)) => Poll::Ready(Some(Err(e))), Poll::Pending => Poll::Pending, } } } impl Future for Connection where S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin, { type Output = Result<(), Error>; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { match self.poll_message(cx)? { Poll::Ready(None) => Poll::Ready(Ok(())), Poll::Pending => Poll::Pending, } } } ================================================ FILE: libs/proxy/tokio-postgres2/src/error/mod.rs ================================================ //! Errors. use std::error::{self, Error as _Error}; use std::{fmt, io}; use fallible_iterator::FallibleIterator; use postgres_protocol2::message::backend::{ErrorFields, ErrorResponseBody}; pub use self::sqlstate::*; #[allow(clippy::unreadable_literal)] pub mod sqlstate; /// The severity of a Postgres error or notice. 
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Severity {
    /// `PANIC`
    Panic,
    /// `FATAL`
    Fatal,
    /// `ERROR`
    Error,
    /// `WARNING`
    Warning,
    /// `NOTICE`
    Notice,
    /// `DEBUG`
    Debug,
    /// `INFO`
    Info,
    /// `LOG`
    Log,
}

impl fmt::Display for Severity {
    /// Writes the upper-case keyword for this severity.
    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt.write_str(match self {
            Severity::Panic => "PANIC",
            Severity::Fatal => "FATAL",
            Severity::Error => "ERROR",
            Severity::Warning => "WARNING",
            Severity::Notice => "NOTICE",
            Severity::Debug => "DEBUG",
            Severity::Info => "INFO",
            Severity::Log => "LOG",
        })
    }
}
None; let mut column = None; let mut datatype = None; let mut constraint = None; let mut file = None; let mut line = None; let mut routine = None; while let Some(field) = fields.next()? { match field.type_() { b'S' => severity = Some(field.value().to_owned()), b'C' => code = Some(SqlState::from_code(field.value())), b'M' => message = Some(field.value().to_owned()), b'D' => detail = Some(field.value().to_owned()), b'H' => hint = Some(field.value().to_owned()), b'P' => { normal_position = Some(field.value().parse::().map_err(|_| { io::Error::new( io::ErrorKind::InvalidInput, "`P` field did not contain an integer", ) })?); } b'p' => { internal_position = Some(field.value().parse::().map_err(|_| { io::Error::new( io::ErrorKind::InvalidInput, "`p` field did not contain an integer", ) })?); } b'q' => internal_query = Some(field.value().to_owned()), b'W' => where_ = Some(field.value().to_owned()), b's' => schema = Some(field.value().to_owned()), b't' => table = Some(field.value().to_owned()), b'c' => column = Some(field.value().to_owned()), b'd' => datatype = Some(field.value().to_owned()), b'n' => constraint = Some(field.value().to_owned()), b'F' => file = Some(field.value().to_owned()), b'L' => { line = Some(field.value().parse::().map_err(|_| { io::Error::new( io::ErrorKind::InvalidInput, "`L` field did not contain an integer", ) })?); } b'R' => routine = Some(field.value().to_owned()), b'V' => { parsed_severity = Some(Severity::from_str(field.value()).ok_or_else(|| { io::Error::new( io::ErrorKind::InvalidInput, "`V` field contained an invalid value", ) })?); } _ => {} } } Ok(DbError { severity: severity .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`S` field missing"))?, parsed_severity, code: code .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`C` field missing"))?, message: message .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`M` field missing"))?, detail, hint, position: match normal_position { Some(position) => 
Some(ErrorPosition::Original(position)), None => match internal_position { Some(position) => Some(ErrorPosition::Internal { position, query: internal_query.ok_or_else(|| { io::Error::new( io::ErrorKind::InvalidInput, "`q` field missing but `p` field present", ) })?, }), None => None, }, }, where_, schema, table, column, datatype, constraint, file, line, routine, }) } /// The field contents are ERROR, FATAL, or PANIC (in an error message), /// or WARNING, NOTICE, DEBUG, INFO, or LOG (in a notice message), or a /// localized translation of one of these. pub fn severity(&self) -> &str { &self.severity } /// A parsed, nonlocalized version of `severity`. (PostgreSQL 9.6+) pub fn parsed_severity(&self) -> Option { self.parsed_severity } /// The SQLSTATE code for the error. pub fn code(&self) -> &SqlState { &self.code } /// The primary human-readable error message. /// /// This should be accurate but terse (typically one line). pub fn message(&self) -> &str { &self.message } /// An optional secondary error message carrying more detail about the /// problem. /// /// Might run to multiple lines. pub fn detail(&self) -> Option<&str> { self.detail.as_deref() } /// An optional suggestion what to do about the problem. /// /// This is intended to differ from `detail` in that it offers advice /// (potentially inappropriate) rather than hard facts. Might run to /// multiple lines. pub fn hint(&self) -> Option<&str> { self.hint.as_deref() } /// An optional error cursor position into either the original query string /// or an internally generated query. pub fn position(&self) -> Option<&ErrorPosition> { self.position.as_ref() } /// An indication of the context in which the error occurred. /// /// Presently this includes a call stack traceback of active procedural /// language functions and internally-generated queries. The trace is one /// entry per line, most recent first. 
pub fn where_(&self) -> Option<&str> { self.where_.as_deref() } /// If the error was associated with a specific database object, the name /// of the schema containing that object, if any. (PostgreSQL 9.3+) pub fn schema(&self) -> Option<&str> { self.schema.as_deref() } /// If the error was associated with a specific table, the name of the /// table. (Refer to the schema name field for the name of the table's /// schema.) (PostgreSQL 9.3+) pub fn table(&self) -> Option<&str> { self.table.as_deref() } /// If the error was associated with a specific table column, the name of /// the column. /// /// (Refer to the schema and table name fields to identify the table.) /// (PostgreSQL 9.3+) pub fn column(&self) -> Option<&str> { self.column.as_deref() } /// If the error was associated with a specific data type, the name of the /// data type. (Refer to the schema name field for the name of the data /// type's schema.) (PostgreSQL 9.3+) pub fn datatype(&self) -> Option<&str> { self.datatype.as_deref() } /// If the error was associated with a specific constraint, the name of the /// constraint. /// /// Refer to fields listed above for the associated table or domain. /// (For this purpose, indexes are treated as constraints, even if they /// weren't created with constraint syntax.) (PostgreSQL 9.3+) pub fn constraint(&self) -> Option<&str> { self.constraint.as_deref() } /// The file name of the source-code location where the error was reported. pub fn file(&self) -> Option<&str> { self.file.as_deref() } /// The line number of the source-code location where the error was /// reported. pub fn line(&self) -> Option { self.line } /// The name of the source-code routine reporting the error. 
pub fn routine(&self) -> Option<&str> { self.routine.as_deref() } } impl fmt::Display for DbError { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { write!(fmt, "{}: {}", self.severity, self.message)?; if let Some(detail) = &self.detail { write!(fmt, "\nDETAIL: {detail}")?; } if let Some(hint) = &self.hint { write!(fmt, "\nHINT: {hint}")?; } Ok(()) } } impl error::Error for DbError {} /// Represents the position of an error in a query. #[derive(Clone, PartialEq, Eq, Debug)] pub enum ErrorPosition { /// A position in the original query. Original(u32), /// A position in an internally generated query. Internal { /// The byte position. position: u32, /// A query generated by the Postgres server. query: String, }, } #[derive(Debug, PartialEq)] enum Kind { Io, UnexpectedMessage, Tls, ToSql(usize), FromSql(usize), Column(String), Closed, Db, Parse, Encode, Authentication, Config, Connect, Timeout, } struct ErrorInner { kind: Kind, cause: Option>, } /// An error communicating with the Postgres server. 
pub struct Error(Box); impl fmt::Debug for Error { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { fmt.debug_struct("Error") .field("kind", &self.0.kind) .field("cause", &self.0.cause) .finish() } } impl fmt::Display for Error { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match &self.0.kind { Kind::Io => fmt.write_str("error communicating with the server")?, Kind::UnexpectedMessage => fmt.write_str("unexpected message from server")?, Kind::Tls => fmt.write_str("error performing TLS handshake")?, Kind::ToSql(idx) => write!(fmt, "error serializing parameter {idx}")?, Kind::FromSql(idx) => write!(fmt, "error deserializing column {idx}")?, Kind::Column(column) => write!(fmt, "invalid column `{column}`")?, Kind::Closed => fmt.write_str("connection closed")?, Kind::Db => fmt.write_str("db error")?, Kind::Parse => fmt.write_str("error parsing response from server")?, Kind::Encode => fmt.write_str("error encoding message to server")?, Kind::Authentication => fmt.write_str("authentication error")?, Kind::Config => fmt.write_str("invalid configuration")?, Kind::Connect => fmt.write_str("error connecting to server")?, Kind::Timeout => fmt.write_str("timeout waiting for server")?, }; if let Some(ref cause) = self.0.cause { write!(fmt, ": {cause}")?; } Ok(()) } } impl error::Error for Error { fn source(&self) -> Option<&(dyn error::Error + 'static)> { self.0.cause.as_ref().map(|e| &**e as _) } } impl Error { /// Consumes the error, returning its cause. pub fn into_source(self) -> Option> { self.0.cause } /// Returns the source of this error if it was a `DbError`. /// /// This is a simple convenience method. pub fn as_db_error(&self) -> Option<&DbError> { self.source().and_then(|e| e.downcast_ref::()) } /// Determines if the error was associated with closed connection. pub fn is_closed(&self) -> bool { self.0.kind == Kind::Closed } /// Returns the SQLSTATE error code associated with the error. 
/// A SQLSTATE error code: five bytes as reported by the server.
#[derive(PartialEq, Eq, Clone, Debug)]
pub struct SqlState([u8; 5]);

impl SqlState {
    /// Creates a `SqlState` from its error code.
    ///
    /// A string that is not exactly five bytes long yields the code `00000`.
    pub fn from_code(s: &str) -> SqlState {
        // The slice-to-array conversion succeeds only for exactly 5 bytes.
        SqlState(<[u8; 5]>::try_from(s.as_bytes()).unwrap_or([b'0'; 5]))
    }

    /// Returns the error code corresponding to the `SqlState`.
    pub fn code(&self) -> &str {
        let raw: &[u8] = &self.0;
        std::str::from_utf8(raw).unwrap()
    }

    // Class 08 - Connection Exception

    /// 08000
    pub const CONNECTION_EXCEPTION: SqlState = SqlState(*b"08000");

    /// 08003
    pub const CONNECTION_DOES_NOT_EXIST: SqlState = SqlState(*b"08003");

    /// 08006
    pub const CONNECTION_FAILURE: SqlState = SqlState(*b"08006");

    /// 08001
    pub const SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION: SqlState = SqlState(*b"08001");

    /// 08P01
    pub const PROTOCOL_VIOLATION: SqlState = SqlState(*b"08P01");

    // Class 22 - Data Exception

    /// 22023
    pub const INVALID_PARAMETER_VALUE: SqlState = SqlState(*b"22023");

    // Class 3D - Invalid Catalog Name

    /// 3D000
    pub const INVALID_CATALOG_NAME: SqlState = SqlState(*b"3D000");

    // Class 3F - Invalid Schema Name

    /// 3F000
    pub const INVALID_SCHEMA_NAME: SqlState = SqlState(*b"3F000");

    // Class 40 - Transaction Rollback

    /// 40001
    pub const T_R_SERIALIZATION_FAILURE: SqlState = SqlState(*b"40001");

    // Class 42 - Syntax Error or Access Rule Violation

    /// 42601
    pub const SYNTAX_ERROR: SqlState = SqlState(*b"42601");

    // Class 53 - Insufficient Resources

    /// 53200
    pub const OUT_OF_MEMORY: SqlState = SqlState(*b"53200");

    /// 53300
    pub const TOO_MANY_CONNECTIONS: SqlState = SqlState(*b"53300");

    // Class 57 - Operator Intervention

    /// 57014
    pub const QUERY_CANCELED: SqlState = SqlState(*b"57014");
}
private { pub trait Sealed {} } /// A trait allowing abstraction over connections and transactions. /// /// This trait is "sealed", and cannot be implemented outside of this crate. pub trait GenericClient: private::Sealed { /// Like `Client::query_raw_txt`. async fn query_raw_txt( &mut self, statement: &str, params: I, ) -> Result, Error> where S: AsRef + Sync + Send, I: IntoIterator> + Sync + Send, I::IntoIter: ExactSizeIterator + Sync + Send; } impl private::Sealed for Client {} impl GenericClient for Client { async fn query_raw_txt( &mut self, statement: &str, params: I, ) -> Result, Error> where S: AsRef + Sync + Send, I: IntoIterator> + Sync + Send, I::IntoIter: ExactSizeIterator + Sync + Send, { self.query_raw_txt(statement, params).await } } impl private::Sealed for Transaction<'_> {} impl GenericClient for Transaction<'_> { async fn query_raw_txt( &mut self, statement: &str, params: I, ) -> Result, Error> where S: AsRef + Sync + Send, I: IntoIterator> + Sync + Send, I::IntoIter: ExactSizeIterator + Sync + Send, { self.query_raw_txt(statement, params).await } } ================================================ FILE: libs/proxy/tokio-postgres2/src/lib.rs ================================================ //! An asynchronous, pipelined, PostgreSQL client. 
#![warn(clippy::all)] use postgres_protocol2::message::backend::ReadyForQueryBody; pub use crate::cancel_token::{CancelToken, RawCancelToken}; pub use crate::client::{Client, SocketConfig}; pub use crate::config::Config; pub use crate::connection::Connection; pub use crate::error::Error; pub use crate::generic_client::GenericClient; pub use crate::query::RowStream; pub use crate::row::{Row, SimpleQueryRow}; pub use crate::simple_query::SimpleQueryStream; pub use crate::statement::{Column, Statement}; pub use crate::tls::NoTls; pub use crate::transaction::Transaction; pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; /// After executing a query, the connection will be in one of these states #[derive(Clone, Copy, Debug, PartialEq)] #[repr(u8)] pub enum ReadyForQueryStatus { /// Connection state is unknown Unknown, /// Connection is idle (no transactions) Idle = b'I', /// Connection is in a transaction block Transaction = b'T', /// Connection is in a failed transaction block FailedTransaction = b'E', } impl From for ReadyForQueryStatus { fn from(value: ReadyForQueryBody) -> Self { match value.status() { b'I' => Self::Idle, b'T' => Self::Transaction, b'E' => Self::FailedTransaction, _ => Self::Unknown, } } } mod cancel_query; mod cancel_query_raw; mod cancel_token; mod client; mod codec; pub mod config; pub mod connect; pub mod connect_raw; mod connect_socket; mod connect_tls; mod connection; pub mod error; mod generic_client; pub mod maybe_tls_stream; mod prepare; mod query; pub mod row; mod simple_query; mod statement; pub mod tls; mod transaction; mod transaction_builder; pub mod types; /// An asynchronous notification. #[derive(Clone, Debug)] pub struct Notification { process_id: i32, channel: String, payload: String, } impl Notification { /// The process ID of the notifying backend process. pub fn process_id(&self) -> i32 { self.process_id } /// The name of the channel that the notify has been raised on. 
pub fn channel(&self) -> &str { &self.channel } /// The "payload" string passed from the notifying process. pub fn payload(&self) -> &str { &self.payload } } /// Message returned by the `SimpleQuery` stream. #[derive(Debug)] #[non_exhaustive] pub enum SimpleQueryMessage { /// A row of data. Row(SimpleQueryRow), /// A statement in the query has completed. /// /// The number of rows modified or selected is returned. CommandComplete(u64), } ================================================ FILE: libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs ================================================ //! MaybeTlsStream. //! //! Represents a stream that may or may not be encrypted with TLS. use std::io; use std::pin::Pin; use std::task::{Context, Poll}; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use crate::tls::{ChannelBinding, TlsStream}; /// A stream that may or may not be encrypted with TLS. pub enum MaybeTlsStream { /// An unencrypted stream. Raw(S), /// An encrypted stream. Tls(T), } impl AsyncRead for MaybeTlsStream where S: AsyncRead + Unpin, T: AsyncRead + Unpin, { fn poll_read( mut self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &mut ReadBuf<'_>, ) -> Poll> { match &mut *self { MaybeTlsStream::Raw(s) => Pin::new(s).poll_read(cx, buf), MaybeTlsStream::Tls(s) => Pin::new(s).poll_read(cx, buf), } } } impl AsyncWrite for MaybeTlsStream where S: AsyncWrite + Unpin, T: AsyncWrite + Unpin, { fn poll_write( mut self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &[u8], ) -> Poll> { match &mut *self { MaybeTlsStream::Raw(s) => Pin::new(s).poll_write(cx, buf), MaybeTlsStream::Tls(s) => Pin::new(s).poll_write(cx, buf), } } fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { match &mut *self { MaybeTlsStream::Raw(s) => Pin::new(s).poll_flush(cx), MaybeTlsStream::Tls(s) => Pin::new(s).poll_flush(cx), } } fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { match &mut *self { MaybeTlsStream::Raw(s) => Pin::new(s).poll_shutdown(cx), 
MaybeTlsStream::Tls(s) => Pin::new(s).poll_shutdown(cx), } } } impl TlsStream for MaybeTlsStream where S: AsyncRead + AsyncWrite + Unpin, T: TlsStream + Unpin, { fn channel_binding(&self) -> ChannelBinding { match self { MaybeTlsStream::Raw(_) => ChannelBinding::none(), MaybeTlsStream::Tls(s) => s.channel_binding(), } } } ================================================ FILE: libs/proxy/tokio-postgres2/src/prepare.rs ================================================ use bytes::BytesMut; use fallible_iterator::FallibleIterator; use postgres_protocol2::IsNull; use postgres_protocol2::message::backend::{Message, RowDescriptionBody}; use postgres_protocol2::message::frontend; use postgres_protocol2::types::oid_to_sql; use postgres_types2::Format; use crate::client::{CachedTypeInfo, PartialQuery, Responses}; use crate::types::{Kind, Oid, Type}; use crate::{Column, Error, Row, Statement}; pub(crate) const TYPEINFO_QUERY: &str = "\ SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid FROM pg_catalog.pg_type t LEFT OUTER JOIN pg_catalog.pg_range r ON r.rngtypid = t.oid INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid WHERE t.oid = $1 "; /// we need to make sure we close this prepared statement. 
struct CloseStmt<'a, 'b> { client: Option<&'a mut PartialQuery<'b>>, name: &'static str, } impl<'a> CloseStmt<'a, '_> { fn close(mut self) -> Result<&'a mut Responses, Error> { let client = self.client.take().unwrap(); client.send_with_flush(|buf| { frontend::close(b'S', self.name, buf).map_err(Error::encode)?; Ok(()) }) } } impl Drop for CloseStmt<'_, '_> { fn drop(&mut self) { if let Some(client) = self.client.take() { let _ = client.send_with_flush(|buf| { frontend::close(b'S', self.name, buf).map_err(Error::encode)?; Ok(()) }); } } } async fn prepare_typecheck( client: &mut PartialQuery<'_>, name: &'static str, query: &str, ) -> Result { let responses = client.send_with_flush(|buf| { frontend::parse(name, query, [], buf).map_err(Error::encode)?; frontend::describe(b'S', name, buf).map_err(Error::encode)?; Ok(()) })?; match responses.next().await? { Message::ParseComplete => {} _ => return Err(Error::unexpected_message()), } match responses.next().await? { Message::ParameterDescription(_) => {} _ => return Err(Error::unexpected_message()), }; let row_description = match responses.next().await? { Message::RowDescription(body) => Some(body), Message::NoData => None, _ => return Err(Error::unexpected_message()), }; let mut columns = vec![]; if let Some(row_description) = row_description { let mut it = row_description.fields(); while let Some(field) = it.next().map_err(Error::parse)? 
{ let type_ = Type::from_oid(field.type_oid()).ok_or_else(Error::unexpected_message)?; let column = Column::new(field.name().to_string(), type_, field); columns.push(column); } } Ok(Statement::new(name, columns)) } fn try_from_cache(typecache: &CachedTypeInfo, oid: Oid) -> Option { if let Some(type_) = Type::from_oid(oid) { return Some(type_); } if let Some(type_) = typecache.types.get(&oid) { return Some(type_.clone()); }; None } pub async fn parse_row_description( client: &mut PartialQuery<'_>, typecache: &mut CachedTypeInfo, row_description: Option, ) -> Result, Error> { let mut columns = vec![]; if let Some(row_description) = row_description { let mut it = row_description.fields(); while let Some(field) = it.next().map_err(Error::parse)? { let type_ = try_from_cache(typecache, field.type_oid()).unwrap_or(Type::UNKNOWN); let column = Column::new(field.name().to_string(), type_, field); columns.push(column); } } let all_known = columns.iter().all(|c| c.type_ != Type::UNKNOWN); if all_known { // all known, return early. return Ok(columns); } let typeinfo = "neon_proxy_typeinfo"; // make sure to close the typeinfo statement before exiting. let mut guard = CloseStmt { name: typeinfo, client: None, }; let client = guard.client.insert(client); // get the typeinfo statement. let stmt = prepare_typecheck(client, typeinfo, TYPEINFO_QUERY).await?; for column in &mut columns { column.type_ = get_type(client, typecache, &stmt, column.type_oid()).await?; } // cancel the close guard. let responses = guard.close()?; match responses.next().await? 
{ Message::CloseComplete => {} _ => return Err(Error::unexpected_message()), } Ok(columns) } async fn get_type( client: &mut PartialQuery<'_>, typecache: &mut CachedTypeInfo, stmt: &Statement, mut oid: Oid, ) -> Result { let mut stack = vec![]; let mut type_ = loop { if let Some(type_) = try_from_cache(typecache, oid) { break type_; } let row = exec(client, stmt, oid).await?; if stack.len() > 8 { return Err(Error::unexpected_message()); } let name: String = row.try_get(0)?; let type_: i8 = row.try_get(1)?; let elem_oid: Oid = row.try_get(2)?; let rngsubtype: Option = row.try_get(3)?; let basetype: Oid = row.try_get(4)?; let schema: String = row.try_get(5)?; let relid: Oid = row.try_get(6)?; let kind = if type_ == b'e' as i8 { Kind::Enum } else if type_ == b'p' as i8 { Kind::Pseudo } else if basetype != 0 { Kind::Domain(basetype) } else if elem_oid != 0 { stack.push((name, oid, schema)); oid = elem_oid; continue; } else if relid != 0 { Kind::Composite(relid) } else if let Some(rngsubtype) = rngsubtype { Kind::Range(rngsubtype) } else { Kind::Simple }; let type_ = Type::new(name, oid, kind, schema); typecache.types.insert(oid, type_.clone()); break type_; }; while let Some((name, oid, schema)) = stack.pop() { type_ = Type::new(name, oid, Kind::Array(type_), schema); typecache.types.insert(oid, type_.clone()); } Ok(type_) } /// exec the typeinfo statement returning one row. async fn exec( client: &mut PartialQuery<'_>, statement: &Statement, param: Oid, ) -> Result { let responses = client.send_with_flush(|buf| { encode_bind(statement, param, "", buf); frontend::execute("", 0, buf).map_err(Error::encode)?; Ok(()) })?; match responses.next().await? { Message::BindComplete => {} _ => return Err(Error::unexpected_message()), } let row = match responses.next().await? { Message::DataRow(body) => Row::new(statement.clone(), body, Format::Binary)?, _ => return Err(Error::unexpected_message()), }; match responses.next().await? 
{ Message::CommandComplete(_) => {} _ => return Err(Error::unexpected_message()), }; Ok(row) } fn encode_bind(statement: &Statement, param: Oid, portal: &str, buf: &mut BytesMut) { frontend::bind( portal, statement.name(), [Format::Binary as i16], [param], |param, buf| { oid_to_sql(param, buf); Ok(IsNull::No) }, [Format::Binary as i16], buf, ) .unwrap(); } ================================================ FILE: libs/proxy/tokio-postgres2/src/query.rs ================================================ use std::pin::Pin; use std::task::{Context, Poll}; use bytes::BufMut; use futures_util::{Stream, ready}; use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; use postgres_types2::Format; use crate::client::{CachedTypeInfo, InnerClient, Responses}; use crate::{Error, ReadyForQueryStatus, Row, Statement}; pub async fn query_txt<'a, S, I>( client: &'a mut InnerClient, typecache: &mut CachedTypeInfo, query: &str, params: I, ) -> Result, Error> where S: AsRef, I: IntoIterator>, I::IntoIter: ExactSizeIterator, { let params = params.into_iter(); let mut client = client.start()?; // Flow: // 1. Parse the query // 2. Inspect the row description for OIDs // 3. If there's any OIDs we don't already know about, perform the typeinfo routine // 4. Execute the query // 5. Sync. // // The typeinfo routine: // 1. Parse the typeinfo query // 2. Execute the query on each OID // 3. If the result does not match an OID we know, repeat 2. // parse the query and get type info let responses = client.send_with_flush(|buf| { frontend::parse( "", // unnamed prepared statement query, // query to parse std::iter::empty(), // give no type info buf, ) .map_err(Error::encode)?; frontend::describe(b'S', "", buf).map_err(Error::encode)?; Ok(()) })?; match responses.next().await? { Message::ParseComplete => {} _ => return Err(Error::unexpected_message()), } match responses.next().await? 
{ Message::ParameterDescription(_) => {} _ => return Err(Error::unexpected_message()), }; let row_description = match responses.next().await? { Message::RowDescription(body) => Some(body), Message::NoData => None, _ => return Err(Error::unexpected_message()), }; let columns = crate::prepare::parse_row_description(&mut client, typecache, row_description).await?; let responses = client.send_with_sync(|buf| { // Bind, pass params as text, retrieve as text match frontend::bind( "", // empty string selects the unnamed portal "", // unnamed prepared statement std::iter::empty(), // all parameters use the default format (text) params, |param, buf| match param { Some(param) => { buf.put_slice(param.as_ref().as_bytes()); Ok(postgres_protocol2::IsNull::No) } None => Ok(postgres_protocol2::IsNull::Yes), }, Some(0), // all text buf, ) { Ok(()) => Ok(()), Err(frontend::BindError::Conversion(e)) => Err(Error::to_sql(e, 0)), Err(frontend::BindError::Serialization(e)) => Err(Error::encode(e)), }?; // Execute frontend::execute("", 0, buf).map_err(Error::encode)?; Ok(()) })?; match responses.next().await? { Message::BindComplete => {} _ => return Err(Error::unexpected_message()), } Ok(RowStream { responses, statement: Statement::new("", columns), command_tag: None, status: ReadyForQueryStatus::Unknown, output_format: Format::Text, }) } /// A stream of table rows. pub struct RowStream<'a> { responses: &'a mut Responses, output_format: Format, pub statement: Statement, pub command_tag: Option, pub status: ReadyForQueryStatus, } impl Stream for RowStream<'_> { type Item = Result; fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let this = self.get_mut(); loop { match ready!(this.responses.poll_next(cx)?) 
{ Message::DataRow(body) => { return Poll::Ready(Some(Ok(Row::new( this.statement.clone(), body, this.output_format, )?))); } Message::EmptyQueryResponse | Message::PortalSuspended => {} Message::CommandComplete(body) => { if let Ok(tag) = body.tag() { this.command_tag = Some(tag.to_string()); } } Message::ReadyForQuery(status) => { this.status = status.into(); return Poll::Ready(None); } _ => return Poll::Ready(Some(Err(Error::unexpected_message()))), } } } } ================================================ FILE: libs/proxy/tokio-postgres2/src/row.rs ================================================ //! Rows. use std::ops::Range; use std::sync::Arc; use std::{fmt, str}; use fallible_iterator::FallibleIterator; use postgres_protocol2::message::backend::DataRowBody; use postgres_types2::{Format, WrongFormat}; use crate::row::sealed::{AsName, Sealed}; use crate::simple_query::SimpleColumn; use crate::statement::Column; use crate::types::{FromSql, Type, WrongType}; use crate::{Error, Statement}; mod sealed { pub trait Sealed {} pub trait AsName { fn as_name(&self) -> &str; } } impl AsName for Column { fn as_name(&self) -> &str { self.name() } } impl AsName for String { fn as_name(&self) -> &str { self } } /// A trait implemented by types that can index into columns of a row. /// /// This cannot be implemented outside of this crate. pub trait RowIndex: Sealed { #[doc(hidden)] fn __idx(&self, columns: &[T]) -> Option where T: AsName; } impl Sealed for usize {} impl RowIndex for usize { #[inline] fn __idx(&self, columns: &[T]) -> Option where T: AsName, { if *self >= columns.len() { None } else { Some(*self) } } } impl Sealed for str {} impl RowIndex for str { #[inline] fn __idx(&self, columns: &[T]) -> Option where T: AsName, { if let Some(idx) = columns.iter().position(|d| d.as_name() == self) { return Some(idx); }; // FIXME ASCII-only case insensitivity isn't really the right thing to // do. 
Postgres itself uses a dubious wrapper around tolower and JDBC // uses the US locale. columns .iter() .position(|d| d.as_name().eq_ignore_ascii_case(self)) } } impl Sealed for &T where T: ?Sized + Sealed {} impl RowIndex for &T where T: ?Sized + RowIndex, { #[inline] fn __idx(&self, columns: &[U]) -> Option where U: AsName, { T::__idx(*self, columns) } } /// A row of data returned from the database by a query. pub struct Row { statement: Statement, output_format: Format, body: DataRowBody, ranges: Vec>>, } impl fmt::Debug for Row { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Row") .field("columns", &self.columns()) .finish() } } impl Row { pub(crate) fn new( statement: Statement, body: DataRowBody, output_format: Format, ) -> Result { let ranges = body.ranges().collect().map_err(Error::parse)?; Ok(Row { statement, body, ranges, output_format, }) } /// Returns information about the columns of data in the row. pub fn columns(&self) -> &[Column] { self.statement.columns() } /// Determines if the row contains no values. pub fn is_empty(&self) -> bool { self.len() == 0 } /// Returns the number of values in the row. pub fn len(&self) -> usize { self.columns().len() } /// Deserializes a value from the row. /// /// The value can be specified either by its numeric index in the row, or by its column name. /// /// # Panics /// /// Panics if the index is out of bounds or if the value cannot be converted to the specified type. pub fn get<'a, I, T>(&'a self, idx: I) -> T where I: RowIndex + fmt::Display, T: FromSql<'a>, { match self.get_inner(&idx) { Ok(ok) => ok, Err(err) => panic!("error retrieving column {idx}: {err}"), } } /// Like `Row::get`, but returns a `Result` rather than panicking. 
pub fn try_get<'a, I, T>(&'a self, idx: I) -> Result
    where
        I: RowIndex + fmt::Display,
        T: FromSql<'a>,
    {
        // All lookup and decoding logic lives in `get_inner`; failures come back as `Err`.
        self.get_inner(&idx)
    }

    /// Shared implementation behind the column accessors.
    ///
    /// Resolves `idx` to a column position, checks that `T` accepts the column's type and
    /// then decodes the column's raw bytes with `FromSql::from_sql_nullable`.
    ///
    /// # Errors
    ///
    /// * `Error::column` if `idx` does not resolve to a column of this row.
    /// * `Error::from_sql` (carrying the column position) if `T` does not accept the
    ///   column's type or if decoding fails.
    fn get_inner<'a, I, T>(&'a self, idx: &I) -> Result
    where
        I: RowIndex + fmt::Display,
        T: FromSql<'a>,
    {
        // Translate the caller's index into a position within `columns()`.
        let idx = match idx.__idx(self.columns()) {
            Some(idx) => idx,
            None => return Err(Error::column(idx.to_string())),
        };

        // Refuse early when `T` cannot represent this column's type.
        let ty = self.columns()[idx].type_();
        if !T::accepts(ty) {
            return Err(Error::from_sql(
                Box::new(WrongType::new::(ty.clone())),
                idx,
            ));
        }

        FromSql::from_sql_nullable(ty, self.col_buffer(idx)).map_err(|e| Error::from_sql(e, idx))
    }

    /// Get the raw bytes for the column at the given index.
    ///
    /// Returns `None` when `ranges` has no entry at `idx` or when that entry is itself
    /// `None` (presumably a SQL NULL - confirm against `DataRowBody::ranges`).
    fn col_buffer(&self, idx: usize) -> Option<&[u8]> {
        let range = self.ranges.get(idx)?.to_owned()?;
        Some(&self.body.buffer()[range])
    }

    /// Interpret the column at the given index as text.
    ///
    /// Useful when using query_raw_txt() which sets text transfer mode.
    ///
    /// Returns `Ok(None)` when `col_buffer` yields no bytes for the column.
    ///
    /// # Errors
    ///
    /// Returns an `Error::from_sql` wrapping `WrongFormat` when this row's output format is
    /// not `Format::Text`, or wrapping the decoding error when the bytes cannot be read as
    /// `Type::TEXT`.
    pub fn as_text(&self, idx: usize) -> Result, Error> {
        if self.output_format == Format::Text {
            match self.col_buffer(idx) {
                Some(raw) => {
                    FromSql::from_sql(&Type::TEXT, raw).map_err(|e| Error::from_sql(e, idx))
                }
                None => Ok(None),
            }
        } else {
            Err(Error::from_sql(Box::new(WrongFormat {}), idx))
        }
    }

    /// Row byte size: the length of the underlying `DataRowBody` buffer.
    pub fn body_len(&self) -> usize {
        self.body.buffer().len()
    }
}

// Lets `RowIndex` look up simple-query columns by name.
impl AsName for SimpleColumn {
    fn as_name(&self) -> &str {
        self.name()
    }
}

/// A row of data returned from the database by a simple query.
#[derive(Debug)]
pub struct SimpleQueryRow {
    // Column descriptions, shared (via `Arc`) rather than copied per row.
    columns: Arc<[SimpleColumn]>,
    // The raw row message the values are sliced out of.
    body: DataRowBody,
    // One entry per value reported by `body.ranges()`; `None` entries carry no bytes.
    ranges: Vec>>,
}

impl SimpleQueryRow {
    /// Builds a row from its column descriptions and the raw row message.
    ///
    /// The value ranges are collected eagerly from `body.ranges()`; a failure while doing
    /// so is reported as `Error::parse`.
    #[allow(clippy::new_ret_no_self)]
    pub(crate) fn new(
        columns: Arc<[SimpleColumn]>,
        body: DataRowBody,
    ) -> Result {
        let ranges = body.ranges().collect().map_err(Error::parse)?;
        Ok(SimpleQueryRow {
            columns,
            body,
            ranges,
        })
    }

    /// Returns information about the columns of data in the row.
    pub fn columns(&self) -> &[SimpleColumn] {
        &self.columns
    }

    /// Determines if the row contains no values.
pub fn is_empty(&self) -> bool { self.len() == 0 } /// Returns the number of values in the row. pub fn len(&self) -> usize { self.columns.len() } /// Returns a value from the row. /// /// The value can be specified either by its numeric index in the row, or by its column name. /// /// # Panics /// /// Panics if the index is out of bounds or if the value cannot be converted to the specified type. pub fn get(&self, idx: I) -> Option<&str> where I: RowIndex + fmt::Display, { match self.get_inner(&idx) { Ok(ok) => ok, Err(err) => panic!("error retrieving column {idx}: {err}"), } } /// Like `SimpleQueryRow::get`, but returns a `Result` rather than panicking. pub fn try_get(&self, idx: I) -> Result, Error> where I: RowIndex + fmt::Display, { self.get_inner(&idx) } fn get_inner(&self, idx: &I) -> Result, Error> where I: RowIndex + fmt::Display, { let idx = match idx.__idx(&self.columns) { Some(idx) => idx, None => return Err(Error::column(idx.to_string())), }; let buf = self.ranges[idx].clone().map(|r| &self.body.buffer()[r]); FromSql::from_sql_nullable(&Type::TEXT, buf).map_err(|e| Error::from_sql(e, idx)) } } ================================================ FILE: libs/proxy/tokio-postgres2/src/simple_query.rs ================================================ use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; use fallible_iterator::FallibleIterator; use futures_util::{Stream, ready}; use pin_project_lite::pin_project; use postgres_protocol2::message::backend::Message; use tracing::debug; use crate::client::{InnerClient, Responses}; use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow}; /// Information about a column of a single query row. #[derive(Debug)] pub struct SimpleColumn { name: String, } impl SimpleColumn { pub(crate) fn new(name: String) -> SimpleColumn { SimpleColumn { name } } /// Returns the name of the column. 
pub fn name(&self) -> &str {
        &self.name
    }
}

/// Sends `query` with `send_simple_query` and returns a stream over its responses.
///
/// No response is read here: the returned `SimpleQueryStream` starts with no column
/// information and a status of `ReadyForQueryStatus::Unknown`.
///
/// # Errors
///
/// Propagates any error from `InnerClient::send_simple_query`.
pub async fn simple_query<'a>(
    client: &'a mut InnerClient,
    query: &str,
) -> Result, Error> {
    debug!("executing simple query: {}", query);

    let responses = client.send_simple_query(query)?;

    Ok(SimpleQueryStream {
        responses,
        columns: None,
        status: ReadyForQueryStatus::Unknown,
    })
}

/// Sends `query` with `send_simple_query` and discards every result it produces.
///
/// Responses are read until `ReadyForQuery` arrives, whose status is returned.
///
/// # Errors
///
/// Propagates errors from sending the query or reading a response, and returns
/// `Error::unexpected_message` for any message other than the ones matched below.
pub async fn batch_execute(
    client: &mut InnerClient,
    query: &str,
) -> Result {
    debug!("executing statement batch: {}", query);

    let responses = client.send_simple_query(query)?;

    loop {
        match responses.next().await? {
            Message::ReadyForQuery(status) => return Ok(status.into()),
            // Results of the individual statements are not needed here; skip them.
            Message::CommandComplete(_)
            | Message::EmptyQueryResponse
            | Message::RowDescription(_)
            | Message::DataRow(_) => {}
            _ => return Err(Error::unexpected_message()),
        }
    }
}

pin_project! {
    /// A stream of simple query results.
    ///
    /// Yields `SimpleQueryMessage::Row` for each data row and
    /// `SimpleQueryMessage::CommandComplete` for each completed (or empty) statement; the
    /// stream ends when `ReadyForQuery` is received.
    pub struct SimpleQueryStream<'a> {
        responses: &'a mut Responses,
        columns: Option>,
        status: ReadyForQueryStatus,
    }
}

impl SimpleQueryStream<'_> {
    /// Returns if the connection is ready for querying, with the status of the connection.
    ///
    /// This might be available only after the stream has been exhausted: until
    /// `ReadyForQuery` has been seen the value is `ReadyForQueryStatus::Unknown`.
    pub fn ready_status(&self) -> ReadyForQueryStatus {
        self.status
    }
}

impl Stream for SimpleQueryStream<'_> {
    type Item = Result;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> {
        let this = self.project();
        // Loop because a `RowDescription` yields nothing by itself; keep reading until there
        // is something to hand to the caller.
        loop {
            match ready!(this.responses.poll_next(cx)?) {
                Message::CommandComplete(body) => {
                    // The row count is the last space-separated word of the tag. `rsplit`
                    // always yields at least one piece, so the `unwrap` cannot fail; a tag
                    // whose last word is not a number is reported as 0 rows.
                    let rows = body
                        .tag()
                        .map_err(Error::parse)?
                        .rsplit(' ')
                        .next()
                        .unwrap()
                        .parse()
                        .unwrap_or(0);
                    return Poll::Ready(Some(Ok(SimpleQueryMessage::CommandComplete(rows))));
                }
                Message::EmptyQueryResponse => {
                    return Poll::Ready(Some(Ok(SimpleQueryMessage::CommandComplete(0))));
                }
                Message::RowDescription(body) => {
                    // Remember the column names for the data rows that follow.
                    let columns = body
                        .fields()
                        .map(|f| Ok(SimpleColumn::new(f.name().to_string())))
                        .collect::>()
                        .map_err(Error::parse)?
                        .into();

                    *this.columns = Some(columns);
                }
                Message::DataRow(body) => {
                    // A data row with no preceding row description cannot be interpreted.
                    let row = match &this.columns {
                        Some(columns) => SimpleQueryRow::new(columns.clone(), body)?,
                        None => return Poll::Ready(Some(Err(Error::unexpected_message()))),
                    };
                    return Poll::Ready(Some(Ok(SimpleQueryMessage::Row(row))));
                }
                Message::ReadyForQuery(s) => {
                    // End of the query: record the connection status and finish the stream.
                    *this.status = s.into();
                    return Poll::Ready(None);
                }
                _ => return Poll::Ready(Some(Err(Error::unexpected_message()))),
            }
        }
    }
}

================================================
FILE: libs/proxy/tokio-postgres2/src/statement.rs
================================================
use std::fmt;
use std::sync::Arc;

use crate::types::Type;
use postgres_protocol2::Oid;
use postgres_protocol2::message::backend::Field;

// Shared state behind a `Statement`: its name and the columns it returns.
struct StatementInner {
    name: &'static str,
    columns: Vec,
}

/// A prepared statement.
///
/// Prepared statements can only be used with the connection that created them.
///
/// Cloning is cheap: clones share the same `Arc`-held inner data.
#[derive(Clone)]
pub struct Statement(Arc);

impl Statement {
    /// Creates a statement handle from its name and result columns.
    pub(crate) fn new(name: &'static str, columns: Vec) -> Statement {
        Statement(Arc::new(StatementInner { name, columns }))
    }

    /// Returns the name this statement was created with.
    pub(crate) fn name(&self) -> &str {
        self.0.name
    }

    /// Returns information about the columns returned when the statement is queried.
    pub fn columns(&self) -> &[Column] {
        &self.0.columns
    }
}

/// Information about a column of a query.
pub struct Column {
    name: String,
    pub(crate) type_: Type,

    // raw fields from RowDescription
    table_oid: Oid,
    column_id: i16,
    format: i16,

    // that better be stored in self.type_, but that is more radical refactoring
    type_oid: Oid,
    type_size: i16,
    type_modifier: i32,
}

impl Column {
    /// Builds a column from its name, its resolved type and the raw `RowDescription` field.
    ///
    /// The remaining attributes are copied from `raw_field` unchanged.
    pub(crate) fn new(name: String, type_: Type, raw_field: Field<'_>) -> Column {
        Column {
            name,
            type_,
            table_oid: raw_field.table_oid(),
            column_id: raw_field.column_id(),
            format: raw_field.format(),
            type_oid: raw_field.type_oid(),
            type_size: raw_field.type_size(),
            type_modifier: raw_field.type_modifier(),
        }
    }

    /// Returns the name of the column.
pub fn name(&self) -> &str {
        &self.name
    }

    /// Returns the type of the column.
    pub fn type_(&self) -> &Type {
        &self.type_
    }

    /// Returns the table OID of the column.
    pub fn table_oid(&self) -> Oid {
        self.table_oid
    }

    /// Returns the column ID of the column.
    pub fn column_id(&self) -> i16 {
        self.column_id
    }

    /// Returns the format of the column.
    pub fn format(&self) -> i16 {
        self.format
    }

    /// Returns the type OID of the column.
    pub fn type_oid(&self) -> Oid {
        self.type_oid
    }

    /// Returns the type size of the column.
    pub fn type_size(&self) -> i16 {
        self.type_size
    }

    /// Returns the type modifier of the column.
    pub fn type_modifier(&self) -> i32 {
        self.type_modifier
    }
}

// Only the name and the resolved type are printed; the raw `RowDescription` fields are
// left out of the debug output.
impl fmt::Debug for Column {
    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt.debug_struct("Column")
            .field("name", &self.name)
            .field("type", &self.type_)
            .finish()
    }
}

================================================
FILE: libs/proxy/tokio-postgres2/src/tls.rs
================================================
//! TLS support.

use std::error::Error;
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};
use std::{fmt, io};

use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};

pub(crate) mod private {
    // Token type taken by `TlsConnect::can_connect`; the module is `pub(crate)`, so code
    // outside this crate cannot name it.
    pub struct ForcePrivateApi;
}

/// Channel binding information returned from a TLS handshake.
pub struct ChannelBinding {
    // `None` when no `tls-server-end-point` data is available.
    pub(crate) tls_server_end_point: Option>,
}

impl ChannelBinding {
    /// Creates a `ChannelBinding` containing no information.
    pub fn none() -> ChannelBinding {
        ChannelBinding {
            tls_server_end_point: None,
        }
    }

    /// Creates a `ChannelBinding` containing `tls-server-end-point` channel binding information.
    pub fn tls_server_end_point(tls_server_end_point: Vec) -> ChannelBinding {
        ChannelBinding {
            tls_server_end_point: Some(tls_server_end_point),
        }
    }
}

/// A constructor of `TlsConnect`ors.
///
/// Requires the `runtime` Cargo feature (enabled by default).
pub trait MakeTlsConnect {
    /// The stream type created by the `TlsConnect` implementation.
    type Stream: TlsStream + Unpin;
    /// The `TlsConnect` implementation created by this type.
    type TlsConnect: TlsConnect;
    /// The error type returned by the `TlsConnect` implementation.
    type Error: Into>;

    /// Creates a new `TlsConnect`or.
    ///
    /// The domain name is provided for certificate verification and SNI.
    fn make_tls_connect(&self, domain: &str) -> Result;
}

/// An asynchronous function wrapping a stream in a TLS session.
pub trait TlsConnect {
    /// The stream returned by the future.
    type Stream: TlsStream + Unpin;
    /// The error returned by the future.
    type Error: Into>;
    /// The future returned by the connector.
    type Future: Future>;

    /// Returns a future performing a TLS handshake over the stream.
    fn connect(self, stream: S) -> Self::Future;

    // Defaults to `true`; `NoTls` overrides it to return `false`. Taking a
    // `private::ForcePrivateApi` argument keeps it from being called outside this crate.
    #[doc(hidden)]
    fn can_connect(&self, _: private::ForcePrivateApi) -> bool {
        true
    }
}

/// A TLS-wrapped connection to a PostgreSQL database.
pub trait TlsStream: AsyncRead + AsyncWrite {
    /// Returns channel binding information for the session.
    fn channel_binding(&self) -> ChannelBinding;
}

/// A `MakeTlsConnect` and `TlsConnect` implementation which simply returns an error.
///
/// This can be used when `sslmode` is `none` or `prefer`.
#[derive(Debug, Copy, Clone)]
pub struct NoTls;

impl MakeTlsConnect for NoTls {
    type Stream = NoTlsStream;
    type TlsConnect = NoTls;
    type Error = NoTlsError;

    // Creating the connector always succeeds and ignores the domain; the error only
    // surfaces once the returned connector's future is polled.
    fn make_tls_connect(&self, _: &str) -> Result {
        Ok(NoTls)
    }
}

impl TlsConnect for NoTls {
    type Stream = NoTlsStream;
    type Error = NoTlsError;
    type Future = NoTlsFuture;

    // The stream argument is dropped unused; the returned future fails when polled.
    fn connect(self, _: S) -> NoTlsFuture {
        NoTlsFuture(())
    }

    // Reports that this connector cannot establish a TLS session.
    fn can_connect(&self, _: private::ForcePrivateApi) -> bool {
        false
    }
}

/// The future returned by `NoTls`.
///
/// It resolves immediately to `Err(NoTlsError)`.
pub struct NoTlsFuture(());

impl Future for NoTlsFuture {
    type Output = Result;

    fn poll(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll {
        Poll::Ready(Err(NoTlsError(())))
    }
}

/// The TLS "stream" type produced by the `NoTls` connector.
///
/// Since `NoTls` doesn't support TLS, this type is uninhabited.
pub enum NoTlsStream {}

// `NoTlsStream` has no variants, so no value of it can exist and none of the methods below
// can ever be called. The empty `match *self {}` bodies satisfy the trait signatures
// without having to produce a value.
impl AsyncRead for NoTlsStream {
    fn poll_read(
        self: Pin<&mut Self>,
        _: &mut Context<'_>,
        _: &mut ReadBuf<'_>,
    ) -> Poll> {
        match *self {}
    }
}

impl AsyncWrite for NoTlsStream {
    fn poll_write(self: Pin<&mut Self>, _: &mut Context<'_>, _: &[u8]) -> Poll> {
        match *self {}
    }

    fn poll_flush(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll> {
        match *self {}
    }

    fn poll_shutdown(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll> {
        match *self {}
    }
}

impl TlsStream for NoTlsStream {
    fn channel_binding(&self) -> ChannelBinding {
        match *self {}
    }
}

/// The error returned by `NoTls`.
///
/// Displays as "no TLS implementation configured".
#[derive(Debug)]
pub struct NoTlsError(());

impl fmt::Display for NoTlsError {
    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt.write_str("no TLS implementation configured")
    }
}

impl Error for NoTlsError {}

================================================
FILE: libs/proxy/tokio-postgres2/src/transaction.rs
================================================
use crate::query::RowStream;
use crate::{CancelToken, Client, Error, ReadyForQueryStatus};

/// A representation of a PostgreSQL database transaction.
///
/// Transactions will implicitly roll back when dropped. Use the `commit` method to commit the changes made in the
/// transaction. Transactions can be nested, with inner transactions implemented via savepoints.
pub struct Transaction<'a> {
    client: &'a mut Client,
    // Set once `commit` or `rollback` has been called, so that `Drop` does not issue a
    // second `ROLLBACK`.
    done: bool,
}

impl Drop for Transaction<'_> {
    fn drop(&mut self) {
        if self.done {
            return;
        }

        // `drop` cannot await or report failures, so the `ROLLBACK` is only sent and the
        // result of sending it is deliberately ignored.
        let _ = self.client.inner_mut().send_simple_query("ROLLBACK");
    }
}

impl<'a> Transaction<'a> {
    /// Wraps `client` in a transaction handle that is not yet marked as finished.
    ///
    /// No statement is sent here.
    pub(crate) fn new(client: &'a mut Client) -> Transaction<'a> {
        Transaction {
            client,
            done: false,
        }
    }

    /// Consumes the transaction, committing all changes made within it.
    ///
    /// Returns the status produced by running `COMMIT` through `Client::batch_execute`.
    pub async fn commit(mut self) -> Result {
        // Mark as finished first so that dropping `self` does not also send `ROLLBACK`.
        self.done = true;
        self.client.batch_execute("COMMIT").await
    }

    /// Rolls the transaction back, discarding all changes made within it.
    ///
    /// This is equivalent to `Transaction`'s `Drop` implementation, but provides any error encountered to the caller.
    pub async fn rollback(mut self) -> Result {
        // Mark as finished first so that dropping `self` does not send a second `ROLLBACK`.
        self.done = true;
        self.client.batch_execute("ROLLBACK").await
    }

    /// Like `Client::query_raw_txt`.
    pub async fn query_raw_txt(
        &mut self,
        statement: &str,
        params: I,
    ) -> Result, Error>
    where
        S: AsRef,
        I: IntoIterator>,
        I::IntoIter: ExactSizeIterator,
    {
        self.client.query_raw_txt(statement, params).await
    }

    /// Like `Client::cancel_token`.
    pub fn cancel_token(&self) -> CancelToken {
        self.client.cancel_token()
    }

    /// Returns a reference to the underlying `Client`.
    pub fn client(&self) -> &Client {
        self.client
    }

    /// Returns a mutable reference to the underlying `Client`.
    pub fn client_mut(&mut self) -> &mut Client {
        self.client
    }
}

================================================
FILE: libs/proxy/tokio-postgres2/src/transaction_builder.rs
================================================
use crate::{Client, Error, Transaction};

/// The isolation level of a database transaction.
#[derive(Debug, Copy, Clone)]
#[non_exhaustive]
pub enum IsolationLevel {
    /// Equivalent to `ReadCommitted`.
    ReadUncommitted,

    /// An individual statement in the transaction will see rows committed before it began.
    ReadCommitted,

    /// All statements in the transaction will see the same view of rows committed before the first query in the
    /// transaction.
    RepeatableRead,

    /// The reads and writes in this transaction must be able to be committed as an atomic "unit" with respect to reads
    /// and writes of all other concurrent serializable transactions without interleaving.
    Serializable,
}

/// A builder for database transactions.
pub struct TransactionBuilder<'a> { client: &'a mut Client, isolation_level: Option, read_only: Option, deferrable: Option, } impl<'a> TransactionBuilder<'a> { pub(crate) fn new(client: &'a mut Client) -> TransactionBuilder<'a> { TransactionBuilder { client, isolation_level: None, read_only: None, deferrable: None, } } /// Sets the isolation level of the transaction. pub fn isolation_level(mut self, isolation_level: IsolationLevel) -> Self { self.isolation_level = Some(isolation_level); self } /// Sets the access mode of the transaction. pub fn read_only(mut self, read_only: bool) -> Self { self.read_only = Some(read_only); self } /// Sets the deferrability of the transaction. /// /// If the transaction is also serializable and read only, creation of the transaction may block, but when it /// completes the transaction is able to run with less overhead and a guarantee that it will not be aborted due to /// serialization failure. pub fn deferrable(mut self, deferrable: bool) -> Self { self.deferrable = Some(deferrable); self } /// Begins the transaction. /// /// The transaction will roll back by default - use the `commit` method to commit it. 
pub async fn start(self) -> Result, Error> { let mut query = "START TRANSACTION".to_string(); let mut first = true; if let Some(level) = self.isolation_level { first = false; query.push_str(" ISOLATION LEVEL "); let level = match level { IsolationLevel::ReadUncommitted => "READ UNCOMMITTED", IsolationLevel::ReadCommitted => "READ COMMITTED", IsolationLevel::RepeatableRead => "REPEATABLE READ", IsolationLevel::Serializable => "SERIALIZABLE", }; query.push_str(level); } if let Some(read_only) = self.read_only { if !first { query.push(','); } first = false; let s = if read_only { " READ ONLY" } else { " READ WRITE" }; query.push_str(s); } if let Some(deferrable) = self.deferrable { if !first { query.push(','); } let s = if deferrable { " DEFERRABLE" } else { " NOT DEFERRABLE" }; query.push_str(s); } self.client.batch_execute(&query).await?; Ok(Transaction::new(self.client)) } } ================================================ FILE: libs/proxy/tokio-postgres2/src/types.rs ================================================ //! Types. //! //! This module is a reexport of the `postgres_types` crate. 
#[doc(inline)] pub use postgres_types2::*; ================================================ FILE: libs/remote_storage/Cargo.toml ================================================ [package] name = "remote_storage" version = "0.1.0" edition = "2024" license.workspace = true [dependencies] anyhow.workspace = true async-trait.workspace = true async-stream.workspace = true once_cell.workspace = true aws-smithy-async.workspace = true aws-smithy-types.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true base64.workspace = true bytes.workspace = true camino = { workspace = true, features = ["serde1"] } humantime-serde.workspace = true hyper = { workspace = true, features = ["client"] } futures.workspace = true reqwest = { workspace = true, features = ["multipart", "stream"] } chrono = { version = "0.4", default-features = false, features = ["clock"] } serde.workspace = true serde_json.workspace = true tokio = { workspace = true, features = ["sync", "fs", "io-util"] } tokio-stream.workspace = true tokio-util = { workspace = true, features = ["compat"] } toml_edit.workspace = true tracing.workspace = true scopeguard.workspace = true metrics.workspace = true utils = { path = "../utils", default-features = false } pin-project-lite.workspace = true azure_core.workspace = true azure_identity.workspace = true azure_storage.workspace = true azure_storage_blobs.workspace = true futures-util.workspace = true http-types.workspace = true http-body-util.workspace = true itertools.workspace = true sync_wrapper = { workspace = true, features = ["futures"] } gcp_auth = "0.12.3" url.workspace = true http.workspace = true uuid.workspace = true byteorder = "1.4" rand.workspace = true [dev-dependencies] camino-tempfile.workspace = true test-context.workspace = true rand.workspace = true tokio = { workspace = true, features = ["test-util"] } ================================================ FILE: libs/remote_storage/src/azure_blob.rs 
================================================ //! Azure Blob Storage wrapper use std::borrow::Cow; use std::collections::HashMap; use std::fmt::Display; use std::num::NonZeroU32; use std::pin::Pin; use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, SystemTime}; use std::{env, io}; use anyhow::{Context, Result, anyhow}; use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range}; use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions}; use azure_storage::StorageCredentials; use azure_storage_blobs::blob::BlobBlockType; use azure_storage_blobs::blob::BlockList; use azure_storage_blobs::blob::{Blob, CopyStatus}; use azure_storage_blobs::container::operations::ListBlobsBuilder; use azure_storage_blobs::prelude::ClientBuilder; use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; use base64::{Engine as _, engine::general_purpose::URL_SAFE}; use byteorder::{BigEndian, ByteOrder}; use bytes::Bytes; use camino::Utf8Path; use futures::FutureExt; use futures::future::Either; use futures::stream::Stream; use futures_util::{StreamExt, TryStreamExt}; use http_types::{StatusCode, Url}; use scopeguard::ScopeGuard; use tokio::fs::File; use tokio::io::AsyncReadExt; use tokio::io::AsyncSeekExt; use tokio_util::sync::CancellationToken; use tracing::debug; use utils::backoff; use utils::backoff::exponential_backoff_duration_seconds; use super::REMOTE_STORAGE_PREFIX_SEPARATOR; use crate::config::AzureConfig; use crate::error::Cancelled; use crate::metrics::{AttemptOutcome, RequestKind, start_measuring_requests}; use crate::{ ConcurrencyLimiter, Download, DownloadError, DownloadKind, DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, Version, VersionKind, }; pub struct AzureBlobStorage { client: ContainerClient, container_name: String, prefix_in_container: Option, max_keys_per_list_response: Option, 
concurrency_limiter: ConcurrencyLimiter, // Per-request timeout. Accessible for tests. pub timeout: Duration, // Alternative timeout used for metadata objects which are expected to be small pub small_timeout: Duration, /* BEGIN_HADRON */ pub put_block_size_mb: Option, /* END_HADRON */ } impl AzureBlobStorage { pub fn new( azure_config: &AzureConfig, timeout: Duration, small_timeout: Duration, ) -> Result { debug!( "Creating azure remote storage for azure container {}", azure_config.container_name ); // Use the storage account from the config by default, fall back to env var if not present. let account = azure_config.storage_account.clone().unwrap_or_else(|| { env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT") }); // If the `AZURE_STORAGE_ACCESS_KEY` env var has an access key, use that, // otherwise try the token based credentials. let credentials = if let Ok(access_key) = env::var("AZURE_STORAGE_ACCESS_KEY") { StorageCredentials::access_key(account.clone(), access_key) } else { let token_credential = azure_identity::create_default_credential() .context("trying to obtain Azure default credentials")?; StorageCredentials::token_credential(token_credential) }; let builder = ClientBuilder::new(account, credentials) // we have an outer retry .retry(RetryOptions::none()) // Customize transport to configure connection pooling .transport(TransportOptions::new(Self::reqwest_client( azure_config.conn_pool_size, ))); let client = builder.container_client(azure_config.container_name.to_owned()); let max_keys_per_list_response = if let Some(limit) = azure_config.max_keys_per_list_response { Some( NonZeroU32::new(limit as u32) .ok_or_else(|| anyhow::anyhow!("max_keys_per_list_response can't be 0"))?, ) } else { None }; Ok(AzureBlobStorage { client, container_name: azure_config.container_name.to_owned(), prefix_in_container: azure_config.prefix_in_container.to_owned(), max_keys_per_list_response, concurrency_limiter: 
ConcurrencyLimiter::new(azure_config.concurrency_limit.get()), timeout, small_timeout, /* BEGIN_HADRON */ put_block_size_mb: azure_config.put_block_size_mb, /* END_HADRON */ }) } fn reqwest_client(conn_pool_size: usize) -> Arc { let client = reqwest::ClientBuilder::new() .pool_max_idle_per_host(conn_pool_size) .build() .expect("failed to build `reqwest` client"); Arc::new(client) } pub fn relative_path_to_name(&self, path: &RemotePath) -> String { assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); let path_string = path.get_path().as_str(); match &self.prefix_in_container { Some(prefix) => { if prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { prefix.clone() + path_string } else { format!("{prefix}{REMOTE_STORAGE_PREFIX_SEPARATOR}{path_string}") } } None => path_string.to_string(), } } fn name_to_relative_path(&self, key: &str) -> RemotePath { let relative_path = match key.strip_prefix(self.prefix_in_container.as_deref().unwrap_or_default()) { Some(stripped) => stripped, // we rely on Azure to return properly prefixed paths // for requests with a certain prefix None => panic!( "Key {key} does not start with container prefix {:?}", self.prefix_in_container ), }; RemotePath( relative_path .split(REMOTE_STORAGE_PREFIX_SEPARATOR) .collect(), ) } async fn download_for_builder( &self, builder: GetBlobBuilder, timeout: Duration, cancel: &CancellationToken, ) -> Result { let kind = RequestKind::Get; let _permit = self.permit(kind, cancel).await?; let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); let cancel_or_timeout_ = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); let mut etag = None; let mut last_modified = None; let mut metadata = HashMap::new(); let started_at = start_measuring_requests(kind); let download = async { let response = builder // convert to concrete Pageable .into_stream() // convert to TryStream .into_stream() .map_err(to_download_error); // apply per request timeout let 
response = tokio_stream::StreamExt::timeout(response, timeout); // flatten let response = response.map(|res| match res { Ok(res) => res, Err(_elapsed) => Err(DownloadError::Timeout), }); let mut response = Box::pin(response); let Some(part) = response.next().await else { return Err(DownloadError::Other(anyhow::anyhow!( "Azure GET response contained no response body" ))); }; let part = part?; if etag.is_none() { etag = Some(part.blob.properties.etag); } if last_modified.is_none() { last_modified = Some(part.blob.properties.last_modified.into()); } if let Some(blob_meta) = part.blob.metadata { metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); } // unwrap safety: if these were None, bufs would be empty and we would have returned an error already let etag = etag.unwrap(); let last_modified = last_modified.unwrap(); let tail_stream = response .map(|part| match part { Ok(part) => Either::Left(part.data.map(|r| r.map_err(io::Error::other))), Err(e) => { Either::Right(futures::stream::once(async { Err(io::Error::other(e)) })) } }) .flatten(); let stream = part .data .map(|r| r.map_err(io::Error::other)) .chain(sync_wrapper::SyncStream::new(tail_stream)); //.chain(SyncStream::from_pin(Box::pin(tail_stream))); let download_stream = crate::support::DownloadStream::new(cancel_or_timeout_, stream); Ok(Download { download_stream: Box::pin(download_stream), etag, last_modified, metadata: Some(StorageMetadata(metadata)), }) }; let download = tokio::select! { bufs = download => bufs, cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout { TimeoutOrCancel::Timeout => return Err(DownloadError::Timeout), TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled), }, }; let started_at = ScopeGuard::into_inner(started_at); let outcome = match &download { Ok(_) => AttemptOutcome::Ok, // At this level in the stack 404 and 304 responses do not indicate an error. 
// There's expected cases when a blob may not exist or hasn't been modified since
            // the last get (e.g. probing for timeline indices and heatmap downloads).
            // Callers should handle errors if they are unexpected.
            Err(DownloadError::NotFound | DownloadError::Unmodified) => AttemptOutcome::Ok,
            Err(_) => AttemptOutcome::Err,
        };
        crate::metrics::BUCKET_METRICS
            .req_seconds
            .observe_elapsed(kind, outcome, started_at);
        download
    }

    /// Lists blobs page by page, yielding one collector `T` per fetched page.
    ///
    /// * `prefix` - listing prefix; when `None`, `prefix_in_container` (with a trailing
    ///   separator appended if missing) is used instead.
    /// * `mode` - `ListingMode::WithDelimiter` adds the separator as delimiter to the request.
    /// * `max_keys` - upper bound on the number of blobs added across the *whole* stream.
    /// * `request_kind` - kind under which the concurrency permit is acquired.
    /// * `customize_builder` - hook applied once to the list request builder.
    ///
    /// Errors are yielded as stream items; after yielding an error the same page is requested
    /// again, so the consumer decides whether to keep polling.
    fn list_streaming_for_fn<T: Default + ListingCollector>(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
        request_kind: RequestKind,
        customize_builder: impl Fn(ListBlobsBuilder) -> ListBlobsBuilder,
    ) -> impl Stream<Item = Result<T, DownloadError>> {
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix.map(|p| self.relative_path_to_name(p)).or_else(|| {
            self.prefix_in_container.clone().map(|mut s| {
                if !s.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
                    s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                }
                s
            })
        });

        async_stream::stream! {
            let _permit = self.permit(request_kind, cancel).await?;

            let mut builder = self.client.list_blobs();

            if let ListingMode::WithDelimiter = mode {
                builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
            }

            if let Some(prefix) = list_prefix {
                builder = builder.prefix(Cow::from(prefix.to_owned()));
            }

            if let Some(limit) = self.max_keys_per_list_response {
                builder = builder.max_results(MaxResults::new(limit));
            }

            builder = customize_builder(builder);

            let mut next_marker = None;

            let mut timeout_try_cnt = 1;

            // Remaining key budget. This must live outside the paging loop: when it was
            // re-derived from `max_keys` on every iteration the limit was only enforced
            // per page, so a multi-page listing could return more than `max_keys` blobs.
            let mut remaining_keys = max_keys.map(|mk| mk.get());

            'outer: loop {
                let mut builder = builder.clone();
                if let Some(marker) = next_marker.clone() {
                    builder = builder.marker(marker);
                }
                // Azure Blob Rust SDK does not expose the list blob API directly. Users have to use
                // their pageable iterator wrapper that returns all keys as a stream. We want to have
                // full control of paging, and therefore we only take the first item from the stream.
                let mut response_stream = builder.into_stream();
                let response = response_stream.next();
                // Timeout mechanism: Azure client will sometimes stuck on a request, but retrying that request
                // would immediately succeed. Therefore, we use exponential backoff timeout to retry the request.
                // (Usually, exponential backoff is used to determine the sleep time between two retries.) We
                // start with 10.0 second timeout, and double the timeout for each failure, up to 5 failures.
                // timeout = min(5 * (1.0+1.0)^n, self.timeout).
                let this_timeout = (5.0 * exponential_backoff_duration_seconds(timeout_try_cnt, 1.0, self.timeout.as_secs_f64())).min(self.timeout.as_secs_f64());
                let response = tokio::time::timeout(Duration::from_secs_f64(this_timeout), response);
                let response = response.map(|res| {
                    match res {
                        Ok(Some(Ok(res))) => Ok(Some(res)),
                        Ok(Some(Err(e))) => Err(to_download_error(e)),
                        Ok(None) => Ok(None),
                        Err(_elapsed) => Err(DownloadError::Timeout),
                    }
                });
                let next_item = tokio::select! {
                    op = response => op,
                    _ = cancel.cancelled() => Err(DownloadError::Cancelled),
                };

                if let Err(DownloadError::Timeout) = &next_item {
                    timeout_try_cnt += 1;
                    if timeout_try_cnt <= 5 {
                        continue 'outer;
                    }
                }

                let next_item = match next_item {
                    Ok(next_item) => next_item,
                    Err(e) => {
                        // The error is potentially retryable, so we must rewind the loop after yielding.
                        yield Err(e);
                        continue 'outer;
                    },
                };

                // Log a warning if we saw two timeouts in a row before a successful request
                if timeout_try_cnt > 2 {
                    tracing::warn!("Azure Blob Storage list timed out and succeeded after {} tries", timeout_try_cnt);
                }
                timeout_try_cnt = 1;

                let Some(entry) = next_item else {
                    // The pageable stream produced no item: the listing is complete.
                    break;
                };

                let mut res = T::default();
                next_marker = entry.continuation();
                let prefix_iter = entry
                    .blobs
                    .prefixes()
                    .map(|prefix| self.name_to_relative_path(&prefix.name));
                res.add_prefixes(self, prefix_iter);

                let blob_iter = entry
                    .blobs
                    .blobs();

                for key in blob_iter {
                    res.add_blob(self, key);

                    if let Some(mut mk) = remaining_keys {
                        // Starts from a `NonZeroU32` and we stop as soon as it reaches zero.
                        assert!(mk > 0);
                        mk -= 1;
                        if mk == 0 {
                            yield Ok(res); // limit reached
                            break 'outer;
                        }
                        remaining_keys = Some(mk);
                    }
                }

                yield Ok(res);

                // We are done here
                if next_marker.is_none() {
                    break;
                }
            }
        }
    }

    /// Acquires a concurrency-limiter permit for `kind`, or returns `Cancelled` if `cancel`
    /// fires first.
    async fn permit(
        &self,
        kind: RequestKind,
        cancel: &CancellationToken,
    ) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
        let acquire = self.concurrency_limiter.acquire(kind);

        tokio::select! {
            permit = acquire => Ok(permit.expect("never closed")),
            _ = cancel.cancelled() => Err(Cancelled),
        }
    }

    /// Returns the name of the container this storage operates on.
    pub fn container_name(&self) -> &str {
        &self.container_name
    }

    /// Collects every page of a version listing (versions and deleted blobs included in the
    /// request) into a single `VersionListing`. The `_permit` argument is not used by the body.
    async fn list_versions_with_permit(
        &self,
        _permit: &tokio::sync::SemaphorePermit<'_>,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
    ) -> Result<crate::VersionListing, DownloadError> {
        let customize_builder = |mut builder: ListBlobsBuilder| {
            builder = builder.include_versions(true);
            // We do not return this info back to `VersionListing` yet.
builder = builder.include_deleted(true); builder }; let kind = RequestKind::ListVersions; let mut stream = std::pin::pin!(self.list_streaming_for_fn( prefix, mode, max_keys, cancel, kind, customize_builder )); let mut combined: crate::VersionListing = stream.next().await.expect("At least one item required")?; while let Some(list) = stream.next().await { let list = list?; combined.versions.extend(list.versions.into_iter()); } Ok(combined) } } trait ListingCollector { fn add_prefixes(&mut self, abs: &AzureBlobStorage, prefix_it: impl Iterator); fn add_blob(&mut self, abs: &AzureBlobStorage, blob: &Blob); } impl ListingCollector for Listing { fn add_prefixes( &mut self, _abs: &AzureBlobStorage, prefix_it: impl Iterator, ) { self.prefixes.extend(prefix_it); } fn add_blob(&mut self, abs: &AzureBlobStorage, blob: &Blob) { self.keys.push(ListingObject { key: abs.name_to_relative_path(&blob.name), last_modified: blob.properties.last_modified.into(), size: blob.properties.content_length, }); } } impl ListingCollector for crate::VersionListing { fn add_prefixes( &mut self, _abs: &AzureBlobStorage, _prefix_it: impl Iterator, ) { // nothing } fn add_blob(&mut self, abs: &AzureBlobStorage, blob: &Blob) { let id = crate::VersionId(blob.version_id.clone().expect("didn't find version ID")); self.versions.push(crate::Version { key: abs.name_to_relative_path(&blob.name), last_modified: blob.properties.last_modified.into(), kind: crate::VersionKind::Version(id), }); } } fn to_azure_metadata(metadata: StorageMetadata) -> Metadata { let mut res = Metadata::new(); for (k, v) in metadata.0.into_iter() { res.insert(k, v); } res } fn to_download_error(error: azure_core::Error) -> DownloadError { if let Some(http_err) = error.as_http_error() { match http_err.status() { StatusCode::NotFound => DownloadError::NotFound, StatusCode::NotModified => DownloadError::Unmodified, StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)), _ => 
DownloadError::Other(anyhow::Error::new(error)),
        }
    } else {
        DownloadError::Other(error.into())
    }
}

impl RemoteStorage for AzureBlobStorage {
    /// Streams plain (non-versioned) listings, one `Listing` per fetched page.
    ///
    /// The request builder is left uncustomized and the concurrency permit is acquired under
    /// `RequestKind::List` (it previously reused `ListVersions`, the kind of the sibling
    /// `list_versions` call).
    fn list_streaming(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
        let customize_builder = |builder| builder;
        let kind = RequestKind::List;
        self.list_streaming_for_fn(prefix, mode, max_keys, cancel, kind, customize_builder)
    }

    /// Lists all versions under `prefix` and returns them as one `VersionListing`.
    ///
    /// NOTE(review): a permit is acquired here and `list_versions_with_permit` then drives
    /// `list_streaming_for_fn`, which acquires its own permit, so this path holds two
    /// permits at once - confirm this is intended.
    async fn list_versions(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
    ) -> std::result::Result<crate::VersionListing, DownloadError> {
        let kind = RequestKind::ListVersions;
        let permit = self.permit(kind, cancel).await?;
        self.list_versions_with_permit(&permit, prefix, mode, max_keys, cancel)
            .await
    }

    /// Fetches the blob properties for `key` using the small-object timeout and returns its
    /// last-modified time and content length.
    async fn head_object(
        &self,
        key: &RemotePath,
        cancel: &CancellationToken,
    ) -> Result<ListingObject, DownloadError> {
        let kind = RequestKind::Head;
        let _permit = self.permit(kind, cancel).await?;

        let started_at = start_measuring_requests(kind);

        let blob_client = self.client.blob_client(self.relative_path_to_name(key));
        let properties_future = blob_client.get_properties().into_future();

        let properties_future = tokio::time::timeout(self.small_timeout, properties_future);

        let res = tokio::select! {
            res = properties_future => res,
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

        if let Ok(inner) = &res {
            // do not incl.
timeouts as errors in metrics but cancellations let started_at = ScopeGuard::into_inner(started_at); crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, inner, started_at); } let data = match res { Ok(Ok(data)) => Ok(data), Ok(Err(sdk)) => Err(to_download_error(sdk)), Err(_timeout) => Err(DownloadError::Timeout), }?; let properties = data.blob.properties; Ok(ListingObject { key: key.to_owned(), last_modified: SystemTime::from(properties.last_modified), size: properties.content_length, }) } async fn upload( &self, from: impl Stream> + Send + Sync + 'static, data_size_bytes: usize, to: &RemotePath, metadata: Option, cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Put; let _permit = self.permit(kind, cancel).await?; let started_at = start_measuring_requests(kind); let mut metadata_map = metadata.unwrap_or([].into()); let timeline_file_path = metadata_map.0.remove("databricks_azure_put_block"); /* BEGIN_HADRON */ let op = async move { let blob_client = self.client.blob_client(self.relative_path_to_name(to)); let put_block_size = self.put_block_size_mb.unwrap_or(0) * 1024 * 1024; if timeline_file_path.is_none() || put_block_size == 0 { // Use put_block_blob directly. let from: Pin< Box> + Send + Sync + 'static>, > = Box::pin(from); let from = NonSeekableStream::new(from, data_size_bytes); let body = azure_core::Body::SeekableStream(Box::new(from)); let mut builder = blob_client.put_block_blob(body); if !metadata_map.0.is_empty() { builder = builder.metadata(to_azure_metadata(metadata_map)); } let fut = builder.into_future(); let fut = tokio::time::timeout(self.timeout, fut); let result = fut.await; match result { Ok(Ok(_response)) => return Ok(()), Ok(Err(azure)) => return Err(azure.into()), Err(_timeout) => return Err(TimeoutOrCancel::Timeout.into()), }; } // Upload chunks concurrently using Put Block. // Each PutBlock uploads put_block_size bytes of the file. 
let mut upload_futures: Vec>> = vec![]; let mut block_list = BlockList::default(); let mut start_bytes = 0u64; let mut remaining_bytes = data_size_bytes; let mut block_list_count = 0; while remaining_bytes > 0 { let block_size = std::cmp::min(remaining_bytes, put_block_size); let end_bytes = start_bytes + block_size as u64; let block_id = block_list_count; let timeout = self.timeout; let blob_client = blob_client.clone(); let timeline_file = timeline_file_path.clone().unwrap().clone(); let mut encoded_block_id = [0u8; 8]; BigEndian::write_u64(&mut encoded_block_id, block_id); URL_SAFE.encode(encoded_block_id); // Put one block. let part_fut = async move { let mut file = File::open(Utf8Path::new(&timeline_file.clone())).await?; file.seek(io::SeekFrom::Start(start_bytes)).await?; let limited_reader = file.take(block_size as u64); let file_chunk_stream = tokio_util::io::ReaderStream::with_capacity(limited_reader, 1024 * 1024); let file_chunk_stream_pin: Pin< Box> + Send + Sync + 'static>, > = Box::pin(file_chunk_stream); let stream_wrapper = NonSeekableStream::new(file_chunk_stream_pin, block_size); let body = azure_core::Body::SeekableStream(Box::new(stream_wrapper)); // Azure put block takes URL-encoded block ids and all blocks must have the same byte length. 
// https://learn.microsoft.com/en-us/rest/api/storageservices/put-block?tabs=microsoft-entra-id#uri-parameters let builder = blob_client.put_block(encoded_block_id.to_vec(), body); let fut = builder.into_future(); let fut = tokio::time::timeout(timeout, fut); let result = fut.await; tracing::debug!( "azure put block id-{} size {} start {} end {} file {} response {:#?}", block_id, block_size, start_bytes, end_bytes, timeline_file, result ); match result { Ok(Ok(_response)) => Ok(()), Ok(Err(azure)) => Err(azure), Err(_timeout) => Err(azure_core::Error::new( azure_core::error::ErrorKind::Io, std::io::Error::new( std::io::ErrorKind::TimedOut, "Operation timed out", ), )), } }; upload_futures.push(tokio::spawn(part_fut)); block_list_count += 1; remaining_bytes -= block_size; start_bytes += block_size as u64; block_list .blocks .push(BlobBlockType::Uncommitted(encoded_block_id.to_vec().into())); } tracing::debug!( "azure put blocks {} total MB: {} chunk size MB: {}", block_list_count, data_size_bytes / 1024 / 1024, put_block_size / 1024 / 1024 ); // Wait for all blocks to be uploaded. let upload_results = futures::future::try_join_all(upload_futures).await; if upload_results.is_err() { return Err(anyhow::anyhow!(format!( "Failed to upload all blocks {:#?}", upload_results.unwrap_err() ))); } // Commit the blocks. let mut builder = blob_client.put_block_list(block_list); if !metadata_map.0.is_empty() { builder = builder.metadata(to_azure_metadata(metadata_map)); } let fut = builder.into_future(); let fut = tokio::time::timeout(self.timeout, fut); let result = fut.await; tracing::debug!("azure put block list response {:#?}", result); match result { Ok(Ok(_response)) => Ok(()), Ok(Err(azure)) => Err(azure.into()), Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()), } }; /* END_HADRON */ let res = tokio::select! 
{ res = op => res, _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), }; let outcome = match res { Ok(_) => AttemptOutcome::Ok, Err(_) => AttemptOutcome::Err, }; let started_at = ScopeGuard::into_inner(started_at); crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, outcome, started_at); res } async fn download( &self, from: &RemotePath, opts: &DownloadOpts, cancel: &CancellationToken, ) -> Result { let blob_client = self.client.blob_client(self.relative_path_to_name(from)); let mut builder = blob_client.get(); if let Some(ref etag) = opts.etag { builder = builder.if_match(IfMatchCondition::NotMatch(etag.to_string())); } if let Some(ref version_id) = opts.version_id { let version_id = azure_storage_blobs::prelude::VersionId::new(version_id.0.clone()); builder = builder.blob_versioning(version_id); } if let Some((start, end)) = opts.byte_range() { builder = builder.range(match end { Some(end) => Range::Range(start..end), None => Range::RangeFrom(start..), }); } let timeout = match opts.kind { DownloadKind::Small => self.small_timeout, DownloadKind::Large => self.timeout, }; self.download_for_builder(builder, timeout, cancel).await } async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { self.delete_objects(std::array::from_ref(path), cancel) .await } async fn delete_objects( &self, paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Delete; let _permit = self.permit(kind, cancel).await?; let started_at = start_measuring_requests(kind); let op = async { // TODO batch requests are not supported by the SDK // https://github.com/Azure/azure-sdk-for-rust/issues/1068 for path in paths { #[derive(Debug)] enum AzureOrTimeout { AzureError(azure_core::Error), Timeout, Cancel, } impl Display for AzureOrTimeout { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{self:?}") } } let warn_threshold = 3; let max_retries = 5; 
backoff::retry( || async { let blob_client = self.client.blob_client(self.relative_path_to_name(path)); let request = blob_client.delete().into_future(); let res = tokio::time::timeout(self.timeout, request).await; match res { Ok(Ok(_v)) => Ok(()), Ok(Err(azure_err)) => { if let Some(http_err) = azure_err.as_http_error() { if http_err.status() == StatusCode::NotFound { return Ok(()); } } Err(AzureOrTimeout::AzureError(azure_err)) } Err(_elapsed) => Err(AzureOrTimeout::Timeout), } }, |err| match err { AzureOrTimeout::AzureError(_) | AzureOrTimeout::Timeout => false, AzureOrTimeout::Cancel => true, }, warn_threshold, max_retries, "deleting remote object", cancel, ) .await .ok_or_else(|| AzureOrTimeout::Cancel) .and_then(|x| x) .map_err(|e| match e { AzureOrTimeout::AzureError(err) => anyhow::Error::from(err), AzureOrTimeout::Timeout => TimeoutOrCancel::Timeout.into(), AzureOrTimeout::Cancel => TimeoutOrCancel::Cancel.into(), })?; } Ok(()) }; let res = tokio::select! { res = op => res, _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), }; let started_at = ScopeGuard::into_inner(started_at); crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &res, started_at); res } fn max_keys_per_delete(&self) -> usize { super::MAX_KEYS_PER_DELETE_AZURE } async fn copy( &self, from: &RemotePath, to: &RemotePath, cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Copy; let _permit = self.permit(kind, cancel).await?; let started_at = start_measuring_requests(kind); let timeout = tokio::time::sleep(self.timeout); let mut copy_status = None; let op = async { let blob_client = self.client.blob_client(self.relative_path_to_name(to)); let source_url = format!( "{}/{}", self.client.url()?, self.relative_path_to_name(from) ); let builder = blob_client.copy(Url::from_str(&source_url)?); let copy = builder.into_future(); let result = copy.await?; copy_status = Some(result.copy_status); loop { match copy_status.as_ref().expect("we 
always set it to Some") { CopyStatus::Aborted => { anyhow::bail!("Received abort for copy from {from} to {to}."); } CopyStatus::Failed => { anyhow::bail!("Received failure response for copy from {from} to {to}."); } CopyStatus::Success => return Ok(()), CopyStatus::Pending => (), } // The copy is taking longer. Waiting a second and then re-trying. // TODO estimate time based on copy_progress and adjust time based on that tokio::time::sleep(Duration::from_millis(1000)).await; let properties = blob_client.get_properties().into_future().await?; let Some(status) = properties.blob.properties.copy_status else { tracing::warn!("copy_status for copy is None!, from={from}, to={to}"); return Ok(()); }; copy_status = Some(status); } }; let res = tokio::select! { res = op => res, _ = cancel.cancelled() => return Err(anyhow::Error::new(TimeoutOrCancel::Cancel)), _ = timeout => { let e = anyhow::Error::new(TimeoutOrCancel::Timeout); let e = e.context(format!("Timeout, last status: {copy_status:?}")); Err(e) }, }; let started_at = ScopeGuard::into_inner(started_at); crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &res, started_at); res } async fn time_travel_recover( &self, prefix: Option<&RemotePath>, timestamp: SystemTime, done_if_after: SystemTime, cancel: &CancellationToken, _complexity_limit: Option, ) -> Result<(), TimeTravelError> { let msg = "PLEASE NOTE: Azure Blob storage time-travel recovery may not work as expected " .to_string() + "for some specific files. If a file gets deleted but then overwritten and we want to recover " + "to the time during the file was not present, this functionality will recover the file. Only " + "use the functionality for services that can tolerate this. 
For example, recovering a state of the " + "pageserver tenants."; tracing::error!("{}", msg); let kind = RequestKind::TimeTravel; let permit = self.permit(kind, cancel).await?; let mode = ListingMode::NoDelimiter; let version_listing = self .list_versions_with_permit(&permit, prefix, mode, None, cancel) .await .map_err(|err| match err { DownloadError::Other(e) => TimeTravelError::Other(e), DownloadError::Cancelled => TimeTravelError::Cancelled, other => TimeTravelError::Other(other.into()), })?; let versions_and_deletes = version_listing.versions; tracing::info!( "Built list for time travel with {} versions and deletions", versions_and_deletes.len() ); // Work on the list of references instead of the objects directly, // otherwise we get lifetime errors in the sort_by_key call below. let mut versions_and_deletes = versions_and_deletes.iter().collect::>(); versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified)); let mut vds_for_key = HashMap::<_, Vec<_>>::new(); for vd in &versions_and_deletes { let Version { key, .. } = &vd; let version_id = vd.version_id().map(|v| v.0.as_str()); if version_id == Some("null") { return Err(TimeTravelError::Other(anyhow!( "Received ListVersions response for key={key} with version_id='null', \ indicating either disabled versioning, or legacy objects with null version id values" ))); } tracing::trace!("Parsing version key={key} kind={:?}", vd.kind); vds_for_key.entry(key).or_default().push(vd); } let warn_threshold = 3; let max_retries = 10; let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled); for (key, versions) in vds_for_key { let last_vd = versions.last().unwrap(); let key = self.relative_path_to_name(key); if last_vd.last_modified > done_if_after { tracing::debug!("Key {key} has version later than done_if_after, skipping"); continue; } // the version we want to restore to. 
let version_to_restore_to = match versions.binary_search_by_key(×tamp, |tpl| tpl.last_modified) { Ok(v) => v, Err(e) => e, }; if version_to_restore_to == versions.len() { tracing::debug!("Key {key} has no changes since timestamp, skipping"); continue; } let mut do_delete = false; if version_to_restore_to == 0 { // All versions more recent, so the key didn't exist at the specified time point. tracing::debug!( "All {} versions more recent for {key}, deleting", versions.len() ); do_delete = true; } else { match &versions[version_to_restore_to - 1] { Version { kind: VersionKind::Version(version_id), .. } => { let source_url = format!( "{}/{}?versionid={}", self.client .url() .map_err(|e| TimeTravelError::Other(anyhow!("{e}")))?, key, version_id.0 ); tracing::debug!( "Promoting old version {} for {key} at {}...", version_id.0, source_url ); backoff::retry( || async { let blob_client = self.client.blob_client(key.clone()); let op = blob_client.copy(Url::from_str(&source_url).unwrap()); tokio::select! { res = op => res.map_err(|e| TimeTravelError::Other(e.into())), _ = cancel.cancelled() => Err(TimeTravelError::Cancelled), } }, is_permanent, warn_threshold, max_retries, "copying object version for time_travel_recover", cancel, ) .await .ok_or_else(|| TimeTravelError::Cancelled) .and_then(|x| x)?; tracing::info!(?version_id, %key, "Copied old version in Azure blob storage"); } Version { kind: VersionKind::DeletionMarker, .. 
} => { do_delete = true; } } }; if do_delete { if matches!(last_vd.kind, VersionKind::DeletionMarker) { // Key has since been deleted (but there was some history), no need to do anything tracing::debug!("Key {key} already deleted, skipping."); } else { tracing::debug!("Deleting {key}..."); self.delete(&RemotePath::from_string(&key).unwrap(), cancel) .await .map_err(|e| { // delete_oid0 will use TimeoutOrCancel if TimeoutOrCancel::caused_by_cancel(&e) { TimeTravelError::Cancelled } else { TimeTravelError::Other(e) } })?; } } } Ok(()) } } pin_project_lite::pin_project! { /// Hack to work around not being able to stream once with azure sdk. /// /// Azure sdk clones streams around with the assumption that they are like /// `Arc` (except not supporting tokio), however our streams are not like /// that. For example for an `index_part.json` we just have a single chunk of [`Bytes`] /// representing the whole serialized vec. It could be trivially cloneable and "semi-trivially" /// seekable, but we can also just re-try the request easier. #[project = NonSeekableStreamProj] enum NonSeekableStream { /// A stream wrappers initial form. /// /// Mutex exists to allow moving when cloning. If the sdk changes to do less than 1 /// clone before first request, then this must be changed. Initial { inner: std::sync::Mutex>>>, len: usize, }, /// The actually readable variant, produced by cloning the Initial variant. /// /// The sdk currently always clones once, even without retry policy. Actual { #[pin] inner: tokio_util::compat::Compat>, len: usize, read_any: bool, }, /// Most likely unneeded, but left to make life easier, in case more clones are added. 
Cloned { len_was: usize, } } } impl NonSeekableStream where S: Stream> + Send + Sync + 'static, { fn new(inner: S, len: usize) -> NonSeekableStream { use tokio_util::compat::TokioAsyncReadCompatExt; let inner = tokio_util::io::StreamReader::new(inner).compat(); let inner = Some(inner); let inner = std::sync::Mutex::new(inner); NonSeekableStream::Initial { inner, len } } } impl std::fmt::Debug for NonSeekableStream { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Initial { len, .. } => f.debug_struct("Initial").field("len", len).finish(), Self::Actual { len, .. } => f.debug_struct("Actual").field("len", len).finish(), Self::Cloned { len_was, .. } => f.debug_struct("Cloned").field("len", len_was).finish(), } } } impl futures::io::AsyncRead for NonSeekableStream where S: Stream>, { fn poll_read( self: std::pin::Pin<&mut Self>, cx: &mut std::task::Context<'_>, buf: &mut [u8], ) -> std::task::Poll> { match self.project() { NonSeekableStreamProj::Actual { inner, read_any, .. } => { *read_any = true; inner.poll_read(cx, buf) } // NonSeekableStream::Initial does not support reading because it is just much easier // to have the mutex in place where one does not poll the contents, or that's how it // seemed originally. If there is a version upgrade which changes the cloning, then // that support needs to be hacked in. // // including {self:?} into the message would be useful, but unsure how to unproject. _ => std::task::Poll::Ready(Err(std::io::Error::other( "cloned or initial values cannot be read", ))), } } } impl Clone for NonSeekableStream { /// Weird clone implementation exists to support the sdk doing cloning before issuing the first /// request, see type documentation. fn clone(&self) -> Self { use NonSeekableStream::*; match self { Initial { inner, len } => { if let Some(inner) = inner.lock().unwrap().take() { Actual { inner, len: *len, read_any: false, } } else { Self::Cloned { len_was: *len } } } Actual { len, .. 
} => Cloned { len_was: *len }, Cloned { len_was } => Cloned { len_was: *len_was }, } } } #[async_trait::async_trait] impl azure_core::SeekableStream for NonSeekableStream where S: Stream> + Unpin + Send + Sync + 'static, { async fn reset(&mut self) -> azure_core::error::Result<()> { use NonSeekableStream::*; let msg = match self { Initial { inner, .. } => { if inner.get_mut().unwrap().is_some() { return Ok(()); } else { "reset after first clone is not supported" } } Actual { read_any, .. } if !*read_any => return Ok(()), Actual { .. } => "reset after reading is not supported", Cloned { .. } => "reset after second clone is not supported", }; Err(azure_core::error::Error::new( azure_core::error::ErrorKind::Io, std::io::Error::other(msg), )) } // Note: it is not documented if this should be the total or remaining length, total passes the // tests. fn len(&self) -> usize { use NonSeekableStream::*; match self { Initial { len, .. } => *len, Actual { len, .. } => *len, Cloned { len_was, .. } => *len_was, } } } ================================================ FILE: libs/remote_storage/src/config.rs ================================================ use std::fmt::Debug; use std::num::NonZeroUsize; use std::str::FromStr; use std::time::Duration; use aws_sdk_s3::types::StorageClass; use camino::Utf8PathBuf; use serde::{Deserialize, Serialize}; use crate::{ DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT, DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; /// External backup storage configuration, enough for creating a client for that storage. #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] pub struct RemoteStorageConfig { /// The storage connection configuration. #[serde(flatten)] pub storage: RemoteStorageKind, /// A common timeout enforced for all requests after concurrency limiter permit has been /// acquired. 
#[serde( with = "humantime_serde", default = "default_timeout", skip_serializing_if = "is_default_timeout" )] pub timeout: Duration, /// Alternative timeout used for metadata objects which are expected to be small #[serde( with = "humantime_serde", default = "default_small_timeout", skip_serializing_if = "is_default_small_timeout" )] pub small_timeout: Duration, } impl RemoteStorageKind { pub fn bucket_name(&self) -> Option<&str> { match self { RemoteStorageKind::LocalFs { .. } => None, RemoteStorageKind::AwsS3(config) => Some(&config.bucket_name), RemoteStorageKind::AzureContainer(config) => Some(&config.container_name), RemoteStorageKind::GCS(config) => Some(&config.bucket_name), } } } impl RemoteStorageConfig { /// Helper to fetch the configured concurrency limit. pub fn concurrency_limit(&self) -> usize { match &self.storage { RemoteStorageKind::LocalFs { .. } => DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT, RemoteStorageKind::AwsS3(c) => c.concurrency_limit.into(), RemoteStorageKind::GCS(c) => c.concurrency_limit.into(), RemoteStorageKind::AzureContainer(c) => c.concurrency_limit.into(), } } } fn default_timeout() -> Duration { RemoteStorageConfig::DEFAULT_TIMEOUT } fn default_small_timeout() -> Duration { RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT } fn is_default_timeout(d: &Duration) -> bool { *d == RemoteStorageConfig::DEFAULT_TIMEOUT } fn is_default_small_timeout(d: &Duration) -> bool { *d == RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT } /// A kind of a remote storage to connect to, with its connection configuration. #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] #[serde(untagged)] pub enum RemoteStorageKind { /// Storage based on local file system. /// Specify a root folder to place all stored files into. 
LocalFs { local_path: Utf8PathBuf }, /// AWS S3 based storage, storing all files in the S3 bucket /// specified by the config AwsS3(S3Config), /// Azure Blob based storage, storing all files in the container /// specified by the config AzureContainer(AzureConfig), /// Google Cloud based storage, storing all files in the GCS bucket /// specified by the config GCS(GCSConfig), } #[derive(Deserialize)] #[serde(tag = "type")] /// Version of RemoteStorageKind which deserializes with type: LocalFs | AwsS3 | AzureContainer /// Needed for endpoint storage service pub enum TypedRemoteStorageKind { LocalFs { local_path: Utf8PathBuf }, AwsS3(S3Config), AzureContainer(AzureConfig), } impl From for RemoteStorageKind { fn from(value: TypedRemoteStorageKind) -> Self { match value { TypedRemoteStorageKind::LocalFs { local_path } => { RemoteStorageKind::LocalFs { local_path } } TypedRemoteStorageKind::AwsS3(v) => RemoteStorageKind::AwsS3(v), TypedRemoteStorageKind::AzureContainer(v) => RemoteStorageKind::AzureContainer(v), } } } /// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write). #[derive(Clone, PartialEq, Eq, Deserialize, Serialize)] pub struct S3Config { /// Name of the bucket to connect to. pub bucket_name: String, /// The region where the bucket is located at. pub bucket_region: String, /// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once. pub prefix_in_bucket: Option, /// A base URL to send S3 requests to. /// By default, the endpoint is derived from a region name, assuming it's /// an AWS S3 region name, erroring on wrong region name. /// Endpoint provides a way to support other S3 flavors and their regions. /// /// Example: `http://127.0.0.1:5000` pub endpoint: Option, /// AWS S3 has various limits on its API calls, we need not to exceed those. /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. 
#[serde(default = "default_remote_storage_s3_concurrency_limit")] pub concurrency_limit: NonZeroUsize, #[serde(default = "default_max_keys_per_list_response")] pub max_keys_per_list_response: Option, #[serde( deserialize_with = "deserialize_storage_class", serialize_with = "serialize_storage_class", default )] pub upload_storage_class: Option, } fn default_remote_storage_s3_concurrency_limit() -> NonZeroUsize { DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT .try_into() .unwrap() } fn default_max_keys_per_list_response() -> Option { DEFAULT_MAX_KEYS_PER_LIST_RESPONSE } fn default_azure_conn_pool_size() -> usize { // By default, the Azure SDK does no connection pooling, due to historic reports of hard-to-reproduce issues // (https://github.com/hyperium/hyper/issues/2312) // // However, using connection pooling is important to avoid exhausting client ports when // doing huge numbers of requests (https://github.com/neondatabase/cloud/issues/20971) // // We therefore enable a modest pool size by default: this may be configured to zero if // issues like the alleged upstream hyper issue appear. 8 } impl Debug for S3Config { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("S3Config") .field("bucket_name", &self.bucket_name) .field("bucket_region", &self.bucket_region) .field("prefix_in_bucket", &self.prefix_in_bucket) .field("concurrency_limit", &self.concurrency_limit) .field( "max_keys_per_list_response", &self.max_keys_per_list_response, ) .finish() } } #[derive(Clone, PartialEq, Eq, Deserialize, Serialize)] pub struct GCSConfig { /// Name of the bucket to connect to. pub bucket_name: String, /// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once. 
pub prefix_in_bucket: Option, #[serde(default = "default_remote_storage_s3_concurrency_limit")] pub concurrency_limit: NonZeroUsize, #[serde(default = "default_max_keys_per_list_response")] pub max_keys_per_list_response: Option, } impl Debug for GCSConfig { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("GCSConfig") .field("bucket_name", &self.bucket_name) .field("prefix_in_bucket", &self.prefix_in_bucket) .field("concurrency_limit", &self.concurrency_limit) .field( "max_keys_per_list_response", &self.max_keys_per_list_response, ) .finish() } } /// Azure bucket coordinates and access credentials to manage the bucket contents (read and write). #[derive(Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct AzureConfig { /// Name of the container to connect to. pub container_name: String, /// Name of the storage account the container is inside of pub storage_account: Option, /// The region where the bucket is located at. pub container_region: String, /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once. pub prefix_in_container: Option, /// Azure has various limits on its API calls, we need not to exceed those. /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details. #[serde(default = "default_remote_storage_azure_concurrency_limit")] pub concurrency_limit: NonZeroUsize, #[serde(default = "default_max_keys_per_list_response")] pub max_keys_per_list_response: Option, #[serde(default = "default_azure_conn_pool_size")] pub conn_pool_size: usize, /* BEGIN_HADRON */ #[serde(default = "default_azure_put_block_size_mb")] pub put_block_size_mb: Option, /* END_HADRON */ } /* BEGIN_HADRON */ fn default_azure_put_block_size_mb() -> Option { // Disable parallel upload by default. 
Some(0) } /* END_HADRON */ fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize { NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap() } impl Debug for AzureConfig { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("AzureConfig") .field("bucket_name", &self.container_name) .field("storage_account", &self.storage_account) .field("bucket_region", &self.container_region) .field("prefix_in_container", &self.prefix_in_container) .field("concurrency_limit", &self.concurrency_limit) .field( "max_keys_per_list_response", &self.max_keys_per_list_response, ) /* BEGIN_HADRON */ .field("put_block_size_mb", &self.put_block_size_mb) /* END_HADRON */ .finish() } } fn deserialize_storage_class<'de, D: serde::Deserializer<'de>>( deserializer: D, ) -> Result, D::Error> { Option::::deserialize(deserializer).and_then(|s| { if let Some(s) = s { use serde::de::Error; let storage_class = StorageClass::from_str(&s).expect("infallible"); #[allow(deprecated)] if matches!(storage_class, StorageClass::Unknown(_)) { return Err(D::Error::custom(format!( "Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values() ))); } Ok(Some(storage_class)) } else { Ok(None) } }) } fn serialize_storage_class( val: &Option, serializer: S, ) -> Result { let val = val.as_ref().map(StorageClass::as_str); Option::<&str>::serialize(&val, serializer) } impl RemoteStorageConfig { pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); pub const DEFAULT_SMALL_TIMEOUT: Duration = std::time::Duration::from_secs(30); pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { Ok(utils::toml_edit_ext::deserialize_item(toml)?) 
} pub fn from_toml_str(input: &str) -> anyhow::Result { let toml_document = toml_edit::DocumentMut::from_str(input)?; if let Some(item) = toml_document.get("remote_storage") { return Self::from_toml(item); } Self::from_toml(toml_document.as_item()) } } #[cfg(test)] mod tests { use super::*; fn parse(input: &str) -> anyhow::Result { RemoteStorageConfig::from_toml_str(input) } #[test] fn parse_localfs_config_with_timeout() { let input = "local_path = '.' timeout = '5s'"; let config = parse(input).unwrap(); assert_eq!( config, RemoteStorageConfig { storage: RemoteStorageKind::LocalFs { local_path: Utf8PathBuf::from(".") }, timeout: Duration::from_secs(5), small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT } ); } #[test] fn test_gcs_parsing() { let toml = "\ bucket_name = 'foo-bar' prefix_in_bucket = 'pageserver/' "; let config = parse(toml).unwrap(); assert_eq!( config, RemoteStorageConfig { storage: RemoteStorageKind::GCS(GCSConfig { bucket_name: "foo-bar".into(), prefix_in_bucket: Some("pageserver/".into()), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, concurrency_limit: std::num::NonZero::new(100).unwrap(), }), timeout: Duration::from_secs(120), small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT } ); } #[test] fn test_s3_parsing() { let toml = "\ bucket_name = 'foo-bar' bucket_region = 'eu-central-1' upload_storage_class = 'INTELLIGENT_TIERING' timeout = '7s' "; let config = parse(toml).unwrap(); assert_eq!( config, RemoteStorageConfig { storage: RemoteStorageKind::AwsS3(S3Config { bucket_name: "foo-bar".into(), bucket_region: "eu-central-1".into(), prefix_in_bucket: None, endpoint: None, concurrency_limit: default_remote_storage_s3_concurrency_limit(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, upload_storage_class: Some(StorageClass::IntelligentTiering), }), timeout: Duration::from_secs(7), small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT } ); } #[test] fn test_storage_class_serde_roundtrip() { let 
classes = [ None, Some(StorageClass::Standard), Some(StorageClass::IntelligentTiering), ]; for class in classes { #[derive(Serialize, Deserialize)] struct Wrapper { #[serde( deserialize_with = "deserialize_storage_class", serialize_with = "serialize_storage_class" )] class: Option, } let wrapped = Wrapper { class: class.clone(), }; let serialized = serde_json::to_string(&wrapped).unwrap(); let deserialized: Wrapper = serde_json::from_str(&serialized).unwrap(); assert_eq!(class, deserialized.class); } } #[test] fn test_azure_parsing() { let toml = "\ container_name = 'foo-bar' container_region = 'westeurope' upload_storage_class = 'INTELLIGENT_TIERING' timeout = '7s' conn_pool_size = 8 put_block_size_mb = 1024 "; let config = parse(toml).unwrap(); assert_eq!( config, RemoteStorageConfig { storage: RemoteStorageKind::AzureContainer(AzureConfig { container_name: "foo-bar".into(), storage_account: None, container_region: "westeurope".into(), prefix_in_container: None, concurrency_limit: default_remote_storage_azure_concurrency_limit(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, conn_pool_size: 8, /* BEGIN_HADRON */ put_block_size_mb: Some(1024), /* END_HADRON */ }), timeout: Duration::from_secs(7), small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT } ); } } ================================================ FILE: libs/remote_storage/src/error.rs ================================================ /// Reasons for downloads or listings to fail. #[derive(Debug)] pub enum DownloadError { /// Validation or other error happened due to user input. BadInput(anyhow::Error), /// The file was not found in the remote storage. NotFound, /// The caller provided an ETag, and the file was not modified. Unmodified, /// A cancellation token aborted the download, typically during /// tenant detach or process shutdown. Cancelled, /// A timeout happened while executing the request. 
Possible reasons: /// - stuck tcp connection /// /// Concurrency control is not timed within timeout. Timeout, /// Some integrity/consistency check failed during download. This is used during /// timeline loads to cancel the load of a tenant if some timeline detects fatal corruption. Fatal(String), /// The file was found in the remote storage, but the download failed. Other(anyhow::Error), } impl std::fmt::Display for DownloadError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { DownloadError::BadInput(e) => { write!(f, "Failed to download a remote file due to user input: {e}") } DownloadError::NotFound => write!(f, "No file found for the remote object id given"), DownloadError::Unmodified => write!(f, "File was not modified"), DownloadError::Cancelled => write!(f, "Cancelled, shutting down"), DownloadError::Timeout => write!(f, "timeout"), DownloadError::Fatal(why) => write!(f, "Fatal read error: {why}"), DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), } } } impl std::error::Error for DownloadError {} impl DownloadError { /// Returns true if the error should not be retried with backoff pub fn is_permanent(&self) -> bool { use DownloadError::*; match self { BadInput(_) | NotFound | Unmodified | Fatal(_) | Cancelled => true, Timeout | Other(_) => false, } } pub fn is_cancelled(&self) -> bool { matches!(self, DownloadError::Cancelled) } } impl From for DownloadError { fn from(value: std::io::Error) -> Self { let needs_unwrap = value.kind() == std::io::ErrorKind::Other && value .get_ref() .and_then(|x| x.downcast_ref::()) .is_some(); if needs_unwrap { *value .into_inner() .expect("just checked") .downcast::() .expect("just checked") } else { DownloadError::Other(value.into()) } } } #[derive(Debug)] pub enum TimeTravelError { /// Validation or other error happened due to user input. 
BadInput(anyhow::Error), /// The used remote storage does not have time travel recovery implemented Unimplemented, /// The number of versions/deletion markers is above our limit. TooManyVersions, /// A cancellation token aborted the process, typically during /// request closure or process shutdown. Cancelled, /// Other errors Other(anyhow::Error), } impl std::fmt::Display for TimeTravelError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { TimeTravelError::BadInput(e) => { write!( f, "Failed to time travel recover a prefix due to user input: {e}" ) } TimeTravelError::Unimplemented => write!( f, "time travel recovery is not implemented for the current storage backend" ), TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"), TimeTravelError::TooManyVersions => { write!(f, "Number of versions/delete markers above limit") } TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"), } } } impl std::error::Error for TimeTravelError {} /// Plain cancelled error. /// /// By design this type does not not implement `std::error::Error` so it cannot be put as the root /// cause of `std::io::Error` or `anyhow::Error`. It should never need to be exposed out of this /// crate. /// /// It exists to implement permit acquiring in `{Download,TimeTravel}Error` and `anyhow::Error` returning /// operations and ensuring that those get converted to proper versions with just `?`. 
#[derive(Debug)] pub(crate) struct Cancelled; impl From for anyhow::Error { fn from(_: Cancelled) -> Self { anyhow::Error::new(TimeoutOrCancel::Cancel) } } impl From for TimeTravelError { fn from(_: Cancelled) -> Self { TimeTravelError::Cancelled } } impl From for TimeoutOrCancel { fn from(_: Cancelled) -> Self { TimeoutOrCancel::Cancel } } impl From for DownloadError { fn from(_: Cancelled) -> Self { DownloadError::Cancelled } } /// This type is used at as the root cause for timeouts and cancellations with `anyhow::Error` returning /// RemoteStorage methods. /// /// For use with `utils::backoff::retry` and `anyhow::Error` returning operations there is /// `TimeoutOrCancel::caused_by_cancel` method to query "proper form" errors. #[derive(Debug)] pub enum TimeoutOrCancel { Timeout, Cancel, } impl std::fmt::Display for TimeoutOrCancel { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use TimeoutOrCancel::*; match self { Timeout => write!(f, "timeout"), Cancel => write!(f, "cancel"), } } } impl std::error::Error for TimeoutOrCancel {} impl TimeoutOrCancel { /// Returns true if the error was caused by [`TimeoutOrCancel::Cancel`]. pub fn caused_by_cancel(error: &anyhow::Error) -> bool { error .root_cause() .downcast_ref::() .is_some_and(Self::is_cancel) } pub fn is_cancel(&self) -> bool { matches!(self, TimeoutOrCancel::Cancel) } pub fn is_timeout(&self) -> bool { matches!(self, TimeoutOrCancel::Timeout) } } /// This conversion is used when [`crate::support::DownloadStream`] notices a cancellation or /// timeout to wrap it in an `std::io::Error`. 
impl From for std::io::Error { fn from(value: TimeoutOrCancel) -> Self { let e = DownloadError::from(value); std::io::Error::other(e) } } impl From for DownloadError { fn from(value: TimeoutOrCancel) -> Self { use TimeoutOrCancel::*; match value { Timeout => DownloadError::Timeout, Cancel => DownloadError::Cancelled, } } } ================================================ FILE: libs/remote_storage/src/gcs_bucket.rs ================================================ use crate::config::GCSConfig; use crate::error::Cancelled; pub(super) use crate::metrics::RequestKind; use crate::metrics::{AttemptOutcome, start_counting_cancelled_wait, start_measuring_requests}; use crate::{ ConcurrencyLimiter, Download, DownloadError, DownloadOpts, GCS_SCOPES, Listing, ListingMode, ListingObject, MAX_KEYS_PER_DELETE_GCS, REMOTE_STORAGE_PREFIX_SEPARATOR, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, GCSVersion, VersionId, GCSVersionListing, }; use anyhow::Context; use azure_core::Etag; use bytes::Bytes; use bytes::BytesMut; use chrono::DateTime; use futures::stream::Stream; use futures::stream::TryStreamExt; use futures_util::StreamExt; use gcp_auth::{Token, TokenProvider}; use http::Method; use http::StatusCode; use reqwest::{Client, header}; use scopeguard::ScopeGuard; use serde::{Deserialize, Deserializer, Serialize, de}; use std::collections::HashMap; use std::fmt::Debug; use std::num::{NonZeroU32, ParseIntError}; use std::pin::{Pin, pin}; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use std::time::SystemTime; use tokio_util::codec::{BytesCodec, FramedRead}; use tokio_util::sync::CancellationToken; use tracing; use url::{ParseError, Url}; use utils::backoff; use uuid::Uuid; // --------- fn to_system_time(timestamp: Option) -> Option { timestamp .and_then(|s| DateTime::parse_from_rfc3339(&s).ok()) .map(|s| s.into()) } // --------- pub struct GCSBucket { token_provider: Arc, bucket_name: String, prefix_in_bucket: Option, 
max_keys_per_list_response: Option, concurrency_limiter: ConcurrencyLimiter, pub timeout: Duration, } struct GetObjectRequest { bucket: String, key: String, etag: Option, range: Option, } // --------- impl GCSBucket { pub async fn new(remote_storage_config: &GCSConfig, timeout: Duration) -> anyhow::Result { tracing::debug!( "creating remote storage for gcs bucket {}", remote_storage_config.bucket_name ); // clean up 'prefix_in_bucket' if user provides '/pageserver' or 'pageserver/' let prefix_in_bucket = remote_storage_config .prefix_in_bucket .as_deref() .map(|prefix| { let mut prefix = prefix; while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { prefix = &prefix[1..]; } let mut prefix = prefix.to_string(); if prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { prefix.pop(); } prefix }); // get GOOGLE_APPLICATION_CREDENTIALS let provider = gcp_auth::provider().await?; Ok(GCSBucket { token_provider: Arc::clone(&provider), bucket_name: remote_storage_config.bucket_name.clone(), prefix_in_bucket, timeout, max_keys_per_list_response: remote_storage_config.max_keys_per_list_response, concurrency_limiter: ConcurrencyLimiter::new( remote_storage_config.concurrency_limit.get(), ), }) } // convert `RemotePath` -> `String` pub fn relative_path_to_gcs_object(&self, path: &RemotePath) -> String { let path_string = path.get_path().as_str(); match &self.prefix_in_bucket { Some(prefix) => prefix.clone() + "/" + path_string, None => path_string.to_string(), } } // convert `String` -> `RemotePath` pub fn gcs_object_to_relative_path(&self, key: &str) -> RemotePath { let relative_path = match key.strip_prefix(self.prefix_in_bucket.as_deref().unwrap_or_default()) { Some(stripped) => stripped, // we rely on GCS to return properly prefixed paths // for requests with a certain prefix None => panic!( "Key {} does not start with bucket prefix {:?}", key, self.prefix_in_bucket ), }; RemotePath( relative_path .split(REMOTE_STORAGE_PREFIX_SEPARATOR) .collect(), ) } pub fn 
bucket_name(&self) -> &str { &self.bucket_name } fn max_keys_per_delete(&self) -> usize { MAX_KEYS_PER_DELETE_GCS } async fn permit( &self, kind: RequestKind, cancel: &CancellationToken, ) -> Result, Cancelled> { let started_at = start_counting_cancelled_wait(kind); let acquire = self.concurrency_limiter.acquire(kind); let permit = tokio::select! { permit = acquire => permit.expect("semaphore is never closed"), _ = cancel.cancelled() => return Err(Cancelled), }; let started_at = ScopeGuard::into_inner(started_at); crate::metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); Ok(permit) } async fn owned_permit( &self, kind: RequestKind, cancel: &CancellationToken, ) -> Result { let started_at = start_counting_cancelled_wait(kind); let acquire = self.concurrency_limiter.acquire_owned(kind); let permit = tokio::select! { permit = acquire => permit.expect("semaphore is never closed"), _ = cancel.cancelled() => return Err(Cancelled), }; let started_at = ScopeGuard::into_inner(started_at); crate::metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); Ok(permit) } async fn list_versions_with_permit( &self, _permit: &tokio::sync::SemaphorePermit<'_>, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, ) -> Result { let warn_threshold = 3; let max_retries = 10; let is_permanent = |e: &_| matches!(e, DownloadError::Cancelled); // GCS only has versions, which may contain 'deleted_at'. 
let mut versions = crate::GCSVersionListing::default();
        let mut continuation_token = None;

        // Prefer the caller's prefix, then the configured bucket prefix. If neither is set,
        // list with an empty prefix instead of panicking.
        let list_prefix = prefix
            .map(|p| self.relative_path_to_gcs_object(p))
            .or_else(|| {
                self.prefix_in_bucket.clone().map(|mut s| {
                    s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                    s
                })
            })
            .unwrap_or_default();

        let mut versions_base_uri = format!(
            "https://storage.googleapis.com/storage/v1/b/{}/o?prefix={}&versions=true",
            self.bucket_name, list_prefix,
        );

        if let ListingMode::WithDelimiter = mode {
            versions_base_uri.push_str(&format!(
                "&delimiter={}",
                REMOTE_STORAGE_PREFIX_SEPARATOR
            ));
        }

        loop {
            // The page token is always appended to the pristine base URI.
            let uri = match &continuation_token {
                Some(token) => format!("{}&pageToken={}", &versions_base_uri, token),
                None => versions_base_uri.clone(),
            };

            let response = backoff::retry(
                || async {
                    // Fetch one page of results; the outer loop keeps going while the
                    // response carries a next-page token.
                    let op = Client::new()
                        .get(&uri)
                        .bearer_auth(
                            self.token_provider
                                .token(GCS_SCOPES)
                                .await
                                .map_err(|e: gcp_auth::Error| DownloadError::Other(e.into()))?
                                .as_str(),
                        )
                        .send();

                    tokio::select! {
                        // Turn non-2xx responses into errors so that an error body is never
                        // mistaken for an (empty) listing page.
                        res = op => res
                            .and_then(|r| r.error_for_status())
                            .map_err(|e| DownloadError::Other(e.into())),
                        _ = cancel.cancelled() => Err(DownloadError::Cancelled),
                    }
                },
                is_permanent,
                warn_threshold,
                max_retries,
                "listing object versions",
                cancel,
            )
            .await
            .ok_or(DownloadError::Cancelled)
            .and_then(|x| x)?;

            let res = response
                .json::<GCSListResponse>()
                .await
                .map_err(|e| DownloadError::Other(e.into()))?;

            continuation_token = res.next_page_token;

            // A page without `items` is treated as an empty page, not as a failure.
            let version_listing = res
                .items
                .unwrap_or_default()
                .into_iter()
                .map(
                    |GCSObject {
                         name,
                         updated,
                         time_deleted,
                         generation,
                         ..
                     }| {
                        // Don't `filter_map`: a missing `last_modified` ('updated') is bad for
                        // time travel, so surface it as an error instead of skipping or panicking.
                        let last_modified = to_system_time(updated).ok_or_else(|| {
                            DownloadError::Other(anyhow::anyhow!("no 'updated' field"))
                        })?;
                        let generation = generation.ok_or_else(|| {
                            DownloadError::Other(anyhow::anyhow!("no version id"))
                        })?;
                        Ok(GCSVersion {
                            key: self.gcs_object_to_relative_path(&name),
                            last_modified,
                            id: VersionId(generation),
                            time_deleted: to_system_time(time_deleted),
                        })
                    },
                )
                .collect::<Result<Vec<_>, DownloadError>>()?;

            versions.versions.extend(version_listing);

            if let Some(max_keys) = max_keys {
                if versions.versions.len() >= max_keys.get() as usize {
                    return Err(DownloadError::Other(anyhow::anyhow!("max keys reached")));
                }
            }

            if continuation_token.is_none() {
                break;
            }
        }

        Ok(versions)
    }

    /// Upload `byte_stream` (`fs_size` bytes) to `to` with a single multipart request.
    ///
    /// The first part carries the JSON metadata (always including the destination `name`),
    /// the second part carries the payload. Acquires its own `Put` permit, records request
    /// metrics, and honours both `self.timeout` and `cancel`.
    async fn put_object(
        &self,
        byte_stream: impl Stream<Item = std::io::Result<bytes::Bytes>> + Send + Sync + 'static,
        fs_size: usize,
        to: &RemotePath,
        cancel: &CancellationToken,
        metadata: Option<StorageMetadata>,
    ) -> anyhow::Result<()> {
        let kind = RequestKind::Put;
        let _permit = self.permit(kind, cancel).await?;
        let started_at = start_measuring_requests(kind);

        let multipart_uri = format!(
            "https://storage.googleapis.com/upload/storage/v1/b/{}/o?uploadType=multipart",
            self.bucket_name
        );

        let gcs_path = self.relative_path_to_gcs_object(to);

        // Always specify destination via `RemotePath` in multipart uploads
        let metadata = match metadata {
            Some(mut m) => {
                m.0.insert("name".to_string(), gcs_path);
                m
            }
            None => StorageMetadata::from([("name", gcs_path.as_str())]),
        };

        let metadata_body = serde_json::to_string(&metadata.0)?;
        let metadata_part = reqwest::multipart::Part::text(metadata_body)
            .mime_str("application/json; charset=UTF-8")?;

        let stream_body = reqwest::Body::wrap_stream(byte_stream);
        let data_part = reqwest::multipart::Part::stream_with_length(stream_body, fs_size as u64)
            .mime_str("application/octet-stream")?;

        let form = reqwest::multipart::Form::new()
            .part("metadata", metadata_part)
            .part("bodystream", data_part);

        let mut headers = header::HeaderMap::new();
        headers.insert(
            header::CONTENT_TYPE,
header::HeaderValue::from_str(&format!(
                "multipart/related; boundary={}",
                form.boundary()
            ))?,
        );

        let upload = Client::new()
            .post(multipart_uri)
            .bearer_auth(self.token_provider.token(GCS_SCOPES).await?.as_str())
            .multipart(form)
            .headers(headers)
            .send();

        let upload = tokio::time::timeout(self.timeout, upload);

        let res = tokio::select! {
            res = upload => res,
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

        // Timeouts are not recorded as request outcomes; everything else is.
        if let Ok(inner) = &res {
            let started_at = ScopeGuard::into_inner(started_at);
            crate::metrics::BUCKET_METRICS
                .req_seconds
                .observe_elapsed(kind, inner, started_at);
        }

        match res {
            Ok(Ok(res)) => {
                if !res.status().is_success() {
                    return Err(anyhow::anyhow!("GCS PUT error \n\t {:?}", res));
                }

                let body = res
                    .text()
                    .await
                    .map_err(|e: reqwest::Error| DownloadError::Other(e.into()))?;

                let resp: GCSObject = serde_json::from_str(&body)
                    .map_err(|e: serde_json::Error| DownloadError::Other(e.into()))?;

                // The stored size must match what we sent.
                if !resp.size.is_some_and(|s| s == fs_size as i64) {
                    // very unlikely
                    return Err(anyhow::anyhow!(
                        "Boundary string from 'multipart/related' HTTP upload occurred in payload"
                    ));
                };

                Ok(())
            }
            Ok(Err(reqw)) => Err(reqw.into()),
            Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
        }
    }

    /// Delete the given fully-prefixed object names using batch requests.
    ///
    /// The names are split into chunks of at most `MAX_KEYS_PER_DELETE_GCS`; each chunk is
    /// sent as one `multipart/mixed` batch request containing one `DELETE` sub-request per
    /// object. The caller must already hold a permit.
    async fn delete_oids(
        &self,
        delete_objects: &[String],
        cancel: &CancellationToken,
        _permit: &tokio::sync::SemaphorePermit<'_>,
    ) -> anyhow::Result<()> {
        let kind = RequestKind::Delete;
        let mut cancel = std::pin::pin!(cancel.cancelled());

        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE_GCS) {
            let started_at = start_measuring_requests(kind);

            // Use this to report keys that didn't delete based on 'content_id'
            let mut delete_objects_status = HashMap::new();

            let mut form = reqwest::multipart::Form::new();
            let bulk_uri = "https://storage.googleapis.com/batch/storage/v1";

            // Only the objects of the current chunk go into this batch; iterating the whole
            // input here would resend every key with every batch.
            for (index, path) in chunk.iter().enumerate() {
                delete_objects_status.insert(index + 1, path.clone());

                let path_to_delete: String =
url::form_urlencoded::byte_serialize(path.trim_start_matches("/").as_bytes()) .collect(); let delete_req = format!( " DELETE /storage/v1/b/{}/o/{} HTTP/1.1\r\n\ Content-Type: application/json\r\n\ accept: application/json\r\n\ content-length: 0\r\n ", self.bucket_name.clone(), path_to_delete ) .trim() .to_string(); let content_id = format!("<{}+{}>", Uuid::new_v4(), index + 1); let mut part_headers = header::HeaderMap::new(); part_headers.insert( header::CONTENT_TYPE, header::HeaderValue::from_static("application/http"), ); part_headers.insert( header::TRANSFER_ENCODING, header::HeaderValue::from_static("binary"), ); part_headers.insert( header::HeaderName::from_static("content-id"), header::HeaderValue::from_str(&content_id)?, ); let part = reqwest::multipart::Part::text(delete_req).headers(part_headers); form = form.part(format!("request-{}", index), part); } let mut headers = header::HeaderMap::new(); headers.insert( header::CONTENT_TYPE, header::HeaderValue::from_str(&format!( "multipart/mixed; boundary={}", form.boundary() ))?, ); let req = Client::new() .post(bulk_uri) .bearer_auth(self.token_provider.token(GCS_SCOPES).await?.as_str()) .multipart(form) .headers(headers) .send(); let resp = tokio::select! { resp = req => resp, _ = tokio::time::sleep(self.timeout) => return Err(TimeoutOrCancel::Timeout.into()), _ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()), }; let started_at = ScopeGuard::into_inner(started_at); crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &resp, started_at); let resp = resp.context("request deletion")?; crate::metrics::BUCKET_METRICS .deleted_objects_total .inc_by(chunk.len() as u64); let res_headers = resp.headers().to_owned(); let boundary = res_headers .get(header::CONTENT_TYPE) .unwrap() .to_str()? 
.split("=") .last() .unwrap(); let res_body = resp.text().await?; let parsed: HashMap = res_body .split(&format!("--{}", boundary)) .filter_map(|c| { let mut lines = c.lines(); let id = lines.find_map(|line| { line.strip_prefix("Content-ID:") .and_then(|suf| suf.split('+').last()) .and_then(|suf| suf.split('>').next()) .map(|x| x.trim().to_string()) }); let status_code = lines.find_map(|line| { // Not sure if this protocol version shouldn't be so specific line.strip_prefix("HTTP/1.1") .and_then(|x| x.split_whitespace().next()) .map(|x| x.trim().to_string()) }); id.zip(status_code) }) .collect(); // Gather failures let errors: HashMap = parsed .iter() .filter_map(|(x, y)| { let id = x.parse::().ok(); if y == "404" { // GCS returns Error on 404, S3 doesn't. Warn and omit from failed count. // https://cloud.google.com/storage/docs/xml-api/delete-object tracing::warn!( "DeleteObjects key {} {} NotFound. Already deleted.", delete_objects_status.get(&id?).unwrap(), y ); None } else if y.chars().next() != Some('2') { id.map(|v| (v, y)) } else { None } }) .collect(); if !errors.is_empty() { // Report 10 of them like S3 const LOG_UP_TO_N_ERRORS: usize = 10; for (id, code) in errors.iter().take(LOG_UP_TO_N_ERRORS) { tracing::warn!( "DeleteObjects key {} failed with code: {}", delete_objects_status.get(id).unwrap(), code ); } return Err(anyhow::anyhow!( "Failed to delete {}/{} objects", errors.len(), chunk.len(), )); } } Ok(()) } async fn head_object( &self, key: String, cancel: &CancellationToken, ) -> Result { let kind = RequestKind::Head; let _permit = self.permit(kind, cancel).await?; let encoded_path: String = url::form_urlencoded::byte_serialize(key.as_bytes()).collect(); let metadata_uri_mod = "alt=json"; let download_uri = format!( "https://storage.googleapis.com/storage/v1/b/{}/o/{}?{}", self.bucket_name.clone(), encoded_path, metadata_uri_mod ); let head_future = Client::new() .get(download_uri) .bearer_auth( self.token_provider .token(GCS_SCOPES) .await 
.map_err(|e: gcp_auth::Error| DownloadError::Other(e.into()))? .as_str(), ) .send(); let started_at = start_measuring_requests(kind); let head_future = tokio::time::timeout(self.timeout, head_future); let res = tokio::select! { res = head_future => res, _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), }; let res = res.map_err(|_e| DownloadError::Timeout)?; // do not incl. timeouts as errors in metrics but cancellations let started_at = ScopeGuard::into_inner(started_at); crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &res, started_at); let data = match res { Ok(data) => { if !data.status().is_success() { match data.status() { StatusCode::NOT_FOUND => return Err(DownloadError::NotFound), _ => { return Err(DownloadError::Other(anyhow::anyhow!( "GCS head response contained no response body" ))); } } } else { data } } Err(e) => { crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Err, started_at, ); return Err(DownloadError::Other( anyhow::Error::new(e).context("error in HEAD of GCS object"), )); } }; let body = data .text() .await .map_err(|e: reqwest::Error| DownloadError::Other(e.into()))?; let resp: GCSObject = serde_json::from_str(&body) .map_err(|e: serde_json::Error| DownloadError::Other(e.into()))?; Ok(resp) } async fn list_objects_v2(&self, list_uri: String) -> anyhow::Result { let res = Client::new() .get(list_uri) .bearer_auth(self.token_provider.token(GCS_SCOPES).await?.as_str()); Ok(res) } // need a 'bucket', a 'key', and a bytes 'range'. async fn get_object( &self, request: GetObjectRequest, cancel: &CancellationToken, ) -> anyhow::Result { let kind = RequestKind::Get; let permit = self.owned_permit(kind, cancel).await?; let started_at = start_measuring_requests(kind); let encoded_path: String = url::form_urlencoded::byte_serialize(request.key.as_bytes()).collect(); /// We do this in two parts: /// 1. Serialize the metadata of the first request to get Etag, last modified, etc /// 2. 
We do not .await the second request pass on the pinned stream to the 'get_object' /// caller let metadata_uri_mod = "alt=json"; let download_uri = format!( "https://storage.googleapis.com/storage/v1/b/{}/o/{}?{}", self.bucket_name.clone(), encoded_path, metadata_uri_mod ); let res = Client::new() .get(download_uri) .bearer_auth( self.token_provider .token(GCS_SCOPES) .await .map_err(|e: gcp_auth::Error| DownloadError::Other(e.into()))? .as_str(), ) .send(); let obj_metadata = tokio::select! { res = res => res, _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout), _ = cancel.cancelled() => return Err(DownloadError::Cancelled), }; let resp = match obj_metadata { Ok(resp) => { if !resp.status().is_success() { match resp.status() { StatusCode::NOT_FOUND => return Err(DownloadError::NotFound), _ => { return Err(DownloadError::Other(anyhow::anyhow!( "GCS GET response contained no response body" ))); } } } else { resp } } _ => { return Err(DownloadError::Other(anyhow::anyhow!("download gcs object"))); } }; let body = resp .text() .await .map_err(|e: reqwest::Error| DownloadError::Other(e.into()))?; let resp: GCSObject = serde_json::from_str(&body) .map_err(|e: serde_json::Error| DownloadError::Other(e.into()))?; // 2. 
Byte Stream request let mut headers = header::HeaderMap::new(); let bytes_range = match &request.range { Some(s) => header::HeaderValue::from_str(s).unwrap(), None => header::HeaderValue::from_static("bytes=0-"), }; tracing::info!( "performing object download with {:?} range header", bytes_range ); headers.insert(header::RANGE, bytes_range); let encoded_path: String = url::form_urlencoded::byte_serialize(request.key.as_bytes()).collect(); let stream_uri_mod = "alt=media"; // See: https://cloud.google.com/storage/docs/streaming-downloads#stream_a_download // REST APIs > JSON API > 1st bullet point let generation = resp .generation .expect("object did not contain generation number"); let generation_mod = format!("generation={generation}"); let stream_uri = format!( "https://storage.googleapis.com/storage/v1/b/{}/o/{}?{}&{}", self.bucket_name.clone(), encoded_path, stream_uri_mod, generation_mod, ); let mut req = Client::new() .get(stream_uri) .headers(headers) .bearer_auth( self.token_provider .token(GCS_SCOPES) .await .map_err(|e: gcp_auth::Error| DownloadError::Other(e.into()))? .as_str(), ) .send(); let get_object = tokio::select! 
{ res = req => res, _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout), _ = cancel.cancelled() => return Err(DownloadError::Cancelled), }; let started_at = ScopeGuard::into_inner(started_at); let object_output = match get_object { Ok(object_output) => { if !object_output.status().is_success() { match object_output.status() { StatusCode::NOT_FOUND => return Err(DownloadError::NotFound), _ => { return Err(DownloadError::Other(anyhow::anyhow!( "GCS GET response contained no response body" ))); } } } else { object_output } } Err(e) => { crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Err, started_at, ); return Err(DownloadError::Other( anyhow::Error::new(e).context("download s3 object"), )); } }; let remaining = self.timeout.saturating_sub(started_at.elapsed()); let metadata = resp.metadata.map(StorageMetadata); let etag = resp .etag .ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))? .into(); let last_modified: SystemTime = to_system_time(resp.updated).unwrap_or(SystemTime::now()); // But let data stream pass through Ok(Download { download_stream: Box::pin(object_output.bytes_stream().map(|item| { item.map_err(|e: reqwest::Error| std::io::Error::new(std::io::ErrorKind::Other, e)) })), etag, last_modified, metadata, }) } async fn copy_object( &self, from: &RemotePath, to: &RemotePath, cancel: &CancellationToken, generation: Option<&String>, ) -> anyhow::Result { let copy_from_path: String = url::form_urlencoded::byte_serialize( self.relative_path_to_gcs_object(to) .trim_start_matches("/") .as_bytes() ) .collect(); let copy_to_path: String = url::form_urlencoded::byte_serialize( self.relative_path_to_gcs_object(to) .trim_start_matches("/") .as_bytes() ) .collect(); let mut copy_uri = format!( "https://storage.googleapis.com/storage/v1/b/{}/o/{}/rewriteTo/b/{}/o/{}", self.bucket_name.clone(), copy_from_path, self.bucket_name.clone(), copy_to_path, ); if let Some(gen_id) = generation { copy_uri += 
gen_id;
        }

        Ok(Client::new()
            .post(copy_uri)
            .bearer_auth(self.token_provider.token(GCS_SCOPES).await?.as_str())
            .header(header::CONTENT_TYPE, "application/json")
            .header(header::CONTENT_LENGTH, "0"))
    }
}

impl RemoteStorage for GCSBucket {
    // ---------------------------------------
    // Neon wrappers for GCS client functions
    // ---------------------------------------

    /// Stream listing pages for `prefix` (or the configured bucket prefix).
    ///
    /// Each yielded `Listing` corresponds to one response page. The stream ends when the
    /// response has no next-page token or once `max_keys` keys have been produced. Request
    /// failures are yielded as `Err` and the same page is requested again if the caller
    /// keeps polling.
    fn list_streaming(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<std::num::NonZeroU32>,
        cancel: &CancellationToken,
    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
        let kind = RequestKind::List;

        let mut max_keys = max_keys.map(|mk| mk.get() as i32);

        // Prefer the caller's prefix, then the configured bucket prefix. If neither is set,
        // list with an empty prefix instead of panicking.
        let list_prefix = prefix
            .map(|p| self.relative_path_to_gcs_object(p))
            .or_else(|| {
                self.prefix_in_bucket.clone().map(|mut s| {
                    s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                    s
                })
            })
            .unwrap_or_default();

        let request_max_keys = self
            .max_keys_per_list_response
            .into_iter()
            .chain(max_keys.into_iter())
            .min()
            // https://cloud.google.com/storage/docs/json_api/v1/objects/list?hl=en#parameters
            .unwrap_or(1000);

        // Base URI without any page token; every page request is derived from it.
        let mut base_list_uri = format!(
            "https://storage.googleapis.com/storage/v1/b/{}/o?prefix={}&maxResults={}",
            self.bucket_name, list_prefix, request_max_keys,
        );

        // on ListingMode:
        // https://github.com/neondatabase/neon/blob/edc11253b65e12a10843711bd88ad277511396d7/libs/remote_storage/src/lib.rs#L158C1-L164C2
        if let ListingMode::WithDelimiter = mode {
            base_list_uri.push_str(&format!(
                "&delimiter={}",
                REMOTE_STORAGE_PREFIX_SEPARATOR
            ));
        }

        async_stream::stream! {
            let mut continuation_token: Option<String> = None;

            'outer: loop {
                let started_at = start_measuring_requests(kind);

                // Attach only the most recent page token. Appending to the previous URI
                // would accumulate one `pageToken` parameter per page.
                let list_uri = match &continuation_token {
                    Some(token) => {
                        let encoded: String =
                            url::form_urlencoded::byte_serialize(token.as_bytes()).collect();
                        format!("{}&pageToken={}", base_list_uri, encoded)
                    }
                    None => base_list_uri.clone(),
                };

                let request = self.list_objects_v2(list_uri)
                    .await
                    .map_err(DownloadError::Other)?
                    .send();

                // this is like `await`
                let response = tokio::select! {
                    // Non-2xx responses become errors so that an error body is never parsed
                    // as an empty listing.
                    res = request => Ok(res.and_then(|r| r.error_for_status())),
                    _ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout),
                    _ = cancel.cancelled() => Err(DownloadError::Cancelled),
                }?;

                // just mapping our `Result' error variant's type.
                let response = response
                    .context("Failed to list GCS prefixes")
                    .map_err(DownloadError::Other);

                let started_at = ScopeGuard::into_inner(started_at);

                crate::metrics::BUCKET_METRICS
                    .req_seconds
                    .observe_elapsed(kind, &response, started_at);

                let response = match response {
                    Ok(response) => response,
                    Err(e) => {
                        // The error is potentially retryable, so we must rewind the loop after yielding.
                        yield Err(e);
                        continue 'outer;
                    },
                };

                let body = response.text()
                    .await
                    .map_err(|e: reqwest::Error| DownloadError::Other(e.into()))?;

                let resp: GCSListResponse = serde_json::from_str(&body)
                    .map_err(|e: serde_json::Error| DownloadError::Other(e.into()))?;

                let prefixes = resp.common_prefixes();
                let keys = resp.contents();

                tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());

                let mut result = Listing::default();

                for res in keys.iter() {
                    let last_modified: SystemTime =
                        to_system_time(res.updated.clone()).unwrap_or(SystemTime::now());
                    let size = res.size.unwrap_or(0) as u64;

                    result.keys.push(ListingObject {
                        key: self.gcs_object_to_relative_path(&res.name),
                        last_modified,
                        size,
                    });

                    if let Some(mut mk) = max_keys {
                        assert!(mk > 0);
                        mk -= 1;
                        if mk == 0 {
                            tracing::debug!("reached limit set by max_keys");
                            yield Ok(result);
                            break 'outer;
                        }
                        max_keys = Some(mk);
                    };
                }

                result.prefixes.extend(prefixes.iter().map(|p| {
                    self.gcs_object_to_relative_path(
                        p.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
                    )
                }));

                yield Ok(result);

                match resp.next_page_token {
                    Some(token) => continuation_token = Some(token),
                    None => break,
                }
            }
        }
    }

    async fn copy(
        &self,
        from: &RemotePath,
        to: &RemotePath,
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
        let kind = RequestKind::Copy;
        let _permit =
self.permit(kind, cancel).await?;

        let timeout = tokio::time::sleep(self.timeout);

        let started_at = start_measuring_requests(kind);

        let op = self.copy_object(from, to, cancel, None).await?.send();

        let res = tokio::select! {
            res = op => res,
            _ = timeout => return Err(TimeoutOrCancel::Timeout.into()),
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

        let started_at = ScopeGuard::into_inner(started_at);
        crate::metrics::BUCKET_METRICS
            .req_seconds
            .observe_elapsed(kind, &res, started_at);

        // A response with a non-2xx status is a failed copy, not a success.
        res?.error_for_status()?;

        Ok(())
    }

    /// Upload `from` (`from_size_bytes` bytes) to `to`.
    ///
    /// Delegates to `put_object`, which already acquires the `Put` permit, applies
    /// `self.timeout`, honours `cancel` and records request metrics. Acquiring a second
    /// permit here would make every upload hold one permit while waiting for another.
    async fn upload(
        &self,
        from: impl Stream<Item = std::io::Result<bytes::Bytes>> + Send + Sync + 'static,
        from_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
        self.put_object(from, from_size_bytes, to, cancel, metadata)
            .await
    }

    /// Download `from`, resolved against the configured bucket prefix (if any).
    async fn download(
        &self,
        from: &RemotePath,
        opts: &DownloadOpts,
        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
        // if prefix is not none then download file `prefix/from`
        // if prefix is none then download file `from`
        self.get_object(
            GetObjectRequest {
                bucket: self.bucket_name.clone(),
                key: self
                    .relative_path_to_gcs_object(from)
                    .trim_start_matches("/")
                    .to_string(),
                etag: opts.etag.as_ref().map(|e| e.to_string()),
                range: opts.byte_range_header(),
            },
            cancel,
        )
        .await
    }

    /// Delete all `paths`, holding a single `Delete` permit for the whole operation.
    async fn delete_objects(
        &self,
        paths: &[RemotePath],
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
        let kind = RequestKind::Delete;
        let permit = self.permit(kind, cancel).await?;

        let delete_objects: Vec<String> = paths
            .iter()
            .map(|i| self.relative_path_to_gcs_object(i))
            .collect();

        self.delete_oids(&delete_objects, cancel, &permit).await
    }

    fn max_keys_per_delete(&self) -> usize {
        MAX_KEYS_PER_DELETE_GCS
    }

    async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
        let paths = std::array::from_ref(path);
        self.delete_objects(paths, cancel).await
    }

    /// Restore the objects under `prefix` to their state at `timestamp`.
    ///
    /// Keys whose newest version is later than `done_if_after` are skipped.
    /// `complexity_limit` is forwarded as the maximum number of versions to list.
    async fn time_travel_recover(
        &self,
        prefix: Option<&RemotePath>,
        timestamp: SystemTime,
        done_if_after: SystemTime,
        cancel: &CancellationToken,
        complexity_limit: Option<std::num::NonZeroU32>,
    ) -> Result<(), TimeTravelError> {
        let kind = RequestKind::TimeTravel;
        let permit = self.permit(kind, cancel).await?;

        tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");

        let mode = ListingMode::NoDelimiter;
        let version_listing = self
            .list_versions_with_permit(&permit, prefix, mode, complexity_limit, cancel)
            .await
            .map_err(|err| match err {
DownloadError::Other(e) => TimeTravelError::Other(e),
                DownloadError::Cancelled => TimeTravelError::Cancelled,
                other => TimeTravelError::Other(other.into()),
            })?;

        let versions_and_deletes = version_listing.versions;

        tracing::info!(
            "Built list for time travel with {} versions and deletions",
            versions_and_deletes.len()
        );

        // Work on the list of references instead of the objects directly,
        // otherwise we get lifetime errors in the sort_by_key call below.
        let mut versions_and_deletes = versions_and_deletes.iter().collect::<Vec<_>>();

        versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified));

        // Group the sorted versions per key; each group stays ordered by `last_modified`.
        let mut vds_for_key = HashMap::<_, Vec<_>>::new();

        for vd in &versions_and_deletes {
            let GCSVersion { key, .. } = &vd;
            if vd.id.0 == "null" {
                // TODO: check the behavior of using the SDK on a non-versioned container
                return Err(TimeTravelError::Other(anyhow::anyhow!(
                    "Received ListVersions response for key={key} with version_id='null', \
                    indicating either disabled versioning, or legacy objects with null version id values"
                )));
            }
            tracing::trace!("Parsing version key={key} id={:?}", vd.id);

            vds_for_key.entry(key).or_default().push(vd);
        }

        let warn_threshold = 3;
        let max_retries = 10;
        let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled);

        for (key, versions) in vds_for_key {
            let last_vd = versions.last().unwrap();
            let key = self.relative_path_to_gcs_object(key);
            if last_vd.last_modified > done_if_after {
                // Case 1: we have a recent object outside of our restore window.
                tracing::trace!("Key {key} has version later than done_if_after, skipping");
                continue;
            }

            // we get index in the array that we want whether its `v` or `e`
            let version_to_restore_to =
                match versions.binary_search_by_key(&timestamp, |tpl| tpl.last_modified) {
                    Ok(v) => v,
                    Err(e) => e,
                };

            let mut do_delete = false;

            if version_to_restore_to == 0 {
                // All versions more recent, so the key didn't exist at the specified time point.
                tracing::trace!(
                    "All {} versions more recent for {key}, deleting",
                    versions.len()
                );
                do_delete = true;
            } else {
                let GCSVersion {
                    id: VersionId(version_id),
                    time_deleted: deletion_timestamp,
                    ..
                } = &versions[version_to_restore_to - 1];

                // GCS only has 'timeDeleted', not a version object per delete + version.
                // A version is either replaced by an object or removed -- stomped or dropped.
                // If `timeDeleted` < `time_travel_timestamp`, obj was removed and ought to be deleted.
                // If its `None`, that means we have the most current object, no-op.
                // Else, it was the same as the `updated` / `timeCreated` of the subsequent version, and ought to be restored.
                match &deletion_timestamp {
                    Some(time) => {
                        if time < &timestamp {
                            // Case 2: version was last marked deleted before `timestamp`
                            do_delete = true;
                        } else {
                            // Case 3: restore state to this version via `copy_object`
                            tracing::trace!("Copying old version {version_id} for {key}...");
                            let source_id = format!("?sourceGeneration={version_id}");

                            backoff::retry(
                                || async {
                                    let key_path = self.gcs_object_to_relative_path(&key);
                                    let op = self
                                        .copy_object(
                                            &key_path,
                                            &key_path,
                                            cancel,
                                            Some(&source_id),
                                        )
                                        .await
                                        .map_err(|e| TimeTravelError::Other(e.into()))?
                                        .send();

                                    tokio::select!
{ res = op => res.map_err(|e| TimeTravelError::Other(e.into())), _ = cancel.cancelled() => Err(TimeTravelError::Cancelled), } }, is_permanent, warn_threshold, max_retries, "copying object version for time_travel_recover", cancel, ) .await .ok_or_else(|| TimeTravelError::Cancelled) .and_then(|x| {x})?; tracing::info!(%version_id, %key, "Copied old version in GCS"); } }, _ => { tracing::info!("most current object version, skipping"); } } }; if do_delete { tracing::trace!("Deleting {key}..."); self.delete_oids(&[key], cancel, &permit) .await .map_err(|e| { // delete_oid0 will use TimeoutOrCancel if TimeoutOrCancel::caused_by_cancel(&e) { TimeTravelError::Cancelled } else { TimeTravelError::Other(e) } })?; } } Ok(()) } async fn head_object( &self, key: &RemotePath, cancel: &CancellationToken, ) -> Result { let path = self .relative_path_to_gcs_object(key) .trim_start_matches("/") .to_string(); let resp = self.head_object(path.clone(), cancel).await?; let last_modified: SystemTime = to_system_time(resp.updated).unwrap_or(SystemTime::now()); let Some(size) = resp.size else { return Err(DownloadError::Other(anyhow::anyhow!( "Missing size (content length) header" ))); }; Ok(ListingObject { key: self.gcs_object_to_relative_path(&path), last_modified, size: size as u64, }) } async fn list_versions( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, ) -> Result { let kind = RequestKind::ListVersions; let permit = self.permit(kind, cancel).await?; Ok( self.list_versions_with_permit(&permit, prefix, mode, max_keys, cancel) .await?.into() ) } } // --------- #[derive(Serialize, Deserialize, Debug)] #[serde(rename_all = "snake_case")] pub struct GCSListResponse { #[serde(rename = "nextPageToken")] pub next_page_token: Option, pub items: Option>, pub prefixes: Option>, } fn de_from_str<'de, D>(deserializer: D) -> Result, D::Error> where D: Deserializer<'de>, { let s = Option::::deserialize(deserializer)?; match s { Some(s) => 
i64::from_str(&s).map(Some).map_err(de::Error::custom), None => Ok(None), } } #[derive(Serialize, Deserialize, Debug)] #[serde(rename_all = "snake_case")] pub struct GCSObject { pub name: String, pub bucket: String, pub generation: Option, pub metageneration: String, #[serde(rename = "contentType")] pub content_type: Option, #[serde(rename = "storageClass")] pub storage_class: String, #[serde(deserialize_with = "de_from_str")] pub size: Option, #[serde(rename = "md5Hash")] pub md5_hash: Option, pub crc32c: String, pub etag: Option, #[serde(rename = "timeCreated")] pub time_created: String, pub updated: Option, #[serde(rename = "timeStorageClassUpdated")] pub time_storage_class_updated: String, #[serde(rename = "timeDeleted")] pub time_deleted: Option, #[serde(rename = "timeFinalized")] pub time_finalized: String, pub metadata: Option>, } impl GCSListResponse { pub fn contents(&self) -> &[GCSObject] { self.items.as_deref().unwrap_or_default() } pub fn common_prefixes(&self) -> &[String] { self.prefixes.as_deref().unwrap_or_default() } } ================================================ FILE: libs/remote_storage/src/lib.rs ================================================ //! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage. //! No other modules from this tree are supposed to be used directly by the external code. //! //! [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations: //! * [`local_fs`] allows to use local file system as an external storage //! * [`s3_bucket`] uses AWS S3 bucket as an external storage //! * [`azure_blob`] allows to use Azure Blob storage as an external storage //! 
#![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] mod azure_blob; mod config; mod error; mod gcs_bucket; mod local_fs; mod metrics; mod s3_bucket; mod simulate_failures; mod support; use std::collections::HashMap; use std::fmt::Debug; use std::num::NonZeroU32; use std::ops::Bound; use std::pin::{Pin, pin}; use std::sync::Arc; use std::time::SystemTime; use anyhow::Context; /// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here. pub use azure_core::Etag; use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; pub use config::TypedRemoteStorageKind; pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; use futures::StreamExt; use futures::stream::Stream; use itertools::Itertools as _; use s3_bucket::RequestKind; use serde::{Deserialize, Serialize}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::info; pub use self::azure_blob::AzureBlobStorage; pub use self::gcs_bucket::GCSBucket; pub use self::local_fs::LocalFs; pub use self::s3_bucket::S3Bucket; pub use self::simulate_failures::UnreliableWrapper; pub use crate::config::{AzureConfig, GCSConfig, RemoteStorageConfig, RemoteStorageKind, S3Config}; /// Default concurrency limit for S3 operations /// /// Currently, sync happens with AWS S3, that has two limits on requests per second: /// ~200 RPS for IAM services /// /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests /// pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; /// Set this limit analogously to the S3 limit /// /// Here, a limit of max 20k concurrent connections was noted. /// pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100; /// Set this limit analogously to the S3 limit. /// /// The local filesystem backend doesn't enforce a concurrency limit itself, but this also bounds /// the upload queue concurrency. 
Some tests create thousands of uploads, which slows down the /// quadratic scheduling of the upload queue, and there is no point spawning so many Tokio tasks. pub const DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT: usize = 100; /// No limits on the client side, which currenltly means 1000 for AWS S3. /// pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; /// As defined in S3 docs /// /// pub const MAX_KEYS_PER_DELETE_S3: usize = 1000; /// As defined in Azure docs /// /// pub const MAX_KEYS_PER_DELETE_AZURE: usize = 256; pub const MAX_KEYS_PER_DELETE_GCS: usize = 1000; const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; const GCS_SCOPES: &[&str] = &["https://www.googleapis.com/auth/cloud-platform"]; /// Path on the remote storage, relative to some inner prefix. /// The prefix is an implementation detail, that allows representing local paths /// as the remote ones, stripping the local storage prefix away. #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct RemotePath(Utf8PathBuf); impl Serialize for RemotePath { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { serializer.collect_str(self) } } impl<'de> Deserialize<'de> for RemotePath { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { let str = String::deserialize(deserializer)?; Ok(Self(Utf8PathBuf::from(&str))) } } impl std::fmt::Display for RemotePath { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { std::fmt::Display::fmt(&self.0, f) } } impl RemotePath { pub fn new(relative_path: &Utf8Path) -> anyhow::Result { anyhow::ensure!( relative_path.is_relative(), "Path {relative_path:?} is not relative" ); Ok(Self(relative_path.to_path_buf())) } pub fn from_string(relative_path: &str) -> anyhow::Result { Self::new(Utf8Path::new(relative_path)) } pub fn with_base(&self, base_path: &Utf8Path) -> Utf8PathBuf { base_path.join(&self.0) } pub fn object_name(&self) -> Option<&str> { self.0.file_name() } pub fn 
join(&self, path: impl AsRef) -> Self { Self(self.0.join(path)) } pub fn get_path(&self) -> &Utf8PathBuf { &self.0 } pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> { self.0.strip_prefix(&p.0) } pub fn add_trailing_slash(&self) -> Self { // Unwrap safety inputs are guararnteed to be valid UTF-8 Self(format!("{}/", self.0).try_into().unwrap()) } } /// We don't need callers to be able to pass arbitrary delimiters: just control /// whether listings will use a '/' separator or not. /// /// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The /// NoDelimiter mode will only populate `keys`. #[derive(Copy, Clone)] pub enum ListingMode { WithDelimiter, NoDelimiter, } #[derive(PartialEq, Eq, Debug, Clone)] pub struct ListingObject { pub key: RemotePath, pub last_modified: SystemTime, pub size: u64, } #[derive(Default)] pub struct Listing { pub prefixes: Vec, pub keys: Vec, } #[derive(Default)] pub struct VersionListing { pub versions: Vec, } #[derive(Debug)] pub struct Version { pub key: RemotePath, pub last_modified: SystemTime, pub kind: VersionKind, } impl Version { pub fn version_id(&self) -> Option<&VersionId> { match &self.kind { VersionKind::Version(id) => Some(id), VersionKind::DeletionMarker => None, } } } #[derive(Debug)] pub enum VersionKind { DeletionMarker, Version(VersionId), } // I was going to do an `enum GenericVersion` but this feels cleaner. #[derive(Default)] pub struct GCSVersionListing { pub versions: Vec, } #[derive(Debug)] pub struct GCSVersion { pub key: RemotePath, pub last_modified: SystemTime, pub id: VersionId, pub time_deleted: Option, } impl From for VersionListing { fn from(gcs_listing: GCSVersionListing) -> Self { let version_listing = gcs_listing .versions .into_iter() .map( |GCSVersion { key, last_modified, id, .. 
}| { Version { key, last_modified, kind: VersionKind::Version(VersionId(id.0)), } }, ) .collect::>(); VersionListing { versions: version_listing, } } } /// Options for downloads. The default value is a plain GET. pub struct DownloadOpts { /// If given, returns [`DownloadError::Unmodified`] if the object still has /// the same ETag (using If-None-Match). pub etag: Option, /// The start of the byte range to download, or unbounded. pub byte_start: Bound, /// The end of the byte range to download, or unbounded. Must be after the /// start bound. pub byte_end: Bound, /// Optionally request a specific version of a key pub version_id: Option, /// Indicate whether we're downloading something small or large: this indirectly controls /// timeouts: for something like an index/manifest/heatmap, we should time out faster than /// for layer files pub kind: DownloadKind, } pub enum DownloadKind { Large, Small, } #[derive(Debug, Clone)] pub struct VersionId(pub String); impl Default for DownloadOpts { fn default() -> Self { Self { etag: Default::default(), byte_start: Bound::Unbounded, byte_end: Bound::Unbounded, version_id: None, kind: DownloadKind::Large, } } } impl DownloadOpts { /// Returns the byte range with inclusive start and exclusive end, or None /// if unbounded. pub fn byte_range(&self) -> Option<(u64, Option)> { if self.byte_start == Bound::Unbounded && self.byte_end == Bound::Unbounded { return None; } let start = match self.byte_start { Bound::Excluded(i) => i + 1, Bound::Included(i) => i, Bound::Unbounded => 0, }; let end = match self.byte_end { Bound::Excluded(i) => Some(i), Bound::Included(i) => Some(i + 1), Bound::Unbounded => None, }; if let Some(end) = end { assert!(start < end, "range end {end} at or before start {start}"); } Some((start, end)) } /// Returns the byte range as an RFC 2616 Range header value with inclusive /// bounds, or None if unbounded. 
pub fn byte_range_header(&self) -> Option { self.byte_range() .map(|(start, end)| (start, end.map(|end| end - 1))) // make end inclusive .map(|(start, end)| match end { Some(end) => format!("bytes={start}-{end}"), None => format!("bytes={start}-"), }) } } /// Storage (potentially remote) API to manage its state. /// This storage tries to be unaware of any layered repository context, /// providing basic CRUD operations for storage files. #[allow(async_fn_in_trait)] pub trait RemoteStorage: Send + Sync + 'static { /// List objects in remote storage, with semantics matching AWS S3's [`ListObjectsV2`]. /// /// The stream is guaranteed to return at least one element, even in the case of errors /// (in that case it's an `Err()`), or an empty `Listing`. /// /// The stream is not ending if it returns an error, as long as [`is_permanent`] returns false on the error. /// The `next` function can be retried, and maybe in a future retry, there will be success. /// /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not /// from the absolute root of the bucket. /// /// `mode` configures whether to use a delimiter. Without a delimiter, all keys /// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are /// returned in `keys` (). /// /// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function /// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure. 
/// /// [`ListObjectsV2`]: /// [`is_permanent`]: DownloadError::is_permanent fn list_streaming( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, ) -> impl Stream> + Send; async fn list( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, ) -> Result { let mut stream = pin!(self.list_streaming(prefix, mode, max_keys, cancel)); let mut combined = stream.next().await.expect("At least one item required")?; while let Some(list) = stream.next().await { let list = list?; combined.keys.extend(list.keys.into_iter()); combined.prefixes.extend_from_slice(&list.prefixes); } Ok(combined) } async fn list_versions( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, ) -> Result; /// Obtain metadata information about an object. async fn head_object( &self, key: &RemotePath, cancel: &CancellationToken, ) -> Result; /// Streams the local file contents into remote into the remote storage entry. /// /// If the operation fails because of timeout or cancellation, the root cause of the error will be /// set to `TimeoutOrCancel`. async fn upload( &self, from: impl Stream> + Send + Sync + 'static, // S3 PUT request requires the content length to be specified, // otherwise it starts to fail with the concurrent connection count increasing. data_size_bytes: usize, to: &RemotePath, metadata: Option, cancel: &CancellationToken, ) -> anyhow::Result<()>; /// Streams the remote storage entry contents. /// /// The returned download stream will obey initial timeout and cancellation signal by erroring /// on whichever happens first. Only one of the reasons will fail the stream, which is usually /// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out. /// /// Returns the metadata, if any was stored with the file previously. 
/// Delete multiple paths from remote storage.
///
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
/// set to `TimeoutOrCancel`. In such a situation it is unknown which deletions, if any, went
/// through.
async fn delete_objects(
    &self,
    paths: &[RemotePath],
    cancel: &CancellationToken,
) -> anyhow::Result<()>;
async fn delete_prefix( &self, prefix: &RemotePath, cancel: &CancellationToken, ) -> anyhow::Result<()> { let mut stream = pin!(self.list_streaming(Some(prefix), ListingMode::NoDelimiter, None, cancel)); while let Some(result) = stream.next().await { let keys = match result { Ok(listing) if listing.keys.is_empty() => continue, Ok(listing) => listing.keys.into_iter().map(|o| o.key).collect_vec(), Err(DownloadError::Cancelled) => return Err(TimeoutOrCancel::Cancel.into()), Err(DownloadError::Timeout) => return Err(TimeoutOrCancel::Timeout.into()), Err(err) => return Err(err.into()), }; tracing::info!("Deleting {} keys from remote storage", keys.len()); self.delete_objects(&keys, cancel).await?; } Ok(()) } /// Copy a remote object inside a bucket from one path to another. async fn copy( &self, from: &RemotePath, to: &RemotePath, cancel: &CancellationToken, ) -> anyhow::Result<()>; /// Resets the content of everything with the given prefix to the given state async fn time_travel_recover( &self, prefix: Option<&RemotePath>, timestamp: SystemTime, done_if_after: SystemTime, cancel: &CancellationToken, complexity_limit: Option, ) -> Result<(), TimeTravelError>; } /// Data part of an ongoing [`Download`]. /// /// `DownloadStream` is sensitive to the timeout and cancellation used with the original /// [`RemoteStorage::download`] request. The type yields `std::io::Result` to be compatible /// with `tokio::io::copy_buf`. // This has 'static because safekeepers do not use cancellation tokens (yet) pub type DownloadStream = Pin> + Send + Sync + 'static>>; pub struct Download { pub download_stream: DownloadStream, /// The last time the file was modified (`last-modified` HTTP header) pub last_modified: SystemTime, /// A way to identify this specific version of the resource (`etag` HTTP header) pub etag: Etag, /// Extra key-value data, associated with the current remote file. 
// Hand-written `Debug`: only the `metadata` field is printed. The other fields
// (`download_stream`, `last_modified`, `etag`) are left out of the output.
// NOTE(review): presumably `Debug` is not derived because the boxed download stream
// has no `Debug` implementation — confirm against the `DownloadStream` alias.
impl Debug for Download {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Download")
            .field("metadata", &self.metadata)
            .finish()
    }
}
pub async fn list_versions<'a>( &'a self, prefix: Option<&'a RemotePath>, mode: ListingMode, max_keys: Option, cancel: &'a CancellationToken, ) -> Result { match self { Self::LocalFs(s) => s.list_versions(prefix, mode, max_keys, cancel).await, Self::AwsS3(s) => s.list_versions(prefix, mode, max_keys, cancel).await, Self::AzureBlob(s) => s.list_versions(prefix, mode, max_keys, cancel).await, Self::Unreliable(s) => s.list_versions(prefix, mode, max_keys, cancel).await, Self::GCS(s) => s.list_versions(prefix, mode, max_keys, cancel).await, } } // See [`RemoteStorage::head_object`]. pub async fn head_object( &self, key: &RemotePath, cancel: &CancellationToken, ) -> Result { match self { Self::LocalFs(s) => s.head_object(key, cancel).await, Self::AwsS3(s) => s.head_object(key, cancel).await, Self::AzureBlob(s) => s.head_object(key, cancel).await, Self::Unreliable(s) => s.head_object(key, cancel).await, Self::GCS(s) => s.head_object(key, cancel).await, } } /// See [`RemoteStorage::upload`] pub async fn upload( &self, from: impl Stream> + Send + Sync + 'static, data_size_bytes: usize, to: &RemotePath, metadata: Option, cancel: &CancellationToken, ) -> anyhow::Result<()> { match self { Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, Self::GCS(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, } } /// See [`RemoteStorage::download`] pub async fn download( &self, from: &RemotePath, opts: &DownloadOpts, cancel: &CancellationToken, ) -> Result { match self { Self::LocalFs(s) => s.download(from, opts, cancel).await, Self::AwsS3(s) => s.download(from, opts, cancel).await, Self::AzureBlob(s) => s.download(from, opts, cancel).await, Self::Unreliable(s) => s.download(from, opts, 
/// See [`RemoteStorage::max_keys_per_delete`]
///
/// Forwards to the `max_keys_per_delete` of whichever backend variant `self` holds.
pub fn max_keys_per_delete(&self) -> usize {
    match self {
        Self::LocalFs(s) => s.max_keys_per_delete(),
        Self::AwsS3(s) => s.max_keys_per_delete(),
        Self::AzureBlob(s) => s.max_keys_per_delete(),
        Self::Unreliable(s) => s.max_keys_per_delete(),
        Self::GCS(s) => s.max_keys_per_delete(),
    }
}
Self::AzureBlob(s) => s.copy(from, to, cancel).await, Self::Unreliable(s) => s.copy(from, to, cancel).await, Self::GCS(s) => s.copy(from, to, cancel).await, } } /// See [`RemoteStorage::time_travel_recover`]. pub async fn time_travel_recover( &self, prefix: Option<&RemotePath>, timestamp: SystemTime, done_if_after: SystemTime, cancel: &CancellationToken, complexity_limit: Option, ) -> Result<(), TimeTravelError> { match self { Self::LocalFs(s) => { s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit) .await } Self::AwsS3(s) => { s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit) .await } Self::AzureBlob(s) => { s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit) .await } Self::Unreliable(s) => { s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit) .await } Self::GCS(s) => { s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit) .await } } } } impl GenericRemoteStorage { pub async fn from_storage_kind(kind: TypedRemoteStorageKind) -> anyhow::Result { Self::from_config(&RemoteStorageConfig { storage: kind.into(), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }) .await } pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { info!("RemoteStorageConfig: {:?}", storage_config); let timeout = storage_config.timeout; // If someone overrides timeout to be small without adjusting small_timeout, then adjust it automatically let small_timeout = std::cmp::min(storage_config.small_timeout, timeout); info!( "RemoteStorageConfig's storage attribute: {:?}", storage_config.storage ); Ok(match &storage_config.storage { RemoteStorageKind::LocalFs { local_path: path } => { info!("Using fs root '{path}' as a remote storage"); Self::LocalFs(LocalFs::new(path.clone(), timeout)?) 
/* BEGIN_HADRON */
/// Wraps the storage `s` in an [`UnreliableWrapper`] and returns it as the
/// `Unreliable` variant.
///
/// `fail_first` and `fail_probability` are passed unchanged to `UnreliableWrapper::new`.
/// NOTE(review): their exact meaning is defined by `UnreliableWrapper`, which is not
/// visible here — confirm there before relying on specific values.
pub fn unreliable_wrapper(s: Self, fail_first: u64, fail_probability: u64) -> Self {
    Self::Unreliable(Arc::new(UnreliableWrapper::new(
        s,
        fail_first,
        fail_probability,
    )))
}
/* END_HADRON */
pub async fn upload_storage_object( &self, from: impl Stream> + Send + Sync + 'static, from_size_bytes: usize, to: &RemotePath, cancel: &CancellationToken, ) -> anyhow::Result<()> { self.upload(from, from_size_bytes, to, None, cancel) .await .with_context(|| { format!("Failed to upload data of length {from_size_bytes} to storage path {to:?}") }) } /// The name of the bucket/container/etc. pub fn bucket_name(&self) -> Option<&str> { match self { Self::LocalFs(_s) => None, Self::AwsS3(s) => Some(s.bucket_name()), Self::AzureBlob(s) => Some(s.container_name()), Self::Unreliable(_s) => None, Self::GCS(s) => Some(s.bucket_name()), } } } /// Extra set of key-value pairs that contain arbitrary metadata about the storage entry. /// Immutable, cannot be changed once the file is created. #[derive(Debug, Clone, PartialEq, Eq)] pub struct StorageMetadata(HashMap); impl From<[(&str, &str); N]> for StorageMetadata { fn from(arr: [(&str, &str); N]) -> Self { let map: HashMap = arr .iter() .map(|(k, v)| (k.to_string(), v.to_string())) .collect(); Self(map) } } struct ConcurrencyLimiter { // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. // The helps to ensure we don't exceed the thresholds. 
write: Arc, read: Arc, } impl ConcurrencyLimiter { fn for_kind(&self, kind: RequestKind) -> &Arc { match kind { RequestKind::Get => &self.read, RequestKind::Put => &self.write, RequestKind::List => &self.read, RequestKind::Delete => &self.write, RequestKind::Copy => &self.write, RequestKind::TimeTravel => &self.write, RequestKind::Head => &self.read, RequestKind::ListVersions => &self.read, } } async fn acquire( &self, kind: RequestKind, ) -> Result, tokio::sync::AcquireError> { self.for_kind(kind).acquire().await } async fn acquire_owned( &self, kind: RequestKind, ) -> Result { Arc::clone(self.for_kind(kind)).acquire_owned().await } fn new(limit: usize) -> ConcurrencyLimiter { Self { read: Arc::new(Semaphore::new(limit)), write: Arc::new(Semaphore::new(limit)), } } } #[cfg(test)] mod tests { use super::*; /// DownloadOpts::byte_range() should generate (inclusive, exclusive) ranges /// with optional end bound, or None when unbounded. #[test] fn download_opts_byte_range() { // Consider using test_case or a similar table-driven test framework. let cases = [ // (byte_start, byte_end, expected) (Bound::Unbounded, Bound::Unbounded, None), (Bound::Unbounded, Bound::Included(7), Some((0, Some(8)))), (Bound::Unbounded, Bound::Excluded(7), Some((0, Some(7)))), (Bound::Included(3), Bound::Unbounded, Some((3, None))), (Bound::Included(3), Bound::Included(7), Some((3, Some(8)))), (Bound::Included(3), Bound::Excluded(7), Some((3, Some(7)))), (Bound::Excluded(3), Bound::Unbounded, Some((4, None))), (Bound::Excluded(3), Bound::Included(7), Some((4, Some(8)))), (Bound::Excluded(3), Bound::Excluded(7), Some((4, Some(7)))), // 1-sized ranges are fine, 0 aren't and will panic (separate test). 
/// DownloadOpts::byte_range() zero-sized byte range should panic.
///
/// The expected message is pinned so that the test only passes on the range
/// validation panic, not on some unrelated panic.
#[test]
#[should_panic(expected = "range end 3 at or before start 3")]
fn download_opts_byte_range_zero() {
    DownloadOpts {
        byte_start: Bound::Included(3),
        byte_end: Bound::Excluded(3),
        ..Default::default()
    }
    .byte_range();
}
/// Storage backend that keeps its objects as files on the local filesystem,
/// under a single root directory.
#[derive(Debug, Clone)]
pub struct LocalFs {
    /// Root directory of the storage. `LocalFs::new` creates it if it does not exist
    /// and converts a relative root into an absolute path.
    storage_root: Utf8PathBuf,
    /// Timeout value handed to `LocalFs::new`.
    /// NOTE(review): where it is applied is not visible in this part of the file — confirm
    /// against the operations that read it.
    timeout: Duration,
}
pub fn new(mut storage_root: Utf8PathBuf, timeout: Duration) -> anyhow::Result { if !storage_root.exists() { std::fs::create_dir_all(&storage_root).with_context(|| { format!("Failed to create all directories in the given root path {storage_root:?}") })?; } if !storage_root.is_absolute() { storage_root = storage_root.canonicalize_utf8().with_context(|| { format!("Failed to represent path {storage_root:?} as an absolute path") })?; } Ok(Self { storage_root, timeout, }) } // mirrors S3Bucket::s3_object_to_relative_path fn local_file_to_relative_path(&self, key: Utf8PathBuf) -> RemotePath { let relative_path = key .strip_prefix(&self.storage_root) .expect("relative path must contain storage_root as prefix"); RemotePath(relative_path.into()) } async fn read_storage_metadata( &self, file_path: &Utf8Path, ) -> anyhow::Result> { let metadata_path = storage_metadata_path(file_path); if metadata_path.exists() && metadata_path.is_file() { let metadata_string = fs::read_to_string(&metadata_path).await.with_context(|| { format!("Failed to read metadata from the local storage at '{metadata_path}'") })?; serde_json::from_str(&metadata_string) .with_context(|| { format!( "Failed to deserialize metadata from the local storage at '{metadata_path}'", ) }) .map(|metadata| Some(StorageMetadata(metadata))) } else { Ok(None) } } #[cfg(test)] async fn list_all(&self) -> anyhow::Result> { use std::future::Future; use std::pin::Pin; fn get_all_files<'a, P>( directory_path: P, ) -> Pin>> + Send + Sync + 'a>> where P: AsRef + Send + Sync + 'a, { Box::pin(async move { let directory_path = directory_path.as_ref(); if directory_path.exists() { if directory_path.is_dir() { let mut paths = Vec::new(); let mut dir_contents = fs::read_dir(directory_path).await?; while let Some(dir_entry) = dir_contents.next_entry().await? 
{ let file_type = dir_entry.file_type().await?; let entry_path = Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| { anyhow::Error::msg(format!( "non-Unicode path: {}", pb.to_string_lossy() )) })?; if file_type.is_symlink() { tracing::debug!("{entry_path:?} is a symlink, skipping") } else if file_type.is_dir() { paths.extend(get_all_files(&entry_path).await?.into_iter()) } else { paths.push(entry_path); } } Ok(paths) } else { bail!("Path {directory_path:?} is not a directory") } } else { Ok(Vec::new()) } }) } Ok(get_all_files(&self.storage_root) .await? .into_iter() .map(|path| { path.strip_prefix(&self.storage_root) .context("Failed to strip storage root prefix") .and_then(RemotePath::new) .expect( "We list files for storage root, hence should be able to remote the prefix", ) }) .collect()) } // recursively lists all files in a directory, // mirroring the `list_files` for `s3_bucket` async fn list_recursive(&self, folder: Option<&RemotePath>) -> anyhow::Result> { let full_path = match folder { Some(folder) => folder.with_base(&self.storage_root), None => self.storage_root.clone(), }; // If we were given a directory, we may use it as our starting point. // Otherwise, we must go up to the first ancestor dir that exists. This is because // S3 object list prefixes can be arbitrary strings, but when reading // the local filesystem we need a directory to start calling read_dir on. let mut initial_dir = full_path.clone(); // If there's no trailing slash, we have to start looking from one above: even if // `initial_dir` is a directory, we should still list any prefixes in the parent // that start with the same string. if !full_path.to_string().ends_with('/') { initial_dir.pop(); } loop { // Did we make it to the root? 
if initial_dir.parent().is_none() { anyhow::bail!("list_files: failed to find valid ancestor dir for {full_path}"); } match fs::metadata(initial_dir.clone()).await { Ok(meta) if meta.is_dir() => { // We found a directory, break break; } Ok(_meta) => { // It's not a directory: strip back to the parent initial_dir.pop(); } Err(e) if e.kind() == ErrorKind::NotFound => { // It's not a file that exists: strip the prefix back to the parent directory initial_dir.pop(); } Err(e) => { // Unexpected I/O error anyhow::bail!(e) } } } // Note that Utf8PathBuf starts_with only considers full path segments, but // object prefixes are arbitrary strings, so we need the strings for doing // starts_with later. let prefix = full_path.as_str(); let mut files = vec![]; let mut directory_queue = vec![initial_dir]; while let Some(cur_folder) = directory_queue.pop() { let mut entries = cur_folder.read_dir_utf8()?; while let Some(Ok(entry)) = entries.next() { let file_name = entry.file_name(); let full_file_name = cur_folder.join(file_name); if full_file_name.as_str().starts_with(prefix) { let file_remote_path = self.local_file_to_relative_path(full_file_name.clone()); files.push(file_remote_path); if full_file_name.is_dir() { directory_queue.push(full_file_name); } } } } Ok(files) } async fn upload0( &self, data: impl Stream> + Send + Sync, data_size_bytes: usize, to: &RemotePath, metadata: Option, cancel: &CancellationToken, ) -> anyhow::Result<()> { let target_file_path = to.with_base(&self.storage_root); create_target_directory(&target_file_path).await?; // We need this dance with sort of durable rename (without fsyncs) // to prevent partial uploads. This was really hit when pageserver shutdown // cancelled the upload and partial file was left on the fs // NOTE: Because temp file suffix always the same this operation is racy. 
// Two concurrent operations can lead to the following sequence: // T1: write(temp) // T2: write(temp) -> overwrites the content // T1: rename(temp, dst) -> succeeds // T2: rename(temp, dst) -> fails, temp no longet exists // This can be solved by supplying unique temp suffix every time, but this situation // is not normal in the first place, the error can help (and helped at least once) // to discover bugs in upper level synchronization. let temp_file_path = path_with_suffix_extension(&target_file_path, LOCAL_FS_TEMP_FILE_SUFFIX); let mut destination = io::BufWriter::new( fs::OpenOptions::new() .write(true) .create(true) .truncate(true) .open(&temp_file_path) .await .with_context(|| { format!("Failed to open target fs destination at '{target_file_path}'") })?, ); let from_size_bytes = data_size_bytes as u64; let data = tokio_util::io::StreamReader::new(data); let data = std::pin::pin!(data); let mut buffer_to_read = data.take(from_size_bytes); // alternatively we could just write the bytes to a file, but local_fs is a testing utility let copy = io::copy_buf(&mut buffer_to_read, &mut destination); let bytes_read = tokio::select! { biased; _ = cancel.cancelled() => { let file = destination.into_inner(); // wait for the inflight operation(s) to complete so that there could be a next // attempt right away and our writes are not directed to their file. file.into_std().await; // TODO: leave the temp or not? leaving is probably less racy. enabled truncate at // least. fs::remove_file(temp_file_path).await.context("remove temp_file_path after cancellation or timeout")?; return Err(TimeoutOrCancel::Cancel.into()); } read = copy => read, }; let bytes_read = bytes_read.with_context(|| { format!( "Failed to upload file (write temp) to the local storage at '{temp_file_path}'", ) })?; if bytes_read < from_size_bytes { bail!( "Provided stream was shorter than expected: {bytes_read} vs {from_size_bytes} bytes" ); } // Check if there is any extra data after the given size. 
let mut from = buffer_to_read.into_inner(); let extra_read = from.read(&mut [1]).await?; ensure!( extra_read == 0, "Provided stream was larger than expected: expected {from_size_bytes} bytes", ); destination.flush().await.with_context(|| { format!( "Failed to upload (flush temp) file to the local storage at '{temp_file_path}'", ) })?; fs::rename(temp_file_path, &target_file_path) .await .with_context(|| { format!( "Failed to upload (rename) file to the local storage at '{target_file_path}'", ) })?; if let Some(storage_metadata) = metadata { // FIXME: we must not be using metadata much, since this would forget the old metadata // for new writes? or perhaps metadata is sticky; could consider removing if it's never // used. let storage_metadata_path = storage_metadata_path(&target_file_path); fs::write( &storage_metadata_path, serde_json::to_string(&storage_metadata.0) .context("Failed to serialize storage metadata as json")?, ) .await .with_context(|| { format!( "Failed to write metadata to the local storage at '{storage_metadata_path}'", ) })?; } Ok(()) } } impl RemoteStorage for LocalFs { fn list_streaming( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, ) -> impl Stream> { let listing = self.list(prefix, mode, max_keys, cancel); futures::stream::once(listing) } async fn list( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, ) -> Result { let op = async { let mut result = Listing::default(); // Filter out directories: in S3 directories don't exist, only the keys within them do. let keys = self .list_recursive(prefix) .await .map_err(DownloadError::Other)?; let mut objects = Vec::with_capacity(keys.len()); for key in keys { let path = key.with_base(&self.storage_root); let metadata = file_metadata(&path).await; if let Err(DownloadError::NotFound) = metadata { // Race: if the file is deleted between listing and metadata check, ignore it. 
continue; } let metadata = metadata?; if metadata.is_dir() { continue; } objects.push(ListingObject { key: key.clone(), last_modified: metadata.modified()?, size: metadata.len(), }); } let objects = objects; if let ListingMode::NoDelimiter = mode { result.keys = objects; } else { let mut prefixes = HashSet::new(); for object in objects { let key = object.key; // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`. let relative_key = if let Some(prefix) = prefix { let mut prefix = prefix.clone(); // We only strip the dirname of the prefix, so that when we strip it from the start of keys we // end up with full file/dir names. let prefix_full_local_path = prefix.with_base(&self.storage_root); let has_slash = prefix.0.to_string().ends_with('/'); let strip_prefix = if prefix_full_local_path.is_dir() && has_slash { prefix } else { prefix.0.pop(); prefix }; RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap() } else { key }; let relative_key = format!("{relative_key}"); if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) { let first_part = relative_key .split(REMOTE_STORAGE_PREFIX_SEPARATOR) .next() .unwrap() .to_owned(); prefixes.insert(first_part); } else { result.keys.push(ListingObject { key: RemotePath::from_string(&relative_key).unwrap(), last_modified: object.last_modified, size: object.size, }); } } result.prefixes = prefixes .into_iter() .map(|s| RemotePath::from_string(&s).unwrap()) .collect(); } if let Some(max_keys) = max_keys { result.keys.truncate(max_keys.get() as usize); } Ok(result) }; let timeout = async { tokio::time::sleep(self.timeout).await; Err(DownloadError::Timeout) }; let cancelled = async { cancel.cancelled().await; Err(DownloadError::Cancelled) }; tokio::select! 
{ res = op => res, res = timeout => res, res = cancelled => res, } } async fn list_versions( &self, _prefix: Option<&RemotePath>, _mode: ListingMode, _max_keys: Option, _cancel: &CancellationToken, ) -> Result { unimplemented!() } async fn head_object( &self, key: &RemotePath, _cancel: &CancellationToken, ) -> Result { let target_file_path = key.with_base(&self.storage_root); let metadata = file_metadata(&target_file_path).await?; Ok(ListingObject { key: key.clone(), last_modified: metadata.modified()?, size: metadata.len(), }) } async fn upload( &self, data: impl Stream> + Send + Sync, data_size_bytes: usize, to: &RemotePath, metadata: Option, cancel: &CancellationToken, ) -> anyhow::Result<()> { let cancel = cancel.child_token(); let op = self.upload0(data, data_size_bytes, to, metadata, &cancel); let mut op = std::pin::pin!(op); // race the upload0 to the timeout; if it goes over, do a graceful shutdown let (res, timeout) = tokio::select! { res = &mut op => (res, false), _ = tokio::time::sleep(self.timeout) => { cancel.cancel(); (op.await, true) } }; match res { Err(e) if timeout && TimeoutOrCancel::caused_by_cancel(&e) => { // we caused this cancel (or they happened simultaneously) -- swap it out to // Timeout Err(TimeoutOrCancel::Timeout.into()) } res => res, } } async fn download( &self, from: &RemotePath, opts: &DownloadOpts, cancel: &CancellationToken, ) -> Result { let target_path = from.with_base(&self.storage_root); let file_metadata = file_metadata(&target_path).await?; let etag = mock_etag(&file_metadata); if opts.etag.as_ref() == Some(&etag) { return Err(DownloadError::Unmodified); } let mut file = fs::OpenOptions::new() .read(true) .open(&target_path) .await .with_context(|| { format!("Failed to open source file {target_path:?} to use in the download") }) .map_err(DownloadError::Other)?; let mut take = file_metadata.len(); if let Some((start, end)) = opts.byte_range() { if start > 0 { file.seek(io::SeekFrom::Start(start)) .await .context("Failed to 
seek to the range start in a local storage file") .map_err(DownloadError::Other)?; } if let Some(end) = end { take = end - start; } } let source = ReaderStream::new(file.take(take)); let metadata = self .read_storage_metadata(&target_path) .await .map_err(DownloadError::Other)?; let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); let source = crate::support::DownloadStream::new(cancel_or_timeout, source); Ok(Download { metadata, last_modified: file_metadata .modified() .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?, etag, download_stream: Box::pin(source), }) } async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> { let file_path = path.with_base(&self.storage_root); match fs::remove_file(&file_path).await { Ok(()) => Ok(()), // The file doesn't exist. This shouldn't yield an error to mirror S3's behaviour. // See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html // > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful. Err(e) if e.kind() == ErrorKind::NotFound => Ok(()), Err(e) => Err(anyhow::anyhow!(e)), } } async fn delete_objects( &self, paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { for path in paths { self.delete(path, cancel).await? 
} Ok(()) } fn max_keys_per_delete(&self) -> usize { super::MAX_KEYS_PER_DELETE_S3 } async fn copy( &self, from: &RemotePath, to: &RemotePath, _cancel: &CancellationToken, ) -> anyhow::Result<()> { let from_path = from.with_base(&self.storage_root); let to_path = to.with_base(&self.storage_root); create_target_directory(&to_path).await?; fs::copy(&from_path, &to_path) .await .with_context(|| format!("Failed to copy file from '{from_path}' to '{to_path}'"))?; Ok(()) } async fn time_travel_recover( &self, _prefix: Option<&RemotePath>, _timestamp: SystemTime, _done_if_after: SystemTime, _cancel: &CancellationToken, _complexity_limit: Option, ) -> Result<(), TimeTravelError> { Err(TimeTravelError::Unimplemented) } } fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { path_with_suffix_extension(original_path, "metadata") } async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> { let target_dir = match target_file_path.parent() { Some(parent_dir) => parent_dir, None => bail!("File path '{target_file_path}' has no parent directory"), }; if !target_dir.exists() { fs::create_dir_all(target_dir).await?; } Ok(()) } async fn file_metadata(file_path: &Utf8Path) -> Result { tokio::fs::metadata(&file_path).await.map_err(|e| { if e.kind() == ErrorKind::NotFound { DownloadError::NotFound } else { DownloadError::BadInput(e.into()) } }) } // Use mtime as stand-in for ETag. We could calculate a meaningful one by md5'ing the contents of files we // read, but that's expensive and the local_fs test helper's whole reason for existence is to run small tests // quickly, with less overhead than using a mock S3 server. 
fn mock_etag(meta: &std::fs::Metadata) -> Etag { let mtime = meta.modified().expect("Filesystem mtime missing"); format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()).into() } #[cfg(test)] mod fs_tests { use std::collections::HashMap; use std::io::Write; use std::ops::Bound; use camino_tempfile::tempdir; use super::*; async fn read_and_check_metadata( storage: &LocalFs, remote_storage_path: &RemotePath, expected_metadata: Option<&StorageMetadata>, ) -> anyhow::Result { let cancel = CancellationToken::new(); let download = storage .download(remote_storage_path, &DownloadOpts::default(), &cancel) .await .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?; ensure!( download.metadata.as_ref() == expected_metadata, "Unexpected metadata returned for the downloaded file" ); let contents = aggregate(download.download_stream).await?; String::from_utf8(contents).map_err(anyhow::Error::new) } #[tokio::test] async fn upload_file() -> anyhow::Result<()> { let (storage, cancel) = create_storage()?; let target_path_1 = upload_dummy_file(&storage, "upload_1", None, &cancel).await?; assert_eq!( storage.list_all().await?, vec![target_path_1.clone()], "Should list a single file after first upload" ); let target_path_2 = upload_dummy_file(&storage, "upload_2", None, &cancel).await?; assert_eq!( list_files_sorted(&storage).await?, vec![target_path_1.clone(), target_path_2.clone()], "Should list a two different files after second upload" ); Ok(()) } #[tokio::test] async fn upload_file_negatives() -> anyhow::Result<()> { let (storage, cancel) = create_storage()?; let id = RemotePath::new(Utf8Path::new("dummy"))?; let content = Bytes::from_static(b"12345"); let content = move || futures::stream::once(futures::future::ready(Ok(content.clone()))); // Check that you get an error if the size parameter doesn't match the actual // size of the stream. 
storage .upload(content(), 0, &id, None, &cancel) .await .expect_err("upload with zero size succeeded"); storage .upload(content(), 4, &id, None, &cancel) .await .expect_err("upload with too short size succeeded"); storage .upload(content(), 6, &id, None, &cancel) .await .expect_err("upload with too large size succeeded"); // Correct size is 5, this should succeed. storage.upload(content(), 5, &id, None, &cancel).await?; Ok(()) } fn create_storage() -> anyhow::Result<(LocalFs, CancellationToken)> { let storage_root = tempdir()?.path().to_path_buf(); LocalFs::new(storage_root, Duration::from_secs(120)).map(|s| (s, CancellationToken::new())) } #[tokio::test] async fn download_file() -> anyhow::Result<()> { let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; let contents = read_and_check_metadata(&storage, &upload_target, None).await?; assert_eq!( dummy_contents(upload_name), contents, "We should upload and download the same contents" ); let non_existing_path = RemotePath::new(Utf8Path::new("somewhere/else"))?; match storage .download(&non_existing_path, &DownloadOpts::default(), &cancel) .await { Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys other => panic!( "Should get a NotFound error when downloading non-existing storage files, but got: {other:?}" ), } Ok(()) } #[tokio::test] async fn download_file_range_positive() -> anyhow::Result<()> { let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; let full_range_download_contents = read_and_check_metadata(&storage, &upload_target, None).await?; assert_eq!( dummy_contents(upload_name), full_range_download_contents, "Download full range should return the whole upload" ); let uploaded_bytes = dummy_contents(upload_name).into_bytes(); let (first_part_local, second_part_local) = 
uploaded_bytes.split_at(3); let first_part_download = storage .download( &upload_target, &DownloadOpts { byte_end: Bound::Excluded(first_part_local.len() as u64), ..Default::default() }, &cancel, ) .await?; assert!( first_part_download.metadata.is_none(), "No metadata should be returned for no metadata upload" ); let first_part_remote = aggregate(first_part_download.download_stream).await?; assert_eq!( first_part_local, first_part_remote, "First part bytes should be returned when requested" ); let second_part_download = storage .download( &upload_target, &DownloadOpts { byte_start: Bound::Included(first_part_local.len() as u64), byte_end: Bound::Excluded( (first_part_local.len() + second_part_local.len()) as u64, ), ..Default::default() }, &cancel, ) .await?; assert!( second_part_download.metadata.is_none(), "No metadata should be returned for no metadata upload" ); let second_part_remote = aggregate(second_part_download.download_stream).await?; assert_eq!( second_part_local, second_part_remote, "Second part bytes should be returned when requested" ); let suffix_bytes = storage .download( &upload_target, &DownloadOpts { byte_start: Bound::Included(13), ..Default::default() }, &cancel, ) .await? .download_stream; let suffix_bytes = aggregate(suffix_bytes).await?; let suffix = std::str::from_utf8(&suffix_bytes)?; assert_eq!(upload_name, suffix); let all_bytes = storage .download(&upload_target, &DownloadOpts::default(), &cancel) .await? 
.download_stream; let all_bytes = aggregate(all_bytes).await?; let all_bytes = std::str::from_utf8(&all_bytes)?; assert_eq!(dummy_contents("upload_1"), all_bytes); Ok(()) } #[tokio::test] #[should_panic(expected = "at or before start")] async fn download_file_range_negative() { let (storage, cancel) = create_storage().unwrap(); let upload_name = "upload_1"; let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel) .await .unwrap(); storage .download( &upload_target, &DownloadOpts { byte_start: Bound::Included(10), byte_end: Bound::Excluded(10), ..Default::default() }, &cancel, ) .await .unwrap(); } #[tokio::test] async fn delete_file() -> anyhow::Result<()> { let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; storage.delete(&upload_target, &cancel).await?; assert!(storage.list_all().await?.is_empty()); storage .delete(&upload_target, &cancel) .await .expect("Should allow deleting non-existing storage files"); Ok(()) } #[tokio::test] async fn file_with_metadata() -> anyhow::Result<()> { let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; let metadata = StorageMetadata(HashMap::from([ ("one".to_string(), "1".to_string()), ("two".to_string(), "2".to_string()), ])); let upload_target = upload_dummy_file(&storage, upload_name, Some(metadata.clone()), &cancel).await?; let full_range_download_contents = read_and_check_metadata(&storage, &upload_target, Some(&metadata)).await?; assert_eq!( dummy_contents(upload_name), full_range_download_contents, "We should upload and download the same contents" ); let uploaded_bytes = dummy_contents(upload_name).into_bytes(); let (first_part_local, _) = uploaded_bytes.split_at(3); let partial_download_with_metadata = storage .download( &upload_target, &DownloadOpts { byte_end: Bound::Excluded(first_part_local.len() as u64), ..Default::default() }, &cancel, ) .await?; let first_part_remote = 
aggregate(partial_download_with_metadata.download_stream).await?; assert_eq!( first_part_local, first_part_remote.as_slice(), "First part bytes should be returned when requested" ); assert_eq!( partial_download_with_metadata.metadata, Some(metadata), "We should get the same metadata back for partial download" ); Ok(()) } #[tokio::test] async fn list() -> anyhow::Result<()> { // No delimiter: should recursively list everything let (storage, cancel) = create_storage()?; let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?; let child_sibling = upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?; let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?; let listing = storage .list(None, ListingMode::NoDelimiter, None, &cancel) .await?; assert!(listing.prefixes.is_empty()); assert_eq!( listing .keys .into_iter() .map(|o| o.key) .collect::>(), HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()]) ); // Delimiter: should only go one deep let listing = storage .list(None, ListingMode::WithDelimiter, None, &cancel) .await?; assert_eq!( listing.prefixes, [RemotePath::from_string("timelines").unwrap()].to_vec() ); assert!(listing.keys.is_empty()); // Delimiter & prefix with a trailing slash let listing = storage .list( Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()), ListingMode::WithDelimiter, None, &cancel, ) .await?; assert_eq!( listing.keys.into_iter().map(|o| o.key).collect::>(), [RemotePath::from_string("uncle").unwrap()].to_vec() ); assert_eq!( listing.prefixes, [RemotePath::from_string("parent").unwrap()].to_vec() ); // Delimiter and prefix without a trailing slash let listing = storage .list( Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()), ListingMode::WithDelimiter, None, &cancel, ) .await?; assert_eq!(listing.keys, vec![]); assert_eq!( listing.prefixes, 
[RemotePath::from_string("grandparent").unwrap()].to_vec() ); // Delimiter and prefix that's partway through a path component let listing = storage .list( Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()), ListingMode::WithDelimiter, None, &cancel, ) .await?; assert_eq!(listing.keys, vec![]); assert_eq!( listing.prefixes, [RemotePath::from_string("grandparent").unwrap()].to_vec() ); Ok(()) } #[tokio::test] async fn list_part_component() -> anyhow::Result<()> { // No delimiter: should recursively list everything let (storage, cancel) = create_storage()?; // Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing // of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as // a freeform prefix. let _child_a = upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?; let _child_b = upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?; // Delimiter and prefix that's partway through a path component let listing = storage .list( Some( &RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(), ), ListingMode::WithDelimiter, None, &cancel, ) .await?; assert_eq!(listing.keys, vec![]); let mut found_prefixes = listing.prefixes.clone(); found_prefixes.sort(); assert_eq!( found_prefixes, [ RemotePath::from_string("tenant").unwrap(), RemotePath::from_string("tenant-01").unwrap(), ] .to_vec() ); Ok(()) } #[tokio::test] async fn overwrite_shorter_file() -> anyhow::Result<()> { let (storage, cancel) = create_storage()?; let path = RemotePath::new("does/not/matter/file".into())?; let body = Bytes::from_static(b"long file contents is long"); { let len = body.len(); let body = futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone()))); storage.upload(body, len, &path, None, &cancel).await?; } let read = aggregate( storage .download(&path, &DownloadOpts::default(), &cancel) 
.await? .download_stream, ) .await?; assert_eq!(body, read); let shorter = Bytes::from_static(b"shorter body"); { let len = shorter.len(); let body = futures::stream::once(futures::future::ready(std::io::Result::Ok(shorter.clone()))); storage.upload(body, len, &path, None, &cancel).await?; } let read = aggregate( storage .download(&path, &DownloadOpts::default(), &cancel) .await? .download_stream, ) .await?; assert_eq!(shorter, read); Ok(()) } #[tokio::test] async fn cancelled_upload_can_later_be_retried() -> anyhow::Result<()> { let (storage, cancel) = create_storage()?; let path = RemotePath::new("does/not/matter/file".into())?; let body = Bytes::from_static(b"long file contents is long"); { let len = body.len(); let body = futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone()))); let cancel = cancel.child_token(); cancel.cancel(); let e = storage .upload(body, len, &path, None, &cancel) .await .unwrap_err(); assert!(TimeoutOrCancel::caused_by_cancel(&e)); } { let len = body.len(); let body = futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone()))); storage.upload(body, len, &path, None, &cancel).await?; } let read = aggregate( storage .download(&path, &DownloadOpts::default(), &cancel) .await? 
.download_stream, ) .await?; assert_eq!(body, read); Ok(()) } async fn upload_dummy_file( storage: &LocalFs, name: &str, metadata: Option, cancel: &CancellationToken, ) -> anyhow::Result { let from_path = storage .storage_root .join("timelines") .join("some_timeline") .join(name); let (file, size) = create_file_for_upload(&from_path, &dummy_contents(name)).await?; let relative_path = from_path .strip_prefix(&storage.storage_root) .context("Failed to strip storage root prefix") .and_then(RemotePath::new) .with_context(|| { format!( "Failed to resolve remote part of path {:?} for base {:?}", from_path, storage.storage_root ) })?; let file = tokio_util::io::ReaderStream::new(file); storage .upload(file, size, &relative_path, metadata, cancel) .await?; Ok(relative_path) } async fn create_file_for_upload( path: &Utf8Path, contents: &str, ) -> anyhow::Result<(fs::File, usize)> { std::fs::create_dir_all(path.parent().unwrap())?; let mut file_for_writing = std::fs::OpenOptions::new() .write(true) .create_new(true) .open(path)?; write!(file_for_writing, "{contents}")?; drop(file_for_writing); let file_size = path.metadata()?.len() as usize; Ok(( fs::OpenOptions::new().read(true).open(&path).await?, file_size, )) } fn dummy_contents(name: &str) -> String { format!("contents for {name}") } async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result> { let mut files = storage.list_all().await?; files.sort_by(|a, b| a.0.cmp(&b.0)); Ok(files) } async fn aggregate( stream: impl Stream>, ) -> anyhow::Result> { use futures::stream::StreamExt; let mut out = Vec::new(); let mut stream = std::pin::pin!(stream); while let Some(res) = stream.next().await { out.extend_from_slice(&res?[..]); } Ok(out) } } ================================================ FILE: libs/remote_storage/src/metrics.rs ================================================ use metrics::{ Histogram, IntCounter, register_histogram_vec, register_int_counter, register_int_counter_vec, }; use once_cell::sync::Lazy; 
pub(super) static BUCKET_METRICS: Lazy = Lazy::new(Default::default); #[derive(Clone, Copy, Debug)] pub(crate) enum RequestKind { Get = 0, Put = 1, Delete = 2, List = 3, Copy = 4, TimeTravel = 5, Head = 6, ListVersions = 7, } use RequestKind::*; use scopeguard::ScopeGuard; impl RequestKind { const fn as_str(&self) -> &'static str { match self { Get => "get_object", Put => "put_object", Delete => "delete_object", List => "list_objects", Copy => "copy_object", TimeTravel => "time_travel_recover", Head => "head_object", ListVersions => "list_versions", } } const fn as_index(&self) -> usize { *self as usize } } const REQUEST_KIND_LIST: &[RequestKind] = &[Get, Put, Delete, List, Copy, TimeTravel, Head, ListVersions]; const REQUEST_KIND_COUNT: usize = REQUEST_KIND_LIST.len(); pub(crate) struct RequestTyped([C; REQUEST_KIND_COUNT]); impl RequestTyped { pub(crate) fn get(&self, kind: RequestKind) -> &C { &self.0[kind.as_index()] } fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self { let mut it = REQUEST_KIND_LIST.iter(); let arr = std::array::from_fn::(|index| { let next = it.next().unwrap(); assert_eq!(index, next.as_index()); f(*next) }); if let Some(next) = it.next() { panic!("unexpected {next:?}"); } RequestTyped(arr) } } impl RequestTyped { pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) { self.get(kind).observe(started_at.elapsed().as_secs_f64()) } } pub(crate) struct PassFailCancelledRequestTyped { success: RequestTyped, fail: RequestTyped, cancelled: RequestTyped, } #[derive(Debug, Clone, Copy)] pub(crate) enum AttemptOutcome { Ok, Err, Cancelled, } impl From<&Result> for AttemptOutcome { fn from(value: &Result) -> Self { match value { Ok(_) => AttemptOutcome::Ok, Err(_) => AttemptOutcome::Err, } } } impl AttemptOutcome { pub(crate) fn as_str(&self) -> &'static str { match self { AttemptOutcome::Ok => "ok", AttemptOutcome::Err => "err", AttemptOutcome::Cancelled => "cancelled", } } } impl PassFailCancelledRequestTyped 
{ pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C { let target = match outcome { AttemptOutcome::Ok => &self.success, AttemptOutcome::Err => &self.fail, AttemptOutcome::Cancelled => &self.cancelled, }; target.get(kind) } fn build_with(mut f: impl FnMut(RequestKind, AttemptOutcome) -> C) -> Self { let success = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Ok)); let fail = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Err)); let cancelled = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Cancelled)); PassFailCancelledRequestTyped { success, fail, cancelled, } } } impl PassFailCancelledRequestTyped { pub(crate) fn observe_elapsed( &self, kind: RequestKind, outcome: impl Into, started_at: std::time::Instant, ) { self.get(kind, outcome.into()) .observe(started_at.elapsed().as_secs_f64()) } } /// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`]. pub(crate) fn start_counting_cancelled_wait( kind: RequestKind, ) -> ScopeGuard { scopeguard::guard_on_success(std::time::Instant::now(), move |_| { crate::metrics::BUCKET_METRICS .cancelled_waits .get(kind) .inc() }) } /// On drop (cancellation) add time to [`BucketMetrics::req_seconds`]. pub(crate) fn start_measuring_requests( kind: RequestKind, ) -> ScopeGuard { scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| { crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Cancelled, started_at, ) }) } pub(crate) struct BucketMetrics { /// Full request duration until successful completion, error or cancellation. pub(crate) req_seconds: PassFailCancelledRequestTyped, /// Total amount of seconds waited on queue. pub(crate) wait_seconds: RequestTyped, /// Track how many semaphore awaits were cancelled per request type. /// /// This is in case cancellations are happening more than expected. pub(crate) cancelled_waits: RequestTyped, /// Total amount of deleted objects in batches or single requests. 
pub(crate) deleted_objects_total: IntCounter, } impl Default for BucketMetrics { fn default() -> Self { // first bucket 100 microseconds to count requests that do not need to wait at all // and get a permit immediately let buckets = [0.0001, 0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]; let req_seconds = register_histogram_vec!( "remote_storage_s3_request_seconds", "Seconds to complete a request", &["request_type", "result"], buckets.to_vec(), ) .unwrap(); let req_seconds = PassFailCancelledRequestTyped::build_with(|kind, outcome| { req_seconds.with_label_values(&[kind.as_str(), outcome.as_str()]) }); let wait_seconds = register_histogram_vec!( "remote_storage_s3_wait_seconds", "Seconds rate limited", &["request_type"], buckets.to_vec(), ) .unwrap(); let wait_seconds = RequestTyped::build_with(|kind| wait_seconds.with_label_values(&[kind.as_str()])); let cancelled_waits = register_int_counter_vec!( "remote_storage_s3_cancelled_waits_total", "Times a semaphore wait has been cancelled per request type", &["request_type"], ) .unwrap(); let cancelled_waits = RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()])); let deleted_objects_total = register_int_counter!( "remote_storage_s3_deleted_objects_total", "Amount of deleted objects in total", ) .unwrap(); Self { req_seconds, wait_seconds, cancelled_waits, deleted_objects_total, } } } ================================================ FILE: libs/remote_storage/src/s3_bucket.rs ================================================ //! AWS S3 storage wrapper around `rusoto` library. //! //! Respects `prefix_in_bucket` property from [`S3Config`], //! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. 
use std::borrow::Cow; use std::collections::HashMap; use std::num::NonZeroU32; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; use std::time::{Duration, SystemTime}; use anyhow::{Context as _, anyhow}; use aws_config::BehaviorVersion; use aws_config::default_provider::credentials::DefaultCredentialsChain; use aws_config::retry::{RetryConfigBuilder, RetryMode}; use aws_sdk_s3::Client; use aws_sdk_s3::config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}; use aws_sdk_s3::error::SdkError; use aws_sdk_s3::operation::get_object::GetObjectError; use aws_sdk_s3::operation::head_object::HeadObjectError; use aws_sdk_s3::types::{Delete, ObjectIdentifier, StorageClass}; use aws_smithy_async::rt::sleep::TokioSleep; use aws_smithy_types::body::SdkBody; use aws_smithy_types::byte_stream::ByteStream; use aws_smithy_types::date_time::ConversionError; use bytes::Bytes; use futures::stream::Stream; use futures_util::StreamExt; use http_body_util::StreamBody; use http_types::StatusCode; use hyper::body::Frame; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use utils::backoff; use super::StorageMetadata; use crate::config::S3Config; use crate::error::Cancelled; pub(super) use crate::metrics::RequestKind; use crate::metrics::{AttemptOutcome, start_counting_cancelled_wait, start_measuring_requests}; use crate::support::PermitCarrying; use crate::{ ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, MAX_KEYS_PER_DELETE_S3, REMOTE_STORAGE_PREFIX_SEPARATOR, RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, Version, VersionId, VersionKind, VersionListing, }; /// AWS S3 storage. pub struct S3Bucket { client: Client, bucket_name: String, prefix_in_bucket: Option, max_keys_per_list_response: Option, upload_storage_class: Option, concurrency_limiter: ConcurrencyLimiter, // Per-request timeout. Accessible for tests. 
pub timeout: Duration, } struct GetObjectRequest { bucket: String, key: String, etag: Option, range: Option, version_id: Option, } impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. pub async fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { tracing::debug!( "Creating s3 remote storage for S3 bucket {}", remote_storage_config.bucket_name ); let region = Region::new(remote_storage_config.bucket_region.clone()); let region_opt = Some(region.clone()); // https://docs.aws.amazon.com/sdkref/latest/guide/standardized-credentials.html // https://docs.rs/aws-config/latest/aws_config/default_provider/credentials/struct.DefaultCredentialsChain.html // Incomplete list of auth methods used by this: // * "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" // * "AWS_PROFILE" / `aws sso login --profile ` // * "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" // * http (ECS/EKS) container credentials // * imds v2 let credentials_provider = DefaultCredentialsChain::builder() .region(region) .build() .await; // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off let sleep_impl: Arc = Arc::new(TokioSleep::new()); let sdk_config_loader: aws_config::ConfigLoader = aws_config::defaults( #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ BehaviorVersion::v2023_11_09(), ) .region(region_opt) .identity_cache(IdentityCache::lazy().build()) .credentials_provider(credentials_provider) .sleep_impl(SharedAsyncSleep::from(sleep_impl)); let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| { s.spawn(|| { // TODO: make this function async. 
tokio::runtime::Builder::new_current_thread() .enable_all() .build() .unwrap() .block_on(sdk_config_loader.load()) }) .join() .unwrap() }); let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config); // Technically, the `remote_storage_config.endpoint` field only applies to S3 interactions. // (In case we ever re-use the `sdk_config` for more than just the S3 client in the future) if let Some(custom_endpoint) = remote_storage_config.endpoint.clone() { s3_config_builder = s3_config_builder .endpoint_url(custom_endpoint) .force_path_style(true); } // We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled. let mut retry_config = RetryConfigBuilder::new(); retry_config .set_max_attempts(Some(1)) .set_mode(Some(RetryMode::Adaptive)); s3_config_builder = s3_config_builder.retry_config(retry_config.build()); let s3_config = s3_config_builder.build(); let client = aws_sdk_s3::Client::from_conf(s3_config); let prefix_in_bucket = remote_storage_config .prefix_in_bucket .as_deref() .map(|prefix| { let mut prefix = prefix; while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { prefix = &prefix[1..] 
} let mut prefix = prefix.to_string(); while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { prefix.pop(); } prefix }); Ok(Self { client, bucket_name: remote_storage_config.bucket_name.clone(), max_keys_per_list_response: remote_storage_config.max_keys_per_list_response, prefix_in_bucket, concurrency_limiter: ConcurrencyLimiter::new( remote_storage_config.concurrency_limit.get(), ), upload_storage_class: remote_storage_config.upload_storage_class.clone(), timeout, }) } fn s3_object_to_relative_path(&self, key: &str) -> RemotePath { let relative_path = match key.strip_prefix(self.prefix_in_bucket.as_deref().unwrap_or_default()) { Some(stripped) => stripped, // we rely on AWS to return properly prefixed paths // for requests with a certain prefix None => panic!( "Key {} does not start with bucket prefix {:?}", key, self.prefix_in_bucket ), }; RemotePath( relative_path .split(REMOTE_STORAGE_PREFIX_SEPARATOR) .collect(), ) } pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String { assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); let path_string = path.get_path().as_str(); match &self.prefix_in_bucket { Some(prefix) => prefix.clone() + "/" + path_string, None => path_string.to_string(), } } async fn permit( &self, kind: RequestKind, cancel: &CancellationToken, ) -> Result, Cancelled> { let started_at = start_counting_cancelled_wait(kind); let acquire = self.concurrency_limiter.acquire(kind); let permit = tokio::select! { permit = acquire => permit.expect("semaphore is never closed"), _ = cancel.cancelled() => return Err(Cancelled), }; let started_at = ScopeGuard::into_inner(started_at); crate::metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); Ok(permit) } async fn owned_permit( &self, kind: RequestKind, cancel: &CancellationToken, ) -> Result { let started_at = start_counting_cancelled_wait(kind); let acquire = self.concurrency_limiter.acquire_owned(kind); let permit = tokio::select! 
{ permit = acquire => permit.expect("semaphore is never closed"), _ = cancel.cancelled() => return Err(Cancelled), }; let started_at = ScopeGuard::into_inner(started_at); crate::metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); Ok(permit) } async fn download_object( &self, request: GetObjectRequest, cancel: &CancellationToken, ) -> Result { let kind = RequestKind::Get; let permit = self.owned_permit(kind, cancel).await?; let started_at = start_measuring_requests(kind); let mut builder = self .client .get_object() .bucket(request.bucket) .key(request.key) .set_version_id(request.version_id) .set_range(request.range); if let Some(etag) = request.etag { builder = builder.if_none_match(etag); } let get_object = builder.send(); let get_object = tokio::select! { res = get_object => res, _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout), _ = cancel.cancelled() => return Err(DownloadError::Cancelled), }; let started_at = ScopeGuard::into_inner(started_at); let object_output = match get_object { Ok(object_output) => object_output, Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => { // Count this in the AttemptOutcome::Ok bucket, because 404 is not // an error: we expect to sometimes fetch an object and find it missing, // e.g. when probing for timeline indices. crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Ok, started_at, ); return Err(DownloadError::NotFound); } Err(SdkError::ServiceError(e)) // aws_smithy_runtime_api::http::response::StatusCode isn't // re-exported by any aws crates, so just check the numeric // status against http_types::StatusCode instead of pulling it. if e.raw().status().as_u16() == StatusCode::NotModified => { // Count an unmodified file as a success. 
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Ok, started_at, ); return Err(DownloadError::Unmodified); } Err(e) => { crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Err, started_at, ); return Err(DownloadError::Other( anyhow::Error::new(e).context("download s3 object"), )); } }; // even if we would have no timeout left, continue anyways. the caller can decide to ignore // the errors considering timeouts and cancellation. let remaining = self.timeout.saturating_sub(started_at.elapsed()); let metadata = object_output.metadata().cloned().map(StorageMetadata); let etag = object_output .e_tag .ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))? .into(); let last_modified = object_output .last_modified .ok_or(DownloadError::Other(anyhow::anyhow!( "Missing LastModified header" )))? .try_into() .map_err(|e: ConversionError| DownloadError::Other(e.into()))?; let body = object_output.body; let body = ByteStreamAsStream::from(body); let body = PermitCarrying::new(permit, body); let body = TimedDownload::new(started_at, body); let cancel_or_timeout = crate::support::cancel_or_timeout(remaining, cancel.clone()); let body = crate::support::DownloadStream::new(cancel_or_timeout, body); Ok(Download { metadata, etag, last_modified, download_stream: Box::pin(body), }) } async fn delete_oids( &self, _permit: &tokio::sync::SemaphorePermit<'_>, delete_objects: &[ObjectIdentifier], cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Delete; let mut cancel = std::pin::pin!(cancel.cancelled()); for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE_S3) { let started_at = start_measuring_requests(kind); let req = self .client .delete_objects() .bucket(self.bucket_name.clone()) .delete( Delete::builder() .set_objects(Some(chunk.to_vec())) .build() .context("build request")?, ) .send(); let resp = tokio::select! 
{ resp = req => resp, _ = tokio::time::sleep(self.timeout) => return Err(TimeoutOrCancel::Timeout.into()), _ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()), }; let started_at = ScopeGuard::into_inner(started_at); crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &resp, started_at); let resp = resp.context("request deletion")?; crate::metrics::BUCKET_METRICS .deleted_objects_total .inc_by(chunk.len() as u64); if let Some(errors) = resp.errors { // Log a bounded number of the errors within the response: // these requests can carry 1000 keys so logging each one // would be too verbose, especially as errors may lead us // to retry repeatedly. const LOG_UP_TO_N_ERRORS: usize = 10; for e in errors.iter().take(LOG_UP_TO_N_ERRORS) { tracing::warn!( "DeleteObjects key {} failed: {}: {}", e.key.as_ref().map(Cow::from).unwrap_or("".into()), e.code.as_ref().map(Cow::from).unwrap_or("".into()), e.message.as_ref().map(Cow::from).unwrap_or("".into()) ); } return Err(anyhow::anyhow!( "Failed to delete {}/{} objects", errors.len(), chunk.len(), )); } } Ok(()) } async fn list_versions_with_permit( &self, _permit: &tokio::sync::SemaphorePermit<'_>, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, ) -> Result { // get the passed prefix or if it is not set use prefix_in_bucket value let prefix = prefix .map(|p| self.relative_path_to_s3_object(p)) .or_else(|| self.prefix_in_bucket.clone()); let warn_threshold = 3; let max_retries = 10; let is_permanent = |e: &_| matches!(e, DownloadError::Cancelled); let mut key_marker = None; let mut version_id_marker = None; let mut versions_and_deletes = Vec::new(); loop { let response = backoff::retry( || async { let mut request = self .client .list_object_versions() .bucket(self.bucket_name.clone()) .set_prefix(prefix.clone()) .set_key_marker(key_marker.clone()) .set_version_id_marker(version_id_marker.clone()); if let ListingMode::WithDelimiter = mode { request = 
request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); } let op = request.send(); tokio::select! { res = op => res.map_err(|e| DownloadError::Other(e.into())), _ = cancel.cancelled() => Err(DownloadError::Cancelled), } }, is_permanent, warn_threshold, max_retries, "listing object versions", cancel, ) .await .ok_or_else(|| DownloadError::Cancelled) .and_then(|x| x)?; tracing::trace!( " Got List response version_id_marker={:?}, key_marker={:?}", response.version_id_marker, response.key_marker ); let versions = response .versions .unwrap_or_default() .into_iter() .map(|version| { let key = version.key.expect("response does not contain a key"); let key = self.s3_object_to_relative_path(&key); let version_id = VersionId(version.version_id.expect("needing version id")); let last_modified = SystemTime::try_from(version.last_modified.expect("no last_modified"))?; Ok(Version { key, last_modified, kind: crate::VersionKind::Version(version_id), }) }); let deletes = response .delete_markers .unwrap_or_default() .into_iter() .map(|version| { let key = version.key.expect("response does not contain a key"); let key = self.s3_object_to_relative_path(&key); let last_modified = SystemTime::try_from(version.last_modified.expect("no last_modified"))?; Ok(Version { key, last_modified, kind: crate::VersionKind::DeletionMarker, }) }); itertools::process_results(versions.chain(deletes), |n_vds| { versions_and_deletes.extend(n_vds) }) .map_err(DownloadError::Other)?; fn none_if_empty(v: Option) -> Option { v.filter(|v| !v.is_empty()) } version_id_marker = none_if_empty(response.next_version_id_marker); key_marker = none_if_empty(response.next_key_marker); if version_id_marker.is_none() { // The final response is not supposed to be truncated if response.is_truncated.unwrap_or_default() { return Err(DownloadError::Other(anyhow::anyhow!( "Received truncated ListObjectVersions response for prefix={prefix:?}" ))); } break; } if let Some(max_keys) = max_keys { if 
/// Returns the name of the S3 bucket this storage sends its requests to.
pub fn bucket_name(&self) -> &str {
    &self.bucket_name
}
&CancellationToken, ) -> impl Stream> { let kind = RequestKind::List; // s3 sdk wants i32 let mut max_keys = max_keys.map(|mk| mk.get() as i32); // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix .map(|p| self.relative_path_to_s3_object(p)) .or_else(|| { self.prefix_in_bucket.clone().map(|mut s| { s.push(REMOTE_STORAGE_PREFIX_SEPARATOR); s }) }); async_stream::stream! { let _permit = self.permit(kind, cancel).await?; let mut continuation_token = None; 'outer: loop { let started_at = start_measuring_requests(kind); // min of two Options, returning Some if one is value and another is // None (None is smaller than anything, so plain min doesn't work). let request_max_keys = self .max_keys_per_list_response .into_iter() .chain(max_keys.into_iter()) .min(); let mut request = self .client .list_objects_v2() .bucket(self.bucket_name.clone()) .set_prefix(list_prefix.clone()) .set_continuation_token(continuation_token.clone()) .set_max_keys(request_max_keys); if let ListingMode::WithDelimiter = mode { request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); } let request = request.send(); let response = tokio::select! { res = request => Ok(res), _ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout), _ = cancel.cancelled() => Err(DownloadError::Cancelled), }; if let Err(DownloadError::Timeout) = &response { yield Err(DownloadError::Timeout); continue 'outer; } let response = response?; // always yield cancellation errors and stop the stream let response = response .context("Failed to list S3 prefixes") .map_err(DownloadError::Other); let started_at = ScopeGuard::into_inner(started_at); crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &response, started_at); let response = match response { Ok(response) => response, Err(e) => { // The error is potentially retryable, so we must rewind the loop after yielding. 
yield Err(e); continue 'outer; }, }; let keys = response.contents(); let prefixes = response.common_prefixes.as_deref().unwrap_or_default(); tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len()); let mut result = Listing::default(); for object in keys { let key = object.key().expect("response does not contain a key"); let key = self.s3_object_to_relative_path(key); let last_modified = match object.last_modified.map(SystemTime::try_from) { Some(Ok(t)) => t, Some(Err(_)) => { tracing::warn!("Remote storage last_modified {:?} for {} is out of bounds", object.last_modified, key ); SystemTime::now() }, None => { SystemTime::now() } }; let size = object.size.unwrap_or(0) as u64; result.keys.push(ListingObject{ key, last_modified, size, }); if let Some(mut mk) = max_keys { assert!(mk > 0); mk -= 1; if mk == 0 { // limit reached yield Ok(result); break 'outer; } max_keys = Some(mk); } } // S3 gives us prefixes like "foo/", we return them like "foo" result.prefixes.extend(prefixes.iter().filter_map(|o| { Some( self.s3_object_to_relative_path( o.prefix()? 
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR), ), ) })); yield Ok(result); continuation_token = match response.next_continuation_token { Some(new_token) => Some(new_token), None => break, }; } } } async fn list_versions( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, ) -> Result { let kind = RequestKind::ListVersions; let permit = self.permit(kind, cancel).await?; self.list_versions_with_permit(&permit, prefix, mode, max_keys, cancel) .await } async fn head_object( &self, key: &RemotePath, cancel: &CancellationToken, ) -> Result { let kind = RequestKind::Head; let _permit = self.permit(kind, cancel).await?; let started_at = start_measuring_requests(kind); let head_future = self .client .head_object() .bucket(self.bucket_name()) .key(self.relative_path_to_s3_object(key)) .send(); let head_future = tokio::time::timeout(self.timeout, head_future); let res = tokio::select! { res = head_future => res, _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), }; let res = res.map_err(|_e| DownloadError::Timeout)?; // do not incl. timeouts as errors in metrics but cancellations let started_at = ScopeGuard::into_inner(started_at); crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &res, started_at); let data = match res { Ok(object_output) => object_output, Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => { // Count this in the AttemptOutcome::Ok bucket, because 404 is not // an error: we expect to sometimes fetch an object and find it missing, // e.g. when probing for timeline indices. 
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Ok, started_at, ); return Err(DownloadError::NotFound); } Err(e) => { crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Err, started_at, ); return Err(DownloadError::Other( anyhow::Error::new(e).context("s3 head object"), )); } }; let (Some(last_modified), Some(size)) = (data.last_modified, data.content_length) else { return Err(DownloadError::Other(anyhow!( "head_object doesn't contain last_modified or content_length" )))?; }; Ok(ListingObject { key: key.to_owned(), last_modified: SystemTime::try_from(last_modified).map_err(|e| { DownloadError::Other(anyhow!("can't convert time '{last_modified}': {e}")) })?, size: size as u64, }) } async fn upload( &self, from: impl Stream> + Send + Sync + 'static, from_size_bytes: usize, to: &RemotePath, metadata: Option, cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Put; let _permit = self.permit(kind, cancel).await?; let started_at = start_measuring_requests(kind); let body = StreamBody::new(from.map(|x| x.map(Frame::data))); let bytes_stream = ByteStream::new(SdkBody::from_body_1_x(body)); let upload = self .client .put_object() .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) .set_metadata(metadata.map(|m| m.0)) .set_storage_class(self.upload_storage_class.clone()) .content_length(from_size_bytes.try_into()?) .body(bytes_stream) .send(); let upload = tokio::time::timeout(self.timeout, upload); let res = tokio::select! { res = upload => res, _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), }; if let Ok(inner) = &res { // do not incl. 
timeouts as errors in metrics but cancellations let started_at = ScopeGuard::into_inner(started_at); crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, inner, started_at); } match res { Ok(Ok(_put)) => Ok(()), Ok(Err(sdk)) => Err(sdk.into()), Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()), } } async fn copy( &self, from: &RemotePath, to: &RemotePath, cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Copy; let _permit = self.permit(kind, cancel).await?; let timeout = tokio::time::sleep(self.timeout); let started_at = start_measuring_requests(kind); // we need to specify bucket_name as a prefix let copy_source = format!( "{}/{}", self.bucket_name, self.relative_path_to_s3_object(from) ); let op = self .client .copy_object() .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) .set_storage_class(self.upload_storage_class.clone()) .copy_source(copy_source) .send(); let res = tokio::select! { res = op => res, _ = timeout => return Err(TimeoutOrCancel::Timeout.into()), _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), }; let started_at = ScopeGuard::into_inner(started_at); crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &res, started_at); res?; Ok(()) } async fn download( &self, from: &RemotePath, opts: &DownloadOpts, cancel: &CancellationToken, ) -> Result { // if prefix is not none then download file `prefix/from` // if prefix is none then download file `from` self.download_object( GetObjectRequest { bucket: self.bucket_name.clone(), key: self.relative_path_to_s3_object(from), etag: opts.etag.as_ref().map(|e| e.to_string()), range: opts.byte_range_header(), version_id: opts.version_id.as_ref().map(|v| v.0.to_owned()), }, cancel, ) .await } async fn delete_objects( &self, paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Delete; let permit = self.permit(kind, cancel).await?; let mut delete_objects = 
Vec::with_capacity(paths.len()); for path in paths { let obj_id = ObjectIdentifier::builder() .set_key(Some(self.relative_path_to_s3_object(path))) .build() .context("convert path to oid")?; delete_objects.push(obj_id); } self.delete_oids(&permit, &delete_objects, cancel).await } fn max_keys_per_delete(&self) -> usize { MAX_KEYS_PER_DELETE_S3 } async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { let paths = std::array::from_ref(path); self.delete_objects(paths, cancel).await } async fn time_travel_recover( &self, prefix: Option<&RemotePath>, timestamp: SystemTime, done_if_after: SystemTime, cancel: &CancellationToken, complexity_limit: Option, ) -> Result<(), TimeTravelError> { let kind = RequestKind::TimeTravel; let permit = self.permit(kind, cancel).await?; tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}"); let mode = ListingMode::NoDelimiter; let version_listing = self .list_versions_with_permit(&permit, prefix, mode, complexity_limit, cancel) .await .map_err(|err| match err { DownloadError::Other(e) => TimeTravelError::Other(e), DownloadError::Cancelled => TimeTravelError::Cancelled, other => TimeTravelError::Other(other.into()), })?; let versions_and_deletes = version_listing.versions; tracing::info!( "Built list for time travel with {} versions and deletions", versions_and_deletes.len() ); // Work on the list of references instead of the objects directly, // otherwise we get lifetime errors in the sort_by_key call below. let mut versions_and_deletes = versions_and_deletes.iter().collect::>(); versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified)); let mut vds_for_key = HashMap::<_, Vec<_>>::new(); for vd in &versions_and_deletes { let Version { key, .. 
} = &vd; let version_id = vd.version_id().map(|v| v.0.as_str()); if version_id == Some("null") { // TODO: check the behavior of using the SDK on a non-versioned container return Err(TimeTravelError::Other(anyhow!( "Received ListVersions response for key={key} with version_id='null', \ indicating either disabled versioning, or legacy objects with null version id values" ))); } tracing::trace!("Parsing version key={key} kind={:?}", vd.kind); vds_for_key.entry(key).or_default().push(vd); } let warn_threshold = 3; let max_retries = 10; let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled); for (key, versions) in vds_for_key { let last_vd = versions.last().unwrap(); let key = self.relative_path_to_s3_object(key); if last_vd.last_modified > done_if_after { tracing::trace!("Key {key} has version later than done_if_after, skipping"); continue; } // the version we want to restore to. let version_to_restore_to = match versions.binary_search_by_key(×tamp, |tpl| tpl.last_modified) { Ok(v) => v, Err(e) => e, }; if version_to_restore_to == versions.len() { tracing::trace!("Key {key} has no changes since timestamp, skipping"); continue; } let mut do_delete = false; if version_to_restore_to == 0 { // All versions more recent, so the key didn't exist at the specified time point. tracing::trace!( "All {} versions more recent for {key}, deleting", versions.len() ); do_delete = true; } else { match &versions[version_to_restore_to - 1] { Version { kind: VersionKind::Version(version_id), .. } => { let version_id = &version_id.0; tracing::trace!("Copying old version {version_id} for {key}..."); // Restore the state to the last version by copying let source_id = format!("{}/{key}?versionId={version_id}", self.bucket_name); backoff::retry( || async { let op = self .client .copy_object() .bucket(self.bucket_name.clone()) .key(&key) .set_storage_class(self.upload_storage_class.clone()) .copy_source(&source_id) .send(); tokio::select! 
{ res = op => res.map_err(|e| TimeTravelError::Other(e.into())), _ = cancel.cancelled() => Err(TimeTravelError::Cancelled), } }, is_permanent, warn_threshold, max_retries, "copying object version for time_travel_recover", cancel, ) .await .ok_or_else(|| TimeTravelError::Cancelled) .and_then(|x| x)?; tracing::info!(%version_id, %key, "Copied old version in S3"); } Version { kind: VersionKind::DeletionMarker, .. } => { do_delete = true; } } }; if do_delete { if matches!(last_vd.kind, VersionKind::DeletionMarker) { // Key has since been deleted (but there was some history), no need to do anything tracing::trace!("Key {key} already deleted, skipping."); } else { tracing::trace!("Deleting {key}..."); let oid = ObjectIdentifier::builder() .key(key.to_owned()) .build() .map_err(|e| TimeTravelError::Other(e.into()))?; self.delete_oids(&permit, &[oid], cancel) .await .map_err(|e| { // delete_oid0 will use TimeoutOrCancel if TimeoutOrCancel::caused_by_cancel(&e) { TimeTravelError::Cancelled } else { TimeTravelError::Other(e) } })?; } } } Ok(()) } } #[cfg(test)] mod tests { use std::num::NonZeroUsize; use camino::Utf8Path; use crate::{RemotePath, S3Bucket, S3Config}; #[tokio::test] async fn relative_path() { let all_paths = ["", "some/path", "some/path/"]; let all_paths: Vec = all_paths .iter() .map(|x| RemotePath::new(Utf8Path::new(x)).expect("bad path")) .collect(); let prefixes = [ None, Some(""), Some("test/prefix"), Some("test/prefix/"), Some("/test/prefix/"), ]; let expected_outputs = [ vec!["", "some/path", "some/path/"], vec!["/", "/some/path", "/some/path/"], vec![ "test/prefix/", "test/prefix/some/path", "test/prefix/some/path/", ], vec![ "test/prefix/", "test/prefix/some/path", "test/prefix/some/path/", ], vec![ "test/prefix/", "test/prefix/some/path", "test/prefix/some/path/", ], ]; for (prefix_idx, prefix) in prefixes.iter().enumerate() { let config = S3Config { bucket_name: "bucket".to_owned(), bucket_region: "region".to_owned(), prefix_in_bucket: 
//! This module provides a wrapper around a real RemoteStorage implementation that
//! causes the first N attempts at each upload or download operation to fail. For
//! testing purposes.
#[derive(Debug, Hash, Eq, PartialEq)] enum RemoteOp { ListPrefixes(Option), HeadObject(RemotePath), Upload(RemotePath), Download(RemotePath), Delete(RemotePath), DeleteObjects(Vec), TimeTravelRecover(Option), } impl UnreliableWrapper { pub fn new( inner: crate::GenericRemoteStorage, attempts_to_fail: u64, attempt_failure_probability: u64, ) -> Self { assert!(attempts_to_fail > 0); let inner = match inner { GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s), GenericRemoteStorage::AzureBlob(s) => GenericRemoteStorage::AzureBlob(s), GenericRemoteStorage::LocalFs(s) => GenericRemoteStorage::LocalFs(s), // We could also make this a no-op, as in, extract the inner of the passed generic remote storage GenericRemoteStorage::Unreliable(_s) => { panic!("Can't wrap unreliable wrapper unreliably") } GenericRemoteStorage::GCS(s) => GenericRemoteStorage::GCS(s), }; let actual_attempt_failure_probability = cmp::min(attempt_failure_probability, 100); UnreliableWrapper { inner, attempts_to_fail, attempt_failure_probability: actual_attempt_failure_probability, attempts: Mutex::new(HashMap::new()), } } /// /// Common functionality for all operations. /// /// On the first attempts of this operation, return an error. After 'attempts_to_fail' /// attempts, let the operation go ahead, and clear the counter. /// fn attempt(&self, op: RemoteOp) -> anyhow::Result { let mut attempts = self.attempts.lock().unwrap(); let mut rng = rand::rng(); match attempts.entry(op) { Entry::Occupied(mut e) => { let attempts_before_this = { let p = e.get_mut(); *p += 1; *p }; /* BEGIN_HADRON */ // If there are more attempts to fail, fail the request by probability. 
if (attempts_before_this < self.attempts_to_fail) && (rng.random_range(0..=100) < self.attempt_failure_probability) { let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); Err(error) } else { e.remove(); Ok(attempts_before_this) } /* END_HADRON */ } Entry::Vacant(e) => { let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); e.insert(1); Err(error) } } } async fn delete_inner( &self, path: &RemotePath, attempt: bool, cancel: &CancellationToken, ) -> anyhow::Result<()> { if attempt { self.attempt(RemoteOp::Delete(path.clone()))?; } self.inner.delete(path, cancel).await } } // We never construct this, so the type is not important, just has to not be UnreliableWrapper and impl RemoteStorage. type VoidStorage = crate::LocalFs; impl RemoteStorage for UnreliableWrapper { fn list_streaming( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, ) -> impl Stream> + Send { async_stream::stream! 
{
            // Record an attempt first; a simulated failure is propagated as `DownloadError::Other`.
            self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
                .map_err(DownloadError::Other)?;
            // Forward every item of the wrapped storage's stream unchanged.
            let mut stream = self.inner
                .list_streaming(prefix, mode, max_keys, cancel);
            while let Some(item) = stream.next().await {
                yield item;
            }
        }
    }

    // Records a `ListPrefixes` attempt for `prefix`, then forwards to the wrapped storage.
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option,
        cancel: &CancellationToken,
    ) -> Result {
        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
            .map_err(DownloadError::Other)?;
        self.inner.list(prefix, mode, max_keys, cancel).await
    }

    // Shares the `ListPrefixes` attempt counter with `list` and `list_streaming`.
    async fn list_versions(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option,
        cancel: &CancellationToken,
    ) -> Result {
        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
            .map_err(DownloadError::Other)?;
        self.inner
            .list_versions(prefix, mode, max_keys, cancel)
            .await
    }

    // Records a `HeadObject` attempt for `key`, then forwards to the wrapped storage.
    async fn head_object(
        &self,
        key: &RemotePath,
        cancel: &CancellationToken,
    ) -> Result {
        self.attempt(RemoteOp::HeadObject(key.clone()))
            .map_err(DownloadError::Other)?;
        self.inner.head_object(key, cancel).await
    }

    // Records an `Upload` attempt for `to`, then forwards to the wrapped storage.
    async fn upload(
        &self,
        data: impl Stream> + Send + Sync + 'static,
        // S3 PUT request requires the content length to be specified,
        // otherwise it starts to fail with the concurrent connection count increasing.
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option,
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
        self.attempt(RemoteOp::Upload(to.clone()))?;
        self.inner
            .upload(data, data_size_bytes, to, metadata, cancel)
            .await
    }

    // Records a `Download` attempt for `from`, then forwards to the wrapped storage.
    async fn download(
        &self,
        from: &RemotePath,
        opts: &DownloadOpts,
        cancel: &CancellationToken,
    ) -> Result {
        // Note: We treat any byte range as an "attempt" of the same operation.
        // We don't pay attention to the ranges. That's good enough for now.
        self.attempt(RemoteOp::Download(from.clone()))
            .map_err(DownloadError::Other)?;
        self.inner.download(from, opts, cancel).await
    }

    // Deletes a single path, recording a `Delete` attempt for it.
    async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
        self.delete_inner(path, true, cancel).await
    }

    // Records one `DeleteObjects` attempt for the whole batch, then deletes the paths one by
    // one. Individual failures are only counted; the returned error reports how many failed.
    async fn delete_objects(
        &self,
        paths: &[RemotePath],
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
        self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
        let mut error_counter = 0;
        for path in paths {
            // Don't record an attempt because it was already recorded above
            if (self.delete_inner(path, false, cancel).await).is_err() {
                error_counter += 1;
            }
        }
        if error_counter > 0 {
            return Err(anyhow::anyhow!(
                "failed to delete {} objects",
                error_counter
            ));
        }
        Ok(())
    }

    // Forwarded to the wrapped storage; no failure is simulated here.
    fn max_keys_per_delete(&self) -> usize {
        self.inner.max_keys_per_delete()
    }

    async fn copy(
        &self,
        from: &RemotePath,
        to: &RemotePath,
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
        // copy is equivalent to download + upload: it uses the `Download` counter of `from`
        // and the `Upload` counter of `to`.
        self.attempt(RemoteOp::Download(from.clone()))?;
        self.attempt(RemoteOp::Upload(to.clone()))?;
        self.inner.copy_object(from, to, cancel).await
    }

    // Records a `TimeTravelRecover` attempt for `prefix`, then forwards to the wrapped storage.
    async fn time_travel_recover(
        &self,
        prefix: Option<&RemotePath>,
        timestamp: SystemTime,
        done_if_after: SystemTime,
        cancel: &CancellationToken,
        complexity_limit: Option,
    ) -> Result<(), TimeTravelError> {
        self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))
            .map_err(TimeTravelError::Other)?;
        self.inner
            .time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit)
            .await
    }
}

================================================
FILE: libs/remote_storage/src/support.rs
================================================
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};
use std::time::Duration;

use bytes::Bytes;
use futures_util::Stream;
use tokio_util::sync::CancellationToken;

use crate::TimeoutOrCancel;

pin_project_lite::pin_project! {
    /// A `Stream` adapter which carries a permit for the lifetime of the value.
pub(crate) struct PermitCarrying {
        // Not read by the code in this file; stored so the permit lives as long as this value.
        // NOTE(review): the generic parameters of the types in this file look like they were
        // lost when this dump was produced; confirm against the repository.
        permit: tokio::sync::OwnedSemaphorePermit,
        #[pin]
        inner: S,
    }
}

impl PermitCarrying {
    /// Bundles `permit` with the stream `inner`.
    pub(crate) fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
        Self { permit, inner }
    }
}

/// Delegates polling and the size hint to the wrapped stream.
impl Stream for PermitCarrying {
    type Item = ::Item;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> {
        self.project().inner.poll_next(cx)
    }

    fn size_hint(&self) -> (usize, Option) {
        self.inner.size_hint()
    }
}

pin_project_lite::pin_project! {
    pub(crate) struct DownloadStream {
        // Set once `cancellation` has completed; it is not polled again afterwards.
        hit: bool,
        #[pin]
        cancellation: F,
        #[pin]
        inner: S,
    }
}

impl DownloadStream {
    /// Creates a stream over `inner` that is interrupted when `cancellation` completes.
    pub(crate) fn new(cancellation: F, inner: S) -> Self {
        Self {
            cancellation,
            hit: false,
            inner,
        }
    }
}

/// See documentation on [`crate::DownloadStream`] on rationale why `std::io::Error` is used.
impl Stream for DownloadStream
where
    std::io::Error: From,
    F: Future,
    S: Stream>,
{
    type Item = ::Item;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> {
        let this = self.project();

        if !*this.hit {
            // The cancellation future is checked before the inner stream, so a ready
            // cancellation or timeout is reported even when data is available.
            if let Poll::Ready(e) = this.cancellation.poll(cx) {
                *this.hit = true;

                // most likely this will be a std::io::Error wrapping a DownloadError
                let e = Err(std::io::Error::from(e));
                return Poll::Ready(Some(e));
            }
        } else {
            // this would be perfectly valid behaviour for doing a graceful completion on the
            // download for example, but not one we expect to do right now.
            tracing::warn!("continuing polling after having cancelled or timeouted");
        }

        this.inner.poll_next(cx)
    }

    fn size_hint(&self) -> (usize, Option) {
        self.inner.size_hint()
    }
}

/// Fires only on the first cancel or timeout, not on both.
pub(crate) fn cancel_or_timeout(
    timeout: Duration,
    cancel: CancellationToken,
) -> impl std::future::Future + 'static {
    // futures are lazy, they don't do anything before being polled.
    //
    // "precalculate" the wanted deadline before returning the future, so that we can use pause
    // failpoint to trigger a timeout in test.
    let deadline = tokio::time::Instant::now() + timeout;
    async move {
        tokio::select! {
            _ = tokio::time::sleep_until(deadline) => TimeoutOrCancel::Timeout,
            _ = cancel.cancelled() => {
                TimeoutOrCancel::Cancel
            },
        }
    }
}

#[cfg(test)]
mod tests {
    use futures::stream::StreamExt;

    use super::*;
    use crate::DownloadError;

    // Cancelling while the inner stream is pending yields one `Cancelled` error; afterwards
    // the timeout no longer fires.
    #[tokio::test(start_paused = true)]
    async fn cancelled_download_stream() {
        let inner = futures::stream::pending();
        let timeout = Duration::from_secs(120);
        let cancel = CancellationToken::new();

        let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
        let mut stream = std::pin::pin!(stream);

        let mut first = stream.next();

        tokio::select! {
            _ = &mut first => unreachable!("we haven't yet cancelled nor is timeout passed"),
            _ = tokio::time::sleep(Duration::from_secs(1)) => {},
        }

        cancel.cancel();

        let e = first.await.expect("there must be some").unwrap_err();
        assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}");
        let inner = e.get_ref().expect("inner should be set");
        assert!(
            inner
                .downcast_ref::()
                .is_some_and(|e| matches!(e, DownloadError::Cancelled)),
            "{inner:?}"
        );
        let e = DownloadError::from(e);
        assert!(matches!(e, DownloadError::Cancelled), "{e:?}");

        tokio::select! {
            _ = stream.next() => unreachable!("no timeout ever happens as we were already cancelled"),
            _ = tokio::time::sleep(Duration::from_secs(121)) => {},
        }
    }

    // Hitting the timeout yields one `Timeout` error; a later cancellation is not reported.
    #[tokio::test(start_paused = true)]
    async fn timeouted_download_stream() {
        let inner = futures::stream::pending();
        let timeout = Duration::from_secs(120);
        let cancel = CancellationToken::new();

        let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
        let mut stream = std::pin::pin!(stream);

        // because the stream uses 120s timeout and we are paused, we advance to 120s right away.
        let first = stream.next();

        let e = first.await.expect("there must be some").unwrap_err();
        assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}");
        let inner = e.get_ref().expect("inner should be set");
        assert!(
            inner
                .downcast_ref::()
                .is_some_and(|e| matches!(e, DownloadError::Timeout)),
            "{inner:?}"
        );
        let e = DownloadError::from(e);
        assert!(matches!(e, DownloadError::Timeout), "{e:?}");

        cancel.cancel();

        tokio::select! {
            _ = stream.next() => unreachable!("no cancellation ever happens because we already timed out"),
            _ = tokio::time::sleep(Duration::from_secs(121)) => {},
        }
    }

    // After the cancellation error has been delivered, the inner stream can still be polled.
    #[tokio::test]
    async fn notified_but_pollable_after() {
        let inner = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from_static(
            b"hello world",
        ))));
        let timeout = Duration::from_secs(120);
        let cancel = CancellationToken::new();

        cancel.cancel();
        let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
        let mut stream = std::pin::pin!(stream);

        let next = stream.next().await;
        let ioe = next.unwrap().unwrap_err();
        assert!(
            matches!(
                ioe.get_ref().unwrap().downcast_ref::(),
                Some(&DownloadError::Cancelled)
            ),
            "{ioe:?}"
        );

        let next = stream.next().await;
        let bytes = next.unwrap().unwrap();
        assert_eq!(&b"hello world"[..], bytes);
    }
}

================================================
FILE: libs/remote_storage/tests/common/mod.rs
================================================
use std::collections::HashSet;
use std::ops::ControlFlow;
use std::path::PathBuf;
use std::sync::Arc;

use anyhow::Context;
use bytes::Bytes;
use camino::Utf8Path;
use futures::stream::Stream;
use once_cell::sync::OnceCell;
use remote_storage::{Download, GenericRemoteStorage, RemotePath};
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info};

// Guards the one-time logging setup done by `ensure_logging_ready`.
static LOGGING_DONE: OnceCell<()> = OnceCell::new();

// Converts `content` into `Bytes` and returns it as a single-item stream plus its length.
pub(crate) fn upload_stream(
    content: std::borrow::Cow<'static, [u8]>,
) -> (
    impl Stream> + Send + Sync + 'static,
    usize,
) {
    use
std::borrow::Cow;
    let content = match content {
        Cow::Borrowed(x) => Bytes::from_static(x),
        Cow::Owned(vec) => Bytes::from(vec),
    };
    wrap_stream(content)
}

// Returns `content` as a stream yielding exactly one `Ok` item, together with its length.
pub(crate) fn wrap_stream(
    content: bytes::Bytes,
) -> (
    impl Stream> + Send + Sync + 'static,
    usize,
) {
    let len = content.len();
    let content = futures::future::ready(Ok(content));

    (futures::stream::once(content), len)
}

// Reads the whole download stream of `dl` into a vector.
pub(crate) async fn download_to_vec(dl: Download) -> anyhow::Result> {
    let mut buf = Vec::new();
    tokio::io::copy_buf(
        &mut tokio_util::io::StreamReader::new(dl.download_stream),
        &mut buf,
    )
    .await?;
    Ok(buf)
}

// Uploads files `folder{j}/blob_{i}.txt` for i in 1..=upload_tasks_count, where j = i / 7.
// Returns the uploaded paths, as `Break` if any upload task failed and `Continue` otherwise.
// See test description for more details.
pub(crate) async fn upload_simple_remote_data(
    client: &Arc,
    upload_tasks_count: usize,
) -> ControlFlow, HashSet> {
    info!("Creating {upload_tasks_count} remote files");
    let mut upload_tasks = JoinSet::new();
    let cancel = CancellationToken::new();

    for i in 1..upload_tasks_count + 1 {
        let task_client = Arc::clone(client);
        let cancel = cancel.clone();

        upload_tasks.spawn(async move {
            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
            let blob_path = RemotePath::new(
                Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
            )
            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
            debug!("Creating remote item {i} at path {blob_path:?}");

            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
            task_client
                .upload(data, len, &blob_path, None, &cancel)
                .await?;

            Ok::<_, anyhow::Error>(blob_path)
        });
    }

    // Wait for all tasks, keeping the successful paths even when some uploads failed.
    let mut upload_tasks_failed = false;
    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
    while let Some(task_run_result) = upload_tasks.join_next().await {
        match task_run_result
            .context("task join failed")
            .and_then(|task_result| task_result.context("upload task failed"))
        {
            Ok(upload_path) => {
                uploaded_blobs.insert(upload_path);
            }
            Err(e) => {
                error!("Upload task failed: {e:?}");
                upload_tasks_failed = true;
            }
        }
    }

    if upload_tasks_failed {
        ControlFlow::Break(uploaded_blobs)
    } else {
        ControlFlow::Continue(uploaded_blobs)
    }
}

// Deletes all `objects_to_delete` concurrently. Failures are logged and otherwise ignored.
pub(crate) async fn cleanup(
    client: &Arc,
    objects_to_delete: HashSet,
) {
    info!(
        "Removing {} objects from the remote storage during cleanup",
        objects_to_delete.len()
    );
    let cancel = CancellationToken::new();
    let mut delete_tasks = JoinSet::new();
    for object_to_delete in objects_to_delete {
        let task_client = Arc::clone(client);
        let cancel = cancel.clone();
        delete_tasks.spawn(async move {
            debug!("Deleting remote item at path {object_to_delete:?}");
            task_client
                .delete(&object_to_delete, &cancel)
                .await
                .with_context(|| format!("{object_to_delete:?} removal"))
        });
    }

    while let Some(task_run_result) = delete_tasks.join_next().await {
        match task_run_result {
            Ok(task_result) => match task_result {
                Ok(()) => {}
                Err(e) => error!("Delete task failed: {e:?}"),
            },
            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
        }
    }
}

// Prefixes and blob paths that `upload_remote_data` managed to upload.
pub(crate) struct Uploads {
    pub(crate) prefixes: HashSet,
    pub(crate) blobs: HashSet,
}

// Uploads blobs `{base_prefix_str}/sub_prefix_{i}/blob_{i}` for i in 1..=upload_tasks_count.
// Returns what was uploaded, as `Break` if any upload task failed and `Continue` otherwise.
pub(crate) async fn upload_remote_data(
    client: &Arc,
    base_prefix_str: &'static str,
    upload_tasks_count: usize,
) -> ControlFlow {
    info!("Creating {upload_tasks_count} remote files");
    let mut upload_tasks = JoinSet::new();
    let cancel = CancellationToken::new();

    for i in 1..=upload_tasks_count {
        let task_client = Arc::clone(client);
        let cancel = cancel.clone();

        upload_tasks.spawn(async move {
            let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
            let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
            debug!("Creating remote item {i} at path {blob_path:?}");

            let (data, data_len) =
                upload_stream(format!("remote blob data {i}").into_bytes().into());

            /* BEGIN_HADRON */
            // For Azure only: append the payload to a fixed temp file and pass its path as
            // upload metadata. NOTE(review): every upload task appends to the same file.
            let mut metadata = None;
            if matches!(&*task_client, GenericRemoteStorage::AzureBlob(_)) {
                let file_path = "/tmp/dbx_upload_tmp_file.txt";
                {
                    // Open the file in append mode
                    let mut file = std::fs::OpenOptions::new()
                        .append(true)
                        .create(true) // Create the file if it doesn't exist
                        .open(file_path)?;
                    // Append some bytes to the file
                    std::io::Write::write_all(
                        &mut file,
                        &format!("remote blob data {i}").into_bytes(),
                    )?;
                    file.sync_all()?;
                }
                metadata = Some(remote_storage::StorageMetadata::from([(
                    "databricks_azure_put_block",
                    file_path,
                )]));
            }
            /* END_HADRON */

            task_client
                .upload(data, data_len, &blob_path, metadata, &cancel)
                .await?;

            // TODO: Check upload is using the put_block upload.
            // We cannot consume data here since data is moved inside the upload.
            // let total_bytes = data.fold(0, |acc, chunk| async move {
            //     acc + chunk.map(|bytes| bytes.len()).unwrap_or(0)
            // }).await;
            // assert_eq!(total_bytes, data_len);
            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
        });
    }

    // Wait for all tasks, keeping the successful uploads even when some failed.
    let mut upload_tasks_failed = false;
    let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
    while let Some(task_run_result) = upload_tasks.join_next().await {
        match task_run_result
            .context("task join failed")
            .and_then(|task_result| task_result.context("upload task failed"))
        {
            Ok((upload_prefix, upload_path)) => {
                uploaded_prefixes.insert(upload_prefix);
                uploaded_blobs.insert(upload_path);
            }
            Err(e) => {
                error!("Upload task failed: {e:?}");
                upload_tasks_failed = true;
            }
        }
    }

    let uploads = Uploads {
        prefixes: uploaded_prefixes,
        blobs: uploaded_blobs,
    };
    if upload_tasks_failed {
        ControlFlow::Break(uploads)
    } else {
        ControlFlow::Continue(uploads)
    }
}

// Initializes test logging to stdout once; later calls do nothing.
pub(crate) fn ensure_logging_ready() {
    LOGGING_DONE.get_or_init(|| {
        utils::logging::init(
            utils::logging::LogFormat::Test,
            utils::logging::TracingErrorLayerEnablement::Disabled,
            utils::logging::Output::Stdout,
        )
        .expect("logging init failed");
    });
}

================================================
FILE: libs/remote_storage/tests/common/tests.rs
================================================
use
std::collections::HashSet;
use std::num::NonZeroU32;
use std::ops::Bound;
use std::sync::Arc;

use anyhow::Context;
use camino::Utf8Path;
use futures::StreamExt;
use remote_storage::{DownloadError, DownloadOpts, ListingMode, ListingObject, RemotePath};
use test_context::test_context;
use tokio_util::sync::CancellationToken;
use tracing::debug;

use super::{
    MaybeEnabledStorage, MaybeEnabledStorageWithSimpleTestBlobs, MaybeEnabledStorageWithTestBlobs,
};
use crate::common::{download_to_vec, upload_stream, wrap_stream};

/// Tests that S3 client can list all prefixes, even if the response comes paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
/// See the client creation in [`create_s3_client`] for details on the required env vars.
/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
///
/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
/// where
/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
///
/// Then, verifies that the client does return correct prefixes when queried:
/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
///
/// In the `MaybeEnabledStorageWithTestBlobs::setup`, we set the `max_keys_per_list_response` param to limit the keys in a single response.
/// This way, we are able to test the pagination, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
/// as the current default AWS S3 pagination limit is 1000.
/// (see the AWS S3 object-listing API documentation).
///
/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
#[test_context(MaybeEnabledStorageWithTestBlobs)]
#[tokio::test]
async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledStorageWithTestBlobs::Enabled(ctx) => ctx,
        MaybeEnabledStorageWithTestBlobs::Disabled => return Ok(()),
        MaybeEnabledStorageWithTestBlobs::UploadsFailed(e, _) => {
            anyhow::bail!("S3 init failed: {e:?}")
        }
    };

    let cancel = CancellationToken::new();

    let test_client = Arc::clone(&ctx.enabled.client);
    let expected_remote_prefixes = ctx.remote_prefixes.clone();

    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
        .context("common_prefix construction")?;
    // Listing the root with a delimiter must return only the base prefix.
    let root_remote_prefixes = test_client
        .list(None, ListingMode::WithDelimiter, None, &cancel)
        .await?
        .prefixes
        .into_iter()
        .collect::>();
    assert_eq!(
        root_remote_prefixes,
        HashSet::from([base_prefix.clone()]),
        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
    );

    // Listing below the base prefix must return exactly the uploaded sub-prefixes.
    let nested_remote_prefixes = test_client
        .list(
            Some(&base_prefix.add_trailing_slash()),
            ListingMode::WithDelimiter,
            None,
            &cancel,
        )
        .await?
        .prefixes
        .into_iter()
        .collect::>();
    let remote_only_prefixes = nested_remote_prefixes
        .difference(&expected_remote_prefixes)
        .collect::>();
    let missing_uploaded_prefixes = expected_remote_prefixes
        .difference(&nested_remote_prefixes)
        .collect::>();
    assert_eq!(
        remote_only_prefixes.len() + missing_uploaded_prefixes.len(),
        0,
        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
    );

    // list_streaming
    // The same listing via the streaming API must arrive in more than one segment.
    let prefix_with_slash = base_prefix.add_trailing_slash();
    let mut nested_remote_prefixes_st = test_client.list_streaming(
        Some(&prefix_with_slash),
        ListingMode::WithDelimiter,
        None,
        &cancel,
    );
    let mut nested_remote_prefixes_combined = HashSet::new();
    let mut segments = 0;
    let mut segment_max_size = 0;
    while let Some(st) = nested_remote_prefixes_st.next().await {
        let st = st?;
        segment_max_size = segment_max_size.max(st.prefixes.len());
        nested_remote_prefixes_combined.extend(st.prefixes.into_iter());
        segments += 1;
    }
    assert!(segments > 1, "less than 2 segments: {segments}");
    assert!(
        segment_max_size * 2 <= nested_remote_prefixes_combined.len(),
        "double of segment_max_size={segment_max_size} larger number of remote prefixes of {}",
        nested_remote_prefixes_combined.len()
    );
    let remote_only_prefixes = nested_remote_prefixes_combined
        .difference(&expected_remote_prefixes)
        .collect::>();
    let missing_uploaded_prefixes = expected_remote_prefixes
        .difference(&nested_remote_prefixes_combined)
        .collect::>();
    assert_eq!(
        remote_only_prefixes.len() + missing_uploaded_prefixes.len(),
        0,
        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
    );

    Ok(())
}

/// Tests that S3 client can list all files in a folder, even if the response comes paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
/// See `pagination_should_work` for more information.
///
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_simple_remote_data`]
/// Then performs the following queries:
///    1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
///    2. `list("folder1")`.  This should return all files `random_prefix/folder1/blob_{i}.txt`
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
#[tokio::test]
async fn list_no_delimiter_works(
    ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
        MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
        MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => {
            anyhow::bail!("S3 init failed: {e:?}")
        }
    };
    let cancel = CancellationToken::new();
    let test_client = Arc::clone(&ctx.enabled.client);
    let base_prefix =
        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
    let root_files = test_client
        .list(None, ListingMode::NoDelimiter, None, &cancel)
        .await
        .context("client list root files failure")?
        .keys
        .into_iter()
        .map(|o| o.key)
        .collect::>();
    assert_eq!(
        root_files,
        ctx.remote_blobs.clone(),
        "remote storage list on root mismatches with the uploads."
    );

    // Test that max_keys limit works. In total there are about 21 files (see
    // upload_simple_remote_data call in test_real_s3.rs).
    let limited_root_files = test_client
        .list(
            None,
            ListingMode::NoDelimiter,
            Some(NonZeroU32::new(2).unwrap()),
            &cancel,
        )
        .await
        .context("client list root files failure")?;
    assert_eq!(limited_root_files.keys.len(), 2);

    let nested_remote_files = test_client
        .list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel)
        .await
        .context("client list nested files failure")?
        .keys
        .into_iter()
        .map(|o| o.key)
        .collect::>();
    // Expected result: the uploaded blobs whose path starts with "folder1".
    let trim_remote_blobs: HashSet<_> = ctx
        .remote_blobs
        .iter()
        .map(|x| x.get_path())
        .filter(|x| x.starts_with("folder1"))
        .map(|x| RemotePath::new(x).expect("must be valid path"))
        .collect();
    assert_eq!(
        nested_remote_files, trim_remote_blobs,
        "remote storage list on subdirrectory mismatches with the uploads."
);
    Ok(())
}

/// Tests that giving a partial prefix returns all matches (e.g. "/foo" yields "/foobar/baz"),
/// but only with NoDelimiter.
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
#[tokio::test]
async fn list_partial_prefix(
    ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
        MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
        MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => {
            anyhow::bail!("S3 init failed: {e:?}")
        }
    };

    let cancel = CancellationToken::new();
    let test_client = Arc::clone(&ctx.enabled.client);

    // Prefix "fold" should match all "folder{i}" directories with NoDelimiter.
    let objects: HashSet<_> = test_client
        .list(
            Some(&RemotePath::from_string("fold")?),
            ListingMode::NoDelimiter,
            None,
            &cancel,
        )
        .await?
        .keys
        .into_iter()
        .map(|o| o.key)
        .collect();
    assert_eq!(&objects, &ctx.remote_blobs);

    // Prefix "fold" matches nothing with WithDelimiter.
    let objects: HashSet<_> = test_client
        .list(
            Some(&RemotePath::from_string("fold")?),
            ListingMode::WithDelimiter,
            None,
            &cancel,
        )
        .await?
        .keys
        .into_iter()
        .map(|o| o.key)
        .collect();
    assert!(objects.is_empty());

    // Prefix "" matches everything.
    let objects: HashSet<_> = test_client
        .list(
            Some(&RemotePath::from_string("")?),
            ListingMode::NoDelimiter,
            None,
            &cancel,
        )
        .await?
        .keys
        .into_iter()
        .map(|o| o.key)
        .collect();
    assert_eq!(&objects, &ctx.remote_blobs);

    // Prefix "" matches nothing with WithDelimiter.
    let objects: HashSet<_> = test_client
        .list(
            Some(&RemotePath::from_string("")?),
            ListingMode::WithDelimiter,
            None,
            &cancel,
        )
        .await?
        .keys
        .into_iter()
        .map(|o| o.key)
        .collect();
    assert!(objects.is_empty());

    // Prefix "foo" matches nothing.
    let objects: HashSet<_> = test_client
        .list(
            Some(&RemotePath::from_string("foo")?),
            ListingMode::NoDelimiter,
            None,
            &cancel,
        )
        .await?
        .keys
        .into_iter()
        .map(|o| o.key)
        .collect();
    assert!(objects.is_empty());

    // Prefix "folder2/blob" matches.
    let objects: HashSet<_> = test_client
        .list(
            Some(&RemotePath::from_string("folder2/blob")?),
            ListingMode::NoDelimiter,
            None,
            &cancel,
        )
        .await?
        .keys
        .into_iter()
        .map(|o| o.key)
        .collect();
    let expect: HashSet<_> = ctx
        .remote_blobs
        .iter()
        .filter(|o| o.get_path().starts_with("folder2"))
        .cloned()
        .collect();
    assert_eq!(&objects, &expect);

    // Prefix "folder2/foo" matches nothing.
    let objects: HashSet<_> = test_client
        .list(
            Some(&RemotePath::from_string("folder2/foo")?),
            ListingMode::NoDelimiter,
            None,
            &cancel,
        )
        .await?
        .keys
        .into_iter()
        .map(|o| o.key)
        .collect();
    assert!(objects.is_empty());

    Ok(())
}

/// Tests that deleting a path under which nothing was uploaded succeeds.
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledStorage::Enabled(ctx) => ctx,
        MaybeEnabledStorage::Disabled => return Ok(()),
    };

    let cancel = CancellationToken::new();

    let path = RemotePath::new(Utf8Path::new(
        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
    ))
    .with_context(|| "RemotePath conversion")?;

    ctx.client
        .delete(&path, &cancel)
        .await
        .expect("should succeed");

    Ok(())
}

/// Uploads three objects, deletes the first two with one `delete_objects` call, checks that
/// the root listing still reports exactly one prefix, then deletes the third object.
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledStorage::Enabled(ctx) => ctx,
        MaybeEnabledStorage::Disabled => return Ok(()),
    };

    let cancel = CancellationToken::new();

    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
    ctx.client.upload(data, len, &path1, None, &cancel).await?;

    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
    ctx.client.upload(data, len, &path2, None, &cancel).await?;

    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
    ctx.client.upload(data, len, &path3, None, &cancel).await?;

    ctx.client.delete_objects(&[path1, path2], &cancel).await?;

    let prefixes = ctx
        .client
        .list(None, ListingMode::WithDelimiter, None, &cancel)
        .await?
        .prefixes;

    assert_eq!(prefixes.len(), 1);

    ctx.client.delete_objects(&[path3], &cancel).await?;

    Ok(())
}

/// Tests that delete_prefix() will delete all objects matching a prefix, including
/// partial prefixes (i.e. "/foo" matches "/foobar").
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
#[tokio::test]
async fn delete_prefix(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
        MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
        MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => {
            anyhow::bail!("S3 init failed: {e:?}")
        }
    };

    let cancel = CancellationToken::new();
    let test_client = Arc::clone(&ctx.enabled.client);

    /// Asserts that the S3 listing matches the given paths.
    macro_rules! assert_list {
        ($expect:expr) => {{
            let listing = test_client
                .list(None, ListingMode::NoDelimiter, None, &cancel)
                .await?
                .keys
                .into_iter()
                .map(|o| o.key)
                .collect();
            assert_eq!($expect, listing);
        }};
    }

    // We start with the full set of uploaded files.
    let mut expect = ctx.remote_blobs.clone();

    // Deleting a non-existing prefix should do nothing.
    test_client
        .delete_prefix(&RemotePath::from_string("xyz")?, &cancel)
        .await?;
    assert_list!(expect);

    // Prefixes are case-sensitive.
    test_client
        .delete_prefix(&RemotePath::from_string("Folder")?, &cancel)
        .await?;
    assert_list!(expect);

    // Deleting a path which overlaps with an existing object should do nothing. We pick the first
    // path in the set as our common prefix.
    let path = expect.iter().next().expect("empty set").clone().join("xyz");
    test_client.delete_prefix(&path, &cancel).await?;
    assert_list!(expect);

    // Deleting an exact path should work. We pick the first path in the set.
    let path = expect.iter().next().expect("empty set").clone();
    test_client.delete_prefix(&path, &cancel).await?;
    expect.remove(&path);
    assert_list!(expect);

    // Deleting a prefix should delete all matching objects.
    test_client
        .delete_prefix(&RemotePath::from_string("folder0/blob_")?, &cancel)
        .await?;
    expect.retain(|p| !p.get_path().as_str().starts_with("folder0/"));
    assert_list!(expect);

    // Deleting a common prefix should delete all objects.
    test_client
        .delete_prefix(&RemotePath::from_string("fold")?, &cancel)
        .await?;
    expect.clear();
    assert_list!(expect);

    Ok(())
}

/// Uploads one object and downloads it back in full and with several byte ranges,
/// comparing each result against the matching slice of the uploaded data.
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
    let MaybeEnabledStorage::Enabled(ctx) = ctx else {
        return Ok(());
    };

    let cancel = CancellationToken::new();

    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());

    let (data, len) = wrap_stream(orig.clone());

    ctx.client.upload(data, len, &path, None, &cancel).await?;

    // Normal download request
    let dl = ctx
        .client
        .download(&path, &DownloadOpts::default(), &cancel)
        .await?;
    let buf = download_to_vec(dl).await?;
    assert_eq!(&buf, &orig);

    // Full range (end specified)
    let dl = ctx
        .client
        .download(
            &path,
            &DownloadOpts {
                byte_start: Bound::Included(0),
                byte_end: Bound::Excluded(len as u64),
                ..Default::default()
            },
            &cancel,
        )
        .await?;
    let buf = download_to_vec(dl).await?;
    assert_eq!(&buf, &orig);

    // partial range (end specified)
    let dl = ctx
        .client
        .download(
            &path,
            &DownloadOpts {
                byte_start: Bound::Included(4),
                byte_end: Bound::Excluded(10),
..Default::default() }, &cancel, ) .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[4..10]); // partial range (end beyond real end) let dl = ctx .client .download( &path, &DownloadOpts { byte_start: Bound::Included(8), byte_end: Bound::Excluded(len as u64 * 100), ..Default::default() }, &cancel, ) .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[8..]); // Partial range (end unspecified) let dl = ctx .client .download( &path, &DownloadOpts { byte_start: Bound::Included(4), ..Default::default() }, &cancel, ) .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[4..]); // Full range (end unspecified) let dl = ctx .client .download( &path, &DownloadOpts { byte_start: Bound::Included(0), ..Default::default() }, &cancel, ) .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); debug!("Cleanup: deleting file at path {path:?}"); ctx.client .delete(&path, &cancel) .await .with_context(|| format!("{path:?} removal"))?; Ok(()) } /// Tests that conditional downloads work properly, by returning /// DownloadError::Unmodified when the object ETag matches the given ETag. #[test_context(MaybeEnabledStorage)] #[tokio::test] async fn download_conditional(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { let MaybeEnabledStorage::Enabled(ctx) = ctx else { return Ok(()); }; let cancel = CancellationToken::new(); // Create a file. let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))?; let data = bytes::Bytes::from_static("foo".as_bytes()); let (stream, len) = wrap_stream(data); ctx.client.upload(stream, len, &path, None, &cancel).await?; // Download it to obtain its etag. let mut opts = DownloadOpts::default(); let download = ctx.client.download(&path, &opts, &cancel).await?; // Download with the etag yields DownloadError::Unmodified. 
opts.etag = Some(download.etag); let result = ctx.client.download(&path, &opts, &cancel).await; assert!( matches!(result, Err(DownloadError::Unmodified)), "expected DownloadError::Unmodified, got {result:?}" ); // Replace the file contents. let data = bytes::Bytes::from_static("bar".as_bytes()); let (stream, len) = wrap_stream(data); ctx.client.upload(stream, len, &path, None, &cancel).await?; // A download with the old etag should yield the new file. let download = ctx.client.download(&path, &opts, &cancel).await?; assert_ne!(download.etag, opts.etag.unwrap(), "ETag did not change"); // A download with the new etag should yield Unmodified again. opts.etag = Some(download.etag); let result = ctx.client.download(&path, &opts, &cancel).await; assert!( matches!(result, Err(DownloadError::Unmodified)), "expected DownloadError::Unmodified, got {result:?}" ); Ok(()) } #[test_context(MaybeEnabledStorage)] #[tokio::test] async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { let MaybeEnabledStorage::Enabled(ctx) = ctx else { return Ok(()); }; let cancel = CancellationToken::new(); let path = RemotePath::new(Utf8Path::new( format!("{}/file_to_copy", ctx.base_prefix).as_str(), )) .with_context(|| "RemotePath conversion")?; let path_dest = RemotePath::new(Utf8Path::new( format!("{}/file_dest", ctx.base_prefix).as_str(), )) .with_context(|| "RemotePath conversion")?; let orig = bytes::Bytes::from_static("remote blob data content".as_bytes()); let (data, len) = wrap_stream(orig.clone()); ctx.client.upload(data, len, &path, None, &cancel).await?; // Copy the object to path_dest, then download the copy and check its contents match the original ctx.client.copy_object(&path, &path_dest, &cancel).await?; let dl = ctx .client .download(&path_dest, &DownloadOpts::default(), &cancel) .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); debug!("Cleanup: deleting file at path {path:?}"); ctx.client .delete_objects(&[path.clone(), path_dest.clone()], &cancel) .await .with_context(|| format!("{path:?} removal"))?; Ok(())
} /// Tests that head_object works properly. #[test_context(MaybeEnabledStorage)] #[tokio::test] async fn head_object(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { let MaybeEnabledStorage::Enabled(ctx) = ctx else { return Ok(()); }; let cancel = CancellationToken::new(); let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))?; // Errors on missing file. let result = ctx.client.head_object(&path, &cancel).await; assert!( matches!(result, Err(DownloadError::NotFound)), "expected NotFound, got {result:?}" ); // Create the file. let data = bytes::Bytes::from_static("foo".as_bytes()); let (stream, len) = wrap_stream(data); ctx.client.upload(stream, len, &path, None, &cancel).await?; // Fetch the head metadata. let object = ctx.client.head_object(&path, &cancel).await?; assert_eq!( object, ListingObject { key: path.clone(), last_modified: object.last_modified, // ignore size: 3 } ); // Wait for a couple of seconds, and then update the file to check the last // modified timestamp. tokio::time::sleep(std::time::Duration::from_secs(2)).await; let data = bytes::Bytes::from_static("bar".as_bytes()); let (stream, len) = wrap_stream(data); ctx.client.upload(stream, len, &path, None, &cancel).await?; let new = ctx.client.head_object(&path, &cancel).await?; assert!( !new.last_modified .duration_since(object.last_modified)? 
.is_zero(), "last_modified did not advance" ); Ok(()) } ================================================ FILE: libs/remote_storage/tests/test_real_azure.rs ================================================ use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; use std::time::{Duration, UNIX_EPOCH}; use anyhow::Context; use remote_storage::{ AzureConfig, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, }; use test_context::AsyncTestContext; use tracing::info; mod common; #[path = "common/tests.rs"] mod tests_azure; use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data}; const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE"; const BASE_PREFIX: &str = "test"; struct EnabledAzure { client: Arc, base_prefix: &'static str, } impl EnabledAzure { async fn setup(max_keys_in_list_response: Option) -> Self { let client = create_azure_client(max_keys_in_list_response) .await .context("Azure client creation") .expect("Azure client creation failed"); EnabledAzure { client, base_prefix: BASE_PREFIX, } } #[allow(unused)] // this will be needed when moving the timeout integration tests back fn configure_request_timeout(&mut self, timeout: Duration) { match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") { GenericRemoteStorage::AzureBlob(azure) => { let azure = Arc::get_mut(azure).expect("inner Arc::get_mut"); azure.timeout = timeout; } _ => unreachable!(), } } } enum MaybeEnabledStorage { Enabled(EnabledAzure), Disabled, } impl AsyncTestContext for MaybeEnabledStorage { async fn setup() -> Self { ensure_logging_ready(); if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { info!( "`{}` env variable is not set, skipping the test", ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME ); return Self::Disabled; } Self::Enabled(EnabledAzure::setup(None).await) } } enum MaybeEnabledStorageWithTestBlobs { 
Enabled(AzureWithTestBlobs), Disabled, UploadsFailed(anyhow::Error, AzureWithTestBlobs), } struct AzureWithTestBlobs { enabled: EnabledAzure, remote_prefixes: HashSet, remote_blobs: HashSet, } impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { async fn setup() -> Self { ensure_logging_ready(); if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { info!( "`{}` env variable is not set, skipping the test", ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME ); return Self::Disabled; } let max_keys_in_list_response = 10; let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap()); let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await; match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); Self::Enabled(AzureWithTestBlobs { enabled, remote_prefixes: uploads.prefixes, remote_blobs: uploads.blobs, }) } ControlFlow::Break(uploads) => Self::UploadsFailed( anyhow::anyhow!("One or multiple blobs failed to upload to Azure"), AzureWithTestBlobs { enabled, remote_prefixes: uploads.prefixes, remote_blobs: uploads.blobs, }, ), } } async fn teardown(self) { match self { Self::Disabled => {} Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => { cleanup(&ctx.enabled.client, ctx.remote_blobs).await; } } } } enum MaybeEnabledStorageWithSimpleTestBlobs { Enabled(AzureWithSimpleTestBlobs), Disabled, UploadsFailed(anyhow::Error, AzureWithSimpleTestBlobs), } struct AzureWithSimpleTestBlobs { enabled: EnabledAzure, remote_blobs: HashSet, } impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { async fn setup() -> Self { ensure_logging_ready(); if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { info!( "`{}` env variable is not set, skipping the test", ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME ); return Self::Disabled; } let max_keys_in_list_response = 10; let 
upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap()); let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await; match upload_simple_remote_data(&enabled.client, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); Self::Enabled(AzureWithSimpleTestBlobs { enabled, remote_blobs: uploads, }) } ControlFlow::Break(uploads) => Self::UploadsFailed( anyhow::anyhow!("One or multiple blobs failed to upload to Azure"), AzureWithSimpleTestBlobs { enabled, remote_blobs: uploads, }, ), } } async fn teardown(self) { match self { Self::Disabled => {} Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => { cleanup(&ctx.enabled.client, ctx.remote_blobs).await; } } } } async fn create_azure_client( max_keys_per_list_response: Option, ) -> anyhow::Result> { use rand::Rng; let remote_storage_azure_container = env::var("REMOTE_STORAGE_AZURE_CONTAINER").context( "`REMOTE_STORAGE_AZURE_CONTAINER` env var is not set, but real Azure tests are enabled", )?; let remote_storage_azure_region = env::var("REMOTE_STORAGE_AZURE_REGION").context( "`REMOTE_STORAGE_AZURE_REGION` env var is not set, but real Azure tests are enabled", )?; // due to how time works, we've had test runners use the same nanos as bucket prefixes. // millis is just a debugging aid for easier finding the prefix later. let millis = std::time::SystemTime::now() .duration_since(UNIX_EPOCH) .context("random Azure test prefix part calculation")? 
.as_millis(); // millis, like nanos, can be the same for two threads, so add randomness let random = rand::rng().random::(); let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::AzureContainer(AzureConfig { container_name: remote_storage_azure_container, storage_account: None, container_region: remote_storage_azure_region, prefix_in_container: Some(format!("test_{millis}_{random:08x}/")), concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, conn_pool_size: 8, /* BEGIN_HADRON */ put_block_size_mb: Some(1), /* END_HADRON */ }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }; Ok(Arc::new( GenericRemoteStorage::from_config(&remote_storage_config) .await .context("remote storage init")?, )) } ================================================ FILE: libs/remote_storage/tests/test_real_gcs.rs ================================================ #![allow(dead_code)] #![allow(unused)] mod common; use crate::common::{download_to_vec, upload_stream}; use anyhow::Context; use camino::Utf8Path; use futures::StreamExt; use futures::stream::Stream; use remote_storage::{ DownloadKind, DownloadOpts, GCSConfig, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind, StorageMetadata, }; use std::collections::HashMap; #[path = "common/tests.rs"] use std::collections::HashSet; use std::fmt::{Debug, Display}; use std::io::Cursor; use std::ops::Bound; use std::pin::pin; use std::sync::Arc; use std::time::Duration; use std::time::SystemTime; use test_context::{AsyncTestContext, test_context}; use tokio_util::sync::CancellationToken; use utils::backoff; // A minimal working GCS client I can pass around in async context const BASE_PREFIX: &str = "test"; async fn create_gcs_client() -> anyhow::Result> { let bucket_name = std::env::var("GCS_TEST_BUCKET").expect("GCS_TEST_BUCKET must be set"); let gcs_config = GCSConfig { bucket_name, prefix_in_bucket:
Some("testing-path/".into()), max_keys_per_list_response: Some(100), concurrency_limit: std::num::NonZero::new(100).unwrap(), }; let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::GCS(gcs_config), timeout: Duration::from_secs(120), small_timeout: std::time::Duration::from_secs(120), }; Ok(Arc::new( GenericRemoteStorage::from_config(&remote_storage_config) .await .context("remote storage init")?, )) } struct EnabledGCS { client: Arc, base_prefix: &'static str, } impl EnabledGCS { async fn setup() -> Self { let client = create_gcs_client() .await .context("gcs client creation") .expect("gcs client creation failed"); EnabledGCS { client, base_prefix: BASE_PREFIX, } } } impl AsyncTestContext for EnabledGCS { async fn setup() -> Self { Self::setup().await } } #[test_context(EnabledGCS)] #[tokio::test] async fn gcs_get_object_bytes_range_header(ctx: &mut EnabledGCS) -> anyhow::Result<()> { let cancel = CancellationToken::new(); let path = RemotePath::new(Utf8Path::new( format!("{}/000000010000028000000086", ctx.base_prefix).as_str(), )) .with_context(|| "RemotePath conversion")?; let (data, len) = upload_stream("hello, world".as_bytes().into()); ctx.client.upload(data, len, &path, None, &cancel).await?; let opts = DownloadOpts { byte_start: Bound::Included(7), ..Default::default() }; let dl_object = download_to_vec(ctx.client.download(&path, &opts, &cancel).await?).await?; let s = String::from_utf8(dl_object).unwrap(); assert_eq!(5, s.len()); Ok(()) } #[test_context(EnabledGCS)] #[tokio::test] async fn gcs_test_suite(ctx: &mut EnabledGCS) -> anyhow::Result<()> { // ------------------------------------------------ // --- `time_travel_recover`, showcasing `upload`, `delete_objects`, `copy` // ------------------------------------------------ // Our test depends on discrepancies in the clock between GCS and the environment the tests // run in. Therefore, wait a little bit before and after.
The alternative would be // to take the time from GCS response headers. const WAIT_TIME: Duration = Duration::from_millis(3_000); async fn retry(op: O) -> Result where E: Display + Debug + 'static, O: FnMut() -> F, F: Future>, { let warn_threshold = 3; let max_retries = 10; backoff::retry( op, |_e| false, warn_threshold, max_retries, "test retry", &CancellationToken::new(), ) .await .expect("never cancelled") } async fn time_point() -> SystemTime { tokio::time::sleep(WAIT_TIME).await; let ret = SystemTime::now(); tokio::time::sleep(WAIT_TIME).await; ret } async fn list_files( client: &Arc, cancel: &CancellationToken, ) -> anyhow::Result> { Ok( retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel)) .await .context("list root files failure")? .keys .into_iter() .map(|o| o.key) .collect::>(), ) } let cancel = CancellationToken::new(); let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; // ---------------- t0 --------------- // Upload 'path1' retry(|| { let (data, len) = upload_stream("remote blob data1".as_bytes().into()); ctx.client.upload(data, len, &path1, None, &cancel) }) .await?; let t0_files = list_files(&ctx.client, &cancel).await?; let t0 = time_point().await; // Show 'path1' println!("at t0: {t0_files:?}"); // Upload 'path2' let old_data = "remote blob data2"; retry(|| { let (data, len) = upload_stream(old_data.as_bytes().into()); ctx.client.upload(data, len, &path2, None, &cancel) }) .await?; // ---------------- t1 --------------- // Show 'path1' and 'path2' let t1_files = list_files(&ctx.client, &cancel).await?; let t1 = time_point().await; println!("at t1: {t1_files:?}"); { let opts =
DownloadOpts::default(); let dl = retry(|| ctx.client.download(&path2, &opts, &cancel)).await?; let last_modified = dl.last_modified; let half_wt = WAIT_TIME.mul_f32(0.5); let t0_hwt = t0 + half_wt; let t1_hwt = t1 - half_wt; if !(t0_hwt..=t1_hwt).contains(&last_modified) { panic!( "last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \ This likely means a large lock discrepancy between S3 and the local clock." ); } } // Upload 'path3' retry(|| { let (data, len) = upload_stream("remote blob data3".as_bytes().into()); ctx.client.upload(data, len, &path3, None, &cancel) }) .await?; // Overwrite 'path2' let new_data = "new remote blob data2"; retry(|| { let (data, len) = upload_stream(new_data.as_bytes().into()); ctx.client.upload(data, len, &path2, None, &cancel) }) .await?; // Delete 'path1' retry(|| ctx.client.delete(&path1, &cancel)).await?; // Show 'path2' and `path3` let t2_files = list_files(&ctx.client, &cancel).await?; let t2 = time_point().await; println!("at t2: {t2_files:?}"); // No changes after recovery to t2 (no-op) let t_final = time_point().await; ctx.client .time_travel_recover(None, t2, t_final, &cancel, None) .await?; let t2_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t2: {t2_files_recovered:?}"); assert_eq!(t2_files, t2_files_recovered); let path2_recovered_t2 = download_to_vec( ctx.client .download(&path2, &DownloadOpts::default(), &cancel) .await?, ) .await?; assert_eq!(path2_recovered_t2, new_data.as_bytes()); // after recovery to t1: path1 is back, path2 has the old content let t_final = time_point().await; ctx.client .time_travel_recover(None, t1, t_final, &cancel, None) .await?; let t1_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t1: {t1_files_recovered:?}"); assert_eq!(t1_files, t1_files_recovered); let path2_recovered_t1 = download_to_vec( ctx.client .download(&path2, &DownloadOpts::default(), &cancel) .await?, ) .await?; 
assert_eq!(path2_recovered_t1, old_data.as_bytes()); // after recovery to t0: everything is gone except for path1 let t_final = time_point().await; ctx.client .time_travel_recover(None, t0, t_final, &cancel, None) .await?; let t0_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t0: {t0_files_recovered:?}"); assert_eq!(t0_files, t0_files_recovered); // cleanup let paths = &[path1, path2, path3]; retry(|| ctx.client.delete_objects(paths, &cancel)).await?; Ok(()) } ================================================ FILE: libs/remote_storage/tests/test_real_s3.rs ================================================ use std::collections::HashSet; use std::env; use std::fmt::{Debug, Display}; use std::future::Future; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use anyhow::Context; use camino::Utf8Path; use futures_util::StreamExt; use remote_storage::{ DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, }; use test_context::{AsyncTestContext, test_context}; use tokio::io::AsyncBufReadExt; use tokio_util::sync::CancellationToken; use tracing::info; use crate::common::{download_to_vec, upload_stream}; mod common; #[path = "common/tests.rs"] mod tests_s3; use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data}; use utils::backoff; const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; const BASE_PREFIX: &str = "test"; #[test_context(MaybeEnabledStorage)] #[tokio::test] async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { let ctx = match ctx { MaybeEnabledStorage::Enabled(ctx) => ctx, MaybeEnabledStorage::Disabled => return Ok(()), }; // Our test depends on discrepancies in the clock between S3 and the environment the tests // run in. Therefore, wait a little bit before and after. 
The alternative would be // to take the time from S3 response headers. const WAIT_TIME: Duration = Duration::from_millis(3_000); async fn retry(op: O) -> Result where E: Display + Debug + 'static, O: FnMut() -> F, F: Future>, { let warn_threshold = 3; let max_retries = 10; backoff::retry( op, |_e| false, warn_threshold, max_retries, "test retry", &CancellationToken::new(), ) .await .expect("never cancelled") } async fn time_point() -> SystemTime { tokio::time::sleep(WAIT_TIME).await; let ret = SystemTime::now(); tokio::time::sleep(WAIT_TIME).await; ret } async fn list_files( client: &Arc, cancel: &CancellationToken, ) -> anyhow::Result> { Ok( retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel)) .await .context("list root files failure")? .keys .into_iter() .map(|o| o.key) .collect::>(), ) } let cancel = CancellationToken::new(); let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; retry(|| { let (data, len) = upload_stream("remote blob data1".as_bytes().into()); ctx.client.upload(data, len, &path1, None, &cancel) }) .await?; let t0_files = list_files(&ctx.client, &cancel).await?; let t0 = time_point().await; println!("at t0: {t0_files:?}"); let old_data = "remote blob data2"; retry(|| { let (data, len) = upload_stream(old_data.as_bytes().into()); ctx.client.upload(data, len, &path2, None, &cancel) }) .await?; let t1_files = list_files(&ctx.client, &cancel).await?; let t1 = time_point().await; println!("at t1: {t1_files:?}"); // A little check to ensure that our clock is not too far off from the S3 clock { let opts = DownloadOpts::default(); let dl = retry(|| ctx.client.download(&path2, &opts, 
&cancel)).await?; let last_modified = dl.last_modified; let half_wt = WAIT_TIME.mul_f32(0.5); let t0_hwt = t0 + half_wt; let t1_hwt = t1 - half_wt; if !(t0_hwt..=t1_hwt).contains(&last_modified) { panic!( "last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \ This likely means a large lock discrepancy between S3 and the local clock." ); } } retry(|| { let (data, len) = upload_stream("remote blob data3".as_bytes().into()); ctx.client.upload(data, len, &path3, None, &cancel) }) .await?; let new_data = "new remote blob data2"; retry(|| { let (data, len) = upload_stream(new_data.as_bytes().into()); ctx.client.upload(data, len, &path2, None, &cancel) }) .await?; retry(|| ctx.client.delete(&path1, &cancel)).await?; let t2_files = list_files(&ctx.client, &cancel).await?; let t2 = time_point().await; println!("at t2: {t2_files:?}"); // No changes after recovery to t2 (no-op) let t_final = time_point().await; ctx.client .time_travel_recover(None, t2, t_final, &cancel, None) .await?; let t2_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t2: {t2_files_recovered:?}"); assert_eq!(t2_files, t2_files_recovered); let path2_recovered_t2 = download_to_vec( ctx.client .download(&path2, &DownloadOpts::default(), &cancel) .await?, ) .await?; assert_eq!(path2_recovered_t2, new_data.as_bytes()); // after recovery to t1: path1 is back, path2 has the old content let t_final = time_point().await; ctx.client .time_travel_recover(None, t1, t_final, &cancel, None) .await?; let t1_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t1: {t1_files_recovered:?}"); assert_eq!(t1_files, t1_files_recovered); let path2_recovered_t1 = download_to_vec( ctx.client .download(&path2, &DownloadOpts::default(), &cancel) .await?, ) .await?; assert_eq!(path2_recovered_t1, old_data.as_bytes()); // after recovery to t0: everything is gone except for path1 let t_final = time_point().await; ctx.client 
.time_travel_recover(None, t0, t_final, &cancel, None) .await?; let t0_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t0: {t0_files_recovered:?}"); assert_eq!(t0_files, t0_files_recovered); // cleanup let paths = &[path1, path2, path3]; retry(|| ctx.client.delete_objects(paths, &cancel)).await?; Ok(()) } struct EnabledS3 { client: Arc, base_prefix: &'static str, } impl EnabledS3 { async fn setup(max_keys_in_list_response: Option) -> Self { let client = create_s3_client(max_keys_in_list_response) .await .context("S3 client creation") .expect("S3 client creation failed"); EnabledS3 { client, base_prefix: BASE_PREFIX, } } fn configure_request_timeout(&mut self, timeout: Duration) { match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") { GenericRemoteStorage::AwsS3(s3) => { let s3 = Arc::get_mut(s3).expect("inner Arc::get_mut"); s3.timeout = timeout; } _ => unreachable!(), } } } enum MaybeEnabledStorage { Enabled(EnabledS3), Disabled, } impl AsyncTestContext for MaybeEnabledStorage { async fn setup() -> Self { ensure_logging_ready(); if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { info!( "`{}` env variable is not set, skipping the test", ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME ); return Self::Disabled; } Self::Enabled(EnabledS3::setup(None).await) } } enum MaybeEnabledStorageWithTestBlobs { Enabled(S3WithTestBlobs), Disabled, UploadsFailed(anyhow::Error, S3WithTestBlobs), } struct S3WithTestBlobs { enabled: EnabledS3, remote_prefixes: HashSet, remote_blobs: HashSet, } impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { async fn setup() -> Self { ensure_logging_ready(); if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { info!( "`{}` env variable is not set, skipping the test", ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME ); return Self::Disabled; } let max_keys_in_list_response = 10; let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap()); let 
enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await; match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); Self::Enabled(S3WithTestBlobs { enabled, remote_prefixes: uploads.prefixes, remote_blobs: uploads.blobs, }) } ControlFlow::Break(uploads) => Self::UploadsFailed( anyhow::anyhow!("One or multiple blobs failed to upload to S3"), S3WithTestBlobs { enabled, remote_prefixes: uploads.prefixes, remote_blobs: uploads.blobs, }, ), } } async fn teardown(self) { match self { Self::Disabled => {} Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => { cleanup(&ctx.enabled.client, ctx.remote_blobs).await; } } } } enum MaybeEnabledStorageWithSimpleTestBlobs { Enabled(S3WithSimpleTestBlobs), Disabled, UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs), } struct S3WithSimpleTestBlobs { enabled: EnabledS3, remote_blobs: HashSet, } impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { async fn setup() -> Self { ensure_logging_ready(); if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { info!( "`{}` env variable is not set, skipping the test", ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME ); return Self::Disabled; } let max_keys_in_list_response = 10; let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap()); let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await; match upload_simple_remote_data(&enabled.client, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); Self::Enabled(S3WithSimpleTestBlobs { enabled, remote_blobs: uploads, }) } ControlFlow::Break(uploads) => Self::UploadsFailed( anyhow::anyhow!("One or multiple blobs failed to upload to S3"), S3WithSimpleTestBlobs { enabled, remote_blobs: uploads, }, ), } } async fn teardown(self) { match self { Self::Disabled => {} Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) 
=> { cleanup(&ctx.enabled.client, ctx.remote_blobs).await; } } } } async fn create_s3_client( max_keys_per_list_response: Option, ) -> anyhow::Result> { use rand::Rng; let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET") .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?; let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION") .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?; // due to how time works, we've had test runners use the same nanos as bucket prefixes. // millis is just a debugging aid that makes the prefix easier to find later. let millis = std::time::SystemTime::now() .duration_since(UNIX_EPOCH) .context("random s3 test prefix part calculation")? .as_millis(); // millis, like nanos, can be the same for two threads, so add randomness let random = rand::rng().random::(); let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::AwsS3(S3Config { bucket_name: remote_storage_s3_bucket, bucket_region: remote_storage_s3_region, prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")), endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }; Ok(Arc::new( GenericRemoteStorage::from_config(&remote_storage_config) .await .context("remote storage init")?, )) } #[test_context(MaybeEnabledStorage)] #[tokio::test] async fn download_is_timeouted(ctx: &mut MaybeEnabledStorage) { let MaybeEnabledStorage::Enabled(ctx) = ctx else { return; }; let cancel = CancellationToken::new(); let path = RemotePath::new(Utf8Path::new( format!("{}/file_to_copy", ctx.base_prefix).as_str(), )) .unwrap(); let len = upload_large_enough_file(&ctx.client, &path, &cancel).await; let timeout = std::time::Duration::from_secs(5); ctx.configure_request_timeout(timeout); let started_at =
std::time::Instant::now();
    let mut stream = ctx
        .client
        .download(&path, &DownloadOpts::default(), &cancel)
        .await
        .expect("download succeeds")
        .download_stream;

    if started_at.elapsed().mul_f32(0.9) >= timeout {
        tracing::warn!(
            elapsed_ms = started_at.elapsed().as_millis(),
            "timeout might be too low, consumed most of it during headers"
        );
    }

    let first = stream
        .next()
        .await
        .expect("should have the first blob")
        .expect("should have succeeded");

    tracing::info!(len = first.len(), "downloaded first chunk");

    assert!(
        first.len() < len,
        "uploaded file is too small, we downloaded all on first chunk"
    );

    tokio::time::sleep(timeout).await;

    {
        let started_at = std::time::Instant::now();
        let next = stream
            .next()
            .await
            .expect("stream should not have ended yet");

        tracing::info!(
            next.is_err = next.is_err(),
            elapsed_ms = started_at.elapsed().as_millis(),
            "received item after timeout"
        );

        let e = next.expect_err("expected an error, but got a chunk?");

        let inner = e.get_ref().expect("std::io::Error::inner should be set");
        assert!(
            inner
                .downcast_ref::<DownloadError>()
                .is_some_and(|e| matches!(e, DownloadError::Timeout)),
            "{inner:?}"
        );
    }

    ctx.configure_request_timeout(RemoteStorageConfig::DEFAULT_TIMEOUT);

    ctx.client.delete_objects(&[path], &cancel).await.unwrap()
}

/// Checks that cancelling the token while a download stream is being consumed
/// surfaces as `DownloadError::Cancelled` on the next read.
///
/// Does nothing when the storage under test is not enabled.
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
    let MaybeEnabledStorage::Enabled(ctx) = ctx else {
        return;
    };

    let cancel = CancellationToken::new();

    let path = RemotePath::new(Utf8Path::new(
        format!("{}/file_to_copy", ctx.base_prefix).as_str(),
    ))
    .unwrap();

    let file_len = upload_large_enough_file(&ctx.client, &path, &cancel).await;

    {
        let stream = ctx
            .client
            .download(&path, &DownloadOpts::default(), &cancel)
            .await
            .expect("download succeeds")
            .download_stream;

        let mut reader = std::pin::pin!(tokio_util::io::StreamReader::new(stream));

        let first = reader.fill_buf().await.expect("should have the first blob");

        let len = first.len();
        tracing::info!(len, "downloaded first chunk");

        // The test is only meaningful if there is more data left to read
        // after the first chunk.
        assert!(
            first.len() < file_len,
            "uploaded file is too small, we downloaded all on first chunk"
        );

        reader.consume(len);

        cancel.cancel();

        let next = reader.fill_buf().await;

        let e = next.expect_err("expected an error, but got a chunk?");

        // The io::Error must wrap a DownloadError::Cancelled ...
        let inner = e.get_ref().expect("std::io::Error::inner should be set");
        assert!(
            inner
                .downcast_ref::<DownloadError>()
                .is_some_and(|e| matches!(e, DownloadError::Cancelled)),
            "{inner:?}"
        );

        // ... and converting the io::Error back must yield the same variant.
        let e = DownloadError::from(e);

        assert!(matches!(e, DownloadError::Cancelled), "{e:?}");
    }

    // The original token is cancelled by now; cleanup needs a fresh one.
    let cancel = CancellationToken::new();

    ctx.client.delete_objects(&[path], &cancel).await.unwrap();
}

/// Upload a long enough file so that we cannot download it in single chunk
///
/// For s3 the first chunk seems to be less than 10kB, so this has a bit of a safety margin
///
/// The payload is a 24-byte header followed by 128 zero-filled chunks of
/// 1024 bytes each. Returns the total number of bytes uploaded.
async fn upload_large_enough_file(
    client: &GenericRemoteStorage,
    path: &RemotePath,
    cancel: &CancellationToken,
) -> usize {
    let header = bytes::Bytes::from_static("remote blob data content".as_bytes());
    let body = bytes::Bytes::from(vec![0u8; 1024]);
    let contents = std::iter::once(header).chain(std::iter::repeat_n(body, 128));

    // The upload API needs the total length up front, so sum it over a clone
    // of the (cheaply clonable) chunk iterator.
    let len: usize = contents.clone().map(|chunk| chunk.len()).sum();

    let contents = futures::stream::iter(contents.map(std::io::Result::Ok));

    client
        .upload(contents, len, path, None, cancel)
        .await
        .expect("upload succeeds");

    len
}

================================================
FILE: libs/safekeeper_api/Cargo.toml
================================================
[package]
name = "safekeeper_api"
version = "0.1.0"
edition = "2024"
license.workspace = true

[dependencies]
anyhow.workspace = true
const_format.workspace = true
serde.workspace = true
serde_json.workspace = true
postgres_ffi_types.workspace = true
postgres_versioninfo.workspace = true
pq_proto.workspace = true
tokio.workspace = true
utils.workspace = true
pageserver_api.workspace = true

================================================
FILE: libs/safekeeper_api/src/lib.rs
================================================
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]

use const_format::formatcp;
use pq_proto::SystemId;
use serde::{Deserialize, Serialize};

pub mod membership;
/// Public API types
pub mod models;

pub use postgres_versioninfo::{PgMajorVersion, PgVersionId};

/// Consensus logical timestamp. Note: it is a part of sk control file.
pub type Term = u64;
/// With this term timeline is created initially. It
/// is a normal term except wp is never elected with it.
pub const INITIAL_TERM: Term = 0;

/// Information about Postgres. Safekeeper gets it once and then verifies all
/// further connections from computes match. Note: it is a part of sk control
/// file.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ServerInfo {
    /// Postgres server version
    pub pg_version: PgVersionId,
    /// Postgres system identifier.
    pub system_id: SystemId,
    /// WAL segment size (presumably in bytes -- TODO confirm).
    pub wal_seg_size: u32,
}

/// Default port of the pg protocol listener.
pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454;
/// Default pg protocol listen address: loopback on [`DEFAULT_PG_LISTEN_PORT`].
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");

/// Default port of the HTTP listener.
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676;
/// Default HTTP listen address: loopback on [`DEFAULT_HTTP_LISTEN_PORT`].
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");

================================================
FILE: libs/safekeeper_api/src/membership.rs
================================================
//! Types defining safekeeper membership, see
//! rfcs/035-safekeeper-dynamic-membership-change.md
//! for details.

use std::collections::HashSet;
use std::fmt::Display;

use anyhow;
use anyhow::bail;
use serde::{Deserialize, Serialize};
use utils::id::NodeId;

/// 1 is the first valid generation, 0 is used as
/// a placeholder before we fully migrate to generations.
pub const INVALID_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(0);
pub const INITIAL_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(1);

/// Number uniquely identifying safekeeper configuration.
/// Note: it is a part of sk control file.
/// /// Like tenant generations, but for safekeepers. #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] pub struct SafekeeperGeneration(u32); impl SafekeeperGeneration { pub const fn new(v: u32) -> Self { Self(v) } #[track_caller] pub fn previous(&self) -> Option { Some(Self(self.0.checked_sub(1)?)) } #[track_caller] pub fn next(&self) -> Self { Self(self.0 + 1) } pub fn into_inner(self) -> u32 { self.0 } } impl Display for SafekeeperGeneration { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } } /// Membership is defined by ids so e.g. walproposer uses them to figure out /// quorums, but we also carry host and port to give wp idea where to connect. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct SafekeeperId { pub id: NodeId, pub host: String, /// We include here only port for computes -- that is, pg protocol tenant /// only port, or wide pg protocol port if the former is not configured. pub pg_port: u16, } impl Display for SafekeeperId { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "[id={}, ep={}:{}]", self.id, self.host, self.pg_port) } } /// Set of safekeepers. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[serde(transparent)] pub struct MemberSet { pub m: Vec, } impl MemberSet { pub fn empty() -> Self { MemberSet { m: Vec::new() } } pub fn new(members: Vec) -> anyhow::Result { let hs: HashSet = HashSet::from_iter(members.iter().map(|sk| sk.id)); if hs.len() != members.len() { bail!("duplicate safekeeper id in the set {:?}", members); } Ok(MemberSet { m: members }) } pub fn contains(&self, sk: NodeId) -> bool { self.m.iter().any(|m| m.id == sk) } pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> { if self.contains(sk.id) { bail!(format!( "sk {} is already member of the set {}", sk.id, self )); } self.m.push(sk); Ok(()) } } impl Display for MemberSet { /// Display as a comma separated list of members. 
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let sks_str = self
            .m
            .iter()
            .map(|sk| sk.to_string())
            .collect::<Vec<String>>();
        write!(f, "({})", sks_str.join(", "))
    }
}

/// Safekeeper membership configuration.
/// Note: it is a part of both control file and http API.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct Configuration {
    /// Unique id.
    pub generation: SafekeeperGeneration,
    /// Current members of the configuration.
    pub members: MemberSet,
    /// Some means it is a joint conf.
    pub new_members: Option<MemberSet>,
}

impl Configuration {
    /// Used for pre-generations timelines, will be removed eventually.
    pub fn empty() -> Self {
        Configuration {
            generation: INVALID_GENERATION,
            members: MemberSet::empty(),
            new_members: None,
        }
    }

    /// Creates a non-joint configuration of `members` at [`INITIAL_GENERATION`].
    pub fn new(members: MemberSet) -> Self {
        Configuration {
            generation: INITIAL_GENERATION,
            members,
            new_members: None,
        }
    }

    /// Is `sk_id` member of the configuration?
    pub fn contains(&self, sk_id: NodeId) -> bool {
        self.members.contains(sk_id) || self.new_members.as_ref().is_some_and(|m| m.contains(sk_id))
    }
}

impl Display for Configuration {
    /// Formats as `gen=<generation>, members=<set>, new_members=<set or "none">`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "gen={}, members={}, new_members=",
            self.generation, self.members
        )?;
        // Write the optional part directly instead of allocating a temporary
        // String for either case.
        match &self.new_members {
            Some(new_members) => write!(f, "{new_members}"),
            None => f.write_str("none"),
        }
    }
}

#[cfg(test)]
mod tests {
    use utils::id::NodeId;

    use super::{MemberSet, SafekeeperId};

    #[test]
    fn test_member_set() {
        let mut members = MemberSet::empty();
        members
            .add(SafekeeperId {
                id: NodeId(42),
                host: String::from("lala.org"),
                pg_port: 5432,
            })
            .unwrap();

        // Same node id again must be rejected.
        members
            .add(SafekeeperId {
                id: NodeId(42),
                host: String::from("lala.org"),
                pg_port: 5432,
            })
            .expect_err("duplicate must not be allowed");

        members
            .add(SafekeeperId {
                id: NodeId(43),
                host: String::from("bubu.org"),
                pg_port: 5432,
            })
            .unwrap();

        println!("members: {members}");

        // `#[serde(transparent)]` makes the set serialize as a bare array.
        let j = serde_json::to_string(&members).expect("failed to serialize");
        println!("members json: {j}");
        assert_eq!(
            j,
r#"[{"id":42,"host":"lala.org","pg_port":5432},{"id":43,"host":"bubu.org","pg_port":5432}]"#
        );
    }
}

================================================
FILE: libs/safekeeper_api/src/models.rs
================================================
//! Types used in safekeeper http API. Many of them are also reused internally.

// NOTE(review): several `Option`/`Vec` fields below appear without their
// generic argument; it looks like it was lost when this text was extracted.
// Confirm the element types against the repository before relying on them.

use std::net::SocketAddr;

use pageserver_api::shard::ShardIdentity;
use postgres_ffi_types::TimestampTz;
use postgres_versioninfo::PgVersionId;
use serde::{Deserialize, Serialize};
use tokio::time::Instant;
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
use utils::lsn::Lsn;
use utils::pageserver_feedback::PageserverFeedback;

use crate::membership::{Configuration, SafekeeperGeneration};
use crate::{ServerInfo, Term};

/// Safekeeper status response; carries only the node id.
#[derive(Debug, Serialize, Deserialize)]
pub struct SafekeeperStatus {
    pub id: NodeId,
}

/// Request to create a timeline on a safekeeper.
#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineCreateRequest {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    /// Membership configuration for the timeline.
    pub mconf: Configuration,
    pub pg_version: PgVersionId,
    pub system_id: Option,
    // By default WAL_SEGMENT_SIZE
    pub wal_seg_size: Option,
    pub start_lsn: Lsn,
    // Normal creation should omit this field (start_lsn initializes all LSNs).
    // However, we allow specifying custom value higher than start_lsn for
    // manual recovery case, see test_s3_wal_replay.
    pub commit_lsn: Option,
}

/// Same as TermLsn, but serializes LSN using display serializer
/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct TermSwitchApiEntry {
    pub term: Term,
    pub lsn: Lsn,
}

/// Augment AcceptorState with last_log_term for convenience
#[derive(Debug, Serialize, Deserialize)]
pub struct AcceptorStateStatus {
    pub term: Term,
    pub epoch: Term, // aka last_log_term, old `epoch` name is left for compatibility
    pub term_history: Vec,
}

/// Things safekeeper should know about timeline state on peers.
/// Used as both model and internally.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PeerInfo {
    pub sk_id: NodeId,
    pub term: Term,
    /// Term of the last entry.
    pub last_log_term: Term,
    /// LSN of the last record.
    pub flush_lsn: Lsn,
    pub commit_lsn: Lsn,
    /// Since which LSN safekeeper has WAL.
    pub local_start_lsn: Lsn,
    /// When info was received. Serde annotations are not very useful but make
    /// the code compile -- we don't rely on this field externally.
    #[serde(skip)]
    #[serde(default = "Instant::now")]
    pub ts: Instant,
    pub pg_connstr: String,
    pub http_connstr: String,
    // NOTE(review): generic argument missing here (probably lost in
    // extraction) -- confirm the type.
    pub https_connstr: Option,
}

pub type FullTransactionId = u64;

/// Hot standby feedback received from replica
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct HotStandbyFeedback {
    pub ts: TimestampTz,
    pub xmin: FullTransactionId,
    pub catalog_xmin: FullTransactionId,
}

pub const INVALID_FULL_TRANSACTION_ID: FullTransactionId = 0;

impl HotStandbyFeedback {
    /// Returns feedback with the timestamp and both xmin fields set to 0
    /// (the same value as [`INVALID_FULL_TRANSACTION_ID`]).
    pub fn empty() -> HotStandbyFeedback {
        HotStandbyFeedback {
            ts: 0,
            xmin: 0,
            catalog_xmin: 0,
        }
    }
}

/// Standby status update
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct StandbyReply {
    pub write_lsn: Lsn, // The location of the last WAL byte + 1 received and written to disk in the standby.
    pub flush_lsn: Lsn, // The location of the last WAL byte + 1 flushed to disk in the standby.
    pub apply_lsn: Lsn, // The location of the last WAL byte + 1 applied in the standby.
    pub reply_ts: TimestampTz, // The client's system clock at the time of transmission, as microseconds since midnight on 2000-01-01.
pub reply_requested: bool,
}

impl StandbyReply {
    /// Returns a reply with all LSNs set to `Lsn::INVALID`, a zero timestamp
    /// and no reply requested.
    pub fn empty() -> Self {
        StandbyReply {
            write_lsn: Lsn::INVALID,
            flush_lsn: Lsn::INVALID,
            apply_lsn: Lsn::INVALID,
            reply_ts: 0,
            reply_requested: false,
        }
    }
}

/// Standby reply combined with hot standby feedback.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct StandbyFeedback {
    pub reply: StandbyReply,
    pub hs_feedback: HotStandbyFeedback,
}

impl StandbyFeedback {
    /// Returns feedback built from `StandbyReply::empty()` and
    /// `HotStandbyFeedback::empty()`.
    pub fn empty() -> Self {
        StandbyFeedback {
            reply: StandbyReply::empty(),
            hs_feedback: HotStandbyFeedback::empty(),
        }
    }
}

/// Receiver is either pageserver or regular standby, which have different
/// feedbacks.
/// Used as both model and internally.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum ReplicationFeedback {
    Pageserver(PageserverFeedback),
    Standby(StandbyFeedback),
}

/// Uniquely identifies a WAL service connection. Logged in spans for
/// observability.
pub type ConnectionId = u32;

/// Serialize is used only for json'ing in API response. Also used internally.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum WalSenderState {
    Vanilla(VanillaWalSenderState),
    Interpreted(InterpretedWalSenderState),
}

/// State carried by the `WalSenderState::Vanilla` variant.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VanillaWalSenderState {
    pub ttid: TenantTimelineId,
    pub addr: SocketAddr,
    pub conn_id: ConnectionId,
    // postgres application_name
    // NOTE(review): generic argument missing (probably lost in extraction).
    pub appname: Option,
    pub feedback: ReplicationFeedback,
}

/// State carried by the `WalSenderState::Interpreted` variant; same fields as
/// the vanilla state plus the shard identity.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InterpretedWalSenderState {
    pub ttid: TenantTimelineId,
    pub shard: ShardIdentity,
    pub addr: SocketAddr,
    pub conn_id: ConnectionId,
    // postgres application_name
    // NOTE(review): generic argument missing (probably lost in extraction).
    pub appname: Option,
    pub feedback: ReplicationFeedback,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalReceiverState {
    /// None means it is recovery initiated by us (this safekeeper).
    // NOTE(review): generic argument missing (probably lost in extraction).
    pub conn_id: Option,
    pub status: WalReceiverStatus,
}

/// Walreceiver status. Currently only whether it passed voting stage and
/// started receiving the stream, but it is easy to add more if needed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum WalReceiverStatus {
    Voting,
    Streaming,
}

/// Info about timeline on safekeeper ready for reporting.
#[derive(Debug, Serialize, Deserialize)]
pub struct TimelineStatus {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub mconf: Configuration,
    pub acceptor_state: AcceptorStateStatus,
    pub pg_info: ServerInfo,
    pub flush_lsn: Lsn,
    pub timeline_start_lsn: Lsn,
    pub local_start_lsn: Lsn,
    pub commit_lsn: Lsn,
    pub backup_lsn: Lsn,
    pub peer_horizon_lsn: Lsn,
    pub remote_consistent_lsn: Lsn,
    // NOTE(review): the element types of the three vectors below are missing
    // (probably lost in extraction) -- confirm against the repository.
    pub peers: Vec,
    pub walsenders: Vec,
    pub walreceivers: Vec,
}

/// Request to switch membership configuration.
#[derive(Clone, Serialize, Deserialize)]
#[serde(transparent)]
pub struct TimelineMembershipSwitchRequest {
    pub mconf: Configuration,
}

/// In response both previous and current configuration are sent.
#[derive(Serialize, Deserialize)]
pub struct TimelineMembershipSwitchResponse {
    pub previous_conf: Configuration,
    pub current_conf: Configuration,
    pub last_log_term: Term,
    pub flush_lsn: Lsn,
}

/// Result of a timeline delete request.
#[derive(Clone, Copy, Serialize, Deserialize)]
pub struct TimelineDeleteResult {
    pub dir_existed: bool,
}

// NOTE(review): key/value type arguments missing (probably lost in extraction).
pub type TenantDeleteResult = std::collections::HashMap;

/// Serde default provider for LSN fields: `Lsn::INVALID`.
fn lsn_invalid() -> Lsn {
    Lsn::INVALID
}

/// Data about safekeeper's timeline, mirrors broker.proto.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct SkTimelineInfo {
    /// Term.
    pub term: Option,
    /// Term of the last entry.
    pub last_log_term: Option,
    /// LSN of the last record.
    #[serde(default = "lsn_invalid")]
    pub flush_lsn: Lsn,
    /// Up to which LSN safekeeper regards its WAL as committed.
    #[serde(default = "lsn_invalid")]
    pub commit_lsn: Lsn,
    /// LSN up to which safekeeper has backed WAL.
    #[serde(default = "lsn_invalid")]
    pub backup_lsn: Lsn,
    /// LSN of last checkpoint uploaded by pageserver.
#[serde(default = "lsn_invalid")]
    pub remote_consistent_lsn: Lsn,
    #[serde(default = "lsn_invalid")]
    pub peer_horizon_lsn: Lsn,
    #[serde(default = "lsn_invalid")]
    pub local_start_lsn: Lsn,
    /// A connection string to use for WAL receiving.
    // NOTE(review): the `Option` fields below are missing their generic
    // argument (probably lost in extraction) -- confirm the types.
    #[serde(default)]
    pub safekeeper_connstr: Option,
    #[serde(default)]
    pub http_connstr: Option,
    #[serde(default)]
    pub https_connstr: Option,
    // Minimum of all active RO replicas flush LSN
    #[serde(default = "lsn_invalid")]
    pub standby_horizon: Lsn,
}

/// Request to copy a timeline into `target_timeline_id` up to `until_lsn`.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct TimelineCopyRequest {
    pub target_timeline_id: TimelineId,
    pub until_lsn: Lsn,
}

#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct TimelineTermBumpRequest {
    /// bump to
    pub term: Option,
}

#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct TimelineTermBumpResponse {
    // before the request
    pub previous_term: u64,
    pub current_term: u64,
}

#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct SafekeeperUtilization {
    pub timeline_count: u64,
}

/// pull_timeline request body.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct PullTimelineRequest {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub http_hosts: Vec,
    /// Membership configuration to switch to after pull.
    /// It guarantees that if pull_timeline returns successfully, the timeline will
    /// not be deleted by request with an older generation.
    /// Storage controller always sets this field.
    /// None is only allowed for manual pull_timeline requests.
    pub mconf: Option,
}

#[derive(Debug, Serialize, Deserialize)]
pub struct PullTimelineResponse {
    /// Donor safekeeper host.
    /// None if no pull happened because the timeline already exists.
    pub safekeeper_host: Option,
    // TODO: add more fields?
}

/// Response to a timeline locate request.
/// Storcon-only API.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct TimelineLocateResponse {
    pub generation: SafekeeperGeneration,
    // NOTE(review): element types of the two sets below are missing
    // (probably lost in extraction) -- confirm against the repository.
    pub sk_set: Vec,
    pub new_sk_set: Option>,
}

================================================
FILE: libs/tenant_size_model/.gitignore
================================================
*.dot
*.png
*.svg

================================================
FILE: libs/tenant_size_model/Cargo.toml
================================================
[package]
name = "tenant_size_model"
version = "0.1.0"
edition.workspace = true
publish = false
license.workspace = true

[dependencies]
anyhow.workspace = true
serde.workspace = true
serde_json.workspace = true

================================================
FILE: libs/tenant_size_model/Makefile
================================================
all: 1.svg 2.svg 3.svg 4.svg 1.png 2.png 3.png 4.png

../../target/debug/tenant_size_model: Cargo.toml src/main.rs src/lib.rs
	cargo build --bin tenant_size_model

%.svg: %.dot
	dot -Tsvg $< > $@

%.png: %.dot
	dot -Tpng $< > $@

%.dot: ../../target/debug/tenant_size_model
	../../target/debug/tenant_size_model $* > $@

================================================
FILE: libs/tenant_size_model/README.md
================================================
# Logical size + WAL pricing

This is a simulator to calculate the tenant size in different scenarios,
using the "Logical size + WAL" method.
Makefile produces diagrams used in a private presentation: https://docs.google.com/presentation/d/1OapE4k11xmcwMh7I7YvNWGC63yCRLh6udO9bXZ-fZmo/edit?usp=sharing ================================================ FILE: libs/tenant_size_model/src/calculation.rs ================================================ use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel}; // // *-g--*---D---> // / // / // / *---b----*-B---> // / / // / / // -----*--e---*-----f----* C // E \ // \ // *--a---*---A--> // // If A and B need to be retained, is it cheaper to store // snapshot at C+a+b, or snapshots at A and B ? // // If D also needs to be retained, which is cheaper: // // 1. E+g+e+f+a+b // 2. D+C+a+b // 3. D+A+B /// `Segment` which has had its size calculated. #[derive(Clone, Debug)] struct SegmentSize { method: SegmentMethod, // calculated size of this subtree, using this method accum_size: u64, seg_id: usize, children: Vec, } struct SizeAlternatives { /// cheapest alternative if parent is available. incremental: SegmentSize, /// cheapest alternative if parent node is not available non_incremental: Option, } impl StorageModel { pub fn calculate(&self) -> SizeResult { // Build adjacency list. 'child_list' is indexed by segment id. Each entry // contains a list of all child segments of the segment. 
let mut roots: Vec = Vec::new(); let mut child_list: Vec> = Vec::new(); child_list.resize(self.segments.len(), Vec::new()); for (seg_id, seg) in self.segments.iter().enumerate() { if let Some(parent_id) = seg.parent { child_list[parent_id].push(seg_id); } else { roots.push(seg_id); } } let mut segment_results = Vec::new(); segment_results.resize( self.segments.len(), SegmentSizeResult { method: SegmentMethod::Skipped, accum_size: 0, }, ); let mut total_size = 0; for root in roots { if let Some(selected) = self.size_here(root, &child_list).non_incremental { StorageModel::fill_selected_sizes(&selected, &mut segment_results); total_size += selected.accum_size; } else { // Couldn't find any way to get this root. Error? } } SizeResult { // If total_size is 0, it means that the tenant has all timelines offloaded; we need to report 1 // here so that the data point shows up in the s3 files. total_size: total_size.max(1), segments: segment_results, } } fn fill_selected_sizes(selected: &SegmentSize, result: &mut Vec) { result[selected.seg_id] = SegmentSizeResult { method: selected.method, accum_size: selected.accum_size, }; // recurse to children for child in selected.children.iter() { StorageModel::fill_selected_sizes(child, result); } } // // This is the core of the sizing calculation. // // This is a recursive function, that for each Segment calculates the best way // to reach all the Segments that are marked as needed in this subtree, under two // different conditions: // a) when the parent of this segment is available (as a snaphot or through WAL), and // b) when the parent of this segment is not available. // fn size_here(&self, seg_id: usize, child_list: &Vec>) -> SizeAlternatives { let seg = &self.segments[seg_id]; // First figure out the best way to get each child let mut children = Vec::new(); for child_id in &child_list[seg_id] { children.push(self.size_here(*child_id, child_list)) } // Method 1. 
If this node is not needed, we can skip it as long as we // take snapshots later in each sub-tree let snapshot_later = if !seg.needed { let mut snapshot_later = SegmentSize { seg_id, method: SegmentMethod::Skipped, accum_size: 0, children: Vec::new(), }; let mut possible = true; for child in children.iter() { if let Some(non_incremental) = &child.non_incremental { snapshot_later.accum_size += non_incremental.accum_size; snapshot_later.children.push(non_incremental.clone()) } else { possible = false; break; } } if possible { Some(snapshot_later) } else { None } } else { None }; // Method 2. Get a snapshot here. This assumed to be possible, if the 'size' of // this Segment was given. let snapshot_here = if !seg.needed || seg.parent.is_none() { if let Some(snapshot_size) = seg.size { let mut snapshot_here = SegmentSize { seg_id, method: SegmentMethod::SnapshotHere, accum_size: snapshot_size, children: Vec::new(), }; for child in children.iter() { snapshot_here.accum_size += child.incremental.accum_size; snapshot_here.children.push(child.incremental.clone()) } Some(snapshot_here) } else { None } } else { None }; // Method 3. Use WAL to get here from parent let wal_here = { let mut wal_here = SegmentSize { seg_id, method: SegmentMethod::Wal, accum_size: if let Some(parent_id) = seg.parent { seg.lsn - self.segments[parent_id].lsn } else { 0 }, children: Vec::new(), }; for child in children { wal_here.accum_size += child.incremental.accum_size; wal_here.children.push(child.incremental) } wal_here }; // If the parent is not available, what's the cheapest method involving // a snapshot here or later? 
let mut cheapest_non_incremental: Option = None; if let Some(snapshot_here) = snapshot_here { cheapest_non_incremental = Some(snapshot_here); } if let Some(snapshot_later) = snapshot_later { // Use <=, to prefer skipping if the size is equal if let Some(parent) = &cheapest_non_incremental { if snapshot_later.accum_size <= parent.accum_size { cheapest_non_incremental = Some(snapshot_later); } } else { cheapest_non_incremental = Some(snapshot_later); } } // And what's the cheapest method, if the parent is available? let cheapest_incremental = if let Some(cheapest_non_incremental) = &cheapest_non_incremental { // Is it cheaper to use a snapshot here or later, anyway? // Use <, to prefer Wal over snapshot if the cost is the same if wal_here.accum_size < cheapest_non_incremental.accum_size { wal_here } else { cheapest_non_incremental.clone() } } else { wal_here }; SizeAlternatives { incremental: cheapest_incremental, non_incremental: cheapest_non_incremental, } } } ================================================ FILE: libs/tenant_size_model/src/lib.rs ================================================ //! Synthetic size calculation #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] mod calculation; pub mod svg; /// StorageModel is the input to the synthetic size calculation. /// /// It represents a tree of timelines, with just the information that's needed /// for the calculation. This doesn't track timeline names or where each timeline /// begins and ends, for example. Instead, it consists of "points of interest" /// on the timelines. A point of interest could be the timeline start or end point, /// the oldest point on a timeline that needs to be retained because of PITR /// cutoff, or snapshot points named by the user. For each such point, and the /// edge connecting the points (implicit in Segment), we store information about /// whether we need to be able to recover to the point, and if known, the logical /// size at the point. 
/// /// The segments must form a well-formed tree, with no loops. #[derive(serde::Serialize)] pub struct StorageModel { pub segments: Vec, } /// Segment represents one point in the tree of branches, *and* the edge that leads /// to it (if any). We don't need separate structs for points and edges, because each /// point can have only one parent. /// /// When 'needed' is true, it means that we need to be able to reconstruct /// any version between 'parent.lsn' and 'lsn'. If you want to represent that only /// a single point is needed, create two Segments with the same lsn, and mark only /// the child as needed. /// #[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)] pub struct Segment { /// Previous segment index into ['Storage::segments`], if any. pub parent: Option, /// LSN at this point pub lsn: u64, /// Logical size at this node, if known. pub size: Option, /// If true, the segment from parent to this node is needed by `retention_period` pub needed: bool, } /// Result of synthetic size calculation. Returned by StorageModel::calculate() pub struct SizeResult { pub total_size: u64, // This has same length as the StorageModel::segments vector in the input. // Each entry in this array corresponds to the entry with same index in // StorageModel::segments. 
pub segments: Vec<SegmentSizeResult>,
}

#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct SegmentSizeResult {
    pub method: SegmentMethod,
    // calculated size of this subtree, using this method
    pub accum_size: u64,
}

/// Different methods to retain history from a particular state
#[derive(Clone, Copy, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum SegmentMethod {
    SnapshotHere, // A logical snapshot is needed after this segment
    Wal,          // Keep WAL leading up to this node
    Skipped,
}

================================================
FILE: libs/tenant_size_model/src/svg.rs
================================================
use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
use std::fmt::Write;

const SVG_WIDTH: f32 = 500.0;

/// Different branch kind for SVG drawing.
#[derive(PartialEq)]
pub enum SvgBranchKind {
    Timeline,
    Lease,
}

/// Drawing state: the inputs borrowed from the caller plus the layout that
/// `calculate_svg_layout` fills in.
struct SvgDraw<'a> {
    storage: &'a StorageModel,
    branches: &'a [String],
    seg_to_branch: &'a [(usize, SvgBranchKind)],
    sizes: &'a [SegmentSizeResult],

    // layout
    xscale: f32,
    min_lsn: u64,
    seg_coordinates: Vec<(f32, f32)>,
}

/// Appends the legend entries to `result`.
// NOTE(review): the empty string literals below (and in `draw_svg`) look like
// SVG markup that was lost when this text was extracted; confirm against the
// repository. They are reproduced unchanged here.
fn draw_legend(result: &mut String) -> anyhow::Result<()> {
    writeln!(
        result,
        ""
    )?;
    writeln!(result, "logical snapshot")?;
    writeln!(
        result,
        ""
    )?;
    writeln!(
        result,
        "WAL within retention period"
    )?;
    writeln!(
        result,
        ""
    )?;
    writeln!(
        result,
        "WAL retained to avoid copy"
    )?;
    writeln!(
        result,
        ""
    )?;
    writeln!(result, "WAL not retained")?;
    writeln!(
        result,
        ""
    )?;
    writeln!(result, "LSN lease")?;
    Ok(())
}

/// Renders the model and its calculated sizes into a string.
///
/// `seg_to_branch` is indexed by segment id and gives, for each segment, the
/// index into `branches` it is drawn on and the kind of branch.
pub fn draw_svg(
    storage: &StorageModel,
    branches: &[String],
    seg_to_branch: &[(usize, SvgBranchKind)],
    sizes: &SizeResult,
) -> anyhow::Result<String> {
    let mut draw = SvgDraw {
        storage,
        branches,
        seg_to_branch,
        sizes: &sizes.segments,

        xscale: 0.0,
        min_lsn: 0,
        seg_coordinates: Vec::new(),
    };

    let mut result = String::new();

    writeln!(
        result,
        ""
    )?;

    draw.calculate_svg_layout();

    // Draw the tree
    for (seg_id, _seg) in storage.segments.iter().enumerate() {
        draw.draw_seg_phase1(seg_id,
&mut result)?;
    }

    // Draw snapshots
    for (seg_id, _seg) in storage.segments.iter().enumerate() {
        draw.draw_seg_phase2(seg_id, &mut result)?;
    }

    draw_legend(&mut result)?;

    write!(result, "")?;

    Ok(result)
}

impl SvgDraw<'_> {
    /// Computes the x scale and the (x, y) coordinate of every segment and
    /// stores them in `self`.
    fn calculate_svg_layout(&mut self) {
        // Find x scale
        let segments = &self.storage.segments;
        // Fall back to 0 for an empty model: folding from u64::MAX / 0 would
        // make the `max_lsn - min_lsn` subtraction below underflow.
        let min_lsn = segments.iter().map(|s| s.lsn).min().unwrap_or(0);
        let max_lsn = segments.iter().map(|s| s.lsn).max().unwrap_or(0);

        // Start with 1 pixel = 1 byte. Double the scale until it fits into the image
        let mut xscale = 1.0;
        while (max_lsn - min_lsn) as f32 / xscale > SVG_WIDTH {
            xscale *= 2.0;
        }

        // Layout the timelines on Y dimension.
        // TODO
        let mut y = 120.0;
        let mut branch_y_coordinates = Vec::new();
        for _branch in self.branches {
            branch_y_coordinates.push(y);
            y += 40.0;
        }

        // Calculate coordinates for each point
        let seg_coordinates = std::iter::zip(segments, self.seg_to_branch)
            .map(|(seg, (branch_id, _))| {
                let x = (seg.lsn - min_lsn) as f32 / xscale;
                let y = branch_y_coordinates[*branch_id];
                (x, y)
            })
            .collect();

        self.xscale = xscale;
        self.min_lsn = min_lsn;
        self.seg_coordinates = seg_coordinates;
    }

    /// Draws lines between points
    fn draw_seg_phase1(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> {
        let seg = &self.storage.segments[seg_id];

        let wal_bytes = if let Some(parent_id) = seg.parent {
            seg.lsn - self.storage.segments[parent_id].lsn
        } else {
            0
        };

        // Thick black for WAL that is needed, thinner black for other
        // retained WAL, thin gray for everything else.
        let style = match self.sizes[seg_id].method {
            SegmentMethod::SnapshotHere => "stroke-width=\"1\" stroke=\"gray\"",
            SegmentMethod::Wal if seg.needed && wal_bytes > 0 => {
                "stroke-width=\"6\" stroke=\"black\""
            }
            SegmentMethod::Wal => "stroke-width=\"3\" stroke=\"black\"",
            SegmentMethod::Skipped => "stroke-width=\"1\" stroke=\"gray\"",
        };
        if let Some(parent_id) = seg.parent {
            let (x1, y1) = self.seg_coordinates[parent_id];
            let (x2, y2) = self.seg_coordinates[seg_id];

            writeln!(
                result,
                "",
            )?;
            writeln!(
                result,
                " {wal_bytes} bytes of WAL (seg {seg_id})"
            )?;
writeln!(result, "")?;
        } else {
            // draw a little dash to mark the starting point of this branch
            let (x, y) = self.seg_coordinates[seg_id];
            let (x1, y1) = (x, y - 5.0);
            let (x2, y2) = (x, y + 5.0);

            writeln!(
                result,
                "",
            )?;
            writeln!(result, " (seg {seg_id})")?;
            writeln!(result, "")?;
        }

        Ok(())
    }

    /// Draw circles where snapshots are taken
    fn draw_seg_phase2(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> {
        let seg = &self.storage.segments[seg_id];

        // draw a snapshot point if it's needed
        let (coord_x, coord_y) = self.seg_coordinates[seg_id];

        // Lease segments additionally get a vertical blue marker.
        let (_, kind) = &self.seg_to_branch[seg_id];
        if kind == &SvgBranchKind::Lease {
            let (x1, y1) = (coord_x, coord_y - 10.0);
            let (x2, y2) = (coord_x, coord_y + 10.0);

            let style = "stroke-width=\"3\" stroke=\"blue\"";

            writeln!(
                result,
                "",
            )?;
            writeln!(result, " leased lsn at {}", seg.lsn)?;
            writeln!(result, "")?;
        }

        if self.sizes[seg_id].method == SegmentMethod::SnapshotHere {
            writeln!(
                result,
                "",
            )?;
            writeln!(
                result,
                " logical size {}",
                seg.size.unwrap()
            )?;
            write!(result, "")?;
        }

        Ok(())
    }
}

================================================
FILE: libs/tenant_size_model/tests/tests.rs
================================================
//! Tenant size model tests.

use tenant_size_model::{Segment, SizeResult, StorageModel};

use std::collections::HashMap;

/// Test helper that builds up a tree of segments branch by branch.
struct ScenarioBuilder {
    segments: Vec<Segment>,

    /// Mapping from the branch name to the index of a segment describing its latest state.
    branches: HashMap<String, usize>,
}

impl ScenarioBuilder {
    /// Creates a new storage with the given default branch name.
    pub fn new(initial_branch: &str) -> ScenarioBuilder {
        let init_segment = Segment {
            parent: None,
            lsn: 0,
            size: Some(0),
            needed: false, // determined later
        };

        ScenarioBuilder {
            segments: vec![init_segment],
            branches: HashMap::from([(initial_branch.into(), 0)]),
        }
    }

    /// Advances the branch with the named operation, by the relative LSN and logical size bytes.
pub fn modify_branch(&mut self, branch: &str, lsn_bytes: u64, size_bytes: i64) { let lastseg_id = *self.branches.get(branch).unwrap(); let newseg_id = self.segments.len(); let lastseg = &mut self.segments[lastseg_id]; let newseg = Segment { parent: Some(lastseg_id), lsn: lastseg.lsn + lsn_bytes, size: Some((lastseg.size.unwrap() as i64 + size_bytes) as u64), needed: false, }; self.segments.push(newseg); *self.branches.get_mut(branch).expect("read already") = newseg_id; } pub fn insert(&mut self, branch: &str, bytes: u64) { self.modify_branch(branch, bytes, bytes as i64); } pub fn update(&mut self, branch: &str, bytes: u64) { self.modify_branch(branch, bytes, 0i64); } pub fn _delete(&mut self, branch: &str, bytes: u64) { self.modify_branch(branch, bytes, -(bytes as i64)); } /// Panics if the parent branch cannot be found. pub fn branch(&mut self, parent: &str, name: &str) { // Find the right segment let branchseg_id = *self .branches .get(parent) .expect("should had found the parent by key"); let _branchseg = &mut self.segments[branchseg_id]; // Create branch name for it self.branches.insert(name.to_string(), branchseg_id); } pub fn calculate(&mut self, retention_period: u64) -> (StorageModel, SizeResult) { // Phase 1: Mark all the segments that need to be retained for (_branch, &last_seg_id) in self.branches.iter() { let last_seg = &self.segments[last_seg_id]; let cutoff_lsn = last_seg.lsn.saturating_sub(retention_period); let mut seg_id = last_seg_id; loop { let seg = &mut self.segments[seg_id]; if seg.lsn <= cutoff_lsn { break; } seg.needed = true; if let Some(prev_seg_id) = seg.parent { seg_id = prev_seg_id; } else { break; } } } // Perform the calculation let storage_model = StorageModel { segments: self.segments.clone(), }; let size_result = storage_model.calculate(); (storage_model, size_result) } } // Main branch only. Some updates on it. 
#[test]
fn scenario_1() {
    // Create main branch
    let mut scenario = ScenarioBuilder::new("main");

    // Bulk load 5 GB of data to it
    scenario.insert("main", 5_000);

    // Stream of updates
    for _ in 0..5 {
        scenario.update("main", 1_000);
    }

    // Calculate the synthetic size with retention horizon 1000
    let (_model, result) = scenario.calculate(1000);

    // The end of the branch is at LSN 10000. Need to retain
    // a logical snapshot at LSN 9000, plus the WAL between 9000-10000.
    // The logical snapshot has size 5000.
    assert_eq!(result.total_size, 5000 + 1000);
}

// Main branch plus one child branch, with one update on each after the
// branch point.
#[test]
fn scenario_2() {
    // Create main branch
    let mut scenario = ScenarioBuilder::new("main");

    // Bulk load 5 GB of data to it
    scenario.insert("main", 5_000);

    // Stream of updates
    for _ in 0..5 {
        scenario.update("main", 1_000);
    }

    // Branch
    scenario.branch("main", "child");
    scenario.update("child", 1_000);

    // More updates on parent
    scenario.update("main", 1_000);

    //
    // The history looks like this now:
    //
    //         10000          11000
    // *----*----*--------------*    main
    //           |
    //           |            11000
    //           +--------------     child
    //
    //
    // With retention horizon 1000, we need to retain logical snapshot
    // at the branch point, size 5000, and the WAL from 10000-11000 on
    // both branches.
    let (_model, result) = scenario.calculate(1000);

    assert_eq!(result.total_size, 5000 + 1000 + 1000);
}

// Like 2, but more updates on main
#[test]
fn scenario_3() {
    // Create main branch
    let mut scenario = ScenarioBuilder::new("main");

    // Bulk load 5 GB of data to it
    scenario.insert("main", 5_000);

    // Stream of updates
    for _ in 0..5 {
        scenario.update("main", 1_000);
    }

    // Branch
    scenario.branch("main", "child");
    scenario.update("child", 1_000);

    // More updates on parent
    for _ in 0..5 {
        scenario.update("main", 1_000);
    }

    //
    // The history looks like this now:
    //
    //         10000                                15000
    // *----*----*------------------------------------*    main
    //           |
    //           |            11000
    //           +--------------     child
    //
    //
    // With retention horizon 1000, it's still cheapest to retain
    // - snapshot at branch point (size 5000)
    // - WAL on child between 10000-11000
    // - WAL on main between 10000-15000
    //
    // This is in total 5000 + 1000 + 5000
    //
    let (_model, result) = scenario.calculate(1000);

    assert_eq!(result.total_size, 5000 + 1000 + 5000);
}

// Diverged branches
#[test]
fn scenario_4() {
    // Create main branch
    let mut scenario = ScenarioBuilder::new("main");

    // Bulk load 5 GB of data to it
    scenario.insert("main", 5_000);

    // Stream of updates
    for _ in 0..5 {
        scenario.update("main", 1_000);
    }

    // Branch
    scenario.branch("main", "child");
    scenario.update("child", 1_000);

    // More updates on parent
    for _ in 0..8 {
        scenario.update("main", 1_000);
    }

    //
    // The history looks like this now:
    //
    //         10000                                18000
    // *----*----*------------------------------------*    main
    //           |
    //           |            11000
    //           +--------------     child
    //
    //
    // With retention horizon 1000, it's now cheapest to retain
    // separate snapshots on both branches:
    // - snapshot on main branch at LSN 17000 (size 5000)
    // - WAL on main between 17000-18000
    // - snapshot on child branch at LSN 10000 (size 5000)
    // - WAL on child between 10000-11000
    //
    // This is in total 5000 + 1000 + 5000 + 1000 = 12000
    //
    // (If we used the method from the previous scenario, and
    // kept only snapshot at the branch point, we'd need to keep
    // all the WAL between 10000-18000 on the main branch, so
    // the total size would be 5000 + 1000 + 8000 = 14000. The
    // calculation always picks the cheapest alternative)

    let (_model, result) = scenario.calculate(1000);

    assert_eq!(result.total_size, 5000 + 1000 + 5000 + 1000);
}

// Three branches: "b" forks off "a" after the initial load, and "c" forks off
// "a" after a further update; each branch is modified after its fork.
#[test]
fn scenario_5() {
    let mut scenario = ScenarioBuilder::new("a");
    scenario.insert("a", 5000);
    scenario.branch("a", "b");
    scenario.update("b", 4000);
    scenario.update("a", 2000);
    scenario.branch("a", "c");
    scenario.insert("c", 4000);
    scenario.insert("a", 2000);

    let (_model, result) = scenario.calculate(1000);

    assert_eq!(result.total_size, 17000);
}

#[test]
fn scenario_6() {
    let branches = [
        "7ff1edab8182025f15ae33482edb590a",
        "b1719e044db05401a05a2ed588a3ad3f",
        "0xb68d6691c895ad0a70809470020929ef",
    ];

    // compared to other scenarios, this one uses bytes instead of kB

    let mut scenario = ScenarioBuilder::new("");

    scenario.branch("", branches[0]); // at 0
    scenario.modify_branch(branches[0], 108951064, 43696128); // at 108951064
    scenario.branch(branches[0], branches[1]); // at 108951064
    scenario.modify_branch(branches[1], 15560408, -1851392); // at 124511472
    scenario.modify_branch(branches[0], 174464360, -1531904); // at 283415424
    scenario.branch(branches[0], branches[2]); // at 283415424
    scenario.modify_branch(branches[2], 15906192, 8192); // at 299321616
    scenario.modify_branch(branches[0], 18909976, 32768); // at 302325400

    let (model, result) = scenario.calculate(100_000);

    // FIXME: We previously calculated 333_792_000. But with this PR, we get
    // a much lower number. At a quick look at the model output and the
    // calculations here, the new result seems correct to me.
eprintln!( " MODEL: {}", serde_json::to_string(&model.segments).unwrap() ); eprintln!( "RESULT: {}", serde_json::to_string(&result.segments).unwrap() ); assert_eq!(result.total_size, 136_236_928); } ================================================ FILE: libs/tracing-utils/Cargo.toml ================================================ [package] name = "tracing-utils" version = "0.1.0" edition.workspace = true license.workspace = true [dependencies] hyper0.workspace = true opentelemetry = { workspace = true, features = ["trace"] } opentelemetry_sdk = { workspace = true, features = ["rt-tokio"] } opentelemetry-otlp = { workspace = true, default-features = false, features = ["http-proto", "trace", "http", "reqwest-blocking-client"] } opentelemetry-semantic-conventions.workspace = true tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tracing.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true pin-project-lite.workspace = true [dev-dependencies] tracing-subscriber.workspace = true # For examples in docs ================================================ FILE: libs/tracing-utils/src/http.rs ================================================ //! Tracing wrapper for Hyper HTTP server use std::future::Future; use hyper0::{Body, HeaderMap, Request, Response}; use tracing::Instrument; use tracing_opentelemetry::OpenTelemetrySpanExt; /// Configuration option for what to use as the "otel.name" field in the traces. pub enum OtelName<'a> { /// Use a constant string Constant(&'a str), /// Use the path from the request. /// /// That's very useful information, but is not appropriate if the /// path contains parameters that differ on ever request, or worse, /// sensitive information like usernames or email addresses. /// /// See UriPath, } /// Handle an incoming HTTP request using the given handler function, /// with OpenTelemetry tracing. 
/// /// This runs 'handler' on the request in a new span, with fields filled in /// from the request. Notably, if the request contains tracing information, /// it is propagated to the span, so that this request is traced as part of /// the same trace. /// /// XXX: Usually, this is handled by existing libraries, or built /// directly into HTTP servers. However, I couldn't find one for Hyper, /// so I had to write our own. OpenTelemetry website has a registry of /// instrumentation libraries at: /// /// If a Hyper crate appears, consider switching to that. pub async fn tracing_handler( req: Request, handler: F, otel_name: OtelName<'_>, ) -> Response where F: Fn(Request) -> R, R: Future>, { // Create a tracing span, with context propagated from the incoming // request if any. // // See list of standard fields defined for HTTP requests at // https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/semantic_conventions/http.md // We only fill in a few of the most useful ones here. 
let otel_name = match otel_name { OtelName::Constant(s) => s, OtelName::UriPath => req.uri().path(), }; let span = tracing::info_span!( "http request", otel.name= %otel_name, http.method = %req.method(), http.status_code = tracing::field::Empty, ); let parent_ctx = extract_remote_context(req.headers()); span.set_parent(parent_ctx); // Handle the request within the span let response = handler(req).instrument(span.clone()).await; // Fill in the fields from the response code let status = response.status(); span.record("http.status_code", status.as_str()); span.record( "otel.status_code", if status.is_success() { "OK" } else { "ERROR" }, ); response } // Extract remote tracing context from the HTTP headers fn extract_remote_context(headers: &HeaderMap) -> opentelemetry::Context { struct HeaderExtractor<'a>(&'a HeaderMap); impl opentelemetry::propagation::Extractor for HeaderExtractor<'_> { fn get(&self, key: &str) -> Option<&str> { self.0.get(key).and_then(|value| value.to_str().ok()) } fn keys(&self) -> Vec<&str> { self.0.keys().map(|value| value.as_str()).collect() } } let extractor = HeaderExtractor(headers); opentelemetry::global::get_text_map_propagator(|propagator| propagator.extract(&extractor)) } ================================================ FILE: libs/tracing-utils/src/lib.rs ================================================ //! Helper functions to set up OpenTelemetry tracing. //! //! Example: //! //! ```rust,no_run //! use tracing_subscriber::prelude::*; //! //! #[tokio::main] //! async fn main() { //! // Set up logging to stderr //! let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() //! .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")); //! let fmt_layer = tracing_subscriber::fmt::layer() //! .with_target(false) //! .with_writer(std::io::stderr); //! //! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces //! 
let provider = tracing_utils::init_tracing("my_application", tracing_utils::ExportConfig::default()); //! let otlp_layer = provider.as_ref().map(tracing_utils::layer); //! //! // Put it all together //! tracing_subscriber::registry() //! .with(env_filter) //! .with(otlp_layer) //! .with(fmt_layer) //! .init(); //! } //! ``` #![deny(clippy::undocumented_unsafe_blocks)] pub mod http; pub mod perf_span; use opentelemetry::trace::TracerProvider; use opentelemetry_otlp::WithExportConfig; pub use opentelemetry_otlp::{ExportConfig, Protocol}; use opentelemetry_sdk::trace::SdkTracerProvider; use tracing::level_filters::LevelFilter; use tracing::{Dispatch, Subscriber}; use tracing_subscriber::Layer; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::registry::LookupSpan; pub type Provider = SdkTracerProvider; /// Set up OpenTelemetry exporter, using configuration from environment variables. /// /// `service_name` is set as the OpenTelemetry 'service.name' resource (see /// ) /// /// We try to follow the conventions for the environment variables specified in /// /// /// However, we only support a subset of those options: /// /// - OTEL_SDK_DISABLED is supported. The default is "false", meaning tracing /// is enabled by default. Set it to "true" to disable. /// /// - We use the OTLP exporter, with HTTP protocol. Most of the OTEL_EXPORTER_OTLP_* /// settings specified in /// /// are supported, as they are handled by the `opentelemetry-otlp` crate. /// Settings related to other exporters have no effect. /// /// - Some other settings are supported by the `opentelemetry` crate. /// /// If you need some other setting, please test if it works first. And perhaps /// add a comment in the list above to save the effort of testing for the next /// person. 
pub fn init_tracing(service_name: &str, export_config: ExportConfig) -> Option { if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { return None; }; Some(init_tracing_internal( service_name.to_string(), export_config, )) } pub fn layer(p: &Provider) -> impl Layer where S: Subscriber + for<'span> LookupSpan<'span>, { tracing_opentelemetry::layer().with_tracer(p.tracer("global")) } fn init_tracing_internal(service_name: String, export_config: ExportConfig) -> Provider { // Sets up exporter from the provided [`ExportConfig`] parameter. // If the endpoint is not specified, it is loaded from the // OTEL_EXPORTER_OTLP_ENDPOINT environment variable. let exporter = opentelemetry_otlp::SpanExporter::builder() .with_http() .with_export_config(export_config) .build() .expect("could not initialize opentelemetry exporter"); // TODO: opentelemetry::global::set_error_handler() with custom handler that // bypasses default tracing layers, but logs regular looking log // messages. // Propagate trace information in the standard W3C TraceContext format. opentelemetry::global::set_text_map_propagator( opentelemetry_sdk::propagation::TraceContextPropagator::new(), ); Provider::builder() .with_batch_exporter(exporter) .with_resource( opentelemetry_sdk::Resource::builder() .with_service_name(service_name) .build(), ) .build() } pub enum OtelEnablement { Disabled, Enabled { service_name: String, export_config: ExportConfig, }, } pub struct OtelGuard { provider: Provider, pub dispatch: Dispatch, } impl Drop for OtelGuard { fn drop(&mut self) { _ = self.provider.shutdown(); } } /// Initializes OTEL infrastructure for performance tracing according to the provided configuration /// /// Performance tracing is handled by a different [`tracing::Subscriber`]. This functions returns /// an [`OtelGuard`] containing a [`tracing::Dispatch`] associated with a newly created subscriber. /// Applications should use this dispatch for their performance traces. 
/// /// The lifetime of the guard should match taht of the application. On drop, it tears down the /// OTEL infra. pub fn init_performance_tracing(otel_enablement: OtelEnablement) -> Option { match otel_enablement { OtelEnablement::Disabled => None, OtelEnablement::Enabled { service_name, export_config, } => { let provider = init_tracing(&service_name, export_config)?; let otel_layer = layer(&provider).with_filter(LevelFilter::INFO); let otel_subscriber = tracing_subscriber::registry().with(otel_layer); let dispatch = Dispatch::new(otel_subscriber); Some(OtelGuard { dispatch, provider }) } } } ================================================ FILE: libs/tracing-utils/src/perf_span.rs ================================================ //! Crutch module to work around tracing infrastructure deficiencies //! //! We wish to collect granular request spans without impacting performance //! by much. Ideally, we should have zero overhead for a sampling rate of 0. //! //! The approach taken by the pageserver crate is to use a completely different //! span hierarchy for the performance spans. Spans are explicitly stored in //! the request context and use a different [`tracing::Subscriber`] in order //! to avoid expensive filtering. //! //! [`tracing::Span`] instances record their [`tracing::Dispatch`] and, implcitly, //! their [`tracing::Subscriber`] at creation time. However, upon exiting the span, //! the global default [`tracing::Dispatch`] is used. This is problematic if one //! wishes to juggle different subscribers. //! //! In order to work around this, this module provides a [`PerfSpan`] type which //! wraps a [`Span`] and sets the default subscriber when exiting the span. This //! achieves the correct routing. //! //! There's also a modified version of [`tracing::Instrument`] which works with //! [`PerfSpan`]. 
use core::{ future::Future, marker::Sized, mem::ManuallyDrop, pin::Pin, task::{Context, Poll}, }; use pin_project_lite::pin_project; use tracing::{Dispatch, span::Span}; #[derive(Debug, Clone)] pub struct PerfSpan { inner: ManuallyDrop, dispatch: Dispatch, } #[must_use = "once a span has been entered, it should be exited"] pub struct PerfSpanEntered<'a> { span: &'a PerfSpan, } impl PerfSpan { pub fn new(span: Span, dispatch: Dispatch) -> Self { Self { inner: ManuallyDrop::new(span), dispatch, } } pub fn enter(&self) -> PerfSpanEntered<'_> { if let Some(ref id) = self.inner.id() { self.dispatch.enter(id); } PerfSpanEntered { span: self } } pub fn inner(&self) -> &Span { &self.inner } } impl Drop for PerfSpan { fn drop(&mut self) { // Bring the desired dispatch into scope before explicitly calling // the span destructor. This routes the span exit to the correct // [`tracing::Subscriber`]. let _dispatch_guard = tracing::dispatcher::set_default(&self.dispatch); // SAFETY: ManuallyDrop in Drop implementation unsafe { ManuallyDrop::drop(&mut self.inner) } } } impl Drop for PerfSpanEntered<'_> { fn drop(&mut self) { assert!(self.span.inner.id().is_some()); let _dispatch_guard = tracing::dispatcher::set_default(&self.span.dispatch); self.span.dispatch.exit(&self.span.inner.id().unwrap()); } } pub trait PerfInstrument: Sized { fn instrument(self, span: PerfSpan) -> PerfInstrumented { PerfInstrumented { inner: ManuallyDrop::new(self), span, } } } pin_project! { #[project = PerfInstrumentedProj] #[derive(Debug, Clone)] #[must_use = "futures do nothing unless you `.await` or poll them"] pub struct PerfInstrumented { // `ManuallyDrop` is used here to to enter instrument `Drop` by entering // `Span` and executing `ManuallyDrop::drop`. #[pin] inner: ManuallyDrop, span: PerfSpan, } impl PinnedDrop for PerfInstrumented { fn drop(this: Pin<&mut Self>) { let this = this.project(); let _enter = this.span.enter(); // SAFETY: 1. 
`Pin::get_unchecked_mut()` is safe, because this isn't // different from wrapping `T` in `Option` and calling // `Pin::set(&mut this.inner, None)`, except avoiding // additional memory overhead. // 2. `ManuallyDrop::drop()` is safe, because // `PinnedDrop::drop()` is guaranteed to be called only // once. unsafe { ManuallyDrop::drop(this.inner.get_unchecked_mut()) } } } } impl<'a, T> PerfInstrumentedProj<'a, T> { /// Get a mutable reference to the [`Span`] a pinned mutable reference to /// the wrapped type. fn span_and_inner_pin_mut(self) -> (&'a mut PerfSpan, Pin<&'a mut T>) { // SAFETY: As long as `ManuallyDrop` does not move, `T` won't move // and `inner` is valid, because `ManuallyDrop::drop` is called // only inside `Drop` of the `Instrumented`. let inner = unsafe { self.inner.map_unchecked_mut(|v| &mut **v) }; (self.span, inner) } } impl Future for PerfInstrumented { type Output = T::Output; fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let (span, inner) = self.project().span_and_inner_pin_mut(); let _enter = span.enter(); inner.poll(cx) } } impl PerfInstrument for T {} ================================================ FILE: libs/utils/Cargo.toml ================================================ [package] name = "utils" version = "0.1.0" edition.workspace = true license.workspace = true [features] default = ["rename_noreplace"] rename_noreplace = [] # Enables test-only APIs, incuding failpoints. 
In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions testing = ["fail/failpoints"] [dependencies] arc-swap.workspace = true sentry.workspace = true async-compression.workspace = true anyhow.workspace = true bincode.workspace = true bytes.workspace = true camino.workspace = true chrono.workspace = true diatomic-waker.workspace = true git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true fail.workspace = true futures = { workspace = true } jsonwebtoken.workspace = true nix = { workspace = true, features = ["ioctl"] } once_cell.workspace = true pem.workspace = true pin-project-lite.workspace = true regex.workspace = true serde.workspace = true serde_with.workspace = true serde_json.workspace = true signal-hook.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["signal"] } tokio-tar.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = ["serde"] } tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } tracing-utils.workspace = true rand.workspace = true scopeguard.workspace = true uuid.workspace = true strum.workspace = true strum_macros.workspace = true walkdir.workspace = true pq_proto.workspace = true postgres_connection.workspace = true metrics.workspace = true const_format.workspace = true [dev-dependencies] byteorder.workspace = true bytes.workspace = true criterion.workspace = true hex-literal.workspace = true camino-tempfile.workspace = true pprof.workspace = true serde_assert.workspace = true tokio = { workspace = true, features = ["test-util"] } [[bench]] name = "benchmarks" harness = false ================================================ FILE: libs/utils/benches/README.md ================================================ ## Utils Benchmarks To run benchmarks: ```sh # All benchmarks. 
cargo bench --package utils # Specific file. cargo bench --package utils --bench benchmarks # Specific benchmark. cargo bench --package utils --bench benchmarks log_slow/enabled=true # List available benchmarks. cargo bench --package utils --benches -- --list # Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. # Output in target/criterion/*/profile/flamegraph.svg. cargo bench --package utils --bench benchmarks log_slow/enabled=true --profile-time 10 ``` Additional charts and statistics are available in `target/criterion/report/index.html`. Benchmarks are automatically compared against the previous run. To compare against other runs, see `--baseline` and `--save-baseline`. ================================================ FILE: libs/utils/benches/benchmarks.rs ================================================ use std::time::Duration; use criterion::{Bencher, Criterion, criterion_group, criterion_main}; use pprof::criterion::{Output, PProfProfiler}; use utils::id; use utils::logging::log_slow; // Register benchmarks with Criterion. criterion_group!( name = benches; config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); targets = bench_id_stringify, bench_log_slow, ); criterion_main!(benches); pub fn bench_id_stringify(c: &mut Criterion) { // Can only use public methods. let ttid = id::TenantTimelineId::generate(); c.bench_function("id.to_string", |b| { b.iter(|| { // FIXME measurement overhead? //for _ in 0..1000 { // ttid.tenant_id.to_string(); //} ttid.tenant_id.to_string(); }) }); } pub fn bench_log_slow(c: &mut Criterion) { for enabled in [false, true] { c.bench_function(&format!("log_slow/enabled={enabled}"), |b| { run_bench(b, enabled).unwrap() }); } // The actual benchmark. fn run_bench(b: &mut Bencher, enabled: bool) -> anyhow::Result<()> { const THRESHOLD: Duration = Duration::from_secs(1); // Use a multi-threaded runtime to avoid thread parking overhead when yielding. 
let runtime = tokio::runtime::Builder::new_multi_thread() .enable_all() .build()?; // Test both with and without log_slow, since we're essentially measuring Tokio scheduling // performance too. Use a simple noop future that yields once, to avoid any scheduler fast // paths for a ready future. if enabled { b.iter(|| { runtime.block_on(log_slow( "ready", THRESHOLD, std::pin::pin!(tokio::task::yield_now()), )) }); } else { b.iter(|| runtime.block_on(tokio::task::yield_now())); } Ok(()) } } ================================================ FILE: libs/utils/scripts/restore_from_wal.sh ================================================ #!/usr/bin/env bash set -euxo pipefail PG_BIN=$1 WAL_PATH=$2 DATA_DIR=$3 PORT=$4 PG_VERSION=$5 SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-) # The way that initdb is invoked must match how the pageserver runs initdb. function initdb_with_args { local cmd=( "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --locale 'C.UTF-8' --lc-collate 'C.UTF-8' --lc-ctype 'C.UTF-8' --lc-messages 'C.UTF-8' --lc-monetary 'C.UTF-8' --lc-numeric 'C.UTF-8' --lc-time 'C.UTF-8' --sysid="$SYSID" ) case "$PG_VERSION" in 14) # Postgres 14 and below didn't support --locale-provider ;; 15 | 16) cmd+=(--locale-provider 'libc') ;; *) # Postgres 17 added the builtin provider cmd+=(--locale-provider 'builtin') ;; esac eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib ASAN_OPTIONS="${ASAN_OPTIONS-}" UBSAN_OPTIONS="${UBSAN_OPTIONS-}" "${cmd[*]}" } rm -fr "$DATA_DIR" initdb_with_args echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-) declare -i WAL_SIZE=$REDO_POS+114 "$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start "$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate cp "$DATA_DIR"/pg_wal/000000010000000000000001 "$DATA_DIR" 
cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/ for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done dd if="$DATA_DIR"/000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc rm -f "$DATA_DIR"/000000010000000000000001 ================================================ FILE: libs/utils/scripts/restore_from_wal_initdb.sh ================================================ #!/bin/bash # like restore_from_wal.sh, but takes existing initdb.tar.zst set -euxo pipefail PG_BIN=$1 WAL_PATH=$2 DATA_DIR=$3 PORT=$4 echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-) declare -i WAL_SIZE=$REDO_POS+114 "$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start "$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate cp "$DATA_DIR"/pg_wal/000000010000000000000001 "$DATA_DIR" cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/ for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done dd if="$DATA_DIR"/000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc rm -f "$DATA_DIR"/000000010000000000000001 ================================================ FILE: libs/utils/src/auth.rs ================================================ // For details about authentication see docs/authentication.md use std::borrow::Cow; use std::fmt::Display; use std::fs; use std::sync::Arc; use anyhow::Result; use arc_swap::ArcSwap; use camino::Utf8Path; use jsonwebtoken::{ Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, decode, encode, }; use pem::Pem; use serde::{Deserialize, Deserializer, Serialize, de::DeserializeOwned}; use uuid::Uuid; use crate::id::TenantId; /// Algorithm to use. We require EdDSA. 
const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA; #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)] #[serde(rename_all = "lowercase")] pub enum Scope { /// Provides access to all data for a specific tenant (specified in `struct Claims` below) // TODO: join these two? Tenant, /// Provides access to all data for a specific tenant, but based on endpoint ID. This token scope /// is only used by compute to fetch the spec for a specific endpoint. The spec contains a Tenant-scoped /// token authorizing access to all data of a tenant, so the spec-fetch API requires a TenantEndpoint /// scope token to ensure that untrusted compute nodes can't fetch spec for arbitrary endpoints. TenantEndpoint, /// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs. /// Should only be used e.g. for status check/tenant creation/list. PageServerApi, /// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs. /// Should only be used e.g. for status check. /// Currently also used for connection from any pageserver to any safekeeper. SafekeeperData, /// The scope used by pageservers in upcalls to storage controller and cloud control plane #[serde(rename = "generations_api")] GenerationsApi, /// Allows access to control plane managment API and all storage controller endpoints. Admin, /// Allows access to control plane & storage controller endpoints used in infrastructure automation (e.g. node registration) Infra, /// Allows access to storage controller APIs used by the scrubber, to interrogate the state /// of a tenant & post scrub results. Scrubber, /// This scope is used for communication with other storage controller instances. /// At the time of writing, this is only used for the step down request. 
#[serde(rename = "controller_peer")] ControllerPeer, } fn deserialize_empty_string_as_none_uuid<'de, D>(deserializer: D) -> Result, D::Error> where D: Deserializer<'de>, { let opt = Option::::deserialize(deserializer)?; match opt.as_deref() { Some("") => Ok(None), Some(s) => Uuid::parse_str(s) .map(Some) .map_err(serde::de::Error::custom), None => Ok(None), } } /// JWT payload. See docs/authentication.md for the format #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] pub struct Claims { #[serde(default)] pub tenant_id: Option, #[serde( default, skip_serializing_if = "Option::is_none", // Neon control plane includes this field as empty in the claims. // Consider it None in those cases. deserialize_with = "deserialize_empty_string_as_none_uuid" )] pub endpoint_id: Option, pub scope: Scope, } impl Claims { pub fn new(tenant_id: Option, scope: Scope) -> Self { Self { tenant_id, scope, endpoint_id: None, } } } pub struct SwappableJwtAuth(ArcSwap); impl SwappableJwtAuth { pub fn new(jwt_auth: JwtAuth) -> Self { SwappableJwtAuth(ArcSwap::new(Arc::new(jwt_auth))) } pub fn swap(&self, jwt_auth: JwtAuth) { self.0.swap(Arc::new(jwt_auth)); } pub fn decode( &self, token: &str, ) -> std::result::Result, AuthError> { self.0.load().decode(token) } } impl std::fmt::Debug for SwappableJwtAuth { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "Swappable({:?})", self.0.load()) } } #[derive(Clone, PartialEq, Eq, Hash, Debug)] pub struct AuthError(pub Cow<'static, str>); impl Display for AuthError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } } pub struct JwtAuth { decoding_keys: Vec, validation: Validation, } impl JwtAuth { pub fn new(decoding_keys: Vec) -> Self { let mut validation = Validation::default(); validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM]; // The default 'required_spec_claims' is 'exp'. But we don't want to require // expiration. 
validation.required_spec_claims = [].into(); Self { decoding_keys, validation, } } pub fn from_key_path(key_path: &Utf8Path) -> Result { let metadata = key_path.metadata()?; let decoding_keys = if metadata.is_dir() { let mut keys = Vec::new(); for entry in fs::read_dir(key_path)? { let path = entry?.path(); if !path.is_file() { // Ignore directories (don't recurse) continue; } let public_key = fs::read(path)?; keys.push(DecodingKey::from_ed_pem(&public_key)?); } keys } else if metadata.is_file() { let public_key = fs::read(key_path)?; vec![DecodingKey::from_ed_pem(&public_key)?] } else { anyhow::bail!("path is neither a directory or a file") }; if decoding_keys.is_empty() { anyhow::bail!( "Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected." ); } Ok(Self::new(decoding_keys)) } pub fn from_key(key: String) -> Result { Ok(Self::new(vec![DecodingKey::from_ed_pem(key.as_bytes())?])) } /// Attempt to decode the token with the internal decoding keys. /// /// The function tries the stored decoding keys in succession, /// and returns the first yielding a successful result. /// If there is no working decoding key, it returns the last error. 
pub fn decode<D: serde::de::DeserializeOwned>(
    &self,
    token: &str,
) -> std::result::Result<TokenData<D>, AuthError> {
    // Only the most recent failure is kept: if no key works, the error
    // produced by the last key is the one reported to the caller.
    let mut last_error = None;
    for decoding_key in &self.decoding_keys {
        match decode(token, decoding_key, &self.validation) {
            Ok(token_data) => return Ok(token_data),
            Err(e) => last_error = Some(e),
        }
    }

    match last_error {
        Some(e) => Err(AuthError(Cow::Owned(e.to_string()))),
        // The loop body never ran, i.e. `decoding_keys` is empty.
        None => Err(AuthError(Cow::Borrowed("no JWT decoding keys configured"))),
    }
}
} // closes the enclosing `impl JwtAuth` block

impl std::fmt::Debug for JwtAuth {
    /// Formats only the `validation` field; `decoding_keys` is not part of the output.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("JwtAuth")
            .field("validation", &self.validation)
            .finish()
    }
}

// this function is used only for testing purposes in CLI e g generate tokens during init
/// Encodes `claims` into a token signed with the Ed25519 key stored in `pem`
/// (its contents are used as DER), using [`STORAGE_TOKEN_ALGORITHM`].
pub fn encode_from_key_file<S: serde::Serialize>(claims: &S, pem: &Pem) -> Result<String> {
    let key = EncodingKey::from_ed_der(pem.contents());
    Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?)
}

#[cfg(test)]
mod tests {
    use std::str::FromStr;

    use super::*;

    // Generated with:
    //
    // openssl genpkey -algorithm ed25519 -out ed25519-priv.pem
    // openssl pkey -in ed25519-priv.pem -pubout -out ed25519-pub.pem
    const TEST_PUB_KEY_ED25519: &str = r#"
-----BEGIN PUBLIC KEY-----
MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w=
-----END PUBLIC KEY-----
"#;

    const TEST_PRIV_KEY_ED25519: &str = r#"
-----BEGIN PRIVATE KEY-----
MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
-----END PRIVATE KEY-----
"#;

    #[test]
    fn test_decode() {
        let expected_claims = Claims {
            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
            scope: Scope::Tenant,
            endpoint_id: None,
        };

        // A test token containing the following payload, signed using TEST_PRIV_KEY_ED25519:
        //
        // ```
        // {
        //   "scope": "tenant",
        //   "tenant_id": "3d1f7595b468230304e0b73cecbcb081",
        //   "iss": "neon.controlplane",
        //   "iat": 1678442479
        // }
        // ```
        //
        let encoded_eddsa =
            "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw";

        // Check it can be validated with the public key
        let auth = JwtAuth::new(vec![
            DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519.as_bytes()).unwrap(),
        ]);
        let claims_from_token: Claims = auth.decode(encoded_eddsa).unwrap().claims;
        assert_eq!(claims_from_token, expected_claims);
    }

    #[test]
    fn test_encode() {
        let claims = Claims {
            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
            scope: Scope::Tenant,
            endpoint_id: None,
        };

        let pem = pem::parse(TEST_PRIV_KEY_ED25519).unwrap();
        let encoded = encode_from_key_file(&claims, &pem).unwrap();

        // decode it back
        let auth = JwtAuth::new(vec![
            DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519.as_bytes()).unwrap(),
        ]);
        let decoded: TokenData<Claims> = auth.decode(&encoded).unwrap();

        assert_eq!(decoded.claims, claims);
    }
}

================================================
FILE: libs/utils/src/backoff.rs
================================================
use std::fmt::{Debug, Display};
use std::time::Duration;

use futures::Future;
use tokio_util::sync::CancellationToken;

pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;

/// Sleeps for the backoff duration of attempt `n`, or until `cancel` fires,
/// whichever comes first.
///
/// Nothing is logged and no sleep happens when the computed duration is zero
/// (which is the case for `n == 0`).
pub async fn exponential_backoff(
    n: u32,
    base_increment: f64,
    max_seconds: f64,
    cancel: &CancellationToken,
) {
    let backoff_duration_seconds =
        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
    if backoff_duration_seconds > 0.0 {
        tracing::info!(
            "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
        );

        // The timeout result is irrelevant: both "timed out" (the backoff elapsed)
        // and "cancelled" simply end the wait.
        drop(
            tokio::time::timeout(
                std::time::Duration::from_secs_f64(backoff_duration_seconds),
                cancel.cancelled(),
            )
            .await,
        )
    }
}

/// Same as [`exponential_backoff_duration_seconds`], but returned as a [`Duration`].
pub fn exponential_backoff_duration(n: u32, base_increment: f64, max_seconds: f64) -> Duration {
    let seconds = exponential_backoff_duration_seconds(n, base_increment, max_seconds);
    Duration::from_secs_f64(seconds)
}

/// Computes the backoff for attempt `n` in seconds.
///
/// Returns `0.0` for `n == 0`, otherwise `(1 + base_increment)^n` capped at `max_seconds`.
pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
    if n == 0 {
        0.0
    } else {
        (1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
    }
}

/// Retries passed operation until one of the following conditions are met:
/// - encountered error is considered as permanent (non-retryable)
/// - retries have been exhausted
/// - cancellation token has been cancelled
///
/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent
/// errors. When attempts cross `warn_threshold` function starts to emit log warnings.
/// `description` argument is added to log messages. Its value should identify what the `op` is doing.
/// `cancel` cancels new attempts and the backoff sleep.
///
/// The operation is given up on once it has failed with `max_retries` retries already made,
/// i.e. `op` is invoked at most `max_retries + 1` times.
///
/// If attempts fail, they are being logged with `{:#}` which works for anyhow, but does not work
/// for any other error type. Final failed attempt is logged with `{:?}`.
///
/// Returns `None` if cancellation was noticed during backoff or the terminal result.
pub async fn retry<T, O, F, E>(
    mut op: O,
    is_permanent: impl Fn(&E) -> bool,
    warn_threshold: u32,
    max_retries: u32,
    description: &str,
    cancel: &CancellationToken,
) -> Option<Result<T, E>>
where
    // Not std::error::Error because anyhow::Error doesnt implement it.
    // For context see https://github.com/dtolnay/anyhow/issues/63
    E: Display + Debug + 'static,
    O: FnMut() -> F,
    F: Future<Output = Result<T, E>>,
{
    let mut attempts = 0;
    loop {
        // Checked before every attempt, so a cancellation that interrupted the
        // previous backoff sleep is noticed here.
        if cancel.is_cancelled() {
            return None;
        }

        let result = op().await;
        match &result {
            Ok(_) => {
                if attempts > 0 {
                    tracing::info!("{description} succeeded after {attempts} retries");
                }
                return Some(result);
            }

            // These are "permanent" errors that should not be retried.
            Err(e) if is_permanent(e) => {
                return Some(result);
            }
            // Assume that any other failure might be transient, and the operation might
            // succeed if we just keep trying.
            Err(err) if attempts < warn_threshold => {
                tracing::info!("{description} failed, will retry (attempt {attempts}): {err:#}");
            }
            Err(err) if attempts < max_retries => {
                tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
            }
            Err(err) => {
                // `max_retries` retries have been used up. Time to give up.
                tracing::warn!(
                    "{description} still failed after {attempts} retries, giving up: {err:?}"
                );
                return Some(result);
            }
        }
        // sleep and retry
        exponential_backoff(
            attempts,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
            cancel,
        )
        .await;
        attempts += 1;
    }
}

#[cfg(test)]
mod tests {
    use std::io;

    use tokio::sync::Mutex;

    use super::*;

    #[test]
    fn backoff_defaults_produce_growing_backoff_sequence() {
        let mut current_backoff_value = None;

        for i in 0..10_000 {
            let new_backoff_value = exponential_backoff_duration_seconds(
                i,
                DEFAULT_BASE_BACKOFF_SECONDS,
                DEFAULT_MAX_BACKOFF_SECONDS,
            );

            if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
                assert!(
                    old_backoff_value <= new_backoff_value,
                    "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
                )
            }
        }

        assert_eq!(
            current_backoff_value.expect("Should have produced backoff values to compare"),
            DEFAULT_MAX_BACKOFF_SECONDS,
            "Given big enough of retries, backoff should reach its allowed max value"
        );
    }

    #[tokio::test(start_paused = true)]
    async fn retry_always_error() {
        let count = Mutex::new(0);
        retry(
            || async {
                *count.lock().await += 1;
                Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other))
            },
            |_e| false,
            1,
            1,
            "work",
            &CancellationToken::new(),
        )
        .await
        .expect("not cancelled")
        .expect_err("it can only fail");

        assert_eq!(*count.lock().await, 2);
    }

    #[tokio::test(start_paused = true)]
    async fn retry_ok_after_err() {
        let count = Mutex::new(0);
        retry(
            || async {
                let mut locked = count.lock().await;
                if *locked > 1 {
                    Ok(())
                } else {
                    *locked += 1;
                    Err(io::Error::from(io::ErrorKind::Other))
                }
            },
            |_e| false,
            2,
            2,
            "work",
            &CancellationToken::new(),
        )
        .await
        .expect("not cancelled")
        .expect("success on second try");
    }

    #[tokio::test(start_paused = true)]
    async fn dont_retry_permanent_errors() {
        let count = Mutex::new(0);
        let _ = retry(
            || async {
                let mut locked = count.lock().await;
                if *locked > 1 {
                    Ok(())
                } else {
                    *locked += 1;
                    Err(io::Error::from(io::ErrorKind::Other))
                }
            },
            |_e| true,
            2,
            2,
            "work",
            &CancellationToken::new(),
        )
        .await
        .expect("was not cancellation")
        .expect_err("it was permanent error");

        assert_eq!(*count.lock().await, 1);
    }
}

================================================
FILE: libs/utils/src/bin_ser.rs
================================================
//! Utilities for binary serialization/deserialization.
//!
//! The [`BeSer`] trait allows us to define data structures
//! that can match data structures that are sent over the wire
//! in big-endian form with no packing.
//!
//! The [`LeSer`] trait does the same thing, in little-endian form.
//!
//! Note: you will get a compile error if you try to `use` both traits
//! in the same module or scope. This is intended to be a safety
//! mechanism: mixing big-endian and little-endian encoding in the same file
//! is error-prone.

#![warn(missing_docs)]

use std::io::{self, Read, Write};

use bincode::Options;
use serde::Serialize;
use serde::de::DeserializeOwned;
use thiserror::Error;

/// An error that occurred during a deserialize operation
///
/// This could happen because the input data was too short,
/// or because an invalid value was encountered.
#[derive(Debug, Error)]
pub enum DeserializeError {
    /// The deserializer isn't able to deserialize the supplied data.
    #[error("deserialize error")]
    BadInput,
    /// While deserializing from a `Read` source, an `io::Error` occurred.
    #[error("deserialize error: {0}")]
    Io(io::Error),
}

impl From<bincode::Error> for DeserializeError {
    /// Keeps I/O errors as [`DeserializeError::Io`]; every other bincode error
    /// kind is collapsed into [`DeserializeError::BadInput`].
    fn from(e: bincode::Error) -> Self {
        match *e {
            bincode::ErrorKind::Io(io_err) => DeserializeError::Io(io_err),
            _ => DeserializeError::BadInput,
        }
    }
}

/// An error that occurred during a serialize operation
///
/// This probably means our [`Write`] failed, e.g. we tried
/// to write beyond the end of a buffer.
#[derive(Debug, Error)]
pub enum SerializeError {
    /// The serializer isn't able to serialize the supplied data.
    #[error("serialize error")]
    BadInput,
    /// While serializing into a `Write` sink, an `io::Error` occurred.
    #[error("serialize error: {0}")]
    Io(io::Error),
}

impl From<bincode::Error> for SerializeError {
    /// Keeps I/O errors as [`SerializeError::Io`]; every other bincode error
    /// kind is collapsed into [`SerializeError::BadInput`].
    fn from(e: bincode::Error) -> Self {
        match *e {
            bincode::ErrorKind::Io(io_err) => SerializeError::Io(io_err),
            _ => SerializeError::BadInput,
        }
    }
}

/// A shortcut that configures big-endian binary serialization
///
/// Properties:
/// - Big endian
/// - Fixed integer encoding (i.e. 1u32 is 00000001 not 01)
///
/// Does not allow trailing bytes in deserialization. If this is desired, you
/// may set [`Options::allow_trailing_bytes`] to explicitly accommodate this.
pub fn be_coder() -> impl Options {
    bincode::DefaultOptions::new()
        .with_big_endian()
        .with_fixint_encoding()
}

/// A shortcut that configures little-endian binary serialization
///
/// Properties:
/// - Little endian
/// - Fixed integer encoding (i.e. 1u32 is 00000001 not 01)
///
/// Does not allow trailing bytes in deserialization. If this is desired, you
/// may set [`Options::allow_trailing_bytes`] to explicitly accommodate this.
pub fn le_coder() -> impl Options { bincode::DefaultOptions::new() .with_little_endian() .with_fixint_encoding() } /// Binary serialize/deserialize helper functions (Big Endian) /// pub trait BeSer { /// Serialize into a byte slice fn ser_into_slice(&self, mut b: &mut [u8]) -> Result<(), SerializeError> where Self: Serialize, { // &mut [u8] implements Write, but `ser_into` needs a mutable // reference to that. So we need the slightly awkward "mutable // reference to a mutable reference. self.ser_into(&mut b) } /// Serialize into a borrowed writer /// /// This is useful for most `Write` types except `&mut [u8]`, which /// can more easily use [`ser_into_slice`](Self::ser_into_slice). fn ser_into(&self, w: &mut W) -> Result<(), SerializeError> where Self: Serialize, { be_coder().serialize_into(w, &self).map_err(|e| e.into()) } /// Serialize into a new heap-allocated buffer fn ser(&self) -> Result, SerializeError> where Self: Serialize, { be_coder().serialize(&self).map_err(|e| e.into()) } /// Deserialize from the full contents of a byte slice /// /// See also: [`BeSer::des_prefix`] fn des(buf: &[u8]) -> Result where Self: DeserializeOwned, { be_coder() .deserialize(buf) .or(Err(DeserializeError::BadInput)) } /// Deserialize from a prefix of the byte slice /// /// Uses as much of the byte slice as is necessary to deserialize the /// type, but does not guarantee that the entire slice is used. /// /// See also: [`BeSer::des`] fn des_prefix(buf: &[u8]) -> Result where Self: DeserializeOwned, { be_coder() .allow_trailing_bytes() .deserialize(buf) .or(Err(DeserializeError::BadInput)) } /// Deserialize from a reader fn des_from(r: &mut R) -> Result where Self: DeserializeOwned, { be_coder().deserialize_from(r).map_err(|e| e.into()) } /// Compute the serialized size of a data structure /// /// Note: it may be faster to serialize to a buffer and then measure the /// buffer length, than to call `serialized_size` and then `ser_into`. 
fn serialized_size(&self) -> Result where Self: Serialize, { be_coder().serialized_size(self).map_err(|e| e.into()) } } /// Binary serialize/deserialize helper functions (Little Endian) /// pub trait LeSer { /// Serialize into a byte slice fn ser_into_slice(&self, mut b: &mut [u8]) -> Result<(), SerializeError> where Self: Serialize, { // &mut [u8] implements Write, but `ser_into` needs a mutable // reference to that. So we need the slightly awkward "mutable // reference to a mutable reference. self.ser_into(&mut b) } /// Serialize into a borrowed writer /// /// This is useful for most `Write` types except `&mut [u8]`, which /// can more easily use [`ser_into_slice`](Self::ser_into_slice). fn ser_into(&self, w: &mut W) -> Result<(), SerializeError> where Self: Serialize, { le_coder().serialize_into(w, &self).map_err(|e| e.into()) } /// Serialize into a new heap-allocated buffer fn ser(&self) -> Result, SerializeError> where Self: Serialize, { le_coder().serialize(&self).map_err(|e| e.into()) } /// Deserialize from the full contents of a byte slice /// /// See also: [`LeSer::des_prefix`] fn des(buf: &[u8]) -> Result where Self: DeserializeOwned, { le_coder() .deserialize(buf) .or(Err(DeserializeError::BadInput)) } /// Deserialize from a prefix of the byte slice /// /// Uses as much of the byte slice as is necessary to deserialize the /// type, but does not guarantee that the entire slice is used. /// /// See also: [`LeSer::des`] fn des_prefix(buf: &[u8]) -> Result where Self: DeserializeOwned, { le_coder() .allow_trailing_bytes() .deserialize(buf) .or(Err(DeserializeError::BadInput)) } /// Deserialize from a reader fn des_from(r: &mut R) -> Result where Self: DeserializeOwned, { le_coder().deserialize_from(r).map_err(|e| e.into()) } /// Compute the serialized size of a data structure /// /// Note: it may be faster to serialize to a buffer and then measure the /// buffer length, than to call `serialized_size` and then `ser_into`. 
fn serialized_size(&self) -> Result where Self: Serialize, { le_coder().serialized_size(self).map_err(|e| e.into()) } } // Because usage of `BeSer` or `LeSer` can be done with *either* a Serialize or // DeserializeOwned implementation, the blanket implementation has to be for every type. impl BeSer for T {} impl LeSer for T {} #[cfg(test)] mod tests { use std::io::Cursor; use serde::{Deserialize, Serialize}; use super::DeserializeError; #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct ShortStruct { a: u8, b: u32, } const SHORT1: ShortStruct = ShortStruct { a: 7, b: 65536 }; const SHORT1_ENC_BE: &[u8] = &[7, 0, 1, 0, 0]; const SHORT1_ENC_BE_TRAILING: &[u8] = &[7, 0, 1, 0, 0, 255, 255, 255]; const SHORT1_ENC_LE: &[u8] = &[7, 0, 0, 1, 0]; const SHORT1_ENC_LE_TRAILING: &[u8] = &[7, 0, 0, 1, 0, 255, 255, 255]; const SHORT2: ShortStruct = ShortStruct { a: 8, b: 0x07030000, }; const SHORT2_ENC_BE: &[u8] = &[8, 7, 3, 0, 0]; const SHORT2_ENC_BE_TRAILING: &[u8] = &[8, 7, 3, 0, 0, 0xff, 0xff, 0xff]; const SHORT2_ENC_LE: &[u8] = &[8, 0, 0, 3, 7]; const SHORT2_ENC_LE_TRAILING: &[u8] = &[8, 0, 0, 3, 7, 0xff, 0xff, 0xff]; #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] struct NewTypeStruct(u32); const NT1: NewTypeStruct = NewTypeStruct(414243); const NT1_INNER: u32 = 414243; #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct LongMsg { pub tag: u8, pub blockpos: u32, pub last_flush_position: u64, pub apply: u64, pub timestamp: i64, pub reply_requested: u8, } const LONG1: LongMsg = LongMsg { tag: 42, blockpos: 0x1000_2000, last_flush_position: 0x1234_2345_3456_4567, apply: 0x9876_5432_10FE_DCBA, timestamp: 0x7788_99AA_BBCC_DDFF, reply_requested: 1, }; #[test] fn be_short() { use super::BeSer; assert_eq!(SHORT1.serialized_size().unwrap(), 5); let encoded = SHORT1.ser().unwrap(); assert_eq!(encoded, SHORT1_ENC_BE); let decoded = ShortStruct::des(SHORT2_ENC_BE).unwrap(); assert_eq!(decoded, SHORT2); // with trailing data let decoded = 
ShortStruct::des_prefix(SHORT2_ENC_BE_TRAILING).unwrap(); assert_eq!(decoded, SHORT2); let err = ShortStruct::des(SHORT2_ENC_BE_TRAILING).unwrap_err(); assert!(matches!(err, DeserializeError::BadInput)); // serialize into a `Write` sink. let mut buf = Cursor::new(vec![0xFF; 8]); SHORT1.ser_into(&mut buf).unwrap(); assert_eq!(buf.into_inner(), SHORT1_ENC_BE_TRAILING); // deserialize from a `Write` sink. let mut buf = Cursor::new(SHORT2_ENC_BE); let decoded = ShortStruct::des_from(&mut buf).unwrap(); assert_eq!(decoded, SHORT2); // deserialize from a `Write` sink that terminates early. let mut buf = Cursor::new([0u8; 4]); let err = ShortStruct::des_from(&mut buf).unwrap_err(); assert!(matches!(err, DeserializeError::Io(_))); } #[test] fn le_short() { use super::LeSer; assert_eq!(SHORT1.serialized_size().unwrap(), 5); let encoded = SHORT1.ser().unwrap(); assert_eq!(encoded, SHORT1_ENC_LE); let decoded = ShortStruct::des(SHORT2_ENC_LE).unwrap(); assert_eq!(decoded, SHORT2); // with trailing data let decoded = ShortStruct::des_prefix(SHORT2_ENC_LE_TRAILING).unwrap(); assert_eq!(decoded, SHORT2); let err = ShortStruct::des(SHORT2_ENC_LE_TRAILING).unwrap_err(); assert!(matches!(err, DeserializeError::BadInput)); // serialize into a `Write` sink. let mut buf = Cursor::new(vec![0xFF; 8]); SHORT1.ser_into(&mut buf).unwrap(); assert_eq!(buf.into_inner(), SHORT1_ENC_LE_TRAILING); // deserialize from a `Write` sink. let mut buf = Cursor::new(SHORT2_ENC_LE); let decoded = ShortStruct::des_from(&mut buf).unwrap(); assert_eq!(decoded, SHORT2); // deserialize from a `Write` sink that terminates early. 
let mut buf = Cursor::new([0u8; 4]); let err = ShortStruct::des_from(&mut buf).unwrap_err(); assert!(matches!(err, DeserializeError::Io(_))); } #[test] fn be_long() { use super::BeSer; assert_eq!(LONG1.serialized_size().unwrap(), 30); let msg = LONG1; let encoded = msg.ser().unwrap(); let expected = hex_literal::hex!( "2A 1000 2000 1234 2345 3456 4567 9876 5432 10FE DCBA 7788 99AA BBCC DDFF 01" ); assert_eq!(encoded, expected); let msg2 = LongMsg::des(&encoded).unwrap(); assert_eq!(msg, msg2); } #[test] fn le_long() { use super::LeSer; assert_eq!(LONG1.serialized_size().unwrap(), 30); let msg = LONG1; let encoded = msg.ser().unwrap(); let expected = hex_literal::hex!( "2A 0020 0010 6745 5634 4523 3412 BADC FE10 3254 7698 FFDD CCBB AA99 8877 01" ); assert_eq!(encoded, expected); let msg2 = LongMsg::des(&encoded).unwrap(); assert_eq!(msg, msg2); } #[test] /// Ensure that newtype wrappers around u32 don't change the serialization format fn be_nt() { use super::BeSer; assert_eq!(NT1.serialized_size().unwrap(), 4); let msg = NT1; let encoded = msg.ser().unwrap(); let expected = hex_literal::hex!("0006 5223"); assert_eq!(encoded, expected); assert_eq!(encoded, NT1_INNER.ser().unwrap()); let msg2 = NewTypeStruct::des(&encoded).unwrap(); assert_eq!(msg, msg2); } #[test] /// Ensure that newtype wrappers around u32 don't change the serialization format fn le_nt() { use super::LeSer; assert_eq!(NT1.serialized_size().unwrap(), 4); let msg = NT1; let encoded = msg.ser().unwrap(); let expected = hex_literal::hex!("2352 0600"); assert_eq!(encoded, expected); assert_eq!(encoded, NT1_INNER.ser().unwrap()); let msg2 = NewTypeStruct::des(&encoded).unwrap(); assert_eq!(msg, msg2); } } ================================================ FILE: libs/utils/src/circuit_breaker.rs ================================================ use std::fmt::Display; use std::time::{Duration, Instant}; use metrics::IntCounter; /// Circuit breakers are for operations that are expensive and fallible. 
/// /// If a circuit breaker fails repeatedly, we will stop attempting it for some /// period of time, to avoid denial-of-service from retries, and /// to mitigate the log spam from repeated failures. pub struct CircuitBreaker { /// An identifier that enables us to log useful errors when a circuit is broken name: String, /// Consecutive failures since last success fail_count: usize, /// How many consecutive failures before we break the circuit fail_threshold: usize, /// If circuit is broken, when was it broken? broken_at: Option, /// If set, we will auto-reset the circuit this long after it was broken. If None, broken /// circuits stay broken forever, or until success() is called. reset_period: Option, /// If this is true, no actual circuit-breaking happens. This is for overriding a circuit breaker /// to permit something to keep running even if it would otherwise have tripped it. short_circuit: bool, } impl CircuitBreaker { pub fn new(name: String, fail_threshold: usize, reset_period: Option) -> Self { Self { name, fail_count: 0, fail_threshold, broken_at: None, reset_period, short_circuit: false, } } /// Construct an unbreakable circuit breaker, for use in unit tests etc. 
pub fn short_circuit() -> Self { Self { name: String::new(), fail_threshold: 0, fail_count: 0, broken_at: None, reset_period: None, short_circuit: true, } } pub fn fail(&mut self, metric: &IntCounter, error: E) where E: Display, { if self.short_circuit { return; } self.fail_count += 1; if self.broken_at.is_none() && self.fail_count >= self.fail_threshold { self.break_circuit(metric, error); } } /// Call this after successfully executing an operation pub fn success(&mut self, metric: &IntCounter) { self.fail_count = 0; if let Some(broken_at) = &self.broken_at { tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})", humantime::format_duration(broken_at.elapsed())); self.broken_at = None; metric.inc(); } } /// Call this before attempting an operation, and skip the operation if we are currently broken. pub fn is_broken(&mut self) -> bool { if self.short_circuit { return false; } if let Some(broken_at) = self.broken_at { match self.reset_period { Some(reset_period) if broken_at.elapsed() > reset_period => { self.reset_circuit(); false } _ => true, } } else { false } } fn break_circuit(&mut self, metric: &IntCounter, error: E) where E: Display, { self.broken_at = Some(Instant::now()); tracing::error!(breaker=%self.name, "Circuit breaker broken! Last error: {error}"); metric.inc(); } fn reset_circuit(&mut self) { self.broken_at = None; self.fail_count = 0; } } ================================================ FILE: libs/utils/src/completion.rs ================================================ use tokio_util::task::TaskTracker; use tokio_util::task::task_tracker::TaskTrackerToken; /// While a reference is kept around, the associated [`Barrier::wait`] will wait. /// /// Can be cloned, moved and kept around in futures as "guard objects". 
#[derive(Clone)] pub struct Completion { token: TaskTrackerToken, } impl std::fmt::Debug for Completion { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("Completion") .field("siblings", &self.token.task_tracker().len()) .finish() } } impl Completion { /// Returns true if this completion is associated with the given barrier. pub fn blocks(&self, barrier: &Barrier) -> bool { TaskTracker::ptr_eq(self.token.task_tracker(), &barrier.0) } pub fn barrier(&self) -> Barrier { Barrier(self.token.task_tracker().clone()) } } /// Barrier will wait until all clones of [`Completion`] have been dropped. #[derive(Clone)] pub struct Barrier(TaskTracker); impl std::fmt::Debug for Barrier { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("Barrier") .field("remaining", &self.0.len()) .finish() } } impl Default for Barrier { fn default() -> Self { let (_, rx) = channel(); rx } } impl Barrier { pub async fn wait(self) { self.0.wait().await; } pub async fn maybe_wait(barrier: Option) { if let Some(b) = barrier { b.wait().await } } /// Return true if a call to wait() would complete immediately pub fn is_ready(&self) -> bool { futures::future::FutureExt::now_or_never(self.0.wait()).is_some() } } impl PartialEq for Barrier { fn eq(&self, other: &Self) -> bool { TaskTracker::ptr_eq(&self.0, &other.0) } } impl Eq for Barrier {} /// Create new Guard and Barrier pair. pub fn channel() -> (Completion, Barrier) { let tracker = TaskTracker::new(); // otherwise wait never exits tracker.close(); let token = tracker.token(); (Completion { token }, Barrier(tracker)) } ================================================ FILE: libs/utils/src/crashsafe.rs ================================================ use std::borrow::Cow; use std::fs::{self, File}; use std::io::{self, Write}; use std::os::fd::AsFd; use camino::{Utf8Path, Utf8PathBuf}; /// Similar to [`std::fs::create_dir`], except we fsync the /// created directory and its parent. 
pub fn create_dir(path: impl AsRef<Utf8Path>) -> io::Result<()> {
    let path = path.as_ref();

    fs::create_dir(path)?;
    fsync_file_and_parent(path)?;
    Ok(())
}

/// Similar to [`std::fs::create_dir_all`], except we fsync all
/// newly created directories and the pre-existing parent.
///
/// Returns an `AlreadyExists` error if a non-directory is found along the path.
/// A relative path whose first missing component has no explicit parent
/// (e.g. `foo` or `foo/bar`) is resolved against the current directory.
pub fn create_dir_all(path: impl AsRef<Utf8Path>) -> io::Result<()> {
    let mut path = path.as_ref();

    let mut dirs_to_create = Vec::new();

    // Figure out which directories we need to create.
    loop {
        match path.metadata() {
            Ok(metadata) if metadata.is_dir() => break,
            Ok(_) => {
                return Err(io::Error::new(
                    io::ErrorKind::AlreadyExists,
                    format!("non-directory found in path: {path}"),
                ));
            }
            Err(ref e) if e.kind() == io::ErrorKind::NotFound => {}
            Err(e) => return Err(e),
        }

        dirs_to_create.push(path);

        match path.parent() {
            // The parent of a bare relative name such as `foo` is the empty path,
            // which cannot be stat'ed or opened. It denotes the current directory,
            // so use `.` as the pre-existing parent and stop walking up.
            Some(parent) if parent.as_str().is_empty() => {
                path = Utf8Path::new(".");
                break;
            }
            Some(parent) => path = parent,
            None => {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidInput,
                    format!("can't find parent of path '{path}'"),
                ));
            }
        }
    }

    // Create directories from parent to child.
    for &path in dirs_to_create.iter().rev() {
        fs::create_dir(path)?;
    }

    // Fsync the created directories from child to parent.
    for &path in dirs_to_create.iter() {
        fsync(path)?;
    }

    // If we created any new directories, fsync the parent.
    // At this point `path` is the closest ancestor that already existed.
    if !dirs_to_create.is_empty() {
        fsync(path)?;
    }

    Ok(())
}

/// Adds a suffix to the file(directory) name, either appending the suffix to the end of its extension,
/// or if there's no extension, creates one and puts a suffix there.
pub fn path_with_suffix_extension( original_path: impl AsRef, suffix: &str, ) -> Utf8PathBuf { let new_extension = match original_path.as_ref().extension() { Some(extension) => Cow::Owned(format!("{extension}.{suffix}")), None => Cow::Borrowed(suffix), }; original_path.as_ref().with_extension(new_extension) } pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> { let parent = file_path .parent() .ok_or_else(|| io::Error::other(format!("File {file_path:?} has no parent")))?; fsync(file_path)?; fsync(parent)?; Ok(()) } pub fn fsync(path: &Utf8Path) -> io::Result<()> { File::open(path) .map_err(|e| io::Error::new(e.kind(), format!("Failed to open the file {path:?}: {e}"))) .and_then(|file| { file.sync_all().map_err(|e| { io::Error::new( e.kind(), format!("Failed to sync file {path:?} data and metadata: {e}"), ) }) }) .map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}"))) } pub async fn fsync_async(path: impl AsRef) -> Result<(), std::io::Error> { tokio::fs::File::open(path.as_ref()).await?.sync_all().await } pub async fn fsync_async_opt( path: impl AsRef, do_fsync: bool, ) -> Result<(), std::io::Error> { if do_fsync { fsync_async(path.as_ref()).await?; } Ok(()) } /// Like postgres' durable_rename, renames a file and issues fsyncs to make it durable. After /// returning, both the file and rename are guaranteed to be persisted. Both paths must be on the /// same file system. /// /// Unlike postgres, it only fsyncs 1) the file to make contents durable, and 2) the directory to /// make the rename durable. This sequence ensures the target file will never be incomplete. /// /// Postgres also: /// /// * Fsyncs the target file, if it exists, before the rename, to ensure either the new or existing /// file survives a crash. Current callers don't need this as it should already be fsynced if /// durability is needed. /// /// * Fsyncs the file after the rename. This can be required with certain OSes or file systems (e.g. 
/// NFS), but not on Linux with most common file systems like ext4 (which we currently use). /// /// An audit of 8 other databases found that none fsynced the file after a rename: /// /// /// eBPF probes confirmed that this is sufficient with ext4, XFS, and ZFS, but possibly not Btrfs: /// /// /// virtual_file.rs has similar code, but it doesn't use vfs. /// /// Useful links: /// /// pub async fn durable_rename( old_path: impl AsRef, new_path: impl AsRef, do_fsync: bool, ) -> io::Result<()> { // first fsync the file fsync_async_opt(old_path.as_ref(), do_fsync).await?; // Time to do the real deal. tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?; // Now fsync the parent let parent = match new_path.as_ref().parent() { Some(p) => p, None => Utf8Path::new("./"), // assume current dir if there is no parent }; fsync_async_opt(parent, do_fsync).await?; Ok(()) } /// Writes a file to the specified `final_path` in a crash safe fasion, using [`std::fs`]. /// /// The file is first written to the specified `tmp_path`, and in a second /// step, the `tmp_path` is renamed to the `final_path`. Intermediary fsync /// and atomic rename guarantee that, if we crash at any point, there will never /// be a partially written file at `final_path` (but maybe at `tmp_path`). /// /// Callers are responsible for serializing calls of this function for a given `final_path`. /// If they don't, there may be an error due to conflicting `tmp_path`, or there will /// be no error and the content of `final_path` will be the "winner" caller's `content`. /// I.e., the atomticity guarantees still hold. 
pub fn overwrite( final_path: &Utf8Path, tmp_path: &Utf8Path, content: &[u8], ) -> std::io::Result<()> { let Some(final_path_parent) = final_path.parent() else { return Err(std::io::Error::from_raw_os_error( nix::errno::Errno::EINVAL as i32, )); }; std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?; let mut file = std::fs::OpenOptions::new() .write(true) // Use `create_new` so that, if we race with ourselves or something else, // we bail out instead of causing damage. .create_new(true) .open(tmp_path)?; file.write_all(content)?; file.sync_all()?; drop(file); // don't keep the fd open for longer than we have to std::fs::rename(tmp_path, final_path)?; let final_parent_dirfd = std::fs::OpenOptions::new() .read(true) .open(final_path_parent)?; final_parent_dirfd.sync_all()?; Ok(()) } /// Syncs the filesystem for the given file descriptor. #[cfg_attr(target_os = "macos", allow(unused_variables))] pub fn syncfs(fd: impl AsFd) -> anyhow::Result<()> { // Linux guarantees durability for syncfs. // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync(). #[cfg(target_os = "linux")] { use anyhow::Context; nix::unistd::syncfs(fd).context("syncfs")?; } #[cfg(target_os = "macos")] { // macOS is not a production platform for Neon, don't even bother. 
} #[cfg(not(any(target_os = "linux", target_os = "macos")))] { compile_error!("Unsupported OS"); } Ok(()) } #[cfg(test)] mod tests { use super::*; #[test] fn test_create_dir_fsyncd() { let dir = camino_tempfile::tempdir().unwrap(); let existing_dir_path = dir.path(); let err = create_dir(existing_dir_path).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::AlreadyExists); let child_dir = existing_dir_path.join("child"); create_dir(child_dir).unwrap(); let nested_child_dir = existing_dir_path.join("child1").join("child2"); let err = create_dir(nested_child_dir).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::NotFound); } #[test] fn test_create_dir_all_fsyncd() { let dir = camino_tempfile::tempdir().unwrap(); let existing_dir_path = dir.path(); create_dir_all(existing_dir_path).unwrap(); let child_dir = existing_dir_path.join("child"); assert!(!child_dir.exists()); create_dir_all(&child_dir).unwrap(); assert!(child_dir.exists()); let nested_child_dir = existing_dir_path.join("child1").join("child2"); assert!(!nested_child_dir.exists()); create_dir_all(&nested_child_dir).unwrap(); assert!(nested_child_dir.exists()); let file_path = existing_dir_path.join("file"); std::fs::write(&file_path, b"").unwrap(); let err = create_dir_all(&file_path).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::AlreadyExists); let invalid_dir_path = file_path.join("folder"); create_dir_all(invalid_dir_path).unwrap_err(); } #[test] fn test_path_with_suffix_extension() { let p = Utf8PathBuf::from("/foo/bar"); assert_eq!( &path_with_suffix_extension(p, "temp").to_string(), "/foo/bar.temp" ); let p = Utf8PathBuf::from("/foo/bar"); assert_eq!( &path_with_suffix_extension(p, "temp.temp").to_string(), "/foo/bar.temp.temp" ); let p = Utf8PathBuf::from("/foo/bar.baz"); assert_eq!( &path_with_suffix_extension(p, "temp.temp").to_string(), "/foo/bar.baz.temp.temp" ); let p = Utf8PathBuf::from("/foo/bar.baz"); assert_eq!( &path_with_suffix_extension(p, ".temp").to_string(), "/foo/bar.baz..temp" 
); let p = Utf8PathBuf::from("/foo/bar/dir/"); assert_eq!( &path_with_suffix_extension(p, ".temp").to_string(), "/foo/bar/dir..temp" ); } } ================================================ FILE: libs/utils/src/elapsed_accum.rs ================================================ use std::time::{Duration, Instant}; #[derive(Default)] pub struct ElapsedAccum { accum: Duration, } impl ElapsedAccum { pub fn get(&self) -> Duration { self.accum } pub fn guard(&mut self) -> impl Drop + '_ { let start = Instant::now(); scopeguard::guard(start, |last_wait_at| { self.accum += Instant::now() - last_wait_at; }) } pub async fn measure(&mut self, fut: Fut) -> O where Fut: Future, { let _guard = self.guard(); fut.await } } ================================================ FILE: libs/utils/src/env.rs ================================================ //! Wrapper around `std::env::var` for parsing environment variables. use std::fmt::Display; use std::str::FromStr; /// For types `V` that implement [`FromStr`]. pub fn var(varname: &str) -> Option where V: FromStr, E: Display, { match std::env::var(varname) { Ok(s) => Some( s.parse() .map_err(|e| { format!("failed to parse env var {varname} using FromStr::parse: {e:#}") }) .unwrap(), ), Err(std::env::VarError::NotPresent) => None, Err(std::env::VarError::NotUnicode(_)) => { panic!("env var {varname} is not unicode") } } } /// For types `V` that implement [`serde::de::DeserializeOwned`]. 
pub fn var_serde_json_string(varname: &str) -> Option where V: serde::de::DeserializeOwned, { match std::env::var(varname) { Ok(s) => Some({ let value = serde_json::Value::String(s); serde_json::from_value(value) .map_err(|e| { format!("failed to parse env var {varname} as a serde_json json string: {e:#}") }) .unwrap() }), Err(std::env::VarError::NotPresent) => None, Err(std::env::VarError::NotUnicode(_)) => { panic!("env var {varname} is not unicode") } } } /* BEGIN_HADRON */ pub enum DeploymentMode { Local, Dev, Staging, Prod, } pub fn get_deployment_mode() -> Option { match std::env::var("DEPLOYMENT_MODE") { Ok(env) => match env.as_str() { "development" => Some(DeploymentMode::Dev), "staging" => Some(DeploymentMode::Staging), "production" => Some(DeploymentMode::Prod), _ => { tracing::error!("Unexpected DEPLOYMENT_MODE: {}", env); None } }, Err(_) => { // tracing::error!("DEPLOYMENT_MODE not set"); None } } } pub fn is_dev_or_staging() -> bool { matches!( get_deployment_mode(), Some(DeploymentMode::Dev) | Some(DeploymentMode::Staging) ) } pub enum TestingMode { Chaos, Stress, } pub fn get_test_mode() -> Option { match std::env::var("HADRON_TEST_MODE") { Ok(env) => match env.as_str() { "chaos" => Some(TestingMode::Chaos), "stress" => Some(TestingMode::Stress), _ => { tracing::error!("Unexpected HADRON_TEST_MODE: {}", env); None } }, Err(_) => { tracing::error!("HADRON_TEST_MODE not set"); None } } } pub fn is_chaos_testing() -> bool { matches!(get_test_mode(), Some(TestingMode::Chaos)) } /* END_HADRON */ ================================================ FILE: libs/utils/src/error.rs ================================================ /// Create a reporter for an error that outputs similar to [`anyhow::Error`] with Display with alternative setting. /// /// It can be used with `anyhow::Error` as well. /// /// Why would one use this instead of converting to `anyhow::Error` on the spot? 
Because /// anyhow::Error would also capture a stacktrace on the spot, which you would later discard after /// formatting. /// /// ## Usage /// /// ```rust /// #[derive(Debug, thiserror::Error)] /// enum MyCoolError { /// #[error("should never happen")] /// Bad(#[source] std::io::Error), /// } /// /// # fn failing_call() -> Result<(), MyCoolError> { Err(MyCoolError::Bad(std::io::ErrorKind::PermissionDenied.into())) } /// /// # fn main() { /// use utils::error::report_compact_sources; /// /// if let Err(e) = failing_call() { /// let e = report_compact_sources(&e); /// assert_eq!(format!("{e}"), "should never happen: permission denied"); /// } /// # } /// ``` /// /// ## TODO /// /// When we are able to describe return position impl trait in traits, this should of course be an /// extension trait. Until then avoid boxing with this more ackward interface. pub fn report_compact_sources(e: &E) -> impl std::fmt::Display + '_ { struct AnyhowDisplayAlternateAlike<'a, E>(&'a E); impl std::fmt::Display for AnyhowDisplayAlternateAlike<'_, E> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0)?; // why is E a generic parameter here? 
hope that rustc will see through a default // Error::source implementation and leave the following out if there cannot be any // sources: Sources(self.0.source()).try_for_each(|src| write!(f, ": {src}")) } } struct Sources<'a>(Option<&'a (dyn std::error::Error + 'static)>); impl<'a> Iterator for Sources<'a> { type Item = &'a (dyn std::error::Error + 'static); fn next(&mut self) -> Option { let rem = self.0; let next = self.0.and_then(|x| x.source()); self.0 = next; rem } } AnyhowDisplayAlternateAlike(e) } #[cfg(test)] mod tests { use super::report_compact_sources; #[test] fn report_compact_sources_examples() { use std::fmt::Write; #[derive(Debug, thiserror::Error)] enum EvictionError { #[error("cannot evict a remote layer")] CannotEvictRemoteLayer, #[error("stat failed")] StatFailed(#[source] std::io::Error), #[error("layer was no longer part of LayerMap")] LayerNotFound(#[source] anyhow::Error), } let examples = [ ( line!(), EvictionError::CannotEvictRemoteLayer, "cannot evict a remote layer", ), ( line!(), EvictionError::StatFailed(std::io::ErrorKind::PermissionDenied.into()), "stat failed: permission denied", ), ( line!(), EvictionError::LayerNotFound(anyhow::anyhow!("foobar")), "layer was no longer part of LayerMap: foobar", ), ]; let mut s = String::new(); for (line, example, expected) in examples { s.clear(); write!(s, "{}", report_compact_sources(&example)).expect("string grows"); assert_eq!(s, expected, "example on line {line}"); } } } ================================================ FILE: libs/utils/src/failpoint_support.rs ================================================ //! Failpoint support code shared between pageserver and safekeepers. use tokio_util::sync::CancellationToken; /// Declare a failpoint that can use to `pause` failpoint action. /// We don't want to block the executor thread, hence, spawn_blocking + await. /// /// Optionally pass a cancellation token, and this failpoint will drop out of /// its pause when the cancellation token fires. 
This is useful for testing /// cases where we would like to block something, but test its clean shutdown behavior. /// The macro evaluates to a Result in that case, where Ok(()) is the case /// where the failpoint was not paused, and Err() is the case where cancellation /// token fired while evaluating the failpoint. /// /// Remember to unpause the failpoint in the test; until that happens, one of the /// limited number of spawn_blocking thread pool threads is leaked. #[macro_export] macro_rules! pausable_failpoint { ($name:literal) => {{ if cfg!(feature = "testing") { let cancel = ::tokio_util::sync::CancellationToken::new(); let _ = $crate::pausable_failpoint!($name, &cancel); } }}; ($name:literal, $cancel:expr) => {{ if cfg!(feature = "testing") { let failpoint_fut = ::tokio::task::spawn_blocking({ let current = ::tracing::Span::current(); move || { let _entered = current.entered(); ::tracing::info!("at failpoint {}", $name); ::fail::fail_point!($name); } }); let cancel_fut = async move { $cancel.cancelled().await; }; ::tokio::select! { res = failpoint_fut => { res.expect("spawn_blocking"); // continue with execution Ok(()) }, _ = cancel_fut => { Err(()) } } } else { Ok(()) } }}; } pub use pausable_failpoint; /// use with fail::cfg("$name", "return(2000)") /// /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the /// specified time (in milliseconds). The main difference is that we use async /// tokio sleep function. Another difference is that we print lines to the log, /// which can be useful in tests to check that the failpoint was hit. /// /// Optionally pass a cancellation token, and this failpoint will drop out of /// its sleep when the cancellation token fires. This is useful for testing /// cases where we would like to block something, but test its clean shutdown behavior. #[macro_export] macro_rules! 
__failpoint_sleep_millis_async { ($name:literal) => {{ // If the failpoint is used with a "return" action, set should_sleep to the // returned value (as string). Otherwise it's set to None. let should_sleep = (|| { ::fail::fail_point!($name, |x| x); ::std::option::Option::None })(); // Sleep if the action was a returned value if let ::std::option::Option::Some(duration_str) = should_sleep { $crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await } }}; ($name:literal, $cancel:expr) => {{ // If the failpoint is used with a "return" action, set should_sleep to the // returned value (as string). Otherwise it's set to None. let should_sleep = (|| { ::fail::fail_point!($name, |x| x); ::std::option::Option::None })(); // Sleep if the action was a returned value if let ::std::option::Option::Some(duration_str) = should_sleep { $crate::failpoint_support::failpoint_sleep_cancellable_helper( $name, duration_str, $cancel, ) .await } }}; } pub use __failpoint_sleep_millis_async as sleep_millis_async; // Helper function used by the macro. (A function has nicer scoping so we // don't need to decorate everything with "::") #[doc(hidden)] pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) { let millis = duration_str.parse::().unwrap(); let d = std::time::Duration::from_millis(millis); tracing::info!("failpoint {:?}: sleeping for {:?}", name, d); tokio::time::sleep(d).await; tracing::info!("failpoint {:?}: sleep done", name); } // Helper function used by the macro. 
(A function has nicer scoping so we // don't need to decorate everything with "::") #[doc(hidden)] pub async fn failpoint_sleep_cancellable_helper( name: &'static str, duration_str: String, cancel: &CancellationToken, ) { let millis = duration_str.parse::().unwrap(); let d = std::time::Duration::from_millis(millis); tracing::info!("failpoint {:?}: sleeping for {:?}", name, d); tokio::time::timeout(d, cancel.cancelled()).await.ok(); tracing::info!("failpoint {:?}: sleep done", name); } /// Initialize the configured failpoints /// /// You must call this function before any concurrent threads do operations. pub fn init() -> fail::FailScenario<'static> { // The failpoints lib provides support for parsing the `FAILPOINTS` env var. // We want non-default behavior for `exit`, though, so, we handle it separately. // // Format for FAILPOINTS is "name=actions" separated by ";". let actions = std::env::var("FAILPOINTS"); if actions.is_ok() { // SAFETY: this function should before any threads start and access env vars concurrently unsafe { std::env::remove_var("FAILPOINTS"); } } else { // let the library handle non-utf8, or nothing for not present } let scenario = fail::FailScenario::setup(); if let Ok(val) = actions { val.split(';') .enumerate() .map(|(i, s)| s.split_once('=').ok_or((i, s))) .for_each(|res| { let (name, actions) = match res { Ok(t) => t, Err((i, s)) => { panic!( "startup failpoints: missing action on the {}th failpoint; try `{s}=return`", i + 1, ); } }; if let Err(e) = apply_failpoint(name, actions) { panic!("startup failpoints: failed to apply failpoint {name}={actions}: {e}"); } }); } scenario } pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> { if actions == "exit" { fail::cfg_callback(name, exit_failpoint) } else { fail::cfg(name, actions) } } #[inline(never)] fn exit_failpoint() { tracing::info!("Exit requested by failpoint"); std::process::exit(1); } ================================================ FILE: 
libs/utils/src/fs_ext/rename_noreplace.rs ================================================ use nix::NixPath; /// Rename a file without replacing an existing file. /// /// This is a wrapper around platform-specific APIs. pub fn rename_noreplace( src: &P1, dst: &P2, ) -> nix::Result<()> { { #[cfg(all(target_os = "linux", target_env = "gnu"))] { nix::fcntl::renameat2( nix::fcntl::AT_FDCWD, src, nix::fcntl::AT_FDCWD, dst, nix::fcntl::RenameFlags::RENAME_NOREPLACE, ) } #[cfg(target_os = "macos")] { let res = src.with_nix_path(|src| { dst.with_nix_path(|dst| // SAFETY: `src` and `dst` are valid C strings as per the NixPath trait and they outlive the call to renamex_np. unsafe { nix::libc::renamex_np(src.as_ptr(), dst.as_ptr(), nix::libc::RENAME_EXCL) }) })??; nix::errno::Errno::result(res).map(drop) } #[cfg(not(any(all(target_os = "linux", target_env = "gnu"), target_os = "macos")))] { std::compile_error!("OS does not support no-replace renames"); } } } #[cfg(test)] mod test { use std::fs; use std::path::PathBuf; use super::*; fn testdir() -> camino_tempfile::Utf8TempDir { match crate::env::var("NEON_UTILS_RENAME_NOREPLACE_TESTDIR") { Some(path) => { let path: camino::Utf8PathBuf = path; camino_tempfile::tempdir_in(path).unwrap() } None => camino_tempfile::tempdir().unwrap(), } } #[test] fn test_absolute_paths() { let testdir = testdir(); println!("testdir: {}", testdir.path()); let src = testdir.path().join("src"); let dst = testdir.path().join("dst"); fs::write(&src, b"").unwrap(); fs::write(&dst, b"").unwrap(); let src = src.canonicalize().unwrap(); assert!(src.is_absolute()); let dst = dst.canonicalize().unwrap(); assert!(dst.is_absolute()); let result = rename_noreplace(&src, &dst); assert_eq!(result.unwrap_err(), nix::Error::EEXIST); } #[test] fn test_relative_paths() { let testdir = testdir(); println!("testdir: {}", testdir.path()); // this is fine because we run in nextest => process per test std::env::set_current_dir(testdir.path()).unwrap(); let src = 
PathBuf::from("src"); let dst = PathBuf::from("dst"); fs::write(&src, b"").unwrap(); fs::write(&dst, b"").unwrap(); let result = rename_noreplace(&src, &dst); assert_eq!(result.unwrap_err(), nix::Error::EEXIST); } #[test] fn test_works_when_not_exists() { let testdir = testdir(); println!("testdir: {}", testdir.path()); let src = testdir.path().join("src"); let dst = testdir.path().join("dst"); fs::write(&src, b"content").unwrap(); rename_noreplace(src.as_std_path(), dst.as_std_path()).unwrap(); assert_eq!( "content", String::from_utf8(std::fs::read(&dst).unwrap()).unwrap() ); } } ================================================ FILE: libs/utils/src/fs_ext.rs ================================================ /// Extensions to `std::fs` types. use std::{fs, io, path::Path}; use anyhow::Context; #[cfg(feature = "rename_noreplace")] mod rename_noreplace; #[cfg(feature = "rename_noreplace")] pub use rename_noreplace::rename_noreplace; pub trait PathExt { /// Returns an error if `self` is not a directory. fn is_empty_dir(&self) -> io::Result; } impl

PathExt for P where P: AsRef, { fn is_empty_dir(&self) -> io::Result { Ok(fs::read_dir(self)?.next().is_none()) } } pub async fn is_directory_empty(path: impl AsRef) -> anyhow::Result { let mut dir = tokio::fs::read_dir(&path) .await .context(format!("read_dir({})", path.as_ref().display()))?; Ok(dir.next_entry().await?.is_none()) } pub async fn list_dir(path: impl AsRef) -> anyhow::Result> { let mut dir = tokio::fs::read_dir(&path) .await .context(format!("read_dir({})", path.as_ref().display()))?; let mut content = vec![]; while let Some(next) = dir.next_entry().await? { let file_name = next.file_name(); content.push(file_name.to_string_lossy().to_string()); } Ok(content) } pub fn ignore_not_found(e: io::Error) -> io::Result<()> { if e.kind() == io::ErrorKind::NotFound { Ok(()) } else { Err(e) } } pub fn ignore_absent_files(fs_operation: F) -> io::Result<()> where F: Fn() -> io::Result<()>, { fs_operation().or_else(ignore_not_found) } #[cfg(test)] mod test { use super::ignore_absent_files; use crate::fs_ext::{is_directory_empty, list_dir}; #[test] fn is_empty_dir() { use super::PathExt; let dir = camino_tempfile::tempdir().unwrap(); let dir_path = dir.path(); // test positive case assert!( dir_path.is_empty_dir().expect("test failure"), "new tempdir should be empty" ); // invoke on a file to ensure it returns an error let file_path = dir_path.join("testfile"); let f = std::fs::File::create(&file_path).unwrap(); drop(f); assert!(file_path.is_empty_dir().is_err()); // do it again on a path, we know to be nonexistent std::fs::remove_file(&file_path).unwrap(); assert!(file_path.is_empty_dir().is_err()); } #[tokio::test] async fn is_empty_dir_async() { let dir = camino_tempfile::tempdir().unwrap(); let dir_path = dir.path(); // test positive case assert!( is_directory_empty(dir_path).await.expect("test failure"), "new tempdir should be empty" ); // invoke on a file to ensure it returns an error let file_path = dir_path.join("testfile"); let f = 
std::fs::File::create(&file_path).unwrap(); drop(f); assert!(is_directory_empty(&file_path).await.is_err()); // do it again on a path, we know to be nonexistent std::fs::remove_file(&file_path).unwrap(); assert!(is_directory_empty(file_path).await.is_err()); } #[test] fn ignore_absent_files_works() { let dir = camino_tempfile::tempdir().unwrap(); let file_path = dir.path().join("testfile"); ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally"); let f = std::fs::File::create(&file_path).unwrap(); drop(f); ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally"); assert!(!file_path.exists()); } #[tokio::test] async fn list_dir_works() { let dir = camino_tempfile::tempdir().unwrap(); let dir_path = dir.path(); assert!(list_dir(dir_path).await.unwrap().is_empty()); let file_path = dir_path.join("testfile"); let _ = std::fs::File::create(&file_path).unwrap(); assert_eq!(&list_dir(dir_path).await.unwrap(), &["testfile"]); let another_dir_path = dir_path.join("testdir"); std::fs::create_dir(another_dir_path).unwrap(); let expected = &["testdir", "testfile"]; let mut actual = list_dir(dir_path).await.unwrap(); actual.sort(); assert_eq!(actual, expected); } } ================================================ FILE: libs/utils/src/generation.rs ================================================ use std::fmt::Debug; use serde::{Deserialize, Serialize}; /// Tenant generations are used to provide split-brain safety and allow /// multiple pageservers to attach the same tenant concurrently. /// /// See docs/rfcs/025-generation-numbers.md for detail on how generation /// numbers are used. #[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] pub enum Generation { // The None Generation is used in the metadata of layers written before generations were // introduced. A running Tenant always has a valid generation, but the layer metadata may // include None generations. 
None, Valid(u32), } /// The Generation type represents a number associated with a Tenant, which /// increments every time the tenant is attached to a new pageserver, or /// an attached pageserver restarts. /// /// It is included as a suffix in S3 keys, as a protection against split-brain /// scenarios where pageservers might otherwise issue conflicting writes to /// remote storage impl Generation { pub const MAX: Self = Self::Valid(u32::MAX); /// Create a new Generation that represents a legacy key format with /// no generation suffix pub fn none() -> Self { Self::None } pub const fn new(v: u32) -> Self { Self::Valid(v) } pub fn is_none(&self) -> bool { matches!(self, Self::None) } #[track_caller] pub fn get_suffix(&self) -> impl std::fmt::Display { match self { Self::Valid(v) => GenerationFileSuffix(Some(*v)), Self::None => GenerationFileSuffix(None), } } /// `suffix` is the part after "-" in a key /// /// Returns None if parsing was unsuccessful pub fn parse_suffix(suffix: &str) -> Option { u32::from_str_radix(suffix, 16).map(Generation::new).ok() } #[track_caller] pub fn previous(&self) -> Generation { match self { Self::Valid(n) => { if *n == 0 { // Since a tenant may be upgraded from a pre-generations state, interpret the "previous" generation // to 0 as being "no generation". 
Self::None } else { Self::Valid(n - 1) } } Self::None => Self::None, } } #[track_caller] pub fn next(&self) -> Generation { match self { Self::Valid(n) => Self::Valid(*n + 1), Self::None => Self::Valid(1), } } pub fn into(self) -> Option { if let Self::Valid(v) = self { Some(v) } else { None } } } struct GenerationFileSuffix(Option); impl std::fmt::Display for GenerationFileSuffix { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { if let Some(g) = self.0 { write!(f, "-{g:08x}") } else { Ok(()) } } } impl Serialize for Generation { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { if let Self::Valid(v) = self { v.serialize(serializer) } else { // We should never be asked to serialize a None. Structures // that include an optional generation should convert None to an // Option::None Err(serde::ser::Error::custom(format!( "Tried to serialize invalid generation ({self:?})" ))) } } } impl<'de> Deserialize<'de> for Generation { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { Ok(Self::Valid(u32::deserialize(deserializer)?)) } } // We intentionally do not implement Display for Generation, to reduce the // risk of a bug where the generation is used in a format!() string directly // instead of using get_suffix(). impl Debug for Generation { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Valid(v) => { write!(f, "{v:08x}") } Self::None => { write!(f, "") } } } } #[cfg(test)] mod test { use super::*; #[test] fn generation_gt() { // Important that a None generation compares less than a valid one, during upgrades from // pre-generation systems. 
assert!(Generation::none() < Generation::new(0)); assert!(Generation::none() < Generation::new(1)); } #[test] fn suffix_is_stable() { use std::fmt::Write as _; // the suffix must remain stable through-out the pageserver remote storage evolution and // not be changed accidentially without thinking about migration let examples = [ (line!(), Generation::None, ""), (line!(), Generation::Valid(0), "-00000000"), (line!(), Generation::Valid(u32::MAX), "-ffffffff"), ]; let mut s = String::new(); for (line, gen_, expected) in examples { s.clear(); write!(s, "{}", &gen_.get_suffix()).expect("string grows"); assert_eq!(s, expected, "example on {line}"); } } } ================================================ FILE: libs/utils/src/guard_arc_swap.rs ================================================ //! A wrapper around `ArcSwap` that ensures there is only one writer at a time and writes //! don't block reads. use std::sync::Arc; use arc_swap::ArcSwap; use tokio::sync::TryLockError; pub struct GuardArcSwap { inner: ArcSwap, guard: tokio::sync::Mutex<()>, } pub struct Guard<'a, T> { _guard: tokio::sync::MutexGuard<'a, ()>, inner: &'a ArcSwap, } impl GuardArcSwap { pub fn new(inner: T) -> Self { Self { inner: ArcSwap::new(Arc::new(inner)), guard: tokio::sync::Mutex::new(()), } } pub fn read(&self) -> Arc { self.inner.load_full() } pub async fn write_guard(&self) -> Guard<'_, T> { Guard { _guard: self.guard.lock().await, inner: &self.inner, } } pub fn try_write_guard(&self) -> Result, TryLockError> { let guard = self.guard.try_lock()?; Ok(Guard { _guard: guard, inner: &self.inner, }) } } impl Guard<'_, T> { pub fn read(&self) -> Arc { self.inner.load_full() } pub fn write(&mut self, value: T) { self.inner.store(Arc::new(value)); } } ================================================ FILE: libs/utils/src/hex.rs ================================================ /// Useful type for asserting that expected bytes match reporting the bytes more readable /// array-syntax compatible hex bytes. 
/// /// # Usage /// /// ``` /// use utils::Hex; /// /// let actual = serialize_something(); /// let expected = [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64]; /// /// // the type implements PartialEq and on mismatch, both sides are printed in 16 wide multiline /// // output suffixed with an array style length for easier comparisons. /// assert_eq!(Hex(&actual), Hex(&expected)); /// /// // with `let expected = [0x68];` the error would had been: /// // assertion `left == right` failed /// // left: [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64; 11] /// // right: [0x68; 1] /// # fn serialize_something() -> Vec { "hello world".as_bytes().to_vec() } /// ``` pub struct Hex(pub S); impl> std::fmt::Debug for Hex { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "[")?; let chunks = self.0.as_ref().chunks(16); for (i, c) in chunks.enumerate() { if i > 0 && !c.is_empty() { writeln!(f, ", ")?; } for (j, b) in c.iter().enumerate() { if j > 0 { write!(f, ", ")?; } write!(f, "0x{b:02x}")?; } } write!(f, "; {}]", self.0.as_ref().len()) } } impl, L: AsRef<[u8]>> PartialEq> for Hex { fn eq(&self, other: &Hex) -> bool { let left = self.0.as_ref(); let right = other.0.as_ref(); left == right } } ================================================ FILE: libs/utils/src/id.rs ================================================ use std::fmt; use std::num::ParseIntError; use std::str::FromStr; use anyhow::Context; use hex::FromHex; use rand::Rng; use serde::de::Visitor; use serde::{Deserialize, Serialize}; use thiserror::Error; #[derive(Error, Debug)] pub enum IdError { #[error("invalid id length {0}")] SliceParseError(usize), } /// Neon ID is a 128-bit random ID. /// Used to represent various identifiers. Provides handy utility methods and impls. /// /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. 
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] struct Id([u8; 16]); impl Serialize for Id { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { if serializer.is_human_readable() { serializer.collect_str(self) } else { self.0.serialize(serializer) } } } impl<'de> Deserialize<'de> for Id { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { struct IdVisitor { is_human_readable_deserializer: bool, } impl<'de> Visitor<'de> for IdVisitor { type Value = Id; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { if self.is_human_readable_deserializer { formatter.write_str("value in form of hex string") } else { formatter.write_str("value in form of integer array([u8; 16])") } } fn visit_seq(self, seq: A) -> Result where A: serde::de::SeqAccess<'de>, { let s = serde::de::value::SeqAccessDeserializer::new(seq); let id: [u8; 16] = Deserialize::deserialize(s)?; Ok(Id::from(id)) } fn visit_str(self, v: &str) -> Result where E: serde::de::Error, { Id::from_str(v).map_err(E::custom) } } if deserializer.is_human_readable() { deserializer.deserialize_str(IdVisitor { is_human_readable_deserializer: true, }) } else { deserializer.deserialize_tuple( 16, IdVisitor { is_human_readable_deserializer: false, }, ) } } } impl Id { pub fn from_slice(src: &[u8]) -> Result { if src.len() != 16 { return Err(IdError::SliceParseError(src.len())); } let mut id_array = [0u8; 16]; id_array.copy_from_slice(src); Ok(id_array.into()) } pub fn as_arr(&self) -> [u8; 16] { self.0 } pub fn generate() -> Self { let mut tli_buf = [0u8; 16]; rand::rng().fill(&mut tli_buf); Id::from(tli_buf) } fn hex_encode(&self) -> String { static HEX: &[u8] = b"0123456789abcdef"; let mut buf = vec![0u8; self.0.len() * 2]; for (&b, chunk) in self.0.as_ref().iter().zip(buf.chunks_exact_mut(2)) { chunk[0] = HEX[((b >> 4) & 0xf) as usize]; chunk[1] = HEX[(b & 0xf) as usize]; } // SAFETY: vec constructed out of `HEX`, it can only be ascii unsafe { 
String::from_utf8_unchecked(buf) } } } impl FromStr for Id { type Err = hex::FromHexError; fn from_str(s: &str) -> Result { Self::from_hex(s) } } // this is needed for pretty serialization and deserialization of Id's using serde integration with hex crate impl FromHex for Id { type Error = hex::FromHexError; fn from_hex>(hex: T) -> Result { let mut buf: [u8; 16] = [0u8; 16]; hex::decode_to_slice(hex, &mut buf)?; Ok(Id(buf)) } } impl AsRef<[u8]> for Id { fn as_ref(&self) -> &[u8] { &self.0 } } impl From<[u8; 16]> for Id { fn from(b: [u8; 16]) -> Self { Id(b) } } impl From for u128 { fn from(id: Id) -> Self { u128::from_le_bytes(id.0) } } impl fmt::Display for Id { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(&self.hex_encode()) } } impl fmt::Debug for Id { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(&self.hex_encode()) } } macro_rules! id_newtype { ($t:ident) => { impl $t { pub fn from_slice(src: &[u8]) -> Result<$t, IdError> { Ok($t(Id::from_slice(src)?)) } pub fn as_arr(&self) -> [u8; 16] { self.0.as_arr() } pub fn generate() -> Self { $t(Id::generate()) } pub const fn from_array(b: [u8; 16]) -> Self { $t(Id(b)) } } impl FromStr for $t { type Err = hex::FromHexError; fn from_str(s: &str) -> Result<$t, Self::Err> { let value = Id::from_str(s)?; Ok($t(value)) } } impl From<[u8; 16]> for $t { fn from(b: [u8; 16]) -> Self { $t(Id::from(b)) } } impl FromHex for $t { type Error = hex::FromHexError; fn from_hex>(hex: T) -> Result { Ok($t(Id::from_hex(hex)?)) } } impl AsRef<[u8]> for $t { fn as_ref(&self) -> &[u8] { &self.0.0 } } impl From<$t> for u128 { fn from(id: $t) -> Self { u128::from(id.0) } } impl fmt::Display for $t { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) } } impl fmt::Debug for $t { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) } } }; } /// Neon timeline ID. 
///
/// They are different from PostgreSQL timeline
/// IDs, but serve a similar purpose: they differentiate
/// between different "histories" of the same cluster. However,
/// PostgreSQL timeline IDs are a bit cumbersome, because they are only
/// 32 bits wide, and they must be in ascending order in any given
/// timeline history. Those limitations mean that we cannot generate a
/// new PostgreSQL timeline ID by just generating a random number. And
/// that in turn is problematic for the "pull/push" workflow, where you
/// have a local copy of a Neon repository, and you periodically sync
/// the local changes with a remote server. When you work "detached"
/// from the remote server, you cannot create a PostgreSQL timeline ID
/// that's guaranteed to be different from all existing timelines in
/// the remote server. For example, suppose two people each have a clone of
/// the repository on their laptops, and they both create a new branch
/// with a different name. What timeline ID would they assign to their
/// branches? If they pick the same one, and later try to push the
/// branches to the same remote server, they will get mixed up.
///
/// To avoid those issues, Neon has its own concept of timelines that
/// is separate from PostgreSQL timelines, and doesn't have those
/// limitations. A Neon timeline is identified by a 128-bit ID, which
/// is usually printed out as a hex string.
///
/// NOTE: (de)serialization is delegated to the inner [`Id`]: with human-readable
/// formats it is a hex string, otherwise it is an array of 16 bytes, which would look
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
/// See [`Id`] for alternative ways to serialize it.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] pub struct TimelineId(Id); id_newtype!(TimelineId); impl TryFrom> for TimelineId { type Error = anyhow::Error; fn try_from(value: Option<&str>) -> Result { value .unwrap_or_default() .parse::() .with_context(|| format!("Could not parse timeline id from {value:?}")) } } /// Neon Tenant Id represents identifiar of a particular tenant. /// Is used for distinguishing requests and data belonging to different users. /// /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. /// See [`Id`] for alternative ways to serialize it. #[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] pub struct TenantId(Id); id_newtype!(TenantId); /// If needed, reuse small string from proxy/src/types.rc pub type EndpointId = String; // A pair uniquely identifying Neon instance. #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct TenantTimelineId { pub tenant_id: TenantId, pub timeline_id: TimelineId, } impl TenantTimelineId { pub fn new(tenant_id: TenantId, timeline_id: TimelineId) -> Self { TenantTimelineId { tenant_id, timeline_id, } } pub fn generate() -> Self { Self::new(TenantId::generate(), TimelineId::generate()) } pub fn empty() -> Self { Self::new(TenantId::from([0u8; 16]), TimelineId::from([0u8; 16])) } } impl fmt::Display for TenantTimelineId { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}/{}", self.tenant_id, self.timeline_id) } } impl FromStr for TenantTimelineId { type Err = anyhow::Error; fn from_str(s: &str) -> Result { let mut parts = s.split('/'); let tenant_id = parts .next() .ok_or_else(|| anyhow::anyhow!("TenantTimelineId must contain tenant_id"))? .parse()?; let timeline_id = parts .next() .ok_or_else(|| anyhow::anyhow!("TenantTimelineId must contain timeline_id"))? 
.parse()?; if parts.next().is_some() { anyhow::bail!("TenantTimelineId must contain only tenant_id and timeline_id"); } Ok(TenantTimelineId::new(tenant_id, timeline_id)) } } // Unique ID of a storage node (safekeeper or pageserver). Supposed to be issued // by the console. #[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)] #[serde(transparent)] pub struct NodeId(pub u64); impl fmt::Display for NodeId { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.0) } } impl FromStr for NodeId { type Err = ParseIntError; fn from_str(s: &str) -> Result { Ok(NodeId(u64::from_str(s)?)) } } #[cfg(test)] mod tests { use serde_assert::{Deserializer, Serializer, Token, Tokens}; use super::*; use crate::bin_ser::BeSer; #[test] fn test_id_serde_non_human_readable() { let original_id = Id([ 173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24, ]); let expected_tokens = Tokens(vec![ Token::Tuple { len: 16 }, Token::U8(173), Token::U8(80), Token::U8(132), Token::U8(115), Token::U8(129), Token::U8(226), Token::U8(72), Token::U8(254), Token::U8(170), Token::U8(201), Token::U8(135), Token::U8(108), Token::U8(199), Token::U8(26), Token::U8(228), Token::U8(24), Token::TupleEnd, ]); let serializer = Serializer::builder().is_human_readable(false).build(); let serialized_tokens = original_id.serialize(&serializer).unwrap(); assert_eq!(serialized_tokens, expected_tokens); let mut deserializer = Deserializer::builder() .is_human_readable(false) .tokens(serialized_tokens) .build(); let deserialized_id = Id::deserialize(&mut deserializer).unwrap(); assert_eq!(deserialized_id, original_id); } #[test] fn test_id_serde_human_readable() { let original_id = Id([ 173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24, ]); let expected_tokens = Tokens(vec![Token::Str(String::from( "ad50847381e248feaac9876cc71ae418", ))]); let serializer = Serializer::builder().is_human_readable(true).build(); let 
serialized_tokens = original_id.serialize(&serializer).unwrap(); assert_eq!(serialized_tokens, expected_tokens); let mut deserializer = Deserializer::builder() .is_human_readable(true) .tokens(Tokens(vec![Token::Str(String::from( "ad50847381e248feaac9876cc71ae418", ))])) .build(); assert_eq!(Id::deserialize(&mut deserializer).unwrap(), original_id); } macro_rules! roundtrip_type { ($type:ty, $expected_bytes:expr) => {{ let expected_bytes: [u8; 16] = $expected_bytes; let original_id = <$type>::from(expected_bytes); let ser_bytes = original_id.ser().unwrap(); assert_eq!(ser_bytes, expected_bytes); let des_id = <$type>::des(&ser_bytes).unwrap(); assert_eq!(des_id, original_id); }}; } #[test] fn test_id_bincode_serde() { let expected_bytes = [ 173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24, ]; roundtrip_type!(Id, expected_bytes); } #[test] fn test_tenant_id_bincode_serde() { let expected_bytes = [ 173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24, ]; roundtrip_type!(TenantId, expected_bytes); } #[test] fn test_timeline_id_bincode_serde() { let expected_bytes = [ 173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24, ]; roundtrip_type!(TimelineId, expected_bytes); } } ================================================ FILE: libs/utils/src/ip_address.rs ================================================ use std::env::{VarError, var}; use std::error::Error; use std::net::IpAddr; use std::str::FromStr; /// Name of the environment variable containing the reachable IP address of the node. If set, the IP address contained in this /// environment variable is used as the reachable IP address of the pageserver or safekeeper node during node registration. /// In a Kubernetes environment, this environment variable should be set by Kubernetes to the Pod IP (specified in the Pod /// template). 
pub const HADRON_NODE_IP_ADDRESS: &str = "HADRON_NODE_IP_ADDRESS"; /// Read the reachable IP address of this page server from env var HADRON_NODE_IP_ADDRESS. /// In Kubernetes this environment variable is set to the Pod IP (specified in the Pod template). pub fn read_node_ip_addr_from_env() -> Result, Box> { match var(HADRON_NODE_IP_ADDRESS) { Ok(v) => { if let Ok(addr) = IpAddr::from_str(&v) { Ok(Some(addr)) } else { Err(format!("Invalid IP address string: {v}. Cannot be parsed as either an IPv4 or an IPv6 address.").into()) } } Err(VarError::NotPresent) => Ok(None), Err(e) => Err(e.into()), } } #[cfg(test)] mod tests { use super::*; use std::env; use std::net::{Ipv4Addr, Ipv6Addr}; #[test] fn test_read_node_ip_addr_from_env() { // SAFETY: test code unsafe { // Test with a valid IPv4 address env::set_var(HADRON_NODE_IP_ADDRESS, "192.168.1.1"); let result = read_node_ip_addr_from_env().unwrap(); assert_eq!(result, Some(IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)))); // Test with a valid IPv6 address env::set_var( HADRON_NODE_IP_ADDRESS, "2001:0db8:85a3:0000:0000:8a2e:0370:7334", ); } let result = read_node_ip_addr_from_env().unwrap(); assert_eq!( result, Some(IpAddr::V6( Ipv6Addr::from_str("2001:0db8:85a3:0000:0000:8a2e:0370:7334").unwrap() )) ); // Test with an invalid IP address // SAFETY: test code unsafe { env::set_var(HADRON_NODE_IP_ADDRESS, "invalid_ip"); } let result = read_node_ip_addr_from_env(); assert!(result.is_err()); // Test with no environment variable set // SAFETY: test code unsafe { env::remove_var(HADRON_NODE_IP_ADDRESS); } let result = read_node_ip_addr_from_env().unwrap(); assert_eq!(result, None); } } ================================================ FILE: libs/utils/src/leaky_bucket.rs ================================================ //! This module implements the Generic Cell Rate Algorithm for a simplified //! version of the Leaky Bucket rate limiting system. //! //! # Leaky Bucket //! //! 
If the bucket is full, no new requests are allowed and are throttled/errored. //! If the bucket is partially full/empty, new requests are added to the bucket in //! terms of "tokens". //! //! Over time, tokens are removed from the bucket, naturally allowing new requests at a steady rate. //! //! The bucket size tunes the burst support. The drain rate tunes the steady-rate requests per second. //! //! # [GCRA](https://en.wikipedia.org/wiki/Generic_cell_rate_algorithm) //! //! GCRA is a continuous rate leaky-bucket impl that stores minimal state and requires //! no background jobs to drain tokens, as the design utilises timestamps to drain automatically over time. //! //! We store an "empty_at" timestamp as the only state. As time progresses, we will naturally approach //! the empty state. The full-bucket state is calculated from `empty_at - config.bucket_width`. //! //! Another explaination can be found here: use std::sync::Mutex; use std::sync::atomic::{AtomicU64, Ordering}; use std::time::Duration; use tokio::sync::Notify; use tokio::time::Instant; #[derive(Clone, Copy)] pub struct LeakyBucketConfig { /// This is the "time cost" of a single request unit. /// Should loosely represent how long it takes to handle a request unit in active resource time. /// Loosely speaking this is the inverse of the steady-rate requests-per-second pub cost: Duration, /// total size of the bucket pub bucket_width: Duration, } impl LeakyBucketConfig { pub fn new(rps: f64, bucket_size: f64) -> Self { let cost = Duration::from_secs_f64(rps.recip()); let bucket_width = cost.mul_f64(bucket_size); Self { cost, bucket_width } } } pub struct LeakyBucketState { /// Bucket is represented by `allow_at..empty_at` where `allow_at = empty_at - config.bucket_width`. /// /// At any given time, `empty_at - now` represents the number of tokens in the bucket, multiplied by the "time_cost". /// Adding `n` tokens to the bucket is done by moving `empty_at` forward by `n * config.time_cost`. 
/// If `now < allow_at`, the bucket is considered filled and cannot accept any more tokens. /// Draining the bucket will happen naturally as `now` moves forward. /// /// Let `n` be some "time cost" for the request, /// If now is after empty_at, the bucket is empty and the empty_at is reset to now, /// If now is within the `bucket window + n`, we are within time budget. /// If now is before the `bucket window + n`, we have run out of budget. /// /// This is inspired by the generic cell rate algorithm (GCRA) and works /// exactly the same as a leaky-bucket. pub empty_at: Instant, } impl LeakyBucketState { pub fn with_initial_tokens(config: &LeakyBucketConfig, initial_tokens: f64) -> Self { LeakyBucketState { empty_at: Instant::now() + config.cost.mul_f64(initial_tokens), } } pub fn bucket_is_empty(&self, now: Instant) -> bool { // if self.end is after now, the bucket is not empty self.empty_at <= now } /// Immediately adds tokens to the bucket, if there is space. /// /// In a scenario where you are waiting for available rate, /// rather than just erroring immediately, `started` corresponds to when this waiting started. /// /// `n` is the number of tokens that will be filled in the bucket. /// /// # Errors /// /// If there is not enough space, no tokens are added. Instead, an error is returned with the time when /// there will be space again. pub fn add_tokens( &mut self, config: &LeakyBucketConfig, started: Instant, n: f64, ) -> Result<(), Instant> { let now = Instant::now(); // invariant: started <= now debug_assert!(started <= now); // If the bucket was empty when we started our search, // we should update the `empty_at` value accordingly. // this prevents us from having negative tokens in the bucket. 
let mut empty_at = self.empty_at; if empty_at < started { empty_at = started; } let n = config.cost.mul_f64(n); let new_empty_at = empty_at + n; let allow_at = new_empty_at.checked_sub(config.bucket_width); // empty_at // allow_at | new_empty_at // / | / // -------o-[---------o-|--]--------- // now1 ^ now2 ^ // // at now1, the bucket would be completely filled if we add n tokens. // at now2, the bucket would be partially filled if we add n tokens. match allow_at { Some(allow_at) if now < allow_at => Err(allow_at), _ => { self.empty_at = new_empty_at; Ok(()) } } } } pub struct RateLimiter { pub config: LeakyBucketConfig, pub sleep_counter: AtomicU64, pub state: Mutex, /// a queue to provide this fair ordering. pub queue: Notify, } struct Requeue<'a>(&'a Notify); impl Drop for Requeue<'_> { fn drop(&mut self) { self.0.notify_one(); } } impl RateLimiter { pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self { RateLimiter { sleep_counter: AtomicU64::new(0), state: Mutex::new(LeakyBucketState::with_initial_tokens( &config, initial_tokens, )), config, queue: { let queue = Notify::new(); queue.notify_one(); queue }, } } pub fn steady_rps(&self) -> f64 { self.config.cost.as_secs_f64().recip() } /// returns true if we did throttle pub async fn acquire(&self, count: usize) -> bool { let start = tokio::time::Instant::now(); let start_count = self.sleep_counter.load(Ordering::Acquire); let mut end_count = start_count; // wait until we are the first in the queue let mut notified = std::pin::pin!(self.queue.notified()); if !notified.as_mut().enable() { notified.await; end_count = self.sleep_counter.load(Ordering::Acquire); } // notify the next waiter in the queue when we are done. 
let _guard = Requeue(&self.queue); loop { let res = self .state .lock() .unwrap() .add_tokens(&self.config, start, count as f64); match res { Ok(()) => return end_count > start_count, Err(ready_at) => { struct Increment<'a>(&'a AtomicU64); impl Drop for Increment<'_> { fn drop(&mut self) { self.0.fetch_add(1, Ordering::AcqRel); } } // increment the counter after we finish sleeping (or cancel this task). // this ensures that tasks that have already started the acquire will observe // the new sleep count when they are allowed to resume on the notify. let _inc = Increment(&self.sleep_counter); end_count += 1; tokio::time::sleep_until(ready_at).await; } } } } } #[cfg(test)] mod tests { use std::time::Duration; use tokio::time::Instant; use super::{LeakyBucketConfig, LeakyBucketState}; #[tokio::test(start_paused = true)] async fn check() { let config = LeakyBucketConfig { // average 100rps cost: Duration::from_millis(10), // burst up to 100 requests bucket_width: Duration::from_millis(1000), }; let mut state = LeakyBucketState { empty_at: Instant::now(), }; // supports burst { // should work for 100 requests this instant for _ in 0..100 { state.add_tokens(&config, Instant::now(), 1.0).unwrap(); } let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); assert_eq!(ready - Instant::now(), Duration::from_millis(10)); } // doesn't overfill { // after 1s we should have an empty bucket again. tokio::time::advance(Duration::from_secs(1)).await; assert!(state.bucket_is_empty(Instant::now())); // after 1s more, we should not over count the tokens and allow more than 200 requests. 
tokio::time::advance(Duration::from_secs(1)).await; for _ in 0..100 { state.add_tokens(&config, Instant::now(), 1.0).unwrap(); } let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); assert_eq!(ready - Instant::now(), Duration::from_millis(10)); } // supports sustained rate over a long period { tokio::time::advance(Duration::from_secs(1)).await; // should sustain 100rps for _ in 0..2000 { tokio::time::advance(Duration::from_millis(10)).await; state.add_tokens(&config, Instant::now(), 1.0).unwrap(); } } // supports requesting more tokens than can be stored in the bucket // we just wait a little bit longer upfront. { // start the bucket completely empty tokio::time::advance(Duration::from_secs(5)).await; assert!(state.bucket_is_empty(Instant::now())); // requesting 200 tokens of space should take 200*cost = 2s // but we already have 1s available, so we wait 1s from start. let start = Instant::now(); let ready = state.add_tokens(&config, start, 200.0).unwrap_err(); assert_eq!(ready - Instant::now(), Duration::from_secs(1)); tokio::time::advance(Duration::from_millis(500)).await; let ready = state.add_tokens(&config, start, 200.0).unwrap_err(); assert_eq!(ready - Instant::now(), Duration::from_millis(500)); tokio::time::advance(Duration::from_millis(500)).await; state.add_tokens(&config, start, 200.0).unwrap(); // bucket should be completely full now let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); assert_eq!(ready - Instant::now(), Duration::from_millis(10)); } } } ================================================ FILE: libs/utils/src/lib.rs ================================================ //! `utils` is intended to be a place to put code that is shared //! between other crates in this repository. 
#![deny(clippy::undocumented_unsafe_blocks)] pub mod backoff; /// `Lsn` type implements common tasks on Log Sequence Numbers pub mod lsn; /// SeqWait allows waiting for a future sequence number to arrive pub mod seqwait; /// A simple Read-Copy-Update implementation. pub mod simple_rcu; /// append only ordered map implemented with a Vec pub mod vec_map; pub mod bin_ser; // helper functions for creating and fsyncing pub mod crashsafe; // common authentication routines pub mod auth; // utility functions and helper traits for unified unique id generation/serialization etc. pub mod id; // utility functions to obtain reachable IP addresses in PS/SK nodes. pub mod ip_address; pub mod shard; mod hex; pub use hex::Hex; // definition of the Generation type for pageserver attachment APIs pub mod generation; // common log initialisation routine pub mod logging; pub mod lock_file; pub mod pid_file; // Utility for binding TcpListeners with proper socket options. pub mod tcp_listener; // Default signal handling pub mod sentry_init; pub mod signals; pub mod fs_ext; pub mod measured_stream; pub mod serde_percent; pub mod serde_regex; pub mod serde_system_time; pub mod pageserver_feedback; pub mod postgres_client; pub mod tracing_span_assert; pub mod leaky_bucket; pub mod rate_limit; /// Simple once-barrier and a guard which keeps barrier awaiting. pub mod completion; /// Reporting utilities pub mod error; /// async timeout helper pub mod timeout; pub mod span; pub mod sync; pub mod failpoint_support; pub mod yielding_loop; pub mod zstd; pub mod env; pub mod poison; pub mod toml_edit_ext; pub mod circuit_breaker; pub mod try_rcu; pub mod guard_arc_swap; pub mod elapsed_accum; #[cfg(target_os = "linux")] pub mod linux_socket_ioctl; pub mod metrics_collector; // Re-export used in macro. Avoids adding git-version as dep in target crates. 
#[doc(hidden)]
pub use git_version;

/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages.
///
/// The macro expands to `const $const_identifier: &str`. Its value is `git-env:` followed by
/// the `GIT_VERSION` environment variable if that was set at compile time, and otherwise
/// `git:` followed by the output of `git_version::git_version!` (or `unknown` as a fallback).
///
/// We have several cases:
/// * building locally from git repo
/// * building in CI from git repo
/// * building in docker (either in CI or locally)
///
/// One thing to note is that .git is not available in docker (and it is bad to include it there).
/// When building locally, the `git_version` is used to query .git. When building on CI and docker,
/// we don't build the actual PR branch commits, but always a "phantom" would-be merge commit to
/// the target branch -- the actual PR commit we build from is supplied via the GIT_VERSION
/// environment variable.
///
/// We ended up with this compromise between phantom would-be merge commits vs. pull request branch
/// heads due to old logs becoming more reliable (github could gc the phantom merge commit
/// anytime) in #4641.
///
/// To avoid running the build script on every recompilation, we use the rerun-if-env-changed option.
/// So the build script will be run only when the GIT_VERSION envvar has changed.
///
/// Why not use a build script to get the git commit sha directly, without a proc macro from a different crate?
/// Caching and workspaces complicate that. In case `utils` is not
/// recompiled due to caching then the version may become outdated.
/// The git_version crate handles that case by introducing a dependency on .git internals via the include_bytes! macro,
/// so if we changed the index state git_version will pick that up and rerun the macro.
///
/// Note that with git_version the prefix is `git:` and in case of a git version from env it is `git-env:`.
///
/// #############################################################################################
/// TODO this macro is not the way the library is intended to be used, see for details.
/// We used `cachepot` to reduce our current CI build times:
/// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
/// The problem needs further investigation and regular `const` declaration instead of a macro.
#[macro_export]
macro_rules! project_git_version {
    ($const_identifier:ident) => {
        // this should try GIT_VERSION first only then git_version::git_version!
        const $const_identifier: &::core::primitive::str = {
            const __COMMIT_FROM_GIT: &::core::primitive::str = $crate::git_version::git_version! {
                prefix = "",
                fallback = "unknown",
                args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
            };

            // [prefix, value] pair; selected at compile time depending on whether
            // GIT_VERSION was present in the build environment.
            const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("GIT_VERSION") {
                ::core::option::Option::Some(x) => ["git-env:", x],
                ::core::option::Option::None => ["git:", __COMMIT_FROM_GIT],
            };

            $crate::__const_format::concatcp!(__ARG[0], __ARG[1])
        };
    };
}

/// This is a shortcut to embed build tag into binaries and avoid copying the same build script to all packages.
///
/// The macro expands to `const $const_identifier: &str`. Its value is `build_tag-env:` followed by
/// the `BUILD_TAG` environment variable if that was set at compile time, and the bare
/// prefix `build_tag:` otherwise.
#[macro_export]
macro_rules! project_build_tag {
    ($const_identifier:ident) => {
        const $const_identifier: &::core::primitive::str = {
            // [prefix, value] pair; the value is empty when BUILD_TAG is not set.
            const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("BUILD_TAG") {
                ::core::option::Option::Some(x) => ["build_tag-env:", x],
                ::core::option::Option::None => ["build_tag:", ""],
            };

            $crate::__const_format::concatcp!(__ARG[0], __ARG[1])
        };
    };
}

/// Re-export for `project_git_version` macro
#[doc(hidden)]
pub use const_format as __const_format;

/// Same as `assert!`, but evaluated during compilation and gets optimized out in runtime.
///
/// The arguments are forwarded verbatim to `assert!` inside an anonymous `const _: ()` item,
/// so a failing condition is reported as a compilation error.
#[macro_export]
macro_rules! const_assert {
    ($($args:tt)*) => {
        const _: () = assert!($($args)*);
    };
}

================================================
FILE: libs/utils/src/linux_socket_ioctl.rs
================================================
//!
Linux-specific socket ioctls.
//!
//!

use std::io;
use std::mem::MaybeUninit;
use std::os::fd::RawFd;
use std::os::raw::c_int;

use nix::libc::{FIONREAD, TIOCOUTQ};

/// Issues `ioctl(socket_fd, cmd, &mut out)` for a command that writes a single `c_int`
/// and returns that value.
///
/// If the ioctl returns a non-zero status, the error is taken from
/// [`io::Error::last_os_error`].
///
/// # Safety
///
/// Caller must ensure that `socket_fd` is a valid file descriptor and that `cmd` is an
/// ioctl which stores exactly one `c_int` through its third argument.
unsafe fn do_ioctl(socket_fd: RawFd, cmd: nix::libc::Ioctl) -> io::Result<c_int> {
    let mut inq: MaybeUninit<c_int> = MaybeUninit::uninit();
    // SAFETY: encapsulating fn is unsafe, we require `socket_fd` to be a valid file descriptor
    unsafe {
        let err = nix::libc::ioctl(socket_fd, cmd, inq.as_mut_ptr());
        if err == 0 {
            // Only read the output after the ioctl reported success.
            Ok(inq.assume_init())
        } else {
            Err(io::Error::last_os_error())
        }
    }
}

/// Runs the `FIONREAD` ioctl on `socket_fd` and returns the value it reports.
///
/// # Safety
///
/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor.
pub unsafe fn inq(socket_fd: RawFd) -> io::Result<c_int> {
    // SAFETY: encapsulating fn is unsafe
    unsafe { do_ioctl(socket_fd, FIONREAD) }
}

/// Runs the `TIOCOUTQ` ioctl on `socket_fd` and returns the value it reports.
///
/// # Safety
///
/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor.
pub unsafe fn outq(socket_fd: RawFd) -> io::Result<c_int> {
    // SAFETY: encapsulating fn is unsafe
    unsafe { do_ioctl(socket_fd, TIOCOUTQ) }
}

================================================
FILE: libs/utils/src/lock_file.rs
================================================
//! A module to create and read lock files.
//!
//! File locking is done using [`nix::fcntl::Flock`] exclusive locks.
//! The only consumer of this module is currently
//! [`pid_file`](crate::pid_file). See the module-level comment
//! there for potential pitfalls with lock files that are used
//! to store PIDs (pidfiles).

use std::fs;
use std::io::{Read, Write};
use std::ops::Deref;

use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use nix::errno::Errno::EAGAIN;
use nix::fcntl::{Flock, FlockArg};

use crate::crashsafe;

/// A handle to an open and flocked, but not-yet-written lock file.
/// Returned by [`create_exclusive`].
#[must_use]
pub struct UnwrittenLockFile {
    path: Utf8PathBuf,
    file: Flock<fs::File>,
}

/// Returned by [`UnwrittenLockFile::write_content`].
#[must_use] pub struct LockFileGuard(Flock); impl Deref for LockFileGuard { type Target = fs::File; fn deref(&self) -> &Self::Target { &self.0 } } impl UnwrittenLockFile { /// Replace the content of this lock file with the byte representation of `contents`. pub fn write_content(mut self, contents: String) -> anyhow::Result { self.file .set_len(0) .context("Failed to truncate lockfile")?; self.file .write_all(contents.as_bytes()) .with_context(|| format!("Failed to write '{contents}' contents into lockfile"))?; crashsafe::fsync_file_and_parent(&self.path).context("fsync lockfile")?; Ok(LockFileGuard(self.file)) } } /// Creates and opens a lock file in the path, grabs an exclusive flock on it, and returns /// a handle that allows overwriting the locked file's content. /// /// The exclusive lock is released when dropping the returned handle. /// /// It is not an error if the file already exists. /// It is an error if the file is already locked. pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result { let lock_file = fs::OpenOptions::new() .create(true) // O_CREAT .truncate(true) .write(true) .open(lock_file_path) .context("open lock file")?; let res = Flock::lock(lock_file, FlockArg::LockExclusiveNonblock); match res { Ok(lock_file) => Ok(UnwrittenLockFile { path: lock_file_path.to_owned(), file: lock_file, }), Err((_, EAGAIN)) => anyhow::bail!("file is already locked"), Err((_, e)) => Err(e).context("flock error"), } } /// Returned by [`read_and_hold_lock_file`]. /// Check out the [`pid_file`](crate::pid_file) module for what the variants mean /// and potential caveats if the lock files that are used to store PIDs. pub enum LockFileRead { /// No file exists at the given path. NotExist, /// No other process held the lock file, so we grabbed an flock /// on it and read its contents. /// Release the flock by dropping the [`LockFileGuard`]. NotHeldByAnyProcess(LockFileGuard, String), /// The file exists but another process was holding an flock on it. 
LockedByOtherProcess { not_locked_file: fs::File, content: String, }, } /// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to /// inspect its content. /// /// It is not an `Err(...)` if the file does not exist or is already locked. /// Check the [`LockFileRead`] variants for details. pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result { let res = fs::OpenOptions::new().read(true).open(path); let lock_file = match res { Ok(f) => f, Err(e) => match e.kind() { std::io::ErrorKind::NotFound => return Ok(LockFileRead::NotExist), _ => return Err(e).context("open lock file"), }, }; let res = Flock::lock(lock_file, FlockArg::LockExclusiveNonblock); // We need the content regardless of lock success / failure. // But, read it after flock so that, if it succeeded, the content is consistent. match res { Ok(mut locked_file) => { let mut content = String::new(); locked_file .read_to_string(&mut content) .context("read lock file")?; Ok(LockFileRead::NotHeldByAnyProcess( LockFileGuard(locked_file), content, )) } Err((mut not_locked_file, EAGAIN)) => { let mut content = String::new(); not_locked_file .read_to_string(&mut content) .context("read lock file")?; Ok(LockFileRead::LockedByOtherProcess { not_locked_file, content, }) } Err((_, e)) => Err(e).context("flock error"), } } ================================================ FILE: libs/utils/src/logging.rs ================================================ use std::future::Future; use std::pin::Pin; use std::str::FromStr; use std::time::Duration; use anyhow::Context; use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; use tokio::time::Instant; use tracing::{info, warn}; /// Logs a critical error, similarly to `tracing::error!`. This will: /// /// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace. /// * Trigger a pageable alert (via the metric below). 
/// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error". /// * In debug builds, panic the process. /// /// When including errors in the message, please use {err:?} to include the error cause and original /// backtrace. #[macro_export] macro_rules! critical { ($($arg:tt)*) => {{ if cfg!(debug_assertions) { panic!($($arg)*); } // Increment both metrics $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical(); let backtrace = std::backtrace::Backtrace::capture(); tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*)); }}; } #[macro_export] macro_rules! critical_timeline { ($tenant_shard_id:expr, $timeline_id:expr, $corruption_detected:expr, $($arg:tt)*) => {{ if cfg!(debug_assertions) { panic!($($arg)*); } // Increment both metrics $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical(); $crate::logging::HADRON_CRITICAL_STORAGE_EVENT_COUNT_METRIC.inc(&$tenant_shard_id.to_string(), &$timeline_id.to_string()); if let Some(c) = $corruption_detected.as_ref() { c.store(true, std::sync::atomic::Ordering::Relaxed); } let backtrace = std::backtrace::Backtrace::capture(); tracing::error!("CRITICAL: [tenant_shard_id: {}, timeline_id: {}] {}\n{backtrace}", $tenant_shard_id, $timeline_id, format!($($arg)*)); }}; } #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)] #[strum(serialize_all = "snake_case")] pub enum LogFormat { Plain, Json, Test, } impl LogFormat { pub fn from_config(s: &str) -> anyhow::Result { use strum::VariantNames; LogFormat::from_str(s).with_context(|| { format!( "Unrecognized log format. Please specify one of: {:?}", LogFormat::VARIANTS ) }) } } pub struct TracingEventCountMetric { /// CRITICAL is not a `tracing` log level. Instead, we increment it in the `critical!` macro, /// and also emit it as a regular error. These are thus double-counted, but that seems fine. 
critical: IntCounter,
error: IntCounter,
warn: IntCounter,
info: IntCounter,
debug: IntCounter,
trace: IntCounter,
}

// Begin Hadron: Add a HadronCriticalStorageEventCountMetric metric that is sliced by
// tenant_shard_id and timeline_id
/// Counter of critical storage events, labelled by `tenant_shard_id` and `timeline_id`.
pub struct HadronCriticalStorageEventCountMetric {
    critical: IntCounterVec,
}

/// Lazily registered instance of [`HadronCriticalStorageEventCountMetric`].
///
/// Panics on first access if the metric cannot be registered.
pub static HADRON_CRITICAL_STORAGE_EVENT_COUNT_METRIC: Lazy<HadronCriticalStorageEventCountMetric> =
    Lazy::new(|| {
        let vec = metrics::register_int_counter_vec!(
            "hadron_critical_storage_event_count",
            "Number of critical storage events, by tenant_id and timeline_id",
            &["tenant_shard_id", "timeline_id"]
        )
        .expect("failed to define metric");
        HadronCriticalStorageEventCountMetric::new(vec)
    });

impl HadronCriticalStorageEventCountMetric {
    fn new(vec: IntCounterVec) -> Self {
        Self { critical: vec }
    }

    // Allow public access from `critical!` macro.
    /// Increment the counter for the given `tenant_shard_id` / `timeline_id` label pair.
    pub fn inc(&self, tenant_shard_id: &str, timeline_id: &str) {
        self.critical
            .with_label_values(&[tenant_shard_id, timeline_id])
            .inc();
    }
}
// End Hadron

/// Lazily registered per-level tracing event counters.
///
/// Panics on first access if the metric cannot be registered.
pub static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
    let vec = metrics::register_int_counter_vec!(
        "libmetrics_tracing_event_count",
        "Number of tracing events, by level",
        &["level"]
    )
    .expect("failed to define metric");
    TracingEventCountMetric::new(vec)
});

impl TracingEventCountMetric {
    /// Resolve one counter per level label up front, so that incrementing
    /// never has to do a label lookup.
    fn new(vec: IntCounterVec) -> Self {
        Self {
            critical: vec.with_label_values(&["critical"]),
            error: vec.with_label_values(&["error"]),
            warn: vec.with_label_values(&["warn"]),
            info: vec.with_label_values(&["info"]),
            debug: vec.with_label_values(&["debug"]),
            trace: vec.with_label_values(&["trace"]),
        }
    }

    // Allow public access from `critical!` macro.
    /// Increment the `critical` counter.
    pub fn inc_critical(&self) {
        self.critical.inc();
    }

    /// Increment the counter matching the given tracing level.
    fn inc_for_level(&self, level: tracing::Level) {
        let counter = match level {
            tracing::Level::ERROR => &self.error,
            tracing::Level::WARN => &self.warn,
            tracing::Level::INFO => &self.info,
            tracing::Level::DEBUG => &self.debug,
            tracing::Level::TRACE => &self.trace,
        };
        counter.inc();
    }
}

/// Tracing layer that bumps the per-level counter for every event it sees.
struct TracingEventCountLayer(&'static TracingEventCountMetric);

impl<S> tracing_subscriber::layer::Layer<S> for TracingEventCountLayer
where
    S: tracing::Subscriber,
{
    fn on_event(
        &self,
        event: &tracing::Event<'_>,
        _ctx: tracing_subscriber::layer::Context<'_, S>,
    ) {
        self.0.inc_for_level(*event.metadata().level());
    }
}

/// Whether to add the `tracing_error` crate's `ErrorLayer`
/// to the global tracing subscriber.
///
pub enum TracingErrorLayerEnablement {
    /// Do not add the `ErrorLayer`.
    Disabled,
    /// Add the `ErrorLayer` with the filter specified by RUST_LOG, defaulting to `info` if `RUST_LOG` is unset.
    EnableWithRustLogFilter,
}

/// Where the logging should output to.
#[derive(Clone, Copy)]
pub enum Output {
    Stdout,
    Stderr,
}

/// Install the global tracing subscriber.
///
/// The subscriber consists of a formatting layer (`log_format`, written to `output`),
/// the event-count metric layer, and optionally the `tracing_error` `ErrorLayer`.
/// Every layer gets its own copy of the `RUST_LOG`-derived filter.
pub fn init(
    log_format: LogFormat,
    tracing_error_layer_enablement: TracingErrorLayerEnablement,
    output: Output,
) -> anyhow::Result<()> {
    // We fall back to printing all spans at info-level or above if
    // the RUST_LOG environment variable is not set (or cannot be parsed).
    // This is a closure rather than a value because each layer needs its own filter instance.
    let rust_log_env_filter = || {
        tracing_subscriber::EnvFilter::try_from_default_env()
            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
    };

    // NB: the order of the with() calls does not matter.
    // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
    use tracing_subscriber::prelude::*;
    let r = tracing_subscriber::registry();
    let r = r.with({
        let log_layer = tracing_subscriber::fmt::layer()
            .with_target(false)
            .with_ansi(false)
            .with_writer(move || -> Box<dyn std::io::Write> {
                match output {
                    Output::Stdout => Box::new(std::io::stdout()),
                    Output::Stderr => Box::new(std::io::stderr()),
                }
            });
        // The three formats produce different layer types; box them to unify.
        let log_layer = match log_format {
            LogFormat::Json => log_layer.json().boxed(),
            LogFormat::Plain => log_layer.boxed(),
            LogFormat::Test => log_layer.with_test_writer().boxed(),
        };
        log_layer.with_filter(rust_log_env_filter())
    });

    let r = r.with(
        TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()),
    );
    match tracing_error_layer_enablement {
        TracingErrorLayerEnablement::EnableWithRustLogFilter => r
            .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
            .init(),
        TracingErrorLayerEnablement::Disabled => r.init(),
    }

    Ok(())
}

/// Disable the default rust panic hook by using `set_hook`.
///
/// For neon binaries, the assumption is that tracing is configured before with [`init`], after
/// that sentry is configured (if needed). sentry will install its own on top of this, always
/// processing the panic before we log it.
///
/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
/// If the assumptions about the initialization order are not held, use
/// [`TracingPanicHookGuard::forget`] but keep in mind, if tracing is stopped, then panics will be
/// lost.
#[must_use]
pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
    std::panic::set_hook(Box::new(tracing_panic_hook));
    TracingPanicHookGuard::new()
}

/// Drop guard which restores the std panic hook on drop.
///
/// Tracing should not be used when it's not configured, but we cannot really latch on to any
/// imaginary lifetime of tracing.
pub struct TracingPanicHookGuard { act: bool, } impl TracingPanicHookGuard { fn new() -> Self { TracingPanicHookGuard { act: true } } /// Make this hook guard not do anything when dropped. pub fn forget(&mut self) { self.act = false; } } impl Drop for TracingPanicHookGuard { fn drop(&mut self) { if self.act { let _ = std::panic::take_hook(); } } } /// Named symbol for our panic hook, which logs the panic. fn tracing_panic_hook(info: &std::panic::PanicHookInfo) { // following rust 1.66.1 std implementation: // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288 let location = info.location(); let msg = match info.payload().downcast_ref::<&'static str>() { Some(s) => *s, None => match info.payload().downcast_ref::() { Some(s) => &s[..], None => "Box", }, }; let thread = std::thread::current(); let thread = thread.name().unwrap_or(""); let backtrace = std::backtrace::Backtrace::capture(); let _entered = if let Some(location) = location { tracing::error_span!("panic", %thread, location = %PrettyLocation(location)) } else { // very unlikely to hit here, but the guarantees of std could change tracing::error_span!("panic", %thread) } .entered(); if backtrace.status() == std::backtrace::BacktraceStatus::Captured { // this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really // get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to // string, maybe even to a TLS one but tracing already does that. tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}"); } else { tracing::error!("{msg}"); } // ensure that we log something on the panic if this hook is left after tracing has been // unconfigured. worst case when teardown is racing the panic is to log the panic twice. 
tracing::dispatcher::get_default(|d| { if let Some(_none) = d.downcast_ref::() { let location = location.map(PrettyLocation); log_panic_to_stderr(thread, msg, location, &backtrace); } }); } #[cold] fn log_panic_to_stderr( thread: &str, msg: &str, location: Option>, backtrace: &std::backtrace::Backtrace, ) { eprintln!( "panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}" ); } struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>); impl std::fmt::Display for PrettyLocation<'_, '_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column()) } } impl std::fmt::Debug for PrettyLocation<'_, '_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { ::fmt(self, f) } } /// When you will store a secret but want to make sure it won't /// be accidentally logged, wrap it in a SecretString, whose Debug /// implementation does not expose the contents. #[derive(Clone, Eq, PartialEq)] pub struct SecretString(String); impl SecretString { pub fn get_contents(&self) -> &str { self.0.as_str() } } impl From for SecretString { fn from(s: String) -> Self { Self(s) } } impl FromStr for SecretString { type Err = std::convert::Infallible; fn from_str(s: &str) -> Result { Ok(Self(s.to_string())) } } impl std::fmt::Debug for SecretString { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "[SECRET]") } } /// Logs a periodic message if a future is slow to complete. /// /// This is performance-sensitive as it's used on the GetPage read path. /// /// TODO: consider upgrading this to a warning, but currently it fires too often. 
#[inline] pub async fn log_slow( name: &str, threshold: Duration, f: Pin<&mut impl Future>, ) -> O { monitor_slow_future( threshold, threshold, // period = threshold f, |MonitorSlowFutureCallback { ready, is_slow, elapsed_total, elapsed_since_last_callback: _, }| { if !is_slow { return; } let elapsed = elapsed_total.as_secs_f64(); if ready { info!("slow {name} completed after {elapsed:.3}s"); } else { info!("slow {name} still running after {elapsed:.3}s"); } }, ) .await } /// Logs a periodic warning if a future is slow to complete. #[inline] pub async fn warn_slow( name: &str, threshold: Duration, f: Pin<&mut impl Future>, ) -> O { monitor_slow_future( threshold, threshold, // period = threshold f, |MonitorSlowFutureCallback { ready, is_slow, elapsed_total, elapsed_since_last_callback: _, }| { if !is_slow { return; } let elapsed = elapsed_total.as_secs_f64(); if ready { warn!("slow {name} completed after {elapsed:.3}s"); } else { warn!("slow {name} still running after {elapsed:.3}s"); } }, ) .await } /// Poll future `fut` to completion, invoking callback `cb` at the given `threshold` and every /// `period` afterwards, and also unconditionally when the future completes. #[inline] pub async fn monitor_slow_future( threshold: Duration, period: Duration, mut fut: Pin<&mut F>, mut cb: impl FnMut(MonitorSlowFutureCallback), ) -> O where F: Future, { let started = Instant::now(); let mut attempt = 1; let mut last_cb = started; loop { // NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common // case where the timeout doesn't fire. let deadline = started + threshold + (attempt - 1) * period; // TODO: still call the callback if the future panics? Copy how we do it for the page_service flush_in_progress counter. 
let res = tokio::time::timeout_at(deadline, &mut fut).await; let now = Instant::now(); let elapsed_total = now - started; cb(MonitorSlowFutureCallback { ready: res.is_ok(), is_slow: elapsed_total >= threshold, elapsed_total, elapsed_since_last_callback: now - last_cb, }); last_cb = now; if let Ok(output) = res { return output; } attempt += 1; } } /// See [`monitor_slow_future`]. pub struct MonitorSlowFutureCallback { /// Whether the future completed. If true, there will be no more callbacks. pub ready: bool, /// Whether the future is taking `>=` the specififed threshold duration to complete. /// Monotonic: if true in one callback invocation, true in all subsequent onces. pub is_slow: bool, /// The time elapsed since the [`monitor_slow_future`] was first polled. pub elapsed_total: Duration, /// The time elapsed since the last callback invocation. /// For the initial callback invocation, the time elapsed since the [`monitor_slow_future`] was first polled. pub elapsed_since_last_callback: Duration, } #[cfg(test)] mod tests { use metrics::IntCounterVec; use metrics::core::Opts; use crate::logging::{TracingEventCountLayer, TracingEventCountMetric}; #[test] fn tracing_event_count_metric() { let counter_vec = IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap(); let metric = Box::leak(Box::new(TracingEventCountMetric::new(counter_vec.clone()))); let layer = TracingEventCountLayer(metric); use tracing_subscriber::prelude::*; tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || { tracing::trace!("foo"); tracing::debug!("foo"); tracing::info!("foo"); tracing::warn!("foo"); tracing::error!("foo"); }); assert_eq!(counter_vec.with_label_values(&["trace"]).get(), 1); assert_eq!(counter_vec.with_label_values(&["debug"]).get(), 1); assert_eq!(counter_vec.with_label_values(&["info"]).get(), 1); assert_eq!(counter_vec.with_label_values(&["warn"]).get(), 1); assert_eq!(counter_vec.with_label_values(&["error"]).get(), 1); } } 
================================================
FILE: libs/utils/src/lsn.rs
================================================
#![warn(missing_docs)]

use std::fmt;
use std::ops::{Add, AddAssign};
use std::str::FromStr;
use std::sync::atomic::{AtomicU64, Ordering};

use serde::de::Visitor;
use serde::{Deserialize, Serialize};

use crate::seqwait::MonotonicCounter;

/// Transaction log block size in bytes
pub const XLOG_BLCKSZ: u32 = 8192;

/// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
#[derive(Clone, Copy, Default, Eq, Ord, PartialEq, PartialOrd, Hash)]
pub struct Lsn(pub u64);

/// Human-readable formats get the `X/X` hex string (the `Display` form);
/// all other formats get the raw `u64`.
impl Serialize for Lsn {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        if serializer.is_human_readable() {
            serializer.collect_str(self)
        } else {
            self.0.serialize(serializer)
        }
    }
}

/// Mirror of the `Serialize` impl: a string is requested from human-readable
/// formats and a `u64` from all others.
impl<'de> Deserialize<'de> for Lsn {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        struct LsnVisitor {
            // Only used to pick the "expecting" message.
            is_human_readable_deserializer: bool,
        }

        impl Visitor<'_> for LsnVisitor {
            type Value = Lsn;

            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
                if self.is_human_readable_deserializer {
                    formatter.write_str(
                        "value in form of hex string({upper_u32_hex}/{lower_u32_hex}) representing u64 integer",
                    )
                } else {
                    formatter.write_str("value in form of integer(u64)")
                }
            }

            fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                Ok(Lsn(v))
            }

            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                Lsn::from_str(v).map_err(E::custom)
            }
        }

        if deserializer.is_human_readable() {
            deserializer.deserialize_str(LsnVisitor {
                is_human_readable_deserializer: true,
            })
        } else {
            deserializer.deserialize_u64(LsnVisitor {
                is_human_readable_deserializer: false,
            })
        }
    }
}

/// Allows (de)serialization of an `Lsn` always as `u64`.
/// /// ### Example /// /// ```rust /// # use serde::{Serialize, Deserialize}; /// use utils::lsn::Lsn; /// /// #[derive(PartialEq, Serialize, Deserialize, Debug)] /// struct Foo { /// #[serde(with = "utils::lsn::serde_as_u64")] /// always_u64: Lsn, /// } /// /// let orig = Foo { always_u64: Lsn(1234) }; /// /// let res = serde_json::to_string(&orig).unwrap(); /// assert_eq!(res, r#"{"always_u64":1234}"#); /// /// let foo = serde_json::from_str::(&res).unwrap(); /// assert_eq!(foo, orig); /// ``` /// pub mod serde_as_u64 { use super::Lsn; /// Serializes the Lsn as u64 disregarding the human readability of the format. /// /// Meant to be used via `#[serde(with = "...")]` or `#[serde(serialize_with = "...")]`. pub fn serialize(lsn: &Lsn, serializer: S) -> Result { use serde::Serialize; lsn.0.serialize(serializer) } /// Deserializes the Lsn as u64 disregarding the human readability of the format. /// /// Meant to be used via `#[serde(with = "...")]` or `#[serde(deserialize_with = "...")]`. pub fn deserialize<'de, D: serde::Deserializer<'de>>(deserializer: D) -> Result { use serde::Deserialize; u64::deserialize(deserializer).map(Lsn) } } /// We tried to parse an LSN from a string, but failed #[derive(Debug, PartialEq, Eq, thiserror::Error)] #[error("LsnParseError")] pub struct LsnParseError; impl Lsn { /// Maximum possible value for an LSN pub const MAX: Lsn = Lsn(u64::MAX); /// Invalid value for InvalidXLogRecPtr, as defined in xlogdefs.h pub const INVALID: Lsn = Lsn(0); /// Subtract a number, returning None on overflow. pub fn checked_sub>(self, other: T) -> Option { let other: u64 = other.into(); self.0.checked_sub(other).map(Lsn) } /// Subtract a number, saturating at numeric bounds instead of overflowing. pub fn saturating_sub>(self, other: T) -> Lsn { Lsn(self.0.saturating_sub(other.into())) } /// Subtract a number, returning the difference as i128 to avoid overflow. 
pub fn widening_sub>(self, other: T) -> i128 { let other: u64 = other.into(); i128::from(self.0) - i128::from(other) } /// Parse an LSN from a string in the form `0000000000000000` pub fn from_hex(s: S) -> Result where S: AsRef, { let s: &str = s.as_ref(); let n = u64::from_str_radix(s, 16).or(Err(LsnParseError))?; Ok(Lsn(n)) } /// Compute the offset into a segment #[inline] pub fn segment_offset(self, seg_sz: usize) -> usize { (self.0 % seg_sz as u64) as usize } /// Compute LSN of the segment start. #[inline] pub fn segment_lsn(self, seg_sz: usize) -> Lsn { Lsn(self.0 - (self.0 % seg_sz as u64)) } /// Compute the segment number #[inline] pub fn segment_number(self, seg_sz: usize) -> u64 { self.0 / seg_sz as u64 } /// Compute the offset into a block #[inline] pub fn block_offset(self) -> u64 { const BLCKSZ: u64 = XLOG_BLCKSZ as u64; self.0 % BLCKSZ } /// Compute the block offset of the first byte of this Lsn within this /// segment #[inline] pub fn page_lsn(self) -> Lsn { Lsn(self.0 - self.block_offset()) } /// Compute the block offset of the first byte of this Lsn within this /// segment #[inline] pub fn page_offset_in_segment(self, seg_sz: usize) -> u64 { (self.0 - self.block_offset()) - self.segment_lsn(seg_sz).0 } /// Compute the bytes remaining in this block /// /// If the LSN is already at the block boundary, it will return `XLOG_BLCKSZ`. #[inline] pub fn remaining_in_block(self) -> u64 { const BLCKSZ: u64 = XLOG_BLCKSZ as u64; BLCKSZ - (self.0 % BLCKSZ) } /// Compute the bytes remaining to fill a chunk of some size /// /// If the LSN is already at the chunk boundary, it will return 0. pub fn calc_padding>(self, sz: T) -> u64 { let sz: u64 = sz.into(); // By using wrapping_sub, we can subtract first and then mod second. // If it's done the other way around, then we would return a full // chunk size if we're already at the chunk boundary. // (Regular subtraction will panic on overflow in debug builds.) 
(sz.wrapping_sub(self.0)) % sz } /// Align LSN on 8-byte boundary (alignment of WAL records). pub fn align(&self) -> Lsn { Lsn((self.0 + 7) & !7) } /// Align LSN on 8-byte boundary (alignment of WAL records). pub fn is_aligned(&self) -> bool { *self == self.align() } /// Return if the LSN is valid /// mimics postgres XLogRecPtrIsInvalid macro pub fn is_valid(self) -> bool { self != Lsn::INVALID } } impl From for Lsn { fn from(n: u64) -> Self { Lsn(n) } } impl From for u64 { fn from(lsn: Lsn) -> u64 { lsn.0 } } impl FromStr for Lsn { type Err = LsnParseError; /// Parse an LSN from a string in the form `00000000/00000000` /// /// If the input string is missing the '/' character, then use `Lsn::from_hex` fn from_str(s: &str) -> Result { let mut splitter = s.trim().split('/'); if let (Some(left), Some(right), None) = (splitter.next(), splitter.next(), splitter.next()) { let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?; let right_num = u32::from_str_radix(right, 16).map_err(|_| LsnParseError)?; Ok(Lsn(((left_num as u64) << 32) | right_num as u64)) } else { Err(LsnParseError) } } } impl fmt::Display for Lsn { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{:X}/{:X}", self.0 >> 32, self.0 & 0xffffffff) } } impl fmt::Debug for Lsn { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{:X}/{:X}", self.0 >> 32, self.0 & 0xffffffff) } } impl Add for Lsn { type Output = Lsn; fn add(self, other: u64) -> Self::Output { // panic if the addition overflows. Lsn(self.0.checked_add(other).unwrap()) } } impl AddAssign for Lsn { fn add_assign(&mut self, other: u64) { // panic if the addition overflows. self.0 = self.0.checked_add(other).unwrap(); } } /// An [`Lsn`] that can be accessed atomically. pub struct AtomicLsn { inner: AtomicU64, } impl AtomicLsn { /// Creates a new atomic `Lsn`. pub fn new(val: u64) -> Self { AtomicLsn { inner: AtomicU64::new(val), } } /// Atomically retrieve the `Lsn` value from memory. 
pub fn load(&self) -> Lsn { Lsn(self.inner.load(Ordering::Acquire)) } /// Atomically store a new `Lsn` value to memory. pub fn store(&self, lsn: Lsn) { self.inner.store(lsn.0, Ordering::Release); } /// Adds to the current value, returning the previous value. /// /// This operation will panic on overflow. pub fn fetch_add(&self, val: u64) -> Lsn { let prev = self.inner.fetch_add(val, Ordering::AcqRel); assert!(prev.checked_add(val).is_some(), "AtomicLsn overflow"); Lsn(prev) } /// Atomically sets the Lsn to the max of old and new value, returning the old value. pub fn fetch_max(&self, lsn: Lsn) -> Lsn { let prev = self.inner.fetch_max(lsn.0, Ordering::AcqRel); Lsn(prev) } } impl From for AtomicLsn { fn from(lsn: Lsn) -> Self { Self::new(lsn.0) } } /// Pair of LSN's pointing to the end of the last valid record and previous one #[derive(Debug, Clone, Copy)] pub struct RecordLsn { /// LSN at the end of the current record pub last: Lsn, /// LSN at the end of the previous record pub prev: Lsn, } /// Expose `self.last` as counter to be able to use RecordLsn in SeqWait impl MonotonicCounter for RecordLsn { fn cnt_advance(&mut self, lsn: Lsn) { assert!(self.last <= lsn); let new_prev = self.last; self.last = lsn; self.prev = new_prev; } fn cnt_value(&self) -> Lsn { self.last } } /// Implements [`rand::distr::uniform::UniformSampler`] so we can sample [`Lsn`]s. /// /// This is used by the `pagebench` pageserver benchmarking tool. 
pub struct LsnSampler(<u64 as rand::distr::uniform::SampleUniform>::Sampler);

impl rand::distr::uniform::SampleUniform for Lsn {
    type Sampler = LsnSampler;
}

// Every method delegates to the `u64` sampler and wraps the result in `Lsn`.
impl rand::distr::uniform::UniformSampler for LsnSampler {
    type X = Lsn;

    fn new<B1, B2>(low: B1, high: B2) -> Result<Self, rand::distr::uniform::Error>
    where
        B1: rand::distr::uniform::SampleBorrow<Self::X> + Sized,
        B2: rand::distr::uniform::SampleBorrow<Self::X> + Sized,
    {
        <u64 as rand::distr::uniform::SampleUniform>::Sampler::new(low.borrow().0, high.borrow().0)
            .map(Self)
    }

    fn new_inclusive<B1, B2>(low: B1, high: B2) -> Result<Self, rand::distr::uniform::Error>
    where
        B1: rand::distr::uniform::SampleBorrow<Self::X> + Sized,
        B2: rand::distr::uniform::SampleBorrow<Self::X> + Sized,
    {
        <u64 as rand::distr::uniform::SampleUniform>::Sampler::new_inclusive(
            low.borrow().0,
            high.borrow().0,
        )
        .map(Self)
    }

    fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
        Lsn(self.0.sample(rng))
    }
}

#[cfg(test)]
mod tests {
    use serde_assert::{Deserializer, Serializer, Token, Tokens};

    use super::*;
    use crate::bin_ser::BeSer;

    #[test]
    fn test_lsn_strings() {
        assert_eq!("12345678/AAAA5555".parse(), Ok(Lsn(0x12345678AAAA5555)));
        assert_eq!("aaaa/bbbb".parse(), Ok(Lsn(0x0000AAAA0000BBBB)));
        assert_eq!("1/A".parse(), Ok(Lsn(0x000000010000000A)));
        assert_eq!("0/0".parse(), Ok(Lsn(0)));
        // Non-hex digit, too-long halves, and signs must all be rejected.
        "ABCDEFG/12345678".parse::<Lsn>().unwrap_err();
        "123456789/AAAA5555".parse::<Lsn>().unwrap_err();
        "12345678/AAAA55550".parse::<Lsn>().unwrap_err();
        "-1/0".parse::<Lsn>().unwrap_err();
        "1/-1".parse::<Lsn>().unwrap_err();

        assert_eq!(format!("{}", Lsn(0x12345678AAAA5555)), "12345678/AAAA5555");
        assert_eq!(format!("{}", Lsn(0x000000010000000A)), "1/A");

        assert_eq!(
            Lsn::from_hex("12345678AAAA5555"),
            Ok(Lsn(0x12345678AAAA5555))
        );
        assert_eq!(Lsn::from_hex("0"), Ok(Lsn(0)));
        assert_eq!(Lsn::from_hex("F12345678AAAA5555"), Err(LsnParseError));

        // Surrounding whitespace is trimmed before parsing.
        let expected_lsn = Lsn(0x3C490F8);
        assert_eq!(" 0/3C490F8".parse(), Ok(expected_lsn));
        assert_eq!("0/3C490F8 ".parse(), Ok(expected_lsn));
        assert_eq!(" 0/3C490F8 ".parse(), Ok(expected_lsn));
    }

    #[test]
    fn test_lsn_math() {
        assert_eq!(Lsn(1234) + 11u64, Lsn(1245));

        assert_eq!(
            {
                let mut lsn = Lsn(1234);
                lsn += 11u64;
                lsn
            },
            Lsn(1245)
        );

        assert_eq!(Lsn(1234).checked_sub(1233u64), Some(Lsn(1)));
assert_eq!(Lsn(1234).checked_sub(1235u64), None);

assert_eq!(Lsn(1235).widening_sub(1234u64), 1);
assert_eq!(Lsn(1234).widening_sub(1235u64), -1);
assert_eq!(Lsn(u64::MAX).widening_sub(0u64), i128::from(u64::MAX));
assert_eq!(Lsn(0).widening_sub(u64::MAX), -i128::from(u64::MAX));

let seg_sz: usize = 16 * 1024 * 1024;
assert_eq!(Lsn(0x1000007).segment_offset(seg_sz), 7);
assert_eq!(Lsn(0x1000007).segment_number(seg_sz), 1u64);

assert_eq!(Lsn(0x4007).block_offset(), 7u64);
assert_eq!(Lsn(0x4000).block_offset(), 0u64);
assert_eq!(Lsn(0x4007).remaining_in_block(), 8185u64);
assert_eq!(Lsn(0x4000).remaining_in_block(), 8192u64);

assert_eq!(Lsn(0xffff01).calc_padding(seg_sz as u64), 255u64);
assert_eq!(Lsn(0x2000000).calc_padding(seg_sz as u64), 0u64);
assert_eq!(Lsn(0xffff01).calc_padding(8u32), 7u64);
assert_eq!(Lsn(0xffff00).calc_padding(8u32), 0u64);
}

#[test]
fn test_atomic_lsn() {
    let lsn = AtomicLsn::new(0);
    assert_eq!(lsn.fetch_add(1234), Lsn(0));
    assert_eq!(lsn.load(), Lsn(1234));
    lsn.store(Lsn(5678));
    assert_eq!(lsn.load(), Lsn(5678));

    assert_eq!(lsn.fetch_max(Lsn(6000)), Lsn(5678));
    assert_eq!(lsn.fetch_max(Lsn(5000)), Lsn(6000));
}

#[test]
fn test_lsn_serde() {
    let original_lsn = Lsn(0x0123456789abcdef);

    // Non-human-readable formats carry the raw u64; human-readable formats carry
    // the "hi/lo" hex string.
    let expected_non_readable_tokens = Tokens(vec![Token::U64(0x0123456789abcdef)]);
    let expected_readable_tokens =
        Tokens(vec![Token::Str(String::from("1234567/89ABCDEF"))]);

    // Testing NON human_readable ser/de
    let serializer = Serializer::builder().is_human_readable(false).build();
    let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
    assert_eq!(non_readable_ser_tokens, expected_non_readable_tokens);

    let mut deserializer = Deserializer::builder()
        .is_human_readable(false)
        .tokens(non_readable_ser_tokens)
        .build();
    let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
    assert_eq!(des_lsn, original_lsn);

    // Testing human_readable ser/de
    let serializer = Serializer::builder().is_human_readable(true).build();
    let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
    assert_eq!(readable_ser_tokens, expected_readable_tokens);

    let mut deserializer = Deserializer::builder()
        .is_human_readable(true)
        .tokens(readable_ser_tokens)
        .build();
    let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
    assert_eq!(des_lsn, original_lsn);

    // Testing mismatching ser/de
    let serializer = Serializer::builder().is_human_readable(false).build();
    let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();

    let mut deserializer = Deserializer::builder()
        .is_human_readable(true)
        .tokens(non_readable_ser_tokens)
        .build();
    Lsn::deserialize(&mut deserializer).unwrap_err();

    let serializer = Serializer::builder().is_human_readable(true).build();
    let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();

    let mut deserializer = Deserializer::builder()
        .is_human_readable(false)
        .tokens(readable_ser_tokens)
        .build();
    Lsn::deserialize(&mut deserializer).unwrap_err();
}

#[test]
fn test_lsn_ensure_roundtrip() {
    let original_lsn = Lsn(0xaaaabbbb);

    let serializer = Serializer::builder().is_human_readable(false).build();
    let ser_tokens = original_lsn.serialize(&serializer).unwrap();

    let mut deserializer = Deserializer::builder()
        .is_human_readable(false)
        .tokens(ser_tokens)
        .build();

    let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
    assert_eq!(des_lsn, original_lsn);
}

#[test]
fn test_lsn_bincode_serde() {
    let lsn = Lsn(0x0123456789abcdef);
    // Big-endian byte order of the u64.
    let expected_bytes = [0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef];

    let ser_bytes = lsn.ser().unwrap();
    assert_eq!(ser_bytes, expected_bytes);

    let des_lsn = Lsn::des(&ser_bytes).unwrap();
    assert_eq!(des_lsn, lsn);
}

#[test]
fn test_lsn_bincode_ensure_roundtrip() {
    let original_lsn = Lsn(0x01_02_03_04_05_06_07_08);
    let expected_bytes = vec![0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08];

    let ser_bytes = original_lsn.ser().unwrap();
    assert_eq!(ser_bytes, expected_bytes);

    let des_lsn = Lsn::des(&ser_bytes).unwrap();
assert_eq!(des_lsn, original_lsn);
}
}

================================================
FILE: libs/utils/src/measured_stream.rs
================================================
use std::io::Read;
use std::pin::Pin;
use std::{io, task};

use pin_project_lite::pin_project;
use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};

pin_project! {
    /// This stream tracks all writes and calls user provided
    /// callback when the underlying stream is flushed.
    ///
    /// Reads are reported immediately: `inc_read_count` is called with the
    /// number of bytes filled by each successful `poll_read`.
    pub struct MeasuredStream<S, R, W> {
        #[pin]
        stream: S,
        // Bytes written since the last successful flush.
        write_count: usize,
        inc_read_count: R,
        inc_write_count: W,
    }
}

impl<S, R, W> MeasuredStream<S, R, W> {
    /// Wrap `stream`, reporting read byte counts to `inc_read_count` and
    /// flushed write byte counts to `inc_write_count`.
    pub fn new(stream: S, inc_read_count: R, inc_write_count: W) -> Self {
        Self {
            stream,
            write_count: 0,
            inc_read_count,
            inc_write_count,
        }
    }
}

impl<S: AsyncRead + Unpin, R: FnMut(usize), W> AsyncRead for MeasuredStream<S, R, W> {
    fn poll_read(
        self: Pin<&mut Self>,
        context: &mut task::Context<'_>,
        buf: &mut ReadBuf<'_>,
    ) -> task::Poll<io::Result<()>> {
        let this = self.project();
        // Remember how much was already filled so only the new bytes are counted.
        let filled = buf.filled().len();
        this.stream.poll_read(context, buf).map_ok(|()| {
            let cnt = buf.filled().len() - filled;
            // Increment the read count.
            (this.inc_read_count)(cnt);
        })
    }
}

impl<S: AsyncWrite + Unpin, R, W: FnMut(usize)> AsyncWrite for MeasuredStream<S, R, W> {
    fn poll_write(
        self: Pin<&mut Self>,
        context: &mut task::Context<'_>,
        buf: &[u8],
    ) -> task::Poll<io::Result<usize>> {
        let this = self.project();
        this.stream.poll_write(context, buf).map_ok(|cnt| {
            // Increment the write count.
            *this.write_count += cnt;
            cnt
        })
    }

    fn poll_flush(
        self: Pin<&mut Self>,
        context: &mut task::Context<'_>,
    ) -> task::Poll<io::Result<()>> {
        let this = self.project();
        this.stream.poll_flush(context).map_ok(|()| {
            // Call the user provided callback and reset the write count.
            (this.inc_write_count)(*this.write_count);
            *this.write_count = 0;
        })
    }

    fn poll_shutdown(
        self: Pin<&mut Self>,
        context: &mut task::Context<'_>,
    ) -> task::Poll<io::Result<()>> {
        self.project().stream.poll_shutdown(context)
    }
}

/// Wrapper for a reader that counts bytes read.
/// /// Similar to MeasuredStream but it's one way and it's sync pub struct MeasuredReader { inner: R, byte_count: usize, } impl MeasuredReader { pub fn new(reader: R) -> Self { Self { inner: reader, byte_count: 0, } } pub fn get_byte_count(&self) -> usize { self.byte_count } } impl Read for MeasuredReader { fn read(&mut self, buf: &mut [u8]) -> std::io::Result { let result = self.inner.read(buf); if let Ok(n_bytes) = result { self.byte_count += n_bytes } result } } ================================================ FILE: libs/utils/src/metrics_collector.rs ================================================ use std::{ sync::{Arc, RwLock}, time::{Duration, Instant}, }; use metrics::{IntGauge, proto::MetricFamily, register_int_gauge}; use once_cell::sync::Lazy; pub static METRICS_STALE_MILLIS: Lazy = Lazy::new(|| { register_int_gauge!( "metrics_metrics_stale_milliseconds", "The current metrics stale time in milliseconds" ) .expect("failed to define a metric") }); #[derive(Debug)] pub struct CollectedMetrics { pub metrics: Vec, pub collected_at: Instant, } impl CollectedMetrics { fn new(metrics: Vec) -> Self { Self { metrics, collected_at: Instant::now(), } } } #[derive(Debug)] pub struct MetricsCollector { last_collected: RwLock>, } impl MetricsCollector { pub fn new() -> Self { Self { last_collected: RwLock::new(Arc::new(CollectedMetrics::new(vec![]))), } } #[tracing::instrument(name = "metrics_collector", skip_all)] pub fn run_once(&self, cache_metrics: bool) -> Arc { let started = Instant::now(); let metrics = metrics::gather(); let collected = Arc::new(CollectedMetrics::new(metrics)); if cache_metrics { let mut guard = self.last_collected.write().unwrap(); *guard = collected.clone(); } tracing::info!( "Collected {} metric families in {} ms", collected.metrics.len(), started.elapsed().as_millis() ); collected } pub fn last_collected(&self) -> Arc { self.last_collected.read().unwrap().clone() } } impl Default for MetricsCollector { fn default() -> Self { Self::new() } 
} // Interval for metrics collection. Currently hard-coded to be the same as the metrics scape interval from the obs agent pub static METRICS_COLLECTION_INTERVAL: Duration = Duration::from_secs(30); pub static METRICS_COLLECTOR: Lazy = Lazy::new(MetricsCollector::default); ================================================ FILE: libs/utils/src/pageserver_feedback.rs ================================================ use std::time::{Duration, SystemTime}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use pq_proto::{PG_EPOCH, read_cstr}; use serde::{Deserialize, Serialize}; use tracing::{trace, warn}; use crate::lsn::Lsn; /// Feedback pageserver sends to safekeeper and safekeeper resends to compute. /// /// Serialized in custom flexible key/value format. In replication protocol, it /// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres /// Standby status update / Hot standby feedback messages. /// /// serde Serialize is used only for human readable dump to json (e.g. in /// safekeepers debug_dump). #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct PageserverFeedback { /// Last known size of the timeline. Used to enforce timeline size limit. pub current_timeline_size: u64, /// LSN last received and ingested by the pageserver. Controls backpressure. pub last_received_lsn: Lsn, /// LSN up to which data is persisted by the pageserver to its local disc. /// Controls backpressure. pub disk_consistent_lsn: Lsn, /// LSN up to which data is persisted by the pageserver on s3; safekeepers /// consider WAL before it can be removed. pub remote_consistent_lsn: Lsn, // Serialize with RFC3339 format. #[serde(with = "serde_systemtime")] pub replytime: SystemTime, /// Used to track feedbacks from different shards. Always zero for unsharded tenants. pub shard_number: u32, /// If true, the pageserver has detected corruption and the safekeeper and postgres /// should stop sending WAL. 
pub corruption_detected: bool, } impl PageserverFeedback { pub fn empty() -> PageserverFeedback { PageserverFeedback { current_timeline_size: 0, last_received_lsn: Lsn::INVALID, remote_consistent_lsn: Lsn::INVALID, disk_consistent_lsn: Lsn::INVALID, replytime: *PG_EPOCH, shard_number: 0, corruption_detected: false, } } // Serialize PageserverFeedback using custom format // to support protocol extensibility. // // Following layout is used: // char - number of key-value pairs that follow. // // key-value pairs: // null-terminated string - key, // uint32 - value length in bytes // value itself // // TODO: change serialized fields names once all computes migrate to rename. pub fn serialize(&self, buf: &mut BytesMut) { let buf_ptr = buf.len(); buf.put_u8(0); // # of keys, will be filled later let mut nkeys = 0; nkeys += 1; buf.put_slice(b"current_timeline_size\0"); buf.put_i32(8); buf.put_u64(self.current_timeline_size); nkeys += 1; buf.put_slice(b"ps_writelsn\0"); buf.put_i32(8); buf.put_u64(self.last_received_lsn.0); nkeys += 1; buf.put_slice(b"ps_flushlsn\0"); buf.put_i32(8); buf.put_u64(self.disk_consistent_lsn.0); nkeys += 1; buf.put_slice(b"ps_applylsn\0"); buf.put_i32(8); buf.put_u64(self.remote_consistent_lsn.0); let timestamp = self .replytime .duration_since(*PG_EPOCH) .expect("failed to serialize pg_replytime earlier than PG_EPOCH") .as_micros() as i64; nkeys += 1; buf.put_slice(b"ps_replytime\0"); buf.put_i32(8); buf.put_i64(timestamp); if self.shard_number > 0 { nkeys += 1; buf.put_slice(b"shard_number\0"); buf.put_i32(4); buf.put_u32(self.shard_number); } if self.corruption_detected { nkeys += 1; buf.put_slice(b"corruption_detected\0"); buf.put_i32(1); buf.put_u8(1); } buf[buf_ptr] = nkeys; } // Deserialize PageserverFeedback message // TODO: change serialized fields names once all computes migrate to rename. 
pub fn parse(mut buf: Bytes) -> PageserverFeedback {
    // Fields absent from the message keep their `empty()` defaults.
    // Unknown keys are skipped with a warning, using their declared length.
    // Panics on malformed input: a truncated buffer, a key without a NUL
    // terminator, or a known key with an unexpected value length.
    let mut rf = PageserverFeedback::empty();
    let nfields = buf.get_u8();
    for _ in 0..nfields {
        let key = read_cstr(&mut buf).unwrap();
        match key.as_ref() {
            b"current_timeline_size" => {
                let len = buf.get_i32();
                assert_eq!(len, 8);
                rf.current_timeline_size = buf.get_u64();
            }
            b"ps_writelsn" => {
                let len = buf.get_i32();
                assert_eq!(len, 8);
                rf.last_received_lsn = Lsn(buf.get_u64());
            }
            b"ps_flushlsn" => {
                let len = buf.get_i32();
                assert_eq!(len, 8);
                rf.disk_consistent_lsn = Lsn(buf.get_u64());
            }
            b"ps_applylsn" => {
                let len = buf.get_i32();
                assert_eq!(len, 8);
                rf.remote_consistent_lsn = Lsn(buf.get_u64());
            }
            b"ps_replytime" => {
                let len = buf.get_i32();
                assert_eq!(len, 8);
                let raw_time = buf.get_i64();
                // `unsigned_abs` rather than `-raw_time as u64`: negating `i64::MIN`
                // overflows (a panic when overflow checks are enabled).
                let offset = Duration::from_micros(raw_time.unsigned_abs());
                if raw_time > 0 {
                    rf.replytime = *PG_EPOCH + offset;
                } else {
                    rf.replytime = *PG_EPOCH - offset;
                }
            }
            b"shard_number" => {
                let len = buf.get_i32();
                assert_eq!(len, 4);
                rf.shard_number = buf.get_u32();
            }
            b"corruption_detected" => {
                let len = buf.get_i32();
                assert_eq!(len, 1);
                rf.corruption_detected = buf.get_u8() != 0;
            }
            _ => {
                let len = buf.get_i32();
                warn!(
                    "PageserverFeedback parse. unknown key {} of len {len}. Skip it.",
                    String::from_utf8_lossy(key.as_ref())
                );
                buf.advance(len as usize);
            }
        }
    }
    trace!("PageserverFeedback parsed is {:?}", rf);
    rf
}
}

/// serde helpers that (de)serialize a `SystemTime` as an RFC3339 string.
mod serde_systemtime {
    use std::time::SystemTime;

    use chrono::{DateTime, Utc};
    use serde::{Deserialize, Deserializer, Serializer};

    pub fn serialize<S>(ts: &SystemTime, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let chrono_dt: DateTime<Utc> = (*ts).into();
        serializer.serialize_str(&chrono_dt.to_rfc3339())
    }

    pub fn deserialize<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
    where
        D: Deserializer<'de>,
    {
        let time: String = Deserialize::deserialize(deserializer)?;
        Ok(DateTime::parse_from_rfc3339(&time)
            .map_err(serde::de::Error::custom)?
            .into())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_replication_feedback_serialization() {
        let mut rf = PageserverFeedback::empty();
        // Fill rf with some values
        rf.current_timeline_size = 12345678;
        // Set rounded time to be able to compare it with deserialized value,
        // because it is rounded up to microseconds during serialization.
        rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
        let mut data = BytesMut::new();
        rf.serialize(&mut data);

        let rf_parsed = PageserverFeedback::parse(data.freeze());
        assert_eq!(rf, rf_parsed);
    }

    // Test that databricks-specific fields added to the PageserverFeedback message are serialized
    // and deserialized correctly, in addition to the existing fields from upstream.
    #[test]
    fn test_replication_feedback_databricks_fields() {
        let mut rf = PageserverFeedback::empty();
        rf.current_timeline_size = 12345678;
        rf.last_received_lsn = Lsn(23456789);
        rf.disk_consistent_lsn = Lsn(34567890);
        rf.remote_consistent_lsn = Lsn(45678901);
        rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
        rf.shard_number = 1;
        rf.corruption_detected = true;

        let mut data = BytesMut::new();
        rf.serialize(&mut data);

        let rf_parsed = PageserverFeedback::parse(data.freeze());
        assert_eq!(rf, rf_parsed);
    }

    #[test]
    fn test_replication_feedback_unknown_key() {
        let mut rf = PageserverFeedback::empty();
        // Fill rf with some values
        rf.current_timeline_size = 12345678;
        // Set rounded time to be able to compare it with deserialized value,
        // because it is rounded up to microseconds during serialization.
rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); rf.serialize(&mut data); // Add an extra field to the buffer and adjust number of keys data[0] += 1; data.put_slice(b"new_field_one\0"); data.put_i32(8); data.put_u64(42); // Parse serialized data and check that new field is not parsed let rf_parsed = PageserverFeedback::parse(data.freeze()); assert_eq!(rf, rf_parsed); } } ================================================ FILE: libs/utils/src/pid_file.rs ================================================ //! Abstraction to create & read pidfiles. //! //! A pidfile is a file in the filesystem that stores a process's PID. //! Its purpose is to implement a singleton behavior where only //! one process of some "kind" is supposed to be running at a given time. //! The "kind" is identified by the pidfile. //! //! During process startup, the process that is supposed to be a singleton //! must [claim][`claim_for_current_process`] the pidfile first. //! If that is unsuccessful, the process must not act as the singleton, i.e., //! it must not access any of the resources that only the singleton may access. //! //! A common need is to signal a running singleton process, e.g., to make //! it shut down and exit. //! For that, we have to [`read`] the pidfile. The result of the `read` operation //! tells us if there is any singleton process, and if so, what PID it has. //! We can then proceed to signal it, although some caveats still apply. //! Read the function-level documentation of [`read`] for that. //! //! ## Never Remove Pidfiles //! //! It would be natural to assume that the process who claimed the pidfile //! should remove it upon exit to avoid leaving a stale pidfile in place. //! However, we already have a reliable way to detect staleness of the pidfile, //! i.e., the `flock` that [claiming][`claim_for_current_process`] puts on it. //! //! And further, removing pidfiles would introduce a **catastrophic race condition** //! 
where two processes are running that are supposed to be singletons.
//! Suppose we were to remove our pidfile during process shutdown.
//! Here is how the race plays out:
//! - Suppose we have a service called `myservice` with pidfile `myservice.pid`.
//! - Process `A` starts to shut down.
//! - Process `B` is just starting up
//!     - It `open("myservice.pid", O_WRONLY|O_CREAT)` the file
//!     - It blocks on `flock`
//! - Process `A` removes the pidfile as the last step of its shutdown procedure
//!     - `unlink("myservice.pid")`
//! - Process `A` exits
//!     - This releases its `flock` and unblocks `B`
//! - Process `B` still has the file descriptor for `myservice.pid` open
//! - Process `B` writes its PID into `myservice.pid`.
//! - But the `myservice.pid` file has been unlinked, so there is no `myservice.pid`
//!   in the directory.
//! - Process `C` starts
//!     - It `open("myservice.pid", O_WRONLY|O_CREAT)` which creates a new file (new inode)
//!     - It `flock`s the file, which, since it's a different file, does not block
//!     - It writes its PID into the file
//!
//! At this point, `B` and `C` are running, which is hazardous.
//! Moral of the story: don't unlink pidfiles, ever.

use std::ops::Deref;

use anyhow::Context;
use camino::Utf8Path;
use nix::unistd::Pid;

use crate::lock_file::{self, LockFileRead};

/// Keeps a claim on a pidfile alive until it is dropped.
/// Returned by [`claim_for_current_process`].
#[must_use]
pub struct PidFileGuard(lock_file::LockFileGuard);

/// Gives read access to the wrapped [`lock_file::LockFileGuard`].
impl Deref for PidFileGuard {
    type Target = lock_file::LockFileGuard;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

/// Try to claim `path` as a pidfile for the current process.
///
/// If another process has already claimed the pidfile, and it is still running,
/// this function returns an error.
/// Otherwise, the function `flock`s the file and updates its contents to the
/// current process's PID.
/// If the update fails, the flock is released and an error returned.
/// On success, the function returns a [`PidFileGuard`] to keep the flock alive. /// /// ### Maintaining A Claim /// /// It is the caller's responsibility to maintain the claim. /// The claim ends as soon as the returned guard object is dropped. /// To maintain the claim for the remaining lifetime of the current process, /// use [`std::mem::forget`] or similar. pub fn claim_for_current_process(path: &Utf8Path) -> anyhow::Result { let unwritten_lock_file = lock_file::create_exclusive(path).context("lock file")?; // if any of the next steps fail, we drop the file descriptor and thereby release the lock let guard = unwritten_lock_file .write_content(Pid::this().to_string()) .context("write pid to lock file")?; Ok(PidFileGuard(guard)) } /// Returned by [`read`]. pub enum PidFileRead { /// No file exists at the given path. NotExist, /// The given pidfile is currently not claimed by any process. /// To determine this, the [`read`] operation acquired /// an exclusive flock on the file. The lock is still held and responsibility /// to release it is returned through the guard object. /// Before releasing it, other [`claim_for_current_process`] or [`read`] calls /// will fail. /// /// ### Caveats /// /// Do not unlink the pidfile from the filesystem. See module-comment for why. NotHeldByAnyProcess(PidFileGuard), /// The given pidfile is still claimed by another process whose PID is given /// as part of this variant. /// /// ### Caveats /// /// 1. The other process might exit at any time, turning the given PID stale. /// 2. There is a small window in which `claim_for_current_process` has already /// locked the file but not yet updates its contents. [`read`] will return /// this variant here, but with the old file contents, i.e., a stale PID. /// /// The kernel is free to recycle PID once it has been `wait(2)`ed upon by /// its creator. Thus, acting upon a stale PID, e.g., by issuing a `kill` /// system call on it, bears the risk of killing an unrelated process. 
/// This is an inherent limitation of using pidfiles. /// The only race-free solution is to have a supervisor-process with a lifetime /// that exceeds that of all of its child-processes (e.g., `runit`, `supervisord`). LockedByOtherProcess(Pid), } /// Try to read the file at the given path as a pidfile that was previously created /// through [`claim_for_current_process`]. /// /// On success, this function returns a [`PidFileRead`]. /// Check its docs for a description of the meaning of its different variants. pub fn read(pidfile: &Utf8Path) -> anyhow::Result { let res = lock_file::read_and_hold_lock_file(pidfile).context("read and hold pid file")?; let ret = match res { LockFileRead::NotExist => PidFileRead::NotExist, LockFileRead::NotHeldByAnyProcess(guard, _) => { PidFileRead::NotHeldByAnyProcess(PidFileGuard(guard)) } LockFileRead::LockedByOtherProcess { not_locked_file: _not_locked_file, content, } => { // XXX the read races with the write in claim_pid_file_for_pid(). // But pids are smaller than a page, so the kernel page cache will lock for us. // The only problem is that we might get the old contents here. // Can only fix that by implementing some scheme that downgrades the // exclusive lock to shared lock in claim_pid_file_for_pid(). PidFileRead::LockedByOtherProcess(parse_pidfile_content(&content)?) } }; Ok(ret) } fn parse_pidfile_content(content: &str) -> anyhow::Result { let pid: i32 = content .parse() .map_err(|_| anyhow::anyhow!("parse pidfile content to PID"))?; if pid < 1 { anyhow::bail!("bad value in pidfile '{pid}'"); } Ok(Pid::from_raw(pid)) } ================================================ FILE: libs/utils/src/poison.rs ================================================ //! Protect a piece of state from reuse after it is left in an inconsistent state. //! //! # Example //! //! ``` //! # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { //! use utils::poison::Poison; //! use std::time::Duration; //! //! 
struct State {
//!     clean: bool,
//! }
//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true }));
//!
//! let mut mutex_guard = state.lock().await;
//! let mut poison_guard = mutex_guard.check_and_arm()?;
//! let state = poison_guard.data_mut();
//! state.clean = false;
//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail.
//! tokio::time::sleep(Duration::from_secs(10)).await;
//! state.clean = true;
//! poison_guard.disarm();
//! # Ok::<(), utils::poison::Error>(())
//! # });
//! ```

use tracing::warn;

/// Wrapper around `data` that tracks whether a previous user left it in an
/// inconsistent state.
pub struct Poison<T> {
    /// Human-readable name of the protected state, used in logs and errors.
    what: &'static str,
    state: State,
    data: T,
}

#[derive(Clone, Copy)]
enum State {
    /// No [`Guard`] is outstanding and the data may be used.
    Clean,
    /// A [`Guard`] is outstanding.
    Armed,
    /// A [`Guard`] was dropped without being disarmed at time `at`.
    Poisoned { at: chrono::DateTime<chrono::Utc> },
}

impl<T> Poison<T> {
    /// We log `what` at `warn!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed.
    pub fn new(what: &'static str, data: T) -> Self {
        Self {
            what,
            state: State::Clean,
            data,
        }
    }

    /// Check for poisoning and return a [`Guard`] that provides access to the wrapped state.
    pub fn check_and_arm(&mut self) -> Result<Guard<T>, Error> {
        match self.state {
            State::Clean => {
                self.state = State::Armed;
                Ok(Guard(self))
            }
            State::Armed => unreachable!("transient state"),
            State::Poisoned { at } => Err(Error::Poisoned {
                what: self.what,
                at,
            }),
        }
    }
}

/// Armed pointer to a [`Poison`].
///
/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
/// Once modifications are done, use [`Self::disarm`].
/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
pub struct Guard<'a, T>(&'a mut Poison<T>);

impl<T> Guard<'_, T> {
    /// Shared access to the wrapped data.
    pub fn data(&self) -> &T {
        &self.0.data
    }

    /// Exclusive access to the wrapped data.
    pub fn data_mut(&mut self) -> &mut T {
        &mut self.0.data
    }

    /// Mark the data as consistent again and consume the guard.
    pub fn disarm(self) {
        match self.0.state {
            State::Clean => unreachable!("we set it to Armed in check_and_arm()"),
            State::Armed => {
                // The `Drop` impl runs right after this and sees `Clean`.
                self.0.state = State::Clean;
            }
            State::Poisoned { at } => {
                unreachable!("we fail check_and_arm() if it's in that state: {at}")
            }
        }
    }
}

impl<T> Drop for Guard<'_, T> {
    fn drop(&mut self) {
        match self.0.state {
            State::Clean => {
                // set by disarm()
            }
            State::Armed => {
                // still armed => poison it
                let at = chrono::Utc::now();
                self.0.state = State::Poisoned { at };
                warn!(at=?at, "poisoning {}", self.0.what);
            }
            State::Poisoned { at } => {
                unreachable!("we fail check_and_arm() if it's in that state: {at}")
            }
        }
    }
}

/// Error returned by [`Poison::check_and_arm`].
#[derive(thiserror::Error, Debug)]
pub enum Error {
    #[error("poisoned at {at}: {what}")]
    Poisoned {
        what: &'static str,
        at: chrono::DateTime<chrono::Utc>,
    },
}

================================================
FILE: libs/utils/src/postgres_client.rs
================================================
//! Postgres client connection code common to other crates (safekeeper and
//! pageserver) which depends on tenant/timeline ids and thus not fitting into
//! postgres_connection crate.
use anyhow::Context; use postgres_connection::{PgConnectionConfig, parse_host_port}; use crate::id::TenantTimelineId; #[derive(Copy, Clone, PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)] #[serde(rename_all = "kebab-case")] pub enum InterpretedFormat { Bincode, Protobuf, } #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(rename_all = "kebab-case")] pub enum Compression { Zstd { level: i8 }, } #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(tag = "type", content = "args")] #[serde(rename_all = "kebab-case")] pub enum PostgresClientProtocol { /// Usual Postgres replication protocol Vanilla, /// Custom shard-aware protocol that replicates interpreted records. /// Used to send wal from safekeeper to pageserver. Interpreted { format: InterpretedFormat, compression: Option, }, } pub struct ConnectionConfigArgs<'a> { pub protocol: PostgresClientProtocol, pub ttid: TenantTimelineId, pub shard_number: Option, pub shard_count: Option, pub shard_stripe_size: Option, pub listen_pg_addr_str: &'a str, pub auth_token: Option<&'a str>, pub availability_zone: Option<&'a str>, } impl<'a> ConnectionConfigArgs<'a> { fn options(&'a self) -> Vec { let mut options = vec![ "-c".to_owned(), format!("timeline_id={}", self.ttid.timeline_id), format!("tenant_id={}", self.ttid.tenant_id), format!( "protocol={}", serde_json::to_string(&self.protocol).unwrap() ), ]; if self.shard_number.is_some() { assert!(self.shard_count.is_some()); assert!(self.shard_stripe_size.is_some()); options.push(format!("shard_count={}", self.shard_count.unwrap())); options.push(format!("shard_number={}", self.shard_number.unwrap())); options.push(format!( "shard_stripe_size={}", self.shard_stripe_size.unwrap() )); } options } } /// Create client config for fetching WAL from safekeeper on particular timeline. /// listen_pg_addr_str is in form host:\[port\]. 
pub fn wal_stream_connection_config( args: ConnectionConfigArgs, ) -> anyhow::Result { let (host, port) = parse_host_port(args.listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?; let port = port.unwrap_or(5432); let mut connstr = PgConnectionConfig::new_host_port(host, port) .extend_options(args.options()) .set_password(args.auth_token.map(|s| s.to_owned())); if let Some(availability_zone) = args.availability_zone { connstr = connstr.extend_options([format!("availability_zone={availability_zone}")]); } Ok(connstr) } ================================================ FILE: libs/utils/src/rate_limit.rs ================================================ //! A helper to rate limit operations. use std::time::{Duration, Instant}; pub struct RateLimit { last: Option, interval: Duration, dropped: u64, } pub struct RateLimitStats(u64); impl std::fmt::Display for RateLimitStats { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "{} dropped calls", self.0) } } impl RateLimit { pub const fn new(interval: Duration) -> Self { Self { last: None, interval, dropped: 0, } } /// Call `f` if the rate limit allows. /// Don't call it otherwise. 
pub fn call(&mut self, f: F) { self.call2(|_| f()) } pub fn call2(&mut self, f: F) { let now = Instant::now(); match self.last { Some(last) if now - last <= self.interval => { // ratelimit self.dropped += 1; } _ => { self.last = Some(now); f(RateLimitStats(self.dropped)); self.dropped = 0; } } } } #[cfg(test)] mod tests { use std::sync::atomic::AtomicUsize; #[test] fn basics() { use std::sync::atomic::Ordering::Relaxed; use std::time::Duration; use super::RateLimit; let called = AtomicUsize::new(0); let mut f = RateLimit::new(Duration::from_millis(100)); let cl = || { called.fetch_add(1, Relaxed); }; f.call(cl); assert_eq!(called.load(Relaxed), 1); f.call(cl); assert_eq!(called.load(Relaxed), 1); f.call(cl); assert_eq!(called.load(Relaxed), 1); std::thread::sleep(Duration::from_millis(100)); f.call(cl); assert_eq!(called.load(Relaxed), 2); f.call(cl); assert_eq!(called.load(Relaxed), 2); std::thread::sleep(Duration::from_millis(100)); f.call(cl); assert_eq!(called.load(Relaxed), 3); } } ================================================ FILE: libs/utils/src/sentry_init.rs ================================================ use std::borrow::Cow; use std::env; use sentry::ClientInitGuard; pub use sentry::release_name; use tracing::{error, info}; #[must_use] pub fn init_sentry( release_name: Option>, extra_options: &[(&str, &str)], ) -> Option { let Ok(dsn) = env::var("SENTRY_DSN") else { info!("not initializing Sentry, no SENTRY_DSN given"); return None; }; let environment = env::var("SENTRY_ENVIRONMENT").unwrap_or_else(|_| "development".into()); let guard = sentry::init(( dsn, sentry::ClientOptions { release: release_name.clone(), environment: Some(environment.clone().into()), ..Default::default() }, )); sentry::configure_scope(|scope| { for &(key, value) in extra_options { scope.set_extra(key, value.into()); } }); if let Some(dsn) = guard.dsn() { info!( "initialized Sentry for project {}, environment {}, release {} (using API {})", dsn.project_id(), environment, 
release_name.unwrap_or(Cow::Borrowed("None")), dsn.envelope_api_url(), ); } else { // This should panic during sentry::init(), but we may as well cover it. error!("failed to initialize Sentry, invalid DSN"); } Some(guard) } ================================================ FILE: libs/utils/src/seqwait.rs ================================================ #![warn(missing_docs)] use std::cmp::{Eq, Ordering}; use std::collections::BinaryHeap; use std::mem; use std::sync::Mutex; use std::time::Duration; use tokio::sync::watch::{self, channel}; use tokio::time::timeout; /// An error happened while waiting for a number #[derive(Debug, PartialEq, Eq, thiserror::Error)] pub enum SeqWaitError { /// The wait timeout was reached #[error("seqwait timeout was reached")] Timeout, /// [`SeqWait::shutdown`] was called #[error("SeqWait::shutdown was called")] Shutdown, } /// Monotonically increasing value /// /// It is handy to store some other fields under the same mutex in `SeqWait` /// (e.g. store prev_record_lsn). So we allow SeqWait to be parametrized with /// any type that can expose counter. `V` is the type of exposed counter. pub trait MonotonicCounter { /// Bump counter value and check that it goes forward /// N.B.: new_val is an actual new value, not a difference. fn cnt_advance(&mut self, new_val: V); /// Get counter value fn cnt_value(&self) -> V; } /// Heap of waiters, lowest numbers pop first. struct Waiters where V: Ord, { heap: BinaryHeap>, /// Number of the first waiter in the heap, or None if there are no waiters. status_channel: watch::Sender>, } impl Waiters where V: Ord + Copy, { fn new() -> Self { Waiters { heap: BinaryHeap::new(), status_channel: channel(None).0, } } /// `status_channel` contains the number of the first waiter in the heap. /// This function should be called whenever waiters heap changes. 
fn update_status(&self) { let first_waiter = self.heap.peek().map(|w| w.wake_num); let _ = self.status_channel.send_replace(first_waiter); } /// Add new waiter to the heap, return a channel that will be notified when the number arrives. fn add(&mut self, num: V) -> watch::Receiver<()> { let (tx, rx) = channel(()); self.heap.push(Waiter { wake_num: num, wake_channel: tx, }); self.update_status(); rx } /// Pop all waiters <= num from the heap. Collect channels in a vector, /// so that caller can wake them up. fn pop_leq(&mut self, num: V) -> Vec> { let mut wake_these = Vec::new(); while let Some(n) = self.heap.peek() { if n.wake_num > num { break; } wake_these.push(self.heap.pop().unwrap().wake_channel); } if !wake_these.is_empty() { self.update_status(); } wake_these } /// Used on shutdown to efficiently drop all waiters. fn take_all(&mut self) -> BinaryHeap> { let heap = mem::take(&mut self.heap); self.update_status(); heap } } struct Waiter where T: Ord, { wake_num: T, // wake me when this number arrives ... wake_channel: watch::Sender<()>, // ... by sending a message to this channel } // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here // to get that. impl PartialOrd for Waiter { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl Ord for Waiter { fn cmp(&self, other: &Self) -> Ordering { other.wake_num.cmp(&self.wake_num) } } impl PartialEq for Waiter { fn eq(&self, other: &Self) -> bool { other.wake_num == self.wake_num } } impl Eq for Waiter {} /// Internal components of a `SeqWait` struct SeqWaitInt where S: MonotonicCounter, V: Ord, { waiters: Waiters, current: S, shutdown: bool, } /// A tool for waiting on a sequence number /// /// This provides a way to wait the arrival of a number. /// As soon as the number arrives by another caller calling /// [`advance`], then the waiter will be woken up. 
/// /// This implementation takes a blocking Mutex on both [`wait_for`] /// and [`advance`], meaning there may be unexpected executor blocking /// due to thread scheduling unfairness. There are probably better /// implementations, but we can probably live with this for now. /// /// [`wait_for`]: SeqWait::wait_for /// [`advance`]: SeqWait::advance /// /// `S` means Storage, `V` is type of counter that this storage exposes. /// pub struct SeqWait where S: MonotonicCounter, V: Ord, { internal: Mutex>, } impl SeqWait where S: MonotonicCounter + Copy, V: Ord + Copy, { /// Create a new `SeqWait`, initialized to a particular number pub fn new(starting_num: S) -> Self { let internal = SeqWaitInt { waiters: Waiters::new(), current: starting_num, shutdown: false, }; SeqWait { internal: Mutex::new(internal), } } /// Shut down a `SeqWait`, causing all waiters (present and /// future) to return an error. pub fn shutdown(&self) { let waiters = { // Prevent new waiters; wake all those that exist. // Wake everyone with an error. let mut internal = self.internal.lock().unwrap(); // Block any future waiters from starting internal.shutdown = true; // Take all waiters to drop them later. internal.waiters.take_all() // Drop the lock as we exit this scope. }; // When we drop the waiters list, each Receiver will // be woken with an error. // This drop doesn't need to be explicit; it's done // here to make it easier to read the code and understand // the order of events. drop(waiters); } /// Wait for a number to arrive /// /// This call won't complete until someone has called `advance` /// with a number greater than or equal to the one we're waiting for. /// /// This function is async cancellation-safe. 
pub async fn wait_for(&self, num: V) -> Result<(), SeqWaitError> {
    // `None` means the number has already arrived: nothing to wait for.
    let Some(mut rx) = self.queue_for_wait(num)? else {
        return Ok(());
    };
    // `changed()` fails once the sender is dropped, which `shutdown` does.
    rx.changed().await.map_err(|_| SeqWaitError::Shutdown)
}

/// Wait for a number to arrive
///
/// This call won't complete until someone has called `advance`
/// with a number greater than or equal to the one we're waiting for.
///
/// If that hasn't happened after the specified timeout duration,
/// [`SeqWaitError::Timeout`] will be returned.
///
/// This function is async cancellation-safe.
pub async fn wait_for_timeout(
    &self,
    num: V,
    timeout_duration: Duration,
) -> Result<(), SeqWaitError> {
    let Some(mut rx) = self.queue_for_wait(num)? else {
        return Ok(());
    };
    match timeout(timeout_duration, rx.changed()).await {
        Ok(Ok(())) => Ok(()),
        Ok(Err(_)) => Err(SeqWaitError::Shutdown),
        Err(_) => Err(SeqWaitError::Timeout),
    }
}

/// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
///
/// Returns `Ok(())` if the current value is already at or past `num`, i.e. the
/// call would not wait. Returns `Err(current_value)` if it would wait.
pub fn would_wait_for(&self, num: V) -> Result<(), V> {
    // The lock guard is a temporary and is released at the end of this statement.
    let cnt = self.internal.lock().unwrap().current.cnt_value();
    if cnt >= num { Ok(()) } else { Err(cnt) }
}

/// Register and return a channel that will be notified when a number arrives,
/// or None, if it has already arrived.
///
/// The arrival check comes before the shutdown check, so a number that has
/// already arrived yields `Ok(None)` even after shutdown.
fn queue_for_wait(&self, num: V) -> Result<Option<watch::Receiver<()>>, SeqWaitError> {
    let mut internal = self.internal.lock().unwrap();
    if internal.current.cnt_value() >= num {
        return Ok(None);
    }
    if internal.shutdown {
        return Err(SeqWaitError::Shutdown);
    }

    // Add waiter channel to the queue.
    let rx = internal.waiters.add(num);
    // Drop the lock as we exit this scope.
    Ok(Some(rx))
}

/// Announce a new number has arrived
///
/// All waiters at this value or below will be woken.
///
/// Returns the old number.
pub fn advance(&self, num: V) -> V { let old_value; let wake_these = { let mut internal = self.internal.lock().unwrap(); old_value = internal.current.cnt_value(); if old_value >= num { return old_value; } internal.current.cnt_advance(num); // Pop all waiters <= num from the heap. internal.waiters.pop_leq(num) }; for tx in wake_these { // This can fail if there are no receivers. // We don't care; discard the error. let _ = tx.send(()); } old_value } /// Read the current value, without waiting. pub fn load(&self) -> S { self.internal.lock().unwrap().current } /// Get a Receiver for the current status. /// /// The current status is the number of the first waiter in the queue, /// or None if there are no waiters. /// /// This receiver will be notified whenever the status changes. /// It is useful for receiving notifications when the first waiter /// starts waiting for a number, or when there are no more waiters left. pub fn status_receiver(&self) -> watch::Receiver> { self.internal .lock() .unwrap() .waiters .status_channel .subscribe() } } #[cfg(test)] mod tests { use std::sync::Arc; use super::*; impl MonotonicCounter for i32 { fn cnt_advance(&mut self, val: i32) { assert!(*self <= val); *self = val; } fn cnt_value(&self) -> i32 { *self } } #[tokio::test] async fn seqwait() { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); let seq3 = Arc::clone(&seq); let jh1 = tokio::task::spawn(async move { seq2.wait_for(42).await.expect("wait_for 42"); let old = seq2.advance(100); assert_eq!(old, 99); seq2.wait_for_timeout(999, Duration::from_millis(100)) .await .expect_err("no 999"); }); let jh2 = tokio::task::spawn(async move { seq3.wait_for(42).await.expect("wait_for 42"); seq3.wait_for(0).await.expect("wait_for 0"); }); tokio::time::sleep(Duration::from_millis(200)).await; let old = seq.advance(99); assert_eq!(old, 0); seq.wait_for(100).await.expect("wait_for 100"); // Calling advance with a smaller value is a no-op assert_eq!(seq.advance(98), 100); 
assert_eq!(seq.load(), 100); jh1.await.unwrap(); jh2.await.unwrap(); seq.shutdown(); } #[tokio::test] async fn seqwait_timeout() { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); let jh = tokio::task::spawn(async move { let timeout = Duration::from_millis(1); let res = seq2.wait_for_timeout(42, timeout).await; assert_eq!(res, Err(SeqWaitError::Timeout)); }); tokio::time::sleep(Duration::from_millis(200)).await; // This will attempt to wake, but nothing will happen // because the waiter already dropped its Receiver. let old = seq.advance(99); assert_eq!(old, 0); jh.await.unwrap(); seq.shutdown(); } } ================================================ FILE: libs/utils/src/serde_percent.rs ================================================ //! A serde::Deserialize type for percentages. //! //! See [`Percent`] for details. use serde::{Deserialize, Serialize}; /// If the value is not an integer between 0 and 100, /// deserialization fails with a descriptive error. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] #[serde(transparent)] pub struct Percent(#[serde(deserialize_with = "deserialize_pct_0_to_100")] u8); impl Percent { pub const fn new(pct: u8) -> Option { if pct <= 100 { Some(Percent(pct)) } else { None } } pub fn get(&self) -> u8 { self.0 } } fn deserialize_pct_0_to_100<'de, D>(deserializer: D) -> Result where D: serde::de::Deserializer<'de>, { let v: u8 = serde::de::Deserialize::deserialize(deserializer)?; if v > 100 { return Err(serde::de::Error::custom( "must be an integer between 0 and 100", )); } Ok(v) } #[cfg(test)] mod tests { use super::Percent; #[derive(serde::Deserialize, serde::Serialize, Debug, PartialEq, Eq)] struct Foo { bar: Percent, } #[test] fn basics() { let input = r#"{ "bar": 50 }"#; let foo: Foo = serde_json::from_str(input).unwrap(); assert_eq!(foo.bar.get(), 50); } #[test] fn null_handling() { let input = r#"{ "bar": null }"#; let res: Result = serde_json::from_str(input); 
assert!(res.is_err()); } #[test] fn zero() { let input = r#"{ "bar": 0 }"#; let foo: Foo = serde_json::from_str(input).unwrap(); assert_eq!(foo.bar.get(), 0); } #[test] fn out_of_range_above() { let input = r#"{ "bar": 101 }"#; let res: Result = serde_json::from_str(input); assert!(res.is_err()); } #[test] fn out_of_range_below() { let input = r#"{ "bar": -1 }"#; let res: Result = serde_json::from_str(input); assert!(res.is_err()); } #[test] fn float() { let input = r#"{ "bar": 50.5 }"#; let res: Result = serde_json::from_str(input); assert!(res.is_err()); } #[test] fn string() { let input = r#"{ "bar": "50 %" }"#; let res: Result = serde_json::from_str(input); assert!(res.is_err()); } } ================================================ FILE: libs/utils/src/serde_regex.rs ================================================ //! A `serde::{Deserialize,Serialize}` type for regexes. use std::ops::Deref; #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] #[serde(transparent)] pub struct Regex( #[serde( deserialize_with = "deserialize_regex", serialize_with = "serialize_regex" )] regex::Regex, ); fn deserialize_regex<'de, D>(deserializer: D) -> Result where D: serde::de::Deserializer<'de>, { let s: String = serde::de::Deserialize::deserialize(deserializer)?; let re = regex::Regex::new(&s).map_err(serde::de::Error::custom)?; Ok(re) } fn serialize_regex(re: ®ex::Regex, serializer: S) -> Result where S: serde::ser::Serializer, { serializer.collect_str(re.as_str()) } impl Deref for Regex { type Target = regex::Regex; fn deref(&self) -> ®ex::Regex { &self.0 } } impl PartialEq for Regex { fn eq(&self, other: &Regex) -> bool { // comparing the automatons would be quite complicated self.as_str() == other.as_str() } } impl Eq for Regex {} #[cfg(test)] mod tests { #[test] fn roundtrip() { let input = r#""foo.*bar""#; let re: super::Regex = serde_json::from_str(input).unwrap(); assert!(re.is_match("foo123bar")); assert!(!re.is_match("foo")); let output = 
serde_json::to_string(&re).unwrap(); assert_eq!(output, input); } } ================================================ FILE: libs/utils/src/serde_system_time.rs ================================================ //! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision. #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)] #[serde(transparent)] pub struct SystemTime( #[serde( deserialize_with = "deser_rfc3339_millis", serialize_with = "ser_rfc3339_millis" )] pub std::time::SystemTime, ); fn ser_rfc3339_millis( ts: &std::time::SystemTime, serializer: S, ) -> Result { serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) } fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result where D: serde::de::Deserializer<'de>, { let s: String = serde::de::Deserialize::deserialize(deserializer)?; humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom) } #[cfg(test)] mod tests { use super::*; /// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds. 
fn to_millisecond_precision(time: SystemTime) -> SystemTime { match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) { Ok(duration) => { let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis()); SystemTime( std::time::SystemTime::UNIX_EPOCH + std::time::Duration::from_millis(total_millis), ) } Err(_) => time, } } #[test] fn test_serialize_deserialize() { let input = SystemTime(std::time::SystemTime::now()); let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0)); let serialized = serde_json::to_string(&input).unwrap(); assert_eq!(expected_serialized, serialized); let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap(); assert_eq!(to_millisecond_precision(input), deserialized); } } ================================================ FILE: libs/utils/src/shard.rs ================================================ //! See `pageserver_api::shard` for description on sharding. use std::ops::RangeInclusive; use std::str::FromStr; use hex::FromHex; use serde::{Deserialize, Serialize}; use crate::id::TenantId; #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardNumber(pub u8); #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardCount(pub u8); /// Combination of ShardNumber and ShardCount. /// /// For use within the context of a particular tenant, when we need to know which shard we're /// dealing with, but do not need to know the full ShardIdentity (because we won't be doing /// any page->shard mapping), and do not need to know the fully qualified TenantShardId. #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] pub struct ShardIndex { pub shard_number: ShardNumber, pub shard_count: ShardCount, } /// Stripe size as number of pages. /// /// NB: don't implement Default, so callers don't lazily use it by mistake. See DEFAULT_STRIPE_SIZE. 
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardStripeSize(pub u32); /// Formatting helper, for generating the `shard_id` label in traces. pub struct ShardSlug<'a>(&'a TenantShardId); /// TenantShardId globally identifies a particular shard in a particular tenant. /// /// These are written as `-`, for example: /// # The second shard in a two-shard tenant /// 072f1291a5310026820b2fe4b2968934-0102 /// /// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without /// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables /// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. /// /// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, /// is both forward and backward compatible with TenantId: a legacy TenantId can be /// decoded as a TenantShardId, and when re-encoded it will be parseable /// as a TenantId. #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] pub struct TenantShardId { pub tenant_id: TenantId, pub shard_number: ShardNumber, pub shard_count: ShardCount, } impl ShardCount { pub const MAX: Self = Self(u8::MAX); pub const MIN: Self = Self(0); pub fn unsharded() -> Self { ShardCount(0) } /// The internal value of a ShardCount may be zero, which means "1 shard, but use /// legacy format for TenantShardId that excludes the shard suffix", also known /// as [`TenantShardId::unsharded`]. /// /// This method returns the actual number of shards, i.e. if our internal value is /// zero, we return 1 (unsharded tenants have 1 shard). pub fn count(&self) -> u8 { if self.0 > 0 { self.0 } else { 1 } } /// The literal internal value: this is **not** the number of shards in the /// tenant, as we have a special zero value for legacy unsharded tenants. Use /// [`Self::count`] if you want to know the cardinality of shards. 
pub fn literal(&self) -> u8 { self.0 } /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but /// uses the legacy format for `TenantShardId`. See also the documentation for /// [`Self::count`]. pub fn is_unsharded(&self) -> bool { self.0 == 0 } /// `v` may be zero, or the number of shards in the tenant. `v` is what /// [`Self::literal`] would return. pub const fn new(val: u8) -> Self { Self(val) } } impl ShardNumber { pub const MAX: Self = Self(u8::MAX); } impl TenantShardId { pub fn unsharded(tenant_id: TenantId) -> Self { Self { tenant_id, shard_number: ShardNumber(0), shard_count: ShardCount(0), } } /// The range of all TenantShardId that belong to a particular TenantId. This is useful when /// you have a BTreeMap of TenantShardId, and are querying by TenantId. pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive { RangeInclusive::new( Self { tenant_id, shard_number: ShardNumber(0), shard_count: ShardCount(0), }, Self { tenant_id, shard_number: ShardNumber::MAX, shard_count: ShardCount::MAX, }, ) } pub fn range(&self) -> RangeInclusive { RangeInclusive::new(*self, *self) } pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { ShardSlug(self) } /// Convenience for code that has special behavior on the 0th shard. pub fn is_shard_zero(&self) -> bool { self.shard_number == ShardNumber(0) } /// The "unsharded" value is distinct from simply having a single shard: it represents /// a tenant which is not shard-aware at all, and whose storage paths will not include /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() } /// Convenience for dropping the tenant_id and just getting the ShardIndex: this /// is useful when logging from code that is already in a span that includes tenant ID, to /// keep messages reasonably terse. 
pub fn to_index(&self) -> ShardIndex { ShardIndex { shard_number: self.shard_number, shard_count: self.shard_count, } } /// Calculate the children of this TenantShardId when splitting the overall tenant into /// the given number of shards. pub fn split(&self, new_shard_count: ShardCount) -> Vec { let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1); let mut child_shards = Vec::new(); for shard_number in 0..ShardNumber(new_shard_count.0).0 { // Key mapping is based on a round robin mapping of key hash modulo shard count, // so our child shards are the ones which the same keys would map to. if shard_number % effective_old_shard_count == self.shard_number.0 { child_shards.push(TenantShardId { tenant_id: self.tenant_id, shard_number: ShardNumber(shard_number), shard_count: new_shard_count, }) } } child_shards } } impl std::fmt::Display for ShardNumber { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.0.fmt(f) } } impl std::fmt::Display for ShardCount { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.0.fmt(f) } } impl std::fmt::Display for ShardStripeSize { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.0.fmt(f) } } impl std::fmt::Display for ShardSlug<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, "{:02x}{:02x}", self.0.shard_number.0, self.0.shard_count.0 ) } } impl std::fmt::Display for TenantShardId { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { if self.shard_count != ShardCount(0) { write!(f, "{}-{}", self.tenant_id, self.shard_slug()) } else { // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this // is distinct from the normal single shard case (shard count == 1). 
self.tenant_id.fmt(f) } } } impl std::fmt::Debug for TenantShardId { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { // Debug is the same as Display: the compact hex representation write!(f, "{self}") } } impl std::str::FromStr for TenantShardId { type Err = hex::FromHexError; fn from_str(s: &str) -> Result { // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count if s.len() == 32 { // Legacy case: no shard specified Ok(Self { tenant_id: TenantId::from_str(s)?, shard_number: ShardNumber(0), shard_count: ShardCount(0), }) } else if s.len() == 37 { let bytes = s.as_bytes(); let tenant_id = TenantId::from_hex(&bytes[0..32])?; let mut shard_parts: [u8; 2] = [0u8; 2]; hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?; Ok(Self { tenant_id, shard_number: ShardNumber(shard_parts[0]), shard_count: ShardCount(shard_parts[1]), }) } else { Err(hex::FromHexError::InvalidStringLength) } } } impl From<[u8; 18]> for TenantShardId { fn from(b: [u8; 18]) -> Self { let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap(); Self { tenant_id: TenantId::from(tenant_id_bytes), shard_number: ShardNumber(b[16]), shard_count: ShardCount(b[17]), } } } impl ShardIndex { pub fn new(number: ShardNumber, count: ShardCount) -> Self { Self { shard_number: number, shard_count: count, } } pub fn unsharded() -> Self { Self { shard_number: ShardNumber(0), shard_count: ShardCount(0), } } /// The "unsharded" value is distinct from simply having a single shard: it represents /// a tenant which is not shard-aware at all, and whose storage paths will not include /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) } /// For use in constructing remote storage paths: concatenate this with a TenantId /// to get a fully qualified TenantShardId. 
/// /// Backward compat: this function returns an empty string if Self::is_unsharded, such /// that the legacy pre-sharding remote key format is preserved. pub fn get_suffix(&self) -> String { if self.is_unsharded() { "".to_string() } else { format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0) } } } impl std::fmt::Display for ShardIndex { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0) } } impl std::fmt::Debug for ShardIndex { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { // Debug is the same as Display: the compact hex representation write!(f, "{self}") } } impl std::str::FromStr for ShardIndex { type Err = hex::FromHexError; fn from_str(s: &str) -> Result { // Expect format: 1 byte shard number, 1 byte shard count if s.len() == 4 { let bytes = s.as_bytes(); let mut shard_parts: [u8; 2] = [0u8; 2]; hex::decode_to_slice(bytes, &mut shard_parts)?; Ok(Self { shard_number: ShardNumber(shard_parts[0]), shard_count: ShardCount(shard_parts[1]), }) } else { Err(hex::FromHexError::InvalidStringLength) } } } impl From<[u8; 2]> for ShardIndex { fn from(b: [u8; 2]) -> Self { Self { shard_number: ShardNumber(b[0]), shard_count: ShardCount(b[1]), } } } impl Serialize for TenantShardId { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { if serializer.is_human_readable() { serializer.collect_str(self) } else { // Note: while human encoding of [`TenantShardId`] is backward and forward // compatible, this binary encoding is not. 
let mut packed: [u8; 18] = [0; 18]; packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); packed[16] = self.shard_number.0; packed[17] = self.shard_count.0; packed.serialize(serializer) } } } impl<'de> Deserialize<'de> for TenantShardId { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { struct IdVisitor { is_human_readable_deserializer: bool, } impl<'de> serde::de::Visitor<'de> for IdVisitor { type Value = TenantShardId; fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { if self.is_human_readable_deserializer { formatter.write_str("value in form of hex string") } else { formatter.write_str("value in form of integer array([u8; 18])") } } fn visit_seq(self, seq: A) -> Result where A: serde::de::SeqAccess<'de>, { let s = serde::de::value::SeqAccessDeserializer::new(seq); let id: [u8; 18] = Deserialize::deserialize(s)?; Ok(TenantShardId::from(id)) } fn visit_str(self, v: &str) -> Result where E: serde::de::Error, { TenantShardId::from_str(v).map_err(E::custom) } } if deserializer.is_human_readable() { deserializer.deserialize_str(IdVisitor { is_human_readable_deserializer: true, }) } else { deserializer.deserialize_tuple( 18, IdVisitor { is_human_readable_deserializer: false, }, ) } } } impl Serialize for ShardIndex { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { if serializer.is_human_readable() { serializer.collect_str(self) } else { // Binary encoding is not used in index_part.json, but is included in anticipation of // switching various structures (e.g. inter-process communication, remote metadata) to more // compact binary encodings in future. 
let mut packed: [u8; 2] = [0; 2]; packed[0] = self.shard_number.0; packed[1] = self.shard_count.0; packed.serialize(serializer) } } } impl<'de> Deserialize<'de> for ShardIndex { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { struct IdVisitor { is_human_readable_deserializer: bool, } impl<'de> serde::de::Visitor<'de> for IdVisitor { type Value = ShardIndex; fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { if self.is_human_readable_deserializer { formatter.write_str("value in form of hex string") } else { formatter.write_str("value in form of integer array([u8; 2])") } } fn visit_seq(self, seq: A) -> Result where A: serde::de::SeqAccess<'de>, { let s = serde::de::value::SeqAccessDeserializer::new(seq); let id: [u8; 2] = Deserialize::deserialize(s)?; Ok(ShardIndex::from(id)) } fn visit_str(self, v: &str) -> Result where E: serde::de::Error, { ShardIndex::from_str(v).map_err(E::custom) } } if deserializer.is_human_readable() { deserializer.deserialize_str(IdVisitor { is_human_readable_deserializer: true, }) } else { deserializer.deserialize_tuple( 2, IdVisitor { is_human_readable_deserializer: false, }, ) } } } ================================================ FILE: libs/utils/src/signals.rs ================================================ pub use signal_hook::consts::TERM_SIGNALS; pub use signal_hook::consts::signal::*; use signal_hook::iterator::Signals; use tokio::signal::unix::{SignalKind, signal}; use tracing::info; pub enum Signal { Quit, Interrupt, Terminate, } impl Signal { pub fn name(&self) -> &'static str { match self { Signal::Quit => "SIGQUIT", Signal::Interrupt => "SIGINT", Signal::Terminate => "SIGTERM", } } } pub struct ShutdownSignals; impl ShutdownSignals { pub fn handle(mut handler: impl FnMut(Signal) -> anyhow::Result<()>) -> anyhow::Result<()> { for raw_signal in Signals::new(TERM_SIGNALS)?.into_iter() { let signal = match raw_signal { SIGINT => Signal::Interrupt, SIGTERM => 
Signal::Terminate, SIGQUIT => Signal::Quit, other => panic!("unknown signal: {other}"), }; handler(signal)?; } Ok(()) } } /// Runs in a loop since we want to be responsive to multiple signals /// even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown) /// pub async fn signal_handler(token: tokio_util::sync::CancellationToken) { let mut sigint = signal(SignalKind::interrupt()).unwrap(); let mut sigterm = signal(SignalKind::terminate()).unwrap(); let mut sigquit = signal(SignalKind::quit()).unwrap(); loop { let signal = tokio::select! { _ = sigquit.recv() => { info!("Got signal SIGQUIT. Terminating in immediate shutdown mode."); std::process::exit(111); } _ = sigint.recv() => "SIGINT", _ = sigterm.recv() => "SIGTERM", }; if !token.is_cancelled() { info!("Got signal {signal}. Terminating gracefully in fast shutdown mode."); token.cancel(); } else { info!("Got signal {signal}. Already shutting down."); } } } ================================================ FILE: libs/utils/src/simple_rcu.rs ================================================ //! //! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat //! similar to a lock, but it allows readers to "hold on" to an old value of RCU //! without blocking writers, and allows writing a new value without blocking //! readers. When you update the value, the new value is immediately visible //! to new readers, but the update waits until all existing readers have //! finished, so that on return, no one sees the old value anymore. //! //! This implementation isn't wait-free; it uses an RwLock that is held for a //! short duration when the value is read or updated. //! //! # Examples //! //! Read a value and do things with it while holding the guard: //! //! ``` //! # let rcu = utils::simple_rcu::Rcu::new(1); //! { //! let read = rcu.read(); //! println!("the current value is {}", *read); //! // exiting the scope drops the read-guard, and allows concurrent writers //! // to finish. //! 
} //! ``` //! //! Increment the value by one, and wait for old readers to finish: //! //! ``` //! # async fn dox() { //! # let rcu = utils::simple_rcu::Rcu::new(1); //! let write_guard = rcu.lock_for_write(); //! //! // NB: holding `write_guard` blocks new readers and writers. Keep this section short! //! let new_value = *write_guard + 1; //! //! let waitlist = write_guard.store_and_unlock(new_value); // consumes `write_guard` //! //! // Concurrent reads and writes are now possible again. Wait for all the readers //! // that still observe the old value to finish. //! waitlist.wait().await; //! # } //! ``` //! #![warn(missing_docs)] use std::ops::Deref; use std::sync::{Arc, RwLock, RwLockWriteGuard, Weak}; use tokio::sync::watch; /// Rcu allows multiple readers to read and hold onto a value without blocking /// (for very long). /// /// Storing to the Rcu updates the value, making new readers immediately see /// the new value, but it also waits for all current readers to finish. pub struct Rcu { inner: RwLock>, } struct RcuInner { current_cell: Arc>, old_cells: Vec>>, } /// /// RcuCell holds one value. It can be the latest one, or an old one. /// struct RcuCell { value: V, /// A dummy channel. We never send anything to this channel. The point is /// that when the RcuCell is dropped, any subscribed Receivers will be notified /// that the channel is closed. Updaters can use this to wait out until the /// RcuCell has been dropped, i.e. until the old value is no longer in use. /// /// We never send anything to this, we just need to hold onto it so that the /// Receivers will be notified when it's dropped. 
watch: watch::Sender<()>, } impl RcuCell { fn new(value: V) -> Self { let (watch_sender, _) = watch::channel(()); RcuCell { value, watch: watch_sender, } } } impl Rcu { /// Create a new `Rcu`, initialized to `starting_val` pub fn new(starting_val: V) -> Self { let inner = RcuInner { current_cell: Arc::new(RcuCell::new(starting_val)), old_cells: Vec::new(), }; Self { inner: RwLock::new(inner), } } /// /// Read current value. Any store() calls will block until the returned /// guard object is dropped. /// pub fn read(&self) -> RcuReadGuard { let current_cell = Arc::clone(&self.inner.read().unwrap().current_cell); RcuReadGuard { cell: current_cell } } /// /// Lock the current value for updating. Returns a guard object that can be /// used to read the current value, and to store a new value. /// /// Note: holding the write-guard blocks concurrent readers, so you should /// finish the update and drop the guard quickly! Multiple writers can be /// waiting on the RcuWriteGuard::store step at the same time, however. /// pub fn lock_for_write(&self) -> RcuWriteGuard<'_, V> { let inner = self.inner.write().unwrap(); RcuWriteGuard { inner } } } /// /// Read guard returned by `read` /// pub struct RcuReadGuard { cell: Arc>, } impl Deref for RcuReadGuard { type Target = V; fn deref(&self) -> &V { &self.cell.value } } /// /// Write guard returned by `write` /// /// NB: Holding this guard blocks all concurrent `read` and `write` calls, so it should only be /// held for a short duration! /// /// Calling [`Self::store_and_unlock`] consumes the guard, making new reads and new writes possible /// again. /// pub struct RcuWriteGuard<'a, V> { inner: RwLockWriteGuard<'a, RcuInner>, } impl Deref for RcuWriteGuard<'_, V> { type Target = V; fn deref(&self) -> &V { &self.inner.current_cell.value } } impl RcuWriteGuard<'_, V> { /// /// Store a new value. The new value will be written to the Rcu immediately, /// and will be immediately seen by any `read` calls that start afterwards. 
/// /// Returns a list of readers that can see old values. You can call `wait()` /// on it to wait for them to finish. /// pub fn store_and_unlock(mut self, new_val: V) -> RcuWaitList { let new_cell = Arc::new(RcuCell::new(new_val)); let mut watches = Vec::new(); { let old = std::mem::replace(&mut self.inner.current_cell, new_cell); self.inner.old_cells.push(Arc::downgrade(&old)); // cleanup old cells that no longer have any readers, and collect // the watches for any that do. self.inner.old_cells.retain(|weak| { if let Some(cell) = weak.upgrade() { watches.push(cell.watch.subscribe()); true } else { false } }); } RcuWaitList(watches) } } /// /// List of readers who can still see old values. /// pub struct RcuWaitList(Vec>); impl RcuWaitList { /// /// Wait for old readers to finish. /// pub async fn wait(mut self) { // after all the old_cells are no longer in use, we're done for w in self.0.iter_mut() { // This will block until the Receiver is closed. That happens when // the RcuCell is dropped. #[allow(clippy::single_match)] match w.changed().await { Ok(_) => panic!("changed() unexpectedly succeeded on dummy channel"), Err(_) => { // closed, which means that the cell has been dropped, and // its value is no longer in use } } } } } #[cfg(test)] mod tests { use std::sync::Mutex; use std::time::Duration; use super::*; #[tokio::test] async fn two_writers() { let rcu = Rcu::new(1); let read1 = rcu.read(); assert_eq!(*read1, 1); let write2 = rcu.lock_for_write(); assert_eq!(*write2, 1); let wait2 = write2.store_and_unlock(2); let read2 = rcu.read(); assert_eq!(*read2, 2); let write3 = rcu.lock_for_write(); assert_eq!(*write3, 2); let wait3 = write3.store_and_unlock(3); // new reader can see the new value, and old readers continue to see the old values. let read3 = rcu.read(); assert_eq!(*read3, 3); assert_eq!(*read2, 2); assert_eq!(*read1, 1); let log = Arc::new(Mutex::new(Vec::new())); // Wait for the old readers to finish in separate tasks. 
let log_clone = Arc::clone(&log); let task2 = tokio::spawn(async move { wait2.wait().await; log_clone.lock().unwrap().push("wait2 done"); }); let log_clone = Arc::clone(&log); let task3 = tokio::spawn(async move { wait3.wait().await; log_clone.lock().unwrap().push("wait3 done"); }); // without this sleep the test can pass on accident if the writer is slow tokio::time::sleep(Duration::from_millis(100)).await; // Release first reader. This allows first write to finish, but calling // wait() on the 'task3' would still block. log.lock().unwrap().push("dropping read1"); drop(read1); task2.await.unwrap(); assert!(!task3.is_finished()); tokio::time::sleep(Duration::from_millis(100)).await; // Release second reader, and finish second writer. log.lock().unwrap().push("dropping read2"); drop(read2); task3.await.unwrap(); assert_eq!( log.lock().unwrap().as_slice(), &[ "dropping read1", "wait2 done", "dropping read2", "wait3 done" ] ); } } ================================================ FILE: libs/utils/src/span.rs ================================================ //! Tracing span helpers. /// Records the given fields in the current span, as a single call. The fields must already have /// been declared for the span (typically with empty values). #[macro_export] macro_rules! span_record { ($($tokens:tt)*) => {$crate::span_record_in!(::tracing::Span::current(), $($tokens)*)}; } /// Records the given fields in the given span, as a single call. The fields must already have been /// declared for the span (typically with empty values). #[macro_export] macro_rules! span_record_in { ($span:expr, $($tokens:tt)*) => { if let Some(meta) = $span.metadata() { $span.record_all(&tracing::valueset!(meta.fields(), $($tokens)*)); } }; } ================================================ FILE: libs/utils/src/sync/duplex/mpsc.rs ================================================ use tokio::sync::mpsc; /// A bi-directional channel. 
pub struct Duplex { pub tx: mpsc::Sender, pub rx: mpsc::Receiver, } /// Creates a bi-directional channel. /// /// The channel will buffer up to the provided number of messages. Once the buffer is full, /// attempts to send new messages will wait until a message is received from the channel. /// The provided buffer capacity must be at least 1. pub fn channel(buffer: usize) -> (Duplex, Duplex) { let (tx_a, rx_a) = mpsc::channel::(buffer); let (tx_b, rx_b) = mpsc::channel::(buffer); (Duplex { tx: tx_a, rx: rx_b }, Duplex { tx: tx_b, rx: rx_a }) } impl Duplex { /// Sends a value, waiting until there is capacity. /// /// A successful send occurs when it is determined that the other end of the channel has not hung up already. pub async fn send(&self, x: S) -> Result<(), mpsc::error::SendError> { self.tx.send(x).await } pub fn try_send(&self, x: S) -> Result<(), mpsc::error::TrySendError> { self.tx.try_send(x) } /// Receives the next value for this receiver. /// /// This method returns `None` if the channel has been closed and there are /// no remaining messages in the channel's buffer. pub async fn recv(&mut self) -> Option { self.rx.recv().await } } ================================================ FILE: libs/utils/src/sync/duplex.rs ================================================ pub mod mpsc; ================================================ FILE: libs/utils/src/sync/gate.rs ================================================ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; /// Gates are a concurrency helper, primarily used for implementing safe shutdown. /// /// Users of a resource call `enter()` to acquire a GateGuard, and the owner of /// the resource calls `close()` when they want to ensure that all holders of guards /// have released them, and that no future guards will be issued. 
pub struct Gate { inner: Arc, } impl std::fmt::Debug for Gate { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("Gate") // use this for identification .field("ptr", &Arc::as_ptr(&self.inner)) .field("inner", &self.inner) .finish() } } struct GateInner { sem: tokio::sync::Semaphore, closing: std::sync::atomic::AtomicBool, } impl std::fmt::Debug for GateInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let avail = self.sem.available_permits(); let guards = u32::try_from(avail) .ok() // the sem only supports 32-bit ish amount, but lets play it safe .and_then(|x| Gate::MAX_UNITS.checked_sub(x)); let closing = self.closing.load(Ordering::Relaxed); if let Some(guards) = guards { f.debug_struct("Gate") .field("remaining_guards", &guards) .field("closing", &closing) .finish() } else { f.debug_struct("Gate") .field("avail_permits", &avail) .field("closing", &closing) .finish() } } } /// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will /// not complete. #[derive(Debug)] pub struct GateGuard { // Record the span where the gate was entered, so that we can identify who was blocking Gate::close span_at_enter: tracing::Span, gate: Arc, } impl GateGuard { pub fn try_clone(&self) -> Result { Gate::enter_impl(self.gate.clone()) } } impl Drop for GateGuard { fn drop(&mut self) { if self.gate.closing.load(Ordering::Relaxed) { self.span_at_enter.in_scope( || tracing::info!(gate = ?Arc::as_ptr(&self.gate), "kept the gate from closing"), ); } // when the permit was acquired, it was forgotten to allow us to manage it's lifecycle // manually, so "return" the permit now. 
self.gate.sem.add_permits(1); } } #[derive(Debug, thiserror::Error)] pub enum GateError { #[error("gate is closed")] GateClosed, } impl GateError { pub fn is_cancel(&self) -> bool { match self { GateError::GateClosed => true, } } } impl Default for Gate { fn default() -> Self { Self { inner: Arc::new(GateInner { sem: tokio::sync::Semaphore::new(Self::MAX_UNITS as usize), closing: AtomicBool::new(false), }), } } } impl Gate { const MAX_UNITS: u32 = u32::MAX; /// Acquire a guard that will prevent close() calls from completing. If close() /// was already called, this will return an error which should be interpreted /// as "shutting down". /// /// This function would typically be used from e.g. request handlers. While holding /// the guard returned from this function, it is important to respect a CancellationToken /// to avoid blocking close() indefinitely: typically types that contain a Gate will /// also contain a CancellationToken. pub fn enter(&self) -> Result { Self::enter_impl(self.inner.clone()) } fn enter_impl(gate: Arc) -> Result { let permit = gate.sem.try_acquire().map_err(|_| GateError::GateClosed)?; // we now have the permit, let's disable the normal raii functionality and leave // "returning" the permit to our GateGuard::drop. // // this is done to avoid the need for multiple Arcs (one for semaphore, next for other // fields). permit.forget(); Ok(GateGuard { span_at_enter: tracing::Span::current(), gate, }) } /// Types with a shutdown() method and a gate should call this method at the /// end of shutdown, to ensure that all GateGuard holders are done. /// /// This will wait for all guards to be destroyed. For this to complete promptly, it is /// important that the holders of such guards are respecting a CancellationToken which has /// been cancelled before entering this function. 
pub async fn close(&self) { let started_at = std::time::Instant::now(); let mut do_close = std::pin::pin!(self.do_close()); // with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms let nag_after = Duration::from_millis(100); let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else { return; }; tracing::info!( gate = ?self.as_ptr(), elapsed_ms = started_at.elapsed().as_millis(), "closing is taking longer than expected" ); // close operation is not trying to be cancellation safe as pageserver does not need it. // // note: "closing" is not checked in Gate::enter -- it exists just for observability, // dropping of GateGuard after this will log who they were. self.inner.closing.store(true, Ordering::Relaxed); do_close.await; tracing::info!( gate = ?self.as_ptr(), elapsed_ms = started_at.elapsed().as_millis(), "close completed" ); } /// Used as an identity of a gate. This identity will be resolved to something useful when /// it's actually closed in a hopefully sensible `tracing::Span` which will describe it even /// more. /// /// `GateGuard::drop` also logs this pointer when it has realized it has been keeping the gate /// open for too long. fn as_ptr(&self) -> *const GateInner { Arc::as_ptr(&self.inner) } /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish. This /// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking /// the CancellationToken on such types is analogous to "Did shutdown start?" pub fn close_complete(&self) -> bool { self.inner.sem.is_closed() } #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(gate = ?self.as_ptr()))] async fn do_close(&self) { tracing::debug!("Closing Gate..."); match self.inner.sem.acquire_many(Self::MAX_UNITS).await { Ok(_permit) => { // While holding all units, close the semaphore. All subsequent calls to enter() will fail. 
self.inner.sem.close(); } Err(_closed) => { // Semaphore closed: we are the only function that can do this, so it indicates a double-call. // This is legal. Timeline::shutdown for example is not protected from being called more than // once. tracing::debug!("Double close") } } tracing::debug!("Closed Gate.") } } #[cfg(test)] mod tests { use super::*; #[tokio::test] async fn close_unused() { // Having taken no guards, we should not be blocked in close let gate = Gate::default(); gate.close().await; } #[tokio::test] async fn close_idle() { // If a guard is dropped before entering, close should not be blocked let gate = Gate::default(); let guard = gate.enter().unwrap(); drop(guard); gate.close().await; // Entering a closed guard fails gate.enter().expect_err("enter should fail after close"); } #[tokio::test(start_paused = true)] async fn close_busy_gate() { let gate = Gate::default(); let forever = Duration::from_secs(24 * 7 * 365); let guard = tracing::info_span!("i am holding back the gate").in_scope(|| gate.enter().unwrap()); let mut close_fut = std::pin::pin!(gate.close()); // Close should be waiting for guards to drop tokio::time::timeout(forever, &mut close_fut) .await .unwrap_err(); // Attempting to enter() should fail, even though close isn't done yet. 
gate.enter() .expect_err("enter should fail after entering close"); // this will now log, which we cannot verify except manually drop(guard); // Guard is gone, close should finish close_fut.await; // Attempting to enter() is still forbidden gate.enter().expect_err("enter should fail finishing close"); } #[tokio::test(start_paused = true)] async fn clone_gate_guard() { let gate = Gate::default(); let forever = Duration::from_secs(24 * 7 * 365); let guard1 = gate.enter().expect("gate isn't closed"); let guard2 = guard1.try_clone().expect("gate isn't clsoed"); let mut close_fut = std::pin::pin!(gate.close()); tokio::time::timeout(forever, &mut close_fut) .await .unwrap_err(); // we polled close_fut once, that should prevent all later enters and clones gate.enter().unwrap_err(); guard1.try_clone().unwrap_err(); guard2.try_clone().unwrap_err(); // guard2 keeps gate open even if guard1 is closed drop(guard1); tokio::time::timeout(forever, &mut close_fut) .await .unwrap_err(); drop(guard2); // now that the last guard is dropped, closing should complete close_fut.await; // entering is still forbidden gate.enter().expect_err("enter should stilll fail"); } } ================================================ FILE: libs/utils/src/sync/heavier_once_cell.rs ================================================ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex, MutexGuard}; use tokio::sync::Semaphore; /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of /// `SemaphorePermit`. /// /// Allows use of `take` which does not require holding an outer mutex guard /// for the duration of initialization. /// /// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`]. /// /// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit pub struct OnceCell { inner: Mutex>, initializers: AtomicUsize, } impl Default for OnceCell { /// Create new uninitialized [`OnceCell`]. 
fn default() -> Self { Self { inner: Default::default(), initializers: AtomicUsize::new(0), } } }

/// Semaphore is the current state:
/// - open semaphore means the value is `None`, not yet initialized
/// - closed semaphore means the value has been initialized
#[derive(Debug)]
struct Inner<T> {
    /// One-permit semaphore serializing initializers; closed once `value` has been set.
    init_semaphore: Arc<Semaphore>,
    /// The stored value, present only while the cell is initialized.
    value: Option<T>,
}

impl<T> Default for Inner<T> {
    /// The uninitialized state: an open one-permit semaphore and no value.
    fn default() -> Self {
        Self {
            init_semaphore: Arc::new(Semaphore::new(1)),
            value: None,
        }
    }
}

impl<T> OnceCell<T> {
    /// Creates an already initialized `OnceCell` with the given value.
    pub fn new(value: T) -> Self {
        // a closed semaphore is how the "initialized" state is represented
        let sem = Semaphore::new(1);
        sem.close();

        Self {
            inner: Mutex::new(Inner {
                init_semaphore: Arc::new(sem),
                value: Some(value),
            }),
            initializers: AtomicUsize::new(0),
        }
    }

    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
    /// returning the guard.
    ///
    /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
    ///
    /// Initialization is panic-safe and cancellation-safe.
    ///
    /// # Errors
    ///
    /// Propagates the error returned by `factory`; the cell stays uninitialized in that case.
    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
    where
        F: FnOnce(InitPermit) -> Fut,
        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
    {
        loop {
            // The std mutex guard is scoped so that it is never held across an await point.
            let sem = {
                let guard = self.inner.lock().unwrap();
                if guard.value.is_some() {
                    return Ok(Guard(guard));
                }
                guard.init_semaphore.clone()
            };

            {
                let permit = {
                    // increment the count for the duration of queued
                    let _guard = CountWaitingInitializers::start(self);
                    sem.acquire().await
                };

                let Ok(permit) = permit else {
                    // The semaphore we waited on was closed, i.e. somebody else initialized.
                    let guard = self.inner.lock().unwrap();
                    if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
                        // there was a take_and_deinit in between
                        continue;
                    }
                    assert!(
                        guard.value.is_some(),
                        "semaphore got closed, must be initialized"
                    );
                    return Ok(Guard(guard));
                };

                // Ownership of the single permit moves to the `InitPermit` created below, which
                // hands it back on drop.
                permit.forget();
            }

            let permit = InitPermit(sem);
            let (value, _permit) = factory(permit).await?;

            let guard = self.inner.lock().unwrap();

            return Ok(Self::set0(value, guard));
        }
    }

    /// Like [`Self::get_or_init_detached_measured`], but without out parameter for time spent waiting.
    pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
        self.get_or_init_detached_measured(None).await
    }

    /// Returns a guard to an existing initialized value, or returns an unique initialization
    /// permit which can be used to initialize this `OnceCell` using `OnceCell::set`.
    ///
    /// When `wait_time` is given, the wait on the initialization semaphore is passed through its
    /// `measure` method.
    pub async fn get_or_init_detached_measured(
        &self,
        mut wait_time: Option<&mut crate::elapsed_accum::ElapsedAccum>,
    ) -> Result<Guard<'_, T>, InitPermit> {
        // It looks like OnceCell::get_or_init could be implemented using this method instead of
        // duplication. However, that makes the future be !Send due to possibly holding on to the
        // MutexGuard over an await point.
        loop {
            let sem = {
                let guard = self.inner.lock().unwrap();
                if guard.value.is_some() {
                    return Ok(Guard(guard));
                }
                guard.init_semaphore.clone()
            };

            {
                let permit = {
                    // increment the count for the duration of queued
                    let _guard = CountWaitingInitializers::start(self);
                    let fut = sem.acquire();
                    if let Some(wait_time) = wait_time.as_mut() {
                        wait_time.measure(fut).await
                    } else {
                        fut.await
                    }
                };

                let Ok(permit) = permit else {
                    let guard = self.inner.lock().unwrap();
                    if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
                        // there was a take_and_deinit in between
                        continue;
                    }
                    assert!(
                        guard.value.is_some(),
                        "semaphore got closed, must be initialized"
                    );
                    return Ok(Guard(guard));
                };

                permit.forget();
            }

            let permit = InitPermit(sem);
            return Err(permit);
        }
    }

    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
    /// to complete initializing the inner value.
    ///
    /// # Panics
    ///
    /// If the inner has already been initialized, or if the current initialization semaphore
    /// still has a permit available (which means `_permit` cannot belong to this cell).
    pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
        let guard = self.inner.lock().unwrap();

        // cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
        // give more permits right now.
        if guard.init_semaphore.try_acquire().is_ok() {
            // release the lock before panicking so the mutex is not poisoned
            drop(guard);
            panic!("permit is of wrong origin");
        }

        Self::set0(value, guard)
    }

    /// Stores `value`, closes the initialization semaphore and wraps the lock in a [`Guard`].
    fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
        if guard.value.is_some() {
            drop(guard);
            unreachable!("we won permit, must not be initialized");
        }
        guard.value = Some(value);
        guard.init_semaphore.close();
        Guard(guard)
    }

    /// Returns a guard to an existing initialized value, if any.
    pub fn get(&self) -> Option<Guard<'_, T>> {
        let guard = self.inner.lock().unwrap();
        if guard.value.is_some() {
            Some(Guard(guard))
        } else {
            None
        }
    }

    /// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell is not currently
    /// initialized.
    pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
        // `&mut self` proves exclusive access, so the mutex does not need to be locked
        let inner = self.inner.get_mut().unwrap();

        inner.take_and_deinit()
    }

    /// Return the number of [`Self::get_or_init`] (and detached variant) calls currently queued
    /// on the initialization semaphore.
    pub fn initializer_count(&self) -> usize {
        self.initializers.load(Ordering::Relaxed)
    }
}

/// DropGuard counter for queued tasks waiting to initialize, mainly accessible for the
/// initializing task for example at the end of initialization.
struct CountWaitingInitializers<'a, T>(&'a OnceCell<T>);

impl<'a, T> CountWaitingInitializers<'a, T> {
    /// Increments the waiter count of `target`; the matching decrement happens on drop.
    fn start(target: &'a OnceCell<T>) -> Self {
        target.initializers.fetch_add(1, Ordering::Relaxed);
        CountWaitingInitializers(target)
    }
}

impl<T> Drop for CountWaitingInitializers<'_, T> {
    fn drop(&mut self) {
        self.0.initializers.fetch_sub(1, Ordering::Relaxed);
    }
}

/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
/// initialized value.
#[derive(Debug)] pub struct Guard<'a, T>(MutexGuard<'a, Inner>); impl std::ops::Deref for Guard<'_, T> { type Target = T; fn deref(&self) -> &Self::Target { self.0 .value .as_ref() .expect("guard is not created unless value has been initialized") } } impl std::ops::DerefMut for Guard<'_, T> { fn deref_mut(&mut self) -> &mut Self::Target { self.0 .value .as_mut() .expect("guard is not created unless value has been initialized") } } impl Guard<'_, T> { /// Take the current value, and a new permit for it's deinitialization. /// /// The permit will be on a semaphore part of the new internal value, and any following /// [`OnceCell::get_or_init`] will wait on it to complete. pub fn take_and_deinit(mut self) -> (T, InitPermit) { self.0 .take_and_deinit() .expect("guard is not created unless value has been initialized") } } impl Inner { pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> { let value = self.value.take()?; let mut swapped = Inner::default(); let sem = swapped.init_semaphore.clone(); // acquire and forget right away, moving the control over to InitPermit sem.try_acquire().expect("we just created this").forget(); let permit = InitPermit(sem); std::mem::swap(self, &mut swapped); Some((value, permit)) } } /// Type held by OnceCell (de)initializing task. /// /// On drop, this type will return the permit. 
pub struct InitPermit(Arc); impl std::fmt::Debug for InitPermit { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let ptr = Arc::as_ptr(&self.0) as *const (); f.debug_tuple("InitPermit").field(&ptr).finish() } } impl Drop for InitPermit { fn drop(&mut self) { assert_eq!( self.0.available_permits(), 0, "InitPermit should only exist as the unique permit" ); self.0.add_permits(1); } } #[cfg(test)] mod tests { use std::convert::Infallible; use std::pin::{Pin, pin}; use std::time::Duration; use futures::Future; use super::*; #[tokio::test] async fn many_initializers() { #[derive(Default, Debug)] struct Counters { factory_got_to_run: AtomicUsize, future_polled: AtomicUsize, winners: AtomicUsize, } let initializers = 100; let cell = Arc::new(OnceCell::default()); let counters = Arc::new(Counters::default()); let barrier = Arc::new(tokio::sync::Barrier::new(initializers + 1)); let mut js = tokio::task::JoinSet::new(); for i in 0..initializers { js.spawn({ let cell = cell.clone(); let counters = counters.clone(); let barrier = barrier.clone(); async move { barrier.wait().await; let won = { let g = cell .get_or_init(|permit| { counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed); async { counters.future_polled.fetch_add(1, Ordering::Relaxed); Ok::<_, Infallible>((i, permit)) } }) .await .unwrap(); *g == i }; if won { counters.winners.fetch_add(1, Ordering::Relaxed); } } }); } barrier.wait().await; while let Some(next) = js.join_next().await { next.expect("no panics expected"); } let mut counters = Arc::try_unwrap(counters).unwrap(); assert_eq!(*counters.factory_got_to_run.get_mut(), 1); assert_eq!(*counters.future_polled.get_mut(), 1); assert_eq!(*counters.winners.get_mut(), 1); } #[tokio::test(start_paused = true)] async fn reinit_waits_for_deinit() { // with the tokio::time paused, we will "sleep" for 1s while holding the reinitialization let sleep_for = Duration::from_secs(1); let initial = 42; let reinit = 1; let cell = 
Arc::new(OnceCell::new(initial)); let deinitialization_started = Arc::new(tokio::sync::Barrier::new(2)); let jh = tokio::spawn({ let cell = cell.clone(); let deinitialization_started = deinitialization_started.clone(); async move { let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit(); assert_eq!(answer, initial); deinitialization_started.wait().await; tokio::time::sleep(sleep_for).await; } }); deinitialization_started.wait().await; let started_at = tokio::time::Instant::now(); cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) }) .await .unwrap(); let elapsed = started_at.elapsed(); assert!( elapsed >= sleep_for, "initialization should had taken at least the time time slept with permit" ); jh.await.unwrap(); assert_eq!(*cell.get().unwrap(), reinit); } #[test] fn reinit_with_deinit_permit() { let cell = Arc::new(OnceCell::new(42)); let (mol, permit) = cell.get().unwrap().take_and_deinit(); cell.set(5, permit); assert_eq!(*cell.get().unwrap(), 5); let (five, permit) = cell.get().unwrap().take_and_deinit(); assert_eq!(5, five); cell.set(mol, permit); assert_eq!(*cell.get().unwrap(), 42); } #[tokio::test] async fn initialization_attemptable_until_ok() { let cell = OnceCell::default(); for _ in 0..10 { cell.get_or_init(|_permit| async { Err("whatever error") }) .await .unwrap_err(); } let g = cell .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) }) .await .unwrap(); assert_eq!(*g, "finally success"); } #[tokio::test] async fn initialization_is_cancellation_safe() { let cell = OnceCell::default(); let barrier = tokio::sync::Barrier::new(2); let initializer = cell.get_or_init(|permit| async { barrier.wait().await; futures::future::pending::<()>().await; Ok::<_, Infallible>(("never reached", permit)) }); tokio::select! 
{ _ = initializer => { unreachable!("cannot complete; stuck in pending().await") }, _ = barrier.wait() => {} }; // now initializer is dropped assert!(cell.get().is_none()); let g = cell .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) }) .await .unwrap(); assert_eq!(*g, "now initialized"); } #[tokio::test(start_paused = true)] async fn reproduce_init_take_deinit_race() { init_take_deinit_scenario(|cell, factory| { Box::pin(async { cell.get_or_init(factory).await.unwrap(); }) }) .await; } type BoxedInitFuture = Pin>>>; type BoxedInitFunction = Box BoxedInitFuture>; /// Reproduce an assertion failure. /// /// This has interesting generics to be generic between `get_or_init` and `get_mut_or_init`. /// We currently only have one, but the structure is kept. async fn init_take_deinit_scenario(init_way: F) where F: for<'a> Fn( &'a OnceCell<&'static str>, BoxedInitFunction<&'static str, Infallible>, ) -> Pin + 'a>>, { let cell = OnceCell::default(); // acquire the init_semaphore only permit to drive initializing tasks in order to waiting // on the same semaphore. let permit = cell .inner .lock() .unwrap() .init_semaphore .clone() .try_acquire_owned() .unwrap(); let mut t1 = pin!(init_way( &cell, Box::new(|permit| Box::pin(async move { Ok(("t1", permit)) })), )); let mut t2 = pin!(init_way( &cell, Box::new(|permit| Box::pin(async move { Ok(("t2", permit)) })), )); // drive t2 first to the init_semaphore -- the timeout will be hit once t2 future can // no longer make progress tokio::select! { _ = &mut t2 => unreachable!("it cannot get permit"), _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} } // followed by t1 in the init_semaphore tokio::select! 
{ _ = &mut t1 => unreachable!("it cannot get permit"), _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} } // now let t2 proceed and initialize drop(permit); t2.await; let (s, permit) = { cell.get().unwrap().take_and_deinit() }; assert_eq!("t2", s); // now originally t1 would see the semaphore it has as closed. it cannot yet get a permit from // the new one. tokio::select! { _ = &mut t1 => unreachable!("it cannot get permit"), _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} } // only now we get to initialize it drop(permit); t1.await; assert_eq!("t1", *cell.get().unwrap()); } #[tokio::test(start_paused = true)] async fn detached_init_smoke() { let target = OnceCell::default(); let Err(permit) = target.get_or_init_detached().await else { unreachable!("it is not initialized") }; tokio::time::timeout( std::time::Duration::from_secs(3600 * 24 * 7 * 365), target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }), ) .await .expect_err("should timeout since we are already holding the permit"); target.set(42, permit); let (_answer, permit) = { let guard = target .get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) }) .await .unwrap(); assert_eq!(*guard, 42); guard.take_and_deinit() }; assert!(target.get().is_none()); target.set(11, permit); assert_eq!(*target.get().unwrap(), 11); } #[tokio::test] async fn take_and_deinit_on_mut() { use std::convert::Infallible; let mut target = OnceCell::::default(); assert!(target.take_and_deinit().is_none()); target .get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) }) .await .unwrap(); let again = target.take_and_deinit(); assert!(matches!(again, Some((42, _))), "{again:?}"); assert!(target.take_and_deinit().is_none()); } } ================================================ FILE: libs/utils/src/sync/spsc_fold.rs ================================================ use core::future::poll_fn; use core::task::Poll; use std::sync::{Arc, Mutex}; use 
diatomic_waker::DiatomicWaker; pub struct Sender { state: Arc>, } pub struct Receiver { state: Arc>, } struct Inner { wake_receiver: DiatomicWaker, wake_sender: DiatomicWaker, value: Mutex>, } enum State { NoData, HasData(T), TryFoldFailed, // transient state SenderWaitsForReceiverToConsume(T), SenderGone(Option), ReceiverGone, AllGone, SenderDropping, // transient state ReceiverDropping, // transient state } pub fn channel() -> (Sender, Receiver) { let inner = Inner { wake_receiver: DiatomicWaker::new(), wake_sender: DiatomicWaker::new(), value: Mutex::new(State::NoData), }; let state = Arc::new(inner); ( Sender { state: state.clone(), }, Receiver { state }, ) } #[derive(Debug, thiserror::Error)] pub enum SendError { #[error("receiver is gone")] ReceiverGone, } impl Sender { /// # Panics /// /// If `try_fold` panics, any subsequent call to `send` panic. pub async fn send(&mut self, value: T, try_fold: F) -> Result<(), SendError> where F: Fn(&mut T, T) -> Result<(), T>, { let mut value = Some(value); poll_fn(|cx| { let mut guard = self.state.value.lock().unwrap(); match &mut *guard { State::NoData => { *guard = State::HasData(value.take().unwrap()); self.state.wake_receiver.notify(); Poll::Ready(Ok(())) } State::HasData(_) => { let State::HasData(acc_mut) = &mut *guard else { unreachable!("this match arm guarantees that the guard is HasData"); }; match try_fold(acc_mut, value.take().unwrap()) { Ok(()) => { // no need to wake receiver, if it was waiting it already // got a wake-up when we transitioned from NoData to HasData Poll::Ready(Ok(())) } Err(unfoldable_value) => { value = Some(unfoldable_value); let State::HasData(acc) = std::mem::replace(&mut *guard, State::TryFoldFailed) else { unreachable!("this match arm guarantees that the guard is HasData"); }; *guard = State::SenderWaitsForReceiverToConsume(acc); // SAFETY: send is single threaded due to `&mut self` requirement, // therefore register is not concurrent. 
unsafe { self.state.wake_sender.register(cx.waker()); } Poll::Pending } } } State::SenderWaitsForReceiverToConsume(_data) => { // SAFETY: send is single threaded due to `&mut self` requirement, // therefore register is not concurrent. unsafe { self.state.wake_sender.register(cx.waker()); } Poll::Pending } State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)), State::SenderGone(_) | State::AllGone | State::SenderDropping | State::ReceiverDropping | State::TryFoldFailed => { unreachable!(); } } }) .await } } impl Drop for Sender { fn drop(&mut self) { scopeguard::defer! { self.state.wake_receiver.notify() }; let Ok(mut guard) = self.state.value.lock() else { return; }; *guard = match std::mem::replace(&mut *guard, State::SenderDropping) { State::NoData => State::SenderGone(None), State::HasData(data) | State::SenderWaitsForReceiverToConsume(data) => { State::SenderGone(Some(data)) } State::ReceiverGone => State::AllGone, State::TryFoldFailed | State::SenderGone(_) | State::AllGone | State::SenderDropping | State::ReceiverDropping => { unreachable!("unreachable state {:?}", guard.discriminant_str()) } } } } #[derive(Debug, thiserror::Error)] pub enum RecvError { #[error("sender is gone")] SenderGone, } impl Receiver { pub async fn recv(&mut self) -> Result { poll_fn(|cx| { let mut guard = self.state.value.lock().unwrap(); match &mut *guard { State::NoData => { // SAFETY: recv is single threaded due to `&mut self` requirement, // therefore register is not concurrent. 
unsafe { self.state.wake_receiver.register(cx.waker()); } Poll::Pending } guard @ State::HasData(_) | guard @ State::SenderWaitsForReceiverToConsume(_) | guard @ State::SenderGone(Some(_)) => { let data = guard .take_data() .expect("in these states, data is guaranteed to be present"); self.state.wake_sender.notify(); Poll::Ready(Ok(data)) } State::SenderGone(None) => Poll::Ready(Err(RecvError::SenderGone)), State::ReceiverGone | State::AllGone | State::SenderDropping | State::ReceiverDropping | State::TryFoldFailed => { unreachable!("unreachable state {:?}", guard.discriminant_str()); } } }) .await } } impl Drop for Receiver { fn drop(&mut self) { scopeguard::defer! { self.state.wake_sender.notify() }; let Ok(mut guard) = self.state.value.lock() else { return; }; *guard = match std::mem::replace(&mut *guard, State::ReceiverDropping) { State::NoData => State::ReceiverGone, State::HasData(_) | State::SenderWaitsForReceiverToConsume(_) => State::ReceiverGone, State::SenderGone(_) => State::AllGone, State::TryFoldFailed | State::ReceiverGone | State::AllGone | State::SenderDropping | State::ReceiverDropping => { unreachable!("unreachable state {:?}", guard.discriminant_str()) } } } } impl State { fn take_data(&mut self) -> Option { match self { State::HasData(_) => { let State::HasData(data) = std::mem::replace(self, State::NoData) else { unreachable!("this match arm guarantees that the state is HasData"); }; Some(data) } State::SenderWaitsForReceiverToConsume(_) => { let State::SenderWaitsForReceiverToConsume(data) = std::mem::replace(self, State::NoData) else { unreachable!( "this match arm guarantees that the state is SenderWaitsForReceiverToConsume" ); }; Some(data) } State::SenderGone(data) => Some(data.take().unwrap()), State::NoData | State::TryFoldFailed | State::ReceiverGone | State::AllGone | State::SenderDropping | State::ReceiverDropping => None, } } fn discriminant_str(&self) -> &'static str { match self { State::NoData => "NoData", State::HasData(_) => 
"HasData", State::TryFoldFailed => "TryFoldFailed", State::SenderWaitsForReceiverToConsume(_) => "SenderWaitsForReceiverToConsume", State::SenderGone(_) => "SenderGone", State::ReceiverGone => "ReceiverGone", State::AllGone => "AllGone", State::SenderDropping => "SenderDropping", State::ReceiverDropping => "ReceiverDropping", } } } #[cfg(test)] mod tests { use super::*; const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX); #[tokio::test] async fn test_send_recv() { let (mut sender, mut receiver) = channel(); sender .send(42, |acc, val| { *acc += val; Ok(()) }) .await .unwrap(); let received = receiver.recv().await.unwrap(); assert_eq!(received, 42); } #[tokio::test] async fn test_send_recv_with_fold() { let (mut sender, mut receiver) = channel(); sender .send(1, |acc, val| { *acc += val; Ok(()) }) .await .unwrap(); sender .send(2, |acc, val| { *acc += val; Ok(()) }) .await .unwrap(); let received = receiver.recv().await.unwrap(); assert_eq!(received, 3); } #[tokio::test(start_paused = true)] async fn test_sender_waits_for_receiver_if_try_fold_fails() { let (mut sender, mut receiver) = channel(); sender.send(23, |_, _| panic!("first send")).await.unwrap(); let send_fut = sender.send(42, |_, val| Err(val)); let mut send_fut = std::pin::pin!(send_fut); tokio::select! { _ = tokio::time::sleep(FOREVER) => {}, _ = &mut send_fut => { panic!("send should not complete"); }, } let val = receiver.recv().await.unwrap(); assert_eq!(val, 23); tokio::select! 
{ _ = tokio::time::sleep(FOREVER) => { panic!("receiver should have consumed the value"); }, _ = &mut send_fut => { }, } let val = receiver.recv().await.unwrap(); assert_eq!(val, 42); } #[tokio::test(start_paused = true)] async fn test_sender_errors_if_waits_for_receiver_and_receiver_drops() { let (mut sender, receiver) = channel(); sender.send(23, |_, _| unreachable!()).await.unwrap(); let send_fut = sender.send(42, |_, val| Err(val)); let send_fut = std::pin::pin!(send_fut); drop(receiver); let result = send_fut.await; assert!(matches!(result, Err(SendError::ReceiverGone))); } #[tokio::test(start_paused = true)] async fn test_receiver_errors_if_waits_for_sender_and_sender_drops() { let (sender, mut receiver) = channel::<()>(); let recv_fut = receiver.recv(); let recv_fut = std::pin::pin!(recv_fut); drop(sender); let result = recv_fut.await; assert!(matches!(result, Err(RecvError::SenderGone))); } #[tokio::test(start_paused = true)] async fn test_receiver_errors_if_waits_for_sender_and_sender_drops_with_data() { let (mut sender, mut receiver) = channel(); sender.send(42, |_, _| unreachable!()).await.unwrap(); { let recv_fut = receiver.recv(); let recv_fut = std::pin::pin!(recv_fut); drop(sender); let val = recv_fut.await.unwrap(); assert_eq!(val, 42); } let result = receiver.recv().await; assert!(matches!(result, Err(RecvError::SenderGone))); } #[tokio::test(start_paused = true)] async fn test_receiver_waits_for_sender_if_no_data() { let (mut sender, mut receiver) = channel(); let recv_fut = receiver.recv(); let mut recv_fut = std::pin::pin!(recv_fut); tokio::select! 
{ _ = tokio::time::sleep(FOREVER) => {}, _ = &mut recv_fut => { panic!("recv should not complete"); }, } sender.send(42, |_, _| Ok(())).await.unwrap(); let val = recv_fut.await.unwrap(); assert_eq!(val, 42); } #[tokio::test] async fn test_receiver_gone_while_nodata() { let (mut sender, receiver) = channel(); drop(receiver); let result = sender.send(42, |_, _| Ok(())).await; assert!(matches!(result, Err(SendError::ReceiverGone))); } #[tokio::test] async fn test_sender_gone_while_nodata() { let (sender, mut receiver) = super::channel::(); drop(sender); let result = receiver.recv().await; assert!(matches!(result, Err(RecvError::SenderGone))); } #[tokio::test(start_paused = true)] async fn test_receiver_drops_after_sender_went_to_sleep() { let (mut sender, receiver) = channel(); let state = receiver.state.clone(); sender.send(23, |_, _| unreachable!()).await.unwrap(); let send_task = tokio::spawn(async move { sender.send(42, |_, v| Err(v)).await }); tokio::time::sleep(FOREVER).await; assert!(matches!( &*state.value.lock().unwrap(), &State::SenderWaitsForReceiverToConsume(_) )); drop(receiver); let err = send_task .await .unwrap() .expect_err("should unblock immediately"); assert!(matches!(err, SendError::ReceiverGone)); } #[tokio::test(start_paused = true)] async fn test_sender_drops_after_receiver_went_to_sleep() { let (sender, mut receiver) = channel::(); let state = sender.state.clone(); let recv_task = tokio::spawn(async move { receiver.recv().await }); tokio::time::sleep(FOREVER).await; assert!(matches!(&*state.value.lock().unwrap(), &State::NoData)); drop(sender); let err = recv_task.await.unwrap().expect_err("should error"); assert!(matches!(err, RecvError::SenderGone)); } #[tokio::test(start_paused = true)] async fn test_receiver_drop_while_waiting_for_receiver_to_consume_unblocks_sender() { let (mut sender, receiver) = channel(); let state = receiver.state.clone(); sender.send((), |_, _| unreachable!()).await.unwrap(); 
assert!(matches!(&*state.value.lock().unwrap(), &State::HasData(_))); let unmergeable = sender.send((), |_, _| Err(())); let mut unmergeable = std::pin::pin!(unmergeable); tokio::select! { _ = tokio::time::sleep(FOREVER) => {}, _ = &mut unmergeable => { panic!("unmergeable should not complete"); }, } assert!(matches!( &*state.value.lock().unwrap(), &State::SenderWaitsForReceiverToConsume(_) )); drop(receiver); assert!(matches!( &*state.value.lock().unwrap(), &State::ReceiverGone )); unmergeable.await.unwrap_err(); } } ================================================ FILE: libs/utils/src/sync.rs ================================================ pub mod heavier_once_cell; pub mod duplex; pub mod gate; pub mod spsc_fold; ================================================ FILE: libs/utils/src/tcp_listener.rs ================================================ use std::io; use std::net::{TcpListener, ToSocketAddrs}; use nix::sys::socket::setsockopt; use nix::sys::socket::sockopt::ReuseAddr; /// Bind a [`TcpListener`] to addr with `SO_REUSEADDR` set to true. pub fn bind(addr: A) -> io::Result { let listener = TcpListener::bind(addr)?; setsockopt(&listener, ReuseAddr, &true)?; Ok(listener) } ================================================ FILE: libs/utils/src/timeout.rs ================================================ use std::time::Duration; use tokio_util::sync::CancellationToken; #[derive(thiserror::Error, Debug)] pub enum TimeoutCancellableError { #[error("Timed out")] Timeout, #[error("Cancelled")] Cancelled, } /// Wrap [`tokio::time::timeout`] with a CancellationToken. /// /// This wrapper is appropriate for any long running operation in a task /// that ought to respect a CancellationToken (which means most tasks). /// /// The only time you should use a bare tokio::timeout is when the future `F` /// itself respects a CancellationToken: otherwise, always use this wrapper /// with your CancellationToken to ensure that your task does not hold up /// graceful shutdown. 
///
/// # Errors
///
/// Returns [`TimeoutCancellableError::Timeout`] if `duration` elapses before `future`
/// completes, and [`TimeoutCancellableError::Cancelled`] if `cancel` is cancelled first.
pub async fn timeout_cancellable<F>(
    duration: Duration,
    cancel: &CancellationToken,
    future: F,
) -> Result<F::Output, TimeoutCancellableError>
where
    F: std::future::Future,
{
    tokio::select!(
        r = tokio::time::timeout(duration, future) => {
            r.map_err(|_| TimeoutCancellableError::Timeout)
        },
        _ = cancel.cancelled() => {
            Err(TimeoutCancellableError::Cancelled)
        }
    )
}

================================================
FILE: libs/utils/src/toml_edit_ext.rs
================================================
#[derive(Debug, thiserror::Error)]
pub enum Error {
    #[error("item is not a document")]
    ItemIsNotADocument,
    #[error(transparent)]
    Serde(toml_edit::de::Error),
}

/// Deserializes a `T` from a TOML item that is a table or an inline table.
///
/// # Errors
///
/// Returns [`Error::ItemIsNotADocument`] for any other kind of item, and [`Error::Serde`] if
/// deserialization of the table fails.
pub fn deserialize_item<T>(item: &toml_edit::Item) -> Result<T, Error>
where
    T: serde::de::DeserializeOwned,
{
    // The deserializer works on whole documents, so wrap a copy of the table in one.
    let document: toml_edit::DocumentMut = match item {
        toml_edit::Item::Table(toml) => toml.clone().into(),
        toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
            toml.clone().into_table().into()
        }
        _ => return Err(Error::ItemIsNotADocument),
    };

    toml_edit::de::from_document(document).map_err(Error::Serde)
}

================================================
FILE: libs/utils/src/tracing_span_assert.rs
================================================
//! Assert that the current [`tracing::Span`] has a given set of fields.
//!
//! Can only produce meaningful positive results when tracing has been configured as in example.
//! Absence of `tracing_error::ErrorLayer` is not detected yet.
//!
//! Crates built with the `testing` feature will get a pass when using the
//! `check_fields_present` macro in case tracing is completely unconfigured.
//!
//! # Usage
//!
//! ```rust
//! # fn main() {
//! use tracing_subscriber::prelude::*;
//! let registry = tracing_subscriber::registry()
//!    .with(tracing_error::ErrorLayer::default());
//!
//! // Register the registry as the global subscriber.
//! // In this example, we'll only use it as a thread-local subscriber.
//! let _guard = tracing::subscriber::set_default(registry);
//!
//! // Then, in the main code:
//!
//!
let span = tracing::info_span!("TestSpan", tenant_id = 1); //! let _guard = span.enter(); //! //! // ... down the call stack //! //! use utils::tracing_span_assert::{check_fields_present, ConstExtractor}; //! let extractor = ConstExtractor::new("tenant_id"); //! if let Err(missing) = check_fields_present!([&extractor]) { //! // if you copypaste this to a custom assert method, remember to add #[track_caller] //! // to get the "user" code location for the panic. //! panic!("Missing fields: {missing:?}"); //! } //! # } //! ``` //! //! Recommended reading: //! #[derive(Debug)] pub enum ExtractionResult { Present, Absent, } pub trait Extractor: Send + Sync + std::fmt::Debug { fn id(&self) -> &str; fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult; } #[derive(Debug)] pub struct ConstExtractor { field_name: &'static str, } impl ConstExtractor { pub const fn new(field_name: &'static str) -> ConstExtractor { ConstExtractor { field_name } } } impl Extractor for ConstExtractor { fn id(&self) -> &str { self.field_name } fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult { if fields.iter().any(|f| f.name() == self.field_name) { ExtractionResult::Present } else { ExtractionResult::Absent } } } /// Checks that the given extractors are satisfied with the current span hierarchy. /// /// This should not be called directly, but used through [`check_fields_present`] which allows /// `Summary::Unconfigured` only when the calling crate is being `#[cfg(test)]` as a conservative default. #[doc(hidden)] pub fn check_fields_present0( must_be_present: [&dyn Extractor; L], ) -> Result> { let mut missing = must_be_present.into_iter().collect::>(); let trace = tracing_error::SpanTrace::capture(); trace.with_spans(|md, _formatted_fields| { // when trying to understand the inner workings of how does the matching work, note that // this closure might be called zero times if the span is disabled. normally it is called // once per span hierarchy level. 
missing.retain(|extractor| match extractor.extract(md.fields()) { ExtractionResult::Present => false, ExtractionResult::Absent => true, }); // continue walking up until we've found all missing !missing.is_empty() }); if missing.is_empty() { Ok(Summary::FoundEverything) } else if !tracing_subscriber_configured() { Ok(Summary::Unconfigured) } else { // we can still hit here if a tracing subscriber has been configured but the ErrorLayer is // missing, which can be annoying. for this case, we could probably use // SpanTrace::status(). // // another way to end up here is with RUST_LOG=pageserver=off while configuring the // logging, though I guess in that case the SpanTrace::status() == EMPTY would be valid. // this case is covered by test `not_found_if_tracing_error_subscriber_has_wrong_filter`. Err(missing) } } /// Checks that the given extractors are satisfied with the current span hierarchy. /// /// The macro is the preferred way of checking if fields exist while passing checks if a test does /// not have tracing configured. /// /// Why mangled name? Because #[macro_export] will expose it at utils::__check_fields_present. /// However we can game a module namespaced macro for `use` purposes by re-exporting the /// #[macro_export] exported name with an alias (below). #[doc(hidden)] #[macro_export] macro_rules! __check_fields_present { ($extractors:expr) => {{ { use $crate::tracing_span_assert::{check_fields_present0, Summary::*, Extractor}; match check_fields_present0($extractors) { Ok(FoundEverything) => Ok(()), Ok(Unconfigured) if cfg!(feature = "testing") => { // allow unconfigured in tests Ok(()) }, Ok(Unconfigured) => { panic!(r#"utils::tracing_span_assert: outside of #[cfg(feature = "testing")] expected tracing to be configured with tracing_error::ErrorLayer"#) }, Err(missing) => Err(missing) } } }} } pub use crate::__check_fields_present as check_fields_present; /// Explanation for why the check was deemed ok. 
/// /// Mainly useful for testing, or configuring per-crate behaviour as in with /// [`check_fields_present`]. #[derive(Debug)] pub enum Summary { /// All extractors were found. /// /// Should only happen when tracing is properly configured. FoundEverything, /// Tracing has not been configured at all. This is ok for tests running without tracing set /// up. Unconfigured, } fn tracing_subscriber_configured() -> bool { let mut noop_configured = false; tracing::dispatcher::get_default(|d| { // it is possible that this closure will not be invoked, but the current implementation // always invokes it noop_configured = d.is::(); }); !noop_configured } #[cfg(test)] mod tests { use std::collections::HashSet; use std::fmt::{self}; use std::hash::{Hash, Hasher}; use tracing_subscriber::prelude::*; use super::*; struct MemoryIdentity<'a>(&'a dyn Extractor); impl MemoryIdentity<'_> { fn as_ptr(&self) -> *const () { self.0 as *const _ as *const () } } impl PartialEq for MemoryIdentity<'_> { fn eq(&self, other: &Self) -> bool { self.as_ptr() == other.as_ptr() } } impl Eq for MemoryIdentity<'_> {} impl Hash for MemoryIdentity<'_> { fn hash(&self, state: &mut H) { self.as_ptr().hash(state); } } impl fmt::Debug for MemoryIdentity<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:p}: {}", self.as_ptr(), self.0.id()) } } struct Setup { _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard, tenant_extractor: ConstExtractor, timeline_extractor: ConstExtractor, } fn setup_current_thread() -> Setup { let tenant_extractor = ConstExtractor::new("tenant_id"); let timeline_extractor = ConstExtractor::new("timeline_id"); let registry = tracing_subscriber::registry() .with(tracing_subscriber::fmt::layer()) .with(tracing_error::ErrorLayer::default()); let guard = tracing::subscriber::set_default(registry); Setup { _current_thread_subscriber_guard: guard, tenant_extractor, timeline_extractor, } } fn assert_missing(missing: Vec<&dyn Extractor>, 
expected: Vec<&dyn Extractor>) { let missing: HashSet = HashSet::from_iter(missing.into_iter().map(MemoryIdentity)); let expected: HashSet = HashSet::from_iter(expected.into_iter().map(MemoryIdentity)); assert_eq!(missing, expected); } #[test] fn positive_one_level() { let setup = setup_current_thread(); let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1"); let _guard = span.enter(); let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]); assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}"); } #[test] fn negative_one_level() { let setup = setup_current_thread(); let span = tracing::info_span!("root", timeline_id = "timeline-1"); let _guard = span.enter(); let missing = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]) .unwrap_err(); assert_missing(missing, vec![&setup.tenant_extractor]); } #[test] fn positive_multiple_levels() { let setup = setup_current_thread(); let span = tracing::info_span!("root"); let _guard = span.enter(); let span = tracing::info_span!("child", tenant_id = "tenant-1"); let _guard = span.enter(); let span = tracing::info_span!("grandchild", timeline_id = "timeline-1"); let _guard = span.enter(); let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]); assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}"); } #[test] fn negative_multiple_levels() { let setup = setup_current_thread(); let span = tracing::info_span!("root"); let _guard = span.enter(); let span = tracing::info_span!("child", timeline_id = "timeline-1"); let _guard = span.enter(); let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err(); assert_missing(missing, vec![&setup.tenant_extractor]); } #[test] fn positive_subset_one_level() { let setup = setup_current_thread(); let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1"); let _guard = span.enter(); let res = 
check_fields_present0([&setup.tenant_extractor]); assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}"); } #[test] fn positive_subset_multiple_levels() { let setup = setup_current_thread(); let span = tracing::info_span!("root"); let _guard = span.enter(); let span = tracing::info_span!("child", tenant_id = "tenant-1"); let _guard = span.enter(); let span = tracing::info_span!("grandchild", timeline_id = "timeline-1"); let _guard = span.enter(); let res = check_fields_present0([&setup.tenant_extractor]); assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}"); } #[test] fn negative_subset_one_level() { let setup = setup_current_thread(); let span = tracing::info_span!("root", timeline_id = "timeline-1"); let _guard = span.enter(); let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err(); assert_missing(missing, vec![&setup.tenant_extractor]); } #[test] fn negative_subset_multiple_levels() { let setup = setup_current_thread(); let span = tracing::info_span!("root"); let _guard = span.enter(); let span = tracing::info_span!("child", timeline_id = "timeline-1"); let _guard = span.enter(); let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err(); assert_missing(missing, vec![&setup.tenant_extractor]); } #[test] fn tracing_error_subscriber_not_set_up_straight_line() { // no setup let span = tracing::info_span!("foo", e = "some value"); let _guard = span.enter(); let extractor = ConstExtractor::new("e"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); // similarly for a not found key let extractor = ConstExtractor::new("foobar"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); } #[test] fn tracing_error_subscriber_not_set_up_with_instrument() { // no setup // demo a case where span entering is used to establish a parent child connection, but // when we re-enter the subspan SpanTrace::with_spans 
iterates over nothing. let span = tracing::info_span!("foo", e = "some value"); let _guard = span.enter(); let subspan = tracing::info_span!("bar", f = "foobar"); drop(_guard); // normally this would work, but without any tracing-subscriber configured, both // check_field_present find nothing let _guard = subspan.enter(); let extractors: [&dyn Extractor; 2] = [&ConstExtractor::new("e"), &ConstExtractor::new("f")]; let res = check_fields_present0(extractors); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); // similarly for a not found key let extractor = ConstExtractor::new("g"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); } #[test] fn tracing_subscriber_configured() { // this will fail if any utils::logging::init callers appear, but let's hope they do not // appear. assert!(!super::tracing_subscriber_configured()); let _g = setup_current_thread(); assert!(super::tracing_subscriber_configured()); } #[test] fn not_found_when_disabled_by_filter() { let r = tracing_subscriber::registry().with({ tracing_error::ErrorLayer::default().with_filter(tracing_subscriber::filter::filter_fn( |md| !(md.is_span() && *md.level() == tracing::Level::INFO), )) }); let _guard = tracing::subscriber::set_default(r); // this test is a rather tricky one, it has a number of possible outcomes depending on the // execution order when executed with other tests even if no test sets the global default // subscriber. let span = tracing::info_span!("foo", e = "some value"); let _guard = span.enter(); let extractors: [&dyn Extractor; 1] = [&ConstExtractor::new("e")]; if span.is_disabled() { // the tests are running single threaded, or we got lucky and no other tests subscriber // was got to register their per-CALLSITE::META interest between `set_default` and // creation of the span, thus the filter got to apply and registered interest of Never, // so the span was never created. 
// // as the span is disabled, no keys were recorded to it, leading check_fields_present0 // to find an error. let missing = check_fields_present0(extractors).unwrap_err(); assert_missing(missing, vec![extractors[0]]); } else { // when the span is enabled, it is because some other test is running at the same time, // and that tests registry has filters which are interested in our above span. // // because the span is now enabled, all keys will be found for it. the // tracing_error::SpanTrace does not consider layer filters during the span hierarchy // walk (SpanTrace::with_spans), nor is the SpanTrace::status a reliable indicator in // this test-induced issue. let res = check_fields_present0(extractors); assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}"); } } } ================================================ FILE: libs/utils/src/try_rcu.rs ================================================ //! Try RCU extension lifted from pub trait ArcSwapExt { /// [`ArcSwap::rcu`](arc_swap::ArcSwap::rcu), but with Result that short-circuits on error. 
fn try_rcu(&self, f: F) -> Result where F: FnMut(&T) -> Result, R: Into; } impl ArcSwapExt for arc_swap::ArcSwapAny where T: arc_swap::RefCnt, S: arc_swap::strategy::CaS, { fn try_rcu(&self, mut f: F) -> Result where F: FnMut(&T) -> Result, R: Into, { fn ptr_eq(a: A, b: B) -> bool where A: arc_swap::AsRaw, B: arc_swap::AsRaw, { let a = a.as_raw(); let b = b.as_raw(); std::ptr::eq(a, b) } let mut cur = self.load(); loop { let new = f(&cur)?.into(); let prev = self.compare_and_swap(&*cur, new); let swapped = ptr_eq(&*cur, &*prev); if swapped { return Ok(arc_swap::Guard::into_inner(prev)); } else { cur = prev; } } } } #[cfg(test)] mod tests { use std::sync::Arc; use arc_swap::ArcSwap; use super::*; #[test] fn test_try_rcu_success() { let swap = ArcSwap::from(Arc::new(42)); let result = swap.try_rcu(|value| -> Result<_, String> { Ok(**value + 1) }); assert!(result.is_ok()); assert_eq!(**swap.load(), 43); } #[test] fn test_try_rcu_error() { let swap = ArcSwap::from(Arc::new(42)); let result = swap.try_rcu(|value| -> Result { if **value == 42 { Err("err") } else { Ok(**value + 1) } }); assert!(result.is_err()); assert_eq!(result.unwrap_err(), "err"); assert_eq!(**swap.load(), 42); } } ================================================ FILE: libs/utils/src/vec_map.rs ================================================ use std::alloc::Layout; use std::cmp::Ordering; use std::ops::RangeBounds; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum VecMapOrdering { Greater, GreaterOrEqual, } /// Ordered map datastructure implemented in a Vec. /// /// Append only - can only add keys that are larger than the /// current max key. /// Ordering can be adjusted using [`VecMapOrdering`] /// during `VecMap` construction. 
#[derive(Clone, Debug)]
pub struct VecMap<K, V> {
    // Entries in insertion order; the append/extend paths only accept keys
    // that satisfy `ordering` relative to the current last key.
    data: Vec<(K, V)>,
    // Whether keys must be strictly increasing or may repeat.
    ordering: VecMapOrdering,
}

impl<K, V> Default for VecMap<K, V> {
    /// An empty map that requires strictly increasing keys.
    fn default() -> Self {
        VecMap {
            data: Default::default(),
            ordering: VecMapOrdering::Greater,
        }
    }
}

#[derive(thiserror::Error, Debug)]
pub enum VecMapError {
    #[error("Key violates ordering constraint")]
    InvalidKey,
    #[error("Mismatched ordering constraints")]
    ExtendOrderingError,
}

impl<K: Ord, V> VecMap<K, V> {
    /// Create an empty map with the given key ordering constraint.
    pub fn new(ordering: VecMapOrdering) -> Self {
        Self {
            data: Vec::new(),
            ordering,
        }
    }

    /// Create an empty map with room for `capacity` entries and the given
    /// key ordering constraint.
    pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
        Self {
            data: Vec::with_capacity(capacity),
            ordering,
        }
    }

    /// Returns `true` if the map holds no entries.
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }

    /// View all entries, in key order, as a slice.
    pub fn as_slice(&self) -> &[(K, V)] {
        self.data.as_slice()
    }

    /// Returns the entries whose keys fall within `range`.
    ///
    /// This function may panic if given a range where the lower bound is
    /// greater than the upper bound.
    pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
        use std::ops::Bound::*;

        let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key);

        // Index of the first entry inside the range.
        let start_idx = match range.start_bound() {
            Unbounded => 0,
            Included(k) => binary_search(k).unwrap_or_else(std::convert::identity),
            Excluded(k) => match binary_search(k) {
                Ok(idx) => idx + 1,
                Err(idx) => idx,
            },
        };

        // Index one past the last entry inside the range.
        let end_idx = match range.end_bound() {
            Unbounded => self.data.len(),
            Included(k) => match binary_search(k) {
                Ok(idx) => idx + 1,
                Err(idx) => idx,
            },
            Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
        };

        &self.data[start_idx..end_idx]
    }

    /// Add a key value pair to the map.
    /// If `key` is not respective of the `self` ordering the
    /// pair will not be added and `InvalidKey` error will be returned.
    ///
    /// On success returns the growth, in bytes, of the backing allocation.
    pub fn append(&mut self, key: K, value: V) -> Result<usize, VecMapError> {
        self.validate_key_order(&key)?;

        let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
        Ok(delta_size)
    }

    /// Update the maximum key value pair or add a new key value pair to the map.
    /// If `key` is not respective of the `self` ordering no updates or additions
    /// will occur and `InvalidKey` error will be returned.
    ///
    /// Returns the replaced value (if the last key was equal to `key`) and the
    /// growth, in bytes, of the backing allocation.
    pub fn append_or_update_last(
        &mut self,
        key: K,
        mut value: V,
    ) -> Result<(Option<V>, usize), VecMapError> {
        if let Some((last_key, last_value)) = self.data.last_mut() {
            match key.cmp(last_key) {
                Ordering::Less => return Err(VecMapError::InvalidKey),
                Ordering::Equal => {
                    // Replace in place; `value` now holds the previous value.
                    std::mem::swap(last_value, &mut value);
                    const DELTA_SIZE: usize = 0;
                    return Ok((Some(value), DELTA_SIZE));
                }
                Ordering::Greater => {}
            }
        }

        let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
        Ok((None, delta_size))
    }

    /// Move items from `other` to the end of `self`, leaving `other` empty.
    /// If the `other` ordering is different from `self` ordering
    /// `ExtendOrderingError` error will be returned.
    /// If any keys in `other` is not respective of the ordering defined in
    /// `self`, `InvalidKey` error will be returned and no mutation will occur.
    ///
    /// On success returns the growth, in bytes, of the backing allocation.
    pub fn extend(&mut self, other: &mut Self) -> Result<usize, VecMapError> {
        if self.ordering != other.ordering {
            return Err(VecMapError::ExtendOrderingError);
        }

        // `other` already satisfies the same ordering internally, so it is
        // enough to check its *first* (smallest) key against our last key.
        // Checking its last key would let e.g. `[3]` + `[0, 5]` through and
        // break the sort order that `slice_range` relies on.
        let other_first_opt = other.data.first().map(extract_key);
        if let Some(other_first) = other_first_opt {
            self.validate_key_order(other_first)?;
        }

        let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data));
        Ok(delta_size)
    }

    /// Validate the current last key in `self` and key being
    /// inserted against the order defined in `self`.
    fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> {
        if let Some(last_key) = self.data.last().map(extract_key) {
            match (&self.ordering, &key.cmp(last_key)) {
                (VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => {
                    return Err(VecMapError::InvalidKey);
                }
                (VecMapOrdering::Greater, Ordering::Greater) => {}
                (VecMapOrdering::GreaterOrEqual, Ordering::Less) => {
                    return Err(VecMapError::InvalidKey);
                }
                (VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {}
            }
        }

        Ok(())
    }

    /// Instrument an operation on the underlying [`Vec`].
    /// Will panic if the operation decreases capacity.
    /// Returns the increase in memory usage caused by the op.
    fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
        let old_cap = self.data.capacity();
        op(&mut self.data);
        let new_cap = self.data.capacity();

        match old_cap.cmp(&new_cap) {
            Ordering::Less => {
                let old_size = Layout::array::<(K, V)>(old_cap).unwrap().size();
                let new_size = Layout::array::<(K, V)>(new_cap).unwrap().size();
                new_size - old_size
            }
            Ordering::Equal => 0,
            Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"),
        }
    }

    /// Similar to `from_iter` defined in `FromIter` trait except
    /// that it accepts an [`VecMapOrdering`]
    ///
    /// Panics if the items are not sorted according to `ordering`.
    pub fn from_iter<I: IntoIterator<Item = (K, V)>>(iter: I, ordering: VecMapOrdering) -> Self {
        let iter = iter.into_iter();
        // Prefer the upper bound of the size hint when the iterator has one.
        let initial_capacity = {
            match iter.size_hint() {
                (lower_bound, None) => lower_bound,
                (_, Some(upper_bound)) => upper_bound,
            }
        };

        let mut vec_map = VecMap::with_capacity(initial_capacity, ordering);
        for (key, value) in iter {
            vec_map
                .append(key, value)
                .expect("The passed collection needs to be sorted!");
        }

        vec_map
    }
}

impl<K: Ord, V> IntoIterator for VecMap<K, V> {
    type Item = (K, V);
    type IntoIter = std::vec::IntoIter<(K, V)>;

    fn into_iter(self) -> Self::IntoIter {
        self.data.into_iter()
    }
}

/// Borrow the key half of a `(key, value)` entry.
fn extract_key<K, V>(entry: &(K, V)) -> &K {
    &entry.0
}

#[cfg(test)]
mod tests {
    use std::collections::BTreeMap;
    use std::ops::Bound;

    use super::{VecMap, VecMapOrdering};

    #[test]
    fn
unbounded_range() { let mut vec = VecMap::default(); vec.append(0, ()).unwrap(); assert_eq!(vec.slice_range(0..0), &[]); } #[test] #[should_panic] fn invalid_ordering_range() { let mut vec = VecMap::default(); vec.append(0, ()).unwrap(); #[allow(clippy::reversed_empty_ranges)] vec.slice_range(1..0); } #[test] fn range_tests() { let mut vec = VecMap::default(); vec.append(0, ()).unwrap(); vec.append(2, ()).unwrap(); vec.append(4, ()).unwrap(); assert_eq!(vec.slice_range(0..0), &[]); assert_eq!(vec.slice_range(0..1), &[(0, ())]); assert_eq!(vec.slice_range(0..2), &[(0, ())]); assert_eq!(vec.slice_range(0..3), &[(0, ()), (2, ())]); assert_eq!(vec.slice_range(..0), &[]); assert_eq!(vec.slice_range(..1), &[(0, ())]); assert_eq!(vec.slice_range(..3), &[(0, ()), (2, ())]); assert_eq!(vec.slice_range(..3), &[(0, ()), (2, ())]); assert_eq!(vec.slice_range(0..=0), &[(0, ())]); assert_eq!(vec.slice_range(0..=1), &[(0, ())]); assert_eq!(vec.slice_range(0..=2), &[(0, ()), (2, ())]); assert_eq!(vec.slice_range(0..=3), &[(0, ()), (2, ())]); assert_eq!(vec.slice_range(..=0), &[(0, ())]); assert_eq!(vec.slice_range(..=1), &[(0, ())]); assert_eq!(vec.slice_range(..=2), &[(0, ()), (2, ())]); assert_eq!(vec.slice_range(..=3), &[(0, ()), (2, ())]); } struct BoundIter { min: i32, max: i32, next: Option>, } impl BoundIter { fn new(min: i32, max: i32) -> Self { Self { min, max, next: Some(Bound::Unbounded), } } } impl Iterator for BoundIter { type Item = Bound; fn next(&mut self) -> Option { let cur = self.next?; self.next = match &cur { Bound::Unbounded => Some(Bound::Included(self.min)), Bound::Included(x) => { if *x >= self.max { Some(Bound::Excluded(self.min)) } else { Some(Bound::Included(x + 1)) } } Bound::Excluded(x) => { if *x >= self.max { None } else { Some(Bound::Excluded(x + 1)) } } }; Some(cur) } } #[test] fn range_exhaustive() { let map: BTreeMap = (1..=7).step_by(2).map(|x| (x, ())).collect(); let mut vec = VecMap::default(); for &key in map.keys() { vec.append(key, 
()).unwrap(); } const RANGE_MIN: i32 = 0; const RANGE_MAX: i32 = 8; for lower_bound in BoundIter::new(RANGE_MIN, RANGE_MAX) { let ub_min = match lower_bound { Bound::Unbounded => RANGE_MIN, Bound::Included(x) => x, Bound::Excluded(x) => x + 1, }; for upper_bound in BoundIter::new(ub_min, RANGE_MAX) { let map_range: Vec<(i32, ())> = map .range((lower_bound, upper_bound)) .map(|(&x, _)| (x, ())) .collect(); let vec_slice = vec.slice_range((lower_bound, upper_bound)); assert_eq!(map_range, vec_slice); } } } #[test] fn extend() { let mut left = VecMap::default(); left.append(0, ()).unwrap(); assert_eq!(left.as_slice(), &[(0, ())]); let mut empty = VecMap::default(); left.extend(&mut empty).unwrap(); assert_eq!(left.as_slice(), &[(0, ())]); assert_eq!(empty.as_slice(), &[]); let mut right = VecMap::default(); right.append(1, ()).unwrap(); left.extend(&mut right).unwrap(); assert_eq!(left.as_slice(), &[(0, ()), (1, ())]); assert_eq!(right.as_slice(), &[]); let mut zero_map = VecMap::default(); zero_map.append(0, ()).unwrap(); left.extend(&mut zero_map).unwrap_err(); assert_eq!(left.as_slice(), &[(0, ()), (1, ())]); assert_eq!(zero_map.as_slice(), &[(0, ())]); let mut one_map = VecMap::default(); one_map.append(1, ()).unwrap(); left.extend(&mut one_map).unwrap_err(); assert_eq!(left.as_slice(), &[(0, ()), (1, ())]); assert_eq!(one_map.as_slice(), &[(1, ())]); let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual); map_greater_or_equal.append(2, ()).unwrap(); map_greater_or_equal.append(2, ()).unwrap(); left.extend(&mut map_greater_or_equal).unwrap_err(); assert_eq!(left.as_slice(), &[(0, ()), (1, ())]); assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]); } #[test] fn extend_with_ordering() { let mut left = VecMap::new(VecMapOrdering::GreaterOrEqual); left.append(0, ()).unwrap(); assert_eq!(left.as_slice(), &[(0, ())]); let mut greater_right = VecMap::new(VecMapOrdering::Greater); greater_right.append(0, ()).unwrap(); left.extend(&mut 
greater_right).unwrap_err(); assert_eq!(left.as_slice(), &[(0, ())]); let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual); greater_or_equal_right.append(2, ()).unwrap(); greater_or_equal_right.append(2, ()).unwrap(); left.extend(&mut greater_or_equal_right).unwrap(); assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]); } #[test] fn vec_map_from_sorted() { let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())]; let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater); assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]); let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]; let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual); assert_eq!( vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())] ); } #[test] #[should_panic] fn vec_map_from_unsorted_greater() { let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())]; let _ = VecMap::from_iter(vec, VecMapOrdering::Greater); } #[test] #[should_panic] fn vec_map_from_unsorted_greater_or_equal() { let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())]; let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual); } } ================================================ FILE: libs/utils/src/yielding_loop.rs ================================================ use tokio_util::sync::CancellationToken; #[derive(thiserror::Error, Debug)] pub enum YieldingLoopError { #[error("Cancelled")] Cancelled, } /// Helper for long synchronous loops, e.g. over all tenants in the system. /// /// Periodically yields to avoid blocking the executor, and after resuming /// checks the provided cancellation token to drop out promptly on shutdown. 
///
/// `visitor` is called once per item; after every `interval` items the task
/// yields and then checks `cancel`. An `interval` of zero is treated as one
/// (yield after every item) rather than panicking on a modulo by zero.
#[inline(always)]
pub async fn yielding_loop<I, T, F>(
    interval: usize,
    cancel: &CancellationToken,
    iter: I,
    mut visitor: F,
) -> Result<(), YieldingLoopError>
where
    I: Iterator<Item = T>,
    F: FnMut(T),
{
    // Guard against `% 0`, which would panic on the first item.
    let interval = interval.max(1);

    for (i, item) in iter.enumerate() {
        visitor(item);

        if (i + 1) % interval == 0 {
            tokio::task::yield_now().await;
            if cancel.is_cancelled() {
                return Err(YieldingLoopError::Cancelled);
            }
        }
    }

    Ok(())
}

================================================
FILE: libs/utils/src/zstd.rs
================================================
use std::io::SeekFrom;

use anyhow::{Context, Result};
use async_compression::Level;
use async_compression::tokio::bufread::ZstdDecoder;
use async_compression::tokio::write::ZstdEncoder;
use async_compression::zstd::CParameter;
use camino::Utf8Path;
use nix::NixPath;
use tokio::fs::{File, OpenOptions};
use tokio::io::{AsyncBufRead, AsyncSeekExt, AsyncWriteExt};
use tokio_tar::{Archive, Builder, HeaderMode};
use walkdir::WalkDir;

/// Creates a Zstandard tarball.
///
/// Packs the files and directories found under `path` into a zstd-compressed
/// tar archive written to `tarball` (created or truncated). Returns the
/// archive file, rewound to the start, together with its compressed length
/// in bytes.
pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
    let file = OpenOptions::new()
        .create(true)
        .truncate(true)
        .read(true)
        .write(true)
        .open(&tarball)
        .await
        .with_context(|| format!("tempfile creation {tarball}"))?;

    let mut paths = Vec::new();
    for entry in WalkDir::new(path) {
        let entry = entry?;
        // Report metadata failures as an error instead of panicking; this
        // function already returns `Result`.
        let metadata = entry.metadata().with_context(|| {
            format!(
                "error getting dir entry metadata for {}",
                entry.path().display()
            )
        })?;
        // Also allow directories so that we also get empty directories
        if !(metadata.is_file() || metadata.is_dir()) {
            continue;
        }
        let path = entry.into_path();
        paths.push(path);
    }
    // Do a sort to get a more consistent listing
    paths.sort_unstable();
    let zstd = ZstdEncoder::with_quality_and_params(
        file,
        Level::Default,
        &[CParameter::enable_long_distance_matching(true)],
    );
    let mut builder = Builder::new(zstd);
    // Use reproducible header mode
    builder.mode(HeaderMode::Deterministic);
    for p in paths {
        let rel_path = p.strip_prefix(path)?;
        if rel_path.is_empty() {
            // The top directory should not be compressed,
            // the tar crate doesn't like that
            continue;
        }
        builder.append_path_with_name(&p, rel_path).await?;
    }
    let mut zstd = builder.into_inner().await?;
    zstd.shutdown().await?;
    let mut compressed = zstd.into_inner();
    let compressed_len = compressed.metadata().await?.len();
    // Rewind so the caller can read the archive from the beginning.
    compressed.seek(SeekFrom::Start(0)).await?;
    Ok((compressed, compressed_len))
}

/// Extracts a Zstandard tarball.
///
/// Decompresses `tarball` and unpacks the contained tar archive into `path`.
pub async fn extract_zst_tarball(
    path: &Utf8Path,
    tarball: impl AsyncBufRead + Unpin,
) -> Result<()> {
    let decoder = Box::pin(ZstdDecoder::new(tarball));
    let mut archive = Archive::new(decoder);
    archive.unpack(path).await?;
    Ok(())
}

================================================
FILE: libs/utils/tests/bin_ser_test.rs
================================================
use std::io::Read;

use bytes::{Buf, BytesMut};
use hex_literal::hex;
use serde::Deserialize;
use utils::bin_ser::LeSer;

#[derive(Debug, PartialEq, Eq, Deserialize)]
pub struct HeaderData {
    magic: u16,
    info: u16,
    tli: u32,
    pageaddr: u64,
    len: u32,
}

// A manual implementation using BytesMut, just so we can
// verify that we decode the same way.
pub fn decode_header_data(buf: &mut BytesMut) -> HeaderData { HeaderData { magic: buf.get_u16_le(), info: buf.get_u16_le(), tli: buf.get_u32_le(), pageaddr: buf.get_u64_le(), len: buf.get_u32_le(), } } pub fn decode2(reader: &mut R) -> HeaderData { HeaderData::des_from(reader).unwrap() } #[test] fn test1() { let raw1 = hex!("8940 7890 5534 7890 1289 5379 8378 7893 4207 8923 4712 3218"); let mut buf1 = BytesMut::from(&raw1[..]); let mut buf2 = &raw1[..]; let dec1 = decode_header_data(&mut buf1); let dec2 = decode2(&mut buf2); assert_eq!(dec1, dec2); assert_eq!(buf1, buf2); } ================================================ FILE: libs/vm_monitor/Cargo.toml ================================================ [package] name = "vm_monitor" version = "0.1.0" edition = "2024" license.workspace = true [[bin]] name = "vm-monitor" path = "./src/bin/monitor.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] anyhow.workspace = true axum.workspace = true clap.workspace = true futures.workspace = true serde.workspace = true serde_json.workspace = true sysinfo.workspace = true tokio = { workspace = true, features = ["rt-multi-thread"] } tokio-postgres.workspace = true tokio-util.workspace = true tracing.workspace = true tracing-subscriber.workspace = true [target.'cfg(target_os = "linux")'.dependencies] cgroups-rs = "0.3.3" ================================================ FILE: libs/vm_monitor/README.md ================================================ # `vm-monitor` The `vm-monitor` (or just monitor) is a core component of the autoscaling system, along with the `autoscale-scheduler` and the `autoscaler-agent`s. The monitor has two primary roles: 1) notifying agents when immediate upscaling is necessary due to memory conditions and 2) managing Postgres' file cache and a cgroup to carry out upscaling and downscaling decisions. 
## More on scaling

We scale CPU and memory using NeonVM, our in-house QEMU tool for use with Kubernetes.
To control thresholds for receiving memory usage notifications, we start Postgres
in the `neon-postgres` cgroup and set its `memory.{max,high}`.

* See also: [`neondatabase/autoscaling`](https://github.com/neondatabase/autoscaling/)
* See also: [`neondatabase/vm-monitor`](https://github.com/neondatabase/vm-monitor/),
where initial development of the monitor happened. The repository is no longer
maintained but the commit history may be useful for debugging.

## Structure

The `vm-monitor` is loosely comprised of a few systems. These are:
* the server: this is just a simple `axum` server that accepts requests and
upgrades them to websocket connections. The server only allows one connection at
a time. This means that upon receiving a new connection, the server will terminate
the old one if it exists.
* the filecache: a struct that allows communication with the Postgres file cache.
On startup, we connect to the filecache and hold on to the connection for the
entire monitor lifetime.
* the cgroup watcher: the `CgroupWatcher` polls the `neon-postgres` cgroup's memory
usage and sends rolling aggregates to the runner.
* the runner: the runner marries the filecache and cgroup watcher together,
communicating with the agent through the `Dispatcher`, and then calling filecache
and cgroup watcher functions as needed to upscale and downscale.

================================================
FILE: libs/vm_monitor/src/bin/monitor.rs
================================================
// We expose a standalone binary _and_ start the monitor in `compute_ctl` so that
// we can test the monitor as part of the entire autoscaling system in
// neondatabase/autoscaling.
//
// The monitor was previously started by vm-builder, and for testing purposes,
// we can mimic that setup with this binary.
#[cfg(target_os = "linux")] #[tokio::main] async fn main() -> anyhow::Result<()> { use clap::Parser; use tokio_util::sync::CancellationToken; use tracing_subscriber::EnvFilter; use vm_monitor::Args; let subscriber = tracing_subscriber::fmt::Subscriber::builder() .json() .with_file(true) .with_line_number(true) .with_span_list(true) .with_env_filter(EnvFilter::from_default_env()) .finish(); tracing::subscriber::set_global_default(subscriber)?; let args: &'static Args = Box::leak(Box::new(Args::parse())); let token = CancellationToken::new(); vm_monitor::start(args, token).await } #[cfg(not(target_os = "linux"))] fn main() { panic!("the monitor requires cgroups, which are only available on linux") } ================================================ FILE: libs/vm_monitor/src/cgroup.rs ================================================ use std::fmt::{self, Debug, Formatter}; use std::time::{Duration, Instant}; use anyhow::{Context, anyhow}; use cgroups_rs::Subsystem; use cgroups_rs::hierarchies::{self, is_cgroup2_unified_mode}; use cgroups_rs::memory::MemController; use tokio::sync::watch; use tracing::{info, warn}; /// Configuration for a `CgroupWatcher` #[derive(Debug, Clone)] pub struct Config { /// Interval at which we should be fetching memory statistics memory_poll_interval: Duration, /// The number of samples used in constructing aggregated memory statistics memory_history_len: usize, /// The number of most recent samples that will be periodically logged. /// /// Each sample is logged exactly once. Increasing this value means that recent samples will be /// logged less frequently, and vice versa. /// /// For simplicity, this value must be greater than or equal to `memory_history_len`. 
memory_history_log_interval: usize, /// The max number of iterations to skip before logging the next iteration memory_history_log_noskip_interval: Duration, } impl Default for Config { fn default() -> Self { Self { memory_poll_interval: Duration::from_millis(100), memory_history_len: 5, // use 500ms of history for decision-making memory_history_log_interval: 20, // but only log every ~2s (otherwise it's spammy) memory_history_log_noskip_interval: Duration::from_secs(15), // but only if it's changed, or 60 seconds have passed } } } /// Responds to `MonitorEvents` to manage the cgroup: preventing it from being /// OOM killed or throttling. /// /// The `CgroupWatcher` primarily achieves this by reading from a stream of /// `MonitorEvent`s. See `main_signals_loop` for details on how to keep the /// cgroup happy. #[derive(Debug)] pub struct CgroupWatcher { pub config: Config, /// The actual cgroup we are watching and managing. cgroup: cgroups_rs::Cgroup, } impl CgroupWatcher { /// Create a new `CgroupWatcher`. #[tracing::instrument(skip_all, fields(%name))] pub fn new(name: String) -> anyhow::Result { // TODO: clarify exactly why we need v2 // Make sure cgroups v2 (aka unified) are supported if !is_cgroup2_unified_mode() { anyhow::bail!("cgroups v2 not supported"); } let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name); Ok(Self { cgroup, config: Default::default(), }) } /// The entrypoint for the `CgroupWatcher`. #[tracing::instrument(skip_all)] pub async fn watch( &self, updates: watch::Sender<(Instant, MemoryHistory)>, ) -> anyhow::Result<()> { // this requirement makes the code a bit easier to work with; see the config for more. 
assert!(self.config.memory_history_len <= self.config.memory_history_log_interval); let mut ticker = tokio::time::interval(self.config.memory_poll_interval); ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); // ticker.reset_immediately(); // FIXME: enable this once updating to tokio >= 1.30.0 let mem_controller = self.memory()?; // buffer for samples that will be logged. once full, it remains so. let history_log_len = self.config.memory_history_log_interval; let max_skip = self.config.memory_history_log_noskip_interval; let mut history_log_buf = vec![MemoryStatus::zeroed(); history_log_len]; let mut last_logged_memusage = MemoryStatus::zeroed(); // Ensure that we're tracking a value that's definitely in the past, as Instant::now is only guaranteed to be non-decreasing on Rust's T1-supported systems. let mut can_skip_logs_until = Instant::now() - max_skip; for t in 0_u64.. { ticker.tick().await; let now = Instant::now(); let mem = Self::memory_usage(mem_controller); let i = t as usize % history_log_len; history_log_buf[i] = mem; // We're taking *at most* memory_history_len values; we may be bounded by the total // number of samples that have come in so far. let samples_count = (t + 1).min(self.config.memory_history_len as u64) as usize; // NB: in `ring_buf_recent_values_iter`, `i` is *inclusive*, which matches the fact // that we just inserted a value there, so the end of the iterator will *include* the // value at i, rather than stopping just short of it. let samples = ring_buf_recent_values_iter(&history_log_buf, i, samples_count); let summary = MemoryHistory { avg_non_reclaimable: samples.map(|h| h.non_reclaimable).sum::() / samples_count as u64, samples_count, samples_span: self.config.memory_poll_interval * (samples_count - 1) as u32, }; // Log the current history if it's time to do so. 
Because `history_log_buf` has length // equal to the logging interval, we can just log the entire buffer every time we set // the last entry, which also means that for this log line, we can ignore that it's a // ring buffer (because all the entries are in order of increasing time). // // We skip logging the data if data hasn't meaningfully changed in a while, unless // we've already ignored previous iterations for the last max_skip period. if i == history_log_len - 1 && (now > can_skip_logs_until || !history_log_buf .iter() .all(|usage| last_logged_memusage.status_is_close_or_similar(usage))) { info!( history = ?MemoryStatus::debug_slice(&history_log_buf), summary = ?summary, "Recent cgroup memory statistics history" ); can_skip_logs_until = now + max_skip; last_logged_memusage = *history_log_buf.last().unwrap(); } updates .send((now, summary)) .context("failed to send MemoryHistory")?; } unreachable!() } /// Get a handle on the memory subsystem. fn memory(&self) -> anyhow::Result<&MemController> { self.cgroup .subsystems() .iter() .find_map(|sub| match sub { Subsystem::Mem(c) => Some(c), _ => None, }) .ok_or_else(|| anyhow!("could not find memory subsystem")) } /// Given a handle on the memory subsystem, returns the current memory information fn memory_usage(mem_controller: &MemController) -> MemoryStatus { let stat = mem_controller.memory_stat().stat; MemoryStatus { non_reclaimable: stat.active_anon + stat.inactive_anon, } } } // Helper function for `CgroupWatcher::watch` fn ring_buf_recent_values_iter( buf: &[T], last_value_idx: usize, count: usize, ) -> impl '_ + Iterator { // Assertion carried over from `CgroupWatcher::watch`, to make the logic in this function // easier (we only have to add `buf.len()` once, rather than a dynamic number of times). 
assert!(count <= buf.len()); buf.iter() // 'cycle' because the values could wrap around .cycle() // with 'cycle', this skip is more like 'offset', and functionally this is // offsettting by 'last_value_idx - count (mod buf.len())', but we have to be // careful to avoid underflow, so we pre-add buf.len(). // The '+ 1' is because `last_value_idx` is inclusive, rather than exclusive. .skip((buf.len() + last_value_idx + 1 - count) % buf.len()) .take(count) } /// Summary of recent memory usage #[derive(Debug, Copy, Clone)] pub struct MemoryHistory { /// Rolling average of non-reclaimable memory usage samples over the last `history_period` pub avg_non_reclaimable: u64, /// The number of samples used to construct this summary pub samples_count: usize, /// Total timespan between the first and last sample used for this summary pub samples_span: Duration, } #[derive(Debug, Copy, Clone)] pub struct MemoryStatus { non_reclaimable: u64, } impl MemoryStatus { fn zeroed() -> Self { MemoryStatus { non_reclaimable: 0 } } fn debug_slice(slice: &[Self]) -> impl '_ + Debug { struct DS<'a>(&'a [MemoryStatus]); impl Debug for DS<'_> { fn fmt(&self, f: &mut Formatter) -> fmt::Result { f.debug_struct("[MemoryStatus]") .field( "non_reclaimable[..]", &Fields(self.0, |stat: &MemoryStatus| { BytesToGB(stat.non_reclaimable) }), ) .finish() } } struct Fields<'a, F>(&'a [MemoryStatus], F); impl T, T: Debug> Debug for Fields<'_, F> { fn fmt(&self, f: &mut Formatter) -> fmt::Result { f.debug_list().entries(self.0.iter().map(&self.1)).finish() } } struct BytesToGB(u64); impl Debug for BytesToGB { fn fmt(&self, f: &mut Formatter) -> fmt::Result { f.write_fmt(format_args!( "{:.3}Gi", self.0 as f64 / (1_u64 << 30) as f64 )) } } DS(slice) } /// Check if the other memory status is a close or similar result. /// Returns true if the larger value is not larger than the smaller value /// by 1/8 of the smaller value, and within 128MiB. 
/// See tests::check_similarity_behaviour for examples of behaviour fn status_is_close_or_similar(&self, other: &MemoryStatus) -> bool { let margin; let diff; if self.non_reclaimable >= other.non_reclaimable { margin = other.non_reclaimable / 8; diff = self.non_reclaimable - other.non_reclaimable; } else { margin = self.non_reclaimable / 8; diff = other.non_reclaimable - self.non_reclaimable; } diff < margin && diff < 128 * 1024 * 1024 } } #[cfg(test)] mod tests { #[test] fn ring_buf_iter() { let buf = vec![0_i32, 1, 2, 3, 4, 5, 6, 7, 8, 9]; let values = |offset, count| { super::ring_buf_recent_values_iter(&buf, offset, count) .copied() .collect::>() }; // Boundary conditions: start, end, and entire thing: assert_eq!(values(0, 1), [0]); assert_eq!(values(3, 4), [0, 1, 2, 3]); assert_eq!(values(9, 4), [6, 7, 8, 9]); assert_eq!(values(9, 10), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); // "normal" operation: no wraparound assert_eq!(values(7, 4), [4, 5, 6, 7]); // wraparound: assert_eq!(values(0, 4), [7, 8, 9, 0]); assert_eq!(values(1, 4), [8, 9, 0, 1]); assert_eq!(values(2, 4), [9, 0, 1, 2]); assert_eq!(values(2, 10), [3, 4, 5, 6, 7, 8, 9, 0, 1, 2]); } #[test] fn check_similarity_behaviour() { // This all accesses private methods, so we can't actually run this // as doctests, because doctests run as an external crate. 
let mut small = super::MemoryStatus { non_reclaimable: 1024, }; let mut large = super::MemoryStatus { non_reclaimable: 1024 * 1024 * 1024 * 1024, }; // objects are self-similar, no matter the size assert!(small.status_is_close_or_similar(&small)); assert!(large.status_is_close_or_similar(&large)); // inequality is symmetric assert!(!small.status_is_close_or_similar(&large)); assert!(!large.status_is_close_or_similar(&small)); small.non_reclaimable = 64; large.non_reclaimable = (small.non_reclaimable / 8) * 9; // objects are self-similar, no matter the size assert!(small.status_is_close_or_similar(&small)); assert!(large.status_is_close_or_similar(&large)); // values are similar if the larger value is larger by less than // 12.5%, i.e. 1/8 of the smaller value. // In the example above, large is exactly 12.5% larger, so this doesn't // match. assert!(!small.status_is_close_or_similar(&large)); assert!(!large.status_is_close_or_similar(&small)); large.non_reclaimable -= 1; assert!(large.status_is_close_or_similar(&large)); assert!(small.status_is_close_or_similar(&large)); assert!(large.status_is_close_or_similar(&small)); // The 1/8 rule only applies up to 128MiB of difference small.non_reclaimable = 1024 * 1024 * 1024 * 1024; large.non_reclaimable = small.non_reclaimable / 8 * 9; assert!(small.status_is_close_or_similar(&small)); assert!(large.status_is_close_or_similar(&large)); assert!(!small.status_is_close_or_similar(&large)); assert!(!large.status_is_close_or_similar(&small)); // the large value is put just above the threshold large.non_reclaimable = small.non_reclaimable + 128 * 1024 * 1024; assert!(large.status_is_close_or_similar(&large)); assert!(!small.status_is_close_or_similar(&large)); assert!(!large.status_is_close_or_similar(&small)); // now below large.non_reclaimable -= 1; assert!(large.status_is_close_or_similar(&large)); assert!(small.status_is_close_or_similar(&large)); assert!(large.status_is_close_or_similar(&small)); } } 
================================================ FILE: libs/vm_monitor/src/dispatcher.rs ================================================ //! Managing the websocket connection and other signals in the monitor. //! //! Contains types that manage the interaction (not data interchange, see `protocol`) //! between agent and monitor, allowing us to to process and send messages in a //! straightforward way. The dispatcher also manages that signals that come from //! the cgroup (requesting upscale), and the signals that go to the cgroup //! (notifying it of upscale). use anyhow::{Context, bail}; use axum::extract::ws::{Message, Utf8Bytes, WebSocket}; use futures::stream::{SplitSink, SplitStream}; use futures::{SinkExt, StreamExt}; use tracing::{debug, info}; use crate::protocol::{ OutboundMsg, OutboundMsgKind, PROTOCOL_MAX_VERSION, PROTOCOL_MIN_VERSION, ProtocolRange, ProtocolResponse, ProtocolVersion, }; /// The central handler for all communications in the monitor. /// /// The dispatcher has two purposes: /// 1. Manage the connection to the agent, sending and receiving messages. /// 2. Communicate with the cgroup manager, notifying it when upscale is received, /// and sending a message to the agent when the cgroup manager requests /// upscale. #[derive(Debug)] pub struct Dispatcher { /// We read agent messages of of `source` pub(crate) source: SplitStream, /// We send messages to the agent through `sink` sink: SplitSink, /// The protocol version we have agreed to use with the agent. This is negotiated /// during the creation of the dispatcher, and should be the highest shared protocol /// version. /// // NOTE: currently unused, but will almost certainly be used in the futures // as the protocol changes #[allow(unused)] pub(crate) proto_version: ProtocolVersion, } impl Dispatcher { /// Creates a new dispatcher using the passed-in connection. /// /// Performs a negotiation with the agent to determine the highest protocol /// version that both support. 
This consists of two steps: /// 1. Wait for the agent to sent the range of protocols it supports. /// 2. Send a protocol version that works for us as well, or an error if there /// is no compatible version. pub async fn new(stream: WebSocket) -> anyhow::Result { let (mut sink, mut source) = stream.split(); // Figure out the highest protocol version we both support info!("waiting for agent to send protocol version range"); let Some(message) = source.next().await else { bail!("websocket connection closed while performing protocol handshake") }; let message = message.context("failed to read protocol version range off connection")?; let Message::Text(message_text) = message else { // All messages should be in text form, since we don't do any // pinging/ponging. See nhooyr/websocket's implementation and the // agent for more info bail!("received non-text message during proocol handshake: {message:?}") }; let monitor_range = ProtocolRange { min: PROTOCOL_MIN_VERSION, max: PROTOCOL_MAX_VERSION, }; let agent_range: ProtocolRange = serde_json::from_str(&message_text) .context("failed to deserialize protocol version range")?; info!(range = ?agent_range, "received protocol version range"); let highest_shared_version = match monitor_range.highest_shared_version(&agent_range) { Ok(version) => { sink.send(Message::Text(Utf8Bytes::from( serde_json::to_string(&ProtocolResponse::Version(version)).unwrap(), ))) .await .context("failed to notify agent of negotiated protocol version")?; version } Err(e) => { sink.send(Message::Text(Utf8Bytes::from( serde_json::to_string(&ProtocolResponse::Error(format!( "Received protocol version range {agent_range} which does not overlap with {monitor_range}" ))) .unwrap(), ))) .await .context("failed to notify agent of no overlap between protocol version ranges")?; Err(e).context("error determining suitable protocol version range")? } }; Ok(Self { sink, source, proto_version: highest_shared_version, }) } /// Send a message to the agent. 
/// /// Although this function is small, it has one major benefit: it is the only /// way to send data accross the connection, and you can only pass in a proper /// `MonitorMessage`. Without safeguards like this, it's easy to accidentally /// serialize the wrong thing and send it, since `self.sink.send` will take /// any string. pub async fn send(&mut self, message: OutboundMsg) -> anyhow::Result<()> { if matches!(&message.inner, OutboundMsgKind::HealthCheck { .. }) { debug!(?message, "sending message"); } else { info!(?message, "sending message"); } let json = serde_json::to_string(&message).context("failed to serialize message")?; self.sink .send(Message::Text(Utf8Bytes::from(json))) .await .context("stream error sending message") } } ================================================ FILE: libs/vm_monitor/src/filecache.rs ================================================ //! Logic for configuring and scaling the Postgres file cache. use std::num::NonZeroU64; use anyhow::{Context, anyhow}; use tokio_postgres::types::ToSql; use tokio_postgres::{Client, NoTls, Row}; use tokio_util::sync::CancellationToken; use tracing::{error, info}; use crate::MiB; /// Manages Postgres' file cache by keeping a connection open. #[derive(Debug)] pub struct FileCacheState { client: Client, conn_str: String, pub(crate) config: FileCacheConfig, /// A token for cancelling spawned threads during shutdown. token: CancellationToken, } #[derive(Debug)] pub struct FileCacheConfig { /// The size of the file cache, in terms of the size of the resource it consumes /// (currently: only memory) /// /// For example, setting `resource_multipler = 0.75` gives the cache a target size of 75% of total /// resources. /// /// This value must be strictly between 0 and 1. resource_multiplier: f64, /// The required minimum amount of memory, in bytes, that must remain available /// after subtracting the file cache. /// /// This value must be non-zero. 
min_remaining_after_cache: NonZeroU64, /// Controls the rate of increase in the file cache's size as it grows from zero /// (when total resources equals min_remaining_after_cache) to the desired size based on /// `resource_multiplier`. /// /// A `spread_factor` of zero means that all additional resources will go to the cache until it /// reaches the desired size. Setting `spread_factor` to N roughly means "for every 1 byte added to /// the cache's size, N bytes are reserved for the rest of the system, until the cache gets to /// its desired size". /// /// This value must be >= 0, and must retain an increase that is more than what would be given by /// `resource_multiplier`. For example, setting `resource_multiplier` = 0.75 but `spread_factor` = 1 /// would be invalid, because `spread_factor` would induce only 50% usage - never reaching the 75% /// as desired by `resource_multiplier`. /// /// `spread_factor` is too large if `(spread_factor + 1) * resource_multiplier >= 1`. spread_factor: f64, } impl Default for FileCacheConfig { fn default() -> Self { Self { resource_multiplier: 0.75, // 256 MiB - lower than when in memory because overcommitting is safe; if we don't have // memory, the kernel will just evict from its page cache, rather than e.g. killing // everything. min_remaining_after_cache: NonZeroU64::new(256 * MiB).unwrap(), spread_factor: 0.1, } } } impl FileCacheConfig { /// Make sure fields of the config are consistent. pub fn validate(&self) -> anyhow::Result<()> { // Single field validity anyhow::ensure!( 0.0 < self.resource_multiplier && self.resource_multiplier < 1.0, "resource_multiplier must be between 0.0 and 1.0 exclusive, got {}", self.resource_multiplier ); anyhow::ensure!( self.spread_factor >= 0.0, "spread_factor must be >= 0, got {}", self.spread_factor ); // Check that `resource_multiplier` and `spread_factor` are valid w.r.t. each other. 
// // As shown in `calculate_cache_size`, we have two lines resulting from `resource_multiplier` and // `spread_factor`, respectively. They are: // // `total` `min_remaining_after_cache` // size = ————————————————————— - ————————————————————————————— // `spread_factor` + 1 `spread_factor` + 1 // // and // // size = `resource_multiplier` × total // // .. where `total` is the total resources. These are isomorphic to the typical 'y = mx + b' // form, with y = "size" and x = "total". // // These lines intersect at: // // `min_remaining_after_cache` // ——————————————————————————————————————————————————— // 1 - `resource_multiplier` × (`spread_factor` + 1) // // We want to ensure that this value (a) exists, and (b) is >= `min_remaining_after_cache`. This is // guaranteed when '`resource_multiplier` × (`spread_factor` + 1)' is less than 1. // (We also need it to be >= 0, but that's already guaranteed.) let intersect_factor = self.resource_multiplier * (self.spread_factor + 1.0); anyhow::ensure!( intersect_factor < 1.0, "incompatible resource_multipler and spread_factor" ); Ok(()) } /// Calculate the desired size of the cache, given the total memory pub fn calculate_cache_size(&self, total: u64) -> u64 { // *Note*: all units are in bytes, until the very last line. let available = total.saturating_sub(self.min_remaining_after_cache.get()); if available == 0 { return 0; } // Conversions to ensure we don't overflow from floating-point ops let size_from_spread = i64::max(0, (available as f64 / (1.0 + self.spread_factor)) as i64) as u64; let size_from_normal = (total as f64 * self.resource_multiplier) as u64; let byte_size = u64::min(size_from_spread, size_from_normal); // The file cache operates in units of mebibytes, so the sizes we produce should // be rounded to a mebibyte. We round down to be conservative. byte_size / MiB * MiB } } impl FileCacheState { /// Connect to the file cache. 
#[tracing::instrument(skip_all, fields(%conn_str, ?config))] pub async fn new( conn_str: &str, config: FileCacheConfig, token: CancellationToken, ) -> anyhow::Result { config.validate().context("file cache config is invalid")?; info!(conn_str, "connecting to Postgres file cache"); let client = FileCacheState::connect(conn_str, token.clone()) .await .context("failed to connect to postgres file cache")?; let conn_str = conn_str.to_string(); Ok(Self { client, config, conn_str, token, }) } /// Connect to Postgres. /// /// Aborts the spawned thread if the kill signal is received. This is not /// a method as it is called in [`FileCacheState::new`]. #[tracing::instrument(skip_all, fields(%conn_str))] async fn connect(conn_str: &str, token: CancellationToken) -> anyhow::Result { let (client, conn) = tokio_postgres::connect(conn_str, NoTls) .await .context("failed to connect to pg client")?; // The connection object performs the actual communication with the database, // so spawn it off to run on its own. See tokio-postgres docs. crate::spawn_with_cancel( token, |res| { if let Err(e) = res { error!(error = format_args!("{e:#}"), "postgres error"); } }, conn, ); Ok(client) } /// Execute a query with a retry if necessary. /// /// If the initial query fails, we restart the database connection and attempt /// if again. 
#[tracing::instrument(skip_all, fields(%statement))] pub async fn query_with_retry( &mut self, statement: &str, params: &[&(dyn ToSql + Sync)], ) -> anyhow::Result> { match self .client .query(statement, params) .await .context("failed to execute query") { Ok(rows) => Ok(rows), Err(e) => { error!(error = format_args!("{e:#}"), "postgres error -> retrying"); let client = FileCacheState::connect(&self.conn_str, self.token.clone()) .await .context("failed to connect to postgres file cache")?; info!("successfully reconnected to postgres client"); // Replace the old client and attempt the query with the new one self.client = client; self.client .query(statement, params) .await .context("failed to execute query a second time") } } } /// Get the current size of the file cache. #[tracing::instrument(skip_all)] pub async fn get_file_cache_size(&mut self) -> anyhow::Result { self.query_with_retry( // The file cache GUC variable is in MiB, but the conversion with // pg_size_bytes means that the end result we get is in bytes. "SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit'));", &[], ) .await .context("failed to query pg for file cache size")? .first() .ok_or_else(|| anyhow!("file cache size query returned no rows"))? // pg_size_bytes returns a bigint which is the same as an i64. .try_get::<_, i64>(0) // Since the size of the table is not negative, the cast is sound. .map(|bytes| bytes as u64) .context("failed to extract file cache size from query result") } /// Attempt to set the file cache size, returning the size it was actually /// set to. #[tracing::instrument(skip_all, fields(%num_bytes))] pub async fn set_file_cache_size(&mut self, num_bytes: u64) -> anyhow::Result { let max_bytes = self // The file cache GUC variable is in MiB, but the conversion with pg_size_bytes // means that the end result we get is in bytes. 
.query_with_retry( "SELECT pg_size_bytes(current_setting('neon.max_file_cache_size'));", &[], ) .await .context("failed to query pg for max file cache size")? .first() .ok_or_else(|| anyhow!("max file cache size query returned no rows"))? .try_get::<_, i64>(0) .map(|bytes| bytes as u64) .context("failed to extract max file cache size from query result")?; let max_mb = max_bytes / MiB; let num_mb = u64::min(num_bytes, max_bytes) / MiB; let capped = if num_bytes > max_bytes { " (capped by maximum size)" } else { "" }; info!( size = num_mb, max = max_mb, "updating file cache size {capped}", ); // note: even though the normal ways to get the cache size produce values with trailing "MB" // (hence why we call pg_size_bytes in `get_file_cache_size`'s query), the format // it expects to set the value is "integer number of MB" without trailing units. // For some reason, this *really* wasn't working with normal arguments, so that's // why we're constructing the query here. self.client .query( &format!("ALTER SYSTEM SET neon.file_cache_size_limit = {num_mb};"), &[], ) .await .context("failed to change file cache size limit")?; // must use pg_reload_conf to have the settings change take effect self.client .execute("SELECT pg_reload_conf();", &[]) .await .context("failed to reload config")?; Ok(num_mb * MiB) } } ================================================ FILE: libs/vm_monitor/src/lib.rs ================================================ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] #![cfg(target_os = "linux")] use std::fmt::Debug; use std::net::SocketAddr; use std::time::Duration; use anyhow::Context; use axum::Router; use axum::extract::ws::WebSocket; use axum::extract::{State, WebSocketUpgrade}; use axum::response::Response; use axum::routing::get; use clap::Parser; use futures::Future; use runner::Runner; use sysinfo::{RefreshKind, System, SystemExt}; use tokio::net::TcpListener; use tokio::sync::broadcast; use tokio::task::JoinHandle; use 
tokio_util::sync::CancellationToken; use tracing::{error, info}; // Code that interfaces with agent pub mod dispatcher; pub mod protocol; pub mod cgroup; pub mod filecache; pub mod runner; /// The vm-monitor is an autoscaling component started by compute_ctl. /// /// It carries out autoscaling decisions (upscaling/downscaling) and responds to /// memory pressure by making requests to the autoscaler-agent. #[derive(Debug, Parser)] pub struct Args { /// The name of the cgroup we should monitor for memory.high events. This /// is the cgroup that postgres should be running in. #[arg(short, long)] pub cgroup: Option, /// The connection string for the Postgres file cache we should manage. #[arg(short, long)] pub pgconnstr: Option, /// The address we should listen on for connection requests. For the /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369. #[arg(short, long)] pub addr: String, } impl Args { pub fn addr(&self) -> &str { &self.addr } } /// The number of bytes in one mebibyte. #[allow(non_upper_case_globals)] const MiB: u64 = 1 << 20; /// Convert a quantity in bytes to a quantity in mebibytes, generally for display /// purposes. (Most calculations in this crate use bytes directly) pub fn bytes_to_mebibytes(bytes: u64) -> f32 { (bytes as f32) / (MiB as f32) } pub fn get_total_system_memory() -> u64 { System::new_with_specifics(RefreshKind::new().with_memory()).total_memory() } /// Global app state for the Axum server #[derive(Debug, Clone)] pub struct ServerState { /// Used to close old connections. /// /// When a new connection is made, we send a message signalling to the old /// connection to close. pub sender: broadcast::Sender<()>, /// Used to cancel all spawned threads in the monitor. pub token: CancellationToken, // The CLI args pub args: &'static Args, } /// Spawn a thread that may get cancelled by the provided [`CancellationToken`]. 
/// /// This is mainly meant to be called with futures that will be pending for a very /// long time, or are not mean to return. If it is not desirable for the future to /// ever resolve, such as in the case of [`cgroup::CgroupWatcher::watch`], the error can /// be logged with `f`. pub fn spawn_with_cancel( token: CancellationToken, f: F, future: T, ) -> JoinHandle> where T: Future + Send + 'static, T::Output: Send + 'static, F: FnOnce(&T::Output) + Send + 'static, { tokio::spawn(async move { tokio::select! { _ = token.cancelled() => { info!("received global kill signal"); None } res = future => { f(&res); Some(res) } } }) } /// The entrypoint to the binary. /// /// Set up tracing, parse arguments, and start an http server. pub async fn start(args: &'static Args, token: CancellationToken) -> anyhow::Result<()> { // This channel is used to close old connections. When a new connection is // made, we send a message signalling to the old connection to close. let (sender, _) = tokio::sync::broadcast::channel::<()>(1); let app = Router::new() // This route gets upgraded to a websocket connection. We only support // one connection at a time, which we enforce by killing old connections // when we receive a new one. .route("/monitor", get(ws_handler)) .with_state(ServerState { sender, token, args, }); let addr_str = args.addr(); let addr: SocketAddr = addr_str.parse().expect("parsing address should not fail"); let listener = TcpListener::bind(&addr) .await .with_context(|| format!("failed to bind to {addr}"))?; info!(addr_str, "server bound"); axum::serve(listener, app.into_make_service()) .await .context("server exited")?; Ok(()) } /// Handles incoming websocket connections. /// /// If we are already to connected to an agent, we kill that old connection /// and accept the new one. 
#[tracing::instrument(name = "/monitor", skip_all, fields(?args))] pub async fn ws_handler( ws: WebSocketUpgrade, State(ServerState { sender, token, args, }): State, ) -> Response { // Kill the old monitor info!("closing old connection if there is one"); let _ = sender.send(()); // Start the new one. Wow, the cycle of death and rebirth let closer = sender.subscribe(); ws.on_upgrade(|ws| start_monitor(ws, args, closer, token)) } /// Starts the monitor. If startup fails or the monitor exits, an error will /// be logged and our internal state will be reset to allow for new connections. #[tracing::instrument(skip_all)] async fn start_monitor( ws: WebSocket, args: &Args, kill: broadcast::Receiver<()>, token: CancellationToken, ) { info!( ?args, "accepted new websocket connection -> starting monitor" ); let timeout = Duration::from_secs(4); let monitor = tokio::time::timeout( timeout, Runner::new(Default::default(), args, ws, kill, token), ) .await; let mut monitor = match monitor { Ok(Ok(monitor)) => monitor, Ok(Err(e)) => { error!(error = format_args!("{e:#}"), "failed to create monitor"); return; } Err(_) => { error!(?timeout, "creating monitor timed out"); return; } }; info!("connected to agent"); match monitor.run().await { Ok(()) => info!("monitor was killed due to new connection"), Err(e) => error!( error = format_args!("{e:#}"), "monitor terminated unexpectedly" ), } } ================================================ FILE: libs/vm_monitor/src/protocol.rs ================================================ //! Types representing protocols and actual agent-monitor messages. //! //! The pervasive use of serde modifiers throughout this module is to ease //! serialization on the go side. Because go does not have enums (which model //! messages well), it is harder to model messages, and we accomodate that with //! serde. //! //! *Note*: the agent sends and receives messages in different ways. //! //! The agent serializes messages in the form and then sends them. 
The use //! of `#[serde(tag = "type", content = "content")]` allows us to use `Type` //! to determine how to deserialize `Content`. //! ```ignore //! struct { //! Content any //! Type string //! Id uint64 //! } //! ``` //! and receives messages in the form: //! ```ignore //! struct { //! {fields embedded} //! Type string //! Id uint64 //! } //! ``` //! After reading the type field, the agent will decode the entire message //! again, this time into the correct type using the embedded fields. //! Because the agent cannot just extract the json contained in a certain field //! (it initially deserializes to `map[string]interface{}`), we keep the fields //! at the top level, so the entire piece of json can be deserialized into a struct, //! such as a `DownscaleResult`, with the `Type` and `Id` fields ignored. use core::fmt; use std::cmp; use serde::de::Error; use serde::{Deserialize, Serialize}; /// A Message we send to the agent. #[derive(Serialize, Deserialize, Debug, Clone)] pub struct OutboundMsg { #[serde(flatten)] pub(crate) inner: OutboundMsgKind, pub(crate) id: usize, } impl OutboundMsg { pub fn new(inner: OutboundMsgKind, id: usize) -> Self { Self { inner, id } } } /// The different underlying message types we can send to the agent. #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(tag = "type")] pub enum OutboundMsgKind { /// Indicates that the agent sent an invalid message, i.e, we couldn't /// properly deserialize it. InvalidMessage { error: String }, /// Indicates that we experienced an internal error while processing a message. /// For example, if a cgroup operation fails while trying to handle an upscale, /// we return `InternalError`. InternalError { error: String }, /// Returned to the agent once we have finished handling an upscale. If the /// handling was unsuccessful, an `InternalError` will get returned instead. 
/// *Note*: this is a struct variant because of the way go serializes struct{} UpscaleConfirmation {}, /// Indicates to the monitor that we are urgently requesting resources. /// *Note*: this is a struct variant because of the way go serializes struct{} UpscaleRequest {}, /// Returned to the agent once we have finished attempting to downscale. If /// an error occured trying to do so, an `InternalError` will get returned instead. /// However, if we are simply unsuccessful (for example, do to needing the resources), /// that gets included in the `DownscaleResult`. DownscaleResult { // FIXME for the future (once the informant is deprecated) // As of the time of writing, the agent/informant version of this struct is // called api.DownscaleResult. This struct has uppercase fields which are // serialized as such. Thus, we serialize using uppercase names so we don't // have to make a breaking change to the agent<->informant protocol. Once // the informant has been superseded by the monitor, we can add the correct // struct tags to api.DownscaleResult without causing a breaking change, // since we don't need to support the agent<->informant protocol anymore. #[serde(rename = "Ok")] ok: bool, #[serde(rename = "Status")] status: String, }, /// Part of the bidirectional heartbeat. The heartbeat is initiated by the /// agent. /// *Note*: this is a struct variant because of the way go serializes struct{} HealthCheck {}, } /// A message received form the agent. #[derive(Serialize, Deserialize, Debug, Clone)] pub struct InboundMsg { #[serde(flatten)] pub(crate) inner: InboundMsgKind, pub(crate) id: usize, } /// The different underlying message types we can receive from the agent. #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(tag = "type", content = "content")] pub enum InboundMsgKind { /// Indicates that the we sent an invalid message, i.e, we couldn't /// properly deserialize it. 
InvalidMessage { error: String },
    /// Indicates that the agent experienced an internal error while processing
    /// a message.
    InternalError { error: String },
    /// Indicates to us that we have been granted more resources. We should respond
    /// with an `UpscaleConfirmation` when done handling the resources (increasing
    /// file cache size, cgroup memory limits).
    UpscaleNotification { granted: Resources },
    /// A request to reduce resource usage. We should respond with a `DownscaleResult`
    /// when done.
    DownscaleRequest { target: Resources },
    /// Part of the bidirectional heartbeat. The heartbeat is initiated by the
    /// agent.
    /// *Note*: this is a struct variant because of the way go serializes struct{}
    HealthCheck {},
}

/// Represents the resources granted to a VM.
#[derive(Serialize, Deserialize, Debug, Clone, Copy)]
// Renamed because the agent has multiple resources types:
// `Resources` (milliCPU/memory slots)
// `Allocation` (vCPU/bytes) <- what we correspond to
#[serde(rename(serialize = "Allocation", deserialize = "Allocation"))]
pub struct Resources {
    /// Number of vCPUs
    pub(crate) cpu: f64,
    /// Bytes of memory
    pub(crate) mem: u64,
}

impl Resources {
    /// Builds a `Resources` from a vCPU count and a memory size in bytes.
    pub fn new(cpu: f64, mem: u64) -> Self {
        Self { cpu, mem }
    }
}

pub const PROTOCOL_MIN_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
pub const PROTOCOL_MAX_VERSION: ProtocolVersion = ProtocolVersion::V1_0;

/// A version of the agent<->monitor protocol, ordered by its raw number.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Ord, Eq, Serialize, Deserialize)]
pub struct ProtocolVersion(u8);

impl ProtocolVersion {
    /// Represents v1.0 of the agent<-> monitor protocol - the initial version
    ///
    /// Currently the latest version.
    const V1_0: ProtocolVersion = ProtocolVersion(1);
}

impl fmt::Display for ProtocolVersion {
    /// Writes a human-readable version name.
    ///
    /// Version 0 and versions this build does not know are rendered with an
    /// explicit placeholder that includes the raw number, so that messages
    /// built from a version (e.g. the range validation error below) never
    /// contain an empty value.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            ProtocolVersion(0) => f.write_str("<invalid: zero>"),
            ProtocolVersion::V1_0 => f.write_str("v1.0"),
            ProtocolVersion(other) => write!(f, "<unknown: v{other}>"),
        }
    }
}

/// A set of protocol bounds that determines what we are speaking.
///
/// These bounds are inclusive.
#[derive(Debug)]
pub struct ProtocolRange {
    pub min: ProtocolVersion,
    pub max: ProtocolVersion,
}

// Use a custom deserialize impl to ensure that `self.min <= self.max`
impl<'de> Deserialize<'de> for ProtocolRange {
    /// Deserializes `{ min, max }` and rejects ranges where `min > max`.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // Plain mirror of `ProtocolRange` that gets the derived impl; the
        // bounds check is layered on top of it.
        #[derive(Deserialize)]
        struct InnerProtocolRange {
            min: ProtocolVersion,
            max: ProtocolVersion,
        }
        let InnerProtocolRange { min, max } = InnerProtocolRange::deserialize(deserializer)?;
        if min > max {
            Err(D::Error::custom(format!(
                "min version = {min} is greater than max version = {max}",
            )))
        } else {
            Ok(ProtocolRange { min, max })
        }
    }
}

impl fmt::Display for ProtocolRange {
    /// Writes a single version when both bounds coincide, `min to max` otherwise.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.min == self.max {
            write!(f, "{}", self.max)
        } else {
            write!(f, "{} to {}", self.min, self.max)
        }
    }
}

impl ProtocolRange {
    /// Find the highest shared version between two `ProtocolRange`'s
    ///
    /// Returns an error when the two ranges do not overlap.
    pub fn highest_shared_version(&self, other: &Self) -> anyhow::Result<ProtocolVersion> {
        // We first have to make sure the ranges are overlapping. Once we know
        // this, the highest version both sides support is the smaller of the
        // two maxes.
        if self.min > other.max {
            anyhow::bail!(
                "Non-overlapping bounds: other.max = {} was less than self.min = {}",
                other.max,
                self.min,
            )
        } else if self.max < other.min {
            anyhow::bail!(
                "Non-overlapping bounds: self.max = {} was less than other.min = {}",
                self.max,
                other.min
            )
        } else {
            Ok(cmp::min(self.max, other.max))
        }
    }
}

/// The outcome of protocol negotiation that we send to the other side: either
/// the agreed version or an error description.
#[derive(Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub enum ProtocolResponse {
    Error(String),
    Version(ProtocolVersion),
}

================================================
FILE: libs/vm_monitor/src/runner.rs
================================================
//! Exposes the `Runner`, which handles messages received from agent and
//! sends upscale requests.
//!
//! This is the "Monitor" part of the monitor binary and is the main entrypoint for
//! all functionality.

use std::fmt::Debug;
use std::time::{Duration, Instant};

use anyhow::{Context, bail};
use axum::extract::ws::{Message, WebSocket};
use futures::StreamExt;
use tokio::sync::{broadcast, watch};
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, warn};

use crate::cgroup::{self, CgroupWatcher};
use crate::dispatcher::Dispatcher;
use crate::filecache::{FileCacheConfig, FileCacheState};
use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
use crate::{Args, MiB, bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel};

/// Central struct that interacts with agent, dispatcher, and cgroup to handle
/// signals from the agent.
#[derive(Debug)]
pub struct Runner {
    config: Config,
    filecache: Option<FileCacheState>,
    cgroup: Option<CgroupState>,
    dispatcher: Dispatcher,

    /// We "mint" new message ids by incrementing this counter and taking the value.
    ///
    /// **Note**: This counter is always odd, so that we avoid collisions between the IDs generated
    /// by us vs the autoscaler-agent.
counter: usize, last_upscale_request_at: Option, /// A signal to kill the main thread produced by `self.run()`. This is triggered /// when the server receives a new connection. When the thread receives the /// signal off this channel, it will gracefully shutdown. kill: broadcast::Receiver<()>, } #[derive(Debug)] struct CgroupState { watcher: watch::Receiver<(Instant, cgroup::MemoryHistory)>, /// If [`cgroup::MemoryHistory::avg_non_reclaimable`] exceeds `threshold`, we send upscale /// requests. threshold: u64, } /// Configuration for a `Runner` #[derive(Debug)] pub struct Config { /// `sys_buffer_bytes` gives the estimated amount of memory, in bytes, that the kernel uses before /// handing out the rest to userspace. This value is the estimated difference between the /// *actual* physical memory and the amount reported by `grep MemTotal /proc/meminfo`. /// /// For more information, refer to `man 5 proc`, which defines MemTotal as "Total usable RAM /// (i.e., physical RAM minus a few reserved bits and the kernel binary code)". /// /// We only use `sys_buffer_bytes` when calculating the system memory from the *external* memory /// size, rather than the self-reported memory size, according to the kernel. /// /// TODO: this field is only necessary while we still have to trust the autoscaler-agent's /// upscale resource amounts (because we might not *actually* have been upscaled yet). This field /// should be removed once we have a better solution there. sys_buffer_bytes: u64, /// Minimum fraction of total system memory reserved *before* the cgroup threshold; in /// other words, providing a ceiling for the highest value of the threshold by enforcing that /// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the /// threshold. /// /// For example, a value of `0.1` means that 10% of total memory must remain after exceeding /// the threshold, so the value of the cgroup threshold would always be capped at 90% of total /// memory. 
/// /// The default value of `0.15` means that we *guarantee* sending upscale requests if the /// cgroup is using more than 85% of total memory. cgroup_min_overhead_fraction: f64, cgroup_downscale_threshold_buffer_bytes: u64, } impl Default for Config { fn default() -> Self { Self { sys_buffer_bytes: 100 * MiB, cgroup_min_overhead_fraction: 0.15, cgroup_downscale_threshold_buffer_bytes: 100 * MiB, } } } impl Config { fn cgroup_threshold(&self, total_mem: u64) -> u64 { // We want our threshold to be met gracefully instead of letting postgres get OOM-killed // (or if there's room, spilling to swap). // So we guarantee that there's at least `cgroup_min_overhead_fraction` of total memory // remaining above the threshold. (total_mem as f64 * (1.0 - self.cgroup_min_overhead_fraction)) as u64 } } impl Runner { /// Create a new monitor. #[tracing::instrument(skip_all, fields(?config, ?args))] pub async fn new( config: Config, args: &Args, ws: WebSocket, kill: broadcast::Receiver<()>, token: CancellationToken, ) -> anyhow::Result { anyhow::ensure!( config.sys_buffer_bytes != 0, "invalid monitor Config: sys_buffer_bytes cannot be 0" ); let dispatcher = Dispatcher::new(ws) .await .context("error creating new dispatcher")?; let mut state = Runner { config, filecache: None, cgroup: None, dispatcher, counter: 1, // NB: must be odd, see the comment about the field for more. 
last_upscale_request_at: None,
            kill,
        };

        let mem = get_total_system_memory();

        if let Some(connstr) = &args.pgconnstr {
            info!("initializing file cache");
            let config = FileCacheConfig::default();

            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
                .await
                .context("failed to create file cache")?;

            let size = file_cache
                .get_file_cache_size()
                .await
                .context("error getting file cache size")?;

            let new_size = file_cache.config.calculate_cache_size(mem);
            info!(
                initial = bytes_to_mebibytes(size),
                new = bytes_to_mebibytes(new_size),
                "setting initial file cache size",
            );

            // note: even if size == new_size, we want to explicitly set it, just
            // to make sure that we have the permissions to do so
            let actual_size = file_cache
                .set_file_cache_size(new_size)
                .await
                .context("failed to set file cache size, possibly due to inadequate permissions")?;
            if actual_size != new_size {
                info!("file cache size actually got set to {actual_size}")
            }

            state.filecache = Some(file_cache);
        }

        if let Some(name) = &args.cgroup {
            // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
            // now, and then set limits later.
            info!("initializing cgroup");

            let cgroup =
                CgroupWatcher::new(name.clone()).context("failed to create cgroup manager")?;

            let init_value = cgroup::MemoryHistory {
                avg_non_reclaimable: 0,
                samples_count: 0,
                samples_span: Duration::ZERO,
            };
            let (hist_tx, hist_rx) = watch::channel((Instant::now(), init_value));

            spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
                cgroup.watch(hist_tx).await
            });

            let threshold = state.config.cgroup_threshold(mem);
            info!(threshold, "set initial cgroup threshold",);

            state.cgroup = Some(CgroupState {
                watcher: hist_rx,
                threshold,
            });
        }

        Ok(state)
    }

    /// Try to shrink the file cache and cgroup threshold to fit `target`.
    ///
    /// Returns `(true, status)` when the downscale was applied (or nothing is
    /// managed), `(false, status)` when it was denied, and an error when the
    /// decision could not be made or the file cache could not be resized.
    #[tracing::instrument(skip_all, fields(?target))]
    pub async fn try_downscale(&mut self, target: Resources) -> anyhow::Result<(bool, String)> {
        // Nothing to adjust
        if self.cgroup.is_none() && self.filecache.is_none() {
            info!("no action needed for downscale (no cgroup or file cache enabled)");
            return Ok((
                true,
                "monitor is not managing cgroup or file cache".to_string(),
            ));
        }

        // Everything below is sized from the requested memory minus the
        // kernel's share.
        let usable_system_memory = target.mem.saturating_sub(self.config.sys_buffer_bytes);
        let new_threshold = self.config.cgroup_threshold(usable_system_memory);
        let expected_file_cache_size = match &self.filecache {
            Some(file_cache) => file_cache.config.calculate_cache_size(usable_system_memory),
            None => 0,
        };

        if let Some(cgroup) = &self.cgroup {
            let (last_time, last_history) = *cgroup.watcher.borrow();

            // NB: The ordering of these conditions is intentional. During startup, we should deny
            // downscaling until we have enough information to determine that it's safe to do so
            // (i.e. enough samples have come in). But if it's been a while and we *still* haven't
            // received any information, we should *fail* instead of just denying downscaling.
            //
            // `last_time` is set to `Instant::now()` on startup, so checking `last_time.elapsed()`
            // serves double-duty: it trips if we haven't received *any* metrics for long enough,
            // OR if we haven't received metrics *recently enough*.
            //
            // TODO: make the duration here configurable.
            if last_time.elapsed() > Duration::from_secs(5) {
                bail!(
                    "haven't gotten cgroup memory stats recently enough to determine downscaling information"
                );
            }
            if last_history.samples_count <= 1 {
                let status = "haven't received enough cgroup memory stats yet";
                info!(status, "discontinuing downscale");
                return Ok((false, status.to_owned()));
            }

            // Deny the downscale if current usage plus the safety buffer
            // would not fit under the new threshold.
            let current = last_history.avg_non_reclaimable;
            let buffer = self.config.cgroup_downscale_threshold_buffer_bytes;
            if new_threshold < current + buffer {
                let status = format!(
                    "{}: {} MiB (new threshold) < {} (current usage) + {} (downscale buffer)",
                    "calculated memory threshold too low",
                    bytes_to_mebibytes(new_threshold),
                    bytes_to_mebibytes(current),
                    bytes_to_mebibytes(buffer)
                );
                info!(status, "discontinuing downscale");
                return Ok((false, status));
            }
        }

        // The downscaling has been approved. Downscale the file cache, then the cgroup.
        let mut applied = Vec::new();

        if let Some(file_cache) = &mut self.filecache {
            let actual_usage = file_cache
                .set_file_cache_size(expected_file_cache_size)
                .await
                .context("failed to set file cache size")?;
            let message = format!(
                "set file cache size to {} MiB",
                bytes_to_mebibytes(actual_usage),
            );
            info!("downscale: {message}");
            applied.push(message);
        }

        if let Some(cgroup) = &mut self.cgroup {
            let message = format!(
                "set cgroup memory threshold from {} MiB to {} MiB, of new total {} MiB",
                bytes_to_mebibytes(cgroup.threshold),
                bytes_to_mebibytes(new_threshold),
                bytes_to_mebibytes(usable_system_memory)
            );
            cgroup.threshold = new_threshold;
            info!("downscale: {message}");
            applied.push(message);
        }

        // TODO: make this status thing less jank
        Ok((true, applied.join("; ")))
    }

    /// Handle new resources
    #[tracing::instrument(skip_all, fields(?resources))]
    pub async fn handle_upscale(&mut self, resources: Resources) -> anyhow::Result<()> {
        if self.filecache.is_none() && self.cgroup.is_none() {
            info!("no action needed for upscale (no cgroup or file cache enabled)");
            return Ok(());
        }

        let new_mem = resources.mem;
        let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes);

        if let Some(file_cache) = &mut self.filecache {
            let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
            info!(
                target = bytes_to_mebibytes(expected_usage),
                total = bytes_to_mebibytes(new_mem),
                "updating file cache size",
            );

            let actual_usage = file_cache
                .set_file_cache_size(expected_usage)
                .await
                .context("failed to set file cache size")?;

            if actual_usage != expected_usage {
                warn!(
                    "file cache was set to a different size that we wanted: target = {} Mib, actual= {} Mib",
                    bytes_to_mebibytes(expected_usage),
                    bytes_to_mebibytes(actual_usage)
                )
            }
        }

        if let Some(cgroup) = &mut self.cgroup {
            let new_threshold = self.config.cgroup_threshold(usable_system_memory);

            info!(
                "set cgroup memory
threshold from {} MiB to {} MiB of new total {} MiB", bytes_to_mebibytes(cgroup.threshold), bytes_to_mebibytes(new_threshold), bytes_to_mebibytes(usable_system_memory) ); cgroup.threshold = new_threshold; } Ok(()) } /// Take in a message and perform some action, such as downscaling or upscaling, /// and return a message to be send back. #[tracing::instrument(skip_all, fields(%id, message = ?inner))] pub async fn process_message( &mut self, InboundMsg { inner, id }: InboundMsg, ) -> anyhow::Result> { match inner { InboundMsgKind::UpscaleNotification { granted } => { self.handle_upscale(granted) .await .context("failed to handle upscale")?; Ok(Some(OutboundMsg::new( OutboundMsgKind::UpscaleConfirmation {}, id, ))) } InboundMsgKind::DownscaleRequest { target } => self .try_downscale(target) .await .context("failed to downscale") .map(|(ok, status)| { Some(OutboundMsg::new( OutboundMsgKind::DownscaleResult { ok, status }, id, )) }), InboundMsgKind::InvalidMessage { error } => { warn!( error = format_args!("{error:#}"), id, "received notification of an invalid message we sent" ); Ok(None) } InboundMsgKind::InternalError { error } => { warn!( error = format_args!("{error:#}"), id, "agent experienced an internal error" ); Ok(None) } InboundMsgKind::HealthCheck {} => { Ok(Some(OutboundMsg::new(OutboundMsgKind::HealthCheck {}, id))) } } } // TODO: don't propagate errors, probably just warn!? #[tracing::instrument(skip_all)] pub async fn run(&mut self) -> anyhow::Result<()> { info!("starting dispatcher"); loop { tokio::select! 
{ signal = self.kill.recv() => { match signal { Ok(()) => return Ok(()), Err(e) => bail!("failed to receive kill signal: {e}") } } // New memory stats from the cgroup, *may* need to request upscaling, if we've // exceeded the threshold result = self.cgroup.as_mut().unwrap().watcher.changed(), if self.cgroup.is_some() => { result.context("failed to receive from cgroup memory stats watcher")?; let cgroup = self.cgroup.as_ref().unwrap(); let (_time, cgroup_mem_stat) = *cgroup.watcher.borrow(); // If we haven't exceeded the threshold, then we're all ok if cgroup_mem_stat.avg_non_reclaimable < cgroup.threshold { continue; } // Otherwise, we generally want upscaling. But, if it's been less than 1 second // since the last time we requested upscaling, ignore the event, to avoid // spamming the agent. if let Some(t) = self.last_upscale_request_at { let elapsed = t.elapsed(); if elapsed < Duration::from_secs(1) { // *Ideally* we'd like to log here that we're ignoring the fact the // memory stats are too high, but in practice this can result in // spamming the logs with repetitive messages about ignoring the signal // // See https://github.com/neondatabase/neon/issues/5865 for more. continue; } } self.last_upscale_request_at = Some(Instant::now()); info!( avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable), threshold = bytes_to_mebibytes(cgroup.threshold), "cgroup memory stats are high enough to upscale, requesting upscale", ); self.counter += 2; // Increment, preserving parity (i.e. keep the // counter odd). See the field comment for more. self.dispatcher .send(OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter)) .await .context("failed to send message")?; }, // there is a message from the agent msg = self.dispatcher.source.next() => { if let Some(msg) = msg { match &msg { Ok(msg) => { let message: InboundMsg = match msg { Message::Text(text) => { serde_json::from_str(text).context("failed to deserialize text message")? 
} other => { warn!( // Don't use 'message' as a key as the // string also uses that for its key msg = ?other, "problem processing incoming message: agent should only send text messages but received different type" ); continue }, }; if matches!(&message.inner, InboundMsgKind::HealthCheck { .. }) { debug!(?msg, "received message"); } else { info!(?msg, "received message"); } let out = match self.process_message(message.clone()).await { Ok(Some(out)) => out, Ok(None) => continue, Err(e) => { // use {:#} for our logging because the display impl only // gives the outermost cause, and the debug impl // pretty-prints the error, whereas {:#} contains all the // causes, but is compact (no newlines). warn!(error = format_args!("{e:#}"), "error handling message"); OutboundMsg::new( OutboundMsgKind::InternalError { error: e.to_string(), }, message.id ) } }; self.dispatcher .send(out) .await .context("failed to send message")?; } Err(e) => warn!( error = format_args!("{e:#}"), msg = ?msg, "received error message" ), } } else { anyhow::bail!("dispatcher connection closed") } } } } } } ================================================ FILE: libs/wal_decoder/Cargo.toml ================================================ [package] name = "wal_decoder" version = "0.1.0" edition.workspace = true license.workspace = true [features] testing = ["pageserver_api/testing"] [dependencies] async-compression.workspace = true anyhow.workspace = true bytes.workspace = true pageserver_api.workspace = true prost.workspace = true postgres_ffi.workspace = true postgres_ffi_types.workspace = true serde.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["io-util"] } tracing.workspace = true utils.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } [build-dependencies] tonic-build.workspace = true [dev-dependencies] criterion.workspace = true camino.workspace = true camino-tempfile.workspace = true remote_storage.workspace = true 
tokio-util.workspace = true serde_json.workspace = true futures.workspace = true tikv-jemallocator.workspace = true pprof.workspace = true [[bench]] name = "bench_interpret_wal" harness = false ================================================ FILE: libs/wal_decoder/benches/README.md ================================================ ## WAL Decoding and Interpretation Benchmarks Note that these benchmarks pull WAL from a public bucket in S3 as a preparation step. Hence, you need a way to auth with AWS. You can achieve this by copying the `~/.aws/config` file from the AWS SSO notion page and exporting `AWS_PROFILE=dev` when invoking the benchmarks. To run benchmarks: ```sh aws sso login --profile dev # All benchmarks. AWS_PROFILE=dev cargo bench --package wal_decoder # Specific file. AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal # Specific benchmark. AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded # List available benchmarks. cargo bench --package wal_decoder --benches -- --list # Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. # Output in target/criterion/*/profile/flamegraph.svg. AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded -- --profile-time 10 ``` Additional charts and statistics are available in `target/criterion/report/index.html`. Benchmarks are automatically compared against the previous run. To compare against other runs, see `--baseline` and `--save-baseline`. 
================================================ FILE: libs/wal_decoder/benches/bench_interpret_wal.rs ================================================ use std::env; use std::num::NonZeroUsize; use std::sync::Arc; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use camino_tempfile::Utf8TempDir; use criterion::{Criterion, criterion_group, criterion_main}; use futures::StreamExt; use futures::stream::FuturesUnordered; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::{MAX_SEND_SIZE, PgMajorVersion, WAL_SEGMENT_SIZE}; use pprof::criterion::{Output, PProfProfiler}; use remote_storage::{ DownloadOpts, GenericRemoteStorage, ListingMode, RemoteStorageConfig, RemoteStorageKind, S3Config, }; use serde::Deserialize; use tokio_util::sync::CancellationToken; use utils::lsn::Lsn; use utils::shard::{ShardCount, ShardNumber}; use wal_decoder::models::InterpretedWalRecord; const S3_BUCKET: &str = "neon-github-public-dev"; const S3_REGION: &str = "eu-central-1"; const BUCKET_PREFIX: &str = "wal-snapshots/bulk-insert/"; const METADATA_FILENAME: &str = "metadata.json"; /// Use jemalloc, and configure it to sample allocations for profiles every 1 MB. /// This mirrors the configuration in bin/safekeeper.rs. 
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

#[allow(non_upper_case_globals)]
#[unsafe(export_name = "malloc_conf")]
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";

/// Builds a remote-storage client for the benchmark bucket and prefix.
async fn create_s3_client() -> anyhow::Result<Arc<GenericRemoteStorage>> {
    let remote_storage_config = RemoteStorageConfig {
        storage: RemoteStorageKind::AwsS3(S3Config {
            bucket_name: S3_BUCKET.to_string(),
            bucket_region: S3_REGION.to_string(),
            prefix_in_bucket: Some(BUCKET_PREFIX.to_string()),
            endpoint: None,
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
            max_keys_per_list_response: None,
            upload_storage_class: None,
        }),
        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
        small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
    };
    Ok(Arc::new(
        GenericRemoteStorage::from_config(&remote_storage_config)
            .await
            .context("remote storage init")?,
    ))
}

/// Downloads every object under the benchmark prefix into a fresh temporary
/// directory created inside the current working directory.
///
/// All downloads run concurrently; the first failure aborts the whole call.
/// The returned guard owns the directory.
async fn download_bench_data(
    client: Arc<GenericRemoteStorage>,
    cancel: &CancellationToken,
) -> anyhow::Result<Utf8TempDir> {
    let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into()?;
    let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent)?;

    eprintln!("Downloading benchmark data to {temp_dir:?}");

    let listing = client
        .list(None, ListingMode::NoDelimiter, None, cancel)
        .await?;

    let mut downloads = listing
        .keys
        .into_iter()
        .map(|obj| {
            let client = client.clone();
            let temp_dir_path = temp_dir.path().to_owned();

            async move {
                let remote_path = obj.key;
                let download = client
                    .download(&remote_path, &DownloadOpts::default(), cancel)
                    .await?;
                let mut body = tokio_util::io::StreamReader::new(download.download_stream);

                let file_name = remote_path.object_name().unwrap();
                let file_path = temp_dir_path.join(file_name);

                let file = tokio::fs::OpenOptions::new()
                    .create(true)
                    .truncate(true)
                    .write(true)
                    .open(&file_path)
                    .await?;

                let mut writer = tokio::io::BufWriter::new(file);
                tokio::io::copy_buf(&mut body, &mut writer).await?;

                Ok::<(), anyhow::Error>(())
            }
        })
        .collect::<FuturesUnordered<_>>();

    while let Some(download) = downloads.next().await {
        download?;
    }

    Ok(temp_dir)
}

/// Decompressed WAL bytes plus the metadata describing them.
struct BenchmarkData {
    wal: Vec<u8>,
    meta: BenchmarkMetadata,
}

/// Contents of `metadata.json` in the benchmark data set.
#[derive(Deserialize)]
struct BenchmarkMetadata {
    pg_version: PgMajorVersion,
    start_lsn: Lsn,
}

/// Loads up to `input_size` bytes of WAL from the files in `path`.
///
/// Every file other than `metadata.json` is treated as a zstd-compressed WAL
/// segment; segments are decompressed in path order and concatenated until
/// `input_size` bytes are available, then the buffer is truncated to exactly
/// that size.
///
/// Fails if a file cannot be read or decoded, or if the directory contains no
/// `metadata.json`.
async fn load_bench_data(path: &Utf8Path, input_size: usize) -> anyhow::Result<BenchmarkData> {
    eprintln!("Loading benchmark data from {path:?}");

    let mut entries = tokio::fs::read_dir(path).await?;

    let mut ordered_segment_paths = Vec::new();
    let mut metadata = None;

    while let Some(entry) = entries.next_entry().await? {
        if entry.file_name() == METADATA_FILENAME {
            let bytes = tokio::fs::read(entry.path()).await?;
            metadata = Some(
                serde_json::from_slice::<BenchmarkMetadata>(&bytes)
                    .context("failed to deserialize metadata.json")?,
            );
        } else {
            ordered_segment_paths.push(entry.path());
        }
    }

    ordered_segment_paths.sort();

    let mut buffer = Vec::new();
    for path in ordered_segment_paths {
        if buffer.len() >= input_size {
            break;
        }

        use async_compression::tokio::bufread::ZstdDecoder;
        let file = tokio::fs::File::open(path).await?;
        let reader = tokio::io::BufReader::new(file);
        let decoder = ZstdDecoder::new(reader);
        let mut reader = tokio::io::BufReader::new(decoder);
        tokio::io::copy_buf(&mut reader, &mut buffer).await?;
    }

    buffer.truncate(input_size);

    Ok(BenchmarkData {
        wal: buffer,
        // Report a missing metadata file as an error instead of panicking.
        meta: metadata.context("benchmark data does not contain metadata.json")?,
    })
}

fn criterion_benchmark(c: &mut Criterion) {
    const INPUT_SIZE: usize = 128 * 1024 * 1024;

    let setup_runtime = tokio::runtime::Builder::new_current_thread()
        .enable_all()
        .build()
        .unwrap();

    let (_temp_dir, bench_data) = setup_runtime.block_on(async move {
        let cancel = CancellationToken::new();
        let client = create_s3_client().await.unwrap();
        let temp_dir = download_bench_data(client, &cancel).await.unwrap();
        let bench_data = load_bench_data(temp_dir.path(), INPUT_SIZE).await.unwrap();

        (temp_dir, bench_data)
    });

    eprintln!(
        "Benchmarking against {} MiB of WAL",
        INPUT_SIZE / 1024 / 1024
    );

    let mut group = c.benchmark_group("decode-interpret-wal");
    group.throughput(criterion::Throughput::Bytes(bench_data.wal.len() as u64));
group.sample_size(10);

    group.bench_function("unsharded", |b| {
        b.iter(|| decode_interpret_main(&bench_data, &[ShardIdentity::unsharded()]))
    });

    let eight_shards = (0..8)
        .map(|i| ShardIdentity::new(ShardNumber(i), ShardCount(8), ShardStripeSize(8)).unwrap())
        .collect::<Vec<_>>();
    group.bench_function("8/8-shards", |b| {
        b.iter(|| decode_interpret_main(&bench_data, &eight_shards))
    });

    let four_shards = eight_shards
        .into_iter()
        .filter(|s| s.number.0 % 2 == 0)
        .collect::<Vec<_>>();
    group.bench_function("4/8-shards", |b| {
        b.iter(|| decode_interpret_main(&bench_data, &four_shards))
    });

    let two_shards = four_shards
        .into_iter()
        .filter(|s| s.number.0 % 4 == 0)
        .collect::<Vec<_>>();
    group.bench_function("2/8-shards", |b| {
        b.iter(|| decode_interpret_main(&bench_data, &two_shards))
    });
}

/// Benchmark body: runs `decode_interpret` and panics with the debug
/// rendering of the error if it fails.
fn decode_interpret_main(bench: &BenchmarkData, shards: &[ShardIdentity]) {
    if let Err(e) = decode_interpret(bench, shards) {
        panic!("{e:?}");
    }
}

/// Decodes the benchmark WAL, starting at the segment offset of the start
/// LSN, and interprets every record for the given shards.
///
/// The WAL is fed to the decoder in `MAX_SEND_SIZE` chunks; interpreted
/// records are discarded. Interpretation failures are returned as errors.
fn decode_interpret(bench: &BenchmarkData, shard: &[ShardIdentity]) -> anyhow::Result<()> {
    let mut decoder = WalStreamDecoder::new(bench.meta.start_lsn, bench.meta.pg_version);
    let xlogoff: usize = bench.meta.start_lsn.segment_offset(WAL_SEGMENT_SIZE);

    for chunk in bench.wal[xlogoff..].chunks(MAX_SEND_SIZE) {
        decoder.feed_bytes(chunk);
        while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
            assert!(lsn.is_aligned());
            // Propagate the failure; the caller turns it into a panic.
            let _ = InterpretedWalRecord::from_bytes_filtered(
                recdata,
                shard,
                lsn,
                bench.meta.pg_version,
            )?;
        }
    }

    Ok(())
}

criterion_group!(
    name=benches;
    config=Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
    targets=criterion_benchmark
);
criterion_main!(benches);

================================================
FILE: libs/wal_decoder/build.rs
================================================
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Generate rust code from .proto protobuf.
    //
    // Note: we previously tried to use deterministic location at proto/ for
    // easy location, but apparently interference with cachepot sometimes fails
    // the build then. Anyway, per cargo docs build script shouldn't output to
    // anywhere but $OUT_DIR.
    tonic_build::compile_protos("proto/interpreted_wal.proto")
        .unwrap_or_else(|e| panic!("failed to compile protos {e:?}"));
    Ok(())
}

================================================
FILE: libs/wal_decoder/proto/interpreted_wal.proto
================================================
syntax = "proto3";

package interpreted_wal;

message InterpretedWalRecords {
  repeated InterpretedWalRecord records = 1;
  optional uint64 next_record_lsn = 2;
  optional uint64 raw_wal_start_lsn = 3;
}

message InterpretedWalRecord {
  optional bytes metadata_record = 1;
  SerializedValueBatch batch = 2;
  uint64 next_record_lsn = 3;
  bool flush_uncommitted = 4;
  uint32 xid = 5;
}

message SerializedValueBatch {
  bytes raw = 1;
  repeated ValueMeta metadata = 2;
  uint64 max_lsn = 3;
  uint64 len = 4;
}

enum ValueMetaType {
  Serialized = 0;
  Observed = 1;
}

message ValueMeta {
  ValueMetaType type = 1;
  CompactKey key = 2;
  uint64 lsn = 3;
  optional uint64 batch_offset = 4;
  optional uint64 len = 5;
  optional bool will_init = 6;
}

message CompactKey {
  uint64 high = 1;
  uint64 low = 2;
}

================================================
FILE: libs/wal_decoder/src/decoder.rs
================================================
//! This module contains logic for decoding and interpreting
//! raw bytes which represent a raw Postgres WAL record.
use std::collections::HashMap;

use bytes::{Buf, Bytes};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::reltag::{RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::walrecord::*;
use postgres_ffi::{PgMajorVersion, pg_constants};
use postgres_ffi_types::forknum::VISIBILITYMAP_FORKNUM;
use utils::lsn::Lsn;

use crate::models::*;
use crate::serialized_batch::SerializedValueBatch;

impl InterpretedWalRecord {
    /// Decode and interpret raw bytes which represent one Postgres WAL record.
    /// Data blocks which do not match any of the provided shard identities are filtered out.
    /// Shard 0 is a special case since it tracks all relation sizes. We only give it
    /// the keys that are being written as that is enough for updating relation sizes.
    ///
    /// Returns one [`InterpretedWalRecord`] per entry in `shards`, keyed by shard identity.
    /// Fails if the raw record cannot be decoded or if interpreting its metadata or
    /// values fails.
    pub fn from_bytes_filtered(
        buf: Bytes,
        shards: &[ShardIdentity],
        next_record_lsn: Lsn,
        pg_version: PgMajorVersion,
    ) -> anyhow::Result<HashMap<ShardIdentity, InterpretedWalRecord>> {
        let mut decoded = DecodedWALRecord::default();
        decode_wal_record(buf, &mut decoded, pg_version)?;
        let xid = decoded.xl_xid;

        let flush_uncommitted = if decoded.is_dbase_create_copy(pg_version) {
            FlushUncommittedRecords::Yes
        } else {
            FlushUncommittedRecords::No
        };

        // Start every shard off with an empty record; the two passes below fill in
        // the metadata record and the value batch respectively.
        let mut shard_records: HashMap<ShardIdentity, InterpretedWalRecord> =
            HashMap::with_capacity(shards.len());
        for shard in shards {
            shard_records.insert(
                *shard,
                InterpretedWalRecord {
                    metadata_record: None,
                    batch: SerializedValueBatch::default(),
                    next_record_lsn,
                    flush_uncommitted,
                    xid,
                },
            );
        }

        MetadataRecord::from_decoded_filtered(
            &decoded,
            &mut shard_records,
            next_record_lsn,
            pg_version,
        )?;
        SerializedValueBatch::from_decoded_filtered(
            decoded,
            &mut shard_records,
            next_record_lsn,
            pg_version,
        )?;

        Ok(shard_records)
    }
}

impl MetadataRecord {
    /// Populates the given `shard_records` with metadata records from this WAL record, if any,
    /// discarding those belonging to other shards.
    ///
    /// Only metadata records relevant for the given shards are emitted.
Currently, most metadata /// records are broadcast to all shards for simplicity, but this should be improved. fn from_decoded_filtered( decoded: &DecodedWALRecord, shard_records: &mut HashMap, next_record_lsn: Lsn, pg_version: PgMajorVersion, ) -> anyhow::Result<()> { // Note: this doesn't actually copy the bytes since // the [`Bytes`] type implements it via a level of indirection. let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); // First, generate metadata records from the decoded WAL record. let metadata_record = match decoded.xl_rmid { pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { Self::decode_heapam_record(&mut buf, decoded, pg_version)? } pg_constants::RM_NEON_ID => Self::decode_neonmgr_record(&mut buf, decoded, pg_version)?, // Handle other special record types pg_constants::RM_SMGR_ID => Self::decode_smgr_record(&mut buf, decoded)?, pg_constants::RM_DBASE_ID => Self::decode_dbase_record(&mut buf, decoded, pg_version)?, pg_constants::RM_TBLSPC_ID => { tracing::trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet"); None } pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version)?, pg_constants::RM_XACT_ID => { Self::decode_xact_record(&mut buf, decoded, next_record_lsn)? } pg_constants::RM_MULTIXACT_ID => { Self::decode_multixact_record(&mut buf, decoded, pg_version)? } pg_constants::RM_RELMAP_ID => Self::decode_relmap_record(&mut buf, decoded)?, // This is an odd duck. It needs to go to all shards. // Since it uses the checkpoint image (that's initialized from CHECKPOINT_KEY // in WalIngest::new), we have to send the whole DecodedWalRecord::record to // the pageserver and decode it there. // // Alternatively, one can make the checkpoint part of the subscription protocol // to the pageserver. This should work fine, but can be done at a later point. pg_constants::RM_XLOG_ID => { Self::decode_xlog_record(&mut buf, decoded, next_record_lsn)? 
} pg_constants::RM_LOGICALMSG_ID => { Self::decode_logical_message_record(&mut buf, decoded)? } pg_constants::RM_STANDBY_ID => Self::decode_standby_record(&mut buf, decoded)?, pg_constants::RM_REPLORIGIN_ID => Self::decode_replorigin_record(&mut buf, decoded)?, _unexpected => { // TODO: consider failing here instead of blindly doing something without // understanding the protocol None } }; // Next, filter the metadata record by shard. for (shard, record) in shard_records.iter_mut() { match metadata_record { Some( MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref clear_vm_bits)) | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref clear_vm_bits)), ) => { // Route VM page updates to the shards that own them. VM pages are stored in the VM fork // of the main relation. These are sharded and managed just like regular relation pages. // See: https://github.com/neondatabase/neon/issues/9855 let is_local_vm_page = |heap_blk| { let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) }; // Send the old and new VM page updates to their respective shards. let updated_old_heap_blkno = clear_vm_bits .old_heap_blkno .filter(|&blkno| is_local_vm_page(blkno)); let updated_new_heap_blkno = clear_vm_bits .new_heap_blkno .filter(|&blkno| is_local_vm_page(blkno)); // If neither VM page belongs to this shard, discard the record. if updated_old_heap_blkno.is_some() || updated_new_heap_blkno.is_some() { // Clone the record and update it for the current shard. 
let mut for_shard = metadata_record.clone(); match for_shard { Some( MetadataRecord::Heapam(HeapamRecord::ClearVmBits( ref mut clear_vm_bits, )) | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits( ref mut clear_vm_bits, )), ) => { clear_vm_bits.old_heap_blkno = updated_old_heap_blkno; clear_vm_bits.new_heap_blkno = updated_new_heap_blkno; record.metadata_record = for_shard; } _ => { unreachable!("for_shard is a clone of what we checked above") } } } } Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => { // Filter LogicalMessage records (AUX files) to only be stored on shard zero if shard.is_shard_zero() { record.metadata_record = metadata_record; // No other shards should receive this record, so we stop traversing shards early. break; } } _ => { // All other metadata records are sent to all shards. record.metadata_record = metadata_record.clone(); } } } Ok(()) } fn decode_heapam_record( buf: &mut Bytes, decoded: &DecodedWALRecord, pg_version: PgMajorVersion, ) -> anyhow::Result> { // Handle VM bit updates that are implicitly part of heap records. // First, look at the record to determine which VM bits need // to be cleared. If either of these variables is set, we // need to clear the corresponding bits in the visibility map. 
let mut new_heap_blkno: Option = None; let mut old_heap_blkno: Option = None; let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; match pg_version { PgMajorVersion::PG14 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; if info == pg_constants::XLOG_HEAP_INSERT { let xlrec = v14::XlHeapInsert::decode(buf); assert_eq!(0, buf.remaining()); if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } else if info == pg_constants::XLOG_HEAP_DELETE { let xlrec = v14::XlHeapDelete::decode(buf); if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } else if info == pg_constants::XLOG_HEAP_UPDATE || info == pg_constants::XLOG_HEAP_HOT_UPDATE { let xlrec = v14::XlHeapUpdate::decode(buf); // the size of tuple data is inferred from the size of the record. // we can't validate the remaining number of bytes without parsing // the tuple data. if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); } if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a // non-HOT update where the new tuple goes to different page than // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is // set. 
new_heap_blkno = Some(decoded.blocks[0].blkno); } } else if info == pg_constants::XLOG_HEAP_LOCK { let xlrec = v14::XlHeapLock::decode(buf); if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks[0].blkno); flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; } } } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { let xlrec = v14::XlHeapMultiInsert::decode(buf); let offset_array_len = if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { let xlrec = v14::XlHeapLockUpdated::decode(buf); if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks[0].blkno); flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; } } } else { anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } PgMajorVersion::PG15 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; if info == pg_constants::XLOG_HEAP_INSERT { let xlrec = v15::XlHeapInsert::decode(buf); assert_eq!(0, buf.remaining()); if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } else if info == pg_constants::XLOG_HEAP_DELETE { let xlrec = v15::XlHeapDelete::decode(buf); if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } else if info == pg_constants::XLOG_HEAP_UPDATE || info == pg_constants::XLOG_HEAP_HOT_UPDATE { let xlrec = v15::XlHeapUpdate::decode(buf); // the size of 
tuple data is inferred from the size of the record. // we can't validate the remaining number of bytes without parsing // the tuple data. if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); } if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a // non-HOT update where the new tuple goes to different page than // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is // set. new_heap_blkno = Some(decoded.blocks[0].blkno); } } else if info == pg_constants::XLOG_HEAP_LOCK { let xlrec = v15::XlHeapLock::decode(buf); if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks[0].blkno); flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; } } } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { let xlrec = v15::XlHeapMultiInsert::decode(buf); let offset_array_len = if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { let xlrec = v15::XlHeapLockUpdated::decode(buf); if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks[0].blkno); flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; } } } else { anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } PgMajorVersion::PG16 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; if info == pg_constants::XLOG_HEAP_INSERT { let xlrec = 
v16::XlHeapInsert::decode(buf); assert_eq!(0, buf.remaining()); if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } else if info == pg_constants::XLOG_HEAP_DELETE { let xlrec = v16::XlHeapDelete::decode(buf); if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } else if info == pg_constants::XLOG_HEAP_UPDATE || info == pg_constants::XLOG_HEAP_HOT_UPDATE { let xlrec = v16::XlHeapUpdate::decode(buf); // the size of tuple data is inferred from the size of the record. // we can't validate the remaining number of bytes without parsing // the tuple data. if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); } if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a // non-HOT update where the new tuple goes to different page than // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is // set. 
new_heap_blkno = Some(decoded.blocks[0].blkno); } } else if info == pg_constants::XLOG_HEAP_LOCK { let xlrec = v16::XlHeapLock::decode(buf); if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks[0].blkno); flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; } } } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { let xlrec = v16::XlHeapMultiInsert::decode(buf); let offset_array_len = if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { let xlrec = v16::XlHeapLockUpdated::decode(buf); if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks[0].blkno); flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; } } } else { anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } PgMajorVersion::PG17 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; if info == pg_constants::XLOG_HEAP_INSERT { let xlrec = v17::XlHeapInsert::decode(buf); assert_eq!(0, buf.remaining()); if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } else if info == pg_constants::XLOG_HEAP_DELETE { let xlrec = v17::XlHeapDelete::decode(buf); if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } else if info == pg_constants::XLOG_HEAP_UPDATE || info == pg_constants::XLOG_HEAP_HOT_UPDATE { let xlrec = v17::XlHeapUpdate::decode(buf); // the size of 
tuple data is inferred from the size of the record. // we can't validate the remaining number of bytes without parsing // the tuple data. if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); } if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a // non-HOT update where the new tuple goes to different page than // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is // set. new_heap_blkno = Some(decoded.blocks[0].blkno); } } else if info == pg_constants::XLOG_HEAP_LOCK { let xlrec = v17::XlHeapLock::decode(buf); if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks[0].blkno); flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; } } } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { let xlrec = v17::XlHeapMultiInsert::decode(buf); let offset_array_len = if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { let xlrec = v17::XlHeapLockUpdated::decode(buf); if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks[0].blkno); flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; } } } else { anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } } if new_heap_blkno.is_some() || old_heap_blkno.is_some() { let vm_rel = RelTag { forknum: VISIBILITYMAP_FORKNUM, spcnode: decoded.blocks[0].rnode_spcnode, dbnode: 
decoded.blocks[0].rnode_dbnode, relnode: decoded.blocks[0].rnode_relnode, }; Ok(Some(MetadataRecord::Heapam(HeapamRecord::ClearVmBits( ClearVmBits { new_heap_blkno, old_heap_blkno, vm_rel, flags, }, )))) } else { Ok(None) } } fn decode_neonmgr_record( buf: &mut Bytes, decoded: &DecodedWALRecord, pg_version: PgMajorVersion, ) -> anyhow::Result> { // Handle VM bit updates that are implicitly part of heap records. // First, look at the record to determine which VM bits need // to be cleared. If either of these variables is set, we // need to clear the corresponding bits in the visibility map. let mut new_heap_blkno: Option = None; let mut old_heap_blkno: Option = None; let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); match pg_version { PgMajorVersion::PG16 | PgMajorVersion::PG17 => { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; match info { pg_constants::XLOG_NEON_HEAP_INSERT => { let xlrec = v17::rm_neon::XlNeonHeapInsert::decode(buf); assert_eq!(0, buf.remaining()); if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } pg_constants::XLOG_NEON_HEAP_DELETE => { let xlrec = v17::rm_neon::XlNeonHeapDelete::decode(buf); if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } pg_constants::XLOG_NEON_HEAP_UPDATE | pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => { let xlrec = v17::rm_neon::XlNeonHeapUpdate::decode(buf); // the size of tuple data is inferred from the size of the record. // we can't validate the remaining number of bytes without parsing // the tuple data. 
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); } if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a // non-HOT update where the new tuple goes to different page than // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is // set. new_heap_blkno = Some(decoded.blocks[0].blkno); } } pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => { let xlrec = v17::rm_neon::XlNeonHeapMultiInsert::decode(buf); let offset_array_len = if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } pg_constants::XLOG_NEON_HEAP_LOCK => { let xlrec = v17::rm_neon::XlNeonHeapLock::decode(buf); if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks[0].blkno); flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; } } info => anyhow::bail!("Unknown WAL record type for Neon RMGR: {}", info), } } PgMajorVersion::PG15 | PgMajorVersion::PG14 => anyhow::bail!( "Neon RMGR has no known compatibility with PostgreSQL version {}", pg_version ), } if new_heap_blkno.is_some() || old_heap_blkno.is_some() { let vm_rel = RelTag { forknum: VISIBILITYMAP_FORKNUM, spcnode: decoded.blocks[0].rnode_spcnode, dbnode: decoded.blocks[0].rnode_dbnode, relnode: decoded.blocks[0].rnode_relnode, }; Ok(Some(MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits( ClearVmBits { new_heap_blkno, old_heap_blkno, vm_rel, flags, }, )))) } else { Ok(None) } } fn decode_smgr_record( buf: &mut Bytes, decoded: &DecodedWALRecord, ) -> anyhow::Result> { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == 
pg_constants::XLOG_SMGR_CREATE {
            let create = XlSmgrCreate::decode(buf);
            let rel = RelTag {
                spcnode: create.rnode.spcnode,
                dbnode: create.rnode.dbnode,
                relnode: create.rnode.relnode,
                forknum: create.forknum,
            };

            return Ok(Some(MetadataRecord::Smgr(SmgrRecord::Create(SmgrCreate {
                rel,
            }))));
        } else if info == pg_constants::XLOG_SMGR_TRUNCATE {
            let truncate = XlSmgrTruncate::decode(buf);
            return Ok(Some(MetadataRecord::Smgr(SmgrRecord::Truncate(truncate))));
        }

        Ok(None)
    }

    /// Decodes database create/drop records into [`DbaseRecord`] metadata.
    ///
    /// PG14 has a single `XLOG_DBASE_CREATE` record. PG15 and later split it into
    /// `XLOG_DBASE_CREATE_WAL_LOG` (ignored here) and `XLOG_DBASE_CREATE_FILE_COPY`.
    /// Unrecognized info values yield `Ok(None)`.
    fn decode_dbase_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
        pg_version: PgMajorVersion,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
        tracing::debug!(%info, %pg_version, "handle RM_DBASE_ID");

        // PG15, PG16 and PG17 are handled identically; only the bindings module that
        // supplies the info constants differs. The macro keeps a single copy of that
        // logic and returns from the enclosing function when it produces a record.
        macro_rules! decode_dbase_pg15_plus {
            ($v:ident) => {{
                if info == postgres_ffi::$v::bindings::XLOG_DBASE_CREATE_WAL_LOG {
                    tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
                } else if info == postgres_ffi::$v::bindings::XLOG_DBASE_CREATE_FILE_COPY {
                    // The XLOG record was renamed between v14 and v15,
                    // but the record format is the same.
                    // So we can reuse XlCreateDatabase here.
                    tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");

                    let createdb = XlCreateDatabase::decode(buf);
                    let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
                        db_id: createdb.db_id,
                        tablespace_id: createdb.tablespace_id,
                        src_db_id: createdb.src_db_id,
                        src_tablespace_id: createdb.src_tablespace_id,
                    }));

                    return Ok(Some(record));
                } else if info == postgres_ffi::$v::bindings::XLOG_DBASE_DROP {
                    let dropdb = XlDropDatabase::decode(buf);
                    let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
                        db_id: dropdb.db_id,
                        tablespace_ids: dropdb.tablespace_ids,
                    }));

                    return Ok(Some(record));
                }
            }};
        }

        match pg_version {
            PgMajorVersion::PG14 => {
                if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE {
                    let createdb = XlCreateDatabase::decode(buf);
                    tracing::debug!("XLOG_DBASE_CREATE v14");

                    let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
                        db_id: createdb.db_id,
                        tablespace_id: createdb.tablespace_id,
                        src_db_id: createdb.src_db_id,
                        src_tablespace_id: createdb.src_tablespace_id,
                    }));

                    return Ok(Some(record));
                } else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP {
                    let dropdb = XlDropDatabase::decode(buf);

                    let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
                        db_id: dropdb.db_id,
                        tablespace_ids: dropdb.tablespace_ids,
                    }));

                    return Ok(Some(record));
                }
            }
            PgMajorVersion::PG15 => decode_dbase_pg15_plus!(v15),
            PgMajorVersion::PG16 => decode_dbase_pg15_plus!(v16),
            PgMajorVersion::PG17 => decode_dbase_pg15_plus!(v17),
        }

        Ok(None)
    }

    /// Decodes a CLOG record into either a [`ClogRecord::ZeroPage`] or a
    /// [`ClogRecord::Truncate`] metadata record.
    ///
    /// Panics if the record is neither `CLOG_ZEROPAGE` nor `CLOG_TRUNCATE`.
    fn decode_clog_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
        pg_version: PgMajorVersion,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;

        if info == pg_constants::CLOG_ZEROPAGE {
            // The page number is read as 32 bits before PG17 and as 64 bits
            // (narrowed to u32) from PG17 on.
            let pageno = if pg_version < PgMajorVersion::PG17 {
                buf.get_u32_le()
            } else {
                buf.get_u64_le() as u32
            };
            let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
            let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;

            Ok(Some(MetadataRecord::Clog(ClogRecord::ZeroPage(
                ClogZeroPage { segno, rpageno },
            ))))
        } else {
            assert_eq!(info, pg_constants::CLOG_TRUNCATE);
            let xlrec = XlClogTruncate::decode(buf, pg_version);

            Ok(Some(MetadataRecord::Clog(ClogRecord::Truncate(
                ClogTruncate {
                    pageno: xlrec.pageno,
                    oldest_xid: xlrec.oldest_xid,
                    oldest_xid_db: xlrec.oldest_xid_db,
                },
            ))))
        }
    }

    /// Decodes transaction commit/abort/prepare records into [`XactRecord`]
    /// metadata; any other info value yields `Ok(None)`.
    fn decode_xact_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
        lsn: Lsn,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
        let origin_id = decoded.origin_id;
        let xl_xid = decoded.xl_xid;

        if info == pg_constants::XLOG_XACT_COMMIT {
            let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info);
            return Ok(Some(MetadataRecord::Xact(XactRecord::Commit(XactCommon {
                parsed,
                origin_id,
                xl_xid,
                lsn,
            }))));
        } else if info == pg_constants::XLOG_XACT_ABORT {
            let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid,
decoded.xl_info);
            return Ok(Some(MetadataRecord::Xact(XactRecord::Abort(XactCommon {
                parsed,
                origin_id,
                xl_xid,
                lsn,
            }))));
        } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED {
            let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info);
            return Ok(Some(MetadataRecord::Xact(XactRecord::CommitPrepared(
                XactCommon {
                    parsed,
                    origin_id,
                    xl_xid,
                    lsn,
                },
            ))));
        } else if info == pg_constants::XLOG_XACT_ABORT_PREPARED {
            let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info);
            return Ok(Some(MetadataRecord::Xact(XactRecord::AbortPrepared(
                XactCommon {
                    parsed,
                    origin_id,
                    xl_xid,
                    lsn,
                },
            ))));
        } else if info == pg_constants::XLOG_XACT_PREPARE {
            // Prepare records are passed on as raw bytes rather than parsed here.
            return Ok(Some(MetadataRecord::Xact(XactRecord::Prepare(
                XactPrepare {
                    xl_xid: decoded.xl_xid,
                    data: Bytes::copy_from_slice(&buf[..]),
                },
            ))));
        }

        Ok(None)
    }

    /// Decodes multixact zero-page, create and truncate records into
    /// [`MultiXactRecord`] metadata; any other info value yields `Ok(None)`.
    fn decode_multixact_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
        pg_version: PgMajorVersion,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;

        if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
            || info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
        {
            // The page number is read as 32 bits before PG17 and as 64 bits
            // (narrowed to u32) from PG17 on.
            let pageno = if pg_version < PgMajorVersion::PG17 {
                buf.get_u32_le()
            } else {
                buf.get_u64_le() as u32
            };
            let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
            let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;

            let slru_kind = match info {
                pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE => SlruKind::MultiXactOffsets,
                pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE => SlruKind::MultiXactMembers,
                // The enclosing `if` only admits the two values above.
                _ => unreachable!(),
            };

            return Ok(Some(MetadataRecord::MultiXact(MultiXactRecord::ZeroPage(
                MultiXactZeroPage {
                    slru_kind,
                    segno,
                    rpageno,
                },
            ))));
        } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
            let xlrec = XlMultiXactCreate::decode(buf);
            return Ok(Some(MetadataRecord::MultiXact(MultiXactRecord::Create(
                xlrec,
            ))));
        } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
            let xlrec = XlMultiXactTruncate::decode(buf);
            return Ok(Some(MetadataRecord::MultiXact(MultiXactRecord::Truncate(
                xlrec,
            ))));
        }

        Ok(None)
    }

    /// Decodes a relmap update record. The returned [`RelmapUpdate`] carries the
    /// parsed header plus a copy of the main data following the 12-byte header.
    fn decode_relmap_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        let update = XlRelmapUpdate::decode(buf);

        // Re-slice the record from the start of the main data instead of relying on
        // how far `decode` advanced the caller's buffer.
        let mut buf = decoded.record.clone();
        buf.advance(decoded.main_data_offset);
        // skip xl_relmap_update
        buf.advance(12);

        Ok(Some(MetadataRecord::Relmap(RelmapRecord::Update(
            RelmapUpdate {
                update,
                buf: Bytes::copy_from_slice(&buf[..]),
            },
        ))))
    }

    /// Wraps an XLOG resource manager record as a [`RawXlogRecord`] without
    /// decoding it; the main data is passed through as-is.
    fn decode_xlog_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
        lsn: Lsn,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
        Ok(Some(MetadataRecord::Xlog(XlogRecord::Raw(RawXlogRecord {
            info,
            lsn,
            buf: buf.clone(),
        }))))
    }

    /// Decodes a logical message record.
    ///
    /// Messages whose prefix starts with `neon-file:` become a
    /// [`LogicalMessageRecord::Put`] carrying the path (the prefix remainder) and the
    /// message payload. With the `testing` feature, the prefix `neon-test` yields
    /// [`LogicalMessageRecord::Failpoint`]. Everything else yields `Ok(None)`.
    ///
    /// Returns an error, rather than panicking, if the prefix is empty, is not valid
    /// UTF-8, or if the declared prefix/message sizes exceed the available bytes.
    fn decode_logical_message_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
        if info == pg_constants::XLOG_LOGICAL_MESSAGE {
            let xlrec = XlLogicalMessage::decode(buf);

            // The last prefix byte is excluded from the string (presumably a
            // trailing NUL terminator). A zero prefix size would underflow here.
            let prefix_len = xlrec
                .prefix_size
                .checked_sub(1)
                .ok_or_else(|| anyhow::anyhow!("logical message record has an empty prefix"))?;
            let prefix_bytes = buf.get(..prefix_len).ok_or_else(|| {
                anyhow::anyhow!(
                    "logical message prefix size {} exceeds record data size {}",
                    xlrec.prefix_size,
                    buf.len()
                )
            })?;
            let prefix = std::str::from_utf8(prefix_bytes)?;

            #[cfg(feature = "testing")]
            if prefix == "neon-test" {
                return Ok(Some(MetadataRecord::LogicalMessage(
                    LogicalMessageRecord::Failpoint,
                )));
            }

            if let Some(path) = prefix.strip_prefix("neon-file:") {
                // The payload follows the prefix directly.
                let buf_size = xlrec
                    .prefix_size
                    .checked_add(xlrec.message_size)
                    .ok_or_else(|| anyhow::anyhow!("logical message size overflows"))?;
                let content = buf.get(xlrec.prefix_size..buf_size).ok_or_else(|| {
                    anyhow::anyhow!(
                        "logical message payload end {} exceeds record data size {}",
                        buf_size,
                        buf.len()
                    )
                })?;
                let buf = Bytes::copy_from_slice(content);
                return Ok(Some(MetadataRecord::LogicalMessage(
                    LogicalMessageRecord::Put(PutLogicalMessage {
                        path: path.to_string(),
                        buf,
                    }),
                )));
            }
        }

        Ok(None)
    }

    /// Decodes `XLOG_RUNNING_XACTS` into a [`StandbyRecord::RunningXacts`] carrying
    /// only the oldest running xid; any other info value yields `Ok(None)`.
    fn decode_standby_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
        if info == pg_constants::XLOG_RUNNING_XACTS {
            let xlrec = XlRunningXacts::decode(buf);
            return Ok(Some(MetadataRecord::Standby(StandbyRecord::RunningXacts(
                StandbyRunningXacts {
                    oldest_running_xid: xlrec.oldest_running_xid,
                },
            ))));
        }

        Ok(None)
    }

    /// Decodes replication origin set/drop records into [`ReploriginRecord`]
    /// metadata; any other info value yields `Ok(None)`.
    fn decode_replorigin_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
        if info == pg_constants::XLOG_REPLORIGIN_SET {
            let xlrec = XlReploriginSet::decode(buf);
            return Ok(Some(MetadataRecord::Replorigin(ReploriginRecord::Set(
                xlrec,
            ))));
        } else if info == pg_constants::XLOG_REPLORIGIN_DROP {
            let xlrec = XlReploriginDrop::decode(buf);
            return Ok(Some(MetadataRecord::Replorigin(ReploriginRecord::Drop(
                xlrec,
            ))));
        }

        Ok(None)
    }
}

================================================
FILE: libs/wal_decoder/src/lib.rs
================================================
pub mod decoder;
pub mod models;
pub mod serialized_batch;
pub mod wire_format;

================================================
FILE: libs/wal_decoder/src/models/record.rs
================================================
//! This module defines the WAL record format used within the pageserver.

use bytes::Bytes;
use postgres_ffi::walrecord::{MultiXactMember, describe_postgres_wal_record};
use postgres_ffi::{MultiXactId, MultiXactOffset, TransactionId};
use postgres_ffi_types::TimestampTz;
use serde::{Deserialize, Serialize};
use utils::bin_ser::DeserializeError;

/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper
/// around a PostgreSQL WAL record, or a custom neon-specific "record".
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum NeonWalRecord {
    /// Native PostgreSQL WAL record
    Postgres { will_init: bool, rec: Bytes },

    /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
    ClearVisibilityMapFlags {
        new_heap_blkno: Option<u32>,
        old_heap_blkno: Option<u32>,
        flags: u8,
    },
    /// Mark transaction IDs as committed on a CLOG page
    ClogSetCommitted {
        xids: Vec<TransactionId>,
        timestamp: TimestampTz,
    },
    /// Mark transaction IDs as aborted on a CLOG page
    ClogSetAborted { xids: Vec<TransactionId> },
    /// Extend multixact offsets SLRU
    MultixactOffsetCreate {
        mid: MultiXactId,
        moff: MultiXactOffset,
    },
    /// Extend multixact members SLRU.
MultixactMembersCreate {
        moff: MultiXactOffset,
        members: Vec<MultiXactMember>,
    },
    /// Update the map of AUX files, either writing or dropping an entry
    AuxFile {
        file_path: String,
        content: Option<Bytes>,
    },
    /// Truncate visibility map page
    TruncateVisibilityMap {
        trunc_byte: usize,
        trunc_offs: usize,
    },

    /// A testing record for unit testing purposes. It supports appending data to an existing image, or clearing it.
    #[cfg(feature = "testing")]
    Test {
        /// Append a string to the image.
        append: String,
        /// Clear the image before appending.
        clear: bool,
        /// Treat this record as an init record. `clear` should be set to true if this field is set
        /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
        /// its references in `timeline.rs`.
        will_init: bool,
        /// Only append the record if the current image is the same as the one specified in this field.
        only_if: Option<String>,
    },
}

impl NeonWalRecord {
    /// Does replaying this WAL record initialize the page from scratch, or does
    /// it need to be applied over the previous image of the page?
    pub fn will_init(&self) -> bool {
        // If you change this function, you'll also need to change ValueBytes::will_init
        match self {
            NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,
            #[cfg(feature = "testing")]
            NeonWalRecord::Test { will_init, .. } => *will_init,
            // None of the special neon record types currently initialize the page
            _ => false,
        }
    }

    /// Test record that appends `s` to the existing image.
    #[cfg(feature = "testing")]
    pub fn wal_append(s: impl AsRef<str>) -> Self {
        Self::Test {
            append: s.as_ref().to_string(),
            clear: false,
            will_init: false,
            only_if: None,
        }
    }

    /// Test record that appends `s` only if the current image equals `only_if`.
    #[cfg(feature = "testing")]
    pub fn wal_append_conditional(s: impl AsRef<str>, only_if: impl AsRef<str>) -> Self {
        Self::Test {
            append: s.as_ref().to_string(),
            clear: false,
            will_init: false,
            only_if: Some(only_if.as_ref().to_string()),
        }
    }

    /// Test record that clears the image and then appends `s`, without being an init record.
    #[cfg(feature = "testing")]
    pub fn wal_clear(s: impl AsRef<str>) -> Self {
        Self::Test {
            append: s.as_ref().to_string(),
            clear: true,
            will_init: false,
            only_if: None,
        }
    }

    /// Test record that clears the image, appends `s`, and is flagged as an init record.
    #[cfg(feature = "testing")]
    pub fn wal_init(s: impl AsRef<str>) -> Self {
        Self::Test {
            append: s.as_ref().to_string(),
            clear: true,
            will_init: true,
            only_if: None,
        }
    }
}

/// Build a human-readable string to describe a WAL record
///
/// For debugging purposes. Postgres records are described via
/// `describe_postgres_wal_record`; all other variants use their `Debug` output.
pub fn describe_wal_record(rec: &NeonWalRecord) -> Result<String, DeserializeError> {
    match rec {
        NeonWalRecord::Postgres { will_init, rec } => Ok(format!(
            "will_init: {}, {}",
            will_init,
            describe_postgres_wal_record(rec)?
        )),
        _ => Ok(format!("{rec:?}")),
    }
}

================================================
FILE: libs/wal_decoder/src/models/value.rs
================================================
//! This module defines the value type used by the storage engine.
//!
//! A [`Value`] represents either a completely new value for one Key ([`Value::Image`]),
//! or a "delta" of how to get from previous version of the value to the new one
//! ([`Value::WalRecord`])
//!
//! Note that the [`Value`] type is used for the permanent storage format, so any
//! changes to it must be backwards compatible.
use bytes::Bytes; use serde::{Deserialize, Serialize}; use crate::models::record::NeonWalRecord; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum Value { /// An Image value contains a full copy of the value Image(Bytes), /// A WalRecord value contains a WAL record that needs to be /// replayed get the full value. Replaying the WAL record /// might need a previous version of the value (if will_init() /// returns false), or it may be replayed stand-alone (true). WalRecord(NeonWalRecord), } impl Value { #[inline(always)] pub fn is_image(&self) -> bool { matches!(self, Value::Image(_)) } #[inline(always)] pub fn will_init(&self) -> bool { match self { Value::Image(_) => true, Value::WalRecord(rec) => rec.will_init(), } } #[inline(always)] pub fn estimated_size(&self) -> usize { match self { Value::Image(image) => image.len(), Value::WalRecord(NeonWalRecord::AuxFile { content: Some(content), .. }) => content.len(), Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(), Value::WalRecord(NeonWalRecord::ClogSetAborted { xids }) => xids.len() * 4, Value::WalRecord(NeonWalRecord::ClogSetCommitted { xids, .. }) => xids.len() * 4, Value::WalRecord(NeonWalRecord::MultixactMembersCreate { members, .. }) => { members.len() * 8 } _ => 8192, /* use image size as the estimation */ } } } #[derive(Debug, PartialEq)] pub enum InvalidInput { TooShortValue, TooShortPostgresRecord, } /// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets /// use this type for querying if a slice looks some particular way. pub struct ValueBytes; impl ValueBytes { #[inline(always)] pub fn will_init(raw: &[u8]) -> Result { if raw.len() < 12 { return Err(InvalidInput::TooShortValue); } let value_discriminator = &raw[0..4]; if value_discriminator == [0, 0, 0, 0] { // Value::Image always initializes return Ok(true); } if value_discriminator != [0, 0, 0, 1] { // not a Value::WalRecord(..) 
return Ok(false); } let walrecord_discriminator = &raw[4..8]; if walrecord_discriminator != [0, 0, 0, 0] { // only NeonWalRecord::Postgres can have will_init return Ok(false); } if raw.len() < 17 { return Err(InvalidInput::TooShortPostgresRecord); } Ok(raw[8] == 1) } } #[cfg(test)] mod test { use bytes::Bytes; use utils::bin_ser::BeSer; use super::*; macro_rules! roundtrip { ($orig:expr, $expected:expr) => {{ let orig: Value = $orig; let actual = Value::ser(&orig).unwrap(); let expected: &[u8] = &$expected; assert_eq!(utils::Hex(&actual), utils::Hex(expected)); let deser = Value::des(&actual).unwrap(); assert_eq!(orig, deser); }}; } #[test] fn image_roundtrip() { let image = Bytes::from_static(b"foobar"); let image = Value::Image(image); #[rustfmt::skip] let expected = [ // top level discriminator of 4 bytes 0x00, 0x00, 0x00, 0x00, // 8 byte length 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, // foobar 0x66, 0x6f, 0x6f, 0x62, 0x61, 0x72 ]; roundtrip!(image, expected); assert!(ValueBytes::will_init(&expected).unwrap()); } #[test] fn walrecord_postgres_roundtrip() { let rec = NeonWalRecord::Postgres { will_init: true, rec: Bytes::from_static(b"foobar"), }; let rec = Value::WalRecord(rec); #[rustfmt::skip] let expected = [ // flattened discriminator of total 8 bytes 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, // will_init 0x01, // 8 byte length 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, // foobar 0x66, 0x6f, 0x6f, 0x62, 0x61, 0x72 ]; roundtrip!(rec, expected); assert!(ValueBytes::will_init(&expected).unwrap()); } #[test] fn bytes_inspection_too_short_image() { let rec = Value::Image(Bytes::from_static(b"")); #[rustfmt::skip] let expected = [ // top level discriminator of 4 bytes 0x00, 0x00, 0x00, 0x00, // 8 byte length 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ]; roundtrip!(rec, expected); assert!(ValueBytes::will_init(&expected).unwrap()); assert_eq!(expected.len(), 12); for len in 0..12 { assert_eq!( 
ValueBytes::will_init(&expected[..len]).unwrap_err(), InvalidInput::TooShortValue ); } } #[test] fn bytes_inspection_too_short_postgres_record() { let rec = NeonWalRecord::Postgres { will_init: false, rec: Bytes::from_static(b""), }; let rec = Value::WalRecord(rec); #[rustfmt::skip] let expected = [ // flattened discriminator of total 8 bytes 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, // will_init 0x00, // 8 byte length 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ]; roundtrip!(rec, expected); assert!(!ValueBytes::will_init(&expected).unwrap()); assert_eq!(expected.len(), 17); for len in 12..17 { assert_eq!( ValueBytes::will_init(&expected[..len]).unwrap_err(), InvalidInput::TooShortPostgresRecord ) } for len in 0..12 { assert_eq!( ValueBytes::will_init(&expected[..len]).unwrap_err(), InvalidInput::TooShortValue ) } } #[test] fn clear_visibility_map_flags_example() { let rec = NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno: Some(0x11), old_heap_blkno: None, flags: 0x03, }; let rec = Value::WalRecord(rec); #[rustfmt::skip] let expected = [ // discriminators 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, // Some == 1 followed by 4 bytes 0x01, 0x00, 0x00, 0x00, 0x11, // None == 0 0x00, // flags 0x03 ]; roundtrip!(rec, expected); assert!(!ValueBytes::will_init(&expected).unwrap()); } } ================================================ FILE: libs/wal_decoder/src/models.rs ================================================ //! This module houses types which represent decoded PG WAL records //! ready for the pageserver to interpret. They are derived from the original //! WAL records, so that each struct corresponds closely to one WAL record of //! a specific kind. They contain the same information as the original WAL records, //! but the values are already serialized in a [`SerializedValueBatch`], which //! is the format that the pageserver is expecting them in. //! //! The ingestion code uses these structs to help with parsing the WAL records, //! 
and it splits them into a stream of modifications to the key-value pairs that //! are ultimately stored in delta layers. See also the split-out counterparts in //! [`postgres_ffi::walrecord`]. //! //! The pipeline which processes WAL records is not super obvious, so let's follow //! the flow of an example XACT_COMMIT Postgres record: //! //! (Postgres XACT_COMMIT record) //! | //! |--> pageserver::walingest::WalIngest::decode_xact_record //! | //! |--> ([`XactRecord::Commit`]) //! | //! |--> pageserver::walingest::WalIngest::ingest_xact_record //! | //! |--> (NeonWalRecord::ClogSetCommitted) //! | //! |--> write to KV store within the pageserver pub mod record; pub mod value; use bytes::Bytes; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::walrecord::{ XlMultiXactCreate, XlMultiXactTruncate, XlRelmapUpdate, XlReploriginDrop, XlReploriginSet, XlSmgrTruncate, XlXactParsedRecord, }; use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; use utils::lsn::Lsn; use crate::serialized_batch::SerializedValueBatch; // Code generated by protobuf. pub mod proto { // Tonic does derives as `#[derive(Clone, PartialEq, ::prost::Message)]` // we don't use these types for anything but broker data transmission, // so it's ok to ignore this one. #![allow(clippy::derive_partial_eq_without_eq)] // The generated ValueMeta has a `len` method generate for its `len` field. #![allow(clippy::len_without_is_empty)] include!(concat!(env!("OUT_DIR"), concat!("/interpreted_wal.rs"))); } #[derive(Copy, Clone, Serialize, Deserialize)] pub enum FlushUncommittedRecords { Yes, No, } /// A batch of interpreted WAL records #[derive(Serialize, Deserialize)] pub struct InterpretedWalRecords { pub records: Vec, // Start LSN of the next record after the batch. // Note that said record may not belong to the current shard. pub next_record_lsn: Lsn, // Inclusive start LSN of the PG WAL from which the interpreted // WAL records were extracted. 
Note that this is not necessarily the // start LSN of the first interpreted record in the batch. pub raw_wal_start_lsn: Option, } /// An interpreted Postgres WAL record, ready to be handled by the pageserver #[derive(Serialize, Deserialize, Clone)] pub struct InterpretedWalRecord { /// Optional metadata record - may cause writes to metadata keys /// in the storage engine pub metadata_record: Option, /// A pre-serialized batch along with the required metadata for ingestion /// by the pageserver pub batch: SerializedValueBatch, /// Byte offset within WAL for the start of the next PG WAL record. /// Usually this is the end LSN of the current record, but in case of /// XLOG SWITCH records it will be within the next segment. pub next_record_lsn: Lsn, /// Whether to flush all uncommitted modifications to the storage engine /// before ingesting this record. This is currently only used for legacy PG /// database creations which read pages from a template database. Such WAL /// records require reading data blocks while ingesting, hence the need to flush. pub flush_uncommitted: FlushUncommittedRecords, /// Transaction id of the original PG WAL record pub xid: TransactionId, } impl InterpretedWalRecord { /// Checks if the WAL record is empty /// /// An empty interpreted WAL record has no data or metadata and does not have to be sent to the /// pageserver. pub fn is_empty(&self) -> bool { self.batch.is_empty() && self.metadata_record.is_none() && matches!(self.flush_uncommitted, FlushUncommittedRecords::No) } /// Checks if the WAL record is observed (i.e. contains only metadata /// for observed values) pub fn is_observed(&self) -> bool { self.batch.is_observed() && self.metadata_record.is_none() && matches!(self.flush_uncommitted, FlushUncommittedRecords::No) } } /// The interpreted part of the Postgres WAL record which requires metadata /// writes to the underlying storage engine. 
#[derive(Clone, Serialize, Deserialize)] pub enum MetadataRecord { Heapam(HeapamRecord), Neonrmgr(NeonrmgrRecord), Smgr(SmgrRecord), Dbase(DbaseRecord), Clog(ClogRecord), Xact(XactRecord), MultiXact(MultiXactRecord), Relmap(RelmapRecord), Xlog(XlogRecord), LogicalMessage(LogicalMessageRecord), Standby(StandbyRecord), Replorigin(ReploriginRecord), } #[derive(Clone, Serialize, Deserialize)] pub enum HeapamRecord { ClearVmBits(ClearVmBits), } #[derive(Clone, Serialize, Deserialize)] pub struct ClearVmBits { pub new_heap_blkno: Option, pub old_heap_blkno: Option, pub vm_rel: RelTag, pub flags: u8, } #[derive(Clone, Serialize, Deserialize)] pub enum NeonrmgrRecord { ClearVmBits(ClearVmBits), } #[derive(Clone, Serialize, Deserialize)] pub enum SmgrRecord { Create(SmgrCreate), Truncate(XlSmgrTruncate), } #[derive(Clone, Serialize, Deserialize)] pub struct SmgrCreate { pub rel: RelTag, } #[derive(Clone, Serialize, Deserialize)] pub enum DbaseRecord { Create(DbaseCreate), Drop(DbaseDrop), } #[derive(Clone, Serialize, Deserialize)] pub struct DbaseCreate { pub db_id: Oid, pub tablespace_id: Oid, pub src_db_id: Oid, pub src_tablespace_id: Oid, } #[derive(Clone, Serialize, Deserialize)] pub struct DbaseDrop { pub db_id: Oid, pub tablespace_ids: Vec, } #[derive(Clone, Serialize, Deserialize)] pub enum ClogRecord { ZeroPage(ClogZeroPage), Truncate(ClogTruncate), } #[derive(Clone, Serialize, Deserialize)] pub struct ClogZeroPage { pub segno: u32, pub rpageno: u32, } #[derive(Clone, Serialize, Deserialize)] pub struct ClogTruncate { pub pageno: u32, pub oldest_xid: TransactionId, pub oldest_xid_db: Oid, } #[derive(Clone, Serialize, Deserialize)] pub enum XactRecord { Commit(XactCommon), Abort(XactCommon), CommitPrepared(XactCommon), AbortPrepared(XactCommon), Prepare(XactPrepare), } #[derive(Clone, Serialize, Deserialize)] pub struct XactCommon { pub parsed: XlXactParsedRecord, pub origin_id: u16, // Fields below are only used for logging pub xl_xid: TransactionId, pub lsn: Lsn, } 
// Payload carried by `XactRecord::Prepare`.
#[derive(Clone, Serialize, Deserialize)]
pub struct XactPrepare {
    pub xl_xid: TransactionId,
    pub data: Bytes,
}

// Multixact-related metadata records; `Create` and `Truncate` wrap the
// corresponding `postgres_ffi::walrecord` types directly.
#[derive(Clone, Serialize, Deserialize)]
pub enum MultiXactRecord {
    ZeroPage(MultiXactZeroPage),
    Create(XlMultiXactCreate),
    Truncate(XlMultiXactTruncate),
}

#[derive(Clone, Serialize, Deserialize)]
pub struct MultiXactZeroPage {
    pub slru_kind: SlruKind,
    pub segno: u32,
    pub rpageno: u32,
}

#[derive(Clone, Serialize, Deserialize)]
pub enum RelmapRecord {
    Update(RelmapUpdate),
}

#[derive(Clone, Serialize, Deserialize)]
pub struct RelmapUpdate {
    pub update: XlRelmapUpdate,
    pub buf: Bytes,
}

#[derive(Clone, Serialize, Deserialize)]
pub enum XlogRecord {
    Raw(RawXlogRecord),
}

#[derive(Clone, Serialize, Deserialize)]
pub struct RawXlogRecord {
    pub info: u8,
    pub lsn: Lsn,
    pub buf: Bytes,
}

// NOTE: the `Failpoint` variant only exists when the crate is built with the
// `testing` feature.
#[derive(Clone, Serialize, Deserialize)]
pub enum LogicalMessageRecord {
    Put(PutLogicalMessage),
    #[cfg(feature = "testing")]
    Failpoint,
}

#[derive(Clone, Serialize, Deserialize)]
pub struct PutLogicalMessage {
    pub path: String,
    pub buf: Bytes,
}

#[derive(Clone, Serialize, Deserialize)]
pub enum StandbyRecord {
    RunningXacts(StandbyRunningXacts),
}

#[derive(Clone, Serialize, Deserialize)]
pub struct StandbyRunningXacts {
    pub oldest_running_xid: TransactionId,
}

#[derive(Clone, Serialize, Deserialize)]
pub enum ReploriginRecord {
    Set(XlReploriginSet),
    Drop(XlReploriginDrop),
}

================================================
FILE: libs/wal_decoder/src/serialized_batch.rs
================================================
//! This module implements batch type for serialized [`crate::models::value::Value`]
//! instances. Each batch contains a raw buffer (serialized values)
//! and a list of metadata for each (key, LSN) tuple present in the batch.
//!
//! Such batches are created from decoded PG wal records and ingested
//! by the pageserver by writing directly to the ephemeral file.
use std::collections::{BTreeSet, HashMap};

use bytes::{Bytes, BytesMut};
use pageserver_api::key::{CompactKey, Key, rel_block_to_key};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::reltag::RelTag;
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::walrecord::{DecodedBkpBlock, DecodedWALRecord};
use postgres_ffi::{BLCKSZ, PgMajorVersion, page_is_new, page_set_lsn, pg_constants};
use serde::{Deserialize, Serialize};
use utils::bin_ser::BeSer;
use utils::lsn::Lsn;

use crate::models::InterpretedWalRecord;
use crate::models::record::NeonWalRecord;
use crate::models::value::Value;

// An all-zero page of `BLCKSZ` bytes, backed by static memory. Used by
// `SerializedValueBatch::zero_gaps` as the image written for gap pages.
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);

/// Accompanying metadata for the batch
/// A value may be serialized and stored into the batch or just "observed".
/// Shard 0 currently "observes" all values in order to accurately track
/// relation sizes. In the case of "observed" values, we only need to know
/// the key and LSN, so two types of metadata are supported to save on network
/// bandwidth.
#[derive(Serialize, Deserialize, Clone)] pub enum ValueMeta { Serialized(SerializedValueMeta), Observed(ObservedValueMeta), } impl ValueMeta { pub fn key(&self) -> CompactKey { match self { Self::Serialized(ser) => ser.key, Self::Observed(obs) => obs.key, } } pub fn lsn(&self) -> Lsn { match self { Self::Serialized(ser) => ser.lsn, Self::Observed(obs) => obs.lsn, } } } /// Wrapper around [`ValueMeta`] that implements ordering by /// (key, LSN) tuples struct OrderedValueMeta(ValueMeta); impl Ord for OrderedValueMeta { fn cmp(&self, other: &Self) -> std::cmp::Ordering { (self.0.key(), self.0.lsn()).cmp(&(other.0.key(), other.0.lsn())) } } impl PartialOrd for OrderedValueMeta { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl PartialEq for OrderedValueMeta { fn eq(&self, other: &Self) -> bool { (self.0.key(), self.0.lsn()) == (other.0.key(), other.0.lsn()) } } impl Eq for OrderedValueMeta {} /// Metadata for a [`Value`] serialized into the batch. #[derive(Serialize, Deserialize, Clone)] pub struct SerializedValueMeta { pub key: CompactKey, pub lsn: Lsn, /// Starting offset of the value for the (key, LSN) tuple /// in [`SerializedValueBatch::raw`] pub batch_offset: u64, pub len: usize, pub will_init: bool, } /// Metadata for a [`Value`] observed by the batch #[derive(Serialize, Deserialize, Clone)] pub struct ObservedValueMeta { pub key: CompactKey, pub lsn: Lsn, } /// Batch of serialized [`Value`]s. #[derive(Serialize, Deserialize, Clone)] pub struct SerializedValueBatch { /// [`Value`]s serialized in EphemeralFile's native format, /// ready for disk write by the pageserver pub raw: Vec, /// Metadata to make sense of the bytes in [`Self::raw`] /// and represent "observed" values. /// /// Invariant: Metadata entries for any given key are ordered /// by LSN. Note that entries for a key do not have to be contiguous. 
pub metadata: Vec, /// The highest LSN of any value in the batch pub max_lsn: Lsn, /// Number of values encoded by [`Self::raw`] pub len: usize, } impl Default for SerializedValueBatch { fn default() -> Self { Self { raw: Default::default(), metadata: Default::default(), max_lsn: Lsn(0), len: 0, } } } impl SerializedValueBatch { /// Populates the given `shard_records` with value batches from this WAL record, if any, /// discarding those belonging to other shards. /// /// The batch will only contain values for keys targeting the specifiec /// shard. Shard 0 is a special case, where any keys that don't belong to /// it are "observed" by the batch (i.e. present in [`SerializedValueBatch::metadata`], /// but absent from the raw buffer [`SerializedValueBatch::raw`]). pub(crate) fn from_decoded_filtered( decoded: DecodedWALRecord, shard_records: &mut HashMap, next_record_lsn: Lsn, pg_version: PgMajorVersion, ) -> anyhow::Result<()> { // First determine how big the buffers need to be and allocate it up-front. // This duplicates some of the work below, but it's empirically much faster. for (shard, record) in shard_records.iter_mut() { assert!(record.batch.is_empty()); let estimate = Self::estimate_buffer_size(&decoded, shard, pg_version); record.batch.raw = Vec::with_capacity(estimate); } for blk in decoded.blocks.iter() { let rel = RelTag { spcnode: blk.rnode_spcnode, dbnode: blk.rnode_dbnode, relnode: blk.rnode_relnode, forknum: blk.forknum, }; let key = rel_block_to_key(rel, blk.blkno); if !key.is_valid_key_on_write_path() { anyhow::bail!( "Unsupported key decoded at LSN {}: {}", next_record_lsn, key ); } for (shard, record) in shard_records.iter_mut() { let key_is_local = shard.is_key_local(&key); tracing::debug!( lsn=%next_record_lsn, key=%key, "ingest: shard decision {}", if !key_is_local { "drop" } else { "keep" }, ); if !key_is_local { if shard.is_shard_zero() { // Shard 0 tracks relation sizes. 
Although we will not store this block, we will observe // its blkno in case it implicitly extends a relation. record .batch .metadata .push(ValueMeta::Observed(ObservedValueMeta { key: key.to_compact(), lsn: next_record_lsn, })) } continue; } // Instead of storing full-page-image WAL record, // it is better to store extracted image: we can skip wal-redo // in this case. Also some FPI records may contain multiple (up to 32) pages, // so them have to be copied multiple times. // let val = if Self::block_is_image(&decoded, blk, pg_version) { // Extract page image from FPI record let img_len = blk.bimg_len as usize; let img_offs = blk.bimg_offset as usize; let mut image = BytesMut::with_capacity(BLCKSZ as usize); // TODO(vlad): skip the copy image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); if blk.hole_length != 0 { let tail = image.split_off(blk.hole_offset as usize); image.resize(image.len() + blk.hole_length as usize, 0u8); image.unsplit(tail); } // // Match the logic of XLogReadBufferForRedoExtended: // The page may be uninitialized. If so, we can't set the LSN because // that would corrupt the page. 
// if !page_is_new(&image) { page_set_lsn(&mut image, next_record_lsn) } assert_eq!(image.len(), BLCKSZ as usize); Value::Image(image.freeze()) } else { Value::WalRecord(NeonWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }) }; let relative_off = record.batch.raw.len() as u64; val.ser_into(&mut record.batch.raw) .expect("Writing into in-memory buffer is infallible"); let val_ser_size = record.batch.raw.len() - relative_off as usize; record .batch .metadata .push(ValueMeta::Serialized(SerializedValueMeta { key: key.to_compact(), lsn: next_record_lsn, batch_offset: relative_off, len: val_ser_size, will_init: val.will_init(), })); record.batch.max_lsn = std::cmp::max(record.batch.max_lsn, next_record_lsn); record.batch.len += 1; } } if cfg!(any(debug_assertions, test)) { // Validate that the batches are correct for record in shard_records.values() { record.batch.validate_lsn_order(); } } Ok(()) } /// Look into the decoded PG WAL record and determine /// roughly how large the buffer for serialized values needs to be. 
fn estimate_buffer_size( decoded: &DecodedWALRecord, shard: &ShardIdentity, pg_version: PgMajorVersion, ) -> usize { let mut estimate: usize = 0; for blk in decoded.blocks.iter() { let rel = RelTag { spcnode: blk.rnode_spcnode, dbnode: blk.rnode_dbnode, relnode: blk.rnode_relnode, forknum: blk.forknum, }; let key = rel_block_to_key(rel, blk.blkno); if !shard.is_key_local(&key) { continue; } if Self::block_is_image(decoded, blk, pg_version) { // 4 bytes for the Value::Image discriminator // 8 bytes for encoding the size of the buffer // BLCKSZ for the raw image estimate += (4 + 8 + BLCKSZ) as usize; } else { // 4 bytes for the Value::WalRecord discriminator // 4 bytes for the NeonWalRecord::Postgres discriminator // 1 bytes for NeonWalRecord::Postgres::will_init // 8 bytes for encoding the size of the buffer // length of the raw record estimate += 8 + 1 + 8 + decoded.record.len(); } } estimate } fn block_is_image( decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, pg_version: PgMajorVersion, ) -> bool { blk.apply_image && blk.has_image && decoded.xl_rmid == pg_constants::RM_XLOG_ID && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version) // do not materialize null pages because them most likely be soon replaced with real data && blk.bimg_len != 0 } /// Encode a list of values and metadata into a serialized batch /// /// This is used by the pageserver ingest code to conveniently generate /// batches for metadata writes. pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self { // Pre-allocate a big flat buffer to write into. 
This should be large but not huge: it is soft-limited in practice by // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`] let buffer_size = batch.iter().map(|i| i.2).sum::(); let mut buf = Vec::::with_capacity(buffer_size); let mut metadata: Vec = Vec::with_capacity(batch.len()); let mut max_lsn: Lsn = Lsn(0); let len = batch.len(); for (key, lsn, val_ser_size, val) in batch { let relative_off = buf.len() as u64; val.ser_into(&mut buf) .expect("Writing into in-memory buffer is infallible"); metadata.push(ValueMeta::Serialized(SerializedValueMeta { key, lsn, batch_offset: relative_off, len: val_ser_size, will_init: val.will_init(), })); max_lsn = std::cmp::max(max_lsn, lsn); } // Assert that we didn't do any extra allocations while building buffer. debug_assert!(buf.len() <= buffer_size); if cfg!(any(debug_assertions, test)) { let batch = Self { raw: buf, metadata, max_lsn, len, }; batch.validate_lsn_order(); return batch; } Self { raw: buf, metadata, max_lsn, len, } } /// Add one value to the batch /// /// This is used by the pageserver ingest code to include metadata block /// updates for a single key. pub fn put(&mut self, key: CompactKey, value: Value, lsn: Lsn) { let relative_off = self.raw.len() as u64; value.ser_into(&mut self.raw).unwrap(); let val_ser_size = self.raw.len() - relative_off as usize; self.metadata .push(ValueMeta::Serialized(SerializedValueMeta { key, lsn, batch_offset: relative_off, len: val_ser_size, will_init: value.will_init(), })); self.max_lsn = std::cmp::max(self.max_lsn, lsn); self.len += 1; if cfg!(any(debug_assertions, test)) { self.validate_lsn_order(); } } /// Extend with the contents of another batch /// /// One batch is generated for each decoded PG WAL record. /// They are then merged to accumulate reasonably sized writes. 
pub fn extend(&mut self, mut other: SerializedValueBatch) { let extend_batch_start_offset = self.raw.len() as u64; self.raw.extend(other.raw); // Shift the offsets in the batch we are extending with other.metadata.iter_mut().for_each(|meta| match meta { ValueMeta::Serialized(ser) => { ser.batch_offset += extend_batch_start_offset; if cfg!(debug_assertions) { let value_end = ser.batch_offset + ser.len as u64; assert!((value_end as usize) <= self.raw.len()); } } ValueMeta::Observed(_) => {} }); self.metadata.extend(other.metadata); self.max_lsn = std::cmp::max(self.max_lsn, other.max_lsn); self.len += other.len; if cfg!(any(debug_assertions, test)) { self.validate_lsn_order(); } } /// Add zero images for the (key, LSN) tuples specified /// /// PG versions below 16 do not zero out pages before extending /// a relation and may leave gaps. Such gaps need to be identified /// by the pageserver ingest logic and get patched up here. /// /// Note that this function does not validate that the gaps have been /// identified correctly (it does not know relation sizes), so it's up /// to the call-site to do it properly. pub fn zero_gaps(&mut self, gaps: Vec<(KeySpace, Lsn)>) { // Implementation note: // // Values within [`SerializedValueBatch::raw`] do not have any ordering requirements, // but the metadata entries should be ordered properly (see // [`SerializedValueBatch::metadata`]). // // Exploiting this observation we do: // 1. Drain all the metadata entries into an ordered set. // The use of a BTreeSet keyed by (Key, Lsn) relies on the observation that Postgres never // includes more than one update to the same block in the same WAL record. // 2. For each (key, LSN) gap tuple, append a zero image to the raw buffer // and add an index entry to the ordered metadata set. // 3. Drain the ordered set back into a metadata vector let mut ordered_metas = self .metadata .drain(..) 
.map(OrderedValueMeta) .collect::>(); for (keyspace, lsn) in gaps { self.max_lsn = std::cmp::max(self.max_lsn, lsn); for gap_range in keyspace.ranges { let mut key = gap_range.start; while key != gap_range.end { let relative_off = self.raw.len() as u64; // TODO(vlad): Can we be cheeky and write only one zero image, and // make all index entries requiring a zero page point to it? // Alternatively, we can change the index entry format to represent zero pages // without writing them at all. Value::Image(ZERO_PAGE.clone()) .ser_into(&mut self.raw) .unwrap(); let val_ser_size = self.raw.len() - relative_off as usize; ordered_metas.insert(OrderedValueMeta(ValueMeta::Serialized( SerializedValueMeta { key: key.to_compact(), lsn, batch_offset: relative_off, len: val_ser_size, will_init: true, }, ))); self.len += 1; key = key.next(); } } } self.metadata = ordered_metas.into_iter().map(|ord| ord.0).collect(); if cfg!(any(debug_assertions, test)) { self.validate_lsn_order(); } } /// Checks if the batch contains any serialized or observed values pub fn is_empty(&self) -> bool { !self.has_data() && self.metadata.is_empty() } /// Checks if the batch contains only observed values pub fn is_observed(&self) -> bool { !self.has_data() && !self.metadata.is_empty() } /// Checks if the batch contains data /// /// Note that if this returns false, it may still contain observed values or /// a metadata record. 
pub fn has_data(&self) -> bool {
    let empty = self.raw.is_empty();

    // With an empty raw buffer every metadata entry must be an observed one.
    if cfg!(debug_assertions) && empty {
        assert!(
            self.metadata
                .iter()
                .all(|meta| matches!(meta, ValueMeta::Observed(_)))
        );
    }

    !empty
}

/// Returns the number of values serialized in the batch
pub fn len(&self) -> usize {
    self.len
}

/// Returns the size of the buffer wrapped by the batch
pub fn buffer_size(&self) -> usize {
    self.raw.len()
}

/// Returns true if the batch holds a serialized value for `key`.
/// Observed entries do not count as updates.
pub fn updates_key(&self, key: &Key) -> bool {
    // Convert once instead of once per metadata entry.
    let compact_key = key.to_compact();
    self.metadata.iter().any(|meta| match meta {
        ValueMeta::Serialized(ser) => compact_key == ser.key,
        ValueMeta::Observed(_) => false,
    })
}

/// Panics if the metadata entries of any key are not in non-decreasing
/// LSN order (the invariant documented on [`SerializedValueBatch::metadata`]).
pub fn validate_lsn_order(&self) {
    let mut last_seen_lsn_per_key: HashMap<CompactKey, Lsn> = HashMap::default();

    for meta in self.metadata.iter() {
        let lsn = meta.lsn();
        let key = meta.key();

        // `insert` hands back the LSN previously recorded for this key, if any.
        if let Some(prev_lsn) = last_seen_lsn_per_key.insert(key, lsn) {
            assert!(
                lsn >= prev_lsn,
                "Ordering violated by {}: {} < {}",
                Key::from_compact(key),
                lsn,
                prev_lsn
            );
        }
    }
}
} // impl SerializedValueBatch

#[cfg(all(test, feature = "testing"))]
mod tests {
    use super::*;

    fn validate_batch(
        batch: &SerializedValueBatch,
        values: &[(CompactKey, Lsn, usize, Value)],
        gaps: Option<&Vec<(KeySpace, Lsn)>>,
    ) {
        // Invariant 1: The metadata for a given entry in the batch
        // is correct and can be used to deserialize back to the original value.
        for (key, lsn, size, value) in values.iter() {
            let meta = batch
                .metadata
                .iter()
                .find(|meta| (meta.key(), meta.lsn()) == (*key, *lsn))
                .unwrap();
            let meta = match meta {
                ValueMeta::Serialized(ser) => ser,
                ValueMeta::Observed(_) => unreachable!(),
            };

            assert_eq!(meta.len, *size);
            assert_eq!(meta.will_init, value.will_init());

            let start = meta.batch_offset as usize;
            let end = meta.batch_offset as usize + meta.len;
            let value_from_batch = Value::des(&batch.raw[start..end]).unwrap();
            assert_eq!(&value_from_batch, value);
        }

        let mut expected_buffer_size: usize = values.iter().map(|(_, _, size, _)| size).sum();
        let mut gap_pages_count: usize = 0;

        // Invariant 2: Zero pages were added for identified gaps and their metadata
        // is correct.
        if let Some(gaps) = gaps {
            for (gap_keyspace, lsn) in gaps {
                for gap_range in &gap_keyspace.ranges {
                    let mut gap_key = gap_range.start;
                    while gap_key != gap_range.end {
                        let meta = batch
                            .metadata
                            .iter()
                            .find(|meta| (meta.key(), meta.lsn()) == (gap_key.to_compact(), *lsn))
                            .unwrap();
                        let meta = match meta {
                            ValueMeta::Serialized(ser) => ser,
                            ValueMeta::Observed(_) => unreachable!(),
                        };

                        let zero_value = Value::Image(ZERO_PAGE.clone());
                        let zero_value_size = zero_value.serialized_size().unwrap() as usize;

                        assert_eq!(meta.len, zero_value_size);
                        assert_eq!(meta.will_init, zero_value.will_init());

                        let start = meta.batch_offset as usize;
                        let end = meta.batch_offset as usize + meta.len;
                        let value_from_batch = Value::des(&batch.raw[start..end]).unwrap();
                        assert_eq!(value_from_batch, zero_value);

                        gap_pages_count += 1;
                        expected_buffer_size += zero_value_size;
                        gap_key = gap_key.next();
                    }
                }
            }
        }

        // Invariant 3: The length of the batch is equal to the number
        // of values inserted, plus the number of gap pages. This extends
        // to the raw buffer size.
        assert_eq!(batch.len(), values.len() + gap_pages_count);
        assert_eq!(expected_buffer_size, batch.buffer_size());

        // Invariant 4: Metadata entries for any given key are sorted in LSN order.
        batch.validate_lsn_order();
    }

    #[test]
    fn test_creation_from_values() {
        const LSN: Lsn = Lsn(0x10);
        let key = Key::from_hex("110000000033333333444444445500000001").unwrap();

        let values = vec![
            (
                key.to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("foo")),
            ),
            (
                key.next().to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("bar")),
            ),
            (
                key.to_compact(),
                Lsn(LSN.0 + 0x10),
                Value::WalRecord(NeonWalRecord::wal_append("baz")),
            ),
            (
                key.next().next().to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("taz")),
            ),
        ];

        let values = values
            .into_iter()
            .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
            .collect::<Vec<_>>();
        let batch = SerializedValueBatch::from_values(values.clone());

        validate_batch(&batch, &values, None);

        assert!(!batch.is_empty());
    }

    #[test]
    fn test_put() {
        const LSN: Lsn = Lsn(0x10);
        let key = Key::from_hex("110000000033333333444444445500000001").unwrap();

        let values = vec![
            (
                key.to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("foo")),
            ),
            (
                key.next().to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("bar")),
            ),
        ];

        let mut values = values
            .into_iter()
            .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
            .collect::<Vec<_>>();
        let mut batch = SerializedValueBatch::from_values(values.clone());

        validate_batch(&batch, &values, None);

        let value = (
            key.to_compact(),
            Lsn(LSN.0 + 0x10),
            Value::WalRecord(NeonWalRecord::wal_append("baz")),
        );
        let serialized_size = value.2.serialized_size().unwrap() as usize;
        let value = (value.0, value.1, serialized_size, value.2);
        values.push(value.clone());
        batch.put(value.0, value.3, value.1);

        validate_batch(&batch, &values, None);

        let value = (
            key.next().next().to_compact(),
            LSN,
            Value::WalRecord(NeonWalRecord::wal_append("taz")),
        );
        let serialized_size = value.2.serialized_size().unwrap() as usize;
        let value = (value.0, value.1, serialized_size, value.2);
        values.push(value.clone());
        batch.put(value.0, value.3, value.1);

        validate_batch(&batch, &values, None);
    }

    #[test]
    fn test_extension() {
        const LSN: Lsn = Lsn(0x10);
        let key = Key::from_hex("110000000033333333444444445500000001").unwrap();

        let values = vec![
            (
                key.to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("foo")),
            ),
            (
                key.next().to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("bar")),
            ),
            (
                key.next().next().to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("taz")),
            ),
        ];

        let mut values = values
            .into_iter()
            .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
            .collect::<Vec<_>>();
        let mut batch = SerializedValueBatch::from_values(values.clone());

        let other_values = vec![
            (
                key.to_compact(),
                Lsn(LSN.0 + 0x10),
                Value::WalRecord(NeonWalRecord::wal_append("foo")),
            ),
            (
                key.next().to_compact(),
                Lsn(LSN.0 + 0x10),
                Value::WalRecord(NeonWalRecord::wal_append("bar")),
            ),
            (
                key.next().next().to_compact(),
                Lsn(LSN.0 + 0x10),
                Value::WalRecord(NeonWalRecord::wal_append("taz")),
            ),
        ];

        let other_values = other_values
            .into_iter()
            .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
            .collect::<Vec<_>>();
        let other_batch = SerializedValueBatch::from_values(other_values.clone());

        values.extend(other_values);
        batch.extend(other_batch);

        validate_batch(&batch, &values, None);
    }

    #[test]
    fn test_gap_zeroing() {
        const LSN: Lsn = Lsn(0x10);
        let rel_foo_base_key = Key::from_hex("110000000033333333444444445500000001").unwrap();

        let rel_bar_base_key = {
            let mut key = rel_foo_base_key;
            key.field4 += 1;
            key
        };

        let values = vec![
            (
                rel_foo_base_key.to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("foo1")),
            ),
            (
                rel_foo_base_key.add(1).to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("foo2")),
            ),
            (
                rel_foo_base_key.add(5).to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("foo3")),
            ),
            (
                rel_foo_base_key.add(1).to_compact(),
                Lsn(LSN.0 + 0x10),
                Value::WalRecord(NeonWalRecord::wal_append("foo4")),
            ),
            (
                rel_foo_base_key.add(10).to_compact(),
                Lsn(LSN.0 + 0x10),
                Value::WalRecord(NeonWalRecord::wal_append("foo5")),
            ),
            (
                rel_foo_base_key.add(11).to_compact(),
                Lsn(LSN.0 + 0x10),
                Value::WalRecord(NeonWalRecord::wal_append("foo6")),
            ),
            (
                rel_foo_base_key.add(12).to_compact(),
                Lsn(LSN.0 + 0x10),
                Value::WalRecord(NeonWalRecord::wal_append("foo7")),
            ),
            (
                rel_bar_base_key.to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("bar1")),
            ),
            (
                rel_bar_base_key.add(4).to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("bar2")),
            ),
        ];

        let values = values
            .into_iter()
            .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
            .collect::<Vec<_>>();

        let mut batch = SerializedValueBatch::from_values(values.clone());

        let gaps = vec![
            (
                KeySpace {
                    ranges: vec![
                        rel_foo_base_key.add(2)..rel_foo_base_key.add(5),
                        rel_bar_base_key.add(1)..rel_bar_base_key.add(4),
                    ],
                },
                LSN,
            ),
            (
                KeySpace {
                    ranges: vec![rel_foo_base_key.add(6)..rel_foo_base_key.add(10)],
                },
                Lsn(LSN.0 + 0x10),
            ),
        ];

        batch.zero_gaps(gaps.clone());
        validate_batch(&batch, &values, Some(&gaps));
    }
}

================================================
FILE: libs/wal_decoder/src/wire_format.rs
================================================
use bytes::{BufMut, Bytes, BytesMut};
use pageserver_api::key::CompactKey;
use prost::{DecodeError, EncodeError, Message};
use tokio::io::AsyncWriteExt;
use utils::bin_ser::{BeSer, DeserializeError, SerializeError};
use utils::lsn::Lsn;
use utils::postgres_client::{Compression, InterpretedFormat};

use crate::models::{
    FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords, MetadataRecord, proto,
};
use crate::serialized_batch::{
    ObservedValueMeta, SerializedValueBatch, SerializedValueMeta, ValueMeta,
};

#[derive(Debug, thiserror::Error)]
pub enum ToWireFormatError {
    #[error("{0}")]
    Bincode(#[from] SerializeError),
    #[error("{0}")]
    Protobuf(#[from] ProtobufSerializeError),
    #[error("{0}")]
    Compression(#[from] std::io::Error),
}

#[derive(Debug,
thiserror::Error)] pub enum ProtobufSerializeError { #[error("{0}")] MetadataRecord(#[from] SerializeError), #[error("{0}")] Encode(#[from] EncodeError), } #[derive(Debug, thiserror::Error)] pub enum FromWireFormatError { #[error("{0}")] Bincode(#[from] DeserializeError), #[error("{0}")] Protobuf(#[from] ProtobufDeserializeError), #[error("{0}")] Decompress(#[from] std::io::Error), } #[derive(Debug, thiserror::Error)] pub enum ProtobufDeserializeError { #[error("{0}")] Transcode(#[from] TranscodeError), #[error("{0}")] Decode(#[from] DecodeError), } #[derive(Debug, thiserror::Error)] pub enum TranscodeError { #[error("{0}")] BadInput(String), #[error("{0}")] MetadataRecord(#[from] DeserializeError), } pub trait ToWireFormat { fn to_wire( self, format: InterpretedFormat, compression: Option, ) -> impl std::future::Future> + Send; } pub trait FromWireFormat { type T; fn from_wire( buf: &Bytes, format: InterpretedFormat, compression: Option, ) -> impl std::future::Future> + Send; } impl ToWireFormat for InterpretedWalRecords { async fn to_wire( self, format: InterpretedFormat, compression: Option, ) -> Result { use async_compression::Level; use async_compression::tokio::write::ZstdEncoder; let encode_res: Result = match format { InterpretedFormat::Bincode => { let buf = BytesMut::new(); let mut buf = buf.writer(); self.ser_into(&mut buf)?; Ok(buf.into_inner().freeze()) } InterpretedFormat::Protobuf => { let proto: proto::InterpretedWalRecords = self.try_into()?; let mut buf = BytesMut::new(); proto .encode(&mut buf) .map_err(|e| ToWireFormatError::Protobuf(e.into()))?; Ok(buf.freeze()) } }; let buf = encode_res?; let compressed_buf = match compression { Some(Compression::Zstd { level }) => { let mut encoder = ZstdEncoder::with_quality( Vec::with_capacity(buf.len() / 4), Level::Precise(level as i32), ); encoder.write_all(&buf).await?; encoder.shutdown().await?; Bytes::from(encoder.into_inner()) } None => buf, }; Ok(compressed_buf) } } impl FromWireFormat for 
InterpretedWalRecords { type T = Self; async fn from_wire( buf: &Bytes, format: InterpretedFormat, compression: Option, ) -> Result { let decompressed_buf = match compression { Some(Compression::Zstd { .. }) => { use async_compression::tokio::write::ZstdDecoder; let mut decoded_buf = Vec::with_capacity(buf.len()); let mut decoder = ZstdDecoder::new(&mut decoded_buf); decoder.write_all(buf).await?; decoder.flush().await?; Bytes::from(decoded_buf) } None => buf.clone(), }; match format { InterpretedFormat::Bincode => { InterpretedWalRecords::des(&decompressed_buf).map_err(FromWireFormatError::Bincode) } InterpretedFormat::Protobuf => { let proto = proto::InterpretedWalRecords::decode(decompressed_buf) .map_err(|e| FromWireFormatError::Protobuf(e.into()))?; InterpretedWalRecords::try_from(proto) .map_err(|e| FromWireFormatError::Protobuf(e.into())) } } } } impl TryFrom for proto::InterpretedWalRecords { type Error = SerializeError; fn try_from(value: InterpretedWalRecords) -> Result { let records = value .records .into_iter() .map(proto::InterpretedWalRecord::try_from) .collect::, _>>()?; Ok(proto::InterpretedWalRecords { records, next_record_lsn: Some(value.next_record_lsn.0), raw_wal_start_lsn: value.raw_wal_start_lsn.map(|l| l.0), }) } } impl TryFrom for proto::InterpretedWalRecord { type Error = SerializeError; fn try_from(value: InterpretedWalRecord) -> Result { let metadata_record = value .metadata_record .map(|meta_rec| -> Result, Self::Error> { let mut buf = Vec::new(); meta_rec.ser_into(&mut buf)?; Ok(buf) }) .transpose()?; Ok(proto::InterpretedWalRecord { metadata_record, batch: Some(proto::SerializedValueBatch::from(value.batch)), next_record_lsn: value.next_record_lsn.0, flush_uncommitted: matches!(value.flush_uncommitted, FlushUncommittedRecords::Yes), xid: value.xid, }) } } impl From for proto::SerializedValueBatch { fn from(value: SerializedValueBatch) -> Self { proto::SerializedValueBatch { raw: value.raw, metadata: value .metadata .into_iter() 
.map(proto::ValueMeta::from) .collect(), max_lsn: value.max_lsn.0, len: value.len as u64, } } } impl From for proto::ValueMeta { fn from(value: ValueMeta) -> Self { match value { ValueMeta::Observed(obs) => proto::ValueMeta { r#type: proto::ValueMetaType::Observed.into(), key: Some(proto::CompactKey::from(obs.key)), lsn: obs.lsn.0, batch_offset: None, len: None, will_init: None, }, ValueMeta::Serialized(ser) => proto::ValueMeta { r#type: proto::ValueMetaType::Serialized.into(), key: Some(proto::CompactKey::from(ser.key)), lsn: ser.lsn.0, batch_offset: Some(ser.batch_offset), len: Some(ser.len as u64), will_init: Some(ser.will_init), }, } } } impl From for proto::CompactKey { fn from(value: CompactKey) -> Self { proto::CompactKey { high: (value.raw() >> 64) as u64, low: value.raw() as u64, } } } impl TryFrom for InterpretedWalRecords { type Error = TranscodeError; fn try_from(value: proto::InterpretedWalRecords) -> Result { let records = value .records .into_iter() .map(InterpretedWalRecord::try_from) .collect::>()?; Ok(InterpretedWalRecords { records, next_record_lsn: value .next_record_lsn .map(Lsn::from) .expect("Always provided"), raw_wal_start_lsn: value.raw_wal_start_lsn.map(Lsn::from), }) } } impl TryFrom for InterpretedWalRecord { type Error = TranscodeError; fn try_from(value: proto::InterpretedWalRecord) -> Result { let metadata_record = value .metadata_record .map(|mrec| -> Result<_, DeserializeError> { MetadataRecord::des(&mrec) }) .transpose()?; let batch = { let batch = value.batch.ok_or_else(|| { TranscodeError::BadInput("InterpretedWalRecord::batch missing".to_string()) })?; SerializedValueBatch::try_from(batch)? 
}; Ok(InterpretedWalRecord { metadata_record, batch, next_record_lsn: Lsn(value.next_record_lsn), flush_uncommitted: if value.flush_uncommitted { FlushUncommittedRecords::Yes } else { FlushUncommittedRecords::No }, xid: value.xid, }) } } impl TryFrom for SerializedValueBatch { type Error = TranscodeError; fn try_from(value: proto::SerializedValueBatch) -> Result { let metadata = value .metadata .into_iter() .map(ValueMeta::try_from) .collect::, _>>()?; Ok(SerializedValueBatch { raw: value.raw, metadata, max_lsn: Lsn(value.max_lsn), len: value.len as usize, }) } } impl TryFrom for ValueMeta { type Error = TranscodeError; fn try_from(value: proto::ValueMeta) -> Result { match proto::ValueMetaType::try_from(value.r#type) { Ok(proto::ValueMetaType::Serialized) => { Ok(ValueMeta::Serialized(SerializedValueMeta { key: value .key .ok_or_else(|| { TranscodeError::BadInput("ValueMeta::key missing".to_string()) })? .into(), lsn: Lsn(value.lsn), batch_offset: value.batch_offset.ok_or_else(|| { TranscodeError::BadInput("ValueMeta::batch_offset missing".to_string()) })?, len: value.len.ok_or_else(|| { TranscodeError::BadInput("ValueMeta::len missing".to_string()) })? as usize, will_init: value.will_init.ok_or_else(|| { TranscodeError::BadInput("ValueMeta::will_init missing".to_string()) })?, })) } Ok(proto::ValueMetaType::Observed) => Ok(ValueMeta::Observed(ObservedValueMeta { key: value .key .ok_or_else(|| TranscodeError::BadInput("ValueMeta::key missing".to_string()))? 
.into(), lsn: Lsn(value.lsn), })), Err(_) => Err(TranscodeError::BadInput(format!( "Unexpected ValueMeta::type {}", value.r#type ))), } } } impl From for CompactKey { fn from(value: proto::CompactKey) -> Self { (((value.high as i128) << 64) | (value.low as i128)).into() } } #[test] fn test_compact_key_with_large_relnode() { use pageserver_api::key::Key; let inputs = vec![ Key { field1: 0, field2: 0x100, field3: 0x200, field4: 0, field5: 0x10, field6: 0x5, }, Key { field1: 0, field2: 0x100, field3: 0x200, field4: 0x007FFFFF, field5: 0x10, field6: 0x5, }, Key { field1: 0, field2: 0x100, field3: 0x200, field4: 0x00800000, field5: 0x10, field6: 0x5, }, Key { field1: 0, field2: 0x100, field3: 0x200, field4: 0x00800001, field5: 0x10, field6: 0x5, }, Key { field1: 0, field2: 0xFFFFFFFF, field3: 0xFFFFFFFF, field4: 0xFFFFFFFF, field5: 0x0, field6: 0x0, }, ]; for input in inputs { assert!(input.is_valid_key_on_write_path()); let compact = input.to_compact(); let proto: proto::CompactKey = compact.into(); let from_proto: CompactKey = proto.into(); assert_eq!( compact, from_proto, "Round trip failed for key with relnode={:#x}", input.field4 ); } } ================================================ FILE: libs/walproposer/Cargo.toml ================================================ [package] name = "walproposer" version = "0.1.0" edition.workspace = true license.workspace = true [dependencies] anyhow.workspace = true utils.workspace = true postgres_ffi.workspace = true [build-dependencies] anyhow.workspace = true bindgen.workspace = true ================================================ FILE: libs/walproposer/bindgen_deps.h ================================================ #include "postgres.h" #include "walproposer.h" ================================================ FILE: libs/walproposer/build.rs ================================================ //! Links with walproposer, pgcommon, pgport and runs bindgen on walproposer.h //! to generate Rust bindings for it. 
use std::env; use std::path::PathBuf; use std::process::Command; use anyhow::{Context, anyhow}; const WALPROPOSER_PG_VERSION: &str = "v17"; fn main() -> anyhow::Result<()> { // Tell cargo to invalidate the built crate whenever the wrapper changes println!("cargo:rerun-if-changed=bindgen_deps.h"); let root_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../.."); // Finding the location of built libraries and Postgres C headers: // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/pg_install` // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/pg_install/{PG_MAJORVERSION}/include/postgresql/server` let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") { postgres_install_dir.into() } else { root_path.join("pg_install") }; let pg_install_abs = std::fs::canonicalize(pg_install_dir)?; let walproposer_lib_dir = root_path.join("build/walproposer-lib"); let walproposer_lib_search_str = walproposer_lib_dir .to_str() .ok_or(anyhow!("Bad non-UTF path"))?; let pgxn_neon = root_path.join("pgxn/neon"); let pgxn_neon = std::fs::canonicalize(pgxn_neon)?; let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?; println!("cargo:rustc-link-lib=static=walproposer"); println!("cargo:rustc-link-lib=static=pgport"); println!("cargo:rustc-link-lib=static=pgcommon"); println!("cargo:rustc-link-search={walproposer_lib_search_str}"); // Rebuild crate when libwalproposer.a changes println!("cargo:rerun-if-changed={walproposer_lib_search_str}/libwalproposer.a"); let pg_config_bin = pg_install_abs .join(WALPROPOSER_PG_VERSION) .join("bin") .join("pg_config"); let inc_server_path: String = if pg_config_bin.exists() { let output = Command::new(pg_config_bin) .arg("--includedir-server") .output() .context("failed to execute `pg_config --includedir-server`")?; if !output.status.success() { panic!("`pg_config --includedir-server` failed") } String::from_utf8(output.stdout) 
.context("pg_config output is not UTF-8")? .trim_end() .into() } else { let server_path = pg_install_abs .join(WALPROPOSER_PG_VERSION) .join("include") .join("postgresql") .join("server") .into_os_string(); server_path .into_string() .map_err(|s| anyhow!("Bad postgres server path {s:?}"))? }; let unwind_abi_functions = [ "log_internal", "recovery_download", "start_streaming", "finish_sync_safekeepers", "wait_event_set", "WalProposerStart", ]; // The bindgen::Builder is the main entry point // to bindgen, and lets you build up options for // the resulting bindings. let mut builder = bindgen::Builder::default() // The input header we would like to generate // bindings for. .header("bindgen_deps.h") // Tell cargo to invalidate the built crate whenever any of the // included header files changed. .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) .allowlist_type("WalProposer") .allowlist_type("WalProposerConfig") .allowlist_type("walproposer_api") .allowlist_function("WalProposerCreate") .allowlist_function("WalProposerStart") .allowlist_function("WalProposerBroadcast") .allowlist_function("WalProposerPoll") .allowlist_function("WalProposerFree") .allowlist_function("SafekeeperStateDesiredEvents") .allowlist_var("DEBUG5") .allowlist_var("DEBUG4") .allowlist_var("DEBUG3") .allowlist_var("DEBUG2") .allowlist_var("DEBUG1") .allowlist_var("LOG") .allowlist_var("INFO") .allowlist_var("NOTICE") .allowlist_var("WARNING") .allowlist_var("ERROR") .allowlist_var("FATAL") .allowlist_var("PANIC") .allowlist_var("PG_VERSION_NUM") .allowlist_var("WPEVENT") .allowlist_var("WL_LATCH_SET") .allowlist_var("WL_SOCKET_READABLE") .allowlist_var("WL_SOCKET_WRITEABLE") .allowlist_var("WL_TIMEOUT") .allowlist_var("WL_SOCKET_CLOSED") .allowlist_var("WL_SOCKET_MASK") .clang_arg("-DWALPROPOSER_LIB") .clang_arg(format!("-I{pgxn_neon}")) .clang_arg(format!("-I{inc_server_path}")); for name in unwind_abi_functions { builder = builder.override_abi(bindgen::Abi::CUnwind, name); } let bindings 
= builder // Finish the builder and generate the bindings. .generate() // Unwrap the Result and panic on failure. .expect("Unable to generate bindings"); // Write the bindings to the $OUT_DIR/bindings.rs file. let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs"); bindings .write_to_file(out_path) .expect("Couldn't write bindings!"); Ok(()) } ================================================ FILE: libs/walproposer/src/api_bindings.rs ================================================ //! A C-Rust shim: defines implementation of C walproposer API, assuming wp //! callback_data stores Box to some Rust implementation. #![allow(dead_code)] use std::ffi::{CStr, CString}; use crate::bindings::{ NeonWALReadResult, PGAsyncReadResult, PGAsyncWriteResult, Safekeeper, Size, StringInfoData, TimestampTz, WalProposer, WalProposerConnStatusType, WalProposerConnectPollStatusType, WalProposerExecStatusType, WalproposerShmemState, XLogRecPtr, uint32, walproposer_api, }; use crate::walproposer::{ApiImpl, StreamingCallback, WaitResult}; extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; (*api).get_shmem_state() } } extern "C-unwind" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; let callback = StreamingCallback::new(wp); (*api).start_streaming(startpos, &callback); } } extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; (*api).get_flush_rec_ptr() } } extern "C" fn update_donor(wp: *mut WalProposer, donor: *mut Safekeeper, donor_lsn: XLogRecPtr) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; (*api).update_donor(&mut (*donor), donor_lsn) } } extern "C" fn 
get_current_timestamp(wp: *mut WalProposer) -> TimestampTz { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; (*api).get_current_timestamp() } } extern "C" fn conn_error_message(sk: *mut Safekeeper) -> *mut ::std::os::raw::c_char { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; let msg = (*api).conn_error_message(&mut (*sk)); let msg = CString::new(msg).unwrap(); // TODO: fix leaking error message msg.into_raw() } } extern "C" fn conn_status(sk: *mut Safekeeper) -> WalProposerConnStatusType { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; (*api).conn_status(&mut (*sk)) } } extern "C" fn conn_connect_start(sk: *mut Safekeeper) { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; (*api).conn_connect_start(&mut (*sk)) } } extern "C" fn conn_connect_poll(sk: *mut Safekeeper) -> WalProposerConnectPollStatusType { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; (*api).conn_connect_poll(&mut (*sk)) } } extern "C" fn conn_send_query(sk: *mut Safekeeper, query: *mut ::std::os::raw::c_char) -> bool { let query = unsafe { CStr::from_ptr(query) }; let query = query.to_str().unwrap(); unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; (*api).conn_send_query(&mut (*sk), query) } } extern "C" fn conn_get_query_result(sk: *mut Safekeeper) -> WalProposerExecStatusType { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; (*api).conn_get_query_result(&mut (*sk)) } } extern "C" fn conn_flush(sk: *mut Safekeeper) -> ::std::os::raw::c_int { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; (*api).conn_flush(&mut (*sk)) } } extern "C" fn conn_finish(sk: *mut Safekeeper) { 
unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; (*api).conn_finish(&mut (*sk)) } } extern "C" fn conn_async_read( sk: *mut Safekeeper, buf: *mut *mut ::std::os::raw::c_char, amount: *mut ::std::os::raw::c_int, ) -> PGAsyncReadResult { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; // This function has guarantee that returned buf will be valid until // the next call. So we can store a Vec in each Safekeeper and reuse // it on the next call. let mut inbuf = take_vec_u8(&mut (*sk).inbuf).unwrap_or_default(); inbuf.clear(); let result = (*api).conn_async_read(&mut (*sk), &mut inbuf); // Put a Vec back to sk->inbuf and return data ptr. *amount = inbuf.len() as i32; *buf = store_vec_u8(&mut (*sk).inbuf, inbuf); result } } extern "C" fn conn_async_write( sk: *mut Safekeeper, buf: *const ::std::os::raw::c_void, size: usize, ) -> PGAsyncWriteResult { unsafe { let buf = std::slice::from_raw_parts(buf as *const u8, size); let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; (*api).conn_async_write(&mut (*sk), buf) } } extern "C" fn conn_blocking_write( sk: *mut Safekeeper, buf: *const ::std::os::raw::c_void, size: usize, ) -> bool { unsafe { let buf = std::slice::from_raw_parts(buf as *const u8, size); let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; (*api).conn_blocking_write(&mut (*sk), buf) } } extern "C-unwind" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; // currently `recovery_download` is always called right after election (*api).after_election(&mut (*wp)); (*api).recovery_download(&mut (*wp), &mut (*sk)) } } extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = 
callback_data as *mut Box; (*api).wal_reader_allocate(&mut (*sk)); } } #[allow(clippy::unnecessary_cast)] extern "C" fn wal_read( sk: *mut Safekeeper, buf: *mut ::std::os::raw::c_char, startptr: XLogRecPtr, count: Size, _errmsg: *mut *mut ::std::os::raw::c_char, ) -> NeonWALReadResult { unsafe { let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count); let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; // TODO: errmsg is not forwarded (*api).wal_read(&mut (*sk), buf, startptr) } } extern "C" fn wal_reader_events(sk: *mut Safekeeper) -> uint32 { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; (*api).wal_reader_events(&mut (*sk)) } } extern "C" fn init_event_set(wp: *mut WalProposer) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; (*api).init_event_set(&mut (*wp)); } } extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; (*api).update_event_set(&mut (*sk), events); } } extern "C" fn active_state_update_event_set(sk: *mut Safekeeper) { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; (*api).active_state_update_event_set(&mut (*sk)); } } extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; (*api).add_safekeeper_event_set(&mut (*sk), events); } } extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; (*api).rm_safekeeper_event_set(&mut (*sk)); } } extern "C-unwind" fn wait_event_set( wp: *mut WalProposer, timeout: ::std::os::raw::c_long, event_sk: *mut *mut Safekeeper, events: *mut uint32, ) -> 
::std::os::raw::c_int { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; let result = (*api).wait_event_set(&mut (*wp), timeout); match result { WaitResult::Latch => { *event_sk = std::ptr::null_mut(); *events = crate::bindings::WL_LATCH_SET; 1 } WaitResult::Timeout => { *event_sk = std::ptr::null_mut(); // WaitEventSetWait returns 0 for timeout. *events = 0; 0 } WaitResult::Network(sk, event_mask) => { *event_sk = sk; *events = event_mask; 1 } } } } extern "C" fn strong_random( wp: *mut WalProposer, buf: *mut ::std::os::raw::c_void, len: usize, ) -> bool { unsafe { let buf = std::slice::from_raw_parts_mut(buf as *mut u8, len); let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; (*api).strong_random(buf) } } extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; (*api).get_redo_start_lsn() } } unsafe extern "C-unwind" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) -> ! 
{ unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; (*api).finish_sync_safekeepers(lsn) } } extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekeeper) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; (*api).process_safekeeper_feedback(&mut (*wp), &mut (*sk)); } } extern "C-unwind" fn log_internal( wp: *mut WalProposer, level: ::std::os::raw::c_int, line: *const ::std::os::raw::c_char, ) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; let line = CStr::from_ptr(line); let line = line.to_str().unwrap(); (*api).log_internal(&mut (*wp), Level::from(level as u32), line) } } /* BEGIN_HADRON */ extern "C" fn reset_safekeeper_statuses_for_metrics(wp: *mut WalProposer, num_safekeepers: u32) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; if api.is_null() { return; } (*api).reset_safekeeper_statuses_for_metrics(&mut (*wp), num_safekeepers); } } extern "C" fn update_safekeeper_status_for_metrics( wp: *mut WalProposer, sk_index: u32, status: u8, ) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; if api.is_null() { return; } (*api).update_safekeeper_status_for_metrics(&mut (*wp), sk_index, status); } } /* END_HADRON */ #[derive(Debug, PartialEq)] pub enum Level { Debug5, Debug4, Debug3, Debug2, Debug1, Log, Info, Notice, Warning, Error, Fatal, Panic, WPEvent, } impl Level { pub fn from(elevel: u32) -> Level { use crate::bindings::*; match elevel { DEBUG5 => Level::Debug5, DEBUG4 => Level::Debug4, DEBUG3 => Level::Debug3, DEBUG2 => Level::Debug2, DEBUG1 => Level::Debug1, LOG => Level::Log, INFO => Level::Info, NOTICE => Level::Notice, WARNING => Level::Warning, ERROR => Level::Error, FATAL => Level::Fatal, PANIC => Level::Panic, WPEVENT => Level::WPEvent, _ => panic!("unknown log level {elevel}"), } } } 
/// Builds the callback table handed to the C walproposer, with every slot
/// pointing at the shim function of the same name in this module.
pub(crate) fn create_api() -> walproposer_api {
    walproposer_api {
        get_shmem_state: Some(get_shmem_state),
        start_streaming: Some(start_streaming),
        get_flush_rec_ptr: Some(get_flush_rec_ptr),
        update_donor: Some(update_donor),
        get_current_timestamp: Some(get_current_timestamp),
        conn_error_message: Some(conn_error_message),
        conn_status: Some(conn_status),
        conn_connect_start: Some(conn_connect_start),
        conn_connect_poll: Some(conn_connect_poll),
        conn_send_query: Some(conn_send_query),
        conn_get_query_result: Some(conn_get_query_result),
        conn_flush: Some(conn_flush),
        conn_finish: Some(conn_finish),
        conn_async_read: Some(conn_async_read),
        conn_async_write: Some(conn_async_write),
        conn_blocking_write: Some(conn_blocking_write),
        recovery_download: Some(recovery_download),
        wal_reader_allocate: Some(wal_reader_allocate),
        wal_read: Some(wal_read),
        wal_reader_events: Some(wal_reader_events),
        init_event_set: Some(init_event_set),
        update_event_set: Some(update_event_set),
        active_state_update_event_set: Some(active_state_update_event_set),
        add_safekeeper_event_set: Some(add_safekeeper_event_set),
        rm_safekeeper_event_set: Some(rm_safekeeper_event_set),
        wait_event_set: Some(wait_event_set),
        strong_random: Some(strong_random),
        get_redo_start_lsn: Some(get_redo_start_lsn),
        finish_sync_safekeepers: Some(finish_sync_safekeepers),
        process_safekeeper_feedback: Some(process_safekeeper_feedback),
        log_internal: Some(log_internal),
        /* BEGIN_HADRON */
        reset_safekeeper_statuses_for_metrics: Some(reset_safekeeper_statuses_for_metrics),
        update_safekeeper_status_for_metrics: Some(update_safekeeper_status_for_metrics),
        /* END_HADRON */
    }
}

/// Returns a `WalproposerShmemState` with every field zeroed or `false`.
pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
    let empty_feedback = crate::bindings::PageserverFeedback {
        present: false,
        currentClusterSize: 0,
        last_received_lsn: 0,
        disk_consistent_lsn: 0,
        remote_consistent_lsn: 0,
        replytime: 0,
        shard_number: 0,
        corruption_detected: false,
    };

    let empty_wal_rate_limiter = crate::bindings::WalRateLimiter {
        effective_max_wal_bytes_per_second: crate::bindings::pg_atomic_uint32 { value: 0 },
        should_limit: crate::bindings::pg_atomic_uint32 { value: 0 },
        sent_bytes: 0,
        batch_start_time_us: crate::bindings::pg_atomic_uint64 { value: 0 },
        batch_end_time_us: crate::bindings::pg_atomic_uint64 { value: 0 },
    };

    crate::bindings::WalproposerShmemState {
        propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 },
        donor_name: [0; 64],
        donor_conninfo: [0; 1024],
        donor_lsn: 0,
        mutex: 0,
        mineLastElectedTerm: crate::bindings::pg_atomic_uint64 { value: 0 },
        backpressureThrottlingTime: crate::bindings::pg_atomic_uint64 { value: 0 },
        currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 },
        shard_ps_feedback: [empty_feedback; 128],
        num_shards: 0,
        replica_promote: false,
        min_ps_feedback: empty_feedback,
        wal_rate_limiter: empty_wal_rate_limiter,
        num_safekeepers: 0,
        safekeeper_status: [0; 32],
    }
}

impl std::fmt::Display for Level {
    /// Formats the level using its `Debug` representation (the variant name).
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "{self:?}")
    }
}

/// Take ownership of `Vec<u8>` from StringInfoData.
///
/// Returns `None` when no buffer is stored (`data` is null). Otherwise the
/// StringInfoData is reset to the empty state and the buffer is returned.
#[allow(clippy::unnecessary_cast)]
pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
    if pg.data.is_null() {
        return None;
    }

    let ptr = pg.data as *mut u8;
    let length = pg.len as usize;
    let capacity = pg.maxlen as usize;

    // Clear the fields first so the buffer has exactly one owner afterwards.
    pg.data = std::ptr::null_mut();
    pg.len = 0;
    pg.maxlen = 0;

    // SAFETY: relies on `data`/`len`/`maxlen` having been written by
    // `store_vec_u8` from a `Vec<u8>` whose ownership was given up.
    unsafe { Some(Vec::from_raw_parts(ptr, length, capacity)) }
}

/// Store `Vec<u8>` in StringInfoData.
///
/// Records the vector's pointer, length and capacity in `pg.data`, `pg.len`
/// and `pg.maxlen`, then `mem::forget`s the vector so its buffer is not freed
/// here. Returns the stored data pointer.
///
/// Panics if `pg.data` is not null on entry.
///
/// NOTE(review): `len` and `capacity` are narrowed with `as i32`, which wraps
/// silently for values above `i32::MAX`.
// NOTE(review): the `vec` parameter's generic argument appears to have been
// stripped from this copy of the file (likely `Vec<u8>`) -- confirm.
fn store_vec_u8(pg: &mut StringInfoData, vec: Vec) -> *mut ::std::os::raw::c_char {
    let ptr = vec.as_ptr() as *mut ::std::os::raw::c_char;
    let length = vec.len();
    let capacity = vec.capacity();

    // Refuse to overwrite (and thereby leak) a buffer that is already stored.
    assert!(pg.data.is_null());

    pg.data = ptr;
    pg.len = length as i32;
    pg.maxlen = capacity as i32;
    // Ownership of the allocation now lives in `pg`; do not run Vec's destructor.
    std::mem::forget(vec);

    ptr
}

================================================
FILE: libs/walproposer/src/lib.rs
================================================
// Generated bindings, pulled in from the build output directory (`OUT_DIR`).
pub mod bindings {
    #![allow(non_upper_case_globals)]
    #![allow(non_camel_case_types)]
    #![allow(non_snake_case)]
    // bindgen creates some unsafe code with no doc comments.
    #![allow(clippy::missing_safety_doc)]
    // noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code.
    #![allow(clippy::useless_transmute)]

    include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
}

pub mod api_bindings;
pub mod walproposer;

================================================
FILE: libs/walproposer/src/walproposer.rs
================================================
#![allow(clippy::todo)]

use std::ffi::CString;
use std::str::FromStr;

use postgres_ffi::WAL_SEGMENT_SIZE;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;

use crate::api_bindings::{Level, create_api, take_vec_u8};
use crate::bindings::{
    NeonWALReadResult, Safekeeper, WalProposer, WalProposerBroadcast, WalProposerConfig,
    WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart,
};

/// Rust high-level wrapper for C walproposer API. Many methods are not required
/// for simple cases, hence todo!() in default implementations.
///
/// Refer to `pgxn/neon/walproposer.h` for documentation.
pub trait ApiImpl {
    // Every default below panics via `todo!()` unless noted otherwise, so an
    // implementation only needs to override the callbacks its scenario uses.
    // Method names match the fields of the `walproposer_api` table.

    fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState {
        todo!()
    }

    fn start_streaming(&self, _startpos: u64, _callback: &StreamingCallback) {
        todo!()
    }

    fn get_flush_rec_ptr(&self) -> u64 {
        todo!()
    }

    fn update_donor(&self, _donor: &mut Safekeeper, _donor_lsn: u64) {
        todo!()
    }

    fn get_current_timestamp(&self) -> i64 {
        todo!()
    }

    // --- safekeeper connection callbacks ---

    fn conn_error_message(&self, _sk: &mut Safekeeper) -> String {
        todo!()
    }

    fn conn_status(&self, _sk: &mut Safekeeper) -> crate::bindings::WalProposerConnStatusType {
        todo!()
    }

    fn conn_connect_start(&self, _sk: &mut Safekeeper) {
        todo!()
    }

    fn conn_connect_poll(
        &self,
        _sk: &mut Safekeeper,
    ) -> crate::bindings::WalProposerConnectPollStatusType {
        todo!()
    }

    fn conn_send_query(&self, _sk: &mut Safekeeper, _query: &str) -> bool {
        todo!()
    }

    fn conn_get_query_result(
        &self,
        _sk: &mut Safekeeper,
    ) -> crate::bindings::WalProposerExecStatusType {
        todo!()
    }

    fn conn_flush(&self, _sk: &mut Safekeeper) -> i32 {
        todo!()
    }

    fn conn_finish(&self, _sk: &mut Safekeeper) {
        todo!()
    }

    // NOTE(review): the generic argument of `_vec` appears to have been
    // stripped from this copy of the file (likely `Vec<u8>`) -- confirm.
    fn conn_async_read(
        &self,
        _sk: &mut Safekeeper,
        _vec: &mut Vec,
    ) -> crate::bindings::PGAsyncReadResult {
        todo!()
    }

    fn conn_async_write(
        &self,
        _sk: &mut Safekeeper,
        _buf: &[u8],
    ) -> crate::bindings::PGAsyncWriteResult {
        todo!()
    }

    fn conn_blocking_write(&self, _sk: &mut Safekeeper, _buf: &[u8]) -> bool {
        todo!()
    }

    // --- WAL recovery / reading callbacks ---

    fn recovery_download(&self, _wp: &mut WalProposer, _sk: &mut Safekeeper) -> bool {
        todo!()
    }

    fn wal_reader_allocate(&self, _sk: &mut Safekeeper) -> NeonWALReadResult {
        todo!()
    }

    fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) -> NeonWALReadResult {
        todo!()
    }

    fn wal_reader_events(&self, _sk: &mut Safekeeper) -> u32 {
        todo!()
    }

    // --- event set callbacks ---

    fn init_event_set(&self, _wp: &mut WalProposer) {
        todo!()
    }

    fn update_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
        todo!()
    }

    fn active_state_update_event_set(&self, _sk: &mut Safekeeper) {
        todo!()
    }

    fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
        todo!()
    }

    fn rm_safekeeper_event_set(&self, _sk: &mut Safekeeper) {
        todo!()
    }

    fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
        todo!()
    }

    // --- miscellaneous callbacks ---

    fn strong_random(&self, _buf: &mut [u8]) -> bool {
        todo!()
    }

    fn get_redo_start_lsn(&self) -> u64 {
        todo!()
    }

    // Diverging (`-> !`): an implementation must not return to the caller.
    fn finish_sync_safekeepers(&self, _lsn: u64) -> ! {
        todo!()
    }

    // The only callback in this trait that takes `&mut self`.
    fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer, _sk: &mut Safekeeper) {
        todo!()
    }

    fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
        todo!()
    }

    // NOTE(review): `create_api` in api_bindings.rs registers no
    // `after_election` callback, so the C side presumably never reaches this
    // method -- confirm whether it is still needed.
    fn after_election(&self, _wp: &mut WalProposer) {
        todo!()
    }

    /* BEGIN_HADRON */
    // Unlike the methods above, these two default to a no-op instead of `todo!()`.
    fn reset_safekeeper_statuses_for_metrics(&self, _wp: &mut WalProposer, _num_safekeepers: u32) {
        // Do nothing for testing purposes.
    }

    fn update_safekeeper_status_for_metrics(
        &self,
        _wp: &mut WalProposer,
        _sk_index: u32,
        _status: u8,
    ) {
        // Do nothing for testing purposes.
    }
    /* END_HADRON */
}

/// Outcome returned by [`ApiImpl::wait_event_set`].
#[derive(Debug)]
pub enum WaitResult {
    // NOTE(review): presumably "the process latch was set" -- confirm against walproposer.h.
    Latch,
    // NOTE(review): presumably "the wait timed out" -- confirm against walproposer.h.
    Timeout,
    // A safekeeper pointer plus a `u32` event mask for that safekeeper.
    Network(*mut Safekeeper, u32),
}

/// Rust-side settings that `Wrapper::new` converts into the C `WalProposerConfig`.
// NOTE(review): the generic argument of `safekeepers_list` appears to have been
// stripped from this copy of the file (likely `Vec<String>`) -- confirm.
#[derive(Clone)]
pub struct Config {
    /// Tenant and timeline id
    pub ttid: TenantTimelineId,
    /// List of safekeepers in format `host:port`
    pub safekeepers_list: Vec,
    /// libpq connection info options
    pub safekeeper_conninfo_options: String,
    /// Safekeeper reconnect timeout in milliseconds
    pub safekeeper_reconnect_timeout: i32,
    /// Safekeeper connection timeout in milliseconds
    pub safekeeper_connection_timeout: i32,
    /// walproposer mode, finish when all safekeepers are synced or subscribe
    /// to WAL streaming
    pub sync_safekeepers: bool,
}

/// WalProposer main struct. C methods are reexported as Rust functions.
pub struct Wrapper { wp: *mut WalProposer, _safekeepers_list_vec: Vec, } impl Wrapper { pub fn new(api: Box, config: Config) -> Wrapper { let neon_tenant = CString::new(config.ttid.tenant_id.to_string()) .unwrap() .into_raw(); let neon_timeline = CString::new(config.ttid.timeline_id.to_string()) .unwrap() .into_raw(); let mut safekeepers_list_vec = CString::new(config.safekeepers_list.join(",")) .unwrap() .into_bytes_with_nul(); assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity()); let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char; let safekeeper_conninfo_options = CString::from_str(&config.safekeeper_conninfo_options) .unwrap() .into_raw(); let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void; let c_config = WalProposerConfig { neon_tenant, neon_timeline, safekeepers_list, safekeeper_conninfo_options, safekeeper_reconnect_timeout: config.safekeeper_reconnect_timeout, safekeeper_connection_timeout: config.safekeeper_connection_timeout, wal_segment_size: WAL_SEGMENT_SIZE as i32, // default 16MB syncSafekeepers: config.sync_safekeepers, systemId: 0, pgTimeline: 1, proto_version: 3, callback_data, }; let c_config = Box::into_raw(Box::new(c_config)); let api = create_api(); let wp = unsafe { WalProposerCreate(c_config, api) }; Wrapper { wp, _safekeepers_list_vec: safekeepers_list_vec, } } pub fn start(&self) { unsafe { WalProposerStart(self.wp) } } } impl Drop for Wrapper { fn drop(&mut self) { unsafe { let config = (*self.wp).config; drop(Box::from_raw( (*config).callback_data as *mut Box, )); drop(CString::from_raw((*config).neon_tenant)); drop(CString::from_raw((*config).neon_timeline)); drop(Box::from_raw(config)); for i in 0..(*self.wp).n_safekeepers { let sk = &mut (*self.wp).safekeeper[i as usize]; take_vec_u8(&mut sk.inbuf); } WalProposerFree(self.wp); } } } pub struct StreamingCallback { wp: *mut WalProposer, } impl StreamingCallback { pub fn new(wp: *mut WalProposer) -> 
StreamingCallback { StreamingCallback { wp } } pub fn broadcast(&self, startpos: Lsn, endpos: Lsn) { unsafe { WalProposerBroadcast(self.wp, startpos.0, endpos.0) } } pub fn poll(&self) { unsafe { WalProposerPoll(self.wp) } } } #[cfg(test)] mod tests { use core::panic; use std::cell::{Cell, UnsafeCell}; use std::ffi::CString; use std::sync::atomic::AtomicUsize; use std::sync::mpsc::sync_channel; use utils::id::TenantTimelineId; use super::ApiImpl; use crate::api_bindings::Level; use crate::bindings::{NeonWALReadResult, PG_VERSION_NUM}; use crate::walproposer::Wrapper; #[derive(Clone, Copy, Debug)] struct WaitEventsData { sk: *mut crate::bindings::Safekeeper, event_mask: u32, } struct MockImpl { // data to return from wait_event_set wait_events: Cell, // walproposer->safekeeper messages expected_messages: Vec>, expected_ptr: AtomicUsize, // safekeeper->walproposer messages safekeeper_replies: Vec>, replies_ptr: AtomicUsize, // channel to send LSN to the main thread sync_channel: std::sync::mpsc::SyncSender, // Shmem state, used for storing donor info shmem: UnsafeCell, } impl MockImpl { fn check_walproposer_msg(&self, msg: &[u8]) { let ptr = self .expected_ptr .fetch_add(1, std::sync::atomic::Ordering::SeqCst); if ptr >= self.expected_messages.len() { panic!("unexpected message from walproposer"); } let expected_msg = &self.expected_messages[ptr]; assert_eq!(msg, expected_msg.as_slice()); } fn next_safekeeper_reply(&self) -> &[u8] { let ptr = self .replies_ptr .fetch_add(1, std::sync::atomic::Ordering::SeqCst); if ptr >= self.safekeeper_replies.len() { panic!("no more safekeeper replies"); } &self.safekeeper_replies[ptr] } } impl ApiImpl for MockImpl { fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState { self.shmem.get() } fn get_current_timestamp(&self) -> i64 { println!("get_current_timestamp"); 0 } fn update_donor(&self, donor: &mut crate::bindings::Safekeeper, donor_lsn: u64) { let mut shmem = unsafe { *self.get_shmem_state() }; 
shmem.propEpochStartLsn.value = donor_lsn; shmem.donor_conninfo = donor.conninfo; shmem.donor_lsn = donor_lsn; } fn conn_status( &self, _: &mut crate::bindings::Safekeeper, ) -> crate::bindings::WalProposerConnStatusType { println!("conn_status"); crate::bindings::WalProposerConnStatusType_WP_CONNECTION_OK } fn conn_connect_start(&self, _: &mut crate::bindings::Safekeeper) { println!("conn_connect_start"); } fn conn_connect_poll( &self, _: &mut crate::bindings::Safekeeper, ) -> crate::bindings::WalProposerConnectPollStatusType { println!("conn_connect_poll"); crate::bindings::WalProposerConnectPollStatusType_WP_CONN_POLLING_OK } fn conn_send_query(&self, _: &mut crate::bindings::Safekeeper, query: &str) -> bool { println!("conn_send_query: {query}"); true } fn conn_get_query_result( &self, _: &mut crate::bindings::Safekeeper, ) -> crate::bindings::WalProposerExecStatusType { println!("conn_get_query_result"); crate::bindings::WalProposerExecStatusType_WP_EXEC_SUCCESS_COPYBOTH } fn conn_async_read( &self, _: &mut crate::bindings::Safekeeper, vec: &mut Vec, ) -> crate::bindings::PGAsyncReadResult { println!("conn_async_read"); let reply = self.next_safekeeper_reply(); println!("conn_async_read result: {reply:?}"); vec.extend_from_slice(reply); crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS } fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool { println!("conn_blocking_write: {buf:?}"); self.check_walproposer_msg(buf); true } fn recovery_download( &self, _wp: &mut crate::bindings::WalProposer, _sk: &mut crate::bindings::Safekeeper, ) -> bool { true } fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) -> NeonWALReadResult { println!("wal_reader_allocate"); crate::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS } fn init_event_set(&self, _: &mut crate::bindings::WalProposer) { println!("init_event_set") } fn update_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) { println!( 
"update_event_set, sk={:?}, events_mask={:#b}", sk as *mut crate::bindings::Safekeeper, event_mask ); self.wait_events.set(WaitEventsData { sk, event_mask }); } fn add_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) { println!( "add_safekeeper_event_set, sk={:?}, events_mask={:#b}", sk as *mut crate::bindings::Safekeeper, event_mask ); self.wait_events.set(WaitEventsData { sk, event_mask }); } fn rm_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper) { println!( "rm_safekeeper_event_set, sk={:?}", sk as *mut crate::bindings::Safekeeper ); } fn wait_event_set( &self, _: &mut crate::bindings::WalProposer, timeout_millis: i64, ) -> super::WaitResult { let data = self.wait_events.get(); println!("wait_event_set, timeout_millis={timeout_millis}, res={data:?}"); super::WaitResult::Network(data.sk, data.event_mask) } fn strong_random(&self, buf: &mut [u8]) -> bool { println!("strong_random"); buf.fill(0); true } fn finish_sync_safekeepers(&self, lsn: u64) -> ! { self.sync_channel.send(lsn).unwrap(); panic!("sync safekeepers finished at lsn={}", lsn); } fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) { println!("wp_log[{level}] {msg}"); } fn after_election(&self, _wp: &mut crate::bindings::WalProposer) { println!("after_election"); } } /// Test that walproposer can successfully connect to safekeeper and finish /// sync_safekeepers. API is mocked in MockImpl. /// /// Run this test with valgrind to detect leaks: /// `valgrind --leak-check=full target/debug/deps/walproposer-` #[test] fn test_simple_sync_safekeepers() -> anyhow::Result<()> { let ttid = TenantTimelineId::new( "9e4c8f36063c6c6e93bc20d65a820f3d".parse()?, "9e4c8f36063c6c6e93bc20d65a820f3d".parse()?, ); let (sender, receiver) = sync_channel(1); // Messages definitions are at walproposer.h // xxx: it would be better to extract them from safekeeper crate and // use serialization/deserialization here. 
let greeting_tag = (b'g').to_be_bytes(); let tenant_id = CString::new(ttid.tenant_id.to_string()) .unwrap() .into_bytes_with_nul(); let timeline_id = CString::new(ttid.timeline_id.to_string()) .unwrap() .into_bytes_with_nul(); let mconf_gen = 0_u32.to_be_bytes(); let mconf_members_len = 0_u32.to_be_bytes(); let mconf_members_new_len = 0_u32.to_be_bytes(); let pg_version: [u8; 4] = PG_VERSION_NUM.to_be_bytes(); let system_id = 0_u64.to_be_bytes(); let wal_seg_size = 16777216_u32.to_be_bytes(); let proposer_greeting = [ greeting_tag.as_slice(), tenant_id.as_slice(), timeline_id.as_slice(), mconf_gen.as_slice(), mconf_members_len.as_slice(), mconf_members_new_len.as_slice(), pg_version.as_slice(), system_id.as_slice(), wal_seg_size.as_slice(), ] .concat(); let voting_tag = (b'v').to_be_bytes(); let vote_request_term = 3_u64.to_be_bytes(); let vote_request = [ voting_tag.as_slice(), mconf_gen.as_slice(), vote_request_term.as_slice(), ] .concat(); let acceptor_greeting_term = 2_u64.to_be_bytes(); let acceptor_greeting_node_id = 1_u64.to_be_bytes(); let acceptor_greeting = [ greeting_tag.as_slice(), acceptor_greeting_node_id.as_slice(), mconf_gen.as_slice(), mconf_members_len.as_slice(), mconf_members_new_len.as_slice(), acceptor_greeting_term.as_slice(), ] .concat(); let vote_response_term = 3_u64.to_be_bytes(); let vote_given = 1_u8.to_be_bytes(); let flush_lsn = 0x539_u64.to_be_bytes(); let truncate_lsn = 0x539_u64.to_be_bytes(); let th_len = 1_u32.to_be_bytes(); let th_term = 2_u64.to_be_bytes(); let th_lsn = 0x539_u64.to_be_bytes(); let vote_response = [ voting_tag.as_slice(), mconf_gen.as_slice(), vote_response_term.as_slice(), vote_given.as_slice(), flush_lsn.as_slice(), truncate_lsn.as_slice(), th_len.as_slice(), th_term.as_slice(), th_lsn.as_slice(), ] .concat(); let my_impl: Box = Box::new(MockImpl { wait_events: Cell::new(WaitEventsData { sk: std::ptr::null_mut(), event_mask: 0, }), expected_messages: vec![proposer_greeting, vote_request], expected_ptr: 
AtomicUsize::new(0), safekeeper_replies: vec![acceptor_greeting, vote_response], replies_ptr: AtomicUsize::new(0), sync_channel: sender, shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()), }); let config = crate::walproposer::Config { ttid, safekeepers_list: vec!["localhost:5000".to_string()], safekeeper_conninfo_options: String::new(), safekeeper_reconnect_timeout: 1000, safekeeper_connection_timeout: 10000, sync_safekeepers: true, }; let wp = Wrapper::new(my_impl, config); // walproposer will panic when it finishes sync_safekeepers std::panic::catch_unwind(|| wp.start()).unwrap_err(); // validate the resulting LSN assert_eq!(receiver.try_recv(), Ok(1337)); Ok(()) // drop() will free up resources here } } ================================================ FILE: pageserver/Cargo.toml ================================================ [package] name = "pageserver" version = "0.1.0" edition = "2024" license.workspace = true [features] default = [] # Enables test-only APIs, incuding failpoints. 
In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"] # Direct IO alignment options (propagated to pageserver_api) io-align-512 = ["pageserver_api/io-align-512"] io-align-4k = ["pageserver_api/io-align-4k"] fuzz-read-path = ["testing"] # Enables benchmarking only APIs benchmarking = [] [dependencies] anyhow.workspace = true arc-swap.workspace = true async-compression.workspace = true async-stream.workspace = true bincode.workspace = true bit_field.workspace = true byteorder.workspace = true bytes.workspace = true camino-tempfile.workspace = true camino.workspace = true chrono = { workspace = true, features = ["serde"] } clap = { workspace = true, features = ["string"] } consumption_metrics.workspace = true crc32c.workspace = true either.workspace = true enum-map.workspace = true enumset = { workspace = true, features = ["serde"]} fail.workspace = true futures.workspace = true hashlink.workspace = true hex.workspace = true http.workspace = true http-utils.workspace = true humantime-serde.workspace = true humantime.workspace = true hyper0.workspace = true itertools.workspace = true jsonwebtoken.workspace = true md5.workspace = true metrics.workspace = true nix.workspace = true num_cpus.workspace = true # hack to get the number of worker threads tokio uses num-traits.workspace = true once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true # for ResponseErrorMessageExt TODO refactor that pageserver_compaction.workspace = true pageserver_page_api.workspace = true pem.workspace = true pin-project-lite.workspace = true postgres_backend.workspace = true postgres_connection.workspace = true postgres_ffi.workspace = true postgres_ffi_types.workspace = true postgres_initdb.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true
posthog_client_lite.workspace = true pprof.workspace = true pq_proto.workspace = true rand.workspace = true range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true remote_storage.workspace = true reqwest.workspace = true rpds.workspace = true rustls.workspace = true scopeguard.workspace = true send-future.workspace = true serde_json = { workspace = true, features = ["raw_value"] } serde_path_to_error.workspace = true serde_with.workspace = true serde.workspace = true smallvec.workspace = true storage_broker.workspace = true strum_macros.workspace = true strum.workspace = true sysinfo.workspace = true tenant_size_model.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } tokio-epoll-uring.workspace = true tokio-io-timeout.workspace = true tokio-postgres.workspace = true tokio-rustls.workspace = true tokio-stream.workspace = true tokio-tar.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tonic.workspace = true tonic-reflection.workspace = true tower.workspace = true tracing.workspace = true tracing-utils.workspace = true url.workspace = true utils.workspace = true wal_decoder.workspace = true walkdir.workspace = true workspace_hack.workspace = true twox-hash.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true [dev-dependencies] base64.workspace = true criterion.workspace = true hex-literal.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } indoc.workspace = true uuid.workspace = true rstest.workspace = true [[bench]] name = "bench_layer_map" harness = false [[bench]] name = "bench_walredo" harness = false [[bench]] name = "bench_ingest" harness = false required-features = ["benchmarking"] [[bench]] name = "upload_queue" harness = false [[bench]] name = 
"bench_metrics" harness = false [[bin]] name = "test_helper_slow_client_reads" required-features = [ "testing" ] ================================================ FILE: pageserver/benches/README.md ================================================ ## Pageserver Benchmarks # How to run To run all benchmarks: `cargo bench` To run a specific file: `cargo bench --bench bench_layer_map` To run a specific function: `cargo bench --bench bench_layer_map -- real_map_uniform_queries` ================================================ FILE: pageserver/benches/bench_ingest.rs ================================================ use std::env; use std::num::NonZeroUsize; use std::sync::Arc; use bytes::Bytes; use camino::Utf8PathBuf; use criterion::{Criterion, criterion_group, criterion_main}; use futures::stream::FuturesUnordered; use pageserver::config::PageServerConf; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::keyspace::KeySpace; use pageserver::l0_flush::{L0FlushConfig, L0FlushGlobalState}; use pageserver::task_mgr::TaskKind; use pageserver::tenant::storage_layer::IoConcurrency; use pageserver::tenant::storage_layer::{InMemoryLayer, ValuesReconstructState}; use pageserver::{page_cache, virtual_file}; use pageserver_api::config::GetVectoredConcurrentIo; use pageserver_api::key::Key; use pageserver_api::models::virtual_file::IoMode; use pageserver_api::shard::TenantShardId; use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; use utils::bin_ser::BeSer; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use utils::sync::gate::Gate; use wal_decoder::models::value::Value; use wal_decoder::serialized_batch::SerializedValueBatch; // A very cheap hash for generating non-sequential keys. 
// The shift/multiply sequence and constants below are the 32-bit finalizer
// ("fmix32") of MurmurHash3: xor-shift 16, multiply, xor-shift 13, multiply,
// xor-shift 16. Multiplications wrap on overflow.
fn murmurhash32(mut h: u32) -> u32 {
    h ^= h >> 16;
    h = h.wrapping_mul(0x85ebca6b);
    h ^= h >> 13;
    h = h.wrapping_mul(0xc2b2ae35);
    h ^= h >> 16;
    h
}

/// How `ingest` derives `key.field6` for the i-th put.
#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)]
enum KeyLayout {
    /// Sequential unique keys
    Sequential,
    /// Random unique keys
    Random,
    /// Random keys, but only use the bits from the mask of them
    RandomReuse(u32),
}

/// Whether `ingest` also writes the frozen layer to disk (and then removes the file).
#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)]
enum WriteDelta {
    Yes,
    No,
}

/// Whether `ingest` runs a background task that reads from the layer while it is being written.
#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)]
enum ConcurrentReads {
    Yes,
    No,
}

/// Write `put_count` image values of `put_size` bytes into a fresh
/// `InMemoryLayer`, in batches of `BATCH_SIZE`, then freeze the layer.
///
/// * `key_layout` selects how each key's `field6` is generated.
/// * `write_delta == Yes` additionally writes the frozen layer to disk and
///   deletes the resulting file.
/// * `concurrent_reads == Yes` spawns a task that repeatedly reads the
///   `READ_BATCH_SIZE` keys ending at the most recently written key; this mode
///   asserts that `key_layout` is `Sequential`.
///
/// Errors from directory creation, layer creation, `put_batch`,
/// `write_to_disk` and file removal are propagated; several other failures
/// panic via `unwrap`.
async fn ingest(
    conf: &'static PageServerConf,
    put_size: usize,
    put_count: usize,
    key_layout: KeyLayout,
    write_delta: WriteDelta,
    concurrent_reads: ConcurrentReads,
) -> anyhow::Result<()> {
    if concurrent_reads == ConcurrentReads::Yes {
        assert_eq!(key_layout, KeyLayout::Sequential);
    }

    let mut lsn = utils::lsn::Lsn(1000);
    let mut key = Key::from_i128(0x0);

    // Fresh random tenant/timeline per invocation, so runs do not share a directory.
    let timeline_id = TimelineId::generate();
    let tenant_id = TenantId::generate();
    let tenant_shard_id = TenantShardId::unsharded(tenant_id);

    tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?;

    let ctx =
        RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error).with_scope_debug_tools();

    let gate = utils::sync::gate::Gate::default();
    let cancel = CancellationToken::new();

    let layer = Arc::new(
        InMemoryLayer::create(
            conf,
            timeline_id,
            tenant_shard_id,
            lsn,
            &gate,
            &cancel,
            &ctx,
        )
        .await?,
    );

    // One value is built up front and cloned for every put.
    let data = Value::Image(Bytes::from(vec![0u8; put_size]));
    let data_ser_size = data.serialized_size().unwrap() as usize;
    // Shadows the context above; this one is used for all puts and reads below.
    let ctx = RequestContext::new(
        pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
        pageserver::context::DownloadBehavior::Download,
    );

    const READ_BATCH_SIZE: u32 = 32;
    // Watch channel carrying the last key written so far (None until the first batch lands).
    // NOTE(review): the turbofish type here appears to have been stripped from
    // this copy of the file (likely `Option<Key>`) -- confirm.
    let (tx, mut rx) = tokio::sync::watch::channel::>(None);
    let reader_cancel = CancellationToken::new();
    let reader_handle = if concurrent_reads == ConcurrentReads::Yes {
        Some(tokio::task::spawn({
            let cancel = reader_cancel.clone();
            let layer = layer.clone();
            let ctx = ctx.attached_child();
            async move {
                let gate = Gate::default();
                let gate_guard = gate.enter().unwrap();
                let io_concurrency = IoConcurrency::spawn_from_conf(
                    GetVectoredConcurrentIo::SidecarTask,
                    gate_guard,
                );

                // Do not start reading until at least one batch has been written.
                rx.wait_for(|key| key.is_some()).await.unwrap();

                while !cancel.is_cancelled() {
                    let key = match *rx.borrow() {
                        Some(some) => some,
                        None => unreachable!(),
                    };

                    // Read the window of up to READ_BATCH_SIZE keys ending at `key` (inclusive).
                    let mut start_key = key;
                    start_key.field6 = key.field6.saturating_sub(READ_BATCH_SIZE);
                    let key_range = start_key..key.next();

                    let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());

                    layer
                        .get_values_reconstruct_data(
                            KeySpace::single(key_range),
                            Lsn(1)..Lsn(u64::MAX),
                            &mut reconstruct_state,
                            &ctx,
                        )
                        .await
                        .unwrap();

                    // Drain all pending IOs for this read before starting the next one.
                    // NOTE(review): the `collect` turbofish type appears to have been
                    // stripped from this copy of the file (likely
                    // `FuturesUnordered<_>`) -- confirm.
                    let mut collect_futs = std::mem::take(&mut reconstruct_state.keys)
                        .into_values()
                        .map(|state| state.sink_pending_ios())
                        .collect::>();
                    while collect_futs.next().await.is_some() {}
                }

                drop(io_concurrency);
                gate.close().await;
            }
        }))
    } else {
        None
    };

    const BATCH_SIZE: usize = 16;
    let mut batch = Vec::new();

    for i in 0..put_count {
        lsn += put_size as u64;

        // Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people
        // usually care the most about write performance when they're blasting a huge batch of data into a huge table.
        match key_layout {
            KeyLayout::Sequential => {
                // Use sequential order to illustrate the experience a user is likely to have
                // when ingesting bulk data.
                key.field6 = i as u32;
            }
            KeyLayout::Random => {
                // Use random-order keys to avoid giving a false advantage to data structures that are
                // faster when inserting on the end.
                key.field6 = murmurhash32(i as u32);
            }
            KeyLayout::RandomReuse(mask) => {
                // Use low bits only, to limit cardinality
                key.field6 = murmurhash32(i as u32) & mask;
            }
        }

        batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
        if batch.len() >= BATCH_SIZE {
            let last_key = Key::from_compact(batch.last().unwrap().0);

            let this_batch = std::mem::take(&mut batch);
            let serialized = SerializedValueBatch::from_values(this_batch);
            layer.put_batch(serialized, &ctx).await?;

            // Publish progress to the reader task only after the batch is in the layer.
            tx.send(Some(last_key)).unwrap();
        }
    }
    // Flush the final partial batch, if any.
    if !batch.is_empty() {
        let last_key = Key::from_compact(batch.last().unwrap().0);

        let this_batch = std::mem::take(&mut batch);
        let serialized = SerializedValueBatch::from_values(this_batch);
        layer.put_batch(serialized, &ctx).await?;

        tx.send(Some(last_key)).unwrap();
    }
    layer.freeze(lsn + 1).await;

    if write_delta == WriteDelta::Yes {
        let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct {
            max_concurrency: NonZeroUsize::new(1).unwrap(),
        });
        let (_desc, path) = layer
            .write_to_disk(&ctx, None, l0_flush_state.inner(), &gate, cancel.clone())
            .await?
            .unwrap();
        // The written file is only a by-product of the measurement; remove it right away.
        tokio::fs::remove_file(path).await?;
    }

    // Stop the reader (if any) and wait for it to finish.
    reader_cancel.cancel();
    if let Some(handle) = reader_handle {
        handle.await.unwrap();
    }

    Ok(())
}

/// Wrapper to instantiate a tokio runtime
///
/// Sets the virtual file IO mode to `io_mode`, builds a new multi-threaded
/// runtime and blocks on `ingest` with the remaining arguments. Panics if the
/// runtime cannot be built or if `ingest` returns an error.
fn ingest_main(
    conf: &'static PageServerConf,
    io_mode: IoMode,
    put_size: usize,
    put_count: usize,
    key_layout: KeyLayout,
    write_delta: WriteDelta,
    concurrent_reads: ConcurrentReads,
) {
    pageserver::virtual_file::set_io_mode(io_mode);

    let runtime = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .build()
        .unwrap();

    runtime.block_on(async move {
        let r = ingest(
            conf,
            put_size,
            put_count,
            key_layout,
            write_delta,
            concurrent_reads,
        )
        .await;
        if let Err(e) = r {
            panic!("{e:?}");
        }
    });
}

/// Declare a series of benchmarks for the Pageserver's ingest write path.
/// /// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_value, and ends either /// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set). /// /// Genuine disk I/O is used, so expect results to differ depending on storage. However, when running on /// a fast disk, CPU is the bottleneck at time of writing. fn criterion_benchmark(c: &mut Criterion) { let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap(); let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap(); eprintln!("Data directory: {}", temp_dir.path()); let conf: &'static PageServerConf = Box::leak(Box::new( pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()), )); virtual_file::init( 16384, virtual_file::io_engine_for_bench(), // immaterial, each `ingest_main` invocation below overrides this conf.virtual_file_io_mode, // without actually doing syncs, buffered writes have an unfair advantage over direct IO writes virtual_file::SyncMode::Sync, ); page_cache::init(conf.page_cache_size); #[derive(serde::Serialize)] struct ExplodedParameters { io_mode: IoMode, volume_mib: usize, key_size: usize, key_layout: KeyLayout, write_delta: WriteDelta, concurrent_reads: ConcurrentReads, } #[derive(Clone)] struct HandPickedParameters { volume_mib: usize, key_size: usize, key_layout: KeyLayout, write_delta: WriteDelta, } let expect = vec![ // Small values (100b) tests HandPickedParameters { volume_mib: 128, key_size: 100, key_layout: KeyLayout::Sequential, write_delta: WriteDelta::Yes, }, HandPickedParameters { volume_mib: 128, key_size: 100, key_layout: KeyLayout::Random, write_delta: WriteDelta::Yes, }, HandPickedParameters { volume_mib: 128, key_size: 100, key_layout: KeyLayout::RandomReuse(0x3ff), write_delta: WriteDelta::Yes, }, HandPickedParameters { volume_mib: 128, key_size: 100, key_layout: KeyLayout::Sequential, write_delta: WriteDelta::No, }, // Large values 
(8k) tests HandPickedParameters { volume_mib: 128, key_size: 8192, key_layout: KeyLayout::Sequential, write_delta: WriteDelta::Yes, }, HandPickedParameters { volume_mib: 128, key_size: 8192, key_layout: KeyLayout::Sequential, write_delta: WriteDelta::No, }, ]; let exploded_parameters = { let mut out = Vec::new(); for concurrent_reads in [ConcurrentReads::Yes, ConcurrentReads::No] { for param in expect.clone() { let HandPickedParameters { volume_mib, key_size, key_layout, write_delta, } = param; if key_layout != KeyLayout::Sequential && concurrent_reads == ConcurrentReads::Yes { continue; } out.push(ExplodedParameters { io_mode: IoMode::DirectRw, volume_mib, key_size, key_layout, write_delta, concurrent_reads, }); } } out }; impl ExplodedParameters { fn benchmark_id(&self) -> String { let ExplodedParameters { io_mode, volume_mib, key_size, key_layout, write_delta, concurrent_reads, } = self; format!( "io_mode={io_mode:?} volume_mib={volume_mib:?} key_size_bytes={key_size:?} key_layout={key_layout:?} write_delta={write_delta:?} concurrent_reads={concurrent_reads:?}" ) } } let mut group = c.benchmark_group("ingest"); for params in exploded_parameters { let id = params.benchmark_id(); let ExplodedParameters { io_mode, volume_mib, key_size, key_layout, write_delta, concurrent_reads, } = params; let put_count = volume_mib * 1024 * 1024 / key_size; group.throughput(criterion::Throughput::Bytes((key_size * put_count) as u64)); group.sample_size(10); group.bench_function(id, |b| { b.iter(|| { ingest_main( conf, io_mode, key_size, put_count, key_layout, write_delta, concurrent_reads, ) }) }); } } criterion_group!(benches, criterion_benchmark); criterion_main!(benches); /* cargo bench --bench bench_ingest im4gn.2xlarge: ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes time: [1.2901 s 1.2943 s 1.2991 s] thrpt: [98.533 MiB/s 98.892 MiB/s 99.220 MiB/s] ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Random 
write_delta=Yes time: [2.1387 s 2.1623 s 2.1845 s] thrpt: [58.595 MiB/s 59.197 MiB/s 59.851 MiB/s] ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Y... time: [1.2036 s 1.2074 s 1.2122 s] thrpt: [105.60 MiB/s 106.01 MiB/s 106.35 MiB/s] ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No time: [520.55 ms 521.46 ms 522.57 ms] thrpt: [244.94 MiB/s 245.47 MiB/s 245.89 MiB/s] ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes time: [440.33 ms 442.24 ms 444.10 ms] thrpt: [288.22 MiB/s 289.43 MiB/s 290.69 MiB/s] ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No time: [168.78 ms 169.42 ms 170.18 ms] thrpt: [752.16 MiB/s 755.52 MiB/s 758.40 MiB/s] ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes time: [1.2978 s 1.3094 s 1.3227 s] thrpt: [96.775 MiB/s 97.758 MiB/s 98.632 MiB/s] ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes time: [2.1976 s 2.2067 s 2.2154 s] thrpt: [57.777 MiB/s 58.006 MiB/s 58.245 MiB/s] ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Yes time: [1.2103 s 1.2160 s 1.2233 s] thrpt: [104.64 MiB/s 105.26 MiB/s 105.76 MiB/s] ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No time: [525.05 ms 526.37 ms 527.79 ms] thrpt: [242.52 MiB/s 243.17 MiB/s 243.79 MiB/s] ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes time: [443.06 ms 444.88 ms 447.15 ms] thrpt: [286.26 MiB/s 287.72 MiB/s 288.90 MiB/s] ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No time: [169.40 ms 169.80 ms 170.17 ms] thrpt: [752.21 MiB/s 753.81 MiB/s 755.60 MiB/s] ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=100 key_layout=Sequential 
write_delta=Yes time: [1.2844 s 1.2915 s 1.2990 s] thrpt: [98.536 MiB/s 99.112 MiB/s 99.657 MiB/s] ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes time: [2.1431 s 2.1663 s 2.1900 s] thrpt: [58.446 MiB/s 59.087 MiB/s 59.726 MiB/s] ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Y... time: [1.1906 s 1.1926 s 1.1947 s] thrpt: [107.14 MiB/s 107.33 MiB/s 107.51 MiB/s] ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No time: [516.86 ms 518.25 ms 519.47 ms] thrpt: [246.40 MiB/s 246.98 MiB/s 247.65 MiB/s] ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes time: [536.50 ms 536.53 ms 536.60 ms] thrpt: [238.54 MiB/s 238.57 MiB/s 238.59 MiB/s] ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No time: [267.77 ms 267.90 ms 268.04 ms] thrpt: [477.53 MiB/s 477.79 MiB/s 478.02 MiB/s] Hetzner AX102: ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes time: [836.58 ms 861.93 ms 886.57 ms] thrpt: [144.38 MiB/s 148.50 MiB/s 153.00 MiB/s] ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes time: [1.2782 s 1.3191 s 1.3665 s] thrpt: [93.668 MiB/s 97.037 MiB/s 100.14 MiB/s] ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Y... 
time: [791.27 ms 807.08 ms 822.95 ms] thrpt: [155.54 MiB/s 158.60 MiB/s 161.77 MiB/s] ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No time: [310.78 ms 314.66 ms 318.47 ms] thrpt: [401.92 MiB/s 406.79 MiB/s 411.87 MiB/s] ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes time: [377.11 ms 387.77 ms 399.21 ms] thrpt: [320.63 MiB/s 330.10 MiB/s 339.42 MiB/s] ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No time: [128.37 ms 132.96 ms 138.55 ms] thrpt: [923.83 MiB/s 962.69 MiB/s 997.11 MiB/s] ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes time: [900.38 ms 914.88 ms 928.86 ms] thrpt: [137.80 MiB/s 139.91 MiB/s 142.16 MiB/s] ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes time: [1.2538 s 1.2936 s 1.3313 s] thrpt: [96.149 MiB/s 98.946 MiB/s 102.09 MiB/s] ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Yes time: [787.17 ms 803.89 ms 820.63 ms] thrpt: [155.98 MiB/s 159.23 MiB/s 162.61 MiB/s] ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No time: [318.78 ms 321.89 ms 324.74 ms] thrpt: [394.16 MiB/s 397.65 MiB/s 401.53 MiB/s] ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes time: [374.01 ms 383.45 ms 393.20 ms] thrpt: [325.53 MiB/s 333.81 MiB/s 342.24 MiB/s] ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No time: [137.98 ms 141.31 ms 143.57 ms] thrpt: [891.58 MiB/s 905.79 MiB/s 927.66 MiB/s] ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes time: [613.69 ms 622.48 ms 630.97 ms] thrpt: [202.86 MiB/s 205.63 MiB/s 208.57 MiB/s] ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes time: [1.0299 
s 1.0766 s 1.1273 s]
                        thrpt:  [113.55 MiB/s 118.90 MiB/s 124.29 MiB/s]
ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Y...
                        time:   [637.80 ms 647.78 ms 658.01 ms]
                        thrpt:  [194.53 MiB/s 197.60 MiB/s 200.69 MiB/s]
ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No
                        time:   [266.09 ms 267.20 ms 268.31 ms]
                        thrpt:  [477.06 MiB/s 479.04 MiB/s 481.04 MiB/s]
ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes
                        time:   [269.34 ms 273.27 ms 277.69 ms]
                        thrpt:  [460.95 MiB/s 468.40 MiB/s 475.24 MiB/s]
ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No
                        time:   [123.18 ms 124.24 ms 125.15 ms]
                        thrpt:  [1022.8 MiB/s 1.0061 GiB/s 1.0148 GiB/s]
*/

================================================
FILE: pageserver/benches/bench_layer_map.rs
================================================
use std::cmp::{max, min};
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Instant;

use criterion::measurement::WallTime;
use criterion::{BenchmarkGroup, Criterion, black_box, criterion_group, criterion_main};
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::{LayerName, PersistentLayerDesc};
use pageserver_api::key::Key;
use pageserver_api::shard::TenantShardId;
use rand::prelude::{SeedableRng, StdRng};
use rand::seq::IndexedRandom;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;

/// Resolve `relative` against this crate's manifest directory, so fixtures are found
/// regardless of the working directory the benchmark is started from.
fn fixture_path(relative: &str) -> PathBuf {
    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
}

/// Build a layer map from a text file containing one layer file name per line.
///
/// Every line is parsed as a `LayerName` and inserted as a historic layer. The smallest
/// start LSN and the largest (inclusive) end LSN seen are printed to stdout.
///
/// # Panics
///
/// Panics if the file cannot be opened or read, or if a line is not a valid layer name.
fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
    let mut layer_map = LayerMap::default();

    let mut min_lsn = Lsn(u64::MAX);
    let mut max_lsn = Lsn(0);

    let filenames = BufReader::new(File::open(filename_dump).unwrap()).lines();

    let mut updates = layer_map.batch_update();
    for fname in filenames {
        let fname = fname.unwrap();
        let fname = LayerName::from_str(&fname).unwrap();
        let layer = PersistentLayerDesc::from(fname);

        let lsn_range = layer.get_lsn_range();
        min_lsn = min(min_lsn, lsn_range.start);
        // The range end is exclusive; report the last LSN that is actually covered.
        max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));

        updates.insert_historic(layer);
    }

    println!("min: {min_lsn}, max: {max_lsn}");

    updates.flush();
    layer_map
}

/// Build the synthetic layer map used by the `sequential` benchmarks.
///
/// Creates 100_000 image layers arranged in 1000 diagonal lines: layer `i` sits at
/// `Lsn(i)` and covers a one-key range at key offset `10 * (i % 100)`. The time spent
/// building the map is printed to stdout.
///
/// TODO This code is pretty slow and runs even if we're only running other
///      benchmarks. It needs to be somewhere else, but it's not clear where.
///      Putting it inside the `bench_function` closure is not a solution
///      because then it runs multiple times during warmup.
fn build_diagonal_layer_map() -> LayerMap {
    let now = Instant::now();
    let mut layer_map = LayerMap::default();
    let mut updates = layer_map.batch_update();
    let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
    for i in 0..100_000 {
        let column = (i as u32) % 100;
        let layer = PersistentLayerDesc::new_img(
            TenantShardId::unsharded(TenantId::generate()),
            TimelineId::generate(),
            zero.add(10 * column)..zero.add(10 * column + 1),
            Lsn(i),
            0,
        );
        updates.insert_historic(layer);
    }
    updates.flush();
    println!("Finished layer map init in {:?}", now.elapsed());
    layer_map
}

/// Construct a layer map query pattern for benchmarks
fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
    // For each image layer we query one of the pages contained, at LSN right
    // before the image layer was created. This gives us a somewhat uniform
    // coverage of both the lsn and key space because image layers have
    // approximately equal sizes and cover approximately equal WAL since
    // last image.
    layer_map
        .iter_historic_layers()
        .filter_map(|l| {
            if l.is_incremental() {
                return None;
            }
            let kr = l.get_key_range();
            let lr = l.get_lsn_range();

            let key_inside = kr.start.next();
            // A layer that starts at LSN 0 has no "LSN right before" it. Skip it rather
            // than underflow (a panic with overflow checks on, a wrap to u64::MAX without).
            let lsn_before = Lsn(lr.start.0.checked_sub(1)?);

            Some((key_inside, lsn_before))
        })
        .collect()
}

// Benchmark using metadata extracted from our performance test environment, from
// a project where we have run pgbench many times. The pgbench database was initialized
// between each test run.
fn bench_from_captest_env(c: &mut Criterion) {
    // TODO consider compressing this file
    let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
    let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);

    // Test with uniform query pattern
    c.bench_function("captest_uniform_queries", |b| {
        b.iter(|| {
            // Iterate by reference: cloning the query list here would be part of the timing.
            for &(key, lsn) in &queries {
                black_box(layer_map.search(key, lsn));
            }
        });
    });

    // test with a key that corresponds to the RelDir entry. See pgdatadir_mapping.rs.
    c.bench_function("captest_rel_dir_query", |b| {
        b.iter(|| {
            let result = black_box(layer_map.search(
                Key::from_hex("000000067F00008000000000000000000001").unwrap(),
                // This LSN is higher than any of the LSNs in the tree
                Lsn::from_str("D0/80208AE1").unwrap(),
            ));
            result.unwrap();
        });
    });
}

// Benchmark using metadata extracted from a real project that was taking
// too long processing layer map queries.
fn bench_from_real_project(c: &mut Criterion) {
    // Init layer map
    let now = Instant::now();
    let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
    println!("Finished layer map init in {:?}", now.elapsed());

    // Choose uniformly distributed queries
    let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);

    // Define and name the benchmark function
    let mut group = c.benchmark_group("real_map");
    group.bench_function("uniform_queries", |b| {
        b.iter(|| {
            for &(key, lsn) in &queries {
                black_box(layer_map.search(key, lsn));
            }
        });
    });
    group.finish();
}

// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
fn bench_sequential(c: &mut Criterion) {
    // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
    let layer_map = build_diagonal_layer_map();

    // Choose 100 uniformly random queries
    let rng = &mut StdRng::seed_from_u64(1);
    let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map)
        .choose_multiple(rng, 100)
        .copied()
        .collect();

    // Define and name the benchmark function
    let mut group = c.benchmark_group("sequential");
    group.bench_function("uniform_queries", |b| {
        b.iter(|| {
            for &(key, lsn) in &queries {
                black_box(layer_map.search(key, lsn));
            }
        });
    });
    group.finish();
}

/// Register one visibility benchmark named `bench_name` in `group`.
///
/// Each iteration calls `layer_map.get_visibility` with a fresh clone of `read_points`,
/// because `get_visibility` takes the vector by value.
fn bench_visibility_with_map(
    group: &mut BenchmarkGroup<WallTime>,
    layer_map: LayerMap,
    read_points: Vec<Lsn>,
    bench_name: &str,
) {
    group.bench_function(bench_name, |b| {
        b.iter(|| black_box(layer_map.get_visibility(read_points.clone())));
    });
}

// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
fn bench_visibility(c: &mut Criterion) {
    let mut group = c.benchmark_group("visibility");
    {
        // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
        let layer_map = build_diagonal_layer_map();

        // One read point every 1000 LSNs.
        let mut read_points = Vec::new();
        for i in (0..100_000).step_by(1000) {
            read_points.push(Lsn(i));
        }

        bench_visibility_with_map(&mut group, layer_map, read_points, "sequential");
    }

    {
        let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
        let read_points = vec![Lsn(0x1C760FA190)];
        bench_visibility_with_map(&mut group, layer_map, read_points, "real_map");

        let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
        let read_points = vec![
            Lsn(0x1C760FA190),
            Lsn(0x000000931BEAD539),
            Lsn(0x000000931BF63011),
            Lsn(0x000000931B33AE68),
            Lsn(0x00000038E67ABFA0),
            Lsn(0x000000931B33AE68),
            Lsn(0x000000914E3F38F0),
            Lsn(0x000000931B33AE68),
        ];
        bench_visibility_with_map(&mut group, layer_map, read_points, "real_map_many_branches");
    }

    group.finish();
}

criterion_group!(group_1, bench_from_captest_env);
criterion_group!(group_2, bench_from_real_project);
criterion_group!(group_3, bench_sequential);
criterion_group!(group_4, bench_visibility);
criterion_main!(group_1, group_2, group_3, group_4);

================================================
FILE: pageserver/benches/bench_metrics.rs
================================================
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
use utils::id::{TenantId, TimelineId};

//
// Demonstrates that repeat label values lookup is a multicore scalability bottleneck
// that is worth avoiding.
// criterion_group!( label_values, label_values::bench_naive_usage, label_values::bench_cache_label_values_lookup ); mod label_values { use super::*; pub fn bench_naive_usage(c: &mut Criterion) { let mut g = c.benchmark_group("label_values__naive_usage"); for ntimelines in [1, 4, 8] { g.bench_with_input( BenchmarkId::new("ntimelines", ntimelines), &ntimelines, |b, ntimelines| { b.iter_custom(|iters| { let barrier = std::sync::Barrier::new(*ntimelines + 1); let timelines = (0..*ntimelines) .map(|_| { ( TenantId::generate().to_string(), "0000".to_string(), TimelineId::generate().to_string(), ) }) .collect::>(); let metric_vec = metrics::UIntGaugeVec::new( metrics::opts!("testmetric", "testhelp"), &["tenant_id", "shard_id", "timeline_id"], ) .unwrap(); std::thread::scope(|s| { for (tenant_id, shard_id, timeline_id) in &timelines { s.spawn(|| { barrier.wait(); for _ in 0..iters { metric_vec .with_label_values(&[tenant_id, shard_id, timeline_id]) .inc(); } barrier.wait(); }); } barrier.wait(); let start = std::time::Instant::now(); barrier.wait(); start.elapsed() }) }) }, ); } g.finish(); } pub fn bench_cache_label_values_lookup(c: &mut Criterion) { let mut g = c.benchmark_group("label_values__cache_label_values_lookup"); for ntimelines in [1, 4, 8] { g.bench_with_input( BenchmarkId::new("ntimelines", ntimelines), &ntimelines, |b, ntimelines| { b.iter_custom(|iters| { let barrier = std::sync::Barrier::new(*ntimelines + 1); let timelines = (0..*ntimelines) .map(|_| { ( TenantId::generate().to_string(), "0000".to_string(), TimelineId::generate().to_string(), ) }) .collect::>(); let metric_vec = metrics::UIntGaugeVec::new( metrics::opts!("testmetric", "testhelp"), &["tenant_id", "shard_id", "timeline_id"], ) .unwrap(); std::thread::scope(|s| { for (tenant_id, shard_id, timeline_id) in &timelines { s.spawn(|| { let metric = metric_vec.with_label_values(&[ tenant_id, shard_id, timeline_id, ]); barrier.wait(); for _ in 0..iters { metric.inc(); } barrier.wait(); }); } 
barrier.wait(); let start = std::time::Instant::now(); barrier.wait(); start.elapsed() }) }) }, ); } g.finish(); } } // // Demonstrates that even a single metric can be a scalability bottleneck // if multiple threads in it concurrently but there's nothing we can do // about it without changing the metrics framework to use e.g. sharded counte atomics. // criterion_group!( single_metric_multicore_scalability, single_metric_multicore_scalability::bench, ); mod single_metric_multicore_scalability { use super::*; pub fn bench(c: &mut Criterion) { let mut g = c.benchmark_group("single_metric_multicore_scalability"); for nthreads in [1, 4, 8] { g.bench_with_input( BenchmarkId::new("nthreads", nthreads), &nthreads, |b, nthreads| { b.iter_custom(|iters| { let barrier = std::sync::Barrier::new(*nthreads + 1); let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap(); std::thread::scope(|s| { for _ in 0..*nthreads { s.spawn(|| { barrier.wait(); for _ in 0..iters { metric.inc(); } barrier.wait(); }); } barrier.wait(); let start = std::time::Instant::now(); barrier.wait(); start.elapsed() }) }) }, ); } g.finish(); } } // // Demonstrates that even if we cache label value, the propagation of such a cached metric value // by Clone'ing it is a scalability bottleneck. // The reason is that it's an Arc internally and thus there's contention on the reference count atomics. // // We can avoid that by having long-lived references per thread (= indirection). 
// criterion_group!( propagation_of_cached_label_value, propagation_of_cached_label_value::bench_naive, propagation_of_cached_label_value::bench_long_lived_reference_per_thread, ); mod propagation_of_cached_label_value { use std::sync::Arc; use super::*; pub fn bench_naive(c: &mut Criterion) { let mut g = c.benchmark_group("propagation_of_cached_label_value__naive"); for nthreads in [1, 4, 8] { g.bench_with_input( BenchmarkId::new("nthreads", nthreads), &nthreads, |b, nthreads| { b.iter_custom(|iters| { let barrier = std::sync::Barrier::new(*nthreads + 1); let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap(); std::thread::scope(|s| { for _ in 0..*nthreads { s.spawn(|| { barrier.wait(); for _ in 0..iters { // propagating the metric means we'd clone it into the child RequestContext let propagated = metric.clone(); // simulate some work criterion::black_box(propagated); } barrier.wait(); }); } barrier.wait(); let start = std::time::Instant::now(); barrier.wait(); start.elapsed() }) }) }, ); } g.finish(); } pub fn bench_long_lived_reference_per_thread(c: &mut Criterion) { let mut g = c.benchmark_group("propagation_of_cached_label_value__long_lived_reference_per_thread"); for nthreads in [1, 4, 8] { g.bench_with_input( BenchmarkId::new("nthreads", nthreads), &nthreads, |b, nthreads| { b.iter_custom(|iters| { let barrier = std::sync::Barrier::new(*nthreads + 1); let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap(); std::thread::scope(|s| { for _ in 0..*nthreads { s.spawn(|| { // This is the technique. let this_threads_metric_reference = Arc::new(metric.clone()); barrier.wait(); for _ in 0..iters { // propagating the metric means we'd clone it into the child RequestContext let propagated = Arc::clone(&this_threads_metric_reference); // simulate some work (include the pointer chase!) 
criterion::black_box(&*propagated); } barrier.wait(); }); } barrier.wait(); let start = std::time::Instant::now(); barrier.wait(); start.elapsed() }) }) }, ); } } } criterion_group!(histograms, histograms::bench_bucket_scalability); mod histograms { use std::time::Instant; use criterion::{BenchmarkId, Criterion}; use metrics::core::Collector; pub fn bench_bucket_scalability(c: &mut Criterion) { let mut g = c.benchmark_group("bucket_scalability"); for n in [1, 4, 8, 16, 32, 64, 128, 256] { g.bench_with_input(BenchmarkId::new("nbuckets", n), &n, |b, n| { b.iter_custom(|iters| { let buckets: Vec = (0..*n).map(|i| i as f64 * 100.0).collect(); let histo = metrics::Histogram::with_opts( metrics::prometheus::HistogramOpts::new("name", "help") .buckets(buckets.clone()), ) .unwrap(); let start = Instant::now(); for i in 0..usize::try_from(iters).unwrap() { histo.observe(buckets[i % buckets.len()]); } let elapsed = start.elapsed(); // self-test let mfs = histo.collect(); assert_eq!(mfs.len(), 1); let metrics = mfs[0].get_metric(); assert_eq!(metrics.len(), 1); let histo = metrics[0].get_histogram(); let buckets = histo.get_bucket(); assert!( buckets .iter() .enumerate() .all(|(i, b)| b.get_cumulative_count() >= i as u64 * (iters / buckets.len() as u64)) ); elapsed }) }); } } } criterion_main!( label_values, single_metric_multicore_scalability, propagation_of_cached_label_value, histograms, ); /* RUST_BACKTRACE=full cargo bench --bench bench_metrics -- --discard-baseline --noplot Results on an im4gn.2xlarge instance label_values__naive_usage/ntimelines/1 time: [178.71 ns 178.74 ns 178.76 ns] label_values__naive_usage/ntimelines/4 time: [532.94 ns 539.59 ns 546.31 ns] label_values__naive_usage/ntimelines/8 time: [1.1082 µs 1.1109 µs 1.1135 µs] label_values__cache_label_values_lookup/ntimelines/1 time: [6.4116 ns 6.4119 ns 6.4123 ns] label_values__cache_label_values_lookup/ntimelines/4 time: [6.3482 ns 6.3819 ns 6.4079 ns] label_values__cache_label_values_lookup/ntimelines/8 
time: [6.4213 ns 6.5279 ns 6.6293 ns] single_metric_multicore_scalability/nthreads/1 time: [6.0102 ns 6.0104 ns 6.0106 ns] single_metric_multicore_scalability/nthreads/4 time: [38.127 ns 38.275 ns 38.416 ns] single_metric_multicore_scalability/nthreads/8 time: [73.698 ns 74.882 ns 75.864 ns] propagation_of_cached_label_value__naive/nthreads/1 time: [14.424 ns 14.425 ns 14.426 ns] propagation_of_cached_label_value__naive/nthreads/4 time: [100.71 ns 102.53 ns 104.35 ns] propagation_of_cached_label_value__naive/nthreads/8 time: [211.50 ns 214.44 ns 216.87 ns] propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [14.135 ns 14.147 ns 14.160 ns] propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [14.243 ns 14.255 ns 14.268 ns] propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [14.470 ns 14.682 ns 14.895 ns] bucket_scalability/nbuckets/1 time: [30.352 ns 30.353 ns 30.354 ns] bucket_scalability/nbuckets/4 time: [30.464 ns 30.465 ns 30.467 ns] bucket_scalability/nbuckets/8 time: [30.569 ns 30.575 ns 30.584 ns] bucket_scalability/nbuckets/16 time: [30.961 ns 30.965 ns 30.969 ns] bucket_scalability/nbuckets/32 time: [35.691 ns 35.707 ns 35.722 ns] bucket_scalability/nbuckets/64 time: [47.829 ns 47.898 ns 47.974 ns] bucket_scalability/nbuckets/128 time: [73.479 ns 73.512 ns 73.545 ns] bucket_scalability/nbuckets/256 time: [127.92 ns 127.94 ns 127.96 ns] Results on an i3en.3xlarge instance label_values__naive_usage/ntimelines/1 time: [117.32 ns 117.53 ns 117.74 ns] label_values__naive_usage/ntimelines/4 time: [736.58 ns 741.12 ns 745.61 ns] label_values__naive_usage/ntimelines/8 time: [1.4513 µs 1.4596 µs 1.4665 µs] label_values__cache_label_values_lookup/ntimelines/1 time: [8.0964 ns 8.0979 ns 8.0995 ns] label_values__cache_label_values_lookup/ntimelines/4 time: [8.1620 ns 8.2912 ns 8.4491 ns] label_values__cache_label_values_lookup/ntimelines/8 time: [14.148 ns 14.237 ns 14.324 
ns] single_metric_multicore_scalability/nthreads/1 time: [8.0993 ns 8.1013 ns 8.1046 ns] single_metric_multicore_scalability/nthreads/4 time: [80.039 ns 80.672 ns 81.297 ns] single_metric_multicore_scalability/nthreads/8 time: [153.58 ns 154.23 ns 154.90 ns] propagation_of_cached_label_value__naive/nthreads/1 time: [13.924 ns 13.926 ns 13.928 ns] propagation_of_cached_label_value__naive/nthreads/4 time: [143.66 ns 145.27 ns 146.59 ns] propagation_of_cached_label_value__naive/nthreads/8 time: [296.51 ns 297.90 ns 299.30 ns] propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [14.013 ns 14.149 ns 14.308 ns] propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [14.311 ns 14.625 ns 14.984 ns] propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [25.981 ns 26.227 ns 26.476 ns] Results on an Standard L16s v3 (16 vcpus, 128 GiB memory) Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz label_values__naive_usage/ntimelines/1 time: [101.63 ns 101.84 ns 102.06 ns] label_values__naive_usage/ntimelines/4 time: [417.55 ns 424.73 ns 432.63 ns] label_values__naive_usage/ntimelines/8 time: [874.91 ns 889.51 ns 904.25 ns] label_values__cache_label_values_lookup/ntimelines/1 time: [5.7724 ns 5.7760 ns 5.7804 ns] label_values__cache_label_values_lookup/ntimelines/4 time: [7.8878 ns 7.9401 ns 8.0034 ns] label_values__cache_label_values_lookup/ntimelines/8 time: [7.2621 ns 7.6354 ns 8.0337 ns] single_metric_multicore_scalability/nthreads/1 time: [5.7710 ns 5.7744 ns 5.7785 ns] single_metric_multicore_scalability/nthreads/4 time: [66.629 ns 66.994 ns 67.336 ns] single_metric_multicore_scalability/nthreads/8 time: [130.85 ns 131.98 ns 132.91 ns] propagation_of_cached_label_value__naive/nthreads/1 time: [11.540 ns 11.546 ns 11.553 ns] propagation_of_cached_label_value__naive/nthreads/4 time: [131.22 ns 131.90 ns 132.56 ns] propagation_of_cached_label_value__naive/nthreads/8 time: [260.99 ns 262.75 ns 
264.26 ns] propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [11.544 ns 11.550 ns 11.557 ns] propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [11.568 ns 11.642 ns 11.763 ns] propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [13.416 ns 14.121 ns 14.886 ns] Results on an M4 MAX MacBook Pro Total Number of Cores: 14 (10 performance and 4 efficiency) label_values__naive_usage/ntimelines/1 time: [52.711 ns 53.026 ns 53.381 ns] label_values__naive_usage/ntimelines/4 time: [323.99 ns 330.40 ns 337.53 ns] label_values__naive_usage/ntimelines/8 time: [1.1615 µs 1.1998 µs 1.2399 µs] label_values__cache_label_values_lookup/ntimelines/1 time: [1.6635 ns 1.6715 ns 1.6809 ns] label_values__cache_label_values_lookup/ntimelines/4 time: [1.7786 ns 1.7876 ns 1.8028 ns] label_values__cache_label_values_lookup/ntimelines/8 time: [1.8195 ns 1.8371 ns 1.8665 ns] single_metric_multicore_scalability/nthreads/1 time: [1.7764 ns 1.7909 ns 1.8079 ns] single_metric_multicore_scalability/nthreads/4 time: [33.875 ns 34.868 ns 35.923 ns] single_metric_multicore_scalability/nthreads/8 time: [226.85 ns 235.30 ns 244.18 ns] propagation_of_cached_label_value__naive/nthreads/1 time: [3.4337 ns 3.4491 ns 3.4660 ns] propagation_of_cached_label_value__naive/nthreads/4 time: [69.486 ns 71.937 ns 74.472 ns] propagation_of_cached_label_value__naive/nthreads/8 time: [434.87 ns 456.47 ns 477.84 ns] propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [3.3767 ns 3.3974 ns 3.4220 ns] propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [3.6105 ns 4.2355 ns 5.1463 ns] propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [4.0889 ns 4.9714 ns 6.0779 ns] bucket_scalability/nbuckets/1 time: [4.8455 ns 4.8542 ns 4.8646 ns] bucket_scalability/nbuckets/4 time: [4.5663 ns 4.5722 ns 4.5787 ns] bucket_scalability/nbuckets/8 time:
[4.5531 ns 4.5670 ns 4.5842 ns] bucket_scalability/nbuckets/16 time: [4.6392 ns 4.6524 ns 4.6685 ns] bucket_scalability/nbuckets/32 time: [6.0302 ns 6.0439 ns 6.0589 ns] bucket_scalability/nbuckets/64 time: [10.608 ns 10.644 ns 10.691 ns] bucket_scalability/nbuckets/128 time: [22.178 ns 22.316 ns 22.483 ns] bucket_scalability/nbuckets/256 time: [42.190 ns 42.328 ns 42.492 ns] Results on a Hetzner AX102 AMD Ryzen 9 7950X3D 16-Core Processor label_values__naive_usage/ntimelines/1 time: [64.510 ns 64.559 ns 64.610 ns] label_values__naive_usage/ntimelines/4 time: [309.71 ns 326.09 ns 342.32 ns] label_values__naive_usage/ntimelines/8 time: [776.92 ns 819.35 ns 856.93 ns] label_values__cache_label_values_lookup/ntimelines/1 time: [1.2855 ns 1.2943 ns 1.3021 ns] label_values__cache_label_values_lookup/ntimelines/4 time: [1.3865 ns 1.4139 ns 1.4441 ns] label_values__cache_label_values_lookup/ntimelines/8 time: [1.5311 ns 1.5669 ns 1.6046 ns] single_metric_multicore_scalability/nthreads/1 time: [1.1927 ns 1.1981 ns 1.2049 ns] single_metric_multicore_scalability/nthreads/4 time: [24.346 ns 25.439 ns 26.634 ns] single_metric_multicore_scalability/nthreads/8 time: [58.666 ns 60.137 ns 61.486 ns] propagation_of_cached_label_value__naive/nthreads/1 time: [2.7067 ns 2.7238 ns 2.7402 ns] propagation_of_cached_label_value__naive/nthreads/4 time: [62.723 ns 66.214 ns 69.787 ns] propagation_of_cached_label_value__naive/nthreads/8 time: [164.24 ns 170.10 ns 175.68 ns] propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [2.2915 ns 2.2960 ns 2.3012 ns] propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [2.5726 ns 2.6158 ns 2.6624 ns] propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [2.7068 ns 2.8243 ns 2.9824 ns] bucket_scalability/nbuckets/1 time: [6.3998 ns 6.4288 ns 6.4684 ns] bucket_scalability/nbuckets/4 time: [6.3603 ns 6.3620 ns 6.3637 ns] bucket_scalability/nbuckets/8 time: 
[6.1646 ns 6.1654 ns 6.1667 ns] bucket_scalability/nbuckets/16 time: [6.1341 ns 6.1391 ns 6.1454 ns] bucket_scalability/nbuckets/32 time: [8.2206 ns 8.2254 ns 8.2301 ns] bucket_scalability/nbuckets/64 time: [13.988 ns 13.994 ns 14.000 ns] bucket_scalability/nbuckets/128 time: [28.180 ns 28.216 ns 28.251 ns] bucket_scalability/nbuckets/256 time: [54.914 ns 54.931 ns 54.951 ns] */ ================================================ FILE: pageserver/benches/bench_walredo.rs ================================================ //! Quantify a single walredo manager's throughput under N concurrent callers. //! //! The benchmark implementation ([`bench_impl`]) is parametrized by //! - `redo_work` => an async closure that takes a `PostgresRedoManager` and performs one redo //! - `n_redos` => number of times the benchmark shall execute the `redo_work` //! - `nclients` => number of clients (more on this shortly). //! //! The benchmark impl sets up a multi-threaded tokio runtime with default parameters. //! It spawns `nclients` [`client`] tokio tasks. //! Each task executes the `redo_work` `n_redos/nclients` times. //! //! We exercise the following combinations: //! - `redo_work = ping / short / medium` //! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]` //! //! We let `criterion` determine the `n_redos` using `iter_custom`. //! The idea is that for each `(redo_work, nclients)` combination, //! criterion will run the `bench_impl` multiple times with different `n_redos`. //! The `bench_impl` reports the aggregate wall clock time from the clients' perspective. //! Criterion will divide that by `n_redos` to compute the "time per iteration". //! In our case, "time per iteration" means "time per redo_work execution". //! //! NB: the way by which `iter_custom` determines the "number of iterations" //! is called sampling. Apparently the idea here is to detect outliers. //! We're not sure whether the current choice of sampling method makes sense. //! 
See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples //! //! # Reference Numbers //! //! 2024-09-18 on im4gn.2xlarge //! //! ```text //! ping/1 time: [21.789 µs 21.918 µs 22.078 µs] //! ping/2 time: [27.686 µs 27.812 µs 27.970 µs] //! ping/4 time: [35.468 µs 35.671 µs 35.926 µs] //! ping/8 time: [59.682 µs 59.987 µs 60.363 µs] //! ping/16 time: [101.79 µs 102.37 µs 103.08 µs] //! ping/32 time: [184.18 µs 185.15 µs 186.36 µs] //! ping/64 time: [349.86 µs 351.45 µs 353.47 µs] //! ping/128 time: [684.53 µs 687.98 µs 692.17 µs] //! short/1 time: [31.833 µs 32.126 µs 32.428 µs] //! short/2 time: [35.558 µs 35.756 µs 35.992 µs] //! short/4 time: [44.850 µs 45.138 µs 45.484 µs] //! short/8 time: [65.985 µs 66.379 µs 66.853 µs] //! short/16 time: [127.06 µs 127.90 µs 128.87 µs] //! short/32 time: [252.98 µs 254.70 µs 256.73 µs] //! short/64 time: [497.13 µs 499.86 µs 503.26 µs] //! short/128 time: [987.46 µs 993.45 µs 1.0004 ms] //! medium/1 time: [137.91 µs 138.55 µs 139.35 µs] //! medium/2 time: [192.00 µs 192.91 µs 194.07 µs] //! medium/4 time: [389.62 µs 391.55 µs 394.01 µs] //! medium/8 time: [776.80 µs 780.33 µs 784.77 µs] //! medium/16 time: [1.5323 ms 1.5383 ms 1.5459 ms] //! medium/32 time: [3.0120 ms 3.0226 ms 3.0350 ms] //! medium/64 time: [5.7405 ms 5.7787 ms 5.8166 ms] //! medium/128 time: [10.412 ms 10.574 ms 10.718 ms] //! 
``` use std::future::Future; use std::sync::Arc; use std::time::{Duration, Instant}; use anyhow::Context; use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; use once_cell::sync::Lazy; use pageserver::config::PageServerConf; use pageserver::walredo::{PostgresRedoManager, RedoAttemptType}; use pageserver_api::key::Key; use pageserver_api::shard::TenantShardId; use postgres_ffi::{BLCKSZ, PgMajorVersion}; use tokio::sync::Barrier; use tokio::task::JoinSet; use utils::id::TenantId; use utils::lsn::Lsn; use wal_decoder::models::record::NeonWalRecord; fn bench(c: &mut Criterion) { macro_rules! bench_group { ($name:expr, $redo_work:expr) => {{ let name: &str = $name; let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; for nclients in nclients { let mut group = c.benchmark_group(name); group.bench_with_input( BenchmarkId::from_parameter(nclients), &nclients, |b, nclients| { b.iter_custom(|iters| bench_impl($redo_work, iters, *nclients)); }, ); } }}; } // // benchmark the protocol implementation // let pg_version = PgMajorVersion::PG14; bench_group!( "ping", Arc::new(move |mgr: Arc| async move { let _: () = mgr.ping(pg_version).await.unwrap(); }) ); // // benchmarks with actual record redo // let make_redo_work = |req: &'static Request| { Arc::new(move |mgr: Arc| async move { let page = req.execute(&mgr).await.unwrap(); assert_eq!(page.remaining(), BLCKSZ as usize); }) }; bench_group!("short", { static REQUEST: Lazy = Lazy::new(Request::short_input); make_redo_work(&REQUEST) }); bench_group!("medium", { static REQUEST: Lazy = Lazy::new(Request::medium_input); make_redo_work(&REQUEST) }); } criterion::criterion_group!(benches, bench); criterion::criterion_main!(benches); // Returns the sum of each client's wall-clock time spent executing their share of the n_redos. 
fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration where F: Fn(Arc) -> Fut + Send + Sync + 'static, Fut: Future + Send + 'static, { let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); let rt = tokio::runtime::Builder::new_multi_thread() .enable_all() .build() .unwrap(); let start = Arc::new(Barrier::new(nclients as usize)); let mut tasks = JoinSet::new(); let manager = PostgresRedoManager::new(conf, tenant_shard_id); let manager = Arc::new(manager); // divide the amount of work equally among the clients. let nredos_per_client = n_redos / nclients; for _ in 0..nclients { rt.block_on(async { tasks.spawn(client( Arc::clone(&manager), Arc::clone(&start), Arc::clone(&redo_work), nredos_per_client, )) }); } rt.block_on(async move { let mut total_wallclock_time = Duration::ZERO; while let Some(res) = tasks.join_next().await { total_wallclock_time += res.unwrap(); } total_wallclock_time }) } async fn client( mgr: Arc, start: Arc, redo_work: Arc, n_redos: u64, ) -> Duration where F: Fn(Arc) -> Fut + Send + Sync + 'static, Fut: Future + Send + 'static, { start.wait().await; let start = Instant::now(); for _ in 0..n_redos { redo_work(Arc::clone(&mgr)).await; // The real pageserver will rarely if ever do 2 walredos in a row without // yielding to the executor. tokio::task::yield_now().await; } start.elapsed() } macro_rules! lsn { ($input:expr) => {{ let input = $input; match ::from_str(input) { Ok(lsn) => lsn, Err(e) => panic!("failed to parse {}: {}", input, e), } }}; } /// Simple wrapper around `WalRedoManager::request_redo`. /// /// In benchmarks this is cloned around. 
#[derive(Clone)] struct Request { key: Key, lsn: Lsn, base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: PgMajorVersion, } impl Request { async fn execute(&self, manager: &PostgresRedoManager) -> anyhow::Result { let Request { key, lsn, base_img, records, pg_version, } = self; // TODO: avoid these clones manager .request_redo( *key, *lsn, base_img.clone(), records.clone(), *pg_version, RedoAttemptType::ReadPage, ) .await .context("request_redo") } fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord { let rec = Bytes::from_static(bytes); NeonWalRecord::Postgres { will_init, rec } } /// Short payload, 1132 bytes. // pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0 // for null bytes. #[allow(clippy::octal_escapes)] pub fn short_input() -> Request { let pg_record = Self::pg_record; Request { key: Key { field1: 0, field2: 1663, field3: 13010, field4: 1259, field5: 0, field6: 0, }, lsn: lsn!("0/16E2408"), base_img: None, records: vec![ ( lsn!("0/16A9388"), pg_record(true, b"j\x03\0\0\0\x04\0\0\xe8\x7fj\x01\0\0\0\0\0\n\0\0\xd0\x16\x13Y\0\x10\0\04\x03\xd4\0\x05\x7f\x06\0\0\xd22\0\0\xeb\x04\0\0\0\0\0\0\xff\x03\0\0\0\0\x80\xeca\x01\0\0\x01\0\xd4\0\xa0\x1d\0 \x04 \0\0\0\0/\0\x01\0\xa0\x9dX\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0.\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\00\x9f\x9a\x01P\x9e\xb2\x01\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0!\0\x01\x08 
\xff\xff\xff?\0\0\0\0\0\0@\0\0another_table\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x98\x08\0\0\x02@\0\0\0\0\0\0\n\0\0\0\x02\0\0\0\0@\0\0\0\0\0\0\0\0\0\0\0\0\x80\xbf\0\0\0\0\0\0\0\0\0\0pr\x01\0\0\0\0\0\0\0\0\x01d\0\0\0\0\0\0\x04\0\0\x01\0\0\0\0\0\0\0\x0c\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0/\0!\x80\x03+ \xff\xff\xff\x7f\0\0\0\0\0\xdf\x04\0\0pg_type\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0G\0\0\0\0\0\0\0\n\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\x0e\0\0\0\0@\x16D\x0e\0\0\0K\x10\0\0\x01\0pr \0\0\0\0\0\0\0\0\x01n\0\0\0\0\0\xd6\x02\0\0\x01\0\0\0[\x01\0\0\0\0\0\0\0\t\x04\0\0\x02\0\0\0\x01\0\0\0\n\0\0\0\n\0\0\0\x7f\0\0\0\0\0\0\0\n\0\0\0\x02\0\0\0\0\0\0C\x01\0\0\x15\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0.\0!\x80\x03+ \xff\xff\xff\x7f\0\0\0\0\0;\n\0\0pg_statistic\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xfd.\0\0\0\0\0\0\n\0\0\0\x02\0\0\0;\n\0\0\0\0\0\0\x13\0\0\0\0\0\xcbC\x13\0\0\0\x18\x0b\0\0\x01\0pr\x1f\0\0\0\0\0\0\0\0\x01n\0\0\0\0\0\xd6\x02\0\0\x01\0\0\0C\x01\0\0\0\0\0\0\0\t\x04\0\0\x01\0\0\0\x01\0\0\0\n\0\0\0\n\0\0\0\x7f\0\0\0\0\0\0\x02\0\x01"), ), ( lsn!("0/16D4080"), pg_record(false, b"\xbc\0\0\0\0\0\0\0h?m\x01\0\0\0\0p\n\0\09\x08\xa3\xea\0 \x8c\0\x7f\x06\0\0\xd22\0\0\xeb\x04\0\0\0\0\0\0\xff\x02\0@\0\0another_table\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x98\x08\0\0\x02@\0\0\0\0\0\0\n\0\0\0\x02\0\0\0\0@\0\0\0\0\0\0\x05\0\0\0\0@zD\x05\0\0\0\0\0\0\0\0\0pr\x01\0\0\0\0\0\0\0\0\x01d\0\0\0\0\0\0\x04\0\0\x01\0\0\0\x02\0"), ), ], pg_version: PgMajorVersion::PG14, } } /// Medium sized payload, serializes as 26393 bytes. 
// see [`short`] #[allow(clippy::octal_escapes)] pub fn medium_input() -> Request { let pg_record = Self::pg_record; Request { key: Key { field1: 0, field2: 1663, field3: 13010, field4: 16384, field5: 0, field6: 0, }, lsn: lsn!("0/16E2440"), base_img: None, records: vec![ (lsn!("0/16B40A0"), pg_record(true, b"C\0\0\0\0\x04\0\0(@k\x01\0\0\0\0\x80\n\0\0\x9c$2\xb4\0`\x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\0\0\0\0\0\0\0\0\x01\0\0")), (lsn!("0/16B40E8"), pg_record(false, b"C\0\0\0\0\x04\0\0X@k\x01\0\0\0\0\0\n\0\0\x8c\xe7\xaa}\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x01\0\0\0\0\0\0\0\x02\0\0")), (lsn!("0/16B4130"), pg_record(false, b"C\0\0\0\0\x04\0\0\xa0@k\x01\0\0\0\0\0\n\0\0\xb3\xa9a\x89\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x02\0\0\0\0\0\0\0\x03\0\0")), (lsn!("0/16B4178"), pg_record(false, b"C\0\0\0\0\x04\0\0\xe8@k\x01\0\0\0\0\0\n\0\0Z\xd8\xd4W\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x03\0\0\0\0\0\0\0\x04\0\0")), (lsn!("0/16B41C0"), pg_record(false, b"C\0\0\0\0\x04\0\00Ak\x01\0\0\0\0\0\n\0\0G%L\xe1\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x04\0\0\0\0\0\0\0\x05\0\0")), (lsn!("0/16B4208"), pg_record(false, b"C\0\0\0\0\x04\0\0xAk\x01\0\0\0\0\0\n\0\0\xbf\xe2Z\xed\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x05\0\0\0\0\0\0\0\x06\0\0")), (lsn!("0/16B4250"), pg_record(false, b"C\0\0\0\0\x04\0\0\xc0Ak\x01\0\0\0\0\0\n\0\0\xcc\xcc6}\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x06\0\0\0\0\0\0\0\x07\0\0")), (lsn!("0/16B4298"), pg_record(false, b"C\0\0\0\0\x04\0\0\x08Bk\x01\0\0\0\0\0\n\0\0\xdc\t\x18v\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x07\0\0\0\0\0\0\0\x08\0\0")), (lsn!("0/16B42E0"), pg_record(false, b"C\0\0\0\0\x04\0\0PBk\x01\0\0\0\0\0\n\0\0\xe3\\\xb0U\0 
\x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x08\0\0\0\0\0\0\0\t\0\0")), (lsn!("0/16B4328"), pg_record(false, b"C\0\0\0\0\x04\0\0\x98Bk\x01\0\0\0\0\0\n\0\0\x83[\xe8\x90\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\t\0\0\0\0\0\0\0\n\0\0")), (lsn!("0/16B4370"), pg_record(false, b"C\0\0\0\0\x04\0\0\xe0Bk\x01\0\0\0\0\0\n\0\0$\xd5m\xad\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\n\0\0\0\0\0\0\0\x0b\0\0")), (lsn!("0/16B43B8"), pg_record(false, b"C\0\0\0\0\x04\0\0(Ck\x01\0\0\0\0\0\n\0\0\x94\x93\xe7-\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x0b\0\0\0\0\0\0\0\x0c\0\0")), (lsn!("0/16B4400"), pg_record(false, b"C\0\0\0\0\x04\0\0pCk\x01\0\0\0\0\0\n\0\0\xd0Y@\xc5\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x0c\0\0\0\0\0\0\0\r\0\0")), (lsn!("0/16B4448"), pg_record(false, b"C\0\0\0\0\x04\0\0\xb8Ck\x01\0\0\0\0\0\n\0\0\xb0^\x18\0\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\r\0\0\0\0\0\0\0\x0e\0\0")), (lsn!("0/16B4490"), pg_record(false, b"C\0\0\0\0\x04\0\0\0Dk\x01\0\0\0\0\0\n\0\0\x97,\x15z\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x0e\0\0\0\0\0\0\0\x0f\0\0")), (lsn!("0/16B44D8"), pg_record(false, b"C\0\0\0\0\x04\0\0HDk\x01\0\0\0\0\0\n\0\0\xfa\x04\xb1@\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x0f\0\0\0\0\0\0\0\x10\0\0")), (lsn!("0/16B4520"), pg_record(false, b"C\0\0\0\0\x04\0\0\x90Dk\x01\0\0\0\0\0\n\0\0Z\xd9\xa49\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x10\0\0\0\0\0\0\0\x11\0\0")), (lsn!("0/16B4568"), pg_record(false, b"C\0\0\0\0\x04\0\0\xd8Dk\x01\0\0\0\0\0\n\0\0\xa2\x1e\xb25\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x11\0\0\0\0\0\0\0\x12\0\0")), (lsn!("0/16B45B0"), pg_record(false, b"C\0\0\0\0\x04\0\0 
Ek\x01\0\0\0\0\0\n\0\0\\\xa7\x08V\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x12\0\0\0\0\0\0\0\x13\0\0")), (lsn!("0/16B45F8"), pg_record(false, b"C\0\0\0\0\x04\0\0hEk\x01\0\0\0\0\0\n\0\0\xb5\xd6\xbd\x88\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x13\0\0\0\0\0\0\0\x14\0\0")), (lsn!("0/16B4640"), pg_record(false, b"C\0\0\0\0\x04\0\0\xb0Ek\x01\0\0\0\0\0\n\0\0i\xdcT\xa9\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x14\0\0\0\0\0\0\0\x15\0\0")), (lsn!("0/16B4688"), pg_record(false, b"C\0\0\0\0\x04\0\0\xf8Ek\x01\0\0\0\0\0\n\0\0\x91\x1bB\xa5\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x15\0\0\0\0\0\0\0\x16\0\0")), (lsn!("0/16B46D0"), pg_record(false, b"C\0\0\0\0\x04\0\0@Fk\x01\0\0\0\0\0\n\0\0P[P\x89\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x16\0\0\0\0\0\0\0\x17\0\0")), (lsn!("0/16B4718"), pg_record(false, b"C\0\0\0\0\x04\0\0\x88Fk\x01\0\0\0\0\0\n\0\0\xf2\xf0\0>\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x17\0\0\0\0\0\0\0\x18\0\0")), (lsn!("0/16B4760"), pg_record(false, b"C\0\0\0\0\x04\0\0\xd0Fk\x01\0\0\0\0\0\n\0\0\xcd\xa5\xa8\x1d\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x18\0\0\0\0\0\0\0\x19\0\0")), (lsn!("0/16B47A8"), pg_record(false, b"C\0\0\0\0\x04\0\0\x18Gk\x01\0\0\0\0\0\n\0\0lU\x81O\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x19\0\0\0\0\0\0\0\x1a\0\0")), (lsn!("0/16B47F0"), pg_record(false, b"C\0\0\0\0\x04\0\0`Gk\x01\0\0\0\0\0\n\0\0\xcb\xdb\x04r\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x1a\0\0\0\0\0\0\0\x1b\0\0")), (lsn!("0/16B4838"), pg_record(false, b"C\0\0\0\0\x04\0\0\xa8Gk\x01\0\0\0\0\0\n\0\0\xbaj\xffe\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x1b\0\0\0\0\0\0\0\x1c\0\0")), 
(lsn!("0/16B4880"), pg_record(false, b"C\0\0\0\0\x04\0\0\xf0Gk\x01\0\0\0\0\0\n\0\0\xfe\xa0X\x8d\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x1c\0\0\0\0\0\0\0\x1d\0\0")), (lsn!("0/16B48C8"), pg_record(false, b"C\0\0\0\0\x04\0\08Hk\x01\0\0\0\0\0\n\0\0\x06\x9e_\x0e\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x1d\0\0\0\0\0\0\0\x1e\0\0")), (lsn!("0/16B4910"), pg_record(false, b"C\0\0\0\0\x04\0\0\x80Hk\x01\0\0\0\0\0\n\0\0u\xb03\x9e\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x1e\0\0\0\0\0\0\0\x1f\0\0")), (lsn!("0/16B4958"), pg_record(false, b"C\0\0\0\0\x04\0\0\xc8Hk\x01\0\0\0\0\0\n\0\0\xb6\x1e\xe3-\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x1f\0\0\0\0\0\0\0 \0\0")), (lsn!("0/16B49A0"), pg_record(false, b"C\0\0\0\0\x04\0\0\x10Ik\x01\0\0\0\0\0\n\0\0(\xd2\x8d\xe1\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0 \0\0\0\0\0\0\0!\0\0")), (lsn!("0/16B49E8"), pg_record(false, b"C\0\0\0\0\x04\0\0XIk\x01\0\0\0\0\0\n\0\0\xd0\x15\x9b\xed\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0!\0\0\0\0\0\0\0\"\0\0")), (lsn!("0/16B4A30"), pg_record(false, b"C\0\0\0\0\x04\0\0\xa0Ik\x01\0\0\0\0\0\n\0\0\xef[P\x19\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\"\0\0\0\0\0\0\0#\0\0")), (lsn!("0/16B4A78"), pg_record(false, b"C\0\0\0\0\x04\0\0\xe8Ik\x01\0\0\0\0\0\n\0\0\x06*\xe5\xc7\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0#\0\0\0\0\0\0\0$\0\0")), (lsn!("0/16B4AC0"), pg_record(false, b"C\0\0\0\0\x04\0\00Jk\x01\0\0\0\0\0\n\0\0hNrZ\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0$\0\0\0\0\0\0\0%\0\0")), (lsn!("0/16B4B08"), pg_record(false, b"C\0\0\0\0\x04\0\0xJk\x01\0\0\0\0\0\n\0\0\x90\x89dV\0 
\x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0%\0\0\0\0\0\0\0&\0\0")), (lsn!("0/16B4B50"), pg_record(false, b"C\0\0\0\0\x04\0\0\xc0Jk\x01\0\0\0\0\0\n\0\0\xe3\xa7\x08\xc6\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0&\0\0\0\0\0\0\0'\0\0")), (lsn!("0/16B4B98"), pg_record(false, b"C\0\0\0\0\x04\0\0\x08Kk\x01\0\0\0\0\0\n\0\0\x80\xfb)\xe6\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0'\0\0\0\0\0\0\0(\0\0")), (lsn!("0/16B4BE0"), pg_record(false, b"C\0\0\0\0\x04\0\0PKk\x01\0\0\0\0\0\n\0\0\xbf\xae\x81\xc5\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0(\0\0\0\0\0\0\0)\0\0")), (lsn!("0/16B4C28"), pg_record(false, b"C\0\0\0\0\x04\0\0\x98Kk\x01\0\0\0\0\0\n\0\0\xdf\xa9\xd9\0\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0)\0\0\0\0\0\0\0*\0\0")), (lsn!("0/16B4C70"), pg_record(false, b"C\0\0\0\0\x04\0\0\xe0Kk\x01\0\0\0\0\0\n\0\0x'\\=\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0*\0\0\0\0\0\0\0+\0\0")), (lsn!("0/16B4CB8"), pg_record(false, b"C\0\0\0\0\x04\0\0(Lk\x01\0\0\0\0\0\n\0\0]\xca\xc6\xc0\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0+\0\0\0\0\0\0\0,\0\0")), (lsn!("0/16B4D00"), pg_record(false, b"C\0\0\0\0\x04\0\0pLk\x01\0\0\0\0\0\n\0\0\x19\0a(\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0,\0\0\0\0\0\0\0-\0\0")), (lsn!("0/16B4D48"), pg_record(false, b"C\0\0\0\0\x04\0\0\xb8Lk\x01\0\0\0\0\0\n\0\0y\x079\xed\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0-\0\0\0\0\0\0\0.\0\0")), (lsn!("0/16B4D90"), pg_record(false, b"C\0\0\0\0\x04\0\0\0Mk\x01\0\0\0\0\0\n\0\0\xcb\xde$\xea\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0.\0\0\0\0\0\0\0/\0\0")), (lsn!("0/16B4DD8"), pg_record(false, b"C\0\0\0\0\x04\0\0HMk\x01\0\0\0\0\0\n\0\0\xa6\xf6\x80\xd0\0 
\x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0/\0\0\0\0\0\0\00\0\0")), (lsn!("0/16B4E20"), pg_record(false, b"C\0\0\0\0\x04\0\0\x90Mk\x01\0\0\0\0\0\n\0\0\x06+\x95\xa9\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\00\0\0\0\0\0\0\01\0\0")), (lsn!("0/16B4E68"), pg_record(false, b"C\0\0\0\0\x04\0\0\xd8Mk\x01\0\0\0\0\0\n\0\0\xfe\xec\x83\xa5\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\01\0\0\0\0\0\0\02\0\0")), (lsn!("0/16B4EB0"), pg_record(false, b"C\0\0\0\0\x04\0\0 Nk\x01\0\0\0\0\0\n\0\0s\xcc6\xed\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\02\0\0\0\0\0\0\03\0\0")), (lsn!("0/16B4EF8"), pg_record(false, b"C\0\0\0\0\x04\0\0hNk\x01\0\0\0\0\0\n\0\0\x9a\xbd\x833\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\03\0\0\0\0\0\0\04\0\0")), (lsn!("0/16B4F40"), pg_record(false, b"C\0\0\0\0\x04\0\0\xb0Nk\x01\0\0\0\0\0\n\0\0F\xb7j\x12\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\04\0\0\0\0\0\0\05\0\0")), (lsn!("0/16B4F88"), pg_record(false, b"C\0\0\0\0\x04\0\0\xf8Nk\x01\0\0\0\0\0\n\0\0\xbep|\x1e\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\05\0\0\0\0\0\0\06\0\0")), (lsn!("0/16B4FD0"), pg_record(false, b"C\0\0\0\0\x04\0\0@Ok\x01\0\0\0\0\0\n\0\0\x0c\xa9a\x19\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\06\0\0\0\0\0\0\07\0\0")), (lsn!("0/16B5018"), pg_record(false, b"C\0\0\0\0\x04\0\0\x88Ok\x01\0\0\0\0\0\n\0\0\xae\x021\xae\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\07\0\0\0\0\0\0\08\0\0")), (lsn!("0/16B5060"), pg_record(false, b"C\0\0\0\0\x04\0\0\xd0Ok\x01\0\0\0\0\0\n\0\0\x91W\x99\x8d\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\08\0\0\0\0\0\0\09\0\0")), (lsn!("0/16B50A8"), pg_record(false, b"C\0\0\0\0\x04\0\0\x18Pk\x01\0\0\0\0\0\n\0\0\0\xd4\x0eS\0 
\x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\09\0\0\0\0\0\0\0:\0\0")), (lsn!("0/16B50F0"), pg_record(false, b"C\0\0\0\0\x04\0\0`Pk\x01\0\0\0\0\0\n\0\0\xa7Z\x8bn\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0:\0\0\0\0\0\0\0;\0\0")), (lsn!("0/16B5138"), pg_record(false, b"C\0\0\0\0\x04\0\0\xa8Pk\x01\0\0\0\0\0\n\0\0\xd6\xebpy\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0;\0\0\0\0\0\0\0<\0\0")), (lsn!("0/16B5180"), pg_record(false, b"C\0\0\0\0\x04\0\0\xf0Pk\x01\0\0\0\0\0\n\0\0\x92!\xd7\x91\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0<\0\0\0\0\0\0\0=\0\0")), (lsn!("0/16B51C8"), pg_record(false, b"C\0\0\0\0\x04\0\08Qk\x01\0\0\0\0\0\n\0\03\xd1\xfe\xc3\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0=\0\0\0\0\0\0\0>\0\0")), (lsn!("0/16B5210"), pg_record(false, b"C\0\0\0\0\x04\0\0\x80Qk\x01\0\0\0\0\0\n\0\0@\xff\x92S\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0>\0\0\0\0\0\0\0?\0\0")), (lsn!("0/16B5258"), pg_record(false, b"C\0\0\0\0\x04\0\0\xc8Qk\x01\0\0\0\0\0\n\0\0.*G\xf7\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0?\0\0\0\0\0\0\0@\0\0")), (lsn!("0/16B52A0"), pg_record(false, b"C\0\0\0\0\x04\0\0\x10Rk\x01\0\0\0\0\0\n\0\0=\xb23T\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0@\0\0\0\0\0\0\0A\0\0")), (lsn!("0/16B52E8"), pg_record(false, b"C\0\0\0\0\x04\0\0XRk\x01\0\0\0\0\0\n\0\0\xc5u%X\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0A\0\0\0\0\0\0\0B\0\0")), (lsn!("0/16B5330"), pg_record(false, b"C\0\0\0\0\x04\0\0\xa0Rk\x01\0\0\0\0\0\n\0\0\xfa;\xee\xac\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0B\0\0\0\0\0\0\0C\0\0")), (lsn!("0/16B5378"), pg_record(false, b"C\0\0\0\0\x04\0\0\xe8Rk\x01\0\0\0\0\0\n\0\0\x13J[r\0 
\x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0C\0\0\0\0\0\0\0D\0\0")), (lsn!("0/16B53C0"), pg_record(false, b"C\0\0\0\0\x04\0\00Sk\x01\0\0\0\0\0\n\0\0\x0e\xb7\xc3\xc4\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0D\0\0\0\0\0\0\0E\0\0")), (lsn!("0/16B5408"), pg_record(false, b"C\0\0\0\0\x04\0\0xSk\x01\0\0\0\0\0\n\0\0\xf6p\xd5\xc8\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0E\0\0\0\0\0\0\0F\0\0")), (lsn!("0/16B5450"), pg_record(false, b"C\0\0\0\0\x04\0\0\xc0Sk\x01\0\0\0\0\0\n\0\0\x85^\xb9X\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0F\0\0\0\0\0\0\0G\0\0")), (lsn!("0/16B5498"), pg_record(false, b"C\0\0\0\0\x04\0\0\x08Tk\x01\0\0\0\0\0\n\0\0s\xa9\x88\x05\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0G\0\0\0\0\0\0\0H\0\0")), (lsn!("0/16B54E0"), pg_record(false, b"C\0\0\0\0\x04\0\0PTk\x01\0\0\0\0\0\n\0\0L\xfc &\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0H\0\0\0\0\0\0\0I\0\0")), (lsn!("0/16B5528"), pg_record(false, b"C\0\0\0\0\x04\0\0\x98Tk\x01\0\0\0\0\0\n\0\0,\xfbx\xe3\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0I\0\0\0\0\0\0\0J\0\0")), (lsn!("0/16B5570"), pg_record(false, b"C\0\0\0\0\x04\0\0\xe0Tk\x01\0\0\0\0\0\n\0\0\x8bu\xfd\xde\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0J\0\0\0\0\0\0\0K\0\0")), (lsn!("0/16B55B8"), pg_record(false, b"C\0\0\0\0\x04\0\0(Uk\x01\0\0\0\0\0\n\0\0;3w^\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0K\0\0\0\0\0\0\0L\0\0")), (lsn!("0/16B5600"), pg_record(false, b"C\0\0\0\0\x04\0\0pUk\x01\0\0\0\0\0\n\0\0\x7f\xf9\xd0\xb6\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0L\0\0\0\0\0\0\0M\0\0")), (lsn!("0/16B5648"), pg_record(false, b"C\0\0\0\0\x04\0\0\xb8Uk\x01\0\0\0\0\0\n\0\0\x1f\xfe\x88s\0 
\x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0M\0\0\0\0\0\0\0N\0\0")), (lsn!("0/16B5690"), pg_record(false, b"C\0\0\0\0\x04\0\0\0Vk\x01\0\0\0\0\0\n\0\0\xde\xbe\x9a_\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0N\0\0\0\0\0\0\0O\0\0")), (lsn!("0/16B56D8"), pg_record(false, b"C\0\0\0\0\x04\0\0HVk\x01\0\0\0\0\0\n\0\0\xb3\x96>e\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0O\0\0\0\0\0\0\0P\0\0")), (lsn!("0/16B5720"), pg_record(false, b"C\0\0\0\0\x04\0\0\x90Vk\x01\0\0\0\0\0\n\0\0\x13K+\x1c\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0P\0\0\0\0\0\0\0Q\0\0")), (lsn!("0/16B5768"), pg_record(false, b"C\0\0\0\0\x04\0\0\xd8Vk\x01\0\0\0\0\0\n\0\0\xeb\x8c=\x10\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0Q\0\0\0\0\0\0\0R\0\0")), (lsn!("0/16B57B0"), pg_record(false, b"C\0\0\0\0\x04\0\0 Wk\x01\0\0\0\0\0\n\0\0\x155\x87s\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0R\0\0\0\0\0\0\0S\0\0")), (lsn!("0/16B57F8"), pg_record(false, b"C\0\0\0\0\x04\0\0hWk\x01\0\0\0\0\0\n\0\0\xfcD2\xad\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0S\0\0\0\0\0\0\0T\0\0")), (lsn!("0/16B5840"), pg_record(false, b"C\0\0\0\0\x04\0\0\xb0Wk\x01\0\0\0\0\0\n\0\0 N\xdb\x8c\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0T\0\0\0\0\0\0\0U\0\0")), (lsn!("0/16B5888"), pg_record(false, b"C\0\0\0\0\x04\0\0\xf8Wk\x01\0\0\0\0\0\n\0\0\xd8\x89\xcd\x80\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0U\0\0\0\0\0\0\0V\0\0")), (lsn!("0/16B58D0"), pg_record(false, b"C\0\0\0\0\x04\0\0@Xk\x01\0\0\0\0\0\n\0\03\x9e\xfeV\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0V\0\0\0\0\0\0\0W\0\0")), (lsn!("0/16B5918"), pg_record(false, b"C\0\0\0\0\x04\0\0\x88Xk\x01\0\0\0\0\0\n\0\0\x915\xae\xe1\0 
\x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0W\0\0\0\0\0\0\0X\0\0")), (lsn!("0/16B5960"), pg_record(false, b"C\0\0\0\0\x04\0\0\xd0Xk\x01\0\0\0\0\0\n\0\0\xae`\x06\xc2\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0X\0\0\0\0\0\0\0Y\0\0")), (lsn!("0/16B59A8"), pg_record(false, b"C\0\0\0\0\x04\0\0\x18Yk\x01\0\0\0\0\0\n\0\0\x0f\x90/\x90\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0Y\0\0\0\0\0\0\0Z\0\0")), (lsn!("0/16B59F0"), pg_record(false, b"C\0\0\0\0\x04\0\0`Yk\x01\0\0\0\0\0\n\0\0\xa8\x1e\xaa\xad\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0Z\0\0\0\0\0\0\0[\0\0")), (lsn!("0/16B5A38"), pg_record(false, b"C\0\0\0\0\x04\0\0\xa8Yk\x01\0\0\0\0\0\n\0\0\xd9\xafQ\xba\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0[\0\0\0\0\0\0\0\\\0\0")), (lsn!("0/16B5A80"), pg_record(false, b"C\0\0\0\0\x04\0\0\xf0Yk\x01\0\0\0\0\0\n\0\0\x9de\xf6R\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\\\0\0\0\0\0\0\0]\0\0")), (lsn!("0/16B5AC8"), pg_record(false, b"C\0\0\0\0\x04\0\08Zk\x01\0\0\0\0\0\n\0\0O\x0c\xd0+\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0]\0\0\0\0\0\0\0^\0\0")), (lsn!("0/16B5B10"), pg_record(false, b"C\0\0\0\0\x04\0\0\x80Zk\x01\0\0\0\0\0\n\0\0<\"\xbc\xbb\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0^\0\0\0\0\0\0\0_\0\0")), (lsn!("0/16B5B58"), pg_record(false, b"C\0\0\0\0\x04\0\0\xc8Zk\x01\0\0\0\0\0\n\0\0\xff\x8cl\x08\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0_\0\0\0\0\0\0\0`\0\0")), (lsn!("0/16B5BA0"), pg_record(false, b"C\0\0\0\0\x04\0\0\x10[k\x01\0\0\0\0\0\n\0\0a@\x02\xc4\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0`\0\0\0\0\0\0\0a\0\0")), (lsn!("0/16B5BE8"), pg_record(false, b"C\0\0\0\0\x04\0\0X[k\x01\0\0\0\0\0\n\0\0\x99\x87\x14\xc8\0 
\x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0a\0\0\0\0\0\0\0b\0\0")), (lsn!("0/16B5C30"), pg_record(false, b"C\0\0\0\0\x04\0\0\xa0[k\x01\0\0\0\0\0\n\0\0\xa6\xc9\xdf<\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0b\0\0\0\0\0\0\0c\0\0")), (lsn!("0/16B5C78"), pg_record(false, b"C\0\0\0\0\x04\0\0\xe8[k\x01\0\0\0\0\0\n\0\0O\xb8j\xe2\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0c\0\0\0\0\0\0\0d\0\0")), (lsn!("0/16B5CC0"), pg_record(false, b"C\0\0\0\0\x04\0\00\\k\x01\0\0\0\0\0\n\0\0\xc7\xee\xe2)\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0d\0\0\0\0\0\0\0e\0\0")), (lsn!("0/16B5D08"), pg_record(false, b"C\0\0\0\0\x04\0\0x\\k\x01\0\0\0\0\0\n\0\0?)\xf4%\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0e\0\0\0\0\0\0\0f\0\0")), (lsn!("0/16B5D50"), pg_record(false, b"C\0\0\0\0\x04\0\0\xc0\\k\x01\0\0\0\0\0\n\0\0L\x07\x98\xb5\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0f\0\0\0\0\0\0\0g\0\0")), (lsn!("0/16B5D98"), pg_record(false, b"C\0\0\0\0\x04\0\0\x08]k\x01\0\0\0\0\0\n\0\0/[\xb9\x95\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0g\0\0\0\0\0\0\0h\0\0")), (lsn!("0/16B5DE0"), pg_record(false, b"C\0\0\0\0\x04\0\0P]k\x01\0\0\0\0\0\n\0\0\x10\x0e\x11\xb6\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0h\0\0\0\0\0\0\0i\0\0")), (lsn!("0/16B5E28"), pg_record(false, b"C\0\0\0\0\x04\0\0\x98]k\x01\0\0\0\0\0\n\0\0p\tIs\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0i\0\0\0\0\0\0\0j\0\0")), (lsn!("0/16B5E70"), pg_record(false, b"C\0\0\0\0\x04\0\0\xe0]k\x01\0\0\0\0\0\n\0\0\xd7\x87\xccN\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0j\0\0\0\0\0\0\0k\0\0")), (lsn!("0/16B5EB8"), pg_record(false, b"C\0\0\0\0\x04\0\0(^k\x01\0\0\0\0\0\n\0\0\x14XI\xe5\0 
\x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0k\0\0\0\0\0\0\0l\0\0")), (lsn!("0/16B5F00"), pg_record(false, b"C\0\0\0\0\x04\0\0p^k\x01\0\0\0\0\0\n\0\0P\x92\xee\r\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0l\0\0\0\0\0\0\0m\0\0")), (lsn!("0/16B5F48"), pg_record(false, b"C\0\0\0\0\x04\0\0\xb8^k\x01\0\0\0\0\0\n\0\00\x95\xb6\xc8\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0m\0\0\0\0\0\0\0n\0\0")), (lsn!("0/16B5F90"), pg_record(false, b"C\0\0\0\0\x04\0\0\0_k\x01\0\0\0\0\0\n\0\0\x82L\xab\xcf\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0n\0\0\0\0\0\0\0o\0\0")), (lsn!("0/16B5FD8"), pg_record(false, b"C\0\0\0\0\x04\0\0H_k\x01\0\0\0\0\0\n\0\0\xefd\x0f\xf5\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0o\0\0\0\0\0\0\0p\0\0")), (lsn!("0/16B6038"), pg_record(false, b"C\0\0\0\0\x04\0\0\x90_k\x01\0\0\0\0\0\n\0\0O\xb9\x1a\x8c\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0p\0\0\0\0\0\0\0q\0\0")), (lsn!("0/16B6080"), pg_record(false, b"C\0\0\0\0\x04\0\0\xd8_k\x01\0\0\0\0\0\n\0\0\xb7~\x0c\x80\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0q\0\0\0\0\0\0\0r\0\0")), (lsn!("0/16B60C8"), pg_record(false, b"C\0\0\0\0\x04\0\08`k\x01\0\0\0\0\0\n\0\0\xc9\xc1bC\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0r\0\0\0\0\0\0\0s\0\0")), (lsn!("0/16B6110"), pg_record(false, b"C\0\0\0\0\x04\0\0\x80`k\x01\0\0\0\0\0\n\0\0\xc1xD\x1b\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0s\0\0\0\0\0\0\0t\0\0")), (lsn!("0/16B6158"), pg_record(false, b"C\0\0\0\0\x04\0\0\xc8`k\x01\0\0\0\0\0\n\0\0\x96j\xca\xea\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0t\0\0\0\0\0\0\0u\0\0")), (lsn!("0/16B61A0"), pg_record(false, b"C\0\0\0\0\x04\0\0\x10ak\x01\0\0\0\0\0\n\0\0$B\xca\xa1\0 
\x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0u\0\0\0\0\0\0\0v\0\0")), (lsn!("0/16B61E8"), pg_record(false, b"C\0\0\0\0\x04\0\0Xak\x01\0\0\0\0\0\n\0\0\xb6\xa45\xb7\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0v\0\0\0\0\0\0\0w\0\0")), (lsn!("0/16B6230"), pg_record(false, b"C\0\0\0\0\x04\0\0\xa0ak\x01\0\0\0\0\0\n\0\0!g\x1f+\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0w\0\0\0\0\0\0\0x\0\0")), (lsn!("0/16B6278"), pg_record(false, b"C\0\0\0\0\x04\0\0\xe8ak\x01\0\0\0\0\0\n\0\0\r\xea\x9e\x11\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0x\0\0\0\0\0\0\0y\0\0")), (lsn!("0/16B62C0"), pg_record(false, b"C\0\0\0\0\x04\0\00bk\x01\0\0\0\0\0\n\0\0\xcc[\x91q\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0y\0\0\0\0\0\0\0z\0\0")), (lsn!("0/16B6308"), pg_record(false, b"C\0\0\0\0\x04\0\0xbk\x01\0\0\0\0\0\n\0\0^\xbdng\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0z\0\0\0\0\0\0\0{\0\0")), (lsn!("0/16B6350"), pg_record(false, b"C\0\0\0\0\x04\0\0\xc0bk\x01\0\0\0\0\0\n\0\0V\x04H?\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0{\0\0\0\0\0\0\0|\0\0")), (lsn!("0/16B6398"), pg_record(false, b"C\0\0\0\0\x04\0\0\x08ck\x01\0\0\0\0\0\n\0\0X!\xf9\x90\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0|\0\0\0\0\0\0\0}\0\0")), (lsn!("0/16B63E0"), pg_record(false, b"C\0\0\0\0\x04\0\0Pck\x01\0\0\0\0\0\n\0\0\xb3>\xc6\x85\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0}\0\0\0\0\0\0\0~\0\0")), (lsn!("0/16B6428"), pg_record(false, b"C\0\0\0\0\x04\0\0\x98ck\x01\0\0\0\0\0\n\0\0\xb9\x18wZ\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0~\0\0\0\0\0\0\0\x7f\0\0")), (lsn!("0/16B6470"), pg_record(false, b"C\0\0\0\0\x04\0\0\xe0ck\x01\0\0\0\0\0\n\0\0\xb8R\xd2\xfb\0 
\x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x7f\0\0\0\0\0\0\0\x80\0\0")), (lsn!("0/16B64B8"), pg_record(false, b"C\0\0\0\0\x04\0\0(dk\x01\0\0\0\0\0\n\0\0\xa2\xbb\xbb\x9f\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x80\0\0\0\0\0\0\0\x81\0\0")), (lsn!("0/16B6500"), pg_record(false, b"C\0\0\0\0\x04\0\0pdk\x01\0\0\0\0\0\n\0\0I\xa4\x84\x8a\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x81\0\0\0\0\0\0\0\x82\0\0")), (lsn!("0/16B6548"), pg_record(false, b"C\0\0\0\0\x04\0\0\xb8dk\x01\0\0\0\0\0\n\0\0C\x825U\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x82\0\0\0\0\0\0\0\x83\0\0")), (lsn!("0/16B6590"), pg_record(false, b"C\0\0\0\0\x04\0\0\0ek\x01\0\0\0\0\0\n\0\0\x8a\xccb\x9a\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x83\0\0\0\0\0\0\0\x84\0\0")), (lsn!("0/16B65D8"), pg_record(false, b"C\0\0\0\0\x04\0\0Hek\x01\0\0\0\0\0\n\0\0\xdd\xde\xeck\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x84\0\0\0\0\0\0\0\x85\0\0")), (lsn!("0/16B6620"), pg_record(false, b"C\0\0\0\0\x04\0\0\x90ek\x01\0\0\0\0\0\n\0\0\xae\x01\x9d\xb7\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x85\0\0\0\0\0\0\0\x86\0\0")), (lsn!("0/16B6668"), pg_record(false, b"C\0\0\0\0\x04\0\0\xd8ek\x01\0\0\0\0\0\n\0\0<\xe7b\xa1\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x86\0\0\0\0\0\0\0\x87\0\0")), (lsn!("0/16B66B0"), pg_record(false, b"C\0\0\0\0\x04\0\0 fk\x01\0\0\0\0\0\n\0\0\x19J6\x81\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x87\0\0\0\0\0\0\0\x88\0\0")), (lsn!("0/16B66F8"), pg_record(false, b"C\0\0\0\0\x04\0\0hfk\x01\0\0\0\0\0\n\0\05\xc7\xb7\xbb\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x88\0\0\0\0\0\0\0\x89\0\0")), (lsn!("0/16B6740"), pg_record(false, 
b"C\0\0\0\0\x04\0\0\xb0fk\x01\0\0\0\0\0\n\0\0F\x18\xc6g\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x89\0\0\0\0\0\0\0\x8a\0\0")), (lsn!("0/16B6788"), pg_record(false, b"C\0\0\0\0\x04\0\0\xf8fk\x01\0\0\0\0\0\n\0\0\xd4\xfe9q\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x8a\0\0\0\0\0\0\0\x8b\0\0")), (lsn!("0/16B67D0"), pg_record(false, b"C\0\0\0\0\x04\0\0@gk\x01\0\0\0\0\0\n\0\0\x1d\xb0n\xbe\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x8b\0\0\0\0\0\0\0\x8c\0\0")), (lsn!("0/16B6818"), pg_record(false, b"C\0\0\0\0\x04\0\0\x88gk\x01\0\0\0\0\0\n\0\0\xd2b\xae\x86\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x8c\0\0\0\0\0\0\0\x8d\0\0")), (lsn!("0/16B6860"), pg_record(false, b"C\0\0\0\0\x04\0\0\xd0gk\x01\0\0\0\0\0\n\0\09}\x91\x93\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x8d\0\0\0\0\0\0\0\x8e\0\0")), (lsn!("0/16B68A8"), pg_record(false, b"C\0\0\0\0\x04\0\0\x18hk\x01\0\0\0\0\0\n\0\0\xabb\x7f\n\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x8e\0\0\0\0\0\0\0\x8f\0\0")), (lsn!("0/16B68F0"), pg_record(false, b"C\0\0\0\0\x04\0\0`hk\x01\0\0\0\0\0\n\0\0\xf3\"\xa1\x1b\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x8f\0\0\0\0\0\0\0\x90\0\0")), (lsn!("0/16B6938"), pg_record(false, b"C\0\0\0\0\x04\0\0\xa8hk\x01\0\0\0\0\0\n\0\0@'\x9d{\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x90\0\0\0\0\0\0\0\x91\0\0")), (lsn!("0/16B6980"), pg_record(false, b"C\0\0\0\0\x04\0\0\xf0hk\x01\0\0\0\0\0\n\0\0\xab8\xa2n\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x91\0\0\0\0\0\0\0\x92\0\0")), (lsn!("0/16B69C8"), pg_record(false, b"C\0\0\0\0\x04\0\08ik\x01\0\0\0\0\0\n\0\0`\xe9b&\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x92\0\0\0\0\0\0\0\x93\0\0")), 
(lsn!("0/16B6A10"), pg_record(false, b"C\0\0\0\0\x04\0\0\x80ik\x01\0\0\0\0\0\n\0\0hPD~\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x93\0\0\0\0\0\0\0\x94\0\0")), (lsn!("0/16B6A58"), pg_record(false, b"C\0\0\0\0\x04\0\0\xc8ik\x01\0\0\0\0\0\n\0\0?B\xca\x8f\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x94\0\0\0\0\0\0\0\x95\0\0")), (lsn!("0/16B6AA0"), pg_record(false, b"C\0\0\0\0\x04\0\0\x10jk\x01\0\0\0\0\0\n\0\0\xfe\xf3\xc5\xef\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x95\0\0\0\0\0\0\0\x96\0\0")), (lsn!("0/16B6AE8"), pg_record(false, b"C\0\0\0\0\x04\0\0Xjk\x01\0\0\0\0\0\n\0\0l\x15:\xf9\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x96\0\0\0\0\0\0\0\x97\0\0")), (lsn!("0/16B6B30"), pg_record(false, b"C\0\0\0\0\x04\0\0\xa0jk\x01\0\0\0\0\0\n\0\0\xfb\xd6\x10e\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x97\0\0\0\0\0\0\0\x98\0\0")), (lsn!("0/16B6B78"), pg_record(false, b"C\0\0\0\0\x04\0\0\xe8jk\x01\0\0\0\0\0\n\0\0\xd7[\x91_\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x98\0\0\0\0\0\0\0\x99\0\0")), (lsn!("0/16B6BC0"), pg_record(false, b"C\0\0\0\0\x04\0\00kk\x01\0\0\0\0\0\n\0\0es\x91\x14\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x99\0\0\0\0\0\0\0\x9a\0\0")), (lsn!("0/16B6C08"), pg_record(false, b"C\0\0\0\0\x04\0\0xkk\x01\0\0\0\0\0\n\0\0\xf7\x95n\x02\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x9a\0\0\0\0\0\0\0\x9b\0\0")), (lsn!("0/16B6C50"), pg_record(false, b"C\0\0\0\0\x04\0\0\xc0kk\x01\0\0\0\0\0\n\0\0\xff,HZ\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x9b\0\0\0\0\0\0\0\x9c\0\0")), (lsn!("0/16B6C98"), pg_record(false, b"C\0\0\0\0\x04\0\0\x08lk\x01\0\0\0\0\0\n\0\0d\xa2\xe9\x88\0 
\x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x9c\0\0\0\0\0\0\0\x9d\0\0")), (lsn!("0/16B6CE0"), pg_record(false, b"C\0\0\0\0\x04\0\0Plk\x01\0\0\0\0\0\n\0\0\x8f\xbd\xd6\x9d\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x9d\0\0\0\0\0\0\0\x9e\0\0")), (lsn!("0/16B6D28"), pg_record(false, b"C\0\0\0\0\x04\0\0\x98lk\x01\0\0\0\0\0\n\0\0\x85\x9bgB\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x9e\0\0\0\0\0\0\0\x9f\0\0")), (lsn!("0/16B6D70"), pg_record(false, b"C\0\0\0\0\x04\0\0\xe0lk\x01\0\0\0\0\0\n\0\0s]\xcd\xda\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\x9f\0\0\0\0\0\0\0\xa0\0\0")), (lsn!("0/16B6DB8"), pg_record(false, b"C\0\0\0\0\x04\0\0(mk\x01\0\0\0\0\0\n\0\0\xfeI\x8a\x0f\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xa0\0\0\0\0\0\0\0\xa1\0\0")), (lsn!("0/16B6E00"), pg_record(false, b"C\0\0\0\0\x04\0\0pmk\x01\0\0\0\0\0\n\0\0\x15V\xb5\x1a\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xa1\0\0\0\0\0\0\0\xa2\0\0")), (lsn!("0/16B6E48"), pg_record(false, b"C\0\0\0\0\x04\0\0\xb8mk\x01\0\0\0\0\0\n\0\0\x1fp\x04\xc5\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xa2\0\0\0\0\0\0\0\xa3\0\0")), (lsn!("0/16B6E90"), pg_record(false, b"C\0\0\0\0\x04\0\0\0nk\x01\0\0\0\0\0\n\0\0\xa5\xa7\\!\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xa3\0\0\0\0\0\0\0\xa4\0\0")), (lsn!("0/16B6ED8"), pg_record(false, b"C\0\0\0\0\x04\0\0Hnk\x01\0\0\0\0\0\n\0\0\xf2\xb5\xd2\xd0\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xa4\0\0\0\0\0\0\0\xa5\0\0")), (lsn!("0/16B6F20"), pg_record(false, b"C\0\0\0\0\x04\0\0\x90nk\x01\0\0\0\0\0\n\0\0\x81j\xa3\x0c\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xa5\0\0\0\0\0\0\0\xa6\0\0")), (lsn!("0/16B6F68"), pg_record(false, 
b"C\0\0\0\0\x04\0\0\xd8nk\x01\0\0\0\0\0\n\0\0\x13\x8c\\\x1a\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xa6\0\0\0\0\0\0\0\xa7\0\0")), (lsn!("0/16B6FB0"), pg_record(false, b"C\0\0\0\0\x04\0\0 ok\x01\0\0\0\0\0\n\0\0E\xb8\x07\x11\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xa7\0\0\0\0\0\0\0\xa8\0\0")), (lsn!("0/16B6FF8"), pg_record(false, b"C\0\0\0\0\x04\0\0hok\x01\0\0\0\0\0\n\0\0i5\x86+\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xa8\0\0\0\0\0\0\0\xa9\0\0")), (lsn!("0/16B7040"), pg_record(false, b"C\0\0\0\0\x04\0\0\xb0ok\x01\0\0\0\0\0\n\0\0\x1a\xea\xf7\xf7\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xa9\0\0\0\0\0\0\0\xaa\0\0")), (lsn!("0/16B7088"), pg_record(false, b"C\0\0\0\0\x04\0\0\xf8ok\x01\0\0\0\0\0\n\0\0\x88\x0c\x08\xe1\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xaa\0\0\0\0\0\0\0\xab\0\0")), (lsn!("0/16B70D0"), pg_record(false, b"C\0\0\0\0\x04\0\0@pk\x01\0\0\0\0\0\n\0\0q1\xe1\xa2\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xab\0\0\0\0\0\0\0\xac\0\0")), (lsn!("0/16B7118"), pg_record(false, b"C\0\0\0\0\x04\0\0\x88pk\x01\0\0\0\0\0\n\0\0\xbe\xe3!\x9a\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xac\0\0\0\0\0\0\0\xad\0\0")), (lsn!("0/16B7160"), pg_record(false, b"C\0\0\0\0\x04\0\0\xd0pk\x01\0\0\0\0\0\n\0\0U\xfc\x1e\x8f\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xad\0\0\0\0\0\0\0\xae\0\0")), (lsn!("0/16B71A8"), pg_record(false, b"C\0\0\0\0\x04\0\0\x18qk\x01\0\0\0\0\0\n\0\0\x9e-\xde\xc7\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xae\0\0\0\0\0\0\0\xaf\0\0")), (lsn!("0/16B71F0"), pg_record(false, b"C\0\0\0\0\x04\0\0`qk\x01\0\0\0\0\0\n\0\0\xc6m\0\xd6\0 
\x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xaf\0\0\0\0\0\0\0\xb0\0\0")), (lsn!("0/16B7238"), pg_record(false, b"C\0\0\0\0\x04\0\0\xa8qk\x01\0\0\0\0\0\n\0\0uh<\xb6\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xb0\0\0\0\0\0\0\0\xb1\0\0")), (lsn!("0/16B7280"), pg_record(false, b"C\0\0\0\0\x04\0\0\xf0qk\x01\0\0\0\0\0\n\0\0\x9ew\x03\xa3\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xb1\0\0\0\0\0\0\0\xb2\0\0")), (lsn!("0/16B72C8"), pg_record(false, b"C\0\0\0\0\x04\0\08rk\x01\0\0\0\0\0\n\0\0&?\xcc\xc0\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xb2\0\0\0\0\0\0\0\xb3\0\0")), (lsn!("0/16B7310"), pg_record(false, b"C\0\0\0\0\x04\0\0\x80rk\x01\0\0\0\0\0\n\0\0.\x86\xea\x98\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xb3\0\0\0\0\0\0\0\xb4\0\0")), (lsn!("0/16B7358"), pg_record(false, b"C\0\0\0\0\x04\0\0\xc8rk\x01\0\0\0\0\0\n\0\0y\x94di\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xb4\0\0\0\0\0\0\0\xb5\0\0")), (lsn!("0/16B73A0"), pg_record(false, b"C\0\0\0\0\x04\0\0\x10sk\x01\0\0\0\0\0\n\0\0\xcb\xbcd\"\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xb5\0\0\0\0\0\0\0\xb6\0\0")), (lsn!("0/16B73E8"), pg_record(false, b"C\0\0\0\0\x04\0\0Xsk\x01\0\0\0\0\0\n\0\0YZ\x9b4\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xb6\0\0\0\0\0\0\0\xb7\0\0")), (lsn!("0/16B7430"), pg_record(false, b"C\0\0\0\0\x04\0\0\xa0sk\x01\0\0\0\0\0\n\0\0\xce\x99\xb1\xa8\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xb7\0\0\0\0\0\0\0\xb8\0\0")), (lsn!("0/16B7478"), pg_record(false, b"C\0\0\0\0\x04\0\0\xe8sk\x01\0\0\0\0\0\n\0\0\xe2\x140\x92\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xb8\0\0\0\0\0\0\0\xb9\0\0")), (lsn!("0/16B74C0"), pg_record(false, 
b"C\0\0\0\0\x04\0\00tk\x01\0\0\0\0\0\n\0\0\xc5\x97 \xa4\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xb9\0\0\0\0\0\0\0\xba\0\0")), (lsn!("0/16B7508"), pg_record(false, b"C\0\0\0\0\x04\0\0xtk\x01\0\0\0\0\0\n\0\0Wq\xdf\xb2\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xba\0\0\0\0\0\0\0\xbb\0\0")), (lsn!("0/16B7550"), pg_record(false, b"C\0\0\0\0\x04\0\0\xc0tk\x01\0\0\0\0\0\n\0\0_\xc8\xf9\xea\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xbb\0\0\0\0\0\0\0\xbc\0\0")), (lsn!("0/16B7598"), pg_record(false, b"C\0\0\0\0\x04\0\0\x08uk\x01\0\0\0\0\0\n\0\0Q\xedHE\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xbc\0\0\0\0\0\0\0\xbd\0\0")), (lsn!("0/16B75E0"), pg_record(false, b"C\0\0\0\0\x04\0\0Puk\x01\0\0\0\0\0\n\0\0\xba\xf2wP\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xbd\0\0\0\0\0\0\0\xbe\0\0")), (lsn!("0/16B7628"), pg_record(false, b"C\0\0\0\0\x04\0\0\x98uk\x01\0\0\0\0\0\n\0\0\xb0\xd4\xc6\x8f\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xbe\0\0\0\0\0\0\0\xbf\0\0")), (lsn!("0/16B7670"), pg_record(false, b"C\0\0\0\0\x04\0\0\xe0uk\x01\0\0\0\0\0\n\0\0\xebii\0\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xbf\0\0\0\0\0\0\0\xc0\0\0")), (lsn!("0/16B76B8"), pg_record(false, b"C\0\0\0\0\x04\0\0(vk\x01\0\0\0\0\0\n\0\0\xeb)4\xba\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xc0\0\0\0\0\0\0\0\xc1\0\0")), (lsn!("0/16B7700"), pg_record(false, b"C\0\0\0\0\x04\0\0pvk\x01\0\0\0\0\0\n\0\0\06\x0b\xaf\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xc1\0\0\0\0\0\0\0\xc2\0\0")), (lsn!("0/16B7748"), pg_record(false, b"C\0\0\0\0\x04\0\0\xb8vk\x01\0\0\0\0\0\n\0\0\n\x10\xbap\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xc2\0\0\0\0\0\0\0\xc3\0\0")), 
(lsn!("0/16B7790"), pg_record(false, b"C\0\0\0\0\x04\0\0\0wk\x01\0\0\0\0\0\n\0\0\xc3^\xed\xbf\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xc3\0\0\0\0\0\0\0\xc4\0\0")), (lsn!("0/16B77D8"), pg_record(false, b"C\0\0\0\0\x04\0\0Hwk\x01\0\0\0\0\0\n\0\0\x94LcN\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xc4\0\0\0\0\0\0\0\xc5\0\0")), (lsn!("0/16B7820"), pg_record(false, b"C\0\0\0\0\x04\0\0\x90wk\x01\0\0\0\0\0\n\0\0\xe7\x93\x12\x92\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xc5\0\0\0\0\0\0\0\xc6\0\0")), (lsn!("0/16B7868"), pg_record(false, b"C\0\0\0\0\x04\0\0\xd8wk\x01\0\0\0\0\0\n\0\0uu\xed\x84\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xc6\0\0\0\0\0\0\0\xc7\0\0")), (lsn!("0/16B78B0"), pg_record(false, b"C\0\0\0\0\x04\0\0 xk\x01\0\0\0\0\0\n\0\0z\x8f\x98^\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xc7\0\0\0\0\0\0\0\xc8\0\0")), (lsn!("0/16B78F8"), pg_record(false, b"C\0\0\0\0\x04\0\0hxk\x01\0\0\0\0\0\n\0\0V\x02\x19d\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xc8\0\0\0\0\0\0\0\xc9\0\0")), (lsn!("0/16B7940"), pg_record(false, b"C\0\0\0\0\x04\0\0\xb0xk\x01\0\0\0\0\0\n\0\0%\xddh\xb8\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xc9\0\0\0\0\0\0\0\xca\0\0")), (lsn!("0/16B7988"), pg_record(false, b"C\0\0\0\0\x04\0\0\xf8xk\x01\0\0\0\0\0\n\0\0\xb7;\x97\xae\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xca\0\0\0\0\0\0\0\xcb\0\0")), (lsn!("0/16B79D0"), pg_record(false, b"C\0\0\0\0\x04\0\0@yk\x01\0\0\0\0\0\n\0\0~u\xc0a\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xcb\0\0\0\0\0\0\0\xcc\0\0")), (lsn!("0/16B7A18"), pg_record(false, b"C\0\0\0\0\x04\0\0\x88yk\x01\0\0\0\0\0\n\0\0\xb1\xa7\0Y\0 
\x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xcc\0\0\0\0\0\0\0\xcd\0\0")), (lsn!("0/16B7A60"), pg_record(false, b"C\0\0\0\0\x04\0\0\xd0yk\x01\0\0\0\0\0\n\0\0Z\xb8?L\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xcd\0\0\0\0\0\0\0\xce\0\0")), (lsn!("0/16B7AA8"), pg_record(false, b"C\0\0\0\0\x04\0\0\x18zk\x01\0\0\0\0\0\n\0\0\xe2\xf0\xf0/\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xce\0\0\0\0\0\0\0\xcf\0\0")), (lsn!("0/16B7AF0"), pg_record(false, b"C\0\0\0\0\x04\0\0`zk\x01\0\0\0\0\0\n\0\0\xba\xb0.>\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xcf\0\0\0\0\0\0\0\xd0\0\0")), (lsn!("0/16B7B38"), pg_record(false, b"C\0\0\0\0\x04\0\0\xa8zk\x01\0\0\0\0\0\n\0\0\t\xb5\x12^\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xd0\0\0\0\0\0\0\0\xd1\0\0")), (lsn!("0/16B7B80"), pg_record(false, b"C\0\0\0\0\x04\0\0\xf0zk\x01\0\0\0\0\0\n\0\0\xe2\xaa-K\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xd1\0\0\0\0\0\0\0\xd2\0\0")), (lsn!("0/16B7BC8"), pg_record(false, b"C\0\0\0\0\x04\0\08{k\x01\0\0\0\0\0\n\0\0){\xed\x03\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xd2\0\0\0\0\0\0\0\xd3\0\0")), (lsn!("0/16B7C10"), pg_record(false, b"C\0\0\0\0\x04\0\0\x80{k\x01\0\0\0\0\0\n\0\0!\xc2\xcb[\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xd3\0\0\0\0\0\0\0\xd4\0\0")), (lsn!("0/16B7C58"), pg_record(false, b"C\0\0\0\0\x04\0\0\xc8{k\x01\0\0\0\0\0\n\0\0v\xd0E\xaa\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xd4\0\0\0\0\0\0\0\xd5\0\0")), (lsn!("0/16B7CA0"), pg_record(false, b"C\0\0\0\0\x04\0\0\x10|k\x01\0\0\0\0\0\n\0\0QSU\x9c\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xd5\0\0\0\0\0\0\0\xd6\0\0")), (lsn!("0/16B7CE8"), pg_record(false, 
b"C\0\0\0\0\x04\0\0X|k\x01\0\0\0\0\0\n\0\0\xc3\xb5\xaa\x8a\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xd6\0\0\0\0\0\0\0\xd7\0\0")), (lsn!("0/16B7D30"), pg_record(false, b"C\0\0\0\0\x04\0\0\xa0|k\x01\0\0\0\0\0\n\0\0Tv\x80\x16\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xd7\0\0\0\0\0\0\0\xd8\0\0")), (lsn!("0/16B7D78"), pg_record(false, b"C\0\0\0\0\x04\0\0\xe8|k\x01\0\0\0\0\0\n\0\0x\xfb\x01,\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xd8\0\0\0\0\0\0\0\xd9\0\0")), (lsn!("0/16B7DC0"), pg_record(false, b"C\0\0\0\0\x04\0\00}k\x01\0\0\0\0\0\n\0\0\xca\xd3\x01g\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xd9\0\0\0\0\0\0\0\xda\0\0")), (lsn!("0/16B7E08"), pg_record(false, b"C\0\0\0\0\x04\0\0x}k\x01\0\0\0\0\0\n\0\0X5\xfeq\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xda\0\0\0\0\0\0\0\xdb\0\0")), (lsn!("0/16B7E50"), pg_record(false, b"C\0\0\0\0\x04\0\0\xc0}k\x01\0\0\0\0\0\n\0\0P\x8c\xd8)\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xdb\0\0\0\0\0\0\0\xdc\0\0")), (lsn!("0/16B7E98"), pg_record(false, b"C\0\0\0\0\x04\0\0\x08~k\x01\0\0\0\0\0\n\0\0-0f\xad\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xdc\0\0\0\0\0\0\0\xdd\0\0")), (lsn!("0/16B7EE0"), pg_record(false, b"C\0\0\0\0\x04\0\0P~k\x01\0\0\0\0\0\n\0\0\xc6/Y\xb8\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xdd\0\0\0\0\0\0\0\xde\0\0")), (lsn!("0/16B7F28"), pg_record(false, b"C\0\0\0\0\x04\0\0\x98~k\x01\0\0\0\0\0\n\0\0\xcc\t\xe8g\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xde\0\0\0\0\0\0\0\xdf\0\0")), (lsn!("0/16B7F70"), pg_record(false, b"C\0\0\0\0\x04\0\0\xe0~k\x01\0\0\0\0\0\n\0\0:\xcfB\xff\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xdf\0\0\0\0\0\0\0\xe0\0\0")), 
(lsn!("0/16B7FB8"), pg_record(false, b"C\0\0\0\0\x04\0\0(\x7fk\x01\0\0\0\0\0\n\0\0\xb7\xdb\x05*\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xe0\0\0\0\0\0\0\0\xe1\0\0")), (lsn!("0/16B8000"), pg_record(false, b"C\0\0\0\0\x04\0\0p\x7fk\x01\0\0\0\0\0\n\0\0\\\xc4:?\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xe1\0\0\0\0\0\0\0\xe2\0\0")), (lsn!("0/16CBD68"), pg_record(false, b"@ \0\0\0\0\0\0\xc0|l\x01\0\0\0\0@\t\0\0\xdf\xb0\x1a`\0\x12\0\0\0 \0\0\x04\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\x01\x80\0\0\0\0\0\0\xff\x05\0\0\0\0\0\0\0\0\0\0\0\0\x18\0\0 \0 \x04 \0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x04\0\0\x01")), ], pg_version: PgMajorVersion::PG14, } } } ================================================ FILE: pageserver/benches/large-layer-map-layernames.txt ================================================ 000000000000000000000000000000000000-000000067F00008000000032090100000000__0000006CF69CD8B0 000000000000000000000000000000000000-000000067F00008000000032090100000000__0000006F949B7C08 000000000000000000000000000000000000-000000067F00008000000032090100000000__00000071F15CF6B0 000000000000000000000000000000000000-000000067F00008000000032090100000000__00000072AEE2BFE0 000000000000000000000000000000000000-000000067F00008000000032090100000000__000000756884A510 000000000000000000000000000000000000-000000067F00008000000032090100000000__00000077B1836CA0 000000000000000000000000000000000000-000000067F00008000000032090100000000__0000007D41715570 000000000000000000000000000000000000-000000067F00008000000032090100000000__0000007F12B83FE8 
000000000000000000000000000000000000-000000067F00008000000032090100000000__00000083D5DE3FD0 000000000000000000000000000000000000-000000067F00008000000032090100000000__000000873B520940 000000000000000000000000000000000000-000000067F00008000000032090100000000__000000890CF51FE0 000000000000000000000000000000000000-000000067F00008000000032090100000000__0000008C71903720 000000000000000000000000000000000000-000000067F00008000000032090100000000__0000008E43487FF0 000000000000000000000000000000000000-000000067F00008000000032090100000000__0000009445A06DC8 000000000000000000000000000000000000-000000067F00008000000032090100000000__00000096187D1FC8 000000000000000000000000000000000000-000000067F00008000000032090100000000__00000096E85806C0 000000000000000000000000000000000000-000000067F00008000000032090100000000__0000009921F3B4A8 000000000000000000000000000000000000-000000067F00008000000032090100000000__0000009B5229DFE8 000000000000000000000000000000000000-000000067F00008000000032090100000000__0000009EBB11FFC0 000000000000000000000000000000000000-000000067F00008000000032090100000000__000000A93DDE5FE0 000000000000000000000000000000000000-000000067F00008000000032090100000000__000000AD3698E000 000000000000000000000000000000000000-000000067F00008000000032090100000000__000000B3AC039FE8 000000000000000000000000000000000000-000000067F00008000000032090100000000__000000B8606C92A0 000000000000000000000000000000000000-000000067F00008000000032090100000000__000000BC59629F98 000000000000000000000000000000000000-000000067F00008000000032090100000000__000000BD25E66810 000000000000000000000000000000000000-000000067F00008000000032090100000000__000000BEF683BFD0 000000000000000000000000000000000000-000000067F00008000000032090100000000__000000C14270A078 000000000000000000000000000000000000-000000067F00008000000032090100000000__000000C3687EDFE8 000000000000000000000000000000000000-000000067F00008000000032090100000000__000000C6C7BD8140 
000000000000000000000000000000000000-000000067F00008000000032090100000000__000000C896B8DFD8 000000000000000000000000000000000000-000000067F00008000000032090100000000__000000CB82C2FF68 000000000000000000000000000000000000-000000067F00008000000032090100000000__000000CD51009FE8 000000000000000000000000000000000000-000000067F00008000000032090100000000__000000CF7E08BFD0 000000000000000000000000000000000000-000000067F00008000000540090100000000__0000006AEF261AF8 000000000000000000000000000000000000-000000067F00008000000560090100000000__0000006DA30DA180 000000000000000000000000000000000000-000000067F00008000000580090100000000__0000006FAFE25518 000000000000000000000000000000000000-000000067F000080000005E0090100000000__00000073AF75E930 000000000000000000000000000000000000-000000067F00008000000620090100000000__00000078B2CB1C68 000000000000000000000000000000000000-000000067F00008000000640090100000000__0000007B9877EF40 000000000000000000000000000000000000-000000067F00008000000680090100000000__00000080E477E868 000000000000000000000000000000000000-000000067F000080000006C0090100000000__00000085BE169568 000000000000000000000000000000000000-000000067F00008000000700090100000000__0000008AF15FEF50 000000000000000000000000000000000000-000000067F00008000000740090100000000__000000902186B1D0 000000000000000000000000000000000000-000000067F00008000000760090100000000__00000092CA5E4EA8 000000000000000000000000000000000000-000000067F000080000007E0090100000000__0000009D34F8D4D8 000000000000000000000000000000000000-000000067F00008000000820090100000000__000000A29F1D8950 000000000000000000000000000000000000-000000067F00008000000860090100000000__000000A434813A68 000000000000000000000000000000000000-000000067F000080000008C0090100000000__000000AAEBE534F8 000000000000000000000000000000000000-000000067F00008000000960090100000000__000000B6C2E92A88 000000000000000000000000000000000000-000000067F00008000000A20090100000000__000000C5745579F0 
000000000000000000000000000000000000-000000067F00008000000A60090100000000__000000CA2C877DC8 000000000000000000000000000000000000-030000000000000000000000000000000002__000000AFB4666000 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF7DC97FD1-000000CF801FC221 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF801FC221-000000CF801FDB61 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF801FDB61-000000CF80201FA1 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF80201FA1-000000CF80203CC1 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF80203CC1-000000CF802067C1 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF802067C1-000000CF80208AE1 000000067F000032AC000040040000000000-000000067F000080000005400C0000007DD8__0000006A5C770149-0000006ACEF98449 000000067F000032AC000040040000000000-000000067F000080000005600C0000008077__0000006CF7781D19-0000006D69B48989 000000067F000032AC000040040000000000-000000067F000080000005800C0000007A49__0000006F95E72491-0000006FA8EDF3B9 000000067F000032AC000040040000000000-000000067F000080000005A00C0000007614__000000723877FF21-00000072A0D7CEA1 000000067F000032AC000040040000000000-000000067F000080000005C00C0000016516__00000072A0D7CEA1-0000007318DDE691 000000067F000032AC000040040000000000-000000067F000080000006000C0000008FB7__00000075687C3009-00000075E915EBC9 000000067F000032AC000040040000000000-000000067F000080000006200C0000009441__0000007805801C41-00000078859FEA11 000000067F000032AC000040040000000000-000000067F000080000006400C0000007987__0000007AA1DF6639-0000007B14D5C521 000000067F000032AC000040040000000000-000000067F000080000006600C0000009381__0000007D41EA8D51-0000007DC21DE569 000000067F000032AC000040040000000000-000000067F000080000006800C0000007D6A__0000007FDCDCE659-000000804F6BFFC1 
000000067F000032AC000040040000000000-000000067F000080000006801400000044E4__00000081AFAF5FD1-0000008215AFE5A9 000000067F000032AC000040040000000000-000000067F000080000006C00C00000090F5__00000084A325AA01-00000085239DFB81 000000067F000032AC000040040000000000-000000067F000080000006E00C00000096C8__000000873C9A2551-00000087BC75E5B1 000000067F000032AC000040040000000000-000000067F000080000007000C000000955C__00000089D6B8EE99-0000008A56BBF739 000000067F000032AC000040040000000000-000000067F000080000007200C000000933D__0000008C72843D41-0000008CF2BFFC89 000000067F000032AC000040040000000000-000000067F000080000007400C00000090E9__0000008F10E3E189-0000008F915DE591 000000067F000032AC000040040000000000-000000067F000080000007600C0000008180__00000091A6DD7A79-0000009228F7FA79 000000067F000032AC000040040000000000-000000067F000080000007800C000000974C__0000009446B52FD1-00000094D67DF4F9 000000067F000032AC000040040000000000-000000067F000080000007A00C000000974B__00000096E85829C9-00000098A7ADFC91 000000067F000032AC000040040000000000-000000067F000080000007C00C0000007EA5__000000997F5D23C9-00000099F1C9FC71 000000067F000032AC000040040000000000-000000067F000080000007E00C00000092CD__0000009C1E8CC879-0000009C9ED3F059 000000067F000032AC000040040000000000-000000067F000080000008000C00000081F6__0000009EBBC72771-000000A154401909 000000067F000032AC000040040000000000-000000067F000080000008200C000000974D__000000A154401909-000000A1E407F839 000000067F000032AC000040040000000000-000000067F0000800000082014000000393C__000000A323C9E001-000000A37A60B1A9 000000067F000032AC000040040000000000-000000067F000080000008600C0000009747__000000A37A60B1A9-000000A3CA47ECA9 000000067F000032AC000040040000000000-000000067F000080000008801C0000009703__000000A5A081B661-000000A6503DE919 000000067F000032AC000040040000000000-000000067F000080000008801C00000CF6B0__000000A6F001F909-000000A91D97FD49 000000067F000032AC000040040000000000-000000067F000080000008C00C0000002330__000000A98AB7EE49-000000AA2597E9A1 
000000067F000032AC000040040000000000-000000067F000080000008E00C00000077B3__000000AB6533BFD9-000000ABF63DF511 000000067F000032AC000040040000000000-000000067F000080000008E02A000000529F__000000AF5D587FE1-000000AFB4666001 000000067F000032AC000040040000000000-000000067F000080000009004000000047E0__000000B18495C001-000000B1FA75F501 000000067F000032AC000040040000000000-000000067F00008000000920140000005289__000000B3AB3B7FC9-000000B4208FF3D1 000000067F000032AC000040040000000000-000000067F000080000009400C000008DEA4__000000B4E047E5A9-000000B5CED8CF79 000000067F000032AC000040040000000000-000000067F000080000009600C000000974F__000000B5CED8CF79-000000B63EADE5B9 000000067F000032AC000040040000000000-000000067F000080000009600C0000055A74__000000B808718889-000000B8606C92A1 000000067F000032AC000040040000000000-000000067F000080000009800C0000009748__000000B8606C92A1-000000B8E03BF0B9 000000067F000032AC000040040000000000-000000067F000080000009800C000010EC71__000000BA1FC3FB39-000000BA9685E7C1 000000067F000032AC000040040000000000-000000067F000080000009A00C0000071F6F__000000BCEF79BE91-000000BD263A5849 000000067F000032AC000040040000000000-000000067F000080000009C00C0000009749__000000BD263A5849-000000BDA607F261 000000067F000032AC000040040000000000-000000067F000080000009E00C0000004916__000000BEF5F47FD1-000000BF48FFEB11 000000067F000032AC000040040000000000-000000067F00008000000A000C0000008EF9__000000C19744E959-000000C217F3F379 000000067F000032AC000040040000000000-000000067F00008000000A200C0000009748__000000C430961E71-000000C4C05DDB29 000000067F000032AC000040040000000000-000000067F00008000000A400C0000009743__000000C6C87B6329-000000C74849FAE1 000000067F000032AC000040040000000000-000000067F00008000000A600C0000009746__000000C90726D0D9-000000C986F5F0D9 000000067F000032AC000040040000000000-000000067F00008000000A600C000007A149__000000CB40C16489-000000CB82C37859 000000067F000032AC000040040000000000-000000067F00008000000A800C0000009748__000000CB82C37859-000000CC11F5EDC9 
000000067F000032AC000040040000000000-000000067F00008000000A800F0100000003__000000CD51344F89-000000CDCC7BF889 000000067F00008000000000000000000001-000000067F000080000005400C000004B479__0000006C98B77D29-0000006CF7781D19 000000067F00008000000000000000000001-000000067F000080000005400C0000104BE4__0000006C1E7C73C1-0000006C98B77D29 000000067F00008000000000000000000001-000000067F000080000005600C0000048643__0000006F3370DD59-0000006F95E72491 000000067F00008000000000000000000001-000000067F000080000005600C0000100001__0000006EB935F989-0000006F3370DD59 000000067F00008000000000000000000001-000000067F000080000005800C000005CF06__00000071F21624D1-000000723877FF21 000000067F00008000000000000000000001-000000067F000080000005800C000009D78D__000000716A103FC9-00000071F21624D1 000000067F00008000000000000000000001-000000067F000080000005800C00000CDE2D__00000070E8761431-000000716A103FC9 000000067F00008000000000000000000001-000000067F000080000005E00C00000385D9__0000007318DDE691-0000007497B01FF9 000000067F00008000000000000000000001-000000067F000080000005E00C0000050175__000000751253A4C1-00000075687C3009 000000067F00008000000000000000000001-000000067F000080000005E00C00000AF576__0000007497B01FF9-000000751253A4C1 000000067F00008000000000000000000001-000000067F000080000006000C0000051A02__00000077B2AD0F91-0000007805801C41 000000067F00008000000000000000000001-000000067F000080000006000C00000C3C38__00000077391A8001-00000077B2AD0F91 000000067F00008000000000000000000001-000000067F000080000006000C00000C56C1__00000076A8CDE8F9-00000077391A8001 000000067F00008000000000000000000001-000000067F000080000006200C000004811C__0000007A3F679FA1-0000007AA1DF6639 000000067F00008000000000000000000001-000000067F000080000006200C0000107883__00000079C527F0D9-0000007A3F679FA1 000000067F00008000000000000000000001-000000067F000080000006400C000004B4C9__0000007B14D5C521-0000007C73B53FC9 000000067F00008000000000000000000001-000000067F000080000006400C000005258F__0000007CEE5A0B91-0000007D41EA8D51 
000000067F00008000000000000000000001-000000067F000080000006400C00000A887C__0000007C73B53FC9-0000007CEE5A0B91 000000067F00008000000000000000000001-000000067F000080000006600C0000049742__0000007F7BE4E6F1-0000007FDCDCE659 000000067F00008000000000000000000001-000000067F000080000006600C00000BC29F__0000007E71DBF8F9-0000007F11E4BFE9 000000067F00008000000000000000000001-000000067F000080000006600C0000111C82__0000007F11E4BFE9-0000007F7BE4E6F1 000000067F00008000000000000000000001-000000067F000080000006800C00000A8D4C__00000080EF2FF5B9-00000081AFAF5FD1 000000067F00008000000000000000000001-000000067F000080000006A00C0000051984__000000844F1A6789-00000084A325AA01 000000067F00008000000000000000000001-000000067F000080000006A00C00000703EC__00000082B573F579-00000083D5901FD9 000000067F00008000000000000000000001-000000067F000080000006A00C00000C4CC8__00000083D5901FD9-000000844F1A6789 000000067F00008000000000000000000001-000000067F000080000006C00C0000055EA3__00000086ED29E361-000000873C9A2551 000000067F00008000000000000000000001-000000067F000080000006C00C00000BC102__00000085D35BF439-0000008673817FC9 000000067F00008000000000000000000001-000000067F000080000006C00C00000BFB6E__0000008673817FC9-00000086ED29E361 000000067F00008000000000000000000001-000000067F000080000006E00C0000054244__0000008985FD3611-00000089D6B8EE99 000000067F00008000000000000000000001-000000067F000080000006E00C00000B6F42__000000890C5B6001-0000008985FD3611 000000067F00008000000000000000000001-000000067F000080000006E00C00000C5883__000000887C2DFE59-000000890C5B6001 000000067F00008000000000000000000001-000000067F000080000007000C0000053C20__0000008C2045B721-0000008C72843D41 000000067F00008000000000000000000001-000000067F000080000007000C00000B2B06__0000008AF67FEC19-0000008BA6803FC9 000000067F00008000000000000000000001-000000067F000080000007000C00000BF157__0000008BA6803FC9-0000008C2045B721 000000067F00008000000000000000000001-000000067F000080000007200C0000051312__0000008EBC4827C1-0000008F10E3E189 
000000067F00008000000000000000000001-000000067F000080000007200C00000BA086__0000008E42A19FD1-0000008EBC4827C1 000000067F00008000000000000000000001-000000067F000080000007200C00000C58B0__0000008DB277FA49-0000008E42A19FD1 000000067F00008000000000000000000001-000000067F000080000007400C000004DF08__000000914B2393B1-00000091A6DD7A79 000000067F00008000000000000000000001-000000067F000080000007400C00000FCCA8__00000090D0E5EA29-000000914B2393B1 000000067F00008000000000000000000001-000000067F000080000007600C00000544BA__0000009228F7FA79-00000093786F8001 000000067F00008000000000000000000001-000000067F000080000007600C0000061028__0000009402435A49-0000009446B52FD1 000000067F00008000000000000000000001-000000067F000080000007600C000008C52F__00000093786F8001-0000009402435A49 000000067F00008000000000000000000001-000000067F000080000007800C000006D445__00000096AEF27399-00000096E85829C9 000000067F00008000000000000000000001-000000067F000080000007800C000007B8BC__00000096193A8001-00000096AEF27399 000000067F00008000000000000000000001-000000067F000080000007800C00000CD6B6__000000959635F2A9-00000096193A8001 000000067F00008000000000000000000001-000000067F000080000007A00C000004B9A5__0000009921E47AA1-000000997F5D23C9 000000067F00008000000000000000000001-000000067F000080000007A00C00000F720F__00000098A7ADFC91-0000009921E47AA1 000000067F00008000000000000000000001-000000067F000080000007C00C0000052A9D__0000009BCB4E4461-0000009C1E8CC879 000000067F00008000000000000000000001-000000067F000080000007C00C00000A9244__0000009A918DF181-0000009B51A8BBB9 000000067F00008000000000000000000001-000000067F000080000007C00C00000BA258__0000009B51A8BBB9-0000009BCB4E4461 000000067F00008000000000000000000001-000000067F000080000007E00C0000061ADC__0000009E781A9731-0000009EBBC72771 000000067F00008000000000000000000001-000000067F000080000007E00C0000093E3A__0000009DEEE6BFF9-0000009E781A9731 000000067F00008000000000000000000001-000000067F000080000007E00C00000B2704__0000009D3E97E549-0000009DEEE6BFF9 
000000067F00008000000000000000000001-000000067F000080000008200C000005D8FE__000000A1E407F839-000000A323C9E001 000000067F00008000000000000000000001-000000067F000080000008600C000010ECC4__000000A539BDE561-000000A5A081B661 000000067F00008000000000000000000001-000000067F000080000008A00C0000104A0C__000000A91D97FD49-000000A98AB7EE49 000000067F00008000000000000000000001-000000067F000080000008C00C000005DA8C__000000AA2597E9A1-000000AB6533BFD9 000000067F00008000000000000000000001-000000067F000080000008E00C00000BC018__000000AC9601EA19-000000AD36393FE9 000000067F00008000000000000000000001-000000067F000080000008E0140000003E33__000000AD36393FE9-000000ADB047EAB9 000000067F00008000000000000000000001-000000067F000080000008E022000008E3D1__000000AE6FFFE799-000000AF5D587FE1 000000067F00008000000000000000000001-000000067F000080000009003800000C5213__000000B0F3EDEAC9-000000B18495C001 000000067F00008000000000000000000001-000000067F000080000009200C000009567A__000000B2CA27F641-000000B3AB3B7FC9 000000067F00008000000000000000000001-000000067F000080000009600C00000A93FD__000000B6DE71F5F9-000000B79E68FFF9 000000067F00008000000000000000000001-000000067F000080000009600C020000000B__000000B79E68FFF9-000000B808718889 000000067F00008000000000000000000001-000000067F000080000009A00C00000794DC__000000BC596B5D59-000000BCEF79BE91 000000067F00008000000000000000000001-000000067F000080000009A00C00000D6C06__000000BBE607E8F1-000000BC596B5D59 000000067F00008000000000000000000001-000000067F000080000009C00C00000B2921__000000BE45CBFBB9-000000BEF5F47FD1 000000067F00008000000000000000000001-000000067F000080000009E00C0000050E55__000000C1426D92E1-000000C19744E959 000000067F00008000000000000000000001-000000067F000080000009E00C000009FB21__000000BFF8BDFEE9-000000C0C8CA5FF1 000000067F00008000000000000000000001-000000067F000080000009E00C00000C0C74__000000C0C8CA5FF1-000000C1426D92E1 000000067F00008000000000000000000001-000000067F00008000000A000C000005635B__000000C3E17E01A1-000000C430961E71 
000000067F00008000000000000000000001-000000067F00008000000A000C00000B8B52__000000C367E48001-000000C3E17E01A1 000000067F00008000000000000000000001-000000067F00008000000A000C00000BC072__000000C2C7B1ECC1-000000C367E48001 000000067F00008000000000000000000001-000000067F00008000000A200C00000677D8__000000C689AF4AC1-000000C6C87B6329 000000067F00008000000000000000000001-000000067F00008000000A200C00000933F0__000000C600A8FFF9-000000C689AF4AC1 000000067F00008000000000000000000001-000000067F00008000000A200C00000BBC1F__000000C56021EB29-000000C600A8FFF9 000000067F00008000000000000000000001-000000067F00008000000A400C00000C4AE6__000000C80801E859-000000C8993EBFF9 000000067F00008000000000000000000001-000000067F00008000000A400C0000107F8F__000000C8993EBFF9-000000C90726D0D9 000000067F00008000000000000000000001-000000067F00008000000A600C0000054BFB__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000000000000000001-000000067F00008000000A600C00001117CB__000000CAD5D7FFF1-000000CB40C16489 000000067F00008000000000000000000001-000000067F00008000000A800C00000BCB46__000000CCB1B9E181-000000CD51344F89 000000067F00008000000000000000000001-000000067F00008000000AA00C0000078E97__000000CE6C3FED31-000000CF7DC97FD1 000000067F00008000000004E10100000002-000000067F000080000005400C000004BA9C__0000006ACEF98449-0000006C1E7C73C1 000000067F00008000000004E10100000002-000000067F000080000005800C0000071854__0000007048B1EC09-00000070E8761431 000000067F00008000000004E10200000000-000000067F000080000005600C000004BA9D__0000006D69B48989-0000006EB935F989 000000067F00008000000004EB0100000002-000000067F00008000000A400C00000551FC__000000C74849FAE1-000000C80801E859 000000067F000080000005200C000006C000-030000000000000000000000000000000002__000000687B67FC58 000000067F00008000000520140000028A69-030000000000000000000000000000000002__0000006981B5FDC9-00000069FBEEB099 000000067F0000800000052014000002C260-030000000000000000000000000000000002__00000069FBEEB099-0000006A5C770149 
000000067F000080000005400C0000000000-000000067F000080000005400C0000004000__0000006CF69CD8B0 000000067F000080000005400C0000004000-000000067F000080000005400C0000008000__0000006CF69CD8B0 000000067F000080000005400C0000008000-000000067F000080000005400C000000C000__0000006CF69CD8B0 000000067F000080000005400C000000C000-000000067F000080000005400C0000010000__0000006CF69CD8B0 000000067F000080000005400C0000010000-000000067F000080000005400C0000014000__0000006CF69CD8B0 000000067F000080000005400C0000014000-000000067F000080000005400C0000018000__0000006CF69CD8B0 000000067F000080000005400C0000018000-000000067F000080000005400C000001C000__0000006CF69CD8B0 000000067F000080000005400C000001C000-000000067F000080000005400C0000020000__0000006CF69CD8B0 000000067F000080000005400C0000020000-000000067F000080000005400C0000024000__0000006CF69CD8B0 000000067F000080000005400C0000024000-000000067F000080000005400C0000028000__0000006CF69CD8B0 000000067F000080000005400C0000028000-000000067F000080000005400C000002C000__0000006CF69CD8B0 000000067F000080000005400C000002C000-000000067F000080000005400C0000030000__0000006CF69CD8B0 000000067F000080000005400C0000030000-000000067F000080000005400C0000034000__0000006CF69CD8B0 000000067F000080000005400C0000034000-000000067F000080000005400C0000038000__0000006CF69CD8B0 000000067F000080000005400C0000038000-000000067F000080000005400C000003C000__0000006CF69CD8B0 000000067F000080000005400C000003C000-000000067F000080000005400C0000040000__0000006CF69CD8B0 000000067F000080000005400C0000040000-000000067F000080000005400C0000044000__0000006CF69CD8B0 000000067F000080000005400C0000044000-000000067F000080000005400C0000048000__0000006CF69CD8B0 000000067F000080000005400C0000048000-000000067F000080000005400C000004C000__0000006CF69CD8B0 000000067F000080000005400C000004B483-000000067F000080000005400C00000967AD__0000006C98B77D29-0000006CF7781D19 000000067F000080000005400C000004C000-000000067F000080000005400C0000050000__0000006CF69CD8B0 
000000067F000080000005400C0000050000-000000067F000080000005400C0000054000__0000006CF69CD8B0 000000067F000080000005400C0000054000-000000067F000080000005400C0000058000__0000006CF69CD8B0 000000067F000080000005400C0000054000-030000000000000000000000000000000002__0000006AEF261AF8 000000067F000080000005400C0000058000-000000067F000080000005400C000005C000__0000006CF69CD8B0 000000067F000080000005400C000005C000-000000067F000080000005400C0000060000__0000006CF69CD8B0 000000067F000080000005400C0000060000-000000067F000080000005400C0000064000__0000006CF69CD8B0 000000067F000080000005400C0000064000-000000067F000080000005400C0000068000__0000006CF69CD8B0 000000067F000080000005400C0000068000-000000067F000080000005400C000006C000__0000006CF69CD8B0 000000067F000080000005400C000006C000-000000067F000080000005400C0000070000__0000006CF69CD8B0 000000067F000080000005400C0000070000-000000067F000080000005400C0000074000__0000006CF69CD8B0 000000067F000080000005400C0000074000-000000067F000080000005400C0000078000__0000006CF69CD8B0 000000067F000080000005400C0000078000-000000067F000080000005400C000007C000__0000006CF69CD8B0 000000067F000080000005400C000007C000-000000067F000080000005400C0000080000__0000006CF69CD8B0 000000067F000080000005400C0000080000-000000067F000080000005400C0000084000__0000006CF69CD8B0 000000067F000080000005400C0000084000-000000067F000080000005400C0000088000__0000006CF69CD8B0 000000067F000080000005400C0000088000-000000067F000080000005400C000008C000__0000006CF69CD8B0 000000067F000080000005400C000008C000-000000067F000080000005400C0000090000__0000006CF69CD8B0 000000067F000080000005400C0000090000-000000067F000080000005400C0000094000__0000006CF69CD8B0 000000067F000080000005400C0000094000-000000067F000080000005400C0000098000__0000006CF69CD8B0 000000067F000080000005400C00000967BA-000000067F000080000005400C00000E2771__0000006C98B77D29-0000006CF7781D19 000000067F000080000005400C0000098000-000000067F000080000005400C000009C000__0000006CF69CD8B0 
000000067F000080000005400C000009C000-000000067F000080000005400C00000A0000__0000006CF69CD8B0 000000067F000080000005400C00000A0000-000000067F000080000005400C00000A4000__0000006CF69CD8B0 000000067F000080000005400C00000A4000-000000067F000080000005400C00000A8000__0000006CF69CD8B0 000000067F000080000005400C00000A8000-000000067F000080000005400C00000AC000__0000006CF69CD8B0 000000067F000080000005400C00000AC000-000000067F000080000005400C00000B0000__0000006CF69CD8B0 000000067F000080000005400C00000B0000-000000067F000080000005400C00000B4000__0000006CF69CD8B0 000000067F000080000005400C00000B4000-000000067F000080000005400C00000B8000__0000006CF69CD8B0 000000067F000080000005400C00000B8000-000000067F000080000005400C00000BC000__0000006CF69CD8B0 000000067F000080000005400C00000BC000-000000067F000080000005400C00000C0000__0000006CF69CD8B0 000000067F000080000005400C00000C0000-000000067F000080000005400C00000C4000__0000006CF69CD8B0 000000067F000080000005400C00000C4000-000000067F000080000005400C00000C8000__0000006CF69CD8B0 000000067F000080000005400C00000C8000-000000067F000080000005400C00000CC000__0000006CF69CD8B0 000000067F000080000005400C00000CC000-000000067F000080000005400C00000D0000__0000006CF69CD8B0 000000067F000080000005400C00000D0000-000000067F000080000005400C00000D4000__0000006CF69CD8B0 000000067F000080000005400C00000D4000-000000067F000080000005400C00000D8000__0000006CF69CD8B0 000000067F000080000005400C00000D8000-000000067F000080000005400C00000DC000__0000006CF69CD8B0 000000067F000080000005400C00000DC000-000000067F000080000005400C00000E0000__0000006CF69CD8B0 000000067F000080000005400C00000E0000-000000067F000080000005400C00000E4000__0000006CF69CD8B0 000000067F000080000005400C00000E277B-000000067F00008000000540140000005B2E__0000006C98B77D29-0000006CF7781D19 000000067F000080000005400C00000E4000-000000067F000080000005400C00000E8000__0000006CF69CD8B0 000000067F000080000005400C00000E8000-000000067F000080000005400C00000EC000__0000006CF69CD8B0 
000000067F000080000005400C00000EC000-000000067F000080000005400C00000F0000__0000006CF69CD8B0 000000067F000080000005400C00000F0000-000000067F000080000005400C00000F4000__0000006CF69CD8B0 000000067F000080000005400C00000F4000-000000067F000080000005400C00000F8000__0000006CF69CD8B0 000000067F000080000005400C00000F8000-000000067F000080000005400C00000FC000__0000006CF69CD8B0 000000067F000080000005400C00000FC000-000000067F000080000005400C0000100000__0000006CF69CD8B0 000000067F000080000005400C0000100000-000000067F000080000005400C0000104000__0000006CF69CD8B0 000000067F000080000005400C0000104000-000000067F000080000005400C0000108000__0000006CF69CD8B0 000000067F000080000005400C0000108000-000000067F000080000005400C000010C000__0000006CF69CD8B0 000000067F000080000005400C000010C000-000000067F000080000005400C0000110000__0000006CF69CD8B0 000000067F000080000005400C0000110000-000000067F00008000000540120100000000__0000006CF69CD8B0 000000067F000080000005400C0100000000-000000067F00008000000540140000004760__0000006C1E7C73C1-0000006C98B77D29 000000067F00008000000540140000004760-000000067F0000800000054014000000BB51__0000006C1E7C73C1-0000006C98B77D29 000000067F00008000000540140000005B2F-000000067F0000800000054014000001A04C__0000006C98B77D29-0000006CF7781D19 000000067F0000800000054014000000BB51-000000067F00008000000540140000012EFA__0000006C1E7C73C1-0000006C98B77D29 000000067F00008000000540140000012EFA-000000067F0000800000054014000001A2E5__0000006C1E7C73C1-0000006C98B77D29 000000067F0000800000054014000001A04E-000000067F0000800000054016000000022B__0000006C98B77D29-0000006CF7781D19 000000067F0000800000054014000001A2E5-000000067F000080000005401400000216D5__0000006C1E7C73C1-0000006C98B77D29 000000067F000080000005401400000216D5-000000067F00008000000540140000028AD9__0000006C1E7C73C1-0000006C98B77D29 000000067F00008000000540140000028AD9-030000000000000000000000000000000002__0000006C1E7C73C1-0000006C98B77D29 
000000067F0000800000054016000000022B-030000000000000000000000000000000002__0000006C98B77D29-0000006CF7781D19 000000067F000080000005600C0000000000-000000067F000080000005600C0000004000__0000006DA30DA180 000000067F000080000005600C0000000000-000000067F000080000005600C0000004000__0000006F949B7C08 000000067F000080000005600C0000004000-000000067F000080000005600C0000008000__0000006DA30DA180 000000067F000080000005600C0000004000-000000067F000080000005600C0000008000__0000006F949B7C08 000000067F000080000005600C0000008000-000000067F000080000005600C000000C000__0000006DA30DA180 000000067F000080000005600C0000008000-000000067F000080000005600C000000C000__0000006F949B7C08 000000067F000080000005600C0000008077-000000067F000080000005600C00000117CE__0000006CF7781D19-0000006D69B48989 000000067F000080000005600C000000C000-000000067F000080000005600C0000010000__0000006DA30DA180 000000067F000080000005600C000000C000-000000067F000080000005600C0000010000__0000006F949B7C08 000000067F000080000005600C0000010000-000000067F000080000005600C0000014000__0000006DA30DA180 000000067F000080000005600C0000010000-000000067F000080000005600C0000014000__0000006F949B7C08 000000067F000080000005600C00000117CE-000000067F000080000005600C000001AF0A__0000006CF7781D19-0000006D69B48989 000000067F000080000005600C0000014000-000000067F000080000005600C0000018000__0000006DA30DA180 000000067F000080000005600C0000014000-000000067F000080000005600C0000018000__0000006F949B7C08 000000067F000080000005600C0000018000-000000067F000080000005600C000001C000__0000006DA30DA180 000000067F000080000005600C0000018000-000000067F000080000005600C000001C000__0000006F949B7C08 000000067F000080000005600C000001AF0A-000000067F000080000005600C0000024670__0000006CF7781D19-0000006D69B48989 000000067F000080000005600C000001C000-000000067F000080000005600C0000020000__0000006DA30DA180 000000067F000080000005600C000001C000-000000067F000080000005600C0000020000__0000006F949B7C08 000000067F000080000005600C0000020000-000000067F000080000005600C0000024000__0000006DA30DA180 
000000067F000080000005600C0000020000-000000067F000080000005600C0000024000__0000006F949B7C08 000000067F000080000005600C0000024000-000000067F000080000005600C0000028000__0000006DA30DA180 000000067F000080000005600C0000024000-000000067F000080000005600C0000028000__0000006F949B7C08 000000067F000080000005600C0000024670-000000067F000080000005600C000002DDD6__0000006CF7781D19-0000006D69B48989 000000067F000080000005600C0000028000-000000067F000080000005600C000002C000__0000006DA30DA180 000000067F000080000005600C0000028000-000000067F000080000005600C000002C000__0000006F949B7C08 000000067F000080000005600C000002C000-000000067F000080000005600C0000030000__0000006DA30DA180 000000067F000080000005600C000002C000-000000067F000080000005600C0000030000__0000006F949B7C08 000000067F000080000005600C000002DDD6-000000067F000080000005600C000003752A__0000006CF7781D19-0000006D69B48989 000000067F000080000005600C0000030000-000000067F000080000005600C0000034000__0000006DA30DA180 000000067F000080000005600C0000030000-000000067F000080000005600C0000034000__0000006F949B7C08 000000067F000080000005600C0000034000-000000067F000080000005600C0000038000__0000006DA30DA180 000000067F000080000005600C0000034000-000000067F000080000005600C0000038000__0000006F949B7C08 000000067F000080000005600C000003752A-000000067F000080000005600C0000040C90__0000006CF7781D19-0000006D69B48989 000000067F000080000005600C0000038000-000000067F000080000005600C000003C000__0000006DA30DA180 000000067F000080000005600C0000038000-000000067F000080000005600C000003C000__0000006F949B7C08 000000067F000080000005600C000003C000-000000067F000080000005600C0000040000__0000006DA30DA180 000000067F000080000005600C000003C000-000000067F000080000005600C0000040000__0000006F949B7C08 000000067F000080000005600C0000040000-000000067F000080000005600C0000044000__0000006DA30DA180 000000067F000080000005600C0000040000-000000067F000080000005600C0000044000__0000006F949B7C08 000000067F000080000005600C0000040C90-030000000000000000000000000000000002__0000006CF7781D19-0000006D69B48989 
000000067F000080000005600C0000044000-000000067F000080000005600C0000048000__0000006DA30DA180 000000067F000080000005600C0000044000-000000067F000080000005600C0000048000__0000006F949B7C08 000000067F000080000005600C0000048000-000000067F000080000005600C000004C000__0000006DA30DA180 000000067F000080000005600C0000048000-000000067F000080000005600C000004C000__0000006F949B7C08 000000067F000080000005600C0000048643-000000067F000080000005600C00000907F3__0000006F3370DD59-0000006F95E72491 000000067F000080000005600C000004BA9D-000000067F000080000005600C00000551D2__0000006D69B48989-0000006EB935F989 000000067F000080000005600C000004C000-000000067F000080000005600C0000050000__0000006DA30DA180 000000067F000080000005600C000004C000-000000067F000080000005600C0000050000__0000006F949B7C08 000000067F000080000005600C0000050000-000000067F000080000005600C0000054000__0000006DA30DA180 000000067F000080000005600C0000050000-000000067F000080000005600C0000054000__0000006F949B7C08 000000067F000080000005600C0000054000-000000067F000080000005600C0000058000__0000006DA30DA180 000000067F000080000005600C0000054000-000000067F000080000005600C0000058000__0000006F949B7C08 000000067F000080000005600C00000551D2-000000067F000080000005600C000005E90B__0000006D69B48989-0000006EB935F989 000000067F000080000005600C0000058000-000000067F000080000005600C000005C000__0000006DA30DA180 000000067F000080000005600C0000058000-000000067F000080000005600C000005C000__0000006F949B7C08 000000067F000080000005600C000005C000-000000067F000080000005600C0000060000__0000006DA30DA180 000000067F000080000005600C000005C000-000000067F000080000005600C0000060000__0000006F949B7C08 000000067F000080000005600C000005E90B-000000067F000080000005600C000006802B__0000006D69B48989-0000006EB935F989 000000067F000080000005600C0000060000-000000067F000080000005600C0000064000__0000006DA30DA180 000000067F000080000005600C0000060000-000000067F000080000005600C0000064000__0000006F949B7C08 000000067F000080000005600C0000064000-000000067F000080000005600C0000068000__0000006F949B7C08 
000000067F000080000005600C0000064000-030000000000000000000000000000000002__0000006DA30DA180 000000067F000080000005600C0000068000-000000067F000080000005600C000006C000__0000006F949B7C08 000000067F000080000005600C000006802B-000000067F000080000005600C0000071782__0000006D69B48989-0000006EB935F989 000000067F000080000005600C000006C000-000000067F000080000005600C0000070000__0000006F949B7C08 000000067F000080000005600C0000070000-000000067F000080000005600C0000074000__0000006F949B7C08 000000067F000080000005600C0000071782-000000067F000080000005600C000007AEE8__0000006D69B48989-0000006EB935F989 000000067F000080000005600C0000074000-000000067F000080000005600C0000078000__0000006F949B7C08 000000067F000080000005600C0000078000-000000067F000080000005600C000007C000__0000006F949B7C08 000000067F000080000005600C000007AEE8-000000067F000080000005600C000008460B__0000006D69B48989-0000006EB935F989 000000067F000080000005600C000007C000-000000067F000080000005600C0000080000__0000006F949B7C08 000000067F000080000005600C0000080000-000000067F000080000005600C0000084000__0000006F949B7C08 000000067F000080000005600C0000084000-000000067F000080000005600C0000088000__0000006F949B7C08 000000067F000080000005600C000008460B-000000067F000080000005600C000008DD71__0000006D69B48989-0000006EB935F989 000000067F000080000005600C0000088000-000000067F000080000005600C000008C000__0000006F949B7C08 000000067F000080000005600C000008C000-000000067F000080000005600C0000090000__0000006F949B7C08 000000067F000080000005600C000008DD71-000000067F000080000005600C00000974D7__0000006D69B48989-0000006EB935F989 000000067F000080000005600C0000090000-000000067F000080000005600C0000094000__0000006F949B7C08 000000067F000080000005600C00000907F5-000000067F000080000005600C00000D90E0__0000006F3370DD59-0000006F95E72491 000000067F000080000005600C0000094000-000000067F000080000005600C0000098000__0000006F949B7C08 000000067F000080000005600C00000974D7-000000067F000080000005600C00000A0C0B__0000006D69B48989-0000006EB935F989 
000000067F000080000005600C0000098000-000000067F000080000005600C000009C000__0000006F949B7C08 000000067F000080000005600C000009C000-000000067F000080000005600C00000A0000__0000006F949B7C08 000000067F000080000005600C00000A0000-000000067F000080000005600C00000A4000__0000006F949B7C08 000000067F000080000005600C00000A0C0B-000000067F000080000005600C00000AA371__0000006D69B48989-0000006EB935F989 000000067F000080000005600C00000A4000-000000067F000080000005600C00000A8000__0000006F949B7C08 000000067F000080000005600C00000A8000-000000067F000080000005600C00000AC000__0000006F949B7C08 000000067F000080000005600C00000AA371-000000067F000080000005600C00000B3AD7__0000006D69B48989-0000006EB935F989 000000067F000080000005600C00000AC000-000000067F000080000005600C00000B0000__0000006F949B7C08 000000067F000080000005600C00000B0000-000000067F000080000005600C00000B4000__0000006F949B7C08 000000067F000080000005600C00000B3AD7-000000067F000080000005600C00000BD20B__0000006D69B48989-0000006EB935F989 000000067F000080000005600C00000B4000-000000067F000080000005600C00000B8000__0000006F949B7C08 000000067F000080000005600C00000B8000-000000067F000080000005600C00000BC000__0000006F949B7C08 000000067F000080000005600C00000BC000-000000067F000080000005600C00000C0000__0000006F949B7C08 000000067F000080000005600C00000BD20B-000000067F000080000005600C00000C6932__0000006D69B48989-0000006EB935F989 000000067F000080000005600C00000C0000-000000067F000080000005600C00000C4000__0000006F949B7C08 000000067F000080000005600C00000C4000-000000067F000080000005600C00000C8000__0000006F949B7C08 000000067F000080000005600C00000C6932-000000067F000080000005600C00000D0098__0000006D69B48989-0000006EB935F989 000000067F000080000005600C00000C8000-000000067F000080000005600C00000CC000__0000006F949B7C08 000000067F000080000005600C00000CC000-000000067F000080000005600C00000D0000__0000006F949B7C08 000000067F000080000005600C00000D0000-000000067F000080000005600C00000D4000__0000006F949B7C08 
000000067F000080000005600C00000D0098-000000067F000080000005600C00000D97FE__0000006D69B48989-0000006EB935F989 000000067F000080000005600C00000D4000-000000067F000080000005600C00000D8000__0000006F949B7C08 000000067F000080000005600C00000D8000-000000067F000080000005600C00000DC000__0000006F949B7C08 000000067F000080000005600C00000D90F8-000000067F00008000000560140000002A9A__0000006F3370DD59-0000006F95E72491 000000067F000080000005600C00000D97FE-000000067F000080000005600C00000E2F0B__0000006D69B48989-0000006EB935F989 000000067F000080000005600C00000DC000-000000067F000080000005600C00000E0000__0000006F949B7C08 000000067F000080000005600C00000E0000-000000067F000080000005600C00000E4000__0000006F949B7C08 000000067F000080000005600C00000E2F0B-000000067F000080000005600C00000EC671__0000006D69B48989-0000006EB935F989 000000067F000080000005600C00000E4000-000000067F000080000005600C00000E8000__0000006F949B7C08 000000067F000080000005600C00000E8000-000000067F000080000005600C00000EC000__0000006F949B7C08 000000067F000080000005600C00000EC000-000000067F000080000005600C00000F0000__0000006F949B7C08 000000067F000080000005600C00000EC671-000000067F000080000005600C00000F5D9F__0000006D69B48989-0000006EB935F989 000000067F000080000005600C00000F0000-000000067F000080000005600C00000F4000__0000006F949B7C08 000000067F000080000005600C00000F4000-000000067F000080000005600C00000F8000__0000006F949B7C08 000000067F000080000005600C00000F5D9F-000000067F000080000005600C00000FF505__0000006D69B48989-0000006EB935F989 000000067F000080000005600C00000F8000-000000067F000080000005600C00000FC000__0000006F949B7C08 000000067F000080000005600C00000FC000-000000067F000080000005600C0000100000__0000006F949B7C08 000000067F000080000005600C00000FF505-000000067F000080000005600C0000108C10__0000006D69B48989-0000006EB935F989 000000067F000080000005600C0000100000-000000067F000080000005600C0000104000__0000006F949B7C08 000000067F000080000005600C0000100001-000000067F000080000005600C0000111BF7__0000006EB935F989-0000006F3370DD59 
000000067F000080000005600C0000104000-000000067F000080000005600C0000108000__0000006F949B7C08 000000067F000080000005600C0000108000-000000067F000080000005600C000010C000__0000006F949B7C08 000000067F000080000005600C0000108C10-000000067F000080000005600C0100000000__0000006D69B48989-0000006EB935F989 000000067F000080000005600C000010C000-000000067F000080000005600C0000110000__0000006F949B7C08 000000067F000080000005600C0000110000-000000067F00008000000560120100000000__0000006F949B7C08 000000067F000080000005600C0000111BF7-000000067F0000800000056014000000451D__0000006EB935F989-0000006F3370DD59 000000067F00008000000560140000002A9A-000000067F00008000000560140000016143__0000006F3370DD59-0000006F95E72491 000000067F0000800000056014000000451D-000000067F0000800000056014000000B9A7__0000006EB935F989-0000006F3370DD59 000000067F0000800000056014000000B9A7-000000067F00008000000560140000012DE3__0000006EB935F989-0000006F3370DD59 000000067F00008000000560140000012DE3-000000067F0000800000056014000001A213__0000006EB935F989-0000006F3370DD59 000000067F00008000000560140000016143-000000067F00008000000560140000029CE0__0000006F3370DD59-0000006F95E72491 000000067F0000800000056014000001A213-000000067F00008000000560140000021666__0000006EB935F989-0000006F3370DD59 000000067F00008000000560140000021666-000000067F00008000000560140000028A7C__0000006EB935F989-0000006F3370DD59 000000067F00008000000560140000028A7C-030000000000000000000000000000000002__0000006EB935F989-0000006F3370DD59 000000067F00008000000560140000029CE2-030000000000000000000000000000000002__0000006F3370DD59-0000006F95E72491 000000067F000080000005800C0000000000-000000067F000080000005800C0000004000__0000006FAFE25518 000000067F000080000005800C0000000000-000000067F000080000005800C0000004000__00000071F15CF6B0 000000067F000080000005800C0000004000-000000067F000080000005800C0000008000__0000006FAFE25518 000000067F000080000005800C0000004000-000000067F000080000005800C0000008000__00000071F15CF6B0 
000000067F000080000005800C0000007A49-030000000000000000000000000000000002__0000006F95E72491-0000006FA8EDF3B9 000000067F000080000005800C0000008000-000000067F000080000005800C000000C000__0000006FAFE25518 000000067F000080000005800C0000008000-000000067F000080000005800C000000C000__0000007168C9DFF8 000000067F000080000005800C0000008000-000000067F000080000005800C000000C000__00000072377CDB60 000000067F000080000005800C00000096DE-000000067F000080000005800C0000012E0C__0000006FA8EDF3B9-0000007048B1EC09 000000067F000080000005800C000000C000-000000067F000080000005800C0000010000__0000007168C9DFF8 000000067F000080000005800C000000C000-000000067F000080000005800C0000010000__00000072377CDB60 000000067F000080000005800C000000C000-030000000000000000000000000000000002__0000006FAFE25518 000000067F000080000005800C0000010000-000000067F000080000005800C0000014000__0000007168C9DFF8 000000067F000080000005800C0000010000-000000067F000080000005800C0000014000__00000072377CDB60 000000067F000080000005800C0000012E0C-000000067F000080000005800C000001C572__0000006FA8EDF3B9-0000007048B1EC09 000000067F000080000005800C0000014000-000000067F000080000005800C0000018000__0000007168C9DFF8 000000067F000080000005800C0000014000-000000067F000080000005800C0000018000__00000072377CDB60 000000067F000080000005800C0000018000-000000067F000080000005800C000001C000__0000007168C9DFF8 000000067F000080000005800C0000018000-000000067F000080000005800C000001C000__00000072377CDB60 000000067F000080000005800C000001C000-000000067F000080000005800C0000020000__0000007168C9DFF8 000000067F000080000005800C000001C000-000000067F000080000005800C0000020000__00000072377CDB60 000000067F000080000005800C000001C572-000000067F000080000005800C0000025CD8__0000006FA8EDF3B9-0000007048B1EC09 000000067F000080000005800C0000020000-000000067F000080000005800C0000024000__0000007168C9DFF8 000000067F000080000005800C0000020000-000000067F000080000005800C0000024000__00000072377CDB60 000000067F000080000005800C0000024000-000000067F000080000005800C0000028000__0000007168C9DFF8 
000000067F000080000005800C0000024000-000000067F000080000005800C0000028000__00000072377CDB60 000000067F000080000005800C0000025CD8-000000067F000080000005800C000002F40B__0000006FA8EDF3B9-0000007048B1EC09 000000067F000080000005800C0000028000-000000067F000080000005800C000002C000__0000007168C9DFF8 000000067F000080000005800C0000028000-000000067F000080000005800C000002C000__00000072377CDB60 000000067F000080000005800C000002C000-000000067F000080000005800C0000030000__0000007168C9DFF8 000000067F000080000005800C000002C000-000000067F000080000005800C0000030000__00000072377CDB60 000000067F000080000005800C000002F40B-000000067F000080000005800C0000038B1E__0000006FA8EDF3B9-0000007048B1EC09 000000067F000080000005800C0000030000-000000067F000080000005800C0000034000__0000007168C9DFF8 000000067F000080000005800C0000030000-000000067F000080000005800C0000034000__00000072377CDB60 000000067F000080000005800C0000034000-000000067F000080000005800C0000038000__0000007168C9DFF8 000000067F000080000005800C0000034000-000000067F000080000005800C0000038000__00000072377CDB60 000000067F000080000005800C0000038000-000000067F000080000005800C000003C000__0000007168C9DFF8 000000067F000080000005800C0000038000-000000067F000080000005800C000003C000__00000072377CDB60 000000067F000080000005800C0000038B1E-000000067F000080000005800C0000042284__0000006FA8EDF3B9-0000007048B1EC09 000000067F000080000005800C000003C000-000000067F000080000005800C0000040000__0000007168C9DFF8 000000067F000080000005800C000003C000-000000067F000080000005800C0000040000__00000072377CDB60 000000067F000080000005800C0000040000-000000067F000080000005800C0000044000__0000007168C9DFF8 000000067F000080000005800C0000040000-000000067F000080000005800C0000044000__00000072377CDB60 000000067F000080000005800C0000042284-000000067F000080000005800C000004B9EA__0000006FA8EDF3B9-0000007048B1EC09 000000067F000080000005800C0000044000-000000067F000080000005800C0000048000__0000007168C9DFF8 000000067F000080000005800C0000044000-000000067F000080000005800C0000048000__00000072377CDB60 
000000067F000080000005800C0000048000-000000067F000080000005800C000004C000__0000007168C9DFF8 000000067F000080000005800C0000048000-000000067F000080000005800C000004C000__00000072377CDB60 000000067F000080000005800C000004B9EA-000000067F000080000005800C000005510B__0000006FA8EDF3B9-0000007048B1EC09 000000067F000080000005800C000004C000-000000067F000080000005800C0000050000__0000007168C9DFF8 000000067F000080000005800C000004C000-000000067F000080000005800C0000050000__00000072377CDB60 000000067F000080000005800C0000050000-000000067F000080000005800C0000054000__0000007168C9DFF8 000000067F000080000005800C0000050000-000000067F000080000005800C0000054000__00000072377CDB60 000000067F000080000005800C0000054000-000000067F000080000005800C0000058000__0000007168C9DFF8 000000067F000080000005800C0000054000-000000067F000080000005800C0000058000__00000072377CDB60 000000067F000080000005800C000005510B-000000067F000080000005800C000005E871__0000006FA8EDF3B9-0000007048B1EC09 000000067F000080000005800C0000058000-000000067F000080000005800C000005C000__0000007168C9DFF8 000000067F000080000005800C0000058000-000000067F000080000005800C000005C000__00000072377CDB60 000000067F000080000005800C000005C000-000000067F000080000005800C0000060000__0000007168C9DFF8 000000067F000080000005800C000005C000-000000067F000080000005800C0000060000__00000072377CDB60 000000067F000080000005800C000005CF08-000000067F000080000005800C00000BAF56__00000071F21624D1-000000723877FF21 000000067F000080000005800C000005E871-000000067F000080000005800C0000067F8B__0000006FA8EDF3B9-0000007048B1EC09 000000067F000080000005800C0000060000-000000067F000080000005800C0000064000__0000007168C9DFF8 000000067F000080000005800C0000060000-000000067F000080000005800C0000064000__00000072377CDB60 000000067F000080000005800C0000064000-000000067F000080000005800C0000068000__0000007168C9DFF8 000000067F000080000005800C0000064000-000000067F000080000005800C0000068000__00000072377CDB60 
000000067F000080000005800C0000067F8B-000000067F000080000005800C0100000000__0000006FA8EDF3B9-0000007048B1EC09 000000067F000080000005800C0000068000-000000067F000080000005800C000006C000__0000007168C9DFF8 000000067F000080000005800C0000068000-000000067F000080000005800C000006C000__00000072377CDB60 000000067F000080000005800C000006C000-000000067F000080000005800C0000070000__0000007168C9DFF8 000000067F000080000005800C000006C000-000000067F000080000005800C0000070000__00000072377CDB60 000000067F000080000005800C0000070000-000000067F000080000005800C0000074000__0000007168C9DFF8 000000067F000080000005800C0000070000-000000067F000080000005800C0000074000__00000072377CDB60 000000067F000080000005800C0000071854-000000067F000080000005800C000007AFBA__0000007048B1EC09-00000070E8761431 000000067F000080000005800C0000074000-000000067F000080000005800C0000078000__0000007168C9DFF8 000000067F000080000005800C0000074000-000000067F000080000005800C0000078000__00000072377CDB60 000000067F000080000005800C0000078000-000000067F000080000005800C000007C000__0000007168C9DFF8 000000067F000080000005800C0000078000-000000067F000080000005800C000007C000__00000072377CDB60 000000067F000080000005800C000007AFBA-000000067F000080000005800C0000084720__0000007048B1EC09-00000070E8761431 000000067F000080000005800C000007C000-000000067F000080000005800C0000080000__0000007168C9DFF8 000000067F000080000005800C000007C000-000000067F000080000005800C0000080000__00000072377CDB60 000000067F000080000005800C0000080000-000000067F000080000005800C0000084000__0000007168C9DFF8 000000067F000080000005800C0000080000-000000067F000080000005800C0000084000__00000072377CDB60 000000067F000080000005800C0000084000-000000067F000080000005800C0000088000__0000007168C9DFF8 000000067F000080000005800C0000084000-000000067F000080000005800C0000088000__00000072377CDB60 000000067F000080000005800C0000084720-000000067F000080000005800C000008DE86__0000007048B1EC09-00000070E8761431 000000067F000080000005800C0000088000-000000067F000080000005800C000008C000__0000007168C9DFF8 
000000067F000080000005800C0000088000-000000067F000080000005800C000008C000__00000072377CDB60 000000067F000080000005800C000008C000-000000067F000080000005800C0000090000__0000007168C9DFF8 000000067F000080000005800C000008C000-000000067F000080000005800C0000090000__00000072377CDB60 000000067F000080000005800C000008DE86-000000067F000080000005800C00000975A6__0000007048B1EC09-00000070E8761431 000000067F000080000005800C0000090000-000000067F000080000005800C0000094000__0000007168C9DFF8 000000067F000080000005800C0000090000-000000067F000080000005800C0000094000__00000072377CDB60 000000067F000080000005800C0000094000-000000067F000080000005800C0000098000__0000007168C9DFF8 000000067F000080000005800C0000094000-000000067F000080000005800C0000098000__00000072377CDB60 000000067F000080000005800C00000975A6-000000067F000080000005800C00000A0D0C__0000007048B1EC09-00000070E8761431 000000067F000080000005800C0000098000-000000067F000080000005800C000009C000__0000007168C9DFF8 000000067F000080000005800C0000098000-000000067F000080000005800C000009C000__00000072377CDB60 000000067F000080000005800C000009C000-000000067F000080000005800C00000A0000__0000007168C9DFF8 000000067F000080000005800C000009C000-000000067F000080000005800C00000A0000__00000072377CDB60 000000067F000080000005800C000009D78D-000000067F000080000005800C0200000018__000000716A103FC9-00000071F21624D1 000000067F000080000005800C00000A0000-000000067F000080000005800C00000A4000__0000007168C9DFF8 000000067F000080000005800C00000A0000-000000067F000080000005800C00000A4000__00000072377CDB60 000000067F000080000005800C00000A0D0C-000000067F000080000005800C00000AA472__0000007048B1EC09-00000070E8761431 000000067F000080000005800C00000A4000-000000067F000080000005800C00000A8000__0000007168C9DFF8 000000067F000080000005800C00000A4000-000000067F000080000005800C00000A8000__00000072377CDB60 000000067F000080000005800C00000A8000-000000067F000080000005800C00000AC000__0000007168C9DFF8 000000067F000080000005800C00000A8000-000000067F000080000005800C00000AC000__00000072377CDB60 
000000067F000080000005800C00000AA472-000000067F000080000005800C00000B3BB4__0000007048B1EC09-00000070E8761431 000000067F000080000005800C00000AC000-000000067F000080000005800C00000B0000__0000007168C9DFF8 000000067F000080000005800C00000AC000-000000067F000080000005800C00000B0000__00000072377CDB60 000000067F000080000005800C00000B0000-000000067F000080000005800C00000B4000__0000007168C9DFF8 000000067F000080000005800C00000B0000-000000067F000080000005800C00000B4000__00000072377CDB60 000000067F000080000005800C00000B3BB4-000000067F000080000005800C00000BD30B__0000007048B1EC09-00000070E8761431 000000067F000080000005800C00000B4000-000000067F000080000005800C00000B8000__0000007168C9DFF8 000000067F000080000005800C00000B4000-000000067F000080000005800C00000B8000__00000072377CDB60 000000067F000080000005800C00000B8000-000000067F000080000005800C00000BC000__0000007168C9DFF8 000000067F000080000005800C00000B8000-000000067F000080000005800C00000BC000__00000072377CDB60 000000067F000080000005800C00000BAF5F-000000067F000080000005801400000007C1__00000071F21624D1-000000723877FF21 000000067F000080000005800C00000BC000-000000067F000080000005800C00000C0000__0000007168C9DFF8 000000067F000080000005800C00000BC000-000000067F000080000005800C00000C0000__00000072377CDB60 000000067F000080000005800C00000BD30B-000000067F000080000005800C00000C6A32__0000007048B1EC09-00000070E8761431 000000067F000080000005800C00000C0000-000000067F000080000005800C00000C4000__0000007168C9DFF8 000000067F000080000005800C00000C0000-000000067F000080000005800C00000C4000__00000072377CDB60 000000067F000080000005800C00000C4000-000000067F000080000005800C00000C8000__0000007168C9DFF8 000000067F000080000005800C00000C4000-000000067F000080000005800C00000C8000__00000072377CDB60 000000067F000080000005800C00000C6A32-000000067F000080000005800C0100000000__0000007048B1EC09-00000070E8761431 000000067F000080000005800C00000C8000-000000067F000080000005800C00000CC000__0000007168C9DFF8 
000000067F000080000005800C00000C8000-000000067F000080000005800C00000CC000__00000072377CDB60 000000067F000080000005800C00000CC000-000000067F000080000005800C00000D0000__0000007168C9DFF8 000000067F000080000005800C00000CC000-000000067F000080000005800C00000D0000__00000072377CDB60 000000067F000080000005800C00000CDE2D-000000067F000080000005800C00000D754D__00000070E8761431-000000716A103FC9 000000067F000080000005800C00000D0000-000000067F000080000005800C00000D4000__0000007168C9DFF8 000000067F000080000005800C00000D0000-000000067F000080000005800C00000D4000__00000072377CDB60 000000067F000080000005800C00000D4000-000000067F000080000005800C00000D8000__0000007168C9DFF8 000000067F000080000005800C00000D4000-000000067F000080000005800C00000D8000__00000072377CDB60 000000067F000080000005800C00000D754D-000000067F000080000005800C00000E0CB3__00000070E8761431-000000716A103FC9 000000067F000080000005800C00000D8000-000000067F000080000005800C00000DC000__0000007168C9DFF8 000000067F000080000005800C00000D8000-000000067F000080000005800C00000DC000__00000072377CDB60 000000067F000080000005800C00000DC000-000000067F000080000005800C00000E0000__0000007168C9DFF8 000000067F000080000005800C00000DC000-000000067F000080000005800C00000E0000__00000072377CDB60 000000067F000080000005800C00000E0000-000000067F000080000005800C00000E4000__0000007168C9DFF8 000000067F000080000005800C00000E0000-000000067F000080000005800C00000E4000__00000072377CDB60 000000067F000080000005800C00000E0CB3-000000067F000080000005800C00000EA409__00000070E8761431-000000716A103FC9 000000067F000080000005800C00000E4000-000000067F000080000005800C00000E8000__0000007168C9DFF8 000000067F000080000005800C00000E4000-000000067F000080000005800C00000E8000__00000072377CDB60 000000067F000080000005800C00000E8000-000000067F000080000005800C00000EC000__0000007168C9DFF8 000000067F000080000005800C00000E8000-000000067F000080000005800C00000EC000__00000072377CDB60 000000067F000080000005800C00000EA409-000000067F000080000005800C00000F3B4B__00000070E8761431-000000716A103FC9 
000000067F000080000005800C00000EC000-000000067F000080000005800C00000F0000__0000007168C9DFF8 000000067F000080000005800C00000EC000-000000067F000080000005800C00000F0000__00000072377CDB60 000000067F000080000005800C00000F0000-000000067F000080000005800C00000F4000__0000007168C9DFF8 000000067F000080000005800C00000F0000-000000067F000080000005800C00000F4000__00000072377CDB60 000000067F000080000005800C00000F3B4B-000000067F000080000005800C00000FD2B1__00000070E8761431-000000716A103FC9 000000067F000080000005800C00000F4000-000000067F000080000005800C00000F8000__0000007168C9DFF8 000000067F000080000005800C00000F4000-000000067F000080000005800C00000F8000__00000072377CDB60 000000067F000080000005800C00000F8000-000000067F000080000005800C00000FC000__0000007168C9DFF8 000000067F000080000005800C00000F8000-000000067F000080000005800C00000FC000__00000072377CDB60 000000067F000080000005800C00000FC000-000000067F000080000005800C0000100000__0000007168C9DFF8 000000067F000080000005800C00000FC000-000000067F000080000005800C0000100000__00000072377CDB60 000000067F000080000005800C00000FD2B1-000000067F000080000005800C00001069D8__00000070E8761431-000000716A103FC9 000000067F000080000005800C0000100000-000000067F000080000005800C0000104000__0000007168C9DFF8 000000067F000080000005800C0000100000-000000067F000080000005800C0000104000__00000072377CDB60 000000067F000080000005800C0000104000-000000067F000080000005800C0000108000__0000007168C9DFF8 000000067F000080000005800C0000104000-000000067F000080000005800C0000108000__00000072377CDB60 000000067F000080000005800C00001069D8-000000067F000080000005800C000011010C__00000070E8761431-000000716A103FC9 000000067F000080000005800C0000108000-000000067F000080000005800C000010C000__0000007168C9DFF8 000000067F000080000005800C0000108000-000000067F000080000005800C000010C000__00000072377CDB60 000000067F000080000005800C000010C000-000000067F000080000005800C0000110000__0000007168C9DFF8 000000067F000080000005800C000010C000-000000067F000080000005800C0000110000__00000072377CDB60 
000000067F000080000005800C0000110000-000000067F00008000000580120100000000__00000072377CDB60 000000067F000080000005800C0000110000-030000000000000000000000000000000002__0000007168C9DFF8 000000067F000080000005800C000011010C-01000000000000000100000002000000001E__00000070E8761431-000000716A103FC9 000000067F000080000005800C0200000018-000000067F000080000005801400000059BE__000000716A103FC9-00000071F21624D1 000000067F00008000000580140000000000-000000067F00008000000580140000004000__00000072377CDB60 000000067F000080000005801400000007C3-000000067F00008000000580140000020462__00000071F21624D1-000000723877FF21 000000067F00008000000580140000004000-000000067F00008000000580140000008000__00000072377CDB60 000000067F000080000005801400000059BE-000000067F0000800000058014000000BF38__000000716A103FC9-00000071F21624D1 000000067F00008000000580140000008000-000000067F0000800000058014000000C000__00000072377CDB60 000000067F0000800000058014000000BF38-000000067F00008000000580140000012530__000000716A103FC9-00000071F21624D1 000000067F0000800000058014000000C000-000000067F00008000000580140000010000__00000072377CDB60 000000067F00008000000580140000010000-000000067F00008000000580140000014000__00000072377CDB60 000000067F00008000000580140000012530-000000067F00008000000580140000018B50__000000716A103FC9-00000071F21624D1 000000067F00008000000580140000014000-000000067F00008000000580140000018000__00000072377CDB60 000000067F00008000000580140000018000-000000067F0000800000058014000001C000__00000072377CDB60 000000067F00008000000580140000018B50-000000067F0000800000058014000001F0D3__000000716A103FC9-00000071F21624D1 000000067F0000800000058014000001C000-000000067F00008000000580140000020000__00000072377CDB60 000000067F0000800000058014000001F0D3-000000067F0000800000058014000002562B__000000716A103FC9-00000071F21624D1 000000067F00008000000580140000020000-000000067F00008000000580140000024000__00000072377CDB60 000000067F00008000000580140000020464-030000000000000000000000000000000002__00000071F21624D1-000000723877FF21 
000000067F00008000000580140000024000-000000067F00008000000580140000028000__00000072377CDB60 000000067F0000800000058014000002562B-000000067F0000800000058014000002BC37__000000716A103FC9-00000071F21624D1 000000067F00008000000580140000028000-000000067F0000800000058014000002C000__00000072377CDB60 000000067F0000800000058014000002BC37-030000000000000000000000000000000002__000000716A103FC9-00000071F21624D1 000000067F0000800000058014000002C000-030000000000000000000000000000000002__00000072377CDB60 000000067F000080000005A00C0000007614-000000067F000080000005A00C000000ED44__000000723877FF21-00000072A0D7CEA1 000000067F000080000005A00C000000ED44-000000067F000080000005A00C0000016337__000000723877FF21-00000072A0D7CEA1 000000067F000080000005A00C0000016337-000000067F000080000005A014000000148C__000000723877FF21-00000072A0D7CEA1 000000067F000080000005A014000000148C-000000067F000080000005C00C0000003207__000000723877FF21-00000072A0D7CEA1 000000067F000080000005C00C0000003207-000000067F000080000005C00C000000C96D__000000723877FF21-00000072A0D7CEA1 000000067F000080000005C00C000000C96D-030000000000000000000000000000000002__000000723877FF21-00000072A0D7CEA1 000000067F000080000005C00C0000016516-000000067F000080000005C0140000001694__00000072A0D7CEA1-0000007318DDE691 000000067F000080000005C0140000001694-000000067F000080000005E00C000000360C__00000072A0D7CEA1-0000007318DDE691 000000067F000080000005E00C0000000000-000000067F000080000005E00C0000004000__00000073AF75E930 000000067F000080000005E00C0000000000-000000067F000080000005E00C0000004000__000000756884A510 000000067F000080000005E00C000000360C-000000067F000080000005E00C000000CD72__00000072A0D7CEA1-0000007318DDE691 000000067F000080000005E00C0000004000-000000067F000080000005E00C0000008000__00000073AF75E930 000000067F000080000005E00C0000004000-000000067F000080000005E00C0000008000__000000756884A510 000000067F000080000005E00C0000008000-000000067F000080000005E00C000000C000__00000073AF75E930 
000000067F000080000005E00C0000008000-000000067F000080000005E00C000000C000__000000756884A510 000000067F000080000005E00C000000C000-000000067F000080000005E00C0000010000__00000073AF75E930 000000067F000080000005E00C000000C000-000000067F000080000005E00C0000010000__000000756884A510 000000067F000080000005E00C000000CD72-000000067F000080000005E00C00000164D8__00000072A0D7CEA1-0000007318DDE691 000000067F000080000005E00C0000010000-000000067F000080000005E00C0000014000__00000073AF75E930 000000067F000080000005E00C0000010000-000000067F000080000005E00C0000014000__000000756884A510 000000067F000080000005E00C0000014000-000000067F000080000005E00C0000018000__00000073AF75E930 000000067F000080000005E00C0000014000-000000067F000080000005E00C0000018000__000000756884A510 000000067F000080000005E00C00000164D8-000000067F000080000005E00C000001FC0B__00000072A0D7CEA1-0000007318DDE691 000000067F000080000005E00C0000018000-000000067F000080000005E00C000001C000__00000073AF75E930 000000067F000080000005E00C0000018000-000000067F000080000005E00C000001C000__000000756884A510 000000067F000080000005E00C000001C000-000000067F000080000005E00C0000020000__00000073AF75E930 000000067F000080000005E00C000001C000-000000067F000080000005E00C0000020000__000000756884A510 000000067F000080000005E00C000001FC0B-000000067F000080000005E00C0000029319__00000072A0D7CEA1-0000007318DDE691 000000067F000080000005E00C0000020000-000000067F000080000005E00C0000024000__00000073AF75E930 000000067F000080000005E00C0000020000-000000067F000080000005E00C0000024000__000000756884A510 000000067F000080000005E00C0000024000-000000067F000080000005E00C0000028000__00000073AF75E930 000000067F000080000005E00C0000024000-000000067F000080000005E00C0000028000__000000756884A510 000000067F000080000005E00C0000028000-000000067F000080000005E00C000002C000__00000073AF75E930 000000067F000080000005E00C0000028000-000000067F000080000005E00C000002C000__000000756884A510 000000067F000080000005E00C0000029319-030000000000000000000000000000000002__00000072A0D7CEA1-0000007318DDE691 
000000067F000080000005E00C000002C000-000000067F000080000005E00C0000030000__00000073AF75E930 000000067F000080000005E00C000002C000-000000067F000080000005E00C0000030000__000000756884A510 000000067F000080000005E00C0000030000-000000067F000080000005E00C0000034000__00000073AF75E930 000000067F000080000005E00C0000030000-000000067F000080000005E00C0000034000__000000756884A510 000000067F000080000005E00C0000034000-000000067F000080000005E00C0000038000__00000073AF75E930 000000067F000080000005E00C0000034000-000000067F000080000005E00C0000038000__000000756884A510 000000067F000080000005E00C0000038000-000000067F000080000005E00C000003C000__00000073AF75E930 000000067F000080000005E00C0000038000-000000067F000080000005E00C000003C000__000000756884A510 000000067F000080000005E00C00000385D9-000000067F000080000005E00C0000041D0A__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C000003C000-000000067F000080000005E00C0000040000__00000073AF75E930 000000067F000080000005E00C000003C000-000000067F000080000005E00C0000040000__000000756884A510 000000067F000080000005E00C0000040000-000000067F000080000005E00C0000044000__00000073AF75E930 000000067F000080000005E00C0000040000-000000067F000080000005E00C0000044000__000000756884A510 000000067F000080000005E00C0000041D0A-000000067F000080000005E00C000004B470__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C0000044000-000000067F000080000005E00C0000048000__00000073AF75E930 000000067F000080000005E00C0000044000-000000067F000080000005E00C0000048000__000000756884A510 000000067F000080000005E00C0000048000-000000067F000080000005E00C000004C000__00000073AF75E930 000000067F000080000005E00C0000048000-000000067F000080000005E00C000004C000__000000756884A510 000000067F000080000005E00C000004B470-000000067F000080000005E00C0000054BA9__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C000004C000-000000067F000080000005E00C0000050000__00000073AF75E930 000000067F000080000005E00C000004C000-000000067F000080000005E00C0000050000__000000756884A510 
000000067F000080000005E00C0000050000-000000067F000080000005E00C0000054000__00000073AF75E930 000000067F000080000005E00C0000050000-000000067F000080000005E00C0000054000__000000756884A510 000000067F000080000005E00C000005017A-000000067F000080000005E00C000009FEAD__000000751253A4C1-00000075687C3009 000000067F000080000005E00C0000054000-000000067F000080000005E00C0000058000__00000073AF75E930 000000067F000080000005E00C0000054000-000000067F000080000005E00C0000058000__000000756884A510 000000067F000080000005E00C0000054BA9-000000067F000080000005E00C000005E30B__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C0000058000-000000067F000080000005E00C000005C000__00000073AF75E930 000000067F000080000005E00C0000058000-000000067F000080000005E00C000005C000__000000756884A510 000000067F000080000005E00C000005C000-000000067F000080000005E00C0000060000__00000073AF75E930 000000067F000080000005E00C000005C000-000000067F000080000005E00C0000060000__000000756884A510 000000067F000080000005E00C000005E30B-000000067F000080000005E00C0000067A2C__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C0000060000-000000067F000080000005E00C0000064000__00000073AF75E930 000000067F000080000005E00C0000060000-000000067F000080000005E00C0000064000__000000756884A510 000000067F000080000005E00C0000064000-000000067F000080000005E00C0000068000__00000073AF75E930 000000067F000080000005E00C0000064000-000000067F000080000005E00C0000068000__000000756884A510 000000067F000080000005E00C0000067A2C-000000067F000080000005E00C0000071187__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C0000068000-000000067F000080000005E00C000006C000__00000073AF75E930 000000067F000080000005E00C0000068000-000000067F000080000005E00C000006C000__000000756884A510 000000067F000080000005E00C000006C000-000000067F000080000005E00C0000070000__00000073AF75E930 000000067F000080000005E00C000006C000-000000067F000080000005E00C0000070000__000000756884A510 000000067F000080000005E00C0000070000-000000067F000080000005E00C0000074000__00000073AF75E930 
000000067F000080000005E00C0000070000-000000067F000080000005E00C0000074000__000000756884A510 000000067F000080000005E00C0000071187-000000067F000080000005E00C000007A8ED__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C0000074000-000000067F000080000005E00C0000078000__00000073AF75E930 000000067F000080000005E00C0000074000-000000067F000080000005E00C0000078000__000000756884A510 000000067F000080000005E00C0000078000-000000067F000080000005E00C000007C000__00000073AF75E930 000000067F000080000005E00C0000078000-000000067F000080000005E00C000007C000__000000756884A510 000000067F000080000005E00C000007A8ED-000000067F000080000005E00C000008400B__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C000007C000-000000067F000080000005E00C0000080000__00000073AF75E930 000000067F000080000005E00C000007C000-000000067F000080000005E00C0000080000__000000756884A510 000000067F000080000005E00C0000080000-000000067F000080000005E00C0000084000__00000073AF75E930 000000067F000080000005E00C0000080000-000000067F000080000005E00C0000084000__000000756884A510 000000067F000080000005E00C0000084000-000000067F000080000005E00C0000088000__00000073AF75E930 000000067F000080000005E00C0000084000-000000067F000080000005E00C0000088000__000000756884A510 000000067F000080000005E00C000008400B-000000067F000080000005E00C000008D771__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C0000088000-000000067F000080000005E00C000008C000__000000756884A510 000000067F000080000005E00C0000088000-030000000000000000000000000000000002__00000073AF75E930 000000067F000080000005E00C000008C000-000000067F000080000005E00C0000090000__000000756884A510 000000067F000080000005E00C000008D771-000000067F000080000005E00C0000096ED7__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C0000090000-000000067F000080000005E00C0000094000__000000756884A510 000000067F000080000005E00C0000094000-000000067F000080000005E00C0000098000__000000756884A510 
000000067F000080000005E00C0000096ED7-000000067F000080000005E00C00000A060B__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C0000098000-000000067F000080000005E00C000009C000__000000756884A510 000000067F000080000005E00C000009C000-000000067F000080000005E00C00000A0000__000000756884A510 000000067F000080000005E00C000009FEB2-000000067F000080000005E00C00000EF4ED__000000751253A4C1-00000075687C3009 000000067F000080000005E00C00000A0000-000000067F000080000005E00C00000A4000__000000756884A510 000000067F000080000005E00C00000A060B-000000067F000080000005E00C00000A9D71__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C00000A4000-000000067F000080000005E00C00000A8000__000000756884A510 000000067F000080000005E00C00000A8000-000000067F000080000005E00C00000AC000__000000756884A510 000000067F000080000005E00C00000A9D71-000000067F000080000005E00C00000B34D7__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C00000AC000-000000067F000080000005E00C00000B0000__000000756884A510 000000067F000080000005E00C00000AF576-000000067F000080000005E00C0200000023__0000007497B01FF9-000000751253A4C1 000000067F000080000005E00C00000B0000-000000067F000080000005E00C00000B4000__000000756884A510 000000067F000080000005E00C00000B34D7-000000067F000080000005E00C00000BCC0C__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C00000B4000-000000067F000080000005E00C00000B8000__000000756884A510 000000067F000080000005E00C00000B8000-000000067F000080000005E00C00000BC000__000000756884A510 000000067F000080000005E00C00000BC000-000000067F000080000005E00C00000C0000__000000756884A510 000000067F000080000005E00C00000BCC0C-000000067F000080000005E00C00000C6336__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C00000C0000-000000067F000080000005E00C00000C4000__000000756884A510 000000067F000080000005E00C00000C4000-000000067F000080000005E00C00000C8000__000000756884A510 000000067F000080000005E00C00000C6336-000000067F000080000005E00C00000CFA9C__0000007318DDE691-0000007497B01FF9 
000000067F000080000005E00C00000C8000-000000067F000080000005E00C00000CC000__000000756884A510 000000067F000080000005E00C00000CC000-000000067F000080000005E00C00000D0000__000000756884A510 000000067F000080000005E00C00000CFA9C-000000067F000080000005E00C00000D91AB__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C00000D0000-000000067F000080000005E00C00000D4000__000000756884A510 000000067F000080000005E00C00000D4000-000000067F000080000005E00C00000D8000__000000756884A510 000000067F000080000005E00C00000D8000-000000067F000080000005E00C00000DC000__000000756884A510 000000067F000080000005E00C00000D91AB-000000067F000080000005E00C00000E2911__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C00000DC000-000000067F000080000005E00C00000E0000__000000756884A510 000000067F000080000005E00C00000E0000-000000067F000080000005E00C00000E4000__000000756884A510 000000067F000080000005E00C00000E2911-000000067F000080000005E00C00000EC077__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C00000E4000-000000067F000080000005E00C00000E8000__000000756884A510 000000067F000080000005E00C00000E8000-000000067F000080000005E00C00000EC000__000000756884A510 000000067F000080000005E00C00000EC000-000000067F000080000005E00C00000F0000__000000756884A510 000000067F000080000005E00C00000EC077-000000067F000080000005E00C00000F57A8__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C00000EF4F1-000000067F000080000005E014000000BDDE__000000751253A4C1-00000075687C3009 000000067F000080000005E00C00000F0000-000000067F000080000005E00C00000F4000__000000756884A510 000000067F000080000005E00C00000F4000-000000067F000080000005E00C00000F8000__000000756884A510 000000067F000080000005E00C00000F57A8-000000067F000080000005E00C00000FEF0A__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C00000F8000-000000067F000080000005E00C00000FC000__000000756884A510 000000067F000080000005E00C00000FC000-000000067F000080000005E00C0000100000__000000756884A510 
000000067F000080000005E00C00000FEF0A-000000067F000080000005E00C000010862B__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C0000100000-000000067F000080000005E00C0000104000__000000756884A510 000000067F000080000005E00C0000104000-000000067F000080000005E00C0000108000__000000756884A510 000000067F000080000005E00C0000108000-000000067F000080000005E00C000010C000__000000756884A510 000000067F000080000005E00C000010862B-000000067F000080000005E00C0000111C20__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C000010C000-000000067F000080000005E00C0000110000__000000756884A510 000000067F000080000005E00C0000110000-000000067F000080000005E0120100000000__000000756884A510 000000067F000080000005E00C00FFFFFFFF-010000000000000001000000030000000002__0000007318DDE691-0000007497B01FF9 000000067F000080000005E00C02FFFFFFFF-000000067F000080000005E0140000006C41__0000007497B01FF9-000000751253A4C1 000000067F000080000005E0140000000000-000000067F000080000005E0140000004000__000000756884A510 000000067F000080000005E0140000004000-000000067F000080000005E0140000008000__000000756884A510 000000067F000080000005E0140000006C41-000000067F000080000005E014000000D890__0000007497B01FF9-000000751253A4C1 000000067F000080000005E0140000008000-000000067F000080000005E014000000C000__000000756884A510 000000067F000080000005E014000000BDDE-000000067F000080000005E0140000023A18__000000751253A4C1-00000075687C3009 000000067F000080000005E014000000C000-000000067F000080000005E0140000010000__000000756884A510 000000067F000080000005E014000000D890-000000067F000080000005E01400000144C8__0000007497B01FF9-000000751253A4C1 000000067F000080000005E0140000010000-000000067F000080000005E0140000014000__000000756884A510 000000067F000080000005E0140000014000-000000067F000080000005E0140000018000__000000756884A510 000000067F000080000005E01400000144C8-000000067F000080000005E014000001B1AC__0000007497B01FF9-000000751253A4C1 000000067F000080000005E0140000018000-000000067F000080000005E014000001C000__000000756884A510 
000000067F000080000005E014000001B1AC-000000067F000080000005E0140000021E03__0000007497B01FF9-000000751253A4C1 000000067F000080000005E014000001C000-000000067F000080000005E0140000020000__000000756884A510 000000067F000080000005E0140000020000-000000067F000080000005E0140000024000__000000756884A510 000000067F000080000005E0140000021E03-000000067F000080000005E0140000028A36__0000007497B01FF9-000000751253A4C1 000000067F000080000005E0140000023A18-030000000000000000000000000000000002__000000751253A4C1-00000075687C3009 000000067F000080000005E0140000024000-000000067F000080000005E0140000028000__000000756884A510 000000067F000080000005E0140000028000-000000067F000080000005E014000002C000__000000756884A510 000000067F000080000005E0140000028A36-030000000000000000000000000000000002__0000007497B01FF9-000000751253A4C1 000000067F000080000005E014000002C000-030000000000000000000000000000000002__000000756884A510 000000067F000080000006000C0000000000-000000067F000080000006000C0000004000__00000077B1836CA0 000000067F000080000006000C0000004000-000000067F000080000006000C0000008000__00000077B1836CA0 000000067F000080000006000C0000008000-000000067F000080000006000C000000C000__00000077B1836CA0 000000067F000080000006000C0000008FB7-000000067F000080000006000C000001271D__00000075687C3009-00000075E915EBC9 000000067F000080000006000C000000C000-000000067F000080000006000C0000010000__00000077B1836CA0 000000067F000080000006000C0000010000-000000067F000080000006000C0000014000__00000077B1836CA0 000000067F000080000006000C000001271D-000000067F000080000006000C000001BE83__00000075687C3009-00000075E915EBC9 000000067F000080000006000C0000014000-000000067F000080000006000C0000018000__00000077B1836CA0 000000067F000080000006000C0000018000-000000067F000080000006000C000001C000__00000077B1836CA0 000000067F000080000006000C000001BE83-000000067F000080000006000C00000255B6__00000075687C3009-00000075E915EBC9 000000067F000080000006000C000001C000-000000067F000080000006000C0000020000__00000077B1836CA0 
000000067F000080000006000C0000020000-000000067F000080000006000C0000024000__00000077B1836CA0 000000067F000080000006000C0000024000-000000067F000080000006000C0000028000__00000077B1836CA0 000000067F000080000006000C00000255B6-000000067F000080000006000C000002ED0B__00000075687C3009-00000075E915EBC9 000000067F000080000006000C0000028000-000000067F000080000006000C000002C000__00000077B1836CA0 000000067F000080000006000C000002C000-000000067F000080000006000C0000030000__00000077B1836CA0 000000067F000080000006000C000002ED0B-000000067F000080000006000C000003842B__00000075687C3009-00000075E915EBC9 000000067F000080000006000C0000030000-000000067F000080000006000C0000034000__00000077B1836CA0 000000067F000080000006000C0000034000-000000067F000080000006000C0000038000__00000077B1836CA0 000000067F000080000006000C0000038000-000000067F000080000006000C000003C000__00000077B1836CA0 000000067F000080000006000C000003842B-000000067F000080000006000C0000041B80__00000075687C3009-00000075E915EBC9 000000067F000080000006000C000003C000-000000067F000080000006000C0000040000__00000077B1836CA0 000000067F000080000006000C0000040000-000000067F000080000006000C0000044000__00000077B1836CA0 000000067F000080000006000C0000041B80-000000067F000080000006000C000004B2E6__00000075687C3009-00000075E915EBC9 000000067F000080000006000C0000044000-000000067F000080000006000C0000048000__00000077B1836CA0 000000067F000080000006000C0000048000-000000067F000080000006000C000004C000__0000007739203FF0 000000067F000080000006000C000004B2E6-030000000000000000000000000000000002__00000075687C3009-00000075E915EBC9 000000067F000080000006000C000004BAC2-000000067F000080000006000C00000551F7__00000075E915EBC9-00000076A8CDE8F9 000000067F000080000006000C000004C000-000000067F000080000006000C0000050000__0000007739203FF0 000000067F000080000006000C0000050000-000000067F000080000006000C0000054000__0000007739203FF0 000000067F000080000006000C0000051A05-000000067F000080000006000C00000A4D93__00000077B2AD0F91-0000007805801C41 
000000067F000080000006000C0000054000-000000067F000080000006000C0000058000__0000007739203FF0 000000067F000080000006000C00000551F7-000000067F000080000006000C000005E90B__00000075E915EBC9-00000076A8CDE8F9 000000067F000080000006000C0000058000-000000067F000080000006000C000005C000__0000007739203FF0 000000067F000080000006000C000005C000-000000067F000080000006000C0000060000__0000007739203FF0 000000067F000080000006000C000005E90B-000000067F000080000006000C000006802B__00000075E915EBC9-00000076A8CDE8F9 000000067F000080000006000C0000060000-000000067F000080000006000C0000064000__0000007739203FF0 000000067F000080000006000C0000064000-000000067F000080000006000C0000068000__0000007739203FF0 000000067F000080000006000C0000068000-000000067F000080000006000C000006C000__0000007739203FF0 000000067F000080000006000C000006802B-000000067F000080000006000C0000071782__00000075E915EBC9-00000076A8CDE8F9 000000067F000080000006000C000006C000-000000067F000080000006000C0000070000__0000007739203FF0 000000067F000080000006000C0000070000-000000067F000080000006000C0000074000__0000007739203FF0 000000067F000080000006000C0000071782-000000067F000080000006000C000007AEE8__00000075E915EBC9-00000076A8CDE8F9 000000067F000080000006000C0000074000-000000067F000080000006000C0000078000__0000007739203FF0 000000067F000080000006000C0000078000-000000067F000080000006000C000007C000__0000007739203FF0 000000067F000080000006000C000007AEE8-000000067F000080000006000C000008460B__00000075E915EBC9-00000076A8CDE8F9 000000067F000080000006000C000007C000-000000067F000080000006000C0000080000__0000007739203FF0 000000067F000080000006000C0000080000-000000067F000080000006000C0000084000__0000007739203FF0 000000067F000080000006000C0000084000-000000067F000080000006000C0000088000__0000007739203FF0 000000067F000080000006000C000008460B-000000067F000080000006000C000008DD71__00000075E915EBC9-00000076A8CDE8F9 000000067F000080000006000C0000088000-000000067F000080000006000C000008C000__0000007739203FF0 
000000067F000080000006000C000008C000-000000067F000080000006000C0000090000__0000007739203FF0 000000067F000080000006000C000008DD71-000000067F000080000006000C00000974D7__00000075E915EBC9-00000076A8CDE8F9 000000067F000080000006000C0000090000-000000067F000080000006000C0000094000__0000007739203FF0 000000067F000080000006000C0000094000-000000067F000080000006000C0000098000__0000007739203FF0 000000067F000080000006000C00000974D7-000000067F000080000006000C00000A0C0B__00000075E915EBC9-00000076A8CDE8F9 000000067F000080000006000C0000098000-000000067F000080000006000C000009C000__0000007739203FF0 000000067F000080000006000C000009C000-000000067F000080000006000C00000A0000__0000007739203FF0 000000067F000080000006000C00000A0000-000000067F000080000006000C00000A4000__0000007739203FF0 000000067F000080000006000C00000A0C0B-000000067F000080000006000C00000AA371__00000075E915EBC9-00000076A8CDE8F9 000000067F000080000006000C00000A4000-000000067F000080000006000C00000A8000__0000007739203FF0 000000067F000080000006000C00000A4D95-000000067F000080000006000C00000F7C7B__00000077B2AD0F91-0000007805801C41 000000067F000080000006000C00000A8000-000000067F000080000006000C00000AC000__0000007739203FF0 000000067F000080000006000C00000AA371-000000067F000080000006000C00000B3AD7__00000075E915EBC9-00000076A8CDE8F9 000000067F000080000006000C00000AC000-000000067F000080000006000C00000B0000__0000007739203FF0 000000067F000080000006000C00000B0000-000000067F000080000006000C00000B4000__0000007739203FF0 000000067F000080000006000C00000B3AD7-000000067F000080000006000C00000BD20B__00000075E915EBC9-00000076A8CDE8F9 000000067F000080000006000C00000B4000-000000067F000080000006000C00000B8000__0000007739203FF0 000000067F000080000006000C00000B8000-000000067F000080000006000C00000BC000__0000007739203FF0 000000067F000080000006000C00000BC000-000000067F000080000006000C00000C0000__0000007739203FF0 000000067F000080000006000C00000BD20B-000000067F000080000006000C0100000000__00000075E915EBC9-00000076A8CDE8F9 
000000067F000080000006000C00000C0000-000000067F000080000006000C00000C4000__0000007739203FF0 000000067F000080000006000C00000C3C38-000000067F00008000000600140000001B38__00000077391A8001-00000077B2AD0F91 000000067F000080000006000C00000C4000-000000067F000080000006000C00000C8000__0000007739203FF0 000000067F000080000006000C00000C56C1-000000067F000080000006000C00000CEE0A__00000076A8CDE8F9-00000077391A8001 000000067F000080000006000C00000C8000-000000067F000080000006000C00000CC000__0000007739203FF0 000000067F000080000006000C00000CC000-000000067F000080000006000C00000D0000__0000007739203FF0 000000067F000080000006000C00000CEE0A-000000067F000080000006000C00000D8520__00000076A8CDE8F9-00000077391A8001 000000067F000080000006000C00000D0000-000000067F000080000006000C00000D4000__0000007739203FF0 000000067F000080000006000C00000D4000-000000067F000080000006000C00000D8000__0000007739203FF0 000000067F000080000006000C00000D8000-000000067F000080000006000C00000DC000__0000007739203FF0 000000067F000080000006000C00000D8520-000000067F000080000006000C00000E1C86__00000076A8CDE8F9-00000077391A8001 000000067F000080000006000C00000DC000-000000067F000080000006000C00000E0000__0000007739203FF0 000000067F000080000006000C00000E0000-000000067F000080000006000C00000E4000__0000007739203FF0 000000067F000080000006000C00000E1C86-000000067F000080000006000C00000EB3EC__00000076A8CDE8F9-00000077391A8001 000000067F000080000006000C00000E4000-000000067F000080000006000C00000E8000__0000007739203FF0 000000067F000080000006000C00000E8000-000000067F000080000006000C00000EC000__0000007739203FF0 000000067F000080000006000C00000EB3EC-000000067F000080000006000C00000F4B0C__00000076A8CDE8F9-00000077391A8001 000000067F000080000006000C00000EC000-000000067F000080000006000C00000F0000__0000007739203FF0 000000067F000080000006000C00000F0000-000000067F000080000006000C00000F4000__0000007739203FF0 000000067F000080000006000C00000F4000-000000067F000080000006000C00000F8000__0000007739203FF0 
000000067F000080000006000C00000F4B0C-000000067F000080000006000C00000FE272__00000076A8CDE8F9-00000077391A8001 000000067F000080000006000C00000F7C96-000000067F0000800000060014000000F3A9__00000077B2AD0F91-0000007805801C41 000000067F000080000006000C00000F8000-000000067F000080000006000C00000FC000__0000007739203FF0 000000067F000080000006000C00000FC000-000000067F000080000006000C0000100000__0000007739203FF0 000000067F000080000006000C00000FE272-000000067F000080000006000C000010798F__00000076A8CDE8F9-00000077391A8001 000000067F000080000006000C0000100000-000000067F000080000006000C0000104000__0000007739203FF0 000000067F000080000006000C0000104000-000000067F000080000006000C0000108000__0000007739203FF0 000000067F000080000006000C000010798F-000000067F000080000006000C00001110F5__00000076A8CDE8F9-00000077391A8001 000000067F000080000006000C0000108000-000000067F000080000006000C000010C000__0000007739203FF0 000000067F000080000006000C000010C000-000000067F000080000006000C0000110000__0000007739203FF0 000000067F000080000006000C0000110000-030000000000000000000000000000000002__0000007739203FF0 000000067F000080000006000C00001110F5-010000000000000001000000030000000006__00000076A8CDE8F9-00000077391A8001 000000067F00008000000600140000001B38-000000067F00008000000600140000008758__00000077391A8001-00000077B2AD0F91 000000067F00008000000600140000008758-000000067F0000800000060014000000F32F__00000077391A8001-00000077B2AD0F91 000000067F0000800000060014000000F32F-000000067F00008000000600140000015EDC__00000077391A8001-00000077B2AD0F91 000000067F0000800000060014000000F3A9-000000067F00008000000600140000028656__00000077B2AD0F91-0000007805801C41 000000067F00008000000600140000015EDC-000000067F0000800000060014000001CB12__00000077391A8001-00000077B2AD0F91 000000067F0000800000060014000001CB12-000000067F000080000006001400000236BC__00000077391A8001-00000077B2AD0F91 000000067F000080000006001400000236BC-000000067F0000800000060014000002A294__00000077391A8001-00000077B2AD0F91 
000000067F00008000000600140000028657-030000000000000000000000000000000002__00000077B2AD0F91-0000007805801C41 000000067F0000800000060014000002A294-030000000000000000000000000000000002__00000077391A8001-00000077B2AD0F91 000000067F000080000006200C0000000000-000000067F000080000006200C0000004000__00000078B2CB1C68 000000067F000080000006200C0000004000-000000067F000080000006200C0000008000__00000078B2CB1C68 000000067F000080000006200C0000008000-000000067F000080000006200C000000C000__00000078B2CB1C68 000000067F000080000006200C0000009441-000000067F000080000006200C0000012B8D__0000007805801C41-00000078859FEA11 000000067F000080000006200C000000C000-000000067F000080000006200C0000010000__00000078B2CB1C68 000000067F000080000006200C0000010000-000000067F000080000006200C0000014000__00000078B2CB1C68 000000067F000080000006200C0000012B8D-000000067F000080000006200C000001C2F3__0000007805801C41-00000078859FEA11 000000067F000080000006200C0000014000-000000067F000080000006200C0000018000__00000078B2CB1C68 000000067F000080000006200C0000018000-000000067F000080000006200C000001C000__00000078B2CB1C68 000000067F000080000006200C000001C000-000000067F000080000006200C0000020000__00000078B2CB1C68 000000067F000080000006200C000001C2F3-000000067F000080000006200C0000025A0C__0000007805801C41-00000078859FEA11 000000067F000080000006200C0000020000-000000067F000080000006200C0000024000__00000078B2CB1C68 000000067F000080000006200C0000024000-000000067F000080000006200C0000028000__00000078B2CB1C68 000000067F000080000006200C0000025A0C-000000067F000080000006200C000002F172__0000007805801C41-00000078859FEA11 000000067F000080000006200C0000028000-000000067F000080000006200C000002C000__00000078B2CB1C68 000000067F000080000006200C000002C000-000000067F000080000006200C0000030000__00000078B2CB1C68 000000067F000080000006200C000002F172-000000067F000080000006200C00000388D8__0000007805801C41-00000078859FEA11 000000067F000080000006200C0000030000-000000067F000080000006200C0000034000__00000078B2CB1C68 
000000067F000080000006200C0000034000-000000067F000080000006200C0000038000__00000078B2CB1C68 000000067F000080000006200C0000038000-000000067F000080000006200C000003C000__00000078B2CB1C68 000000067F000080000006200C00000388D8-000000067F000080000006200C0000042009__0000007805801C41-00000078859FEA11 000000067F000080000006200C000003C000-000000067F000080000006200C0000040000__00000078B2CB1C68 000000067F000080000006200C0000040000-000000067F000080000006200C0000044000__00000078B2CB1C68 000000067F000080000006200C0000042009-000000067F000080000006200C000004B76F__0000007805801C41-00000078859FEA11 000000067F000080000006200C0000044000-000000067F000080000006200C0000048000__00000078B2CB1C68 000000067F000080000006200C0000048000-000000067F000080000006200C000004C000__00000078B2CB1C68 000000067F000080000006200C0000048000-000000067F000080000006200C000004C000__0000007AA0A6FB48 000000067F000080000006200C0000048121-000000067F000080000006200C0000090C08__0000007A3F679FA1-0000007AA1DF6639 000000067F000080000006200C000004B76F-030000000000000000000000000000000002__0000007805801C41-00000078859FEA11 000000067F000080000006200C000004BAC9-000000067F000080000006200C00000551FE__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C000004C000-000000067F000080000006200C0000050000__00000078B2CB1C68 000000067F000080000006200C000004C000-000000067F000080000006200C0000050000__0000007AA0A6FB48 000000067F000080000006200C0000050000-000000067F000080000006200C0000054000__00000078B2CB1C68 000000067F000080000006200C0000050000-000000067F000080000006200C0000054000__0000007AA0A6FB48 000000067F000080000006200C0000054000-000000067F000080000006200C0000058000__00000078B2CB1C68 000000067F000080000006200C0000054000-000000067F000080000006200C0000058000__0000007AA0A6FB48 000000067F000080000006200C00000551FE-000000067F000080000006200C000005E90C__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C0000058000-000000067F000080000006200C000005C000__00000078B2CB1C68 
000000067F000080000006200C0000058000-000000067F000080000006200C000005C000__0000007AA0A6FB48 000000067F000080000006200C000005C000-000000067F000080000006200C0000060000__00000078B2CB1C68 000000067F000080000006200C000005C000-000000067F000080000006200C0000060000__0000007AA0A6FB48 000000067F000080000006200C000005E90C-000000067F000080000006200C000006802C__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C0000060000-000000067F000080000006200C0000064000__00000078B2CB1C68 000000067F000080000006200C0000060000-000000067F000080000006200C0000064000__0000007AA0A6FB48 000000067F000080000006200C0000064000-000000067F000080000006200C0000068000__0000007AA0A6FB48 000000067F000080000006200C0000064000-030000000000000000000000000000000002__00000078B2CB1C68 000000067F000080000006200C0000068000-000000067F000080000006200C000006C000__0000007AA0A6FB48 000000067F000080000006200C000006802C-000000067F000080000006200C0000071783__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C000006C000-000000067F000080000006200C0000070000__0000007AA0A6FB48 000000067F000080000006200C0000070000-000000067F000080000006200C0000074000__0000007AA0A6FB48 000000067F000080000006200C0000071783-000000067F000080000006200C000007AEE9__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C0000074000-000000067F000080000006200C0000078000__0000007AA0A6FB48 000000067F000080000006200C0000078000-000000067F000080000006200C000007C000__0000007AA0A6FB48 000000067F000080000006200C000007AEE9-000000067F000080000006200C000008460B__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C000007C000-000000067F000080000006200C0000080000__0000007AA0A6FB48 000000067F000080000006200C0000080000-000000067F000080000006200C0000084000__0000007AA0A6FB48 000000067F000080000006200C0000084000-000000067F000080000006200C0000088000__0000007AA0A6FB48 000000067F000080000006200C000008460B-000000067F000080000006200C000008DD71__00000078859FEA11-00000079C527F0D9 
000000067F000080000006200C0000088000-000000067F000080000006200C000008C000__0000007AA0A6FB48 000000067F000080000006200C000008C000-000000067F000080000006200C0000090000__0000007AA0A6FB48 000000067F000080000006200C000008DD71-000000067F000080000006200C00000974D7__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C0000090000-000000067F000080000006200C0000094000__0000007AA0A6FB48 000000067F000080000006200C0000090C11-000000067F000080000006200C00000DA35B__0000007A3F679FA1-0000007AA1DF6639 000000067F000080000006200C0000094000-000000067F000080000006200C0000098000__0000007AA0A6FB48 000000067F000080000006200C00000974D7-000000067F000080000006200C00000A0C0B__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C0000098000-000000067F000080000006200C000009C000__0000007AA0A6FB48 000000067F000080000006200C000009C000-000000067F000080000006200C00000A0000__0000007AA0A6FB48 000000067F000080000006200C00000A0000-000000067F000080000006200C00000A4000__0000007AA0A6FB48 000000067F000080000006200C00000A0C0B-000000067F000080000006200C00000AA371__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C00000A4000-000000067F000080000006200C00000A8000__0000007AA0A6FB48 000000067F000080000006200C00000A8000-000000067F000080000006200C00000AC000__0000007AA0A6FB48 000000067F000080000006200C00000AA371-000000067F000080000006200C00000B3AD7__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C00000AC000-000000067F000080000006200C00000B0000__0000007AA0A6FB48 000000067F000080000006200C00000B0000-000000067F000080000006200C00000B4000__0000007AA0A6FB48 000000067F000080000006200C00000B3AD7-000000067F000080000006200C00000BD20B__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C00000B4000-000000067F000080000006200C00000B8000__0000007AA0A6FB48 000000067F000080000006200C00000B8000-000000067F000080000006200C00000BC000__0000007AA0A6FB48 000000067F000080000006200C00000BC000-000000067F000080000006200C00000C0000__0000007AA0A6FB48 
000000067F000080000006200C00000BD20B-000000067F000080000006200C00000C6932__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C00000C0000-000000067F000080000006200C00000C4000__0000007AA0A6FB48 000000067F000080000006200C00000C4000-000000067F000080000006200C00000C8000__0000007AA0A6FB48 000000067F000080000006200C00000C6932-000000067F000080000006200C00000D0098__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C00000C8000-000000067F000080000006200C00000CC000__0000007AA0A6FB48 000000067F000080000006200C00000CC000-000000067F000080000006200C00000D0000__0000007AA0A6FB48 000000067F000080000006200C00000D0000-000000067F000080000006200C00000D4000__0000007AA0A6FB48 000000067F000080000006200C00000D0098-000000067F000080000006200C00000D97FE__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C00000D4000-000000067F000080000006200C00000D8000__0000007AA0A6FB48 000000067F000080000006200C00000D8000-000000067F000080000006200C00000DC000__0000007AA0A6FB48 000000067F000080000006200C00000D97FE-000000067F000080000006200C00000E2F0B__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C00000DA36C-000000067F00008000000620140000002D07__0000007A3F679FA1-0000007AA1DF6639 000000067F000080000006200C00000DC000-000000067F000080000006200C00000E0000__0000007AA0A6FB48 000000067F000080000006200C00000E0000-000000067F000080000006200C00000E4000__0000007AA0A6FB48 000000067F000080000006200C00000E2F0B-000000067F000080000006200C00000EC671__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C00000E4000-000000067F000080000006200C00000E8000__0000007AA0A6FB48 000000067F000080000006200C00000E8000-000000067F000080000006200C00000EC000__0000007AA0A6FB48 000000067F000080000006200C00000EC000-000000067F000080000006200C00000F0000__0000007AA0A6FB48 000000067F000080000006200C00000EC671-000000067F000080000006200C00000F5D9F__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C00000F0000-000000067F000080000006200C00000F4000__0000007AA0A6FB48 
000000067F000080000006200C00000F4000-000000067F000080000006200C00000F8000__0000007AA0A6FB48 000000067F000080000006200C00000F5D9F-000000067F000080000006200C00000FF505__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C00000F8000-000000067F000080000006200C00000FC000__0000007AA0A6FB48 000000067F000080000006200C00000FC000-000000067F000080000006200C0000100000__0000007AA0A6FB48 000000067F000080000006200C00000FF505-000000067F000080000006200C0000108C10__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C0000100000-000000067F000080000006200C0000104000__0000007AA0A6FB48 000000067F000080000006200C0000104000-000000067F000080000006200C0000108000__0000007AA0A6FB48 000000067F000080000006200C0000107883-000000067F000080000006200C01000000AF__00000079C527F0D9-0000007A3F679FA1 000000067F000080000006200C0000108000-000000067F000080000006200C000010C000__0000007AA0A6FB48 000000067F000080000006200C0000108C10-000000067F000080000006200C0100000000__00000078859FEA11-00000079C527F0D9 000000067F000080000006200C000010C000-000000067F000080000006200C0000110000__0000007AA0A6FB48 000000067F000080000006200C0000110000-000000067F00008000000620120100000000__0000007AA0A6FB48 000000067F000080000006200C01000000AF-000000067F00008000000620140000004888__00000079C527F0D9-0000007A3F679FA1 000000067F00008000000620140000002D0A-000000067F00008000000620140000016355__0000007A3F679FA1-0000007AA1DF6639 000000067F00008000000620140000004888-000000067F0000800000062014000000BC11__00000079C527F0D9-0000007A3F679FA1 000000067F0000800000062014000000BC11-000000067F00008000000620140000012FA7__00000079C527F0D9-0000007A3F679FA1 000000067F00008000000620140000012FA7-000000067F0000800000062014000001A33D__00000079C527F0D9-0000007A3F679FA1 000000067F00008000000620140000016357-000000067F00008000000620140000029C35__0000007A3F679FA1-0000007AA1DF6639 000000067F0000800000062014000001A33D-000000067F000080000006201400000216B4__00000079C527F0D9-0000007A3F679FA1 
000000067F000080000006201400000216B4-000000067F00008000000620140000028A65__00000079C527F0D9-0000007A3F679FA1 000000067F00008000000620140000028A65-030000000000000000000000000000000002__00000079C527F0D9-0000007A3F679FA1 000000067F00008000000620140000029C38-030000000000000000000000000000000002__0000007A3F679FA1-0000007AA1DF6639 000000067F000080000006400C0000000000-000000067F000080000006400C0000004000__0000007B9877EF40 000000067F000080000006400C0000000000-000000067F000080000006400C0000004000__0000007D41715570 000000067F000080000006400C0000004000-000000067F000080000006400C0000008000__0000007B9877EF40 000000067F000080000006400C0000004000-000000067F000080000006400C0000008000__0000007D41715570 000000067F000080000006400C0000007987-000000067F000080000006400C00000110ED__0000007AA1DF6639-0000007B14D5C521 000000067F000080000006400C0000008000-000000067F000080000006400C000000C000__0000007B9877EF40 000000067F000080000006400C0000008000-000000067F000080000006400C000000C000__0000007D41715570 000000067F000080000006400C000000C000-000000067F000080000006400C0000010000__0000007B9877EF40 000000067F000080000006400C000000C000-000000067F000080000006400C0000010000__0000007D41715570 000000067F000080000006400C0000010000-000000067F000080000006400C0000014000__0000007B9877EF40 000000067F000080000006400C0000010000-000000067F000080000006400C0000014000__0000007D41715570 000000067F000080000006400C00000110ED-000000067F000080000006400C000001A80A__0000007AA1DF6639-0000007B14D5C521 000000067F000080000006400C0000014000-000000067F000080000006400C0000018000__0000007B9877EF40 000000067F000080000006400C0000014000-000000067F000080000006400C0000018000__0000007D41715570 000000067F000080000006400C0000018000-000000067F000080000006400C000001C000__0000007B9877EF40 000000067F000080000006400C0000018000-000000067F000080000006400C000001C000__0000007D41715570 000000067F000080000006400C000001A80A-000000067F000080000006400C0000023F4A__0000007AA1DF6639-0000007B14D5C521 
000000067F000080000006400C000001C000-000000067F000080000006400C0000020000__0000007B9877EF40 000000067F000080000006400C000001C000-000000067F000080000006400C0000020000__0000007D41715570 000000067F000080000006400C0000020000-000000067F000080000006400C0000024000__0000007B9877EF40 000000067F000080000006400C0000020000-000000067F000080000006400C0000024000__0000007D41715570 000000067F000080000006400C0000023F4A-000000067F000080000006400C000002D6B0__0000007AA1DF6639-0000007B14D5C521 000000067F000080000006400C0000024000-000000067F000080000006400C0000028000__0000007B9877EF40 000000067F000080000006400C0000024000-000000067F000080000006400C0000028000__0000007D41715570 000000067F000080000006400C0000028000-000000067F000080000006400C000002C000__0000007B9877EF40 000000067F000080000006400C0000028000-000000067F000080000006400C000002C000__0000007D41715570 000000067F000080000006400C000002C000-000000067F000080000006400C0000030000__0000007B9877EF40 000000067F000080000006400C000002C000-000000067F000080000006400C0000030000__0000007D41715570 000000067F000080000006400C000002D6B0-000000067F000080000006400C0000036DD4__0000007AA1DF6639-0000007B14D5C521 000000067F000080000006400C0000030000-000000067F000080000006400C0000034000__0000007B9877EF40 000000067F000080000006400C0000030000-000000067F000080000006400C0000034000__0000007D41715570 000000067F000080000006400C0000034000-000000067F000080000006400C0000038000__0000007B9877EF40 000000067F000080000006400C0000034000-000000067F000080000006400C0000038000__0000007D41715570 000000067F000080000006400C0000036DD4-000000067F000080000006400C000004050A__0000007AA1DF6639-0000007B14D5C521 000000067F000080000006400C0000038000-000000067F000080000006400C000003C000__0000007B9877EF40 000000067F000080000006400C0000038000-000000067F000080000006400C000003C000__0000007D41715570 000000067F000080000006400C000003C000-000000067F000080000006400C0000040000__0000007B9877EF40 000000067F000080000006400C000003C000-000000067F000080000006400C0000040000__0000007D41715570 
000000067F000080000006400C0000040000-000000067F000080000006400C0000044000__0000007B9877EF40 000000067F000080000006400C0000040000-000000067F000080000006400C0000044000__0000007D41715570 000000067F000080000006400C000004050A-030000000000000000000000000000000002__0000007AA1DF6639-0000007B14D5C521 000000067F000080000006400C0000044000-000000067F000080000006400C0000048000__0000007B9877EF40 000000067F000080000006400C0000044000-000000067F000080000006400C0000048000__0000007D41715570 000000067F000080000006400C0000048000-000000067F000080000006400C000004C000__0000007B9877EF40 000000067F000080000006400C0000048000-000000067F000080000006400C000004C000__0000007D41715570 000000067F000080000006400C000004B4C9-000000067F000080000006400C0000054C01__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C000004C000-000000067F000080000006400C0000050000__0000007B9877EF40 000000067F000080000006400C000004C000-000000067F000080000006400C0000050000__0000007D41715570 000000067F000080000006400C0000050000-000000067F000080000006400C0000054000__0000007B9877EF40 000000067F000080000006400C0000050000-000000067F000080000006400C0000054000__0000007D41715570 000000067F000080000006400C00000525C4-000000067F000080000006400C00000A47A7__0000007CEE5A0B91-0000007D41EA8D51 000000067F000080000006400C0000054000-000000067F000080000006400C0000058000__0000007B9877EF40 000000067F000080000006400C0000054000-000000067F000080000006400C0000058000__0000007D41715570 000000067F000080000006400C0000054C01-000000067F000080000006400C000005E30C__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C0000058000-000000067F000080000006400C000005C000__0000007B9877EF40 000000067F000080000006400C0000058000-000000067F000080000006400C000005C000__0000007D41715570 000000067F000080000006400C000005C000-000000067F000080000006400C0000060000__0000007B9877EF40 000000067F000080000006400C000005C000-000000067F000080000006400C0000060000__0000007D41715570 
000000067F000080000006400C000005E30C-000000067F000080000006400C0000067A2C__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C0000060000-000000067F000080000006400C0000064000__0000007B9877EF40 000000067F000080000006400C0000060000-000000067F000080000006400C0000064000__0000007D41715570 000000067F000080000006400C0000064000-000000067F000080000006400C0000068000__0000007B9877EF40 000000067F000080000006400C0000064000-000000067F000080000006400C0000068000__0000007D41715570 000000067F000080000006400C0000067A2C-000000067F000080000006400C0000071187__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C0000068000-000000067F000080000006400C000006C000__0000007B9877EF40 000000067F000080000006400C0000068000-000000067F000080000006400C000006C000__0000007D41715570 000000067F000080000006400C000006C000-000000067F000080000006400C0000070000__0000007B9877EF40 000000067F000080000006400C000006C000-000000067F000080000006400C0000070000__0000007D41715570 000000067F000080000006400C0000070000-000000067F000080000006400C0000074000__0000007B9877EF40 000000067F000080000006400C0000070000-000000067F000080000006400C0000074000__0000007D41715570 000000067F000080000006400C0000071187-000000067F000080000006400C000007A8ED__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C0000074000-000000067F000080000006400C0000078000__0000007B9877EF40 000000067F000080000006400C0000074000-000000067F000080000006400C0000078000__0000007D41715570 000000067F000080000006400C0000078000-000000067F000080000006400C000007C000__0000007B9877EF40 000000067F000080000006400C0000078000-000000067F000080000006400C000007C000__0000007D41715570 000000067F000080000006400C000007A8ED-000000067F000080000006400C000008400B__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C000007C000-000000067F000080000006400C0000080000__0000007B9877EF40 000000067F000080000006400C000007C000-000000067F000080000006400C0000080000__0000007D41715570 000000067F000080000006400C0000080000-000000067F000080000006400C0000084000__0000007B9877EF40 
000000067F000080000006400C0000080000-000000067F000080000006400C0000084000__0000007D41715570 000000067F000080000006400C0000084000-000000067F000080000006400C0000088000__0000007B9877EF40 000000067F000080000006400C0000084000-000000067F000080000006400C0000088000__0000007D41715570 000000067F000080000006400C000008400B-000000067F000080000006400C000008D771__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C0000088000-000000067F000080000006400C000008C000__0000007B9877EF40 000000067F000080000006400C0000088000-000000067F000080000006400C000008C000__0000007D41715570 000000067F000080000006400C000008C000-000000067F000080000006400C0000090000__0000007B9877EF40 000000067F000080000006400C000008C000-000000067F000080000006400C0000090000__0000007D41715570 000000067F000080000006400C000008D771-000000067F000080000006400C0000096ED7__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C0000090000-000000067F000080000006400C0000094000__0000007D41715570 000000067F000080000006400C0000090000-030000000000000000000000000000000002__0000007B9877EF40 000000067F000080000006400C0000094000-000000067F000080000006400C0000098000__0000007D41715570 000000067F000080000006400C0000096ED7-000000067F000080000006400C00000A060B__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C0000098000-000000067F000080000006400C000009C000__0000007D41715570 000000067F000080000006400C000009C000-000000067F000080000006400C00000A0000__0000007D41715570 000000067F000080000006400C00000A0000-000000067F000080000006400C00000A4000__0000007D41715570 000000067F000080000006400C00000A060B-000000067F000080000006400C00000A9D71__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C00000A4000-000000067F000080000006400C00000A8000__0000007D41715570 000000067F000080000006400C00000A47B1-000000067F000080000006400C00000F593E__0000007CEE5A0B91-0000007D41EA8D51 000000067F000080000006400C00000A8000-000000067F000080000006400C00000AC000__0000007D41715570 
000000067F000080000006400C00000A887C-000000067F000080000006400C020000001F__0000007C73B53FC9-0000007CEE5A0B91 000000067F000080000006400C00000A9D71-000000067F000080000006400C00000B34D7__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C00000AC000-000000067F000080000006400C00000B0000__0000007D41715570 000000067F000080000006400C00000B0000-000000067F000080000006400C00000B4000__0000007D41715570 000000067F000080000006400C00000B34D7-000000067F000080000006400C00000BCC0C__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C00000B4000-000000067F000080000006400C00000B8000__0000007D41715570 000000067F000080000006400C00000B8000-000000067F000080000006400C00000BC000__0000007D41715570 000000067F000080000006400C00000BC000-000000067F000080000006400C00000C0000__0000007D41715570 000000067F000080000006400C00000BCC0C-000000067F000080000006400C00000C6336__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C00000C0000-000000067F000080000006400C00000C4000__0000007D41715570 000000067F000080000006400C00000C4000-000000067F000080000006400C00000C8000__0000007D41715570 000000067F000080000006400C00000C6336-000000067F000080000006400C00000CFA9C__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C00000C8000-000000067F000080000006400C00000CC000__0000007D41715570 000000067F000080000006400C00000CC000-000000067F000080000006400C00000D0000__0000007D41715570 000000067F000080000006400C00000CFA9C-000000067F000080000006400C00000D91AB__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C00000D0000-000000067F000080000006400C00000D4000__0000007D41715570 000000067F000080000006400C00000D4000-000000067F000080000006400C00000D8000__0000007D41715570 000000067F000080000006400C00000D8000-000000067F000080000006400C00000DC000__0000007D41715570 000000067F000080000006400C00000D91AB-000000067F000080000006400C00000E2911__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C00000DC000-000000067F000080000006400C00000E0000__0000007D41715570 
000000067F000080000006400C00000E0000-000000067F000080000006400C00000E4000__0000007D41715570 000000067F000080000006400C00000E2911-000000067F000080000006400C00000EC077__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C00000E4000-000000067F000080000006400C00000E8000__0000007D41715570 000000067F000080000006400C00000E8000-000000067F000080000006400C00000EC000__0000007D41715570 000000067F000080000006400C00000EC000-000000067F000080000006400C00000F0000__0000007D41715570 000000067F000080000006400C00000EC077-000000067F000080000006400C00000F57A8__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C00000F0000-000000067F000080000006400C00000F4000__0000007D41715570 000000067F000080000006400C00000F4000-000000067F000080000006400C00000F8000__0000007D41715570 000000067F000080000006400C00000F57A8-000000067F000080000006400C00000FEF0A__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C00000F5940-000000067F0000800000064014000000E7FF__0000007CEE5A0B91-0000007D41EA8D51 000000067F000080000006400C00000F8000-000000067F000080000006400C00000FC000__0000007D41715570 000000067F000080000006400C00000FC000-000000067F000080000006400C0000100000__0000007D41715570 000000067F000080000006400C00000FEF0A-000000067F000080000006400C000010862B__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C0000100000-000000067F000080000006400C0000104000__0000007D41715570 000000067F000080000006400C0000104000-000000067F000080000006400C0000108000__0000007D41715570 000000067F000080000006400C0000108000-000000067F000080000006400C000010C000__0000007D41715570 000000067F000080000006400C000010862B-000000067F000080000006400C0000111C20__0000007B14D5C521-0000007C73B53FC9 000000067F000080000006400C000010C000-000000067F000080000006400C0000110000__0000007D41715570 000000067F000080000006400C0000110000-000000067F00008000000640120100000000__0000007D41715570 000000067F000080000006400C00FFFFFFFF-01000000000000000100000003000000000D__0000007B14D5C521-0000007C73B53FC9 
000000067F000080000006400C020000001F-000000067F0000800000064014000000691F__0000007C73B53FC9-0000007CEE5A0B91 000000067F00008000000640140000000000-000000067F00008000000640140000004000__0000007D41715570 000000067F00008000000640140000004000-000000067F00008000000640140000008000__0000007D41715570 000000067F0000800000064014000000691F-000000067F0000800000064014000000D68F__0000007C73B53FC9-0000007CEE5A0B91 000000067F00008000000640140000008000-000000067F0000800000064014000000C000__0000007D41715570 000000067F0000800000064014000000C000-000000067F00008000000640140000010000__0000007D41715570 000000067F0000800000064014000000D68F-000000067F00008000000640140000014406__0000007C73B53FC9-0000007CEE5A0B91 000000067F0000800000064014000000E803-000000067F000080000006401400000274BB__0000007CEE5A0B91-0000007D41EA8D51 000000067F00008000000640140000010000-000000067F00008000000640140000014000__0000007D41715570 000000067F00008000000640140000014000-000000067F00008000000640140000018000__0000007D41715570 000000067F00008000000640140000014406-000000067F0000800000064014000001B192__0000007C73B53FC9-0000007CEE5A0B91 000000067F00008000000640140000018000-000000067F0000800000064014000001C000__0000007D41715570 000000067F0000800000064014000001B192-000000067F00008000000640140000021F03__0000007C73B53FC9-0000007CEE5A0B91 000000067F0000800000064014000001C000-000000067F00008000000640140000020000__0000007D41715570 000000067F00008000000640140000020000-000000067F00008000000640140000024000__0000007D41715570 000000067F00008000000640140000021F03-000000067F00008000000640140000028C6A__0000007C73B53FC9-0000007CEE5A0B91 000000067F00008000000640140000024000-000000067F00008000000640140000028000__0000007D41715570 000000067F000080000006401400000274BF-030000000000000000000000000000000002__0000007CEE5A0B91-0000007D41EA8D51 000000067F00008000000640140000028000-000000067F0000800000064014000002C000__0000007D41715570 000000067F00008000000640140000028C6A-030000000000000000000000000000000002__0000007C73B53FC9-0000007CEE5A0B91 
000000067F0000800000064014000002C000-030000000000000000000000000000000002__0000007D41715570 000000067F000080000006600C0000000000-000000067F000080000006600C0000004000__0000007F12B83FE8 000000067F000080000006600C0000004000-000000067F000080000006600C0000008000__0000007F12B83FE8 000000067F000080000006600C0000008000-000000067F000080000006600C000000C000__0000007F12B83FE8 000000067F000080000006600C0000009381-000000067F000080000006600C0000012AE7__0000007D41EA8D51-0000007DC21DE569 000000067F000080000006600C000000C000-000000067F000080000006600C0000010000__0000007F12B83FE8 000000067F000080000006600C0000010000-000000067F000080000006600C0000014000__0000007F12B83FE8 000000067F000080000006600C0000012AE7-000000067F000080000006600C000001C20B__0000007D41EA8D51-0000007DC21DE569 000000067F000080000006600C0000014000-000000067F000080000006600C0000018000__0000007F12B83FE8 000000067F000080000006600C0000018000-000000067F000080000006600C000001C000__0000007F12B83FE8 000000067F000080000006600C000001C000-000000067F000080000006600C0000020000__0000007F12B83FE8 000000067F000080000006600C000001C20B-000000067F000080000006600C000002593B__0000007D41EA8D51-0000007DC21DE569 000000067F000080000006600C0000020000-000000067F000080000006600C0000024000__0000007F12B83FE8 000000067F000080000006600C0000024000-000000067F000080000006600C0000028000__0000007F12B83FE8 000000067F000080000006600C000002593B-000000067F000080000006600C000002F0A1__0000007D41EA8D51-0000007DC21DE569 000000067F000080000006600C0000028000-000000067F000080000006600C000002C000__0000007F12B83FE8 000000067F000080000006600C000002C000-000000067F000080000006600C0000030000__0000007F12B83FE8 000000067F000080000006600C000002F0A1-000000067F000080000006600C00000387B6__0000007D41EA8D51-0000007DC21DE569 000000067F000080000006600C0000030000-000000067F000080000006600C0000034000__0000007F12B83FE8 000000067F000080000006600C0000034000-000000067F000080000006600C0000038000__0000007F12B83FE8 
000000067F000080000006600C0000038000-000000067F000080000006600C000003C000__0000007F12B83FE8 000000067F000080000006600C00000387B6-000000067F000080000006600C0000041F1C__0000007D41EA8D51-0000007DC21DE569 000000067F000080000006600C000003C000-000000067F000080000006600C0000040000__0000007F12B83FE8 000000067F000080000006600C0000040000-000000067F000080000006600C0000044000__0000007F12B83FE8 000000067F000080000006600C0000041F1C-000000067F000080000006600C000004B682__0000007D41EA8D51-0000007DC21DE569 000000067F000080000006600C0000044000-000000067F000080000006600C0000048000__0000007F12B83FE8 000000067F000080000006600C0000048000-000000067F000080000006600C000004C000__0000007F108C1FD8 000000067F000080000006600C0000048000-000000067F000080000006600C000004C000__0000007FDCA75700 000000067F000080000006600C0000049743-000000067F000080000006600C0000093532__0000007F7BE4E6F1-0000007FDCDCE659 000000067F000080000006600C000004B682-030000000000000000000000000000000002__0000007D41EA8D51-0000007DC21DE569 000000067F000080000006600C000004BAC3-000000067F000080000006600C00000551F8__0000007DC21DE569-0000007E71DBF8F9 000000067F000080000006600C000004C000-000000067F000080000006600C0000050000__0000007F108C1FD8 000000067F000080000006600C000004C000-000000067F000080000006600C0000050000__0000007FDCA75700 000000067F000080000006600C0000050000-000000067F000080000006600C0000054000__0000007F108C1FD8 000000067F000080000006600C0000050000-000000067F000080000006600C0000054000__0000007FDCA75700 000000067F000080000006600C0000054000-000000067F000080000006600C0000058000__0000007F108C1FD8 000000067F000080000006600C0000054000-000000067F000080000006600C0000058000__0000007FDCA75700 000000067F000080000006600C00000551F8-000000067F000080000006600C000005E90C__0000007DC21DE569-0000007E71DBF8F9 000000067F000080000006600C0000058000-000000067F000080000006600C000005C000__0000007F108C1FD8 000000067F000080000006600C0000058000-000000067F000080000006600C000005C000__0000007FDCA75700 
000000067F000080000006600C000005C000-000000067F000080000006600C0000060000__0000007F108C1FD8 000000067F000080000006600C000005C000-000000067F000080000006600C0000060000__0000007FDCA75700 000000067F000080000006600C000005E90C-000000067F000080000006600C000006802C__0000007DC21DE569-0000007E71DBF8F9 000000067F000080000006600C0000060000-000000067F000080000006600C0000064000__0000007F108C1FD8 000000067F000080000006600C0000060000-000000067F000080000006600C0000064000__0000007FDCA75700 000000067F000080000006600C0000064000-000000067F000080000006600C0000068000__0000007F108C1FD8 000000067F000080000006600C0000064000-000000067F000080000006600C0000068000__0000007FDCA75700 000000067F000080000006600C0000068000-000000067F000080000006600C000006C000__0000007F108C1FD8 000000067F000080000006600C0000068000-000000067F000080000006600C000006C000__0000007FDCA75700 000000067F000080000006600C000006802C-000000067F000080000006600C0000071783__0000007DC21DE569-0000007E71DBF8F9 000000067F000080000006600C000006C000-000000067F000080000006600C0000070000__0000007F108C1FD8 000000067F000080000006600C000006C000-000000067F000080000006600C0000070000__0000007FDCA75700 000000067F000080000006600C0000070000-000000067F000080000006600C0000074000__0000007F108C1FD8 000000067F000080000006600C0000070000-000000067F000080000006600C0000074000__0000007FDCA75700 000000067F000080000006600C0000071783-000000067F000080000006600C000007AEE9__0000007DC21DE569-0000007E71DBF8F9 000000067F000080000006600C0000074000-000000067F000080000006600C0000078000__0000007F108C1FD8 000000067F000080000006600C0000074000-000000067F000080000006600C0000078000__0000007FDCA75700 000000067F000080000006600C0000078000-000000067F000080000006600C000007C000__0000007F108C1FD8 000000067F000080000006600C0000078000-000000067F000080000006600C000007C000__0000007FDCA75700 000000067F000080000006600C000007AEE9-000000067F000080000006600C000008460B__0000007DC21DE569-0000007E71DBF8F9 000000067F000080000006600C000007C000-000000067F000080000006600C0000080000__0000007F108C1FD8 
000000067F000080000006600C000007C000-000000067F000080000006600C0000080000__0000007FDCA75700 000000067F000080000006600C0000080000-000000067F000080000006600C0000084000__0000007F108C1FD8 000000067F000080000006600C0000080000-000000067F000080000006600C0000084000__0000007FDCA75700 000000067F000080000006600C0000084000-000000067F000080000006600C0000088000__0000007F108C1FD8 000000067F000080000006600C0000084000-000000067F000080000006600C0000088000__0000007FDCA75700 000000067F000080000006600C000008460B-000000067F000080000006600C000008DD71__0000007DC21DE569-0000007E71DBF8F9 000000067F000080000006600C0000088000-000000067F000080000006600C000008C000__0000007F108C1FD8 000000067F000080000006600C0000088000-000000067F000080000006600C000008C000__0000007FDCA75700 000000067F000080000006600C000008C000-000000067F000080000006600C0000090000__0000007F108C1FD8 000000067F000080000006600C000008C000-000000067F000080000006600C0000090000__0000007FDCA75700 000000067F000080000006600C000008DD71-000000067F000080000006600C00000974D7__0000007DC21DE569-0000007E71DBF8F9 000000067F000080000006600C0000090000-000000067F000080000006600C0000094000__0000007F108C1FD8 000000067F000080000006600C0000090000-000000067F000080000006600C0000094000__0000007FDCA75700 000000067F000080000006600C0000093532-000000067F000080000006600C00000DD150__0000007F7BE4E6F1-0000007FDCDCE659 000000067F000080000006600C0000094000-000000067F000080000006600C0000098000__0000007F108C1FD8 000000067F000080000006600C0000094000-000000067F000080000006600C0000098000__0000007FDCA75700 000000067F000080000006600C00000974D7-000000067F000080000006600C00000A0C0B__0000007DC21DE569-0000007E71DBF8F9 000000067F000080000006600C0000098000-000000067F000080000006600C000009C000__0000007F108C1FD8 000000067F000080000006600C0000098000-000000067F000080000006600C000009C000__0000007FDCA75700 000000067F000080000006600C000009C000-000000067F000080000006600C00000A0000__0000007F108C1FD8 000000067F000080000006600C000009C000-000000067F000080000006600C00000A0000__0000007FDCA75700 
000000067F000080000006600C00000A0000-000000067F000080000006600C00000A4000__0000007F108C1FD8 000000067F000080000006600C00000A0000-000000067F000080000006600C00000A4000__0000007FDCA75700 000000067F000080000006600C00000A0C0B-000000067F000080000006600C00000AA371__0000007DC21DE569-0000007E71DBF8F9 000000067F000080000006600C00000A4000-000000067F000080000006600C00000A8000__0000007F108C1FD8 000000067F000080000006600C00000A4000-000000067F000080000006600C00000A8000__0000007FDCA75700 000000067F000080000006600C00000A8000-000000067F000080000006600C00000AC000__0000007F108C1FD8 000000067F000080000006600C00000A8000-000000067F000080000006600C00000AC000__0000007FDCA75700 000000067F000080000006600C00000AA371-000000067F000080000006600C00000B3AD7__0000007DC21DE569-0000007E71DBF8F9 000000067F000080000006600C00000AC000-000000067F000080000006600C00000B0000__0000007F108C1FD8 000000067F000080000006600C00000AC000-000000067F000080000006600C00000B0000__0000007FDCA75700 000000067F000080000006600C00000B0000-000000067F000080000006600C00000B4000__0000007F108C1FD8 000000067F000080000006600C00000B0000-000000067F000080000006600C00000B4000__0000007FDCA75700 000000067F000080000006600C00000B3AD7-000000067F000080000006600C0100000000__0000007DC21DE569-0000007E71DBF8F9 000000067F000080000006600C00000B4000-000000067F000080000006600C00000B8000__0000007F108C1FD8 000000067F000080000006600C00000B4000-000000067F000080000006600C00000B8000__0000007FDCA75700 000000067F000080000006600C00000B8000-000000067F000080000006600C00000BC000__0000007F108C1FD8 000000067F000080000006600C00000B8000-000000067F000080000006600C00000BC000__0000007FDCA75700 000000067F000080000006600C00000BC000-000000067F000080000006600C00000C0000__0000007F108C1FD8 000000067F000080000006600C00000BC000-000000067F000080000006600C00000C0000__0000007FDCA75700 000000067F000080000006600C00000BC29F-000000067F000080000006600C00000C59CF__0000007E71DBF8F9-0000007F11E4BFE9 000000067F000080000006600C00000C0000-000000067F000080000006600C00000C4000__0000007F108C1FD8 
000000067F000080000006600C00000C0000-000000067F000080000006600C00000C4000__0000007FDCA75700 000000067F000080000006600C00000C4000-000000067F000080000006600C00000C8000__0000007F108C1FD8 000000067F000080000006600C00000C4000-000000067F000080000006600C00000C8000__0000007FDCA75700 000000067F000080000006600C00000C59CF-000000067F000080000006600C00000CF10B__0000007E71DBF8F9-0000007F11E4BFE9 000000067F000080000006600C00000C8000-000000067F000080000006600C00000CC000__0000007F108C1FD8 000000067F000080000006600C00000C8000-000000067F000080000006600C00000CC000__0000007FDCA75700 000000067F000080000006600C00000CC000-000000067F000080000006600C00000D0000__0000007F108C1FD8 000000067F000080000006600C00000CC000-000000067F000080000006600C00000D0000__0000007FDCA75700 000000067F000080000006600C00000CF10B-000000067F000080000006600C00000D882C__0000007E71DBF8F9-0000007F11E4BFE9 000000067F000080000006600C00000D0000-000000067F000080000006600C00000D4000__0000007F108C1FD8 000000067F000080000006600C00000D0000-000000067F000080000006600C00000D4000__0000007FDCA75700 000000067F000080000006600C00000D4000-000000067F000080000006600C00000D8000__0000007F108C1FD8 000000067F000080000006600C00000D4000-000000067F000080000006600C00000D8000__0000007FDCA75700 000000067F000080000006600C00000D8000-000000067F000080000006600C00000DC000__0000007F108C1FD8 000000067F000080000006600C00000D8000-000000067F000080000006600C00000DC000__0000007FDCA75700 000000067F000080000006600C00000D882C-000000067F000080000006600C00000E1F7F__0000007E71DBF8F9-0000007F11E4BFE9 000000067F000080000006600C00000DC000-000000067F000080000006600C00000E0000__0000007F108C1FD8 000000067F000080000006600C00000DC000-000000067F000080000006600C00000E0000__0000007FDCA75700 000000067F000080000006600C00000DD152-000000067F00008000000660140000003DA8__0000007F7BE4E6F1-0000007FDCDCE659 000000067F000080000006600C00000E0000-000000067F000080000006600C00000E4000__0000007F108C1FD8 000000067F000080000006600C00000E0000-000000067F000080000006600C00000E4000__0000007FDCA75700 
000000067F000080000006600C00000E1F7F-000000067F000080000006600C00000EB6E5__0000007E71DBF8F9-0000007F11E4BFE9 000000067F000080000006600C00000E4000-000000067F000080000006600C00000E8000__0000007F108C1FD8 000000067F000080000006600C00000E4000-000000067F000080000006600C00000E8000__0000007FDCA75700 000000067F000080000006600C00000E8000-000000067F000080000006600C00000EC000__0000007F108C1FD8 000000067F000080000006600C00000E8000-000000067F000080000006600C00000EC000__0000007FDCA75700 000000067F000080000006600C00000EB6E5-000000067F000080000006600C00000F4E0C__0000007E71DBF8F9-0000007F11E4BFE9 000000067F000080000006600C00000EC000-000000067F000080000006600C00000F0000__0000007F108C1FD8 000000067F000080000006600C00000EC000-000000067F000080000006600C00000F0000__0000007FDCA75700 000000067F000080000006600C00000F0000-000000067F000080000006600C00000F4000__0000007F108C1FD8 000000067F000080000006600C00000F0000-000000067F000080000006600C00000F4000__0000007FDCA75700 000000067F000080000006600C00000F4000-000000067F000080000006600C00000F8000__0000007F108C1FD8 000000067F000080000006600C00000F4000-000000067F000080000006600C00000F8000__0000007FDCA75700 000000067F000080000006600C00000F4E0C-000000067F000080000006600C00000FE572__0000007E71DBF8F9-0000007F11E4BFE9 000000067F000080000006600C00000F8000-000000067F000080000006600C00000FC000__0000007F108C1FD8 000000067F000080000006600C00000F8000-000000067F000080000006600C00000FC000__0000007FDCA75700 000000067F000080000006600C00000FC000-000000067F000080000006600C0000100000__0000007F108C1FD8 000000067F000080000006600C00000FC000-000000067F000080000006600C0000100000__0000007FDCA75700 000000067F000080000006600C00000FE572-000000067F000080000006600C0000107CD8__0000007E71DBF8F9-0000007F11E4BFE9 000000067F000080000006600C0000100000-000000067F000080000006600C0000104000__0000007F108C1FD8 000000067F000080000006600C0000100000-000000067F000080000006600C0000104000__0000007FDCA75700 000000067F000080000006600C0000104000-000000067F000080000006600C0000108000__0000007F108C1FD8 
000000067F000080000006600C0000104000-000000067F000080000006600C0000108000__0000007FDCA75700 000000067F000080000006600C0000107CD8-000000067F000080000006600C000011140B__0000007E71DBF8F9-0000007F11E4BFE9 000000067F000080000006600C0000108000-000000067F000080000006600C000010C000__0000007F108C1FD8 000000067F000080000006600C0000108000-000000067F000080000006600C000010C000__0000007FDCA75700 000000067F000080000006600C000010C000-000000067F000080000006600C0000110000__0000007F108C1FD8 000000067F000080000006600C000010C000-000000067F000080000006600C0000110000__0000007FDCA75700 000000067F000080000006600C0000110000-000000067F00008000000660120100000000__0000007FDCA75700 000000067F000080000006600C0000110000-030000000000000000000000000000000002__0000007F108C1FD8 000000067F000080000006600C000011140B-010000000000000001000000030000000010__0000007E71DBF8F9-0000007F11E4BFE9 000000067F000080000006600C0000111C82-000000067F0000800000066014000000535B__0000007F11E4BFE9-0000007F7BE4E6F1 000000067F00008000000660140000000000-000000067F00008000000660140000004000__0000007FDCA75700 000000067F00008000000660140000003DAA-000000067F00008000000660140000017C4D__0000007F7BE4E6F1-0000007FDCDCE659 000000067F00008000000660140000004000-000000067F00008000000660140000008000__0000007FDCA75700 000000067F0000800000066014000000535B-000000067F0000800000066014000000C839__0000007F11E4BFE9-0000007F7BE4E6F1 000000067F00008000000660140000008000-000000067F0000800000066014000000C000__0000007FDCA75700 000000067F0000800000066014000000C000-000000067F00008000000660140000010000__0000007FDCA75700 000000067F0000800000066014000000C839-000000067F00008000000660140000013D42__0000007F11E4BFE9-0000007F7BE4E6F1 000000067F00008000000660140000010000-000000067F00008000000660140000014000__0000007FDCA75700 000000067F00008000000660140000013D42-000000067F0000800000066014000001B222__0000007F11E4BFE9-0000007F7BE4E6F1 000000067F00008000000660140000014000-000000067F00008000000660140000018000__0000007FDCA75700 
000000067F00008000000660140000017C51-000000067F0000800000066014000002B9D0__0000007F7BE4E6F1-0000007FDCDCE659 000000067F00008000000660140000018000-000000067F0000800000066014000001C000__0000007FDCA75700 000000067F0000800000066014000001B222-000000067F00008000000660140000022704__0000007F11E4BFE9-0000007F7BE4E6F1 000000067F0000800000066014000001C000-000000067F00008000000660140000020000__0000007FDCA75700 000000067F00008000000660140000020000-000000067F00008000000660140000024000__0000007FDCA75700 000000067F00008000000660140000022704-000000067F00008000000660140000029C2D__0000007F11E4BFE9-0000007F7BE4E6F1 000000067F00008000000660140000024000-000000067F00008000000660140000028000__0000007FDCA75700 000000067F00008000000660140000028000-000000067F0000800000066014000002C000__0000007FDCA75700 000000067F00008000000660140000029C2D-030000000000000000000000000000000002__0000007F11E4BFE9-0000007F7BE4E6F1 000000067F0000800000066014000002B9D1-030000000000000000000000000000000002__0000007F7BE4E6F1-0000007FDCDCE659 000000067F0000800000066014000002C000-030000000000000000000000000000000002__0000007FDCA75700 000000067F000080000006800C0000000000-000000067F000080000006800C0000004000__00000081AFEDBFE0 000000067F000080000006800C0000004000-000000067F000080000006800C0000008000__00000081AFEDBFE0 000000067F000080000006800C0000007D6A-000000067F000080000006800C00000114D0__0000007FDCDCE659-000000804F6BFFC1 000000067F000080000006800C0000008000-000000067F000080000006800C000000C000__00000081AFEDBFE0 000000067F000080000006800C000000C000-000000067F000080000006800C0000010000__00000081AFEDBFE0 000000067F000080000006800C0000010000-000000067F000080000006800C0000014000__00000081AFEDBFE0 000000067F000080000006800C00000114D0-000000067F000080000006800C000001AC0B__0000007FDCDCE659-000000804F6BFFC1 000000067F000080000006800C0000014000-000000067F000080000006800C0000018000__00000081AFEDBFE0 000000067F000080000006800C0000018000-000000067F000080000006800C000001C000__00000081AFEDBFE0 
000000067F000080000006800C000001AC0B-000000067F000080000006800C0000024348__0000007FDCDCE659-000000804F6BFFC1 000000067F000080000006800C000001C000-000000067F000080000006800C0000020000__00000081AFEDBFE0 000000067F000080000006800C0000020000-000000067F000080000006800C0000024000__00000081AFEDBFE0 000000067F000080000006800C0000024000-000000067F000080000006800C0000028000__00000081AFEDBFE0 000000067F000080000006800C0000024348-000000067F000080000006800C000002DAAE__0000007FDCDCE659-000000804F6BFFC1 000000067F000080000006800C0000028000-000000067F000080000006800C000002C000__00000081AFEDBFE0 000000067F000080000006800C000002C000-000000067F000080000006800C0000030000__00000081AFEDBFE0 000000067F000080000006800C000002DAAE-000000067F000080000006800C00000371D0__0000007FDCDCE659-000000804F6BFFC1 000000067F000080000006800C0000030000-000000067F000080000006800C0000034000__00000081AFEDBFE0 000000067F000080000006800C0000034000-000000067F000080000006800C0000038000__00000081AFEDBFE0 000000067F000080000006800C00000371D0-000000067F000080000006800C000004090B__0000007FDCDCE659-000000804F6BFFC1 000000067F000080000006800C0000038000-000000067F000080000006800C000003C000__00000081AFEDBFE0 000000067F000080000006800C000003C000-000000067F000080000006800C0000040000__00000081AFEDBFE0 000000067F000080000006800C0000040000-000000067F000080000006800C0000044000__00000081A164D628 000000067F000080000006800C000004090B-030000000000000000000000000000000002__0000007FDCDCE659-000000804F6BFFC1 000000067F000080000006800C0000042368-000000067F000080000006800C000004BACE__000000804F6BFFC1-00000080EF2FF5B9 000000067F000080000006800C0000044000-000000067F000080000006800C0000048000__00000081A164D628 000000067F000080000006800C0000048000-000000067F000080000006800C000004C000__00000081A164D628 000000067F000080000006800C000004BACE-000000067F000080000006800C0000055202__000000804F6BFFC1-00000080EF2FF5B9 000000067F000080000006800C000004C000-000000067F000080000006800C0000050000__00000081A164D628 
000000067F000080000006800C0000050000-000000067F000080000006800C0000054000__00000081A164D628 000000067F000080000006800C0000054000-000000067F000080000006800C0000058000__00000081A164D628 000000067F000080000006800C0000055202-000000067F000080000006800C000005E90D__000000804F6BFFC1-00000080EF2FF5B9 000000067F000080000006800C0000058000-000000067F000080000006800C000005C000__00000081A164D628 000000067F000080000006800C000005C000-000000067F000080000006800C0000060000__00000081A164D628 000000067F000080000006800C000005E90D-000000067F000080000006800C000006802B__000000804F6BFFC1-00000080EF2FF5B9 000000067F000080000006800C0000060000-000000067F000080000006800C0000064000__00000081A164D628 000000067F000080000006800C0000064000-000000067F000080000006800C0000068000__00000081A164D628 000000067F000080000006800C0000068000-000000067F000080000006800C000006C000__00000081A164D628 000000067F000080000006800C000006802B-000000067F000080000006800C0000071782__000000804F6BFFC1-00000080EF2FF5B9 000000067F000080000006800C000006C000-000000067F000080000006800C0000070000__00000081A164D628 000000067F000080000006800C0000070000-000000067F000080000006800C0000074000__00000081A164D628 000000067F000080000006800C0000071782-000000067F000080000006800C000007AEE8__000000804F6BFFC1-00000080EF2FF5B9 000000067F000080000006800C0000074000-000000067F000080000006800C0000078000__00000081A164D628 000000067F000080000006800C0000078000-000000067F000080000006800C000007C000__00000081A164D628 000000067F000080000006800C000007AEE8-000000067F000080000006800C000008460B__000000804F6BFFC1-00000080EF2FF5B9 000000067F000080000006800C000007C000-000000067F000080000006800C0000080000__00000081A164D628 000000067F000080000006800C0000080000-000000067F000080000006800C0000084000__00000081A164D628 000000067F000080000006800C0000084000-000000067F000080000006800C0000088000__00000081A164D628 000000067F000080000006800C000008460B-000000067F000080000006800C000008DD71__000000804F6BFFC1-00000080EF2FF5B9 
000000067F000080000006800C0000088000-000000067F000080000006800C000008C000__00000081A164D628 000000067F000080000006800C000008C000-000000067F000080000006800C0000090000__00000081A164D628 000000067F000080000006800C000008DD71-000000067F000080000006800C00000974D7__000000804F6BFFC1-00000080EF2FF5B9 000000067F000080000006800C0000090000-000000067F000080000006800C0000094000__00000081A164D628 000000067F000080000006800C0000094000-000000067F000080000006800C0000098000__00000081A164D628 000000067F000080000006800C00000974D7-000000067F000080000006800C00000A0C0B__000000804F6BFFC1-00000080EF2FF5B9 000000067F000080000006800C0000098000-000000067F000080000006800C000009C000__00000081A164D628 000000067F000080000006800C000009C000-000000067F000080000006800C00000A0000__00000081A164D628 000000067F000080000006800C00000A0000-000000067F000080000006800C00000A4000__00000081A164D628 000000067F000080000006800C00000A0C0B-000000067F000080000006800C0100000000__000000804F6BFFC1-00000080EF2FF5B9 000000067F000080000006800C00000A4000-000000067F000080000006800C00000A8000__00000081A164D628 000000067F000080000006800C00000A8000-000000067F000080000006800C00000AC000__00000081A164D628 000000067F000080000006800C00000A8D4C-000000067F000080000006800C00000B24B2__00000080EF2FF5B9-00000081AFAF5FD1 000000067F000080000006800C00000AC000-000000067F000080000006800C00000B0000__00000081A164D628 000000067F000080000006800C00000B0000-000000067F000080000006800C00000B4000__00000081A164D628 000000067F000080000006800C00000B24B2-000000067F000080000006800C00000BBC0B__00000080EF2FF5B9-00000081AFAF5FD1 000000067F000080000006800C00000B4000-000000067F000080000006800C00000B8000__00000081A164D628 000000067F000080000006800C00000B8000-000000067F000080000006800C00000BC000__00000081A164D628 000000067F000080000006800C00000BBC0B-000000067F000080000006800C00000C533F__00000080EF2FF5B9-00000081AFAF5FD1 000000067F000080000006800C00000BC000-000000067F000080000006800C00000C0000__00000081A164D628 
000000067F000080000006800C00000C0000-000000067F000080000006800C00000C4000__00000081A164D628 000000067F000080000006800C00000C4000-000000067F000080000006800C00000C8000__00000081A164D628 000000067F000080000006800C00000C533F-000000067F000080000006800C00000CEAA5__00000080EF2FF5B9-00000081AFAF5FD1 000000067F000080000006800C00000C8000-000000067F000080000006800C00000CC000__00000081A164D628 000000067F000080000006800C00000CC000-000000067F000080000006800C00000D0000__00000081A164D628 000000067F000080000006800C00000CEAA5-000000067F000080000006800C00000D81BE__00000080EF2FF5B9-00000081AFAF5FD1 000000067F000080000006800C00000D0000-000000067F000080000006800C00000D4000__00000081A164D628 000000067F000080000006800C00000D4000-000000067F000080000006800C00000D8000__00000081A164D628 000000067F000080000006800C00000D8000-000000067F000080000006800C00000DC000__00000081A164D628 000000067F000080000006800C00000D81BE-000000067F000080000006800C00000E190B__00000080EF2FF5B9-00000081AFAF5FD1 000000067F000080000006800C00000DC000-000000067F000080000006800C00000E0000__00000081A164D628 000000067F000080000006800C00000E0000-000000067F000080000006800C00000E4000__00000081A164D628 000000067F000080000006800C00000E190B-000000067F000080000006800C00000EB071__00000080EF2FF5B9-00000081AFAF5FD1 000000067F000080000006800C00000E4000-000000067F000080000006800C00000E8000__00000081A164D628 000000067F000080000006800C00000E8000-000000067F000080000006800C00000EC000__00000081A164D628 000000067F000080000006800C00000EB071-000000067F000080000006800C00000F47AC__00000080EF2FF5B9-00000081AFAF5FD1 000000067F000080000006800C00000EC000-000000067F000080000006800C00000F0000__00000081A164D628 000000067F000080000006800C00000F0000-000000067F000080000006800C00000F4000__00000081A164D628 000000067F000080000006800C00000F4000-000000067F000080000006800C00000F8000__00000081A164D628 000000067F000080000006800C00000F47AC-000000067F000080000006800C00000FDF0A__00000080EF2FF5B9-00000081AFAF5FD1 
000000067F000080000006800C00000F8000-000000067F000080000006800C00000FC000__00000081A164D628 000000067F000080000006800C00000FC000-000000067F000080000006800C0000100000__00000081A164D628 000000067F000080000006800C00000FDF0A-000000067F000080000006800C000010762B__00000080EF2FF5B9-00000081AFAF5FD1 000000067F000080000006800C0000100000-000000067F000080000006800C0000104000__00000081A164D628 000000067F000080000006800C0000104000-000000067F000080000006800C0000108000__00000081A164D628 000000067F000080000006800C000010762B-000000067F000080000006800C0000110D88__00000080EF2FF5B9-00000081AFAF5FD1 000000067F000080000006800C0000108000-030000000000000000000000000000000002__00000081A164D628 000000067F000080000006800C0000110D88-010000000000000001000000030000000014__00000080EF2FF5B9-00000081AFAF5FD1 000000067F000080000006801400000044E4-000000067F0000800000068014000000C3F5__00000081AFAF5FD1-0000008215AFE5A9 000000067F0000800000068014000000C3F5-000000067F00008000000680140000014303__00000081AFAF5FD1-0000008215AFE5A9 000000067F00008000000680140000014303-000000067F0000800000068014000001C214__00000081AFAF5FD1-0000008215AFE5A9 000000067F0000800000068014000001C214-000000067F00008000000680140000024125__00000081AFAF5FD1-0000008215AFE5A9 000000067F00008000000680140000024125-000000067F0000800000068014000002C035__00000081AFAF5FD1-0000008215AFE5A9 000000067F0000800000068014000002C035-000000067F000080000006A00C00000072CA__00000081AFAF5FD1-0000008215AFE5A9 000000067F000080000006A00C0000000000-000000067F000080000006A00C0000004000__00000083D5DE3FD0 000000067F000080000006A00C0000004000-000000067F000080000006A00C0000008000__00000083D5DE3FD0 000000067F000080000006A00C00000072CA-030000000000000000000000000000000002__00000081AFAF5FD1-0000008215AFE5A9 000000067F000080000006A00C0000008000-000000067F000080000006A00C000000C000__00000083865C64B8 000000067F000080000006A00C0000008000-000000067F000080000006A00C000000C000__00000084A1F03030 
000000067F000080000006A00C00000096E3-000000067F000080000006A00C0000012E0B__0000008215AFE5A9-00000082B573F579 000000067F000080000006A00C000000C000-000000067F000080000006A00C0000010000__00000083865C64B8 000000067F000080000006A00C000000C000-000000067F000080000006A00C0000010000__00000084A1F03030 000000067F000080000006A00C0000010000-000000067F000080000006A00C0000014000__00000083865C64B8 000000067F000080000006A00C0000010000-000000067F000080000006A00C0000014000__00000084A1F03030 000000067F000080000006A00C0000012E0B-000000067F000080000006A00C000001C571__0000008215AFE5A9-00000082B573F579 000000067F000080000006A00C0000014000-000000067F000080000006A00C0000018000__00000083865C64B8 000000067F000080000006A00C0000014000-000000067F000080000006A00C0000018000__00000084A1F03030 000000067F000080000006A00C0000018000-000000067F000080000006A00C000001C000__00000083865C64B8 000000067F000080000006A00C0000018000-000000067F000080000006A00C000001C000__00000084A1F03030 000000067F000080000006A00C000001C000-000000067F000080000006A00C0000020000__00000083865C64B8 000000067F000080000006A00C000001C000-000000067F000080000006A00C0000020000__00000084A1F03030 000000067F000080000006A00C000001C571-000000067F000080000006A00C0000025CD7__0000008215AFE5A9-00000082B573F579 000000067F000080000006A00C0000020000-000000067F000080000006A00C0000024000__00000083865C64B8 000000067F000080000006A00C0000020000-000000067F000080000006A00C0000024000__00000084A1F03030 000000067F000080000006A00C0000024000-000000067F000080000006A00C0000028000__00000083865C64B8 000000067F000080000006A00C0000024000-000000067F000080000006A00C0000028000__00000084A1F03030 000000067F000080000006A00C0000025CD7-000000067F000080000006A00C000002F40B__0000008215AFE5A9-00000082B573F579 000000067F000080000006A00C0000028000-000000067F000080000006A00C000002C000__00000083865C64B8 000000067F000080000006A00C0000028000-000000067F000080000006A00C000002C000__00000084A1F03030 000000067F000080000006A00C000002C000-000000067F000080000006A00C0000030000__00000083865C64B8 
000000067F000080000006A00C000002C000-000000067F000080000006A00C0000030000__00000084A1F03030 000000067F000080000006A00C000002F40B-000000067F000080000006A00C0000038B1E__0000008215AFE5A9-00000082B573F579 000000067F000080000006A00C0000030000-000000067F000080000006A00C0000034000__00000083865C64B8 000000067F000080000006A00C0000030000-000000067F000080000006A00C0000034000__00000084A1F03030 000000067F000080000006A00C0000034000-000000067F000080000006A00C0000038000__00000083865C64B8 000000067F000080000006A00C0000034000-000000067F000080000006A00C0000038000__00000084A1F03030 000000067F000080000006A00C0000038000-000000067F000080000006A00C000003C000__00000083865C64B8 000000067F000080000006A00C0000038000-000000067F000080000006A00C000003C000__00000084A1F03030 000000067F000080000006A00C0000038B1E-000000067F000080000006A00C0000042284__0000008215AFE5A9-00000082B573F579 000000067F000080000006A00C000003C000-000000067F000080000006A00C0000040000__00000083865C64B8 000000067F000080000006A00C000003C000-000000067F000080000006A00C0000040000__00000084A1F03030 000000067F000080000006A00C0000040000-000000067F000080000006A00C0000044000__00000083865C64B8 000000067F000080000006A00C0000040000-000000067F000080000006A00C0000044000__00000084A1F03030 000000067F000080000006A00C0000042284-000000067F000080000006A00C000004B9EA__0000008215AFE5A9-00000082B573F579 000000067F000080000006A00C0000044000-000000067F000080000006A00C0000048000__00000083865C64B8 000000067F000080000006A00C0000044000-000000067F000080000006A00C0000048000__00000084A1F03030 000000067F000080000006A00C0000048000-000000067F000080000006A00C000004C000__00000083865C64B8 000000067F000080000006A00C0000048000-000000067F000080000006A00C000004C000__00000084A1F03030 000000067F000080000006A00C000004B9EA-000000067F000080000006A00C000005510B__0000008215AFE5A9-00000082B573F579 000000067F000080000006A00C000004C000-000000067F000080000006A00C0000050000__00000083865C64B8 000000067F000080000006A00C000004C000-000000067F000080000006A00C0000050000__00000084A1F03030 
000000067F000080000006A00C0000050000-000000067F000080000006A00C0000054000__00000083865C64B8 000000067F000080000006A00C0000050000-000000067F000080000006A00C0000054000__00000084A1F03030 000000067F000080000006A00C000005198B-000000067F000080000006A00C00000A31A6__000000844F1A6789-00000084A325AA01 000000067F000080000006A00C0000054000-000000067F000080000006A00C0000058000__00000083865C64B8 000000067F000080000006A00C0000054000-000000067F000080000006A00C0000058000__00000084A1F03030 000000067F000080000006A00C000005510B-000000067F000080000006A00C000005E871__0000008215AFE5A9-00000082B573F579 000000067F000080000006A00C0000058000-000000067F000080000006A00C000005C000__00000083865C64B8 000000067F000080000006A00C0000058000-000000067F000080000006A00C000005C000__00000084A1F03030 000000067F000080000006A00C000005C000-000000067F000080000006A00C0000060000__00000083865C64B8 000000067F000080000006A00C000005C000-000000067F000080000006A00C0000060000__00000084A1F03030 000000067F000080000006A00C000005E871-000000067F000080000006A00C0000067F8B__0000008215AFE5A9-00000082B573F579 000000067F000080000006A00C0000060000-000000067F000080000006A00C0000064000__00000083865C64B8 000000067F000080000006A00C0000060000-000000067F000080000006A00C0000064000__00000084A1F03030 000000067F000080000006A00C0000064000-000000067F000080000006A00C0000068000__00000083865C64B8 000000067F000080000006A00C0000064000-000000067F000080000006A00C0000068000__00000084A1F03030 000000067F000080000006A00C0000067F8B-000000067F000080000006A00C0100000000__0000008215AFE5A9-00000082B573F579 000000067F000080000006A00C0000068000-000000067F000080000006A00C000006C000__00000083865C64B8 000000067F000080000006A00C0000068000-000000067F000080000006A00C000006C000__00000084A1F03030 000000067F000080000006A00C000006C000-000000067F000080000006A00C0000070000__00000083865C64B8 000000067F000080000006A00C000006C000-000000067F000080000006A00C0000070000__00000084A1F03030 000000067F000080000006A00C0000070000-000000067F000080000006A00C0000074000__00000083865C64B8 
000000067F000080000006A00C0000070000-000000067F000080000006A00C0000074000__00000084A1F03030 000000067F000080000006A00C00000703EC-000000067F000080000006A00C0000079B0C__00000082B573F579-00000083D5901FD9 000000067F000080000006A00C0000074000-000000067F000080000006A00C0000078000__00000083865C64B8 000000067F000080000006A00C0000074000-000000067F000080000006A00C0000078000__00000084A1F03030 000000067F000080000006A00C0000078000-000000067F000080000006A00C000007C000__00000083865C64B8 000000067F000080000006A00C0000078000-000000067F000080000006A00C000007C000__00000084A1F03030 000000067F000080000006A00C0000079B0C-000000067F000080000006A00C0000083272__00000082B573F579-00000083D5901FD9 000000067F000080000006A00C000007C000-000000067F000080000006A00C0000080000__00000083865C64B8 000000067F000080000006A00C000007C000-000000067F000080000006A00C0000080000__00000084A1F03030 000000067F000080000006A00C0000080000-000000067F000080000006A00C0000084000__00000083865C64B8 000000067F000080000006A00C0000080000-000000067F000080000006A00C0000084000__00000084A1F03030 000000067F000080000006A00C0000083272-000000067F000080000006A00C000008C9D8__00000082B573F579-00000083D5901FD9 000000067F000080000006A00C0000084000-000000067F000080000006A00C0000088000__00000083865C64B8 000000067F000080000006A00C0000084000-000000067F000080000006A00C0000088000__00000084A1F03030 000000067F000080000006A00C0000088000-000000067F000080000006A00C000008C000__00000083865C64B8 000000067F000080000006A00C0000088000-000000067F000080000006A00C000008C000__00000084A1F03030 000000067F000080000006A00C000008C000-000000067F000080000006A00C0000090000__00000083865C64B8 000000067F000080000006A00C000008C000-000000067F000080000006A00C0000090000__00000084A1F03030 000000067F000080000006A00C000008C9D8-000000067F000080000006A00C0000096129__00000082B573F579-00000083D5901FD9 000000067F000080000006A00C0000090000-000000067F000080000006A00C0000094000__00000083865C64B8 000000067F000080000006A00C0000090000-000000067F000080000006A00C0000094000__00000084A1F03030 
000000067F000080000006A00C0000094000-000000067F000080000006A00C0000098000__00000083865C64B8 000000067F000080000006A00C0000094000-000000067F000080000006A00C0000098000__00000084A1F03030 000000067F000080000006A00C0000096129-000000067F000080000006A00C000009F88F__00000082B573F579-00000083D5901FD9 000000067F000080000006A00C0000098000-000000067F000080000006A00C000009C000__00000083865C64B8 000000067F000080000006A00C0000098000-000000067F000080000006A00C000009C000__00000084A1F03030 000000067F000080000006A00C000009C000-000000067F000080000006A00C00000A0000__00000083865C64B8 000000067F000080000006A00C000009C000-000000067F000080000006A00C00000A0000__00000084A1F03030 000000067F000080000006A00C000009F88F-000000067F000080000006A00C00000A8F9F__00000082B573F579-00000083D5901FD9 000000067F000080000006A00C00000A0000-000000067F000080000006A00C00000A4000__00000083865C64B8 000000067F000080000006A00C00000A0000-000000067F000080000006A00C00000A4000__00000084A1F03030 000000067F000080000006A00C00000A31B0-000000067F000080000006A00C00000F4C19__000000844F1A6789-00000084A325AA01 000000067F000080000006A00C00000A4000-000000067F000080000006A00C00000A8000__00000083865C64B8 000000067F000080000006A00C00000A4000-000000067F000080000006A00C00000A8000__00000084A1F03030 000000067F000080000006A00C00000A8000-000000067F000080000006A00C00000AC000__00000083865C64B8 000000067F000080000006A00C00000A8000-000000067F000080000006A00C00000AC000__00000084A1F03030 000000067F000080000006A00C00000A8F9F-000000067F000080000006A00C00000B2705__00000082B573F579-00000083D5901FD9 000000067F000080000006A00C00000AC000-000000067F000080000006A00C00000B0000__00000083865C64B8 000000067F000080000006A00C00000AC000-000000067F000080000006A00C00000B0000__00000084A1F03030 000000067F000080000006A00C00000B0000-000000067F000080000006A00C00000B4000__00000083865C64B8 000000067F000080000006A00C00000B0000-000000067F000080000006A00C00000B4000__00000084A1F03030 
000000067F000080000006A00C00000B2705-000000067F000080000006A00C00000BBE10__00000082B573F579-00000083D5901FD9 000000067F000080000006A00C00000B4000-000000067F000080000006A00C00000B8000__00000083865C64B8 000000067F000080000006A00C00000B4000-000000067F000080000006A00C00000B8000__00000084A1F03030 000000067F000080000006A00C00000B8000-000000067F000080000006A00C00000BC000__00000083865C64B8 000000067F000080000006A00C00000B8000-000000067F000080000006A00C00000BC000__00000084A1F03030 000000067F000080000006A00C00000BBE10-000000067F000080000006A00C00000C5543__00000082B573F579-00000083D5901FD9 000000067F000080000006A00C00000BC000-000000067F000080000006A00C00000C0000__00000083865C64B8 000000067F000080000006A00C00000BC000-000000067F000080000006A00C00000C0000__00000084A1F03030 000000067F000080000006A00C00000C0000-000000067F000080000006A00C00000C4000__00000083865C64B8 000000067F000080000006A00C00000C0000-000000067F000080000006A00C00000C4000__00000084A1F03030 000000067F000080000006A00C00000C4000-000000067F000080000006A00C00000C8000__00000083865C64B8 000000067F000080000006A00C00000C4000-000000067F000080000006A00C00000C8000__00000084A1F03030 000000067F000080000006A00C00000C4CC8-000000067F000080000006A0140000001CBC__00000083D5901FD9-000000844F1A6789 000000067F000080000006A00C00000C5543-000000067F000080000006A00C00000CECA9__00000082B573F579-00000083D5901FD9 000000067F000080000006A00C00000C8000-000000067F000080000006A00C00000CC000__00000083865C64B8 000000067F000080000006A00C00000C8000-000000067F000080000006A00C00000CC000__00000084A1F03030 000000067F000080000006A00C00000CC000-000000067F000080000006A00C00000D0000__00000083865C64B8 000000067F000080000006A00C00000CC000-000000067F000080000006A00C00000D0000__00000084A1F03030 000000067F000080000006A00C00000CECA9-000000067F000080000006A00C00000D83C0__00000082B573F579-00000083D5901FD9 000000067F000080000006A00C00000D0000-000000067F000080000006A00C00000D4000__00000083865C64B8 
000000067F000080000006A00C00000D0000-000000067F000080000006A00C00000D4000__00000084A1F03030 000000067F000080000006A00C00000D4000-000000067F000080000006A00C00000D8000__00000083865C64B8 000000067F000080000006A00C00000D4000-000000067F000080000006A00C00000D8000__00000084A1F03030 000000067F000080000006A00C00000D8000-000000067F000080000006A00C00000DC000__00000083865C64B8 000000067F000080000006A00C00000D8000-000000067F000080000006A00C00000DC000__00000084A1F03030 000000067F000080000006A00C00000D83C0-000000067F000080000006A00C00000E1B0A__00000082B573F579-00000083D5901FD9 000000067F000080000006A00C00000DC000-000000067F000080000006A00C00000E0000__00000083865C64B8 000000067F000080000006A00C00000DC000-000000067F000080000006A00C00000E0000__00000084A1F03030 000000067F000080000006A00C00000E0000-000000067F000080000006A00C00000E4000__00000084A1F03030 000000067F000080000006A00C00000E0000-030000000000000000000000000000000002__00000083865C64B8 000000067F000080000006A00C00000E1B0A-000000067F000080000006A00C00000EB270__00000082B573F579-00000083D5901FD9 000000067F000080000006A00C00000E4000-000000067F000080000006A00C00000E8000__00000084A1F03030 000000067F000080000006A00C00000E8000-000000067F000080000006A00C00000EC000__00000084A1F03030 000000067F000080000006A00C00000EB270-000000067F000080000006A00C00000F49AA__00000082B573F579-00000083D5901FD9 000000067F000080000006A00C00000EC000-000000067F000080000006A00C00000F0000__00000084A1F03030 000000067F000080000006A00C00000F0000-000000067F000080000006A00C00000F4000__00000084A1F03030 000000067F000080000006A00C00000F4000-000000067F000080000006A00C00000F8000__00000084A1F03030 000000067F000080000006A00C00000F49AA-000000067F000080000006A00C00000FE10A__00000082B573F579-00000083D5901FD9 000000067F000080000006A00C00000F4C23-000000067F000080000006A014000000E1C2__000000844F1A6789-00000084A325AA01 000000067F000080000006A00C00000F8000-000000067F000080000006A00C00000FC000__00000084A1F03030 
000000067F000080000006A00C00000FC000-000000067F000080000006A00C0000100000__00000084A1F03030 000000067F000080000006A00C00000FE10A-000000067F000080000006A00C000010782C__00000082B573F579-00000083D5901FD9 000000067F000080000006A00C0000100000-000000067F000080000006A00C0000104000__00000084A1F03030 000000067F000080000006A00C0000104000-000000067F000080000006A00C0000108000__00000084A1F03030 000000067F000080000006A00C000010782C-000000067F000080000006A00C0000110F88__00000082B573F579-00000083D5901FD9 000000067F000080000006A00C0000108000-000000067F000080000006A00C000010C000__00000084A1F03030 000000067F000080000006A00C000010C000-000000067F000080000006A00C0000110000__00000084A1F03030 000000067F000080000006A00C0000110000-000000067F000080000006A0120100000000__00000084A1F03030 000000067F000080000006A00C0000110F88-010000000000000001000000030000000014__00000082B573F579-00000083D5901FD9 000000067F000080000006A0140000000000-000000067F000080000006A0140000004000__00000084A1F03030 000000067F000080000006A0140000001CBC-000000067F000080000006A01400000088E1__00000083D5901FD9-000000844F1A6789 000000067F000080000006A0140000004000-000000067F000080000006A0140000008000__00000084A1F03030 000000067F000080000006A0140000008000-000000067F000080000006A014000000C000__00000084A1F03030 000000067F000080000006A01400000088E1-000000067F000080000006A014000000F459__00000083D5901FD9-000000844F1A6789 000000067F000080000006A014000000C000-000000067F000080000006A0140000010000__00000084A1F03030 000000067F000080000006A014000000E1C2-000000067F000080000006A014000002682C__000000844F1A6789-00000084A325AA01 000000067F000080000006A014000000F459-000000067F000080000006A0140000016068__00000083D5901FD9-000000844F1A6789 000000067F000080000006A0140000010000-000000067F000080000006A0140000014000__00000084A1F03030 000000067F000080000006A0140000014000-000000067F000080000006A0140000018000__00000084A1F03030 000000067F000080000006A0140000016068-000000067F000080000006A014000001CC14__00000083D5901FD9-000000844F1A6789 
000000067F000080000006A0140000018000-000000067F000080000006A014000001C000__00000084A1F03030 000000067F000080000006A014000001C000-000000067F000080000006A0140000020000__00000084A1F03030 000000067F000080000006A014000001CC14-000000067F000080000006A014000002384E__00000083D5901FD9-000000844F1A6789 000000067F000080000006A0140000020000-000000067F000080000006A0140000024000__00000084A1F03030 000000067F000080000006A014000002384E-000000067F000080000006A014000002A467__00000083D5901FD9-000000844F1A6789 000000067F000080000006A0140000024000-000000067F000080000006A0140000028000__00000084A1F03030 000000067F000080000006A0140000026831-030000000000000000000000000000000002__000000844F1A6789-00000084A325AA01 000000067F000080000006A0140000028000-000000067F000080000006A014000002C000__00000084A1F03030 000000067F000080000006A014000002A467-030000000000000000000000000000000002__00000083D5901FD9-000000844F1A6789 000000067F000080000006A014000002C000-030000000000000000000000000000000002__00000084A1F03030 000000067F000080000006C00C0000000000-000000067F000080000006C00C0000004000__00000086746BDFE0 000000067F000080000006C00C0000004000-000000067F000080000006C00C0000008000__00000086746BDFE0 000000067F000080000006C00C0000008000-000000067F000080000006C00C000000C000__00000086746BDFE0 000000067F000080000006C00C00000090F5-000000067F000080000006C00C000001280C__00000084A325AA01-00000085239DFB81 000000067F000080000006C00C000000C000-000000067F000080000006C00C0000010000__00000086746BDFE0 000000067F000080000006C00C0000010000-000000067F000080000006C00C0000014000__00000086746BDFE0 000000067F000080000006C00C000001280C-000000067F000080000006C00C000001BF72__00000084A325AA01-00000085239DFB81 000000067F000080000006C00C0000014000-000000067F000080000006C00C0000018000__00000086746BDFE0 000000067F000080000006C00C0000018000-000000067F000080000006C00C000001C000__00000086746BDFE0 000000067F000080000006C00C000001BF72-000000067F000080000006C00C00000256D8__00000084A325AA01-00000085239DFB81 
000000067F000080000006C00C000001C000-000000067F000080000006C00C0000020000__00000086746BDFE0 000000067F000080000006C00C0000020000-000000067F000080000006C00C0000024000__00000086746BDFE0 000000067F000080000006C00C0000024000-000000067F000080000006C00C0000028000__00000086746BDFE0 000000067F000080000006C00C00000256D8-000000067F000080000006C00C000002EE0B__00000084A325AA01-00000085239DFB81 000000067F000080000006C00C0000028000-000000067F000080000006C00C000002C000__00000086746BDFE0 000000067F000080000006C00C000002C000-000000067F000080000006C00C0000030000__00000086746BDFE0 000000067F000080000006C00C000002EE0B-000000067F000080000006C00C0000038521__00000084A325AA01-00000085239DFB81 000000067F000080000006C00C0000030000-000000067F000080000006C00C0000034000__00000086746BDFE0 000000067F000080000006C00C0000034000-000000067F000080000006C00C0000038000__00000086746BDFE0 000000067F000080000006C00C0000038000-000000067F000080000006C00C000003C000__00000086746BDFE0 000000067F000080000006C00C0000038521-000000067F000080000006C00C0000041C87__00000084A325AA01-00000085239DFB81 000000067F000080000006C00C000003C000-000000067F000080000006C00C0000040000__00000086746BDFE0 000000067F000080000006C00C0000040000-000000067F000080000006C00C0000044000__00000086746BDFE0 000000067F000080000006C00C0000041C87-000000067F000080000006C00C000004B3ED__00000084A325AA01-00000085239DFB81 000000067F000080000006C00C0000044000-000000067F000080000006C00C0000048000__00000086746BDFE0 000000067F000080000006C00C0000048000-000000067F000080000006C00C000004C000__00000086720CFFF0 000000067F000080000006C00C0000048000-000000067F000080000006C00C000004C000__000000873B520940 000000067F000080000006C00C000004B3ED-030000000000000000000000000000000002__00000084A325AA01-00000085239DFB81 000000067F000080000006C00C000004BAC4-000000067F000080000006C00C00000551F9__00000085239DFB81-00000085D35BF439 000000067F000080000006C00C000004C000-000000067F000080000006C00C0000050000__00000086720CFFF0 
000000067F000080000006C00C000004C000-000000067F000080000006C00C0000050000__000000873B520940 000000067F000080000006C00C0000050000-000000067F000080000006C00C0000054000__00000086720CFFF0 000000067F000080000006C00C0000050000-000000067F000080000006C00C0000054000__000000873B520940 000000067F000080000006C00C0000054000-000000067F000080000006C00C0000058000__00000086720CFFF0 000000067F000080000006C00C0000054000-000000067F000080000006C00C0000058000__000000873B520940 000000067F000080000006C00C00000551F9-000000067F000080000006C00C000005E90C__00000085239DFB81-00000085D35BF439 000000067F000080000006C00C0000055EB3-000000067F000080000006C00C00000AB316__00000086ED29E361-000000873C9A2551 000000067F000080000006C00C0000058000-000000067F000080000006C00C000005C000__00000086720CFFF0 000000067F000080000006C00C0000058000-000000067F000080000006C00C000005C000__000000873B520940 000000067F000080000006C00C000005C000-000000067F000080000006C00C0000060000__00000086720CFFF0 000000067F000080000006C00C000005C000-000000067F000080000006C00C0000060000__000000873B520940 000000067F000080000006C00C000005E90C-000000067F000080000006C00C000006802C__00000085239DFB81-00000085D35BF439 000000067F000080000006C00C0000060000-000000067F000080000006C00C0000064000__00000086720CFFF0 000000067F000080000006C00C0000060000-000000067F000080000006C00C0000064000__000000873B520940 000000067F000080000006C00C0000064000-000000067F000080000006C00C0000068000__00000086720CFFF0 000000067F000080000006C00C0000064000-000000067F000080000006C00C0000068000__000000873B520940 000000067F000080000006C00C0000068000-000000067F000080000006C00C000006C000__00000086720CFFF0 000000067F000080000006C00C0000068000-000000067F000080000006C00C000006C000__000000873B520940 000000067F000080000006C00C000006802C-000000067F000080000006C00C0000071783__00000085239DFB81-00000085D35BF439 000000067F000080000006C00C000006C000-000000067F000080000006C00C0000070000__00000086720CFFF0 000000067F000080000006C00C000006C000-000000067F000080000006C00C0000070000__000000873B520940 
000000067F000080000006C00C0000070000-000000067F000080000006C00C0000074000__00000086720CFFF0 000000067F000080000006C00C0000070000-000000067F000080000006C00C0000074000__000000873B520940 000000067F000080000006C00C0000071783-000000067F000080000006C00C000007AEE9__00000085239DFB81-00000085D35BF439 000000067F000080000006C00C0000074000-000000067F000080000006C00C0000078000__00000086720CFFF0 000000067F000080000006C00C0000074000-000000067F000080000006C00C0000078000__000000873B520940 000000067F000080000006C00C0000078000-000000067F000080000006C00C000007C000__00000086720CFFF0 000000067F000080000006C00C0000078000-000000067F000080000006C00C000007C000__000000873B520940 000000067F000080000006C00C000007AEE9-000000067F000080000006C00C000008460B__00000085239DFB81-00000085D35BF439 000000067F000080000006C00C000007C000-000000067F000080000006C00C0000080000__00000086720CFFF0 000000067F000080000006C00C000007C000-000000067F000080000006C00C0000080000__000000873B520940 000000067F000080000006C00C0000080000-000000067F000080000006C00C0000084000__00000086720CFFF0 000000067F000080000006C00C0000080000-000000067F000080000006C00C0000084000__000000873B520940 000000067F000080000006C00C0000084000-000000067F000080000006C00C0000088000__00000086720CFFF0 000000067F000080000006C00C0000084000-000000067F000080000006C00C0000088000__000000873B520940 000000067F000080000006C00C000008460B-000000067F000080000006C00C000008DD71__00000085239DFB81-00000085D35BF439 000000067F000080000006C00C0000088000-000000067F000080000006C00C000008C000__00000086720CFFF0 000000067F000080000006C00C0000088000-000000067F000080000006C00C000008C000__000000873B520940 000000067F000080000006C00C000008C000-000000067F000080000006C00C0000090000__00000086720CFFF0 000000067F000080000006C00C000008C000-000000067F000080000006C00C0000090000__000000873B520940 000000067F000080000006C00C000008DD71-000000067F000080000006C00C00000974D7__00000085239DFB81-00000085D35BF439 000000067F000080000006C00C0000090000-000000067F000080000006C00C0000094000__00000086720CFFF0 
000000067F000080000006C00C0000090000-000000067F000080000006C00C0000094000__000000873B520940 000000067F000080000006C00C0000094000-000000067F000080000006C00C0000098000__00000086720CFFF0 000000067F000080000006C00C0000094000-000000067F000080000006C00C0000098000__000000873B520940 000000067F000080000006C00C00000974D7-000000067F000080000006C00C00000A0C0B__00000085239DFB81-00000085D35BF439 000000067F000080000006C00C0000098000-000000067F000080000006C00C000009C000__00000086720CFFF0 000000067F000080000006C00C0000098000-000000067F000080000006C00C000009C000__000000873B520940 000000067F000080000006C00C000009C000-000000067F000080000006C00C00000A0000__00000086720CFFF0 000000067F000080000006C00C000009C000-000000067F000080000006C00C00000A0000__000000873B520940 000000067F000080000006C00C00000A0000-000000067F000080000006C00C00000A4000__00000086720CFFF0 000000067F000080000006C00C00000A0000-000000067F000080000006C00C00000A4000__000000873B520940 000000067F000080000006C00C00000A0C0B-000000067F000080000006C00C00000AA371__00000085239DFB81-00000085D35BF439 000000067F000080000006C00C00000A4000-000000067F000080000006C00C00000A8000__00000086720CFFF0 000000067F000080000006C00C00000A4000-000000067F000080000006C00C00000A8000__000000873B520940 000000067F000080000006C00C00000A8000-000000067F000080000006C00C00000AC000__00000086720CFFF0 000000067F000080000006C00C00000A8000-000000067F000080000006C00C00000AC000__000000873B520940 000000067F000080000006C00C00000AA371-000000067F000080000006C00C00000B3AD7__00000085239DFB81-00000085D35BF439 000000067F000080000006C00C00000AB316-000000067F000080000006C00C00001015F1__00000086ED29E361-000000873C9A2551 000000067F000080000006C00C00000AC000-000000067F000080000006C00C00000B0000__00000086720CFFF0 000000067F000080000006C00C00000AC000-000000067F000080000006C00C00000B0000__000000873B520940 000000067F000080000006C00C00000B0000-000000067F000080000006C00C00000B4000__00000086720CFFF0 000000067F000080000006C00C00000B0000-000000067F000080000006C00C00000B4000__000000873B520940 
000000067F000080000006C00C00000B3AD7-000000067F000080000006C00C0100000000__00000085239DFB81-00000085D35BF439 000000067F000080000006C00C00000B4000-000000067F000080000006C00C00000B8000__00000086720CFFF0 000000067F000080000006C00C00000B4000-000000067F000080000006C00C00000B8000__000000873B520940 000000067F000080000006C00C00000B8000-000000067F000080000006C00C00000BC000__00000086720CFFF0 000000067F000080000006C00C00000B8000-000000067F000080000006C00C00000BC000__000000873B520940 000000067F000080000006C00C00000BC000-000000067F000080000006C00C00000C0000__00000086720CFFF0 000000067F000080000006C00C00000BC000-000000067F000080000006C00C00000C0000__000000873B520940 000000067F000080000006C00C00000BC102-000000067F000080000006C00C00000C580D__00000085D35BF439-0000008673817FC9 000000067F000080000006C00C00000BFB6E-000000067F000080000006C01400000016BC__0000008673817FC9-00000086ED29E361 000000067F000080000006C00C00000C0000-000000067F000080000006C00C00000C4000__00000086720CFFF0 000000067F000080000006C00C00000C0000-000000067F000080000006C00C00000C4000__000000873B520940 000000067F000080000006C00C00000C4000-000000067F000080000006C00C00000C8000__00000086720CFFF0 000000067F000080000006C00C00000C4000-000000067F000080000006C00C00000C8000__000000873B520940 000000067F000080000006C00C00000C580D-000000067F000080000006C00C00000CEF73__00000085D35BF439-0000008673817FC9 000000067F000080000006C00C00000C8000-000000067F000080000006C00C00000CC000__00000086720CFFF0 000000067F000080000006C00C00000C8000-000000067F000080000006C00C00000CC000__000000873B520940 000000067F000080000006C00C00000CC000-000000067F000080000006C00C00000D0000__00000086720CFFF0 000000067F000080000006C00C00000CC000-000000067F000080000006C00C00000D0000__000000873B520940 000000067F000080000006C00C00000CEF73-000000067F000080000006C00C00000D86D9__00000085D35BF439-0000008673817FC9 000000067F000080000006C00C00000D0000-000000067F000080000006C00C00000D4000__00000086720CFFF0 
000000067F000080000006C00C00000D0000-000000067F000080000006C00C00000D4000__000000873B520940 000000067F000080000006C00C00000D4000-000000067F000080000006C00C00000D8000__00000086720CFFF0 000000067F000080000006C00C00000D4000-000000067F000080000006C00C00000D8000__000000873B520940 000000067F000080000006C00C00000D8000-000000067F000080000006C00C00000DC000__00000086720CFFF0 000000067F000080000006C00C00000D8000-000000067F000080000006C00C00000DC000__000000873B520940 000000067F000080000006C00C00000D86D9-000000067F000080000006C00C00000E1E0C__00000085D35BF439-0000008673817FC9 000000067F000080000006C00C00000DC000-000000067F000080000006C00C00000E0000__00000086720CFFF0 000000067F000080000006C00C00000DC000-000000067F000080000006C00C00000E0000__000000873B520940 000000067F000080000006C00C00000E0000-000000067F000080000006C00C00000E4000__00000086720CFFF0 000000067F000080000006C00C00000E0000-000000067F000080000006C00C00000E4000__000000873B520940 000000067F000080000006C00C00000E1E0C-000000067F000080000006C00C00000EB572__00000085D35BF439-0000008673817FC9 000000067F000080000006C00C00000E4000-000000067F000080000006C00C00000E8000__00000086720CFFF0 000000067F000080000006C00C00000E4000-000000067F000080000006C00C00000E8000__000000873B520940 000000067F000080000006C00C00000E8000-000000067F000080000006C00C00000EC000__00000086720CFFF0 000000067F000080000006C00C00000E8000-000000067F000080000006C00C00000EC000__000000873B520940 000000067F000080000006C00C00000EB572-000000067F000080000006C00C00000F4CD8__00000085D35BF439-0000008673817FC9 000000067F000080000006C00C00000EC000-000000067F000080000006C00C00000F0000__00000086720CFFF0 000000067F000080000006C00C00000EC000-000000067F000080000006C00C00000F0000__000000873B520940 000000067F000080000006C00C00000F0000-000000067F000080000006C00C00000F4000__00000086720CFFF0 000000067F000080000006C00C00000F0000-000000067F000080000006C00C00000F4000__000000873B520940 000000067F000080000006C00C00000F4000-000000067F000080000006C00C00000F8000__00000086720CFFF0 
000000067F000080000006C00C00000F4000-000000067F000080000006C00C00000F8000__000000873B520940 000000067F000080000006C00C00000F4CD8-000000067F000080000006C00C00000FE40B__00000085D35BF439-0000008673817FC9 000000067F000080000006C00C00000F8000-000000067F000080000006C00C00000FC000__00000086720CFFF0 000000067F000080000006C00C00000F8000-000000067F000080000006C00C00000FC000__000000873B520940 000000067F000080000006C00C00000FC000-000000067F000080000006C00C0000100000__00000086720CFFF0 000000067F000080000006C00C00000FC000-000000067F000080000006C00C0000100000__000000873B520940 000000067F000080000006C00C00000FE40B-000000067F000080000006C00C0000107B27__00000085D35BF439-0000008673817FC9 000000067F000080000006C00C0000100000-000000067F000080000006C00C0000104000__00000086720CFFF0 000000067F000080000006C00C0000100000-000000067F000080000006C00C0000104000__000000873B520940 000000067F000080000006C00C00001015F3-000000067F000080000006C0140000013635__00000086ED29E361-000000873C9A2551 000000067F000080000006C00C0000104000-000000067F000080000006C00C0000108000__00000086720CFFF0 000000067F000080000006C00C0000104000-000000067F000080000006C00C0000108000__000000873B520940 000000067F000080000006C00C0000107B27-000000067F000080000006C00C000011128D__00000085D35BF439-0000008673817FC9 000000067F000080000006C00C0000108000-000000067F000080000006C00C000010C000__00000086720CFFF0 000000067F000080000006C00C0000108000-000000067F000080000006C00C000010C000__000000873B520940 000000067F000080000006C00C000010C000-000000067F000080000006C00C0000110000__00000086720CFFF0 000000067F000080000006C00C000010C000-000000067F000080000006C00C0000110000__000000873B520940 000000067F000080000006C00C0000110000-000000067F000080000006C0120100000000__000000873B520940 000000067F000080000006C00C0000110000-030000000000000000000000000000000002__00000086720CFFF0 000000067F000080000006C00C000011128D-010000000000000001000000030000000017__00000085D35BF439-0000008673817FC9 
000000067F000080000006C0140000000000-000000067F000080000006C0140000004000__000000873B520940 000000067F000080000006C01400000016BC-000000067F000080000006C014000000830F__0000008673817FC9-00000086ED29E361 000000067F000080000006C0140000004000-000000067F000080000006C0140000008000__000000873B520940 000000067F000080000006C0140000008000-000000067F000080000006C014000000C000__000000873B520940 000000067F000080000006C014000000830F-000000067F000080000006C014000000EF5B__0000008673817FC9-00000086ED29E361 000000067F000080000006C014000000C000-000000067F000080000006C0140000010000__000000873B520940 000000067F000080000006C014000000EF5B-000000067F000080000006C0140000015BA7__0000008673817FC9-00000086ED29E361 000000067F000080000006C0140000010000-000000067F000080000006C0140000014000__000000873B520940 000000067F000080000006C0140000013636-000000067F000080000006C014000002DB5F__00000086ED29E361-000000873C9A2551 000000067F000080000006C0140000014000-000000067F000080000006C0140000018000__000000873B520940 000000067F000080000006C0140000015BA7-000000067F000080000006C014000001C7F0__0000008673817FC9-00000086ED29E361 000000067F000080000006C0140000018000-000000067F000080000006C014000001C000__000000873B520940 000000067F000080000006C014000001C000-000000067F000080000006C0140000020000__000000873B520940 000000067F000080000006C014000001C7F0-000000067F000080000006C0140000023430__0000008673817FC9-00000086ED29E361 000000067F000080000006C0140000020000-000000067F000080000006C0140000024000__000000873B520940 000000067F000080000006C0140000023430-000000067F000080000006C014000002A049__0000008673817FC9-00000086ED29E361 000000067F000080000006C0140000024000-000000067F000080000006C0140000028000__000000873B520940 000000067F000080000006C0140000028000-000000067F000080000006C014000002C000__000000873B520940 000000067F000080000006C014000002A049-030000000000000000000000000000000002__0000008673817FC9-00000086ED29E361 000000067F000080000006C014000002C000-030000000000000000000000000000000002__000000873B520940 
000000067F000080000006C014000002DB60-030000000000000000000000000000000002__00000086ED29E361-000000873C9A2551 000000067F000080000006E00C0000000000-000000067F000080000006E00C0000004000__000000890CF51FE0 000000067F000080000006E00C0000004000-000000067F000080000006E00C0000008000__000000890CF51FE0 000000067F000080000006E00C0000008000-000000067F000080000006E00C000000C000__000000890CF51FE0 000000067F000080000006E00C00000096C8-000000067F000080000006E00C0000012E0A__000000873C9A2551-00000087BC75E5B1 000000067F000080000006E00C000000C000-000000067F000080000006E00C0000010000__000000890CF51FE0 000000067F000080000006E00C0000010000-000000067F000080000006E00C0000014000__000000890CF51FE0 000000067F000080000006E00C0000012E0A-000000067F000080000006E00C000001C570__000000873C9A2551-00000087BC75E5B1 000000067F000080000006E00C0000014000-000000067F000080000006E00C0000018000__000000890CF51FE0 000000067F000080000006E00C0000018000-000000067F000080000006E00C000001C000__000000890CF51FE0 000000067F000080000006E00C000001C000-000000067F000080000006E00C0000020000__000000890CF51FE0 000000067F000080000006E00C000001C570-000000067F000080000006E00C0000025CD6__000000873C9A2551-00000087BC75E5B1 000000067F000080000006E00C0000020000-000000067F000080000006E00C0000024000__000000890CF51FE0 000000067F000080000006E00C0000024000-000000067F000080000006E00C0000028000__000000890CF51FE0 000000067F000080000006E00C0000025CD6-000000067F000080000006E00C000002F40A__000000873C9A2551-00000087BC75E5B1 000000067F000080000006E00C0000028000-000000067F000080000006E00C000002C000__000000890CF51FE0 000000067F000080000006E00C000002C000-000000067F000080000006E00C0000030000__000000890CF51FE0 000000067F000080000006E00C000002F40A-000000067F000080000006E00C0000038B1D__000000873C9A2551-00000087BC75E5B1 000000067F000080000006E00C0000030000-000000067F000080000006E00C0000034000__000000890CF51FE0 000000067F000080000006E00C0000034000-000000067F000080000006E00C0000038000__000000890CF51FE0 
000000067F000080000006E00C0000038000-000000067F000080000006E00C000003C000__000000890CF51FE0 000000067F000080000006E00C0000038B1D-000000067F000080000006E00C0000042283__000000873C9A2551-00000087BC75E5B1 000000067F000080000006E00C000003C000-000000067F000080000006E00C0000040000__000000890CF51FE0 000000067F000080000006E00C0000040000-000000067F000080000006E00C0000044000__000000890CF51FE0 000000067F000080000006E00C0000042283-000000067F000080000006E00C000004B9E9__000000873C9A2551-00000087BC75E5B1 000000067F000080000006E00C0000044000-000000067F000080000006E00C0000048000__000000890CF51FE0 000000067F000080000006E00C0000048000-000000067F000080000006E00C000004C000__000000890AE2DFC8 000000067F000080000006E00C0000048000-000000067F000080000006E00C000004C000__00000089D5AEF6E8 000000067F000080000006E00C000004B9E9-030000000000000000000000000000000002__000000873C9A2551-00000087BC75E5B1 000000067F000080000006E00C000004BACB-000000067F000080000006E00C0000055200__00000087BC75E5B1-000000887C2DFE59 000000067F000080000006E00C000004C000-000000067F000080000006E00C0000050000__000000890AE2DFC8 000000067F000080000006E00C000004C000-000000067F000080000006E00C0000050000__00000089D5AEF6E8 000000067F000080000006E00C0000050000-000000067F000080000006E00C0000054000__000000890AE2DFC8 000000067F000080000006E00C0000050000-000000067F000080000006E00C0000054000__00000089D5AEF6E8 000000067F000080000006E00C0000054000-000000067F000080000006E00C0000058000__000000890AE2DFC8 000000067F000080000006E00C0000054000-000000067F000080000006E00C0000058000__00000089D5AEF6E8 000000067F000080000006E00C0000054246-000000067F000080000006E00C00000A83ED__0000008985FD3611-00000089D6B8EE99 000000067F000080000006E00C0000055200-000000067F000080000006E00C000005E90B__00000087BC75E5B1-000000887C2DFE59 000000067F000080000006E00C0000058000-000000067F000080000006E00C000005C000__000000890AE2DFC8 000000067F000080000006E00C0000058000-000000067F000080000006E00C000005C000__00000089D5AEF6E8 
000000067F000080000006E00C000005C000-000000067F000080000006E00C0000060000__000000890AE2DFC8 000000067F000080000006E00C000005C000-000000067F000080000006E00C0000060000__00000089D5AEF6E8 000000067F000080000006E00C000005E90B-000000067F000080000006E00C000006802B__00000087BC75E5B1-000000887C2DFE59 000000067F000080000006E00C0000060000-000000067F000080000006E00C0000064000__000000890AE2DFC8 000000067F000080000006E00C0000060000-000000067F000080000006E00C0000064000__00000089D5AEF6E8 000000067F000080000006E00C0000064000-000000067F000080000006E00C0000068000__000000890AE2DFC8 000000067F000080000006E00C0000064000-000000067F000080000006E00C0000068000__00000089D5AEF6E8 000000067F000080000006E00C0000068000-000000067F000080000006E00C000006C000__000000890AE2DFC8 000000067F000080000006E00C0000068000-000000067F000080000006E00C000006C000__00000089D5AEF6E8 000000067F000080000006E00C000006802B-000000067F000080000006E00C0000071782__00000087BC75E5B1-000000887C2DFE59 000000067F000080000006E00C000006C000-000000067F000080000006E00C0000070000__000000890AE2DFC8 000000067F000080000006E00C000006C000-000000067F000080000006E00C0000070000__00000089D5AEF6E8 000000067F000080000006E00C0000070000-000000067F000080000006E00C0000074000__000000890AE2DFC8 000000067F000080000006E00C0000070000-000000067F000080000006E00C0000074000__00000089D5AEF6E8 000000067F000080000006E00C0000071782-000000067F000080000006E00C000007AEE8__00000087BC75E5B1-000000887C2DFE59 000000067F000080000006E00C0000074000-000000067F000080000006E00C0000078000__000000890AE2DFC8 000000067F000080000006E00C0000074000-000000067F000080000006E00C0000078000__00000089D5AEF6E8 000000067F000080000006E00C0000078000-000000067F000080000006E00C000007C000__000000890AE2DFC8 000000067F000080000006E00C0000078000-000000067F000080000006E00C000007C000__00000089D5AEF6E8 000000067F000080000006E00C000007AEE8-000000067F000080000006E00C000008460B__00000087BC75E5B1-000000887C2DFE59 000000067F000080000006E00C000007C000-000000067F000080000006E00C0000080000__000000890AE2DFC8 
000000067F000080000006E00C000007C000-000000067F000080000006E00C0000080000__00000089D5AEF6E8 000000067F000080000006E00C0000080000-000000067F000080000006E00C0000084000__000000890AE2DFC8 000000067F000080000006E00C0000080000-000000067F000080000006E00C0000084000__00000089D5AEF6E8 000000067F000080000006E00C0000084000-000000067F000080000006E00C0000088000__000000890AE2DFC8 000000067F000080000006E00C0000084000-000000067F000080000006E00C0000088000__00000089D5AEF6E8 000000067F000080000006E00C000008460B-000000067F000080000006E00C000008DD71__00000087BC75E5B1-000000887C2DFE59 000000067F000080000006E00C0000088000-000000067F000080000006E00C000008C000__000000890AE2DFC8 000000067F000080000006E00C0000088000-000000067F000080000006E00C000008C000__00000089D5AEF6E8 000000067F000080000006E00C000008C000-000000067F000080000006E00C0000090000__000000890AE2DFC8 000000067F000080000006E00C000008C000-000000067F000080000006E00C0000090000__00000089D5AEF6E8 000000067F000080000006E00C000008DD71-000000067F000080000006E00C00000974D7__00000087BC75E5B1-000000887C2DFE59 000000067F000080000006E00C0000090000-000000067F000080000006E00C0000094000__000000890AE2DFC8 000000067F000080000006E00C0000090000-000000067F000080000006E00C0000094000__00000089D5AEF6E8 000000067F000080000006E00C0000094000-000000067F000080000006E00C0000098000__000000890AE2DFC8 000000067F000080000006E00C0000094000-000000067F000080000006E00C0000098000__00000089D5AEF6E8 000000067F000080000006E00C00000974D7-000000067F000080000006E00C00000A0C0B__00000087BC75E5B1-000000887C2DFE59 000000067F000080000006E00C0000098000-000000067F000080000006E00C000009C000__000000890AE2DFC8 000000067F000080000006E00C0000098000-000000067F000080000006E00C000009C000__00000089D5AEF6E8 000000067F000080000006E00C000009C000-000000067F000080000006E00C00000A0000__000000890AE2DFC8 000000067F000080000006E00C000009C000-000000067F000080000006E00C00000A0000__00000089D5AEF6E8 000000067F000080000006E00C00000A0000-000000067F000080000006E00C00000A4000__000000890AE2DFC8 
000000067F000080000006E00C00000A0000-000000067F000080000006E00C00000A4000__00000089D5AEF6E8 000000067F000080000006E00C00000A0C0B-000000067F000080000006E00C00000AA371__00000087BC75E5B1-000000887C2DFE59 000000067F000080000006E00C00000A4000-000000067F000080000006E00C00000A8000__000000890AE2DFC8 000000067F000080000006E00C00000A4000-000000067F000080000006E00C00000A8000__00000089D5AEF6E8 000000067F000080000006E00C00000A8000-000000067F000080000006E00C00000AC000__000000890AE2DFC8 000000067F000080000006E00C00000A8000-000000067F000080000006E00C00000AC000__00000089D5AEF6E8 000000067F000080000006E00C00000A8407-000000067F000080000006E00C00000FD787__0000008985FD3611-00000089D6B8EE99 000000067F000080000006E00C00000AA371-000000067F000080000006E00C00000B3AD7__00000087BC75E5B1-000000887C2DFE59 000000067F000080000006E00C00000AC000-000000067F000080000006E00C00000B0000__000000890AE2DFC8 000000067F000080000006E00C00000AC000-000000067F000080000006E00C00000B0000__00000089D5AEF6E8 000000067F000080000006E00C00000B0000-000000067F000080000006E00C00000B4000__000000890AE2DFC8 000000067F000080000006E00C00000B0000-000000067F000080000006E00C00000B4000__00000089D5AEF6E8 000000067F000080000006E00C00000B3AD7-000000067F000080000006E00C00000BD20B__00000087BC75E5B1-000000887C2DFE59 000000067F000080000006E00C00000B4000-000000067F000080000006E00C00000B8000__000000890AE2DFC8 000000067F000080000006E00C00000B4000-000000067F000080000006E00C00000B8000__00000089D5AEF6E8 000000067F000080000006E00C00000B6F42-000000067F000080000006E0140000000EEF__000000890C5B6001-0000008985FD3611 000000067F000080000006E00C00000B8000-000000067F000080000006E00C00000BC000__000000890AE2DFC8 000000067F000080000006E00C00000B8000-000000067F000080000006E00C00000BC000__00000089D5AEF6E8 000000067F000080000006E00C00000BC000-000000067F000080000006E00C00000C0000__000000890AE2DFC8 000000067F000080000006E00C00000BC000-000000067F000080000006E00C00000C0000__00000089D5AEF6E8 
000000067F000080000006E00C00000BD20B-000000067F000080000006E00C0100000000__00000087BC75E5B1-000000887C2DFE59 000000067F000080000006E00C00000C0000-000000067F000080000006E00C00000C4000__000000890AE2DFC8 000000067F000080000006E00C00000C0000-000000067F000080000006E00C00000C4000__00000089D5AEF6E8 000000067F000080000006E00C00000C4000-000000067F000080000006E00C00000C8000__000000890AE2DFC8 000000067F000080000006E00C00000C4000-000000067F000080000006E00C00000C8000__00000089D5AEF6E8 000000067F000080000006E00C00000C5883-000000067F000080000006E00C00000CEFE9__000000887C2DFE59-000000890C5B6001 000000067F000080000006E00C00000C8000-000000067F000080000006E00C00000CC000__000000890AE2DFC8 000000067F000080000006E00C00000C8000-000000067F000080000006E00C00000CC000__00000089D5AEF6E8 000000067F000080000006E00C00000CC000-000000067F000080000006E00C00000D0000__000000890AE2DFC8 000000067F000080000006E00C00000CC000-000000067F000080000006E00C00000D0000__00000089D5AEF6E8 000000067F000080000006E00C00000CEFE9-000000067F000080000006E00C00000D872B__000000887C2DFE59-000000890C5B6001 000000067F000080000006E00C00000D0000-000000067F000080000006E00C00000D4000__000000890AE2DFC8 000000067F000080000006E00C00000D0000-000000067F000080000006E00C00000D4000__00000089D5AEF6E8 000000067F000080000006E00C00000D4000-000000067F000080000006E00C00000D8000__000000890AE2DFC8 000000067F000080000006E00C00000D4000-000000067F000080000006E00C00000D8000__00000089D5AEF6E8 000000067F000080000006E00C00000D8000-000000067F000080000006E00C00000DC000__000000890AE2DFC8 000000067F000080000006E00C00000D8000-000000067F000080000006E00C00000DC000__00000089D5AEF6E8 000000067F000080000006E00C00000D872B-000000067F000080000006E00C00000E1E91__000000887C2DFE59-000000890C5B6001 000000067F000080000006E00C00000DC000-000000067F000080000006E00C00000E0000__000000890AE2DFC8 000000067F000080000006E00C00000DC000-000000067F000080000006E00C00000E0000__00000089D5AEF6E8 000000067F000080000006E00C00000E0000-000000067F000080000006E00C00000E4000__000000890AE2DFC8 
000000067F000080000006E00C00000E0000-000000067F000080000006E00C00000E4000__00000089D5AEF6E8 000000067F000080000006E00C00000E1E91-000000067F000080000006E00C00000EB5F7__000000887C2DFE59-000000890C5B6001 000000067F000080000006E00C00000E4000-000000067F000080000006E00C00000E8000__000000890AE2DFC8 000000067F000080000006E00C00000E4000-000000067F000080000006E00C00000E8000__00000089D5AEF6E8 000000067F000080000006E00C00000E8000-000000067F000080000006E00C00000EC000__000000890AE2DFC8 000000067F000080000006E00C00000E8000-000000067F000080000006E00C00000EC000__00000089D5AEF6E8 000000067F000080000006E00C00000EB5F7-000000067F000080000006E00C00000F4D0C__000000887C2DFE59-000000890C5B6001 000000067F000080000006E00C00000EC000-000000067F000080000006E00C00000F0000__000000890AE2DFC8 000000067F000080000006E00C00000EC000-000000067F000080000006E00C00000F0000__00000089D5AEF6E8 000000067F000080000006E00C00000F0000-000000067F000080000006E00C00000F4000__000000890AE2DFC8 000000067F000080000006E00C00000F0000-000000067F000080000006E00C00000F4000__00000089D5AEF6E8 000000067F000080000006E00C00000F4000-000000067F000080000006E00C00000F8000__000000890AE2DFC8 000000067F000080000006E00C00000F4000-000000067F000080000006E00C00000F8000__00000089D5AEF6E8 000000067F000080000006E00C00000F4D0C-000000067F000080000006E00C00000FE472__000000887C2DFE59-000000890C5B6001 000000067F000080000006E00C00000F8000-000000067F000080000006E00C00000FC000__000000890AE2DFC8 000000067F000080000006E00C00000F8000-000000067F000080000006E00C00000FC000__00000089D5AEF6E8 000000067F000080000006E00C00000FC000-000000067F000080000006E00C0000100000__000000890AE2DFC8 000000067F000080000006E00C00000FC000-000000067F000080000006E00C0000100000__00000089D5AEF6E8 000000067F000080000006E00C00000FD78D-000000067F000080000006E0140000011DB5__0000008985FD3611-00000089D6B8EE99 000000067F000080000006E00C00000FE472-000000067F000080000006E00C0000107B8E__000000887C2DFE59-000000890C5B6001 
000000067F000080000006E00C0000100000-000000067F000080000006E00C0000104000__000000890AE2DFC8 000000067F000080000006E00C0000100000-000000067F000080000006E00C0000104000__00000089D5AEF6E8 000000067F000080000006E00C0000104000-000000067F000080000006E00C0000108000__000000890AE2DFC8 000000067F000080000006E00C0000104000-000000067F000080000006E00C0000108000__00000089D5AEF6E8 000000067F000080000006E00C0000107B8E-000000067F000080000006E00C00001112F4__000000887C2DFE59-000000890C5B6001 000000067F000080000006E00C0000108000-000000067F000080000006E00C000010C000__000000890AE2DFC8 000000067F000080000006E00C0000108000-000000067F000080000006E00C000010C000__00000089D5AEF6E8 000000067F000080000006E00C000010C000-000000067F000080000006E00C0000110000__000000890AE2DFC8 000000067F000080000006E00C000010C000-000000067F000080000006E00C0000110000__00000089D5AEF6E8 000000067F000080000006E00C0000110000-000000067F000080000006E0120100000000__00000089D5AEF6E8 000000067F000080000006E00C0000110000-030000000000000000000000000000000002__000000890AE2DFC8 000000067F000080000006E00C00001112F4-01000000000000000100000003000000001A__000000887C2DFE59-000000890C5B6001 000000067F000080000006E0140000000000-000000067F000080000006E0140000004000__00000089D5AEF6E8 000000067F000080000006E0140000000EEF-000000067F000080000006E0140000007C4F__000000890C5B6001-0000008985FD3611 000000067F000080000006E0140000004000-000000067F000080000006E0140000008000__00000089D5AEF6E8 000000067F000080000006E0140000007C4F-000000067F000080000006E014000000E97E__000000890C5B6001-0000008985FD3611 000000067F000080000006E0140000008000-000000067F000080000006E014000000C000__00000089D5AEF6E8 000000067F000080000006E014000000C000-000000067F000080000006E0140000010000__00000089D5AEF6E8 000000067F000080000006E014000000E97E-000000067F000080000006E01400000156DC__000000890C5B6001-0000008985FD3611 000000067F000080000006E0140000010000-000000067F000080000006E0140000014000__00000089D5AEF6E8 
000000067F000080000006E0140000011DB5-000000067F000080000006E014000002B9CE__0000008985FD3611-00000089D6B8EE99 000000067F000080000006E0140000014000-000000067F000080000006E0140000018000__00000089D5AEF6E8 000000067F000080000006E01400000156DC-000000067F000080000006E014000001C468__000000890C5B6001-0000008985FD3611 000000067F000080000006E0140000018000-000000067F000080000006E014000001C000__00000089D5AEF6E8 000000067F000080000006E014000001C000-000000067F000080000006E0140000020000__00000089D5AEF6E8 000000067F000080000006E014000001C468-000000067F000080000006E01400000231D5__000000890C5B6001-0000008985FD3611 000000067F000080000006E0140000020000-000000067F000080000006E0140000024000__00000089D5AEF6E8 000000067F000080000006E01400000231D5-000000067F000080000006E0140000029F96__000000890C5B6001-0000008985FD3611 000000067F000080000006E0140000024000-000000067F000080000006E0140000028000__00000089D5AEF6E8 000000067F000080000006E0140000028000-000000067F000080000006E014000002C000__00000089D5AEF6E8 000000067F000080000006E0140000029F96-030000000000000000000000000000000002__000000890C5B6001-0000008985FD3611 000000067F000080000006E014000002B9D0-030000000000000000000000000000000002__0000008985FD3611-00000089D6B8EE99 000000067F000080000006E014000002C000-030000000000000000000000000000000002__00000089D5AEF6E8 000000067F000080000007000C0000000000-000000067F000080000007000C0000004000__0000008BA730BFE8 000000067F000080000007000C0000004000-000000067F000080000007000C0000008000__0000008BA730BFE8 000000067F000080000007000C0000008000-000000067F000080000007000C000000C000__0000008BA730BFE8 000000067F000080000007000C000000955C-000000067F000080000007000C0000012CC2__00000089D6B8EE99-0000008A56BBF739 000000067F000080000007000C000000C000-000000067F000080000007000C0000010000__0000008BA730BFE8 000000067F000080000007000C0000010000-000000067F000080000007000C0000014000__0000008BA730BFE8 000000067F000080000007000C0000012CC2-000000067F000080000007000C000001C40A__00000089D6B8EE99-0000008A56BBF739 
000000067F000080000007000C0000014000-000000067F000080000007000C0000018000__0000008BA730BFE8 000000067F000080000007000C0000018000-000000067F000080000007000C000001C000__0000008BA730BFE8 000000067F000080000007000C000001C000-000000067F000080000007000C0000020000__0000008BA730BFE8 000000067F000080000007000C000001C40A-000000067F000080000007000C0000025B39__00000089D6B8EE99-0000008A56BBF739 000000067F000080000007000C0000020000-000000067F000080000007000C0000024000__0000008BA730BFE8 000000067F000080000007000C0000024000-000000067F000080000007000C0000028000__0000008BA730BFE8 000000067F000080000007000C0000025B39-000000067F000080000007000C000002F29F__00000089D6B8EE99-0000008A56BBF739 000000067F000080000007000C0000028000-000000067F000080000007000C000002C000__0000008BA730BFE8 000000067F000080000007000C000002C000-000000067F000080000007000C0000030000__0000008BA730BFE8 000000067F000080000007000C000002F29F-000000067F000080000007000C00000389B3__00000089D6B8EE99-0000008A56BBF739 000000067F000080000007000C0000030000-000000067F000080000007000C0000034000__0000008BA730BFE8 000000067F000080000007000C0000034000-000000067F000080000007000C0000038000__0000008BA730BFE8 000000067F000080000007000C0000038000-000000067F000080000007000C000003C000__0000008BA730BFE8 000000067F000080000007000C00000389B3-000000067F000080000007000C0000042119__00000089D6B8EE99-0000008A56BBF739 000000067F000080000007000C000003C000-000000067F000080000007000C0000040000__0000008BA730BFE8 000000067F000080000007000C0000040000-000000067F000080000007000C0000044000__0000008BA730BFE8 000000067F000080000007000C0000042119-000000067F000080000007000C000004B87F__00000089D6B8EE99-0000008A56BBF739 000000067F000080000007000C0000044000-000000067F000080000007000C0000048000__0000008BA730BFE8 000000067F000080000007000C0000048000-000000067F000080000007000C000004C000__0000008B9669EDB0 000000067F000080000007000C0000048000-000000067F000080000007000C000004C000__0000008C71903720 
000000067F000080000007000C000004B87F-030000000000000000000000000000000002__00000089D6B8EE99-0000008A56BBF739 000000067F000080000007000C000004BAD3-000000067F000080000007000C0000055207__0000008A56BBF739-0000008AF67FEC19 000000067F000080000007000C000004C000-000000067F000080000007000C0000050000__0000008B9669EDB0 000000067F000080000007000C000004C000-000000067F000080000007000C0000050000__0000008C71903720 000000067F000080000007000C0000050000-000000067F000080000007000C0000054000__0000008B9669EDB0 000000067F000080000007000C0000050000-000000067F000080000007000C0000054000__0000008C71903720 000000067F000080000007000C0000053C23-000000067F000080000007000C00000A6F76__0000008C2045B721-0000008C72843D41 000000067F000080000007000C0000054000-000000067F000080000007000C0000058000__0000008B9669EDB0 000000067F000080000007000C0000054000-000000067F000080000007000C0000058000__0000008C71903720 000000067F000080000007000C0000055207-000000067F000080000007000C000005E912__0000008A56BBF739-0000008AF67FEC19 000000067F000080000007000C0000058000-000000067F000080000007000C000005C000__0000008B9669EDB0 000000067F000080000007000C0000058000-000000067F000080000007000C000005C000__0000008C71903720 000000067F000080000007000C000005C000-000000067F000080000007000C0000060000__0000008B9669EDB0 000000067F000080000007000C000005C000-000000067F000080000007000C0000060000__0000008C71903720 000000067F000080000007000C000005E912-000000067F000080000007000C000006802C__0000008A56BBF739-0000008AF67FEC19 000000067F000080000007000C0000060000-000000067F000080000007000C0000064000__0000008B9669EDB0 000000067F000080000007000C0000060000-000000067F000080000007000C0000064000__0000008C71903720 000000067F000080000007000C0000064000-000000067F000080000007000C0000068000__0000008B9669EDB0 000000067F000080000007000C0000064000-000000067F000080000007000C0000068000__0000008C71903720 000000067F000080000007000C0000068000-000000067F000080000007000C000006C000__0000008B9669EDB0 
000000067F000080000007000C0000068000-000000067F000080000007000C000006C000__0000008C71903720 000000067F000080000007000C000006802C-000000067F000080000007000C0000071783__0000008A56BBF739-0000008AF67FEC19 000000067F000080000007000C000006C000-000000067F000080000007000C0000070000__0000008B9669EDB0 000000067F000080000007000C000006C000-000000067F000080000007000C0000070000__0000008C71903720 000000067F000080000007000C0000070000-000000067F000080000007000C0000074000__0000008B9669EDB0 000000067F000080000007000C0000070000-000000067F000080000007000C0000074000__0000008C71903720 000000067F000080000007000C0000071783-000000067F000080000007000C000007AEE9__0000008A56BBF739-0000008AF67FEC19 000000067F000080000007000C0000074000-000000067F000080000007000C0000078000__0000008B9669EDB0 000000067F000080000007000C0000074000-000000067F000080000007000C0000078000__0000008C71903720 000000067F000080000007000C0000078000-000000067F000080000007000C000007C000__0000008B9669EDB0 000000067F000080000007000C0000078000-000000067F000080000007000C000007C000__0000008C71903720 000000067F000080000007000C000007AEE9-000000067F000080000007000C000008460B__0000008A56BBF739-0000008AF67FEC19 000000067F000080000007000C000007C000-000000067F000080000007000C0000080000__0000008B9669EDB0 000000067F000080000007000C000007C000-000000067F000080000007000C0000080000__0000008C71903720 000000067F000080000007000C0000080000-000000067F000080000007000C0000084000__0000008B9669EDB0 000000067F000080000007000C0000080000-000000067F000080000007000C0000084000__0000008C71903720 000000067F000080000007000C0000084000-000000067F000080000007000C0000088000__0000008B9669EDB0 000000067F000080000007000C0000084000-000000067F000080000007000C0000088000__0000008C71903720 000000067F000080000007000C000008460B-000000067F000080000007000C000008DD71__0000008A56BBF739-0000008AF67FEC19 000000067F000080000007000C0000088000-000000067F000080000007000C000008C000__0000008B9669EDB0 000000067F000080000007000C0000088000-000000067F000080000007000C000008C000__0000008C71903720 
000000067F000080000007000C000008C000-000000067F000080000007000C0000090000__0000008B9669EDB0 000000067F000080000007000C000008C000-000000067F000080000007000C0000090000__0000008C71903720 000000067F000080000007000C000008DD71-000000067F000080000007000C00000974D7__0000008A56BBF739-0000008AF67FEC19 000000067F000080000007000C0000090000-000000067F000080000007000C0000094000__0000008B9669EDB0 000000067F000080000007000C0000090000-000000067F000080000007000C0000094000__0000008C71903720 000000067F000080000007000C0000094000-000000067F000080000007000C0000098000__0000008B9669EDB0 000000067F000080000007000C0000094000-000000067F000080000007000C0000098000__0000008C71903720 000000067F000080000007000C00000974D7-000000067F000080000007000C00000A0C0B__0000008A56BBF739-0000008AF67FEC19 000000067F000080000007000C0000098000-000000067F000080000007000C000009C000__0000008B9669EDB0 000000067F000080000007000C0000098000-000000067F000080000007000C000009C000__0000008C71903720 000000067F000080000007000C000009C000-000000067F000080000007000C00000A0000__0000008B9669EDB0 000000067F000080000007000C000009C000-000000067F000080000007000C00000A0000__0000008C71903720 000000067F000080000007000C00000A0000-000000067F000080000007000C00000A4000__0000008B9669EDB0 000000067F000080000007000C00000A0000-000000067F000080000007000C00000A4000__0000008C71903720 000000067F000080000007000C00000A0C0B-000000067F000080000007000C00000AA371__0000008A56BBF739-0000008AF67FEC19 000000067F000080000007000C00000A4000-000000067F000080000007000C00000A8000__0000008B9669EDB0 000000067F000080000007000C00000A4000-000000067F000080000007000C00000A8000__0000008C71903720 000000067F000080000007000C00000A6F77-000000067F000080000007000C00000FA170__0000008C2045B721-0000008C72843D41 000000067F000080000007000C00000A8000-000000067F000080000007000C00000AC000__0000008B9669EDB0 000000067F000080000007000C00000A8000-000000067F000080000007000C00000AC000__0000008C71903720 
000000067F000080000007000C00000AA371-000000067F000080000007000C0100000000__0000008A56BBF739-0000008AF67FEC19 000000067F000080000007000C00000AC000-000000067F000080000007000C00000B0000__0000008B9669EDB0 000000067F000080000007000C00000AC000-000000067F000080000007000C00000B0000__0000008C71903720 000000067F000080000007000C00000B0000-000000067F000080000007000C00000B4000__0000008B9669EDB0 000000067F000080000007000C00000B0000-000000067F000080000007000C00000B4000__0000008C71903720 000000067F000080000007000C00000B2B06-000000067F000080000007000C00000BC211__0000008AF67FEC19-0000008BA6803FC9 000000067F000080000007000C00000B4000-000000067F000080000007000C00000B8000__0000008B9669EDB0 000000067F000080000007000C00000B4000-000000067F000080000007000C00000B8000__0000008C71903720 000000067F000080000007000C00000B8000-000000067F000080000007000C00000BC000__0000008B9669EDB0 000000067F000080000007000C00000B8000-000000067F000080000007000C00000BC000__0000008C71903720 000000067F000080000007000C00000BC000-000000067F000080000007000C00000C0000__0000008B9669EDB0 000000067F000080000007000C00000BC000-000000067F000080000007000C00000C0000__0000008C71903720 000000067F000080000007000C00000BC211-000000067F000080000007000C00000C5941__0000008AF67FEC19-0000008BA6803FC9 000000067F000080000007000C00000BF157-000000067F000080000007001400000016B2__0000008BA6803FC9-0000008C2045B721 000000067F000080000007000C00000C0000-000000067F000080000007000C00000C4000__0000008B9669EDB0 000000067F000080000007000C00000C0000-000000067F000080000007000C00000C4000__0000008C71903720 000000067F000080000007000C00000C4000-000000067F000080000007000C00000C8000__0000008B9669EDB0 000000067F000080000007000C00000C4000-000000067F000080000007000C00000C8000__0000008C71903720 000000067F000080000007000C00000C5941-000000067F000080000007000C00000CF0A7__0000008AF67FEC19-0000008BA6803FC9 000000067F000080000007000C00000C8000-000000067F000080000007000C00000CC000__0000008B9669EDB0 
000000067F000080000007000C00000C8000-000000067F000080000007000C00000CC000__0000008C71903720 000000067F000080000007000C00000CC000-000000067F000080000007000C00000D0000__0000008B9669EDB0 000000067F000080000007000C00000CC000-000000067F000080000007000C00000D0000__0000008C71903720 000000067F000080000007000C00000CF0A7-000000067F000080000007000C00000D87BC__0000008AF67FEC19-0000008BA6803FC9 000000067F000080000007000C00000D0000-000000067F000080000007000C00000D4000__0000008B9669EDB0 000000067F000080000007000C00000D0000-000000067F000080000007000C00000D4000__0000008C71903720 000000067F000080000007000C00000D4000-000000067F000080000007000C00000D8000__0000008B9669EDB0 000000067F000080000007000C00000D4000-000000067F000080000007000C00000D8000__0000008C71903720 000000067F000080000007000C00000D8000-000000067F000080000007000C00000DC000__0000008B9669EDB0 000000067F000080000007000C00000D8000-000000067F000080000007000C00000DC000__0000008C71903720 000000067F000080000007000C00000D87BC-000000067F000080000007000C00000E1F0A__0000008AF67FEC19-0000008BA6803FC9 000000067F000080000007000C00000DC000-000000067F000080000007000C00000E0000__0000008B9669EDB0 000000067F000080000007000C00000DC000-000000067F000080000007000C00000E0000__0000008C71903720 000000067F000080000007000C00000E0000-000000067F000080000007000C00000E4000__0000008B9669EDB0 000000067F000080000007000C00000E0000-000000067F000080000007000C00000E4000__0000008C71903720 000000067F000080000007000C00000E1F0A-000000067F000080000007000C00000EB670__0000008AF67FEC19-0000008BA6803FC9 000000067F000080000007000C00000E4000-000000067F000080000007000C00000E8000__0000008B9669EDB0 000000067F000080000007000C00000E4000-000000067F000080000007000C00000E8000__0000008C71903720 000000067F000080000007000C00000E8000-000000067F000080000007000C00000EC000__0000008B9669EDB0 000000067F000080000007000C00000E8000-000000067F000080000007000C00000EC000__0000008C71903720 000000067F000080000007000C00000EB670-000000067F000080000007000C00000F4DA7__0000008AF67FEC19-0000008BA6803FC9 
000000067F000080000007000C00000EC000-000000067F000080000007000C00000F0000__0000008B9669EDB0 000000067F000080000007000C00000EC000-000000067F000080000007000C00000F0000__0000008C71903720 000000067F000080000007000C00000F0000-000000067F000080000007000C00000F4000__0000008B9669EDB0 000000067F000080000007000C00000F0000-000000067F000080000007000C00000F4000__0000008C71903720 000000067F000080000007000C00000F4000-000000067F000080000007000C00000F8000__0000008B9669EDB0 000000067F000080000007000C00000F4000-000000067F000080000007000C00000F8000__0000008C71903720 000000067F000080000007000C00000F4DA7-000000067F000080000007000C00000FE509__0000008AF67FEC19-0000008BA6803FC9 000000067F000080000007000C00000F8000-000000067F000080000007000C00000FC000__0000008B9669EDB0 000000067F000080000007000C00000F8000-000000067F000080000007000C00000FC000__0000008C71903720 000000067F000080000007000C00000FA175-000000067F00008000000700140000010412__0000008C2045B721-0000008C72843D41 000000067F000080000007000C00000FC000-000000067F000080000007000C0000100000__0000008B9669EDB0 000000067F000080000007000C00000FC000-000000067F000080000007000C0000100000__0000008C71903720 000000067F000080000007000C00000FE509-000000067F000080000007000C0000107C2B__0000008AF67FEC19-0000008BA6803FC9 000000067F000080000007000C0000100000-000000067F000080000007000C0000104000__0000008B9669EDB0 000000067F000080000007000C0000100000-000000067F000080000007000C0000104000__0000008C71903720 000000067F000080000007000C0000104000-000000067F000080000007000C0000108000__0000008B9669EDB0 000000067F000080000007000C0000104000-000000067F000080000007000C0000108000__0000008C71903720 000000067F000080000007000C0000107C2B-000000067F000080000007000C0000111385__0000008AF67FEC19-0000008BA6803FC9 000000067F000080000007000C0000108000-000000067F000080000007000C000010C000__0000008C71903720 000000067F000080000007000C0000108000-030000000000000000000000000000000002__0000008B9669EDB0 000000067F000080000007000C000010C000-000000067F000080000007000C0000110000__0000008C71903720 
000000067F000080000007000C0000110000-000000067F00008000000700120100000000__0000008C71903720 000000067F000080000007000C0000111385-01000000000000000100000003000000001E__0000008AF67FEC19-0000008BA6803FC9 000000067F00008000000700140000000000-000000067F00008000000700140000004000__0000008C71903720 000000067F000080000007001400000016B2-000000067F000080000007001400000082A6__0000008BA6803FC9-0000008C2045B721 000000067F00008000000700140000004000-000000067F00008000000700140000008000__0000008C71903720 000000067F00008000000700140000008000-000000067F0000800000070014000000C000__0000008C71903720 000000067F000080000007001400000082A6-000000067F0000800000070014000000EED0__0000008BA6803FC9-0000008C2045B721 000000067F0000800000070014000000C000-000000067F00008000000700140000010000__0000008C71903720 000000067F0000800000070014000000EED0-000000067F00008000000700140000015ADC__0000008BA6803FC9-0000008C2045B721 000000067F00008000000700140000010000-000000067F00008000000700140000014000__0000008C71903720 000000067F0000800000070014000001041E-000000067F000080000007001400000294B8__0000008C2045B721-0000008C72843D41 000000067F00008000000700140000014000-000000067F00008000000700140000018000__0000008C71903720 000000067F00008000000700140000015ADC-000000067F0000800000070014000001C6D6__0000008BA6803FC9-0000008C2045B721 000000067F00008000000700140000018000-000000067F0000800000070014000001C000__0000008C71903720 000000067F0000800000070014000001C000-000000067F00008000000700140000020000__0000008C71903720 000000067F0000800000070014000001C6D6-000000067F000080000007001400000232FD__0000008BA6803FC9-0000008C2045B721 000000067F00008000000700140000020000-000000067F00008000000700140000024000__0000008C71903720 000000067F000080000007001400000232FD-000000067F00008000000700140000029F07__0000008BA6803FC9-0000008C2045B721 000000067F00008000000700140000024000-000000067F00008000000700140000028000__0000008C71903720 000000067F00008000000700140000028000-000000067F0000800000070014000002C000__0000008C71903720 
000000067F000080000007001400000294BA-030000000000000000000000000000000002__0000008C2045B721-0000008C72843D41 000000067F00008000000700140000029F07-030000000000000000000000000000000002__0000008BA6803FC9-0000008C2045B721 000000067F0000800000070014000002C000-030000000000000000000000000000000002__0000008C71903720 000000067F000080000007200C0000000000-000000067F000080000007200C0000004000__0000008E43487FF0 000000067F000080000007200C0000004000-000000067F000080000007200C0000008000__0000008E43487FF0 000000067F000080000007200C0000008000-000000067F000080000007200C000000C000__0000008E43487FF0 000000067F000080000007200C000000933D-000000067F000080000007200C0000012AA3__0000008C72843D41-0000008CF2BFFC89 000000067F000080000007200C000000C000-000000067F000080000007200C0000010000__0000008E43487FF0 000000067F000080000007200C0000010000-000000067F000080000007200C0000014000__0000008E43487FF0 000000067F000080000007200C0000012AA3-000000067F000080000007200C000001C209__0000008C72843D41-0000008CF2BFFC89 000000067F000080000007200C0000014000-000000067F000080000007200C0000018000__0000008E43487FF0 000000067F000080000007200C0000018000-000000067F000080000007200C000001C000__0000008E43487FF0 000000067F000080000007200C000001C000-000000067F000080000007200C0000020000__0000008E43487FF0 000000067F000080000007200C000001C209-000000067F000080000007200C0000025939__0000008C72843D41-0000008CF2BFFC89 000000067F000080000007200C0000020000-000000067F000080000007200C0000024000__0000008E43487FF0 000000067F000080000007200C0000024000-000000067F000080000007200C0000028000__0000008E43487FF0 000000067F000080000007200C0000025939-000000067F000080000007200C000002F09F__0000008C72843D41-0000008CF2BFFC89 000000067F000080000007200C0000028000-000000067F000080000007200C000002C000__0000008E43487FF0 000000067F000080000007200C000002C000-000000067F000080000007200C0000030000__0000008E43487FF0 000000067F000080000007200C000002F09F-000000067F000080000007200C00000387B4__0000008C72843D41-0000008CF2BFFC89 
000000067F000080000007200C0000030000-000000067F000080000007200C0000034000__0000008E43487FF0 000000067F000080000007200C0000034000-000000067F000080000007200C0000038000__0000008E43487FF0 000000067F000080000007200C0000038000-000000067F000080000007200C000003C000__0000008E43487FF0 000000067F000080000007200C00000387B4-000000067F000080000007200C0000041F1A__0000008C72843D41-0000008CF2BFFC89 000000067F000080000007200C000003C000-000000067F000080000007200C0000040000__0000008E43487FF0 000000067F000080000007200C0000040000-000000067F000080000007200C0000044000__0000008E43487FF0 000000067F000080000007200C0000041F1A-000000067F000080000007200C000004B680__0000008C72843D41-0000008CF2BFFC89 000000067F000080000007200C0000044000-000000067F000080000007200C0000048000__0000008E43487FF0 000000067F000080000007200C0000048000-000000067F000080000007200C000004C000__0000008E3CDF59C0 000000067F000080000007200C0000048000-000000067F000080000007200C000004C000__0000008F10EA21C8 000000067F000080000007200C000004B680-030000000000000000000000000000000002__0000008C72843D41-0000008CF2BFFC89 000000067F000080000007200C000004BACE-000000067F000080000007200C0000055202__0000008CF2BFFC89-0000008DB277FA49 000000067F000080000007200C000004C000-000000067F000080000007200C0000050000__0000008E3CDF59C0 000000067F000080000007200C000004C000-000000067F000080000007200C0000050000__0000008F10EA21C8 000000067F000080000007200C0000050000-000000067F000080000007200C0000054000__0000008E3CDF59C0 000000067F000080000007200C0000050000-000000067F000080000007200C0000054000__0000008F10EA21C8 000000067F000080000007200C000005131D-000000067F000080000007200C00000A2138__0000008EBC4827C1-0000008F10E3E189 000000067F000080000007200C0000054000-000000067F000080000007200C0000058000__0000008E3CDF59C0 000000067F000080000007200C0000054000-000000067F000080000007200C0000058000__0000008F10EA21C8 000000067F000080000007200C0000055202-000000067F000080000007200C000005E90D__0000008CF2BFFC89-0000008DB277FA49 
000000067F000080000007200C0000058000-000000067F000080000007200C000005C000__0000008E3CDF59C0 000000067F000080000007200C0000058000-000000067F000080000007200C000005C000__0000008F10EA21C8 000000067F000080000007200C000005C000-000000067F000080000007200C0000060000__0000008E3CDF59C0 000000067F000080000007200C000005C000-000000067F000080000007200C0000060000__0000008F10EA21C8 000000067F000080000007200C000005E90D-000000067F000080000007200C000006802B__0000008CF2BFFC89-0000008DB277FA49 000000067F000080000007200C0000060000-000000067F000080000007200C0000064000__0000008E3CDF59C0 000000067F000080000007200C0000060000-000000067F000080000007200C0000064000__0000008F10EA21C8 000000067F000080000007200C0000064000-000000067F000080000007200C0000068000__0000008E3CDF59C0 000000067F000080000007200C0000064000-000000067F000080000007200C0000068000__0000008F10EA21C8 000000067F000080000007200C0000068000-000000067F000080000007200C000006C000__0000008E3CDF59C0 000000067F000080000007200C0000068000-000000067F000080000007200C000006C000__0000008F10EA21C8 000000067F000080000007200C000006802B-000000067F000080000007200C0000071782__0000008CF2BFFC89-0000008DB277FA49 000000067F000080000007200C000006C000-000000067F000080000007200C0000070000__0000008E3CDF59C0 000000067F000080000007200C000006C000-000000067F000080000007200C0000070000__0000008F10EA21C8 000000067F000080000007200C0000070000-000000067F000080000007200C0000074000__0000008E3CDF59C0 000000067F000080000007200C0000070000-000000067F000080000007200C0000074000__0000008F10EA21C8 000000067F000080000007200C0000071782-000000067F000080000007200C000007AEE8__0000008CF2BFFC89-0000008DB277FA49 000000067F000080000007200C0000074000-000000067F000080000007200C0000078000__0000008E3CDF59C0 000000067F000080000007200C0000074000-000000067F000080000007200C0000078000__0000008F10EA21C8 000000067F000080000007200C0000078000-000000067F000080000007200C000007C000__0000008E3CDF59C0 000000067F000080000007200C0000078000-000000067F000080000007200C000007C000__0000008F10EA21C8 
000000067F000080000007200C000007AEE8-000000067F000080000007200C000008460B__0000008CF2BFFC89-0000008DB277FA49 000000067F000080000007200C000007C000-000000067F000080000007200C0000080000__0000008E3CDF59C0 000000067F000080000007200C000007C000-000000067F000080000007200C0000080000__0000008F10EA21C8 000000067F000080000007200C0000080000-000000067F000080000007200C0000084000__0000008E3CDF59C0 000000067F000080000007200C0000080000-000000067F000080000007200C0000084000__0000008F10EA21C8 000000067F000080000007200C0000084000-000000067F000080000007200C0000088000__0000008E3CDF59C0 000000067F000080000007200C0000084000-000000067F000080000007200C0000088000__0000008F10EA21C8 000000067F000080000007200C000008460B-000000067F000080000007200C000008DD71__0000008CF2BFFC89-0000008DB277FA49 000000067F000080000007200C0000088000-000000067F000080000007200C000008C000__0000008E3CDF59C0 000000067F000080000007200C0000088000-000000067F000080000007200C000008C000__0000008F10EA21C8 000000067F000080000007200C000008C000-000000067F000080000007200C0000090000__0000008E3CDF59C0 000000067F000080000007200C000008C000-000000067F000080000007200C0000090000__0000008F10EA21C8 000000067F000080000007200C000008DD71-000000067F000080000007200C00000974D7__0000008CF2BFFC89-0000008DB277FA49 000000067F000080000007200C0000090000-000000067F000080000007200C0000094000__0000008E3CDF59C0 000000067F000080000007200C0000090000-000000067F000080000007200C0000094000__0000008F10EA21C8 000000067F000080000007200C0000094000-000000067F000080000007200C0000098000__0000008E3CDF59C0 000000067F000080000007200C0000094000-000000067F000080000007200C0000098000__0000008F10EA21C8 000000067F000080000007200C00000974D7-000000067F000080000007200C00000A0C0B__0000008CF2BFFC89-0000008DB277FA49 000000067F000080000007200C0000098000-000000067F000080000007200C000009C000__0000008E3CDF59C0 000000067F000080000007200C0000098000-000000067F000080000007200C000009C000__0000008F10EA21C8 000000067F000080000007200C000009C000-000000067F000080000007200C00000A0000__0000008E3CDF59C0 
000000067F000080000007200C000009C000-000000067F000080000007200C00000A0000__0000008F10EA21C8 000000067F000080000007200C00000A0000-000000067F000080000007200C00000A4000__0000008E3CDF59C0 000000067F000080000007200C00000A0000-000000067F000080000007200C00000A4000__0000008F10EA21C8 000000067F000080000007200C00000A0C0B-000000067F000080000007200C00000AA371__0000008CF2BFFC89-0000008DB277FA49 000000067F000080000007200C00000A2138-000000067F000080000007200C00000F342E__0000008EBC4827C1-0000008F10E3E189 000000067F000080000007200C00000A4000-000000067F000080000007200C00000A8000__0000008E3CDF59C0 000000067F000080000007200C00000A4000-000000067F000080000007200C00000A8000__0000008F10EA21C8 000000067F000080000007200C00000A8000-000000067F000080000007200C00000AC000__0000008E3CDF59C0 000000067F000080000007200C00000A8000-000000067F000080000007200C00000AC000__0000008F10EA21C8 000000067F000080000007200C00000AA371-000000067F000080000007200C00000B3AD7__0000008CF2BFFC89-0000008DB277FA49 000000067F000080000007200C00000AC000-000000067F000080000007200C00000B0000__0000008E3CDF59C0 000000067F000080000007200C00000AC000-000000067F000080000007200C00000B0000__0000008F10EA21C8 000000067F000080000007200C00000B0000-000000067F000080000007200C00000B4000__0000008E3CDF59C0 000000067F000080000007200C00000B0000-000000067F000080000007200C00000B4000__0000008F10EA21C8 000000067F000080000007200C00000B3AD7-000000067F000080000007200C00000BD20B__0000008CF2BFFC89-0000008DB277FA49 000000067F000080000007200C00000B4000-000000067F000080000007200C00000B8000__0000008E3CDF59C0 000000067F000080000007200C00000B4000-000000067F000080000007200C00000B8000__0000008F10EA21C8 000000067F000080000007200C00000B8000-000000067F000080000007200C00000BC000__0000008E3CDF59C0 000000067F000080000007200C00000B8000-000000067F000080000007200C00000BC000__0000008F10EA21C8 000000067F000080000007200C00000BA086-000000067F00008000000720140000001101__0000008E42A19FD1-0000008EBC4827C1 
000000067F000080000007200C00000BC000-000000067F000080000007200C00000C0000__0000008E3CDF59C0 000000067F000080000007200C00000BC000-000000067F000080000007200C00000C0000__0000008F10EA21C8 000000067F000080000007200C00000BD20B-000000067F000080000007200C0100000000__0000008CF2BFFC89-0000008DB277FA49 000000067F000080000007200C00000C0000-000000067F000080000007200C00000C4000__0000008E3CDF59C0 000000067F000080000007200C00000C0000-000000067F000080000007200C00000C4000__0000008F10EA21C8 000000067F000080000007200C00000C4000-000000067F000080000007200C00000C8000__0000008E3CDF59C0 000000067F000080000007200C00000C4000-000000067F000080000007200C00000C8000__0000008F10EA21C8 000000067F000080000007200C00000C58B0-000000067F000080000007200C00000CF00A__0000008DB277FA49-0000008E42A19FD1 000000067F000080000007200C00000C8000-000000067F000080000007200C00000CC000__0000008E3CDF59C0 000000067F000080000007200C00000C8000-000000067F000080000007200C00000CC000__0000008F10EA21C8 000000067F000080000007200C00000CC000-000000067F000080000007200C00000D0000__0000008E3CDF59C0 000000067F000080000007200C00000CC000-000000067F000080000007200C00000D0000__0000008F10EA21C8 000000067F000080000007200C00000CF00A-000000067F000080000007200C00000D871F__0000008DB277FA49-0000008E42A19FD1 000000067F000080000007200C00000D0000-000000067F000080000007200C00000D4000__0000008E3CDF59C0 000000067F000080000007200C00000D0000-000000067F000080000007200C00000D4000__0000008F10EA21C8 000000067F000080000007200C00000D4000-000000067F000080000007200C00000D8000__0000008E3CDF59C0 000000067F000080000007200C00000D4000-000000067F000080000007200C00000D8000__0000008F10EA21C8 000000067F000080000007200C00000D8000-000000067F000080000007200C00000DC000__0000008E3CDF59C0 000000067F000080000007200C00000D8000-000000067F000080000007200C00000DC000__0000008F10EA21C8 000000067F000080000007200C00000D871F-000000067F000080000007200C00000E1E85__0000008DB277FA49-0000008E42A19FD1 000000067F000080000007200C00000DC000-000000067F000080000007200C00000E0000__0000008E3CDF59C0 
000000067F000080000007200C00000DC000-000000067F000080000007200C00000E0000__0000008F10EA21C8 000000067F000080000007200C00000E0000-000000067F000080000007200C00000E4000__0000008E3CDF59C0 000000067F000080000007200C00000E0000-000000067F000080000007200C00000E4000__0000008F10EA21C8 000000067F000080000007200C00000E1E85-000000067F000080000007200C00000EB5EB__0000008DB277FA49-0000008E42A19FD1 000000067F000080000007200C00000E4000-000000067F000080000007200C00000E8000__0000008E3CDF59C0 000000067F000080000007200C00000E4000-000000067F000080000007200C00000E8000__0000008F10EA21C8 000000067F000080000007200C00000E8000-000000067F000080000007200C00000EC000__0000008E3CDF59C0 000000067F000080000007200C00000E8000-000000067F000080000007200C00000EC000__0000008F10EA21C8 000000067F000080000007200C00000EB5EB-000000067F000080000007200C00000F4D0C__0000008DB277FA49-0000008E42A19FD1 000000067F000080000007200C00000EC000-000000067F000080000007200C00000F0000__0000008E3CDF59C0 000000067F000080000007200C00000EC000-000000067F000080000007200C00000F0000__0000008F10EA21C8 000000067F000080000007200C00000F0000-000000067F000080000007200C00000F4000__0000008E3CDF59C0 000000067F000080000007200C00000F0000-000000067F000080000007200C00000F4000__0000008F10EA21C8 000000067F000080000007200C00000F342F-000000067F0000800000072014000000D54C__0000008EBC4827C1-0000008F10E3E189 000000067F000080000007200C00000F4000-000000067F000080000007200C00000F8000__0000008E3CDF59C0 000000067F000080000007200C00000F4000-000000067F000080000007200C00000F8000__0000008F10EA21C8 000000067F000080000007200C00000F4D0C-000000067F000080000007200C00000FE472__0000008DB277FA49-0000008E42A19FD1 000000067F000080000007200C00000F8000-000000067F000080000007200C00000FC000__0000008E3CDF59C0 000000067F000080000007200C00000F8000-000000067F000080000007200C00000FC000__0000008F10EA21C8 000000067F000080000007200C00000FC000-000000067F000080000007200C0000100000__0000008E3CDF59C0 000000067F000080000007200C00000FC000-000000067F000080000007200C0000100000__0000008F10EA21C8 
000000067F000080000007200C00000FE472-000000067F000080000007200C0000107B8E__0000008DB277FA49-0000008E42A19FD1 000000067F000080000007200C0000100000-000000067F000080000007200C0000104000__0000008E3CDF59C0 000000067F000080000007200C0000100000-000000067F000080000007200C0000104000__0000008F10EA21C8 000000067F000080000007200C0000104000-000000067F000080000007200C0000108000__0000008E3CDF59C0 000000067F000080000007200C0000104000-000000067F000080000007200C0000108000__0000008F10EA21C8 000000067F000080000007200C0000107B8E-000000067F000080000007200C00001112F4__0000008DB277FA49-0000008E42A19FD1 000000067F000080000007200C0000108000-000000067F000080000007200C000010C000__0000008E3CDF59C0 000000067F000080000007200C0000108000-000000067F000080000007200C000010C000__0000008F10EA21C8 000000067F000080000007200C000010C000-000000067F000080000007200C0000110000__0000008F10EA21C8 000000067F000080000007200C000010C000-030000000000000000000000000000000002__0000008E3CDF59C0 000000067F000080000007200C0000110000-000000067F00008000000720120100000000__0000008F10EA21C8 000000067F000080000007200C00001112F4-010000000000000001000000040000000001__0000008DB277FA49-0000008E42A19FD1 000000067F00008000000720140000000000-000000067F00008000000720140000004000__0000008F10EA21C8 000000067F00008000000720140000001101-000000067F00008000000720140000007E82__0000008E42A19FD1-0000008EBC4827C1 000000067F00008000000720140000004000-000000067F00008000000720140000008000__0000008F10EA21C8 000000067F00008000000720140000007E82-000000067F0000800000072014000000EB9D__0000008E42A19FD1-0000008EBC4827C1 000000067F00008000000720140000008000-000000067F0000800000072014000000C000__0000008F10EA21C8 000000067F0000800000072014000000C000-000000067F00008000000720140000010000__0000008F10EA21C8 000000067F0000800000072014000000D54D-000000067F00008000000720140000025E6D__0000008EBC4827C1-0000008F10E3E189 000000067F0000800000072014000000EB9D-000000067F00008000000720140000015866__0000008E42A19FD1-0000008EBC4827C1 
000000067F00008000000720140000010000-000000067F00008000000720140000014000__0000008F10EA21C8 000000067F00008000000720140000014000-000000067F00008000000720140000018000__0000008F10EA21C8 000000067F00008000000720140000015866-000000067F0000800000072014000001C591__0000008E42A19FD1-0000008EBC4827C1 000000067F00008000000720140000018000-000000067F0000800000072014000001C000__0000008F10EA21C8 000000067F0000800000072014000001C000-000000067F00008000000720140000020000__0000008F10EA21C8 000000067F0000800000072014000001C591-000000067F0000800000072014000002326E__0000008E42A19FD1-0000008EBC4827C1 000000067F00008000000720140000020000-000000067F00008000000720140000024000__0000008F10EA21C8 000000067F0000800000072014000002326E-000000067F00008000000720140000029F59__0000008E42A19FD1-0000008EBC4827C1 000000067F00008000000720140000024000-000000067F00008000000720140000028000__0000008F10EA21C8 000000067F00008000000720140000025E75-030000000000000000000000000000000002__0000008EBC4827C1-0000008F10E3E189 000000067F00008000000720140000028000-000000067F0000800000072014000002C000__0000008F10EA21C8 000000067F00008000000720140000029F59-030000000000000000000000000000000002__0000008E42A19FD1-0000008EBC4827C1 000000067F0000800000072014000002C000-030000000000000000000000000000000002__0000008F10EA21C8 000000067F000080000007400C0000000000-000000067F000080000007400C0000004000__00000091A67E3E18 000000067F000080000007400C0000004000-000000067F000080000007400C0000008000__00000091A67E3E18 000000067F000080000007400C0000008000-000000067F000080000007400C000000C000__00000091A67E3E18 000000067F000080000007400C00000090E9-000000067F000080000007400C000001280C__0000008F10E3E189-0000008F915DE591 000000067F000080000007400C000000C000-000000067F000080000007400C0000010000__00000091A67E3E18 000000067F000080000007400C0000010000-000000067F000080000007400C0000014000__00000091A67E3E18 000000067F000080000007400C000001280C-000000067F000080000007400C000001BF72__0000008F10E3E189-0000008F915DE591 
000000067F000080000007400C0000014000-000000067F000080000007400C0000018000__00000091A67E3E18 000000067F000080000007400C0000018000-000000067F000080000007400C000001C000__00000091A67E3E18 000000067F000080000007400C000001BF72-000000067F000080000007400C00000256D8__0000008F10E3E189-0000008F915DE591 000000067F000080000007400C000001C000-000000067F000080000007400C0000020000__00000091A67E3E18 000000067F000080000007400C0000020000-000000067F000080000007400C0000024000__00000091A67E3E18 000000067F000080000007400C0000024000-000000067F000080000007400C0000028000__00000091A67E3E18 000000067F000080000007400C00000256D8-000000067F000080000007400C000002EE0B__0000008F10E3E189-0000008F915DE591 000000067F000080000007400C0000028000-000000067F000080000007400C000002C000__00000091A67E3E18 000000067F000080000007400C000002C000-000000067F000080000007400C0000030000__00000091A67E3E18 000000067F000080000007400C000002EE0B-000000067F000080000007400C0000038521__0000008F10E3E189-0000008F915DE591 000000067F000080000007400C0000030000-000000067F000080000007400C0000034000__00000091A67E3E18 000000067F000080000007400C0000034000-000000067F000080000007400C0000038000__00000091A67E3E18 000000067F000080000007400C0000038000-000000067F000080000007400C000003C000__00000091A67E3E18 000000067F000080000007400C0000038521-000000067F000080000007400C0000041C87__0000008F10E3E189-0000008F915DE591 000000067F000080000007400C000003C000-000000067F000080000007400C0000040000__00000091A67E3E18 000000067F000080000007400C0000040000-000000067F000080000007400C0000044000__00000091A67E3E18 000000067F000080000007400C0000041C87-000000067F000080000007400C000004B3ED__0000008F10E3E189-0000008F915DE591 000000067F000080000007400C0000044000-000000067F000080000007400C0000048000__00000091A67E3E18 000000067F000080000007400C0000048000-000000067F000080000007400C000004C000__000000914B20A810 000000067F000080000007400C000004B3ED-030000000000000000000000000000000002__0000008F10E3E189-0000008F915DE591 
000000067F000080000007400C000004BAC9-000000067F000080000007400C00000551FE__0000008F915DE591-000000903121F569 000000067F000080000007400C000004C000-000000067F000080000007400C0000050000__000000914B20A810 000000067F000080000007400C000004DF0B-000000067F000080000007400C000009B41F__000000914B2393B1-00000091A6DD7A79 000000067F000080000007400C0000050000-000000067F000080000007400C0000054000__000000914B20A810 000000067F000080000007400C0000054000-000000067F000080000007400C0000058000__000000914B20A810 000000067F000080000007400C00000551FE-000000067F000080000007400C000005E90C__0000008F915DE591-000000903121F569 000000067F000080000007400C0000058000-000000067F000080000007400C000005C000__000000914B20A810 000000067F000080000007400C000005C000-000000067F000080000007400C0000060000__000000914B20A810 000000067F000080000007400C000005E90C-000000067F000080000007400C000006802C__0000008F915DE591-000000903121F569 000000067F000080000007400C0000060000-000000067F000080000007400C0000064000__000000914B20A810 000000067F000080000007400C0000064000-000000067F000080000007400C0000068000__000000914B20A810 000000067F000080000007400C0000068000-000000067F000080000007400C000006C000__000000914B20A810 000000067F000080000007400C000006802C-000000067F000080000007400C0000071783__0000008F915DE591-000000903121F569 000000067F000080000007400C000006C000-000000067F000080000007400C0000070000__000000914B20A810 000000067F000080000007400C0000070000-000000067F000080000007400C0000074000__000000914B20A810 000000067F000080000007400C0000071783-000000067F000080000007400C000007AEE9__0000008F915DE591-000000903121F569 000000067F000080000007400C0000074000-000000067F000080000007400C0000078000__000000914B20A810 000000067F000080000007400C0000078000-000000067F000080000007400C000007C000__000000914B20A810 000000067F000080000007400C000007AEE9-000000067F000080000007400C000008460B__0000008F915DE591-000000903121F569 000000067F000080000007400C000007C000-000000067F000080000007400C0000080000__000000914B20A810 
000000067F000080000007400C0000080000-000000067F000080000007400C0000084000__000000914B20A810 000000067F000080000007400C0000084000-000000067F000080000007400C0000088000__000000914B20A810 000000067F000080000007400C000008460B-000000067F000080000007400C000008DD71__0000008F915DE591-000000903121F569 000000067F000080000007400C0000088000-000000067F000080000007400C000008C000__000000914B20A810 000000067F000080000007400C000008C000-000000067F000080000007400C0000090000__000000914B20A810 000000067F000080000007400C000008DD71-000000067F000080000007400C00000974D7__0000008F915DE591-000000903121F569 000000067F000080000007400C0000090000-000000067F000080000007400C0000094000__000000914B20A810 000000067F000080000007400C0000094000-000000067F000080000007400C0000098000__000000914B20A810 000000067F000080000007400C00000974D7-000000067F000080000007400C00000A0C0B__0000008F915DE591-000000903121F569 000000067F000080000007400C0000098000-000000067F000080000007400C000009C000__000000914B20A810 000000067F000080000007400C000009B420-000000067F000080000007400C00000E830A__000000914B2393B1-00000091A6DD7A79 000000067F000080000007400C000009C000-000000067F000080000007400C00000A0000__000000914B20A810 000000067F000080000007400C00000A0000-000000067F000080000007400C00000A4000__000000914B20A810 000000067F000080000007400C00000A0C0B-000000067F000080000007400C00000AA371__0000008F915DE591-000000903121F569 000000067F000080000007400C00000A4000-000000067F000080000007400C00000A8000__000000914B20A810 000000067F000080000007400C00000A8000-000000067F000080000007400C00000AC000__00000090DFD64240 000000067F000080000007400C00000AA371-000000067F000080000007400C0100000000__0000008F915DE591-000000903121F569 000000067F000080000007400C00000AA4EC-000000067F000080000007400C00000B3C0C__000000903121F569-00000090D0E5EA29 000000067F000080000007400C00000AC000-000000067F000080000007400C00000B0000__00000090DFD64240 000000067F000080000007400C00000B0000-000000067F000080000007400C00000B4000__00000090DFD64240 
000000067F000080000007400C00000B3C0C-000000067F000080000007400C00000BD372__000000903121F569-00000090D0E5EA29 000000067F000080000007400C00000B4000-000000067F000080000007400C00000B8000__00000090DFD64240 000000067F000080000007400C00000B8000-000000067F000080000007400C00000BC000__00000090DFD64240 000000067F000080000007400C00000BC000-000000067F000080000007400C00000C0000__00000090DFD64240 000000067F000080000007400C00000BD372-000000067F000080000007400C00000C6AD8__000000903121F569-00000090D0E5EA29 000000067F000080000007400C00000C0000-000000067F000080000007400C00000C4000__00000090DFD64240 000000067F000080000007400C00000C4000-000000067F000080000007400C00000C8000__00000090DFD64240 000000067F000080000007400C00000C6AD8-000000067F000080000007400C00000D020B__000000903121F569-00000090D0E5EA29 000000067F000080000007400C00000C8000-000000067F000080000007400C00000CC000__00000090DFD64240 000000067F000080000007400C00000CC000-000000067F000080000007400C00000D0000__00000090DFD64240 000000067F000080000007400C00000D0000-000000067F000080000007400C00000D4000__00000090DFD64240 000000067F000080000007400C00000D020B-000000067F000080000007400C00000D9971__000000903121F569-00000090D0E5EA29 000000067F000080000007400C00000D4000-000000067F000080000007400C00000D8000__00000090DFD64240 000000067F000080000007400C00000D8000-000000067F000080000007400C00000DC000__00000090DFD64240 000000067F000080000007400C00000D9971-000000067F000080000007400C00000E30D7__000000903121F569-00000090D0E5EA29 000000067F000080000007400C00000DC000-000000067F000080000007400C00000E0000__00000090DFD64240 000000067F000080000007400C00000E0000-000000067F000080000007400C00000E4000__00000090DFD64240 000000067F000080000007400C00000E30D7-000000067F000080000007400C00000EC80B__000000903121F569-00000090D0E5EA29 000000067F000080000007400C00000E4000-000000067F000080000007400C00000E8000__00000090DFD64240 000000067F000080000007400C00000E8000-000000067F000080000007400C00000EC000__00000090DFD64240 
000000067F000080000007400C00000E8314-000000067F00008000000740140000008178__000000914B2393B1-00000091A6DD7A79 000000067F000080000007400C00000EC000-000000067F000080000007400C00000F0000__00000090DFD64240 000000067F000080000007400C00000EC80B-000000067F000080000007400C00000F5F38__000000903121F569-00000090D0E5EA29 000000067F000080000007400C00000F0000-000000067F000080000007400C00000F4000__00000090DFD64240 000000067F000080000007400C00000F4000-000000067F000080000007400C00000F8000__00000090DFD64240 000000067F000080000007400C00000F5F38-000000067F000080000007400C00000FF69E__000000903121F569-00000090D0E5EA29 000000067F000080000007400C00000F8000-000000067F000080000007400C00000FC000__00000090DFD64240 000000067F000080000007400C00000FC000-000000067F000080000007400C0000100000__00000090DFD64240 000000067F000080000007400C00000FCCA8-000000067F000080000007400C00001119BA__00000090D0E5EA29-000000914B2393B1 000000067F000080000007400C00000FF69E-000000067F000080000007400C0000108DAF__000000903121F569-00000090D0E5EA29 000000067F000080000007400C0000100000-000000067F000080000007400C0000104000__00000090DFD64240 000000067F000080000007400C0000104000-000000067F000080000007400C0000108000__00000090DFD64240 000000067F000080000007400C0000108000-000000067F000080000007400C000010C000__00000090DFD64240 000000067F000080000007400C0000108DAF-000000067F000080000007400C0100000000__000000903121F569-00000090D0E5EA29 000000067F000080000007400C000010C000-000000067F000080000007400C0000110000__00000090DFD64240 000000067F000080000007400C0000110000-030000000000000000000000000000000002__00000090DFD64240 000000067F000080000007400C00001119BA-000000067F00008000000740140000004326__00000090D0E5EA29-000000914B2393B1 000000067F00008000000740140000004326-000000067F0000800000074014000000B7EE__00000090D0E5EA29-000000914B2393B1 000000067F00008000000740140000008179-000000067F0000800000074014000001D4B7__000000914B2393B1-00000091A6DD7A79 
000000067F0000800000074014000000B7EE-000000067F00008000000740140000012CCD__00000090D0E5EA29-000000914B2393B1 000000067F00008000000740140000012CCD-000000067F0000800000074014000001A16B__00000090D0E5EA29-000000914B2393B1 000000067F0000800000074014000001A16B-000000067F000080000007401400000215C9__00000090D0E5EA29-000000914B2393B1 000000067F0000800000074014000001D4BA-030000000000000000000000000000000002__000000914B2393B1-00000091A6DD7A79 000000067F000080000007401400000215C9-000000067F00008000000740140000028A4A__00000090D0E5EA29-000000914B2393B1 000000067F00008000000740140000028A4A-030000000000000000000000000000000002__00000090D0E5EA29-000000914B2393B1 000000067F000080000007600C0000000000-000000067F000080000007600C0000004000__00000092CA5E4EA8 000000067F000080000007600C0000000000-000000067F000080000007600C0000004000__0000009445A06DC8 000000067F000080000007600C0000004000-000000067F000080000007600C0000008000__00000092CA5E4EA8 000000067F000080000007600C0000004000-000000067F000080000007600C0000008000__0000009445A06DC8 000000067F000080000007600C0000008000-000000067F000080000007600C000000C000__00000092CA5E4EA8 000000067F000080000007600C0000008000-000000067F000080000007600C000000C000__0000009445A06DC8 000000067F000080000007600C0000008180-000000067F000080000007600C00000118E6__00000091A6DD7A79-0000009228F7FA79 000000067F000080000007600C000000C000-000000067F000080000007600C0000010000__00000092CA5E4EA8 000000067F000080000007600C000000C000-000000067F000080000007600C0000010000__0000009445A06DC8 000000067F000080000007600C0000010000-000000067F000080000007600C0000014000__00000092CA5E4EA8 000000067F000080000007600C0000010000-000000067F000080000007600C0000014000__0000009445A06DC8 000000067F000080000007600C00000118E6-000000067F000080000007600C000001B00A__00000091A6DD7A79-0000009228F7FA79 000000067F000080000007600C0000014000-000000067F000080000007600C0000018000__00000092CA5E4EA8 000000067F000080000007600C0000014000-000000067F000080000007600C0000018000__0000009445A06DC8 
000000067F000080000007600C0000018000-000000067F000080000007600C000001C000__00000092CA5E4EA8 000000067F000080000007600C0000018000-000000067F000080000007600C000001C000__0000009445A06DC8 000000067F000080000007600C000001B00A-000000067F000080000007600C0000024745__00000091A6DD7A79-0000009228F7FA79 000000067F000080000007600C000001C000-000000067F000080000007600C0000020000__00000092CA5E4EA8 000000067F000080000007600C000001C000-000000067F000080000007600C0000020000__0000009445A06DC8 000000067F000080000007600C0000020000-000000067F000080000007600C0000024000__00000092CA5E4EA8 000000067F000080000007600C0000020000-000000067F000080000007600C0000024000__0000009445A06DC8 000000067F000080000007600C0000024000-000000067F000080000007600C0000028000__00000092CA5E4EA8 000000067F000080000007600C0000024000-000000067F000080000007600C0000028000__0000009445A06DC8 000000067F000080000007600C0000024745-000000067F000080000007600C000002DEAB__00000091A6DD7A79-0000009228F7FA79 000000067F000080000007600C0000028000-000000067F000080000007600C000002C000__00000092CA5E4EA8 000000067F000080000007600C0000028000-000000067F000080000007600C000002C000__0000009445A06DC8 000000067F000080000007600C000002C000-000000067F000080000007600C0000030000__00000092CA5E4EA8 000000067F000080000007600C000002C000-000000067F000080000007600C0000030000__0000009445A06DC8 000000067F000080000007600C000002DEAB-000000067F000080000007600C00000375CB__00000091A6DD7A79-0000009228F7FA79 000000067F000080000007600C0000030000-000000067F000080000007600C0000034000__00000092CA5E4EA8 000000067F000080000007600C0000030000-000000067F000080000007600C0000034000__0000009445A06DC8 000000067F000080000007600C0000034000-000000067F000080000007600C0000038000__00000092CA5E4EA8 000000067F000080000007600C0000034000-000000067F000080000007600C0000038000__0000009445A06DC8 000000067F000080000007600C00000375CB-000000067F000080000007600C0000040D0B__00000091A6DD7A79-0000009228F7FA79 000000067F000080000007600C0000038000-000000067F000080000007600C000003C000__00000092CA5E4EA8 
000000067F000080000007600C0000038000-000000067F000080000007600C000003C000__0000009445A06DC8 000000067F000080000007600C000003C000-000000067F000080000007600C0000040000__00000092CA5E4EA8 000000067F000080000007600C000003C000-000000067F000080000007600C0000040000__0000009445A06DC8 000000067F000080000007600C0000040000-000000067F000080000007600C0000044000__00000092CA5E4EA8 000000067F000080000007600C0000040000-000000067F000080000007600C0000044000__0000009445A06DC8 000000067F000080000007600C0000040D0B-000000067F000080000007600C000004A471__00000091A6DD7A79-0000009228F7FA79 000000067F000080000007600C0000044000-000000067F000080000007600C0000048000__00000092CA5E4EA8 000000067F000080000007600C0000044000-000000067F000080000007600C0000048000__0000009445A06DC8 000000067F000080000007600C0000048000-000000067F000080000007600C000004C000__00000092CA5E4EA8 000000067F000080000007600C0000048000-000000067F000080000007600C000004C000__0000009445A06DC8 000000067F000080000007600C000004A471-030000000000000000000000000000000002__00000091A6DD7A79-0000009228F7FA79 000000067F000080000007600C000004C000-000000067F000080000007600C0000050000__00000092CA5E4EA8 000000067F000080000007600C000004C000-000000067F000080000007600C0000050000__0000009445A06DC8 000000067F000080000007600C0000050000-000000067F000080000007600C0000054000__00000092CA5E4EA8 000000067F000080000007600C0000050000-000000067F000080000007600C0000054000__0000009445A06DC8 000000067F000080000007600C0000054000-000000067F000080000007600C0000058000__00000092CA5E4EA8 000000067F000080000007600C0000054000-000000067F000080000007600C0000058000__0000009445A06DC8 000000067F000080000007600C00000544BA-000000067F000080000007600C000005DC0A__0000009228F7FA79-00000093786F8001 000000067F000080000007600C0000058000-000000067F000080000007600C000005C000__00000092CA5E4EA8 000000067F000080000007600C0000058000-000000067F000080000007600C000005C000__0000009445A06DC8 000000067F000080000007600C000005C000-000000067F000080000007600C0000060000__00000092CA5E4EA8 
000000067F000080000007600C000005C000-000000067F000080000007600C0000060000__0000009445A06DC8 000000067F000080000007600C000005DC0A-000000067F000080000007600C000006732B__0000009228F7FA79-00000093786F8001 000000067F000080000007600C0000060000-000000067F000080000007600C0000064000__00000092CA5E4EA8 000000067F000080000007600C0000060000-000000067F000080000007600C0000064000__0000009445A06DC8 000000067F000080000007600C0000061031-000000067F000080000007600C00000C1159__0000009402435A49-0000009446B52FD1 000000067F000080000007600C0000064000-000000067F000080000007600C0000068000__00000092CA5E4EA8 000000067F000080000007600C0000064000-000000067F000080000007600C0000068000__0000009445A06DC8 000000067F000080000007600C000006732B-000000067F000080000007600C0000070A91__0000009228F7FA79-00000093786F8001 000000067F000080000007600C0000068000-000000067F000080000007600C000006C000__00000092CA5E4EA8 000000067F000080000007600C0000068000-000000067F000080000007600C000006C000__0000009445A06DC8 000000067F000080000007600C000006C000-000000067F000080000007600C0000070000__00000092CA5E4EA8 000000067F000080000007600C000006C000-000000067F000080000007600C0000070000__0000009445A06DC8 000000067F000080000007600C0000070000-000000067F000080000007600C0000074000__00000092CA5E4EA8 000000067F000080000007600C0000070000-000000067F000080000007600C0000074000__0000009445A06DC8 000000067F000080000007600C0000070A91-000000067F000080000007600C000007A1F7__0000009228F7FA79-00000093786F8001 000000067F000080000007600C0000074000-000000067F000080000007600C0000078000__00000092CA5E4EA8 000000067F000080000007600C0000074000-000000067F000080000007600C0000078000__0000009445A06DC8 000000067F000080000007600C0000078000-000000067F000080000007600C000007C000__00000092CA5E4EA8 000000067F000080000007600C0000078000-000000067F000080000007600C000007C000__0000009445A06DC8 000000067F000080000007600C000007A1F7-000000067F000080000007600C000008390C__0000009228F7FA79-00000093786F8001 
000000067F000080000007600C000007C000-000000067F000080000007600C0000080000__00000092CA5E4EA8 000000067F000080000007600C000007C000-000000067F000080000007600C0000080000__0000009445A06DC8 000000067F000080000007600C0000080000-000000067F000080000007600C0000084000__00000092CA5E4EA8 000000067F000080000007600C0000080000-000000067F000080000007600C0000084000__0000009445A06DC8 000000067F000080000007600C000008390C-000000067F000080000007600C000008D072__0000009228F7FA79-00000093786F8001 000000067F000080000007600C0000084000-000000067F000080000007600C0000088000__00000092CA5E4EA8 000000067F000080000007600C0000084000-000000067F000080000007600C0000088000__0000009445A06DC8 000000067F000080000007600C0000088000-000000067F000080000007600C000008C000__00000092CA5E4EA8 000000067F000080000007600C0000088000-000000067F000080000007600C000008C000__0000009445A06DC8 000000067F000080000007600C000008C000-000000067F000080000007600C0000090000__00000092CA5E4EA8 000000067F000080000007600C000008C000-000000067F000080000007600C0000090000__0000009445A06DC8 000000067F000080000007600C000008C52F-000000067F000080000007600C000010B57A__00000093786F8001-0000009402435A49 000000067F000080000007600C000008D072-000000067F000080000007600C000009679A__0000009228F7FA79-00000093786F8001 000000067F000080000007600C0000090000-000000067F000080000007600C0000094000__00000092CA5E4EA8 000000067F000080000007600C0000090000-000000067F000080000007600C0000094000__0000009445A06DC8 000000067F000080000007600C0000094000-000000067F000080000007600C0000098000__00000092CA5E4EA8 000000067F000080000007600C0000094000-000000067F000080000007600C0000098000__0000009445A06DC8 000000067F000080000007600C000009679A-000000067F000080000007600C000009FF00__0000009228F7FA79-00000093786F8001 000000067F000080000007600C0000098000-000000067F000080000007600C000009C000__00000092CA5E4EA8 000000067F000080000007600C0000098000-000000067F000080000007600C000009C000__0000009445A06DC8 000000067F000080000007600C000009C000-000000067F000080000007600C00000A0000__00000092CA5E4EA8 
000000067F000080000007600C000009C000-000000067F000080000007600C00000A0000__0000009445A06DC8 000000067F000080000007600C000009FF00-000000067F000080000007600C00000A960B__0000009228F7FA79-00000093786F8001 000000067F000080000007600C00000A0000-000000067F000080000007600C00000A4000__00000092CA5E4EA8 000000067F000080000007600C00000A0000-000000067F000080000007600C00000A4000__0000009445A06DC8 000000067F000080000007600C00000A4000-000000067F000080000007600C00000A8000__00000092CA5E4EA8 000000067F000080000007600C00000A4000-000000067F000080000007600C00000A8000__0000009445A06DC8 000000067F000080000007600C00000A8000-000000067F000080000007600C00000AC000__0000009445A06DC8 000000067F000080000007600C00000A8000-030000000000000000000000000000000002__00000092CA5E4EA8 000000067F000080000007600C00000A960B-000000067F000080000007600C00000B2D55__0000009228F7FA79-00000093786F8001 000000067F000080000007600C00000AC000-000000067F000080000007600C00000B0000__0000009445A06DC8 000000067F000080000007600C00000B0000-000000067F000080000007600C00000B4000__0000009445A06DC8 000000067F000080000007600C00000B2D55-000000067F000080000007600C00000BC4BB__0000009228F7FA79-00000093786F8001 000000067F000080000007600C00000B4000-000000067F000080000007600C00000B8000__0000009445A06DC8 000000067F000080000007600C00000B8000-000000067F000080000007600C00000BC000__0000009445A06DC8 000000067F000080000007600C00000BC000-000000067F000080000007600C00000C0000__0000009445A06DC8 000000067F000080000007600C00000BC4BB-000000067F000080000007600C00000C5BEA__0000009228F7FA79-00000093786F8001 000000067F000080000007600C00000C0000-000000067F000080000007600C00000C4000__0000009445A06DC8 000000067F000080000007600C00000C115D-000000067F0000800000076014000000333A__0000009402435A49-0000009446B52FD1 000000067F000080000007600C00000C4000-000000067F000080000007600C00000C8000__0000009445A06DC8 000000067F000080000007600C00000C5BEA-000000067F000080000007600C00000CF30B__0000009228F7FA79-00000093786F8001 
000000067F000080000007600C00000C8000-000000067F000080000007600C00000CC000__0000009445A06DC8 000000067F000080000007600C00000CC000-000000067F000080000007600C00000D0000__0000009445A06DC8 000000067F000080000007600C00000CF30B-000000067F000080000007600C00000D8A2B__0000009228F7FA79-00000093786F8001 000000067F000080000007600C00000D0000-000000067F000080000007600C00000D4000__0000009445A06DC8 000000067F000080000007600C00000D4000-000000067F000080000007600C00000D8000__0000009445A06DC8 000000067F000080000007600C00000D8000-000000067F000080000007600C00000DC000__0000009445A06DC8 000000067F000080000007600C00000D8A2B-000000067F000080000007600C00000E217C__0000009228F7FA79-00000093786F8001 000000067F000080000007600C00000DC000-000000067F000080000007600C00000E0000__0000009445A06DC8 000000067F000080000007600C00000E0000-000000067F000080000007600C00000E4000__0000009445A06DC8 000000067F000080000007600C00000E217C-000000067F000080000007600C00000EB8E2__0000009228F7FA79-00000093786F8001 000000067F000080000007600C00000E4000-000000067F000080000007600C00000E8000__0000009445A06DC8 000000067F000080000007600C00000E8000-000000067F000080000007600C00000EC000__0000009445A06DC8 000000067F000080000007600C00000EB8E2-000000067F000080000007600C00000F500B__0000009228F7FA79-00000093786F8001 000000067F000080000007600C00000EC000-000000067F000080000007600C00000F0000__0000009445A06DC8 000000067F000080000007600C00000F0000-000000067F000080000007600C00000F4000__0000009445A06DC8 000000067F000080000007600C00000F4000-000000067F000080000007600C00000F8000__0000009445A06DC8 000000067F000080000007600C00000F500B-000000067F000080000007600C00000FE771__0000009228F7FA79-00000093786F8001 000000067F000080000007600C00000F8000-000000067F000080000007600C00000FC000__0000009445A06DC8 000000067F000080000007600C00000FC000-000000067F000080000007600C0000100000__0000009445A06DC8 000000067F000080000007600C00000FE771-000000067F000080000007600C0000107ED7__0000009228F7FA79-00000093786F8001 
000000067F000080000007600C0000100000-000000067F000080000007600C0000104000__0000009445A06DC8 000000067F000080000007600C0000104000-000000067F000080000007600C0000108000__0000009445A06DC8 000000067F000080000007600C0000107ED7-000000067F000080000007600C000011160C__0000009228F7FA79-00000093786F8001 000000067F000080000007600C0000108000-000000067F000080000007600C000010C000__0000009445A06DC8 000000067F000080000007600C000010B57A-000000067F00008000000760140000003D14__00000093786F8001-0000009402435A49 000000067F000080000007600C000010C000-000000067F000080000007600C0000110000__0000009445A06DC8 000000067F000080000007600C0000110000-000000067F00008000000760120100000000__0000009445A06DC8 000000067F000080000007600C000011160C-010000000000000001000000040000000008__0000009228F7FA79-00000093786F8001 000000067F00008000000760140000000000-000000067F00008000000760140000004000__0000009445A06DC8 000000067F00008000000760140000003354-000000067F00008000000760140000023CAB__0000009402435A49-0000009446B52FD1 000000067F00008000000760140000003D14-000000067F0000800000076014000000A251__00000093786F8001-0000009402435A49 000000067F00008000000760140000004000-000000067F00008000000760140000008000__0000009445A06DC8 000000067F00008000000760140000008000-000000067F0000800000076014000000C000__0000009445A06DC8 000000067F0000800000076014000000A251-000000067F000080000007601400000107AC__00000093786F8001-0000009402435A49 000000067F0000800000076014000000C000-000000067F00008000000760140000010000__0000009445A06DC8 000000067F00008000000760140000010000-000000067F00008000000760140000014000__0000009445A06DC8 000000067F000080000007601400000107AC-000000067F00008000000760140000016CC4__00000093786F8001-0000009402435A49 000000067F00008000000760140000014000-000000067F00008000000760140000018000__0000009445A06DC8 000000067F00008000000760140000016CC4-000000067F0000800000076014000001D272__00000093786F8001-0000009402435A49 000000067F00008000000760140000018000-000000067F0000800000076014000001C000__0000009445A06DC8 
000000067F0000800000076014000001C000-000000067F00008000000760140000020000__0000009445A06DC8 000000067F0000800000076014000001D272-000000067F000080000007601400000237C3__00000093786F8001-0000009402435A49 000000067F00008000000760140000020000-000000067F00008000000760140000024000__0000009445A06DC8 000000067F000080000007601400000237C3-000000067F00008000000760140000029CC5__00000093786F8001-0000009402435A49 000000067F00008000000760140000023CB3-030000000000000000000000000000000002__0000009402435A49-0000009446B52FD1 000000067F00008000000760140000024000-000000067F00008000000760140000028000__0000009445A06DC8 000000067F00008000000760140000028000-000000067F0000800000076014000002C000__0000009445A06DC8 000000067F00008000000760140000029CC5-030000000000000000000000000000000002__00000093786F8001-0000009402435A49 000000067F0000800000076014000002C000-030000000000000000000000000000000002__0000009445A06DC8 000000067F000080000007800C0000000000-000000067F000080000007800C0000004000__00000096187D1FC8 000000067F000080000007800C0000000000-000000067F000080000007800C0000004000__00000096E85806C0 000000067F000080000007800C0000004000-000000067F000080000007800C0000008000__00000096187D1FC8 000000067F000080000007800C0000004000-000000067F000080000007800C0000008000__00000096E85806C0 000000067F000080000007800C0000008000-000000067F000080000007800C000000C000__00000096187D1FC8 000000067F000080000007800C0000008000-000000067F000080000007800C000000C000__00000096E85806C0 000000067F000080000007800C000000974C-000000067F000080000007800C0000012EB2__0000009446B52FD1-00000094D67DF4F9 000000067F000080000007800C000000C000-000000067F000080000007800C0000010000__00000096187D1FC8 000000067F000080000007800C000000C000-000000067F000080000007800C0000010000__00000096E85806C0 000000067F000080000007800C0000010000-000000067F000080000007800C0000014000__00000096187D1FC8 000000067F000080000007800C0000010000-000000067F000080000007800C0000014000__00000096E85806C0 
000000067F000080000007800C0000012EB2-000000067F000080000007800C000001C60B__0000009446B52FD1-00000094D67DF4F9 000000067F000080000007800C0000014000-000000067F000080000007800C0000018000__00000096187D1FC8 000000067F000080000007800C0000014000-000000067F000080000007800C0000018000__00000096E85806C0 000000067F000080000007800C0000018000-000000067F000080000007800C000001C000__00000096187D1FC8 000000067F000080000007800C0000018000-000000067F000080000007800C000001C000__00000096E85806C0 000000067F000080000007800C000001C000-000000067F000080000007800C0000020000__00000096187D1FC8 000000067F000080000007800C000001C000-000000067F000080000007800C0000020000__00000096E85806C0 000000067F000080000007800C000001C60B-000000067F000080000007800C0000025D39__0000009446B52FD1-00000094D67DF4F9 000000067F000080000007800C0000020000-000000067F000080000007800C0000024000__00000096187D1FC8 000000067F000080000007800C0000020000-000000067F000080000007800C0000024000__00000096E85806C0 000000067F000080000007800C0000024000-000000067F000080000007800C0000028000__00000096187D1FC8 000000067F000080000007800C0000024000-000000067F000080000007800C0000028000__00000096E85806C0 000000067F000080000007800C0000025D39-000000067F000080000007800C000002F49F__0000009446B52FD1-00000094D67DF4F9 000000067F000080000007800C0000028000-000000067F000080000007800C000002C000__00000096187D1FC8 000000067F000080000007800C0000028000-000000067F000080000007800C000002C000__00000096E85806C0 000000067F000080000007800C000002C000-000000067F000080000007800C0000030000__00000096187D1FC8 000000067F000080000007800C000002C000-000000067F000080000007800C0000030000__00000096E85806C0 000000067F000080000007800C000002F49F-000000067F000080000007800C0000038BB2__0000009446B52FD1-00000094D67DF4F9 000000067F000080000007800C0000030000-000000067F000080000007800C0000034000__00000096187D1FC8 000000067F000080000007800C0000030000-000000067F000080000007800C0000034000__00000096E85806C0 000000067F000080000007800C0000034000-000000067F000080000007800C0000038000__00000096187D1FC8 
000000067F000080000007800C0000034000-000000067F000080000007800C0000038000__00000096E85806C0 000000067F000080000007800C0000038000-000000067F000080000007800C000003C000__00000096187D1FC8 000000067F000080000007800C0000038000-000000067F000080000007800C000003C000__00000096E85806C0 000000067F000080000007800C0000038BB2-000000067F000080000007800C0000042318__0000009446B52FD1-00000094D67DF4F9 000000067F000080000007800C000003C000-000000067F000080000007800C0000040000__00000096187D1FC8 000000067F000080000007800C000003C000-000000067F000080000007800C0000040000__00000096E85806C0 000000067F000080000007800C0000040000-000000067F000080000007800C0000044000__00000096187D1FC8 000000067F000080000007800C0000040000-000000067F000080000007800C0000044000__00000096E85806C0 000000067F000080000007800C0000042318-000000067F000080000007800C000004BA7E__0000009446B52FD1-00000094D67DF4F9 000000067F000080000007800C0000044000-000000067F000080000007800C0000048000__00000096187D1FC8 000000067F000080000007800C0000044000-000000067F000080000007800C0000048000__00000096E85806C0 000000067F000080000007800C0000048000-000000067F000080000007800C000004C000__00000096187D1FC8 000000067F000080000007800C0000048000-000000067F000080000007800C000004C000__00000096E85806C0 000000067F000080000007800C000004BA7E-000000067F000080000007800C00000551B3__0000009446B52FD1-00000094D67DF4F9 000000067F000080000007800C000004C000-000000067F000080000007800C0000050000__00000096187D1FC8 000000067F000080000007800C000004C000-000000067F000080000007800C0000050000__00000096E85806C0 000000067F000080000007800C0000050000-000000067F000080000007800C0000054000__00000096187D1FC8 000000067F000080000007800C0000050000-000000067F000080000007800C0000054000__00000096E85806C0 000000067F000080000007800C0000054000-000000067F000080000007800C0000058000__0000009614F1FFE8 000000067F000080000007800C0000054000-000000067F000080000007800C0000058000__00000096E85806C0 000000067F000080000007800C00000551B3-030000000000000000000000000000000002__0000009446B52FD1-00000094D67DF4F9 
000000067F000080000007800C000005523E-000000067F000080000007800C000005E9A4__00000094D67DF4F9-000000959635F2A9 000000067F000080000007800C0000058000-000000067F000080000007800C000005C000__0000009614F1FFE8 000000067F000080000007800C0000058000-000000067F000080000007800C000005C000__00000096E85806C0 000000067F000080000007800C000005C000-000000067F000080000007800C0000060000__0000009614F1FFE8 000000067F000080000007800C000005C000-000000067F000080000007800C0000060000__00000096E85806C0 000000067F000080000007800C000005E9A4-000000067F000080000007800C000006810A__00000094D67DF4F9-000000959635F2A9 000000067F000080000007800C0000060000-000000067F000080000007800C0000064000__0000009614F1FFE8 000000067F000080000007800C0000060000-000000067F000080000007800C0000064000__00000096E85806C0 000000067F000080000007800C0000064000-000000067F000080000007800C0000068000__0000009614F1FFE8 000000067F000080000007800C0000064000-000000067F000080000007800C0000068000__00000096E85806C0 000000067F000080000007800C0000068000-000000067F000080000007800C000006C000__0000009614F1FFE8 000000067F000080000007800C0000068000-000000067F000080000007800C000006C000__00000096E85806C0 000000067F000080000007800C000006810A-000000067F000080000007800C0000071870__00000094D67DF4F9-000000959635F2A9 000000067F000080000007800C000006C000-000000067F000080000007800C0000070000__0000009614F1FFE8 000000067F000080000007800C000006C000-000000067F000080000007800C0000070000__00000096E85806C0 000000067F000080000007800C000006D446-000000067F000080000007800C00000D9B82__00000096AEF27399-00000096E85829C9 000000067F000080000007800C0000070000-000000067F000080000007800C0000074000__0000009614F1FFE8 000000067F000080000007800C0000070000-000000067F000080000007800C0000074000__00000096E85806C0 000000067F000080000007800C0000071870-000000067F000080000007800C000007AFD6__00000094D67DF4F9-000000959635F2A9 000000067F000080000007800C0000074000-000000067F000080000007800C0000078000__0000009614F1FFE8 
000000067F000080000007800C0000074000-000000067F000080000007800C0000078000__00000096E85806C0 000000067F000080000007800C0000078000-000000067F000080000007800C000007C000__0000009614F1FFE8 000000067F000080000007800C0000078000-000000067F000080000007800C000007C000__00000096E85806C0 000000067F000080000007800C000007AFD6-000000067F000080000007800C000008470B__00000094D67DF4F9-000000959635F2A9 000000067F000080000007800C000007B8DE-000000067F000080000007800C00000F73DA__00000096193A8001-00000096AEF27399 000000067F000080000007800C000007C000-000000067F000080000007800C0000080000__0000009614F1FFE8 000000067F000080000007800C000007C000-000000067F000080000007800C0000080000__00000096E85806C0 000000067F000080000007800C0000080000-000000067F000080000007800C0000084000__0000009614F1FFE8 000000067F000080000007800C0000080000-000000067F000080000007800C0000084000__00000096E85806C0 000000067F000080000007800C0000084000-000000067F000080000007800C0000088000__0000009614F1FFE8 000000067F000080000007800C0000084000-000000067F000080000007800C0000088000__00000096E85806C0 000000067F000080000007800C000008470B-000000067F000080000007800C000008DE71__00000094D67DF4F9-000000959635F2A9 000000067F000080000007800C0000088000-000000067F000080000007800C000008C000__0000009614F1FFE8 000000067F000080000007800C0000088000-000000067F000080000007800C000008C000__00000096E85806C0 000000067F000080000007800C000008C000-000000067F000080000007800C0000090000__0000009614F1FFE8 000000067F000080000007800C000008C000-000000067F000080000007800C0000090000__00000096E85806C0 000000067F000080000007800C000008DE71-000000067F000080000007800C0000097591__00000094D67DF4F9-000000959635F2A9 000000067F000080000007800C0000090000-000000067F000080000007800C0000094000__0000009614F1FFE8 000000067F000080000007800C0000090000-000000067F000080000007800C0000094000__00000096E85806C0 000000067F000080000007800C0000094000-000000067F000080000007800C0000098000__0000009614F1FFE8 000000067F000080000007800C0000094000-000000067F000080000007800C0000098000__00000096E85806C0 
000000067F000080000007800C0000097591-000000067F000080000007800C00000A0CF7__00000094D67DF4F9-000000959635F2A9 000000067F000080000007800C0000098000-000000067F000080000007800C000009C000__0000009614F1FFE8 000000067F000080000007800C0000098000-000000067F000080000007800C000009C000__00000096E85806C0 000000067F000080000007800C000009C000-000000067F000080000007800C00000A0000__0000009614F1FFE8 000000067F000080000007800C000009C000-000000067F000080000007800C00000A0000__00000096E85806C0 000000067F000080000007800C00000A0000-000000067F000080000007800C00000A4000__0000009614F1FFE8 000000067F000080000007800C00000A0000-000000067F000080000007800C00000A4000__00000096E85806C0 000000067F000080000007800C00000A0CF7-000000067F000080000007800C00000AA40B__00000094D67DF4F9-000000959635F2A9 000000067F000080000007800C00000A4000-000000067F000080000007800C00000A8000__0000009614F1FFE8 000000067F000080000007800C00000A4000-000000067F000080000007800C00000A8000__00000096E85806C0 000000067F000080000007800C00000A8000-000000067F000080000007800C00000AC000__0000009614F1FFE8 000000067F000080000007800C00000A8000-000000067F000080000007800C00000AC000__00000096E85806C0 000000067F000080000007800C00000AA40B-000000067F000080000007800C00000B3B4D__00000094D67DF4F9-000000959635F2A9 000000067F000080000007800C00000AC000-000000067F000080000007800C00000B0000__0000009614F1FFE8 000000067F000080000007800C00000AC000-000000067F000080000007800C00000B0000__00000096E85806C0 000000067F000080000007800C00000B0000-000000067F000080000007800C00000B4000__0000009614F1FFE8 000000067F000080000007800C00000B0000-000000067F000080000007800C00000B4000__00000096E85806C0 000000067F000080000007800C00000B3B4D-000000067F000080000007800C00000BD2B3__00000094D67DF4F9-000000959635F2A9 000000067F000080000007800C00000B4000-000000067F000080000007800C00000B8000__0000009614F1FFE8 000000067F000080000007800C00000B4000-000000067F000080000007800C00000B8000__00000096E85806C0 000000067F000080000007800C00000B8000-000000067F000080000007800C00000BC000__0000009614F1FFE8 
000000067F000080000007800C00000B8000-000000067F000080000007800C00000BC000__00000096E85806C0 000000067F000080000007800C00000BC000-000000067F000080000007800C00000C0000__0000009614F1FFE8 000000067F000080000007800C00000BC000-000000067F000080000007800C00000C0000__00000096E85806C0 000000067F000080000007800C00000BD2B3-000000067F000080000007800C00000C69DA__00000094D67DF4F9-000000959635F2A9 000000067F000080000007800C00000C0000-000000067F000080000007800C00000C4000__0000009614F1FFE8 000000067F000080000007800C00000C0000-000000067F000080000007800C00000C4000__00000096E85806C0 000000067F000080000007800C00000C4000-000000067F000080000007800C00000C8000__0000009614F1FFE8 000000067F000080000007800C00000C4000-000000067F000080000007800C00000C8000__00000096E85806C0 000000067F000080000007800C00000C69DA-000000067F000080000007800C0100000000__00000094D67DF4F9-000000959635F2A9 000000067F000080000007800C00000C8000-000000067F000080000007800C00000CC000__0000009614F1FFE8 000000067F000080000007800C00000C8000-000000067F000080000007800C00000CC000__00000096E85806C0 000000067F000080000007800C00000CC000-000000067F000080000007800C00000D0000__0000009614F1FFE8 000000067F000080000007800C00000CC000-000000067F000080000007800C00000D0000__00000096E85806C0 000000067F000080000007800C00000CD6B6-000000067F000080000007800C00000D6C18__000000959635F2A9-00000096193A8001 000000067F000080000007800C00000D0000-000000067F000080000007800C00000D4000__0000009614F1FFE8 000000067F000080000007800C00000D0000-000000067F000080000007800C00000D4000__00000096E85806C0 000000067F000080000007800C00000D4000-000000067F000080000007800C00000D8000__0000009614F1FFE8 000000067F000080000007800C00000D4000-000000067F000080000007800C00000D8000__00000096E85806C0 000000067F000080000007800C00000D6C18-000000067F000080000007800C00000E0179__000000959635F2A9-00000096193A8001 000000067F000080000007800C00000D8000-000000067F000080000007800C00000DC000__0000009614F1FFE8 000000067F000080000007800C00000D8000-000000067F000080000007800C00000DC000__00000096E85806C0 
000000067F000080000007800C00000D9BA3-000000067F00008000000780140000013481__00000096AEF27399-00000096E85829C9 000000067F000080000007800C00000DC000-000000067F000080000007800C00000E0000__0000009614F1FFE8 000000067F000080000007800C00000DC000-000000067F000080000007800C00000E0000__00000096E85806C0 000000067F000080000007800C00000E0000-000000067F000080000007800C00000E4000__0000009614F1FFE8 000000067F000080000007800C00000E0000-000000067F000080000007800C00000E4000__00000096E85806C0 000000067F000080000007800C00000E0179-000000067F000080000007800C00000E96DC__000000959635F2A9-00000096193A8001 000000067F000080000007800C00000E4000-000000067F000080000007800C00000E8000__0000009614F1FFE8 000000067F000080000007800C00000E4000-000000067F000080000007800C00000E8000__00000096E85806C0 000000067F000080000007800C00000E8000-000000067F000080000007800C00000EC000__0000009614F1FFE8 000000067F000080000007800C00000E8000-000000067F000080000007800C00000EC000__00000096E85806C0 000000067F000080000007800C00000E96DC-000000067F000080000007800C00000F2C3E__000000959635F2A9-00000096193A8001 000000067F000080000007800C00000EC000-000000067F000080000007800C00000F0000__0000009614F1FFE8 000000067F000080000007800C00000EC000-000000067F000080000007800C00000F0000__00000096E85806C0 000000067F000080000007800C00000F0000-000000067F000080000007800C00000F4000__0000009614F1FFE8 000000067F000080000007800C00000F0000-000000067F000080000007800C00000F4000__00000096E85806C0 000000067F000080000007800C00000F2C3E-000000067F000080000007800C00000FC1A0__000000959635F2A9-00000096193A8001 000000067F000080000007800C00000F4000-000000067F000080000007800C00000F8000__0000009614F1FFE8 000000067F000080000007800C00000F4000-000000067F000080000007800C00000F8000__00000096E85806C0 000000067F000080000007800C00000F73E3-000000067F00008000000780140000003F18__00000096193A8001-00000096AEF27399 000000067F000080000007800C00000F8000-000000067F000080000007800C00000FC000__0000009614F1FFE8 
000000067F000080000007800C00000F8000-000000067F000080000007800C00000FC000__00000096E85806C0 000000067F000080000007800C00000FC000-000000067F000080000007800C0000100000__0000009614F1FFE8 000000067F000080000007800C00000FC000-000000067F000080000007800C0000100000__00000096E85806C0 000000067F000080000007800C00000FC1A0-000000067F000080000007800C00001057C1__000000959635F2A9-00000096193A8001 000000067F000080000007800C0000100000-000000067F000080000007800C0000104000__0000009614F1FFE8 000000067F000080000007800C0000100000-000000067F000080000007800C0000104000__00000096E85806C0 000000067F000080000007800C0000104000-000000067F000080000007800C0000108000__0000009614F1FFE8 000000067F000080000007800C0000104000-000000067F000080000007800C0000108000__00000096E85806C0 000000067F000080000007800C00001057C1-000000067F000080000007800C000010EF0B__000000959635F2A9-00000096193A8001 000000067F000080000007800C0000108000-000000067F000080000007800C000010C000__0000009614F1FFE8 000000067F000080000007800C0000108000-000000067F000080000007800C000010C000__00000096E85806C0 000000067F000080000007800C000010C000-000000067F000080000007800C0000110000__0000009614F1FFE8 000000067F000080000007800C000010C000-000000067F000080000007800C0000110000__00000096E85806C0 000000067F000080000007800C000010EF0B-01000000000000000100000004000000000B__000000959635F2A9-00000096193A8001 000000067F000080000007800C0000110000-000000067F00008000000780120100000000__00000096E85806C0 000000067F000080000007800C0000110000-030000000000000000000000000000000002__0000009614F1FFE8 000000067F00008000000780140000000000-000000067F00008000000780140000004000__00000096E85806C0 000000067F00008000000780140000003F18-000000067F00008000000780140000009ED4__00000096193A8001-00000096AEF27399 000000067F00008000000780140000004000-000000067F00008000000780140000008000__00000096E85806C0 000000067F00008000000780140000008000-000000067F0000800000078014000000C000__00000096E85806C0 
000000067F00008000000780140000009ED4-000000067F0000800000078014000000FE9A__00000096193A8001-00000096AEF27399 000000067F0000800000078014000000C000-000000067F00008000000780140000010000__00000096E85806C0 000000067F0000800000078014000000FE9A-000000067F00008000000780140000015DD1__00000096193A8001-00000096AEF27399 000000067F00008000000780140000010000-000000067F00008000000780140000014000__00000096E85806C0 000000067F00008000000780140000013481-030000000000000000000000000000000002__00000096AEF27399-00000096E85829C9 000000067F00008000000780140000014000-000000067F00008000000780140000018000__00000096E85806C0 000000067F00008000000780140000015DD1-000000067F0000800000078014000001BD7E__00000096193A8001-00000096AEF27399 000000067F00008000000780140000018000-000000067F0000800000078014000001C000__00000096E85806C0 000000067F0000800000078014000001BD7E-000000067F00008000000780140000021CF0__00000096193A8001-00000096AEF27399 000000067F0000800000078014000001C000-000000067F00008000000780140000020000__00000096E85806C0 000000067F00008000000780140000020000-000000067F00008000000780140000024000__00000096E85806C0 000000067F00008000000780140000021CF0-000000067F00008000000780140000027CF8__00000096193A8001-00000096AEF27399 000000067F00008000000780140000024000-000000067F00008000000780140000028000__00000096E85806C0 000000067F00008000000780140000027CF8-000000067F0000800000078014000002DC88__00000096193A8001-00000096AEF27399 000000067F00008000000780140000028000-000000067F0000800000078014000002C000__00000096E85806C0 000000067F0000800000078014000002C000-030000000000000000000000000000000002__00000096E85806C0 000000067F0000800000078014000002DC88-030000000000000000000000000000000002__00000096193A8001-00000096AEF27399 000000067F000080000007A00C0000000000-000000067F000080000007A00C0000004000__0000009921F3B4A8 000000067F000080000007A00C0000004000-000000067F000080000007A00C0000008000__0000009921F3B4A8 000000067F000080000007A00C0000008000-000000067F000080000007A00C000000C000__0000009921F3B4A8 
000000067F000080000007A00C000000974B-000000067F000080000007A00C0000012EB1__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C000000C000-000000067F000080000007A00C0000010000__0000009921F3B4A8 000000067F000080000007A00C0000010000-000000067F000080000007A00C0000014000__0000009921F3B4A8 000000067F000080000007A00C0000012EB1-000000067F000080000007A00C000001C60B__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C0000014000-000000067F000080000007A00C0000018000__0000009921F3B4A8 000000067F000080000007A00C0000018000-000000067F000080000007A00C000001C000__0000009921F3B4A8 000000067F000080000007A00C000001C000-000000067F000080000007A00C0000020000__0000009921F3B4A8 000000067F000080000007A00C000001C60B-000000067F000080000007A00C0000025D39__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C0000020000-000000067F000080000007A00C0000024000__0000009921F3B4A8 000000067F000080000007A00C0000024000-000000067F000080000007A00C0000028000__0000009921F3B4A8 000000067F000080000007A00C0000025D39-000000067F000080000007A00C000002F49F__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C0000028000-000000067F000080000007A00C000002C000__0000009921F3B4A8 000000067F000080000007A00C000002C000-000000067F000080000007A00C0000030000__0000009921F3B4A8 000000067F000080000007A00C000002F49F-000000067F000080000007A00C0000038BB2__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C0000030000-000000067F000080000007A00C0000034000__0000009921F3B4A8 000000067F000080000007A00C0000034000-000000067F000080000007A00C0000038000__0000009921F3B4A8 000000067F000080000007A00C0000038000-000000067F000080000007A00C000003C000__0000009921F3B4A8 000000067F000080000007A00C0000038BB2-000000067F000080000007A00C0000042318__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C000003C000-000000067F000080000007A00C0000040000__0000009921F3B4A8 000000067F000080000007A00C0000040000-000000067F000080000007A00C0000044000__0000009921F3B4A8 
000000067F000080000007A00C0000042318-000000067F000080000007A00C000004BA7E__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C0000044000-000000067F000080000007A00C0000048000__0000009921F3B4A8 000000067F000080000007A00C0000048000-000000067F000080000007A00C000004C000__0000009921F3B4A8 000000067F000080000007A00C000004B9B2-000000067F000080000007A00C0000097B6D__0000009921E47AA1-000000997F5D23C9 000000067F000080000007A00C000004BA7E-000000067F000080000007A00C00000551B3__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C000004C000-000000067F000080000007A00C0000050000__0000009921F3B4A8 000000067F000080000007A00C0000050000-000000067F000080000007A00C0000054000__0000009921F3B4A8 000000067F000080000007A00C0000054000-000000067F000080000007A00C0000058000__0000009921F3B4A8 000000067F000080000007A00C00000551B3-000000067F000080000007A00C000005E90A__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C0000058000-000000067F000080000007A00C000005C000__0000009921F3B4A8 000000067F000080000007A00C000005C000-000000067F000080000007A00C0000060000__0000009921F3B4A8 000000067F000080000007A00C000005E90A-000000067F000080000007A00C000006802C__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C0000060000-000000067F000080000007A00C0000064000__0000009921F3B4A8 000000067F000080000007A00C0000064000-000000067F000080000007A00C0000068000__0000009921F3B4A8 000000067F000080000007A00C0000068000-000000067F000080000007A00C000006C000__0000009921F3B4A8 000000067F000080000007A00C000006802C-000000067F000080000007A00C0000071783__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C000006C000-000000067F000080000007A00C0000070000__0000009921F3B4A8 000000067F000080000007A00C0000070000-000000067F000080000007A00C0000074000__0000009921F3B4A8 000000067F000080000007A00C0000071783-000000067F000080000007A00C000007AEE8__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C0000074000-000000067F000080000007A00C0000078000__0000009921F3B4A8 
000000067F000080000007A00C0000078000-000000067F000080000007A00C000007C000__0000009921F3B4A8 000000067F000080000007A00C000007AEE8-000000067F000080000007A00C000008460B__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C000007C000-000000067F000080000007A00C0000080000__0000009921F3B4A8 000000067F000080000007A00C0000080000-000000067F000080000007A00C0000084000__0000009921F3B4A8 000000067F000080000007A00C0000084000-000000067F000080000007A00C0000088000__0000009921F3B4A8 000000067F000080000007A00C000008460B-000000067F000080000007A00C000008DD71__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C0000088000-000000067F000080000007A00C000008C000__0000009921F3B4A8 000000067F000080000007A00C000008C000-000000067F000080000007A00C0000090000__0000009921F3B4A8 000000067F000080000007A00C000008DD71-000000067F000080000007A00C00000974D7__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C0000090000-000000067F000080000007A00C0000094000__0000009921F3B4A8 000000067F000080000007A00C0000094000-000000067F000080000007A00C0000098000__0000009921F3B4A8 000000067F000080000007A00C00000974D7-000000067F000080000007A00C00000A0C0B__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C0000097B7A-000000067F000080000007A00C00000E3627__0000009921E47AA1-000000997F5D23C9 000000067F000080000007A00C0000098000-000000067F000080000007A00C000009C000__0000009921F3B4A8 000000067F000080000007A00C000009C000-000000067F000080000007A00C00000A0000__0000009921F3B4A8 000000067F000080000007A00C00000A0000-000000067F000080000007A00C00000A4000__0000009921F3B4A8 000000067F000080000007A00C00000A0C0B-000000067F000080000007A00C00000AA371__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C00000A4000-000000067F000080000007A00C00000A8000__0000009921F3B4A8 000000067F000080000007A00C00000A8000-000000067F000080000007A00C00000AC000__0000009921F3B4A8 000000067F000080000007A00C00000AA371-000000067F000080000007A00C00000B3AD7__00000096E85829C9-00000098A7ADFC91 
000000067F000080000007A00C00000AC000-000000067F000080000007A00C00000B0000__0000009921F3B4A8 000000067F000080000007A00C00000B0000-000000067F000080000007A00C00000B4000__0000009921F3B4A8 000000067F000080000007A00C00000B3AD7-000000067F000080000007A00C00000BD20B__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C00000B4000-000000067F000080000007A00C00000B8000__0000009921F3B4A8 000000067F000080000007A00C00000B8000-000000067F000080000007A00C00000BC000__0000009921F3B4A8 000000067F000080000007A00C00000BC000-000000067F000080000007A00C00000C0000__0000009921F3B4A8 000000067F000080000007A00C00000BD20B-000000067F000080000007A00C00000C6932__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C00000C0000-000000067F000080000007A00C00000C4000__0000009921F3B4A8 000000067F000080000007A00C00000C4000-000000067F000080000007A00C00000C8000__0000009921F3B4A8 000000067F000080000007A00C00000C6932-000000067F000080000007A00C00000D0098__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C00000C8000-000000067F000080000007A00C00000CC000__0000009921F3B4A8 000000067F000080000007A00C00000CC000-000000067F000080000007A00C00000D0000__0000009921F3B4A8 000000067F000080000007A00C00000D0000-000000067F000080000007A00C00000D4000__0000009921F3B4A8 000000067F000080000007A00C00000D0098-000000067F000080000007A00C00000D97FE__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C00000D4000-000000067F000080000007A00C00000D8000__0000009921F3B4A8 000000067F000080000007A00C00000D8000-000000067F000080000007A00C00000DC000__0000009921F3B4A8 000000067F000080000007A00C00000D97FE-000000067F000080000007A00C00000E2F0B__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C00000DC000-000000067F000080000007A00C00000E0000__0000009921F3B4A8 000000067F000080000007A00C00000E0000-000000067F000080000007A00C00000E4000__0000009921F3B4A8 000000067F000080000007A00C00000E2F0B-000000067F000080000007A00C00000EC671__00000096E85829C9-00000098A7ADFC91 
000000067F000080000007A00C00000E364A-000000067F000080000007A01400000065FE__0000009921E47AA1-000000997F5D23C9 000000067F000080000007A00C00000E4000-000000067F000080000007A00C00000E8000__0000009921F3B4A8 000000067F000080000007A00C00000E8000-000000067F000080000007A00C00000EC000__0000009921F3B4A8 000000067F000080000007A00C00000EC000-000000067F000080000007A00C00000F0000__0000009921F3B4A8 000000067F000080000007A00C00000EC671-000000067F000080000007A00C00000F5D9F__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C00000F0000-000000067F000080000007A00C00000F4000__0000009921F3B4A8 000000067F000080000007A00C00000F4000-000000067F000080000007A00C00000F8000__0000009921F3B4A8 000000067F000080000007A00C00000F5D9F-000000067F000080000007A00C00000FF505__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C00000F720F-000000067F000080000007A00C0000111692__00000098A7ADFC91-0000009921E47AA1 000000067F000080000007A00C00000F8000-000000067F000080000007A00C00000FC000__0000009921F3B4A8 000000067F000080000007A00C00000FC000-000000067F000080000007A00C0000100000__0000009921F3B4A8 000000067F000080000007A00C00000FF505-000000067F000080000007A00C0000108C10__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C0000100000-000000067F000080000007A00C0000104000__0000009921F3B4A8 000000067F000080000007A00C0000104000-000000067F000080000007A00C0000108000__0000009921F3B4A8 000000067F000080000007A00C0000108000-000000067F000080000007A00C000010C000__0000009921F3B4A8 000000067F000080000007A00C0000108C10-030000000000000000000000000000000002__00000096E85829C9-00000098A7ADFC91 000000067F000080000007A00C000010C000-000000067F000080000007A00C0000110000__0000009921F3B4A8 000000067F000080000007A00C0000110000-000000067F000080000007A0120100000000__0000009921F3B4A8 000000067F000080000007A00C0000111692-000000067F000080000007A01400000040E7__00000098A7ADFC91-0000009921E47AA1 000000067F000080000007A0140000000000-000000067F000080000007A0140000004000__0000009921F3B4A8 
000000067F000080000007A0140000004000-000000067F000080000007A0140000008000__0000009921F3B4A8 000000067F000080000007A01400000040E7-000000067F000080000007A014000000B5F6__00000098A7ADFC91-0000009921E47AA1 000000067F000080000007A0140000006601-000000067F000080000007A014000001B4CB__0000009921E47AA1-000000997F5D23C9 000000067F000080000007A0140000008000-000000067F000080000007A014000000C000__0000009921F3B4A8 000000067F000080000007A014000000B5F6-000000067F000080000007A0140000012AFC__00000098A7ADFC91-0000009921E47AA1 000000067F000080000007A014000000C000-000000067F000080000007A0140000010000__0000009921F3B4A8 000000067F000080000007A0140000010000-000000067F000080000007A0140000014000__0000009921F3B4A8 000000067F000080000007A0140000012AFC-000000067F000080000007A0140000019F9B__00000098A7ADFC91-0000009921E47AA1 000000067F000080000007A0140000014000-000000067F000080000007A0140000018000__0000009921F3B4A8 000000067F000080000007A0140000018000-000000067F000080000007A014000001C000__0000009921F3B4A8 000000067F000080000007A0140000019F9B-000000067F000080000007A01400000214BE__00000098A7ADFC91-0000009921E47AA1 000000067F000080000007A014000001B4CB-030000000000000000000000000000000002__0000009921E47AA1-000000997F5D23C9 000000067F000080000007A014000001C000-000000067F000080000007A0140000020000__0000009921F3B4A8 000000067F000080000007A0140000020000-000000067F000080000007A0140000024000__0000009921F3B4A8 000000067F000080000007A01400000214BE-000000067F000080000007A01400000289C9__00000098A7ADFC91-0000009921E47AA1 000000067F000080000007A0140000024000-000000067F000080000007A0140000028000__0000009921F3B4A8 000000067F000080000007A0140000028000-000000067F000080000007A014000002C000__0000009921F3B4A8 000000067F000080000007A01400000289C9-030000000000000000000000000000000002__00000098A7ADFC91-0000009921E47AA1 000000067F000080000007A014000002C000-030000000000000000000000000000000002__0000009921F3B4A8 000000067F000080000007C00C0000000000-000000067F000080000007C00C0000004000__0000009B5229DFE8 
000000067F000080000007C00C0000004000-000000067F000080000007C00C0000008000__0000009B5229DFE8 000000067F000080000007C00C0000007EA5-000000067F000080000007C00C00000115FE__000000997F5D23C9-00000099F1C9FC71 000000067F000080000007C00C0000008000-000000067F000080000007C00C000000C000__0000009B5229DFE8 000000067F000080000007C00C000000C000-000000067F000080000007C00C0000010000__0000009B5229DFE8 000000067F000080000007C00C0000010000-000000067F000080000007C00C0000014000__0000009B5229DFE8 000000067F000080000007C00C00000115FE-000000067F000080000007C00C000001AD0C__000000997F5D23C9-00000099F1C9FC71 000000067F000080000007C00C0000014000-000000067F000080000007C00C0000018000__0000009B5229DFE8 000000067F000080000007C00C0000018000-000000067F000080000007C00C000001C000__0000009B5229DFE8 000000067F000080000007C00C000001AD0C-000000067F000080000007C00C0000024472__000000997F5D23C9-00000099F1C9FC71 000000067F000080000007C00C000001C000-000000067F000080000007C00C0000020000__0000009B5229DFE8 000000067F000080000007C00C0000020000-000000067F000080000007C00C0000024000__0000009B5229DFE8 000000067F000080000007C00C0000024000-000000067F000080000007C00C0000028000__0000009B5229DFE8 000000067F000080000007C00C0000024472-000000067F000080000007C00C000002DBD8__000000997F5D23C9-00000099F1C9FC71 000000067F000080000007C00C0000028000-000000067F000080000007C00C000002C000__0000009B5229DFE8 000000067F000080000007C00C000002C000-000000067F000080000007C00C0000030000__0000009B5229DFE8 000000067F000080000007C00C000002DBD8-000000067F000080000007C00C000003732B__000000997F5D23C9-00000099F1C9FC71 000000067F000080000007C00C0000030000-000000067F000080000007C00C0000034000__0000009B5229DFE8 000000067F000080000007C00C0000034000-000000067F000080000007C00C0000038000__0000009B5229DFE8 000000067F000080000007C00C000003732B-000000067F000080000007C00C0000040A91__000000997F5D23C9-00000099F1C9FC71 000000067F000080000007C00C0000038000-000000067F000080000007C00C000003C000__0000009B5229DFE8 
000000067F000080000007C00C000003C000-000000067F000080000007C00C0000040000__0000009B5229DFE8 000000067F000080000007C00C0000040000-000000067F000080000007C00C0000044000__0000009B40525F80 000000067F000080000007C00C0000040000-000000067F000080000007C00C0000044000__0000009C1E3799F0 000000067F000080000007C00C0000040A91-030000000000000000000000000000000002__000000997F5D23C9-00000099F1C9FC71 000000067F000080000007C00C0000042360-000000067F000080000007C00C000004BAC6__00000099F1C9FC71-0000009A918DF181 000000067F000080000007C00C0000044000-000000067F000080000007C00C0000048000__0000009B40525F80 000000067F000080000007C00C0000044000-000000067F000080000007C00C0000048000__0000009C1E3799F0 000000067F000080000007C00C0000048000-000000067F000080000007C00C000004C000__0000009B40525F80 000000067F000080000007C00C0000048000-000000067F000080000007C00C000004C000__0000009C1E3799F0 000000067F000080000007C00C000004BAC6-000000067F000080000007C00C00000551FB__00000099F1C9FC71-0000009A918DF181 000000067F000080000007C00C000004C000-000000067F000080000007C00C0000050000__0000009B40525F80 000000067F000080000007C00C000004C000-000000067F000080000007C00C0000050000__0000009C1E3799F0 000000067F000080000007C00C0000050000-000000067F000080000007C00C0000054000__0000009B40525F80 000000067F000080000007C00C0000050000-000000067F000080000007C00C0000054000__0000009C1E3799F0 000000067F000080000007C00C0000052AA4-000000067F000080000007C00C00000A4244__0000009BCB4E4461-0000009C1E8CC879 000000067F000080000007C00C0000054000-000000067F000080000007C00C0000058000__0000009B40525F80 000000067F000080000007C00C0000054000-000000067F000080000007C00C0000058000__0000009C1E3799F0 000000067F000080000007C00C00000551FB-000000067F000080000007C00C000005E90B__00000099F1C9FC71-0000009A918DF181 000000067F000080000007C00C0000058000-000000067F000080000007C00C000005C000__0000009B40525F80 000000067F000080000007C00C0000058000-000000067F000080000007C00C000005C000__0000009C1E3799F0 
000000067F000080000007C00C000005C000-000000067F000080000007C00C0000060000__0000009B40525F80 000000067F000080000007C00C000005C000-000000067F000080000007C00C0000060000__0000009C1E3799F0 000000067F000080000007C00C000005E90B-000000067F000080000007C00C000006802B__00000099F1C9FC71-0000009A918DF181 000000067F000080000007C00C0000060000-000000067F000080000007C00C0000064000__0000009B40525F80 000000067F000080000007C00C0000060000-000000067F000080000007C00C0000064000__0000009C1E3799F0 000000067F000080000007C00C0000064000-000000067F000080000007C00C0000068000__0000009B40525F80 000000067F000080000007C00C0000064000-000000067F000080000007C00C0000068000__0000009C1E3799F0 000000067F000080000007C00C0000068000-000000067F000080000007C00C000006C000__0000009B40525F80 000000067F000080000007C00C0000068000-000000067F000080000007C00C000006C000__0000009C1E3799F0 000000067F000080000007C00C000006802B-000000067F000080000007C00C0000071782__00000099F1C9FC71-0000009A918DF181 000000067F000080000007C00C000006C000-000000067F000080000007C00C0000070000__0000009B40525F80 000000067F000080000007C00C000006C000-000000067F000080000007C00C0000070000__0000009C1E3799F0 000000067F000080000007C00C0000070000-000000067F000080000007C00C0000074000__0000009B40525F80 000000067F000080000007C00C0000070000-000000067F000080000007C00C0000074000__0000009C1E3799F0 000000067F000080000007C00C0000071782-000000067F000080000007C00C000007AEE8__00000099F1C9FC71-0000009A918DF181 000000067F000080000007C00C0000074000-000000067F000080000007C00C0000078000__0000009B40525F80 000000067F000080000007C00C0000074000-000000067F000080000007C00C0000078000__0000009C1E3799F0 000000067F000080000007C00C0000078000-000000067F000080000007C00C000007C000__0000009B40525F80 000000067F000080000007C00C0000078000-000000067F000080000007C00C000007C000__0000009C1E3799F0 000000067F000080000007C00C000007AEE8-000000067F000080000007C00C000008460B__00000099F1C9FC71-0000009A918DF181 000000067F000080000007C00C000007C000-000000067F000080000007C00C0000080000__0000009B40525F80 
000000067F000080000007C00C000007C000-000000067F000080000007C00C0000080000__0000009C1E3799F0 000000067F000080000007C00C0000080000-000000067F000080000007C00C0000084000__0000009B40525F80 000000067F000080000007C00C0000080000-000000067F000080000007C00C0000084000__0000009C1E3799F0 000000067F000080000007C00C0000084000-000000067F000080000007C00C0000088000__0000009B40525F80 000000067F000080000007C00C0000084000-000000067F000080000007C00C0000088000__0000009C1E3799F0 000000067F000080000007C00C000008460B-000000067F000080000007C00C000008DD71__00000099F1C9FC71-0000009A918DF181 000000067F000080000007C00C0000088000-000000067F000080000007C00C000008C000__0000009B40525F80 000000067F000080000007C00C0000088000-000000067F000080000007C00C000008C000__0000009C1E3799F0 000000067F000080000007C00C000008C000-000000067F000080000007C00C0000090000__0000009B40525F80 000000067F000080000007C00C000008C000-000000067F000080000007C00C0000090000__0000009C1E3799F0 000000067F000080000007C00C000008DD71-000000067F000080000007C00C00000974D7__00000099F1C9FC71-0000009A918DF181 000000067F000080000007C00C0000090000-000000067F000080000007C00C0000094000__0000009B40525F80 000000067F000080000007C00C0000090000-000000067F000080000007C00C0000094000__0000009C1E3799F0 000000067F000080000007C00C0000094000-000000067F000080000007C00C0000098000__0000009B40525F80 000000067F000080000007C00C0000094000-000000067F000080000007C00C0000098000__0000009C1E3799F0 000000067F000080000007C00C00000974D7-000000067F000080000007C00C00000A0C0B__00000099F1C9FC71-0000009A918DF181 000000067F000080000007C00C0000098000-000000067F000080000007C00C000009C000__0000009B40525F80 000000067F000080000007C00C0000098000-000000067F000080000007C00C000009C000__0000009C1E3799F0 000000067F000080000007C00C000009C000-000000067F000080000007C00C00000A0000__0000009B40525F80 000000067F000080000007C00C000009C000-000000067F000080000007C00C00000A0000__0000009C1E3799F0 000000067F000080000007C00C00000A0000-000000067F000080000007C00C00000A4000__0000009B40525F80 
000000067F000080000007C00C00000A0000-000000067F000080000007C00C00000A4000__0000009C1E3799F0 000000067F000080000007C00C00000A0C0B-000000067F000080000007C00C0100000000__00000099F1C9FC71-0000009A918DF181 000000067F000080000007C00C00000A4000-000000067F000080000007C00C00000A8000__0000009B40525F80 000000067F000080000007C00C00000A4000-000000067F000080000007C00C00000A8000__0000009C1E3799F0 000000067F000080000007C00C00000A424C-000000067F000080000007C00C00000F5B43__0000009BCB4E4461-0000009C1E8CC879 000000067F000080000007C00C00000A8000-000000067F000080000007C00C00000AC000__0000009B40525F80 000000067F000080000007C00C00000A8000-000000067F000080000007C00C00000AC000__0000009C1E3799F0 000000067F000080000007C00C00000A9244-000000067F000080000007C00C00000B2991__0000009A918DF181-0000009B51A8BBB9 000000067F000080000007C00C00000AC000-000000067F000080000007C00C00000B0000__0000009B40525F80 000000067F000080000007C00C00000AC000-000000067F000080000007C00C00000B0000__0000009C1E3799F0 000000067F000080000007C00C00000B0000-000000067F000080000007C00C00000B4000__0000009B40525F80 000000067F000080000007C00C00000B0000-000000067F000080000007C00C00000B4000__0000009C1E3799F0 000000067F000080000007C00C00000B2991-000000067F000080000007C00C00000BC0F7__0000009A918DF181-0000009B51A8BBB9 000000067F000080000007C00C00000B4000-000000067F000080000007C00C00000B8000__0000009B40525F80 000000067F000080000007C00C00000B4000-000000067F000080000007C00C00000B8000__0000009C1E3799F0 000000067F000080000007C00C00000B8000-000000067F000080000007C00C00000BC000__0000009B40525F80 000000067F000080000007C00C00000B8000-000000067F000080000007C00C00000BC000__0000009C1E3799F0 000000067F000080000007C00C00000BA258-000000067F000080000007C01400000011E2__0000009B51A8BBB9-0000009BCB4E4461 000000067F000080000007C00C00000BC000-000000067F000080000007C00C00000C0000__0000009B40525F80 000000067F000080000007C00C00000BC000-000000067F000080000007C00C00000C0000__0000009C1E3799F0 
000000067F000080000007C00C00000BC0F7-000000067F000080000007C00C00000C580C__0000009A918DF181-0000009B51A8BBB9 000000067F000080000007C00C00000C0000-000000067F000080000007C00C00000C4000__0000009B40525F80 000000067F000080000007C00C00000C0000-000000067F000080000007C00C00000C4000__0000009C1E3799F0 000000067F000080000007C00C00000C4000-000000067F000080000007C00C00000C8000__0000009B40525F80 000000067F000080000007C00C00000C4000-000000067F000080000007C00C00000C8000__0000009C1E3799F0 000000067F000080000007C00C00000C580C-000000067F000080000007C00C00000CEF72__0000009A918DF181-0000009B51A8BBB9 000000067F000080000007C00C00000C8000-000000067F000080000007C00C00000CC000__0000009B40525F80 000000067F000080000007C00C00000C8000-000000067F000080000007C00C00000CC000__0000009C1E3799F0 000000067F000080000007C00C00000CC000-000000067F000080000007C00C00000D0000__0000009B40525F80 000000067F000080000007C00C00000CC000-000000067F000080000007C00C00000D0000__0000009C1E3799F0 000000067F000080000007C00C00000CEF72-000000067F000080000007C00C00000D86D8__0000009A918DF181-0000009B51A8BBB9 000000067F000080000007C00C00000D0000-000000067F000080000007C00C00000D4000__0000009B40525F80 000000067F000080000007C00C00000D0000-000000067F000080000007C00C00000D4000__0000009C1E3799F0 000000067F000080000007C00C00000D4000-000000067F000080000007C00C00000D8000__0000009B40525F80 000000067F000080000007C00C00000D4000-000000067F000080000007C00C00000D8000__0000009C1E3799F0 000000067F000080000007C00C00000D8000-000000067F000080000007C00C00000DC000__0000009B40525F80 000000067F000080000007C00C00000D8000-000000067F000080000007C00C00000DC000__0000009C1E3799F0 000000067F000080000007C00C00000D86D8-000000067F000080000007C00C00000E1E0B__0000009A918DF181-0000009B51A8BBB9 000000067F000080000007C00C00000DC000-000000067F000080000007C00C00000E0000__0000009B40525F80 000000067F000080000007C00C00000DC000-000000067F000080000007C00C00000E0000__0000009C1E3799F0 000000067F000080000007C00C00000E0000-000000067F000080000007C00C00000E4000__0000009B40525F80 
000000067F000080000007C00C00000E0000-000000067F000080000007C00C00000E4000__0000009C1E3799F0 000000067F000080000007C00C00000E1E0B-000000067F000080000007C00C00000EB571__0000009A918DF181-0000009B51A8BBB9 000000067F000080000007C00C00000E4000-000000067F000080000007C00C00000E8000__0000009B40525F80 000000067F000080000007C00C00000E4000-000000067F000080000007C00C00000E8000__0000009C1E3799F0 000000067F000080000007C00C00000E8000-000000067F000080000007C00C00000EC000__0000009B40525F80 000000067F000080000007C00C00000E8000-000000067F000080000007C00C00000EC000__0000009C1E3799F0 000000067F000080000007C00C00000EB571-000000067F000080000007C00C00000F4CD7__0000009A918DF181-0000009B51A8BBB9 000000067F000080000007C00C00000EC000-000000067F000080000007C00C00000F0000__0000009B40525F80 000000067F000080000007C00C00000EC000-000000067F000080000007C00C00000F0000__0000009C1E3799F0 000000067F000080000007C00C00000F0000-000000067F000080000007C00C00000F4000__0000009B40525F80 000000067F000080000007C00C00000F0000-000000067F000080000007C00C00000F4000__0000009C1E3799F0 000000067F000080000007C00C00000F4000-000000067F000080000007C00C00000F8000__0000009B40525F80 000000067F000080000007C00C00000F4000-000000067F000080000007C00C00000F8000__0000009C1E3799F0 000000067F000080000007C00C00000F4CD7-000000067F000080000007C00C00000FE40B__0000009A918DF181-0000009B51A8BBB9 000000067F000080000007C00C00000F5B56-000000067F000080000007C014000000EB5A__0000009BCB4E4461-0000009C1E8CC879 000000067F000080000007C00C00000F8000-000000067F000080000007C00C00000FC000__0000009B40525F80 000000067F000080000007C00C00000F8000-000000067F000080000007C00C00000FC000__0000009C1E3799F0 000000067F000080000007C00C00000FC000-000000067F000080000007C00C0000100000__0000009B40525F80 000000067F000080000007C00C00000FC000-000000067F000080000007C00C0000100000__0000009C1E3799F0 000000067F000080000007C00C00000FE40B-000000067F000080000007C00C0000107B27__0000009A918DF181-0000009B51A8BBB9 
000000067F000080000007C00C0000100000-000000067F000080000007C00C0000104000__0000009B40525F80 000000067F000080000007C00C0000100000-000000067F000080000007C00C0000104000__0000009C1E3799F0 000000067F000080000007C00C0000104000-000000067F000080000007C00C0000108000__0000009B40525F80 000000067F000080000007C00C0000104000-000000067F000080000007C00C0000108000__0000009C1E3799F0 000000067F000080000007C00C0000107B27-000000067F000080000007C00C000011128D__0000009A918DF181-0000009B51A8BBB9 000000067F000080000007C00C0000108000-000000067F000080000007C00C000010C000__0000009C1E3799F0 000000067F000080000007C00C0000108000-030000000000000000000000000000000002__0000009B40525F80 000000067F000080000007C00C000010C000-000000067F000080000007C00C0000110000__0000009C1E3799F0 000000067F000080000007C00C0000110000-000000067F000080000007C0120100000000__0000009C1E3799F0 000000067F000080000007C00C000011128D-010000000000000001000000040000000012__0000009A918DF181-0000009B51A8BBB9 000000067F000080000007C0140000000000-000000067F000080000007C0140000004000__0000009C1E3799F0 000000067F000080000007C01400000011E2-000000067F000080000007C0140000007F04__0000009B51A8BBB9-0000009BCB4E4461 000000067F000080000007C0140000004000-000000067F000080000007C0140000008000__0000009C1E3799F0 000000067F000080000007C0140000007F04-000000067F000080000007C014000000EC12__0000009B51A8BBB9-0000009BCB4E4461 000000067F000080000007C0140000008000-000000067F000080000007C014000000C000__0000009C1E3799F0 000000067F000080000007C014000000C000-000000067F000080000007C0140000010000__0000009C1E3799F0 000000067F000080000007C014000000EB5A-000000067F000080000007C0140000027B5C__0000009BCB4E4461-0000009C1E8CC879 000000067F000080000007C014000000EC12-000000067F000080000007C0140000015910__0000009B51A8BBB9-0000009BCB4E4461 000000067F000080000007C0140000010000-000000067F000080000007C0140000014000__0000009C1E3799F0 000000067F000080000007C0140000014000-000000067F000080000007C0140000018000__0000009C1E3799F0 
000000067F000080000007C0140000015910-000000067F000080000007C014000001C5BB__0000009B51A8BBB9-0000009BCB4E4461 000000067F000080000007C0140000018000-000000067F000080000007C014000001C000__0000009C1E3799F0 000000067F000080000007C014000001C000-000000067F000080000007C0140000020000__0000009C1E3799F0 000000067F000080000007C014000001C5BB-000000067F000080000007C0140000023298__0000009B51A8BBB9-0000009BCB4E4461 000000067F000080000007C0140000020000-000000067F000080000007C0140000024000__0000009C1E3799F0 000000067F000080000007C0140000023298-000000067F000080000007C0140000029F9A__0000009B51A8BBB9-0000009BCB4E4461 000000067F000080000007C0140000024000-000000067F000080000007C0140000028000__0000009C1E3799F0 000000067F000080000007C0140000027B5E-030000000000000000000000000000000002__0000009BCB4E4461-0000009C1E8CC879 000000067F000080000007C0140000028000-000000067F000080000007C014000002C000__0000009C1E3799F0 000000067F000080000007C0140000029F9A-030000000000000000000000000000000002__0000009B51A8BBB9-0000009BCB4E4461 000000067F000080000007C014000002C000-030000000000000000000000000000000002__0000009C1E3799F0 000000067F000080000007E00C0000000000-000000067F000080000007E00C0000004000__0000009DEF760000 000000067F000080000007E00C0000004000-000000067F000080000007E00C0000008000__0000009DEF760000 000000067F000080000007E00C0000008000-000000067F000080000007E00C000000C000__0000009DEF760000 000000067F000080000007E00C00000092CD-000000067F000080000007E00C0000012A0A__0000009C1E8CC879-0000009C9ED3F059 000000067F000080000007E00C000000C000-000000067F000080000007E00C0000010000__0000009DEF760000 000000067F000080000007E00C0000010000-000000067F000080000007E00C0000014000__0000009DEF760000 000000067F000080000007E00C0000012A0A-000000067F000080000007E00C000001C170__0000009C1E8CC879-0000009C9ED3F059 000000067F000080000007E00C0000014000-000000067F000080000007E00C0000018000__0000009DEF760000 000000067F000080000007E00C0000018000-000000067F000080000007E00C000001C000__0000009DEF760000 
000000067F000080000007E00C000001C000-000000067F000080000007E00C0000020000__0000009DEF760000 000000067F000080000007E00C000001C170-000000067F000080000007E00C00000258D6__0000009C1E8CC879-0000009C9ED3F059 000000067F000080000007E00C0000020000-000000067F000080000007E00C0000024000__0000009DEF760000 000000067F000080000007E00C0000024000-000000067F000080000007E00C0000028000__0000009DEF760000 000000067F000080000007E00C00000258D6-000000067F000080000007E00C000002F00B__0000009C1E8CC879-0000009C9ED3F059 000000067F000080000007E00C0000028000-000000067F000080000007E00C000002C000__0000009DEF760000 000000067F000080000007E00C000002C000-000000067F000080000007E00C0000030000__0000009DEF760000 000000067F000080000007E00C000002F00B-000000067F000080000007E00C0000038720__0000009C1E8CC879-0000009C9ED3F059 000000067F000080000007E00C0000030000-000000067F000080000007E00C0000034000__0000009DEF760000 000000067F000080000007E00C0000034000-000000067F000080000007E00C0000038000__0000009DEF760000 000000067F000080000007E00C0000038000-000000067F000080000007E00C000003C000__0000009DEF760000 000000067F000080000007E00C0000038720-000000067F000080000007E00C0000041E86__0000009C1E8CC879-0000009C9ED3F059 000000067F000080000007E00C000003C000-000000067F000080000007E00C0000040000__0000009DEF760000 000000067F000080000007E00C0000040000-000000067F000080000007E00C0000044000__0000009DEF760000 000000067F000080000007E00C0000041E86-000000067F000080000007E00C000004B5EC__0000009C1E8CC879-0000009C9ED3F059 000000067F000080000007E00C0000044000-000000067F000080000007E00C0000048000__0000009DEF760000 000000067F000080000007E00C0000048000-000000067F000080000007E00C000004C000__0000009DDBE10620 000000067F000080000007E00C0000048000-000000067F000080000007E00C000004C000__0000009EBB11FFC0 000000067F000080000007E00C000004B5EC-030000000000000000000000000000000002__0000009C1E8CC879-0000009C9ED3F059 000000067F000080000007E00C000004BACA-000000067F000080000007E00C00000551FF__0000009C9ED3F059-0000009D3E97E549 
000000067F000080000007E00C000004C000-000000067F000080000007E00C0000050000__0000009DDBE10620 000000067F000080000007E00C000004C000-000000067F000080000007E00C0000050000__0000009EBB11FFC0 000000067F000080000007E00C0000050000-000000067F000080000007E00C0000054000__0000009DDBE10620 000000067F000080000007E00C0000050000-000000067F000080000007E00C0000054000__0000009EBB11FFC0 000000067F000080000007E00C0000054000-000000067F000080000007E00C0000058000__0000009DDBE10620 000000067F000080000007E00C0000054000-000000067F000080000007E00C0000058000__0000009EBB11FFC0 000000067F000080000007E00C00000551FF-000000067F000080000007E00C000005E90C__0000009C9ED3F059-0000009D3E97E549 000000067F000080000007E00C0000058000-000000067F000080000007E00C000005C000__0000009DDBE10620 000000067F000080000007E00C0000058000-000000067F000080000007E00C000005C000__0000009EBB11FFC0 000000067F000080000007E00C000005C000-000000067F000080000007E00C0000060000__0000009DDBE10620 000000067F000080000007E00C000005C000-000000067F000080000007E00C0000060000__0000009EBB11FFC0 000000067F000080000007E00C000005E90C-000000067F000080000007E00C000006802C__0000009C9ED3F059-0000009D3E97E549 000000067F000080000007E00C0000060000-000000067F000080000007E00C0000064000__0000009DDBE10620 000000067F000080000007E00C0000060000-000000067F000080000007E00C0000064000__0000009EBB11FFC0 000000067F000080000007E00C0000061AE1-000000067F000080000007E00C00000C2A6C__0000009E781A9731-0000009EBBC72771 000000067F000080000007E00C0000064000-000000067F000080000007E00C0000068000__0000009DDBE10620 000000067F000080000007E00C0000064000-000000067F000080000007E00C0000068000__0000009EBB11FFC0 000000067F000080000007E00C0000068000-000000067F000080000007E00C000006C000__0000009DDBE10620 000000067F000080000007E00C0000068000-000000067F000080000007E00C000006C000__0000009EBB11FFC0 000000067F000080000007E00C000006802C-000000067F000080000007E00C0000071783__0000009C9ED3F059-0000009D3E97E549 000000067F000080000007E00C000006C000-000000067F000080000007E00C0000070000__0000009DDBE10620 
000000067F000080000007E00C000006C000-000000067F000080000007E00C0000070000__0000009EBB11FFC0 000000067F000080000007E00C0000070000-000000067F000080000007E00C0000074000__0000009DDBE10620 000000067F000080000007E00C0000070000-000000067F000080000007E00C0000074000__0000009EBB11FFC0 000000067F000080000007E00C0000071783-000000067F000080000007E00C000007AEE9__0000009C9ED3F059-0000009D3E97E549 000000067F000080000007E00C0000074000-000000067F000080000007E00C0000078000__0000009DDBE10620 000000067F000080000007E00C0000074000-000000067F000080000007E00C0000078000__0000009EBB11FFC0 000000067F000080000007E00C0000078000-000000067F000080000007E00C000007C000__0000009DDBE10620 000000067F000080000007E00C0000078000-000000067F000080000007E00C000007C000__0000009EBB11FFC0 000000067F000080000007E00C000007AEE9-000000067F000080000007E00C000008460B__0000009C9ED3F059-0000009D3E97E549 000000067F000080000007E00C000007C000-000000067F000080000007E00C0000080000__0000009DDBE10620 000000067F000080000007E00C000007C000-000000067F000080000007E00C0000080000__0000009EBB11FFC0 000000067F000080000007E00C0000080000-000000067F000080000007E00C0000084000__0000009DDBE10620 000000067F000080000007E00C0000080000-000000067F000080000007E00C0000084000__0000009EBB11FFC0 000000067F000080000007E00C0000084000-000000067F000080000007E00C0000088000__0000009DDBE10620 000000067F000080000007E00C0000084000-000000067F000080000007E00C0000088000__0000009EBB11FFC0 000000067F000080000007E00C000008460B-000000067F000080000007E00C000008DD71__0000009C9ED3F059-0000009D3E97E549 000000067F000080000007E00C0000088000-000000067F000080000007E00C000008C000__0000009DDBE10620 000000067F000080000007E00C0000088000-000000067F000080000007E00C000008C000__0000009EBB11FFC0 000000067F000080000007E00C000008C000-000000067F000080000007E00C0000090000__0000009DDBE10620 000000067F000080000007E00C000008C000-000000067F000080000007E00C0000090000__0000009EBB11FFC0 000000067F000080000007E00C000008DD71-000000067F000080000007E00C00000974D7__0000009C9ED3F059-0000009D3E97E549 
000000067F000080000007E00C0000090000-000000067F000080000007E00C0000094000__0000009DDBE10620 000000067F000080000007E00C0000090000-000000067F000080000007E00C0000094000__0000009EBB11FFC0 000000067F000080000007E00C0000093E3A-000000067F000080000007E00C0000111CED__0000009DEEE6BFF9-0000009E781A9731 000000067F000080000007E00C0000094000-000000067F000080000007E00C0000098000__0000009DDBE10620 000000067F000080000007E00C0000094000-000000067F000080000007E00C0000098000__0000009EBB11FFC0 000000067F000080000007E00C00000974D7-000000067F000080000007E00C00000A0C0B__0000009C9ED3F059-0000009D3E97E549 000000067F000080000007E00C0000098000-000000067F000080000007E00C000009C000__0000009DDBE10620 000000067F000080000007E00C0000098000-000000067F000080000007E00C000009C000__0000009EBB11FFC0 000000067F000080000007E00C000009C000-000000067F000080000007E00C00000A0000__0000009DDBE10620 000000067F000080000007E00C000009C000-000000067F000080000007E00C00000A0000__0000009EBB11FFC0 000000067F000080000007E00C00000A0000-000000067F000080000007E00C00000A4000__0000009DDBE10620 000000067F000080000007E00C00000A0000-000000067F000080000007E00C00000A4000__0000009EBB11FFC0 000000067F000080000007E00C00000A0C0B-000000067F000080000007E00C00000AA371__0000009C9ED3F059-0000009D3E97E549 000000067F000080000007E00C00000A4000-000000067F000080000007E00C00000A8000__0000009DDBE10620 000000067F000080000007E00C00000A4000-000000067F000080000007E00C00000A8000__0000009EBB11FFC0 000000067F000080000007E00C00000A8000-000000067F000080000007E00C00000AC000__0000009DDBE10620 000000067F000080000007E00C00000A8000-000000067F000080000007E00C00000AC000__0000009EBB11FFC0 000000067F000080000007E00C00000AA371-000000067F000080000007E00C0100000000__0000009C9ED3F059-0000009D3E97E549 000000067F000080000007E00C00000AC000-000000067F000080000007E00C00000B0000__0000009DDBE10620 000000067F000080000007E00C00000AC000-000000067F000080000007E00C00000B0000__0000009EBB11FFC0 000000067F000080000007E00C00000B0000-000000067F000080000007E00C00000B4000__0000009DDBE10620 
000000067F000080000007E00C00000B0000-000000067F000080000007E00C00000B4000__0000009EBB11FFC0 000000067F000080000007E00C00000B2704-000000067F000080000007E00C00000BBE0F__0000009D3E97E549-0000009DEEE6BFF9 000000067F000080000007E00C00000B4000-000000067F000080000007E00C00000B8000__0000009DDBE10620 000000067F000080000007E00C00000B4000-000000067F000080000007E00C00000B8000__0000009EBB11FFC0 000000067F000080000007E00C00000B8000-000000067F000080000007E00C00000BC000__0000009DDBE10620 000000067F000080000007E00C00000B8000-000000067F000080000007E00C00000BC000__0000009EBB11FFC0 000000067F000080000007E00C00000BBE0F-000000067F000080000007E00C00000C5542__0000009D3E97E549-0000009DEEE6BFF9 000000067F000080000007E00C00000BC000-000000067F000080000007E00C00000C0000__0000009DDBE10620 000000067F000080000007E00C00000BC000-000000067F000080000007E00C00000C0000__0000009EBB11FFC0 000000067F000080000007E00C00000C0000-000000067F000080000007E00C00000C4000__0000009DDBE10620 000000067F000080000007E00C00000C0000-000000067F000080000007E00C00000C4000__0000009EBB11FFC0 000000067F000080000007E00C00000C2A75-000000067F000080000007E0140000004415__0000009E781A9731-0000009EBBC72771 000000067F000080000007E00C00000C4000-000000067F000080000007E00C00000C8000__0000009DDBE10620 000000067F000080000007E00C00000C4000-000000067F000080000007E00C00000C8000__0000009EBB11FFC0 000000067F000080000007E00C00000C5542-000000067F000080000007E00C00000CECA8__0000009D3E97E549-0000009DEEE6BFF9 000000067F000080000007E00C00000C8000-000000067F000080000007E00C00000CC000__0000009DDBE10620 000000067F000080000007E00C00000C8000-000000067F000080000007E00C00000CC000__0000009EBB11FFC0 000000067F000080000007E00C00000CC000-000000067F000080000007E00C00000D0000__0000009DDBE10620 000000067F000080000007E00C00000CC000-000000067F000080000007E00C00000D0000__0000009EBB11FFC0 000000067F000080000007E00C00000CECA8-000000067F000080000007E00C00000D83BF__0000009D3E97E549-0000009DEEE6BFF9 
000000067F000080000007E00C00000D0000-000000067F000080000007E00C00000D4000__0000009DDBE10620 000000067F000080000007E00C00000D0000-000000067F000080000007E00C00000D4000__0000009EBB11FFC0 000000067F000080000007E00C00000D4000-000000067F000080000007E00C00000D8000__0000009DDBE10620 000000067F000080000007E00C00000D4000-000000067F000080000007E00C00000D8000__0000009EBB11FFC0 000000067F000080000007E00C00000D8000-000000067F000080000007E00C00000DC000__0000009DDBE10620 000000067F000080000007E00C00000D8000-000000067F000080000007E00C00000DC000__0000009EBB11FFC0 000000067F000080000007E00C00000D83BF-000000067F000080000007E00C00000E1B0A__0000009D3E97E549-0000009DEEE6BFF9 000000067F000080000007E00C00000DC000-000000067F000080000007E00C00000E0000__0000009DDBE10620 000000067F000080000007E00C00000DC000-000000067F000080000007E00C00000E0000__0000009EBB11FFC0 000000067F000080000007E00C00000E0000-000000067F000080000007E00C00000E4000__0000009DDBE10620 000000067F000080000007E00C00000E0000-000000067F000080000007E00C00000E4000__0000009EBB11FFC0 000000067F000080000007E00C00000E1B0A-000000067F000080000007E00C00000EB270__0000009D3E97E549-0000009DEEE6BFF9 000000067F000080000007E00C00000E4000-000000067F000080000007E00C00000E8000__0000009DDBE10620 000000067F000080000007E00C00000E4000-000000067F000080000007E00C00000E8000__0000009EBB11FFC0 000000067F000080000007E00C00000E8000-000000067F000080000007E00C00000EC000__0000009DDBE10620 000000067F000080000007E00C00000E8000-000000067F000080000007E00C00000EC000__0000009EBB11FFC0 000000067F000080000007E00C00000EB270-000000067F000080000007E00C00000F49AA__0000009D3E97E549-0000009DEEE6BFF9 000000067F000080000007E00C00000EC000-000000067F000080000007E00C00000F0000__0000009DDBE10620 000000067F000080000007E00C00000EC000-000000067F000080000007E00C00000F0000__0000009EBB11FFC0 000000067F000080000007E00C00000F0000-000000067F000080000007E00C00000F4000__0000009DDBE10620 000000067F000080000007E00C00000F0000-000000067F000080000007E00C00000F4000__0000009EBB11FFC0 
000000067F000080000007E00C00000F4000-000000067F000080000007E00C00000F8000__0000009DDBE10620 000000067F000080000007E00C00000F4000-000000067F000080000007E00C00000F8000__0000009EBB11FFC0 000000067F000080000007E00C00000F49AA-000000067F000080000007E00C00000FE10A__0000009D3E97E549-0000009DEEE6BFF9 000000067F000080000007E00C00000F8000-000000067F000080000007E00C00000FC000__0000009DDBE10620 000000067F000080000007E00C00000F8000-000000067F000080000007E00C00000FC000__0000009EBB11FFC0 000000067F000080000007E00C00000FC000-000000067F000080000007E00C0000100000__0000009DDBE10620 000000067F000080000007E00C00000FC000-000000067F000080000007E00C0000100000__0000009EBB11FFC0 000000067F000080000007E00C00000FE10A-000000067F000080000007E00C000010782C__0000009D3E97E549-0000009DEEE6BFF9 000000067F000080000007E00C0000100000-000000067F000080000007E00C0000104000__0000009DDBE10620 000000067F000080000007E00C0000100000-000000067F000080000007E00C0000104000__0000009EBB11FFC0 000000067F000080000007E00C0000104000-000000067F000080000007E00C0000108000__0000009EBB11FFC0 000000067F000080000007E00C0000104000-030000000000000000000000000000000002__0000009DDBE10620 000000067F000080000007E00C000010782C-000000067F000080000007E00C0000110F88__0000009D3E97E549-0000009DEEE6BFF9 000000067F000080000007E00C0000108000-000000067F000080000007E00C000010C000__0000009EBB11FFC0 000000067F000080000007E00C000010C000-000000067F000080000007E00C0000110000__0000009EBB11FFC0 000000067F000080000007E00C0000110000-000000067F000080000007E0120100000000__0000009EBB11FFC0 000000067F000080000007E00C0000110F88-010000000000000001000000040000000015__0000009D3E97E549-0000009DEEE6BFF9 000000067F000080000007E00C0000111CED-000000067F000080000007E0140000004818__0000009DEEE6BFF9-0000009E781A9731 000000067F000080000007E0140000000000-000000067F000080000007E0140000004000__0000009EBB11FFC0 000000067F000080000007E0140000004000-000000067F000080000007E0140000008000__0000009EBB11FFC0 
000000067F000080000007E0140000004418-000000067F000080000007E0140000025351__0000009E781A9731-0000009EBBC72771 000000067F000080000007E0140000004818-000000067F000080000007E014000000AD57__0000009DEEE6BFF9-0000009E781A9731 000000067F000080000007E0140000008000-000000067F000080000007E014000000C000__0000009EBB11FFC0 000000067F000080000007E014000000AD57-000000067F000080000007E0140000011291__0000009DEEE6BFF9-0000009E781A9731 000000067F000080000007E014000000C000-000000067F000080000007E0140000010000__0000009EBB11FFC0 000000067F000080000007E0140000010000-000000067F000080000007E0140000014000__0000009EBB11FFC0 000000067F000080000007E0140000011291-000000067F000080000007E0140000017809__0000009DEEE6BFF9-0000009E781A9731 000000067F000080000007E0140000014000-000000067F000080000007E0140000018000__0000009EBB11FFC0 000000067F000080000007E0140000017809-000000067F000080000007E014000001DD22__0000009DEEE6BFF9-0000009E781A9731 000000067F000080000007E0140000018000-000000067F000080000007E014000001C000__0000009EBB11FFC0 000000067F000080000007E014000001C000-000000067F000080000007E0140000020000__0000009EBB11FFC0 000000067F000080000007E014000001DD22-000000067F000080000007E0140000024244__0000009DEEE6BFF9-0000009E781A9731 000000067F000080000007E0140000020000-000000067F000080000007E0140000024000__0000009EBB11FFC0 000000067F000080000007E0140000024000-000000067F000080000007E0140000028000__0000009EBB11FFC0 000000067F000080000007E0140000024244-000000067F000080000007E014000002A798__0000009DEEE6BFF9-0000009E781A9731 000000067F000080000007E0140000025355-030000000000000000000000000000000002__0000009E781A9731-0000009EBBC72771 000000067F000080000007E0140000028000-000000067F000080000007E014000002C000__0000009EBB11FFC0 000000067F000080000007E014000002A798-030000000000000000000000000000000002__0000009DEEE6BFF9-0000009E781A9731 000000067F000080000007E014000002C000-030000000000000000000000000000000002__0000009EBB11FFC0 
000000067F000080000008000C00000081F6-000000067F000080000008000C0000010448__0000009EBBC72771-000000A154401909 000000067F000080000008000C0000010448-000000067F000080000008000C000001870A__0000009EBBC72771-000000A154401909 000000067F000080000008000C000001870A-000000067F000080000008000C0000020905__0000009EBBC72771-000000A154401909 000000067F000080000008000C0000020905-000000067F000080000008000C0000028AF3__0000009EBBC72771-000000A154401909 000000067F000080000008000C0000028AF3-000000067F000080000008000C0000030CEA__0000009EBBC72771-000000A154401909 000000067F000080000008000C0000030CEA-000000067F000080000008000C0000038EB6__0000009EBBC72771-000000A154401909 000000067F000080000008000C0000038EB6-000000067F000080000008000C00000410B5__0000009EBBC72771-000000A154401909 000000067F000080000008000C00000410B5-000000067F000080000008000C00000492CB__0000009EBBC72771-000000A154401909 000000067F000080000008000C00000492CB-000000067F000080000008000C00000514F8__0000009EBBC72771-000000A154401909 000000067F000080000008000C00000514F8-000000067F000080000008000C000005977B__0000009EBBC72771-000000A154401909 000000067F000080000008000C000005977B-000000067F000080000008000C00000619C6__0000009EBBC72771-000000A154401909 000000067F000080000008000C00000619C6-000000067F000080000008000C0000069B6B__0000009EBBC72771-000000A154401909 000000067F000080000008000C0000069B6B-000000067F000080000008000C0000071DBE__0000009EBBC72771-000000A154401909 000000067F000080000008000C0000071DBE-000000067F000080000008000C0000079F8E__0000009EBBC72771-000000A154401909 000000067F000080000008000C0000079F8E-000000067F000080000008000C00000821D7__0000009EBBC72771-000000A154401909 000000067F000080000008000C00000821D7-000000067F000080000008000C000008A3AB__0000009EBBC72771-000000A154401909 000000067F000080000008000C000008A3AB-000000067F000080000008000C0000092556__0000009EBBC72771-000000A154401909 000000067F000080000008000C0000092556-000000067F000080000008000C000009A744__0000009EBBC72771-000000A154401909 
000000067F000080000008000C000009A744-000000067F000080000008000C00000A29B0__0000009EBBC72771-000000A154401909 000000067F000080000008000C00000A29B0-000000067F000080000008000C00000AAC4B__0000009EBBC72771-000000A154401909 000000067F000080000008000C00000AAC4B-000000067F000080000008000C00000B2E21__0000009EBBC72771-000000A154401909 000000067F000080000008000C00000B2E21-000000067F000080000008000C00000BB0DB__0000009EBBC72771-000000A154401909 000000067F000080000008000C00000BB0DB-000000067F000080000008000C00000C331B__0000009EBBC72771-000000A154401909 000000067F000080000008000C00000C331B-000000067F000080000008000C00000CB4D2__0000009EBBC72771-000000A154401909 000000067F000080000008000C00000CB4D2-000000067F000080000008000C00000D3754__0000009EBBC72771-000000A154401909 000000067F000080000008000C00000D3754-000000067F000080000008000C00000DB9C6__0000009EBBC72771-000000A154401909 000000067F000080000008000C00000DB9C6-000000067F000080000008000C00000E3BC1__0000009EBBC72771-000000A154401909 000000067F000080000008000C00000E3BC1-000000067F000080000008000C00000EBE00__0000009EBBC72771-000000A154401909 000000067F000080000008000C00000EBE00-000000067F000080000008000C00000F3F63__0000009EBBC72771-000000A154401909 000000067F000080000008000C00000F3F63-000000067F000080000008000C00000FC160__0000009EBBC72771-000000A154401909 000000067F000080000008000C00000FC160-000000067F000080000008000C0000104448__0000009EBBC72771-000000A154401909 000000067F000080000008000C0000104448-000000067F000080000008000C000010C675__0000009EBBC72771-000000A154401909 000000067F000080000008000C000010C675-000000067F000080000008000C020000000B__0000009EBBC72771-000000A154401909 000000067F000080000008000C020000000B-000000067F00008000000800140000003ED1__0000009EBBC72771-000000A154401909 000000067F00008000000800140000003ED1-000000067F00008000000800140000009486__0000009EBBC72771-000000A154401909 000000067F00008000000800140000009486-000000067F0000800000080014000000EA73__0000009EBBC72771-000000A154401909 
000000067F0000800000080014000000EA73-000000067F0000800000080014000001404D__0000009EBBC72771-000000A154401909 000000067F0000800000080014000001404D-000000067F000080000008001400000195A4__0000009EBBC72771-000000A154401909 000000067F000080000008001400000195A4-000000067F0000800000080014000001EBB4__0000009EBBC72771-000000A154401909 000000067F0000800000080014000001EBB4-000000067F000080000008001400000241E2__0000009EBBC72771-000000A154401909 000000067F000080000008001400000241E2-000000067F00008000000800140000029762__0000009EBBC72771-000000A154401909 000000067F00008000000800140000029762-030000000000000000000000000000000002__0000009EBBC72771-000000A154401909 000000067F000080000008200C0000000000-000000067F000080000008200C0000004000__000000A29F1D8950 000000067F000080000008200C0000004000-000000067F000080000008200C0000008000__000000A29F1D8950 000000067F000080000008200C0000008000-000000067F000080000008200C000000C000__000000A29F1D8950 000000067F000080000008200C000000974D-000000067F000080000008200C0000012EB3__000000A154401909-000000A1E407F839 000000067F000080000008200C000000C000-000000067F000080000008200C0000010000__000000A29F1D8950 000000067F000080000008200C0000010000-000000067F000080000008200C0000014000__000000A29F1D8950 000000067F000080000008200C0000012EB3-000000067F000080000008200C000001C60A__000000A154401909-000000A1E407F839 000000067F000080000008200C0000014000-000000067F000080000008200C0000018000__000000A29F1D8950 000000067F000080000008200C0000018000-000000067F000080000008200C000001C000__000000A29F1D8950 000000067F000080000008200C000001C000-000000067F000080000008200C0000020000__000000A29F1D8950 000000067F000080000008200C000001C60A-000000067F000080000008200C0000025D38__000000A154401909-000000A1E407F839 000000067F000080000008200C0000020000-000000067F000080000008200C0000024000__000000A29F1D8950 000000067F000080000008200C0000024000-000000067F000080000008200C0000028000__000000A29F1D8950 
000000067F000080000008200C0000025D38-000000067F000080000008200C000002F49E__000000A154401909-000000A1E407F839 000000067F000080000008200C0000028000-000000067F000080000008200C000002C000__000000A29F1D8950 000000067F000080000008200C000002C000-000000067F000080000008200C0000030000__000000A29F1D8950 000000067F000080000008200C000002F49E-000000067F000080000008200C0000038BB1__000000A154401909-000000A1E407F839 000000067F000080000008200C0000030000-000000067F000080000008200C0000034000__000000A29F1D8950 000000067F000080000008200C0000034000-000000067F000080000008200C0000038000__000000A29F1D8950 000000067F000080000008200C0000038000-000000067F000080000008200C000003C000__000000A29F1D8950 000000067F000080000008200C0000038BB1-000000067F000080000008200C0000042317__000000A154401909-000000A1E407F839 000000067F000080000008200C000003C000-000000067F000080000008200C0000040000__000000A29F1D8950 000000067F000080000008200C0000040000-000000067F000080000008200C0000044000__000000A29F1D8950 000000067F000080000008200C0000042317-000000067F000080000008200C000004BA7D__000000A154401909-000000A1E407F839 000000067F000080000008200C0000044000-000000067F000080000008200C0000048000__000000A29F1D8950 000000067F000080000008200C0000048000-000000067F000080000008200C000004C000__000000A29F1D8950 000000067F000080000008200C000004BA7D-000000067F000080000008200C00000551B2__000000A154401909-000000A1E407F839 000000067F000080000008200C000004C000-000000067F000080000008200C0000050000__000000A29F1D8950 000000067F000080000008200C0000050000-000000067F000080000008200C0000054000__000000A29F1D8950 000000067F000080000008200C0000054000-000000067F000080000008200C0000058000__000000A29F1D8950 000000067F000080000008200C00000551B2-030000000000000000000000000000000002__000000A154401909-000000A1E407F839 000000067F000080000008200C0000058000-000000067F000080000008200C000005C000__000000A29F1D8950 000000067F000080000008200C000005C000-000000067F000080000008200C0000060000__000000A29F1D8950 
000000067F000080000008200C000005D8FE-000000067F000080000008200C000006700C__000000A1E407F839-000000A323C9E001 000000067F000080000008200C0000060000-000000067F000080000008200C0000064000__000000A29F1D8950 000000067F000080000008200C0000064000-000000067F000080000008200C0000068000__000000A29F1D8950 000000067F000080000008200C000006700C-000000067F000080000008200C000007076D__000000A1E407F839-000000A323C9E001 000000067F000080000008200C0000068000-000000067F000080000008200C000006C000__000000A29F1D8950 000000067F000080000008200C000006C000-000000067F000080000008200C0000070000__000000A29F1D8950 000000067F000080000008200C0000070000-000000067F000080000008200C0000074000__000000A29F1D8950 000000067F000080000008200C000007076D-000000067F000080000008200C0000079ED3__000000A1E407F839-000000A323C9E001 000000067F000080000008200C0000074000-000000067F000080000008200C0000078000__000000A29F1D8950 000000067F000080000008200C0000078000-000000067F000080000008200C000007C000__000000A29F1D8950 000000067F000080000008200C0000079ED3-000000067F000080000008200C000008360A__000000A1E407F839-000000A323C9E001 000000067F000080000008200C000007C000-000000067F000080000008200C0000080000__000000A29F1D8950 000000067F000080000008200C0000080000-000000067F000080000008200C0000084000__000000A29F1D8950 000000067F000080000008200C000008360A-000000067F000080000008200C000008CD70__000000A1E407F839-000000A323C9E001 000000067F000080000008200C0000084000-000000067F000080000008200C0000088000__000000A29F1D8950 000000067F000080000008200C0000088000-000000067F000080000008200C000008C000__000000A29F1D8950 000000067F000080000008200C000008C000-000000067F000080000008200C0000090000__000000A29F1D8950 000000067F000080000008200C000008CD70-000000067F000080000008200C00000964D6__000000A1E407F839-000000A323C9E001 000000067F000080000008200C0000090000-000000067F000080000008200C0000094000__000000A29F1D8950 000000067F000080000008200C0000094000-000000067F000080000008200C0000098000__000000A29F1D8950 
000000067F000080000008200C00000964D6-000000067F000080000008200C000009FC0B__000000A1E407F839-000000A323C9E001 000000067F000080000008200C0000098000-000000067F000080000008200C000009C000__000000A29F1D8950 000000067F000080000008200C000009C000-000000067F000080000008200C00000A0000__000000A29F1D8950 000000067F000080000008200C000009FC0B-000000067F000080000008200C00000A9319__000000A1E407F839-000000A323C9E001 000000067F000080000008200C00000A0000-000000067F000080000008200C00000A4000__000000A29F1D8950 000000067F000080000008200C00000A4000-000000067F000080000008200C00000A8000__000000A29F1D8950 000000067F000080000008200C00000A8000-000000067F000080000008200C00000AC000__000000A29F1D8950 000000067F000080000008200C00000A9319-000000067F000080000008200C00000B2A7F__000000A1E407F839-000000A323C9E001 000000067F000080000008200C00000AC000-000000067F000080000008200C00000B0000__000000A29F1D8950 000000067F000080000008200C00000B0000-000000067F000080000008200C00000B4000__000000A29F1D8950 000000067F000080000008200C00000B2A7F-000000067F000080000008200C00000BC1E5__000000A1E407F839-000000A323C9E001 000000067F000080000008200C00000B4000-000000067F000080000008200C00000B8000__000000A29F1D8950 000000067F000080000008200C00000B8000-000000067F000080000008200C00000BC000__000000A29F1D8950 000000067F000080000008200C00000BC000-000000067F000080000008200C00000C0000__000000A29F1D8950 000000067F000080000008200C00000BC1E5-000000067F000080000008200C00000C590C__000000A1E407F839-000000A323C9E001 000000067F000080000008200C00000C0000-010000000000000000000000000000000001__000000A29F1D8950 000000067F000080000008200C00000C590C-000000067F000080000008200C00000CF071__000000A1E407F839-000000A323C9E001 000000067F000080000008200C00000CF071-000000067F000080000008200C00000D8786__000000A1E407F839-000000A323C9E001 000000067F000080000008200C00000D8786-000000067F000080000008200C00000E1EEC__000000A1E407F839-000000A323C9E001 000000067F000080000008200C00000E1EEC-000000067F000080000008200C00000EB60C__000000A1E407F839-000000A323C9E001 
000000067F000080000008200C00000EB60C-000000067F000080000008200C00000F4D43__000000A1E407F839-000000A323C9E001 000000067F000080000008200C00000F4D43-000000067F000080000008200C00000FE4A9__000000A1E407F839-000000A323C9E001 000000067F000080000008200C00000FE4A9-000000067F000080000008200C0000107BC5__000000A1E407F839-000000A323C9E001 000000067F000080000008200C0000107BC5-000000067F000080000008200C000011130B__000000A1E407F839-000000A323C9E001 000000067F000080000008200C000011130B-01000000000000000100000004000000001C__000000A1E407F839-000000A323C9E001 000000067F0000800000082014000000393C-000000067F0000800000082014000000B84D__000000A323C9E001-000000A37A60B1A9 000000067F0000800000082014000000B84D-000000067F0000800000082014000001375E__000000A323C9E001-000000A37A60B1A9 000000067F0000800000082014000001375E-000000067F0000800000082014000001B66D__000000A323C9E001-000000A37A60B1A9 000000067F0000800000082014000001B66D-000000067F0000800000082014000002357E__000000A323C9E001-000000A37A60B1A9 000000067F0000800000082014000002357E-000000067F0000800000082014000002B48D__000000A323C9E001-000000A37A60B1A9 000000067F0000800000082014000002B48D-030000000000000000000000000000000002__000000A323C9E001-000000A37A60B1A9 000000067F000080000008600C0000000000-000000067F000080000008600C0000004000__000000A434813A68 000000067F000080000008600C0000004000-000000067F000080000008600C0000008000__000000A434813A68 000000067F000080000008600C0000008000-000000067F000080000008600C000000C000__000000A434813A68 000000067F000080000008600C0000009747-000000067F000080000008600C0000012EAD__000000A37A60B1A9-000000A3CA47ECA9 000000067F000080000008600C000000C000-000000067F000080000008600C0000010000__000000A434813A68 000000067F000080000008600C0000010000-000000067F000080000008600C0000014000__000000A434813A68 000000067F000080000008600C0000012EAD-000000067F000080000008600C000001C60A__000000A37A60B1A9-000000A3CA47ECA9 000000067F000080000008600C0000014000-000000067F000080000008600C0000018000__000000A434813A68 
000000067F000080000008600C0000018000-000000067F000080000008600C000001C000__000000A434813A68 000000067F000080000008600C000001C000-000000067F000080000008600C0000020000__000000A434813A68 000000067F000080000008600C000001C60A-000000067F000080000008600C0000025D38__000000A37A60B1A9-000000A3CA47ECA9 000000067F000080000008600C0000020000-000000067F000080000008600C0000024000__000000A434813A68 000000067F000080000008600C0000024000-000000067F000080000008600C0000028000__000000A434813A68 000000067F000080000008600C0000025D38-000000067F000080000008600C000002F49E__000000A37A60B1A9-000000A3CA47ECA9 000000067F000080000008600C0000028000-000000067F000080000008600C000002C000__000000A434813A68 000000067F000080000008600C000002C000-000000067F000080000008600C0000030000__000000A434813A68 000000067F000080000008600C000002F49E-030000000000000000000000000000000002__000000A37A60B1A9-000000A3CA47ECA9 000000067F000080000008600C000002F4CA-000000067F000080000008600C0000038BDD__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C0000030000-000000067F000080000008600C0000034000__000000A434813A68 000000067F000080000008600C0000034000-000000067F000080000008600C0000038000__000000A434813A68 000000067F000080000008600C0000038000-000000067F000080000008600C000003C000__000000A434813A68 000000067F000080000008600C0000038BDD-000000067F000080000008600C000004230B__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C000003C000-000000067F000080000008600C0000040000__000000A434813A68 000000067F000080000008600C0000040000-000000067F000080000008600C0000044000__000000A434813A68 000000067F000080000008600C000004230B-000000067F000080000008600C000004BA71__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C0000044000-000000067F000080000008600C0000048000__000000A434813A68 000000067F000080000008600C0000048000-000000067F000080000008600C000004C000__000000A434813A68 000000067F000080000008600C000004BA71-000000067F000080000008600C00000551A6__000000A3CA47ECA9-000000A539BDE561 
000000067F000080000008600C000004C000-000000067F000080000008600C0000050000__000000A434813A68 000000067F000080000008600C0000050000-000000067F000080000008600C0000054000__000000A434813A68 000000067F000080000008600C0000054000-000000067F000080000008600C0000058000__000000A434813A68 000000067F000080000008600C00000551A6-000000067F000080000008600C000005E90A__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C0000058000-000000067F000080000008600C000005C000__000000A434813A68 000000067F000080000008600C000005C000-000000067F000080000008600C0000060000__000000A434813A68 000000067F000080000008600C000005E90A-000000067F000080000008600C000006802C__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C0000060000-000000067F000080000008600C0000064000__000000A434813A68 000000067F000080000008600C0000064000-000000067F000080000008600C0000068000__000000A434813A68 000000067F000080000008600C0000068000-000000067F000080000008600C000006C000__000000A434813A68 000000067F000080000008600C000006802C-000000067F000080000008600C0000071783__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C000006C000-030000000000000000000000000000000002__000000A434813A68 000000067F000080000008600C0000071783-000000067F000080000008600C000007AEE9__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C000007AEE9-000000067F000080000008600C000008460B__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C000008460B-000000067F000080000008600C000008DD71__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C000008DD71-000000067F000080000008600C00000974D7__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C00000974D7-000000067F000080000008600C00000A0C0B__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C00000A0C0B-000000067F000080000008600C00000AA371__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C00000AA371-000000067F000080000008600C00000B3AD7__000000A3CA47ECA9-000000A539BDE561 
000000067F000080000008600C00000B3AD7-000000067F000080000008600C00000BD20B__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C00000BD20B-000000067F000080000008600C00000C6932__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C00000C6932-000000067F000080000008600C00000D0098__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C00000D0098-000000067F000080000008600C00000D97FE__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C00000D97FE-000000067F000080000008600C00000E2F0B__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C00000E2F0B-000000067F000080000008600C00000EC671__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C00000EC671-000000067F000080000008600C00000F5D9F__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C00000F5D9F-000000067F000080000008600C00000FF505__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C00000FF505-000000067F000080000008600C0000108C10__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C0000108C10-000000067F000080000008600C0100000000__000000A3CA47ECA9-000000A539BDE561 000000067F000080000008600C000010ECC4-000000067F00008000000860140000002607__000000A539BDE561-000000A5A081B661 000000067F00008000000860140000002607-000000067F0000800000086014000000A518__000000A539BDE561-000000A5A081B661 000000067F0000800000086014000000A518-000000067F00008000000860140000012429__000000A539BDE561-000000A5A081B661 000000067F00008000000860140000012429-000000067F0000800000086014000001A338__000000A539BDE561-000000A5A081B661 000000067F0000800000086014000001A338-000000067F00008000000860140000022249__000000A539BDE561-000000A5A081B661 000000067F00008000000860140000022249-000000067F0000800000086014000002A159__000000A539BDE561-000000A5A081B661 000000067F0000800000086014000002A159-030000000000000000000000000000000002__000000A539BDE561-000000A5A081B661 000000067F000080000008801C0000009703-000000067F000080000008801C0000012E0E__000000A5A081B661-000000A6503DE919 
000000067F000080000008801C0000012E0E-000000067F000080000008801C000001C574__000000A5A081B661-000000A6503DE919 000000067F000080000008801C000001C574-000000067F000080000008801C0000025CDA__000000A5A081B661-000000A6503DE919 000000067F000080000008801C0000025CDA-000000067F000080000008801C000002F40A__000000A5A081B661-000000A6503DE919 000000067F000080000008801C000002F40A-000000067F000080000008801C0000038B1D__000000A5A081B661-000000A6503DE919 000000067F000080000008801C0000038B1D-000000067F000080000008801C0000042283__000000A5A081B661-000000A6503DE919 000000067F000080000008801C0000042283-000000067F000080000008801C000004B9E9__000000A5A081B661-000000A6503DE919 000000067F000080000008801C000004B9E9-000000067F000080000008801C000005510B__000000A5A081B661-000000A6503DE919 000000067F000080000008801C000005510B-000000067F000080000008801C000005E871__000000A5A081B661-000000A6503DE919 000000067F000080000008801C000005E871-000000067F000080000008801C0000067F8B__000000A5A081B661-000000A6503DE919 000000067F000080000008801C0000067F8B-030000000000000000000000000000000002__000000A5A081B661-000000A6503DE919 000000067F000080000008801C0000068000-000000067F000080000008801C000006C000__000000A76EC5DFE8 000000067F000080000008801C00000680F7-000000067F000080000008801C000007180C__000000A6503DE919-000000A6F001F909 000000067F000080000008801C000006C000-000000067F000080000008801C0000070000__000000A76EC5DFE8 000000067F000080000008801C0000070000-000000067F000080000008801C0000074000__000000A76EC5DFE8 000000067F000080000008801C000007180C-000000067F000080000008801C000007AF72__000000A6503DE919-000000A6F001F909 000000067F000080000008801C0000074000-000000067F000080000008801C0000078000__000000A76EC5DFE8 000000067F000080000008801C0000078000-000000067F000080000008801C000007C000__000000A76F097A80 000000067F000080000008801C000007AF72-000000067F000080000008801C00000846D8__000000A6503DE919-000000A6F001F909 000000067F000080000008801C000007C000-000000067F000080000008801C0000080000__000000A76F097A80 
000000067F000080000008801C0000080000-000000067F000080000008801C0000084000__000000A76F097A80 000000067F000080000008801C0000084000-000000067F000080000008801C0000088000__000000A76F097A80 000000067F000080000008801C00000846D8-000000067F000080000008801C000008DE0B__000000A6503DE919-000000A6F001F909 000000067F000080000008801C0000088000-000000067F000080000008801C000008C000__000000A76F097A80 000000067F000080000008801C000008C000-000000067F000080000008801C0000090000__000000A76F097A80 000000067F000080000008801C000008DE0B-000000067F000080000008801C000009752B__000000A6503DE919-000000A6F001F909 000000067F000080000008801C0000090000-000000067F000080000008801C0000094000__000000A76F097A80 000000067F000080000008801C0000094000-000000067F000080000008801C0000098000__000000A76F097A80 000000067F000080000008801C000009752B-000000067F000080000008801C00000A0C91__000000A6503DE919-000000A6F001F909 000000067F000080000008801C0000098000-000000067F000080000008801C000009C000__000000A76F097A80 000000067F000080000008801C000009C000-000000067F000080000008801C00000A0000__000000A76F097A80 000000067F000080000008801C00000A0000-000000067F000080000008801C00000A4000__000000A76F097A80 000000067F000080000008801C00000A0C91-000000067F000080000008801C00000AA3F7__000000A6503DE919-000000A6F001F909 000000067F000080000008801C00000A4000-000000067F000080000008801C00000A8000__000000A76F097A80 000000067F000080000008801C00000A8000-000000067F000080000008801C00000AC000__000000A76F097A80 000000067F000080000008801C00000AA3F7-000000067F000080000008801C00000B3B0C__000000A6503DE919-000000A6F001F909 000000067F000080000008801C00000AC000-000000067F000080000008801C00000B0000__000000A76F097A80 000000067F000080000008801C00000B0000-000000067F000080000008801C00000B4000__000000A76F097A80 000000067F000080000008801C00000B3B0C-000000067F000080000008801C00000BD272__000000A6503DE919-000000A6F001F909 000000067F000080000008801C00000B4000-000000067F000080000008801C00000B8000__000000A76F097A80 
000000067F000080000008801C00000B8000-000000067F000080000008801C00000BC000__000000A76F097A80 000000067F000080000008801C00000BC000-000000067F000080000008801C00000C0000__000000A76F097A80 000000067F000080000008801C00000BD272-000000067F000080000008801C00000C6999__000000A6503DE919-000000A6F001F909 000000067F000080000008801C00000C0000-000000067F000080000008801C00000C4000__000000A76F097A80 000000067F000080000008801C00000C4000-000000067F000080000008801C00000C8000__000000A76F097A80 000000067F000080000008801C00000C6999-000000067F000080000008801C0100000000__000000A6503DE919-000000A6F001F909 000000067F000080000008801C00000C8000-000000067F000080000008801C00000CC000__000000A76F097A80 000000067F000080000008801C00000CC000-000000067F000080000008801C00000D0000__000000A76F097A80 000000067F000080000008801C00000CF6B0-000000067F000080000008801C00000D8DC1__000000A6F001F909-000000A91D97FD49 000000067F000080000008801C00000D0000-000000067F000080000008801C00000D4000__000000A76F097A80 000000067F000080000008801C00000D4000-000000067F000080000008801C00000D8000__000000A76F097A80 000000067F000080000008801C00000D8000-000000067F000080000008801C00000DC000__000000A76F097A80 000000067F000080000008801C00000D8DC1-000000067F000080000008801C00000E250B__000000A6F001F909-000000A91D97FD49 000000067F000080000008801C00000DC000-000000067F000080000008801C00000E0000__000000A76F097A80 000000067F000080000008801C00000E0000-000000067F000080000008801C00000E4000__000000A76F097A80 000000067F000080000008801C00000E250B-000000067F000080000008801C00000EBC71__000000A6F001F909-000000A91D97FD49 000000067F000080000008801C00000E4000-000000067F000080000008801C00000E8000__000000A76F097A80 000000067F000080000008801C00000E8000-000000067F000080000008801C00000EC000__000000A76F097A80 000000067F000080000008801C00000EBC71-000000067F000080000008801C00000F53A5__000000A6F001F909-000000A91D97FD49 000000067F000080000008801C00000EC000-000000067F000080000008801C00000F0000__000000A76F097A80 
000000067F000080000008801C00000F0000-000000067F000080000008801C00000F4000__000000A76F097A80 000000067F000080000008801C00000F4000-000000067F000080000008801C00000F8000__000000A76F097A80 000000067F000080000008801C00000F53A5-000000067F000080000008801C00000FEB0B__000000A6F001F909-000000A91D97FD49 000000067F000080000008801C00000F8000-000000067F000080000008801C00000FC000__000000A76F097A80 000000067F000080000008801C00000FC000-000000067F000080000008801C0000100000__000000A76F097A80 000000067F000080000008801C00000FEB0B-000000067F000080000008801C000010822C__000000A6F001F909-000000A91D97FD49 000000067F000080000008801C0000100000-000000067F000080000008801C0000104000__000000A76F097A80 000000067F000080000008801C0000104000-000000067F000080000008801C0000108000__000000A76F097A80 000000067F000080000008801C0000108000-000000067F000080000008801C000010C000__000000A76F097A80 000000067F000080000008801C000010822C-000000067F000080000008801C0000111982__000000A6F001F909-000000A91D97FD49 000000067F000080000008801C000010C000-000000067F000080000008801C0000110000__000000A76F097A80 000000067F000080000008801C0000110000-030000000000000000000000000000000002__000000A76F097A80 000000067F000080000008801C0000111982-000000067F000080000008A00C00000084EA__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C00000084EA-000000067F000080000008A00C0000011C0C__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C0000011C0C-000000067F000080000008A00C000001B372__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C000001B372-000000067F000080000008A00C0000024AD8__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C0000024AD8-000000067F000080000008A00C000002E20B__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C000002E20B-000000067F000080000008A00C0000037928__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C0000037928-000000067F000080000008A00C000004108E__000000A6F001F909-000000A91D97FD49 
000000067F000080000008A00C000004108E-000000067F000080000008A00C000004A7F4__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C000004A7F4-000000067F000080000008A00C0000053F0B__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C0000053F0B-000000067F000080000008A00C000005D671__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C000005D671-000000067F000080000008A00C0000066D95__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C0000066D95-000000067F000080000008A00C00000704FB__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C00000704FB-000000067F000080000008A00C0000079C0B__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C0000079C0B-000000067F000080000008A00C0000083351__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C0000083351-000000067F000080000008A00C000008CAB7__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C000008CAB7-000000067F000080000008A00C00000961E2__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C00000961E2-000000067F000080000008A00C000009F90B__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C000009F90B-000000067F000080000008A00C00000A902B__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C00000A902B-000000067F000080000008A00C00000B2779__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C00000B2779-000000067F000080000008A00C00000BBEDF__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C00000BBEDF-000000067F000080000008A00C00000C560A__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C00000C560A-000000067F000080000008A00C00000CED70__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C00000CED70-000000067F000080000008A00C00000D84D6__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C00000D84D6-000000067F000080000008A00C00000E1C0A__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C00000E1C0A-000000067F000080000008A00C00000EB370__000000A6F001F909-000000A91D97FD49 
000000067F000080000008A00C00000EB370-000000067F000080000008A00C00000F4AD6__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C00000F4AD6-000000067F000080000008A00C00000FE20B__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C00000FE20B-030000000000000000000000000000000002__000000A6F001F909-000000A91D97FD49 000000067F000080000008A00C0000104A0C-000000067F000080000008A00C000010DF6E__000000A91D97FD49-000000A98AB7EE49 000000067F000080000008A00C000010DF6E-000000067F000080000008A0140000001A21__000000A91D97FD49-000000A98AB7EE49 000000067F000080000008A0140000001A21-000000067F000080000008A0140000009932__000000A91D97FD49-000000A98AB7EE49 000000067F000080000008A0140000009932-000000067F000080000008A0140000011843__000000A91D97FD49-000000A98AB7EE49 000000067F000080000008A0140000011843-000000067F000080000008A0140000019753__000000A91D97FD49-000000A98AB7EE49 000000067F000080000008A0140000019753-000000067F000080000008A0140000021664__000000A91D97FD49-000000A98AB7EE49 000000067F000080000008A0140000021664-01000000000000000100000004000000001C__000000A91D97FD49-000000A98AB7EE49 000000067F000080000008C00C0000000000-000000067F000080000008C00C0000004000__000000AAEBE534F8 000000067F000080000008C00C0000002330-000000067F000080000008C00C000000BA96__000000A98AB7EE49-000000AA2597E9A1 000000067F000080000008C00C0000004000-000000067F000080000008C00C0000008000__000000AAEBE534F8 000000067F000080000008C00C0000008000-000000067F000080000008C00C000000C000__000000AAEBE534F8 000000067F000080000008C00C000000BA96-000000067F000080000008C00C00000151CB__000000A98AB7EE49-000000AA2597E9A1 000000067F000080000008C00C000000C000-000000067F000080000008C00C0000010000__000000AAEBE534F8 000000067F000080000008C00C0000010000-000000067F000080000008C00C0000014000__000000AAEBE534F8 000000067F000080000008C00C0000014000-000000067F000080000008C00C0000018000__000000AAEBE534F8 000000067F000080000008C00C00000151CB-000000067F000080000008C00C000001E90B__000000A98AB7EE49-000000AA2597E9A1 
000000067F000080000008C00C0000018000-000000067F000080000008C00C000001C000__000000AAEBE534F8 000000067F000080000008C00C000001C000-000000067F000080000008C00C0000020000__000000AAEBE534F8 000000067F000080000008C00C000001E90B-000000067F000080000008C00C000002802C__000000A98AB7EE49-000000AA2597E9A1 000000067F000080000008C00C0000020000-000000067F000080000008C00C0000024000__000000AAEBE534F8 000000067F000080000008C00C0000024000-000000067F000080000008C00C0000028000__000000AAEBE534F8 000000067F000080000008C00C0000028000-000000067F000080000008C00C000002C000__000000AAEBE534F8 000000067F000080000008C00C000002802C-000000067F000080000008C00C0000031783__000000A98AB7EE49-000000AA2597E9A1 000000067F000080000008C00C000002C000-000000067F000080000008C00C0000030000__000000AAEBE534F8 000000067F000080000008C00C0000030000-000000067F000080000008C00C0000034000__000000AAEBE534F8 000000067F000080000008C00C0000031783-000000067F000080000008C00C000003AEE9__000000A98AB7EE49-000000AA2597E9A1 000000067F000080000008C00C0000034000-000000067F000080000008C00C0000038000__000000AAEBE534F8 000000067F000080000008C00C0000038000-000000067F000080000008C00C000003C000__000000AAEBE534F8 000000067F000080000008C00C000003AEE9-000000067F000080000008C00C000004460B__000000A98AB7EE49-000000AA2597E9A1 000000067F000080000008C00C000003C000-000000067F000080000008C00C0000040000__000000AAEBE534F8 000000067F000080000008C00C0000040000-000000067F000080000008C00C0000044000__000000AAEBE534F8 000000067F000080000008C00C0000044000-000000067F000080000008C00C0000048000__000000AAEBE534F8 000000067F000080000008C00C000004460B-000000067F000080000008C00C000004DD71__000000A98AB7EE49-000000AA2597E9A1 000000067F000080000008C00C0000048000-000000067F000080000008C00C000004C000__000000AAEBE534F8 000000067F000080000008C00C000004C000-000000067F000080000008C00C0000050000__000000AAEBE534F8 000000067F000080000008C00C000004DD71-030000000000000000000000000000000002__000000A98AB7EE49-000000AA2597E9A1 
000000067F000080000008C00C0000050000-000000067F000080000008C00C0000054000__000000AAEBE534F8 000000067F000080000008C00C0000054000-000000067F000080000008C00C0000058000__000000AAEBE534F8 000000067F000080000008C00C0000058000-000000067F000080000008C00C000005C000__000000AAEBE534F8 000000067F000080000008C00C000005C000-000000067F000080000008C00C0000060000__000000AAEBE534F8 000000067F000080000008C00C000005DA8C-000000067F000080000008C00C00000671AE__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C0000060000-000000067F000080000008C00C0000064000__000000AAEBE534F8 000000067F000080000008C00C0000064000-000000067F000080000008C00C0000068000__000000AAEBE534F8 000000067F000080000008C00C00000671AE-000000067F000080000008C00C000007090A__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C0000068000-000000067F000080000008C00C000006C000__000000AAEBE534F8 000000067F000080000008C00C000006C000-000000067F000080000008C00C0000070000__000000AAEBE534F8 000000067F000080000008C00C0000070000-000000067F000080000008C00C0000074000__000000AAEBE534F8 000000067F000080000008C00C000007090A-000000067F000080000008C00C000007A070__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C0000074000-000000067F000080000008C00C0000078000__000000AAEBE534F8 000000067F000080000008C00C0000078000-000000067F000080000008C00C000007C000__000000AAEBE534F8 000000067F000080000008C00C000007A070-000000067F000080000008C00C00000837B4__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C000007C000-000000067F000080000008C00C0000080000__000000AAEBE534F8 000000067F000080000008C00C0000080000-000000067F000080000008C00C0000084000__000000AAEBE534F8 000000067F000080000008C00C00000837B4-000000067F000080000008C00C000008CF0A__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C0000084000-000000067F000080000008C00C0000088000__000000AAEBE534F8 000000067F000080000008C00C0000088000-000000067F000080000008C00C000008C000__000000AAEBE534F8 
000000067F000080000008C00C000008C000-000000067F000080000008C00C0000090000__000000AAEBE534F8 000000067F000080000008C00C000008CF0A-000000067F000080000008C00C0000096670__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C0000090000-000000067F000080000008C00C0000094000__000000AAEBE534F8 000000067F000080000008C00C0000094000-000000067F000080000008C00C0000098000__000000AAEBE534F8 000000067F000080000008C00C0000096670-000000067F000080000008C00C000009FDD6__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C0000098000-000000067F000080000008C00C000009C000__000000AAEBE534F8 000000067F000080000008C00C000009C000-000000067F000080000008C00C00000A0000__000000AAEBE534F8 000000067F000080000008C00C000009FDD6-000000067F000080000008C00C00000A952A__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C00000A0000-000000067F000080000008C00C00000A4000__000000AAEBE534F8 000000067F000080000008C00C00000A4000-000000067F000080000008C00C00000A8000__000000AAEBE534F8 000000067F000080000008C00C00000A8000-000000067F000080000008C00C00000AC000__000000AAEBE534F8 000000067F000080000008C00C00000A952A-000000067F000080000008C00C00000B2C90__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C00000AC000-000000067F000080000008C00C00000B0000__000000AAEBE534F8 000000067F000080000008C00C00000B0000-000000067F000080000008C00C00000B4000__000000AAEBE534F8 000000067F000080000008C00C00000B2C90-000000067F000080000008C00C00000BC3F6__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C00000B4000-000000067F000080000008C00C00000B8000__000000AAEBE534F8 000000067F000080000008C00C00000B8000-000000067F000080000008C00C00000BC000__000000AAEBE534F8 000000067F000080000008C00C00000BC000-000000067F000080000008C00C00000C0000__000000AAEBE534F8 000000067F000080000008C00C00000BC3F6-000000067F000080000008C00C00000C5B0C__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C00000C0000-000000067F000080000008C00C00000C4000__000000AAEBE534F8 
000000067F000080000008C00C00000C4000-000000067F000080000008C00C00000C8000__000000AAEBE534F8 000000067F000080000008C00C00000C5B0C-000000067F000080000008C00C00000CF272__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C00000C8000-030000000000000000000000000000000002__000000AAEBE534F8 000000067F000080000008C00C00000CF272-000000067F000080000008C00C00000D8986__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C00000D8986-000000067F000080000008C00C00000E20EC__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C00000E20EC-000000067F000080000008C00C00000EB80A__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C00000EB80A-000000067F000080000008C00C00000F4F40__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C00000F4F40-000000067F000080000008C00C00000FE6A6__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C00000FE6A6-000000067F000080000008C00C0000107DC1__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C0000107DC1-000000067F000080000008C00C000011150A__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008C00C000011150A-01000000000000000100000004000000001C__000000AA2597E9A1-000000AB6533BFD9 000000067F000080000008E00C0000000000-000000067F000080000008E00C0000004000__000000AD3698E000 000000067F000080000008E00C0000004000-000000067F000080000008E00C0000008000__000000AD3698E000 000000067F000080000008E00C00000077B3-000000067F000080000008E00C0000010F0A__000000AB6533BFD9-000000ABF63DF511 000000067F000080000008E00C0000008000-000000067F000080000008E00C000000C000__000000AD3698E000 000000067F000080000008E00C000000C000-000000067F000080000008E00C0000010000__000000AD3698E000 000000067F000080000008E00C0000010000-000000067F000080000008E00C0000014000__000000AD3698E000 000000067F000080000008E00C0000010F0A-000000067F000080000008E00C000001A670__000000AB6533BFD9-000000ABF63DF511 000000067F000080000008E00C0000014000-000000067F000080000008E00C0000018000__000000AD3698E000 
000000067F000080000008E00C0000018000-000000067F000080000008E00C000001C000__000000AD3698E000 000000067F000080000008E00C000001A670-000000067F000080000008E00C0000023DB1__000000AB6533BFD9-000000ABF63DF511 000000067F000080000008E00C000001C000-000000067F000080000008E00C0000020000__000000AD3698E000 000000067F000080000008E00C0000020000-000000067F000080000008E00C0000024000__000000AD3698E000 000000067F000080000008E00C0000023DB1-000000067F000080000008E00C000002D50A__000000AB6533BFD9-000000ABF63DF511 000000067F000080000008E00C0000024000-000000067F000080000008E00C0000028000__000000AD3698E000 000000067F000080000008E00C0000028000-000000067F000080000008E00C000002C000__000000AD3698E000 000000067F000080000008E00C000002C000-000000067F000080000008E00C0000030000__000000AD3698E000 000000067F000080000008E00C000002D50A-000000067F000080000008E00C0000036C30__000000AB6533BFD9-000000ABF63DF511 000000067F000080000008E00C0000030000-000000067F000080000008E00C0000034000__000000AD3698E000 000000067F000080000008E00C0000034000-000000067F000080000008E00C0000038000__000000AD3698E000 000000067F000080000008E00C0000036C30-000000067F000080000008E00C0000040393__000000AB6533BFD9-000000ABF63DF511 000000067F000080000008E00C0000038000-000000067F000080000008E00C000003C000__000000AD3698E000 000000067F000080000008E00C000003C000-000000067F000080000008E00C0000040000__000000AD3698E000 000000067F000080000008E00C0000040000-000000067F000080000008E00C0000044000__000000AD3698E000 000000067F000080000008E00C0000040393-000000067F000080000008E00C0000049AF9__000000AB6533BFD9-000000ABF63DF511 000000067F000080000008E00C0000044000-000000067F000080000008E00C0000048000__000000AD3698E000 000000067F000080000008E00C0000048000-000000067F000080000008E00C000004C000__000000AD3698E000 000000067F000080000008E00C0000049AF9-000000067F000080000008E00C000005320C__000000AB6533BFD9-000000ABF63DF511 000000067F000080000008E00C000004C000-000000067F000080000008E00C0000050000__000000AD3698E000 
000000067F000080000008E00C0000050000-000000067F000080000008E00C0000054000__000000AD3698E000 000000067F000080000008E00C000005320C-030000000000000000000000000000000002__000000AB6533BFD9-000000ABF63DF511 000000067F000080000008E00C0000054000-000000067F000080000008E00C0000058000__000000AD34AF7FD8 000000067F000080000008E00C000005523E-000000067F000080000008E00C000005E9A4__000000ABF63DF511-000000AC9601EA19 000000067F000080000008E00C0000058000-000000067F000080000008E00C000005C000__000000AD34AF7FD8 000000067F000080000008E00C000005C000-000000067F000080000008E00C0000060000__000000AD34AF7FD8 000000067F000080000008E00C000005E9A4-000000067F000080000008E00C000006810A__000000ABF63DF511-000000AC9601EA19 000000067F000080000008E00C0000060000-000000067F000080000008E00C0000064000__000000AD34AF7FD8 000000067F000080000008E00C0000064000-000000067F000080000008E00C0000068000__000000AD34AF7FD8 000000067F000080000008E00C0000068000-000000067F000080000008E00C000006C000__000000AD34AF7FD8 000000067F000080000008E00C000006810A-000000067F000080000008E00C0000071870__000000ABF63DF511-000000AC9601EA19 000000067F000080000008E00C000006C000-000000067F000080000008E00C0000070000__000000AD34AF7FD8 000000067F000080000008E00C0000070000-000000067F000080000008E00C0000074000__000000AD34AF7FD8 000000067F000080000008E00C0000071870-000000067F000080000008E00C000007AFD6__000000ABF63DF511-000000AC9601EA19 000000067F000080000008E00C0000074000-000000067F000080000008E00C0000078000__000000AD34AF7FD8 000000067F000080000008E00C0000078000-000000067F000080000008E00C000007C000__000000AD34AF7FD8 000000067F000080000008E00C000007AFD6-000000067F000080000008E00C000008470B__000000ABF63DF511-000000AC9601EA19 000000067F000080000008E00C000007C000-000000067F000080000008E00C0000080000__000000AD34AF7FD8 000000067F000080000008E00C0000080000-000000067F000080000008E00C0000084000__000000AD34AF7FD8 000000067F000080000008E00C0000084000-000000067F000080000008E00C0000088000__000000AD34AF7FD8 
000000067F000080000008E00C000008470B-000000067F000080000008E00C000008DE71__000000ABF63DF511-000000AC9601EA19 000000067F000080000008E00C0000088000-000000067F000080000008E00C000008C000__000000AD34AF7FD8 000000067F000080000008E00C000008C000-000000067F000080000008E00C0000090000__000000AD34AF7FD8 000000067F000080000008E00C000008DE71-000000067F000080000008E00C0000097591__000000ABF63DF511-000000AC9601EA19 000000067F000080000008E00C0000090000-000000067F000080000008E00C0000094000__000000AD34AF7FD8 000000067F000080000008E00C0000094000-000000067F000080000008E00C0000098000__000000AD34AF7FD8 000000067F000080000008E00C0000097591-000000067F000080000008E00C00000A0CF7__000000ABF63DF511-000000AC9601EA19 000000067F000080000008E00C0000098000-000000067F000080000008E00C000009C000__000000AD34AF7FD8 000000067F000080000008E00C000009C000-000000067F000080000008E00C00000A0000__000000AD34AF7FD8 000000067F000080000008E00C00000A0000-000000067F000080000008E00C00000A4000__000000AD34AF7FD8 000000067F000080000008E00C00000A0CF7-000000067F000080000008E00C00000AA40B__000000ABF63DF511-000000AC9601EA19 000000067F000080000008E00C00000A4000-000000067F000080000008E00C00000A8000__000000AD34AF7FD8 000000067F000080000008E00C00000A8000-000000067F000080000008E00C00000AC000__000000AD34AF7FD8 000000067F000080000008E00C00000AA40B-000000067F000080000008E00C00000B3B4D__000000ABF63DF511-000000AC9601EA19 000000067F000080000008E00C00000AC000-000000067F000080000008E00C00000B0000__000000AD34AF7FD8 000000067F000080000008E00C00000B0000-000000067F000080000008E00C00000B4000__000000AD34AF7FD8 000000067F000080000008E00C00000B3B4D-000000067F000080000008E00C0100000000__000000ABF63DF511-000000AC9601EA19 000000067F000080000008E00C00000B4000-000000067F000080000008E00C00000B8000__000000AD34AF7FD8 000000067F000080000008E00C00000B8000-000000067F000080000008E00C00000BC000__000000AD34AF7FD8 000000067F000080000008E00C00000BC000-000000067F000080000008E00C00000C0000__000000AD34AF7FD8 
000000067F000080000008E00C00000BC018-000000067F000080000008E00C00000C5749__000000AC9601EA19-000000AD36393FE9 000000067F000080000008E00C00000C0000-000000067F000080000008E00C00000C4000__000000AD34AF7FD8 000000067F000080000008E00C00000C4000-000000067F000080000008E00C00000C8000__000000AD34AF7FD8 000000067F000080000008E00C00000C5749-000000067F000080000008E00C00000CEEAF__000000AC9601EA19-000000AD36393FE9 000000067F000080000008E00C00000C8000-000000067F000080000008E00C00000CC000__000000AD34AF7FD8 000000067F000080000008E00C00000CC000-000000067F000080000008E00C00000D0000__000000AD34AF7FD8 000000067F000080000008E00C00000CEEAF-000000067F000080000008E00C00000D85C5__000000AC9601EA19-000000AD36393FE9 000000067F000080000008E00C00000D0000-000000067F000080000008E00C00000D4000__000000AD34AF7FD8 000000067F000080000008E00C00000D4000-000000067F000080000008E00C00000D8000__000000AD34AF7FD8 000000067F000080000008E00C00000D8000-000000067F000080000008E00C00000DC000__000000AD34AF7FD8 000000067F000080000008E00C00000D85C5-000000067F000080000008E00C00000E1D0B__000000AC9601EA19-000000AD36393FE9 000000067F000080000008E00C00000DC000-000000067F000080000008E00C00000E0000__000000AD34AF7FD8 000000067F000080000008E00C00000E0000-000000067F000080000008E00C00000E4000__000000AD34AF7FD8 000000067F000080000008E00C00000E1D0B-000000067F000080000008E00C00000EB471__000000AC9601EA19-000000AD36393FE9 000000067F000080000008E00C00000E4000-000000067F000080000008E00C00000E8000__000000AD34AF7FD8 000000067F000080000008E00C00000E8000-000000067F000080000008E00C00000EC000__000000AD34AF7FD8 000000067F000080000008E00C00000EB471-000000067F000080000008E00C00000F4BAA__000000AC9601EA19-000000AD36393FE9 000000067F000080000008E00C00000EC000-000000067F000080000008E00C00000F0000__000000AD34AF7FD8 000000067F000080000008E00C00000F0000-000000067F000080000008E00C00000F4000__000000AD34AF7FD8 000000067F000080000008E00C00000F4000-000000067F000080000008E00C00000F8000__000000AD34AF7FD8 
000000067F000080000008E00C00000F4BAA-000000067F000080000008E00C00000FE30A__000000AC9601EA19-000000AD36393FE9 000000067F000080000008E00C00000F8000-000000067F000080000008E00C00000FC000__000000AD34AF7FD8 000000067F000080000008E00C00000FC000-000000067F000080000008E00C0000100000__000000AD34AF7FD8 000000067F000080000008E00C00000FE30A-000000067F000080000008E00C0000107A2C__000000AC9601EA19-000000AD36393FE9 000000067F000080000008E00C0000100000-000000067F000080000008E00C0000104000__000000AD34AF7FD8 000000067F000080000008E00C0000104000-000000067F000080000008E00C0000108000__000000AD34AF7FD8 000000067F000080000008E00C0000107A2C-000000067F000080000008E00C0000111187__000000AC9601EA19-000000AD36393FE9 000000067F000080000008E00C0000108000-000000067F000080000008E00C000010C000__000000AD34AF7FD8 000000067F000080000008E00C000010C000-000000067F000080000008E00C0000110000__000000AD34AF7FD8 000000067F000080000008E00C0000110000-030000000000000000000000000000000002__000000AD34AF7FD8 000000067F000080000008E00C0000111187-01000000000000000100000004000000001C__000000AC9601EA19-000000AD36393FE9 000000067F000080000008E0140000003E33-000000067F000080000008E014000000BD44__000000AD36393FE9-000000ADB047EAB9 000000067F000080000008E014000000BD44-000000067F000080000008E0140000013C54__000000AD36393FE9-000000ADB047EAB9 000000067F000080000008E0140000013C54-000000067F000080000008E014000001BB63__000000AD36393FE9-000000ADB047EAB9 000000067F000080000008E014000001BB63-000000067F000080000008E0140000023A74__000000AD36393FE9-000000ADB047EAB9 000000067F000080000008E0140000023A74-000000067F000080000008E014000002B984__000000AD36393FE9-000000ADB047EAB9 000000067F000080000008E014000002B984-000000067F000080000008E0220000006AD0__000000AD36393FE9-000000ADB047EAB9 000000067F000080000008E0220000000000-000000067F000080000008E0220000004000__000000AF5D7D4000 000000067F000080000008E0220000004000-000000067F000080000008E0220000008000__000000AF5D7D4000 
000000067F000080000008E0220000006AD0-000000067F000080000008E022000001020C__000000AD36393FE9-000000ADB047EAB9 000000067F000080000008E0220000008000-000000067F000080000008E022000000C000__000000AF5D7D4000 000000067F000080000008E022000000C000-000000067F000080000008E0220000010000__000000AF5D7D4000 000000067F000080000008E0220000010000-000000067F000080000008E0220000014000__000000AF5D7D4000 000000067F000080000008E022000001020C-01000000000000000100000004000000001C__000000AD36393FE9-000000ADB047EAB9 000000067F000080000008E0220000014000-000000067F000080000008E0220000018000__000000AF56604248 000000067F000080000008E02200000151DD-000000067F000080000008E022000001E90B__000000ADB047EAB9-000000AE6FFFE799 000000067F000080000008E0220000018000-000000067F000080000008E022000001C000__000000AF56604248 000000067F000080000008E022000001C000-000000067F000080000008E0220000020000__000000AF56604248 000000067F000080000008E022000001E90B-000000067F000080000008E022000002802C__000000ADB047EAB9-000000AE6FFFE799 000000067F000080000008E0220000020000-000000067F000080000008E0220000024000__000000AF56604248 000000067F000080000008E0220000024000-000000067F000080000008E0220000028000__000000AF56604248 000000067F000080000008E0220000028000-000000067F000080000008E022000002C000__000000AF56604248 000000067F000080000008E022000002802C-000000067F000080000008E0220000031783__000000ADB047EAB9-000000AE6FFFE799 000000067F000080000008E022000002C000-000000067F000080000008E0220000030000__000000AF56604248 000000067F000080000008E0220000030000-000000067F000080000008E0220000034000__000000AF56604248 000000067F000080000008E0220000031783-000000067F000080000008E022000003AEE9__000000ADB047EAB9-000000AE6FFFE799 000000067F000080000008E0220000034000-000000067F000080000008E0220000038000__000000AF56604248 000000067F000080000008E0220000038000-000000067F000080000008E022000003C000__000000AF56604248 000000067F000080000008E022000003AEE9-000000067F000080000008E022000004460B__000000ADB047EAB9-000000AE6FFFE799 
000000067F000080000008E022000003C000-000000067F000080000008E0220000040000__000000AF56604248 000000067F000080000008E0220000040000-000000067F000080000008E0220000044000__000000AF56604248 000000067F000080000008E0220000044000-000000067F000080000008E0220000048000__000000AF56604248 000000067F000080000008E022000004460B-000000067F000080000008E022000004DD71__000000ADB047EAB9-000000AE6FFFE799 000000067F000080000008E0220000048000-000000067F000080000008E022000004C000__000000AF56604248 000000067F000080000008E022000004C000-000000067F000080000008E0220000050000__000000AF56604248 000000067F000080000008E022000004DD71-000000067F000080000008E02200000574D7__000000ADB047EAB9-000000AE6FFFE799 000000067F000080000008E0220000050000-000000067F000080000008E0220000054000__000000AF56604248 000000067F000080000008E0220000054000-000000067F000080000008E0220000058000__000000AF56604248 000000067F000080000008E02200000574D7-000000067F000080000008E0220000060C0B__000000ADB047EAB9-000000AE6FFFE799 000000067F000080000008E0220000058000-000000067F000080000008E022000005C000__000000AF56604248 000000067F000080000008E022000005C000-000000067F000080000008E0220000060000__000000AF56604248 000000067F000080000008E0220000060000-000000067F000080000008E0220000064000__000000AF56604248 000000067F000080000008E0220000060C0B-000000067F000080000008E022000006A371__000000ADB047EAB9-000000AE6FFFE799 000000067F000080000008E0220000064000-000000067F000080000008E0220000068000__000000AF56604248 000000067F000080000008E0220000068000-000000067F000080000008E022000006C000__000000AF56604248 000000067F000080000008E022000006A371-000000067F000080000008E0220000073AD7__000000ADB047EAB9-000000AE6FFFE799 000000067F000080000008E022000006C000-000000067F000080000008E0220000070000__000000AF56604248 000000067F000080000008E0220000070000-000000067F000080000008E0220000074000__000000AF56604248 000000067F000080000008E0220000073AD7-000000067F000080000008E022000007D20B__000000ADB047EAB9-000000AE6FFFE799 
000000067F000080000008E0220000074000-000000067F000080000008E0220000078000__000000AF56604248 000000067F000080000008E0220000078000-000000067F000080000008E022000007C000__000000AF56604248 000000067F000080000008E022000007C000-000000067F000080000008E0220000080000__000000AF56604248 000000067F000080000008E022000007D20B-000000067F000080000008E0220000086932__000000ADB047EAB9-000000AE6FFFE799 000000067F000080000008E0220000080000-000000067F000080000008E0220000084000__000000AF56604248 000000067F000080000008E0220000084000-000000067F000080000008E0220000088000__000000AF56604248 000000067F000080000008E0220000086932-000000067F000080000008E0220100000000__000000ADB047EAB9-000000AE6FFFE799 000000067F000080000008E0220000088000-000000067F000080000008E022000008C000__000000AF56604248 000000067F000080000008E022000008C000-000000067F000080000008E0220000090000__000000AF56604248 000000067F000080000008E022000008E3D1-000000067F000080000008E022000009797E__000000AE6FFFE799-000000AF5D587FE1 000000067F000080000008E0220000090000-000000067F000080000008E0220000094000__000000AF56604248 000000067F000080000008E0220000094000-000000067F000080000008E0220000098000__000000AF56604248 000000067F000080000008E022000009797E-000000067F000080000008E02200000A10E4__000000AE6FFFE799-000000AF5D587FE1 000000067F000080000008E0220000098000-000000067F000080000008E022000009C000__000000AF56604248 000000067F000080000008E022000009C000-000000067F000080000008E02200000A0000__000000AF56604248 000000067F000080000008E02200000A0000-000000067F000080000008E02200000A4000__000000AF56604248 000000067F000080000008E02200000A10E4-000000067F000080000008E02200000AA80B__000000AE6FFFE799-000000AF5D587FE1 000000067F000080000008E02200000A4000-000000067F000080000008E02200000A8000__000000AF56604248 000000067F000080000008E02200000A8000-000000067F000080000008E02200000AC000__000000AF56604248 000000067F000080000008E02200000AA80B-000000067F000080000008E02200000B3F4B__000000AE6FFFE799-000000AF5D587FE1 
000000067F000080000008E02200000AC000-000000067F000080000008E02200000B0000__000000AF56604248 000000067F000080000008E02200000B0000-000000067F000080000008E02200000B4000__000000AF56604248 000000067F000080000008E02200000B3F4B-000000067F000080000008E02200000BD6B1__000000AE6FFFE799-000000AF5D587FE1 000000067F000080000008E02200000B4000-000000067F000080000008E02200000B8000__000000AF56604248 000000067F000080000008E02200000B8000-000000067F000080000008E02200000BC000__000000AF56604248 000000067F000080000008E02200000BC000-000000067F000080000008E02200000C0000__000000AF56604248 000000067F000080000008E02200000BD6B1-000000067F000080000008E02200000C6DD5__000000AE6FFFE799-000000AF5D587FE1 000000067F000080000008E02200000C0000-000000067F000080000008E02200000C4000__000000AF56604248 000000067F000080000008E02200000C4000-000000067F000080000008E02200000C8000__000000AF56604248 000000067F000080000008E02200000C6DD5-000000067F000080000008E02200000D050B__000000AE6FFFE799-000000AF5D587FE1 000000067F000080000008E02200000C8000-000000067F000080000008E02200000CC000__000000AF56604248 000000067F000080000008E02200000CC000-000000067F000080000008E02200000D0000__000000AF56604248 000000067F000080000008E02200000D0000-000000067F000080000008E02200000D4000__000000AF56604248 000000067F000080000008E02200000D050B-000000067F000080000008E02200000D9C71__000000AE6FFFE799-000000AF5D587FE1 000000067F000080000008E02200000D4000-000000067F000080000008E02200000D8000__000000AF56604248 000000067F000080000008E02200000D8000-000000067F000080000008E02200000DC000__000000AF56604248 000000067F000080000008E02200000D9C71-000000067F000080000008E02200000E33B8__000000AE6FFFE799-000000AF5D587FE1 000000067F000080000008E02200000DC000-000000067F000080000008E02200000E0000__000000AF56604248 000000067F000080000008E02200000E0000-000000067F000080000008E02200000E4000__000000AF56604248 000000067F000080000008E02200000E33B8-000000067F000080000008E02200000ECB09__000000AE6FFFE799-000000AF5D587FE1 
000000067F000080000008E02200000E4000-000000067F000080000008E02200000E8000__000000AF56604248 000000067F000080000008E02200000E8000-000000067F000080000008E02200000EC000__000000AF56604248 000000067F000080000008E02200000EC000-000000067F000080000008E02200000F0000__000000AF56604248 000000067F000080000008E02200000ECB09-000000067F000080000008E02200000F626F__000000AE6FFFE799-000000AF5D587FE1 000000067F000080000008E02200000F0000-000000067F000080000008E02200000F4000__000000AF56604248 000000067F000080000008E02200000F4000-000000067F000080000008E02200000F8000__000000AF56604248 000000067F000080000008E02200000F626F-000000067F000080000008E02200000FF9D5__000000AE6FFFE799-000000AF5D587FE1 000000067F000080000008E02200000F8000-000000067F000080000008E02200000FC000__000000AF56604248 000000067F000080000008E02200000FC000-000000067F000080000008E0220000100000__000000AF56604248 000000067F000080000008E02200000FF9D5-000000067F000080000008E022000010912A__000000AE6FFFE799-000000AF5D587FE1 000000067F000080000008E0220000100000-000000067F000080000008E0220000104000__000000AF56604248 000000067F000080000008E0220000104000-000000067F000080000008E0220000108000__000000AF56604248 000000067F000080000008E0220000108000-000000067F000080000008E022000010C000__000000AF56604248 000000067F000080000008E022000010912A-000000067F000080000008E0220000111C20__000000AE6FFFE799-000000AF5D587FE1 000000067F000080000008E022000010C000-030000000000000000000000000000000002__000000AF56604248 000000067F000080000008E02200FFFFFFFF-01000000000000000100000004000000001C__000000AE6FFFE799-000000AF5D587FE1 000000067F000080000008E02A000000529F-000000067F000080000008E02A000000D1B0__000000AF5D587FE1-000000AFB4666001 000000067F000080000008E02A000000D1B0-000000067F000080000008E02A00000150BF__000000AF5D587FE1-000000AFB4666001 000000067F000080000008E02A00000150BF-000000067F000080000008E02A000001CFD0__000000AF5D587FE1-000000AFB4666001 000000067F000080000008E02A000001CFD0-000000067F000080000008E02A0000024EE1__000000AF5D587FE1-000000AFB4666001 
000000067F000080000008E02A0000024EE1-000000067F000080000008E02A000002CDF1__000000AF5D587FE1-000000AFB4666001 000000067F000080000008E02A000002CDF1-030000000000000000000000000000000002__000000AF5D587FE1-000000AFB4666001 000000067F00008000000900380000000000-000000067F0000800000090038000000970B__000000AFB4666001-000000B05429F579 000000067F0000800000090038000000970B-000000067F00008000000900380000012E71__000000AFB4666001-000000B05429F579 000000067F00008000000900380000012E71-000000067F0000800000090038000001C5D7__000000AFB4666001-000000B05429F579 000000067F0000800000090038000001C5D7-000000067F00008000000900380000025D2B__000000AFB4666001-000000B05429F579 000000067F00008000000900380000025D2B-000000067F0000800000090038000002F491__000000AFB4666001-000000B05429F579 000000067F0000800000090038000002F491-000000067F00008000000900380000038BA4__000000AFB4666001-000000B05429F579 000000067F00008000000900380000038BA4-000000067F0000800000090038000004230A__000000AFB4666001-000000B05429F579 000000067F0000800000090038000004230A-000000067F0000800000090038000004BA70__000000AFB4666001-000000B05429F579 000000067F0000800000090038000004BA70-000000067F000080000009003800000551A5__000000AFB4666001-000000B05429F579 000000067F000080000009003800000551A5-000000067F0000800000090038000005E909__000000AFB4666001-000000B05429F579 000000067F0000800000090038000005C000-000000067F00008000000900380000060000__000000B18434BFD0 000000067F0000800000090038000005E909-000000067F000080000009003B0100000000__000000AFB4666001-000000B05429F579 000000067F0000800000090038000005EA0C-000000067F00008000000900380000068125__000000B05429F579-000000B0F3EDEAC9 000000067F00008000000900380000060000-000000067F00008000000900380000064000__000000B18434BFD0 000000067F00008000000900380000064000-000000067F00008000000900380000068000__000000B18434BFD0 000000067F00008000000900380000068000-000000067F0000800000090038000006C000__000000B18434BFD0 
000000067F00008000000900380000068125-000000067F0000800000090038000007188B__000000B05429F579-000000B0F3EDEAC9 000000067F0000800000090038000006C000-000000067F00008000000900380000070000__000000B18434BFD0 000000067F00008000000900380000070000-000000067F00008000000900380000074000__000000B18434BFD0 000000067F0000800000090038000007188B-000000067F0000800000090038000007AFF1__000000B05429F579-000000B0F3EDEAC9 000000067F00008000000900380000074000-000000067F00008000000900380000078000__000000B18434BFD0 000000067F00008000000900380000078000-000000067F0000800000090038000007C000__000000B18434BFD0 000000067F0000800000090038000007AFF1-000000067F0000800000090038000008470C__000000B05429F579-000000B0F3EDEAC9 000000067F0000800000090038000007C000-000000067F00008000000900380000080000__000000B18434BFD0 000000067F00008000000900380000080000-000000067F00008000000900380000084000__000000B18434BFD0 000000067F00008000000900380000084000-000000067F00008000000900380000088000__000000B18434BFD0 000000067F0000800000090038000008470C-000000067F0000800000090038000008DE72__000000B05429F579-000000B0F3EDEAC9 000000067F00008000000900380000088000-000000067F0000800000090038000008C000__000000B18434BFD0 000000067F0000800000090038000008C000-000000067F00008000000900380000090000__000000B18434BFD0 000000067F0000800000090038000008DE72-000000067F00008000000900380000097592__000000B05429F579-000000B0F3EDEAC9 000000067F00008000000900380000090000-000000067F00008000000900380000094000__000000B18434BFD0 000000067F00008000000900380000094000-000000067F00008000000900380000098000__000000B18434BFD0 000000067F00008000000900380000097592-000000067F000080000009003800000A0CF8__000000B05429F579-000000B0F3EDEAC9 000000067F00008000000900380000098000-000000067F0000800000090038000009C000__000000B18434BFD0 000000067F0000800000090038000009C000-000000067F000080000009003800000A0000__000000B18434BFD0 000000067F000080000009003800000A0000-000000067F000080000009003800000A4000__000000B18434BFD0 
000000067F000080000009003800000A0CF8-000000067F000080000009003800000AA40C__000000B05429F579-000000B0F3EDEAC9 000000067F000080000009003800000A4000-000000067F000080000009003800000A8000__000000B18434BFD0 000000067F000080000009003800000A8000-000000067F000080000009003800000AC000__000000B18434BFD0 000000067F000080000009003800000AA40C-000000067F000080000009003800000B3B4E__000000B05429F579-000000B0F3EDEAC9 000000067F000080000009003800000AC000-000000067F000080000009003800000B0000__000000B18434BFD0 000000067F000080000009003800000B0000-000000067F000080000009003800000B4000__000000B18434BFD0 000000067F000080000009003800000B3B4E-000000067F000080000009003800000BD2B4__000000B05429F579-000000B0F3EDEAC9 000000067F000080000009003800000B4000-000000067F000080000009003800000B8000__000000B18434BFD0 000000067F000080000009003800000B8000-000000067F000080000009003800000BC000__000000B18434BFD0 000000067F000080000009003800000BC000-000000067F000080000009003800000C0000__000000B18434BFD0 000000067F000080000009003800000BD2B4-000000067F00008000000900380100000000__000000B05429F579-000000B0F3EDEAC9 000000067F000080000009003800000C0000-000000067F000080000009003800000C4000__000000B18434BFD0 000000067F000080000009003800000C4000-000000067F000080000009003800000C8000__000000B18434BFD0 000000067F000080000009003800000C5213-000000067F000080000009003800000CE979__000000B0F3EDEAC9-000000B18495C001 000000067F000080000009003800000C8000-000000067F000080000009003800000CC000__000000B18434BFD0 000000067F000080000009003800000CC000-000000067F000080000009003800000D0000__000000B18434BFD0 000000067F000080000009003800000CE979-000000067F000080000009003800000D80DF__000000B0F3EDEAC9-000000B18495C001 000000067F000080000009003800000D0000-000000067F000080000009003800000D4000__000000B18434BFD0 000000067F000080000009003800000D4000-000000067F000080000009003800000D8000__000000B18434BFD0 000000067F000080000009003800000D8000-000000067F000080000009003800000DC000__000000B18434BFD0 
000000067F000080000009003800000D80DF-000000067F000080000009003800000E180A__000000B0F3EDEAC9-000000B18495C001 000000067F000080000009003800000DC000-000000067F000080000009003800000E0000__000000B18434BFD0 000000067F000080000009003800000E0000-000000067F000080000009003800000E4000__000000B18434BFD0 000000067F000080000009003800000E180A-000000067F000080000009003800000EAF70__000000B0F3EDEAC9-000000B18495C001 000000067F000080000009003800000E4000-000000067F000080000009003800000E8000__000000B18434BFD0 000000067F000080000009003800000E8000-000000067F000080000009003800000EC000__000000B18434BFD0 000000067F000080000009003800000EAF70-000000067F000080000009003800000F46D6__000000B0F3EDEAC9-000000B18495C001 000000067F000080000009003800000EC000-000000067F000080000009003800000F0000__000000B18434BFD0 000000067F000080000009003800000F0000-000000067F000080000009003800000F4000__000000B18434BFD0 000000067F000080000009003800000F4000-000000067F000080000009003800000F8000__000000B18434BFD0 000000067F000080000009003800000F46D6-000000067F000080000009003800000FDE0B__000000B0F3EDEAC9-000000B18495C001 000000067F000080000009003800000F8000-000000067F000080000009003800000FC000__000000B18434BFD0 000000067F000080000009003800000FC000-000000067F00008000000900380000100000__000000B18434BFD0 000000067F000080000009003800000FDE0B-000000067F0000800000090038000010752B__000000B0F3EDEAC9-000000B18495C001 000000067F00008000000900380000100000-000000067F00008000000900380000104000__000000B18434BFD0 000000067F00008000000900380000104000-000000067F00008000000900380000108000__000000B18434BFD0 000000067F0000800000090038000010752B-000000067F00008000000900380000110C91__000000B0F3EDEAC9-000000B18495C001 000000067F00008000000900380000108000-000000067F0000800000090038000010C000__000000B18434BFD0 000000067F0000800000090038000010C000-000000067F00008000000900380000110000__000000B18434BFD0 000000067F00008000000900380000110000-030000000000000000000000000000000002__000000B18434BFD0 
000000067F00008000000900380000110C91-01000000000000000100000004000000001C__000000B0F3EDEAC9-000000B18495C001 000000067F000080000009004000000047E0-000000067F0000800000090040000000C6F1__000000B18495C001-000000B1FA75F501 000000067F0000800000090040000000C6F1-000000067F00008000000900400000014600__000000B18495C001-000000B1FA75F501 000000067F00008000000900400000014600-000000067F0000800000090040000001C511__000000B18495C001-000000B1FA75F501 000000067F0000800000090040000001C511-000000067F00008000000900400000024421__000000B18495C001-000000B1FA75F501 000000067F00008000000900400000024421-000000067F0000800000090040000002C331__000000B18495C001-000000B1FA75F501 000000067F0000800000090040000002C331-000000067F000080000009200C0000007658__000000B18495C001-000000B1FA75F501 000000067F000080000009200C0000000000-000000067F000080000009200C0000004000__000000B3AC039FE8 000000067F000080000009200C0000004000-000000067F000080000009200C0000008000__000000B3AC039FE8 000000067F000080000009200C0000007658-000000067F000080000009200C0000010DB5__000000B18495C001-000000B1FA75F501 000000067F000080000009200C0000008000-000000067F000080000009200C000000C000__000000B3AC039FE8 000000067F000080000009200C000000C000-000000067F000080000009200C0000010000__000000B3AC039FE8 000000067F000080000009200C0000010000-000000067F000080000009200C0000014000__000000B3A3EC82C8 000000067F000080000009200C0000010DB5-030000000000000000000000000000000002__000000B18495C001-000000B1FA75F501 000000067F000080000009200C0000012E97-000000067F000080000009200C000001C5FD__000000B1FA75F501-000000B2CA27F641 000000067F000080000009200C0000014000-000000067F000080000009200C0000018000__000000B3A3EC82C8 000000067F000080000009200C0000018000-000000067F000080000009200C000001C000__000000B3A3EC82C8 000000067F000080000009200C000001C000-000000067F000080000009200C0000020000__000000B3A3EC82C8 000000067F000080000009200C000001C5FD-000000067F000080000009200C0000025D0C__000000B1FA75F501-000000B2CA27F641 
000000067F000080000009200C0000020000-000000067F000080000009200C0000024000__000000B3A3EC82C8 000000067F000080000009200C0000024000-000000067F000080000009200C0000028000__000000B3A3EC82C8 000000067F000080000009200C0000025D0C-000000067F000080000009200C000002F472__000000B1FA75F501-000000B2CA27F641 000000067F000080000009200C0000028000-000000067F000080000009200C000002C000__000000B3A3EC82C8 000000067F000080000009200C000002C000-000000067F000080000009200C0000030000__000000B3A3EC82C8 000000067F000080000009200C000002F472-000000067F000080000009200C0000038B85__000000B1FA75F501-000000B2CA27F641 000000067F000080000009200C0000030000-000000067F000080000009200C0000034000__000000B3A3EC82C8 000000067F000080000009200C0000034000-000000067F000080000009200C0000038000__000000B3A3EC82C8 000000067F000080000009200C0000038000-000000067F000080000009200C000003C000__000000B3A3EC82C8 000000067F000080000009200C0000038B85-000000067F000080000009200C00000422EB__000000B1FA75F501-000000B2CA27F641 000000067F000080000009200C000003C000-000000067F000080000009200C0000040000__000000B3A3EC82C8 000000067F000080000009200C0000040000-000000067F000080000009200C0000044000__000000B3A3EC82C8 000000067F000080000009200C00000422EB-000000067F000080000009200C000004BA0C__000000B1FA75F501-000000B2CA27F641 000000067F000080000009200C0000044000-000000067F000080000009200C0000048000__000000B3A3EC82C8 000000067F000080000009200C0000048000-000000067F000080000009200C000004C000__000000B3A3EC82C8 000000067F000080000009200C000004BA0C-000000067F000080000009200C0000055141__000000B1FA75F501-000000B2CA27F641 000000067F000080000009200C000004C000-000000067F000080000009200C0000050000__000000B3A3EC82C8 000000067F000080000009200C0000050000-000000067F000080000009200C0000054000__000000B3A3EC82C8 000000067F000080000009200C0000054000-000000067F000080000009200C0000058000__000000B3A3EC82C8 000000067F000080000009200C0000055141-000000067F000080000009200C000005E8A7__000000B1FA75F501-000000B2CA27F641 
000000067F000080000009200C0000058000-000000067F000080000009200C000005C000__000000B3A3EC82C8 000000067F000080000009200C000005C000-000000067F000080000009200C0000060000__000000B3A3EC82C8 000000067F000080000009200C000005E8A7-000000067F000080000009200C0000067FC1__000000B1FA75F501-000000B2CA27F641 000000067F000080000009200C0000060000-000000067F000080000009200C0000064000__000000B3A3EC82C8 000000067F000080000009200C0000064000-000000067F000080000009200C0000068000__000000B3A3EC82C8 000000067F000080000009200C0000067FC1-000000067F000080000009200C0000071709__000000B1FA75F501-000000B2CA27F641 000000067F000080000009200C0000068000-000000067F000080000009200C000006C000__000000B3A3EC82C8 000000067F000080000009200C000006C000-000000067F000080000009200C0000070000__000000B3A3EC82C8 000000067F000080000009200C0000070000-000000067F000080000009200C0000074000__000000B3A3EC82C8 000000067F000080000009200C0000071709-000000067F000080000009200C000007AE6F__000000B1FA75F501-000000B2CA27F641 000000067F000080000009200C0000074000-000000067F000080000009200C0000078000__000000B3A3EC82C8 000000067F000080000009200C0000078000-000000067F000080000009200C000007C000__000000B3A3EC82C8 000000067F000080000009200C000007AE6F-000000067F000080000009200C00000845AB__000000B1FA75F501-000000B2CA27F641 000000067F000080000009200C000007C000-000000067F000080000009200C0000080000__000000B3A3EC82C8 000000067F000080000009200C0000080000-000000067F000080000009200C0000084000__000000B3A3EC82C8 000000067F000080000009200C0000084000-000000067F000080000009200C0000088000__000000B3A3EC82C8 000000067F000080000009200C00000845AB-000000067F000080000009200C000008DD09__000000B1FA75F501-000000B2CA27F641 000000067F000080000009200C0000088000-000000067F000080000009200C000008C000__000000B3A3EC82C8 000000067F000080000009200C000008C000-000000067F000080000009200C0000090000__000000B3A3EC82C8 000000067F000080000009200C000008DD09-000000067F000080000009200C0100000000__000000B1FA75F501-000000B2CA27F641 
000000067F000080000009200C0000090000-000000067F000080000009200C0000094000__000000B3A3EC82C8 000000067F000080000009200C0000094000-000000067F000080000009200C0000098000__000000B3A3EC82C8 000000067F000080000009200C000009567A-000000067F000080000009200C000009EDE0__000000B2CA27F641-000000B3AB3B7FC9 000000067F000080000009200C0000098000-000000067F000080000009200C000009C000__000000B3A3EC82C8 000000067F000080000009200C000009C000-000000067F000080000009200C00000A0000__000000B3A3EC82C8 000000067F000080000009200C000009EDE0-000000067F000080000009200C00000A852B__000000B2CA27F641-000000B3AB3B7FC9 000000067F000080000009200C00000A0000-000000067F000080000009200C00000A4000__000000B3A3EC82C8 000000067F000080000009200C00000A4000-000000067F000080000009200C00000A8000__000000B3A3EC82C8 000000067F000080000009200C00000A8000-000000067F000080000009200C00000AC000__000000B3A3EC82C8 000000067F000080000009200C00000A852B-000000067F000080000009200C00000B1C91__000000B2CA27F641-000000B3AB3B7FC9 000000067F000080000009200C00000AC000-000000067F000080000009200C00000B0000__000000B3A3EC82C8 000000067F000080000009200C00000B0000-000000067F000080000009200C00000B4000__000000B3A3EC82C8 000000067F000080000009200C00000B1C91-000000067F000080000009200C00000BB3F7__000000B2CA27F641-000000B3AB3B7FC9 000000067F000080000009200C00000B4000-000000067F000080000009200C00000B8000__000000B3A3EC82C8 000000067F000080000009200C00000B8000-000000067F000080000009200C00000BC000__000000B3A3EC82C8 000000067F000080000009200C00000BB3F7-000000067F000080000009200C00000C4B0C__000000B2CA27F641-000000B3AB3B7FC9 000000067F000080000009200C00000BC000-000000067F000080000009200C00000C0000__000000B3A3EC82C8 000000067F000080000009200C00000C0000-000000067F000080000009200C00000C4000__000000B3A3EC82C8 000000067F000080000009200C00000C4000-000000067F000080000009200C00000C8000__000000B3A3EC82C8 000000067F000080000009200C00000C4B0C-000000067F000080000009200C00000CE272__000000B2CA27F641-000000B3AB3B7FC9 
000000067F000080000009200C00000C8000-000000067F000080000009200C00000CC000__000000B3A3EC82C8 000000067F000080000009200C00000CC000-000000067F000080000009200C00000D0000__000000B3A3EC82C8 000000067F000080000009200C00000CE272-000000067F000080000009200C00000D798F__000000B2CA27F641-000000B3AB3B7FC9 000000067F000080000009200C00000D0000-000000067F000080000009200C00000D4000__000000B3A3EC82C8 000000067F000080000009200C00000D4000-000000067F000080000009200C00000D8000__000000B3A3EC82C8 000000067F000080000009200C00000D798F-000000067F000080000009200C00000E10F5__000000B2CA27F641-000000B3AB3B7FC9 000000067F000080000009200C00000D8000-000000067F000080000009200C00000DC000__000000B3A3EC82C8 000000067F000080000009200C00000DC000-000000067F000080000009200C00000E0000__000000B3A3EC82C8 000000067F000080000009200C00000E0000-000000067F000080000009200C00000E4000__000000B3A3EC82C8 000000067F000080000009200C00000E10F5-000000067F000080000009200C00000EA80B__000000B2CA27F641-000000B3AB3B7FC9 000000067F000080000009200C00000E4000-000000067F000080000009200C00000E8000__000000B3A3EC82C8 000000067F000080000009200C00000E8000-000000067F000080000009200C00000EC000__000000B3A3EC82C8 000000067F000080000009200C00000EA80B-000000067F000080000009200C00000F3F4B__000000B2CA27F641-000000B3AB3B7FC9 000000067F000080000009200C00000EC000-000000067F000080000009200C00000F0000__000000B3A3EC82C8 000000067F000080000009200C00000F0000-000000067F000080000009200C00000F4000__000000B3A3EC82C8 000000067F000080000009200C00000F3F4B-000000067F000080000009200C00000FD6B1__000000B2CA27F641-000000B3AB3B7FC9 000000067F000080000009200C00000F4000-000000067F000080000009200C00000F8000__000000B3A3EC82C8 000000067F000080000009200C00000F8000-000000067F000080000009200C00000FC000__000000B3A3EC82C8 000000067F000080000009200C00000FC000-000000067F000080000009200C0000100000__000000B3A3EC82C8 000000067F000080000009200C00000FD6B1-000000067F000080000009200C0000106DD5__000000B2CA27F641-000000B3AB3B7FC9 
000000067F000080000009200C0000100000-000000067F000080000009200C0000104000__000000B3A3EC82C8 000000067F000080000009200C0000104000-000000067F000080000009200C0000108000__000000B3A3EC82C8 000000067F000080000009200C0000106DD5-000000067F000080000009200C000011050B__000000B2CA27F641-000000B3AB3B7FC9 000000067F000080000009200C0000108000-000000067F000080000009200C000010C000__000000B3A3EC82C8 000000067F000080000009200C000010C000-030000000000000000000000000000000002__000000B3A3EC82C8 000000067F000080000009200C000011050B-01000000000000000100000004000000001C__000000B2CA27F641-000000B3AB3B7FC9 000000067F00008000000920140000005289-000000067F0000800000092014000000D19A__000000B3AB3B7FC9-000000B4208FF3D1 000000067F0000800000092014000000D19A-000000067F000080000009201400000150A9__000000B3AB3B7FC9-000000B4208FF3D1 000000067F000080000009201400000150A9-000000067F0000800000092014000001CFBA__000000B3AB3B7FC9-000000B4208FF3D1 000000067F0000800000092014000001CFBA-000000067F00008000000920140000024ECB__000000B3AB3B7FC9-000000B4208FF3D1 000000067F00008000000920140000024ECB-000000067F0000800000092014000002CDDB__000000B3AB3B7FC9-000000B4208FF3D1 000000067F0000800000092014000002CDDB-000000067F000080000009400C000000830C__000000B3AB3B7FC9-000000B4208FF3D1 000000067F000080000009400C0000000000-000000067F000080000009400C0000004000__000000B5CED8CF78 000000067F000080000009400C0000004000-000000067F000080000009400C0000008000__000000B5CED8CF78 000000067F000080000009400C0000008000-000000067F000080000009400C000000C000__000000B5CED8CF78 000000067F000080000009400C000000830C-000000067F000080000009400C0000011A72__000000B3AB3B7FC9-000000B4208FF3D1 000000067F000080000009400C000000C000-000000067F000080000009400C0000010000__000000B5CED8CF78 000000067F000080000009400C0000010000-000000067F000080000009400C0000014000__000000B568835548 000000067F000080000009400C0000011A72-030000000000000000000000000000000002__000000B3AB3B7FC9-000000B4208FF3D1 
000000067F000080000009400C0000012E51-000000067F000080000009400C000001C5B7__000000B4208FF3D1-000000B43089EC11 000000067F000080000009400C0000012E51-000000067F000080000009400C000001C5B7__000000B4208FF3D1-000000B4E047E5A9 000000067F000080000009400C0000014000-000000067F000080000009400C0000018000__000000B568835548 000000067F000080000009400C0000018000-000000067F000080000009400C000001C000__000000B568835548 000000067F000080000009400C000001C000-000000067F000080000009400C0000020000__000000B568835548 000000067F000080000009400C000001C5B7-000000067F000080000009400C0000025D1D__000000B4208FF3D1-000000B4E047E5A9 000000067F000080000009400C000001C5B7-000000067F000080000009400C0100000000__000000B4208FF3D1-000000B43089EC11 000000067F000080000009400C0000020000-000000067F000080000009400C0000024000__000000B568835548 000000067F000080000009400C0000024000-000000067F000080000009400C0000028000__000000B568835548 000000067F000080000009400C0000025D1D-000000067F000080000009400C000002F483__000000B4208FF3D1-000000B4E047E5A9 000000067F000080000009400C0000028000-000000067F000080000009400C000002C000__000000B568835548 000000067F000080000009400C000002C000-000000067F000080000009400C0000030000__000000B568835548 000000067F000080000009400C000002F483-000000067F000080000009400C0000038B96__000000B4208FF3D1-000000B4E047E5A9 000000067F000080000009400C0000030000-000000067F000080000009400C0000034000__000000B568835548 000000067F000080000009400C0000034000-000000067F000080000009400C0000038000__000000B568835548 000000067F000080000009400C0000038000-000000067F000080000009400C000003C000__000000B568835548 000000067F000080000009400C0000038B96-000000067F000080000009400C00000422FC__000000B4208FF3D1-000000B4E047E5A9 000000067F000080000009400C000003C000-000000067F000080000009400C0000040000__000000B568835548 000000067F000080000009400C0000040000-000000067F000080000009400C0000044000__000000B568835548 000000067F000080000009400C00000422FC-000000067F000080000009400C000004BA0C__000000B4208FF3D1-000000B4E047E5A9 
000000067F000080000009400C0000044000-000000067F000080000009400C0000048000__000000B568835548 000000067F000080000009400C0000048000-000000067F000080000009400C000004C000__000000B568835548 000000067F000080000009400C000004BA0C-000000067F000080000009400C0000055141__000000B4208FF3D1-000000B4E047E5A9 000000067F000080000009400C000004C000-000000067F000080000009400C0000050000__000000B568835548 000000067F000080000009400C0000050000-000000067F000080000009400C0000054000__000000B568835548 000000067F000080000009400C0000054000-000000067F000080000009400C0000058000__000000B568835548 000000067F000080000009400C0000055141-000000067F000080000009400C000005E8A7__000000B4208FF3D1-000000B4E047E5A9 000000067F000080000009400C0000058000-000000067F000080000009400C000005C000__000000B568835548 000000067F000080000009400C000005C000-000000067F000080000009400C0000060000__000000B568835548 000000067F000080000009400C000005E8A7-000000067F000080000009400C0000067FC1__000000B4208FF3D1-000000B4E047E5A9 000000067F000080000009400C0000060000-000000067F000080000009400C0000064000__000000B568835548 000000067F000080000009400C0000064000-000000067F000080000009400C0000068000__000000B568835548 000000067F000080000009400C0000067FC1-000000067F000080000009400C0000071709__000000B4208FF3D1-000000B4E047E5A9 000000067F000080000009400C0000068000-000000067F000080000009400C000006C000__000000B568835548 000000067F000080000009400C000006C000-000000067F000080000009400C0000070000__000000B568835548 000000067F000080000009400C0000070000-000000067F000080000009400C0000074000__000000B568835548 000000067F000080000009400C0000071709-000000067F000080000009400C000007AE6F__000000B4208FF3D1-000000B4E047E5A9 000000067F000080000009400C0000074000-000000067F000080000009400C0000078000__000000B568835548 000000067F000080000009400C0000078000-000000067F000080000009400C000007C000__000000B568835548 000000067F000080000009400C000007AE6F-000000067F000080000009400C00000845AB__000000B4208FF3D1-000000B4E047E5A9 
000000067F000080000009400C000007C000-000000067F000080000009400C0000080000__000000B568835548 000000067F000080000009400C0000080000-000000067F000080000009400C0000084000__000000B568835548 000000067F000080000009400C0000084000-000000067F000080000009400C0000088000__000000B568835548 000000067F000080000009400C00000845AB-000000067F000080000009400C0100000000__000000B4208FF3D1-000000B4E047E5A9 000000067F000080000009400C0000088000-000000067F000080000009400C000008C000__000000B568835548 000000067F000080000009400C000008C000-000000067F000080000009400C0000090000__000000B568835548 000000067F000080000009400C000008DEA4-000000067F000080000009400C00000975C4__000000B4E047E5A9-000000B5CED8CF79 000000067F000080000009400C0000090000-000000067F000080000009400C0000094000__000000B568835548 000000067F000080000009400C0000094000-000000067F000080000009400C0000098000__000000B568835548 000000067F000080000009400C00000975C4-000000067F000080000009400C00000A0D0A__000000B4E047E5A9-000000B5CED8CF79 000000067F000080000009400C0000098000-000000067F000080000009400C000009C000__000000B568835548 000000067F000080000009400C000009C000-000000067F000080000009400C00000A0000__000000B568835548 000000067F000080000009400C00000A0000-000000067F000080000009400C00000A4000__000000B568835548 000000067F000080000009400C00000A0D0A-000000067F000080000009400C00000AA470__000000B4E047E5A9-000000B5CED8CF79 000000067F000080000009400C00000A4000-000000067F000080000009400C00000A8000__000000B568835548 000000067F000080000009400C00000A8000-000000067F000080000009400C00000AC000__000000B568835548 000000067F000080000009400C00000AA470-000000067F000080000009400C00000B3BB2__000000B4E047E5A9-000000B5CED8CF79 000000067F000080000009400C00000AC000-000000067F000080000009400C00000B0000__000000B568835548 000000067F000080000009400C00000B0000-000000067F000080000009400C00000B4000__000000B568835548 000000067F000080000009400C00000B3BB2-000000067F000080000009400C00000BD30A__000000B4E047E5A9-000000B5CED8CF79 
000000067F000080000009400C00000B4000-000000067F000080000009400C00000B8000__000000B568835548 000000067F000080000009400C00000B8000-000000067F000080000009400C00000BC000__000000B568835548 000000067F000080000009400C00000BC000-000000067F000080000009400C00000C0000__000000B568835548 000000067F000080000009400C00000BD30A-000000067F000080000009400C00000C6A30__000000B4E047E5A9-000000B5CED8CF79 000000067F000080000009400C00000C0000-000000067F000080000009400C00000C4000__000000B568835548 000000067F000080000009400C00000C4000-000000067F000080000009400C00000C8000__000000B568835548 000000067F000080000009400C00000C6A30-000000067F000080000009400C00000D0194__000000B4E047E5A9-000000B5CED8CF79 000000067F000080000009400C00000C8000-000000067F000080000009400C00000CC000__000000B568835548 000000067F000080000009400C00000CC000-000000067F000080000009400C00000D0000__000000B568835548 000000067F000080000009400C00000D0000-000000067F000080000009400C00000D4000__000000B568835548 000000067F000080000009400C00000D0194-000000067F000080000009400C00000D98FA__000000B4E047E5A9-000000B5CED8CF79 000000067F000080000009400C00000D4000-030000000000000000000000000000000002__000000B568835548 000000067F000080000009400C00000D98FA-000000067F000080000009400C00000E300D__000000B4E047E5A9-000000B5CED8CF79 000000067F000080000009400C00000E300D-000000067F000080000009400C00000EC773__000000B4E047E5A9-000000B5CED8CF79 000000067F000080000009400C00000EC773-000000067F000080000009400C00000F5ED9__000000B4E047E5A9-000000B5CED8CF79 000000067F000080000009400C00000F5ED9-000000067F000080000009400C00000FF60C__000000B4E047E5A9-000000B5CED8CF79 000000067F000080000009400C00000FF60C-000000067F000080000009400C0000108D1D__000000B4E047E5A9-000000B5CED8CF79 000000067F000080000009400C0000108D1D-000000067F000080000009400C0000111C20__000000B4E047E5A9-000000B5CED8CF79 000000067F000080000009400C00FFFFFFFF-030000000000000000000000000000000002__000000B4E047E5A9-000000B5CED8CF79 
000000067F000080000009600C0000000000-000000067F000080000009600C0000004000__000000B79F439FE0 000000067F000080000009600C0000004000-000000067F000080000009600C0000008000__000000B79F439FE0 000000067F000080000009600C0000008000-000000067F000080000009600C000000C000__000000B79F439FE0 000000067F000080000009600C000000974F-000000067F000080000009600C0000012EB5__000000B5CED8CF79-000000B63EADE5B9 000000067F000080000009600C000000C000-000000067F000080000009600C0000010000__000000B79F439FE0 000000067F000080000009600C0000010000-000000067F000080000009600C0000014000__000000B79F439FE0 000000067F000080000009600C0000012EB5-000000067F000080000009600C000001C60A__000000B5CED8CF79-000000B63EADE5B9 000000067F000080000009600C0000014000-000000067F000080000009600C0000018000__000000B79F439FE0 000000067F000080000009600C0000018000-000000067F000080000009600C000001C000__000000B79F439FE0 000000067F000080000009600C000001C000-000000067F000080000009600C0000020000__000000B79F439FE0 000000067F000080000009600C000001C60A-000000067F000080000009600C0000025D38__000000B5CED8CF79-000000B63EADE5B9 000000067F000080000009600C0000020000-000000067F000080000009600C0000024000__000000B79F439FE0 000000067F000080000009600C0000024000-000000067F000080000009600C0000028000__000000B79F439FE0 000000067F000080000009600C0000025D38-000000067F000080000009600C000002F49E__000000B5CED8CF79-000000B63EADE5B9 000000067F000080000009600C0000028000-000000067F000080000009600C000002C000__000000B79F439FE0 000000067F000080000009600C000002C000-000000067F000080000009600C0000030000__000000B79F439FE0 000000067F000080000009600C000002F49E-000000067F000080000009600C0000038BB1__000000B5CED8CF79-000000B63EADE5B9 000000067F000080000009600C0000030000-000000067F000080000009600C0000034000__000000B79F439FE0 000000067F000080000009600C0000034000-000000067F000080000009600C0000038000__000000B79F439FE0 000000067F000080000009600C0000038000-000000067F000080000009600C000003C000__000000B79F439FE0 
000000067F000080000009600C0000038BB1-000000067F000080000009600C0000042317__000000B5CED8CF79-000000B63EADE5B9 000000067F000080000009600C000003C000-000000067F000080000009600C0000040000__000000B79F439FE0 000000067F000080000009600C0000040000-000000067F000080000009600C0000044000__000000B79D17BFD0 000000067F000080000009600C0000040000-000000067F000080000009600C0000044000__000000B8606C92A0 000000067F000080000009600C0000042317-030000000000000000000000000000000002__000000B5CED8CF79-000000B63EADE5B9 000000067F000080000009600C000004236E-000000067F000080000009600C000004BAD4__000000B63EADE5B9-000000B6DE71F5F9 000000067F000080000009600C0000044000-000000067F000080000009600C0000048000__000000B79D17BFD0 000000067F000080000009600C0000044000-000000067F000080000009600C0000048000__000000B8606C92A0 000000067F000080000009600C0000048000-000000067F000080000009600C000004C000__000000B79D17BFD0 000000067F000080000009600C0000048000-000000067F000080000009600C000004C000__000000B8606C92A0 000000067F000080000009600C000004BAD4-000000067F000080000009600C0000055208__000000B63EADE5B9-000000B6DE71F5F9 000000067F000080000009600C000004C000-000000067F000080000009600C0000050000__000000B79D17BFD0 000000067F000080000009600C000004C000-000000067F000080000009600C0000050000__000000B8606C92A0 000000067F000080000009600C0000050000-000000067F000080000009600C0000054000__000000B79D17BFD0 000000067F000080000009600C0000050000-000000067F000080000009600C0000054000__000000B8606C92A0 000000067F000080000009600C0000054000-000000067F000080000009600C0000058000__000000B79D17BFD0 000000067F000080000009600C0000054000-000000067F000080000009600C0000058000__000000B8606C92A0 000000067F000080000009600C0000055208-000000067F000080000009600C000005E96E__000000B63EADE5B9-000000B6DE71F5F9 000000067F000080000009600C0000055A77-000000067F000080000009600C00000AAEA5__000000B808718889-000000B8606C92A1 000000067F000080000009600C0000058000-000000067F000080000009600C000005C000__000000B79D17BFD0 
000000067F000080000009600C0000058000-000000067F000080000009600C000005C000__000000B8606C92A0 000000067F000080000009600C000005C000-000000067F000080000009600C0000060000__000000B79D17BFD0 000000067F000080000009600C000005C000-000000067F000080000009600C0000060000__000000B8606C92A0 000000067F000080000009600C000005E96E-000000067F000080000009600C00000680D4__000000B63EADE5B9-000000B6DE71F5F9 000000067F000080000009600C0000060000-000000067F000080000009600C0000064000__000000B79D17BFD0 000000067F000080000009600C0000060000-000000067F000080000009600C0000064000__000000B8606C92A0 000000067F000080000009600C0000064000-000000067F000080000009600C0000068000__000000B79D17BFD0 000000067F000080000009600C0000064000-000000067F000080000009600C0000068000__000000B8606C92A0 000000067F000080000009600C0000068000-000000067F000080000009600C000006C000__000000B79D17BFD0 000000067F000080000009600C0000068000-000000067F000080000009600C000006C000__000000B8606C92A0 000000067F000080000009600C00000680D4-000000067F000080000009600C000007180B__000000B63EADE5B9-000000B6DE71F5F9 000000067F000080000009600C000006C000-000000067F000080000009600C0000070000__000000B79D17BFD0 000000067F000080000009600C000006C000-000000067F000080000009600C0000070000__000000B8606C92A0 000000067F000080000009600C0000070000-000000067F000080000009600C0000074000__000000B79D17BFD0 000000067F000080000009600C0000070000-000000067F000080000009600C0000074000__000000B8606C92A0 000000067F000080000009600C000007180B-000000067F000080000009600C000007AF71__000000B63EADE5B9-000000B6DE71F5F9 000000067F000080000009600C0000074000-000000067F000080000009600C0000078000__000000B79D17BFD0 000000067F000080000009600C0000074000-000000067F000080000009600C0000078000__000000B8606C92A0 000000067F000080000009600C0000078000-000000067F000080000009600C000007C000__000000B79D17BFD0 000000067F000080000009600C0000078000-000000067F000080000009600C000007C000__000000B8606C92A0 000000067F000080000009600C000007AF71-000000067F000080000009600C00000846D7__000000B63EADE5B9-000000B6DE71F5F9 
000000067F000080000009600C000007C000-000000067F000080000009600C0000080000__000000B79D17BFD0 000000067F000080000009600C000007C000-000000067F000080000009600C0000080000__000000B8606C92A0 000000067F000080000009600C0000080000-000000067F000080000009600C0000084000__000000B79D17BFD0 000000067F000080000009600C0000080000-000000067F000080000009600C0000084000__000000B8606C92A0 000000067F000080000009600C0000084000-000000067F000080000009600C0000088000__000000B79D17BFD0 000000067F000080000009600C0000084000-000000067F000080000009600C0000088000__000000B8606C92A0 000000067F000080000009600C00000846D7-000000067F000080000009600C000008DE0C__000000B63EADE5B9-000000B6DE71F5F9 000000067F000080000009600C0000088000-000000067F000080000009600C000008C000__000000B79D17BFD0 000000067F000080000009600C0000088000-000000067F000080000009600C000008C000__000000B8606C92A0 000000067F000080000009600C000008C000-000000067F000080000009600C0000090000__000000B79D17BFD0 000000067F000080000009600C000008C000-000000067F000080000009600C0000090000__000000B8606C92A0 000000067F000080000009600C000008DE0C-000000067F000080000009600C000009752C__000000B63EADE5B9-000000B6DE71F5F9 000000067F000080000009600C0000090000-000000067F000080000009600C0000094000__000000B79D17BFD0 000000067F000080000009600C0000090000-000000067F000080000009600C0000094000__000000B8606C92A0 000000067F000080000009600C0000094000-000000067F000080000009600C0000098000__000000B79D17BFD0 000000067F000080000009600C0000094000-000000067F000080000009600C0000098000__000000B8606C92A0 000000067F000080000009600C000009752C-000000067F000080000009600C00000A0C92__000000B63EADE5B9-000000B6DE71F5F9 000000067F000080000009600C0000098000-000000067F000080000009600C000009C000__000000B79D17BFD0 000000067F000080000009600C0000098000-000000067F000080000009600C000009C000__000000B8606C92A0 000000067F000080000009600C000009C000-000000067F000080000009600C00000A0000__000000B79D17BFD0 000000067F000080000009600C000009C000-000000067F000080000009600C00000A0000__000000B8606C92A0 
000000067F000080000009600C00000A0000-000000067F000080000009600C00000A4000__000000B79D17BFD0 000000067F000080000009600C00000A0000-000000067F000080000009600C00000A4000__000000B8606C92A0 000000067F000080000009600C00000A0C92-000000067F000080000009600C0100000000__000000B63EADE5B9-000000B6DE71F5F9 000000067F000080000009600C00000A4000-000000067F000080000009600C00000A8000__000000B79D17BFD0 000000067F000080000009600C00000A4000-000000067F000080000009600C00000A8000__000000B8606C92A0 000000067F000080000009600C00000A8000-000000067F000080000009600C00000AC000__000000B79D17BFD0 000000067F000080000009600C00000A8000-000000067F000080000009600C00000AC000__000000B8606C92A0 000000067F000080000009600C00000A93FD-000000067F000080000009600C00000B2B0C__000000B6DE71F5F9-000000B79E68FFF9 000000067F000080000009600C00000AAEA5-000000067F000080000009600C0000101445__000000B808718889-000000B8606C92A1 000000067F000080000009600C00000AC000-000000067F000080000009600C00000B0000__000000B79D17BFD0 000000067F000080000009600C00000AC000-000000067F000080000009600C00000B0000__000000B8606C92A0 000000067F000080000009600C00000B0000-000000067F000080000009600C00000B4000__000000B79D17BFD0 000000067F000080000009600C00000B0000-000000067F000080000009600C00000B4000__000000B8606C92A0 000000067F000080000009600C00000B2B0C-000000067F000080000009600C00000BC272__000000B6DE71F5F9-000000B79E68FFF9 000000067F000080000009600C00000B4000-000000067F000080000009600C00000B8000__000000B79D17BFD0 000000067F000080000009600C00000B4000-000000067F000080000009600C00000B8000__000000B8606C92A0 000000067F000080000009600C00000B8000-000000067F000080000009600C00000BC000__000000B79D17BFD0 000000067F000080000009600C00000B8000-000000067F000080000009600C00000BC000__000000B8606C92A0 000000067F000080000009600C00000BC000-000000067F000080000009600C00000C0000__000000B79D17BFD0 000000067F000080000009600C00000BC000-000000067F000080000009600C00000C0000__000000B8606C92A0 
000000067F000080000009600C00000BC272-000000067F000080000009600C00000C59A2__000000B6DE71F5F9-000000B79E68FFF9 000000067F000080000009600C00000C0000-000000067F000080000009600C00000C4000__000000B79D17BFD0 000000067F000080000009600C00000C0000-000000067F000080000009600C00000C4000__000000B8606C92A0 000000067F000080000009600C00000C4000-000000067F000080000009600C00000C8000__000000B79D17BFD0 000000067F000080000009600C00000C4000-000000067F000080000009600C00000C8000__000000B8606C92A0 000000067F000080000009600C00000C59A2-000000067F000080000009600C00000CF108__000000B6DE71F5F9-000000B79E68FFF9 000000067F000080000009600C00000C8000-000000067F000080000009600C00000CC000__000000B79D17BFD0 000000067F000080000009600C00000C8000-000000067F000080000009600C00000CC000__000000B8606C92A0 000000067F000080000009600C00000CC000-000000067F000080000009600C00000D0000__000000B79D17BFD0 000000067F000080000009600C00000CC000-000000067F000080000009600C00000D0000__000000B8606C92A0 000000067F000080000009600C00000CF108-000000067F000080000009600C00000D882B__000000B6DE71F5F9-000000B79E68FFF9 000000067F000080000009600C00000D0000-000000067F000080000009600C00000D4000__000000B79D17BFD0 000000067F000080000009600C00000D0000-000000067F000080000009600C00000D4000__000000B8606C92A0 000000067F000080000009600C00000D4000-000000067F000080000009600C00000D8000__000000B79D17BFD0 000000067F000080000009600C00000D4000-000000067F000080000009600C00000D8000__000000B8606C92A0 000000067F000080000009600C00000D8000-000000067F000080000009600C00000DC000__000000B79D17BFD0 000000067F000080000009600C00000D8000-000000067F000080000009600C00000DC000__000000B8606C92A0 000000067F000080000009600C00000D882B-000000067F000080000009600C00000E1F7E__000000B6DE71F5F9-000000B79E68FFF9 000000067F000080000009600C00000DC000-000000067F000080000009600C00000E0000__000000B79D17BFD0 000000067F000080000009600C00000DC000-000000067F000080000009600C00000E0000__000000B8606C92A0 000000067F000080000009600C00000E0000-000000067F000080000009600C00000E4000__000000B79D17BFD0 
000000067F000080000009600C00000E0000-000000067F000080000009600C00000E4000__000000B8606C92A0 000000067F000080000009600C00000E1F7E-000000067F000080000009600C00000EB6E4__000000B6DE71F5F9-000000B79E68FFF9 000000067F000080000009600C00000E4000-000000067F000080000009600C00000E8000__000000B79D17BFD0 000000067F000080000009600C00000E4000-000000067F000080000009600C00000E8000__000000B8606C92A0 000000067F000080000009600C00000E8000-000000067F000080000009600C00000EC000__000000B79D17BFD0 000000067F000080000009600C00000E8000-000000067F000080000009600C00000EC000__000000B8606C92A0 000000067F000080000009600C00000EB6E4-000000067F000080000009600C00000F4E0B__000000B6DE71F5F9-000000B79E68FFF9 000000067F000080000009600C00000EC000-000000067F000080000009600C00000F0000__000000B79D17BFD0 000000067F000080000009600C00000EC000-000000067F000080000009600C00000F0000__000000B8606C92A0 000000067F000080000009600C00000F0000-000000067F000080000009600C00000F4000__000000B79D17BFD0 000000067F000080000009600C00000F0000-000000067F000080000009600C00000F4000__000000B8606C92A0 000000067F000080000009600C00000F4000-000000067F000080000009600C00000F8000__000000B79D17BFD0 000000067F000080000009600C00000F4000-000000067F000080000009600C00000F8000__000000B8606C92A0 000000067F000080000009600C00000F4E0B-000000067F000080000009600C00000FE571__000000B6DE71F5F9-000000B79E68FFF9 000000067F000080000009600C00000F8000-000000067F000080000009600C00000FC000__000000B79D17BFD0 000000067F000080000009600C00000F8000-000000067F000080000009600C00000FC000__000000B8606C92A0 000000067F000080000009600C00000FC000-000000067F000080000009600C0000100000__000000B79D17BFD0 000000067F000080000009600C00000FC000-000000067F000080000009600C0000100000__000000B8606C92A0 000000067F000080000009600C00000FE571-000000067F000080000009600C0000107CD7__000000B6DE71F5F9-000000B79E68FFF9 000000067F000080000009600C0000100000-000000067F000080000009600C0000104000__000000B79D17BFD0 000000067F000080000009600C0000100000-000000067F000080000009600C0000104000__000000B8606C92A0 
000000067F000080000009600C000010144D-000000067F0000800000096014000000E7D9__000000B808718889-000000B8606C92A1 000000067F000080000009600C0000104000-000000067F000080000009600C0000108000__000000B79D17BFD0 000000067F000080000009600C0000104000-000000067F000080000009600C0000108000__000000B8606C92A0 000000067F000080000009600C0000107CD7-000000067F000080000009600C000011140C__000000B6DE71F5F9-000000B79E68FFF9 000000067F000080000009600C0000108000-000000067F000080000009600C000010C000__000000B79D17BFD0 000000067F000080000009600C0000108000-000000067F000080000009600C000010C000__000000B8606C92A0 000000067F000080000009600C000010C000-000000067F000080000009600C0000110000__000000B79D17BFD0 000000067F000080000009600C000010C000-000000067F000080000009600C0000110000__000000B8606C92A0 000000067F000080000009600C0000110000-000000067F00008000000960120100000000__000000B8606C92A0 000000067F000080000009600C0000110000-030000000000000000000000000000000002__000000B79D17BFD0 000000067F000080000009600C000011140C-01000000000000000100000004000000001C__000000B6DE71F5F9-000000B79E68FFF9 000000067F000080000009600C020000000B-000000067F0000800000096014000000571F__000000B79E68FFF9-000000B808718889 000000067F00008000000960140000000000-000000067F00008000000960140000004000__000000B8606C92A0 000000067F00008000000960140000004000-000000067F00008000000960140000008000__000000B8606C92A0 000000067F0000800000096014000000571F-000000067F0000800000096014000000CB61__000000B79E68FFF9-000000B808718889 000000067F00008000000960140000008000-000000067F0000800000096014000000C000__000000B8606C92A0 000000067F0000800000096014000000C000-000000067F00008000000960140000010000__000000B8606C92A0 000000067F0000800000096014000000CB61-000000067F00008000000960140000013F98__000000B79E68FFF9-000000B808718889 000000067F0000800000096014000000E7DB-000000067F00008000000960140000022A8D__000000B808718889-000000B8606C92A1 000000067F00008000000960140000010000-000000067F00008000000960140000014000__000000B8606C92A0 
000000067F00008000000960140000013F98-000000067F0000800000096014000001B3C2__000000B79E68FFF9-000000B808718889 000000067F00008000000960140000014000-000000067F00008000000960140000018000__000000B8606C92A0 000000067F00008000000960140000018000-000000067F0000800000096014000001C000__000000B8606C92A0 000000067F0000800000096014000001B3C2-000000067F000080000009601400000227FC__000000B79E68FFF9-000000B808718889 000000067F0000800000096014000001C000-000000067F00008000000960140000020000__000000B8606C92A0 000000067F00008000000960140000020000-000000067F00008000000960140000024000__000000B8606C92A0 000000067F000080000009601400000227FC-000000067F00008000000960140000029BD8__000000B79E68FFF9-000000B808718889 000000067F00008000000960140000022A8D-030000000000000000000000000000000002__000000B808718889-000000B8606C92A1 000000067F00008000000960140000024000-000000067F00008000000960140000028000__000000B8606C92A0 000000067F00008000000960140000028000-000000067F0000800000096014000002C000__000000B8606C92A0 000000067F00008000000960140000029BD8-030000000000000000000000000000000002__000000B79E68FFF9-000000B808718889 000000067F0000800000096014000002C000-030000000000000000000000000000000002__000000B8606C92A0 000000067F000080000009800C0000009748-000000067F000080000009800C0000012EAE__000000B8606C92A1-000000B8E03BF0B9 000000067F000080000009800C0000012EAE-000000067F000080000009800C000001C60A__000000B8606C92A1-000000B8E03BF0B9 000000067F000080000009800C000001C60A-000000067F000080000009800C0000025D38__000000B8606C92A1-000000B8E03BF0B9 000000067F000080000009800C0000025D38-000000067F000080000009800C000002F49E__000000B8606C92A1-000000B8E03BF0B9 000000067F000080000009800C000002F49E-000000067F000080000009800C0000038BB1__000000B8606C92A1-000000B8E03BF0B9 000000067F000080000009800C0000038BB1-000000067F000080000009800C0000042317__000000B8606C92A1-000000B8E03BF0B9 000000067F000080000009800C0000042317-000000067F000080000009800C000004BA7D__000000B8606C92A1-000000B8E03BF0B9 
000000067F000080000009800C000004BA7D-030000000000000000000000000000000002__000000B8606C92A1-000000B8E03BF0B9 000000067F000080000009800C000004BAD2-000000067F000080000009800C0000055206__000000B8E03BF0B9-000000B97FFFFFE9 000000067F000080000009800C0000055206-000000067F000080000009800C000005E911__000000B8E03BF0B9-000000B97FFFFFE9 000000067F000080000009800C000005E911-000000067F000080000009800C000006802B__000000B8E03BF0B9-000000B97FFFFFE9 000000067F000080000009800C000006802B-000000067F000080000009800C0000071782__000000B8E03BF0B9-000000B97FFFFFE9 000000067F000080000009800C0000071782-000000067F000080000009800C000007AEE8__000000B8E03BF0B9-000000B97FFFFFE9 000000067F000080000009800C000007AEE8-000000067F000080000009800C000008460B__000000B8E03BF0B9-000000B97FFFFFE9 000000067F000080000009800C000008460B-000000067F000080000009800C000008DD71__000000B8E03BF0B9-000000B97FFFFFE9 000000067F000080000009800C000008DD71-000000067F000080000009800C00000974D7__000000B8E03BF0B9-000000B97FFFFFE9 000000067F000080000009800C00000974D7-000000067F000080000009800C00000A0C0B__000000B8E03BF0B9-000000B97FFFFFE9 000000067F000080000009800C00000A0C0B-000000067F000080000009800C00000AA371__000000B8E03BF0B9-000000B97FFFFFE9 000000067F000080000009800C00000A8000-000000067F000080000009800C00000AC000__000000BA2E67EA20 000000067F000080000009800C00000AA371-000000067F000080000009800C0100000000__000000B8E03BF0B9-000000B97FFFFFE9 000000067F000080000009800C00000AA4F5-000000067F000080000009800C00000B3C0B__000000B97FFFFFE9-000000BA1FC3FB39 000000067F000080000009800C00000AC000-000000067F000080000009800C00000B0000__000000BA2E67EA20 000000067F000080000009800C00000B0000-000000067F000080000009800C00000B4000__000000BA2E67EA20 000000067F000080000009800C00000B3C0B-000000067F000080000009800C00000BD371__000000B97FFFFFE9-000000BA1FC3FB39 000000067F000080000009800C00000B4000-000000067F000080000009800C00000B8000__000000BA2E67EA20 000000067F000080000009800C00000B8000-000000067F000080000009800C00000BC000__000000BA2E67EA20 
000000067F000080000009800C00000BC000-000000067F000080000009800C00000C0000__000000BA2E67EA20 000000067F000080000009800C00000BD371-000000067F000080000009800C00000C6AD7__000000B97FFFFFE9-000000BA1FC3FB39 000000067F000080000009800C00000C0000-000000067F000080000009800C00000C4000__000000BA2E67EA20 000000067F000080000009800C00000C4000-000000067F000080000009800C00000C8000__000000BA2E67EA20 000000067F000080000009800C00000C6AD7-000000067F000080000009800C00000D020B__000000B97FFFFFE9-000000BA1FC3FB39 000000067F000080000009800C00000C8000-000000067F000080000009800C00000CC000__000000BA2E67EA20 000000067F000080000009800C00000CC000-000000067F000080000009800C00000D0000__000000BA2E67EA20 000000067F000080000009800C00000D0000-000000067F000080000009800C00000D4000__000000BA2E67EA20 000000067F000080000009800C00000D020B-000000067F000080000009800C00000D9971__000000B97FFFFFE9-000000BA1FC3FB39 000000067F000080000009800C00000D4000-000000067F000080000009800C00000D8000__000000BA2E67EA20 000000067F000080000009800C00000D8000-000000067F000080000009800C00000DC000__000000BA2E67EA20 000000067F000080000009800C00000D9971-000000067F000080000009800C00000E30D7__000000B97FFFFFE9-000000BA1FC3FB39 000000067F000080000009800C00000DC000-000000067F000080000009800C00000E0000__000000BA2E67EA20 000000067F000080000009800C00000E0000-000000067F000080000009800C00000E4000__000000BA2E67EA20 000000067F000080000009800C00000E30D7-000000067F000080000009800C00000EC80B__000000B97FFFFFE9-000000BA1FC3FB39 000000067F000080000009800C00000E4000-000000067F000080000009800C00000E8000__000000BA2E67EA20 000000067F000080000009800C00000E8000-000000067F000080000009800C00000EC000__000000BA2E67EA20 000000067F000080000009800C00000EC000-000000067F000080000009800C00000F0000__000000BA2E67EA20 000000067F000080000009800C00000EC80B-000000067F000080000009800C00000F5F38__000000B97FFFFFE9-000000BA1FC3FB39 000000067F000080000009800C00000F0000-000000067F000080000009800C00000F4000__000000BA2E67EA20 
000000067F000080000009800C00000F4000-000000067F000080000009800C00000F8000__000000BA2E67EA20 000000067F000080000009800C00000F5F38-000000067F000080000009800C00000FF69E__000000B97FFFFFE9-000000BA1FC3FB39 000000067F000080000009800C00000F8000-000000067F000080000009800C00000FC000__000000BA2E67EA20 000000067F000080000009800C00000FC000-000000067F000080000009800C0000100000__000000BA2E67EA20 000000067F000080000009800C00000FF69E-000000067F000080000009800C0000108DAF__000000B97FFFFFE9-000000BA1FC3FB39 000000067F000080000009800C0000100000-000000067F000080000009800C0000104000__000000BA2E67EA20 000000067F000080000009800C0000104000-000000067F000080000009800C0000108000__000000BA2E67EA20 000000067F000080000009800C0000108000-000000067F000080000009800C000010C000__000000BA2E67EA20 000000067F000080000009800C0000108DAF-000000067F000080000009800F0100000003__000000B97FFFFFE9-000000BA1FC3FB39 000000067F000080000009800C000010C000-000000067F000080000009800C0000110000__000000BA2E67EA20 000000067F000080000009800C000010EC71-000000067F000080000009801400000025C3__000000BA1FC3FB39-000000BA9685E7C1 000000067F000080000009800C0000110000-030000000000000000000000000000000002__000000BA2E67EA20 000000067F000080000009801400000025C3-000000067F0000800000098014000000A4D3__000000BA1FC3FB39-000000BA9685E7C1 000000067F0000800000098014000000A4D3-000000067F000080000009801400000123E4__000000BA1FC3FB39-000000BA9685E7C1 000000067F000080000009801400000123E4-000000067F0000800000098014000001A2F3__000000BA1FC3FB39-000000BA9685E7C1 000000067F0000800000098014000001A2F3-000000067F00008000000980140000022204__000000BA1FC3FB39-000000BA9685E7C1 000000067F00008000000980140000022204-000000067F0000800000098014000002A114__000000BA1FC3FB39-000000BA9685E7C1 000000067F0000800000098014000002A114-000000067F000080000009A00C0000004DB3__000000BA1FC3FB39-000000BA9685E7C1 000000067F000080000009A00C0000000000-000000067F000080000009A00C0000004000__000000BCEF79BE90 
000000067F000080000009A00C0000004000-000000067F000080000009A00C0000008000__000000BCEF79BE90 000000067F000080000009A00C0000004DB3-030000000000000000000000000000000002__000000BA1FC3FB39-000000BA9685E7C1 000000067F000080000009A00C0000008000-000000067F000080000009A00C000000C000__000000BC59629F98 000000067F000080000009A00C0000008000-000000067F000080000009A00C000000C000__000000BD25E66810 000000067F000080000009A00C00000096E8-000000067F000080000009A00C0000012E0B__000000BA9685E7C1-000000BB4643FBD1 000000067F000080000009A00C000000C000-000000067F000080000009A00C0000010000__000000BC59629F98 000000067F000080000009A00C000000C000-000000067F000080000009A00C0000010000__000000BD25E66810 000000067F000080000009A00C0000010000-000000067F000080000009A00C0000014000__000000BC59629F98 000000067F000080000009A00C0000010000-000000067F000080000009A00C0000014000__000000BD25E66810 000000067F000080000009A00C0000012E0B-000000067F000080000009A00C000001C571__000000BA9685E7C1-000000BB4643FBD1 000000067F000080000009A00C0000014000-000000067F000080000009A00C0000018000__000000BC59629F98 000000067F000080000009A00C0000014000-000000067F000080000009A00C0000018000__000000BD25E66810 000000067F000080000009A00C0000018000-000000067F000080000009A00C000001C000__000000BC59629F98 000000067F000080000009A00C0000018000-000000067F000080000009A00C000001C000__000000BD25E66810 000000067F000080000009A00C000001C000-000000067F000080000009A00C0000020000__000000BC59629F98 000000067F000080000009A00C000001C000-000000067F000080000009A00C0000020000__000000BD25E66810 000000067F000080000009A00C000001C571-000000067F000080000009A00C0000025CD7__000000BA9685E7C1-000000BB4643FBD1 000000067F000080000009A00C0000020000-000000067F000080000009A00C0000024000__000000BC59629F98 000000067F000080000009A00C0000020000-000000067F000080000009A00C0000024000__000000BD25E66810 000000067F000080000009A00C0000024000-000000067F000080000009A00C0000028000__000000BC59629F98 000000067F000080000009A00C0000024000-000000067F000080000009A00C0000028000__000000BD25E66810 
000000067F000080000009A00C0000025CD7-000000067F000080000009A00C000002F40B__000000BA9685E7C1-000000BB4643FBD1 000000067F000080000009A00C0000028000-000000067F000080000009A00C000002C000__000000BC59629F98 000000067F000080000009A00C0000028000-000000067F000080000009A00C000002C000__000000BD25E66810 000000067F000080000009A00C000002C000-000000067F000080000009A00C0000030000__000000BC59629F98 000000067F000080000009A00C000002C000-000000067F000080000009A00C0000030000__000000BD25E66810 000000067F000080000009A00C000002F40B-000000067F000080000009A00C0000038B1E__000000BA9685E7C1-000000BB4643FBD1 000000067F000080000009A00C0000030000-000000067F000080000009A00C0000034000__000000BC59629F98 000000067F000080000009A00C0000030000-000000067F000080000009A00C0000034000__000000BD25E66810 000000067F000080000009A00C0000034000-000000067F000080000009A00C0000038000__000000BC59629F98 000000067F000080000009A00C0000034000-000000067F000080000009A00C0000038000__000000BD25E66810 000000067F000080000009A00C0000038000-000000067F000080000009A00C000003C000__000000BC59629F98 000000067F000080000009A00C0000038000-000000067F000080000009A00C000003C000__000000BD25E66810 000000067F000080000009A00C0000038B1E-000000067F000080000009A00C0000042284__000000BA9685E7C1-000000BB4643FBD1 000000067F000080000009A00C000003C000-000000067F000080000009A00C0000040000__000000BC59629F98 000000067F000080000009A00C000003C000-000000067F000080000009A00C0000040000__000000BD25E66810 000000067F000080000009A00C0000040000-000000067F000080000009A00C0000044000__000000BC59629F98 000000067F000080000009A00C0000040000-000000067F000080000009A00C0000044000__000000BD25E66810 000000067F000080000009A00C0000042284-000000067F000080000009A00C000004B9EA__000000BA9685E7C1-000000BB4643FBD1 000000067F000080000009A00C0000044000-000000067F000080000009A00C0000048000__000000BC59629F98 000000067F000080000009A00C0000044000-000000067F000080000009A00C0000048000__000000BD25E66810 000000067F000080000009A00C0000048000-000000067F000080000009A00C000004C000__000000BC59629F98 
000000067F000080000009A00C0000048000-000000067F000080000009A00C000004C000__000000BD25E66810 000000067F000080000009A00C000004B9EA-000000067F000080000009A00C000005510B__000000BA9685E7C1-000000BB4643FBD1 000000067F000080000009A00C000004C000-000000067F000080000009A00C0000050000__000000BC59629F98 000000067F000080000009A00C000004C000-000000067F000080000009A00C0000050000__000000BD25E66810 000000067F000080000009A00C0000050000-000000067F000080000009A00C0000054000__000000BC59629F98 000000067F000080000009A00C0000050000-000000067F000080000009A00C0000054000__000000BD25E66810 000000067F000080000009A00C0000054000-000000067F000080000009A00C0000058000__000000BC59629F98 000000067F000080000009A00C0000054000-000000067F000080000009A00C0000058000__000000BD25E66810 000000067F000080000009A00C000005510B-000000067F000080000009A00C000005E871__000000BA9685E7C1-000000BB4643FBD1 000000067F000080000009A00C0000058000-000000067F000080000009A00C000005C000__000000BC59629F98 000000067F000080000009A00C0000058000-000000067F000080000009A00C000005C000__000000BD25E66810 000000067F000080000009A00C000005C000-000000067F000080000009A00C0000060000__000000BC59629F98 000000067F000080000009A00C000005C000-000000067F000080000009A00C0000060000__000000BD25E66810 000000067F000080000009A00C000005E871-000000067F000080000009A00C0000067F8B__000000BA9685E7C1-000000BB4643FBD1 000000067F000080000009A00C0000060000-000000067F000080000009A00C0000064000__000000BC59629F98 000000067F000080000009A00C0000060000-000000067F000080000009A00C0000064000__000000BD25E66810 000000067F000080000009A00C0000064000-000000067F000080000009A00C0000068000__000000BC59629F98 000000067F000080000009A00C0000064000-000000067F000080000009A00C0000068000__000000BD25E66810 000000067F000080000009A00C0000067F8B-000000067F000080000009A00C00000716F1__000000BA9685E7C1-000000BB4643FBD1 000000067F000080000009A00C0000068000-000000067F000080000009A00C000006C000__000000BC59629F98 000000067F000080000009A00C0000068000-000000067F000080000009A00C000006C000__000000BD25E66810 
000000067F000080000009A00C000006C000-000000067F000080000009A00C0000070000__000000BC59629F98 000000067F000080000009A00C000006C000-000000067F000080000009A00C0000070000__000000BD25E66810 000000067F000080000009A00C0000070000-000000067F000080000009A00C0000074000__000000BC53F74828 000000067F000080000009A00C0000070000-000000067F000080000009A00C0000074000__000000BD25E66810 000000067F000080000009A00C00000716F1-000000067F000080000009A00C0100000000__000000BA9685E7C1-000000BB4643FBD1 000000067F000080000009A00C0000071875-000000067F000080000009A00C000007AFDB__000000BB4643FBD1-000000BBE607E8F1 000000067F000080000009A00C0000071F8D-000000067F000080000009A00C00000E4F8F__000000BCEF79BE91-000000BD263A5849 000000067F000080000009A00C0000074000-000000067F000080000009A00C0000078000__000000BC53F74828 000000067F000080000009A00C0000074000-000000067F000080000009A00C0000078000__000000BD25E66810 000000067F000080000009A00C0000078000-000000067F000080000009A00C000007C000__000000BC53F74828 000000067F000080000009A00C0000078000-000000067F000080000009A00C000007C000__000000BD25E66810 000000067F000080000009A00C00000794E0-000000067F000080000009A00C00000F2480__000000BC596B5D59-000000BCEF79BE91 000000067F000080000009A00C000007AFDB-000000067F000080000009A00C000008470A__000000BB4643FBD1-000000BBE607E8F1 000000067F000080000009A00C000007C000-000000067F000080000009A00C0000080000__000000BC53F74828 000000067F000080000009A00C000007C000-000000067F000080000009A00C0000080000__000000BD25E66810 000000067F000080000009A00C0000080000-000000067F000080000009A00C0000084000__000000BC53F74828 000000067F000080000009A00C0000080000-000000067F000080000009A00C0000084000__000000BD25E66810 000000067F000080000009A00C0000084000-000000067F000080000009A00C0000088000__000000BC53F74828 000000067F000080000009A00C0000084000-000000067F000080000009A00C0000088000__000000BD25E66810 000000067F000080000009A00C000008470A-000000067F000080000009A00C000008DE70__000000BB4643FBD1-000000BBE607E8F1 
000000067F000080000009A00C0000088000-000000067F000080000009A00C000008C000__000000BC53F74828 000000067F000080000009A00C0000088000-000000067F000080000009A00C000008C000__000000BD25E66810 000000067F000080000009A00C000008C000-000000067F000080000009A00C0000090000__000000BC53F74828 000000067F000080000009A00C000008C000-000000067F000080000009A00C0000090000__000000BD25E66810 000000067F000080000009A00C000008DE70-000000067F000080000009A00C0000097590__000000BB4643FBD1-000000BBE607E8F1 000000067F000080000009A00C0000090000-000000067F000080000009A00C0000094000__000000BC53F74828 000000067F000080000009A00C0000090000-000000067F000080000009A00C0000094000__000000BD25E66810 000000067F000080000009A00C0000094000-000000067F000080000009A00C0000098000__000000BC53F74828 000000067F000080000009A00C0000094000-000000067F000080000009A00C0000098000__000000BD25E66810 000000067F000080000009A00C0000097590-000000067F000080000009A00C00000A0CF6__000000BB4643FBD1-000000BBE607E8F1 000000067F000080000009A00C0000098000-000000067F000080000009A00C000009C000__000000BC53F74828 000000067F000080000009A00C0000098000-000000067F000080000009A00C000009C000__000000BD25E66810 000000067F000080000009A00C000009C000-000000067F000080000009A00C00000A0000__000000BC53F74828 000000067F000080000009A00C000009C000-000000067F000080000009A00C00000A0000__000000BD25E66810 000000067F000080000009A00C00000A0000-000000067F000080000009A00C00000A4000__000000BC53F74828 000000067F000080000009A00C00000A0000-000000067F000080000009A00C00000A4000__000000BD25E66810 000000067F000080000009A00C00000A0CF6-000000067F000080000009A00C00000AA40B__000000BB4643FBD1-000000BBE607E8F1 000000067F000080000009A00C00000A4000-000000067F000080000009A00C00000A8000__000000BC53F74828 000000067F000080000009A00C00000A4000-000000067F000080000009A00C00000A8000__000000BD25E66810 000000067F000080000009A00C00000A8000-000000067F000080000009A00C00000AC000__000000BC53F74828 000000067F000080000009A00C00000A8000-000000067F000080000009A00C00000AC000__000000BD25E66810 
000000067F000080000009A00C00000AA40B-000000067F000080000009A00C00000B3B4D__000000BB4643FBD1-000000BBE607E8F1 000000067F000080000009A00C00000AC000-000000067F000080000009A00C00000B0000__000000BC53F74828 000000067F000080000009A00C00000AC000-000000067F000080000009A00C00000B0000__000000BD25E66810 000000067F000080000009A00C00000B0000-000000067F000080000009A00C00000B4000__000000BC53F74828 000000067F000080000009A00C00000B0000-000000067F000080000009A00C00000B4000__000000BD25E66810 000000067F000080000009A00C00000B3B4D-000000067F000080000009A00C00000BD2B3__000000BB4643FBD1-000000BBE607E8F1 000000067F000080000009A00C00000B4000-000000067F000080000009A00C00000B8000__000000BC53F74828 000000067F000080000009A00C00000B4000-000000067F000080000009A00C00000B8000__000000BD25E66810 000000067F000080000009A00C00000B8000-000000067F000080000009A00C00000BC000__000000BC53F74828 000000067F000080000009A00C00000B8000-000000067F000080000009A00C00000BC000__000000BD25E66810 000000067F000080000009A00C00000BC000-000000067F000080000009A00C00000C0000__000000BC53F74828 000000067F000080000009A00C00000BC000-000000067F000080000009A00C00000C0000__000000BD25E66810 000000067F000080000009A00C00000BD2B3-000000067F000080000009A00C00000C69D9__000000BB4643FBD1-000000BBE607E8F1 000000067F000080000009A00C00000C0000-000000067F000080000009A00C00000C4000__000000BC53F74828 000000067F000080000009A00C00000C0000-000000067F000080000009A00C00000C4000__000000BD25E66810 000000067F000080000009A00C00000C4000-000000067F000080000009A00C00000C8000__000000BC53F74828 000000067F000080000009A00C00000C4000-000000067F000080000009A00C00000C8000__000000BD25E66810 000000067F000080000009A00C00000C69D9-000000067F000080000009A00C00000D010C__000000BB4643FBD1-000000BBE607E8F1 000000067F000080000009A00C00000C8000-000000067F000080000009A00C00000CC000__000000BC53F74828 000000067F000080000009A00C00000C8000-000000067F000080000009A00C00000CC000__000000BD25E66810 000000067F000080000009A00C00000CC000-000000067F000080000009A00C00000D0000__000000BC53F74828 
000000067F000080000009A00C00000CC000-000000067F000080000009A00C00000D0000__000000BD25E66810 000000067F000080000009A00C00000D0000-000000067F000080000009A00C00000D4000__000000BC53F74828 000000067F000080000009A00C00000D0000-000000067F000080000009A00C00000D4000__000000BD25E66810 000000067F000080000009A00C00000D010C-000000067F000080000009A00C0100000000__000000BB4643FBD1-000000BBE607E8F1 000000067F000080000009A00C00000D4000-000000067F000080000009A00C00000D8000__000000BC53F74828 000000067F000080000009A00C00000D4000-000000067F000080000009A00C00000D8000__000000BD25E66810 000000067F000080000009A00C00000D6C06-000000067F000080000009A00C00000E0166__000000BBE607E8F1-000000BC596B5D59 000000067F000080000009A00C00000D8000-000000067F000080000009A00C00000DC000__000000BC53F74828 000000067F000080000009A00C00000D8000-000000067F000080000009A00C00000DC000__000000BD25E66810 000000067F000080000009A00C00000DC000-000000067F000080000009A00C00000E0000__000000BC53F74828 000000067F000080000009A00C00000DC000-000000067F000080000009A00C00000E0000__000000BD25E66810 000000067F000080000009A00C00000E0000-000000067F000080000009A00C00000E4000__000000BC53F74828 000000067F000080000009A00C00000E0000-000000067F000080000009A00C00000E4000__000000BD25E66810 000000067F000080000009A00C00000E0166-000000067F000080000009A00C00000E96C9__000000BBE607E8F1-000000BC596B5D59 000000067F000080000009A00C00000E4000-000000067F000080000009A00C00000E8000__000000BC53F74828 000000067F000080000009A00C00000E4000-000000067F000080000009A00C00000E8000__000000BD25E66810 000000067F000080000009A00C00000E4F97-000000067F000080000009A0140000019842__000000BCEF79BE91-000000BD263A5849 000000067F000080000009A00C00000E8000-000000067F000080000009A00C00000EC000__000000BC53F74828 000000067F000080000009A00C00000E8000-000000067F000080000009A00C00000EC000__000000BD25E66810 000000067F000080000009A00C00000E96C9-000000067F000080000009A00C00000F2C2B__000000BBE607E8F1-000000BC596B5D59 
000000067F000080000009A00C00000EC000-000000067F000080000009A00C00000F0000__000000BC53F74828 000000067F000080000009A00C00000EC000-000000067F000080000009A00C00000F0000__000000BD25E66810 000000067F000080000009A00C00000F0000-000000067F000080000009A00C00000F4000__000000BC53F74828 000000067F000080000009A00C00000F0000-000000067F000080000009A00C00000F4000__000000BD25E66810 000000067F000080000009A00C00000F248B-000000067F000080000009A0140000004031__000000BC596B5D59-000000BCEF79BE91 000000067F000080000009A00C00000F2C2B-000000067F000080000009A00C00000FC18E__000000BBE607E8F1-000000BC596B5D59 000000067F000080000009A00C00000F4000-000000067F000080000009A00C00000F8000__000000BC53F74828 000000067F000080000009A00C00000F4000-000000067F000080000009A00C00000F8000__000000BD25E66810 000000067F000080000009A00C00000F8000-000000067F000080000009A00C00000FC000__000000BC53F74828 000000067F000080000009A00C00000F8000-000000067F000080000009A00C00000FC000__000000BD25E66810 000000067F000080000009A00C00000FC000-000000067F000080000009A00C0000100000__000000BC53F74828 000000067F000080000009A00C00000FC000-000000067F000080000009A00C0000100000__000000BD25E66810 000000067F000080000009A00C00000FC18E-000000067F000080000009A00C00001056F2__000000BBE607E8F1-000000BC596B5D59 000000067F000080000009A00C0000100000-000000067F000080000009A00C0000104000__000000BC53F74828 000000067F000080000009A00C0000100000-000000067F000080000009A00C0000104000__000000BD25E66810 000000067F000080000009A00C0000104000-000000067F000080000009A00C0000108000__000000BC53F74828 000000067F000080000009A00C0000104000-000000067F000080000009A00C0000108000__000000BD25E66810 000000067F000080000009A00C00001056F2-000000067F000080000009A00C000010EC54__000000BBE607E8F1-000000BC596B5D59 000000067F000080000009A00C0000108000-000000067F000080000009A00C000010C000__000000BC53F74828 000000067F000080000009A00C0000108000-000000067F000080000009A00C000010C000__000000BD25E66810 000000067F000080000009A00C000010C000-000000067F000080000009A00C0000110000__000000BC53F74828 
000000067F000080000009A00C000010C000-000000067F000080000009A00C0000110000__000000BD25E66810 000000067F000080000009A00C000010EC54-010000000000000001000000040000000020__000000BBE607E8F1-000000BC596B5D59 000000067F000080000009A00C0000110000-000000067F000080000009A0120100000000__000000BD25E66810 000000067F000080000009A00C0000110000-030000000000000000000000000000000002__000000BC53F74828 000000067F000080000009A0140000000000-000000067F000080000009A0140000004000__000000BD25E66810 000000067F000080000009A0140000004000-000000067F000080000009A0140000008000__000000BD25E66810 000000067F000080000009A0140000004031-000000067F000080000009A0140000009FC7__000000BC596B5D59-000000BCEF79BE91 000000067F000080000009A0140000008000-000000067F000080000009A014000000C000__000000BD25E66810 000000067F000080000009A0140000009FC7-000000067F000080000009A014000000FF53__000000BC596B5D59-000000BCEF79BE91 000000067F000080000009A014000000C000-000000067F000080000009A0140000010000__000000BD25E66810 000000067F000080000009A014000000FF53-000000067F000080000009A0140000015F1C__000000BC596B5D59-000000BCEF79BE91 000000067F000080000009A0140000010000-000000067F000080000009A0140000014000__000000BD25E66810 000000067F000080000009A0140000014000-000000067F000080000009A0140000018000__000000BD25E66810 000000067F000080000009A0140000015F1C-000000067F000080000009A014000001BED0__000000BC596B5D59-000000BCEF79BE91 000000067F000080000009A0140000018000-000000067F000080000009A014000001C000__000000BD25E66810 000000067F000080000009A0140000019844-030000000000000000000000000000000002__000000BCEF79BE91-000000BD263A5849 000000067F000080000009A014000001BED0-000000067F000080000009A0140000021E6C__000000BC596B5D59-000000BCEF79BE91 000000067F000080000009A014000001C000-000000067F000080000009A0140000020000__000000BD25E66810 000000067F000080000009A0140000020000-000000067F000080000009A0140000024000__000000BD25E66810 000000067F000080000009A0140000021E6C-000000067F000080000009A0140000027DB1__000000BC596B5D59-000000BCEF79BE91 
000000067F000080000009A0140000024000-000000067F000080000009A0140000028000__000000BD25E66810 000000067F000080000009A0140000027DB1-000000067F000080000009A014000002DC9E__000000BC596B5D59-000000BCEF79BE91 000000067F000080000009A0140000028000-000000067F000080000009A014000002C000__000000BD25E66810 000000067F000080000009A014000002C000-030000000000000000000000000000000002__000000BD25E66810 000000067F000080000009A01400FFFFFFFF-030000000000000000000000000000000002__000000BC596B5D59-000000BCEF79BE91 000000067F000080000009C00C0000000000-000000067F000080000009C00C0000004000__000000BEF683BFD0 000000067F000080000009C00C0000004000-000000067F000080000009C00C0000008000__000000BEF683BFD0 000000067F000080000009C00C0000008000-000000067F000080000009C00C000000C000__000000BEF683BFD0 000000067F000080000009C00C0000009749-000000067F000080000009C00C0000012EAF__000000BD263A5849-000000BDA607F261 000000067F000080000009C00C000000C000-000000067F000080000009C00C0000010000__000000BEF683BFD0 000000067F000080000009C00C0000010000-000000067F000080000009C00C0000014000__000000BEF683BFD0 000000067F000080000009C00C0000012EAF-000000067F000080000009C00C000001C60B__000000BD263A5849-000000BDA607F261 000000067F000080000009C00C0000014000-000000067F000080000009C00C0000018000__000000BEF683BFD0 000000067F000080000009C00C0000018000-000000067F000080000009C00C000001C000__000000BEF683BFD0 000000067F000080000009C00C000001C000-000000067F000080000009C00C0000020000__000000BEF683BFD0 000000067F000080000009C00C000001C60B-000000067F000080000009C00C0000025D39__000000BD263A5849-000000BDA607F261 000000067F000080000009C00C0000020000-000000067F000080000009C00C0000024000__000000BEF683BFD0 000000067F000080000009C00C0000024000-000000067F000080000009C00C0000028000__000000BEF683BFD0 000000067F000080000009C00C0000025D39-000000067F000080000009C00C000002F49F__000000BD263A5849-000000BDA607F261 000000067F000080000009C00C0000028000-000000067F000080000009C00C000002C000__000000BEF683BFD0 
000000067F000080000009C00C000002C000-000000067F000080000009C00C0000030000__000000BEF683BFD0 000000067F000080000009C00C000002F49F-000000067F000080000009C00C0000038BB2__000000BD263A5849-000000BDA607F261 000000067F000080000009C00C0000030000-000000067F000080000009C00C0000034000__000000BEF683BFD0 000000067F000080000009C00C0000034000-000000067F000080000009C00C0000038000__000000BEF683BFD0 000000067F000080000009C00C0000038000-000000067F000080000009C00C000003C000__000000BEF683BFD0 000000067F000080000009C00C0000038BB2-000000067F000080000009C00C0000042318__000000BD263A5849-000000BDA607F261 000000067F000080000009C00C000003C000-000000067F000080000009C00C0000040000__000000BEF683BFD0 000000067F000080000009C00C0000040000-000000067F000080000009C00C0000044000__000000BEF683BFD0 000000067F000080000009C00C0000042318-000000067F000080000009C00C000004BA7E__000000BD263A5849-000000BDA607F261 000000067F000080000009C00C0000044000-000000067F000080000009C00C0000048000__000000BEF683BFD0 000000067F000080000009C00C0000048000-000000067F000080000009C00C000004C000__000000BEF06884C8 000000067F000080000009C00C000004BA7E-030000000000000000000000000000000002__000000BD263A5849-000000BDA607F261 000000067F000080000009C00C000004BAC3-000000067F000080000009C00C00000551F8__000000BDA607F261-000000BE45CBFBB9 000000067F000080000009C00C000004C000-000000067F000080000009C00C0000050000__000000BEF06884C8 000000067F000080000009C00C0000050000-000000067F000080000009C00C0000054000__000000BEF06884C8 000000067F000080000009C00C0000054000-000000067F000080000009C00C0000058000__000000BEF06884C8 000000067F000080000009C00C00000551F8-000000067F000080000009C00C000005E90C__000000BDA607F261-000000BE45CBFBB9 000000067F000080000009C00C0000058000-000000067F000080000009C00C000005C000__000000BEF06884C8 000000067F000080000009C00C000005C000-000000067F000080000009C00C0000060000__000000BEF06884C8 000000067F000080000009C00C000005E90C-000000067F000080000009C00C000006802C__000000BDA607F261-000000BE45CBFBB9 
000000067F000080000009C00C0000060000-000000067F000080000009C00C0000064000__000000BEF06884C8 000000067F000080000009C00C0000064000-000000067F000080000009C00C0000068000__000000BEF06884C8 000000067F000080000009C00C0000068000-000000067F000080000009C00C000006C000__000000BEF06884C8 000000067F000080000009C00C000006802C-000000067F000080000009C00C0000071783__000000BDA607F261-000000BE45CBFBB9 000000067F000080000009C00C000006C000-000000067F000080000009C00C0000070000__000000BEF06884C8 000000067F000080000009C00C0000070000-000000067F000080000009C00C0000074000__000000BEF06884C8 000000067F000080000009C00C0000071783-000000067F000080000009C00C000007AEE9__000000BDA607F261-000000BE45CBFBB9 000000067F000080000009C00C0000074000-000000067F000080000009C00C0000078000__000000BEF06884C8 000000067F000080000009C00C0000078000-000000067F000080000009C00C000007C000__000000BEF06884C8 000000067F000080000009C00C000007AEE9-000000067F000080000009C00C000008460B__000000BDA607F261-000000BE45CBFBB9 000000067F000080000009C00C000007C000-000000067F000080000009C00C0000080000__000000BEF06884C8 000000067F000080000009C00C0000080000-000000067F000080000009C00C0000084000__000000BEF06884C8 000000067F000080000009C00C0000084000-000000067F000080000009C00C0000088000__000000BEF06884C8 000000067F000080000009C00C000008460B-000000067F000080000009C00C000008DD71__000000BDA607F261-000000BE45CBFBB9 000000067F000080000009C00C0000088000-000000067F000080000009C00C000008C000__000000BEF06884C8 000000067F000080000009C00C000008C000-000000067F000080000009C00C0000090000__000000BEF06884C8 000000067F000080000009C00C000008DD71-000000067F000080000009C00C00000974D7__000000BDA607F261-000000BE45CBFBB9 000000067F000080000009C00C0000090000-000000067F000080000009C00C0000094000__000000BEF06884C8 000000067F000080000009C00C0000094000-000000067F000080000009C00C0000098000__000000BEF06884C8 000000067F000080000009C00C00000974D7-000000067F000080000009C00C00000A0C0B__000000BDA607F261-000000BE45CBFBB9 
000000067F000080000009C00C0000098000-000000067F000080000009C00C000009C000__000000BEF06884C8 000000067F000080000009C00C000009C000-000000067F000080000009C00C00000A0000__000000BEF06884C8 000000067F000080000009C00C00000A0000-000000067F000080000009C00C00000A4000__000000BEF06884C8 000000067F000080000009C00C00000A0C0B-000000067F000080000009C00C00000AA371__000000BDA607F261-000000BE45CBFBB9 000000067F000080000009C00C00000A4000-000000067F000080000009C00C00000A8000__000000BEF06884C8 000000067F000080000009C00C00000A8000-000000067F000080000009C00C00000AC000__000000BEF06884C8 000000067F000080000009C00C00000AA371-000000067F000080000009C00C0100000000__000000BDA607F261-000000BE45CBFBB9 000000067F000080000009C00C00000AC000-000000067F000080000009C00C00000B0000__000000BEF06884C8 000000067F000080000009C00C00000B0000-000000067F000080000009C00C00000B4000__000000BEF06884C8 000000067F000080000009C00C00000B2921-000000067F000080000009C00C00000BC087__000000BE45CBFBB9-000000BEF5F47FD1 000000067F000080000009C00C00000B4000-000000067F000080000009C00C00000B8000__000000BEF06884C8 000000067F000080000009C00C00000B8000-000000067F000080000009C00C00000BC000__000000BEF06884C8 000000067F000080000009C00C00000BC000-000000067F000080000009C00C00000C0000__000000BEF06884C8 000000067F000080000009C00C00000BC087-000000067F000080000009C00C00000C57B8__000000BE45CBFBB9-000000BEF5F47FD1 000000067F000080000009C00C00000C0000-000000067F000080000009C00C00000C4000__000000BEF06884C8 000000067F000080000009C00C00000C4000-000000067F000080000009C00C00000C8000__000000BEF06884C8 000000067F000080000009C00C00000C57B8-000000067F000080000009C00C00000CEF09__000000BE45CBFBB9-000000BEF5F47FD1 000000067F000080000009C00C00000C8000-000000067F000080000009C00C00000CC000__000000BEF06884C8 000000067F000080000009C00C00000CC000-000000067F000080000009C00C00000D0000__000000BEF06884C8 000000067F000080000009C00C00000CEF09-000000067F000080000009C00C00000D862B__000000BE45CBFBB9-000000BEF5F47FD1 
000000067F000080000009C00C00000D0000-000000067F000080000009C00C00000D4000__000000BEF06884C8 000000067F000080000009C00C00000D4000-000000067F000080000009C00C00000D8000__000000BEF06884C8 000000067F000080000009C00C00000D8000-000000067F000080000009C00C00000DC000__000000BEF06884C8 000000067F000080000009C00C00000D862B-000000067F000080000009C00C00000E1D7F__000000BE45CBFBB9-000000BEF5F47FD1 000000067F000080000009C00C00000DC000-000000067F000080000009C00C00000E0000__000000BEF06884C8 000000067F000080000009C00C00000E0000-000000067F000080000009C00C00000E4000__000000BEF06884C8 000000067F000080000009C00C00000E1D7F-000000067F000080000009C00C00000EB4E5__000000BE45CBFBB9-000000BEF5F47FD1 000000067F000080000009C00C00000E4000-000000067F000080000009C00C00000E8000__000000BEF06884C8 000000067F000080000009C00C00000E8000-000000067F000080000009C00C00000EC000__000000BEF06884C8 000000067F000080000009C00C00000EB4E5-000000067F000080000009C00C00000F4C0B__000000BE45CBFBB9-000000BEF5F47FD1 000000067F000080000009C00C00000EC000-000000067F000080000009C00C00000F0000__000000BEF06884C8 000000067F000080000009C00C00000F0000-000000067F000080000009C00C00000F4000__000000BEF06884C8 000000067F000080000009C00C00000F4000-000000067F000080000009C00C00000F8000__000000BEF06884C8 000000067F000080000009C00C00000F4C0B-000000067F000080000009C00C00000FE371__000000BE45CBFBB9-000000BEF5F47FD1 000000067F000080000009C00C00000F8000-000000067F000080000009C00C00000FC000__000000BEF06884C8 000000067F000080000009C00C00000FC000-000000067F000080000009C00C0000100000__000000BEF06884C8 000000067F000080000009C00C00000FE371-000000067F000080000009C00C0000107AD7__000000BE45CBFBB9-000000BEF5F47FD1 000000067F000080000009C00C0000100000-000000067F000080000009C00C0000104000__000000BEF06884C8 000000067F000080000009C00C0000104000-000000067F000080000009C00C0000108000__000000BEF06884C8 000000067F000080000009C00C0000107AD7-000000067F000080000009C00C000011120B__000000BE45CBFBB9-000000BEF5F47FD1 
000000067F000080000009C00C0000108000-000000067F000080000009C00C000010C000__000000BEF06884C8 000000067F000080000009C00C000010C000-030000000000000000000000000000000002__000000BEF06884C8 000000067F000080000009C00C000011120B-010000000000000001000000050000000003__000000BE45CBFBB9-000000BEF5F47FD1 000000067F000080000009E00C0000000000-000000067F000080000009E00C0000004000__000000C0C9769FD8 000000067F000080000009E00C0000004000-000000067F000080000009E00C0000008000__000000C0C9769FD8 000000067F000080000009E00C0000004916-000000067F000080000009E00C000000E07C__000000BEF5F47FD1-000000BF48FFEB11 000000067F000080000009E00C0000008000-000000067F000080000009E00C000000C000__000000C0C9769FD8 000000067F000080000009E00C000000C000-000000067F000080000009E00C0000010000__000000C0C9769FD8 000000067F000080000009E00C000000E07C-000000067F000080000009E00C000001779A__000000BEF5F47FD1-000000BF48FFEB11 000000067F000080000009E00C0000010000-000000067F000080000009E00C0000014000__000000C0C9769FD8 000000067F000080000009E00C0000014000-000000067F000080000009E00C0000018000__000000C0C9769FD8 000000067F000080000009E00C000001779A-000000067F000080000009E00C0000020F00__000000BEF5F47FD1-000000BF48FFEB11 000000067F000080000009E00C0000018000-000000067F000080000009E00C000001C000__000000C0C9769FD8 000000067F000080000009E00C000001C000-000000067F000080000009E00C0000020000__000000C0C9769FD8 000000067F000080000009E00C0000020000-000000067F000080000009E00C0000024000__000000C0C9769FD8 000000067F000080000009E00C0000020F00-000000067F000080000009E00C000002A60B__000000BEF5F47FD1-000000BF48FFEB11 000000067F000080000009E00C0000024000-000000067F000080000009E00C0000028000__000000C0C9769FD8 000000067F000080000009E00C0000028000-000000067F000080000009E00C000002C000__000000C0C9769FD8 000000067F000080000009E00C000002A60B-030000000000000000000000000000000002__000000BEF5F47FD1-000000BF48FFEB11 000000067F000080000009E00C000002C000-000000067F000080000009E00C0000030000__000000C0B597E900 
000000067F000080000009E00C000002C000-000000067F000080000009E00C0000030000__000000C1972392A8 000000067F000080000009E00C000002F506-000000067F000080000009E00C0000038C11__000000BF48FFEB11-000000BFF8BDFEE9 000000067F000080000009E00C0000030000-000000067F000080000009E00C0000034000__000000C0B597E900 000000067F000080000009E00C0000030000-000000067F000080000009E00C0000034000__000000C1972392A8 000000067F000080000009E00C0000034000-000000067F000080000009E00C0000038000__000000C0B597E900 000000067F000080000009E00C0000034000-000000067F000080000009E00C0000038000__000000C1972392A8 000000067F000080000009E00C0000038000-000000067F000080000009E00C000003C000__000000C0B597E900 000000067F000080000009E00C0000038000-000000067F000080000009E00C000003C000__000000C1972392A8 000000067F000080000009E00C0000038C11-000000067F000080000009E00C0000042361__000000BF48FFEB11-000000BFF8BDFEE9 000000067F000080000009E00C000003C000-000000067F000080000009E00C0000040000__000000C0B597E900 000000067F000080000009E00C000003C000-000000067F000080000009E00C0000040000__000000C1972392A8 000000067F000080000009E00C0000040000-000000067F000080000009E00C0000044000__000000C0B597E900 000000067F000080000009E00C0000040000-000000067F000080000009E00C0000044000__000000C1972392A8 000000067F000080000009E00C0000042361-000000067F000080000009E00C000004BAC7__000000BF48FFEB11-000000BFF8BDFEE9 000000067F000080000009E00C0000044000-000000067F000080000009E00C0000048000__000000C0B597E900 000000067F000080000009E00C0000044000-000000067F000080000009E00C0000048000__000000C1972392A8 000000067F000080000009E00C0000048000-000000067F000080000009E00C000004C000__000000C0B597E900 000000067F000080000009E00C0000048000-000000067F000080000009E00C000004C000__000000C1972392A8 000000067F000080000009E00C000004BAC7-000000067F000080000009E00C00000551FC__000000BF48FFEB11-000000BFF8BDFEE9 000000067F000080000009E00C000004C000-000000067F000080000009E00C0000050000__000000C0B597E900 000000067F000080000009E00C000004C000-000000067F000080000009E00C0000050000__000000C1972392A8 
000000067F000080000009E00C0000050000-000000067F000080000009E00C0000054000__000000C0B597E900 000000067F000080000009E00C0000050000-000000067F000080000009E00C0000054000__000000C1972392A8 000000067F000080000009E00C0000050E89-000000067F000080000009E00C00000A18A0__000000C1426D92E1-000000C19744E959 000000067F000080000009E00C0000054000-000000067F000080000009E00C0000058000__000000C0B597E900 000000067F000080000009E00C0000054000-000000067F000080000009E00C0000058000__000000C1972392A8 000000067F000080000009E00C00000551FC-000000067F000080000009E00C000005E90B__000000BF48FFEB11-000000BFF8BDFEE9 000000067F000080000009E00C0000058000-000000067F000080000009E00C000005C000__000000C0B597E900 000000067F000080000009E00C0000058000-000000067F000080000009E00C000005C000__000000C1972392A8 000000067F000080000009E00C000005C000-000000067F000080000009E00C0000060000__000000C0B597E900 000000067F000080000009E00C000005C000-000000067F000080000009E00C0000060000__000000C1972392A8 000000067F000080000009E00C000005E90B-000000067F000080000009E00C000006802B__000000BF48FFEB11-000000BFF8BDFEE9 000000067F000080000009E00C0000060000-000000067F000080000009E00C0000064000__000000C0B597E900 000000067F000080000009E00C0000060000-000000067F000080000009E00C0000064000__000000C1972392A8 000000067F000080000009E00C0000064000-000000067F000080000009E00C0000068000__000000C0B597E900 000000067F000080000009E00C0000064000-000000067F000080000009E00C0000068000__000000C1972392A8 000000067F000080000009E00C0000068000-000000067F000080000009E00C000006C000__000000C0B597E900 000000067F000080000009E00C0000068000-000000067F000080000009E00C000006C000__000000C1972392A8 000000067F000080000009E00C000006802B-000000067F000080000009E00C0000071782__000000BF48FFEB11-000000BFF8BDFEE9 000000067F000080000009E00C000006C000-000000067F000080000009E00C0000070000__000000C0B597E900 000000067F000080000009E00C000006C000-000000067F000080000009E00C0000070000__000000C1972392A8 000000067F000080000009E00C0000070000-000000067F000080000009E00C0000074000__000000C0B597E900 
000000067F000080000009E00C0000070000-000000067F000080000009E00C0000074000__000000C1972392A8 000000067F000080000009E00C0000071782-000000067F000080000009E00C000007AEE8__000000BF48FFEB11-000000BFF8BDFEE9 000000067F000080000009E00C0000074000-000000067F000080000009E00C0000078000__000000C0B597E900 000000067F000080000009E00C0000074000-000000067F000080000009E00C0000078000__000000C1972392A8 000000067F000080000009E00C0000078000-000000067F000080000009E00C000007C000__000000C0B597E900 000000067F000080000009E00C0000078000-000000067F000080000009E00C000007C000__000000C1972392A8 000000067F000080000009E00C000007AEE8-000000067F000080000009E00C000008460B__000000BF48FFEB11-000000BFF8BDFEE9 000000067F000080000009E00C000007C000-000000067F000080000009E00C0000080000__000000C0B597E900 000000067F000080000009E00C000007C000-000000067F000080000009E00C0000080000__000000C1972392A8 000000067F000080000009E00C0000080000-000000067F000080000009E00C0000084000__000000C0B597E900 000000067F000080000009E00C0000080000-000000067F000080000009E00C0000084000__000000C1972392A8 000000067F000080000009E00C0000084000-000000067F000080000009E00C0000088000__000000C0B597E900 000000067F000080000009E00C0000084000-000000067F000080000009E00C0000088000__000000C1972392A8 000000067F000080000009E00C000008460B-000000067F000080000009E00C000008DD71__000000BF48FFEB11-000000BFF8BDFEE9 000000067F000080000009E00C0000088000-000000067F000080000009E00C000008C000__000000C0B597E900 000000067F000080000009E00C0000088000-000000067F000080000009E00C000008C000__000000C1972392A8 000000067F000080000009E00C000008C000-000000067F000080000009E00C0000090000__000000C0B597E900 000000067F000080000009E00C000008C000-000000067F000080000009E00C0000090000__000000C1972392A8 000000067F000080000009E00C000008DD71-000000067F000080000009E00C00000974D7__000000BF48FFEB11-000000BFF8BDFEE9 000000067F000080000009E00C0000090000-000000067F000080000009E00C0000094000__000000C0B597E900 000000067F000080000009E00C0000090000-000000067F000080000009E00C0000094000__000000C1972392A8 
000000067F000080000009E00C0000094000-000000067F000080000009E00C0000098000__000000C0B597E900 000000067F000080000009E00C0000094000-000000067F000080000009E00C0000098000__000000C1972392A8 000000067F000080000009E00C00000974D7-000000067F000080000009E00C0100000000__000000BF48FFEB11-000000BFF8BDFEE9 000000067F000080000009E00C0000098000-000000067F000080000009E00C000009C000__000000C0B597E900 000000067F000080000009E00C0000098000-000000067F000080000009E00C000009C000__000000C1972392A8 000000067F000080000009E00C000009C000-000000067F000080000009E00C00000A0000__000000C0B597E900 000000067F000080000009E00C000009C000-000000067F000080000009E00C00000A0000__000000C1972392A8 000000067F000080000009E00C000009FB21-000000067F000080000009E00C00000A9230__000000BFF8BDFEE9-000000C0C8CA5FF1 000000067F000080000009E00C00000A0000-000000067F000080000009E00C00000A4000__000000C0B597E900 000000067F000080000009E00C00000A0000-000000067F000080000009E00C00000A4000__000000C1972392A8 000000067F000080000009E00C00000A18A4-000000067F000080000009E00C00000F2B76__000000C1426D92E1-000000C19744E959 000000067F000080000009E00C00000A4000-000000067F000080000009E00C00000A8000__000000C0B597E900 000000067F000080000009E00C00000A4000-000000067F000080000009E00C00000A8000__000000C1972392A8 000000067F000080000009E00C00000A8000-000000067F000080000009E00C00000AC000__000000C0B597E900 000000067F000080000009E00C00000A8000-000000067F000080000009E00C00000AC000__000000C1972392A8 000000067F000080000009E00C00000A9230-000000067F000080000009E00C00000B297D__000000BFF8BDFEE9-000000C0C8CA5FF1 000000067F000080000009E00C00000AC000-000000067F000080000009E00C00000B0000__000000C0B597E900 000000067F000080000009E00C00000AC000-000000067F000080000009E00C00000B0000__000000C1972392A8 000000067F000080000009E00C00000B0000-000000067F000080000009E00C00000B4000__000000C0B597E900 000000067F000080000009E00C00000B0000-000000067F000080000009E00C00000B4000__000000C1972392A8 
000000067F000080000009E00C00000B297D-000000067F000080000009E00C00000BC0E3__000000BFF8BDFEE9-000000C0C8CA5FF1 000000067F000080000009E00C00000B4000-000000067F000080000009E00C00000B8000__000000C0B597E900 000000067F000080000009E00C00000B4000-000000067F000080000009E00C00000B8000__000000C1972392A8 000000067F000080000009E00C00000B8000-000000067F000080000009E00C00000BC000__000000C0B597E900 000000067F000080000009E00C00000B8000-000000067F000080000009E00C00000BC000__000000C1972392A8 000000067F000080000009E00C00000BC000-000000067F000080000009E00C00000C0000__000000C0B597E900 000000067F000080000009E00C00000BC000-000000067F000080000009E00C00000C0000__000000C1972392A8 000000067F000080000009E00C00000BC0E3-000000067F000080000009E00C00000C580C__000000BFF8BDFEE9-000000C0C8CA5FF1 000000067F000080000009E00C00000C0000-000000067F000080000009E00C00000C4000__000000C0B597E900 000000067F000080000009E00C00000C0000-000000067F000080000009E00C00000C4000__000000C1972392A8 000000067F000080000009E00C00000C0C74-000000067F000080000009E0140000001880__000000C0C8CA5FF1-000000C1426D92E1 000000067F000080000009E00C00000C4000-000000067F000080000009E00C00000C8000__000000C0B597E900 000000067F000080000009E00C00000C4000-000000067F000080000009E00C00000C8000__000000C1972392A8 000000067F000080000009E00C00000C580C-000000067F000080000009E00C00000CEF71__000000BFF8BDFEE9-000000C0C8CA5FF1 000000067F000080000009E00C00000C8000-000000067F000080000009E00C00000CC000__000000C0B597E900 000000067F000080000009E00C00000C8000-000000067F000080000009E00C00000CC000__000000C1972392A8 000000067F000080000009E00C00000CC000-000000067F000080000009E00C00000D0000__000000C0B597E900 000000067F000080000009E00C00000CC000-000000067F000080000009E00C00000D0000__000000C1972392A8 000000067F000080000009E00C00000CEF71-000000067F000080000009E00C00000D86D7__000000BFF8BDFEE9-000000C0C8CA5FF1 000000067F000080000009E00C00000D0000-000000067F000080000009E00C00000D4000__000000C0B597E900 
000000067F000080000009E00C00000D0000-000000067F000080000009E00C00000D4000__000000C1972392A8 000000067F000080000009E00C00000D4000-000000067F000080000009E00C00000D8000__000000C0B597E900 000000067F000080000009E00C00000D4000-000000067F000080000009E00C00000D8000__000000C1972392A8 000000067F000080000009E00C00000D8000-000000067F000080000009E00C00000DC000__000000C0B597E900 000000067F000080000009E00C00000D8000-000000067F000080000009E00C00000DC000__000000C1972392A8 000000067F000080000009E00C00000D86D7-000000067F000080000009E00C00000E1E0C__000000BFF8BDFEE9-000000C0C8CA5FF1 000000067F000080000009E00C00000DC000-000000067F000080000009E00C00000E0000__000000C0B597E900 000000067F000080000009E00C00000DC000-000000067F000080000009E00C00000E0000__000000C1972392A8 000000067F000080000009E00C00000E0000-000000067F000080000009E00C00000E4000__000000C0B597E900 000000067F000080000009E00C00000E0000-000000067F000080000009E00C00000E4000__000000C1972392A8 000000067F000080000009E00C00000E1E0C-000000067F000080000009E00C00000EB572__000000BFF8BDFEE9-000000C0C8CA5FF1 000000067F000080000009E00C00000E4000-000000067F000080000009E00C00000E8000__000000C0B597E900 000000067F000080000009E00C00000E4000-000000067F000080000009E00C00000E8000__000000C1972392A8 000000067F000080000009E00C00000E8000-000000067F000080000009E00C00000EC000__000000C0B597E900 000000067F000080000009E00C00000E8000-000000067F000080000009E00C00000EC000__000000C1972392A8 000000067F000080000009E00C00000EB572-000000067F000080000009E00C00000F4CD8__000000BFF8BDFEE9-000000C0C8CA5FF1 000000067F000080000009E00C00000EC000-000000067F000080000009E00C00000F0000__000000C0B597E900 000000067F000080000009E00C00000EC000-000000067F000080000009E00C00000F0000__000000C1972392A8 000000067F000080000009E00C00000F0000-000000067F000080000009E00C00000F4000__000000C0B597E900 000000067F000080000009E00C00000F0000-000000067F000080000009E00C00000F4000__000000C1972392A8 000000067F000080000009E00C00000F2B77-000000067F000080000009E014000000D3EB__000000C1426D92E1-000000C19744E959 
000000067F000080000009E00C00000F4000-000000067F000080000009E00C00000F8000__000000C0B597E900 000000067F000080000009E00C00000F4000-000000067F000080000009E00C00000F8000__000000C1972392A8 000000067F000080000009E00C00000F4CD8-000000067F000080000009E00C00000FE40B__000000BFF8BDFEE9-000000C0C8CA5FF1 000000067F000080000009E00C00000F8000-000000067F000080000009E00C00000FC000__000000C0B597E900 000000067F000080000009E00C00000F8000-000000067F000080000009E00C00000FC000__000000C1972392A8 000000067F000080000009E00C00000FC000-000000067F000080000009E00C0000100000__000000C0B597E900 000000067F000080000009E00C00000FC000-000000067F000080000009E00C0000100000__000000C1972392A8 000000067F000080000009E00C00000FE40B-000000067F000080000009E00C0000107B27__000000BFF8BDFEE9-000000C0C8CA5FF1 000000067F000080000009E00C0000100000-000000067F000080000009E00C0000104000__000000C0B597E900 000000067F000080000009E00C0000100000-000000067F000080000009E00C0000104000__000000C1972392A8 000000067F000080000009E00C0000104000-000000067F000080000009E00C0000108000__000000C1972392A8 000000067F000080000009E00C0000104000-030000000000000000000000000000000002__000000C0B597E900 000000067F000080000009E00C0000107B27-000000067F000080000009E00C000011128D__000000BFF8BDFEE9-000000C0C8CA5FF1 000000067F000080000009E00C0000108000-000000067F000080000009E00C000010C000__000000C1972392A8 000000067F000080000009E00C000010C000-000000067F000080000009E00C0000110000__000000C1972392A8 000000067F000080000009E00C0000110000-000000067F000080000009E0120100000000__000000C1972392A8 000000067F000080000009E00C000011128D-010000000000000001000000050000000003__000000BFF8BDFEE9-000000C0C8CA5FF1 000000067F000080000009E0140000000000-000000067F000080000009E0140000004000__000000C1972392A8 000000067F000080000009E0140000001880-000000067F000080000009E014000000842E__000000C0C8CA5FF1-000000C1426D92E1 000000067F000080000009E0140000004000-000000067F000080000009E0140000008000__000000C1972392A8 
000000067F000080000009E0140000008000-000000067F000080000009E014000000C000__000000C1972392A8 000000067F000080000009E014000000842E-000000067F000080000009E014000000F011__000000C0C8CA5FF1-000000C1426D92E1 000000067F000080000009E014000000C000-000000067F000080000009E0140000010000__000000C1972392A8 000000067F000080000009E014000000D3EB-000000067F000080000009E014000002578F__000000C1426D92E1-000000C19744E959 000000067F000080000009E014000000F011-000000067F000080000009E0140000015BD8__000000C0C8CA5FF1-000000C1426D92E1 000000067F000080000009E0140000010000-000000067F000080000009E0140000014000__000000C1972392A8 000000067F000080000009E0140000014000-000000067F000080000009E0140000018000__000000C1972392A8 000000067F000080000009E0140000015BD8-000000067F000080000009E014000001C7C5__000000C0C8CA5FF1-000000C1426D92E1 000000067F000080000009E0140000018000-000000067F000080000009E014000001C000__000000C1972392A8 000000067F000080000009E014000001C000-000000067F000080000009E0140000020000__000000C1972392A8 000000067F000080000009E014000001C7C5-000000067F000080000009E014000002337F__000000C0C8CA5FF1-000000C1426D92E1 000000067F000080000009E0140000020000-000000067F000080000009E0140000024000__000000C1972392A8 000000067F000080000009E014000002337F-000000067F000080000009E0140000029F4A__000000C0C8CA5FF1-000000C1426D92E1 000000067F000080000009E0140000024000-000000067F000080000009E0140000028000__000000C1972392A8 000000067F000080000009E0140000025790-030000000000000000000000000000000002__000000C1426D92E1-000000C19744E959 000000067F000080000009E0140000028000-000000067F000080000009E014000002C000__000000C1972392A8 000000067F000080000009E0140000029F4A-030000000000000000000000000000000002__000000C0C8CA5FF1-000000C1426D92E1 000000067F000080000009E014000002C000-030000000000000000000000000000000002__000000C1972392A8 000000067F00008000000A000C0000000000-000000067F00008000000A000C0000004000__000000C3687EDFE8 000000067F00008000000A000C0000004000-000000067F00008000000A000C0000008000__000000C3687EDFE8 
000000067F00008000000A000C0000008000-000000067F00008000000A000C000000C000__000000C3687EDFE8 000000067F00008000000A000C0000008EF9-000000067F00008000000A000C000001260C__000000C19744E959-000000C217F3F379 000000067F00008000000A000C000000C000-000000067F00008000000A000C0000010000__000000C3687EDFE8 000000067F00008000000A000C0000010000-000000067F00008000000A000C0000014000__000000C3687EDFE8 000000067F00008000000A000C000001260C-000000067F00008000000A000C000001BD72__000000C19744E959-000000C217F3F379 000000067F00008000000A000C0000014000-000000067F00008000000A000C0000018000__000000C3687EDFE8 000000067F00008000000A000C0000018000-000000067F00008000000A000C000001C000__000000C3687EDFE8 000000067F00008000000A000C000001BD72-000000067F00008000000A000C00000254D8__000000C19744E959-000000C217F3F379 000000067F00008000000A000C000001C000-000000067F00008000000A000C0000020000__000000C3687EDFE8 000000067F00008000000A000C0000020000-000000067F00008000000A000C0000024000__000000C3687EDFE8 000000067F00008000000A000C0000024000-000000067F00008000000A000C0000028000__000000C3687EDFE8 000000067F00008000000A000C00000254D8-000000067F00008000000A000C000002EC0B__000000C19744E959-000000C217F3F379 000000067F00008000000A000C0000028000-000000067F00008000000A000C000002C000__000000C3687EDFE8 000000067F00008000000A000C000002C000-000000067F00008000000A000C0000030000__000000C3687EDFE8 000000067F00008000000A000C000002EC0B-000000067F00008000000A000C0000038322__000000C19744E959-000000C217F3F379 000000067F00008000000A000C0000030000-000000067F00008000000A000C0000034000__000000C3687EDFE8 000000067F00008000000A000C0000034000-000000067F00008000000A000C0000038000__000000C3687EDFE8 000000067F00008000000A000C0000038000-000000067F00008000000A000C000003C000__000000C3687EDFE8 000000067F00008000000A000C0000038322-000000067F00008000000A000C0000041A88__000000C19744E959-000000C217F3F379 000000067F00008000000A000C000003C000-000000067F00008000000A000C0000040000__000000C3687EDFE8 
000000067F00008000000A000C0000040000-000000067F00008000000A000C0000044000__000000C3687EDFE8 000000067F00008000000A000C0000041A88-000000067F00008000000A000C000004B1EE__000000C19744E959-000000C217F3F379 000000067F00008000000A000C0000044000-000000067F00008000000A000C0000048000__000000C3687EDFE8 000000067F00008000000A000C0000048000-000000067F00008000000A000C000004C000__000000C366619FD8 000000067F00008000000A000C0000048000-000000067F00008000000A000C000004C000__000000C42FE73810 000000067F00008000000A000C000004B1EE-030000000000000000000000000000000002__000000C19744E959-000000C217F3F379 000000067F00008000000A000C000004BACE-000000067F00008000000A000C0000055202__000000C217F3F379-000000C2C7B1ECC1 000000067F00008000000A000C000004C000-000000067F00008000000A000C0000050000__000000C366619FD8 000000067F00008000000A000C000004C000-000000067F00008000000A000C0000050000__000000C42FE73810 000000067F00008000000A000C0000050000-000000067F00008000000A000C0000054000__000000C366619FD8 000000067F00008000000A000C0000050000-000000067F00008000000A000C0000054000__000000C42FE73810 000000067F00008000000A000C0000054000-000000067F00008000000A000C0000058000__000000C366619FD8 000000067F00008000000A000C0000054000-000000067F00008000000A000C0000058000__000000C42FE73810 000000067F00008000000A000C0000055202-000000067F00008000000A000C000005E90D__000000C217F3F379-000000C2C7B1ECC1 000000067F00008000000A000C0000056365-000000067F00008000000A000C00000ACA1A__000000C3E17E01A1-000000C430961E71 000000067F00008000000A000C0000058000-000000067F00008000000A000C000005C000__000000C366619FD8 000000067F00008000000A000C0000058000-000000067F00008000000A000C000005C000__000000C42FE73810 000000067F00008000000A000C000005C000-000000067F00008000000A000C0000060000__000000C366619FD8 000000067F00008000000A000C000005C000-000000067F00008000000A000C0000060000__000000C42FE73810 000000067F00008000000A000C000005E90D-000000067F00008000000A000C000006802B__000000C217F3F379-000000C2C7B1ECC1 
000000067F00008000000A000C0000060000-000000067F00008000000A000C0000064000__000000C366619FD8 000000067F00008000000A000C0000060000-000000067F00008000000A000C0000064000__000000C42FE73810 000000067F00008000000A000C0000064000-000000067F00008000000A000C0000068000__000000C366619FD8 000000067F00008000000A000C0000064000-000000067F00008000000A000C0000068000__000000C42FE73810 000000067F00008000000A000C0000068000-000000067F00008000000A000C000006C000__000000C366619FD8 000000067F00008000000A000C0000068000-000000067F00008000000A000C000006C000__000000C42FE73810 000000067F00008000000A000C000006802B-000000067F00008000000A000C0000071782__000000C217F3F379-000000C2C7B1ECC1 000000067F00008000000A000C000006C000-000000067F00008000000A000C0000070000__000000C366619FD8 000000067F00008000000A000C000006C000-000000067F00008000000A000C0000070000__000000C42FE73810 000000067F00008000000A000C0000070000-000000067F00008000000A000C0000074000__000000C366619FD8 000000067F00008000000A000C0000070000-000000067F00008000000A000C0000074000__000000C42FE73810 000000067F00008000000A000C0000071782-000000067F00008000000A000C000007AEE8__000000C217F3F379-000000C2C7B1ECC1 000000067F00008000000A000C0000074000-000000067F00008000000A000C0000078000__000000C366619FD8 000000067F00008000000A000C0000074000-000000067F00008000000A000C0000078000__000000C42FE73810 000000067F00008000000A000C0000078000-000000067F00008000000A000C000007C000__000000C366619FD8 000000067F00008000000A000C0000078000-000000067F00008000000A000C000007C000__000000C42FE73810 000000067F00008000000A000C000007AEE8-000000067F00008000000A000C000008460B__000000C217F3F379-000000C2C7B1ECC1 000000067F00008000000A000C000007C000-000000067F00008000000A000C0000080000__000000C366619FD8 000000067F00008000000A000C000007C000-000000067F00008000000A000C0000080000__000000C42FE73810 000000067F00008000000A000C0000080000-000000067F00008000000A000C0000084000__000000C366619FD8 000000067F00008000000A000C0000080000-000000067F00008000000A000C0000084000__000000C42FE73810 
000000067F00008000000A000C0000084000-000000067F00008000000A000C0000088000__000000C366619FD8 000000067F00008000000A000C0000084000-000000067F00008000000A000C0000088000__000000C42FE73810 000000067F00008000000A000C000008460B-000000067F00008000000A000C000008DD71__000000C217F3F379-000000C2C7B1ECC1 000000067F00008000000A000C0000088000-000000067F00008000000A000C000008C000__000000C366619FD8 000000067F00008000000A000C0000088000-000000067F00008000000A000C000008C000__000000C42FE73810 000000067F00008000000A000C000008C000-000000067F00008000000A000C0000090000__000000C366619FD8 000000067F00008000000A000C000008C000-000000067F00008000000A000C0000090000__000000C42FE73810 000000067F00008000000A000C000008DD71-000000067F00008000000A000C00000974D7__000000C217F3F379-000000C2C7B1ECC1 000000067F00008000000A000C0000090000-000000067F00008000000A000C0000094000__000000C366619FD8 000000067F00008000000A000C0000090000-000000067F00008000000A000C0000094000__000000C42FE73810 000000067F00008000000A000C0000094000-000000067F00008000000A000C0000098000__000000C366619FD8 000000067F00008000000A000C0000094000-000000067F00008000000A000C0000098000__000000C42FE73810 000000067F00008000000A000C00000974D7-000000067F00008000000A000C00000A0C0B__000000C217F3F379-000000C2C7B1ECC1 000000067F00008000000A000C0000098000-000000067F00008000000A000C000009C000__000000C366619FD8 000000067F00008000000A000C0000098000-000000067F00008000000A000C000009C000__000000C42FE73810 000000067F00008000000A000C000009C000-000000067F00008000000A000C00000A0000__000000C366619FD8 000000067F00008000000A000C000009C000-000000067F00008000000A000C00000A0000__000000C42FE73810 000000067F00008000000A000C00000A0000-000000067F00008000000A000C00000A4000__000000C366619FD8 000000067F00008000000A000C00000A0000-000000067F00008000000A000C00000A4000__000000C42FE73810 000000067F00008000000A000C00000A0C0B-000000067F00008000000A000C00000AA371__000000C217F3F379-000000C2C7B1ECC1 000000067F00008000000A000C00000A4000-000000067F00008000000A000C00000A8000__000000C366619FD8 
000000067F00008000000A000C00000A4000-000000067F00008000000A000C00000A8000__000000C42FE73810 000000067F00008000000A000C00000A8000-000000067F00008000000A000C00000AC000__000000C366619FD8 000000067F00008000000A000C00000A8000-000000067F00008000000A000C00000AC000__000000C42FE73810 000000067F00008000000A000C00000AA371-000000067F00008000000A000C00000B3AD7__000000C217F3F379-000000C2C7B1ECC1 000000067F00008000000A000C00000AC000-000000067F00008000000A000C00000B0000__000000C366619FD8 000000067F00008000000A000C00000AC000-000000067F00008000000A000C00000B0000__000000C42FE73810 000000067F00008000000A000C00000ACA25-000000067F00008000000A000C0000102D7C__000000C3E17E01A1-000000C430961E71 000000067F00008000000A000C00000B0000-000000067F00008000000A000C00000B4000__000000C366619FD8 000000067F00008000000A000C00000B0000-000000067F00008000000A000C00000B4000__000000C42FE73810 000000067F00008000000A000C00000B3AD7-000000067F00008000000A000C0100000000__000000C217F3F379-000000C2C7B1ECC1 000000067F00008000000A000C00000B4000-000000067F00008000000A000C00000B8000__000000C366619FD8 000000067F00008000000A000C00000B4000-000000067F00008000000A000C00000B8000__000000C42FE73810 000000067F00008000000A000C00000B8000-000000067F00008000000A000C00000BC000__000000C366619FD8 000000067F00008000000A000C00000B8000-000000067F00008000000A000C00000BC000__000000C42FE73810 000000067F00008000000A000C00000B8B52-000000067F00008000000A00140000001132__000000C367E48001-000000C3E17E01A1 000000067F00008000000A000C00000BC000-000000067F00008000000A000C00000C0000__000000C366619FD8 000000067F00008000000A000C00000BC000-000000067F00008000000A000C00000C0000__000000C42FE73810 000000067F00008000000A000C00000BC072-000000067F00008000000A000C00000C57A3__000000C2C7B1ECC1-000000C367E48001 000000067F00008000000A000C00000C0000-000000067F00008000000A000C00000C4000__000000C366619FD8 000000067F00008000000A000C00000C0000-000000067F00008000000A000C00000C4000__000000C42FE73810 
000000067F00008000000A000C00000C4000-000000067F00008000000A000C00000C8000__000000C366619FD8 000000067F00008000000A000C00000C4000-000000067F00008000000A000C00000C8000__000000C42FE73810 000000067F00008000000A000C00000C57A3-000000067F00008000000A000C00000CEF09__000000C2C7B1ECC1-000000C367E48001 000000067F00008000000A000C00000C8000-000000067F00008000000A000C00000CC000__000000C366619FD8 000000067F00008000000A000C00000C8000-000000067F00008000000A000C00000CC000__000000C42FE73810 000000067F00008000000A000C00000CC000-000000067F00008000000A000C00000D0000__000000C366619FD8 000000067F00008000000A000C00000CC000-000000067F00008000000A000C00000D0000__000000C42FE73810 000000067F00008000000A000C00000CEF09-000000067F00008000000A000C00000D862B__000000C2C7B1ECC1-000000C367E48001 000000067F00008000000A000C00000D0000-000000067F00008000000A000C00000D4000__000000C366619FD8 000000067F00008000000A000C00000D0000-000000067F00008000000A000C00000D4000__000000C42FE73810 000000067F00008000000A000C00000D4000-000000067F00008000000A000C00000D8000__000000C366619FD8 000000067F00008000000A000C00000D4000-000000067F00008000000A000C00000D8000__000000C42FE73810 000000067F00008000000A000C00000D8000-000000067F00008000000A000C00000DC000__000000C366619FD8 000000067F00008000000A000C00000D8000-000000067F00008000000A000C00000DC000__000000C42FE73810 000000067F00008000000A000C00000D862B-000000067F00008000000A000C00000E1D7F__000000C2C7B1ECC1-000000C367E48001 000000067F00008000000A000C00000DC000-000000067F00008000000A000C00000E0000__000000C366619FD8 000000067F00008000000A000C00000DC000-000000067F00008000000A000C00000E0000__000000C42FE73810 000000067F00008000000A000C00000E0000-000000067F00008000000A000C00000E4000__000000C366619FD8 000000067F00008000000A000C00000E0000-000000067F00008000000A000C00000E4000__000000C42FE73810 000000067F00008000000A000C00000E1D7F-000000067F00008000000A000C00000EB4E5__000000C2C7B1ECC1-000000C367E48001 000000067F00008000000A000C00000E4000-000000067F00008000000A000C00000E8000__000000C366619FD8 
000000067F00008000000A000C00000E4000-000000067F00008000000A000C00000E8000__000000C42FE73810 000000067F00008000000A000C00000E8000-000000067F00008000000A000C00000EC000__000000C366619FD8 000000067F00008000000A000C00000E8000-000000067F00008000000A000C00000EC000__000000C42FE73810 000000067F00008000000A000C00000EB4E5-000000067F00008000000A000C00000F4C0B__000000C2C7B1ECC1-000000C367E48001 000000067F00008000000A000C00000EC000-000000067F00008000000A000C00000F0000__000000C366619FD8 000000067F00008000000A000C00000EC000-000000067F00008000000A000C00000F0000__000000C42FE73810 000000067F00008000000A000C00000F0000-000000067F00008000000A000C00000F4000__000000C366619FD8 000000067F00008000000A000C00000F0000-000000067F00008000000A000C00000F4000__000000C42FE73810 000000067F00008000000A000C00000F4000-000000067F00008000000A000C00000F8000__000000C366619FD8 000000067F00008000000A000C00000F4000-000000067F00008000000A000C00000F8000__000000C42FE73810 000000067F00008000000A000C00000F4C0B-000000067F00008000000A000C00000FE371__000000C2C7B1ECC1-000000C367E48001 000000067F00008000000A000C00000F8000-000000067F00008000000A000C00000FC000__000000C366619FD8 000000067F00008000000A000C00000F8000-000000067F00008000000A000C00000FC000__000000C42FE73810 000000067F00008000000A000C00000FC000-000000067F00008000000A000C0000100000__000000C366619FD8 000000067F00008000000A000C00000FC000-000000067F00008000000A000C0000100000__000000C42FE73810 000000067F00008000000A000C00000FE371-000000067F00008000000A000C0000107AD7__000000C2C7B1ECC1-000000C367E48001 000000067F00008000000A000C0000100000-000000067F00008000000A000C0000104000__000000C366619FD8 000000067F00008000000A000C0000100000-000000067F00008000000A000C0000104000__000000C42FE73810 000000067F00008000000A000C0000102D7F-000000067F00008000000A0014000001409C__000000C3E17E01A1-000000C430961E71 000000067F00008000000A000C0000104000-000000067F00008000000A000C0000108000__000000C366619FD8 000000067F00008000000A000C0000104000-000000067F00008000000A000C0000108000__000000C42FE73810 
000000067F00008000000A000C0000107AD7-000000067F00008000000A000C000011120B__000000C2C7B1ECC1-000000C367E48001 000000067F00008000000A000C0000108000-000000067F00008000000A000C000010C000__000000C366619FD8 000000067F00008000000A000C0000108000-000000067F00008000000A000C000010C000__000000C42FE73810 000000067F00008000000A000C000010C000-000000067F00008000000A000C0000110000__000000C366619FD8 000000067F00008000000A000C000010C000-000000067F00008000000A000C0000110000__000000C42FE73810 000000067F00008000000A000C0000110000-000000067F00008000000A00120100000000__000000C42FE73810 000000067F00008000000A000C0000110000-030000000000000000000000000000000002__000000C366619FD8 000000067F00008000000A000C000011120B-010000000000000001000000050000000007__000000C2C7B1ECC1-000000C367E48001 000000067F00008000000A00140000000000-000000067F00008000000A00140000004000__000000C42FE73810 000000067F00008000000A00140000001132-000000067F00008000000A00140000007E49__000000C367E48001-000000C3E17E01A1 000000067F00008000000A00140000004000-000000067F00008000000A00140000008000__000000C42FE73810 000000067F00008000000A00140000007E49-000000067F00008000000A0014000000EBBC__000000C367E48001-000000C3E17E01A1 000000067F00008000000A00140000008000-000000067F00008000000A0014000000C000__000000C42FE73810 000000067F00008000000A0014000000C000-000000067F00008000000A00140000010000__000000C42FE73810 000000067F00008000000A0014000000EBBC-000000067F00008000000A00140000015925__000000C367E48001-000000C3E17E01A1 000000067F00008000000A00140000010000-000000067F00008000000A00140000014000__000000C42FE73810 000000067F00008000000A00140000014000-000000067F00008000000A00140000018000__000000C42FE73810 000000067F00008000000A0014000001409F-000000067F00008000000A0016000000020E__000000C3E17E01A1-000000C430961E71 000000067F00008000000A00140000015925-000000067F00008000000A0014000001C612__000000C367E48001-000000C3E17E01A1 000000067F00008000000A00140000018000-000000067F00008000000A0014000001C000__000000C42FE73810 
000000067F00008000000A0014000001C000-000000067F00008000000A00140000020000__000000C42FE73810 000000067F00008000000A0014000001C612-000000067F00008000000A00140000023364__000000C367E48001-000000C3E17E01A1 000000067F00008000000A00140000020000-000000067F00008000000A00140000024000__000000C42FE73810 000000067F00008000000A00140000023364-000000067F00008000000A0014000002A070__000000C367E48001-000000C3E17E01A1 000000067F00008000000A00140000024000-000000067F00008000000A00140000028000__000000C42FE73810 000000067F00008000000A00140000028000-000000067F00008000000A0014000002C000__000000C42FE73810 000000067F00008000000A0014000002A070-030000000000000000000000000000000002__000000C367E48001-000000C3E17E01A1 000000067F00008000000A0014000002C000-030000000000000000000000000000000002__000000C42FE73810 000000067F00008000000A0016000000020E-030000000000000000000000000000000002__000000C3E17E01A1-000000C430961E71 000000067F00008000000A200C0000000000-000000067F00008000000A200C0000004000__000000C601294000 000000067F00008000000A200C0000004000-000000067F00008000000A200C0000008000__000000C601294000 000000067F00008000000A200C0000008000-000000067F00008000000A200C000000C000__000000C601294000 000000067F00008000000A200C0000009748-000000067F00008000000A200C0000012EAE__000000C430961E71-000000C4C05DDB29 000000067F00008000000A200C000000C000-000000067F00008000000A200C0000010000__000000C601294000 000000067F00008000000A200C0000010000-000000067F00008000000A200C0000014000__000000C601294000 000000067F00008000000A200C0000012EAE-000000067F00008000000A200C000001C60A__000000C430961E71-000000C4C05DDB29 000000067F00008000000A200C0000014000-000000067F00008000000A200C0000018000__000000C601294000 000000067F00008000000A200C0000018000-000000067F00008000000A200C000001C000__000000C601294000 000000067F00008000000A200C000001C000-000000067F00008000000A200C0000020000__000000C601294000 000000067F00008000000A200C000001C60A-000000067F00008000000A200C0000025D38__000000C430961E71-000000C4C05DDB29 
000000067F00008000000A200C0000020000-000000067F00008000000A200C0000024000__000000C601294000 000000067F00008000000A200C0000024000-000000067F00008000000A200C0000028000__000000C601294000 000000067F00008000000A200C0000025D38-000000067F00008000000A200C000002F49E__000000C430961E71-000000C4C05DDB29 000000067F00008000000A200C0000028000-000000067F00008000000A200C000002C000__000000C601294000 000000067F00008000000A200C000002C000-000000067F00008000000A200C0000030000__000000C601294000 000000067F00008000000A200C000002F49E-000000067F00008000000A200C0000038BB1__000000C430961E71-000000C4C05DDB29 000000067F00008000000A200C0000030000-000000067F00008000000A200C0000034000__000000C601294000 000000067F00008000000A200C0000034000-000000067F00008000000A200C0000038000__000000C601294000 000000067F00008000000A200C0000038000-000000067F00008000000A200C000003C000__000000C601294000 000000067F00008000000A200C0000038BB1-000000067F00008000000A200C0000042317__000000C430961E71-000000C4C05DDB29 000000067F00008000000A200C000003C000-000000067F00008000000A200C0000040000__000000C601294000 000000067F00008000000A200C0000040000-000000067F00008000000A200C0000044000__000000C601294000 000000067F00008000000A200C0000042317-000000067F00008000000A200C000004BA7D__000000C430961E71-000000C4C05DDB29 000000067F00008000000A200C0000044000-000000067F00008000000A200C0000048000__000000C601294000 000000067F00008000000A200C0000048000-000000067F00008000000A200C000004C000__000000C601294000 000000067F00008000000A200C000004BA7D-000000067F00008000000A200C00000551B2__000000C430961E71-000000C4C05DDB29 000000067F00008000000A200C000004C000-000000067F00008000000A200C0000050000__000000C601294000 000000067F00008000000A200C0000050000-000000067F00008000000A200C0000054000__000000C601294000 000000067F00008000000A200C0000054000-000000067F00008000000A200C0000058000__000000C5FED35FC8 000000067F00008000000A200C0000054000-000000067F00008000000A200C0000058000__000000C6C7BD8140 
000000067F00008000000A200C00000551B2-030000000000000000000000000000000002__000000C430961E71-000000C4C05DDB29 000000067F00008000000A200C0000055230-000000067F00008000000A200C000005E996__000000C4C05DDB29-000000C56021EB29 000000067F00008000000A200C0000058000-000000067F00008000000A200C000005C000__000000C5FED35FC8 000000067F00008000000A200C0000058000-000000067F00008000000A200C000005C000__000000C6C7BD8140 000000067F00008000000A200C000005C000-000000067F00008000000A200C0000060000__000000C5FED35FC8 000000067F00008000000A200C000005C000-000000067F00008000000A200C0000060000__000000C6C7BD8140 000000067F00008000000A200C000005E996-000000067F00008000000A200C00000680FC__000000C4C05DDB29-000000C56021EB29 000000067F00008000000A200C0000060000-000000067F00008000000A200C0000064000__000000C5FED35FC8 000000067F00008000000A200C0000060000-000000067F00008000000A200C0000064000__000000C6C7BD8140 000000067F00008000000A200C0000064000-000000067F00008000000A200C0000068000__000000C5FED35FC8 000000067F00008000000A200C0000064000-000000067F00008000000A200C0000068000__000000C6C7BD8140 000000067F00008000000A200C00000677DB-000000067F00008000000A200C00000CF739__000000C689AF4AC1-000000C6C87B6329 000000067F00008000000A200C0000068000-000000067F00008000000A200C000006C000__000000C5FED35FC8 000000067F00008000000A200C0000068000-000000067F00008000000A200C000006C000__000000C6C7BD8140 000000067F00008000000A200C00000680FC-000000067F00008000000A200C000007180C__000000C4C05DDB29-000000C56021EB29 000000067F00008000000A200C000006C000-000000067F00008000000A200C0000070000__000000C5FED35FC8 000000067F00008000000A200C000006C000-000000067F00008000000A200C0000070000__000000C6C7BD8140 000000067F00008000000A200C0000070000-000000067F00008000000A200C0000074000__000000C5FED35FC8 000000067F00008000000A200C0000070000-000000067F00008000000A200C0000074000__000000C6C7BD8140 000000067F00008000000A200C000007180C-000000067F00008000000A200C000007AF72__000000C4C05DDB29-000000C56021EB29 
000000067F00008000000A200C0000074000-000000067F00008000000A200C0000078000__000000C5FED35FC8 000000067F00008000000A200C0000074000-000000067F00008000000A200C0000078000__000000C6C7BD8140 000000067F00008000000A200C0000078000-000000067F00008000000A200C000007C000__000000C5FED35FC8 000000067F00008000000A200C0000078000-000000067F00008000000A200C000007C000__000000C6C7BD8140 000000067F00008000000A200C000007AF72-000000067F00008000000A200C00000846D8__000000C4C05DDB29-000000C56021EB29 000000067F00008000000A200C000007C000-000000067F00008000000A200C0000080000__000000C5FED35FC8 000000067F00008000000A200C000007C000-000000067F00008000000A200C0000080000__000000C6C7BD8140 000000067F00008000000A200C0000080000-000000067F00008000000A200C0000084000__000000C5FED35FC8 000000067F00008000000A200C0000080000-000000067F00008000000A200C0000084000__000000C6C7BD8140 000000067F00008000000A200C0000084000-000000067F00008000000A200C0000088000__000000C5FED35FC8 000000067F00008000000A200C0000084000-000000067F00008000000A200C0000088000__000000C6C7BD8140 000000067F00008000000A200C00000846D8-000000067F00008000000A200C000008DE0B__000000C4C05DDB29-000000C56021EB29 000000067F00008000000A200C0000088000-000000067F00008000000A200C000008C000__000000C5FED35FC8 000000067F00008000000A200C0000088000-000000067F00008000000A200C000008C000__000000C6C7BD8140 000000067F00008000000A200C000008C000-000000067F00008000000A200C0000090000__000000C5FED35FC8 000000067F00008000000A200C000008C000-000000067F00008000000A200C0000090000__000000C6C7BD8140 000000067F00008000000A200C000008DE0B-000000067F00008000000A200C000009752B__000000C4C05DDB29-000000C56021EB29 000000067F00008000000A200C0000090000-000000067F00008000000A200C0000094000__000000C5FED35FC8 000000067F00008000000A200C0000090000-000000067F00008000000A200C0000094000__000000C6C7BD8140 000000067F00008000000A200C00000933F0-000000067F00008000000A200C0000110901__000000C600A8FFF9-000000C689AF4AC1 000000067F00008000000A200C0000094000-000000067F00008000000A200C0000098000__000000C5FED35FC8 
000000067F00008000000A200C0000094000-000000067F00008000000A200C0000098000__000000C6C7BD8140 000000067F00008000000A200C000009752B-000000067F00008000000A200C00000A0C91__000000C4C05DDB29-000000C56021EB29 000000067F00008000000A200C0000098000-000000067F00008000000A200C000009C000__000000C5FED35FC8 000000067F00008000000A200C0000098000-000000067F00008000000A200C000009C000__000000C6C7BD8140 000000067F00008000000A200C000009C000-000000067F00008000000A200C00000A0000__000000C5FED35FC8 000000067F00008000000A200C000009C000-000000067F00008000000A200C00000A0000__000000C6C7BD8140 000000067F00008000000A200C00000A0000-000000067F00008000000A200C00000A4000__000000C5FED35FC8 000000067F00008000000A200C00000A0000-000000067F00008000000A200C00000A4000__000000C6C7BD8140 000000067F00008000000A200C00000A0C91-000000067F00008000000A200C00000AA3F7__000000C4C05DDB29-000000C56021EB29 000000067F00008000000A200C00000A4000-000000067F00008000000A200C00000A8000__000000C5FED35FC8 000000067F00008000000A200C00000A4000-000000067F00008000000A200C00000A8000__000000C6C7BD8140 000000067F00008000000A200C00000A8000-000000067F00008000000A200C00000AC000__000000C5FED35FC8 000000067F00008000000A200C00000A8000-000000067F00008000000A200C00000AC000__000000C6C7BD8140 000000067F00008000000A200C00000AA3F7-000000067F00008000000A200C00000B3B0C__000000C4C05DDB29-000000C56021EB29 000000067F00008000000A200C00000AC000-000000067F00008000000A200C00000B0000__000000C5FED35FC8 000000067F00008000000A200C00000AC000-000000067F00008000000A200C00000B0000__000000C6C7BD8140 000000067F00008000000A200C00000B0000-000000067F00008000000A200C00000B4000__000000C5FED35FC8 000000067F00008000000A200C00000B0000-000000067F00008000000A200C00000B4000__000000C6C7BD8140 000000067F00008000000A200C00000B3B0C-000000067F00008000000A200C0100000000__000000C4C05DDB29-000000C56021EB29 000000067F00008000000A200C00000B4000-000000067F00008000000A200C00000B8000__000000C5FED35FC8 000000067F00008000000A200C00000B4000-000000067F00008000000A200C00000B8000__000000C6C7BD8140 
000000067F00008000000A200C00000B8000-000000067F00008000000A200C00000BC000__000000C5FED35FC8 000000067F00008000000A200C00000B8000-000000067F00008000000A200C00000BC000__000000C6C7BD8140 000000067F00008000000A200C00000BBC1F-000000067F00008000000A200C00000C5353__000000C56021EB29-000000C600A8FFF9 000000067F00008000000A200C00000BC000-000000067F00008000000A200C00000C0000__000000C5FED35FC8 000000067F00008000000A200C00000BC000-000000067F00008000000A200C00000C0000__000000C6C7BD8140 000000067F00008000000A200C00000C0000-000000067F00008000000A200C00000C4000__000000C5FED35FC8 000000067F00008000000A200C00000C0000-000000067F00008000000A200C00000C4000__000000C6C7BD8140 000000067F00008000000A200C00000C4000-000000067F00008000000A200C00000C8000__000000C5FED35FC8 000000067F00008000000A200C00000C4000-000000067F00008000000A200C00000C8000__000000C6C7BD8140 000000067F00008000000A200C00000C5353-000000067F00008000000A200C00000CEAB9__000000C56021EB29-000000C600A8FFF9 000000067F00008000000A200C00000C8000-000000067F00008000000A200C00000CC000__000000C5FED35FC8 000000067F00008000000A200C00000C8000-000000067F00008000000A200C00000CC000__000000C6C7BD8140 000000067F00008000000A200C00000CC000-000000067F00008000000A200C00000D0000__000000C5FED35FC8 000000067F00008000000A200C00000CC000-000000067F00008000000A200C00000D0000__000000C6C7BD8140 000000067F00008000000A200C00000CEAB9-000000067F00008000000A200C00000D81D2__000000C56021EB29-000000C600A8FFF9 000000067F00008000000A200C00000CF742-000000067F00008000000A2014000000B47B__000000C689AF4AC1-000000C6C87B6329 000000067F00008000000A200C00000D0000-000000067F00008000000A200C00000D4000__000000C5FED35FC8 000000067F00008000000A200C00000D0000-000000067F00008000000A200C00000D4000__000000C6C7BD8140 000000067F00008000000A200C00000D4000-000000067F00008000000A200C00000D8000__000000C5FED35FC8 000000067F00008000000A200C00000D4000-000000067F00008000000A200C00000D8000__000000C6C7BD8140 000000067F00008000000A200C00000D8000-000000067F00008000000A200C00000DC000__000000C5FED35FC8 
000000067F00008000000A200C00000D8000-000000067F00008000000A200C00000DC000__000000C6C7BD8140 000000067F00008000000A200C00000D81D2-000000067F00008000000A200C00000E190B__000000C56021EB29-000000C600A8FFF9 000000067F00008000000A200C00000DC000-000000067F00008000000A200C00000E0000__000000C5FED35FC8 000000067F00008000000A200C00000DC000-000000067F00008000000A200C00000E0000__000000C6C7BD8140 000000067F00008000000A200C00000E0000-000000067F00008000000A200C00000E4000__000000C5FED35FC8 000000067F00008000000A200C00000E0000-000000067F00008000000A200C00000E4000__000000C6C7BD8140 000000067F00008000000A200C00000E190B-000000067F00008000000A200C00000EB071__000000C56021EB29-000000C600A8FFF9 000000067F00008000000A200C00000E4000-000000067F00008000000A200C00000E8000__000000C5FED35FC8 000000067F00008000000A200C00000E4000-000000067F00008000000A200C00000E8000__000000C6C7BD8140 000000067F00008000000A200C00000E8000-000000067F00008000000A200C00000EC000__000000C5FED35FC8 000000067F00008000000A200C00000E8000-000000067F00008000000A200C00000EC000__000000C6C7BD8140 000000067F00008000000A200C00000EB071-000000067F00008000000A200C00000F47AC__000000C56021EB29-000000C600A8FFF9 000000067F00008000000A200C00000EC000-000000067F00008000000A200C00000F0000__000000C5FED35FC8 000000067F00008000000A200C00000EC000-000000067F00008000000A200C00000F0000__000000C6C7BD8140 000000067F00008000000A200C00000F0000-000000067F00008000000A200C00000F4000__000000C5FED35FC8 000000067F00008000000A200C00000F0000-000000067F00008000000A200C00000F4000__000000C6C7BD8140 000000067F00008000000A200C00000F4000-000000067F00008000000A200C00000F8000__000000C5FED35FC8 000000067F00008000000A200C00000F4000-000000067F00008000000A200C00000F8000__000000C6C7BD8140 000000067F00008000000A200C00000F47AC-000000067F00008000000A200C00000FDF0A__000000C56021EB29-000000C600A8FFF9 000000067F00008000000A200C00000F8000-000000067F00008000000A200C00000FC000__000000C5FED35FC8 000000067F00008000000A200C00000F8000-000000067F00008000000A200C00000FC000__000000C6C7BD8140 
000000067F00008000000A200C00000FC000-000000067F00008000000A200C0000100000__000000C5FED35FC8 000000067F00008000000A200C00000FC000-000000067F00008000000A200C0000100000__000000C6C7BD8140 000000067F00008000000A200C00000FDF0A-000000067F00008000000A200C000010762B__000000C56021EB29-000000C600A8FFF9 000000067F00008000000A200C0000100000-000000067F00008000000A200C0000104000__000000C5FED35FC8 000000067F00008000000A200C0000100000-000000067F00008000000A200C0000104000__000000C6C7BD8140 000000067F00008000000A200C0000104000-000000067F00008000000A200C0000108000__000000C5FED35FC8 000000067F00008000000A200C0000104000-000000067F00008000000A200C0000108000__000000C6C7BD8140 000000067F00008000000A200C000010762B-000000067F00008000000A200C0000110D88__000000C56021EB29-000000C600A8FFF9 000000067F00008000000A200C0000108000-000000067F00008000000A200C000010C000__000000C5FED35FC8 000000067F00008000000A200C0000108000-000000067F00008000000A200C000010C000__000000C6C7BD8140 000000067F00008000000A200C000010C000-000000067F00008000000A200C0000110000__000000C5FED35FC8 000000067F00008000000A200C000010C000-000000067F00008000000A200C0000110000__000000C6C7BD8140 000000067F00008000000A200C0000110000-000000067F00008000000A20120100000000__000000C6C7BD8140 000000067F00008000000A200C0000110000-030000000000000000000000000000000002__000000C5FED35FC8 000000067F00008000000A200C0000110901-000000067F00008000000A201400000047CD__000000C600A8FFF9-000000C689AF4AC1 000000067F00008000000A200C0000110D88-01000000000000000100000005000000000A__000000C56021EB29-000000C600A8FFF9 000000067F00008000000A20140000000000-000000067F00008000000A20140000004000__000000C6C7BD8140 000000067F00008000000A20140000004000-000000067F00008000000A20140000008000__000000C6C7BD8140 000000067F00008000000A201400000047CD-000000067F00008000000A2014000000ADA8__000000C600A8FFF9-000000C689AF4AC1 000000067F00008000000A20140000008000-000000067F00008000000A2014000000C000__000000C6C7BD8140 
000000067F00008000000A2014000000ADA8-000000067F00008000000A201400000113B8__000000C600A8FFF9-000000C689AF4AC1 000000067F00008000000A2014000000B47C-010000000000000001000000050100000000__000000C689AF4AC1-000000C6C87B6329 000000067F00008000000A2014000000C000-000000067F00008000000A20140000010000__000000C6C7BD8140 000000067F00008000000A20140000010000-000000067F00008000000A20140000014000__000000C6C7BD8140 000000067F00008000000A201400000113B8-000000067F00008000000A20140000017969__000000C600A8FFF9-000000C689AF4AC1 000000067F00008000000A20140000014000-000000067F00008000000A20140000018000__000000C6C7BD8140 000000067F00008000000A20140000017969-000000067F00008000000A2014000001DF7E__000000C600A8FFF9-000000C689AF4AC1 000000067F00008000000A20140000018000-000000067F00008000000A2014000001C000__000000C6C7BD8140 000000067F00008000000A2014000001C000-000000067F00008000000A20140000020000__000000C6C7BD8140 000000067F00008000000A2014000001DF7E-000000067F00008000000A2014000002457D__000000C600A8FFF9-000000C689AF4AC1 000000067F00008000000A20140000020000-000000067F00008000000A20140000024000__000000C6C7BD8140 000000067F00008000000A20140000024000-000000067F00008000000A20140000028000__000000C6C7BD8140 000000067F00008000000A2014000002457D-000000067F00008000000A2014000002AB1D__000000C600A8FFF9-000000C689AF4AC1 000000067F00008000000A20140000028000-000000067F00008000000A2014000002C000__000000C6C7BD8140 000000067F00008000000A2014000002AB1D-030000000000000000000000000000000002__000000C600A8FFF9-000000C689AF4AC1 000000067F00008000000A2014000002C000-030000000000000000000000000000000002__000000C6C7BD8140 000000067F00008000000A400C0000000000-000000067F00008000000A400C0000004000__000000C896B8DFD8 000000067F00008000000A400C0000004000-000000067F00008000000A400C0000008000__000000C896B8DFD8 000000067F00008000000A400C0000008000-000000067F00008000000A400C000000C000__000000C896B8DFD8 000000067F00008000000A400C0000009743-000000067F00008000000A400C0000012EA9__000000C6C87B6329-000000C74849FAE1 
000000067F00008000000A400C000000C000-000000067F00008000000A400C0000010000__000000C896B8DFD8 000000067F00008000000A400C0000010000-000000067F00008000000A400C0000014000__000000C896B8DFD8 000000067F00008000000A400C0000012EA9-000000067F00008000000A400C000001C60A__000000C6C87B6329-000000C74849FAE1 000000067F00008000000A400C0000014000-000000067F00008000000A400C0000018000__000000C896B8DFD8 000000067F00008000000A400C0000018000-000000067F00008000000A400C000001C000__000000C896B8DFD8 000000067F00008000000A400C000001C000-000000067F00008000000A400C0000020000__000000C896B8DFD8 000000067F00008000000A400C000001C60A-000000067F00008000000A400C0000025D38__000000C6C87B6329-000000C74849FAE1 000000067F00008000000A400C0000020000-000000067F00008000000A400C0000024000__000000C896B8DFD8 000000067F00008000000A400C0000024000-000000067F00008000000A400C0000028000__000000C896B8DFD8 000000067F00008000000A400C0000025D38-000000067F00008000000A400C000002F49E__000000C6C87B6329-000000C74849FAE1 000000067F00008000000A400C0000028000-000000067F00008000000A400C000002C000__000000C896B8DFD8 000000067F00008000000A400C000002C000-000000067F00008000000A400C0000030000__000000C896B8DFD8 000000067F00008000000A400C000002F49E-000000067F00008000000A400C0000038BB1__000000C6C87B6329-000000C74849FAE1 000000067F00008000000A400C0000030000-000000067F00008000000A400C0000034000__000000C896B8DFD8 000000067F00008000000A400C0000034000-000000067F00008000000A400C0000038000__000000C896B8DFD8 000000067F00008000000A400C0000038000-000000067F00008000000A400C000003C000__000000C896B8DFD8 000000067F00008000000A400C0000038BB1-000000067F00008000000A400C0000042317__000000C6C87B6329-000000C74849FAE1 000000067F00008000000A400C000003C000-000000067F00008000000A400C0000040000__000000C896B8DFD8 000000067F00008000000A400C0000040000-000000067F00008000000A400C0000044000__000000C896B8DFD8 000000067F00008000000A400C0000042317-000000067F00008000000A400C000004BA7D__000000C6C87B6329-000000C74849FAE1 
000000067F00008000000A400C0000044000-000000067F00008000000A400C0000048000__000000C896B8DFD8 000000067F00008000000A400C0000048000-000000067F00008000000A400C000004C000__000000C896B8DFD8 000000067F00008000000A400C000004BA7D-030000000000000000000000000000000002__000000C6C87B6329-000000C74849FAE1 000000067F00008000000A400C000004C000-000000067F00008000000A400C0000050000__000000C896B8DFD8 000000067F00008000000A400C0000050000-000000067F00008000000A400C0000054000__000000C896B8DFD8 000000067F00008000000A400C0000054000-000000067F00008000000A400C0000058000__000000C896B8DFD8 000000067F00008000000A400C00000551FC-000000067F00008000000A400C000005E90B__000000C74849FAE1-000000C80801E859 000000067F00008000000A400C0000058000-000000067F00008000000A400C000005C000__000000C896B8DFD8 000000067F00008000000A400C000005C000-000000067F00008000000A400C0000060000__000000C896B8DFD8 000000067F00008000000A400C000005E90B-000000067F00008000000A400C000006802B__000000C74849FAE1-000000C80801E859 000000067F00008000000A400C0000060000-000000067F00008000000A400C0000064000__000000C896B8DFD8 000000067F00008000000A400C0000064000-000000067F00008000000A400C0000068000__000000C896B8DFD8 000000067F00008000000A400C0000068000-000000067F00008000000A400C000006C000__000000C896B8DFD8 000000067F00008000000A400C000006802B-000000067F00008000000A400C0000071782__000000C74849FAE1-000000C80801E859 000000067F00008000000A400C000006C000-000000067F00008000000A400C0000070000__000000C896B8DFD8 000000067F00008000000A400C0000070000-000000067F00008000000A400C0000074000__000000C896B8DFD8 000000067F00008000000A400C0000071782-000000067F00008000000A400C000007AEE8__000000C74849FAE1-000000C80801E859 000000067F00008000000A400C0000074000-000000067F00008000000A400C0000078000__000000C896B8DFD8 000000067F00008000000A400C0000078000-000000067F00008000000A400C000007C000__000000C896B8DFD8 000000067F00008000000A400C000007AEE8-000000067F00008000000A400C000008460B__000000C74849FAE1-000000C80801E859 
000000067F00008000000A400C000007C000-000000067F00008000000A400C0000080000__000000C896B8DFD8 000000067F00008000000A400C0000080000-000000067F00008000000A400C0000084000__000000C896B8DFD8 000000067F00008000000A400C0000084000-000000067F00008000000A400C0000088000__000000C896B8DFD8 000000067F00008000000A400C000008460B-000000067F00008000000A400C000008DD71__000000C74849FAE1-000000C80801E859 000000067F00008000000A400C0000088000-000000067F00008000000A400C000008C000__000000C896B8DFD8 000000067F00008000000A400C000008C000-000000067F00008000000A400C0000090000__000000C896B8DFD8 000000067F00008000000A400C000008DD71-000000067F00008000000A400C00000974D7__000000C74849FAE1-000000C80801E859 000000067F00008000000A400C0000090000-000000067F00008000000A400C0000094000__000000C896B8DFD8 000000067F00008000000A400C0000094000-000000067F00008000000A400C0000098000__000000C896B8DFD8 000000067F00008000000A400C00000974D7-000000067F00008000000A400C00000A0C0B__000000C74849FAE1-000000C80801E859 000000067F00008000000A400C0000098000-000000067F00008000000A400C000009C000__000000C896B8DFD8 000000067F00008000000A400C000009C000-000000067F00008000000A400C00000A0000__000000C896B8DFD8 000000067F00008000000A400C00000A0000-000000067F00008000000A400C00000A4000__000000C896B8DFD8 000000067F00008000000A400C00000A0C0B-000000067F00008000000A400C00000AA371__000000C74849FAE1-000000C80801E859 000000067F00008000000A400C00000A4000-000000067F00008000000A400C00000A8000__000000C896B8DFD8 000000067F00008000000A400C00000A8000-000000067F00008000000A400C00000AC000__000000C896B8DFD8 000000067F00008000000A400C00000AA371-000000067F00008000000A400C00000B3AD7__000000C74849FAE1-000000C80801E859 000000067F00008000000A400C00000AC000-000000067F00008000000A400C00000B0000__000000C896B8DFD8 000000067F00008000000A400C00000B0000-000000067F00008000000A400C00000B4000__000000C896B8DFD8 000000067F00008000000A400C00000B3AD7-000000067F00008000000A400C00000BD20B__000000C74849FAE1-000000C80801E859 
000000067F00008000000A400C00000B4000-000000067F00008000000A400C00000B8000__000000C896B8DFD8 000000067F00008000000A400C00000B8000-000000067F00008000000A400C00000BC000__000000C896B8DFD8 000000067F00008000000A400C00000BC000-000000067F00008000000A400C00000C0000__000000C896B8DFD8 000000067F00008000000A400C00000BD20B-000000067F00008000000A400C0100000000__000000C74849FAE1-000000C80801E859 000000067F00008000000A400C00000C0000-000000067F00008000000A400C00000C4000__000000C896B8DFD8 000000067F00008000000A400C00000C4000-000000067F00008000000A400C00000C8000__000000C896B8DFD8 000000067F00008000000A400C00000C4AE6-000000067F00008000000A400C00000CE20C__000000C80801E859-000000C8993EBFF9 000000067F00008000000A400C00000C8000-000000067F00008000000A400C00000CC000__000000C896B8DFD8 000000067F00008000000A400C00000CC000-000000067F00008000000A400C00000D0000__000000C896B8DFD8 000000067F00008000000A400C00000CE20C-000000067F00008000000A400C00000D7929__000000C80801E859-000000C8993EBFF9 000000067F00008000000A400C00000D0000-000000067F00008000000A400C00000D4000__000000C896B8DFD8 000000067F00008000000A400C00000D4000-000000067F00008000000A400C00000D8000__000000C896B8DFD8 000000067F00008000000A400C00000D7929-000000067F00008000000A400C00000E108F__000000C80801E859-000000C8993EBFF9 000000067F00008000000A400C00000D8000-000000067F00008000000A400C00000DC000__000000C896B8DFD8 000000067F00008000000A400C00000DC000-000000067F00008000000A400C00000E0000__000000C896B8DFD8 000000067F00008000000A400C00000E0000-000000067F00008000000A400C00000E4000__000000C896B8DFD8 000000067F00008000000A400C00000E108F-000000067F00008000000A400C00000EA7F5__000000C80801E859-000000C8993EBFF9 000000067F00008000000A400C00000E4000-000000067F00008000000A400C00000E8000__000000C896B8DFD8 000000067F00008000000A400C00000E8000-000000067F00008000000A400C00000EC000__000000C896B8DFD8 000000067F00008000000A400C00000EA7F5-000000067F00008000000A400C00000F3F0B__000000C80801E859-000000C8993EBFF9 
000000067F00008000000A400C00000EC000-000000067F00008000000A400C00000F0000__000000C896B8DFD8 000000067F00008000000A400C00000F0000-000000067F00008000000A400C00000F4000__000000C896B8DFD8 000000067F00008000000A400C00000F3F0B-000000067F00008000000A400C00000FD671__000000C80801E859-000000C8993EBFF9 000000067F00008000000A400C00000F4000-000000067F00008000000A400C00000F8000__000000C896B8DFD8 000000067F00008000000A400C00000F8000-000000067F00008000000A400C00000FC000__000000C896B8DFD8 000000067F00008000000A400C00000FC000-000000067F00008000000A400C0000100000__000000C896B8DFD8 000000067F00008000000A400C00000FD671-000000067F00008000000A400C0000106D95__000000C80801E859-000000C8993EBFF9 000000067F00008000000A400C0000100000-000000067F00008000000A400C0000104000__000000C896B8DFD8 000000067F00008000000A400C0000104000-000000067F00008000000A400C0000108000__000000C896B8DFD8 000000067F00008000000A400C0000106D95-000000067F00008000000A400C00001104FB__000000C80801E859-000000C8993EBFF9 000000067F00008000000A400C0000107F8F-000000067F00008000000A40140000005626__000000C8993EBFF9-000000C90726D0D9 000000067F00008000000A400C0000108000-000000067F00008000000A400C000010C000__000000C896B8DFD8 000000067F00008000000A400C000010C000-000000067F00008000000A400C0000110000__000000C896B8DFD8 000000067F00008000000A400C0000110000-030000000000000000000000000000000002__000000C896B8DFD8 000000067F00008000000A400C00001104FB-01000000000000000100000005000000000D__000000C80801E859-000000C8993EBFF9 000000067F00008000000A40140000005626-000000067F00008000000A4014000000C7F9__000000C8993EBFF9-000000C90726D0D9 000000067F00008000000A4014000000C7F9-000000067F00008000000A401400000139F8__000000C8993EBFF9-000000C90726D0D9 000000067F00008000000A401400000139F8-000000067F00008000000A4014000001ABE9__000000C8993EBFF9-000000C90726D0D9 000000067F00008000000A4014000001ABE9-000000067F00008000000A40140000021DF4__000000C8993EBFF9-000000C90726D0D9 
000000067F00008000000A40140000021DF4-000000067F00008000000A40140000028FA9__000000C8993EBFF9-000000C90726D0D9 000000067F00008000000A40140000028FA9-030000000000000000000000000000000002__000000C8993EBFF9-000000C90726D0D9 000000067F00008000000A600C0000000000-000000067F00008000000A600C0000004000__000000CA2C877DC8 000000067F00008000000A600C0000000000-000000067F00008000000A600C0000004000__000000CB82C2FF68 000000067F00008000000A600C0000004000-000000067F00008000000A600C0000008000__000000CA2C877DC8 000000067F00008000000A600C0000004000-000000067F00008000000A600C0000008000__000000CB82C2FF68 000000067F00008000000A600C0000008000-000000067F00008000000A600C000000C000__000000CA2C877DC8 000000067F00008000000A600C0000008000-000000067F00008000000A600C000000C000__000000CB82C2FF68 000000067F00008000000A600C0000009746-000000067F00008000000A600C0000012EAC__000000C90726D0D9-000000C986F5F0D9 000000067F00008000000A600C000000C000-000000067F00008000000A600C0000010000__000000CA2C877DC8 000000067F00008000000A600C000000C000-000000067F00008000000A600C0000010000__000000CB82C2FF68 000000067F00008000000A600C0000010000-000000067F00008000000A600C0000014000__000000CA2C877DC8 000000067F00008000000A600C0000010000-000000067F00008000000A600C0000014000__000000CB82C2FF68 000000067F00008000000A600C0000012EAC-000000067F00008000000A600C000001C60A__000000C90726D0D9-000000C986F5F0D9 000000067F00008000000A600C0000014000-000000067F00008000000A600C0000018000__000000CA2C877DC8 000000067F00008000000A600C0000014000-000000067F00008000000A600C0000018000__000000CB82C2FF68 000000067F00008000000A600C0000018000-000000067F00008000000A600C000001C000__000000CA2C877DC8 000000067F00008000000A600C0000018000-000000067F00008000000A600C000001C000__000000CB82C2FF68 000000067F00008000000A600C000001C000-000000067F00008000000A600C0000020000__000000CA2C877DC8 000000067F00008000000A600C000001C000-000000067F00008000000A600C0000020000__000000CB82C2FF68 
000000067F00008000000A600C000001C60A-000000067F00008000000A600C0000025D38__000000C90726D0D9-000000C986F5F0D9 000000067F00008000000A600C0000020000-000000067F00008000000A600C0000024000__000000CA2C877DC8 000000067F00008000000A600C0000020000-000000067F00008000000A600C0000024000__000000CB82C2FF68 000000067F00008000000A600C0000024000-000000067F00008000000A600C0000028000__000000CA2C877DC8 000000067F00008000000A600C0000024000-000000067F00008000000A600C0000028000__000000CB82C2FF68 000000067F00008000000A600C0000025D38-000000067F00008000000A600C000002F49E__000000C90726D0D9-000000C986F5F0D9 000000067F00008000000A600C0000028000-000000067F00008000000A600C000002C000__000000CA2C877DC8 000000067F00008000000A600C0000028000-000000067F00008000000A600C000002C000__000000CB82C2FF68 000000067F00008000000A600C000002C000-000000067F00008000000A600C0000030000__000000CA2C877DC8 000000067F00008000000A600C000002C000-000000067F00008000000A600C0000030000__000000CB82C2FF68 000000067F00008000000A600C000002F49E-000000067F00008000000A600C0000038BB1__000000C90726D0D9-000000C986F5F0D9 000000067F00008000000A600C0000030000-000000067F00008000000A600C0000034000__000000CA2C877DC8 000000067F00008000000A600C0000030000-000000067F00008000000A600C0000034000__000000CB82C2FF68 000000067F00008000000A600C0000034000-000000067F00008000000A600C0000038000__000000CA2C877DC8 000000067F00008000000A600C0000034000-000000067F00008000000A600C0000038000__000000CB82C2FF68 000000067F00008000000A600C0000038000-000000067F00008000000A600C000003C000__000000CA2C877DC8 000000067F00008000000A600C0000038000-000000067F00008000000A600C000003C000__000000CB82C2FF68 000000067F00008000000A600C0000038BB1-000000067F00008000000A600C0000042317__000000C90726D0D9-000000C986F5F0D9 000000067F00008000000A600C000003C000-000000067F00008000000A600C0000040000__000000CA2C877DC8 000000067F00008000000A600C000003C000-000000067F00008000000A600C0000040000__000000CB82C2FF68 000000067F00008000000A600C0000040000-000000067F00008000000A600C0000044000__000000CA2C877DC8 
000000067F00008000000A600C0000040000-000000067F00008000000A600C0000044000__000000CB82C2FF68 000000067F00008000000A600C0000042317-000000067F00008000000A600C000004BA7D__000000C90726D0D9-000000C986F5F0D9 000000067F00008000000A600C0000044000-000000067F00008000000A600C0000048000__000000CA2C877DC8 000000067F00008000000A600C0000044000-000000067F00008000000A600C0000048000__000000CB82C2FF68 000000067F00008000000A600C0000048000-000000067F00008000000A600C000004C000__000000CA2C877DC8 000000067F00008000000A600C0000048000-000000067F00008000000A600C000004C000__000000CB82C2FF68 000000067F00008000000A600C000004BA7D-030000000000000000000000000000000002__000000C90726D0D9-000000C986F5F0D9 000000067F00008000000A600C000004C000-000000067F00008000000A600C0000050000__000000CA2C877DC8 000000067F00008000000A600C000004C000-000000067F00008000000A600C0000050000__000000CB82C2FF68 000000067F00008000000A600C0000050000-000000067F00008000000A600C0000054000__000000CA2C877DC8 000000067F00008000000A600C0000050000-000000067F00008000000A600C0000054000__000000CB82C2FF68 000000067F00008000000A600C0000054000-000000067F00008000000A600C0000058000__000000CA2C877DC8 000000067F00008000000A600C0000054000-000000067F00008000000A600C0000058000__000000CB82C2FF68 000000067F00008000000A600C0000054BFB-000000067F00008000000A600C000005E30C__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C0000058000-000000067F00008000000A600C000005C000__000000CA2C877DC8 000000067F00008000000A600C0000058000-000000067F00008000000A600C000005C000__000000CB82C2FF68 000000067F00008000000A600C000005C000-000000067F00008000000A600C0000060000__000000CA2C877DC8 000000067F00008000000A600C000005C000-000000067F00008000000A600C0000060000__000000CB82C2FF68 000000067F00008000000A600C000005E30C-000000067F00008000000A600C0000067A2B__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C0000060000-000000067F00008000000A600C0000064000__000000CA2C877DC8 000000067F00008000000A600C0000060000-000000067F00008000000A600C0000064000__000000CB82C2FF68 
000000067F00008000000A600C0000064000-000000067F00008000000A600C0000068000__000000CA2C877DC8 000000067F00008000000A600C0000064000-000000067F00008000000A600C0000068000__000000CB82C2FF68 000000067F00008000000A600C0000067A2B-000000067F00008000000A600C0000071186__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C0000068000-000000067F00008000000A600C000006C000__000000CA2C877DC8 000000067F00008000000A600C0000068000-000000067F00008000000A600C000006C000__000000CB82C2FF68 000000067F00008000000A600C000006C000-000000067F00008000000A600C0000070000__000000CA2C877DC8 000000067F00008000000A600C000006C000-000000067F00008000000A600C0000070000__000000CB82C2FF68 000000067F00008000000A600C0000070000-000000067F00008000000A600C0000074000__000000CA2C877DC8 000000067F00008000000A600C0000070000-000000067F00008000000A600C0000074000__000000CB82C2FF68 000000067F00008000000A600C0000071186-000000067F00008000000A600C000007A8EC__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C0000074000-000000067F00008000000A600C0000078000__000000CA2C877DC8 000000067F00008000000A600C0000074000-000000067F00008000000A600C0000078000__000000CB82C2FF68 000000067F00008000000A600C0000078000-000000067F00008000000A600C000007C000__000000CA2C877DC8 000000067F00008000000A600C0000078000-000000067F00008000000A600C000007C000__000000CB82C2FF68 000000067F00008000000A600C000007A149-000000067F00008000000A600C00000F5F42__000000CB40C16489-000000CB82C37859 000000067F00008000000A600C000007A8EC-000000067F00008000000A600C000008400A__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C000007C000-000000067F00008000000A600C0000080000__000000CA2C877DC8 000000067F00008000000A600C000007C000-000000067F00008000000A600C0000080000__000000CB82C2FF68 000000067F00008000000A600C0000080000-000000067F00008000000A600C0000084000__000000CA2C877DC8 000000067F00008000000A600C0000080000-000000067F00008000000A600C0000084000__000000CB82C2FF68 000000067F00008000000A600C0000084000-000000067F00008000000A600C0000088000__000000CA2C877DC8 
000000067F00008000000A600C0000084000-000000067F00008000000A600C0000088000__000000CB82C2FF68 000000067F00008000000A600C000008400A-000000067F00008000000A600C000008D770__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C0000088000-000000067F00008000000A600C000008C000__000000CA2C877DC8 000000067F00008000000A600C0000088000-000000067F00008000000A600C000008C000__000000CB82C2FF68 000000067F00008000000A600C000008C000-000000067F00008000000A600C0000090000__000000CA2C877DC8 000000067F00008000000A600C000008C000-000000067F00008000000A600C0000090000__000000CB82C2FF68 000000067F00008000000A600C000008D770-000000067F00008000000A600C0000096ED6__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C0000090000-000000067F00008000000A600C0000094000__000000CA2C877DC8 000000067F00008000000A600C0000090000-000000067F00008000000A600C0000094000__000000CB82C2FF68 000000067F00008000000A600C0000094000-000000067F00008000000A600C0000098000__000000CA2C877DC8 000000067F00008000000A600C0000094000-000000067F00008000000A600C0000098000__000000CB82C2FF68 000000067F00008000000A600C0000096ED6-000000067F00008000000A600C00000A060B__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C0000098000-000000067F00008000000A600C000009C000__000000CA2C877DC8 000000067F00008000000A600C0000098000-000000067F00008000000A600C000009C000__000000CB82C2FF68 000000067F00008000000A600C000009C000-000000067F00008000000A600C00000A0000__000000CA2C877DC8 000000067F00008000000A600C000009C000-000000067F00008000000A600C00000A0000__000000CB82C2FF68 000000067F00008000000A600C00000A0000-000000067F00008000000A600C00000A4000__000000CA2C877DC8 000000067F00008000000A600C00000A0000-000000067F00008000000A600C00000A4000__000000CB82C2FF68 000000067F00008000000A600C00000A060B-000000067F00008000000A600C00000A9D71__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C00000A4000-000000067F00008000000A600C00000A8000__000000CA2C877DC8 000000067F00008000000A600C00000A4000-000000067F00008000000A600C00000A8000__000000CB82C2FF68 
000000067F00008000000A600C00000A8000-000000067F00008000000A600C00000AC000__000000CA2C877DC8 000000067F00008000000A600C00000A8000-000000067F00008000000A600C00000AC000__000000CB82C2FF68 000000067F00008000000A600C00000A9D71-000000067F00008000000A600C00000B34D7__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C00000AC000-000000067F00008000000A600C00000B0000__000000CB82C2FF68 000000067F00008000000A600C00000AC000-030000000000000000000000000000000002__000000CA2C877DC8 000000067F00008000000A600C00000B0000-000000067F00008000000A600C00000B4000__000000CB82C2FF68 000000067F00008000000A600C00000B34D7-000000067F00008000000A600C00000BCC0C__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C00000B4000-000000067F00008000000A600C00000B8000__000000CB82C2FF68 000000067F00008000000A600C00000B8000-000000067F00008000000A600C00000BC000__000000CB82C2FF68 000000067F00008000000A600C00000BC000-000000067F00008000000A600C00000C0000__000000CB82C2FF68 000000067F00008000000A600C00000BCC0C-000000067F00008000000A600C00000C6336__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C00000C0000-000000067F00008000000A600C00000C4000__000000CB82C2FF68 000000067F00008000000A600C00000C4000-000000067F00008000000A600C00000C8000__000000CB82C2FF68 000000067F00008000000A600C00000C6336-000000067F00008000000A600C00000CFA9C__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C00000C8000-000000067F00008000000A600C00000CC000__000000CB82C2FF68 000000067F00008000000A600C00000CC000-000000067F00008000000A600C00000D0000__000000CB82C2FF68 000000067F00008000000A600C00000CFA9C-000000067F00008000000A600C00000D91AB__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C00000D0000-000000067F00008000000A600C00000D4000__000000CB82C2FF68 000000067F00008000000A600C00000D4000-000000067F00008000000A600C00000D8000__000000CB82C2FF68 000000067F00008000000A600C00000D8000-000000067F00008000000A600C00000DC000__000000CB82C2FF68 
000000067F00008000000A600C00000D91AB-000000067F00008000000A600C00000E2911__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C00000DC000-000000067F00008000000A600C00000E0000__000000CB82C2FF68 000000067F00008000000A600C00000E0000-000000067F00008000000A600C00000E4000__000000CB82C2FF68 000000067F00008000000A600C00000E2911-000000067F00008000000A600C00000EC077__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C00000E4000-000000067F00008000000A600C00000E8000__000000CB82C2FF68 000000067F00008000000A600C00000E8000-000000067F00008000000A600C00000EC000__000000CB82C2FF68 000000067F00008000000A600C00000EC000-000000067F00008000000A600C00000F0000__000000CB82C2FF68 000000067F00008000000A600C00000EC077-000000067F00008000000A600C00000F57A8__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C00000F0000-000000067F00008000000A600C00000F4000__000000CB82C2FF68 000000067F00008000000A600C00000F4000-000000067F00008000000A600C00000F8000__000000CB82C2FF68 000000067F00008000000A600C00000F57A8-000000067F00008000000A600C00000FEF0A__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C00000F5F4F-000000067F00008000000A60140000011158__000000CB40C16489-000000CB82C37859 000000067F00008000000A600C00000F8000-000000067F00008000000A600C00000FC000__000000CB82C2FF68 000000067F00008000000A600C00000FC000-000000067F00008000000A600C0000100000__000000CB82C2FF68 000000067F00008000000A600C00000FEF0A-000000067F00008000000A600C000010862B__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C0000100000-000000067F00008000000A600C0000104000__000000CB82C2FF68 000000067F00008000000A600C0000104000-000000067F00008000000A600C0000108000__000000CB82C2FF68 000000067F00008000000A600C0000108000-000000067F00008000000A600C000010C000__000000CB82C2FF68 000000067F00008000000A600C000010862B-000000067F00008000000A600C0000111C20__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A600C000010C000-000000067F00008000000A600C0000110000__000000CB82C2FF68 
000000067F00008000000A600C0000110000-000000067F00008000000A60120100000000__000000CB82C2FF68 000000067F00008000000A600C00001117CB-000000067F00008000000A6014000000499B__000000CAD5D7FFF1-000000CB40C16489 000000067F00008000000A600C00FFFFFFFF-01000000000000000100000005000000000E__000000C986F5F0D9-000000CAD5D7FFF1 000000067F00008000000A60140000000000-000000067F00008000000A60140000004000__000000CB82C2FF68 000000067F00008000000A60140000004000-000000067F00008000000A60140000008000__000000CB82C2FF68 000000067F00008000000A6014000000499B-000000067F00008000000A6014000000BD4E__000000CAD5D7FFF1-000000CB40C16489 000000067F00008000000A60140000008000-000000067F00008000000A6014000000C000__000000CB82C2FF68 000000067F00008000000A6014000000BD4E-000000067F00008000000A601400000130ED__000000CAD5D7FFF1-000000CB40C16489 000000067F00008000000A6014000000C000-000000067F00008000000A60140000010000__000000CB82C2FF68 000000067F00008000000A60140000010000-000000067F00008000000A60140000014000__000000CB82C2FF68 000000067F00008000000A60140000011159-000000067F00008000000A60140000029BB2__000000CB40C16489-000000CB82C37859 000000067F00008000000A601400000130ED-000000067F00008000000A6014000001A4BD__000000CAD5D7FFF1-000000CB40C16489 000000067F00008000000A60140000014000-000000067F00008000000A60140000018000__000000CB82C2FF68 000000067F00008000000A60140000018000-000000067F00008000000A6014000001C000__000000CB82C2FF68 000000067F00008000000A6014000001A4BD-000000067F00008000000A60140000021886__000000CAD5D7FFF1-000000CB40C16489 000000067F00008000000A6014000001C000-000000067F00008000000A60140000020000__000000CB82C2FF68 000000067F00008000000A60140000020000-000000067F00008000000A60140000024000__000000CB82C2FF68 000000067F00008000000A60140000021886-000000067F00008000000A60140000028C0A__000000CAD5D7FFF1-000000CB40C16489 000000067F00008000000A60140000024000-000000067F00008000000A60140000028000__000000CB82C2FF68 000000067F00008000000A60140000028000-000000067F00008000000A6014000002C000__000000CB82C2FF68 
000000067F00008000000A60140000028C0A-030000000000000000000000000000000002__000000CAD5D7FFF1-000000CB40C16489 000000067F00008000000A60140000029BB2-030000000000000000000000000000000002__000000CB40C16489-000000CB82C37859 000000067F00008000000A6014000002C000-030000000000000000000000000000000002__000000CB82C2FF68 000000067F00008000000A800C0000000000-000000067F00008000000A800C0000004000__000000CD51009FE8 000000067F00008000000A800C0000004000-000000067F00008000000A800C0000008000__000000CD51009FE8 000000067F00008000000A800C0000008000-000000067F00008000000A800C000000C000__000000CD51009FE8 000000067F00008000000A800C0000009748-000000067F00008000000A800C0000012EAE__000000CB82C37859-000000CC11F5EDC9 000000067F00008000000A800C000000C000-000000067F00008000000A800C0000010000__000000CD51009FE8 000000067F00008000000A800C0000010000-000000067F00008000000A800C0000014000__000000CD51009FE8 000000067F00008000000A800C0000012EAE-000000067F00008000000A800C000001C60A__000000CB82C37859-000000CC11F5EDC9 000000067F00008000000A800C0000014000-000000067F00008000000A800C0000018000__000000CD51009FE8 000000067F00008000000A800C0000018000-000000067F00008000000A800C000001C000__000000CD51009FE8 000000067F00008000000A800C000001C000-000000067F00008000000A800C0000020000__000000CD51009FE8 000000067F00008000000A800C000001C60A-000000067F00008000000A800C0000025D38__000000CB82C37859-000000CC11F5EDC9 000000067F00008000000A800C0000020000-000000067F00008000000A800C0000024000__000000CD51009FE8 000000067F00008000000A800C0000024000-000000067F00008000000A800C0000028000__000000CD51009FE8 000000067F00008000000A800C0000025D38-000000067F00008000000A800C000002F49E__000000CB82C37859-000000CC11F5EDC9 000000067F00008000000A800C0000028000-000000067F00008000000A800C000002C000__000000CD51009FE8 000000067F00008000000A800C000002C000-000000067F00008000000A800C0000030000__000000CD51009FE8 000000067F00008000000A800C000002F49E-000000067F00008000000A800C0000038BB1__000000CB82C37859-000000CC11F5EDC9 
000000067F00008000000A800C0000030000-000000067F00008000000A800C0000034000__000000CD51009FE8 000000067F00008000000A800C0000034000-000000067F00008000000A800C0000038000__000000CD51009FE8 000000067F00008000000A800C0000038000-000000067F00008000000A800C000003C000__000000CD51009FE8 000000067F00008000000A800C0000038BB1-000000067F00008000000A800C0000042317__000000CB82C37859-000000CC11F5EDC9 000000067F00008000000A800C000003C000-000000067F00008000000A800C0000040000__000000CD51009FE8 000000067F00008000000A800C0000040000-000000067F00008000000A800C0000044000__000000CD51009FE8 000000067F00008000000A800C0000042317-000000067F00008000000A800C000004BA7D__000000CB82C37859-000000CC11F5EDC9 000000067F00008000000A800C0000044000-000000067F00008000000A800C0000048000__000000CD51009FE8 000000067F00008000000A800C0000048000-000000067F00008000000A800C000004C000__000000CD51009FE8 000000067F00008000000A800C000004BA7D-000000067F00008000000A800C0000054CA0__000000CB82C37859-000000CC11F5EDC9 000000067F00008000000A800C000004C000-000000067F00008000000A800C0000050000__000000CD51009FE8 000000067F00008000000A800C0000050000-000000067F00008000000A800C0000054000__000000CD51009FE8 000000067F00008000000A800C0000054000-000000067F00008000000A800C0000058000__000000CD51009FE8 000000067F00008000000A800C0000054C9F-000000067F00008000000A800C000005E405__000000CC11F5EDC9-000000CCB1B9E181 000000067F00008000000A800C0000058000-000000067F00008000000A800C000005C000__000000CD51009FE8 000000067F00008000000A800C000005C000-000000067F00008000000A800C0000060000__000000CD51009FE8 000000067F00008000000A800C000005E405-000000067F00008000000A800C0000067B10__000000CC11F5EDC9-000000CCB1B9E181 000000067F00008000000A800C0000060000-000000067F00008000000A800C0000064000__000000CD51009FE8 000000067F00008000000A800C0000064000-000000067F00008000000A800C0000068000__000000CD51009FE8 000000067F00008000000A800C0000067B10-000000067F00008000000A800C0000071276__000000CC11F5EDC9-000000CCB1B9E181 
000000067F00008000000A800C0000068000-000000067F00008000000A800C000006C000__000000CD51009FE8 000000067F00008000000A800C000006C000-000000067F00008000000A800C0000070000__000000CD51009FE8 000000067F00008000000A800C0000070000-000000067F00008000000A800C0000074000__000000CD51009FE8 000000067F00008000000A800C0000071276-000000067F00008000000A800C000007A9DC__000000CC11F5EDC9-000000CCB1B9E181 000000067F00008000000A800C0000074000-000000067F00008000000A800C0000078000__000000CD51009FE8 000000067F00008000000A800C0000078000-000000067F00008000000A800C000007C000__000000CD51009FE8 000000067F00008000000A800C000007A9DC-000000067F00008000000A800C000008410B__000000CC11F5EDC9-000000CCB1B9E181 000000067F00008000000A800C000007C000-000000067F00008000000A800C0000080000__000000CD51009FE8 000000067F00008000000A800C0000080000-000000067F00008000000A800C0000084000__000000CD51009FE8 000000067F00008000000A800C0000084000-000000067F00008000000A800C0000088000__000000CD51009FE8 000000067F00008000000A800C000008410B-000000067F00008000000A800C000008D871__000000CC11F5EDC9-000000CCB1B9E181 000000067F00008000000A800C0000088000-000000067F00008000000A800C000008C000__000000CD51009FE8 000000067F00008000000A800C000008C000-000000067F00008000000A800C0000090000__000000CD51009FE8 000000067F00008000000A800C000008D871-000000067F00008000000A800C0000096F94__000000CC11F5EDC9-000000CCB1B9E181 000000067F00008000000A800C0000090000-000000067F00008000000A800C0000094000__000000CD51009FE8 000000067F00008000000A800C0000094000-000000067F00008000000A800C0000098000__000000CD51009FE8 000000067F00008000000A800C0000096F94-000000067F00008000000A800C00000A06FA__000000CC11F5EDC9-000000CCB1B9E181 000000067F00008000000A800C0000098000-000000067F00008000000A800C000009C000__000000CD51009FE8 000000067F00008000000A800C000009C000-000000067F00008000000A800C00000A0000__000000CD51009FE8 000000067F00008000000A800C00000A0000-000000067F00008000000A800C00000A4000__000000CD51009FE8 
000000067F00008000000A800C00000A06FA-000000067F00008000000A800C00000A9E0D__000000CC11F5EDC9-000000CCB1B9E181 000000067F00008000000A800C00000A4000-000000067F00008000000A800C00000A8000__000000CD51009FE8 000000067F00008000000A800C00000A8000-000000067F00008000000A800C00000AC000__000000CD51009FE8 000000067F00008000000A800C00000A9E0D-000000067F00008000000A800C00000B3553__000000CC11F5EDC9-000000CCB1B9E181 000000067F00008000000A800C00000AC000-000000067F00008000000A800C00000B0000__000000CD51009FE8 000000067F00008000000A800C00000B0000-000000067F00008000000A800C00000B4000__000000CD51009FE8 000000067F00008000000A800C00000B3553-000000067F00008000000A800C0100000000__000000CC11F5EDC9-000000CCB1B9E181 000000067F00008000000A800C00000B4000-000000067F00008000000A800C00000B8000__000000CD51009FE8 000000067F00008000000A800C00000B8000-000000067F00008000000A800C00000BC000__000000CD51009FE8 000000067F00008000000A800C00000BC000-000000067F00008000000A800C00000C0000__000000CD51009FE8 000000067F00008000000A800C00000BCB46-000000067F00008000000A800C00000C62AC__000000CCB1B9E181-000000CD51344F89 000000067F00008000000A800C00000C0000-000000067F00008000000A800C00000C4000__000000CD51009FE8 000000067F00008000000A800C00000C4000-000000067F00008000000A800C00000C8000__000000CD51009FE8 000000067F00008000000A800C00000C62AC-000000067F00008000000A800C00000CFA09__000000CCB1B9E181-000000CD51344F89 000000067F00008000000A800C00000C8000-000000067F00008000000A800C00000CC000__000000CD51009FE8 000000067F00008000000A800C00000CC000-000000067F00008000000A800C00000D0000__000000CD51009FE8 000000067F00008000000A800C00000CFA09-000000067F00008000000A800C00000D9118__000000CCB1B9E181-000000CD51344F89 000000067F00008000000A800C00000D0000-000000067F00008000000A800C00000D4000__000000CD51009FE8 000000067F00008000000A800C00000D4000-000000067F00008000000A800C00000D8000__000000CD51009FE8 000000067F00008000000A800C00000D8000-000000067F00008000000A800C00000DC000__000000CD51009FE8 
000000067F00008000000A800C00000D9118-000000067F00008000000A800C00000E287E__000000CCB1B9E181-000000CD51344F89 000000067F00008000000A800C00000DC000-000000067F00008000000A800C00000E0000__000000CD51009FE8 000000067F00008000000A800C00000E0000-000000067F00008000000A800C00000E4000__000000CD51009FE8 000000067F00008000000A800C00000E287E-000000067F00008000000A800C00000EBFE4__000000CCB1B9E181-000000CD51344F89 000000067F00008000000A800C00000E4000-000000067F00008000000A800C00000E8000__000000CD51009FE8 000000067F00008000000A800C00000E8000-000000067F00008000000A800C00000EC000__000000CD51009FE8 000000067F00008000000A800C00000EBFE4-000000067F00008000000A800C00000F570B__000000CCB1B9E181-000000CD51344F89 000000067F00008000000A800C00000EC000-000000067F00008000000A800C00000F0000__000000CD51009FE8 000000067F00008000000A800C00000F0000-000000067F00008000000A800C00000F4000__000000CD51009FE8 000000067F00008000000A800C00000F4000-000000067F00008000000A800C00000F8000__000000CD51009FE8 000000067F00008000000A800C00000F570B-000000067F00008000000A800C00000FEE71__000000CCB1B9E181-000000CD51344F89 000000067F00008000000A800C00000F8000-000000067F00008000000A800C00000FC000__000000CD51009FE8 000000067F00008000000A800C00000FC000-000000067F00008000000A800C0000100000__000000CD51009FE8 000000067F00008000000A800C00000FEE71-000000067F00008000000A800C0000108587__000000CCB1B9E181-000000CD51344F89 000000067F00008000000A800C0000100000-000000067F00008000000A800C0000104000__000000CD51009FE8 000000067F00008000000A800C0000104000-000000067F00008000000A800C0000108000__000000CD51009FE8 000000067F00008000000A800C0000108000-000000067F00008000000A800C000010C000__000000CD51009FE8 000000067F00008000000A800C0000108587-000000067F00008000000A800C0000111C20__000000CCB1B9E181-000000CD51344F89 000000067F00008000000A800C000010C000-000000067F00008000000A800C0000110000__000000CD51009FE8 000000067F00008000000A800C0000110000-030000000000000000000000000000000002__000000CD51009FE8 
000000067F00008000000A800C00FFFFFFFF-010000000000000001000000050000000011__000000CCB1B9E181-000000CD51344F89 000000067F00008000000A800C00FFFFFFFF-030000000000000000000000000000000002__000000CB82C37859-000000CC11F5EDC9 000000067F00008000000A800F0200000000-000000067F00008000000A80140000007ADF__000000CD51344F89-000000CDCC7BF889 000000067F00008000000A80140000007ADF-000000067F00008000000A8014000000F7D0__000000CD51344F89-000000CDCC7BF889 000000067F00008000000A8014000000F7D0-000000067F00008000000A801400000176D0__000000CD51344F89-000000CDCC7BF889 000000067F00008000000A801400000176D0-000000067F00008000000A8014000001F5D2__000000CD51344F89-000000CDCC7BF889 000000067F00008000000A8014000001F5D2-000000067F00008000000A801400000274D5__000000CD51344F89-000000CDCC7BF889 000000067F00008000000A801400000274D5-000000067F00008000000AA00C0000001863__000000CD51344F89-000000CDCC7BF889 000000067F00008000000AA00C0000000000-000000067F00008000000AA00C0000004000__000000CF7E08BFD0 000000067F00008000000AA00C0000001863-000000067F00008000000AA00C000000AFC9__000000CD51344F89-000000CDCC7BF889 000000067F00008000000AA00C0000004000-000000067F00008000000AA00C0000008000__000000CF7E08BFD0 000000067F00008000000AA00C0000008000-000000067F00008000000AA00C000000C000__000000CF7E08BFD0 000000067F00008000000AA00C000000AFC9-030000000000000000000000000000000002__000000CD51344F89-000000CDCC7BF889 000000067F00008000000AA00C000000C000-000000067F00008000000AA00C0000010000__000000CF7E08BFD0 000000067F00008000000AA00C0000010000-000000067F00008000000AA00C0000014000__000000CF7B8D3FD0 000000067F00008000000AA00C00000126EC-000000067F00008000000AA00C000001BE0C__000000CDCC7BF889-000000CE6C3FED31 000000067F00008000000AA00C0000014000-000000067F00008000000AA00C0000018000__000000CF7B8D3FD0 000000067F00008000000AA00C0000018000-000000067F00008000000AA00C000001C000__000000CF7B8D3FD0 000000067F00008000000AA00C000001BE0C-000000067F00008000000AA00C000002553F__000000CDCC7BF889-000000CE6C3FED31 
000000067F00008000000AA00C000001C000-000000067F00008000000AA00C0000020000__000000CF7B8D3FD0 000000067F00008000000AA00C0000020000-000000067F00008000000AA00C0000024000__000000CF7B8D3FD0 000000067F00008000000AA00C0000024000-000000067F00008000000AA00C0000028000__000000CF7B8D3FD0 000000067F00008000000AA00C000002553F-000000067F00008000000AA00C000002ECA5__000000CDCC7BF889-000000CE6C3FED31 000000067F00008000000AA00C0000028000-000000067F00008000000AA00C000002C000__000000CF7B8D3FD0 000000067F00008000000AA00C000002C000-000000067F00008000000AA00C0000030000__000000CF7B8D3FD0 000000067F00008000000AA00C000002ECA5-000000067F00008000000AA00C00000383BC__000000CDCC7BF889-000000CE6C3FED31 000000067F00008000000AA00C0000030000-000000067F00008000000AA00C0000034000__000000CF7B8D3FD0 000000067F00008000000AA00C0000034000-000000067F00008000000AA00C0000038000__000000CF7B8D3FD0 000000067F00008000000AA00C0000038000-000000067F00008000000AA00C000003C000__000000CF7B8D3FD0 000000067F00008000000AA00C00000383BC-000000067F00008000000AA00C0000041B0A__000000CDCC7BF889-000000CE6C3FED31 000000067F00008000000AA00C000003C000-000000067F00008000000AA00C0000040000__000000CF7B8D3FD0 000000067F00008000000AA00C0000040000-000000067F00008000000AA00C0000044000__000000CF7B8D3FD0 000000067F00008000000AA00C0000041B0A-000000067F00008000000AA00C000004B270__000000CDCC7BF889-000000CE6C3FED31 000000067F00008000000AA00C0000044000-000000067F00008000000AA00C0000048000__000000CF7B8D3FD0 000000067F00008000000AA00C0000048000-000000067F00008000000AA00C000004C000__000000CF7B8D3FD0 000000067F00008000000AA00C000004B270-000000067F00008000000AA00C00000549AA__000000CDCC7BF889-000000CE6C3FED31 000000067F00008000000AA00C000004C000-000000067F00008000000AA00C0000050000__000000CF7B8D3FD0 000000067F00008000000AA00C0000050000-000000067F00008000000AA00C0000054000__000000CF7B8D3FD0 000000067F00008000000AA00C0000054000-000000067F00008000000AA00C0000058000__000000CF7B8D3FD0 
000000067F00008000000AA00C00000549AA-000000067F00008000000AA00C000005E10B__000000CDCC7BF889-000000CE6C3FED31 000000067F00008000000AA00C0000058000-000000067F00008000000AA00C000005C000__000000CF7B8D3FD0 000000067F00008000000AA00C000005C000-000000067F00008000000AA00C0000060000__000000CF7B8D3FD0 000000067F00008000000AA00C000005E10B-000000067F00008000000AA00C000006782C__000000CDCC7BF889-000000CE6C3FED31 000000067F00008000000AA00C0000060000-000000067F00008000000AA00C0000064000__000000CF7B8D3FD0 000000067F00008000000AA00C0000064000-000000067F00008000000AA00C0000068000__000000CF7B8D3FD0 000000067F00008000000AA00C000006782C-000000067F00008000000AA00C0000070F88__000000CDCC7BF889-000000CE6C3FED31 000000067F00008000000AA00C0000068000-000000067F00008000000AA00C000006C000__000000CF7B8D3FD0 000000067F00008000000AA00C000006C000-000000067F00008000000AA00C0000070000__000000CF7B8D3FD0 000000067F00008000000AA00C0000070000-000000067F00008000000AA00C0000074000__000000CF7B8D3FD0 000000067F00008000000AA00C0000070F88-000000067F00008000000AA00C0100000000__000000CDCC7BF889-000000CE6C3FED31 000000067F00008000000AA00C0000074000-000000067F00008000000AA00C0000078000__000000CF7B8D3FD0 000000067F00008000000AA00C0000078000-000000067F00008000000AA00C000007C000__000000CF7B8D3FD0 000000067F00008000000AA00C0000078E97-000000067F00008000000AA00C00000823F9__000000CE6C3FED31-000000CF7DC97FD1 000000067F00008000000AA00C000007C000-000000067F00008000000AA00C0000080000__000000CF7B8D3FD0 000000067F00008000000AA00C0000080000-000000067F00008000000AA00C0000084000__000000CF7B8D3FD0 000000067F00008000000AA00C00000823F9-000000067F00008000000AA00C000008BA8A__000000CE6C3FED31-000000CF7DC97FD1 000000067F00008000000AA00C0000084000-000000067F00008000000AA00C0000088000__000000CF7B8D3FD0 000000067F00008000000AA00C0000088000-000000067F00008000000AA00C000008C000__000000CF7B8D3FD0 000000067F00008000000AA00C000008BA8A-000000067F00008000000AA00C00000951BF__000000CE6C3FED31-000000CF7DC97FD1 
000000067F00008000000AA00C000008C000-000000067F00008000000AA00C0000090000__000000CF7B8D3FD0 000000067F00008000000AA00C0000090000-000000067F00008000000AA00C0000094000__000000CF7B8D3FD0 000000067F00008000000AA00C0000094000-000000067F00008000000AA00C0000098000__000000CF7B8D3FD0 000000067F00008000000AA00C00000951BF-000000067F00008000000AA00C000009E90A__000000CE6C3FED31-000000CF7DC97FD1 000000067F00008000000AA00C0000098000-000000067F00008000000AA00C000009C000__000000CF7B8D3FD0 000000067F00008000000AA00C000009C000-000000067F00008000000AA00C00000A0000__000000CF7B8D3FD0 000000067F00008000000AA00C000009E90A-000000067F00008000000AA00C00000A802B__000000CE6C3FED31-000000CF7DC97FD1 000000067F00008000000AA00C00000A0000-000000067F00008000000AA00C00000A4000__000000CF7B8D3FD0 000000067F00008000000AA00C00000A4000-000000067F00008000000AA00C00000A8000__000000CF7B8D3FD0 000000067F00008000000AA00C00000A8000-000000067F00008000000AA00C00000AC000__000000CF7B8D3FD0 000000067F00008000000AA00C00000A802B-000000067F00008000000AA00C00000B1782__000000CE6C3FED31-000000CF7DC97FD1 000000067F00008000000AA00C00000AC000-000000067F00008000000AA00C00000B0000__000000CF7B8D3FD0 000000067F00008000000AA00C00000B0000-000000067F00008000000AA00C00000B4000__000000CF7B8D3FD0 000000067F00008000000AA00C00000B1782-000000067F00008000000AA00C00000BAEE8__000000CE6C3FED31-000000CF7DC97FD1 000000067F00008000000AA00C00000B4000-000000067F00008000000AA00C00000B8000__000000CF7B8D3FD0 000000067F00008000000AA00C00000B8000-000000067F00008000000AA00C00000BC000__000000CF7B8D3FD0 000000067F00008000000AA00C00000BAEE8-000000067F00008000000AA00C00000C460C__000000CE6C3FED31-000000CF7DC97FD1 000000067F00008000000AA00C00000BC000-000000067F00008000000AA00C00000C0000__000000CF7B8D3FD0 000000067F00008000000AA00C00000C0000-000000067F00008000000AA00C00000C4000__000000CF7B8D3FD0 000000067F00008000000AA00C00000C4000-000000067F00008000000AA00C00000C8000__000000CF7B8D3FD0 
000000067F00008000000AA00C00000C460C-000000067F00008000000AA00C00000CDD72__000000CE6C3FED31-000000CF7DC97FD1 000000067F00008000000AA00C00000C8000-000000067F00008000000AA00C00000CC000__000000CF7B8D3FD0 000000067F00008000000AA00C00000CC000-000000067F00008000000AA00C00000D0000__000000CF7B8D3FD0 000000067F00008000000AA00C00000CDD72-000000067F00008000000AA00C00000D74D8__000000CE6C3FED31-000000CF7DC97FD1 000000067F00008000000AA00C00000D0000-000000067F00008000000AA00C00000D4000__000000CF7B8D3FD0 000000067F00008000000AA00C00000D4000-000000067F00008000000AA00C00000D8000__000000CF7B8D3FD0 000000067F00008000000AA00C00000D74D8-000000067F00008000000AA00C00000E0C0B__000000CE6C3FED31-000000CF7DC97FD1 000000067F00008000000AA00C00000D8000-000000067F00008000000AA00C00000DC000__000000CF7B8D3FD0 000000067F00008000000AA00C00000DC000-000000067F00008000000AA00C00000E0000__000000CF7B8D3FD0 000000067F00008000000AA00C00000E0000-000000067F00008000000AA00C00000E4000__000000CF7B8D3FD0 000000067F00008000000AA00C00000E0C0B-000000067F00008000000AA00C00000EA371__000000CE6C3FED31-000000CF7DC97FD1 000000067F00008000000AA00C00000E4000-000000067F00008000000AA00C00000E8000__000000CF7B8D3FD0 000000067F00008000000AA00C00000E8000-000000067F00008000000AA00C00000EC000__000000CF7B8D3FD0 000000067F00008000000AA00C00000EA371-000000067F00008000000AA00C00000F3AD7__000000CE6C3FED31-000000CF7DC97FD1 000000067F00008000000AA00C00000EC000-000000067F00008000000AA00C00000F0000__000000CF7B8D3FD0 000000067F00008000000AA00C00000F0000-000000067F00008000000AA00C00000F4000__000000CF7B8D3FD0 000000067F00008000000AA00C00000F3AD7-000000067F00008000000AA00C00000FD20B__000000CE6C3FED31-000000CF7DC97FD1 000000067F00008000000AA00C00000F4000-000000067F00008000000AA00C00000F8000__000000CF7B8D3FD0 000000067F00008000000AA00C00000F8000-000000067F00008000000AA00C00000FC000__000000CF7B8D3FD0 000000067F00008000000AA00C00000FC000-000000067F00008000000AA00C0000100000__000000CF7B8D3FD0 
000000067F00008000000AA00C00000FD20B-000000067F00008000000AA00C0000106932__000000CE6C3FED31-000000CF7DC97FD1 000000067F00008000000AA00C0000100000-000000067F00008000000AA00C0000104000__000000CF7B8D3FD0 000000067F00008000000AA00C0000104000-000000067F00008000000AA00C0000108000__000000CF7B8D3FD0 000000067F00008000000AA00C0000106932-000000067F00008000000AA00C0000110098__000000CE6C3FED31-000000CF7DC97FD1 000000067F00008000000AA00C0000108000-000000067F00008000000AA00C000010C000__000000CF7B8D3FD0 000000067F00008000000AA00C000010C000-000000067F00008000000AA00C0000110000__000000CF7B8D3FD0 000000067F00008000000AA00C0000110000-030000000000000000000000000000000002__000000CF7B8D3FD0 000000067F00008000000AA00C0000110098-010000000000000001000000050000000012__000000CE6C3FED31-000000CF7DC97FD1 010000000000000001000000000000000000-030000000000000000000000000000000002__000000A29F1D8950 030000000000000000000000000000000001-030000000000000000000000000000000002__000000C689AF4AC1-000000C6C87B6329 ================================================ FILE: pageserver/benches/odd-brook-layernames.txt ================================================ 000000000000000000000000000000000000-000000067F00004002000089C30100000000__0000001C760FA190 000000000000000000000000000000000000-000000067F00004002000089C30100000000__00000038E67ABFA0 000000000000000000000000000000000000-000000067F00004002000089C30100000000__0000003903F1CFE8 000000000000000000000000000000000000-000000067F00004002000089C30100000000__0000003B99F7F8A0 000000000000000000000000000000000000-000000067F00004002000089C30100000000__0000005D2FFFFB38 000000000000000000000000000000000000-000000067F00004002000089C30100000000__00000073AD3FE6B8 000000000000000000000000000000000000-000000067F00004002000089C30100000000__000000914E3F38F0 000000000000000000000000000000000000-000000067F00004002000089C30100000000__000000931B33AE68 000000000000000000000000000000000000-000000067F00004002000089C30100000000__000000931B9AFDF8 
000000000000000000000000000000000000-000000067F0000400200008A4F0100000000__000000009E3FE898 000000000000000000000000000000000000-030000000000000000000000000000000002__0000000001696070-00000000016E8B31 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000931C070601-000000931C075661 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000931C075661-000000931C0794A1 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000931C0794A1-000000931C07C709 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000931C07C709-000000931C07FED1 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000931C07FED1-000000931C081909 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000931C081909-000000931C083E31 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000931C083E31-000000931C088149 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000931C088149-000000931C088409 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000931C088409-000000931C0887F1 000000067F000032AC000040040000000000-000000067F0000400200008A5900000080D3__00000000016F8AC9-000000007119E789 000000067F000032AC000040040000000000-000000067F000040020000A0000000008989__0000001C725D0191-0000002070591C61 000000067F000032AC000040040000000000-000000067F000040020000C0000000007F72__00000038ED8FA069-0000003ABA685F11 000000067F000032AC000040040000000000-000000067F000040020000E000000000899C__0000003ABA698781-0000003B6A0FFB09 000000067F000032AC000040040000000000-000000067F000040020000E0000000F4FCF9__00000056FC37F3D9-000000572A7B4CD9 000000067F000032AC000040040000000000-000000067F0000400200010000000000899C__000000572A7C74A1-0000005CA7BBD6F9 000000067F000032AC000040040000000000-000000067F00004002000140000000008988__000000739A920D71-0000008D2DB5E0C1 
000000067F000032AC000040040000000000-000000067F00004002000140000000F32D01__0000008FAC75E259-000000900BB52179 000000067F000032AC000040040000000000-000000067F00004002000160000000007F7A__000000900BB52179-0000009046EDA719 000000067F000032AC000040040000000000-000000067F00004002000160000000037E1D__0000009046EDA719-000000914E3FE031 000000067F000032AC000040040000000000-000000067F00004002000180000000007F7A__000000914E3FE031-000000919CCE8B21 000000067F000032AC000040040000000000-000000067F0000400200018000000014F52F__00000092D346E5E9-000000931B991E09 000000067F000032AC000040040000000000-030000000000000000000000000000000002__00000000016E8B31-00000000016F8AC9 000000067F000032AC000040040000000000-030000000000000000000000000000000002__0000001C725A5929-0000001C725C25F1 000000067F000032AC000040040000000000-030000000000000000000000000000000002__0000001C725C25F1-0000001C725D0191 000000067F000032AC000040040000000000-030000000000000000000000000000000002__00000038ECF55FD9-00000038ED8D1E61 000000067F000032AC000040040000000000-030000000000000000000000000000000002__00000038ED8D1E61-00000038ED8E5D49 000000067F000032AC000040040000000000-030000000000000000000000000000000002__00000038ED8E5D49-00000038ED8FA069 000000067F000032AC000040040000000000-030000000000000000000000000000000002__0000003ABA685F11-0000003ABA698781 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000572A7B4CD9-000000572A7C74A1 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000739A8D1299-000000739A8E6EF9 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000739A8E6EF9-000000739A8FC4B9 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000739A8FC4B9-000000739A920D71 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931B991E09-000000931B9AAA89 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931B9AAA89-000000931B9D7EF9 
000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931B9D7EF9-000000931B9E97C9 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931B9E97C9-000000931BA45F31 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BA45F31-000000931BA69491 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BA69491-000000931BA85AD9 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BA85AD9-000000931BAB3D49 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BAB3D49-000000931BAD4F09 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BAD4F09-000000931BAFBE51 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BAFBE51-000000931BB20A89 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BB20A89-000000931BB445C9 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BB445C9-000000931BB6C539 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BB6C539-000000931BB94A11 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BB94A11-000000931BBC0179 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BBC0179-000000931BBE4B21 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BBE4B21-000000931BC0FCC9 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BC0FCC9-000000931BC36E61 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BC36E61-000000931BC579B1 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BC579B1-000000931BC790F1 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BC790F1-000000931BC96EC9 
000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BC96EC9-000000931BCB5D09 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BCB5D09-000000931BCD7991 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BCD7991-000000931BCF66C9 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BCF66C9-000000931BD15B61 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BD15B61-000000931BD3B251 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BD3B251-000000931BD5E7D9 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BD5E7D9-000000931BD82A51 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BD82A51-000000931BDA7A71 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BDA7A71-000000931BDD2F29 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BDD2F29-000000931BDF89D1 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BDF89D1-000000931BE1D831 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BE1D831-000000931BE40719 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BE40719-000000931BE6B0D1 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BE6B0D1-000000931BE887A9 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BE887A9-000000931BEAD539 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BEAD539-000000931BEC56B9 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BEC56B9-000000931BEE27D9 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BEE27D9-000000931BF00151 
000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BF00151-000000931BF24059 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BF24059-000000931BF3EB61 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BF3EB61-000000931BF63011 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BF63011-000000931BF84BB9 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BF84BB9-000000931BFAAFF1 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BFAAFF1-000000931BFD3511 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BFD3511-000000931BFF93D9 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931BFF93D9-000000931C01DAE1 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931C01DAE1-000000931C045291 000000067F000032AC000040040000000000-030000000000000000000000000000000002__000000931C045291-000000931C070601 000000067F00004002000000000000000001-000000067F0000400200008A590000F1B7DD__0000001BE353E181-0000001C725A5929 000000067F00004002000000000000000001-000000067F000040020000A0000000F11587__000000384463E2C1-00000038E1E2FE19 000000067F00004002000000000000000001-000000067F0000400200010000000030067A__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000000000000000001-010000000000000001000000000000000001__00000038E5077EE1-00000038E68FBE49 000000067F00004002000000000000000001-010000000000000001000000000000000001__00000038E99BFDE9-00000038EAFDDF91 000000067F000040020000840E0100000000-000000067F0000400200008A590000044853__000000007119E789-0000000120C1DDF9 000000067F0000400200008A590000000000-000000067F0000400200008A590000004000__000000009E3FE898 000000067F0000400200008A590000000000-000000067F0000400200008A590000004000__0000001C760FA190 
000000067F0000400200008A590000000000-000000067F0000400200008A590000004000__00000038E67ABFA0 000000067F0000400200008A590000000000-000000067F0000400200008A590000004000__0000003903F1CFE8 000000067F0000400200008A590000000000-000000067F0000400200008A590000004000__0000003B99F7F8A0 000000067F0000400200008A590000000000-000000067F0000400200008A590000004000__0000005D2FFFFB38 000000067F0000400200008A590000000000-000000067F0000400200008A590000004000__00000073AD3FE6B8 000000067F0000400200008A590000000000-000000067F0000400200008A590000004000__000000914E3F38F0 000000067F0000400200008A590000000000-000000067F0000400200008A590000004000__000000931B33AE68 000000067F0000400200008A590000000000-000000067F0000400200008A590000004000__000000931B9AFDF8 000000067F0000400200008A590000004000-000000067F0000400200008A590000008000__000000009E3FE898 000000067F0000400200008A590000004000-000000067F0000400200008A590000008000__0000001C760FA190 000000067F0000400200008A590000004000-000000067F0000400200008A590000008000__00000038E67ABFA0 000000067F0000400200008A590000004000-000000067F0000400200008A590000008000__0000003903F1CFE8 000000067F0000400200008A590000004000-000000067F0000400200008A590000008000__0000003B99F7F8A0 000000067F0000400200008A590000004000-000000067F0000400200008A590000008000__0000005D2FFFFB38 000000067F0000400200008A590000004000-000000067F0000400200008A590000008000__00000073AD3FE6B8 000000067F0000400200008A590000004000-000000067F0000400200008A590000008000__000000914E3F38F0 000000067F0000400200008A590000004000-000000067F0000400200008A590000008000__000000931B33AE68 000000067F0000400200008A590000004000-000000067F0000400200008A590000008000__000000931B9AFDF8 000000067F0000400200008A590000008000-000000067F0000400200008A59000000C000__000000009E3FE898 000000067F0000400200008A590000008000-000000067F0000400200008A59000000C000__0000001C760FA190 000000067F0000400200008A590000008000-000000067F0000400200008A59000000C000__00000038E67ABFA0 
000000067F0000400200008A590000008000-000000067F0000400200008A59000000C000__0000003903F1CFE8 000000067F0000400200008A590000008000-000000067F0000400200008A59000000C000__0000003B99F7F8A0 000000067F0000400200008A590000008000-000000067F0000400200008A59000000C000__0000005D2FFFFB38 000000067F0000400200008A590000008000-000000067F0000400200008A59000000C000__00000073AD3FE6B8 000000067F0000400200008A590000008000-000000067F0000400200008A59000000C000__000000914E3F38F0 000000067F0000400200008A590000008000-000000067F0000400200008A59000000C000__000000931B33AE68 000000067F0000400200008A590000008000-000000067F0000400200008A59000000C000__000000931B9AFDF8 000000067F0000400200008A5900000080D3-000000067F0000400200008A590000010AB3__00000000016F8AC9-000000007119E789 000000067F0000400200008A59000000C000-000000067F0000400200008A590000010000__000000009E3FE898 000000067F0000400200008A59000000C000-000000067F0000400200008A590000010000__0000001C760FA190 000000067F0000400200008A59000000C000-000000067F0000400200008A590000010000__00000038E67ABFA0 000000067F0000400200008A59000000C000-000000067F0000400200008A590000010000__0000003903F1CFE8 000000067F0000400200008A59000000C000-000000067F0000400200008A590000010000__0000003B99F7F8A0 000000067F0000400200008A59000000C000-000000067F0000400200008A590000010000__0000005D2FFFFB38 000000067F0000400200008A59000000C000-000000067F0000400200008A590000010000__00000073AD3FE6B8 000000067F0000400200008A59000000C000-000000067F0000400200008A590000010000__000000914E3F38F0 000000067F0000400200008A59000000C000-000000067F0000400200008A590000010000__000000931B33AE68 000000067F0000400200008A59000000C000-000000067F0000400200008A590000010000__000000931B9AFDF8 000000067F0000400200008A590000010000-000000067F0000400200008A590000014000__000000009E3FE898 000000067F0000400200008A590000010000-000000067F0000400200008A590000014000__0000001C760FA190 000000067F0000400200008A590000010000-000000067F0000400200008A590000014000__00000038E67ABFA0 
000000067F0000400200008A590000010000-000000067F0000400200008A590000014000__0000003903F1CFE8 000000067F0000400200008A590000010000-000000067F0000400200008A590000014000__0000003B99F7F8A0 000000067F0000400200008A590000010000-000000067F0000400200008A590000014000__0000005D2FFFFB38 000000067F0000400200008A590000010000-000000067F0000400200008A590000014000__00000073AD3FE6B8 000000067F0000400200008A590000010000-000000067F0000400200008A590000014000__000000914E3F38F0 000000067F0000400200008A590000010000-000000067F0000400200008A590000014000__000000931B33AE68 000000067F0000400200008A590000010000-000000067F0000400200008A590000014000__000000931B9AFDF8 000000067F0000400200008A590000010AB3-000000067F0000400200008A5900000194AF__00000000016F8AC9-000000007119E789 000000067F0000400200008A590000014000-000000067F0000400200008A590000018000__000000009E3FE898 000000067F0000400200008A590000014000-000000067F0000400200008A590000018000__0000001C760FA190 000000067F0000400200008A590000014000-000000067F0000400200008A590000018000__00000038E67ABFA0 000000067F0000400200008A590000014000-000000067F0000400200008A590000018000__0000003903F1CFE8 000000067F0000400200008A590000014000-000000067F0000400200008A590000018000__0000003B99F7F8A0 000000067F0000400200008A590000014000-000000067F0000400200008A590000018000__0000005D2FFFFB38 000000067F0000400200008A590000014000-000000067F0000400200008A590000018000__00000073AD3FE6B8 000000067F0000400200008A590000014000-000000067F0000400200008A590000018000__000000914E3F38F0 000000067F0000400200008A590000014000-000000067F0000400200008A590000018000__000000931B33AE68 000000067F0000400200008A590000014000-000000067F0000400200008A590000018000__000000931B9AFDF8 000000067F0000400200008A590000018000-000000067F0000400200008A59000001C000__000000009E3FE898 000000067F0000400200008A590000018000-000000067F0000400200008A59000001C000__0000001C760FA190 000000067F0000400200008A590000018000-000000067F0000400200008A59000001C000__00000038E67ABFA0 
000000067F0000400200008A590000018000-000000067F0000400200008A59000001C000__0000003903F1CFE8 000000067F0000400200008A590000018000-000000067F0000400200008A59000001C000__0000003B99F7F8A0 000000067F0000400200008A590000018000-000000067F0000400200008A59000001C000__0000005D2FFFFB38 000000067F0000400200008A590000018000-000000067F0000400200008A59000001C000__00000073AD3FE6B8 000000067F0000400200008A590000018000-000000067F0000400200008A59000001C000__000000914E3F38F0 000000067F0000400200008A590000018000-000000067F0000400200008A59000001C000__000000931B33AE68 000000067F0000400200008A590000018000-000000067F0000400200008A59000001C000__000000931B9AFDF8 000000067F0000400200008A5900000194AF-000000067F0000400200008A590000021EB4__00000000016F8AC9-000000007119E789 000000067F0000400200008A59000001C000-000000067F0000400200008A590000020000__000000009E3FE898 000000067F0000400200008A59000001C000-000000067F0000400200008A590000020000__0000001C760FA190 000000067F0000400200008A59000001C000-000000067F0000400200008A590000020000__00000038E67ABFA0 000000067F0000400200008A59000001C000-000000067F0000400200008A590000020000__0000003903F1CFE8 000000067F0000400200008A59000001C000-000000067F0000400200008A590000020000__0000003B99F7F8A0 000000067F0000400200008A59000001C000-000000067F0000400200008A590000020000__0000005D2FFFFB38 000000067F0000400200008A59000001C000-000000067F0000400200008A590000020000__00000073AD3FE6B8 000000067F0000400200008A59000001C000-000000067F0000400200008A590000020000__000000914E3F38F0 000000067F0000400200008A59000001C000-000000067F0000400200008A590000020000__000000931B33AE68 000000067F0000400200008A59000001C000-000000067F0000400200008A590000020000__000000931B9AFDF8 000000067F0000400200008A590000020000-000000067F0000400200008A590000024000__000000009E3FE898 000000067F0000400200008A590000020000-000000067F0000400200008A590000024000__0000001C760FA190 000000067F0000400200008A590000020000-000000067F0000400200008A590000024000__00000038E67ABFA0 
000000067F0000400200008A590000020000-000000067F0000400200008A590000024000__0000003903F1CFE8 000000067F0000400200008A590000020000-000000067F0000400200008A590000024000__0000003B99F7F8A0 000000067F0000400200008A590000020000-000000067F0000400200008A590000024000__0000005D2FFFFB38 000000067F0000400200008A590000020000-000000067F0000400200008A590000024000__00000073AD3FE6B8 000000067F0000400200008A590000020000-000000067F0000400200008A590000024000__000000914E3F38F0 000000067F0000400200008A590000020000-000000067F0000400200008A590000024000__000000931B33AE68 000000067F0000400200008A590000020000-000000067F0000400200008A590000024000__000000931B9AFDF8 000000067F0000400200008A590000021EB4-000000067F0000400200008A59000002A89D__00000000016F8AC9-000000007119E789 000000067F0000400200008A590000024000-000000067F0000400200008A590000028000__000000009E3FE898 000000067F0000400200008A590000024000-000000067F0000400200008A590000028000__0000001C760FA190 000000067F0000400200008A590000024000-000000067F0000400200008A590000028000__00000038E67ABFA0 000000067F0000400200008A590000024000-000000067F0000400200008A590000028000__0000003903F1CFE8 000000067F0000400200008A590000024000-000000067F0000400200008A590000028000__0000003B99F7F8A0 000000067F0000400200008A590000024000-000000067F0000400200008A590000028000__0000005D2FFFFB38 000000067F0000400200008A590000024000-000000067F0000400200008A590000028000__00000073AD3FE6B8 000000067F0000400200008A590000024000-000000067F0000400200008A590000028000__000000914E3F38F0 000000067F0000400200008A590000024000-000000067F0000400200008A590000028000__000000931B33AE68 000000067F0000400200008A590000024000-000000067F0000400200008A590000028000__000000931B9AFDF8 000000067F0000400200008A590000028000-000000067F0000400200008A59000002C000__000000009E3FE898 000000067F0000400200008A590000028000-000000067F0000400200008A59000002C000__0000001C760FA190 000000067F0000400200008A590000028000-000000067F0000400200008A59000002C000__00000038E67ABFA0 
000000067F0000400200008A590000028000-000000067F0000400200008A59000002C000__0000003903F1CFE8 000000067F0000400200008A590000028000-000000067F0000400200008A59000002C000__0000003B99F7F8A0 000000067F0000400200008A590000028000-000000067F0000400200008A59000002C000__0000005D2FFFFB38 000000067F0000400200008A590000028000-000000067F0000400200008A59000002C000__00000073AD3FE6B8 000000067F0000400200008A590000028000-000000067F0000400200008A59000002C000__000000914E3F38F0 000000067F0000400200008A590000028000-000000067F0000400200008A59000002C000__000000931B33AE68 000000067F0000400200008A590000028000-000000067F0000400200008A59000002C000__000000931B9AFDF8 000000067F0000400200008A59000002A89D-000000067F0000400200008A590000033278__00000000016F8AC9-000000007119E789 000000067F0000400200008A59000002C000-000000067F0000400200008A590000030000__000000009E3FE898 000000067F0000400200008A59000002C000-000000067F0000400200008A590000030000__0000001C760FA190 000000067F0000400200008A59000002C000-000000067F0000400200008A590000030000__00000038E67ABFA0 000000067F0000400200008A59000002C000-000000067F0000400200008A590000030000__0000003903F1CFE8 000000067F0000400200008A59000002C000-000000067F0000400200008A590000030000__0000003B99F7F8A0 000000067F0000400200008A59000002C000-000000067F0000400200008A590000030000__0000005D2FFFFB38 000000067F0000400200008A59000002C000-000000067F0000400200008A590000030000__00000073AD3FE6B8 000000067F0000400200008A59000002C000-000000067F0000400200008A590000030000__000000914E3F38F0 000000067F0000400200008A59000002C000-000000067F0000400200008A590000030000__000000931B33AE68 000000067F0000400200008A59000002C000-000000067F0000400200008A590000030000__000000931B9AFDF8 000000067F0000400200008A590000030000-000000067F0000400200008A590000034000__000000009E3FE898 000000067F0000400200008A590000030000-000000067F0000400200008A590000034000__0000001C760FA190 000000067F0000400200008A590000030000-000000067F0000400200008A590000034000__00000038E67ABFA0 
000000067F0000400200008A590000030000-000000067F0000400200008A590000034000__0000003903F1CFE8 000000067F0000400200008A590000030000-000000067F0000400200008A590000034000__0000003B99F7F8A0 000000067F0000400200008A590000030000-000000067F0000400200008A590000034000__0000005D2FFFFB38 000000067F0000400200008A590000030000-000000067F0000400200008A590000034000__00000073AD3FE6B8 000000067F0000400200008A590000030000-000000067F0000400200008A590000034000__000000914E3F38F0 000000067F0000400200008A590000030000-000000067F0000400200008A590000034000__000000931B33AE68 000000067F0000400200008A590000030000-000000067F0000400200008A590000034000__000000931B9AFDF8 000000067F0000400200008A590000033278-000000067F0000400200008A59000003BC3A__00000000016F8AC9-000000007119E789 000000067F0000400200008A590000034000-000000067F0000400200008A590000038000__000000009E3FE898 000000067F0000400200008A590000034000-000000067F0000400200008A590000038000__0000001C760FA190 000000067F0000400200008A590000034000-000000067F0000400200008A590000038000__00000038E67ABFA0 000000067F0000400200008A590000034000-000000067F0000400200008A590000038000__0000003903F1CFE8 000000067F0000400200008A590000034000-000000067F0000400200008A590000038000__0000003B99F7F8A0 000000067F0000400200008A590000034000-000000067F0000400200008A590000038000__0000005D2FFFFB38 000000067F0000400200008A590000034000-000000067F0000400200008A590000038000__00000073AD3FE6B8 000000067F0000400200008A590000034000-000000067F0000400200008A590000038000__000000914E3F38F0 000000067F0000400200008A590000034000-000000067F0000400200008A590000038000__000000931B33AE68 000000067F0000400200008A590000034000-000000067F0000400200008A590000038000__000000931B9AFDF8 000000067F0000400200008A590000038000-000000067F0000400200008A59000003C000__000000009E3FE898 000000067F0000400200008A590000038000-000000067F0000400200008A59000003C000__0000001C760FA190 000000067F0000400200008A590000038000-000000067F0000400200008A59000003C000__00000038E67ABFA0 
000000067F0000400200008A590000038000-000000067F0000400200008A59000003C000__0000003903F1CFE8 000000067F0000400200008A590000038000-000000067F0000400200008A59000003C000__0000003B99F7F8A0 000000067F0000400200008A590000038000-000000067F0000400200008A59000003C000__0000005D2FFFFB38 000000067F0000400200008A590000038000-000000067F0000400200008A59000003C000__00000073AD3FE6B8 000000067F0000400200008A590000038000-000000067F0000400200008A59000003C000__000000914E3F38F0 000000067F0000400200008A590000038000-000000067F0000400200008A59000003C000__000000931B33AE68 000000067F0000400200008A590000038000-000000067F0000400200008A59000003C000__000000931B9AFDF8 000000067F0000400200008A59000003BC3A-030000000000000000000000000000000002__00000000016F8AC9-000000007119E789 000000067F0000400200008A59000003C000-000000067F0000400200008A590000040000__000000009E3FE898 000000067F0000400200008A59000003C000-000000067F0000400200008A590000040000__0000001C760FA190 000000067F0000400200008A59000003C000-000000067F0000400200008A590000040000__00000038E67ABFA0 000000067F0000400200008A59000003C000-000000067F0000400200008A590000040000__0000003903F1CFE8 000000067F0000400200008A59000003C000-000000067F0000400200008A590000040000__0000003B99F7F8A0 000000067F0000400200008A59000003C000-000000067F0000400200008A590000040000__0000005D2FFFFB38 000000067F0000400200008A59000003C000-000000067F0000400200008A590000040000__00000073AD3FE6B8 000000067F0000400200008A59000003C000-000000067F0000400200008A590000040000__000000914E3F38F0 000000067F0000400200008A59000003C000-000000067F0000400200008A590000040000__000000931B33AE68 000000067F0000400200008A59000003C000-000000067F0000400200008A590000040000__000000931B9AFDF8 000000067F0000400200008A590000040000-000000067F0000400200008A590000044000__000000009E3FE898 000000067F0000400200008A590000040000-000000067F0000400200008A590000044000__0000001C760FA190 000000067F0000400200008A590000040000-000000067F0000400200008A590000044000__00000038E67ABFA0 
000000067F0000400200008A590000040000-000000067F0000400200008A590000044000__0000003903F1CFE8 000000067F0000400200008A590000040000-000000067F0000400200008A590000044000__0000003B99F7F8A0 000000067F0000400200008A590000040000-000000067F0000400200008A590000044000__0000005D2FFFFB38 000000067F0000400200008A590000040000-000000067F0000400200008A590000044000__00000073AD3FE6B8 000000067F0000400200008A590000040000-000000067F0000400200008A590000044000__000000914E3F38F0 000000067F0000400200008A590000040000-000000067F0000400200008A590000044000__000000931B33AE68 000000067F0000400200008A590000040000-000000067F0000400200008A590000044000__000000931B9AFDF8 000000067F0000400200008A590000044000-000000067F0000400200008A590000048000__000000009E3FE898 000000067F0000400200008A590000044000-000000067F0000400200008A590000048000__0000001C760FA190 000000067F0000400200008A590000044000-000000067F0000400200008A590000048000__00000038E67ABFA0 000000067F0000400200008A590000044000-000000067F0000400200008A590000048000__0000003903F1CFE8 000000067F0000400200008A590000044000-000000067F0000400200008A590000048000__0000003B99F7F8A0 000000067F0000400200008A590000044000-000000067F0000400200008A590000048000__0000005D2FFFFB38 000000067F0000400200008A590000044000-000000067F0000400200008A590000048000__00000073AD3FE6B8 000000067F0000400200008A590000044000-000000067F0000400200008A590000048000__000000914E3F38F0 000000067F0000400200008A590000044000-000000067F0000400200008A590000048000__000000931B33AE68 000000067F0000400200008A590000044000-000000067F0000400200008A590000048000__000000931B9AFDF8 000000067F0000400200008A590000044853-000000067F0000400200008A59000004D22E__000000007119E789-0000000120C1DDF9 000000067F0000400200008A590000048000-000000067F0000400200008A59000004C000__000000009E3FE898 000000067F0000400200008A590000048000-000000067F0000400200008A59000004C000__0000001C760FA190 000000067F0000400200008A590000048000-000000067F0000400200008A59000004C000__00000038E67ABFA0 
000000067F0000400200008A590000048000-000000067F0000400200008A59000004C000__0000003903F1CFE8 000000067F0000400200008A590000048000-000000067F0000400200008A59000004C000__0000003B99F7F8A0 000000067F0000400200008A590000048000-000000067F0000400200008A59000004C000__0000005D2FFFFB38 000000067F0000400200008A590000048000-000000067F0000400200008A59000004C000__00000073AD3FE6B8 000000067F0000400200008A590000048000-000000067F0000400200008A59000004C000__000000914E3F38F0 000000067F0000400200008A590000048000-000000067F0000400200008A59000004C000__000000931B33AE68 000000067F0000400200008A590000048000-000000067F0000400200008A59000004C000__000000931B9AFDF8 000000067F0000400200008A59000004C000-000000067F0000400200008A590000050000__000000009E3FE898 000000067F0000400200008A59000004C000-000000067F0000400200008A590000050000__0000001C760FA190 000000067F0000400200008A59000004C000-000000067F0000400200008A590000050000__00000038E67ABFA0 000000067F0000400200008A59000004C000-000000067F0000400200008A590000050000__0000003903F1CFE8 000000067F0000400200008A59000004C000-000000067F0000400200008A590000050000__0000003B99F7F8A0 000000067F0000400200008A59000004C000-000000067F0000400200008A590000050000__0000005D2FFFFB38 000000067F0000400200008A59000004C000-000000067F0000400200008A590000050000__00000073AD3FE6B8 000000067F0000400200008A59000004C000-000000067F0000400200008A590000050000__000000914E3F38F0 000000067F0000400200008A59000004C000-000000067F0000400200008A590000050000__000000931B33AE68 000000067F0000400200008A59000004C000-000000067F0000400200008A590000050000__000000931B9AFDF8 000000067F0000400200008A59000004D22E-000000067F0000400200008A590000055C2F__000000007119E789-0000000120C1DDF9 000000067F0000400200008A590000050000-000000067F0000400200008A590000054000__000000009E3FE898 000000067F0000400200008A590000050000-000000067F0000400200008A590000054000__0000001C760FA190 000000067F0000400200008A590000050000-000000067F0000400200008A590000054000__00000038E67ABFA0 
000000067F0000400200008A590000050000-000000067F0000400200008A590000054000__0000003903F1CFE8 000000067F0000400200008A590000050000-000000067F0000400200008A590000054000__0000003B99F7F8A0 000000067F0000400200008A590000050000-000000067F0000400200008A590000054000__0000005D2FFFFB38 000000067F0000400200008A590000050000-000000067F0000400200008A590000054000__00000073AD3FE6B8 000000067F0000400200008A590000050000-000000067F0000400200008A590000054000__000000914E3F38F0 000000067F0000400200008A590000050000-000000067F0000400200008A590000054000__000000931B33AE68 000000067F0000400200008A590000050000-000000067F0000400200008A590000054000__000000931B9AFDF8 000000067F0000400200008A590000054000-000000067F0000400200008A590000058000__0000001C760FA190 000000067F0000400200008A590000054000-000000067F0000400200008A590000058000__00000038E67ABFA0 000000067F0000400200008A590000054000-000000067F0000400200008A590000058000__0000003903F1CFE8 000000067F0000400200008A590000054000-000000067F0000400200008A590000058000__0000003B99F7F8A0 000000067F0000400200008A590000054000-000000067F0000400200008A590000058000__0000005D2FFFFB38 000000067F0000400200008A590000054000-000000067F0000400200008A590000058000__00000073AD3FE6B8 000000067F0000400200008A590000054000-000000067F0000400200008A590000058000__000000914E3F38F0 000000067F0000400200008A590000054000-000000067F0000400200008A590000058000__000000931B33AE68 000000067F0000400200008A590000054000-000000067F0000400200008A590000058000__000000931B9AFDF8 000000067F0000400200008A590000054000-030000000000000000000000000000000002__000000009E3FE898 000000067F0000400200008A590000055C2F-000000067F0000400200008A59000005E61C__000000007119E789-0000000120C1DDF9 000000067F0000400200008A590000058000-000000067F0000400200008A59000005C000__0000001C760FA190 000000067F0000400200008A590000058000-000000067F0000400200008A59000005C000__00000038E67ABFA0 000000067F0000400200008A590000058000-000000067F0000400200008A59000005C000__0000003903F1CFE8 
000000067F0000400200008A590000058000-000000067F0000400200008A59000005C000__0000003B99F7F8A0 000000067F0000400200008A590000058000-000000067F0000400200008A59000005C000__0000005D2FFFFB38 000000067F0000400200008A590000058000-000000067F0000400200008A59000005C000__00000073AD3FE6B8 000000067F0000400200008A590000058000-000000067F0000400200008A59000005C000__000000914E3F38F0 000000067F0000400200008A590000058000-000000067F0000400200008A59000005C000__000000931B33AE68 000000067F0000400200008A590000058000-000000067F0000400200008A59000005C000__000000931B9AFDF8 000000067F0000400200008A59000005C000-000000067F0000400200008A590000060000__0000001C760FA190 000000067F0000400200008A59000005C000-000000067F0000400200008A590000060000__00000038E67ABFA0 000000067F0000400200008A59000005C000-000000067F0000400200008A590000060000__0000003903F1CFE8 000000067F0000400200008A59000005C000-000000067F0000400200008A590000060000__0000003B99F7F8A0 000000067F0000400200008A59000005C000-000000067F0000400200008A590000060000__0000005D2FFFFB38 000000067F0000400200008A59000005C000-000000067F0000400200008A590000060000__00000073AD3FE6B8 000000067F0000400200008A59000005C000-000000067F0000400200008A590000060000__000000914E3F38F0 000000067F0000400200008A59000005C000-000000067F0000400200008A590000060000__000000931B33AE68 000000067F0000400200008A59000005C000-000000067F0000400200008A590000060000__000000931B9AFDF8 000000067F0000400200008A59000005E61C-000000067F0000400200008A590000066FFD__000000007119E789-0000000120C1DDF9 000000067F0000400200008A590000060000-000000067F0000400200008A590000064000__0000001C760FA190 000000067F0000400200008A590000060000-000000067F0000400200008A590000064000__00000038E67ABFA0 000000067F0000400200008A590000060000-000000067F0000400200008A590000064000__0000003903F1CFE8 000000067F0000400200008A590000060000-000000067F0000400200008A590000064000__0000003B99F7F8A0 000000067F0000400200008A590000060000-000000067F0000400200008A590000064000__0000005D2FFFFB38 
000000067F0000400200008A590000060000-000000067F0000400200008A590000064000__00000073AD3FE6B8 000000067F0000400200008A590000060000-000000067F0000400200008A590000064000__000000914E3F38F0 000000067F0000400200008A590000060000-000000067F0000400200008A590000064000__000000931B33AE68 000000067F0000400200008A590000060000-000000067F0000400200008A590000064000__000000931B9AFDF8 000000067F0000400200008A590000064000-000000067F0000400200008A590000068000__0000001C760FA190 000000067F0000400200008A590000064000-000000067F0000400200008A590000068000__00000038E67ABFA0 000000067F0000400200008A590000064000-000000067F0000400200008A590000068000__0000003903F1CFE8 000000067F0000400200008A590000064000-000000067F0000400200008A590000068000__0000003B99F7F8A0 000000067F0000400200008A590000064000-000000067F0000400200008A590000068000__0000005D2FFFFB38 000000067F0000400200008A590000064000-000000067F0000400200008A590000068000__00000073AD3FE6B8 000000067F0000400200008A590000064000-000000067F0000400200008A590000068000__000000914E3F38F0 000000067F0000400200008A590000064000-000000067F0000400200008A590000068000__000000931B33AE68 000000067F0000400200008A590000064000-000000067F0000400200008A590000068000__000000931B9AFDF8 000000067F0000400200008A590000066FFD-000000067F0000400200008A59000006F9CB__000000007119E789-0000000120C1DDF9 000000067F0000400200008A590000068000-000000067F0000400200008A59000006C000__0000001C760FA190 000000067F0000400200008A590000068000-000000067F0000400200008A59000006C000__00000038E67ABFA0 000000067F0000400200008A590000068000-000000067F0000400200008A59000006C000__0000003903F1CFE8 000000067F0000400200008A590000068000-000000067F0000400200008A59000006C000__0000003B99F7F8A0 000000067F0000400200008A590000068000-000000067F0000400200008A59000006C000__0000005D2FFFFB38 000000067F0000400200008A590000068000-000000067F0000400200008A59000006C000__00000073AD3FE6B8 000000067F0000400200008A590000068000-000000067F0000400200008A59000006C000__000000914E3F38F0 
000000067F0000400200008A590000068000-000000067F0000400200008A59000006C000__000000931B33AE68 000000067F0000400200008A590000068000-000000067F0000400200008A59000006C000__000000931B9AFDF8 000000067F0000400200008A59000006C000-000000067F0000400200008A590000070000__0000001C760FA190 000000067F0000400200008A59000006C000-000000067F0000400200008A590000070000__00000038E67ABFA0 000000067F0000400200008A59000006C000-000000067F0000400200008A590000070000__0000003903F1CFE8 000000067F0000400200008A59000006C000-000000067F0000400200008A590000070000__0000003B99F7F8A0 000000067F0000400200008A59000006C000-000000067F0000400200008A590000070000__0000005D2FFFFB38 000000067F0000400200008A59000006C000-000000067F0000400200008A590000070000__00000073AD3FE6B8 000000067F0000400200008A59000006C000-000000067F0000400200008A590000070000__000000914E3F38F0 000000067F0000400200008A59000006C000-000000067F0000400200008A590000070000__000000931B33AE68 000000067F0000400200008A59000006C000-000000067F0000400200008A590000070000__000000931B9AFDF8 000000067F0000400200008A59000006F9CB-000000067F0000400200008A590000078388__000000007119E789-0000000120C1DDF9 000000067F0000400200008A590000070000-000000067F0000400200008A590000074000__0000001C760FA190 000000067F0000400200008A590000070000-000000067F0000400200008A590000074000__00000038E67ABFA0 000000067F0000400200008A590000070000-000000067F0000400200008A590000074000__0000003903F1CFE8 000000067F0000400200008A590000070000-000000067F0000400200008A590000074000__0000003B99F7F8A0 000000067F0000400200008A590000070000-000000067F0000400200008A590000074000__0000005D2FFFFB38 000000067F0000400200008A590000070000-000000067F0000400200008A590000074000__00000073AD3FE6B8 000000067F0000400200008A590000070000-000000067F0000400200008A590000074000__000000914E3F38F0 000000067F0000400200008A590000070000-000000067F0000400200008A590000074000__000000931B33AE68 000000067F0000400200008A590000070000-000000067F0000400200008A590000074000__000000931B9AFDF8 
000000067F0000400200008A590000074000-000000067F0000400200008A590000078000__0000001C760FA190 000000067F0000400200008A590000074000-000000067F0000400200008A590000078000__00000038E67ABFA0 000000067F0000400200008A590000074000-000000067F0000400200008A590000078000__0000003903F1CFE8 000000067F0000400200008A590000074000-000000067F0000400200008A590000078000__0000003B99F7F8A0 000000067F0000400200008A590000074000-000000067F0000400200008A590000078000__0000005D2FFFFB38 000000067F0000400200008A590000074000-000000067F0000400200008A590000078000__00000073AD3FE6B8 000000067F0000400200008A590000074000-000000067F0000400200008A590000078000__000000914E3F38F0 000000067F0000400200008A590000074000-000000067F0000400200008A590000078000__000000931B33AE68 000000067F0000400200008A590000074000-000000067F0000400200008A590000078000__000000931B9AFDF8 000000067F0000400200008A590000078000-000000067F0000400200008A59000007C000__0000001C760FA190 000000067F0000400200008A590000078000-000000067F0000400200008A59000007C000__00000038E67ABFA0 000000067F0000400200008A590000078000-000000067F0000400200008A59000007C000__0000003903F1CFE8 000000067F0000400200008A590000078000-000000067F0000400200008A59000007C000__0000003B99F7F8A0 000000067F0000400200008A590000078000-000000067F0000400200008A59000007C000__0000005D2FFFFB38 000000067F0000400200008A590000078000-000000067F0000400200008A59000007C000__00000073AD3FE6B8 000000067F0000400200008A590000078000-000000067F0000400200008A59000007C000__000000914E3F38F0 000000067F0000400200008A590000078000-000000067F0000400200008A59000007C000__000000931B33AE68 000000067F0000400200008A590000078000-000000067F0000400200008A59000007C000__000000931B9AFDF8 000000067F0000400200008A590000078388-000000067F0000400200008A590000080D43__000000007119E789-0000000120C1DDF9 000000067F0000400200008A59000007C000-000000067F0000400200008A590000080000__0000001C760FA190 000000067F0000400200008A59000007C000-000000067F0000400200008A590000080000__00000038E67ABFA0 
000000067F0000400200008A59000007C000-000000067F0000400200008A590000080000__0000003903F1CFE8 000000067F0000400200008A59000007C000-000000067F0000400200008A590000080000__0000003B99F7F8A0 000000067F0000400200008A59000007C000-000000067F0000400200008A590000080000__0000005D2FFFFB38 000000067F0000400200008A59000007C000-000000067F0000400200008A590000080000__00000073AD3FE6B8 000000067F0000400200008A59000007C000-000000067F0000400200008A590000080000__000000914E3F38F0 000000067F0000400200008A59000007C000-000000067F0000400200008A590000080000__000000931B33AE68 000000067F0000400200008A59000007C000-000000067F0000400200008A590000080000__000000931B9AFDF8 000000067F0000400200008A590000080000-000000067F0000400200008A590000084000__0000001C760FA190 000000067F0000400200008A590000080000-000000067F0000400200008A590000084000__00000038E67ABFA0 000000067F0000400200008A590000080000-000000067F0000400200008A590000084000__0000003903F1CFE8 000000067F0000400200008A590000080000-000000067F0000400200008A590000084000__0000003B99F7F8A0 000000067F0000400200008A590000080000-000000067F0000400200008A590000084000__0000005D2FFFFB38 000000067F0000400200008A590000080000-000000067F0000400200008A590000084000__00000073AD3FE6B8 000000067F0000400200008A590000080000-000000067F0000400200008A590000084000__000000914E3F38F0 000000067F0000400200008A590000080000-000000067F0000400200008A590000084000__000000931B33AE68 000000067F0000400200008A590000080000-000000067F0000400200008A590000084000__000000931B9AFDF8 000000067F0000400200008A590000080D43-000000067F0000400200008A590000089730__000000007119E789-0000000120C1DDF9 000000067F0000400200008A590000084000-000000067F0000400200008A590000088000__0000001C760FA190 000000067F0000400200008A590000084000-000000067F0000400200008A590000088000__00000038E67ABFA0 000000067F0000400200008A590000084000-000000067F0000400200008A590000088000__0000003903F1CFE8 000000067F0000400200008A590000084000-000000067F0000400200008A590000088000__0000003B99F7F8A0 
000000067F0000400200008A590000084000-000000067F0000400200008A590000088000__0000005D2FFFFB38 000000067F0000400200008A590000084000-000000067F0000400200008A590000088000__00000073AD3FE6B8 000000067F0000400200008A590000084000-000000067F0000400200008A590000088000__000000914E3F38F0 000000067F0000400200008A590000084000-000000067F0000400200008A590000088000__000000931B33AE68 000000067F0000400200008A590000084000-000000067F0000400200008A590000088000__000000931B9AFDF8 000000067F0000400200008A590000088000-000000067F0000400200008A59000008C000__0000001C760FA190 000000067F0000400200008A590000088000-000000067F0000400200008A59000008C000__00000038E67ABFA0 000000067F0000400200008A590000088000-000000067F0000400200008A59000008C000__0000003903F1CFE8 000000067F0000400200008A590000088000-000000067F0000400200008A59000008C000__0000003B99F7F8A0 000000067F0000400200008A590000088000-000000067F0000400200008A59000008C000__0000005D2FFFFB38 000000067F0000400200008A590000088000-000000067F0000400200008A59000008C000__00000073AD3FE6B8 000000067F0000400200008A590000088000-000000067F0000400200008A59000008C000__000000914E3F38F0 000000067F0000400200008A590000088000-000000067F0000400200008A59000008C000__000000931B33AE68 000000067F0000400200008A590000088000-000000067F0000400200008A59000008C000__000000931B9AFDF8 000000067F0000400200008A590000089730-000000067F0000400200008A590000092129__000000007119E789-0000000120C1DDF9 000000067F0000400200008A59000008C000-000000067F0000400200008A590000090000__0000001C760FA190 000000067F0000400200008A59000008C000-000000067F0000400200008A590000090000__00000038E67ABFA0 000000067F0000400200008A59000008C000-000000067F0000400200008A590000090000__0000003903F1CFE8 000000067F0000400200008A59000008C000-000000067F0000400200008A590000090000__0000003B99F7F8A0 000000067F0000400200008A59000008C000-000000067F0000400200008A590000090000__0000005D2FFFFB38 000000067F0000400200008A59000008C000-000000067F0000400200008A590000090000__00000073AD3FE6B8 
000000067F0000400200008A59000008C000-000000067F0000400200008A590000090000__000000914E3F38F0 000000067F0000400200008A59000008C000-000000067F0000400200008A590000090000__000000931B33AE68 000000067F0000400200008A59000008C000-000000067F0000400200008A590000090000__000000931B9AFDF8 000000067F0000400200008A590000090000-000000067F0000400200008A590000094000__0000001C760FA190 000000067F0000400200008A590000090000-000000067F0000400200008A590000094000__00000038E67ABFA0 000000067F0000400200008A590000090000-000000067F0000400200008A590000094000__0000003903F1CFE8 000000067F0000400200008A590000090000-000000067F0000400200008A590000094000__0000003B99F7F8A0 000000067F0000400200008A590000090000-000000067F0000400200008A590000094000__0000005D2FFFFB38 000000067F0000400200008A590000090000-000000067F0000400200008A590000094000__00000073AD3FE6B8 000000067F0000400200008A590000090000-000000067F0000400200008A590000094000__000000914E3F38F0 000000067F0000400200008A590000090000-000000067F0000400200008A590000094000__000000931B33AE68 000000067F0000400200008A590000090000-000000067F0000400200008A590000094000__000000931B9AFDF8 000000067F0000400200008A590000092129-000000067F0000400200008A59000009AB12__000000007119E789-0000000120C1DDF9 000000067F0000400200008A590000094000-000000067F0000400200008A590000098000__0000001C760FA190 000000067F0000400200008A590000094000-000000067F0000400200008A590000098000__00000038E67ABFA0 000000067F0000400200008A590000094000-000000067F0000400200008A590000098000__0000003903F1CFE8 000000067F0000400200008A590000094000-000000067F0000400200008A590000098000__0000003B99F7F8A0 000000067F0000400200008A590000094000-000000067F0000400200008A590000098000__0000005D2FFFFB38 000000067F0000400200008A590000094000-000000067F0000400200008A590000098000__00000073AD3FE6B8 000000067F0000400200008A590000094000-000000067F0000400200008A590000098000__000000914E3F38F0 000000067F0000400200008A590000094000-000000067F0000400200008A590000098000__000000931B33AE68 
000000067F0000400200008A590000094000-000000067F0000400200008A590000098000__000000931B9AFDF8 000000067F0000400200008A590000098000-000000067F0000400200008A59000009C000__0000001C725A2400 000000067F0000400200008A590000098000-000000067F0000400200008A59000009C000__0000001C760FA190 000000067F0000400200008A590000098000-000000067F0000400200008A59000009C000__00000038E67ABFA0 000000067F0000400200008A590000098000-000000067F0000400200008A59000009C000__0000003903F1CFE8 000000067F0000400200008A590000098000-000000067F0000400200008A59000009C000__0000003B99F7F8A0 000000067F0000400200008A590000098000-000000067F0000400200008A59000009C000__0000005D2FFFFB38 000000067F0000400200008A590000098000-000000067F0000400200008A59000009C000__00000073AD3FE6B8 000000067F0000400200008A590000098000-000000067F0000400200008A59000009C000__000000914E3F38F0 000000067F0000400200008A590000098000-000000067F0000400200008A59000009C000__000000931B33AE68 000000067F0000400200008A590000098000-000000067F0000400200008A59000009C000__000000931B9AFDF8 000000067F0000400200008A59000009AB12-000000067F0000400200008A590100000000__000000007119E789-0000000120C1DDF9 000000067F0000400200008A59000009AE54-000000067F0000400200008A5900000A3836__0000000120C1DDF9-00000001C071E001 000000067F0000400200008A59000009C000-000000067F0000400200008A5900000A0000__0000001C725A2400 000000067F0000400200008A59000009C000-000000067F0000400200008A5900000A0000__0000001C760FA190 000000067F0000400200008A59000009C000-000000067F0000400200008A5900000A0000__00000038E67ABFA0 000000067F0000400200008A59000009C000-000000067F0000400200008A5900000A0000__0000003903F1CFE8 000000067F0000400200008A59000009C000-000000067F0000400200008A5900000A0000__0000003B99F7F8A0 000000067F0000400200008A59000009C000-000000067F0000400200008A5900000A0000__0000005D2FFFFB38 000000067F0000400200008A59000009C000-000000067F0000400200008A5900000A0000__00000073AD3FE6B8 000000067F0000400200008A59000009C000-000000067F0000400200008A5900000A0000__000000914E3F38F0 
000000067F0000400200008A59000009C000-000000067F0000400200008A5900000A0000__000000931B33AE68 000000067F0000400200008A59000009C000-000000067F0000400200008A5900000A0000__000000931B9AFDF8 000000067F0000400200008A5900000A0000-000000067F0000400200008A5900000A4000__0000001C725A2400 000000067F0000400200008A5900000A0000-000000067F0000400200008A5900000A4000__0000001C760FA190 000000067F0000400200008A5900000A0000-000000067F0000400200008A5900000A4000__00000038E67ABFA0 000000067F0000400200008A5900000A0000-000000067F0000400200008A5900000A4000__0000003903F1CFE8 000000067F0000400200008A5900000A0000-000000067F0000400200008A5900000A4000__0000003B99F7F8A0 000000067F0000400200008A5900000A0000-000000067F0000400200008A5900000A4000__0000005D2FFFFB38 000000067F0000400200008A5900000A0000-000000067F0000400200008A5900000A4000__00000073AD3FE6B8 000000067F0000400200008A5900000A0000-000000067F0000400200008A5900000A4000__000000914E3F38F0 000000067F0000400200008A5900000A0000-000000067F0000400200008A5900000A4000__000000931B33AE68 000000067F0000400200008A5900000A0000-000000067F0000400200008A5900000A4000__000000931B9AFDF8 000000067F0000400200008A5900000A3836-000000067F0000400200008A5900000AC1F4__0000000120C1DDF9-00000001C071E001 000000067F0000400200008A5900000A4000-000000067F0000400200008A5900000A8000__0000001C725A2400 000000067F0000400200008A5900000A4000-000000067F0000400200008A5900000A8000__0000001C760FA190 000000067F0000400200008A5900000A4000-000000067F0000400200008A5900000A8000__00000038E67ABFA0 000000067F0000400200008A5900000A4000-000000067F0000400200008A5900000A8000__0000003903F1CFE8 000000067F0000400200008A5900000A4000-000000067F0000400200008A5900000A8000__0000003B99F7F8A0 000000067F0000400200008A5900000A4000-000000067F0000400200008A5900000A8000__0000005D2FFFFB38 000000067F0000400200008A5900000A4000-000000067F0000400200008A5900000A8000__00000073AD3FE6B8 000000067F0000400200008A5900000A4000-000000067F0000400200008A5900000A8000__000000914E3F38F0 
000000067F0000400200008A5900000A4000-000000067F0000400200008A5900000A8000__000000931B33AE68 000000067F0000400200008A5900000A4000-000000067F0000400200008A5900000A8000__000000931B9AFDF8 000000067F0000400200008A5900000A8000-000000067F0000400200008A5900000AC000__0000001C725A2400 000000067F0000400200008A5900000A8000-000000067F0000400200008A5900000AC000__0000001C760FA190 000000067F0000400200008A5900000A8000-000000067F0000400200008A5900000AC000__00000038E67ABFA0 000000067F0000400200008A5900000A8000-000000067F0000400200008A5900000AC000__0000003903F1CFE8 000000067F0000400200008A5900000A8000-000000067F0000400200008A5900000AC000__0000003B99F7F8A0 000000067F0000400200008A5900000A8000-000000067F0000400200008A5900000AC000__0000005D2FFFFB38 000000067F0000400200008A5900000A8000-000000067F0000400200008A5900000AC000__00000073AD3FE6B8 000000067F0000400200008A5900000A8000-000000067F0000400200008A5900000AC000__000000914E3F38F0 000000067F0000400200008A5900000A8000-000000067F0000400200008A5900000AC000__000000931B33AE68 000000067F0000400200008A5900000A8000-000000067F0000400200008A5900000AC000__000000931B9AFDF8 000000067F0000400200008A5900000AC000-000000067F0000400200008A5900000B0000__0000001C725A2400 000000067F0000400200008A5900000AC000-000000067F0000400200008A5900000B0000__0000001C760FA190 000000067F0000400200008A5900000AC000-000000067F0000400200008A5900000B0000__00000038E67ABFA0 000000067F0000400200008A5900000AC000-000000067F0000400200008A5900000B0000__0000003903F1CFE8 000000067F0000400200008A5900000AC000-000000067F0000400200008A5900000B0000__0000003B99F7F8A0 000000067F0000400200008A5900000AC000-000000067F0000400200008A5900000B0000__0000005D2FFFFB38 000000067F0000400200008A5900000AC000-000000067F0000400200008A5900000B0000__00000073AD3FE6B8 000000067F0000400200008A5900000AC000-000000067F0000400200008A5900000B0000__000000914E3F38F0 000000067F0000400200008A5900000AC000-000000067F0000400200008A5900000B0000__000000931B33AE68 
000000067F0000400200008A5900000AC000-000000067F0000400200008A5900000B0000__000000931B9AFDF8 000000067F0000400200008A5900000AC1F4-000000067F0000400200008A5900000B4BC0__0000000120C1DDF9-00000001C071E001 000000067F0000400200008A5900000B0000-000000067F0000400200008A5900000B4000__0000001C725A2400 000000067F0000400200008A5900000B0000-000000067F0000400200008A5900000B4000__0000001C760FA190 000000067F0000400200008A5900000B0000-000000067F0000400200008A5900000B4000__00000038E67ABFA0 000000067F0000400200008A5900000B0000-000000067F0000400200008A5900000B4000__0000003903F1CFE8 000000067F0000400200008A5900000B0000-000000067F0000400200008A5900000B4000__0000003B99F7F8A0 000000067F0000400200008A5900000B0000-000000067F0000400200008A5900000B4000__0000005D2FFFFB38 000000067F0000400200008A5900000B0000-000000067F0000400200008A5900000B4000__00000073AD3FE6B8 000000067F0000400200008A5900000B0000-000000067F0000400200008A5900000B4000__000000914E3F38F0 000000067F0000400200008A5900000B0000-000000067F0000400200008A5900000B4000__000000931B33AE68 000000067F0000400200008A5900000B0000-000000067F0000400200008A5900000B4000__000000931B9AFDF8 000000067F0000400200008A5900000B4000-000000067F0000400200008A5900000B8000__0000001C725A2400 000000067F0000400200008A5900000B4000-000000067F0000400200008A5900000B8000__0000001C760FA190 000000067F0000400200008A5900000B4000-000000067F0000400200008A5900000B8000__00000038E67ABFA0 000000067F0000400200008A5900000B4000-000000067F0000400200008A5900000B8000__0000003903F1CFE8 000000067F0000400200008A5900000B4000-000000067F0000400200008A5900000B8000__0000003B99F7F8A0 000000067F0000400200008A5900000B4000-000000067F0000400200008A5900000B8000__0000005D2FFFFB38 000000067F0000400200008A5900000B4000-000000067F0000400200008A5900000B8000__00000073AD3FE6B8 000000067F0000400200008A5900000B4000-000000067F0000400200008A5900000B8000__000000914E3F38F0 000000067F0000400200008A5900000B4000-000000067F0000400200008A5900000B8000__000000931B33AE68 
000000067F0000400200008A5900000B4000-000000067F0000400200008A5900000B8000__000000931B9AFDF8 000000067F0000400200008A5900000B4BC0-000000067F0000400200008A5900000BD58B__0000000120C1DDF9-00000001C071E001 000000067F0000400200008A5900000B8000-000000067F0000400200008A5900000BC000__0000001C725A2400 000000067F0000400200008A5900000B8000-000000067F0000400200008A5900000BC000__0000001C760FA190 000000067F0000400200008A5900000B8000-000000067F0000400200008A5900000BC000__00000038E67ABFA0 000000067F0000400200008A5900000B8000-000000067F0000400200008A5900000BC000__0000003903F1CFE8 000000067F0000400200008A5900000B8000-000000067F0000400200008A5900000BC000__0000003B99F7F8A0 000000067F0000400200008A5900000B8000-000000067F0000400200008A5900000BC000__0000005D2FFFFB38 000000067F0000400200008A5900000B8000-000000067F0000400200008A5900000BC000__00000073AD3FE6B8 000000067F0000400200008A5900000B8000-000000067F0000400200008A5900000BC000__000000914E3F38F0 000000067F0000400200008A5900000B8000-000000067F0000400200008A5900000BC000__000000931B33AE68 000000067F0000400200008A5900000B8000-000000067F0000400200008A5900000BC000__000000931B9AFDF8 000000067F0000400200008A5900000BC000-000000067F0000400200008A5900000C0000__0000001C725A2400 000000067F0000400200008A5900000BC000-000000067F0000400200008A5900000C0000__0000001C760FA190 000000067F0000400200008A5900000BC000-000000067F0000400200008A5900000C0000__00000038E67ABFA0 000000067F0000400200008A5900000BC000-000000067F0000400200008A5900000C0000__0000003903F1CFE8 000000067F0000400200008A5900000BC000-000000067F0000400200008A5900000C0000__0000003B99F7F8A0 000000067F0000400200008A5900000BC000-000000067F0000400200008A5900000C0000__0000005D2FFFFB38 000000067F0000400200008A5900000BC000-000000067F0000400200008A5900000C0000__00000073AD3FE6B8 000000067F0000400200008A5900000BC000-000000067F0000400200008A5900000C0000__000000914E3F38F0 000000067F0000400200008A5900000BC000-000000067F0000400200008A5900000C0000__000000931B33AE68 
000000067F0000400200008A5900000BC000-000000067F0000400200008A5900000C0000__000000931B9AFDF8 000000067F0000400200008A5900000BD58B-000000067F0000400200008A5900000C5F89__0000000120C1DDF9-00000001C071E001 000000067F0000400200008A5900000C0000-000000067F0000400200008A5900000C4000__0000001C725A2400 000000067F0000400200008A5900000C0000-000000067F0000400200008A5900000C4000__0000001C760FA190 000000067F0000400200008A5900000C0000-000000067F0000400200008A5900000C4000__00000038E67ABFA0 000000067F0000400200008A5900000C0000-000000067F0000400200008A5900000C4000__0000003903F1CFE8 000000067F0000400200008A5900000C0000-000000067F0000400200008A5900000C4000__0000003B99F7F8A0 000000067F0000400200008A5900000C0000-000000067F0000400200008A5900000C4000__0000005D2FFFFB38 000000067F0000400200008A5900000C0000-000000067F0000400200008A5900000C4000__00000073AD3FE6B8 000000067F0000400200008A5900000C0000-000000067F0000400200008A5900000C4000__000000914E3F38F0 000000067F0000400200008A5900000C0000-000000067F0000400200008A5900000C4000__000000931B33AE68 000000067F0000400200008A5900000C0000-000000067F0000400200008A5900000C4000__000000931B9AFDF8 000000067F0000400200008A5900000C4000-000000067F0000400200008A5900000C8000__0000001C725A2400 000000067F0000400200008A5900000C4000-000000067F0000400200008A5900000C8000__0000001C760FA190 000000067F0000400200008A5900000C4000-000000067F0000400200008A5900000C8000__00000038E67ABFA0 000000067F0000400200008A5900000C4000-000000067F0000400200008A5900000C8000__0000003903F1CFE8 000000067F0000400200008A5900000C4000-000000067F0000400200008A5900000C8000__0000003B99F7F8A0 000000067F0000400200008A5900000C4000-000000067F0000400200008A5900000C8000__0000005D2FFFFB38 000000067F0000400200008A5900000C4000-000000067F0000400200008A5900000C8000__00000073AD3FE6B8 000000067F0000400200008A5900000C4000-000000067F0000400200008A5900000C8000__000000914E3F38F0 000000067F0000400200008A5900000C4000-000000067F0000400200008A5900000C8000__000000931B33AE68 
000000067F0000400200008A5900000C4000-000000067F0000400200008A5900000C8000__000000931B9AFDF8 000000067F0000400200008A5900000C5F89-000000067F0000400200008A5900000CE983__0000000120C1DDF9-00000001C071E001 000000067F0000400200008A5900000C8000-000000067F0000400200008A5900000CC000__0000001C725A2400 000000067F0000400200008A5900000C8000-000000067F0000400200008A5900000CC000__0000001C760FA190 000000067F0000400200008A5900000C8000-000000067F0000400200008A5900000CC000__00000038E67ABFA0 000000067F0000400200008A5900000C8000-000000067F0000400200008A5900000CC000__0000003903F1CFE8 000000067F0000400200008A5900000C8000-000000067F0000400200008A5900000CC000__0000003B99F7F8A0 000000067F0000400200008A5900000C8000-000000067F0000400200008A5900000CC000__0000005D2FFFFB38 000000067F0000400200008A5900000C8000-000000067F0000400200008A5900000CC000__00000073AD3FE6B8 000000067F0000400200008A5900000C8000-000000067F0000400200008A5900000CC000__000000914E3F38F0 000000067F0000400200008A5900000C8000-000000067F0000400200008A5900000CC000__000000931B33AE68 000000067F0000400200008A5900000C8000-000000067F0000400200008A5900000CC000__000000931B9AFDF8 000000067F0000400200008A5900000CC000-000000067F0000400200008A5900000D0000__0000001C725A2400 000000067F0000400200008A5900000CC000-000000067F0000400200008A5900000D0000__0000001C760FA190 000000067F0000400200008A5900000CC000-000000067F0000400200008A5900000D0000__00000038E67ABFA0 000000067F0000400200008A5900000CC000-000000067F0000400200008A5900000D0000__0000003903F1CFE8 000000067F0000400200008A5900000CC000-000000067F0000400200008A5900000D0000__0000003B99F7F8A0 000000067F0000400200008A5900000CC000-000000067F0000400200008A5900000D0000__0000005D2FFFFB38 000000067F0000400200008A5900000CC000-000000067F0000400200008A5900000D0000__00000073AD3FE6B8 000000067F0000400200008A5900000CC000-000000067F0000400200008A5900000D0000__000000914E3F38F0 000000067F0000400200008A5900000CC000-000000067F0000400200008A5900000D0000__000000931B33AE68 
000000067F0000400200008A5900000CC000-000000067F0000400200008A5900000D0000__000000931B9AFDF8 000000067F0000400200008A5900000CE983-000000067F0000400200008A5900000D736F__0000000120C1DDF9-00000001C071E001 000000067F0000400200008A5900000D0000-000000067F0000400200008A5900000D4000__0000001C725A2400 000000067F0000400200008A5900000D0000-000000067F0000400200008A5900000D4000__0000001C760FA190 000000067F0000400200008A5900000D0000-000000067F0000400200008A5900000D4000__00000038E67ABFA0 000000067F0000400200008A5900000D0000-000000067F0000400200008A5900000D4000__0000003903F1CFE8 000000067F0000400200008A5900000D0000-000000067F0000400200008A5900000D4000__0000003B99F7F8A0 000000067F0000400200008A5900000D0000-000000067F0000400200008A5900000D4000__0000005D2FFFFB38 000000067F0000400200008A5900000D0000-000000067F0000400200008A5900000D4000__00000073AD3FE6B8 000000067F0000400200008A5900000D0000-000000067F0000400200008A5900000D4000__000000914E3F38F0 000000067F0000400200008A5900000D0000-000000067F0000400200008A5900000D4000__000000931B33AE68 000000067F0000400200008A5900000D0000-000000067F0000400200008A5900000D4000__000000931B9AFDF8 000000067F0000400200008A5900000D4000-000000067F0000400200008A5900000D8000__0000001C725A2400 000000067F0000400200008A5900000D4000-000000067F0000400200008A5900000D8000__0000001C760FA190 000000067F0000400200008A5900000D4000-000000067F0000400200008A5900000D8000__00000038E67ABFA0 000000067F0000400200008A5900000D4000-000000067F0000400200008A5900000D8000__0000003903F1CFE8 000000067F0000400200008A5900000D4000-000000067F0000400200008A5900000D8000__0000003B99F7F8A0 000000067F0000400200008A5900000D4000-000000067F0000400200008A5900000D8000__0000005D2FFFFB38 000000067F0000400200008A5900000D4000-000000067F0000400200008A5900000D8000__00000073AD3FE6B8 000000067F0000400200008A5900000D4000-000000067F0000400200008A5900000D8000__000000914E3F38F0 000000067F0000400200008A5900000D4000-000000067F0000400200008A5900000D8000__000000931B33AE68 
000000067F0000400200008A5900000D4000-000000067F0000400200008A5900000D8000__000000931B9AFDF8 000000067F0000400200008A5900000D736F-000000067F0000400200008A5900000DFD47__0000000120C1DDF9-00000001C071E001 000000067F0000400200008A5900000D8000-000000067F0000400200008A5900000DC000__0000001C725A2400 000000067F0000400200008A5900000D8000-000000067F0000400200008A5900000DC000__0000001C760FA190 000000067F0000400200008A5900000D8000-000000067F0000400200008A5900000DC000__00000038E67ABFA0 000000067F0000400200008A5900000D8000-000000067F0000400200008A5900000DC000__0000003903F1CFE8 000000067F0000400200008A5900000D8000-000000067F0000400200008A5900000DC000__0000003B99F7F8A0 000000067F0000400200008A5900000D8000-000000067F0000400200008A5900000DC000__0000005D2FFFFB38 000000067F0000400200008A5900000D8000-000000067F0000400200008A5900000DC000__00000073AD3FE6B8 000000067F0000400200008A5900000D8000-000000067F0000400200008A5900000DC000__000000914E3F38F0 000000067F0000400200008A5900000D8000-000000067F0000400200008A5900000DC000__000000931B33AE68 000000067F0000400200008A5900000D8000-000000067F0000400200008A5900000DC000__000000931B9AFDF8 000000067F0000400200008A5900000DC000-000000067F0000400200008A5900000E0000__0000001C725A2400 000000067F0000400200008A5900000DC000-000000067F0000400200008A5900000E0000__0000001C760FA190 000000067F0000400200008A5900000DC000-000000067F0000400200008A5900000E0000__00000038E67ABFA0 000000067F0000400200008A5900000DC000-000000067F0000400200008A5900000E0000__0000003903F1CFE8 000000067F0000400200008A5900000DC000-000000067F0000400200008A5900000E0000__0000003B99F7F8A0 000000067F0000400200008A5900000DC000-000000067F0000400200008A5900000E0000__0000005D2FFFFB38 000000067F0000400200008A5900000DC000-000000067F0000400200008A5900000E0000__00000073AD3FE6B8 000000067F0000400200008A5900000DC000-000000067F0000400200008A5900000E0000__000000914E3F38F0 000000067F0000400200008A5900000DC000-000000067F0000400200008A5900000E0000__000000931B33AE68 
000000067F0000400200008A5900000DC000-000000067F0000400200008A5900000E0000__000000931B9AFDF8 000000067F0000400200008A5900000DFD47-000000067F0000400200008A5900000E870D__0000000120C1DDF9-00000001C071E001 000000067F0000400200008A5900000E0000-000000067F0000400200008A5900000E4000__0000001C725A2400 000000067F0000400200008A5900000E0000-000000067F0000400200008A5900000E4000__0000001C760FA190 000000067F0000400200008A5900000E0000-000000067F0000400200008A5900000E4000__00000038E67ABFA0 000000067F0000400200008A5900000E0000-000000067F0000400200008A5900000E4000__0000003903F1CFE8 000000067F0000400200008A5900000E0000-000000067F0000400200008A5900000E4000__0000003B99F7F8A0 000000067F0000400200008A5900000E0000-000000067F0000400200008A5900000E4000__0000005D2FFFFB38 000000067F0000400200008A5900000E0000-000000067F0000400200008A5900000E4000__00000073AD3FE6B8 000000067F0000400200008A5900000E0000-000000067F0000400200008A5900000E4000__000000914E3F38F0 000000067F0000400200008A5900000E0000-000000067F0000400200008A5900000E4000__000000931B33AE68 000000067F0000400200008A5900000E0000-000000067F0000400200008A5900000E4000__000000931B9AFDF8 000000067F0000400200008A5900000E4000-000000067F0000400200008A5900000E8000__0000001C725A2400 000000067F0000400200008A5900000E4000-000000067F0000400200008A5900000E8000__0000001C760FA190 000000067F0000400200008A5900000E4000-000000067F0000400200008A5900000E8000__00000038E67ABFA0 000000067F0000400200008A5900000E4000-000000067F0000400200008A5900000E8000__0000003903F1CFE8 000000067F0000400200008A5900000E4000-000000067F0000400200008A5900000E8000__0000003B99F7F8A0 000000067F0000400200008A5900000E4000-000000067F0000400200008A5900000E8000__0000005D2FFFFB38 000000067F0000400200008A5900000E4000-000000067F0000400200008A5900000E8000__00000073AD3FE6B8 000000067F0000400200008A5900000E4000-000000067F0000400200008A5900000E8000__000000914E3F38F0 000000067F0000400200008A5900000E4000-000000067F0000400200008A5900000E8000__000000931B33AE68 
000000067F0000400200008A5900000E4000-000000067F0000400200008A5900000E8000__000000931B9AFDF8 000000067F0000400200008A5900000E8000-000000067F0000400200008A5900000EC000__0000001C725A2400 000000067F0000400200008A5900000E8000-000000067F0000400200008A5900000EC000__0000001C760FA190 000000067F0000400200008A5900000E8000-000000067F0000400200008A5900000EC000__00000038E67ABFA0 000000067F0000400200008A5900000E8000-000000067F0000400200008A5900000EC000__0000003903F1CFE8 000000067F0000400200008A5900000E8000-000000067F0000400200008A5900000EC000__0000003B99F7F8A0 000000067F0000400200008A5900000E8000-000000067F0000400200008A5900000EC000__0000005D2FFFFB38 000000067F0000400200008A5900000E8000-000000067F0000400200008A5900000EC000__00000073AD3FE6B8 000000067F0000400200008A5900000E8000-000000067F0000400200008A5900000EC000__000000914E3F38F0 000000067F0000400200008A5900000E8000-000000067F0000400200008A5900000EC000__000000931B33AE68 000000067F0000400200008A5900000E8000-000000067F0000400200008A5900000EC000__000000931B9AFDF8 000000067F0000400200008A5900000E870D-000000067F0000400200008A5900000F10C9__0000000120C1DDF9-00000001C071E001 000000067F0000400200008A5900000EC000-000000067F0000400200008A5900000F0000__0000001C725A2400 000000067F0000400200008A5900000EC000-000000067F0000400200008A5900000F0000__0000001C760FA190 000000067F0000400200008A5900000EC000-000000067F0000400200008A5900000F0000__00000038E67ABFA0 000000067F0000400200008A5900000EC000-000000067F0000400200008A5900000F0000__0000003903F1CFE8 000000067F0000400200008A5900000EC000-000000067F0000400200008A5900000F0000__0000003B99F7F8A0 000000067F0000400200008A5900000EC000-000000067F0000400200008A5900000F0000__0000005D2FFFFB38 000000067F0000400200008A5900000EC000-000000067F0000400200008A5900000F0000__00000073AD3FE6B8 000000067F0000400200008A5900000EC000-000000067F0000400200008A5900000F0000__000000914E3F38F0 000000067F0000400200008A5900000EC000-000000067F0000400200008A5900000F0000__000000931B33AE68 
000000067F0000400200008A5900000EC000-000000067F0000400200008A5900000F0000__000000931B9AFDF8 000000067F0000400200008A5900000F0000-000000067F0000400200008A5900000F4000__000000028BBFFDB8 000000067F0000400200008A5900000F0000-000000067F0000400200008A5900000F4000__0000001C760FA190 000000067F0000400200008A5900000F0000-000000067F0000400200008A5900000F4000__00000038E67ABFA0 000000067F0000400200008A5900000F0000-000000067F0000400200008A5900000F4000__0000003903F1CFE8 000000067F0000400200008A5900000F0000-000000067F0000400200008A5900000F4000__0000003B99F7F8A0 000000067F0000400200008A5900000F0000-000000067F0000400200008A5900000F4000__0000005D2FFFFB38 000000067F0000400200008A5900000F0000-000000067F0000400200008A5900000F4000__00000073AD3FE6B8 000000067F0000400200008A5900000F0000-000000067F0000400200008A5900000F4000__000000914E3F38F0 000000067F0000400200008A5900000F0000-000000067F0000400200008A5900000F4000__000000931B33AE68 000000067F0000400200008A5900000F0000-000000067F0000400200008A5900000F4000__000000931B9AFDF8 000000067F0000400200008A5900000F10C9-000000067F0000400200008A590100000000__0000000120C1DDF9-00000001C071E001 000000067F0000400200008A5900000F13A6-000000067F0000400200008A5900000F9D70__00000001C071E001-000000027019FBC1 000000067F0000400200008A5900000F4000-000000067F0000400200008A5900000F8000__000000028BBFFDB8 000000067F0000400200008A5900000F4000-000000067F0000400200008A5900000F8000__0000001C760FA190 000000067F0000400200008A5900000F4000-000000067F0000400200008A5900000F8000__00000038E67ABFA0 000000067F0000400200008A5900000F4000-000000067F0000400200008A5900000F8000__0000003903F1CFE8 000000067F0000400200008A5900000F4000-000000067F0000400200008A5900000F8000__0000003B99F7F8A0 000000067F0000400200008A5900000F4000-000000067F0000400200008A5900000F8000__0000005D2FFFFB38 000000067F0000400200008A5900000F4000-000000067F0000400200008A5900000F8000__00000073AD3FE6B8 000000067F0000400200008A5900000F4000-000000067F0000400200008A5900000F8000__000000914E3F38F0 
000000067F0000400200008A5900000F4000-000000067F0000400200008A5900000F8000__000000931B33AE68 000000067F0000400200008A5900000F4000-000000067F0000400200008A5900000F8000__000000931B9AFDF8 000000067F0000400200008A5900000F8000-000000067F0000400200008A5900000FC000__000000028BBFFDB8 000000067F0000400200008A5900000F8000-000000067F0000400200008A5900000FC000__0000001C760FA190 000000067F0000400200008A5900000F8000-000000067F0000400200008A5900000FC000__00000038E67ABFA0 000000067F0000400200008A5900000F8000-000000067F0000400200008A5900000FC000__0000003903F1CFE8 000000067F0000400200008A5900000F8000-000000067F0000400200008A5900000FC000__0000003B99F7F8A0 000000067F0000400200008A5900000F8000-000000067F0000400200008A5900000FC000__0000005D2FFFFB38 000000067F0000400200008A5900000F8000-000000067F0000400200008A5900000FC000__00000073AD3FE6B8 000000067F0000400200008A5900000F8000-000000067F0000400200008A5900000FC000__000000914E3F38F0 000000067F0000400200008A5900000F8000-000000067F0000400200008A5900000FC000__000000931B33AE68 000000067F0000400200008A5900000F8000-000000067F0000400200008A5900000FC000__000000931B9AFDF8 000000067F0000400200008A5900000F9D70-000000067F0000400200008A59000010275E__00000001C071E001-000000027019FBC1 000000067F0000400200008A5900000FC000-000000067F0000400200008A590000100000__000000028BBFFDB8 000000067F0000400200008A5900000FC000-000000067F0000400200008A590000100000__0000001C760FA190 000000067F0000400200008A5900000FC000-000000067F0000400200008A590000100000__00000038E67ABFA0 000000067F0000400200008A5900000FC000-000000067F0000400200008A590000100000__0000003903F1CFE8 000000067F0000400200008A5900000FC000-000000067F0000400200008A590000100000__0000003B99F7F8A0 000000067F0000400200008A5900000FC000-000000067F0000400200008A590000100000__0000005D2FFFFB38 000000067F0000400200008A5900000FC000-000000067F0000400200008A590000100000__00000073AD3FE6B8 000000067F0000400200008A5900000FC000-000000067F0000400200008A590000100000__000000914E3F38F0 
000000067F0000400200008A5900000FC000-000000067F0000400200008A590000100000__000000931B33AE68 000000067F0000400200008A5900000FC000-000000067F0000400200008A590000100000__000000931B9AFDF8 000000067F0000400200008A590000100000-000000067F0000400200008A590000104000__000000028BBFFDB8 000000067F0000400200008A590000100000-000000067F0000400200008A590000104000__0000001C760FA190 000000067F0000400200008A590000100000-000000067F0000400200008A590000104000__00000038E67ABFA0 000000067F0000400200008A590000100000-000000067F0000400200008A590000104000__0000003903F1CFE8 000000067F0000400200008A590000100000-000000067F0000400200008A590000104000__0000003B99F7F8A0 000000067F0000400200008A590000100000-000000067F0000400200008A590000104000__0000005D2FFFFB38 000000067F0000400200008A590000100000-000000067F0000400200008A590000104000__00000073AD3FE6B8 000000067F0000400200008A590000100000-000000067F0000400200008A590000104000__000000914E3F38F0 000000067F0000400200008A590000100000-000000067F0000400200008A590000104000__000000931B33AE68 000000067F0000400200008A590000100000-000000067F0000400200008A590000104000__000000931B9AFDF8 000000067F0000400200008A59000010275E-000000067F0000400200008A59000010B151__00000001C071E001-000000027019FBC1 000000067F0000400200008A590000104000-000000067F0000400200008A590000108000__000000028BBFFDB8 000000067F0000400200008A590000104000-000000067F0000400200008A590000108000__0000001C760FA190 000000067F0000400200008A590000104000-000000067F0000400200008A590000108000__00000038E67ABFA0 000000067F0000400200008A590000104000-000000067F0000400200008A590000108000__0000003903F1CFE8 000000067F0000400200008A590000104000-000000067F0000400200008A590000108000__0000003B99F7F8A0 000000067F0000400200008A590000104000-000000067F0000400200008A590000108000__0000005D2FFFFB38 000000067F0000400200008A590000104000-000000067F0000400200008A590000108000__00000073AD3FE6B8 000000067F0000400200008A590000104000-000000067F0000400200008A590000108000__000000914E3F38F0 
000000067F0000400200008A590000104000-000000067F0000400200008A590000108000__000000931B33AE68 000000067F0000400200008A590000104000-000000067F0000400200008A590000108000__000000931B9AFDF8 000000067F0000400200008A590000108000-000000067F0000400200008A59000010C000__000000028BBFFDB8 000000067F0000400200008A590000108000-000000067F0000400200008A59000010C000__0000001C760FA190 000000067F0000400200008A590000108000-000000067F0000400200008A59000010C000__00000038E67ABFA0 000000067F0000400200008A590000108000-000000067F0000400200008A59000010C000__0000003903F1CFE8 000000067F0000400200008A590000108000-000000067F0000400200008A59000010C000__0000003B99F7F8A0 000000067F0000400200008A590000108000-000000067F0000400200008A59000010C000__0000005D2FFFFB38 000000067F0000400200008A590000108000-000000067F0000400200008A59000010C000__00000073AD3FE6B8 000000067F0000400200008A590000108000-000000067F0000400200008A59000010C000__000000914E3F38F0 000000067F0000400200008A590000108000-000000067F0000400200008A59000010C000__000000931B33AE68 000000067F0000400200008A590000108000-000000067F0000400200008A59000010C000__000000931B9AFDF8 000000067F0000400200008A59000010B151-000000067F0000400200008A590000113B39__00000001C071E001-000000027019FBC1 000000067F0000400200008A59000010C000-000000067F0000400200008A590000110000__000000028BBFFDB8 000000067F0000400200008A59000010C000-000000067F0000400200008A590000110000__0000001C760FA190 000000067F0000400200008A59000010C000-000000067F0000400200008A590000110000__00000038E67ABFA0 000000067F0000400200008A59000010C000-000000067F0000400200008A590000110000__0000003903F1CFE8 000000067F0000400200008A59000010C000-000000067F0000400200008A590000110000__0000003B99F7F8A0 000000067F0000400200008A59000010C000-000000067F0000400200008A590000110000__0000005D2FFFFB38 000000067F0000400200008A59000010C000-000000067F0000400200008A590000110000__00000073AD3FE6B8 000000067F0000400200008A59000010C000-000000067F0000400200008A590000110000__000000914E3F38F0 
000000067F0000400200008A59000010C000-000000067F0000400200008A590000110000__000000931B33AE68 000000067F0000400200008A59000010C000-000000067F0000400200008A590000110000__000000931B9AFDF8 000000067F0000400200008A590000110000-000000067F0000400200008A590000114000__000000028BBFFDB8 000000067F0000400200008A590000110000-000000067F0000400200008A590000114000__0000001C760FA190 000000067F0000400200008A590000110000-000000067F0000400200008A590000114000__00000038E67ABFA0 000000067F0000400200008A590000110000-000000067F0000400200008A590000114000__0000003903F1CFE8 000000067F0000400200008A590000110000-000000067F0000400200008A590000114000__0000003B99F7F8A0 000000067F0000400200008A590000110000-000000067F0000400200008A590000114000__0000005D2FFFFB38 000000067F0000400200008A590000110000-000000067F0000400200008A590000114000__00000073AD3FE6B8 000000067F0000400200008A590000110000-000000067F0000400200008A590000114000__000000914E3F38F0 000000067F0000400200008A590000110000-000000067F0000400200008A590000114000__000000931B33AE68 000000067F0000400200008A590000110000-000000067F0000400200008A590000114000__000000931B9AFDF8 000000067F0000400200008A590000113B39-000000067F0000400200008A59000011C515__00000001C071E001-000000027019FBC1 000000067F0000400200008A590000114000-000000067F0000400200008A590000118000__000000028BBFFDB8 000000067F0000400200008A590000114000-000000067F0000400200008A590000118000__0000001C760FA190 000000067F0000400200008A590000114000-000000067F0000400200008A590000118000__00000038E67ABFA0 000000067F0000400200008A590000114000-000000067F0000400200008A590000118000__0000003903F1CFE8 000000067F0000400200008A590000114000-000000067F0000400200008A590000118000__0000003B99F7F8A0 000000067F0000400200008A590000114000-000000067F0000400200008A590000118000__0000005D2FFFFB38 000000067F0000400200008A590000114000-000000067F0000400200008A590000118000__00000073AD3FE6B8 000000067F0000400200008A590000114000-000000067F0000400200008A590000118000__000000914E3F38F0 
000000067F0000400200008A590000114000-000000067F0000400200008A590000118000__000000931B33AE68 000000067F0000400200008A590000114000-000000067F0000400200008A590000118000__000000931B9AFDF8 000000067F0000400200008A590000118000-000000067F0000400200008A59000011C000__000000028BBFFDB8 000000067F0000400200008A590000118000-000000067F0000400200008A59000011C000__0000001C760FA190 000000067F0000400200008A590000118000-000000067F0000400200008A59000011C000__00000038E67ABFA0 000000067F0000400200008A590000118000-000000067F0000400200008A59000011C000__0000003903F1CFE8 000000067F0000400200008A590000118000-000000067F0000400200008A59000011C000__0000003B99F7F8A0 000000067F0000400200008A590000118000-000000067F0000400200008A59000011C000__0000005D2FFFFB38 000000067F0000400200008A590000118000-000000067F0000400200008A59000011C000__00000073AD3FE6B8 000000067F0000400200008A590000118000-000000067F0000400200008A59000011C000__000000914E3F38F0 000000067F0000400200008A590000118000-000000067F0000400200008A59000011C000__000000931B33AE68 000000067F0000400200008A590000118000-000000067F0000400200008A59000011C000__000000931B9AFDF8 000000067F0000400200008A59000011C000-000000067F0000400200008A590000120000__000000028BBFFDB8 000000067F0000400200008A59000011C000-000000067F0000400200008A590000120000__0000001C760FA190 000000067F0000400200008A59000011C000-000000067F0000400200008A590000120000__00000038E67ABFA0 000000067F0000400200008A59000011C000-000000067F0000400200008A590000120000__0000003903F1CFE8 000000067F0000400200008A59000011C000-000000067F0000400200008A590000120000__0000003B99F7F8A0 000000067F0000400200008A59000011C000-000000067F0000400200008A590000120000__0000005D2FFFFB38 000000067F0000400200008A59000011C000-000000067F0000400200008A590000120000__00000073AD3FE6B8 000000067F0000400200008A59000011C000-000000067F0000400200008A590000120000__000000914E3F38F0 000000067F0000400200008A59000011C000-000000067F0000400200008A590000120000__000000931B33AE68 
000000067F0000400200008A59000011C000-000000067F0000400200008A590000120000__000000931B9AFDF8 000000067F0000400200008A59000011C515-000000067F0000400200008A590000124EDB__00000001C071E001-000000027019FBC1 000000067F0000400200008A590000120000-000000067F0000400200008A590000124000__000000028BBFFDB8 000000067F0000400200008A590000120000-000000067F0000400200008A590000124000__0000001C760FA190 000000067F0000400200008A590000120000-000000067F0000400200008A590000124000__00000038E67ABFA0 000000067F0000400200008A590000120000-000000067F0000400200008A590000124000__0000003903F1CFE8 000000067F0000400200008A590000120000-000000067F0000400200008A590000124000__0000003B99F7F8A0 000000067F0000400200008A590000120000-000000067F0000400200008A590000124000__0000005D2FFFFB38 000000067F0000400200008A590000120000-000000067F0000400200008A590000124000__00000073AD3FE6B8 000000067F0000400200008A590000120000-000000067F0000400200008A590000124000__000000914E3F38F0 000000067F0000400200008A590000120000-000000067F0000400200008A590000124000__000000931B33AE68 000000067F0000400200008A590000120000-000000067F0000400200008A590000124000__000000931B9AFDF8 000000067F0000400200008A590000124000-000000067F0000400200008A590000128000__000000028BBFFDB8 000000067F0000400200008A590000124000-000000067F0000400200008A590000128000__0000001C760FA190 000000067F0000400200008A590000124000-000000067F0000400200008A590000128000__00000038E67ABFA0 000000067F0000400200008A590000124000-000000067F0000400200008A590000128000__0000003903F1CFE8 000000067F0000400200008A590000124000-000000067F0000400200008A590000128000__0000003B99F7F8A0 000000067F0000400200008A590000124000-000000067F0000400200008A590000128000__0000005D2FFFFB38 000000067F0000400200008A590000124000-000000067F0000400200008A590000128000__00000073AD3FE6B8 000000067F0000400200008A590000124000-000000067F0000400200008A590000128000__000000914E3F38F0 000000067F0000400200008A590000124000-000000067F0000400200008A590000128000__000000931B33AE68 
000000067F0000400200008A590000124000-000000067F0000400200008A590000128000__000000931B9AFDF8 000000067F0000400200008A590000124EDB-000000067F0000400200008A59000012D89B__00000001C071E001-000000027019FBC1 000000067F0000400200008A590000128000-000000067F0000400200008A59000012C000__000000028BBFFDB8 000000067F0000400200008A590000128000-000000067F0000400200008A59000012C000__0000001C760FA190 000000067F0000400200008A590000128000-000000067F0000400200008A59000012C000__00000038E67ABFA0 000000067F0000400200008A590000128000-000000067F0000400200008A59000012C000__0000003903F1CFE8 000000067F0000400200008A590000128000-000000067F0000400200008A59000012C000__0000003B99F7F8A0 000000067F0000400200008A590000128000-000000067F0000400200008A59000012C000__0000005D2FFFFB38 000000067F0000400200008A590000128000-000000067F0000400200008A59000012C000__00000073AD3FE6B8 000000067F0000400200008A590000128000-000000067F0000400200008A59000012C000__000000914E3F38F0 000000067F0000400200008A590000128000-000000067F0000400200008A59000012C000__000000931B33AE68 000000067F0000400200008A590000128000-000000067F0000400200008A59000012C000__000000931B9AFDF8 000000067F0000400200008A59000012C000-000000067F0000400200008A590000130000__000000028BBFFDB8 000000067F0000400200008A59000012C000-000000067F0000400200008A590000130000__0000001C760FA190 000000067F0000400200008A59000012C000-000000067F0000400200008A590000130000__00000038E67ABFA0 000000067F0000400200008A59000012C000-000000067F0000400200008A590000130000__0000003903F1CFE8 000000067F0000400200008A59000012C000-000000067F0000400200008A590000130000__0000003B99F7F8A0 000000067F0000400200008A59000012C000-000000067F0000400200008A590000130000__0000005D2FFFFB38 000000067F0000400200008A59000012C000-000000067F0000400200008A590000130000__00000073AD3FE6B8 000000067F0000400200008A59000012C000-000000067F0000400200008A590000130000__000000914E3F38F0 000000067F0000400200008A59000012C000-000000067F0000400200008A590000130000__000000931B33AE68 
000000067F0000400200008A59000012C000-000000067F0000400200008A590000130000__000000931B9AFDF8 000000067F0000400200008A59000012D89B-000000067F0000400200008A590000136269__00000001C071E001-000000027019FBC1 000000067F0000400200008A590000130000-000000067F0000400200008A590000134000__000000028BBFFDB8 000000067F0000400200008A590000130000-000000067F0000400200008A590000134000__0000001C760FA190 000000067F0000400200008A590000130000-000000067F0000400200008A590000134000__00000038E67ABFA0 000000067F0000400200008A590000130000-000000067F0000400200008A590000134000__0000003903F1CFE8 000000067F0000400200008A590000130000-000000067F0000400200008A590000134000__0000003B99F7F8A0 000000067F0000400200008A590000130000-000000067F0000400200008A590000134000__0000005D2FFFFB38 000000067F0000400200008A590000130000-000000067F0000400200008A590000134000__00000073AD3FE6B8 000000067F0000400200008A590000130000-000000067F0000400200008A590000134000__000000914E3F38F0 000000067F0000400200008A590000130000-000000067F0000400200008A590000134000__000000931B33AE68 000000067F0000400200008A590000130000-000000067F0000400200008A590000134000__000000931B9AFDF8 000000067F0000400200008A590000134000-000000067F0000400200008A590000138000__000000028BBFFDB8 000000067F0000400200008A590000134000-000000067F0000400200008A590000138000__0000001C760FA190 000000067F0000400200008A590000134000-000000067F0000400200008A590000138000__00000038E67ABFA0 000000067F0000400200008A590000134000-000000067F0000400200008A590000138000__0000003903F1CFE8 000000067F0000400200008A590000134000-000000067F0000400200008A590000138000__0000003B99F7F8A0 000000067F0000400200008A590000134000-000000067F0000400200008A590000138000__0000005D2FFFFB38 000000067F0000400200008A590000134000-000000067F0000400200008A590000138000__00000073AD3FE6B8 000000067F0000400200008A590000134000-000000067F0000400200008A590000138000__000000914E3F38F0 000000067F0000400200008A590000134000-000000067F0000400200008A590000138000__000000931B33AE68 
000000067F0000400200008A590000134000-000000067F0000400200008A590000138000__000000931B9AFDF8 000000067F0000400200008A590000136269-000000067F0000400200008A59000013EC56__00000001C071E001-000000027019FBC1 000000067F0000400200008A590000138000-000000067F0000400200008A59000013C000__000000028BBFFDB8 000000067F0000400200008A590000138000-000000067F0000400200008A59000013C000__0000001C760FA190 000000067F0000400200008A590000138000-000000067F0000400200008A59000013C000__00000038E67ABFA0 000000067F0000400200008A590000138000-000000067F0000400200008A59000013C000__0000003903F1CFE8 000000067F0000400200008A590000138000-000000067F0000400200008A59000013C000__0000003B99F7F8A0 000000067F0000400200008A590000138000-000000067F0000400200008A59000013C000__0000005D2FFFFB38 000000067F0000400200008A590000138000-000000067F0000400200008A59000013C000__00000073AD3FE6B8 000000067F0000400200008A590000138000-000000067F0000400200008A59000013C000__000000914E3F38F0 000000067F0000400200008A590000138000-000000067F0000400200008A59000013C000__000000931B33AE68 000000067F0000400200008A590000138000-000000067F0000400200008A59000013C000__000000931B9AFDF8 000000067F0000400200008A59000013C000-000000067F0000400200008A590000140000__000000028BBFFDB8 000000067F0000400200008A59000013C000-000000067F0000400200008A590000140000__0000001C760FA190 000000067F0000400200008A59000013C000-000000067F0000400200008A590000140000__00000038E67ABFA0 000000067F0000400200008A59000013C000-000000067F0000400200008A590000140000__0000003903F1CFE8 000000067F0000400200008A59000013C000-000000067F0000400200008A590000140000__0000003B99F7F8A0 000000067F0000400200008A59000013C000-000000067F0000400200008A590000140000__0000005D2FFFFB38 000000067F0000400200008A59000013C000-000000067F0000400200008A590000140000__00000073AD3FE6B8 000000067F0000400200008A59000013C000-000000067F0000400200008A590000140000__000000914E3F38F0 000000067F0000400200008A59000013C000-000000067F0000400200008A590000140000__000000931B33AE68 
000000067F0000400200008A59000013C000-000000067F0000400200008A590000140000__000000931B9AFDF8 000000067F0000400200008A59000013EC56-000000067F0000400200008A590000147647__00000001C071E001-000000027019FBC1 000000067F0000400200008A590000140000-000000067F0000400200008A590000144000__000000028BBFFDB8 000000067F0000400200008A590000140000-000000067F0000400200008A590000144000__0000001C760FA190 000000067F0000400200008A590000140000-000000067F0000400200008A590000144000__00000038E67ABFA0 000000067F0000400200008A590000140000-000000067F0000400200008A590000144000__0000003903F1CFE8 000000067F0000400200008A590000140000-000000067F0000400200008A590000144000__0000003B99F7F8A0 000000067F0000400200008A590000140000-000000067F0000400200008A590000144000__0000005D2FFFFB38 000000067F0000400200008A590000140000-000000067F0000400200008A590000144000__00000073AD3FE6B8 000000067F0000400200008A590000140000-000000067F0000400200008A590000144000__000000914E3F38F0 000000067F0000400200008A590000140000-000000067F0000400200008A590000144000__000000931B33AE68 000000067F0000400200008A590000140000-000000067F0000400200008A590000144000__000000931B9AFDF8 000000067F0000400200008A590000144000-000000067F0000400200008A590000148000__000000028BBFFDB8 000000067F0000400200008A590000144000-000000067F0000400200008A590000148000__0000001C760FA190 000000067F0000400200008A590000144000-000000067F0000400200008A590000148000__00000038E67ABFA0 000000067F0000400200008A590000144000-000000067F0000400200008A590000148000__0000003903F1CFE8 000000067F0000400200008A590000144000-000000067F0000400200008A590000148000__0000003B99F7F8A0 000000067F0000400200008A590000144000-000000067F0000400200008A590000148000__0000005D2FFFFB38 000000067F0000400200008A590000144000-000000067F0000400200008A590000148000__00000073AD3FE6B8 000000067F0000400200008A590000144000-000000067F0000400200008A590000148000__000000914E3F38F0 000000067F0000400200008A590000144000-000000067F0000400200008A590000148000__000000931B33AE68 
000000067F0000400200008A590000144000-000000067F0000400200008A590000148000__000000931B9AFDF8 000000067F0000400200008A590000147647-000000067F0000400200008A590000150027__00000001C071E001-000000027019FBC1 000000067F0000400200008A590000148000-000000067F0000400200008A59000014C000__000000028BBFFDB8 000000067F0000400200008A590000148000-000000067F0000400200008A59000014C000__0000001C760FA190 000000067F0000400200008A590000148000-000000067F0000400200008A59000014C000__00000038E67ABFA0 000000067F0000400200008A590000148000-000000067F0000400200008A59000014C000__0000003903F1CFE8 000000067F0000400200008A590000148000-000000067F0000400200008A59000014C000__0000003B99F7F8A0 000000067F0000400200008A590000148000-000000067F0000400200008A59000014C000__0000005D2FFFFB38 000000067F0000400200008A590000148000-000000067F0000400200008A59000014C000__00000073AD3FE6B8 000000067F0000400200008A590000148000-000000067F0000400200008A59000014C000__000000914E3F38F0 000000067F0000400200008A590000148000-000000067F0000400200008A59000014C000__000000931B33AE68 000000067F0000400200008A590000148000-000000067F0000400200008A59000014C000__000000931B9AFDF8 000000067F0000400200008A59000014C000-000000067F0000400200008A590000150000__000000028BBFFDB8 000000067F0000400200008A59000014C000-000000067F0000400200008A590000150000__0000001C760FA190 000000067F0000400200008A59000014C000-000000067F0000400200008A590000150000__00000038E67ABFA0 000000067F0000400200008A59000014C000-000000067F0000400200008A590000150000__0000003903F1CFE8 000000067F0000400200008A59000014C000-000000067F0000400200008A590000150000__0000003B99F7F8A0 000000067F0000400200008A59000014C000-000000067F0000400200008A590000150000__0000005D2FFFFB38 000000067F0000400200008A59000014C000-000000067F0000400200008A590000150000__00000073AD3FE6B8 000000067F0000400200008A59000014C000-000000067F0000400200008A590000150000__000000914E3F38F0 000000067F0000400200008A59000014C000-000000067F0000400200008A590000150000__000000931B33AE68 
000000067F0000400200008A59000014C000-000000067F0000400200008A590000150000__000000931B9AFDF8 000000067F0000400200008A590000150000-000000067F0000400200008A590000154000__000000028BBFFDB8 000000067F0000400200008A590000150000-000000067F0000400200008A590000154000__0000001C760FA190 000000067F0000400200008A590000150000-000000067F0000400200008A590000154000__00000038E67ABFA0 000000067F0000400200008A590000150000-000000067F0000400200008A590000154000__0000003903F1CFE8 000000067F0000400200008A590000150000-000000067F0000400200008A590000154000__0000003B99F7F8A0 000000067F0000400200008A590000150000-000000067F0000400200008A590000154000__0000005D2FFFFB38 000000067F0000400200008A590000150000-000000067F0000400200008A590000154000__00000073AD3FE6B8 000000067F0000400200008A590000150000-000000067F0000400200008A590000154000__000000914E3F38F0 000000067F0000400200008A590000150000-000000067F0000400200008A590000154000__000000931B33AE68 000000067F0000400200008A590000150000-000000067F0000400200008A590000154000__000000931B9AFDF8 000000067F0000400200008A590000150027-000000067F0000400200008A590100000000__00000001C071E001-000000027019FBC1 000000067F0000400200008A590000150355-000000067F0000400200008A590000158D32__000000027019FBC1-000000030FC9ED71 000000067F0000400200008A590000154000-000000067F0000400200008A590000158000__000000028BBFFDB8 000000067F0000400200008A590000154000-000000067F0000400200008A590000158000__0000001C760FA190 000000067F0000400200008A590000154000-000000067F0000400200008A590000158000__00000038E67ABFA0 000000067F0000400200008A590000154000-000000067F0000400200008A590000158000__0000003903F1CFE8 000000067F0000400200008A590000154000-000000067F0000400200008A590000158000__0000003B99F7F8A0 000000067F0000400200008A590000154000-000000067F0000400200008A590000158000__0000005D2FFFFB38 000000067F0000400200008A590000154000-000000067F0000400200008A590000158000__00000073AD3FE6B8 000000067F0000400200008A590000154000-000000067F0000400200008A590000158000__000000914E3F38F0 
000000067F0000400200008A590000154000-000000067F0000400200008A590000158000__000000931B33AE68 000000067F0000400200008A590000154000-000000067F0000400200008A590000158000__000000931B9AFDF8 000000067F0000400200008A590000158000-000000067F0000400200008A59000015C000__000000028BBFFDB8 000000067F0000400200008A590000158000-000000067F0000400200008A59000015C000__0000001C760FA190 000000067F0000400200008A590000158000-000000067F0000400200008A59000015C000__00000038E67ABFA0 000000067F0000400200008A590000158000-000000067F0000400200008A59000015C000__0000003903F1CFE8 000000067F0000400200008A590000158000-000000067F0000400200008A59000015C000__0000003B99F7F8A0 000000067F0000400200008A590000158000-000000067F0000400200008A59000015C000__0000005D2FFFFB38 000000067F0000400200008A590000158000-000000067F0000400200008A59000015C000__00000073AD3FE6B8 000000067F0000400200008A590000158000-000000067F0000400200008A59000015C000__000000914E3F38F0 000000067F0000400200008A590000158000-000000067F0000400200008A59000015C000__000000931B33AE68 000000067F0000400200008A590000158000-000000067F0000400200008A59000015C000__000000931B9AFDF8 000000067F0000400200008A590000158D32-000000067F0000400200008A5900001616F5__000000027019FBC1-000000030FC9ED71 000000067F0000400200008A59000015C000-000000067F0000400200008A590000160000__0000001C760FA190 000000067F0000400200008A59000015C000-000000067F0000400200008A590000160000__00000038E67ABFA0 000000067F0000400200008A59000015C000-000000067F0000400200008A590000160000__0000003903F1CFE8 000000067F0000400200008A59000015C000-000000067F0000400200008A590000160000__0000003B99F7F8A0 000000067F0000400200008A59000015C000-000000067F0000400200008A590000160000__0000005D2FFFFB38 000000067F0000400200008A59000015C000-000000067F0000400200008A590000160000__00000073AD3FE6B8 000000067F0000400200008A59000015C000-000000067F0000400200008A590000160000__000000914E3F38F0 000000067F0000400200008A59000015C000-000000067F0000400200008A590000160000__000000931B33AE68 
000000067F0000400200008A59000015C000-000000067F0000400200008A590000160000__000000931B9AFDF8 000000067F0000400200008A59000015C000-030000000000000000000000000000000002__000000028BBFFDB8 000000067F0000400200008A590000160000-000000067F0000400200008A590000164000__0000001C760FA190 000000067F0000400200008A590000160000-000000067F0000400200008A590000164000__00000038E67ABFA0 000000067F0000400200008A590000160000-000000067F0000400200008A590000164000__0000003903F1CFE8 000000067F0000400200008A590000160000-000000067F0000400200008A590000164000__0000003B99F7F8A0 000000067F0000400200008A590000160000-000000067F0000400200008A590000164000__0000005D2FFFFB38 000000067F0000400200008A590000160000-000000067F0000400200008A590000164000__00000073AD3FE6B8 000000067F0000400200008A590000160000-000000067F0000400200008A590000164000__000000914E3F38F0 000000067F0000400200008A590000160000-000000067F0000400200008A590000164000__000000931B33AE68 000000067F0000400200008A590000160000-000000067F0000400200008A590000164000__000000931B9AFDF8 000000067F0000400200008A5900001616F5-000000067F0000400200008A59000016A0B7__000000027019FBC1-000000030FC9ED71 000000067F0000400200008A590000164000-000000067F0000400200008A590000168000__0000001C760FA190 000000067F0000400200008A590000164000-000000067F0000400200008A590000168000__00000038E67ABFA0 000000067F0000400200008A590000164000-000000067F0000400200008A590000168000__0000003903F1CFE8 000000067F0000400200008A590000164000-000000067F0000400200008A590000168000__0000003B99F7F8A0 000000067F0000400200008A590000164000-000000067F0000400200008A590000168000__0000005D2FFFFB38 000000067F0000400200008A590000164000-000000067F0000400200008A590000168000__00000073AD3FE6B8 000000067F0000400200008A590000164000-000000067F0000400200008A590000168000__000000914E3F38F0 000000067F0000400200008A590000164000-000000067F0000400200008A590000168000__000000931B33AE68 000000067F0000400200008A590000164000-000000067F0000400200008A590000168000__000000931B9AFDF8 
000000067F0000400200008A590000168000-000000067F0000400200008A59000016C000__0000001C760FA190 000000067F0000400200008A590000168000-000000067F0000400200008A59000016C000__00000038E67ABFA0 000000067F0000400200008A590000168000-000000067F0000400200008A59000016C000__0000003903F1CFE8 000000067F0000400200008A590000168000-000000067F0000400200008A59000016C000__0000003B99F7F8A0 000000067F0000400200008A590000168000-000000067F0000400200008A59000016C000__0000005D2FFFFB38 000000067F0000400200008A590000168000-000000067F0000400200008A59000016C000__00000073AD3FE6B8 000000067F0000400200008A590000168000-000000067F0000400200008A59000016C000__000000914E3F38F0 000000067F0000400200008A590000168000-000000067F0000400200008A59000016C000__000000931B33AE68 000000067F0000400200008A590000168000-000000067F0000400200008A59000016C000__000000931B9AFDF8 000000067F0000400200008A59000016A0B7-000000067F0000400200008A590000172A96__000000027019FBC1-000000030FC9ED71 000000067F0000400200008A59000016C000-000000067F0000400200008A590000170000__0000001C760FA190 000000067F0000400200008A59000016C000-000000067F0000400200008A590000170000__00000038E67ABFA0 000000067F0000400200008A59000016C000-000000067F0000400200008A590000170000__0000003903F1CFE8 000000067F0000400200008A59000016C000-000000067F0000400200008A590000170000__0000003B99F7F8A0 000000067F0000400200008A59000016C000-000000067F0000400200008A590000170000__0000005D2FFFFB38 000000067F0000400200008A59000016C000-000000067F0000400200008A590000170000__00000073AD3FE6B8 000000067F0000400200008A59000016C000-000000067F0000400200008A590000170000__000000914E3F38F0 000000067F0000400200008A59000016C000-000000067F0000400200008A590000170000__000000931B33AE68 000000067F0000400200008A59000016C000-000000067F0000400200008A590000170000__000000931B9AFDF8 000000067F0000400200008A590000170000-000000067F0000400200008A590000174000__0000001C760FA190 000000067F0000400200008A590000170000-000000067F0000400200008A590000174000__00000038E67ABFA0 
000000067F0000400200008A590000170000-000000067F0000400200008A590000174000__0000003903F1CFE8 000000067F0000400200008A590000170000-000000067F0000400200008A590000174000__0000003B99F7F8A0 000000067F0000400200008A590000170000-000000067F0000400200008A590000174000__0000005D2FFFFB38 000000067F0000400200008A590000170000-000000067F0000400200008A590000174000__00000073AD3FE6B8 000000067F0000400200008A590000170000-000000067F0000400200008A590000174000__000000914E3F38F0 000000067F0000400200008A590000170000-000000067F0000400200008A590000174000__000000931B33AE68 000000067F0000400200008A590000170000-000000067F0000400200008A590000174000__000000931B9AFDF8 000000067F0000400200008A590000172A96-000000067F0000400200008A59000017B48B__000000027019FBC1-000000030FC9ED71 000000067F0000400200008A590000174000-000000067F0000400200008A590000178000__0000001C760FA190 000000067F0000400200008A590000174000-000000067F0000400200008A590000178000__00000038E67ABFA0 000000067F0000400200008A590000174000-000000067F0000400200008A590000178000__0000003903F1CFE8 000000067F0000400200008A590000174000-000000067F0000400200008A590000178000__0000003B99F7F8A0 000000067F0000400200008A590000174000-000000067F0000400200008A590000178000__0000005D2FFFFB38 000000067F0000400200008A590000174000-000000067F0000400200008A590000178000__00000073AD3FE6B8 000000067F0000400200008A590000174000-000000067F0000400200008A590000178000__000000914E3F38F0 000000067F0000400200008A590000174000-000000067F0000400200008A590000178000__000000931B33AE68 000000067F0000400200008A590000174000-000000067F0000400200008A590000178000__000000931B9AFDF8 000000067F0000400200008A590000178000-000000067F0000400200008A59000017C000__0000001C760FA190 000000067F0000400200008A590000178000-000000067F0000400200008A59000017C000__00000038E67ABFA0 000000067F0000400200008A590000178000-000000067F0000400200008A59000017C000__0000003903F1CFE8 000000067F0000400200008A590000178000-000000067F0000400200008A59000017C000__0000003B99F7F8A0 
000000067F0000400200008A590000178000-000000067F0000400200008A59000017C000__0000005D2FFFFB38 000000067F0000400200008A590000178000-000000067F0000400200008A59000017C000__00000073AD3FE6B8 000000067F0000400200008A590000178000-000000067F0000400200008A59000017C000__000000914E3F38F0 000000067F0000400200008A590000178000-000000067F0000400200008A59000017C000__000000931B33AE68 000000067F0000400200008A590000178000-000000067F0000400200008A59000017C000__000000931B9AFDF8 000000067F0000400200008A59000017B48B-000000067F0000400200008A590000183E80__000000027019FBC1-000000030FC9ED71 000000067F0000400200008A59000017C000-000000067F0000400200008A590000180000__0000001C760FA190 000000067F0000400200008A59000017C000-000000067F0000400200008A590000180000__00000038E67ABFA0 000000067F0000400200008A59000017C000-000000067F0000400200008A590000180000__0000003903F1CFE8 000000067F0000400200008A59000017C000-000000067F0000400200008A590000180000__0000003B99F7F8A0 000000067F0000400200008A59000017C000-000000067F0000400200008A590000180000__0000005D2FFFFB38 000000067F0000400200008A59000017C000-000000067F0000400200008A590000180000__00000073AD3FE6B8 000000067F0000400200008A59000017C000-000000067F0000400200008A590000180000__000000914E3F38F0 000000067F0000400200008A59000017C000-000000067F0000400200008A590000180000__000000931B33AE68 000000067F0000400200008A59000017C000-000000067F0000400200008A590000180000__000000931B9AFDF8 000000067F0000400200008A590000180000-000000067F0000400200008A590000184000__0000001C760FA190 000000067F0000400200008A590000180000-000000067F0000400200008A590000184000__00000038E67ABFA0 000000067F0000400200008A590000180000-000000067F0000400200008A590000184000__0000003903F1CFE8 000000067F0000400200008A590000180000-000000067F0000400200008A590000184000__0000003B99F7F8A0 000000067F0000400200008A590000180000-000000067F0000400200008A590000184000__0000005D2FFFFB38 000000067F0000400200008A590000180000-000000067F0000400200008A590000184000__00000073AD3FE6B8 
000000067F0000400200008A590000180000-000000067F0000400200008A590000184000__000000914E3F38F0 000000067F0000400200008A590000180000-000000067F0000400200008A590000184000__000000931B33AE68 000000067F0000400200008A590000180000-000000067F0000400200008A590000184000__000000931B9AFDF8 000000067F0000400200008A590000183E80-000000067F0000400200008A59000018C866__000000027019FBC1-000000030FC9ED71 000000067F0000400200008A590000184000-000000067F0000400200008A590000188000__0000001C760FA190 000000067F0000400200008A590000184000-000000067F0000400200008A590000188000__00000038E67ABFA0 000000067F0000400200008A590000184000-000000067F0000400200008A590000188000__0000003903F1CFE8 000000067F0000400200008A590000184000-000000067F0000400200008A590000188000__0000003B99F7F8A0 000000067F0000400200008A590000184000-000000067F0000400200008A590000188000__0000005D2FFFFB38 000000067F0000400200008A590000184000-000000067F0000400200008A590000188000__00000073AD3FE6B8 000000067F0000400200008A590000184000-000000067F0000400200008A590000188000__000000914E3F38F0 000000067F0000400200008A590000184000-000000067F0000400200008A590000188000__000000931B33AE68 000000067F0000400200008A590000184000-000000067F0000400200008A590000188000__000000931B9AFDF8 000000067F0000400200008A590000188000-000000067F0000400200008A59000018C000__0000001C760FA190 000000067F0000400200008A590000188000-000000067F0000400200008A59000018C000__00000038E67ABFA0 000000067F0000400200008A590000188000-000000067F0000400200008A59000018C000__0000003903F1CFE8 000000067F0000400200008A590000188000-000000067F0000400200008A59000018C000__0000003B99F7F8A0 000000067F0000400200008A590000188000-000000067F0000400200008A59000018C000__0000005D2FFFFB38 000000067F0000400200008A590000188000-000000067F0000400200008A59000018C000__00000073AD3FE6B8 000000067F0000400200008A590000188000-000000067F0000400200008A59000018C000__000000914E3F38F0 000000067F0000400200008A590000188000-000000067F0000400200008A59000018C000__000000931B33AE68 
000000067F0000400200008A590000188000-000000067F0000400200008A59000018C000__000000931B9AFDF8 000000067F0000400200008A59000018C000-000000067F0000400200008A590000190000__0000001C760FA190 000000067F0000400200008A59000018C000-000000067F0000400200008A590000190000__00000038E67ABFA0 000000067F0000400200008A59000018C000-000000067F0000400200008A590000190000__0000003903F1CFE8 000000067F0000400200008A59000018C000-000000067F0000400200008A590000190000__0000003B99F7F8A0 000000067F0000400200008A59000018C000-000000067F0000400200008A590000190000__0000005D2FFFFB38 000000067F0000400200008A59000018C000-000000067F0000400200008A590000190000__00000073AD3FE6B8 000000067F0000400200008A59000018C000-000000067F0000400200008A590000190000__000000914E3F38F0 000000067F0000400200008A59000018C000-000000067F0000400200008A590000190000__000000931B33AE68 000000067F0000400200008A59000018C000-000000067F0000400200008A590000190000__000000931B9AFDF8 000000067F0000400200008A59000018C866-000000067F0000400200008A590000195243__000000027019FBC1-000000030FC9ED71 000000067F0000400200008A590000190000-000000067F0000400200008A590000194000__0000001C760FA190 000000067F0000400200008A590000190000-000000067F0000400200008A590000194000__00000038E67ABFA0 000000067F0000400200008A590000190000-000000067F0000400200008A590000194000__0000003903F1CFE8 000000067F0000400200008A590000190000-000000067F0000400200008A590000194000__0000003B99F7F8A0 000000067F0000400200008A590000190000-000000067F0000400200008A590000194000__0000005D2FFFFB38 000000067F0000400200008A590000190000-000000067F0000400200008A590000194000__00000073AD3FE6B8 000000067F0000400200008A590000190000-000000067F0000400200008A590000194000__000000914E3F38F0 000000067F0000400200008A590000190000-000000067F0000400200008A590000194000__000000931B33AE68 000000067F0000400200008A590000190000-000000067F0000400200008A590000194000__000000931B9AFDF8 000000067F0000400200008A590000194000-000000067F0000400200008A590000198000__0000001C760FA190 
000000067F0000400200008A590000194000-000000067F0000400200008A590000198000__00000038E67ABFA0 000000067F0000400200008A590000194000-000000067F0000400200008A590000198000__0000003903F1CFE8 000000067F0000400200008A590000194000-000000067F0000400200008A590000198000__0000003B99F7F8A0 000000067F0000400200008A590000194000-000000067F0000400200008A590000198000__0000005D2FFFFB38 000000067F0000400200008A590000194000-000000067F0000400200008A590000198000__00000073AD3FE6B8 000000067F0000400200008A590000194000-000000067F0000400200008A590000198000__000000914E3F38F0 000000067F0000400200008A590000194000-000000067F0000400200008A590000198000__000000931B33AE68 000000067F0000400200008A590000194000-000000067F0000400200008A590000198000__000000931B9AFDF8 000000067F0000400200008A590000195243-000000067F0000400200008A59000019DC01__000000027019FBC1-000000030FC9ED71 000000067F0000400200008A590000198000-000000067F0000400200008A59000019C000__0000001C760FA190 000000067F0000400200008A590000198000-000000067F0000400200008A59000019C000__00000038E67ABFA0 000000067F0000400200008A590000198000-000000067F0000400200008A59000019C000__0000003903F1CFE8 000000067F0000400200008A590000198000-000000067F0000400200008A59000019C000__0000003B99F7F8A0 000000067F0000400200008A590000198000-000000067F0000400200008A59000019C000__0000005D2FFFFB38 000000067F0000400200008A590000198000-000000067F0000400200008A59000019C000__00000073AD3FE6B8 000000067F0000400200008A590000198000-000000067F0000400200008A59000019C000__000000914E3F38F0 000000067F0000400200008A590000198000-000000067F0000400200008A59000019C000__000000931B33AE68 000000067F0000400200008A590000198000-000000067F0000400200008A59000019C000__000000931B9AFDF8 000000067F0000400200008A59000019C000-000000067F0000400200008A5900001A0000__0000001C760FA190 000000067F0000400200008A59000019C000-000000067F0000400200008A5900001A0000__00000038E67ABFA0 000000067F0000400200008A59000019C000-000000067F0000400200008A5900001A0000__0000003903F1CFE8 
000000067F0000400200008A59000019C000-000000067F0000400200008A5900001A0000__0000003B99F7F8A0 000000067F0000400200008A59000019C000-000000067F0000400200008A5900001A0000__0000005D2FFFFB38 000000067F0000400200008A59000019C000-000000067F0000400200008A5900001A0000__00000073AD3FE6B8 000000067F0000400200008A59000019C000-000000067F0000400200008A5900001A0000__000000914E3F38F0 000000067F0000400200008A59000019C000-000000067F0000400200008A5900001A0000__000000931B33AE68 000000067F0000400200008A59000019C000-000000067F0000400200008A5900001A0000__000000931B9AFDF8 000000067F0000400200008A59000019DC01-000000067F0000400200008A5900001A65B5__000000027019FBC1-000000030FC9ED71 000000067F0000400200008A5900001A0000-000000067F0000400200008A5900001A4000__0000001C760FA190 000000067F0000400200008A5900001A0000-000000067F0000400200008A5900001A4000__00000038E67ABFA0 000000067F0000400200008A5900001A0000-000000067F0000400200008A5900001A4000__0000003903F1CFE8 000000067F0000400200008A5900001A0000-000000067F0000400200008A5900001A4000__0000003B99F7F8A0 000000067F0000400200008A5900001A0000-000000067F0000400200008A5900001A4000__0000005D2FFFFB38 000000067F0000400200008A5900001A0000-000000067F0000400200008A5900001A4000__00000073AD3FE6B8 000000067F0000400200008A5900001A0000-000000067F0000400200008A5900001A4000__000000914E3F38F0 000000067F0000400200008A5900001A0000-000000067F0000400200008A5900001A4000__000000931B33AE68 000000067F0000400200008A5900001A0000-000000067F0000400200008A5900001A4000__000000931B9AFDF8 000000067F0000400200008A5900001A4000-000000067F0000400200008A5900001A8000__0000001C725A2400 000000067F0000400200008A5900001A4000-000000067F0000400200008A5900001A8000__0000001C760FA190 000000067F0000400200008A5900001A4000-000000067F0000400200008A5900001A8000__00000038E67ABFA0 000000067F0000400200008A5900001A4000-000000067F0000400200008A5900001A8000__0000003903F1CFE8 000000067F0000400200008A5900001A4000-000000067F0000400200008A5900001A8000__0000003B99F7F8A0 
000000067F0000400200008A5900001A4000-000000067F0000400200008A5900001A8000__0000005D2FFFFB38 000000067F0000400200008A5900001A4000-000000067F0000400200008A5900001A8000__00000073AD3FE6B8 000000067F0000400200008A5900001A4000-000000067F0000400200008A5900001A8000__000000914E3F38F0 000000067F0000400200008A5900001A4000-000000067F0000400200008A5900001A8000__000000931B33AE68 000000067F0000400200008A5900001A4000-000000067F0000400200008A5900001A8000__000000931B9AFDF8 000000067F0000400200008A5900001A65B5-000000067F0000400200008A590100000000__000000027019FBC1-000000030FC9ED71 000000067F0000400200008A5900001A6891-000000067F0000400200008A5900001AF277__000000030FC9ED71-00000003AF79E5E9 000000067F0000400200008A5900001A8000-000000067F0000400200008A5900001AC000__0000001C725A2400 000000067F0000400200008A5900001A8000-000000067F0000400200008A5900001AC000__0000001C760FA190 000000067F0000400200008A5900001A8000-000000067F0000400200008A5900001AC000__00000038E67ABFA0 000000067F0000400200008A5900001A8000-000000067F0000400200008A5900001AC000__0000003903F1CFE8 000000067F0000400200008A5900001A8000-000000067F0000400200008A5900001AC000__0000003B99F7F8A0 000000067F0000400200008A5900001A8000-000000067F0000400200008A5900001AC000__0000005D2FFFFB38 000000067F0000400200008A5900001A8000-000000067F0000400200008A5900001AC000__00000073AD3FE6B8 000000067F0000400200008A5900001A8000-000000067F0000400200008A5900001AC000__000000914E3F38F0 000000067F0000400200008A5900001A8000-000000067F0000400200008A5900001AC000__000000931B33AE68 000000067F0000400200008A5900001A8000-000000067F0000400200008A5900001AC000__000000931B9AFDF8 000000067F0000400200008A5900001AC000-000000067F0000400200008A5900001B0000__0000001C725A2400 000000067F0000400200008A5900001AC000-000000067F0000400200008A5900001B0000__0000001C760FA190 000000067F0000400200008A5900001AC000-000000067F0000400200008A5900001B0000__00000038E67ABFA0 000000067F0000400200008A5900001AC000-000000067F0000400200008A5900001B0000__0000003903F1CFE8 
000000067F0000400200008A5900001AC000-000000067F0000400200008A5900001B0000__0000003B99F7F8A0 000000067F0000400200008A5900001AC000-000000067F0000400200008A5900001B0000__0000005D2FFFFB38 000000067F0000400200008A5900001AC000-000000067F0000400200008A5900001B0000__00000073AD3FE6B8 000000067F0000400200008A5900001AC000-000000067F0000400200008A5900001B0000__000000914E3F38F0 000000067F0000400200008A5900001AC000-000000067F0000400200008A5900001B0000__000000931B33AE68 000000067F0000400200008A5900001AC000-000000067F0000400200008A5900001B0000__000000931B9AFDF8 000000067F0000400200008A5900001AF277-000000067F0000400200008A5900001B7C62__000000030FC9ED71-00000003AF79E5E9 000000067F0000400200008A5900001B0000-000000067F0000400200008A5900001B4000__0000001C725A2400 000000067F0000400200008A5900001B0000-000000067F0000400200008A5900001B4000__0000001C760FA190 000000067F0000400200008A5900001B0000-000000067F0000400200008A5900001B4000__00000038E67ABFA0 000000067F0000400200008A5900001B0000-000000067F0000400200008A5900001B4000__0000003903F1CFE8 000000067F0000400200008A5900001B0000-000000067F0000400200008A5900001B4000__0000003B99F7F8A0 000000067F0000400200008A5900001B0000-000000067F0000400200008A5900001B4000__0000005D2FFFFB38 000000067F0000400200008A5900001B0000-000000067F0000400200008A5900001B4000__00000073AD3FE6B8 000000067F0000400200008A5900001B0000-000000067F0000400200008A5900001B4000__000000914E3F38F0 000000067F0000400200008A5900001B0000-000000067F0000400200008A5900001B4000__000000931B33AE68 000000067F0000400200008A5900001B0000-000000067F0000400200008A5900001B4000__000000931B9AFDF8 000000067F0000400200008A5900001B4000-000000067F0000400200008A5900001B8000__0000001C725A2400 000000067F0000400200008A5900001B4000-000000067F0000400200008A5900001B8000__0000001C760FA190 000000067F0000400200008A5900001B4000-000000067F0000400200008A5900001B8000__00000038E67ABFA0 000000067F0000400200008A5900001B4000-000000067F0000400200008A5900001B8000__0000003903F1CFE8 
000000067F0000400200008A5900001B4000-000000067F0000400200008A5900001B8000__0000003B99F7F8A0 000000067F0000400200008A5900001B4000-000000067F0000400200008A5900001B8000__0000005D2FFFFB38 000000067F0000400200008A5900001B4000-000000067F0000400200008A5900001B8000__00000073AD3FE6B8 000000067F0000400200008A5900001B4000-000000067F0000400200008A5900001B8000__000000914E3F38F0 000000067F0000400200008A5900001B4000-000000067F0000400200008A5900001B8000__000000931B33AE68 000000067F0000400200008A5900001B4000-000000067F0000400200008A5900001B8000__000000931B9AFDF8 000000067F0000400200008A5900001B7C62-000000067F0000400200008A5900001C0650__000000030FC9ED71-00000003AF79E5E9 000000067F0000400200008A5900001B8000-000000067F0000400200008A5900001BC000__0000001C725A2400 000000067F0000400200008A5900001B8000-000000067F0000400200008A5900001BC000__0000001C760FA190 000000067F0000400200008A5900001B8000-000000067F0000400200008A5900001BC000__00000038E67ABFA0 000000067F0000400200008A5900001B8000-000000067F0000400200008A5900001BC000__0000003903F1CFE8 000000067F0000400200008A5900001B8000-000000067F0000400200008A5900001BC000__0000003B99F7F8A0 000000067F0000400200008A5900001B8000-000000067F0000400200008A5900001BC000__0000005D2FFFFB38 000000067F0000400200008A5900001B8000-000000067F0000400200008A5900001BC000__00000073AD3FE6B8 000000067F0000400200008A5900001B8000-000000067F0000400200008A5900001BC000__000000914E3F38F0 000000067F0000400200008A5900001B8000-000000067F0000400200008A5900001BC000__000000931B33AE68 000000067F0000400200008A5900001B8000-000000067F0000400200008A5900001BC000__000000931B9AFDF8 000000067F0000400200008A5900001BC000-000000067F0000400200008A5900001C0000__0000001C725A2400 000000067F0000400200008A5900001BC000-000000067F0000400200008A5900001C0000__0000001C760FA190 000000067F0000400200008A5900001BC000-000000067F0000400200008A5900001C0000__00000038E67ABFA0 000000067F0000400200008A5900001BC000-000000067F0000400200008A5900001C0000__0000003903F1CFE8 
000000067F0000400200008A5900001BC000-000000067F0000400200008A5900001C0000__0000003B99F7F8A0 000000067F0000400200008A5900001BC000-000000067F0000400200008A5900001C0000__0000005D2FFFFB38 000000067F0000400200008A5900001BC000-000000067F0000400200008A5900001C0000__00000073AD3FE6B8 000000067F0000400200008A5900001BC000-000000067F0000400200008A5900001C0000__000000914E3F38F0 000000067F0000400200008A5900001BC000-000000067F0000400200008A5900001C0000__000000931B33AE68 000000067F0000400200008A5900001BC000-000000067F0000400200008A5900001C0000__000000931B9AFDF8 000000067F0000400200008A5900001C0000-000000067F0000400200008A5900001C4000__0000001C725A2400 000000067F0000400200008A5900001C0000-000000067F0000400200008A5900001C4000__0000001C760FA190 000000067F0000400200008A5900001C0000-000000067F0000400200008A5900001C4000__00000038E67ABFA0 000000067F0000400200008A5900001C0000-000000067F0000400200008A5900001C4000__0000003903F1CFE8 000000067F0000400200008A5900001C0000-000000067F0000400200008A5900001C4000__0000003B99F7F8A0 000000067F0000400200008A5900001C0000-000000067F0000400200008A5900001C4000__0000005D2FFFFB38 000000067F0000400200008A5900001C0000-000000067F0000400200008A5900001C4000__00000073AD3FE6B8 000000067F0000400200008A5900001C0000-000000067F0000400200008A5900001C4000__000000914E3F38F0 000000067F0000400200008A5900001C0000-000000067F0000400200008A5900001C4000__000000931B33AE68 000000067F0000400200008A5900001C0000-000000067F0000400200008A5900001C4000__000000931B9AFDF8 000000067F0000400200008A5900001C0650-000000067F0000400200008A5900001C9029__000000030FC9ED71-00000003AF79E5E9 000000067F0000400200008A5900001C4000-000000067F0000400200008A5900001C8000__0000001C725A2400 000000067F0000400200008A5900001C4000-000000067F0000400200008A5900001C8000__0000001C760FA190 000000067F0000400200008A5900001C4000-000000067F0000400200008A5900001C8000__00000038E67ABFA0 000000067F0000400200008A5900001C4000-000000067F0000400200008A5900001C8000__0000003903F1CFE8 
000000067F0000400200008A5900001C4000-000000067F0000400200008A5900001C8000__0000003B99F7F8A0 000000067F0000400200008A5900001C4000-000000067F0000400200008A5900001C8000__0000005D2FFFFB38 000000067F0000400200008A5900001C4000-000000067F0000400200008A5900001C8000__00000073AD3FE6B8 000000067F0000400200008A5900001C4000-000000067F0000400200008A5900001C8000__000000914E3F38F0 000000067F0000400200008A5900001C4000-000000067F0000400200008A5900001C8000__000000931B33AE68 000000067F0000400200008A5900001C4000-000000067F0000400200008A5900001C8000__000000931B9AFDF8 000000067F0000400200008A5900001C8000-000000067F0000400200008A5900001CC000__0000001C725A2400 000000067F0000400200008A5900001C8000-000000067F0000400200008A5900001CC000__0000001C760FA190 000000067F0000400200008A5900001C8000-000000067F0000400200008A5900001CC000__00000038E67ABFA0 000000067F0000400200008A5900001C8000-000000067F0000400200008A5900001CC000__0000003903F1CFE8 000000067F0000400200008A5900001C8000-000000067F0000400200008A5900001CC000__0000003B99F7F8A0 000000067F0000400200008A5900001C8000-000000067F0000400200008A5900001CC000__0000005D2FFFFB38 000000067F0000400200008A5900001C8000-000000067F0000400200008A5900001CC000__00000073AD3FE6B8 000000067F0000400200008A5900001C8000-000000067F0000400200008A5900001CC000__000000914E3F38F0 000000067F0000400200008A5900001C8000-000000067F0000400200008A5900001CC000__000000931B33AE68 000000067F0000400200008A5900001C8000-000000067F0000400200008A5900001CC000__000000931B9AFDF8 000000067F0000400200008A5900001C9029-000000067F0000400200008A5900001D19FA__000000030FC9ED71-00000003AF79E5E9 000000067F0000400200008A5900001CC000-000000067F0000400200008A5900001D0000__0000001C725A2400 000000067F0000400200008A5900001CC000-000000067F0000400200008A5900001D0000__0000001C760FA190 000000067F0000400200008A5900001CC000-000000067F0000400200008A5900001D0000__00000038E67ABFA0 000000067F0000400200008A5900001CC000-000000067F0000400200008A5900001D0000__0000003903F1CFE8 
000000067F0000400200008A5900001CC000-000000067F0000400200008A5900001D0000__0000003B99F7F8A0 000000067F0000400200008A5900001CC000-000000067F0000400200008A5900001D0000__0000005D2FFFFB38 000000067F0000400200008A5900001CC000-000000067F0000400200008A5900001D0000__00000073AD3FE6B8 000000067F0000400200008A5900001CC000-000000067F0000400200008A5900001D0000__000000914E3F38F0 000000067F0000400200008A5900001CC000-000000067F0000400200008A5900001D0000__000000931B33AE68 000000067F0000400200008A5900001CC000-000000067F0000400200008A5900001D0000__000000931B9AFDF8 000000067F0000400200008A5900001D0000-000000067F0000400200008A5900001D4000__0000001C725A2400 000000067F0000400200008A5900001D0000-000000067F0000400200008A5900001D4000__0000001C760FA190 000000067F0000400200008A5900001D0000-000000067F0000400200008A5900001D4000__00000038E67ABFA0 000000067F0000400200008A5900001D0000-000000067F0000400200008A5900001D4000__0000003903F1CFE8 000000067F0000400200008A5900001D0000-000000067F0000400200008A5900001D4000__0000003B99F7F8A0 000000067F0000400200008A5900001D0000-000000067F0000400200008A5900001D4000__0000005D2FFFFB38 000000067F0000400200008A5900001D0000-000000067F0000400200008A5900001D4000__00000073AD3FE6B8 000000067F0000400200008A5900001D0000-000000067F0000400200008A5900001D4000__000000914E3F38F0 000000067F0000400200008A5900001D0000-000000067F0000400200008A5900001D4000__000000931B33AE68 000000067F0000400200008A5900001D0000-000000067F0000400200008A5900001D4000__000000931B9AFDF8 000000067F0000400200008A5900001D19FA-000000067F0000400200008A5900001DA3D0__000000030FC9ED71-00000003AF79E5E9 000000067F0000400200008A5900001D4000-000000067F0000400200008A5900001D8000__0000001C725A2400 000000067F0000400200008A5900001D4000-000000067F0000400200008A5900001D8000__0000001C760FA190 000000067F0000400200008A5900001D4000-000000067F0000400200008A5900001D8000__00000038E67ABFA0 000000067F0000400200008A5900001D4000-000000067F0000400200008A5900001D8000__0000003903F1CFE8 
000000067F0000400200008A5900001D4000-000000067F0000400200008A5900001D8000__0000003B99F7F8A0 000000067F0000400200008A5900001D4000-000000067F0000400200008A5900001D8000__0000005D2FFFFB38 000000067F0000400200008A5900001D4000-000000067F0000400200008A5900001D8000__00000073AD3FE6B8 000000067F0000400200008A5900001D4000-000000067F0000400200008A5900001D8000__000000914E3F38F0 000000067F0000400200008A5900001D4000-000000067F0000400200008A5900001D8000__000000931B33AE68 000000067F0000400200008A5900001D4000-000000067F0000400200008A5900001D8000__000000931B9AFDF8 000000067F0000400200008A5900001D8000-000000067F0000400200008A5900001DC000__0000001C725A2400 000000067F0000400200008A5900001D8000-000000067F0000400200008A5900001DC000__0000001C760FA190 000000067F0000400200008A5900001D8000-000000067F0000400200008A5900001DC000__00000038E67ABFA0 000000067F0000400200008A5900001D8000-000000067F0000400200008A5900001DC000__0000003903F1CFE8 000000067F0000400200008A5900001D8000-000000067F0000400200008A5900001DC000__0000003B99F7F8A0 000000067F0000400200008A5900001D8000-000000067F0000400200008A5900001DC000__0000005D2FFFFB38 000000067F0000400200008A5900001D8000-000000067F0000400200008A5900001DC000__00000073AD3FE6B8 000000067F0000400200008A5900001D8000-000000067F0000400200008A5900001DC000__000000914E3F38F0 000000067F0000400200008A5900001D8000-000000067F0000400200008A5900001DC000__000000931B33AE68 000000067F0000400200008A5900001D8000-000000067F0000400200008A5900001DC000__000000931B9AFDF8 000000067F0000400200008A5900001DA3D0-000000067F0000400200008A5900001E2D99__000000030FC9ED71-00000003AF79E5E9 000000067F0000400200008A5900001DC000-000000067F0000400200008A5900001E0000__0000001C725A2400 000000067F0000400200008A5900001DC000-000000067F0000400200008A5900001E0000__0000001C760FA190 000000067F0000400200008A5900001DC000-000000067F0000400200008A5900001E0000__00000038E67ABFA0 000000067F0000400200008A5900001DC000-000000067F0000400200008A5900001E0000__0000003903F1CFE8 
000000067F0000400200008A5900001DC000-000000067F0000400200008A5900001E0000__0000003B99F7F8A0 000000067F0000400200008A5900001DC000-000000067F0000400200008A5900001E0000__0000005D2FFFFB38 000000067F0000400200008A5900001DC000-000000067F0000400200008A5900001E0000__00000073AD3FE6B8 000000067F0000400200008A5900001DC000-000000067F0000400200008A5900001E0000__000000914E3F38F0 000000067F0000400200008A5900001DC000-000000067F0000400200008A5900001E0000__000000931B33AE68 000000067F0000400200008A5900001DC000-000000067F0000400200008A5900001E0000__000000931B9AFDF8 000000067F0000400200008A5900001E0000-000000067F0000400200008A5900001E4000__0000001C725A2400 000000067F0000400200008A5900001E0000-000000067F0000400200008A5900001E4000__0000001C760FA190 000000067F0000400200008A5900001E0000-000000067F0000400200008A5900001E4000__00000038E67ABFA0 000000067F0000400200008A5900001E0000-000000067F0000400200008A5900001E4000__0000003903F1CFE8 000000067F0000400200008A5900001E0000-000000067F0000400200008A5900001E4000__0000003B99F7F8A0 000000067F0000400200008A5900001E0000-000000067F0000400200008A5900001E4000__0000005D2FFFFB38 000000067F0000400200008A5900001E0000-000000067F0000400200008A5900001E4000__00000073AD3FE6B8 000000067F0000400200008A5900001E0000-000000067F0000400200008A5900001E4000__000000914E3F38F0 000000067F0000400200008A5900001E0000-000000067F0000400200008A5900001E4000__000000931B33AE68 000000067F0000400200008A5900001E0000-000000067F0000400200008A5900001E4000__000000931B9AFDF8 000000067F0000400200008A5900001E2D99-000000067F0000400200008A5900001EB784__000000030FC9ED71-00000003AF79E5E9 000000067F0000400200008A5900001E4000-000000067F0000400200008A5900001E8000__0000001C725A2400 000000067F0000400200008A5900001E4000-000000067F0000400200008A5900001E8000__0000001C760FA190 000000067F0000400200008A5900001E4000-000000067F0000400200008A5900001E8000__00000038E67ABFA0 000000067F0000400200008A5900001E4000-000000067F0000400200008A5900001E8000__0000003903F1CFE8 
000000067F0000400200008A5900001E4000-000000067F0000400200008A5900001E8000__0000003B99F7F8A0 000000067F0000400200008A5900001E4000-000000067F0000400200008A5900001E8000__0000005D2FFFFB38 000000067F0000400200008A5900001E4000-000000067F0000400200008A5900001E8000__00000073AD3FE6B8 000000067F0000400200008A5900001E4000-000000067F0000400200008A5900001E8000__000000914E3F38F0 000000067F0000400200008A5900001E4000-000000067F0000400200008A5900001E8000__000000931B33AE68 000000067F0000400200008A5900001E4000-000000067F0000400200008A5900001E8000__000000931B9AFDF8 000000067F0000400200008A5900001E8000-000000067F0000400200008A5900001EC000__0000001C725A2400 000000067F0000400200008A5900001E8000-000000067F0000400200008A5900001EC000__0000001C760FA190 000000067F0000400200008A5900001E8000-000000067F0000400200008A5900001EC000__00000038E67ABFA0 000000067F0000400200008A5900001E8000-000000067F0000400200008A5900001EC000__0000003903F1CFE8 000000067F0000400200008A5900001E8000-000000067F0000400200008A5900001EC000__0000003B99F7F8A0 000000067F0000400200008A5900001E8000-000000067F0000400200008A5900001EC000__0000005D2FFFFB38 000000067F0000400200008A5900001E8000-000000067F0000400200008A5900001EC000__00000073AD3FE6B8 000000067F0000400200008A5900001E8000-000000067F0000400200008A5900001EC000__000000914E3F38F0 000000067F0000400200008A5900001E8000-000000067F0000400200008A5900001EC000__000000931B33AE68 000000067F0000400200008A5900001E8000-000000067F0000400200008A5900001EC000__000000931B9AFDF8 000000067F0000400200008A5900001EB784-000000067F0000400200008A5900001F4172__000000030FC9ED71-00000003AF79E5E9 000000067F0000400200008A5900001EC000-000000067F0000400200008A5900001F0000__0000001C725A2400 000000067F0000400200008A5900001EC000-000000067F0000400200008A5900001F0000__0000001C760FA190 000000067F0000400200008A5900001EC000-000000067F0000400200008A5900001F0000__00000038E67ABFA0 000000067F0000400200008A5900001EC000-000000067F0000400200008A5900001F0000__0000003903F1CFE8 
000000067F0000400200008A5900001EC000-000000067F0000400200008A5900001F0000__0000003B99F7F8A0 000000067F0000400200008A5900001EC000-000000067F0000400200008A5900001F0000__0000005D2FFFFB38 000000067F0000400200008A5900001EC000-000000067F0000400200008A5900001F0000__00000073AD3FE6B8 000000067F0000400200008A5900001EC000-000000067F0000400200008A5900001F0000__000000914E3F38F0 000000067F0000400200008A5900001EC000-000000067F0000400200008A5900001F0000__000000931B33AE68 000000067F0000400200008A5900001EC000-000000067F0000400200008A5900001F0000__000000931B9AFDF8 000000067F0000400200008A5900001F0000-000000067F0000400200008A5900001F4000__0000001C725A2400 000000067F0000400200008A5900001F0000-000000067F0000400200008A5900001F4000__0000001C760FA190 000000067F0000400200008A5900001F0000-000000067F0000400200008A5900001F4000__00000038E67ABFA0 000000067F0000400200008A5900001F0000-000000067F0000400200008A5900001F4000__0000003903F1CFE8 000000067F0000400200008A5900001F0000-000000067F0000400200008A5900001F4000__0000003B99F7F8A0 000000067F0000400200008A5900001F0000-000000067F0000400200008A5900001F4000__0000005D2FFFFB38 000000067F0000400200008A5900001F0000-000000067F0000400200008A5900001F4000__00000073AD3FE6B8 000000067F0000400200008A5900001F0000-000000067F0000400200008A5900001F4000__000000914E3F38F0 000000067F0000400200008A5900001F0000-000000067F0000400200008A5900001F4000__000000931B33AE68 000000067F0000400200008A5900001F0000-000000067F0000400200008A5900001F4000__000000931B9AFDF8 000000067F0000400200008A5900001F4000-000000067F0000400200008A5900001F8000__0000001C725A2400 000000067F0000400200008A5900001F4000-000000067F0000400200008A5900001F8000__0000001C760FA190 000000067F0000400200008A5900001F4000-000000067F0000400200008A5900001F8000__00000038E67ABFA0 000000067F0000400200008A5900001F4000-000000067F0000400200008A5900001F8000__0000003903F1CFE8 000000067F0000400200008A5900001F4000-000000067F0000400200008A5900001F8000__0000003B99F7F8A0 
000000067F0000400200008A5900001F4000-000000067F0000400200008A5900001F8000__0000005D2FFFFB38 000000067F0000400200008A5900001F4000-000000067F0000400200008A5900001F8000__00000073AD3FE6B8 000000067F0000400200008A5900001F4000-000000067F0000400200008A5900001F8000__000000914E3F38F0 000000067F0000400200008A5900001F4000-000000067F0000400200008A5900001F8000__000000931B33AE68 000000067F0000400200008A5900001F4000-000000067F0000400200008A5900001F8000__000000931B9AFDF8 000000067F0000400200008A5900001F4172-000000067F0000400200008A5900001FCB6A__000000030FC9ED71-00000003AF79E5E9 000000067F0000400200008A5900001F8000-000000067F0000400200008A5900001FC000__0000001C725A2400 000000067F0000400200008A5900001F8000-000000067F0000400200008A5900001FC000__0000001C760FA190 000000067F0000400200008A5900001F8000-000000067F0000400200008A5900001FC000__00000038E67ABFA0 000000067F0000400200008A5900001F8000-000000067F0000400200008A5900001FC000__0000003903F1CFE8 000000067F0000400200008A5900001F8000-000000067F0000400200008A5900001FC000__0000003B99F7F8A0 000000067F0000400200008A5900001F8000-000000067F0000400200008A5900001FC000__0000005D2FFFFB38 000000067F0000400200008A5900001F8000-000000067F0000400200008A5900001FC000__00000073AD3FE6B8 000000067F0000400200008A5900001F8000-000000067F0000400200008A5900001FC000__000000914E3F38F0 000000067F0000400200008A5900001F8000-000000067F0000400200008A5900001FC000__000000931B33AE68 000000067F0000400200008A5900001F8000-000000067F0000400200008A5900001FC000__000000931B9AFDF8 000000067F0000400200008A5900001FC000-000000067F0000400200008A590000200000__0000000478F987C0 000000067F0000400200008A5900001FC000-000000067F0000400200008A590000200000__0000001C760FA190 000000067F0000400200008A5900001FC000-000000067F0000400200008A590000200000__00000038E67ABFA0 000000067F0000400200008A5900001FC000-000000067F0000400200008A590000200000__0000003903F1CFE8 000000067F0000400200008A5900001FC000-000000067F0000400200008A590000200000__0000003B99F7F8A0 
000000067F0000400200008A5900001FC000-000000067F0000400200008A590000200000__0000005D2FFFFB38 000000067F0000400200008A5900001FC000-000000067F0000400200008A590000200000__00000073AD3FE6B8 000000067F0000400200008A5900001FC000-000000067F0000400200008A590000200000__000000914E3F38F0 000000067F0000400200008A5900001FC000-000000067F0000400200008A590000200000__000000931B33AE68 000000067F0000400200008A5900001FC000-000000067F0000400200008A590000200000__000000931B9AFDF8 000000067F0000400200008A5900001FCB6A-000000067F0000400200008A590100000000__000000030FC9ED71-00000003AF79E5E9 000000067F0000400200008A5900001FCE37-000000067F0000400200008A59000020580F__00000003AF79E5E9-000000044F29F379 000000067F0000400200008A590000200000-000000067F0000400200008A590000204000__0000000478F987C0 000000067F0000400200008A590000200000-000000067F0000400200008A590000204000__0000001C760FA190 000000067F0000400200008A590000200000-000000067F0000400200008A590000204000__00000038E67ABFA0 000000067F0000400200008A590000200000-000000067F0000400200008A590000204000__0000003903F1CFE8 000000067F0000400200008A590000200000-000000067F0000400200008A590000204000__0000003B99F7F8A0 000000067F0000400200008A590000200000-000000067F0000400200008A590000204000__0000005D2FFFFB38 000000067F0000400200008A590000200000-000000067F0000400200008A590000204000__00000073AD3FE6B8 000000067F0000400200008A590000200000-000000067F0000400200008A590000204000__000000914E3F38F0 000000067F0000400200008A590000200000-000000067F0000400200008A590000204000__000000931B33AE68 000000067F0000400200008A590000200000-000000067F0000400200008A590000204000__000000931B9AFDF8 000000067F0000400200008A590000204000-000000067F0000400200008A590000208000__0000000478F987C0 000000067F0000400200008A590000204000-000000067F0000400200008A590000208000__0000001C760FA190 000000067F0000400200008A590000204000-000000067F0000400200008A590000208000__00000038E67ABFA0 000000067F0000400200008A590000204000-000000067F0000400200008A590000208000__0000003903F1CFE8 
000000067F0000400200008A590000204000-000000067F0000400200008A590000208000__0000003B99F7F8A0 000000067F0000400200008A590000204000-000000067F0000400200008A590000208000__0000005D2FFFFB38 000000067F0000400200008A590000204000-000000067F0000400200008A590000208000__00000073AD3FE6B8 000000067F0000400200008A590000204000-000000067F0000400200008A590000208000__000000914E3F38F0 000000067F0000400200008A590000204000-000000067F0000400200008A590000208000__000000931B33AE68 000000067F0000400200008A590000204000-000000067F0000400200008A590000208000__000000931B9AFDF8 000000067F0000400200008A59000020580F-000000067F0000400200008A59000020E1DF__00000003AF79E5E9-000000044F29F379 000000067F0000400200008A590000208000-000000067F0000400200008A59000020C000__0000000478F987C0 000000067F0000400200008A590000208000-000000067F0000400200008A59000020C000__0000001C760FA190 000000067F0000400200008A590000208000-000000067F0000400200008A59000020C000__00000038E67ABFA0 000000067F0000400200008A590000208000-000000067F0000400200008A59000020C000__0000003903F1CFE8 000000067F0000400200008A590000208000-000000067F0000400200008A59000020C000__0000003B99F7F8A0 000000067F0000400200008A590000208000-000000067F0000400200008A59000020C000__0000005D2FFFFB38 000000067F0000400200008A590000208000-000000067F0000400200008A59000020C000__00000073AD3FE6B8 000000067F0000400200008A590000208000-000000067F0000400200008A59000020C000__000000914E3F38F0 000000067F0000400200008A590000208000-000000067F0000400200008A59000020C000__000000931B33AE68 000000067F0000400200008A590000208000-000000067F0000400200008A59000020C000__000000931B9AFDF8 000000067F0000400200008A59000020C000-000000067F0000400200008A590000210000__0000000478F987C0 000000067F0000400200008A59000020C000-000000067F0000400200008A590000210000__0000001C760FA190 000000067F0000400200008A59000020C000-000000067F0000400200008A590000210000__00000038E67ABFA0 000000067F0000400200008A59000020C000-000000067F0000400200008A590000210000__0000003903F1CFE8 
000000067F0000400200008A59000020C000-000000067F0000400200008A590000210000__0000003B99F7F8A0 000000067F0000400200008A59000020C000-000000067F0000400200008A590000210000__0000005D2FFFFB38 000000067F0000400200008A59000020C000-000000067F0000400200008A590000210000__00000073AD3FE6B8 000000067F0000400200008A59000020C000-000000067F0000400200008A590000210000__000000914E3F38F0 000000067F0000400200008A59000020C000-000000067F0000400200008A590000210000__000000931B33AE68 000000067F0000400200008A59000020C000-000000067F0000400200008A590000210000__000000931B9AFDF8 000000067F0000400200008A59000020E1DF-000000067F0000400200008A590000216BBF__00000003AF79E5E9-000000044F29F379 000000067F0000400200008A590000210000-000000067F0000400200008A590000214000__0000000478F987C0 000000067F0000400200008A590000210000-000000067F0000400200008A590000214000__0000001C760FA190 000000067F0000400200008A590000210000-000000067F0000400200008A590000214000__00000038E67ABFA0 000000067F0000400200008A590000210000-000000067F0000400200008A590000214000__0000003903F1CFE8 000000067F0000400200008A590000210000-000000067F0000400200008A590000214000__0000003B99F7F8A0 000000067F0000400200008A590000210000-000000067F0000400200008A590000214000__0000005D2FFFFB38 000000067F0000400200008A590000210000-000000067F0000400200008A590000214000__00000073AD3FE6B8 000000067F0000400200008A590000210000-000000067F0000400200008A590000214000__000000914E3F38F0 000000067F0000400200008A590000210000-000000067F0000400200008A590000214000__000000931B33AE68 000000067F0000400200008A590000210000-000000067F0000400200008A590000214000__000000931B9AFDF8 000000067F0000400200008A590000214000-000000067F0000400200008A590000218000__0000000478F987C0 000000067F0000400200008A590000214000-000000067F0000400200008A590000218000__0000001C760FA190 000000067F0000400200008A590000214000-000000067F0000400200008A590000218000__00000038E67ABFA0 000000067F0000400200008A590000214000-000000067F0000400200008A590000218000__0000003903F1CFE8 
000000067F0000400200008A590000214000-000000067F0000400200008A590000218000__0000003B99F7F8A0 000000067F0000400200008A590000214000-000000067F0000400200008A590000218000__0000005D2FFFFB38 000000067F0000400200008A590000214000-000000067F0000400200008A590000218000__00000073AD3FE6B8 000000067F0000400200008A590000214000-000000067F0000400200008A590000218000__000000914E3F38F0 000000067F0000400200008A590000214000-000000067F0000400200008A590000218000__000000931B33AE68 000000067F0000400200008A590000214000-000000067F0000400200008A590000218000__000000931B9AFDF8 000000067F0000400200008A590000216BBF-000000067F0000400200008A59000021F588__00000003AF79E5E9-000000044F29F379 000000067F0000400200008A590000218000-000000067F0000400200008A59000021C000__0000000478F987C0 000000067F0000400200008A590000218000-000000067F0000400200008A59000021C000__0000001C760FA190 000000067F0000400200008A590000218000-000000067F0000400200008A59000021C000__00000038E67ABFA0 000000067F0000400200008A590000218000-000000067F0000400200008A59000021C000__0000003903F1CFE8 000000067F0000400200008A590000218000-000000067F0000400200008A59000021C000__0000003B99F7F8A0 000000067F0000400200008A590000218000-000000067F0000400200008A59000021C000__0000005D2FFFFB38 000000067F0000400200008A590000218000-000000067F0000400200008A59000021C000__00000073AD3FE6B8 000000067F0000400200008A590000218000-000000067F0000400200008A59000021C000__000000914E3F38F0 000000067F0000400200008A590000218000-000000067F0000400200008A59000021C000__000000931B33AE68 000000067F0000400200008A590000218000-000000067F0000400200008A59000021C000__000000931B9AFDF8 000000067F0000400200008A59000021C000-000000067F0000400200008A590000220000__0000000478F987C0 000000067F0000400200008A59000021C000-000000067F0000400200008A590000220000__0000001C760FA190 000000067F0000400200008A59000021C000-000000067F0000400200008A590000220000__00000038E67ABFA0 000000067F0000400200008A59000021C000-000000067F0000400200008A590000220000__0000003903F1CFE8 
000000067F0000400200008A59000021C000-000000067F0000400200008A590000220000__0000003B99F7F8A0 000000067F0000400200008A59000021C000-000000067F0000400200008A590000220000__0000005D2FFFFB38 000000067F0000400200008A59000021C000-000000067F0000400200008A590000220000__00000073AD3FE6B8 000000067F0000400200008A59000021C000-000000067F0000400200008A590000220000__000000914E3F38F0 000000067F0000400200008A59000021C000-000000067F0000400200008A590000220000__000000931B33AE68 000000067F0000400200008A59000021C000-000000067F0000400200008A590000220000__000000931B9AFDF8 000000067F0000400200008A59000021F588-000000067F0000400200008A590000227F75__00000003AF79E5E9-000000044F29F379 000000067F0000400200008A590000220000-000000067F0000400200008A590000224000__0000000478F987C0 000000067F0000400200008A590000220000-000000067F0000400200008A590000224000__0000001C760FA190 000000067F0000400200008A590000220000-000000067F0000400200008A590000224000__00000038E67ABFA0 000000067F0000400200008A590000220000-000000067F0000400200008A590000224000__0000003903F1CFE8 000000067F0000400200008A590000220000-000000067F0000400200008A590000224000__0000003B99F7F8A0 000000067F0000400200008A590000220000-000000067F0000400200008A590000224000__0000005D2FFFFB38 000000067F0000400200008A590000220000-000000067F0000400200008A590000224000__00000073AD3FE6B8 000000067F0000400200008A590000220000-000000067F0000400200008A590000224000__000000914E3F38F0 000000067F0000400200008A590000220000-000000067F0000400200008A590000224000__000000931B33AE68 000000067F0000400200008A590000220000-000000067F0000400200008A590000224000__000000931B9AFDF8 000000067F0000400200008A590000224000-000000067F0000400200008A590000228000__0000000478F987C0 000000067F0000400200008A590000224000-000000067F0000400200008A590000228000__0000001C760FA190 000000067F0000400200008A590000224000-000000067F0000400200008A590000228000__00000038E67ABFA0 000000067F0000400200008A590000224000-000000067F0000400200008A590000228000__0000003903F1CFE8 
000000067F0000400200008A590000224000-000000067F0000400200008A590000228000__0000003B99F7F8A0 000000067F0000400200008A590000224000-000000067F0000400200008A590000228000__0000005D2FFFFB38 000000067F0000400200008A590000224000-000000067F0000400200008A590000228000__00000073AD3FE6B8 000000067F0000400200008A590000224000-000000067F0000400200008A590000228000__000000914E3F38F0 000000067F0000400200008A590000224000-000000067F0000400200008A590000228000__000000931B33AE68 000000067F0000400200008A590000224000-000000067F0000400200008A590000228000__000000931B9AFDF8 000000067F0000400200008A590000227F75-000000067F0000400200008A590000230964__00000003AF79E5E9-000000044F29F379 000000067F0000400200008A590000228000-000000067F0000400200008A59000022C000__0000000478F987C0 000000067F0000400200008A590000228000-000000067F0000400200008A59000022C000__0000001C760FA190 000000067F0000400200008A590000228000-000000067F0000400200008A59000022C000__00000038E67ABFA0 000000067F0000400200008A590000228000-000000067F0000400200008A59000022C000__0000003903F1CFE8 000000067F0000400200008A590000228000-000000067F0000400200008A59000022C000__0000003B99F7F8A0 000000067F0000400200008A590000228000-000000067F0000400200008A59000022C000__0000005D2FFFFB38 000000067F0000400200008A590000228000-000000067F0000400200008A59000022C000__00000073AD3FE6B8 000000067F0000400200008A590000228000-000000067F0000400200008A59000022C000__000000914E3F38F0 000000067F0000400200008A590000228000-000000067F0000400200008A59000022C000__000000931B33AE68 000000067F0000400200008A590000228000-000000067F0000400200008A59000022C000__000000931B9AFDF8 000000067F0000400200008A59000022C000-000000067F0000400200008A590000230000__0000000478F987C0 000000067F0000400200008A59000022C000-000000067F0000400200008A590000230000__0000001C760FA190 000000067F0000400200008A59000022C000-000000067F0000400200008A590000230000__00000038E67ABFA0 000000067F0000400200008A59000022C000-000000067F0000400200008A590000230000__0000003903F1CFE8 
000000067F0000400200008A59000022C000-000000067F0000400200008A590000230000__0000003B99F7F8A0 000000067F0000400200008A59000022C000-000000067F0000400200008A590000230000__0000005D2FFFFB38 000000067F0000400200008A59000022C000-000000067F0000400200008A590000230000__00000073AD3FE6B8 000000067F0000400200008A59000022C000-000000067F0000400200008A590000230000__000000914E3F38F0 000000067F0000400200008A59000022C000-000000067F0000400200008A590000230000__000000931B33AE68 000000067F0000400200008A59000022C000-000000067F0000400200008A590000230000__000000931B9AFDF8 000000067F0000400200008A590000230000-000000067F0000400200008A590000234000__0000000478F987C0 000000067F0000400200008A590000230000-000000067F0000400200008A590000234000__0000001C760FA190 000000067F0000400200008A590000230000-000000067F0000400200008A590000234000__00000038E67ABFA0 000000067F0000400200008A590000230000-000000067F0000400200008A590000234000__0000003903F1CFE8 000000067F0000400200008A590000230000-000000067F0000400200008A590000234000__0000003B99F7F8A0 000000067F0000400200008A590000230000-000000067F0000400200008A590000234000__0000005D2FFFFB38 000000067F0000400200008A590000230000-000000067F0000400200008A590000234000__00000073AD3FE6B8 000000067F0000400200008A590000230000-000000067F0000400200008A590000234000__000000914E3F38F0 000000067F0000400200008A590000230000-000000067F0000400200008A590000234000__000000931B33AE68 000000067F0000400200008A590000230000-000000067F0000400200008A590000234000__000000931B9AFDF8 000000067F0000400200008A590000230964-000000067F0000400200008A590000239354__00000003AF79E5E9-000000044F29F379 000000067F0000400200008A590000234000-000000067F0000400200008A590000238000__0000000478F987C0 000000067F0000400200008A590000234000-000000067F0000400200008A590000238000__0000001C760FA190 000000067F0000400200008A590000234000-000000067F0000400200008A590000238000__00000038E67ABFA0 000000067F0000400200008A590000234000-000000067F0000400200008A590000238000__0000003903F1CFE8 
000000067F0000400200008A590000234000-000000067F0000400200008A590000238000__0000003B99F7F8A0 000000067F0000400200008A590000234000-000000067F0000400200008A590000238000__0000005D2FFFFB38 000000067F0000400200008A590000234000-000000067F0000400200008A590000238000__00000073AD3FE6B8 000000067F0000400200008A590000234000-000000067F0000400200008A590000238000__000000914E3F38F0 000000067F0000400200008A590000234000-000000067F0000400200008A590000238000__000000931B33AE68 000000067F0000400200008A590000234000-000000067F0000400200008A590000238000__000000931B9AFDF8 000000067F0000400200008A590000238000-000000067F0000400200008A59000023C000__0000000478F987C0 000000067F0000400200008A590000238000-000000067F0000400200008A59000023C000__0000001C760FA190 000000067F0000400200008A590000238000-000000067F0000400200008A59000023C000__00000038E67ABFA0 000000067F0000400200008A590000238000-000000067F0000400200008A59000023C000__0000003903F1CFE8 000000067F0000400200008A590000238000-000000067F0000400200008A59000023C000__0000003B99F7F8A0 000000067F0000400200008A590000238000-000000067F0000400200008A59000023C000__0000005D2FFFFB38 000000067F0000400200008A590000238000-000000067F0000400200008A59000023C000__00000073AD3FE6B8 000000067F0000400200008A590000238000-000000067F0000400200008A59000023C000__000000914E3F38F0 000000067F0000400200008A590000238000-000000067F0000400200008A59000023C000__000000931B33AE68 000000067F0000400200008A590000238000-000000067F0000400200008A59000023C000__000000931B9AFDF8 000000067F0000400200008A590000239354-000000067F0000400200008A590000241D2A__00000003AF79E5E9-000000044F29F379 000000067F0000400200008A59000023C000-000000067F0000400200008A590000240000__0000000478F987C0 000000067F0000400200008A59000023C000-000000067F0000400200008A590000240000__0000001C760FA190 000000067F0000400200008A59000023C000-000000067F0000400200008A590000240000__00000038E67ABFA0 000000067F0000400200008A59000023C000-000000067F0000400200008A590000240000__0000003903F1CFE8 
000000067F0000400200008A59000023C000-000000067F0000400200008A590000240000__0000003B99F7F8A0 000000067F0000400200008A59000023C000-000000067F0000400200008A590000240000__0000005D2FFFFB38 000000067F0000400200008A59000023C000-000000067F0000400200008A590000240000__00000073AD3FE6B8 000000067F0000400200008A59000023C000-000000067F0000400200008A590000240000__000000914E3F38F0 000000067F0000400200008A59000023C000-000000067F0000400200008A590000240000__000000931B33AE68 000000067F0000400200008A59000023C000-000000067F0000400200008A590000240000__000000931B9AFDF8 000000067F0000400200008A590000240000-000000067F0000400200008A590000244000__0000000478F987C0 000000067F0000400200008A590000240000-000000067F0000400200008A590000244000__0000001C760FA190 000000067F0000400200008A590000240000-000000067F0000400200008A590000244000__00000038E67ABFA0 000000067F0000400200008A590000240000-000000067F0000400200008A590000244000__0000003903F1CFE8 000000067F0000400200008A590000240000-000000067F0000400200008A590000244000__0000003B99F7F8A0 000000067F0000400200008A590000240000-000000067F0000400200008A590000244000__0000005D2FFFFB38 000000067F0000400200008A590000240000-000000067F0000400200008A590000244000__00000073AD3FE6B8 000000067F0000400200008A590000240000-000000067F0000400200008A590000244000__000000914E3F38F0 000000067F0000400200008A590000240000-000000067F0000400200008A590000244000__000000931B33AE68 000000067F0000400200008A590000240000-000000067F0000400200008A590000244000__000000931B9AFDF8 000000067F0000400200008A590000241D2A-000000067F0000400200008A59000024A6FE__00000003AF79E5E9-000000044F29F379 000000067F0000400200008A590000244000-000000067F0000400200008A590000248000__0000000478F987C0 000000067F0000400200008A590000244000-000000067F0000400200008A590000248000__0000001C760FA190 000000067F0000400200008A590000244000-000000067F0000400200008A590000248000__00000038E67ABFA0 000000067F0000400200008A590000244000-000000067F0000400200008A590000248000__0000003903F1CFE8 
000000067F0000400200008A590000244000-000000067F0000400200008A590000248000__0000003B99F7F8A0 000000067F0000400200008A590000244000-000000067F0000400200008A590000248000__0000005D2FFFFB38 000000067F0000400200008A590000244000-000000067F0000400200008A590000248000__00000073AD3FE6B8 000000067F0000400200008A590000244000-000000067F0000400200008A590000248000__000000914E3F38F0 000000067F0000400200008A590000244000-000000067F0000400200008A590000248000__000000931B33AE68 000000067F0000400200008A590000244000-000000067F0000400200008A590000248000__000000931B9AFDF8 000000067F0000400200008A590000248000-000000067F0000400200008A59000024C000__0000000478F987C0 000000067F0000400200008A590000248000-000000067F0000400200008A59000024C000__0000001C760FA190 000000067F0000400200008A590000248000-000000067F0000400200008A59000024C000__00000038E67ABFA0 000000067F0000400200008A590000248000-000000067F0000400200008A59000024C000__0000003903F1CFE8 000000067F0000400200008A590000248000-000000067F0000400200008A59000024C000__0000003B99F7F8A0 000000067F0000400200008A590000248000-000000067F0000400200008A59000024C000__0000005D2FFFFB38 000000067F0000400200008A590000248000-000000067F0000400200008A59000024C000__00000073AD3FE6B8 000000067F0000400200008A590000248000-000000067F0000400200008A59000024C000__000000914E3F38F0 000000067F0000400200008A590000248000-000000067F0000400200008A59000024C000__000000931B33AE68 000000067F0000400200008A590000248000-000000067F0000400200008A59000024C000__000000931B9AFDF8 000000067F0000400200008A59000024A6FE-000000067F0000400200008A5900002530C1__00000003AF79E5E9-000000044F29F379 000000067F0000400200008A59000024C000-000000067F0000400200008A590000250000__0000000478F987C0 000000067F0000400200008A59000024C000-000000067F0000400200008A590000250000__0000001C760FA190 000000067F0000400200008A59000024C000-000000067F0000400200008A590000250000__00000038E67ABFA0 000000067F0000400200008A59000024C000-000000067F0000400200008A590000250000__0000003903F1CFE8 
000000067F0000400200008A59000024C000-000000067F0000400200008A590000250000__0000003B99F7F8A0 000000067F0000400200008A59000024C000-000000067F0000400200008A590000250000__0000005D2FFFFB38 000000067F0000400200008A59000024C000-000000067F0000400200008A590000250000__00000073AD3FE6B8 000000067F0000400200008A59000024C000-000000067F0000400200008A590000250000__000000914E3F38F0 000000067F0000400200008A59000024C000-000000067F0000400200008A590000250000__000000931B33AE68 000000067F0000400200008A59000024C000-000000067F0000400200008A590000250000__000000931B9AFDF8 000000067F0000400200008A590000250000-000000067F0000400200008A590000254000__0000000478F987C0 000000067F0000400200008A590000250000-000000067F0000400200008A590000254000__0000001C760FA190 000000067F0000400200008A590000250000-000000067F0000400200008A590000254000__00000038E67ABFA0 000000067F0000400200008A590000250000-000000067F0000400200008A590000254000__0000003903F1CFE8 000000067F0000400200008A590000250000-000000067F0000400200008A590000254000__0000003B99F7F8A0 000000067F0000400200008A590000250000-000000067F0000400200008A590000254000__0000005D2FFFFB38 000000067F0000400200008A590000250000-000000067F0000400200008A590000254000__00000073AD3FE6B8 000000067F0000400200008A590000250000-000000067F0000400200008A590000254000__000000914E3F38F0 000000067F0000400200008A590000250000-000000067F0000400200008A590000254000__000000931B33AE68 000000067F0000400200008A590000250000-000000067F0000400200008A590000254000__000000931B9AFDF8 000000067F0000400200008A5900002530C1-000000067F0000400200008A590100000000__00000003AF79E5E9-000000044F29F379 000000067F0000400200008A59000025338B-000000067F0000400200008A59000025BD50__000000044F29F379-00000004FED1E2E1 000000067F0000400200008A590000254000-000000067F0000400200008A590000258000__0000000478F987C0 000000067F0000400200008A590000254000-000000067F0000400200008A590000258000__0000001C760FA190 000000067F0000400200008A590000254000-000000067F0000400200008A590000258000__00000038E67ABFA0 
000000067F0000400200008A590000254000-000000067F0000400200008A590000258000__0000003903F1CFE8 000000067F0000400200008A590000254000-000000067F0000400200008A590000258000__0000003B99F7F8A0 000000067F0000400200008A590000254000-000000067F0000400200008A590000258000__0000005D2FFFFB38 000000067F0000400200008A590000254000-000000067F0000400200008A590000258000__00000073AD3FE6B8 000000067F0000400200008A590000254000-000000067F0000400200008A590000258000__000000914E3F38F0 000000067F0000400200008A590000254000-000000067F0000400200008A590000258000__000000931B33AE68 000000067F0000400200008A590000254000-000000067F0000400200008A590000258000__000000931B9AFDF8 000000067F0000400200008A590000258000-000000067F0000400200008A59000025C000__0000000478F987C0 000000067F0000400200008A590000258000-000000067F0000400200008A59000025C000__0000001C760FA190 000000067F0000400200008A590000258000-000000067F0000400200008A59000025C000__00000038E67ABFA0 000000067F0000400200008A590000258000-000000067F0000400200008A59000025C000__0000003903F1CFE8 000000067F0000400200008A590000258000-000000067F0000400200008A59000025C000__0000003B99F7F8A0 000000067F0000400200008A590000258000-000000067F0000400200008A59000025C000__0000005D2FFFFB38 000000067F0000400200008A590000258000-000000067F0000400200008A59000025C000__00000073AD3FE6B8 000000067F0000400200008A590000258000-000000067F0000400200008A59000025C000__000000914E3F38F0 000000067F0000400200008A590000258000-000000067F0000400200008A59000025C000__000000931B33AE68 000000067F0000400200008A590000258000-000000067F0000400200008A59000025C000__000000931B9AFDF8 000000067F0000400200008A59000025BD50-000000067F0000400200008A59000026473E__000000044F29F379-00000004FED1E2E1 000000067F0000400200008A59000025C000-000000067F0000400200008A590000260000__0000000478F987C0 000000067F0000400200008A59000025C000-000000067F0000400200008A590000260000__0000001C760FA190 000000067F0000400200008A59000025C000-000000067F0000400200008A590000260000__00000038E67ABFA0 
000000067F0000400200008A59000025C000-000000067F0000400200008A590000260000__0000003903F1CFE8 000000067F0000400200008A59000025C000-000000067F0000400200008A590000260000__0000003B99F7F8A0 000000067F0000400200008A59000025C000-000000067F0000400200008A590000260000__0000005D2FFFFB38 000000067F0000400200008A59000025C000-000000067F0000400200008A590000260000__00000073AD3FE6B8 000000067F0000400200008A59000025C000-000000067F0000400200008A590000260000__000000914E3F38F0 000000067F0000400200008A59000025C000-000000067F0000400200008A590000260000__000000931B33AE68 000000067F0000400200008A59000025C000-000000067F0000400200008A590000260000__000000931B9AFDF8 000000067F0000400200008A590000260000-000000067F0000400200008A590000264000__0000000478F987C0 000000067F0000400200008A590000260000-000000067F0000400200008A590000264000__0000001C760FA190 000000067F0000400200008A590000260000-000000067F0000400200008A590000264000__00000038E67ABFA0 000000067F0000400200008A590000260000-000000067F0000400200008A590000264000__0000003903F1CFE8 000000067F0000400200008A590000260000-000000067F0000400200008A590000264000__0000003B99F7F8A0 000000067F0000400200008A590000260000-000000067F0000400200008A590000264000__0000005D2FFFFB38 000000067F0000400200008A590000260000-000000067F0000400200008A590000264000__00000073AD3FE6B8 000000067F0000400200008A590000260000-000000067F0000400200008A590000264000__000000914E3F38F0 000000067F0000400200008A590000260000-000000067F0000400200008A590000264000__000000931B33AE68 000000067F0000400200008A590000260000-000000067F0000400200008A590000264000__000000931B9AFDF8 000000067F0000400200008A590000264000-000000067F0000400200008A590000268000__0000000478F987C0 000000067F0000400200008A590000264000-000000067F0000400200008A590000268000__0000001C760FA190 000000067F0000400200008A590000264000-000000067F0000400200008A590000268000__00000038E67ABFA0 000000067F0000400200008A590000264000-000000067F0000400200008A590000268000__0000003903F1CFE8 
000000067F0000400200008A590000264000-000000067F0000400200008A590000268000__0000003B99F7F8A0 000000067F0000400200008A590000264000-000000067F0000400200008A590000268000__0000005D2FFFFB38 000000067F0000400200008A590000264000-000000067F0000400200008A590000268000__00000073AD3FE6B8 000000067F0000400200008A590000264000-000000067F0000400200008A590000268000__000000914E3F38F0 000000067F0000400200008A590000264000-000000067F0000400200008A590000268000__000000931B33AE68 000000067F0000400200008A590000264000-000000067F0000400200008A590000268000__000000931B9AFDF8 000000067F0000400200008A59000026473E-000000067F0000400200008A59000026D126__000000044F29F379-00000004FED1E2E1 000000067F0000400200008A590000268000-000000067F0000400200008A59000026C000__0000001C760FA190 000000067F0000400200008A590000268000-000000067F0000400200008A59000026C000__00000038E67ABFA0 000000067F0000400200008A590000268000-000000067F0000400200008A59000026C000__0000003903F1CFE8 000000067F0000400200008A590000268000-000000067F0000400200008A59000026C000__0000003B99F7F8A0 000000067F0000400200008A590000268000-000000067F0000400200008A59000026C000__0000005D2FFFFB38 000000067F0000400200008A590000268000-000000067F0000400200008A59000026C000__00000073AD3FE6B8 000000067F0000400200008A590000268000-000000067F0000400200008A59000026C000__000000914E3F38F0 000000067F0000400200008A590000268000-000000067F0000400200008A59000026C000__000000931B33AE68 000000067F0000400200008A590000268000-000000067F0000400200008A59000026C000__000000931B9AFDF8 000000067F0000400200008A590000268000-030000000000000000000000000000000002__0000000478F987C0 000000067F0000400200008A59000026C000-000000067F0000400200008A590000270000__0000001C760FA190 000000067F0000400200008A59000026C000-000000067F0000400200008A590000270000__00000038E67ABFA0 000000067F0000400200008A59000026C000-000000067F0000400200008A590000270000__0000003903F1CFE8 000000067F0000400200008A59000026C000-000000067F0000400200008A590000270000__0000003B99F7F8A0 
000000067F0000400200008A59000026C000-000000067F0000400200008A590000270000__0000005D2FFFFB38 000000067F0000400200008A59000026C000-000000067F0000400200008A590000270000__00000073AD3FE6B8 000000067F0000400200008A59000026C000-000000067F0000400200008A590000270000__000000914E3F38F0 000000067F0000400200008A59000026C000-000000067F0000400200008A590000270000__000000931B33AE68 000000067F0000400200008A59000026C000-000000067F0000400200008A590000270000__000000931B9AFDF8 000000067F0000400200008A59000026D126-000000067F0000400200008A590000275B09__000000044F29F379-00000004FED1E2E1 000000067F0000400200008A590000270000-000000067F0000400200008A590000274000__0000001C760FA190 000000067F0000400200008A590000270000-000000067F0000400200008A590000274000__00000038E67ABFA0 000000067F0000400200008A590000270000-000000067F0000400200008A590000274000__0000003903F1CFE8 000000067F0000400200008A590000270000-000000067F0000400200008A590000274000__0000003B99F7F8A0 000000067F0000400200008A590000270000-000000067F0000400200008A590000274000__0000005D2FFFFB38 000000067F0000400200008A590000270000-000000067F0000400200008A590000274000__00000073AD3FE6B8 000000067F0000400200008A590000270000-000000067F0000400200008A590000274000__000000914E3F38F0 000000067F0000400200008A590000270000-000000067F0000400200008A590000274000__000000931B33AE68 000000067F0000400200008A590000270000-000000067F0000400200008A590000274000__000000931B9AFDF8 000000067F0000400200008A590000274000-000000067F0000400200008A590000278000__0000001C760FA190 000000067F0000400200008A590000274000-000000067F0000400200008A590000278000__00000038E67ABFA0 000000067F0000400200008A590000274000-000000067F0000400200008A590000278000__0000003903F1CFE8 000000067F0000400200008A590000274000-000000067F0000400200008A590000278000__0000003B99F7F8A0 000000067F0000400200008A590000274000-000000067F0000400200008A590000278000__0000005D2FFFFB38 000000067F0000400200008A590000274000-000000067F0000400200008A590000278000__00000073AD3FE6B8 
000000067F0000400200008A590000274000-000000067F0000400200008A590000278000__000000914E3F38F0 000000067F0000400200008A590000274000-000000067F0000400200008A590000278000__000000931B33AE68 000000067F0000400200008A590000274000-000000067F0000400200008A590000278000__000000931B9AFDF8 000000067F0000400200008A590000275B09-000000067F0000400200008A59000027E4E0__000000044F29F379-00000004FED1E2E1 000000067F0000400200008A590000278000-000000067F0000400200008A59000027C000__0000001C760FA190 000000067F0000400200008A590000278000-000000067F0000400200008A59000027C000__00000038E67ABFA0 000000067F0000400200008A590000278000-000000067F0000400200008A59000027C000__0000003903F1CFE8 000000067F0000400200008A590000278000-000000067F0000400200008A59000027C000__0000003B99F7F8A0 000000067F0000400200008A590000278000-000000067F0000400200008A59000027C000__0000005D2FFFFB38 000000067F0000400200008A590000278000-000000067F0000400200008A59000027C000__00000073AD3FE6B8 000000067F0000400200008A590000278000-000000067F0000400200008A59000027C000__000000914E3F38F0 000000067F0000400200008A590000278000-000000067F0000400200008A59000027C000__000000931B33AE68 000000067F0000400200008A590000278000-000000067F0000400200008A59000027C000__000000931B9AFDF8 000000067F0000400200008A59000027C000-000000067F0000400200008A590000280000__0000001C760FA190 000000067F0000400200008A59000027C000-000000067F0000400200008A590000280000__00000038E67ABFA0 000000067F0000400200008A59000027C000-000000067F0000400200008A590000280000__0000003903F1CFE8 000000067F0000400200008A59000027C000-000000067F0000400200008A590000280000__0000003B99F7F8A0 000000067F0000400200008A59000027C000-000000067F0000400200008A590000280000__0000005D2FFFFB38 000000067F0000400200008A59000027C000-000000067F0000400200008A590000280000__00000073AD3FE6B8 000000067F0000400200008A59000027C000-000000067F0000400200008A590000280000__000000914E3F38F0 000000067F0000400200008A59000027C000-000000067F0000400200008A590000280000__000000931B33AE68 
000000067F0000400200008A59000027C000-000000067F0000400200008A590000280000__000000931B9AFDF8 000000067F0000400200008A59000027E4E0-000000067F0000400200008A590000286EB2__000000044F29F379-00000004FED1E2E1 000000067F0000400200008A590000280000-000000067F0000400200008A590000284000__0000001C760FA190 000000067F0000400200008A590000280000-000000067F0000400200008A590000284000__00000038E67ABFA0 000000067F0000400200008A590000280000-000000067F0000400200008A590000284000__0000003903F1CFE8 000000067F0000400200008A590000280000-000000067F0000400200008A590000284000__0000003B99F7F8A0 000000067F0000400200008A590000280000-000000067F0000400200008A590000284000__0000005D2FFFFB38 000000067F0000400200008A590000280000-000000067F0000400200008A590000284000__00000073AD3FE6B8 000000067F0000400200008A590000280000-000000067F0000400200008A590000284000__000000914E3F38F0 000000067F0000400200008A590000280000-000000067F0000400200008A590000284000__000000931B33AE68 000000067F0000400200008A590000280000-000000067F0000400200008A590000284000__000000931B9AFDF8 000000067F0000400200008A590000284000-000000067F0000400200008A590000288000__0000001C760FA190 000000067F0000400200008A590000284000-000000067F0000400200008A590000288000__00000038E67ABFA0 000000067F0000400200008A590000284000-000000067F0000400200008A590000288000__0000003903F1CFE8 000000067F0000400200008A590000284000-000000067F0000400200008A590000288000__0000003B99F7F8A0 000000067F0000400200008A590000284000-000000067F0000400200008A590000288000__0000005D2FFFFB38 000000067F0000400200008A590000284000-000000067F0000400200008A590000288000__00000073AD3FE6B8 000000067F0000400200008A590000284000-000000067F0000400200008A590000288000__000000914E3F38F0 000000067F0000400200008A590000284000-000000067F0000400200008A590000288000__000000931B33AE68 000000067F0000400200008A590000284000-000000067F0000400200008A590000288000__000000931B9AFDF8 000000067F0000400200008A590000286EB2-000000067F0000400200008A59000028F86E__000000044F29F379-00000004FED1E2E1 
000000067F0000400200008A590000288000-000000067F0000400200008A59000028C000__0000001C760FA190 000000067F0000400200008A590000288000-000000067F0000400200008A59000028C000__00000038E67ABFA0 000000067F0000400200008A590000288000-000000067F0000400200008A59000028C000__0000003903F1CFE8 000000067F0000400200008A590000288000-000000067F0000400200008A59000028C000__0000003B99F7F8A0 000000067F0000400200008A590000288000-000000067F0000400200008A59000028C000__0000005D2FFFFB38 000000067F0000400200008A590000288000-000000067F0000400200008A59000028C000__00000073AD3FE6B8 000000067F0000400200008A590000288000-000000067F0000400200008A59000028C000__000000914E3F38F0 000000067F0000400200008A590000288000-000000067F0000400200008A59000028C000__000000931B33AE68 000000067F0000400200008A590000288000-000000067F0000400200008A59000028C000__000000931B9AFDF8 000000067F0000400200008A59000028C000-000000067F0000400200008A590000290000__0000001C760FA190 000000067F0000400200008A59000028C000-000000067F0000400200008A590000290000__00000038E67ABFA0 000000067F0000400200008A59000028C000-000000067F0000400200008A590000290000__0000003903F1CFE8 000000067F0000400200008A59000028C000-000000067F0000400200008A590000290000__0000003B99F7F8A0 000000067F0000400200008A59000028C000-000000067F0000400200008A590000290000__0000005D2FFFFB38 000000067F0000400200008A59000028C000-000000067F0000400200008A590000290000__00000073AD3FE6B8 000000067F0000400200008A59000028C000-000000067F0000400200008A590000290000__000000914E3F38F0 000000067F0000400200008A59000028C000-000000067F0000400200008A590000290000__000000931B33AE68 000000067F0000400200008A59000028C000-000000067F0000400200008A590000290000__000000931B9AFDF8 000000067F0000400200008A59000028F86E-000000067F0000400200008A590000298236__000000044F29F379-00000004FED1E2E1 000000067F0000400200008A590000290000-000000067F0000400200008A590000294000__0000001C760FA190 000000067F0000400200008A590000290000-000000067F0000400200008A590000294000__00000038E67ABFA0 
000000067F0000400200008A590000290000-000000067F0000400200008A590000294000__0000003903F1CFE8 000000067F0000400200008A590000290000-000000067F0000400200008A590000294000__0000003B99F7F8A0 000000067F0000400200008A590000290000-000000067F0000400200008A590000294000__0000005D2FFFFB38 000000067F0000400200008A590000290000-000000067F0000400200008A590000294000__00000073AD3FE6B8 000000067F0000400200008A590000290000-000000067F0000400200008A590000294000__000000914E3F38F0 000000067F0000400200008A590000290000-000000067F0000400200008A590000294000__000000931B33AE68 000000067F0000400200008A590000290000-000000067F0000400200008A590000294000__000000931B9AFDF8 000000067F0000400200008A590000294000-000000067F0000400200008A590000298000__0000001C760FA190 000000067F0000400200008A590000294000-000000067F0000400200008A590000298000__00000038E67ABFA0 000000067F0000400200008A590000294000-000000067F0000400200008A590000298000__0000003903F1CFE8 000000067F0000400200008A590000294000-000000067F0000400200008A590000298000__0000003B99F7F8A0 000000067F0000400200008A590000294000-000000067F0000400200008A590000298000__0000005D2FFFFB38 000000067F0000400200008A590000294000-000000067F0000400200008A590000298000__00000073AD3FE6B8 000000067F0000400200008A590000294000-000000067F0000400200008A590000298000__000000914E3F38F0 000000067F0000400200008A590000294000-000000067F0000400200008A590000298000__000000931B33AE68 000000067F0000400200008A590000294000-000000067F0000400200008A590000298000__000000931B9AFDF8 000000067F0000400200008A590000298000-000000067F0000400200008A59000029C000__0000001C760FA190 000000067F0000400200008A590000298000-000000067F0000400200008A59000029C000__00000038E67ABFA0 000000067F0000400200008A590000298000-000000067F0000400200008A59000029C000__0000003903F1CFE8 000000067F0000400200008A590000298000-000000067F0000400200008A59000029C000__0000003B99F7F8A0 000000067F0000400200008A590000298000-000000067F0000400200008A59000029C000__0000005D2FFFFB38 
000000067F0000400200008A590000298000-000000067F0000400200008A59000029C000__00000073AD3FE6B8 000000067F0000400200008A590000298000-000000067F0000400200008A59000029C000__000000914E3F38F0 000000067F0000400200008A590000298000-000000067F0000400200008A59000029C000__000000931B33AE68 000000067F0000400200008A590000298000-000000067F0000400200008A59000029C000__000000931B9AFDF8 000000067F0000400200008A590000298236-000000067F0000400200008A5900002A0C2B__000000044F29F379-00000004FED1E2E1 000000067F0000400200008A59000029C000-000000067F0000400200008A5900002A0000__0000001C760FA190 000000067F0000400200008A59000029C000-000000067F0000400200008A5900002A0000__00000038E67ABFA0 000000067F0000400200008A59000029C000-000000067F0000400200008A5900002A0000__0000003903F1CFE8 000000067F0000400200008A59000029C000-000000067F0000400200008A5900002A0000__0000003B99F7F8A0 000000067F0000400200008A59000029C000-000000067F0000400200008A5900002A0000__0000005D2FFFFB38 000000067F0000400200008A59000029C000-000000067F0000400200008A5900002A0000__00000073AD3FE6B8 000000067F0000400200008A59000029C000-000000067F0000400200008A5900002A0000__000000914E3F38F0 000000067F0000400200008A59000029C000-000000067F0000400200008A5900002A0000__000000931B33AE68 000000067F0000400200008A59000029C000-000000067F0000400200008A5900002A0000__000000931B9AFDF8 000000067F0000400200008A5900002A0000-000000067F0000400200008A5900002A4000__0000001C760FA190 000000067F0000400200008A5900002A0000-000000067F0000400200008A5900002A4000__00000038E67ABFA0 000000067F0000400200008A5900002A0000-000000067F0000400200008A5900002A4000__0000003903F1CFE8 000000067F0000400200008A5900002A0000-000000067F0000400200008A5900002A4000__0000003B99F7F8A0 000000067F0000400200008A5900002A0000-000000067F0000400200008A5900002A4000__0000005D2FFFFB38 000000067F0000400200008A5900002A0000-000000067F0000400200008A5900002A4000__00000073AD3FE6B8 000000067F0000400200008A5900002A0000-000000067F0000400200008A5900002A4000__000000914E3F38F0 
000000067F0000400200008A5900002A0000-000000067F0000400200008A5900002A4000__000000931B33AE68 000000067F0000400200008A5900002A0000-000000067F0000400200008A5900002A4000__000000931B9AFDF8 000000067F0000400200008A5900002A0C2B-000000067F0000400200008A5900002A961E__000000044F29F379-00000004FED1E2E1 000000067F0000400200008A5900002A4000-000000067F0000400200008A5900002A8000__0000001C760FA190 000000067F0000400200008A5900002A4000-000000067F0000400200008A5900002A8000__00000038E67ABFA0 000000067F0000400200008A5900002A4000-000000067F0000400200008A5900002A8000__0000003903F1CFE8 000000067F0000400200008A5900002A4000-000000067F0000400200008A5900002A8000__0000003B99F7F8A0 000000067F0000400200008A5900002A4000-000000067F0000400200008A5900002A8000__0000005D2FFFFB38 000000067F0000400200008A5900002A4000-000000067F0000400200008A5900002A8000__00000073AD3FE6B8 000000067F0000400200008A5900002A4000-000000067F0000400200008A5900002A8000__000000914E3F38F0 000000067F0000400200008A5900002A4000-000000067F0000400200008A5900002A8000__000000931B33AE68 000000067F0000400200008A5900002A4000-000000067F0000400200008A5900002A8000__000000931B9AFDF8 000000067F0000400200008A5900002A8000-000000067F0000400200008A5900002AC000__0000001C760FA190 000000067F0000400200008A5900002A8000-000000067F0000400200008A5900002AC000__00000038E67ABFA0 000000067F0000400200008A5900002A8000-000000067F0000400200008A5900002AC000__0000003903F1CFE8 000000067F0000400200008A5900002A8000-000000067F0000400200008A5900002AC000__0000003B99F7F8A0 000000067F0000400200008A5900002A8000-000000067F0000400200008A5900002AC000__0000005D2FFFFB38 000000067F0000400200008A5900002A8000-000000067F0000400200008A5900002AC000__00000073AD3FE6B8 000000067F0000400200008A5900002A8000-000000067F0000400200008A5900002AC000__000000914E3F38F0 000000067F0000400200008A5900002A8000-000000067F0000400200008A5900002AC000__000000931B33AE68 000000067F0000400200008A5900002A8000-000000067F0000400200008A5900002AC000__000000931B9AFDF8 
000000067F0000400200008A5900002A961E-000000067F0000400200008A5900002B2001__000000044F29F379-00000004FED1E2E1 000000067F0000400200008A5900002AC000-000000067F0000400200008A5900002B0000__0000001C760FA190 000000067F0000400200008A5900002AC000-000000067F0000400200008A5900002B0000__00000038E67ABFA0 000000067F0000400200008A5900002AC000-000000067F0000400200008A5900002B0000__0000003903F1CFE8 000000067F0000400200008A5900002AC000-000000067F0000400200008A5900002B0000__0000003B99F7F8A0 000000067F0000400200008A5900002AC000-000000067F0000400200008A5900002B0000__0000005D2FFFFB38 000000067F0000400200008A5900002AC000-000000067F0000400200008A5900002B0000__00000073AD3FE6B8 000000067F0000400200008A5900002AC000-000000067F0000400200008A5900002B0000__000000914E3F38F0 000000067F0000400200008A5900002AC000-000000067F0000400200008A5900002B0000__000000931B33AE68 000000067F0000400200008A5900002AC000-000000067F0000400200008A5900002B0000__000000931B9AFDF8 000000067F0000400200008A5900002B0000-000000067F0000400200008A5900002B4000__0000001C725A2400 000000067F0000400200008A5900002B0000-000000067F0000400200008A5900002B4000__0000001C760FA190 000000067F0000400200008A5900002B0000-000000067F0000400200008A5900002B4000__00000038E67ABFA0 000000067F0000400200008A5900002B0000-000000067F0000400200008A5900002B4000__0000003903F1CFE8 000000067F0000400200008A5900002B0000-000000067F0000400200008A5900002B4000__0000003B99F7F8A0 000000067F0000400200008A5900002B0000-000000067F0000400200008A5900002B4000__0000005D2FFFFB38 000000067F0000400200008A5900002B0000-000000067F0000400200008A5900002B4000__00000073AD3FE6B8 000000067F0000400200008A5900002B0000-000000067F0000400200008A5900002B4000__000000914E3F38F0 000000067F0000400200008A5900002B0000-000000067F0000400200008A5900002B4000__000000931B33AE68 000000067F0000400200008A5900002B0000-000000067F0000400200008A5900002B4000__000000931B9AFDF8 000000067F0000400200008A5900002B2001-000000067F0000400200008A590100000000__000000044F29F379-00000004FED1E2E1 
000000067F0000400200008A5900002B2344-000000067F0000400200008A5900002BAD21__00000004FED1E2E1-000000059E81EB61 000000067F0000400200008A5900002B4000-000000067F0000400200008A5900002B8000__0000001C725A2400 000000067F0000400200008A5900002B4000-000000067F0000400200008A5900002B8000__0000001C760FA190 000000067F0000400200008A5900002B4000-000000067F0000400200008A5900002B8000__00000038E67ABFA0 000000067F0000400200008A5900002B4000-000000067F0000400200008A5900002B8000__0000003903F1CFE8 000000067F0000400200008A5900002B4000-000000067F0000400200008A5900002B8000__0000003B99F7F8A0 000000067F0000400200008A5900002B4000-000000067F0000400200008A5900002B8000__0000005D2FFFFB38 000000067F0000400200008A5900002B4000-000000067F0000400200008A5900002B8000__00000073AD3FE6B8 000000067F0000400200008A5900002B4000-000000067F0000400200008A5900002B8000__000000914E3F38F0 000000067F0000400200008A5900002B4000-000000067F0000400200008A5900002B8000__000000931B33AE68 000000067F0000400200008A5900002B4000-000000067F0000400200008A5900002B8000__000000931B9AFDF8 000000067F0000400200008A5900002B8000-000000067F0000400200008A5900002BC000__0000001C725A2400 000000067F0000400200008A5900002B8000-000000067F0000400200008A5900002BC000__0000001C760FA190 000000067F0000400200008A5900002B8000-000000067F0000400200008A5900002BC000__00000038E67ABFA0 000000067F0000400200008A5900002B8000-000000067F0000400200008A5900002BC000__0000003903F1CFE8 000000067F0000400200008A5900002B8000-000000067F0000400200008A5900002BC000__0000003B99F7F8A0 000000067F0000400200008A5900002B8000-000000067F0000400200008A5900002BC000__0000005D2FFFFB38 000000067F0000400200008A5900002B8000-000000067F0000400200008A5900002BC000__00000073AD3FE6B8 000000067F0000400200008A5900002B8000-000000067F0000400200008A5900002BC000__000000914E3F38F0 000000067F0000400200008A5900002B8000-000000067F0000400200008A5900002BC000__000000931B33AE68 000000067F0000400200008A5900002B8000-000000067F0000400200008A5900002BC000__000000931B9AFDF8 
000000067F0000400200008A5900002BAD21-000000067F0000400200008A5900002C36DD__00000004FED1E2E1-000000059E81EB61 000000067F0000400200008A5900002BC000-000000067F0000400200008A5900002C0000__0000001C725A2400 000000067F0000400200008A5900002BC000-000000067F0000400200008A5900002C0000__0000001C760FA190 000000067F0000400200008A5900002BC000-000000067F0000400200008A5900002C0000__00000038E67ABFA0 000000067F0000400200008A5900002BC000-000000067F0000400200008A5900002C0000__0000003903F1CFE8 000000067F0000400200008A5900002BC000-000000067F0000400200008A5900002C0000__0000003B99F7F8A0 000000067F0000400200008A5900002BC000-000000067F0000400200008A5900002C0000__0000005D2FFFFB38 000000067F0000400200008A5900002BC000-000000067F0000400200008A5900002C0000__00000073AD3FE6B8 000000067F0000400200008A5900002BC000-000000067F0000400200008A5900002C0000__000000914E3F38F0 000000067F0000400200008A5900002BC000-000000067F0000400200008A5900002C0000__000000931B33AE68 000000067F0000400200008A5900002BC000-000000067F0000400200008A5900002C0000__000000931B9AFDF8 000000067F0000400200008A5900002C0000-000000067F0000400200008A5900002C4000__0000001C725A2400 000000067F0000400200008A5900002C0000-000000067F0000400200008A5900002C4000__0000001C760FA190 000000067F0000400200008A5900002C0000-000000067F0000400200008A5900002C4000__00000038E67ABFA0 000000067F0000400200008A5900002C0000-000000067F0000400200008A5900002C4000__0000003903F1CFE8 000000067F0000400200008A5900002C0000-000000067F0000400200008A5900002C4000__0000003B99F7F8A0 000000067F0000400200008A5900002C0000-000000067F0000400200008A5900002C4000__0000005D2FFFFB38 000000067F0000400200008A5900002C0000-000000067F0000400200008A5900002C4000__00000073AD3FE6B8 000000067F0000400200008A5900002C0000-000000067F0000400200008A5900002C4000__000000914E3F38F0 000000067F0000400200008A5900002C0000-000000067F0000400200008A5900002C4000__000000931B33AE68 000000067F0000400200008A5900002C0000-000000067F0000400200008A5900002C4000__000000931B9AFDF8 
000000067F0000400200008A5900002C36DD-000000067F0000400200008A5900002CC0AA__00000004FED1E2E1-000000059E81EB61 000000067F0000400200008A5900002C4000-000000067F0000400200008A5900002C8000__0000001C725A2400 000000067F0000400200008A5900002C4000-000000067F0000400200008A5900002C8000__0000001C760FA190 000000067F0000400200008A5900002C4000-000000067F0000400200008A5900002C8000__00000038E67ABFA0 000000067F0000400200008A5900002C4000-000000067F0000400200008A5900002C8000__0000003903F1CFE8 000000067F0000400200008A5900002C4000-000000067F0000400200008A5900002C8000__0000003B99F7F8A0 000000067F0000400200008A5900002C4000-000000067F0000400200008A5900002C8000__0000005D2FFFFB38 000000067F0000400200008A5900002C4000-000000067F0000400200008A5900002C8000__00000073AD3FE6B8 000000067F0000400200008A5900002C4000-000000067F0000400200008A5900002C8000__000000914E3F38F0 000000067F0000400200008A5900002C4000-000000067F0000400200008A5900002C8000__000000931B33AE68 000000067F0000400200008A5900002C4000-000000067F0000400200008A5900002C8000__000000931B9AFDF8 000000067F0000400200008A5900002C8000-000000067F0000400200008A5900002CC000__0000001C725A2400 000000067F0000400200008A5900002C8000-000000067F0000400200008A5900002CC000__0000001C760FA190 000000067F0000400200008A5900002C8000-000000067F0000400200008A5900002CC000__00000038E67ABFA0 000000067F0000400200008A5900002C8000-000000067F0000400200008A5900002CC000__0000003903F1CFE8 000000067F0000400200008A5900002C8000-000000067F0000400200008A5900002CC000__0000003B99F7F8A0 000000067F0000400200008A5900002C8000-000000067F0000400200008A5900002CC000__0000005D2FFFFB38 000000067F0000400200008A5900002C8000-000000067F0000400200008A5900002CC000__00000073AD3FE6B8 000000067F0000400200008A5900002C8000-000000067F0000400200008A5900002CC000__000000914E3F38F0 000000067F0000400200008A5900002C8000-000000067F0000400200008A5900002CC000__000000931B33AE68 000000067F0000400200008A5900002C8000-000000067F0000400200008A5900002CC000__000000931B9AFDF8 
000000067F0000400200008A5900002CC000-000000067F0000400200008A5900002D0000__0000001C725A2400 000000067F0000400200008A5900002CC000-000000067F0000400200008A5900002D0000__0000001C760FA190 000000067F0000400200008A5900002CC000-000000067F0000400200008A5900002D0000__00000038E67ABFA0 000000067F0000400200008A5900002CC000-000000067F0000400200008A5900002D0000__0000003903F1CFE8 000000067F0000400200008A5900002CC000-000000067F0000400200008A5900002D0000__0000003B99F7F8A0 000000067F0000400200008A5900002CC000-000000067F0000400200008A5900002D0000__0000005D2FFFFB38 000000067F0000400200008A5900002CC000-000000067F0000400200008A5900002D0000__00000073AD3FE6B8 000000067F0000400200008A5900002CC000-000000067F0000400200008A5900002D0000__000000914E3F38F0 000000067F0000400200008A5900002CC000-000000067F0000400200008A5900002D0000__000000931B33AE68 000000067F0000400200008A5900002CC000-000000067F0000400200008A5900002D0000__000000931B9AFDF8 000000067F0000400200008A5900002CC0AA-000000067F0000400200008A5900002D4A82__00000004FED1E2E1-000000059E81EB61 000000067F0000400200008A5900002D0000-000000067F0000400200008A5900002D4000__0000001C725A2400 000000067F0000400200008A5900002D0000-000000067F0000400200008A5900002D4000__0000001C760FA190 000000067F0000400200008A5900002D0000-000000067F0000400200008A5900002D4000__00000038E67ABFA0 000000067F0000400200008A5900002D0000-000000067F0000400200008A5900002D4000__0000003903F1CFE8 000000067F0000400200008A5900002D0000-000000067F0000400200008A5900002D4000__0000003B99F7F8A0 000000067F0000400200008A5900002D0000-000000067F0000400200008A5900002D4000__0000005D2FFFFB38 000000067F0000400200008A5900002D0000-000000067F0000400200008A5900002D4000__00000073AD3FE6B8 000000067F0000400200008A5900002D0000-000000067F0000400200008A5900002D4000__000000914E3F38F0 000000067F0000400200008A5900002D0000-000000067F0000400200008A5900002D4000__000000931B33AE68 000000067F0000400200008A5900002D0000-000000067F0000400200008A5900002D4000__000000931B9AFDF8 
000000067F0000400200008A5900002D4000-000000067F0000400200008A5900002D8000__0000001C725A2400 000000067F0000400200008A5900002D4000-000000067F0000400200008A5900002D8000__0000001C760FA190 000000067F0000400200008A5900002D4000-000000067F0000400200008A5900002D8000__00000038E67ABFA0 000000067F0000400200008A5900002D4000-000000067F0000400200008A5900002D8000__0000003903F1CFE8 000000067F0000400200008A5900002D4000-000000067F0000400200008A5900002D8000__0000003B99F7F8A0 000000067F0000400200008A5900002D4000-000000067F0000400200008A5900002D8000__0000005D2FFFFB38 000000067F0000400200008A5900002D4000-000000067F0000400200008A5900002D8000__00000073AD3FE6B8 000000067F0000400200008A5900002D4000-000000067F0000400200008A5900002D8000__000000914E3F38F0 000000067F0000400200008A5900002D4000-000000067F0000400200008A5900002D8000__000000931B33AE68 000000067F0000400200008A5900002D4000-000000067F0000400200008A5900002D8000__000000931B9AFDF8 000000067F0000400200008A5900002D4A82-000000067F0000400200008A5900002DD480__00000004FED1E2E1-000000059E81EB61 000000067F0000400200008A5900002D8000-000000067F0000400200008A5900002DC000__0000001C725A2400 000000067F0000400200008A5900002D8000-000000067F0000400200008A5900002DC000__0000001C760FA190 000000067F0000400200008A5900002D8000-000000067F0000400200008A5900002DC000__00000038E67ABFA0 000000067F0000400200008A5900002D8000-000000067F0000400200008A5900002DC000__0000003903F1CFE8 000000067F0000400200008A5900002D8000-000000067F0000400200008A5900002DC000__0000003B99F7F8A0 000000067F0000400200008A5900002D8000-000000067F0000400200008A5900002DC000__0000005D2FFFFB38 000000067F0000400200008A5900002D8000-000000067F0000400200008A5900002DC000__00000073AD3FE6B8 000000067F0000400200008A5900002D8000-000000067F0000400200008A5900002DC000__000000914E3F38F0 000000067F0000400200008A5900002D8000-000000067F0000400200008A5900002DC000__000000931B33AE68 000000067F0000400200008A5900002D8000-000000067F0000400200008A5900002DC000__000000931B9AFDF8 
000000067F0000400200008A5900002DC000-000000067F0000400200008A5900002E0000__0000001C725A2400 000000067F0000400200008A5900002DC000-000000067F0000400200008A5900002E0000__0000001C760FA190 000000067F0000400200008A5900002DC000-000000067F0000400200008A5900002E0000__00000038E67ABFA0 000000067F0000400200008A5900002DC000-000000067F0000400200008A5900002E0000__0000003903F1CFE8 000000067F0000400200008A5900002DC000-000000067F0000400200008A5900002E0000__0000003B99F7F8A0 000000067F0000400200008A5900002DC000-000000067F0000400200008A5900002E0000__0000005D2FFFFB38 000000067F0000400200008A5900002DC000-000000067F0000400200008A5900002E0000__00000073AD3FE6B8 000000067F0000400200008A5900002DC000-000000067F0000400200008A5900002E0000__000000914E3F38F0 000000067F0000400200008A5900002DC000-000000067F0000400200008A5900002E0000__000000931B33AE68 000000067F0000400200008A5900002DC000-000000067F0000400200008A5900002E0000__000000931B9AFDF8 000000067F0000400200008A5900002DD480-000000067F0000400200008A5900002E5E6E__00000004FED1E2E1-000000059E81EB61 000000067F0000400200008A5900002E0000-000000067F0000400200008A5900002E4000__0000001C725A2400 000000067F0000400200008A5900002E0000-000000067F0000400200008A5900002E4000__0000001C760FA190 000000067F0000400200008A5900002E0000-000000067F0000400200008A5900002E4000__00000038E67ABFA0 000000067F0000400200008A5900002E0000-000000067F0000400200008A5900002E4000__0000003903F1CFE8 000000067F0000400200008A5900002E0000-000000067F0000400200008A5900002E4000__0000003B99F7F8A0 000000067F0000400200008A5900002E0000-000000067F0000400200008A5900002E4000__0000005D2FFFFB38 000000067F0000400200008A5900002E0000-000000067F0000400200008A5900002E4000__00000073AD3FE6B8 000000067F0000400200008A5900002E0000-000000067F0000400200008A5900002E4000__000000914E3F38F0 000000067F0000400200008A5900002E0000-000000067F0000400200008A5900002E4000__000000931B33AE68 000000067F0000400200008A5900002E0000-000000067F0000400200008A5900002E4000__000000931B9AFDF8 
000000067F0000400200008A5900002E4000-000000067F0000400200008A5900002E8000__0000001C725A2400 000000067F0000400200008A5900002E4000-000000067F0000400200008A5900002E8000__0000001C760FA190 000000067F0000400200008A5900002E4000-000000067F0000400200008A5900002E8000__00000038E67ABFA0 000000067F0000400200008A5900002E4000-000000067F0000400200008A5900002E8000__0000003903F1CFE8 000000067F0000400200008A5900002E4000-000000067F0000400200008A5900002E8000__0000003B99F7F8A0 000000067F0000400200008A5900002E4000-000000067F0000400200008A5900002E8000__0000005D2FFFFB38 000000067F0000400200008A5900002E4000-000000067F0000400200008A5900002E8000__00000073AD3FE6B8 000000067F0000400200008A5900002E4000-000000067F0000400200008A5900002E8000__000000914E3F38F0 000000067F0000400200008A5900002E4000-000000067F0000400200008A5900002E8000__000000931B33AE68 000000067F0000400200008A5900002E4000-000000067F0000400200008A5900002E8000__000000931B9AFDF8 000000067F0000400200008A5900002E5E6E-000000067F0000400200008A5900002EE857__00000004FED1E2E1-000000059E81EB61 000000067F0000400200008A5900002E8000-000000067F0000400200008A5900002EC000__0000001C725A2400 000000067F0000400200008A5900002E8000-000000067F0000400200008A5900002EC000__0000001C760FA190 000000067F0000400200008A5900002E8000-000000067F0000400200008A5900002EC000__00000038E67ABFA0 000000067F0000400200008A5900002E8000-000000067F0000400200008A5900002EC000__0000003903F1CFE8 000000067F0000400200008A5900002E8000-000000067F0000400200008A5900002EC000__0000003B99F7F8A0 000000067F0000400200008A5900002E8000-000000067F0000400200008A5900002EC000__0000005D2FFFFB38 000000067F0000400200008A5900002E8000-000000067F0000400200008A5900002EC000__00000073AD3FE6B8 000000067F0000400200008A5900002E8000-000000067F0000400200008A5900002EC000__000000914E3F38F0 000000067F0000400200008A5900002E8000-000000067F0000400200008A5900002EC000__000000931B33AE68 000000067F0000400200008A5900002E8000-000000067F0000400200008A5900002EC000__000000931B9AFDF8 
000000067F0000400200008A5900002EC000-000000067F0000400200008A5900002F0000__0000001C725A2400 000000067F0000400200008A5900002EC000-000000067F0000400200008A5900002F0000__0000001C760FA190 000000067F0000400200008A5900002EC000-000000067F0000400200008A5900002F0000__00000038E67ABFA0 000000067F0000400200008A5900002EC000-000000067F0000400200008A5900002F0000__0000003903F1CFE8 000000067F0000400200008A5900002EC000-000000067F0000400200008A5900002F0000__0000003B99F7F8A0 000000067F0000400200008A5900002EC000-000000067F0000400200008A5900002F0000__0000005D2FFFFB38 000000067F0000400200008A5900002EC000-000000067F0000400200008A5900002F0000__00000073AD3FE6B8 000000067F0000400200008A5900002EC000-000000067F0000400200008A5900002F0000__000000914E3F38F0 000000067F0000400200008A5900002EC000-000000067F0000400200008A5900002F0000__000000931B33AE68 000000067F0000400200008A5900002EC000-000000067F0000400200008A5900002F0000__000000931B9AFDF8 000000067F0000400200008A5900002EE857-000000067F0000400200008A5900002F722B__00000004FED1E2E1-000000059E81EB61 000000067F0000400200008A5900002F0000-000000067F0000400200008A5900002F4000__0000001C725A2400 000000067F0000400200008A5900002F0000-000000067F0000400200008A5900002F4000__0000001C760FA190 000000067F0000400200008A5900002F0000-000000067F0000400200008A5900002F4000__00000038E67ABFA0 000000067F0000400200008A5900002F0000-000000067F0000400200008A5900002F4000__0000003903F1CFE8 000000067F0000400200008A5900002F0000-000000067F0000400200008A5900002F4000__0000003B99F7F8A0 000000067F0000400200008A5900002F0000-000000067F0000400200008A5900002F4000__0000005D2FFFFB38 000000067F0000400200008A5900002F0000-000000067F0000400200008A5900002F4000__00000073AD3FE6B8 000000067F0000400200008A5900002F0000-000000067F0000400200008A5900002F4000__000000914E3F38F0 000000067F0000400200008A5900002F0000-000000067F0000400200008A5900002F4000__000000931B33AE68 000000067F0000400200008A5900002F0000-000000067F0000400200008A5900002F4000__000000931B9AFDF8 
000000067F0000400200008A5900002F4000-000000067F0000400200008A5900002F8000__0000001C725A2400 000000067F0000400200008A5900002F4000-000000067F0000400200008A5900002F8000__0000001C760FA190 000000067F0000400200008A5900002F4000-000000067F0000400200008A5900002F8000__00000038E67ABFA0 000000067F0000400200008A5900002F4000-000000067F0000400200008A5900002F8000__0000003903F1CFE8 000000067F0000400200008A5900002F4000-000000067F0000400200008A5900002F8000__0000003B99F7F8A0 000000067F0000400200008A5900002F4000-000000067F0000400200008A5900002F8000__0000005D2FFFFB38 000000067F0000400200008A5900002F4000-000000067F0000400200008A5900002F8000__00000073AD3FE6B8 000000067F0000400200008A5900002F4000-000000067F0000400200008A5900002F8000__000000914E3F38F0 000000067F0000400200008A5900002F4000-000000067F0000400200008A5900002F8000__000000931B33AE68 000000067F0000400200008A5900002F4000-000000067F0000400200008A5900002F8000__000000931B9AFDF8 000000067F0000400200008A5900002F722B-000000067F0000400200008A5900002FFBF0__00000004FED1E2E1-000000059E81EB61 000000067F0000400200008A5900002F8000-000000067F0000400200008A5900002FC000__0000001C725A2400 000000067F0000400200008A5900002F8000-000000067F0000400200008A5900002FC000__0000001C760FA190 000000067F0000400200008A5900002F8000-000000067F0000400200008A5900002FC000__00000038E67ABFA0 000000067F0000400200008A5900002F8000-000000067F0000400200008A5900002FC000__0000003903F1CFE8 000000067F0000400200008A5900002F8000-000000067F0000400200008A5900002FC000__0000003B99F7F8A0 000000067F0000400200008A5900002F8000-000000067F0000400200008A5900002FC000__0000005D2FFFFB38 000000067F0000400200008A5900002F8000-000000067F0000400200008A5900002FC000__00000073AD3FE6B8 000000067F0000400200008A5900002F8000-000000067F0000400200008A5900002FC000__000000914E3F38F0 000000067F0000400200008A5900002F8000-000000067F0000400200008A5900002FC000__000000931B33AE68 000000067F0000400200008A5900002F8000-000000067F0000400200008A5900002FC000__000000931B9AFDF8 
000000067F0000400200008A5900002FC000-000000067F0000400200008A590000300000__0000001C725A2400 000000067F0000400200008A5900002FC000-000000067F0000400200008A590000300000__0000001C760FA190 000000067F0000400200008A5900002FC000-000000067F0000400200008A590000300000__00000038E67ABFA0 000000067F0000400200008A5900002FC000-000000067F0000400200008A590000300000__0000003903F1CFE8 000000067F0000400200008A5900002FC000-000000067F0000400200008A590000300000__0000003B99F7F8A0 000000067F0000400200008A5900002FC000-000000067F0000400200008A590000300000__0000005D2FFFFB38 000000067F0000400200008A5900002FC000-000000067F0000400200008A590000300000__00000073AD3FE6B8 000000067F0000400200008A5900002FC000-000000067F0000400200008A590000300000__000000914E3F38F0 000000067F0000400200008A5900002FC000-000000067F0000400200008A590000300000__000000931B33AE68 000000067F0000400200008A5900002FC000-000000067F0000400200008A590000300000__000000931B9AFDF8 000000067F0000400200008A5900002FFBF0-000000067F0000400200008A5900003085CB__00000004FED1E2E1-000000059E81EB61 000000067F0000400200008A590000300000-000000067F0000400200008A590000304000__0000001C725A2400 000000067F0000400200008A590000300000-000000067F0000400200008A590000304000__0000001C760FA190 000000067F0000400200008A590000300000-000000067F0000400200008A590000304000__00000038E67ABFA0 000000067F0000400200008A590000300000-000000067F0000400200008A590000304000__0000003903F1CFE8 000000067F0000400200008A590000300000-000000067F0000400200008A590000304000__0000003B99F7F8A0 000000067F0000400200008A590000300000-000000067F0000400200008A590000304000__0000005D2FFFFB38 000000067F0000400200008A590000300000-000000067F0000400200008A590000304000__00000073AD3FE6B8 000000067F0000400200008A590000300000-000000067F0000400200008A590000304000__000000914E3F38F0 000000067F0000400200008A590000300000-000000067F0000400200008A590000304000__000000931B33AE68 000000067F0000400200008A590000300000-000000067F0000400200008A590000304000__000000931B9AFDF8 
000000067F0000400200008A590000304000-000000067F0000400200008A590000308000__0000001C725A2400 000000067F0000400200008A590000304000-000000067F0000400200008A590000308000__0000001C760FA190 000000067F0000400200008A590000304000-000000067F0000400200008A590000308000__00000038E67ABFA0 000000067F0000400200008A590000304000-000000067F0000400200008A590000308000__0000003903F1CFE8 000000067F0000400200008A590000304000-000000067F0000400200008A590000308000__0000003B99F7F8A0 000000067F0000400200008A590000304000-000000067F0000400200008A590000308000__0000005D2FFFFB38 000000067F0000400200008A590000304000-000000067F0000400200008A590000308000__00000073AD3FE6B8 000000067F0000400200008A590000304000-000000067F0000400200008A590000308000__000000914E3F38F0 000000067F0000400200008A590000304000-000000067F0000400200008A590000308000__000000931B33AE68 000000067F0000400200008A590000304000-000000067F0000400200008A590000308000__000000931B9AFDF8 000000067F0000400200008A590000308000-000000067F0000400200008A59000030C000__000000067DFFFF90 000000067F0000400200008A590000308000-000000067F0000400200008A59000030C000__0000001C760FA190 000000067F0000400200008A590000308000-000000067F0000400200008A59000030C000__00000038E67ABFA0 000000067F0000400200008A590000308000-000000067F0000400200008A59000030C000__0000003903F1CFE8 000000067F0000400200008A590000308000-000000067F0000400200008A59000030C000__0000003B99F7F8A0 000000067F0000400200008A590000308000-000000067F0000400200008A59000030C000__0000005D2FFFFB38 000000067F0000400200008A590000308000-000000067F0000400200008A59000030C000__00000073AD3FE6B8 000000067F0000400200008A590000308000-000000067F0000400200008A59000030C000__000000914E3F38F0 000000067F0000400200008A590000308000-000000067F0000400200008A59000030C000__000000931B33AE68 000000067F0000400200008A590000308000-000000067F0000400200008A59000030C000__000000931B9AFDF8 000000067F0000400200008A5900003085CB-000000067F0000400200008A590100000000__00000004FED1E2E1-000000059E81EB61 
000000067F0000400200008A590000308891-000000067F0000400200008A59000031126B__000000059E81EB61-000000064E25E851 000000067F0000400200008A59000030C000-000000067F0000400200008A590000310000__000000067DFFFF90 000000067F0000400200008A59000030C000-000000067F0000400200008A590000310000__0000001C760FA190 000000067F0000400200008A59000030C000-000000067F0000400200008A590000310000__00000038E67ABFA0 000000067F0000400200008A59000030C000-000000067F0000400200008A590000310000__0000003903F1CFE8 000000067F0000400200008A59000030C000-000000067F0000400200008A590000310000__0000003B99F7F8A0 000000067F0000400200008A59000030C000-000000067F0000400200008A590000310000__0000005D2FFFFB38 000000067F0000400200008A59000030C000-000000067F0000400200008A590000310000__00000073AD3FE6B8 000000067F0000400200008A59000030C000-000000067F0000400200008A590000310000__000000914E3F38F0 000000067F0000400200008A59000030C000-000000067F0000400200008A590000310000__000000931B33AE68 000000067F0000400200008A59000030C000-000000067F0000400200008A590000310000__000000931B9AFDF8 000000067F0000400200008A590000310000-000000067F0000400200008A590000314000__000000067DFFFF90 000000067F0000400200008A590000310000-000000067F0000400200008A590000314000__0000001C760FA190 000000067F0000400200008A590000310000-000000067F0000400200008A590000314000__00000038E67ABFA0 000000067F0000400200008A590000310000-000000067F0000400200008A590000314000__0000003903F1CFE8 000000067F0000400200008A590000310000-000000067F0000400200008A590000314000__0000003B99F7F8A0 000000067F0000400200008A590000310000-000000067F0000400200008A590000314000__0000005D2FFFFB38 000000067F0000400200008A590000310000-000000067F0000400200008A590000314000__00000073AD3FE6B8 000000067F0000400200008A590000310000-000000067F0000400200008A590000314000__000000914E3F38F0 000000067F0000400200008A590000310000-000000067F0000400200008A590000314000__000000931B33AE68 000000067F0000400200008A590000310000-000000067F0000400200008A590000314000__000000931B9AFDF8 
000000067F0000400200008A59000031126B-000000067F0000400200008A590000319C61__000000059E81EB61-000000064E25E851 000000067F0000400200008A590000314000-000000067F0000400200008A590000318000__000000067DFFFF90 000000067F0000400200008A590000314000-000000067F0000400200008A590000318000__0000001C760FA190 000000067F0000400200008A590000314000-000000067F0000400200008A590000318000__00000038E67ABFA0 000000067F0000400200008A590000314000-000000067F0000400200008A590000318000__0000003903F1CFE8 000000067F0000400200008A590000314000-000000067F0000400200008A590000318000__0000003B99F7F8A0 000000067F0000400200008A590000314000-000000067F0000400200008A590000318000__0000005D2FFFFB38 000000067F0000400200008A590000314000-000000067F0000400200008A590000318000__00000073AD3FE6B8 000000067F0000400200008A590000314000-000000067F0000400200008A590000318000__000000914E3F38F0 000000067F0000400200008A590000314000-000000067F0000400200008A590000318000__000000931B33AE68 000000067F0000400200008A590000314000-000000067F0000400200008A590000318000__000000931B9AFDF8 000000067F0000400200008A590000318000-000000067F0000400200008A59000031C000__000000067DFFFF90 000000067F0000400200008A590000318000-000000067F0000400200008A59000031C000__0000001C760FA190 000000067F0000400200008A590000318000-000000067F0000400200008A59000031C000__00000038E67ABFA0 000000067F0000400200008A590000318000-000000067F0000400200008A59000031C000__0000003903F1CFE8 000000067F0000400200008A590000318000-000000067F0000400200008A59000031C000__0000003B99F7F8A0 000000067F0000400200008A590000318000-000000067F0000400200008A59000031C000__0000005D2FFFFB38 000000067F0000400200008A590000318000-000000067F0000400200008A59000031C000__00000073AD3FE6B8 000000067F0000400200008A590000318000-000000067F0000400200008A59000031C000__000000914E3F38F0 000000067F0000400200008A590000318000-000000067F0000400200008A59000031C000__000000931B33AE68 000000067F0000400200008A590000318000-000000067F0000400200008A59000031C000__000000931B9AFDF8 
000000067F0000400200008A590000319C61-000000067F0000400200008A590000322645__000000059E81EB61-000000064E25E851 000000067F0000400200008A59000031C000-000000067F0000400200008A590000320000__000000067DFFFF90 000000067F0000400200008A59000031C000-000000067F0000400200008A590000320000__0000001C760FA190 000000067F0000400200008A59000031C000-000000067F0000400200008A590000320000__00000038E67ABFA0 000000067F0000400200008A59000031C000-000000067F0000400200008A590000320000__0000003903F1CFE8 000000067F0000400200008A59000031C000-000000067F0000400200008A590000320000__0000003B99F7F8A0 000000067F0000400200008A59000031C000-000000067F0000400200008A590000320000__0000005D2FFFFB38 000000067F0000400200008A59000031C000-000000067F0000400200008A590000320000__00000073AD3FE6B8 000000067F0000400200008A59000031C000-000000067F0000400200008A590000320000__000000914E3F38F0 000000067F0000400200008A59000031C000-000000067F0000400200008A590000320000__000000931B33AE68 000000067F0000400200008A59000031C000-000000067F0000400200008A590000320000__000000931B9AFDF8 000000067F0000400200008A590000320000-000000067F0000400200008A590000324000__000000067DFFFF90 000000067F0000400200008A590000320000-000000067F0000400200008A590000324000__0000001C760FA190 000000067F0000400200008A590000320000-000000067F0000400200008A590000324000__00000038E67ABFA0 000000067F0000400200008A590000320000-000000067F0000400200008A590000324000__0000003903F1CFE8 000000067F0000400200008A590000320000-000000067F0000400200008A590000324000__0000003B99F7F8A0 000000067F0000400200008A590000320000-000000067F0000400200008A590000324000__0000005D2FFFFB38 000000067F0000400200008A590000320000-000000067F0000400200008A590000324000__00000073AD3FE6B8 000000067F0000400200008A590000320000-000000067F0000400200008A590000324000__000000914E3F38F0 000000067F0000400200008A590000320000-000000067F0000400200008A590000324000__000000931B33AE68 000000067F0000400200008A590000320000-000000067F0000400200008A590000324000__000000931B9AFDF8 
000000067F0000400200008A590000322645-000000067F0000400200008A59000032B01B__000000059E81EB61-000000064E25E851 000000067F0000400200008A590000324000-000000067F0000400200008A590000328000__000000067DFFFF90 000000067F0000400200008A590000324000-000000067F0000400200008A590000328000__0000001C760FA190 000000067F0000400200008A590000324000-000000067F0000400200008A590000328000__00000038E67ABFA0 000000067F0000400200008A590000324000-000000067F0000400200008A590000328000__0000003903F1CFE8 000000067F0000400200008A590000324000-000000067F0000400200008A590000328000__0000003B99F7F8A0 000000067F0000400200008A590000324000-000000067F0000400200008A590000328000__0000005D2FFFFB38 000000067F0000400200008A590000324000-000000067F0000400200008A590000328000__00000073AD3FE6B8 000000067F0000400200008A590000324000-000000067F0000400200008A590000328000__000000914E3F38F0 000000067F0000400200008A590000324000-000000067F0000400200008A590000328000__000000931B33AE68 000000067F0000400200008A590000324000-000000067F0000400200008A590000328000__000000931B9AFDF8 000000067F0000400200008A590000328000-000000067F0000400200008A59000032C000__000000067DFFFF90 000000067F0000400200008A590000328000-000000067F0000400200008A59000032C000__0000001C760FA190 000000067F0000400200008A590000328000-000000067F0000400200008A59000032C000__00000038E67ABFA0 000000067F0000400200008A590000328000-000000067F0000400200008A59000032C000__0000003903F1CFE8 000000067F0000400200008A590000328000-000000067F0000400200008A59000032C000__0000003B99F7F8A0 000000067F0000400200008A590000328000-000000067F0000400200008A59000032C000__0000005D2FFFFB38 000000067F0000400200008A590000328000-000000067F0000400200008A59000032C000__00000073AD3FE6B8 000000067F0000400200008A590000328000-000000067F0000400200008A59000032C000__000000914E3F38F0 000000067F0000400200008A590000328000-000000067F0000400200008A59000032C000__000000931B33AE68 000000067F0000400200008A590000328000-000000067F0000400200008A59000032C000__000000931B9AFDF8 
000000067F0000400200008A59000032B01B-000000067F0000400200008A5900003339E7__000000059E81EB61-000000064E25E851 000000067F0000400200008A59000032C000-000000067F0000400200008A590000330000__000000067DFFFF90 000000067F0000400200008A59000032C000-000000067F0000400200008A590000330000__0000001C760FA190 000000067F0000400200008A59000032C000-000000067F0000400200008A590000330000__00000038E67ABFA0 000000067F0000400200008A59000032C000-000000067F0000400200008A590000330000__0000003903F1CFE8 000000067F0000400200008A59000032C000-000000067F0000400200008A590000330000__0000003B99F7F8A0 000000067F0000400200008A59000032C000-000000067F0000400200008A590000330000__0000005D2FFFFB38 000000067F0000400200008A59000032C000-000000067F0000400200008A590000330000__00000073AD3FE6B8 000000067F0000400200008A59000032C000-000000067F0000400200008A590000330000__000000914E3F38F0 000000067F0000400200008A59000032C000-000000067F0000400200008A590000330000__000000931B33AE68 000000067F0000400200008A59000032C000-000000067F0000400200008A590000330000__000000931B9AFDF8 000000067F0000400200008A590000330000-000000067F0000400200008A590000334000__000000067DFFFF90 000000067F0000400200008A590000330000-000000067F0000400200008A590000334000__0000001C760FA190 000000067F0000400200008A590000330000-000000067F0000400200008A590000334000__00000038E67ABFA0 000000067F0000400200008A590000330000-000000067F0000400200008A590000334000__0000003903F1CFE8 000000067F0000400200008A590000330000-000000067F0000400200008A590000334000__0000003B99F7F8A0 000000067F0000400200008A590000330000-000000067F0000400200008A590000334000__0000005D2FFFFB38 000000067F0000400200008A590000330000-000000067F0000400200008A590000334000__00000073AD3FE6B8 000000067F0000400200008A590000330000-000000067F0000400200008A590000334000__000000914E3F38F0 000000067F0000400200008A590000330000-000000067F0000400200008A590000334000__000000931B33AE68 000000067F0000400200008A590000330000-000000067F0000400200008A590000334000__000000931B9AFDF8 
000000067F0000400200008A5900003339E7-000000067F0000400200008A59000033C3C0__000000059E81EB61-000000064E25E851 000000067F0000400200008A590000334000-000000067F0000400200008A590000338000__000000067DFFFF90 000000067F0000400200008A590000334000-000000067F0000400200008A590000338000__0000001C760FA190 000000067F0000400200008A590000334000-000000067F0000400200008A590000338000__00000038E67ABFA0 000000067F0000400200008A590000334000-000000067F0000400200008A590000338000__0000003903F1CFE8 000000067F0000400200008A590000334000-000000067F0000400200008A590000338000__0000003B99F7F8A0 000000067F0000400200008A590000334000-000000067F0000400200008A590000338000__0000005D2FFFFB38 000000067F0000400200008A590000334000-000000067F0000400200008A590000338000__00000073AD3FE6B8 000000067F0000400200008A590000334000-000000067F0000400200008A590000338000__000000914E3F38F0 000000067F0000400200008A590000334000-000000067F0000400200008A590000338000__000000931B33AE68 000000067F0000400200008A590000334000-000000067F0000400200008A590000338000__000000931B9AFDF8 000000067F0000400200008A590000338000-000000067F0000400200008A59000033C000__000000067DFFFF90 000000067F0000400200008A590000338000-000000067F0000400200008A59000033C000__0000001C760FA190 000000067F0000400200008A590000338000-000000067F0000400200008A59000033C000__00000038E67ABFA0 000000067F0000400200008A590000338000-000000067F0000400200008A59000033C000__0000003903F1CFE8 000000067F0000400200008A590000338000-000000067F0000400200008A59000033C000__0000003B99F7F8A0 000000067F0000400200008A590000338000-000000067F0000400200008A59000033C000__0000005D2FFFFB38 000000067F0000400200008A590000338000-000000067F0000400200008A59000033C000__00000073AD3FE6B8 000000067F0000400200008A590000338000-000000067F0000400200008A59000033C000__000000914E3F38F0 000000067F0000400200008A590000338000-000000067F0000400200008A59000033C000__000000931B33AE68 000000067F0000400200008A590000338000-000000067F0000400200008A59000033C000__000000931B9AFDF8 
000000067F0000400200008A59000033C000-000000067F0000400200008A590000340000__000000067DFFFF90 000000067F0000400200008A59000033C000-000000067F0000400200008A590000340000__0000001C760FA190 000000067F0000400200008A59000033C000-000000067F0000400200008A590000340000__00000038E67ABFA0 000000067F0000400200008A59000033C000-000000067F0000400200008A590000340000__0000003903F1CFE8 000000067F0000400200008A59000033C000-000000067F0000400200008A590000340000__0000003B99F7F8A0 000000067F0000400200008A59000033C000-000000067F0000400200008A590000340000__0000005D2FFFFB38 000000067F0000400200008A59000033C000-000000067F0000400200008A590000340000__00000073AD3FE6B8 000000067F0000400200008A59000033C000-000000067F0000400200008A590000340000__000000914E3F38F0 000000067F0000400200008A59000033C000-000000067F0000400200008A590000340000__000000931B33AE68 000000067F0000400200008A59000033C000-000000067F0000400200008A590000340000__000000931B9AFDF8 000000067F0000400200008A59000033C3C0-000000067F0000400200008A590000344D8E__000000059E81EB61-000000064E25E851 000000067F0000400200008A590000340000-000000067F0000400200008A590000344000__000000067DFFFF90 000000067F0000400200008A590000340000-000000067F0000400200008A590000344000__0000001C760FA190 000000067F0000400200008A590000340000-000000067F0000400200008A590000344000__00000038E67ABFA0 000000067F0000400200008A590000340000-000000067F0000400200008A590000344000__0000003903F1CFE8 000000067F0000400200008A590000340000-000000067F0000400200008A590000344000__0000003B99F7F8A0 000000067F0000400200008A590000340000-000000067F0000400200008A590000344000__0000005D2FFFFB38 000000067F0000400200008A590000340000-000000067F0000400200008A590000344000__00000073AD3FE6B8 000000067F0000400200008A590000340000-000000067F0000400200008A590000344000__000000914E3F38F0 000000067F0000400200008A590000340000-000000067F0000400200008A590000344000__000000931B33AE68 000000067F0000400200008A590000340000-000000067F0000400200008A590000344000__000000931B9AFDF8 
000000067F0000400200008A590000344000-000000067F0000400200008A590000348000__000000067DFFFF90 000000067F0000400200008A590000344000-000000067F0000400200008A590000348000__0000001C760FA190 000000067F0000400200008A590000344000-000000067F0000400200008A590000348000__00000038E67ABFA0 000000067F0000400200008A590000344000-000000067F0000400200008A590000348000__0000003903F1CFE8 000000067F0000400200008A590000344000-000000067F0000400200008A590000348000__0000003B99F7F8A0 000000067F0000400200008A590000344000-000000067F0000400200008A590000348000__0000005D2FFFFB38 000000067F0000400200008A590000344000-000000067F0000400200008A590000348000__00000073AD3FE6B8 000000067F0000400200008A590000344000-000000067F0000400200008A590000348000__000000914E3F38F0 000000067F0000400200008A590000344000-000000067F0000400200008A590000348000__000000931B33AE68 000000067F0000400200008A590000344000-000000067F0000400200008A590000348000__000000931B9AFDF8 000000067F0000400200008A590000344D8E-000000067F0000400200008A59000034D773__000000059E81EB61-000000064E25E851 000000067F0000400200008A590000348000-000000067F0000400200008A59000034C000__000000067DFFFF90 000000067F0000400200008A590000348000-000000067F0000400200008A59000034C000__0000001C760FA190 000000067F0000400200008A590000348000-000000067F0000400200008A59000034C000__00000038E67ABFA0 000000067F0000400200008A590000348000-000000067F0000400200008A59000034C000__0000003903F1CFE8 000000067F0000400200008A590000348000-000000067F0000400200008A59000034C000__0000003B99F7F8A0 000000067F0000400200008A590000348000-000000067F0000400200008A59000034C000__0000005D2FFFFB38 000000067F0000400200008A590000348000-000000067F0000400200008A59000034C000__00000073AD3FE6B8 000000067F0000400200008A590000348000-000000067F0000400200008A59000034C000__000000914E3F38F0 000000067F0000400200008A590000348000-000000067F0000400200008A59000034C000__000000931B33AE68 000000067F0000400200008A590000348000-000000067F0000400200008A59000034C000__000000931B9AFDF8 
000000067F0000400200008A59000034C000-000000067F0000400200008A590000350000__000000067DFFFF90 000000067F0000400200008A59000034C000-000000067F0000400200008A590000350000__0000001C760FA190 000000067F0000400200008A59000034C000-000000067F0000400200008A590000350000__00000038E67ABFA0 000000067F0000400200008A59000034C000-000000067F0000400200008A590000350000__0000003903F1CFE8 000000067F0000400200008A59000034C000-000000067F0000400200008A590000350000__0000003B99F7F8A0 000000067F0000400200008A59000034C000-000000067F0000400200008A590000350000__0000005D2FFFFB38 000000067F0000400200008A59000034C000-000000067F0000400200008A590000350000__00000073AD3FE6B8 000000067F0000400200008A59000034C000-000000067F0000400200008A590000350000__000000914E3F38F0 000000067F0000400200008A59000034C000-000000067F0000400200008A590000350000__000000931B33AE68 000000067F0000400200008A59000034C000-000000067F0000400200008A590000350000__000000931B9AFDF8 000000067F0000400200008A59000034D773-000000067F0000400200008A590000356163__000000059E81EB61-000000064E25E851 000000067F0000400200008A590000350000-000000067F0000400200008A590000354000__000000067DFFFF90 000000067F0000400200008A590000350000-000000067F0000400200008A590000354000__0000001C760FA190 000000067F0000400200008A590000350000-000000067F0000400200008A590000354000__00000038E67ABFA0 000000067F0000400200008A590000350000-000000067F0000400200008A590000354000__0000003903F1CFE8 000000067F0000400200008A590000350000-000000067F0000400200008A590000354000__0000003B99F7F8A0 000000067F0000400200008A590000350000-000000067F0000400200008A590000354000__0000005D2FFFFB38 000000067F0000400200008A590000350000-000000067F0000400200008A590000354000__00000073AD3FE6B8 000000067F0000400200008A590000350000-000000067F0000400200008A590000354000__000000914E3F38F0 000000067F0000400200008A590000350000-000000067F0000400200008A590000354000__000000931B33AE68 000000067F0000400200008A590000350000-000000067F0000400200008A590000354000__000000931B9AFDF8 
000000067F0000400200008A590000354000-000000067F0000400200008A590000358000__000000067DFFFF90 000000067F0000400200008A590000354000-000000067F0000400200008A590000358000__0000001C760FA190 000000067F0000400200008A590000354000-000000067F0000400200008A590000358000__00000038E67ABFA0 000000067F0000400200008A590000354000-000000067F0000400200008A590000358000__0000003903F1CFE8 000000067F0000400200008A590000354000-000000067F0000400200008A590000358000__0000003B99F7F8A0 000000067F0000400200008A590000354000-000000067F0000400200008A590000358000__0000005D2FFFFB38 000000067F0000400200008A590000354000-000000067F0000400200008A590000358000__00000073AD3FE6B8 000000067F0000400200008A590000354000-000000067F0000400200008A590000358000__000000914E3F38F0 000000067F0000400200008A590000354000-000000067F0000400200008A590000358000__000000931B33AE68 000000067F0000400200008A590000354000-000000067F0000400200008A590000358000__000000931B9AFDF8 000000067F0000400200008A590000356163-000000067F0000400200008A59000035EB54__000000059E81EB61-000000064E25E851 000000067F0000400200008A590000358000-000000067F0000400200008A59000035C000__000000067DFFFF90 000000067F0000400200008A590000358000-000000067F0000400200008A59000035C000__0000001C760FA190 000000067F0000400200008A590000358000-000000067F0000400200008A59000035C000__00000038E67ABFA0 000000067F0000400200008A590000358000-000000067F0000400200008A59000035C000__0000003903F1CFE8 000000067F0000400200008A590000358000-000000067F0000400200008A59000035C000__0000003B99F7F8A0 000000067F0000400200008A590000358000-000000067F0000400200008A59000035C000__0000005D2FFFFB38 000000067F0000400200008A590000358000-000000067F0000400200008A59000035C000__00000073AD3FE6B8 000000067F0000400200008A590000358000-000000067F0000400200008A59000035C000__000000914E3F38F0 000000067F0000400200008A590000358000-000000067F0000400200008A59000035C000__000000931B33AE68 000000067F0000400200008A590000358000-000000067F0000400200008A59000035C000__000000931B9AFDF8 
000000067F0000400200008A59000035C000-000000067F0000400200008A590000360000__000000067DFFFF90 000000067F0000400200008A59000035C000-000000067F0000400200008A590000360000__0000001C760FA190 000000067F0000400200008A59000035C000-000000067F0000400200008A590000360000__00000038E67ABFA0 000000067F0000400200008A59000035C000-000000067F0000400200008A590000360000__0000003903F1CFE8 000000067F0000400200008A59000035C000-000000067F0000400200008A590000360000__0000003B99F7F8A0 000000067F0000400200008A59000035C000-000000067F0000400200008A590000360000__0000005D2FFFFB38 000000067F0000400200008A59000035C000-000000067F0000400200008A590000360000__00000073AD3FE6B8 000000067F0000400200008A59000035C000-000000067F0000400200008A590000360000__000000914E3F38F0 000000067F0000400200008A59000035C000-000000067F0000400200008A590000360000__000000931B33AE68 000000067F0000400200008A59000035C000-000000067F0000400200008A590000360000__000000931B9AFDF8 000000067F0000400200008A59000035EB54-000000067F0000400200008A59000036753C__000000059E81EB61-000000064E25E851 000000067F0000400200008A590000360000-000000067F0000400200008A590000364000__000000067DFFFF90 000000067F0000400200008A590000360000-000000067F0000400200008A590000364000__0000001C760FA190 000000067F0000400200008A590000360000-000000067F0000400200008A590000364000__00000038E67ABFA0 000000067F0000400200008A590000360000-000000067F0000400200008A590000364000__0000003903F1CFE8 000000067F0000400200008A590000360000-000000067F0000400200008A590000364000__0000003B99F7F8A0 000000067F0000400200008A590000360000-000000067F0000400200008A590000364000__0000005D2FFFFB38 000000067F0000400200008A590000360000-000000067F0000400200008A590000364000__00000073AD3FE6B8 000000067F0000400200008A590000360000-000000067F0000400200008A590000364000__000000914E3F38F0 000000067F0000400200008A590000360000-000000067F0000400200008A590000364000__000000931B33AE68 000000067F0000400200008A590000360000-000000067F0000400200008A590000364000__000000931B9AFDF8 
000000067F0000400200008A590000364000-000000067F0000400200008A590000368000__000000067DFFFF90 000000067F0000400200008A590000364000-000000067F0000400200008A590000368000__0000001C760FA190 000000067F0000400200008A590000364000-000000067F0000400200008A590000368000__00000038E67ABFA0 000000067F0000400200008A590000364000-000000067F0000400200008A590000368000__0000003903F1CFE8 000000067F0000400200008A590000364000-000000067F0000400200008A590000368000__0000003B99F7F8A0 000000067F0000400200008A590000364000-000000067F0000400200008A590000368000__0000005D2FFFFB38 000000067F0000400200008A590000364000-000000067F0000400200008A590000368000__00000073AD3FE6B8 000000067F0000400200008A590000364000-000000067F0000400200008A590000368000__000000914E3F38F0 000000067F0000400200008A590000364000-000000067F0000400200008A590000368000__000000931B33AE68 000000067F0000400200008A590000364000-000000067F0000400200008A590000368000__000000931B9AFDF8 000000067F0000400200008A59000036753C-000000067F0000400200008A590100000000__000000059E81EB61-000000064E25E851 000000067F0000400200008A59000036783E-000000067F0000400200008A590000370211__000000064E25E851-00000006FDCDDAF1 000000067F0000400200008A590000368000-000000067F0000400200008A59000036C000__000000067DFFFF90 000000067F0000400200008A590000368000-000000067F0000400200008A59000036C000__0000001C760FA190 000000067F0000400200008A590000368000-000000067F0000400200008A59000036C000__00000038E67ABFA0 000000067F0000400200008A590000368000-000000067F0000400200008A59000036C000__0000003903F1CFE8 000000067F0000400200008A590000368000-000000067F0000400200008A59000036C000__0000003B99F7F8A0 000000067F0000400200008A590000368000-000000067F0000400200008A59000036C000__0000005D2FFFFB38 000000067F0000400200008A590000368000-000000067F0000400200008A59000036C000__00000073AD3FE6B8 000000067F0000400200008A590000368000-000000067F0000400200008A59000036C000__000000914E3F38F0 000000067F0000400200008A590000368000-000000067F0000400200008A59000036C000__000000931B33AE68 
000000067F0000400200008A590000368000-000000067F0000400200008A59000036C000__000000931B9AFDF8 000000067F0000400200008A59000036C000-000000067F0000400200008A590000370000__000000067DFFFF90 000000067F0000400200008A59000036C000-000000067F0000400200008A590000370000__0000001C760FA190 000000067F0000400200008A59000036C000-000000067F0000400200008A590000370000__00000038E67ABFA0 000000067F0000400200008A59000036C000-000000067F0000400200008A590000370000__0000003903F1CFE8 000000067F0000400200008A59000036C000-000000067F0000400200008A590000370000__0000003B99F7F8A0 000000067F0000400200008A59000036C000-000000067F0000400200008A590000370000__0000005D2FFFFB38 000000067F0000400200008A59000036C000-000000067F0000400200008A590000370000__00000073AD3FE6B8 000000067F0000400200008A59000036C000-000000067F0000400200008A590000370000__000000914E3F38F0 000000067F0000400200008A59000036C000-000000067F0000400200008A590000370000__000000931B33AE68 000000067F0000400200008A59000036C000-000000067F0000400200008A590000370000__000000931B9AFDF8 000000067F0000400200008A590000370000-000000067F0000400200008A590000374000__000000067DFFFF90 000000067F0000400200008A590000370000-000000067F0000400200008A590000374000__0000001C760FA190 000000067F0000400200008A590000370000-000000067F0000400200008A590000374000__00000038E67ABFA0 000000067F0000400200008A590000370000-000000067F0000400200008A590000374000__0000003903F1CFE8 000000067F0000400200008A590000370000-000000067F0000400200008A590000374000__0000003B99F7F8A0 000000067F0000400200008A590000370000-000000067F0000400200008A590000374000__0000005D2FFFFB38 000000067F0000400200008A590000370000-000000067F0000400200008A590000374000__00000073AD3FE6B8 000000067F0000400200008A590000370000-000000067F0000400200008A590000374000__000000914E3F38F0 000000067F0000400200008A590000370000-000000067F0000400200008A590000374000__000000931B33AE68 000000067F0000400200008A590000370000-000000067F0000400200008A590000374000__000000931B9AFDF8 
000000067F0000400200008A590000370211-000000067F0000400200008A590000378BCB__000000064E25E851-00000006FDCDDAF1 000000067F0000400200008A590000374000-000000067F0000400200008A590000378000__000000067DFFFF90 000000067F0000400200008A590000374000-000000067F0000400200008A590000378000__0000001C760FA190 000000067F0000400200008A590000374000-000000067F0000400200008A590000378000__00000038E67ABFA0 000000067F0000400200008A590000374000-000000067F0000400200008A590000378000__0000003903F1CFE8 000000067F0000400200008A590000374000-000000067F0000400200008A590000378000__0000003B99F7F8A0 000000067F0000400200008A590000374000-000000067F0000400200008A590000378000__0000005D2FFFFB38 000000067F0000400200008A590000374000-000000067F0000400200008A590000378000__00000073AD3FE6B8 000000067F0000400200008A590000374000-000000067F0000400200008A590000378000__000000914E3F38F0 000000067F0000400200008A590000374000-000000067F0000400200008A590000378000__000000931B33AE68 000000067F0000400200008A590000374000-000000067F0000400200008A590000378000__000000931B9AFDF8 000000067F0000400200008A590000378000-000000067F0000400200008A59000037C000__000000067DFFFF90 000000067F0000400200008A590000378000-000000067F0000400200008A59000037C000__0000001C760FA190 000000067F0000400200008A590000378000-000000067F0000400200008A59000037C000__00000038E67ABFA0 000000067F0000400200008A590000378000-000000067F0000400200008A59000037C000__0000003903F1CFE8 000000067F0000400200008A590000378000-000000067F0000400200008A59000037C000__0000003B99F7F8A0 000000067F0000400200008A590000378000-000000067F0000400200008A59000037C000__0000005D2FFFFB38 000000067F0000400200008A590000378000-000000067F0000400200008A59000037C000__00000073AD3FE6B8 000000067F0000400200008A590000378000-000000067F0000400200008A59000037C000__000000914E3F38F0 000000067F0000400200008A590000378000-000000067F0000400200008A59000037C000__000000931B33AE68 000000067F0000400200008A590000378000-000000067F0000400200008A59000037C000__000000931B9AFDF8 
000000067F0000400200008A590000378BCB-000000067F0000400200008A590000381599__000000064E25E851-00000006FDCDDAF1 000000067F0000400200008A59000037C000-000000067F0000400200008A590000380000__000000067DFFFF90 000000067F0000400200008A59000037C000-000000067F0000400200008A590000380000__0000001C760FA190 000000067F0000400200008A59000037C000-000000067F0000400200008A590000380000__00000038E67ABFA0 000000067F0000400200008A59000037C000-000000067F0000400200008A590000380000__0000003903F1CFE8 000000067F0000400200008A59000037C000-000000067F0000400200008A590000380000__0000003B99F7F8A0 000000067F0000400200008A59000037C000-000000067F0000400200008A590000380000__0000005D2FFFFB38 000000067F0000400200008A59000037C000-000000067F0000400200008A590000380000__00000073AD3FE6B8 000000067F0000400200008A59000037C000-000000067F0000400200008A590000380000__000000914E3F38F0 000000067F0000400200008A59000037C000-000000067F0000400200008A590000380000__000000931B33AE68 000000067F0000400200008A59000037C000-000000067F0000400200008A590000380000__000000931B9AFDF8 000000067F0000400200008A590000380000-000000067F0000400200008A590000384000__0000001C760FA190 000000067F0000400200008A590000380000-000000067F0000400200008A590000384000__00000038E67ABFA0 000000067F0000400200008A590000380000-000000067F0000400200008A590000384000__0000003903F1CFE8 000000067F0000400200008A590000380000-000000067F0000400200008A590000384000__0000003B99F7F8A0 000000067F0000400200008A590000380000-000000067F0000400200008A590000384000__0000005D2FFFFB38 000000067F0000400200008A590000380000-000000067F0000400200008A590000384000__00000073AD3FE6B8 000000067F0000400200008A590000380000-000000067F0000400200008A590000384000__000000914E3F38F0 000000067F0000400200008A590000380000-000000067F0000400200008A590000384000__000000931B33AE68 000000067F0000400200008A590000380000-000000067F0000400200008A590000384000__000000931B9AFDF8 000000067F0000400200008A590000380000-030000000000000000000000000000000002__000000067DFFFF90 
000000067F0000400200008A590000381599-000000067F0000400200008A590000389F86__000000064E25E851-00000006FDCDDAF1 000000067F0000400200008A590000384000-000000067F0000400200008A590000388000__0000001C760FA190 000000067F0000400200008A590000384000-000000067F0000400200008A590000388000__00000038E67ABFA0 000000067F0000400200008A590000384000-000000067F0000400200008A590000388000__0000003903F1CFE8 000000067F0000400200008A590000384000-000000067F0000400200008A590000388000__0000003B99F7F8A0 000000067F0000400200008A590000384000-000000067F0000400200008A590000388000__0000005D2FFFFB38 000000067F0000400200008A590000384000-000000067F0000400200008A590000388000__00000073AD3FE6B8 000000067F0000400200008A590000384000-000000067F0000400200008A590000388000__000000914E3F38F0 000000067F0000400200008A590000384000-000000067F0000400200008A590000388000__000000931B33AE68 000000067F0000400200008A590000384000-000000067F0000400200008A590000388000__000000931B9AFDF8 000000067F0000400200008A590000388000-000000067F0000400200008A59000038C000__0000001C760FA190 000000067F0000400200008A590000388000-000000067F0000400200008A59000038C000__00000038E67ABFA0 000000067F0000400200008A590000388000-000000067F0000400200008A59000038C000__0000003903F1CFE8 000000067F0000400200008A590000388000-000000067F0000400200008A59000038C000__0000003B99F7F8A0 000000067F0000400200008A590000388000-000000067F0000400200008A59000038C000__0000005D2FFFFB38 000000067F0000400200008A590000388000-000000067F0000400200008A59000038C000__00000073AD3FE6B8 000000067F0000400200008A590000388000-000000067F0000400200008A59000038C000__000000914E3F38F0 000000067F0000400200008A590000388000-000000067F0000400200008A59000038C000__000000931B33AE68 000000067F0000400200008A590000388000-000000067F0000400200008A59000038C000__000000931B9AFDF8 000000067F0000400200008A590000389F86-000000067F0000400200008A590000392976__000000064E25E851-00000006FDCDDAF1 000000067F0000400200008A59000038C000-000000067F0000400200008A590000390000__0000001C760FA190 
000000067F0000400200008A59000038C000-000000067F0000400200008A590000390000__00000038E67ABFA0 000000067F0000400200008A59000038C000-000000067F0000400200008A590000390000__0000003903F1CFE8 000000067F0000400200008A59000038C000-000000067F0000400200008A590000390000__0000003B99F7F8A0 000000067F0000400200008A59000038C000-000000067F0000400200008A590000390000__0000005D2FFFFB38 000000067F0000400200008A59000038C000-000000067F0000400200008A590000390000__00000073AD3FE6B8 000000067F0000400200008A59000038C000-000000067F0000400200008A590000390000__000000914E3F38F0 000000067F0000400200008A59000038C000-000000067F0000400200008A590000390000__000000931B33AE68 000000067F0000400200008A59000038C000-000000067F0000400200008A590000390000__000000931B9AFDF8 000000067F0000400200008A590000390000-000000067F0000400200008A590000394000__0000001C760FA190 000000067F0000400200008A590000390000-000000067F0000400200008A590000394000__00000038E67ABFA0 000000067F0000400200008A590000390000-000000067F0000400200008A590000394000__0000003903F1CFE8 000000067F0000400200008A590000390000-000000067F0000400200008A590000394000__0000003B99F7F8A0 000000067F0000400200008A590000390000-000000067F0000400200008A590000394000__0000005D2FFFFB38 000000067F0000400200008A590000390000-000000067F0000400200008A590000394000__00000073AD3FE6B8 000000067F0000400200008A590000390000-000000067F0000400200008A590000394000__000000914E3F38F0 000000067F0000400200008A590000390000-000000067F0000400200008A590000394000__000000931B33AE68 000000067F0000400200008A590000390000-000000067F0000400200008A590000394000__000000931B9AFDF8 000000067F0000400200008A590000392976-000000067F0000400200008A59000039B366__000000064E25E851-00000006FDCDDAF1 000000067F0000400200008A590000394000-000000067F0000400200008A590000398000__0000001C760FA190 000000067F0000400200008A590000394000-000000067F0000400200008A590000398000__00000038E67ABFA0 000000067F0000400200008A590000394000-000000067F0000400200008A590000398000__0000003903F1CFE8 
000000067F0000400200008A590000394000-000000067F0000400200008A590000398000__0000003B99F7F8A0 000000067F0000400200008A590000394000-000000067F0000400200008A590000398000__0000005D2FFFFB38 000000067F0000400200008A590000394000-000000067F0000400200008A590000398000__00000073AD3FE6B8 000000067F0000400200008A590000394000-000000067F0000400200008A590000398000__000000914E3F38F0 000000067F0000400200008A590000394000-000000067F0000400200008A590000398000__000000931B33AE68 000000067F0000400200008A590000394000-000000067F0000400200008A590000398000__000000931B9AFDF8 000000067F0000400200008A590000398000-000000067F0000400200008A59000039C000__0000001C760FA190 000000067F0000400200008A590000398000-000000067F0000400200008A59000039C000__00000038E67ABFA0 000000067F0000400200008A590000398000-000000067F0000400200008A59000039C000__0000003903F1CFE8 000000067F0000400200008A590000398000-000000067F0000400200008A59000039C000__0000003B99F7F8A0 000000067F0000400200008A590000398000-000000067F0000400200008A59000039C000__0000005D2FFFFB38 000000067F0000400200008A590000398000-000000067F0000400200008A59000039C000__00000073AD3FE6B8 000000067F0000400200008A590000398000-000000067F0000400200008A59000039C000__000000914E3F38F0 000000067F0000400200008A590000398000-000000067F0000400200008A59000039C000__000000931B33AE68 000000067F0000400200008A590000398000-000000067F0000400200008A59000039C000__000000931B9AFDF8 000000067F0000400200008A59000039B366-000000067F0000400200008A5900003A3D42__000000064E25E851-00000006FDCDDAF1 000000067F0000400200008A59000039C000-000000067F0000400200008A5900003A0000__0000001C760FA190 000000067F0000400200008A59000039C000-000000067F0000400200008A5900003A0000__00000038E67ABFA0 000000067F0000400200008A59000039C000-000000067F0000400200008A5900003A0000__0000003903F1CFE8 000000067F0000400200008A59000039C000-000000067F0000400200008A5900003A0000__0000003B99F7F8A0 000000067F0000400200008A59000039C000-000000067F0000400200008A5900003A0000__0000005D2FFFFB38 
000000067F0000400200008A59000039C000-000000067F0000400200008A5900003A0000__00000073AD3FE6B8 000000067F0000400200008A59000039C000-000000067F0000400200008A5900003A0000__000000914E3F38F0 000000067F0000400200008A59000039C000-000000067F0000400200008A5900003A0000__000000931B33AE68 000000067F0000400200008A59000039C000-000000067F0000400200008A5900003A0000__000000931B9AFDF8 000000067F0000400200008A5900003A0000-000000067F0000400200008A5900003A4000__0000001C760FA190 000000067F0000400200008A5900003A0000-000000067F0000400200008A5900003A4000__00000038E67ABFA0 000000067F0000400200008A5900003A0000-000000067F0000400200008A5900003A4000__0000003903F1CFE8 000000067F0000400200008A5900003A0000-000000067F0000400200008A5900003A4000__0000003B99F7F8A0 000000067F0000400200008A5900003A0000-000000067F0000400200008A5900003A4000__0000005D2FFFFB38 000000067F0000400200008A5900003A0000-000000067F0000400200008A5900003A4000__00000073AD3FE6B8 000000067F0000400200008A5900003A0000-000000067F0000400200008A5900003A4000__000000914E3F38F0 000000067F0000400200008A5900003A0000-000000067F0000400200008A5900003A4000__000000931B33AE68 000000067F0000400200008A5900003A0000-000000067F0000400200008A5900003A4000__000000931B9AFDF8 000000067F0000400200008A5900003A3D42-000000067F0000400200008A5900003AC710__000000064E25E851-00000006FDCDDAF1 000000067F0000400200008A5900003A4000-000000067F0000400200008A5900003A8000__0000001C760FA190 000000067F0000400200008A5900003A4000-000000067F0000400200008A5900003A8000__00000038E67ABFA0 000000067F0000400200008A5900003A4000-000000067F0000400200008A5900003A8000__0000003903F1CFE8 000000067F0000400200008A5900003A4000-000000067F0000400200008A5900003A8000__0000003B99F7F8A0 000000067F0000400200008A5900003A4000-000000067F0000400200008A5900003A8000__0000005D2FFFFB38 000000067F0000400200008A5900003A4000-000000067F0000400200008A5900003A8000__00000073AD3FE6B8 000000067F0000400200008A5900003A4000-000000067F0000400200008A5900003A8000__000000914E3F38F0 
000000067F0000400200008A5900003A4000-000000067F0000400200008A5900003A8000__000000931B33AE68 000000067F0000400200008A5900003A4000-000000067F0000400200008A5900003A8000__000000931B9AFDF8 000000067F0000400200008A5900003A8000-000000067F0000400200008A5900003AC000__0000001C760FA190 000000067F0000400200008A5900003A8000-000000067F0000400200008A5900003AC000__00000038E67ABFA0 000000067F0000400200008A5900003A8000-000000067F0000400200008A5900003AC000__0000003903F1CFE8 000000067F0000400200008A5900003A8000-000000067F0000400200008A5900003AC000__0000003B99F7F8A0 000000067F0000400200008A5900003A8000-000000067F0000400200008A5900003AC000__0000005D2FFFFB38 000000067F0000400200008A5900003A8000-000000067F0000400200008A5900003AC000__00000073AD3FE6B8 000000067F0000400200008A5900003A8000-000000067F0000400200008A5900003AC000__000000914E3F38F0 000000067F0000400200008A5900003A8000-000000067F0000400200008A5900003AC000__000000931B33AE68 000000067F0000400200008A5900003A8000-000000067F0000400200008A5900003AC000__000000931B9AFDF8 000000067F0000400200008A5900003AC000-000000067F0000400200008A5900003B0000__0000001C760FA190 000000067F0000400200008A5900003AC000-000000067F0000400200008A5900003B0000__00000038E67ABFA0 000000067F0000400200008A5900003AC000-000000067F0000400200008A5900003B0000__0000003903F1CFE8 000000067F0000400200008A5900003AC000-000000067F0000400200008A5900003B0000__0000003B99F7F8A0 000000067F0000400200008A5900003AC000-000000067F0000400200008A5900003B0000__0000005D2FFFFB38 000000067F0000400200008A5900003AC000-000000067F0000400200008A5900003B0000__00000073AD3FE6B8 000000067F0000400200008A5900003AC000-000000067F0000400200008A5900003B0000__000000914E3F38F0 000000067F0000400200008A5900003AC000-000000067F0000400200008A5900003B0000__000000931B33AE68 000000067F0000400200008A5900003AC000-000000067F0000400200008A5900003B0000__000000931B9AFDF8 000000067F0000400200008A5900003AC710-000000067F0000400200008A5900003B50C6__000000064E25E851-00000006FDCDDAF1 
000000067F0000400200008A5900003B0000-000000067F0000400200008A5900003B4000__0000001C760FA190 000000067F0000400200008A5900003B0000-000000067F0000400200008A5900003B4000__00000038E67ABFA0 000000067F0000400200008A5900003B0000-000000067F0000400200008A5900003B4000__0000003903F1CFE8 000000067F0000400200008A5900003B0000-000000067F0000400200008A5900003B4000__0000003B99F7F8A0 000000067F0000400200008A5900003B0000-000000067F0000400200008A5900003B4000__0000005D2FFFFB38 000000067F0000400200008A5900003B0000-000000067F0000400200008A5900003B4000__00000073AD3FE6B8 000000067F0000400200008A5900003B0000-000000067F0000400200008A5900003B4000__000000914E3F38F0 000000067F0000400200008A5900003B0000-000000067F0000400200008A5900003B4000__000000931B33AE68 000000067F0000400200008A5900003B0000-000000067F0000400200008A5900003B4000__000000931B9AFDF8 000000067F0000400200008A5900003B4000-000000067F0000400200008A5900003B8000__0000001C760FA190 000000067F0000400200008A5900003B4000-000000067F0000400200008A5900003B8000__00000038E67ABFA0 000000067F0000400200008A5900003B4000-000000067F0000400200008A5900003B8000__0000003903F1CFE8 000000067F0000400200008A5900003B4000-000000067F0000400200008A5900003B8000__0000003B99F7F8A0 000000067F0000400200008A5900003B4000-000000067F0000400200008A5900003B8000__0000005D2FFFFB38 000000067F0000400200008A5900003B4000-000000067F0000400200008A5900003B8000__00000073AD3FE6B8 000000067F0000400200008A5900003B4000-000000067F0000400200008A5900003B8000__000000914E3F38F0 000000067F0000400200008A5900003B4000-000000067F0000400200008A5900003B8000__000000931B33AE68 000000067F0000400200008A5900003B4000-000000067F0000400200008A5900003B8000__000000931B9AFDF8 000000067F0000400200008A5900003B50C6-000000067F0000400200008A5900003BDA8D__000000064E25E851-00000006FDCDDAF1 000000067F0000400200008A5900003B8000-000000067F0000400200008A5900003BC000__0000001C760FA190 000000067F0000400200008A5900003B8000-000000067F0000400200008A5900003BC000__00000038E67ABFA0 
000000067F0000400200008A5900003B8000-000000067F0000400200008A5900003BC000__0000003903F1CFE8 000000067F0000400200008A5900003B8000-000000067F0000400200008A5900003BC000__0000003B99F7F8A0 000000067F0000400200008A5900003B8000-000000067F0000400200008A5900003BC000__0000005D2FFFFB38 000000067F0000400200008A5900003B8000-000000067F0000400200008A5900003BC000__00000073AD3FE6B8 000000067F0000400200008A5900003B8000-000000067F0000400200008A5900003BC000__000000914E3F38F0 000000067F0000400200008A5900003B8000-000000067F0000400200008A5900003BC000__000000931B33AE68 000000067F0000400200008A5900003B8000-000000067F0000400200008A5900003BC000__000000931B9AFDF8 000000067F0000400200008A5900003BC000-000000067F0000400200008A5900003C0000__0000001C760FA190 000000067F0000400200008A5900003BC000-000000067F0000400200008A5900003C0000__00000038E67ABFA0 000000067F0000400200008A5900003BC000-000000067F0000400200008A5900003C0000__0000003903F1CFE8 000000067F0000400200008A5900003BC000-000000067F0000400200008A5900003C0000__0000003B99F7F8A0 000000067F0000400200008A5900003BC000-000000067F0000400200008A5900003C0000__0000005D2FFFFB38 000000067F0000400200008A5900003BC000-000000067F0000400200008A5900003C0000__00000073AD3FE6B8 000000067F0000400200008A5900003BC000-000000067F0000400200008A5900003C0000__000000914E3F38F0 000000067F0000400200008A5900003BC000-000000067F0000400200008A5900003C0000__000000931B33AE68 000000067F0000400200008A5900003BC000-000000067F0000400200008A5900003C0000__000000931B9AFDF8 000000067F0000400200008A5900003BDA8D-000000067F0000400200008A5900003C648A__000000064E25E851-00000006FDCDDAF1 000000067F0000400200008A5900003C0000-000000067F0000400200008A5900003C4000__0000001C760FA190 000000067F0000400200008A5900003C0000-000000067F0000400200008A5900003C4000__00000038E67ABFA0 000000067F0000400200008A5900003C0000-000000067F0000400200008A5900003C4000__0000003903F1CFE8 000000067F0000400200008A5900003C0000-000000067F0000400200008A5900003C4000__0000003B99F7F8A0 
000000067F0000400200008A5900003C0000-000000067F0000400200008A5900003C4000__0000005D2FFFFB38 000000067F0000400200008A5900003C0000-000000067F0000400200008A5900003C4000__00000073AD3FE6B8 000000067F0000400200008A5900003C0000-000000067F0000400200008A5900003C4000__000000914E3F38F0 000000067F0000400200008A5900003C0000-000000067F0000400200008A5900003C4000__000000931B33AE68 000000067F0000400200008A5900003C0000-000000067F0000400200008A5900003C4000__000000931B9AFDF8 000000067F0000400200008A5900003C4000-000000067F0000400200008A5900003C8000__0000001C725A2400 000000067F0000400200008A5900003C4000-000000067F0000400200008A5900003C8000__0000001C760FA190 000000067F0000400200008A5900003C4000-000000067F0000400200008A5900003C8000__00000038E67ABFA0 000000067F0000400200008A5900003C4000-000000067F0000400200008A5900003C8000__0000003903F1CFE8 000000067F0000400200008A5900003C4000-000000067F0000400200008A5900003C8000__0000003B99F7F8A0 000000067F0000400200008A5900003C4000-000000067F0000400200008A5900003C8000__0000005D2FFFFB38 000000067F0000400200008A5900003C4000-000000067F0000400200008A5900003C8000__00000073AD3FE6B8 000000067F0000400200008A5900003C4000-000000067F0000400200008A5900003C8000__000000914E3F38F0 000000067F0000400200008A5900003C4000-000000067F0000400200008A5900003C8000__000000931B33AE68 000000067F0000400200008A5900003C4000-000000067F0000400200008A5900003C8000__000000931B9AFDF8 000000067F0000400200008A5900003C648A-000000067F0000400200008A590100000000__000000064E25E851-00000006FDCDDAF1 000000067F0000400200008A5900003C67B6-000000067F0000400200008A5900003CF1B7__00000006FDCDDAF1-00000007AD75F249 000000067F0000400200008A5900003C8000-000000067F0000400200008A5900003CC000__0000001C725A2400 000000067F0000400200008A5900003C8000-000000067F0000400200008A5900003CC000__0000001C760FA190 000000067F0000400200008A5900003C8000-000000067F0000400200008A5900003CC000__00000038E67ABFA0 000000067F0000400200008A5900003C8000-000000067F0000400200008A5900003CC000__0000003903F1CFE8 
000000067F0000400200008A5900003C8000-000000067F0000400200008A5900003CC000__0000003B99F7F8A0 000000067F0000400200008A5900003C8000-000000067F0000400200008A5900003CC000__0000005D2FFFFB38 000000067F0000400200008A5900003C8000-000000067F0000400200008A5900003CC000__00000073AD3FE6B8 000000067F0000400200008A5900003C8000-000000067F0000400200008A5900003CC000__000000914E3F38F0 000000067F0000400200008A5900003C8000-000000067F0000400200008A5900003CC000__000000931B33AE68 000000067F0000400200008A5900003C8000-000000067F0000400200008A5900003CC000__000000931B9AFDF8 000000067F0000400200008A5900003CC000-000000067F0000400200008A5900003D0000__0000001C725A2400 000000067F0000400200008A5900003CC000-000000067F0000400200008A5900003D0000__0000001C760FA190 000000067F0000400200008A5900003CC000-000000067F0000400200008A5900003D0000__00000038E67ABFA0 000000067F0000400200008A5900003CC000-000000067F0000400200008A5900003D0000__0000003903F1CFE8 000000067F0000400200008A5900003CC000-000000067F0000400200008A5900003D0000__0000003B99F7F8A0 000000067F0000400200008A5900003CC000-000000067F0000400200008A5900003D0000__0000005D2FFFFB38 000000067F0000400200008A5900003CC000-000000067F0000400200008A5900003D0000__00000073AD3FE6B8 000000067F0000400200008A5900003CC000-000000067F0000400200008A5900003D0000__000000914E3F38F0 000000067F0000400200008A5900003CC000-000000067F0000400200008A5900003D0000__000000931B33AE68 000000067F0000400200008A5900003CC000-000000067F0000400200008A5900003D0000__000000931B9AFDF8 000000067F0000400200008A5900003CF1B7-000000067F0000400200008A5900003D7BAC__00000006FDCDDAF1-00000007AD75F249 000000067F0000400200008A5900003D0000-000000067F0000400200008A5900003D4000__0000001C725A2400 000000067F0000400200008A5900003D0000-000000067F0000400200008A5900003D4000__0000001C760FA190 000000067F0000400200008A5900003D0000-000000067F0000400200008A5900003D4000__00000038E67ABFA0 000000067F0000400200008A5900003D0000-000000067F0000400200008A5900003D4000__0000003903F1CFE8 
000000067F0000400200008A5900003D0000-000000067F0000400200008A5900003D4000__0000003B99F7F8A0 000000067F0000400200008A5900003D0000-000000067F0000400200008A5900003D4000__0000005D2FFFFB38 000000067F0000400200008A5900003D0000-000000067F0000400200008A5900003D4000__00000073AD3FE6B8 000000067F0000400200008A5900003D0000-000000067F0000400200008A5900003D4000__000000914E3F38F0 000000067F0000400200008A5900003D0000-000000067F0000400200008A5900003D4000__000000931B33AE68 000000067F0000400200008A5900003D0000-000000067F0000400200008A5900003D4000__000000931B9AFDF8 000000067F0000400200008A5900003D4000-000000067F0000400200008A5900003D8000__0000001C725A2400 000000067F0000400200008A5900003D4000-000000067F0000400200008A5900003D8000__0000001C760FA190 000000067F0000400200008A5900003D4000-000000067F0000400200008A5900003D8000__00000038E67ABFA0 000000067F0000400200008A5900003D4000-000000067F0000400200008A5900003D8000__0000003903F1CFE8 000000067F0000400200008A5900003D4000-000000067F0000400200008A5900003D8000__0000003B99F7F8A0 000000067F0000400200008A5900003D4000-000000067F0000400200008A5900003D8000__0000005D2FFFFB38 000000067F0000400200008A5900003D4000-000000067F0000400200008A5900003D8000__00000073AD3FE6B8 000000067F0000400200008A5900003D4000-000000067F0000400200008A5900003D8000__000000914E3F38F0 000000067F0000400200008A5900003D4000-000000067F0000400200008A5900003D8000__000000931B33AE68 000000067F0000400200008A5900003D4000-000000067F0000400200008A5900003D8000__000000931B9AFDF8 000000067F0000400200008A5900003D7BAC-000000067F0000400200008A5900003E0586__00000006FDCDDAF1-00000007AD75F249 000000067F0000400200008A5900003D8000-000000067F0000400200008A5900003DC000__0000001C725A2400 000000067F0000400200008A5900003D8000-000000067F0000400200008A5900003DC000__0000001C760FA190 000000067F0000400200008A5900003D8000-000000067F0000400200008A5900003DC000__00000038E67ABFA0 000000067F0000400200008A5900003D8000-000000067F0000400200008A5900003DC000__0000003903F1CFE8 
000000067F0000400200008A5900003D8000-000000067F0000400200008A5900003DC000__0000003B99F7F8A0 000000067F0000400200008A5900003D8000-000000067F0000400200008A5900003DC000__0000005D2FFFFB38 000000067F0000400200008A5900003D8000-000000067F0000400200008A5900003DC000__00000073AD3FE6B8 000000067F0000400200008A5900003D8000-000000067F0000400200008A5900003DC000__000000914E3F38F0 000000067F0000400200008A5900003D8000-000000067F0000400200008A5900003DC000__000000931B33AE68 000000067F0000400200008A5900003D8000-000000067F0000400200008A5900003DC000__000000931B9AFDF8 000000067F0000400200008A5900003DC000-000000067F0000400200008A5900003E0000__0000001C725A2400 000000067F0000400200008A5900003DC000-000000067F0000400200008A5900003E0000__0000001C760FA190 000000067F0000400200008A5900003DC000-000000067F0000400200008A5900003E0000__00000038E67ABFA0 000000067F0000400200008A5900003DC000-000000067F0000400200008A5900003E0000__0000003903F1CFE8 000000067F0000400200008A5900003DC000-000000067F0000400200008A5900003E0000__0000003B99F7F8A0 000000067F0000400200008A5900003DC000-000000067F0000400200008A5900003E0000__0000005D2FFFFB38 000000067F0000400200008A5900003DC000-000000067F0000400200008A5900003E0000__00000073AD3FE6B8 000000067F0000400200008A5900003DC000-000000067F0000400200008A5900003E0000__000000914E3F38F0 000000067F0000400200008A5900003DC000-000000067F0000400200008A5900003E0000__000000931B33AE68 000000067F0000400200008A5900003DC000-000000067F0000400200008A5900003E0000__000000931B9AFDF8 000000067F0000400200008A5900003E0000-000000067F0000400200008A5900003E4000__0000001C725A2400 000000067F0000400200008A5900003E0000-000000067F0000400200008A5900003E4000__0000001C760FA190 000000067F0000400200008A5900003E0000-000000067F0000400200008A5900003E4000__00000038E67ABFA0 000000067F0000400200008A5900003E0000-000000067F0000400200008A5900003E4000__0000003903F1CFE8 000000067F0000400200008A5900003E0000-000000067F0000400200008A5900003E4000__0000003B99F7F8A0 
000000067F0000400200008A5900003E0000-000000067F0000400200008A5900003E4000__0000005D2FFFFB38 000000067F0000400200008A5900003E0000-000000067F0000400200008A5900003E4000__00000073AD3FE6B8 000000067F0000400200008A5900003E0000-000000067F0000400200008A5900003E4000__000000914E3F38F0 000000067F0000400200008A5900003E0000-000000067F0000400200008A5900003E4000__000000931B33AE68 000000067F0000400200008A5900003E0000-000000067F0000400200008A5900003E4000__000000931B9AFDF8 000000067F0000400200008A5900003E0586-000000067F0000400200008A5900003E8F57__00000006FDCDDAF1-00000007AD75F249 000000067F0000400200008A5900003E4000-000000067F0000400200008A5900003E8000__0000001C725A2400 000000067F0000400200008A5900003E4000-000000067F0000400200008A5900003E8000__0000001C760FA190 000000067F0000400200008A5900003E4000-000000067F0000400200008A5900003E8000__00000038E67ABFA0 000000067F0000400200008A5900003E4000-000000067F0000400200008A5900003E8000__0000003903F1CFE8 000000067F0000400200008A5900003E4000-000000067F0000400200008A5900003E8000__0000003B99F7F8A0 000000067F0000400200008A5900003E4000-000000067F0000400200008A5900003E8000__0000005D2FFFFB38 000000067F0000400200008A5900003E4000-000000067F0000400200008A5900003E8000__00000073AD3FE6B8 000000067F0000400200008A5900003E4000-000000067F0000400200008A5900003E8000__000000914E3F38F0 000000067F0000400200008A5900003E4000-000000067F0000400200008A5900003E8000__000000931B33AE68 000000067F0000400200008A5900003E4000-000000067F0000400200008A5900003E8000__000000931B9AFDF8 000000067F0000400200008A5900003E8000-000000067F0000400200008A5900003EC000__0000001C725A2400 000000067F0000400200008A5900003E8000-000000067F0000400200008A5900003EC000__0000001C760FA190 000000067F0000400200008A5900003E8000-000000067F0000400200008A5900003EC000__00000038E67ABFA0 000000067F0000400200008A5900003E8000-000000067F0000400200008A5900003EC000__0000003903F1CFE8 000000067F0000400200008A5900003E8000-000000067F0000400200008A5900003EC000__0000003B99F7F8A0 
000000067F0000400200008A5900003E8000-000000067F0000400200008A5900003EC000__0000005D2FFFFB38 000000067F0000400200008A5900003E8000-000000067F0000400200008A5900003EC000__00000073AD3FE6B8 000000067F0000400200008A5900003E8000-000000067F0000400200008A5900003EC000__000000914E3F38F0 000000067F0000400200008A5900003E8000-000000067F0000400200008A5900003EC000__000000931B33AE68 000000067F0000400200008A5900003E8000-000000067F0000400200008A5900003EC000__000000931B9AFDF8 000000067F0000400200008A5900003E8F57-000000067F0000400200008A5900003F1912__00000006FDCDDAF1-00000007AD75F249 000000067F0000400200008A5900003EC000-000000067F0000400200008A5900003F0000__0000001C725A2400 000000067F0000400200008A5900003EC000-000000067F0000400200008A5900003F0000__0000001C760FA190 000000067F0000400200008A5900003EC000-000000067F0000400200008A5900003F0000__00000038E67ABFA0 000000067F0000400200008A5900003EC000-000000067F0000400200008A5900003F0000__0000003903F1CFE8 000000067F0000400200008A5900003EC000-000000067F0000400200008A5900003F0000__0000003B99F7F8A0 000000067F0000400200008A5900003EC000-000000067F0000400200008A5900003F0000__0000005D2FFFFB38 000000067F0000400200008A5900003EC000-000000067F0000400200008A5900003F0000__00000073AD3FE6B8 000000067F0000400200008A5900003EC000-000000067F0000400200008A5900003F0000__000000914E3F38F0 000000067F0000400200008A5900003EC000-000000067F0000400200008A5900003F0000__000000931B33AE68 000000067F0000400200008A5900003EC000-000000067F0000400200008A5900003F0000__000000931B9AFDF8 000000067F0000400200008A5900003F0000-000000067F0000400200008A5900003F4000__0000001C725A2400 000000067F0000400200008A5900003F0000-000000067F0000400200008A5900003F4000__0000001C760FA190 000000067F0000400200008A5900003F0000-000000067F0000400200008A5900003F4000__00000038E67ABFA0 000000067F0000400200008A5900003F0000-000000067F0000400200008A5900003F4000__0000003903F1CFE8 000000067F0000400200008A5900003F0000-000000067F0000400200008A5900003F4000__0000003B99F7F8A0 
000000067F0000400200008A5900003F0000-000000067F0000400200008A5900003F4000__0000005D2FFFFB38 000000067F0000400200008A5900003F0000-000000067F0000400200008A5900003F4000__00000073AD3FE6B8 000000067F0000400200008A5900003F0000-000000067F0000400200008A5900003F4000__000000914E3F38F0 000000067F0000400200008A5900003F0000-000000067F0000400200008A5900003F4000__000000931B33AE68 000000067F0000400200008A5900003F0000-000000067F0000400200008A5900003F4000__000000931B9AFDF8 000000067F0000400200008A5900003F1912-000000067F0000400200008A5900003FA2D9__00000006FDCDDAF1-00000007AD75F249 000000067F0000400200008A5900003F4000-000000067F0000400200008A5900003F8000__0000001C725A2400 000000067F0000400200008A5900003F4000-000000067F0000400200008A5900003F8000__0000001C760FA190 000000067F0000400200008A5900003F4000-000000067F0000400200008A5900003F8000__00000038E67ABFA0 000000067F0000400200008A5900003F4000-000000067F0000400200008A5900003F8000__0000003903F1CFE8 000000067F0000400200008A5900003F4000-000000067F0000400200008A5900003F8000__0000003B99F7F8A0 000000067F0000400200008A5900003F4000-000000067F0000400200008A5900003F8000__0000005D2FFFFB38 000000067F0000400200008A5900003F4000-000000067F0000400200008A5900003F8000__00000073AD3FE6B8 000000067F0000400200008A5900003F4000-000000067F0000400200008A5900003F8000__000000914E3F38F0 000000067F0000400200008A5900003F4000-000000067F0000400200008A5900003F8000__000000931B33AE68 000000067F0000400200008A5900003F4000-000000067F0000400200008A5900003F8000__000000931B9AFDF8 000000067F0000400200008A5900003F8000-000000067F0000400200008A5900003FC000__0000001C725A2400 000000067F0000400200008A5900003F8000-000000067F0000400200008A5900003FC000__0000001C760FA190 000000067F0000400200008A5900003F8000-000000067F0000400200008A5900003FC000__00000038E67ABFA0 000000067F0000400200008A5900003F8000-000000067F0000400200008A5900003FC000__0000003903F1CFE8 000000067F0000400200008A5900003F8000-000000067F0000400200008A5900003FC000__0000003B99F7F8A0 
000000067F0000400200008A5900003F8000-000000067F0000400200008A5900003FC000__0000005D2FFFFB38 000000067F0000400200008A5900003F8000-000000067F0000400200008A5900003FC000__00000073AD3FE6B8 000000067F0000400200008A5900003F8000-000000067F0000400200008A5900003FC000__000000914E3F38F0 000000067F0000400200008A5900003F8000-000000067F0000400200008A5900003FC000__000000931B33AE68 000000067F0000400200008A5900003F8000-000000067F0000400200008A5900003FC000__000000931B9AFDF8 000000067F0000400200008A5900003FA2D9-000000067F0000400200008A590000402CDA__00000006FDCDDAF1-00000007AD75F249 000000067F0000400200008A5900003FC000-000000067F0000400200008A590000400000__0000001C725A2400 000000067F0000400200008A5900003FC000-000000067F0000400200008A590000400000__0000001C760FA190 000000067F0000400200008A5900003FC000-000000067F0000400200008A590000400000__00000038E67ABFA0 000000067F0000400200008A5900003FC000-000000067F0000400200008A590000400000__0000003903F1CFE8 000000067F0000400200008A5900003FC000-000000067F0000400200008A590000400000__0000003B99F7F8A0 000000067F0000400200008A5900003FC000-000000067F0000400200008A590000400000__0000005D2FFFFB38 000000067F0000400200008A5900003FC000-000000067F0000400200008A590000400000__00000073AD3FE6B8 000000067F0000400200008A5900003FC000-000000067F0000400200008A590000400000__000000914E3F38F0 000000067F0000400200008A5900003FC000-000000067F0000400200008A590000400000__000000931B33AE68 000000067F0000400200008A5900003FC000-000000067F0000400200008A590000400000__000000931B9AFDF8 000000067F0000400200008A590000400000-000000067F0000400200008A590000404000__0000001C725A2400 000000067F0000400200008A590000400000-000000067F0000400200008A590000404000__0000001C760FA190 000000067F0000400200008A590000400000-000000067F0000400200008A590000404000__00000038E67ABFA0 000000067F0000400200008A590000400000-000000067F0000400200008A590000404000__0000003903F1CFE8 000000067F0000400200008A590000400000-000000067F0000400200008A590000404000__0000003B99F7F8A0 
000000067F0000400200008A590000400000-000000067F0000400200008A590000404000__0000005D2FFFFB38 000000067F0000400200008A590000400000-000000067F0000400200008A590000404000__00000073AD3FE6B8 000000067F0000400200008A590000400000-000000067F0000400200008A590000404000__000000914E3F38F0 000000067F0000400200008A590000400000-000000067F0000400200008A590000404000__000000931B33AE68 000000067F0000400200008A590000400000-000000067F0000400200008A590000404000__000000931B9AFDF8 000000067F0000400200008A590000402CDA-000000067F0000400200008A59000040B6DB__00000006FDCDDAF1-00000007AD75F249 000000067F0000400200008A590000404000-000000067F0000400200008A590000408000__0000001C725A2400 000000067F0000400200008A590000404000-000000067F0000400200008A590000408000__0000001C760FA190 000000067F0000400200008A590000404000-000000067F0000400200008A590000408000__00000038E67ABFA0 000000067F0000400200008A590000404000-000000067F0000400200008A590000408000__0000003903F1CFE8 000000067F0000400200008A590000404000-000000067F0000400200008A590000408000__0000003B99F7F8A0 000000067F0000400200008A590000404000-000000067F0000400200008A590000408000__0000005D2FFFFB38 000000067F0000400200008A590000404000-000000067F0000400200008A590000408000__00000073AD3FE6B8 000000067F0000400200008A590000404000-000000067F0000400200008A590000408000__000000914E3F38F0 000000067F0000400200008A590000404000-000000067F0000400200008A590000408000__000000931B33AE68 000000067F0000400200008A590000404000-000000067F0000400200008A590000408000__000000931B9AFDF8 000000067F0000400200008A590000408000-000000067F0000400200008A59000040C000__0000001C725A2400 000000067F0000400200008A590000408000-000000067F0000400200008A59000040C000__0000001C760FA190 000000067F0000400200008A590000408000-000000067F0000400200008A59000040C000__00000038E67ABFA0 000000067F0000400200008A590000408000-000000067F0000400200008A59000040C000__0000003903F1CFE8 000000067F0000400200008A590000408000-000000067F0000400200008A59000040C000__0000003B99F7F8A0 
000000067F0000400200008A590000408000-000000067F0000400200008A59000040C000__0000005D2FFFFB38 000000067F0000400200008A590000408000-000000067F0000400200008A59000040C000__00000073AD3FE6B8 000000067F0000400200008A590000408000-000000067F0000400200008A59000040C000__000000914E3F38F0 000000067F0000400200008A590000408000-000000067F0000400200008A59000040C000__000000931B33AE68 000000067F0000400200008A590000408000-000000067F0000400200008A59000040C000__000000931B9AFDF8 000000067F0000400200008A59000040B6DB-000000067F0000400200008A5900004140CA__00000006FDCDDAF1-00000007AD75F249 000000067F0000400200008A59000040C000-000000067F0000400200008A590000410000__0000001C725A2400 000000067F0000400200008A59000040C000-000000067F0000400200008A590000410000__0000001C760FA190 000000067F0000400200008A59000040C000-000000067F0000400200008A590000410000__00000038E67ABFA0 000000067F0000400200008A59000040C000-000000067F0000400200008A590000410000__0000003903F1CFE8 000000067F0000400200008A59000040C000-000000067F0000400200008A590000410000__0000003B99F7F8A0 000000067F0000400200008A59000040C000-000000067F0000400200008A590000410000__0000005D2FFFFB38 000000067F0000400200008A59000040C000-000000067F0000400200008A590000410000__00000073AD3FE6B8 000000067F0000400200008A59000040C000-000000067F0000400200008A590000410000__000000914E3F38F0 000000067F0000400200008A59000040C000-000000067F0000400200008A590000410000__000000931B33AE68 000000067F0000400200008A59000040C000-000000067F0000400200008A590000410000__000000931B9AFDF8 000000067F0000400200008A590000410000-000000067F0000400200008A590000414000__0000001C725A2400 000000067F0000400200008A590000410000-000000067F0000400200008A590000414000__0000001C760FA190 000000067F0000400200008A590000410000-000000067F0000400200008A590000414000__00000038E67ABFA0 000000067F0000400200008A590000410000-000000067F0000400200008A590000414000__0000003903F1CFE8 000000067F0000400200008A590000410000-000000067F0000400200008A590000414000__0000003B99F7F8A0 
000000067F0000400200008A590000410000-000000067F0000400200008A590000414000__0000005D2FFFFB38 000000067F0000400200008A590000410000-000000067F0000400200008A590000414000__00000073AD3FE6B8 000000067F0000400200008A590000410000-000000067F0000400200008A590000414000__000000914E3F38F0 000000067F0000400200008A590000410000-000000067F0000400200008A590000414000__000000931B33AE68 000000067F0000400200008A590000410000-000000067F0000400200008A590000414000__000000931B9AFDF8 000000067F0000400200008A590000414000-000000067F0000400200008A590000418000__0000001C725A2400 000000067F0000400200008A590000414000-000000067F0000400200008A590000418000__0000001C760FA190 000000067F0000400200008A590000414000-000000067F0000400200008A590000418000__00000038E67ABFA0 000000067F0000400200008A590000414000-000000067F0000400200008A590000418000__0000003903F1CFE8 000000067F0000400200008A590000414000-000000067F0000400200008A590000418000__0000003B99F7F8A0 000000067F0000400200008A590000414000-000000067F0000400200008A590000418000__0000005D2FFFFB38 000000067F0000400200008A590000414000-000000067F0000400200008A590000418000__00000073AD3FE6B8 000000067F0000400200008A590000414000-000000067F0000400200008A590000418000__000000914E3F38F0 000000067F0000400200008A590000414000-000000067F0000400200008A590000418000__000000931B33AE68 000000067F0000400200008A590000414000-000000067F0000400200008A590000418000__000000931B9AFDF8 000000067F0000400200008A5900004140CA-000000067F0000400200008A59000041CAA5__00000006FDCDDAF1-00000007AD75F249 000000067F0000400200008A590000418000-000000067F0000400200008A59000041C000__0000001C725A2400 000000067F0000400200008A590000418000-000000067F0000400200008A59000041C000__0000001C760FA190 000000067F0000400200008A590000418000-000000067F0000400200008A59000041C000__00000038E67ABFA0 000000067F0000400200008A590000418000-000000067F0000400200008A59000041C000__0000003903F1CFE8 000000067F0000400200008A590000418000-000000067F0000400200008A59000041C000__0000003B99F7F8A0 
000000067F0000400200008A590000418000-000000067F0000400200008A59000041C000__0000005D2FFFFB38 000000067F0000400200008A590000418000-000000067F0000400200008A59000041C000__00000073AD3FE6B8 000000067F0000400200008A590000418000-000000067F0000400200008A59000041C000__000000914E3F38F0 000000067F0000400200008A590000418000-000000067F0000400200008A59000041C000__000000931B33AE68 000000067F0000400200008A590000418000-000000067F0000400200008A59000041C000__000000931B9AFDF8 000000067F0000400200008A59000041C000-000000067F0000400200008A590000420000__0000001C725A2400 000000067F0000400200008A59000041C000-000000067F0000400200008A590000420000__0000001C760FA190 000000067F0000400200008A59000041C000-000000067F0000400200008A590000420000__00000038E67ABFA0 000000067F0000400200008A59000041C000-000000067F0000400200008A590000420000__0000003903F1CFE8 000000067F0000400200008A59000041C000-000000067F0000400200008A590000420000__0000003B99F7F8A0 000000067F0000400200008A59000041C000-000000067F0000400200008A590000420000__0000005D2FFFFB38 000000067F0000400200008A59000041C000-000000067F0000400200008A590000420000__00000073AD3FE6B8 000000067F0000400200008A59000041C000-000000067F0000400200008A590000420000__000000914E3F38F0 000000067F0000400200008A59000041C000-000000067F0000400200008A590000420000__000000931B33AE68 000000067F0000400200008A59000041C000-000000067F0000400200008A590000420000__000000931B9AFDF8 000000067F0000400200008A59000041CAA5-000000067F0000400200008A59000042546F__00000006FDCDDAF1-00000007AD75F249 000000067F0000400200008A590000420000-000000067F0000400200008A590000424000__0000001C725A2400 000000067F0000400200008A590000420000-000000067F0000400200008A590000424000__0000001C760FA190 000000067F0000400200008A590000420000-000000067F0000400200008A590000424000__00000038E67ABFA0 000000067F0000400200008A590000420000-000000067F0000400200008A590000424000__0000003903F1CFE8 000000067F0000400200008A590000420000-000000067F0000400200008A590000424000__0000003B99F7F8A0 
000000067F0000400200008A590000420000-000000067F0000400200008A590000424000__0000005D2FFFFB38 000000067F0000400200008A590000420000-000000067F0000400200008A590000424000__00000073AD3FE6B8 000000067F0000400200008A590000420000-000000067F0000400200008A590000424000__000000914E3F38F0 000000067F0000400200008A590000420000-000000067F0000400200008A590000424000__000000931B33AE68 000000067F0000400200008A590000420000-000000067F0000400200008A590000424000__000000931B9AFDF8 000000067F0000400200008A590000424000-000000067F0000400200008A590000428000__000000088D7FE420 000000067F0000400200008A590000424000-000000067F0000400200008A590000428000__0000001C760FA190 000000067F0000400200008A590000424000-000000067F0000400200008A590000428000__00000038E67ABFA0 000000067F0000400200008A590000424000-000000067F0000400200008A590000428000__0000003903F1CFE8 000000067F0000400200008A590000424000-000000067F0000400200008A590000428000__0000003B99F7F8A0 000000067F0000400200008A590000424000-000000067F0000400200008A590000428000__0000005D2FFFFB38 000000067F0000400200008A590000424000-000000067F0000400200008A590000428000__00000073AD3FE6B8 000000067F0000400200008A590000424000-000000067F0000400200008A590000428000__000000914E3F38F0 000000067F0000400200008A590000424000-000000067F0000400200008A590000428000__000000931B33AE68 000000067F0000400200008A590000424000-000000067F0000400200008A590000428000__000000931B9AFDF8 000000067F0000400200008A59000042546F-000000067F0000400200008A590100000000__00000006FDCDDAF1-00000007AD75F249 000000067F0000400200008A59000042576F-000000067F0000400200008A59000042E12F__00000007AD75F249-000000085D1DF561 000000067F0000400200008A590000428000-000000067F0000400200008A59000042C000__000000088D7FE420 000000067F0000400200008A590000428000-000000067F0000400200008A59000042C000__0000001C760FA190 000000067F0000400200008A590000428000-000000067F0000400200008A59000042C000__00000038E67ABFA0 000000067F0000400200008A590000428000-000000067F0000400200008A59000042C000__0000003903F1CFE8 
000000067F0000400200008A590000428000-000000067F0000400200008A59000042C000__0000003B99F7F8A0 000000067F0000400200008A590000428000-000000067F0000400200008A59000042C000__0000005D2FFFFB38 000000067F0000400200008A590000428000-000000067F0000400200008A59000042C000__00000073AD3FE6B8 000000067F0000400200008A590000428000-000000067F0000400200008A59000042C000__000000914E3F38F0 000000067F0000400200008A590000428000-000000067F0000400200008A59000042C000__000000931B33AE68 000000067F0000400200008A590000428000-000000067F0000400200008A59000042C000__000000931B9AFDF8 000000067F0000400200008A59000042C000-000000067F0000400200008A590000430000__000000088D7FE420 000000067F0000400200008A59000042C000-000000067F0000400200008A590000430000__0000001C760FA190 000000067F0000400200008A59000042C000-000000067F0000400200008A590000430000__00000038E67ABFA0 000000067F0000400200008A59000042C000-000000067F0000400200008A590000430000__0000003903F1CFE8 000000067F0000400200008A59000042C000-000000067F0000400200008A590000430000__0000003B99F7F8A0 000000067F0000400200008A59000042C000-000000067F0000400200008A590000430000__0000005D2FFFFB38 000000067F0000400200008A59000042C000-000000067F0000400200008A590000430000__00000073AD3FE6B8 000000067F0000400200008A59000042C000-000000067F0000400200008A590000430000__000000914E3F38F0 000000067F0000400200008A59000042C000-000000067F0000400200008A590000430000__000000931B33AE68 000000067F0000400200008A59000042C000-000000067F0000400200008A590000430000__000000931B9AFDF8 000000067F0000400200008A59000042E12F-000000067F0000400200008A590000436B05__00000007AD75F249-000000085D1DF561 000000067F0000400200008A590000430000-000000067F0000400200008A590000434000__000000088D7FE420 000000067F0000400200008A590000430000-000000067F0000400200008A590000434000__0000001C760FA190 000000067F0000400200008A590000430000-000000067F0000400200008A590000434000__00000038E67ABFA0 000000067F0000400200008A590000430000-000000067F0000400200008A590000434000__0000003903F1CFE8 
000000067F0000400200008A590000430000-000000067F0000400200008A590000434000__0000003B99F7F8A0 000000067F0000400200008A590000430000-000000067F0000400200008A590000434000__0000005D2FFFFB38 000000067F0000400200008A590000430000-000000067F0000400200008A590000434000__00000073AD3FE6B8 000000067F0000400200008A590000430000-000000067F0000400200008A590000434000__000000914E3F38F0 000000067F0000400200008A590000430000-000000067F0000400200008A590000434000__000000931B33AE68 000000067F0000400200008A590000430000-000000067F0000400200008A590000434000__000000931B9AFDF8 000000067F0000400200008A590000434000-000000067F0000400200008A590000438000__000000088D7FE420 000000067F0000400200008A590000434000-000000067F0000400200008A590000438000__0000001C760FA190 000000067F0000400200008A590000434000-000000067F0000400200008A590000438000__00000038E67ABFA0 000000067F0000400200008A590000434000-000000067F0000400200008A590000438000__0000003903F1CFE8 000000067F0000400200008A590000434000-000000067F0000400200008A590000438000__0000003B99F7F8A0 000000067F0000400200008A590000434000-000000067F0000400200008A590000438000__0000005D2FFFFB38 000000067F0000400200008A590000434000-000000067F0000400200008A590000438000__00000073AD3FE6B8 000000067F0000400200008A590000434000-000000067F0000400200008A590000438000__000000914E3F38F0 000000067F0000400200008A590000434000-000000067F0000400200008A590000438000__000000931B33AE68 000000067F0000400200008A590000434000-000000067F0000400200008A590000438000__000000931B9AFDF8 000000067F0000400200008A590000436B05-000000067F0000400200008A59000043F4F4__00000007AD75F249-000000085D1DF561 000000067F0000400200008A590000438000-000000067F0000400200008A59000043C000__000000088D7FE420 000000067F0000400200008A590000438000-000000067F0000400200008A59000043C000__0000001C760FA190 000000067F0000400200008A590000438000-000000067F0000400200008A59000043C000__00000038E67ABFA0 000000067F0000400200008A590000438000-000000067F0000400200008A59000043C000__0000003903F1CFE8 
000000067F0000400200008A590000438000-000000067F0000400200008A59000043C000__0000003B99F7F8A0 000000067F0000400200008A590000438000-000000067F0000400200008A59000043C000__0000005D2FFFFB38 000000067F0000400200008A590000438000-000000067F0000400200008A59000043C000__00000073AD3FE6B8 000000067F0000400200008A590000438000-000000067F0000400200008A59000043C000__000000914E3F38F0 000000067F0000400200008A590000438000-000000067F0000400200008A59000043C000__000000931B33AE68 000000067F0000400200008A590000438000-000000067F0000400200008A59000043C000__000000931B9AFDF8 000000067F0000400200008A59000043C000-000000067F0000400200008A590000440000__000000088D7FE420 000000067F0000400200008A59000043C000-000000067F0000400200008A590000440000__0000001C760FA190 000000067F0000400200008A59000043C000-000000067F0000400200008A590000440000__00000038E67ABFA0 000000067F0000400200008A59000043C000-000000067F0000400200008A590000440000__0000003903F1CFE8 000000067F0000400200008A59000043C000-000000067F0000400200008A590000440000__0000003B99F7F8A0 000000067F0000400200008A59000043C000-000000067F0000400200008A590000440000__0000005D2FFFFB38 000000067F0000400200008A59000043C000-000000067F0000400200008A590000440000__00000073AD3FE6B8 000000067F0000400200008A59000043C000-000000067F0000400200008A590000440000__000000914E3F38F0 000000067F0000400200008A59000043C000-000000067F0000400200008A590000440000__000000931B33AE68 000000067F0000400200008A59000043C000-000000067F0000400200008A590000440000__000000931B9AFDF8 000000067F0000400200008A59000043F4F4-000000067F0000400200008A590000447EE3__00000007AD75F249-000000085D1DF561 000000067F0000400200008A590000440000-000000067F0000400200008A590000444000__000000088D7FE420 000000067F0000400200008A590000440000-000000067F0000400200008A590000444000__0000001C760FA190 000000067F0000400200008A590000440000-000000067F0000400200008A590000444000__00000038E67ABFA0 000000067F0000400200008A590000440000-000000067F0000400200008A590000444000__0000003903F1CFE8 
000000067F0000400200008A590000440000-000000067F0000400200008A590000444000__0000003B99F7F8A0 000000067F0000400200008A590000440000-000000067F0000400200008A590000444000__0000005D2FFFFB38 000000067F0000400200008A590000440000-000000067F0000400200008A590000444000__00000073AD3FE6B8 000000067F0000400200008A590000440000-000000067F0000400200008A590000444000__000000914E3F38F0 000000067F0000400200008A590000440000-000000067F0000400200008A590000444000__000000931B33AE68 000000067F0000400200008A590000440000-000000067F0000400200008A590000444000__000000931B9AFDF8 000000067F0000400200008A590000444000-000000067F0000400200008A590000448000__000000088D7FE420 000000067F0000400200008A590000444000-000000067F0000400200008A590000448000__0000001C760FA190 000000067F0000400200008A590000444000-000000067F0000400200008A590000448000__00000038E67ABFA0 000000067F0000400200008A590000444000-000000067F0000400200008A590000448000__0000003903F1CFE8 000000067F0000400200008A590000444000-000000067F0000400200008A590000448000__0000003B99F7F8A0 000000067F0000400200008A590000444000-000000067F0000400200008A590000448000__0000005D2FFFFB38 000000067F0000400200008A590000444000-000000067F0000400200008A590000448000__00000073AD3FE6B8 000000067F0000400200008A590000444000-000000067F0000400200008A590000448000__000000914E3F38F0 000000067F0000400200008A590000444000-000000067F0000400200008A590000448000__000000931B33AE68 000000067F0000400200008A590000444000-000000067F0000400200008A590000448000__000000931B9AFDF8 000000067F0000400200008A590000447EE3-000000067F0000400200008A5900004508CC__00000007AD75F249-000000085D1DF561 000000067F0000400200008A590000448000-000000067F0000400200008A59000044C000__000000088D7FE420 000000067F0000400200008A590000448000-000000067F0000400200008A59000044C000__0000001C760FA190 000000067F0000400200008A590000448000-000000067F0000400200008A59000044C000__00000038E67ABFA0 000000067F0000400200008A590000448000-000000067F0000400200008A59000044C000__0000003903F1CFE8 
000000067F0000400200008A590000448000-000000067F0000400200008A59000044C000__0000003B99F7F8A0 000000067F0000400200008A590000448000-000000067F0000400200008A59000044C000__0000005D2FFFFB38 000000067F0000400200008A590000448000-000000067F0000400200008A59000044C000__00000073AD3FE6B8 000000067F0000400200008A590000448000-000000067F0000400200008A59000044C000__000000914E3F38F0 000000067F0000400200008A590000448000-000000067F0000400200008A59000044C000__000000931B33AE68 000000067F0000400200008A590000448000-000000067F0000400200008A59000044C000__000000931B9AFDF8 000000067F0000400200008A59000044C000-000000067F0000400200008A590000450000__000000088D7FE420 000000067F0000400200008A59000044C000-000000067F0000400200008A590000450000__0000001C760FA190 000000067F0000400200008A59000044C000-000000067F0000400200008A590000450000__00000038E67ABFA0 000000067F0000400200008A59000044C000-000000067F0000400200008A590000450000__0000003903F1CFE8 000000067F0000400200008A59000044C000-000000067F0000400200008A590000450000__0000003B99F7F8A0 000000067F0000400200008A59000044C000-000000067F0000400200008A590000450000__0000005D2FFFFB38 000000067F0000400200008A59000044C000-000000067F0000400200008A590000450000__00000073AD3FE6B8 000000067F0000400200008A59000044C000-000000067F0000400200008A590000450000__000000914E3F38F0 000000067F0000400200008A59000044C000-000000067F0000400200008A590000450000__000000931B33AE68 000000067F0000400200008A59000044C000-000000067F0000400200008A590000450000__000000931B9AFDF8 000000067F0000400200008A590000450000-000000067F0000400200008A590000454000__000000088D7FE420 000000067F0000400200008A590000450000-000000067F0000400200008A590000454000__0000001C760FA190 000000067F0000400200008A590000450000-000000067F0000400200008A590000454000__00000038E67ABFA0 000000067F0000400200008A590000450000-000000067F0000400200008A590000454000__0000003903F1CFE8 000000067F0000400200008A590000450000-000000067F0000400200008A590000454000__0000003B99F7F8A0 
000000067F0000400200008A590000450000-000000067F0000400200008A590000454000__0000005D2FFFFB38 000000067F0000400200008A590000450000-000000067F0000400200008A590000454000__00000073AD3FE6B8 000000067F0000400200008A590000450000-000000067F0000400200008A590000454000__000000914E3F38F0 000000067F0000400200008A590000450000-000000067F0000400200008A590000454000__000000931B33AE68 000000067F0000400200008A590000450000-000000067F0000400200008A590000454000__000000931B9AFDF8 000000067F0000400200008A5900004508CC-000000067F0000400200008A5900004592AC__00000007AD75F249-000000085D1DF561 000000067F0000400200008A590000454000-000000067F0000400200008A590000458000__000000088D7FE420 000000067F0000400200008A590000454000-000000067F0000400200008A590000458000__0000001C760FA190 000000067F0000400200008A590000454000-000000067F0000400200008A590000458000__00000038E67ABFA0 000000067F0000400200008A590000454000-000000067F0000400200008A590000458000__0000003903F1CFE8 000000067F0000400200008A590000454000-000000067F0000400200008A590000458000__0000003B99F7F8A0 000000067F0000400200008A590000454000-000000067F0000400200008A590000458000__0000005D2FFFFB38 000000067F0000400200008A590000454000-000000067F0000400200008A590000458000__00000073AD3FE6B8 000000067F0000400200008A590000454000-000000067F0000400200008A590000458000__000000914E3F38F0 000000067F0000400200008A590000454000-000000067F0000400200008A590000458000__000000931B33AE68 000000067F0000400200008A590000454000-000000067F0000400200008A590000458000__000000931B9AFDF8 000000067F0000400200008A590000458000-000000067F0000400200008A59000045C000__000000088D7FE420 000000067F0000400200008A590000458000-000000067F0000400200008A59000045C000__0000001C760FA190 000000067F0000400200008A590000458000-000000067F0000400200008A59000045C000__00000038E67ABFA0 000000067F0000400200008A590000458000-000000067F0000400200008A59000045C000__0000003903F1CFE8 000000067F0000400200008A590000458000-000000067F0000400200008A59000045C000__0000003B99F7F8A0 
000000067F0000400200008A590000458000-000000067F0000400200008A59000045C000__0000005D2FFFFB38 000000067F0000400200008A590000458000-000000067F0000400200008A59000045C000__00000073AD3FE6B8 000000067F0000400200008A590000458000-000000067F0000400200008A59000045C000__000000914E3F38F0 000000067F0000400200008A590000458000-000000067F0000400200008A59000045C000__000000931B33AE68 000000067F0000400200008A590000458000-000000067F0000400200008A59000045C000__000000931B9AFDF8 000000067F0000400200008A5900004592AC-000000067F0000400200008A590000461C6A__00000007AD75F249-000000085D1DF561 000000067F0000400200008A59000045C000-000000067F0000400200008A590000460000__000000088D7FE420 000000067F0000400200008A59000045C000-000000067F0000400200008A590000460000__0000001C760FA190 000000067F0000400200008A59000045C000-000000067F0000400200008A590000460000__00000038E67ABFA0 000000067F0000400200008A59000045C000-000000067F0000400200008A590000460000__0000003903F1CFE8 000000067F0000400200008A59000045C000-000000067F0000400200008A590000460000__0000003B99F7F8A0 000000067F0000400200008A59000045C000-000000067F0000400200008A590000460000__0000005D2FFFFB38 000000067F0000400200008A59000045C000-000000067F0000400200008A590000460000__00000073AD3FE6B8 000000067F0000400200008A59000045C000-000000067F0000400200008A590000460000__000000914E3F38F0 000000067F0000400200008A59000045C000-000000067F0000400200008A590000460000__000000931B33AE68 000000067F0000400200008A59000045C000-000000067F0000400200008A590000460000__000000931B9AFDF8 000000067F0000400200008A590000460000-000000067F0000400200008A590000464000__000000088D7FE420 000000067F0000400200008A590000460000-000000067F0000400200008A590000464000__0000001C760FA190 000000067F0000400200008A590000460000-000000067F0000400200008A590000464000__00000038E67ABFA0 000000067F0000400200008A590000460000-000000067F0000400200008A590000464000__0000003903F1CFE8 000000067F0000400200008A590000460000-000000067F0000400200008A590000464000__0000003B99F7F8A0 
000000067F0000400200008A590000460000-000000067F0000400200008A590000464000__0000005D2FFFFB38 000000067F0000400200008A590000460000-000000067F0000400200008A590000464000__00000073AD3FE6B8 000000067F0000400200008A590000460000-000000067F0000400200008A590000464000__000000914E3F38F0 000000067F0000400200008A590000460000-000000067F0000400200008A590000464000__000000931B33AE68 000000067F0000400200008A590000460000-000000067F0000400200008A590000464000__000000931B9AFDF8 000000067F0000400200008A590000461C6A-000000067F0000400200008A59000046A62B__00000007AD75F249-000000085D1DF561 000000067F0000400200008A590000464000-000000067F0000400200008A590000468000__000000088D7FE420 000000067F0000400200008A590000464000-000000067F0000400200008A590000468000__0000001C760FA190 000000067F0000400200008A590000464000-000000067F0000400200008A590000468000__00000038E67ABFA0 000000067F0000400200008A590000464000-000000067F0000400200008A590000468000__0000003903F1CFE8 000000067F0000400200008A590000464000-000000067F0000400200008A590000468000__0000003B99F7F8A0 000000067F0000400200008A590000464000-000000067F0000400200008A590000468000__0000005D2FFFFB38 000000067F0000400200008A590000464000-000000067F0000400200008A590000468000__00000073AD3FE6B8 000000067F0000400200008A590000464000-000000067F0000400200008A590000468000__000000914E3F38F0 000000067F0000400200008A590000464000-000000067F0000400200008A590000468000__000000931B33AE68 000000067F0000400200008A590000464000-000000067F0000400200008A590000468000__000000931B9AFDF8 000000067F0000400200008A590000468000-000000067F0000400200008A59000046C000__000000088D7FE420 000000067F0000400200008A590000468000-000000067F0000400200008A59000046C000__0000001C760FA190 000000067F0000400200008A590000468000-000000067F0000400200008A59000046C000__00000038E67ABFA0 000000067F0000400200008A590000468000-000000067F0000400200008A59000046C000__0000003903F1CFE8 000000067F0000400200008A590000468000-000000067F0000400200008A59000046C000__0000003B99F7F8A0 
000000067F0000400200008A590000468000-000000067F0000400200008A59000046C000__0000005D2FFFFB38 000000067F0000400200008A590000468000-000000067F0000400200008A59000046C000__00000073AD3FE6B8 000000067F0000400200008A590000468000-000000067F0000400200008A59000046C000__000000914E3F38F0 000000067F0000400200008A590000468000-000000067F0000400200008A59000046C000__000000931B33AE68 000000067F0000400200008A590000468000-000000067F0000400200008A59000046C000__000000931B9AFDF8 000000067F0000400200008A59000046A62B-000000067F0000400200008A590000473003__00000007AD75F249-000000085D1DF561 000000067F0000400200008A59000046C000-000000067F0000400200008A590000470000__000000088D7FE420 000000067F0000400200008A59000046C000-000000067F0000400200008A590000470000__0000001C760FA190 000000067F0000400200008A59000046C000-000000067F0000400200008A590000470000__00000038E67ABFA0 000000067F0000400200008A59000046C000-000000067F0000400200008A590000470000__0000003903F1CFE8 000000067F0000400200008A59000046C000-000000067F0000400200008A590000470000__0000003B99F7F8A0 000000067F0000400200008A59000046C000-000000067F0000400200008A590000470000__0000005D2FFFFB38 000000067F0000400200008A59000046C000-000000067F0000400200008A590000470000__00000073AD3FE6B8 000000067F0000400200008A59000046C000-000000067F0000400200008A590000470000__000000914E3F38F0 000000067F0000400200008A59000046C000-000000067F0000400200008A590000470000__000000931B33AE68 000000067F0000400200008A59000046C000-000000067F0000400200008A590000470000__000000931B9AFDF8 000000067F0000400200008A590000470000-000000067F0000400200008A590000474000__000000088D7FE420 000000067F0000400200008A590000470000-000000067F0000400200008A590000474000__0000001C760FA190 000000067F0000400200008A590000470000-000000067F0000400200008A590000474000__00000038E67ABFA0 000000067F0000400200008A590000470000-000000067F0000400200008A590000474000__0000003903F1CFE8 000000067F0000400200008A590000470000-000000067F0000400200008A590000474000__0000003B99F7F8A0 
000000067F0000400200008A590000470000-000000067F0000400200008A590000474000__0000005D2FFFFB38 000000067F0000400200008A590000470000-000000067F0000400200008A590000474000__00000073AD3FE6B8 000000067F0000400200008A590000470000-000000067F0000400200008A590000474000__000000914E3F38F0 000000067F0000400200008A590000470000-000000067F0000400200008A590000474000__000000931B33AE68 000000067F0000400200008A590000470000-000000067F0000400200008A590000474000__000000931B9AFDF8 000000067F0000400200008A590000473003-000000067F0000400200008A59000047B9EA__00000007AD75F249-000000085D1DF561 000000067F0000400200008A590000474000-000000067F0000400200008A590000478000__000000088D7FE420 000000067F0000400200008A590000474000-000000067F0000400200008A590000478000__0000001C760FA190 000000067F0000400200008A590000474000-000000067F0000400200008A590000478000__00000038E67ABFA0 000000067F0000400200008A590000474000-000000067F0000400200008A590000478000__0000003903F1CFE8 000000067F0000400200008A590000474000-000000067F0000400200008A590000478000__0000003B99F7F8A0 000000067F0000400200008A590000474000-000000067F0000400200008A590000478000__0000005D2FFFFB38 000000067F0000400200008A590000474000-000000067F0000400200008A590000478000__00000073AD3FE6B8 000000067F0000400200008A590000474000-000000067F0000400200008A590000478000__000000914E3F38F0 000000067F0000400200008A590000474000-000000067F0000400200008A590000478000__000000931B33AE68 000000067F0000400200008A590000474000-000000067F0000400200008A590000478000__000000931B9AFDF8 000000067F0000400200008A590000478000-000000067F0000400200008A59000047C000__000000088D7FE420 000000067F0000400200008A590000478000-000000067F0000400200008A59000047C000__0000001C760FA190 000000067F0000400200008A590000478000-000000067F0000400200008A59000047C000__00000038E67ABFA0 000000067F0000400200008A590000478000-000000067F0000400200008A59000047C000__0000003903F1CFE8 000000067F0000400200008A590000478000-000000067F0000400200008A59000047C000__0000003B99F7F8A0 
000000067F0000400200008A590000478000-000000067F0000400200008A59000047C000__0000005D2FFFFB38 000000067F0000400200008A590000478000-000000067F0000400200008A59000047C000__00000073AD3FE6B8 000000067F0000400200008A590000478000-000000067F0000400200008A59000047C000__000000914E3F38F0 000000067F0000400200008A590000478000-000000067F0000400200008A59000047C000__000000931B33AE68 000000067F0000400200008A590000478000-000000067F0000400200008A59000047C000__000000931B9AFDF8 000000067F0000400200008A59000047B9EA-000000067F0000400200008A5900004843E5__00000007AD75F249-000000085D1DF561 000000067F0000400200008A59000047C000-000000067F0000400200008A590000480000__000000088D7FE420 000000067F0000400200008A59000047C000-000000067F0000400200008A590000480000__0000001C760FA190 000000067F0000400200008A59000047C000-000000067F0000400200008A590000480000__00000038E67ABFA0 000000067F0000400200008A59000047C000-000000067F0000400200008A590000480000__0000003903F1CFE8 000000067F0000400200008A59000047C000-000000067F0000400200008A590000480000__0000003B99F7F8A0 000000067F0000400200008A59000047C000-000000067F0000400200008A590000480000__0000005D2FFFFB38 000000067F0000400200008A59000047C000-000000067F0000400200008A590000480000__00000073AD3FE6B8 000000067F0000400200008A59000047C000-000000067F0000400200008A590000480000__000000914E3F38F0 000000067F0000400200008A59000047C000-000000067F0000400200008A590000480000__000000931B33AE68 000000067F0000400200008A59000047C000-000000067F0000400200008A590000480000__000000931B9AFDF8 000000067F0000400200008A590000480000-000000067F0000400200008A590000484000__000000088D7FE420 000000067F0000400200008A590000480000-000000067F0000400200008A590000484000__0000001C760FA190 000000067F0000400200008A590000480000-000000067F0000400200008A590000484000__00000038E67ABFA0 000000067F0000400200008A590000480000-000000067F0000400200008A590000484000__0000003903F1CFE8 000000067F0000400200008A590000480000-000000067F0000400200008A590000484000__0000003B99F7F8A0 
000000067F0000400200008A590000480000-000000067F0000400200008A590000484000__0000005D2FFFFB38 000000067F0000400200008A590000480000-000000067F0000400200008A590000484000__00000073AD3FE6B8 000000067F0000400200008A590000480000-000000067F0000400200008A590000484000__000000914E3F38F0 000000067F0000400200008A590000480000-000000067F0000400200008A590000484000__000000931B33AE68 000000067F0000400200008A590000480000-000000067F0000400200008A590000484000__000000931B9AFDF8 000000067F0000400200008A590000484000-000000067F0000400200008A590000488000__000000088D7FE420 000000067F0000400200008A590000484000-000000067F0000400200008A590000488000__0000001C760FA190 000000067F0000400200008A590000484000-000000067F0000400200008A590000488000__00000038E67ABFA0 000000067F0000400200008A590000484000-000000067F0000400200008A590000488000__0000003903F1CFE8 000000067F0000400200008A590000484000-000000067F0000400200008A590000488000__0000003B99F7F8A0 000000067F0000400200008A590000484000-000000067F0000400200008A590000488000__0000005D2FFFFB38 000000067F0000400200008A590000484000-000000067F0000400200008A590000488000__00000073AD3FE6B8 000000067F0000400200008A590000484000-000000067F0000400200008A590000488000__000000914E3F38F0 000000067F0000400200008A590000484000-000000067F0000400200008A590000488000__000000931B33AE68 000000067F0000400200008A590000484000-000000067F0000400200008A590000488000__000000931B9AFDF8 000000067F0000400200008A5900004843E5-000000067F0000400200008A590100000000__00000007AD75F249-000000085D1DF561 000000067F0000400200008A590000484710-000000067F0000400200008A59000048D0ED__000000085D1DF561-000000090CC5DF81 000000067F0000400200008A590000488000-000000067F0000400200008A59000048C000__000000088D7FE420 000000067F0000400200008A590000488000-000000067F0000400200008A59000048C000__0000001C760FA190 000000067F0000400200008A590000488000-000000067F0000400200008A59000048C000__00000038E67ABFA0 000000067F0000400200008A590000488000-000000067F0000400200008A59000048C000__0000003903F1CFE8 
000000067F0000400200008A590000488000-000000067F0000400200008A59000048C000__0000003B99F7F8A0 000000067F0000400200008A590000488000-000000067F0000400200008A59000048C000__0000005D2FFFFB38 000000067F0000400200008A590000488000-000000067F0000400200008A59000048C000__00000073AD3FE6B8 000000067F0000400200008A590000488000-000000067F0000400200008A59000048C000__000000914E3F38F0 000000067F0000400200008A590000488000-000000067F0000400200008A59000048C000__000000931B33AE68 000000067F0000400200008A590000488000-000000067F0000400200008A59000048C000__000000931B9AFDF8 000000067F0000400200008A59000048C000-000000067F0000400200008A590000490000__000000088D7FE420 000000067F0000400200008A59000048C000-000000067F0000400200008A590000490000__0000001C760FA190 000000067F0000400200008A59000048C000-000000067F0000400200008A590000490000__00000038E67ABFA0 000000067F0000400200008A59000048C000-000000067F0000400200008A590000490000__0000003903F1CFE8 000000067F0000400200008A59000048C000-000000067F0000400200008A590000490000__0000003B99F7F8A0 000000067F0000400200008A59000048C000-000000067F0000400200008A590000490000__0000005D2FFFFB38 000000067F0000400200008A59000048C000-000000067F0000400200008A590000490000__00000073AD3FE6B8 000000067F0000400200008A59000048C000-000000067F0000400200008A590000490000__000000914E3F38F0 000000067F0000400200008A59000048C000-000000067F0000400200008A590000490000__000000931B33AE68 000000067F0000400200008A59000048C000-000000067F0000400200008A590000490000__000000931B9AFDF8 000000067F0000400200008A59000048D0ED-000000067F0000400200008A590000495ACD__000000085D1DF561-000000090CC5DF81 000000067F0000400200008A590000490000-000000067F0000400200008A590000494000__000000088D7FE420 000000067F0000400200008A590000490000-000000067F0000400200008A590000494000__0000001C760FA190 000000067F0000400200008A590000490000-000000067F0000400200008A590000494000__00000038E67ABFA0 000000067F0000400200008A590000490000-000000067F0000400200008A590000494000__0000003903F1CFE8 
000000067F0000400200008A590000490000-000000067F0000400200008A590000494000__0000003B99F7F8A0 000000067F0000400200008A590000490000-000000067F0000400200008A590000494000__0000005D2FFFFB38 000000067F0000400200008A590000490000-000000067F0000400200008A590000494000__00000073AD3FE6B8 000000067F0000400200008A590000490000-000000067F0000400200008A590000494000__000000914E3F38F0 000000067F0000400200008A590000490000-000000067F0000400200008A590000494000__000000931B33AE68 000000067F0000400200008A590000490000-000000067F0000400200008A590000494000__000000931B9AFDF8 000000067F0000400200008A590000494000-000000067F0000400200008A590000498000__000000088D7FE420 000000067F0000400200008A590000494000-000000067F0000400200008A590000498000__0000001C760FA190 000000067F0000400200008A590000494000-000000067F0000400200008A590000498000__00000038E67ABFA0 000000067F0000400200008A590000494000-000000067F0000400200008A590000498000__0000003903F1CFE8 000000067F0000400200008A590000494000-000000067F0000400200008A590000498000__0000003B99F7F8A0 000000067F0000400200008A590000494000-000000067F0000400200008A590000498000__0000005D2FFFFB38 000000067F0000400200008A590000494000-000000067F0000400200008A590000498000__00000073AD3FE6B8 000000067F0000400200008A590000494000-000000067F0000400200008A590000498000__000000914E3F38F0 000000067F0000400200008A590000494000-000000067F0000400200008A590000498000__000000931B33AE68 000000067F0000400200008A590000494000-000000067F0000400200008A590000498000__000000931B9AFDF8 000000067F0000400200008A590000495ACD-000000067F0000400200008A59000049E49A__000000085D1DF561-000000090CC5DF81 000000067F0000400200008A590000498000-000000067F0000400200008A59000049C000__000000088D7FE420 000000067F0000400200008A590000498000-000000067F0000400200008A59000049C000__0000001C760FA190 000000067F0000400200008A590000498000-000000067F0000400200008A59000049C000__00000038E67ABFA0 000000067F0000400200008A590000498000-000000067F0000400200008A59000049C000__0000003903F1CFE8 
000000067F0000400200008A590000498000-000000067F0000400200008A59000049C000__0000003B99F7F8A0 000000067F0000400200008A590000498000-000000067F0000400200008A59000049C000__0000005D2FFFFB38 000000067F0000400200008A590000498000-000000067F0000400200008A59000049C000__00000073AD3FE6B8 000000067F0000400200008A590000498000-000000067F0000400200008A59000049C000__000000914E3F38F0 000000067F0000400200008A590000498000-000000067F0000400200008A59000049C000__000000931B33AE68 000000067F0000400200008A590000498000-000000067F0000400200008A59000049C000__000000931B9AFDF8 000000067F0000400200008A59000049C000-000000067F0000400200008A5900004A0000__0000001C760FA190 000000067F0000400200008A59000049C000-000000067F0000400200008A5900004A0000__00000038E67ABFA0 000000067F0000400200008A59000049C000-000000067F0000400200008A5900004A0000__0000003903F1CFE8 000000067F0000400200008A59000049C000-000000067F0000400200008A5900004A0000__0000003B99F7F8A0 000000067F0000400200008A59000049C000-000000067F0000400200008A5900004A0000__0000005D2FFFFB38 000000067F0000400200008A59000049C000-000000067F0000400200008A5900004A0000__00000073AD3FE6B8 000000067F0000400200008A59000049C000-000000067F0000400200008A5900004A0000__000000914E3F38F0 000000067F0000400200008A59000049C000-000000067F0000400200008A5900004A0000__000000931B33AE68 000000067F0000400200008A59000049C000-000000067F0000400200008A5900004A0000__000000931B9AFDF8 000000067F0000400200008A59000049C000-030000000000000000000000000000000002__000000088D7FE420 000000067F0000400200008A59000049E49A-000000067F0000400200008A5900004A6E62__000000085D1DF561-000000090CC5DF81 000000067F0000400200008A5900004A0000-000000067F0000400200008A5900004A4000__0000001C760FA190 000000067F0000400200008A5900004A0000-000000067F0000400200008A5900004A4000__00000038E67ABFA0 000000067F0000400200008A5900004A0000-000000067F0000400200008A5900004A4000__0000003903F1CFE8 000000067F0000400200008A5900004A0000-000000067F0000400200008A5900004A4000__0000003B99F7F8A0 
000000067F0000400200008A5900004A0000-000000067F0000400200008A5900004A4000__0000005D2FFFFB38 000000067F0000400200008A5900004A0000-000000067F0000400200008A5900004A4000__00000073AD3FE6B8 000000067F0000400200008A5900004A0000-000000067F0000400200008A5900004A4000__000000914E3F38F0 000000067F0000400200008A5900004A0000-000000067F0000400200008A5900004A4000__000000931B33AE68 000000067F0000400200008A5900004A0000-000000067F0000400200008A5900004A4000__000000931B9AFDF8 000000067F0000400200008A5900004A4000-000000067F0000400200008A5900004A8000__0000001C760FA190 000000067F0000400200008A5900004A4000-000000067F0000400200008A5900004A8000__00000038E67ABFA0 000000067F0000400200008A5900004A4000-000000067F0000400200008A5900004A8000__0000003903F1CFE8 000000067F0000400200008A5900004A4000-000000067F0000400200008A5900004A8000__0000003B99F7F8A0 000000067F0000400200008A5900004A4000-000000067F0000400200008A5900004A8000__0000005D2FFFFB38 000000067F0000400200008A5900004A4000-000000067F0000400200008A5900004A8000__00000073AD3FE6B8 000000067F0000400200008A5900004A4000-000000067F0000400200008A5900004A8000__000000914E3F38F0 000000067F0000400200008A5900004A4000-000000067F0000400200008A5900004A8000__000000931B33AE68 000000067F0000400200008A5900004A4000-000000067F0000400200008A5900004A8000__000000931B9AFDF8 000000067F0000400200008A5900004A6E62-000000067F0000400200008A5900004AF849__000000085D1DF561-000000090CC5DF81 000000067F0000400200008A5900004A8000-000000067F0000400200008A5900004AC000__0000001C760FA190 000000067F0000400200008A5900004A8000-000000067F0000400200008A5900004AC000__00000038E67ABFA0 000000067F0000400200008A5900004A8000-000000067F0000400200008A5900004AC000__0000003903F1CFE8 000000067F0000400200008A5900004A8000-000000067F0000400200008A5900004AC000__0000003B99F7F8A0 000000067F0000400200008A5900004A8000-000000067F0000400200008A5900004AC000__0000005D2FFFFB38 000000067F0000400200008A5900004A8000-000000067F0000400200008A5900004AC000__00000073AD3FE6B8 
000000067F0000400200008A5900004A8000-000000067F0000400200008A5900004AC000__000000914E3F38F0 000000067F0000400200008A5900004A8000-000000067F0000400200008A5900004AC000__000000931B33AE68 000000067F0000400200008A5900004A8000-000000067F0000400200008A5900004AC000__000000931B9AFDF8 000000067F0000400200008A5900004AC000-000000067F0000400200008A5900004B0000__0000001C760FA190 000000067F0000400200008A5900004AC000-000000067F0000400200008A5900004B0000__00000038E67ABFA0 000000067F0000400200008A5900004AC000-000000067F0000400200008A5900004B0000__0000003903F1CFE8 000000067F0000400200008A5900004AC000-000000067F0000400200008A5900004B0000__0000003B99F7F8A0 000000067F0000400200008A5900004AC000-000000067F0000400200008A5900004B0000__0000005D2FFFFB38 000000067F0000400200008A5900004AC000-000000067F0000400200008A5900004B0000__00000073AD3FE6B8 000000067F0000400200008A5900004AC000-000000067F0000400200008A5900004B0000__000000914E3F38F0 000000067F0000400200008A5900004AC000-000000067F0000400200008A5900004B0000__000000931B33AE68 000000067F0000400200008A5900004AC000-000000067F0000400200008A5900004B0000__000000931B9AFDF8 000000067F0000400200008A5900004AF849-000000067F0000400200008A5900004B823E__000000085D1DF561-000000090CC5DF81 000000067F0000400200008A5900004B0000-000000067F0000400200008A5900004B4000__0000001C760FA190 000000067F0000400200008A5900004B0000-000000067F0000400200008A5900004B4000__00000038E67ABFA0 000000067F0000400200008A5900004B0000-000000067F0000400200008A5900004B4000__0000003903F1CFE8 000000067F0000400200008A5900004B0000-000000067F0000400200008A5900004B4000__0000003B99F7F8A0 000000067F0000400200008A5900004B0000-000000067F0000400200008A5900004B4000__0000005D2FFFFB38 000000067F0000400200008A5900004B0000-000000067F0000400200008A5900004B4000__00000073AD3FE6B8 000000067F0000400200008A5900004B0000-000000067F0000400200008A5900004B4000__000000914E3F38F0 000000067F0000400200008A5900004B0000-000000067F0000400200008A5900004B4000__000000931B33AE68 
000000067F0000400200008A5900004B0000-000000067F0000400200008A5900004B4000__000000931B9AFDF8 000000067F0000400200008A5900004B4000-000000067F0000400200008A5900004B8000__0000001C760FA190 000000067F0000400200008A5900004B4000-000000067F0000400200008A5900004B8000__00000038E67ABFA0 000000067F0000400200008A5900004B4000-000000067F0000400200008A5900004B8000__0000003903F1CFE8 000000067F0000400200008A5900004B4000-000000067F0000400200008A5900004B8000__0000003B99F7F8A0 000000067F0000400200008A5900004B4000-000000067F0000400200008A5900004B8000__0000005D2FFFFB38 000000067F0000400200008A5900004B4000-000000067F0000400200008A5900004B8000__00000073AD3FE6B8 000000067F0000400200008A5900004B4000-000000067F0000400200008A5900004B8000__000000914E3F38F0 000000067F0000400200008A5900004B4000-000000067F0000400200008A5900004B8000__000000931B33AE68 000000067F0000400200008A5900004B4000-000000067F0000400200008A5900004B8000__000000931B9AFDF8 000000067F0000400200008A5900004B8000-000000067F0000400200008A5900004BC000__0000001C760FA190 000000067F0000400200008A5900004B8000-000000067F0000400200008A5900004BC000__00000038E67ABFA0 000000067F0000400200008A5900004B8000-000000067F0000400200008A5900004BC000__0000003903F1CFE8 000000067F0000400200008A5900004B8000-000000067F0000400200008A5900004BC000__0000003B99F7F8A0 000000067F0000400200008A5900004B8000-000000067F0000400200008A5900004BC000__0000005D2FFFFB38 000000067F0000400200008A5900004B8000-000000067F0000400200008A5900004BC000__00000073AD3FE6B8 000000067F0000400200008A5900004B8000-000000067F0000400200008A5900004BC000__000000914E3F38F0 000000067F0000400200008A5900004B8000-000000067F0000400200008A5900004BC000__000000931B33AE68 000000067F0000400200008A5900004B8000-000000067F0000400200008A5900004BC000__000000931B9AFDF8 000000067F0000400200008A5900004B823E-000000067F0000400200008A5900004C0C22__000000085D1DF561-000000090CC5DF81 000000067F0000400200008A5900004BC000-000000067F0000400200008A5900004C0000__0000001C760FA190 
000000067F0000400200008A5900004BC000-000000067F0000400200008A5900004C0000__00000038E67ABFA0 000000067F0000400200008A5900004BC000-000000067F0000400200008A5900004C0000__0000003903F1CFE8 000000067F0000400200008A5900004BC000-000000067F0000400200008A5900004C0000__0000003B99F7F8A0 000000067F0000400200008A5900004BC000-000000067F0000400200008A5900004C0000__0000005D2FFFFB38 000000067F0000400200008A5900004BC000-000000067F0000400200008A5900004C0000__00000073AD3FE6B8 000000067F0000400200008A5900004BC000-000000067F0000400200008A5900004C0000__000000914E3F38F0 000000067F0000400200008A5900004BC000-000000067F0000400200008A5900004C0000__000000931B33AE68 000000067F0000400200008A5900004BC000-000000067F0000400200008A5900004C0000__000000931B9AFDF8 000000067F0000400200008A5900004C0000-000000067F0000400200008A5900004C4000__0000001C760FA190 000000067F0000400200008A5900004C0000-000000067F0000400200008A5900004C4000__00000038E67ABFA0 000000067F0000400200008A5900004C0000-000000067F0000400200008A5900004C4000__0000003903F1CFE8 000000067F0000400200008A5900004C0000-000000067F0000400200008A5900004C4000__0000003B99F7F8A0 000000067F0000400200008A5900004C0000-000000067F0000400200008A5900004C4000__0000005D2FFFFB38 000000067F0000400200008A5900004C0000-000000067F0000400200008A5900004C4000__00000073AD3FE6B8 000000067F0000400200008A5900004C0000-000000067F0000400200008A5900004C4000__000000914E3F38F0 000000067F0000400200008A5900004C0000-000000067F0000400200008A5900004C4000__000000931B33AE68 000000067F0000400200008A5900004C0000-000000067F0000400200008A5900004C4000__000000931B9AFDF8 000000067F0000400200008A5900004C0C22-000000067F0000400200008A5900004C9601__000000085D1DF561-000000090CC5DF81 000000067F0000400200008A5900004C4000-000000067F0000400200008A5900004C8000__0000001C760FA190 000000067F0000400200008A5900004C4000-000000067F0000400200008A5900004C8000__00000038E67ABFA0 000000067F0000400200008A5900004C4000-000000067F0000400200008A5900004C8000__0000003903F1CFE8 
000000067F0000400200008A5900004C4000-000000067F0000400200008A5900004C8000__0000003B99F7F8A0 000000067F0000400200008A5900004C4000-000000067F0000400200008A5900004C8000__0000005D2FFFFB38 000000067F0000400200008A5900004C4000-000000067F0000400200008A5900004C8000__00000073AD3FE6B8 000000067F0000400200008A5900004C4000-000000067F0000400200008A5900004C8000__000000914E3F38F0 000000067F0000400200008A5900004C4000-000000067F0000400200008A5900004C8000__000000931B33AE68 000000067F0000400200008A5900004C4000-000000067F0000400200008A5900004C8000__000000931B9AFDF8 000000067F0000400200008A5900004C8000-000000067F0000400200008A5900004CC000__0000001C760FA190 000000067F0000400200008A5900004C8000-000000067F0000400200008A5900004CC000__00000038E67ABFA0 000000067F0000400200008A5900004C8000-000000067F0000400200008A5900004CC000__0000003903F1CFE8 000000067F0000400200008A5900004C8000-000000067F0000400200008A5900004CC000__0000003B99F7F8A0 000000067F0000400200008A5900004C8000-000000067F0000400200008A5900004CC000__0000005D2FFFFB38 000000067F0000400200008A5900004C8000-000000067F0000400200008A5900004CC000__00000073AD3FE6B8 000000067F0000400200008A5900004C8000-000000067F0000400200008A5900004CC000__000000914E3F38F0 000000067F0000400200008A5900004C8000-000000067F0000400200008A5900004CC000__000000931B33AE68 000000067F0000400200008A5900004C8000-000000067F0000400200008A5900004CC000__000000931B9AFDF8 000000067F0000400200008A5900004C9601-000000067F0000400200008A5900004D1FCD__000000085D1DF561-000000090CC5DF81 000000067F0000400200008A5900004CC000-000000067F0000400200008A5900004D0000__0000001C760FA190 000000067F0000400200008A5900004CC000-000000067F0000400200008A5900004D0000__00000038E67ABFA0 000000067F0000400200008A5900004CC000-000000067F0000400200008A5900004D0000__0000003903F1CFE8 000000067F0000400200008A5900004CC000-000000067F0000400200008A5900004D0000__0000003B99F7F8A0 000000067F0000400200008A5900004CC000-000000067F0000400200008A5900004D0000__0000005D2FFFFB38 
000000067F0000400200008A5900004CC000-000000067F0000400200008A5900004D0000__00000073AD3FE6B8 000000067F0000400200008A5900004CC000-000000067F0000400200008A5900004D0000__000000914E3F38F0 000000067F0000400200008A5900004CC000-000000067F0000400200008A5900004D0000__000000931B33AE68 000000067F0000400200008A5900004CC000-000000067F0000400200008A5900004D0000__000000931B9AFDF8 000000067F0000400200008A5900004D0000-000000067F0000400200008A5900004D4000__0000001C760FA190 000000067F0000400200008A5900004D0000-000000067F0000400200008A5900004D4000__00000038E67ABFA0 000000067F0000400200008A5900004D0000-000000067F0000400200008A5900004D4000__0000003903F1CFE8 000000067F0000400200008A5900004D0000-000000067F0000400200008A5900004D4000__0000003B99F7F8A0 000000067F0000400200008A5900004D0000-000000067F0000400200008A5900004D4000__0000005D2FFFFB38 000000067F0000400200008A5900004D0000-000000067F0000400200008A5900004D4000__00000073AD3FE6B8 000000067F0000400200008A5900004D0000-000000067F0000400200008A5900004D4000__000000914E3F38F0 000000067F0000400200008A5900004D0000-000000067F0000400200008A5900004D4000__000000931B33AE68 000000067F0000400200008A5900004D0000-000000067F0000400200008A5900004D4000__000000931B9AFDF8 000000067F0000400200008A5900004D1FCD-000000067F0000400200008A5900004DA99B__000000085D1DF561-000000090CC5DF81 000000067F0000400200008A5900004D4000-000000067F0000400200008A5900004D8000__0000001C760FA190 000000067F0000400200008A5900004D4000-000000067F0000400200008A5900004D8000__00000038E67ABFA0 000000067F0000400200008A5900004D4000-000000067F0000400200008A5900004D8000__0000003903F1CFE8 000000067F0000400200008A5900004D4000-000000067F0000400200008A5900004D8000__0000003B99F7F8A0 000000067F0000400200008A5900004D4000-000000067F0000400200008A5900004D8000__0000005D2FFFFB38 000000067F0000400200008A5900004D4000-000000067F0000400200008A5900004D8000__00000073AD3FE6B8 000000067F0000400200008A5900004D4000-000000067F0000400200008A5900004D8000__000000914E3F38F0 
000000067F0000400200008A5900004D4000-000000067F0000400200008A5900004D8000__000000931B33AE68 000000067F0000400200008A5900004D4000-000000067F0000400200008A5900004D8000__000000931B9AFDF8 000000067F0000400200008A5900004D8000-000000067F0000400200008A5900004DC000__0000001C760FA190 000000067F0000400200008A5900004D8000-000000067F0000400200008A5900004DC000__00000038E67ABFA0 000000067F0000400200008A5900004D8000-000000067F0000400200008A5900004DC000__0000003903F1CFE8 000000067F0000400200008A5900004D8000-000000067F0000400200008A5900004DC000__0000003B99F7F8A0 000000067F0000400200008A5900004D8000-000000067F0000400200008A5900004DC000__0000005D2FFFFB38 000000067F0000400200008A5900004D8000-000000067F0000400200008A5900004DC000__00000073AD3FE6B8 000000067F0000400200008A5900004D8000-000000067F0000400200008A5900004DC000__000000914E3F38F0 000000067F0000400200008A5900004D8000-000000067F0000400200008A5900004DC000__000000931B33AE68 000000067F0000400200008A5900004D8000-000000067F0000400200008A5900004DC000__000000931B9AFDF8 000000067F0000400200008A5900004DA99B-000000067F0000400200008A5900004E3359__000000085D1DF561-000000090CC5DF81 000000067F0000400200008A5900004DC000-000000067F0000400200008A5900004E0000__0000001C760FA190 000000067F0000400200008A5900004DC000-000000067F0000400200008A5900004E0000__00000038E67ABFA0 000000067F0000400200008A5900004DC000-000000067F0000400200008A5900004E0000__0000003903F1CFE8 000000067F0000400200008A5900004DC000-000000067F0000400200008A5900004E0000__0000003B99F7F8A0 000000067F0000400200008A5900004DC000-000000067F0000400200008A5900004E0000__0000005D2FFFFB38 000000067F0000400200008A5900004DC000-000000067F0000400200008A5900004E0000__00000073AD3FE6B8 000000067F0000400200008A5900004DC000-000000067F0000400200008A5900004E0000__000000914E3F38F0 000000067F0000400200008A5900004DC000-000000067F0000400200008A5900004E0000__000000931B33AE68 000000067F0000400200008A5900004DC000-000000067F0000400200008A5900004E0000__000000931B9AFDF8 
000000067F0000400200008A5900004E0000-000000067F0000400200008A5900004E4000__0000001C725A2400 000000067F0000400200008A5900004E0000-000000067F0000400200008A5900004E4000__0000001C760FA190 000000067F0000400200008A5900004E0000-000000067F0000400200008A5900004E4000__00000038E67ABFA0 000000067F0000400200008A5900004E0000-000000067F0000400200008A5900004E4000__0000003903F1CFE8 000000067F0000400200008A5900004E0000-000000067F0000400200008A5900004E4000__0000003B99F7F8A0 000000067F0000400200008A5900004E0000-000000067F0000400200008A5900004E4000__0000005D2FFFFB38 000000067F0000400200008A5900004E0000-000000067F0000400200008A5900004E4000__00000073AD3FE6B8 000000067F0000400200008A5900004E0000-000000067F0000400200008A5900004E4000__000000914E3F38F0 000000067F0000400200008A5900004E0000-000000067F0000400200008A5900004E4000__000000931B33AE68 000000067F0000400200008A5900004E0000-000000067F0000400200008A5900004E4000__000000931B9AFDF8 000000067F0000400200008A5900004E3359-000000067F0000400200008A590100000000__000000085D1DF561-000000090CC5DF81 000000067F0000400200008A5900004E367E-000000067F0000400200008A5900004EC06D__000000090CC5DF81-00000009AC75E659 000000067F0000400200008A5900004E4000-000000067F0000400200008A5900004E8000__0000001C725A2400 000000067F0000400200008A5900004E4000-000000067F0000400200008A5900004E8000__0000001C760FA190 000000067F0000400200008A5900004E4000-000000067F0000400200008A5900004E8000__00000038E67ABFA0 000000067F0000400200008A5900004E4000-000000067F0000400200008A5900004E8000__0000003903F1CFE8 000000067F0000400200008A5900004E4000-000000067F0000400200008A5900004E8000__0000003B99F7F8A0 000000067F0000400200008A5900004E4000-000000067F0000400200008A5900004E8000__0000005D2FFFFB38 000000067F0000400200008A5900004E4000-000000067F0000400200008A5900004E8000__00000073AD3FE6B8 000000067F0000400200008A5900004E4000-000000067F0000400200008A5900004E8000__000000914E3F38F0 000000067F0000400200008A5900004E4000-000000067F0000400200008A5900004E8000__000000931B33AE68 
000000067F0000400200008A5900004E4000-000000067F0000400200008A5900004E8000__000000931B9AFDF8 000000067F0000400200008A5900004E8000-000000067F0000400200008A5900004EC000__0000001C725A2400 000000067F0000400200008A5900004E8000-000000067F0000400200008A5900004EC000__0000001C760FA190 000000067F0000400200008A5900004E8000-000000067F0000400200008A5900004EC000__00000038E67ABFA0 000000067F0000400200008A5900004E8000-000000067F0000400200008A5900004EC000__0000003903F1CFE8 000000067F0000400200008A5900004E8000-000000067F0000400200008A5900004EC000__0000003B99F7F8A0 000000067F0000400200008A5900004E8000-000000067F0000400200008A5900004EC000__0000005D2FFFFB38 000000067F0000400200008A5900004E8000-000000067F0000400200008A5900004EC000__00000073AD3FE6B8 000000067F0000400200008A5900004E8000-000000067F0000400200008A5900004EC000__000000914E3F38F0 000000067F0000400200008A5900004E8000-000000067F0000400200008A5900004EC000__000000931B33AE68 000000067F0000400200008A5900004E8000-000000067F0000400200008A5900004EC000__000000931B9AFDF8 000000067F0000400200008A5900004EC000-000000067F0000400200008A5900004F0000__0000001C725A2400 000000067F0000400200008A5900004EC000-000000067F0000400200008A5900004F0000__0000001C760FA190 000000067F0000400200008A5900004EC000-000000067F0000400200008A5900004F0000__00000038E67ABFA0 000000067F0000400200008A5900004EC000-000000067F0000400200008A5900004F0000__0000003903F1CFE8 000000067F0000400200008A5900004EC000-000000067F0000400200008A5900004F0000__0000003B99F7F8A0 000000067F0000400200008A5900004EC000-000000067F0000400200008A5900004F0000__0000005D2FFFFB38 000000067F0000400200008A5900004EC000-000000067F0000400200008A5900004F0000__00000073AD3FE6B8 000000067F0000400200008A5900004EC000-000000067F0000400200008A5900004F0000__000000914E3F38F0 000000067F0000400200008A5900004EC000-000000067F0000400200008A5900004F0000__000000931B33AE68 000000067F0000400200008A5900004EC000-000000067F0000400200008A5900004F0000__000000931B9AFDF8 
000000067F0000400200008A5900004EC06D-000000067F0000400200008A5900004F4A59__000000090CC5DF81-00000009AC75E659 000000067F0000400200008A5900004F0000-000000067F0000400200008A5900004F4000__0000001C725A2400 000000067F0000400200008A5900004F0000-000000067F0000400200008A5900004F4000__0000001C760FA190 000000067F0000400200008A5900004F0000-000000067F0000400200008A5900004F4000__00000038E67ABFA0 000000067F0000400200008A5900004F0000-000000067F0000400200008A5900004F4000__0000003903F1CFE8 000000067F0000400200008A5900004F0000-000000067F0000400200008A5900004F4000__0000003B99F7F8A0 000000067F0000400200008A5900004F0000-000000067F0000400200008A5900004F4000__0000005D2FFFFB38 000000067F0000400200008A5900004F0000-000000067F0000400200008A5900004F4000__00000073AD3FE6B8 000000067F0000400200008A5900004F0000-000000067F0000400200008A5900004F4000__000000914E3F38F0 000000067F0000400200008A5900004F0000-000000067F0000400200008A5900004F4000__000000931B33AE68 000000067F0000400200008A5900004F0000-000000067F0000400200008A5900004F4000__000000931B9AFDF8 000000067F0000400200008A5900004F4000-000000067F0000400200008A5900004F8000__0000001C725A2400 000000067F0000400200008A5900004F4000-000000067F0000400200008A5900004F8000__0000001C760FA190 000000067F0000400200008A5900004F4000-000000067F0000400200008A5900004F8000__00000038E67ABFA0 000000067F0000400200008A5900004F4000-000000067F0000400200008A5900004F8000__0000003903F1CFE8 000000067F0000400200008A5900004F4000-000000067F0000400200008A5900004F8000__0000003B99F7F8A0 000000067F0000400200008A5900004F4000-000000067F0000400200008A5900004F8000__0000005D2FFFFB38 000000067F0000400200008A5900004F4000-000000067F0000400200008A5900004F8000__00000073AD3FE6B8 000000067F0000400200008A5900004F4000-000000067F0000400200008A5900004F8000__000000914E3F38F0 000000067F0000400200008A5900004F4000-000000067F0000400200008A5900004F8000__000000931B33AE68 000000067F0000400200008A5900004F4000-000000067F0000400200008A5900004F8000__000000931B9AFDF8 
000000067F0000400200008A5900004F4A59-000000067F0000400200008A5900004FD445__000000090CC5DF81-00000009AC75E659 000000067F0000400200008A5900004F8000-000000067F0000400200008A5900004FC000__0000001C725A2400 000000067F0000400200008A5900004F8000-000000067F0000400200008A5900004FC000__0000001C760FA190 000000067F0000400200008A5900004F8000-000000067F0000400200008A5900004FC000__00000038E67ABFA0 000000067F0000400200008A5900004F8000-000000067F0000400200008A5900004FC000__0000003903F1CFE8 000000067F0000400200008A5900004F8000-000000067F0000400200008A5900004FC000__0000003B99F7F8A0 000000067F0000400200008A5900004F8000-000000067F0000400200008A5900004FC000__0000005D2FFFFB38 000000067F0000400200008A5900004F8000-000000067F0000400200008A5900004FC000__00000073AD3FE6B8 000000067F0000400200008A5900004F8000-000000067F0000400200008A5900004FC000__000000914E3F38F0 000000067F0000400200008A5900004F8000-000000067F0000400200008A5900004FC000__000000931B33AE68 000000067F0000400200008A5900004F8000-000000067F0000400200008A5900004FC000__000000931B9AFDF8 000000067F0000400200008A5900004FC000-000000067F0000400200008A590000500000__0000001C725A2400 000000067F0000400200008A5900004FC000-000000067F0000400200008A590000500000__0000001C760FA190 000000067F0000400200008A5900004FC000-000000067F0000400200008A590000500000__00000038E67ABFA0 000000067F0000400200008A5900004FC000-000000067F0000400200008A590000500000__0000003903F1CFE8 000000067F0000400200008A5900004FC000-000000067F0000400200008A590000500000__0000003B99F7F8A0 000000067F0000400200008A5900004FC000-000000067F0000400200008A590000500000__0000005D2FFFFB38 000000067F0000400200008A5900004FC000-000000067F0000400200008A590000500000__00000073AD3FE6B8 000000067F0000400200008A5900004FC000-000000067F0000400200008A590000500000__000000914E3F38F0 000000067F0000400200008A5900004FC000-000000067F0000400200008A590000500000__000000931B33AE68 000000067F0000400200008A5900004FC000-000000067F0000400200008A590000500000__000000931B9AFDF8 
000000067F0000400200008A5900004FD445-000000067F0000400200008A590000505E1F__000000090CC5DF81-00000009AC75E659 000000067F0000400200008A590000500000-000000067F0000400200008A590000504000__0000001C725A2400 000000067F0000400200008A590000500000-000000067F0000400200008A590000504000__0000001C760FA190 000000067F0000400200008A590000500000-000000067F0000400200008A590000504000__00000038E67ABFA0 000000067F0000400200008A590000500000-000000067F0000400200008A590000504000__0000003903F1CFE8 000000067F0000400200008A590000500000-000000067F0000400200008A590000504000__0000003B99F7F8A0 000000067F0000400200008A590000500000-000000067F0000400200008A590000504000__0000005D2FFFFB38 000000067F0000400200008A590000500000-000000067F0000400200008A590000504000__00000073AD3FE6B8 000000067F0000400200008A590000500000-000000067F0000400200008A590000504000__000000914E3F38F0 000000067F0000400200008A590000500000-000000067F0000400200008A590000504000__000000931B33AE68 000000067F0000400200008A590000500000-000000067F0000400200008A590000504000__000000931B9AFDF8 000000067F0000400200008A590000504000-000000067F0000400200008A590000508000__0000001C725A2400 000000067F0000400200008A590000504000-000000067F0000400200008A590000508000__0000001C760FA190 000000067F0000400200008A590000504000-000000067F0000400200008A590000508000__00000038E67ABFA0 000000067F0000400200008A590000504000-000000067F0000400200008A590000508000__0000003903F1CFE8 000000067F0000400200008A590000504000-000000067F0000400200008A590000508000__0000003B99F7F8A0 000000067F0000400200008A590000504000-000000067F0000400200008A590000508000__0000005D2FFFFB38 000000067F0000400200008A590000504000-000000067F0000400200008A590000508000__00000073AD3FE6B8 000000067F0000400200008A590000504000-000000067F0000400200008A590000508000__000000914E3F38F0 000000067F0000400200008A590000504000-000000067F0000400200008A590000508000__000000931B33AE68 000000067F0000400200008A590000504000-000000067F0000400200008A590000508000__000000931B9AFDF8 
000000067F0000400200008A590000505E1F-000000067F0000400200008A59000050E7F0__000000090CC5DF81-00000009AC75E659 000000067F0000400200008A590000508000-000000067F0000400200008A59000050C000__0000001C725A2400 000000067F0000400200008A590000508000-000000067F0000400200008A59000050C000__0000001C760FA190 000000067F0000400200008A590000508000-000000067F0000400200008A59000050C000__00000038E67ABFA0 000000067F0000400200008A590000508000-000000067F0000400200008A59000050C000__0000003903F1CFE8 000000067F0000400200008A590000508000-000000067F0000400200008A59000050C000__0000003B99F7F8A0 000000067F0000400200008A590000508000-000000067F0000400200008A59000050C000__0000005D2FFFFB38 000000067F0000400200008A590000508000-000000067F0000400200008A59000050C000__00000073AD3FE6B8 000000067F0000400200008A590000508000-000000067F0000400200008A59000050C000__000000914E3F38F0 000000067F0000400200008A590000508000-000000067F0000400200008A59000050C000__000000931B33AE68 000000067F0000400200008A590000508000-000000067F0000400200008A59000050C000__000000931B9AFDF8 000000067F0000400200008A59000050C000-000000067F0000400200008A590000510000__0000001C725A2400 000000067F0000400200008A59000050C000-000000067F0000400200008A590000510000__0000001C760FA190 000000067F0000400200008A59000050C000-000000067F0000400200008A590000510000__00000038E67ABFA0 000000067F0000400200008A59000050C000-000000067F0000400200008A590000510000__0000003903F1CFE8 000000067F0000400200008A59000050C000-000000067F0000400200008A590000510000__0000003B99F7F8A0 000000067F0000400200008A59000050C000-000000067F0000400200008A590000510000__0000005D2FFFFB38 000000067F0000400200008A59000050C000-000000067F0000400200008A590000510000__00000073AD3FE6B8 000000067F0000400200008A59000050C000-000000067F0000400200008A590000510000__000000914E3F38F0 000000067F0000400200008A59000050C000-000000067F0000400200008A590000510000__000000931B33AE68 000000067F0000400200008A59000050C000-000000067F0000400200008A590000510000__000000931B9AFDF8 
000000067F0000400200008A59000050E7F0-000000067F0000400200008A5900005171C0__000000090CC5DF81-00000009AC75E659 000000067F0000400200008A590000510000-000000067F0000400200008A590000514000__0000001C725A2400 000000067F0000400200008A590000510000-000000067F0000400200008A590000514000__0000001C760FA190 000000067F0000400200008A590000510000-000000067F0000400200008A590000514000__00000038E67ABFA0 000000067F0000400200008A590000510000-000000067F0000400200008A590000514000__0000003903F1CFE8 000000067F0000400200008A590000510000-000000067F0000400200008A590000514000__0000003B99F7F8A0 000000067F0000400200008A590000510000-000000067F0000400200008A590000514000__0000005D2FFFFB38 000000067F0000400200008A590000510000-000000067F0000400200008A590000514000__00000073AD3FE6B8 000000067F0000400200008A590000510000-000000067F0000400200008A590000514000__000000914E3F38F0 000000067F0000400200008A590000510000-000000067F0000400200008A590000514000__000000931B33AE68 000000067F0000400200008A590000510000-000000067F0000400200008A590000514000__000000931B9AFDF8 000000067F0000400200008A590000514000-000000067F0000400200008A590000518000__0000001C725A2400 000000067F0000400200008A590000514000-000000067F0000400200008A590000518000__0000001C760FA190 000000067F0000400200008A590000514000-000000067F0000400200008A590000518000__00000038E67ABFA0 000000067F0000400200008A590000514000-000000067F0000400200008A590000518000__0000003903F1CFE8 000000067F0000400200008A590000514000-000000067F0000400200008A590000518000__0000003B99F7F8A0 000000067F0000400200008A590000514000-000000067F0000400200008A590000518000__0000005D2FFFFB38 000000067F0000400200008A590000514000-000000067F0000400200008A590000518000__00000073AD3FE6B8 000000067F0000400200008A590000514000-000000067F0000400200008A590000518000__000000914E3F38F0 000000067F0000400200008A590000514000-000000067F0000400200008A590000518000__000000931B33AE68 000000067F0000400200008A590000514000-000000067F0000400200008A590000518000__000000931B9AFDF8 
000000067F0000400200008A5900005171C0-000000067F0000400200008A59000051FB89__000000090CC5DF81-00000009AC75E659 000000067F0000400200008A590000518000-000000067F0000400200008A59000051C000__0000001C725A2400 000000067F0000400200008A590000518000-000000067F0000400200008A59000051C000__0000001C760FA190 000000067F0000400200008A590000518000-000000067F0000400200008A59000051C000__00000038E67ABFA0 000000067F0000400200008A590000518000-000000067F0000400200008A59000051C000__0000003903F1CFE8 000000067F0000400200008A590000518000-000000067F0000400200008A59000051C000__0000003B99F7F8A0 000000067F0000400200008A590000518000-000000067F0000400200008A59000051C000__0000005D2FFFFB38 000000067F0000400200008A590000518000-000000067F0000400200008A59000051C000__00000073AD3FE6B8 000000067F0000400200008A590000518000-000000067F0000400200008A59000051C000__000000914E3F38F0 000000067F0000400200008A590000518000-000000067F0000400200008A59000051C000__000000931B33AE68 000000067F0000400200008A590000518000-000000067F0000400200008A59000051C000__000000931B9AFDF8 000000067F0000400200008A59000051C000-000000067F0000400200008A590000520000__0000001C725A2400 000000067F0000400200008A59000051C000-000000067F0000400200008A590000520000__0000001C760FA190 000000067F0000400200008A59000051C000-000000067F0000400200008A590000520000__00000038E67ABFA0 000000067F0000400200008A59000051C000-000000067F0000400200008A590000520000__0000003903F1CFE8 000000067F0000400200008A59000051C000-000000067F0000400200008A590000520000__0000003B99F7F8A0 000000067F0000400200008A59000051C000-000000067F0000400200008A590000520000__0000005D2FFFFB38 000000067F0000400200008A59000051C000-000000067F0000400200008A590000520000__00000073AD3FE6B8 000000067F0000400200008A59000051C000-000000067F0000400200008A590000520000__000000914E3F38F0 000000067F0000400200008A59000051C000-000000067F0000400200008A590000520000__000000931B33AE68 000000067F0000400200008A59000051C000-000000067F0000400200008A590000520000__000000931B9AFDF8 
000000067F0000400200008A59000051FB89-000000067F0000400200008A590000528577__000000090CC5DF81-00000009AC75E659 000000067F0000400200008A590000520000-000000067F0000400200008A590000524000__0000001C725A2400 000000067F0000400200008A590000520000-000000067F0000400200008A590000524000__0000001C760FA190 000000067F0000400200008A590000520000-000000067F0000400200008A590000524000__00000038E67ABFA0 000000067F0000400200008A590000520000-000000067F0000400200008A590000524000__0000003903F1CFE8 000000067F0000400200008A590000520000-000000067F0000400200008A590000524000__0000003B99F7F8A0 000000067F0000400200008A590000520000-000000067F0000400200008A590000524000__0000005D2FFFFB38 000000067F0000400200008A590000520000-000000067F0000400200008A590000524000__00000073AD3FE6B8 000000067F0000400200008A590000520000-000000067F0000400200008A590000524000__000000914E3F38F0 000000067F0000400200008A590000520000-000000067F0000400200008A590000524000__000000931B33AE68 000000067F0000400200008A590000520000-000000067F0000400200008A590000524000__000000931B9AFDF8 000000067F0000400200008A590000524000-000000067F0000400200008A590000528000__0000001C725A2400 000000067F0000400200008A590000524000-000000067F0000400200008A590000528000__0000001C760FA190 000000067F0000400200008A590000524000-000000067F0000400200008A590000528000__00000038E67ABFA0 000000067F0000400200008A590000524000-000000067F0000400200008A590000528000__0000003903F1CFE8 000000067F0000400200008A590000524000-000000067F0000400200008A590000528000__0000003B99F7F8A0 000000067F0000400200008A590000524000-000000067F0000400200008A590000528000__0000005D2FFFFB38 000000067F0000400200008A590000524000-000000067F0000400200008A590000528000__00000073AD3FE6B8 000000067F0000400200008A590000524000-000000067F0000400200008A590000528000__000000914E3F38F0 000000067F0000400200008A590000524000-000000067F0000400200008A590000528000__000000931B33AE68 000000067F0000400200008A590000524000-000000067F0000400200008A590000528000__000000931B9AFDF8 
000000067F0000400200008A590000528000-000000067F0000400200008A59000052C000__0000001C725A2400 000000067F0000400200008A590000528000-000000067F0000400200008A59000052C000__0000001C760FA190 000000067F0000400200008A590000528000-000000067F0000400200008A59000052C000__00000038E67ABFA0 000000067F0000400200008A590000528000-000000067F0000400200008A59000052C000__0000003903F1CFE8 000000067F0000400200008A590000528000-000000067F0000400200008A59000052C000__0000003B99F7F8A0 000000067F0000400200008A590000528000-000000067F0000400200008A59000052C000__0000005D2FFFFB38 000000067F0000400200008A590000528000-000000067F0000400200008A59000052C000__00000073AD3FE6B8 000000067F0000400200008A590000528000-000000067F0000400200008A59000052C000__000000914E3F38F0 000000067F0000400200008A590000528000-000000067F0000400200008A59000052C000__000000931B33AE68 000000067F0000400200008A590000528000-000000067F0000400200008A59000052C000__000000931B9AFDF8 000000067F0000400200008A590000528577-000000067F0000400200008A590000530F67__000000090CC5DF81-00000009AC75E659 000000067F0000400200008A59000052C000-000000067F0000400200008A590000530000__0000001C725A2400 000000067F0000400200008A59000052C000-000000067F0000400200008A590000530000__0000001C760FA190 000000067F0000400200008A59000052C000-000000067F0000400200008A590000530000__00000038E67ABFA0 000000067F0000400200008A59000052C000-000000067F0000400200008A590000530000__0000003903F1CFE8 000000067F0000400200008A59000052C000-000000067F0000400200008A590000530000__0000003B99F7F8A0 000000067F0000400200008A59000052C000-000000067F0000400200008A590000530000__0000005D2FFFFB38 000000067F0000400200008A59000052C000-000000067F0000400200008A590000530000__00000073AD3FE6B8 000000067F0000400200008A59000052C000-000000067F0000400200008A590000530000__000000914E3F38F0 000000067F0000400200008A59000052C000-000000067F0000400200008A590000530000__000000931B33AE68 000000067F0000400200008A59000052C000-000000067F0000400200008A590000530000__000000931B9AFDF8 
000000067F0000400200008A590000530000-000000067F0000400200008A590000534000__0000001C725A2400 000000067F0000400200008A590000530000-000000067F0000400200008A590000534000__0000001C760FA190 000000067F0000400200008A590000530000-000000067F0000400200008A590000534000__00000038E67ABFA0 000000067F0000400200008A590000530000-000000067F0000400200008A590000534000__0000003903F1CFE8 000000067F0000400200008A590000530000-000000067F0000400200008A590000534000__0000003B99F7F8A0 000000067F0000400200008A590000530000-000000067F0000400200008A590000534000__0000005D2FFFFB38 000000067F0000400200008A590000530000-000000067F0000400200008A590000534000__00000073AD3FE6B8 000000067F0000400200008A590000530000-000000067F0000400200008A590000534000__000000914E3F38F0 000000067F0000400200008A590000530000-000000067F0000400200008A590000534000__000000931B33AE68 000000067F0000400200008A590000530000-000000067F0000400200008A590000534000__000000931B9AFDF8 000000067F0000400200008A590000530F67-000000067F0000400200008A590000539959__000000090CC5DF81-00000009AC75E659 000000067F0000400200008A590000534000-000000067F0000400200008A590000538000__0000001C725A2400 000000067F0000400200008A590000534000-000000067F0000400200008A590000538000__0000001C760FA190 000000067F0000400200008A590000534000-000000067F0000400200008A590000538000__00000038E67ABFA0 000000067F0000400200008A590000534000-000000067F0000400200008A590000538000__0000003903F1CFE8 000000067F0000400200008A590000534000-000000067F0000400200008A590000538000__0000003B99F7F8A0 000000067F0000400200008A590000534000-000000067F0000400200008A590000538000__0000005D2FFFFB38 000000067F0000400200008A590000534000-000000067F0000400200008A590000538000__00000073AD3FE6B8 000000067F0000400200008A590000534000-000000067F0000400200008A590000538000__000000914E3F38F0 000000067F0000400200008A590000534000-000000067F0000400200008A590000538000__000000931B33AE68 000000067F0000400200008A590000534000-000000067F0000400200008A590000538000__000000931B9AFDF8 
000000067F0000400200008A590000538000-000000067F0000400200008A59000053C000__0000000A7B3FF158 000000067F0000400200008A590000538000-000000067F0000400200008A59000053C000__0000001C760FA190 000000067F0000400200008A590000538000-000000067F0000400200008A59000053C000__00000038E67ABFA0 000000067F0000400200008A590000538000-000000067F0000400200008A59000053C000__0000003903F1CFE8 000000067F0000400200008A590000538000-000000067F0000400200008A59000053C000__0000003B99F7F8A0 000000067F0000400200008A590000538000-000000067F0000400200008A59000053C000__0000005D2FFFFB38 000000067F0000400200008A590000538000-000000067F0000400200008A59000053C000__00000073AD3FE6B8 000000067F0000400200008A590000538000-000000067F0000400200008A59000053C000__000000914E3F38F0 000000067F0000400200008A590000538000-000000067F0000400200008A59000053C000__000000931B33AE68 000000067F0000400200008A590000538000-000000067F0000400200008A59000053C000__000000931B9AFDF8 000000067F0000400200008A590000539959-000000067F0000400200008A590100000000__000000090CC5DF81-00000009AC75E659 000000067F0000400200008A590000539C35-000000067F0000400200008A590000542603__00000009AC75E659-0000000A4C25FC21 000000067F0000400200008A59000053C000-000000067F0000400200008A590000540000__0000000A7B3FF158 000000067F0000400200008A59000053C000-000000067F0000400200008A590000540000__0000001C760FA190 000000067F0000400200008A59000053C000-000000067F0000400200008A590000540000__00000038E67ABFA0 000000067F0000400200008A59000053C000-000000067F0000400200008A590000540000__0000003903F1CFE8 000000067F0000400200008A59000053C000-000000067F0000400200008A590000540000__0000003B99F7F8A0 000000067F0000400200008A59000053C000-000000067F0000400200008A590000540000__0000005D2FFFFB38 000000067F0000400200008A59000053C000-000000067F0000400200008A590000540000__00000073AD3FE6B8 000000067F0000400200008A59000053C000-000000067F0000400200008A590000540000__000000914E3F38F0 000000067F0000400200008A59000053C000-000000067F0000400200008A590000540000__000000931B33AE68 
000000067F0000400200008A59000053C000-000000067F0000400200008A590000540000__000000931B9AFDF8 000000067F0000400200008A590000540000-000000067F0000400200008A590000544000__0000000A7B3FF158 000000067F0000400200008A590000540000-000000067F0000400200008A590000544000__0000001C760FA190 000000067F0000400200008A590000540000-000000067F0000400200008A590000544000__00000038E67ABFA0 000000067F0000400200008A590000540000-000000067F0000400200008A590000544000__0000003903F1CFE8 000000067F0000400200008A590000540000-000000067F0000400200008A590000544000__0000003B99F7F8A0 000000067F0000400200008A590000540000-000000067F0000400200008A590000544000__0000005D2FFFFB38 000000067F0000400200008A590000540000-000000067F0000400200008A590000544000__00000073AD3FE6B8 000000067F0000400200008A590000540000-000000067F0000400200008A590000544000__000000914E3F38F0 000000067F0000400200008A590000540000-000000067F0000400200008A590000544000__000000931B33AE68 000000067F0000400200008A590000540000-000000067F0000400200008A590000544000__000000931B9AFDF8 000000067F0000400200008A590000542603-000000067F0000400200008A59000054AFD4__00000009AC75E659-0000000A4C25FC21 000000067F0000400200008A590000544000-000000067F0000400200008A590000548000__0000000A7B3FF158 000000067F0000400200008A590000544000-000000067F0000400200008A590000548000__0000001C760FA190 000000067F0000400200008A590000544000-000000067F0000400200008A590000548000__00000038E67ABFA0 000000067F0000400200008A590000544000-000000067F0000400200008A590000548000__0000003903F1CFE8 000000067F0000400200008A590000544000-000000067F0000400200008A590000548000__0000003B99F7F8A0 000000067F0000400200008A590000544000-000000067F0000400200008A590000548000__0000005D2FFFFB38 000000067F0000400200008A590000544000-000000067F0000400200008A590000548000__00000073AD3FE6B8 000000067F0000400200008A590000544000-000000067F0000400200008A590000548000__000000914E3F38F0 000000067F0000400200008A590000544000-000000067F0000400200008A590000548000__000000931B33AE68 
000000067F0000400200008A590000544000-000000067F0000400200008A590000548000__000000931B9AFDF8 000000067F0000400200008A590000548000-000000067F0000400200008A59000054C000__0000000A7B3FF158 000000067F0000400200008A590000548000-000000067F0000400200008A59000054C000__0000001C760FA190 000000067F0000400200008A590000548000-000000067F0000400200008A59000054C000__00000038E67ABFA0 000000067F0000400200008A590000548000-000000067F0000400200008A59000054C000__0000003903F1CFE8 000000067F0000400200008A590000548000-000000067F0000400200008A59000054C000__0000003B99F7F8A0 000000067F0000400200008A590000548000-000000067F0000400200008A59000054C000__0000005D2FFFFB38 000000067F0000400200008A590000548000-000000067F0000400200008A59000054C000__00000073AD3FE6B8 000000067F0000400200008A590000548000-000000067F0000400200008A59000054C000__000000914E3F38F0 000000067F0000400200008A590000548000-000000067F0000400200008A59000054C000__000000931B33AE68 000000067F0000400200008A590000548000-000000067F0000400200008A59000054C000__000000931B9AFDF8 000000067F0000400200008A59000054AFD4-000000067F0000400200008A59000055399F__00000009AC75E659-0000000A4C25FC21 000000067F0000400200008A59000054C000-000000067F0000400200008A590000550000__0000000A7B3FF158 000000067F0000400200008A59000054C000-000000067F0000400200008A590000550000__0000001C760FA190 000000067F0000400200008A59000054C000-000000067F0000400200008A590000550000__00000038E67ABFA0 000000067F0000400200008A59000054C000-000000067F0000400200008A590000550000__0000003903F1CFE8 000000067F0000400200008A59000054C000-000000067F0000400200008A590000550000__0000003B99F7F8A0 000000067F0000400200008A59000054C000-000000067F0000400200008A590000550000__0000005D2FFFFB38 000000067F0000400200008A59000054C000-000000067F0000400200008A590000550000__00000073AD3FE6B8 000000067F0000400200008A59000054C000-000000067F0000400200008A590000550000__000000914E3F38F0 000000067F0000400200008A59000054C000-000000067F0000400200008A590000550000__000000931B33AE68 
000000067F0000400200008A59000054C000-000000067F0000400200008A590000550000__000000931B9AFDF8 000000067F0000400200008A590000550000-000000067F0000400200008A590000554000__0000000A7B3FF158 000000067F0000400200008A590000550000-000000067F0000400200008A590000554000__0000001C760FA190 000000067F0000400200008A590000550000-000000067F0000400200008A590000554000__00000038E67ABFA0 000000067F0000400200008A590000550000-000000067F0000400200008A590000554000__0000003903F1CFE8 000000067F0000400200008A590000550000-000000067F0000400200008A590000554000__0000003B99F7F8A0 000000067F0000400200008A590000550000-000000067F0000400200008A590000554000__0000005D2FFFFB38 000000067F0000400200008A590000550000-000000067F0000400200008A590000554000__00000073AD3FE6B8 000000067F0000400200008A590000550000-000000067F0000400200008A590000554000__000000914E3F38F0 000000067F0000400200008A590000550000-000000067F0000400200008A590000554000__000000931B33AE68 000000067F0000400200008A590000550000-000000067F0000400200008A590000554000__000000931B9AFDF8 000000067F0000400200008A59000055399F-000000067F0000400200008A59000055C370__00000009AC75E659-0000000A4C25FC21 000000067F0000400200008A590000554000-000000067F0000400200008A590000558000__0000000A7B3FF158 000000067F0000400200008A590000554000-000000067F0000400200008A590000558000__0000001C760FA190 000000067F0000400200008A590000554000-000000067F0000400200008A590000558000__00000038E67ABFA0 000000067F0000400200008A590000554000-000000067F0000400200008A590000558000__0000003903F1CFE8 000000067F0000400200008A590000554000-000000067F0000400200008A590000558000__0000003B99F7F8A0 000000067F0000400200008A590000554000-000000067F0000400200008A590000558000__0000005D2FFFFB38 000000067F0000400200008A590000554000-000000067F0000400200008A590000558000__00000073AD3FE6B8 000000067F0000400200008A590000554000-000000067F0000400200008A590000558000__000000914E3F38F0 000000067F0000400200008A590000554000-000000067F0000400200008A590000558000__000000931B33AE68 
000000067F0000400200008A590000554000-000000067F0000400200008A590000558000__000000931B9AFDF8 000000067F0000400200008A590000558000-000000067F0000400200008A59000055C000__0000000A7B3FF158 000000067F0000400200008A590000558000-000000067F0000400200008A59000055C000__0000001C760FA190 000000067F0000400200008A590000558000-000000067F0000400200008A59000055C000__00000038E67ABFA0 000000067F0000400200008A590000558000-000000067F0000400200008A59000055C000__0000003903F1CFE8 000000067F0000400200008A590000558000-000000067F0000400200008A59000055C000__0000003B99F7F8A0 000000067F0000400200008A590000558000-000000067F0000400200008A59000055C000__0000005D2FFFFB38 000000067F0000400200008A590000558000-000000067F0000400200008A59000055C000__00000073AD3FE6B8 000000067F0000400200008A590000558000-000000067F0000400200008A59000055C000__000000914E3F38F0 000000067F0000400200008A590000558000-000000067F0000400200008A59000055C000__000000931B33AE68 000000067F0000400200008A590000558000-000000067F0000400200008A59000055C000__000000931B9AFDF8 000000067F0000400200008A59000055C000-000000067F0000400200008A590000560000__0000000A7B3FF158 000000067F0000400200008A59000055C000-000000067F0000400200008A590000560000__0000001C760FA190 000000067F0000400200008A59000055C000-000000067F0000400200008A590000560000__00000038E67ABFA0 000000067F0000400200008A59000055C000-000000067F0000400200008A590000560000__0000003903F1CFE8 000000067F0000400200008A59000055C000-000000067F0000400200008A590000560000__0000003B99F7F8A0 000000067F0000400200008A59000055C000-000000067F0000400200008A590000560000__0000005D2FFFFB38 000000067F0000400200008A59000055C000-000000067F0000400200008A590000560000__00000073AD3FE6B8 000000067F0000400200008A59000055C000-000000067F0000400200008A590000560000__000000914E3F38F0 000000067F0000400200008A59000055C000-000000067F0000400200008A590000560000__000000931B33AE68 000000067F0000400200008A59000055C000-000000067F0000400200008A590000560000__000000931B9AFDF8 
000000067F0000400200008A59000055C370-000000067F0000400200008A590000564D5E__00000009AC75E659-0000000A4C25FC21 000000067F0000400200008A590000560000-000000067F0000400200008A590000564000__0000000A7B3FF158 000000067F0000400200008A590000560000-000000067F0000400200008A590000564000__0000001C760FA190 000000067F0000400200008A590000560000-000000067F0000400200008A590000564000__00000038E67ABFA0 000000067F0000400200008A590000560000-000000067F0000400200008A590000564000__0000003903F1CFE8 000000067F0000400200008A590000560000-000000067F0000400200008A590000564000__0000003B99F7F8A0 000000067F0000400200008A590000560000-000000067F0000400200008A590000564000__0000005D2FFFFB38 000000067F0000400200008A590000560000-000000067F0000400200008A590000564000__00000073AD3FE6B8 000000067F0000400200008A590000560000-000000067F0000400200008A590000564000__000000914E3F38F0 000000067F0000400200008A590000560000-000000067F0000400200008A590000564000__000000931B33AE68 000000067F0000400200008A590000560000-000000067F0000400200008A590000564000__000000931B9AFDF8 000000067F0000400200008A590000564000-000000067F0000400200008A590000568000__0000000A7B3FF158 000000067F0000400200008A590000564000-000000067F0000400200008A590000568000__0000001C760FA190 000000067F0000400200008A590000564000-000000067F0000400200008A590000568000__00000038E67ABFA0 000000067F0000400200008A590000564000-000000067F0000400200008A590000568000__0000003903F1CFE8 000000067F0000400200008A590000564000-000000067F0000400200008A590000568000__0000003B99F7F8A0 000000067F0000400200008A590000564000-000000067F0000400200008A590000568000__0000005D2FFFFB38 000000067F0000400200008A590000564000-000000067F0000400200008A590000568000__00000073AD3FE6B8 000000067F0000400200008A590000564000-000000067F0000400200008A590000568000__000000914E3F38F0 000000067F0000400200008A590000564000-000000067F0000400200008A590000568000__000000931B33AE68 000000067F0000400200008A590000564000-000000067F0000400200008A590000568000__000000931B9AFDF8 
000000067F0000400200008A590000564D5E-000000067F0000400200008A59000056D74C__00000009AC75E659-0000000A4C25FC21 000000067F0000400200008A590000568000-000000067F0000400200008A59000056C000__0000000A7B3FF158 000000067F0000400200008A590000568000-000000067F0000400200008A59000056C000__0000001C760FA190 000000067F0000400200008A590000568000-000000067F0000400200008A59000056C000__00000038E67ABFA0 000000067F0000400200008A590000568000-000000067F0000400200008A59000056C000__0000003903F1CFE8 000000067F0000400200008A590000568000-000000067F0000400200008A59000056C000__0000003B99F7F8A0 000000067F0000400200008A590000568000-000000067F0000400200008A59000056C000__0000005D2FFFFB38 000000067F0000400200008A590000568000-000000067F0000400200008A59000056C000__00000073AD3FE6B8 000000067F0000400200008A590000568000-000000067F0000400200008A59000056C000__000000914E3F38F0 000000067F0000400200008A590000568000-000000067F0000400200008A59000056C000__000000931B33AE68 000000067F0000400200008A590000568000-000000067F0000400200008A59000056C000__000000931B9AFDF8 000000067F0000400200008A59000056C000-000000067F0000400200008A590000570000__0000000A7B3FF158 000000067F0000400200008A59000056C000-000000067F0000400200008A590000570000__0000001C760FA190 000000067F0000400200008A59000056C000-000000067F0000400200008A590000570000__00000038E67ABFA0 000000067F0000400200008A59000056C000-000000067F0000400200008A590000570000__0000003903F1CFE8 000000067F0000400200008A59000056C000-000000067F0000400200008A590000570000__0000003B99F7F8A0 000000067F0000400200008A59000056C000-000000067F0000400200008A590000570000__0000005D2FFFFB38 000000067F0000400200008A59000056C000-000000067F0000400200008A590000570000__00000073AD3FE6B8 000000067F0000400200008A59000056C000-000000067F0000400200008A590000570000__000000914E3F38F0 000000067F0000400200008A59000056C000-000000067F0000400200008A590000570000__000000931B33AE68 000000067F0000400200008A59000056C000-000000067F0000400200008A590000570000__000000931B9AFDF8 
000000067F0000400200008A59000056D74C-000000067F0000400200008A590000576130__00000009AC75E659-0000000A4C25FC21 000000067F0000400200008A590000570000-000000067F0000400200008A590000574000__0000000A7B3FF158 000000067F0000400200008A590000570000-000000067F0000400200008A590000574000__0000001C760FA190 000000067F0000400200008A590000570000-000000067F0000400200008A590000574000__00000038E67ABFA0 000000067F0000400200008A590000570000-000000067F0000400200008A590000574000__0000003903F1CFE8 000000067F0000400200008A590000570000-000000067F0000400200008A590000574000__0000003B99F7F8A0 000000067F0000400200008A590000570000-000000067F0000400200008A590000574000__0000005D2FFFFB38 000000067F0000400200008A590000570000-000000067F0000400200008A590000574000__00000073AD3FE6B8 000000067F0000400200008A590000570000-000000067F0000400200008A590000574000__000000914E3F38F0 000000067F0000400200008A590000570000-000000067F0000400200008A590000574000__000000931B33AE68 000000067F0000400200008A590000570000-000000067F0000400200008A590000574000__000000931B9AFDF8 000000067F0000400200008A590000574000-000000067F0000400200008A590000578000__0000000A7B3FF158 000000067F0000400200008A590000574000-000000067F0000400200008A590000578000__0000001C760FA190 000000067F0000400200008A590000574000-000000067F0000400200008A590000578000__00000038E67ABFA0 000000067F0000400200008A590000574000-000000067F0000400200008A590000578000__0000003903F1CFE8 000000067F0000400200008A590000574000-000000067F0000400200008A590000578000__0000003B99F7F8A0 000000067F0000400200008A590000574000-000000067F0000400200008A590000578000__0000005D2FFFFB38 000000067F0000400200008A590000574000-000000067F0000400200008A590000578000__00000073AD3FE6B8 000000067F0000400200008A590000574000-000000067F0000400200008A590000578000__000000914E3F38F0 000000067F0000400200008A590000574000-000000067F0000400200008A590000578000__000000931B33AE68 000000067F0000400200008A590000574000-000000067F0000400200008A590000578000__000000931B9AFDF8 
000000067F0000400200008A590000576130-000000067F0000400200008A59000057EAFE__00000009AC75E659-0000000A4C25FC21 000000067F0000400200008A590000578000-000000067F0000400200008A59000057C000__0000000A7B3FF158 000000067F0000400200008A590000578000-000000067F0000400200008A59000057C000__0000001C760FA190 000000067F0000400200008A590000578000-000000067F0000400200008A59000057C000__00000038E67ABFA0 000000067F0000400200008A590000578000-000000067F0000400200008A59000057C000__0000003903F1CFE8 000000067F0000400200008A590000578000-000000067F0000400200008A59000057C000__0000003B99F7F8A0 000000067F0000400200008A590000578000-000000067F0000400200008A59000057C000__0000005D2FFFFB38 000000067F0000400200008A590000578000-000000067F0000400200008A59000057C000__00000073AD3FE6B8 000000067F0000400200008A590000578000-000000067F0000400200008A59000057C000__000000914E3F38F0 000000067F0000400200008A590000578000-000000067F0000400200008A59000057C000__000000931B33AE68 000000067F0000400200008A590000578000-000000067F0000400200008A59000057C000__000000931B9AFDF8 000000067F0000400200008A59000057C000-000000067F0000400200008A590000580000__0000000A7B3FF158 000000067F0000400200008A59000057C000-000000067F0000400200008A590000580000__0000001C760FA190 000000067F0000400200008A59000057C000-000000067F0000400200008A590000580000__00000038E67ABFA0 000000067F0000400200008A59000057C000-000000067F0000400200008A590000580000__0000003903F1CFE8 000000067F0000400200008A59000057C000-000000067F0000400200008A590000580000__0000003B99F7F8A0 000000067F0000400200008A59000057C000-000000067F0000400200008A590000580000__0000005D2FFFFB38 000000067F0000400200008A59000057C000-000000067F0000400200008A590000580000__00000073AD3FE6B8 000000067F0000400200008A59000057C000-000000067F0000400200008A590000580000__000000914E3F38F0 000000067F0000400200008A59000057C000-000000067F0000400200008A590000580000__000000931B33AE68 000000067F0000400200008A59000057C000-000000067F0000400200008A590000580000__000000931B9AFDF8 
000000067F0000400200008A59000057EAFE-000000067F0000400200008A5900005874D9__00000009AC75E659-0000000A4C25FC21 000000067F0000400200008A590000580000-000000067F0000400200008A590000584000__0000000A7B3FF158 000000067F0000400200008A590000580000-000000067F0000400200008A590000584000__0000001C760FA190 000000067F0000400200008A590000580000-000000067F0000400200008A590000584000__00000038E67ABFA0 000000067F0000400200008A590000580000-000000067F0000400200008A590000584000__0000003903F1CFE8 000000067F0000400200008A590000580000-000000067F0000400200008A590000584000__0000003B99F7F8A0 000000067F0000400200008A590000580000-000000067F0000400200008A590000584000__0000005D2FFFFB38 000000067F0000400200008A590000580000-000000067F0000400200008A590000584000__00000073AD3FE6B8 000000067F0000400200008A590000580000-000000067F0000400200008A590000584000__000000914E3F38F0 000000067F0000400200008A590000580000-000000067F0000400200008A590000584000__000000931B33AE68 000000067F0000400200008A590000580000-000000067F0000400200008A590000584000__000000931B9AFDF8 000000067F0000400200008A590000584000-000000067F0000400200008A590000588000__0000000A7B3FF158 000000067F0000400200008A590000584000-000000067F0000400200008A590000588000__0000001C760FA190 000000067F0000400200008A590000584000-000000067F0000400200008A590000588000__00000038E67ABFA0 000000067F0000400200008A590000584000-000000067F0000400200008A590000588000__0000003903F1CFE8 000000067F0000400200008A590000584000-000000067F0000400200008A590000588000__0000003B99F7F8A0 000000067F0000400200008A590000584000-000000067F0000400200008A590000588000__0000005D2FFFFB38 000000067F0000400200008A590000584000-000000067F0000400200008A590000588000__00000073AD3FE6B8 000000067F0000400200008A590000584000-000000067F0000400200008A590000588000__000000914E3F38F0 000000067F0000400200008A590000584000-000000067F0000400200008A590000588000__000000931B33AE68 000000067F0000400200008A590000584000-000000067F0000400200008A590000588000__000000931B9AFDF8 
000000067F0000400200008A5900005874D9-000000067F0000400200008A59000058FEA7__00000009AC75E659-0000000A4C25FC21 000000067F0000400200008A590000588000-000000067F0000400200008A59000058C000__0000000A7B3FF158 000000067F0000400200008A590000588000-000000067F0000400200008A59000058C000__0000001C760FA190 000000067F0000400200008A590000588000-000000067F0000400200008A59000058C000__00000038E67ABFA0 000000067F0000400200008A590000588000-000000067F0000400200008A59000058C000__0000003903F1CFE8 000000067F0000400200008A590000588000-000000067F0000400200008A59000058C000__0000003B99F7F8A0 000000067F0000400200008A590000588000-000000067F0000400200008A59000058C000__0000005D2FFFFB38 000000067F0000400200008A590000588000-000000067F0000400200008A59000058C000__00000073AD3FE6B8 000000067F0000400200008A590000588000-000000067F0000400200008A59000058C000__000000914E3F38F0 000000067F0000400200008A590000588000-000000067F0000400200008A59000058C000__000000931B33AE68 000000067F0000400200008A590000588000-000000067F0000400200008A59000058C000__000000931B9AFDF8 000000067F0000400200008A59000058C000-000000067F0000400200008A590000590000__0000000A7B3FF158 000000067F0000400200008A59000058C000-000000067F0000400200008A590000590000__0000001C760FA190 000000067F0000400200008A59000058C000-000000067F0000400200008A590000590000__00000038E67ABFA0 000000067F0000400200008A59000058C000-000000067F0000400200008A590000590000__0000003903F1CFE8 000000067F0000400200008A59000058C000-000000067F0000400200008A590000590000__0000003B99F7F8A0 000000067F0000400200008A59000058C000-000000067F0000400200008A590000590000__0000005D2FFFFB38 000000067F0000400200008A59000058C000-000000067F0000400200008A590000590000__00000073AD3FE6B8 000000067F0000400200008A59000058C000-000000067F0000400200008A590000590000__000000914E3F38F0 000000067F0000400200008A59000058C000-000000067F0000400200008A590000590000__000000931B33AE68 000000067F0000400200008A59000058C000-000000067F0000400200008A590000590000__000000931B9AFDF8 
000000067F0000400200008A59000058FEA7-000000067F0000400200008A590100000000__00000009AC75E659-0000000A4C25FC21 000000067F0000400200008A590000590000-000000067F0000400200008A590000594000__0000000A7B3FF158 000000067F0000400200008A590000590000-000000067F0000400200008A590000594000__0000001C760FA190 000000067F0000400200008A590000590000-000000067F0000400200008A590000594000__00000038E67ABFA0 000000067F0000400200008A590000590000-000000067F0000400200008A590000594000__0000003903F1CFE8 000000067F0000400200008A590000590000-000000067F0000400200008A590000594000__0000003B99F7F8A0 000000067F0000400200008A590000590000-000000067F0000400200008A590000594000__0000005D2FFFFB38 000000067F0000400200008A590000590000-000000067F0000400200008A590000594000__00000073AD3FE6B8 000000067F0000400200008A590000590000-000000067F0000400200008A590000594000__000000914E3F38F0 000000067F0000400200008A590000590000-000000067F0000400200008A590000594000__000000931B33AE68 000000067F0000400200008A590000590000-000000067F0000400200008A590000594000__000000931B9AFDF8 000000067F0000400200008A590000590185-000000067F0000400200008A590000598B56__0000000A4C25FC21-0000000AEBD5F889 000000067F0000400200008A590000594000-000000067F0000400200008A590000598000__0000000A7B3FF158 000000067F0000400200008A590000594000-000000067F0000400200008A590000598000__0000001C760FA190 000000067F0000400200008A590000594000-000000067F0000400200008A590000598000__00000038E67ABFA0 000000067F0000400200008A590000594000-000000067F0000400200008A590000598000__0000003903F1CFE8 000000067F0000400200008A590000594000-000000067F0000400200008A590000598000__0000003B99F7F8A0 000000067F0000400200008A590000594000-000000067F0000400200008A590000598000__0000005D2FFFFB38 000000067F0000400200008A590000594000-000000067F0000400200008A590000598000__00000073AD3FE6B8 000000067F0000400200008A590000594000-000000067F0000400200008A590000598000__000000914E3F38F0 000000067F0000400200008A590000594000-000000067F0000400200008A590000598000__000000931B33AE68 
000000067F0000400200008A590000594000-000000067F0000400200008A590000598000__000000931B9AFDF8 000000067F0000400200008A590000598000-000000067F0000400200008A59000059C000__0000000A7B3FF158 000000067F0000400200008A590000598000-000000067F0000400200008A59000059C000__0000001C760FA190 000000067F0000400200008A590000598000-000000067F0000400200008A59000059C000__00000038E67ABFA0 000000067F0000400200008A590000598000-000000067F0000400200008A59000059C000__0000003903F1CFE8 000000067F0000400200008A590000598000-000000067F0000400200008A59000059C000__0000003B99F7F8A0 000000067F0000400200008A590000598000-000000067F0000400200008A59000059C000__0000005D2FFFFB38 000000067F0000400200008A590000598000-000000067F0000400200008A59000059C000__00000073AD3FE6B8 000000067F0000400200008A590000598000-000000067F0000400200008A59000059C000__000000914E3F38F0 000000067F0000400200008A590000598000-000000067F0000400200008A59000059C000__000000931B33AE68 000000067F0000400200008A590000598000-000000067F0000400200008A59000059C000__000000931B9AFDF8 000000067F0000400200008A590000598B56-000000067F0000400200008A5900005A153E__0000000A4C25FC21-0000000AEBD5F889 000000067F0000400200008A59000059C000-000000067F0000400200008A5900005A0000__0000000A7B3FF158 000000067F0000400200008A59000059C000-000000067F0000400200008A5900005A0000__0000001C760FA190 000000067F0000400200008A59000059C000-000000067F0000400200008A5900005A0000__00000038E67ABFA0 000000067F0000400200008A59000059C000-000000067F0000400200008A5900005A0000__0000003903F1CFE8 000000067F0000400200008A59000059C000-000000067F0000400200008A5900005A0000__0000003B99F7F8A0 000000067F0000400200008A59000059C000-000000067F0000400200008A5900005A0000__0000005D2FFFFB38 000000067F0000400200008A59000059C000-000000067F0000400200008A5900005A0000__00000073AD3FE6B8 000000067F0000400200008A59000059C000-000000067F0000400200008A5900005A0000__000000914E3F38F0 000000067F0000400200008A59000059C000-000000067F0000400200008A5900005A0000__000000931B33AE68 
000000067F0000400200008A59000059C000-000000067F0000400200008A5900005A0000__000000931B9AFDF8 000000067F0000400200008A5900005A0000-000000067F0000400200008A5900005A4000__0000000A7B3FF158 000000067F0000400200008A5900005A0000-000000067F0000400200008A5900005A4000__0000001C760FA190 000000067F0000400200008A5900005A0000-000000067F0000400200008A5900005A4000__00000038E67ABFA0 000000067F0000400200008A5900005A0000-000000067F0000400200008A5900005A4000__0000003903F1CFE8 000000067F0000400200008A5900005A0000-000000067F0000400200008A5900005A4000__0000003B99F7F8A0 000000067F0000400200008A5900005A0000-000000067F0000400200008A5900005A4000__0000005D2FFFFB38 000000067F0000400200008A5900005A0000-000000067F0000400200008A5900005A4000__00000073AD3FE6B8 000000067F0000400200008A5900005A0000-000000067F0000400200008A5900005A4000__000000914E3F38F0 000000067F0000400200008A5900005A0000-000000067F0000400200008A5900005A4000__000000931B33AE68 000000067F0000400200008A5900005A0000-000000067F0000400200008A5900005A4000__000000931B9AFDF8 000000067F0000400200008A5900005A153E-000000067F0000400200008A5900005A9F2C__0000000A4C25FC21-0000000AEBD5F889 000000067F0000400200008A5900005A4000-000000067F0000400200008A5900005A8000__0000000A7B3FF158 000000067F0000400200008A5900005A4000-000000067F0000400200008A5900005A8000__0000001C760FA190 000000067F0000400200008A5900005A4000-000000067F0000400200008A5900005A8000__00000038E67ABFA0 000000067F0000400200008A5900005A4000-000000067F0000400200008A5900005A8000__0000003903F1CFE8 000000067F0000400200008A5900005A4000-000000067F0000400200008A5900005A8000__0000003B99F7F8A0 000000067F0000400200008A5900005A4000-000000067F0000400200008A5900005A8000__0000005D2FFFFB38 000000067F0000400200008A5900005A4000-000000067F0000400200008A5900005A8000__00000073AD3FE6B8 000000067F0000400200008A5900005A4000-000000067F0000400200008A5900005A8000__000000914E3F38F0 000000067F0000400200008A5900005A4000-000000067F0000400200008A5900005A8000__000000931B33AE68 
000000067F0000400200008A5900005A4000-000000067F0000400200008A5900005A8000__000000931B9AFDF8 000000067F0000400200008A5900005A8000-000000067F0000400200008A5900005AC000__0000001C760FA190 000000067F0000400200008A5900005A8000-000000067F0000400200008A5900005AC000__00000038E67ABFA0 000000067F0000400200008A5900005A8000-000000067F0000400200008A5900005AC000__0000003903F1CFE8 000000067F0000400200008A5900005A8000-000000067F0000400200008A5900005AC000__0000003B99F7F8A0 000000067F0000400200008A5900005A8000-000000067F0000400200008A5900005AC000__0000005D2FFFFB38 000000067F0000400200008A5900005A8000-000000067F0000400200008A5900005AC000__00000073AD3FE6B8 000000067F0000400200008A5900005A8000-000000067F0000400200008A5900005AC000__000000914E3F38F0 000000067F0000400200008A5900005A8000-000000067F0000400200008A5900005AC000__000000931B33AE68 000000067F0000400200008A5900005A8000-000000067F0000400200008A5900005AC000__000000931B9AFDF8 000000067F0000400200008A5900005A8000-030000000000000000000000000000000002__0000000A7B3FF158 000000067F0000400200008A5900005A9F2C-000000067F0000400200008A5900005B290F__0000000A4C25FC21-0000000AEBD5F889 000000067F0000400200008A5900005AC000-000000067F0000400200008A5900005B0000__0000001C760FA190 000000067F0000400200008A5900005AC000-000000067F0000400200008A5900005B0000__00000038E67ABFA0 000000067F0000400200008A5900005AC000-000000067F0000400200008A5900005B0000__0000003903F1CFE8 000000067F0000400200008A5900005AC000-000000067F0000400200008A5900005B0000__0000003B99F7F8A0 000000067F0000400200008A5900005AC000-000000067F0000400200008A5900005B0000__0000005D2FFFFB38 000000067F0000400200008A5900005AC000-000000067F0000400200008A5900005B0000__00000073AD3FE6B8 000000067F0000400200008A5900005AC000-000000067F0000400200008A5900005B0000__000000914E3F38F0 000000067F0000400200008A5900005AC000-000000067F0000400200008A5900005B0000__000000931B33AE68 000000067F0000400200008A5900005AC000-000000067F0000400200008A5900005B0000__000000931B9AFDF8 
000000067F0000400200008A5900005B0000-000000067F0000400200008A5900005B4000__0000001C760FA190 000000067F0000400200008A5900005B0000-000000067F0000400200008A5900005B4000__00000038E67ABFA0 000000067F0000400200008A5900005B0000-000000067F0000400200008A5900005B4000__0000003903F1CFE8 000000067F0000400200008A5900005B0000-000000067F0000400200008A5900005B4000__0000003B99F7F8A0 000000067F0000400200008A5900005B0000-000000067F0000400200008A5900005B4000__0000005D2FFFFB38 000000067F0000400200008A5900005B0000-000000067F0000400200008A5900005B4000__00000073AD3FE6B8 000000067F0000400200008A5900005B0000-000000067F0000400200008A5900005B4000__000000914E3F38F0 000000067F0000400200008A5900005B0000-000000067F0000400200008A5900005B4000__000000931B33AE68 000000067F0000400200008A5900005B0000-000000067F0000400200008A5900005B4000__000000931B9AFDF8 000000067F0000400200008A5900005B290F-000000067F0000400200008A5900005BB2DB__0000000A4C25FC21-0000000AEBD5F889 000000067F0000400200008A5900005B4000-000000067F0000400200008A5900005B8000__0000001C760FA190 000000067F0000400200008A5900005B4000-000000067F0000400200008A5900005B8000__00000038E67ABFA0 000000067F0000400200008A5900005B4000-000000067F0000400200008A5900005B8000__0000003903F1CFE8 000000067F0000400200008A5900005B4000-000000067F0000400200008A5900005B8000__0000003B99F7F8A0 000000067F0000400200008A5900005B4000-000000067F0000400200008A5900005B8000__0000005D2FFFFB38 000000067F0000400200008A5900005B4000-000000067F0000400200008A5900005B8000__00000073AD3FE6B8 000000067F0000400200008A5900005B4000-000000067F0000400200008A5900005B8000__000000914E3F38F0 000000067F0000400200008A5900005B4000-000000067F0000400200008A5900005B8000__000000931B33AE68 000000067F0000400200008A5900005B4000-000000067F0000400200008A5900005B8000__000000931B9AFDF8 000000067F0000400200008A5900005B8000-000000067F0000400200008A5900005BC000__0000001C760FA190 000000067F0000400200008A5900005B8000-000000067F0000400200008A5900005BC000__00000038E67ABFA0 
000000067F0000400200008A5900005B8000-000000067F0000400200008A5900005BC000__0000003903F1CFE8 000000067F0000400200008A5900005B8000-000000067F0000400200008A5900005BC000__0000003B99F7F8A0 000000067F0000400200008A5900005B8000-000000067F0000400200008A5900005BC000__0000005D2FFFFB38 000000067F0000400200008A5900005B8000-000000067F0000400200008A5900005BC000__00000073AD3FE6B8 000000067F0000400200008A5900005B8000-000000067F0000400200008A5900005BC000__000000914E3F38F0 000000067F0000400200008A5900005B8000-000000067F0000400200008A5900005BC000__000000931B33AE68 000000067F0000400200008A5900005B8000-000000067F0000400200008A5900005BC000__000000931B9AFDF8 000000067F0000400200008A5900005BB2DB-000000067F0000400200008A5900005C3CB1__0000000A4C25FC21-0000000AEBD5F889 000000067F0000400200008A5900005BC000-000000067F0000400200008A5900005C0000__0000001C760FA190 000000067F0000400200008A5900005BC000-000000067F0000400200008A5900005C0000__00000038E67ABFA0 000000067F0000400200008A5900005BC000-000000067F0000400200008A5900005C0000__0000003903F1CFE8 000000067F0000400200008A5900005BC000-000000067F0000400200008A5900005C0000__0000003B99F7F8A0 000000067F0000400200008A5900005BC000-000000067F0000400200008A5900005C0000__0000005D2FFFFB38 000000067F0000400200008A5900005BC000-000000067F0000400200008A5900005C0000__00000073AD3FE6B8 000000067F0000400200008A5900005BC000-000000067F0000400200008A5900005C0000__000000914E3F38F0 000000067F0000400200008A5900005BC000-000000067F0000400200008A5900005C0000__000000931B33AE68 000000067F0000400200008A5900005BC000-000000067F0000400200008A5900005C0000__000000931B9AFDF8 000000067F0000400200008A5900005C0000-000000067F0000400200008A5900005C4000__0000001C760FA190 000000067F0000400200008A5900005C0000-000000067F0000400200008A5900005C4000__00000038E67ABFA0 000000067F0000400200008A5900005C0000-000000067F0000400200008A5900005C4000__0000003903F1CFE8 000000067F0000400200008A5900005C0000-000000067F0000400200008A5900005C4000__0000003B99F7F8A0 
000000067F0000400200008A5900005C0000-000000067F0000400200008A5900005C4000__0000005D2FFFFB38 000000067F0000400200008A5900005C0000-000000067F0000400200008A5900005C4000__00000073AD3FE6B8 000000067F0000400200008A5900005C0000-000000067F0000400200008A5900005C4000__000000914E3F38F0 000000067F0000400200008A5900005C0000-000000067F0000400200008A5900005C4000__000000931B33AE68 000000067F0000400200008A5900005C0000-000000067F0000400200008A5900005C4000__000000931B9AFDF8 000000067F0000400200008A5900005C3CB1-000000067F0000400200008A5900005CC678__0000000A4C25FC21-0000000AEBD5F889 000000067F0000400200008A5900005C4000-000000067F0000400200008A5900005C8000__0000001C760FA190 000000067F0000400200008A5900005C4000-000000067F0000400200008A5900005C8000__00000038E67ABFA0 000000067F0000400200008A5900005C4000-000000067F0000400200008A5900005C8000__0000003903F1CFE8 000000067F0000400200008A5900005C4000-000000067F0000400200008A5900005C8000__0000003B99F7F8A0 000000067F0000400200008A5900005C4000-000000067F0000400200008A5900005C8000__0000005D2FFFFB38 000000067F0000400200008A5900005C4000-000000067F0000400200008A5900005C8000__00000073AD3FE6B8 000000067F0000400200008A5900005C4000-000000067F0000400200008A5900005C8000__000000914E3F38F0 000000067F0000400200008A5900005C4000-000000067F0000400200008A5900005C8000__000000931B33AE68 000000067F0000400200008A5900005C4000-000000067F0000400200008A5900005C8000__000000931B9AFDF8 000000067F0000400200008A5900005C8000-000000067F0000400200008A5900005CC000__0000001C760FA190 000000067F0000400200008A5900005C8000-000000067F0000400200008A5900005CC000__00000038E67ABFA0 000000067F0000400200008A5900005C8000-000000067F0000400200008A5900005CC000__0000003903F1CFE8 000000067F0000400200008A5900005C8000-000000067F0000400200008A5900005CC000__0000003B99F7F8A0 000000067F0000400200008A5900005C8000-000000067F0000400200008A5900005CC000__0000005D2FFFFB38 000000067F0000400200008A5900005C8000-000000067F0000400200008A5900005CC000__00000073AD3FE6B8 
000000067F0000400200008A5900005C8000-000000067F0000400200008A5900005CC000__000000914E3F38F0 000000067F0000400200008A5900005C8000-000000067F0000400200008A5900005CC000__000000931B33AE68 000000067F0000400200008A5900005C8000-000000067F0000400200008A5900005CC000__000000931B9AFDF8 000000067F0000400200008A5900005CC000-000000067F0000400200008A5900005D0000__0000001C760FA190 000000067F0000400200008A5900005CC000-000000067F0000400200008A5900005D0000__00000038E67ABFA0 000000067F0000400200008A5900005CC000-000000067F0000400200008A5900005D0000__0000003903F1CFE8 000000067F0000400200008A5900005CC000-000000067F0000400200008A5900005D0000__0000003B99F7F8A0 000000067F0000400200008A5900005CC000-000000067F0000400200008A5900005D0000__0000005D2FFFFB38 000000067F0000400200008A5900005CC000-000000067F0000400200008A5900005D0000__00000073AD3FE6B8 000000067F0000400200008A5900005CC000-000000067F0000400200008A5900005D0000__000000914E3F38F0 000000067F0000400200008A5900005CC000-000000067F0000400200008A5900005D0000__000000931B33AE68 000000067F0000400200008A5900005CC000-000000067F0000400200008A5900005D0000__000000931B9AFDF8 000000067F0000400200008A5900005CC678-000000067F0000400200008A5900005D5052__0000000A4C25FC21-0000000AEBD5F889 000000067F0000400200008A5900005D0000-000000067F0000400200008A5900005D4000__0000001C760FA190 000000067F0000400200008A5900005D0000-000000067F0000400200008A5900005D4000__00000038E67ABFA0 000000067F0000400200008A5900005D0000-000000067F0000400200008A5900005D4000__0000003903F1CFE8 000000067F0000400200008A5900005D0000-000000067F0000400200008A5900005D4000__0000003B99F7F8A0 000000067F0000400200008A5900005D0000-000000067F0000400200008A5900005D4000__0000005D2FFFFB38 000000067F0000400200008A5900005D0000-000000067F0000400200008A5900005D4000__00000073AD3FE6B8 000000067F0000400200008A5900005D0000-000000067F0000400200008A5900005D4000__000000914E3F38F0 000000067F0000400200008A5900005D0000-000000067F0000400200008A5900005D4000__000000931B33AE68 
000000067F0000400200008A5900005D0000-000000067F0000400200008A5900005D4000__000000931B9AFDF8 000000067F0000400200008A5900005D4000-000000067F0000400200008A5900005D8000__0000001C760FA190 000000067F0000400200008A5900005D4000-000000067F0000400200008A5900005D8000__00000038E67ABFA0 000000067F0000400200008A5900005D4000-000000067F0000400200008A5900005D8000__0000003903F1CFE8 000000067F0000400200008A5900005D4000-000000067F0000400200008A5900005D8000__0000003B99F7F8A0 000000067F0000400200008A5900005D4000-000000067F0000400200008A5900005D8000__0000005D2FFFFB38 000000067F0000400200008A5900005D4000-000000067F0000400200008A5900005D8000__00000073AD3FE6B8 000000067F0000400200008A5900005D4000-000000067F0000400200008A5900005D8000__000000914E3F38F0 000000067F0000400200008A5900005D4000-000000067F0000400200008A5900005D8000__000000931B33AE68 000000067F0000400200008A5900005D4000-000000067F0000400200008A5900005D8000__000000931B9AFDF8 000000067F0000400200008A5900005D5052-000000067F0000400200008A5900005DDA38__0000000A4C25FC21-0000000AEBD5F889 000000067F0000400200008A5900005D8000-000000067F0000400200008A5900005DC000__0000001C760FA190 000000067F0000400200008A5900005D8000-000000067F0000400200008A5900005DC000__00000038E67ABFA0 000000067F0000400200008A5900005D8000-000000067F0000400200008A5900005DC000__0000003903F1CFE8 000000067F0000400200008A5900005D8000-000000067F0000400200008A5900005DC000__0000003B99F7F8A0 000000067F0000400200008A5900005D8000-000000067F0000400200008A5900005DC000__0000005D2FFFFB38 000000067F0000400200008A5900005D8000-000000067F0000400200008A5900005DC000__00000073AD3FE6B8 000000067F0000400200008A5900005D8000-000000067F0000400200008A5900005DC000__000000914E3F38F0 000000067F0000400200008A5900005D8000-000000067F0000400200008A5900005DC000__000000931B33AE68 000000067F0000400200008A5900005D8000-000000067F0000400200008A5900005DC000__000000931B9AFDF8 000000067F0000400200008A5900005DC000-000000067F0000400200008A5900005E0000__0000001C760FA190 
000000067F0000400200008A5900005DC000-000000067F0000400200008A5900005E0000__00000038E67ABFA0 000000067F0000400200008A5900005DC000-000000067F0000400200008A5900005E0000__0000003903F1CFE8 000000067F0000400200008A5900005DC000-000000067F0000400200008A5900005E0000__0000003B99F7F8A0 000000067F0000400200008A5900005DC000-000000067F0000400200008A5900005E0000__0000005D2FFFFB38 000000067F0000400200008A5900005DC000-000000067F0000400200008A5900005E0000__00000073AD3FE6B8 000000067F0000400200008A5900005DC000-000000067F0000400200008A5900005E0000__000000914E3F38F0 000000067F0000400200008A5900005DC000-000000067F0000400200008A5900005E0000__000000931B33AE68 000000067F0000400200008A5900005DC000-000000067F0000400200008A5900005E0000__000000931B9AFDF8 000000067F0000400200008A5900005DDA38-000000067F0000400200008A5900005E6422__0000000A4C25FC21-0000000AEBD5F889 000000067F0000400200008A5900005E0000-000000067F0000400200008A5900005E4000__0000001C760FA190 000000067F0000400200008A5900005E0000-000000067F0000400200008A5900005E4000__00000038E67ABFA0 000000067F0000400200008A5900005E0000-000000067F0000400200008A5900005E4000__0000003903F1CFE8 000000067F0000400200008A5900005E0000-000000067F0000400200008A5900005E4000__0000003B99F7F8A0 000000067F0000400200008A5900005E0000-000000067F0000400200008A5900005E4000__0000005D2FFFFB38 000000067F0000400200008A5900005E0000-000000067F0000400200008A5900005E4000__00000073AD3FE6B8 000000067F0000400200008A5900005E0000-000000067F0000400200008A5900005E4000__000000914E3F38F0 000000067F0000400200008A5900005E0000-000000067F0000400200008A5900005E4000__000000931B33AE68 000000067F0000400200008A5900005E0000-000000067F0000400200008A5900005E4000__000000931B9AFDF8 000000067F0000400200008A5900005E4000-000000067F0000400200008A5900005E8000__0000001C725A2400 000000067F0000400200008A5900005E4000-000000067F0000400200008A5900005E8000__0000001C760FA190 000000067F0000400200008A5900005E4000-000000067F0000400200008A5900005E8000__00000038E67ABFA0 
000000067F0000400200008A5900005E4000-000000067F0000400200008A5900005E8000__0000003903F1CFE8 000000067F0000400200008A5900005E4000-000000067F0000400200008A5900005E8000__0000003B99F7F8A0 000000067F0000400200008A5900005E4000-000000067F0000400200008A5900005E8000__0000005D2FFFFB38 000000067F0000400200008A5900005E4000-000000067F0000400200008A5900005E8000__00000073AD3FE6B8 000000067F0000400200008A5900005E4000-000000067F0000400200008A5900005E8000__000000914E3F38F0 000000067F0000400200008A5900005E4000-000000067F0000400200008A5900005E8000__000000931B33AE68 000000067F0000400200008A5900005E4000-000000067F0000400200008A5900005E8000__000000931B9AFDF8 000000067F0000400200008A5900005E6422-000000067F0000400200008A590100000000__0000000A4C25FC21-0000000AEBD5F889 000000067F0000400200008A5900005E670E-000000067F0000400200008A5900005EF0E7__0000000AEBD5F889-0000000B8B85DC91 000000067F0000400200008A5900005E8000-000000067F0000400200008A5900005EC000__0000001C725A2400 000000067F0000400200008A5900005E8000-000000067F0000400200008A5900005EC000__0000001C760FA190 000000067F0000400200008A5900005E8000-000000067F0000400200008A5900005EC000__00000038E67ABFA0 000000067F0000400200008A5900005E8000-000000067F0000400200008A5900005EC000__0000003903F1CFE8 000000067F0000400200008A5900005E8000-000000067F0000400200008A5900005EC000__0000003B99F7F8A0 000000067F0000400200008A5900005E8000-000000067F0000400200008A5900005EC000__0000005D2FFFFB38 000000067F0000400200008A5900005E8000-000000067F0000400200008A5900005EC000__00000073AD3FE6B8 000000067F0000400200008A5900005E8000-000000067F0000400200008A5900005EC000__000000914E3F38F0 000000067F0000400200008A5900005E8000-000000067F0000400200008A5900005EC000__000000931B33AE68 000000067F0000400200008A5900005E8000-000000067F0000400200008A5900005EC000__000000931B9AFDF8 000000067F0000400200008A5900005EC000-000000067F0000400200008A5900005F0000__0000001C725A2400 000000067F0000400200008A5900005EC000-000000067F0000400200008A5900005F0000__0000001C760FA190 
000000067F0000400200008A5900005EC000-000000067F0000400200008A5900005F0000__00000038E67ABFA0 000000067F0000400200008A5900005EC000-000000067F0000400200008A5900005F0000__0000003903F1CFE8 000000067F0000400200008A5900005EC000-000000067F0000400200008A5900005F0000__0000003B99F7F8A0 000000067F0000400200008A5900005EC000-000000067F0000400200008A5900005F0000__0000005D2FFFFB38 000000067F0000400200008A5900005EC000-000000067F0000400200008A5900005F0000__00000073AD3FE6B8 000000067F0000400200008A5900005EC000-000000067F0000400200008A5900005F0000__000000914E3F38F0 000000067F0000400200008A5900005EC000-000000067F0000400200008A5900005F0000__000000931B33AE68 000000067F0000400200008A5900005EC000-000000067F0000400200008A5900005F0000__000000931B9AFDF8 000000067F0000400200008A5900005EF0E7-000000067F0000400200008A5900005F7AC2__0000000AEBD5F889-0000000B8B85DC91 000000067F0000400200008A5900005F0000-000000067F0000400200008A5900005F4000__0000001C725A2400 000000067F0000400200008A5900005F0000-000000067F0000400200008A5900005F4000__0000001C760FA190 000000067F0000400200008A5900005F0000-000000067F0000400200008A5900005F4000__00000038E67ABFA0 000000067F0000400200008A5900005F0000-000000067F0000400200008A5900005F4000__0000003903F1CFE8 000000067F0000400200008A5900005F0000-000000067F0000400200008A5900005F4000__0000003B99F7F8A0 000000067F0000400200008A5900005F0000-000000067F0000400200008A5900005F4000__0000005D2FFFFB38 000000067F0000400200008A5900005F0000-000000067F0000400200008A5900005F4000__00000073AD3FE6B8 000000067F0000400200008A5900005F0000-000000067F0000400200008A5900005F4000__000000914E3F38F0 000000067F0000400200008A5900005F0000-000000067F0000400200008A5900005F4000__000000931B33AE68 000000067F0000400200008A5900005F0000-000000067F0000400200008A5900005F4000__000000931B9AFDF8 000000067F0000400200008A5900005F4000-000000067F0000400200008A5900005F8000__0000001C725A2400 000000067F0000400200008A5900005F4000-000000067F0000400200008A5900005F8000__0000001C760FA190 
000000067F0000400200008A5900005F4000-000000067F0000400200008A5900005F8000__00000038E67ABFA0 000000067F0000400200008A5900005F4000-000000067F0000400200008A5900005F8000__0000003903F1CFE8 000000067F0000400200008A5900005F4000-000000067F0000400200008A5900005F8000__0000003B99F7F8A0 000000067F0000400200008A5900005F4000-000000067F0000400200008A5900005F8000__0000005D2FFFFB38 000000067F0000400200008A5900005F4000-000000067F0000400200008A5900005F8000__00000073AD3FE6B8 000000067F0000400200008A5900005F4000-000000067F0000400200008A5900005F8000__000000914E3F38F0 000000067F0000400200008A5900005F4000-000000067F0000400200008A5900005F8000__000000931B33AE68 000000067F0000400200008A5900005F4000-000000067F0000400200008A5900005F8000__000000931B9AFDF8 000000067F0000400200008A5900005F7AC2-000000067F0000400200008A590000600494__0000000AEBD5F889-0000000B8B85DC91 000000067F0000400200008A5900005F8000-000000067F0000400200008A5900005FC000__0000001C725A2400 000000067F0000400200008A5900005F8000-000000067F0000400200008A5900005FC000__0000001C760FA190 000000067F0000400200008A5900005F8000-000000067F0000400200008A5900005FC000__00000038E67ABFA0 000000067F0000400200008A5900005F8000-000000067F0000400200008A5900005FC000__0000003903F1CFE8 000000067F0000400200008A5900005F8000-000000067F0000400200008A5900005FC000__0000003B99F7F8A0 000000067F0000400200008A5900005F8000-000000067F0000400200008A5900005FC000__0000005D2FFFFB38 000000067F0000400200008A5900005F8000-000000067F0000400200008A5900005FC000__00000073AD3FE6B8 000000067F0000400200008A5900005F8000-000000067F0000400200008A5900005FC000__000000914E3F38F0 000000067F0000400200008A5900005F8000-000000067F0000400200008A5900005FC000__000000931B33AE68 000000067F0000400200008A5900005F8000-000000067F0000400200008A5900005FC000__000000931B9AFDF8 000000067F0000400200008A5900005FC000-000000067F0000400200008A590000600000__0000001C725A2400 000000067F0000400200008A5900005FC000-000000067F0000400200008A590000600000__0000001C760FA190 
000000067F0000400200008A5900005FC000-000000067F0000400200008A590000600000__00000038E67ABFA0 000000067F0000400200008A5900005FC000-000000067F0000400200008A590000600000__0000003903F1CFE8 000000067F0000400200008A5900005FC000-000000067F0000400200008A590000600000__0000003B99F7F8A0 000000067F0000400200008A5900005FC000-000000067F0000400200008A590000600000__0000005D2FFFFB38 000000067F0000400200008A5900005FC000-000000067F0000400200008A590000600000__00000073AD3FE6B8 000000067F0000400200008A5900005FC000-000000067F0000400200008A590000600000__000000914E3F38F0 000000067F0000400200008A5900005FC000-000000067F0000400200008A590000600000__000000931B33AE68 000000067F0000400200008A5900005FC000-000000067F0000400200008A590000600000__000000931B9AFDF8 000000067F0000400200008A590000600000-000000067F0000400200008A590000604000__0000001C725A2400 000000067F0000400200008A590000600000-000000067F0000400200008A590000604000__0000001C760FA190 000000067F0000400200008A590000600000-000000067F0000400200008A590000604000__00000038E67ABFA0 000000067F0000400200008A590000600000-000000067F0000400200008A590000604000__0000003903F1CFE8 000000067F0000400200008A590000600000-000000067F0000400200008A590000604000__0000003B99F7F8A0 000000067F0000400200008A590000600000-000000067F0000400200008A590000604000__0000005D2FFFFB38 000000067F0000400200008A590000600000-000000067F0000400200008A590000604000__00000073AD3FE6B8 000000067F0000400200008A590000600000-000000067F0000400200008A590000604000__000000914E3F38F0 000000067F0000400200008A590000600000-000000067F0000400200008A590000604000__000000931B33AE68 000000067F0000400200008A590000600000-000000067F0000400200008A590000604000__000000931B9AFDF8 000000067F0000400200008A590000600494-000000067F0000400200008A590000608E5C__0000000AEBD5F889-0000000B8B85DC91 000000067F0000400200008A590000604000-000000067F0000400200008A590000608000__0000001C725A2400 000000067F0000400200008A590000604000-000000067F0000400200008A590000608000__0000001C760FA190 
000000067F0000400200008A590000604000-000000067F0000400200008A590000608000__00000038E67ABFA0 000000067F0000400200008A590000604000-000000067F0000400200008A590000608000__0000003903F1CFE8 000000067F0000400200008A590000604000-000000067F0000400200008A590000608000__0000003B99F7F8A0 000000067F0000400200008A590000604000-000000067F0000400200008A590000608000__0000005D2FFFFB38 000000067F0000400200008A590000604000-000000067F0000400200008A590000608000__00000073AD3FE6B8 000000067F0000400200008A590000604000-000000067F0000400200008A590000608000__000000914E3F38F0 000000067F0000400200008A590000604000-000000067F0000400200008A590000608000__000000931B33AE68 000000067F0000400200008A590000604000-000000067F0000400200008A590000608000__000000931B9AFDF8 000000067F0000400200008A590000608000-000000067F0000400200008A59000060C000__0000001C725A2400 000000067F0000400200008A590000608000-000000067F0000400200008A59000060C000__0000001C760FA190 000000067F0000400200008A590000608000-000000067F0000400200008A59000060C000__00000038E67ABFA0 000000067F0000400200008A590000608000-000000067F0000400200008A59000060C000__0000003903F1CFE8 000000067F0000400200008A590000608000-000000067F0000400200008A59000060C000__0000003B99F7F8A0 000000067F0000400200008A590000608000-000000067F0000400200008A59000060C000__0000005D2FFFFB38 000000067F0000400200008A590000608000-000000067F0000400200008A59000060C000__00000073AD3FE6B8 000000067F0000400200008A590000608000-000000067F0000400200008A59000060C000__000000914E3F38F0 000000067F0000400200008A590000608000-000000067F0000400200008A59000060C000__000000931B33AE68 000000067F0000400200008A590000608000-000000067F0000400200008A59000060C000__000000931B9AFDF8 000000067F0000400200008A590000608E5C-000000067F0000400200008A590000611840__0000000AEBD5F889-0000000B8B85DC91 000000067F0000400200008A59000060C000-000000067F0000400200008A590000610000__0000001C725A2400 000000067F0000400200008A59000060C000-000000067F0000400200008A590000610000__0000001C760FA190 
000000067F0000400200008A59000060C000-000000067F0000400200008A590000610000__00000038E67ABFA0 000000067F0000400200008A59000060C000-000000067F0000400200008A590000610000__0000003903F1CFE8 000000067F0000400200008A59000060C000-000000067F0000400200008A590000610000__0000003B99F7F8A0 000000067F0000400200008A59000060C000-000000067F0000400200008A590000610000__0000005D2FFFFB38 000000067F0000400200008A59000060C000-000000067F0000400200008A590000610000__00000073AD3FE6B8 000000067F0000400200008A59000060C000-000000067F0000400200008A590000610000__000000914E3F38F0 000000067F0000400200008A59000060C000-000000067F0000400200008A590000610000__000000931B33AE68 000000067F0000400200008A59000060C000-000000067F0000400200008A590000610000__000000931B9AFDF8 000000067F0000400200008A590000610000-000000067F0000400200008A590000614000__0000001C725A2400 000000067F0000400200008A590000610000-000000067F0000400200008A590000614000__0000001C760FA190 000000067F0000400200008A590000610000-000000067F0000400200008A590000614000__00000038E67ABFA0 000000067F0000400200008A590000610000-000000067F0000400200008A590000614000__0000003903F1CFE8 000000067F0000400200008A590000610000-000000067F0000400200008A590000614000__0000003B99F7F8A0 000000067F0000400200008A590000610000-000000067F0000400200008A590000614000__0000005D2FFFFB38 000000067F0000400200008A590000610000-000000067F0000400200008A590000614000__00000073AD3FE6B8 000000067F0000400200008A590000610000-000000067F0000400200008A590000614000__000000914E3F38F0 000000067F0000400200008A590000610000-000000067F0000400200008A590000614000__000000931B33AE68 000000067F0000400200008A590000610000-000000067F0000400200008A590000614000__000000931B9AFDF8 000000067F0000400200008A590000611840-000000067F0000400200008A59000061A226__0000000AEBD5F889-0000000B8B85DC91 000000067F0000400200008A590000614000-000000067F0000400200008A590000618000__0000001C725A2400 000000067F0000400200008A590000614000-000000067F0000400200008A590000618000__0000001C760FA190 
000000067F0000400200008A590000614000-000000067F0000400200008A590000618000__00000038E67ABFA0 000000067F0000400200008A590000614000-000000067F0000400200008A590000618000__0000003903F1CFE8 000000067F0000400200008A590000614000-000000067F0000400200008A590000618000__0000003B99F7F8A0 000000067F0000400200008A590000614000-000000067F0000400200008A590000618000__0000005D2FFFFB38 000000067F0000400200008A590000614000-000000067F0000400200008A590000618000__00000073AD3FE6B8 000000067F0000400200008A590000614000-000000067F0000400200008A590000618000__000000914E3F38F0 000000067F0000400200008A590000614000-000000067F0000400200008A590000618000__000000931B33AE68 000000067F0000400200008A590000614000-000000067F0000400200008A590000618000__000000931B9AFDF8 000000067F0000400200008A590000618000-000000067F0000400200008A59000061C000__0000001C725A2400 000000067F0000400200008A590000618000-000000067F0000400200008A59000061C000__0000001C760FA190 000000067F0000400200008A590000618000-000000067F0000400200008A59000061C000__00000038E67ABFA0 000000067F0000400200008A590000618000-000000067F0000400200008A59000061C000__0000003903F1CFE8 000000067F0000400200008A590000618000-000000067F0000400200008A59000061C000__0000003B99F7F8A0 000000067F0000400200008A590000618000-000000067F0000400200008A59000061C000__0000005D2FFFFB38 000000067F0000400200008A590000618000-000000067F0000400200008A59000061C000__00000073AD3FE6B8 000000067F0000400200008A590000618000-000000067F0000400200008A59000061C000__000000914E3F38F0 000000067F0000400200008A590000618000-000000067F0000400200008A59000061C000__000000931B33AE68 000000067F0000400200008A590000618000-000000067F0000400200008A59000061C000__000000931B9AFDF8 000000067F0000400200008A59000061A226-000000067F0000400200008A590000622C03__0000000AEBD5F889-0000000B8B85DC91 000000067F0000400200008A59000061C000-000000067F0000400200008A590000620000__0000001C725A2400 000000067F0000400200008A59000061C000-000000067F0000400200008A590000620000__0000001C760FA190 
000000067F0000400200008A59000061C000-000000067F0000400200008A590000620000__00000038E67ABFA0 000000067F0000400200008A59000061C000-000000067F0000400200008A590000620000__0000003903F1CFE8 000000067F0000400200008A59000061C000-000000067F0000400200008A590000620000__0000003B99F7F8A0 000000067F0000400200008A59000061C000-000000067F0000400200008A590000620000__0000005D2FFFFB38 000000067F0000400200008A59000061C000-000000067F0000400200008A590000620000__00000073AD3FE6B8 000000067F0000400200008A59000061C000-000000067F0000400200008A590000620000__000000914E3F38F0 000000067F0000400200008A59000061C000-000000067F0000400200008A590000620000__000000931B33AE68 000000067F0000400200008A59000061C000-000000067F0000400200008A590000620000__000000931B9AFDF8 000000067F0000400200008A590000620000-000000067F0000400200008A590000624000__0000001C725A2400 000000067F0000400200008A590000620000-000000067F0000400200008A590000624000__0000001C760FA190 000000067F0000400200008A590000620000-000000067F0000400200008A590000624000__00000038E67ABFA0 000000067F0000400200008A590000620000-000000067F0000400200008A590000624000__0000003903F1CFE8 000000067F0000400200008A590000620000-000000067F0000400200008A590000624000__0000003B99F7F8A0 000000067F0000400200008A590000620000-000000067F0000400200008A590000624000__0000005D2FFFFB38 000000067F0000400200008A590000620000-000000067F0000400200008A590000624000__00000073AD3FE6B8 000000067F0000400200008A590000620000-000000067F0000400200008A590000624000__000000914E3F38F0 000000067F0000400200008A590000620000-000000067F0000400200008A590000624000__000000931B33AE68 000000067F0000400200008A590000620000-000000067F0000400200008A590000624000__000000931B9AFDF8 000000067F0000400200008A590000622C03-000000067F0000400200008A59000062B5D9__0000000AEBD5F889-0000000B8B85DC91 000000067F0000400200008A590000624000-000000067F0000400200008A590000628000__0000001C725A2400 000000067F0000400200008A590000624000-000000067F0000400200008A590000628000__0000001C760FA190 
000000067F0000400200008A590000624000-000000067F0000400200008A590000628000__00000038E67ABFA0 000000067F0000400200008A590000624000-000000067F0000400200008A590000628000__0000003903F1CFE8 000000067F0000400200008A590000624000-000000067F0000400200008A590000628000__0000003B99F7F8A0 000000067F0000400200008A590000624000-000000067F0000400200008A590000628000__0000005D2FFFFB38 000000067F0000400200008A590000624000-000000067F0000400200008A590000628000__00000073AD3FE6B8 000000067F0000400200008A590000624000-000000067F0000400200008A590000628000__000000914E3F38F0 000000067F0000400200008A590000624000-000000067F0000400200008A590000628000__000000931B33AE68 000000067F0000400200008A590000624000-000000067F0000400200008A590000628000__000000931B9AFDF8 000000067F0000400200008A590000628000-000000067F0000400200008A59000062C000__0000001C725A2400 000000067F0000400200008A590000628000-000000067F0000400200008A59000062C000__0000001C760FA190 000000067F0000400200008A590000628000-000000067F0000400200008A59000062C000__00000038E67ABFA0 000000067F0000400200008A590000628000-000000067F0000400200008A59000062C000__0000003903F1CFE8 000000067F0000400200008A590000628000-000000067F0000400200008A59000062C000__0000003B99F7F8A0 000000067F0000400200008A590000628000-000000067F0000400200008A59000062C000__0000005D2FFFFB38 000000067F0000400200008A590000628000-000000067F0000400200008A59000062C000__00000073AD3FE6B8 000000067F0000400200008A590000628000-000000067F0000400200008A59000062C000__000000914E3F38F0 000000067F0000400200008A590000628000-000000067F0000400200008A59000062C000__000000931B33AE68 000000067F0000400200008A590000628000-000000067F0000400200008A59000062C000__000000931B9AFDF8 000000067F0000400200008A59000062B5D9-000000067F0000400200008A590000633FB7__0000000AEBD5F889-0000000B8B85DC91 000000067F0000400200008A59000062C000-000000067F0000400200008A590000630000__0000001C725A2400 000000067F0000400200008A59000062C000-000000067F0000400200008A590000630000__0000001C760FA190 
000000067F0000400200008A59000062C000-000000067F0000400200008A590000630000__00000038E67ABFA0 000000067F0000400200008A59000062C000-000000067F0000400200008A590000630000__0000003903F1CFE8 000000067F0000400200008A59000062C000-000000067F0000400200008A590000630000__0000003B99F7F8A0 000000067F0000400200008A59000062C000-000000067F0000400200008A590000630000__0000005D2FFFFB38 000000067F0000400200008A59000062C000-000000067F0000400200008A590000630000__00000073AD3FE6B8 000000067F0000400200008A59000062C000-000000067F0000400200008A590000630000__000000914E3F38F0 000000067F0000400200008A59000062C000-000000067F0000400200008A590000630000__000000931B33AE68 000000067F0000400200008A59000062C000-000000067F0000400200008A590000630000__000000931B9AFDF8 000000067F0000400200008A590000630000-000000067F0000400200008A590000634000__0000001C725A2400 000000067F0000400200008A590000630000-000000067F0000400200008A590000634000__0000001C760FA190 000000067F0000400200008A590000630000-000000067F0000400200008A590000634000__00000038E67ABFA0 000000067F0000400200008A590000630000-000000067F0000400200008A590000634000__0000003903F1CFE8 000000067F0000400200008A590000630000-000000067F0000400200008A590000634000__0000003B99F7F8A0 000000067F0000400200008A590000630000-000000067F0000400200008A590000634000__0000005D2FFFFB38 000000067F0000400200008A590000630000-000000067F0000400200008A590000634000__00000073AD3FE6B8 000000067F0000400200008A590000630000-000000067F0000400200008A590000634000__000000914E3F38F0 000000067F0000400200008A590000630000-000000067F0000400200008A590000634000__000000931B33AE68 000000067F0000400200008A590000630000-000000067F0000400200008A590000634000__000000931B9AFDF8 000000067F0000400200008A590000633FB7-000000067F0000400200008A59000063C989__0000000AEBD5F889-0000000B8B85DC91 000000067F0000400200008A590000634000-000000067F0000400200008A590000638000__0000001C725A2400 000000067F0000400200008A590000634000-000000067F0000400200008A590000638000__0000001C760FA190 
000000067F0000400200008A590000634000-000000067F0000400200008A590000638000__00000038E67ABFA0 000000067F0000400200008A590000634000-000000067F0000400200008A590000638000__0000003903F1CFE8 000000067F0000400200008A590000634000-000000067F0000400200008A590000638000__0000003B99F7F8A0 000000067F0000400200008A590000634000-000000067F0000400200008A590000638000__0000005D2FFFFB38 000000067F0000400200008A590000634000-000000067F0000400200008A590000638000__00000073AD3FE6B8 000000067F0000400200008A590000634000-000000067F0000400200008A590000638000__000000914E3F38F0 000000067F0000400200008A590000634000-000000067F0000400200008A590000638000__000000931B33AE68 000000067F0000400200008A590000634000-000000067F0000400200008A590000638000__000000931B9AFDF8 000000067F0000400200008A590000638000-000000067F0000400200008A59000063C000__0000001C725A2400 000000067F0000400200008A590000638000-000000067F0000400200008A59000063C000__0000001C760FA190 000000067F0000400200008A590000638000-000000067F0000400200008A59000063C000__00000038E67ABFA0 000000067F0000400200008A590000638000-000000067F0000400200008A59000063C000__0000003903F1CFE8 000000067F0000400200008A590000638000-000000067F0000400200008A59000063C000__0000003B99F7F8A0 000000067F0000400200008A590000638000-000000067F0000400200008A59000063C000__0000005D2FFFFB38 000000067F0000400200008A590000638000-000000067F0000400200008A59000063C000__00000073AD3FE6B8 000000067F0000400200008A590000638000-000000067F0000400200008A59000063C000__000000914E3F38F0 000000067F0000400200008A590000638000-000000067F0000400200008A59000063C000__000000931B33AE68 000000067F0000400200008A590000638000-000000067F0000400200008A59000063C000__000000931B9AFDF8 000000067F0000400200008A59000063C000-000000067F0000400200008A590000640000__0000000C539FF890 000000067F0000400200008A59000063C000-000000067F0000400200008A590000640000__0000001C760FA190 000000067F0000400200008A59000063C000-000000067F0000400200008A590000640000__00000038E67ABFA0 
000000067F0000400200008A59000063C000-000000067F0000400200008A590000640000__0000003903F1CFE8 000000067F0000400200008A59000063C000-000000067F0000400200008A590000640000__0000003B99F7F8A0 000000067F0000400200008A59000063C000-000000067F0000400200008A590000640000__0000005D2FFFFB38 000000067F0000400200008A59000063C000-000000067F0000400200008A590000640000__00000073AD3FE6B8 000000067F0000400200008A59000063C000-000000067F0000400200008A590000640000__000000914E3F38F0 000000067F0000400200008A59000063C000-000000067F0000400200008A590000640000__000000931B33AE68 000000067F0000400200008A59000063C000-000000067F0000400200008A590000640000__000000931B9AFDF8 000000067F0000400200008A59000063C989-000000067F0000400200008A590100000000__0000000AEBD5F889-0000000B8B85DC91 000000067F0000400200008A59000063CC6C-000000067F0000400200008A590000645631__0000000B8B85DC91-0000000C3B2DF409 000000067F0000400200008A590000640000-000000067F0000400200008A590000644000__0000000C539FF890 000000067F0000400200008A590000640000-000000067F0000400200008A590000644000__0000001C760FA190 000000067F0000400200008A590000640000-000000067F0000400200008A590000644000__00000038E67ABFA0 000000067F0000400200008A590000640000-000000067F0000400200008A590000644000__0000003903F1CFE8 000000067F0000400200008A590000640000-000000067F0000400200008A590000644000__0000003B99F7F8A0 000000067F0000400200008A590000640000-000000067F0000400200008A590000644000__0000005D2FFFFB38 000000067F0000400200008A590000640000-000000067F0000400200008A590000644000__00000073AD3FE6B8 000000067F0000400200008A590000640000-000000067F0000400200008A590000644000__000000914E3F38F0 000000067F0000400200008A590000640000-000000067F0000400200008A590000644000__000000931B33AE68 000000067F0000400200008A590000640000-000000067F0000400200008A590000644000__000000931B9AFDF8 000000067F0000400200008A590000644000-000000067F0000400200008A590000648000__0000000C539FF890 000000067F0000400200008A590000644000-000000067F0000400200008A590000648000__0000001C760FA190 
000000067F0000400200008A590000644000-000000067F0000400200008A590000648000__00000038E67ABFA0 000000067F0000400200008A590000644000-000000067F0000400200008A590000648000__0000003903F1CFE8 000000067F0000400200008A590000644000-000000067F0000400200008A590000648000__0000003B99F7F8A0 000000067F0000400200008A590000644000-000000067F0000400200008A590000648000__0000005D2FFFFB38 000000067F0000400200008A590000644000-000000067F0000400200008A590000648000__00000073AD3FE6B8 000000067F0000400200008A590000644000-000000067F0000400200008A590000648000__000000914E3F38F0 000000067F0000400200008A590000644000-000000067F0000400200008A590000648000__000000931B33AE68 000000067F0000400200008A590000644000-000000067F0000400200008A590000648000__000000931B9AFDF8 000000067F0000400200008A590000645631-000000067F0000400200008A59000064E015__0000000B8B85DC91-0000000C3B2DF409 000000067F0000400200008A590000648000-000000067F0000400200008A59000064C000__0000000C539FF890 000000067F0000400200008A590000648000-000000067F0000400200008A59000064C000__0000001C760FA190 000000067F0000400200008A590000648000-000000067F0000400200008A59000064C000__00000038E67ABFA0 000000067F0000400200008A590000648000-000000067F0000400200008A59000064C000__0000003903F1CFE8 000000067F0000400200008A590000648000-000000067F0000400200008A59000064C000__0000003B99F7F8A0 000000067F0000400200008A590000648000-000000067F0000400200008A59000064C000__0000005D2FFFFB38 000000067F0000400200008A590000648000-000000067F0000400200008A59000064C000__00000073AD3FE6B8 000000067F0000400200008A590000648000-000000067F0000400200008A59000064C000__000000914E3F38F0 000000067F0000400200008A590000648000-000000067F0000400200008A59000064C000__000000931B33AE68 000000067F0000400200008A590000648000-000000067F0000400200008A59000064C000__000000931B9AFDF8 000000067F0000400200008A59000064C000-000000067F0000400200008A590000650000__0000000C539FF890 000000067F0000400200008A59000064C000-000000067F0000400200008A590000650000__0000001C760FA190 
000000067F0000400200008A59000064C000-000000067F0000400200008A590000650000__00000038E67ABFA0 000000067F0000400200008A59000064C000-000000067F0000400200008A590000650000__0000003903F1CFE8 000000067F0000400200008A59000064C000-000000067F0000400200008A590000650000__0000003B99F7F8A0 000000067F0000400200008A59000064C000-000000067F0000400200008A590000650000__0000005D2FFFFB38 000000067F0000400200008A59000064C000-000000067F0000400200008A590000650000__00000073AD3FE6B8 000000067F0000400200008A59000064C000-000000067F0000400200008A590000650000__000000914E3F38F0 000000067F0000400200008A59000064C000-000000067F0000400200008A590000650000__000000931B33AE68 000000067F0000400200008A59000064C000-000000067F0000400200008A590000650000__000000931B9AFDF8 000000067F0000400200008A59000064E015-000000067F0000400200008A5900006569FE__0000000B8B85DC91-0000000C3B2DF409 000000067F0000400200008A590000650000-000000067F0000400200008A590000654000__0000000C539FF890 000000067F0000400200008A590000650000-000000067F0000400200008A590000654000__0000001C760FA190 000000067F0000400200008A590000650000-000000067F0000400200008A590000654000__00000038E67ABFA0 000000067F0000400200008A590000650000-000000067F0000400200008A590000654000__0000003903F1CFE8 000000067F0000400200008A590000650000-000000067F0000400200008A590000654000__0000003B99F7F8A0 000000067F0000400200008A590000650000-000000067F0000400200008A590000654000__0000005D2FFFFB38 000000067F0000400200008A590000650000-000000067F0000400200008A590000654000__00000073AD3FE6B8 000000067F0000400200008A590000650000-000000067F0000400200008A590000654000__000000914E3F38F0 000000067F0000400200008A590000650000-000000067F0000400200008A590000654000__000000931B33AE68 000000067F0000400200008A590000650000-000000067F0000400200008A590000654000__000000931B9AFDF8 000000067F0000400200008A590000654000-000000067F0000400200008A590000658000__0000000C539FF890 000000067F0000400200008A590000654000-000000067F0000400200008A590000658000__0000001C760FA190 
000000067F0000400200008A590000654000-000000067F0000400200008A590000658000__00000038E67ABFA0 000000067F0000400200008A590000654000-000000067F0000400200008A590000658000__0000003903F1CFE8 000000067F0000400200008A590000654000-000000067F0000400200008A590000658000__0000003B99F7F8A0 000000067F0000400200008A590000654000-000000067F0000400200008A590000658000__0000005D2FFFFB38 000000067F0000400200008A590000654000-000000067F0000400200008A590000658000__00000073AD3FE6B8 000000067F0000400200008A590000654000-000000067F0000400200008A590000658000__000000914E3F38F0 000000067F0000400200008A590000654000-000000067F0000400200008A590000658000__000000931B33AE68 000000067F0000400200008A590000654000-000000067F0000400200008A590000658000__000000931B9AFDF8 000000067F0000400200008A5900006569FE-000000067F0000400200008A59000065F3ED__0000000B8B85DC91-0000000C3B2DF409 000000067F0000400200008A590000658000-000000067F0000400200008A59000065C000__0000000C539FF890 000000067F0000400200008A590000658000-000000067F0000400200008A59000065C000__0000001C760FA190 000000067F0000400200008A590000658000-000000067F0000400200008A59000065C000__00000038E67ABFA0 000000067F0000400200008A590000658000-000000067F0000400200008A59000065C000__0000003903F1CFE8 000000067F0000400200008A590000658000-000000067F0000400200008A59000065C000__0000003B99F7F8A0 000000067F0000400200008A590000658000-000000067F0000400200008A59000065C000__0000005D2FFFFB38 000000067F0000400200008A590000658000-000000067F0000400200008A59000065C000__00000073AD3FE6B8 000000067F0000400200008A590000658000-000000067F0000400200008A59000065C000__000000914E3F38F0 000000067F0000400200008A590000658000-000000067F0000400200008A59000065C000__000000931B33AE68 000000067F0000400200008A590000658000-000000067F0000400200008A59000065C000__000000931B9AFDF8 000000067F0000400200008A59000065C000-000000067F0000400200008A590000660000__0000000C539FF890 000000067F0000400200008A59000065C000-000000067F0000400200008A590000660000__0000001C760FA190 
000000067F0000400200008A59000065C000-000000067F0000400200008A590000660000__00000038E67ABFA0 000000067F0000400200008A59000065C000-000000067F0000400200008A590000660000__0000003903F1CFE8 000000067F0000400200008A59000065C000-000000067F0000400200008A590000660000__0000003B99F7F8A0 000000067F0000400200008A59000065C000-000000067F0000400200008A590000660000__0000005D2FFFFB38 000000067F0000400200008A59000065C000-000000067F0000400200008A590000660000__00000073AD3FE6B8 000000067F0000400200008A59000065C000-000000067F0000400200008A590000660000__000000914E3F38F0 000000067F0000400200008A59000065C000-000000067F0000400200008A590000660000__000000931B33AE68 000000067F0000400200008A59000065C000-000000067F0000400200008A590000660000__000000931B9AFDF8 000000067F0000400200008A59000065F3ED-000000067F0000400200008A590000667DD3__0000000B8B85DC91-0000000C3B2DF409 000000067F0000400200008A590000660000-000000067F0000400200008A590000664000__0000000C539FF890 000000067F0000400200008A590000660000-000000067F0000400200008A590000664000__0000001C760FA190 000000067F0000400200008A590000660000-000000067F0000400200008A590000664000__00000038E67ABFA0 000000067F0000400200008A590000660000-000000067F0000400200008A590000664000__0000003903F1CFE8 000000067F0000400200008A590000660000-000000067F0000400200008A590000664000__0000003B99F7F8A0 000000067F0000400200008A590000660000-000000067F0000400200008A590000664000__0000005D2FFFFB38 000000067F0000400200008A590000660000-000000067F0000400200008A590000664000__00000073AD3FE6B8 000000067F0000400200008A590000660000-000000067F0000400200008A590000664000__000000914E3F38F0 000000067F0000400200008A590000660000-000000067F0000400200008A590000664000__000000931B33AE68 000000067F0000400200008A590000660000-000000067F0000400200008A590000664000__000000931B9AFDF8 000000067F0000400200008A590000664000-000000067F0000400200008A590000668000__0000000C539FF890 000000067F0000400200008A590000664000-000000067F0000400200008A590000668000__0000001C760FA190 
000000067F0000400200008A590000664000-000000067F0000400200008A590000668000__00000038E67ABFA0 000000067F0000400200008A590000664000-000000067F0000400200008A590000668000__0000003903F1CFE8 000000067F0000400200008A590000664000-000000067F0000400200008A590000668000__0000003B99F7F8A0 000000067F0000400200008A590000664000-000000067F0000400200008A590000668000__0000005D2FFFFB38 000000067F0000400200008A590000664000-000000067F0000400200008A590000668000__00000073AD3FE6B8 000000067F0000400200008A590000664000-000000067F0000400200008A590000668000__000000914E3F38F0 000000067F0000400200008A590000664000-000000067F0000400200008A590000668000__000000931B33AE68 000000067F0000400200008A590000664000-000000067F0000400200008A590000668000__000000931B9AFDF8 000000067F0000400200008A590000667DD3-000000067F0000400200008A5900006707A7__0000000B8B85DC91-0000000C3B2DF409 000000067F0000400200008A590000668000-000000067F0000400200008A59000066C000__0000000C539FF890 000000067F0000400200008A590000668000-000000067F0000400200008A59000066C000__0000001C760FA190 000000067F0000400200008A590000668000-000000067F0000400200008A59000066C000__00000038E67ABFA0 000000067F0000400200008A590000668000-000000067F0000400200008A59000066C000__0000003903F1CFE8 000000067F0000400200008A590000668000-000000067F0000400200008A59000066C000__0000003B99F7F8A0 000000067F0000400200008A590000668000-000000067F0000400200008A59000066C000__0000005D2FFFFB38 000000067F0000400200008A590000668000-000000067F0000400200008A59000066C000__00000073AD3FE6B8 000000067F0000400200008A590000668000-000000067F0000400200008A59000066C000__000000914E3F38F0 000000067F0000400200008A590000668000-000000067F0000400200008A59000066C000__000000931B33AE68 000000067F0000400200008A590000668000-000000067F0000400200008A59000066C000__000000931B9AFDF8 000000067F0000400200008A59000066C000-000000067F0000400200008A590000670000__0000000C539FF890 000000067F0000400200008A59000066C000-000000067F0000400200008A590000670000__0000001C760FA190 
000000067F0000400200008A59000066C000-000000067F0000400200008A590000670000__00000038E67ABFA0 000000067F0000400200008A59000066C000-000000067F0000400200008A590000670000__0000003903F1CFE8 000000067F0000400200008A59000066C000-000000067F0000400200008A590000670000__0000003B99F7F8A0 000000067F0000400200008A59000066C000-000000067F0000400200008A590000670000__0000005D2FFFFB38 000000067F0000400200008A59000066C000-000000067F0000400200008A590000670000__00000073AD3FE6B8 000000067F0000400200008A59000066C000-000000067F0000400200008A590000670000__000000914E3F38F0 000000067F0000400200008A59000066C000-000000067F0000400200008A590000670000__000000931B33AE68 000000067F0000400200008A59000066C000-000000067F0000400200008A590000670000__000000931B9AFDF8 000000067F0000400200008A590000670000-000000067F0000400200008A590000674000__0000000C539FF890 000000067F0000400200008A590000670000-000000067F0000400200008A590000674000__0000001C760FA190 000000067F0000400200008A590000670000-000000067F0000400200008A590000674000__00000038E67ABFA0 000000067F0000400200008A590000670000-000000067F0000400200008A590000674000__0000003903F1CFE8 000000067F0000400200008A590000670000-000000067F0000400200008A590000674000__0000003B99F7F8A0 000000067F0000400200008A590000670000-000000067F0000400200008A590000674000__0000005D2FFFFB38 000000067F0000400200008A590000670000-000000067F0000400200008A590000674000__00000073AD3FE6B8 000000067F0000400200008A590000670000-000000067F0000400200008A590000674000__000000914E3F38F0 000000067F0000400200008A590000670000-000000067F0000400200008A590000674000__000000931B33AE68 000000067F0000400200008A590000670000-000000067F0000400200008A590000674000__000000931B9AFDF8 000000067F0000400200008A5900006707A7-000000067F0000400200008A59000067917A__0000000B8B85DC91-0000000C3B2DF409 000000067F0000400200008A590000674000-000000067F0000400200008A590000678000__0000000C539FF890 000000067F0000400200008A590000674000-000000067F0000400200008A590000678000__0000001C760FA190 
000000067F0000400200008A590000674000-000000067F0000400200008A590000678000__00000038E67ABFA0 000000067F0000400200008A590000674000-000000067F0000400200008A590000678000__0000003903F1CFE8 000000067F0000400200008A590000674000-000000067F0000400200008A590000678000__0000003B99F7F8A0 000000067F0000400200008A590000674000-000000067F0000400200008A590000678000__0000005D2FFFFB38 000000067F0000400200008A590000674000-000000067F0000400200008A590000678000__00000073AD3FE6B8 000000067F0000400200008A590000674000-000000067F0000400200008A590000678000__000000914E3F38F0 000000067F0000400200008A590000674000-000000067F0000400200008A590000678000__000000931B33AE68 000000067F0000400200008A590000674000-000000067F0000400200008A590000678000__000000931B9AFDF8 000000067F0000400200008A590000678000-000000067F0000400200008A59000067C000__0000000C539FF890 000000067F0000400200008A590000678000-000000067F0000400200008A59000067C000__0000001C760FA190 000000067F0000400200008A590000678000-000000067F0000400200008A59000067C000__00000038E67ABFA0 000000067F0000400200008A590000678000-000000067F0000400200008A59000067C000__0000003903F1CFE8 000000067F0000400200008A590000678000-000000067F0000400200008A59000067C000__0000003B99F7F8A0 000000067F0000400200008A590000678000-000000067F0000400200008A59000067C000__0000005D2FFFFB38 000000067F0000400200008A590000678000-000000067F0000400200008A59000067C000__00000073AD3FE6B8 000000067F0000400200008A590000678000-000000067F0000400200008A59000067C000__000000914E3F38F0 000000067F0000400200008A590000678000-000000067F0000400200008A59000067C000__000000931B33AE68 000000067F0000400200008A590000678000-000000067F0000400200008A59000067C000__000000931B9AFDF8 000000067F0000400200008A59000067917A-000000067F0000400200008A590000681B34__0000000B8B85DC91-0000000C3B2DF409 000000067F0000400200008A59000067C000-000000067F0000400200008A590000680000__0000000C539FF890 000000067F0000400200008A59000067C000-000000067F0000400200008A590000680000__0000001C760FA190 
000000067F0000400200008A59000067C000-000000067F0000400200008A590000680000__00000038E67ABFA0 000000067F0000400200008A59000067C000-000000067F0000400200008A590000680000__0000003903F1CFE8 000000067F0000400200008A59000067C000-000000067F0000400200008A590000680000__0000003B99F7F8A0 000000067F0000400200008A59000067C000-000000067F0000400200008A590000680000__0000005D2FFFFB38 000000067F0000400200008A59000067C000-000000067F0000400200008A590000680000__00000073AD3FE6B8 000000067F0000400200008A59000067C000-000000067F0000400200008A590000680000__000000914E3F38F0 000000067F0000400200008A59000067C000-000000067F0000400200008A590000680000__000000931B33AE68 000000067F0000400200008A59000067C000-000000067F0000400200008A590000680000__000000931B9AFDF8 000000067F0000400200008A590000680000-000000067F0000400200008A590000684000__0000000C539FF890 000000067F0000400200008A590000680000-000000067F0000400200008A590000684000__0000001C760FA190 000000067F0000400200008A590000680000-000000067F0000400200008A590000684000__00000038E67ABFA0 000000067F0000400200008A590000680000-000000067F0000400200008A590000684000__0000003903F1CFE8 000000067F0000400200008A590000680000-000000067F0000400200008A590000684000__0000003B99F7F8A0 000000067F0000400200008A590000680000-000000067F0000400200008A590000684000__0000005D2FFFFB38 000000067F0000400200008A590000680000-000000067F0000400200008A590000684000__00000073AD3FE6B8 000000067F0000400200008A590000680000-000000067F0000400200008A590000684000__000000914E3F38F0 000000067F0000400200008A590000680000-000000067F0000400200008A590000684000__000000931B33AE68 000000067F0000400200008A590000680000-000000067F0000400200008A590000684000__000000931B9AFDF8 000000067F0000400200008A590000681B34-000000067F0000400200008A59000068A51E__0000000B8B85DC91-0000000C3B2DF409 000000067F0000400200008A590000684000-000000067F0000400200008A590000688000__0000000C539FF890 000000067F0000400200008A590000684000-000000067F0000400200008A590000688000__0000001C760FA190 
000000067F0000400200008A590000684000-000000067F0000400200008A590000688000__00000038E67ABFA0 000000067F0000400200008A590000684000-000000067F0000400200008A590000688000__0000003903F1CFE8 000000067F0000400200008A590000684000-000000067F0000400200008A590000688000__0000003B99F7F8A0 000000067F0000400200008A590000684000-000000067F0000400200008A590000688000__0000005D2FFFFB38 000000067F0000400200008A590000684000-000000067F0000400200008A590000688000__00000073AD3FE6B8 000000067F0000400200008A590000684000-000000067F0000400200008A590000688000__000000914E3F38F0 000000067F0000400200008A590000684000-000000067F0000400200008A590000688000__000000931B33AE68 000000067F0000400200008A590000684000-000000067F0000400200008A590000688000__000000931B9AFDF8 000000067F0000400200008A590000688000-000000067F0000400200008A59000068C000__0000000C539FF890 000000067F0000400200008A590000688000-000000067F0000400200008A59000068C000__0000001C760FA190 000000067F0000400200008A590000688000-000000067F0000400200008A59000068C000__00000038E67ABFA0 000000067F0000400200008A590000688000-000000067F0000400200008A59000068C000__0000003903F1CFE8 000000067F0000400200008A590000688000-000000067F0000400200008A59000068C000__0000003B99F7F8A0 000000067F0000400200008A590000688000-000000067F0000400200008A59000068C000__0000005D2FFFFB38 000000067F0000400200008A590000688000-000000067F0000400200008A59000068C000__00000073AD3FE6B8 000000067F0000400200008A590000688000-000000067F0000400200008A59000068C000__000000914E3F38F0 000000067F0000400200008A590000688000-000000067F0000400200008A59000068C000__000000931B33AE68 000000067F0000400200008A590000688000-000000067F0000400200008A59000068C000__000000931B9AFDF8 000000067F0000400200008A59000068A51E-000000067F0000400200008A590000692F04__0000000B8B85DC91-0000000C3B2DF409 000000067F0000400200008A59000068C000-000000067F0000400200008A590000690000__0000000C539FF890 000000067F0000400200008A59000068C000-000000067F0000400200008A590000690000__0000001C760FA190 
000000067F0000400200008A59000068C000-000000067F0000400200008A590000690000__00000038E67ABFA0 000000067F0000400200008A59000068C000-000000067F0000400200008A590000690000__0000003903F1CFE8 000000067F0000400200008A59000068C000-000000067F0000400200008A590000690000__0000003B99F7F8A0 000000067F0000400200008A59000068C000-000000067F0000400200008A590000690000__0000005D2FFFFB38 000000067F0000400200008A59000068C000-000000067F0000400200008A590000690000__00000073AD3FE6B8 000000067F0000400200008A59000068C000-000000067F0000400200008A590000690000__000000914E3F38F0 000000067F0000400200008A59000068C000-000000067F0000400200008A590000690000__000000931B33AE68 000000067F0000400200008A59000068C000-000000067F0000400200008A590000690000__000000931B9AFDF8 000000067F0000400200008A590000690000-000000067F0000400200008A590000694000__0000000C539FF890 000000067F0000400200008A590000690000-000000067F0000400200008A590000694000__0000001C760FA190 000000067F0000400200008A590000690000-000000067F0000400200008A590000694000__00000038E67ABFA0 000000067F0000400200008A590000690000-000000067F0000400200008A590000694000__0000003903F1CFE8 000000067F0000400200008A590000690000-000000067F0000400200008A590000694000__0000003B99F7F8A0 000000067F0000400200008A590000690000-000000067F0000400200008A590000694000__0000005D2FFFFB38 000000067F0000400200008A590000690000-000000067F0000400200008A590000694000__00000073AD3FE6B8 000000067F0000400200008A590000690000-000000067F0000400200008A590000694000__000000914E3F38F0 000000067F0000400200008A590000690000-000000067F0000400200008A590000694000__000000931B33AE68 000000067F0000400200008A590000690000-000000067F0000400200008A590000694000__000000931B9AFDF8 000000067F0000400200008A590000692F04-000000067F0000400200008A59000069B8DE__0000000B8B85DC91-0000000C3B2DF409 000000067F0000400200008A590000694000-000000067F0000400200008A590000698000__0000000C539FF890 000000067F0000400200008A590000694000-000000067F0000400200008A590000698000__0000001C760FA190 
000000067F0000400200008A590000694000-000000067F0000400200008A590000698000__00000038E67ABFA0 000000067F0000400200008A590000694000-000000067F0000400200008A590000698000__0000003903F1CFE8 000000067F0000400200008A590000694000-000000067F0000400200008A590000698000__0000003B99F7F8A0 000000067F0000400200008A590000694000-000000067F0000400200008A590000698000__0000005D2FFFFB38 000000067F0000400200008A590000694000-000000067F0000400200008A590000698000__00000073AD3FE6B8 000000067F0000400200008A590000694000-000000067F0000400200008A590000698000__000000914E3F38F0 000000067F0000400200008A590000694000-000000067F0000400200008A590000698000__000000931B33AE68 000000067F0000400200008A590000694000-000000067F0000400200008A590000698000__000000931B9AFDF8 000000067F0000400200008A590000698000-000000067F0000400200008A59000069C000__0000000C539FF890 000000067F0000400200008A590000698000-000000067F0000400200008A59000069C000__0000001C760FA190 000000067F0000400200008A590000698000-000000067F0000400200008A59000069C000__00000038E67ABFA0 000000067F0000400200008A590000698000-000000067F0000400200008A59000069C000__0000003903F1CFE8 000000067F0000400200008A590000698000-000000067F0000400200008A59000069C000__0000003B99F7F8A0 000000067F0000400200008A590000698000-000000067F0000400200008A59000069C000__0000005D2FFFFB38 000000067F0000400200008A590000698000-000000067F0000400200008A59000069C000__00000073AD3FE6B8 000000067F0000400200008A590000698000-000000067F0000400200008A59000069C000__000000914E3F38F0 000000067F0000400200008A590000698000-000000067F0000400200008A59000069C000__000000931B33AE68 000000067F0000400200008A590000698000-000000067F0000400200008A59000069C000__000000931B9AFDF8 000000067F0000400200008A59000069B8DE-000000067F0000400200008A590100000000__0000000B8B85DC91-0000000C3B2DF409 000000067F0000400200008A59000069BC09-000000067F0000400200008A5900006A45D6__0000000C3B2DF409-0000000CDADDDFC9 000000067F0000400200008A59000069C000-000000067F0000400200008A5900006A0000__0000000C539FF890 
000000067F0000400200008A59000069C000-000000067F0000400200008A5900006A0000__0000001C760FA190 000000067F0000400200008A59000069C000-000000067F0000400200008A5900006A0000__00000038E67ABFA0 000000067F0000400200008A59000069C000-000000067F0000400200008A5900006A0000__0000003903F1CFE8 000000067F0000400200008A59000069C000-000000067F0000400200008A5900006A0000__0000003B99F7F8A0 000000067F0000400200008A59000069C000-000000067F0000400200008A5900006A0000__0000005D2FFFFB38 000000067F0000400200008A59000069C000-000000067F0000400200008A5900006A0000__00000073AD3FE6B8 000000067F0000400200008A59000069C000-000000067F0000400200008A5900006A0000__000000914E3F38F0 000000067F0000400200008A59000069C000-000000067F0000400200008A5900006A0000__000000931B33AE68 000000067F0000400200008A59000069C000-000000067F0000400200008A5900006A0000__000000931B9AFDF8 000000067F0000400200008A5900006A0000-000000067F0000400200008A5900006A4000__0000000C539FF890 000000067F0000400200008A5900006A0000-000000067F0000400200008A5900006A4000__0000001C760FA190 000000067F0000400200008A5900006A0000-000000067F0000400200008A5900006A4000__00000038E67ABFA0 000000067F0000400200008A5900006A0000-000000067F0000400200008A5900006A4000__0000003903F1CFE8 000000067F0000400200008A5900006A0000-000000067F0000400200008A5900006A4000__0000003B99F7F8A0 000000067F0000400200008A5900006A0000-000000067F0000400200008A5900006A4000__0000005D2FFFFB38 000000067F0000400200008A5900006A0000-000000067F0000400200008A5900006A4000__00000073AD3FE6B8 000000067F0000400200008A5900006A0000-000000067F0000400200008A5900006A4000__000000914E3F38F0 000000067F0000400200008A5900006A0000-000000067F0000400200008A5900006A4000__000000931B33AE68 000000067F0000400200008A5900006A0000-000000067F0000400200008A5900006A4000__000000931B9AFDF8 000000067F0000400200008A5900006A4000-000000067F0000400200008A5900006A8000__0000000C539FF890 000000067F0000400200008A5900006A4000-000000067F0000400200008A5900006A8000__0000001C760FA190 
000000067F0000400200008A5900006A4000-000000067F0000400200008A5900006A8000__00000038E67ABFA0 000000067F0000400200008A5900006A4000-000000067F0000400200008A5900006A8000__0000003903F1CFE8 000000067F0000400200008A5900006A4000-000000067F0000400200008A5900006A8000__0000003B99F7F8A0 000000067F0000400200008A5900006A4000-000000067F0000400200008A5900006A8000__0000005D2FFFFB38 000000067F0000400200008A5900006A4000-000000067F0000400200008A5900006A8000__00000073AD3FE6B8 000000067F0000400200008A5900006A4000-000000067F0000400200008A5900006A8000__000000914E3F38F0 000000067F0000400200008A5900006A4000-000000067F0000400200008A5900006A8000__000000931B33AE68 000000067F0000400200008A5900006A4000-000000067F0000400200008A5900006A8000__000000931B9AFDF8 000000067F0000400200008A5900006A45D6-000000067F0000400200008A5900006ACFB4__0000000C3B2DF409-0000000CDADDDFC9 000000067F0000400200008A5900006A8000-000000067F0000400200008A5900006AC000__0000001C760FA190 000000067F0000400200008A5900006A8000-000000067F0000400200008A5900006AC000__00000038E67ABFA0 000000067F0000400200008A5900006A8000-000000067F0000400200008A5900006AC000__0000003903F1CFE8 000000067F0000400200008A5900006A8000-000000067F0000400200008A5900006AC000__0000003B99F7F8A0 000000067F0000400200008A5900006A8000-000000067F0000400200008A5900006AC000__0000005D2FFFFB38 000000067F0000400200008A5900006A8000-000000067F0000400200008A5900006AC000__00000073AD3FE6B8 000000067F0000400200008A5900006A8000-000000067F0000400200008A5900006AC000__000000914E3F38F0 000000067F0000400200008A5900006A8000-000000067F0000400200008A5900006AC000__000000931B33AE68 000000067F0000400200008A5900006A8000-000000067F0000400200008A5900006AC000__000000931B9AFDF8 000000067F0000400200008A5900006A8000-030000000000000000000000000000000002__0000000C539FF890 000000067F0000400200008A5900006AC000-000000067F0000400200008A5900006B0000__0000001C760FA190 000000067F0000400200008A5900006AC000-000000067F0000400200008A5900006B0000__00000038E67ABFA0 
000000067F0000400200008A5900006AC000-000000067F0000400200008A5900006B0000__0000003903F1CFE8 000000067F0000400200008A5900006AC000-000000067F0000400200008A5900006B0000__0000003B99F7F8A0 000000067F0000400200008A5900006AC000-000000067F0000400200008A5900006B0000__0000005D2FFFFB38 000000067F0000400200008A5900006AC000-000000067F0000400200008A5900006B0000__00000073AD3FE6B8 000000067F0000400200008A5900006AC000-000000067F0000400200008A5900006B0000__000000914E3F38F0 000000067F0000400200008A5900006AC000-000000067F0000400200008A5900006B0000__000000931B33AE68 000000067F0000400200008A5900006AC000-000000067F0000400200008A5900006B0000__000000931B9AFDF8 000000067F0000400200008A5900006ACFB4-000000067F0000400200008A5900006B598B__0000000C3B2DF409-0000000CDADDDFC9 000000067F0000400200008A5900006B0000-000000067F0000400200008A5900006B4000__0000001C760FA190 000000067F0000400200008A5900006B0000-000000067F0000400200008A5900006B4000__00000038E67ABFA0 000000067F0000400200008A5900006B0000-000000067F0000400200008A5900006B4000__0000003903F1CFE8 000000067F0000400200008A5900006B0000-000000067F0000400200008A5900006B4000__0000003B99F7F8A0 000000067F0000400200008A5900006B0000-000000067F0000400200008A5900006B4000__0000005D2FFFFB38 000000067F0000400200008A5900006B0000-000000067F0000400200008A5900006B4000__00000073AD3FE6B8 000000067F0000400200008A5900006B0000-000000067F0000400200008A5900006B4000__000000914E3F38F0 000000067F0000400200008A5900006B0000-000000067F0000400200008A5900006B4000__000000931B33AE68 000000067F0000400200008A5900006B0000-000000067F0000400200008A5900006B4000__000000931B9AFDF8 000000067F0000400200008A5900006B4000-000000067F0000400200008A5900006B8000__0000001C760FA190 000000067F0000400200008A5900006B4000-000000067F0000400200008A5900006B8000__00000038E67ABFA0 000000067F0000400200008A5900006B4000-000000067F0000400200008A5900006B8000__0000003903F1CFE8 000000067F0000400200008A5900006B4000-000000067F0000400200008A5900006B8000__0000003B99F7F8A0 
000000067F0000400200008A5900006B4000-000000067F0000400200008A5900006B8000__0000005D2FFFFB38 000000067F0000400200008A5900006B4000-000000067F0000400200008A5900006B8000__00000073AD3FE6B8 000000067F0000400200008A5900006B4000-000000067F0000400200008A5900006B8000__000000914E3F38F0 000000067F0000400200008A5900006B4000-000000067F0000400200008A5900006B8000__000000931B33AE68 000000067F0000400200008A5900006B4000-000000067F0000400200008A5900006B8000__000000931B9AFDF8 000000067F0000400200008A5900006B598B-000000067F0000400200008A5900006BE35A__0000000C3B2DF409-0000000CDADDDFC9 000000067F0000400200008A5900006B8000-000000067F0000400200008A5900006BC000__0000001C760FA190 000000067F0000400200008A5900006B8000-000000067F0000400200008A5900006BC000__00000038E67ABFA0 000000067F0000400200008A5900006B8000-000000067F0000400200008A5900006BC000__0000003903F1CFE8 000000067F0000400200008A5900006B8000-000000067F0000400200008A5900006BC000__0000003B99F7F8A0 000000067F0000400200008A5900006B8000-000000067F0000400200008A5900006BC000__0000005D2FFFFB38 000000067F0000400200008A5900006B8000-000000067F0000400200008A5900006BC000__00000073AD3FE6B8 000000067F0000400200008A5900006B8000-000000067F0000400200008A5900006BC000__000000914E3F38F0 000000067F0000400200008A5900006B8000-000000067F0000400200008A5900006BC000__000000931B33AE68 000000067F0000400200008A5900006B8000-000000067F0000400200008A5900006BC000__000000931B9AFDF8 000000067F0000400200008A5900006BC000-000000067F0000400200008A5900006C0000__0000001C760FA190 000000067F0000400200008A5900006BC000-000000067F0000400200008A5900006C0000__00000038E67ABFA0 000000067F0000400200008A5900006BC000-000000067F0000400200008A5900006C0000__0000003903F1CFE8 000000067F0000400200008A5900006BC000-000000067F0000400200008A5900006C0000__0000003B99F7F8A0 000000067F0000400200008A5900006BC000-000000067F0000400200008A5900006C0000__0000005D2FFFFB38 000000067F0000400200008A5900006BC000-000000067F0000400200008A5900006C0000__00000073AD3FE6B8 
000000067F0000400200008A5900006BC000-000000067F0000400200008A5900006C0000__000000914E3F38F0 000000067F0000400200008A5900006BC000-000000067F0000400200008A5900006C0000__000000931B33AE68 000000067F0000400200008A5900006BC000-000000067F0000400200008A5900006C0000__000000931B9AFDF8 000000067F0000400200008A5900006BE35A-000000067F0000400200008A5900006C6D3C__0000000C3B2DF409-0000000CDADDDFC9 000000067F0000400200008A5900006C0000-000000067F0000400200008A5900006C4000__0000001C760FA190 000000067F0000400200008A5900006C0000-000000067F0000400200008A5900006C4000__00000038E67ABFA0 000000067F0000400200008A5900006C0000-000000067F0000400200008A5900006C4000__0000003903F1CFE8 000000067F0000400200008A5900006C0000-000000067F0000400200008A5900006C4000__0000003B99F7F8A0 000000067F0000400200008A5900006C0000-000000067F0000400200008A5900006C4000__0000005D2FFFFB38 000000067F0000400200008A5900006C0000-000000067F0000400200008A5900006C4000__00000073AD3FE6B8 000000067F0000400200008A5900006C0000-000000067F0000400200008A5900006C4000__000000914E3F38F0 000000067F0000400200008A5900006C0000-000000067F0000400200008A5900006C4000__000000931B33AE68 000000067F0000400200008A5900006C0000-000000067F0000400200008A5900006C4000__000000931B9AFDF8 000000067F0000400200008A5900006C4000-000000067F0000400200008A5900006C8000__0000001C760FA190 000000067F0000400200008A5900006C4000-000000067F0000400200008A5900006C8000__00000038E67ABFA0 000000067F0000400200008A5900006C4000-000000067F0000400200008A5900006C8000__0000003903F1CFE8 000000067F0000400200008A5900006C4000-000000067F0000400200008A5900006C8000__0000003B99F7F8A0 000000067F0000400200008A5900006C4000-000000067F0000400200008A5900006C8000__0000005D2FFFFB38 000000067F0000400200008A5900006C4000-000000067F0000400200008A5900006C8000__00000073AD3FE6B8 000000067F0000400200008A5900006C4000-000000067F0000400200008A5900006C8000__000000914E3F38F0 000000067F0000400200008A5900006C4000-000000067F0000400200008A5900006C8000__000000931B33AE68 
000000067F0000400200008A5900006C4000-000000067F0000400200008A5900006C8000__000000931B9AFDF8 000000067F0000400200008A5900006C6D3C-000000067F0000400200008A5900006CF724__0000000C3B2DF409-0000000CDADDDFC9 000000067F0000400200008A5900006C8000-000000067F0000400200008A5900006CC000__0000001C760FA190 000000067F0000400200008A5900006C8000-000000067F0000400200008A5900006CC000__00000038E67ABFA0 000000067F0000400200008A5900006C8000-000000067F0000400200008A5900006CC000__0000003903F1CFE8 000000067F0000400200008A5900006C8000-000000067F0000400200008A5900006CC000__0000003B99F7F8A0 000000067F0000400200008A5900006C8000-000000067F0000400200008A5900006CC000__0000005D2FFFFB38 000000067F0000400200008A5900006C8000-000000067F0000400200008A5900006CC000__00000073AD3FE6B8 000000067F0000400200008A5900006C8000-000000067F0000400200008A5900006CC000__000000914E3F38F0 000000067F0000400200008A5900006C8000-000000067F0000400200008A5900006CC000__000000931B33AE68 000000067F0000400200008A5900006C8000-000000067F0000400200008A5900006CC000__000000931B9AFDF8 000000067F0000400200008A5900006CC000-000000067F0000400200008A5900006D0000__0000001C760FA190 000000067F0000400200008A5900006CC000-000000067F0000400200008A5900006D0000__00000038E67ABFA0 000000067F0000400200008A5900006CC000-000000067F0000400200008A5900006D0000__0000003903F1CFE8 000000067F0000400200008A5900006CC000-000000067F0000400200008A5900006D0000__0000003B99F7F8A0 000000067F0000400200008A5900006CC000-000000067F0000400200008A5900006D0000__0000005D2FFFFB38 000000067F0000400200008A5900006CC000-000000067F0000400200008A5900006D0000__00000073AD3FE6B8 000000067F0000400200008A5900006CC000-000000067F0000400200008A5900006D0000__000000914E3F38F0 000000067F0000400200008A5900006CC000-000000067F0000400200008A5900006D0000__000000931B33AE68 000000067F0000400200008A5900006CC000-000000067F0000400200008A5900006D0000__000000931B9AFDF8 000000067F0000400200008A5900006CF724-000000067F0000400200008A5900006D80FB__0000000C3B2DF409-0000000CDADDDFC9 
000000067F0000400200008A5900006D0000-000000067F0000400200008A5900006D4000__0000001C760FA190 000000067F0000400200008A5900006D0000-000000067F0000400200008A5900006D4000__00000038E67ABFA0 000000067F0000400200008A5900006D0000-000000067F0000400200008A5900006D4000__0000003903F1CFE8 000000067F0000400200008A5900006D0000-000000067F0000400200008A5900006D4000__0000003B99F7F8A0 000000067F0000400200008A5900006D0000-000000067F0000400200008A5900006D4000__0000005D2FFFFB38 000000067F0000400200008A5900006D0000-000000067F0000400200008A5900006D4000__00000073AD3FE6B8 000000067F0000400200008A5900006D0000-000000067F0000400200008A5900006D4000__000000914E3F38F0 000000067F0000400200008A5900006D0000-000000067F0000400200008A5900006D4000__000000931B33AE68 000000067F0000400200008A5900006D0000-000000067F0000400200008A5900006D4000__000000931B9AFDF8 000000067F0000400200008A5900006D4000-000000067F0000400200008A5900006D8000__0000001C760FA190 000000067F0000400200008A5900006D4000-000000067F0000400200008A5900006D8000__00000038E67ABFA0 000000067F0000400200008A5900006D4000-000000067F0000400200008A5900006D8000__0000003903F1CFE8 000000067F0000400200008A5900006D4000-000000067F0000400200008A5900006D8000__0000003B99F7F8A0 000000067F0000400200008A5900006D4000-000000067F0000400200008A5900006D8000__0000005D2FFFFB38 000000067F0000400200008A5900006D4000-000000067F0000400200008A5900006D8000__00000073AD3FE6B8 000000067F0000400200008A5900006D4000-000000067F0000400200008A5900006D8000__000000914E3F38F0 000000067F0000400200008A5900006D4000-000000067F0000400200008A5900006D8000__000000931B33AE68 000000067F0000400200008A5900006D4000-000000067F0000400200008A5900006D8000__000000931B9AFDF8 000000067F0000400200008A5900006D8000-000000067F0000400200008A5900006DC000__0000001C760FA190 000000067F0000400200008A5900006D8000-000000067F0000400200008A5900006DC000__00000038E67ABFA0 000000067F0000400200008A5900006D8000-000000067F0000400200008A5900006DC000__0000003903F1CFE8 
000000067F0000400200008A5900006D8000-000000067F0000400200008A5900006DC000__0000003B99F7F8A0 000000067F0000400200008A5900006D8000-000000067F0000400200008A5900006DC000__0000005D2FFFFB38 000000067F0000400200008A5900006D8000-000000067F0000400200008A5900006DC000__00000073AD3FE6B8 000000067F0000400200008A5900006D8000-000000067F0000400200008A5900006DC000__000000914E3F38F0 000000067F0000400200008A5900006D8000-000000067F0000400200008A5900006DC000__000000931B33AE68 000000067F0000400200008A5900006D8000-000000067F0000400200008A5900006DC000__000000931B9AFDF8 000000067F0000400200008A5900006D80FB-000000067F0000400200008A5900006E0AD6__0000000C3B2DF409-0000000CDADDDFC9 000000067F0000400200008A5900006DC000-000000067F0000400200008A5900006E0000__0000001C760FA190 000000067F0000400200008A5900006DC000-000000067F0000400200008A5900006E0000__00000038E67ABFA0 000000067F0000400200008A5900006DC000-000000067F0000400200008A5900006E0000__0000003903F1CFE8 000000067F0000400200008A5900006DC000-000000067F0000400200008A5900006E0000__0000003B99F7F8A0 000000067F0000400200008A5900006DC000-000000067F0000400200008A5900006E0000__0000005D2FFFFB38 000000067F0000400200008A5900006DC000-000000067F0000400200008A5900006E0000__00000073AD3FE6B8 000000067F0000400200008A5900006DC000-000000067F0000400200008A5900006E0000__000000914E3F38F0 000000067F0000400200008A5900006DC000-000000067F0000400200008A5900006E0000__000000931B33AE68 000000067F0000400200008A5900006DC000-000000067F0000400200008A5900006E0000__000000931B9AFDF8 000000067F0000400200008A5900006E0000-000000067F0000400200008A5900006E4000__0000001C760FA190 000000067F0000400200008A5900006E0000-000000067F0000400200008A5900006E4000__00000038E67ABFA0 000000067F0000400200008A5900006E0000-000000067F0000400200008A5900006E4000__0000003903F1CFE8 000000067F0000400200008A5900006E0000-000000067F0000400200008A5900006E4000__0000003B99F7F8A0 000000067F0000400200008A5900006E0000-000000067F0000400200008A5900006E4000__0000005D2FFFFB38 
000000067F0000400200008A5900006E0000-000000067F0000400200008A5900006E4000__00000073AD3FE6B8 000000067F0000400200008A5900006E0000-000000067F0000400200008A5900006E4000__000000914E3F38F0 000000067F0000400200008A5900006E0000-000000067F0000400200008A5900006E4000__000000931B33AE68 000000067F0000400200008A5900006E0000-000000067F0000400200008A5900006E4000__000000931B9AFDF8 000000067F0000400200008A5900006E0AD6-000000067F0000400200008A5900006E94BB__0000000C3B2DF409-0000000CDADDDFC9 000000067F0000400200008A5900006E4000-000000067F0000400200008A5900006E8000__0000001C760FA190 000000067F0000400200008A5900006E4000-000000067F0000400200008A5900006E8000__00000038E67ABFA0 000000067F0000400200008A5900006E4000-000000067F0000400200008A5900006E8000__0000003903F1CFE8 000000067F0000400200008A5900006E4000-000000067F0000400200008A5900006E8000__0000003B99F7F8A0 000000067F0000400200008A5900006E4000-000000067F0000400200008A5900006E8000__0000005D2FFFFB38 000000067F0000400200008A5900006E4000-000000067F0000400200008A5900006E8000__00000073AD3FE6B8 000000067F0000400200008A5900006E4000-000000067F0000400200008A5900006E8000__000000914E3F38F0 000000067F0000400200008A5900006E4000-000000067F0000400200008A5900006E8000__000000931B33AE68 000000067F0000400200008A5900006E4000-000000067F0000400200008A5900006E8000__000000931B9AFDF8 000000067F0000400200008A5900006E8000-000000067F0000400200008A5900006EC000__0000001C760FA190 000000067F0000400200008A5900006E8000-000000067F0000400200008A5900006EC000__00000038E67ABFA0 000000067F0000400200008A5900006E8000-000000067F0000400200008A5900006EC000__0000003903F1CFE8 000000067F0000400200008A5900006E8000-000000067F0000400200008A5900006EC000__0000003B99F7F8A0 000000067F0000400200008A5900006E8000-000000067F0000400200008A5900006EC000__0000005D2FFFFB38 000000067F0000400200008A5900006E8000-000000067F0000400200008A5900006EC000__00000073AD3FE6B8 000000067F0000400200008A5900006E8000-000000067F0000400200008A5900006EC000__000000914E3F38F0 
000000067F0000400200008A5900006E8000-000000067F0000400200008A5900006EC000__000000931B33AE68 000000067F0000400200008A5900006E8000-000000067F0000400200008A5900006EC000__000000931B9AFDF8 000000067F0000400200008A5900006E94BB-000000067F0000400200008A5900006F1E92__0000000C3B2DF409-0000000CDADDDFC9 000000067F0000400200008A5900006EC000-000000067F0000400200008A5900006F0000__0000001C760FA190 000000067F0000400200008A5900006EC000-000000067F0000400200008A5900006F0000__00000038E67ABFA0 000000067F0000400200008A5900006EC000-000000067F0000400200008A5900006F0000__0000003903F1CFE8 000000067F0000400200008A5900006EC000-000000067F0000400200008A5900006F0000__0000003B99F7F8A0 000000067F0000400200008A5900006EC000-000000067F0000400200008A5900006F0000__0000005D2FFFFB38 000000067F0000400200008A5900006EC000-000000067F0000400200008A5900006F0000__00000073AD3FE6B8 000000067F0000400200008A5900006EC000-000000067F0000400200008A5900006F0000__000000914E3F38F0 000000067F0000400200008A5900006EC000-000000067F0000400200008A5900006F0000__000000931B33AE68 000000067F0000400200008A5900006EC000-000000067F0000400200008A5900006F0000__000000931B9AFDF8 000000067F0000400200008A5900006F0000-000000067F0000400200008A5900006F4000__0000001C725A2400 000000067F0000400200008A5900006F0000-000000067F0000400200008A5900006F4000__0000001C760FA190 000000067F0000400200008A5900006F0000-000000067F0000400200008A5900006F4000__00000038E67ABFA0 000000067F0000400200008A5900006F0000-000000067F0000400200008A5900006F4000__0000003903F1CFE8 000000067F0000400200008A5900006F0000-000000067F0000400200008A5900006F4000__0000003B99F7F8A0 000000067F0000400200008A5900006F0000-000000067F0000400200008A5900006F4000__0000005D2FFFFB38 000000067F0000400200008A5900006F0000-000000067F0000400200008A5900006F4000__00000073AD3FE6B8 000000067F0000400200008A5900006F0000-000000067F0000400200008A5900006F4000__000000914E3F38F0 000000067F0000400200008A5900006F0000-000000067F0000400200008A5900006F4000__000000931B33AE68 
000000067F0000400200008A5900006F0000-000000067F0000400200008A5900006F4000__000000931B9AFDF8 000000067F0000400200008A5900006F1E92-000000067F0000400200008A590100000000__0000000C3B2DF409-0000000CDADDDFC9 000000067F0000400200008A5900006F215C-000000067F0000400200008A5900006FAB35__0000000CDADDDFC9-0000000D8A85D199 000000067F0000400200008A5900006F4000-000000067F0000400200008A5900006F8000__0000001C725A2400 000000067F0000400200008A5900006F4000-000000067F0000400200008A5900006F8000__0000001C760FA190 000000067F0000400200008A5900006F4000-000000067F0000400200008A5900006F8000__00000038E67ABFA0 000000067F0000400200008A5900006F4000-000000067F0000400200008A5900006F8000__0000003903F1CFE8 000000067F0000400200008A5900006F4000-000000067F0000400200008A5900006F8000__0000003B99F7F8A0 000000067F0000400200008A5900006F4000-000000067F0000400200008A5900006F8000__0000005D2FFFFB38 000000067F0000400200008A5900006F4000-000000067F0000400200008A5900006F8000__00000073AD3FE6B8 000000067F0000400200008A5900006F4000-000000067F0000400200008A5900006F8000__000000914E3F38F0 000000067F0000400200008A5900006F4000-000000067F0000400200008A5900006F8000__000000931B33AE68 000000067F0000400200008A5900006F4000-000000067F0000400200008A5900006F8000__000000931B9AFDF8 000000067F0000400200008A5900006F8000-000000067F0000400200008A5900006FC000__0000001C725A2400 000000067F0000400200008A5900006F8000-000000067F0000400200008A5900006FC000__0000001C760FA190 000000067F0000400200008A5900006F8000-000000067F0000400200008A5900006FC000__00000038E67ABFA0 000000067F0000400200008A5900006F8000-000000067F0000400200008A5900006FC000__0000003903F1CFE8 000000067F0000400200008A5900006F8000-000000067F0000400200008A5900006FC000__0000003B99F7F8A0 000000067F0000400200008A5900006F8000-000000067F0000400200008A5900006FC000__0000005D2FFFFB38 000000067F0000400200008A5900006F8000-000000067F0000400200008A5900006FC000__00000073AD3FE6B8 000000067F0000400200008A5900006F8000-000000067F0000400200008A5900006FC000__000000914E3F38F0 
000000067F0000400200008A5900006F8000-000000067F0000400200008A5900006FC000__000000931B33AE68 000000067F0000400200008A5900006F8000-000000067F0000400200008A5900006FC000__000000931B9AFDF8 000000067F0000400200008A5900006FAB35-000000067F0000400200008A590000703515__0000000CDADDDFC9-0000000D8A85D199 000000067F0000400200008A5900006FC000-000000067F0000400200008A590000700000__0000001C725A2400 000000067F0000400200008A5900006FC000-000000067F0000400200008A590000700000__0000001C760FA190 000000067F0000400200008A5900006FC000-000000067F0000400200008A590000700000__00000038E67ABFA0 000000067F0000400200008A5900006FC000-000000067F0000400200008A590000700000__0000003903F1CFE8 000000067F0000400200008A5900006FC000-000000067F0000400200008A590000700000__0000003B99F7F8A0 000000067F0000400200008A5900006FC000-000000067F0000400200008A590000700000__0000005D2FFFFB38 000000067F0000400200008A5900006FC000-000000067F0000400200008A590000700000__00000073AD3FE6B8 000000067F0000400200008A5900006FC000-000000067F0000400200008A590000700000__000000914E3F38F0 000000067F0000400200008A5900006FC000-000000067F0000400200008A590000700000__000000931B33AE68 000000067F0000400200008A5900006FC000-000000067F0000400200008A590000700000__000000931B9AFDF8 000000067F0000400200008A590000700000-000000067F0000400200008A590000704000__0000001C725A2400 000000067F0000400200008A590000700000-000000067F0000400200008A590000704000__0000001C760FA190 000000067F0000400200008A590000700000-000000067F0000400200008A590000704000__00000038E67ABFA0 000000067F0000400200008A590000700000-000000067F0000400200008A590000704000__0000003903F1CFE8 000000067F0000400200008A590000700000-000000067F0000400200008A590000704000__0000003B99F7F8A0 000000067F0000400200008A590000700000-000000067F0000400200008A590000704000__0000005D2FFFFB38 000000067F0000400200008A590000700000-000000067F0000400200008A590000704000__00000073AD3FE6B8 000000067F0000400200008A590000700000-000000067F0000400200008A590000704000__000000914E3F38F0 
000000067F0000400200008A590000700000-000000067F0000400200008A590000704000__000000931B33AE68 000000067F0000400200008A590000700000-000000067F0000400200008A590000704000__000000931B9AFDF8 000000067F0000400200008A590000703515-000000067F0000400200008A59000070BEF5__0000000CDADDDFC9-0000000D8A85D199 000000067F0000400200008A590000704000-000000067F0000400200008A590000708000__0000001C725A2400 000000067F0000400200008A590000704000-000000067F0000400200008A590000708000__0000001C760FA190 000000067F0000400200008A590000704000-000000067F0000400200008A590000708000__00000038E67ABFA0 000000067F0000400200008A590000704000-000000067F0000400200008A590000708000__0000003903F1CFE8 000000067F0000400200008A590000704000-000000067F0000400200008A590000708000__0000003B99F7F8A0 000000067F0000400200008A590000704000-000000067F0000400200008A590000708000__0000005D2FFFFB38 000000067F0000400200008A590000704000-000000067F0000400200008A590000708000__00000073AD3FE6B8 000000067F0000400200008A590000704000-000000067F0000400200008A590000708000__000000914E3F38F0 000000067F0000400200008A590000704000-000000067F0000400200008A590000708000__000000931B33AE68 000000067F0000400200008A590000704000-000000067F0000400200008A590000708000__000000931B9AFDF8 000000067F0000400200008A590000708000-000000067F0000400200008A59000070C000__0000001C725A2400 000000067F0000400200008A590000708000-000000067F0000400200008A59000070C000__0000001C760FA190 000000067F0000400200008A590000708000-000000067F0000400200008A59000070C000__00000038E67ABFA0 000000067F0000400200008A590000708000-000000067F0000400200008A59000070C000__0000003903F1CFE8 000000067F0000400200008A590000708000-000000067F0000400200008A59000070C000__0000003B99F7F8A0 000000067F0000400200008A590000708000-000000067F0000400200008A59000070C000__0000005D2FFFFB38 000000067F0000400200008A590000708000-000000067F0000400200008A59000070C000__00000073AD3FE6B8 000000067F0000400200008A590000708000-000000067F0000400200008A59000070C000__000000914E3F38F0 
000000067F0000400200008A590000708000-000000067F0000400200008A59000070C000__000000931B33AE68 000000067F0000400200008A590000708000-000000067F0000400200008A59000070C000__000000931B9AFDF8 000000067F0000400200008A59000070BEF5-000000067F0000400200008A5900007148DC__0000000CDADDDFC9-0000000D8A85D199 000000067F0000400200008A59000070C000-000000067F0000400200008A590000710000__0000001C725A2400 000000067F0000400200008A59000070C000-000000067F0000400200008A590000710000__0000001C760FA190 000000067F0000400200008A59000070C000-000000067F0000400200008A590000710000__00000038E67ABFA0 000000067F0000400200008A59000070C000-000000067F0000400200008A590000710000__0000003903F1CFE8 000000067F0000400200008A59000070C000-000000067F0000400200008A590000710000__0000003B99F7F8A0 000000067F0000400200008A59000070C000-000000067F0000400200008A590000710000__0000005D2FFFFB38 000000067F0000400200008A59000070C000-000000067F0000400200008A590000710000__00000073AD3FE6B8 000000067F0000400200008A59000070C000-000000067F0000400200008A590000710000__000000914E3F38F0 000000067F0000400200008A59000070C000-000000067F0000400200008A590000710000__000000931B33AE68 000000067F0000400200008A59000070C000-000000067F0000400200008A590000710000__000000931B9AFDF8 000000067F0000400200008A590000710000-000000067F0000400200008A590000714000__0000001C725A2400 000000067F0000400200008A590000710000-000000067F0000400200008A590000714000__0000001C760FA190 000000067F0000400200008A590000710000-000000067F0000400200008A590000714000__00000038E67ABFA0 000000067F0000400200008A590000710000-000000067F0000400200008A590000714000__0000003903F1CFE8 000000067F0000400200008A590000710000-000000067F0000400200008A590000714000__0000003B99F7F8A0 000000067F0000400200008A590000710000-000000067F0000400200008A590000714000__0000005D2FFFFB38 000000067F0000400200008A590000710000-000000067F0000400200008A590000714000__00000073AD3FE6B8 000000067F0000400200008A590000710000-000000067F0000400200008A590000714000__000000914E3F38F0 
000000067F0000400200008A590000710000-000000067F0000400200008A590000714000__000000931B33AE68 000000067F0000400200008A590000710000-000000067F0000400200008A590000714000__000000931B9AFDF8 000000067F0000400200008A590000714000-000000067F0000400200008A590000718000__0000001C725A2400 000000067F0000400200008A590000714000-000000067F0000400200008A590000718000__0000001C760FA190 000000067F0000400200008A590000714000-000000067F0000400200008A590000718000__00000038E67ABFA0 000000067F0000400200008A590000714000-000000067F0000400200008A590000718000__0000003903F1CFE8 000000067F0000400200008A590000714000-000000067F0000400200008A590000718000__0000003B99F7F8A0 000000067F0000400200008A590000714000-000000067F0000400200008A590000718000__0000005D2FFFFB38 000000067F0000400200008A590000714000-000000067F0000400200008A590000718000__00000073AD3FE6B8 000000067F0000400200008A590000714000-000000067F0000400200008A590000718000__000000914E3F38F0 000000067F0000400200008A590000714000-000000067F0000400200008A590000718000__000000931B33AE68 000000067F0000400200008A590000714000-000000067F0000400200008A590000718000__000000931B9AFDF8 000000067F0000400200008A5900007148DC-000000067F0000400200008A59000071D2BF__0000000CDADDDFC9-0000000D8A85D199 000000067F0000400200008A590000718000-000000067F0000400200008A59000071C000__0000001C725A2400 000000067F0000400200008A590000718000-000000067F0000400200008A59000071C000__0000001C760FA190 000000067F0000400200008A590000718000-000000067F0000400200008A59000071C000__00000038E67ABFA0 000000067F0000400200008A590000718000-000000067F0000400200008A59000071C000__0000003903F1CFE8 000000067F0000400200008A590000718000-000000067F0000400200008A59000071C000__0000003B99F7F8A0 000000067F0000400200008A590000718000-000000067F0000400200008A59000071C000__0000005D2FFFFB38 000000067F0000400200008A590000718000-000000067F0000400200008A59000071C000__00000073AD3FE6B8 000000067F0000400200008A590000718000-000000067F0000400200008A59000071C000__000000914E3F38F0 
000000067F0000400200008A590000718000-000000067F0000400200008A59000071C000__000000931B33AE68 000000067F0000400200008A590000718000-000000067F0000400200008A59000071C000__000000931B9AFDF8 000000067F0000400200008A59000071C000-000000067F0000400200008A590000720000__0000001C725A2400 000000067F0000400200008A59000071C000-000000067F0000400200008A590000720000__0000001C760FA190 000000067F0000400200008A59000071C000-000000067F0000400200008A590000720000__00000038E67ABFA0 000000067F0000400200008A59000071C000-000000067F0000400200008A590000720000__0000003903F1CFE8 000000067F0000400200008A59000071C000-000000067F0000400200008A590000720000__0000003B99F7F8A0 000000067F0000400200008A59000071C000-000000067F0000400200008A590000720000__0000005D2FFFFB38 000000067F0000400200008A59000071C000-000000067F0000400200008A590000720000__00000073AD3FE6B8 000000067F0000400200008A59000071C000-000000067F0000400200008A590000720000__000000914E3F38F0 000000067F0000400200008A59000071C000-000000067F0000400200008A590000720000__000000931B33AE68 000000067F0000400200008A59000071C000-000000067F0000400200008A590000720000__000000931B9AFDF8 000000067F0000400200008A59000071D2BF-000000067F0000400200008A590000725C94__0000000CDADDDFC9-0000000D8A85D199 000000067F0000400200008A590000720000-000000067F0000400200008A590000724000__0000001C725A2400 000000067F0000400200008A590000720000-000000067F0000400200008A590000724000__0000001C760FA190 000000067F0000400200008A590000720000-000000067F0000400200008A590000724000__00000038E67ABFA0 000000067F0000400200008A590000720000-000000067F0000400200008A590000724000__0000003903F1CFE8 000000067F0000400200008A590000720000-000000067F0000400200008A590000724000__0000003B99F7F8A0 000000067F0000400200008A590000720000-000000067F0000400200008A590000724000__0000005D2FFFFB38 000000067F0000400200008A590000720000-000000067F0000400200008A590000724000__00000073AD3FE6B8 000000067F0000400200008A590000720000-000000067F0000400200008A590000724000__000000914E3F38F0 
000000067F0000400200008A590000720000-000000067F0000400200008A590000724000__000000931B33AE68 000000067F0000400200008A590000720000-000000067F0000400200008A590000724000__000000931B9AFDF8 000000067F0000400200008A590000724000-000000067F0000400200008A590000728000__0000001C725A2400 000000067F0000400200008A590000724000-000000067F0000400200008A590000728000__0000001C760FA190 000000067F0000400200008A590000724000-000000067F0000400200008A590000728000__00000038E67ABFA0 000000067F0000400200008A590000724000-000000067F0000400200008A590000728000__0000003903F1CFE8 000000067F0000400200008A590000724000-000000067F0000400200008A590000728000__0000003B99F7F8A0 000000067F0000400200008A590000724000-000000067F0000400200008A590000728000__0000005D2FFFFB38 000000067F0000400200008A590000724000-000000067F0000400200008A590000728000__00000073AD3FE6B8 000000067F0000400200008A590000724000-000000067F0000400200008A590000728000__000000914E3F38F0 000000067F0000400200008A590000724000-000000067F0000400200008A590000728000__000000931B33AE68 000000067F0000400200008A590000724000-000000067F0000400200008A590000728000__000000931B9AFDF8 000000067F0000400200008A590000725C94-000000067F0000400200008A59000072E65C__0000000CDADDDFC9-0000000D8A85D199 000000067F0000400200008A590000728000-000000067F0000400200008A59000072C000__0000001C725A2400 000000067F0000400200008A590000728000-000000067F0000400200008A59000072C000__0000001C760FA190 000000067F0000400200008A590000728000-000000067F0000400200008A59000072C000__00000038E67ABFA0 000000067F0000400200008A590000728000-000000067F0000400200008A59000072C000__0000003903F1CFE8 000000067F0000400200008A590000728000-000000067F0000400200008A59000072C000__0000003B99F7F8A0 000000067F0000400200008A590000728000-000000067F0000400200008A59000072C000__0000005D2FFFFB38 000000067F0000400200008A590000728000-000000067F0000400200008A59000072C000__00000073AD3FE6B8 000000067F0000400200008A590000728000-000000067F0000400200008A59000072C000__000000914E3F38F0 
000000067F0000400200008A590000728000-000000067F0000400200008A59000072C000__000000931B33AE68 000000067F0000400200008A590000728000-000000067F0000400200008A59000072C000__000000931B9AFDF8 000000067F0000400200008A59000072C000-000000067F0000400200008A590000730000__0000001C725A2400 000000067F0000400200008A59000072C000-000000067F0000400200008A590000730000__0000001C760FA190 000000067F0000400200008A59000072C000-000000067F0000400200008A590000730000__00000038E67ABFA0 000000067F0000400200008A59000072C000-000000067F0000400200008A590000730000__0000003903F1CFE8 000000067F0000400200008A59000072C000-000000067F0000400200008A590000730000__0000003B99F7F8A0 000000067F0000400200008A59000072C000-000000067F0000400200008A590000730000__0000005D2FFFFB38 000000067F0000400200008A59000072C000-000000067F0000400200008A590000730000__00000073AD3FE6B8 000000067F0000400200008A59000072C000-000000067F0000400200008A590000730000__000000914E3F38F0 000000067F0000400200008A59000072C000-000000067F0000400200008A590000730000__000000931B33AE68 000000067F0000400200008A59000072C000-000000067F0000400200008A590000730000__000000931B9AFDF8 000000067F0000400200008A59000072E65C-000000067F0000400200008A590000737034__0000000CDADDDFC9-0000000D8A85D199 000000067F0000400200008A590000730000-000000067F0000400200008A590000734000__0000001C725A2400 000000067F0000400200008A590000730000-000000067F0000400200008A590000734000__0000001C760FA190 000000067F0000400200008A590000730000-000000067F0000400200008A590000734000__00000038E67ABFA0 000000067F0000400200008A590000730000-000000067F0000400200008A590000734000__0000003903F1CFE8 000000067F0000400200008A590000730000-000000067F0000400200008A590000734000__0000003B99F7F8A0 000000067F0000400200008A590000730000-000000067F0000400200008A590000734000__0000005D2FFFFB38 000000067F0000400200008A590000730000-000000067F0000400200008A590000734000__00000073AD3FE6B8 000000067F0000400200008A590000730000-000000067F0000400200008A590000734000__000000914E3F38F0 
000000067F0000400200008A590000730000-000000067F0000400200008A590000734000__000000931B33AE68 000000067F0000400200008A590000730000-000000067F0000400200008A590000734000__000000931B9AFDF8 000000067F0000400200008A590000734000-000000067F0000400200008A590000738000__0000001C725A2400 000000067F0000400200008A590000734000-000000067F0000400200008A590000738000__0000001C760FA190 000000067F0000400200008A590000734000-000000067F0000400200008A590000738000__00000038E67ABFA0 000000067F0000400200008A590000734000-000000067F0000400200008A590000738000__0000003903F1CFE8 000000067F0000400200008A590000734000-000000067F0000400200008A590000738000__0000003B99F7F8A0 000000067F0000400200008A590000734000-000000067F0000400200008A590000738000__0000005D2FFFFB38 000000067F0000400200008A590000734000-000000067F0000400200008A590000738000__00000073AD3FE6B8 000000067F0000400200008A590000734000-000000067F0000400200008A590000738000__000000914E3F38F0 000000067F0000400200008A590000734000-000000067F0000400200008A590000738000__000000931B33AE68 000000067F0000400200008A590000734000-000000067F0000400200008A590000738000__000000931B9AFDF8 000000067F0000400200008A590000737034-000000067F0000400200008A59000073FA16__0000000CDADDDFC9-0000000D8A85D199 000000067F0000400200008A590000738000-000000067F0000400200008A59000073C000__0000001C725A2400 000000067F0000400200008A590000738000-000000067F0000400200008A59000073C000__0000001C760FA190 000000067F0000400200008A590000738000-000000067F0000400200008A59000073C000__00000038E67ABFA0 000000067F0000400200008A590000738000-000000067F0000400200008A59000073C000__0000003903F1CFE8 000000067F0000400200008A590000738000-000000067F0000400200008A59000073C000__0000003B99F7F8A0 000000067F0000400200008A590000738000-000000067F0000400200008A59000073C000__0000005D2FFFFB38 000000067F0000400200008A590000738000-000000067F0000400200008A59000073C000__00000073AD3FE6B8 000000067F0000400200008A590000738000-000000067F0000400200008A59000073C000__000000914E3F38F0 
000000067F0000400200008A590000738000-000000067F0000400200008A59000073C000__000000931B33AE68 000000067F0000400200008A590000738000-000000067F0000400200008A59000073C000__000000931B9AFDF8 000000067F0000400200008A59000073C000-000000067F0000400200008A590000740000__0000001C725A2400 000000067F0000400200008A59000073C000-000000067F0000400200008A590000740000__0000001C760FA190 000000067F0000400200008A59000073C000-000000067F0000400200008A590000740000__00000038E67ABFA0 000000067F0000400200008A59000073C000-000000067F0000400200008A590000740000__0000003903F1CFE8 000000067F0000400200008A59000073C000-000000067F0000400200008A590000740000__0000003B99F7F8A0 000000067F0000400200008A59000073C000-000000067F0000400200008A590000740000__0000005D2FFFFB38 000000067F0000400200008A59000073C000-000000067F0000400200008A590000740000__00000073AD3FE6B8 000000067F0000400200008A59000073C000-000000067F0000400200008A590000740000__000000914E3F38F0 000000067F0000400200008A59000073C000-000000067F0000400200008A590000740000__000000931B33AE68 000000067F0000400200008A59000073C000-000000067F0000400200008A590000740000__000000931B9AFDF8 000000067F0000400200008A59000073FA16-000000067F0000400200008A5900007483EF__0000000CDADDDFC9-0000000D8A85D199 000000067F0000400200008A590000740000-000000067F0000400200008A590000744000__0000001C725A2400 000000067F0000400200008A590000740000-000000067F0000400200008A590000744000__0000001C760FA190 000000067F0000400200008A590000740000-000000067F0000400200008A590000744000__00000038E67ABFA0 000000067F0000400200008A590000740000-000000067F0000400200008A590000744000__0000003903F1CFE8 000000067F0000400200008A590000740000-000000067F0000400200008A590000744000__0000003B99F7F8A0 000000067F0000400200008A590000740000-000000067F0000400200008A590000744000__0000005D2FFFFB38 000000067F0000400200008A590000740000-000000067F0000400200008A590000744000__00000073AD3FE6B8 000000067F0000400200008A590000740000-000000067F0000400200008A590000744000__000000914E3F38F0 
000000067F0000400200008A590000740000-000000067F0000400200008A590000744000__000000931B33AE68 000000067F0000400200008A590000740000-000000067F0000400200008A590000744000__000000931B9AFDF8 000000067F0000400200008A590000744000-000000067F0000400200008A590000748000__0000001C725A2400 000000067F0000400200008A590000744000-000000067F0000400200008A590000748000__0000001C760FA190 000000067F0000400200008A590000744000-000000067F0000400200008A590000748000__00000038E67ABFA0 000000067F0000400200008A590000744000-000000067F0000400200008A590000748000__0000003903F1CFE8 000000067F0000400200008A590000744000-000000067F0000400200008A590000748000__0000003B99F7F8A0 000000067F0000400200008A590000744000-000000067F0000400200008A590000748000__0000005D2FFFFB38 000000067F0000400200008A590000744000-000000067F0000400200008A590000748000__00000073AD3FE6B8 000000067F0000400200008A590000744000-000000067F0000400200008A590000748000__000000914E3F38F0 000000067F0000400200008A590000744000-000000067F0000400200008A590000748000__000000931B33AE68 000000067F0000400200008A590000744000-000000067F0000400200008A590000748000__000000931B9AFDF8 000000067F0000400200008A590000748000-000000067F0000400200008A59000074C000__0000001C725A2400 000000067F0000400200008A590000748000-000000067F0000400200008A59000074C000__0000001C760FA190 000000067F0000400200008A590000748000-000000067F0000400200008A59000074C000__00000038E67ABFA0 000000067F0000400200008A590000748000-000000067F0000400200008A59000074C000__0000003903F1CFE8 000000067F0000400200008A590000748000-000000067F0000400200008A59000074C000__0000003B99F7F8A0 000000067F0000400200008A590000748000-000000067F0000400200008A59000074C000__0000005D2FFFFB38 000000067F0000400200008A590000748000-000000067F0000400200008A59000074C000__00000073AD3FE6B8 000000067F0000400200008A590000748000-000000067F0000400200008A59000074C000__000000914E3F38F0 000000067F0000400200008A590000748000-000000067F0000400200008A59000074C000__000000931B33AE68 
000000067F0000400200008A590000748000-000000067F0000400200008A59000074C000__000000931B9AFDF8 000000067F0000400200008A5900007483EF-000000067F0000400200008A590000750DD5__0000000CDADDDFC9-0000000D8A85D199 000000067F0000400200008A59000074C000-000000067F0000400200008A590000750000__0000001C725A2400 000000067F0000400200008A59000074C000-000000067F0000400200008A590000750000__0000001C760FA190 000000067F0000400200008A59000074C000-000000067F0000400200008A590000750000__00000038E67ABFA0 000000067F0000400200008A59000074C000-000000067F0000400200008A590000750000__0000003903F1CFE8 000000067F0000400200008A59000074C000-000000067F0000400200008A590000750000__0000003B99F7F8A0 000000067F0000400200008A59000074C000-000000067F0000400200008A590000750000__0000005D2FFFFB38 000000067F0000400200008A59000074C000-000000067F0000400200008A590000750000__00000073AD3FE6B8 000000067F0000400200008A59000074C000-000000067F0000400200008A590000750000__000000914E3F38F0 000000067F0000400200008A59000074C000-000000067F0000400200008A590000750000__000000931B33AE68 000000067F0000400200008A59000074C000-000000067F0000400200008A590000750000__000000931B9AFDF8 000000067F0000400200008A590000750000-000000067F0000400200008A590000754000__0000000E54FFE720 000000067F0000400200008A590000750000-000000067F0000400200008A590000754000__0000001C760FA190 000000067F0000400200008A590000750000-000000067F0000400200008A590000754000__00000038E67ABFA0 000000067F0000400200008A590000750000-000000067F0000400200008A590000754000__0000003903F1CFE8 000000067F0000400200008A590000750000-000000067F0000400200008A590000754000__0000003B99F7F8A0 000000067F0000400200008A590000750000-000000067F0000400200008A590000754000__0000005D2FFFFB38 000000067F0000400200008A590000750000-000000067F0000400200008A590000754000__00000073AD3FE6B8 000000067F0000400200008A590000750000-000000067F0000400200008A590000754000__000000914E3F38F0 000000067F0000400200008A590000750000-000000067F0000400200008A590000754000__000000931B33AE68 
000000067F0000400200008A590000750000-000000067F0000400200008A590000754000__000000931B9AFDF8 000000067F0000400200008A590000750DD5-000000067F0000400200008A590100000000__0000000CDADDDFC9-0000000D8A85D199 000000067F0000400200008A5900007510F5-000000067F0000400200008A590000759AD2__0000000D8A85D199-0000000E2A359AC1 000000067F0000400200008A590000754000-000000067F0000400200008A590000758000__0000000E54FFE720 000000067F0000400200008A590000754000-000000067F0000400200008A590000758000__0000001C760FA190 000000067F0000400200008A590000754000-000000067F0000400200008A590000758000__00000038E67ABFA0 000000067F0000400200008A590000754000-000000067F0000400200008A590000758000__0000003903F1CFE8 000000067F0000400200008A590000754000-000000067F0000400200008A590000758000__0000003B99F7F8A0 000000067F0000400200008A590000754000-000000067F0000400200008A590000758000__0000005D2FFFFB38 000000067F0000400200008A590000754000-000000067F0000400200008A590000758000__00000073AD3FE6B8 000000067F0000400200008A590000754000-000000067F0000400200008A590000758000__000000914E3F38F0 000000067F0000400200008A590000754000-000000067F0000400200008A590000758000__000000931B33AE68 000000067F0000400200008A590000754000-000000067F0000400200008A590000758000__000000931B9AFDF8 000000067F0000400200008A590000758000-000000067F0000400200008A59000075C000__0000000E54FFE720 000000067F0000400200008A590000758000-000000067F0000400200008A59000075C000__0000001C760FA190 000000067F0000400200008A590000758000-000000067F0000400200008A59000075C000__00000038E67ABFA0 000000067F0000400200008A590000758000-000000067F0000400200008A59000075C000__0000003903F1CFE8 000000067F0000400200008A590000758000-000000067F0000400200008A59000075C000__0000003B99F7F8A0 000000067F0000400200008A590000758000-000000067F0000400200008A59000075C000__0000005D2FFFFB38 000000067F0000400200008A590000758000-000000067F0000400200008A59000075C000__00000073AD3FE6B8 000000067F0000400200008A590000758000-000000067F0000400200008A59000075C000__000000914E3F38F0 
000000067F0000400200008A590000758000-000000067F0000400200008A59000075C000__000000931B33AE68 000000067F0000400200008A590000758000-000000067F0000400200008A59000075C000__000000931B9AFDF8 000000067F0000400200008A590000759AD2-000000067F0000400200008A5900007624AB__0000000D8A85D199-0000000E2A359AC1 000000067F0000400200008A59000075C000-000000067F0000400200008A590000760000__0000000E54FFE720 000000067F0000400200008A59000075C000-000000067F0000400200008A590000760000__0000001C760FA190 000000067F0000400200008A59000075C000-000000067F0000400200008A590000760000__00000038E67ABFA0 000000067F0000400200008A59000075C000-000000067F0000400200008A590000760000__0000003903F1CFE8 000000067F0000400200008A59000075C000-000000067F0000400200008A590000760000__0000003B99F7F8A0 000000067F0000400200008A59000075C000-000000067F0000400200008A590000760000__0000005D2FFFFB38 000000067F0000400200008A59000075C000-000000067F0000400200008A590000760000__00000073AD3FE6B8 000000067F0000400200008A59000075C000-000000067F0000400200008A590000760000__000000914E3F38F0 000000067F0000400200008A59000075C000-000000067F0000400200008A590000760000__000000931B33AE68 000000067F0000400200008A59000075C000-000000067F0000400200008A590000760000__000000931B9AFDF8 000000067F0000400200008A590000760000-000000067F0000400200008A590000764000__0000000E54FFE720 000000067F0000400200008A590000760000-000000067F0000400200008A590000764000__0000001C760FA190 000000067F0000400200008A590000760000-000000067F0000400200008A590000764000__00000038E67ABFA0 000000067F0000400200008A590000760000-000000067F0000400200008A590000764000__0000003903F1CFE8 000000067F0000400200008A590000760000-000000067F0000400200008A590000764000__0000003B99F7F8A0 000000067F0000400200008A590000760000-000000067F0000400200008A590000764000__0000005D2FFFFB38 000000067F0000400200008A590000760000-000000067F0000400200008A590000764000__00000073AD3FE6B8 000000067F0000400200008A590000760000-000000067F0000400200008A590000764000__000000914E3F38F0 
000000067F0000400200008A590000760000-000000067F0000400200008A590000764000__000000931B33AE68 000000067F0000400200008A590000760000-000000067F0000400200008A590000764000__000000931B9AFDF8 000000067F0000400200008A5900007624AB-000000067F0000400200008A59000076AE86__0000000D8A85D199-0000000E2A359AC1 000000067F0000400200008A590000764000-000000067F0000400200008A590000768000__0000000E54FFE720 000000067F0000400200008A590000764000-000000067F0000400200008A590000768000__0000001C760FA190 000000067F0000400200008A590000764000-000000067F0000400200008A590000768000__00000038E67ABFA0 000000067F0000400200008A590000764000-000000067F0000400200008A590000768000__0000003903F1CFE8 000000067F0000400200008A590000764000-000000067F0000400200008A590000768000__0000003B99F7F8A0 000000067F0000400200008A590000764000-000000067F0000400200008A590000768000__0000005D2FFFFB38 000000067F0000400200008A590000764000-000000067F0000400200008A590000768000__00000073AD3FE6B8 000000067F0000400200008A590000764000-000000067F0000400200008A590000768000__000000914E3F38F0 000000067F0000400200008A590000764000-000000067F0000400200008A590000768000__000000931B33AE68 000000067F0000400200008A590000764000-000000067F0000400200008A590000768000__000000931B9AFDF8 000000067F0000400200008A590000768000-000000067F0000400200008A59000076C000__0000000E54FFE720 000000067F0000400200008A590000768000-000000067F0000400200008A59000076C000__0000001C760FA190 000000067F0000400200008A590000768000-000000067F0000400200008A59000076C000__00000038E67ABFA0 000000067F0000400200008A590000768000-000000067F0000400200008A59000076C000__0000003903F1CFE8 000000067F0000400200008A590000768000-000000067F0000400200008A59000076C000__0000003B99F7F8A0 000000067F0000400200008A590000768000-000000067F0000400200008A59000076C000__0000005D2FFFFB38 000000067F0000400200008A590000768000-000000067F0000400200008A59000076C000__00000073AD3FE6B8 000000067F0000400200008A590000768000-000000067F0000400200008A59000076C000__000000914E3F38F0 
000000067F0000400200008A590000768000-000000067F0000400200008A59000076C000__000000931B33AE68 000000067F0000400200008A590000768000-000000067F0000400200008A59000076C000__000000931B9AFDF8 000000067F0000400200008A59000076AE86-000000067F0000400200008A590000773859__0000000D8A85D199-0000000E2A359AC1 000000067F0000400200008A59000076C000-000000067F0000400200008A590000770000__0000000E54FFE720 000000067F0000400200008A59000076C000-000000067F0000400200008A590000770000__0000001C760FA190 000000067F0000400200008A59000076C000-000000067F0000400200008A590000770000__00000038E67ABFA0 000000067F0000400200008A59000076C000-000000067F0000400200008A590000770000__0000003903F1CFE8 000000067F0000400200008A59000076C000-000000067F0000400200008A590000770000__0000003B99F7F8A0 000000067F0000400200008A59000076C000-000000067F0000400200008A590000770000__0000005D2FFFFB38 000000067F0000400200008A59000076C000-000000067F0000400200008A590000770000__00000073AD3FE6B8 000000067F0000400200008A59000076C000-000000067F0000400200008A590000770000__000000914E3F38F0 000000067F0000400200008A59000076C000-000000067F0000400200008A590000770000__000000931B33AE68 000000067F0000400200008A59000076C000-000000067F0000400200008A590000770000__000000931B9AFDF8 000000067F0000400200008A590000770000-000000067F0000400200008A590000774000__0000000E54FFE720 000000067F0000400200008A590000770000-000000067F0000400200008A590000774000__0000001C760FA190 000000067F0000400200008A590000770000-000000067F0000400200008A590000774000__00000038E67ABFA0 000000067F0000400200008A590000770000-000000067F0000400200008A590000774000__0000003903F1CFE8 000000067F0000400200008A590000770000-000000067F0000400200008A590000774000__0000003B99F7F8A0 000000067F0000400200008A590000770000-000000067F0000400200008A590000774000__0000005D2FFFFB38 000000067F0000400200008A590000770000-000000067F0000400200008A590000774000__00000073AD3FE6B8 000000067F0000400200008A590000770000-000000067F0000400200008A590000774000__000000914E3F38F0 
000000067F0000400200008A590000770000-000000067F0000400200008A590000774000__000000931B33AE68 000000067F0000400200008A590000770000-000000067F0000400200008A590000774000__000000931B9AFDF8 000000067F0000400200008A590000773859-000000067F0000400200008A59000077C231__0000000D8A85D199-0000000E2A359AC1 000000067F0000400200008A590000774000-000000067F0000400200008A590000778000__0000000E54FFE720 000000067F0000400200008A590000774000-000000067F0000400200008A590000778000__0000001C760FA190 000000067F0000400200008A590000774000-000000067F0000400200008A590000778000__00000038E67ABFA0 000000067F0000400200008A590000774000-000000067F0000400200008A590000778000__0000003903F1CFE8 000000067F0000400200008A590000774000-000000067F0000400200008A590000778000__0000003B99F7F8A0 000000067F0000400200008A590000774000-000000067F0000400200008A590000778000__0000005D2FFFFB38 000000067F0000400200008A590000774000-000000067F0000400200008A590000778000__00000073AD3FE6B8 000000067F0000400200008A590000774000-000000067F0000400200008A590000778000__000000914E3F38F0 000000067F0000400200008A590000774000-000000067F0000400200008A590000778000__000000931B33AE68 000000067F0000400200008A590000774000-000000067F0000400200008A590000778000__000000931B9AFDF8 000000067F0000400200008A590000778000-000000067F0000400200008A59000077C000__0000000E54FFE720 000000067F0000400200008A590000778000-000000067F0000400200008A59000077C000__0000001C760FA190 000000067F0000400200008A590000778000-000000067F0000400200008A59000077C000__00000038E67ABFA0 000000067F0000400200008A590000778000-000000067F0000400200008A59000077C000__0000003903F1CFE8 000000067F0000400200008A590000778000-000000067F0000400200008A59000077C000__0000003B99F7F8A0 000000067F0000400200008A590000778000-000000067F0000400200008A59000077C000__0000005D2FFFFB38 000000067F0000400200008A590000778000-000000067F0000400200008A59000077C000__00000073AD3FE6B8 000000067F0000400200008A590000778000-000000067F0000400200008A59000077C000__000000914E3F38F0 
000000067F0000400200008A590000778000-000000067F0000400200008A59000077C000__000000931B33AE68 000000067F0000400200008A590000778000-000000067F0000400200008A59000077C000__000000931B9AFDF8 000000067F0000400200008A59000077C000-000000067F0000400200008A590000780000__0000000E54FFE720 000000067F0000400200008A59000077C000-000000067F0000400200008A590000780000__0000001C760FA190 000000067F0000400200008A59000077C000-000000067F0000400200008A590000780000__00000038E67ABFA0 000000067F0000400200008A59000077C000-000000067F0000400200008A590000780000__0000003903F1CFE8 000000067F0000400200008A59000077C000-000000067F0000400200008A590000780000__0000003B99F7F8A0 000000067F0000400200008A59000077C000-000000067F0000400200008A590000780000__0000005D2FFFFB38 000000067F0000400200008A59000077C000-000000067F0000400200008A590000780000__00000073AD3FE6B8 000000067F0000400200008A59000077C000-000000067F0000400200008A590000780000__000000914E3F38F0 000000067F0000400200008A59000077C000-000000067F0000400200008A590000780000__000000931B33AE68 000000067F0000400200008A59000077C000-000000067F0000400200008A590000780000__000000931B9AFDF8 000000067F0000400200008A59000077C231-000000067F0000400200008A590000784C0A__0000000D8A85D199-0000000E2A359AC1 000000067F0000400200008A590000780000-000000067F0000400200008A590000784000__0000000E54FFE720 000000067F0000400200008A590000780000-000000067F0000400200008A590000784000__0000001C760FA190 000000067F0000400200008A590000780000-000000067F0000400200008A590000784000__00000038E67ABFA0 000000067F0000400200008A590000780000-000000067F0000400200008A590000784000__0000003903F1CFE8 000000067F0000400200008A590000780000-000000067F0000400200008A590000784000__0000003B99F7F8A0 000000067F0000400200008A590000780000-000000067F0000400200008A590000784000__0000005D2FFFFB38 000000067F0000400200008A590000780000-000000067F0000400200008A590000784000__00000073AD3FE6B8 000000067F0000400200008A590000780000-000000067F0000400200008A590000784000__000000914E3F38F0 
000000067F0000400200008A590000780000-000000067F0000400200008A590000784000__000000931B33AE68 000000067F0000400200008A590000780000-000000067F0000400200008A590000784000__000000931B9AFDF8 000000067F0000400200008A590000784000-000000067F0000400200008A590000788000__0000000E54FFE720 000000067F0000400200008A590000784000-000000067F0000400200008A590000788000__0000001C760FA190 000000067F0000400200008A590000784000-000000067F0000400200008A590000788000__00000038E67ABFA0 000000067F0000400200008A590000784000-000000067F0000400200008A590000788000__0000003903F1CFE8 000000067F0000400200008A590000784000-000000067F0000400200008A590000788000__0000003B99F7F8A0 000000067F0000400200008A590000784000-000000067F0000400200008A590000788000__0000005D2FFFFB38 000000067F0000400200008A590000784000-000000067F0000400200008A590000788000__00000073AD3FE6B8 000000067F0000400200008A590000784000-000000067F0000400200008A590000788000__000000914E3F38F0 000000067F0000400200008A590000784000-000000067F0000400200008A590000788000__000000931B33AE68 000000067F0000400200008A590000784000-000000067F0000400200008A590000788000__000000931B9AFDF8 000000067F0000400200008A590000784C0A-000000067F0000400200008A59000078D5DB__0000000D8A85D199-0000000E2A359AC1 000000067F0000400200008A590000788000-000000067F0000400200008A59000078C000__0000000E54FFE720 000000067F0000400200008A590000788000-000000067F0000400200008A59000078C000__0000001C760FA190 000000067F0000400200008A590000788000-000000067F0000400200008A59000078C000__00000038E67ABFA0 000000067F0000400200008A590000788000-000000067F0000400200008A59000078C000__0000003903F1CFE8 000000067F0000400200008A590000788000-000000067F0000400200008A59000078C000__0000003B99F7F8A0 000000067F0000400200008A590000788000-000000067F0000400200008A59000078C000__0000005D2FFFFB38 000000067F0000400200008A590000788000-000000067F0000400200008A59000078C000__00000073AD3FE6B8 000000067F0000400200008A590000788000-000000067F0000400200008A59000078C000__000000914E3F38F0 
000000067F0000400200008A590000788000-000000067F0000400200008A59000078C000__000000931B33AE68 000000067F0000400200008A590000788000-000000067F0000400200008A59000078C000__000000931B9AFDF8 000000067F0000400200008A59000078C000-000000067F0000400200008A590000790000__0000000E54FFE720 000000067F0000400200008A59000078C000-000000067F0000400200008A590000790000__0000001C760FA190 000000067F0000400200008A59000078C000-000000067F0000400200008A590000790000__00000038E67ABFA0 000000067F0000400200008A59000078C000-000000067F0000400200008A590000790000__0000003903F1CFE8 000000067F0000400200008A59000078C000-000000067F0000400200008A590000790000__0000003B99F7F8A0 000000067F0000400200008A59000078C000-000000067F0000400200008A590000790000__0000005D2FFFFB38 000000067F0000400200008A59000078C000-000000067F0000400200008A590000790000__00000073AD3FE6B8 000000067F0000400200008A59000078C000-000000067F0000400200008A590000790000__000000914E3F38F0 000000067F0000400200008A59000078C000-000000067F0000400200008A590000790000__000000931B33AE68 000000067F0000400200008A59000078C000-000000067F0000400200008A590000790000__000000931B9AFDF8 000000067F0000400200008A59000078D5DB-000000067F0000400200008A590000795FB7__0000000D8A85D199-0000000E2A359AC1 000000067F0000400200008A590000790000-000000067F0000400200008A590000794000__0000000E54FFE720 000000067F0000400200008A590000790000-000000067F0000400200008A590000794000__0000001C760FA190 000000067F0000400200008A590000790000-000000067F0000400200008A590000794000__00000038E67ABFA0 000000067F0000400200008A590000790000-000000067F0000400200008A590000794000__0000003903F1CFE8 000000067F0000400200008A590000790000-000000067F0000400200008A590000794000__0000003B99F7F8A0 000000067F0000400200008A590000790000-000000067F0000400200008A590000794000__0000005D2FFFFB38 000000067F0000400200008A590000790000-000000067F0000400200008A590000794000__00000073AD3FE6B8 000000067F0000400200008A590000790000-000000067F0000400200008A590000794000__000000914E3F38F0 
000000067F0000400200008A590000790000-000000067F0000400200008A590000794000__000000931B33AE68 000000067F0000400200008A590000790000-000000067F0000400200008A590000794000__000000931B9AFDF8 000000067F0000400200008A590000794000-000000067F0000400200008A590000798000__0000000E54FFE720 000000067F0000400200008A590000794000-000000067F0000400200008A590000798000__0000001C760FA190 000000067F0000400200008A590000794000-000000067F0000400200008A590000798000__00000038E67ABFA0 000000067F0000400200008A590000794000-000000067F0000400200008A590000798000__0000003903F1CFE8 000000067F0000400200008A590000794000-000000067F0000400200008A590000798000__0000003B99F7F8A0 000000067F0000400200008A590000794000-000000067F0000400200008A590000798000__0000005D2FFFFB38 000000067F0000400200008A590000794000-000000067F0000400200008A590000798000__00000073AD3FE6B8 000000067F0000400200008A590000794000-000000067F0000400200008A590000798000__000000914E3F38F0 000000067F0000400200008A590000794000-000000067F0000400200008A590000798000__000000931B33AE68 000000067F0000400200008A590000794000-000000067F0000400200008A590000798000__000000931B9AFDF8 000000067F0000400200008A590000795FB7-000000067F0000400200008A59000079E99E__0000000D8A85D199-0000000E2A359AC1 000000067F0000400200008A590000798000-000000067F0000400200008A59000079C000__0000000E54FFE720 000000067F0000400200008A590000798000-000000067F0000400200008A59000079C000__0000001C760FA190 000000067F0000400200008A590000798000-000000067F0000400200008A59000079C000__00000038E67ABFA0 000000067F0000400200008A590000798000-000000067F0000400200008A59000079C000__0000003903F1CFE8 000000067F0000400200008A590000798000-000000067F0000400200008A59000079C000__0000003B99F7F8A0 000000067F0000400200008A590000798000-000000067F0000400200008A59000079C000__0000005D2FFFFB38 000000067F0000400200008A590000798000-000000067F0000400200008A59000079C000__00000073AD3FE6B8 000000067F0000400200008A590000798000-000000067F0000400200008A59000079C000__000000914E3F38F0 
000000067F0000400200008A590000798000-000000067F0000400200008A59000079C000__000000931B33AE68 000000067F0000400200008A590000798000-000000067F0000400200008A59000079C000__000000931B9AFDF8 000000067F0000400200008A59000079C000-000000067F0000400200008A5900007A0000__0000000E54FFE720 000000067F0000400200008A59000079C000-000000067F0000400200008A5900007A0000__0000001C760FA190 000000067F0000400200008A59000079C000-000000067F0000400200008A5900007A0000__00000038E67ABFA0 000000067F0000400200008A59000079C000-000000067F0000400200008A5900007A0000__0000003903F1CFE8 000000067F0000400200008A59000079C000-000000067F0000400200008A5900007A0000__0000003B99F7F8A0 000000067F0000400200008A59000079C000-000000067F0000400200008A5900007A0000__0000005D2FFFFB38 000000067F0000400200008A59000079C000-000000067F0000400200008A5900007A0000__00000073AD3FE6B8 000000067F0000400200008A59000079C000-000000067F0000400200008A5900007A0000__000000914E3F38F0 000000067F0000400200008A59000079C000-000000067F0000400200008A5900007A0000__000000931B33AE68 000000067F0000400200008A59000079C000-000000067F0000400200008A5900007A0000__000000931B9AFDF8 000000067F0000400200008A59000079E99E-000000067F0000400200008A5900007A7383__0000000D8A85D199-0000000E2A359AC1 000000067F0000400200008A5900007A0000-000000067F0000400200008A5900007A4000__0000000E54FFE720 000000067F0000400200008A5900007A0000-000000067F0000400200008A5900007A4000__0000001C760FA190 000000067F0000400200008A5900007A0000-000000067F0000400200008A5900007A4000__00000038E67ABFA0 000000067F0000400200008A5900007A0000-000000067F0000400200008A5900007A4000__0000003903F1CFE8 000000067F0000400200008A5900007A0000-000000067F0000400200008A5900007A4000__0000003B99F7F8A0 000000067F0000400200008A5900007A0000-000000067F0000400200008A5900007A4000__0000005D2FFFFB38 000000067F0000400200008A5900007A0000-000000067F0000400200008A5900007A4000__00000073AD3FE6B8 000000067F0000400200008A5900007A0000-000000067F0000400200008A5900007A4000__000000914E3F38F0 
000000067F0000400200008A5900007A0000-000000067F0000400200008A5900007A4000__000000931B33AE68 000000067F0000400200008A5900007A0000-000000067F0000400200008A5900007A4000__000000931B9AFDF8 000000067F0000400200008A5900007A4000-000000067F0000400200008A5900007A8000__0000000E54FFE720 000000067F0000400200008A5900007A4000-000000067F0000400200008A5900007A8000__0000001C760FA190 000000067F0000400200008A5900007A4000-000000067F0000400200008A5900007A8000__00000038E67ABFA0 000000067F0000400200008A5900007A4000-000000067F0000400200008A5900007A8000__0000003903F1CFE8 000000067F0000400200008A5900007A4000-000000067F0000400200008A5900007A8000__0000003B99F7F8A0 000000067F0000400200008A5900007A4000-000000067F0000400200008A5900007A8000__0000005D2FFFFB38 000000067F0000400200008A5900007A4000-000000067F0000400200008A5900007A8000__00000073AD3FE6B8 000000067F0000400200008A5900007A4000-000000067F0000400200008A5900007A8000__000000914E3F38F0 000000067F0000400200008A5900007A4000-000000067F0000400200008A5900007A8000__000000931B33AE68 000000067F0000400200008A5900007A4000-000000067F0000400200008A5900007A8000__000000931B9AFDF8 000000067F0000400200008A5900007A7383-000000067F0000400200008A590100000000__0000000D8A85D199-0000000E2A359AC1 000000067F0000400200008A5900007A7649-000000067F0000400200008A5900007B001B__0000000E2A359AC1-0000000ED9DDF211 000000067F0000400200008A5900007A8000-000000067F0000400200008A5900007AC000__0000000E54FFE720 000000067F0000400200008A5900007A8000-000000067F0000400200008A5900007AC000__0000001C760FA190 000000067F0000400200008A5900007A8000-000000067F0000400200008A5900007AC000__00000038E67ABFA0 000000067F0000400200008A5900007A8000-000000067F0000400200008A5900007AC000__0000003903F1CFE8 000000067F0000400200008A5900007A8000-000000067F0000400200008A5900007AC000__0000003B99F7F8A0 000000067F0000400200008A5900007A8000-000000067F0000400200008A5900007AC000__0000005D2FFFFB38 000000067F0000400200008A5900007A8000-000000067F0000400200008A5900007AC000__00000073AD3FE6B8 
000000067F0000400200008A5900007A8000-000000067F0000400200008A5900007AC000__000000914E3F38F0 000000067F0000400200008A5900007A8000-000000067F0000400200008A5900007AC000__000000931B33AE68 000000067F0000400200008A5900007A8000-000000067F0000400200008A5900007AC000__000000931B9AFDF8 000000067F0000400200008A5900007AC000-000000067F0000400200008A5900007B0000__0000000E54FFE720 000000067F0000400200008A5900007AC000-000000067F0000400200008A5900007B0000__0000001C760FA190 000000067F0000400200008A5900007AC000-000000067F0000400200008A5900007B0000__00000038E67ABFA0 000000067F0000400200008A5900007AC000-000000067F0000400200008A5900007B0000__0000003903F1CFE8 000000067F0000400200008A5900007AC000-000000067F0000400200008A5900007B0000__0000003B99F7F8A0 000000067F0000400200008A5900007AC000-000000067F0000400200008A5900007B0000__0000005D2FFFFB38 000000067F0000400200008A5900007AC000-000000067F0000400200008A5900007B0000__00000073AD3FE6B8 000000067F0000400200008A5900007AC000-000000067F0000400200008A5900007B0000__000000914E3F38F0 000000067F0000400200008A5900007AC000-000000067F0000400200008A5900007B0000__000000931B33AE68 000000067F0000400200008A5900007AC000-000000067F0000400200008A5900007B0000__000000931B9AFDF8 000000067F0000400200008A5900007B0000-000000067F0000400200008A5900007B4000__0000000E54FFE720 000000067F0000400200008A5900007B0000-000000067F0000400200008A5900007B4000__0000001C760FA190 000000067F0000400200008A5900007B0000-000000067F0000400200008A5900007B4000__00000038E67ABFA0 000000067F0000400200008A5900007B0000-000000067F0000400200008A5900007B4000__0000003903F1CFE8 000000067F0000400200008A5900007B0000-000000067F0000400200008A5900007B4000__0000003B99F7F8A0 000000067F0000400200008A5900007B0000-000000067F0000400200008A5900007B4000__0000005D2FFFFB38 000000067F0000400200008A5900007B0000-000000067F0000400200008A5900007B4000__00000073AD3FE6B8 000000067F0000400200008A5900007B0000-000000067F0000400200008A5900007B4000__000000914E3F38F0 
000000067F0000400200008A5900007B0000-000000067F0000400200008A5900007B4000__000000931B33AE68 000000067F0000400200008A5900007B0000-000000067F0000400200008A5900007B4000__000000931B9AFDF8 000000067F0000400200008A5900007B001B-000000067F0000400200008A5900007B89F9__0000000E2A359AC1-0000000ED9DDF211 000000067F0000400200008A5900007B4000-000000067F0000400200008A5900007B8000__0000000E54FFE720 000000067F0000400200008A5900007B4000-000000067F0000400200008A5900007B8000__0000001C760FA190 000000067F0000400200008A5900007B4000-000000067F0000400200008A5900007B8000__00000038E67ABFA0 000000067F0000400200008A5900007B4000-000000067F0000400200008A5900007B8000__0000003903F1CFE8 000000067F0000400200008A5900007B4000-000000067F0000400200008A5900007B8000__0000003B99F7F8A0 000000067F0000400200008A5900007B4000-000000067F0000400200008A5900007B8000__0000005D2FFFFB38 000000067F0000400200008A5900007B4000-000000067F0000400200008A5900007B8000__00000073AD3FE6B8 000000067F0000400200008A5900007B4000-000000067F0000400200008A5900007B8000__000000914E3F38F0 000000067F0000400200008A5900007B4000-000000067F0000400200008A5900007B8000__000000931B33AE68 000000067F0000400200008A5900007B4000-000000067F0000400200008A5900007B8000__000000931B9AFDF8 000000067F0000400200008A5900007B8000-000000067F0000400200008A5900007BC000__0000000E54FFE720 000000067F0000400200008A5900007B8000-000000067F0000400200008A5900007BC000__0000001C760FA190 000000067F0000400200008A5900007B8000-000000067F0000400200008A5900007BC000__00000038E67ABFA0 000000067F0000400200008A5900007B8000-000000067F0000400200008A5900007BC000__0000003903F1CFE8 000000067F0000400200008A5900007B8000-000000067F0000400200008A5900007BC000__0000003B99F7F8A0 000000067F0000400200008A5900007B8000-000000067F0000400200008A5900007BC000__0000005D2FFFFB38 000000067F0000400200008A5900007B8000-000000067F0000400200008A5900007BC000__00000073AD3FE6B8 000000067F0000400200008A5900007B8000-000000067F0000400200008A5900007BC000__000000914E3F38F0 
000000067F0000400200008A5900007B8000-000000067F0000400200008A5900007BC000__000000931B33AE68 000000067F0000400200008A5900007B8000-000000067F0000400200008A5900007BC000__000000931B9AFDF8 000000067F0000400200008A5900007B89F9-000000067F0000400200008A5900007C13E0__0000000E2A359AC1-0000000ED9DDF211 000000067F0000400200008A5900007BC000-000000067F0000400200008A5900007C0000__0000001C760FA190 000000067F0000400200008A5900007BC000-000000067F0000400200008A5900007C0000__00000038E67ABFA0 000000067F0000400200008A5900007BC000-000000067F0000400200008A5900007C0000__0000003903F1CFE8 000000067F0000400200008A5900007BC000-000000067F0000400200008A5900007C0000__0000003B99F7F8A0 000000067F0000400200008A5900007BC000-000000067F0000400200008A5900007C0000__0000005D2FFFFB38 000000067F0000400200008A5900007BC000-000000067F0000400200008A5900007C0000__00000073AD3FE6B8 000000067F0000400200008A5900007BC000-000000067F0000400200008A5900007C0000__000000914E3F38F0 000000067F0000400200008A5900007BC000-000000067F0000400200008A5900007C0000__000000931B33AE68 000000067F0000400200008A5900007BC000-000000067F0000400200008A5900007C0000__000000931B9AFDF8 000000067F0000400200008A5900007BC000-030000000000000000000000000000000002__0000000E54FFE720 000000067F0000400200008A5900007C0000-000000067F0000400200008A5900007C4000__0000001C760FA190 000000067F0000400200008A5900007C0000-000000067F0000400200008A5900007C4000__00000038E67ABFA0 000000067F0000400200008A5900007C0000-000000067F0000400200008A5900007C4000__0000003903F1CFE8 000000067F0000400200008A5900007C0000-000000067F0000400200008A5900007C4000__0000003B99F7F8A0 000000067F0000400200008A5900007C0000-000000067F0000400200008A5900007C4000__0000005D2FFFFB38 000000067F0000400200008A5900007C0000-000000067F0000400200008A5900007C4000__00000073AD3FE6B8 000000067F0000400200008A5900007C0000-000000067F0000400200008A5900007C4000__000000914E3F38F0 000000067F0000400200008A5900007C0000-000000067F0000400200008A5900007C4000__000000931B33AE68 
000000067F0000400200008A5900007C0000-000000067F0000400200008A5900007C4000__000000931B9AFDF8 000000067F0000400200008A5900007C13E0-000000067F0000400200008A5900007C9DC3__0000000E2A359AC1-0000000ED9DDF211 000000067F0000400200008A5900007C4000-000000067F0000400200008A5900007C8000__0000001C760FA190 000000067F0000400200008A5900007C4000-000000067F0000400200008A5900007C8000__00000038E67ABFA0 000000067F0000400200008A5900007C4000-000000067F0000400200008A5900007C8000__0000003903F1CFE8 000000067F0000400200008A5900007C4000-000000067F0000400200008A5900007C8000__0000003B99F7F8A0 000000067F0000400200008A5900007C4000-000000067F0000400200008A5900007C8000__0000005D2FFFFB38 000000067F0000400200008A5900007C4000-000000067F0000400200008A5900007C8000__00000073AD3FE6B8 000000067F0000400200008A5900007C4000-000000067F0000400200008A5900007C8000__000000914E3F38F0 000000067F0000400200008A5900007C4000-000000067F0000400200008A5900007C8000__000000931B33AE68 000000067F0000400200008A5900007C4000-000000067F0000400200008A5900007C8000__000000931B9AFDF8 000000067F0000400200008A5900007C8000-000000067F0000400200008A5900007CC000__0000001C760FA190 000000067F0000400200008A5900007C8000-000000067F0000400200008A5900007CC000__00000038E67ABFA0 000000067F0000400200008A5900007C8000-000000067F0000400200008A5900007CC000__0000003903F1CFE8 000000067F0000400200008A5900007C8000-000000067F0000400200008A5900007CC000__0000003B99F7F8A0 000000067F0000400200008A5900007C8000-000000067F0000400200008A5900007CC000__0000005D2FFFFB38 000000067F0000400200008A5900007C8000-000000067F0000400200008A5900007CC000__00000073AD3FE6B8 000000067F0000400200008A5900007C8000-000000067F0000400200008A5900007CC000__000000914E3F38F0 000000067F0000400200008A5900007C8000-000000067F0000400200008A5900007CC000__000000931B33AE68 000000067F0000400200008A5900007C8000-000000067F0000400200008A5900007CC000__000000931B9AFDF8 000000067F0000400200008A5900007C9DC3-000000067F0000400200008A5900007D2796__0000000E2A359AC1-0000000ED9DDF211 
000000067F0000400200008A5900007CC000-000000067F0000400200008A5900007D0000__0000001C760FA190 000000067F0000400200008A5900007CC000-000000067F0000400200008A5900007D0000__00000038E67ABFA0 000000067F0000400200008A5900007CC000-000000067F0000400200008A5900007D0000__0000003903F1CFE8 000000067F0000400200008A5900007CC000-000000067F0000400200008A5900007D0000__0000003B99F7F8A0 000000067F0000400200008A5900007CC000-000000067F0000400200008A5900007D0000__0000005D2FFFFB38 000000067F0000400200008A5900007CC000-000000067F0000400200008A5900007D0000__00000073AD3FE6B8 000000067F0000400200008A5900007CC000-000000067F0000400200008A5900007D0000__000000914E3F38F0 000000067F0000400200008A5900007CC000-000000067F0000400200008A5900007D0000__000000931B33AE68 000000067F0000400200008A5900007CC000-000000067F0000400200008A5900007D0000__000000931B9AFDF8 000000067F0000400200008A5900007D0000-000000067F0000400200008A5900007D4000__0000001C760FA190 000000067F0000400200008A5900007D0000-000000067F0000400200008A5900007D4000__00000038E67ABFA0 000000067F0000400200008A5900007D0000-000000067F0000400200008A5900007D4000__0000003903F1CFE8 000000067F0000400200008A5900007D0000-000000067F0000400200008A5900007D4000__0000003B99F7F8A0 000000067F0000400200008A5900007D0000-000000067F0000400200008A5900007D4000__0000005D2FFFFB38 000000067F0000400200008A5900007D0000-000000067F0000400200008A5900007D4000__00000073AD3FE6B8 000000067F0000400200008A5900007D0000-000000067F0000400200008A5900007D4000__000000914E3F38F0 000000067F0000400200008A5900007D0000-000000067F0000400200008A5900007D4000__000000931B33AE68 000000067F0000400200008A5900007D0000-000000067F0000400200008A5900007D4000__000000931B9AFDF8 000000067F0000400200008A5900007D2796-000000067F0000400200008A5900007DB171__0000000E2A359AC1-0000000ED9DDF211 000000067F0000400200008A5900007D4000-000000067F0000400200008A5900007D8000__0000001C760FA190 000000067F0000400200008A5900007D4000-000000067F0000400200008A5900007D8000__00000038E67ABFA0 
000000067F0000400200008A5900007D4000-000000067F0000400200008A5900007D8000__0000003903F1CFE8 000000067F0000400200008A5900007D4000-000000067F0000400200008A5900007D8000__0000003B99F7F8A0 000000067F0000400200008A5900007D4000-000000067F0000400200008A5900007D8000__0000005D2FFFFB38 000000067F0000400200008A5900007D4000-000000067F0000400200008A5900007D8000__00000073AD3FE6B8 000000067F0000400200008A5900007D4000-000000067F0000400200008A5900007D8000__000000914E3F38F0 000000067F0000400200008A5900007D4000-000000067F0000400200008A5900007D8000__000000931B33AE68 000000067F0000400200008A5900007D4000-000000067F0000400200008A5900007D8000__000000931B9AFDF8 000000067F0000400200008A5900007D8000-000000067F0000400200008A5900007DC000__0000001C760FA190 000000067F0000400200008A5900007D8000-000000067F0000400200008A5900007DC000__00000038E67ABFA0 000000067F0000400200008A5900007D8000-000000067F0000400200008A5900007DC000__0000003903F1CFE8 000000067F0000400200008A5900007D8000-000000067F0000400200008A5900007DC000__0000003B99F7F8A0 000000067F0000400200008A5900007D8000-000000067F0000400200008A5900007DC000__0000005D2FFFFB38 000000067F0000400200008A5900007D8000-000000067F0000400200008A5900007DC000__00000073AD3FE6B8 000000067F0000400200008A5900007D8000-000000067F0000400200008A5900007DC000__000000914E3F38F0 000000067F0000400200008A5900007D8000-000000067F0000400200008A5900007DC000__000000931B33AE68 000000067F0000400200008A5900007D8000-000000067F0000400200008A5900007DC000__000000931B9AFDF8 000000067F0000400200008A5900007DB171-000000067F0000400200008A5900007E3B60__0000000E2A359AC1-0000000ED9DDF211 000000067F0000400200008A5900007DC000-000000067F0000400200008A5900007E0000__0000001C760FA190 000000067F0000400200008A5900007DC000-000000067F0000400200008A5900007E0000__00000038E67ABFA0 000000067F0000400200008A5900007DC000-000000067F0000400200008A5900007E0000__0000003903F1CFE8 000000067F0000400200008A5900007DC000-000000067F0000400200008A5900007E0000__0000003B99F7F8A0 
000000067F0000400200008A5900007DC000-000000067F0000400200008A5900007E0000__0000005D2FFFFB38 000000067F0000400200008A5900007DC000-000000067F0000400200008A5900007E0000__00000073AD3FE6B8 000000067F0000400200008A5900007DC000-000000067F0000400200008A5900007E0000__000000914E3F38F0 000000067F0000400200008A5900007DC000-000000067F0000400200008A5900007E0000__000000931B33AE68 000000067F0000400200008A5900007DC000-000000067F0000400200008A5900007E0000__000000931B9AFDF8 000000067F0000400200008A5900007E0000-000000067F0000400200008A5900007E4000__0000001C760FA190 000000067F0000400200008A5900007E0000-000000067F0000400200008A5900007E4000__00000038E67ABFA0 000000067F0000400200008A5900007E0000-000000067F0000400200008A5900007E4000__0000003903F1CFE8 000000067F0000400200008A5900007E0000-000000067F0000400200008A5900007E4000__0000003B99F7F8A0 000000067F0000400200008A5900007E0000-000000067F0000400200008A5900007E4000__0000005D2FFFFB38 000000067F0000400200008A5900007E0000-000000067F0000400200008A5900007E4000__00000073AD3FE6B8 000000067F0000400200008A5900007E0000-000000067F0000400200008A5900007E4000__000000914E3F38F0 000000067F0000400200008A5900007E0000-000000067F0000400200008A5900007E4000__000000931B33AE68 000000067F0000400200008A5900007E0000-000000067F0000400200008A5900007E4000__000000931B9AFDF8 000000067F0000400200008A5900007E3B60-000000067F0000400200008A5900007EC53A__0000000E2A359AC1-0000000ED9DDF211 000000067F0000400200008A5900007E4000-000000067F0000400200008A5900007E8000__0000001C760FA190 000000067F0000400200008A5900007E4000-000000067F0000400200008A5900007E8000__00000038E67ABFA0 000000067F0000400200008A5900007E4000-000000067F0000400200008A5900007E8000__0000003903F1CFE8 000000067F0000400200008A5900007E4000-000000067F0000400200008A5900007E8000__0000003B99F7F8A0 000000067F0000400200008A5900007E4000-000000067F0000400200008A5900007E8000__0000005D2FFFFB38 000000067F0000400200008A5900007E4000-000000067F0000400200008A5900007E8000__00000073AD3FE6B8 
000000067F0000400200008A5900007E4000-000000067F0000400200008A5900007E8000__000000914E3F38F0 000000067F0000400200008A5900007E4000-000000067F0000400200008A5900007E8000__000000931B33AE68 000000067F0000400200008A5900007E4000-000000067F0000400200008A5900007E8000__000000931B9AFDF8 000000067F0000400200008A5900007E8000-000000067F0000400200008A5900007EC000__0000001C760FA190 000000067F0000400200008A5900007E8000-000000067F0000400200008A5900007EC000__00000038E67ABFA0 000000067F0000400200008A5900007E8000-000000067F0000400200008A5900007EC000__0000003903F1CFE8 000000067F0000400200008A5900007E8000-000000067F0000400200008A5900007EC000__0000003B99F7F8A0 000000067F0000400200008A5900007E8000-000000067F0000400200008A5900007EC000__0000005D2FFFFB38 000000067F0000400200008A5900007E8000-000000067F0000400200008A5900007EC000__00000073AD3FE6B8 000000067F0000400200008A5900007E8000-000000067F0000400200008A5900007EC000__000000914E3F38F0 000000067F0000400200008A5900007E8000-000000067F0000400200008A5900007EC000__000000931B33AE68 000000067F0000400200008A5900007E8000-000000067F0000400200008A5900007EC000__000000931B9AFDF8 000000067F0000400200008A5900007EC000-000000067F0000400200008A5900007F0000__0000001C760FA190 000000067F0000400200008A5900007EC000-000000067F0000400200008A5900007F0000__00000038E67ABFA0 000000067F0000400200008A5900007EC000-000000067F0000400200008A5900007F0000__0000003903F1CFE8 000000067F0000400200008A5900007EC000-000000067F0000400200008A5900007F0000__0000003B99F7F8A0 000000067F0000400200008A5900007EC000-000000067F0000400200008A5900007F0000__0000005D2FFFFB38 000000067F0000400200008A5900007EC000-000000067F0000400200008A5900007F0000__00000073AD3FE6B8 000000067F0000400200008A5900007EC000-000000067F0000400200008A5900007F0000__000000914E3F38F0 000000067F0000400200008A5900007EC000-000000067F0000400200008A5900007F0000__000000931B33AE68 000000067F0000400200008A5900007EC000-000000067F0000400200008A5900007F0000__000000931B9AFDF8 
000000067F0000400200008A5900007EC53A-000000067F0000400200008A5900007F4F2A__0000000E2A359AC1-0000000ED9DDF211 000000067F0000400200008A5900007F0000-000000067F0000400200008A5900007F4000__0000001C760FA190 000000067F0000400200008A5900007F0000-000000067F0000400200008A5900007F4000__00000038E67ABFA0 000000067F0000400200008A5900007F0000-000000067F0000400200008A5900007F4000__0000003903F1CFE8 000000067F0000400200008A5900007F0000-000000067F0000400200008A5900007F4000__0000003B99F7F8A0 000000067F0000400200008A5900007F0000-000000067F0000400200008A5900007F4000__0000005D2FFFFB38 000000067F0000400200008A5900007F0000-000000067F0000400200008A5900007F4000__00000073AD3FE6B8 000000067F0000400200008A5900007F0000-000000067F0000400200008A5900007F4000__000000914E3F38F0 000000067F0000400200008A5900007F0000-000000067F0000400200008A5900007F4000__000000931B33AE68 000000067F0000400200008A5900007F0000-000000067F0000400200008A5900007F4000__000000931B9AFDF8 000000067F0000400200008A5900007F4000-000000067F0000400200008A5900007F8000__0000001C760FA190 000000067F0000400200008A5900007F4000-000000067F0000400200008A5900007F8000__00000038E67ABFA0 000000067F0000400200008A5900007F4000-000000067F0000400200008A5900007F8000__0000003903F1CFE8 000000067F0000400200008A5900007F4000-000000067F0000400200008A5900007F8000__0000003B99F7F8A0 000000067F0000400200008A5900007F4000-000000067F0000400200008A5900007F8000__0000005D2FFFFB38 000000067F0000400200008A5900007F4000-000000067F0000400200008A5900007F8000__00000073AD3FE6B8 000000067F0000400200008A5900007F4000-000000067F0000400200008A5900007F8000__000000914E3F38F0 000000067F0000400200008A5900007F4000-000000067F0000400200008A5900007F8000__000000931B33AE68 000000067F0000400200008A5900007F4000-000000067F0000400200008A5900007F8000__000000931B9AFDF8 000000067F0000400200008A5900007F4F2A-000000067F0000400200008A5900007FD903__0000000E2A359AC1-0000000ED9DDF211 000000067F0000400200008A5900007F8000-000000067F0000400200008A5900007FC000__0000001C760FA190 
000000067F0000400200008A5900007F8000-000000067F0000400200008A5900007FC000__00000038E67ABFA0 000000067F0000400200008A5900007F8000-000000067F0000400200008A5900007FC000__0000003903F1CFE8 000000067F0000400200008A5900007F8000-000000067F0000400200008A5900007FC000__0000003B99F7F8A0 000000067F0000400200008A5900007F8000-000000067F0000400200008A5900007FC000__0000005D2FFFFB38 000000067F0000400200008A5900007F8000-000000067F0000400200008A5900007FC000__00000073AD3FE6B8 000000067F0000400200008A5900007F8000-000000067F0000400200008A5900007FC000__000000914E3F38F0 000000067F0000400200008A5900007F8000-000000067F0000400200008A5900007FC000__000000931B33AE68 000000067F0000400200008A5900007F8000-000000067F0000400200008A5900007FC000__000000931B9AFDF8 000000067F0000400200008A5900007FC000-000000067F0000400200008A590000800000__0000001C760FA190 000000067F0000400200008A5900007FC000-000000067F0000400200008A590000800000__00000038E67ABFA0 000000067F0000400200008A5900007FC000-000000067F0000400200008A590000800000__0000003903F1CFE8 000000067F0000400200008A5900007FC000-000000067F0000400200008A590000800000__0000003B99F7F8A0 000000067F0000400200008A5900007FC000-000000067F0000400200008A590000800000__0000005D2FFFFB38 000000067F0000400200008A5900007FC000-000000067F0000400200008A590000800000__00000073AD3FE6B8 000000067F0000400200008A5900007FC000-000000067F0000400200008A590000800000__000000914E3F38F0 000000067F0000400200008A5900007FC000-000000067F0000400200008A590000800000__000000931B33AE68 000000067F0000400200008A5900007FC000-000000067F0000400200008A590000800000__000000931B9AFDF8 000000067F0000400200008A5900007FD903-000000067F0000400200008A5900008062D2__0000000E2A359AC1-0000000ED9DDF211 000000067F0000400200008A590000800000-000000067F0000400200008A590000804000__0000001C760FA190 000000067F0000400200008A590000800000-000000067F0000400200008A590000804000__00000038E67ABFA0 000000067F0000400200008A590000800000-000000067F0000400200008A590000804000__0000003903F1CFE8 
000000067F0000400200008A590000800000-000000067F0000400200008A590000804000__0000003B99F7F8A0 000000067F0000400200008A590000800000-000000067F0000400200008A590000804000__0000005D2FFFFB38 000000067F0000400200008A590000800000-000000067F0000400200008A590000804000__00000073AD3FE6B8 000000067F0000400200008A590000800000-000000067F0000400200008A590000804000__000000914E3F38F0 000000067F0000400200008A590000800000-000000067F0000400200008A590000804000__000000931B33AE68 000000067F0000400200008A590000800000-000000067F0000400200008A590000804000__000000931B9AFDF8 000000067F0000400200008A590000804000-000000067F0000400200008A590000808000__0000001C725A2400 000000067F0000400200008A590000804000-000000067F0000400200008A590000808000__0000001C760FA190 000000067F0000400200008A590000804000-000000067F0000400200008A590000808000__00000038E67ABFA0 000000067F0000400200008A590000804000-000000067F0000400200008A590000808000__0000003903F1CFE8 000000067F0000400200008A590000804000-000000067F0000400200008A590000808000__0000003B99F7F8A0 000000067F0000400200008A590000804000-000000067F0000400200008A590000808000__0000005D2FFFFB38 000000067F0000400200008A590000804000-000000067F0000400200008A590000808000__00000073AD3FE6B8 000000067F0000400200008A590000804000-000000067F0000400200008A590000808000__000000914E3F38F0 000000067F0000400200008A590000804000-000000067F0000400200008A590000808000__000000931B33AE68 000000067F0000400200008A590000804000-000000067F0000400200008A590000808000__000000931B9AFDF8 000000067F0000400200008A5900008062D2-000000067F0000400200008A590100000000__0000000E2A359AC1-0000000ED9DDF211 000000067F0000400200008A5900008065DC-000000067F0000400200008A59000080EFB7__0000000ED9DDF211-0000000F8985D279 000000067F0000400200008A590000808000-000000067F0000400200008A59000080C000__0000001C725A2400 000000067F0000400200008A590000808000-000000067F0000400200008A59000080C000__0000001C760FA190 000000067F0000400200008A590000808000-000000067F0000400200008A59000080C000__00000038E67ABFA0 
000000067F0000400200008A590000808000-000000067F0000400200008A59000080C000__0000003903F1CFE8 000000067F0000400200008A590000808000-000000067F0000400200008A59000080C000__0000003B99F7F8A0 000000067F0000400200008A590000808000-000000067F0000400200008A59000080C000__0000005D2FFFFB38 000000067F0000400200008A590000808000-000000067F0000400200008A59000080C000__00000073AD3FE6B8 000000067F0000400200008A590000808000-000000067F0000400200008A59000080C000__000000914E3F38F0 000000067F0000400200008A590000808000-000000067F0000400200008A59000080C000__000000931B33AE68 000000067F0000400200008A590000808000-000000067F0000400200008A59000080C000__000000931B9AFDF8 000000067F0000400200008A59000080C000-000000067F0000400200008A590000810000__0000001C725A2400 000000067F0000400200008A59000080C000-000000067F0000400200008A590000810000__0000001C760FA190 000000067F0000400200008A59000080C000-000000067F0000400200008A590000810000__00000038E67ABFA0 000000067F0000400200008A59000080C000-000000067F0000400200008A590000810000__0000003903F1CFE8 000000067F0000400200008A59000080C000-000000067F0000400200008A590000810000__0000003B99F7F8A0 000000067F0000400200008A59000080C000-000000067F0000400200008A590000810000__0000005D2FFFFB38 000000067F0000400200008A59000080C000-000000067F0000400200008A590000810000__00000073AD3FE6B8 000000067F0000400200008A59000080C000-000000067F0000400200008A590000810000__000000914E3F38F0 000000067F0000400200008A59000080C000-000000067F0000400200008A590000810000__000000931B33AE68 000000067F0000400200008A59000080C000-000000067F0000400200008A590000810000__000000931B9AFDF8 000000067F0000400200008A59000080EFB7-000000067F0000400200008A590000817999__0000000ED9DDF211-0000000F8985D279 000000067F0000400200008A590000810000-000000067F0000400200008A590000814000__0000001C725A2400 000000067F0000400200008A590000810000-000000067F0000400200008A590000814000__0000001C760FA190 000000067F0000400200008A590000810000-000000067F0000400200008A590000814000__00000038E67ABFA0 
000000067F0000400200008A590000810000-000000067F0000400200008A590000814000__0000003903F1CFE8 000000067F0000400200008A590000810000-000000067F0000400200008A590000814000__0000003B99F7F8A0 000000067F0000400200008A590000810000-000000067F0000400200008A590000814000__0000005D2FFFFB38 000000067F0000400200008A590000810000-000000067F0000400200008A590000814000__00000073AD3FE6B8 000000067F0000400200008A590000810000-000000067F0000400200008A590000814000__000000914E3F38F0 000000067F0000400200008A590000810000-000000067F0000400200008A590000814000__000000931B33AE68 000000067F0000400200008A590000810000-000000067F0000400200008A590000814000__000000931B9AFDF8 000000067F0000400200008A590000814000-000000067F0000400200008A590000818000__0000001C725A2400 000000067F0000400200008A590000814000-000000067F0000400200008A590000818000__0000001C760FA190 000000067F0000400200008A590000814000-000000067F0000400200008A590000818000__00000038E67ABFA0 000000067F0000400200008A590000814000-000000067F0000400200008A590000818000__0000003903F1CFE8 000000067F0000400200008A590000814000-000000067F0000400200008A590000818000__0000003B99F7F8A0 000000067F0000400200008A590000814000-000000067F0000400200008A590000818000__0000005D2FFFFB38 000000067F0000400200008A590000814000-000000067F0000400200008A590000818000__00000073AD3FE6B8 000000067F0000400200008A590000814000-000000067F0000400200008A590000818000__000000914E3F38F0 000000067F0000400200008A590000814000-000000067F0000400200008A590000818000__000000931B33AE68 000000067F0000400200008A590000814000-000000067F0000400200008A590000818000__000000931B9AFDF8 000000067F0000400200008A590000817999-000000067F0000400200008A59000082037D__0000000ED9DDF211-0000000F8985D279 000000067F0000400200008A590000818000-000000067F0000400200008A59000081C000__0000001C725A2400 000000067F0000400200008A590000818000-000000067F0000400200008A59000081C000__0000001C760FA190 000000067F0000400200008A590000818000-000000067F0000400200008A59000081C000__00000038E67ABFA0 
000000067F0000400200008A590000818000-000000067F0000400200008A59000081C000__0000003903F1CFE8 000000067F0000400200008A590000818000-000000067F0000400200008A59000081C000__0000003B99F7F8A0 000000067F0000400200008A590000818000-000000067F0000400200008A59000081C000__0000005D2FFFFB38 000000067F0000400200008A590000818000-000000067F0000400200008A59000081C000__00000073AD3FE6B8 000000067F0000400200008A590000818000-000000067F0000400200008A59000081C000__000000914E3F38F0 000000067F0000400200008A590000818000-000000067F0000400200008A59000081C000__000000931B33AE68 000000067F0000400200008A590000818000-000000067F0000400200008A59000081C000__000000931B9AFDF8 000000067F0000400200008A59000081C000-000000067F0000400200008A590000820000__0000001C725A2400 000000067F0000400200008A59000081C000-000000067F0000400200008A590000820000__0000001C760FA190 000000067F0000400200008A59000081C000-000000067F0000400200008A590000820000__00000038E67ABFA0 000000067F0000400200008A59000081C000-000000067F0000400200008A590000820000__0000003903F1CFE8 000000067F0000400200008A59000081C000-000000067F0000400200008A590000820000__0000003B99F7F8A0 000000067F0000400200008A59000081C000-000000067F0000400200008A590000820000__0000005D2FFFFB38 000000067F0000400200008A59000081C000-000000067F0000400200008A590000820000__00000073AD3FE6B8 000000067F0000400200008A59000081C000-000000067F0000400200008A590000820000__000000914E3F38F0 000000067F0000400200008A59000081C000-000000067F0000400200008A590000820000__000000931B33AE68 000000067F0000400200008A59000081C000-000000067F0000400200008A590000820000__000000931B9AFDF8 000000067F0000400200008A590000820000-000000067F0000400200008A590000824000__0000001C725A2400 000000067F0000400200008A590000820000-000000067F0000400200008A590000824000__0000001C760FA190 000000067F0000400200008A590000820000-000000067F0000400200008A590000824000__00000038E67ABFA0 000000067F0000400200008A590000820000-000000067F0000400200008A590000824000__0000003903F1CFE8 
000000067F0000400200008A590000820000-000000067F0000400200008A590000824000__0000003B99F7F8A0 000000067F0000400200008A590000820000-000000067F0000400200008A590000824000__0000005D2FFFFB38 000000067F0000400200008A590000820000-000000067F0000400200008A590000824000__00000073AD3FE6B8 000000067F0000400200008A590000820000-000000067F0000400200008A590000824000__000000914E3F38F0 000000067F0000400200008A590000820000-000000067F0000400200008A590000824000__000000931B33AE68 000000067F0000400200008A590000820000-000000067F0000400200008A590000824000__000000931B9AFDF8 000000067F0000400200008A59000082037D-000000067F0000400200008A590000828D52__0000000ED9DDF211-0000000F8985D279 000000067F0000400200008A590000824000-000000067F0000400200008A590000828000__0000001C725A2400 000000067F0000400200008A590000824000-000000067F0000400200008A590000828000__0000001C760FA190 000000067F0000400200008A590000824000-000000067F0000400200008A590000828000__00000038E67ABFA0 000000067F0000400200008A590000824000-000000067F0000400200008A590000828000__0000003903F1CFE8 000000067F0000400200008A590000824000-000000067F0000400200008A590000828000__0000003B99F7F8A0 000000067F0000400200008A590000824000-000000067F0000400200008A590000828000__0000005D2FFFFB38 000000067F0000400200008A590000824000-000000067F0000400200008A590000828000__00000073AD3FE6B8 000000067F0000400200008A590000824000-000000067F0000400200008A590000828000__000000914E3F38F0 000000067F0000400200008A590000824000-000000067F0000400200008A590000828000__000000931B33AE68 000000067F0000400200008A590000824000-000000067F0000400200008A590000828000__000000931B9AFDF8 000000067F0000400200008A590000828000-000000067F0000400200008A59000082C000__0000001C725A2400 000000067F0000400200008A590000828000-000000067F0000400200008A59000082C000__0000001C760FA190 000000067F0000400200008A590000828000-000000067F0000400200008A59000082C000__00000038E67ABFA0 000000067F0000400200008A590000828000-000000067F0000400200008A59000082C000__0000003903F1CFE8 
000000067F0000400200008A590000828000-000000067F0000400200008A59000082C000__0000003B99F7F8A0 000000067F0000400200008A590000828000-000000067F0000400200008A59000082C000__0000005D2FFFFB38 000000067F0000400200008A590000828000-000000067F0000400200008A59000082C000__00000073AD3FE6B8 000000067F0000400200008A590000828000-000000067F0000400200008A59000082C000__000000914E3F38F0 000000067F0000400200008A590000828000-000000067F0000400200008A59000082C000__000000931B33AE68 000000067F0000400200008A590000828000-000000067F0000400200008A59000082C000__000000931B9AFDF8 000000067F0000400200008A590000828D52-000000067F0000400200008A590000831734__0000000ED9DDF211-0000000F8985D279 000000067F0000400200008A59000082C000-000000067F0000400200008A590000830000__0000001C725A2400 000000067F0000400200008A59000082C000-000000067F0000400200008A590000830000__0000001C760FA190 000000067F0000400200008A59000082C000-000000067F0000400200008A590000830000__00000038E67ABFA0 000000067F0000400200008A59000082C000-000000067F0000400200008A590000830000__0000003903F1CFE8 000000067F0000400200008A59000082C000-000000067F0000400200008A590000830000__0000003B99F7F8A0 000000067F0000400200008A59000082C000-000000067F0000400200008A590000830000__0000005D2FFFFB38 000000067F0000400200008A59000082C000-000000067F0000400200008A590000830000__00000073AD3FE6B8 000000067F0000400200008A59000082C000-000000067F0000400200008A590000830000__000000914E3F38F0 000000067F0000400200008A59000082C000-000000067F0000400200008A590000830000__000000931B33AE68 000000067F0000400200008A59000082C000-000000067F0000400200008A590000830000__000000931B9AFDF8 000000067F0000400200008A590000830000-000000067F0000400200008A590000834000__0000001C725A2400 000000067F0000400200008A590000830000-000000067F0000400200008A590000834000__0000001C760FA190 000000067F0000400200008A590000830000-000000067F0000400200008A590000834000__00000038E67ABFA0 000000067F0000400200008A590000830000-000000067F0000400200008A590000834000__0000003903F1CFE8 
000000067F0000400200008A590000830000-000000067F0000400200008A590000834000__0000003B99F7F8A0 000000067F0000400200008A590000830000-000000067F0000400200008A590000834000__0000005D2FFFFB38 000000067F0000400200008A590000830000-000000067F0000400200008A590000834000__00000073AD3FE6B8 000000067F0000400200008A590000830000-000000067F0000400200008A590000834000__000000914E3F38F0 000000067F0000400200008A590000830000-000000067F0000400200008A590000834000__000000931B33AE68 000000067F0000400200008A590000830000-000000067F0000400200008A590000834000__000000931B9AFDF8 000000067F0000400200008A590000831734-000000067F0000400200008A59000083A114__0000000ED9DDF211-0000000F8985D279 000000067F0000400200008A590000834000-000000067F0000400200008A590000838000__0000001C725A2400 000000067F0000400200008A590000834000-000000067F0000400200008A590000838000__0000001C760FA190 000000067F0000400200008A590000834000-000000067F0000400200008A590000838000__00000038E67ABFA0 000000067F0000400200008A590000834000-000000067F0000400200008A590000838000__0000003903F1CFE8 000000067F0000400200008A590000834000-000000067F0000400200008A590000838000__0000003B99F7F8A0 000000067F0000400200008A590000834000-000000067F0000400200008A590000838000__0000005D2FFFFB38 000000067F0000400200008A590000834000-000000067F0000400200008A590000838000__00000073AD3FE6B8 000000067F0000400200008A590000834000-000000067F0000400200008A590000838000__000000914E3F38F0 000000067F0000400200008A590000834000-000000067F0000400200008A590000838000__000000931B33AE68 000000067F0000400200008A590000834000-000000067F0000400200008A590000838000__000000931B9AFDF8 000000067F0000400200008A590000838000-000000067F0000400200008A59000083C000__0000001C725A2400 000000067F0000400200008A590000838000-000000067F0000400200008A59000083C000__0000001C760FA190 000000067F0000400200008A590000838000-000000067F0000400200008A59000083C000__00000038E67ABFA0 000000067F0000400200008A590000838000-000000067F0000400200008A59000083C000__0000003903F1CFE8 
000000067F0000400200008A590000838000-000000067F0000400200008A59000083C000__0000003B99F7F8A0 000000067F0000400200008A590000838000-000000067F0000400200008A59000083C000__0000005D2FFFFB38 000000067F0000400200008A590000838000-000000067F0000400200008A59000083C000__00000073AD3FE6B8 000000067F0000400200008A590000838000-000000067F0000400200008A59000083C000__000000914E3F38F0 000000067F0000400200008A590000838000-000000067F0000400200008A59000083C000__000000931B33AE68 000000067F0000400200008A590000838000-000000067F0000400200008A59000083C000__000000931B9AFDF8 000000067F0000400200008A59000083A114-000000067F0000400200008A590000842AE0__0000000ED9DDF211-0000000F8985D279 000000067F0000400200008A59000083C000-000000067F0000400200008A590000840000__0000001C725A2400 000000067F0000400200008A59000083C000-000000067F0000400200008A590000840000__0000001C760FA190 000000067F0000400200008A59000083C000-000000067F0000400200008A590000840000__00000038E67ABFA0 000000067F0000400200008A59000083C000-000000067F0000400200008A590000840000__0000003903F1CFE8 000000067F0000400200008A59000083C000-000000067F0000400200008A590000840000__0000003B99F7F8A0 000000067F0000400200008A59000083C000-000000067F0000400200008A590000840000__0000005D2FFFFB38 000000067F0000400200008A59000083C000-000000067F0000400200008A590000840000__00000073AD3FE6B8 000000067F0000400200008A59000083C000-000000067F0000400200008A590000840000__000000914E3F38F0 000000067F0000400200008A59000083C000-000000067F0000400200008A590000840000__000000931B33AE68 000000067F0000400200008A59000083C000-000000067F0000400200008A590000840000__000000931B9AFDF8 000000067F0000400200008A590000840000-000000067F0000400200008A590000844000__0000001C725A2400 000000067F0000400200008A590000840000-000000067F0000400200008A590000844000__0000001C760FA190 000000067F0000400200008A590000840000-000000067F0000400200008A590000844000__00000038E67ABFA0 000000067F0000400200008A590000840000-000000067F0000400200008A590000844000__0000003903F1CFE8 
000000067F0000400200008A590000840000-000000067F0000400200008A590000844000__0000003B99F7F8A0 000000067F0000400200008A590000840000-000000067F0000400200008A590000844000__0000005D2FFFFB38 000000067F0000400200008A590000840000-000000067F0000400200008A590000844000__00000073AD3FE6B8 000000067F0000400200008A590000840000-000000067F0000400200008A590000844000__000000914E3F38F0 000000067F0000400200008A590000840000-000000067F0000400200008A590000844000__000000931B33AE68 000000067F0000400200008A590000840000-000000067F0000400200008A590000844000__000000931B9AFDF8 000000067F0000400200008A590000842AE0-000000067F0000400200008A59000084B4C5__0000000ED9DDF211-0000000F8985D279 000000067F0000400200008A590000844000-000000067F0000400200008A590000848000__0000001C725A2400 000000067F0000400200008A590000844000-000000067F0000400200008A590000848000__0000001C760FA190 000000067F0000400200008A590000844000-000000067F0000400200008A590000848000__00000038E67ABFA0 000000067F0000400200008A590000844000-000000067F0000400200008A590000848000__0000003903F1CFE8 000000067F0000400200008A590000844000-000000067F0000400200008A590000848000__0000003B99F7F8A0 000000067F0000400200008A590000844000-000000067F0000400200008A590000848000__0000005D2FFFFB38 000000067F0000400200008A590000844000-000000067F0000400200008A590000848000__00000073AD3FE6B8 000000067F0000400200008A590000844000-000000067F0000400200008A590000848000__000000914E3F38F0 000000067F0000400200008A590000844000-000000067F0000400200008A590000848000__000000931B33AE68 000000067F0000400200008A590000844000-000000067F0000400200008A590000848000__000000931B9AFDF8 000000067F0000400200008A590000848000-000000067F0000400200008A59000084C000__0000001C725A2400 000000067F0000400200008A590000848000-000000067F0000400200008A59000084C000__0000001C760FA190 000000067F0000400200008A590000848000-000000067F0000400200008A59000084C000__00000038E67ABFA0 000000067F0000400200008A590000848000-000000067F0000400200008A59000084C000__0000003903F1CFE8 
000000067F0000400200008A590000848000-000000067F0000400200008A59000084C000__0000003B99F7F8A0 000000067F0000400200008A590000848000-000000067F0000400200008A59000084C000__0000005D2FFFFB38 000000067F0000400200008A590000848000-000000067F0000400200008A59000084C000__00000073AD3FE6B8 000000067F0000400200008A590000848000-000000067F0000400200008A59000084C000__000000914E3F38F0 000000067F0000400200008A590000848000-000000067F0000400200008A59000084C000__000000931B33AE68 000000067F0000400200008A590000848000-000000067F0000400200008A59000084C000__000000931B9AFDF8 000000067F0000400200008A59000084B4C5-000000067F0000400200008A590000853EA6__0000000ED9DDF211-0000000F8985D279 000000067F0000400200008A59000084C000-000000067F0000400200008A590000850000__0000001C725A2400 000000067F0000400200008A59000084C000-000000067F0000400200008A590000850000__0000001C760FA190 000000067F0000400200008A59000084C000-000000067F0000400200008A590000850000__00000038E67ABFA0 000000067F0000400200008A59000084C000-000000067F0000400200008A590000850000__0000003903F1CFE8 000000067F0000400200008A59000084C000-000000067F0000400200008A590000850000__0000003B99F7F8A0 000000067F0000400200008A59000084C000-000000067F0000400200008A590000850000__0000005D2FFFFB38 000000067F0000400200008A59000084C000-000000067F0000400200008A590000850000__00000073AD3FE6B8 000000067F0000400200008A59000084C000-000000067F0000400200008A590000850000__000000914E3F38F0 000000067F0000400200008A59000084C000-000000067F0000400200008A590000850000__000000931B33AE68 000000067F0000400200008A59000084C000-000000067F0000400200008A590000850000__000000931B9AFDF8 000000067F0000400200008A590000850000-000000067F0000400200008A590000854000__0000001C725A2400 000000067F0000400200008A590000850000-000000067F0000400200008A590000854000__0000001C760FA190 000000067F0000400200008A590000850000-000000067F0000400200008A590000854000__00000038E67ABFA0 000000067F0000400200008A590000850000-000000067F0000400200008A590000854000__0000003903F1CFE8 
000000067F0000400200008A590000850000-000000067F0000400200008A590000854000__0000003B99F7F8A0 000000067F0000400200008A590000850000-000000067F0000400200008A590000854000__0000005D2FFFFB38 000000067F0000400200008A590000850000-000000067F0000400200008A590000854000__00000073AD3FE6B8 000000067F0000400200008A590000850000-000000067F0000400200008A590000854000__000000914E3F38F0 000000067F0000400200008A590000850000-000000067F0000400200008A590000854000__000000931B33AE68 000000067F0000400200008A590000850000-000000067F0000400200008A590000854000__000000931B9AFDF8 000000067F0000400200008A590000853EA6-000000067F0000400200008A59000085C894__0000000ED9DDF211-0000000F8985D279 000000067F0000400200008A590000854000-000000067F0000400200008A590000858000__0000001C725A2400 000000067F0000400200008A590000854000-000000067F0000400200008A590000858000__0000001C760FA190 000000067F0000400200008A590000854000-000000067F0000400200008A590000858000__00000038E67ABFA0 000000067F0000400200008A590000854000-000000067F0000400200008A590000858000__0000003903F1CFE8 000000067F0000400200008A590000854000-000000067F0000400200008A590000858000__0000003B99F7F8A0 000000067F0000400200008A590000854000-000000067F0000400200008A590000858000__0000005D2FFFFB38 000000067F0000400200008A590000854000-000000067F0000400200008A590000858000__00000073AD3FE6B8 000000067F0000400200008A590000854000-000000067F0000400200008A590000858000__000000914E3F38F0 000000067F0000400200008A590000854000-000000067F0000400200008A590000858000__000000931B33AE68 000000067F0000400200008A590000854000-000000067F0000400200008A590000858000__000000931B9AFDF8 000000067F0000400200008A590000858000-000000067F0000400200008A59000085C000__0000001C725A2400 000000067F0000400200008A590000858000-000000067F0000400200008A59000085C000__0000001C760FA190 000000067F0000400200008A590000858000-000000067F0000400200008A59000085C000__00000038E67ABFA0 000000067F0000400200008A590000858000-000000067F0000400200008A59000085C000__0000003903F1CFE8 
000000067F0000400200008A590000858000-000000067F0000400200008A59000085C000__0000003B99F7F8A0 000000067F0000400200008A590000858000-000000067F0000400200008A59000085C000__0000005D2FFFFB38 000000067F0000400200008A590000858000-000000067F0000400200008A59000085C000__00000073AD3FE6B8 000000067F0000400200008A590000858000-000000067F0000400200008A59000085C000__000000914E3F38F0 000000067F0000400200008A590000858000-000000067F0000400200008A59000085C000__000000931B33AE68 000000067F0000400200008A590000858000-000000067F0000400200008A59000085C000__000000931B9AFDF8 000000067F0000400200008A59000085C000-000000067F0000400200008A590000860000__0000001C725A2400 000000067F0000400200008A59000085C000-000000067F0000400200008A590000860000__0000001C760FA190 000000067F0000400200008A59000085C000-000000067F0000400200008A590000860000__00000038E67ABFA0 000000067F0000400200008A59000085C000-000000067F0000400200008A590000860000__0000003903F1CFE8 000000067F0000400200008A59000085C000-000000067F0000400200008A590000860000__0000003B99F7F8A0 000000067F0000400200008A59000085C000-000000067F0000400200008A590000860000__0000005D2FFFFB38 000000067F0000400200008A59000085C000-000000067F0000400200008A590000860000__00000073AD3FE6B8 000000067F0000400200008A59000085C000-000000067F0000400200008A590000860000__000000914E3F38F0 000000067F0000400200008A59000085C000-000000067F0000400200008A590000860000__000000931B33AE68 000000067F0000400200008A59000085C000-000000067F0000400200008A590000860000__000000931B9AFDF8 000000067F0000400200008A59000085C894-000000067F0000400200008A590000865277__0000000ED9DDF211-0000000F8985D279 000000067F0000400200008A590000860000-000000067F0000400200008A590000864000__0000001C725A2400 000000067F0000400200008A590000860000-000000067F0000400200008A590000864000__0000001C760FA190 000000067F0000400200008A590000860000-000000067F0000400200008A590000864000__00000038E67ABFA0 000000067F0000400200008A590000860000-000000067F0000400200008A590000864000__0000003903F1CFE8 
000000067F0000400200008A590000860000-000000067F0000400200008A590000864000__0000003B99F7F8A0 000000067F0000400200008A590000860000-000000067F0000400200008A590000864000__0000005D2FFFFB38 000000067F0000400200008A590000860000-000000067F0000400200008A590000864000__00000073AD3FE6B8 000000067F0000400200008A590000860000-000000067F0000400200008A590000864000__000000914E3F38F0 000000067F0000400200008A590000860000-000000067F0000400200008A590000864000__000000931B33AE68 000000067F0000400200008A590000860000-000000067F0000400200008A590000864000__000000931B9AFDF8 000000067F0000400200008A590000864000-000000067F0000400200008A590000868000__000000106915EC38 000000067F0000400200008A590000864000-000000067F0000400200008A590000868000__0000001C760FA190 000000067F0000400200008A590000864000-000000067F0000400200008A590000868000__00000038E67ABFA0 000000067F0000400200008A590000864000-000000067F0000400200008A590000868000__0000003903F1CFE8 000000067F0000400200008A590000864000-000000067F0000400200008A590000868000__0000003B99F7F8A0 000000067F0000400200008A590000864000-000000067F0000400200008A590000868000__0000005D2FFFFB38 000000067F0000400200008A590000864000-000000067F0000400200008A590000868000__00000073AD3FE6B8 000000067F0000400200008A590000864000-000000067F0000400200008A590000868000__000000914E3F38F0 000000067F0000400200008A590000864000-000000067F0000400200008A590000868000__000000931B33AE68 000000067F0000400200008A590000864000-000000067F0000400200008A590000868000__000000931B9AFDF8 000000067F0000400200008A590000865277-000000067F0000400200008A590100000000__0000000ED9DDF211-0000000F8985D279 000000067F0000400200008A59000086558B-000000067F0000400200008A59000086DF61__0000000F8985D279-00000010392DE3B9 000000067F0000400200008A590000868000-000000067F0000400200008A59000086C000__000000106915EC38 000000067F0000400200008A590000868000-000000067F0000400200008A59000086C000__0000001C760FA190 000000067F0000400200008A590000868000-000000067F0000400200008A59000086C000__00000038E67ABFA0 
000000067F0000400200008A590000868000-000000067F0000400200008A59000086C000__0000003903F1CFE8 000000067F0000400200008A590000868000-000000067F0000400200008A59000086C000__0000003B99F7F8A0 000000067F0000400200008A590000868000-000000067F0000400200008A59000086C000__0000005D2FFFFB38 000000067F0000400200008A590000868000-000000067F0000400200008A59000086C000__00000073AD3FE6B8 000000067F0000400200008A590000868000-000000067F0000400200008A59000086C000__000000914E3F38F0 000000067F0000400200008A590000868000-000000067F0000400200008A59000086C000__000000931B33AE68 000000067F0000400200008A590000868000-000000067F0000400200008A59000086C000__000000931B9AFDF8 000000067F0000400200008A59000086C000-000000067F0000400200008A590000870000__000000106915EC38 000000067F0000400200008A59000086C000-000000067F0000400200008A590000870000__0000001C760FA190 000000067F0000400200008A59000086C000-000000067F0000400200008A590000870000__00000038E67ABFA0 000000067F0000400200008A59000086C000-000000067F0000400200008A590000870000__0000003903F1CFE8 000000067F0000400200008A59000086C000-000000067F0000400200008A590000870000__0000003B99F7F8A0 000000067F0000400200008A59000086C000-000000067F0000400200008A590000870000__0000005D2FFFFB38 000000067F0000400200008A59000086C000-000000067F0000400200008A590000870000__00000073AD3FE6B8 000000067F0000400200008A59000086C000-000000067F0000400200008A590000870000__000000914E3F38F0 000000067F0000400200008A59000086C000-000000067F0000400200008A590000870000__000000931B33AE68 000000067F0000400200008A59000086C000-000000067F0000400200008A590000870000__000000931B9AFDF8 000000067F0000400200008A59000086DF61-000000067F0000400200008A59000087693A__0000000F8985D279-00000010392DE3B9 000000067F0000400200008A590000870000-000000067F0000400200008A590000874000__000000106915EC38 000000067F0000400200008A590000870000-000000067F0000400200008A590000874000__0000001C760FA190 000000067F0000400200008A590000870000-000000067F0000400200008A590000874000__00000038E67ABFA0 
000000067F0000400200008A590000870000-000000067F0000400200008A590000874000__0000003903F1CFE8 000000067F0000400200008A590000870000-000000067F0000400200008A590000874000__0000003B99F7F8A0 000000067F0000400200008A590000870000-000000067F0000400200008A590000874000__0000005D2FFFFB38 000000067F0000400200008A590000870000-000000067F0000400200008A590000874000__00000073AD3FE6B8 000000067F0000400200008A590000870000-000000067F0000400200008A590000874000__000000914E3F38F0 000000067F0000400200008A590000870000-000000067F0000400200008A590000874000__000000931B33AE68 000000067F0000400200008A590000870000-000000067F0000400200008A590000874000__000000931B9AFDF8 000000067F0000400200008A590000874000-000000067F0000400200008A590000878000__000000106915EC38 000000067F0000400200008A590000874000-000000067F0000400200008A590000878000__0000001C760FA190 000000067F0000400200008A590000874000-000000067F0000400200008A590000878000__00000038E67ABFA0 000000067F0000400200008A590000874000-000000067F0000400200008A590000878000__0000003903F1CFE8 000000067F0000400200008A590000874000-000000067F0000400200008A590000878000__0000003B99F7F8A0 000000067F0000400200008A590000874000-000000067F0000400200008A590000878000__0000005D2FFFFB38 000000067F0000400200008A590000874000-000000067F0000400200008A590000878000__00000073AD3FE6B8 000000067F0000400200008A590000874000-000000067F0000400200008A590000878000__000000914E3F38F0 000000067F0000400200008A590000874000-000000067F0000400200008A590000878000__000000931B33AE68 000000067F0000400200008A590000874000-000000067F0000400200008A590000878000__000000931B9AFDF8 000000067F0000400200008A59000087693A-000000067F0000400200008A59000087F311__0000000F8985D279-00000010392DE3B9 000000067F0000400200008A590000878000-000000067F0000400200008A59000087C000__000000106915EC38 000000067F0000400200008A590000878000-000000067F0000400200008A59000087C000__0000001C760FA190 000000067F0000400200008A590000878000-000000067F0000400200008A59000087C000__00000038E67ABFA0 
000000067F0000400200008A590000878000-000000067F0000400200008A59000087C000__0000003903F1CFE8 000000067F0000400200008A590000878000-000000067F0000400200008A59000087C000__0000003B99F7F8A0 000000067F0000400200008A590000878000-000000067F0000400200008A59000087C000__0000005D2FFFFB38 000000067F0000400200008A590000878000-000000067F0000400200008A59000087C000__00000073AD3FE6B8 000000067F0000400200008A590000878000-000000067F0000400200008A59000087C000__000000914E3F38F0 000000067F0000400200008A590000878000-000000067F0000400200008A59000087C000__000000931B33AE68 000000067F0000400200008A590000878000-000000067F0000400200008A59000087C000__000000931B9AFDF8 000000067F0000400200008A59000087C000-000000067F0000400200008A590000880000__000000106915EC38 000000067F0000400200008A59000087C000-000000067F0000400200008A590000880000__0000001C760FA190 000000067F0000400200008A59000087C000-000000067F0000400200008A590000880000__00000038E67ABFA0 000000067F0000400200008A59000087C000-000000067F0000400200008A590000880000__0000003903F1CFE8 000000067F0000400200008A59000087C000-000000067F0000400200008A590000880000__0000003B99F7F8A0 000000067F0000400200008A59000087C000-000000067F0000400200008A590000880000__0000005D2FFFFB38 000000067F0000400200008A59000087C000-000000067F0000400200008A590000880000__00000073AD3FE6B8 000000067F0000400200008A59000087C000-000000067F0000400200008A590000880000__000000914E3F38F0 000000067F0000400200008A59000087C000-000000067F0000400200008A590000880000__000000931B33AE68 000000067F0000400200008A59000087C000-000000067F0000400200008A590000880000__000000931B9AFDF8 000000067F0000400200008A59000087F311-000000067F0000400200008A590000887CE5__0000000F8985D279-00000010392DE3B9 000000067F0000400200008A590000880000-000000067F0000400200008A590000884000__000000106915EC38 000000067F0000400200008A590000880000-000000067F0000400200008A590000884000__0000001C760FA190 000000067F0000400200008A590000880000-000000067F0000400200008A590000884000__00000038E67ABFA0 
000000067F0000400200008A590000880000-000000067F0000400200008A590000884000__0000003903F1CFE8 000000067F0000400200008A590000880000-000000067F0000400200008A590000884000__0000003B99F7F8A0 000000067F0000400200008A590000880000-000000067F0000400200008A590000884000__0000005D2FFFFB38 000000067F0000400200008A590000880000-000000067F0000400200008A590000884000__00000073AD3FE6B8 000000067F0000400200008A590000880000-000000067F0000400200008A590000884000__000000914E3F38F0 000000067F0000400200008A590000880000-000000067F0000400200008A590000884000__000000931B33AE68 000000067F0000400200008A590000880000-000000067F0000400200008A590000884000__000000931B9AFDF8 000000067F0000400200008A590000884000-000000067F0000400200008A590000888000__000000106915EC38 000000067F0000400200008A590000884000-000000067F0000400200008A590000888000__0000001C760FA190 000000067F0000400200008A590000884000-000000067F0000400200008A590000888000__00000038E67ABFA0 000000067F0000400200008A590000884000-000000067F0000400200008A590000888000__0000003903F1CFE8 000000067F0000400200008A590000884000-000000067F0000400200008A590000888000__0000003B99F7F8A0 000000067F0000400200008A590000884000-000000067F0000400200008A590000888000__0000005D2FFFFB38 000000067F0000400200008A590000884000-000000067F0000400200008A590000888000__00000073AD3FE6B8 000000067F0000400200008A590000884000-000000067F0000400200008A590000888000__000000914E3F38F0 000000067F0000400200008A590000884000-000000067F0000400200008A590000888000__000000931B33AE68 000000067F0000400200008A590000884000-000000067F0000400200008A590000888000__000000931B9AFDF8 000000067F0000400200008A590000887CE5-000000067F0000400200008A5900008906C5__0000000F8985D279-00000010392DE3B9 000000067F0000400200008A590000888000-000000067F0000400200008A59000088C000__000000106915EC38 000000067F0000400200008A590000888000-000000067F0000400200008A59000088C000__0000001C760FA190 000000067F0000400200008A590000888000-000000067F0000400200008A59000088C000__00000038E67ABFA0 
000000067F0000400200008A590000888000-000000067F0000400200008A59000088C000__0000003903F1CFE8 000000067F0000400200008A590000888000-000000067F0000400200008A59000088C000__0000003B99F7F8A0 000000067F0000400200008A590000888000-000000067F0000400200008A59000088C000__0000005D2FFFFB38 000000067F0000400200008A590000888000-000000067F0000400200008A59000088C000__00000073AD3FE6B8 000000067F0000400200008A590000888000-000000067F0000400200008A59000088C000__000000914E3F38F0 000000067F0000400200008A590000888000-000000067F0000400200008A59000088C000__000000931B33AE68 000000067F0000400200008A590000888000-000000067F0000400200008A59000088C000__000000931B9AFDF8 000000067F0000400200008A59000088C000-000000067F0000400200008A590000890000__000000106915EC38 000000067F0000400200008A59000088C000-000000067F0000400200008A590000890000__0000001C760FA190 000000067F0000400200008A59000088C000-000000067F0000400200008A590000890000__00000038E67ABFA0 000000067F0000400200008A59000088C000-000000067F0000400200008A590000890000__0000003903F1CFE8 000000067F0000400200008A59000088C000-000000067F0000400200008A590000890000__0000003B99F7F8A0 000000067F0000400200008A59000088C000-000000067F0000400200008A590000890000__0000005D2FFFFB38 000000067F0000400200008A59000088C000-000000067F0000400200008A590000890000__00000073AD3FE6B8 000000067F0000400200008A59000088C000-000000067F0000400200008A590000890000__000000914E3F38F0 000000067F0000400200008A59000088C000-000000067F0000400200008A590000890000__000000931B33AE68 000000067F0000400200008A59000088C000-000000067F0000400200008A590000890000__000000931B9AFDF8 000000067F0000400200008A590000890000-000000067F0000400200008A590000894000__000000106915EC38 000000067F0000400200008A590000890000-000000067F0000400200008A590000894000__0000001C760FA190 000000067F0000400200008A590000890000-000000067F0000400200008A590000894000__00000038E67ABFA0 000000067F0000400200008A590000890000-000000067F0000400200008A590000894000__0000003903F1CFE8 
000000067F0000400200008A590000890000-000000067F0000400200008A590000894000__0000003B99F7F8A0 000000067F0000400200008A590000890000-000000067F0000400200008A590000894000__0000005D2FFFFB38 000000067F0000400200008A590000890000-000000067F0000400200008A590000894000__00000073AD3FE6B8 000000067F0000400200008A590000890000-000000067F0000400200008A590000894000__000000914E3F38F0 000000067F0000400200008A590000890000-000000067F0000400200008A590000894000__000000931B33AE68 000000067F0000400200008A590000890000-000000067F0000400200008A590000894000__000000931B9AFDF8 000000067F0000400200008A5900008906C5-000000067F0000400200008A5900008990AC__0000000F8985D279-00000010392DE3B9 000000067F0000400200008A590000894000-000000067F0000400200008A590000898000__000000106915EC38 000000067F0000400200008A590000894000-000000067F0000400200008A590000898000__0000001C760FA190 000000067F0000400200008A590000894000-000000067F0000400200008A590000898000__00000038E67ABFA0 000000067F0000400200008A590000894000-000000067F0000400200008A590000898000__0000003903F1CFE8 000000067F0000400200008A590000894000-000000067F0000400200008A590000898000__0000003B99F7F8A0 000000067F0000400200008A590000894000-000000067F0000400200008A590000898000__0000005D2FFFFB38 000000067F0000400200008A590000894000-000000067F0000400200008A590000898000__00000073AD3FE6B8 000000067F0000400200008A590000894000-000000067F0000400200008A590000898000__000000914E3F38F0 000000067F0000400200008A590000894000-000000067F0000400200008A590000898000__000000931B33AE68 000000067F0000400200008A590000894000-000000067F0000400200008A590000898000__000000931B9AFDF8 000000067F0000400200008A590000898000-000000067F0000400200008A59000089C000__000000106915EC38 000000067F0000400200008A590000898000-000000067F0000400200008A59000089C000__0000001C760FA190 000000067F0000400200008A590000898000-000000067F0000400200008A59000089C000__00000038E67ABFA0 000000067F0000400200008A590000898000-000000067F0000400200008A59000089C000__0000003903F1CFE8 
000000067F0000400200008A590000898000-000000067F0000400200008A59000089C000__0000003B99F7F8A0 000000067F0000400200008A590000898000-000000067F0000400200008A59000089C000__0000005D2FFFFB38 000000067F0000400200008A590000898000-000000067F0000400200008A59000089C000__00000073AD3FE6B8 000000067F0000400200008A590000898000-000000067F0000400200008A59000089C000__000000914E3F38F0 000000067F0000400200008A590000898000-000000067F0000400200008A59000089C000__000000931B33AE68 000000067F0000400200008A590000898000-000000067F0000400200008A59000089C000__000000931B9AFDF8 000000067F0000400200008A5900008990AC-000000067F0000400200008A5900008A1AA3__0000000F8985D279-00000010392DE3B9 000000067F0000400200008A59000089C000-000000067F0000400200008A5900008A0000__000000106915EC38 000000067F0000400200008A59000089C000-000000067F0000400200008A5900008A0000__0000001C760FA190 000000067F0000400200008A59000089C000-000000067F0000400200008A5900008A0000__00000038E67ABFA0 000000067F0000400200008A59000089C000-000000067F0000400200008A5900008A0000__0000003903F1CFE8 000000067F0000400200008A59000089C000-000000067F0000400200008A5900008A0000__0000003B99F7F8A0 000000067F0000400200008A59000089C000-000000067F0000400200008A5900008A0000__0000005D2FFFFB38 000000067F0000400200008A59000089C000-000000067F0000400200008A5900008A0000__00000073AD3FE6B8 000000067F0000400200008A59000089C000-000000067F0000400200008A5900008A0000__000000914E3F38F0 000000067F0000400200008A59000089C000-000000067F0000400200008A5900008A0000__000000931B33AE68 000000067F0000400200008A59000089C000-000000067F0000400200008A5900008A0000__000000931B9AFDF8 000000067F0000400200008A5900008A0000-000000067F0000400200008A5900008A4000__000000106915EC38 000000067F0000400200008A5900008A0000-000000067F0000400200008A5900008A4000__0000001C760FA190 000000067F0000400200008A5900008A0000-000000067F0000400200008A5900008A4000__00000038E67ABFA0 000000067F0000400200008A5900008A0000-000000067F0000400200008A5900008A4000__0000003903F1CFE8 
000000067F0000400200008A5900008A0000-000000067F0000400200008A5900008A4000__0000003B99F7F8A0 000000067F0000400200008A5900008A0000-000000067F0000400200008A5900008A4000__0000005D2FFFFB38 000000067F0000400200008A5900008A0000-000000067F0000400200008A5900008A4000__00000073AD3FE6B8 000000067F0000400200008A5900008A0000-000000067F0000400200008A5900008A4000__000000914E3F38F0 000000067F0000400200008A5900008A0000-000000067F0000400200008A5900008A4000__000000931B33AE68 000000067F0000400200008A5900008A0000-000000067F0000400200008A5900008A4000__000000931B9AFDF8 000000067F0000400200008A5900008A1AA3-000000067F0000400200008A5900008AA478__0000000F8985D279-00000010392DE3B9 000000067F0000400200008A5900008A4000-000000067F0000400200008A5900008A8000__000000106915EC38 000000067F0000400200008A5900008A4000-000000067F0000400200008A5900008A8000__0000001C760FA190 000000067F0000400200008A5900008A4000-000000067F0000400200008A5900008A8000__00000038E67ABFA0 000000067F0000400200008A5900008A4000-000000067F0000400200008A5900008A8000__0000003903F1CFE8 000000067F0000400200008A5900008A4000-000000067F0000400200008A5900008A8000__0000003B99F7F8A0 000000067F0000400200008A5900008A4000-000000067F0000400200008A5900008A8000__0000005D2FFFFB38 000000067F0000400200008A5900008A4000-000000067F0000400200008A5900008A8000__00000073AD3FE6B8 000000067F0000400200008A5900008A4000-000000067F0000400200008A5900008A8000__000000914E3F38F0 000000067F0000400200008A5900008A4000-000000067F0000400200008A5900008A8000__000000931B33AE68 000000067F0000400200008A5900008A4000-000000067F0000400200008A5900008A8000__000000931B9AFDF8 000000067F0000400200008A5900008A8000-000000067F0000400200008A5900008AC000__000000106915EC38 000000067F0000400200008A5900008A8000-000000067F0000400200008A5900008AC000__0000001C760FA190 000000067F0000400200008A5900008A8000-000000067F0000400200008A5900008AC000__00000038E67ABFA0 000000067F0000400200008A5900008A8000-000000067F0000400200008A5900008AC000__0000003903F1CFE8 
000000067F0000400200008A5900008A8000-000000067F0000400200008A5900008AC000__0000003B99F7F8A0 000000067F0000400200008A5900008A8000-000000067F0000400200008A5900008AC000__0000005D2FFFFB38 000000067F0000400200008A5900008A8000-000000067F0000400200008A5900008AC000__00000073AD3FE6B8 000000067F0000400200008A5900008A8000-000000067F0000400200008A5900008AC000__000000914E3F38F0 000000067F0000400200008A5900008A8000-000000067F0000400200008A5900008AC000__000000931B33AE68 000000067F0000400200008A5900008A8000-000000067F0000400200008A5900008AC000__000000931B9AFDF8 000000067F0000400200008A5900008AA478-000000067F0000400200008A5900008B2E53__0000000F8985D279-00000010392DE3B9 000000067F0000400200008A5900008AC000-000000067F0000400200008A5900008B0000__000000106915EC38 000000067F0000400200008A5900008AC000-000000067F0000400200008A5900008B0000__0000001C760FA190 000000067F0000400200008A5900008AC000-000000067F0000400200008A5900008B0000__00000038E67ABFA0 000000067F0000400200008A5900008AC000-000000067F0000400200008A5900008B0000__0000003903F1CFE8 000000067F0000400200008A5900008AC000-000000067F0000400200008A5900008B0000__0000003B99F7F8A0 000000067F0000400200008A5900008AC000-000000067F0000400200008A5900008B0000__0000005D2FFFFB38 000000067F0000400200008A5900008AC000-000000067F0000400200008A5900008B0000__00000073AD3FE6B8 000000067F0000400200008A5900008AC000-000000067F0000400200008A5900008B0000__000000914E3F38F0 000000067F0000400200008A5900008AC000-000000067F0000400200008A5900008B0000__000000931B33AE68 000000067F0000400200008A5900008AC000-000000067F0000400200008A5900008B0000__000000931B9AFDF8 000000067F0000400200008A5900008B0000-000000067F0000400200008A5900008B4000__000000106915EC38 000000067F0000400200008A5900008B0000-000000067F0000400200008A5900008B4000__0000001C760FA190 000000067F0000400200008A5900008B0000-000000067F0000400200008A5900008B4000__00000038E67ABFA0 000000067F0000400200008A5900008B0000-000000067F0000400200008A5900008B4000__0000003903F1CFE8 
000000067F0000400200008A5900008B0000-000000067F0000400200008A5900008B4000__0000003B99F7F8A0 000000067F0000400200008A5900008B0000-000000067F0000400200008A5900008B4000__0000005D2FFFFB38 000000067F0000400200008A5900008B0000-000000067F0000400200008A5900008B4000__00000073AD3FE6B8 000000067F0000400200008A5900008B0000-000000067F0000400200008A5900008B4000__000000914E3F38F0 000000067F0000400200008A5900008B0000-000000067F0000400200008A5900008B4000__000000931B33AE68 000000067F0000400200008A5900008B0000-000000067F0000400200008A5900008B4000__000000931B9AFDF8 000000067F0000400200008A5900008B2E53-000000067F0000400200008A5900008BB832__0000000F8985D279-00000010392DE3B9 000000067F0000400200008A5900008B4000-000000067F0000400200008A5900008B8000__000000106915EC38 000000067F0000400200008A5900008B4000-000000067F0000400200008A5900008B8000__0000001C760FA190 000000067F0000400200008A5900008B4000-000000067F0000400200008A5900008B8000__00000038E67ABFA0 000000067F0000400200008A5900008B4000-000000067F0000400200008A5900008B8000__0000003903F1CFE8 000000067F0000400200008A5900008B4000-000000067F0000400200008A5900008B8000__0000003B99F7F8A0 000000067F0000400200008A5900008B4000-000000067F0000400200008A5900008B8000__0000005D2FFFFB38 000000067F0000400200008A5900008B4000-000000067F0000400200008A5900008B8000__00000073AD3FE6B8 000000067F0000400200008A5900008B4000-000000067F0000400200008A5900008B8000__000000914E3F38F0 000000067F0000400200008A5900008B4000-000000067F0000400200008A5900008B8000__000000931B33AE68 000000067F0000400200008A5900008B4000-000000067F0000400200008A5900008B8000__000000931B9AFDF8 000000067F0000400200008A5900008B8000-000000067F0000400200008A5900008BC000__000000106915EC38 000000067F0000400200008A5900008B8000-000000067F0000400200008A5900008BC000__0000001C760FA190 000000067F0000400200008A5900008B8000-000000067F0000400200008A5900008BC000__00000038E67ABFA0 000000067F0000400200008A5900008B8000-000000067F0000400200008A5900008BC000__0000003903F1CFE8 
000000067F0000400200008A5900008B8000-000000067F0000400200008A5900008BC000__0000003B99F7F8A0 000000067F0000400200008A5900008B8000-000000067F0000400200008A5900008BC000__0000005D2FFFFB38 000000067F0000400200008A5900008B8000-000000067F0000400200008A5900008BC000__00000073AD3FE6B8 000000067F0000400200008A5900008B8000-000000067F0000400200008A5900008BC000__000000914E3F38F0 000000067F0000400200008A5900008B8000-000000067F0000400200008A5900008BC000__000000931B33AE68 000000067F0000400200008A5900008B8000-000000067F0000400200008A5900008BC000__000000931B9AFDF8 000000067F0000400200008A5900008BB832-000000067F0000400200008A5900008C4202__0000000F8985D279-00000010392DE3B9 000000067F0000400200008A5900008BC000-000000067F0000400200008A5900008C0000__000000106915EC38 000000067F0000400200008A5900008BC000-000000067F0000400200008A5900008C0000__0000001C760FA190 000000067F0000400200008A5900008BC000-000000067F0000400200008A5900008C0000__00000038E67ABFA0 000000067F0000400200008A5900008BC000-000000067F0000400200008A5900008C0000__0000003903F1CFE8 000000067F0000400200008A5900008BC000-000000067F0000400200008A5900008C0000__0000003B99F7F8A0 000000067F0000400200008A5900008BC000-000000067F0000400200008A5900008C0000__0000005D2FFFFB38 000000067F0000400200008A5900008BC000-000000067F0000400200008A5900008C0000__00000073AD3FE6B8 000000067F0000400200008A5900008BC000-000000067F0000400200008A5900008C0000__000000914E3F38F0 000000067F0000400200008A5900008BC000-000000067F0000400200008A5900008C0000__000000931B33AE68 000000067F0000400200008A5900008BC000-000000067F0000400200008A5900008C0000__000000931B9AFDF8 000000067F0000400200008A5900008C0000-000000067F0000400200008A5900008C4000__000000106915EC38 000000067F0000400200008A5900008C0000-000000067F0000400200008A5900008C4000__0000001C760FA190 000000067F0000400200008A5900008C0000-000000067F0000400200008A5900008C4000__00000038E67ABFA0 000000067F0000400200008A5900008C0000-000000067F0000400200008A5900008C4000__0000003903F1CFE8 
000000067F0000400200008A5900008C0000-000000067F0000400200008A5900008C4000__0000003B99F7F8A0 000000067F0000400200008A5900008C0000-000000067F0000400200008A5900008C4000__0000005D2FFFFB38 000000067F0000400200008A5900008C0000-000000067F0000400200008A5900008C4000__00000073AD3FE6B8 000000067F0000400200008A5900008C0000-000000067F0000400200008A5900008C4000__000000914E3F38F0 000000067F0000400200008A5900008C0000-000000067F0000400200008A5900008C4000__000000931B33AE68 000000067F0000400200008A5900008C0000-000000067F0000400200008A5900008C4000__000000931B9AFDF8 000000067F0000400200008A5900008C4000-000000067F0000400200008A5900008C8000__000000106915EC38 000000067F0000400200008A5900008C4000-000000067F0000400200008A5900008C8000__0000001C760FA190 000000067F0000400200008A5900008C4000-000000067F0000400200008A5900008C8000__00000038E67ABFA0 000000067F0000400200008A5900008C4000-000000067F0000400200008A5900008C8000__0000003903F1CFE8 000000067F0000400200008A5900008C4000-000000067F0000400200008A5900008C8000__0000003B99F7F8A0 000000067F0000400200008A5900008C4000-000000067F0000400200008A5900008C8000__0000005D2FFFFB38 000000067F0000400200008A5900008C4000-000000067F0000400200008A5900008C8000__00000073AD3FE6B8 000000067F0000400200008A5900008C4000-000000067F0000400200008A5900008C8000__000000914E3F38F0 000000067F0000400200008A5900008C4000-000000067F0000400200008A5900008C8000__000000931B33AE68 000000067F0000400200008A5900008C4000-000000067F0000400200008A5900008C8000__000000931B9AFDF8 000000067F0000400200008A5900008C4202-000000067F0000400200008A590100000000__0000000F8985D279-00000010392DE3B9 000000067F0000400200008A5900008C4513-000000067F0000400200008A5900008CCEF0__00000010392DE3B9-00000010E8D5E0A1 000000067F0000400200008A5900008C8000-000000067F0000400200008A5900008CC000__000000106915EC38 000000067F0000400200008A5900008C8000-000000067F0000400200008A5900008CC000__0000001C760FA190 000000067F0000400200008A5900008C8000-000000067F0000400200008A5900008CC000__00000038E67ABFA0 
000000067F0000400200008A5900008C8000-000000067F0000400200008A5900008CC000__0000003903F1CFE8 000000067F0000400200008A5900008C8000-000000067F0000400200008A5900008CC000__0000003B99F7F8A0 000000067F0000400200008A5900008C8000-000000067F0000400200008A5900008CC000__0000005D2FFFFB38 000000067F0000400200008A5900008C8000-000000067F0000400200008A5900008CC000__00000073AD3FE6B8 000000067F0000400200008A5900008C8000-000000067F0000400200008A5900008CC000__000000914E3F38F0 000000067F0000400200008A5900008C8000-000000067F0000400200008A5900008CC000__000000931B33AE68 000000067F0000400200008A5900008C8000-000000067F0000400200008A5900008CC000__000000931B9AFDF8 000000067F0000400200008A5900008CC000-000000067F0000400200008A5900008D0000__000000106915EC38 000000067F0000400200008A5900008CC000-000000067F0000400200008A5900008D0000__0000001C760FA190 000000067F0000400200008A5900008CC000-000000067F0000400200008A5900008D0000__00000038E67ABFA0 000000067F0000400200008A5900008CC000-000000067F0000400200008A5900008D0000__0000003903F1CFE8 000000067F0000400200008A5900008CC000-000000067F0000400200008A5900008D0000__0000003B99F7F8A0 000000067F0000400200008A5900008CC000-000000067F0000400200008A5900008D0000__0000005D2FFFFB38 000000067F0000400200008A5900008CC000-000000067F0000400200008A5900008D0000__00000073AD3FE6B8 000000067F0000400200008A5900008CC000-000000067F0000400200008A5900008D0000__000000914E3F38F0 000000067F0000400200008A5900008CC000-000000067F0000400200008A5900008D0000__000000931B33AE68 000000067F0000400200008A5900008CC000-000000067F0000400200008A5900008D0000__000000931B9AFDF8 000000067F0000400200008A5900008CCEF0-000000067F0000400200008A5900008D58DC__00000010392DE3B9-00000010E8D5E0A1 000000067F0000400200008A5900008D0000-000000067F0000400200008A5900008D4000__000000106915EC38 000000067F0000400200008A5900008D0000-000000067F0000400200008A5900008D4000__0000001C760FA190 000000067F0000400200008A5900008D0000-000000067F0000400200008A5900008D4000__00000038E67ABFA0 
000000067F0000400200008A5900008D0000-000000067F0000400200008A5900008D4000__0000003903F1CFE8 000000067F0000400200008A5900008D0000-000000067F0000400200008A5900008D4000__0000003B99F7F8A0 000000067F0000400200008A5900008D0000-000000067F0000400200008A5900008D4000__0000005D2FFFFB38 000000067F0000400200008A5900008D0000-000000067F0000400200008A5900008D4000__00000073AD3FE6B8 000000067F0000400200008A5900008D0000-000000067F0000400200008A5900008D4000__000000914E3F38F0 000000067F0000400200008A5900008D0000-000000067F0000400200008A5900008D4000__000000931B33AE68 000000067F0000400200008A5900008D0000-000000067F0000400200008A5900008D4000__000000931B9AFDF8 000000067F0000400200008A5900008D4000-000000067F0000400200008A5900008D8000__000000106915EC38 000000067F0000400200008A5900008D4000-000000067F0000400200008A5900008D8000__0000001C760FA190 000000067F0000400200008A5900008D4000-000000067F0000400200008A5900008D8000__00000038E67ABFA0 000000067F0000400200008A5900008D4000-000000067F0000400200008A5900008D8000__0000003903F1CFE8 000000067F0000400200008A5900008D4000-000000067F0000400200008A5900008D8000__0000003B99F7F8A0 000000067F0000400200008A5900008D4000-000000067F0000400200008A5900008D8000__0000005D2FFFFB38 000000067F0000400200008A5900008D4000-000000067F0000400200008A5900008D8000__00000073AD3FE6B8 000000067F0000400200008A5900008D4000-000000067F0000400200008A5900008D8000__000000914E3F38F0 000000067F0000400200008A5900008D4000-000000067F0000400200008A5900008D8000__000000931B33AE68 000000067F0000400200008A5900008D4000-000000067F0000400200008A5900008D8000__000000931B9AFDF8 000000067F0000400200008A5900008D58DC-000000067F0000400200008A5900008DE2CB__00000010392DE3B9-00000010E8D5E0A1 000000067F0000400200008A5900008D8000-000000067F0000400200008A5900008DC000__000000106915EC38 000000067F0000400200008A5900008D8000-000000067F0000400200008A5900008DC000__0000001C760FA190 000000067F0000400200008A5900008D8000-000000067F0000400200008A5900008DC000__00000038E67ABFA0 
000000067F0000400200008A5900008D8000-000000067F0000400200008A5900008DC000__0000003903F1CFE8 000000067F0000400200008A5900008D8000-000000067F0000400200008A5900008DC000__0000003B99F7F8A0 000000067F0000400200008A5900008D8000-000000067F0000400200008A5900008DC000__0000005D2FFFFB38 000000067F0000400200008A5900008D8000-000000067F0000400200008A5900008DC000__00000073AD3FE6B8 000000067F0000400200008A5900008D8000-000000067F0000400200008A5900008DC000__000000914E3F38F0 000000067F0000400200008A5900008D8000-000000067F0000400200008A5900008DC000__000000931B33AE68 000000067F0000400200008A5900008D8000-000000067F0000400200008A5900008DC000__000000931B9AFDF8 000000067F0000400200008A5900008DC000-000000067F0000400200008A5900008E0000__0000001C760FA190 000000067F0000400200008A5900008DC000-000000067F0000400200008A5900008E0000__00000038E67ABFA0 000000067F0000400200008A5900008DC000-000000067F0000400200008A5900008E0000__0000003903F1CFE8 000000067F0000400200008A5900008DC000-000000067F0000400200008A5900008E0000__0000003B99F7F8A0 000000067F0000400200008A5900008DC000-000000067F0000400200008A5900008E0000__0000005D2FFFFB38 000000067F0000400200008A5900008DC000-000000067F0000400200008A5900008E0000__00000073AD3FE6B8 000000067F0000400200008A5900008DC000-000000067F0000400200008A5900008E0000__000000914E3F38F0 000000067F0000400200008A5900008DC000-000000067F0000400200008A5900008E0000__000000931B33AE68 000000067F0000400200008A5900008DC000-000000067F0000400200008A5900008E0000__000000931B9AFDF8 000000067F0000400200008A5900008DC000-030000000000000000000000000000000002__000000106915EC38 000000067F0000400200008A5900008DE2CB-000000067F0000400200008A5900008E6C9E__00000010392DE3B9-00000010E8D5E0A1 000000067F0000400200008A5900008E0000-000000067F0000400200008A5900008E4000__0000001C760FA190 000000067F0000400200008A5900008E0000-000000067F0000400200008A5900008E4000__00000038E67ABFA0 000000067F0000400200008A5900008E0000-000000067F0000400200008A5900008E4000__0000003903F1CFE8 
000000067F0000400200008A5900008E0000-000000067F0000400200008A5900008E4000__0000003B99F7F8A0 000000067F0000400200008A5900008E0000-000000067F0000400200008A5900008E4000__0000005D2FFFFB38 000000067F0000400200008A5900008E0000-000000067F0000400200008A5900008E4000__00000073AD3FE6B8 000000067F0000400200008A5900008E0000-000000067F0000400200008A5900008E4000__000000914E3F38F0 000000067F0000400200008A5900008E0000-000000067F0000400200008A5900008E4000__000000931B33AE68 000000067F0000400200008A5900008E0000-000000067F0000400200008A5900008E4000__000000931B9AFDF8 000000067F0000400200008A5900008E4000-000000067F0000400200008A5900008E8000__0000001C760FA190 000000067F0000400200008A5900008E4000-000000067F0000400200008A5900008E8000__00000038E67ABFA0 000000067F0000400200008A5900008E4000-000000067F0000400200008A5900008E8000__0000003903F1CFE8 000000067F0000400200008A5900008E4000-000000067F0000400200008A5900008E8000__0000003B99F7F8A0 000000067F0000400200008A5900008E4000-000000067F0000400200008A5900008E8000__0000005D2FFFFB38 000000067F0000400200008A5900008E4000-000000067F0000400200008A5900008E8000__00000073AD3FE6B8 000000067F0000400200008A5900008E4000-000000067F0000400200008A5900008E8000__000000914E3F38F0 000000067F0000400200008A5900008E4000-000000067F0000400200008A5900008E8000__000000931B33AE68 000000067F0000400200008A5900008E4000-000000067F0000400200008A5900008E8000__000000931B9AFDF8 000000067F0000400200008A5900008E6C9E-000000067F0000400200008A5900008EF67E__00000010392DE3B9-00000010E8D5E0A1 000000067F0000400200008A5900008E8000-000000067F0000400200008A5900008EC000__0000001C760FA190 000000067F0000400200008A5900008E8000-000000067F0000400200008A5900008EC000__00000038E67ABFA0 000000067F0000400200008A5900008E8000-000000067F0000400200008A5900008EC000__0000003903F1CFE8 000000067F0000400200008A5900008E8000-000000067F0000400200008A5900008EC000__0000003B99F7F8A0 000000067F0000400200008A5900008E8000-000000067F0000400200008A5900008EC000__0000005D2FFFFB38 
000000067F0000400200008A5900008E8000-000000067F0000400200008A5900008EC000__00000073AD3FE6B8 000000067F0000400200008A5900008E8000-000000067F0000400200008A5900008EC000__000000914E3F38F0 000000067F0000400200008A5900008E8000-000000067F0000400200008A5900008EC000__000000931B33AE68 000000067F0000400200008A5900008E8000-000000067F0000400200008A5900008EC000__000000931B9AFDF8 000000067F0000400200008A5900008EC000-000000067F0000400200008A5900008F0000__0000001C760FA190 000000067F0000400200008A5900008EC000-000000067F0000400200008A5900008F0000__00000038E67ABFA0 000000067F0000400200008A5900008EC000-000000067F0000400200008A5900008F0000__0000003903F1CFE8 000000067F0000400200008A5900008EC000-000000067F0000400200008A5900008F0000__0000003B99F7F8A0 000000067F0000400200008A5900008EC000-000000067F0000400200008A5900008F0000__0000005D2FFFFB38 000000067F0000400200008A5900008EC000-000000067F0000400200008A5900008F0000__00000073AD3FE6B8 000000067F0000400200008A5900008EC000-000000067F0000400200008A5900008F0000__000000914E3F38F0 000000067F0000400200008A5900008EC000-000000067F0000400200008A5900008F0000__000000931B33AE68 000000067F0000400200008A5900008EC000-000000067F0000400200008A5900008F0000__000000931B9AFDF8 000000067F0000400200008A5900008EF67E-000000067F0000400200008A5900008F805B__00000010392DE3B9-00000010E8D5E0A1 000000067F0000400200008A5900008F0000-000000067F0000400200008A5900008F4000__0000001C760FA190 000000067F0000400200008A5900008F0000-000000067F0000400200008A5900008F4000__00000038E67ABFA0 000000067F0000400200008A5900008F0000-000000067F0000400200008A5900008F4000__0000003903F1CFE8 000000067F0000400200008A5900008F0000-000000067F0000400200008A5900008F4000__0000003B99F7F8A0 000000067F0000400200008A5900008F0000-000000067F0000400200008A5900008F4000__0000005D2FFFFB38 000000067F0000400200008A5900008F0000-000000067F0000400200008A5900008F4000__00000073AD3FE6B8 000000067F0000400200008A5900008F0000-000000067F0000400200008A5900008F4000__000000914E3F38F0 
000000067F0000400200008A5900008F0000-000000067F0000400200008A5900008F4000__000000931B33AE68 000000067F0000400200008A5900008F0000-000000067F0000400200008A5900008F4000__000000931B9AFDF8 000000067F0000400200008A5900008F4000-000000067F0000400200008A5900008F8000__0000001C760FA190 000000067F0000400200008A5900008F4000-000000067F0000400200008A5900008F8000__00000038E67ABFA0 000000067F0000400200008A5900008F4000-000000067F0000400200008A5900008F8000__0000003903F1CFE8 000000067F0000400200008A5900008F4000-000000067F0000400200008A5900008F8000__0000003B99F7F8A0 000000067F0000400200008A5900008F4000-000000067F0000400200008A5900008F8000__0000005D2FFFFB38 000000067F0000400200008A5900008F4000-000000067F0000400200008A5900008F8000__00000073AD3FE6B8 000000067F0000400200008A5900008F4000-000000067F0000400200008A5900008F8000__000000914E3F38F0 000000067F0000400200008A5900008F4000-000000067F0000400200008A5900008F8000__000000931B33AE68 000000067F0000400200008A5900008F4000-000000067F0000400200008A5900008F8000__000000931B9AFDF8 000000067F0000400200008A5900008F8000-000000067F0000400200008A5900008FC000__0000001C760FA190 000000067F0000400200008A5900008F8000-000000067F0000400200008A5900008FC000__00000038E67ABFA0 000000067F0000400200008A5900008F8000-000000067F0000400200008A5900008FC000__0000003903F1CFE8 000000067F0000400200008A5900008F8000-000000067F0000400200008A5900008FC000__0000003B99F7F8A0 000000067F0000400200008A5900008F8000-000000067F0000400200008A5900008FC000__0000005D2FFFFB38 000000067F0000400200008A5900008F8000-000000067F0000400200008A5900008FC000__00000073AD3FE6B8 000000067F0000400200008A5900008F8000-000000067F0000400200008A5900008FC000__000000914E3F38F0 000000067F0000400200008A5900008F8000-000000067F0000400200008A5900008FC000__000000931B33AE68 000000067F0000400200008A5900008F8000-000000067F0000400200008A5900008FC000__000000931B9AFDF8 000000067F0000400200008A5900008F805B-000000067F0000400200008A590000900A29__00000010392DE3B9-00000010E8D5E0A1 
000000067F0000400200008A5900008FC000-000000067F0000400200008A590000900000__0000001C760FA190 000000067F0000400200008A5900008FC000-000000067F0000400200008A590000900000__00000038E67ABFA0 000000067F0000400200008A5900008FC000-000000067F0000400200008A590000900000__0000003903F1CFE8 000000067F0000400200008A5900008FC000-000000067F0000400200008A590000900000__0000003B99F7F8A0 000000067F0000400200008A5900008FC000-000000067F0000400200008A590000900000__0000005D2FFFFB38 000000067F0000400200008A5900008FC000-000000067F0000400200008A590000900000__00000073AD3FE6B8 000000067F0000400200008A5900008FC000-000000067F0000400200008A590000900000__000000914E3F38F0 000000067F0000400200008A5900008FC000-000000067F0000400200008A590000900000__000000931B33AE68 000000067F0000400200008A5900008FC000-000000067F0000400200008A590000900000__000000931B9AFDF8 000000067F0000400200008A590000900000-000000067F0000400200008A590000904000__0000001C760FA190 000000067F0000400200008A590000900000-000000067F0000400200008A590000904000__00000038E67ABFA0 000000067F0000400200008A590000900000-000000067F0000400200008A590000904000__0000003903F1CFE8 000000067F0000400200008A590000900000-000000067F0000400200008A590000904000__0000003B99F7F8A0 000000067F0000400200008A590000900000-000000067F0000400200008A590000904000__0000005D2FFFFB38 000000067F0000400200008A590000900000-000000067F0000400200008A590000904000__00000073AD3FE6B8 000000067F0000400200008A590000900000-000000067F0000400200008A590000904000__000000914E3F38F0 000000067F0000400200008A590000900000-000000067F0000400200008A590000904000__000000931B33AE68 000000067F0000400200008A590000900000-000000067F0000400200008A590000904000__000000931B9AFDF8 000000067F0000400200008A590000900A29-000000067F0000400200008A59000090940E__00000010392DE3B9-00000010E8D5E0A1 000000067F0000400200008A590000904000-000000067F0000400200008A590000908000__0000001C760FA190 000000067F0000400200008A590000904000-000000067F0000400200008A590000908000__00000038E67ABFA0 
000000067F0000400200008A590000904000-000000067F0000400200008A590000908000__0000003903F1CFE8 000000067F0000400200008A590000904000-000000067F0000400200008A590000908000__0000003B99F7F8A0 000000067F0000400200008A590000904000-000000067F0000400200008A590000908000__0000005D2FFFFB38 000000067F0000400200008A590000904000-000000067F0000400200008A590000908000__00000073AD3FE6B8 000000067F0000400200008A590000904000-000000067F0000400200008A590000908000__000000914E3F38F0 000000067F0000400200008A590000904000-000000067F0000400200008A590000908000__000000931B33AE68 000000067F0000400200008A590000904000-000000067F0000400200008A590000908000__000000931B9AFDF8 000000067F0000400200008A590000908000-000000067F0000400200008A59000090C000__0000001C760FA190 000000067F0000400200008A590000908000-000000067F0000400200008A59000090C000__00000038E67ABFA0 000000067F0000400200008A590000908000-000000067F0000400200008A59000090C000__0000003903F1CFE8 000000067F0000400200008A590000908000-000000067F0000400200008A59000090C000__0000003B99F7F8A0 000000067F0000400200008A590000908000-000000067F0000400200008A59000090C000__0000005D2FFFFB38 000000067F0000400200008A590000908000-000000067F0000400200008A59000090C000__00000073AD3FE6B8 000000067F0000400200008A590000908000-000000067F0000400200008A59000090C000__000000914E3F38F0 000000067F0000400200008A590000908000-000000067F0000400200008A59000090C000__000000931B33AE68 000000067F0000400200008A590000908000-000000067F0000400200008A59000090C000__000000931B9AFDF8 000000067F0000400200008A59000090940E-000000067F0000400200008A590000911DE5__00000010392DE3B9-00000010E8D5E0A1 000000067F0000400200008A59000090C000-000000067F0000400200008A590000910000__0000001C760FA190 000000067F0000400200008A59000090C000-000000067F0000400200008A590000910000__00000038E67ABFA0 000000067F0000400200008A59000090C000-000000067F0000400200008A590000910000__0000003903F1CFE8 000000067F0000400200008A59000090C000-000000067F0000400200008A590000910000__0000003B99F7F8A0 
000000067F0000400200008A59000090C000-000000067F0000400200008A590000910000__0000005D2FFFFB38 000000067F0000400200008A59000090C000-000000067F0000400200008A590000910000__00000073AD3FE6B8 000000067F0000400200008A59000090C000-000000067F0000400200008A590000910000__000000914E3F38F0 000000067F0000400200008A59000090C000-000000067F0000400200008A590000910000__000000931B33AE68 000000067F0000400200008A59000090C000-000000067F0000400200008A590000910000__000000931B9AFDF8 000000067F0000400200008A590000910000-000000067F0000400200008A590000914000__0000001C760FA190 000000067F0000400200008A590000910000-000000067F0000400200008A590000914000__00000038E67ABFA0 000000067F0000400200008A590000910000-000000067F0000400200008A590000914000__0000003903F1CFE8 000000067F0000400200008A590000910000-000000067F0000400200008A590000914000__0000003B99F7F8A0 000000067F0000400200008A590000910000-000000067F0000400200008A590000914000__0000005D2FFFFB38 000000067F0000400200008A590000910000-000000067F0000400200008A590000914000__00000073AD3FE6B8 000000067F0000400200008A590000910000-000000067F0000400200008A590000914000__000000914E3F38F0 000000067F0000400200008A590000910000-000000067F0000400200008A590000914000__000000931B33AE68 000000067F0000400200008A590000910000-000000067F0000400200008A590000914000__000000931B9AFDF8 000000067F0000400200008A590000911DE5-000000067F0000400200008A59000091A7C5__00000010392DE3B9-00000010E8D5E0A1 000000067F0000400200008A590000914000-000000067F0000400200008A590000918000__0000001C760FA190 000000067F0000400200008A590000914000-000000067F0000400200008A590000918000__00000038E67ABFA0 000000067F0000400200008A590000914000-000000067F0000400200008A590000918000__0000003903F1CFE8 000000067F0000400200008A590000914000-000000067F0000400200008A590000918000__0000003B99F7F8A0 000000067F0000400200008A590000914000-000000067F0000400200008A590000918000__0000005D2FFFFB38 000000067F0000400200008A590000914000-000000067F0000400200008A590000918000__00000073AD3FE6B8 
000000067F0000400200008A590000914000-000000067F0000400200008A590000918000__000000914E3F38F0 000000067F0000400200008A590000914000-000000067F0000400200008A590000918000__000000931B33AE68 000000067F0000400200008A590000914000-000000067F0000400200008A590000918000__000000931B9AFDF8 000000067F0000400200008A590000918000-000000067F0000400200008A59000091C000__0000001C760FA190 000000067F0000400200008A590000918000-000000067F0000400200008A59000091C000__00000038E67ABFA0 000000067F0000400200008A590000918000-000000067F0000400200008A59000091C000__0000003903F1CFE8 000000067F0000400200008A590000918000-000000067F0000400200008A59000091C000__0000003B99F7F8A0 000000067F0000400200008A590000918000-000000067F0000400200008A59000091C000__0000005D2FFFFB38 000000067F0000400200008A590000918000-000000067F0000400200008A59000091C000__00000073AD3FE6B8 000000067F0000400200008A590000918000-000000067F0000400200008A59000091C000__000000914E3F38F0 000000067F0000400200008A590000918000-000000067F0000400200008A59000091C000__000000931B33AE68 000000067F0000400200008A590000918000-000000067F0000400200008A59000091C000__000000931B9AFDF8 000000067F0000400200008A59000091A7C5-000000067F0000400200008A590000923196__00000010392DE3B9-00000010E8D5E0A1 000000067F0000400200008A59000091C000-000000067F0000400200008A590000920000__0000001C760FA190 000000067F0000400200008A59000091C000-000000067F0000400200008A590000920000__00000038E67ABFA0 000000067F0000400200008A59000091C000-000000067F0000400200008A590000920000__0000003903F1CFE8 000000067F0000400200008A59000091C000-000000067F0000400200008A590000920000__0000003B99F7F8A0 000000067F0000400200008A59000091C000-000000067F0000400200008A590000920000__0000005D2FFFFB38 000000067F0000400200008A59000091C000-000000067F0000400200008A590000920000__00000073AD3FE6B8 000000067F0000400200008A59000091C000-000000067F0000400200008A590000920000__000000914E3F38F0 000000067F0000400200008A59000091C000-000000067F0000400200008A590000920000__000000931B33AE68 
000000067F0000400200008A59000091C000-000000067F0000400200008A590000920000__000000931B9AFDF8 000000067F0000400200008A590000920000-000000067F0000400200008A590000924000__0000001C725A2400 000000067F0000400200008A590000920000-000000067F0000400200008A590000924000__0000001C760FA190 000000067F0000400200008A590000920000-000000067F0000400200008A590000924000__00000038E67ABFA0 000000067F0000400200008A590000920000-000000067F0000400200008A590000924000__0000003903F1CFE8 000000067F0000400200008A590000920000-000000067F0000400200008A590000924000__0000003B99F7F8A0 000000067F0000400200008A590000920000-000000067F0000400200008A590000924000__0000005D2FFFFB38 000000067F0000400200008A590000920000-000000067F0000400200008A590000924000__00000073AD3FE6B8 000000067F0000400200008A590000920000-000000067F0000400200008A590000924000__000000914E3F38F0 000000067F0000400200008A590000920000-000000067F0000400200008A590000924000__000000931B33AE68 000000067F0000400200008A590000920000-000000067F0000400200008A590000924000__000000931B9AFDF8 000000067F0000400200008A590000923196-000000067F0000400200008A590100000000__00000010392DE3B9-00000010E8D5E0A1 000000067F0000400200008A5900009234AC-000000067F0000400200008A59000092BE93__00000010E8D5E0A1-00000011987BE139 000000067F0000400200008A590000924000-000000067F0000400200008A590000928000__0000001C725A2400 000000067F0000400200008A590000924000-000000067F0000400200008A590000928000__0000001C760FA190 000000067F0000400200008A590000924000-000000067F0000400200008A590000928000__00000038E67ABFA0 000000067F0000400200008A590000924000-000000067F0000400200008A590000928000__0000003903F1CFE8 000000067F0000400200008A590000924000-000000067F0000400200008A590000928000__0000003B99F7F8A0 000000067F0000400200008A590000924000-000000067F0000400200008A590000928000__0000005D2FFFFB38 000000067F0000400200008A590000924000-000000067F0000400200008A590000928000__00000073AD3FE6B8 000000067F0000400200008A590000924000-000000067F0000400200008A590000928000__000000914E3F38F0 
000000067F0000400200008A590000924000-000000067F0000400200008A590000928000__000000931B33AE68 000000067F0000400200008A590000924000-000000067F0000400200008A590000928000__000000931B9AFDF8 000000067F0000400200008A590000928000-000000067F0000400200008A59000092C000__0000001C725A2400 000000067F0000400200008A590000928000-000000067F0000400200008A59000092C000__0000001C760FA190 000000067F0000400200008A590000928000-000000067F0000400200008A59000092C000__00000038E67ABFA0 000000067F0000400200008A590000928000-000000067F0000400200008A59000092C000__0000003903F1CFE8 000000067F0000400200008A590000928000-000000067F0000400200008A59000092C000__0000003B99F7F8A0 000000067F0000400200008A590000928000-000000067F0000400200008A59000092C000__0000005D2FFFFB38 000000067F0000400200008A590000928000-000000067F0000400200008A59000092C000__00000073AD3FE6B8 000000067F0000400200008A590000928000-000000067F0000400200008A59000092C000__000000914E3F38F0 000000067F0000400200008A590000928000-000000067F0000400200008A59000092C000__000000931B33AE68 000000067F0000400200008A590000928000-000000067F0000400200008A59000092C000__000000931B9AFDF8 000000067F0000400200008A59000092BE93-000000067F0000400200008A590000934873__00000010E8D5E0A1-00000011987BE139 000000067F0000400200008A59000092C000-000000067F0000400200008A590000930000__0000001C725A2400 000000067F0000400200008A59000092C000-000000067F0000400200008A590000930000__0000001C760FA190 000000067F0000400200008A59000092C000-000000067F0000400200008A590000930000__00000038E67ABFA0 000000067F0000400200008A59000092C000-000000067F0000400200008A590000930000__0000003903F1CFE8 000000067F0000400200008A59000092C000-000000067F0000400200008A590000930000__0000003B99F7F8A0 000000067F0000400200008A59000092C000-000000067F0000400200008A590000930000__0000005D2FFFFB38 000000067F0000400200008A59000092C000-000000067F0000400200008A590000930000__00000073AD3FE6B8 000000067F0000400200008A59000092C000-000000067F0000400200008A590000930000__000000914E3F38F0 
000000067F0000400200008A59000092C000-000000067F0000400200008A590000930000__000000931B33AE68 000000067F0000400200008A59000092C000-000000067F0000400200008A590000930000__000000931B9AFDF8 000000067F0000400200008A590000930000-000000067F0000400200008A590000934000__0000001C725A2400 000000067F0000400200008A590000930000-000000067F0000400200008A590000934000__0000001C760FA190 000000067F0000400200008A590000930000-000000067F0000400200008A590000934000__00000038E67ABFA0 000000067F0000400200008A590000930000-000000067F0000400200008A590000934000__0000003903F1CFE8 000000067F0000400200008A590000930000-000000067F0000400200008A590000934000__0000003B99F7F8A0 000000067F0000400200008A590000930000-000000067F0000400200008A590000934000__0000005D2FFFFB38 000000067F0000400200008A590000930000-000000067F0000400200008A590000934000__00000073AD3FE6B8 000000067F0000400200008A590000930000-000000067F0000400200008A590000934000__000000914E3F38F0 000000067F0000400200008A590000930000-000000067F0000400200008A590000934000__000000931B33AE68 000000067F0000400200008A590000930000-000000067F0000400200008A590000934000__000000931B9AFDF8 000000067F0000400200008A590000934000-000000067F0000400200008A590000938000__0000001C725A2400 000000067F0000400200008A590000934000-000000067F0000400200008A590000938000__0000001C760FA190 000000067F0000400200008A590000934000-000000067F0000400200008A590000938000__00000038E67ABFA0 000000067F0000400200008A590000934000-000000067F0000400200008A590000938000__0000003903F1CFE8 000000067F0000400200008A590000934000-000000067F0000400200008A590000938000__0000003B99F7F8A0 000000067F0000400200008A590000934000-000000067F0000400200008A590000938000__0000005D2FFFFB38 000000067F0000400200008A590000934000-000000067F0000400200008A590000938000__00000073AD3FE6B8 000000067F0000400200008A590000934000-000000067F0000400200008A590000938000__000000914E3F38F0 000000067F0000400200008A590000934000-000000067F0000400200008A590000938000__000000931B33AE68 
000000067F0000400200008A590000934000-000000067F0000400200008A590000938000__000000931B9AFDF8 000000067F0000400200008A590000934873-000000067F0000400200008A59000093D24E__00000010E8D5E0A1-00000011987BE139 000000067F0000400200008A590000938000-000000067F0000400200008A59000093C000__0000001C725A2400 000000067F0000400200008A590000938000-000000067F0000400200008A59000093C000__0000001C760FA190 000000067F0000400200008A590000938000-000000067F0000400200008A59000093C000__00000038E67ABFA0 000000067F0000400200008A590000938000-000000067F0000400200008A59000093C000__0000003903F1CFE8 000000067F0000400200008A590000938000-000000067F0000400200008A59000093C000__0000003B99F7F8A0 000000067F0000400200008A590000938000-000000067F0000400200008A59000093C000__0000005D2FFFFB38 000000067F0000400200008A590000938000-000000067F0000400200008A59000093C000__00000073AD3FE6B8 000000067F0000400200008A590000938000-000000067F0000400200008A59000093C000__000000914E3F38F0 000000067F0000400200008A590000938000-000000067F0000400200008A59000093C000__000000931B33AE68 000000067F0000400200008A590000938000-000000067F0000400200008A59000093C000__000000931B9AFDF8 000000067F0000400200008A59000093C000-000000067F0000400200008A590000940000__0000001C725A2400 000000067F0000400200008A59000093C000-000000067F0000400200008A590000940000__0000001C760FA190 000000067F0000400200008A59000093C000-000000067F0000400200008A590000940000__00000038E67ABFA0 000000067F0000400200008A59000093C000-000000067F0000400200008A590000940000__0000003903F1CFE8 000000067F0000400200008A59000093C000-000000067F0000400200008A590000940000__0000003B99F7F8A0 000000067F0000400200008A59000093C000-000000067F0000400200008A590000940000__0000005D2FFFFB38 000000067F0000400200008A59000093C000-000000067F0000400200008A590000940000__00000073AD3FE6B8 000000067F0000400200008A59000093C000-000000067F0000400200008A590000940000__000000914E3F38F0 000000067F0000400200008A59000093C000-000000067F0000400200008A590000940000__000000931B33AE68 
000000067F0000400200008A59000093C000-000000067F0000400200008A590000940000__000000931B9AFDF8 000000067F0000400200008A59000093D24E-000000067F0000400200008A590000945C33__00000010E8D5E0A1-00000011987BE139 000000067F0000400200008A590000940000-000000067F0000400200008A590000944000__0000001C725A2400 000000067F0000400200008A590000940000-000000067F0000400200008A590000944000__0000001C760FA190 000000067F0000400200008A590000940000-000000067F0000400200008A590000944000__00000038E67ABFA0 000000067F0000400200008A590000940000-000000067F0000400200008A590000944000__0000003903F1CFE8 000000067F0000400200008A590000940000-000000067F0000400200008A590000944000__0000003B99F7F8A0 000000067F0000400200008A590000940000-000000067F0000400200008A590000944000__0000005D2FFFFB38 000000067F0000400200008A590000940000-000000067F0000400200008A590000944000__00000073AD3FE6B8 000000067F0000400200008A590000940000-000000067F0000400200008A590000944000__000000914E3F38F0 000000067F0000400200008A590000940000-000000067F0000400200008A590000944000__000000931B33AE68 000000067F0000400200008A590000940000-000000067F0000400200008A590000944000__000000931B9AFDF8 000000067F0000400200008A590000944000-000000067F0000400200008A590000948000__0000001C725A2400 000000067F0000400200008A590000944000-000000067F0000400200008A590000948000__0000001C760FA190 000000067F0000400200008A590000944000-000000067F0000400200008A590000948000__00000038E67ABFA0 000000067F0000400200008A590000944000-000000067F0000400200008A590000948000__0000003903F1CFE8 000000067F0000400200008A590000944000-000000067F0000400200008A590000948000__0000003B99F7F8A0 000000067F0000400200008A590000944000-000000067F0000400200008A590000948000__0000005D2FFFFB38 000000067F0000400200008A590000944000-000000067F0000400200008A590000948000__00000073AD3FE6B8 000000067F0000400200008A590000944000-000000067F0000400200008A590000948000__000000914E3F38F0 000000067F0000400200008A590000944000-000000067F0000400200008A590000948000__000000931B33AE68 
000000067F0000400200008A590000944000-000000067F0000400200008A590000948000__000000931B9AFDF8 000000067F0000400200008A590000945C33-000000067F0000400200008A59000094E60F__00000010E8D5E0A1-00000011987BE139 000000067F0000400200008A590000948000-000000067F0000400200008A59000094C000__0000001C725A2400 000000067F0000400200008A590000948000-000000067F0000400200008A59000094C000__0000001C760FA190 000000067F0000400200008A590000948000-000000067F0000400200008A59000094C000__00000038E67ABFA0 000000067F0000400200008A590000948000-000000067F0000400200008A59000094C000__0000003903F1CFE8 000000067F0000400200008A590000948000-000000067F0000400200008A59000094C000__0000003B99F7F8A0 000000067F0000400200008A590000948000-000000067F0000400200008A59000094C000__0000005D2FFFFB38 000000067F0000400200008A590000948000-000000067F0000400200008A59000094C000__00000073AD3FE6B8 000000067F0000400200008A590000948000-000000067F0000400200008A59000094C000__000000914E3F38F0 000000067F0000400200008A590000948000-000000067F0000400200008A59000094C000__000000931B33AE68 000000067F0000400200008A590000948000-000000067F0000400200008A59000094C000__000000931B9AFDF8 000000067F0000400200008A59000094C000-000000067F0000400200008A590000950000__0000001C725A2400 000000067F0000400200008A59000094C000-000000067F0000400200008A590000950000__0000001C760FA190 000000067F0000400200008A59000094C000-000000067F0000400200008A590000950000__00000038E67ABFA0 000000067F0000400200008A59000094C000-000000067F0000400200008A590000950000__0000003903F1CFE8 000000067F0000400200008A59000094C000-000000067F0000400200008A590000950000__0000003B99F7F8A0 000000067F0000400200008A59000094C000-000000067F0000400200008A590000950000__0000005D2FFFFB38 000000067F0000400200008A59000094C000-000000067F0000400200008A590000950000__00000073AD3FE6B8 000000067F0000400200008A59000094C000-000000067F0000400200008A590000950000__000000914E3F38F0 000000067F0000400200008A59000094C000-000000067F0000400200008A590000950000__000000931B33AE68 
000000067F0000400200008A59000094C000-000000067F0000400200008A590000950000__000000931B9AFDF8 000000067F0000400200008A59000094E60F-000000067F0000400200008A590000956FE7__00000010E8D5E0A1-00000011987BE139 000000067F0000400200008A590000950000-000000067F0000400200008A590000954000__0000001C725A2400 000000067F0000400200008A590000950000-000000067F0000400200008A590000954000__0000001C760FA190 000000067F0000400200008A590000950000-000000067F0000400200008A590000954000__00000038E67ABFA0 000000067F0000400200008A590000950000-000000067F0000400200008A590000954000__0000003903F1CFE8 000000067F0000400200008A590000950000-000000067F0000400200008A590000954000__0000003B99F7F8A0 000000067F0000400200008A590000950000-000000067F0000400200008A590000954000__0000005D2FFFFB38 000000067F0000400200008A590000950000-000000067F0000400200008A590000954000__00000073AD3FE6B8 000000067F0000400200008A590000950000-000000067F0000400200008A590000954000__000000914E3F38F0 000000067F0000400200008A590000950000-000000067F0000400200008A590000954000__000000931B33AE68 000000067F0000400200008A590000950000-000000067F0000400200008A590000954000__000000931B9AFDF8 000000067F0000400200008A590000954000-000000067F0000400200008A590000958000__0000001C725A2400 000000067F0000400200008A590000954000-000000067F0000400200008A590000958000__0000001C760FA190 000000067F0000400200008A590000954000-000000067F0000400200008A590000958000__00000038E67ABFA0 000000067F0000400200008A590000954000-000000067F0000400200008A590000958000__0000003903F1CFE8 000000067F0000400200008A590000954000-000000067F0000400200008A590000958000__0000003B99F7F8A0 000000067F0000400200008A590000954000-000000067F0000400200008A590000958000__0000005D2FFFFB38 000000067F0000400200008A590000954000-000000067F0000400200008A590000958000__00000073AD3FE6B8 000000067F0000400200008A590000954000-000000067F0000400200008A590000958000__000000914E3F38F0 000000067F0000400200008A590000954000-000000067F0000400200008A590000958000__000000931B33AE68 
000000067F0000400200008A590000954000-000000067F0000400200008A590000958000__000000931B9AFDF8 000000067F0000400200008A590000956FE7-000000067F0000400200008A59000095F9C6__00000010E8D5E0A1-00000011987BE139 000000067F0000400200008A590000958000-000000067F0000400200008A59000095C000__0000001C725A2400 000000067F0000400200008A590000958000-000000067F0000400200008A59000095C000__0000001C760FA190 000000067F0000400200008A590000958000-000000067F0000400200008A59000095C000__00000038E67ABFA0 000000067F0000400200008A590000958000-000000067F0000400200008A59000095C000__0000003903F1CFE8 000000067F0000400200008A590000958000-000000067F0000400200008A59000095C000__0000003B99F7F8A0 000000067F0000400200008A590000958000-000000067F0000400200008A59000095C000__0000005D2FFFFB38 000000067F0000400200008A590000958000-000000067F0000400200008A59000095C000__00000073AD3FE6B8 000000067F0000400200008A590000958000-000000067F0000400200008A59000095C000__000000914E3F38F0 000000067F0000400200008A590000958000-000000067F0000400200008A59000095C000__000000931B33AE68 000000067F0000400200008A590000958000-000000067F0000400200008A59000095C000__000000931B9AFDF8 000000067F0000400200008A59000095C000-000000067F0000400200008A590000960000__0000001C725A2400 000000067F0000400200008A59000095C000-000000067F0000400200008A590000960000__0000001C760FA190 000000067F0000400200008A59000095C000-000000067F0000400200008A590000960000__00000038E67ABFA0 000000067F0000400200008A59000095C000-000000067F0000400200008A590000960000__0000003903F1CFE8 000000067F0000400200008A59000095C000-000000067F0000400200008A590000960000__0000003B99F7F8A0 000000067F0000400200008A59000095C000-000000067F0000400200008A590000960000__0000005D2FFFFB38 000000067F0000400200008A59000095C000-000000067F0000400200008A590000960000__00000073AD3FE6B8 000000067F0000400200008A59000095C000-000000067F0000400200008A590000960000__000000914E3F38F0 000000067F0000400200008A59000095C000-000000067F0000400200008A590000960000__000000931B33AE68 
000000067F0000400200008A59000095C000-000000067F0000400200008A590000960000__000000931B9AFDF8 000000067F0000400200008A59000095F9C6-000000067F0000400200008A5900009683A0__00000010E8D5E0A1-00000011987BE139 000000067F0000400200008A590000960000-000000067F0000400200008A590000964000__0000001C725A2400 000000067F0000400200008A590000960000-000000067F0000400200008A590000964000__0000001C760FA190 000000067F0000400200008A590000960000-000000067F0000400200008A590000964000__00000038E67ABFA0 000000067F0000400200008A590000960000-000000067F0000400200008A590000964000__0000003903F1CFE8 000000067F0000400200008A590000960000-000000067F0000400200008A590000964000__0000003B99F7F8A0 000000067F0000400200008A590000960000-000000067F0000400200008A590000964000__0000005D2FFFFB38 000000067F0000400200008A590000960000-000000067F0000400200008A590000964000__00000073AD3FE6B8 000000067F0000400200008A590000960000-000000067F0000400200008A590000964000__000000914E3F38F0 000000067F0000400200008A590000960000-000000067F0000400200008A590000964000__000000931B33AE68 000000067F0000400200008A590000960000-000000067F0000400200008A590000964000__000000931B9AFDF8 000000067F0000400200008A590000964000-000000067F0000400200008A590000968000__0000001C725A2400 000000067F0000400200008A590000964000-000000067F0000400200008A590000968000__0000001C760FA190 000000067F0000400200008A590000964000-000000067F0000400200008A590000968000__00000038E67ABFA0 000000067F0000400200008A590000964000-000000067F0000400200008A590000968000__0000003903F1CFE8 000000067F0000400200008A590000964000-000000067F0000400200008A590000968000__0000003B99F7F8A0 000000067F0000400200008A590000964000-000000067F0000400200008A590000968000__0000005D2FFFFB38 000000067F0000400200008A590000964000-000000067F0000400200008A590000968000__00000073AD3FE6B8 000000067F0000400200008A590000964000-000000067F0000400200008A590000968000__000000914E3F38F0 000000067F0000400200008A590000964000-000000067F0000400200008A590000968000__000000931B33AE68 
000000067F0000400200008A590000964000-000000067F0000400200008A590000968000__000000931B9AFDF8 000000067F0000400200008A590000968000-000000067F0000400200008A59000096C000__0000001C725A2400 000000067F0000400200008A590000968000-000000067F0000400200008A59000096C000__0000001C760FA190 000000067F0000400200008A590000968000-000000067F0000400200008A59000096C000__00000038E67ABFA0 000000067F0000400200008A590000968000-000000067F0000400200008A59000096C000__0000003903F1CFE8 000000067F0000400200008A590000968000-000000067F0000400200008A59000096C000__0000003B99F7F8A0 000000067F0000400200008A590000968000-000000067F0000400200008A59000096C000__0000005D2FFFFB38 000000067F0000400200008A590000968000-000000067F0000400200008A59000096C000__00000073AD3FE6B8 000000067F0000400200008A590000968000-000000067F0000400200008A59000096C000__000000914E3F38F0 000000067F0000400200008A590000968000-000000067F0000400200008A59000096C000__000000931B33AE68 000000067F0000400200008A590000968000-000000067F0000400200008A59000096C000__000000931B9AFDF8 000000067F0000400200008A5900009683A0-000000067F0000400200008A590000970D7B__00000010E8D5E0A1-00000011987BE139 000000067F0000400200008A59000096C000-000000067F0000400200008A590000970000__0000001C725A2400 000000067F0000400200008A59000096C000-000000067F0000400200008A590000970000__0000001C760FA190 000000067F0000400200008A59000096C000-000000067F0000400200008A590000970000__00000038E67ABFA0 000000067F0000400200008A59000096C000-000000067F0000400200008A590000970000__0000003903F1CFE8 000000067F0000400200008A59000096C000-000000067F0000400200008A590000970000__0000003B99F7F8A0 000000067F0000400200008A59000096C000-000000067F0000400200008A590000970000__0000005D2FFFFB38 000000067F0000400200008A59000096C000-000000067F0000400200008A590000970000__00000073AD3FE6B8 000000067F0000400200008A59000096C000-000000067F0000400200008A590000970000__000000914E3F38F0 000000067F0000400200008A59000096C000-000000067F0000400200008A590000970000__000000931B33AE68 
000000067F0000400200008A59000096C000-000000067F0000400200008A590000970000__000000931B9AFDF8 000000067F0000400200008A590000970000-000000067F0000400200008A590000974000__0000001C725A2400 000000067F0000400200008A590000970000-000000067F0000400200008A590000974000__0000001C760FA190 000000067F0000400200008A590000970000-000000067F0000400200008A590000974000__00000038E67ABFA0 000000067F0000400200008A590000970000-000000067F0000400200008A590000974000__0000003903F1CFE8 000000067F0000400200008A590000970000-000000067F0000400200008A590000974000__0000003B99F7F8A0 000000067F0000400200008A590000970000-000000067F0000400200008A590000974000__0000005D2FFFFB38 000000067F0000400200008A590000970000-000000067F0000400200008A590000974000__00000073AD3FE6B8 000000067F0000400200008A590000970000-000000067F0000400200008A590000974000__000000914E3F38F0 000000067F0000400200008A590000970000-000000067F0000400200008A590000974000__000000931B33AE68 000000067F0000400200008A590000970000-000000067F0000400200008A590000974000__000000931B9AFDF8 000000067F0000400200008A590000970D7B-000000067F0000400200008A590000979751__00000010E8D5E0A1-00000011987BE139 000000067F0000400200008A590000974000-000000067F0000400200008A590000978000__0000001C725A2400 000000067F0000400200008A590000974000-000000067F0000400200008A590000978000__0000001C760FA190 000000067F0000400200008A590000974000-000000067F0000400200008A590000978000__00000038E67ABFA0 000000067F0000400200008A590000974000-000000067F0000400200008A590000978000__0000003903F1CFE8 000000067F0000400200008A590000974000-000000067F0000400200008A590000978000__0000003B99F7F8A0 000000067F0000400200008A590000974000-000000067F0000400200008A590000978000__0000005D2FFFFB38 000000067F0000400200008A590000974000-000000067F0000400200008A590000978000__00000073AD3FE6B8 000000067F0000400200008A590000974000-000000067F0000400200008A590000978000__000000914E3F38F0 000000067F0000400200008A590000974000-000000067F0000400200008A590000978000__000000931B33AE68 
000000067F0000400200008A590000974000-000000067F0000400200008A590000978000__000000931B9AFDF8 000000067F0000400200008A590000978000-000000067F0000400200008A59000097C000__0000001C725A2400 000000067F0000400200008A590000978000-000000067F0000400200008A59000097C000__0000001C760FA190 000000067F0000400200008A590000978000-000000067F0000400200008A59000097C000__00000038E67ABFA0 000000067F0000400200008A590000978000-000000067F0000400200008A59000097C000__0000003903F1CFE8 000000067F0000400200008A590000978000-000000067F0000400200008A59000097C000__0000003B99F7F8A0 000000067F0000400200008A590000978000-000000067F0000400200008A59000097C000__0000005D2FFFFB38 000000067F0000400200008A590000978000-000000067F0000400200008A59000097C000__00000073AD3FE6B8 000000067F0000400200008A590000978000-000000067F0000400200008A59000097C000__000000914E3F38F0 000000067F0000400200008A590000978000-000000067F0000400200008A59000097C000__000000931B33AE68 000000067F0000400200008A590000978000-000000067F0000400200008A59000097C000__000000931B9AFDF8 000000067F0000400200008A590000979751-000000067F0000400200008A590000982136__00000010E8D5E0A1-00000011987BE139 000000067F0000400200008A59000097C000-000000067F0000400200008A590000980000__0000001C725A2400 000000067F0000400200008A59000097C000-000000067F0000400200008A590000980000__0000001C760FA190 000000067F0000400200008A59000097C000-000000067F0000400200008A590000980000__00000038E67ABFA0 000000067F0000400200008A59000097C000-000000067F0000400200008A590000980000__0000003903F1CFE8 000000067F0000400200008A59000097C000-000000067F0000400200008A590000980000__0000003B99F7F8A0 000000067F0000400200008A59000097C000-000000067F0000400200008A590000980000__0000005D2FFFFB38 000000067F0000400200008A59000097C000-000000067F0000400200008A590000980000__00000073AD3FE6B8 000000067F0000400200008A59000097C000-000000067F0000400200008A590000980000__000000914E3F38F0 000000067F0000400200008A59000097C000-000000067F0000400200008A590000980000__000000931B33AE68 
000000067F0000400200008A59000097C000-000000067F0000400200008A590000980000__000000931B9AFDF8 000000067F0000400200008A590000980000-000000067F0000400200008A590000984000__000000127811CCF0 000000067F0000400200008A590000980000-000000067F0000400200008A590000984000__0000001C760FA190 000000067F0000400200008A590000980000-000000067F0000400200008A590000984000__00000038E67ABFA0 000000067F0000400200008A590000980000-000000067F0000400200008A590000984000__0000003903F1CFE8 000000067F0000400200008A590000980000-000000067F0000400200008A590000984000__0000003B99F7F8A0 000000067F0000400200008A590000980000-000000067F0000400200008A590000984000__0000005D2FFFFB38 000000067F0000400200008A590000980000-000000067F0000400200008A590000984000__00000073AD3FE6B8 000000067F0000400200008A590000980000-000000067F0000400200008A590000984000__000000914E3F38F0 000000067F0000400200008A590000980000-000000067F0000400200008A590000984000__000000931B33AE68 000000067F0000400200008A590000980000-000000067F0000400200008A590000984000__000000931B9AFDF8 000000067F0000400200008A590000982136-000000067F0000400200008A590100000000__00000010E8D5E0A1-00000011987BE139 000000067F0000400200008A590000982445-000000067F0000400200008A59000098AE24__00000011987BE139-000000124823FE31 000000067F0000400200008A590000984000-000000067F0000400200008A590000988000__000000127811CCF0 000000067F0000400200008A590000984000-000000067F0000400200008A590000988000__0000001C760FA190 000000067F0000400200008A590000984000-000000067F0000400200008A590000988000__00000038E67ABFA0 000000067F0000400200008A590000984000-000000067F0000400200008A590000988000__0000003903F1CFE8 000000067F0000400200008A590000984000-000000067F0000400200008A590000988000__0000003B99F7F8A0 000000067F0000400200008A590000984000-000000067F0000400200008A590000988000__0000005D2FFFFB38 000000067F0000400200008A590000984000-000000067F0000400200008A590000988000__00000073AD3FE6B8 000000067F0000400200008A590000984000-000000067F0000400200008A590000988000__000000914E3F38F0 
000000067F0000400200008A590000984000-000000067F0000400200008A590000988000__000000931B33AE68 000000067F0000400200008A590000984000-000000067F0000400200008A590000988000__000000931B9AFDF8 000000067F0000400200008A590000988000-000000067F0000400200008A59000098C000__000000127811CCF0 000000067F0000400200008A590000988000-000000067F0000400200008A59000098C000__0000001C760FA190 000000067F0000400200008A590000988000-000000067F0000400200008A59000098C000__00000038E67ABFA0 000000067F0000400200008A590000988000-000000067F0000400200008A59000098C000__0000003903F1CFE8 000000067F0000400200008A590000988000-000000067F0000400200008A59000098C000__0000003B99F7F8A0 000000067F0000400200008A590000988000-000000067F0000400200008A59000098C000__0000005D2FFFFB38 000000067F0000400200008A590000988000-000000067F0000400200008A59000098C000__00000073AD3FE6B8 000000067F0000400200008A590000988000-000000067F0000400200008A59000098C000__000000914E3F38F0 000000067F0000400200008A590000988000-000000067F0000400200008A59000098C000__000000931B33AE68 000000067F0000400200008A590000988000-000000067F0000400200008A59000098C000__000000931B9AFDF8 000000067F0000400200008A59000098AE24-000000067F0000400200008A5900009937FD__00000011987BE139-000000124823FE31 000000067F0000400200008A59000098C000-000000067F0000400200008A590000990000__000000127811CCF0 000000067F0000400200008A59000098C000-000000067F0000400200008A590000990000__0000001C760FA190 000000067F0000400200008A59000098C000-000000067F0000400200008A590000990000__00000038E67ABFA0 000000067F0000400200008A59000098C000-000000067F0000400200008A590000990000__0000003903F1CFE8 000000067F0000400200008A59000098C000-000000067F0000400200008A590000990000__0000003B99F7F8A0 000000067F0000400200008A59000098C000-000000067F0000400200008A590000990000__0000005D2FFFFB38 000000067F0000400200008A59000098C000-000000067F0000400200008A590000990000__00000073AD3FE6B8 000000067F0000400200008A59000098C000-000000067F0000400200008A590000990000__000000914E3F38F0 
000000067F0000400200008A59000098C000-000000067F0000400200008A590000990000__000000931B33AE68 000000067F0000400200008A59000098C000-000000067F0000400200008A590000990000__000000931B9AFDF8 000000067F0000400200008A590000990000-000000067F0000400200008A590000994000__000000127811CCF0 000000067F0000400200008A590000990000-000000067F0000400200008A590000994000__0000001C760FA190 000000067F0000400200008A590000990000-000000067F0000400200008A590000994000__00000038E67ABFA0 000000067F0000400200008A590000990000-000000067F0000400200008A590000994000__0000003903F1CFE8 000000067F0000400200008A590000990000-000000067F0000400200008A590000994000__0000003B99F7F8A0 000000067F0000400200008A590000990000-000000067F0000400200008A590000994000__0000005D2FFFFB38 000000067F0000400200008A590000990000-000000067F0000400200008A590000994000__00000073AD3FE6B8 000000067F0000400200008A590000990000-000000067F0000400200008A590000994000__000000914E3F38F0 000000067F0000400200008A590000990000-000000067F0000400200008A590000994000__000000931B33AE68 000000067F0000400200008A590000990000-000000067F0000400200008A590000994000__000000931B9AFDF8 000000067F0000400200008A5900009937FD-000000067F0000400200008A59000099C1DF__00000011987BE139-000000124823FE31 000000067F0000400200008A590000994000-000000067F0000400200008A590000998000__000000127811CCF0 000000067F0000400200008A590000994000-000000067F0000400200008A590000998000__0000001C760FA190 000000067F0000400200008A590000994000-000000067F0000400200008A590000998000__00000038E67ABFA0 000000067F0000400200008A590000994000-000000067F0000400200008A590000998000__0000003903F1CFE8 000000067F0000400200008A590000994000-000000067F0000400200008A590000998000__0000003B99F7F8A0 000000067F0000400200008A590000994000-000000067F0000400200008A590000998000__0000005D2FFFFB38 000000067F0000400200008A590000994000-000000067F0000400200008A590000998000__00000073AD3FE6B8 000000067F0000400200008A590000994000-000000067F0000400200008A590000998000__000000914E3F38F0 
000000067F0000400200008A590000994000-000000067F0000400200008A590000998000__000000931B33AE68 000000067F0000400200008A590000994000-000000067F0000400200008A590000998000__000000931B9AFDF8 000000067F0000400200008A590000998000-000000067F0000400200008A59000099C000__000000127811CCF0 000000067F0000400200008A590000998000-000000067F0000400200008A59000099C000__0000001C760FA190 000000067F0000400200008A590000998000-000000067F0000400200008A59000099C000__00000038E67ABFA0 000000067F0000400200008A590000998000-000000067F0000400200008A59000099C000__0000003903F1CFE8 000000067F0000400200008A590000998000-000000067F0000400200008A59000099C000__0000003B99F7F8A0 000000067F0000400200008A590000998000-000000067F0000400200008A59000099C000__0000005D2FFFFB38 000000067F0000400200008A590000998000-000000067F0000400200008A59000099C000__00000073AD3FE6B8 000000067F0000400200008A590000998000-000000067F0000400200008A59000099C000__000000914E3F38F0 000000067F0000400200008A590000998000-000000067F0000400200008A59000099C000__000000931B33AE68 000000067F0000400200008A590000998000-000000067F0000400200008A59000099C000__000000931B9AFDF8 000000067F0000400200008A59000099C000-000000067F0000400200008A5900009A0000__000000127811CCF0 000000067F0000400200008A59000099C000-000000067F0000400200008A5900009A0000__0000001C760FA190 000000067F0000400200008A59000099C000-000000067F0000400200008A5900009A0000__00000038E67ABFA0 000000067F0000400200008A59000099C000-000000067F0000400200008A5900009A0000__0000003903F1CFE8 000000067F0000400200008A59000099C000-000000067F0000400200008A5900009A0000__0000003B99F7F8A0 000000067F0000400200008A59000099C000-000000067F0000400200008A5900009A0000__0000005D2FFFFB38 000000067F0000400200008A59000099C000-000000067F0000400200008A5900009A0000__00000073AD3FE6B8 000000067F0000400200008A59000099C000-000000067F0000400200008A5900009A0000__000000914E3F38F0 000000067F0000400200008A59000099C000-000000067F0000400200008A5900009A0000__000000931B33AE68 
000000067F0000400200008A59000099C000-000000067F0000400200008A5900009A0000__000000931B9AFDF8 000000067F0000400200008A59000099C1DF-000000067F0000400200008A5900009A4BBB__00000011987BE139-000000124823FE31 000000067F0000400200008A5900009A0000-000000067F0000400200008A5900009A4000__000000127811CCF0 000000067F0000400200008A5900009A0000-000000067F0000400200008A5900009A4000__0000001C760FA190 000000067F0000400200008A5900009A0000-000000067F0000400200008A5900009A4000__00000038E67ABFA0 000000067F0000400200008A5900009A0000-000000067F0000400200008A5900009A4000__0000003903F1CFE8 000000067F0000400200008A5900009A0000-000000067F0000400200008A5900009A4000__0000003B99F7F8A0 000000067F0000400200008A5900009A0000-000000067F0000400200008A5900009A4000__0000005D2FFFFB38 000000067F0000400200008A5900009A0000-000000067F0000400200008A5900009A4000__00000073AD3FE6B8 000000067F0000400200008A5900009A0000-000000067F0000400200008A5900009A4000__000000914E3F38F0 000000067F0000400200008A5900009A0000-000000067F0000400200008A5900009A4000__000000931B33AE68 000000067F0000400200008A5900009A0000-000000067F0000400200008A5900009A4000__000000931B9AFDF8 000000067F0000400200008A5900009A4000-000000067F0000400200008A5900009A8000__000000127811CCF0 000000067F0000400200008A5900009A4000-000000067F0000400200008A5900009A8000__0000001C760FA190 000000067F0000400200008A5900009A4000-000000067F0000400200008A5900009A8000__00000038E67ABFA0 000000067F0000400200008A5900009A4000-000000067F0000400200008A5900009A8000__0000003903F1CFE8 000000067F0000400200008A5900009A4000-000000067F0000400200008A5900009A8000__0000003B99F7F8A0 000000067F0000400200008A5900009A4000-000000067F0000400200008A5900009A8000__0000005D2FFFFB38 000000067F0000400200008A5900009A4000-000000067F0000400200008A5900009A8000__00000073AD3FE6B8 000000067F0000400200008A5900009A4000-000000067F0000400200008A5900009A8000__000000914E3F38F0 000000067F0000400200008A5900009A4000-000000067F0000400200008A5900009A8000__000000931B33AE68 
000000067F0000400200008A5900009A4000-000000067F0000400200008A5900009A8000__000000931B9AFDF8 000000067F0000400200008A5900009A4BBB-000000067F0000400200008A5900009AD590__00000011987BE139-000000124823FE31 000000067F0000400200008A5900009A8000-000000067F0000400200008A5900009AC000__000000127811CCF0 000000067F0000400200008A5900009A8000-000000067F0000400200008A5900009AC000__0000001C760FA190 000000067F0000400200008A5900009A8000-000000067F0000400200008A5900009AC000__00000038E67ABFA0 000000067F0000400200008A5900009A8000-000000067F0000400200008A5900009AC000__0000003903F1CFE8 000000067F0000400200008A5900009A8000-000000067F0000400200008A5900009AC000__0000003B99F7F8A0 000000067F0000400200008A5900009A8000-000000067F0000400200008A5900009AC000__0000005D2FFFFB38 000000067F0000400200008A5900009A8000-000000067F0000400200008A5900009AC000__00000073AD3FE6B8 000000067F0000400200008A5900009A8000-000000067F0000400200008A5900009AC000__000000914E3F38F0 000000067F0000400200008A5900009A8000-000000067F0000400200008A5900009AC000__000000931B33AE68 000000067F0000400200008A5900009A8000-000000067F0000400200008A5900009AC000__000000931B9AFDF8 000000067F0000400200008A5900009AC000-000000067F0000400200008A5900009B0000__000000127811CCF0 000000067F0000400200008A5900009AC000-000000067F0000400200008A5900009B0000__0000001C760FA190 000000067F0000400200008A5900009AC000-000000067F0000400200008A5900009B0000__00000038E67ABFA0 000000067F0000400200008A5900009AC000-000000067F0000400200008A5900009B0000__0000003903F1CFE8 000000067F0000400200008A5900009AC000-000000067F0000400200008A5900009B0000__0000003B99F7F8A0 000000067F0000400200008A5900009AC000-000000067F0000400200008A5900009B0000__0000005D2FFFFB38 000000067F0000400200008A5900009AC000-000000067F0000400200008A5900009B0000__00000073AD3FE6B8 000000067F0000400200008A5900009AC000-000000067F0000400200008A5900009B0000__000000914E3F38F0 000000067F0000400200008A5900009AC000-000000067F0000400200008A5900009B0000__000000931B33AE68 
000000067F0000400200008A5900009AC000-000000067F0000400200008A5900009B0000__000000931B9AFDF8 000000067F0000400200008A5900009AD590-000000067F0000400200008A5900009B5F72__00000011987BE139-000000124823FE31 000000067F0000400200008A5900009B0000-000000067F0000400200008A5900009B4000__000000127811CCF0 000000067F0000400200008A5900009B0000-000000067F0000400200008A5900009B4000__0000001C760FA190 000000067F0000400200008A5900009B0000-000000067F0000400200008A5900009B4000__00000038E67ABFA0 000000067F0000400200008A5900009B0000-000000067F0000400200008A5900009B4000__0000003903F1CFE8 000000067F0000400200008A5900009B0000-000000067F0000400200008A5900009B4000__0000003B99F7F8A0 000000067F0000400200008A5900009B0000-000000067F0000400200008A5900009B4000__0000005D2FFFFB38 000000067F0000400200008A5900009B0000-000000067F0000400200008A5900009B4000__00000073AD3FE6B8 000000067F0000400200008A5900009B0000-000000067F0000400200008A5900009B4000__000000914E3F38F0 000000067F0000400200008A5900009B0000-000000067F0000400200008A5900009B4000__000000931B33AE68 000000067F0000400200008A5900009B0000-000000067F0000400200008A5900009B4000__000000931B9AFDF8 000000067F0000400200008A5900009B4000-000000067F0000400200008A5900009B8000__000000127811CCF0 000000067F0000400200008A5900009B4000-000000067F0000400200008A5900009B8000__0000001C760FA190 000000067F0000400200008A5900009B4000-000000067F0000400200008A5900009B8000__00000038E67ABFA0 000000067F0000400200008A5900009B4000-000000067F0000400200008A5900009B8000__0000003903F1CFE8 000000067F0000400200008A5900009B4000-000000067F0000400200008A5900009B8000__0000003B99F7F8A0 000000067F0000400200008A5900009B4000-000000067F0000400200008A5900009B8000__0000005D2FFFFB38 000000067F0000400200008A5900009B4000-000000067F0000400200008A5900009B8000__00000073AD3FE6B8 000000067F0000400200008A5900009B4000-000000067F0000400200008A5900009B8000__000000914E3F38F0 000000067F0000400200008A5900009B4000-000000067F0000400200008A5900009B8000__000000931B33AE68 
000000067F0000400200008A5900009B4000-000000067F0000400200008A5900009B8000__000000931B9AFDF8 000000067F0000400200008A5900009B5F72-000000067F0000400200008A5900009BE956__00000011987BE139-000000124823FE31 000000067F0000400200008A5900009B8000-000000067F0000400200008A5900009BC000__000000127811CCF0 000000067F0000400200008A5900009B8000-000000067F0000400200008A5900009BC000__0000001C760FA190 000000067F0000400200008A5900009B8000-000000067F0000400200008A5900009BC000__00000038E67ABFA0 000000067F0000400200008A5900009B8000-000000067F0000400200008A5900009BC000__0000003903F1CFE8 000000067F0000400200008A5900009B8000-000000067F0000400200008A5900009BC000__0000003B99F7F8A0 000000067F0000400200008A5900009B8000-000000067F0000400200008A5900009BC000__0000005D2FFFFB38 000000067F0000400200008A5900009B8000-000000067F0000400200008A5900009BC000__00000073AD3FE6B8 000000067F0000400200008A5900009B8000-000000067F0000400200008A5900009BC000__000000914E3F38F0 000000067F0000400200008A5900009B8000-000000067F0000400200008A5900009BC000__000000931B33AE68 000000067F0000400200008A5900009B8000-000000067F0000400200008A5900009BC000__000000931B9AFDF8 000000067F0000400200008A5900009BC000-000000067F0000400200008A5900009C0000__000000127811CCF0 000000067F0000400200008A5900009BC000-000000067F0000400200008A5900009C0000__0000001C760FA190 000000067F0000400200008A5900009BC000-000000067F0000400200008A5900009C0000__00000038E67ABFA0 000000067F0000400200008A5900009BC000-000000067F0000400200008A5900009C0000__0000003903F1CFE8 000000067F0000400200008A5900009BC000-000000067F0000400200008A5900009C0000__0000003B99F7F8A0 000000067F0000400200008A5900009BC000-000000067F0000400200008A5900009C0000__0000005D2FFFFB38 000000067F0000400200008A5900009BC000-000000067F0000400200008A5900009C0000__00000073AD3FE6B8 000000067F0000400200008A5900009BC000-000000067F0000400200008A5900009C0000__000000914E3F38F0 000000067F0000400200008A5900009BC000-000000067F0000400200008A5900009C0000__000000931B33AE68 
000000067F0000400200008A5900009BC000-000000067F0000400200008A5900009C0000__000000931B9AFDF8 000000067F0000400200008A5900009BE956-000000067F0000400200008A5900009C7338__00000011987BE139-000000124823FE31 000000067F0000400200008A5900009C0000-000000067F0000400200008A5900009C4000__000000127811CCF0 000000067F0000400200008A5900009C0000-000000067F0000400200008A5900009C4000__0000001C760FA190 000000067F0000400200008A5900009C0000-000000067F0000400200008A5900009C4000__00000038E67ABFA0 000000067F0000400200008A5900009C0000-000000067F0000400200008A5900009C4000__0000003903F1CFE8 000000067F0000400200008A5900009C0000-000000067F0000400200008A5900009C4000__0000003B99F7F8A0 000000067F0000400200008A5900009C0000-000000067F0000400200008A5900009C4000__0000005D2FFFFB38 000000067F0000400200008A5900009C0000-000000067F0000400200008A5900009C4000__00000073AD3FE6B8 000000067F0000400200008A5900009C0000-000000067F0000400200008A5900009C4000__000000914E3F38F0 000000067F0000400200008A5900009C0000-000000067F0000400200008A5900009C4000__000000931B33AE68 000000067F0000400200008A5900009C0000-000000067F0000400200008A5900009C4000__000000931B9AFDF8 000000067F0000400200008A5900009C4000-000000067F0000400200008A5900009C8000__000000127811CCF0 000000067F0000400200008A5900009C4000-000000067F0000400200008A5900009C8000__0000001C760FA190 000000067F0000400200008A5900009C4000-000000067F0000400200008A5900009C8000__00000038E67ABFA0 000000067F0000400200008A5900009C4000-000000067F0000400200008A5900009C8000__0000003903F1CFE8 000000067F0000400200008A5900009C4000-000000067F0000400200008A5900009C8000__0000003B99F7F8A0 000000067F0000400200008A5900009C4000-000000067F0000400200008A5900009C8000__0000005D2FFFFB38 000000067F0000400200008A5900009C4000-000000067F0000400200008A5900009C8000__00000073AD3FE6B8 000000067F0000400200008A5900009C4000-000000067F0000400200008A5900009C8000__000000914E3F38F0 000000067F0000400200008A5900009C4000-000000067F0000400200008A5900009C8000__000000931B33AE68 
000000067F0000400200008A5900009C4000-000000067F0000400200008A5900009C8000__000000931B9AFDF8 000000067F0000400200008A5900009C7338-000000067F0000400200008A5900009CFD0C__00000011987BE139-000000124823FE31 000000067F0000400200008A5900009C8000-000000067F0000400200008A5900009CC000__000000127811CCF0 000000067F0000400200008A5900009C8000-000000067F0000400200008A5900009CC000__0000001C760FA190 000000067F0000400200008A5900009C8000-000000067F0000400200008A5900009CC000__00000038E67ABFA0 000000067F0000400200008A5900009C8000-000000067F0000400200008A5900009CC000__0000003903F1CFE8 000000067F0000400200008A5900009C8000-000000067F0000400200008A5900009CC000__0000003B99F7F8A0 000000067F0000400200008A5900009C8000-000000067F0000400200008A5900009CC000__0000005D2FFFFB38 000000067F0000400200008A5900009C8000-000000067F0000400200008A5900009CC000__00000073AD3FE6B8 000000067F0000400200008A5900009C8000-000000067F0000400200008A5900009CC000__000000914E3F38F0 000000067F0000400200008A5900009C8000-000000067F0000400200008A5900009CC000__000000931B33AE68 000000067F0000400200008A5900009C8000-000000067F0000400200008A5900009CC000__000000931B9AFDF8 000000067F0000400200008A5900009CC000-000000067F0000400200008A5900009D0000__000000127811CCF0 000000067F0000400200008A5900009CC000-000000067F0000400200008A5900009D0000__0000001C760FA190 000000067F0000400200008A5900009CC000-000000067F0000400200008A5900009D0000__00000038E67ABFA0 000000067F0000400200008A5900009CC000-000000067F0000400200008A5900009D0000__0000003903F1CFE8 000000067F0000400200008A5900009CC000-000000067F0000400200008A5900009D0000__0000003B99F7F8A0 000000067F0000400200008A5900009CC000-000000067F0000400200008A5900009D0000__0000005D2FFFFB38 000000067F0000400200008A5900009CC000-000000067F0000400200008A5900009D0000__00000073AD3FE6B8 000000067F0000400200008A5900009CC000-000000067F0000400200008A5900009D0000__000000914E3F38F0 000000067F0000400200008A5900009CC000-000000067F0000400200008A5900009D0000__000000931B33AE68 
000000067F0000400200008A5900009CC000-000000067F0000400200008A5900009D0000__000000931B9AFDF8 000000067F0000400200008A5900009CFD0C-000000067F0000400200008A5900009D86E1__00000011987BE139-000000124823FE31 000000067F0000400200008A5900009D0000-000000067F0000400200008A5900009D4000__000000127811CCF0 000000067F0000400200008A5900009D0000-000000067F0000400200008A5900009D4000__0000001C760FA190 000000067F0000400200008A5900009D0000-000000067F0000400200008A5900009D4000__00000038E67ABFA0 000000067F0000400200008A5900009D0000-000000067F0000400200008A5900009D4000__0000003903F1CFE8 000000067F0000400200008A5900009D0000-000000067F0000400200008A5900009D4000__0000003B99F7F8A0 000000067F0000400200008A5900009D0000-000000067F0000400200008A5900009D4000__0000005D2FFFFB38 000000067F0000400200008A5900009D0000-000000067F0000400200008A5900009D4000__00000073AD3FE6B8 000000067F0000400200008A5900009D0000-000000067F0000400200008A5900009D4000__000000914E3F38F0 000000067F0000400200008A5900009D0000-000000067F0000400200008A5900009D4000__000000931B33AE68 000000067F0000400200008A5900009D0000-000000067F0000400200008A5900009D4000__000000931B9AFDF8 000000067F0000400200008A5900009D4000-000000067F0000400200008A5900009D8000__000000127811CCF0 000000067F0000400200008A5900009D4000-000000067F0000400200008A5900009D8000__0000001C760FA190 000000067F0000400200008A5900009D4000-000000067F0000400200008A5900009D8000__00000038E67ABFA0 000000067F0000400200008A5900009D4000-000000067F0000400200008A5900009D8000__0000003903F1CFE8 000000067F0000400200008A5900009D4000-000000067F0000400200008A5900009D8000__0000003B99F7F8A0 000000067F0000400200008A5900009D4000-000000067F0000400200008A5900009D8000__0000005D2FFFFB38 000000067F0000400200008A5900009D4000-000000067F0000400200008A5900009D8000__00000073AD3FE6B8 000000067F0000400200008A5900009D4000-000000067F0000400200008A5900009D8000__000000914E3F38F0 000000067F0000400200008A5900009D4000-000000067F0000400200008A5900009D8000__000000931B33AE68 
000000067F0000400200008A5900009D4000-000000067F0000400200008A5900009D8000__000000931B9AFDF8 000000067F0000400200008A5900009D8000-000000067F0000400200008A5900009DC000__000000127811CCF0 000000067F0000400200008A5900009D8000-000000067F0000400200008A5900009DC000__0000001C760FA190 000000067F0000400200008A5900009D8000-000000067F0000400200008A5900009DC000__00000038E67ABFA0 000000067F0000400200008A5900009D8000-000000067F0000400200008A5900009DC000__0000003903F1CFE8 000000067F0000400200008A5900009D8000-000000067F0000400200008A5900009DC000__0000003B99F7F8A0 000000067F0000400200008A5900009D8000-000000067F0000400200008A5900009DC000__0000005D2FFFFB38 000000067F0000400200008A5900009D8000-000000067F0000400200008A5900009DC000__00000073AD3FE6B8 000000067F0000400200008A5900009D8000-000000067F0000400200008A5900009DC000__000000914E3F38F0 000000067F0000400200008A5900009D8000-000000067F0000400200008A5900009DC000__000000931B33AE68 000000067F0000400200008A5900009D8000-000000067F0000400200008A5900009DC000__000000931B9AFDF8 000000067F0000400200008A5900009D86E1-000000067F0000400200008A5900009E10BE__00000011987BE139-000000124823FE31 000000067F0000400200008A5900009DC000-000000067F0000400200008A5900009E0000__000000127811CCF0 000000067F0000400200008A5900009DC000-000000067F0000400200008A5900009E0000__0000001C760FA190 000000067F0000400200008A5900009DC000-000000067F0000400200008A5900009E0000__00000038E67ABFA0 000000067F0000400200008A5900009DC000-000000067F0000400200008A5900009E0000__0000003903F1CFE8 000000067F0000400200008A5900009DC000-000000067F0000400200008A5900009E0000__0000003B99F7F8A0 000000067F0000400200008A5900009DC000-000000067F0000400200008A5900009E0000__0000005D2FFFFB38 000000067F0000400200008A5900009DC000-000000067F0000400200008A5900009E0000__00000073AD3FE6B8 000000067F0000400200008A5900009DC000-000000067F0000400200008A5900009E0000__000000914E3F38F0 000000067F0000400200008A5900009DC000-000000067F0000400200008A5900009E0000__000000931B33AE68 
000000067F0000400200008A5900009DC000-000000067F0000400200008A5900009E0000__000000931B9AFDF8 000000067F0000400200008A5900009E0000-000000067F0000400200008A5900009E4000__000000127811CCF0 000000067F0000400200008A5900009E0000-000000067F0000400200008A5900009E4000__0000001C760FA190 000000067F0000400200008A5900009E0000-000000067F0000400200008A5900009E4000__00000038E67ABFA0 000000067F0000400200008A5900009E0000-000000067F0000400200008A5900009E4000__0000003903F1CFE8 000000067F0000400200008A5900009E0000-000000067F0000400200008A5900009E4000__0000003B99F7F8A0 000000067F0000400200008A5900009E0000-000000067F0000400200008A5900009E4000__0000005D2FFFFB38 000000067F0000400200008A5900009E0000-000000067F0000400200008A5900009E4000__00000073AD3FE6B8 000000067F0000400200008A5900009E0000-000000067F0000400200008A5900009E4000__000000914E3F38F0 000000067F0000400200008A5900009E0000-000000067F0000400200008A5900009E4000__000000931B33AE68 000000067F0000400200008A5900009E0000-000000067F0000400200008A5900009E4000__000000931B9AFDF8 000000067F0000400200008A5900009E10BE-000000067F0000400200008A590100000000__00000011987BE139-000000124823FE31 000000067F0000400200008A5900009E13DD-000000067F0000400200008A5900009E9DC2__000000124823FE31-00000012F7CBDEA1 000000067F0000400200008A5900009E4000-000000067F0000400200008A5900009E8000__000000127811CCF0 000000067F0000400200008A5900009E4000-000000067F0000400200008A5900009E8000__0000001C760FA190 000000067F0000400200008A5900009E4000-000000067F0000400200008A5900009E8000__00000038E67ABFA0 000000067F0000400200008A5900009E4000-000000067F0000400200008A5900009E8000__0000003903F1CFE8 000000067F0000400200008A5900009E4000-000000067F0000400200008A5900009E8000__0000003B99F7F8A0 000000067F0000400200008A5900009E4000-000000067F0000400200008A5900009E8000__0000005D2FFFFB38 000000067F0000400200008A5900009E4000-000000067F0000400200008A5900009E8000__00000073AD3FE6B8 000000067F0000400200008A5900009E4000-000000067F0000400200008A5900009E8000__000000914E3F38F0 
000000067F0000400200008A5900009E4000-000000067F0000400200008A5900009E8000__000000931B33AE68 000000067F0000400200008A5900009E4000-000000067F0000400200008A5900009E8000__000000931B9AFDF8 000000067F0000400200008A5900009E8000-000000067F0000400200008A5900009EC000__000000127811CCF0 000000067F0000400200008A5900009E8000-000000067F0000400200008A5900009EC000__0000001C760FA190 000000067F0000400200008A5900009E8000-000000067F0000400200008A5900009EC000__00000038E67ABFA0 000000067F0000400200008A5900009E8000-000000067F0000400200008A5900009EC000__0000003903F1CFE8 000000067F0000400200008A5900009E8000-000000067F0000400200008A5900009EC000__0000003B99F7F8A0 000000067F0000400200008A5900009E8000-000000067F0000400200008A5900009EC000__0000005D2FFFFB38 000000067F0000400200008A5900009E8000-000000067F0000400200008A5900009EC000__00000073AD3FE6B8 000000067F0000400200008A5900009E8000-000000067F0000400200008A5900009EC000__000000914E3F38F0 000000067F0000400200008A5900009E8000-000000067F0000400200008A5900009EC000__000000931B33AE68 000000067F0000400200008A5900009E8000-000000067F0000400200008A5900009EC000__000000931B9AFDF8 000000067F0000400200008A5900009E9DC2-000000067F0000400200008A5900009F27A6__000000124823FE31-00000012F7CBDEA1 000000067F0000400200008A5900009EC000-000000067F0000400200008A5900009F0000__000000127811CCF0 000000067F0000400200008A5900009EC000-000000067F0000400200008A5900009F0000__0000001C760FA190 000000067F0000400200008A5900009EC000-000000067F0000400200008A5900009F0000__00000038E67ABFA0 000000067F0000400200008A5900009EC000-000000067F0000400200008A5900009F0000__0000003903F1CFE8 000000067F0000400200008A5900009EC000-000000067F0000400200008A5900009F0000__0000003B99F7F8A0 000000067F0000400200008A5900009EC000-000000067F0000400200008A5900009F0000__0000005D2FFFFB38 000000067F0000400200008A5900009EC000-000000067F0000400200008A5900009F0000__00000073AD3FE6B8 000000067F0000400200008A5900009EC000-000000067F0000400200008A5900009F0000__000000914E3F38F0 
000000067F0000400200008A5900009EC000-000000067F0000400200008A5900009F0000__000000931B33AE68 000000067F0000400200008A5900009EC000-000000067F0000400200008A5900009F0000__000000931B9AFDF8 000000067F0000400200008A5900009F0000-000000067F0000400200008A5900009F4000__000000127811CCF0 000000067F0000400200008A5900009F0000-000000067F0000400200008A5900009F4000__0000001C760FA190 000000067F0000400200008A5900009F0000-000000067F0000400200008A5900009F4000__00000038E67ABFA0 000000067F0000400200008A5900009F0000-000000067F0000400200008A5900009F4000__0000003903F1CFE8 000000067F0000400200008A5900009F0000-000000067F0000400200008A5900009F4000__0000003B99F7F8A0 000000067F0000400200008A5900009F0000-000000067F0000400200008A5900009F4000__0000005D2FFFFB38 000000067F0000400200008A5900009F0000-000000067F0000400200008A5900009F4000__00000073AD3FE6B8 000000067F0000400200008A5900009F0000-000000067F0000400200008A5900009F4000__000000914E3F38F0 000000067F0000400200008A5900009F0000-000000067F0000400200008A5900009F4000__000000931B33AE68 000000067F0000400200008A5900009F0000-000000067F0000400200008A5900009F4000__000000931B9AFDF8 000000067F0000400200008A5900009F27A6-000000067F0000400200008A5900009FB188__000000124823FE31-00000012F7CBDEA1 000000067F0000400200008A5900009F4000-000000067F0000400200008A5900009F8000__000000127811CCF0 000000067F0000400200008A5900009F4000-000000067F0000400200008A5900009F8000__0000001C760FA190 000000067F0000400200008A5900009F4000-000000067F0000400200008A5900009F8000__00000038E67ABFA0 000000067F0000400200008A5900009F4000-000000067F0000400200008A5900009F8000__0000003903F1CFE8 000000067F0000400200008A5900009F4000-000000067F0000400200008A5900009F8000__0000003B99F7F8A0 000000067F0000400200008A5900009F4000-000000067F0000400200008A5900009F8000__0000005D2FFFFB38 000000067F0000400200008A5900009F4000-000000067F0000400200008A5900009F8000__00000073AD3FE6B8 000000067F0000400200008A5900009F4000-000000067F0000400200008A5900009F8000__000000914E3F38F0 
000000067F0000400200008A5900009F4000-000000067F0000400200008A5900009F8000__000000931B33AE68 000000067F0000400200008A5900009F4000-000000067F0000400200008A5900009F8000__000000931B9AFDF8 000000067F0000400200008A5900009F8000-000000067F0000400200008A5900009FC000__0000001C760FA190 000000067F0000400200008A5900009F8000-000000067F0000400200008A5900009FC000__00000038E67ABFA0 000000067F0000400200008A5900009F8000-000000067F0000400200008A5900009FC000__0000003903F1CFE8 000000067F0000400200008A5900009F8000-000000067F0000400200008A5900009FC000__0000003B99F7F8A0 000000067F0000400200008A5900009F8000-000000067F0000400200008A5900009FC000__0000005D2FFFFB38 000000067F0000400200008A5900009F8000-000000067F0000400200008A5900009FC000__00000073AD3FE6B8 000000067F0000400200008A5900009F8000-000000067F0000400200008A5900009FC000__000000914E3F38F0 000000067F0000400200008A5900009F8000-000000067F0000400200008A5900009FC000__000000931B33AE68 000000067F0000400200008A5900009F8000-000000067F0000400200008A5900009FC000__000000931B9AFDF8 000000067F0000400200008A5900009F8000-030000000000000000000000000000000002__000000127811CCF0 000000067F0000400200008A5900009FB188-000000067F0000400200008A590000A03B76__000000124823FE31-00000012F7CBDEA1 000000067F0000400200008A5900009FC000-000000067F0000400200008A590000A00000__0000001C760FA190 000000067F0000400200008A5900009FC000-000000067F0000400200008A590000A00000__00000038E67ABFA0 000000067F0000400200008A5900009FC000-000000067F0000400200008A590000A00000__0000003903F1CFE8 000000067F0000400200008A5900009FC000-000000067F0000400200008A590000A00000__0000003B99F7F8A0 000000067F0000400200008A5900009FC000-000000067F0000400200008A590000A00000__0000005D2FFFFB38 000000067F0000400200008A5900009FC000-000000067F0000400200008A590000A00000__00000073AD3FE6B8 000000067F0000400200008A5900009FC000-000000067F0000400200008A590000A00000__000000914E3F38F0 000000067F0000400200008A5900009FC000-000000067F0000400200008A590000A00000__000000931B33AE68 
000000067F0000400200008A5900009FC000-000000067F0000400200008A590000A00000__000000931B9AFDF8 000000067F0000400200008A590000A00000-000000067F0000400200008A590000A04000__0000001C760FA190 000000067F0000400200008A590000A00000-000000067F0000400200008A590000A04000__00000038E67ABFA0 000000067F0000400200008A590000A00000-000000067F0000400200008A590000A04000__0000003903F1CFE8 000000067F0000400200008A590000A00000-000000067F0000400200008A590000A04000__0000003B99F7F8A0 000000067F0000400200008A590000A00000-000000067F0000400200008A590000A04000__0000005D2FFFFB38 000000067F0000400200008A590000A00000-000000067F0000400200008A590000A04000__00000073AD3FE6B8 000000067F0000400200008A590000A00000-000000067F0000400200008A590000A04000__000000914E3F38F0 000000067F0000400200008A590000A00000-000000067F0000400200008A590000A04000__000000931B33AE68 000000067F0000400200008A590000A00000-000000067F0000400200008A590000A04000__000000931B9AFDF8 000000067F0000400200008A590000A03B76-000000067F0000400200008A590000A0C550__000000124823FE31-00000012F7CBDEA1 000000067F0000400200008A590000A04000-000000067F0000400200008A590000A08000__0000001C760FA190 000000067F0000400200008A590000A04000-000000067F0000400200008A590000A08000__00000038E67ABFA0 000000067F0000400200008A590000A04000-000000067F0000400200008A590000A08000__0000003903F1CFE8 000000067F0000400200008A590000A04000-000000067F0000400200008A590000A08000__0000003B99F7F8A0 000000067F0000400200008A590000A04000-000000067F0000400200008A590000A08000__0000005D2FFFFB38 000000067F0000400200008A590000A04000-000000067F0000400200008A590000A08000__00000073AD3FE6B8 000000067F0000400200008A590000A04000-000000067F0000400200008A590000A08000__000000914E3F38F0 000000067F0000400200008A590000A04000-000000067F0000400200008A590000A08000__000000931B33AE68 000000067F0000400200008A590000A04000-000000067F0000400200008A590000A08000__000000931B9AFDF8 000000067F0000400200008A590000A08000-000000067F0000400200008A590000A0C000__0000001C760FA190 
000000067F0000400200008A590000A08000-000000067F0000400200008A590000A0C000__00000038E67ABFA0 000000067F0000400200008A590000A08000-000000067F0000400200008A590000A0C000__0000003903F1CFE8 000000067F0000400200008A590000A08000-000000067F0000400200008A590000A0C000__0000003B99F7F8A0 000000067F0000400200008A590000A08000-000000067F0000400200008A590000A0C000__0000005D2FFFFB38 000000067F0000400200008A590000A08000-000000067F0000400200008A590000A0C000__00000073AD3FE6B8 000000067F0000400200008A590000A08000-000000067F0000400200008A590000A0C000__000000914E3F38F0 000000067F0000400200008A590000A08000-000000067F0000400200008A590000A0C000__000000931B33AE68 000000067F0000400200008A590000A08000-000000067F0000400200008A590000A0C000__000000931B9AFDF8 000000067F0000400200008A590000A0C000-000000067F0000400200008A590000A10000__0000001C760FA190 000000067F0000400200008A590000A0C000-000000067F0000400200008A590000A10000__00000038E67ABFA0 000000067F0000400200008A590000A0C000-000000067F0000400200008A590000A10000__0000003903F1CFE8 000000067F0000400200008A590000A0C000-000000067F0000400200008A590000A10000__0000003B99F7F8A0 000000067F0000400200008A590000A0C000-000000067F0000400200008A590000A10000__0000005D2FFFFB38 000000067F0000400200008A590000A0C000-000000067F0000400200008A590000A10000__00000073AD3FE6B8 000000067F0000400200008A590000A0C000-000000067F0000400200008A590000A10000__000000914E3F38F0 000000067F0000400200008A590000A0C000-000000067F0000400200008A590000A10000__000000931B33AE68 000000067F0000400200008A590000A0C000-000000067F0000400200008A590000A10000__000000931B9AFDF8 000000067F0000400200008A590000A0C550-000000067F0000400200008A590000A14F25__000000124823FE31-00000012F7CBDEA1 000000067F0000400200008A590000A10000-000000067F0000400200008A590000A14000__0000001C760FA190 000000067F0000400200008A590000A10000-000000067F0000400200008A590000A14000__00000038E67ABFA0 000000067F0000400200008A590000A10000-000000067F0000400200008A590000A14000__0000003903F1CFE8 
000000067F0000400200008A590000A10000-000000067F0000400200008A590000A14000__0000003B99F7F8A0 000000067F0000400200008A590000A10000-000000067F0000400200008A590000A14000__0000005D2FFFFB38 000000067F0000400200008A590000A10000-000000067F0000400200008A590000A14000__00000073AD3FE6B8 000000067F0000400200008A590000A10000-000000067F0000400200008A590000A14000__000000914E3F38F0 000000067F0000400200008A590000A10000-000000067F0000400200008A590000A14000__000000931B33AE68 000000067F0000400200008A590000A10000-000000067F0000400200008A590000A14000__000000931B9AFDF8 000000067F0000400200008A590000A14000-000000067F0000400200008A590000A18000__0000001C760FA190 000000067F0000400200008A590000A14000-000000067F0000400200008A590000A18000__00000038E67ABFA0 000000067F0000400200008A590000A14000-000000067F0000400200008A590000A18000__0000003903F1CFE8 000000067F0000400200008A590000A14000-000000067F0000400200008A590000A18000__0000003B99F7F8A0 000000067F0000400200008A590000A14000-000000067F0000400200008A590000A18000__0000005D2FFFFB38 000000067F0000400200008A590000A14000-000000067F0000400200008A590000A18000__00000073AD3FE6B8 000000067F0000400200008A590000A14000-000000067F0000400200008A590000A18000__000000914E3F38F0 000000067F0000400200008A590000A14000-000000067F0000400200008A590000A18000__000000931B33AE68 000000067F0000400200008A590000A14000-000000067F0000400200008A590000A18000__000000931B9AFDF8 000000067F0000400200008A590000A14F25-000000067F0000400200008A590000A1D8F5__000000124823FE31-00000012F7CBDEA1 000000067F0000400200008A590000A18000-000000067F0000400200008A590000A1C000__0000001C760FA190 000000067F0000400200008A590000A18000-000000067F0000400200008A590000A1C000__00000038E67ABFA0 000000067F0000400200008A590000A18000-000000067F0000400200008A590000A1C000__0000003903F1CFE8 000000067F0000400200008A590000A18000-000000067F0000400200008A590000A1C000__0000003B99F7F8A0 000000067F0000400200008A590000A18000-000000067F0000400200008A590000A1C000__0000005D2FFFFB38 
000000067F0000400200008A590000A18000-000000067F0000400200008A590000A1C000__00000073AD3FE6B8 000000067F0000400200008A590000A18000-000000067F0000400200008A590000A1C000__000000914E3F38F0 000000067F0000400200008A590000A18000-000000067F0000400200008A590000A1C000__000000931B33AE68 000000067F0000400200008A590000A18000-000000067F0000400200008A590000A1C000__000000931B9AFDF8 000000067F0000400200008A590000A1C000-000000067F0000400200008A590000A20000__0000001C760FA190 000000067F0000400200008A590000A1C000-000000067F0000400200008A590000A20000__00000038E67ABFA0 000000067F0000400200008A590000A1C000-000000067F0000400200008A590000A20000__0000003903F1CFE8 000000067F0000400200008A590000A1C000-000000067F0000400200008A590000A20000__0000003B99F7F8A0 000000067F0000400200008A590000A1C000-000000067F0000400200008A590000A20000__0000005D2FFFFB38 000000067F0000400200008A590000A1C000-000000067F0000400200008A590000A20000__00000073AD3FE6B8 000000067F0000400200008A590000A1C000-000000067F0000400200008A590000A20000__000000914E3F38F0 000000067F0000400200008A590000A1C000-000000067F0000400200008A590000A20000__000000931B33AE68 000000067F0000400200008A590000A1C000-000000067F0000400200008A590000A20000__000000931B9AFDF8 000000067F0000400200008A590000A1D8F5-000000067F0000400200008A590000A262D4__000000124823FE31-00000012F7CBDEA1 000000067F0000400200008A590000A20000-000000067F0000400200008A590000A24000__0000001C760FA190 000000067F0000400200008A590000A20000-000000067F0000400200008A590000A24000__00000038E67ABFA0 000000067F0000400200008A590000A20000-000000067F0000400200008A590000A24000__0000003903F1CFE8 000000067F0000400200008A590000A20000-000000067F0000400200008A590000A24000__0000003B99F7F8A0 000000067F0000400200008A590000A20000-000000067F0000400200008A590000A24000__0000005D2FFFFB38 000000067F0000400200008A590000A20000-000000067F0000400200008A590000A24000__00000073AD3FE6B8 000000067F0000400200008A590000A20000-000000067F0000400200008A590000A24000__000000914E3F38F0 
000000067F0000400200008A590000A20000-000000067F0000400200008A590000A24000__000000931B33AE68 000000067F0000400200008A590000A20000-000000067F0000400200008A590000A24000__000000931B9AFDF8 000000067F0000400200008A590000A24000-000000067F0000400200008A590000A28000__0000001C760FA190 000000067F0000400200008A590000A24000-000000067F0000400200008A590000A28000__00000038E67ABFA0 000000067F0000400200008A590000A24000-000000067F0000400200008A590000A28000__0000003903F1CFE8 000000067F0000400200008A590000A24000-000000067F0000400200008A590000A28000__0000003B99F7F8A0 000000067F0000400200008A590000A24000-000000067F0000400200008A590000A28000__0000005D2FFFFB38 000000067F0000400200008A590000A24000-000000067F0000400200008A590000A28000__00000073AD3FE6B8 000000067F0000400200008A590000A24000-000000067F0000400200008A590000A28000__000000914E3F38F0 000000067F0000400200008A590000A24000-000000067F0000400200008A590000A28000__000000931B33AE68 000000067F0000400200008A590000A24000-000000067F0000400200008A590000A28000__000000931B9AFDF8 000000067F0000400200008A590000A262D4-000000067F0000400200008A590000A2ECBA__000000124823FE31-00000012F7CBDEA1 000000067F0000400200008A590000A28000-000000067F0000400200008A590000A2C000__0000001C760FA190 000000067F0000400200008A590000A28000-000000067F0000400200008A590000A2C000__00000038E67ABFA0 000000067F0000400200008A590000A28000-000000067F0000400200008A590000A2C000__0000003903F1CFE8 000000067F0000400200008A590000A28000-000000067F0000400200008A590000A2C000__0000003B99F7F8A0 000000067F0000400200008A590000A28000-000000067F0000400200008A590000A2C000__0000005D2FFFFB38 000000067F0000400200008A590000A28000-000000067F0000400200008A590000A2C000__00000073AD3FE6B8 000000067F0000400200008A590000A28000-000000067F0000400200008A590000A2C000__000000914E3F38F0 000000067F0000400200008A590000A28000-000000067F0000400200008A590000A2C000__000000931B33AE68 000000067F0000400200008A590000A28000-000000067F0000400200008A590000A2C000__000000931B9AFDF8 
000000067F0000400200008A590000A2C000-000000067F0000400200008A590000A30000__0000001C760FA190 000000067F0000400200008A590000A2C000-000000067F0000400200008A590000A30000__00000038E67ABFA0 000000067F0000400200008A590000A2C000-000000067F0000400200008A590000A30000__0000003903F1CFE8 000000067F0000400200008A590000A2C000-000000067F0000400200008A590000A30000__0000003B99F7F8A0 000000067F0000400200008A590000A2C000-000000067F0000400200008A590000A30000__0000005D2FFFFB38 000000067F0000400200008A590000A2C000-000000067F0000400200008A590000A30000__00000073AD3FE6B8 000000067F0000400200008A590000A2C000-000000067F0000400200008A590000A30000__000000914E3F38F0 000000067F0000400200008A590000A2C000-000000067F0000400200008A590000A30000__000000931B33AE68 000000067F0000400200008A590000A2C000-000000067F0000400200008A590000A30000__000000931B9AFDF8 000000067F0000400200008A590000A2ECBA-000000067F0000400200008A590000A3769E__000000124823FE31-00000012F7CBDEA1 000000067F0000400200008A590000A30000-000000067F0000400200008A590000A34000__0000001C760FA190 000000067F0000400200008A590000A30000-000000067F0000400200008A590000A34000__00000038E67ABFA0 000000067F0000400200008A590000A30000-000000067F0000400200008A590000A34000__0000003903F1CFE8 000000067F0000400200008A590000A30000-000000067F0000400200008A590000A34000__0000003B99F7F8A0 000000067F0000400200008A590000A30000-000000067F0000400200008A590000A34000__0000005D2FFFFB38 000000067F0000400200008A590000A30000-000000067F0000400200008A590000A34000__00000073AD3FE6B8 000000067F0000400200008A590000A30000-000000067F0000400200008A590000A34000__000000914E3F38F0 000000067F0000400200008A590000A30000-000000067F0000400200008A590000A34000__000000931B33AE68 000000067F0000400200008A590000A30000-000000067F0000400200008A590000A34000__000000931B9AFDF8 000000067F0000400200008A590000A34000-000000067F0000400200008A590000A38000__0000001C760FA190 000000067F0000400200008A590000A34000-000000067F0000400200008A590000A38000__00000038E67ABFA0 
000000067F0000400200008A590000A34000-000000067F0000400200008A590000A38000__0000003903F1CFE8 000000067F0000400200008A590000A34000-000000067F0000400200008A590000A38000__0000003B99F7F8A0 000000067F0000400200008A590000A34000-000000067F0000400200008A590000A38000__0000005D2FFFFB38 000000067F0000400200008A590000A34000-000000067F0000400200008A590000A38000__00000073AD3FE6B8 000000067F0000400200008A590000A34000-000000067F0000400200008A590000A38000__000000914E3F38F0 000000067F0000400200008A590000A34000-000000067F0000400200008A590000A38000__000000931B33AE68 000000067F0000400200008A590000A34000-000000067F0000400200008A590000A38000__000000931B9AFDF8 000000067F0000400200008A590000A3769E-000000067F0000400200008A590000A40089__000000124823FE31-00000012F7CBDEA1 000000067F0000400200008A590000A38000-000000067F0000400200008A590000A3C000__0000001C760FA190 000000067F0000400200008A590000A38000-000000067F0000400200008A590000A3C000__00000038E67ABFA0 000000067F0000400200008A590000A38000-000000067F0000400200008A590000A3C000__0000003903F1CFE8 000000067F0000400200008A590000A38000-000000067F0000400200008A590000A3C000__0000003B99F7F8A0 000000067F0000400200008A590000A38000-000000067F0000400200008A590000A3C000__0000005D2FFFFB38 000000067F0000400200008A590000A38000-000000067F0000400200008A590000A3C000__00000073AD3FE6B8 000000067F0000400200008A590000A38000-000000067F0000400200008A590000A3C000__000000914E3F38F0 000000067F0000400200008A590000A38000-000000067F0000400200008A590000A3C000__000000931B33AE68 000000067F0000400200008A590000A38000-000000067F0000400200008A590000A3C000__000000931B9AFDF8 000000067F0000400200008A590000A3C000-000000067F0000400200008A590000A40000__0000001C760FA190 000000067F0000400200008A590000A3C000-000000067F0000400200008A590000A40000__00000038E67ABFA0 000000067F0000400200008A590000A3C000-000000067F0000400200008A590000A40000__0000003903F1CFE8 000000067F0000400200008A590000A3C000-000000067F0000400200008A590000A40000__0000003B99F7F8A0 
000000067F0000400200008A590000A3C000-000000067F0000400200008A590000A40000__0000005D2FFFFB38 000000067F0000400200008A590000A3C000-000000067F0000400200008A590000A40000__00000073AD3FE6B8 000000067F0000400200008A590000A3C000-000000067F0000400200008A590000A40000__000000914E3F38F0 000000067F0000400200008A590000A3C000-000000067F0000400200008A590000A40000__000000931B33AE68 000000067F0000400200008A590000A3C000-000000067F0000400200008A590000A40000__000000931B9AFDF8 000000067F0000400200008A590000A40000-000000067F0000400200008A590000A44000__0000001C725A2400 000000067F0000400200008A590000A40000-000000067F0000400200008A590000A44000__0000001C760FA190 000000067F0000400200008A590000A40000-000000067F0000400200008A590000A44000__00000038E67ABFA0 000000067F0000400200008A590000A40000-000000067F0000400200008A590000A44000__0000003903F1CFE8 000000067F0000400200008A590000A40000-000000067F0000400200008A590000A44000__0000003B99F7F8A0 000000067F0000400200008A590000A40000-000000067F0000400200008A590000A44000__0000005D2FFFFB38 000000067F0000400200008A590000A40000-000000067F0000400200008A590000A44000__00000073AD3FE6B8 000000067F0000400200008A590000A40000-000000067F0000400200008A590000A44000__000000914E3F38F0 000000067F0000400200008A590000A40000-000000067F0000400200008A590000A44000__000000931B33AE68 000000067F0000400200008A590000A40000-000000067F0000400200008A590000A44000__000000931B9AFDF8 000000067F0000400200008A590000A40089-000000067F0000400200008A590100000000__000000124823FE31-00000012F7CBDEA1 000000067F0000400200008A590000A4038F-000000067F0000400200008A590000A48D60__00000012F7CBDEA1-00000013977BD5E1 000000067F0000400200008A590000A44000-000000067F0000400200008A590000A48000__0000001C725A2400 000000067F0000400200008A590000A44000-000000067F0000400200008A590000A48000__0000001C760FA190 000000067F0000400200008A590000A44000-000000067F0000400200008A590000A48000__00000038E67ABFA0 000000067F0000400200008A590000A44000-000000067F0000400200008A590000A48000__0000003903F1CFE8 
000000067F0000400200008A590000A44000-000000067F0000400200008A590000A48000__0000003B99F7F8A0 000000067F0000400200008A590000A44000-000000067F0000400200008A590000A48000__0000005D2FFFFB38 000000067F0000400200008A590000A44000-000000067F0000400200008A590000A48000__00000073AD3FE6B8 000000067F0000400200008A590000A44000-000000067F0000400200008A590000A48000__000000914E3F38F0 000000067F0000400200008A590000A44000-000000067F0000400200008A590000A48000__000000931B33AE68 000000067F0000400200008A590000A44000-000000067F0000400200008A590000A48000__000000931B9AFDF8 000000067F0000400200008A590000A48000-000000067F0000400200008A590000A4C000__0000001C725A2400 000000067F0000400200008A590000A48000-000000067F0000400200008A590000A4C000__0000001C760FA190 000000067F0000400200008A590000A48000-000000067F0000400200008A590000A4C000__00000038E67ABFA0 000000067F0000400200008A590000A48000-000000067F0000400200008A590000A4C000__0000003903F1CFE8 000000067F0000400200008A590000A48000-000000067F0000400200008A590000A4C000__0000003B99F7F8A0 000000067F0000400200008A590000A48000-000000067F0000400200008A590000A4C000__0000005D2FFFFB38 000000067F0000400200008A590000A48000-000000067F0000400200008A590000A4C000__00000073AD3FE6B8 000000067F0000400200008A590000A48000-000000067F0000400200008A590000A4C000__000000914E3F38F0 000000067F0000400200008A590000A48000-000000067F0000400200008A590000A4C000__000000931B33AE68 000000067F0000400200008A590000A48000-000000067F0000400200008A590000A4C000__000000931B9AFDF8 000000067F0000400200008A590000A48D60-000000067F0000400200008A590000A51735__00000012F7CBDEA1-00000013977BD5E1 000000067F0000400200008A590000A4C000-000000067F0000400200008A590000A50000__0000001C725A2400 000000067F0000400200008A590000A4C000-000000067F0000400200008A590000A50000__0000001C760FA190 000000067F0000400200008A590000A4C000-000000067F0000400200008A590000A50000__00000038E67ABFA0 000000067F0000400200008A590000A4C000-000000067F0000400200008A590000A50000__0000003903F1CFE8 
000000067F0000400200008A590000A4C000-000000067F0000400200008A590000A50000__0000003B99F7F8A0 000000067F0000400200008A590000A4C000-000000067F0000400200008A590000A50000__0000005D2FFFFB38 000000067F0000400200008A590000A4C000-000000067F0000400200008A590000A50000__00000073AD3FE6B8 000000067F0000400200008A590000A4C000-000000067F0000400200008A590000A50000__000000914E3F38F0 000000067F0000400200008A590000A4C000-000000067F0000400200008A590000A50000__000000931B33AE68 000000067F0000400200008A590000A4C000-000000067F0000400200008A590000A50000__000000931B9AFDF8 000000067F0000400200008A590000A50000-000000067F0000400200008A590000A54000__0000001C725A2400 000000067F0000400200008A590000A50000-000000067F0000400200008A590000A54000__0000001C760FA190 000000067F0000400200008A590000A50000-000000067F0000400200008A590000A54000__00000038E67ABFA0 000000067F0000400200008A590000A50000-000000067F0000400200008A590000A54000__0000003903F1CFE8 000000067F0000400200008A590000A50000-000000067F0000400200008A590000A54000__0000003B99F7F8A0 000000067F0000400200008A590000A50000-000000067F0000400200008A590000A54000__0000005D2FFFFB38 000000067F0000400200008A590000A50000-000000067F0000400200008A590000A54000__00000073AD3FE6B8 000000067F0000400200008A590000A50000-000000067F0000400200008A590000A54000__000000914E3F38F0 000000067F0000400200008A590000A50000-000000067F0000400200008A590000A54000__000000931B33AE68 000000067F0000400200008A590000A50000-000000067F0000400200008A590000A54000__000000931B9AFDF8 000000067F0000400200008A590000A51735-000000067F0000400200008A590000A5A101__00000012F7CBDEA1-00000013977BD5E1 000000067F0000400200008A590000A54000-000000067F0000400200008A590000A58000__0000001C725A2400 000000067F0000400200008A590000A54000-000000067F0000400200008A590000A58000__0000001C760FA190 000000067F0000400200008A590000A54000-000000067F0000400200008A590000A58000__00000038E67ABFA0 000000067F0000400200008A590000A54000-000000067F0000400200008A590000A58000__0000003903F1CFE8 
000000067F0000400200008A590000A54000-000000067F0000400200008A590000A58000__0000003B99F7F8A0 000000067F0000400200008A590000A54000-000000067F0000400200008A590000A58000__0000005D2FFFFB38 000000067F0000400200008A590000A54000-000000067F0000400200008A590000A58000__00000073AD3FE6B8 000000067F0000400200008A590000A54000-000000067F0000400200008A590000A58000__000000914E3F38F0 000000067F0000400200008A590000A54000-000000067F0000400200008A590000A58000__000000931B33AE68 000000067F0000400200008A590000A54000-000000067F0000400200008A590000A58000__000000931B9AFDF8 000000067F0000400200008A590000A58000-000000067F0000400200008A590000A5C000__0000001C725A2400 000000067F0000400200008A590000A58000-000000067F0000400200008A590000A5C000__0000001C760FA190 000000067F0000400200008A590000A58000-000000067F0000400200008A590000A5C000__00000038E67ABFA0 000000067F0000400200008A590000A58000-000000067F0000400200008A590000A5C000__0000003903F1CFE8 000000067F0000400200008A590000A58000-000000067F0000400200008A590000A5C000__0000003B99F7F8A0 000000067F0000400200008A590000A58000-000000067F0000400200008A590000A5C000__0000005D2FFFFB38 000000067F0000400200008A590000A58000-000000067F0000400200008A590000A5C000__00000073AD3FE6B8 000000067F0000400200008A590000A58000-000000067F0000400200008A590000A5C000__000000914E3F38F0 000000067F0000400200008A590000A58000-000000067F0000400200008A590000A5C000__000000931B33AE68 000000067F0000400200008A590000A58000-000000067F0000400200008A590000A5C000__000000931B9AFDF8 000000067F0000400200008A590000A5A101-000000067F0000400200008A590000A62AD2__00000012F7CBDEA1-00000013977BD5E1 000000067F0000400200008A590000A5C000-000000067F0000400200008A590000A60000__0000001C725A2400 000000067F0000400200008A590000A5C000-000000067F0000400200008A590000A60000__0000001C760FA190 000000067F0000400200008A590000A5C000-000000067F0000400200008A590000A60000__00000038E67ABFA0 000000067F0000400200008A590000A5C000-000000067F0000400200008A590000A60000__0000003903F1CFE8 
000000067F0000400200008A590000A5C000-000000067F0000400200008A590000A60000__0000003B99F7F8A0 000000067F0000400200008A590000A5C000-000000067F0000400200008A590000A60000__0000005D2FFFFB38 000000067F0000400200008A590000A5C000-000000067F0000400200008A590000A60000__00000073AD3FE6B8 000000067F0000400200008A590000A5C000-000000067F0000400200008A590000A60000__000000914E3F38F0 000000067F0000400200008A590000A5C000-000000067F0000400200008A590000A60000__000000931B33AE68 000000067F0000400200008A590000A5C000-000000067F0000400200008A590000A60000__000000931B9AFDF8 000000067F0000400200008A590000A60000-000000067F0000400200008A590000A64000__0000001C725A2400 000000067F0000400200008A590000A60000-000000067F0000400200008A590000A64000__0000001C760FA190 000000067F0000400200008A590000A60000-000000067F0000400200008A590000A64000__00000038E67ABFA0 000000067F0000400200008A590000A60000-000000067F0000400200008A590000A64000__0000003903F1CFE8 000000067F0000400200008A590000A60000-000000067F0000400200008A590000A64000__0000003B99F7F8A0 000000067F0000400200008A590000A60000-000000067F0000400200008A590000A64000__0000005D2FFFFB38 000000067F0000400200008A590000A60000-000000067F0000400200008A590000A64000__00000073AD3FE6B8 000000067F0000400200008A590000A60000-000000067F0000400200008A590000A64000__000000914E3F38F0 000000067F0000400200008A590000A60000-000000067F0000400200008A590000A64000__000000931B33AE68 000000067F0000400200008A590000A60000-000000067F0000400200008A590000A64000__000000931B9AFDF8 000000067F0000400200008A590000A62AD2-000000067F0000400200008A590000A6B4C1__00000012F7CBDEA1-00000013977BD5E1 000000067F0000400200008A590000A64000-000000067F0000400200008A590000A68000__0000001C725A2400 000000067F0000400200008A590000A64000-000000067F0000400200008A590000A68000__0000001C760FA190 000000067F0000400200008A590000A64000-000000067F0000400200008A590000A68000__00000038E67ABFA0 000000067F0000400200008A590000A64000-000000067F0000400200008A590000A68000__0000003903F1CFE8 
000000067F0000400200008A590000A64000-000000067F0000400200008A590000A68000__0000003B99F7F8A0 000000067F0000400200008A590000A64000-000000067F0000400200008A590000A68000__0000005D2FFFFB38 000000067F0000400200008A590000A64000-000000067F0000400200008A590000A68000__00000073AD3FE6B8 000000067F0000400200008A590000A64000-000000067F0000400200008A590000A68000__000000914E3F38F0 000000067F0000400200008A590000A64000-000000067F0000400200008A590000A68000__000000931B33AE68 000000067F0000400200008A590000A64000-000000067F0000400200008A590000A68000__000000931B9AFDF8 000000067F0000400200008A590000A68000-000000067F0000400200008A590000A6C000__0000001C725A2400 000000067F0000400200008A590000A68000-000000067F0000400200008A590000A6C000__0000001C760FA190 000000067F0000400200008A590000A68000-000000067F0000400200008A590000A6C000__00000038E67ABFA0 000000067F0000400200008A590000A68000-000000067F0000400200008A590000A6C000__0000003903F1CFE8 000000067F0000400200008A590000A68000-000000067F0000400200008A590000A6C000__0000003B99F7F8A0 000000067F0000400200008A590000A68000-000000067F0000400200008A590000A6C000__0000005D2FFFFB38 000000067F0000400200008A590000A68000-000000067F0000400200008A590000A6C000__00000073AD3FE6B8 000000067F0000400200008A590000A68000-000000067F0000400200008A590000A6C000__000000914E3F38F0 000000067F0000400200008A590000A68000-000000067F0000400200008A590000A6C000__000000931B33AE68 000000067F0000400200008A590000A68000-000000067F0000400200008A590000A6C000__000000931B9AFDF8 000000067F0000400200008A590000A6B4C1-000000067F0000400200008A590000A73EAD__00000012F7CBDEA1-00000013977BD5E1 000000067F0000400200008A590000A6C000-000000067F0000400200008A590000A70000__0000001C725A2400 000000067F0000400200008A590000A6C000-000000067F0000400200008A590000A70000__0000001C760FA190 000000067F0000400200008A590000A6C000-000000067F0000400200008A590000A70000__00000038E67ABFA0 000000067F0000400200008A590000A6C000-000000067F0000400200008A590000A70000__0000003903F1CFE8 
000000067F0000400200008A590000A6C000-000000067F0000400200008A590000A70000__0000003B99F7F8A0 000000067F0000400200008A590000A6C000-000000067F0000400200008A590000A70000__0000005D2FFFFB38 000000067F0000400200008A590000A6C000-000000067F0000400200008A590000A70000__00000073AD3FE6B8 000000067F0000400200008A590000A6C000-000000067F0000400200008A590000A70000__000000914E3F38F0 000000067F0000400200008A590000A6C000-000000067F0000400200008A590000A70000__000000931B33AE68 000000067F0000400200008A590000A6C000-000000067F0000400200008A590000A70000__000000931B9AFDF8 000000067F0000400200008A590000A70000-000000067F0000400200008A590000A74000__0000001C725A2400 000000067F0000400200008A590000A70000-000000067F0000400200008A590000A74000__0000001C760FA190 000000067F0000400200008A590000A70000-000000067F0000400200008A590000A74000__00000038E67ABFA0 000000067F0000400200008A590000A70000-000000067F0000400200008A590000A74000__0000003903F1CFE8 000000067F0000400200008A590000A70000-000000067F0000400200008A590000A74000__0000003B99F7F8A0 000000067F0000400200008A590000A70000-000000067F0000400200008A590000A74000__0000005D2FFFFB38 000000067F0000400200008A590000A70000-000000067F0000400200008A590000A74000__00000073AD3FE6B8 000000067F0000400200008A590000A70000-000000067F0000400200008A590000A74000__000000914E3F38F0 000000067F0000400200008A590000A70000-000000067F0000400200008A590000A74000__000000931B33AE68 000000067F0000400200008A590000A70000-000000067F0000400200008A590000A74000__000000931B9AFDF8 000000067F0000400200008A590000A73EAD-000000067F0000400200008A590000A7C891__00000012F7CBDEA1-00000013977BD5E1 000000067F0000400200008A590000A74000-000000067F0000400200008A590000A78000__0000001C725A2400 000000067F0000400200008A590000A74000-000000067F0000400200008A590000A78000__0000001C760FA190 000000067F0000400200008A590000A74000-000000067F0000400200008A590000A78000__00000038E67ABFA0 000000067F0000400200008A590000A74000-000000067F0000400200008A590000A78000__0000003903F1CFE8 
000000067F0000400200008A590000A74000-000000067F0000400200008A590000A78000__0000003B99F7F8A0 000000067F0000400200008A590000A74000-000000067F0000400200008A590000A78000__0000005D2FFFFB38 000000067F0000400200008A590000A74000-000000067F0000400200008A590000A78000__00000073AD3FE6B8 000000067F0000400200008A590000A74000-000000067F0000400200008A590000A78000__000000914E3F38F0 000000067F0000400200008A590000A74000-000000067F0000400200008A590000A78000__000000931B33AE68 000000067F0000400200008A590000A74000-000000067F0000400200008A590000A78000__000000931B9AFDF8 000000067F0000400200008A590000A78000-000000067F0000400200008A590000A7C000__0000001C725A2400 000000067F0000400200008A590000A78000-000000067F0000400200008A590000A7C000__0000001C760FA190 000000067F0000400200008A590000A78000-000000067F0000400200008A590000A7C000__00000038E67ABFA0 000000067F0000400200008A590000A78000-000000067F0000400200008A590000A7C000__0000003903F1CFE8 000000067F0000400200008A590000A78000-000000067F0000400200008A590000A7C000__0000003B99F7F8A0 000000067F0000400200008A590000A78000-000000067F0000400200008A590000A7C000__0000005D2FFFFB38 000000067F0000400200008A590000A78000-000000067F0000400200008A590000A7C000__00000073AD3FE6B8 000000067F0000400200008A590000A78000-000000067F0000400200008A590000A7C000__000000914E3F38F0 000000067F0000400200008A590000A78000-000000067F0000400200008A590000A7C000__000000931B33AE68 000000067F0000400200008A590000A78000-000000067F0000400200008A590000A7C000__000000931B9AFDF8 000000067F0000400200008A590000A7C000-000000067F0000400200008A590000A80000__0000001C725A2400 000000067F0000400200008A590000A7C000-000000067F0000400200008A590000A80000__0000001C760FA190 000000067F0000400200008A590000A7C000-000000067F0000400200008A590000A80000__00000038E67ABFA0 000000067F0000400200008A590000A7C000-000000067F0000400200008A590000A80000__0000003903F1CFE8 000000067F0000400200008A590000A7C000-000000067F0000400200008A590000A80000__0000003B99F7F8A0 
000000067F0000400200008A590000A7C000-000000067F0000400200008A590000A80000__0000005D2FFFFB38 000000067F0000400200008A590000A7C000-000000067F0000400200008A590000A80000__00000073AD3FE6B8 000000067F0000400200008A590000A7C000-000000067F0000400200008A590000A80000__000000914E3F38F0 000000067F0000400200008A590000A7C000-000000067F0000400200008A590000A80000__000000931B33AE68 000000067F0000400200008A590000A7C000-000000067F0000400200008A590000A80000__000000931B9AFDF8 000000067F0000400200008A590000A7C891-000000067F0000400200008A590000A85266__00000012F7CBDEA1-00000013977BD5E1 000000067F0000400200008A590000A80000-000000067F0000400200008A590000A84000__0000001C725A2400 000000067F0000400200008A590000A80000-000000067F0000400200008A590000A84000__0000001C760FA190 000000067F0000400200008A590000A80000-000000067F0000400200008A590000A84000__00000038E67ABFA0 000000067F0000400200008A590000A80000-000000067F0000400200008A590000A84000__0000003903F1CFE8 000000067F0000400200008A590000A80000-000000067F0000400200008A590000A84000__0000003B99F7F8A0 000000067F0000400200008A590000A80000-000000067F0000400200008A590000A84000__0000005D2FFFFB38 000000067F0000400200008A590000A80000-000000067F0000400200008A590000A84000__00000073AD3FE6B8 000000067F0000400200008A590000A80000-000000067F0000400200008A590000A84000__000000914E3F38F0 000000067F0000400200008A590000A80000-000000067F0000400200008A590000A84000__000000931B33AE68 000000067F0000400200008A590000A80000-000000067F0000400200008A590000A84000__000000931B9AFDF8 000000067F0000400200008A590000A84000-000000067F0000400200008A590000A88000__0000001C725A2400 000000067F0000400200008A590000A84000-000000067F0000400200008A590000A88000__0000001C760FA190 000000067F0000400200008A590000A84000-000000067F0000400200008A590000A88000__00000038E67ABFA0 000000067F0000400200008A590000A84000-000000067F0000400200008A590000A88000__0000003903F1CFE8 000000067F0000400200008A590000A84000-000000067F0000400200008A590000A88000__0000003B99F7F8A0 
000000067F0000400200008A590000A84000-000000067F0000400200008A590000A88000__0000005D2FFFFB38 000000067F0000400200008A590000A84000-000000067F0000400200008A590000A88000__00000073AD3FE6B8 000000067F0000400200008A590000A84000-000000067F0000400200008A590000A88000__000000914E3F38F0 000000067F0000400200008A590000A84000-000000067F0000400200008A590000A88000__000000931B33AE68 000000067F0000400200008A590000A84000-000000067F0000400200008A590000A88000__000000931B9AFDF8 000000067F0000400200008A590000A85266-000000067F0000400200008A590000A8DC37__00000012F7CBDEA1-00000013977BD5E1 000000067F0000400200008A590000A88000-000000067F0000400200008A590000A8C000__0000001C725A2400 000000067F0000400200008A590000A88000-000000067F0000400200008A590000A8C000__0000001C760FA190 000000067F0000400200008A590000A88000-000000067F0000400200008A590000A8C000__00000038E67ABFA0 000000067F0000400200008A590000A88000-000000067F0000400200008A590000A8C000__0000003903F1CFE8 000000067F0000400200008A590000A88000-000000067F0000400200008A590000A8C000__0000003B99F7F8A0 000000067F0000400200008A590000A88000-000000067F0000400200008A590000A8C000__0000005D2FFFFB38 000000067F0000400200008A590000A88000-000000067F0000400200008A590000A8C000__00000073AD3FE6B8 000000067F0000400200008A590000A88000-000000067F0000400200008A590000A8C000__000000914E3F38F0 000000067F0000400200008A590000A88000-000000067F0000400200008A590000A8C000__000000931B33AE68 000000067F0000400200008A590000A88000-000000067F0000400200008A590000A8C000__000000931B9AFDF8 000000067F0000400200008A590000A8C000-000000067F0000400200008A590000A90000__0000001C725A2400 000000067F0000400200008A590000A8C000-000000067F0000400200008A590000A90000__0000001C760FA190 000000067F0000400200008A590000A8C000-000000067F0000400200008A590000A90000__00000038E67ABFA0 000000067F0000400200008A590000A8C000-000000067F0000400200008A590000A90000__0000003903F1CFE8 000000067F0000400200008A590000A8C000-000000067F0000400200008A590000A90000__0000003B99F7F8A0 
000000067F0000400200008A590000A8C000-000000067F0000400200008A590000A90000__0000005D2FFFFB38 000000067F0000400200008A590000A8C000-000000067F0000400200008A590000A90000__00000073AD3FE6B8 000000067F0000400200008A590000A8C000-000000067F0000400200008A590000A90000__000000914E3F38F0 000000067F0000400200008A590000A8C000-000000067F0000400200008A590000A90000__000000931B33AE68 000000067F0000400200008A590000A8C000-000000067F0000400200008A590000A90000__000000931B9AFDF8 000000067F0000400200008A590000A8DC37-000000067F0000400200008A590000A9660D__00000012F7CBDEA1-00000013977BD5E1 000000067F0000400200008A590000A90000-000000067F0000400200008A590000A94000__0000001C725A2400 000000067F0000400200008A590000A90000-000000067F0000400200008A590000A94000__0000001C760FA190 000000067F0000400200008A590000A90000-000000067F0000400200008A590000A94000__00000038E67ABFA0 000000067F0000400200008A590000A90000-000000067F0000400200008A590000A94000__0000003903F1CFE8 000000067F0000400200008A590000A90000-000000067F0000400200008A590000A94000__0000003B99F7F8A0 000000067F0000400200008A590000A90000-000000067F0000400200008A590000A94000__0000005D2FFFFB38 000000067F0000400200008A590000A90000-000000067F0000400200008A590000A94000__00000073AD3FE6B8 000000067F0000400200008A590000A90000-000000067F0000400200008A590000A94000__000000914E3F38F0 000000067F0000400200008A590000A90000-000000067F0000400200008A590000A94000__000000931B33AE68 000000067F0000400200008A590000A90000-000000067F0000400200008A590000A94000__000000931B9AFDF8 000000067F0000400200008A590000A94000-000000067F0000400200008A590000A98000__000000146DBFF3C0 000000067F0000400200008A590000A94000-000000067F0000400200008A590000A98000__0000001C760FA190 000000067F0000400200008A590000A94000-000000067F0000400200008A590000A98000__00000038E67ABFA0 000000067F0000400200008A590000A94000-000000067F0000400200008A590000A98000__0000003903F1CFE8 000000067F0000400200008A590000A94000-000000067F0000400200008A590000A98000__0000003B99F7F8A0 
000000067F0000400200008A590000A94000-000000067F0000400200008A590000A98000__0000005D2FFFFB38 000000067F0000400200008A590000A94000-000000067F0000400200008A590000A98000__00000073AD3FE6B8 000000067F0000400200008A590000A94000-000000067F0000400200008A590000A98000__000000914E3F38F0 000000067F0000400200008A590000A94000-000000067F0000400200008A590000A98000__000000931B33AE68 000000067F0000400200008A590000A94000-000000067F0000400200008A590000A98000__000000931B9AFDF8 000000067F0000400200008A590000A9660D-000000067F0000400200008A590100000000__00000012F7CBDEA1-00000013977BD5E1 000000067F0000400200008A590000A968EA-000000067F0000400200008A590000A9F2CA__00000013977BD5E1-000000144723F489 000000067F0000400200008A590000A98000-000000067F0000400200008A590000A9C000__000000146DBFF3C0 000000067F0000400200008A590000A98000-000000067F0000400200008A590000A9C000__0000001C760FA190 000000067F0000400200008A590000A98000-000000067F0000400200008A590000A9C000__00000038E67ABFA0 000000067F0000400200008A590000A98000-000000067F0000400200008A590000A9C000__0000003903F1CFE8 000000067F0000400200008A590000A98000-000000067F0000400200008A590000A9C000__0000003B99F7F8A0 000000067F0000400200008A590000A98000-000000067F0000400200008A590000A9C000__0000005D2FFFFB38 000000067F0000400200008A590000A98000-000000067F0000400200008A590000A9C000__00000073AD3FE6B8 000000067F0000400200008A590000A98000-000000067F0000400200008A590000A9C000__000000914E3F38F0 000000067F0000400200008A590000A98000-000000067F0000400200008A590000A9C000__000000931B33AE68 000000067F0000400200008A590000A98000-000000067F0000400200008A590000A9C000__000000931B9AFDF8 000000067F0000400200008A590000A9C000-000000067F0000400200008A590000AA0000__000000146DBFF3C0 000000067F0000400200008A590000A9C000-000000067F0000400200008A590000AA0000__0000001C760FA190 000000067F0000400200008A590000A9C000-000000067F0000400200008A590000AA0000__00000038E67ABFA0 000000067F0000400200008A590000A9C000-000000067F0000400200008A590000AA0000__0000003903F1CFE8 
000000067F0000400200008A590000A9C000-000000067F0000400200008A590000AA0000__0000003B99F7F8A0 000000067F0000400200008A590000A9C000-000000067F0000400200008A590000AA0000__0000005D2FFFFB38 000000067F0000400200008A590000A9C000-000000067F0000400200008A590000AA0000__00000073AD3FE6B8 000000067F0000400200008A590000A9C000-000000067F0000400200008A590000AA0000__000000914E3F38F0 000000067F0000400200008A590000A9C000-000000067F0000400200008A590000AA0000__000000931B33AE68 000000067F0000400200008A590000A9C000-000000067F0000400200008A590000AA0000__000000931B9AFDF8 000000067F0000400200008A590000A9F2CA-000000067F0000400200008A590000AA7CAE__00000013977BD5E1-000000144723F489 000000067F0000400200008A590000AA0000-000000067F0000400200008A590000AA4000__000000146DBFF3C0 000000067F0000400200008A590000AA0000-000000067F0000400200008A590000AA4000__0000001C760FA190 000000067F0000400200008A590000AA0000-000000067F0000400200008A590000AA4000__00000038E67ABFA0 000000067F0000400200008A590000AA0000-000000067F0000400200008A590000AA4000__0000003903F1CFE8 000000067F0000400200008A590000AA0000-000000067F0000400200008A590000AA4000__0000003B99F7F8A0 000000067F0000400200008A590000AA0000-000000067F0000400200008A590000AA4000__0000005D2FFFFB38 000000067F0000400200008A590000AA0000-000000067F0000400200008A590000AA4000__00000073AD3FE6B8 000000067F0000400200008A590000AA0000-000000067F0000400200008A590000AA4000__000000914E3F38F0 000000067F0000400200008A590000AA0000-000000067F0000400200008A590000AA4000__000000931B33AE68 000000067F0000400200008A590000AA0000-000000067F0000400200008A590000AA4000__000000931B9AFDF8 000000067F0000400200008A590000AA4000-000000067F0000400200008A590000AA8000__000000146DBFF3C0 000000067F0000400200008A590000AA4000-000000067F0000400200008A590000AA8000__0000001C760FA190 000000067F0000400200008A590000AA4000-000000067F0000400200008A590000AA8000__00000038E67ABFA0 000000067F0000400200008A590000AA4000-000000067F0000400200008A590000AA8000__0000003903F1CFE8 
000000067F0000400200008A590000AA4000-000000067F0000400200008A590000AA8000__0000003B99F7F8A0 000000067F0000400200008A590000AA4000-000000067F0000400200008A590000AA8000__0000005D2FFFFB38 000000067F0000400200008A590000AA4000-000000067F0000400200008A590000AA8000__00000073AD3FE6B8 000000067F0000400200008A590000AA4000-000000067F0000400200008A590000AA8000__000000914E3F38F0 000000067F0000400200008A590000AA4000-000000067F0000400200008A590000AA8000__000000931B33AE68 000000067F0000400200008A590000AA4000-000000067F0000400200008A590000AA8000__000000931B9AFDF8 000000067F0000400200008A590000AA7CAE-000000067F0000400200008A590000AB0693__00000013977BD5E1-000000144723F489 000000067F0000400200008A590000AA8000-000000067F0000400200008A590000AAC000__000000146DBFF3C0 000000067F0000400200008A590000AA8000-000000067F0000400200008A590000AAC000__0000001C760FA190 000000067F0000400200008A590000AA8000-000000067F0000400200008A590000AAC000__00000038E67ABFA0 000000067F0000400200008A590000AA8000-000000067F0000400200008A590000AAC000__0000003903F1CFE8 000000067F0000400200008A590000AA8000-000000067F0000400200008A590000AAC000__0000003B99F7F8A0 000000067F0000400200008A590000AA8000-000000067F0000400200008A590000AAC000__0000005D2FFFFB38 000000067F0000400200008A590000AA8000-000000067F0000400200008A590000AAC000__00000073AD3FE6B8 000000067F0000400200008A590000AA8000-000000067F0000400200008A590000AAC000__000000914E3F38F0 000000067F0000400200008A590000AA8000-000000067F0000400200008A590000AAC000__000000931B33AE68 000000067F0000400200008A590000AA8000-000000067F0000400200008A590000AAC000__000000931B9AFDF8 000000067F0000400200008A590000AAC000-000000067F0000400200008A590000AB0000__000000146DBFF3C0 000000067F0000400200008A590000AAC000-000000067F0000400200008A590000AB0000__0000001C760FA190 000000067F0000400200008A590000AAC000-000000067F0000400200008A590000AB0000__00000038E67ABFA0 000000067F0000400200008A590000AAC000-000000067F0000400200008A590000AB0000__0000003903F1CFE8 
000000067F0000400200008A590000AAC000-000000067F0000400200008A590000AB0000__0000003B99F7F8A0 000000067F0000400200008A590000AAC000-000000067F0000400200008A590000AB0000__0000005D2FFFFB38 000000067F0000400200008A590000AAC000-000000067F0000400200008A590000AB0000__00000073AD3FE6B8 000000067F0000400200008A590000AAC000-000000067F0000400200008A590000AB0000__000000914E3F38F0 000000067F0000400200008A590000AAC000-000000067F0000400200008A590000AB0000__000000931B33AE68 000000067F0000400200008A590000AAC000-000000067F0000400200008A590000AB0000__000000931B9AFDF8 000000067F0000400200008A590000AB0000-000000067F0000400200008A590000AB4000__000000146DBFF3C0 000000067F0000400200008A590000AB0000-000000067F0000400200008A590000AB4000__0000001C760FA190 000000067F0000400200008A590000AB0000-000000067F0000400200008A590000AB4000__00000038E67ABFA0 000000067F0000400200008A590000AB0000-000000067F0000400200008A590000AB4000__0000003903F1CFE8 000000067F0000400200008A590000AB0000-000000067F0000400200008A590000AB4000__0000003B99F7F8A0 000000067F0000400200008A590000AB0000-000000067F0000400200008A590000AB4000__0000005D2FFFFB38 000000067F0000400200008A590000AB0000-000000067F0000400200008A590000AB4000__00000073AD3FE6B8 000000067F0000400200008A590000AB0000-000000067F0000400200008A590000AB4000__000000914E3F38F0 000000067F0000400200008A590000AB0000-000000067F0000400200008A590000AB4000__000000931B33AE68 000000067F0000400200008A590000AB0000-000000067F0000400200008A590000AB4000__000000931B9AFDF8 000000067F0000400200008A590000AB0693-000000067F0000400200008A590000AB9074__00000013977BD5E1-000000144723F489 000000067F0000400200008A590000AB4000-000000067F0000400200008A590000AB8000__000000146DBFF3C0 000000067F0000400200008A590000AB4000-000000067F0000400200008A590000AB8000__0000001C760FA190 000000067F0000400200008A590000AB4000-000000067F0000400200008A590000AB8000__00000038E67ABFA0 000000067F0000400200008A590000AB4000-000000067F0000400200008A590000AB8000__0000003903F1CFE8 
000000067F0000400200008A590000AB4000-000000067F0000400200008A590000AB8000__0000003B99F7F8A0 000000067F0000400200008A590000AB4000-000000067F0000400200008A590000AB8000__0000005D2FFFFB38 000000067F0000400200008A590000AB4000-000000067F0000400200008A590000AB8000__00000073AD3FE6B8 000000067F0000400200008A590000AB4000-000000067F0000400200008A590000AB8000__000000914E3F38F0 000000067F0000400200008A590000AB4000-000000067F0000400200008A590000AB8000__000000931B33AE68 000000067F0000400200008A590000AB4000-000000067F0000400200008A590000AB8000__000000931B9AFDF8 000000067F0000400200008A590000AB8000-000000067F0000400200008A590000ABC000__000000146DBFF3C0 000000067F0000400200008A590000AB8000-000000067F0000400200008A590000ABC000__0000001C760FA190 000000067F0000400200008A590000AB8000-000000067F0000400200008A590000ABC000__00000038E67ABFA0 000000067F0000400200008A590000AB8000-000000067F0000400200008A590000ABC000__0000003903F1CFE8 000000067F0000400200008A590000AB8000-000000067F0000400200008A590000ABC000__0000003B99F7F8A0 000000067F0000400200008A590000AB8000-000000067F0000400200008A590000ABC000__0000005D2FFFFB38 000000067F0000400200008A590000AB8000-000000067F0000400200008A590000ABC000__00000073AD3FE6B8 000000067F0000400200008A590000AB8000-000000067F0000400200008A590000ABC000__000000914E3F38F0 000000067F0000400200008A590000AB8000-000000067F0000400200008A590000ABC000__000000931B33AE68 000000067F0000400200008A590000AB8000-000000067F0000400200008A590000ABC000__000000931B9AFDF8 000000067F0000400200008A590000AB9074-000000067F0000400200008A590000AC1A4D__00000013977BD5E1-000000144723F489 000000067F0000400200008A590000ABC000-000000067F0000400200008A590000AC0000__000000146DBFF3C0 000000067F0000400200008A590000ABC000-000000067F0000400200008A590000AC0000__0000001C760FA190 000000067F0000400200008A590000ABC000-000000067F0000400200008A590000AC0000__00000038E67ABFA0 000000067F0000400200008A590000ABC000-000000067F0000400200008A590000AC0000__0000003903F1CFE8 
000000067F0000400200008A590000ABC000-000000067F0000400200008A590000AC0000__0000003B99F7F8A0 000000067F0000400200008A590000ABC000-000000067F0000400200008A590000AC0000__0000005D2FFFFB38 000000067F0000400200008A590000ABC000-000000067F0000400200008A590000AC0000__00000073AD3FE6B8 000000067F0000400200008A590000ABC000-000000067F0000400200008A590000AC0000__000000914E3F38F0 000000067F0000400200008A590000ABC000-000000067F0000400200008A590000AC0000__000000931B33AE68 000000067F0000400200008A590000ABC000-000000067F0000400200008A590000AC0000__000000931B9AFDF8 000000067F0000400200008A590000AC0000-000000067F0000400200008A590000AC4000__000000146DBFF3C0 000000067F0000400200008A590000AC0000-000000067F0000400200008A590000AC4000__0000001C760FA190 000000067F0000400200008A590000AC0000-000000067F0000400200008A590000AC4000__00000038E67ABFA0 000000067F0000400200008A590000AC0000-000000067F0000400200008A590000AC4000__0000003903F1CFE8 000000067F0000400200008A590000AC0000-000000067F0000400200008A590000AC4000__0000003B99F7F8A0 000000067F0000400200008A590000AC0000-000000067F0000400200008A590000AC4000__0000005D2FFFFB38 000000067F0000400200008A590000AC0000-000000067F0000400200008A590000AC4000__00000073AD3FE6B8 000000067F0000400200008A590000AC0000-000000067F0000400200008A590000AC4000__000000914E3F38F0 000000067F0000400200008A590000AC0000-000000067F0000400200008A590000AC4000__000000931B33AE68 000000067F0000400200008A590000AC0000-000000067F0000400200008A590000AC4000__000000931B9AFDF8 000000067F0000400200008A590000AC1A4D-000000067F0000400200008A590000ACA420__00000013977BD5E1-000000144723F489 000000067F0000400200008A590000AC4000-000000067F0000400200008A590000AC8000__000000146DBFF3C0 000000067F0000400200008A590000AC4000-000000067F0000400200008A590000AC8000__0000001C760FA190 000000067F0000400200008A590000AC4000-000000067F0000400200008A590000AC8000__00000038E67ABFA0 000000067F0000400200008A590000AC4000-000000067F0000400200008A590000AC8000__0000003903F1CFE8 
000000067F0000400200008A590000AC4000-000000067F0000400200008A590000AC8000__0000003B99F7F8A0 000000067F0000400200008A590000AC4000-000000067F0000400200008A590000AC8000__0000005D2FFFFB38 000000067F0000400200008A590000AC4000-000000067F0000400200008A590000AC8000__00000073AD3FE6B8 000000067F0000400200008A590000AC4000-000000067F0000400200008A590000AC8000__000000914E3F38F0 000000067F0000400200008A590000AC4000-000000067F0000400200008A590000AC8000__000000931B33AE68 000000067F0000400200008A590000AC4000-000000067F0000400200008A590000AC8000__000000931B9AFDF8 000000067F0000400200008A590000AC8000-000000067F0000400200008A590000ACC000__000000146DBFF3C0 000000067F0000400200008A590000AC8000-000000067F0000400200008A590000ACC000__0000001C760FA190 000000067F0000400200008A590000AC8000-000000067F0000400200008A590000ACC000__00000038E67ABFA0 000000067F0000400200008A590000AC8000-000000067F0000400200008A590000ACC000__0000003903F1CFE8 000000067F0000400200008A590000AC8000-000000067F0000400200008A590000ACC000__0000003B99F7F8A0 000000067F0000400200008A590000AC8000-000000067F0000400200008A590000ACC000__0000005D2FFFFB38 000000067F0000400200008A590000AC8000-000000067F0000400200008A590000ACC000__00000073AD3FE6B8 000000067F0000400200008A590000AC8000-000000067F0000400200008A590000ACC000__000000914E3F38F0 000000067F0000400200008A590000AC8000-000000067F0000400200008A590000ACC000__000000931B33AE68 000000067F0000400200008A590000AC8000-000000067F0000400200008A590000ACC000__000000931B9AFDF8 000000067F0000400200008A590000ACA420-000000067F0000400200008A590000AD2DFB__00000013977BD5E1-000000144723F489 000000067F0000400200008A590000ACC000-000000067F0000400200008A590000AD0000__000000146DBFF3C0 000000067F0000400200008A590000ACC000-000000067F0000400200008A590000AD0000__0000001C760FA190 000000067F0000400200008A590000ACC000-000000067F0000400200008A590000AD0000__00000038E67ABFA0 000000067F0000400200008A590000ACC000-000000067F0000400200008A590000AD0000__0000003903F1CFE8 
000000067F0000400200008A590000ACC000-000000067F0000400200008A590000AD0000__0000003B99F7F8A0 000000067F0000400200008A590000ACC000-000000067F0000400200008A590000AD0000__0000005D2FFFFB38 000000067F0000400200008A590000ACC000-000000067F0000400200008A590000AD0000__00000073AD3FE6B8 000000067F0000400200008A590000ACC000-000000067F0000400200008A590000AD0000__000000914E3F38F0 000000067F0000400200008A590000ACC000-000000067F0000400200008A590000AD0000__000000931B33AE68 000000067F0000400200008A590000ACC000-000000067F0000400200008A590000AD0000__000000931B9AFDF8 000000067F0000400200008A590000AD0000-000000067F0000400200008A590000AD4000__000000146DBFF3C0 000000067F0000400200008A590000AD0000-000000067F0000400200008A590000AD4000__0000001C760FA190 000000067F0000400200008A590000AD0000-000000067F0000400200008A590000AD4000__00000038E67ABFA0 000000067F0000400200008A590000AD0000-000000067F0000400200008A590000AD4000__0000003903F1CFE8 000000067F0000400200008A590000AD0000-000000067F0000400200008A590000AD4000__0000003B99F7F8A0 000000067F0000400200008A590000AD0000-000000067F0000400200008A590000AD4000__0000005D2FFFFB38 000000067F0000400200008A590000AD0000-000000067F0000400200008A590000AD4000__00000073AD3FE6B8 000000067F0000400200008A590000AD0000-000000067F0000400200008A590000AD4000__000000914E3F38F0 000000067F0000400200008A590000AD0000-000000067F0000400200008A590000AD4000__000000931B33AE68 000000067F0000400200008A590000AD0000-000000067F0000400200008A590000AD4000__000000931B9AFDF8 000000067F0000400200008A590000AD2DFB-000000067F0000400200008A590000ADB7D7__00000013977BD5E1-000000144723F489 000000067F0000400200008A590000AD4000-000000067F0000400200008A590000AD8000__000000146DBFF3C0 000000067F0000400200008A590000AD4000-000000067F0000400200008A590000AD8000__0000001C760FA190 000000067F0000400200008A590000AD4000-000000067F0000400200008A590000AD8000__00000038E67ABFA0 000000067F0000400200008A590000AD4000-000000067F0000400200008A590000AD8000__0000003903F1CFE8 
000000067F0000400200008A590000AD4000-000000067F0000400200008A590000AD8000__0000003B99F7F8A0 000000067F0000400200008A590000AD4000-000000067F0000400200008A590000AD8000__0000005D2FFFFB38 000000067F0000400200008A590000AD4000-000000067F0000400200008A590000AD8000__00000073AD3FE6B8 000000067F0000400200008A590000AD4000-000000067F0000400200008A590000AD8000__000000914E3F38F0 000000067F0000400200008A590000AD4000-000000067F0000400200008A590000AD8000__000000931B33AE68 000000067F0000400200008A590000AD4000-000000067F0000400200008A590000AD8000__000000931B9AFDF8 000000067F0000400200008A590000AD8000-000000067F0000400200008A590000ADC000__000000146DBFF3C0 000000067F0000400200008A590000AD8000-000000067F0000400200008A590000ADC000__0000001C760FA190 000000067F0000400200008A590000AD8000-000000067F0000400200008A590000ADC000__00000038E67ABFA0 000000067F0000400200008A590000AD8000-000000067F0000400200008A590000ADC000__0000003903F1CFE8 000000067F0000400200008A590000AD8000-000000067F0000400200008A590000ADC000__0000003B99F7F8A0 000000067F0000400200008A590000AD8000-000000067F0000400200008A590000ADC000__0000005D2FFFFB38 000000067F0000400200008A590000AD8000-000000067F0000400200008A590000ADC000__00000073AD3FE6B8 000000067F0000400200008A590000AD8000-000000067F0000400200008A590000ADC000__000000914E3F38F0 000000067F0000400200008A590000AD8000-000000067F0000400200008A590000ADC000__000000931B33AE68 000000067F0000400200008A590000AD8000-000000067F0000400200008A590000ADC000__000000931B9AFDF8 000000067F0000400200008A590000ADB7D7-000000067F0000400200008A590000AE41BC__00000013977BD5E1-000000144723F489 000000067F0000400200008A590000ADC000-000000067F0000400200008A590000AE0000__000000146DBFF3C0 000000067F0000400200008A590000ADC000-000000067F0000400200008A590000AE0000__0000001C760FA190 000000067F0000400200008A590000ADC000-000000067F0000400200008A590000AE0000__00000038E67ABFA0 000000067F0000400200008A590000ADC000-000000067F0000400200008A590000AE0000__0000003903F1CFE8 
000000067F0000400200008A590000ADC000-000000067F0000400200008A590000AE0000__0000003B99F7F8A0 000000067F0000400200008A590000ADC000-000000067F0000400200008A590000AE0000__0000005D2FFFFB38 000000067F0000400200008A590000ADC000-000000067F0000400200008A590000AE0000__00000073AD3FE6B8 000000067F0000400200008A590000ADC000-000000067F0000400200008A590000AE0000__000000914E3F38F0 000000067F0000400200008A590000ADC000-000000067F0000400200008A590000AE0000__000000931B33AE68 000000067F0000400200008A590000ADC000-000000067F0000400200008A590000AE0000__000000931B9AFDF8 000000067F0000400200008A590000AE0000-000000067F0000400200008A590000AE4000__000000146DBFF3C0 000000067F0000400200008A590000AE0000-000000067F0000400200008A590000AE4000__0000001C760FA190 000000067F0000400200008A590000AE0000-000000067F0000400200008A590000AE4000__00000038E67ABFA0 000000067F0000400200008A590000AE0000-000000067F0000400200008A590000AE4000__0000003903F1CFE8 000000067F0000400200008A590000AE0000-000000067F0000400200008A590000AE4000__0000003B99F7F8A0 000000067F0000400200008A590000AE0000-000000067F0000400200008A590000AE4000__0000005D2FFFFB38 000000067F0000400200008A590000AE0000-000000067F0000400200008A590000AE4000__00000073AD3FE6B8 000000067F0000400200008A590000AE0000-000000067F0000400200008A590000AE4000__000000914E3F38F0 000000067F0000400200008A590000AE0000-000000067F0000400200008A590000AE4000__000000931B33AE68 000000067F0000400200008A590000AE0000-000000067F0000400200008A590000AE4000__000000931B9AFDF8 000000067F0000400200008A590000AE4000-000000067F0000400200008A590000AE8000__000000146DBFF3C0 000000067F0000400200008A590000AE4000-000000067F0000400200008A590000AE8000__0000001C760FA190 000000067F0000400200008A590000AE4000-000000067F0000400200008A590000AE8000__00000038E67ABFA0 000000067F0000400200008A590000AE4000-000000067F0000400200008A590000AE8000__0000003903F1CFE8 000000067F0000400200008A590000AE4000-000000067F0000400200008A590000AE8000__0000003B99F7F8A0 
000000067F0000400200008A590000AE4000-000000067F0000400200008A590000AE8000__0000005D2FFFFB38 000000067F0000400200008A590000AE4000-000000067F0000400200008A590000AE8000__00000073AD3FE6B8 000000067F0000400200008A590000AE4000-000000067F0000400200008A590000AE8000__000000914E3F38F0 000000067F0000400200008A590000AE4000-000000067F0000400200008A590000AE8000__000000931B33AE68 000000067F0000400200008A590000AE4000-000000067F0000400200008A590000AE8000__000000931B9AFDF8 000000067F0000400200008A590000AE41BC-000000067F0000400200008A590000AECBAC__00000013977BD5E1-000000144723F489 000000067F0000400200008A590000AE8000-000000067F0000400200008A590000AEC000__000000146DBFF3C0 000000067F0000400200008A590000AE8000-000000067F0000400200008A590000AEC000__0000001C760FA190 000000067F0000400200008A590000AE8000-000000067F0000400200008A590000AEC000__00000038E67ABFA0 000000067F0000400200008A590000AE8000-000000067F0000400200008A590000AEC000__0000003903F1CFE8 000000067F0000400200008A590000AE8000-000000067F0000400200008A590000AEC000__0000003B99F7F8A0 000000067F0000400200008A590000AE8000-000000067F0000400200008A590000AEC000__0000005D2FFFFB38 000000067F0000400200008A590000AE8000-000000067F0000400200008A590000AEC000__00000073AD3FE6B8 000000067F0000400200008A590000AE8000-000000067F0000400200008A590000AEC000__000000914E3F38F0 000000067F0000400200008A590000AE8000-000000067F0000400200008A590000AEC000__000000931B33AE68 000000067F0000400200008A590000AE8000-000000067F0000400200008A590000AEC000__000000931B9AFDF8 000000067F0000400200008A590000AEC000-000000067F0000400200008A590000AF0000__000000146DBFF3C0 000000067F0000400200008A590000AEC000-000000067F0000400200008A590000AF0000__0000001C760FA190 000000067F0000400200008A590000AEC000-000000067F0000400200008A590000AF0000__00000038E67ABFA0 000000067F0000400200008A590000AEC000-000000067F0000400200008A590000AF0000__0000003903F1CFE8 000000067F0000400200008A590000AEC000-000000067F0000400200008A590000AF0000__0000003B99F7F8A0 
000000067F0000400200008A590000AEC000-000000067F0000400200008A590000AF0000__0000005D2FFFFB38 000000067F0000400200008A590000AEC000-000000067F0000400200008A590000AF0000__00000073AD3FE6B8 000000067F0000400200008A590000AEC000-000000067F0000400200008A590000AF0000__000000914E3F38F0 000000067F0000400200008A590000AEC000-000000067F0000400200008A590000AF0000__000000931B33AE68 000000067F0000400200008A590000AEC000-000000067F0000400200008A590000AF0000__000000931B9AFDF8 000000067F0000400200008A590000AECBAC-000000067F0000400200008A590000AF558D__00000013977BD5E1-000000144723F489 000000067F0000400200008A590000AF0000-000000067F0000400200008A590000AF4000__000000146DBFF3C0 000000067F0000400200008A590000AF0000-000000067F0000400200008A590000AF4000__0000001C760FA190 000000067F0000400200008A590000AF0000-000000067F0000400200008A590000AF4000__00000038E67ABFA0 000000067F0000400200008A590000AF0000-000000067F0000400200008A590000AF4000__0000003903F1CFE8 000000067F0000400200008A590000AF0000-000000067F0000400200008A590000AF4000__0000003B99F7F8A0 000000067F0000400200008A590000AF0000-000000067F0000400200008A590000AF4000__0000005D2FFFFB38 000000067F0000400200008A590000AF0000-000000067F0000400200008A590000AF4000__00000073AD3FE6B8 000000067F0000400200008A590000AF0000-000000067F0000400200008A590000AF4000__000000914E3F38F0 000000067F0000400200008A590000AF0000-000000067F0000400200008A590000AF4000__000000931B33AE68 000000067F0000400200008A590000AF0000-000000067F0000400200008A590000AF4000__000000931B9AFDF8 000000067F0000400200008A590000AF4000-000000067F0000400200008A590000AF8000__000000146DBFF3C0 000000067F0000400200008A590000AF4000-000000067F0000400200008A590000AF8000__0000001C760FA190 000000067F0000400200008A590000AF4000-000000067F0000400200008A590000AF8000__00000038E67ABFA0 000000067F0000400200008A590000AF4000-000000067F0000400200008A590000AF8000__0000003903F1CFE8 000000067F0000400200008A590000AF4000-000000067F0000400200008A590000AF8000__0000003B99F7F8A0 
000000067F0000400200008A590000AF4000-000000067F0000400200008A590000AF8000__0000005D2FFFFB38 000000067F0000400200008A590000AF4000-000000067F0000400200008A590000AF8000__00000073AD3FE6B8 000000067F0000400200008A590000AF4000-000000067F0000400200008A590000AF8000__000000914E3F38F0 000000067F0000400200008A590000AF4000-000000067F0000400200008A590000AF8000__000000931B33AE68 000000067F0000400200008A590000AF4000-000000067F0000400200008A590000AF8000__000000931B9AFDF8 000000067F0000400200008A590000AF558D-000000067F0000400200008A590100000000__00000013977BD5E1-000000144723F489 000000067F0000400200008A590000AF5892-000000067F0000400200008A590000AFE262__000000144723F489-00000014E6D3F501 000000067F0000400200008A590000AF8000-000000067F0000400200008A590000AFC000__000000146DBFF3C0 000000067F0000400200008A590000AF8000-000000067F0000400200008A590000AFC000__0000001C760FA190 000000067F0000400200008A590000AF8000-000000067F0000400200008A590000AFC000__00000038E67ABFA0 000000067F0000400200008A590000AF8000-000000067F0000400200008A590000AFC000__0000003903F1CFE8 000000067F0000400200008A590000AF8000-000000067F0000400200008A590000AFC000__0000003B99F7F8A0 000000067F0000400200008A590000AF8000-000000067F0000400200008A590000AFC000__0000005D2FFFFB38 000000067F0000400200008A590000AF8000-000000067F0000400200008A590000AFC000__00000073AD3FE6B8 000000067F0000400200008A590000AF8000-000000067F0000400200008A590000AFC000__000000914E3F38F0 000000067F0000400200008A590000AF8000-000000067F0000400200008A590000AFC000__000000931B33AE68 000000067F0000400200008A590000AF8000-000000067F0000400200008A590000AFC000__000000931B9AFDF8 000000067F0000400200008A590000AFC000-000000067F0000400200008A590000B00000__000000146DBFF3C0 000000067F0000400200008A590000AFC000-000000067F0000400200008A590000B00000__0000001C760FA190 000000067F0000400200008A590000AFC000-000000067F0000400200008A590000B00000__00000038E67ABFA0 000000067F0000400200008A590000AFC000-000000067F0000400200008A590000B00000__0000003903F1CFE8 
000000067F0000400200008A590000AFC000-000000067F0000400200008A590000B00000__0000003B99F7F8A0 000000067F0000400200008A590000AFC000-000000067F0000400200008A590000B00000__0000005D2FFFFB38 000000067F0000400200008A590000AFC000-000000067F0000400200008A590000B00000__00000073AD3FE6B8 000000067F0000400200008A590000AFC000-000000067F0000400200008A590000B00000__000000914E3F38F0 000000067F0000400200008A590000AFC000-000000067F0000400200008A590000B00000__000000931B33AE68 000000067F0000400200008A590000AFC000-000000067F0000400200008A590000B00000__000000931B9AFDF8 000000067F0000400200008A590000AFE262-000000067F0000400200008A590000B06C3B__000000144723F489-00000014E6D3F501 000000067F0000400200008A590000B00000-000000067F0000400200008A590000B04000__000000146DBFF3C0 000000067F0000400200008A590000B00000-000000067F0000400200008A590000B04000__0000001C760FA190 000000067F0000400200008A590000B00000-000000067F0000400200008A590000B04000__00000038E67ABFA0 000000067F0000400200008A590000B00000-000000067F0000400200008A590000B04000__0000003903F1CFE8 000000067F0000400200008A590000B00000-000000067F0000400200008A590000B04000__0000003B99F7F8A0 000000067F0000400200008A590000B00000-000000067F0000400200008A590000B04000__0000005D2FFFFB38 000000067F0000400200008A590000B00000-000000067F0000400200008A590000B04000__00000073AD3FE6B8 000000067F0000400200008A590000B00000-000000067F0000400200008A590000B04000__000000914E3F38F0 000000067F0000400200008A590000B00000-000000067F0000400200008A590000B04000__000000931B33AE68 000000067F0000400200008A590000B00000-000000067F0000400200008A590000B04000__000000931B9AFDF8 000000067F0000400200008A590000B04000-000000067F0000400200008A590000B08000__000000146DBFF3C0 000000067F0000400200008A590000B04000-000000067F0000400200008A590000B08000__0000001C760FA190 000000067F0000400200008A590000B04000-000000067F0000400200008A590000B08000__00000038E67ABFA0 000000067F0000400200008A590000B04000-000000067F0000400200008A590000B08000__0000003903F1CFE8 
000000067F0000400200008A590000B04000-000000067F0000400200008A590000B08000__0000003B99F7F8A0 000000067F0000400200008A590000B04000-000000067F0000400200008A590000B08000__0000005D2FFFFB38 000000067F0000400200008A590000B04000-000000067F0000400200008A590000B08000__00000073AD3FE6B8 000000067F0000400200008A590000B04000-000000067F0000400200008A590000B08000__000000914E3F38F0 000000067F0000400200008A590000B04000-000000067F0000400200008A590000B08000__000000931B33AE68 000000067F0000400200008A590000B04000-000000067F0000400200008A590000B08000__000000931B9AFDF8 000000067F0000400200008A590000B06C3B-000000067F0000400200008A590000B0F60D__000000144723F489-00000014E6D3F501 000000067F0000400200008A590000B08000-000000067F0000400200008A590000B0C000__0000001C760FA190 000000067F0000400200008A590000B08000-000000067F0000400200008A590000B0C000__00000038E67ABFA0 000000067F0000400200008A590000B08000-000000067F0000400200008A590000B0C000__0000003903F1CFE8 000000067F0000400200008A590000B08000-000000067F0000400200008A590000B0C000__0000003B99F7F8A0 000000067F0000400200008A590000B08000-000000067F0000400200008A590000B0C000__0000005D2FFFFB38 000000067F0000400200008A590000B08000-000000067F0000400200008A590000B0C000__00000073AD3FE6B8 000000067F0000400200008A590000B08000-000000067F0000400200008A590000B0C000__000000914E3F38F0 000000067F0000400200008A590000B08000-000000067F0000400200008A590000B0C000__000000931B33AE68 000000067F0000400200008A590000B08000-000000067F0000400200008A590000B0C000__000000931B9AFDF8 000000067F0000400200008A590000B08000-030000000000000000000000000000000002__000000146DBFF3C0 000000067F0000400200008A590000B0C000-000000067F0000400200008A590000B10000__0000001C760FA190 000000067F0000400200008A590000B0C000-000000067F0000400200008A590000B10000__00000038E67ABFA0 000000067F0000400200008A590000B0C000-000000067F0000400200008A590000B10000__0000003903F1CFE8 000000067F0000400200008A590000B0C000-000000067F0000400200008A590000B10000__0000003B99F7F8A0 
000000067F0000400200008A590000B0C000-000000067F0000400200008A590000B10000__0000005D2FFFFB38 000000067F0000400200008A590000B0C000-000000067F0000400200008A590000B10000__00000073AD3FE6B8 000000067F0000400200008A590000B0C000-000000067F0000400200008A590000B10000__000000914E3F38F0 000000067F0000400200008A590000B0C000-000000067F0000400200008A590000B10000__000000931B33AE68 000000067F0000400200008A590000B0C000-000000067F0000400200008A590000B10000__000000931B9AFDF8 000000067F0000400200008A590000B0F60D-000000067F0000400200008A590000B17FE6__000000144723F489-00000014E6D3F501 000000067F0000400200008A590000B10000-000000067F0000400200008A590000B14000__0000001C760FA190 000000067F0000400200008A590000B10000-000000067F0000400200008A590000B14000__00000038E67ABFA0 000000067F0000400200008A590000B10000-000000067F0000400200008A590000B14000__0000003903F1CFE8 000000067F0000400200008A590000B10000-000000067F0000400200008A590000B14000__0000003B99F7F8A0 000000067F0000400200008A590000B10000-000000067F0000400200008A590000B14000__0000005D2FFFFB38 000000067F0000400200008A590000B10000-000000067F0000400200008A590000B14000__00000073AD3FE6B8 000000067F0000400200008A590000B10000-000000067F0000400200008A590000B14000__000000914E3F38F0 000000067F0000400200008A590000B10000-000000067F0000400200008A590000B14000__000000931B33AE68 000000067F0000400200008A590000B10000-000000067F0000400200008A590000B14000__000000931B9AFDF8 000000067F0000400200008A590000B14000-000000067F0000400200008A590000B18000__0000001C760FA190 000000067F0000400200008A590000B14000-000000067F0000400200008A590000B18000__00000038E67ABFA0 000000067F0000400200008A590000B14000-000000067F0000400200008A590000B18000__0000003903F1CFE8 000000067F0000400200008A590000B14000-000000067F0000400200008A590000B18000__0000003B99F7F8A0 000000067F0000400200008A590000B14000-000000067F0000400200008A590000B18000__0000005D2FFFFB38 000000067F0000400200008A590000B14000-000000067F0000400200008A590000B18000__00000073AD3FE6B8 
000000067F0000400200008A590000B14000-000000067F0000400200008A590000B18000__000000914E3F38F0 000000067F0000400200008A590000B14000-000000067F0000400200008A590000B18000__000000931B33AE68 000000067F0000400200008A590000B14000-000000067F0000400200008A590000B18000__000000931B9AFDF8 000000067F0000400200008A590000B17FE6-000000067F0000400200008A590000B209C7__000000144723F489-00000014E6D3F501 000000067F0000400200008A590000B18000-000000067F0000400200008A590000B1C000__0000001C760FA190 000000067F0000400200008A590000B18000-000000067F0000400200008A590000B1C000__00000038E67ABFA0 000000067F0000400200008A590000B18000-000000067F0000400200008A590000B1C000__0000003903F1CFE8 000000067F0000400200008A590000B18000-000000067F0000400200008A590000B1C000__0000003B99F7F8A0 000000067F0000400200008A590000B18000-000000067F0000400200008A590000B1C000__0000005D2FFFFB38 000000067F0000400200008A590000B18000-000000067F0000400200008A590000B1C000__00000073AD3FE6B8 000000067F0000400200008A590000B18000-000000067F0000400200008A590000B1C000__000000914E3F38F0 000000067F0000400200008A590000B18000-000000067F0000400200008A590000B1C000__000000931B33AE68 000000067F0000400200008A590000B18000-000000067F0000400200008A590000B1C000__000000931B9AFDF8 000000067F0000400200008A590000B1C000-000000067F0000400200008A590000B20000__0000001C760FA190 000000067F0000400200008A590000B1C000-000000067F0000400200008A590000B20000__00000038E67ABFA0 000000067F0000400200008A590000B1C000-000000067F0000400200008A590000B20000__0000003903F1CFE8 000000067F0000400200008A590000B1C000-000000067F0000400200008A590000B20000__0000003B99F7F8A0 000000067F0000400200008A590000B1C000-000000067F0000400200008A590000B20000__0000005D2FFFFB38 000000067F0000400200008A590000B1C000-000000067F0000400200008A590000B20000__00000073AD3FE6B8 000000067F0000400200008A590000B1C000-000000067F0000400200008A590000B20000__000000914E3F38F0 000000067F0000400200008A590000B1C000-000000067F0000400200008A590000B20000__000000931B33AE68 
000000067F0000400200008A590000B1C000-000000067F0000400200008A590000B20000__000000931B9AFDF8 000000067F0000400200008A590000B20000-000000067F0000400200008A590000B24000__0000001C760FA190 000000067F0000400200008A590000B20000-000000067F0000400200008A590000B24000__00000038E67ABFA0 000000067F0000400200008A590000B20000-000000067F0000400200008A590000B24000__0000003903F1CFE8 000000067F0000400200008A590000B20000-000000067F0000400200008A590000B24000__0000003B99F7F8A0 000000067F0000400200008A590000B20000-000000067F0000400200008A590000B24000__0000005D2FFFFB38 000000067F0000400200008A590000B20000-000000067F0000400200008A590000B24000__00000073AD3FE6B8 000000067F0000400200008A590000B20000-000000067F0000400200008A590000B24000__000000914E3F38F0 000000067F0000400200008A590000B20000-000000067F0000400200008A590000B24000__000000931B33AE68 000000067F0000400200008A590000B20000-000000067F0000400200008A590000B24000__000000931B9AFDF8 000000067F0000400200008A590000B209C7-000000067F0000400200008A590000B293BF__000000144723F489-00000014E6D3F501 000000067F0000400200008A590000B24000-000000067F0000400200008A590000B28000__0000001C760FA190 000000067F0000400200008A590000B24000-000000067F0000400200008A590000B28000__00000038E67ABFA0 000000067F0000400200008A590000B24000-000000067F0000400200008A590000B28000__0000003903F1CFE8 000000067F0000400200008A590000B24000-000000067F0000400200008A590000B28000__0000003B99F7F8A0 000000067F0000400200008A590000B24000-000000067F0000400200008A590000B28000__0000005D2FFFFB38 000000067F0000400200008A590000B24000-000000067F0000400200008A590000B28000__00000073AD3FE6B8 000000067F0000400200008A590000B24000-000000067F0000400200008A590000B28000__000000914E3F38F0 000000067F0000400200008A590000B24000-000000067F0000400200008A590000B28000__000000931B33AE68 000000067F0000400200008A590000B24000-000000067F0000400200008A590000B28000__000000931B9AFDF8 000000067F0000400200008A590000B28000-000000067F0000400200008A590000B2C000__0000001C760FA190 
000000067F0000400200008A590000B28000-000000067F0000400200008A590000B2C000__00000038E67ABFA0 000000067F0000400200008A590000B28000-000000067F0000400200008A590000B2C000__0000003903F1CFE8 000000067F0000400200008A590000B28000-000000067F0000400200008A590000B2C000__0000003B99F7F8A0 000000067F0000400200008A590000B28000-000000067F0000400200008A590000B2C000__0000005D2FFFFB38 000000067F0000400200008A590000B28000-000000067F0000400200008A590000B2C000__00000073AD3FE6B8 000000067F0000400200008A590000B28000-000000067F0000400200008A590000B2C000__000000914E3F38F0 000000067F0000400200008A590000B28000-000000067F0000400200008A590000B2C000__000000931B33AE68 000000067F0000400200008A590000B28000-000000067F0000400200008A590000B2C000__000000931B9AFDF8 000000067F0000400200008A590000B293BF-000000067F0000400200008A590000B31D9F__000000144723F489-00000014E6D3F501 000000067F0000400200008A590000B2C000-000000067F0000400200008A590000B30000__0000001C760FA190 000000067F0000400200008A590000B2C000-000000067F0000400200008A590000B30000__00000038E67ABFA0 000000067F0000400200008A590000B2C000-000000067F0000400200008A590000B30000__0000003903F1CFE8 000000067F0000400200008A590000B2C000-000000067F0000400200008A590000B30000__0000003B99F7F8A0 000000067F0000400200008A590000B2C000-000000067F0000400200008A590000B30000__0000005D2FFFFB38 000000067F0000400200008A590000B2C000-000000067F0000400200008A590000B30000__00000073AD3FE6B8 000000067F0000400200008A590000B2C000-000000067F0000400200008A590000B30000__000000914E3F38F0 000000067F0000400200008A590000B2C000-000000067F0000400200008A590000B30000__000000931B33AE68 000000067F0000400200008A590000B2C000-000000067F0000400200008A590000B30000__000000931B9AFDF8 000000067F0000400200008A590000B30000-000000067F0000400200008A590000B34000__0000001C760FA190 000000067F0000400200008A590000B30000-000000067F0000400200008A590000B34000__00000038E67ABFA0 000000067F0000400200008A590000B30000-000000067F0000400200008A590000B34000__0000003903F1CFE8 
000000067F0000400200008A590000B30000-000000067F0000400200008A590000B34000__0000003B99F7F8A0 000000067F0000400200008A590000B30000-000000067F0000400200008A590000B34000__0000005D2FFFFB38 000000067F0000400200008A590000B30000-000000067F0000400200008A590000B34000__00000073AD3FE6B8 000000067F0000400200008A590000B30000-000000067F0000400200008A590000B34000__000000914E3F38F0 000000067F0000400200008A590000B30000-000000067F0000400200008A590000B34000__000000931B33AE68 000000067F0000400200008A590000B30000-000000067F0000400200008A590000B34000__000000931B9AFDF8 000000067F0000400200008A590000B31D9F-000000067F0000400200008A590000B3A77A__000000144723F489-00000014E6D3F501 000000067F0000400200008A590000B34000-000000067F0000400200008A590000B38000__0000001C760FA190 000000067F0000400200008A590000B34000-000000067F0000400200008A590000B38000__00000038E67ABFA0 000000067F0000400200008A590000B34000-000000067F0000400200008A590000B38000__0000003903F1CFE8 000000067F0000400200008A590000B34000-000000067F0000400200008A590000B38000__0000003B99F7F8A0 000000067F0000400200008A590000B34000-000000067F0000400200008A590000B38000__0000005D2FFFFB38 000000067F0000400200008A590000B34000-000000067F0000400200008A590000B38000__00000073AD3FE6B8 000000067F0000400200008A590000B34000-000000067F0000400200008A590000B38000__000000914E3F38F0 000000067F0000400200008A590000B34000-000000067F0000400200008A590000B38000__000000931B33AE68 000000067F0000400200008A590000B34000-000000067F0000400200008A590000B38000__000000931B9AFDF8 000000067F0000400200008A590000B38000-000000067F0000400200008A590000B3C000__0000001C760FA190 000000067F0000400200008A590000B38000-000000067F0000400200008A590000B3C000__00000038E67ABFA0 000000067F0000400200008A590000B38000-000000067F0000400200008A590000B3C000__0000003903F1CFE8 000000067F0000400200008A590000B38000-000000067F0000400200008A590000B3C000__0000003B99F7F8A0 000000067F0000400200008A590000B38000-000000067F0000400200008A590000B3C000__0000005D2FFFFB38 
000000067F0000400200008A590000B38000-000000067F0000400200008A590000B3C000__00000073AD3FE6B8 000000067F0000400200008A590000B38000-000000067F0000400200008A590000B3C000__000000914E3F38F0 000000067F0000400200008A590000B38000-000000067F0000400200008A590000B3C000__000000931B33AE68 000000067F0000400200008A590000B38000-000000067F0000400200008A590000B3C000__000000931B9AFDF8 000000067F0000400200008A590000B3A77A-000000067F0000400200008A590000B4315B__000000144723F489-00000014E6D3F501 000000067F0000400200008A590000B3C000-000000067F0000400200008A590000B40000__0000001C760FA190 000000067F0000400200008A590000B3C000-000000067F0000400200008A590000B40000__00000038E67ABFA0 000000067F0000400200008A590000B3C000-000000067F0000400200008A590000B40000__0000003903F1CFE8 000000067F0000400200008A590000B3C000-000000067F0000400200008A590000B40000__0000003B99F7F8A0 000000067F0000400200008A590000B3C000-000000067F0000400200008A590000B40000__0000005D2FFFFB38 000000067F0000400200008A590000B3C000-000000067F0000400200008A590000B40000__00000073AD3FE6B8 000000067F0000400200008A590000B3C000-000000067F0000400200008A590000B40000__000000914E3F38F0 000000067F0000400200008A590000B3C000-000000067F0000400200008A590000B40000__000000931B33AE68 000000067F0000400200008A590000B3C000-000000067F0000400200008A590000B40000__000000931B9AFDF8 000000067F0000400200008A590000B40000-000000067F0000400200008A590000B44000__0000001C760FA190 000000067F0000400200008A590000B40000-000000067F0000400200008A590000B44000__00000038E67ABFA0 000000067F0000400200008A590000B40000-000000067F0000400200008A590000B44000__0000003903F1CFE8 000000067F0000400200008A590000B40000-000000067F0000400200008A590000B44000__0000003B99F7F8A0 000000067F0000400200008A590000B40000-000000067F0000400200008A590000B44000__0000005D2FFFFB38 000000067F0000400200008A590000B40000-000000067F0000400200008A590000B44000__00000073AD3FE6B8 000000067F0000400200008A590000B40000-000000067F0000400200008A590000B44000__000000914E3F38F0 
000000067F0000400200008A590000B40000-000000067F0000400200008A590000B44000__000000931B33AE68 000000067F0000400200008A590000B40000-000000067F0000400200008A590000B44000__000000931B9AFDF8 000000067F0000400200008A590000B4315B-000000067F0000400200008A590000B4BB2C__000000144723F489-00000014E6D3F501 000000067F0000400200008A590000B44000-000000067F0000400200008A590000B48000__0000001C760FA190 000000067F0000400200008A590000B44000-000000067F0000400200008A590000B48000__00000038E67ABFA0 000000067F0000400200008A590000B44000-000000067F0000400200008A590000B48000__0000003903F1CFE8 000000067F0000400200008A590000B44000-000000067F0000400200008A590000B48000__0000003B99F7F8A0 000000067F0000400200008A590000B44000-000000067F0000400200008A590000B48000__0000005D2FFFFB38 000000067F0000400200008A590000B44000-000000067F0000400200008A590000B48000__00000073AD3FE6B8 000000067F0000400200008A590000B44000-000000067F0000400200008A590000B48000__000000914E3F38F0 000000067F0000400200008A590000B44000-000000067F0000400200008A590000B48000__000000931B33AE68 000000067F0000400200008A590000B44000-000000067F0000400200008A590000B48000__000000931B9AFDF8 000000067F0000400200008A590000B48000-000000067F0000400200008A590000B4C000__0000001C725A2400 000000067F0000400200008A590000B48000-000000067F0000400200008A590000B4C000__0000001C760FA190 000000067F0000400200008A590000B48000-000000067F0000400200008A590000B4C000__00000038E67ABFA0 000000067F0000400200008A590000B48000-000000067F0000400200008A590000B4C000__0000003903F1CFE8 000000067F0000400200008A590000B48000-000000067F0000400200008A590000B4C000__0000003B99F7F8A0 000000067F0000400200008A590000B48000-000000067F0000400200008A590000B4C000__0000005D2FFFFB38 000000067F0000400200008A590000B48000-000000067F0000400200008A590000B4C000__00000073AD3FE6B8 000000067F0000400200008A590000B48000-000000067F0000400200008A590000B4C000__000000914E3F38F0 000000067F0000400200008A590000B48000-000000067F0000400200008A590000B4C000__000000931B33AE68 
000000067F0000400200008A590000B48000-000000067F0000400200008A590000B4C000__000000931B9AFDF8 000000067F0000400200008A590000B4BB2C-000000067F0000400200008A590100000000__000000144723F489-00000014E6D3F501 000000067F0000400200008A590000B4BDF1-000000067F0000400200008A590000B547DD__00000014E6D3F501-00000015967BE3A1 000000067F0000400200008A590000B4C000-000000067F0000400200008A590000B50000__0000001C725A2400 000000067F0000400200008A590000B4C000-000000067F0000400200008A590000B50000__0000001C760FA190 000000067F0000400200008A590000B4C000-000000067F0000400200008A590000B50000__00000038E67ABFA0 000000067F0000400200008A590000B4C000-000000067F0000400200008A590000B50000__0000003903F1CFE8 000000067F0000400200008A590000B4C000-000000067F0000400200008A590000B50000__0000003B99F7F8A0 000000067F0000400200008A590000B4C000-000000067F0000400200008A590000B50000__0000005D2FFFFB38 000000067F0000400200008A590000B4C000-000000067F0000400200008A590000B50000__00000073AD3FE6B8 000000067F0000400200008A590000B4C000-000000067F0000400200008A590000B50000__000000914E3F38F0 000000067F0000400200008A590000B4C000-000000067F0000400200008A590000B50000__000000931B33AE68 000000067F0000400200008A590000B4C000-000000067F0000400200008A590000B50000__000000931B9AFDF8 000000067F0000400200008A590000B50000-000000067F0000400200008A590000B54000__0000001C725A2400 000000067F0000400200008A590000B50000-000000067F0000400200008A590000B54000__0000001C760FA190 000000067F0000400200008A590000B50000-000000067F0000400200008A590000B54000__00000038E67ABFA0 000000067F0000400200008A590000B50000-000000067F0000400200008A590000B54000__0000003903F1CFE8 000000067F0000400200008A590000B50000-000000067F0000400200008A590000B54000__0000003B99F7F8A0 000000067F0000400200008A590000B50000-000000067F0000400200008A590000B54000__0000005D2FFFFB38 000000067F0000400200008A590000B50000-000000067F0000400200008A590000B54000__00000073AD3FE6B8 000000067F0000400200008A590000B50000-000000067F0000400200008A590000B54000__000000914E3F38F0 
000000067F0000400200008A590000B50000-000000067F0000400200008A590000B54000__000000931B33AE68 000000067F0000400200008A590000B50000-000000067F0000400200008A590000B54000__000000931B9AFDF8 000000067F0000400200008A590000B54000-000000067F0000400200008A590000B58000__0000001C725A2400 000000067F0000400200008A590000B54000-000000067F0000400200008A590000B58000__0000001C760FA190 000000067F0000400200008A590000B54000-000000067F0000400200008A590000B58000__00000038E67ABFA0 000000067F0000400200008A590000B54000-000000067F0000400200008A590000B58000__0000003903F1CFE8 000000067F0000400200008A590000B54000-000000067F0000400200008A590000B58000__0000003B99F7F8A0 000000067F0000400200008A590000B54000-000000067F0000400200008A590000B58000__0000005D2FFFFB38 000000067F0000400200008A590000B54000-000000067F0000400200008A590000B58000__00000073AD3FE6B8 000000067F0000400200008A590000B54000-000000067F0000400200008A590000B58000__000000914E3F38F0 000000067F0000400200008A590000B54000-000000067F0000400200008A590000B58000__000000931B33AE68 000000067F0000400200008A590000B54000-000000067F0000400200008A590000B58000__000000931B9AFDF8 000000067F0000400200008A590000B547DD-000000067F0000400200008A590000B5D1BB__00000014E6D3F501-00000015967BE3A1 000000067F0000400200008A590000B58000-000000067F0000400200008A590000B5C000__0000001C725A2400 000000067F0000400200008A590000B58000-000000067F0000400200008A590000B5C000__0000001C760FA190 000000067F0000400200008A590000B58000-000000067F0000400200008A590000B5C000__00000038E67ABFA0 000000067F0000400200008A590000B58000-000000067F0000400200008A590000B5C000__0000003903F1CFE8 000000067F0000400200008A590000B58000-000000067F0000400200008A590000B5C000__0000003B99F7F8A0 000000067F0000400200008A590000B58000-000000067F0000400200008A590000B5C000__0000005D2FFFFB38 000000067F0000400200008A590000B58000-000000067F0000400200008A590000B5C000__00000073AD3FE6B8 000000067F0000400200008A590000B58000-000000067F0000400200008A590000B5C000__000000914E3F38F0 
000000067F0000400200008A590000B58000-000000067F0000400200008A590000B5C000__000000931B33AE68 000000067F0000400200008A590000B58000-000000067F0000400200008A590000B5C000__000000931B9AFDF8 000000067F0000400200008A590000B5C000-000000067F0000400200008A590000B60000__0000001C725A2400 000000067F0000400200008A590000B5C000-000000067F0000400200008A590000B60000__0000001C760FA190 000000067F0000400200008A590000B5C000-000000067F0000400200008A590000B60000__00000038E67ABFA0 000000067F0000400200008A590000B5C000-000000067F0000400200008A590000B60000__0000003903F1CFE8 000000067F0000400200008A590000B5C000-000000067F0000400200008A590000B60000__0000003B99F7F8A0 000000067F0000400200008A590000B5C000-000000067F0000400200008A590000B60000__0000005D2FFFFB38 000000067F0000400200008A590000B5C000-000000067F0000400200008A590000B60000__00000073AD3FE6B8 000000067F0000400200008A590000B5C000-000000067F0000400200008A590000B60000__000000914E3F38F0 000000067F0000400200008A590000B5C000-000000067F0000400200008A590000B60000__000000931B33AE68 000000067F0000400200008A590000B5C000-000000067F0000400200008A590000B60000__000000931B9AFDF8 000000067F0000400200008A590000B5D1BB-000000067F0000400200008A590000B65BA4__00000014E6D3F501-00000015967BE3A1 000000067F0000400200008A590000B60000-000000067F0000400200008A590000B64000__0000001C725A2400 000000067F0000400200008A590000B60000-000000067F0000400200008A590000B64000__0000001C760FA190 000000067F0000400200008A590000B60000-000000067F0000400200008A590000B64000__00000038E67ABFA0 000000067F0000400200008A590000B60000-000000067F0000400200008A590000B64000__0000003903F1CFE8 000000067F0000400200008A590000B60000-000000067F0000400200008A590000B64000__0000003B99F7F8A0 000000067F0000400200008A590000B60000-000000067F0000400200008A590000B64000__0000005D2FFFFB38 000000067F0000400200008A590000B60000-000000067F0000400200008A590000B64000__00000073AD3FE6B8 000000067F0000400200008A590000B60000-000000067F0000400200008A590000B64000__000000914E3F38F0 
000000067F0000400200008A590000B60000-000000067F0000400200008A590000B64000__000000931B33AE68 000000067F0000400200008A590000B60000-000000067F0000400200008A590000B64000__000000931B9AFDF8 000000067F0000400200008A590000B64000-000000067F0000400200008A590000B68000__0000001C725A2400 000000067F0000400200008A590000B64000-000000067F0000400200008A590000B68000__0000001C760FA190 000000067F0000400200008A590000B64000-000000067F0000400200008A590000B68000__00000038E67ABFA0 000000067F0000400200008A590000B64000-000000067F0000400200008A590000B68000__0000003903F1CFE8 000000067F0000400200008A590000B64000-000000067F0000400200008A590000B68000__0000003B99F7F8A0 000000067F0000400200008A590000B64000-000000067F0000400200008A590000B68000__0000005D2FFFFB38 000000067F0000400200008A590000B64000-000000067F0000400200008A590000B68000__00000073AD3FE6B8 000000067F0000400200008A590000B64000-000000067F0000400200008A590000B68000__000000914E3F38F0 000000067F0000400200008A590000B64000-000000067F0000400200008A590000B68000__000000931B33AE68 000000067F0000400200008A590000B64000-000000067F0000400200008A590000B68000__000000931B9AFDF8 000000067F0000400200008A590000B65BA4-000000067F0000400200008A590000B6E588__00000014E6D3F501-00000015967BE3A1 000000067F0000400200008A590000B68000-000000067F0000400200008A590000B6C000__0000001C725A2400 000000067F0000400200008A590000B68000-000000067F0000400200008A590000B6C000__0000001C760FA190 000000067F0000400200008A590000B68000-000000067F0000400200008A590000B6C000__00000038E67ABFA0 000000067F0000400200008A590000B68000-000000067F0000400200008A590000B6C000__0000003903F1CFE8 000000067F0000400200008A590000B68000-000000067F0000400200008A590000B6C000__0000003B99F7F8A0 000000067F0000400200008A590000B68000-000000067F0000400200008A590000B6C000__0000005D2FFFFB38 000000067F0000400200008A590000B68000-000000067F0000400200008A590000B6C000__00000073AD3FE6B8 000000067F0000400200008A590000B68000-000000067F0000400200008A590000B6C000__000000914E3F38F0 
000000067F0000400200008A590000B68000-000000067F0000400200008A590000B6C000__000000931B33AE68 000000067F0000400200008A590000B68000-000000067F0000400200008A590000B6C000__000000931B9AFDF8 000000067F0000400200008A590000B6C000-000000067F0000400200008A590000B70000__0000001C725A2400 000000067F0000400200008A590000B6C000-000000067F0000400200008A590000B70000__0000001C760FA190 000000067F0000400200008A590000B6C000-000000067F0000400200008A590000B70000__00000038E67ABFA0 000000067F0000400200008A590000B6C000-000000067F0000400200008A590000B70000__0000003903F1CFE8 000000067F0000400200008A590000B6C000-000000067F0000400200008A590000B70000__0000003B99F7F8A0 000000067F0000400200008A590000B6C000-000000067F0000400200008A590000B70000__0000005D2FFFFB38 000000067F0000400200008A590000B6C000-000000067F0000400200008A590000B70000__00000073AD3FE6B8 000000067F0000400200008A590000B6C000-000000067F0000400200008A590000B70000__000000914E3F38F0 000000067F0000400200008A590000B6C000-000000067F0000400200008A590000B70000__000000931B33AE68 000000067F0000400200008A590000B6C000-000000067F0000400200008A590000B70000__000000931B9AFDF8 000000067F0000400200008A590000B6E588-000000067F0000400200008A590000B76F5E__00000014E6D3F501-00000015967BE3A1 000000067F0000400200008A590000B70000-000000067F0000400200008A590000B74000__0000001C725A2400 000000067F0000400200008A590000B70000-000000067F0000400200008A590000B74000__0000001C760FA190 000000067F0000400200008A590000B70000-000000067F0000400200008A590000B74000__00000038E67ABFA0 000000067F0000400200008A590000B70000-000000067F0000400200008A590000B74000__0000003903F1CFE8 000000067F0000400200008A590000B70000-000000067F0000400200008A590000B74000__0000003B99F7F8A0 000000067F0000400200008A590000B70000-000000067F0000400200008A590000B74000__0000005D2FFFFB38 000000067F0000400200008A590000B70000-000000067F0000400200008A590000B74000__00000073AD3FE6B8 000000067F0000400200008A590000B70000-000000067F0000400200008A590000B74000__000000914E3F38F0 
000000067F0000400200008A590000B70000-000000067F0000400200008A590000B74000__000000931B33AE68 000000067F0000400200008A590000B70000-000000067F0000400200008A590000B74000__000000931B9AFDF8 000000067F0000400200008A590000B74000-000000067F0000400200008A590000B78000__0000001C725A2400 000000067F0000400200008A590000B74000-000000067F0000400200008A590000B78000__0000001C760FA190 000000067F0000400200008A590000B74000-000000067F0000400200008A590000B78000__00000038E67ABFA0 000000067F0000400200008A590000B74000-000000067F0000400200008A590000B78000__0000003903F1CFE8 000000067F0000400200008A590000B74000-000000067F0000400200008A590000B78000__0000003B99F7F8A0 000000067F0000400200008A590000B74000-000000067F0000400200008A590000B78000__0000005D2FFFFB38 000000067F0000400200008A590000B74000-000000067F0000400200008A590000B78000__00000073AD3FE6B8 000000067F0000400200008A590000B74000-000000067F0000400200008A590000B78000__000000914E3F38F0 000000067F0000400200008A590000B74000-000000067F0000400200008A590000B78000__000000931B33AE68 000000067F0000400200008A590000B74000-000000067F0000400200008A590000B78000__000000931B9AFDF8 000000067F0000400200008A590000B76F5E-000000067F0000400200008A590000B7F935__00000014E6D3F501-00000015967BE3A1 000000067F0000400200008A590000B78000-000000067F0000400200008A590000B7C000__0000001C725A2400 000000067F0000400200008A590000B78000-000000067F0000400200008A590000B7C000__0000001C760FA190 000000067F0000400200008A590000B78000-000000067F0000400200008A590000B7C000__00000038E67ABFA0 000000067F0000400200008A590000B78000-000000067F0000400200008A590000B7C000__0000003903F1CFE8 000000067F0000400200008A590000B78000-000000067F0000400200008A590000B7C000__0000003B99F7F8A0 000000067F0000400200008A590000B78000-000000067F0000400200008A590000B7C000__0000005D2FFFFB38 000000067F0000400200008A590000B78000-000000067F0000400200008A590000B7C000__00000073AD3FE6B8 000000067F0000400200008A590000B78000-000000067F0000400200008A590000B7C000__000000914E3F38F0 
000000067F0000400200008A590000B78000-000000067F0000400200008A590000B7C000__000000931B33AE68 000000067F0000400200008A590000B78000-000000067F0000400200008A590000B7C000__000000931B9AFDF8 000000067F0000400200008A590000B7C000-000000067F0000400200008A590000B80000__0000001C725A2400 000000067F0000400200008A590000B7C000-000000067F0000400200008A590000B80000__0000001C760FA190 000000067F0000400200008A590000B7C000-000000067F0000400200008A590000B80000__00000038E67ABFA0 000000067F0000400200008A590000B7C000-000000067F0000400200008A590000B80000__0000003903F1CFE8 000000067F0000400200008A590000B7C000-000000067F0000400200008A590000B80000__0000003B99F7F8A0 000000067F0000400200008A590000B7C000-000000067F0000400200008A590000B80000__0000005D2FFFFB38 000000067F0000400200008A590000B7C000-000000067F0000400200008A590000B80000__00000073AD3FE6B8 000000067F0000400200008A590000B7C000-000000067F0000400200008A590000B80000__000000914E3F38F0 000000067F0000400200008A590000B7C000-000000067F0000400200008A590000B80000__000000931B33AE68 000000067F0000400200008A590000B7C000-000000067F0000400200008A590000B80000__000000931B9AFDF8 000000067F0000400200008A590000B7F935-000000067F0000400200008A590000B8830D__00000014E6D3F501-00000015967BE3A1 000000067F0000400200008A590000B80000-000000067F0000400200008A590000B84000__0000001C725A2400 000000067F0000400200008A590000B80000-000000067F0000400200008A590000B84000__0000001C760FA190 000000067F0000400200008A590000B80000-000000067F0000400200008A590000B84000__00000038E67ABFA0 000000067F0000400200008A590000B80000-000000067F0000400200008A590000B84000__0000003903F1CFE8 000000067F0000400200008A590000B80000-000000067F0000400200008A590000B84000__0000003B99F7F8A0 000000067F0000400200008A590000B80000-000000067F0000400200008A590000B84000__0000005D2FFFFB38 000000067F0000400200008A590000B80000-000000067F0000400200008A590000B84000__00000073AD3FE6B8 000000067F0000400200008A590000B80000-000000067F0000400200008A590000B84000__000000914E3F38F0 
000000067F0000400200008A590000B80000-000000067F0000400200008A590000B84000__000000931B33AE68 000000067F0000400200008A590000B80000-000000067F0000400200008A590000B84000__000000931B9AFDF8 000000067F0000400200008A590000B84000-000000067F0000400200008A590000B88000__0000001C725A2400 000000067F0000400200008A590000B84000-000000067F0000400200008A590000B88000__0000001C760FA190 000000067F0000400200008A590000B84000-000000067F0000400200008A590000B88000__00000038E67ABFA0 000000067F0000400200008A590000B84000-000000067F0000400200008A590000B88000__0000003903F1CFE8 000000067F0000400200008A590000B84000-000000067F0000400200008A590000B88000__0000003B99F7F8A0 000000067F0000400200008A590000B84000-000000067F0000400200008A590000B88000__0000005D2FFFFB38 000000067F0000400200008A590000B84000-000000067F0000400200008A590000B88000__00000073AD3FE6B8 000000067F0000400200008A590000B84000-000000067F0000400200008A590000B88000__000000914E3F38F0 000000067F0000400200008A590000B84000-000000067F0000400200008A590000B88000__000000931B33AE68 000000067F0000400200008A590000B84000-000000067F0000400200008A590000B88000__000000931B9AFDF8 000000067F0000400200008A590000B88000-000000067F0000400200008A590000B8C000__0000001C725A2400 000000067F0000400200008A590000B88000-000000067F0000400200008A590000B8C000__0000001C760FA190 000000067F0000400200008A590000B88000-000000067F0000400200008A590000B8C000__00000038E67ABFA0 000000067F0000400200008A590000B88000-000000067F0000400200008A590000B8C000__0000003903F1CFE8 000000067F0000400200008A590000B88000-000000067F0000400200008A590000B8C000__0000003B99F7F8A0 000000067F0000400200008A590000B88000-000000067F0000400200008A590000B8C000__0000005D2FFFFB38 000000067F0000400200008A590000B88000-000000067F0000400200008A590000B8C000__00000073AD3FE6B8 000000067F0000400200008A590000B88000-000000067F0000400200008A590000B8C000__000000914E3F38F0 000000067F0000400200008A590000B88000-000000067F0000400200008A590000B8C000__000000931B33AE68 
000000067F0000400200008A590000B88000-000000067F0000400200008A590000B8C000__000000931B9AFDF8 000000067F0000400200008A590000B8830D-000000067F0000400200008A590000B90CE8__00000014E6D3F501-00000015967BE3A1 000000067F0000400200008A590000B8C000-000000067F0000400200008A590000B90000__0000001C725A2400 000000067F0000400200008A590000B8C000-000000067F0000400200008A590000B90000__0000001C760FA190 000000067F0000400200008A590000B8C000-000000067F0000400200008A590000B90000__00000038E67ABFA0 000000067F0000400200008A590000B8C000-000000067F0000400200008A590000B90000__0000003903F1CFE8 000000067F0000400200008A590000B8C000-000000067F0000400200008A590000B90000__0000003B99F7F8A0 000000067F0000400200008A590000B8C000-000000067F0000400200008A590000B90000__0000005D2FFFFB38 000000067F0000400200008A590000B8C000-000000067F0000400200008A590000B90000__00000073AD3FE6B8 000000067F0000400200008A590000B8C000-000000067F0000400200008A590000B90000__000000914E3F38F0 000000067F0000400200008A590000B8C000-000000067F0000400200008A590000B90000__000000931B33AE68 000000067F0000400200008A590000B8C000-000000067F0000400200008A590000B90000__000000931B9AFDF8 000000067F0000400200008A590000B90000-000000067F0000400200008A590000B94000__0000001C725A2400 000000067F0000400200008A590000B90000-000000067F0000400200008A590000B94000__0000001C760FA190 000000067F0000400200008A590000B90000-000000067F0000400200008A590000B94000__00000038E67ABFA0 000000067F0000400200008A590000B90000-000000067F0000400200008A590000B94000__0000003903F1CFE8 000000067F0000400200008A590000B90000-000000067F0000400200008A590000B94000__0000003B99F7F8A0 000000067F0000400200008A590000B90000-000000067F0000400200008A590000B94000__0000005D2FFFFB38 000000067F0000400200008A590000B90000-000000067F0000400200008A590000B94000__00000073AD3FE6B8 000000067F0000400200008A590000B90000-000000067F0000400200008A590000B94000__000000914E3F38F0 000000067F0000400200008A590000B90000-000000067F0000400200008A590000B94000__000000931B33AE68 
000000067F0000400200008A590000B90000-000000067F0000400200008A590000B94000__000000931B9AFDF8 000000067F0000400200008A590000B90CE8-000000067F0000400200008A590000B996CA__00000014E6D3F501-00000015967BE3A1 000000067F0000400200008A590000B94000-000000067F0000400200008A590000B98000__0000001C725A2400 000000067F0000400200008A590000B94000-000000067F0000400200008A590000B98000__0000001C760FA190 000000067F0000400200008A590000B94000-000000067F0000400200008A590000B98000__00000038E67ABFA0 000000067F0000400200008A590000B94000-000000067F0000400200008A590000B98000__0000003903F1CFE8 000000067F0000400200008A590000B94000-000000067F0000400200008A590000B98000__0000003B99F7F8A0 000000067F0000400200008A590000B94000-000000067F0000400200008A590000B98000__0000005D2FFFFB38 000000067F0000400200008A590000B94000-000000067F0000400200008A590000B98000__00000073AD3FE6B8 000000067F0000400200008A590000B94000-000000067F0000400200008A590000B98000__000000914E3F38F0 000000067F0000400200008A590000B94000-000000067F0000400200008A590000B98000__000000931B33AE68 000000067F0000400200008A590000B94000-000000067F0000400200008A590000B98000__000000931B9AFDF8 000000067F0000400200008A590000B98000-000000067F0000400200008A590000B9C000__0000001C725A2400 000000067F0000400200008A590000B98000-000000067F0000400200008A590000B9C000__0000001C760FA190 000000067F0000400200008A590000B98000-000000067F0000400200008A590000B9C000__00000038E67ABFA0 000000067F0000400200008A590000B98000-000000067F0000400200008A590000B9C000__0000003903F1CFE8 000000067F0000400200008A590000B98000-000000067F0000400200008A590000B9C000__0000003B99F7F8A0 000000067F0000400200008A590000B98000-000000067F0000400200008A590000B9C000__0000005D2FFFFB38 000000067F0000400200008A590000B98000-000000067F0000400200008A590000B9C000__00000073AD3FE6B8 000000067F0000400200008A590000B98000-000000067F0000400200008A590000B9C000__000000914E3F38F0 000000067F0000400200008A590000B98000-000000067F0000400200008A590000B9C000__000000931B33AE68 
000000067F0000400200008A590000B98000-000000067F0000400200008A590000B9C000__000000931B9AFDF8 000000067F0000400200008A590000B996CA-000000067F0000400200008A590000BA20AB__00000014E6D3F501-00000015967BE3A1 000000067F0000400200008A590000B9C000-000000067F0000400200008A590000BA0000__0000001C725A2400 000000067F0000400200008A590000B9C000-000000067F0000400200008A590000BA0000__0000001C760FA190 000000067F0000400200008A590000B9C000-000000067F0000400200008A590000BA0000__00000038E67ABFA0 000000067F0000400200008A590000B9C000-000000067F0000400200008A590000BA0000__0000003903F1CFE8 000000067F0000400200008A590000B9C000-000000067F0000400200008A590000BA0000__0000003B99F7F8A0 000000067F0000400200008A590000B9C000-000000067F0000400200008A590000BA0000__0000005D2FFFFB38 000000067F0000400200008A590000B9C000-000000067F0000400200008A590000BA0000__00000073AD3FE6B8 000000067F0000400200008A590000B9C000-000000067F0000400200008A590000BA0000__000000914E3F38F0 000000067F0000400200008A590000B9C000-000000067F0000400200008A590000BA0000__000000931B33AE68 000000067F0000400200008A590000B9C000-000000067F0000400200008A590000BA0000__000000931B9AFDF8 000000067F0000400200008A590000BA0000-000000067F0000400200008A590000BA4000__0000001C725A2400 000000067F0000400200008A590000BA0000-000000067F0000400200008A590000BA4000__0000001C760FA190 000000067F0000400200008A590000BA0000-000000067F0000400200008A590000BA4000__00000038E67ABFA0 000000067F0000400200008A590000BA0000-000000067F0000400200008A590000BA4000__0000003903F1CFE8 000000067F0000400200008A590000BA0000-000000067F0000400200008A590000BA4000__0000003B99F7F8A0 000000067F0000400200008A590000BA0000-000000067F0000400200008A590000BA4000__0000005D2FFFFB38 000000067F0000400200008A590000BA0000-000000067F0000400200008A590000BA4000__00000073AD3FE6B8 000000067F0000400200008A590000BA0000-000000067F0000400200008A590000BA4000__000000914E3F38F0 000000067F0000400200008A590000BA0000-000000067F0000400200008A590000BA4000__000000931B33AE68 
000000067F0000400200008A590000BA0000-000000067F0000400200008A590000BA4000__000000931B9AFDF8 000000067F0000400200008A590000BA20AB-000000067F0000400200008A590000BAAAA5__00000014E6D3F501-00000015967BE3A1 000000067F0000400200008A590000BA4000-000000067F0000400200008A590000BA8000__0000001C725A2400 000000067F0000400200008A590000BA4000-000000067F0000400200008A590000BA8000__0000001C760FA190 000000067F0000400200008A590000BA4000-000000067F0000400200008A590000BA8000__00000038E67ABFA0 000000067F0000400200008A590000BA4000-000000067F0000400200008A590000BA8000__0000003903F1CFE8 000000067F0000400200008A590000BA4000-000000067F0000400200008A590000BA8000__0000003B99F7F8A0 000000067F0000400200008A590000BA4000-000000067F0000400200008A590000BA8000__0000005D2FFFFB38 000000067F0000400200008A590000BA4000-000000067F0000400200008A590000BA8000__00000073AD3FE6B8 000000067F0000400200008A590000BA4000-000000067F0000400200008A590000BA8000__000000914E3F38F0 000000067F0000400200008A590000BA4000-000000067F0000400200008A590000BA8000__000000931B33AE68 000000067F0000400200008A590000BA4000-000000067F0000400200008A590000BA8000__000000931B9AFDF8 000000067F0000400200008A590000BA8000-000000067F0000400200008A590000BAC000__00000016661DE360 000000067F0000400200008A590000BA8000-000000067F0000400200008A590000BAC000__0000001C760FA190 000000067F0000400200008A590000BA8000-000000067F0000400200008A590000BAC000__00000038E67ABFA0 000000067F0000400200008A590000BA8000-000000067F0000400200008A590000BAC000__0000003903F1CFE8 000000067F0000400200008A590000BA8000-000000067F0000400200008A590000BAC000__0000003B99F7F8A0 000000067F0000400200008A590000BA8000-000000067F0000400200008A590000BAC000__0000005D2FFFFB38 000000067F0000400200008A590000BA8000-000000067F0000400200008A590000BAC000__00000073AD3FE6B8 000000067F0000400200008A590000BA8000-000000067F0000400200008A590000BAC000__000000914E3F38F0 000000067F0000400200008A590000BA8000-000000067F0000400200008A590000BAC000__000000931B33AE68 
000000067F0000400200008A590000BA8000-000000067F0000400200008A590000BAC000__000000931B9AFDF8 000000067F0000400200008A590000BAAAA5-000000067F0000400200008A590100000000__00000014E6D3F501-00000015967BE3A1 000000067F0000400200008A590000BAAD99-000000067F0000400200008A590000BB3774__00000015967BE3A1-00000016362BE8F9 000000067F0000400200008A590000BAC000-000000067F0000400200008A590000BB0000__00000016661DE360 000000067F0000400200008A590000BAC000-000000067F0000400200008A590000BB0000__0000001C760FA190 000000067F0000400200008A590000BAC000-000000067F0000400200008A590000BB0000__00000038E67ABFA0 000000067F0000400200008A590000BAC000-000000067F0000400200008A590000BB0000__0000003903F1CFE8 000000067F0000400200008A590000BAC000-000000067F0000400200008A590000BB0000__0000003B99F7F8A0 000000067F0000400200008A590000BAC000-000000067F0000400200008A590000BB0000__0000005D2FFFFB38 000000067F0000400200008A590000BAC000-000000067F0000400200008A590000BB0000__00000073AD3FE6B8 000000067F0000400200008A590000BAC000-000000067F0000400200008A590000BB0000__000000914E3F38F0 000000067F0000400200008A590000BAC000-000000067F0000400200008A590000BB0000__000000931B33AE68 000000067F0000400200008A590000BAC000-000000067F0000400200008A590000BB0000__000000931B9AFDF8 000000067F0000400200008A590000BB0000-000000067F0000400200008A590000BB4000__00000016661DE360 000000067F0000400200008A590000BB0000-000000067F0000400200008A590000BB4000__0000001C760FA190 000000067F0000400200008A590000BB0000-000000067F0000400200008A590000BB4000__00000038E67ABFA0 000000067F0000400200008A590000BB0000-000000067F0000400200008A590000BB4000__0000003903F1CFE8 000000067F0000400200008A590000BB0000-000000067F0000400200008A590000BB4000__0000003B99F7F8A0 000000067F0000400200008A590000BB0000-000000067F0000400200008A590000BB4000__0000005D2FFFFB38 000000067F0000400200008A590000BB0000-000000067F0000400200008A590000BB4000__00000073AD3FE6B8 000000067F0000400200008A590000BB0000-000000067F0000400200008A590000BB4000__000000914E3F38F0 
000000067F0000400200008A590000BB0000-000000067F0000400200008A590000BB4000__000000931B33AE68 000000067F0000400200008A590000BB0000-000000067F0000400200008A590000BB4000__000000931B9AFDF8 000000067F0000400200008A590000BB3774-000000067F0000400200008A590000BBC149__00000015967BE3A1-00000016362BE8F9 000000067F0000400200008A590000BB4000-000000067F0000400200008A590000BB8000__00000016661DE360 000000067F0000400200008A590000BB4000-000000067F0000400200008A590000BB8000__0000001C760FA190 000000067F0000400200008A590000BB4000-000000067F0000400200008A590000BB8000__00000038E67ABFA0 000000067F0000400200008A590000BB4000-000000067F0000400200008A590000BB8000__0000003903F1CFE8 000000067F0000400200008A590000BB4000-000000067F0000400200008A590000BB8000__0000003B99F7F8A0 000000067F0000400200008A590000BB4000-000000067F0000400200008A590000BB8000__0000005D2FFFFB38 000000067F0000400200008A590000BB4000-000000067F0000400200008A590000BB8000__00000073AD3FE6B8 000000067F0000400200008A590000BB4000-000000067F0000400200008A590000BB8000__000000914E3F38F0 000000067F0000400200008A590000BB4000-000000067F0000400200008A590000BB8000__000000931B33AE68 000000067F0000400200008A590000BB4000-000000067F0000400200008A590000BB8000__000000931B9AFDF8 000000067F0000400200008A590000BB8000-000000067F0000400200008A590000BBC000__00000016661DE360 000000067F0000400200008A590000BB8000-000000067F0000400200008A590000BBC000__0000001C760FA190 000000067F0000400200008A590000BB8000-000000067F0000400200008A590000BBC000__00000038E67ABFA0 000000067F0000400200008A590000BB8000-000000067F0000400200008A590000BBC000__0000003903F1CFE8 000000067F0000400200008A590000BB8000-000000067F0000400200008A590000BBC000__0000003B99F7F8A0 000000067F0000400200008A590000BB8000-000000067F0000400200008A590000BBC000__0000005D2FFFFB38 000000067F0000400200008A590000BB8000-000000067F0000400200008A590000BBC000__00000073AD3FE6B8 000000067F0000400200008A590000BB8000-000000067F0000400200008A590000BBC000__000000914E3F38F0 
000000067F0000400200008A590000BB8000-000000067F0000400200008A590000BBC000__000000931B33AE68 000000067F0000400200008A590000BB8000-000000067F0000400200008A590000BBC000__000000931B9AFDF8 000000067F0000400200008A590000BBC000-000000067F0000400200008A590000BC0000__00000016661DE360 000000067F0000400200008A590000BBC000-000000067F0000400200008A590000BC0000__0000001C760FA190 000000067F0000400200008A590000BBC000-000000067F0000400200008A590000BC0000__00000038E67ABFA0 000000067F0000400200008A590000BBC000-000000067F0000400200008A590000BC0000__0000003903F1CFE8 000000067F0000400200008A590000BBC000-000000067F0000400200008A590000BC0000__0000003B99F7F8A0 000000067F0000400200008A590000BBC000-000000067F0000400200008A590000BC0000__0000005D2FFFFB38 000000067F0000400200008A590000BBC000-000000067F0000400200008A590000BC0000__00000073AD3FE6B8 000000067F0000400200008A590000BBC000-000000067F0000400200008A590000BC0000__000000914E3F38F0 000000067F0000400200008A590000BBC000-000000067F0000400200008A590000BC0000__000000931B33AE68 000000067F0000400200008A590000BBC000-000000067F0000400200008A590000BC0000__000000931B9AFDF8 000000067F0000400200008A590000BBC149-000000067F0000400200008A590000BC4B1C__00000015967BE3A1-00000016362BE8F9 000000067F0000400200008A590000BC0000-000000067F0000400200008A590000BC4000__00000016661DE360 000000067F0000400200008A590000BC0000-000000067F0000400200008A590000BC4000__0000001C760FA190 000000067F0000400200008A590000BC0000-000000067F0000400200008A590000BC4000__00000038E67ABFA0 000000067F0000400200008A590000BC0000-000000067F0000400200008A590000BC4000__0000003903F1CFE8 000000067F0000400200008A590000BC0000-000000067F0000400200008A590000BC4000__0000003B99F7F8A0 000000067F0000400200008A590000BC0000-000000067F0000400200008A590000BC4000__0000005D2FFFFB38 000000067F0000400200008A590000BC0000-000000067F0000400200008A590000BC4000__00000073AD3FE6B8 000000067F0000400200008A590000BC0000-000000067F0000400200008A590000BC4000__000000914E3F38F0 
000000067F0000400200008A590000BC0000-000000067F0000400200008A590000BC4000__000000931B33AE68 000000067F0000400200008A590000BC0000-000000067F0000400200008A590000BC4000__000000931B9AFDF8 000000067F0000400200008A590000BC4000-000000067F0000400200008A590000BC8000__00000016661DE360 000000067F0000400200008A590000BC4000-000000067F0000400200008A590000BC8000__0000001C760FA190 000000067F0000400200008A590000BC4000-000000067F0000400200008A590000BC8000__00000038E67ABFA0 000000067F0000400200008A590000BC4000-000000067F0000400200008A590000BC8000__0000003903F1CFE8 000000067F0000400200008A590000BC4000-000000067F0000400200008A590000BC8000__0000003B99F7F8A0 000000067F0000400200008A590000BC4000-000000067F0000400200008A590000BC8000__0000005D2FFFFB38 000000067F0000400200008A590000BC4000-000000067F0000400200008A590000BC8000__00000073AD3FE6B8 000000067F0000400200008A590000BC4000-000000067F0000400200008A590000BC8000__000000914E3F38F0 000000067F0000400200008A590000BC4000-000000067F0000400200008A590000BC8000__000000931B33AE68 000000067F0000400200008A590000BC4000-000000067F0000400200008A590000BC8000__000000931B9AFDF8 000000067F0000400200008A590000BC4B1C-000000067F0000400200008A590000BCD502__00000015967BE3A1-00000016362BE8F9 000000067F0000400200008A590000BC8000-000000067F0000400200008A590000BCC000__00000016661DE360 000000067F0000400200008A590000BC8000-000000067F0000400200008A590000BCC000__0000001C760FA190 000000067F0000400200008A590000BC8000-000000067F0000400200008A590000BCC000__00000038E67ABFA0 000000067F0000400200008A590000BC8000-000000067F0000400200008A590000BCC000__0000003903F1CFE8 000000067F0000400200008A590000BC8000-000000067F0000400200008A590000BCC000__0000003B99F7F8A0 000000067F0000400200008A590000BC8000-000000067F0000400200008A590000BCC000__0000005D2FFFFB38 000000067F0000400200008A590000BC8000-000000067F0000400200008A590000BCC000__00000073AD3FE6B8 000000067F0000400200008A590000BC8000-000000067F0000400200008A590000BCC000__000000914E3F38F0 
000000067F0000400200008A590000BC8000-000000067F0000400200008A590000BCC000__000000931B33AE68 000000067F0000400200008A590000BC8000-000000067F0000400200008A590000BCC000__000000931B9AFDF8 000000067F0000400200008A590000BCC000-000000067F0000400200008A590000BD0000__00000016661DE360 000000067F0000400200008A590000BCC000-000000067F0000400200008A590000BD0000__0000001C760FA190 000000067F0000400200008A590000BCC000-000000067F0000400200008A590000BD0000__00000038E67ABFA0 000000067F0000400200008A590000BCC000-000000067F0000400200008A590000BD0000__0000003903F1CFE8 000000067F0000400200008A590000BCC000-000000067F0000400200008A590000BD0000__0000003B99F7F8A0 000000067F0000400200008A590000BCC000-000000067F0000400200008A590000BD0000__0000005D2FFFFB38 000000067F0000400200008A590000BCC000-000000067F0000400200008A590000BD0000__00000073AD3FE6B8 000000067F0000400200008A590000BCC000-000000067F0000400200008A590000BD0000__000000914E3F38F0 000000067F0000400200008A590000BCC000-000000067F0000400200008A590000BD0000__000000931B33AE68 000000067F0000400200008A590000BCC000-000000067F0000400200008A590000BD0000__000000931B9AFDF8 000000067F0000400200008A590000BCD502-000000067F0000400200008A590000BD5ED4__00000015967BE3A1-00000016362BE8F9 000000067F0000400200008A590000BD0000-000000067F0000400200008A590000BD4000__00000016661DE360 000000067F0000400200008A590000BD0000-000000067F0000400200008A590000BD4000__0000001C760FA190 000000067F0000400200008A590000BD0000-000000067F0000400200008A590000BD4000__00000038E67ABFA0 000000067F0000400200008A590000BD0000-000000067F0000400200008A590000BD4000__0000003903F1CFE8 000000067F0000400200008A590000BD0000-000000067F0000400200008A590000BD4000__0000003B99F7F8A0 000000067F0000400200008A590000BD0000-000000067F0000400200008A590000BD4000__0000005D2FFFFB38 000000067F0000400200008A590000BD0000-000000067F0000400200008A590000BD4000__00000073AD3FE6B8 000000067F0000400200008A590000BD0000-000000067F0000400200008A590000BD4000__000000914E3F38F0 
000000067F0000400200008A590000BD0000-000000067F0000400200008A590000BD4000__000000931B33AE68 000000067F0000400200008A590000BD0000-000000067F0000400200008A590000BD4000__000000931B9AFDF8 000000067F0000400200008A590000BD4000-000000067F0000400200008A590000BD8000__00000016661DE360 000000067F0000400200008A590000BD4000-000000067F0000400200008A590000BD8000__0000001C760FA190 000000067F0000400200008A590000BD4000-000000067F0000400200008A590000BD8000__00000038E67ABFA0 000000067F0000400200008A590000BD4000-000000067F0000400200008A590000BD8000__0000003903F1CFE8 000000067F0000400200008A590000BD4000-000000067F0000400200008A590000BD8000__0000003B99F7F8A0 000000067F0000400200008A590000BD4000-000000067F0000400200008A590000BD8000__0000005D2FFFFB38 000000067F0000400200008A590000BD4000-000000067F0000400200008A590000BD8000__00000073AD3FE6B8 000000067F0000400200008A590000BD4000-000000067F0000400200008A590000BD8000__000000914E3F38F0 000000067F0000400200008A590000BD4000-000000067F0000400200008A590000BD8000__000000931B33AE68 000000067F0000400200008A590000BD4000-000000067F0000400200008A590000BD8000__000000931B9AFDF8 000000067F0000400200008A590000BD5ED4-000000067F0000400200008A590000BDE8AA__00000015967BE3A1-00000016362BE8F9 000000067F0000400200008A590000BD8000-000000067F0000400200008A590000BDC000__00000016661DE360 000000067F0000400200008A590000BD8000-000000067F0000400200008A590000BDC000__0000001C760FA190 000000067F0000400200008A590000BD8000-000000067F0000400200008A590000BDC000__00000038E67ABFA0 000000067F0000400200008A590000BD8000-000000067F0000400200008A590000BDC000__0000003903F1CFE8 000000067F0000400200008A590000BD8000-000000067F0000400200008A590000BDC000__0000003B99F7F8A0 000000067F0000400200008A590000BD8000-000000067F0000400200008A590000BDC000__0000005D2FFFFB38 000000067F0000400200008A590000BD8000-000000067F0000400200008A590000BDC000__00000073AD3FE6B8 000000067F0000400200008A590000BD8000-000000067F0000400200008A590000BDC000__000000914E3F38F0 
000000067F0000400200008A590000BD8000-000000067F0000400200008A590000BDC000__000000931B33AE68 000000067F0000400200008A590000BD8000-000000067F0000400200008A590000BDC000__000000931B9AFDF8 000000067F0000400200008A590000BDC000-000000067F0000400200008A590000BE0000__00000016661DE360 000000067F0000400200008A590000BDC000-000000067F0000400200008A590000BE0000__0000001C760FA190 000000067F0000400200008A590000BDC000-000000067F0000400200008A590000BE0000__00000038E67ABFA0 000000067F0000400200008A590000BDC000-000000067F0000400200008A590000BE0000__0000003903F1CFE8 000000067F0000400200008A590000BDC000-000000067F0000400200008A590000BE0000__0000003B99F7F8A0 000000067F0000400200008A590000BDC000-000000067F0000400200008A590000BE0000__0000005D2FFFFB38 000000067F0000400200008A590000BDC000-000000067F0000400200008A590000BE0000__00000073AD3FE6B8 000000067F0000400200008A590000BDC000-000000067F0000400200008A590000BE0000__000000914E3F38F0 000000067F0000400200008A590000BDC000-000000067F0000400200008A590000BE0000__000000931B33AE68 000000067F0000400200008A590000BDC000-000000067F0000400200008A590000BE0000__000000931B9AFDF8 000000067F0000400200008A590000BDE8AA-000000067F0000400200008A590000BE7291__00000015967BE3A1-00000016362BE8F9 000000067F0000400200008A590000BE0000-000000067F0000400200008A590000BE4000__00000016661DE360 000000067F0000400200008A590000BE0000-000000067F0000400200008A590000BE4000__0000001C760FA190 000000067F0000400200008A590000BE0000-000000067F0000400200008A590000BE4000__00000038E67ABFA0 000000067F0000400200008A590000BE0000-000000067F0000400200008A590000BE4000__0000003903F1CFE8 000000067F0000400200008A590000BE0000-000000067F0000400200008A590000BE4000__0000003B99F7F8A0 000000067F0000400200008A590000BE0000-000000067F0000400200008A590000BE4000__0000005D2FFFFB38 000000067F0000400200008A590000BE0000-000000067F0000400200008A590000BE4000__00000073AD3FE6B8 000000067F0000400200008A590000BE0000-000000067F0000400200008A590000BE4000__000000914E3F38F0 
000000067F0000400200008A590000BE0000-000000067F0000400200008A590000BE4000__000000931B33AE68 000000067F0000400200008A590000BE0000-000000067F0000400200008A590000BE4000__000000931B9AFDF8 000000067F0000400200008A590000BE4000-000000067F0000400200008A590000BE8000__00000016661DE360 000000067F0000400200008A590000BE4000-000000067F0000400200008A590000BE8000__0000001C760FA190 000000067F0000400200008A590000BE4000-000000067F0000400200008A590000BE8000__00000038E67ABFA0 000000067F0000400200008A590000BE4000-000000067F0000400200008A590000BE8000__0000003903F1CFE8 000000067F0000400200008A590000BE4000-000000067F0000400200008A590000BE8000__0000003B99F7F8A0 000000067F0000400200008A590000BE4000-000000067F0000400200008A590000BE8000__0000005D2FFFFB38 000000067F0000400200008A590000BE4000-000000067F0000400200008A590000BE8000__00000073AD3FE6B8 000000067F0000400200008A590000BE4000-000000067F0000400200008A590000BE8000__000000914E3F38F0 000000067F0000400200008A590000BE4000-000000067F0000400200008A590000BE8000__000000931B33AE68 000000067F0000400200008A590000BE4000-000000067F0000400200008A590000BE8000__000000931B9AFDF8 000000067F0000400200008A590000BE7291-000000067F0000400200008A590000BEFC6C__00000015967BE3A1-00000016362BE8F9 000000067F0000400200008A590000BE8000-000000067F0000400200008A590000BEC000__00000016661DE360 000000067F0000400200008A590000BE8000-000000067F0000400200008A590000BEC000__0000001C760FA190 000000067F0000400200008A590000BE8000-000000067F0000400200008A590000BEC000__00000038E67ABFA0 000000067F0000400200008A590000BE8000-000000067F0000400200008A590000BEC000__0000003903F1CFE8 000000067F0000400200008A590000BE8000-000000067F0000400200008A590000BEC000__0000003B99F7F8A0 000000067F0000400200008A590000BE8000-000000067F0000400200008A590000BEC000__0000005D2FFFFB38 000000067F0000400200008A590000BE8000-000000067F0000400200008A590000BEC000__00000073AD3FE6B8 000000067F0000400200008A590000BE8000-000000067F0000400200008A590000BEC000__000000914E3F38F0 
000000067F0000400200008A590000BE8000-000000067F0000400200008A590000BEC000__000000931B33AE68 000000067F0000400200008A590000BE8000-000000067F0000400200008A590000BEC000__000000931B9AFDF8 000000067F0000400200008A590000BEC000-000000067F0000400200008A590000BF0000__00000016661DE360 000000067F0000400200008A590000BEC000-000000067F0000400200008A590000BF0000__0000001C760FA190 000000067F0000400200008A590000BEC000-000000067F0000400200008A590000BF0000__00000038E67ABFA0 000000067F0000400200008A590000BEC000-000000067F0000400200008A590000BF0000__0000003903F1CFE8 000000067F0000400200008A590000BEC000-000000067F0000400200008A590000BF0000__0000003B99F7F8A0 000000067F0000400200008A590000BEC000-000000067F0000400200008A590000BF0000__0000005D2FFFFB38 000000067F0000400200008A590000BEC000-000000067F0000400200008A590000BF0000__00000073AD3FE6B8 000000067F0000400200008A590000BEC000-000000067F0000400200008A590000BF0000__000000914E3F38F0 000000067F0000400200008A590000BEC000-000000067F0000400200008A590000BF0000__000000931B33AE68 000000067F0000400200008A590000BEC000-000000067F0000400200008A590000BF0000__000000931B9AFDF8 000000067F0000400200008A590000BEFC6C-000000067F0000400200008A590000BF8634__00000015967BE3A1-00000016362BE8F9 000000067F0000400200008A590000BF0000-000000067F0000400200008A590000BF4000__00000016661DE360 000000067F0000400200008A590000BF0000-000000067F0000400200008A590000BF4000__0000001C760FA190 000000067F0000400200008A590000BF0000-000000067F0000400200008A590000BF4000__00000038E67ABFA0 000000067F0000400200008A590000BF0000-000000067F0000400200008A590000BF4000__0000003903F1CFE8 000000067F0000400200008A590000BF0000-000000067F0000400200008A590000BF4000__0000003B99F7F8A0 000000067F0000400200008A590000BF0000-000000067F0000400200008A590000BF4000__0000005D2FFFFB38 000000067F0000400200008A590000BF0000-000000067F0000400200008A590000BF4000__00000073AD3FE6B8 000000067F0000400200008A590000BF0000-000000067F0000400200008A590000BF4000__000000914E3F38F0 
000000067F0000400200008A590000BF0000-000000067F0000400200008A590000BF4000__000000931B33AE68 000000067F0000400200008A590000BF0000-000000067F0000400200008A590000BF4000__000000931B9AFDF8 000000067F0000400200008A590000BF4000-000000067F0000400200008A590000BF8000__00000016661DE360 000000067F0000400200008A590000BF4000-000000067F0000400200008A590000BF8000__0000001C760FA190 000000067F0000400200008A590000BF4000-000000067F0000400200008A590000BF8000__00000038E67ABFA0 000000067F0000400200008A590000BF4000-000000067F0000400200008A590000BF8000__0000003903F1CFE8 000000067F0000400200008A590000BF4000-000000067F0000400200008A590000BF8000__0000003B99F7F8A0 000000067F0000400200008A590000BF4000-000000067F0000400200008A590000BF8000__0000005D2FFFFB38 000000067F0000400200008A590000BF4000-000000067F0000400200008A590000BF8000__00000073AD3FE6B8 000000067F0000400200008A590000BF4000-000000067F0000400200008A590000BF8000__000000914E3F38F0 000000067F0000400200008A590000BF4000-000000067F0000400200008A590000BF8000__000000931B33AE68 000000067F0000400200008A590000BF4000-000000067F0000400200008A590000BF8000__000000931B9AFDF8 000000067F0000400200008A590000BF8000-000000067F0000400200008A590000BFC000__00000016661DE360 000000067F0000400200008A590000BF8000-000000067F0000400200008A590000BFC000__0000001C760FA190 000000067F0000400200008A590000BF8000-000000067F0000400200008A590000BFC000__00000038E67ABFA0 000000067F0000400200008A590000BF8000-000000067F0000400200008A590000BFC000__0000003903F1CFE8 000000067F0000400200008A590000BF8000-000000067F0000400200008A590000BFC000__0000003B99F7F8A0 000000067F0000400200008A590000BF8000-000000067F0000400200008A590000BFC000__0000005D2FFFFB38 000000067F0000400200008A590000BF8000-000000067F0000400200008A590000BFC000__00000073AD3FE6B8 000000067F0000400200008A590000BF8000-000000067F0000400200008A590000BFC000__000000914E3F38F0 000000067F0000400200008A590000BF8000-000000067F0000400200008A590000BFC000__000000931B33AE68 
000000067F0000400200008A590000BF8000-000000067F0000400200008A590000BFC000__000000931B9AFDF8 000000067F0000400200008A590000BF8634-000000067F0000400200008A590000C01008__00000015967BE3A1-00000016362BE8F9 000000067F0000400200008A590000BFC000-000000067F0000400200008A590000C00000__00000016661DE360 000000067F0000400200008A590000BFC000-000000067F0000400200008A590000C00000__0000001C760FA190 000000067F0000400200008A590000BFC000-000000067F0000400200008A590000C00000__00000038E67ABFA0 000000067F0000400200008A590000BFC000-000000067F0000400200008A590000C00000__0000003903F1CFE8 000000067F0000400200008A590000BFC000-000000067F0000400200008A590000C00000__0000003B99F7F8A0 000000067F0000400200008A590000BFC000-000000067F0000400200008A590000C00000__0000005D2FFFFB38 000000067F0000400200008A590000BFC000-000000067F0000400200008A590000C00000__00000073AD3FE6B8 000000067F0000400200008A590000BFC000-000000067F0000400200008A590000C00000__000000914E3F38F0 000000067F0000400200008A590000BFC000-000000067F0000400200008A590000C00000__000000931B33AE68 000000067F0000400200008A590000BFC000-000000067F0000400200008A590000C00000__000000931B9AFDF8 000000067F0000400200008A590000C00000-000000067F0000400200008A590000C04000__00000016661DE360 000000067F0000400200008A590000C00000-000000067F0000400200008A590000C04000__0000001C760FA190 000000067F0000400200008A590000C00000-000000067F0000400200008A590000C04000__00000038E67ABFA0 000000067F0000400200008A590000C00000-000000067F0000400200008A590000C04000__0000003903F1CFE8 000000067F0000400200008A590000C00000-000000067F0000400200008A590000C04000__0000003B99F7F8A0 000000067F0000400200008A590000C00000-000000067F0000400200008A590000C04000__0000005D2FFFFB38 000000067F0000400200008A590000C00000-000000067F0000400200008A590000C04000__00000073AD3FE6B8 000000067F0000400200008A590000C00000-000000067F0000400200008A590000C04000__000000914E3F38F0 000000067F0000400200008A590000C00000-000000067F0000400200008A590000C04000__000000931B33AE68 
000000067F0000400200008A590000C00000-000000067F0000400200008A590000C04000__000000931B9AFDF8 000000067F0000400200008A590000C01008-000000067F0000400200008A590100000000__00000015967BE3A1-00000016362BE8F9 000000067F0000400200008A590000C012F5-000000067F0000400200008A590000C09CEB__00000016362BE8F9-00000016E5D3F7B9 000000067F0000400200008A590000C04000-000000067F0000400200008A590000C08000__00000016661DE360 000000067F0000400200008A590000C04000-000000067F0000400200008A590000C08000__0000001C760FA190 000000067F0000400200008A590000C04000-000000067F0000400200008A590000C08000__00000038E67ABFA0 000000067F0000400200008A590000C04000-000000067F0000400200008A590000C08000__0000003903F1CFE8 000000067F0000400200008A590000C04000-000000067F0000400200008A590000C08000__0000003B99F7F8A0 000000067F0000400200008A590000C04000-000000067F0000400200008A590000C08000__0000005D2FFFFB38 000000067F0000400200008A590000C04000-000000067F0000400200008A590000C08000__00000073AD3FE6B8 000000067F0000400200008A590000C04000-000000067F0000400200008A590000C08000__000000914E3F38F0 000000067F0000400200008A590000C04000-000000067F0000400200008A590000C08000__000000931B33AE68 000000067F0000400200008A590000C04000-000000067F0000400200008A590000C08000__000000931B9AFDF8 000000067F0000400200008A590000C08000-000000067F0000400200008A590000C0C000__00000016661DE360 000000067F0000400200008A590000C08000-000000067F0000400200008A590000C0C000__0000001C760FA190 000000067F0000400200008A590000C08000-000000067F0000400200008A590000C0C000__00000038E67ABFA0 000000067F0000400200008A590000C08000-000000067F0000400200008A590000C0C000__0000003903F1CFE8 000000067F0000400200008A590000C08000-000000067F0000400200008A590000C0C000__0000003B99F7F8A0 000000067F0000400200008A590000C08000-000000067F0000400200008A590000C0C000__0000005D2FFFFB38 000000067F0000400200008A590000C08000-000000067F0000400200008A590000C0C000__00000073AD3FE6B8 000000067F0000400200008A590000C08000-000000067F0000400200008A590000C0C000__000000914E3F38F0 
000000067F0000400200008A590000C08000-000000067F0000400200008A590000C0C000__000000931B33AE68 000000067F0000400200008A590000C08000-000000067F0000400200008A590000C0C000__000000931B9AFDF8 000000067F0000400200008A590000C09CEB-000000067F0000400200008A590000C126CC__00000016362BE8F9-00000016E5D3F7B9 000000067F0000400200008A590000C0C000-000000067F0000400200008A590000C10000__00000016661DE360 000000067F0000400200008A590000C0C000-000000067F0000400200008A590000C10000__0000001C760FA190 000000067F0000400200008A590000C0C000-000000067F0000400200008A590000C10000__00000038E67ABFA0 000000067F0000400200008A590000C0C000-000000067F0000400200008A590000C10000__0000003903F1CFE8 000000067F0000400200008A590000C0C000-000000067F0000400200008A590000C10000__0000003B99F7F8A0 000000067F0000400200008A590000C0C000-000000067F0000400200008A590000C10000__0000005D2FFFFB38 000000067F0000400200008A590000C0C000-000000067F0000400200008A590000C10000__00000073AD3FE6B8 000000067F0000400200008A590000C0C000-000000067F0000400200008A590000C10000__000000914E3F38F0 000000067F0000400200008A590000C0C000-000000067F0000400200008A590000C10000__000000931B33AE68 000000067F0000400200008A590000C0C000-000000067F0000400200008A590000C10000__000000931B9AFDF8 000000067F0000400200008A590000C10000-000000067F0000400200008A590000C14000__00000016661DE360 000000067F0000400200008A590000C10000-000000067F0000400200008A590000C14000__0000001C760FA190 000000067F0000400200008A590000C10000-000000067F0000400200008A590000C14000__00000038E67ABFA0 000000067F0000400200008A590000C10000-000000067F0000400200008A590000C14000__0000003903F1CFE8 000000067F0000400200008A590000C10000-000000067F0000400200008A590000C14000__0000003B99F7F8A0 000000067F0000400200008A590000C10000-000000067F0000400200008A590000C14000__0000005D2FFFFB38 000000067F0000400200008A590000C10000-000000067F0000400200008A590000C14000__00000073AD3FE6B8 000000067F0000400200008A590000C10000-000000067F0000400200008A590000C14000__000000914E3F38F0 
000000067F0000400200008A590000C10000-000000067F0000400200008A590000C14000__000000931B33AE68 000000067F0000400200008A590000C10000-000000067F0000400200008A590000C14000__000000931B9AFDF8 000000067F0000400200008A590000C126CC-000000067F0000400200008A590000C1B0AB__00000016362BE8F9-00000016E5D3F7B9 000000067F0000400200008A590000C14000-000000067F0000400200008A590000C18000__00000016661DE360 000000067F0000400200008A590000C14000-000000067F0000400200008A590000C18000__0000001C760FA190 000000067F0000400200008A590000C14000-000000067F0000400200008A590000C18000__00000038E67ABFA0 000000067F0000400200008A590000C14000-000000067F0000400200008A590000C18000__0000003903F1CFE8 000000067F0000400200008A590000C14000-000000067F0000400200008A590000C18000__0000003B99F7F8A0 000000067F0000400200008A590000C14000-000000067F0000400200008A590000C18000__0000005D2FFFFB38 000000067F0000400200008A590000C14000-000000067F0000400200008A590000C18000__00000073AD3FE6B8 000000067F0000400200008A590000C14000-000000067F0000400200008A590000C18000__000000914E3F38F0 000000067F0000400200008A590000C14000-000000067F0000400200008A590000C18000__000000931B33AE68 000000067F0000400200008A590000C14000-000000067F0000400200008A590000C18000__000000931B9AFDF8 000000067F0000400200008A590000C18000-000000067F0000400200008A590000C1C000__0000001C760FA190 000000067F0000400200008A590000C18000-000000067F0000400200008A590000C1C000__00000038E67ABFA0 000000067F0000400200008A590000C18000-000000067F0000400200008A590000C1C000__0000003903F1CFE8 000000067F0000400200008A590000C18000-000000067F0000400200008A590000C1C000__0000003B99F7F8A0 000000067F0000400200008A590000C18000-000000067F0000400200008A590000C1C000__0000005D2FFFFB38 000000067F0000400200008A590000C18000-000000067F0000400200008A590000C1C000__00000073AD3FE6B8 000000067F0000400200008A590000C18000-000000067F0000400200008A590000C1C000__000000914E3F38F0 000000067F0000400200008A590000C18000-000000067F0000400200008A590000C1C000__000000931B33AE68 
000000067F0000400200008A590000C18000-000000067F0000400200008A590000C1C000__000000931B9AFDF8 000000067F0000400200008A590000C18000-030000000000000000000000000000000002__00000016661DE360 000000067F0000400200008A590000C1B0AB-000000067F0000400200008A590000C23A86__00000016362BE8F9-00000016E5D3F7B9 000000067F0000400200008A590000C1C000-000000067F0000400200008A590000C20000__0000001C760FA190 000000067F0000400200008A590000C1C000-000000067F0000400200008A590000C20000__00000038E67ABFA0 000000067F0000400200008A590000C1C000-000000067F0000400200008A590000C20000__0000003903F1CFE8 000000067F0000400200008A590000C1C000-000000067F0000400200008A590000C20000__0000003B99F7F8A0 000000067F0000400200008A590000C1C000-000000067F0000400200008A590000C20000__0000005D2FFFFB38 000000067F0000400200008A590000C1C000-000000067F0000400200008A590000C20000__00000073AD3FE6B8 000000067F0000400200008A590000C1C000-000000067F0000400200008A590000C20000__000000914E3F38F0 000000067F0000400200008A590000C1C000-000000067F0000400200008A590000C20000__000000931B33AE68 000000067F0000400200008A590000C1C000-000000067F0000400200008A590000C20000__000000931B9AFDF8 000000067F0000400200008A590000C20000-000000067F0000400200008A590000C24000__0000001C760FA190 000000067F0000400200008A590000C20000-000000067F0000400200008A590000C24000__00000038E67ABFA0 000000067F0000400200008A590000C20000-000000067F0000400200008A590000C24000__0000003903F1CFE8 000000067F0000400200008A590000C20000-000000067F0000400200008A590000C24000__0000003B99F7F8A0 000000067F0000400200008A590000C20000-000000067F0000400200008A590000C24000__0000005D2FFFFB38 000000067F0000400200008A590000C20000-000000067F0000400200008A590000C24000__00000073AD3FE6B8 000000067F0000400200008A590000C20000-000000067F0000400200008A590000C24000__000000914E3F38F0 000000067F0000400200008A590000C20000-000000067F0000400200008A590000C24000__000000931B33AE68 000000067F0000400200008A590000C20000-000000067F0000400200008A590000C24000__000000931B9AFDF8 
000000067F0000400200008A590000C23A86-000000067F0000400200008A590000C2C466__00000016362BE8F9-00000016E5D3F7B9 000000067F0000400200008A590000C24000-000000067F0000400200008A590000C28000__0000001C760FA190 000000067F0000400200008A590000C24000-000000067F0000400200008A590000C28000__00000038E67ABFA0 000000067F0000400200008A590000C24000-000000067F0000400200008A590000C28000__0000003903F1CFE8 000000067F0000400200008A590000C24000-000000067F0000400200008A590000C28000__0000003B99F7F8A0 000000067F0000400200008A590000C24000-000000067F0000400200008A590000C28000__0000005D2FFFFB38 000000067F0000400200008A590000C24000-000000067F0000400200008A590000C28000__00000073AD3FE6B8 000000067F0000400200008A590000C24000-000000067F0000400200008A590000C28000__000000914E3F38F0 000000067F0000400200008A590000C24000-000000067F0000400200008A590000C28000__000000931B33AE68 000000067F0000400200008A590000C24000-000000067F0000400200008A590000C28000__000000931B9AFDF8 000000067F0000400200008A590000C28000-000000067F0000400200008A590000C2C000__0000001C760FA190 000000067F0000400200008A590000C28000-000000067F0000400200008A590000C2C000__00000038E67ABFA0 000000067F0000400200008A590000C28000-000000067F0000400200008A590000C2C000__0000003903F1CFE8 000000067F0000400200008A590000C28000-000000067F0000400200008A590000C2C000__0000003B99F7F8A0 000000067F0000400200008A590000C28000-000000067F0000400200008A590000C2C000__0000005D2FFFFB38 000000067F0000400200008A590000C28000-000000067F0000400200008A590000C2C000__00000073AD3FE6B8 000000067F0000400200008A590000C28000-000000067F0000400200008A590000C2C000__000000914E3F38F0 000000067F0000400200008A590000C28000-000000067F0000400200008A590000C2C000__000000931B33AE68 000000067F0000400200008A590000C28000-000000067F0000400200008A590000C2C000__000000931B9AFDF8 000000067F0000400200008A590000C2C000-000000067F0000400200008A590000C30000__0000001C760FA190 000000067F0000400200008A590000C2C000-000000067F0000400200008A590000C30000__00000038E67ABFA0 
000000067F0000400200008A590000C2C000-000000067F0000400200008A590000C30000__0000003903F1CFE8 000000067F0000400200008A590000C2C000-000000067F0000400200008A590000C30000__0000003B99F7F8A0 000000067F0000400200008A590000C2C000-000000067F0000400200008A590000C30000__0000005D2FFFFB38 000000067F0000400200008A590000C2C000-000000067F0000400200008A590000C30000__00000073AD3FE6B8 000000067F0000400200008A590000C2C000-000000067F0000400200008A590000C30000__000000914E3F38F0 000000067F0000400200008A590000C2C000-000000067F0000400200008A590000C30000__000000931B33AE68 000000067F0000400200008A590000C2C000-000000067F0000400200008A590000C30000__000000931B9AFDF8 000000067F0000400200008A590000C2C466-000000067F0000400200008A590000C34E3E__00000016362BE8F9-00000016E5D3F7B9 000000067F0000400200008A590000C30000-000000067F0000400200008A590000C34000__0000001C760FA190 000000067F0000400200008A590000C30000-000000067F0000400200008A590000C34000__00000038E67ABFA0 000000067F0000400200008A590000C30000-000000067F0000400200008A590000C34000__0000003903F1CFE8 000000067F0000400200008A590000C30000-000000067F0000400200008A590000C34000__0000003B99F7F8A0 000000067F0000400200008A590000C30000-000000067F0000400200008A590000C34000__0000005D2FFFFB38 000000067F0000400200008A590000C30000-000000067F0000400200008A590000C34000__00000073AD3FE6B8 000000067F0000400200008A590000C30000-000000067F0000400200008A590000C34000__000000914E3F38F0 000000067F0000400200008A590000C30000-000000067F0000400200008A590000C34000__000000931B33AE68 000000067F0000400200008A590000C30000-000000067F0000400200008A590000C34000__000000931B9AFDF8 000000067F0000400200008A590000C34000-000000067F0000400200008A590000C38000__0000001C760FA190 000000067F0000400200008A590000C34000-000000067F0000400200008A590000C38000__00000038E67ABFA0 000000067F0000400200008A590000C34000-000000067F0000400200008A590000C38000__0000003903F1CFE8 000000067F0000400200008A590000C34000-000000067F0000400200008A590000C38000__0000003B99F7F8A0 
000000067F0000400200008A590000C34000-000000067F0000400200008A590000C38000__0000005D2FFFFB38 000000067F0000400200008A590000C34000-000000067F0000400200008A590000C38000__00000073AD3FE6B8 000000067F0000400200008A590000C34000-000000067F0000400200008A590000C38000__000000914E3F38F0 000000067F0000400200008A590000C34000-000000067F0000400200008A590000C38000__000000931B33AE68 000000067F0000400200008A590000C34000-000000067F0000400200008A590000C38000__000000931B9AFDF8 000000067F0000400200008A590000C34E3E-000000067F0000400200008A590000C3D814__00000016362BE8F9-00000016E5D3F7B9 000000067F0000400200008A590000C38000-000000067F0000400200008A590000C3C000__0000001C760FA190 000000067F0000400200008A590000C38000-000000067F0000400200008A590000C3C000__00000038E67ABFA0 000000067F0000400200008A590000C38000-000000067F0000400200008A590000C3C000__0000003903F1CFE8 000000067F0000400200008A590000C38000-000000067F0000400200008A590000C3C000__0000003B99F7F8A0 000000067F0000400200008A590000C38000-000000067F0000400200008A590000C3C000__0000005D2FFFFB38 000000067F0000400200008A590000C38000-000000067F0000400200008A590000C3C000__00000073AD3FE6B8 000000067F0000400200008A590000C38000-000000067F0000400200008A590000C3C000__000000914E3F38F0 000000067F0000400200008A590000C38000-000000067F0000400200008A590000C3C000__000000931B33AE68 000000067F0000400200008A590000C38000-000000067F0000400200008A590000C3C000__000000931B9AFDF8 000000067F0000400200008A590000C3C000-000000067F0000400200008A590000C40000__0000001C760FA190 000000067F0000400200008A590000C3C000-000000067F0000400200008A590000C40000__00000038E67ABFA0 000000067F0000400200008A590000C3C000-000000067F0000400200008A590000C40000__0000003903F1CFE8 000000067F0000400200008A590000C3C000-000000067F0000400200008A590000C40000__0000003B99F7F8A0 000000067F0000400200008A590000C3C000-000000067F0000400200008A590000C40000__0000005D2FFFFB38 000000067F0000400200008A590000C3C000-000000067F0000400200008A590000C40000__00000073AD3FE6B8 
000000067F0000400200008A590000C3C000-000000067F0000400200008A590000C40000__000000914E3F38F0 000000067F0000400200008A590000C3C000-000000067F0000400200008A590000C40000__000000931B33AE68 000000067F0000400200008A590000C3C000-000000067F0000400200008A590000C40000__000000931B9AFDF8 000000067F0000400200008A590000C3D814-000000067F0000400200008A590000C461F2__00000016362BE8F9-00000016E5D3F7B9 000000067F0000400200008A590000C40000-000000067F0000400200008A590000C44000__0000001C760FA190 000000067F0000400200008A590000C40000-000000067F0000400200008A590000C44000__00000038E67ABFA0 000000067F0000400200008A590000C40000-000000067F0000400200008A590000C44000__0000003903F1CFE8 000000067F0000400200008A590000C40000-000000067F0000400200008A590000C44000__0000003B99F7F8A0 000000067F0000400200008A590000C40000-000000067F0000400200008A590000C44000__0000005D2FFFFB38 000000067F0000400200008A590000C40000-000000067F0000400200008A590000C44000__00000073AD3FE6B8 000000067F0000400200008A590000C40000-000000067F0000400200008A590000C44000__000000914E3F38F0 000000067F0000400200008A590000C40000-000000067F0000400200008A590000C44000__000000931B33AE68 000000067F0000400200008A590000C40000-000000067F0000400200008A590000C44000__000000931B9AFDF8 000000067F0000400200008A590000C44000-000000067F0000400200008A590000C48000__0000001C760FA190 000000067F0000400200008A590000C44000-000000067F0000400200008A590000C48000__00000038E67ABFA0 000000067F0000400200008A590000C44000-000000067F0000400200008A590000C48000__0000003903F1CFE8 000000067F0000400200008A590000C44000-000000067F0000400200008A590000C48000__0000003B99F7F8A0 000000067F0000400200008A590000C44000-000000067F0000400200008A590000C48000__0000005D2FFFFB38 000000067F0000400200008A590000C44000-000000067F0000400200008A590000C48000__00000073AD3FE6B8 000000067F0000400200008A590000C44000-000000067F0000400200008A590000C48000__000000914E3F38F0 000000067F0000400200008A590000C44000-000000067F0000400200008A590000C48000__000000931B33AE68 
000000067F0000400200008A590000C44000-000000067F0000400200008A590000C48000__000000931B9AFDF8 000000067F0000400200008A590000C461F2-000000067F0000400200008A590000C4EBD4__00000016362BE8F9-00000016E5D3F7B9 000000067F0000400200008A590000C48000-000000067F0000400200008A590000C4C000__0000001C760FA190 000000067F0000400200008A590000C48000-000000067F0000400200008A590000C4C000__00000038E67ABFA0 000000067F0000400200008A590000C48000-000000067F0000400200008A590000C4C000__0000003903F1CFE8 000000067F0000400200008A590000C48000-000000067F0000400200008A590000C4C000__0000003B99F7F8A0 000000067F0000400200008A590000C48000-000000067F0000400200008A590000C4C000__0000005D2FFFFB38 000000067F0000400200008A590000C48000-000000067F0000400200008A590000C4C000__00000073AD3FE6B8 000000067F0000400200008A590000C48000-000000067F0000400200008A590000C4C000__000000914E3F38F0 000000067F0000400200008A590000C48000-000000067F0000400200008A590000C4C000__000000931B33AE68 000000067F0000400200008A590000C48000-000000067F0000400200008A590000C4C000__000000931B9AFDF8 000000067F0000400200008A590000C4C000-000000067F0000400200008A590000C50000__0000001C760FA190 000000067F0000400200008A590000C4C000-000000067F0000400200008A590000C50000__00000038E67ABFA0 000000067F0000400200008A590000C4C000-000000067F0000400200008A590000C50000__0000003903F1CFE8 000000067F0000400200008A590000C4C000-000000067F0000400200008A590000C50000__0000003B99F7F8A0 000000067F0000400200008A590000C4C000-000000067F0000400200008A590000C50000__0000005D2FFFFB38 000000067F0000400200008A590000C4C000-000000067F0000400200008A590000C50000__00000073AD3FE6B8 000000067F0000400200008A590000C4C000-000000067F0000400200008A590000C50000__000000914E3F38F0 000000067F0000400200008A590000C4C000-000000067F0000400200008A590000C50000__000000931B9A2710 000000067F0000400200008A590000C4EBD4-000000067F0000400200008A590000C575B6__00000016362BE8F9-00000016E5D3F7B9 000000067F0000400200008A590000C50000-000000067F0000400200008A590000C54000__0000001C760FA190 
000000067F0000400200008A590000C50000-000000067F0000400200008A590000C54000__00000038E67ABFA0 000000067F0000400200008A590000C50000-000000067F0000400200008A590000C54000__0000003903F1CFE8 000000067F0000400200008A590000C50000-000000067F0000400200008A590000C54000__0000003B99F7F8A0 000000067F0000400200008A590000C50000-000000067F0000400200008A590000C54000__0000005D2FFFFB38 000000067F0000400200008A590000C50000-000000067F0000400200008A590000C54000__00000073AD3FE6B8 000000067F0000400200008A590000C50000-000000067F0000400200008A590000C54000__000000914E3F38F0 000000067F0000400200008A590000C50000-000000067F0000400200008A590000C54000__000000931B9A2710 000000067F0000400200008A590000C54000-000000067F0000400200008A590000C58000__0000001C760FA190 000000067F0000400200008A590000C54000-000000067F0000400200008A590000C58000__00000038E67ABFA0 000000067F0000400200008A590000C54000-000000067F0000400200008A590000C58000__0000003903F1CFE8 000000067F0000400200008A590000C54000-000000067F0000400200008A590000C58000__0000003B99F7F8A0 000000067F0000400200008A590000C54000-000000067F0000400200008A590000C58000__0000005D2FFFFB38 000000067F0000400200008A590000C54000-000000067F0000400200008A590000C58000__00000073AD3FE6B8 000000067F0000400200008A590000C54000-000000067F0000400200008A590000C58000__000000914E3F38F0 000000067F0000400200008A590000C54000-000000067F0000400200008A590000C58000__000000931B9A2710 000000067F0000400200008A590000C575B6-000000067F0000400200008A590000C5FF90__00000016362BE8F9-00000016E5D3F7B9 000000067F0000400200008A590000C58000-000000067F0000400200008A590000C5C000__0000001C760FA190 000000067F0000400200008A590000C58000-000000067F0000400200008A590000C5C000__00000038E67ABFA0 000000067F0000400200008A590000C58000-000000067F0000400200008A590000C5C000__0000003903F1CFE8 000000067F0000400200008A590000C58000-000000067F0000400200008A590000C5C000__0000003B99F7F8A0 000000067F0000400200008A590000C58000-000000067F0000400200008A590000C5C000__0000005D2FFFFB38 
000000067F0000400200008A590000C58000-000000067F0000400200008A590000C5C000__00000073AD3FE6B8 000000067F0000400200008A590000C58000-000000067F0000400200008A590000C5C000__000000914E3F38F0 000000067F0000400200008A590000C58000-000000067F0000400200008A590000C5C000__000000931B9A2710 000000067F0000400200008A590000C5C000-000000067F0000400200008A590000C60000__0000001C760FA190 000000067F0000400200008A590000C5C000-000000067F0000400200008A590000C60000__00000038E67ABFA0 000000067F0000400200008A590000C5C000-000000067F0000400200008A590000C60000__0000003903F1CFE8 000000067F0000400200008A590000C5C000-000000067F0000400200008A590000C60000__0000003B99F7F8A0 000000067F0000400200008A590000C5C000-000000067F0000400200008A590000C60000__0000005D2FFFFB38 000000067F0000400200008A590000C5C000-000000067F0000400200008A590000C60000__00000073AD3FE6B8 000000067F0000400200008A590000C5C000-000000067F0000400200008A590000C60000__000000914E3F38F0 000000067F0000400200008A590000C5C000-000000067F0000400200008A590000C60000__000000931B9A2710 000000067F0000400200008A590000C5FF90-000000067F0000400200008A590100000000__00000016362BE8F9-00000016E5D3F7B9 000000067F0000400200008A590000C60000-000000067F0000400200008A590000C64000__0000001C725A2400 000000067F0000400200008A590000C60000-000000067F0000400200008A590000C64000__0000001C760FA190 000000067F0000400200008A590000C60000-000000067F0000400200008A590000C64000__00000038E67ABFA0 000000067F0000400200008A590000C60000-000000067F0000400200008A590000C64000__0000003903F1CFE8 000000067F0000400200008A590000C60000-000000067F0000400200008A590000C64000__0000003B99F7F8A0 000000067F0000400200008A590000C60000-000000067F0000400200008A590000C64000__0000005D2FFFFB38 000000067F0000400200008A590000C60000-000000067F0000400200008A590000C64000__00000073AD3FE6B8 000000067F0000400200008A590000C60000-000000067F0000400200008A590000C64000__000000914E3F38F0 000000067F0000400200008A590000C60000-000000067F0000400200008A590000C64000__000000931B9A2710 
000000067F0000400200008A590000C60295-000000067F0000400200008A590000C68C70__00000016E5D3F7B9-000000178583EBE1 000000067F0000400200008A590000C64000-000000067F0000400200008A590000C68000__0000001C725A2400 000000067F0000400200008A590000C64000-000000067F0000400200008A590000C68000__0000001C760FA190 000000067F0000400200008A590000C64000-000000067F0000400200008A590000C68000__00000038E67ABFA0 000000067F0000400200008A590000C64000-000000067F0000400200008A590000C68000__0000003903F1CFE8 000000067F0000400200008A590000C64000-000000067F0000400200008A590000C68000__0000003B99F7F8A0 000000067F0000400200008A590000C64000-000000067F0000400200008A590000C68000__0000005D2FFFFB38 000000067F0000400200008A590000C64000-000000067F0000400200008A590000C68000__00000073AD3FE6B8 000000067F0000400200008A590000C64000-000000067F0000400200008A590000C68000__000000914E3F38F0 000000067F0000400200008A590000C64000-000000067F0000400200008A590000C68000__000000931B9A2710 000000067F0000400200008A590000C68000-000000067F0000400200008A590000C6C000__0000001C725A2400 000000067F0000400200008A590000C68000-000000067F0000400200008A590000C6C000__0000001C760FA190 000000067F0000400200008A590000C68000-000000067F0000400200008A590000C6C000__00000038E67ABFA0 000000067F0000400200008A590000C68000-000000067F0000400200008A590000C6C000__0000003903F1CFE8 000000067F0000400200008A590000C68000-000000067F0000400200008A590000C6C000__0000003B99F7F8A0 000000067F0000400200008A590000C68000-000000067F0000400200008A590000C6C000__0000005D2FFFFB38 000000067F0000400200008A590000C68000-000000067F0000400200008A590000C6C000__00000073AD3FE6B8 000000067F0000400200008A590000C68000-000000067F0000400200008A590000C6C000__000000914E3F38F0 000000067F0000400200008A590000C68000-000000067F0000400200008A590000C6C000__000000931B9A2710 000000067F0000400200008A590000C68C70-000000067F0000400200008A590000C7164A__00000016E5D3F7B9-000000178583EBE1 000000067F0000400200008A590000C6C000-000000067F0000400200008A590000C70000__0000001C725A2400 
000000067F0000400200008A590000C6C000-000000067F0000400200008A590000C70000__0000001C760FA190 000000067F0000400200008A590000C6C000-000000067F0000400200008A590000C70000__00000038E67ABFA0 000000067F0000400200008A590000C6C000-000000067F0000400200008A590000C70000__0000003903F1CFE8 000000067F0000400200008A590000C6C000-000000067F0000400200008A590000C70000__0000003B99F7F8A0 000000067F0000400200008A590000C6C000-000000067F0000400200008A590000C70000__0000005D2FFFFB38 000000067F0000400200008A590000C6C000-000000067F0000400200008A590000C70000__00000073AD3FE6B8 000000067F0000400200008A590000C6C000-000000067F0000400200008A590000C70000__000000914E3F38F0 000000067F0000400200008A590000C6C000-000000067F0000400200008A590000C70000__000000931B9A2710 000000067F0000400200008A590000C70000-000000067F0000400200008A590000C74000__0000001C725A2400 000000067F0000400200008A590000C70000-000000067F0000400200008A590000C74000__0000001C760FA190 000000067F0000400200008A590000C70000-000000067F0000400200008A590000C74000__00000038E67ABFA0 000000067F0000400200008A590000C70000-000000067F0000400200008A590000C74000__0000003903F1CFE8 000000067F0000400200008A590000C70000-000000067F0000400200008A590000C74000__0000003B99F7F8A0 000000067F0000400200008A590000C70000-000000067F0000400200008A590000C74000__0000005D2FFFFB38 000000067F0000400200008A590000C70000-000000067F0000400200008A590000C74000__00000073AD3FE6B8 000000067F0000400200008A590000C70000-000000067F0000400200008A590000C74000__000000914E3F38F0 000000067F0000400200008A590000C70000-000000067F0000400200008A590000C74000__000000931B9A2710 000000067F0000400200008A590000C7164A-000000067F0000400200008A590000C7A01A__00000016E5D3F7B9-000000178583EBE1 000000067F0000400200008A590000C74000-000000067F0000400200008A590000C78000__0000001C725A2400 000000067F0000400200008A590000C74000-000000067F0000400200008A590000C78000__0000001C760FA190 000000067F0000400200008A590000C74000-000000067F0000400200008A590000C78000__00000038E67ABFA0 
000000067F0000400200008A590000C74000-000000067F0000400200008A590000C78000__0000003903F1CFE8 000000067F0000400200008A590000C74000-000000067F0000400200008A590000C78000__0000003B99F7F8A0 000000067F0000400200008A590000C74000-000000067F0000400200008A590000C78000__0000005D2FFFFB38 000000067F0000400200008A590000C74000-000000067F0000400200008A590000C78000__00000073AD3FE6B8 000000067F0000400200008A590000C74000-000000067F0000400200008A590000C78000__000000914E3F38F0 000000067F0000400200008A590000C74000-000000067F0000400200008A590000C78000__000000931B9A2710 000000067F0000400200008A590000C78000-000000067F0000400200008A590000C7C000__0000001C725A2400 000000067F0000400200008A590000C78000-000000067F0000400200008A590000C7C000__0000001C760FA190 000000067F0000400200008A590000C78000-000000067F0000400200008A590000C7C000__00000038E67ABFA0 000000067F0000400200008A590000C78000-000000067F0000400200008A590000C7C000__0000003903F1CFE8 000000067F0000400200008A590000C78000-000000067F0000400200008A590000C7C000__0000003B99F7F8A0 000000067F0000400200008A590000C78000-000000067F0000400200008A590000C7C000__0000005D2FFFFB38 000000067F0000400200008A590000C78000-000000067F0000400200008A590000C7C000__00000073AD3FE6B8 000000067F0000400200008A590000C78000-000000067F0000400200008A590000C7C000__000000914E3F38F0 000000067F0000400200008A590000C78000-000000067F0000400200008A590000C7C000__000000931B9A2710 000000067F0000400200008A590000C7A01A-000000067F0000400200008A590000C829F4__00000016E5D3F7B9-000000178583EBE1 000000067F0000400200008A590000C7C000-000000067F0000400200008A590000C80000__0000001C725A2400 000000067F0000400200008A590000C7C000-000000067F0000400200008A590000C80000__0000001C760FA190 000000067F0000400200008A590000C7C000-000000067F0000400200008A590000C80000__00000038E67ABFA0 000000067F0000400200008A590000C7C000-000000067F0000400200008A590000C80000__0000003903F1CFE8 000000067F0000400200008A590000C7C000-000000067F0000400200008A590000C80000__0000003B99F7F8A0 
000000067F0000400200008A590000C7C000-000000067F0000400200008A590000C80000__0000005D2FFFFB38 000000067F0000400200008A590000C7C000-000000067F0000400200008A590000C80000__00000073AD3FE6B8 000000067F0000400200008A590000C7C000-000000067F0000400200008A590000C80000__000000914E3F38F0 000000067F0000400200008A590000C7C000-000000067F0000400200008A590000C80000__000000931B9A2710 000000067F0000400200008A590000C80000-000000067F0000400200008A590000C84000__0000001C725A2400 000000067F0000400200008A590000C80000-000000067F0000400200008A590000C84000__0000001C760FA190 000000067F0000400200008A590000C80000-000000067F0000400200008A590000C84000__00000038E67ABFA0 000000067F0000400200008A590000C80000-000000067F0000400200008A590000C84000__0000003903F1CFE8 000000067F0000400200008A590000C80000-000000067F0000400200008A590000C84000__0000003B99F7F8A0 000000067F0000400200008A590000C80000-000000067F0000400200008A590000C84000__0000005D2FFFFB38 000000067F0000400200008A590000C80000-000000067F0000400200008A590000C84000__00000073AD3FE6B8 000000067F0000400200008A590000C80000-000000067F0000400200008A590000C84000__000000914E3F38F0 000000067F0000400200008A590000C80000-000000067F0000400200008A590000C84000__000000931B9A2710 000000067F0000400200008A590000C829F4-000000067F0000400200008A590000C8B3D9__00000016E5D3F7B9-000000178583EBE1 000000067F0000400200008A590000C84000-000000067F0000400200008A590000C88000__0000001C725A2400 000000067F0000400200008A590000C84000-000000067F0000400200008A590000C88000__0000001C760FA190 000000067F0000400200008A590000C84000-000000067F0000400200008A590000C88000__00000038E67ABFA0 000000067F0000400200008A590000C84000-000000067F0000400200008A590000C88000__0000003903F1CFE8 000000067F0000400200008A590000C84000-000000067F0000400200008A590000C88000__0000003B99F7F8A0 000000067F0000400200008A590000C84000-000000067F0000400200008A590000C88000__0000005D2FFFFB38 000000067F0000400200008A590000C84000-000000067F0000400200008A590000C88000__00000073AD3FE6B8 
000000067F0000400200008A590000C84000-000000067F0000400200008A590000C88000__000000914E3F38F0 000000067F0000400200008A590000C84000-000000067F0000400200008A590000C88000__000000931B9A2710 000000067F0000400200008A590000C88000-000000067F0000400200008A590000C8C000__0000001C725A2400 000000067F0000400200008A590000C88000-000000067F0000400200008A590000C8C000__0000001C760FA190 000000067F0000400200008A590000C88000-000000067F0000400200008A590000C8C000__00000038E67ABFA0 000000067F0000400200008A590000C88000-000000067F0000400200008A590000C8C000__0000003903F1CFE8 000000067F0000400200008A590000C88000-000000067F0000400200008A590000C8C000__0000003B99F7F8A0 000000067F0000400200008A590000C88000-000000067F0000400200008A590000C8C000__0000005D2FFFFB38 000000067F0000400200008A590000C88000-000000067F0000400200008A590000C8C000__00000073AD3FE6B8 000000067F0000400200008A590000C88000-000000067F0000400200008A590000C8C000__000000914E3F38F0 000000067F0000400200008A590000C88000-000000067F0000400200008A590000C8C000__000000931B9A2710 000000067F0000400200008A590000C8B3D9-000000067F0000400200008A590000C93DC1__00000016E5D3F7B9-000000178583EBE1 000000067F0000400200008A590000C8C000-000000067F0000400200008A590000C90000__0000001C725A2400 000000067F0000400200008A590000C8C000-000000067F0000400200008A590000C90000__0000001C760FA190 000000067F0000400200008A590000C8C000-000000067F0000400200008A590000C90000__00000038E67ABFA0 000000067F0000400200008A590000C8C000-000000067F0000400200008A590000C90000__0000003903F1CFE8 000000067F0000400200008A590000C8C000-000000067F0000400200008A590000C90000__0000003B99F7F8A0 000000067F0000400200008A590000C8C000-000000067F0000400200008A590000C90000__0000005D2FFFFB38 000000067F0000400200008A590000C8C000-000000067F0000400200008A590000C90000__00000073AD3FE6B8 000000067F0000400200008A590000C8C000-000000067F0000400200008A590000C90000__000000914E3F38F0 000000067F0000400200008A590000C8C000-000000067F0000400200008A590000C90000__000000931B9A2710 
000000067F0000400200008A590000C90000-000000067F0000400200008A590000C94000__0000001C725A2400 000000067F0000400200008A590000C90000-000000067F0000400200008A590000C94000__0000001C760FA190 000000067F0000400200008A590000C90000-000000067F0000400200008A590000C94000__00000038E67ABFA0 000000067F0000400200008A590000C90000-000000067F0000400200008A590000C94000__0000003903F1CFE8 000000067F0000400200008A590000C90000-000000067F0000400200008A590000C94000__0000003B99F7F8A0 000000067F0000400200008A590000C90000-000000067F0000400200008A590000C94000__0000005D2FFFFB38 000000067F0000400200008A590000C90000-000000067F0000400200008A590000C94000__00000073AD3FE6B8 000000067F0000400200008A590000C90000-000000067F0000400200008A590000C94000__000000914E3F38F0 000000067F0000400200008A590000C90000-000000067F0000400200008A590000C94000__000000931B9A2710 000000067F0000400200008A590000C93DC1-000000067F0000400200008A590000C9C79F__00000016E5D3F7B9-000000178583EBE1 000000067F0000400200008A590000C94000-000000067F0000400200008A590000C98000__0000001C725A2400 000000067F0000400200008A590000C94000-000000067F0000400200008A590000C98000__0000001C760FA190 000000067F0000400200008A590000C94000-000000067F0000400200008A590000C98000__00000038E67ABFA0 000000067F0000400200008A590000C94000-000000067F0000400200008A590000C98000__0000003903F1CFE8 000000067F0000400200008A590000C94000-000000067F0000400200008A590000C98000__0000003B99F7F8A0 000000067F0000400200008A590000C94000-000000067F0000400200008A590000C98000__0000005D2FFFFB38 000000067F0000400200008A590000C94000-000000067F0000400200008A590000C98000__00000073AD3FE6B8 000000067F0000400200008A590000C94000-000000067F0000400200008A590000C98000__000000914E3F38F0 000000067F0000400200008A590000C94000-000000067F0000400200008A590000C98000__000000931B9A2710 000000067F0000400200008A590000C98000-000000067F0000400200008A590000C9C000__0000001C725A2400 000000067F0000400200008A590000C98000-000000067F0000400200008A590000C9C000__0000001C760FA190 
000000067F0000400200008A590000C98000-000000067F0000400200008A590000C9C000__00000038E67ABFA0 000000067F0000400200008A590000C98000-000000067F0000400200008A590000C9C000__0000003903F1CFE8 000000067F0000400200008A590000C98000-000000067F0000400200008A590000C9C000__0000003B99F7F8A0 000000067F0000400200008A590000C98000-000000067F0000400200008A590000C9C000__0000005D2FFFFB38 000000067F0000400200008A590000C98000-000000067F0000400200008A590000C9C000__00000073AD3FE6B8 000000067F0000400200008A590000C98000-000000067F0000400200008A590000C9C000__000000914E3F38F0 000000067F0000400200008A590000C98000-000000067F0000400200008A590000C9C000__000000931B9A2710 000000067F0000400200008A590000C9C000-000000067F0000400200008A590000CA0000__0000001C725A2400 000000067F0000400200008A590000C9C000-000000067F0000400200008A590000CA0000__0000001C760FA190 000000067F0000400200008A590000C9C000-000000067F0000400200008A590000CA0000__00000038E67ABFA0 000000067F0000400200008A590000C9C000-000000067F0000400200008A590000CA0000__0000003903F1CFE8 000000067F0000400200008A590000C9C000-000000067F0000400200008A590000CA0000__0000003B99F7F8A0 000000067F0000400200008A590000C9C000-000000067F0000400200008A590000CA0000__0000005D2FFFFB38 000000067F0000400200008A590000C9C000-000000067F0000400200008A590000CA0000__00000073AD3FE6B8 000000067F0000400200008A590000C9C000-000000067F0000400200008A590000CA0000__000000914E3F38F0 000000067F0000400200008A590000C9C000-000000067F0000400200008A590000CA0000__000000931B9A2710 000000067F0000400200008A590000C9C79F-000000067F0000400200008A590000CA5172__00000016E5D3F7B9-000000178583EBE1 000000067F0000400200008A590000CA0000-000000067F0000400200008A590000CA4000__0000001C725A2400 000000067F0000400200008A590000CA0000-000000067F0000400200008A590000CA4000__0000001C760FA190 000000067F0000400200008A590000CA0000-000000067F0000400200008A590000CA4000__00000038E67ABFA0 000000067F0000400200008A590000CA0000-000000067F0000400200008A590000CA4000__0000003903F1CFE8 
000000067F0000400200008A590000CA0000-000000067F0000400200008A590000CA4000__0000003B99F7F8A0 000000067F0000400200008A590000CA0000-000000067F0000400200008A590000CA4000__0000005D2FFFFB38 000000067F0000400200008A590000CA0000-000000067F0000400200008A590000CA4000__00000073AD3FE6B8 000000067F0000400200008A590000CA0000-000000067F0000400200008A590000CA4000__000000914E3F38F0 000000067F0000400200008A590000CA0000-000000067F0000400200008A590000CA4000__000000931B9A2710 000000067F0000400200008A590000CA4000-000000067F0000400200008A590000CA8000__0000001C725A2400 000000067F0000400200008A590000CA4000-000000067F0000400200008A590000CA8000__0000001C760FA190 000000067F0000400200008A590000CA4000-000000067F0000400200008A590000CA8000__00000038E67ABFA0 000000067F0000400200008A590000CA4000-000000067F0000400200008A590000CA8000__0000003903F1CFE8 000000067F0000400200008A590000CA4000-000000067F0000400200008A590000CA8000__0000003B99F7F8A0 000000067F0000400200008A590000CA4000-000000067F0000400200008A590000CA8000__0000005D2FFFFB38 000000067F0000400200008A590000CA4000-000000067F0000400200008A590000CA8000__00000073AD3FE6B8 000000067F0000400200008A590000CA4000-000000067F0000400200008A590000CA8000__000000914E3F38F0 000000067F0000400200008A590000CA4000-000000067F0000400200008A590000CA8000__000000931B9A2710 000000067F0000400200008A590000CA5172-000000067F0000400200008A590000CADB56__00000016E5D3F7B9-000000178583EBE1 000000067F0000400200008A590000CA8000-000000067F0000400200008A590000CAC000__0000001C725A2400 000000067F0000400200008A590000CA8000-000000067F0000400200008A590000CAC000__0000001C760FA190 000000067F0000400200008A590000CA8000-000000067F0000400200008A590000CAC000__00000038E67ABFA0 000000067F0000400200008A590000CA8000-000000067F0000400200008A590000CAC000__0000003903F1CFE8 000000067F0000400200008A590000CA8000-000000067F0000400200008A590000CAC000__0000003B99F7F8A0 000000067F0000400200008A590000CA8000-000000067F0000400200008A590000CAC000__0000005D2FFFFB38 
000000067F0000400200008A590000CA8000-000000067F0000400200008A590000CAC000__00000073AD3FE6B8 000000067F0000400200008A590000CA8000-000000067F0000400200008A590000CAC000__000000914E3F38F0 000000067F0000400200008A590000CA8000-000000067F0000400200008A590000CAC000__000000931B9A2710 000000067F0000400200008A590000CAC000-000000067F0000400200008A590000CB0000__0000001C725A2400 000000067F0000400200008A590000CAC000-000000067F0000400200008A590000CB0000__0000001C760FA190 000000067F0000400200008A590000CAC000-000000067F0000400200008A590000CB0000__00000038E67ABFA0 000000067F0000400200008A590000CAC000-000000067F0000400200008A590000CB0000__0000003903F1CFE8 000000067F0000400200008A590000CAC000-000000067F0000400200008A590000CB0000__0000003B99F7F8A0 000000067F0000400200008A590000CAC000-000000067F0000400200008A590000CB0000__0000005D2FFFFB38 000000067F0000400200008A590000CAC000-000000067F0000400200008A590000CB0000__00000073AD3FE6B8 000000067F0000400200008A590000CAC000-000000067F0000400200008A590000CB0000__000000914E3F38F0 000000067F0000400200008A590000CAC000-000000067F0000400200008A590000CB0000__000000931B9A2710 000000067F0000400200008A590000CADB56-000000067F0000400200008A590000CB652D__00000016E5D3F7B9-000000178583EBE1 000000067F0000400200008A590000CB0000-000000067F0000400200008A590000CB4000__0000001C725A2400 000000067F0000400200008A590000CB0000-000000067F0000400200008A590000CB4000__0000001C760FA190 000000067F0000400200008A590000CB0000-000000067F0000400200008A590000CB4000__00000038E67ABFA0 000000067F0000400200008A590000CB0000-000000067F0000400200008A590000CB4000__0000003903F1CFE8 000000067F0000400200008A590000CB0000-000000067F0000400200008A590000CB4000__0000003B99F7F8A0 000000067F0000400200008A590000CB0000-000000067F0000400200008A590000CB4000__0000005D2FFFFB38 000000067F0000400200008A590000CB0000-000000067F0000400200008A590000CB4000__00000073AD3FE6B8 000000067F0000400200008A590000CB0000-000000067F0000400200008A590000CB4000__000000914E3F38F0 
000000067F0000400200008A590000CB0000-000000067F0000400200008A590000CB4000__000000931B9A2710 000000067F0000400200008A590000CB4000-000000067F0000400200008A590000CB8000__000000184D31F520 000000067F0000400200008A590000CB4000-000000067F0000400200008A590000CB8000__0000001C760FA190 000000067F0000400200008A590000CB4000-000000067F0000400200008A590000CB8000__00000038E67ABFA0 000000067F0000400200008A590000CB4000-000000067F0000400200008A590000CB8000__0000003903F1CFE8 000000067F0000400200008A590000CB4000-000000067F0000400200008A590000CB8000__0000003B99F7F8A0 000000067F0000400200008A590000CB4000-000000067F0000400200008A590000CB8000__0000005D2FFFFB38 000000067F0000400200008A590000CB4000-000000067F0000400200008A590000CB8000__00000073AD3FE6B8 000000067F0000400200008A590000CB4000-000000067F0000400200008A590000CB8000__000000914E3F38F0 000000067F0000400200008A590000CB4000-000000067F0000400200008A590000CB8000__000000931B9A2710 000000067F0000400200008A590000CB652D-000000067F0000400200008A590100000000__00000016E5D3F7B9-000000178583EBE1 000000067F0000400200008A590000CB67FC-000000067F0000400200008A590000CBF1E3__000000178583EBE1-000000182533E779 000000067F0000400200008A590000CB8000-000000067F0000400200008A590000CBC000__000000184D31F520 000000067F0000400200008A590000CB8000-000000067F0000400200008A590000CBC000__0000001C760FA190 000000067F0000400200008A590000CB8000-000000067F0000400200008A590000CBC000__00000038E67ABFA0 000000067F0000400200008A590000CB8000-000000067F0000400200008A590000CBC000__0000003903F1CFE8 000000067F0000400200008A590000CB8000-000000067F0000400200008A590000CBC000__0000003B99F7F8A0 000000067F0000400200008A590000CB8000-000000067F0000400200008A590000CBC000__0000005D2FFFFB38 000000067F0000400200008A590000CB8000-000000067F0000400200008A590000CBC000__00000073AD3FE6B8 000000067F0000400200008A590000CB8000-000000067F0000400200008A590000CBC000__000000914E3F38F0 000000067F0000400200008A590000CB8000-000000067F0000400200008A590000CBC000__000000931B9A2710 
000000067F0000400200008A590000CBC000-000000067F0000400200008A590000CC0000__000000184D31F520 000000067F0000400200008A590000CBC000-000000067F0000400200008A590000CC0000__0000001C760FA190 000000067F0000400200008A590000CBC000-000000067F0000400200008A590000CC0000__00000038E67ABFA0 000000067F0000400200008A590000CBC000-000000067F0000400200008A590000CC0000__0000003903F1CFE8 000000067F0000400200008A590000CBC000-000000067F0000400200008A590000CC0000__0000003B99F7F8A0 000000067F0000400200008A590000CBC000-000000067F0000400200008A590000CC0000__0000005D2FFFFB38 000000067F0000400200008A590000CBC000-000000067F0000400200008A590000CC0000__00000073AD3FE6B8 000000067F0000400200008A590000CBC000-000000067F0000400200008A590000CC0000__000000914E3F38F0 000000067F0000400200008A590000CBC000-000000067F0000400200008A590000CC0000__000000931B9A2710 000000067F0000400200008A590000CBF1E3-000000067F0000400200008A590000CC7BC5__000000178583EBE1-000000182533E779 000000067F0000400200008A590000CC0000-000000067F0000400200008A590000CC4000__000000184D31F520 000000067F0000400200008A590000CC0000-000000067F0000400200008A590000CC4000__0000001C760FA190 000000067F0000400200008A590000CC0000-000000067F0000400200008A590000CC4000__00000038E67ABFA0 000000067F0000400200008A590000CC0000-000000067F0000400200008A590000CC4000__0000003903F1CFE8 000000067F0000400200008A590000CC0000-000000067F0000400200008A590000CC4000__0000003B99F7F8A0 000000067F0000400200008A590000CC0000-000000067F0000400200008A590000CC4000__0000005D2FFFFB38 000000067F0000400200008A590000CC0000-000000067F0000400200008A590000CC4000__00000073AD3FE6B8 000000067F0000400200008A590000CC0000-000000067F0000400200008A590000CC4000__000000914E3F38F0 000000067F0000400200008A590000CC0000-000000067F0000400200008A590000CC4000__000000931B9A2710 000000067F0000400200008A590000CC4000-000000067F0000400200008A590000CC8000__000000184D31F520 000000067F0000400200008A590000CC4000-000000067F0000400200008A590000CC8000__0000001C760FA190 
000000067F0000400200008A590000CC4000-000000067F0000400200008A590000CC8000__00000038E67ABFA0 000000067F0000400200008A590000CC4000-000000067F0000400200008A590000CC8000__0000003903F1CFE8 000000067F0000400200008A590000CC4000-000000067F0000400200008A590000CC8000__0000003B99F7F8A0 000000067F0000400200008A590000CC4000-000000067F0000400200008A590000CC8000__0000005D2FFFFB38 000000067F0000400200008A590000CC4000-000000067F0000400200008A590000CC8000__00000073AD3FE6B8 000000067F0000400200008A590000CC4000-000000067F0000400200008A590000CC8000__000000914E3F38F0 000000067F0000400200008A590000CC4000-000000067F0000400200008A590000CC8000__000000931B9A2710 000000067F0000400200008A590000CC7BC5-000000067F0000400200008A590000CD05AA__000000178583EBE1-000000182533E779 000000067F0000400200008A590000CC8000-000000067F0000400200008A590000CCC000__000000184D31F520 000000067F0000400200008A590000CC8000-000000067F0000400200008A590000CCC000__0000001C760FA190 000000067F0000400200008A590000CC8000-000000067F0000400200008A590000CCC000__00000038E67ABFA0 000000067F0000400200008A590000CC8000-000000067F0000400200008A590000CCC000__0000003903F1CFE8 000000067F0000400200008A590000CC8000-000000067F0000400200008A590000CCC000__0000003B99F7F8A0 000000067F0000400200008A590000CC8000-000000067F0000400200008A590000CCC000__0000005D2FFFFB38 000000067F0000400200008A590000CC8000-000000067F0000400200008A590000CCC000__00000073AD3FE6B8 000000067F0000400200008A590000CC8000-000000067F0000400200008A590000CCC000__000000914E3F38F0 000000067F0000400200008A590000CC8000-000000067F0000400200008A590000CCC000__000000931B9A2710 000000067F0000400200008A590000CCC000-000000067F0000400200008A590000CD0000__000000184D31F520 000000067F0000400200008A590000CCC000-000000067F0000400200008A590000CD0000__0000001C760FA190 000000067F0000400200008A590000CCC000-000000067F0000400200008A590000CD0000__00000038E67ABFA0 000000067F0000400200008A590000CCC000-000000067F0000400200008A590000CD0000__0000003903F1CFE8 
000000067F0000400200008A590000CCC000-000000067F0000400200008A590000CD0000__0000003B99F7F8A0 000000067F0000400200008A590000CCC000-000000067F0000400200008A590000CD0000__0000005D2FFFFB38 000000067F0000400200008A590000CCC000-000000067F0000400200008A590000CD0000__00000073AD3FE6B8 000000067F0000400200008A590000CCC000-000000067F0000400200008A590000CD0000__000000914E3F38F0 000000067F0000400200008A590000CCC000-000000067F0000400200008A590000CD0000__000000931B9A2710 000000067F0000400200008A590000CD0000-000000067F0000400200008A590000CD4000__000000184D31F520 000000067F0000400200008A590000CD0000-000000067F0000400200008A590000CD4000__0000001C760FA190 000000067F0000400200008A590000CD0000-000000067F0000400200008A590000CD4000__00000038E67ABFA0 000000067F0000400200008A590000CD0000-000000067F0000400200008A590000CD4000__0000003903F1CFE8 000000067F0000400200008A590000CD0000-000000067F0000400200008A590000CD4000__0000003B99F7F8A0 000000067F0000400200008A590000CD0000-000000067F0000400200008A590000CD4000__0000005D2FFFFB38 000000067F0000400200008A590000CD0000-000000067F0000400200008A590000CD4000__00000073AD3FE6B8 000000067F0000400200008A590000CD0000-000000067F0000400200008A590000CD4000__000000914E3F38F0 000000067F0000400200008A590000CD0000-000000067F0000400200008A590000CD4000__000000931B9A2710 000000067F0000400200008A590000CD05AA-000000067F0000400200008A590000CD8F85__000000178583EBE1-000000182533E779 000000067F0000400200008A590000CD4000-000000067F0000400200008A590000CD8000__000000184D31F520 000000067F0000400200008A590000CD4000-000000067F0000400200008A590000CD8000__0000001C760FA190 000000067F0000400200008A590000CD4000-000000067F0000400200008A590000CD8000__00000038E67ABFA0 000000067F0000400200008A590000CD4000-000000067F0000400200008A590000CD8000__0000003903F1CFE8 000000067F0000400200008A590000CD4000-000000067F0000400200008A590000CD8000__0000003B99F7F8A0 000000067F0000400200008A590000CD4000-000000067F0000400200008A590000CD8000__0000005D2FFFFB38 
000000067F0000400200008A590000CD4000-000000067F0000400200008A590000CD8000__00000073AD3FE6B8 000000067F0000400200008A590000CD4000-000000067F0000400200008A590000CD8000__000000914E3F38F0 000000067F0000400200008A590000CD4000-000000067F0000400200008A590000CD8000__000000931B9A2710 000000067F0000400200008A590000CD8000-000000067F0000400200008A590000CDC000__000000184D31F520 000000067F0000400200008A590000CD8000-000000067F0000400200008A590000CDC000__0000001C760FA190 000000067F0000400200008A590000CD8000-000000067F0000400200008A590000CDC000__00000038E67ABFA0 000000067F0000400200008A590000CD8000-000000067F0000400200008A590000CDC000__0000003903F1CFE8 000000067F0000400200008A590000CD8000-000000067F0000400200008A590000CDC000__0000003B99F7F8A0 000000067F0000400200008A590000CD8000-000000067F0000400200008A590000CDC000__0000005D2FFFFB38 000000067F0000400200008A590000CD8000-000000067F0000400200008A590000CDC000__00000073AD3FE6B8 000000067F0000400200008A590000CD8000-000000067F0000400200008A590000CDC000__000000914E3F38F0 000000067F0000400200008A590000CD8000-000000067F0000400200008A590000CDC000__000000931B9A2710 000000067F0000400200008A590000CD8F85-000000067F0000400200008A590000CE195A__000000178583EBE1-000000182533E779 000000067F0000400200008A590000CDC000-000000067F0000400200008A590000CE0000__000000184D31F520 000000067F0000400200008A590000CDC000-000000067F0000400200008A590000CE0000__0000001C760FA190 000000067F0000400200008A590000CDC000-000000067F0000400200008A590000CE0000__00000038E67ABFA0 000000067F0000400200008A590000CDC000-000000067F0000400200008A590000CE0000__0000003903F1CFE8 000000067F0000400200008A590000CDC000-000000067F0000400200008A590000CE0000__0000003B99F7F8A0 000000067F0000400200008A590000CDC000-000000067F0000400200008A590000CE0000__0000005D2FFFFB38 000000067F0000400200008A590000CDC000-000000067F0000400200008A590000CE0000__00000073AD3FE6B8 000000067F0000400200008A590000CDC000-000000067F0000400200008A590000CE0000__000000914E3F38F0 
000000067F0000400200008A590000CDC000-000000067F0000400200008A590000CE0000__000000931B9A2710 000000067F0000400200008A590000CE0000-000000067F0000400200008A590000CE4000__000000184D31F520 000000067F0000400200008A590000CE0000-000000067F0000400200008A590000CE4000__0000001C760FA190 000000067F0000400200008A590000CE0000-000000067F0000400200008A590000CE4000__00000038E67ABFA0 000000067F0000400200008A590000CE0000-000000067F0000400200008A590000CE4000__0000003903F1CFE8 000000067F0000400200008A590000CE0000-000000067F0000400200008A590000CE4000__0000003B99F7F8A0 000000067F0000400200008A590000CE0000-000000067F0000400200008A590000CE4000__0000005D2FFFFB38 000000067F0000400200008A590000CE0000-000000067F0000400200008A590000CE4000__00000073AD3FE6B8 000000067F0000400200008A590000CE0000-000000067F0000400200008A590000CE4000__000000914E3F38F0 000000067F0000400200008A590000CE0000-000000067F0000400200008A590000CE4000__000000931B9A2710 000000067F0000400200008A590000CE195A-000000067F0000400200008A590000CEA33F__000000178583EBE1-000000182533E779 000000067F0000400200008A590000CE4000-000000067F0000400200008A590000CE8000__000000184D31F520 000000067F0000400200008A590000CE4000-000000067F0000400200008A590000CE8000__0000001C760FA190 000000067F0000400200008A590000CE4000-000000067F0000400200008A590000CE8000__00000038E67ABFA0 000000067F0000400200008A590000CE4000-000000067F0000400200008A590000CE8000__0000003903F1CFE8 000000067F0000400200008A590000CE4000-000000067F0000400200008A590000CE8000__0000003B99F7F8A0 000000067F0000400200008A590000CE4000-000000067F0000400200008A590000CE8000__0000005D2FFFFB38 000000067F0000400200008A590000CE4000-000000067F0000400200008A590000CE8000__00000073AD3FE6B8 000000067F0000400200008A590000CE4000-000000067F0000400200008A590000CE8000__000000914E3F38F0 000000067F0000400200008A590000CE4000-000000067F0000400200008A590000CE8000__000000931B9A2710 000000067F0000400200008A590000CE8000-000000067F0000400200008A590000CEC000__000000184D31F520 
000000067F0000400200008A590000CE8000-000000067F0000400200008A590000CEC000__0000001C760FA190 000000067F0000400200008A590000CE8000-000000067F0000400200008A590000CEC000__00000038E67ABFA0 000000067F0000400200008A590000CE8000-000000067F0000400200008A590000CEC000__0000003903F1CFE8 000000067F0000400200008A590000CE8000-000000067F0000400200008A590000CEC000__0000003B99F7F8A0 000000067F0000400200008A590000CE8000-000000067F0000400200008A590000CEC000__0000005D2FFFFB38 000000067F0000400200008A590000CE8000-000000067F0000400200008A590000CEC000__00000073AD3FE6B8 000000067F0000400200008A590000CE8000-000000067F0000400200008A590000CEC000__000000914E3F38F0 000000067F0000400200008A590000CE8000-000000067F0000400200008A590000CEC000__000000931B9A2710 000000067F0000400200008A590000CEA33F-000000067F0000400200008A590000CF2D12__000000178583EBE1-000000182533E779 000000067F0000400200008A590000CEC000-000000067F0000400200008A590000CF0000__000000184D31F520 000000067F0000400200008A590000CEC000-000000067F0000400200008A590000CF0000__0000001C760FA190 000000067F0000400200008A590000CEC000-000000067F0000400200008A590000CF0000__00000038E67ABFA0 000000067F0000400200008A590000CEC000-000000067F0000400200008A590000CF0000__0000003903F1CFE8 000000067F0000400200008A590000CEC000-000000067F0000400200008A590000CF0000__0000003B99F7F8A0 000000067F0000400200008A590000CEC000-000000067F0000400200008A590000CF0000__0000005D2FFFFB38 000000067F0000400200008A590000CEC000-000000067F0000400200008A590000CF0000__00000073AD3FE6B8 000000067F0000400200008A590000CEC000-000000067F0000400200008A590000CF0000__000000914E3F38F0 000000067F0000400200008A590000CEC000-000000067F0000400200008A590000CF0000__000000931B9A2710 000000067F0000400200008A590000CF0000-000000067F0000400200008A590000CF4000__000000184D31F520 000000067F0000400200008A590000CF0000-000000067F0000400200008A590000CF4000__0000001C760FA190 000000067F0000400200008A590000CF0000-000000067F0000400200008A590000CF4000__00000038E67ABFA0 
000000067F0000400200008A590000CF0000-000000067F0000400200008A590000CF4000__0000003903F1CFE8 000000067F0000400200008A590000CF0000-000000067F0000400200008A590000CF4000__0000003B99F7F8A0 000000067F0000400200008A590000CF0000-000000067F0000400200008A590000CF4000__0000005D2FFFFB38 000000067F0000400200008A590000CF0000-000000067F0000400200008A590000CF4000__00000073AD3FE6B8 000000067F0000400200008A590000CF0000-000000067F0000400200008A590000CF4000__000000914E3F38F0 000000067F0000400200008A590000CF0000-000000067F0000400200008A590000CF4000__000000931B9A2710 000000067F0000400200008A590000CF2D12-000000067F0000400200008A590000CFB6EA__000000178583EBE1-000000182533E779 000000067F0000400200008A590000CF4000-000000067F0000400200008A590000CF8000__000000184D31F520 000000067F0000400200008A590000CF4000-000000067F0000400200008A590000CF8000__0000001C760FA190 000000067F0000400200008A590000CF4000-000000067F0000400200008A590000CF8000__00000038E67ABFA0 000000067F0000400200008A590000CF4000-000000067F0000400200008A590000CF8000__0000003903F1CFE8 000000067F0000400200008A590000CF4000-000000067F0000400200008A590000CF8000__0000003B99F7F8A0 000000067F0000400200008A590000CF4000-000000067F0000400200008A590000CF8000__0000005D2FFFFB38 000000067F0000400200008A590000CF4000-000000067F0000400200008A590000CF8000__00000073AD3FE6B8 000000067F0000400200008A590000CF4000-000000067F0000400200008A590000CF8000__000000914E3F38F0 000000067F0000400200008A590000CF4000-000000067F0000400200008A590000CF8000__000000931B9A2710 000000067F0000400200008A590000CF8000-000000067F0000400200008A590000CFC000__000000184D31F520 000000067F0000400200008A590000CF8000-000000067F0000400200008A590000CFC000__0000001C760FA190 000000067F0000400200008A590000CF8000-000000067F0000400200008A590000CFC000__00000038E67ABFA0 000000067F0000400200008A590000CF8000-000000067F0000400200008A590000CFC000__0000003903F1CFE8 000000067F0000400200008A590000CF8000-000000067F0000400200008A590000CFC000__0000003B99F7F8A0 
000000067F0000400200008A590000CF8000-000000067F0000400200008A590000CFC000__0000005D2FFFFB38 000000067F0000400200008A590000CF8000-000000067F0000400200008A590000CFC000__00000073AD3FE6B8 000000067F0000400200008A590000CF8000-000000067F0000400200008A590000CFC000__000000914E3F38F0 000000067F0000400200008A590000CF8000-000000067F0000400200008A590000CFC000__000000931B9A2710 000000067F0000400200008A590000CFB6EA-000000067F0000400200008A590000D040CD__000000178583EBE1-000000182533E779 000000067F0000400200008A590000CFC000-000000067F0000400200008A590000D00000__000000184D31F520 000000067F0000400200008A590000CFC000-000000067F0000400200008A590000D00000__0000001C760FA190 000000067F0000400200008A590000CFC000-000000067F0000400200008A590000D00000__00000038E67ABFA0 000000067F0000400200008A590000CFC000-000000067F0000400200008A590000D00000__0000003903F1CFE8 000000067F0000400200008A590000CFC000-000000067F0000400200008A590000D00000__0000003B99F7F8A0 000000067F0000400200008A590000CFC000-000000067F0000400200008A590000D00000__0000005D2FFFFB38 000000067F0000400200008A590000CFC000-000000067F0000400200008A590000D00000__00000073AD3FE6B8 000000067F0000400200008A590000CFC000-000000067F0000400200008A590000D00000__000000914E3F38F0 000000067F0000400200008A590000CFC000-000000067F0000400200008A590000D00000__000000931B9A2710 000000067F0000400200008A590000D00000-000000067F0000400200008A590000D04000__000000184D31F520 000000067F0000400200008A590000D00000-000000067F0000400200008A590000D04000__0000001C760FA190 000000067F0000400200008A590000D00000-000000067F0000400200008A590000D04000__00000038E67ABFA0 000000067F0000400200008A590000D00000-000000067F0000400200008A590000D04000__0000003903F1CFE8 000000067F0000400200008A590000D00000-000000067F0000400200008A590000D04000__0000003B99F7F8A0 000000067F0000400200008A590000D00000-000000067F0000400200008A590000D04000__0000005D2FFFFB38 000000067F0000400200008A590000D00000-000000067F0000400200008A590000D04000__00000073AD3FE6B8 
000000067F0000400200008A590000D00000-000000067F0000400200008A590000D04000__000000914E3F38F0 000000067F0000400200008A590000D00000-000000067F0000400200008A590000D04000__000000931B9A2710 000000067F0000400200008A590000D04000-000000067F0000400200008A590000D08000__000000184D31F520 000000067F0000400200008A590000D04000-000000067F0000400200008A590000D08000__0000001C760FA190 000000067F0000400200008A590000D04000-000000067F0000400200008A590000D08000__00000038E67ABFA0 000000067F0000400200008A590000D04000-000000067F0000400200008A590000D08000__0000003903F1CFE8 000000067F0000400200008A590000D04000-000000067F0000400200008A590000D08000__0000003B99F7F8A0 000000067F0000400200008A590000D04000-000000067F0000400200008A590000D08000__0000005D2FFFFB38 000000067F0000400200008A590000D04000-000000067F0000400200008A590000D08000__00000073AD3FE6B8 000000067F0000400200008A590000D04000-000000067F0000400200008A590000D08000__000000914E3F38F0 000000067F0000400200008A590000D04000-000000067F0000400200008A590000D08000__000000931B9A2710 000000067F0000400200008A590000D040CD-000000067F0000400200008A590000D0CAAD__000000178583EBE1-000000182533E779 000000067F0000400200008A590000D08000-000000067F0000400200008A590000D0C000__000000184D31F520 000000067F0000400200008A590000D08000-000000067F0000400200008A590000D0C000__0000001C760FA190 000000067F0000400200008A590000D08000-000000067F0000400200008A590000D0C000__00000038E67ABFA0 000000067F0000400200008A590000D08000-000000067F0000400200008A590000D0C000__0000003903F1CFE8 000000067F0000400200008A590000D08000-000000067F0000400200008A590000D0C000__0000003B99F7F8A0 000000067F0000400200008A590000D08000-000000067F0000400200008A590000D0C000__0000005D2FFFFB38 000000067F0000400200008A590000D08000-000000067F0000400200008A590000D0C000__00000073AD3FE6B8 000000067F0000400200008A590000D08000-000000067F0000400200008A590000D0C000__000000914E3F38F0 000000067F0000400200008A590000D08000-000000067F0000400200008A590000D0C000__000000931B9A2710 
000000067F0000400200008A590000D0C000-000000067F0000400200008A590000D10000__000000184D31F520 000000067F0000400200008A590000D0C000-000000067F0000400200008A590000D10000__0000001C760FA190 000000067F0000400200008A590000D0C000-000000067F0000400200008A590000D10000__00000038E67ABFA0 000000067F0000400200008A590000D0C000-000000067F0000400200008A590000D10000__0000003903F1CFE8 000000067F0000400200008A590000D0C000-000000067F0000400200008A590000D10000__0000003B99F7F8A0 000000067F0000400200008A590000D0C000-000000067F0000400200008A590000D10000__0000005D2FFFFB38 000000067F0000400200008A590000D0C000-000000067F0000400200008A590000D10000__00000073AD3FE6B8 000000067F0000400200008A590000D0C000-000000067F0000400200008A590000D10000__000000914E3F38F0 000000067F0000400200008A590000D0C000-000000067F0000400200008A590000D10000__000000931B9A2710 000000067F0000400200008A590000D0CAAD-000000067F0000400200008A590100000000__000000178583EBE1-000000182533E779 000000067F0000400200008A590000D0CD6E-000000067F0000400200008A590000D1574D__000000182533E779-00000018C4E3E6C1 000000067F0000400200008A590000D10000-000000067F0000400200008A590000D14000__000000184D31F520 000000067F0000400200008A590000D10000-000000067F0000400200008A590000D14000__0000001C760FA190 000000067F0000400200008A590000D10000-000000067F0000400200008A590000D14000__00000038E67ABFA0 000000067F0000400200008A590000D10000-000000067F0000400200008A590000D14000__0000003903F1CFE8 000000067F0000400200008A590000D10000-000000067F0000400200008A590000D14000__0000003B99F7F8A0 000000067F0000400200008A590000D10000-000000067F0000400200008A590000D14000__0000005D2FFFFB38 000000067F0000400200008A590000D10000-000000067F0000400200008A590000D14000__00000073AD3FE6B8 000000067F0000400200008A590000D10000-000000067F0000400200008A590000D14000__000000914E3F38F0 000000067F0000400200008A590000D10000-000000067F0000400200008A590000D14000__000000931B9A2710 000000067F0000400200008A590000D14000-000000067F0000400200008A590000D18000__000000184D31F520 
000000067F0000400200008A590000D14000-000000067F0000400200008A590000D18000__0000001C760FA190 000000067F0000400200008A590000D14000-000000067F0000400200008A590000D18000__00000038E67ABFA0 000000067F0000400200008A590000D14000-000000067F0000400200008A590000D18000__0000003903F1CFE8 000000067F0000400200008A590000D14000-000000067F0000400200008A590000D18000__0000003B99F7F8A0 000000067F0000400200008A590000D14000-000000067F0000400200008A590000D18000__0000005D2FFFFB38 000000067F0000400200008A590000D14000-000000067F0000400200008A590000D18000__00000073AD3FE6B8 000000067F0000400200008A590000D14000-000000067F0000400200008A590000D18000__000000914E3F38F0 000000067F0000400200008A590000D14000-000000067F0000400200008A590000D18000__000000931B9A2710 000000067F0000400200008A590000D1574D-000000067F0000400200008A590000D1E120__000000182533E779-00000018C4E3E6C1 000000067F0000400200008A590000D18000-000000067F0000400200008A590000D1C000__000000184D31F520 000000067F0000400200008A590000D18000-000000067F0000400200008A590000D1C000__0000001C760FA190 000000067F0000400200008A590000D18000-000000067F0000400200008A590000D1C000__00000038E67ABFA0 000000067F0000400200008A590000D18000-000000067F0000400200008A590000D1C000__0000003903F1CFE8 000000067F0000400200008A590000D18000-000000067F0000400200008A590000D1C000__0000003B99F7F8A0 000000067F0000400200008A590000D18000-000000067F0000400200008A590000D1C000__0000005D2FFFFB38 000000067F0000400200008A590000D18000-000000067F0000400200008A590000D1C000__00000073AD3FE6B8 000000067F0000400200008A590000D18000-000000067F0000400200008A590000D1C000__000000914E3F38F0 000000067F0000400200008A590000D18000-000000067F0000400200008A590000D1C000__000000931B9A2710 000000067F0000400200008A590000D1C000-000000067F0000400200008A590000D20000__000000184D31F520 000000067F0000400200008A590000D1C000-000000067F0000400200008A590000D20000__0000001C760FA190 000000067F0000400200008A590000D1C000-000000067F0000400200008A590000D20000__00000038E67ABFA0 
000000067F0000400200008A590000D1C000-000000067F0000400200008A590000D20000__0000003903F1CFE8 000000067F0000400200008A590000D1C000-000000067F0000400200008A590000D20000__0000003B99F7F8A0 000000067F0000400200008A590000D1C000-000000067F0000400200008A590000D20000__0000005D2FFFFB38 000000067F0000400200008A590000D1C000-000000067F0000400200008A590000D20000__00000073AD3FE6B8 000000067F0000400200008A590000D1C000-000000067F0000400200008A590000D20000__000000914E3F38F0 000000067F0000400200008A590000D1C000-000000067F0000400200008A590000D20000__000000931B9A2710 000000067F0000400200008A590000D1E120-000000067F0000400200008A590000D26AF1__000000182533E779-00000018C4E3E6C1 000000067F0000400200008A590000D20000-000000067F0000400200008A590000D24000__0000001C760FA190 000000067F0000400200008A590000D20000-000000067F0000400200008A590000D24000__00000038E67ABFA0 000000067F0000400200008A590000D20000-000000067F0000400200008A590000D24000__0000003903F1CFE8 000000067F0000400200008A590000D20000-000000067F0000400200008A590000D24000__0000003B99F7F8A0 000000067F0000400200008A590000D20000-000000067F0000400200008A590000D24000__0000005D2FFFFB38 000000067F0000400200008A590000D20000-000000067F0000400200008A590000D24000__00000073AD3FE6B8 000000067F0000400200008A590000D20000-000000067F0000400200008A590000D24000__000000914E3F38F0 000000067F0000400200008A590000D20000-000000067F0000400200008A590000D24000__000000931B9A2710 000000067F0000400200008A590000D20000-030000000000000000000000000000000002__000000184D31F520 000000067F0000400200008A590000D24000-000000067F0000400200008A590000D28000__0000001C760FA190 000000067F0000400200008A590000D24000-000000067F0000400200008A590000D28000__00000038E67ABFA0 000000067F0000400200008A590000D24000-000000067F0000400200008A590000D28000__0000003903F1CFE8 000000067F0000400200008A590000D24000-000000067F0000400200008A590000D28000__0000003B99F7F8A0 000000067F0000400200008A590000D24000-000000067F0000400200008A590000D28000__0000005D2FFFFB38 
000000067F0000400200008A590000D24000-000000067F0000400200008A590000D28000__00000073AD3FE6B8 000000067F0000400200008A590000D24000-000000067F0000400200008A590000D28000__000000914E3F38F0 000000067F0000400200008A590000D24000-000000067F0000400200008A590000D28000__000000931B9A2710 000000067F0000400200008A590000D26AF1-000000067F0000400200008A590000D2F4D7__000000182533E779-00000018C4E3E6C1 000000067F0000400200008A590000D28000-000000067F0000400200008A590000D2C000__0000001C760FA190 000000067F0000400200008A590000D28000-000000067F0000400200008A590000D2C000__00000038E67ABFA0 000000067F0000400200008A590000D28000-000000067F0000400200008A590000D2C000__0000003903F1CFE8 000000067F0000400200008A590000D28000-000000067F0000400200008A590000D2C000__0000003B99F7F8A0 000000067F0000400200008A590000D28000-000000067F0000400200008A590000D2C000__0000005D2FFFFB38 000000067F0000400200008A590000D28000-000000067F0000400200008A590000D2C000__00000073AD3FE6B8 000000067F0000400200008A590000D28000-000000067F0000400200008A590000D2C000__000000914E3F38F0 000000067F0000400200008A590000D28000-000000067F0000400200008A590000D2C000__000000931B9A2710 000000067F0000400200008A590000D2C000-000000067F0000400200008A590000D30000__0000001C760FA190 000000067F0000400200008A590000D2C000-000000067F0000400200008A590000D30000__00000038E67ABFA0 000000067F0000400200008A590000D2C000-000000067F0000400200008A590000D30000__0000003903F1CFE8 000000067F0000400200008A590000D2C000-000000067F0000400200008A590000D30000__0000003B99F7F8A0 000000067F0000400200008A590000D2C000-000000067F0000400200008A590000D30000__0000005D2FFFFB38 000000067F0000400200008A590000D2C000-000000067F0000400200008A590000D30000__00000073AD3FE6B8 000000067F0000400200008A590000D2C000-000000067F0000400200008A590000D30000__000000914E3F38F0 000000067F0000400200008A590000D2C000-000000067F0000400200008A590000D30000__000000931B9A2710 000000067F0000400200008A590000D2F4D7-000000067F0000400200008A590000D37EB1__000000182533E779-00000018C4E3E6C1 
000000067F0000400200008A590000D30000-000000067F0000400200008A590000D34000__0000001C760FA190 000000067F0000400200008A590000D30000-000000067F0000400200008A590000D34000__00000038E67ABFA0 000000067F0000400200008A590000D30000-000000067F0000400200008A590000D34000__0000003903F1CFE8 000000067F0000400200008A590000D30000-000000067F0000400200008A590000D34000__0000003B99F7F8A0 000000067F0000400200008A590000D30000-000000067F0000400200008A590000D34000__0000005D2FFFFB38 000000067F0000400200008A590000D30000-000000067F0000400200008A590000D34000__00000073AD3FE6B8 000000067F0000400200008A590000D30000-000000067F0000400200008A590000D34000__000000914E3F38F0 000000067F0000400200008A590000D30000-000000067F0000400200008A590000D34000__000000931B9A2710 000000067F0000400200008A590000D34000-000000067F0000400200008A590000D38000__0000001C760FA190 000000067F0000400200008A590000D34000-000000067F0000400200008A590000D38000__00000038E67ABFA0 000000067F0000400200008A590000D34000-000000067F0000400200008A590000D38000__0000003903F1CFE8 000000067F0000400200008A590000D34000-000000067F0000400200008A590000D38000__0000003B99F7F8A0 000000067F0000400200008A590000D34000-000000067F0000400200008A590000D38000__0000005D2FFFFB38 000000067F0000400200008A590000D34000-000000067F0000400200008A590000D38000__00000073AD3FE6B8 000000067F0000400200008A590000D34000-000000067F0000400200008A590000D38000__000000914E3F38F0 000000067F0000400200008A590000D34000-000000067F0000400200008A590000D38000__000000931B9A2710 000000067F0000400200008A590000D37EB1-000000067F0000400200008A590000D40891__000000182533E779-00000018C4E3E6C1 000000067F0000400200008A590000D38000-000000067F0000400200008A590000D3C000__0000001C760FA190 000000067F0000400200008A590000D38000-000000067F0000400200008A590000D3C000__00000038E67ABFA0 000000067F0000400200008A590000D38000-000000067F0000400200008A590000D3C000__0000003903F1CFE8 000000067F0000400200008A590000D38000-000000067F0000400200008A590000D3C000__0000003B99F7F8A0 
000000067F0000400200008A590000D38000-000000067F0000400200008A590000D3C000__0000005D2FFFFB38 000000067F0000400200008A590000D38000-000000067F0000400200008A590000D3C000__00000073AD3FE6B8 000000067F0000400200008A590000D38000-000000067F0000400200008A590000D3C000__000000914E3F38F0 000000067F0000400200008A590000D38000-000000067F0000400200008A590000D3C000__000000931B9A2710 000000067F0000400200008A590000D3C000-000000067F0000400200008A590000D40000__0000001C760FA190 000000067F0000400200008A590000D3C000-000000067F0000400200008A590000D40000__00000038E67ABFA0 000000067F0000400200008A590000D3C000-000000067F0000400200008A590000D40000__0000003903F1CFE8 000000067F0000400200008A590000D3C000-000000067F0000400200008A590000D40000__0000003B99F7F8A0 000000067F0000400200008A590000D3C000-000000067F0000400200008A590000D40000__0000005D2FFFFB38 000000067F0000400200008A590000D3C000-000000067F0000400200008A590000D40000__00000073AD3FE6B8 000000067F0000400200008A590000D3C000-000000067F0000400200008A590000D40000__000000914E3F38F0 000000067F0000400200008A590000D3C000-000000067F0000400200008A590000D40000__000000931B9A2710 000000067F0000400200008A590000D40000-000000067F0000400200008A590000D44000__0000001C760FA190 000000067F0000400200008A590000D40000-000000067F0000400200008A590000D44000__00000038E67ABFA0 000000067F0000400200008A590000D40000-000000067F0000400200008A590000D44000__0000003903F1CFE8 000000067F0000400200008A590000D40000-000000067F0000400200008A590000D44000__0000003B99F7F8A0 000000067F0000400200008A590000D40000-000000067F0000400200008A590000D44000__0000005D2FFFFB38 000000067F0000400200008A590000D40000-000000067F0000400200008A590000D44000__00000073AD3FE6B8 000000067F0000400200008A590000D40000-000000067F0000400200008A590000D44000__000000914E3F38F0 000000067F0000400200008A590000D40000-000000067F0000400200008A590000D44000__000000931B9A2710 000000067F0000400200008A590000D40891-000000067F0000400200008A590000D4926B__000000182533E779-00000018C4E3E6C1 
000000067F0000400200008A590000D44000-000000067F0000400200008A590000D48000__0000001C760FA190 000000067F0000400200008A590000D44000-000000067F0000400200008A590000D48000__00000038E67ABFA0 000000067F0000400200008A590000D44000-000000067F0000400200008A590000D48000__0000003903F1CFE8 000000067F0000400200008A590000D44000-000000067F0000400200008A590000D48000__0000003B99F7F8A0 000000067F0000400200008A590000D44000-000000067F0000400200008A590000D48000__0000005D2FFFFB38 000000067F0000400200008A590000D44000-000000067F0000400200008A590000D48000__00000073AD3FE6B8 000000067F0000400200008A590000D44000-000000067F0000400200008A590000D48000__000000914E3F38F0 000000067F0000400200008A590000D44000-000000067F0000400200008A590000D48000__000000931B9A2710 000000067F0000400200008A590000D48000-000000067F0000400200008A590000D4C000__0000001C760FA190 000000067F0000400200008A590000D48000-000000067F0000400200008A590000D4C000__00000038E67ABFA0 000000067F0000400200008A590000D48000-000000067F0000400200008A590000D4C000__0000003903F1CFE8 000000067F0000400200008A590000D48000-000000067F0000400200008A590000D4C000__0000003B99F7F8A0 000000067F0000400200008A590000D48000-000000067F0000400200008A590000D4C000__0000005D2FFFFB38 000000067F0000400200008A590000D48000-000000067F0000400200008A590000D4C000__00000073AD3FE6B8 000000067F0000400200008A590000D48000-000000067F0000400200008A590000D4C000__000000914E3F38F0 000000067F0000400200008A590000D48000-000000067F0000400200008A590000D4C000__000000931B9A2710 000000067F0000400200008A590000D4926B-000000067F0000400200008A590000D51C42__000000182533E779-00000018C4E3E6C1 000000067F0000400200008A590000D4C000-000000067F0000400200008A590000D50000__0000001C760FA190 000000067F0000400200008A590000D4C000-000000067F0000400200008A590000D50000__00000038E67ABFA0 000000067F0000400200008A590000D4C000-000000067F0000400200008A590000D50000__0000003903F1CFE8 000000067F0000400200008A590000D4C000-000000067F0000400200008A590000D50000__0000003B99F7F8A0 
000000067F0000400200008A590000D4C000-000000067F0000400200008A590000D50000__0000005D2FFFFB38 000000067F0000400200008A590000D4C000-000000067F0000400200008A590000D50000__00000073AD3FE6B8 000000067F0000400200008A590000D4C000-000000067F0000400200008A590000D50000__000000914E3F38F0 000000067F0000400200008A590000D4C000-000000067F0000400200008A590000D50000__000000931B9A2710 000000067F0000400200008A590000D50000-000000067F0000400200008A590000D54000__0000001C760FA190 000000067F0000400200008A590000D50000-000000067F0000400200008A590000D54000__00000038E67ABFA0 000000067F0000400200008A590000D50000-000000067F0000400200008A590000D54000__0000003903F1CFE8 000000067F0000400200008A590000D50000-000000067F0000400200008A590000D54000__0000003B99F7F8A0 000000067F0000400200008A590000D50000-000000067F0000400200008A590000D54000__0000005D2FFFFB38 000000067F0000400200008A590000D50000-000000067F0000400200008A590000D54000__00000073AD3FE6B8 000000067F0000400200008A590000D50000-000000067F0000400200008A590000D54000__000000914E3F38F0 000000067F0000400200008A590000D50000-000000067F0000400200008A590000D54000__000000931B9A2710 000000067F0000400200008A590000D51C42-000000067F0000400200008A590000D5A61A__000000182533E779-00000018C4E3E6C1 000000067F0000400200008A590000D54000-000000067F0000400200008A590000D58000__0000001C760FA190 000000067F0000400200008A590000D54000-000000067F0000400200008A590000D58000__00000038E67ABFA0 000000067F0000400200008A590000D54000-000000067F0000400200008A590000D58000__0000003903F1CFE8 000000067F0000400200008A590000D54000-000000067F0000400200008A590000D58000__0000003B99F7F8A0 000000067F0000400200008A590000D54000-000000067F0000400200008A590000D58000__0000005D2FFFFB38 000000067F0000400200008A590000D54000-000000067F0000400200008A590000D58000__00000073AD3FE6B8 000000067F0000400200008A590000D54000-000000067F0000400200008A590000D58000__000000914E3F38F0 000000067F0000400200008A590000D54000-000000067F0000400200008A590000D58000__000000931B9A2710 
000000067F0000400200008A590000D58000-000000067F0000400200008A590000D5C000__0000001C760FA190 000000067F0000400200008A590000D58000-000000067F0000400200008A590000D5C000__00000038E67ABFA0 000000067F0000400200008A590000D58000-000000067F0000400200008A590000D5C000__0000003903F1CFE8 000000067F0000400200008A590000D58000-000000067F0000400200008A590000D5C000__0000003B99F7F8A0 000000067F0000400200008A590000D58000-000000067F0000400200008A590000D5C000__0000005D2FFFFB38 000000067F0000400200008A590000D58000-000000067F0000400200008A590000D5C000__00000073AD3FE6B8 000000067F0000400200008A590000D58000-000000067F0000400200008A590000D5C000__000000914E3F38F0 000000067F0000400200008A590000D58000-000000067F0000400200008A590000D5C000__000000931B9A2710 000000067F0000400200008A590000D5A61A-000000067F0000400200008A590000D62FED__000000182533E779-00000018C4E3E6C1 000000067F0000400200008A590000D5C000-000000067F0000400200008A590000D60000__0000001C760FA190 000000067F0000400200008A590000D5C000-000000067F0000400200008A590000D60000__00000038E67ABFA0 000000067F0000400200008A590000D5C000-000000067F0000400200008A590000D60000__0000003903F1CFE8 000000067F0000400200008A590000D5C000-000000067F0000400200008A590000D60000__0000003B99F7F8A0 000000067F0000400200008A590000D5C000-000000067F0000400200008A590000D60000__0000005D2FFFFB38 000000067F0000400200008A590000D5C000-000000067F0000400200008A590000D60000__00000073AD3FE6B8 000000067F0000400200008A590000D5C000-000000067F0000400200008A590000D60000__000000914E3F38F0 000000067F0000400200008A590000D5C000-000000067F0000400200008A590000D60000__000000931B9A2710 000000067F0000400200008A590000D60000-000000067F0000400200008A590000D64000__0000001C725A2400 000000067F0000400200008A590000D60000-000000067F0000400200008A590000D64000__0000001C760FA190 000000067F0000400200008A590000D60000-000000067F0000400200008A590000D64000__00000038E67ABFA0 000000067F0000400200008A590000D60000-000000067F0000400200008A590000D64000__0000003903F1CFE8 
000000067F0000400200008A590000D60000-000000067F0000400200008A590000D64000__0000003B99F7F8A0 000000067F0000400200008A590000D60000-000000067F0000400200008A590000D64000__0000005D2FFFFB38 000000067F0000400200008A590000D60000-000000067F0000400200008A590000D64000__00000073AD3FE6B8 000000067F0000400200008A590000D60000-000000067F0000400200008A590000D64000__000000914E3F38F0 000000067F0000400200008A590000D60000-000000067F0000400200008A590000D64000__000000931B9A2710 000000067F0000400200008A590000D62FED-000000067F0000400200008A590100000000__000000182533E779-00000018C4E3E6C1 000000067F0000400200008A590000D632CE-000000067F0000400200008A590000D6BCB1__00000018C4E3E6C1-000000196493E2E1 000000067F0000400200008A590000D64000-000000067F0000400200008A590000D68000__0000001C725A2400 000000067F0000400200008A590000D64000-000000067F0000400200008A590000D68000__0000001C760FA190 000000067F0000400200008A590000D64000-000000067F0000400200008A590000D68000__00000038E67ABFA0 000000067F0000400200008A590000D64000-000000067F0000400200008A590000D68000__0000003903F1CFE8 000000067F0000400200008A590000D64000-000000067F0000400200008A590000D68000__0000003B99F7F8A0 000000067F0000400200008A590000D64000-000000067F0000400200008A590000D68000__0000005D2FFFFB38 000000067F0000400200008A590000D64000-000000067F0000400200008A590000D68000__00000073AD3FE6B8 000000067F0000400200008A590000D64000-000000067F0000400200008A590000D68000__000000914E3F38F0 000000067F0000400200008A590000D64000-000000067F0000400200008A590000D68000__000000931B9A2710 000000067F0000400200008A590000D68000-000000067F0000400200008A590000D6C000__0000001C725A2400 000000067F0000400200008A590000D68000-000000067F0000400200008A590000D6C000__0000001C760FA190 000000067F0000400200008A590000D68000-000000067F0000400200008A590000D6C000__00000038E67ABFA0 000000067F0000400200008A590000D68000-000000067F0000400200008A590000D6C000__0000003903F1CFE8 000000067F0000400200008A590000D68000-000000067F0000400200008A590000D6C000__0000003B99F7F8A0 
000000067F0000400200008A590000D68000-000000067F0000400200008A590000D6C000__0000005D2FFFFB38 000000067F0000400200008A590000D68000-000000067F0000400200008A590000D6C000__00000073AD3FE6B8 000000067F0000400200008A590000D68000-000000067F0000400200008A590000D6C000__000000914E3F38F0 000000067F0000400200008A590000D68000-000000067F0000400200008A590000D6C000__000000931B9A2710 000000067F0000400200008A590000D6BCB1-000000067F0000400200008A590000D746AB__00000018C4E3E6C1-000000196493E2E1 000000067F0000400200008A590000D6C000-000000067F0000400200008A590000D70000__0000001C725A2400 000000067F0000400200008A590000D6C000-000000067F0000400200008A590000D70000__0000001C760FA190 000000067F0000400200008A590000D6C000-000000067F0000400200008A590000D70000__00000038E67ABFA0 000000067F0000400200008A590000D6C000-000000067F0000400200008A590000D70000__0000003903F1CFE8 000000067F0000400200008A590000D6C000-000000067F0000400200008A590000D70000__0000003B99F7F8A0 000000067F0000400200008A590000D6C000-000000067F0000400200008A590000D70000__0000005D2FFFFB38 000000067F0000400200008A590000D6C000-000000067F0000400200008A590000D70000__00000073AD3FE6B8 000000067F0000400200008A590000D6C000-000000067F0000400200008A590000D70000__000000914E3F38F0 000000067F0000400200008A590000D6C000-000000067F0000400200008A590000D70000__000000931B9A2710 000000067F0000400200008A590000D70000-000000067F0000400200008A590000D74000__0000001C725A2400 000000067F0000400200008A590000D70000-000000067F0000400200008A590000D74000__0000001C760FA190 000000067F0000400200008A590000D70000-000000067F0000400200008A590000D74000__00000038E67ABFA0 000000067F0000400200008A590000D70000-000000067F0000400200008A590000D74000__0000003903F1CFE8 000000067F0000400200008A590000D70000-000000067F0000400200008A590000D74000__0000003B99F7F8A0 000000067F0000400200008A590000D70000-000000067F0000400200008A590000D74000__0000005D2FFFFB38 000000067F0000400200008A590000D70000-000000067F0000400200008A590000D74000__00000073AD3FE6B8 
000000067F0000400200008A590000D70000-000000067F0000400200008A590000D74000__000000914E3F38F0 000000067F0000400200008A590000D70000-000000067F0000400200008A590000D74000__000000931B9A2710 000000067F0000400200008A590000D74000-000000067F0000400200008A590000D78000__0000001C725A2400 000000067F0000400200008A590000D74000-000000067F0000400200008A590000D78000__0000001C760FA190 000000067F0000400200008A590000D74000-000000067F0000400200008A590000D78000__00000038E67ABFA0 000000067F0000400200008A590000D74000-000000067F0000400200008A590000D78000__0000003903F1CFE8 000000067F0000400200008A590000D74000-000000067F0000400200008A590000D78000__0000003B99F7F8A0 000000067F0000400200008A590000D74000-000000067F0000400200008A590000D78000__0000005D2FFFFB38 000000067F0000400200008A590000D74000-000000067F0000400200008A590000D78000__00000073AD3FE6B8 000000067F0000400200008A590000D74000-000000067F0000400200008A590000D78000__000000914E3F38F0 000000067F0000400200008A590000D74000-000000067F0000400200008A590000D78000__000000931B9A2710 000000067F0000400200008A590000D746AB-000000067F0000400200008A590000D7D090__00000018C4E3E6C1-000000196493E2E1 000000067F0000400200008A590000D78000-000000067F0000400200008A590000D7C000__0000001C725A2400 000000067F0000400200008A590000D78000-000000067F0000400200008A590000D7C000__0000001C760FA190 000000067F0000400200008A590000D78000-000000067F0000400200008A590000D7C000__00000038E67ABFA0 000000067F0000400200008A590000D78000-000000067F0000400200008A590000D7C000__0000003903F1CFE8 000000067F0000400200008A590000D78000-000000067F0000400200008A590000D7C000__0000003B99F7F8A0 000000067F0000400200008A590000D78000-000000067F0000400200008A590000D7C000__0000005D2FFFFB38 000000067F0000400200008A590000D78000-000000067F0000400200008A590000D7C000__00000073AD3FE6B8 000000067F0000400200008A590000D78000-000000067F0000400200008A590000D7C000__000000914E3F38F0 000000067F0000400200008A590000D78000-000000067F0000400200008A590000D7C000__000000931B9A2710 
000000067F0000400200008A590000D7C000-000000067F0000400200008A590000D80000__0000001C725A2400 000000067F0000400200008A590000D7C000-000000067F0000400200008A590000D80000__0000001C760FA190 000000067F0000400200008A590000D7C000-000000067F0000400200008A590000D80000__00000038E67ABFA0 000000067F0000400200008A590000D7C000-000000067F0000400200008A590000D80000__0000003903F1CFE8 000000067F0000400200008A590000D7C000-000000067F0000400200008A590000D80000__0000003B99F7F8A0 000000067F0000400200008A590000D7C000-000000067F0000400200008A590000D80000__0000005D2FFFFB38 000000067F0000400200008A590000D7C000-000000067F0000400200008A590000D80000__00000073AD3FE6B8 000000067F0000400200008A590000D7C000-000000067F0000400200008A590000D80000__000000914E3F38F0 000000067F0000400200008A590000D7C000-000000067F0000400200008A590000D80000__000000931B9A2710 000000067F0000400200008A590000D7D090-000000067F0000400200008A590000D85A63__00000018C4E3E6C1-000000196493E2E1 000000067F0000400200008A590000D80000-000000067F0000400200008A590000D84000__0000001C725A2400 000000067F0000400200008A590000D80000-000000067F0000400200008A590000D84000__0000001C760FA190 000000067F0000400200008A590000D80000-000000067F0000400200008A590000D84000__00000038E67ABFA0 000000067F0000400200008A590000D80000-000000067F0000400200008A590000D84000__0000003903F1CFE8 000000067F0000400200008A590000D80000-000000067F0000400200008A590000D84000__0000003B99F7F8A0 000000067F0000400200008A590000D80000-000000067F0000400200008A590000D84000__0000005D2FFFFB38 000000067F0000400200008A590000D80000-000000067F0000400200008A590000D84000__00000073AD3FE6B8 000000067F0000400200008A590000D80000-000000067F0000400200008A590000D84000__000000914E3F38F0 000000067F0000400200008A590000D80000-000000067F0000400200008A590000D84000__000000931B9A2710 000000067F0000400200008A590000D84000-000000067F0000400200008A590000D88000__0000001C725A2400 000000067F0000400200008A590000D84000-000000067F0000400200008A590000D88000__0000001C760FA190 
000000067F0000400200008A590000D84000-000000067F0000400200008A590000D88000__00000038E67ABFA0 000000067F0000400200008A590000D84000-000000067F0000400200008A590000D88000__0000003903F1CFE8 000000067F0000400200008A590000D84000-000000067F0000400200008A590000D88000__0000003B99F7F8A0 000000067F0000400200008A590000D84000-000000067F0000400200008A590000D88000__0000005D2FFFFB38 000000067F0000400200008A590000D84000-000000067F0000400200008A590000D88000__00000073AD3FE6B8 000000067F0000400200008A590000D84000-000000067F0000400200008A590000D88000__000000914E3F38F0 000000067F0000400200008A590000D84000-000000067F0000400200008A590000D88000__000000931B9A2710 000000067F0000400200008A590000D85A63-000000067F0000400200008A590000D8E43F__00000018C4E3E6C1-000000196493E2E1 000000067F0000400200008A590000D88000-000000067F0000400200008A590000D8C000__0000001C725A2400 000000067F0000400200008A590000D88000-000000067F0000400200008A590000D8C000__0000001C760FA190 000000067F0000400200008A590000D88000-000000067F0000400200008A590000D8C000__00000038E67ABFA0 000000067F0000400200008A590000D88000-000000067F0000400200008A590000D8C000__0000003903F1CFE8 000000067F0000400200008A590000D88000-000000067F0000400200008A590000D8C000__0000003B99F7F8A0 000000067F0000400200008A590000D88000-000000067F0000400200008A590000D8C000__0000005D2FFFFB38 000000067F0000400200008A590000D88000-000000067F0000400200008A590000D8C000__00000073AD3FE6B8 000000067F0000400200008A590000D88000-000000067F0000400200008A590000D8C000__000000914E3F38F0 000000067F0000400200008A590000D88000-000000067F0000400200008A590000D8C000__000000931B9A2710 000000067F0000400200008A590000D8C000-000000067F0000400200008A590000D90000__0000001C725A2400 000000067F0000400200008A590000D8C000-000000067F0000400200008A590000D90000__0000001C760FA190 000000067F0000400200008A590000D8C000-000000067F0000400200008A590000D90000__00000038E67ABFA0 000000067F0000400200008A590000D8C000-000000067F0000400200008A590000D90000__0000003903F1CFE8 
000000067F0000400200008A590000D8C000-000000067F0000400200008A590000D90000__0000003B99F7F8A0 000000067F0000400200008A590000D8C000-000000067F0000400200008A590000D90000__0000005D2FFFFB38 000000067F0000400200008A590000D8C000-000000067F0000400200008A590000D90000__00000073AD3FE6B8 000000067F0000400200008A590000D8C000-000000067F0000400200008A590000D90000__000000914E3F38F0 000000067F0000400200008A590000D8C000-000000067F0000400200008A590000D90000__000000931B9A2710 000000067F0000400200008A590000D8E43F-000000067F0000400200008A590000D96E19__00000018C4E3E6C1-000000196493E2E1 000000067F0000400200008A590000D90000-000000067F0000400200008A590000D94000__0000001C725A2400 000000067F0000400200008A590000D90000-000000067F0000400200008A590000D94000__0000001C760FA190 000000067F0000400200008A590000D90000-000000067F0000400200008A590000D94000__00000038E67ABFA0 000000067F0000400200008A590000D90000-000000067F0000400200008A590000D94000__0000003903F1CFE8 000000067F0000400200008A590000D90000-000000067F0000400200008A590000D94000__0000003B99F7F8A0 000000067F0000400200008A590000D90000-000000067F0000400200008A590000D94000__0000005D2FFFFB38 000000067F0000400200008A590000D90000-000000067F0000400200008A590000D94000__00000073AD3FE6B8 000000067F0000400200008A590000D90000-000000067F0000400200008A590000D94000__000000914E3F38F0 000000067F0000400200008A590000D90000-000000067F0000400200008A590000D94000__000000931B9A2710 000000067F0000400200008A590000D94000-000000067F0000400200008A590000D98000__0000001C725A2400 000000067F0000400200008A590000D94000-000000067F0000400200008A590000D98000__0000001C760FA190 000000067F0000400200008A590000D94000-000000067F0000400200008A590000D98000__00000038E67ABFA0 000000067F0000400200008A590000D94000-000000067F0000400200008A590000D98000__0000003903F1CFE8 000000067F0000400200008A590000D94000-000000067F0000400200008A590000D98000__0000003B99F7F8A0 000000067F0000400200008A590000D94000-000000067F0000400200008A590000D98000__0000005D2FFFFB38 
000000067F0000400200008A590000D94000-000000067F0000400200008A590000D98000__00000073AD3FE6B8 000000067F0000400200008A590000D94000-000000067F0000400200008A590000D98000__000000914E3F38F0 000000067F0000400200008A590000D94000-000000067F0000400200008A590000D98000__000000931B9A2710 000000067F0000400200008A590000D96E19-000000067F0000400200008A590000D9F7E0__00000018C4E3E6C1-000000196493E2E1 000000067F0000400200008A590000D98000-000000067F0000400200008A590000D9C000__0000001C725A2400 000000067F0000400200008A590000D98000-000000067F0000400200008A590000D9C000__0000001C760FA190 000000067F0000400200008A590000D98000-000000067F0000400200008A590000D9C000__00000038E67ABFA0 000000067F0000400200008A590000D98000-000000067F0000400200008A590000D9C000__0000003903F1CFE8 000000067F0000400200008A590000D98000-000000067F0000400200008A590000D9C000__0000003B99F7F8A0 000000067F0000400200008A590000D98000-000000067F0000400200008A590000D9C000__0000005D2FFFFB38 000000067F0000400200008A590000D98000-000000067F0000400200008A590000D9C000__00000073AD3FE6B8 000000067F0000400200008A590000D98000-000000067F0000400200008A590000D9C000__000000914E3F38F0 000000067F0000400200008A590000D98000-000000067F0000400200008A590000D9C000__000000931B9A2710 000000067F0000400200008A590000D9C000-000000067F0000400200008A590000DA0000__0000001C725A2400 000000067F0000400200008A590000D9C000-000000067F0000400200008A590000DA0000__0000001C760FA190 000000067F0000400200008A590000D9C000-000000067F0000400200008A590000DA0000__00000038E67ABFA0 000000067F0000400200008A590000D9C000-000000067F0000400200008A590000DA0000__0000003903F1CFE8 000000067F0000400200008A590000D9C000-000000067F0000400200008A590000DA0000__0000003B99F7F8A0 000000067F0000400200008A590000D9C000-000000067F0000400200008A590000DA0000__0000005D2FFFFB38 000000067F0000400200008A590000D9C000-000000067F0000400200008A590000DA0000__00000073AD3FE6B8 000000067F0000400200008A590000D9C000-000000067F0000400200008A590000DA0000__000000914E3F38F0 
000000067F0000400200008A590000D9C000-000000067F0000400200008A590000DA0000__000000931B9A2710 000000067F0000400200008A590000D9F7E0-000000067F0000400200008A590000DA81C4__00000018C4E3E6C1-000000196493E2E1 000000067F0000400200008A590000DA0000-000000067F0000400200008A590000DA4000__0000001C725A2400 000000067F0000400200008A590000DA0000-000000067F0000400200008A590000DA4000__0000001C760FA190 000000067F0000400200008A590000DA0000-000000067F0000400200008A590000DA4000__00000038E67ABFA0 000000067F0000400200008A590000DA0000-000000067F0000400200008A590000DA4000__0000003903F1CFE8 000000067F0000400200008A590000DA0000-000000067F0000400200008A590000DA4000__0000003B99F7F8A0 000000067F0000400200008A590000DA0000-000000067F0000400200008A590000DA4000__0000005D2FFFFB38 000000067F0000400200008A590000DA0000-000000067F0000400200008A590000DA4000__00000073AD3FE6B8 000000067F0000400200008A590000DA0000-000000067F0000400200008A590000DA4000__000000914E3F38F0 000000067F0000400200008A590000DA0000-000000067F0000400200008A590000DA4000__000000931B9A2710 000000067F0000400200008A590000DA4000-000000067F0000400200008A590000DA8000__0000001C725A2400 000000067F0000400200008A590000DA4000-000000067F0000400200008A590000DA8000__0000001C760FA190 000000067F0000400200008A590000DA4000-000000067F0000400200008A590000DA8000__00000038E67ABFA0 000000067F0000400200008A590000DA4000-000000067F0000400200008A590000DA8000__0000003903F1CFE8 000000067F0000400200008A590000DA4000-000000067F0000400200008A590000DA8000__0000003B99F7F8A0 000000067F0000400200008A590000DA4000-000000067F0000400200008A590000DA8000__0000005D2FFFFB38 000000067F0000400200008A590000DA4000-000000067F0000400200008A590000DA8000__00000073AD3FE6B8 000000067F0000400200008A590000DA4000-000000067F0000400200008A590000DA8000__000000914E3F38F0 000000067F0000400200008A590000DA4000-000000067F0000400200008A590000DA8000__000000931B9A2710 000000067F0000400200008A590000DA8000-000000067F0000400200008A590000DAC000__0000001C725A2400 
000000067F0000400200008A590000DA8000-000000067F0000400200008A590000DAC000__0000001C760FA190 000000067F0000400200008A590000DA8000-000000067F0000400200008A590000DAC000__00000038E67ABFA0 000000067F0000400200008A590000DA8000-000000067F0000400200008A590000DAC000__0000003903F1CFE8 000000067F0000400200008A590000DA8000-000000067F0000400200008A590000DAC000__0000003B99F7F8A0 000000067F0000400200008A590000DA8000-000000067F0000400200008A590000DAC000__0000005D2FFFFB38 000000067F0000400200008A590000DA8000-000000067F0000400200008A590000DAC000__00000073AD3FE6B8 000000067F0000400200008A590000DA8000-000000067F0000400200008A590000DAC000__000000914E3F38F0 000000067F0000400200008A590000DA8000-000000067F0000400200008A590000DAC000__000000931B9A2710 000000067F0000400200008A590000DA81C4-000000067F0000400200008A590000DB0BA9__00000018C4E3E6C1-000000196493E2E1 000000067F0000400200008A590000DAC000-000000067F0000400200008A590000DB0000__0000001C725A2400 000000067F0000400200008A590000DAC000-000000067F0000400200008A590000DB0000__0000001C760FA190 000000067F0000400200008A590000DAC000-000000067F0000400200008A590000DB0000__00000038E67ABFA0 000000067F0000400200008A590000DAC000-000000067F0000400200008A590000DB0000__0000003903F1CFE8 000000067F0000400200008A590000DAC000-000000067F0000400200008A590000DB0000__0000003B99F7F8A0 000000067F0000400200008A590000DAC000-000000067F0000400200008A590000DB0000__0000005D2FFFFB38 000000067F0000400200008A590000DAC000-000000067F0000400200008A590000DB0000__00000073AD3FE6B8 000000067F0000400200008A590000DAC000-000000067F0000400200008A590000DB0000__000000914E3F38F0 000000067F0000400200008A590000DAC000-000000067F0000400200008A590000DB0000__000000931B9A2710 000000067F0000400200008A590000DB0000-000000067F0000400200008A590000DB4000__0000001C725A2400 000000067F0000400200008A590000DB0000-000000067F0000400200008A590000DB4000__0000001C760FA190 000000067F0000400200008A590000DB0000-000000067F0000400200008A590000DB4000__00000038E67ABFA0 
000000067F0000400200008A590000DB0000-000000067F0000400200008A590000DB4000__0000003903F1CFE8 000000067F0000400200008A590000DB0000-000000067F0000400200008A590000DB4000__0000003B99F7F8A0 000000067F0000400200008A590000DB0000-000000067F0000400200008A590000DB4000__0000005D2FFFFB38 000000067F0000400200008A590000DB0000-000000067F0000400200008A590000DB4000__00000073AD3FE6B8 000000067F0000400200008A590000DB0000-000000067F0000400200008A590000DB4000__000000914E3F38F0 000000067F0000400200008A590000DB0000-000000067F0000400200008A590000DB4000__000000931B9A2710 000000067F0000400200008A590000DB0BA9-000000067F0000400200008A590000DB9590__00000018C4E3E6C1-000000196493E2E1 000000067F0000400200008A590000DB4000-000000067F0000400200008A590000DB8000__0000001C725A2400 000000067F0000400200008A590000DB4000-000000067F0000400200008A590000DB8000__0000001C760FA190 000000067F0000400200008A590000DB4000-000000067F0000400200008A590000DB8000__00000038E67ABFA0 000000067F0000400200008A590000DB4000-000000067F0000400200008A590000DB8000__0000003903F1CFE8 000000067F0000400200008A590000DB4000-000000067F0000400200008A590000DB8000__0000003B99F7F8A0 000000067F0000400200008A590000DB4000-000000067F0000400200008A590000DB8000__0000005D2FFFFB38 000000067F0000400200008A590000DB4000-000000067F0000400200008A590000DB8000__00000073AD3FE6B8 000000067F0000400200008A590000DB4000-000000067F0000400200008A590000DB8000__000000914E3F38F0 000000067F0000400200008A590000DB4000-000000067F0000400200008A590000DB8000__000000931B9A2710 000000067F0000400200008A590000DB8000-000000067F0000400200008A590000DBC000__0000001A2433F0F8 000000067F0000400200008A590000DB8000-000000067F0000400200008A590000DBC000__0000001C760FA190 000000067F0000400200008A590000DB8000-000000067F0000400200008A590000DBC000__00000038E67ABFA0 000000067F0000400200008A590000DB8000-000000067F0000400200008A590000DBC000__0000003903F1CFE8 000000067F0000400200008A590000DB8000-000000067F0000400200008A590000DBC000__0000003B99F7F8A0 
000000067F0000400200008A590000DB8000-000000067F0000400200008A590000DBC000__0000005D2FFFFB38 000000067F0000400200008A590000DB8000-000000067F0000400200008A590000DBC000__00000073AD3FE6B8 000000067F0000400200008A590000DB8000-000000067F0000400200008A590000DBC000__000000914E3F38F0 000000067F0000400200008A590000DB8000-000000067F0000400200008A590000DBC000__000000931B9A2710 000000067F0000400200008A590000DB9590-000000067F0000400200008A590100000000__00000018C4E3E6C1-000000196493E2E1 000000067F0000400200008A590000DB984D-000000067F0000400200008A590000DC221C__000000196493E2E1-0000001A0443DCD9 000000067F0000400200008A590000DBC000-000000067F0000400200008A590000DC0000__0000001A2433F0F8 000000067F0000400200008A590000DBC000-000000067F0000400200008A590000DC0000__0000001C760FA190 000000067F0000400200008A590000DBC000-000000067F0000400200008A590000DC0000__00000038E67ABFA0 000000067F0000400200008A590000DBC000-000000067F0000400200008A590000DC0000__0000003903F1CFE8 000000067F0000400200008A590000DBC000-000000067F0000400200008A590000DC0000__0000003B99F7F8A0 000000067F0000400200008A590000DBC000-000000067F0000400200008A590000DC0000__0000005D2FFFFB38 000000067F0000400200008A590000DBC000-000000067F0000400200008A590000DC0000__00000073AD3FE6B8 000000067F0000400200008A590000DBC000-000000067F0000400200008A590000DC0000__000000914E3F38F0 000000067F0000400200008A590000DBC000-000000067F0000400200008A590000DC0000__000000931B9A2710 000000067F0000400200008A590000DC0000-000000067F0000400200008A590000DC4000__0000001A2433F0F8 000000067F0000400200008A590000DC0000-000000067F0000400200008A590000DC4000__0000001C760FA190 000000067F0000400200008A590000DC0000-000000067F0000400200008A590000DC4000__00000038E67ABFA0 000000067F0000400200008A590000DC0000-000000067F0000400200008A590000DC4000__0000003903F1CFE8 000000067F0000400200008A590000DC0000-000000067F0000400200008A590000DC4000__0000003B99F7F8A0 000000067F0000400200008A590000DC0000-000000067F0000400200008A590000DC4000__0000005D2FFFFB38 
000000067F0000400200008A590000DC0000-000000067F0000400200008A590000DC4000__00000073AD3FE6B8 000000067F0000400200008A590000DC0000-000000067F0000400200008A590000DC4000__000000914E3F38F0 000000067F0000400200008A590000DC0000-000000067F0000400200008A590000DC4000__000000931B9A2710 000000067F0000400200008A590000DC221C-000000067F0000400200008A590000DCABF9__000000196493E2E1-0000001A0443DCD9 000000067F0000400200008A590000DC4000-000000067F0000400200008A590000DC8000__0000001A2433F0F8 000000067F0000400200008A590000DC4000-000000067F0000400200008A590000DC8000__0000001C760FA190 000000067F0000400200008A590000DC4000-000000067F0000400200008A590000DC8000__00000038E67ABFA0 000000067F0000400200008A590000DC4000-000000067F0000400200008A590000DC8000__0000003903F1CFE8 000000067F0000400200008A590000DC4000-000000067F0000400200008A590000DC8000__0000003B99F7F8A0 000000067F0000400200008A590000DC4000-000000067F0000400200008A590000DC8000__0000005D2FFFFB38 000000067F0000400200008A590000DC4000-000000067F0000400200008A590000DC8000__00000073AD3FE6B8 000000067F0000400200008A590000DC4000-000000067F0000400200008A590000DC8000__000000914E3F38F0 000000067F0000400200008A590000DC4000-000000067F0000400200008A590000DC8000__000000931B9A2710 000000067F0000400200008A590000DC8000-000000067F0000400200008A590000DCC000__0000001A2433F0F8 000000067F0000400200008A590000DC8000-000000067F0000400200008A590000DCC000__0000001C760FA190 000000067F0000400200008A590000DC8000-000000067F0000400200008A590000DCC000__00000038E67ABFA0 000000067F0000400200008A590000DC8000-000000067F0000400200008A590000DCC000__0000003903F1CFE8 000000067F0000400200008A590000DC8000-000000067F0000400200008A590000DCC000__0000003B99F7F8A0 000000067F0000400200008A590000DC8000-000000067F0000400200008A590000DCC000__0000005D2FFFFB38 000000067F0000400200008A590000DC8000-000000067F0000400200008A590000DCC000__00000073AD3FE6B8 000000067F0000400200008A590000DC8000-000000067F0000400200008A590000DCC000__000000914E3F38F0 
000000067F0000400200008A590000DC8000-000000067F0000400200008A590000DCC000__000000931B9A2710 000000067F0000400200008A590000DCABF9-000000067F0000400200008A590000DD35DF__000000196493E2E1-0000001A0443DCD9 000000067F0000400200008A590000DCC000-000000067F0000400200008A590000DD0000__0000001A2433F0F8 000000067F0000400200008A590000DCC000-000000067F0000400200008A590000DD0000__0000001C760FA190 000000067F0000400200008A590000DCC000-000000067F0000400200008A590000DD0000__00000038E67ABFA0 000000067F0000400200008A590000DCC000-000000067F0000400200008A590000DD0000__0000003903F1CFE8 000000067F0000400200008A590000DCC000-000000067F0000400200008A590000DD0000__0000003B99F7F8A0 000000067F0000400200008A590000DCC000-000000067F0000400200008A590000DD0000__0000005D2FFFFB38 000000067F0000400200008A590000DCC000-000000067F0000400200008A590000DD0000__00000073AD3FE6B8 000000067F0000400200008A590000DCC000-000000067F0000400200008A590000DD0000__000000914E3F38F0 000000067F0000400200008A590000DCC000-000000067F0000400200008A590000DD0000__000000931B9A2710 000000067F0000400200008A590000DD0000-000000067F0000400200008A590000DD4000__0000001A2433F0F8 000000067F0000400200008A590000DD0000-000000067F0000400200008A590000DD4000__0000001C760FA190 000000067F0000400200008A590000DD0000-000000067F0000400200008A590000DD4000__00000038E67ABFA0 000000067F0000400200008A590000DD0000-000000067F0000400200008A590000DD4000__0000003903F1CFE8 000000067F0000400200008A590000DD0000-000000067F0000400200008A590000DD4000__0000003B99F7F8A0 000000067F0000400200008A590000DD0000-000000067F0000400200008A590000DD4000__0000005D2FFFFB38 000000067F0000400200008A590000DD0000-000000067F0000400200008A590000DD4000__00000073AD3FE6B8 000000067F0000400200008A590000DD0000-000000067F0000400200008A590000DD4000__000000914E3F38F0 000000067F0000400200008A590000DD0000-000000067F0000400200008A590000DD4000__000000931B9A2710 000000067F0000400200008A590000DD35DF-000000067F0000400200008A590000DDBFBF__000000196493E2E1-0000001A0443DCD9 
000000067F0000400200008A590000DD4000-000000067F0000400200008A590000DD8000__0000001A2433F0F8 000000067F0000400200008A590000DD4000-000000067F0000400200008A590000DD8000__0000001C760FA190 000000067F0000400200008A590000DD4000-000000067F0000400200008A590000DD8000__00000038E67ABFA0 000000067F0000400200008A590000DD4000-000000067F0000400200008A590000DD8000__0000003903F1CFE8 000000067F0000400200008A590000DD4000-000000067F0000400200008A590000DD8000__0000003B99F7F8A0 000000067F0000400200008A590000DD4000-000000067F0000400200008A590000DD8000__0000005D2FFFFB38 000000067F0000400200008A590000DD4000-000000067F0000400200008A590000DD8000__00000073AD3FE6B8 000000067F0000400200008A590000DD4000-000000067F0000400200008A590000DD8000__000000914E3F38F0 000000067F0000400200008A590000DD4000-000000067F0000400200008A590000DD8000__000000931B9A2710 000000067F0000400200008A590000DD8000-000000067F0000400200008A590000DDC000__0000001A2433F0F8 000000067F0000400200008A590000DD8000-000000067F0000400200008A590000DDC000__0000001C760FA190 000000067F0000400200008A590000DD8000-000000067F0000400200008A590000DDC000__00000038E67ABFA0 000000067F0000400200008A590000DD8000-000000067F0000400200008A590000DDC000__0000003903F1CFE8 000000067F0000400200008A590000DD8000-000000067F0000400200008A590000DDC000__0000003B99F7F8A0 000000067F0000400200008A590000DD8000-000000067F0000400200008A590000DDC000__0000005D2FFFFB38 000000067F0000400200008A590000DD8000-000000067F0000400200008A590000DDC000__00000073AD3FE6B8 000000067F0000400200008A590000DD8000-000000067F0000400200008A590000DDC000__000000914E3F38F0 000000067F0000400200008A590000DD8000-000000067F0000400200008A590000DDC000__000000931B9A2710 000000067F0000400200008A590000DDBFBF-000000067F0000400200008A590000DE49A7__000000196493E2E1-0000001A0443DCD9 000000067F0000400200008A590000DDC000-000000067F0000400200008A590000DE0000__0000001A2433F0F8 000000067F0000400200008A590000DDC000-000000067F0000400200008A590000DE0000__0000001C760FA190 
000000067F0000400200008A590000DDC000-000000067F0000400200008A590000DE0000__00000038E67ABFA0 000000067F0000400200008A590000DDC000-000000067F0000400200008A590000DE0000__0000003903F1CFE8 000000067F0000400200008A590000DDC000-000000067F0000400200008A590000DE0000__0000003B99F7F8A0 000000067F0000400200008A590000DDC000-000000067F0000400200008A590000DE0000__0000005D2FFFFB38 000000067F0000400200008A590000DDC000-000000067F0000400200008A590000DE0000__00000073AD3FE6B8 000000067F0000400200008A590000DDC000-000000067F0000400200008A590000DE0000__000000914E3F38F0 000000067F0000400200008A590000DDC000-000000067F0000400200008A590000DE0000__000000931B9A2710 000000067F0000400200008A590000DE0000-000000067F0000400200008A590000DE4000__0000001A2433F0F8 000000067F0000400200008A590000DE0000-000000067F0000400200008A590000DE4000__0000001C760FA190 000000067F0000400200008A590000DE0000-000000067F0000400200008A590000DE4000__00000038E67ABFA0 000000067F0000400200008A590000DE0000-000000067F0000400200008A590000DE4000__0000003903F1CFE8 000000067F0000400200008A590000DE0000-000000067F0000400200008A590000DE4000__0000003B99F7F8A0 000000067F0000400200008A590000DE0000-000000067F0000400200008A590000DE4000__0000005D2FFFFB38 000000067F0000400200008A590000DE0000-000000067F0000400200008A590000DE4000__00000073AD3FE6B8 000000067F0000400200008A590000DE0000-000000067F0000400200008A590000DE4000__000000914E3F38F0 000000067F0000400200008A590000DE0000-000000067F0000400200008A590000DE4000__000000931B9A2710 000000067F0000400200008A590000DE4000-000000067F0000400200008A590000DE8000__0000001A2433F0F8 000000067F0000400200008A590000DE4000-000000067F0000400200008A590000DE8000__0000001C760FA190 000000067F0000400200008A590000DE4000-000000067F0000400200008A590000DE8000__00000038E67ABFA0 000000067F0000400200008A590000DE4000-000000067F0000400200008A590000DE8000__0000003903F1CFE8 000000067F0000400200008A590000DE4000-000000067F0000400200008A590000DE8000__0000003B99F7F8A0 
000000067F0000400200008A590000DE4000-000000067F0000400200008A590000DE8000__0000005D2FFFFB38 000000067F0000400200008A590000DE4000-000000067F0000400200008A590000DE8000__00000073AD3FE6B8 000000067F0000400200008A590000DE4000-000000067F0000400200008A590000DE8000__000000914E3F38F0 000000067F0000400200008A590000DE4000-000000067F0000400200008A590000DE8000__000000931B9A2710 000000067F0000400200008A590000DE49A7-000000067F0000400200008A590000DED38D__000000196493E2E1-0000001A0443DCD9 000000067F0000400200008A590000DE8000-000000067F0000400200008A590000DEC000__0000001A2433F0F8 000000067F0000400200008A590000DE8000-000000067F0000400200008A590000DEC000__0000001C760FA190 000000067F0000400200008A590000DE8000-000000067F0000400200008A590000DEC000__00000038E67ABFA0 000000067F0000400200008A590000DE8000-000000067F0000400200008A590000DEC000__0000003903F1CFE8 000000067F0000400200008A590000DE8000-000000067F0000400200008A590000DEC000__0000003B99F7F8A0 000000067F0000400200008A590000DE8000-000000067F0000400200008A590000DEC000__0000005D2FFFFB38 000000067F0000400200008A590000DE8000-000000067F0000400200008A590000DEC000__00000073AD3FE6B8 000000067F0000400200008A590000DE8000-000000067F0000400200008A590000DEC000__000000914E3F38F0 000000067F0000400200008A590000DE8000-000000067F0000400200008A590000DEC000__000000931B9A2710 000000067F0000400200008A590000DEC000-000000067F0000400200008A590000DF0000__0000001A2433F0F8 000000067F0000400200008A590000DEC000-000000067F0000400200008A590000DF0000__0000001C760FA190 000000067F0000400200008A590000DEC000-000000067F0000400200008A590000DF0000__00000038E67ABFA0 000000067F0000400200008A590000DEC000-000000067F0000400200008A590000DF0000__0000003903F1CFE8 000000067F0000400200008A590000DEC000-000000067F0000400200008A590000DF0000__0000003B99F7F8A0 000000067F0000400200008A590000DEC000-000000067F0000400200008A590000DF0000__0000005D2FFFFB38 000000067F0000400200008A590000DEC000-000000067F0000400200008A590000DF0000__00000073AD3FE6B8 
000000067F0000400200008A590000DEC000-000000067F0000400200008A590000DF0000__000000914E3F38F0 000000067F0000400200008A590000DEC000-000000067F0000400200008A590000DF0000__000000931B9A2710 000000067F0000400200008A590000DED38D-000000067F0000400200008A590000DF5D68__000000196493E2E1-0000001A0443DCD9 000000067F0000400200008A590000DF0000-000000067F0000400200008A590000DF4000__0000001A2433F0F8 000000067F0000400200008A590000DF0000-000000067F0000400200008A590000DF4000__0000001C760FA190 000000067F0000400200008A590000DF0000-000000067F0000400200008A590000DF4000__00000038E67ABFA0 000000067F0000400200008A590000DF0000-000000067F0000400200008A590000DF4000__0000003903F1CFE8 000000067F0000400200008A590000DF0000-000000067F0000400200008A590000DF4000__0000003B99F7F8A0 000000067F0000400200008A590000DF0000-000000067F0000400200008A590000DF4000__0000005D2FFFFB38 000000067F0000400200008A590000DF0000-000000067F0000400200008A590000DF4000__00000073AD3FE6B8 000000067F0000400200008A590000DF0000-000000067F0000400200008A590000DF4000__000000914E3F38F0 000000067F0000400200008A590000DF0000-000000067F0000400200008A590000DF4000__000000931B9A2710 000000067F0000400200008A590000DF4000-000000067F0000400200008A590000DF8000__0000001A2433F0F8 000000067F0000400200008A590000DF4000-000000067F0000400200008A590000DF8000__0000001C760FA190 000000067F0000400200008A590000DF4000-000000067F0000400200008A590000DF8000__00000038E67ABFA0 000000067F0000400200008A590000DF4000-000000067F0000400200008A590000DF8000__0000003903F1CFE8 000000067F0000400200008A590000DF4000-000000067F0000400200008A590000DF8000__0000003B99F7F8A0 000000067F0000400200008A590000DF4000-000000067F0000400200008A590000DF8000__0000005D2FFFFB38 000000067F0000400200008A590000DF4000-000000067F0000400200008A590000DF8000__00000073AD3FE6B8 000000067F0000400200008A590000DF4000-000000067F0000400200008A590000DF8000__000000914E3F38F0 000000067F0000400200008A590000DF4000-000000067F0000400200008A590000DF8000__000000931B9A2710 
000000067F0000400200008A590000DF5D68-000000067F0000400200008A590000DFE74A__000000196493E2E1-0000001A0443DCD9 000000067F0000400200008A590000DF8000-000000067F0000400200008A590000DFC000__0000001A2433F0F8 000000067F0000400200008A590000DF8000-000000067F0000400200008A590000DFC000__0000001C760FA190 000000067F0000400200008A590000DF8000-000000067F0000400200008A590000DFC000__00000038E67ABFA0 000000067F0000400200008A590000DF8000-000000067F0000400200008A590000DFC000__0000003903F1CFE8 000000067F0000400200008A590000DF8000-000000067F0000400200008A590000DFC000__0000003B99F7F8A0 000000067F0000400200008A590000DF8000-000000067F0000400200008A590000DFC000__0000005D2FFFFB38 000000067F0000400200008A590000DF8000-000000067F0000400200008A590000DFC000__00000073AD3FE6B8 000000067F0000400200008A590000DF8000-000000067F0000400200008A590000DFC000__000000914E3F38F0 000000067F0000400200008A590000DF8000-000000067F0000400200008A590000DFC000__000000931B9A2710 000000067F0000400200008A590000DFC000-000000067F0000400200008A590000E00000__0000001A2433F0F8 000000067F0000400200008A590000DFC000-000000067F0000400200008A590000E00000__0000001C760FA190 000000067F0000400200008A590000DFC000-000000067F0000400200008A590000E00000__00000038E67ABFA0 000000067F0000400200008A590000DFC000-000000067F0000400200008A590000E00000__0000003903F1CFE8 000000067F0000400200008A590000DFC000-000000067F0000400200008A590000E00000__0000003B99F7F8A0 000000067F0000400200008A590000DFC000-000000067F0000400200008A590000E00000__0000005D2FFFFB38 000000067F0000400200008A590000DFC000-000000067F0000400200008A590000E00000__00000073AD3FE6B8 000000067F0000400200008A590000DFC000-000000067F0000400200008A590000E00000__000000914E3F38F0 000000067F0000400200008A590000DFC000-000000067F0000400200008A590000E00000__000000931B9A2710 000000067F0000400200008A590000DFE74A-000000067F0000400200008A590000E0711A__000000196493E2E1-0000001A0443DCD9 000000067F0000400200008A590000E00000-000000067F0000400200008A590000E04000__0000001A2433F0F8 
000000067F0000400200008A590000E00000-000000067F0000400200008A590000E04000__0000001C760FA190 000000067F0000400200008A590000E00000-000000067F0000400200008A590000E04000__00000038E67ABFA0 000000067F0000400200008A590000E00000-000000067F0000400200008A590000E04000__0000003903F1CFE8 000000067F0000400200008A590000E00000-000000067F0000400200008A590000E04000__0000003B99F7F8A0 000000067F0000400200008A590000E00000-000000067F0000400200008A590000E04000__0000005D2FFFFB38 000000067F0000400200008A590000E00000-000000067F0000400200008A590000E04000__00000073AD3FE6B8 000000067F0000400200008A590000E00000-000000067F0000400200008A590000E04000__000000914E3F38F0 000000067F0000400200008A590000E00000-000000067F0000400200008A590000E04000__000000931B9A2710 000000067F0000400200008A590000E04000-000000067F0000400200008A590000E08000__0000001A2433F0F8 000000067F0000400200008A590000E04000-000000067F0000400200008A590000E08000__0000001C760FA190 000000067F0000400200008A590000E04000-000000067F0000400200008A590000E08000__00000038E67ABFA0 000000067F0000400200008A590000E04000-000000067F0000400200008A590000E08000__0000003903F1CFE8 000000067F0000400200008A590000E04000-000000067F0000400200008A590000E08000__0000003B99F7F8A0 000000067F0000400200008A590000E04000-000000067F0000400200008A590000E08000__0000005D2FFFFB38 000000067F0000400200008A590000E04000-000000067F0000400200008A590000E08000__00000073AD3FE6B8 000000067F0000400200008A590000E04000-000000067F0000400200008A590000E08000__000000914E3F38F0 000000067F0000400200008A590000E04000-000000067F0000400200008A590000E08000__000000931B9A2710 000000067F0000400200008A590000E0711A-000000067F0000400200008A590000E0FAEF__000000196493E2E1-0000001A0443DCD9 000000067F0000400200008A590000E08000-000000067F0000400200008A590000E0C000__0000001A2433F0F8 000000067F0000400200008A590000E08000-000000067F0000400200008A590000E0C000__0000001C760FA190 000000067F0000400200008A590000E08000-000000067F0000400200008A590000E0C000__00000038E67ABFA0 
000000067F0000400200008A590000E08000-000000067F0000400200008A590000E0C000__0000003903F1CFE8 000000067F0000400200008A590000E08000-000000067F0000400200008A590000E0C000__0000003B99F7F8A0 000000067F0000400200008A590000E08000-000000067F0000400200008A590000E0C000__0000005D2FFFFB38 000000067F0000400200008A590000E08000-000000067F0000400200008A590000E0C000__00000073AD3FE6B8 000000067F0000400200008A590000E08000-000000067F0000400200008A590000E0C000__000000914E3F38F0 000000067F0000400200008A590000E08000-000000067F0000400200008A590000E0C000__000000931B9A2710 000000067F0000400200008A590000E0C000-000000067F0000400200008A590000E10000__0000001A2433F0F8 000000067F0000400200008A590000E0C000-000000067F0000400200008A590000E10000__0000001C760FA190 000000067F0000400200008A590000E0C000-000000067F0000400200008A590000E10000__00000038E67ABFA0 000000067F0000400200008A590000E0C000-000000067F0000400200008A590000E10000__0000003903F1CFE8 000000067F0000400200008A590000E0C000-000000067F0000400200008A590000E10000__0000003B99F7F8A0 000000067F0000400200008A590000E0C000-000000067F0000400200008A590000E10000__0000005D2FFFFB38 000000067F0000400200008A590000E0C000-000000067F0000400200008A590000E10000__00000073AD3FE6B8 000000067F0000400200008A590000E0C000-000000067F0000400200008A590000E10000__000000914E3F38F0 000000067F0000400200008A590000E0C000-000000067F0000400200008A590000E10000__000000931B9A2710 000000067F0000400200008A590000E0FAEF-000000067F0000400200008A590100000000__000000196493E2E1-0000001A0443DCD9 000000067F0000400200008A590000E0FDBF-000000067F0000400200008A590000E1879A__0000001A0443DCD9-0000001AA3F3E569 000000067F0000400200008A590000E10000-000000067F0000400200008A590000E14000__0000001A2433F0F8 000000067F0000400200008A590000E10000-000000067F0000400200008A590000E14000__0000001C760FA190 000000067F0000400200008A590000E10000-000000067F0000400200008A590000E14000__00000038E67ABFA0 000000067F0000400200008A590000E10000-000000067F0000400200008A590000E14000__0000003903F1CFE8 
000000067F0000400200008A590000E10000-000000067F0000400200008A590000E14000__0000003B99F7F8A0 000000067F0000400200008A590000E10000-000000067F0000400200008A590000E14000__0000005D2FFFFB38 000000067F0000400200008A590000E10000-000000067F0000400200008A590000E14000__00000073AD3FE6B8 000000067F0000400200008A590000E10000-000000067F0000400200008A590000E14000__000000914E3F38F0 000000067F0000400200008A590000E10000-000000067F0000400200008A590000E14000__000000931B9A2710 000000067F0000400200008A590000E14000-000000067F0000400200008A590000E18000__0000001A2433F0F8 000000067F0000400200008A590000E14000-000000067F0000400200008A590000E18000__0000001C760FA190 000000067F0000400200008A590000E14000-000000067F0000400200008A590000E18000__00000038E67ABFA0 000000067F0000400200008A590000E14000-000000067F0000400200008A590000E18000__0000003903F1CFE8 000000067F0000400200008A590000E14000-000000067F0000400200008A590000E18000__0000003B99F7F8A0 000000067F0000400200008A590000E14000-000000067F0000400200008A590000E18000__0000005D2FFFFB38 000000067F0000400200008A590000E14000-000000067F0000400200008A590000E18000__00000073AD3FE6B8 000000067F0000400200008A590000E14000-000000067F0000400200008A590000E18000__000000914E3F38F0 000000067F0000400200008A590000E14000-000000067F0000400200008A590000E18000__000000931B9A2710 000000067F0000400200008A590000E18000-000000067F0000400200008A590000E1C000__0000001A2433F0F8 000000067F0000400200008A590000E18000-000000067F0000400200008A590000E1C000__0000001C760FA190 000000067F0000400200008A590000E18000-000000067F0000400200008A590000E1C000__00000038E67ABFA0 000000067F0000400200008A590000E18000-000000067F0000400200008A590000E1C000__0000003903F1CFE8 000000067F0000400200008A590000E18000-000000067F0000400200008A590000E1C000__0000003B99F7F8A0 000000067F0000400200008A590000E18000-000000067F0000400200008A590000E1C000__0000005D2FFFFB38 000000067F0000400200008A590000E18000-000000067F0000400200008A590000E1C000__00000073AD3FE6B8 
000000067F0000400200008A590000E18000-000000067F0000400200008A590000E1C000__000000914E3F38F0 000000067F0000400200008A590000E18000-000000067F0000400200008A590000E1C000__000000931B9A2710 000000067F0000400200008A590000E1879A-000000067F0000400200008A590000E2117A__0000001A0443DCD9-0000001AA3F3E569 000000067F0000400200008A590000E1C000-000000067F0000400200008A590000E20000__0000001A2433F0F8 000000067F0000400200008A590000E1C000-000000067F0000400200008A590000E20000__0000001C760FA190 000000067F0000400200008A590000E1C000-000000067F0000400200008A590000E20000__00000038E67ABFA0 000000067F0000400200008A590000E1C000-000000067F0000400200008A590000E20000__0000003903F1CFE8 000000067F0000400200008A590000E1C000-000000067F0000400200008A590000E20000__0000003B99F7F8A0 000000067F0000400200008A590000E1C000-000000067F0000400200008A590000E20000__0000005D2FFFFB38 000000067F0000400200008A590000E1C000-000000067F0000400200008A590000E20000__00000073AD3FE6B8 000000067F0000400200008A590000E1C000-000000067F0000400200008A590000E20000__000000914E3F38F0 000000067F0000400200008A590000E1C000-000000067F0000400200008A590000E20000__000000931B9A2710 000000067F0000400200008A590000E20000-000000067F0000400200008A590000E24000__0000001C760FA190 000000067F0000400200008A590000E20000-000000067F0000400200008A590000E24000__00000038E67ABFA0 000000067F0000400200008A590000E20000-000000067F0000400200008A590000E24000__0000003903F1CFE8 000000067F0000400200008A590000E20000-000000067F0000400200008A590000E24000__0000003B99F7F8A0 000000067F0000400200008A590000E20000-000000067F0000400200008A590000E24000__0000005D2FFFFB38 000000067F0000400200008A590000E20000-000000067F0000400200008A590000E24000__00000073AD3FE6B8 000000067F0000400200008A590000E20000-000000067F0000400200008A590000E24000__000000914E3F38F0 000000067F0000400200008A590000E20000-000000067F0000400200008A590000E24000__000000931B9A2710 000000067F0000400200008A590000E20000-030000000000000000000000000000000002__0000001A2433F0F8 
000000067F0000400200008A590000E2117A-000000067F0000400200008A590000E29B5F__0000001A0443DCD9-0000001AA3F3E569 000000067F0000400200008A590000E24000-000000067F0000400200008A590000E28000__0000001C760FA190 000000067F0000400200008A590000E24000-000000067F0000400200008A590000E28000__00000038E67ABFA0 000000067F0000400200008A590000E24000-000000067F0000400200008A590000E28000__0000003903F1CFE8 000000067F0000400200008A590000E24000-000000067F0000400200008A590000E28000__0000003B99F7F8A0 000000067F0000400200008A590000E24000-000000067F0000400200008A590000E28000__0000005D2FFFFB38 000000067F0000400200008A590000E24000-000000067F0000400200008A590000E28000__00000073AD3FE6B8 000000067F0000400200008A590000E24000-000000067F0000400200008A590000E28000__000000914E3F38F0 000000067F0000400200008A590000E24000-000000067F0000400200008A590000E28000__000000931B9A2710 000000067F0000400200008A590000E28000-000000067F0000400200008A590000E2C000__0000001C760FA190 000000067F0000400200008A590000E28000-000000067F0000400200008A590000E2C000__00000038E67ABFA0 000000067F0000400200008A590000E28000-000000067F0000400200008A590000E2C000__0000003903F1CFE8 000000067F0000400200008A590000E28000-000000067F0000400200008A590000E2C000__0000003B99F7F8A0 000000067F0000400200008A590000E28000-000000067F0000400200008A590000E2C000__0000005D2FFFFB38 000000067F0000400200008A590000E28000-000000067F0000400200008A590000E2C000__00000073AD3FE6B8 000000067F0000400200008A590000E28000-000000067F0000400200008A590000E2C000__000000914E3F38F0 000000067F0000400200008A590000E28000-000000067F0000400200008A590000E2C000__000000931B9A2710 000000067F0000400200008A590000E29B5F-000000067F0000400200008A590000E32531__0000001A0443DCD9-0000001AA3F3E569 000000067F0000400200008A590000E2C000-000000067F0000400200008A590000E30000__0000001C760FA190 000000067F0000400200008A590000E2C000-000000067F0000400200008A590000E30000__00000038E67ABFA0 000000067F0000400200008A590000E2C000-000000067F0000400200008A590000E30000__0000003903F1CFE8 
000000067F0000400200008A590000E2C000-000000067F0000400200008A590000E30000__0000003B99F7F8A0 000000067F0000400200008A590000E2C000-000000067F0000400200008A590000E30000__0000005D2FFFFB38 000000067F0000400200008A590000E2C000-000000067F0000400200008A590000E30000__00000073AD3FE6B8 000000067F0000400200008A590000E2C000-000000067F0000400200008A590000E30000__000000914E3F38F0 000000067F0000400200008A590000E2C000-000000067F0000400200008A590000E30000__000000931B9A2710 000000067F0000400200008A590000E30000-000000067F0000400200008A590000E34000__0000001C760FA190 000000067F0000400200008A590000E30000-000000067F0000400200008A590000E34000__00000038E67ABFA0 000000067F0000400200008A590000E30000-000000067F0000400200008A590000E34000__0000003903F1CFE8 000000067F0000400200008A590000E30000-000000067F0000400200008A590000E34000__0000003B99F7F8A0 000000067F0000400200008A590000E30000-000000067F0000400200008A590000E34000__0000005D2FFFFB38 000000067F0000400200008A590000E30000-000000067F0000400200008A590000E34000__00000073AD3FE6B8 000000067F0000400200008A590000E30000-000000067F0000400200008A590000E34000__000000914E3F38F0 000000067F0000400200008A590000E30000-000000067F0000400200008A590000E34000__000000931B9A2710 000000067F0000400200008A590000E32531-000000067F0000400200008A590000E3AF0F__0000001A0443DCD9-0000001AA3F3E569 000000067F0000400200008A590000E34000-000000067F0000400200008A590000E38000__0000001C760FA190 000000067F0000400200008A590000E34000-000000067F0000400200008A590000E38000__00000038E67ABFA0 000000067F0000400200008A590000E34000-000000067F0000400200008A590000E38000__0000003903F1CFE8 000000067F0000400200008A590000E34000-000000067F0000400200008A590000E38000__0000003B99F7F8A0 000000067F0000400200008A590000E34000-000000067F0000400200008A590000E38000__0000005D2FFFFB38 000000067F0000400200008A590000E34000-000000067F0000400200008A590000E38000__00000073AD3FE6B8 000000067F0000400200008A590000E34000-000000067F0000400200008A590000E38000__000000914E3F38F0 
000000067F0000400200008A590000E34000-000000067F0000400200008A590000E38000__000000931B9A2710 000000067F0000400200008A590000E38000-000000067F0000400200008A590000E3C000__0000001C760FA190 000000067F0000400200008A590000E38000-000000067F0000400200008A590000E3C000__00000038E67ABFA0 000000067F0000400200008A590000E38000-000000067F0000400200008A590000E3C000__0000003903F1CFE8 000000067F0000400200008A590000E38000-000000067F0000400200008A590000E3C000__0000003B99F7F8A0 000000067F0000400200008A590000E38000-000000067F0000400200008A590000E3C000__0000005D2FFFFB38 000000067F0000400200008A590000E38000-000000067F0000400200008A590000E3C000__00000073AD3FE6B8 000000067F0000400200008A590000E38000-000000067F0000400200008A590000E3C000__000000914E3F38F0 000000067F0000400200008A590000E38000-000000067F0000400200008A590000E3C000__000000931B9A2710 000000067F0000400200008A590000E3AF0F-000000067F0000400200008A590000E438DB__0000001A0443DCD9-0000001AA3F3E569 000000067F0000400200008A590000E3C000-000000067F0000400200008A590000E40000__0000001C760FA190 000000067F0000400200008A590000E3C000-000000067F0000400200008A590000E40000__00000038E67ABFA0 000000067F0000400200008A590000E3C000-000000067F0000400200008A590000E40000__0000003903F1CFE8 000000067F0000400200008A590000E3C000-000000067F0000400200008A590000E40000__0000003B99F7F8A0 000000067F0000400200008A590000E3C000-000000067F0000400200008A590000E40000__0000005D2FFFFB38 000000067F0000400200008A590000E3C000-000000067F0000400200008A590000E40000__00000073AD3FE6B8 000000067F0000400200008A590000E3C000-000000067F0000400200008A590000E40000__000000914E3F38F0 000000067F0000400200008A590000E3C000-000000067F0000400200008A590000E40000__000000931B9A2710 000000067F0000400200008A590000E40000-000000067F0000400200008A590000E44000__0000001C760FA190 000000067F0000400200008A590000E40000-000000067F0000400200008A590000E44000__00000038E67ABFA0 000000067F0000400200008A590000E40000-000000067F0000400200008A590000E44000__0000003903F1CFE8 
000000067F0000400200008A590000E40000-000000067F0000400200008A590000E44000__0000003B99F7F8A0 000000067F0000400200008A590000E40000-000000067F0000400200008A590000E44000__0000005D2FFFFB38 000000067F0000400200008A590000E40000-000000067F0000400200008A590000E44000__00000073AD3FE6B8 000000067F0000400200008A590000E40000-000000067F0000400200008A590000E44000__000000914E3F38F0 000000067F0000400200008A590000E40000-000000067F0000400200008A590000E44000__000000931B9A2710 000000067F0000400200008A590000E438DB-000000067F0000400200008A590000E4C2B3__0000001A0443DCD9-0000001AA3F3E569 000000067F0000400200008A590000E44000-000000067F0000400200008A590000E48000__0000001C760FA190 000000067F0000400200008A590000E44000-000000067F0000400200008A590000E48000__00000038E67ABFA0 000000067F0000400200008A590000E44000-000000067F0000400200008A590000E48000__0000003903F1CFE8 000000067F0000400200008A590000E44000-000000067F0000400200008A590000E48000__0000003B99F7F8A0 000000067F0000400200008A590000E44000-000000067F0000400200008A590000E48000__0000005D2FFFFB38 000000067F0000400200008A590000E44000-000000067F0000400200008A590000E48000__00000073AD3FE6B8 000000067F0000400200008A590000E44000-000000067F0000400200008A590000E48000__000000914E3F38F0 000000067F0000400200008A590000E44000-000000067F0000400200008A590000E48000__000000931B9A2710 000000067F0000400200008A590000E48000-000000067F0000400200008A590000E4C000__0000001C760FA190 000000067F0000400200008A590000E48000-000000067F0000400200008A590000E4C000__00000038E67ABFA0 000000067F0000400200008A590000E48000-000000067F0000400200008A590000E4C000__0000003903F1CFE8 000000067F0000400200008A590000E48000-000000067F0000400200008A590000E4C000__0000003B99F7F8A0 000000067F0000400200008A590000E48000-000000067F0000400200008A590000E4C000__0000005D2FFFFB38 000000067F0000400200008A590000E48000-000000067F0000400200008A590000E4C000__00000073AD3FE6B8 000000067F0000400200008A590000E48000-000000067F0000400200008A590000E4C000__000000914E3F38F0 
000000067F0000400200008A590000E48000-000000067F0000400200008A590000E4C000__000000931B9A2710 000000067F0000400200008A590000E4C000-000000067F0000400200008A590000E50000__0000001C760FA190 000000067F0000400200008A590000E4C000-000000067F0000400200008A590000E50000__00000038E67ABFA0 000000067F0000400200008A590000E4C000-000000067F0000400200008A590000E50000__0000003903F1CFE8 000000067F0000400200008A590000E4C000-000000067F0000400200008A590000E50000__0000003B99F7F8A0 000000067F0000400200008A590000E4C000-000000067F0000400200008A590000E50000__0000005D2FFFFB38 000000067F0000400200008A590000E4C000-000000067F0000400200008A590000E50000__00000073AD3FE6B8 000000067F0000400200008A590000E4C000-000000067F0000400200008A590000E50000__000000914E3F38F0 000000067F0000400200008A590000E4C000-000000067F0000400200008A590000E50000__000000931B9A2710 000000067F0000400200008A590000E4C2B3-000000067F0000400200008A590000E54C98__0000001A0443DCD9-0000001AA3F3E569 000000067F0000400200008A590000E50000-000000067F0000400200008A590000E54000__0000001C760FA190 000000067F0000400200008A590000E50000-000000067F0000400200008A590000E54000__00000038E67ABFA0 000000067F0000400200008A590000E50000-000000067F0000400200008A590000E54000__0000003903F1CFE8 000000067F0000400200008A590000E50000-000000067F0000400200008A590000E54000__0000003B99F7F8A0 000000067F0000400200008A590000E50000-000000067F0000400200008A590000E54000__0000005D2FFFFB38 000000067F0000400200008A590000E50000-000000067F0000400200008A590000E54000__00000073AD3FE6B8 000000067F0000400200008A590000E50000-000000067F0000400200008A590000E54000__000000914E3F38F0 000000067F0000400200008A590000E50000-000000067F0000400200008A590000E54000__000000931B9A2710 000000067F0000400200008A590000E54000-000000067F0000400200008A590000E58000__0000001C760FA190 000000067F0000400200008A590000E54000-000000067F0000400200008A590000E58000__00000038E67ABFA0 000000067F0000400200008A590000E54000-000000067F0000400200008A590000E58000__0000003903F1CFE8 
000000067F0000400200008A590000E54000-000000067F0000400200008A590000E58000__0000003B99F7F8A0 000000067F0000400200008A590000E54000-000000067F0000400200008A590000E58000__0000005D2FFFFB38 000000067F0000400200008A590000E54000-000000067F0000400200008A590000E58000__00000073AD3FE6B8 000000067F0000400200008A590000E54000-000000067F0000400200008A590000E58000__000000914E3F38F0 000000067F0000400200008A590000E54000-000000067F0000400200008A590000E58000__000000931B9A2710 000000067F0000400200008A590000E54C98-000000067F0000400200008A590000E5D67C__0000001A0443DCD9-0000001AA3F3E569 000000067F0000400200008A590000E58000-000000067F0000400200008A590000E5C000__0000001C760FA190 000000067F0000400200008A590000E58000-000000067F0000400200008A590000E5C000__00000038E67ABFA0 000000067F0000400200008A590000E58000-000000067F0000400200008A590000E5C000__0000003903F1CFE8 000000067F0000400200008A590000E58000-000000067F0000400200008A590000E5C000__0000003B99F7F8A0 000000067F0000400200008A590000E58000-000000067F0000400200008A590000E5C000__0000005D2FFFFB38 000000067F0000400200008A590000E58000-000000067F0000400200008A590000E5C000__00000073AD3FE6B8 000000067F0000400200008A590000E58000-000000067F0000400200008A590000E5C000__000000914E3F38F0 000000067F0000400200008A590000E58000-000000067F0000400200008A590000E5C000__000000931B9A2710 000000067F0000400200008A590000E5C000-000000067F0000400200008A590000E60000__0000001C760FA190 000000067F0000400200008A590000E5C000-000000067F0000400200008A590000E60000__00000038E67ABFA0 000000067F0000400200008A590000E5C000-000000067F0000400200008A590000E60000__0000003903F1CFE8 000000067F0000400200008A590000E5C000-000000067F0000400200008A590000E60000__0000003B99F7F8A0 000000067F0000400200008A590000E5C000-000000067F0000400200008A590000E60000__0000005D2FFFFB38 000000067F0000400200008A590000E5C000-000000067F0000400200008A590000E60000__00000073AD3FE6B8 000000067F0000400200008A590000E5C000-000000067F0000400200008A590000E60000__000000914E3F38F0 
000000067F0000400200008A590000E5C000-000000067F0000400200008A590000E60000__000000931B9A2710 000000067F0000400200008A590000E5D67C-000000067F0000400200008A590000E66056__0000001A0443DCD9-0000001AA3F3E569 000000067F0000400200008A590000E60000-000000067F0000400200008A590000E64000__0000001C760FA190 000000067F0000400200008A590000E60000-000000067F0000400200008A590000E64000__00000038E67ABFA0 000000067F0000400200008A590000E60000-000000067F0000400200008A590000E64000__0000003903F1CFE8 000000067F0000400200008A590000E60000-000000067F0000400200008A590000E64000__0000003B99F7F8A0 000000067F0000400200008A590000E60000-000000067F0000400200008A590000E64000__0000005D2FFFFB38 000000067F0000400200008A590000E60000-000000067F0000400200008A590000E64000__00000073AD3FE6B8 000000067F0000400200008A590000E60000-000000067F0000400200008A590000E64000__000000914E3F38F0 000000067F0000400200008A590000E60000-000000067F0000400200008A590000E64000__000000931B9A2710 000000067F0000400200008A590000E64000-000000067F0000400200008A590000E68000__0000001C725A2400 000000067F0000400200008A590000E64000-000000067F0000400200008A590000E68000__0000001C760FA190 000000067F0000400200008A590000E64000-000000067F0000400200008A590000E68000__00000038E67ABFA0 000000067F0000400200008A590000E64000-000000067F0000400200008A590000E68000__0000003903F1CFE8 000000067F0000400200008A590000E64000-000000067F0000400200008A590000E68000__0000003B99F7F8A0 000000067F0000400200008A590000E64000-000000067F0000400200008A590000E68000__0000005D2FFFFB38 000000067F0000400200008A590000E64000-000000067F0000400200008A590000E68000__00000073AD3FE6B8 000000067F0000400200008A590000E64000-000000067F0000400200008A590000E68000__000000914E3F38F0 000000067F0000400200008A590000E64000-000000067F0000400200008A590000E68000__000000931B9A2710 000000067F0000400200008A590000E66056-000000067F0000400200008A590100000000__0000001A0443DCD9-0000001AA3F3E569 000000067F0000400200008A590000E6632E-000000067F0000400200008A590000E6ED05__0000001AA3F3E569-0000001B43A3F241 
000000067F0000400200008A590000E68000-000000067F0000400200008A590000E6C000__0000001C725A2400 000000067F0000400200008A590000E68000-000000067F0000400200008A590000E6C000__0000001C760FA190 000000067F0000400200008A590000E68000-000000067F0000400200008A590000E6C000__00000038E67ABFA0 000000067F0000400200008A590000E68000-000000067F0000400200008A590000E6C000__0000003903F1CFE8 000000067F0000400200008A590000E68000-000000067F0000400200008A590000E6C000__0000003B99F7F8A0 000000067F0000400200008A590000E68000-000000067F0000400200008A590000E6C000__0000005D2FFFFB38 000000067F0000400200008A590000E68000-000000067F0000400200008A590000E6C000__00000073AD3FE6B8 000000067F0000400200008A590000E68000-000000067F0000400200008A590000E6C000__000000914E3F38F0 000000067F0000400200008A590000E68000-000000067F0000400200008A590000E6C000__000000931B9A2710 000000067F0000400200008A590000E6C000-000000067F0000400200008A590000E70000__0000001C725A2400 000000067F0000400200008A590000E6C000-000000067F0000400200008A590000E70000__0000001C760FA190 000000067F0000400200008A590000E6C000-000000067F0000400200008A590000E70000__00000038E67ABFA0 000000067F0000400200008A590000E6C000-000000067F0000400200008A590000E70000__0000003903F1CFE8 000000067F0000400200008A590000E6C000-000000067F0000400200008A590000E70000__0000003B99F7F8A0 000000067F0000400200008A590000E6C000-000000067F0000400200008A590000E70000__0000005D2FFFFB38 000000067F0000400200008A590000E6C000-000000067F0000400200008A590000E70000__00000073AD3FE6B8 000000067F0000400200008A590000E6C000-000000067F0000400200008A590000E70000__000000914E3F38F0 000000067F0000400200008A590000E6C000-000000067F0000400200008A590000E70000__000000931B9A2710 000000067F0000400200008A590000E6ED05-000000067F0000400200008A590000E776E1__0000001AA3F3E569-0000001B43A3F241 000000067F0000400200008A590000E70000-000000067F0000400200008A590000E74000__0000001C725A2400 000000067F0000400200008A590000E70000-000000067F0000400200008A590000E74000__0000001C760FA190 
000000067F0000400200008A590000E70000-000000067F0000400200008A590000E74000__00000038E67ABFA0 000000067F0000400200008A590000E70000-000000067F0000400200008A590000E74000__0000003903F1CFE8 000000067F0000400200008A590000E70000-000000067F0000400200008A590000E74000__0000003B99F7F8A0 000000067F0000400200008A590000E70000-000000067F0000400200008A590000E74000__0000005D2FFFFB38 000000067F0000400200008A590000E70000-000000067F0000400200008A590000E74000__00000073AD3FE6B8 000000067F0000400200008A590000E70000-000000067F0000400200008A590000E74000__000000914E3F38F0 000000067F0000400200008A590000E70000-000000067F0000400200008A590000E74000__000000931B9A2710 000000067F0000400200008A590000E74000-000000067F0000400200008A590000E78000__0000001C725A2400 000000067F0000400200008A590000E74000-000000067F0000400200008A590000E78000__0000001C760FA190 000000067F0000400200008A590000E74000-000000067F0000400200008A590000E78000__00000038E67ABFA0 000000067F0000400200008A590000E74000-000000067F0000400200008A590000E78000__0000003903F1CFE8 000000067F0000400200008A590000E74000-000000067F0000400200008A590000E78000__0000003B99F7F8A0 000000067F0000400200008A590000E74000-000000067F0000400200008A590000E78000__0000005D2FFFFB38 000000067F0000400200008A590000E74000-000000067F0000400200008A590000E78000__00000073AD3FE6B8 000000067F0000400200008A590000E74000-000000067F0000400200008A590000E78000__000000914E3F38F0 000000067F0000400200008A590000E74000-000000067F0000400200008A590000E78000__000000931B9A2710 000000067F0000400200008A590000E776E1-000000067F0000400200008A590000E800BC__0000001AA3F3E569-0000001B43A3F241 000000067F0000400200008A590000E78000-000000067F0000400200008A590000E7C000__0000001C725A2400 000000067F0000400200008A590000E78000-000000067F0000400200008A590000E7C000__0000001C760FA190 000000067F0000400200008A590000E78000-000000067F0000400200008A590000E7C000__00000038E67ABFA0 000000067F0000400200008A590000E78000-000000067F0000400200008A590000E7C000__0000003903F1CFE8 
000000067F0000400200008A590000E78000-000000067F0000400200008A590000E7C000__0000003B99F7F8A0 000000067F0000400200008A590000E78000-000000067F0000400200008A590000E7C000__0000005D2FFFFB38 000000067F0000400200008A590000E78000-000000067F0000400200008A590000E7C000__00000073AD3FE6B8 000000067F0000400200008A590000E78000-000000067F0000400200008A590000E7C000__000000914E3F38F0 000000067F0000400200008A590000E78000-000000067F0000400200008A590000E7C000__000000931B9A2710 000000067F0000400200008A590000E7C000-000000067F0000400200008A590000E80000__0000001C725A2400 000000067F0000400200008A590000E7C000-000000067F0000400200008A590000E80000__0000001C760FA190 000000067F0000400200008A590000E7C000-000000067F0000400200008A590000E80000__00000038E67ABFA0 000000067F0000400200008A590000E7C000-000000067F0000400200008A590000E80000__0000003903F1CFE8 000000067F0000400200008A590000E7C000-000000067F0000400200008A590000E80000__0000003B99F7F8A0 000000067F0000400200008A590000E7C000-000000067F0000400200008A590000E80000__0000005D2FFFFB38 000000067F0000400200008A590000E7C000-000000067F0000400200008A590000E80000__00000073AD3FE6B8 000000067F0000400200008A590000E7C000-000000067F0000400200008A590000E80000__000000914E3F38F0 000000067F0000400200008A590000E7C000-000000067F0000400200008A590000E80000__000000931B9A2710 000000067F0000400200008A590000E80000-000000067F0000400200008A590000E84000__0000001C725A2400 000000067F0000400200008A590000E80000-000000067F0000400200008A590000E84000__0000001C760FA190 000000067F0000400200008A590000E80000-000000067F0000400200008A590000E84000__00000038E67ABFA0 000000067F0000400200008A590000E80000-000000067F0000400200008A590000E84000__0000003903F1CFE8 000000067F0000400200008A590000E80000-000000067F0000400200008A590000E84000__0000003B99F7F8A0 000000067F0000400200008A590000E80000-000000067F0000400200008A590000E84000__0000005D2FFFFB38 000000067F0000400200008A590000E80000-000000067F0000400200008A590000E84000__00000073AD3FE6B8 
000000067F0000400200008A590000E80000-000000067F0000400200008A590000E84000__000000914E3F38F0 000000067F0000400200008A590000E80000-000000067F0000400200008A590000E84000__000000931B9A2710 000000067F0000400200008A590000E800BC-000000067F0000400200008A590000E88A9D__0000001AA3F3E569-0000001B43A3F241 000000067F0000400200008A590000E84000-000000067F0000400200008A590000E88000__0000001C725A2400 000000067F0000400200008A590000E84000-000000067F0000400200008A590000E88000__0000001C760FA190 000000067F0000400200008A590000E84000-000000067F0000400200008A590000E88000__00000038E67ABFA0 000000067F0000400200008A590000E84000-000000067F0000400200008A590000E88000__0000003903F1CFE8 000000067F0000400200008A590000E84000-000000067F0000400200008A590000E88000__0000003B99F7F8A0 000000067F0000400200008A590000E84000-000000067F0000400200008A590000E88000__0000005D2FFFFB38 000000067F0000400200008A590000E84000-000000067F0000400200008A590000E88000__00000073AD3FE6B8 000000067F0000400200008A590000E84000-000000067F0000400200008A590000E88000__000000914E3F38F0 000000067F0000400200008A590000E84000-000000067F0000400200008A590000E88000__000000931B9A2710 000000067F0000400200008A590000E88000-000000067F0000400200008A590000E8C000__0000001C725A2400 000000067F0000400200008A590000E88000-000000067F0000400200008A590000E8C000__0000001C760FA190 000000067F0000400200008A590000E88000-000000067F0000400200008A590000E8C000__00000038E67ABFA0 000000067F0000400200008A590000E88000-000000067F0000400200008A590000E8C000__0000003903F1CFE8 000000067F0000400200008A590000E88000-000000067F0000400200008A590000E8C000__0000003B99F7F8A0 000000067F0000400200008A590000E88000-000000067F0000400200008A590000E8C000__0000005D2FFFFB38 000000067F0000400200008A590000E88000-000000067F0000400200008A590000E8C000__00000073AD3FE6B8 000000067F0000400200008A590000E88000-000000067F0000400200008A590000E8C000__000000914E3F38F0 000000067F0000400200008A590000E88000-000000067F0000400200008A590000E8C000__000000931B9A2710 
000000067F0000400200008A590000E88A9D-000000067F0000400200008A590000E91484__0000001AA3F3E569-0000001B43A3F241 000000067F0000400200008A590000E8C000-000000067F0000400200008A590000E90000__0000001C725A2400 000000067F0000400200008A590000E8C000-000000067F0000400200008A590000E90000__0000001C760FA190 000000067F0000400200008A590000E8C000-000000067F0000400200008A590000E90000__00000038E67ABFA0 000000067F0000400200008A590000E8C000-000000067F0000400200008A590000E90000__0000003903F1CFE8 000000067F0000400200008A590000E8C000-000000067F0000400200008A590000E90000__0000003B99F7F8A0 000000067F0000400200008A590000E8C000-000000067F0000400200008A590000E90000__0000005D2FFFFB38 000000067F0000400200008A590000E8C000-000000067F0000400200008A590000E90000__00000073AD3FE6B8 000000067F0000400200008A590000E8C000-000000067F0000400200008A590000E90000__000000914E3F38F0 000000067F0000400200008A590000E8C000-000000067F0000400200008A590000E90000__000000931B9A2710 000000067F0000400200008A590000E90000-000000067F0000400200008A590000E94000__0000001C725A2400 000000067F0000400200008A590000E90000-000000067F0000400200008A590000E94000__0000001C760FA190 000000067F0000400200008A590000E90000-000000067F0000400200008A590000E94000__00000038E67ABFA0 000000067F0000400200008A590000E90000-000000067F0000400200008A590000E94000__0000003903F1CFE8 000000067F0000400200008A590000E90000-000000067F0000400200008A590000E94000__0000003B99F7F8A0 000000067F0000400200008A590000E90000-000000067F0000400200008A590000E94000__0000005D2FFFFB38 000000067F0000400200008A590000E90000-000000067F0000400200008A590000E94000__00000073AD3FE6B8 000000067F0000400200008A590000E90000-000000067F0000400200008A590000E94000__000000914E3F38F0 000000067F0000400200008A590000E90000-000000067F0000400200008A590000E94000__000000931B9A2710 000000067F0000400200008A590000E91484-000000067F0000400200008A590000E99E65__0000001AA3F3E569-0000001B43A3F241 000000067F0000400200008A590000E94000-000000067F0000400200008A590000E98000__0000001C725A2400 
000000067F0000400200008A590000E94000-000000067F0000400200008A590000E98000__0000001C760FA190 000000067F0000400200008A590000E94000-000000067F0000400200008A590000E98000__00000038E67ABFA0 000000067F0000400200008A590000E94000-000000067F0000400200008A590000E98000__0000003903F1CFE8 000000067F0000400200008A590000E94000-000000067F0000400200008A590000E98000__0000003B99F7F8A0 000000067F0000400200008A590000E94000-000000067F0000400200008A590000E98000__0000005D2FFFFB38 000000067F0000400200008A590000E94000-000000067F0000400200008A590000E98000__00000073AD3FE6B8 000000067F0000400200008A590000E94000-000000067F0000400200008A590000E98000__000000914E3F38F0 000000067F0000400200008A590000E94000-000000067F0000400200008A590000E98000__000000931B9A2710 000000067F0000400200008A590000E98000-000000067F0000400200008A590000E9C000__0000001C725A2400 000000067F0000400200008A590000E98000-000000067F0000400200008A590000E9C000__0000001C760FA190 000000067F0000400200008A590000E98000-000000067F0000400200008A590000E9C000__00000038E67ABFA0 000000067F0000400200008A590000E98000-000000067F0000400200008A590000E9C000__0000003903F1CFE8 000000067F0000400200008A590000E98000-000000067F0000400200008A590000E9C000__0000003B99F7F8A0 000000067F0000400200008A590000E98000-000000067F0000400200008A590000E9C000__0000005D2FFFFB38 000000067F0000400200008A590000E98000-000000067F0000400200008A590000E9C000__00000073AD3FE6B8 000000067F0000400200008A590000E98000-000000067F0000400200008A590000E9C000__000000914E3F38F0 000000067F0000400200008A590000E98000-000000067F0000400200008A590000E9C000__000000931B9A2710 000000067F0000400200008A590000E99E65-000000067F0000400200008A590000EA2841__0000001AA3F3E569-0000001B43A3F241 000000067F0000400200008A590000E9C000-000000067F0000400200008A590000EA0000__0000001C725A2400 000000067F0000400200008A590000E9C000-000000067F0000400200008A590000EA0000__0000001C760FA190 000000067F0000400200008A590000E9C000-000000067F0000400200008A590000EA0000__00000038E67ABFA0 
000000067F0000400200008A590000E9C000-000000067F0000400200008A590000EA0000__0000003903F1CFE8 000000067F0000400200008A590000E9C000-000000067F0000400200008A590000EA0000__0000003B99F7F8A0 000000067F0000400200008A590000E9C000-000000067F0000400200008A590000EA0000__0000005D2FFFFB38 000000067F0000400200008A590000E9C000-000000067F0000400200008A590000EA0000__00000073AD3FE6B8 000000067F0000400200008A590000E9C000-000000067F0000400200008A590000EA0000__000000914E3F38F0 000000067F0000400200008A590000E9C000-000000067F0000400200008A590000EA0000__000000931B9A2710 000000067F0000400200008A590000EA0000-000000067F0000400200008A590000EA4000__0000001C725A2400 000000067F0000400200008A590000EA0000-000000067F0000400200008A590000EA4000__0000001C760FA190 000000067F0000400200008A590000EA0000-000000067F0000400200008A590000EA4000__00000038E67ABFA0 000000067F0000400200008A590000EA0000-000000067F0000400200008A590000EA4000__0000003903F1CFE8 000000067F0000400200008A590000EA0000-000000067F0000400200008A590000EA4000__0000003B99F7F8A0 000000067F0000400200008A590000EA0000-000000067F0000400200008A590000EA4000__0000005D2FFFFB38 000000067F0000400200008A590000EA0000-000000067F0000400200008A590000EA4000__00000073AD3FE6B8 000000067F0000400200008A590000EA0000-000000067F0000400200008A590000EA4000__000000914E3F38F0 000000067F0000400200008A590000EA0000-000000067F0000400200008A590000EA4000__000000931B9A2710 000000067F0000400200008A590000EA2841-000000067F0000400200008A590000EAB20E__0000001AA3F3E569-0000001B43A3F241 000000067F0000400200008A590000EA4000-000000067F0000400200008A590000EA8000__0000001C725A2400 000000067F0000400200008A590000EA4000-000000067F0000400200008A590000EA8000__0000001C760FA190 000000067F0000400200008A590000EA4000-000000067F0000400200008A590000EA8000__00000038E67ABFA0 000000067F0000400200008A590000EA4000-000000067F0000400200008A590000EA8000__0000003903F1CFE8 000000067F0000400200008A590000EA4000-000000067F0000400200008A590000EA8000__0000003B99F7F8A0 
000000067F0000400200008A590000EA4000-000000067F0000400200008A590000EA8000__0000005D2FFFFB38 000000067F0000400200008A590000EA4000-000000067F0000400200008A590000EA8000__00000073AD3FE6B8 000000067F0000400200008A590000EA4000-000000067F0000400200008A590000EA8000__000000914E3F38F0 000000067F0000400200008A590000EA4000-000000067F0000400200008A590000EA8000__000000931B9A2710 000000067F0000400200008A590000EA8000-000000067F0000400200008A590000EAC000__0000001C725A2400 000000067F0000400200008A590000EA8000-000000067F0000400200008A590000EAC000__0000001C760FA190 000000067F0000400200008A590000EA8000-000000067F0000400200008A590000EAC000__00000038E67ABFA0 000000067F0000400200008A590000EA8000-000000067F0000400200008A590000EAC000__0000003903F1CFE8 000000067F0000400200008A590000EA8000-000000067F0000400200008A590000EAC000__0000003B99F7F8A0 000000067F0000400200008A590000EA8000-000000067F0000400200008A590000EAC000__0000005D2FFFFB38 000000067F0000400200008A590000EA8000-000000067F0000400200008A590000EAC000__00000073AD3FE6B8 000000067F0000400200008A590000EA8000-000000067F0000400200008A590000EAC000__000000914E3F38F0 000000067F0000400200008A590000EA8000-000000067F0000400200008A590000EAC000__000000931B9A2710 000000067F0000400200008A590000EAB20E-000000067F0000400200008A590000EB3BEC__0000001AA3F3E569-0000001B43A3F241 000000067F0000400200008A590000EAC000-000000067F0000400200008A590000EB0000__0000001C725A2400 000000067F0000400200008A590000EAC000-000000067F0000400200008A590000EB0000__0000001C760FA190 000000067F0000400200008A590000EAC000-000000067F0000400200008A590000EB0000__00000038E67ABFA0 000000067F0000400200008A590000EAC000-000000067F0000400200008A590000EB0000__0000003903F1CFE8 000000067F0000400200008A590000EAC000-000000067F0000400200008A590000EB0000__0000003B99F7F8A0 000000067F0000400200008A590000EAC000-000000067F0000400200008A590000EB0000__0000005D2FFFFB38 000000067F0000400200008A590000EAC000-000000067F0000400200008A590000EB0000__00000073AD3FE6B8 
000000067F0000400200008A590000EAC000-000000067F0000400200008A590000EB0000__000000914E3F38F0 000000067F0000400200008A590000EAC000-000000067F0000400200008A590000EB0000__000000931B9A2710 000000067F0000400200008A590000EB0000-000000067F0000400200008A590000EB4000__0000001C725A2400 000000067F0000400200008A590000EB0000-000000067F0000400200008A590000EB4000__0000001C760FA190 000000067F0000400200008A590000EB0000-000000067F0000400200008A590000EB4000__00000038E67ABFA0 000000067F0000400200008A590000EB0000-000000067F0000400200008A590000EB4000__0000003903F1CFE8 000000067F0000400200008A590000EB0000-000000067F0000400200008A590000EB4000__0000003B99F7F8A0 000000067F0000400200008A590000EB0000-000000067F0000400200008A590000EB4000__0000005D2FFFFB38 000000067F0000400200008A590000EB0000-000000067F0000400200008A590000EB4000__00000073AD3FE6B8 000000067F0000400200008A590000EB0000-000000067F0000400200008A590000EB4000__000000914E3F38F0 000000067F0000400200008A590000EB0000-000000067F0000400200008A590000EB4000__000000931B9A2710 000000067F0000400200008A590000EB3BEC-000000067F0000400200008A590000EBC5C4__0000001AA3F3E569-0000001B43A3F241 000000067F0000400200008A590000EB4000-000000067F0000400200008A590000EB8000__0000001C725A2400 000000067F0000400200008A590000EB4000-000000067F0000400200008A590000EB8000__0000001C760FA190 000000067F0000400200008A590000EB4000-000000067F0000400200008A590000EB8000__00000038E67ABFA0 000000067F0000400200008A590000EB4000-000000067F0000400200008A590000EB8000__0000003903F1CFE8 000000067F0000400200008A590000EB4000-000000067F0000400200008A590000EB8000__0000003B99F7F8A0 000000067F0000400200008A590000EB4000-000000067F0000400200008A590000EB8000__0000005D2FFFFB38 000000067F0000400200008A590000EB4000-000000067F0000400200008A590000EB8000__00000073AD3FE6B8 000000067F0000400200008A590000EB4000-000000067F0000400200008A590000EB8000__000000914E3F38F0 000000067F0000400200008A590000EB4000-000000067F0000400200008A590000EB8000__000000931B9A2710 
000000067F0000400200008A590000EB8000-000000067F0000400200008A590000EBC000__0000001C725A2400 000000067F0000400200008A590000EB8000-000000067F0000400200008A590000EBC000__0000001C760FA190 000000067F0000400200008A590000EB8000-000000067F0000400200008A590000EBC000__00000038E67ABFA0 000000067F0000400200008A590000EB8000-000000067F0000400200008A590000EBC000__0000003903F1CFE8 000000067F0000400200008A590000EB8000-000000067F0000400200008A590000EBC000__0000003B99F7F8A0 000000067F0000400200008A590000EB8000-000000067F0000400200008A590000EBC000__0000005D2FFFFB38 000000067F0000400200008A590000EB8000-000000067F0000400200008A590000EBC000__00000073AD3FE6B8 000000067F0000400200008A590000EB8000-000000067F0000400200008A590000EBC000__000000914E3F38F0 000000067F0000400200008A590000EB8000-000000067F0000400200008A590000EBC000__000000931B9A2710 000000067F0000400200008A590000EBC000-000000067F0000400200008A590000EC0000__0000001C046BD098 000000067F0000400200008A590000EBC000-000000067F0000400200008A590000EC0000__0000001C760FA190 000000067F0000400200008A590000EBC000-000000067F0000400200008A590000EC0000__00000038E67ABFA0 000000067F0000400200008A590000EBC000-000000067F0000400200008A590000EC0000__0000003903F1CFE8 000000067F0000400200008A590000EBC000-000000067F0000400200008A590000EC0000__0000003B99F7F8A0 000000067F0000400200008A590000EBC000-000000067F0000400200008A590000EC0000__0000005D2FFFFB38 000000067F0000400200008A590000EBC000-000000067F0000400200008A590000EC0000__00000073AD3FE6B8 000000067F0000400200008A590000EBC000-000000067F0000400200008A590000EC0000__000000914E3F38F0 000000067F0000400200008A590000EBC000-000000067F0000400200008A590000EC0000__000000931B9A2710 000000067F0000400200008A590000EBC5C4-000000067F0000400200008A590100000000__0000001AA3F3E569-0000001B43A3F241 000000067F0000400200008A590000EBC8A4-000000067F0000400200008A590000EC527C__0000001B43A3F241-0000001BE353E181 000000067F0000400200008A590000EC0000-000000067F0000400200008A590000EC4000__0000001C046BD098 
000000067F0000400200008A590000EC0000-000000067F0000400200008A590000EC4000__0000001C760FA190 000000067F0000400200008A590000EC0000-000000067F0000400200008A590000EC4000__00000038E67ABFA0 000000067F0000400200008A590000EC0000-000000067F0000400200008A590000EC4000__0000003903F1CFE8 000000067F0000400200008A590000EC0000-000000067F0000400200008A590000EC4000__0000003B99F7F8A0 000000067F0000400200008A590000EC0000-000000067F0000400200008A590000EC4000__0000005D2FFFFB38 000000067F0000400200008A590000EC0000-000000067F0000400200008A590000EC4000__00000073AD3FE6B8 000000067F0000400200008A590000EC0000-000000067F0000400200008A590000EC4000__000000914E3F38F0 000000067F0000400200008A590000EC0000-000000067F0000400200008A590000EC4000__000000931B9A2710 000000067F0000400200008A590000EC4000-000000067F0000400200008A590000EC8000__0000001C046BD098 000000067F0000400200008A590000EC4000-000000067F0000400200008A590000EC8000__0000001C760FA190 000000067F0000400200008A590000EC4000-000000067F0000400200008A590000EC8000__00000038E67ABFA0 000000067F0000400200008A590000EC4000-000000067F0000400200008A590000EC8000__0000003903F1CFE8 000000067F0000400200008A590000EC4000-000000067F0000400200008A590000EC8000__0000003B99F7F8A0 000000067F0000400200008A590000EC4000-000000067F0000400200008A590000EC8000__0000005D2FFFFB38 000000067F0000400200008A590000EC4000-000000067F0000400200008A590000EC8000__00000073AD3FE6B8 000000067F0000400200008A590000EC4000-000000067F0000400200008A590000EC8000__000000914E3F38F0 000000067F0000400200008A590000EC4000-000000067F0000400200008A590000EC8000__000000931B9A2710 000000067F0000400200008A590000EC527C-000000067F0000400200008A590000ECDC5F__0000001B43A3F241-0000001BE353E181 000000067F0000400200008A590000EC8000-000000067F0000400200008A590000ECC000__0000001C046BD098 000000067F0000400200008A590000EC8000-000000067F0000400200008A590000ECC000__0000001C760FA190 000000067F0000400200008A590000EC8000-000000067F0000400200008A590000ECC000__00000038E67ABFA0 
000000067F0000400200008A590000EC8000-000000067F0000400200008A590000ECC000__0000003903F1CFE8 000000067F0000400200008A590000EC8000-000000067F0000400200008A590000ECC000__0000003B99F7F8A0 000000067F0000400200008A590000EC8000-000000067F0000400200008A590000ECC000__0000005D2FFFFB38 000000067F0000400200008A590000EC8000-000000067F0000400200008A590000ECC000__00000073AD3FE6B8 000000067F0000400200008A590000EC8000-000000067F0000400200008A590000ECC000__000000914E3F38F0 000000067F0000400200008A590000EC8000-000000067F0000400200008A590000ECC000__000000931B9A2710 000000067F0000400200008A590000ECC000-000000067F0000400200008A590000ED0000__0000001C046BD098 000000067F0000400200008A590000ECC000-000000067F0000400200008A590000ED0000__0000001C760FA190 000000067F0000400200008A590000ECC000-000000067F0000400200008A590000ED0000__00000038E67ABFA0 000000067F0000400200008A590000ECC000-000000067F0000400200008A590000ED0000__0000003903F1CFE8 000000067F0000400200008A590000ECC000-000000067F0000400200008A590000ED0000__0000003B99F7F8A0 000000067F0000400200008A590000ECC000-000000067F0000400200008A590000ED0000__0000005D2FFFFB38 000000067F0000400200008A590000ECC000-000000067F0000400200008A590000ED0000__00000073AD3FE6B8 000000067F0000400200008A590000ECC000-000000067F0000400200008A590000ED0000__000000914E3F38F0 000000067F0000400200008A590000ECC000-000000067F0000400200008A590000ED0000__000000931B9A2710 000000067F0000400200008A590000ECDC5F-000000067F0000400200008A590000ED663C__0000001B43A3F241-0000001BE353E181 000000067F0000400200008A590000ED0000-000000067F0000400200008A590000ED4000__0000001C046BD098 000000067F0000400200008A590000ED0000-000000067F0000400200008A590000ED4000__0000001C760FA190 000000067F0000400200008A590000ED0000-000000067F0000400200008A590000ED4000__00000038E67ABFA0 000000067F0000400200008A590000ED0000-000000067F0000400200008A590000ED4000__0000003903F1CFE8 000000067F0000400200008A590000ED0000-000000067F0000400200008A590000ED4000__0000003B99F7F8A0 
000000067F0000400200008A590000ED0000-000000067F0000400200008A590000ED4000__0000005D2FFFFB38 000000067F0000400200008A590000ED0000-000000067F0000400200008A590000ED4000__00000073AD3FE6B8 000000067F0000400200008A590000ED0000-000000067F0000400200008A590000ED4000__000000914E3F38F0 000000067F0000400200008A590000ED0000-000000067F0000400200008A590000ED4000__000000931B9A2710 000000067F0000400200008A590000ED4000-000000067F0000400200008A590000ED8000__0000001C046BD098 000000067F0000400200008A590000ED4000-000000067F0000400200008A590000ED8000__0000001C760FA190 000000067F0000400200008A590000ED4000-000000067F0000400200008A590000ED8000__00000038E67ABFA0 000000067F0000400200008A590000ED4000-000000067F0000400200008A590000ED8000__0000003903F1CFE8 000000067F0000400200008A590000ED4000-000000067F0000400200008A590000ED8000__0000003B99F7F8A0 000000067F0000400200008A590000ED4000-000000067F0000400200008A590000ED8000__0000005D2FFFFB38 000000067F0000400200008A590000ED4000-000000067F0000400200008A590000ED8000__00000073AD3FE6B8 000000067F0000400200008A590000ED4000-000000067F0000400200008A590000ED8000__000000914E3F38F0 000000067F0000400200008A590000ED4000-000000067F0000400200008A590000ED8000__000000931B9A2710 000000067F0000400200008A590000ED663C-000000067F0000400200008A590000EDF017__0000001B43A3F241-0000001BE353E181 000000067F0000400200008A590000ED8000-000000067F0000400200008A590000EDC000__0000001C046BD098 000000067F0000400200008A590000ED8000-000000067F0000400200008A590000EDC000__0000001C760FA190 000000067F0000400200008A590000ED8000-000000067F0000400200008A590000EDC000__00000038E67ABFA0 000000067F0000400200008A590000ED8000-000000067F0000400200008A590000EDC000__0000003903F1CFE8 000000067F0000400200008A590000ED8000-000000067F0000400200008A590000EDC000__0000003B99F7F8A0 000000067F0000400200008A590000ED8000-000000067F0000400200008A590000EDC000__0000005D2FFFFB38 000000067F0000400200008A590000ED8000-000000067F0000400200008A590000EDC000__00000073AD3FE6B8 
000000067F0000400200008A590000ED8000-000000067F0000400200008A590000EDC000__000000914E3F38F0 000000067F0000400200008A590000ED8000-000000067F0000400200008A590000EDC000__000000931B9A2710 000000067F0000400200008A590000EDC000-000000067F0000400200008A590000EE0000__0000001C046BD098 000000067F0000400200008A590000EDC000-000000067F0000400200008A590000EE0000__0000001C760FA190 000000067F0000400200008A590000EDC000-000000067F0000400200008A590000EE0000__00000038E67ABFA0 000000067F0000400200008A590000EDC000-000000067F0000400200008A590000EE0000__0000003903F1CFE8 000000067F0000400200008A590000EDC000-000000067F0000400200008A590000EE0000__0000003B99F7F8A0 000000067F0000400200008A590000EDC000-000000067F0000400200008A590000EE0000__0000005D2FFFFB38 000000067F0000400200008A590000EDC000-000000067F0000400200008A590000EE0000__00000073AD3FE6B8 000000067F0000400200008A590000EDC000-000000067F0000400200008A590000EE0000__000000914E3F38F0 000000067F0000400200008A590000EDC000-000000067F0000400200008A590000EE0000__000000931B9A2710 000000067F0000400200008A590000EDF017-000000067F0000400200008A590000EE79E6__0000001B43A3F241-0000001BE353E181 000000067F0000400200008A590000EE0000-000000067F0000400200008A590000EE4000__0000001C046BD098 000000067F0000400200008A590000EE0000-000000067F0000400200008A590000EE4000__0000001C760FA190 000000067F0000400200008A590000EE0000-000000067F0000400200008A590000EE4000__00000038E67ABFA0 000000067F0000400200008A590000EE0000-000000067F0000400200008A590000EE4000__0000003903F1CFE8 000000067F0000400200008A590000EE0000-000000067F0000400200008A590000EE4000__0000003B99F7F8A0 000000067F0000400200008A590000EE0000-000000067F0000400200008A590000EE4000__0000005D2FFFFB38 000000067F0000400200008A590000EE0000-000000067F0000400200008A590000EE4000__00000073AD3FE6B8 000000067F0000400200008A590000EE0000-000000067F0000400200008A590000EE4000__000000914E3F38F0 000000067F0000400200008A590000EE0000-000000067F0000400200008A590000EE4000__000000931B9A2710 
000000067F0000400200008A590000EE4000-000000067F0000400200008A590000EE8000__0000001C046BD098 000000067F0000400200008A590000EE4000-000000067F0000400200008A590000EE8000__0000001C760FA190 000000067F0000400200008A590000EE4000-000000067F0000400200008A590000EE8000__00000038E67ABFA0 000000067F0000400200008A590000EE4000-000000067F0000400200008A590000EE8000__0000003903F1CFE8 000000067F0000400200008A590000EE4000-000000067F0000400200008A590000EE8000__0000003B99F7F8A0 000000067F0000400200008A590000EE4000-000000067F0000400200008A590000EE8000__0000005D2FFFFB38 000000067F0000400200008A590000EE4000-000000067F0000400200008A590000EE8000__00000073AD3FE6B8 000000067F0000400200008A590000EE4000-000000067F0000400200008A590000EE8000__000000914E3F38F0 000000067F0000400200008A590000EE4000-000000067F0000400200008A590000EE8000__000000931B9A2710 000000067F0000400200008A590000EE79E6-000000067F0000400200008A590000EF03CB__0000001B43A3F241-0000001BE353E181 000000067F0000400200008A590000EE8000-000000067F0000400200008A590000EEC000__0000001C046BD098 000000067F0000400200008A590000EE8000-000000067F0000400200008A590000EEC000__0000001C760FA190 000000067F0000400200008A590000EE8000-000000067F0000400200008A590000EEC000__00000038E67ABFA0 000000067F0000400200008A590000EE8000-000000067F0000400200008A590000EEC000__0000003903F1CFE8 000000067F0000400200008A590000EE8000-000000067F0000400200008A590000EEC000__0000003B99F7F8A0 000000067F0000400200008A590000EE8000-000000067F0000400200008A590000EEC000__0000005D2FFFFB38 000000067F0000400200008A590000EE8000-000000067F0000400200008A590000EEC000__00000073AD3FE6B8 000000067F0000400200008A590000EE8000-000000067F0000400200008A590000EEC000__000000914E3F38F0 000000067F0000400200008A590000EE8000-000000067F0000400200008A590000EEC000__000000931B9A2710 000000067F0000400200008A590000EEC000-000000067F0000400200008A590000EF0000__0000001C046BD098 000000067F0000400200008A590000EEC000-000000067F0000400200008A590000EF0000__0000001C760FA190 
000000067F0000400200008A590000EEC000-000000067F0000400200008A590000EF0000__00000038E67ABFA0 000000067F0000400200008A590000EEC000-000000067F0000400200008A590000EF0000__0000003903F1CFE8 000000067F0000400200008A590000EEC000-000000067F0000400200008A590000EF0000__0000003B99F7F8A0 000000067F0000400200008A590000EEC000-000000067F0000400200008A590000EF0000__0000005D2FFFFB38 000000067F0000400200008A590000EEC000-000000067F0000400200008A590000EF0000__00000073AD3FE6B8 000000067F0000400200008A590000EEC000-000000067F0000400200008A590000EF0000__000000914E3F38F0 000000067F0000400200008A590000EEC000-000000067F0000400200008A590000EF0000__000000931B9A2710 000000067F0000400200008A590000EF0000-000000067F0000400200008A590000EF4000__0000001C046BD098 000000067F0000400200008A590000EF0000-000000067F0000400200008A590000EF4000__0000001C760FA190 000000067F0000400200008A590000EF0000-000000067F0000400200008A590000EF4000__00000038E67ABFA0 000000067F0000400200008A590000EF0000-000000067F0000400200008A590000EF4000__0000003903F1CFE8 000000067F0000400200008A590000EF0000-000000067F0000400200008A590000EF4000__0000003B99F7F8A0 000000067F0000400200008A590000EF0000-000000067F0000400200008A590000EF4000__0000005D2FFFFB38 000000067F0000400200008A590000EF0000-000000067F0000400200008A590000EF4000__00000073AD3FE6B8 000000067F0000400200008A590000EF0000-000000067F0000400200008A590000EF4000__000000914E3F38F0 000000067F0000400200008A590000EF0000-000000067F0000400200008A590000EF4000__000000931B9A2710 000000067F0000400200008A590000EF03CB-000000067F0000400200008A590000EF8DAC__0000001B43A3F241-0000001BE353E181 000000067F0000400200008A590000EF4000-000000067F0000400200008A590000EF8000__0000001C046BD098 000000067F0000400200008A590000EF4000-000000067F0000400200008A590000EF8000__0000001C760FA190 000000067F0000400200008A590000EF4000-000000067F0000400200008A590000EF8000__00000038E67ABFA0 000000067F0000400200008A590000EF4000-000000067F0000400200008A590000EF8000__0000003903F1CFE8 
000000067F0000400200008A590000EF4000-000000067F0000400200008A590000EF8000__0000003B99F7F8A0 000000067F0000400200008A590000EF4000-000000067F0000400200008A590000EF8000__0000005D2FFFFB38 000000067F0000400200008A590000EF4000-000000067F0000400200008A590000EF8000__00000073AD3FE6B8 000000067F0000400200008A590000EF4000-000000067F0000400200008A590000EF8000__000000914E3F38F0 000000067F0000400200008A590000EF4000-000000067F0000400200008A590000EF8000__000000931B9A2710 000000067F0000400200008A590000EF8000-000000067F0000400200008A590000EFC000__0000001C046BD098 000000067F0000400200008A590000EF8000-000000067F0000400200008A590000EFC000__0000001C760FA190 000000067F0000400200008A590000EF8000-000000067F0000400200008A590000EFC000__00000038E67ABFA0 000000067F0000400200008A590000EF8000-000000067F0000400200008A590000EFC000__0000003903F1CFE8 000000067F0000400200008A590000EF8000-000000067F0000400200008A590000EFC000__0000003B99F7F8A0 000000067F0000400200008A590000EF8000-000000067F0000400200008A590000EFC000__0000005D2FFFFB38 000000067F0000400200008A590000EF8000-000000067F0000400200008A590000EFC000__00000073AD3FE6B8 000000067F0000400200008A590000EF8000-000000067F0000400200008A590000EFC000__000000914E3F38F0 000000067F0000400200008A590000EF8000-000000067F0000400200008A590000EFC000__000000931B9A2710 000000067F0000400200008A590000EF8DAC-000000067F0000400200008A590000F01798__0000001B43A3F241-0000001BE353E181 000000067F0000400200008A590000EFC000-000000067F0000400200008A590000F00000__0000001C046BD098 000000067F0000400200008A590000EFC000-000000067F0000400200008A590000F00000__0000001C760FA190 000000067F0000400200008A590000EFC000-000000067F0000400200008A590000F00000__00000038E67ABFA0 000000067F0000400200008A590000EFC000-000000067F0000400200008A590000F00000__0000003903F1CFE8 000000067F0000400200008A590000EFC000-000000067F0000400200008A590000F00000__0000003B99F7F8A0 000000067F0000400200008A590000EFC000-000000067F0000400200008A590000F00000__0000005D2FFFFB38 
000000067F0000400200008A590000EFC000-000000067F0000400200008A590000F00000__00000073AD3FE6B8 000000067F0000400200008A590000EFC000-000000067F0000400200008A590000F00000__000000914E3F38F0 000000067F0000400200008A590000EFC000-000000067F0000400200008A590000F00000__000000931B9A2710 000000067F0000400200008A590000F00000-000000067F0000400200008A590000F04000__0000001C046BD098 000000067F0000400200008A590000F00000-000000067F0000400200008A590000F04000__0000001C760FA190 000000067F0000400200008A590000F00000-000000067F0000400200008A590000F04000__00000038E67ABFA0 000000067F0000400200008A590000F00000-000000067F0000400200008A590000F04000__0000003903F1CFE8 000000067F0000400200008A590000F00000-000000067F0000400200008A590000F04000__0000003B99F7F8A0 000000067F0000400200008A590000F00000-000000067F0000400200008A590000F04000__0000005D2FFFFB38 000000067F0000400200008A590000F00000-000000067F0000400200008A590000F04000__00000073AD3FE6B8 000000067F0000400200008A590000F00000-000000067F0000400200008A590000F04000__000000914E3F38F0 000000067F0000400200008A590000F00000-000000067F0000400200008A590000F04000__000000931B9A2710 000000067F0000400200008A590000F01798-000000067F0000400200008A590000F0A18F__0000001B43A3F241-0000001BE353E181 000000067F0000400200008A590000F04000-000000067F0000400200008A590000F08000__0000001C046BD098 000000067F0000400200008A590000F04000-000000067F0000400200008A590000F08000__0000001C760FA190 000000067F0000400200008A590000F04000-000000067F0000400200008A590000F08000__00000038E67ABFA0 000000067F0000400200008A590000F04000-000000067F0000400200008A590000F08000__0000003903F1CFE8 000000067F0000400200008A590000F04000-000000067F0000400200008A590000F08000__0000003B99F7F8A0 000000067F0000400200008A590000F04000-000000067F0000400200008A590000F08000__0000005D2FFFFB38 000000067F0000400200008A590000F04000-000000067F0000400200008A590000F08000__00000073AD3FE6B8 000000067F0000400200008A590000F04000-000000067F0000400200008A590000F08000__000000914E3F38F0 
000000067F0000400200008A590000F04000-000000067F0000400200008A590000F08000__000000931B9A2710 000000067F0000400200008A590000F08000-000000067F0000400200008A590000F0C000__0000001C046BD098 000000067F0000400200008A590000F08000-000000067F0000400200008A590000F0C000__0000001C760FA190 000000067F0000400200008A590000F08000-000000067F0000400200008A590000F0C000__00000038E67ABFA0 000000067F0000400200008A590000F08000-000000067F0000400200008A590000F0C000__0000003903F1CFE8 000000067F0000400200008A590000F08000-000000067F0000400200008A590000F0C000__0000003B99F7F8A0 000000067F0000400200008A590000F08000-000000067F0000400200008A590000F0C000__0000005D2FFFFB38 000000067F0000400200008A590000F08000-000000067F0000400200008A590000F0C000__00000073AD3FE6B8 000000067F0000400200008A590000F08000-000000067F0000400200008A590000F0C000__000000914E3F38F0 000000067F0000400200008A590000F08000-000000067F0000400200008A590000F0C000__000000931B9A2710 000000067F0000400200008A590000F0A18F-000000067F0000400200008A590000F12B69__0000001B43A3F241-0000001BE353E181 000000067F0000400200008A590000F0C000-000000067F0000400200008A590000F10000__0000001C046BD098 000000067F0000400200008A590000F0C000-000000067F0000400200008A590000F10000__0000001C760FA190 000000067F0000400200008A590000F0C000-000000067F0000400200008A590000F10000__00000038E67ABFA0 000000067F0000400200008A590000F0C000-000000067F0000400200008A590000F10000__0000003903F1CFE8 000000067F0000400200008A590000F0C000-000000067F0000400200008A590000F10000__0000003B99F7F8A0 000000067F0000400200008A590000F0C000-000000067F0000400200008A590000F10000__0000005D2FFFFB38 000000067F0000400200008A590000F0C000-000000067F0000400200008A590000F10000__00000073AD3FE6B8 000000067F0000400200008A590000F0C000-000000067F0000400200008A590000F10000__000000914E3F38F0 000000067F0000400200008A590000F0C000-000000067F0000400200008A590000F10000__000000931B9A2710 000000067F0000400200008A590000F10000-000000067F0000400200008A590000F14000__0000001C046BD098 
000000067F0000400200008A590000F10000-000000067F0000400200008A590000F14000__0000001C760FA190 000000067F0000400200008A590000F10000-000000067F0000400200008A590000F14000__00000038E67ABFA0 000000067F0000400200008A590000F10000-000000067F0000400200008A590000F14000__0000003903F1CFE8 000000067F0000400200008A590000F10000-000000067F0000400200008A590000F14000__0000003B99F7F8A0 000000067F0000400200008A590000F10000-000000067F0000400200008A590000F14000__0000005D2FFFFB38 000000067F0000400200008A590000F10000-000000067F0000400200008A590000F14000__00000073AD3FE6B8 000000067F0000400200008A590000F10000-000000067F0000400200008A590000F14000__000000914E3F38F0 000000067F0000400200008A590000F10000-000000067F0000400200008A590000F14000__000000931B9A2710 000000067F0000400200008A590000F12B69-000000067F0000400200008A590100000000__0000001B43A3F241-0000001BE353E181 000000067F0000400200008A590000F14000-000000067F0000400200008A590000F18000__0000001C046BD098 000000067F0000400200008A590000F14000-000000067F0000400200008A590000F18000__0000001C760FA190 000000067F0000400200008A590000F14000-000000067F0000400200008A590000F18000__00000038E67ABFA0 000000067F0000400200008A590000F14000-000000067F0000400200008A590000F18000__0000003903F1CFE8 000000067F0000400200008A590000F14000-000000067F0000400200008A590000F18000__0000003B99F7F8A0 000000067F0000400200008A590000F14000-000000067F0000400200008A590000F18000__0000005D2FFFFB38 000000067F0000400200008A590000F14000-000000067F0000400200008A590000F18000__00000073AD3FE6B8 000000067F0000400200008A590000F14000-000000067F0000400200008A590000F18000__000000914E3F38F0 000000067F0000400200008A590000F14000-000000067F0000400200008A590000F18000__000000931B9A2710 000000067F0000400200008A590000F18000-000000067F0000400200008A590000F1C000__0000001C046BD098 000000067F0000400200008A590000F18000-000000067F0000400200008A590000F1C000__0000001C760FA190 000000067F0000400200008A590000F18000-000000067F0000400200008A590000F1C000__00000038E67ABFA0 
000000067F0000400200008A590000F18000-000000067F0000400200008A590000F1C000__0000003903F1CFE8 000000067F0000400200008A590000F18000-000000067F0000400200008A590000F1C000__0000003B99F7F8A0 000000067F0000400200008A590000F18000-000000067F0000400200008A590000F1C000__0000005D2FFFFB38 000000067F0000400200008A590000F18000-000000067F0000400200008A590000F1C000__00000073AD3FE6B8 000000067F0000400200008A590000F18000-000000067F0000400200008A590000F1C000__000000914E3F38F0 000000067F0000400200008A590000F18000-000000067F0000400200008A590000F1C000__000000931B9A2710 000000067F0000400200008A590000F1B7DD-000000067F0000400200008A590000F241C2__0000001BE353E181-0000001C725A5929 000000067F0000400200008A590000F1C000-000000067F0000400200008A590000F20000__0000001C046BD098 000000067F0000400200008A590000F1C000-000000067F0000400200008A590000F20000__0000001C760FA190 000000067F0000400200008A590000F1C000-000000067F0000400200008A590000F20000__00000038E67ABFA0 000000067F0000400200008A590000F1C000-000000067F0000400200008A590000F20000__0000003903F1CFE8 000000067F0000400200008A590000F1C000-000000067F0000400200008A590000F20000__0000003B99F7F8A0 000000067F0000400200008A590000F1C000-000000067F0000400200008A590000F20000__0000005D2FFFFB38 000000067F0000400200008A590000F1C000-000000067F0000400200008A590000F20000__00000073AD3FE6B8 000000067F0000400200008A590000F1C000-000000067F0000400200008A590000F20000__000000914E3F38F0 000000067F0000400200008A590000F1C000-000000067F0000400200008A590000F20000__000000931B9A2710 000000067F0000400200008A590000F20000-000000067F0000400200008A590000F24000__0000001C046BD098 000000067F0000400200008A590000F20000-000000067F0000400200008A590000F24000__0000001C760FA190 000000067F0000400200008A590000F20000-000000067F0000400200008A590000F24000__00000038E67ABFA0 000000067F0000400200008A590000F20000-000000067F0000400200008A590000F24000__0000003903F1CFE8 000000067F0000400200008A590000F20000-000000067F0000400200008A590000F24000__0000003B99F7F8A0 
000000067F0000400200008A590000F20000-000000067F0000400200008A590000F24000__0000005D2FFFFB38 000000067F0000400200008A590000F20000-000000067F0000400200008A590000F24000__00000073AD3FE6B8 000000067F0000400200008A590000F20000-000000067F0000400200008A590000F24000__000000914E3F38F0 000000067F0000400200008A590000F20000-000000067F0000400200008A590000F24000__000000931B9A2710 000000067F0000400200008A590000F24000-000000067F0000400200008A590000F28000__0000001C760FA190 000000067F0000400200008A590000F24000-000000067F0000400200008A590000F28000__00000038E67ABFA0 000000067F0000400200008A590000F24000-000000067F0000400200008A590000F28000__0000003903F1CFE8 000000067F0000400200008A590000F24000-000000067F0000400200008A590000F28000__0000003B99F7F8A0 000000067F0000400200008A590000F24000-000000067F0000400200008A590000F28000__0000005D2FFFFB38 000000067F0000400200008A590000F24000-000000067F0000400200008A590000F28000__00000073AD3FE6B8 000000067F0000400200008A590000F24000-000000067F0000400200008A590000F28000__000000914E3F38F0 000000067F0000400200008A590000F24000-000000067F0000400200008A590000F28000__000000931B9A2710 000000067F0000400200008A590000F24000-030000000000000000000000000000000002__0000001C046BD098 000000067F0000400200008A590000F241C2-000000067F0000400200008A590000F2CBA0__0000001BE353E181-0000001C725A5929 000000067F0000400200008A590000F28000-000000067F0000400200008A590000F2C000__0000001C760FA190 000000067F0000400200008A590000F28000-000000067F0000400200008A590000F2C000__00000038E67ABFA0 000000067F0000400200008A590000F28000-000000067F0000400200008A590000F2C000__0000003903F1CFE8 000000067F0000400200008A590000F28000-000000067F0000400200008A590000F2C000__0000003B99F7F8A0 000000067F0000400200008A590000F28000-000000067F0000400200008A590000F2C000__0000005D2FFFFB38 000000067F0000400200008A590000F28000-000000067F0000400200008A590000F2C000__00000073AD3FE6B8 000000067F0000400200008A590000F28000-000000067F0000400200008A590000F2C000__000000914E3F38F0 
000000067F0000400200008A590000F28000-000000067F0000400200008A590000F2C000__000000931B9A2710 000000067F0000400200008A590000F2C000-000000067F0000400200008A590000F30000__0000001C760FA190 000000067F0000400200008A590000F2C000-000000067F0000400200008A590000F30000__00000038E67ABFA0 000000067F0000400200008A590000F2C000-000000067F0000400200008A590000F30000__0000003903F1CFE8 000000067F0000400200008A590000F2C000-000000067F0000400200008A590000F30000__0000003B99F7F8A0 000000067F0000400200008A590000F2C000-000000067F0000400200008A590000F30000__0000005D2FFFFB38 000000067F0000400200008A590000F2C000-000000067F0000400200008A590000F30000__00000073AD3FE6B8 000000067F0000400200008A590000F2C000-000000067F0000400200008A590000F30000__000000914E3F38F0 000000067F0000400200008A590000F2C000-000000067F0000400200008A590000F30000__000000931B9A2710 000000067F0000400200008A590000F2CBA0-000000067F0000400200008A590000F35584__0000001BE353E181-0000001C725A5929 000000067F0000400200008A590000F30000-000000067F0000400200008A590000F34000__0000001C760FA190 000000067F0000400200008A590000F30000-000000067F0000400200008A590000F34000__00000038E67ABFA0 000000067F0000400200008A590000F30000-000000067F0000400200008A590000F34000__0000003903F1CFE8 000000067F0000400200008A590000F30000-000000067F0000400200008A590000F34000__0000003B99F7F8A0 000000067F0000400200008A590000F30000-000000067F0000400200008A590000F34000__0000005D2FFFFB38 000000067F0000400200008A590000F30000-000000067F0000400200008A590000F34000__00000073AD3FE6B8 000000067F0000400200008A590000F30000-000000067F0000400200008A590000F34000__000000914E3F38F0 000000067F0000400200008A590000F30000-000000067F0000400200008A590000F34000__000000931B9A2710 000000067F0000400200008A590000F34000-000000067F0000400200008A590000F38000__0000001C760FA190 000000067F0000400200008A590000F34000-000000067F0000400200008A590000F38000__00000038E67ABFA0 000000067F0000400200008A590000F34000-000000067F0000400200008A590000F38000__0000003903F1CFE8 
000000067F0000400200008A590000F34000-000000067F0000400200008A590000F38000__0000003B99F7F8A0 000000067F0000400200008A590000F34000-000000067F0000400200008A590000F38000__0000005D2FFFFB38 000000067F0000400200008A590000F34000-000000067F0000400200008A590000F38000__00000073AD3FE6B8 000000067F0000400200008A590000F34000-000000067F0000400200008A590000F38000__000000914E3F38F0 000000067F0000400200008A590000F34000-000000067F0000400200008A590000F38000__000000931B9A2710 000000067F0000400200008A590000F35584-000000067F0000400200008A590000F3DF5E__0000001BE353E181-0000001C725A5929 000000067F0000400200008A590000F38000-000000067F0000400200008A590000F3C000__0000001C760FA190 000000067F0000400200008A590000F38000-000000067F0000400200008A590000F3C000__00000038E67ABFA0 000000067F0000400200008A590000F38000-000000067F0000400200008A590000F3C000__0000003903F1CFE8 000000067F0000400200008A590000F38000-000000067F0000400200008A590000F3C000__0000003B99F7F8A0 000000067F0000400200008A590000F38000-000000067F0000400200008A590000F3C000__0000005D2FFFFB38 000000067F0000400200008A590000F38000-000000067F0000400200008A590000F3C000__00000073AD3FE6B8 000000067F0000400200008A590000F38000-000000067F0000400200008A590000F3C000__000000914E3F38F0 000000067F0000400200008A590000F38000-000000067F0000400200008A590000F3C000__000000931B9A2710 000000067F0000400200008A590000F3C000-000000067F0000400200008A590000F40000__0000001C760FA190 000000067F0000400200008A590000F3C000-000000067F0000400200008A590000F40000__00000038E67ABFA0 000000067F0000400200008A590000F3C000-000000067F0000400200008A590000F40000__0000003903F1CFE8 000000067F0000400200008A590000F3C000-000000067F0000400200008A590000F40000__0000003B99F7F8A0 000000067F0000400200008A590000F3C000-000000067F0000400200008A590000F40000__0000005D2FFFFB38 000000067F0000400200008A590000F3C000-000000067F0000400200008A590000F40000__00000073AD3FE6B8 000000067F0000400200008A590000F3C000-000000067F0000400200008A590000F40000__000000914E3F38F0 
000000067F0000400200008A590000F3C000-000000067F0000400200008A590000F40000__000000931B9A2710 000000067F0000400200008A590000F3DF5E-000000067F0000400200008A590000F46935__0000001BE353E181-0000001C725A5929 000000067F0000400200008A590000F40000-000000067F0000400200008A590000F44000__0000001C760FA190 000000067F0000400200008A590000F40000-000000067F0000400200008A590000F44000__00000038E67ABFA0 000000067F0000400200008A590000F40000-000000067F0000400200008A590000F44000__0000003903F1CFE8 000000067F0000400200008A590000F40000-000000067F0000400200008A590000F44000__0000003B99F7F8A0 000000067F0000400200008A590000F40000-000000067F0000400200008A590000F44000__0000005D2FFFFB38 000000067F0000400200008A590000F40000-000000067F0000400200008A590000F44000__00000073AD3FE6B8 000000067F0000400200008A590000F40000-000000067F0000400200008A590000F44000__000000914E3F38F0 000000067F0000400200008A590000F40000-000000067F0000400200008A590000F44000__000000931B9A2710 000000067F0000400200008A590000F44000-000000067F0000400200008A590000F48000__0000001C760FA190 000000067F0000400200008A590000F44000-000000067F0000400200008A590000F48000__00000038E67ABFA0 000000067F0000400200008A590000F44000-000000067F0000400200008A590000F48000__0000003903F1CFE8 000000067F0000400200008A590000F44000-000000067F0000400200008A590000F48000__0000003B99F7F8A0 000000067F0000400200008A590000F44000-000000067F0000400200008A590000F48000__0000005D2FFFFB38 000000067F0000400200008A590000F44000-000000067F0000400200008A590000F48000__00000073AD3FE6B8 000000067F0000400200008A590000F44000-000000067F0000400200008A590000F48000__000000914E3F38F0 000000067F0000400200008A590000F44000-000000067F0000400200008A590000F48000__000000931B9A2710 000000067F0000400200008A590000F46935-000000067F0000400200008A590000F4F30D__0000001BE353E181-0000001C725A5929 000000067F0000400200008A590000F48000-000000067F0000400200008A590000F4C000__0000001C760FA190 000000067F0000400200008A590000F48000-000000067F0000400200008A590000F4C000__00000038E67ABFA0 
000000067F0000400200008A590000F48000-000000067F0000400200008A590000F4C000__0000003903F1CFE8 000000067F0000400200008A590000F48000-000000067F0000400200008A590000F4C000__0000003B99F7F8A0 000000067F0000400200008A590000F48000-000000067F0000400200008A590000F4C000__0000005D2FFFFB38 000000067F0000400200008A590000F48000-000000067F0000400200008A590000F4C000__00000073AD3FE6B8 000000067F0000400200008A590000F48000-000000067F0000400200008A590000F4C000__000000914E3F38F0 000000067F0000400200008A590000F48000-000000067F0000400200008A590000F4C000__000000931B9A2710 000000067F0000400200008A590000F4C000-000000067F0000400200008A590000F50000__0000001C760FA190 000000067F0000400200008A590000F4C000-000000067F0000400200008A590000F50000__00000038E67ABFA0 000000067F0000400200008A590000F4C000-000000067F0000400200008A590000F50000__0000003903F1CFE8 000000067F0000400200008A590000F4C000-000000067F0000400200008A590000F50000__0000003B99F7F8A0 000000067F0000400200008A590000F4C000-000000067F0000400200008A590000F50000__0000005D2FFFFB38 000000067F0000400200008A590000F4C000-000000067F0000400200008A590000F50000__00000073AD3FE6B8 000000067F0000400200008A590000F4C000-000000067F0000400200008A590000F50000__000000914E3F38F0 000000067F0000400200008A590000F4C000-000000067F0000400200008A590000F50000__000000931B9A2710 000000067F0000400200008A590000F4F30D-000000067F0000400200008A590000F57CE5__0000001BE353E181-0000001C725A5929 000000067F0000400200008A590000F50000-000000067F0000400200008A590000F54000__0000001C760FA190 000000067F0000400200008A590000F50000-000000067F0000400200008A590000F54000__00000038E67ABFA0 000000067F0000400200008A590000F50000-000000067F0000400200008A590000F54000__0000003903F1CFE8 000000067F0000400200008A590000F50000-000000067F0000400200008A590000F54000__0000003B99F7F8A0 000000067F0000400200008A590000F50000-000000067F0000400200008A590000F54000__0000005D2FFFFB38 000000067F0000400200008A590000F50000-000000067F0000400200008A590000F54000__00000073AD3FE6B8 
000000067F0000400200008A590000F50000-000000067F0000400200008A590000F54000__000000914E3F38F0 000000067F0000400200008A590000F50000-000000067F0000400200008A590000F54000__000000931B9A2710 000000067F0000400200008A590000F54000-000000067F0000400200008A590000F58000__0000001C760FA190 000000067F0000400200008A590000F54000-000000067F0000400200008A590000F58000__00000038E67ABFA0 000000067F0000400200008A590000F54000-000000067F0000400200008A590000F58000__0000003903F1CFE8 000000067F0000400200008A590000F54000-000000067F0000400200008A590000F58000__0000003B99F7F8A0 000000067F0000400200008A590000F54000-000000067F0000400200008A590000F58000__0000005D2FFFFB38 000000067F0000400200008A590000F54000-000000067F0000400200008A590000F58000__00000073AD3FE6B8 000000067F0000400200008A590000F54000-000000067F0000400200008A590000F58000__000000914E3F38F0 000000067F0000400200008A590000F54000-000000067F0000400200008A590000F58000__000000931B9A2710 000000067F0000400200008A590000F57CE5-000000067F0000400200008A590000F60351__0000001BE353E181-0000001C725A5929 000000067F0000400200008A590000F58000-000000067F0000400200008A590000F5C000__0000001C760FA190 000000067F0000400200008A590000F58000-000000067F0000400200008A590000F5C000__00000038E67ABFA0 000000067F0000400200008A590000F58000-000000067F0000400200008A590000F5C000__0000003903F1CFE8 000000067F0000400200008A590000F58000-000000067F0000400200008A590000F5C000__0000003B99F7F8A0 000000067F0000400200008A590000F58000-000000067F0000400200008A590000F5C000__0000005D2FFFFB38 000000067F0000400200008A590000F58000-000000067F0000400200008A590000F5C000__00000073AD3FE6B8 000000067F0000400200008A590000F58000-000000067F0000400200008A590000F5C000__000000914E3F38F0 000000067F0000400200008A590000F58000-000000067F0000400200008A590000F5C000__000000931B9A2710 000000067F0000400200008A590000F5C000-000000067F0000400200008A590000F60000__0000001C760FA190 000000067F0000400200008A590000F5C000-000000067F0000400200008A590000F60000__00000038E67ABFA0 
000000067F0000400200008A590000F5C000-000000067F0000400200008A590000F60000__0000003903F1CFE8 000000067F0000400200008A590000F5C000-000000067F0000400200008A590000F60000__0000003B99F7F8A0 000000067F0000400200008A590000F5C000-000000067F0000400200008A590000F60000__0000005D2FFFFB38 000000067F0000400200008A590000F5C000-000000067F0000400200008A590000F60000__00000073AD3FE6B8 000000067F0000400200008A590000F5C000-000000067F0000400200008A590000F60000__000000914E3F38F0 000000067F0000400200008A590000F5C000-000000067F0000400200008A590000F60000__000000931B9A2710 000000067F0000400200008A590000F60000-000000067F0000400200008A5E0100000000__00000038E67ABFA0 000000067F0000400200008A590000F60000-000000067F0000400200008A5E0100000000__0000003903F1CFE8 000000067F0000400200008A590000F60000-000000067F0000400200008A5E0100000000__0000003B99F7F8A0 000000067F0000400200008A590000F60000-000000067F0000400200008A5E0100000000__0000005D2FFFFB38 000000067F0000400200008A590000F60000-000000067F000040020000A0080100000000__00000073AD3FE6B8 000000067F0000400200008A590000F60000-000000067F000040020000A0080100000000__000000914E3F38F0 000000067F0000400200008A590000F60000-000000067F000040020000A0080100000000__000000931B9A2710 000000067F0000400200008A590000F60000-030000000000000000000000000000000002__0000001C760FA190 000000067F0000400200008A5900FFFFFFFF-000000067F0000400200008A5E0100000000__0000001BE353E181-0000001C725A5929 000000067F000040020000A0000000000000-000000067F000040020000A0000000004000__00000038E67ABFA0 000000067F000040020000A0000000000000-000000067F000040020000A0000000004000__0000003903F1CFE8 000000067F000040020000A0000000000000-000000067F000040020000A0000000004000__0000003B99F7F8A0 000000067F000040020000A0000000000000-000000067F000040020000A0000000004000__0000005D2FFFFB38 000000067F000040020000A0000000004000-000000067F000040020000A0000000008000__00000038E67ABFA0 000000067F000040020000A0000000004000-000000067F000040020000A0000000008000__0000003903F1CFE8 
000000067F000040020000A0000000004000-000000067F000040020000A0000000008000__0000003B99F7F8A0 000000067F000040020000A0000000004000-000000067F000040020000A0000000008000__0000005D2FFFFB38 000000067F000040020000A0000000008000-000000067F000040020000A000000000C000__00000038E67ABFA0 000000067F000040020000A0000000008000-000000067F000040020000A000000000C000__0000003903F1CFE8 000000067F000040020000A0000000008000-000000067F000040020000A000000000C000__0000003B99F7F8A0 000000067F000040020000A0000000008000-000000067F000040020000A000000000C000__0000005D2FFFFB38 000000067F000040020000A0000000008989-000000067F000040020000A0000000011373__0000001C725D0191-0000002070591C61 000000067F000040020000A000000000C000-000000067F000040020000A0000000010000__00000038E67ABFA0 000000067F000040020000A000000000C000-000000067F000040020000A0000000010000__0000003903F1CFE8 000000067F000040020000A000000000C000-000000067F000040020000A0000000010000__0000003B99F7F8A0 000000067F000040020000A000000000C000-000000067F000040020000A0000000010000__0000005D2FFFFB38 000000067F000040020000A0000000010000-000000067F000040020000A0000000014000__00000038E67ABFA0 000000067F000040020000A0000000010000-000000067F000040020000A0000000014000__0000003903F1CFE8 000000067F000040020000A0000000010000-000000067F000040020000A0000000014000__0000003B99F7F8A0 000000067F000040020000A0000000010000-000000067F000040020000A0000000014000__0000005D2FFFFB38 000000067F000040020000A0000000011373-000000067F000040020000A0000000019D77__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000014000-000000067F000040020000A0000000018000__00000038E67ABFA0 000000067F000040020000A0000000014000-000000067F000040020000A0000000018000__0000003903F1CFE8 000000067F000040020000A0000000014000-000000067F000040020000A0000000018000__0000003B99F7F8A0 000000067F000040020000A0000000014000-000000067F000040020000A0000000018000__0000005D2FFFFB38 000000067F000040020000A0000000018000-000000067F000040020000A000000001C000__00000038E67ABFA0 
000000067F000040020000A0000000018000-000000067F000040020000A000000001C000__0000003903F1CFE8 000000067F000040020000A0000000018000-000000067F000040020000A000000001C000__0000003B99F7F8A0 000000067F000040020000A0000000018000-000000067F000040020000A000000001C000__0000005D2FFFFB38 000000067F000040020000A0000000019D77-000000067F000040020000A000000002276E__0000001C725D0191-0000002070591C61 000000067F000040020000A000000001C000-000000067F000040020000A0000000020000__00000038E67ABFA0 000000067F000040020000A000000001C000-000000067F000040020000A0000000020000__0000003903F1CFE8 000000067F000040020000A000000001C000-000000067F000040020000A0000000020000__0000003B99F7F8A0 000000067F000040020000A000000001C000-000000067F000040020000A0000000020000__0000005D2FFFFB38 000000067F000040020000A0000000020000-000000067F000040020000A0000000024000__00000038E67ABFA0 000000067F000040020000A0000000020000-000000067F000040020000A0000000024000__0000003903F1CFE8 000000067F000040020000A0000000020000-000000067F000040020000A0000000024000__0000003B99F7F8A0 000000067F000040020000A0000000020000-000000067F000040020000A0000000024000__0000005D2FFFFB38 000000067F000040020000A000000002276E-000000067F000040020000A000000002B152__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000024000-000000067F000040020000A0000000028000__00000038E67ABFA0 000000067F000040020000A0000000024000-000000067F000040020000A0000000028000__0000003903F1CFE8 000000067F000040020000A0000000024000-000000067F000040020000A0000000028000__0000003B99F7F8A0 000000067F000040020000A0000000024000-000000067F000040020000A0000000028000__0000005D2FFFFB38 000000067F000040020000A0000000028000-000000067F000040020000A000000002C000__00000038E67ABFA0 000000067F000040020000A0000000028000-000000067F000040020000A000000002C000__0000003903F1CFE8 000000067F000040020000A0000000028000-000000067F000040020000A000000002C000__0000003B99F7F8A0 000000067F000040020000A0000000028000-000000067F000040020000A000000002C000__0000005D2FFFFB38 
000000067F000040020000A000000002B152-000000067F000040020000A0000000033B1C__0000001C725D0191-0000002070591C61 000000067F000040020000A000000002C000-000000067F000040020000A0000000030000__00000038E67ABFA0 000000067F000040020000A000000002C000-000000067F000040020000A0000000030000__0000003903F1CFE8 000000067F000040020000A000000002C000-000000067F000040020000A0000000030000__0000003B99F7F8A0 000000067F000040020000A000000002C000-000000067F000040020000A0000000030000__0000005D2FFFFB38 000000067F000040020000A0000000030000-000000067F000040020000A0000000034000__00000038E67ABFA0 000000067F000040020000A0000000030000-000000067F000040020000A0000000034000__0000003903F1CFE8 000000067F000040020000A0000000030000-000000067F000040020000A0000000034000__0000003B99F7F8A0 000000067F000040020000A0000000030000-000000067F000040020000A0000000034000__0000005D2FFFFB38 000000067F000040020000A0000000033B1C-000000067F000040020000A000000003C4CA__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000034000-000000067F000040020000A0000000038000__00000038E67ABFA0 000000067F000040020000A0000000034000-000000067F000040020000A0000000038000__0000003903F1CFE8 000000067F000040020000A0000000034000-000000067F000040020000A0000000038000__0000003B99F7F8A0 000000067F000040020000A0000000034000-000000067F000040020000A0000000038000__0000005D2FFFFB38 000000067F000040020000A0000000038000-000000067F000040020000A000000003C000__00000038E67ABFA0 000000067F000040020000A0000000038000-000000067F000040020000A000000003C000__0000003903F1CFE8 000000067F000040020000A0000000038000-000000067F000040020000A000000003C000__0000003B99F7F8A0 000000067F000040020000A0000000038000-000000067F000040020000A000000003C000__0000005D2FFFFB38 000000067F000040020000A000000003C000-000000067F000040020000A0000000040000__00000038E67ABFA0 000000067F000040020000A000000003C000-000000067F000040020000A0000000040000__0000003903F1CFE8 000000067F000040020000A000000003C000-000000067F000040020000A0000000040000__0000003B99F7F8A0 
000000067F000040020000A000000003C000-000000067F000040020000A0000000040000__0000005D2FFFFB38 000000067F000040020000A000000003C4CA-000000067F000040020000A0000000044E8B__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000040000-000000067F000040020000A0000000044000__00000038E67ABFA0 000000067F000040020000A0000000040000-000000067F000040020000A0000000044000__0000003903F1CFE8 000000067F000040020000A0000000040000-000000067F000040020000A0000000044000__0000003B99F7F8A0 000000067F000040020000A0000000040000-000000067F000040020000A0000000044000__0000005D2FFFFB38 000000067F000040020000A0000000044000-000000067F000040020000A0000000048000__00000038E67ABFA0 000000067F000040020000A0000000044000-000000067F000040020000A0000000048000__0000003903F1CFE8 000000067F000040020000A0000000044000-000000067F000040020000A0000000048000__0000003B99F7F8A0 000000067F000040020000A0000000044000-000000067F000040020000A0000000048000__0000005D2FFFFB38 000000067F000040020000A0000000044E8B-000000067F000040020000A000000004D882__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000048000-000000067F000040020000A000000004C000__00000038E67ABFA0 000000067F000040020000A0000000048000-000000067F000040020000A000000004C000__0000003903F1CFE8 000000067F000040020000A0000000048000-000000067F000040020000A000000004C000__0000003B99F7F8A0 000000067F000040020000A0000000048000-000000067F000040020000A000000004C000__0000005D2FFFFB38 000000067F000040020000A000000004C000-000000067F000040020000A0000000050000__00000038E67ABFA0 000000067F000040020000A000000004C000-000000067F000040020000A0000000050000__0000003903F1CFE8 000000067F000040020000A000000004C000-000000067F000040020000A0000000050000__0000003B99F7F8A0 000000067F000040020000A000000004C000-000000067F000040020000A0000000050000__0000005D2FFFFB38 000000067F000040020000A000000004D882-000000067F000040020000A0000000056278__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000050000-000000067F000040020000A0000000054000__00000038E67ABFA0 
000000067F000040020000A0000000050000-000000067F000040020000A0000000054000__0000003903F1CFE8 000000067F000040020000A0000000050000-000000067F000040020000A0000000054000__0000003B99F7F8A0 000000067F000040020000A0000000050000-000000067F000040020000A0000000054000__0000005D2FFFFB38 000000067F000040020000A0000000054000-000000067F000040020000A0000000058000__00000038E67ABFA0 000000067F000040020000A0000000054000-000000067F000040020000A0000000058000__0000003903F1CFE8 000000067F000040020000A0000000054000-000000067F000040020000A0000000058000__0000003B99F7F8A0 000000067F000040020000A0000000054000-000000067F000040020000A0000000058000__0000005D2FFFFB38 000000067F000040020000A0000000056278-000000067F000040020000A000000005EC6B__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000058000-000000067F000040020000A000000005C000__00000038E67ABFA0 000000067F000040020000A0000000058000-000000067F000040020000A000000005C000__0000003903F1CFE8 000000067F000040020000A0000000058000-000000067F000040020000A000000005C000__0000003B99F7F8A0 000000067F000040020000A0000000058000-000000067F000040020000A000000005C000__0000005D2FFFFB38 000000067F000040020000A000000005C000-000000067F000040020000A0000000060000__00000038E67ABFA0 000000067F000040020000A000000005C000-000000067F000040020000A0000000060000__0000003903F1CFE8 000000067F000040020000A000000005C000-000000067F000040020000A0000000060000__0000003B99F7F8A0 000000067F000040020000A000000005C000-000000067F000040020000A0000000060000__0000005D2FFFFB38 000000067F000040020000A000000005EC6B-000000067F000040020000A0000000067651__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000060000-000000067F000040020000A0000000064000__00000038E67ABFA0 000000067F000040020000A0000000060000-000000067F000040020000A0000000064000__0000003903F1CFE8 000000067F000040020000A0000000060000-000000067F000040020000A0000000064000__0000003B99F7F8A0 000000067F000040020000A0000000060000-000000067F000040020000A0000000064000__0000005D2FFFFB38 
000000067F000040020000A0000000064000-000000067F000040020000A0000000068000__00000038E67ABFA0 000000067F000040020000A0000000064000-000000067F000040020000A0000000068000__0000003903F1CFE8 000000067F000040020000A0000000064000-000000067F000040020000A0000000068000__0000003B99F7F8A0 000000067F000040020000A0000000064000-000000067F000040020000A0000000068000__0000005D2FFFFB38 000000067F000040020000A0000000067651-000000067F000040020000A000000007002B__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000068000-000000067F000040020000A000000006C000__00000038E67ABFA0 000000067F000040020000A0000000068000-000000067F000040020000A000000006C000__0000003903F1CFE8 000000067F000040020000A0000000068000-000000067F000040020000A000000006C000__0000003B99F7F8A0 000000067F000040020000A0000000068000-000000067F000040020000A000000006C000__0000005D2FFFFB38 000000067F000040020000A000000006C000-000000067F000040020000A0000000070000__00000038E67ABFA0 000000067F000040020000A000000006C000-000000067F000040020000A0000000070000__0000003903F1CFE8 000000067F000040020000A000000006C000-000000067F000040020000A0000000070000__0000003B99F7F8A0 000000067F000040020000A000000006C000-000000067F000040020000A0000000070000__0000005D2FFFFB38 000000067F000040020000A0000000070000-000000067F000040020000A0000000074000__00000038E67ABFA0 000000067F000040020000A0000000070000-000000067F000040020000A0000000074000__0000003903F1CFE8 000000067F000040020000A0000000070000-000000067F000040020000A0000000074000__0000003B99F7F8A0 000000067F000040020000A0000000070000-000000067F000040020000A0000000074000__0000005D2FFFFB38 000000067F000040020000A000000007002B-000000067F000040020000A00000000789E3__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000074000-000000067F000040020000A0000000078000__00000038E67ABFA0 000000067F000040020000A0000000074000-000000067F000040020000A0000000078000__0000003903F1CFE8 000000067F000040020000A0000000074000-000000067F000040020000A0000000078000__0000003B99F7F8A0 
000000067F000040020000A0000000074000-000000067F000040020000A0000000078000__0000005D2FFFFB38 000000067F000040020000A0000000078000-000000067F000040020000A000000007C000__00000038E67ABFA0 000000067F000040020000A0000000078000-000000067F000040020000A000000007C000__0000003903F1CFE8 000000067F000040020000A0000000078000-000000067F000040020000A000000007C000__0000003B99F7F8A0 000000067F000040020000A0000000078000-000000067F000040020000A000000007C000__0000005D2FFFFB38 000000067F000040020000A00000000789E3-000000067F000040020000A00000000813A7__0000001C725D0191-0000002070591C61 000000067F000040020000A000000007C000-000000067F000040020000A0000000080000__00000038E67ABFA0 000000067F000040020000A000000007C000-000000067F000040020000A0000000080000__0000003903F1CFE8 000000067F000040020000A000000007C000-000000067F000040020000A0000000080000__0000003B99F7F8A0 000000067F000040020000A000000007C000-000000067F000040020000A0000000080000__0000005D2FFFFB38 000000067F000040020000A0000000080000-000000067F000040020000A0000000084000__00000038E67ABFA0 000000067F000040020000A0000000080000-000000067F000040020000A0000000084000__0000003903F1CFE8 000000067F000040020000A0000000080000-000000067F000040020000A0000000084000__0000003B99F7F8A0 000000067F000040020000A0000000080000-000000067F000040020000A0000000084000__0000005D2FFFFB38 000000067F000040020000A00000000813A7-000000067F000040020000A0000000089D92__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000084000-000000067F000040020000A0000000088000__00000038E67ABFA0 000000067F000040020000A0000000084000-000000067F000040020000A0000000088000__0000003903F1CFE8 000000067F000040020000A0000000084000-000000067F000040020000A0000000088000__0000003B99F7F8A0 000000067F000040020000A0000000084000-000000067F000040020000A0000000088000__0000005D2FFFFB38 000000067F000040020000A0000000088000-000000067F000040020000A000000008C000__00000038E67ABFA0 000000067F000040020000A0000000088000-000000067F000040020000A000000008C000__0000003903F1CFE8 
000000067F000040020000A0000000088000-000000067F000040020000A000000008C000__0000003B99F7F8A0 000000067F000040020000A0000000088000-000000067F000040020000A000000008C000__0000005D2FFFFB38 000000067F000040020000A0000000089D92-000000067F000040020000A000000009278A__0000001C725D0191-0000002070591C61 000000067F000040020000A000000008C000-000000067F000040020000A0000000090000__00000038E67ABFA0 000000067F000040020000A000000008C000-000000067F000040020000A0000000090000__0000003903F1CFE8 000000067F000040020000A000000008C000-000000067F000040020000A0000000090000__0000003B99F7F8A0 000000067F000040020000A000000008C000-000000067F000040020000A0000000090000__0000005D2FFFFB38 000000067F000040020000A0000000090000-000000067F000040020000A0000000094000__00000038E67ABFA0 000000067F000040020000A0000000090000-000000067F000040020000A0000000094000__0000003903F1CFE8 000000067F000040020000A0000000090000-000000067F000040020000A0000000094000__0000003B99F7F8A0 000000067F000040020000A0000000090000-000000067F000040020000A0000000094000__0000005D2FFFFB38 000000067F000040020000A000000009278A-000000067F000040020000A000000009B17C__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000094000-000000067F000040020000A0000000098000__00000038E67ABFA0 000000067F000040020000A0000000094000-000000067F000040020000A0000000098000__0000003903F1CFE8 000000067F000040020000A0000000094000-000000067F000040020000A0000000098000__0000003B99F7F8A0 000000067F000040020000A0000000094000-000000067F000040020000A0000000098000__0000005D2FFFFB38 000000067F000040020000A0000000098000-000000067F000040020000A000000009C000__00000038E67ABFA0 000000067F000040020000A0000000098000-000000067F000040020000A000000009C000__0000003903F1CFE8 000000067F000040020000A0000000098000-000000067F000040020000A000000009C000__0000003B99F7F8A0 000000067F000040020000A0000000098000-000000067F000040020000A000000009C000__0000005D2FFFFB38 000000067F000040020000A000000009B17C-000000067F000040020000A00000000A3B54__0000001C725D0191-0000002070591C61 
000000067F000040020000A000000009C000-000000067F000040020000A00000000A0000__00000038E67ABFA0 000000067F000040020000A000000009C000-000000067F000040020000A00000000A0000__0000003903F1CFE8 000000067F000040020000A000000009C000-000000067F000040020000A00000000A0000__0000003B99F7F8A0 000000067F000040020000A000000009C000-000000067F000040020000A00000000A0000__0000005D2FFFFB38 000000067F000040020000A00000000A0000-000000067F000040020000A00000000A4000__00000038E67ABFA0 000000067F000040020000A00000000A0000-000000067F000040020000A00000000A4000__0000003903F1CFE8 000000067F000040020000A00000000A0000-000000067F000040020000A00000000A4000__0000003B99F7F8A0 000000067F000040020000A00000000A0000-000000067F000040020000A00000000A4000__0000005D2FFFFB38 000000067F000040020000A00000000A3B54-000000067F000040020000A00000000AC52A__0000001C725D0191-0000002070591C61 000000067F000040020000A00000000A4000-000000067F000040020000A00000000A8000__00000038E67ABFA0 000000067F000040020000A00000000A4000-000000067F000040020000A00000000A8000__0000003903F1CFE8 000000067F000040020000A00000000A4000-000000067F000040020000A00000000A8000__0000003B99F7F8A0 000000067F000040020000A00000000A4000-000000067F000040020000A00000000A8000__0000005D2FFFFB38 000000067F000040020000A00000000A8000-000000067F000040020000A00000000AC000__00000038E67ABFA0 000000067F000040020000A00000000A8000-000000067F000040020000A00000000AC000__0000003903F1CFE8 000000067F000040020000A00000000A8000-000000067F000040020000A00000000AC000__0000003B99F7F8A0 000000067F000040020000A00000000A8000-000000067F000040020000A00000000AC000__0000005D2FFFFB38 000000067F000040020000A00000000AC000-000000067F000040020000A00000000B0000__00000038E67ABFA0 000000067F000040020000A00000000AC000-000000067F000040020000A00000000B0000__0000003903F1CFE8 000000067F000040020000A00000000AC000-000000067F000040020000A00000000B0000__0000003B99F7F8A0 000000067F000040020000A00000000AC000-000000067F000040020000A00000000B0000__0000005D2FFFFB38 
000000067F000040020000A00000000AC52A-000000067F000040020000A00000000B4ED6__0000001C725D0191-0000002070591C61 000000067F000040020000A00000000B0000-000000067F000040020000A00000000B4000__00000038E67ABFA0 000000067F000040020000A00000000B0000-000000067F000040020000A00000000B4000__0000003903F1CFE8 000000067F000040020000A00000000B0000-000000067F000040020000A00000000B4000__0000003B99F7F8A0 000000067F000040020000A00000000B0000-000000067F000040020000A00000000B4000__0000005D2FFFFB38 000000067F000040020000A00000000B4000-000000067F000040020000A00000000B8000__00000038E67ABFA0 000000067F000040020000A00000000B4000-000000067F000040020000A00000000B8000__0000003903F1CFE8 000000067F000040020000A00000000B4000-000000067F000040020000A00000000B8000__0000003B99F7F8A0 000000067F000040020000A00000000B4000-000000067F000040020000A00000000B8000__0000005D2FFFFB38 000000067F000040020000A00000000B4ED6-000000067F000040020000A00000000BD8A4__0000001C725D0191-0000002070591C61 000000067F000040020000A00000000B8000-000000067F000040020000A00000000BC000__00000038E67ABFA0 000000067F000040020000A00000000B8000-000000067F000040020000A00000000BC000__0000003903F1CFE8 000000067F000040020000A00000000B8000-000000067F000040020000A00000000BC000__0000003B99F7F8A0 000000067F000040020000A00000000B8000-000000067F000040020000A00000000BC000__0000005D2FFFFB38 000000067F000040020000A00000000BC000-000000067F000040020000A00000000C0000__00000038E67ABFA0 000000067F000040020000A00000000BC000-000000067F000040020000A00000000C0000__0000003903F1CFE8 000000067F000040020000A00000000BC000-000000067F000040020000A00000000C0000__0000003B99F7F8A0 000000067F000040020000A00000000BC000-000000067F000040020000A00000000C0000__0000005D2FFFFB38 000000067F000040020000A00000000BD8A4-000000067F000040020000A00000000C629B__0000001C725D0191-0000002070591C61 000000067F000040020000A00000000C0000-000000067F000040020000A00000000C4000__00000038E67ABFA0 000000067F000040020000A00000000C0000-000000067F000040020000A00000000C4000__0000003903F1CFE8 
000000067F000040020000A00000000C0000-000000067F000040020000A00000000C4000__0000003B99F7F8A0 000000067F000040020000A00000000C0000-000000067F000040020000A00000000C4000__0000005D2FFFFB38 000000067F000040020000A00000000C4000-000000067F000040020000A00000000C8000__00000038E67ABFA0 000000067F000040020000A00000000C4000-000000067F000040020000A00000000C8000__0000003903F1CFE8 000000067F000040020000A00000000C4000-000000067F000040020000A00000000C8000__0000003B99F7F8A0 000000067F000040020000A00000000C4000-000000067F000040020000A00000000C8000__0000005D2FFFFB38 000000067F000040020000A00000000C629B-000000067F000040020000A00000000CEC94__0000001C725D0191-0000002070591C61 000000067F000040020000A00000000C8000-000000067F000040020000A00000000CC000__00000038E67ABFA0 000000067F000040020000A00000000C8000-000000067F000040020000A00000000CC000__0000003903F1CFE8 000000067F000040020000A00000000C8000-000000067F000040020000A00000000CC000__0000003B99F7F8A0 000000067F000040020000A00000000C8000-000000067F000040020000A00000000CC000__0000005D2FFFFB38 000000067F000040020000A00000000CC000-000000067F000040020000A00000000D0000__00000038E67ABFA0 000000067F000040020000A00000000CC000-000000067F000040020000A00000000D0000__0000003903F1CFE8 000000067F000040020000A00000000CC000-000000067F000040020000A00000000D0000__0000003B99F7F8A0 000000067F000040020000A00000000CC000-000000067F000040020000A00000000D0000__0000005D2FFFFB38 000000067F000040020000A00000000CEC94-000000067F000040020000A00000000D7688__0000001C725D0191-0000002070591C61 000000067F000040020000A00000000D0000-000000067F000040020000A00000000D4000__00000038E67ABFA0 000000067F000040020000A00000000D0000-000000067F000040020000A00000000D4000__0000003903F1CFE8 000000067F000040020000A00000000D0000-000000067F000040020000A00000000D4000__0000003B99F7F8A0 000000067F000040020000A00000000D0000-000000067F000040020000A00000000D4000__0000005D2FFFFB38 000000067F000040020000A00000000D4000-000000067F000040020000A00000000D8000__00000038E67ABFA0 
000000067F000040020000A00000000D4000-000000067F000040020000A00000000D8000__0000003903F1CFE8 000000067F000040020000A00000000D4000-000000067F000040020000A00000000D8000__0000003B99F7F8A0 000000067F000040020000A00000000D4000-000000067F000040020000A00000000D8000__0000005D2FFFFB38 000000067F000040020000A00000000D7688-000000067F000040020000A00000000E0068__0000001C725D0191-0000002070591C61 000000067F000040020000A00000000D8000-000000067F000040020000A00000000DC000__00000038E67ABFA0 000000067F000040020000A00000000D8000-000000067F000040020000A00000000DC000__0000003903F1CFE8 000000067F000040020000A00000000D8000-000000067F000040020000A00000000DC000__0000003B99F7F8A0 000000067F000040020000A00000000D8000-000000067F000040020000A00000000DC000__0000005D2FFFFB38 000000067F000040020000A00000000DC000-000000067F000040020000A00000000E0000__00000038E67ABFA0 000000067F000040020000A00000000DC000-000000067F000040020000A00000000E0000__0000003903F1CFE8 000000067F000040020000A00000000DC000-000000067F000040020000A00000000E0000__0000003B99F7F8A0 000000067F000040020000A00000000DC000-000000067F000040020000A00000000E0000__0000005D2FFFFB38 000000067F000040020000A00000000E0000-000000067F000040020000A00000000E4000__00000038E67ABFA0 000000067F000040020000A00000000E0000-000000067F000040020000A00000000E4000__0000003903F1CFE8 000000067F000040020000A00000000E0000-000000067F000040020000A00000000E4000__0000003B99F7F8A0 000000067F000040020000A00000000E0000-000000067F000040020000A00000000E4000__0000005D2FFFFB38 000000067F000040020000A00000000E0068-000000067F000040020000A00000000E8A2D__0000001C725D0191-0000002070591C61 000000067F000040020000A00000000E4000-000000067F000040020000A00000000E8000__00000038E67ABFA0 000000067F000040020000A00000000E4000-000000067F000040020000A00000000E8000__0000003903F1CFE8 000000067F000040020000A00000000E4000-000000067F000040020000A00000000E8000__0000003B99F7F8A0 000000067F000040020000A00000000E4000-000000067F000040020000A00000000E8000__0000005D2FFFFB38 
000000067F000040020000A00000000E8000-000000067F000040020000A00000000EC000__00000038E67ABFA0 000000067F000040020000A00000000E8000-000000067F000040020000A00000000EC000__0000003903F1CFE8 000000067F000040020000A00000000E8000-000000067F000040020000A00000000EC000__0000003B99F7F8A0 000000067F000040020000A00000000E8000-000000067F000040020000A00000000EC000__0000005D2FFFFB38 000000067F000040020000A00000000E8A2D-000000067F000040020000A00000000F13E7__0000001C725D0191-0000002070591C61 000000067F000040020000A00000000EC000-000000067F000040020000A00000000F0000__00000038E67ABFA0 000000067F000040020000A00000000EC000-000000067F000040020000A00000000F0000__0000003903F1CFE8 000000067F000040020000A00000000EC000-000000067F000040020000A00000000F0000__0000003B99F7F8A0 000000067F000040020000A00000000EC000-000000067F000040020000A00000000F0000__0000005D2FFFFB38 000000067F000040020000A00000000F0000-000000067F000040020000A00000000F4000__00000038E67ABFA0 000000067F000040020000A00000000F0000-000000067F000040020000A00000000F4000__0000003903F1CFE8 000000067F000040020000A00000000F0000-000000067F000040020000A00000000F4000__0000003B99F7F8A0 000000067F000040020000A00000000F0000-000000067F000040020000A00000000F4000__0000005D2FFFFB38 000000067F000040020000A00000000F13E7-000000067F000040020000A00000000F9DC4__0000001C725D0191-0000002070591C61 000000067F000040020000A00000000F4000-000000067F000040020000A00000000F8000__00000038E67ABFA0 000000067F000040020000A00000000F4000-000000067F000040020000A00000000F8000__0000003903F1CFE8 000000067F000040020000A00000000F4000-000000067F000040020000A00000000F8000__0000003B99F7F8A0 000000067F000040020000A00000000F4000-000000067F000040020000A00000000F8000__0000005D2FFFFB38 000000067F000040020000A00000000F8000-000000067F000040020000A00000000FC000__00000038E67ABFA0 000000067F000040020000A00000000F8000-000000067F000040020000A00000000FC000__0000003903F1CFE8 000000067F000040020000A00000000F8000-000000067F000040020000A00000000FC000__0000003B99F7F8A0 
000000067F000040020000A00000000F8000-000000067F000040020000A00000000FC000__0000005D2FFFFB38 000000067F000040020000A00000000F9DC4-000000067F000040020000A00000001027C3__0000001C725D0191-0000002070591C61 000000067F000040020000A00000000FC000-000000067F000040020000A0000000100000__00000038E67ABFA0 000000067F000040020000A00000000FC000-000000067F000040020000A0000000100000__0000003903F1CFE8 000000067F000040020000A00000000FC000-000000067F000040020000A0000000100000__0000003B99F7F8A0 000000067F000040020000A00000000FC000-000000067F000040020000A0000000100000__0000005D2FFFFB38 000000067F000040020000A0000000100000-000000067F000040020000A0000000104000__00000038E67ABFA0 000000067F000040020000A0000000100000-000000067F000040020000A0000000104000__0000003903F1CFE8 000000067F000040020000A0000000100000-000000067F000040020000A0000000104000__0000003B99F7F8A0 000000067F000040020000A0000000100000-000000067F000040020000A0000000104000__0000005D2FFFFB38 000000067F000040020000A00000001027C3-000000067F000040020000A000000010B1C2__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000104000-000000067F000040020000A0000000108000__00000038E67ABFA0 000000067F000040020000A0000000104000-000000067F000040020000A0000000108000__0000003903F1CFE8 000000067F000040020000A0000000104000-000000067F000040020000A0000000108000__0000003B99F7F8A0 000000067F000040020000A0000000104000-000000067F000040020000A0000000108000__0000005D2FFFFB38 000000067F000040020000A0000000108000-000000067F000040020000A000000010C000__00000038E67ABFA0 000000067F000040020000A0000000108000-000000067F000040020000A000000010C000__0000003903F1CFE8 000000067F000040020000A0000000108000-000000067F000040020000A000000010C000__0000003B99F7F8A0 000000067F000040020000A0000000108000-000000067F000040020000A000000010C000__0000005D2FFFFB38 000000067F000040020000A000000010B1C2-000000067F000040020000A0000000113BB3__0000001C725D0191-0000002070591C61 000000067F000040020000A000000010C000-000000067F000040020000A0000000110000__00000038E67ABFA0 
000000067F000040020000A000000010C000-000000067F000040020000A0000000110000__0000003903F1CFE8 000000067F000040020000A000000010C000-000000067F000040020000A0000000110000__0000003B99F7F8A0 000000067F000040020000A000000010C000-000000067F000040020000A0000000110000__0000005D2FFFFB38 000000067F000040020000A0000000110000-000000067F000040020000A0000000114000__00000038E67ABFA0 000000067F000040020000A0000000110000-000000067F000040020000A0000000114000__0000003903F1CFE8 000000067F000040020000A0000000110000-000000067F000040020000A0000000114000__0000003B99F7F8A0 000000067F000040020000A0000000110000-000000067F000040020000A0000000114000__0000005D2FFFFB38 000000067F000040020000A0000000113BB3-000000067F000040020000A000000011C591__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000114000-000000067F000040020000A0000000118000__00000038E67ABFA0 000000067F000040020000A0000000114000-000000067F000040020000A0000000118000__0000003903F1CFE8 000000067F000040020000A0000000114000-000000067F000040020000A0000000118000__0000003B99F7F8A0 000000067F000040020000A0000000114000-000000067F000040020000A0000000118000__0000005D2FFFFB38 000000067F000040020000A0000000118000-000000067F000040020000A000000011C000__00000038E67ABFA0 000000067F000040020000A0000000118000-000000067F000040020000A000000011C000__0000003903F1CFE8 000000067F000040020000A0000000118000-000000067F000040020000A000000011C000__0000003B99F7F8A0 000000067F000040020000A0000000118000-000000067F000040020000A000000011C000__0000005D2FFFFB38 000000067F000040020000A000000011C000-000000067F000040020000A0000000120000__00000038E67ABFA0 000000067F000040020000A000000011C000-000000067F000040020000A0000000120000__0000003903F1CFE8 000000067F000040020000A000000011C000-000000067F000040020000A0000000120000__0000003B99F7F8A0 000000067F000040020000A000000011C000-000000067F000040020000A0000000120000__0000005D2FFFFB38 000000067F000040020000A000000011C591-000000067F000040020000A0000000124F48__0000001C725D0191-0000002070591C61 
000000067F000040020000A0000000120000-000000067F000040020000A0000000124000__00000038E67ABFA0 000000067F000040020000A0000000120000-000000067F000040020000A0000000124000__0000003903F1CFE8 000000067F000040020000A0000000120000-000000067F000040020000A0000000124000__0000003B99F7F8A0 000000067F000040020000A0000000120000-000000067F000040020000A0000000124000__0000005D2FFFFB38 000000067F000040020000A0000000124000-000000067F000040020000A0000000128000__00000038E67ABFA0 000000067F000040020000A0000000124000-000000067F000040020000A0000000128000__0000003903F1CFE8 000000067F000040020000A0000000124000-000000067F000040020000A0000000128000__0000003B99F7F8A0 000000067F000040020000A0000000124000-000000067F000040020000A0000000128000__0000005D2FFFFB38 000000067F000040020000A0000000124F48-000000067F000040020000A000000012D900__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000128000-000000067F000040020000A000000012C000__00000038E67ABFA0 000000067F000040020000A0000000128000-000000067F000040020000A000000012C000__0000003903F1CFE8 000000067F000040020000A0000000128000-000000067F000040020000A000000012C000__0000003B99F7F8A0 000000067F000040020000A0000000128000-000000067F000040020000A000000012C000__0000005D2FFFFB38 000000067F000040020000A000000012C000-000000067F000040020000A0000000130000__00000038E67ABFA0 000000067F000040020000A000000012C000-000000067F000040020000A0000000130000__0000003903F1CFE8 000000067F000040020000A000000012C000-000000067F000040020000A0000000130000__0000003B99F7F8A0 000000067F000040020000A000000012C000-000000067F000040020000A0000000130000__0000005D2FFFFB38 000000067F000040020000A000000012D900-000000067F000040020000A00000001362D3__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000130000-000000067F000040020000A0000000134000__00000038E67ABFA0 000000067F000040020000A0000000130000-000000067F000040020000A0000000134000__0000003903F1CFE8 000000067F000040020000A0000000130000-000000067F000040020000A0000000134000__0000003B99F7F8A0 
000000067F000040020000A0000000130000-000000067F000040020000A0000000134000__0000005D2FFFFB38 000000067F000040020000A0000000134000-000000067F000040020000A0000000138000__00000038E67ABFA0 000000067F000040020000A0000000134000-000000067F000040020000A0000000138000__0000003903F1CFE8 000000067F000040020000A0000000134000-000000067F000040020000A0000000138000__0000003B99F7F8A0 000000067F000040020000A0000000134000-000000067F000040020000A0000000138000__0000005D2FFFFB38 000000067F000040020000A00000001362D3-000000067F000040020000A000000013ECD2__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000138000-000000067F000040020000A000000013C000__00000038E67ABFA0 000000067F000040020000A0000000138000-000000067F000040020000A000000013C000__0000003903F1CFE8 000000067F000040020000A0000000138000-000000067F000040020000A000000013C000__0000003B99F7F8A0 000000067F000040020000A0000000138000-000000067F000040020000A000000013C000__0000005D2FFFFB38 000000067F000040020000A000000013C000-000000067F000040020000A0000000140000__00000038E67ABFA0 000000067F000040020000A000000013C000-000000067F000040020000A0000000140000__0000003903F1CFE8 000000067F000040020000A000000013C000-000000067F000040020000A0000000140000__0000003B99F7F8A0 000000067F000040020000A000000013C000-000000067F000040020000A0000000140000__0000005D2FFFFB38 000000067F000040020000A000000013ECD2-000000067F000040020000A00000001476C8__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000140000-000000067F000040020000A0000000144000__00000038E67ABFA0 000000067F000040020000A0000000140000-000000067F000040020000A0000000144000__0000003903F1CFE8 000000067F000040020000A0000000140000-000000067F000040020000A0000000144000__0000003B99F7F8A0 000000067F000040020000A0000000140000-000000067F000040020000A0000000144000__0000005D2FFFFB38 000000067F000040020000A0000000144000-000000067F000040020000A0000000148000__00000038E67ABFA0 000000067F000040020000A0000000144000-000000067F000040020000A0000000148000__0000003903F1CFE8 
000000067F000040020000A0000000144000-000000067F000040020000A0000000148000__0000003B99F7F8A0 000000067F000040020000A0000000144000-000000067F000040020000A0000000148000__0000005D2FFFFB38 000000067F000040020000A00000001476C8-000000067F000040020000A00000001500B9__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000148000-000000067F000040020000A000000014C000__00000038E67ABFA0 000000067F000040020000A0000000148000-000000067F000040020000A000000014C000__0000003903F1CFE8 000000067F000040020000A0000000148000-000000067F000040020000A000000014C000__0000003B99F7F8A0 000000067F000040020000A0000000148000-000000067F000040020000A000000014C000__0000005D2FFFFB38 000000067F000040020000A000000014C000-000000067F000040020000A0000000150000__00000038E67ABFA0 000000067F000040020000A000000014C000-000000067F000040020000A0000000150000__0000003903F1CFE8 000000067F000040020000A000000014C000-000000067F000040020000A0000000150000__0000003B99F7F8A0 000000067F000040020000A000000014C000-000000067F000040020000A0000000150000__0000005D2FFFFB38 000000067F000040020000A0000000150000-000000067F000040020000A0000000154000__00000038E67ABFA0 000000067F000040020000A0000000150000-000000067F000040020000A0000000154000__0000003903F1CFE8 000000067F000040020000A0000000150000-000000067F000040020000A0000000154000__0000003B99F7F8A0 000000067F000040020000A0000000150000-000000067F000040020000A0000000154000__0000005D2FFFFB38 000000067F000040020000A00000001500B9-000000067F000040020000A0000000158A91__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000154000-000000067F000040020000A0000000158000__00000038E67ABFA0 000000067F000040020000A0000000154000-000000067F000040020000A0000000158000__0000003903F1CFE8 000000067F000040020000A0000000154000-000000067F000040020000A0000000158000__0000003B99F7F8A0 000000067F000040020000A0000000154000-000000067F000040020000A0000000158000__0000005D2FFFFB38 000000067F000040020000A0000000158000-000000067F000040020000A000000015C000__00000038E67ABFA0 
000000067F000040020000A0000000158000-000000067F000040020000A000000015C000__0000003903F1CFE8 000000067F000040020000A0000000158000-000000067F000040020000A000000015C000__0000003B99F7F8A0 000000067F000040020000A0000000158000-000000067F000040020000A000000015C000__0000005D2FFFFB38 000000067F000040020000A0000000158A91-000000067F000040020000A0000000161450__0000001C725D0191-0000002070591C61 000000067F000040020000A000000015C000-000000067F000040020000A0000000160000__00000038E67ABFA0 000000067F000040020000A000000015C000-000000067F000040020000A0000000160000__0000003903F1CFE8 000000067F000040020000A000000015C000-000000067F000040020000A0000000160000__0000003B99F7F8A0 000000067F000040020000A000000015C000-000000067F000040020000A0000000160000__0000005D2FFFFB38 000000067F000040020000A0000000160000-000000067F000040020000A0000000164000__00000038E67ABFA0 000000067F000040020000A0000000160000-000000067F000040020000A0000000164000__0000003903F1CFE8 000000067F000040020000A0000000160000-000000067F000040020000A0000000164000__0000003B99F7F8A0 000000067F000040020000A0000000160000-000000067F000040020000A0000000164000__0000005D2FFFFB38 000000067F000040020000A0000000161450-000000067F000040020000A0000000169E01__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000164000-000000067F000040020000A0000000168000__00000038E67ABFA0 000000067F000040020000A0000000164000-000000067F000040020000A0000000168000__0000003903F1CFE8 000000067F000040020000A0000000164000-000000067F000040020000A0000000168000__0000003B99F7F8A0 000000067F000040020000A0000000164000-000000067F000040020000A0000000168000__0000005D2FFFFB38 000000067F000040020000A0000000168000-000000067F000040020000A000000016C000__00000038E67ABFA0 000000067F000040020000A0000000168000-000000067F000040020000A000000016C000__0000003903F1CFE8 000000067F000040020000A0000000168000-000000067F000040020000A000000016C000__0000003B99F7F8A0 000000067F000040020000A0000000168000-000000067F000040020000A000000016C000__0000005D2FFFFB38 
000000067F000040020000A0000000169E01-000000067F000040020000A00000001727DF__0000001C725D0191-0000002070591C61 000000067F000040020000A000000016C000-000000067F000040020000A0000000170000__00000038E67ABFA0 000000067F000040020000A000000016C000-000000067F000040020000A0000000170000__0000003903F1CFE8 000000067F000040020000A000000016C000-000000067F000040020000A0000000170000__0000003B99F7F8A0 000000067F000040020000A000000016C000-000000067F000040020000A0000000170000__0000005D2FFFFB38 000000067F000040020000A0000000170000-000000067F000040020000A0000000174000__00000038E67ABFA0 000000067F000040020000A0000000170000-000000067F000040020000A0000000174000__0000003903F1CFE8 000000067F000040020000A0000000170000-000000067F000040020000A0000000174000__0000003B99F7F8A0 000000067F000040020000A0000000170000-000000067F000040020000A0000000174000__0000005D2FFFFB38 000000067F000040020000A00000001727DF-000000067F000040020000A000000017B1E4__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000174000-000000067F000040020000A0000000178000__00000038E67ABFA0 000000067F000040020000A0000000174000-000000067F000040020000A0000000178000__0000003903F1CFE8 000000067F000040020000A0000000174000-000000067F000040020000A0000000178000__0000003B99F7F8A0 000000067F000040020000A0000000174000-000000067F000040020000A0000000178000__0000005D2FFFFB38 000000067F000040020000A0000000178000-000000067F000040020000A000000017C000__00000038E67ABFA0 000000067F000040020000A0000000178000-000000067F000040020000A000000017C000__0000003903F1CFE8 000000067F000040020000A0000000178000-000000067F000040020000A000000017C000__0000003B99F7F8A0 000000067F000040020000A0000000178000-000000067F000040020000A000000017C000__0000005D2FFFFB38 000000067F000040020000A000000017B1E4-000000067F000040020000A0000000183BE2__0000001C725D0191-0000002070591C61 000000067F000040020000A000000017C000-000000067F000040020000A0000000180000__00000038E67ABFA0 000000067F000040020000A000000017C000-000000067F000040020000A0000000180000__0000003903F1CFE8 
000000067F000040020000A000000017C000-000000067F000040020000A0000000180000__0000003B99F7F8A0 000000067F000040020000A000000017C000-000000067F000040020000A0000000180000__0000005D2FFFFB38 000000067F000040020000A0000000180000-000000067F000040020000A0000000184000__00000038E67ABFA0 000000067F000040020000A0000000180000-000000067F000040020000A0000000184000__0000003903F1CFE8 000000067F000040020000A0000000180000-000000067F000040020000A0000000184000__0000003B99F7F8A0 000000067F000040020000A0000000180000-000000067F000040020000A0000000184000__0000005D2FFFFB38 000000067F000040020000A0000000183BE2-000000067F000040020000A000000018C5D6__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000184000-000000067F000040020000A0000000188000__00000038E67ABFA0 000000067F000040020000A0000000184000-000000067F000040020000A0000000188000__0000003903F1CFE8 000000067F000040020000A0000000184000-000000067F000040020000A0000000188000__0000003B99F7F8A0 000000067F000040020000A0000000184000-000000067F000040020000A0000000188000__0000005D2FFFFB38 000000067F000040020000A0000000188000-000000067F000040020000A000000018C000__00000038E67ABFA0 000000067F000040020000A0000000188000-000000067F000040020000A000000018C000__0000003903F1CFE8 000000067F000040020000A0000000188000-000000067F000040020000A000000018C000__0000003B99F7F8A0 000000067F000040020000A0000000188000-000000067F000040020000A000000018C000__0000005D2FFFFB38 000000067F000040020000A000000018C000-000000067F000040020000A0000000190000__00000038E67ABFA0 000000067F000040020000A000000018C000-000000067F000040020000A0000000190000__0000003903F1CFE8 000000067F000040020000A000000018C000-000000067F000040020000A0000000190000__0000003B99F7F8A0 000000067F000040020000A000000018C000-000000067F000040020000A0000000190000__0000005D2FFFFB38 000000067F000040020000A000000018C5D6-000000067F000040020000A0000000194FB6__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000190000-000000067F000040020000A0000000194000__00000038E67ABFA0 
000000067F000040020000A0000000190000-000000067F000040020000A0000000194000__0000003903F1CFE8 000000067F000040020000A0000000190000-000000067F000040020000A0000000194000__0000003B99F7F8A0 000000067F000040020000A0000000190000-000000067F000040020000A0000000194000__0000005D2FFFFB38 000000067F000040020000A0000000194000-000000067F000040020000A0000000198000__00000038E67ABFA0 000000067F000040020000A0000000194000-000000067F000040020000A0000000198000__0000003903F1CFE8 000000067F000040020000A0000000194000-000000067F000040020000A0000000198000__0000003B99F7F8A0 000000067F000040020000A0000000194000-000000067F000040020000A0000000198000__0000005D2FFFFB38 000000067F000040020000A0000000194FB6-000000067F000040020000A000000019D971__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000198000-000000067F000040020000A000000019C000__00000038E67ABFA0 000000067F000040020000A0000000198000-000000067F000040020000A000000019C000__0000003903F1CFE8 000000067F000040020000A0000000198000-000000067F000040020000A000000019C000__0000003B99F7F8A0 000000067F000040020000A0000000198000-000000067F000040020000A000000019C000__0000005D2FFFFB38 000000067F000040020000A000000019C000-000000067F000040020000A00000001A0000__00000038E67ABFA0 000000067F000040020000A000000019C000-000000067F000040020000A00000001A0000__0000003903F1CFE8 000000067F000040020000A000000019C000-000000067F000040020000A00000001A0000__0000003B99F7F8A0 000000067F000040020000A000000019C000-000000067F000040020000A00000001A0000__0000005D2FFFFB38 000000067F000040020000A000000019D971-000000067F000040020000A00000001A6321__0000001C725D0191-0000002070591C61 000000067F000040020000A00000001A0000-000000067F000040020000A00000001A4000__00000038E67ABFA0 000000067F000040020000A00000001A0000-000000067F000040020000A00000001A4000__0000003903F1CFE8 000000067F000040020000A00000001A0000-000000067F000040020000A00000001A4000__0000003B99F7F8A0 000000067F000040020000A00000001A0000-000000067F000040020000A00000001A4000__0000005D2FFFFB38 
000000067F000040020000A00000001A4000-000000067F000040020000A00000001A8000__00000038E67ABFA0 000000067F000040020000A00000001A4000-000000067F000040020000A00000001A8000__0000003903F1CFE8 000000067F000040020000A00000001A4000-000000067F000040020000A00000001A8000__0000003B99F7F8A0 000000067F000040020000A00000001A4000-000000067F000040020000A00000001A8000__0000005D2FFFFB38 000000067F000040020000A00000001A6321-000000067F000040020000A00000001AECFE__0000001C725D0191-0000002070591C61 000000067F000040020000A00000001A8000-000000067F000040020000A00000001AC000__00000038E67ABFA0 000000067F000040020000A00000001A8000-000000067F000040020000A00000001AC000__0000003903F1CFE8 000000067F000040020000A00000001A8000-000000067F000040020000A00000001AC000__0000003B99F7F8A0 000000067F000040020000A00000001A8000-000000067F000040020000A00000001AC000__0000005D2FFFFB38 000000067F000040020000A00000001AC000-000000067F000040020000A00000001B0000__00000038E67ABFA0 000000067F000040020000A00000001AC000-000000067F000040020000A00000001B0000__0000003903F1CFE8 000000067F000040020000A00000001AC000-000000067F000040020000A00000001B0000__0000003B99F7F8A0 000000067F000040020000A00000001AC000-000000067F000040020000A00000001B0000__0000005D2FFFFB38 000000067F000040020000A00000001AECFE-000000067F000040020000A00000001B76FB__0000001C725D0191-0000002070591C61 000000067F000040020000A00000001B0000-000000067F000040020000A00000001B4000__00000038E67ABFA0 000000067F000040020000A00000001B0000-000000067F000040020000A00000001B4000__0000003903F1CFE8 000000067F000040020000A00000001B0000-000000067F000040020000A00000001B4000__0000003B99F7F8A0 000000067F000040020000A00000001B0000-000000067F000040020000A00000001B4000__0000005D2FFFFB38 000000067F000040020000A00000001B4000-000000067F000040020000A00000001B8000__00000038E67ABFA0 000000067F000040020000A00000001B4000-000000067F000040020000A00000001B8000__0000003903F1CFE8 000000067F000040020000A00000001B4000-000000067F000040020000A00000001B8000__0000003B99F7F8A0 
000000067F000040020000A00000001B4000-000000067F000040020000A00000001B8000__0000005D2FFFFB38 000000067F000040020000A00000001B76FB-000000067F000040020000A00000001C00F5__0000001C725D0191-0000002070591C61 000000067F000040020000A00000001B8000-000000067F000040020000A00000001BC000__00000038E67ABFA0 000000067F000040020000A00000001B8000-000000067F000040020000A00000001BC000__0000003903F1CFE8 000000067F000040020000A00000001B8000-000000067F000040020000A00000001BC000__0000003B99F7F8A0 000000067F000040020000A00000001B8000-000000067F000040020000A00000001BC000__0000005D2FFFFB38 000000067F000040020000A00000001BC000-000000067F000040020000A00000001C0000__00000038E67ABFA0 000000067F000040020000A00000001BC000-000000067F000040020000A00000001C0000__0000003903F1CFE8 000000067F000040020000A00000001BC000-000000067F000040020000A00000001C0000__0000003B99F7F8A0 000000067F000040020000A00000001BC000-000000067F000040020000A00000001C0000__0000005D2FFFFB38 000000067F000040020000A00000001C0000-000000067F000040020000A00000001C4000__00000038E67ABFA0 000000067F000040020000A00000001C0000-000000067F000040020000A00000001C4000__0000003903F1CFE8 000000067F000040020000A00000001C0000-000000067F000040020000A00000001C4000__0000003B99F7F8A0 000000067F000040020000A00000001C0000-000000067F000040020000A00000001C4000__0000005D2FFFFB38 000000067F000040020000A00000001C00F5-000000067F000040020000A00000001C8AE1__0000001C725D0191-0000002070591C61 000000067F000040020000A00000001C4000-000000067F000040020000A00000001C8000__00000038E67ABFA0 000000067F000040020000A00000001C4000-000000067F000040020000A00000001C8000__0000003903F1CFE8 000000067F000040020000A00000001C4000-000000067F000040020000A00000001C8000__0000003B99F7F8A0 000000067F000040020000A00000001C4000-000000067F000040020000A00000001C8000__0000005D2FFFFB38 000000067F000040020000A00000001C8000-000000067F000040020000A00000001CC000__00000038E67ABFA0 000000067F000040020000A00000001C8000-000000067F000040020000A00000001CC000__0000003903F1CFE8 
000000067F000040020000A00000001C8000-000000067F000040020000A00000001CC000__0000003B99F7F8A0 000000067F000040020000A00000001C8000-000000067F000040020000A00000001CC000__0000005D2FFFFB38 000000067F000040020000A00000001C8AE1-000000067F000040020000A00000001D14C2__0000001C725D0191-0000002070591C61 000000067F000040020000A00000001CC000-000000067F000040020000A00000001D0000__00000038E67ABFA0 000000067F000040020000A00000001CC000-000000067F000040020000A00000001D0000__0000003903F1CFE8 000000067F000040020000A00000001CC000-000000067F000040020000A00000001D0000__0000003B99F7F8A0 000000067F000040020000A00000001CC000-000000067F000040020000A00000001D0000__0000005D2FFFFB38 000000067F000040020000A00000001D0000-000000067F000040020000A00000001D4000__00000038E67ABFA0 000000067F000040020000A00000001D0000-000000067F000040020000A00000001D4000__0000003903F1CFE8 000000067F000040020000A00000001D0000-000000067F000040020000A00000001D4000__0000003B99F7F8A0 000000067F000040020000A00000001D0000-000000067F000040020000A00000001D4000__0000005D2FFFFB38 000000067F000040020000A00000001D14C2-000000067F000040020000A00000001D9E7E__0000001C725D0191-0000002070591C61 000000067F000040020000A00000001D4000-000000067F000040020000A00000001D8000__00000038E67ABFA0 000000067F000040020000A00000001D4000-000000067F000040020000A00000001D8000__0000003903F1CFE8 000000067F000040020000A00000001D4000-000000067F000040020000A00000001D8000__0000003B99F7F8A0 000000067F000040020000A00000001D4000-000000067F000040020000A00000001D8000__0000005D2FFFFB38 000000067F000040020000A00000001D8000-000000067F000040020000A00000001DC000__00000038E67ABFA0 000000067F000040020000A00000001D8000-000000067F000040020000A00000001DC000__0000003903F1CFE8 000000067F000040020000A00000001D8000-000000067F000040020000A00000001DC000__0000003B99F7F8A0 000000067F000040020000A00000001D8000-000000067F000040020000A00000001DC000__0000005D2FFFFB38 000000067F000040020000A00000001D9E7E-000000067F000040020000A00000001E282E__0000001C725D0191-0000002070591C61 
000000067F000040020000A00000001DC000-000000067F000040020000A00000001E0000__00000038E67ABFA0 000000067F000040020000A00000001DC000-000000067F000040020000A00000001E0000__0000003903F1CFE8 000000067F000040020000A00000001DC000-000000067F000040020000A00000001E0000__0000003B99F7F8A0 000000067F000040020000A00000001DC000-000000067F000040020000A00000001E0000__0000005D2FFFFB38 000000067F000040020000A00000001E0000-000000067F000040020000A00000001E4000__00000038E67ABFA0 000000067F000040020000A00000001E0000-000000067F000040020000A00000001E4000__0000003903F1CFE8 000000067F000040020000A00000001E0000-000000067F000040020000A00000001E4000__0000003B99F7F8A0 000000067F000040020000A00000001E0000-000000067F000040020000A00000001E4000__0000005D2FFFFB38 000000067F000040020000A00000001E282E-000000067F000040020000A00000001EB21C__0000001C725D0191-0000002070591C61 000000067F000040020000A00000001E4000-000000067F000040020000A00000001E8000__00000038E67ABFA0 000000067F000040020000A00000001E4000-000000067F000040020000A00000001E8000__0000003903F1CFE8 000000067F000040020000A00000001E4000-000000067F000040020000A00000001E8000__0000003B99F7F8A0 000000067F000040020000A00000001E4000-000000067F000040020000A00000001E8000__0000005D2FFFFB38 000000067F000040020000A00000001E8000-000000067F000040020000A00000001EC000__00000038E67ABFA0 000000067F000040020000A00000001E8000-000000067F000040020000A00000001EC000__0000003903F1CFE8 000000067F000040020000A00000001E8000-000000067F000040020000A00000001EC000__0000003B99F7F8A0 000000067F000040020000A00000001E8000-000000067F000040020000A00000001EC000__0000005D2FFFFB38 000000067F000040020000A00000001EB21C-000000067F000040020000A00000001F3C10__0000001C725D0191-0000002070591C61 000000067F000040020000A00000001EC000-000000067F000040020000A00000001F0000__00000038E67ABFA0 000000067F000040020000A00000001EC000-000000067F000040020000A00000001F0000__0000003903F1CFE8 000000067F000040020000A00000001EC000-000000067F000040020000A00000001F0000__0000003B99F7F8A0 
000000067F000040020000A00000001EC000-000000067F000040020000A00000001F0000__0000005D2FFFFB38 000000067F000040020000A00000001F0000-000000067F000040020000A00000001F4000__00000038E67ABFA0 000000067F000040020000A00000001F0000-000000067F000040020000A00000001F4000__0000003903F1CFE8 000000067F000040020000A00000001F0000-000000067F000040020000A00000001F4000__0000003B99F7F8A0 000000067F000040020000A00000001F0000-000000067F000040020000A00000001F4000__0000005D2FFFFB38 000000067F000040020000A00000001F3C10-000000067F000040020000A00000001FC601__0000001C725D0191-0000002070591C61 000000067F000040020000A00000001F4000-000000067F000040020000A00000001F8000__00000038E67ABFA0 000000067F000040020000A00000001F4000-000000067F000040020000A00000001F8000__0000003903F1CFE8 000000067F000040020000A00000001F4000-000000067F000040020000A00000001F8000__0000003B99F7F8A0 000000067F000040020000A00000001F4000-000000067F000040020000A00000001F8000__0000005D2FFFFB38 000000067F000040020000A00000001F8000-000000067F000040020000A00000001FC000__00000038E67ABFA0 000000067F000040020000A00000001F8000-000000067F000040020000A00000001FC000__0000003903F1CFE8 000000067F000040020000A00000001F8000-000000067F000040020000A00000001FC000__0000003B99F7F8A0 000000067F000040020000A00000001F8000-000000067F000040020000A00000001FC000__0000005D2FFFFB38 000000067F000040020000A00000001FC000-000000067F000040020000A0000000200000__00000038E67ABFA0 000000067F000040020000A00000001FC000-000000067F000040020000A0000000200000__0000003903F1CFE8 000000067F000040020000A00000001FC000-000000067F000040020000A0000000200000__0000003B99F7F8A0 000000067F000040020000A00000001FC000-000000067F000040020000A0000000200000__0000005D2FFFFB38 000000067F000040020000A00000001FC601-000000067F000040020000A0000000204FDD__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000200000-000000067F000040020000A0000000204000__00000038E67ABFA0 000000067F000040020000A0000000200000-000000067F000040020000A0000000204000__0000003903F1CFE8 
000000067F000040020000A0000000200000-000000067F000040020000A0000000204000__0000003B99F7F8A0 000000067F000040020000A0000000200000-000000067F000040020000A0000000204000__0000005D2FFFFB38 000000067F000040020000A0000000204000-000000067F000040020000A0000000208000__00000038E67ABFA0 000000067F000040020000A0000000204000-000000067F000040020000A0000000208000__0000003903F1CFE8 000000067F000040020000A0000000204000-000000067F000040020000A0000000208000__0000003B99F7F8A0 000000067F000040020000A0000000204000-000000067F000040020000A0000000208000__0000005D2FFFFB38 000000067F000040020000A0000000204FDD-000000067F000040020000A000000020D9BD__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000208000-000000067F000040020000A000000020C000__00000038E67ABFA0 000000067F000040020000A0000000208000-000000067F000040020000A000000020C000__0000003903F1CFE8 000000067F000040020000A0000000208000-000000067F000040020000A000000020C000__0000003B99F7F8A0 000000067F000040020000A0000000208000-000000067F000040020000A000000020C000__0000005D2FFFFB38 000000067F000040020000A000000020C000-000000067F000040020000A0000000210000__00000038E67ABFA0 000000067F000040020000A000000020C000-000000067F000040020000A0000000210000__0000003903F1CFE8 000000067F000040020000A000000020C000-000000067F000040020000A0000000210000__0000003B99F7F8A0 000000067F000040020000A000000020C000-000000067F000040020000A0000000210000__0000005D2FFFFB38 000000067F000040020000A000000020D9BD-000000067F000040020000A000000021637A__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000210000-000000067F000040020000A0000000214000__00000038E67ABFA0 000000067F000040020000A0000000210000-000000067F000040020000A0000000214000__0000003903F1CFE8 000000067F000040020000A0000000210000-000000067F000040020000A0000000214000__0000003B99F7F8A0 000000067F000040020000A0000000210000-000000067F000040020000A0000000214000__0000005D2FFFFB38 000000067F000040020000A0000000214000-000000067F000040020000A0000000218000__00000038E67ABFA0 
000000067F000040020000A0000000214000-000000067F000040020000A0000000218000__0000003903F1CFE8 000000067F000040020000A0000000214000-000000067F000040020000A0000000218000__0000003B99F7F8A0 000000067F000040020000A0000000214000-000000067F000040020000A0000000218000__0000005D2FFFFB38 000000067F000040020000A000000021637A-000000067F000040020000A000000021ED3A__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000218000-000000067F000040020000A000000021C000__00000038E67ABFA0 000000067F000040020000A0000000218000-000000067F000040020000A000000021C000__0000003903F1CFE8 000000067F000040020000A0000000218000-000000067F000040020000A000000021C000__0000003B99F7F8A0 000000067F000040020000A0000000218000-000000067F000040020000A000000021C000__0000005D2FFFFB38 000000067F000040020000A000000021C000-000000067F000040020000A0000000220000__00000038E67ABFA0 000000067F000040020000A000000021C000-000000067F000040020000A0000000220000__0000003903F1CFE8 000000067F000040020000A000000021C000-000000067F000040020000A0000000220000__0000003B99F7F8A0 000000067F000040020000A000000021C000-000000067F000040020000A0000000220000__0000005D2FFFFB38 000000067F000040020000A000000021ED3A-000000067F000040020000A000000022772C__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000220000-000000067F000040020000A0000000224000__00000038E67ABFA0 000000067F000040020000A0000000220000-000000067F000040020000A0000000224000__0000003903F1CFE8 000000067F000040020000A0000000220000-000000067F000040020000A0000000224000__0000003B99F7F8A0 000000067F000040020000A0000000220000-000000067F000040020000A0000000224000__0000005D2FFFFB38 000000067F000040020000A0000000224000-000000067F000040020000A0000000228000__00000038E67ABFA0 000000067F000040020000A0000000224000-000000067F000040020000A0000000228000__0000003903F1CFE8 000000067F000040020000A0000000224000-000000067F000040020000A0000000228000__0000003B99F7F8A0 000000067F000040020000A0000000224000-000000067F000040020000A0000000228000__0000005D2FFFFB38 
000000067F000040020000A000000022772C-030000000000000000000000000000000002__0000001C725D0191-0000002070591C61 000000067F000040020000A0000000228000-000000067F000040020000A000000022C000__00000038E1ABFE28 000000067F000040020000A0000000228000-000000067F000040020000A000000022C000__00000038E9AF7F00 000000067F000040020000A0000000228000-000000067F000040020000A000000022C000__0000003903F1CFE8 000000067F000040020000A0000000228000-000000067F000040020000A000000022C000__0000003B99F7F8A0 000000067F000040020000A0000000228000-000000067F000040020000A000000022C000__0000005D2FFFFB38 000000067F000040020000A000000022855D-000000067F000040020000A0000000230F51__0000002070591C61-000000211009E359 000000067F000040020000A000000022C000-000000067F000040020000A0000000230000__00000038E1ABFE28 000000067F000040020000A000000022C000-000000067F000040020000A0000000230000__00000038E9AF7F00 000000067F000040020000A000000022C000-000000067F000040020000A0000000230000__0000003903F1CFE8 000000067F000040020000A000000022C000-000000067F000040020000A0000000230000__0000003B99F7F8A0 000000067F000040020000A000000022C000-000000067F000040020000A0000000230000__0000005D2FFFFB38 000000067F000040020000A0000000230000-000000067F000040020000A0000000234000__00000038E1ABFE28 000000067F000040020000A0000000230000-000000067F000040020000A0000000234000__00000038E9AF7F00 000000067F000040020000A0000000230000-000000067F000040020000A0000000234000__0000003903F1CFE8 000000067F000040020000A0000000230000-000000067F000040020000A0000000234000__0000003B99F7F8A0 000000067F000040020000A0000000230000-000000067F000040020000A0000000234000__0000005D2FFFFB38 000000067F000040020000A0000000230F51-000000067F000040020000A0000000239942__0000002070591C61-000000211009E359 000000067F000040020000A0000000234000-000000067F000040020000A0000000238000__00000038E1ABFE28 000000067F000040020000A0000000234000-000000067F000040020000A0000000238000__00000038E9AF7F00 000000067F000040020000A0000000234000-000000067F000040020000A0000000238000__0000003903F1CFE8 
000000067F000040020000A0000000234000-000000067F000040020000A0000000238000__0000003B99F7F8A0 000000067F000040020000A0000000234000-000000067F000040020000A0000000238000__0000005D2FFFFB38 000000067F000040020000A0000000238000-000000067F000040020000A000000023C000__00000038E1ABFE28 000000067F000040020000A0000000238000-000000067F000040020000A000000023C000__00000038E9AF7F00 000000067F000040020000A0000000238000-000000067F000040020000A000000023C000__0000003903F1CFE8 000000067F000040020000A0000000238000-000000067F000040020000A000000023C000__0000003B99F7F8A0 000000067F000040020000A0000000238000-000000067F000040020000A000000023C000__0000005D2FFFFB38 000000067F000040020000A0000000239942-000000067F000040020000A0000000242314__0000002070591C61-000000211009E359 000000067F000040020000A000000023C000-000000067F000040020000A0000000240000__00000038E1ABFE28 000000067F000040020000A000000023C000-000000067F000040020000A0000000240000__00000038E9AF7F00 000000067F000040020000A000000023C000-000000067F000040020000A0000000240000__0000003903F1CFE8 000000067F000040020000A000000023C000-000000067F000040020000A0000000240000__0000003B99F7F8A0 000000067F000040020000A000000023C000-000000067F000040020000A0000000240000__0000005D2FFFFB38 000000067F000040020000A0000000240000-000000067F000040020000A0000000244000__00000038E1ABFE28 000000067F000040020000A0000000240000-000000067F000040020000A0000000244000__00000038E9AF7F00 000000067F000040020000A0000000240000-000000067F000040020000A0000000244000__0000003903F1CFE8 000000067F000040020000A0000000240000-000000067F000040020000A0000000244000__0000003B99F7F8A0 000000067F000040020000A0000000240000-000000067F000040020000A0000000244000__0000005D2FFFFB38 000000067F000040020000A0000000242314-000000067F000040020000A000000024ACDD__0000002070591C61-000000211009E359 000000067F000040020000A0000000244000-000000067F000040020000A0000000248000__00000038E1ABFE28 000000067F000040020000A0000000244000-000000067F000040020000A0000000248000__00000038E9AF7F00 
000000067F000040020000A0000000244000-000000067F000040020000A0000000248000__0000003903F1CFE8 000000067F000040020000A0000000244000-000000067F000040020000A0000000248000__0000003B99F7F8A0 000000067F000040020000A0000000244000-000000067F000040020000A0000000248000__0000005D2FFFFB38 000000067F000040020000A0000000248000-000000067F000040020000A000000024C000__00000038E1ABFE28 000000067F000040020000A0000000248000-000000067F000040020000A000000024C000__00000038E9AF7F00 000000067F000040020000A0000000248000-000000067F000040020000A000000024C000__0000003903F1CFE8 000000067F000040020000A0000000248000-000000067F000040020000A000000024C000__0000003B99F7F8A0 000000067F000040020000A0000000248000-000000067F000040020000A000000024C000__0000005D2FFFFB38 000000067F000040020000A000000024ACDD-000000067F000040020000A0000000253697__0000002070591C61-000000211009E359 000000067F000040020000A000000024C000-000000067F000040020000A0000000250000__00000038E1ABFE28 000000067F000040020000A000000024C000-000000067F000040020000A0000000250000__00000038E9AF7F00 000000067F000040020000A000000024C000-000000067F000040020000A0000000250000__0000003903F1CFE8 000000067F000040020000A000000024C000-000000067F000040020000A0000000250000__0000003B99F7F8A0 000000067F000040020000A000000024C000-000000067F000040020000A0000000250000__0000005D2FFFFB38 000000067F000040020000A0000000250000-000000067F000040020000A0000000254000__00000038E1ABFE28 000000067F000040020000A0000000250000-000000067F000040020000A0000000254000__00000038E9AF7F00 000000067F000040020000A0000000250000-000000067F000040020000A0000000254000__0000003903F1CFE8 000000067F000040020000A0000000250000-000000067F000040020000A0000000254000__0000003B99F7F8A0 000000067F000040020000A0000000250000-000000067F000040020000A0000000254000__0000005D2FFFFB38 000000067F000040020000A0000000253697-000000067F000040020000A000000025C068__0000002070591C61-000000211009E359 000000067F000040020000A0000000254000-000000067F000040020000A0000000258000__00000038E1ABFE28 
000000067F000040020000A0000000254000-000000067F000040020000A0000000258000__00000038E9AF7F00 000000067F000040020000A0000000254000-000000067F000040020000A0000000258000__0000003903F1CFE8 000000067F000040020000A0000000254000-000000067F000040020000A0000000258000__0000003B99F7F8A0 000000067F000040020000A0000000254000-000000067F000040020000A0000000258000__0000005D2FFFFB38 000000067F000040020000A0000000258000-000000067F000040020000A000000025C000__00000038E1ABFE28 000000067F000040020000A0000000258000-000000067F000040020000A000000025C000__00000038E9AF7F00 000000067F000040020000A0000000258000-000000067F000040020000A000000025C000__0000003903F1CFE8 000000067F000040020000A0000000258000-000000067F000040020000A000000025C000__0000003B99F7F8A0 000000067F000040020000A0000000258000-000000067F000040020000A000000025C000__0000005D2FFFFB38 000000067F000040020000A000000025C000-000000067F000040020000A0000000260000__00000038E1ABFE28 000000067F000040020000A000000025C000-000000067F000040020000A0000000260000__00000038E9AF7F00 000000067F000040020000A000000025C000-000000067F000040020000A0000000260000__0000003903F1CFE8 000000067F000040020000A000000025C000-000000067F000040020000A0000000260000__0000003B99F7F8A0 000000067F000040020000A000000025C000-000000067F000040020000A0000000260000__0000005D2FFFFB38 000000067F000040020000A000000025C068-000000067F000040020000A0000000264A5C__0000002070591C61-000000211009E359 000000067F000040020000A0000000260000-000000067F000040020000A0000000264000__00000038E1ABFE28 000000067F000040020000A0000000260000-000000067F000040020000A0000000264000__00000038E9AF7F00 000000067F000040020000A0000000260000-000000067F000040020000A0000000264000__0000003903F1CFE8 000000067F000040020000A0000000260000-000000067F000040020000A0000000264000__0000003B99F7F8A0 000000067F000040020000A0000000260000-000000067F000040020000A0000000264000__0000005D2FFFFB38 000000067F000040020000A0000000264000-000000067F000040020000A0000000268000__00000038E1ABFE28 
000000067F000040020000A0000000264000-000000067F000040020000A0000000268000__00000038E9AF7F00 000000067F000040020000A0000000264000-000000067F000040020000A0000000268000__0000003903F1CFE8 000000067F000040020000A0000000264000-000000067F000040020000A0000000268000__0000003B99F7F8A0 000000067F000040020000A0000000264000-000000067F000040020000A0000000268000__0000005D2FFFFB38 000000067F000040020000A0000000264A5C-000000067F000040020000A000000026D448__0000002070591C61-000000211009E359 000000067F000040020000A0000000268000-000000067F000040020000A000000026C000__00000038E1ABFE28 000000067F000040020000A0000000268000-000000067F000040020000A000000026C000__00000038E9AF7F00 000000067F000040020000A0000000268000-000000067F000040020000A000000026C000__0000003903F1CFE8 000000067F000040020000A0000000268000-000000067F000040020000A000000026C000__0000003B99F7F8A0 000000067F000040020000A0000000268000-000000067F000040020000A000000026C000__0000005D2FFFFB38 000000067F000040020000A000000026C000-000000067F000040020000A0000000270000__00000038E1ABFE28 000000067F000040020000A000000026C000-000000067F000040020000A0000000270000__00000038E9AF7F00 000000067F000040020000A000000026C000-000000067F000040020000A0000000270000__0000003903F1CFE8 000000067F000040020000A000000026C000-000000067F000040020000A0000000270000__0000003B99F7F8A0 000000067F000040020000A000000026C000-000000067F000040020000A0000000270000__0000005D2FFFFB38 000000067F000040020000A000000026D448-000000067F000040020000A0000000275E35__0000002070591C61-000000211009E359 000000067F000040020000A0000000270000-000000067F000040020000A0000000274000__00000038E1ABFE28 000000067F000040020000A0000000270000-000000067F000040020000A0000000274000__00000038E9AF7F00 000000067F000040020000A0000000270000-000000067F000040020000A0000000274000__0000003903F1CFE8 000000067F000040020000A0000000270000-000000067F000040020000A0000000274000__0000003B99F7F8A0 000000067F000040020000A0000000270000-000000067F000040020000A0000000274000__0000005D2FFFFB38 
000000067F000040020000A0000000274000-000000067F000040020000A0000000278000__00000038E1ABFE28 000000067F000040020000A0000000274000-000000067F000040020000A0000000278000__00000038E9AF7F00 000000067F000040020000A0000000274000-000000067F000040020000A0000000278000__0000003903F1CFE8 000000067F000040020000A0000000274000-000000067F000040020000A0000000278000__0000003B99F7F8A0 000000067F000040020000A0000000274000-000000067F000040020000A0000000278000__0000005D2FFFFB38 000000067F000040020000A0000000275E35-000000067F000040020000A000000027E807__0000002070591C61-000000211009E359 000000067F000040020000A0000000278000-000000067F000040020000A000000027C000__00000038E1ABFE28 000000067F000040020000A0000000278000-000000067F000040020000A000000027C000__00000038E9AF7F00 000000067F000040020000A0000000278000-000000067F000040020000A000000027C000__0000003903F1CFE8 000000067F000040020000A0000000278000-000000067F000040020000A000000027C000__0000003B99F7F8A0 000000067F000040020000A0000000278000-000000067F000040020000A000000027C000__0000005D2FFFFB38 000000067F000040020000A000000027C000-000000067F000040020000A0000000280000__00000021DAB8B3D0 000000067F000040020000A000000027C000-000000067F000040020000A0000000280000__00000038E9AF7F00 000000067F000040020000A000000027C000-000000067F000040020000A0000000280000__0000003903F1CFE8 000000067F000040020000A000000027C000-000000067F000040020000A0000000280000__0000003B99F7F8A0 000000067F000040020000A000000027C000-000000067F000040020000A0000000280000__0000005D2FFFFB38 000000067F000040020000A000000027E807-000000067F000040020000A0000200000000__0000002070591C61-000000211009E359 000000067F000040020000A000000027E9D5-000000067F000040020000A00000002873AE__000000211009E359-00000021AFB9E1E9 000000067F000040020000A0000000280000-000000067F000040020000A0000000284000__00000021DAB8B3D0 000000067F000040020000A0000000280000-000000067F000040020000A0000000284000__00000038E9AF7F00 000000067F000040020000A0000000280000-000000067F000040020000A0000000284000__0000003903F1CFE8 
000000067F000040020000A0000000280000-000000067F000040020000A0000000284000__0000003B99F7F8A0 000000067F000040020000A0000000280000-000000067F000040020000A0000000284000__0000005D2FFFFB38 000000067F000040020000A0000000284000-000000067F000040020000A0000000288000__00000021DAB8B3D0 000000067F000040020000A0000000284000-000000067F000040020000A0000000288000__00000038E9AF7F00 000000067F000040020000A0000000284000-000000067F000040020000A0000000288000__0000003903F1CFE8 000000067F000040020000A0000000284000-000000067F000040020000A0000000288000__0000003B99F7F8A0 000000067F000040020000A0000000284000-000000067F000040020000A0000000288000__0000005D2FFFFB38 000000067F000040020000A00000002873AE-000000067F000040020000A000000028FD67__000000211009E359-00000021AFB9E1E9 000000067F000040020000A0000000288000-000000067F000040020000A000000028C000__00000021DAB8B3D0 000000067F000040020000A0000000288000-000000067F000040020000A000000028C000__00000038E9AF7F00 000000067F000040020000A0000000288000-000000067F000040020000A000000028C000__0000003903F1CFE8 000000067F000040020000A0000000288000-000000067F000040020000A000000028C000__0000003B99F7F8A0 000000067F000040020000A0000000288000-000000067F000040020000A000000028C000__0000005D2FFFFB38 000000067F000040020000A000000028C000-000000067F000040020000A0000000290000__00000021DAB8B3D0 000000067F000040020000A000000028C000-000000067F000040020000A0000000290000__00000038E9AF7F00 000000067F000040020000A000000028C000-000000067F000040020000A0000000290000__0000003903F1CFE8 000000067F000040020000A000000028C000-000000067F000040020000A0000000290000__0000003B99F7F8A0 000000067F000040020000A000000028C000-000000067F000040020000A0000000290000__0000005D2FFFFB38 000000067F000040020000A000000028FD67-000000067F000040020000A0000000298739__000000211009E359-00000021AFB9E1E9 000000067F000040020000A0000000290000-000000067F000040020000A0000000294000__00000021DAB8B3D0 000000067F000040020000A0000000290000-000000067F000040020000A0000000294000__00000038E9AF7F00 
000000067F000040020000A0000000290000-000000067F000040020000A0000000294000__0000003903F1CFE8 000000067F000040020000A0000000290000-000000067F000040020000A0000000294000__0000003B99F7F8A0 000000067F000040020000A0000000290000-000000067F000040020000A0000000294000__0000005D2FFFFB38 000000067F000040020000A0000000294000-000000067F000040020000A0000000298000__00000021DAB8B3D0 000000067F000040020000A0000000294000-000000067F000040020000A0000000298000__00000038E9AF7F00 000000067F000040020000A0000000294000-000000067F000040020000A0000000298000__0000003903F1CFE8 000000067F000040020000A0000000294000-000000067F000040020000A0000000298000__0000003B99F7F8A0 000000067F000040020000A0000000294000-000000067F000040020000A0000000298000__0000005D2FFFFB38 000000067F000040020000A0000000298000-000000067F000040020000A000000029C000__00000021DAB8B3D0 000000067F000040020000A0000000298000-000000067F000040020000A000000029C000__00000038E9AF7F00 000000067F000040020000A0000000298000-000000067F000040020000A000000029C000__0000003903F1CFE8 000000067F000040020000A0000000298000-000000067F000040020000A000000029C000__0000003B99F7F8A0 000000067F000040020000A0000000298000-000000067F000040020000A000000029C000__0000005D2FFFFB38 000000067F000040020000A0000000298739-000000067F000040020000A00000002A1125__000000211009E359-00000021AFB9E1E9 000000067F000040020000A000000029C000-000000067F000040020000A00000002A0000__00000021DAB8B3D0 000000067F000040020000A000000029C000-000000067F000040020000A00000002A0000__00000038E9AF7F00 000000067F000040020000A000000029C000-000000067F000040020000A00000002A0000__0000003903F1CFE8 000000067F000040020000A000000029C000-000000067F000040020000A00000002A0000__0000003B99F7F8A0 000000067F000040020000A000000029C000-000000067F000040020000A00000002A0000__0000005D2FFFFB38 000000067F000040020000A00000002A0000-000000067F000040020000A00000002A4000__00000021DAB8B3D0 000000067F000040020000A00000002A0000-000000067F000040020000A00000002A4000__00000038E9AF7F00 
000000067F000040020000A00000002A0000-000000067F000040020000A00000002A4000__0000003903F1CFE8 000000067F000040020000A00000002A0000-000000067F000040020000A00000002A4000__0000003B99F7F8A0 000000067F000040020000A00000002A0000-000000067F000040020000A00000002A4000__0000005D2FFFFB38 000000067F000040020000A00000002A1125-000000067F000040020000A00000002A9B12__000000211009E359-00000021AFB9E1E9 000000067F000040020000A00000002A4000-000000067F000040020000A00000002A8000__00000021DAB8B3D0 000000067F000040020000A00000002A4000-000000067F000040020000A00000002A8000__00000038E9AF7F00 000000067F000040020000A00000002A4000-000000067F000040020000A00000002A8000__0000003903F1CFE8 000000067F000040020000A00000002A4000-000000067F000040020000A00000002A8000__0000003B99F7F8A0 000000067F000040020000A00000002A4000-000000067F000040020000A00000002A8000__0000005D2FFFFB38 000000067F000040020000A00000002A8000-000000067F000040020000A00000002AC000__00000021DAB8B3D0 000000067F000040020000A00000002A8000-000000067F000040020000A00000002AC000__00000038E9AF7F00 000000067F000040020000A00000002A8000-000000067F000040020000A00000002AC000__0000003903F1CFE8 000000067F000040020000A00000002A8000-000000067F000040020000A00000002AC000__0000003B99F7F8A0 000000067F000040020000A00000002A8000-000000067F000040020000A00000002AC000__0000005D2FFFFB38 000000067F000040020000A00000002A9B12-000000067F000040020000A00000002B24F9__000000211009E359-00000021AFB9E1E9 000000067F000040020000A00000002AC000-000000067F000040020000A00000002B0000__00000021DAB8B3D0 000000067F000040020000A00000002AC000-000000067F000040020000A00000002B0000__00000038E9AF7F00 000000067F000040020000A00000002AC000-000000067F000040020000A00000002B0000__0000003903F1CFE8 000000067F000040020000A00000002AC000-000000067F000040020000A00000002B0000__0000003B99F7F8A0 000000067F000040020000A00000002AC000-000000067F000040020000A00000002B0000__0000005D2FFFFB38 000000067F000040020000A00000002B0000-000000067F000040020000A00000002B4000__00000021DAB8B3D0 
000000067F000040020000A00000002B0000-000000067F000040020000A00000002B4000__00000038E9AF7F00 000000067F000040020000A00000002B0000-000000067F000040020000A00000002B4000__0000003903F1CFE8 000000067F000040020000A00000002B0000-000000067F000040020000A00000002B4000__0000003B99F7F8A0 000000067F000040020000A00000002B0000-000000067F000040020000A00000002B4000__0000005D2FFFFB38 000000067F000040020000A00000002B24F9-000000067F000040020000A00000002BAED2__000000211009E359-00000021AFB9E1E9 000000067F000040020000A00000002B4000-000000067F000040020000A00000002B8000__00000021DAB8B3D0 000000067F000040020000A00000002B4000-000000067F000040020000A00000002B8000__00000038E9AF7F00 000000067F000040020000A00000002B4000-000000067F000040020000A00000002B8000__0000003903F1CFE8 000000067F000040020000A00000002B4000-000000067F000040020000A00000002B8000__0000003B99F7F8A0 000000067F000040020000A00000002B4000-000000067F000040020000A00000002B8000__0000005D2FFFFB38 000000067F000040020000A00000002B8000-000000067F000040020000A00000002BC000__00000021DAB8B3D0 000000067F000040020000A00000002B8000-000000067F000040020000A00000002BC000__00000038E9AF7F00 000000067F000040020000A00000002B8000-000000067F000040020000A00000002BC000__0000003903F1CFE8 000000067F000040020000A00000002B8000-000000067F000040020000A00000002BC000__0000003B99F7F8A0 000000067F000040020000A00000002B8000-000000067F000040020000A00000002BC000__0000005D2FFFFB38 000000067F000040020000A00000002BAED2-000000067F000040020000A00000002C3898__000000211009E359-00000021AFB9E1E9 000000067F000040020000A00000002BC000-000000067F000040020000A00000002C0000__00000021DAB8B3D0 000000067F000040020000A00000002BC000-000000067F000040020000A00000002C0000__00000038E9AF7F00 000000067F000040020000A00000002BC000-000000067F000040020000A00000002C0000__0000003903F1CFE8 000000067F000040020000A00000002BC000-000000067F000040020000A00000002C0000__0000003B99F7F8A0 000000067F000040020000A00000002BC000-000000067F000040020000A00000002C0000__0000005D2FFFFB38 
000000067F000040020000A00000002C0000-000000067F000040020000A00000002C4000__00000021DAB8B3D0 000000067F000040020000A00000002C0000-000000067F000040020000A00000002C4000__00000038E9AF7F00 000000067F000040020000A00000002C0000-000000067F000040020000A00000002C4000__0000003903F1CFE8 000000067F000040020000A00000002C0000-000000067F000040020000A00000002C4000__0000003B99F7F8A0 000000067F000040020000A00000002C0000-000000067F000040020000A00000002C4000__0000005D2FFFFB38 000000067F000040020000A00000002C3898-000000067F000040020000A00000002CC255__000000211009E359-00000021AFB9E1E9 000000067F000040020000A00000002C4000-000000067F000040020000A00000002C8000__00000021DAB8B3D0 000000067F000040020000A00000002C4000-000000067F000040020000A00000002C8000__00000038E9AF7F00 000000067F000040020000A00000002C4000-000000067F000040020000A00000002C8000__0000003903F1CFE8 000000067F000040020000A00000002C4000-000000067F000040020000A00000002C8000__0000003B99F7F8A0 000000067F000040020000A00000002C4000-000000067F000040020000A00000002C8000__0000005D2FFFFB38 000000067F000040020000A00000002C8000-000000067F000040020000A00000002CC000__00000021DAB8B3D0 000000067F000040020000A00000002C8000-000000067F000040020000A00000002CC000__00000038E9AF7F00 000000067F000040020000A00000002C8000-000000067F000040020000A00000002CC000__0000003903F1CFE8 000000067F000040020000A00000002C8000-000000067F000040020000A00000002CC000__0000003B99F7F8A0 000000067F000040020000A00000002C8000-000000067F000040020000A00000002CC000__0000005D2FFFFB38 000000067F000040020000A00000002CC000-000000067F000040020000A00000002D0000__00000021DAB8B3D0 000000067F000040020000A00000002CC000-000000067F000040020000A00000002D0000__00000038E9AF7F00 000000067F000040020000A00000002CC000-000000067F000040020000A00000002D0000__0000003903F1CFE8 000000067F000040020000A00000002CC000-000000067F000040020000A00000002D0000__0000003B99F7F8A0 000000067F000040020000A00000002CC000-000000067F000040020000A00000002D0000__0000005D2FFFFB38 
000000067F000040020000A00000002CC255-000000067F000040020000A00000002D4C30__000000211009E359-00000021AFB9E1E9 000000067F000040020000A00000002D0000-000000067F000040020000A00000002D4000__00000021DAB8B3D0 000000067F000040020000A00000002D0000-000000067F000040020000A00000002D4000__00000038E9AF7F00 000000067F000040020000A00000002D0000-000000067F000040020000A00000002D4000__0000003903F1CFE8 000000067F000040020000A00000002D0000-000000067F000040020000A00000002D4000__0000003B99F7F8A0 000000067F000040020000A00000002D0000-000000067F000040020000A00000002D4000__0000005D2FFFFB38 000000067F000040020000A00000002D4000-000000067F000040020000A00000002D8000__00000021DAB8B3D0 000000067F000040020000A00000002D4000-000000067F000040020000A00000002D8000__00000038E67ABFA0 000000067F000040020000A00000002D4000-000000067F000040020000A00000002D8000__0000003903F1CFE8 000000067F000040020000A00000002D4000-000000067F000040020000A00000002D8000__0000003B99F7F8A0 000000067F000040020000A00000002D4000-000000067F000040020000A00000002D8000__0000005D2FFFFB38 000000067F000040020000A00000002D4C30-000000067F000040020000A0000200000000__000000211009E359-00000021AFB9E1E9 000000067F000040020000A00000002D4E9B-000000067F000040020000A00000002DD894__00000021AFB9E1E9-000000225F61DD41 000000067F000040020000A00000002D8000-000000067F000040020000A00000002DC000__00000021DAB8B3D0 000000067F000040020000A00000002D8000-000000067F000040020000A00000002DC000__00000038E67ABFA0 000000067F000040020000A00000002D8000-000000067F000040020000A00000002DC000__0000003903F1CFE8 000000067F000040020000A00000002D8000-000000067F000040020000A00000002DC000__0000003B99F7F8A0 000000067F000040020000A00000002D8000-000000067F000040020000A00000002DC000__0000005D2FFFFB38 000000067F000040020000A00000002DC000-000000067F000040020000A00000002E0000__00000021DAB8B3D0 000000067F000040020000A00000002DC000-000000067F000040020000A00000002E0000__00000038E67ABFA0 000000067F000040020000A00000002DC000-000000067F000040020000A00000002E0000__0000003903F1CFE8 
000000067F000040020000A00000002DC000-000000067F000040020000A00000002E0000__0000003B99F7F8A0 000000067F000040020000A00000002DC000-000000067F000040020000A00000002E0000__0000005D2FFFFB38 000000067F000040020000A00000002DD894-000000067F000040020000A00000002E6287__00000021AFB9E1E9-000000225F61DD41 000000067F000040020000A00000002E0000-000000067F000040020000A00000002E4000__00000021DAB8B3D0 000000067F000040020000A00000002E0000-000000067F000040020000A00000002E4000__00000038E67ABFA0 000000067F000040020000A00000002E0000-000000067F000040020000A00000002E4000__0000003903F1CFE8 000000067F000040020000A00000002E0000-000000067F000040020000A00000002E4000__0000003B99F7F8A0 000000067F000040020000A00000002E0000-000000067F000040020000A00000002E4000__0000005D2FFFFB38 000000067F000040020000A00000002E4000-000000067F000040020000A00000002E8000__00000021DAB8B3D0 000000067F000040020000A00000002E4000-000000067F000040020000A00000002E8000__00000038E67ABFA0 000000067F000040020000A00000002E4000-000000067F000040020000A00000002E8000__0000003903F1CFE8 000000067F000040020000A00000002E4000-000000067F000040020000A00000002E8000__0000003B99F7F8A0 000000067F000040020000A00000002E4000-000000067F000040020000A00000002E8000__0000005D2FFFFB38 000000067F000040020000A00000002E6287-000000067F000040020000A00000002EEC65__00000021AFB9E1E9-000000225F61DD41 000000067F000040020000A00000002E8000-000000067F000040020000A00000002EC000__00000021DAB8B3D0 000000067F000040020000A00000002E8000-000000067F000040020000A00000002EC000__00000038E67ABFA0 000000067F000040020000A00000002E8000-000000067F000040020000A00000002EC000__0000003903F1CFE8 000000067F000040020000A00000002E8000-000000067F000040020000A00000002EC000__0000003B99F7F8A0 000000067F000040020000A00000002E8000-000000067F000040020000A00000002EC000__0000005D2FFFFB38 000000067F000040020000A00000002EC000-000000067F000040020000A00000002F0000__00000038E67ABFA0 000000067F000040020000A00000002EC000-000000067F000040020000A00000002F0000__0000003903F1CFE8 
000000067F000040020000A00000002EC000-000000067F000040020000A00000002F0000__0000003B99F7F8A0 000000067F000040020000A00000002EC000-000000067F000040020000A00000002F0000__0000005D2FFFFB38 000000067F000040020000A00000002EC000-030000000000000000000000000000000002__00000021DAB8B3D0 000000067F000040020000A00000002EEC65-000000067F000040020000A00000002F7636__00000021AFB9E1E9-000000225F61DD41 000000067F000040020000A00000002F0000-000000067F000040020000A00000002F4000__00000038E67ABFA0 000000067F000040020000A00000002F0000-000000067F000040020000A00000002F4000__0000003903F1CFE8 000000067F000040020000A00000002F0000-000000067F000040020000A00000002F4000__0000003B99F7F8A0 000000067F000040020000A00000002F0000-000000067F000040020000A00000002F4000__0000005D2FFFFB38 000000067F000040020000A00000002F4000-000000067F000040020000A00000002F8000__00000038E67ABFA0 000000067F000040020000A00000002F4000-000000067F000040020000A00000002F8000__0000003903F1CFE8 000000067F000040020000A00000002F4000-000000067F000040020000A00000002F8000__0000003B99F7F8A0 000000067F000040020000A00000002F4000-000000067F000040020000A00000002F8000__0000005D2FFFFB38 000000067F000040020000A00000002F7636-000000067F000040020000A00000002FFFF6__00000021AFB9E1E9-000000225F61DD41 000000067F000040020000A00000002F8000-000000067F000040020000A00000002FC000__00000038E67ABFA0 000000067F000040020000A00000002F8000-000000067F000040020000A00000002FC000__0000003903F1CFE8 000000067F000040020000A00000002F8000-000000067F000040020000A00000002FC000__0000003B99F7F8A0 000000067F000040020000A00000002F8000-000000067F000040020000A00000002FC000__0000005D2FFFFB38 000000067F000040020000A00000002FC000-000000067F000040020000A0000000300000__00000038E67ABFA0 000000067F000040020000A00000002FC000-000000067F000040020000A0000000300000__0000003903F1CFE8 000000067F000040020000A00000002FC000-000000067F000040020000A0000000300000__0000003B99F7F8A0 000000067F000040020000A00000002FC000-000000067F000040020000A0000000300000__0000005D2FFFFB38 
000000067F000040020000A00000002FFFF6-000000067F000040020000A00000003089B9__00000021AFB9E1E9-000000225F61DD41 000000067F000040020000A0000000300000-000000067F000040020000A0000000304000__00000038E67ABFA0 000000067F000040020000A0000000300000-000000067F000040020000A0000000304000__0000003903F1CFE8 000000067F000040020000A0000000300000-000000067F000040020000A0000000304000__0000003B99F7F8A0 000000067F000040020000A0000000300000-000000067F000040020000A0000000304000__0000005D2FFFFB38 000000067F000040020000A0000000304000-000000067F000040020000A0000000308000__00000038E67ABFA0 000000067F000040020000A0000000304000-000000067F000040020000A0000000308000__0000003903F1CFE8 000000067F000040020000A0000000304000-000000067F000040020000A0000000308000__0000003B99F7F8A0 000000067F000040020000A0000000304000-000000067F000040020000A0000000308000__0000005D2FFFFB38 000000067F000040020000A0000000308000-000000067F000040020000A000000030C000__00000038E67ABFA0 000000067F000040020000A0000000308000-000000067F000040020000A000000030C000__0000003903F1CFE8 000000067F000040020000A0000000308000-000000067F000040020000A000000030C000__0000003B99F7F8A0 000000067F000040020000A0000000308000-000000067F000040020000A000000030C000__0000005D2FFFFB38 000000067F000040020000A00000003089B9-000000067F000040020000A00000003113A3__00000021AFB9E1E9-000000225F61DD41 000000067F000040020000A000000030C000-000000067F000040020000A0000000310000__00000038E67ABFA0 000000067F000040020000A000000030C000-000000067F000040020000A0000000310000__0000003903F1CFE8 000000067F000040020000A000000030C000-000000067F000040020000A0000000310000__0000003B99F7F8A0 000000067F000040020000A000000030C000-000000067F000040020000A0000000310000__0000005D2FFFFB38 000000067F000040020000A0000000310000-000000067F000040020000A0000000314000__00000038E67ABFA0 000000067F000040020000A0000000310000-000000067F000040020000A0000000314000__0000003903F1CFE8 000000067F000040020000A0000000310000-000000067F000040020000A0000000314000__0000003B99F7F8A0 
000000067F000040020000A0000000310000-000000067F000040020000A0000000314000__0000005D2FFFFB38 000000067F000040020000A00000003113A3-000000067F000040020000A0000000319D9B__00000021AFB9E1E9-000000225F61DD41 000000067F000040020000A0000000314000-000000067F000040020000A0000000318000__00000038E67ABFA0 000000067F000040020000A0000000314000-000000067F000040020000A0000000318000__0000003903F1CFE8 000000067F000040020000A0000000314000-000000067F000040020000A0000000318000__0000003B99F7F8A0 000000067F000040020000A0000000314000-000000067F000040020000A0000000318000__0000005D2FFFFB38 000000067F000040020000A0000000318000-000000067F000040020000A000000031C000__00000038E67ABFA0 000000067F000040020000A0000000318000-000000067F000040020000A000000031C000__0000003903F1CFE8 000000067F000040020000A0000000318000-000000067F000040020000A000000031C000__0000003B99F7F8A0 000000067F000040020000A0000000318000-000000067F000040020000A000000031C000__0000005D2FFFFB38 000000067F000040020000A0000000319D9B-000000067F000040020000A0000000322787__00000021AFB9E1E9-000000225F61DD41 000000067F000040020000A000000031C000-000000067F000040020000A0000000320000__00000038E67ABFA0 000000067F000040020000A000000031C000-000000067F000040020000A0000000320000__0000003903F1CFE8 000000067F000040020000A000000031C000-000000067F000040020000A0000000320000__0000003B99F7F8A0 000000067F000040020000A000000031C000-000000067F000040020000A0000000320000__0000005D2FFFFB38 000000067F000040020000A0000000320000-000000067F000040020000A0000000324000__00000038E67ABFA0 000000067F000040020000A0000000320000-000000067F000040020000A0000000324000__0000003903F1CFE8 000000067F000040020000A0000000320000-000000067F000040020000A0000000324000__0000003B99F7F8A0 000000067F000040020000A0000000320000-000000067F000040020000A0000000324000__0000005D2FFFFB38 000000067F000040020000A0000000322787-000000067F000040020000A000000032B167__00000021AFB9E1E9-000000225F61DD41 000000067F000040020000A0000000324000-000000067F000040020000A0000000328000__00000038E67ABFA0 
000000067F000040020000A0000000324000-000000067F000040020000A0000000328000__0000003903F1CFE8 000000067F000040020000A0000000324000-000000067F000040020000A0000000328000__0000003B99F7F8A0 000000067F000040020000A0000000324000-000000067F000040020000A0000000328000__0000005D2FFFFB38 000000067F000040020000A0000000328000-000000067F000040020000A000000032C000__00000038E67ABFA0 000000067F000040020000A0000000328000-000000067F000040020000A000000032C000__0000003903F1CFE8 000000067F000040020000A0000000328000-000000067F000040020000A000000032C000__0000003B99F7F8A0 000000067F000040020000A0000000328000-000000067F000040020000A000000032C000__0000005D2FFFFB38 000000067F000040020000A000000032B167-000000067F000040020000A0000000333B49__00000021AFB9E1E9-000000225F61DD41 000000067F000040020000A000000032C000-000000067F000040020000A0000000330000__00000038E67ABFA0 000000067F000040020000A000000032C000-000000067F000040020000A0000000330000__0000003903F1CFE8 000000067F000040020000A000000032C000-000000067F000040020000A0000000330000__0000003B99F7F8A0 000000067F000040020000A000000032C000-000000067F000040020000A0000000330000__0000005D2FFFFB38 000000067F000040020000A0000000330000-000000067F000040020000A0000000334000__00000038E1ABFE28 000000067F000040020000A0000000330000-000000067F000040020000A0000000334000__00000038E9AF7F00 000000067F000040020000A0000000330000-000000067F000040020000A0000000334000__0000003903F1CFE8 000000067F000040020000A0000000330000-000000067F000040020000A0000000334000__0000003B99F7F8A0 000000067F000040020000A0000000330000-000000067F000040020000A0000000334000__0000005D2FFFFB38 000000067F000040020000A0000000333B49-000000067F000040020000A0000200000000__00000021AFB9E1E9-000000225F61DD41 000000067F000040020000A0000000333D2A-000000067F000040020000A000000033C6E5__000000225F61DD41-000000230F09F3F1 000000067F000040020000A0000000334000-000000067F000040020000A0000000338000__00000038E1ABFE28 000000067F000040020000A0000000334000-000000067F000040020000A0000000338000__00000038E9AF7F00 
000000067F000040020000A0000000334000-000000067F000040020000A0000000338000__0000003903F1CFE8 000000067F000040020000A0000000334000-000000067F000040020000A0000000338000__0000003B99F7F8A0 000000067F000040020000A0000000334000-000000067F000040020000A0000000338000__0000005D2FFFFB38 000000067F000040020000A0000000338000-000000067F000040020000A000000033C000__00000038E1ABFE28 000000067F000040020000A0000000338000-000000067F000040020000A000000033C000__00000038E9AF7F00 000000067F000040020000A0000000338000-000000067F000040020000A000000033C000__0000003903F1CFE8 000000067F000040020000A0000000338000-000000067F000040020000A000000033C000__0000003B99F7F8A0 000000067F000040020000A0000000338000-000000067F000040020000A000000033C000__0000005D2FFFFB38 000000067F000040020000A000000033C000-000000067F000040020000A0000000340000__00000038E1ABFE28 000000067F000040020000A000000033C000-000000067F000040020000A0000000340000__00000038E9AF7F00 000000067F000040020000A000000033C000-000000067F000040020000A0000000340000__0000003903F1CFE8 000000067F000040020000A000000033C000-000000067F000040020000A0000000340000__0000003B99F7F8A0 000000067F000040020000A000000033C000-000000067F000040020000A0000000340000__0000005D2FFFFB38 000000067F000040020000A000000033C6E5-000000067F000040020000A00000003450AA__000000225F61DD41-000000230F09F3F1 000000067F000040020000A0000000340000-000000067F000040020000A0000000344000__00000038E1ABFE28 000000067F000040020000A0000000340000-000000067F000040020000A0000000344000__00000038E9AF7F00 000000067F000040020000A0000000340000-000000067F000040020000A0000000344000__0000003903F1CFE8 000000067F000040020000A0000000340000-000000067F000040020000A0000000344000__0000003B99F7F8A0 000000067F000040020000A0000000340000-000000067F000040020000A0000000344000__0000005D2FFFFB38 000000067F000040020000A0000000344000-000000067F000040020000A0000000348000__00000038E1ABFE28 000000067F000040020000A0000000344000-000000067F000040020000A0000000348000__00000038E9AF7F00 
000000067F000040020000A0000000344000-000000067F000040020000A0000000348000__0000003903F1CFE8 000000067F000040020000A0000000344000-000000067F000040020000A0000000348000__0000003B99F7F8A0 000000067F000040020000A0000000344000-000000067F000040020000A0000000348000__0000005D2FFFFB38 000000067F000040020000A00000003450AA-000000067F000040020000A000000034DAA2__000000225F61DD41-000000230F09F3F1 000000067F000040020000A0000000348000-000000067F000040020000A000000034C000__00000038E1ABFE28 000000067F000040020000A0000000348000-000000067F000040020000A000000034C000__00000038E9AF7F00 000000067F000040020000A0000000348000-000000067F000040020000A000000034C000__0000003903F1CFE8 000000067F000040020000A0000000348000-000000067F000040020000A000000034C000__0000003B99F7F8A0 000000067F000040020000A0000000348000-000000067F000040020000A000000034C000__0000005D2FFFFB38 000000067F000040020000A000000034C000-000000067F000040020000A0000000350000__00000038E1ABFE28 000000067F000040020000A000000034C000-000000067F000040020000A0000000350000__00000038E9AF7F00 000000067F000040020000A000000034C000-000000067F000040020000A0000000350000__0000003903F1CFE8 000000067F000040020000A000000034C000-000000067F000040020000A0000000350000__0000003B99F7F8A0 000000067F000040020000A000000034C000-000000067F000040020000A0000000350000__0000005D2FFFFB38 000000067F000040020000A000000034DAA2-000000067F000040020000A000000035649B__000000225F61DD41-000000230F09F3F1 000000067F000040020000A0000000350000-000000067F000040020000A0000000354000__00000038E1ABFE28 000000067F000040020000A0000000350000-000000067F000040020000A0000000354000__00000038E9AF7F00 000000067F000040020000A0000000350000-000000067F000040020000A0000000354000__0000003903F1CFE8 000000067F000040020000A0000000350000-000000067F000040020000A0000000354000__0000003B99F7F8A0 000000067F000040020000A0000000350000-000000067F000040020000A0000000354000__0000005D2FFFFB38 000000067F000040020000A0000000354000-000000067F000040020000A0000000358000__00000038E1ABFE28 
000000067F000040020000A0000000354000-000000067F000040020000A0000000358000__00000038E9AF7F00 000000067F000040020000A0000000354000-000000067F000040020000A0000000358000__0000003903F1CFE8 000000067F000040020000A0000000354000-000000067F000040020000A0000000358000__0000003B99F7F8A0 000000067F000040020000A0000000354000-000000067F000040020000A0000000358000__0000005D2FFFFB38 000000067F000040020000A000000035649B-000000067F000040020000A000000035EE91__000000225F61DD41-000000230F09F3F1 000000067F000040020000A0000000358000-000000067F000040020000A000000035C000__00000038E1ABFE28 000000067F000040020000A0000000358000-000000067F000040020000A000000035C000__00000038E9AF7F00 000000067F000040020000A0000000358000-000000067F000040020000A000000035C000__0000003903F1CFE8 000000067F000040020000A0000000358000-000000067F000040020000A000000035C000__0000003B99F7F8A0 000000067F000040020000A0000000358000-000000067F000040020000A000000035C000__0000005D2FFFFB38 000000067F000040020000A000000035C000-000000067F000040020000A0000000360000__00000038E1ABFE28 000000067F000040020000A000000035C000-000000067F000040020000A0000000360000__00000038E9AF7F00 000000067F000040020000A000000035C000-000000067F000040020000A0000000360000__0000003903F1CFE8 000000067F000040020000A000000035C000-000000067F000040020000A0000000360000__0000003B99F7F8A0 000000067F000040020000A000000035C000-000000067F000040020000A0000000360000__0000005D2FFFFB38 000000067F000040020000A000000035EE91-000000067F000040020000A0000000367875__000000225F61DD41-000000230F09F3F1 000000067F000040020000A0000000360000-000000067F000040020000A0000000364000__00000038E1ABFE28 000000067F000040020000A0000000360000-000000067F000040020000A0000000364000__00000038E9AF7F00 000000067F000040020000A0000000360000-000000067F000040020000A0000000364000__0000003903F1CFE8 000000067F000040020000A0000000360000-000000067F000040020000A0000000364000__0000003B99F7F8A0 000000067F000040020000A0000000360000-000000067F000040020000A0000000364000__0000005D2FFFFB38 
000000067F000040020000A0000000364000-000000067F000040020000A0000000368000__00000038E1ABFE28 000000067F000040020000A0000000364000-000000067F000040020000A0000000368000__00000038E9AF7F00 000000067F000040020000A0000000364000-000000067F000040020000A0000000368000__0000003903F1CFE8 000000067F000040020000A0000000364000-000000067F000040020000A0000000368000__0000003B99F7F8A0 000000067F000040020000A0000000364000-000000067F000040020000A0000000368000__0000005D2FFFFB38 000000067F000040020000A0000000367875-000000067F000040020000A0000000370246__000000225F61DD41-000000230F09F3F1 000000067F000040020000A0000000368000-000000067F000040020000A000000036C000__00000038E1ABFE28 000000067F000040020000A0000000368000-000000067F000040020000A000000036C000__00000038E9AF7F00 000000067F000040020000A0000000368000-000000067F000040020000A000000036C000__0000003903F1CFE8 000000067F000040020000A0000000368000-000000067F000040020000A000000036C000__0000003B99F7F8A0 000000067F000040020000A0000000368000-000000067F000040020000A000000036C000__0000005D2FFFFB38 000000067F000040020000A000000036C000-000000067F000040020000A0000000370000__00000038E1ABFE28 000000067F000040020000A000000036C000-000000067F000040020000A0000000370000__00000038E9AF7F00 000000067F000040020000A000000036C000-000000067F000040020000A0000000370000__0000003903F1CFE8 000000067F000040020000A000000036C000-000000067F000040020000A0000000370000__0000003B99F7F8A0 000000067F000040020000A000000036C000-000000067F000040020000A0000000370000__0000005D2FFFFB38 000000067F000040020000A0000000370000-000000067F000040020000A0000000374000__00000038E1ABFE28 000000067F000040020000A0000000370000-000000067F000040020000A0000000374000__00000038E9AF7F00 000000067F000040020000A0000000370000-000000067F000040020000A0000000374000__0000003903F1CFE8 000000067F000040020000A0000000370000-000000067F000040020000A0000000374000__0000003B99F7F8A0 000000067F000040020000A0000000370000-000000067F000040020000A0000000374000__0000005D2FFFFB38 
000000067F000040020000A0000000370246-000000067F000040020000A0000000378BFE__000000225F61DD41-000000230F09F3F1 000000067F000040020000A0000000374000-000000067F000040020000A0000000378000__00000038E1ABFE28 000000067F000040020000A0000000374000-000000067F000040020000A0000000378000__00000038E9AF7F00 000000067F000040020000A0000000374000-000000067F000040020000A0000000378000__0000003903F1CFE8 000000067F000040020000A0000000374000-000000067F000040020000A0000000378000__0000003B99F7F8A0 000000067F000040020000A0000000374000-000000067F000040020000A0000000378000__0000005D2FFFFB38 000000067F000040020000A0000000378000-000000067F000040020000A000000037C000__00000038E1ABFE28 000000067F000040020000A0000000378000-000000067F000040020000A000000037C000__00000038E9AF7F00 000000067F000040020000A0000000378000-000000067F000040020000A000000037C000__0000003903F1CFE8 000000067F000040020000A0000000378000-000000067F000040020000A000000037C000__0000003B99F7F8A0 000000067F000040020000A0000000378000-000000067F000040020000A000000037C000__0000005D2FFFFB38 000000067F000040020000A0000000378BFE-000000067F000040020000A00000003815CC__000000225F61DD41-000000230F09F3F1 000000067F000040020000A000000037C000-000000067F000040020000A0000000380000__00000038E1ABFE28 000000067F000040020000A000000037C000-000000067F000040020000A0000000380000__00000038E9AF7F00 000000067F000040020000A000000037C000-000000067F000040020000A0000000380000__0000003903F1CFE8 000000067F000040020000A000000037C000-000000067F000040020000A0000000380000__0000003B99F7F8A0 000000067F000040020000A000000037C000-000000067F000040020000A0000000380000__0000005D2FFFFB38 000000067F000040020000A0000000380000-000000067F000040020000A0000000384000__00000038E1ABFE28 000000067F000040020000A0000000380000-000000067F000040020000A0000000384000__00000038E9AF7F00 000000067F000040020000A0000000380000-000000067F000040020000A0000000384000__0000003903F1CFE8 000000067F000040020000A0000000380000-000000067F000040020000A0000000384000__0000003B99F7F8A0 
000000067F000040020000A0000000380000-000000067F000040020000A0000000384000__0000005D2FFFFB38 000000067F000040020000A00000003815CC-000000067F000040020000A0000000389FCA__000000225F61DD41-000000230F09F3F1 000000067F000040020000A0000000384000-000000067F000040020000A0000000388000__00000038E1ABFE28 000000067F000040020000A0000000384000-000000067F000040020000A0000000388000__00000038E9AF7F00 000000067F000040020000A0000000384000-000000067F000040020000A0000000388000__0000003903F1CFE8 000000067F000040020000A0000000384000-000000067F000040020000A0000000388000__0000003B99F7F8A0 000000067F000040020000A0000000384000-000000067F000040020000A0000000388000__0000005D2FFFFB38 000000067F000040020000A0000000388000-000000067F000040020000A000000038C000__00000038E1ABFE28 000000067F000040020000A0000000388000-000000067F000040020000A000000038C000__00000038E9AF7F00 000000067F000040020000A0000000388000-000000067F000040020000A000000038C000__0000003903F1CFE8 000000067F000040020000A0000000388000-000000067F000040020000A000000038C000__0000003B99F7F8A0 000000067F000040020000A0000000388000-000000067F000040020000A000000038C000__0000005D2FFFFB38 000000067F000040020000A0000000389FCA-000000067F000040020000A00000003929C4__000000225F61DD41-000000230F09F3F1 000000067F000040020000A000000038C000-000000067F000040020000A0000000390000__00000038E1ABFE28 000000067F000040020000A000000038C000-000000067F000040020000A0000000390000__00000038E9AF7F00 000000067F000040020000A000000038C000-000000067F000040020000A0000000390000__0000003903F1CFE8 000000067F000040020000A000000038C000-000000067F000040020000A0000000390000__0000003B99F7F8A0 000000067F000040020000A000000038C000-000000067F000040020000A0000000390000__0000005D2FFFFB38 000000067F000040020000A0000000390000-000000067F000040020000A0000000394000__00000023DF7FF060 000000067F000040020000A0000000390000-000000067F000040020000A0000000394000__00000038E9AF7F00 000000067F000040020000A0000000390000-000000067F000040020000A0000000394000__0000003903F1CFE8 
000000067F000040020000A0000000390000-000000067F000040020000A0000000394000__0000003B99F7F8A0 000000067F000040020000A0000000390000-000000067F000040020000A0000000394000__0000005D2FFFFB38 000000067F000040020000A00000003929C4-000000067F000040020000A0000200000000__000000225F61DD41-000000230F09F3F1 000000067F000040020000A0000000392C61-000000067F000040020000A000000039B644__000000230F09F3F1-00000023AEB9F2B9 000000067F000040020000A0000000394000-000000067F000040020000A0000000398000__00000023DF7FF060 000000067F000040020000A0000000394000-000000067F000040020000A0000000398000__00000038E9AF7F00 000000067F000040020000A0000000394000-000000067F000040020000A0000000398000__0000003903F1CFE8 000000067F000040020000A0000000394000-000000067F000040020000A0000000398000__0000003B99F7F8A0 000000067F000040020000A0000000394000-000000067F000040020000A0000000398000__0000005D2FFFFB38 000000067F000040020000A0000000398000-000000067F000040020000A000000039C000__00000023DF7FF060 000000067F000040020000A0000000398000-000000067F000040020000A000000039C000__00000038E9AF7F00 000000067F000040020000A0000000398000-000000067F000040020000A000000039C000__0000003903F1CFE8 000000067F000040020000A0000000398000-000000067F000040020000A000000039C000__0000003B99F7F8A0 000000067F000040020000A0000000398000-000000067F000040020000A000000039C000__0000005D2FFFFB38 000000067F000040020000A000000039B644-000000067F000040020000A00000003A4019__000000230F09F3F1-00000023AEB9F2B9 000000067F000040020000A000000039C000-000000067F000040020000A00000003A0000__00000023DF7FF060 000000067F000040020000A000000039C000-000000067F000040020000A00000003A0000__00000038E9AF7F00 000000067F000040020000A000000039C000-000000067F000040020000A00000003A0000__0000003903F1CFE8 000000067F000040020000A000000039C000-000000067F000040020000A00000003A0000__0000003B99F7F8A0 000000067F000040020000A000000039C000-000000067F000040020000A00000003A0000__0000005D2FFFFB38 000000067F000040020000A00000003A0000-000000067F000040020000A00000003A4000__00000023DF7FF060 
000000067F000040020000A00000003A0000-000000067F000040020000A00000003A4000__00000038E9AF7F00 000000067F000040020000A00000003A0000-000000067F000040020000A00000003A4000__0000003903F1CFE8 000000067F000040020000A00000003A0000-000000067F000040020000A00000003A4000__0000003B99F7F8A0 000000067F000040020000A00000003A0000-000000067F000040020000A00000003A4000__0000005D2FFFFB38 000000067F000040020000A00000003A4000-000000067F000040020000A00000003A8000__00000023DF7FF060 000000067F000040020000A00000003A4000-000000067F000040020000A00000003A8000__00000038E9AF7F00 000000067F000040020000A00000003A4000-000000067F000040020000A00000003A8000__0000003903F1CFE8 000000067F000040020000A00000003A4000-000000067F000040020000A00000003A8000__0000003B99F7F8A0 000000067F000040020000A00000003A4000-000000067F000040020000A00000003A8000__0000005D2FFFFB38 000000067F000040020000A00000003A4019-000000067F000040020000A00000003AC9D6__000000230F09F3F1-00000023AEB9F2B9 000000067F000040020000A00000003A8000-000000067F000040020000A00000003AC000__00000023DF7FF060 000000067F000040020000A00000003A8000-000000067F000040020000A00000003AC000__00000038E9AF7F00 000000067F000040020000A00000003A8000-000000067F000040020000A00000003AC000__0000003903F1CFE8 000000067F000040020000A00000003A8000-000000067F000040020000A00000003AC000__0000003B99F7F8A0 000000067F000040020000A00000003A8000-000000067F000040020000A00000003AC000__0000005D2FFFFB38 000000067F000040020000A00000003AC000-000000067F000040020000A00000003B0000__00000023DF7FF060 000000067F000040020000A00000003AC000-000000067F000040020000A00000003B0000__00000038E9AF7F00 000000067F000040020000A00000003AC000-000000067F000040020000A00000003B0000__0000003903F1CFE8 000000067F000040020000A00000003AC000-000000067F000040020000A00000003B0000__0000003B99F7F8A0 000000067F000040020000A00000003AC000-000000067F000040020000A00000003B0000__0000005D2FFFFB38 000000067F000040020000A00000003AC9D6-000000067F000040020000A00000003B5396__000000230F09F3F1-00000023AEB9F2B9 
000000067F000040020000A00000003B0000-000000067F000040020000A00000003B4000__00000023DF7FF060 000000067F000040020000A00000003B0000-000000067F000040020000A00000003B4000__00000038E9AF7F00 000000067F000040020000A00000003B0000-000000067F000040020000A00000003B4000__0000003903F1CFE8 000000067F000040020000A00000003B0000-000000067F000040020000A00000003B4000__0000003B99F7F8A0 000000067F000040020000A00000003B0000-000000067F000040020000A00000003B4000__0000005D2FFFFB38 000000067F000040020000A00000003B4000-000000067F000040020000A00000003B8000__00000023DF7FF060 000000067F000040020000A00000003B4000-000000067F000040020000A00000003B8000__00000038E9AF7F00 000000067F000040020000A00000003B4000-000000067F000040020000A00000003B8000__0000003903F1CFE8 000000067F000040020000A00000003B4000-000000067F000040020000A00000003B8000__0000003B99F7F8A0 000000067F000040020000A00000003B4000-000000067F000040020000A00000003B8000__0000005D2FFFFB38 000000067F000040020000A00000003B5396-000000067F000040020000A00000003BDD5F__000000230F09F3F1-00000023AEB9F2B9 000000067F000040020000A00000003B8000-000000067F000040020000A00000003BC000__00000023DF7FF060 000000067F000040020000A00000003B8000-000000067F000040020000A00000003BC000__00000038E9AF7F00 000000067F000040020000A00000003B8000-000000067F000040020000A00000003BC000__0000003903F1CFE8 000000067F000040020000A00000003B8000-000000067F000040020000A00000003BC000__0000003B99F7F8A0 000000067F000040020000A00000003B8000-000000067F000040020000A00000003BC000__0000005D2FFFFB38 000000067F000040020000A00000003BC000-000000067F000040020000A00000003C0000__00000023DF7FF060 000000067F000040020000A00000003BC000-000000067F000040020000A00000003C0000__00000038E9AF7F00 000000067F000040020000A00000003BC000-000000067F000040020000A00000003C0000__0000003903F1CFE8 000000067F000040020000A00000003BC000-000000067F000040020000A00000003C0000__0000003B99F7F8A0 000000067F000040020000A00000003BC000-000000067F000040020000A00000003C0000__0000005D2FFFFB38 
000000067F000040020000A00000003BDD5F-000000067F000040020000A00000003C6752__000000230F09F3F1-00000023AEB9F2B9 000000067F000040020000A00000003C0000-000000067F000040020000A00000003C4000__00000023DF7FF060 000000067F000040020000A00000003C0000-000000067F000040020000A00000003C4000__00000038E9AF7F00 000000067F000040020000A00000003C0000-000000067F000040020000A00000003C4000__0000003903F1CFE8 000000067F000040020000A00000003C0000-000000067F000040020000A00000003C4000__0000003B99F7F8A0 000000067F000040020000A00000003C0000-000000067F000040020000A00000003C4000__0000005D2FFFFB38 000000067F000040020000A00000003C4000-000000067F000040020000A00000003C8000__00000023DF7FF060 000000067F000040020000A00000003C4000-000000067F000040020000A00000003C8000__00000038E9AF7F00 000000067F000040020000A00000003C4000-000000067F000040020000A00000003C8000__0000003903F1CFE8 000000067F000040020000A00000003C4000-000000067F000040020000A00000003C8000__0000003B99F7F8A0 000000067F000040020000A00000003C4000-000000067F000040020000A00000003C8000__0000005D2FFFFB38 000000067F000040020000A00000003C6752-000000067F000040020000A00000003CF144__000000230F09F3F1-00000023AEB9F2B9 000000067F000040020000A00000003C8000-000000067F000040020000A00000003CC000__00000023DF7FF060 000000067F000040020000A00000003C8000-000000067F000040020000A00000003CC000__00000038E9AF7F00 000000067F000040020000A00000003C8000-000000067F000040020000A00000003CC000__0000003903F1CFE8 000000067F000040020000A00000003C8000-000000067F000040020000A00000003CC000__0000003B99F7F8A0 000000067F000040020000A00000003C8000-000000067F000040020000A00000003CC000__0000005D2FFFFB38 000000067F000040020000A00000003CC000-000000067F000040020000A00000003D0000__00000023DF7FF060 000000067F000040020000A00000003CC000-000000067F000040020000A00000003D0000__00000038E9AF7F00 000000067F000040020000A00000003CC000-000000067F000040020000A00000003D0000__0000003903F1CFE8 000000067F000040020000A00000003CC000-000000067F000040020000A00000003D0000__0000003B99F7F8A0 
000000067F000040020000A00000003CC000-000000067F000040020000A00000003D0000__0000005D2FFFFB38 000000067F000040020000A00000003CF144-000000067F000040020000A00000003D7B34__000000230F09F3F1-00000023AEB9F2B9 000000067F000040020000A00000003D0000-000000067F000040020000A00000003D4000__00000023DF7FF060 000000067F000040020000A00000003D0000-000000067F000040020000A00000003D4000__00000038E9AF7F00 000000067F000040020000A00000003D0000-000000067F000040020000A00000003D4000__0000003903F1CFE8 000000067F000040020000A00000003D0000-000000067F000040020000A00000003D4000__0000003B99F7F8A0 000000067F000040020000A00000003D0000-000000067F000040020000A00000003D4000__0000005D2FFFFB38 000000067F000040020000A00000003D4000-000000067F000040020000A00000003D8000__00000023DF7FF060 000000067F000040020000A00000003D4000-000000067F000040020000A00000003D8000__00000038E9AF7F00 000000067F000040020000A00000003D4000-000000067F000040020000A00000003D8000__0000003903F1CFE8 000000067F000040020000A00000003D4000-000000067F000040020000A00000003D8000__0000003B99F7F8A0 000000067F000040020000A00000003D4000-000000067F000040020000A00000003D8000__0000005D2FFFFB38 000000067F000040020000A00000003D7B34-000000067F000040020000A00000003E0508__000000230F09F3F1-00000023AEB9F2B9 000000067F000040020000A00000003D8000-000000067F000040020000A00000003DC000__00000023DF7FF060 000000067F000040020000A00000003D8000-000000067F000040020000A00000003DC000__00000038E9AF7F00 000000067F000040020000A00000003D8000-000000067F000040020000A00000003DC000__0000003903F1CFE8 000000067F000040020000A00000003D8000-000000067F000040020000A00000003DC000__0000003B99F7F8A0 000000067F000040020000A00000003D8000-000000067F000040020000A00000003DC000__0000005D2FFFFB38 000000067F000040020000A00000003DC000-000000067F000040020000A00000003E0000__00000023DF7FF060 000000067F000040020000A00000003DC000-000000067F000040020000A00000003E0000__00000038E9AF7F00 000000067F000040020000A00000003DC000-000000067F000040020000A00000003E0000__0000003903F1CFE8 
000000067F000040020000A00000003DC000-000000067F000040020000A00000003E0000__0000003B99F7F8A0 000000067F000040020000A00000003DC000-000000067F000040020000A00000003E0000__0000005D2FFFFB38 000000067F000040020000A00000003E0000-000000067F000040020000A00000003E4000__00000023DF7FF060 000000067F000040020000A00000003E0000-000000067F000040020000A00000003E4000__00000038E9AF7F00 000000067F000040020000A00000003E0000-000000067F000040020000A00000003E4000__0000003903F1CFE8 000000067F000040020000A00000003E0000-000000067F000040020000A00000003E4000__0000003B99F7F8A0 000000067F000040020000A00000003E0000-000000067F000040020000A00000003E4000__0000005D2FFFFB38 000000067F000040020000A00000003E0508-000000067F000040020000A00000003E8EC9__000000230F09F3F1-00000023AEB9F2B9 000000067F000040020000A00000003E4000-000000067F000040020000A00000003E8000__00000023DF7FF060 000000067F000040020000A00000003E4000-000000067F000040020000A00000003E8000__00000038E9AF7F00 000000067F000040020000A00000003E4000-000000067F000040020000A00000003E8000__0000003903F1CFE8 000000067F000040020000A00000003E4000-000000067F000040020000A00000003E8000__0000003B99F7F8A0 000000067F000040020000A00000003E4000-000000067F000040020000A00000003E8000__0000005D2FFFFB38 000000067F000040020000A00000003E8000-000000067F000040020000A00000003EC000__00000023DF7FF060 000000067F000040020000A00000003E8000-000000067F000040020000A00000003EC000__00000038E67ABFA0 000000067F000040020000A00000003E8000-000000067F000040020000A00000003EC000__0000003903F1CFE8 000000067F000040020000A00000003E8000-000000067F000040020000A00000003EC000__0000003B99F7F8A0 000000067F000040020000A00000003E8000-000000067F000040020000A00000003EC000__0000005D2FFFFB38 000000067F000040020000A00000003E8EC9-000000067F000040020000A0000200000000__000000230F09F3F1-00000023AEB9F2B9 000000067F000040020000A00000003E9093-000000067F000040020000A00000003F1A44__00000023AEB9F2B9-000000244E69F8E9 000000067F000040020000A00000003EC000-000000067F000040020000A00000003F0000__00000023DF7FF060 
000000067F000040020000A00000003EC000-000000067F000040020000A00000003F0000__00000038E67ABFA0 000000067F000040020000A00000003EC000-000000067F000040020000A00000003F0000__0000003903F1CFE8 000000067F000040020000A00000003EC000-000000067F000040020000A00000003F0000__0000003B99F7F8A0 000000067F000040020000A00000003EC000-000000067F000040020000A00000003F0000__0000005D2FFFFB38 000000067F000040020000A00000003F0000-000000067F000040020000A00000003F4000__00000023DF7FF060 000000067F000040020000A00000003F0000-000000067F000040020000A00000003F4000__00000038E67ABFA0 000000067F000040020000A00000003F0000-000000067F000040020000A00000003F4000__0000003903F1CFE8 000000067F000040020000A00000003F0000-000000067F000040020000A00000003F4000__0000003B99F7F8A0 000000067F000040020000A00000003F0000-000000067F000040020000A00000003F4000__0000005D2FFFFB38 000000067F000040020000A00000003F1A44-000000067F000040020000A00000003FA41E__00000023AEB9F2B9-000000244E69F8E9 000000067F000040020000A00000003F4000-000000067F000040020000A00000003F8000__00000023DF7FF060 000000067F000040020000A00000003F4000-000000067F000040020000A00000003F8000__00000038E67ABFA0 000000067F000040020000A00000003F4000-000000067F000040020000A00000003F8000__0000003903F1CFE8 000000067F000040020000A00000003F4000-000000067F000040020000A00000003F8000__0000003B99F7F8A0 000000067F000040020000A00000003F4000-000000067F000040020000A00000003F8000__0000005D2FFFFB38 000000067F000040020000A00000003F8000-000000067F000040020000A00000003FC000__00000023DF7FF060 000000067F000040020000A00000003F8000-000000067F000040020000A00000003FC000__00000038E67ABFA0 000000067F000040020000A00000003F8000-000000067F000040020000A00000003FC000__0000003903F1CFE8 000000067F000040020000A00000003F8000-000000067F000040020000A00000003FC000__0000003B99F7F8A0 000000067F000040020000A00000003F8000-000000067F000040020000A00000003FC000__0000005D2FFFFB38 000000067F000040020000A00000003FA41E-000000067F000040020000A0000000402E14__00000023AEB9F2B9-000000244E69F8E9 
000000067F000040020000A00000003FC000-000000067F000040020000A0000000400000__00000023DF7FF060 000000067F000040020000A00000003FC000-000000067F000040020000A0000000400000__00000038E67ABFA0 000000067F000040020000A00000003FC000-000000067F000040020000A0000000400000__0000003903F1CFE8 000000067F000040020000A00000003FC000-000000067F000040020000A0000000400000__0000003B99F7F8A0 000000067F000040020000A00000003FC000-000000067F000040020000A0000000400000__0000005D2FFFFB38 000000067F000040020000A0000000400000-000000067F000040020000A0000000404000__00000038E67ABFA0 000000067F000040020000A0000000400000-000000067F000040020000A0000000404000__0000003903F1CFE8 000000067F000040020000A0000000400000-000000067F000040020000A0000000404000__0000003B99F7F8A0 000000067F000040020000A0000000400000-000000067F000040020000A0000000404000__0000005D2FFFFB38 000000067F000040020000A0000000400000-030000000000000000000000000000000002__00000023DF7FF060 000000067F000040020000A0000000402E14-000000067F000040020000A000000040B7FC__00000023AEB9F2B9-000000244E69F8E9 000000067F000040020000A0000000404000-000000067F000040020000A0000000408000__00000038E67ABFA0 000000067F000040020000A0000000404000-000000067F000040020000A0000000408000__0000003903F1CFE8 000000067F000040020000A0000000404000-000000067F000040020000A0000000408000__0000003B99F7F8A0 000000067F000040020000A0000000404000-000000067F000040020000A0000000408000__0000005D2FFFFB38 000000067F000040020000A0000000408000-000000067F000040020000A000000040C000__00000038E67ABFA0 000000067F000040020000A0000000408000-000000067F000040020000A000000040C000__0000003903F1CFE8 000000067F000040020000A0000000408000-000000067F000040020000A000000040C000__0000003B99F7F8A0 000000067F000040020000A0000000408000-000000067F000040020000A000000040C000__0000005D2FFFFB38 000000067F000040020000A000000040B7FC-000000067F000040020000A00000004141F2__00000023AEB9F2B9-000000244E69F8E9 000000067F000040020000A000000040C000-000000067F000040020000A0000000410000__00000038E67ABFA0 
000000067F000040020000A000000040C000-000000067F000040020000A0000000410000__0000003903F1CFE8 000000067F000040020000A000000040C000-000000067F000040020000A0000000410000__0000003B99F7F8A0 000000067F000040020000A000000040C000-000000067F000040020000A0000000410000__0000005D2FFFFB38 000000067F000040020000A0000000410000-000000067F000040020000A0000000414000__00000038E67ABFA0 000000067F000040020000A0000000410000-000000067F000040020000A0000000414000__0000003903F1CFE8 000000067F000040020000A0000000410000-000000067F000040020000A0000000414000__0000003B99F7F8A0 000000067F000040020000A0000000410000-000000067F000040020000A0000000414000__0000005D2FFFFB38 000000067F000040020000A0000000414000-000000067F000040020000A0000000418000__00000038E67ABFA0 000000067F000040020000A0000000414000-000000067F000040020000A0000000418000__0000003903F1CFE8 000000067F000040020000A0000000414000-000000067F000040020000A0000000418000__0000003B99F7F8A0 000000067F000040020000A0000000414000-000000067F000040020000A0000000418000__0000005D2FFFFB38 000000067F000040020000A00000004141F2-000000067F000040020000A000000041CBDA__00000023AEB9F2B9-000000244E69F8E9 000000067F000040020000A0000000418000-000000067F000040020000A000000041C000__00000038E67ABFA0 000000067F000040020000A0000000418000-000000067F000040020000A000000041C000__0000003903F1CFE8 000000067F000040020000A0000000418000-000000067F000040020000A000000041C000__0000003B99F7F8A0 000000067F000040020000A0000000418000-000000067F000040020000A000000041C000__0000005D2FFFFB38 000000067F000040020000A000000041C000-000000067F000040020000A0000000420000__00000038E67ABFA0 000000067F000040020000A000000041C000-000000067F000040020000A0000000420000__0000003903F1CFE8 000000067F000040020000A000000041C000-000000067F000040020000A0000000420000__0000003B99F7F8A0 000000067F000040020000A000000041C000-000000067F000040020000A0000000420000__0000005D2FFFFB38 000000067F000040020000A000000041CBDA-000000067F000040020000A00000004255AE__00000023AEB9F2B9-000000244E69F8E9 
000000067F000040020000A0000000420000-000000067F000040020000A0000000424000__00000038E67ABFA0 000000067F000040020000A0000000420000-000000067F000040020000A0000000424000__0000003903F1CFE8 000000067F000040020000A0000000420000-000000067F000040020000A0000000424000__0000003B99F7F8A0 000000067F000040020000A0000000420000-000000067F000040020000A0000000424000__0000005D2FFFFB38 000000067F000040020000A0000000424000-000000067F000040020000A0000000428000__00000038E67ABFA0 000000067F000040020000A0000000424000-000000067F000040020000A0000000428000__0000003903F1CFE8 000000067F000040020000A0000000424000-000000067F000040020000A0000000428000__0000003B99F7F8A0 000000067F000040020000A0000000424000-000000067F000040020000A0000000428000__0000005D2FFFFB38 000000067F000040020000A00000004255AE-000000067F000040020000A000000042DF69__00000023AEB9F2B9-000000244E69F8E9 000000067F000040020000A0000000428000-000000067F000040020000A000000042C000__00000038E67ABFA0 000000067F000040020000A0000000428000-000000067F000040020000A000000042C000__0000003903F1CFE8 000000067F000040020000A0000000428000-000000067F000040020000A000000042C000__0000003B99F7F8A0 000000067F000040020000A0000000428000-000000067F000040020000A000000042C000__0000005D2FFFFB38 000000067F000040020000A000000042C000-000000067F000040020000A0000000430000__00000038E67ABFA0 000000067F000040020000A000000042C000-000000067F000040020000A0000000430000__0000003903F1CFE8 000000067F000040020000A000000042C000-000000067F000040020000A0000000430000__0000003B99F7F8A0 000000067F000040020000A000000042C000-000000067F000040020000A0000000430000__0000005D2FFFFB38 000000067F000040020000A000000042DF69-000000067F000040020000A0000000436935__00000023AEB9F2B9-000000244E69F8E9 000000067F000040020000A0000000430000-000000067F000040020000A0000000434000__00000038E67ABFA0 000000067F000040020000A0000000430000-000000067F000040020000A0000000434000__0000003903F1CFE8 000000067F000040020000A0000000430000-000000067F000040020000A0000000434000__0000003B99F7F8A0 
000000067F000040020000A0000000430000-000000067F000040020000A0000000434000__0000005D2FFFFB38 000000067F000040020000A0000000434000-000000067F000040020000A0000000438000__00000038E67ABFA0 000000067F000040020000A0000000434000-000000067F000040020000A0000000438000__0000003903F1CFE8 000000067F000040020000A0000000434000-000000067F000040020000A0000000438000__0000003B99F7F8A0 000000067F000040020000A0000000434000-000000067F000040020000A0000000438000__0000005D2FFFFB38 000000067F000040020000A0000000436935-000000067F000040020000A000000043F31C__00000023AEB9F2B9-000000244E69F8E9 000000067F000040020000A0000000438000-000000067F000040020000A000000043C000__00000038E67ABFA0 000000067F000040020000A0000000438000-000000067F000040020000A000000043C000__0000003903F1CFE8 000000067F000040020000A0000000438000-000000067F000040020000A000000043C000__0000003B99F7F8A0 000000067F000040020000A0000000438000-000000067F000040020000A000000043C000__0000005D2FFFFB38 000000067F000040020000A000000043C000-000000067F000040020000A0000000440000__00000038E1ABFE28 000000067F000040020000A000000043C000-000000067F000040020000A0000000440000__00000038E9AF7F00 000000067F000040020000A000000043C000-000000067F000040020000A0000000440000__0000003903F1CFE8 000000067F000040020000A000000043C000-000000067F000040020000A0000000440000__0000003B99F7F8A0 000000067F000040020000A000000043C000-000000067F000040020000A0000000440000__0000005D2FFFFB38 000000067F000040020000A000000043F31C-000000067F000040020000A0000200000000__00000023AEB9F2B9-000000244E69F8E9 000000067F000040020000A000000043F581-000000067F000040020000A0000000447F7E__000000244E69F8E9-00000024EE19EEF9 000000067F000040020000A0000000440000-000000067F000040020000A0000000444000__00000038E1ABFE28 000000067F000040020000A0000000440000-000000067F000040020000A0000000444000__00000038E9AF7F00 000000067F000040020000A0000000440000-000000067F000040020000A0000000444000__0000003903F1CFE8 000000067F000040020000A0000000440000-000000067F000040020000A0000000444000__0000003B99F7F8A0 
000000067F000040020000A0000000440000-000000067F000040020000A0000000444000__0000005D2FFFFB38 000000067F000040020000A0000000444000-000000067F000040020000A0000000448000__00000038E1ABFE28 000000067F000040020000A0000000444000-000000067F000040020000A0000000448000__00000038E9AF7F00 000000067F000040020000A0000000444000-000000067F000040020000A0000000448000__0000003903F1CFE8 000000067F000040020000A0000000444000-000000067F000040020000A0000000448000__0000003B99F7F8A0 000000067F000040020000A0000000444000-000000067F000040020000A0000000448000__0000005D2FFFFB38 000000067F000040020000A0000000447F7E-000000067F000040020000A000000045096D__000000244E69F8E9-00000024EE19EEF9 000000067F000040020000A0000000448000-000000067F000040020000A000000044C000__00000038E1ABFE28 000000067F000040020000A0000000448000-000000067F000040020000A000000044C000__00000038E9AF7F00 000000067F000040020000A0000000448000-000000067F000040020000A000000044C000__0000003903F1CFE8 000000067F000040020000A0000000448000-000000067F000040020000A000000044C000__0000003B99F7F8A0 000000067F000040020000A0000000448000-000000067F000040020000A000000044C000__0000005D2FFFFB38 000000067F000040020000A000000044C000-000000067F000040020000A0000000450000__00000038E1ABFE28 000000067F000040020000A000000044C000-000000067F000040020000A0000000450000__00000038E9AF7F00 000000067F000040020000A000000044C000-000000067F000040020000A0000000450000__0000003903F1CFE8 000000067F000040020000A000000044C000-000000067F000040020000A0000000450000__0000003B99F7F8A0 000000067F000040020000A000000044C000-000000067F000040020000A0000000450000__0000005D2FFFFB38 000000067F000040020000A0000000450000-000000067F000040020000A0000000454000__00000038E1ABFE28 000000067F000040020000A0000000450000-000000067F000040020000A0000000454000__00000038E9AF7F00 000000067F000040020000A0000000450000-000000067F000040020000A0000000454000__0000003903F1CFE8 000000067F000040020000A0000000450000-000000067F000040020000A0000000454000__0000003B99F7F8A0 
000000067F000040020000A0000000450000-000000067F000040020000A0000000454000__0000005D2FFFFB38 000000067F000040020000A000000045096D-000000067F000040020000A000000045934B__000000244E69F8E9-00000024EE19EEF9 000000067F000040020000A0000000454000-000000067F000040020000A0000000458000__00000038E1ABFE28 000000067F000040020000A0000000454000-000000067F000040020000A0000000458000__00000038E9AF7F00 000000067F000040020000A0000000454000-000000067F000040020000A0000000458000__0000003903F1CFE8 000000067F000040020000A0000000454000-000000067F000040020000A0000000458000__0000003B99F7F8A0 000000067F000040020000A0000000454000-000000067F000040020000A0000000458000__0000005D2FFFFB38 000000067F000040020000A0000000458000-000000067F000040020000A000000045C000__00000038E1ABFE28 000000067F000040020000A0000000458000-000000067F000040020000A000000045C000__00000038E9AF7F00 000000067F000040020000A0000000458000-000000067F000040020000A000000045C000__0000003903F1CFE8 000000067F000040020000A0000000458000-000000067F000040020000A000000045C000__0000003B99F7F8A0 000000067F000040020000A0000000458000-000000067F000040020000A000000045C000__0000005D2FFFFB38 000000067F000040020000A000000045934B-000000067F000040020000A0000000461D13__000000244E69F8E9-00000024EE19EEF9 000000067F000040020000A000000045C000-000000067F000040020000A0000000460000__00000038E1ABFE28 000000067F000040020000A000000045C000-000000067F000040020000A0000000460000__00000038E9AF7F00 000000067F000040020000A000000045C000-000000067F000040020000A0000000460000__0000003903F1CFE8 000000067F000040020000A000000045C000-000000067F000040020000A0000000460000__0000003B99F7F8A0 000000067F000040020000A000000045C000-000000067F000040020000A0000000460000__0000005D2FFFFB38 000000067F000040020000A0000000460000-000000067F000040020000A0000000464000__00000038E1ABFE28 000000067F000040020000A0000000460000-000000067F000040020000A0000000464000__00000038E9AF7F00 000000067F000040020000A0000000460000-000000067F000040020000A0000000464000__0000003903F1CFE8 
000000067F000040020000A0000000460000-000000067F000040020000A0000000464000__0000003B99F7F8A0 000000067F000040020000A0000000460000-000000067F000040020000A0000000464000__0000005D2FFFFB38 000000067F000040020000A0000000461D13-000000067F000040020000A000000046A6C6__000000244E69F8E9-00000024EE19EEF9 000000067F000040020000A0000000464000-000000067F000040020000A0000000468000__00000038E1ABFE28 000000067F000040020000A0000000464000-000000067F000040020000A0000000468000__00000038E9AF7F00 000000067F000040020000A0000000464000-000000067F000040020000A0000000468000__0000003903F1CFE8 000000067F000040020000A0000000464000-000000067F000040020000A0000000468000__0000003B99F7F8A0 000000067F000040020000A0000000464000-000000067F000040020000A0000000468000__0000005D2FFFFB38 000000067F000040020000A0000000468000-000000067F000040020000A000000046C000__00000038E1ABFE28 000000067F000040020000A0000000468000-000000067F000040020000A000000046C000__00000038E9AF7F00 000000067F000040020000A0000000468000-000000067F000040020000A000000046C000__0000003903F1CFE8 000000067F000040020000A0000000468000-000000067F000040020000A000000046C000__0000003B99F7F8A0 000000067F000040020000A0000000468000-000000067F000040020000A000000046C000__0000005D2FFFFB38 000000067F000040020000A000000046A6C6-000000067F000040020000A00000004730A6__000000244E69F8E9-00000024EE19EEF9 000000067F000040020000A000000046C000-000000067F000040020000A0000000470000__00000038E1ABFE28 000000067F000040020000A000000046C000-000000067F000040020000A0000000470000__00000038E9AF7F00 000000067F000040020000A000000046C000-000000067F000040020000A0000000470000__0000003903F1CFE8 000000067F000040020000A000000046C000-000000067F000040020000A0000000470000__0000003B99F7F8A0 000000067F000040020000A000000046C000-000000067F000040020000A0000000470000__0000005D2FFFFB38 000000067F000040020000A0000000470000-000000067F000040020000A0000000474000__00000038E1ABFE28 000000067F000040020000A0000000470000-000000067F000040020000A0000000474000__00000038E9AF7F00 
000000067F000040020000A0000000470000-000000067F000040020000A0000000474000__0000003903F1CFE8 000000067F000040020000A0000000470000-000000067F000040020000A0000000474000__0000003B99F7F8A0 000000067F000040020000A0000000470000-000000067F000040020000A0000000474000__0000005D2FFFFB38 000000067F000040020000A00000004730A6-000000067F000040020000A000000047BA93__000000244E69F8E9-00000024EE19EEF9 000000067F000040020000A0000000474000-000000067F000040020000A0000000478000__00000038E1ABFE28 000000067F000040020000A0000000474000-000000067F000040020000A0000000478000__00000038E9AF7F00 000000067F000040020000A0000000474000-000000067F000040020000A0000000478000__0000003903F1CFE8 000000067F000040020000A0000000474000-000000067F000040020000A0000000478000__0000003B99F7F8A0 000000067F000040020000A0000000474000-000000067F000040020000A0000000478000__0000005D2FFFFB38 000000067F000040020000A0000000478000-000000067F000040020000A000000047C000__00000038E1ABFE28 000000067F000040020000A0000000478000-000000067F000040020000A000000047C000__00000038E9AF7F00 000000067F000040020000A0000000478000-000000067F000040020000A000000047C000__0000003903F1CFE8 000000067F000040020000A0000000478000-000000067F000040020000A000000047C000__0000003B99F7F8A0 000000067F000040020000A0000000478000-000000067F000040020000A000000047C000__0000005D2FFFFB38 000000067F000040020000A000000047BA93-000000067F000040020000A0000000484484__000000244E69F8E9-00000024EE19EEF9 000000067F000040020000A000000047C000-000000067F000040020000A0000000480000__00000038E1ABFE28 000000067F000040020000A000000047C000-000000067F000040020000A0000000480000__00000038E9AF7F00 000000067F000040020000A000000047C000-000000067F000040020000A0000000480000__0000003903F1CFE8 000000067F000040020000A000000047C000-000000067F000040020000A0000000480000__0000003B99F7F8A0 000000067F000040020000A000000047C000-000000067F000040020000A0000000480000__0000005D2FFFFB38 000000067F000040020000A0000000480000-000000067F000040020000A0000000484000__00000038E1ABFE28 
000000067F000040020000A0000000480000-000000067F000040020000A0000000484000__00000038E9AF7F00 000000067F000040020000A0000000480000-000000067F000040020000A0000000484000__0000003903F1CFE8 000000067F000040020000A0000000480000-000000067F000040020000A0000000484000__0000003B99F7F8A0 000000067F000040020000A0000000480000-000000067F000040020000A0000000484000__0000005D2FFFFB38 000000067F000040020000A0000000484000-000000067F000040020000A0000000488000__00000038E1ABFE28 000000067F000040020000A0000000484000-000000067F000040020000A0000000488000__00000038E9AF7F00 000000067F000040020000A0000000484000-000000067F000040020000A0000000488000__0000003903F1CFE8 000000067F000040020000A0000000484000-000000067F000040020000A0000000488000__0000003B99F7F8A0 000000067F000040020000A0000000484000-000000067F000040020000A0000000488000__0000005D2FFFFB38 000000067F000040020000A0000000484484-000000067F000040020000A000000048CE6F__000000244E69F8E9-00000024EE19EEF9 000000067F000040020000A0000000488000-000000067F000040020000A000000048C000__00000038E1ABFE28 000000067F000040020000A0000000488000-000000067F000040020000A000000048C000__00000038E9AF7F00 000000067F000040020000A0000000488000-000000067F000040020000A000000048C000__0000003903F1CFE8 000000067F000040020000A0000000488000-000000067F000040020000A000000048C000__0000003B99F7F8A0 000000067F000040020000A0000000488000-000000067F000040020000A000000048C000__0000005D2FFFFB38 000000067F000040020000A000000048C000-000000067F000040020000A0000000490000__00000038E1ABFE28 000000067F000040020000A000000048C000-000000067F000040020000A0000000490000__00000038E9AF7F00 000000067F000040020000A000000048C000-000000067F000040020000A0000000490000__0000003903F1CFE8 000000067F000040020000A000000048C000-000000067F000040020000A0000000490000__0000003B99F7F8A0 000000067F000040020000A000000048C000-000000067F000040020000A0000000490000__0000005D2FFFFB38 000000067F000040020000A000000048CE6F-000000067F000040020000A0000000495855__000000244E69F8E9-00000024EE19EEF9 
000000067F000040020000A0000000490000-000000067F000040020000A0000000494000__00000038E1ABFE28 000000067F000040020000A0000000490000-000000067F000040020000A0000000494000__00000038E9AF7F00 000000067F000040020000A0000000490000-000000067F000040020000A0000000494000__0000003903F1CFE8 000000067F000040020000A0000000490000-000000067F000040020000A0000000494000__0000003B99F7F8A0 000000067F000040020000A0000000490000-000000067F000040020000A0000000494000__0000005D2FFFFB38 000000067F000040020000A0000000494000-000000067F000040020000A0000000498000__00000025CABFE5B8 000000067F000040020000A0000000494000-000000067F000040020000A0000000498000__00000038E9AF7F00 000000067F000040020000A0000000494000-000000067F000040020000A0000000498000__0000003903F1CFE8 000000067F000040020000A0000000494000-000000067F000040020000A0000000498000__0000003B99F7F8A0 000000067F000040020000A0000000494000-000000067F000040020000A0000000498000__0000005D2FFFFB38 000000067F000040020000A0000000495855-000000067F000040020000A0000200000000__000000244E69F8E9-00000024EE19EEF9 000000067F000040020000A00000004959F7-000000067F000040020000A000000049E3C6__00000024EE19EEF9-000000259DC1F899 000000067F000040020000A0000000498000-000000067F000040020000A000000049C000__00000025CABFE5B8 000000067F000040020000A0000000498000-000000067F000040020000A000000049C000__00000038E9AF7F00 000000067F000040020000A0000000498000-000000067F000040020000A000000049C000__0000003903F1CFE8 000000067F000040020000A0000000498000-000000067F000040020000A000000049C000__0000003B99F7F8A0 000000067F000040020000A0000000498000-000000067F000040020000A000000049C000__0000005D2FFFFB38 000000067F000040020000A000000049C000-000000067F000040020000A00000004A0000__00000025CABFE5B8 000000067F000040020000A000000049C000-000000067F000040020000A00000004A0000__00000038E9AF7F00 000000067F000040020000A000000049C000-000000067F000040020000A00000004A0000__0000003903F1CFE8 000000067F000040020000A000000049C000-000000067F000040020000A00000004A0000__0000003B99F7F8A0 
000000067F000040020000A000000049C000-000000067F000040020000A00000004A0000__0000005D2FFFFB38 000000067F000040020000A000000049E3C6-000000067F000040020000A00000004A6D8C__00000024EE19EEF9-000000259DC1F899 000000067F000040020000A00000004A0000-000000067F000040020000A00000004A4000__00000025CABFE5B8 000000067F000040020000A00000004A0000-000000067F000040020000A00000004A4000__00000038E9AF7F00 000000067F000040020000A00000004A0000-000000067F000040020000A00000004A4000__0000003903F1CFE8 000000067F000040020000A00000004A0000-000000067F000040020000A00000004A4000__0000003B99F7F8A0 000000067F000040020000A00000004A0000-000000067F000040020000A00000004A4000__0000005D2FFFFB38 000000067F000040020000A00000004A4000-000000067F000040020000A00000004A8000__00000025CABFE5B8 000000067F000040020000A00000004A4000-000000067F000040020000A00000004A8000__00000038E9AF7F00 000000067F000040020000A00000004A4000-000000067F000040020000A00000004A8000__0000003903F1CFE8 000000067F000040020000A00000004A4000-000000067F000040020000A00000004A8000__0000003B99F7F8A0 000000067F000040020000A00000004A4000-000000067F000040020000A00000004A8000__0000005D2FFFFB38 000000067F000040020000A00000004A6D8C-000000067F000040020000A00000004AF769__00000024EE19EEF9-000000259DC1F899 000000067F000040020000A00000004A8000-000000067F000040020000A00000004AC000__00000025CABFE5B8 000000067F000040020000A00000004A8000-000000067F000040020000A00000004AC000__00000038E9AF7F00 000000067F000040020000A00000004A8000-000000067F000040020000A00000004AC000__0000003903F1CFE8 000000067F000040020000A00000004A8000-000000067F000040020000A00000004AC000__0000003B99F7F8A0 000000067F000040020000A00000004A8000-000000067F000040020000A00000004AC000__0000005D2FFFFB38 000000067F000040020000A00000004AC000-000000067F000040020000A00000004B0000__00000025CABFE5B8 000000067F000040020000A00000004AC000-000000067F000040020000A00000004B0000__00000038E9AF7F00 000000067F000040020000A00000004AC000-000000067F000040020000A00000004B0000__0000003903F1CFE8 
000000067F000040020000A00000004AC000-000000067F000040020000A00000004B0000__0000003B99F7F8A0 000000067F000040020000A00000004AC000-000000067F000040020000A00000004B0000__0000005D2FFFFB38 000000067F000040020000A00000004AF769-000000067F000040020000A00000004B8152__00000024EE19EEF9-000000259DC1F899 000000067F000040020000A00000004B0000-000000067F000040020000A00000004B4000__00000025CABFE5B8 000000067F000040020000A00000004B0000-000000067F000040020000A00000004B4000__00000038E9AF7F00 000000067F000040020000A00000004B0000-000000067F000040020000A00000004B4000__0000003903F1CFE8 000000067F000040020000A00000004B0000-000000067F000040020000A00000004B4000__0000003B99F7F8A0 000000067F000040020000A00000004B0000-000000067F000040020000A00000004B4000__0000005D2FFFFB38 000000067F000040020000A00000004B4000-000000067F000040020000A00000004B8000__00000025CABFE5B8 000000067F000040020000A00000004B4000-000000067F000040020000A00000004B8000__00000038E9AF7F00 000000067F000040020000A00000004B4000-000000067F000040020000A00000004B8000__0000003903F1CFE8 000000067F000040020000A00000004B4000-000000067F000040020000A00000004B8000__0000003B99F7F8A0 000000067F000040020000A00000004B4000-000000067F000040020000A00000004B8000__0000005D2FFFFB38 000000067F000040020000A00000004B8000-000000067F000040020000A00000004BC000__00000025CABFE5B8 000000067F000040020000A00000004B8000-000000067F000040020000A00000004BC000__00000038E9AF7F00 000000067F000040020000A00000004B8000-000000067F000040020000A00000004BC000__0000003903F1CFE8 000000067F000040020000A00000004B8000-000000067F000040020000A00000004BC000__0000003B99F7F8A0 000000067F000040020000A00000004B8000-000000067F000040020000A00000004BC000__0000005D2FFFFB38 000000067F000040020000A00000004B8152-000000067F000040020000A00000004C0B3C__00000024EE19EEF9-000000259DC1F899 000000067F000040020000A00000004BC000-000000067F000040020000A00000004C0000__00000025CABFE5B8 000000067F000040020000A00000004BC000-000000067F000040020000A00000004C0000__00000038E9AF7F00 
000000067F000040020000A00000004BC000-000000067F000040020000A00000004C0000__0000003903F1CFE8 000000067F000040020000A00000004BC000-000000067F000040020000A00000004C0000__0000003B99F7F8A0 000000067F000040020000A00000004BC000-000000067F000040020000A00000004C0000__0000005D2FFFFB38 000000067F000040020000A00000004C0000-000000067F000040020000A00000004C4000__00000025CABFE5B8 000000067F000040020000A00000004C0000-000000067F000040020000A00000004C4000__00000038E9AF7F00 000000067F000040020000A00000004C0000-000000067F000040020000A00000004C4000__0000003903F1CFE8 000000067F000040020000A00000004C0000-000000067F000040020000A00000004C4000__0000003B99F7F8A0 000000067F000040020000A00000004C0000-000000067F000040020000A00000004C4000__0000005D2FFFFB38 000000067F000040020000A00000004C0B3C-000000067F000040020000A00000004C9523__00000024EE19EEF9-000000259DC1F899 000000067F000040020000A00000004C4000-000000067F000040020000A00000004C8000__00000025CABFE5B8 000000067F000040020000A00000004C4000-000000067F000040020000A00000004C8000__00000038E9AF7F00 000000067F000040020000A00000004C4000-000000067F000040020000A00000004C8000__0000003903F1CFE8 000000067F000040020000A00000004C4000-000000067F000040020000A00000004C8000__0000003B99F7F8A0 000000067F000040020000A00000004C4000-000000067F000040020000A00000004C8000__0000005D2FFFFB38 000000067F000040020000A00000004C8000-000000067F000040020000A00000004CC000__00000025CABFE5B8 000000067F000040020000A00000004C8000-000000067F000040020000A00000004CC000__00000038E9AF7F00 000000067F000040020000A00000004C8000-000000067F000040020000A00000004CC000__0000003903F1CFE8 000000067F000040020000A00000004C8000-000000067F000040020000A00000004CC000__0000003B99F7F8A0 000000067F000040020000A00000004C8000-000000067F000040020000A00000004CC000__0000005D2FFFFB38 000000067F000040020000A00000004C9523-000000067F000040020000A00000004D1F01__00000024EE19EEF9-000000259DC1F899 000000067F000040020000A00000004CC000-000000067F000040020000A00000004D0000__00000025CABFE5B8 
000000067F000040020000A00000004CC000-000000067F000040020000A00000004D0000__00000038E9AF7F00 000000067F000040020000A00000004CC000-000000067F000040020000A00000004D0000__0000003903F1CFE8 000000067F000040020000A00000004CC000-000000067F000040020000A00000004D0000__0000003B99F7F8A0 000000067F000040020000A00000004CC000-000000067F000040020000A00000004D0000__0000005D2FFFFB38 000000067F000040020000A00000004D0000-000000067F000040020000A00000004D4000__00000025CABFE5B8 000000067F000040020000A00000004D0000-000000067F000040020000A00000004D4000__00000038E9AF7F00 000000067F000040020000A00000004D0000-000000067F000040020000A00000004D4000__0000003903F1CFE8 000000067F000040020000A00000004D0000-000000067F000040020000A00000004D4000__0000003B99F7F8A0 000000067F000040020000A00000004D0000-000000067F000040020000A00000004D4000__0000005D2FFFFB38 000000067F000040020000A00000004D1F01-000000067F000040020000A00000004DA8BF__00000024EE19EEF9-000000259DC1F899 000000067F000040020000A00000004D4000-000000067F000040020000A00000004D8000__00000025CABFE5B8 000000067F000040020000A00000004D4000-000000067F000040020000A00000004D8000__00000038E9AF7F00 000000067F000040020000A00000004D4000-000000067F000040020000A00000004D8000__0000003903F1CFE8 000000067F000040020000A00000004D4000-000000067F000040020000A00000004D8000__0000003B99F7F8A0 000000067F000040020000A00000004D4000-000000067F000040020000A00000004D8000__0000005D2FFFFB38 000000067F000040020000A00000004D8000-000000067F000040020000A00000004DC000__00000025CABFE5B8 000000067F000040020000A00000004D8000-000000067F000040020000A00000004DC000__00000038E9AF7F00 000000067F000040020000A00000004D8000-000000067F000040020000A00000004DC000__0000003903F1CFE8 000000067F000040020000A00000004D8000-000000067F000040020000A00000004DC000__0000003B99F7F8A0 000000067F000040020000A00000004D8000-000000067F000040020000A00000004DC000__0000005D2FFFFB38 000000067F000040020000A00000004DA8BF-000000067F000040020000A00000004E327F__00000024EE19EEF9-000000259DC1F899 
000000067F000040020000A00000004DC000-000000067F000040020000A00000004E0000__00000025CABFE5B8 000000067F000040020000A00000004DC000-000000067F000040020000A00000004E0000__00000038E9AF7F00 000000067F000040020000A00000004DC000-000000067F000040020000A00000004E0000__0000003903F1CFE8 000000067F000040020000A00000004DC000-000000067F000040020000A00000004E0000__0000003B99F7F8A0 000000067F000040020000A00000004DC000-000000067F000040020000A00000004E0000__0000005D2FFFFB38 000000067F000040020000A00000004E0000-000000067F000040020000A00000004E4000__00000025CABFE5B8 000000067F000040020000A00000004E0000-000000067F000040020000A00000004E4000__00000038E9AF7F00 000000067F000040020000A00000004E0000-000000067F000040020000A00000004E4000__0000003903F1CFE8 000000067F000040020000A00000004E0000-000000067F000040020000A00000004E4000__0000003B99F7F8A0 000000067F000040020000A00000004E0000-000000067F000040020000A00000004E4000__0000005D2FFFFB38 000000067F000040020000A00000004E327F-000000067F000040020000A00000004EBC62__00000024EE19EEF9-000000259DC1F899 000000067F000040020000A00000004E4000-000000067F000040020000A00000004E8000__00000025CABFE5B8 000000067F000040020000A00000004E4000-000000067F000040020000A00000004E8000__00000038E9AF7F00 000000067F000040020000A00000004E4000-000000067F000040020000A00000004E8000__0000003903F1CFE8 000000067F000040020000A00000004E4000-000000067F000040020000A00000004E8000__0000003B99F7F8A0 000000067F000040020000A00000004E4000-000000067F000040020000A00000004E8000__0000005D2FFFFB38 000000067F000040020000A00000004E8000-000000067F000040020000A00000004EC000__00000025CABFE5B8 000000067F000040020000A00000004E8000-000000067F000040020000A00000004EC000__00000038E9AF7F00 000000067F000040020000A00000004E8000-000000067F000040020000A00000004EC000__0000003903F1CFE8 000000067F000040020000A00000004E8000-000000067F000040020000A00000004EC000__0000003B99F7F8A0 000000067F000040020000A00000004E8000-000000067F000040020000A00000004EC000__0000005D2FFFFB38 
000000067F000040020000A00000004EBC62-000000067F000040020000A00000004F4640__00000024EE19EEF9-000000259DC1F899 000000067F000040020000A00000004EC000-000000067F000040020000A00000004F0000__00000025CABFE5B8 000000067F000040020000A00000004EC000-000000067F000040020000A00000004F0000__00000038E9AF7F00 000000067F000040020000A00000004EC000-000000067F000040020000A00000004F0000__0000003903F1CFE8 000000067F000040020000A00000004EC000-000000067F000040020000A00000004F0000__0000003B99F7F8A0 000000067F000040020000A00000004EC000-000000067F000040020000A00000004F0000__0000005D2FFFFB38 000000067F000040020000A00000004F0000-000000067F000040020000A00000004F4000__00000025CABFE5B8 000000067F000040020000A00000004F0000-000000067F000040020000A00000004F4000__00000038E9AF7F00 000000067F000040020000A00000004F0000-000000067F000040020000A00000004F4000__0000003903F1CFE8 000000067F000040020000A00000004F0000-000000067F000040020000A00000004F4000__0000003B99F7F8A0 000000067F000040020000A00000004F0000-000000067F000040020000A00000004F4000__0000005D2FFFFB38 000000067F000040020000A00000004F4000-000000067F000040020000A00000004F8000__00000025CABFE5B8 000000067F000040020000A00000004F4000-000000067F000040020000A00000004F8000__00000038E67ABFA0 000000067F000040020000A00000004F4000-000000067F000040020000A00000004F8000__0000003903F1CFE8 000000067F000040020000A00000004F4000-000000067F000040020000A00000004F8000__0000003B99F7F8A0 000000067F000040020000A00000004F4000-000000067F000040020000A00000004F8000__0000005D2FFFFB38 000000067F000040020000A00000004F4640-000000067F000040020000A0000200000000__00000024EE19EEF9-000000259DC1F899 000000067F000040020000A00000004F48EC-000000067F000040020000A00000004FD2E1__000000259DC1F899-000000263D71E6D9 000000067F000040020000A00000004F8000-000000067F000040020000A00000004FC000__00000025CABFE5B8 000000067F000040020000A00000004F8000-000000067F000040020000A00000004FC000__00000038E67ABFA0 000000067F000040020000A00000004F8000-000000067F000040020000A00000004FC000__0000003903F1CFE8 
000000067F000040020000A00000004F8000-000000067F000040020000A00000004FC000__0000003B99F7F8A0 000000067F000040020000A00000004F8000-000000067F000040020000A00000004FC000__0000005D2FFFFB38 000000067F000040020000A00000004FC000-000000067F000040020000A0000000500000__00000025CABFE5B8 000000067F000040020000A00000004FC000-000000067F000040020000A0000000500000__00000038E67ABFA0 000000067F000040020000A00000004FC000-000000067F000040020000A0000000500000__0000003903F1CFE8 000000067F000040020000A00000004FC000-000000067F000040020000A0000000500000__0000003B99F7F8A0 000000067F000040020000A00000004FC000-000000067F000040020000A0000000500000__0000005D2FFFFB38 000000067F000040020000A00000004FD2E1-000000067F000040020000A0000000505CD7__000000259DC1F899-000000263D71E6D9 000000067F000040020000A0000000500000-000000067F000040020000A0000000504000__00000025CABFE5B8 000000067F000040020000A0000000500000-000000067F000040020000A0000000504000__00000038E67ABFA0 000000067F000040020000A0000000500000-000000067F000040020000A0000000504000__0000003903F1CFE8 000000067F000040020000A0000000500000-000000067F000040020000A0000000504000__0000003B99F7F8A0 000000067F000040020000A0000000500000-000000067F000040020000A0000000504000__0000005D2FFFFB38 000000067F000040020000A0000000504000-000000067F000040020000A0000000508000__00000025CABFE5B8 000000067F000040020000A0000000504000-000000067F000040020000A0000000508000__00000038E67ABFA0 000000067F000040020000A0000000504000-000000067F000040020000A0000000508000__0000003903F1CFE8 000000067F000040020000A0000000504000-000000067F000040020000A0000000508000__0000003B99F7F8A0 000000067F000040020000A0000000504000-000000067F000040020000A0000000508000__0000005D2FFFFB38 000000067F000040020000A0000000505CD7-000000067F000040020000A000000050E6C4__000000259DC1F899-000000263D71E6D9 000000067F000040020000A0000000508000-000000067F000040020000A000000050C000__00000025CABFE5B8 000000067F000040020000A0000000508000-000000067F000040020000A000000050C000__00000038E67ABFA0 
000000067F000040020000A0000000508000-000000067F000040020000A000000050C000__0000003903F1CFE8 000000067F000040020000A0000000508000-000000067F000040020000A000000050C000__0000003B99F7F8A0 000000067F000040020000A0000000508000-000000067F000040020000A000000050C000__0000005D2FFFFB38 000000067F000040020000A000000050C000-000000067F000040020000A0000000510000__00000038E67ABFA0 000000067F000040020000A000000050C000-000000067F000040020000A0000000510000__0000003903F1CFE8 000000067F000040020000A000000050C000-000000067F000040020000A0000000510000__0000003B99F7F8A0 000000067F000040020000A000000050C000-000000067F000040020000A0000000510000__0000005D2FFFFB38 000000067F000040020000A000000050C000-030000000000000000000000000000000002__00000025CABFE5B8 000000067F000040020000A000000050E6C4-000000067F000040020000A000000051708F__000000259DC1F899-000000263D71E6D9 000000067F000040020000A0000000510000-000000067F000040020000A0000000514000__00000038E67ABFA0 000000067F000040020000A0000000510000-000000067F000040020000A0000000514000__0000003903F1CFE8 000000067F000040020000A0000000510000-000000067F000040020000A0000000514000__0000003B99F7F8A0 000000067F000040020000A0000000510000-000000067F000040020000A0000000514000__0000005D2FFFFB38 000000067F000040020000A0000000514000-000000067F000040020000A0000000518000__00000038E67ABFA0 000000067F000040020000A0000000514000-000000067F000040020000A0000000518000__0000003903F1CFE8 000000067F000040020000A0000000514000-000000067F000040020000A0000000518000__0000003B99F7F8A0 000000067F000040020000A0000000514000-000000067F000040020000A0000000518000__0000005D2FFFFB38 000000067F000040020000A000000051708F-000000067F000040020000A000000051FA56__000000259DC1F899-000000263D71E6D9 000000067F000040020000A0000000518000-000000067F000040020000A000000051C000__00000038E67ABFA0 000000067F000040020000A0000000518000-000000067F000040020000A000000051C000__0000003903F1CFE8 000000067F000040020000A0000000518000-000000067F000040020000A000000051C000__0000003B99F7F8A0 
000000067F000040020000A0000000518000-000000067F000040020000A000000051C000__0000005D2FFFFB38 000000067F000040020000A000000051C000-000000067F000040020000A0000000520000__00000038E67ABFA0 000000067F000040020000A000000051C000-000000067F000040020000A0000000520000__0000003903F1CFE8 000000067F000040020000A000000051C000-000000067F000040020000A0000000520000__0000003B99F7F8A0 000000067F000040020000A000000051C000-000000067F000040020000A0000000520000__0000005D2FFFFB38 000000067F000040020000A000000051FA56-000000067F000040020000A0000000528431__000000259DC1F899-000000263D71E6D9 000000067F000040020000A0000000520000-000000067F000040020000A0000000524000__00000038E67ABFA0 000000067F000040020000A0000000520000-000000067F000040020000A0000000524000__0000003903F1CFE8 000000067F000040020000A0000000520000-000000067F000040020000A0000000524000__0000003B99F7F8A0 000000067F000040020000A0000000520000-000000067F000040020000A0000000524000__0000005D2FFFFB38 000000067F000040020000A0000000524000-000000067F000040020000A0000000528000__00000038E67ABFA0 000000067F000040020000A0000000524000-000000067F000040020000A0000000528000__0000003903F1CFE8 000000067F000040020000A0000000524000-000000067F000040020000A0000000528000__0000003B99F7F8A0 000000067F000040020000A0000000524000-000000067F000040020000A0000000528000__0000005D2FFFFB38 000000067F000040020000A0000000528000-000000067F000040020000A000000052C000__00000038E67ABFA0 000000067F000040020000A0000000528000-000000067F000040020000A000000052C000__0000003903F1CFE8 000000067F000040020000A0000000528000-000000067F000040020000A000000052C000__0000003B99F7F8A0 000000067F000040020000A0000000528000-000000067F000040020000A000000052C000__0000005D2FFFFB38 000000067F000040020000A0000000528431-000000067F000040020000A0000000530E07__000000259DC1F899-000000263D71E6D9 000000067F000040020000A000000052C000-000000067F000040020000A0000000530000__00000038E67ABFA0 000000067F000040020000A000000052C000-000000067F000040020000A0000000530000__0000003903F1CFE8 
000000067F000040020000A000000052C000-000000067F000040020000A0000000530000__0000003B99F7F8A0 000000067F000040020000A000000052C000-000000067F000040020000A0000000530000__0000005D2FFFFB38 000000067F000040020000A0000000530000-000000067F000040020000A0000000534000__00000038E67ABFA0 000000067F000040020000A0000000530000-000000067F000040020000A0000000534000__0000003903F1CFE8 000000067F000040020000A0000000530000-000000067F000040020000A0000000534000__0000003B99F7F8A0 000000067F000040020000A0000000530000-000000067F000040020000A0000000534000__0000005D2FFFFB38 000000067F000040020000A0000000530E07-000000067F000040020000A00000005397EE__000000259DC1F899-000000263D71E6D9 000000067F000040020000A0000000534000-000000067F000040020000A0000000538000__00000038E67ABFA0 000000067F000040020000A0000000534000-000000067F000040020000A0000000538000__0000003903F1CFE8 000000067F000040020000A0000000534000-000000067F000040020000A0000000538000__0000003B99F7F8A0 000000067F000040020000A0000000534000-000000067F000040020000A0000000538000__0000005D2FFFFB38 000000067F000040020000A0000000538000-000000067F000040020000A000000053C000__00000038E67ABFA0 000000067F000040020000A0000000538000-000000067F000040020000A000000053C000__0000003903F1CFE8 000000067F000040020000A0000000538000-000000067F000040020000A000000053C000__0000003B99F7F8A0 000000067F000040020000A0000000538000-000000067F000040020000A000000053C000__0000005D2FFFFB38 000000067F000040020000A00000005397EE-000000067F000040020000A00000005421E0__000000259DC1F899-000000263D71E6D9 000000067F000040020000A000000053C000-000000067F000040020000A0000000540000__00000038E67ABFA0 000000067F000040020000A000000053C000-000000067F000040020000A0000000540000__0000003903F1CFE8 000000067F000040020000A000000053C000-000000067F000040020000A0000000540000__0000003B99F7F8A0 000000067F000040020000A000000053C000-000000067F000040020000A0000000540000__0000005D2FFFFB38 000000067F000040020000A0000000540000-000000067F000040020000A0000000544000__00000038E67ABFA0 
000000067F000040020000A0000000540000-000000067F000040020000A0000000544000__0000003903F1CFE8 000000067F000040020000A0000000540000-000000067F000040020000A0000000544000__0000003B99F7F8A0 000000067F000040020000A0000000540000-000000067F000040020000A0000000544000__0000005D2FFFFB38 000000067F000040020000A00000005421E0-000000067F000040020000A000000054ABC7__000000259DC1F899-000000263D71E6D9 000000067F000040020000A0000000544000-000000067F000040020000A0000000548000__00000038E67ABFA0 000000067F000040020000A0000000544000-000000067F000040020000A0000000548000__0000003903F1CFE8 000000067F000040020000A0000000544000-000000067F000040020000A0000000548000__0000003B99F7F8A0 000000067F000040020000A0000000544000-000000067F000040020000A0000000548000__0000005D2FFFFB38 000000067F000040020000A0000000548000-000000067F000040020000A000000054C000__00000038E1ABFE28 000000067F000040020000A0000000548000-000000067F000040020000A000000054C000__00000038E9AF7F00 000000067F000040020000A0000000548000-000000067F000040020000A000000054C000__0000003903F1CFE8 000000067F000040020000A0000000548000-000000067F000040020000A000000054C000__0000003B99F7F8A0 000000067F000040020000A0000000548000-000000067F000040020000A000000054C000__0000005D2FFFFB38 000000067F000040020000A000000054ABC7-000000067F000040020000A0000200000000__000000259DC1F899-000000263D71E6D9 000000067F000040020000A000000054AD5E-000000067F000040020000A000000055371D__000000263D71E6D9-00000026ED17F009 000000067F000040020000A000000054C000-000000067F000040020000A0000000550000__00000038E1ABFE28 000000067F000040020000A000000054C000-000000067F000040020000A0000000550000__00000038E9AF7F00 000000067F000040020000A000000054C000-000000067F000040020000A0000000550000__0000003903F1CFE8 000000067F000040020000A000000054C000-000000067F000040020000A0000000550000__0000003B99F7F8A0 000000067F000040020000A000000054C000-000000067F000040020000A0000000550000__0000005D2FFFFB38 000000067F000040020000A0000000550000-000000067F000040020000A0000000554000__00000038E1ABFE28 
000000067F000040020000A0000000550000-000000067F000040020000A0000000554000__00000038E9AF7F00 000000067F000040020000A0000000550000-000000067F000040020000A0000000554000__0000003903F1CFE8 000000067F000040020000A0000000550000-000000067F000040020000A0000000554000__0000003B99F7F8A0 000000067F000040020000A0000000550000-000000067F000040020000A0000000554000__0000005D2FFFFB38 000000067F000040020000A000000055371D-000000067F000040020000A000000055C0DF__000000263D71E6D9-00000026ED17F009 000000067F000040020000A0000000554000-000000067F000040020000A0000000558000__00000038E1ABFE28 000000067F000040020000A0000000554000-000000067F000040020000A0000000558000__00000038E9AF7F00 000000067F000040020000A0000000554000-000000067F000040020000A0000000558000__0000003903F1CFE8 000000067F000040020000A0000000554000-000000067F000040020000A0000000558000__0000003B99F7F8A0 000000067F000040020000A0000000554000-000000067F000040020000A0000000558000__0000005D2FFFFB38 000000067F000040020000A0000000558000-000000067F000040020000A000000055C000__00000038E1ABFE28 000000067F000040020000A0000000558000-000000067F000040020000A000000055C000__00000038E9AF7F00 000000067F000040020000A0000000558000-000000067F000040020000A000000055C000__0000003903F1CFE8 000000067F000040020000A0000000558000-000000067F000040020000A000000055C000__0000003B99F7F8A0 000000067F000040020000A0000000558000-000000067F000040020000A000000055C000__0000005D2FFFFB38 000000067F000040020000A000000055C000-000000067F000040020000A0000000560000__00000038E1ABFE28 000000067F000040020000A000000055C000-000000067F000040020000A0000000560000__00000038E9AF7F00 000000067F000040020000A000000055C000-000000067F000040020000A0000000560000__0000003903F1CFE8 000000067F000040020000A000000055C000-000000067F000040020000A0000000560000__0000003B99F7F8A0 000000067F000040020000A000000055C000-000000067F000040020000A0000000560000__0000005D2FFFFB38 000000067F000040020000A000000055C0DF-000000067F000040020000A0000000564AC7__000000263D71E6D9-00000026ED17F009 
000000067F000040020000A0000000560000-000000067F000040020000A0000000564000__00000038E1ABFE28 000000067F000040020000A0000000560000-000000067F000040020000A0000000564000__00000038E9AF7F00 000000067F000040020000A0000000560000-000000067F000040020000A0000000564000__0000003903F1CFE8 000000067F000040020000A0000000560000-000000067F000040020000A0000000564000__0000003B99F7F8A0 000000067F000040020000A0000000560000-000000067F000040020000A0000000564000__0000005D2FFFFB38 000000067F000040020000A0000000564000-000000067F000040020000A0000000568000__00000038E1ABFE28 000000067F000040020000A0000000564000-000000067F000040020000A0000000568000__00000038E9AF7F00 000000067F000040020000A0000000564000-000000067F000040020000A0000000568000__0000003903F1CFE8 000000067F000040020000A0000000564000-000000067F000040020000A0000000568000__0000003B99F7F8A0 000000067F000040020000A0000000564000-000000067F000040020000A0000000568000__0000005D2FFFFB38 000000067F000040020000A0000000564AC7-000000067F000040020000A000000056D4B3__000000263D71E6D9-00000026ED17F009 000000067F000040020000A0000000568000-000000067F000040020000A000000056C000__00000038E1ABFE28 000000067F000040020000A0000000568000-000000067F000040020000A000000056C000__00000038E9AF7F00 000000067F000040020000A0000000568000-000000067F000040020000A000000056C000__0000003903F1CFE8 000000067F000040020000A0000000568000-000000067F000040020000A000000056C000__0000003B99F7F8A0 000000067F000040020000A0000000568000-000000067F000040020000A000000056C000__0000005D2FFFFB38 000000067F000040020000A000000056C000-000000067F000040020000A0000000570000__00000038E1ABFE28 000000067F000040020000A000000056C000-000000067F000040020000A0000000570000__00000038E9AF7F00 000000067F000040020000A000000056C000-000000067F000040020000A0000000570000__0000003903F1CFE8 000000067F000040020000A000000056C000-000000067F000040020000A0000000570000__0000003B99F7F8A0 000000067F000040020000A000000056C000-000000067F000040020000A0000000570000__0000005D2FFFFB38 
000000067F000040020000A000000056D4B3-000000067F000040020000A0000000575EA8__000000263D71E6D9-00000026ED17F009 000000067F000040020000A0000000570000-000000067F000040020000A0000000574000__00000038E1ABFE28 000000067F000040020000A0000000570000-000000067F000040020000A0000000574000__00000038E9AF7F00 000000067F000040020000A0000000570000-000000067F000040020000A0000000574000__0000003903F1CFE8 000000067F000040020000A0000000570000-000000067F000040020000A0000000574000__0000003B99F7F8A0 000000067F000040020000A0000000570000-000000067F000040020000A0000000574000__0000005D2FFFFB38 000000067F000040020000A0000000574000-000000067F000040020000A0000000578000__00000038E1ABFE28 000000067F000040020000A0000000574000-000000067F000040020000A0000000578000__00000038E9AF7F00 000000067F000040020000A0000000574000-000000067F000040020000A0000000578000__0000003903F1CFE8 000000067F000040020000A0000000574000-000000067F000040020000A0000000578000__0000003B99F7F8A0 000000067F000040020000A0000000574000-000000067F000040020000A0000000578000__0000005D2FFFFB38 000000067F000040020000A0000000575EA8-000000067F000040020000A000000057E892__000000263D71E6D9-00000026ED17F009 000000067F000040020000A0000000578000-000000067F000040020000A000000057C000__00000038E1ABFE28 000000067F000040020000A0000000578000-000000067F000040020000A000000057C000__00000038E9AF7F00 000000067F000040020000A0000000578000-000000067F000040020000A000000057C000__0000003903F1CFE8 000000067F000040020000A0000000578000-000000067F000040020000A000000057C000__0000003B99F7F8A0 000000067F000040020000A0000000578000-000000067F000040020000A000000057C000__0000005D2FFFFB38 000000067F000040020000A000000057C000-000000067F000040020000A0000000580000__00000038E1ABFE28 000000067F000040020000A000000057C000-000000067F000040020000A0000000580000__00000038E9AF7F00 000000067F000040020000A000000057C000-000000067F000040020000A0000000580000__0000003903F1CFE8 000000067F000040020000A000000057C000-000000067F000040020000A0000000580000__0000003B99F7F8A0 
000000067F000040020000A000000057C000-000000067F000040020000A0000000580000__0000005D2FFFFB38 000000067F000040020000A000000057E892-000000067F000040020000A000000058726C__000000263D71E6D9-00000026ED17F009 000000067F000040020000A0000000580000-000000067F000040020000A0000000584000__00000038E1ABFE28 000000067F000040020000A0000000580000-000000067F000040020000A0000000584000__00000038E9AF7F00 000000067F000040020000A0000000580000-000000067F000040020000A0000000584000__0000003903F1CFE8 000000067F000040020000A0000000580000-000000067F000040020000A0000000584000__0000003B99F7F8A0 000000067F000040020000A0000000580000-000000067F000040020000A0000000584000__0000005D2FFFFB38 000000067F000040020000A0000000584000-000000067F000040020000A0000000588000__00000038E1ABFE28 000000067F000040020000A0000000584000-000000067F000040020000A0000000588000__00000038E9AF7F00 000000067F000040020000A0000000584000-000000067F000040020000A0000000588000__0000003903F1CFE8 000000067F000040020000A0000000584000-000000067F000040020000A0000000588000__0000003B99F7F8A0 000000067F000040020000A0000000584000-000000067F000040020000A0000000588000__0000005D2FFFFB38 000000067F000040020000A000000058726C-000000067F000040020000A000000058FC31__000000263D71E6D9-00000026ED17F009 000000067F000040020000A0000000588000-000000067F000040020000A000000058C000__00000038E1ABFE28 000000067F000040020000A0000000588000-000000067F000040020000A000000058C000__00000038E9AF7F00 000000067F000040020000A0000000588000-000000067F000040020000A000000058C000__0000003903F1CFE8 000000067F000040020000A0000000588000-000000067F000040020000A000000058C000__0000003B99F7F8A0 000000067F000040020000A0000000588000-000000067F000040020000A000000058C000__0000005D2FFFFB38 000000067F000040020000A000000058C000-000000067F000040020000A0000000590000__00000038E1ABFE28 000000067F000040020000A000000058C000-000000067F000040020000A0000000590000__00000038E9AF7F00 000000067F000040020000A000000058C000-000000067F000040020000A0000000590000__0000003903F1CFE8 
000000067F000040020000A000000058C000-000000067F000040020000A0000000590000__0000003B99F7F8A0 000000067F000040020000A000000058C000-000000067F000040020000A0000000590000__0000005D2FFFFB38 000000067F000040020000A000000058FC31-000000067F000040020000A00000005985F9__000000263D71E6D9-00000026ED17F009 000000067F000040020000A0000000590000-000000067F000040020000A0000000594000__00000038E1ABFE28 000000067F000040020000A0000000590000-000000067F000040020000A0000000594000__00000038E9AF7F00 000000067F000040020000A0000000590000-000000067F000040020000A0000000594000__0000003903F1CFE8 000000067F000040020000A0000000590000-000000067F000040020000A0000000594000__0000003B99F7F8A0 000000067F000040020000A0000000590000-000000067F000040020000A0000000594000__0000005D2FFFFB38 000000067F000040020000A0000000594000-000000067F000040020000A0000000598000__00000038E1ABFE28 000000067F000040020000A0000000594000-000000067F000040020000A0000000598000__00000038E9AF7F00 000000067F000040020000A0000000594000-000000067F000040020000A0000000598000__0000003903F1CFE8 000000067F000040020000A0000000594000-000000067F000040020000A0000000598000__0000003B99F7F8A0 000000067F000040020000A0000000594000-000000067F000040020000A0000000598000__0000005D2FFFFB38 000000067F000040020000A0000000598000-000000067F000040020000A000000059C000__00000038E1ABFE28 000000067F000040020000A0000000598000-000000067F000040020000A000000059C000__00000038E9AF7F00 000000067F000040020000A0000000598000-000000067F000040020000A000000059C000__0000003903F1CFE8 000000067F000040020000A0000000598000-000000067F000040020000A000000059C000__0000003B99F7F8A0 000000067F000040020000A0000000598000-000000067F000040020000A000000059C000__0000005D2FFFFB38 000000067F000040020000A00000005985F9-000000067F000040020000A00000005A0FE9__000000263D71E6D9-00000026ED17F009 000000067F000040020000A000000059C000-000000067F000040020000A00000005A0000__00000038E1ABFE28 000000067F000040020000A000000059C000-000000067F000040020000A00000005A0000__00000038E9AF7F00 
000000067F000040020000A000000059C000-000000067F000040020000A00000005A0000__0000003903F1CFE8 000000067F000040020000A000000059C000-000000067F000040020000A00000005A0000__0000003B99F7F8A0 000000067F000040020000A000000059C000-000000067F000040020000A00000005A0000__0000005D2FFFFB38 000000067F000040020000A00000005A0000-000000067F000040020000A00000005A4000__00000038E1ABFE28 000000067F000040020000A00000005A0000-000000067F000040020000A00000005A4000__00000038E9AF7F00 000000067F000040020000A00000005A0000-000000067F000040020000A00000005A4000__0000003903F1CFE8 000000067F000040020000A00000005A0000-000000067F000040020000A00000005A4000__0000003B99F7F8A0 000000067F000040020000A00000005A0000-000000067F000040020000A00000005A4000__0000005D2FFFFB38 000000067F000040020000A00000005A0FE9-000000067F000040020000A00000005A99D4__000000263D71E6D9-00000026ED17F009 000000067F000040020000A00000005A4000-000000067F000040020000A00000005A8000__00000038E1ABFE28 000000067F000040020000A00000005A4000-000000067F000040020000A00000005A8000__00000038E9AF7F00 000000067F000040020000A00000005A4000-000000067F000040020000A00000005A8000__0000003903F1CFE8 000000067F000040020000A00000005A4000-000000067F000040020000A00000005A8000__0000003B99F7F8A0 000000067F000040020000A00000005A4000-000000067F000040020000A00000005A8000__0000005D2FFFFB38 000000067F000040020000A00000005A8000-000000067F000040020000A00000005AC000__00000027BCAFED20 000000067F000040020000A00000005A8000-000000067F000040020000A00000005AC000__00000038E9AF7F00 000000067F000040020000A00000005A8000-000000067F000040020000A00000005AC000__0000003903F1CFE8 000000067F000040020000A00000005A8000-000000067F000040020000A00000005AC000__0000003B99F7F8A0 000000067F000040020000A00000005A8000-000000067F000040020000A00000005AC000__0000005D2FFFFB38 000000067F000040020000A00000005A99D4-000000067F000040020000A0000200000000__000000263D71E6D9-00000026ED17F009 000000067F000040020000A00000005A9C62-000000067F000040020000A00000005B2656__00000026ED17F009-000000278CC7EF29 
000000067F000040020000A00000005AC000-000000067F000040020000A00000005B0000__00000027BCAFED20 000000067F000040020000A00000005AC000-000000067F000040020000A00000005B0000__00000038E9AF7F00 000000067F000040020000A00000005AC000-000000067F000040020000A00000005B0000__0000003903F1CFE8 000000067F000040020000A00000005AC000-000000067F000040020000A00000005B0000__0000003B99F7F8A0 000000067F000040020000A00000005AC000-000000067F000040020000A00000005B0000__0000005D2FFFFB38 000000067F000040020000A00000005B0000-000000067F000040020000A00000005B4000__00000027BCAFED20 000000067F000040020000A00000005B0000-000000067F000040020000A00000005B4000__00000038E9AF7F00 000000067F000040020000A00000005B0000-000000067F000040020000A00000005B4000__0000003903F1CFE8 000000067F000040020000A00000005B0000-000000067F000040020000A00000005B4000__0000003B99F7F8A0 000000067F000040020000A00000005B0000-000000067F000040020000A00000005B4000__0000005D2FFFFB38 000000067F000040020000A00000005B2656-000000067F000040020000A00000005BB03A__00000026ED17F009-000000278CC7EF29 000000067F000040020000A00000005B4000-000000067F000040020000A00000005B8000__00000027BCAFED20 000000067F000040020000A00000005B4000-000000067F000040020000A00000005B8000__00000038E9AF7F00 000000067F000040020000A00000005B4000-000000067F000040020000A00000005B8000__0000003903F1CFE8 000000067F000040020000A00000005B4000-000000067F000040020000A00000005B8000__0000003B99F7F8A0 000000067F000040020000A00000005B4000-000000067F000040020000A00000005B8000__0000005D2FFFFB38 000000067F000040020000A00000005B8000-000000067F000040020000A00000005BC000__00000027BCAFED20 000000067F000040020000A00000005B8000-000000067F000040020000A00000005BC000__00000038E9AF7F00 000000067F000040020000A00000005B8000-000000067F000040020000A00000005BC000__0000003903F1CFE8 000000067F000040020000A00000005B8000-000000067F000040020000A00000005BC000__0000003B99F7F8A0 000000067F000040020000A00000005B8000-000000067F000040020000A00000005BC000__0000005D2FFFFB38 
000000067F000040020000A00000005BB03A-000000067F000040020000A00000005C3A02__00000026ED17F009-000000278CC7EF29 000000067F000040020000A00000005BC000-000000067F000040020000A00000005C0000__00000027BCAFED20 000000067F000040020000A00000005BC000-000000067F000040020000A00000005C0000__00000038E9AF7F00 000000067F000040020000A00000005BC000-000000067F000040020000A00000005C0000__0000003903F1CFE8 000000067F000040020000A00000005BC000-000000067F000040020000A00000005C0000__0000003B99F7F8A0 000000067F000040020000A00000005BC000-000000067F000040020000A00000005C0000__0000005D2FFFFB38 000000067F000040020000A00000005C0000-000000067F000040020000A00000005C4000__00000027BCAFED20 000000067F000040020000A00000005C0000-000000067F000040020000A00000005C4000__00000038E9AF7F00 000000067F000040020000A00000005C0000-000000067F000040020000A00000005C4000__0000003903F1CFE8 000000067F000040020000A00000005C0000-000000067F000040020000A00000005C4000__0000003B99F7F8A0 000000067F000040020000A00000005C0000-000000067F000040020000A00000005C4000__0000005D2FFFFB38 000000067F000040020000A00000005C3A02-000000067F000040020000A00000005CC3B7__00000026ED17F009-000000278CC7EF29 000000067F000040020000A00000005C4000-000000067F000040020000A00000005C8000__00000027BCAFED20 000000067F000040020000A00000005C4000-000000067F000040020000A00000005C8000__00000038E9AF7F00 000000067F000040020000A00000005C4000-000000067F000040020000A00000005C8000__0000003903F1CFE8 000000067F000040020000A00000005C4000-000000067F000040020000A00000005C8000__0000003B99F7F8A0 000000067F000040020000A00000005C4000-000000067F000040020000A00000005C8000__0000005D2FFFFB38 000000067F000040020000A00000005C8000-000000067F000040020000A00000005CC000__00000027BCAFED20 000000067F000040020000A00000005C8000-000000067F000040020000A00000005CC000__00000038E9AF7F00 000000067F000040020000A00000005C8000-000000067F000040020000A00000005CC000__0000003903F1CFE8 000000067F000040020000A00000005C8000-000000067F000040020000A00000005CC000__0000003B99F7F8A0 
000000067F000040020000A00000005C8000-000000067F000040020000A00000005CC000__0000005D2FFFFB38 000000067F000040020000A00000005CC000-000000067F000040020000A00000005D0000__00000027BCAFED20 000000067F000040020000A00000005CC000-000000067F000040020000A00000005D0000__00000038E9AF7F00 000000067F000040020000A00000005CC000-000000067F000040020000A00000005D0000__0000003903F1CFE8 000000067F000040020000A00000005CC000-000000067F000040020000A00000005D0000__0000003B99F7F8A0 000000067F000040020000A00000005CC000-000000067F000040020000A00000005D0000__0000005D2FFFFB38 000000067F000040020000A00000005CC3B7-000000067F000040020000A00000005D4D88__00000026ED17F009-000000278CC7EF29 000000067F000040020000A00000005D0000-000000067F000040020000A00000005D4000__00000027BCAFED20 000000067F000040020000A00000005D0000-000000067F000040020000A00000005D4000__00000038E9AF7F00 000000067F000040020000A00000005D0000-000000067F000040020000A00000005D4000__0000003903F1CFE8 000000067F000040020000A00000005D0000-000000067F000040020000A00000005D4000__0000003B99F7F8A0 000000067F000040020000A00000005D0000-000000067F000040020000A00000005D4000__0000005D2FFFFB38 000000067F000040020000A00000005D4000-000000067F000040020000A00000005D8000__00000027BCAFED20 000000067F000040020000A00000005D4000-000000067F000040020000A00000005D8000__00000038E9AF7F00 000000067F000040020000A00000005D4000-000000067F000040020000A00000005D8000__0000003903F1CFE8 000000067F000040020000A00000005D4000-000000067F000040020000A00000005D8000__0000003B99F7F8A0 000000067F000040020000A00000005D4000-000000067F000040020000A00000005D8000__0000005D2FFFFB38 000000067F000040020000A00000005D4D88-000000067F000040020000A00000005DD76C__00000026ED17F009-000000278CC7EF29 000000067F000040020000A00000005D8000-000000067F000040020000A00000005DC000__00000027BCAFED20 000000067F000040020000A00000005D8000-000000067F000040020000A00000005DC000__00000038E9AF7F00 000000067F000040020000A00000005D8000-000000067F000040020000A00000005DC000__0000003903F1CFE8 
000000067F000040020000A00000005D8000-000000067F000040020000A00000005DC000__0000003B99F7F8A0 000000067F000040020000A00000005D8000-000000067F000040020000A00000005DC000__0000005D2FFFFB38 000000067F000040020000A00000005DC000-000000067F000040020000A00000005E0000__00000027BCAFED20 000000067F000040020000A00000005DC000-000000067F000040020000A00000005E0000__00000038E9AF7F00 000000067F000040020000A00000005DC000-000000067F000040020000A00000005E0000__0000003903F1CFE8 000000067F000040020000A00000005DC000-000000067F000040020000A00000005E0000__0000003B99F7F8A0 000000067F000040020000A00000005DC000-000000067F000040020000A00000005E0000__0000005D2FFFFB38 000000067F000040020000A00000005DD76C-000000067F000040020000A00000005E6155__00000026ED17F009-000000278CC7EF29 000000067F000040020000A00000005E0000-000000067F000040020000A00000005E4000__00000027BCAFED20 000000067F000040020000A00000005E0000-000000067F000040020000A00000005E4000__00000038E9AF7F00 000000067F000040020000A00000005E0000-000000067F000040020000A00000005E4000__0000003903F1CFE8 000000067F000040020000A00000005E0000-000000067F000040020000A00000005E4000__0000003B99F7F8A0 000000067F000040020000A00000005E0000-000000067F000040020000A00000005E4000__0000005D2FFFFB38 000000067F000040020000A00000005E4000-000000067F000040020000A00000005E8000__00000027BCAFED20 000000067F000040020000A00000005E4000-000000067F000040020000A00000005E8000__00000038E9AF7F00 000000067F000040020000A00000005E4000-000000067F000040020000A00000005E8000__0000003903F1CFE8 000000067F000040020000A00000005E4000-000000067F000040020000A00000005E8000__0000003B99F7F8A0 000000067F000040020000A00000005E4000-000000067F000040020000A00000005E8000__0000005D2FFFFB38 000000067F000040020000A00000005E6155-000000067F000040020000A00000005EEB42__00000026ED17F009-000000278CC7EF29 000000067F000040020000A00000005E8000-000000067F000040020000A00000005EC000__00000027BCAFED20 000000067F000040020000A00000005E8000-000000067F000040020000A00000005EC000__00000038E9AF7F00 
000000067F000040020000A00000005E8000-000000067F000040020000A00000005EC000__0000003903F1CFE8 000000067F000040020000A00000005E8000-000000067F000040020000A00000005EC000__0000003B99F7F8A0 000000067F000040020000A00000005E8000-000000067F000040020000A00000005EC000__0000005D2FFFFB38 000000067F000040020000A00000005EC000-000000067F000040020000A00000005F0000__00000027BCAFED20 000000067F000040020000A00000005EC000-000000067F000040020000A00000005F0000__00000038E9AF7F00 000000067F000040020000A00000005EC000-000000067F000040020000A00000005F0000__0000003903F1CFE8 000000067F000040020000A00000005EC000-000000067F000040020000A00000005F0000__0000003B99F7F8A0 000000067F000040020000A00000005EC000-000000067F000040020000A00000005F0000__0000005D2FFFFB38 000000067F000040020000A00000005EEB42-000000067F000040020000A00000005F7523__00000026ED17F009-000000278CC7EF29 000000067F000040020000A00000005F0000-000000067F000040020000A00000005F4000__00000027BCAFED20 000000067F000040020000A00000005F0000-000000067F000040020000A00000005F4000__00000038E9AF7F00 000000067F000040020000A00000005F0000-000000067F000040020000A00000005F4000__0000003903F1CFE8 000000067F000040020000A00000005F0000-000000067F000040020000A00000005F4000__0000003B99F7F8A0 000000067F000040020000A00000005F0000-000000067F000040020000A00000005F4000__0000005D2FFFFB38 000000067F000040020000A00000005F4000-000000067F000040020000A00000005F8000__00000027BCAFED20 000000067F000040020000A00000005F4000-000000067F000040020000A00000005F8000__00000038E9AF7F00 000000067F000040020000A00000005F4000-000000067F000040020000A00000005F8000__0000003903F1CFE8 000000067F000040020000A00000005F4000-000000067F000040020000A00000005F8000__0000003B99F7F8A0 000000067F000040020000A00000005F4000-000000067F000040020000A00000005F8000__0000005D2FFFFB38 000000067F000040020000A00000005F7523-000000067F000040020000A00000005FFEE5__00000026ED17F009-000000278CC7EF29 000000067F000040020000A00000005F8000-000000067F000040020000A00000005FC000__00000027BCAFED20 
000000067F000040020000A00000005F8000-000000067F000040020000A00000005FC000__00000038E9AF7F00 000000067F000040020000A00000005F8000-000000067F000040020000A00000005FC000__0000003903F1CFE8 000000067F000040020000A00000005F8000-000000067F000040020000A00000005FC000__0000003B99F7F8A0 000000067F000040020000A00000005F8000-000000067F000040020000A00000005FC000__0000005D2FFFFB38 000000067F000040020000A00000005FC000-000000067F000040020000A0000000600000__00000027BCAFED20 000000067F000040020000A00000005FC000-000000067F000040020000A0000000600000__00000038E9AF7F00 000000067F000040020000A00000005FC000-000000067F000040020000A0000000600000__0000003903F1CFE8 000000067F000040020000A00000005FC000-000000067F000040020000A0000000600000__0000003B99F7F8A0 000000067F000040020000A00000005FC000-000000067F000040020000A0000000600000__0000005D2FFFFB38 000000067F000040020000A00000005FFEE5-000000067F000040020000A0000200000000__00000026ED17F009-000000278CC7EF29 000000067F000040020000A0000000600000-000000067F000040020000A0000000604000__00000027BCAFED20 000000067F000040020000A0000000600000-000000067F000040020000A0000000604000__00000038E67ABFA0 000000067F000040020000A0000000600000-000000067F000040020000A0000000604000__0000003903F1CFE8 000000067F000040020000A0000000600000-000000067F000040020000A0000000604000__0000003B99F7F8A0 000000067F000040020000A0000000600000-000000067F000040020000A0000000604000__0000005D2FFFFB38 000000067F000040020000A00000006000B1-000000067F000040020000A0000000608A6E__000000278CC7EF29-000000283C6FE2E9 000000067F000040020000A0000000604000-000000067F000040020000A0000000608000__00000027BCAFED20 000000067F000040020000A0000000604000-000000067F000040020000A0000000608000__00000038E67ABFA0 000000067F000040020000A0000000604000-000000067F000040020000A0000000608000__0000003903F1CFE8 000000067F000040020000A0000000604000-000000067F000040020000A0000000608000__0000003B99F7F8A0 000000067F000040020000A0000000604000-000000067F000040020000A0000000608000__0000005D2FFFFB38 
000000067F000040020000A0000000608000-000000067F000040020000A000000060C000__00000027BCAFED20 000000067F000040020000A0000000608000-000000067F000040020000A000000060C000__00000038E67ABFA0 000000067F000040020000A0000000608000-000000067F000040020000A000000060C000__0000003903F1CFE8 000000067F000040020000A0000000608000-000000067F000040020000A000000060C000__0000003B99F7F8A0 000000067F000040020000A0000000608000-000000067F000040020000A000000060C000__0000005D2FFFFB38 000000067F000040020000A0000000608A6E-000000067F000040020000A000000061143E__000000278CC7EF29-000000283C6FE2E9 000000067F000040020000A000000060C000-000000067F000040020000A0000000610000__00000027BCAFED20 000000067F000040020000A000000060C000-000000067F000040020000A0000000610000__00000038E67ABFA0 000000067F000040020000A000000060C000-000000067F000040020000A0000000610000__0000003903F1CFE8 000000067F000040020000A000000060C000-000000067F000040020000A0000000610000__0000003B99F7F8A0 000000067F000040020000A000000060C000-000000067F000040020000A0000000610000__0000005D2FFFFB38 000000067F000040020000A0000000610000-000000067F000040020000A0000000614000__00000027BCAFED20 000000067F000040020000A0000000610000-000000067F000040020000A0000000614000__00000038E67ABFA0 000000067F000040020000A0000000610000-000000067F000040020000A0000000614000__0000003903F1CFE8 000000067F000040020000A0000000610000-000000067F000040020000A0000000614000__0000003B99F7F8A0 000000067F000040020000A0000000610000-000000067F000040020000A0000000614000__0000005D2FFFFB38 000000067F000040020000A000000061143E-000000067F000040020000A0000000619E1E__000000278CC7EF29-000000283C6FE2E9 000000067F000040020000A0000000614000-000000067F000040020000A0000000618000__00000027BCAFED20 000000067F000040020000A0000000614000-000000067F000040020000A0000000618000__00000038E67ABFA0 000000067F000040020000A0000000614000-000000067F000040020000A0000000618000__0000003903F1CFE8 000000067F000040020000A0000000614000-000000067F000040020000A0000000618000__0000003B99F7F8A0 
000000067F000040020000A0000000614000-000000067F000040020000A0000000618000__0000005D2FFFFB38 000000067F000040020000A0000000618000-000000067F000040020000A000000061C000__00000038E67ABFA0 000000067F000040020000A0000000618000-000000067F000040020000A000000061C000__0000003903F1CFE8 000000067F000040020000A0000000618000-000000067F000040020000A000000061C000__0000003B99F7F8A0 000000067F000040020000A0000000618000-000000067F000040020000A000000061C000__0000005D2FFFFB38 000000067F000040020000A0000000618000-030000000000000000000000000000000002__00000027BCAFED20 000000067F000040020000A0000000619E1E-000000067F000040020000A0000000622808__000000278CC7EF29-000000283C6FE2E9 000000067F000040020000A000000061C000-000000067F000040020000A0000000620000__00000038E67ABFA0 000000067F000040020000A000000061C000-000000067F000040020000A0000000620000__0000003903F1CFE8 000000067F000040020000A000000061C000-000000067F000040020000A0000000620000__0000003B99F7F8A0 000000067F000040020000A000000061C000-000000067F000040020000A0000000620000__0000005D2FFFFB38 000000067F000040020000A0000000620000-000000067F000040020000A0000000624000__00000038E67ABFA0 000000067F000040020000A0000000620000-000000067F000040020000A0000000624000__0000003903F1CFE8 000000067F000040020000A0000000620000-000000067F000040020000A0000000624000__0000003B99F7F8A0 000000067F000040020000A0000000620000-000000067F000040020000A0000000624000__0000005D2FFFFB38 000000067F000040020000A0000000622808-000000067F000040020000A000000062B1F2__000000278CC7EF29-000000283C6FE2E9 000000067F000040020000A0000000624000-000000067F000040020000A0000000628000__00000038E67ABFA0 000000067F000040020000A0000000624000-000000067F000040020000A0000000628000__0000003903F1CFE8 000000067F000040020000A0000000624000-000000067F000040020000A0000000628000__0000003B99F7F8A0 000000067F000040020000A0000000624000-000000067F000040020000A0000000628000__0000005D2FFFFB38 000000067F000040020000A0000000628000-000000067F000040020000A000000062C000__00000038E67ABFA0 
000000067F000040020000A0000000628000-000000067F000040020000A000000062C000__0000003903F1CFE8 000000067F000040020000A0000000628000-000000067F000040020000A000000062C000__0000003B99F7F8A0 000000067F000040020000A0000000628000-000000067F000040020000A000000062C000__0000005D2FFFFB38 000000067F000040020000A000000062B1F2-000000067F000040020000A0000000633BDF__000000278CC7EF29-000000283C6FE2E9 000000067F000040020000A000000062C000-000000067F000040020000A0000000630000__00000038E67ABFA0 000000067F000040020000A000000062C000-000000067F000040020000A0000000630000__0000003903F1CFE8 000000067F000040020000A000000062C000-000000067F000040020000A0000000630000__0000003B99F7F8A0 000000067F000040020000A000000062C000-000000067F000040020000A0000000630000__0000005D2FFFFB38 000000067F000040020000A0000000630000-000000067F000040020000A0000000634000__00000038E67ABFA0 000000067F000040020000A0000000630000-000000067F000040020000A0000000634000__0000003903F1CFE8 000000067F000040020000A0000000630000-000000067F000040020000A0000000634000__0000003B99F7F8A0 000000067F000040020000A0000000630000-000000067F000040020000A0000000634000__0000005D2FFFFB38 000000067F000040020000A0000000633BDF-000000067F000040020000A000000063C5BF__000000278CC7EF29-000000283C6FE2E9 000000067F000040020000A0000000634000-000000067F000040020000A0000000638000__00000038E67ABFA0 000000067F000040020000A0000000634000-000000067F000040020000A0000000638000__0000003903F1CFE8 000000067F000040020000A0000000634000-000000067F000040020000A0000000638000__0000003B99F7F8A0 000000067F000040020000A0000000634000-000000067F000040020000A0000000638000__0000005D2FFFFB38 000000067F000040020000A0000000638000-000000067F000040020000A000000063C000__00000038E67ABFA0 000000067F000040020000A0000000638000-000000067F000040020000A000000063C000__0000003903F1CFE8 000000067F000040020000A0000000638000-000000067F000040020000A000000063C000__0000003B99F7F8A0 000000067F000040020000A0000000638000-000000067F000040020000A000000063C000__0000005D2FFFFB38 
000000067F000040020000A000000063C000-000000067F000040020000A0000000640000__00000038E67ABFA0 000000067F000040020000A000000063C000-000000067F000040020000A0000000640000__0000003903F1CFE8 000000067F000040020000A000000063C000-000000067F000040020000A0000000640000__0000003B99F7F8A0 000000067F000040020000A000000063C000-000000067F000040020000A0000000640000__0000005D2FFFFB38 000000067F000040020000A000000063C5BF-000000067F000040020000A0000000644F80__000000278CC7EF29-000000283C6FE2E9 000000067F000040020000A0000000640000-000000067F000040020000A0000000644000__00000038E67ABFA0 000000067F000040020000A0000000640000-000000067F000040020000A0000000644000__0000003903F1CFE8 000000067F000040020000A0000000640000-000000067F000040020000A0000000644000__0000003B99F7F8A0 000000067F000040020000A0000000640000-000000067F000040020000A0000000644000__0000005D2FFFFB38 000000067F000040020000A0000000644000-000000067F000040020000A0000000648000__00000038E67ABFA0 000000067F000040020000A0000000644000-000000067F000040020000A0000000648000__0000003903F1CFE8 000000067F000040020000A0000000644000-000000067F000040020000A0000000648000__0000003B99F7F8A0 000000067F000040020000A0000000644000-000000067F000040020000A0000000648000__0000005D2FFFFB38 000000067F000040020000A0000000644F80-000000067F000040020000A000000064D959__000000278CC7EF29-000000283C6FE2E9 000000067F000040020000A0000000648000-000000067F000040020000A000000064C000__00000038E67ABFA0 000000067F000040020000A0000000648000-000000067F000040020000A000000064C000__0000003903F1CFE8 000000067F000040020000A0000000648000-000000067F000040020000A000000064C000__0000003B99F7F8A0 000000067F000040020000A0000000648000-000000067F000040020000A000000064C000__0000005D2FFFFB38 000000067F000040020000A000000064C000-000000067F000040020000A0000000650000__00000038E67ABFA0 000000067F000040020000A000000064C000-000000067F000040020000A0000000650000__0000003903F1CFE8 000000067F000040020000A000000064C000-000000067F000040020000A0000000650000__0000003B99F7F8A0 
000000067F000040020000A000000064C000-000000067F000040020000A0000000650000__0000005D2FFFFB38 000000067F000040020000A000000064D959-000000067F000040020000A0000000656342__000000278CC7EF29-000000283C6FE2E9 000000067F000040020000A0000000650000-000000067F000040020000A0000000654000__00000038E67ABFA0 000000067F000040020000A0000000650000-000000067F000040020000A0000000654000__0000003903F1CFE8 000000067F000040020000A0000000650000-000000067F000040020000A0000000654000__0000003B99F7F8A0 000000067F000040020000A0000000650000-000000067F000040020000A0000000654000__0000005D2FFFFB38 000000067F000040020000A0000000654000-000000067F000040020000A0000000658000__00000038E67ABFA0 000000067F000040020000A0000000654000-000000067F000040020000A0000000658000__0000003903F1CFE8 000000067F000040020000A0000000654000-000000067F000040020000A0000000658000__0000003B99F7F8A0 000000067F000040020000A0000000654000-000000067F000040020000A0000000658000__0000005D2FFFFB38 000000067F000040020000A0000000656342-000000067F000040020000A000000065ED2B__000000278CC7EF29-000000283C6FE2E9 000000067F000040020000A0000000658000-000000067F000040020000A000000065C000__00000038E67ABFA0 000000067F000040020000A0000000658000-000000067F000040020000A000000065C000__0000003903F1CFE8 000000067F000040020000A0000000658000-000000067F000040020000A000000065C000__0000003B99F7F8A0 000000067F000040020000A0000000658000-000000067F000040020000A000000065C000__0000005D2FFFFB38 000000067F000040020000A000000065C000-000000067F000040020000A0000000660000__00000038E1ABFE28 000000067F000040020000A000000065C000-000000067F000040020000A0000000660000__00000038E9AF7F00 000000067F000040020000A000000065C000-000000067F000040020000A0000000660000__0000003903F1CFE8 000000067F000040020000A000000065C000-000000067F000040020000A0000000660000__0000003B99F7F8A0 000000067F000040020000A000000065C000-000000067F000040020000A0000000660000__0000005D2FFFFB38 000000067F000040020000A000000065ED2B-000000067F000040020000A0000200000000__000000278CC7EF29-000000283C6FE2E9 
000000067F000040020000A000000065EFE8-000000067F000040020000A00000006679DA__000000283C6FE2E9-00000028DC1FE6F1 000000067F000040020000A0000000660000-000000067F000040020000A0000000664000__00000038E1ABFE28 000000067F000040020000A0000000660000-000000067F000040020000A0000000664000__00000038E9AF7F00 000000067F000040020000A0000000660000-000000067F000040020000A0000000664000__0000003903F1CFE8 000000067F000040020000A0000000660000-000000067F000040020000A0000000664000__0000003B99F7F8A0 000000067F000040020000A0000000660000-000000067F000040020000A0000000664000__0000005D2FFFFB38 000000067F000040020000A0000000664000-000000067F000040020000A0000000668000__00000038E1ABFE28 000000067F000040020000A0000000664000-000000067F000040020000A0000000668000__00000038E9AF7F00 000000067F000040020000A0000000664000-000000067F000040020000A0000000668000__0000003903F1CFE8 000000067F000040020000A0000000664000-000000067F000040020000A0000000668000__0000003B99F7F8A0 000000067F000040020000A0000000664000-000000067F000040020000A0000000668000__0000005D2FFFFB38 000000067F000040020000A00000006679DA-000000067F000040020000A00000006703C5__000000283C6FE2E9-00000028DC1FE6F1 000000067F000040020000A0000000668000-000000067F000040020000A000000066C000__00000038E1ABFE28 000000067F000040020000A0000000668000-000000067F000040020000A000000066C000__00000038E9AF7F00 000000067F000040020000A0000000668000-000000067F000040020000A000000066C000__0000003903F1CFE8 000000067F000040020000A0000000668000-000000067F000040020000A000000066C000__0000003B99F7F8A0 000000067F000040020000A0000000668000-000000067F000040020000A000000066C000__0000005D2FFFFB38 000000067F000040020000A000000066C000-000000067F000040020000A0000000670000__00000038E1ABFE28 000000067F000040020000A000000066C000-000000067F000040020000A0000000670000__00000038E9AF7F00 000000067F000040020000A000000066C000-000000067F000040020000A0000000670000__0000003903F1CFE8 000000067F000040020000A000000066C000-000000067F000040020000A0000000670000__0000003B99F7F8A0 
000000067F000040020000A000000066C000-000000067F000040020000A0000000670000__0000005D2FFFFB38 000000067F000040020000A0000000670000-000000067F000040020000A0000000674000__00000038E1ABFE28 000000067F000040020000A0000000670000-000000067F000040020000A0000000674000__00000038E9AF7F00 000000067F000040020000A0000000670000-000000067F000040020000A0000000674000__0000003903F1CFE8 000000067F000040020000A0000000670000-000000067F000040020000A0000000674000__0000003B99F7F8A0 000000067F000040020000A0000000670000-000000067F000040020000A0000000674000__0000005D2FFFFB38 000000067F000040020000A00000006703C5-000000067F000040020000A0000000678D98__000000283C6FE2E9-00000028DC1FE6F1 000000067F000040020000A0000000674000-000000067F000040020000A0000000678000__00000038E1ABFE28 000000067F000040020000A0000000674000-000000067F000040020000A0000000678000__00000038E9AF7F00 000000067F000040020000A0000000674000-000000067F000040020000A0000000678000__0000003903F1CFE8 000000067F000040020000A0000000674000-000000067F000040020000A0000000678000__0000003B99F7F8A0 000000067F000040020000A0000000674000-000000067F000040020000A0000000678000__0000005D2FFFFB38 000000067F000040020000A0000000678000-000000067F000040020000A000000067C000__00000038E1ABFE28 000000067F000040020000A0000000678000-000000067F000040020000A000000067C000__00000038E9AF7F00 000000067F000040020000A0000000678000-000000067F000040020000A000000067C000__0000003903F1CFE8 000000067F000040020000A0000000678000-000000067F000040020000A000000067C000__0000003B99F7F8A0 000000067F000040020000A0000000678000-000000067F000040020000A000000067C000__0000005D2FFFFB38 000000067F000040020000A0000000678D98-000000067F000040020000A000000068175E__000000283C6FE2E9-00000028DC1FE6F1 000000067F000040020000A000000067C000-000000067F000040020000A0000000680000__00000038E1ABFE28 000000067F000040020000A000000067C000-000000067F000040020000A0000000680000__00000038E9AF7F00 000000067F000040020000A000000067C000-000000067F000040020000A0000000680000__0000003903F1CFE8 
000000067F000040020000A000000067C000-000000067F000040020000A0000000680000__0000003B99F7F8A0 000000067F000040020000A000000067C000-000000067F000040020000A0000000680000__0000005D2FFFFB38 000000067F000040020000A0000000680000-000000067F000040020000A0000000684000__00000038E1ABFE28 000000067F000040020000A0000000680000-000000067F000040020000A0000000684000__00000038E9AF7F00 000000067F000040020000A0000000680000-000000067F000040020000A0000000684000__0000003903F1CFE8 000000067F000040020000A0000000680000-000000067F000040020000A0000000684000__0000003B99F7F8A0 000000067F000040020000A0000000680000-000000067F000040020000A0000000684000__0000005D2FFFFB38 000000067F000040020000A000000068175E-000000067F000040020000A000000068A135__000000283C6FE2E9-00000028DC1FE6F1 000000067F000040020000A0000000684000-000000067F000040020000A0000000688000__00000038E1ABFE28 000000067F000040020000A0000000684000-000000067F000040020000A0000000688000__00000038E9AF7F00 000000067F000040020000A0000000684000-000000067F000040020000A0000000688000__0000003903F1CFE8 000000067F000040020000A0000000684000-000000067F000040020000A0000000688000__0000003B99F7F8A0 000000067F000040020000A0000000684000-000000067F000040020000A0000000688000__0000005D2FFFFB38 000000067F000040020000A0000000688000-000000067F000040020000A000000068C000__00000038E1ABFE28 000000067F000040020000A0000000688000-000000067F000040020000A000000068C000__00000038E9AF7F00 000000067F000040020000A0000000688000-000000067F000040020000A000000068C000__0000003903F1CFE8 000000067F000040020000A0000000688000-000000067F000040020000A000000068C000__0000003B99F7F8A0 000000067F000040020000A0000000688000-000000067F000040020000A000000068C000__0000005D2FFFFB38 000000067F000040020000A000000068A135-000000067F000040020000A0000000692B17__000000283C6FE2E9-00000028DC1FE6F1 000000067F000040020000A000000068C000-000000067F000040020000A0000000690000__00000038E1ABFE28 000000067F000040020000A000000068C000-000000067F000040020000A0000000690000__00000038E9AF7F00 
000000067F000040020000A000000068C000-000000067F000040020000A0000000690000__0000003903F1CFE8 000000067F000040020000A000000068C000-000000067F000040020000A0000000690000__0000003B99F7F8A0 000000067F000040020000A000000068C000-000000067F000040020000A0000000690000__0000005D2FFFFB38 000000067F000040020000A0000000690000-000000067F000040020000A0000000694000__00000038E1ABFE28 000000067F000040020000A0000000690000-000000067F000040020000A0000000694000__00000038E9AF7F00 000000067F000040020000A0000000690000-000000067F000040020000A0000000694000__0000003903F1CFE8 000000067F000040020000A0000000690000-000000067F000040020000A0000000694000__0000003B99F7F8A0 000000067F000040020000A0000000690000-000000067F000040020000A0000000694000__0000005D2FFFFB38 000000067F000040020000A0000000692B17-000000067F000040020000A000000069B4FC__000000283C6FE2E9-00000028DC1FE6F1 000000067F000040020000A0000000694000-000000067F000040020000A0000000698000__00000038E1ABFE28 000000067F000040020000A0000000694000-000000067F000040020000A0000000698000__00000038E9AF7F00 000000067F000040020000A0000000694000-000000067F000040020000A0000000698000__0000003903F1CFE8 000000067F000040020000A0000000694000-000000067F000040020000A0000000698000__0000003B99F7F8A0 000000067F000040020000A0000000694000-000000067F000040020000A0000000698000__0000005D2FFFFB38 000000067F000040020000A0000000698000-000000067F000040020000A000000069C000__00000038E1ABFE28 000000067F000040020000A0000000698000-000000067F000040020000A000000069C000__00000038E9AF7F00 000000067F000040020000A0000000698000-000000067F000040020000A000000069C000__0000003903F1CFE8 000000067F000040020000A0000000698000-000000067F000040020000A000000069C000__0000003B99F7F8A0 000000067F000040020000A0000000698000-000000067F000040020000A000000069C000__0000005D2FFFFB38 000000067F000040020000A000000069B4FC-000000067F000040020000A00000006A3EF3__000000283C6FE2E9-00000028DC1FE6F1 000000067F000040020000A000000069C000-000000067F000040020000A00000006A0000__00000038E1ABFE28 
000000067F000040020000A000000069C000-000000067F000040020000A00000006A0000__00000038E9AF7F00 000000067F000040020000A000000069C000-000000067F000040020000A00000006A0000__0000003903F1CFE8 000000067F000040020000A000000069C000-000000067F000040020000A00000006A0000__0000003B99F7F8A0 000000067F000040020000A000000069C000-000000067F000040020000A00000006A0000__0000005D2FFFFB38 000000067F000040020000A00000006A0000-000000067F000040020000A00000006A4000__00000038E1ABFE28 000000067F000040020000A00000006A0000-000000067F000040020000A00000006A4000__00000038E9AF7F00 000000067F000040020000A00000006A0000-000000067F000040020000A00000006A4000__0000003903F1CFE8 000000067F000040020000A00000006A0000-000000067F000040020000A00000006A4000__0000003B99F7F8A0 000000067F000040020000A00000006A0000-000000067F000040020000A00000006A4000__0000005D2FFFFB38 000000067F000040020000A00000006A3EF3-000000067F000040020000A00000006AC8DC__000000283C6FE2E9-00000028DC1FE6F1 000000067F000040020000A00000006A4000-000000067F000040020000A00000006A8000__00000038E1ABFE28 000000067F000040020000A00000006A4000-000000067F000040020000A00000006A8000__00000038E9AF7F00 000000067F000040020000A00000006A4000-000000067F000040020000A00000006A8000__0000003903F1CFE8 000000067F000040020000A00000006A4000-000000067F000040020000A00000006A8000__0000003B99F7F8A0 000000067F000040020000A00000006A4000-000000067F000040020000A00000006A8000__0000005D2FFFFB38 000000067F000040020000A00000006A8000-000000067F000040020000A00000006AC000__00000038E1ABFE28 000000067F000040020000A00000006A8000-000000067F000040020000A00000006AC000__00000038E9AF7F00 000000067F000040020000A00000006A8000-000000067F000040020000A00000006AC000__0000003903F1CFE8 000000067F000040020000A00000006A8000-000000067F000040020000A00000006AC000__0000003B99F7F8A0 000000067F000040020000A00000006A8000-000000067F000040020000A00000006AC000__0000005D2FFFFB38 000000067F000040020000A00000006AC000-000000067F000040020000A00000006B0000__00000038E1ABFE28 
000000067F000040020000A00000006AC000-000000067F000040020000A00000006B0000__00000038E9AF7F00 000000067F000040020000A00000006AC000-000000067F000040020000A00000006B0000__0000003903F1CFE8 000000067F000040020000A00000006AC000-000000067F000040020000A00000006B0000__0000003B99F7F8A0 000000067F000040020000A00000006AC000-000000067F000040020000A00000006B0000__0000005D2FFFFB38 000000067F000040020000A00000006AC8DC-000000067F000040020000A00000006B52A7__000000283C6FE2E9-00000028DC1FE6F1 000000067F000040020000A00000006B0000-000000067F000040020000A00000006B4000__00000038E1ABFE28 000000067F000040020000A00000006B0000-000000067F000040020000A00000006B4000__00000038E9AF7F00 000000067F000040020000A00000006B0000-000000067F000040020000A00000006B4000__0000003903F1CFE8 000000067F000040020000A00000006B0000-000000067F000040020000A00000006B4000__0000003B99F7F8A0 000000067F000040020000A00000006B0000-000000067F000040020000A00000006B4000__0000005D2FFFFB38 000000067F000040020000A00000006B4000-000000067F000040020000A00000006B8000__00000029BBAFEDD8 000000067F000040020000A00000006B4000-000000067F000040020000A00000006B8000__00000038E9AF7F00 000000067F000040020000A00000006B4000-000000067F000040020000A00000006B8000__0000003903F1CFE8 000000067F000040020000A00000006B4000-000000067F000040020000A00000006B8000__0000003B99F7F8A0 000000067F000040020000A00000006B4000-000000067F000040020000A00000006B8000__0000005D2FFFFB38 000000067F000040020000A00000006B52A7-000000067F000040020000A0000200000000__000000283C6FE2E9-00000028DC1FE6F1 000000067F000040020000A00000006B543B-000000067F000040020000A00000006BDDFA__00000028DC1FE6F1-000000298BC7EAE1 000000067F000040020000A00000006B8000-000000067F000040020000A00000006BC000__00000029BBAFEDD8 000000067F000040020000A00000006B8000-000000067F000040020000A00000006BC000__00000038E9AF7F00 000000067F000040020000A00000006B8000-000000067F000040020000A00000006BC000__0000003903F1CFE8 000000067F000040020000A00000006B8000-000000067F000040020000A00000006BC000__0000003B99F7F8A0 
000000067F000040020000A00000006B8000-000000067F000040020000A00000006BC000__0000005D2FFFFB38 000000067F000040020000A00000006BC000-000000067F000040020000A00000006C0000__00000029BBAFEDD8 000000067F000040020000A00000006BC000-000000067F000040020000A00000006C0000__00000038E9AF7F00 000000067F000040020000A00000006BC000-000000067F000040020000A00000006C0000__0000003903F1CFE8 000000067F000040020000A00000006BC000-000000067F000040020000A00000006C0000__0000003B99F7F8A0 000000067F000040020000A00000006BC000-000000067F000040020000A00000006C0000__0000005D2FFFFB38 000000067F000040020000A00000006BDDFA-000000067F000040020000A00000006C67E9__00000028DC1FE6F1-000000298BC7EAE1 000000067F000040020000A00000006C0000-000000067F000040020000A00000006C4000__00000029BBAFEDD8 000000067F000040020000A00000006C0000-000000067F000040020000A00000006C4000__00000038E9AF7F00 000000067F000040020000A00000006C0000-000000067F000040020000A00000006C4000__0000003903F1CFE8 000000067F000040020000A00000006C0000-000000067F000040020000A00000006C4000__0000003B99F7F8A0 000000067F000040020000A00000006C0000-000000067F000040020000A00000006C4000__0000005D2FFFFB38 000000067F000040020000A00000006C4000-000000067F000040020000A00000006C8000__00000029BBAFEDD8 000000067F000040020000A00000006C4000-000000067F000040020000A00000006C8000__00000038E9AF7F00 000000067F000040020000A00000006C4000-000000067F000040020000A00000006C8000__0000003903F1CFE8 000000067F000040020000A00000006C4000-000000067F000040020000A00000006C8000__0000003B99F7F8A0 000000067F000040020000A00000006C4000-000000067F000040020000A00000006C8000__0000005D2FFFFB38 000000067F000040020000A00000006C67E9-000000067F000040020000A00000006CF1D5__00000028DC1FE6F1-000000298BC7EAE1 000000067F000040020000A00000006C8000-000000067F000040020000A00000006CC000__00000029BBAFEDD8 000000067F000040020000A00000006C8000-000000067F000040020000A00000006CC000__00000038E9AF7F00 000000067F000040020000A00000006C8000-000000067F000040020000A00000006CC000__0000003903F1CFE8 
000000067F000040020000A00000006C8000-000000067F000040020000A00000006CC000__0000003B99F7F8A0 000000067F000040020000A00000006C8000-000000067F000040020000A00000006CC000__0000005D2FFFFB38 000000067F000040020000A00000006CC000-000000067F000040020000A00000006D0000__00000029BBAFEDD8 000000067F000040020000A00000006CC000-000000067F000040020000A00000006D0000__00000038E9AF7F00 000000067F000040020000A00000006CC000-000000067F000040020000A00000006D0000__0000003903F1CFE8 000000067F000040020000A00000006CC000-000000067F000040020000A00000006D0000__0000003B99F7F8A0 000000067F000040020000A00000006CC000-000000067F000040020000A00000006D0000__0000005D2FFFFB38 000000067F000040020000A00000006CF1D5-000000067F000040020000A00000006D7BC5__00000028DC1FE6F1-000000298BC7EAE1 000000067F000040020000A00000006D0000-000000067F000040020000A00000006D4000__00000029BBAFEDD8 000000067F000040020000A00000006D0000-000000067F000040020000A00000006D4000__00000038E9AF7F00 000000067F000040020000A00000006D0000-000000067F000040020000A00000006D4000__0000003903F1CFE8 000000067F000040020000A00000006D0000-000000067F000040020000A00000006D4000__0000003B99F7F8A0 000000067F000040020000A00000006D0000-000000067F000040020000A00000006D4000__0000005D2FFFFB38 000000067F000040020000A00000006D4000-000000067F000040020000A00000006D8000__00000029BBAFEDD8 000000067F000040020000A00000006D4000-000000067F000040020000A00000006D8000__00000038E9AF7F00 000000067F000040020000A00000006D4000-000000067F000040020000A00000006D8000__0000003903F1CFE8 000000067F000040020000A00000006D4000-000000067F000040020000A00000006D8000__0000003B99F7F8A0 000000067F000040020000A00000006D4000-000000067F000040020000A00000006D8000__0000005D2FFFFB38 000000067F000040020000A00000006D7BC5-000000067F000040020000A00000006E05B2__00000028DC1FE6F1-000000298BC7EAE1 000000067F000040020000A00000006D8000-000000067F000040020000A00000006DC000__00000029BBAFEDD8 000000067F000040020000A00000006D8000-000000067F000040020000A00000006DC000__00000038E9AF7F00 
000000067F000040020000A00000006D8000-000000067F000040020000A00000006DC000__0000003903F1CFE8 000000067F000040020000A00000006D8000-000000067F000040020000A00000006DC000__0000003B99F7F8A0 000000067F000040020000A00000006D8000-000000067F000040020000A00000006DC000__0000005D2FFFFB38 000000067F000040020000A00000006DC000-000000067F000040020000A00000006E0000__00000029BBAFEDD8 000000067F000040020000A00000006DC000-000000067F000040020000A00000006E0000__00000038E9AF7F00 000000067F000040020000A00000006DC000-000000067F000040020000A00000006E0000__0000003903F1CFE8 000000067F000040020000A00000006DC000-000000067F000040020000A00000006E0000__0000003B99F7F8A0 000000067F000040020000A00000006DC000-000000067F000040020000A00000006E0000__0000005D2FFFFB38 000000067F000040020000A00000006E0000-000000067F000040020000A00000006E4000__00000029BBAFEDD8 000000067F000040020000A00000006E0000-000000067F000040020000A00000006E4000__00000038E9AF7F00 000000067F000040020000A00000006E0000-000000067F000040020000A00000006E4000__0000003903F1CFE8 000000067F000040020000A00000006E0000-000000067F000040020000A00000006E4000__0000003B99F7F8A0 000000067F000040020000A00000006E0000-000000067F000040020000A00000006E4000__0000005D2FFFFB38 000000067F000040020000A00000006E05B2-000000067F000040020000A00000006E8F91__00000028DC1FE6F1-000000298BC7EAE1 000000067F000040020000A00000006E4000-000000067F000040020000A00000006E8000__00000029BBAFEDD8 000000067F000040020000A00000006E4000-000000067F000040020000A00000006E8000__00000038E9AF7F00 000000067F000040020000A00000006E4000-000000067F000040020000A00000006E8000__0000003903F1CFE8 000000067F000040020000A00000006E4000-000000067F000040020000A00000006E8000__0000003B99F7F8A0 000000067F000040020000A00000006E4000-000000067F000040020000A00000006E8000__0000005D2FFFFB38 000000067F000040020000A00000006E8000-000000067F000040020000A00000006EC000__00000029BBAFEDD8 000000067F000040020000A00000006E8000-000000067F000040020000A00000006EC000__00000038E9AF7F00 
000000067F000040020000A00000006E8000-000000067F000040020000A00000006EC000__0000003903F1CFE8 000000067F000040020000A00000006E8000-000000067F000040020000A00000006EC000__0000003B99F7F8A0 000000067F000040020000A00000006E8000-000000067F000040020000A00000006EC000__0000005D2FFFFB38 000000067F000040020000A00000006E8F91-000000067F000040020000A00000006F195B__00000028DC1FE6F1-000000298BC7EAE1 000000067F000040020000A00000006EC000-000000067F000040020000A00000006F0000__00000029BBAFEDD8 000000067F000040020000A00000006EC000-000000067F000040020000A00000006F0000__00000038E9AF7F00 000000067F000040020000A00000006EC000-000000067F000040020000A00000006F0000__0000003903F1CFE8 000000067F000040020000A00000006EC000-000000067F000040020000A00000006F0000__0000003B99F7F8A0 000000067F000040020000A00000006EC000-000000067F000040020000A00000006F0000__0000005D2FFFFB38 000000067F000040020000A00000006F0000-000000067F000040020000A00000006F4000__00000029BBAFEDD8 000000067F000040020000A00000006F0000-000000067F000040020000A00000006F4000__00000038E9AF7F00 000000067F000040020000A00000006F0000-000000067F000040020000A00000006F4000__0000003903F1CFE8 000000067F000040020000A00000006F0000-000000067F000040020000A00000006F4000__0000003B99F7F8A0 000000067F000040020000A00000006F0000-000000067F000040020000A00000006F4000__0000005D2FFFFB38 000000067F000040020000A00000006F195B-000000067F000040020000A00000006FA318__00000028DC1FE6F1-000000298BC7EAE1 000000067F000040020000A00000006F4000-000000067F000040020000A00000006F8000__00000029BBAFEDD8 000000067F000040020000A00000006F4000-000000067F000040020000A00000006F8000__00000038E9AF7F00 000000067F000040020000A00000006F4000-000000067F000040020000A00000006F8000__0000003903F1CFE8 000000067F000040020000A00000006F4000-000000067F000040020000A00000006F8000__0000003B99F7F8A0 000000067F000040020000A00000006F4000-000000067F000040020000A00000006F8000__0000005D2FFFFB38 000000067F000040020000A00000006F8000-000000067F000040020000A00000006FC000__00000029BBAFEDD8 
000000067F000040020000A00000006F8000-000000067F000040020000A00000006FC000__00000038E9AF7F00 000000067F000040020000A00000006F8000-000000067F000040020000A00000006FC000__0000003903F1CFE8 000000067F000040020000A00000006F8000-000000067F000040020000A00000006FC000__0000003B99F7F8A0 000000067F000040020000A00000006F8000-000000067F000040020000A00000006FC000__0000005D2FFFFB38 000000067F000040020000A00000006FA318-000000067F000040020000A0000000702D03__00000028DC1FE6F1-000000298BC7EAE1 000000067F000040020000A00000006FC000-000000067F000040020000A0000000700000__00000029BBAFEDD8 000000067F000040020000A00000006FC000-000000067F000040020000A0000000700000__00000038E9AF7F00 000000067F000040020000A00000006FC000-000000067F000040020000A0000000700000__0000003903F1CFE8 000000067F000040020000A00000006FC000-000000067F000040020000A0000000700000__0000003B99F7F8A0 000000067F000040020000A00000006FC000-000000067F000040020000A0000000700000__0000005D2FFFFB38 000000067F000040020000A0000000700000-000000067F000040020000A0000000704000__00000029BBAFEDD8 000000067F000040020000A0000000700000-000000067F000040020000A0000000704000__00000038E9AF7F00 000000067F000040020000A0000000700000-000000067F000040020000A0000000704000__0000003903F1CFE8 000000067F000040020000A0000000700000-000000067F000040020000A0000000704000__0000003B99F7F8A0 000000067F000040020000A0000000700000-000000067F000040020000A0000000704000__0000005D2FFFFB38 000000067F000040020000A0000000702D03-000000067F000040020000A000000070B6E2__00000028DC1FE6F1-000000298BC7EAE1 000000067F000040020000A0000000704000-000000067F000040020000A0000000708000__00000029BBAFEDD8 000000067F000040020000A0000000704000-000000067F000040020000A0000000708000__00000038E9AF7F00 000000067F000040020000A0000000704000-000000067F000040020000A0000000708000__0000003903F1CFE8 000000067F000040020000A0000000704000-000000067F000040020000A0000000708000__0000003B99F7F8A0 000000067F000040020000A0000000704000-000000067F000040020000A0000000708000__0000005D2FFFFB38 
000000067F000040020000A0000000708000-000000067F000040020000A000000070C000__00000029BBAFEDD8 000000067F000040020000A0000000708000-000000067F000040020000A000000070C000__00000038E9AF7F00 000000067F000040020000A0000000708000-000000067F000040020000A000000070C000__0000003903F1CFE8 000000067F000040020000A0000000708000-000000067F000040020000A000000070C000__0000003B99F7F8A0 000000067F000040020000A0000000708000-000000067F000040020000A000000070C000__0000005D2FFFFB38 000000067F000040020000A000000070B6E2-000000067F000040020000A00000007140D2__00000028DC1FE6F1-000000298BC7EAE1 000000067F000040020000A000000070C000-000000067F000040020000A0000000710000__00000029BBAFEDD8 000000067F000040020000A000000070C000-000000067F000040020000A0000000710000__00000038E9AF7F00 000000067F000040020000A000000070C000-000000067F000040020000A0000000710000__0000003903F1CFE8 000000067F000040020000A000000070C000-000000067F000040020000A0000000710000__0000003B99F7F8A0 000000067F000040020000A000000070C000-000000067F000040020000A0000000710000__0000005D2FFFFB38 000000067F000040020000A0000000710000-000000067F000040020000A0000000714000__00000029BBAFEDD8 000000067F000040020000A0000000710000-000000067F000040020000A0000000714000__00000038E9AF7F00 000000067F000040020000A0000000710000-000000067F000040020000A0000000714000__0000003903F1CFE8 000000067F000040020000A0000000710000-000000067F000040020000A0000000714000__0000003B99F7F8A0 000000067F000040020000A0000000710000-000000067F000040020000A0000000714000__0000005D2FFFFB38 000000067F000040020000A0000000714000-000000067F000040020000A0000000718000__00000029BBAFEDD8 000000067F000040020000A0000000714000-000000067F000040020000A0000000718000__00000038E67ABFA0 000000067F000040020000A0000000714000-000000067F000040020000A0000000718000__0000003903F1CFE8 000000067F000040020000A0000000714000-000000067F000040020000A0000000718000__0000003B99F7F8A0 000000067F000040020000A0000000714000-000000067F000040020000A0000000718000__0000005D2FFFFB38 
000000067F000040020000A00000007140D2-000000067F000040020000A0000200000000__00000028DC1FE6F1-000000298BC7EAE1 000000067F000040020000A0000000714378-000000067F000040020000A000000071CD56__000000298BC7EAE1-0000002A3B6FD871 000000067F000040020000A0000000718000-000000067F000040020000A000000071C000__00000029BBAFEDD8 000000067F000040020000A0000000718000-000000067F000040020000A000000071C000__00000038E67ABFA0 000000067F000040020000A0000000718000-000000067F000040020000A000000071C000__0000003903F1CFE8 000000067F000040020000A0000000718000-000000067F000040020000A000000071C000__0000003B99F7F8A0 000000067F000040020000A0000000718000-000000067F000040020000A000000071C000__0000005D2FFFFB38 000000067F000040020000A000000071C000-000000067F000040020000A0000000720000__00000029BBAFEDD8 000000067F000040020000A000000071C000-000000067F000040020000A0000000720000__00000038E67ABFA0 000000067F000040020000A000000071C000-000000067F000040020000A0000000720000__0000003903F1CFE8 000000067F000040020000A000000071C000-000000067F000040020000A0000000720000__0000003B99F7F8A0 000000067F000040020000A000000071C000-000000067F000040020000A0000000720000__0000005D2FFFFB38 000000067F000040020000A000000071CD56-000000067F000040020000A0000000725723__000000298BC7EAE1-0000002A3B6FD871 000000067F000040020000A0000000720000-000000067F000040020000A0000000724000__00000029BBAFEDD8 000000067F000040020000A0000000720000-000000067F000040020000A0000000724000__00000038E67ABFA0 000000067F000040020000A0000000720000-000000067F000040020000A0000000724000__0000003903F1CFE8 000000067F000040020000A0000000720000-000000067F000040020000A0000000724000__0000003B99F7F8A0 000000067F000040020000A0000000720000-000000067F000040020000A0000000724000__0000005D2FFFFB38 000000067F000040020000A0000000724000-000000067F000040020000A0000000728000__00000029BBAFEDD8 000000067F000040020000A0000000724000-000000067F000040020000A0000000728000__00000038E67ABFA0 000000067F000040020000A0000000724000-000000067F000040020000A0000000728000__0000003903F1CFE8 
000000067F000040020000A0000000724000-000000067F000040020000A0000000728000__0000003B99F7F8A0 000000067F000040020000A0000000724000-000000067F000040020000A0000000728000__0000005D2FFFFB38 000000067F000040020000A0000000725723-000000067F000040020000A000000072E0E0__000000298BC7EAE1-0000002A3B6FD871 000000067F000040020000A0000000728000-000000067F000040020000A000000072C000__00000029BBAFEDD8 000000067F000040020000A0000000728000-000000067F000040020000A000000072C000__00000038E67ABFA0 000000067F000040020000A0000000728000-000000067F000040020000A000000072C000__0000003903F1CFE8 000000067F000040020000A0000000728000-000000067F000040020000A000000072C000__0000003B99F7F8A0 000000067F000040020000A0000000728000-000000067F000040020000A000000072C000__0000005D2FFFFB38 000000067F000040020000A000000072C000-000000067F000040020000A0000000730000__00000038E67ABFA0 000000067F000040020000A000000072C000-000000067F000040020000A0000000730000__0000003903F1CFE8 000000067F000040020000A000000072C000-000000067F000040020000A0000000730000__0000003B99F7F8A0 000000067F000040020000A000000072C000-000000067F000040020000A0000000730000__0000005D2FFFFB38 000000067F000040020000A000000072C000-030000000000000000000000000000000002__00000029BBAFEDD8 000000067F000040020000A000000072E0E0-000000067F000040020000A0000000736AB6__000000298BC7EAE1-0000002A3B6FD871 000000067F000040020000A0000000730000-000000067F000040020000A0000000734000__00000038E67ABFA0 000000067F000040020000A0000000730000-000000067F000040020000A0000000734000__0000003903F1CFE8 000000067F000040020000A0000000730000-000000067F000040020000A0000000734000__0000003B99F7F8A0 000000067F000040020000A0000000730000-000000067F000040020000A0000000734000__0000005D2FFFFB38 000000067F000040020000A0000000734000-000000067F000040020000A0000000738000__00000038E67ABFA0 000000067F000040020000A0000000734000-000000067F000040020000A0000000738000__0000003903F1CFE8 000000067F000040020000A0000000734000-000000067F000040020000A0000000738000__0000003B99F7F8A0 
000000067F000040020000A0000000734000-000000067F000040020000A0000000738000__0000005D2FFFFB38 000000067F000040020000A0000000736AB6-000000067F000040020000A000000073F4A6__000000298BC7EAE1-0000002A3B6FD871 000000067F000040020000A0000000738000-000000067F000040020000A000000073C000__00000038E67ABFA0 000000067F000040020000A0000000738000-000000067F000040020000A000000073C000__0000003903F1CFE8 000000067F000040020000A0000000738000-000000067F000040020000A000000073C000__0000003B99F7F8A0 000000067F000040020000A0000000738000-000000067F000040020000A000000073C000__0000005D2FFFFB38 000000067F000040020000A000000073C000-000000067F000040020000A0000000740000__00000038E67ABFA0 000000067F000040020000A000000073C000-000000067F000040020000A0000000740000__0000003903F1CFE8 000000067F000040020000A000000073C000-000000067F000040020000A0000000740000__0000003B99F7F8A0 000000067F000040020000A000000073C000-000000067F000040020000A0000000740000__0000005D2FFFFB38 000000067F000040020000A000000073F4A6-000000067F000040020000A0000000747E87__000000298BC7EAE1-0000002A3B6FD871 000000067F000040020000A0000000740000-000000067F000040020000A0000000744000__00000038E67ABFA0 000000067F000040020000A0000000740000-000000067F000040020000A0000000744000__0000003903F1CFE8 000000067F000040020000A0000000740000-000000067F000040020000A0000000744000__0000003B99F7F8A0 000000067F000040020000A0000000740000-000000067F000040020000A0000000744000__0000005D2FFFFB38 000000067F000040020000A0000000744000-000000067F000040020000A0000000748000__00000038E67ABFA0 000000067F000040020000A0000000744000-000000067F000040020000A0000000748000__0000003903F1CFE8 000000067F000040020000A0000000744000-000000067F000040020000A0000000748000__0000003B99F7F8A0 000000067F000040020000A0000000744000-000000067F000040020000A0000000748000__0000005D2FFFFB38 000000067F000040020000A0000000747E87-000000067F000040020000A0000000750874__000000298BC7EAE1-0000002A3B6FD871 000000067F000040020000A0000000748000-000000067F000040020000A000000074C000__00000038E67ABFA0 
000000067F000040020000A0000000748000-000000067F000040020000A000000074C000__0000003903F1CFE8 000000067F000040020000A0000000748000-000000067F000040020000A000000074C000__0000003B99F7F8A0 000000067F000040020000A0000000748000-000000067F000040020000A000000074C000__0000005D2FFFFB38 000000067F000040020000A000000074C000-000000067F000040020000A0000000750000__00000038E67ABFA0 000000067F000040020000A000000074C000-000000067F000040020000A0000000750000__0000003903F1CFE8 000000067F000040020000A000000074C000-000000067F000040020000A0000000750000__0000003B99F7F8A0 000000067F000040020000A000000074C000-000000067F000040020000A0000000750000__0000005D2FFFFB38 000000067F000040020000A0000000750000-000000067F000040020000A0000000754000__00000038E67ABFA0 000000067F000040020000A0000000750000-000000067F000040020000A0000000754000__0000003903F1CFE8 000000067F000040020000A0000000750000-000000067F000040020000A0000000754000__0000003B99F7F8A0 000000067F000040020000A0000000750000-000000067F000040020000A0000000754000__0000005D2FFFFB38 000000067F000040020000A0000000750874-000000067F000040020000A0000000759257__000000298BC7EAE1-0000002A3B6FD871 000000067F000040020000A0000000754000-000000067F000040020000A0000000758000__00000038E67ABFA0 000000067F000040020000A0000000754000-000000067F000040020000A0000000758000__0000003903F1CFE8 000000067F000040020000A0000000754000-000000067F000040020000A0000000758000__0000003B99F7F8A0 000000067F000040020000A0000000754000-000000067F000040020000A0000000758000__0000005D2FFFFB38 000000067F000040020000A0000000758000-000000067F000040020000A000000075C000__00000038E67ABFA0 000000067F000040020000A0000000758000-000000067F000040020000A000000075C000__0000003903F1CFE8 000000067F000040020000A0000000758000-000000067F000040020000A000000075C000__0000003B99F7F8A0 000000067F000040020000A0000000758000-000000067F000040020000A000000075C000__0000005D2FFFFB38 000000067F000040020000A0000000759257-000000067F000040020000A0000000761C22__000000298BC7EAE1-0000002A3B6FD871 
000000067F000040020000A000000075C000-000000067F000040020000A0000000760000__00000038E67ABFA0 000000067F000040020000A000000075C000-000000067F000040020000A0000000760000__0000003903F1CFE8 000000067F000040020000A000000075C000-000000067F000040020000A0000000760000__0000003B99F7F8A0 000000067F000040020000A000000075C000-000000067F000040020000A0000000760000__0000005D2FFFFB38 000000067F000040020000A0000000760000-000000067F000040020000A0000000764000__00000038E67ABFA0 000000067F000040020000A0000000760000-000000067F000040020000A0000000764000__0000003903F1CFE8 000000067F000040020000A0000000760000-000000067F000040020000A0000000764000__0000003B99F7F8A0 000000067F000040020000A0000000760000-000000067F000040020000A0000000764000__0000005D2FFFFB38 000000067F000040020000A0000000761C22-000000067F000040020000A000000076A5F3__000000298BC7EAE1-0000002A3B6FD871 000000067F000040020000A0000000764000-000000067F000040020000A0000000768000__00000038E67ABFA0 000000067F000040020000A0000000764000-000000067F000040020000A0000000768000__0000003903F1CFE8 000000067F000040020000A0000000764000-000000067F000040020000A0000000768000__0000003B99F7F8A0 000000067F000040020000A0000000764000-000000067F000040020000A0000000768000__0000005D2FFFFB38 000000067F000040020000A0000000768000-000000067F000040020000A000000076C000__00000038E67ABFA0 000000067F000040020000A0000000768000-000000067F000040020000A000000076C000__0000003903F1CFE8 000000067F000040020000A0000000768000-000000067F000040020000A000000076C000__0000003B99F7F8A0 000000067F000040020000A0000000768000-000000067F000040020000A000000076C000__0000005D2FFFFB38 000000067F000040020000A000000076A5F3-000000067F000040020000A0000000772FD6__000000298BC7EAE1-0000002A3B6FD871 000000067F000040020000A000000076C000-000000067F000040020000A0000000770000__00000038E67ABFA0 000000067F000040020000A000000076C000-000000067F000040020000A0000000770000__0000003903F1CFE8 000000067F000040020000A000000076C000-000000067F000040020000A0000000770000__0000003B99F7F8A0 
000000067F000040020000A000000076C000-000000067F000040020000A0000000770000__0000005D2FFFFB38 000000067F000040020000A0000000770000-000000067F000040020000A0000000774000__00000038E1ABFE28 000000067F000040020000A0000000770000-000000067F000040020000A0000000774000__00000038E9AF7F00 000000067F000040020000A0000000770000-000000067F000040020000A0000000774000__0000003903F1CFE8 000000067F000040020000A0000000770000-000000067F000040020000A0000000774000__0000003B99F7F8A0 000000067F000040020000A0000000770000-000000067F000040020000A0000000774000__0000005D2FFFFB38 000000067F000040020000A0000000772FD6-000000067F000040020000A0000200000000__000000298BC7EAE1-0000002A3B6FD871 000000067F000040020000A00000007731C3-000000067F000040020000A000000077BBA6__0000002A3B6FD871-0000002ADB1FF0A9 000000067F000040020000A0000000774000-000000067F000040020000A0000000778000__00000038E1ABFE28 000000067F000040020000A0000000774000-000000067F000040020000A0000000778000__00000038E9AF7F00 000000067F000040020000A0000000774000-000000067F000040020000A0000000778000__0000003903F1CFE8 000000067F000040020000A0000000774000-000000067F000040020000A0000000778000__0000003B99F7F8A0 000000067F000040020000A0000000774000-000000067F000040020000A0000000778000__0000005D2FFFFB38 000000067F000040020000A0000000778000-000000067F000040020000A000000077C000__00000038E1ABFE28 000000067F000040020000A0000000778000-000000067F000040020000A000000077C000__00000038E9AF7F00 000000067F000040020000A0000000778000-000000067F000040020000A000000077C000__0000003903F1CFE8 000000067F000040020000A0000000778000-000000067F000040020000A000000077C000__0000003B99F7F8A0 000000067F000040020000A0000000778000-000000067F000040020000A000000077C000__0000005D2FFFFB38 000000067F000040020000A000000077BBA6-000000067F000040020000A0000000784582__0000002A3B6FD871-0000002ADB1FF0A9 000000067F000040020000A000000077C000-000000067F000040020000A0000000780000__00000038E1ABFE28 000000067F000040020000A000000077C000-000000067F000040020000A0000000780000__00000038E9AF7F00 
000000067F000040020000A000000077C000-000000067F000040020000A0000000780000__0000003903F1CFE8 000000067F000040020000A000000077C000-000000067F000040020000A0000000780000__0000003B99F7F8A0 000000067F000040020000A000000077C000-000000067F000040020000A0000000780000__0000005D2FFFFB38 000000067F000040020000A0000000780000-000000067F000040020000A0000000784000__00000038E1ABFE28 000000067F000040020000A0000000780000-000000067F000040020000A0000000784000__00000038E9AF7F00 000000067F000040020000A0000000780000-000000067F000040020000A0000000784000__0000003903F1CFE8 000000067F000040020000A0000000780000-000000067F000040020000A0000000784000__0000003B99F7F8A0 000000067F000040020000A0000000780000-000000067F000040020000A0000000784000__0000005D2FFFFB38 000000067F000040020000A0000000784000-000000067F000040020000A0000000788000__00000038E1ABFE28 000000067F000040020000A0000000784000-000000067F000040020000A0000000788000__00000038E9AF7F00 000000067F000040020000A0000000784000-000000067F000040020000A0000000788000__0000003903F1CFE8 000000067F000040020000A0000000784000-000000067F000040020000A0000000788000__0000003B99F7F8A0 000000067F000040020000A0000000784000-000000067F000040020000A0000000788000__0000005D2FFFFB38 000000067F000040020000A0000000784582-000000067F000040020000A000000078CF68__0000002A3B6FD871-0000002ADB1FF0A9 000000067F000040020000A0000000788000-000000067F000040020000A000000078C000__00000038E1ABFE28 000000067F000040020000A0000000788000-000000067F000040020000A000000078C000__00000038E9AF7F00 000000067F000040020000A0000000788000-000000067F000040020000A000000078C000__0000003903F1CFE8 000000067F000040020000A0000000788000-000000067F000040020000A000000078C000__0000003B99F7F8A0 000000067F000040020000A0000000788000-000000067F000040020000A000000078C000__0000005D2FFFFB38 000000067F000040020000A000000078C000-000000067F000040020000A0000000790000__00000038E1ABFE28 000000067F000040020000A000000078C000-000000067F000040020000A0000000790000__00000038E9AF7F00 
000000067F000040020000A000000078C000-000000067F000040020000A0000000790000__0000003903F1CFE8 000000067F000040020000A000000078C000-000000067F000040020000A0000000790000__0000003B99F7F8A0 000000067F000040020000A000000078C000-000000067F000040020000A0000000790000__0000005D2FFFFB38 000000067F000040020000A000000078CF68-000000067F000040020000A0000000795940__0000002A3B6FD871-0000002ADB1FF0A9 000000067F000040020000A0000000790000-000000067F000040020000A0000000794000__00000038E1ABFE28 000000067F000040020000A0000000790000-000000067F000040020000A0000000794000__00000038E9AF7F00 000000067F000040020000A0000000790000-000000067F000040020000A0000000794000__0000003903F1CFE8 000000067F000040020000A0000000790000-000000067F000040020000A0000000794000__0000003B99F7F8A0 000000067F000040020000A0000000790000-000000067F000040020000A0000000794000__0000005D2FFFFB38 000000067F000040020000A0000000794000-000000067F000040020000A0000000798000__00000038E1ABFE28 000000067F000040020000A0000000794000-000000067F000040020000A0000000798000__00000038E9AF7F00 000000067F000040020000A0000000794000-000000067F000040020000A0000000798000__0000003903F1CFE8 000000067F000040020000A0000000794000-000000067F000040020000A0000000798000__0000003B99F7F8A0 000000067F000040020000A0000000794000-000000067F000040020000A0000000798000__0000005D2FFFFB38 000000067F000040020000A0000000795940-000000067F000040020000A000000079E314__0000002A3B6FD871-0000002ADB1FF0A9 000000067F000040020000A0000000798000-000000067F000040020000A000000079C000__00000038E1ABFE28 000000067F000040020000A0000000798000-000000067F000040020000A000000079C000__00000038E9AF7F00 000000067F000040020000A0000000798000-000000067F000040020000A000000079C000__0000003903F1CFE8 000000067F000040020000A0000000798000-000000067F000040020000A000000079C000__0000003B99F7F8A0 000000067F000040020000A0000000798000-000000067F000040020000A000000079C000__0000005D2FFFFB38 000000067F000040020000A000000079C000-000000067F000040020000A00000007A0000__00000038E1ABFE28 
000000067F000040020000A000000079C000-000000067F000040020000A00000007A0000__00000038E9AF7F00 000000067F000040020000A000000079C000-000000067F000040020000A00000007A0000__0000003903F1CFE8 000000067F000040020000A000000079C000-000000067F000040020000A00000007A0000__0000003B99F7F8A0 000000067F000040020000A000000079C000-000000067F000040020000A00000007A0000__0000005D2FFFFB38 000000067F000040020000A000000079E314-000000067F000040020000A00000007A6CDE__0000002A3B6FD871-0000002ADB1FF0A9 000000067F000040020000A00000007A0000-000000067F000040020000A00000007A4000__00000038E1ABFE28 000000067F000040020000A00000007A0000-000000067F000040020000A00000007A4000__00000038E9AF7F00 000000067F000040020000A00000007A0000-000000067F000040020000A00000007A4000__0000003903F1CFE8 000000067F000040020000A00000007A0000-000000067F000040020000A00000007A4000__0000003B99F7F8A0 000000067F000040020000A00000007A0000-000000067F000040020000A00000007A4000__0000005D2FFFFB38 000000067F000040020000A00000007A4000-000000067F000040020000A00000007A8000__00000038E1ABFE28 000000067F000040020000A00000007A4000-000000067F000040020000A00000007A8000__00000038E9AF7F00 000000067F000040020000A00000007A4000-000000067F000040020000A00000007A8000__0000003903F1CFE8 000000067F000040020000A00000007A4000-000000067F000040020000A00000007A8000__0000003B99F7F8A0 000000067F000040020000A00000007A4000-000000067F000040020000A00000007A8000__0000005D2FFFFB38 000000067F000040020000A00000007A6CDE-000000067F000040020000A00000007AF6C2__0000002A3B6FD871-0000002ADB1FF0A9 000000067F000040020000A00000007A8000-000000067F000040020000A00000007AC000__00000038E1ABFE28 000000067F000040020000A00000007A8000-000000067F000040020000A00000007AC000__00000038E9AF7F00 000000067F000040020000A00000007A8000-000000067F000040020000A00000007AC000__0000003903F1CFE8 000000067F000040020000A00000007A8000-000000067F000040020000A00000007AC000__0000003B99F7F8A0 000000067F000040020000A00000007A8000-000000067F000040020000A00000007AC000__0000005D2FFFFB38 
000000067F000040020000A00000007AC000-000000067F000040020000A00000007B0000__00000038E1ABFE28 000000067F000040020000A00000007AC000-000000067F000040020000A00000007B0000__00000038E9AF7F00 000000067F000040020000A00000007AC000-000000067F000040020000A00000007B0000__0000003903F1CFE8 000000067F000040020000A00000007AC000-000000067F000040020000A00000007B0000__0000003B99F7F8A0 000000067F000040020000A00000007AC000-000000067F000040020000A00000007B0000__0000005D2FFFFB38 000000067F000040020000A00000007AF6C2-000000067F000040020000A00000007B8090__0000002A3B6FD871-0000002ADB1FF0A9 000000067F000040020000A00000007B0000-000000067F000040020000A00000007B4000__00000038E1ABFE28 000000067F000040020000A00000007B0000-000000067F000040020000A00000007B4000__00000038E9AF7F00 000000067F000040020000A00000007B0000-000000067F000040020000A00000007B4000__0000003903F1CFE8 000000067F000040020000A00000007B0000-000000067F000040020000A00000007B4000__0000003B99F7F8A0 000000067F000040020000A00000007B0000-000000067F000040020000A00000007B4000__0000005D2FFFFB38 000000067F000040020000A00000007B4000-000000067F000040020000A00000007B8000__00000038E1ABFE28 000000067F000040020000A00000007B4000-000000067F000040020000A00000007B8000__00000038E9AF7F00 000000067F000040020000A00000007B4000-000000067F000040020000A00000007B8000__0000003903F1CFE8 000000067F000040020000A00000007B4000-000000067F000040020000A00000007B8000__0000003B99F7F8A0 000000067F000040020000A00000007B4000-000000067F000040020000A00000007B8000__0000005D2FFFFB38 000000067F000040020000A00000007B8000-000000067F000040020000A00000007BC000__00000038E1ABFE28 000000067F000040020000A00000007B8000-000000067F000040020000A00000007BC000__00000038E9AF7F00 000000067F000040020000A00000007B8000-000000067F000040020000A00000007BC000__0000003903F1CFE8 000000067F000040020000A00000007B8000-000000067F000040020000A00000007BC000__0000003B99F7F8A0 000000067F000040020000A00000007B8000-000000067F000040020000A00000007BC000__0000005D2FFFFB38 
000000067F000040020000A00000007B8090-000000067F000040020000A00000007C0A77__0000002A3B6FD871-0000002ADB1FF0A9 000000067F000040020000A00000007BC000-000000067F000040020000A00000007C0000__00000038E1ABFE28 000000067F000040020000A00000007BC000-000000067F000040020000A00000007C0000__00000038E9AF7F00 000000067F000040020000A00000007BC000-000000067F000040020000A00000007C0000__0000003903F1CFE8 000000067F000040020000A00000007BC000-000000067F000040020000A00000007C0000__0000003B99F7F8A0 000000067F000040020000A00000007BC000-000000067F000040020000A00000007C0000__0000005D2FFFFB38 000000067F000040020000A00000007C0000-000000067F000040020000A00000007C4000__00000038E1ABFE28 000000067F000040020000A00000007C0000-000000067F000040020000A00000007C4000__00000038E9AF7F00 000000067F000040020000A00000007C0000-000000067F000040020000A00000007C4000__0000003903F1CFE8 000000067F000040020000A00000007C0000-000000067F000040020000A00000007C4000__0000003B99F7F8A0 000000067F000040020000A00000007C0000-000000067F000040020000A00000007C4000__0000005D2FFFFB38 000000067F000040020000A00000007C0A77-000000067F000040020000A00000007C945A__0000002A3B6FD871-0000002ADB1FF0A9 000000067F000040020000A00000007C4000-000000067F000040020000A00000007C8000__00000038E1ABFE28 000000067F000040020000A00000007C4000-000000067F000040020000A00000007C8000__00000038E9AF7F00 000000067F000040020000A00000007C4000-000000067F000040020000A00000007C8000__0000003903F1CFE8 000000067F000040020000A00000007C4000-000000067F000040020000A00000007C8000__0000003B99F7F8A0 000000067F000040020000A00000007C4000-000000067F000040020000A00000007C8000__0000005D2FFFFB38 000000067F000040020000A00000007C8000-000000067F000040020000A00000007CC000__0000002BAAB7E320 000000067F000040020000A00000007C8000-000000067F000040020000A00000007CC000__00000038E9AF7F00 000000067F000040020000A00000007C8000-000000067F000040020000A00000007CC000__0000003903F1CFE8 000000067F000040020000A00000007C8000-000000067F000040020000A00000007CC000__0000003B99F7F8A0 
000000067F000040020000A00000007C8000-000000067F000040020000A00000007CC000__0000005D2FFFFB38 000000067F000040020000A00000007C945A-000000067F000040020000A0000200000000__0000002A3B6FD871-0000002ADB1FF0A9 000000067F000040020000A00000007C96D4-000000067F000040020000A00000007D20BA__0000002ADB1FF0A9-0000002B7ACFE3E1 000000067F000040020000A00000007CC000-000000067F000040020000A00000007D0000__0000002BAAB7E320 000000067F000040020000A00000007CC000-000000067F000040020000A00000007D0000__00000038E9AF7F00 000000067F000040020000A00000007CC000-000000067F000040020000A00000007D0000__0000003903F1CFE8 000000067F000040020000A00000007CC000-000000067F000040020000A00000007D0000__0000003B99F7F8A0 000000067F000040020000A00000007CC000-000000067F000040020000A00000007D0000__0000005D2FFFFB38 000000067F000040020000A00000007D0000-000000067F000040020000A00000007D4000__0000002BAAB7E320 000000067F000040020000A00000007D0000-000000067F000040020000A00000007D4000__00000038E9AF7F00 000000067F000040020000A00000007D0000-000000067F000040020000A00000007D4000__0000003903F1CFE8 000000067F000040020000A00000007D0000-000000067F000040020000A00000007D4000__0000003B99F7F8A0 000000067F000040020000A00000007D0000-000000067F000040020000A00000007D4000__0000005D2FFFFB38 000000067F000040020000A00000007D20BA-000000067F000040020000A00000007DAA9B__0000002ADB1FF0A9-0000002B7ACFE3E1 000000067F000040020000A00000007D4000-000000067F000040020000A00000007D8000__0000002BAAB7E320 000000067F000040020000A00000007D4000-000000067F000040020000A00000007D8000__00000038E9AF7F00 000000067F000040020000A00000007D4000-000000067F000040020000A00000007D8000__0000003903F1CFE8 000000067F000040020000A00000007D4000-000000067F000040020000A00000007D8000__0000003B99F7F8A0 000000067F000040020000A00000007D4000-000000067F000040020000A00000007D8000__0000005D2FFFFB38 000000067F000040020000A00000007D8000-000000067F000040020000A00000007DC000__0000002BAAB7E320 000000067F000040020000A00000007D8000-000000067F000040020000A00000007DC000__00000038E9AF7F00 
000000067F000040020000A00000007D8000-000000067F000040020000A00000007DC000__0000003903F1CFE8 000000067F000040020000A00000007D8000-000000067F000040020000A00000007DC000__0000003B99F7F8A0 000000067F000040020000A00000007D8000-000000067F000040020000A00000007DC000__0000005D2FFFFB38 000000067F000040020000A00000007DAA9B-000000067F000040020000A00000007E3486__0000002ADB1FF0A9-0000002B7ACFE3E1 000000067F000040020000A00000007DC000-000000067F000040020000A00000007E0000__0000002BAAB7E320 000000067F000040020000A00000007DC000-000000067F000040020000A00000007E0000__00000038E9AF7F00 000000067F000040020000A00000007DC000-000000067F000040020000A00000007E0000__0000003903F1CFE8 000000067F000040020000A00000007DC000-000000067F000040020000A00000007E0000__0000003B99F7F8A0 000000067F000040020000A00000007DC000-000000067F000040020000A00000007E0000__0000005D2FFFFB38 000000067F000040020000A00000007E0000-000000067F000040020000A00000007E4000__0000002BAAB7E320 000000067F000040020000A00000007E0000-000000067F000040020000A00000007E4000__00000038E9AF7F00 000000067F000040020000A00000007E0000-000000067F000040020000A00000007E4000__0000003903F1CFE8 000000067F000040020000A00000007E0000-000000067F000040020000A00000007E4000__0000003B99F7F8A0 000000067F000040020000A00000007E0000-000000067F000040020000A00000007E4000__0000005D2FFFFB38 000000067F000040020000A00000007E3486-000000067F000040020000A00000007EBE5F__0000002ADB1FF0A9-0000002B7ACFE3E1 000000067F000040020000A00000007E4000-000000067F000040020000A00000007E8000__0000002BAAB7E320 000000067F000040020000A00000007E4000-000000067F000040020000A00000007E8000__00000038E9AF7F00 000000067F000040020000A00000007E4000-000000067F000040020000A00000007E8000__0000003903F1CFE8 000000067F000040020000A00000007E4000-000000067F000040020000A00000007E8000__0000003B99F7F8A0 000000067F000040020000A00000007E4000-000000067F000040020000A00000007E8000__0000005D2FFFFB38 000000067F000040020000A00000007E8000-000000067F000040020000A00000007EC000__0000002BAAB7E320 
000000067F000040020000A00000007E8000-000000067F000040020000A00000007EC000__00000038E9AF7F00 000000067F000040020000A00000007E8000-000000067F000040020000A00000007EC000__0000003903F1CFE8 000000067F000040020000A00000007E8000-000000067F000040020000A00000007EC000__0000003B99F7F8A0 000000067F000040020000A00000007E8000-000000067F000040020000A00000007EC000__0000005D2FFFFB38 000000067F000040020000A00000007EBE5F-000000067F000040020000A00000007F4836__0000002ADB1FF0A9-0000002B7ACFE3E1 000000067F000040020000A00000007EC000-000000067F000040020000A00000007F0000__0000002BAAB7E320 000000067F000040020000A00000007EC000-000000067F000040020000A00000007F0000__00000038E9AF7F00 000000067F000040020000A00000007EC000-000000067F000040020000A00000007F0000__0000003903F1CFE8 000000067F000040020000A00000007EC000-000000067F000040020000A00000007F0000__0000003B99F7F8A0 000000067F000040020000A00000007EC000-000000067F000040020000A00000007F0000__0000005D2FFFFB38 000000067F000040020000A00000007F0000-000000067F000040020000A00000007F4000__0000002BAAB7E320 000000067F000040020000A00000007F0000-000000067F000040020000A00000007F4000__00000038E9AF7F00 000000067F000040020000A00000007F0000-000000067F000040020000A00000007F4000__0000003903F1CFE8 000000067F000040020000A00000007F0000-000000067F000040020000A00000007F4000__0000003B99F7F8A0 000000067F000040020000A00000007F0000-000000067F000040020000A00000007F4000__0000005D2FFFFB38 000000067F000040020000A00000007F4000-000000067F000040020000A00000007F8000__0000002BAAB7E320 000000067F000040020000A00000007F4000-000000067F000040020000A00000007F8000__00000038E9AF7F00 000000067F000040020000A00000007F4000-000000067F000040020000A00000007F8000__0000003903F1CFE8 000000067F000040020000A00000007F4000-000000067F000040020000A00000007F8000__0000003B99F7F8A0 000000067F000040020000A00000007F4000-000000067F000040020000A00000007F8000__0000005D2FFFFB38 000000067F000040020000A00000007F4836-000000067F000040020000A00000007FD216__0000002ADB1FF0A9-0000002B7ACFE3E1 
000000067F000040020000A00000007F8000-000000067F000040020000A00000007FC000__0000002BAAB7E320 000000067F000040020000A00000007F8000-000000067F000040020000A00000007FC000__00000038E9AF7F00 000000067F000040020000A00000007F8000-000000067F000040020000A00000007FC000__0000003903F1CFE8 000000067F000040020000A00000007F8000-000000067F000040020000A00000007FC000__0000003B99F7F8A0 000000067F000040020000A00000007F8000-000000067F000040020000A00000007FC000__0000005D2FFFFB38 000000067F000040020000A00000007FC000-000000067F000040020000A0000000800000__0000002BAAB7E320 000000067F000040020000A00000007FC000-000000067F000040020000A0000000800000__00000038E9AF7F00 000000067F000040020000A00000007FC000-000000067F000040020000A0000000800000__0000003903F1CFE8 000000067F000040020000A00000007FC000-000000067F000040020000A0000000800000__0000003B99F7F8A0 000000067F000040020000A00000007FC000-000000067F000040020000A0000000800000__0000005D2FFFFB38 000000067F000040020000A00000007FD216-000000067F000040020000A0000000805BEF__0000002ADB1FF0A9-0000002B7ACFE3E1 000000067F000040020000A0000000800000-000000067F000040020000A0000000804000__0000002BAAB7E320 000000067F000040020000A0000000800000-000000067F000040020000A0000000804000__00000038E9AF7F00 000000067F000040020000A0000000800000-000000067F000040020000A0000000804000__0000003903F1CFE8 000000067F000040020000A0000000800000-000000067F000040020000A0000000804000__0000003B99F7F8A0 000000067F000040020000A0000000800000-000000067F000040020000A0000000804000__0000005D2FFFFB38 000000067F000040020000A0000000804000-000000067F000040020000A0000000808000__0000002BAAB7E320 000000067F000040020000A0000000804000-000000067F000040020000A0000000808000__00000038E9AF7F00 000000067F000040020000A0000000804000-000000067F000040020000A0000000808000__0000003903F1CFE8 000000067F000040020000A0000000804000-000000067F000040020000A0000000808000__0000003B99F7F8A0 000000067F000040020000A0000000804000-000000067F000040020000A0000000808000__0000005D2FFFFB38 
000000067F000040020000A0000000805BEF-000000067F000040020000A000000080E5CA__0000002ADB1FF0A9-0000002B7ACFE3E1 000000067F000040020000A0000000808000-000000067F000040020000A000000080C000__0000002BAAB7E320 000000067F000040020000A0000000808000-000000067F000040020000A000000080C000__00000038E9AF7F00 000000067F000040020000A0000000808000-000000067F000040020000A000000080C000__0000003903F1CFE8 000000067F000040020000A0000000808000-000000067F000040020000A000000080C000__0000003B99F7F8A0 000000067F000040020000A0000000808000-000000067F000040020000A000000080C000__0000005D2FFFFB38 000000067F000040020000A000000080C000-000000067F000040020000A0000000810000__0000002BAAB7E320 000000067F000040020000A000000080C000-000000067F000040020000A0000000810000__00000038E9AF7F00 000000067F000040020000A000000080C000-000000067F000040020000A0000000810000__0000003903F1CFE8 000000067F000040020000A000000080C000-000000067F000040020000A0000000810000__0000003B99F7F8A0 000000067F000040020000A000000080C000-000000067F000040020000A0000000810000__0000005D2FFFFB38 000000067F000040020000A000000080E5CA-000000067F000040020000A0000000816FB0__0000002ADB1FF0A9-0000002B7ACFE3E1 000000067F000040020000A0000000810000-000000067F000040020000A0000000814000__0000002BAAB7E320 000000067F000040020000A0000000810000-000000067F000040020000A0000000814000__00000038E9AF7F00 000000067F000040020000A0000000810000-000000067F000040020000A0000000814000__0000003903F1CFE8 000000067F000040020000A0000000810000-000000067F000040020000A0000000814000__0000003B99F7F8A0 000000067F000040020000A0000000810000-000000067F000040020000A0000000814000__0000005D2FFFFB38 000000067F000040020000A0000000814000-000000067F000040020000A0000000818000__0000002BAAB7E320 000000067F000040020000A0000000814000-000000067F000040020000A0000000818000__00000038E9AF7F00 000000067F000040020000A0000000814000-000000067F000040020000A0000000818000__0000003903F1CFE8 000000067F000040020000A0000000814000-000000067F000040020000A0000000818000__0000003B99F7F8A0 
000000067F000040020000A0000000814000-000000067F000040020000A0000000818000__0000005D2FFFFB38 000000067F000040020000A0000000816FB0-000000067F000040020000A000000081F994__0000002ADB1FF0A9-0000002B7ACFE3E1 000000067F000040020000A0000000818000-000000067F000040020000A000000081C000__0000002BAAB7E320 000000067F000040020000A0000000818000-000000067F000040020000A000000081C000__00000038E9AF7F00 000000067F000040020000A0000000818000-000000067F000040020000A000000081C000__0000003903F1CFE8 000000067F000040020000A0000000818000-000000067F000040020000A000000081C000__0000003B99F7F8A0 000000067F000040020000A0000000818000-000000067F000040020000A000000081C000__0000005D2FFFFB38 000000067F000040020000A000000081C000-000000067F000040020000A0000000820000__0000002BAAB7E320 000000067F000040020000A000000081C000-000000067F000040020000A0000000820000__00000038E67ABFA0 000000067F000040020000A000000081C000-000000067F000040020000A0000000820000__0000003903F1CFE8 000000067F000040020000A000000081C000-000000067F000040020000A0000000820000__0000003B99F7F8A0 000000067F000040020000A000000081C000-000000067F000040020000A0000000820000__0000005D2FFFFB38 000000067F000040020000A000000081F994-000000067F000040020000A0000200000000__0000002ADB1FF0A9-0000002B7ACFE3E1 000000067F000040020000A000000081FB32-000000067F000040020000A0000000828506__0000002B7ACFE3E1-0000002C1A7DEAD1 000000067F000040020000A0000000820000-000000067F000040020000A0000000824000__0000002BAAB7E320 000000067F000040020000A0000000820000-000000067F000040020000A0000000824000__00000038E67ABFA0 000000067F000040020000A0000000820000-000000067F000040020000A0000000824000__0000003903F1CFE8 000000067F000040020000A0000000820000-000000067F000040020000A0000000824000__0000003B99F7F8A0 000000067F000040020000A0000000820000-000000067F000040020000A0000000824000__0000005D2FFFFB38 000000067F000040020000A0000000824000-000000067F000040020000A0000000828000__0000002BAAB7E320 000000067F000040020000A0000000824000-000000067F000040020000A0000000828000__00000038E67ABFA0 
000000067F000040020000A0000000824000-000000067F000040020000A0000000828000__0000003903F1CFE8 000000067F000040020000A0000000824000-000000067F000040020000A0000000828000__0000003B99F7F8A0 000000067F000040020000A0000000824000-000000067F000040020000A0000000828000__0000005D2FFFFB38 000000067F000040020000A0000000828000-000000067F000040020000A000000082C000__0000002BAAB7E320 000000067F000040020000A0000000828000-000000067F000040020000A000000082C000__00000038E67ABFA0 000000067F000040020000A0000000828000-000000067F000040020000A000000082C000__0000003903F1CFE8 000000067F000040020000A0000000828000-000000067F000040020000A000000082C000__0000003B99F7F8A0 000000067F000040020000A0000000828000-000000067F000040020000A000000082C000__0000005D2FFFFB38 000000067F000040020000A0000000828506-000000067F000040020000A0000000830EDA__0000002B7ACFE3E1-0000002C1A7DEAD1 000000067F000040020000A000000082C000-000000067F000040020000A0000000830000__0000002BAAB7E320 000000067F000040020000A000000082C000-000000067F000040020000A0000000830000__00000038E67ABFA0 000000067F000040020000A000000082C000-000000067F000040020000A0000000830000__0000003903F1CFE8 000000067F000040020000A000000082C000-000000067F000040020000A0000000830000__0000003B99F7F8A0 000000067F000040020000A000000082C000-000000067F000040020000A0000000830000__0000005D2FFFFB38 000000067F000040020000A0000000830000-000000067F000040020000A0000000834000__0000002BAAB7E320 000000067F000040020000A0000000830000-000000067F000040020000A0000000834000__00000038E67ABFA0 000000067F000040020000A0000000830000-000000067F000040020000A0000000834000__0000003903F1CFE8 000000067F000040020000A0000000830000-000000067F000040020000A0000000834000__0000003B99F7F8A0 000000067F000040020000A0000000830000-000000067F000040020000A0000000834000__0000005D2FFFFB38 000000067F000040020000A0000000830EDA-000000067F000040020000A00000008398DB__0000002B7ACFE3E1-0000002C1A7DEAD1 000000067F000040020000A0000000834000-000000067F000040020000A0000000838000__0000002BAAB7E320 
000000067F000040020000A0000000834000-000000067F000040020000A0000000838000__00000038E67ABFA0 000000067F000040020000A0000000834000-000000067F000040020000A0000000838000__0000003903F1CFE8 000000067F000040020000A0000000834000-000000067F000040020000A0000000838000__0000003B99F7F8A0 000000067F000040020000A0000000834000-000000067F000040020000A0000000838000__0000005D2FFFFB38 000000067F000040020000A0000000838000-000000067F000040020000A000000083C000__00000038E67ABFA0 000000067F000040020000A0000000838000-000000067F000040020000A000000083C000__0000003903F1CFE8 000000067F000040020000A0000000838000-000000067F000040020000A000000083C000__0000003B99F7F8A0 000000067F000040020000A0000000838000-000000067F000040020000A000000083C000__0000005D2FFFFB38 000000067F000040020000A0000000838000-030000000000000000000000000000000002__0000002BAAB7E320 000000067F000040020000A00000008398DB-000000067F000040020000A00000008422C1__0000002B7ACFE3E1-0000002C1A7DEAD1 000000067F000040020000A000000083C000-000000067F000040020000A0000000840000__00000038E67ABFA0 000000067F000040020000A000000083C000-000000067F000040020000A0000000840000__0000003903F1CFE8 000000067F000040020000A000000083C000-000000067F000040020000A0000000840000__0000003B99F7F8A0 000000067F000040020000A000000083C000-000000067F000040020000A0000000840000__0000005D2FFFFB38 000000067F000040020000A0000000840000-000000067F000040020000A0000000844000__00000038E67ABFA0 000000067F000040020000A0000000840000-000000067F000040020000A0000000844000__0000003903F1CFE8 000000067F000040020000A0000000840000-000000067F000040020000A0000000844000__0000003B99F7F8A0 000000067F000040020000A0000000840000-000000067F000040020000A0000000844000__0000005D2FFFFB38 000000067F000040020000A00000008422C1-000000067F000040020000A000000084AC98__0000002B7ACFE3E1-0000002C1A7DEAD1 000000067F000040020000A0000000844000-000000067F000040020000A0000000848000__00000038E67ABFA0 000000067F000040020000A0000000844000-000000067F000040020000A0000000848000__0000003903F1CFE8 
000000067F000040020000A0000000844000-000000067F000040020000A0000000848000__0000003B99F7F8A0 000000067F000040020000A0000000844000-000000067F000040020000A0000000848000__0000005D2FFFFB38 000000067F000040020000A0000000848000-000000067F000040020000A000000084C000__00000038E67ABFA0 000000067F000040020000A0000000848000-000000067F000040020000A000000084C000__0000003903F1CFE8 000000067F000040020000A0000000848000-000000067F000040020000A000000084C000__0000003B99F7F8A0 000000067F000040020000A0000000848000-000000067F000040020000A000000084C000__0000005D2FFFFB38 000000067F000040020000A000000084AC98-000000067F000040020000A000000085367F__0000002B7ACFE3E1-0000002C1A7DEAD1 000000067F000040020000A000000084C000-000000067F000040020000A0000000850000__00000038E67ABFA0 000000067F000040020000A000000084C000-000000067F000040020000A0000000850000__0000003903F1CFE8 000000067F000040020000A000000084C000-000000067F000040020000A0000000850000__0000003B99F7F8A0 000000067F000040020000A000000084C000-000000067F000040020000A0000000850000__0000005D2FFFFB38 000000067F000040020000A0000000850000-000000067F000040020000A0000000854000__00000038E67ABFA0 000000067F000040020000A0000000850000-000000067F000040020000A0000000854000__0000003903F1CFE8 000000067F000040020000A0000000850000-000000067F000040020000A0000000854000__0000003B99F7F8A0 000000067F000040020000A0000000850000-000000067F000040020000A0000000854000__0000005D2FFFFB38 000000067F000040020000A000000085367F-000000067F000040020000A000000085C059__0000002B7ACFE3E1-0000002C1A7DEAD1 000000067F000040020000A0000000854000-000000067F000040020000A0000000858000__00000038E67ABFA0 000000067F000040020000A0000000854000-000000067F000040020000A0000000858000__0000003903F1CFE8 000000067F000040020000A0000000854000-000000067F000040020000A0000000858000__0000003B99F7F8A0 000000067F000040020000A0000000854000-000000067F000040020000A0000000858000__0000005D2FFFFB38 000000067F000040020000A0000000858000-000000067F000040020000A000000085C000__00000038E67ABFA0 
000000067F000040020000A0000000858000-000000067F000040020000A000000085C000__0000003903F1CFE8 000000067F000040020000A0000000858000-000000067F000040020000A000000085C000__0000003B99F7F8A0 000000067F000040020000A0000000858000-000000067F000040020000A000000085C000__0000005D2FFFFB38 000000067F000040020000A000000085C000-000000067F000040020000A0000000860000__00000038E67ABFA0 000000067F000040020000A000000085C000-000000067F000040020000A0000000860000__0000003903F1CFE8 000000067F000040020000A000000085C000-000000067F000040020000A0000000860000__0000003B99F7F8A0 000000067F000040020000A000000085C000-000000067F000040020000A0000000860000__0000005D2FFFFB38 000000067F000040020000A000000085C059-000000067F000040020000A0000000864A25__0000002B7ACFE3E1-0000002C1A7DEAD1 000000067F000040020000A0000000860000-000000067F000040020000A0000000864000__00000038E67ABFA0 000000067F000040020000A0000000860000-000000067F000040020000A0000000864000__0000003903F1CFE8 000000067F000040020000A0000000860000-000000067F000040020000A0000000864000__0000003B99F7F8A0 000000067F000040020000A0000000860000-000000067F000040020000A0000000864000__0000005D2FFFFB38 000000067F000040020000A0000000864000-000000067F000040020000A0000000868000__00000038E67ABFA0 000000067F000040020000A0000000864000-000000067F000040020000A0000000868000__0000003903F1CFE8 000000067F000040020000A0000000864000-000000067F000040020000A0000000868000__0000003B99F7F8A0 000000067F000040020000A0000000864000-000000067F000040020000A0000000868000__0000005D2FFFFB38 000000067F000040020000A0000000864A25-000000067F000040020000A000000086D403__0000002B7ACFE3E1-0000002C1A7DEAD1 000000067F000040020000A0000000868000-000000067F000040020000A000000086C000__00000038E67ABFA0 000000067F000040020000A0000000868000-000000067F000040020000A000000086C000__0000003903F1CFE8 000000067F000040020000A0000000868000-000000067F000040020000A000000086C000__0000003B99F7F8A0 000000067F000040020000A0000000868000-000000067F000040020000A000000086C000__0000005D2FFFFB38 
000000067F000040020000A000000086C000-000000067F000040020000A0000000870000__00000038E67ABFA0 000000067F000040020000A000000086C000-000000067F000040020000A0000000870000__0000003903F1CFE8 000000067F000040020000A000000086C000-000000067F000040020000A0000000870000__0000003B99F7F8A0 000000067F000040020000A000000086C000-000000067F000040020000A0000000870000__0000005D2FFFFB38 000000067F000040020000A000000086D403-000000067F000040020000A0000000875DE0__0000002B7ACFE3E1-0000002C1A7DEAD1 000000067F000040020000A0000000870000-000000067F000040020000A0000000874000__00000038E67ABFA0 000000067F000040020000A0000000870000-000000067F000040020000A0000000874000__0000003903F1CFE8 000000067F000040020000A0000000870000-000000067F000040020000A0000000874000__0000003B99F7F8A0 000000067F000040020000A0000000870000-000000067F000040020000A0000000874000__0000005D2FFFFB38 000000067F000040020000A0000000874000-000000067F000040020000A0000000878000__00000038E1ABFE28 000000067F000040020000A0000000874000-000000067F000040020000A0000000878000__00000038E9AF7F00 000000067F000040020000A0000000874000-000000067F000040020000A0000000878000__0000003903F1CFE8 000000067F000040020000A0000000874000-000000067F000040020000A0000000878000__0000003B99F7F8A0 000000067F000040020000A0000000874000-000000067F000040020000A0000000878000__0000005D2FFFFB38 000000067F000040020000A0000000875DE0-000000067F000040020000A0000200000000__0000002B7ACFE3E1-0000002C1A7DEAD1 000000067F000040020000A0000000876030-000000067F000040020000A000000087EA03__0000002C1A7DEAD1-0000002CBA2DFCE9 000000067F000040020000A0000000878000-000000067F000040020000A000000087C000__00000038E1ABFE28 000000067F000040020000A0000000878000-000000067F000040020000A000000087C000__00000038E9AF7F00 000000067F000040020000A0000000878000-000000067F000040020000A000000087C000__0000003903F1CFE8 000000067F000040020000A0000000878000-000000067F000040020000A000000087C000__0000003B99F7F8A0 000000067F000040020000A0000000878000-000000067F000040020000A000000087C000__0000005D2FFFFB38 
000000067F000040020000A000000087C000-000000067F000040020000A0000000880000__00000038E1ABFE28 000000067F000040020000A000000087C000-000000067F000040020000A0000000880000__00000038E9AF7F00 000000067F000040020000A000000087C000-000000067F000040020000A0000000880000__0000003903F1CFE8 000000067F000040020000A000000087C000-000000067F000040020000A0000000880000__0000003B99F7F8A0 000000067F000040020000A000000087C000-000000067F000040020000A0000000880000__0000005D2FFFFB38 000000067F000040020000A000000087EA03-000000067F000040020000A00000008873D2__0000002C1A7DEAD1-0000002CBA2DFCE9 000000067F000040020000A0000000880000-000000067F000040020000A0000000884000__00000038E1ABFE28 000000067F000040020000A0000000880000-000000067F000040020000A0000000884000__00000038E9AF7F00 000000067F000040020000A0000000880000-000000067F000040020000A0000000884000__0000003903F1CFE8 000000067F000040020000A0000000880000-000000067F000040020000A0000000884000__0000003B99F7F8A0 000000067F000040020000A0000000880000-000000067F000040020000A0000000884000__0000005D2FFFFB38 000000067F000040020000A0000000884000-000000067F000040020000A0000000888000__00000038E1ABFE28 000000067F000040020000A0000000884000-000000067F000040020000A0000000888000__00000038E9AF7F00 000000067F000040020000A0000000884000-000000067F000040020000A0000000888000__0000003903F1CFE8 000000067F000040020000A0000000884000-000000067F000040020000A0000000888000__0000003B99F7F8A0 000000067F000040020000A0000000884000-000000067F000040020000A0000000888000__0000005D2FFFFB38 000000067F000040020000A00000008873D2-000000067F000040020000A000000088FDC5__0000002C1A7DEAD1-0000002CBA2DFCE9 000000067F000040020000A0000000888000-000000067F000040020000A000000088C000__00000038E1ABFE28 000000067F000040020000A0000000888000-000000067F000040020000A000000088C000__00000038E9AF7F00 000000067F000040020000A0000000888000-000000067F000040020000A000000088C000__0000003903F1CFE8 000000067F000040020000A0000000888000-000000067F000040020000A000000088C000__0000003B99F7F8A0 
000000067F000040020000A0000000888000-000000067F000040020000A000000088C000__0000005D2FFFFB38 000000067F000040020000A000000088C000-000000067F000040020000A0000000890000__00000038E1ABFE28 000000067F000040020000A000000088C000-000000067F000040020000A0000000890000__00000038E9AF7F00 000000067F000040020000A000000088C000-000000067F000040020000A0000000890000__0000003903F1CFE8 000000067F000040020000A000000088C000-000000067F000040020000A0000000890000__0000003B99F7F8A0 000000067F000040020000A000000088C000-000000067F000040020000A0000000890000__0000005D2FFFFB38 000000067F000040020000A000000088FDC5-000000067F000040020000A00000008987A7__0000002C1A7DEAD1-0000002CBA2DFCE9 000000067F000040020000A0000000890000-000000067F000040020000A0000000894000__00000038E1ABFE28 000000067F000040020000A0000000890000-000000067F000040020000A0000000894000__00000038E9AF7F00 000000067F000040020000A0000000890000-000000067F000040020000A0000000894000__0000003903F1CFE8 000000067F000040020000A0000000890000-000000067F000040020000A0000000894000__0000003B99F7F8A0 000000067F000040020000A0000000890000-000000067F000040020000A0000000894000__0000005D2FFFFB38 000000067F000040020000A0000000894000-000000067F000040020000A0000000898000__00000038E1ABFE28 000000067F000040020000A0000000894000-000000067F000040020000A0000000898000__00000038E9AF7F00 000000067F000040020000A0000000894000-000000067F000040020000A0000000898000__0000003903F1CFE8 000000067F000040020000A0000000894000-000000067F000040020000A0000000898000__0000003B99F7F8A0 000000067F000040020000A0000000894000-000000067F000040020000A0000000898000__0000005D2FFFFB38 000000067F000040020000A0000000898000-000000067F000040020000A000000089C000__00000038E1ABFE28 000000067F000040020000A0000000898000-000000067F000040020000A000000089C000__00000038E9AF7F00 000000067F000040020000A0000000898000-000000067F000040020000A000000089C000__0000003903F1CFE8 000000067F000040020000A0000000898000-000000067F000040020000A000000089C000__0000003B99F7F8A0 
000000067F000040020000A0000000898000-000000067F000040020000A000000089C000__0000005D2FFFFB38 000000067F000040020000A00000008987A7-000000067F000040020000A00000008A117E__0000002C1A7DEAD1-0000002CBA2DFCE9 000000067F000040020000A000000089C000-000000067F000040020000A00000008A0000__00000038E1ABFE28 000000067F000040020000A000000089C000-000000067F000040020000A00000008A0000__00000038E9AF7F00 000000067F000040020000A000000089C000-000000067F000040020000A00000008A0000__0000003903F1CFE8 000000067F000040020000A000000089C000-000000067F000040020000A00000008A0000__0000003B99F7F8A0 000000067F000040020000A000000089C000-000000067F000040020000A00000008A0000__0000005D2FFFFB38 000000067F000040020000A00000008A0000-000000067F000040020000A00000008A4000__00000038E1ABFE28 000000067F000040020000A00000008A0000-000000067F000040020000A00000008A4000__00000038E9AF7F00 000000067F000040020000A00000008A0000-000000067F000040020000A00000008A4000__0000003903F1CFE8 000000067F000040020000A00000008A0000-000000067F000040020000A00000008A4000__0000003B99F7F8A0 000000067F000040020000A00000008A0000-000000067F000040020000A00000008A4000__0000005D2FFFFB38 000000067F000040020000A00000008A117E-000000067F000040020000A00000008A9B5D__0000002C1A7DEAD1-0000002CBA2DFCE9 000000067F000040020000A00000008A4000-000000067F000040020000A00000008A8000__00000038E1ABFE28 000000067F000040020000A00000008A4000-000000067F000040020000A00000008A8000__00000038E9AF7F00 000000067F000040020000A00000008A4000-000000067F000040020000A00000008A8000__0000003903F1CFE8 000000067F000040020000A00000008A4000-000000067F000040020000A00000008A8000__0000003B99F7F8A0 000000067F000040020000A00000008A4000-000000067F000040020000A00000008A8000__0000005D2FFFFB38 000000067F000040020000A00000008A8000-000000067F000040020000A00000008AC000__00000038E1ABFE28 000000067F000040020000A00000008A8000-000000067F000040020000A00000008AC000__00000038E9AF7F00 000000067F000040020000A00000008A8000-000000067F000040020000A00000008AC000__0000003903F1CFE8 
000000067F000040020000A00000008A8000-000000067F000040020000A00000008AC000__0000003B99F7F8A0 000000067F000040020000A00000008A8000-000000067F000040020000A00000008AC000__0000005D2FFFFB38 000000067F000040020000A00000008A9B5D-000000067F000040020000A00000008B253E__0000002C1A7DEAD1-0000002CBA2DFCE9 000000067F000040020000A00000008AC000-000000067F000040020000A00000008B0000__00000038E1ABFE28 000000067F000040020000A00000008AC000-000000067F000040020000A00000008B0000__00000038E9AF7F00 000000067F000040020000A00000008AC000-000000067F000040020000A00000008B0000__0000003903F1CFE8 000000067F000040020000A00000008AC000-000000067F000040020000A00000008B0000__0000003B99F7F8A0 000000067F000040020000A00000008AC000-000000067F000040020000A00000008B0000__0000005D2FFFFB38 000000067F000040020000A00000008B0000-000000067F000040020000A00000008B4000__00000038E1ABFE28 000000067F000040020000A00000008B0000-000000067F000040020000A00000008B4000__00000038E9AF7F00 000000067F000040020000A00000008B0000-000000067F000040020000A00000008B4000__0000003903F1CFE8 000000067F000040020000A00000008B0000-000000067F000040020000A00000008B4000__0000003B99F7F8A0 000000067F000040020000A00000008B0000-000000067F000040020000A00000008B4000__0000005D2FFFFB38 000000067F000040020000A00000008B253E-000000067F000040020000A00000008BAF04__0000002C1A7DEAD1-0000002CBA2DFCE9 000000067F000040020000A00000008B4000-000000067F000040020000A00000008B8000__00000038E1ABFE28 000000067F000040020000A00000008B4000-000000067F000040020000A00000008B8000__00000038E9AF7F00 000000067F000040020000A00000008B4000-000000067F000040020000A00000008B8000__0000003903F1CFE8 000000067F000040020000A00000008B4000-000000067F000040020000A00000008B8000__0000003B99F7F8A0 000000067F000040020000A00000008B4000-000000067F000040020000A00000008B8000__0000005D2FFFFB38 000000067F000040020000A00000008B8000-000000067F000040020000A00000008BC000__00000038E1ABFE28 000000067F000040020000A00000008B8000-000000067F000040020000A00000008BC000__00000038E9AF7F00 
000000067F000040020000A00000008B8000-000000067F000040020000A00000008BC000__0000003903F1CFE8 000000067F000040020000A00000008B8000-000000067F000040020000A00000008BC000__0000003B99F7F8A0 000000067F000040020000A00000008B8000-000000067F000040020000A00000008BC000__0000005D2FFFFB38 000000067F000040020000A00000008BAF04-000000067F000040020000A00000008C38D2__0000002C1A7DEAD1-0000002CBA2DFCE9 000000067F000040020000A00000008BC000-000000067F000040020000A00000008C0000__00000038E1ABFE28 000000067F000040020000A00000008BC000-000000067F000040020000A00000008C0000__00000038E9AF7F00 000000067F000040020000A00000008BC000-000000067F000040020000A00000008C0000__0000003903F1CFE8 000000067F000040020000A00000008BC000-000000067F000040020000A00000008C0000__0000003B99F7F8A0 000000067F000040020000A00000008BC000-000000067F000040020000A00000008C0000__0000005D2FFFFB38 000000067F000040020000A00000008C0000-000000067F000040020000A00000008C4000__00000038E1ABFE28 000000067F000040020000A00000008C0000-000000067F000040020000A00000008C4000__00000038E9AF7F00 000000067F000040020000A00000008C0000-000000067F000040020000A00000008C4000__0000003903F1CFE8 000000067F000040020000A00000008C0000-000000067F000040020000A00000008C4000__0000003B99F7F8A0 000000067F000040020000A00000008C0000-000000067F000040020000A00000008C4000__0000005D2FFFFB38 000000067F000040020000A00000008C38D2-000000067F000040020000A00000008CC2C1__0000002C1A7DEAD1-0000002CBA2DFCE9 000000067F000040020000A00000008C4000-000000067F000040020000A00000008C8000__00000038E1ABFE28 000000067F000040020000A00000008C4000-000000067F000040020000A00000008C8000__00000038E9AF7F00 000000067F000040020000A00000008C4000-000000067F000040020000A00000008C8000__0000003903F1CFE8 000000067F000040020000A00000008C4000-000000067F000040020000A00000008C8000__0000003B99F7F8A0 000000067F000040020000A00000008C4000-000000067F000040020000A00000008C8000__0000005D2FFFFB38 000000067F000040020000A00000008C8000-000000067F000040020000A00000008CC000__00000038E1ABFE28 
000000067F000040020000A00000008C8000-000000067F000040020000A00000008CC000__00000038E9AF7F00 000000067F000040020000A00000008C8000-000000067F000040020000A00000008CC000__0000003903F1CFE8 000000067F000040020000A00000008C8000-000000067F000040020000A00000008CC000__0000003B99F7F8A0 000000067F000040020000A00000008C8000-000000067F000040020000A00000008CC000__0000005D2FFFFB38 000000067F000040020000A00000008CC000-000000067F000040020000A00000008D0000__0000002D89C52B28 000000067F000040020000A00000008CC000-000000067F000040020000A00000008D0000__00000038E9AF7F00 000000067F000040020000A00000008CC000-000000067F000040020000A00000008D0000__0000003903F1CFE8 000000067F000040020000A00000008CC000-000000067F000040020000A00000008D0000__0000003B99F7F8A0 000000067F000040020000A00000008CC000-000000067F000040020000A00000008D0000__0000005D2FFFFB38 000000067F000040020000A00000008CC2C1-000000067F000040020000A0000200000000__0000002C1A7DEAD1-0000002CBA2DFCE9 000000067F000040020000A00000008CC47E-000000067F000040020000A00000008D4E54__0000002CBA2DFCE9-0000002D59DDCFE9 000000067F000040020000A00000008D0000-000000067F000040020000A00000008D4000__0000002D89C52B28 000000067F000040020000A00000008D0000-000000067F000040020000A00000008D4000__00000038E9AF7F00 000000067F000040020000A00000008D0000-000000067F000040020000A00000008D4000__0000003903F1CFE8 000000067F000040020000A00000008D0000-000000067F000040020000A00000008D4000__0000003B99F7F8A0 000000067F000040020000A00000008D0000-000000067F000040020000A00000008D4000__0000005D2FFFFB38 000000067F000040020000A00000008D4000-000000067F000040020000A00000008D8000__0000002D89C52B28 000000067F000040020000A00000008D4000-000000067F000040020000A00000008D8000__00000038E9AF7F00 000000067F000040020000A00000008D4000-000000067F000040020000A00000008D8000__0000003903F1CFE8 000000067F000040020000A00000008D4000-000000067F000040020000A00000008D8000__0000003B99F7F8A0 000000067F000040020000A00000008D4000-000000067F000040020000A00000008D8000__0000005D2FFFFB38 
000000067F000040020000A00000008D4E54-000000067F000040020000A00000008DD830__0000002CBA2DFCE9-0000002D59DDCFE9 000000067F000040020000A00000008D8000-000000067F000040020000A00000008DC000__0000002D89C52B28 000000067F000040020000A00000008D8000-000000067F000040020000A00000008DC000__00000038E9AF7F00 000000067F000040020000A00000008D8000-000000067F000040020000A00000008DC000__0000003903F1CFE8 000000067F000040020000A00000008D8000-000000067F000040020000A00000008DC000__0000003B99F7F8A0 000000067F000040020000A00000008D8000-000000067F000040020000A00000008DC000__0000005D2FFFFB38 000000067F000040020000A00000008DC000-000000067F000040020000A00000008E0000__0000002D89C52B28 000000067F000040020000A00000008DC000-000000067F000040020000A00000008E0000__00000038E9AF7F00 000000067F000040020000A00000008DC000-000000067F000040020000A00000008E0000__0000003903F1CFE8 000000067F000040020000A00000008DC000-000000067F000040020000A00000008E0000__0000003B99F7F8A0 000000067F000040020000A00000008DC000-000000067F000040020000A00000008E0000__0000005D2FFFFB38 000000067F000040020000A00000008DD830-000000067F000040020000A00000008E6201__0000002CBA2DFCE9-0000002D59DDCFE9 000000067F000040020000A00000008E0000-000000067F000040020000A00000008E4000__0000002D89C52B28 000000067F000040020000A00000008E0000-000000067F000040020000A00000008E4000__00000038E9AF7F00 000000067F000040020000A00000008E0000-000000067F000040020000A00000008E4000__0000003903F1CFE8 000000067F000040020000A00000008E0000-000000067F000040020000A00000008E4000__0000003B99F7F8A0 000000067F000040020000A00000008E0000-000000067F000040020000A00000008E4000__0000005D2FFFFB38 000000067F000040020000A00000008E4000-000000067F000040020000A00000008E8000__0000002D89C52B28 000000067F000040020000A00000008E4000-000000067F000040020000A00000008E8000__00000038E9AF7F00 000000067F000040020000A00000008E4000-000000067F000040020000A00000008E8000__0000003903F1CFE8 000000067F000040020000A00000008E4000-000000067F000040020000A00000008E8000__0000003B99F7F8A0 
000000067F000040020000A00000008E4000-000000067F000040020000A00000008E8000__0000005D2FFFFB38 000000067F000040020000A00000008E6201-000000067F000040020000A00000008EEBDC__0000002CBA2DFCE9-0000002D59DDCFE9 000000067F000040020000A00000008E8000-000000067F000040020000A00000008EC000__0000002D89C52B28 000000067F000040020000A00000008E8000-000000067F000040020000A00000008EC000__00000038E9AF7F00 000000067F000040020000A00000008E8000-000000067F000040020000A00000008EC000__0000003903F1CFE8 000000067F000040020000A00000008E8000-000000067F000040020000A00000008EC000__0000003B99F7F8A0 000000067F000040020000A00000008E8000-000000067F000040020000A00000008EC000__0000005D2FFFFB38 000000067F000040020000A00000008EC000-000000067F000040020000A00000008F0000__0000002D89C52B28 000000067F000040020000A00000008EC000-000000067F000040020000A00000008F0000__00000038E9AF7F00 000000067F000040020000A00000008EC000-000000067F000040020000A00000008F0000__0000003903F1CFE8 000000067F000040020000A00000008EC000-000000067F000040020000A00000008F0000__0000003B99F7F8A0 000000067F000040020000A00000008EC000-000000067F000040020000A00000008F0000__0000005D2FFFFB38 000000067F000040020000A00000008EEBDC-000000067F000040020000A00000008F75B3__0000002CBA2DFCE9-0000002D59DDCFE9 000000067F000040020000A00000008F0000-000000067F000040020000A00000008F4000__0000002D89C52B28 000000067F000040020000A00000008F0000-000000067F000040020000A00000008F4000__00000038E9AF7F00 000000067F000040020000A00000008F0000-000000067F000040020000A00000008F4000__0000003903F1CFE8 000000067F000040020000A00000008F0000-000000067F000040020000A00000008F4000__0000003B99F7F8A0 000000067F000040020000A00000008F0000-000000067F000040020000A00000008F4000__0000005D2FFFFB38 000000067F000040020000A00000008F4000-000000067F000040020000A00000008F8000__0000002D89C52B28 000000067F000040020000A00000008F4000-000000067F000040020000A00000008F8000__00000038E9AF7F00 000000067F000040020000A00000008F4000-000000067F000040020000A00000008F8000__0000003903F1CFE8 
000000067F000040020000A00000008F4000-000000067F000040020000A00000008F8000__0000003B99F7F8A0 000000067F000040020000A00000008F4000-000000067F000040020000A00000008F8000__0000005D2FFFFB38 000000067F000040020000A00000008F75B3-000000067F000040020000A00000008FFF8B__0000002CBA2DFCE9-0000002D59DDCFE9 000000067F000040020000A00000008F8000-000000067F000040020000A00000008FC000__0000002D89C52B28 000000067F000040020000A00000008F8000-000000067F000040020000A00000008FC000__00000038E9AF7F00 000000067F000040020000A00000008F8000-000000067F000040020000A00000008FC000__0000003903F1CFE8 000000067F000040020000A00000008F8000-000000067F000040020000A00000008FC000__0000003B99F7F8A0 000000067F000040020000A00000008F8000-000000067F000040020000A00000008FC000__0000005D2FFFFB38 000000067F000040020000A00000008FC000-000000067F000040020000A0000000900000__0000002D89C52B28 000000067F000040020000A00000008FC000-000000067F000040020000A0000000900000__00000038E9AF7F00 000000067F000040020000A00000008FC000-000000067F000040020000A0000000900000__0000003903F1CFE8 000000067F000040020000A00000008FC000-000000067F000040020000A0000000900000__0000003B99F7F8A0 000000067F000040020000A00000008FC000-000000067F000040020000A0000000900000__0000005D2FFFFB38 000000067F000040020000A00000008FFF8B-000000067F000040020000A000000090896E__0000002CBA2DFCE9-0000002D59DDCFE9 000000067F000040020000A0000000900000-000000067F000040020000A0000000904000__0000002D89C52B28 000000067F000040020000A0000000900000-000000067F000040020000A0000000904000__00000038E9AF7F00 000000067F000040020000A0000000900000-000000067F000040020000A0000000904000__0000003903F1CFE8 000000067F000040020000A0000000900000-000000067F000040020000A0000000904000__0000003B99F7F8A0 000000067F000040020000A0000000900000-000000067F000040020000A0000000904000__0000005D2FFFFB38 000000067F000040020000A0000000904000-000000067F000040020000A0000000908000__0000002D89C52B28 000000067F000040020000A0000000904000-000000067F000040020000A0000000908000__00000038E9AF7F00 
000000067F000040020000A0000000904000-000000067F000040020000A0000000908000__0000003903F1CFE8 000000067F000040020000A0000000904000-000000067F000040020000A0000000908000__0000003B99F7F8A0 000000067F000040020000A0000000904000-000000067F000040020000A0000000908000__0000005D2FFFFB38 000000067F000040020000A0000000908000-000000067F000040020000A000000090C000__0000002D89C52B28 000000067F000040020000A0000000908000-000000067F000040020000A000000090C000__00000038E9AF7F00 000000067F000040020000A0000000908000-000000067F000040020000A000000090C000__0000003903F1CFE8 000000067F000040020000A0000000908000-000000067F000040020000A000000090C000__0000003B99F7F8A0 000000067F000040020000A0000000908000-000000067F000040020000A000000090C000__0000005D2FFFFB38 000000067F000040020000A000000090896E-000000067F000040020000A000000091134B__0000002CBA2DFCE9-0000002D59DDCFE9 000000067F000040020000A000000090C000-000000067F000040020000A0000000910000__0000002D89C52B28 000000067F000040020000A000000090C000-000000067F000040020000A0000000910000__00000038E9AF7F00 000000067F000040020000A000000090C000-000000067F000040020000A0000000910000__0000003903F1CFE8 000000067F000040020000A000000090C000-000000067F000040020000A0000000910000__0000003B99F7F8A0 000000067F000040020000A000000090C000-000000067F000040020000A0000000910000__0000005D2FFFFB38 000000067F000040020000A0000000910000-000000067F000040020000A0000000914000__0000002D89C52B28 000000067F000040020000A0000000910000-000000067F000040020000A0000000914000__00000038E9AF7F00 000000067F000040020000A0000000910000-000000067F000040020000A0000000914000__0000003903F1CFE8 000000067F000040020000A0000000910000-000000067F000040020000A0000000914000__0000003B99F7F8A0 000000067F000040020000A0000000910000-000000067F000040020000A0000000914000__0000005D2FFFFB38 000000067F000040020000A000000091134B-000000067F000040020000A0000000919D16__0000002CBA2DFCE9-0000002D59DDCFE9 000000067F000040020000A0000000914000-000000067F000040020000A0000000918000__0000002D89C52B28 
000000067F000040020000A0000000914000-000000067F000040020000A0000000918000__00000038E9AF7F00 000000067F000040020000A0000000914000-000000067F000040020000A0000000918000__0000003903F1CFE8 000000067F000040020000A0000000914000-000000067F000040020000A0000000918000__0000003B99F7F8A0 000000067F000040020000A0000000914000-000000067F000040020000A0000000918000__0000005D2FFFFB38 000000067F000040020000A0000000918000-000000067F000040020000A000000091C000__0000002D89C52B28 000000067F000040020000A0000000918000-000000067F000040020000A000000091C000__00000038E9AF7F00 000000067F000040020000A0000000918000-000000067F000040020000A000000091C000__0000003903F1CFE8 000000067F000040020000A0000000918000-000000067F000040020000A000000091C000__0000003B99F7F8A0 000000067F000040020000A0000000918000-000000067F000040020000A000000091C000__0000005D2FFFFB38 000000067F000040020000A0000000919D16-000000067F000040020000A00000009226E9__0000002CBA2DFCE9-0000002D59DDCFE9 000000067F000040020000A000000091C000-000000067F000040020000A0000000920000__0000002D89C52B28 000000067F000040020000A000000091C000-000000067F000040020000A0000000920000__00000038E9AF7F00 000000067F000040020000A000000091C000-000000067F000040020000A0000000920000__0000003903F1CFE8 000000067F000040020000A000000091C000-000000067F000040020000A0000000920000__0000003B99F7F8A0 000000067F000040020000A000000091C000-000000067F000040020000A0000000920000__0000005D2FFFFB38 000000067F000040020000A0000000920000-000000067F000040020000A0000000924000__0000002D89C52B28 000000067F000040020000A0000000920000-000000067F000040020000A0000000924000__00000038E67ABFA0 000000067F000040020000A0000000920000-000000067F000040020000A0000000924000__0000003903F1CFE8 000000067F000040020000A0000000920000-000000067F000040020000A0000000924000__0000003B99F7F8A0 000000067F000040020000A0000000920000-000000067F000040020000A0000000924000__0000005D2FFFFB38 000000067F000040020000A00000009226E9-000000067F000040020000A0000200000000__0000002CBA2DFCE9-0000002D59DDCFE9 
000000067F000040020000A0000000922977-000000067F000040020000A000000092B35C__0000002D59DDCFE9-0000002E0985D9D9 000000067F000040020000A0000000924000-000000067F000040020000A0000000928000__0000002D89C52B28 000000067F000040020000A0000000924000-000000067F000040020000A0000000928000__00000038E67ABFA0 000000067F000040020000A0000000924000-000000067F000040020000A0000000928000__0000003903F1CFE8 000000067F000040020000A0000000924000-000000067F000040020000A0000000928000__0000003B99F7F8A0 000000067F000040020000A0000000924000-000000067F000040020000A0000000928000__0000005D2FFFFB38 000000067F000040020000A0000000928000-000000067F000040020000A000000092C000__0000002D89C52B28 000000067F000040020000A0000000928000-000000067F000040020000A000000092C000__00000038E67ABFA0 000000067F000040020000A0000000928000-000000067F000040020000A000000092C000__0000003903F1CFE8 000000067F000040020000A0000000928000-000000067F000040020000A000000092C000__0000003B99F7F8A0 000000067F000040020000A0000000928000-000000067F000040020000A000000092C000__0000005D2FFFFB38 000000067F000040020000A000000092B35C-000000067F000040020000A0000000933D30__0000002D59DDCFE9-0000002E0985D9D9 000000067F000040020000A000000092C000-000000067F000040020000A0000000930000__0000002D89C52B28 000000067F000040020000A000000092C000-000000067F000040020000A0000000930000__00000038E67ABFA0 000000067F000040020000A000000092C000-000000067F000040020000A0000000930000__0000003903F1CFE8 000000067F000040020000A000000092C000-000000067F000040020000A0000000930000__0000003B99F7F8A0 000000067F000040020000A000000092C000-000000067F000040020000A0000000930000__0000005D2FFFFB38 000000067F000040020000A0000000930000-000000067F000040020000A0000000934000__0000002D89C52B28 000000067F000040020000A0000000930000-000000067F000040020000A0000000934000__00000038E67ABFA0 000000067F000040020000A0000000930000-000000067F000040020000A0000000934000__0000003903F1CFE8 000000067F000040020000A0000000930000-000000067F000040020000A0000000934000__0000003B99F7F8A0 
000000067F000040020000A0000000930000-000000067F000040020000A0000000934000__0000005D2FFFFB38 000000067F000040020000A0000000933D30-000000067F000040020000A000000093C701__0000002D59DDCFE9-0000002E0985D9D9 000000067F000040020000A0000000934000-000000067F000040020000A0000000938000__0000002D89C52B28 000000067F000040020000A0000000934000-000000067F000040020000A0000000938000__00000038E67ABFA0 000000067F000040020000A0000000934000-000000067F000040020000A0000000938000__0000003903F1CFE8 000000067F000040020000A0000000934000-000000067F000040020000A0000000938000__0000003B99F7F8A0 000000067F000040020000A0000000934000-000000067F000040020000A0000000938000__0000005D2FFFFB38 000000067F000040020000A0000000938000-000000067F000040020000A000000093C000__0000002D89C52B28 000000067F000040020000A0000000938000-000000067F000040020000A000000093C000__00000038E67ABFA0 000000067F000040020000A0000000938000-000000067F000040020000A000000093C000__0000003903F1CFE8 000000067F000040020000A0000000938000-000000067F000040020000A000000093C000__0000003B99F7F8A0 000000067F000040020000A0000000938000-000000067F000040020000A000000093C000__0000005D2FFFFB38 000000067F000040020000A000000093C000-000000067F000040020000A0000000940000__00000038E67ABFA0 000000067F000040020000A000000093C000-000000067F000040020000A0000000940000__0000003903F1CFE8 000000067F000040020000A000000093C000-000000067F000040020000A0000000940000__0000003B99F7F8A0 000000067F000040020000A000000093C000-000000067F000040020000A0000000940000__0000005D2FFFFB38 000000067F000040020000A000000093C000-030000000000000000000000000000000002__0000002D89C52B28 000000067F000040020000A000000093C701-000000067F000040020000A00000009450E3__0000002D59DDCFE9-0000002E0985D9D9 000000067F000040020000A0000000940000-000000067F000040020000A0000000944000__00000038E67ABFA0 000000067F000040020000A0000000940000-000000067F000040020000A0000000944000__0000003903F1CFE8 000000067F000040020000A0000000940000-000000067F000040020000A0000000944000__0000003B99F7F8A0 
000000067F000040020000A0000000940000-000000067F000040020000A0000000944000__0000005D2FFFFB38 000000067F000040020000A0000000944000-000000067F000040020000A0000000948000__00000038E67ABFA0 000000067F000040020000A0000000944000-000000067F000040020000A0000000948000__0000003903F1CFE8 000000067F000040020000A0000000944000-000000067F000040020000A0000000948000__0000003B99F7F8A0 000000067F000040020000A0000000944000-000000067F000040020000A0000000948000__0000005D2FFFFB38 000000067F000040020000A00000009450E3-000000067F000040020000A000000094DAC0__0000002D59DDCFE9-0000002E0985D9D9 000000067F000040020000A0000000948000-000000067F000040020000A000000094C000__00000038E67ABFA0 000000067F000040020000A0000000948000-000000067F000040020000A000000094C000__0000003903F1CFE8 000000067F000040020000A0000000948000-000000067F000040020000A000000094C000__0000003B99F7F8A0 000000067F000040020000A0000000948000-000000067F000040020000A000000094C000__0000005D2FFFFB38 000000067F000040020000A000000094C000-000000067F000040020000A0000000950000__00000038E67ABFA0 000000067F000040020000A000000094C000-000000067F000040020000A0000000950000__0000003903F1CFE8 000000067F000040020000A000000094C000-000000067F000040020000A0000000950000__0000003B99F7F8A0 000000067F000040020000A000000094C000-000000067F000040020000A0000000950000__0000005D2FFFFB38 000000067F000040020000A000000094DAC0-000000067F000040020000A0000000956495__0000002D59DDCFE9-0000002E0985D9D9 000000067F000040020000A0000000950000-000000067F000040020000A0000000954000__00000038E67ABFA0 000000067F000040020000A0000000950000-000000067F000040020000A0000000954000__0000003903F1CFE8 000000067F000040020000A0000000950000-000000067F000040020000A0000000954000__0000003B99F7F8A0 000000067F000040020000A0000000950000-000000067F000040020000A0000000954000__0000005D2FFFFB38 000000067F000040020000A0000000954000-000000067F000040020000A0000000958000__00000038E67ABFA0 000000067F000040020000A0000000954000-000000067F000040020000A0000000958000__0000003903F1CFE8 
000000067F000040020000A0000000954000-000000067F000040020000A0000000958000__0000003B99F7F8A0 000000067F000040020000A0000000954000-000000067F000040020000A0000000958000__0000005D2FFFFB38 000000067F000040020000A0000000956495-000000067F000040020000A000000095EE79__0000002D59DDCFE9-0000002E0985D9D9 000000067F000040020000A0000000958000-000000067F000040020000A000000095C000__00000038E67ABFA0 000000067F000040020000A0000000958000-000000067F000040020000A000000095C000__0000003903F1CFE8 000000067F000040020000A0000000958000-000000067F000040020000A000000095C000__0000003B99F7F8A0 000000067F000040020000A0000000958000-000000067F000040020000A000000095C000__0000005D2FFFFB38 000000067F000040020000A000000095C000-000000067F000040020000A0000000960000__00000038E67ABFA0 000000067F000040020000A000000095C000-000000067F000040020000A0000000960000__0000003903F1CFE8 000000067F000040020000A000000095C000-000000067F000040020000A0000000960000__0000003B99F7F8A0 000000067F000040020000A000000095C000-000000067F000040020000A0000000960000__0000005D2FFFFB38 000000067F000040020000A000000095EE79-000000067F000040020000A0000000967850__0000002D59DDCFE9-0000002E0985D9D9 000000067F000040020000A0000000960000-000000067F000040020000A0000000964000__00000038E67ABFA0 000000067F000040020000A0000000960000-000000067F000040020000A0000000964000__0000003903F1CFE8 000000067F000040020000A0000000960000-000000067F000040020000A0000000964000__0000003B99F7F8A0 000000067F000040020000A0000000960000-000000067F000040020000A0000000964000__0000005D2FFFFB38 000000067F000040020000A0000000964000-000000067F000040020000A0000000968000__00000038E67ABFA0 000000067F000040020000A0000000964000-000000067F000040020000A0000000968000__0000003903F1CFE8 000000067F000040020000A0000000964000-000000067F000040020000A0000000968000__0000003B99F7F8A0 000000067F000040020000A0000000964000-000000067F000040020000A0000000968000__0000005D2FFFFB38 000000067F000040020000A0000000967850-000000067F000040020000A000000097022A__0000002D59DDCFE9-0000002E0985D9D9 
000000067F000040020000A0000000968000-000000067F000040020000A000000096C000__00000038E67ABFA0 000000067F000040020000A0000000968000-000000067F000040020000A000000096C000__0000003903F1CFE8 000000067F000040020000A0000000968000-000000067F000040020000A000000096C000__0000003B99F7F8A0 000000067F000040020000A0000000968000-000000067F000040020000A000000096C000__0000005D2FFFFB38 000000067F000040020000A000000096C000-000000067F000040020000A0000000970000__00000038E67ABFA0 000000067F000040020000A000000096C000-000000067F000040020000A0000000970000__0000003903F1CFE8 000000067F000040020000A000000096C000-000000067F000040020000A0000000970000__0000003B99F7F8A0 000000067F000040020000A000000096C000-000000067F000040020000A0000000970000__0000005D2FFFFB38 000000067F000040020000A0000000970000-000000067F000040020000A0000000974000__00000038E67ABFA0 000000067F000040020000A0000000970000-000000067F000040020000A0000000974000__0000003903F1CFE8 000000067F000040020000A0000000970000-000000067F000040020000A0000000974000__0000003B99F7F8A0 000000067F000040020000A0000000970000-000000067F000040020000A0000000974000__0000005D2FFFFB38 000000067F000040020000A000000097022A-000000067F000040020000A0000000978BFD__0000002D59DDCFE9-0000002E0985D9D9 000000067F000040020000A0000000974000-000000067F000040020000A0000000978000__00000038E67ABFA0 000000067F000040020000A0000000974000-000000067F000040020000A0000000978000__0000003903F1CFE8 000000067F000040020000A0000000974000-000000067F000040020000A0000000978000__0000003B99F7F8A0 000000067F000040020000A0000000974000-000000067F000040020000A0000000978000__0000005D2FFFFB38 000000067F000040020000A0000000978000-000000067F000040020000A000000097C000__00000038E67ABFA0 000000067F000040020000A0000000978000-000000067F000040020000A000000097C000__0000003903F1CFE8 000000067F000040020000A0000000978000-000000067F000040020000A000000097C000__0000003B99F7F8A0 000000067F000040020000A0000000978000-000000067F000040020000A000000097C000__0000005D2FFFFB38 
000000067F000040020000A0000000978BFD-000000067F000040020000A00000009815F8__0000002D59DDCFE9-0000002E0985D9D9 000000067F000040020000A000000097C000-000000067F000040020000A0000000980000__00000038E67ABFA0 000000067F000040020000A000000097C000-000000067F000040020000A0000000980000__0000003903F1CFE8 000000067F000040020000A000000097C000-000000067F000040020000A0000000980000__0000003B99F7F8A0 000000067F000040020000A000000097C000-000000067F000040020000A0000000980000__0000005D2FFFFB38 000000067F000040020000A0000000980000-000000067F000040020000A0000000984000__00000038E1ABFE28 000000067F000040020000A0000000980000-000000067F000040020000A0000000984000__00000038E9AF7F00 000000067F000040020000A0000000980000-000000067F000040020000A0000000984000__0000003903F1CFE8 000000067F000040020000A0000000980000-000000067F000040020000A0000000984000__0000003B99F7F8A0 000000067F000040020000A0000000980000-000000067F000040020000A0000000984000__0000005D2FFFFB38 000000067F000040020000A00000009815F8-000000067F000040020000A0000200000000__0000002D59DDCFE9-0000002E0985D9D9 000000067F000040020000A00000009817EC-000000067F000040020000A000000098A1CB__0000002E0985D9D9-0000002EB92DEAC1 000000067F000040020000A0000000984000-000000067F000040020000A0000000988000__00000038E1ABFE28 000000067F000040020000A0000000984000-000000067F000040020000A0000000988000__00000038E9AF7F00 000000067F000040020000A0000000984000-000000067F000040020000A0000000988000__0000003903F1CFE8 000000067F000040020000A0000000984000-000000067F000040020000A0000000988000__0000003B99F7F8A0 000000067F000040020000A0000000984000-000000067F000040020000A0000000988000__0000005D2FFFFB38 000000067F000040020000A0000000988000-000000067F000040020000A000000098C000__00000038E1ABFE28 000000067F000040020000A0000000988000-000000067F000040020000A000000098C000__00000038E9AF7F00 000000067F000040020000A0000000988000-000000067F000040020000A000000098C000__0000003903F1CFE8 000000067F000040020000A0000000988000-000000067F000040020000A000000098C000__0000003B99F7F8A0 
000000067F000040020000A0000000988000-000000067F000040020000A000000098C000__0000005D2FFFFB38 000000067F000040020000A000000098A1CB-000000067F000040020000A0000000992BA5__0000002E0985D9D9-0000002EB92DEAC1 000000067F000040020000A000000098C000-000000067F000040020000A0000000990000__00000038E1ABFE28 000000067F000040020000A000000098C000-000000067F000040020000A0000000990000__00000038E9AF7F00 000000067F000040020000A000000098C000-000000067F000040020000A0000000990000__0000003903F1CFE8 000000067F000040020000A000000098C000-000000067F000040020000A0000000990000__0000003B99F7F8A0 000000067F000040020000A000000098C000-000000067F000040020000A0000000990000__0000005D2FFFFB38 000000067F000040020000A0000000990000-000000067F000040020000A0000000994000__00000038E1ABFE28 000000067F000040020000A0000000990000-000000067F000040020000A0000000994000__00000038E9AF7F00 000000067F000040020000A0000000990000-000000067F000040020000A0000000994000__0000003903F1CFE8 000000067F000040020000A0000000990000-000000067F000040020000A0000000994000__0000003B99F7F8A0 000000067F000040020000A0000000990000-000000067F000040020000A0000000994000__0000005D2FFFFB38 000000067F000040020000A0000000992BA5-000000067F000040020000A000000099B589__0000002E0985D9D9-0000002EB92DEAC1 000000067F000040020000A0000000994000-000000067F000040020000A0000000998000__00000038E1ABFE28 000000067F000040020000A0000000994000-000000067F000040020000A0000000998000__00000038E9AF7F00 000000067F000040020000A0000000994000-000000067F000040020000A0000000998000__0000003903F1CFE8 000000067F000040020000A0000000994000-000000067F000040020000A0000000998000__0000003B99F7F8A0 000000067F000040020000A0000000994000-000000067F000040020000A0000000998000__0000005D2FFFFB38 000000067F000040020000A0000000998000-000000067F000040020000A000000099C000__00000038E1ABFE28 000000067F000040020000A0000000998000-000000067F000040020000A000000099C000__00000038E9AF7F00 000000067F000040020000A0000000998000-000000067F000040020000A000000099C000__0000003903F1CFE8 
000000067F000040020000A0000000998000-000000067F000040020000A000000099C000__0000003B99F7F8A0 000000067F000040020000A0000000998000-000000067F000040020000A000000099C000__0000005D2FFFFB38 000000067F000040020000A000000099B589-000000067F000040020000A00000009A3F65__0000002E0985D9D9-0000002EB92DEAC1 000000067F000040020000A000000099C000-000000067F000040020000A00000009A0000__00000038E1ABFE28 000000067F000040020000A000000099C000-000000067F000040020000A00000009A0000__00000038E9AF7F00 000000067F000040020000A000000099C000-000000067F000040020000A00000009A0000__0000003903F1CFE8 000000067F000040020000A000000099C000-000000067F000040020000A00000009A0000__0000003B99F7F8A0 000000067F000040020000A000000099C000-000000067F000040020000A00000009A0000__0000005D2FFFFB38 000000067F000040020000A00000009A0000-000000067F000040020000A00000009A4000__00000038E1ABFE28 000000067F000040020000A00000009A0000-000000067F000040020000A00000009A4000__00000038E9AF7F00 000000067F000040020000A00000009A0000-000000067F000040020000A00000009A4000__0000003903F1CFE8 000000067F000040020000A00000009A0000-000000067F000040020000A00000009A4000__0000003B99F7F8A0 000000067F000040020000A00000009A0000-000000067F000040020000A00000009A4000__0000005D2FFFFB38 000000067F000040020000A00000009A3F65-000000067F000040020000A00000009AC941__0000002E0985D9D9-0000002EB92DEAC1 000000067F000040020000A00000009A4000-000000067F000040020000A00000009A8000__00000038E1ABFE28 000000067F000040020000A00000009A4000-000000067F000040020000A00000009A8000__00000038E9AF7F00 000000067F000040020000A00000009A4000-000000067F000040020000A00000009A8000__0000003903F1CFE8 000000067F000040020000A00000009A4000-000000067F000040020000A00000009A8000__0000003B99F7F8A0 000000067F000040020000A00000009A4000-000000067F000040020000A00000009A8000__0000005D2FFFFB38 000000067F000040020000A00000009A8000-000000067F000040020000A00000009AC000__00000038E1ABFE28 000000067F000040020000A00000009A8000-000000067F000040020000A00000009AC000__00000038E9AF7F00 
000000067F000040020000A00000009A8000-000000067F000040020000A00000009AC000__0000003903F1CFE8 000000067F000040020000A00000009A8000-000000067F000040020000A00000009AC000__0000003B99F7F8A0 000000067F000040020000A00000009A8000-000000067F000040020000A00000009AC000__0000005D2FFFFB38 000000067F000040020000A00000009AC000-000000067F000040020000A00000009B0000__00000038E1ABFE28 000000067F000040020000A00000009AC000-000000067F000040020000A00000009B0000__00000038E9AF7F00 000000067F000040020000A00000009AC000-000000067F000040020000A00000009B0000__0000003903F1CFE8 000000067F000040020000A00000009AC000-000000067F000040020000A00000009B0000__0000003B99F7F8A0 000000067F000040020000A00000009AC000-000000067F000040020000A00000009B0000__0000005D2FFFFB38 000000067F000040020000A00000009AC941-000000067F000040020000A00000009B531B__0000002E0985D9D9-0000002EB92DEAC1 000000067F000040020000A00000009B0000-000000067F000040020000A00000009B4000__00000038E1ABFE28 000000067F000040020000A00000009B0000-000000067F000040020000A00000009B4000__00000038E9AF7F00 000000067F000040020000A00000009B0000-000000067F000040020000A00000009B4000__0000003903F1CFE8 000000067F000040020000A00000009B0000-000000067F000040020000A00000009B4000__0000003B99F7F8A0 000000067F000040020000A00000009B0000-000000067F000040020000A00000009B4000__0000005D2FFFFB38 000000067F000040020000A00000009B4000-000000067F000040020000A00000009B8000__00000038E1ABFE28 000000067F000040020000A00000009B4000-000000067F000040020000A00000009B8000__00000038E9AF7F00 000000067F000040020000A00000009B4000-000000067F000040020000A00000009B8000__0000003903F1CFE8 000000067F000040020000A00000009B4000-000000067F000040020000A00000009B8000__0000003B99F7F8A0 000000067F000040020000A00000009B4000-000000067F000040020000A00000009B8000__0000005D2FFFFB38 000000067F000040020000A00000009B531B-000000067F000040020000A00000009BDCFC__0000002E0985D9D9-0000002EB92DEAC1 000000067F000040020000A00000009B8000-000000067F000040020000A00000009BC000__00000038E1ABFE28 
000000067F000040020000A00000009B8000-000000067F000040020000A00000009BC000__00000038E9AF7F00 000000067F000040020000A00000009B8000-000000067F000040020000A00000009BC000__0000003903F1CFE8 000000067F000040020000A00000009B8000-000000067F000040020000A00000009BC000__0000003B99F7F8A0 000000067F000040020000A00000009B8000-000000067F000040020000A00000009BC000__0000005D2FFFFB38 000000067F000040020000A00000009BC000-000000067F000040020000A00000009C0000__00000038E1ABFE28 000000067F000040020000A00000009BC000-000000067F000040020000A00000009C0000__00000038E9AF7F00 000000067F000040020000A00000009BC000-000000067F000040020000A00000009C0000__0000003903F1CFE8 000000067F000040020000A00000009BC000-000000067F000040020000A00000009C0000__0000003B99F7F8A0 000000067F000040020000A00000009BC000-000000067F000040020000A00000009C0000__0000005D2FFFFB38 000000067F000040020000A00000009BDCFC-000000067F000040020000A00000009C66D1__0000002E0985D9D9-0000002EB92DEAC1 000000067F000040020000A00000009C0000-000000067F000040020000A00000009C4000__00000038E1ABFE28 000000067F000040020000A00000009C0000-000000067F000040020000A00000009C4000__00000038E9AF7F00 000000067F000040020000A00000009C0000-000000067F000040020000A00000009C4000__0000003903F1CFE8 000000067F000040020000A00000009C0000-000000067F000040020000A00000009C4000__0000003B99F7F8A0 000000067F000040020000A00000009C0000-000000067F000040020000A00000009C4000__0000005D2FFFFB38 000000067F000040020000A00000009C4000-000000067F000040020000A00000009C8000__00000038E1ABFE28 000000067F000040020000A00000009C4000-000000067F000040020000A00000009C8000__00000038E9AF7F00 000000067F000040020000A00000009C4000-000000067F000040020000A00000009C8000__0000003903F1CFE8 000000067F000040020000A00000009C4000-000000067F000040020000A00000009C8000__0000003B99F7F8A0 000000067F000040020000A00000009C4000-000000067F000040020000A00000009C8000__0000005D2FFFFB38 000000067F000040020000A00000009C66D1-000000067F000040020000A00000009CF0AC__0000002E0985D9D9-0000002EB92DEAC1 
000000067F000040020000A00000009C8000-000000067F000040020000A00000009CC000__00000038E1ABFE28 000000067F000040020000A00000009C8000-000000067F000040020000A00000009CC000__00000038E9AF7F00 000000067F000040020000A00000009C8000-000000067F000040020000A00000009CC000__0000003903F1CFE8 000000067F000040020000A00000009C8000-000000067F000040020000A00000009CC000__0000003B99F7F8A0 000000067F000040020000A00000009C8000-000000067F000040020000A00000009CC000__0000005D2FFFFB38 000000067F000040020000A00000009CC000-000000067F000040020000A00000009D0000__00000038E1ABFE28 000000067F000040020000A00000009CC000-000000067F000040020000A00000009D0000__00000038E9AF7F00 000000067F000040020000A00000009CC000-000000067F000040020000A00000009D0000__0000003903F1CFE8 000000067F000040020000A00000009CC000-000000067F000040020000A00000009D0000__0000003B99F7F8A0 000000067F000040020000A00000009CC000-000000067F000040020000A00000009D0000__0000005D2FFFFB38 000000067F000040020000A00000009CF0AC-000000067F000040020000A00000009D7A91__0000002E0985D9D9-0000002EB92DEAC1 000000067F000040020000A00000009D0000-000000067F000040020000A00000009D4000__00000038E1ABFE28 000000067F000040020000A00000009D0000-000000067F000040020000A00000009D4000__00000038E9AF7F00 000000067F000040020000A00000009D0000-000000067F000040020000A00000009D4000__0000003903F1CFE8 000000067F000040020000A00000009D0000-000000067F000040020000A00000009D4000__0000003B99F7F8A0 000000067F000040020000A00000009D0000-000000067F000040020000A00000009D4000__0000005D2FFFFB38 000000067F000040020000A00000009D4000-000000067F000040020000A00000009D8000__00000038E1ABFE28 000000067F000040020000A00000009D4000-000000067F000040020000A00000009D8000__00000038E9AF7F00 000000067F000040020000A00000009D4000-000000067F000040020000A00000009D8000__0000003903F1CFE8 000000067F000040020000A00000009D4000-000000067F000040020000A00000009D8000__0000003B99F7F8A0 000000067F000040020000A00000009D4000-000000067F000040020000A00000009D8000__0000005D2FFFFB38 
000000067F000040020000A00000009D7A91-000000067F000040020000A00000009E0464__0000002E0985D9D9-0000002EB92DEAC1 000000067F000040020000A00000009D8000-000000067F000040020000A00000009DC000__00000038E1ABFE28 000000067F000040020000A00000009D8000-000000067F000040020000A00000009DC000__00000038E9AF7F00 000000067F000040020000A00000009D8000-000000067F000040020000A00000009DC000__0000003903F1CFE8 000000067F000040020000A00000009D8000-000000067F000040020000A00000009DC000__0000003B99F7F8A0 000000067F000040020000A00000009D8000-000000067F000040020000A00000009DC000__0000005D2FFFFB38 000000067F000040020000A00000009DC000-000000067F000040020000A00000009E0000__00000038E1ABFE28 000000067F000040020000A00000009DC000-000000067F000040020000A00000009E0000__00000038E9AF7F00 000000067F000040020000A00000009DC000-000000067F000040020000A00000009E0000__0000003903F1CFE8 000000067F000040020000A00000009DC000-000000067F000040020000A00000009E0000__0000003B99F7F8A0 000000067F000040020000A00000009DC000-000000067F000040020000A00000009E0000__0000005D2FFFFB38 000000067F000040020000A00000009E0000-000000067F000040020000A00000009E4000__0000002F83FFFE68 000000067F000040020000A00000009E0000-000000067F000040020000A00000009E4000__00000038E9AF7F00 000000067F000040020000A00000009E0000-000000067F000040020000A00000009E4000__0000003903F1CFE8 000000067F000040020000A00000009E0000-000000067F000040020000A00000009E4000__0000003B99F7F8A0 000000067F000040020000A00000009E0000-000000067F000040020000A00000009E4000__0000005D2FFFFB38 000000067F000040020000A00000009E0464-000000067F000040020000A0000200000000__0000002E0985D9D9-0000002EB92DEAC1 000000067F000040020000A00000009E0707-000000067F000040020000A00000009E90D7__0000002EB92DEAC1-0000002F58DE5511 000000067F000040020000A00000009E4000-000000067F000040020000A00000009E8000__0000002F83FFFE68 000000067F000040020000A00000009E4000-000000067F000040020000A00000009E8000__00000038E9AF7F00 000000067F000040020000A00000009E4000-000000067F000040020000A00000009E8000__0000003903F1CFE8 
000000067F000040020000A00000009E4000-000000067F000040020000A00000009E8000__0000003B99F7F8A0 000000067F000040020000A00000009E4000-000000067F000040020000A00000009E8000__0000005D2FFFFB38 000000067F000040020000A00000009E8000-000000067F000040020000A00000009EC000__0000002F83FFFE68 000000067F000040020000A00000009E8000-000000067F000040020000A00000009EC000__00000038E9AF7F00 000000067F000040020000A00000009E8000-000000067F000040020000A00000009EC000__0000003903F1CFE8 000000067F000040020000A00000009E8000-000000067F000040020000A00000009EC000__0000003B99F7F8A0 000000067F000040020000A00000009E8000-000000067F000040020000A00000009EC000__0000005D2FFFFB38 000000067F000040020000A00000009E90D7-000000067F000040020000A00000009F1AB6__0000002EB92DEAC1-0000002F58DE5511 000000067F000040020000A00000009EC000-000000067F000040020000A00000009F0000__0000002F83FFFE68 000000067F000040020000A00000009EC000-000000067F000040020000A00000009F0000__00000038E9AF7F00 000000067F000040020000A00000009EC000-000000067F000040020000A00000009F0000__0000003903F1CFE8 000000067F000040020000A00000009EC000-000000067F000040020000A00000009F0000__0000003B99F7F8A0 000000067F000040020000A00000009EC000-000000067F000040020000A00000009F0000__0000005D2FFFFB38 000000067F000040020000A00000009F0000-000000067F000040020000A00000009F4000__0000002F83FFFE68 000000067F000040020000A00000009F0000-000000067F000040020000A00000009F4000__00000038E9AF7F00 000000067F000040020000A00000009F0000-000000067F000040020000A00000009F4000__0000003903F1CFE8 000000067F000040020000A00000009F0000-000000067F000040020000A00000009F4000__0000003B99F7F8A0 000000067F000040020000A00000009F0000-000000067F000040020000A00000009F4000__0000005D2FFFFB38 000000067F000040020000A00000009F1AB6-000000067F000040020000A00000009FA4A4__0000002EB92DEAC1-0000002F58DE5511 000000067F000040020000A00000009F4000-000000067F000040020000A00000009F8000__0000002F83FFFE68 000000067F000040020000A00000009F4000-000000067F000040020000A00000009F8000__00000038E9AF7F00 
000000067F000040020000A00000009F4000-000000067F000040020000A00000009F8000__0000003903F1CFE8 000000067F000040020000A00000009F4000-000000067F000040020000A00000009F8000__0000003B99F7F8A0 000000067F000040020000A00000009F4000-000000067F000040020000A00000009F8000__0000005D2FFFFB38 000000067F000040020000A00000009F8000-000000067F000040020000A00000009FC000__0000002F83FFFE68 000000067F000040020000A00000009F8000-000000067F000040020000A00000009FC000__00000038E9AF7F00 000000067F000040020000A00000009F8000-000000067F000040020000A00000009FC000__0000003903F1CFE8 000000067F000040020000A00000009F8000-000000067F000040020000A00000009FC000__0000003B99F7F8A0 000000067F000040020000A00000009F8000-000000067F000040020000A00000009FC000__0000005D2FFFFB38 000000067F000040020000A00000009FA4A4-000000067F000040020000A0000000A02E70__0000002EB92DEAC1-0000002F58DE5511 000000067F000040020000A00000009FC000-000000067F000040020000A0000000A00000__0000002F83FFFE68 000000067F000040020000A00000009FC000-000000067F000040020000A0000000A00000__00000038E9AF7F00 000000067F000040020000A00000009FC000-000000067F000040020000A0000000A00000__0000003903F1CFE8 000000067F000040020000A00000009FC000-000000067F000040020000A0000000A00000__0000003B99F7F8A0 000000067F000040020000A00000009FC000-000000067F000040020000A0000000A00000__0000005D2FFFFB38 000000067F000040020000A0000000A00000-000000067F000040020000A0000000A04000__0000002F83FFFE68 000000067F000040020000A0000000A00000-000000067F000040020000A0000000A04000__00000038E9AF7F00 000000067F000040020000A0000000A00000-000000067F000040020000A0000000A04000__0000003903F1CFE8 000000067F000040020000A0000000A00000-000000067F000040020000A0000000A04000__0000003B99F7F8A0 000000067F000040020000A0000000A00000-000000067F000040020000A0000000A04000__0000005D2FFFFB38 000000067F000040020000A0000000A02E70-000000067F000040020000A0000000A0B844__0000002EB92DEAC1-0000002F58DE5511 000000067F000040020000A0000000A04000-000000067F000040020000A0000000A08000__0000002F83FFFE68 
000000067F000040020000A0000000A04000-000000067F000040020000A0000000A08000__00000038E9AF7F00 000000067F000040020000A0000000A04000-000000067F000040020000A0000000A08000__0000003903F1CFE8 000000067F000040020000A0000000A04000-000000067F000040020000A0000000A08000__0000003B99F7F8A0 000000067F000040020000A0000000A04000-000000067F000040020000A0000000A08000__0000005D2FFFFB38 000000067F000040020000A0000000A08000-000000067F000040020000A0000000A0C000__0000002F83FFFE68 000000067F000040020000A0000000A08000-000000067F000040020000A0000000A0C000__00000038E9AF7F00 000000067F000040020000A0000000A08000-000000067F000040020000A0000000A0C000__0000003903F1CFE8 000000067F000040020000A0000000A08000-000000067F000040020000A0000000A0C000__0000003B99F7F8A0 000000067F000040020000A0000000A08000-000000067F000040020000A0000000A0C000__0000005D2FFFFB38 000000067F000040020000A0000000A0B844-000000067F000040020000A0000000A14223__0000002EB92DEAC1-0000002F58DE5511 000000067F000040020000A0000000A0C000-000000067F000040020000A0000000A10000__0000002F83FFFE68 000000067F000040020000A0000000A0C000-000000067F000040020000A0000000A10000__00000038E9AF7F00 000000067F000040020000A0000000A0C000-000000067F000040020000A0000000A10000__0000003903F1CFE8 000000067F000040020000A0000000A0C000-000000067F000040020000A0000000A10000__0000003B99F7F8A0 000000067F000040020000A0000000A0C000-000000067F000040020000A0000000A10000__0000005D2FFFFB38 000000067F000040020000A0000000A10000-000000067F000040020000A0000000A14000__0000002F83FFFE68 000000067F000040020000A0000000A10000-000000067F000040020000A0000000A14000__00000038E9AF7F00 000000067F000040020000A0000000A10000-000000067F000040020000A0000000A14000__0000003903F1CFE8 000000067F000040020000A0000000A10000-000000067F000040020000A0000000A14000__0000003B99F7F8A0 000000067F000040020000A0000000A10000-000000067F000040020000A0000000A14000__0000005D2FFFFB38 000000067F000040020000A0000000A14000-000000067F000040020000A0000000A18000__0000002F83FFFE68 
000000067F000040020000A0000000A14000-000000067F000040020000A0000000A18000__00000038E9AF7F00 000000067F000040020000A0000000A14000-000000067F000040020000A0000000A18000__0000003903F1CFE8 000000067F000040020000A0000000A14000-000000067F000040020000A0000000A18000__0000003B99F7F8A0 000000067F000040020000A0000000A14000-000000067F000040020000A0000000A18000__0000005D2FFFFB38 000000067F000040020000A0000000A14223-000000067F000040020000A0000000A1CBFC__0000002EB92DEAC1-0000002F58DE5511 000000067F000040020000A0000000A18000-000000067F000040020000A0000000A1C000__0000002F83FFFE68 000000067F000040020000A0000000A18000-000000067F000040020000A0000000A1C000__00000038E9AF7F00 000000067F000040020000A0000000A18000-000000067F000040020000A0000000A1C000__0000003903F1CFE8 000000067F000040020000A0000000A18000-000000067F000040020000A0000000A1C000__0000003B99F7F8A0 000000067F000040020000A0000000A18000-000000067F000040020000A0000000A1C000__0000005D2FFFFB38 000000067F000040020000A0000000A1C000-000000067F000040020000A0000000A20000__0000002F83FFFE68 000000067F000040020000A0000000A1C000-000000067F000040020000A0000000A20000__00000038E9AF7F00 000000067F000040020000A0000000A1C000-000000067F000040020000A0000000A20000__0000003903F1CFE8 000000067F000040020000A0000000A1C000-000000067F000040020000A0000000A20000__0000003B99F7F8A0 000000067F000040020000A0000000A1C000-000000067F000040020000A0000000A20000__0000005D2FFFFB38 000000067F000040020000A0000000A1CBFC-000000067F000040020000A0000000A255DB__0000002EB92DEAC1-0000002F58DE5511 000000067F000040020000A0000000A20000-000000067F000040020000A0000000A24000__0000002F83FFFE68 000000067F000040020000A0000000A20000-000000067F000040020000A0000000A24000__00000038E9AF7F00 000000067F000040020000A0000000A20000-000000067F000040020000A0000000A24000__0000003903F1CFE8 000000067F000040020000A0000000A20000-000000067F000040020000A0000000A24000__0000003B99F7F8A0 000000067F000040020000A0000000A20000-000000067F000040020000A0000000A24000__0000005D2FFFFB38 
000000067F000040020000A0000000A24000-000000067F000040020000A0000000A28000__0000002F83FFFE68 000000067F000040020000A0000000A24000-000000067F000040020000A0000000A28000__00000038E9AF7F00 000000067F000040020000A0000000A24000-000000067F000040020000A0000000A28000__0000003903F1CFE8 000000067F000040020000A0000000A24000-000000067F000040020000A0000000A28000__0000003B99F7F8A0 000000067F000040020000A0000000A24000-000000067F000040020000A0000000A28000__0000005D2FFFFB38 000000067F000040020000A0000000A255DB-000000067F000040020000A0000000A2DFCE__0000002EB92DEAC1-0000002F58DE5511 000000067F000040020000A0000000A28000-000000067F000040020000A0000000A2C000__0000002F83FFFE68 000000067F000040020000A0000000A28000-000000067F000040020000A0000000A2C000__00000038E9AF7F00 000000067F000040020000A0000000A28000-000000067F000040020000A0000000A2C000__0000003903F1CFE8 000000067F000040020000A0000000A28000-000000067F000040020000A0000000A2C000__0000003B99F7F8A0 000000067F000040020000A0000000A28000-000000067F000040020000A0000000A2C000__0000005D2FFFFB38 000000067F000040020000A0000000A2C000-000000067F000040020000A0000000A30000__0000002F83FFFE68 000000067F000040020000A0000000A2C000-000000067F000040020000A0000000A30000__00000038E9AF7F00 000000067F000040020000A0000000A2C000-000000067F000040020000A0000000A30000__0000003903F1CFE8 000000067F000040020000A0000000A2C000-000000067F000040020000A0000000A30000__0000003B99F7F8A0 000000067F000040020000A0000000A2C000-000000067F000040020000A0000000A30000__0000005D2FFFFB38 000000067F000040020000A0000000A2DFCE-000000067F000040020000A0000000A369B3__0000002EB92DEAC1-0000002F58DE5511 000000067F000040020000A0000000A30000-000000067F000040020000A0000000A34000__0000002F83FFFE68 000000067F000040020000A0000000A30000-000000067F000040020000A0000000A34000__00000038E9AF7F00 000000067F000040020000A0000000A30000-000000067F000040020000A0000000A34000__0000003903F1CFE8 000000067F000040020000A0000000A30000-000000067F000040020000A0000000A34000__0000003B99F7F8A0 
000000067F000040020000A0000000A30000-000000067F000040020000A0000000A34000__0000005D2FFFFB38 000000067F000040020000A0000000A34000-000000067F000040020000A0000000A38000__0000002F83FFFE68 000000067F000040020000A0000000A34000-000000067F000040020000A0000000A38000__00000038E67ABFA0 000000067F000040020000A0000000A34000-000000067F000040020000A0000000A38000__0000003903F1CFE8 000000067F000040020000A0000000A34000-000000067F000040020000A0000000A38000__0000003B99F7F8A0 000000067F000040020000A0000000A34000-000000067F000040020000A0000000A38000__0000005D2FFFFB38 000000067F000040020000A0000000A369B3-000000067F000040020000A0000200000000__0000002EB92DEAC1-0000002F58DE5511 000000067F000040020000A0000000A36B5B-000000067F000040020000A0000000A3F527__0000002F58DE5511-000000300885D069 000000067F000040020000A0000000A38000-000000067F000040020000A0000000A3C000__0000002F83FFFE68 000000067F000040020000A0000000A38000-000000067F000040020000A0000000A3C000__00000038E67ABFA0 000000067F000040020000A0000000A38000-000000067F000040020000A0000000A3C000__0000003903F1CFE8 000000067F000040020000A0000000A38000-000000067F000040020000A0000000A3C000__0000003B99F7F8A0 000000067F000040020000A0000000A38000-000000067F000040020000A0000000A3C000__0000005D2FFFFB38 000000067F000040020000A0000000A3C000-000000067F000040020000A0000000A40000__0000002F83FFFE68 000000067F000040020000A0000000A3C000-000000067F000040020000A0000000A40000__00000038E67ABFA0 000000067F000040020000A0000000A3C000-000000067F000040020000A0000000A40000__0000003903F1CFE8 000000067F000040020000A0000000A3C000-000000067F000040020000A0000000A40000__0000003B99F7F8A0 000000067F000040020000A0000000A3C000-000000067F000040020000A0000000A40000__0000005D2FFFFB38 000000067F000040020000A0000000A3F527-000000067F000040020000A0000000A47EFA__0000002F58DE5511-000000300885D069 000000067F000040020000A0000000A40000-000000067F000040020000A0000000A44000__0000002F83FFFE68 000000067F000040020000A0000000A40000-000000067F000040020000A0000000A44000__00000038E67ABFA0 
000000067F000040020000A0000000A40000-000000067F000040020000A0000000A44000__0000003903F1CFE8 000000067F000040020000A0000000A40000-000000067F000040020000A0000000A44000__0000003B99F7F8A0 000000067F000040020000A0000000A40000-000000067F000040020000A0000000A44000__0000005D2FFFFB38 000000067F000040020000A0000000A44000-000000067F000040020000A0000000A48000__0000002F83FFFE68 000000067F000040020000A0000000A44000-000000067F000040020000A0000000A48000__00000038E67ABFA0 000000067F000040020000A0000000A44000-000000067F000040020000A0000000A48000__0000003903F1CFE8 000000067F000040020000A0000000A44000-000000067F000040020000A0000000A48000__0000003B99F7F8A0 000000067F000040020000A0000000A44000-000000067F000040020000A0000000A48000__0000005D2FFFFB38 000000067F000040020000A0000000A47EFA-000000067F000040020000A0000000A508E3__0000002F58DE5511-000000300885D069 000000067F000040020000A0000000A48000-000000067F000040020000A0000000A4C000__0000002F83FFFE68 000000067F000040020000A0000000A48000-000000067F000040020000A0000000A4C000__00000038E67ABFA0 000000067F000040020000A0000000A48000-000000067F000040020000A0000000A4C000__0000003903F1CFE8 000000067F000040020000A0000000A48000-000000067F000040020000A0000000A4C000__0000003B99F7F8A0 000000067F000040020000A0000000A48000-000000067F000040020000A0000000A4C000__0000005D2FFFFB38 000000067F000040020000A0000000A4C000-000000067F000040020000A0000000A50000__00000038E67ABFA0 000000067F000040020000A0000000A4C000-000000067F000040020000A0000000A50000__0000003903F1CFE8 000000067F000040020000A0000000A4C000-000000067F000040020000A0000000A50000__0000003B99F7F8A0 000000067F000040020000A0000000A4C000-000000067F000040020000A0000000A50000__0000005D2FFFFB38 000000067F000040020000A0000000A4C000-030000000000000000000000000000000002__0000002F83FFFE68 000000067F000040020000A0000000A50000-000000067F000040020000A0000000A54000__00000038E67ABFA0 000000067F000040020000A0000000A50000-000000067F000040020000A0000000A54000__0000003903F1CFE8 
000000067F000040020000A0000000A50000-000000067F000040020000A0000000A54000__0000003B99F7F8A0 000000067F000040020000A0000000A50000-000000067F000040020000A0000000A54000__0000005D2FFFFB38 000000067F000040020000A0000000A508E3-000000067F000040020000A0000000A592C6__0000002F58DE5511-000000300885D069 000000067F000040020000A0000000A54000-000000067F000040020000A0000000A58000__00000038E67ABFA0 000000067F000040020000A0000000A54000-000000067F000040020000A0000000A58000__0000003903F1CFE8 000000067F000040020000A0000000A54000-000000067F000040020000A0000000A58000__0000003B99F7F8A0 000000067F000040020000A0000000A54000-000000067F000040020000A0000000A58000__0000005D2FFFFB38 000000067F000040020000A0000000A58000-000000067F000040020000A0000000A5C000__00000038E67ABFA0 000000067F000040020000A0000000A58000-000000067F000040020000A0000000A5C000__0000003903F1CFE8 000000067F000040020000A0000000A58000-000000067F000040020000A0000000A5C000__0000003B99F7F8A0 000000067F000040020000A0000000A58000-000000067F000040020000A0000000A5C000__0000005D2FFFFB38 000000067F000040020000A0000000A592C6-000000067F000040020000A0000000A61CA5__0000002F58DE5511-000000300885D069 000000067F000040020000A0000000A5C000-000000067F000040020000A0000000A60000__00000038E67ABFA0 000000067F000040020000A0000000A5C000-000000067F000040020000A0000000A60000__0000003903F1CFE8 000000067F000040020000A0000000A5C000-000000067F000040020000A0000000A60000__0000003B99F7F8A0 000000067F000040020000A0000000A5C000-000000067F000040020000A0000000A60000__0000005D2FFFFB38 000000067F000040020000A0000000A60000-000000067F000040020000A0000000A64000__00000038E67ABFA0 000000067F000040020000A0000000A60000-000000067F000040020000A0000000A64000__0000003903F1CFE8 000000067F000040020000A0000000A60000-000000067F000040020000A0000000A64000__0000003B99F7F8A0 000000067F000040020000A0000000A60000-000000067F000040020000A0000000A64000__0000005D2FFFFB38 000000067F000040020000A0000000A61CA5-000000067F000040020000A0000000A6A68D__0000002F58DE5511-000000300885D069 
000000067F000040020000A0000000A64000-000000067F000040020000A0000000A68000__00000038E67ABFA0 000000067F000040020000A0000000A64000-000000067F000040020000A0000000A68000__0000003903F1CFE8 000000067F000040020000A0000000A64000-000000067F000040020000A0000000A68000__0000003B99F7F8A0 000000067F000040020000A0000000A64000-000000067F000040020000A0000000A68000__0000005D2FFFFB38 000000067F000040020000A0000000A68000-000000067F000040020000A0000000A6C000__00000038E67ABFA0 000000067F000040020000A0000000A68000-000000067F000040020000A0000000A6C000__0000003903F1CFE8 000000067F000040020000A0000000A68000-000000067F000040020000A0000000A6C000__0000003B99F7F8A0 000000067F000040020000A0000000A68000-000000067F000040020000A0000000A6C000__0000005D2FFFFB38 000000067F000040020000A0000000A6A68D-000000067F000040020000A0000000A73072__0000002F58DE5511-000000300885D069 000000067F000040020000A0000000A6C000-000000067F000040020000A0000000A70000__00000038E67ABFA0 000000067F000040020000A0000000A6C000-000000067F000040020000A0000000A70000__0000003903F1CFE8 000000067F000040020000A0000000A6C000-000000067F000040020000A0000000A70000__0000003B99F7F8A0 000000067F000040020000A0000000A6C000-000000067F000040020000A0000000A70000__0000005D2FFFFB38 000000067F000040020000A0000000A70000-000000067F000040020000A0000000A74000__00000038E67ABFA0 000000067F000040020000A0000000A70000-000000067F000040020000A0000000A74000__0000003903F1CFE8 000000067F000040020000A0000000A70000-000000067F000040020000A0000000A74000__0000003B99F7F8A0 000000067F000040020000A0000000A70000-000000067F000040020000A0000000A74000__0000005D2FFFFB38 000000067F000040020000A0000000A73072-000000067F000040020000A0000000A7BA4E__0000002F58DE5511-000000300885D069 000000067F000040020000A0000000A74000-000000067F000040020000A0000000A78000__00000038E67ABFA0 000000067F000040020000A0000000A74000-000000067F000040020000A0000000A78000__0000003903F1CFE8 000000067F000040020000A0000000A74000-000000067F000040020000A0000000A78000__0000003B99F7F8A0 
000000067F000040020000A0000000A74000-000000067F000040020000A0000000A78000__0000005D2FFFFB38 000000067F000040020000A0000000A78000-000000067F000040020000A0000000A7C000__00000038E67ABFA0 000000067F000040020000A0000000A78000-000000067F000040020000A0000000A7C000__0000003903F1CFE8 000000067F000040020000A0000000A78000-000000067F000040020000A0000000A7C000__0000003B99F7F8A0 000000067F000040020000A0000000A78000-000000067F000040020000A0000000A7C000__0000005D2FFFFB38 000000067F000040020000A0000000A7BA4E-000000067F000040020000A0000000A84426__0000002F58DE5511-000000300885D069 000000067F000040020000A0000000A7C000-000000067F000040020000A0000000A80000__00000038E67ABFA0 000000067F000040020000A0000000A7C000-000000067F000040020000A0000000A80000__0000003903F1CFE8 000000067F000040020000A0000000A7C000-000000067F000040020000A0000000A80000__0000003B99F7F8A0 000000067F000040020000A0000000A7C000-000000067F000040020000A0000000A80000__0000005D2FFFFB38 000000067F000040020000A0000000A80000-000000067F000040020000A0000000A84000__00000038E67ABFA0 000000067F000040020000A0000000A80000-000000067F000040020000A0000000A84000__0000003903F1CFE8 000000067F000040020000A0000000A80000-000000067F000040020000A0000000A84000__0000003B99F7F8A0 000000067F000040020000A0000000A80000-000000067F000040020000A0000000A84000__0000005D2FFFFB38 000000067F000040020000A0000000A84000-000000067F000040020000A0000000A88000__00000038E67ABFA0 000000067F000040020000A0000000A84000-000000067F000040020000A0000000A88000__0000003903F1CFE8 000000067F000040020000A0000000A84000-000000067F000040020000A0000000A88000__0000003B99F7F8A0 000000067F000040020000A0000000A84000-000000067F000040020000A0000000A88000__0000005D2FFFFB38 000000067F000040020000A0000000A84426-000000067F000040020000A0000000A8CDF4__0000002F58DE5511-000000300885D069 000000067F000040020000A0000000A88000-000000067F000040020000A0000000A8C000__00000038E67ABFA0 000000067F000040020000A0000000A88000-000000067F000040020000A0000000A8C000__0000003903F1CFE8 
000000067F000040020000A0000000A88000-000000067F000040020000A0000000A8C000__0000003B99F7F8A0 000000067F000040020000A0000000A88000-000000067F000040020000A0000000A8C000__0000005D2FFFFB38 000000067F000040020000A0000000A8C000-000000067F000040020000A0000000A90000__00000038E67ABFA0 000000067F000040020000A0000000A8C000-000000067F000040020000A0000000A90000__0000003903F1CFE8 000000067F000040020000A0000000A8C000-000000067F000040020000A0000000A90000__0000003B99F7F8A0 000000067F000040020000A0000000A8C000-000000067F000040020000A0000000A90000__0000005D2FFFFB38 000000067F000040020000A0000000A8CDF4-000000067F000040020000A0000000A957D8__0000002F58DE5511-000000300885D069 000000067F000040020000A0000000A90000-000000067F000040020000A0000000A94000__00000038E67ABFA0 000000067F000040020000A0000000A90000-000000067F000040020000A0000000A94000__0000003903F1CFE8 000000067F000040020000A0000000A90000-000000067F000040020000A0000000A94000__0000003B99F7F8A0 000000067F000040020000A0000000A90000-000000067F000040020000A0000000A94000__0000005D2FFFFB38 000000067F000040020000A0000000A94000-000000067F000040020000A0000000A98000__00000038E1ABFE28 000000067F000040020000A0000000A94000-000000067F000040020000A0000000A98000__00000038E9AF7F00 000000067F000040020000A0000000A94000-000000067F000040020000A0000000A98000__0000003903F1CFE8 000000067F000040020000A0000000A94000-000000067F000040020000A0000000A98000__0000003B99F7F8A0 000000067F000040020000A0000000A94000-000000067F000040020000A0000000A98000__0000005D2FFFFB38 000000067F000040020000A0000000A957D8-000000067F000040020000A0000200000000__0000002F58DE5511-000000300885D069 000000067F000040020000A0000000A95A7F-000000067F000040020000A0000000A9E45C__000000300885D069-00000030B82DF289 000000067F000040020000A0000000A98000-000000067F000040020000A0000000A9C000__00000038E1ABFE28 000000067F000040020000A0000000A98000-000000067F000040020000A0000000A9C000__00000038E9AF7F00 000000067F000040020000A0000000A98000-000000067F000040020000A0000000A9C000__0000003903F1CFE8 
000000067F000040020000A0000000A98000-000000067F000040020000A0000000A9C000__0000003B99F7F8A0 000000067F000040020000A0000000A98000-000000067F000040020000A0000000A9C000__0000005D2FFFFB38 000000067F000040020000A0000000A9C000-000000067F000040020000A0000000AA0000__00000038E1ABFE28 000000067F000040020000A0000000A9C000-000000067F000040020000A0000000AA0000__00000038E9AF7F00 000000067F000040020000A0000000A9C000-000000067F000040020000A0000000AA0000__0000003903F1CFE8 000000067F000040020000A0000000A9C000-000000067F000040020000A0000000AA0000__0000003B99F7F8A0 000000067F000040020000A0000000A9C000-000000067F000040020000A0000000AA0000__0000005D2FFFFB38 000000067F000040020000A0000000A9E45C-000000067F000040020000A0000000AA6E3F__000000300885D069-00000030B82DF289 000000067F000040020000A0000000AA0000-000000067F000040020000A0000000AA4000__00000038E1ABFE28 000000067F000040020000A0000000AA0000-000000067F000040020000A0000000AA4000__00000038E9AF7F00 000000067F000040020000A0000000AA0000-000000067F000040020000A0000000AA4000__0000003903F1CFE8 000000067F000040020000A0000000AA0000-000000067F000040020000A0000000AA4000__0000003B99F7F8A0 000000067F000040020000A0000000AA0000-000000067F000040020000A0000000AA4000__0000005D2FFFFB38 000000067F000040020000A0000000AA4000-000000067F000040020000A0000000AA8000__00000038E1ABFE28 000000067F000040020000A0000000AA4000-000000067F000040020000A0000000AA8000__00000038E9AF7F00 000000067F000040020000A0000000AA4000-000000067F000040020000A0000000AA8000__0000003903F1CFE8 000000067F000040020000A0000000AA4000-000000067F000040020000A0000000AA8000__0000003B99F7F8A0 000000067F000040020000A0000000AA4000-000000067F000040020000A0000000AA8000__0000005D2FFFFB38 000000067F000040020000A0000000AA6E3F-000000067F000040020000A0000000AAF81B__000000300885D069-00000030B82DF289 000000067F000040020000A0000000AA8000-000000067F000040020000A0000000AAC000__00000038E1ABFE28 000000067F000040020000A0000000AA8000-000000067F000040020000A0000000AAC000__00000038E9AF7F00 
000000067F000040020000A0000000AA8000-000000067F000040020000A0000000AAC000__0000003903F1CFE8 000000067F000040020000A0000000AA8000-000000067F000040020000A0000000AAC000__0000003B99F7F8A0 000000067F000040020000A0000000AA8000-000000067F000040020000A0000000AAC000__0000005D2FFFFB38 000000067F000040020000A0000000AAC000-000000067F000040020000A0000000AB0000__00000038E1ABFE28 000000067F000040020000A0000000AAC000-000000067F000040020000A0000000AB0000__00000038E9AF7F00 000000067F000040020000A0000000AAC000-000000067F000040020000A0000000AB0000__0000003903F1CFE8 000000067F000040020000A0000000AAC000-000000067F000040020000A0000000AB0000__0000003B99F7F8A0 000000067F000040020000A0000000AAC000-000000067F000040020000A0000000AB0000__0000005D2FFFFB38 000000067F000040020000A0000000AAF81B-000000067F000040020000A0000000AB81F8__000000300885D069-00000030B82DF289 000000067F000040020000A0000000AB0000-000000067F000040020000A0000000AB4000__00000038E1ABFE28 000000067F000040020000A0000000AB0000-000000067F000040020000A0000000AB4000__00000038E9AF7F00 000000067F000040020000A0000000AB0000-000000067F000040020000A0000000AB4000__0000003903F1CFE8 000000067F000040020000A0000000AB0000-000000067F000040020000A0000000AB4000__0000003B99F7F8A0 000000067F000040020000A0000000AB0000-000000067F000040020000A0000000AB4000__0000005D2FFFFB38 000000067F000040020000A0000000AB4000-000000067F000040020000A0000000AB8000__00000038E1ABFE28 000000067F000040020000A0000000AB4000-000000067F000040020000A0000000AB8000__00000038E9AF7F00 000000067F000040020000A0000000AB4000-000000067F000040020000A0000000AB8000__0000003903F1CFE8 000000067F000040020000A0000000AB4000-000000067F000040020000A0000000AB8000__0000003B99F7F8A0 000000067F000040020000A0000000AB4000-000000067F000040020000A0000000AB8000__0000005D2FFFFB38 000000067F000040020000A0000000AB8000-000000067F000040020000A0000000ABC000__00000038E1ABFE28 000000067F000040020000A0000000AB8000-000000067F000040020000A0000000ABC000__00000038E9AF7F00 
000000067F000040020000A0000000AB8000-000000067F000040020000A0000000ABC000__0000003903F1CFE8 000000067F000040020000A0000000AB8000-000000067F000040020000A0000000ABC000__0000003B99F7F8A0 000000067F000040020000A0000000AB8000-000000067F000040020000A0000000ABC000__0000005D2FFFFB38 000000067F000040020000A0000000AB81F8-000000067F000040020000A0000000AC0BE2__000000300885D069-00000030B82DF289 000000067F000040020000A0000000ABC000-000000067F000040020000A0000000AC0000__00000038E1ABFE28 000000067F000040020000A0000000ABC000-000000067F000040020000A0000000AC0000__00000038E9AF7F00 000000067F000040020000A0000000ABC000-000000067F000040020000A0000000AC0000__0000003903F1CFE8 000000067F000040020000A0000000ABC000-000000067F000040020000A0000000AC0000__0000003B99F7F8A0 000000067F000040020000A0000000ABC000-000000067F000040020000A0000000AC0000__0000005D2FFFFB38 000000067F000040020000A0000000AC0000-000000067F000040020000A0000000AC4000__00000038E1ABFE28 000000067F000040020000A0000000AC0000-000000067F000040020000A0000000AC4000__00000038E9AF7F00 000000067F000040020000A0000000AC0000-000000067F000040020000A0000000AC4000__0000003903F1CFE8 000000067F000040020000A0000000AC0000-000000067F000040020000A0000000AC4000__0000003B99F7F8A0 000000067F000040020000A0000000AC0000-000000067F000040020000A0000000AC4000__0000005D2FFFFB38 000000067F000040020000A0000000AC0BE2-000000067F000040020000A0000000AC95C0__000000300885D069-00000030B82DF289 000000067F000040020000A0000000AC4000-000000067F000040020000A0000000AC8000__00000038E1ABFE28 000000067F000040020000A0000000AC4000-000000067F000040020000A0000000AC8000__00000038E9AF7F00 000000067F000040020000A0000000AC4000-000000067F000040020000A0000000AC8000__0000003903F1CFE8 000000067F000040020000A0000000AC4000-000000067F000040020000A0000000AC8000__0000003B99F7F8A0 000000067F000040020000A0000000AC4000-000000067F000040020000A0000000AC8000__0000005D2FFFFB38 000000067F000040020000A0000000AC8000-000000067F000040020000A0000000ACC000__00000038E1ABFE28 
000000067F000040020000A0000000AC8000-000000067F000040020000A0000000ACC000__00000038E9AF7F00 000000067F000040020000A0000000AC8000-000000067F000040020000A0000000ACC000__0000003903F1CFE8 000000067F000040020000A0000000AC8000-000000067F000040020000A0000000ACC000__0000003B99F7F8A0 000000067F000040020000A0000000AC8000-000000067F000040020000A0000000ACC000__0000005D2FFFFB38 000000067F000040020000A0000000AC95C0-000000067F000040020000A0000000AD1F9F__000000300885D069-00000030B82DF289 000000067F000040020000A0000000ACC000-000000067F000040020000A0000000AD0000__00000038E1ABFE28 000000067F000040020000A0000000ACC000-000000067F000040020000A0000000AD0000__00000038E9AF7F00 000000067F000040020000A0000000ACC000-000000067F000040020000A0000000AD0000__0000003903F1CFE8 000000067F000040020000A0000000ACC000-000000067F000040020000A0000000AD0000__0000003B99F7F8A0 000000067F000040020000A0000000ACC000-000000067F000040020000A0000000AD0000__0000005D2FFFFB38 000000067F000040020000A0000000AD0000-000000067F000040020000A0000000AD4000__00000038E1ABFE28 000000067F000040020000A0000000AD0000-000000067F000040020000A0000000AD4000__00000038E9AF7F00 000000067F000040020000A0000000AD0000-000000067F000040020000A0000000AD4000__0000003903F1CFE8 000000067F000040020000A0000000AD0000-000000067F000040020000A0000000AD4000__0000003B99F7F8A0 000000067F000040020000A0000000AD0000-000000067F000040020000A0000000AD4000__0000005D2FFFFB38 000000067F000040020000A0000000AD1F9F-000000067F000040020000A0000000ADA983__000000300885D069-00000030B82DF289 000000067F000040020000A0000000AD4000-000000067F000040020000A0000000AD8000__00000038E1ABFE28 000000067F000040020000A0000000AD4000-000000067F000040020000A0000000AD8000__00000038E9AF7F00 000000067F000040020000A0000000AD4000-000000067F000040020000A0000000AD8000__0000003903F1CFE8 000000067F000040020000A0000000AD4000-000000067F000040020000A0000000AD8000__0000003B99F7F8A0 000000067F000040020000A0000000AD4000-000000067F000040020000A0000000AD8000__0000005D2FFFFB38 
000000067F000040020000A0000000AD8000-000000067F000040020000A0000000ADC000__00000038E1ABFE28 000000067F000040020000A0000000AD8000-000000067F000040020000A0000000ADC000__00000038E9AF7F00 000000067F000040020000A0000000AD8000-000000067F000040020000A0000000ADC000__0000003903F1CFE8 000000067F000040020000A0000000AD8000-000000067F000040020000A0000000ADC000__0000003B99F7F8A0 000000067F000040020000A0000000AD8000-000000067F000040020000A0000000ADC000__0000005D2FFFFB38 000000067F000040020000A0000000ADA983-000000067F000040020000A0000000AE3365__000000300885D069-00000030B82DF289 000000067F000040020000A0000000ADC000-000000067F000040020000A0000000AE0000__00000038E1ABFE28 000000067F000040020000A0000000ADC000-000000067F000040020000A0000000AE0000__00000038E9AF7F00 000000067F000040020000A0000000ADC000-000000067F000040020000A0000000AE0000__0000003903F1CFE8 000000067F000040020000A0000000ADC000-000000067F000040020000A0000000AE0000__0000003B99F7F8A0 000000067F000040020000A0000000ADC000-000000067F000040020000A0000000AE0000__0000005D2FFFFB38 000000067F000040020000A0000000AE0000-000000067F000040020000A0000000AE4000__00000038E1ABFE28 000000067F000040020000A0000000AE0000-000000067F000040020000A0000000AE4000__00000038E9AF7F00 000000067F000040020000A0000000AE0000-000000067F000040020000A0000000AE4000__0000003903F1CFE8 000000067F000040020000A0000000AE0000-000000067F000040020000A0000000AE4000__0000003B99F7F8A0 000000067F000040020000A0000000AE0000-000000067F000040020000A0000000AE4000__0000005D2FFFFB38 000000067F000040020000A0000000AE3365-000000067F000040020000A0000000AEBD39__000000300885D069-00000030B82DF289 000000067F000040020000A0000000AE4000-000000067F000040020000A0000000AE8000__00000038E1ABFE28 000000067F000040020000A0000000AE4000-000000067F000040020000A0000000AE8000__00000038E9AF7F00 000000067F000040020000A0000000AE4000-000000067F000040020000A0000000AE8000__0000003903F1CFE8 000000067F000040020000A0000000AE4000-000000067F000040020000A0000000AE8000__0000003B99F7F8A0 
000000067F000040020000A0000000AE4000-000000067F000040020000A0000000AE8000__0000005D2FFFFB38 000000067F000040020000A0000000AE8000-000000067F000040020000A0000000AEC000__00000038E1ABFE28 000000067F000040020000A0000000AE8000-000000067F000040020000A0000000AEC000__00000038E9AF7F00 000000067F000040020000A0000000AE8000-000000067F000040020000A0000000AEC000__0000003903F1CFE8 000000067F000040020000A0000000AE8000-000000067F000040020000A0000000AEC000__0000003B99F7F8A0 000000067F000040020000A0000000AE8000-000000067F000040020000A0000000AEC000__0000005D2FFFFB38 000000067F000040020000A0000000AEBD39-000000067F000040020000A0000000AF4712__000000300885D069-00000030B82DF289 000000067F000040020000A0000000AEC000-000000067F000040020000A0000000AF0000__00000038E1ABFE28 000000067F000040020000A0000000AEC000-000000067F000040020000A0000000AF0000__00000038E9AF7F00 000000067F000040020000A0000000AEC000-000000067F000040020000A0000000AF0000__0000003903F1CFE8 000000067F000040020000A0000000AEC000-000000067F000040020000A0000000AF0000__0000003B99F7F8A0 000000067F000040020000A0000000AEC000-000000067F000040020000A0000000AF0000__0000005D2FFFFB38 000000067F000040020000A0000000AF0000-000000067F000040020000A0000000AF4000__00000038E1ABFE28 000000067F000040020000A0000000AF0000-000000067F000040020000A0000000AF4000__00000038E9AF7F00 000000067F000040020000A0000000AF0000-000000067F000040020000A0000000AF4000__0000003903F1CFE8 000000067F000040020000A0000000AF0000-000000067F000040020000A0000000AF4000__0000003B99F7F8A0 000000067F000040020000A0000000AF0000-000000067F000040020000A0000000AF4000__0000005D2FFFFB38 000000067F000040020000A0000000AF4000-000000067F000040020000A0000000AF8000__00000031853FEA98 000000067F000040020000A0000000AF4000-000000067F000040020000A0000000AF8000__00000038E9AF7F00 000000067F000040020000A0000000AF4000-000000067F000040020000A0000000AF8000__0000003903F1CFE8 000000067F000040020000A0000000AF4000-000000067F000040020000A0000000AF8000__0000003B99F7F8A0 
000000067F000040020000A0000000AF4000-000000067F000040020000A0000000AF8000__0000005D2FFFFB38 000000067F000040020000A0000000AF4712-000000067F000040020000A0000200000000__000000300885D069-00000030B82DF289 000000067F000040020000A0000000AF4908-000000067F000040020000A0000000AFD2DF__00000030B82DF289-0000003157DDD551 000000067F000040020000A0000000AF8000-000000067F000040020000A0000000AFC000__00000031853FEA98 000000067F000040020000A0000000AF8000-000000067F000040020000A0000000AFC000__00000038E9AF7F00 000000067F000040020000A0000000AF8000-000000067F000040020000A0000000AFC000__0000003903F1CFE8 000000067F000040020000A0000000AF8000-000000067F000040020000A0000000AFC000__0000003B99F7F8A0 000000067F000040020000A0000000AF8000-000000067F000040020000A0000000AFC000__0000005D2FFFFB38 000000067F000040020000A0000000AFC000-000000067F000040020000A0000000B00000__00000031853FEA98 000000067F000040020000A0000000AFC000-000000067F000040020000A0000000B00000__00000038E9AF7F00 000000067F000040020000A0000000AFC000-000000067F000040020000A0000000B00000__0000003903F1CFE8 000000067F000040020000A0000000AFC000-000000067F000040020000A0000000B00000__0000003B99F7F8A0 000000067F000040020000A0000000AFC000-000000067F000040020000A0000000B00000__0000005D2FFFFB38 000000067F000040020000A0000000AFD2DF-000000067F000040020000A0000000B05CBB__00000030B82DF289-0000003157DDD551 000000067F000040020000A0000000B00000-000000067F000040020000A0000000B04000__00000031853FEA98 000000067F000040020000A0000000B00000-000000067F000040020000A0000000B04000__00000038E9AF7F00 000000067F000040020000A0000000B00000-000000067F000040020000A0000000B04000__0000003903F1CFE8 000000067F000040020000A0000000B00000-000000067F000040020000A0000000B04000__0000003B99F7F8A0 000000067F000040020000A0000000B00000-000000067F000040020000A0000000B04000__0000005D2FFFFB38 000000067F000040020000A0000000B04000-000000067F000040020000A0000000B08000__00000031853FEA98 000000067F000040020000A0000000B04000-000000067F000040020000A0000000B08000__00000038E9AF7F00 
000000067F000040020000A0000000B04000-000000067F000040020000A0000000B08000__0000003903F1CFE8 000000067F000040020000A0000000B04000-000000067F000040020000A0000000B08000__0000003B99F7F8A0 000000067F000040020000A0000000B04000-000000067F000040020000A0000000B08000__0000005D2FFFFB38 000000067F000040020000A0000000B05CBB-000000067F000040020000A0000000B0E6A0__00000030B82DF289-0000003157DDD551 000000067F000040020000A0000000B08000-000000067F000040020000A0000000B0C000__00000031853FEA98 000000067F000040020000A0000000B08000-000000067F000040020000A0000000B0C000__00000038E9AF7F00 000000067F000040020000A0000000B08000-000000067F000040020000A0000000B0C000__0000003903F1CFE8 000000067F000040020000A0000000B08000-000000067F000040020000A0000000B0C000__0000003B99F7F8A0 000000067F000040020000A0000000B08000-000000067F000040020000A0000000B0C000__0000005D2FFFFB38 000000067F000040020000A0000000B0C000-000000067F000040020000A0000000B10000__00000031853FEA98 000000067F000040020000A0000000B0C000-000000067F000040020000A0000000B10000__00000038E9AF7F00 000000067F000040020000A0000000B0C000-000000067F000040020000A0000000B10000__0000003903F1CFE8 000000067F000040020000A0000000B0C000-000000067F000040020000A0000000B10000__0000003B99F7F8A0 000000067F000040020000A0000000B0C000-000000067F000040020000A0000000B10000__0000005D2FFFFB38 000000067F000040020000A0000000B0E6A0-000000067F000040020000A0000000B1707D__00000030B82DF289-0000003157DDD551 000000067F000040020000A0000000B10000-000000067F000040020000A0000000B14000__00000031853FEA98 000000067F000040020000A0000000B10000-000000067F000040020000A0000000B14000__00000038E9AF7F00 000000067F000040020000A0000000B10000-000000067F000040020000A0000000B14000__0000003903F1CFE8 000000067F000040020000A0000000B10000-000000067F000040020000A0000000B14000__0000003B99F7F8A0 000000067F000040020000A0000000B10000-000000067F000040020000A0000000B14000__0000005D2FFFFB38 000000067F000040020000A0000000B14000-000000067F000040020000A0000000B18000__00000031853FEA98 
000000067F000040020000A0000000B14000-000000067F000040020000A0000000B18000__00000038E9AF7F00 000000067F000040020000A0000000B14000-000000067F000040020000A0000000B18000__0000003903F1CFE8 000000067F000040020000A0000000B14000-000000067F000040020000A0000000B18000__0000003B99F7F8A0 000000067F000040020000A0000000B14000-000000067F000040020000A0000000B18000__0000005D2FFFFB38 000000067F000040020000A0000000B1707D-000000067F000040020000A0000000B1FA5F__00000030B82DF289-0000003157DDD551 000000067F000040020000A0000000B18000-000000067F000040020000A0000000B1C000__00000031853FEA98 000000067F000040020000A0000000B18000-000000067F000040020000A0000000B1C000__00000038E9AF7F00 000000067F000040020000A0000000B18000-000000067F000040020000A0000000B1C000__0000003903F1CFE8 000000067F000040020000A0000000B18000-000000067F000040020000A0000000B1C000__0000003B99F7F8A0 000000067F000040020000A0000000B18000-000000067F000040020000A0000000B1C000__0000005D2FFFFB38 000000067F000040020000A0000000B1C000-000000067F000040020000A0000000B20000__00000031853FEA98 000000067F000040020000A0000000B1C000-000000067F000040020000A0000000B20000__00000038E9AF7F00 000000067F000040020000A0000000B1C000-000000067F000040020000A0000000B20000__0000003903F1CFE8 000000067F000040020000A0000000B1C000-000000067F000040020000A0000000B20000__0000003B99F7F8A0 000000067F000040020000A0000000B1C000-000000067F000040020000A0000000B20000__0000005D2FFFFB38 000000067F000040020000A0000000B1FA5F-000000067F000040020000A0000000B28438__00000030B82DF289-0000003157DDD551 000000067F000040020000A0000000B20000-000000067F000040020000A0000000B24000__00000031853FEA98 000000067F000040020000A0000000B20000-000000067F000040020000A0000000B24000__00000038E9AF7F00 000000067F000040020000A0000000B20000-000000067F000040020000A0000000B24000__0000003903F1CFE8 000000067F000040020000A0000000B20000-000000067F000040020000A0000000B24000__0000003B99F7F8A0 000000067F000040020000A0000000B20000-000000067F000040020000A0000000B24000__0000005D2FFFFB38 
000000067F000040020000A0000000B24000-000000067F000040020000A0000000B28000__00000031853FEA98 000000067F000040020000A0000000B24000-000000067F000040020000A0000000B28000__00000038E9AF7F00 000000067F000040020000A0000000B24000-000000067F000040020000A0000000B28000__0000003903F1CFE8 000000067F000040020000A0000000B24000-000000067F000040020000A0000000B28000__0000003B99F7F8A0 000000067F000040020000A0000000B24000-000000067F000040020000A0000000B28000__0000005D2FFFFB38 000000067F000040020000A0000000B28000-000000067F000040020000A0000000B2C000__00000031853FEA98 000000067F000040020000A0000000B28000-000000067F000040020000A0000000B2C000__00000038E9AF7F00 000000067F000040020000A0000000B28000-000000067F000040020000A0000000B2C000__0000003903F1CFE8 000000067F000040020000A0000000B28000-000000067F000040020000A0000000B2C000__0000003B99F7F8A0 000000067F000040020000A0000000B28000-000000067F000040020000A0000000B2C000__0000005D2FFFFB38 000000067F000040020000A0000000B28438-000000067F000040020000A0000000B30E0A__00000030B82DF289-0000003157DDD551 000000067F000040020000A0000000B2C000-000000067F000040020000A0000000B30000__00000031853FEA98 000000067F000040020000A0000000B2C000-000000067F000040020000A0000000B30000__00000038E9AF7F00 000000067F000040020000A0000000B2C000-000000067F000040020000A0000000B30000__0000003903F1CFE8 000000067F000040020000A0000000B2C000-000000067F000040020000A0000000B30000__0000003B99F7F8A0 000000067F000040020000A0000000B2C000-000000067F000040020000A0000000B30000__0000005D2FFFFB38 000000067F000040020000A0000000B30000-000000067F000040020000A0000000B34000__00000031853FEA98 000000067F000040020000A0000000B30000-000000067F000040020000A0000000B34000__00000038E9AF7F00 000000067F000040020000A0000000B30000-000000067F000040020000A0000000B34000__0000003903F1CFE8 000000067F000040020000A0000000B30000-000000067F000040020000A0000000B34000__0000003B99F7F8A0 000000067F000040020000A0000000B30000-000000067F000040020000A0000000B34000__0000005D2FFFFB38 
000000067F000040020000A0000000B30E0A-000000067F000040020000A0000000B397D4__00000030B82DF289-0000003157DDD551 000000067F000040020000A0000000B34000-000000067F000040020000A0000000B38000__00000031853FEA98 000000067F000040020000A0000000B34000-000000067F000040020000A0000000B38000__00000038E9AF7F00 000000067F000040020000A0000000B34000-000000067F000040020000A0000000B38000__0000003903F1CFE8 000000067F000040020000A0000000B34000-000000067F000040020000A0000000B38000__0000003B99F7F8A0 000000067F000040020000A0000000B34000-000000067F000040020000A0000000B38000__0000005D2FFFFB38 000000067F000040020000A0000000B38000-000000067F000040020000A0000000B3C000__00000031853FEA98 000000067F000040020000A0000000B38000-000000067F000040020000A0000000B3C000__00000038E9AF7F00 000000067F000040020000A0000000B38000-000000067F000040020000A0000000B3C000__0000003903F1CFE8 000000067F000040020000A0000000B38000-000000067F000040020000A0000000B3C000__0000003B99F7F8A0 000000067F000040020000A0000000B38000-000000067F000040020000A0000000B3C000__0000005D2FFFFB38 000000067F000040020000A0000000B397D4-000000067F000040020000A0000000B421B1__00000030B82DF289-0000003157DDD551 000000067F000040020000A0000000B3C000-000000067F000040020000A0000000B40000__00000031853FEA98 000000067F000040020000A0000000B3C000-000000067F000040020000A0000000B40000__00000038E9AF7F00 000000067F000040020000A0000000B3C000-000000067F000040020000A0000000B40000__0000003903F1CFE8 000000067F000040020000A0000000B3C000-000000067F000040020000A0000000B40000__0000003B99F7F8A0 000000067F000040020000A0000000B3C000-000000067F000040020000A0000000B40000__0000005D2FFFFB38 000000067F000040020000A0000000B40000-000000067F000040020000A0000000B44000__00000031853FEA98 000000067F000040020000A0000000B40000-000000067F000040020000A0000000B44000__00000038E9AF7F00 000000067F000040020000A0000000B40000-000000067F000040020000A0000000B44000__0000003903F1CFE8 000000067F000040020000A0000000B40000-000000067F000040020000A0000000B44000__0000003B99F7F8A0 
000000067F000040020000A0000000B40000-000000067F000040020000A0000000B44000__0000005D2FFFFB38 000000067F000040020000A0000000B421B1-000000067F000040020000A0000000B4AB8F__00000030B82DF289-0000003157DDD551 000000067F000040020000A0000000B44000-000000067F000040020000A0000000B48000__00000031853FEA98 000000067F000040020000A0000000B44000-000000067F000040020000A0000000B48000__00000038E9AF7F00 000000067F000040020000A0000000B44000-000000067F000040020000A0000000B48000__0000003903F1CFE8 000000067F000040020000A0000000B44000-000000067F000040020000A0000000B48000__0000003B99F7F8A0 000000067F000040020000A0000000B44000-000000067F000040020000A0000000B48000__0000005D2FFFFB38 000000067F000040020000A0000000B48000-000000067F000040020000A0000000B4C000__00000031853FEA98 000000067F000040020000A0000000B48000-000000067F000040020000A0000000B4C000__00000038E67ABFA0 000000067F000040020000A0000000B48000-000000067F000040020000A0000000B4C000__0000003903F1CFE8 000000067F000040020000A0000000B48000-000000067F000040020000A0000000B4C000__0000003B99F7F8A0 000000067F000040020000A0000000B48000-000000067F000040020000A0000000B4C000__0000005D2FFFFB38 000000067F000040020000A0000000B4AB8F-000000067F000040020000A0000200000000__00000030B82DF289-0000003157DDD551 000000067F000040020000A0000000B4AE08-000000067F000040020000A0000000B537DE__0000003157DDD551-00000031F78DF129 000000067F000040020000A0000000B4C000-000000067F000040020000A0000000B50000__00000031853FEA98 000000067F000040020000A0000000B4C000-000000067F000040020000A0000000B50000__00000038E67ABFA0 000000067F000040020000A0000000B4C000-000000067F000040020000A0000000B50000__0000003903F1CFE8 000000067F000040020000A0000000B4C000-000000067F000040020000A0000000B50000__0000003B99F7F8A0 000000067F000040020000A0000000B4C000-000000067F000040020000A0000000B50000__0000005D2FFFFB38 000000067F000040020000A0000000B50000-000000067F000040020000A0000000B54000__00000031853FEA98 000000067F000040020000A0000000B50000-000000067F000040020000A0000000B54000__00000038E67ABFA0 
000000067F000040020000A0000000B50000-000000067F000040020000A0000000B54000__0000003903F1CFE8 000000067F000040020000A0000000B50000-000000067F000040020000A0000000B54000__0000003B99F7F8A0 000000067F000040020000A0000000B50000-000000067F000040020000A0000000B54000__0000005D2FFFFB38 000000067F000040020000A0000000B537DE-000000067F000040020000A0000000B5C1C1__0000003157DDD551-00000031F78DF129 000000067F000040020000A0000000B54000-000000067F000040020000A0000000B58000__00000031853FEA98 000000067F000040020000A0000000B54000-000000067F000040020000A0000000B58000__00000038E67ABFA0 000000067F000040020000A0000000B54000-000000067F000040020000A0000000B58000__0000003903F1CFE8 000000067F000040020000A0000000B54000-000000067F000040020000A0000000B58000__0000003B99F7F8A0 000000067F000040020000A0000000B54000-000000067F000040020000A0000000B58000__0000005D2FFFFB38 000000067F000040020000A0000000B58000-000000067F000040020000A0000000B5C000__00000031853FEA98 000000067F000040020000A0000000B58000-000000067F000040020000A0000000B5C000__00000038E67ABFA0 000000067F000040020000A0000000B58000-000000067F000040020000A0000000B5C000__0000003903F1CFE8 000000067F000040020000A0000000B58000-000000067F000040020000A0000000B5C000__0000003B99F7F8A0 000000067F000040020000A0000000B58000-000000067F000040020000A0000000B5C000__0000005D2FFFFB38 000000067F000040020000A0000000B5C000-000000067F000040020000A0000000B60000__00000031853FEA98 000000067F000040020000A0000000B5C000-000000067F000040020000A0000000B60000__00000038E67ABFA0 000000067F000040020000A0000000B5C000-000000067F000040020000A0000000B60000__0000003903F1CFE8 000000067F000040020000A0000000B5C000-000000067F000040020000A0000000B60000__0000003B99F7F8A0 000000067F000040020000A0000000B5C000-000000067F000040020000A0000000B60000__0000005D2FFFFB38 000000067F000040020000A0000000B5C1C1-000000067F000040020000A0000000B64BA0__0000003157DDD551-00000031F78DF129 000000067F000040020000A0000000B60000-000000067F000040020000A0000000B635E2__00000031853FEA98 
000000067F000040020000A0000000B60000-000000067F000040020000A0000000B64000__00000031C7B24AB0 000000067F000040020000A0000000B60000-000000067F000040020000A0000000B64000__00000038E67ABFA0 000000067F000040020000A0000000B60000-000000067F000040020000A0000000B64000__0000003903F1CFE8 000000067F000040020000A0000000B60000-000000067F000040020000A0000000B64000__0000003B99F7F8A0 000000067F000040020000A0000000B60000-000000067F000040020000A0000000B64000__0000005D2FFFFB38 000000067F000040020000A0000000B64000-000000067F000040020000A0000000B68000__00000031C7B24AB0 000000067F000040020000A0000000B64000-000000067F000040020000A0000000B68000__00000038E67ABFA0 000000067F000040020000A0000000B64000-000000067F000040020000A0000000B68000__0000003903F1CFE8 000000067F000040020000A0000000B64000-000000067F000040020000A0000000B68000__0000003B99F7F8A0 000000067F000040020000A0000000B64000-000000067F000040020000A0000000B68000__0000005D2FFFFB38 000000067F000040020000A0000000B64BA0-000000067F000040020000A0000000B6D57C__0000003157DDD551-00000031F78DF129 000000067F000040020000A0000000B68000-000000067F000040020000A0000000B6C000__00000031C7B24AB0 000000067F000040020000A0000000B68000-000000067F000040020000A0000000B6C000__00000038E67ABFA0 000000067F000040020000A0000000B68000-000000067F000040020000A0000000B6C000__0000003903F1CFE8 000000067F000040020000A0000000B68000-000000067F000040020000A0000000B6C000__0000003B99F7F8A0 000000067F000040020000A0000000B68000-000000067F000040020000A0000000B6C000__0000005D2FFFFB38 000000067F000040020000A0000000B6C000-000000067F000040020000A0000000B70000__00000031C7B24AB0 000000067F000040020000A0000000B6C000-000000067F000040020000A0000000B70000__00000038E67ABFA0 000000067F000040020000A0000000B6C000-000000067F000040020000A0000000B70000__0000003903F1CFE8 000000067F000040020000A0000000B6C000-000000067F000040020000A0000000B70000__0000003B99F7F8A0 000000067F000040020000A0000000B6C000-000000067F000040020000A0000000B70000__0000005D2FFFFB38 
000000067F000040020000A0000000B6D57C-000000067F000040020000A0000000B75F57__0000003157DDD551-00000031F78DF129 000000067F000040020000A0000000B70000-000000067F000040020000A0000000B74000__00000031C7B24AB0 000000067F000040020000A0000000B70000-000000067F000040020000A0000000B74000__00000038E67ABFA0 000000067F000040020000A0000000B70000-000000067F000040020000A0000000B74000__0000003903F1CFE8 000000067F000040020000A0000000B70000-000000067F000040020000A0000000B74000__0000003B99F7F8A0 000000067F000040020000A0000000B70000-000000067F000040020000A0000000B74000__0000005D2FFFFB38 000000067F000040020000A0000000B74000-000000067F000040020000A0000000B78000__00000031C7B24AB0 000000067F000040020000A0000000B74000-000000067F000040020000A0000000B78000__00000038E67ABFA0 000000067F000040020000A0000000B74000-000000067F000040020000A0000000B78000__0000003903F1CFE8 000000067F000040020000A0000000B74000-000000067F000040020000A0000000B78000__0000003B99F7F8A0 000000067F000040020000A0000000B74000-000000067F000040020000A0000000B78000__0000005D2FFFFB38 000000067F000040020000A0000000B75F57-000000067F000040020000A0000000B7E928__0000003157DDD551-00000031F78DF129 000000067F000040020000A0000000B78000-000000067F000040020000A0000000B7C000__00000031C7B24AB0 000000067F000040020000A0000000B78000-000000067F000040020000A0000000B7C000__00000038E67ABFA0 000000067F000040020000A0000000B78000-000000067F000040020000A0000000B7C000__0000003903F1CFE8 000000067F000040020000A0000000B78000-000000067F000040020000A0000000B7C000__0000003B99F7F8A0 000000067F000040020000A0000000B78000-000000067F000040020000A0000000B7C000__0000005D2FFFFB38 000000067F000040020000A0000000B7C000-000000067F000040020000A0000000B80000__00000031C7B24AB0 000000067F000040020000A0000000B7C000-000000067F000040020000A0000000B80000__00000038E67ABFA0 000000067F000040020000A0000000B7C000-000000067F000040020000A0000000B80000__0000003903F1CFE8 000000067F000040020000A0000000B7C000-000000067F000040020000A0000000B80000__0000003B99F7F8A0 
000000067F000040020000A0000000B7C000-000000067F000040020000A0000000B80000__0000005D2FFFFB38 000000067F000040020000A0000000B7E928-000000067F000040020000A0000000B872FE__0000003157DDD551-00000031F78DF129 000000067F000040020000A0000000B80000-000000067F000040020000A0000000B84000__00000031C7B24AB0 000000067F000040020000A0000000B80000-000000067F000040020000A0000000B84000__00000038E67ABFA0 000000067F000040020000A0000000B80000-000000067F000040020000A0000000B84000__0000003903F1CFE8 000000067F000040020000A0000000B80000-000000067F000040020000A0000000B84000__0000003B99F7F8A0 000000067F000040020000A0000000B80000-000000067F000040020000A0000000B84000__0000005D2FFFFB38 000000067F000040020000A0000000B84000-000000067F000040020000A0000000B87482__00000031C7B24AB0 000000067F000040020000A0000000B84000-000000067F000040020000A0000000B88000__00000031EA7FFF60 000000067F000040020000A0000000B84000-000000067F000040020000A0000000B88000__00000038E67ABFA0 000000067F000040020000A0000000B84000-000000067F000040020000A0000000B88000__0000003903F1CFE8 000000067F000040020000A0000000B84000-000000067F000040020000A0000000B88000__0000003B99F7F8A0 000000067F000040020000A0000000B84000-000000067F000040020000A0000000B88000__0000005D2FFFFB38 000000067F000040020000A0000000B872FE-000000067F000040020000A0000000B8FCED__0000003157DDD551-00000031F78DF129 000000067F000040020000A0000000B88000-000000067F000040020000A0000000B8C000__00000031EA7FFF60 000000067F000040020000A0000000B88000-000000067F000040020000A0000000B8C000__00000038E67ABFA0 000000067F000040020000A0000000B88000-000000067F000040020000A0000000B8C000__0000003903F1CFE8 000000067F000040020000A0000000B88000-000000067F000040020000A0000000B8C000__0000003B99F7F8A0 000000067F000040020000A0000000B88000-000000067F000040020000A0000000B8C000__0000005D2FFFFB38 000000067F000040020000A0000000B8C000-000000067F000040020000A0000000B90000__00000031EA7FFF60 000000067F000040020000A0000000B8C000-000000067F000040020000A0000000B90000__00000038E67ABFA0 
000000067F000040020000A0000000B8C000-000000067F000040020000A0000000B90000__0000003903F1CFE8 000000067F000040020000A0000000B8C000-000000067F000040020000A0000000B90000__0000003B99F7F8A0 000000067F000040020000A0000000B8C000-000000067F000040020000A0000000B90000__0000005D2FFFFB38 000000067F000040020000A0000000B8FCED-000000067F000040020000A0000000B986CE__0000003157DDD551-00000031F78DF129 000000067F000040020000A0000000B90000-000000067F000040020000A0000000B94000__00000031EA7FFF60 000000067F000040020000A0000000B90000-000000067F000040020000A0000000B94000__00000038E67ABFA0 000000067F000040020000A0000000B90000-000000067F000040020000A0000000B94000__0000003903F1CFE8 000000067F000040020000A0000000B90000-000000067F000040020000A0000000B94000__0000003B99F7F8A0 000000067F000040020000A0000000B90000-000000067F000040020000A0000000B94000__0000005D2FFFFB38 000000067F000040020000A0000000B94000-000000067F000040020000A0000000B98000__00000031EA7FFF60 000000067F000040020000A0000000B94000-000000067F000040020000A0000000B98000__00000038E67ABFA0 000000067F000040020000A0000000B94000-000000067F000040020000A0000000B98000__0000003903F1CFE8 000000067F000040020000A0000000B94000-000000067F000040020000A0000000B98000__0000003B99F7F8A0 000000067F000040020000A0000000B94000-000000067F000040020000A0000000B98000__0000005D2FFFFB38 000000067F000040020000A0000000B98000-000000067F000040020000A0000000B9C000__00000038E67ABFA0 000000067F000040020000A0000000B98000-000000067F000040020000A0000000B9C000__0000003903F1CFE8 000000067F000040020000A0000000B98000-000000067F000040020000A0000000B9C000__0000003B99F7F8A0 000000067F000040020000A0000000B98000-000000067F000040020000A0000000B9C000__0000005D2FFFFB38 000000067F000040020000A0000000B98000-030000000000000000000000000000000002__00000031EA7FFF60 000000067F000040020000A0000000B986CE-000000067F000040020000A0000000BA10BA__0000003157DDD551-00000031F78DF129 000000067F000040020000A0000000B9C000-000000067F000040020000A0000000BA0000__00000038E67ABFA0 
000000067F000040020000A0000000B9C000-000000067F000040020000A0000000BA0000__0000003903F1CFE8 000000067F000040020000A0000000B9C000-000000067F000040020000A0000000BA0000__0000003B99F7F8A0 000000067F000040020000A0000000B9C000-000000067F000040020000A0000000BA0000__0000005D2FFFFB38 000000067F000040020000A0000000BA0000-000000067F000040020000A0000000BA4000__00000038E1ABFE28 000000067F000040020000A0000000BA0000-000000067F000040020000A0000000BA4000__00000038E9AF7F00 000000067F000040020000A0000000BA0000-000000067F000040020000A0000000BA4000__0000003903F1CFE8 000000067F000040020000A0000000BA0000-000000067F000040020000A0000000BA4000__0000003B99F7F8A0 000000067F000040020000A0000000BA0000-000000067F000040020000A0000000BA4000__0000005D2FFFFB38 000000067F000040020000A0000000BA10BA-000000067F000040020000A0000200000000__0000003157DDD551-00000031F78DF129 000000067F000040020000A0000000BA1288-000000067F000040020000A0000000BA9C74__00000031F78DF129-00000032973DEDD1 000000067F000040020000A0000000BA4000-000000067F000040020000A0000000BA8000__00000038E1ABFE28 000000067F000040020000A0000000BA4000-000000067F000040020000A0000000BA8000__00000038E9AF7F00 000000067F000040020000A0000000BA4000-000000067F000040020000A0000000BA8000__0000003903F1CFE8 000000067F000040020000A0000000BA4000-000000067F000040020000A0000000BA8000__0000003B99F7F8A0 000000067F000040020000A0000000BA4000-000000067F000040020000A0000000BA8000__0000005D2FFFFB38 000000067F000040020000A0000000BA8000-000000067F000040020000A0000000BAC000__00000038E1ABFE28 000000067F000040020000A0000000BA8000-000000067F000040020000A0000000BAC000__00000038E9AF7F00 000000067F000040020000A0000000BA8000-000000067F000040020000A0000000BAC000__0000003903F1CFE8 000000067F000040020000A0000000BA8000-000000067F000040020000A0000000BAC000__0000003B99F7F8A0 000000067F000040020000A0000000BA8000-000000067F000040020000A0000000BAC000__0000005D2FFFFB38 000000067F000040020000A0000000BA9C74-000000067F000040020000A0000000BB264F__00000031F78DF129-00000032973DEDD1 
000000067F000040020000A0000000BAC000-000000067F000040020000A0000000BB0000__00000038E1ABFE28 000000067F000040020000A0000000BAC000-000000067F000040020000A0000000BB0000__00000038E9AF7F00 000000067F000040020000A0000000BAC000-000000067F000040020000A0000000BB0000__0000003903F1CFE8 000000067F000040020000A0000000BAC000-000000067F000040020000A0000000BB0000__0000003B99F7F8A0 000000067F000040020000A0000000BAC000-000000067F000040020000A0000000BB0000__0000005D2FFFFB38 000000067F000040020000A0000000BB0000-000000067F000040020000A0000000BB4000__00000038E1ABFE28 000000067F000040020000A0000000BB0000-000000067F000040020000A0000000BB4000__00000038E9AF7F00 000000067F000040020000A0000000BB0000-000000067F000040020000A0000000BB4000__0000003903F1CFE8 000000067F000040020000A0000000BB0000-000000067F000040020000A0000000BB4000__0000003B99F7F8A0 000000067F000040020000A0000000BB0000-000000067F000040020000A0000000BB4000__0000005D2FFFFB38 000000067F000040020000A0000000BB264F-000000067F000040020000A0000000BBB01F__00000031F78DF129-00000032973DEDD1 000000067F000040020000A0000000BB4000-000000067F000040020000A0000000BB8000__00000038E1ABFE28 000000067F000040020000A0000000BB4000-000000067F000040020000A0000000BB8000__00000038E9AF7F00 000000067F000040020000A0000000BB4000-000000067F000040020000A0000000BB8000__0000003903F1CFE8 000000067F000040020000A0000000BB4000-000000067F000040020000A0000000BB8000__0000003B99F7F8A0 000000067F000040020000A0000000BB4000-000000067F000040020000A0000000BB8000__0000005D2FFFFB38 000000067F000040020000A0000000BB8000-000000067F000040020000A0000000BBC000__00000038E1ABFE28 000000067F000040020000A0000000BB8000-000000067F000040020000A0000000BBC000__00000038E9AF7F00 000000067F000040020000A0000000BB8000-000000067F000040020000A0000000BBC000__0000003903F1CFE8 000000067F000040020000A0000000BB8000-000000067F000040020000A0000000BBC000__0000003B99F7F8A0 000000067F000040020000A0000000BB8000-000000067F000040020000A0000000BBC000__0000005D2FFFFB38 
000000067F000040020000A0000000BBB01F-000000067F000040020000A0000000BC39F4__00000031F78DF129-00000032973DEDD1 000000067F000040020000A0000000BBC000-000000067F000040020000A0000000BC0000__00000038E1ABFE28 000000067F000040020000A0000000BBC000-000000067F000040020000A0000000BC0000__00000038E9AF7F00 000000067F000040020000A0000000BBC000-000000067F000040020000A0000000BC0000__0000003903F1CFE8 000000067F000040020000A0000000BBC000-000000067F000040020000A0000000BC0000__0000003B99F7F8A0 000000067F000040020000A0000000BBC000-000000067F000040020000A0000000BC0000__0000005D2FFFFB38 000000067F000040020000A0000000BC0000-000000067F000040020000A0000000BC4000__00000038E1ABFE28 000000067F000040020000A0000000BC0000-000000067F000040020000A0000000BC4000__00000038E9AF7F00 000000067F000040020000A0000000BC0000-000000067F000040020000A0000000BC4000__0000003903F1CFE8 000000067F000040020000A0000000BC0000-000000067F000040020000A0000000BC4000__0000003B99F7F8A0 000000067F000040020000A0000000BC0000-000000067F000040020000A0000000BC4000__0000005D2FFFFB38 000000067F000040020000A0000000BC39F4-000000067F000040020000A0000000BCC3D7__00000031F78DF129-00000032973DEDD1 000000067F000040020000A0000000BC4000-000000067F000040020000A0000000BC8000__00000038E1ABFE28 000000067F000040020000A0000000BC4000-000000067F000040020000A0000000BC8000__00000038E9AF7F00 000000067F000040020000A0000000BC4000-000000067F000040020000A0000000BC8000__0000003903F1CFE8 000000067F000040020000A0000000BC4000-000000067F000040020000A0000000BC8000__0000003B99F7F8A0 000000067F000040020000A0000000BC4000-000000067F000040020000A0000000BC8000__0000005D2FFFFB38 000000067F000040020000A0000000BC8000-000000067F000040020000A0000000BCC000__00000038E1ABFE28 000000067F000040020000A0000000BC8000-000000067F000040020000A0000000BCC000__00000038E9AF7F00 000000067F000040020000A0000000BC8000-000000067F000040020000A0000000BCC000__0000003903F1CFE8 000000067F000040020000A0000000BC8000-000000067F000040020000A0000000BCC000__0000003B99F7F8A0 
000000067F000040020000A0000000BC8000-000000067F000040020000A0000000BCC000__0000005D2FFFFB38 000000067F000040020000A0000000BCC000-000000067F000040020000A0000000BD0000__00000038E1ABFE28 000000067F000040020000A0000000BCC000-000000067F000040020000A0000000BD0000__00000038E9AF7F00 000000067F000040020000A0000000BCC000-000000067F000040020000A0000000BD0000__0000003903F1CFE8 000000067F000040020000A0000000BCC000-000000067F000040020000A0000000BD0000__0000003B99F7F8A0 000000067F000040020000A0000000BCC000-000000067F000040020000A0000000BD0000__0000005D2FFFFB38 000000067F000040020000A0000000BCC3D7-000000067F000040020000A0000000BD4DC4__00000031F78DF129-00000032973DEDD1 000000067F000040020000A0000000BD0000-000000067F000040020000A0000000BD4000__00000038E1ABFE28 000000067F000040020000A0000000BD0000-000000067F000040020000A0000000BD4000__00000038E9AF7F00 000000067F000040020000A0000000BD0000-000000067F000040020000A0000000BD4000__0000003903F1CFE8 000000067F000040020000A0000000BD0000-000000067F000040020000A0000000BD4000__0000003B99F7F8A0 000000067F000040020000A0000000BD0000-000000067F000040020000A0000000BD4000__0000005D2FFFFB38 000000067F000040020000A0000000BD4000-000000067F000040020000A0000000BD8000__00000038E1ABFE28 000000067F000040020000A0000000BD4000-000000067F000040020000A0000000BD8000__00000038E9AF7F00 000000067F000040020000A0000000BD4000-000000067F000040020000A0000000BD8000__0000003903F1CFE8 000000067F000040020000A0000000BD4000-000000067F000040020000A0000000BD8000__0000003B99F7F8A0 000000067F000040020000A0000000BD4000-000000067F000040020000A0000000BD8000__0000005D2FFFFB38 000000067F000040020000A0000000BD4DC4-000000067F000040020000A0000000BDD7AA__00000031F78DF129-00000032973DEDD1 000000067F000040020000A0000000BD8000-000000067F000040020000A0000000BDC000__00000038E1ABFE28 000000067F000040020000A0000000BD8000-000000067F000040020000A0000000BDC000__00000038E9AF7F00 000000067F000040020000A0000000BD8000-000000067F000040020000A0000000BDC000__0000003903F1CFE8 
000000067F000040020000A0000000BD8000-000000067F000040020000A0000000BDC000__0000003B99F7F8A0 000000067F000040020000A0000000BD8000-000000067F000040020000A0000000BDC000__0000005D2FFFFB38 000000067F000040020000A0000000BDC000-000000067F000040020000A0000000BE0000__00000038E1ABFE28 000000067F000040020000A0000000BDC000-000000067F000040020000A0000000BE0000__00000038E9AF7F00 000000067F000040020000A0000000BDC000-000000067F000040020000A0000000BE0000__0000003903F1CFE8 000000067F000040020000A0000000BDC000-000000067F000040020000A0000000BE0000__0000003B99F7F8A0 000000067F000040020000A0000000BDC000-000000067F000040020000A0000000BE0000__0000005D2FFFFB38 000000067F000040020000A0000000BDD7AA-000000067F000040020000A0000000BE6184__00000031F78DF129-00000032973DEDD1 000000067F000040020000A0000000BE0000-000000067F000040020000A0000000BE4000__00000038E1ABFE28 000000067F000040020000A0000000BE0000-000000067F000040020000A0000000BE4000__00000038E9AF7F00 000000067F000040020000A0000000BE0000-000000067F000040020000A0000000BE4000__0000003903F1CFE8 000000067F000040020000A0000000BE0000-000000067F000040020000A0000000BE4000__0000003B99F7F8A0 000000067F000040020000A0000000BE0000-000000067F000040020000A0000000BE4000__0000005D2FFFFB38 000000067F000040020000A0000000BE4000-000000067F000040020000A0000000BE8000__00000038E1ABFE28 000000067F000040020000A0000000BE4000-000000067F000040020000A0000000BE8000__00000038E9AF7F00 000000067F000040020000A0000000BE4000-000000067F000040020000A0000000BE8000__0000003903F1CFE8 000000067F000040020000A0000000BE4000-000000067F000040020000A0000000BE8000__0000003B99F7F8A0 000000067F000040020000A0000000BE4000-000000067F000040020000A0000000BE8000__0000005D2FFFFB38 000000067F000040020000A0000000BE6184-000000067F000040020000A0000000BEEB65__00000031F78DF129-00000032973DEDD1 000000067F000040020000A0000000BE8000-000000067F000040020000A0000000BEC000__00000038E1ABFE28 000000067F000040020000A0000000BE8000-000000067F000040020000A0000000BEC000__00000038E9AF7F00 
000000067F000040020000A0000000BE8000-000000067F000040020000A0000000BEC000__0000003903F1CFE8 000000067F000040020000A0000000BE8000-000000067F000040020000A0000000BEC000__0000003B99F7F8A0 000000067F000040020000A0000000BE8000-000000067F000040020000A0000000BEC000__0000005D2FFFFB38 000000067F000040020000A0000000BEC000-000000067F000040020000A0000000BF0000__00000038E1ABFE28 000000067F000040020000A0000000BEC000-000000067F000040020000A0000000BF0000__00000038E9AF7F00 000000067F000040020000A0000000BEC000-000000067F000040020000A0000000BF0000__0000003903F1CFE8 000000067F000040020000A0000000BEC000-000000067F000040020000A0000000BF0000__0000003B99F7F8A0 000000067F000040020000A0000000BEC000-000000067F000040020000A0000000BF0000__0000005D2FFFFB38 000000067F000040020000A0000000BEEB65-000000067F000040020000A0000000BF7534__00000031F78DF129-00000032973DEDD1 000000067F000040020000A0000000BF0000-000000067F000040020000A0000000BF4000__00000038E1ABFE28 000000067F000040020000A0000000BF0000-000000067F000040020000A0000000BF4000__00000038E9AF7F00 000000067F000040020000A0000000BF0000-000000067F000040020000A0000000BF4000__0000003903F1CFE8 000000067F000040020000A0000000BF0000-000000067F000040020000A0000000BF4000__0000003B99F7F8A0 000000067F000040020000A0000000BF0000-000000067F000040020000A0000000BF4000__0000005D2FFFFB38 000000067F000040020000A0000000BF4000-000000067F000040020000A0000000BF8000__00000033605476A8 000000067F000040020000A0000000BF4000-000000067F000040020000A0000000BF8000__00000038E9AF7F00 000000067F000040020000A0000000BF4000-000000067F000040020000A0000000BF8000__0000003903F1CFE8 000000067F000040020000A0000000BF4000-000000067F000040020000A0000000BF8000__0000003B99F7F8A0 000000067F000040020000A0000000BF4000-000000067F000040020000A0000000BF8000__0000005D2FFFFB38 000000067F000040020000A0000000BF7534-000000067F000040020000A0000200000000__00000031F78DF129-00000032973DEDD1 000000067F000040020000A0000000BF778F-000000067F000040020000A0000000C00165__00000032973DEDD1-0000003336EBF989 
000000067F000040020000A0000000BF8000-000000067F000040020000A0000000BFC000__00000033605476A8 000000067F000040020000A0000000BF8000-000000067F000040020000A0000000BFC000__00000038E9AF7F00 000000067F000040020000A0000000BF8000-000000067F000040020000A0000000BFC000__0000003903F1CFE8 000000067F000040020000A0000000BF8000-000000067F000040020000A0000000BFC000__0000003B99F7F8A0 000000067F000040020000A0000000BF8000-000000067F000040020000A0000000BFC000__0000005D2FFFFB38 000000067F000040020000A0000000BFC000-000000067F000040020000A0000000C00000__00000033605476A8 000000067F000040020000A0000000BFC000-000000067F000040020000A0000000C00000__00000038E9AF7F00 000000067F000040020000A0000000BFC000-000000067F000040020000A0000000C00000__0000003903F1CFE8 000000067F000040020000A0000000BFC000-000000067F000040020000A0000000C00000__0000003B99F7F8A0 000000067F000040020000A0000000BFC000-000000067F000040020000A0000000C00000__0000005D2FFFFB38 000000067F000040020000A0000000C00000-000000067F000040020000A0000000C04000__00000033605476A8 000000067F000040020000A0000000C00000-000000067F000040020000A0000000C04000__00000038E9AF7F00 000000067F000040020000A0000000C00000-000000067F000040020000A0000000C04000__0000003903F1CFE8 000000067F000040020000A0000000C00000-000000067F000040020000A0000000C04000__0000003B99F7F8A0 000000067F000040020000A0000000C00000-000000067F000040020000A0000000C04000__0000005D2FFFFB38 000000067F000040020000A0000000C00165-000000067F000040020000A0000000C08B3A__00000032973DEDD1-0000003336EBF989 000000067F000040020000A0000000C04000-000000067F000040020000A0000000C08000__00000033605476A8 000000067F000040020000A0000000C04000-000000067F000040020000A0000000C08000__00000038E9AF7F00 000000067F000040020000A0000000C04000-000000067F000040020000A0000000C08000__0000003903F1CFE8 000000067F000040020000A0000000C04000-000000067F000040020000A0000000C08000__0000003B99F7F8A0 000000067F000040020000A0000000C04000-000000067F000040020000A0000000C08000__0000005D2FFFFB38 
000000067F000040020000A0000000C08000-000000067F000040020000A0000000C0C000__00000033605476A8 000000067F000040020000A0000000C08000-000000067F000040020000A0000000C0C000__00000038E9AF7F00 000000067F000040020000A0000000C08000-000000067F000040020000A0000000C0C000__0000003903F1CFE8 000000067F000040020000A0000000C08000-000000067F000040020000A0000000C0C000__0000003B99F7F8A0 000000067F000040020000A0000000C08000-000000067F000040020000A0000000C0C000__0000005D2FFFFB38 000000067F000040020000A0000000C08B3A-000000067F000040020000A0000000C1151B__00000032973DEDD1-0000003336EBF989 000000067F000040020000A0000000C0C000-000000067F000040020000A0000000C10000__00000033605476A8 000000067F000040020000A0000000C0C000-000000067F000040020000A0000000C10000__00000038E9AF7F00 000000067F000040020000A0000000C0C000-000000067F000040020000A0000000C10000__0000003903F1CFE8 000000067F000040020000A0000000C0C000-000000067F000040020000A0000000C10000__0000003B99F7F8A0 000000067F000040020000A0000000C0C000-000000067F000040020000A0000000C10000__0000005D2FFFFB38 000000067F000040020000A0000000C10000-000000067F000040020000A0000000C14000__00000033605476A8 000000067F000040020000A0000000C10000-000000067F000040020000A0000000C14000__00000038E9AF7F00 000000067F000040020000A0000000C10000-000000067F000040020000A0000000C14000__0000003903F1CFE8 000000067F000040020000A0000000C10000-000000067F000040020000A0000000C14000__0000003B99F7F8A0 000000067F000040020000A0000000C10000-000000067F000040020000A0000000C14000__0000005D2FFFFB38 000000067F000040020000A0000000C1151B-000000067F000040020000A0000000C19EF6__00000032973DEDD1-0000003336EBF989 000000067F000040020000A0000000C14000-000000067F000040020000A0000000C18000__00000033605476A8 000000067F000040020000A0000000C14000-000000067F000040020000A0000000C18000__00000038E9AF7F00 000000067F000040020000A0000000C14000-000000067F000040020000A0000000C18000__0000003903F1CFE8 000000067F000040020000A0000000C14000-000000067F000040020000A0000000C18000__0000003B99F7F8A0 
000000067F000040020000A0000000C14000-000000067F000040020000A0000000C18000__0000005D2FFFFB38 000000067F000040020000A0000000C18000-000000067F000040020000A0000000C1C000__00000033605476A8 000000067F000040020000A0000000C18000-000000067F000040020000A0000000C1C000__00000038E9AF7F00 000000067F000040020000A0000000C18000-000000067F000040020000A0000000C1C000__0000003903F1CFE8 000000067F000040020000A0000000C18000-000000067F000040020000A0000000C1C000__0000003B99F7F8A0 000000067F000040020000A0000000C18000-000000067F000040020000A0000000C1C000__0000005D2FFFFB38 000000067F000040020000A0000000C19EF6-000000067F000040020000A0000000C228E6__00000032973DEDD1-0000003336EBF989 000000067F000040020000A0000000C1C000-000000067F000040020000A0000000C20000__00000033605476A8 000000067F000040020000A0000000C1C000-000000067F000040020000A0000000C20000__00000038E9AF7F00 000000067F000040020000A0000000C1C000-000000067F000040020000A0000000C20000__0000003903F1CFE8 000000067F000040020000A0000000C1C000-000000067F000040020000A0000000C20000__0000003B99F7F8A0 000000067F000040020000A0000000C1C000-000000067F000040020000A0000000C20000__0000005D2FFFFB38 000000067F000040020000A0000000C20000-000000067F000040020000A0000000C24000__00000033605476A8 000000067F000040020000A0000000C20000-000000067F000040020000A0000000C24000__00000038E9AF7F00 000000067F000040020000A0000000C20000-000000067F000040020000A0000000C24000__0000003903F1CFE8 000000067F000040020000A0000000C20000-000000067F000040020000A0000000C24000__0000003B99F7F8A0 000000067F000040020000A0000000C20000-000000067F000040020000A0000000C24000__0000005D2FFFFB38 000000067F000040020000A0000000C228E6-000000067F000040020000A0000000C2B2C5__00000032973DEDD1-0000003336EBF989 000000067F000040020000A0000000C24000-000000067F000040020000A0000000C28000__00000033605476A8 000000067F000040020000A0000000C24000-000000067F000040020000A0000000C28000__00000038E9AF7F00 000000067F000040020000A0000000C24000-000000067F000040020000A0000000C28000__0000003903F1CFE8 
000000067F000040020000A0000000C24000-000000067F000040020000A0000000C28000__0000003B99F7F8A0 000000067F000040020000A0000000C24000-000000067F000040020000A0000000C28000__0000005D2FFFFB38 000000067F000040020000A0000000C28000-000000067F000040020000A0000000C2C000__00000033605476A8 000000067F000040020000A0000000C28000-000000067F000040020000A0000000C2C000__00000038E9AF7F00 000000067F000040020000A0000000C28000-000000067F000040020000A0000000C2C000__0000003903F1CFE8 000000067F000040020000A0000000C28000-000000067F000040020000A0000000C2C000__0000003B99F7F8A0 000000067F000040020000A0000000C28000-000000067F000040020000A0000000C2C000__0000005D2FFFFB38 000000067F000040020000A0000000C2B2C5-000000067F000040020000A0000000C33C9C__00000032973DEDD1-0000003336EBF989 000000067F000040020000A0000000C2C000-000000067F000040020000A0000000C30000__00000033605476A8 000000067F000040020000A0000000C2C000-000000067F000040020000A0000000C30000__00000038E9AF7F00 000000067F000040020000A0000000C2C000-000000067F000040020000A0000000C30000__0000003903F1CFE8 000000067F000040020000A0000000C2C000-000000067F000040020000A0000000C30000__0000003B99F7F8A0 000000067F000040020000A0000000C2C000-000000067F000040020000A0000000C30000__0000005D2FFFFB38 000000067F000040020000A0000000C30000-000000067F000040020000A0000000C34000__00000033605476A8 000000067F000040020000A0000000C30000-000000067F000040020000A0000000C34000__00000038E9AF7F00 000000067F000040020000A0000000C30000-000000067F000040020000A0000000C34000__0000003903F1CFE8 000000067F000040020000A0000000C30000-000000067F000040020000A0000000C34000__0000003B99F7F8A0 000000067F000040020000A0000000C30000-000000067F000040020000A0000000C34000__0000005D2FFFFB38 000000067F000040020000A0000000C33C9C-000000067F000040020000A0000000C3C66D__00000032973DEDD1-0000003336EBF989 000000067F000040020000A0000000C34000-000000067F000040020000A0000000C38000__00000033605476A8 000000067F000040020000A0000000C34000-000000067F000040020000A0000000C38000__00000038E9AF7F00 
000000067F000040020000A0000000C34000-000000067F000040020000A0000000C38000__0000003903F1CFE8 000000067F000040020000A0000000C34000-000000067F000040020000A0000000C38000__0000003B99F7F8A0 000000067F000040020000A0000000C34000-000000067F000040020000A0000000C38000__0000005D2FFFFB38 000000067F000040020000A0000000C38000-000000067F000040020000A0000000C3C000__00000033605476A8 000000067F000040020000A0000000C38000-000000067F000040020000A0000000C3C000__00000038E9AF7F00 000000067F000040020000A0000000C38000-000000067F000040020000A0000000C3C000__0000003903F1CFE8 000000067F000040020000A0000000C38000-000000067F000040020000A0000000C3C000__0000003B99F7F8A0 000000067F000040020000A0000000C38000-000000067F000040020000A0000000C3C000__0000005D2FFFFB38 000000067F000040020000A0000000C3C000-000000067F000040020000A0000000C40000__00000033605476A8 000000067F000040020000A0000000C3C000-000000067F000040020000A0000000C40000__00000038E9AF7F00 000000067F000040020000A0000000C3C000-000000067F000040020000A0000000C40000__0000003903F1CFE8 000000067F000040020000A0000000C3C000-000000067F000040020000A0000000C40000__0000003B99F7F8A0 000000067F000040020000A0000000C3C000-000000067F000040020000A0000000C40000__0000005D2FFFFB38 000000067F000040020000A0000000C3C66D-000000067F000040020000A0000000C45033__00000032973DEDD1-0000003336EBF989 000000067F000040020000A0000000C40000-000000067F000040020000A0000000C44000__00000033605476A8 000000067F000040020000A0000000C40000-000000067F000040020000A0000000C44000__00000038E9AF7F00 000000067F000040020000A0000000C40000-000000067F000040020000A0000000C44000__0000003903F1CFE8 000000067F000040020000A0000000C40000-000000067F000040020000A0000000C44000__0000003B99F7F8A0 000000067F000040020000A0000000C40000-000000067F000040020000A0000000C44000__0000005D2FFFFB38 000000067F000040020000A0000000C44000-000000067F000040020000A0000000C48000__00000033605476A8 000000067F000040020000A0000000C44000-000000067F000040020000A0000000C48000__00000038E9AF7F00 
000000067F000040020000A0000000C44000-000000067F000040020000A0000000C48000__0000003903F1CFE8 000000067F000040020000A0000000C44000-000000067F000040020000A0000000C48000__0000003B99F7F8A0 000000067F000040020000A0000000C44000-000000067F000040020000A0000000C48000__0000005D2FFFFB38 000000067F000040020000A0000000C45033-000000067F000040020000A0000000C4DA13__00000032973DEDD1-0000003336EBF989 000000067F000040020000A0000000C48000-000000067F000040020000A0000000C4C000__00000033605476A8 000000067F000040020000A0000000C48000-000000067F000040020000A0000000C4C000__00000038E9AF7F00 000000067F000040020000A0000000C48000-000000067F000040020000A0000000C4C000__0000003903F1CFE8 000000067F000040020000A0000000C48000-000000067F000040020000A0000000C4C000__0000003B99F7F8A0 000000067F000040020000A0000000C48000-000000067F000040020000A0000000C4C000__0000005D2FFFFB38 000000067F000040020000A0000000C4C000-000000067F000040020000A0000000C50000__00000033605476A8 000000067F000040020000A0000000C4C000-000000067F000040020000A0000000C50000__00000038E67ABFA0 000000067F000040020000A0000000C4C000-000000067F000040020000A0000000C50000__0000003903F1CFE8 000000067F000040020000A0000000C4C000-000000067F000040020000A0000000C50000__0000003B99F7F8A0 000000067F000040020000A0000000C4C000-000000067F000040020000A0000000C50000__0000005D2FFFFB38 000000067F000040020000A0000000C4DA13-000000067F000040020000A0000200000000__00000032973DEDD1-0000003336EBF989 000000067F000040020000A0000000C4DBC3-000000067F000040020000A0000000C565B4__0000003336EBF989-00000033D69BE889 000000067F000040020000A0000000C50000-000000067F000040020000A0000000C54000__00000033605476A8 000000067F000040020000A0000000C50000-000000067F000040020000A0000000C54000__00000038E67ABFA0 000000067F000040020000A0000000C50000-000000067F000040020000A0000000C54000__0000003903F1CFE8 000000067F000040020000A0000000C50000-000000067F000040020000A0000000C54000__0000003B99F7F8A0 000000067F000040020000A0000000C50000-000000067F000040020000A0000000C54000__0000005D2FFFFB38 
000000067F000040020000A0000000C54000-000000067F000040020000A0000000C58000__00000033605476A8 000000067F000040020000A0000000C54000-000000067F000040020000A0000000C58000__00000038E67ABFA0 000000067F000040020000A0000000C54000-000000067F000040020000A0000000C58000__0000003903F1CFE8 000000067F000040020000A0000000C54000-000000067F000040020000A0000000C58000__0000003B99F7F8A0 000000067F000040020000A0000000C54000-000000067F000040020000A0000000C58000__0000005D2FFFFB38 000000067F000040020000A0000000C565B4-000000067F000040020000A0000000C5EFA1__0000003336EBF989-00000033D69BE889 000000067F000040020000A0000000C58000-000000067F000040020000A0000000C5C000__00000033605476A8 000000067F000040020000A0000000C58000-000000067F000040020000A0000000C5C000__00000038E67ABFA0 000000067F000040020000A0000000C58000-000000067F000040020000A0000000C5C000__0000003903F1CFE8 000000067F000040020000A0000000C58000-000000067F000040020000A0000000C5C000__0000003B99F7F8A0 000000067F000040020000A0000000C58000-000000067F000040020000A0000000C5C000__0000005D2FFFFB38 000000067F000040020000A0000000C5C000-000000067F000040020000A0000000C60000__00000033605476A8 000000067F000040020000A0000000C5C000-000000067F000040020000A0000000C60000__00000038E67ABFA0 000000067F000040020000A0000000C5C000-000000067F000040020000A0000000C60000__0000003903F1CFE8 000000067F000040020000A0000000C5C000-000000067F000040020000A0000000C60000__0000003B99F7F8A0 000000067F000040020000A0000000C5C000-000000067F000040020000A0000000C60000__0000005D2FFFFB38 000000067F000040020000A0000000C5EFA1-000000067F000040020000A0000000C6797A__0000003336EBF989-00000033D69BE889 000000067F000040020000A0000000C60000-000000067F000040020000A0000000C64000__00000033605476A8 000000067F000040020000A0000000C60000-000000067F000040020000A0000000C64000__00000038E67ABFA0 000000067F000040020000A0000000C60000-000000067F000040020000A0000000C64000__0000003903F1CFE8 000000067F000040020000A0000000C60000-000000067F000040020000A0000000C64000__0000003B99F7F8A0 
000000067F000040020000A0000000C60000-000000067F000040020000A0000000C64000__0000005D2FFFFB38 000000067F000040020000A0000000C64000-000000067F000040020000A0000000C68000__00000038E67ABFA0 000000067F000040020000A0000000C64000-000000067F000040020000A0000000C68000__0000003903F1CFE8 000000067F000040020000A0000000C64000-000000067F000040020000A0000000C68000__0000003B99F7F8A0 000000067F000040020000A0000000C64000-000000067F000040020000A0000000C68000__0000005D2FFFFB38 000000067F000040020000A0000000C64000-030000000000000000000000000000000002__00000033605476A8 000000067F000040020000A0000000C6797A-000000067F000040020000A0000000C7034B__0000003336EBF989-00000033D69BE889 000000067F000040020000A0000000C68000-000000067F000040020000A0000000C6C000__00000038E67ABFA0 000000067F000040020000A0000000C68000-000000067F000040020000A0000000C6C000__0000003903F1CFE8 000000067F000040020000A0000000C68000-000000067F000040020000A0000000C6C000__0000003B99F7F8A0 000000067F000040020000A0000000C68000-000000067F000040020000A0000000C6C000__0000005D2FFFFB38 000000067F000040020000A0000000C6C000-000000067F000040020000A0000000C70000__00000038E67ABFA0 000000067F000040020000A0000000C6C000-000000067F000040020000A0000000C70000__0000003903F1CFE8 000000067F000040020000A0000000C6C000-000000067F000040020000A0000000C70000__0000003B99F7F8A0 000000067F000040020000A0000000C6C000-000000067F000040020000A0000000C70000__0000005D2FFFFB38 000000067F000040020000A0000000C70000-000000067F000040020000A0000000C74000__00000038E67ABFA0 000000067F000040020000A0000000C70000-000000067F000040020000A0000000C74000__0000003903F1CFE8 000000067F000040020000A0000000C70000-000000067F000040020000A0000000C74000__0000003B99F7F8A0 000000067F000040020000A0000000C70000-000000067F000040020000A0000000C74000__0000005D2FFFFB38 000000067F000040020000A0000000C7034B-000000067F000040020000A0000000C78D17__0000003336EBF989-00000033D69BE889 000000067F000040020000A0000000C74000-000000067F000040020000A0000000C78000__00000038E67ABFA0 
000000067F000040020000A0000000C74000-000000067F000040020000A0000000C78000__0000003903F1CFE8 000000067F000040020000A0000000C74000-000000067F000040020000A0000000C78000__0000003B99F7F8A0 000000067F000040020000A0000000C74000-000000067F000040020000A0000000C78000__0000005D2FFFFB38 000000067F000040020000A0000000C78000-000000067F000040020000A0000000C7C000__00000038E67ABFA0 000000067F000040020000A0000000C78000-000000067F000040020000A0000000C7C000__0000003903F1CFE8 000000067F000040020000A0000000C78000-000000067F000040020000A0000000C7C000__0000003B99F7F8A0 000000067F000040020000A0000000C78000-000000067F000040020000A0000000C7C000__0000005D2FFFFB38 000000067F000040020000A0000000C78D17-000000067F000040020000A0000000C816E2__0000003336EBF989-00000033D69BE889 000000067F000040020000A0000000C7C000-000000067F000040020000A0000000C80000__00000038E67ABFA0 000000067F000040020000A0000000C7C000-000000067F000040020000A0000000C80000__0000003903F1CFE8 000000067F000040020000A0000000C7C000-000000067F000040020000A0000000C80000__0000003B99F7F8A0 000000067F000040020000A0000000C7C000-000000067F000040020000A0000000C80000__0000005D2FFFFB38 000000067F000040020000A0000000C80000-000000067F000040020000A0000000C84000__00000038E67ABFA0 000000067F000040020000A0000000C80000-000000067F000040020000A0000000C84000__0000003903F1CFE8 000000067F000040020000A0000000C80000-000000067F000040020000A0000000C84000__0000003B99F7F8A0 000000067F000040020000A0000000C80000-000000067F000040020000A0000000C84000__0000005D2FFFFB38 000000067F000040020000A0000000C816E2-000000067F000040020000A0000000C8A0D8__0000003336EBF989-00000033D69BE889 000000067F000040020000A0000000C84000-000000067F000040020000A0000000C88000__00000038E67ABFA0 000000067F000040020000A0000000C84000-000000067F000040020000A0000000C88000__0000003903F1CFE8 000000067F000040020000A0000000C84000-000000067F000040020000A0000000C88000__0000003B99F7F8A0 000000067F000040020000A0000000C84000-000000067F000040020000A0000000C88000__0000005D2FFFFB38 
000000067F000040020000A0000000C88000-000000067F000040020000A0000000C8C000__00000038E67ABFA0 000000067F000040020000A0000000C88000-000000067F000040020000A0000000C8C000__0000003903F1CFE8 000000067F000040020000A0000000C88000-000000067F000040020000A0000000C8C000__0000003B99F7F8A0 000000067F000040020000A0000000C88000-000000067F000040020000A0000000C8C000__0000005D2FFFFB38 000000067F000040020000A0000000C8A0D8-000000067F000040020000A0000000C92AC4__0000003336EBF989-00000033D69BE889 000000067F000040020000A0000000C8C000-000000067F000040020000A0000000C90000__00000038E67ABFA0 000000067F000040020000A0000000C8C000-000000067F000040020000A0000000C90000__0000003903F1CFE8 000000067F000040020000A0000000C8C000-000000067F000040020000A0000000C90000__0000003B99F7F8A0 000000067F000040020000A0000000C8C000-000000067F000040020000A0000000C90000__0000005D2FFFFB38 000000067F000040020000A0000000C90000-000000067F000040020000A0000000C94000__00000038E67ABFA0 000000067F000040020000A0000000C90000-000000067F000040020000A0000000C94000__0000003903F1CFE8 000000067F000040020000A0000000C90000-000000067F000040020000A0000000C94000__0000003B99F7F8A0 000000067F000040020000A0000000C90000-000000067F000040020000A0000000C94000__0000005D2FFFFB38 000000067F000040020000A0000000C92AC4-000000067F000040020000A0000000C9B4AF__0000003336EBF989-00000033D69BE889 000000067F000040020000A0000000C94000-000000067F000040020000A0000000C98000__00000038E67ABFA0 000000067F000040020000A0000000C94000-000000067F000040020000A0000000C98000__0000003903F1CFE8 000000067F000040020000A0000000C94000-000000067F000040020000A0000000C98000__0000003B99F7F8A0 000000067F000040020000A0000000C94000-000000067F000040020000A0000000C98000__0000005D2FFFFB38 000000067F000040020000A0000000C98000-000000067F000040020000A0000000C9C000__00000038E67ABFA0 000000067F000040020000A0000000C98000-000000067F000040020000A0000000C9C000__0000003903F1CFE8 000000067F000040020000A0000000C98000-000000067F000040020000A0000000C9C000__0000003B99F7F8A0 
000000067F000040020000A0000000C98000-000000067F000040020000A0000000C9C000__0000005D2FFFFB38 000000067F000040020000A0000000C9B4AF-000000067F000040020000A0000000CA3E87__0000003336EBF989-00000033D69BE889 000000067F000040020000A0000000C9C000-000000067F000040020000A0000000CA0000__00000038E67ABFA0 000000067F000040020000A0000000C9C000-000000067F000040020000A0000000CA0000__0000003903F1CFE8 000000067F000040020000A0000000C9C000-000000067F000040020000A0000000CA0000__0000003B99F7F8A0 000000067F000040020000A0000000C9C000-000000067F000040020000A0000000CA0000__0000005D2FFFFB38 000000067F000040020000A0000000CA0000-000000067F000040020000A0000000CA4000__00000038E67ABFA0 000000067F000040020000A0000000CA0000-000000067F000040020000A0000000CA4000__0000003903F1CFE8 000000067F000040020000A0000000CA0000-000000067F000040020000A0000000CA4000__0000003B99F7F8A0 000000067F000040020000A0000000CA0000-000000067F000040020000A0000000CA4000__0000005D2FFFFB38 000000067F000040020000A0000000CA3E87-000000067F000040020000A0000200000000__0000003336EBF989-00000033D69BE889 000000067F000040020000A0000000CA4000-000000067F000040020000A0000000CA8000__00000038E1ABFE28 000000067F000040020000A0000000CA4000-000000067F000040020000A0000000CA8000__00000038E9AF7F00 000000067F000040020000A0000000CA4000-000000067F000040020000A0000000CA8000__0000003903F1CFE8 000000067F000040020000A0000000CA4000-000000067F000040020000A0000000CA8000__0000003B99F7F8A0 000000067F000040020000A0000000CA4000-000000067F000040020000A0000000CA8000__0000005D2FFFFB38 000000067F000040020000A0000000CA403E-000000067F000040020000A0000000CACA12__00000033D69BE889-00000034764BE349 000000067F000040020000A0000000CA8000-000000067F000040020000A0000000CAC000__00000038E1ABFE28 000000067F000040020000A0000000CA8000-000000067F000040020000A0000000CAC000__00000038E9AF7F00 000000067F000040020000A0000000CA8000-000000067F000040020000A0000000CAC000__0000003903F1CFE8 000000067F000040020000A0000000CA8000-000000067F000040020000A0000000CAC000__0000003B99F7F8A0 
000000067F000040020000A0000000CA8000-000000067F000040020000A0000000CAC000__0000005D2FFFFB38 000000067F000040020000A0000000CAC000-000000067F000040020000A0000000CB0000__00000038E1ABFE28 000000067F000040020000A0000000CAC000-000000067F000040020000A0000000CB0000__00000038E9AF7F00 000000067F000040020000A0000000CAC000-000000067F000040020000A0000000CB0000__0000003903F1CFE8 000000067F000040020000A0000000CAC000-000000067F000040020000A0000000CB0000__0000003B99F7F8A0 000000067F000040020000A0000000CAC000-000000067F000040020000A0000000CB0000__0000005D2FFFFB38 000000067F000040020000A0000000CACA12-000000067F000040020000A0000000CB53E3__00000033D69BE889-00000034764BE349 000000067F000040020000A0000000CB0000-000000067F000040020000A0000000CB4000__00000038E1ABFE28 000000067F000040020000A0000000CB0000-000000067F000040020000A0000000CB4000__00000038E9AF7F00 000000067F000040020000A0000000CB0000-000000067F000040020000A0000000CB4000__0000003903F1CFE8 000000067F000040020000A0000000CB0000-000000067F000040020000A0000000CB4000__0000003B99F7F8A0 000000067F000040020000A0000000CB0000-000000067F000040020000A0000000CB4000__0000005D2FFFFB38 000000067F000040020000A0000000CB4000-000000067F000040020000A0000000CB8000__00000038E1ABFE28 000000067F000040020000A0000000CB4000-000000067F000040020000A0000000CB8000__00000038E9AF7F00 000000067F000040020000A0000000CB4000-000000067F000040020000A0000000CB8000__0000003903F1CFE8 000000067F000040020000A0000000CB4000-000000067F000040020000A0000000CB8000__0000003B99F7F8A0 000000067F000040020000A0000000CB4000-000000067F000040020000A0000000CB8000__0000005D2FFFFB38 000000067F000040020000A0000000CB53E3-000000067F000040020000A0000000CBDDBA__00000033D69BE889-00000034764BE349 000000067F000040020000A0000000CB8000-000000067F000040020000A0000000CBC000__00000038E1ABFE28 000000067F000040020000A0000000CB8000-000000067F000040020000A0000000CBC000__00000038E9AF7F00 000000067F000040020000A0000000CB8000-000000067F000040020000A0000000CBC000__0000003903F1CFE8 
000000067F000040020000A0000000CB8000-000000067F000040020000A0000000CBC000__0000003B99F7F8A0 000000067F000040020000A0000000CB8000-000000067F000040020000A0000000CBC000__0000005D2FFFFB38 000000067F000040020000A0000000CBC000-000000067F000040020000A0000000CC0000__00000038E1ABFE28 000000067F000040020000A0000000CBC000-000000067F000040020000A0000000CC0000__00000038E9AF7F00 000000067F000040020000A0000000CBC000-000000067F000040020000A0000000CC0000__0000003903F1CFE8 000000067F000040020000A0000000CBC000-000000067F000040020000A0000000CC0000__0000003B99F7F8A0 000000067F000040020000A0000000CBC000-000000067F000040020000A0000000CC0000__0000005D2FFFFB38 000000067F000040020000A0000000CBDDBA-000000067F000040020000A0000000CC67A6__00000033D69BE889-00000034764BE349 000000067F000040020000A0000000CC0000-000000067F000040020000A0000000CC4000__00000038E1ABFE28 000000067F000040020000A0000000CC0000-000000067F000040020000A0000000CC4000__00000038E9AF7F00 000000067F000040020000A0000000CC0000-000000067F000040020000A0000000CC4000__0000003903F1CFE8 000000067F000040020000A0000000CC0000-000000067F000040020000A0000000CC4000__0000003B99F7F8A0 000000067F000040020000A0000000CC0000-000000067F000040020000A0000000CC4000__0000005D2FFFFB38 000000067F000040020000A0000000CC4000-000000067F000040020000A0000000CC8000__00000038E1ABFE28 000000067F000040020000A0000000CC4000-000000067F000040020000A0000000CC8000__00000038E9AF7F00 000000067F000040020000A0000000CC4000-000000067F000040020000A0000000CC8000__0000003903F1CFE8 000000067F000040020000A0000000CC4000-000000067F000040020000A0000000CC8000__0000003B99F7F8A0 000000067F000040020000A0000000CC4000-000000067F000040020000A0000000CC8000__0000005D2FFFFB38 000000067F000040020000A0000000CC67A6-000000067F000040020000A0000000CCF196__00000033D69BE889-00000034764BE349 000000067F000040020000A0000000CC8000-000000067F000040020000A0000000CCC000__00000038E1ABFE28 000000067F000040020000A0000000CC8000-000000067F000040020000A0000000CCC000__00000038E9AF7F00 
000000067F000040020000A0000000CC8000-000000067F000040020000A0000000CCC000__0000003903F1CFE8 000000067F000040020000A0000000CC8000-000000067F000040020000A0000000CCC000__0000003B99F7F8A0 000000067F000040020000A0000000CC8000-000000067F000040020000A0000000CCC000__0000005D2FFFFB38 000000067F000040020000A0000000CCC000-000000067F000040020000A0000000CD0000__00000038E1ABFE28 000000067F000040020000A0000000CCC000-000000067F000040020000A0000000CD0000__00000038E9AF7F00 000000067F000040020000A0000000CCC000-000000067F000040020000A0000000CD0000__0000003903F1CFE8 000000067F000040020000A0000000CCC000-000000067F000040020000A0000000CD0000__0000003B99F7F8A0 000000067F000040020000A0000000CCC000-000000067F000040020000A0000000CD0000__0000005D2FFFFB38 000000067F000040020000A0000000CCF196-000000067F000040020000A0000000CD7BA1__00000033D69BE889-00000034764BE349 000000067F000040020000A0000000CD0000-000000067F000040020000A0000000CD4000__00000038E1ABFE28 000000067F000040020000A0000000CD0000-000000067F000040020000A0000000CD4000__00000038E9AF7F00 000000067F000040020000A0000000CD0000-000000067F000040020000A0000000CD4000__0000003903F1CFE8 000000067F000040020000A0000000CD0000-000000067F000040020000A0000000CD4000__0000003B99F7F8A0 000000067F000040020000A0000000CD0000-000000067F000040020000A0000000CD4000__0000005D2FFFFB38 000000067F000040020000A0000000CD4000-000000067F000040020000A0000000CD8000__00000038E1ABFE28 000000067F000040020000A0000000CD4000-000000067F000040020000A0000000CD8000__00000038E9AF7F00 000000067F000040020000A0000000CD4000-000000067F000040020000A0000000CD8000__0000003903F1CFE8 000000067F000040020000A0000000CD4000-000000067F000040020000A0000000CD8000__0000003B99F7F8A0 000000067F000040020000A0000000CD4000-000000067F000040020000A0000000CD8000__0000005D2FFFFB38 000000067F000040020000A0000000CD7BA1-000000067F000040020000A0000000CE0577__00000033D69BE889-00000034764BE349 000000067F000040020000A0000000CD8000-000000067F000040020000A0000000CDC000__00000038E1ABFE28 
000000067F000040020000A0000000CD8000-000000067F000040020000A0000000CDC000__00000038E9AF7F00 000000067F000040020000A0000000CD8000-000000067F000040020000A0000000CDC000__0000003903F1CFE8 000000067F000040020000A0000000CD8000-000000067F000040020000A0000000CDC000__0000003B99F7F8A0 000000067F000040020000A0000000CD8000-000000067F000040020000A0000000CDC000__0000005D2FFFFB38 000000067F000040020000A0000000CDC000-000000067F000040020000A0000000CE0000__00000038E1ABFE28 000000067F000040020000A0000000CDC000-000000067F000040020000A0000000CE0000__00000038E9AF7F00 000000067F000040020000A0000000CDC000-000000067F000040020000A0000000CE0000__0000003903F1CFE8 000000067F000040020000A0000000CDC000-000000067F000040020000A0000000CE0000__0000003B99F7F8A0 000000067F000040020000A0000000CDC000-000000067F000040020000A0000000CE0000__0000005D2FFFFB38 000000067F000040020000A0000000CE0000-000000067F000040020000A0000000CE4000__00000038E1ABFE28 000000067F000040020000A0000000CE0000-000000067F000040020000A0000000CE4000__00000038E9AF7F00 000000067F000040020000A0000000CE0000-000000067F000040020000A0000000CE4000__0000003903F1CFE8 000000067F000040020000A0000000CE0000-000000067F000040020000A0000000CE4000__0000003B99F7F8A0 000000067F000040020000A0000000CE0000-000000067F000040020000A0000000CE4000__0000005D2FFFFB38 000000067F000040020000A0000000CE0577-000000067F000040020000A0000000CE8F57__00000033D69BE889-00000034764BE349 000000067F000040020000A0000000CE4000-000000067F000040020000A0000000CE8000__00000038E1ABFE28 000000067F000040020000A0000000CE4000-000000067F000040020000A0000000CE8000__00000038E9AF7F00 000000067F000040020000A0000000CE4000-000000067F000040020000A0000000CE8000__0000003903F1CFE8 000000067F000040020000A0000000CE4000-000000067F000040020000A0000000CE8000__0000003B99F7F8A0 000000067F000040020000A0000000CE4000-000000067F000040020000A0000000CE8000__0000005D2FFFFB38 000000067F000040020000A0000000CE8000-000000067F000040020000A0000000CEC000__00000038E1ABFE28 
000000067F000040020000A0000000CE8000-000000067F000040020000A0000000CEC000__00000038E9AF7F00 000000067F000040020000A0000000CE8000-000000067F000040020000A0000000CEC000__0000003903F1CFE8 000000067F000040020000A0000000CE8000-000000067F000040020000A0000000CEC000__0000003B99F7F8A0 000000067F000040020000A0000000CE8000-000000067F000040020000A0000000CEC000__0000005D2FFFFB38 000000067F000040020000A0000000CE8F57-000000067F000040020000A0000000CF1933__00000033D69BE889-00000034764BE349 000000067F000040020000A0000000CEC000-000000067F000040020000A0000000CF0000__00000038E1ABFE28 000000067F000040020000A0000000CEC000-000000067F000040020000A0000000CF0000__00000038E9AF7F00 000000067F000040020000A0000000CEC000-000000067F000040020000A0000000CF0000__0000003903F1CFE8 000000067F000040020000A0000000CEC000-000000067F000040020000A0000000CF0000__0000003B99F7F8A0 000000067F000040020000A0000000CEC000-000000067F000040020000A0000000CF0000__0000005D2FFFFB38 000000067F000040020000A0000000CF0000-000000067F000040020000A0000000CF4000__00000038E1ABFE28 000000067F000040020000A0000000CF0000-000000067F000040020000A0000000CF4000__00000038E9AF7F00 000000067F000040020000A0000000CF0000-000000067F000040020000A0000000CF4000__0000003903F1CFE8 000000067F000040020000A0000000CF0000-000000067F000040020000A0000000CF4000__0000003B99F7F8A0 000000067F000040020000A0000000CF0000-000000067F000040020000A0000000CF4000__0000005D2FFFFB38 000000067F000040020000A0000000CF1933-000000067F000040020000A0000000CFA300__00000033D69BE889-00000034764BE349 000000067F000040020000A0000000CF4000-000000067F000040020000A0000000CF8000__00000038E1ABFE28 000000067F000040020000A0000000CF4000-000000067F000040020000A0000000CF8000__00000038E9AF7F00 000000067F000040020000A0000000CF4000-000000067F000040020000A0000000CF8000__0000003903F1CFE8 000000067F000040020000A0000000CF4000-000000067F000040020000A0000000CF8000__0000003B99F7F8A0 000000067F000040020000A0000000CF4000-000000067F000040020000A0000000CF8000__0000005D2FFFFB38 
000000067F000040020000A0000000CF8000-000000067F000040020000A0000000CFC000__0000003545E7DCF0 000000067F000040020000A0000000CF8000-000000067F000040020000A0000000CFC000__00000038E9AF7F00 000000067F000040020000A0000000CF8000-000000067F000040020000A0000000CFC000__0000003903F1CFE8 000000067F000040020000A0000000CF8000-000000067F000040020000A0000000CFC000__0000003B99F7F8A0 000000067F000040020000A0000000CF8000-000000067F000040020000A0000000CFC000__0000005D2FFFFB38 000000067F000040020000A0000000CFA300-000000067F000040020000A0000200000000__00000033D69BE889-00000034764BE349 000000067F000040020000A0000000CFA548-000000067F000040020000A0000000D02F25__00000034764BE349-0000003525F3D179 000000067F000040020000A0000000CFC000-000000067F000040020000A0000000D00000__0000003545E7DCF0 000000067F000040020000A0000000CFC000-000000067F000040020000A0000000D00000__00000038E9AF7F00 000000067F000040020000A0000000CFC000-000000067F000040020000A0000000D00000__0000003903F1CFE8 000000067F000040020000A0000000CFC000-000000067F000040020000A0000000D00000__0000003B99F7F8A0 000000067F000040020000A0000000CFC000-000000067F000040020000A0000000D00000__0000005D2FFFFB38 000000067F000040020000A0000000D00000-000000067F000040020000A0000000D04000__0000003545E7DCF0 000000067F000040020000A0000000D00000-000000067F000040020000A0000000D04000__00000038E9AF7F00 000000067F000040020000A0000000D00000-000000067F000040020000A0000000D04000__0000003903F1CFE8 000000067F000040020000A0000000D00000-000000067F000040020000A0000000D04000__0000003B99F7F8A0 000000067F000040020000A0000000D00000-000000067F000040020000A0000000D04000__0000005D2FFFFB38 000000067F000040020000A0000000D02F25-000000067F000040020000A0000000D0B903__00000034764BE349-0000003525F3D179 000000067F000040020000A0000000D04000-000000067F000040020000A0000000D08000__0000003545E7DCF0 000000067F000040020000A0000000D04000-000000067F000040020000A0000000D08000__00000038E9AF7F00 000000067F000040020000A0000000D04000-000000067F000040020000A0000000D08000__0000003903F1CFE8 
000000067F000040020000A0000000D04000-000000067F000040020000A0000000D08000__0000003B99F7F8A0 000000067F000040020000A0000000D04000-000000067F000040020000A0000000D08000__0000005D2FFFFB38 000000067F000040020000A0000000D08000-000000067F000040020000A0000000D0C000__0000003545E7DCF0 000000067F000040020000A0000000D08000-000000067F000040020000A0000000D0C000__00000038E9AF7F00 000000067F000040020000A0000000D08000-000000067F000040020000A0000000D0C000__0000003903F1CFE8 000000067F000040020000A0000000D08000-000000067F000040020000A0000000D0C000__0000003B99F7F8A0 000000067F000040020000A0000000D08000-000000067F000040020000A0000000D0C000__0000005D2FFFFB38 000000067F000040020000A0000000D0B903-000000067F000040020000A0000000D142DA__00000034764BE349-0000003525F3D179 000000067F000040020000A0000000D0C000-000000067F000040020000A0000000D10000__0000003545E7DCF0 000000067F000040020000A0000000D0C000-000000067F000040020000A0000000D10000__00000038E9AF7F00 000000067F000040020000A0000000D0C000-000000067F000040020000A0000000D10000__0000003903F1CFE8 000000067F000040020000A0000000D0C000-000000067F000040020000A0000000D10000__0000003B99F7F8A0 000000067F000040020000A0000000D0C000-000000067F000040020000A0000000D10000__0000005D2FFFFB38 000000067F000040020000A0000000D10000-000000067F000040020000A0000000D14000__0000003545E7DCF0 000000067F000040020000A0000000D10000-000000067F000040020000A0000000D14000__00000038E9AF7F00 000000067F000040020000A0000000D10000-000000067F000040020000A0000000D14000__0000003903F1CFE8 000000067F000040020000A0000000D10000-000000067F000040020000A0000000D14000__0000003B99F7F8A0 000000067F000040020000A0000000D10000-000000067F000040020000A0000000D14000__0000005D2FFFFB38 000000067F000040020000A0000000D14000-000000067F000040020000A0000000D18000__0000003545E7DCF0 000000067F000040020000A0000000D14000-000000067F000040020000A0000000D18000__00000038E9AF7F00 000000067F000040020000A0000000D14000-000000067F000040020000A0000000D18000__0000003903F1CFE8 
000000067F000040020000A0000000D14000-000000067F000040020000A0000000D18000__0000003B99F7F8A0 000000067F000040020000A0000000D14000-000000067F000040020000A0000000D18000__0000005D2FFFFB38 000000067F000040020000A0000000D142DA-000000067F000040020000A0000000D1CCBE__00000034764BE349-0000003525F3D179 000000067F000040020000A0000000D18000-000000067F000040020000A0000000D1C000__0000003545E7DCF0 000000067F000040020000A0000000D18000-000000067F000040020000A0000000D1C000__00000038E9AF7F00 000000067F000040020000A0000000D18000-000000067F000040020000A0000000D1C000__0000003903F1CFE8 000000067F000040020000A0000000D18000-000000067F000040020000A0000000D1C000__0000003B99F7F8A0 000000067F000040020000A0000000D18000-000000067F000040020000A0000000D1C000__0000005D2FFFFB38 000000067F000040020000A0000000D1C000-000000067F000040020000A0000000D20000__0000003545E7DCF0 000000067F000040020000A0000000D1C000-000000067F000040020000A0000000D20000__00000038E9AF7F00 000000067F000040020000A0000000D1C000-000000067F000040020000A0000000D20000__0000003903F1CFE8 000000067F000040020000A0000000D1C000-000000067F000040020000A0000000D20000__0000003B99F7F8A0 000000067F000040020000A0000000D1C000-000000067F000040020000A0000000D20000__0000005D2FFFFB38 000000067F000040020000A0000000D1CCBE-000000067F000040020000A0000000D25694__00000034764BE349-0000003525F3D179 000000067F000040020000A0000000D20000-000000067F000040020000A0000000D24000__0000003545E7DCF0 000000067F000040020000A0000000D20000-000000067F000040020000A0000000D24000__00000038E9AF7F00 000000067F000040020000A0000000D20000-000000067F000040020000A0000000D24000__0000003903F1CFE8 000000067F000040020000A0000000D20000-000000067F000040020000A0000000D24000__0000003B99F7F8A0 000000067F000040020000A0000000D20000-000000067F000040020000A0000000D24000__0000005D2FFFFB38 000000067F000040020000A0000000D24000-000000067F000040020000A0000000D28000__0000003545E7DCF0 000000067F000040020000A0000000D24000-000000067F000040020000A0000000D28000__00000038E9AF7F00 
000000067F000040020000A0000000D24000-000000067F000040020000A0000000D28000__0000003903F1CFE8 000000067F000040020000A0000000D24000-000000067F000040020000A0000000D28000__0000003B99F7F8A0 000000067F000040020000A0000000D24000-000000067F000040020000A0000000D28000__0000005D2FFFFB38 000000067F000040020000A0000000D25694-000000067F000040020000A0000000D2E06B__00000034764BE349-0000003525F3D179 000000067F000040020000A0000000D28000-000000067F000040020000A0000000D2C000__0000003545E7DCF0 000000067F000040020000A0000000D28000-000000067F000040020000A0000000D2C000__00000038E9AF7F00 000000067F000040020000A0000000D28000-000000067F000040020000A0000000D2C000__0000003903F1CFE8 000000067F000040020000A0000000D28000-000000067F000040020000A0000000D2C000__0000003B99F7F8A0 000000067F000040020000A0000000D28000-000000067F000040020000A0000000D2C000__0000005D2FFFFB38 000000067F000040020000A0000000D2C000-000000067F000040020000A0000000D30000__0000003545E7DCF0 000000067F000040020000A0000000D2C000-000000067F000040020000A0000000D30000__00000038E9AF7F00 000000067F000040020000A0000000D2C000-000000067F000040020000A0000000D30000__0000003903F1CFE8 000000067F000040020000A0000000D2C000-000000067F000040020000A0000000D30000__0000003B99F7F8A0 000000067F000040020000A0000000D2C000-000000067F000040020000A0000000D30000__0000005D2FFFFB38 000000067F000040020000A0000000D2E06B-000000067F000040020000A0000000D36A3F__00000034764BE349-0000003525F3D179 000000067F000040020000A0000000D30000-000000067F000040020000A0000000D34000__0000003545E7DCF0 000000067F000040020000A0000000D30000-000000067F000040020000A0000000D34000__00000038E9AF7F00 000000067F000040020000A0000000D30000-000000067F000040020000A0000000D34000__0000003903F1CFE8 000000067F000040020000A0000000D30000-000000067F000040020000A0000000D34000__0000003B99F7F8A0 000000067F000040020000A0000000D30000-000000067F000040020000A0000000D34000__0000005D2FFFFB38 000000067F000040020000A0000000D34000-000000067F000040020000A0000000D38000__0000003545E7DCF0 
000000067F000040020000A0000000D34000-000000067F000040020000A0000000D38000__00000038E9AF7F00 000000067F000040020000A0000000D34000-000000067F000040020000A0000000D38000__0000003903F1CFE8 000000067F000040020000A0000000D34000-000000067F000040020000A0000000D38000__0000003B99F7F8A0 000000067F000040020000A0000000D34000-000000067F000040020000A0000000D38000__0000005D2FFFFB38 000000067F000040020000A0000000D36A3F-000000067F000040020000A0000000D3F41D__00000034764BE349-0000003525F3D179 000000067F000040020000A0000000D38000-000000067F000040020000A0000000D3C000__0000003545E7DCF0 000000067F000040020000A0000000D38000-000000067F000040020000A0000000D3C000__00000038E9AF7F00 000000067F000040020000A0000000D38000-000000067F000040020000A0000000D3C000__0000003903F1CFE8 000000067F000040020000A0000000D38000-000000067F000040020000A0000000D3C000__0000003B99F7F8A0 000000067F000040020000A0000000D38000-000000067F000040020000A0000000D3C000__0000005D2FFFFB38 000000067F000040020000A0000000D3C000-000000067F000040020000A0000000D40000__0000003545E7DCF0 000000067F000040020000A0000000D3C000-000000067F000040020000A0000000D40000__00000038E9AF7F00 000000067F000040020000A0000000D3C000-000000067F000040020000A0000000D40000__0000003903F1CFE8 000000067F000040020000A0000000D3C000-000000067F000040020000A0000000D40000__0000003B99F7F8A0 000000067F000040020000A0000000D3C000-000000067F000040020000A0000000D40000__0000005D2FFFFB38 000000067F000040020000A0000000D3F41D-000000067F000040020000A0000000D47DFC__00000034764BE349-0000003525F3D179 000000067F000040020000A0000000D40000-000000067F000040020000A0000000D44000__0000003545E7DCF0 000000067F000040020000A0000000D40000-000000067F000040020000A0000000D44000__00000038E9AF7F00 000000067F000040020000A0000000D40000-000000067F000040020000A0000000D44000__0000003903F1CFE8 000000067F000040020000A0000000D40000-000000067F000040020000A0000000D44000__0000003B99F7F8A0 000000067F000040020000A0000000D40000-000000067F000040020000A0000000D44000__0000005D2FFFFB38 
000000067F000040020000A0000000D44000-000000067F000040020000A0000000D48000__0000003545E7DCF0 000000067F000040020000A0000000D44000-000000067F000040020000A0000000D48000__00000038E9AF7F00 000000067F000040020000A0000000D44000-000000067F000040020000A0000000D48000__0000003903F1CFE8 000000067F000040020000A0000000D44000-000000067F000040020000A0000000D48000__0000003B99F7F8A0 000000067F000040020000A0000000D44000-000000067F000040020000A0000000D48000__0000005D2FFFFB38 000000067F000040020000A0000000D47DFC-000000067F000040020000A0000000D507EE__00000034764BE349-0000003525F3D179 000000067F000040020000A0000000D48000-000000067F000040020000A0000000D4C000__0000003545E7DCF0 000000067F000040020000A0000000D48000-000000067F000040020000A0000000D4C000__00000038E9AF7F00 000000067F000040020000A0000000D48000-000000067F000040020000A0000000D4C000__0000003903F1CFE8 000000067F000040020000A0000000D48000-000000067F000040020000A0000000D4C000__0000003B99F7F8A0 000000067F000040020000A0000000D48000-000000067F000040020000A0000000D4C000__0000005D2FFFFB38 000000067F000040020000A0000000D4C000-000000067F000040020000A0000000D50000__0000003545E7DCF0 000000067F000040020000A0000000D4C000-000000067F000040020000A0000000D50000__00000038E9AF7F00 000000067F000040020000A0000000D4C000-000000067F000040020000A0000000D50000__0000003903F1CFE8 000000067F000040020000A0000000D4C000-000000067F000040020000A0000000D50000__0000003B99F7F8A0 000000067F000040020000A0000000D4C000-000000067F000040020000A0000000D50000__0000005D2FFFFB38 000000067F000040020000A0000000D50000-000000067F000040020000A0000000D54000__0000003545E7DCF0 000000067F000040020000A0000000D50000-000000067F000040020000A0000000D54000__00000038E9AF7F00 000000067F000040020000A0000000D50000-000000067F000040020000A0000000D54000__0000003903F1CFE8 000000067F000040020000A0000000D50000-000000067F000040020000A0000000D54000__0000003B99F7F8A0 000000067F000040020000A0000000D50000-000000067F000040020000A0000000D54000__0000005D2FFFFB38 
000000067F000040020000A0000000D507EE-000000067F000040020000A0000000D591D1__00000034764BE349-0000003525F3D179 000000067F000040020000A0000000D54000-000000067F000040020000A0000000D58000__0000003545E7DCF0 000000067F000040020000A0000000D54000-000000067F000040020000A0000000D58000__00000038E9AF7F00 000000067F000040020000A0000000D54000-000000067F000040020000A0000000D58000__0000003903F1CFE8 000000067F000040020000A0000000D54000-000000067F000040020000A0000000D58000__0000003B99F7F8A0 000000067F000040020000A0000000D54000-000000067F000040020000A0000000D58000__0000005D2FFFFB38 000000067F000040020000A0000000D58000-000000067F000040020000A0000000D5C000__0000003545E7DCF0 000000067F000040020000A0000000D58000-000000067F000040020000A0000000D5C000__00000038E67ABFA0 000000067F000040020000A0000000D58000-000000067F000040020000A0000000D5C000__0000003903F1CFE8 000000067F000040020000A0000000D58000-000000067F000040020000A0000000D5C000__0000003B99F7F8A0 000000067F000040020000A0000000D58000-000000067F000040020000A0000000D5C000__0000005D2FFFFB38 000000067F000040020000A0000000D591D1-000000067F000040020000A0000200000000__00000034764BE349-0000003525F3D179 000000067F000040020000A0000000D593E0-000000067F000040020000A0000000D61DB9__0000003525F3D179-00000035C5A3EE11 000000067F000040020000A0000000D5C000-000000067F000040020000A0000000D60000__0000003545E7DCF0 000000067F000040020000A0000000D5C000-000000067F000040020000A0000000D60000__00000038E67ABFA0 000000067F000040020000A0000000D5C000-000000067F000040020000A0000000D60000__0000003903F1CFE8 000000067F000040020000A0000000D5C000-000000067F000040020000A0000000D60000__0000003B99F7F8A0 000000067F000040020000A0000000D5C000-000000067F000040020000A0000000D60000__0000005D2FFFFB38 000000067F000040020000A0000000D60000-000000067F000040020000A0000000D64000__0000003545E7DCF0 000000067F000040020000A0000000D60000-000000067F000040020000A0000000D64000__00000038E67ABFA0 000000067F000040020000A0000000D60000-000000067F000040020000A0000000D64000__0000003903F1CFE8 
000000067F000040020000A0000000D60000-000000067F000040020000A0000000D64000__0000003B99F7F8A0 000000067F000040020000A0000000D60000-000000067F000040020000A0000000D64000__0000005D2FFFFB38 000000067F000040020000A0000000D61DB9-000000067F000040020000A0000000D6A793__0000003525F3D179-00000035C5A3EE11 000000067F000040020000A0000000D64000-000000067F000040020000A0000000D68000__0000003545E7DCF0 000000067F000040020000A0000000D64000-000000067F000040020000A0000000D68000__00000038E67ABFA0 000000067F000040020000A0000000D64000-000000067F000040020000A0000000D68000__0000003903F1CFE8 000000067F000040020000A0000000D64000-000000067F000040020000A0000000D68000__0000003B99F7F8A0 000000067F000040020000A0000000D64000-000000067F000040020000A0000000D68000__0000005D2FFFFB38 000000067F000040020000A0000000D68000-000000067F000040020000A0000000D6C000__00000038E67ABFA0 000000067F000040020000A0000000D68000-000000067F000040020000A0000000D6C000__0000003903F1CFE8 000000067F000040020000A0000000D68000-000000067F000040020000A0000000D6C000__0000003B99F7F8A0 000000067F000040020000A0000000D68000-000000067F000040020000A0000000D6C000__0000005D2FFFFB38 000000067F000040020000A0000000D68000-030000000000000000000000000000000002__0000003545E7DCF0 000000067F000040020000A0000000D6A793-000000067F000040020000A0000000D73179__0000003525F3D179-00000035C5A3EE11 000000067F000040020000A0000000D6C000-000000067F000040020000A0000000D70000__00000038E67ABFA0 000000067F000040020000A0000000D6C000-000000067F000040020000A0000000D70000__0000003903F1CFE8 000000067F000040020000A0000000D6C000-000000067F000040020000A0000000D70000__0000003B99F7F8A0 000000067F000040020000A0000000D6C000-000000067F000040020000A0000000D70000__0000005D2FFFFB38 000000067F000040020000A0000000D70000-000000067F000040020000A0000000D74000__00000038E67ABFA0 000000067F000040020000A0000000D70000-000000067F000040020000A0000000D74000__0000003903F1CFE8 000000067F000040020000A0000000D70000-000000067F000040020000A0000000D74000__0000003B99F7F8A0 
000000067F000040020000A0000000D70000-000000067F000040020000A0000000D74000__0000005D2FFFFB38 000000067F000040020000A0000000D73179-000000067F000040020000A0000000D7BB57__0000003525F3D179-00000035C5A3EE11 000000067F000040020000A0000000D74000-000000067F000040020000A0000000D78000__00000038E67ABFA0 000000067F000040020000A0000000D74000-000000067F000040020000A0000000D78000__0000003903F1CFE8 000000067F000040020000A0000000D74000-000000067F000040020000A0000000D78000__0000003B99F7F8A0 000000067F000040020000A0000000D74000-000000067F000040020000A0000000D78000__0000005D2FFFFB38 000000067F000040020000A0000000D78000-000000067F000040020000A0000000D7C000__00000038E67ABFA0 000000067F000040020000A0000000D78000-000000067F000040020000A0000000D7C000__0000003903F1CFE8 000000067F000040020000A0000000D78000-000000067F000040020000A0000000D7C000__0000003B99F7F8A0 000000067F000040020000A0000000D78000-000000067F000040020000A0000000D7C000__0000005D2FFFFB38 000000067F000040020000A0000000D7BB57-000000067F000040020000A0000000D8453C__0000003525F3D179-00000035C5A3EE11 000000067F000040020000A0000000D7C000-000000067F000040020000A0000000D80000__00000038E67ABFA0 000000067F000040020000A0000000D7C000-000000067F000040020000A0000000D80000__0000003903F1CFE8 000000067F000040020000A0000000D7C000-000000067F000040020000A0000000D80000__0000003B99F7F8A0 000000067F000040020000A0000000D7C000-000000067F000040020000A0000000D80000__0000005D2FFFFB38 000000067F000040020000A0000000D80000-000000067F000040020000A0000000D84000__00000038E67ABFA0 000000067F000040020000A0000000D80000-000000067F000040020000A0000000D84000__0000003903F1CFE8 000000067F000040020000A0000000D80000-000000067F000040020000A0000000D84000__0000003B99F7F8A0 000000067F000040020000A0000000D80000-000000067F000040020000A0000000D84000__0000005D2FFFFB38 000000067F000040020000A0000000D84000-000000067F000040020000A0000000D88000__00000038E67ABFA0 000000067F000040020000A0000000D84000-000000067F000040020000A0000000D88000__0000003903F1CFE8 
000000067F000040020000A0000000D84000-000000067F000040020000A0000000D88000__0000003B99F7F8A0 000000067F000040020000A0000000D84000-000000067F000040020000A0000000D88000__0000005D2FFFFB38 000000067F000040020000A0000000D8453C-000000067F000040020000A0000000D8CF1B__0000003525F3D179-00000035C5A3EE11 000000067F000040020000A0000000D88000-000000067F000040020000A0000000D8C000__00000038E67ABFA0 000000067F000040020000A0000000D88000-000000067F000040020000A0000000D8C000__0000003903F1CFE8 000000067F000040020000A0000000D88000-000000067F000040020000A0000000D8C000__0000003B99F7F8A0 000000067F000040020000A0000000D88000-000000067F000040020000A0000000D8C000__0000005D2FFFFB38 000000067F000040020000A0000000D8C000-000000067F000040020000A0000000D90000__00000038E67ABFA0 000000067F000040020000A0000000D8C000-000000067F000040020000A0000000D90000__0000003903F1CFE8 000000067F000040020000A0000000D8C000-000000067F000040020000A0000000D90000__0000003B99F7F8A0 000000067F000040020000A0000000D8C000-000000067F000040020000A0000000D90000__0000005D2FFFFB38 000000067F000040020000A0000000D8CF1B-000000067F000040020000A0000000D958EB__0000003525F3D179-00000035C5A3EE11 000000067F000040020000A0000000D90000-000000067F000040020000A0000000D94000__00000038E67ABFA0 000000067F000040020000A0000000D90000-000000067F000040020000A0000000D94000__0000003903F1CFE8 000000067F000040020000A0000000D90000-000000067F000040020000A0000000D94000__0000003B99F7F8A0 000000067F000040020000A0000000D90000-000000067F000040020000A0000000D94000__0000005D2FFFFB38 000000067F000040020000A0000000D94000-000000067F000040020000A0000000D98000__00000038E67ABFA0 000000067F000040020000A0000000D94000-000000067F000040020000A0000000D98000__0000003903F1CFE8 000000067F000040020000A0000000D94000-000000067F000040020000A0000000D98000__0000003B99F7F8A0 000000067F000040020000A0000000D94000-000000067F000040020000A0000000D98000__0000005D2FFFFB38 000000067F000040020000A0000000D958EB-000000067F000040020000A0000000D9E2CF__0000003525F3D179-00000035C5A3EE11 
000000067F000040020000A0000000D98000-000000067F000040020000A0000000D9C000__00000038E67ABFA0 000000067F000040020000A0000000D98000-000000067F000040020000A0000000D9C000__0000003903F1CFE8 000000067F000040020000A0000000D98000-000000067F000040020000A0000000D9C000__0000003B99F7F8A0 000000067F000040020000A0000000D98000-000000067F000040020000A0000000D9C000__0000005D2FFFFB38 000000067F000040020000A0000000D9C000-000000067F000040020000A0000000DA0000__00000038E67ABFA0 000000067F000040020000A0000000D9C000-000000067F000040020000A0000000DA0000__0000003903F1CFE8 000000067F000040020000A0000000D9C000-000000067F000040020000A0000000DA0000__0000003B99F7F8A0 000000067F000040020000A0000000D9C000-000000067F000040020000A0000000DA0000__0000005D2FFFFB38 000000067F000040020000A0000000D9E2CF-000000067F000040020000A0000000DA6CA5__0000003525F3D179-00000035C5A3EE11 000000067F000040020000A0000000DA0000-000000067F000040020000A0000000DA4000__00000038E67ABFA0 000000067F000040020000A0000000DA0000-000000067F000040020000A0000000DA4000__0000003903F1CFE8 000000067F000040020000A0000000DA0000-000000067F000040020000A0000000DA4000__0000003B99F7F8A0 000000067F000040020000A0000000DA0000-000000067F000040020000A0000000DA4000__0000005D2FFFFB38 000000067F000040020000A0000000DA4000-000000067F000040020000A0000000DA8000__00000038E67ABFA0 000000067F000040020000A0000000DA4000-000000067F000040020000A0000000DA8000__0000003903F1CFE8 000000067F000040020000A0000000DA4000-000000067F000040020000A0000000DA8000__0000003B99F7F8A0 000000067F000040020000A0000000DA4000-000000067F000040020000A0000000DA8000__0000005D2FFFFB38 000000067F000040020000A0000000DA6CA5-000000067F000040020000A0000000DAF684__0000003525F3D179-00000035C5A3EE11 000000067F000040020000A0000000DA8000-000000067F000040020000A0000000DAC000__00000038E67ABFA0 000000067F000040020000A0000000DA8000-000000067F000040020000A0000000DAC000__0000003903F1CFE8 000000067F000040020000A0000000DA8000-000000067F000040020000A0000000DAC000__0000003B99F7F8A0 
000000067F000040020000A0000000DA8000-000000067F000040020000A0000000DAC000__0000005D2FFFFB38 000000067F000040020000A0000000DAC000-000000067F000040020000A0000000DB0000__00000038E1ABFE28 000000067F000040020000A0000000DAC000-000000067F000040020000A0000000DB0000__00000038E9AF7F00 000000067F000040020000A0000000DAC000-000000067F000040020000A0000000DB0000__0000003903F1CFE8 000000067F000040020000A0000000DAC000-000000067F000040020000A0000000DB0000__0000003B99F7F8A0 000000067F000040020000A0000000DAC000-000000067F000040020000A0000000DB0000__0000005D2FFFFB38 000000067F000040020000A0000000DAF684-000000067F000040020000A0000200000000__0000003525F3D179-00000035C5A3EE11 000000067F000040020000A0000000DAF8EF-000000067F000040020000A0000000DB82C7__00000035C5A3EE11-000000366553DF11 000000067F000040020000A0000000DB0000-000000067F000040020000A0000000DB4000__00000038E1ABFE28 000000067F000040020000A0000000DB0000-000000067F000040020000A0000000DB4000__00000038E9AF7F00 000000067F000040020000A0000000DB0000-000000067F000040020000A0000000DB4000__0000003903F1CFE8 000000067F000040020000A0000000DB0000-000000067F000040020000A0000000DB4000__0000003B99F7F8A0 000000067F000040020000A0000000DB0000-000000067F000040020000A0000000DB4000__0000005D2FFFFB38 000000067F000040020000A0000000DB4000-000000067F000040020000A0000000DB8000__00000038E1ABFE28 000000067F000040020000A0000000DB4000-000000067F000040020000A0000000DB8000__00000038E9AF7F00 000000067F000040020000A0000000DB4000-000000067F000040020000A0000000DB8000__0000003903F1CFE8 000000067F000040020000A0000000DB4000-000000067F000040020000A0000000DB8000__0000003B99F7F8A0 000000067F000040020000A0000000DB4000-000000067F000040020000A0000000DB8000__0000005D2FFFFB38 000000067F000040020000A0000000DB8000-000000067F000040020000A0000000DBC000__00000038E1ABFE28 000000067F000040020000A0000000DB8000-000000067F000040020000A0000000DBC000__00000038E9AF7F00 000000067F000040020000A0000000DB8000-000000067F000040020000A0000000DBC000__0000003903F1CFE8 
000000067F000040020000A0000000DB8000-000000067F000040020000A0000000DBC000__0000003B99F7F8A0 000000067F000040020000A0000000DB8000-000000067F000040020000A0000000DBC000__0000005D2FFFFB38 000000067F000040020000A0000000DB82C7-000000067F000040020000A0000000DC0CB4__00000035C5A3EE11-000000366553DF11 000000067F000040020000A0000000DBC000-000000067F000040020000A0000000DC0000__00000038E1ABFE28 000000067F000040020000A0000000DBC000-000000067F000040020000A0000000DC0000__00000038E9AF7F00 000000067F000040020000A0000000DBC000-000000067F000040020000A0000000DC0000__0000003903F1CFE8 000000067F000040020000A0000000DBC000-000000067F000040020000A0000000DC0000__0000003B99F7F8A0 000000067F000040020000A0000000DBC000-000000067F000040020000A0000000DC0000__0000005D2FFFFB38 000000067F000040020000A0000000DC0000-000000067F000040020000A0000000DC4000__00000038E1ABFE28 000000067F000040020000A0000000DC0000-000000067F000040020000A0000000DC4000__00000038E9AF7F00 000000067F000040020000A0000000DC0000-000000067F000040020000A0000000DC4000__0000003903F1CFE8 000000067F000040020000A0000000DC0000-000000067F000040020000A0000000DC4000__0000003B99F7F8A0 000000067F000040020000A0000000DC0000-000000067F000040020000A0000000DC4000__0000005D2FFFFB38 000000067F000040020000A0000000DC0CB4-000000067F000040020000A0000000DC9693__00000035C5A3EE11-000000366553DF11 000000067F000040020000A0000000DC4000-000000067F000040020000A0000000DC8000__00000038E1ABFE28 000000067F000040020000A0000000DC4000-000000067F000040020000A0000000DC8000__00000038E9AF7F00 000000067F000040020000A0000000DC4000-000000067F000040020000A0000000DC8000__0000003903F1CFE8 000000067F000040020000A0000000DC4000-000000067F000040020000A0000000DC8000__0000003B99F7F8A0 000000067F000040020000A0000000DC4000-000000067F000040020000A0000000DC8000__0000005D2FFFFB38 000000067F000040020000A0000000DC8000-000000067F000040020000A0000000DCC000__00000038E1ABFE28 000000067F000040020000A0000000DC8000-000000067F000040020000A0000000DCC000__00000038E9AF7F00 
000000067F000040020000A0000000DC8000-000000067F000040020000A0000000DCC000__0000003903F1CFE8 000000067F000040020000A0000000DC8000-000000067F000040020000A0000000DCC000__0000003B99F7F8A0 000000067F000040020000A0000000DC8000-000000067F000040020000A0000000DCC000__0000005D2FFFFB38 000000067F000040020000A0000000DC9693-000000067F000040020000A0000000DD2070__00000035C5A3EE11-000000366553DF11 000000067F000040020000A0000000DCC000-000000067F000040020000A0000000DD0000__00000038E1ABFE28 000000067F000040020000A0000000DCC000-000000067F000040020000A0000000DD0000__00000038E9AF7F00 000000067F000040020000A0000000DCC000-000000067F000040020000A0000000DD0000__0000003903F1CFE8 000000067F000040020000A0000000DCC000-000000067F000040020000A0000000DD0000__0000003B99F7F8A0 000000067F000040020000A0000000DCC000-000000067F000040020000A0000000DD0000__0000005D2FFFFB38 000000067F000040020000A0000000DD0000-000000067F000040020000A0000000DD4000__00000038E1ABFE28 000000067F000040020000A0000000DD0000-000000067F000040020000A0000000DD4000__00000038E9AF7F00 000000067F000040020000A0000000DD0000-000000067F000040020000A0000000DD4000__0000003903F1CFE8 000000067F000040020000A0000000DD0000-000000067F000040020000A0000000DD4000__0000003B99F7F8A0 000000067F000040020000A0000000DD0000-000000067F000040020000A0000000DD4000__0000005D2FFFFB38 000000067F000040020000A0000000DD2070-000000067F000040020000A0000000DDAA44__00000035C5A3EE11-000000366553DF11 000000067F000040020000A0000000DD4000-000000067F000040020000A0000000DD8000__00000038E1ABFE28 000000067F000040020000A0000000DD4000-000000067F000040020000A0000000DD8000__00000038E9AF7F00 000000067F000040020000A0000000DD4000-000000067F000040020000A0000000DD8000__0000003903F1CFE8 000000067F000040020000A0000000DD4000-000000067F000040020000A0000000DD8000__0000003B99F7F8A0 000000067F000040020000A0000000DD4000-000000067F000040020000A0000000DD8000__0000005D2FFFFB38 000000067F000040020000A0000000DD8000-000000067F000040020000A0000000DDC000__00000038E1ABFE28 
000000067F000040020000A0000000DD8000-000000067F000040020000A0000000DDC000__00000038E9AF7F00 000000067F000040020000A0000000DD8000-000000067F000040020000A0000000DDC000__0000003903F1CFE8 000000067F000040020000A0000000DD8000-000000067F000040020000A0000000DDC000__0000003B99F7F8A0 000000067F000040020000A0000000DD8000-000000067F000040020000A0000000DDC000__0000005D2FFFFB38 000000067F000040020000A0000000DDAA44-000000067F000040020000A0000000DE341F__00000035C5A3EE11-000000366553DF11 000000067F000040020000A0000000DDC000-000000067F000040020000A0000000DE0000__00000038E1ABFE28 000000067F000040020000A0000000DDC000-000000067F000040020000A0000000DE0000__00000038E9AF7F00 000000067F000040020000A0000000DDC000-000000067F000040020000A0000000DE0000__0000003903F1CFE8 000000067F000040020000A0000000DDC000-000000067F000040020000A0000000DE0000__0000003B99F7F8A0 000000067F000040020000A0000000DDC000-000000067F000040020000A0000000DE0000__0000005D2FFFFB38 000000067F000040020000A0000000DE0000-000000067F000040020000A0000000DE4000__00000038E1ABFE28 000000067F000040020000A0000000DE0000-000000067F000040020000A0000000DE4000__00000038E9AF7F00 000000067F000040020000A0000000DE0000-000000067F000040020000A0000000DE4000__0000003903F1CFE8 000000067F000040020000A0000000DE0000-000000067F000040020000A0000000DE4000__0000003B99F7F8A0 000000067F000040020000A0000000DE0000-000000067F000040020000A0000000DE4000__0000005D2FFFFB38 000000067F000040020000A0000000DE341F-000000067F000040020000A0000000DEBDF4__00000035C5A3EE11-000000366553DF11 000000067F000040020000A0000000DE4000-000000067F000040020000A0000000DE8000__00000038E1ABFE28 000000067F000040020000A0000000DE4000-000000067F000040020000A0000000DE8000__00000038E9AF7F00 000000067F000040020000A0000000DE4000-000000067F000040020000A0000000DE8000__0000003903F1CFE8 000000067F000040020000A0000000DE4000-000000067F000040020000A0000000DE8000__0000003B99F7F8A0 000000067F000040020000A0000000DE4000-000000067F000040020000A0000000DE8000__0000005D2FFFFB38 
000000067F000040020000A0000000DE8000-000000067F000040020000A0000000DEC000__00000038E1ABFE28 000000067F000040020000A0000000DE8000-000000067F000040020000A0000000DEC000__00000038E9AF7F00 000000067F000040020000A0000000DE8000-000000067F000040020000A0000000DEC000__0000003903F1CFE8 000000067F000040020000A0000000DE8000-000000067F000040020000A0000000DEC000__0000003B99F7F8A0 000000067F000040020000A0000000DE8000-000000067F000040020000A0000000DEC000__0000005D2FFFFB38 000000067F000040020000A0000000DEBDF4-000000067F000040020000A0000000DF47DB__00000035C5A3EE11-000000366553DF11 000000067F000040020000A0000000DEC000-000000067F000040020000A0000000DF0000__00000038E1ABFE28 000000067F000040020000A0000000DEC000-000000067F000040020000A0000000DF0000__00000038E9AF7F00 000000067F000040020000A0000000DEC000-000000067F000040020000A0000000DF0000__0000003903F1CFE8 000000067F000040020000A0000000DEC000-000000067F000040020000A0000000DF0000__0000003B99F7F8A0 000000067F000040020000A0000000DEC000-000000067F000040020000A0000000DF0000__0000005D2FFFFB38 000000067F000040020000A0000000DF0000-000000067F000040020000A0000000DF4000__00000038E1ABFE28 000000067F000040020000A0000000DF0000-000000067F000040020000A0000000DF4000__00000038E9AF7F00 000000067F000040020000A0000000DF0000-000000067F000040020000A0000000DF4000__0000003903F1CFE8 000000067F000040020000A0000000DF0000-000000067F000040020000A0000000DF4000__0000003B99F7F8A0 000000067F000040020000A0000000DF0000-000000067F000040020000A0000000DF4000__0000005D2FFFFB38 000000067F000040020000A0000000DF4000-000000067F000040020000A0000000DF8000__00000038E1ABFE28 000000067F000040020000A0000000DF4000-000000067F000040020000A0000000DF8000__00000038E9AF7F00 000000067F000040020000A0000000DF4000-000000067F000040020000A0000000DF8000__0000003903F1CFE8 000000067F000040020000A0000000DF4000-000000067F000040020000A0000000DF8000__0000003B99F7F8A0 000000067F000040020000A0000000DF4000-000000067F000040020000A0000000DF8000__0000005D2FFFFB38 
000000067F000040020000A0000000DF47DB-000000067F000040020000A0000000DFD1C6__00000035C5A3EE11-000000366553DF11 000000067F000040020000A0000000DF8000-000000067F000040020000A0000000DFC000__00000038E1ABFE28 000000067F000040020000A0000000DF8000-000000067F000040020000A0000000DFC000__00000038E9AF7F00 000000067F000040020000A0000000DF8000-000000067F000040020000A0000000DFC000__0000003903F1CFE8 000000067F000040020000A0000000DF8000-000000067F000040020000A0000000DFC000__0000003B99F7F8A0 000000067F000040020000A0000000DF8000-000000067F000040020000A0000000DFC000__0000005D2FFFFB38 000000067F000040020000A0000000DFC000-000000067F000040020000A0000000E00000__00000038E1ABFE28 000000067F000040020000A0000000DFC000-000000067F000040020000A0000000E00000__00000038E9AF7F00 000000067F000040020000A0000000DFC000-000000067F000040020000A0000000E00000__0000003903F1CFE8 000000067F000040020000A0000000DFC000-000000067F000040020000A0000000E00000__0000003B99F7F8A0 000000067F000040020000A0000000DFC000-000000067F000040020000A0000000E00000__0000005D2FFFFB38 000000067F000040020000A0000000DFD1C6-000000067F000040020000A0000000E05BAE__00000035C5A3EE11-000000366553DF11 000000067F000040020000A0000000E00000-000000067F000040020000A0000000E04000__00000038E1ABFE28 000000067F000040020000A0000000E00000-000000067F000040020000A0000000E04000__00000038E9AF7F00 000000067F000040020000A0000000E00000-000000067F000040020000A0000000E04000__0000003903F1CFE8 000000067F000040020000A0000000E00000-000000067F000040020000A0000000E04000__0000003B99F7F8A0 000000067F000040020000A0000000E00000-000000067F000040020000A0000000E04000__0000005D2FFFFB38 000000067F000040020000A0000000E04000-000000067F000040020000A0000000E08000__0000003734F16F18 000000067F000040020000A0000000E04000-000000067F000040020000A0000000E08000__00000038E9AF7F00 000000067F000040020000A0000000E04000-000000067F000040020000A0000000E08000__0000003903F1CFE8 000000067F000040020000A0000000E04000-000000067F000040020000A0000000E08000__0000003B99F7F8A0 
000000067F000040020000A0000000E04000-000000067F000040020000A0000000E08000__0000005D2FFFFB38 000000067F000040020000A0000000E05BAE-000000067F000040020000A0000200000000__00000035C5A3EE11-000000366553DF11 000000067F000040020000A0000000E05D58-000000067F000040020000A0000000E0E727__000000366553DF11-000000370503E969 000000067F000040020000A0000000E08000-000000067F000040020000A0000000E0C000__0000003734F16F18 000000067F000040020000A0000000E08000-000000067F000040020000A0000000E0C000__00000038E9AF7F00 000000067F000040020000A0000000E08000-000000067F000040020000A0000000E0C000__0000003903F1CFE8 000000067F000040020000A0000000E08000-000000067F000040020000A0000000E0C000__0000003B99F7F8A0 000000067F000040020000A0000000E08000-000000067F000040020000A0000000E0C000__0000005D2FFFFB38 000000067F000040020000A0000000E0C000-000000067F000040020000A0000000E10000__0000003734F16F18 000000067F000040020000A0000000E0C000-000000067F000040020000A0000000E10000__00000038E9AF7F00 000000067F000040020000A0000000E0C000-000000067F000040020000A0000000E10000__0000003903F1CFE8 000000067F000040020000A0000000E0C000-000000067F000040020000A0000000E10000__0000003B99F7F8A0 000000067F000040020000A0000000E0C000-000000067F000040020000A0000000E10000__0000005D2FFFFB38 000000067F000040020000A0000000E0E727-000000067F000040020000A0000000E17100__000000366553DF11-000000370503E969 000000067F000040020000A0000000E10000-000000067F000040020000A0000000E14000__0000003734F16F18 000000067F000040020000A0000000E10000-000000067F000040020000A0000000E14000__00000038E9AF7F00 000000067F000040020000A0000000E10000-000000067F000040020000A0000000E14000__0000003903F1CFE8 000000067F000040020000A0000000E10000-000000067F000040020000A0000000E14000__0000003B99F7F8A0 000000067F000040020000A0000000E10000-000000067F000040020000A0000000E14000__0000005D2FFFFB38 000000067F000040020000A0000000E14000-000000067F000040020000A0000000E18000__0000003734F16F18 000000067F000040020000A0000000E14000-000000067F000040020000A0000000E18000__00000038E9AF7F00 
000000067F000040020000A0000000E14000-000000067F000040020000A0000000E18000__0000003903F1CFE8 000000067F000040020000A0000000E14000-000000067F000040020000A0000000E18000__0000003B99F7F8A0 000000067F000040020000A0000000E14000-000000067F000040020000A0000000E18000__0000005D2FFFFB38 000000067F000040020000A0000000E17100-000000067F000040020000A0000000E1FAD2__000000366553DF11-000000370503E969 000000067F000040020000A0000000E18000-000000067F000040020000A0000000E1C000__0000003734F16F18 000000067F000040020000A0000000E18000-000000067F000040020000A0000000E1C000__00000038E9AF7F00 000000067F000040020000A0000000E18000-000000067F000040020000A0000000E1C000__0000003903F1CFE8 000000067F000040020000A0000000E18000-000000067F000040020000A0000000E1C000__0000003B99F7F8A0 000000067F000040020000A0000000E18000-000000067F000040020000A0000000E1C000__0000005D2FFFFB38 000000067F000040020000A0000000E1C000-000000067F000040020000A0000000E20000__0000003734F16F18 000000067F000040020000A0000000E1C000-000000067F000040020000A0000000E20000__00000038E9AF7F00 000000067F000040020000A0000000E1C000-000000067F000040020000A0000000E20000__0000003903F1CFE8 000000067F000040020000A0000000E1C000-000000067F000040020000A0000000E20000__0000003B99F7F8A0 000000067F000040020000A0000000E1C000-000000067F000040020000A0000000E20000__0000005D2FFFFB38 000000067F000040020000A0000000E1FAD2-000000067F000040020000A0000000E284A9__000000366553DF11-000000370503E969 000000067F000040020000A0000000E20000-000000067F000040020000A0000000E24000__0000003734F16F18 000000067F000040020000A0000000E20000-000000067F000040020000A0000000E24000__00000038E9AF7F00 000000067F000040020000A0000000E20000-000000067F000040020000A0000000E24000__0000003903F1CFE8 000000067F000040020000A0000000E20000-000000067F000040020000A0000000E24000__0000003B99F7F8A0 000000067F000040020000A0000000E20000-000000067F000040020000A0000000E24000__0000005D2FFFFB38 000000067F000040020000A0000000E24000-000000067F000040020000A0000000E28000__0000003734F16F18 
000000067F000040020000A0000000E24000-000000067F000040020000A0000000E28000__00000038E9AF7F00 000000067F000040020000A0000000E24000-000000067F000040020000A0000000E28000__0000003903F1CFE8 000000067F000040020000A0000000E24000-000000067F000040020000A0000000E28000__0000003B99F7F8A0 000000067F000040020000A0000000E24000-000000067F000040020000A0000000E28000__0000005D2FFFFB38 000000067F000040020000A0000000E28000-000000067F000040020000A0000000E2C000__0000003734F16F18 000000067F000040020000A0000000E28000-000000067F000040020000A0000000E2C000__00000038E9AF7F00 000000067F000040020000A0000000E28000-000000067F000040020000A0000000E2C000__0000003903F1CFE8 000000067F000040020000A0000000E28000-000000067F000040020000A0000000E2C000__0000003B99F7F8A0 000000067F000040020000A0000000E28000-000000067F000040020000A0000000E2C000__0000005D2FFFFB38 000000067F000040020000A0000000E284A9-000000067F000040020000A0000000E30E94__000000366553DF11-000000370503E969 000000067F000040020000A0000000E2C000-000000067F000040020000A0000000E30000__0000003734F16F18 000000067F000040020000A0000000E2C000-000000067F000040020000A0000000E30000__00000038E9AF7F00 000000067F000040020000A0000000E2C000-000000067F000040020000A0000000E30000__0000003903F1CFE8 000000067F000040020000A0000000E2C000-000000067F000040020000A0000000E30000__0000003B99F7F8A0 000000067F000040020000A0000000E2C000-000000067F000040020000A0000000E30000__0000005D2FFFFB38 000000067F000040020000A0000000E30000-000000067F000040020000A0000000E34000__0000003734F16F18 000000067F000040020000A0000000E30000-000000067F000040020000A0000000E34000__00000038E9AF7F00 000000067F000040020000A0000000E30000-000000067F000040020000A0000000E34000__0000003903F1CFE8 000000067F000040020000A0000000E30000-000000067F000040020000A0000000E34000__0000003B99F7F8A0 000000067F000040020000A0000000E30000-000000067F000040020000A0000000E34000__0000005D2FFFFB38 000000067F000040020000A0000000E30E94-000000067F000040020000A0000000E39878__000000366553DF11-000000370503E969 
000000067F000040020000A0000000E34000-000000067F000040020000A0000000E38000__0000003734F16F18 000000067F000040020000A0000000E34000-000000067F000040020000A0000000E38000__00000038E9AF7F00 000000067F000040020000A0000000E34000-000000067F000040020000A0000000E38000__0000003903F1CFE8 000000067F000040020000A0000000E34000-000000067F000040020000A0000000E38000__0000003B99F7F8A0 000000067F000040020000A0000000E34000-000000067F000040020000A0000000E38000__0000005D2FFFFB38 000000067F000040020000A0000000E38000-000000067F000040020000A0000000E3C000__0000003734F16F18 000000067F000040020000A0000000E38000-000000067F000040020000A0000000E3C000__00000038E9AF7F00 000000067F000040020000A0000000E38000-000000067F000040020000A0000000E3C000__0000003903F1CFE8 000000067F000040020000A0000000E38000-000000067F000040020000A0000000E3C000__0000003B99F7F8A0 000000067F000040020000A0000000E38000-000000067F000040020000A0000000E3C000__0000005D2FFFFB38 000000067F000040020000A0000000E39878-000000067F000040020000A0000000E42256__000000366553DF11-000000370503E969 000000067F000040020000A0000000E3C000-000000067F000040020000A0000000E40000__0000003734F16F18 000000067F000040020000A0000000E3C000-000000067F000040020000A0000000E40000__00000038E9AF7F00 000000067F000040020000A0000000E3C000-000000067F000040020000A0000000E40000__0000003903F1CFE8 000000067F000040020000A0000000E3C000-000000067F000040020000A0000000E40000__0000003B99F7F8A0 000000067F000040020000A0000000E3C000-000000067F000040020000A0000000E40000__0000005D2FFFFB38 000000067F000040020000A0000000E40000-000000067F000040020000A0000000E44000__0000003734F16F18 000000067F000040020000A0000000E40000-000000067F000040020000A0000000E44000__00000038E9AF7F00 000000067F000040020000A0000000E40000-000000067F000040020000A0000000E44000__0000003903F1CFE8 000000067F000040020000A0000000E40000-000000067F000040020000A0000000E44000__0000003B99F7F8A0 000000067F000040020000A0000000E40000-000000067F000040020000A0000000E44000__0000005D2FFFFB38 
000000067F000040020000A0000000E42256-000000067F000040020000A0000000E4AC29__000000366553DF11-000000370503E969 000000067F000040020000A0000000E44000-000000067F000040020000A0000000E48000__0000003734F16F18 000000067F000040020000A0000000E44000-000000067F000040020000A0000000E48000__00000038E9AF7F00 000000067F000040020000A0000000E44000-000000067F000040020000A0000000E48000__0000003903F1CFE8 000000067F000040020000A0000000E44000-000000067F000040020000A0000000E48000__0000003B99F7F8A0 000000067F000040020000A0000000E44000-000000067F000040020000A0000000E48000__0000005D2FFFFB38 000000067F000040020000A0000000E48000-000000067F000040020000A0000000E4C000__0000003734F16F18 000000067F000040020000A0000000E48000-000000067F000040020000A0000000E4C000__00000038E9AF7F00 000000067F000040020000A0000000E48000-000000067F000040020000A0000000E4C000__0000003903F1CFE8 000000067F000040020000A0000000E48000-000000067F000040020000A0000000E4C000__0000003B99F7F8A0 000000067F000040020000A0000000E48000-000000067F000040020000A0000000E4C000__0000005D2FFFFB38 000000067F000040020000A0000000E4AC29-000000067F000040020000A0000000E53600__000000366553DF11-000000370503E969 000000067F000040020000A0000000E4C000-000000067F000040020000A0000000E50000__0000003734F16F18 000000067F000040020000A0000000E4C000-000000067F000040020000A0000000E50000__00000038E9AF7F00 000000067F000040020000A0000000E4C000-000000067F000040020000A0000000E50000__0000003903F1CFE8 000000067F000040020000A0000000E4C000-000000067F000040020000A0000000E50000__0000003B99F7F8A0 000000067F000040020000A0000000E4C000-000000067F000040020000A0000000E50000__0000005D2FFFFB38 000000067F000040020000A0000000E50000-000000067F000040020000A0000000E54000__0000003734F16F18 000000067F000040020000A0000000E50000-000000067F000040020000A0000000E54000__00000038E9AF7F00 000000067F000040020000A0000000E50000-000000067F000040020000A0000000E54000__0000003903F1CFE8 000000067F000040020000A0000000E50000-000000067F000040020000A0000000E54000__0000003B99F7F8A0 
000000067F000040020000A0000000E50000-000000067F000040020000A0000000E54000__0000005D2FFFFB38 000000067F000040020000A0000000E53600-000000067F000040020000A0000000E5BFD2__000000366553DF11-000000370503E969 000000067F000040020000A0000000E54000-000000067F000040020000A0000000E58000__0000003734F16F18 000000067F000040020000A0000000E54000-000000067F000040020000A0000000E58000__00000038E9AF7F00 000000067F000040020000A0000000E54000-000000067F000040020000A0000000E58000__0000003903F1CFE8 000000067F000040020000A0000000E54000-000000067F000040020000A0000000E58000__0000003B99F7F8A0 000000067F000040020000A0000000E54000-000000067F000040020000A0000000E58000__0000005D2FFFFB38 000000067F000040020000A0000000E58000-000000067F000040020000A0000000E5C000__0000003734F16F18 000000067F000040020000A0000000E58000-000000067F000040020000A0000000E5C000__00000038E9AF7F00 000000067F000040020000A0000000E58000-000000067F000040020000A0000000E5C000__0000003903F1CFE8 000000067F000040020000A0000000E58000-000000067F000040020000A0000000E5C000__0000003B99F7F8A0 000000067F000040020000A0000000E58000-000000067F000040020000A0000000E5C000__0000005D2FFFFB38 000000067F000040020000A0000000E5BFD2-000000067F000040020000A0000200000000__000000366553DF11-000000370503E969 000000067F000040020000A0000000E5C000-000000067F000040020000A0000000E60000__0000003734F16F18 000000067F000040020000A0000000E5C000-000000067F000040020000A0000000E60000__00000038E67ABFA0 000000067F000040020000A0000000E5C000-000000067F000040020000A0000000E60000__0000003903F1CFE8 000000067F000040020000A0000000E5C000-000000067F000040020000A0000000E60000__0000003B99F7F8A0 000000067F000040020000A0000000E5C000-000000067F000040020000A0000000E60000__0000005D2FFFFB38 000000067F000040020000A0000000E5C268-000000067F000040020000A0000000E64C37__000000370503E969-00000037A4B3E7B1 000000067F000040020000A0000000E60000-000000067F000040020000A0000000E64000__0000003734F16F18 000000067F000040020000A0000000E60000-000000067F000040020000A0000000E64000__00000038E67ABFA0 
000000067F000040020000A0000000E60000-000000067F000040020000A0000000E64000__0000003903F1CFE8 000000067F000040020000A0000000E60000-000000067F000040020000A0000000E64000__0000003B99F7F8A0 000000067F000040020000A0000000E60000-000000067F000040020000A0000000E64000__0000005D2FFFFB38 000000067F000040020000A0000000E64000-000000067F000040020000A0000000E68000__0000003734F16F18 000000067F000040020000A0000000E64000-000000067F000040020000A0000000E68000__00000038E67ABFA0 000000067F000040020000A0000000E64000-000000067F000040020000A0000000E68000__0000003903F1CFE8 000000067F000040020000A0000000E64000-000000067F000040020000A0000000E68000__0000003B99F7F8A0 000000067F000040020000A0000000E64000-000000067F000040020000A0000000E68000__0000005D2FFFFB38 000000067F000040020000A0000000E64C37-000000067F000040020000A0000000E6D618__000000370503E969-00000037A4B3E7B1 000000067F000040020000A0000000E68000-000000067F000040020000A0000000E6C000__0000003734F16F18 000000067F000040020000A0000000E68000-000000067F000040020000A0000000E6C000__00000038E67ABFA0 000000067F000040020000A0000000E68000-000000067F000040020000A0000000E6C000__0000003903F1CFE8 000000067F000040020000A0000000E68000-000000067F000040020000A0000000E6C000__0000003B99F7F8A0 000000067F000040020000A0000000E68000-000000067F000040020000A0000000E6C000__0000005D2FFFFB38 000000067F000040020000A0000000E6C000-000000067F000040020000A0000000E70000__0000003734F16F18 000000067F000040020000A0000000E6C000-000000067F000040020000A0000000E70000__00000038E67ABFA0 000000067F000040020000A0000000E6C000-000000067F000040020000A0000000E70000__0000003903F1CFE8 000000067F000040020000A0000000E6C000-000000067F000040020000A0000000E70000__0000003B99F7F8A0 000000067F000040020000A0000000E6C000-000000067F000040020000A0000000E70000__0000005D2FFFFB38 000000067F000040020000A0000000E6D618-000000067F000040020000A0000000E75FEF__000000370503E969-00000037A4B3E7B1 000000067F000040020000A0000000E70000-000000067F000040020000A0000000E74000__0000003734F16F18 
000000067F000040020000A0000000E70000-000000067F000040020000A0000000E74000__00000038E67ABFA0 000000067F000040020000A0000000E70000-000000067F000040020000A0000000E74000__0000003903F1CFE8 000000067F000040020000A0000000E70000-000000067F000040020000A0000000E74000__0000003B99F7F8A0 000000067F000040020000A0000000E70000-000000067F000040020000A0000000E74000__0000005D2FFFFB38 000000067F000040020000A0000000E74000-000000067F000040020000A0000000E78000__00000038E67ABFA0 000000067F000040020000A0000000E74000-000000067F000040020000A0000000E78000__0000003903F1CFE8 000000067F000040020000A0000000E74000-000000067F000040020000A0000000E78000__0000003B99F7F8A0 000000067F000040020000A0000000E74000-000000067F000040020000A0000000E78000__0000005D2FFFFB38 000000067F000040020000A0000000E74000-030000000000000000000000000000000002__0000003734F16F18 000000067F000040020000A0000000E75FEF-000000067F000040020000A0000000E7E9D1__000000370503E969-00000037A4B3E7B1 000000067F000040020000A0000000E78000-000000067F000040020000A0000000E7C000__00000038E67ABFA0 000000067F000040020000A0000000E78000-000000067F000040020000A0000000E7C000__0000003903F1CFE8 000000067F000040020000A0000000E78000-000000067F000040020000A0000000E7C000__0000003B99F7F8A0 000000067F000040020000A0000000E78000-000000067F000040020000A0000000E7C000__0000005D2FFFFB38 000000067F000040020000A0000000E7C000-000000067F000040020000A0000000E80000__00000038E67ABFA0 000000067F000040020000A0000000E7C000-000000067F000040020000A0000000E80000__0000003903F1CFE8 000000067F000040020000A0000000E7C000-000000067F000040020000A0000000E80000__0000003B99F7F8A0 000000067F000040020000A0000000E7C000-000000067F000040020000A0000000E80000__0000005D2FFFFB38 000000067F000040020000A0000000E7E9D1-000000067F000040020000A0000000E873AA__000000370503E969-00000037A4B3E7B1 000000067F000040020000A0000000E80000-000000067F000040020000A0000000E84000__00000038E67ABFA0 000000067F000040020000A0000000E80000-000000067F000040020000A0000000E84000__0000003903F1CFE8 
000000067F000040020000A0000000E80000-000000067F000040020000A0000000E84000__0000003B99F7F8A0 000000067F000040020000A0000000E80000-000000067F000040020000A0000000E84000__0000005D2FFFFB38 000000067F000040020000A0000000E84000-000000067F000040020000A0000000E88000__00000038E67ABFA0 000000067F000040020000A0000000E84000-000000067F000040020000A0000000E88000__0000003903F1CFE8 000000067F000040020000A0000000E84000-000000067F000040020000A0000000E88000__0000003B99F7F8A0 000000067F000040020000A0000000E84000-000000067F000040020000A0000000E88000__0000005D2FFFFB38 000000067F000040020000A0000000E873AA-000000067F000040020000A0000000E8FD88__000000370503E969-00000037A4B3E7B1 000000067F000040020000A0000000E88000-000000067F000040020000A0000000E8C000__00000038E67ABFA0 000000067F000040020000A0000000E88000-000000067F000040020000A0000000E8C000__0000003903F1CFE8 000000067F000040020000A0000000E88000-000000067F000040020000A0000000E8C000__0000003B99F7F8A0 000000067F000040020000A0000000E88000-000000067F000040020000A0000000E8C000__0000005D2FFFFB38 000000067F000040020000A0000000E8C000-000000067F000040020000A0000000E90000__00000038E67ABFA0 000000067F000040020000A0000000E8C000-000000067F000040020000A0000000E90000__0000003903F1CFE8 000000067F000040020000A0000000E8C000-000000067F000040020000A0000000E90000__0000003B99F7F8A0 000000067F000040020000A0000000E8C000-000000067F000040020000A0000000E90000__0000005D2FFFFB38 000000067F000040020000A0000000E8FD88-000000067F000040020000A0000000E98764__000000370503E969-00000037A4B3E7B1 000000067F000040020000A0000000E90000-000000067F000040020000A0000000E94000__00000038E67ABFA0 000000067F000040020000A0000000E90000-000000067F000040020000A0000000E94000__0000003903F1CFE8 000000067F000040020000A0000000E90000-000000067F000040020000A0000000E94000__0000003B99F7F8A0 000000067F000040020000A0000000E90000-000000067F000040020000A0000000E94000__0000005D2FFFFB38 000000067F000040020000A0000000E94000-000000067F000040020000A0000000E98000__00000038E67ABFA0 
000000067F000040020000A0000000E94000-000000067F000040020000A0000000E98000__0000003903F1CFE8 000000067F000040020000A0000000E94000-000000067F000040020000A0000000E98000__0000003B99F7F8A0 000000067F000040020000A0000000E94000-000000067F000040020000A0000000E98000__0000005D2FFFFB38 000000067F000040020000A0000000E98000-000000067F000040020000A0000000E9C000__00000038E67ABFA0 000000067F000040020000A0000000E98000-000000067F000040020000A0000000E9C000__0000003903F1CFE8 000000067F000040020000A0000000E98000-000000067F000040020000A0000000E9C000__0000003B99F7F8A0 000000067F000040020000A0000000E98000-000000067F000040020000A0000000E9C000__0000005D2FFFFB38 000000067F000040020000A0000000E98764-000000067F000040020000A0000000EA1139__000000370503E969-00000037A4B3E7B1 000000067F000040020000A0000000E9C000-000000067F000040020000A0000000EA0000__00000038E67ABFA0 000000067F000040020000A0000000E9C000-000000067F000040020000A0000000EA0000__0000003903F1CFE8 000000067F000040020000A0000000E9C000-000000067F000040020000A0000000EA0000__0000003B99F7F8A0 000000067F000040020000A0000000E9C000-000000067F000040020000A0000000EA0000__0000005D2FFFFB38 000000067F000040020000A0000000EA0000-000000067F000040020000A0000000EA4000__00000038E67ABFA0 000000067F000040020000A0000000EA0000-000000067F000040020000A0000000EA4000__0000003903F1CFE8 000000067F000040020000A0000000EA0000-000000067F000040020000A0000000EA4000__0000003B99F7F8A0 000000067F000040020000A0000000EA0000-000000067F000040020000A0000000EA4000__0000005D2FFFFB38 000000067F000040020000A0000000EA1139-000000067F000040020000A0000000EA9B11__000000370503E969-00000037A4B3E7B1 000000067F000040020000A0000000EA4000-000000067F000040020000A0000000EA8000__00000038E67ABFA0 000000067F000040020000A0000000EA4000-000000067F000040020000A0000000EA8000__0000003903F1CFE8 000000067F000040020000A0000000EA4000-000000067F000040020000A0000000EA8000__0000003B99F7F8A0 000000067F000040020000A0000000EA4000-000000067F000040020000A0000000EA8000__0000005D2FFFFB38 
000000067F000040020000A0000000EA8000-000000067F000040020000A0000000EAC000__00000038E67ABFA0 000000067F000040020000A0000000EA8000-000000067F000040020000A0000000EAC000__0000003903F1CFE8 000000067F000040020000A0000000EA8000-000000067F000040020000A0000000EAC000__0000003B99F7F8A0 000000067F000040020000A0000000EA8000-000000067F000040020000A0000000EAC000__0000005D2FFFFB38 000000067F000040020000A0000000EA9B11-000000067F000040020000A0000000EB24E9__000000370503E969-00000037A4B3E7B1 000000067F000040020000A0000000EAC000-000000067F000040020000A0000000EB0000__00000038E67ABFA0 000000067F000040020000A0000000EAC000-000000067F000040020000A0000000EB0000__0000003903F1CFE8 000000067F000040020000A0000000EAC000-000000067F000040020000A0000000EB0000__0000003B99F7F8A0 000000067F000040020000A0000000EAC000-000000067F000040020000A0000000EB0000__0000005D2FFFFB38 000000067F000040020000A0000000EB0000-000000067F000040020000A0000000EB4000__00000038E1ABFE28 000000067F000040020000A0000000EB0000-000000067F000040020000A0000000EB4000__00000038E9AF7F00 000000067F000040020000A0000000EB0000-000000067F000040020000A0000000EB4000__0000003903F1CFE8 000000067F000040020000A0000000EB0000-000000067F000040020000A0000000EB4000__0000003B99F7F8A0 000000067F000040020000A0000000EB0000-000000067F000040020000A0000000EB4000__0000005D2FFFFB38 000000067F000040020000A0000000EB24E9-000000067F000040020000A0000200000000__000000370503E969-00000037A4B3E7B1 000000067F000040020000A0000000EB26A9-000000067F000040020000A0000000EBB084__00000037A4B3E7B1-000000384463E2C1 000000067F000040020000A0000000EB4000-000000067F000040020000A0000000EB8000__00000038E1ABFE28 000000067F000040020000A0000000EB4000-000000067F000040020000A0000000EB8000__00000038E9AF7F00 000000067F000040020000A0000000EB4000-000000067F000040020000A0000000EB8000__0000003903F1CFE8 000000067F000040020000A0000000EB4000-000000067F000040020000A0000000EB8000__0000003B99F7F8A0 000000067F000040020000A0000000EB4000-000000067F000040020000A0000000EB8000__0000005D2FFFFB38 
000000067F000040020000A0000000EB8000-000000067F000040020000A0000000EBC000__00000038E1ABFE28 000000067F000040020000A0000000EB8000-000000067F000040020000A0000000EBC000__00000038E9AF7F00 000000067F000040020000A0000000EB8000-000000067F000040020000A0000000EBC000__0000003903F1CFE8 000000067F000040020000A0000000EB8000-000000067F000040020000A0000000EBC000__0000003B99F7F8A0 000000067F000040020000A0000000EB8000-000000067F000040020000A0000000EBC000__0000005D2FFFFB38 000000067F000040020000A0000000EBB084-000000067F000040020000A0000000EC3A59__00000037A4B3E7B1-000000384463E2C1 000000067F000040020000A0000000EBC000-000000067F000040020000A0000000EC0000__00000038E1ABFE28 000000067F000040020000A0000000EBC000-000000067F000040020000A0000000EC0000__00000038E9AF7F00 000000067F000040020000A0000000EBC000-000000067F000040020000A0000000EC0000__0000003903F1CFE8 000000067F000040020000A0000000EBC000-000000067F000040020000A0000000EC0000__0000003B99F7F8A0 000000067F000040020000A0000000EBC000-000000067F000040020000A0000000EC0000__0000005D2FFFFB38 000000067F000040020000A0000000EC0000-000000067F000040020000A0000000EC4000__00000038E1ABFE28 000000067F000040020000A0000000EC0000-000000067F000040020000A0000000EC4000__00000038E9AF7F00 000000067F000040020000A0000000EC0000-000000067F000040020000A0000000EC4000__0000003903F1CFE8 000000067F000040020000A0000000EC0000-000000067F000040020000A0000000EC4000__0000003B99F7F8A0 000000067F000040020000A0000000EC0000-000000067F000040020000A0000000EC4000__0000005D2FFFFB38 000000067F000040020000A0000000EC3A59-000000067F000040020000A0000000ECC43D__00000037A4B3E7B1-000000384463E2C1 000000067F000040020000A0000000EC4000-000000067F000040020000A0000000EC8000__00000038E1ABFE28 000000067F000040020000A0000000EC4000-000000067F000040020000A0000000EC8000__00000038E9AF7F00 000000067F000040020000A0000000EC4000-000000067F000040020000A0000000EC8000__0000003903F1CFE8 000000067F000040020000A0000000EC4000-000000067F000040020000A0000000EC8000__0000003B99F7F8A0 
000000067F000040020000A0000000EC4000-000000067F000040020000A0000000EC8000__0000005D2FFFFB38 000000067F000040020000A0000000EC8000-000000067F000040020000A0000000ECC000__00000038E1ABFE28 000000067F000040020000A0000000EC8000-000000067F000040020000A0000000ECC000__00000038E9AF7F00 000000067F000040020000A0000000EC8000-000000067F000040020000A0000000ECC000__0000003903F1CFE8 000000067F000040020000A0000000EC8000-000000067F000040020000A0000000ECC000__0000003B99F7F8A0 000000067F000040020000A0000000EC8000-000000067F000040020000A0000000ECC000__0000005D2FFFFB38 000000067F000040020000A0000000ECC000-000000067F000040020000A0000000ED0000__00000038E1ABFE28 000000067F000040020000A0000000ECC000-000000067F000040020000A0000000ED0000__00000038E9AF7F00 000000067F000040020000A0000000ECC000-000000067F000040020000A0000000ED0000__0000003903F1CFE8 000000067F000040020000A0000000ECC000-000000067F000040020000A0000000ED0000__0000003B99F7F8A0 000000067F000040020000A0000000ECC000-000000067F000040020000A0000000ED0000__0000005D2FFFFB38 000000067F000040020000A0000000ECC43D-000000067F000040020000A0000000ED4E14__00000037A4B3E7B1-000000384463E2C1 000000067F000040020000A0000000ED0000-000000067F000040020000A0000000ED4000__00000038E1ABFE28 000000067F000040020000A0000000ED0000-000000067F000040020000A0000000ED4000__00000038E9AF7F00 000000067F000040020000A0000000ED0000-000000067F000040020000A0000000ED4000__0000003903F1CFE8 000000067F000040020000A0000000ED0000-000000067F000040020000A0000000ED4000__0000003B99F7F8A0 000000067F000040020000A0000000ED0000-000000067F000040020000A0000000ED4000__0000005D2FFFFB38 000000067F000040020000A0000000ED4000-000000067F000040020000A0000000ED8000__00000038E1ABFE28 000000067F000040020000A0000000ED4000-000000067F000040020000A0000000ED8000__00000038E9AF7F00 000000067F000040020000A0000000ED4000-000000067F000040020000A0000000ED8000__0000003903F1CFE8 000000067F000040020000A0000000ED4000-000000067F000040020000A0000000ED8000__0000003B99F7F8A0 
000000067F000040020000A0000000ED4000-000000067F000040020000A0000000ED8000__0000005D2FFFFB38 000000067F000040020000A0000000ED4E14-000000067F000040020000A0000000EDD7F0__00000037A4B3E7B1-000000384463E2C1 000000067F000040020000A0000000ED8000-000000067F000040020000A0000000EDC000__00000038E1ABFE28 000000067F000040020000A0000000ED8000-000000067F000040020000A0000000EDC000__00000038E9AF7F00 000000067F000040020000A0000000ED8000-000000067F000040020000A0000000EDC000__0000003903F1CFE8 000000067F000040020000A0000000ED8000-000000067F000040020000A0000000EDC000__0000003B99F7F8A0 000000067F000040020000A0000000ED8000-000000067F000040020000A0000000EDC000__0000005D2FFFFB38 000000067F000040020000A0000000EDC000-000000067F000040020000A0000000EE0000__00000038E1ABFE28 000000067F000040020000A0000000EDC000-000000067F000040020000A0000000EE0000__00000038E9AF7F00 000000067F000040020000A0000000EDC000-000000067F000040020000A0000000EE0000__0000003903F1CFE8 000000067F000040020000A0000000EDC000-000000067F000040020000A0000000EE0000__0000003B99F7F8A0 000000067F000040020000A0000000EDC000-000000067F000040020000A0000000EE0000__0000005D2FFFFB38 000000067F000040020000A0000000EDD7F0-000000067F000040020000A0000000EE61D2__00000037A4B3E7B1-000000384463E2C1 000000067F000040020000A0000000EE0000-000000067F000040020000A0000000EE4000__00000038E1ABFE28 000000067F000040020000A0000000EE0000-000000067F000040020000A0000000EE4000__00000038E9AF7F00 000000067F000040020000A0000000EE0000-000000067F000040020000A0000000EE4000__0000003903F1CFE8 000000067F000040020000A0000000EE0000-000000067F000040020000A0000000EE4000__0000003B99F7F8A0 000000067F000040020000A0000000EE0000-000000067F000040020000A0000000EE4000__0000005D2FFFFB38 000000067F000040020000A0000000EE4000-000000067F000040020000A0000000EE8000__00000038E1ABFE28 000000067F000040020000A0000000EE4000-000000067F000040020000A0000000EE8000__00000038E9AF7F00 000000067F000040020000A0000000EE4000-000000067F000040020000A0000000EE8000__0000003903F1CFE8 
000000067F000040020000A0000000EE4000-000000067F000040020000A0000000EE8000__0000003B99F7F8A0 000000067F000040020000A0000000EE4000-000000067F000040020000A0000000EE8000__0000005D2FFFFB38 000000067F000040020000A0000000EE61D2-000000067F000040020000A0000000EEEBB3__00000037A4B3E7B1-000000384463E2C1 000000067F000040020000A0000000EE8000-000000067F000040020000A0000000EEC000__00000038E1ABFE28 000000067F000040020000A0000000EE8000-000000067F000040020000A0000000EEC000__00000038E9AF7F00 000000067F000040020000A0000000EE8000-000000067F000040020000A0000000EEC000__0000003903F1CFE8 000000067F000040020000A0000000EE8000-000000067F000040020000A0000000EEC000__0000003B99F7F8A0 000000067F000040020000A0000000EE8000-000000067F000040020000A0000000EEC000__0000005D2FFFFB38 000000067F000040020000A0000000EEC000-000000067F000040020000A0000000EF0000__00000038E1ABFE28 000000067F000040020000A0000000EEC000-000000067F000040020000A0000000EF0000__00000038E9AF7F00 000000067F000040020000A0000000EEC000-000000067F000040020000A0000000EF0000__0000003903F1CFE8 000000067F000040020000A0000000EEC000-000000067F000040020000A0000000EF0000__0000003B99F7F8A0 000000067F000040020000A0000000EEC000-000000067F000040020000A0000000EF0000__0000005D2FFFFB38 000000067F000040020000A0000000EEEBB3-000000067F000040020000A0000000EF759F__00000037A4B3E7B1-000000384463E2C1 000000067F000040020000A0000000EF0000-000000067F000040020000A0000000EF4000__00000038E1ABFE28 000000067F000040020000A0000000EF0000-000000067F000040020000A0000000EF4000__00000038E9AF7F00 000000067F000040020000A0000000EF0000-000000067F000040020000A0000000EF4000__0000003903F1CFE8 000000067F000040020000A0000000EF0000-000000067F000040020000A0000000EF4000__0000003B99F7F8A0 000000067F000040020000A0000000EF0000-000000067F000040020000A0000000EF4000__0000005D2FFFFB38 000000067F000040020000A0000000EF4000-000000067F000040020000A0000000EF8000__00000038E1ABFE28 000000067F000040020000A0000000EF4000-000000067F000040020000A0000000EF8000__00000038E9AF7F00 
000000067F000040020000A0000000EF4000-000000067F000040020000A0000000EF8000__0000003903F1CFE8 000000067F000040020000A0000000EF4000-000000067F000040020000A0000000EF8000__0000003B99F7F8A0 000000067F000040020000A0000000EF4000-000000067F000040020000A0000000EF8000__0000005D2FFFFB38 000000067F000040020000A0000000EF759F-000000067F000040020000A0000000EFFF76__00000037A4B3E7B1-000000384463E2C1 000000067F000040020000A0000000EF8000-000000067F000040020000A0000000EFC000__00000038E1ABFE28 000000067F000040020000A0000000EF8000-000000067F000040020000A0000000EFC000__00000038E9AF7F00 000000067F000040020000A0000000EF8000-000000067F000040020000A0000000EFC000__0000003903F1CFE8 000000067F000040020000A0000000EF8000-000000067F000040020000A0000000EFC000__0000003B99F7F8A0 000000067F000040020000A0000000EF8000-000000067F000040020000A0000000EFC000__0000005D2FFFFB38 000000067F000040020000A0000000EFC000-000000067F000040020000A0000000F00000__00000038E1ABFE28 000000067F000040020000A0000000EFC000-000000067F000040020000A0000000F00000__00000038E9AF7F00 000000067F000040020000A0000000EFC000-000000067F000040020000A0000000F00000__0000003903F1CFE8 000000067F000040020000A0000000EFC000-000000067F000040020000A0000000F00000__0000003B99F7F8A0 000000067F000040020000A0000000EFC000-000000067F000040020000A0000000F00000__0000005D2FFFFB38 000000067F000040020000A0000000EFFF76-000000067F000040020000A0000000F08950__00000037A4B3E7B1-000000384463E2C1 000000067F000040020000A0000000F00000-000000067F000040020000A0000000F04000__00000038E1ABFE28 000000067F000040020000A0000000F00000-000000067F000040020000A0000000F04000__00000038E9AF7F00 000000067F000040020000A0000000F00000-000000067F000040020000A0000000F04000__0000003903F1CFE8 000000067F000040020000A0000000F00000-000000067F000040020000A0000000F04000__0000003B99F7F8A0 000000067F000040020000A0000000F00000-000000067F000040020000A0000000F04000__0000005D2FFFFB38 000000067F000040020000A0000000F04000-000000067F000040020000A0000000F08000__00000038E1ABFE28 
000000067F000040020000A0000000F04000-000000067F000040020000A0000000F08000__00000038E9AF7F00 000000067F000040020000A0000000F04000-000000067F000040020000A0000000F08000__0000003903F1CFE8 000000067F000040020000A0000000F04000-000000067F000040020000A0000000F08000__0000003B99F7F8A0 000000067F000040020000A0000000F04000-000000067F000040020000A0000000F08000__0000005D2FFFFB38 000000067F000040020000A0000000F08000-000000067F000040020000A0000000F0C000__00000038E1ABFE28 000000067F000040020000A0000000F08000-000000067F000040020000A0000000F0C000__00000038E9AF7F00 000000067F000040020000A0000000F08000-000000067F000040020000A0000000F0C000__0000003903F1CFE8 000000067F000040020000A0000000F08000-000000067F000040020000A0000000F0C000__0000003B99F7F8A0 000000067F000040020000A0000000F08000-000000067F000040020000A0000000F0C000__0000005D2FFFFB38 000000067F000040020000A0000000F08950-000000067F000040020000A0000200000000__00000037A4B3E7B1-000000384463E2C1 000000067F000040020000A0000000F0C000-000000067F000040020000A0000000F10000__00000038E1ABFE28 000000067F000040020000A0000000F0C000-000000067F000040020000A0000000F10000__00000038E9AF7F00 000000067F000040020000A0000000F0C000-000000067F000040020000A0000000F10000__0000003903F1CFE8 000000067F000040020000A0000000F0C000-000000067F000040020000A0000000F10000__0000003B99F7F8A0 000000067F000040020000A0000000F0C000-000000067F000040020000A0000000F10000__0000005D2FFFFB38 000000067F000040020000A0000000F10000-000000067F000040020000A0000000F14000__00000038E1ABFE28 000000067F000040020000A0000000F10000-000000067F000040020000A0000000F14000__00000038E9AF7F00 000000067F000040020000A0000000F10000-000000067F000040020000A0000000F14000__0000003903F1CFE8 000000067F000040020000A0000000F10000-000000067F000040020000A0000000F14000__0000003B99F7F8A0 000000067F000040020000A0000000F10000-000000067F000040020000A0000000F14000__0000005D2FFFFB38 000000067F000040020000A0000000F11587-000000067F000040020000A0000000F19F63__000000384463E2C1-00000038E1E2FE19 
000000067F000040020000A0000000F14000-000000067F000040020000A0000000F18000__00000038E1ABFE28 000000067F000040020000A0000000F14000-000000067F000040020000A0000000F18000__00000038E9AF7F00 000000067F000040020000A0000000F14000-000000067F000040020000A0000000F18000__0000003903F1CFE8 000000067F000040020000A0000000F14000-000000067F000040020000A0000000F18000__0000003B99F7F8A0 000000067F000040020000A0000000F14000-000000067F000040020000A0000000F18000__0000005D2FFFFB38 000000067F000040020000A0000000F18000-000000067F000040020000A0000000F1C000__00000038E1ABFE28 000000067F000040020000A0000000F18000-000000067F000040020000A0000000F1C000__00000038E9AF7F00 000000067F000040020000A0000000F18000-000000067F000040020000A0000000F1C000__0000003903F1CFE8 000000067F000040020000A0000000F18000-000000067F000040020000A0000000F1C000__0000003B99F7F8A0 000000067F000040020000A0000000F18000-000000067F000040020000A0000000F1C000__0000005D2FFFFB38 000000067F000040020000A0000000F19F63-000000067F000040020000A0000000F2293B__000000384463E2C1-00000038E1E2FE19 000000067F000040020000A0000000F1C000-000000067F000040020000A0000000F20000__00000038E1ABFE28 000000067F000040020000A0000000F1C000-000000067F000040020000A0000000F20000__00000038E9AF7F00 000000067F000040020000A0000000F1C000-000000067F000040020000A0000000F20000__0000003903F1CFE8 000000067F000040020000A0000000F1C000-000000067F000040020000A0000000F20000__0000003B99F7F8A0 000000067F000040020000A0000000F1C000-000000067F000040020000A0000000F20000__0000005D2FFFFB38 000000067F000040020000A0000000F20000-000000067F000040020000A0000000F24000__00000038E1ABFE28 000000067F000040020000A0000000F20000-000000067F000040020000A0000000F24000__00000038E9AF7F00 000000067F000040020000A0000000F20000-000000067F000040020000A0000000F24000__0000003903F1CFE8 000000067F000040020000A0000000F20000-000000067F000040020000A0000000F24000__0000003B99F7F8A0 000000067F000040020000A0000000F20000-000000067F000040020000A0000000F24000__0000005D2FFFFB38 
000000067F000040020000A0000000F2293B-000000067F000040020000A0000000F2B30B__000000384463E2C1-00000038E1E2FE19 000000067F000040020000A0000000F24000-000000067F000040020000A0000000F28000__00000038E1ABFE28 000000067F000040020000A0000000F24000-000000067F000040020000A0000000F28000__00000038E9AF7F00 000000067F000040020000A0000000F24000-000000067F000040020000A0000000F28000__0000003903F1CFE8 000000067F000040020000A0000000F24000-000000067F000040020000A0000000F28000__0000003B99F7F8A0 000000067F000040020000A0000000F24000-000000067F000040020000A0000000F28000__0000005D2FFFFB38 000000067F000040020000A0000000F28000-000000067F000040020000A0000000F2C000__00000038E1ABFE28 000000067F000040020000A0000000F28000-000000067F000040020000A0000000F2C000__00000038E9AF7F00 000000067F000040020000A0000000F28000-000000067F000040020000A0000000F2C000__0000003903F1CFE8 000000067F000040020000A0000000F28000-000000067F000040020000A0000000F2C000__0000003B99F7F8A0 000000067F000040020000A0000000F28000-000000067F000040020000A0000000F2C000__0000005D2FFFFB38 000000067F000040020000A0000000F2B30B-000000067F000040020000A0000000F33CE3__000000384463E2C1-00000038E1E2FE19 000000067F000040020000A0000000F2C000-000000067F000040020000A0000000F30000__00000038E1ABFE28 000000067F000040020000A0000000F2C000-000000067F000040020000A0000000F30000__00000038E9AF7F00 000000067F000040020000A0000000F2C000-000000067F000040020000A0000000F30000__0000003903F1CFE8 000000067F000040020000A0000000F2C000-000000067F000040020000A0000000F30000__0000003B99F7F8A0 000000067F000040020000A0000000F2C000-000000067F000040020000A0000000F30000__0000005D2FFFFB38 000000067F000040020000A0000000F30000-000000067F000040020000A0000000F34000__00000038E1ABFE28 000000067F000040020000A0000000F30000-000000067F000040020000A0000000F34000__00000038E9AF7F00 000000067F000040020000A0000000F30000-000000067F000040020000A0000000F34000__0000003903F1CFE8 000000067F000040020000A0000000F30000-000000067F000040020000A0000000F34000__0000003B99F7F8A0 
000000067F000040020000A0000000F30000-000000067F000040020000A0000000F34000__0000005D2FFFFB38 000000067F000040020000A0000000F33CE3-000000067F000040020000A0000000F3C6C9__000000384463E2C1-00000038E1E2FE19 000000067F000040020000A0000000F34000-000000067F000040020000A0000000F38000__00000038E1ABFE28 000000067F000040020000A0000000F34000-000000067F000040020000A0000000F38000__00000038E9AF7F00 000000067F000040020000A0000000F34000-000000067F000040020000A0000000F38000__0000003903F1CFE8 000000067F000040020000A0000000F34000-000000067F000040020000A0000000F38000__0000003B99F7F8A0 000000067F000040020000A0000000F34000-000000067F000040020000A0000000F38000__0000005D2FFFFB38 000000067F000040020000A0000000F38000-000000067F000040020000A0000000F3C000__00000038E1ABFE28 000000067F000040020000A0000000F38000-000000067F000040020000A0000000F3C000__00000038E9AF7F00 000000067F000040020000A0000000F38000-000000067F000040020000A0000000F3C000__0000003903F1CFE8 000000067F000040020000A0000000F38000-000000067F000040020000A0000000F3C000__0000003B99F7F8A0 000000067F000040020000A0000000F38000-000000067F000040020000A0000000F3C000__0000005D2FFFFB38 000000067F000040020000A0000000F3C000-000000067F000040020000A0000000F40000__00000038E1ABFE28 000000067F000040020000A0000000F3C000-000000067F000040020000A0000000F40000__00000038E9AF7F00 000000067F000040020000A0000000F3C000-000000067F000040020000A0000000F40000__0000003903F1CFE8 000000067F000040020000A0000000F3C000-000000067F000040020000A0000000F40000__0000003B99F7F8A0 000000067F000040020000A0000000F3C000-000000067F000040020000A0000000F40000__0000005D2FFFFB38 000000067F000040020000A0000000F3C6C9-000000067F000040020000A0000000F450AB__000000384463E2C1-00000038E1E2FE19 000000067F000040020000A0000000F40000-000000067F000040020000A0000000F44000__00000038E1ABFE28 000000067F000040020000A0000000F40000-000000067F000040020000A0000000F44000__00000038E9AF7F00 000000067F000040020000A0000000F40000-000000067F000040020000A0000000F44000__0000003903F1CFE8 
000000067F000040020000A0000000F40000-000000067F000040020000A0000000F44000__0000003B99F7F8A0 000000067F000040020000A0000000F40000-000000067F000040020000A0000000F44000__0000005D2FFFFB38 000000067F000040020000A0000000F44000-000000067F000040020000A0000000F48000__00000038E1ABFE28 000000067F000040020000A0000000F44000-000000067F000040020000A0000000F48000__00000038E9AF7F00 000000067F000040020000A0000000F44000-000000067F000040020000A0000000F48000__0000003903F1CFE8 000000067F000040020000A0000000F44000-000000067F000040020000A0000000F48000__0000003B99F7F8A0 000000067F000040020000A0000000F44000-000000067F000040020000A0000000F48000__0000005D2FFFFB38 000000067F000040020000A0000000F450AB-000000067F000040020000A0000000F4DA85__000000384463E2C1-00000038E1E2FE19 000000067F000040020000A0000000F48000-000000067F000040020000A0000000F4C000__00000038E1ABFE28 000000067F000040020000A0000000F48000-000000067F000040020000A0000000F4C000__00000038E9AF7F00 000000067F000040020000A0000000F48000-000000067F000040020000A0000000F4C000__0000003903F1CFE8 000000067F000040020000A0000000F48000-000000067F000040020000A0000000F4C000__0000003B99F7F8A0 000000067F000040020000A0000000F48000-000000067F000040020000A0000000F4C000__0000005D2FFFFB38 000000067F000040020000A0000000F4C000-000000067F000040020000A0000000F50000__00000038E1ABFE28 000000067F000040020000A0000000F4C000-000000067F000040020000A0000000F50000__00000038E9AF7F00 000000067F000040020000A0000000F4C000-000000067F000040020000A0000000F50000__0000003903F1CFE8 000000067F000040020000A0000000F4C000-000000067F000040020000A0000000F50000__0000003B99F7F8A0 000000067F000040020000A0000000F4C000-000000067F000040020000A0000000F50000__0000005D2FFFFB38 000000067F000040020000A0000000F4DA85-000000067F000040020000A0000000F56464__000000384463E2C1-00000038E1E2FE19 000000067F000040020000A0000000F50000-000000067F000040020000A0000000F54000__00000038E1ABFE28 000000067F000040020000A0000000F50000-000000067F000040020000A0000000F54000__00000038E9AF7F00 
000000067F000040020000A0000000F50000-000000067F000040020000A0000000F54000__0000003903F1CFE8 000000067F000040020000A0000000F50000-000000067F000040020000A0000000F54000__0000003B99F7F8A0 000000067F000040020000A0000000F50000-000000067F000040020000A0000000F54000__0000005D2FFFFB38 000000067F000040020000A0000000F54000-000000067F000040020000A0000000F58000__00000038E1ABFE28 000000067F000040020000A0000000F54000-000000067F000040020000A0000000F58000__00000038E9AF7F00 000000067F000040020000A0000000F54000-000000067F000040020000A0000000F58000__0000003903F1CFE8 000000067F000040020000A0000000F54000-000000067F000040020000A0000000F58000__0000003B99F7F8A0 000000067F000040020000A0000000F54000-000000067F000040020000A0000000F58000__0000005D2FFFFB38 000000067F000040020000A0000000F56464-010000000000000001000000000000000001__000000384463E2C1-00000038E1E2FE19 000000067F000040020000A0000000F58000-000000067F000040020000A0000000F5C000__00000038E1ABFE28 000000067F000040020000A0000000F58000-000000067F000040020000A0000000F5C000__00000038E9AF7F00 000000067F000040020000A0000000F58000-000000067F000040020000A0000000F5C000__0000003903F1CFE8 000000067F000040020000A0000000F58000-000000067F000040020000A0000000F5C000__0000003B99F7F8A0 000000067F000040020000A0000000F58000-000000067F000040020000A0000000F5C000__0000005D2FFFFB38 000000067F000040020000A0000000F5C000-000000067F000040020000A0040100000000__00000038E9AF7F00 000000067F000040020000A0000000F5C000-000000067F000040020000A0080100000000__0000003903F1CFE8 000000067F000040020000A0000000F5C000-000000067F000040020000A0080100000000__0000003B99F7F8A0 000000067F000040020000A0000000F5C000-000000067F000040020000A0080100000000__0000005D2FFFFB38 000000067F000040020000A0000000F5C000-030000000000000000000000000000000002__00000038E1ABFE28 000000067F000040020000A00000FFFFFFFF-030000000000000000000000000000000002__00000031853FEA98 000000067F000040020000A0050000000000-000000067F000040020000A0050100000003__00000038E1E2FE19-00000038E3787F09 
000000067F000040020000A0050000000000-000000067F000040020000A0050200000000__00000038E3787F09-00000038E5077EE1 000000067F000040020000A0050000000000-030000000000000000000000000000000002__00000038E4DFC4C8 000000067F000040020000A0050000000000-030000000000000000000000000000000002__00000038E815BE18 000000067F000040020000A0060000000000-000000067F000040020000A0060100000003__00000038E68FBE49-00000038E813FFC9 000000067F000040020000A0060000000000-000000067F000040020000A0060200000000__00000038E813FFC9-00000038E99BFDE9 000000067F000040020000A0070000000000-000000067F000040020000A0070100000003__00000038EAFDDF91-00000038EBFD1ED1 000000067F000040020000A0070000000000-000000067F000040020000A0070100000003__00000038EBFD1ED1-00000038ECF55FD9 000000067F000040020000A0070000000000-030000000000000000000000000000000002__00000038ECE35F08 000000067F000040020000C0000000000000-000000067F000040020000C0000000004000__0000003903F1CFE8 000000067F000040020000C0000000000000-000000067F000040020000C0000000004000__0000003B99F7F8A0 000000067F000040020000C0000000000000-000000067F000040020000C0000000004000__0000005D2FFFFB38 000000067F000040020000C0000000004000-000000067F000040020000C0000000008000__0000003903F1CFE8 000000067F000040020000C0000000004000-000000067F000040020000C0000000008000__0000003B99F7F8A0 000000067F000040020000C0000000004000-000000067F000040020000C0000000008000__0000005D2FFFFB38 000000067F000040020000C0000000007F72-000000067F000040020000C000000000FEF5__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000008000-000000067F000040020000C000000000C000__0000003903F1CFE8 000000067F000040020000C0000000008000-000000067F000040020000C000000000C000__0000003B99F7F8A0 000000067F000040020000C0000000008000-000000067F000040020000C000000000C000__0000005D2FFFFB38 000000067F000040020000C000000000C000-000000067F000040020000C0000000010000__0000003903F1CFE8 000000067F000040020000C000000000C000-000000067F000040020000C0000000010000__0000003B99F7F8A0 
000000067F000040020000C000000000C000-000000067F000040020000C0000000010000__0000005D2FFFFB38 000000067F000040020000C000000000FEF5-000000067F000040020000C0000000017E78__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000010000-000000067F000040020000C0000000014000__0000003B99F7F8A0 000000067F000040020000C0000000010000-000000067F000040020000C0000000014000__0000005D2FFFFB38 000000067F000040020000C0000000010000-030000000000000000000000000000000002__0000003903F1CFE8 000000067F000040020000C0000000014000-000000067F000040020000C0000000018000__0000003B99F7F8A0 000000067F000040020000C0000000014000-000000067F000040020000C0000000018000__0000005D2FFFFB38 000000067F000040020000C0000000017E78-000000067F000040020000C000000001FDFB__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000018000-000000067F000040020000C000000001C000__0000003B99F7F8A0 000000067F000040020000C0000000018000-000000067F000040020000C000000001C000__0000005D2FFFFB38 000000067F000040020000C000000001C000-000000067F000040020000C0000000020000__0000003B99F7F8A0 000000067F000040020000C000000001C000-000000067F000040020000C0000000020000__0000005D2FFFFB38 000000067F000040020000C000000001FDFB-000000067F000040020000C0000000027D7E__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000020000-000000067F000040020000C0000000024000__0000003B99F7F8A0 000000067F000040020000C0000000020000-000000067F000040020000C0000000024000__0000005D2FFFFB38 000000067F000040020000C0000000024000-000000067F000040020000C0000000028000__0000003B99F7F8A0 000000067F000040020000C0000000024000-000000067F000040020000C0000000028000__0000005D2FFFFB38 000000067F000040020000C0000000027D7E-000000067F000040020000C000000002FD01__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000028000-000000067F000040020000C000000002C000__0000003B99F7F8A0 000000067F000040020000C0000000028000-000000067F000040020000C000000002C000__0000005D2FFFFB38 000000067F000040020000C000000002C000-000000067F000040020000C0000000030000__0000003B99F7F8A0 
000000067F000040020000C000000002C000-000000067F000040020000C0000000030000__0000005D2FFFFB38 000000067F000040020000C000000002FD01-000000067F000040020000C0000000037C84__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000030000-000000067F000040020000C0000000034000__0000003B99F7F8A0 000000067F000040020000C0000000030000-000000067F000040020000C0000000034000__0000005D2FFFFB38 000000067F000040020000C0000000034000-000000067F000040020000C0000000038000__0000003B99F7F8A0 000000067F000040020000C0000000034000-000000067F000040020000C0000000038000__0000005D2FFFFB38 000000067F000040020000C0000000037C84-000000067F000040020000C000000003FC07__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000038000-000000067F000040020000C000000003C000__0000003B99F7F8A0 000000067F000040020000C0000000038000-000000067F000040020000C000000003C000__0000005D2FFFFB38 000000067F000040020000C000000003C000-000000067F000040020000C0000000040000__0000003B99F7F8A0 000000067F000040020000C000000003C000-000000067F000040020000C0000000040000__0000005D2FFFFB38 000000067F000040020000C000000003FC07-000000067F000040020000C0000000047B8A__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000040000-000000067F000040020000C0000000044000__0000003B99F7F8A0 000000067F000040020000C0000000040000-000000067F000040020000C0000000044000__0000005D2FFFFB38 000000067F000040020000C0000000044000-000000067F000040020000C0000000048000__0000003B99F7F8A0 000000067F000040020000C0000000044000-000000067F000040020000C0000000048000__0000005D2FFFFB38 000000067F000040020000C0000000047B8A-000000067F000040020000C000000004FB0D__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000048000-000000067F000040020000C000000004C000__0000003B99F7F8A0 000000067F000040020000C0000000048000-000000067F000040020000C000000004C000__0000005D2FFFFB38 000000067F000040020000C000000004C000-000000067F000040020000C0000000050000__0000003B99F7F8A0 000000067F000040020000C000000004C000-000000067F000040020000C0000000050000__0000005D2FFFFB38 
000000067F000040020000C000000004FB0D-000000067F000040020000C0000000057A90__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000050000-000000067F000040020000C0000000054000__0000003B99F7F8A0 000000067F000040020000C0000000050000-000000067F000040020000C0000000054000__0000005D2FFFFB38 000000067F000040020000C0000000054000-000000067F000040020000C0000000058000__0000003B99F7F8A0 000000067F000040020000C0000000054000-000000067F000040020000C0000000058000__0000005D2FFFFB38 000000067F000040020000C0000000057A90-000000067F000040020000C000000005FA13__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000058000-000000067F000040020000C000000005C000__0000003B99F7F8A0 000000067F000040020000C0000000058000-000000067F000040020000C000000005C000__0000005D2FFFFB38 000000067F000040020000C000000005C000-000000067F000040020000C0000000060000__0000003B99F7F8A0 000000067F000040020000C000000005C000-000000067F000040020000C0000000060000__0000005D2FFFFB38 000000067F000040020000C000000005FA13-000000067F000040020000C0000000067996__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000060000-000000067F000040020000C0000000064000__0000003B99F7F8A0 000000067F000040020000C0000000060000-000000067F000040020000C0000000064000__0000005D2FFFFB38 000000067F000040020000C0000000064000-000000067F000040020000C0000000068000__0000003B99F7F8A0 000000067F000040020000C0000000064000-000000067F000040020000C0000000068000__0000005D2FFFFB38 000000067F000040020000C0000000067996-000000067F000040020000C000000006F919__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000068000-000000067F000040020000C000000006C000__0000003B99F7F8A0 000000067F000040020000C0000000068000-000000067F000040020000C000000006C000__0000005D2FFFFB38 000000067F000040020000C000000006C000-000000067F000040020000C0000000070000__0000003B99F7F8A0 000000067F000040020000C000000006C000-000000067F000040020000C0000000070000__0000005D2FFFFB38 
000000067F000040020000C000000006F919-000000067F000040020000C000000007789C__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000070000-000000067F000040020000C0000000074000__0000003B99F7F8A0 000000067F000040020000C0000000070000-000000067F000040020000C0000000074000__0000005D2FFFFB38 000000067F000040020000C0000000074000-000000067F000040020000C0000000078000__0000003B99F7F8A0 000000067F000040020000C0000000074000-000000067F000040020000C0000000078000__0000005D2FFFFB38 000000067F000040020000C000000007789C-000000067F000040020000C000000007F81F__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000078000-000000067F000040020000C000000007C000__0000003B99F7F8A0 000000067F000040020000C0000000078000-000000067F000040020000C000000007C000__0000005D2FFFFB38 000000067F000040020000C000000007C000-000000067F000040020000C0000000080000__0000003B99F7F8A0 000000067F000040020000C000000007C000-000000067F000040020000C0000000080000__0000005D2FFFFB38 000000067F000040020000C000000007F81F-000000067F000040020000C00000000877A2__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000080000-000000067F000040020000C0000000084000__0000003B99F7F8A0 000000067F000040020000C0000000080000-000000067F000040020000C0000000084000__0000005D2FFFFB38 000000067F000040020000C0000000084000-000000067F000040020000C0000000088000__0000003B99F7F8A0 000000067F000040020000C0000000084000-000000067F000040020000C0000000088000__0000005D2FFFFB38 000000067F000040020000C00000000877A2-000000067F000040020000C000000008F725__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000088000-000000067F000040020000C000000008C000__0000003B99F7F8A0 000000067F000040020000C0000000088000-000000067F000040020000C000000008C000__0000005D2FFFFB38 000000067F000040020000C000000008C000-000000067F000040020000C0000000090000__0000003B99F7F8A0 000000067F000040020000C000000008C000-000000067F000040020000C0000000090000__0000005D2FFFFB38 
000000067F000040020000C000000008F725-000000067F000040020000C00000000976A8__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000090000-000000067F000040020000C0000000094000__0000003B99F7F8A0 000000067F000040020000C0000000090000-000000067F000040020000C0000000094000__0000005D2FFFFB38 000000067F000040020000C0000000094000-000000067F000040020000C0000000098000__0000003B99F7F8A0 000000067F000040020000C0000000094000-000000067F000040020000C0000000098000__0000005D2FFFFB38 000000067F000040020000C00000000976A8-000000067F000040020000C000000009F62B__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000098000-000000067F000040020000C000000009C000__0000003B99F7F8A0 000000067F000040020000C0000000098000-000000067F000040020000C000000009C000__0000005D2FFFFB38 000000067F000040020000C000000009C000-000000067F000040020000C00000000A0000__0000003B99F7F8A0 000000067F000040020000C000000009C000-000000067F000040020000C00000000A0000__0000005D2FFFFB38 000000067F000040020000C000000009F62B-000000067F000040020000C00000000A75AE__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C00000000A0000-000000067F000040020000C00000000A4000__0000003B99F7F8A0 000000067F000040020000C00000000A0000-000000067F000040020000C00000000A4000__0000005D2FFFFB38 000000067F000040020000C00000000A4000-000000067F000040020000C00000000A8000__0000003B99F7F8A0 000000067F000040020000C00000000A4000-000000067F000040020000C00000000A8000__0000005D2FFFFB38 000000067F000040020000C00000000A75AE-000000067F000040020000C00000000AF531__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C00000000A8000-000000067F000040020000C00000000AC000__0000003B99F7F8A0 000000067F000040020000C00000000A8000-000000067F000040020000C00000000AC000__0000005D2FFFFB38 000000067F000040020000C00000000AC000-000000067F000040020000C00000000B0000__0000003B99F7F8A0 000000067F000040020000C00000000AC000-000000067F000040020000C00000000B0000__0000005D2FFFFB38 
000000067F000040020000C00000000AF531-000000067F000040020000C00000000B74B4__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C00000000B0000-000000067F000040020000C00000000B4000__0000003B99F7F8A0 000000067F000040020000C00000000B0000-000000067F000040020000C00000000B4000__0000005D2FFFFB38 000000067F000040020000C00000000B4000-000000067F000040020000C00000000B8000__0000003B99F7F8A0 000000067F000040020000C00000000B4000-000000067F000040020000C00000000B8000__0000005D2FFFFB38 000000067F000040020000C00000000B74B4-000000067F000040020000C00000000BF437__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C00000000B8000-000000067F000040020000C00000000BC000__0000003B99F7F8A0 000000067F000040020000C00000000B8000-000000067F000040020000C00000000BC000__0000005D2FFFFB38 000000067F000040020000C00000000BC000-000000067F000040020000C00000000C0000__0000003B99F7F8A0 000000067F000040020000C00000000BC000-000000067F000040020000C00000000C0000__0000005D2FFFFB38 000000067F000040020000C00000000BF437-000000067F000040020000C00000000C73BA__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C00000000C0000-000000067F000040020000C00000000C4000__0000003B99F7F8A0 000000067F000040020000C00000000C0000-000000067F000040020000C00000000C4000__0000005D2FFFFB38 000000067F000040020000C00000000C4000-000000067F000040020000C00000000C8000__0000003B99F7F8A0 000000067F000040020000C00000000C4000-000000067F000040020000C00000000C8000__0000005D2FFFFB38 000000067F000040020000C00000000C73BA-000000067F000040020000C00000000CF33D__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C00000000C8000-000000067F000040020000C00000000CC000__0000003B99F7F8A0 000000067F000040020000C00000000C8000-000000067F000040020000C00000000CC000__0000005D2FFFFB38 000000067F000040020000C00000000CC000-000000067F000040020000C00000000D0000__0000003B99F7F8A0 000000067F000040020000C00000000CC000-000000067F000040020000C00000000D0000__0000005D2FFFFB38 
000000067F000040020000C00000000CF33D-000000067F000040020000C00000000D72C0__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C00000000D0000-000000067F000040020000C00000000D4000__0000003B99F7F8A0 000000067F000040020000C00000000D0000-000000067F000040020000C00000000D4000__0000005D2FFFFB38 000000067F000040020000C00000000D4000-000000067F000040020000C00000000D8000__0000003B99F7F8A0 000000067F000040020000C00000000D4000-000000067F000040020000C00000000D8000__0000005D2FFFFB38 000000067F000040020000C00000000D72C0-000000067F000040020000C00000000DF243__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C00000000D8000-000000067F000040020000C00000000DC000__0000003B99F7F8A0 000000067F000040020000C00000000D8000-000000067F000040020000C00000000DC000__0000005D2FFFFB38 000000067F000040020000C00000000DC000-000000067F000040020000C00000000E0000__0000003B99F7F8A0 000000067F000040020000C00000000DC000-000000067F000040020000C00000000E0000__0000005D2FFFFB38 000000067F000040020000C00000000DF243-000000067F000040020000C00000000E71C6__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C00000000E0000-000000067F000040020000C00000000E4000__0000003B99F7F8A0 000000067F000040020000C00000000E0000-000000067F000040020000C00000000E4000__0000005D2FFFFB38 000000067F000040020000C00000000E4000-000000067F000040020000C00000000E8000__0000003B99F7F8A0 000000067F000040020000C00000000E4000-000000067F000040020000C00000000E8000__0000005D2FFFFB38 000000067F000040020000C00000000E71C6-000000067F000040020000C00000000EF149__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C00000000E8000-000000067F000040020000C00000000EC000__0000003B99F7F8A0 000000067F000040020000C00000000E8000-000000067F000040020000C00000000EC000__0000005D2FFFFB38 000000067F000040020000C00000000EC000-000000067F000040020000C00000000F0000__0000003B99F7F8A0 000000067F000040020000C00000000EC000-000000067F000040020000C00000000F0000__0000005D2FFFFB38 
000000067F000040020000C00000000EF149-000000067F000040020000C00000000F70CC__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C00000000F0000-000000067F000040020000C00000000F4000__0000003B99F7F8A0 000000067F000040020000C00000000F0000-000000067F000040020000C00000000F4000__0000005D2FFFFB38 000000067F000040020000C00000000F4000-000000067F000040020000C00000000F8000__0000003B99F7F8A0 000000067F000040020000C00000000F4000-000000067F000040020000C00000000F8000__0000005D2FFFFB38 000000067F000040020000C00000000F70CC-000000067F000040020000C00000000FF04F__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C00000000F8000-000000067F000040020000C00000000FC000__0000003B99F7F8A0 000000067F000040020000C00000000F8000-000000067F000040020000C00000000FC000__0000005D2FFFFB38 000000067F000040020000C00000000FC000-000000067F000040020000C0000000100000__0000003B99F7F8A0 000000067F000040020000C00000000FC000-000000067F000040020000C0000000100000__0000005D2FFFFB38 000000067F000040020000C00000000FF04F-000000067F000040020000C0000000106FD2__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000100000-000000067F000040020000C0000000104000__0000003B99F7F8A0 000000067F000040020000C0000000100000-000000067F000040020000C0000000104000__0000005D2FFFFB38 000000067F000040020000C0000000104000-000000067F000040020000C0000000108000__0000003B99F7F8A0 000000067F000040020000C0000000104000-000000067F000040020000C0000000108000__0000005D2FFFFB38 000000067F000040020000C0000000106FD2-000000067F000040020000C000000010EF55__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000108000-000000067F000040020000C000000010C000__0000003B99F7F8A0 000000067F000040020000C0000000108000-000000067F000040020000C000000010C000__0000005D2FFFFB38 000000067F000040020000C000000010C000-000000067F000040020000C0000000110000__0000003B99F7F8A0 000000067F000040020000C000000010C000-000000067F000040020000C0000000110000__0000005D2FFFFB38 
000000067F000040020000C000000010EF55-000000067F000040020000C0000000116ED8__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000110000-000000067F000040020000C0000000114000__0000003B99F7F8A0 000000067F000040020000C0000000110000-000000067F000040020000C0000000114000__0000005D2FFFFB38 000000067F000040020000C0000000114000-000000067F000040020000C0000000118000__0000003B99F7F8A0 000000067F000040020000C0000000114000-000000067F000040020000C0000000118000__0000005D2FFFFB38 000000067F000040020000C0000000116ED8-000000067F000040020000C000000011EE5B__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000118000-000000067F000040020000C000000011C000__0000003B99F7F8A0 000000067F000040020000C0000000118000-000000067F000040020000C000000011C000__0000005D2FFFFB38 000000067F000040020000C000000011C000-000000067F000040020000C0000000120000__0000003B99F7F8A0 000000067F000040020000C000000011C000-000000067F000040020000C0000000120000__0000005D2FFFFB38 000000067F000040020000C000000011EE5B-000000067F000040020000C0000000126DDE__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000120000-000000067F000040020000C0000000124000__0000003B99F7F8A0 000000067F000040020000C0000000120000-000000067F000040020000C0000000124000__0000005D2FFFFB38 000000067F000040020000C0000000124000-000000067F000040020000C0000000128000__0000003B99F7F8A0 000000067F000040020000C0000000124000-000000067F000040020000C0000000128000__0000005D2FFFFB38 000000067F000040020000C0000000126DDE-000000067F000040020000C000000012ED61__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000128000-000000067F000040020000C000000012C000__0000003B99F7F8A0 000000067F000040020000C0000000128000-000000067F000040020000C000000012C000__0000005D2FFFFB38 000000067F000040020000C000000012C000-000000067F000040020000C0000000130000__0000003B99F7F8A0 000000067F000040020000C000000012C000-000000067F000040020000C0000000130000__0000005D2FFFFB38 
000000067F000040020000C000000012ED61-000000067F000040020000C0000000136CE4__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000130000-000000067F000040020000C0000000134000__0000003B99F7F8A0 000000067F000040020000C0000000130000-000000067F000040020000C0000000134000__0000005D2FFFFB38 000000067F000040020000C0000000134000-000000067F000040020000C0000000138000__0000003B99F7F8A0 000000067F000040020000C0000000134000-000000067F000040020000C0000000138000__0000005D2FFFFB38 000000067F000040020000C0000000136CE4-000000067F000040020000C000000013EC67__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000138000-000000067F000040020000C000000013C000__0000003B99F7F8A0 000000067F000040020000C0000000138000-000000067F000040020000C000000013C000__0000005D2FFFFB38 000000067F000040020000C000000013C000-000000067F000040020000C0000000140000__0000003B99F7F8A0 000000067F000040020000C000000013C000-000000067F000040020000C0000000140000__0000005D2FFFFB38 000000067F000040020000C000000013EC67-000000067F000040020000C0000000146BEA__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000140000-000000067F000040020000C0000000144000__0000003B99F7F8A0 000000067F000040020000C0000000140000-000000067F000040020000C0000000144000__0000005D2FFFFB38 000000067F000040020000C0000000144000-000000067F000040020000C0000000148000__0000003B99F7F8A0 000000067F000040020000C0000000144000-000000067F000040020000C0000000148000__0000005D2FFFFB38 000000067F000040020000C0000000146BEA-000000067F000040020000C000000014EB6D__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000148000-000000067F000040020000C000000014C000__0000003B99F7F8A0 000000067F000040020000C0000000148000-000000067F000040020000C000000014C000__0000005D2FFFFB38 000000067F000040020000C000000014C000-000000067F000040020000C0000000150000__0000003B99F7F8A0 000000067F000040020000C000000014C000-000000067F000040020000C0000000150000__0000005D2FFFFB38 
000000067F000040020000C000000014EB6D-000000067F000040020000C0000000156AF0__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000150000-000000067F000040020000C0000000154000__0000003B99F7F8A0 000000067F000040020000C0000000150000-000000067F000040020000C0000000154000__0000005D2FFFFB38 000000067F000040020000C0000000154000-000000067F000040020000C0000000158000__0000003B99F7F8A0 000000067F000040020000C0000000154000-000000067F000040020000C0000000158000__0000005D2FFFFB38 000000067F000040020000C0000000156AF0-000000067F000040020000C000000015EA73__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000158000-000000067F000040020000C000000015C000__0000003B99F7F8A0 000000067F000040020000C0000000158000-000000067F000040020000C000000015C000__0000005D2FFFFB38 000000067F000040020000C000000015C000-000000067F000040020000C0000000160000__0000003B99F7F8A0 000000067F000040020000C000000015C000-000000067F000040020000C0000000160000__0000005D2FFFFB38 000000067F000040020000C000000015EA73-000000067F000040020000C00000001669F6__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000160000-000000067F000040020000C0000000164000__0000003B99F7F8A0 000000067F000040020000C0000000160000-000000067F000040020000C0000000164000__0000005D2FFFFB38 000000067F000040020000C0000000164000-000000067F000040020000C0000000168000__0000003B99F7F8A0 000000067F000040020000C0000000164000-000000067F000040020000C0000000168000__0000005D2FFFFB38 000000067F000040020000C00000001669F6-000000067F000040020000C000000016E979__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000168000-000000067F000040020000C000000016C000__0000003B99F7F8A0 000000067F000040020000C0000000168000-000000067F000040020000C000000016C000__0000005D2FFFFB38 000000067F000040020000C000000016C000-000000067F000040020000C0000000170000__0000003B99F7F8A0 000000067F000040020000C000000016C000-000000067F000040020000C0000000170000__0000005D2FFFFB38 
000000067F000040020000C000000016E979-000000067F000040020000C00000001768FC__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000170000-000000067F000040020000C0000000174000__0000003B99F7F8A0 000000067F000040020000C0000000170000-000000067F000040020000C0000000174000__0000005D2FFFFB38 000000067F000040020000C0000000174000-000000067F000040020000C0000000178000__0000003B99F7F8A0 000000067F000040020000C0000000174000-000000067F000040020000C0000000178000__0000005D2FFFFB38 000000067F000040020000C00000001768FC-000000067F000040020000C000000017E87F__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000178000-000000067F000040020000C000000017C000__0000003B99F7F8A0 000000067F000040020000C0000000178000-000000067F000040020000C000000017C000__0000005D2FFFFB38 000000067F000040020000C000000017C000-000000067F000040020000C0000000180000__0000003B99F7F8A0 000000067F000040020000C000000017C000-000000067F000040020000C0000000180000__0000005D2FFFFB38 000000067F000040020000C000000017E87F-030000000000000000000000000000000002__00000038ED8FA069-0000003ABA685F11 000000067F000040020000C0000000180000-000000067F000040020000C0000100000000__0000003B99F7F8A0 000000067F000040020000C0000000180000-000000067F000040020000C0000100000000__0000005D2FFFFB38 000000067F000040020000E0000000000000-000000067F000040020000E0000000004000__0000003B99F7F8A0 000000067F000040020000E0000000000000-000000067F000040020000E0000000004000__0000005D2FFFFB38 000000067F000040020000E0000000000000-000000067F000040020000E0000000004000__00000073AD3FE6B8 000000067F000040020000E0000000000000-000000067F000040020000E0000000004000__000000914E3F38F0 000000067F000040020000E0000000000000-000000067F000040020000E0000000004000__000000931B9A2710 000000067F000040020000E0000000004000-000000067F000040020000E0000000008000__0000003B99F7F8A0 000000067F000040020000E0000000004000-000000067F000040020000E0000000008000__0000005D2FFFFB38 000000067F000040020000E0000000004000-000000067F000040020000E0000000008000__00000073AD3FE6B8 
000000067F000040020000E0000000004000-000000067F000040020000E0000000008000__000000914E3F38F0 000000067F000040020000E0000000004000-000000067F000040020000E0000000008000__000000931B9A2710 000000067F000040020000E0000000008000-000000067F000040020000E000000000C000__0000003B99F7F8A0 000000067F000040020000E0000000008000-000000067F000040020000E000000000C000__0000005D2FFFFB38 000000067F000040020000E0000000008000-000000067F000040020000E000000000C000__00000073AD3FE6B8 000000067F000040020000E0000000008000-000000067F000040020000E000000000C000__000000914E3F38F0 000000067F000040020000E0000000008000-000000067F000040020000E000000000C000__000000931B9A2710 000000067F000040020000E000000000899C-000000067F000040020000E000000001137C__0000003ABA698781-0000003B6A0FFB09 000000067F000040020000E000000000C000-000000067F000040020000E0000000010000__0000003B99F7F8A0 000000067F000040020000E000000000C000-000000067F000040020000E0000000010000__0000005D2FFFFB38 000000067F000040020000E000000000C000-000000067F000040020000E0000000010000__00000073AD3FE6B8 000000067F000040020000E000000000C000-000000067F000040020000E0000000010000__000000914E3F38F0 000000067F000040020000E000000000C000-000000067F000040020000E0000000010000__000000931B9A2710 000000067F000040020000E0000000010000-000000067F000040020000E0000000014000__0000003B99F7F8A0 000000067F000040020000E0000000010000-000000067F000040020000E0000000014000__0000005D2FFFFB38 000000067F000040020000E0000000010000-000000067F000040020000E0000000014000__00000073AD3FE6B8 000000067F000040020000E0000000010000-000000067F000040020000E0000000014000__000000914E3F38F0 000000067F000040020000E0000000010000-000000067F000040020000E0000000014000__000000931B9A2710 000000067F000040020000E000000001137C-000000067F000040020000E0000000019D79__0000003ABA698781-0000003B6A0FFB09 000000067F000040020000E0000000014000-000000067F000040020000E0000000018000__0000003B99F7F8A0 000000067F000040020000E0000000014000-000000067F000040020000E0000000018000__0000005D2FFFFB38 
000000067F000040020000E0000000014000-000000067F000040020000E0000000018000__00000073AD3FE6B8 000000067F000040020000E0000000014000-000000067F000040020000E0000000018000__000000914E3F38F0 000000067F000040020000E0000000014000-000000067F000040020000E0000000018000__000000931B9A2710 000000067F000040020000E0000000018000-000000067F000040020000E000000001C000__0000003B99F7F8A0 000000067F000040020000E0000000018000-000000067F000040020000E000000001C000__0000005D2FFFFB38 000000067F000040020000E0000000018000-000000067F000040020000E000000001C000__00000073AD3FE6B8 000000067F000040020000E0000000018000-000000067F000040020000E000000001C000__000000914E3F38F0 000000067F000040020000E0000000018000-000000067F000040020000E000000001C000__000000931B9A2710 000000067F000040020000E0000000019D79-000000067F000040020000E0000000022776__0000003ABA698781-0000003B6A0FFB09 000000067F000040020000E000000001C000-000000067F000040020000E0000000020000__0000003B99F7F8A0 000000067F000040020000E000000001C000-000000067F000040020000E0000000020000__0000005D2FFFFB38 000000067F000040020000E000000001C000-000000067F000040020000E0000000020000__00000073AD3FE6B8 000000067F000040020000E000000001C000-000000067F000040020000E0000000020000__000000914E3F38F0 000000067F000040020000E000000001C000-000000067F000040020000E0000000020000__000000931B9A2710 000000067F000040020000E0000000020000-000000067F000040020000E0000000024000__0000003B99F7F8A0 000000067F000040020000E0000000020000-000000067F000040020000E0000000024000__0000005D2FFFFB38 000000067F000040020000E0000000020000-000000067F000040020000E0000000024000__00000073AD3FE6B8 000000067F000040020000E0000000020000-000000067F000040020000E0000000024000__000000914E3F38F0 000000067F000040020000E0000000020000-000000067F000040020000E0000000024000__000000931B9A2710 000000067F000040020000E0000000022776-000000067F000040020000E000000002B15B__0000003ABA698781-0000003B6A0FFB09 000000067F000040020000E0000000024000-000000067F000040020000E0000000028000__0000003B99F7F8A0 
000000067F000040020000E0000000024000-000000067F000040020000E0000000028000__0000005D2FFFFB38 000000067F000040020000E0000000024000-000000067F000040020000E0000000028000__00000073AD3FE6B8 000000067F000040020000E0000000024000-000000067F000040020000E0000000028000__000000914E3F38F0 000000067F000040020000E0000000024000-000000067F000040020000E0000000028000__000000931B9A2710 000000067F000040020000E0000000028000-000000067F000040020000E000000002C000__0000003B99F7F8A0 000000067F000040020000E0000000028000-000000067F000040020000E000000002C000__0000005D2FFFFB38 000000067F000040020000E0000000028000-000000067F000040020000E000000002C000__00000073AD3FE6B8 000000067F000040020000E0000000028000-000000067F000040020000E000000002C000__000000914E3F38F0 000000067F000040020000E0000000028000-000000067F000040020000E000000002C000__000000931B9A2710 000000067F000040020000E000000002B15B-000000067F000040020000E0000000033B2F__0000003ABA698781-0000003B6A0FFB09 000000067F000040020000E000000002C000-000000067F000040020000E0000000030000__0000003B99F7F8A0 000000067F000040020000E000000002C000-000000067F000040020000E0000000030000__0000005D2FFFFB38 000000067F000040020000E000000002C000-000000067F000040020000E0000000030000__00000073AD3FE6B8 000000067F000040020000E000000002C000-000000067F000040020000E0000000030000__000000914E3F38F0 000000067F000040020000E000000002C000-000000067F000040020000E0000000030000__000000931B9A2710 000000067F000040020000E0000000030000-000000067F000040020000E0000000034000__0000003B99F7F8A0 000000067F000040020000E0000000030000-000000067F000040020000E0000000034000__0000005D2FFFFB38 000000067F000040020000E0000000030000-000000067F000040020000E0000000034000__00000073AD3FE6B8 000000067F000040020000E0000000030000-000000067F000040020000E0000000034000__000000914E3F38F0 000000067F000040020000E0000000030000-000000067F000040020000E0000000034000__000000931B9A2710 000000067F000040020000E0000000033B2F-000000067F000040020000E000000003C4EA__0000003ABA698781-0000003B6A0FFB09 
000000067F000040020000E0000000034000-000000067F000040020000E0000000038000__0000003B99F7F8A0 000000067F000040020000E0000000034000-000000067F000040020000E0000000038000__0000005D2FFFFB38 000000067F000040020000E0000000034000-000000067F000040020000E0000000038000__00000073AD3FE6B8 000000067F000040020000E0000000034000-000000067F000040020000E0000000038000__000000914E3F38F0 000000067F000040020000E0000000034000-000000067F000040020000E0000000038000__000000931B9A2710 000000067F000040020000E0000000038000-000000067F000040020000E000000003C000__0000003B99F7F8A0 000000067F000040020000E0000000038000-000000067F000040020000E000000003C000__0000005D2FFFFB38 000000067F000040020000E0000000038000-000000067F000040020000E000000003C000__00000073AD3FE6B8 000000067F000040020000E0000000038000-000000067F000040020000E000000003C000__000000914E3F38F0 000000067F000040020000E0000000038000-000000067F000040020000E000000003C000__000000931B9A2710 000000067F000040020000E000000003C000-000000067F000040020000E0000000040000__0000003B99F7F8A0 000000067F000040020000E000000003C000-000000067F000040020000E0000000040000__0000005D2FFFFB38 000000067F000040020000E000000003C000-000000067F000040020000E0000000040000__00000073AD3FE6B8 000000067F000040020000E000000003C000-000000067F000040020000E0000000040000__000000914E3F38F0 000000067F000040020000E000000003C000-000000067F000040020000E0000000040000__000000931B9A2710 000000067F000040020000E000000003C4EA-000000067F000040020000E0000000044EA8__0000003ABA698781-0000003B6A0FFB09 000000067F000040020000E0000000040000-000000067F000040020000E0000000044000__0000003B99F7F8A0 000000067F000040020000E0000000040000-000000067F000040020000E0000000044000__0000005D2FFFFB38 000000067F000040020000E0000000040000-000000067F000040020000E0000000044000__00000073AD3FE6B8 000000067F000040020000E0000000040000-000000067F000040020000E0000000044000__000000914E3F38F0 000000067F000040020000E0000000040000-000000067F000040020000E0000000044000__000000931B9A2710 
000000067F000040020000E0000000044000-000000067F000040020000E0000000048000__0000003B99F7F8A0 000000067F000040020000E0000000044000-000000067F000040020000E0000000048000__0000005D2FFFFB38 000000067F000040020000E0000000044000-000000067F000040020000E0000000048000__00000073AD3FE6B8 000000067F000040020000E0000000044000-000000067F000040020000E0000000048000__000000914E3F38F0 000000067F000040020000E0000000044000-000000067F000040020000E0000000048000__000000931B9A2710 000000067F000040020000E0000000044EA8-000000067F000040020000E000000004D890__0000003ABA698781-0000003B6A0FFB09 000000067F000040020000E0000000048000-000000067F000040020000E000000004C000__0000003B99F7F8A0 000000067F000040020000E0000000048000-000000067F000040020000E000000004C000__0000005D2FFFFB38 000000067F000040020000E0000000048000-000000067F000040020000E000000004C000__00000073AD3FE6B8 000000067F000040020000E0000000048000-000000067F000040020000E000000004C000__000000914E3F38F0 000000067F000040020000E0000000048000-000000067F000040020000E000000004C000__000000931B9A2710 000000067F000040020000E000000004C000-000000067F000040020000E0000000050000__0000003B99F7F8A0 000000067F000040020000E000000004C000-000000067F000040020000E0000000050000__0000005D2FFFFB38 000000067F000040020000E000000004C000-000000067F000040020000E0000000050000__00000073AD3FE6B8 000000067F000040020000E000000004C000-000000067F000040020000E0000000050000__000000914E3F38F0 000000067F000040020000E000000004C000-000000067F000040020000E0000000050000__000000931B9A2710 000000067F000040020000E000000004D890-000000067F000040020000E0000000056296__0000003ABA698781-0000003B6A0FFB09 000000067F000040020000E0000000050000-000000067F000040020000E0000000054000__0000003B99F7F8A0 000000067F000040020000E0000000050000-000000067F000040020000E0000000054000__0000005D2FFFFB38 000000067F000040020000E0000000050000-000000067F000040020000E0000000054000__00000073AD3FE6B8 000000067F000040020000E0000000050000-000000067F000040020000E0000000054000__000000914E3F38F0 
000000067F000040020000E0000000050000-000000067F000040020000E0000000054000__000000931B9A2710 000000067F000040020000E0000000054000-000000067F000040020000E0000000058000__0000003B99F7F8A0 000000067F000040020000E0000000054000-000000067F000040020000E0000000058000__0000005D2FFFFB38 000000067F000040020000E0000000054000-000000067F000040020000E0000000058000__00000073AD3FE6B8 000000067F000040020000E0000000054000-000000067F000040020000E0000000058000__000000914E3F38F0 000000067F000040020000E0000000054000-000000067F000040020000E0000000058000__000000931B9A2710 000000067F000040020000E0000000056296-000000067F000040020000E000000005EC8C__0000003ABA698781-0000003B6A0FFB09 000000067F000040020000E0000000058000-000000067F000040020000E000000005C000__0000003B99F7F8A0 000000067F000040020000E0000000058000-000000067F000040020000E000000005C000__0000005D2FFFFB38 000000067F000040020000E0000000058000-000000067F000040020000E000000005C000__00000073AD3FE6B8 000000067F000040020000E0000000058000-000000067F000040020000E000000005C000__000000914E3F38F0 000000067F000040020000E0000000058000-000000067F000040020000E000000005C000__000000931B9A2710 000000067F000040020000E000000005C000-000000067F000040020000E0000000060000__0000003B99F7F8A0 000000067F000040020000E000000005C000-000000067F000040020000E0000000060000__000000574B7FF240 000000067F000040020000E000000005C000-000000067F000040020000E0000000060000__00000073AD3FE6B8 000000067F000040020000E000000005C000-000000067F000040020000E0000000060000__000000914E3F38F0 000000067F000040020000E000000005C000-000000067F000040020000E0000000060000__000000931B9A2710 000000067F000040020000E000000005EC8C-030000000000000000000000000000000002__0000003ABA698781-0000003B6A0FFB09 000000067F000040020000E000000005EF9E-000000067F000040020000E0000000067994__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000060000-000000067F000040020000E0000000064000__0000003B99F7F8A0 000000067F000040020000E0000000060000-000000067F000040020000E0000000064000__000000574B7FF240 
000000067F000040020000E0000000060000-000000067F000040020000E0000000064000__00000073AD3FE6B8 000000067F000040020000E0000000060000-000000067F000040020000E0000000064000__000000914E3F38F0 000000067F000040020000E0000000060000-000000067F000040020000E0000000064000__000000931B9A2710 000000067F000040020000E0000000064000-000000067F000040020000E0000000068000__0000003B99F7F8A0 000000067F000040020000E0000000064000-000000067F000040020000E0000000068000__000000574B7FF240 000000067F000040020000E0000000064000-000000067F000040020000E0000000068000__00000073AD3FE6B8 000000067F000040020000E0000000064000-000000067F000040020000E0000000068000__000000914E3F38F0 000000067F000040020000E0000000064000-000000067F000040020000E0000000068000__000000931B9A2710 000000067F000040020000E0000000067994-000000067F000040020000E0000000070359__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000068000-000000067F000040020000E000000006C000__0000003B99F7F8A0 000000067F000040020000E0000000068000-000000067F000040020000E000000006C000__000000574B7FF240 000000067F000040020000E0000000068000-000000067F000040020000E000000006C000__00000073AD3FE6B8 000000067F000040020000E0000000068000-000000067F000040020000E000000006C000__000000914E3F38F0 000000067F000040020000E0000000068000-000000067F000040020000E000000006C000__000000931B9A2710 000000067F000040020000E000000006C000-000000067F000040020000E0000000070000__0000003B99F7F8A0 000000067F000040020000E000000006C000-000000067F000040020000E0000000070000__000000574B7FF240 000000067F000040020000E000000006C000-000000067F000040020000E0000000070000__00000073AD3FE6B8 000000067F000040020000E000000006C000-000000067F000040020000E0000000070000__000000914E3F38F0 000000067F000040020000E000000006C000-000000067F000040020000E0000000070000__000000931B9A2710 000000067F000040020000E0000000070000-000000067F000040020000E0000000074000__0000003B99F7F8A0 000000067F000040020000E0000000070000-000000067F000040020000E0000000074000__000000574B7FF240 
000000067F000040020000E0000000070000-000000067F000040020000E0000000074000__00000073AD3FE6B8 000000067F000040020000E0000000070000-000000067F000040020000E0000000074000__000000914E3F38F0 000000067F000040020000E0000000070000-000000067F000040020000E0000000074000__000000931B9A2710 000000067F000040020000E0000000070359-000000067F000040020000E0000000078D16__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000074000-000000067F000040020000E0000000078000__0000003B99F7F8A0 000000067F000040020000E0000000074000-000000067F000040020000E0000000078000__000000574B7FF240 000000067F000040020000E0000000074000-000000067F000040020000E0000000078000__00000073AD3FE6B8 000000067F000040020000E0000000074000-000000067F000040020000E0000000078000__000000914E3F38F0 000000067F000040020000E0000000074000-000000067F000040020000E0000000078000__000000931B9A2710 000000067F000040020000E0000000078000-000000067F000040020000E000000007C000__000000574B7FF240 000000067F000040020000E0000000078000-000000067F000040020000E000000007C000__00000073AD3FE6B8 000000067F000040020000E0000000078000-000000067F000040020000E000000007C000__000000914E3F38F0 000000067F000040020000E0000000078000-000000067F000040020000E000000007C000__000000931B9A2710 000000067F000040020000E0000000078000-030000000000000000000000000000000002__0000003B99F7F8A0 000000067F000040020000E0000000078D16-000000067F000040020000E00000000816CB__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000007C000-000000067F000040020000E0000000080000__000000574B7FF240 000000067F000040020000E000000007C000-000000067F000040020000E0000000080000__00000073AD3FE6B8 000000067F000040020000E000000007C000-000000067F000040020000E0000000080000__000000914E3F38F0 000000067F000040020000E000000007C000-000000067F000040020000E0000000080000__000000931B9A2710 000000067F000040020000E0000000080000-000000067F000040020000E0000000084000__000000574B7FF240 000000067F000040020000E0000000080000-000000067F000040020000E0000000084000__00000073AD3FE6B8 
000000067F000040020000E0000000080000-000000067F000040020000E0000000084000__000000914E3F38F0 000000067F000040020000E0000000080000-000000067F000040020000E0000000084000__000000931B9A2710 000000067F000040020000E00000000816CB-000000067F000040020000E000000008A0C4__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000084000-000000067F000040020000E0000000088000__000000574B7FF240 000000067F000040020000E0000000084000-000000067F000040020000E0000000088000__00000073AD3FE6B8 000000067F000040020000E0000000084000-000000067F000040020000E0000000088000__000000914E3F38F0 000000067F000040020000E0000000084000-000000067F000040020000E0000000088000__000000931B9A2710 000000067F000040020000E0000000088000-000000067F000040020000E000000008C000__000000574B7FF240 000000067F000040020000E0000000088000-000000067F000040020000E000000008C000__00000073AD3FE6B8 000000067F000040020000E0000000088000-000000067F000040020000E000000008C000__000000914E3F38F0 000000067F000040020000E0000000088000-000000067F000040020000E000000008C000__000000931B9A2710 000000067F000040020000E000000008A0C4-000000067F000040020000E0000000092AC7__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000008C000-000000067F000040020000E0000000090000__000000574B7FF240 000000067F000040020000E000000008C000-000000067F000040020000E0000000090000__00000073AD3FE6B8 000000067F000040020000E000000008C000-000000067F000040020000E0000000090000__000000914E3F38F0 000000067F000040020000E000000008C000-000000067F000040020000E0000000090000__000000931B9A2710 000000067F000040020000E0000000090000-000000067F000040020000E0000000094000__000000574B7FF240 000000067F000040020000E0000000090000-000000067F000040020000E0000000094000__00000073AD3FE6B8 000000067F000040020000E0000000090000-000000067F000040020000E0000000094000__000000914E3F38F0 000000067F000040020000E0000000090000-000000067F000040020000E0000000094000__000000931B9A2710 000000067F000040020000E0000000092AC7-000000067F000040020000E000000009B4BC__0000003B6A0FFB09-00000047441DEA39 
000000067F000040020000E0000000094000-000000067F000040020000E0000000098000__000000574B7FF240 000000067F000040020000E0000000094000-000000067F000040020000E0000000098000__00000073AD3FE6B8 000000067F000040020000E0000000094000-000000067F000040020000E0000000098000__000000914E3F38F0 000000067F000040020000E0000000094000-000000067F000040020000E0000000098000__000000931B9A2710 000000067F000040020000E0000000098000-000000067F000040020000E000000009C000__000000574B7FF240 000000067F000040020000E0000000098000-000000067F000040020000E000000009C000__00000073AD3FE6B8 000000067F000040020000E0000000098000-000000067F000040020000E000000009C000__000000914E3F38F0 000000067F000040020000E0000000098000-000000067F000040020000E000000009C000__000000931B9A2710 000000067F000040020000E000000009B4BC-000000067F000040020000E00000000A3EA3__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000009C000-000000067F000040020000E00000000A0000__000000574B7FF240 000000067F000040020000E000000009C000-000000067F000040020000E00000000A0000__00000073AD3FE6B8 000000067F000040020000E000000009C000-000000067F000040020000E00000000A0000__000000914E3F38F0 000000067F000040020000E000000009C000-000000067F000040020000E00000000A0000__000000931B9A2710 000000067F000040020000E00000000A0000-000000067F000040020000E00000000A4000__000000574B7FF240 000000067F000040020000E00000000A0000-000000067F000040020000E00000000A4000__00000073AD3FE6B8 000000067F000040020000E00000000A0000-000000067F000040020000E00000000A4000__000000914E3F38F0 000000067F000040020000E00000000A0000-000000067F000040020000E00000000A4000__000000931B9A2710 000000067F000040020000E00000000A3EA3-000000067F000040020000E00000000AC86A__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000000A4000-000000067F000040020000E00000000A8000__000000574B7FF240 000000067F000040020000E00000000A4000-000000067F000040020000E00000000A8000__00000073AD3FE6B8 000000067F000040020000E00000000A4000-000000067F000040020000E00000000A8000__000000914E3F38F0 
000000067F000040020000E00000000A4000-000000067F000040020000E00000000A8000__000000931B9A2710 000000067F000040020000E00000000A8000-000000067F000040020000E00000000AC000__000000574B7FF240 000000067F000040020000E00000000A8000-000000067F000040020000E00000000AC000__00000073AD3FE6B8 000000067F000040020000E00000000A8000-000000067F000040020000E00000000AC000__000000914E3F38F0 000000067F000040020000E00000000A8000-000000067F000040020000E00000000AC000__000000931B9A2710 000000067F000040020000E00000000AC000-000000067F000040020000E00000000B0000__000000574B7FF240 000000067F000040020000E00000000AC000-000000067F000040020000E00000000B0000__00000073AD3FE6B8 000000067F000040020000E00000000AC000-000000067F000040020000E00000000B0000__000000914E3F38F0 000000067F000040020000E00000000AC000-000000067F000040020000E00000000B0000__000000931B9A2710 000000067F000040020000E00000000AC86A-000000067F000040020000E00000000B5227__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000000B0000-000000067F000040020000E00000000B4000__000000574B7FF240 000000067F000040020000E00000000B0000-000000067F000040020000E00000000B4000__00000073AD3FE6B8 000000067F000040020000E00000000B0000-000000067F000040020000E00000000B4000__000000914E3F38F0 000000067F000040020000E00000000B0000-000000067F000040020000E00000000B4000__000000931B9A2710 000000067F000040020000E00000000B4000-000000067F000040020000E00000000B8000__000000574B7FF240 000000067F000040020000E00000000B4000-000000067F000040020000E00000000B8000__00000073AD3FE6B8 000000067F000040020000E00000000B4000-000000067F000040020000E00000000B8000__000000914E3F38F0 000000067F000040020000E00000000B4000-000000067F000040020000E00000000B8000__000000931B9A2710 000000067F000040020000E00000000B5227-000000067F000040020000E00000000BDBEB__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000000B8000-000000067F000040020000E00000000BC000__000000574B7FF240 000000067F000040020000E00000000B8000-000000067F000040020000E00000000BC000__00000073AD3FE6B8 
000000067F000040020000E00000000B8000-000000067F000040020000E00000000BC000__000000914E3F38F0 000000067F000040020000E00000000B8000-000000067F000040020000E00000000BC000__000000931B9A2710 000000067F000040020000E00000000BC000-000000067F000040020000E00000000C0000__000000574B7FF240 000000067F000040020000E00000000BC000-000000067F000040020000E00000000C0000__00000073AD3FE6B8 000000067F000040020000E00000000BC000-000000067F000040020000E00000000C0000__000000914E3F38F0 000000067F000040020000E00000000BC000-000000067F000040020000E00000000C0000__000000931B9A2710 000000067F000040020000E00000000BDBEB-000000067F000040020000E00000000C65F2__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000000C0000-000000067F000040020000E00000000C4000__000000574B7FF240 000000067F000040020000E00000000C0000-000000067F000040020000E00000000C4000__00000073AD3FE6B8 000000067F000040020000E00000000C0000-000000067F000040020000E00000000C4000__000000914E3F38F0 000000067F000040020000E00000000C0000-000000067F000040020000E00000000C4000__000000931B9A2710 000000067F000040020000E00000000C4000-000000067F000040020000E00000000C8000__000000574B7FF240 000000067F000040020000E00000000C4000-000000067F000040020000E00000000C8000__00000073AD3FE6B8 000000067F000040020000E00000000C4000-000000067F000040020000E00000000C8000__000000914E3F38F0 000000067F000040020000E00000000C4000-000000067F000040020000E00000000C8000__000000931B9A2710 000000067F000040020000E00000000C65F2-000000067F000040020000E00000000CEFF3__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000000C8000-000000067F000040020000E00000000CC000__000000574B7FF240 000000067F000040020000E00000000C8000-000000067F000040020000E00000000CC000__00000073AD3FE6B8 000000067F000040020000E00000000C8000-000000067F000040020000E00000000CC000__000000914E3F38F0 000000067F000040020000E00000000C8000-000000067F000040020000E00000000CC000__000000931B9A2710 000000067F000040020000E00000000CC000-000000067F000040020000E00000000D0000__000000574B7FF240 
000000067F000040020000E00000000CC000-000000067F000040020000E00000000D0000__00000073AD3FE6B8 000000067F000040020000E00000000CC000-000000067F000040020000E00000000D0000__000000914E3F38F0 000000067F000040020000E00000000CC000-000000067F000040020000E00000000D0000__000000931B9A2710 000000067F000040020000E00000000CEFF3-000000067F000040020000E00000000D79E6__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000000D0000-000000067F000040020000E00000000D4000__000000574B7FF240 000000067F000040020000E00000000D0000-000000067F000040020000E00000000D4000__00000073AD3FE6B8 000000067F000040020000E00000000D0000-000000067F000040020000E00000000D4000__000000914E3F38F0 000000067F000040020000E00000000D0000-000000067F000040020000E00000000D4000__000000931B9A2710 000000067F000040020000E00000000D4000-000000067F000040020000E00000000D8000__000000574B7FF240 000000067F000040020000E00000000D4000-000000067F000040020000E00000000D8000__00000073AD3FE6B8 000000067F000040020000E00000000D4000-000000067F000040020000E00000000D8000__000000914E3F38F0 000000067F000040020000E00000000D4000-000000067F000040020000E00000000D8000__000000931B9A2710 000000067F000040020000E00000000D79E6-000000067F000040020000E00000000E03C4__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000000D8000-000000067F000040020000E00000000DC000__000000574B7FF240 000000067F000040020000E00000000D8000-000000067F000040020000E00000000DC000__00000073AD3FE6B8 000000067F000040020000E00000000D8000-000000067F000040020000E00000000DC000__000000914E3F38F0 000000067F000040020000E00000000D8000-000000067F000040020000E00000000DC000__000000931B9A2710 000000067F000040020000E00000000DC000-000000067F000040020000E00000000E0000__000000574B7FF240 000000067F000040020000E00000000DC000-000000067F000040020000E00000000E0000__00000073AD3FE6B8 000000067F000040020000E00000000DC000-000000067F000040020000E00000000E0000__000000914E3F38F0 000000067F000040020000E00000000DC000-000000067F000040020000E00000000E0000__000000931B9A2710 
000000067F000040020000E00000000E0000-000000067F000040020000E00000000E4000__000000574B7FF240 000000067F000040020000E00000000E0000-000000067F000040020000E00000000E4000__00000073AD3FE6B8 000000067F000040020000E00000000E0000-000000067F000040020000E00000000E4000__000000914E3F38F0 000000067F000040020000E00000000E0000-000000067F000040020000E00000000E4000__000000931B9A2710 000000067F000040020000E00000000E03C4-000000067F000040020000E00000000E8D95__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000000E4000-000000067F000040020000E00000000E8000__000000574B7FF240 000000067F000040020000E00000000E4000-000000067F000040020000E00000000E8000__00000073AD3FE6B8 000000067F000040020000E00000000E4000-000000067F000040020000E00000000E8000__000000914E3F38F0 000000067F000040020000E00000000E4000-000000067F000040020000E00000000E8000__000000931B9A2710 000000067F000040020000E00000000E8000-000000067F000040020000E00000000EC000__000000574B7FF240 000000067F000040020000E00000000E8000-000000067F000040020000E00000000EC000__00000073AD3FE6B8 000000067F000040020000E00000000E8000-000000067F000040020000E00000000EC000__000000914E3F38F0 000000067F000040020000E00000000E8000-000000067F000040020000E00000000EC000__000000931B9A2710 000000067F000040020000E00000000E8D95-000000067F000040020000E00000000F175E__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000000EC000-000000067F000040020000E00000000F0000__000000574B7FF240 000000067F000040020000E00000000EC000-000000067F000040020000E00000000F0000__00000073AD3FE6B8 000000067F000040020000E00000000EC000-000000067F000040020000E00000000F0000__000000914E3F38F0 000000067F000040020000E00000000EC000-000000067F000040020000E00000000F0000__000000931B9A2710 000000067F000040020000E00000000F0000-000000067F000040020000E00000000F4000__000000574B7FF240 000000067F000040020000E00000000F0000-000000067F000040020000E00000000F4000__00000073AD3FE6B8 000000067F000040020000E00000000F0000-000000067F000040020000E00000000F4000__000000914E3F38F0 
000000067F000040020000E00000000F0000-000000067F000040020000E00000000F4000__000000931B9A2710 000000067F000040020000E00000000F175E-000000067F000040020000E00000000FA122__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000000F4000-000000067F000040020000E00000000F8000__000000574B7FF240 000000067F000040020000E00000000F4000-000000067F000040020000E00000000F8000__00000073AD3FE6B8 000000067F000040020000E00000000F4000-000000067F000040020000E00000000F8000__000000914E3F38F0 000000067F000040020000E00000000F4000-000000067F000040020000E00000000F8000__000000931B9A2710 000000067F000040020000E00000000F8000-000000067F000040020000E00000000FC000__000000574B7FF240 000000067F000040020000E00000000F8000-000000067F000040020000E00000000FC000__00000073AD3FE6B8 000000067F000040020000E00000000F8000-000000067F000040020000E00000000FC000__000000914E3F38F0 000000067F000040020000E00000000F8000-000000067F000040020000E00000000FC000__000000931B9A2710 000000067F000040020000E00000000FA122-000000067F000040020000E0000000102B0A__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000000FC000-000000067F000040020000E0000000100000__000000574B7FF240 000000067F000040020000E00000000FC000-000000067F000040020000E0000000100000__00000073AD3FE6B8 000000067F000040020000E00000000FC000-000000067F000040020000E0000000100000__000000914E3F38F0 000000067F000040020000E00000000FC000-000000067F000040020000E0000000100000__000000931B9A2710 000000067F000040020000E0000000100000-000000067F000040020000E0000000104000__000000574B7FF240 000000067F000040020000E0000000100000-000000067F000040020000E0000000104000__00000073AD3FE6B8 000000067F000040020000E0000000100000-000000067F000040020000E0000000104000__000000914E3F38F0 000000067F000040020000E0000000100000-000000067F000040020000E0000000104000__000000931B9A2710 000000067F000040020000E0000000102B0A-000000067F000040020000E000000010B4F8__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000104000-000000067F000040020000E0000000108000__000000574B7FF240 
000000067F000040020000E0000000104000-000000067F000040020000E0000000108000__00000073AD3FE6B8 000000067F000040020000E0000000104000-000000067F000040020000E0000000108000__000000914E3F38F0 000000067F000040020000E0000000104000-000000067F000040020000E0000000108000__000000931B9A2710 000000067F000040020000E0000000108000-000000067F000040020000E000000010C000__000000574B7FF240 000000067F000040020000E0000000108000-000000067F000040020000E000000010C000__00000073AD3FE6B8 000000067F000040020000E0000000108000-000000067F000040020000E000000010C000__000000914E3F38F0 000000067F000040020000E0000000108000-000000067F000040020000E000000010C000__000000931B9A2710 000000067F000040020000E000000010B4F8-000000067F000040020000E0000000113EEA__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000010C000-000000067F000040020000E0000000110000__000000574B7FF240 000000067F000040020000E000000010C000-000000067F000040020000E0000000110000__00000073AD3FE6B8 000000067F000040020000E000000010C000-000000067F000040020000E0000000110000__000000914E3F38F0 000000067F000040020000E000000010C000-000000067F000040020000E0000000110000__000000931B9A2710 000000067F000040020000E0000000110000-000000067F000040020000E0000000114000__000000574B7FF240 000000067F000040020000E0000000110000-000000067F000040020000E0000000114000__00000073AD3FE6B8 000000067F000040020000E0000000110000-000000067F000040020000E0000000114000__000000914E3F38F0 000000067F000040020000E0000000110000-000000067F000040020000E0000000114000__000000931B9A2710 000000067F000040020000E0000000113EEA-000000067F000040020000E000000011C8D2__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000114000-000000067F000040020000E0000000118000__000000574B7FF240 000000067F000040020000E0000000114000-000000067F000040020000E0000000118000__00000073AD3FE6B8 000000067F000040020000E0000000114000-000000067F000040020000E0000000118000__000000914E3F38F0 000000067F000040020000E0000000114000-000000067F000040020000E0000000118000__000000931B9A2710 
000000067F000040020000E0000000118000-000000067F000040020000E000000011C000__000000574B7FF240 000000067F000040020000E0000000118000-000000067F000040020000E000000011C000__00000073AD3FE6B8 000000067F000040020000E0000000118000-000000067F000040020000E000000011C000__000000914E3F38F0 000000067F000040020000E0000000118000-000000067F000040020000E000000011C000__000000931B9A2710 000000067F000040020000E000000011C000-000000067F000040020000E0000000120000__000000574B7FF240 000000067F000040020000E000000011C000-000000067F000040020000E0000000120000__00000073AD3FE6B8 000000067F000040020000E000000011C000-000000067F000040020000E0000000120000__000000914E3F38F0 000000067F000040020000E000000011C000-000000067F000040020000E0000000120000__000000931B9A2710 000000067F000040020000E000000011C8D2-000000067F000040020000E00000001252A2__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000120000-000000067F000040020000E0000000124000__000000574B7FF240 000000067F000040020000E0000000120000-000000067F000040020000E0000000124000__00000073AD3FE6B8 000000067F000040020000E0000000120000-000000067F000040020000E0000000124000__000000914E3F38F0 000000067F000040020000E0000000120000-000000067F000040020000E0000000124000__000000931B9A2710 000000067F000040020000E0000000124000-000000067F000040020000E0000000128000__000000574B7FF240 000000067F000040020000E0000000124000-000000067F000040020000E0000000128000__00000073AD3FE6B8 000000067F000040020000E0000000124000-000000067F000040020000E0000000128000__000000914E3F38F0 000000067F000040020000E0000000124000-000000067F000040020000E0000000128000__000000931B9A2710 000000067F000040020000E00000001252A2-000000067F000040020000E000000012DC5E__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000128000-000000067F000040020000E000000012C000__000000574B7FF240 000000067F000040020000E0000000128000-000000067F000040020000E000000012C000__00000073AD3FE6B8 000000067F000040020000E0000000128000-000000067F000040020000E000000012C000__000000914E3F38F0 
000000067F000040020000E0000000128000-000000067F000040020000E000000012C000__000000931B9A2710 000000067F000040020000E000000012C000-000000067F000040020000E0000000130000__000000574B7FF240 000000067F000040020000E000000012C000-000000067F000040020000E0000000130000__00000073AD3FE6B8 000000067F000040020000E000000012C000-000000067F000040020000E0000000130000__000000914E3F38F0 000000067F000040020000E000000012C000-000000067F000040020000E0000000130000__000000931B9A2710 000000067F000040020000E000000012DC5E-000000067F000040020000E0000000136629__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000130000-000000067F000040020000E0000000134000__000000574B7FF240 000000067F000040020000E0000000130000-000000067F000040020000E0000000134000__00000073AD3FE6B8 000000067F000040020000E0000000130000-000000067F000040020000E0000000134000__000000914E3F38F0 000000067F000040020000E0000000130000-000000067F000040020000E0000000134000__000000931B9A2710 000000067F000040020000E0000000134000-000000067F000040020000E0000000138000__000000574B7FF240 000000067F000040020000E0000000134000-000000067F000040020000E0000000138000__00000073AD3FE6B8 000000067F000040020000E0000000134000-000000067F000040020000E0000000138000__000000914E3F38F0 000000067F000040020000E0000000134000-000000067F000040020000E0000000138000__000000931B9A2710 000000067F000040020000E0000000136629-000000067F000040020000E000000013F013__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000138000-000000067F000040020000E000000013C000__000000574B7FF240 000000067F000040020000E0000000138000-000000067F000040020000E000000013C000__00000073AD3FE6B8 000000067F000040020000E0000000138000-000000067F000040020000E000000013C000__000000914E3F38F0 000000067F000040020000E0000000138000-000000067F000040020000E000000013C000__000000931B9A2710 000000067F000040020000E000000013C000-000000067F000040020000E0000000140000__000000574B7FF240 000000067F000040020000E000000013C000-000000067F000040020000E0000000140000__00000073AD3FE6B8 
000000067F000040020000E000000013C000-000000067F000040020000E0000000140000__000000914E3F38F0 000000067F000040020000E000000013C000-000000067F000040020000E0000000140000__000000931B9A2710 000000067F000040020000E000000013F013-000000067F000040020000E0000000147A01__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000140000-000000067F000040020000E0000000144000__000000574B7FF240 000000067F000040020000E0000000140000-000000067F000040020000E0000000144000__00000073AD3FE6B8 000000067F000040020000E0000000140000-000000067F000040020000E0000000144000__000000914E3F38F0 000000067F000040020000E0000000140000-000000067F000040020000E0000000144000__000000931B9A2710 000000067F000040020000E0000000144000-000000067F000040020000E0000000148000__000000574B7FF240 000000067F000040020000E0000000144000-000000067F000040020000E0000000148000__00000073AD3FE6B8 000000067F000040020000E0000000144000-000000067F000040020000E0000000148000__000000914E3F38F0 000000067F000040020000E0000000144000-000000067F000040020000E0000000148000__000000931B9A2710 000000067F000040020000E0000000147A01-000000067F000040020000E00000001503DC__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000148000-000000067F000040020000E000000014C000__000000574B7FF240 000000067F000040020000E0000000148000-000000067F000040020000E000000014C000__00000073AD3FE6B8 000000067F000040020000E0000000148000-000000067F000040020000E000000014C000__000000914E3F38F0 000000067F000040020000E0000000148000-000000067F000040020000E000000014C000__000000931B9A2710 000000067F000040020000E000000014C000-000000067F000040020000E0000000150000__000000574B7FF240 000000067F000040020000E000000014C000-000000067F000040020000E0000000150000__00000073AD3FE6B8 000000067F000040020000E000000014C000-000000067F000040020000E0000000150000__000000914E3F38F0 000000067F000040020000E000000014C000-000000067F000040020000E0000000150000__000000931B9A2710 000000067F000040020000E0000000150000-000000067F000040020000E0000000154000__000000574B7FF240 
000000067F000040020000E0000000150000-000000067F000040020000E0000000154000__00000073AD3FE6B8 000000067F000040020000E0000000150000-000000067F000040020000E0000000154000__000000914E3F38F0 000000067F000040020000E0000000150000-000000067F000040020000E0000000154000__000000931B9A2710 000000067F000040020000E00000001503DC-000000067F000040020000E0000000158DC2__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000154000-000000067F000040020000E0000000158000__000000574B7FF240 000000067F000040020000E0000000154000-000000067F000040020000E0000000158000__00000073AD3FE6B8 000000067F000040020000E0000000154000-000000067F000040020000E0000000158000__000000914E3F38F0 000000067F000040020000E0000000154000-000000067F000040020000E0000000158000__000000931B9A2710 000000067F000040020000E0000000158000-000000067F000040020000E000000015C000__000000574B7FF240 000000067F000040020000E0000000158000-000000067F000040020000E000000015C000__00000073AD3FE6B8 000000067F000040020000E0000000158000-000000067F000040020000E000000015C000__000000914E3F38F0 000000067F000040020000E0000000158000-000000067F000040020000E000000015C000__000000931B9A2710 000000067F000040020000E0000000158DC2-000000067F000040020000E000000016178D__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000015C000-000000067F000040020000E0000000160000__000000574B7FF240 000000067F000040020000E000000015C000-000000067F000040020000E0000000160000__00000073AD3FE6B8 000000067F000040020000E000000015C000-000000067F000040020000E0000000160000__000000914E3F38F0 000000067F000040020000E000000015C000-000000067F000040020000E0000000160000__000000931B9A2710 000000067F000040020000E0000000160000-000000067F000040020000E0000000164000__000000574B7FF240 000000067F000040020000E0000000160000-000000067F000040020000E0000000164000__00000073AD3FE6B8 000000067F000040020000E0000000160000-000000067F000040020000E0000000164000__000000914E3F38F0 000000067F000040020000E0000000160000-000000067F000040020000E0000000164000__000000931B9A2710 
000000067F000040020000E000000016178D-000000067F000040020000E000000016A148__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000164000-000000067F000040020000E0000000168000__000000574B7FF240 000000067F000040020000E0000000164000-000000067F000040020000E0000000168000__00000073AD3FE6B8 000000067F000040020000E0000000164000-000000067F000040020000E0000000168000__000000914E3F38F0 000000067F000040020000E0000000164000-000000067F000040020000E0000000168000__000000931B9A2710 000000067F000040020000E0000000168000-000000067F000040020000E000000016C000__000000574B7FF240 000000067F000040020000E0000000168000-000000067F000040020000E000000016C000__00000073AD3FE6B8 000000067F000040020000E0000000168000-000000067F000040020000E000000016C000__000000914E3F38F0 000000067F000040020000E0000000168000-000000067F000040020000E000000016C000__000000931B9A2710 000000067F000040020000E000000016A148-000000067F000040020000E0000000172B20__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000016C000-000000067F000040020000E0000000170000__000000574B7FF240 000000067F000040020000E000000016C000-000000067F000040020000E0000000170000__00000073AD3FE6B8 000000067F000040020000E000000016C000-000000067F000040020000E0000000170000__000000914E3F38F0 000000067F000040020000E000000016C000-000000067F000040020000E0000000170000__000000931B9A2710 000000067F000040020000E0000000170000-000000067F000040020000E0000000174000__000000574B7FF240 000000067F000040020000E0000000170000-000000067F000040020000E0000000174000__00000073AD3FE6B8 000000067F000040020000E0000000170000-000000067F000040020000E0000000174000__000000914E3F38F0 000000067F000040020000E0000000170000-000000067F000040020000E0000000174000__000000931B9A2710 000000067F000040020000E0000000172B20-000000067F000040020000E000000017B50C__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000174000-000000067F000040020000E0000000178000__000000574B7FF240 000000067F000040020000E0000000174000-000000067F000040020000E0000000178000__00000073AD3FE6B8 
000000067F000040020000E0000000174000-000000067F000040020000E0000000178000__000000914E3F38F0 000000067F000040020000E0000000174000-000000067F000040020000E0000000178000__000000931B9A2710 000000067F000040020000E0000000178000-000000067F000040020000E000000017C000__000000574B7FF240 000000067F000040020000E0000000178000-000000067F000040020000E000000017C000__00000073AD3FE6B8 000000067F000040020000E0000000178000-000000067F000040020000E000000017C000__000000914E3F38F0 000000067F000040020000E0000000178000-000000067F000040020000E000000017C000__000000931B9A2710 000000067F000040020000E000000017B50C-000000067F000040020000E0000000183EF9__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000017C000-000000067F000040020000E0000000180000__000000574B7FF240 000000067F000040020000E000000017C000-000000067F000040020000E0000000180000__00000073AD3FE6B8 000000067F000040020000E000000017C000-000000067F000040020000E0000000180000__000000914E3F38F0 000000067F000040020000E000000017C000-000000067F000040020000E0000000180000__000000931B9A2710 000000067F000040020000E0000000180000-000000067F000040020000E0000000184000__000000574B7FF240 000000067F000040020000E0000000180000-000000067F000040020000E0000000184000__00000073AD3FE6B8 000000067F000040020000E0000000180000-000000067F000040020000E0000000184000__000000914E3F38F0 000000067F000040020000E0000000180000-000000067F000040020000E0000000184000__000000931B9A2710 000000067F000040020000E0000000183EF9-000000067F000040020000E000000018C8E8__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000184000-000000067F000040020000E0000000188000__000000574B7FF240 000000067F000040020000E0000000184000-000000067F000040020000E0000000188000__00000073AD3FE6B8 000000067F000040020000E0000000184000-000000067F000040020000E0000000188000__000000914E3F38F0 000000067F000040020000E0000000184000-000000067F000040020000E0000000188000__000000931B9A2710 000000067F000040020000E0000000188000-000000067F000040020000E000000018C000__000000574B7FF240 
000000067F000040020000E0000000188000-000000067F000040020000E000000018C000__00000073AD3FE6B8 000000067F000040020000E0000000188000-000000067F000040020000E000000018C000__000000914E3F38F0 000000067F000040020000E0000000188000-000000067F000040020000E000000018C000__000000931B9A2710 000000067F000040020000E000000018C000-000000067F000040020000E0000000190000__000000574B7FF240 000000067F000040020000E000000018C000-000000067F000040020000E0000000190000__00000073AD3FE6B8 000000067F000040020000E000000018C000-000000067F000040020000E0000000190000__000000914E3F38F0 000000067F000040020000E000000018C000-000000067F000040020000E0000000190000__000000931B9A2710 000000067F000040020000E000000018C8E8-000000067F000040020000E00000001952CE__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000190000-000000067F000040020000E0000000194000__000000574B7FF240 000000067F000040020000E0000000190000-000000067F000040020000E0000000194000__00000073AD3FE6B8 000000067F000040020000E0000000190000-000000067F000040020000E0000000194000__000000914E3F38F0 000000067F000040020000E0000000190000-000000067F000040020000E0000000194000__000000931B9A2710 000000067F000040020000E0000000194000-000000067F000040020000E0000000198000__000000574B7FF240 000000067F000040020000E0000000194000-000000067F000040020000E0000000198000__00000073AD3FE6B8 000000067F000040020000E0000000194000-000000067F000040020000E0000000198000__000000914E3F38F0 000000067F000040020000E0000000194000-000000067F000040020000E0000000198000__000000931B9A2710 000000067F000040020000E00000001952CE-000000067F000040020000E000000019DC94__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000198000-000000067F000040020000E000000019C000__000000574B7FF240 000000067F000040020000E0000000198000-000000067F000040020000E000000019C000__00000073AD3FE6B8 000000067F000040020000E0000000198000-000000067F000040020000E000000019C000__000000914E3F38F0 000000067F000040020000E0000000198000-000000067F000040020000E000000019C000__000000931B9A2710 
000000067F000040020000E000000019C000-000000067F000040020000E00000001A0000__000000574B7FF240 000000067F000040020000E000000019C000-000000067F000040020000E00000001A0000__00000073AD3FE6B8 000000067F000040020000E000000019C000-000000067F000040020000E00000001A0000__000000914E3F38F0 000000067F000040020000E000000019C000-000000067F000040020000E00000001A0000__000000931B9A2710 000000067F000040020000E000000019DC94-000000067F000040020000E00000001A6650__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000001A0000-000000067F000040020000E00000001A4000__000000574B7FF240 000000067F000040020000E00000001A0000-000000067F000040020000E00000001A4000__00000073AD3FE6B8 000000067F000040020000E00000001A0000-000000067F000040020000E00000001A4000__000000914E3F38F0 000000067F000040020000E00000001A0000-000000067F000040020000E00000001A4000__000000931B9A2710 000000067F000040020000E00000001A4000-000000067F000040020000E00000001A8000__000000574B7FF240 000000067F000040020000E00000001A4000-000000067F000040020000E00000001A8000__00000073AD3FE6B8 000000067F000040020000E00000001A4000-000000067F000040020000E00000001A8000__000000914E3F38F0 000000067F000040020000E00000001A4000-000000067F000040020000E00000001A8000__000000931B9A2710 000000067F000040020000E00000001A6650-000000067F000040020000E00000001AF031__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000001A8000-000000067F000040020000E00000001AC000__000000574B7FF240 000000067F000040020000E00000001A8000-000000067F000040020000E00000001AC000__00000073AD3FE6B8 000000067F000040020000E00000001A8000-000000067F000040020000E00000001AC000__000000914E3F38F0 000000067F000040020000E00000001A8000-000000067F000040020000E00000001AC000__000000931B9A2710 000000067F000040020000E00000001AC000-000000067F000040020000E00000001B0000__000000574B7FF240 000000067F000040020000E00000001AC000-000000067F000040020000E00000001B0000__00000073AD3FE6B8 000000067F000040020000E00000001AC000-000000067F000040020000E00000001B0000__000000914E3F38F0 
000000067F000040020000E00000001AC000-000000067F000040020000E00000001B0000__000000931B9A2710 000000067F000040020000E00000001AF031-000000067F000040020000E00000001B7A19__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000001B0000-000000067F000040020000E00000001B4000__000000574B7FF240 000000067F000040020000E00000001B0000-000000067F000040020000E00000001B4000__00000073AD3FE6B8 000000067F000040020000E00000001B0000-000000067F000040020000E00000001B4000__000000914E3F38F0 000000067F000040020000E00000001B0000-000000067F000040020000E00000001B4000__000000931B9A2710 000000067F000040020000E00000001B4000-000000067F000040020000E00000001B8000__000000574B7FF240 000000067F000040020000E00000001B4000-000000067F000040020000E00000001B8000__00000073AD3FE6B8 000000067F000040020000E00000001B4000-000000067F000040020000E00000001B8000__000000914E3F38F0 000000067F000040020000E00000001B4000-000000067F000040020000E00000001B8000__000000931B9A2710 000000067F000040020000E00000001B7A19-000000067F000040020000E00000001C0402__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000001B8000-000000067F000040020000E00000001BC000__000000574B7FF240 000000067F000040020000E00000001B8000-000000067F000040020000E00000001BC000__00000073AD3FE6B8 000000067F000040020000E00000001B8000-000000067F000040020000E00000001BC000__000000914E3F38F0 000000067F000040020000E00000001B8000-000000067F000040020000E00000001BC000__000000931B9A2710 000000067F000040020000E00000001BC000-000000067F000040020000E00000001C0000__000000574B7FF240 000000067F000040020000E00000001BC000-000000067F000040020000E00000001C0000__00000073AD3FE6B8 000000067F000040020000E00000001BC000-000000067F000040020000E00000001C0000__000000914E3F38F0 000000067F000040020000E00000001BC000-000000067F000040020000E00000001C0000__000000931B9A2710 000000067F000040020000E00000001C0000-000000067F000040020000E00000001C4000__000000574B7FF240 000000067F000040020000E00000001C0000-000000067F000040020000E00000001C4000__00000073AD3FE6B8 
000000067F000040020000E00000001C0000-000000067F000040020000E00000001C4000__000000914E3F38F0 000000067F000040020000E00000001C0000-000000067F000040020000E00000001C4000__000000931B9A2710 000000067F000040020000E00000001C0402-000000067F000040020000E00000001C8DD6__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000001C4000-000000067F000040020000E00000001C8000__000000574B7FF240 000000067F000040020000E00000001C4000-000000067F000040020000E00000001C8000__00000073AD3FE6B8 000000067F000040020000E00000001C4000-000000067F000040020000E00000001C8000__000000914E3F38F0 000000067F000040020000E00000001C4000-000000067F000040020000E00000001C8000__000000931B9A2710 000000067F000040020000E00000001C8000-000000067F000040020000E00000001CC000__000000574B7FF240 000000067F000040020000E00000001C8000-000000067F000040020000E00000001CC000__00000073AD3FE6B8 000000067F000040020000E00000001C8000-000000067F000040020000E00000001CC000__000000914E3F38F0 000000067F000040020000E00000001C8000-000000067F000040020000E00000001CC000__000000931B9A2710 000000067F000040020000E00000001C8DD6-000000067F000040020000E00000001D17B3__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000001CC000-000000067F000040020000E00000001D0000__000000574B7FF240 000000067F000040020000E00000001CC000-000000067F000040020000E00000001D0000__00000073AD3FE6B8 000000067F000040020000E00000001CC000-000000067F000040020000E00000001D0000__000000914E3F38F0 000000067F000040020000E00000001CC000-000000067F000040020000E00000001D0000__000000931B9A2710 000000067F000040020000E00000001D0000-000000067F000040020000E00000001D4000__000000574B7FF240 000000067F000040020000E00000001D0000-000000067F000040020000E00000001D4000__00000073AD3FE6B8 000000067F000040020000E00000001D0000-000000067F000040020000E00000001D4000__000000914E3F38F0 000000067F000040020000E00000001D0000-000000067F000040020000E00000001D4000__000000931B9A2710 000000067F000040020000E00000001D17B3-000000067F000040020000E00000001DA183__0000003B6A0FFB09-00000047441DEA39 
000000067F000040020000E00000001D4000-000000067F000040020000E00000001D8000__000000574B7FF240 000000067F000040020000E00000001D4000-000000067F000040020000E00000001D8000__00000073AD3FE6B8 000000067F000040020000E00000001D4000-000000067F000040020000E00000001D8000__000000914E3F38F0 000000067F000040020000E00000001D4000-000000067F000040020000E00000001D8000__000000931B9A2710 000000067F000040020000E00000001D8000-000000067F000040020000E00000001DC000__000000574B7FF240 000000067F000040020000E00000001D8000-000000067F000040020000E00000001DC000__00000073AD3FE6B8 000000067F000040020000E00000001D8000-000000067F000040020000E00000001DC000__000000914E3F38F0 000000067F000040020000E00000001D8000-000000067F000040020000E00000001DC000__000000931B9A2710 000000067F000040020000E00000001DA183-000000067F000040020000E00000001E2B47__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000001DC000-000000067F000040020000E00000001E0000__000000574B7FF240 000000067F000040020000E00000001DC000-000000067F000040020000E00000001E0000__00000073AD3FE6B8 000000067F000040020000E00000001DC000-000000067F000040020000E00000001E0000__000000914E3F38F0 000000067F000040020000E00000001DC000-000000067F000040020000E00000001E0000__000000931B9A2710 000000067F000040020000E00000001E0000-000000067F000040020000E00000001E4000__000000574B7FF240 000000067F000040020000E00000001E0000-000000067F000040020000E00000001E4000__00000073AD3FE6B8 000000067F000040020000E00000001E0000-000000067F000040020000E00000001E4000__000000914E3F38F0 000000067F000040020000E00000001E0000-000000067F000040020000E00000001E4000__000000931B9A2710 000000067F000040020000E00000001E2B47-000000067F000040020000E00000001EB52B__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000001E4000-000000067F000040020000E00000001E8000__000000574B7FF240 000000067F000040020000E00000001E4000-000000067F000040020000E00000001E8000__00000073AD3FE6B8 000000067F000040020000E00000001E4000-000000067F000040020000E00000001E8000__000000914E3F38F0 
000000067F000040020000E00000001E4000-000000067F000040020000E00000001E8000__000000931B9A2710 000000067F000040020000E00000001E8000-000000067F000040020000E00000001EC000__000000574B7FF240 000000067F000040020000E00000001E8000-000000067F000040020000E00000001EC000__00000073AD3FE6B8 000000067F000040020000E00000001E8000-000000067F000040020000E00000001EC000__000000914E3F38F0 000000067F000040020000E00000001E8000-000000067F000040020000E00000001EC000__000000931B9A2710 000000067F000040020000E00000001EB52B-000000067F000040020000E00000001F3F12__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000001EC000-000000067F000040020000E00000001F0000__000000574B7FF240 000000067F000040020000E00000001EC000-000000067F000040020000E00000001F0000__00000073AD3FE6B8 000000067F000040020000E00000001EC000-000000067F000040020000E00000001F0000__000000914E3F38F0 000000067F000040020000E00000001EC000-000000067F000040020000E00000001F0000__000000931B9A2710 000000067F000040020000E00000001F0000-000000067F000040020000E00000001F4000__000000574B7FF240 000000067F000040020000E00000001F0000-000000067F000040020000E00000001F4000__00000073AD3FE6B8 000000067F000040020000E00000001F0000-000000067F000040020000E00000001F4000__000000914E3F38F0 000000067F000040020000E00000001F0000-000000067F000040020000E00000001F4000__000000931B9A2710 000000067F000040020000E00000001F3F12-000000067F000040020000E00000001FC902__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000001F4000-000000067F000040020000E00000001F8000__000000574B7FF240 000000067F000040020000E00000001F4000-000000067F000040020000E00000001F8000__00000073AD3FE6B8 000000067F000040020000E00000001F4000-000000067F000040020000E00000001F8000__000000914E3F38F0 000000067F000040020000E00000001F4000-000000067F000040020000E00000001F8000__000000931B9A2710 000000067F000040020000E00000001F8000-000000067F000040020000E00000001FC000__000000574B7FF240 000000067F000040020000E00000001F8000-000000067F000040020000E00000001FC000__00000073AD3FE6B8 
000000067F000040020000E00000001F8000-000000067F000040020000E00000001FC000__000000914E3F38F0 000000067F000040020000E00000001F8000-000000067F000040020000E00000001FC000__000000931B9A2710 000000067F000040020000E00000001FC000-000000067F000040020000E0000000200000__000000574B7FF240 000000067F000040020000E00000001FC000-000000067F000040020000E0000000200000__00000073AD3FE6B8 000000067F000040020000E00000001FC000-000000067F000040020000E0000000200000__000000914E3F38F0 000000067F000040020000E00000001FC000-000000067F000040020000E0000000200000__000000931B9A2710 000000067F000040020000E00000001FC902-000000067F000040020000E00000002052D8__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000200000-000000067F000040020000E0000000204000__000000574B7FF240 000000067F000040020000E0000000200000-000000067F000040020000E0000000204000__00000073AD3FE6B8 000000067F000040020000E0000000200000-000000067F000040020000E0000000204000__000000914E3F38F0 000000067F000040020000E0000000200000-000000067F000040020000E0000000204000__000000931B9A2710 000000067F000040020000E0000000204000-000000067F000040020000E0000000208000__000000574B7FF240 000000067F000040020000E0000000204000-000000067F000040020000E0000000208000__00000073AD3FE6B8 000000067F000040020000E0000000204000-000000067F000040020000E0000000208000__000000914E3F38F0 000000067F000040020000E0000000204000-000000067F000040020000E0000000208000__000000931B9A2710 000000067F000040020000E00000002052D8-000000067F000040020000E000000020DCB6__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000208000-000000067F000040020000E000000020C000__000000574B7FF240 000000067F000040020000E0000000208000-000000067F000040020000E000000020C000__00000073AD3FE6B8 000000067F000040020000E0000000208000-000000067F000040020000E000000020C000__000000914E3F38F0 000000067F000040020000E0000000208000-000000067F000040020000E000000020C000__000000931B9A2710 000000067F000040020000E000000020C000-000000067F000040020000E0000000210000__000000574B7FF240 
000000067F000040020000E000000020C000-000000067F000040020000E0000000210000__00000073AD3FE6B8 000000067F000040020000E000000020C000-000000067F000040020000E0000000210000__000000914E3F38F0 000000067F000040020000E000000020C000-000000067F000040020000E0000000210000__000000931B9A2710 000000067F000040020000E000000020DCB6-000000067F000040020000E0000000216686__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000210000-000000067F000040020000E0000000214000__000000574B7FF240 000000067F000040020000E0000000210000-000000067F000040020000E0000000214000__00000073AD3FE6B8 000000067F000040020000E0000000210000-000000067F000040020000E0000000214000__000000914E3F38F0 000000067F000040020000E0000000210000-000000067F000040020000E0000000214000__000000931B9A2710 000000067F000040020000E0000000214000-000000067F000040020000E0000000218000__000000574B7FF240 000000067F000040020000E0000000214000-000000067F000040020000E0000000218000__00000073AD3FE6B8 000000067F000040020000E0000000214000-000000067F000040020000E0000000218000__000000914E3F38F0 000000067F000040020000E0000000214000-000000067F000040020000E0000000218000__000000931B9A2710 000000067F000040020000E0000000216686-000000067F000040020000E000000021F04B__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000218000-000000067F000040020000E000000021C000__000000574B7FF240 000000067F000040020000E0000000218000-000000067F000040020000E000000021C000__00000073AD3FE6B8 000000067F000040020000E0000000218000-000000067F000040020000E000000021C000__000000914E3F38F0 000000067F000040020000E0000000218000-000000067F000040020000E000000021C000__000000931B9A2710 000000067F000040020000E000000021C000-000000067F000040020000E0000000220000__000000574B7FF240 000000067F000040020000E000000021C000-000000067F000040020000E0000000220000__00000073AD3FE6B8 000000067F000040020000E000000021C000-000000067F000040020000E0000000220000__000000914E3F38F0 000000067F000040020000E000000021C000-000000067F000040020000E0000000220000__000000931B9A2710 
000000067F000040020000E000000021F04B-000000067F000040020000E0000000227A38__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000220000-000000067F000040020000E0000000224000__000000574B7FF240 000000067F000040020000E0000000220000-000000067F000040020000E0000000224000__00000073AD3FE6B8 000000067F000040020000E0000000220000-000000067F000040020000E0000000224000__000000914E3F38F0 000000067F000040020000E0000000220000-000000067F000040020000E0000000224000__000000931B9A2710 000000067F000040020000E0000000224000-000000067F000040020000E0000000228000__000000574B7FF240 000000067F000040020000E0000000224000-000000067F000040020000E0000000228000__00000073AD3FE6B8 000000067F000040020000E0000000224000-000000067F000040020000E0000000228000__000000914E3F38F0 000000067F000040020000E0000000224000-000000067F000040020000E0000000228000__000000931B9A2710 000000067F000040020000E0000000227A38-000000067F000040020000E0000000230422__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000228000-000000067F000040020000E000000022C000__000000574B7FF240 000000067F000040020000E0000000228000-000000067F000040020000E000000022C000__00000073AD3FE6B8 000000067F000040020000E0000000228000-000000067F000040020000E000000022C000__000000914E3F38F0 000000067F000040020000E0000000228000-000000067F000040020000E000000022C000__000000931B9A2710 000000067F000040020000E000000022C000-000000067F000040020000E0000000230000__000000574B7FF240 000000067F000040020000E000000022C000-000000067F000040020000E0000000230000__00000073AD3FE6B8 000000067F000040020000E000000022C000-000000067F000040020000E0000000230000__000000914E3F38F0 000000067F000040020000E000000022C000-000000067F000040020000E0000000230000__000000931B9A2710 000000067F000040020000E0000000230000-000000067F000040020000E0000000234000__000000574B7FF240 000000067F000040020000E0000000230000-000000067F000040020000E0000000234000__00000073AD3FE6B8 000000067F000040020000E0000000230000-000000067F000040020000E0000000234000__000000914E3F38F0 
000000067F000040020000E0000000230000-000000067F000040020000E0000000234000__000000931B9A2710 000000067F000040020000E0000000230422-000000067F000040020000E0000000238E0E__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000234000-000000067F000040020000E0000000238000__000000574B7FF240 000000067F000040020000E0000000234000-000000067F000040020000E0000000238000__00000073AD3FE6B8 000000067F000040020000E0000000234000-000000067F000040020000E0000000238000__000000914E3F38F0 000000067F000040020000E0000000234000-000000067F000040020000E0000000238000__000000931B9A2710 000000067F000040020000E0000000238000-000000067F000040020000E000000023C000__000000574B7FF240 000000067F000040020000E0000000238000-000000067F000040020000E000000023C000__00000073AD3FE6B8 000000067F000040020000E0000000238000-000000067F000040020000E000000023C000__000000914E3F38F0 000000067F000040020000E0000000238000-000000067F000040020000E000000023C000__000000931B9A2710 000000067F000040020000E0000000238E0E-000000067F000040020000E00000002417DF__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000023C000-000000067F000040020000E0000000240000__000000574B7FF240 000000067F000040020000E000000023C000-000000067F000040020000E0000000240000__00000073AD3FE6B8 000000067F000040020000E000000023C000-000000067F000040020000E0000000240000__000000914E3F38F0 000000067F000040020000E000000023C000-000000067F000040020000E0000000240000__000000931B9A2710 000000067F000040020000E0000000240000-000000067F000040020000E0000000244000__000000574B7FF240 000000067F000040020000E0000000240000-000000067F000040020000E0000000244000__00000073AD3FE6B8 000000067F000040020000E0000000240000-000000067F000040020000E0000000244000__000000914E3F38F0 000000067F000040020000E0000000240000-000000067F000040020000E0000000244000__000000931B9A2710 000000067F000040020000E00000002417DF-000000067F000040020000E000000024A1C0__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000244000-000000067F000040020000E0000000248000__000000574B7FF240 
000000067F000040020000E0000000244000-000000067F000040020000E0000000248000__00000073AD3FE6B8 000000067F000040020000E0000000244000-000000067F000040020000E0000000248000__000000914E3F38F0 000000067F000040020000E0000000244000-000000067F000040020000E0000000248000__000000931B9A2710 000000067F000040020000E0000000248000-000000067F000040020000E000000024C000__000000574B7FF240 000000067F000040020000E0000000248000-000000067F000040020000E000000024C000__00000073AD3FE6B8 000000067F000040020000E0000000248000-000000067F000040020000E000000024C000__000000914E3F38F0 000000067F000040020000E0000000248000-000000067F000040020000E000000024C000__000000931B9A2710 000000067F000040020000E000000024A1C0-000000067F000040020000E0000000252B80__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000024C000-000000067F000040020000E0000000250000__000000574B7FF240 000000067F000040020000E000000024C000-000000067F000040020000E0000000250000__00000073AD3FE6B8 000000067F000040020000E000000024C000-000000067F000040020000E0000000250000__000000914E3F38F0 000000067F000040020000E000000024C000-000000067F000040020000E0000000250000__000000931B9A2710 000000067F000040020000E0000000250000-000000067F000040020000E0000000254000__000000574B7FF240 000000067F000040020000E0000000250000-000000067F000040020000E0000000254000__00000073AD3FE6B8 000000067F000040020000E0000000250000-000000067F000040020000E0000000254000__000000914E3F38F0 000000067F000040020000E0000000250000-000000067F000040020000E0000000254000__000000931B9A2710 000000067F000040020000E0000000252B80-000000067F000040020000E000000025B542__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000254000-000000067F000040020000E0000000258000__000000574B7FF240 000000067F000040020000E0000000254000-000000067F000040020000E0000000258000__00000073AD3FE6B8 000000067F000040020000E0000000254000-000000067F000040020000E0000000258000__000000914E3F38F0 000000067F000040020000E0000000254000-000000067F000040020000E0000000258000__000000931B9A2710 
000000067F000040020000E0000000258000-000000067F000040020000E000000025C000__000000574B7FF240 000000067F000040020000E0000000258000-000000067F000040020000E000000025C000__00000073AD3FE6B8 000000067F000040020000E0000000258000-000000067F000040020000E000000025C000__000000914E3F38F0 000000067F000040020000E0000000258000-000000067F000040020000E000000025C000__000000931B9A2710 000000067F000040020000E000000025B542-000000067F000040020000E0000000263F2C__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000025C000-000000067F000040020000E0000000260000__000000574B7FF240 000000067F000040020000E000000025C000-000000067F000040020000E0000000260000__00000073AD3FE6B8 000000067F000040020000E000000025C000-000000067F000040020000E0000000260000__000000914E3F38F0 000000067F000040020000E000000025C000-000000067F000040020000E0000000260000__000000931B9A2710 000000067F000040020000E0000000260000-000000067F000040020000E0000000264000__000000574B7FF240 000000067F000040020000E0000000260000-000000067F000040020000E0000000264000__00000073AD3FE6B8 000000067F000040020000E0000000260000-000000067F000040020000E0000000264000__000000914E3F38F0 000000067F000040020000E0000000260000-000000067F000040020000E0000000264000__000000931B9A2710 000000067F000040020000E0000000263F2C-000000067F000040020000E000000026C925__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000264000-000000067F000040020000E0000000268000__000000574B7FF240 000000067F000040020000E0000000264000-000000067F000040020000E0000000268000__00000073AD3FE6B8 000000067F000040020000E0000000264000-000000067F000040020000E0000000268000__000000914E3F38F0 000000067F000040020000E0000000264000-000000067F000040020000E0000000268000__000000931B9A2710 000000067F000040020000E0000000268000-000000067F000040020000E000000026C000__000000574B7FF240 000000067F000040020000E0000000268000-000000067F000040020000E000000026C000__00000073AD3FE6B8 000000067F000040020000E0000000268000-000000067F000040020000E000000026C000__000000914E3F38F0 
000000067F000040020000E0000000268000-000000067F000040020000E000000026C000__000000931B9A2710 000000067F000040020000E000000026C000-000000067F000040020000E0000000270000__000000574B7FF240 000000067F000040020000E000000026C000-000000067F000040020000E0000000270000__00000073AD3FE6B8 000000067F000040020000E000000026C000-000000067F000040020000E0000000270000__000000914E3F38F0 000000067F000040020000E000000026C000-000000067F000040020000E0000000270000__000000931B9A2710 000000067F000040020000E000000026C925-000000067F000040020000E0000000275309__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000270000-000000067F000040020000E0000000274000__000000574B7FF240 000000067F000040020000E0000000270000-000000067F000040020000E0000000274000__00000073AD3FE6B8 000000067F000040020000E0000000270000-000000067F000040020000E0000000274000__000000914E3F38F0 000000067F000040020000E0000000270000-000000067F000040020000E0000000274000__000000931B9A2710 000000067F000040020000E0000000274000-000000067F000040020000E0000000278000__000000574B7FF240 000000067F000040020000E0000000274000-000000067F000040020000E0000000278000__00000073AD3FE6B8 000000067F000040020000E0000000274000-000000067F000040020000E0000000278000__000000914E3F38F0 000000067F000040020000E0000000274000-000000067F000040020000E0000000278000__000000931B9A2710 000000067F000040020000E0000000275309-000000067F000040020000E000000027DCE0__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000278000-000000067F000040020000E000000027C000__000000574B7FF240 000000067F000040020000E0000000278000-000000067F000040020000E000000027C000__00000073AD3FE6B8 000000067F000040020000E0000000278000-000000067F000040020000E000000027C000__000000914E3F38F0 000000067F000040020000E0000000278000-000000067F000040020000E000000027C000__000000931B9A2710 000000067F000040020000E000000027C000-000000067F000040020000E0000000280000__000000574B7FF240 000000067F000040020000E000000027C000-000000067F000040020000E0000000280000__00000073AD3FE6B8 
000000067F000040020000E000000027C000-000000067F000040020000E0000000280000__000000914E3F38F0 000000067F000040020000E000000027C000-000000067F000040020000E0000000280000__000000931B9A2710 000000067F000040020000E000000027DCE0-000000067F000040020000E00000002866B7__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000280000-000000067F000040020000E0000000284000__000000574B7FF240 000000067F000040020000E0000000280000-000000067F000040020000E0000000284000__00000073AD3FE6B8 000000067F000040020000E0000000280000-000000067F000040020000E0000000284000__000000914E3F38F0 000000067F000040020000E0000000280000-000000067F000040020000E0000000284000__000000931B9A2710 000000067F000040020000E0000000284000-000000067F000040020000E0000000288000__000000574B7FF240 000000067F000040020000E0000000284000-000000067F000040020000E0000000288000__00000073AD3FE6B8 000000067F000040020000E0000000284000-000000067F000040020000E0000000288000__000000914E3F38F0 000000067F000040020000E0000000284000-000000067F000040020000E0000000288000__000000931B9A2710 000000067F000040020000E00000002866B7-000000067F000040020000E000000028F073__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000288000-000000067F000040020000E000000028C000__000000574B7FF240 000000067F000040020000E0000000288000-000000067F000040020000E000000028C000__00000073AD3FE6B8 000000067F000040020000E0000000288000-000000067F000040020000E000000028C000__000000914E3F38F0 000000067F000040020000E0000000288000-000000067F000040020000E000000028C000__000000931B9A2710 000000067F000040020000E000000028C000-000000067F000040020000E0000000290000__000000574B7FF240 000000067F000040020000E000000028C000-000000067F000040020000E0000000290000__00000073AD3FE6B8 000000067F000040020000E000000028C000-000000067F000040020000E0000000290000__000000914E3F38F0 000000067F000040020000E000000028C000-000000067F000040020000E0000000290000__000000931B9A2710 000000067F000040020000E000000028F073-000000067F000040020000E0000000297A3B__0000003B6A0FFB09-00000047441DEA39 
000000067F000040020000E0000000290000-000000067F000040020000E0000000294000__000000574B7FF240 000000067F000040020000E0000000290000-000000067F000040020000E0000000294000__00000073AD3FE6B8 000000067F000040020000E0000000290000-000000067F000040020000E0000000294000__000000914E3F38F0 000000067F000040020000E0000000290000-000000067F000040020000E0000000294000__000000931B9A2710 000000067F000040020000E0000000294000-000000067F000040020000E0000000298000__000000574B7FF240 000000067F000040020000E0000000294000-000000067F000040020000E0000000298000__00000073AD3FE6B8 000000067F000040020000E0000000294000-000000067F000040020000E0000000298000__000000914E3F38F0 000000067F000040020000E0000000294000-000000067F000040020000E0000000298000__000000931B9A2710 000000067F000040020000E0000000297A3B-000000067F000040020000E00000002A0430__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000298000-000000067F000040020000E000000029C000__000000574B7FF240 000000067F000040020000E0000000298000-000000067F000040020000E000000029C000__00000073AD3FE6B8 000000067F000040020000E0000000298000-000000067F000040020000E000000029C000__000000914E3F38F0 000000067F000040020000E0000000298000-000000067F000040020000E000000029C000__000000931B9A2710 000000067F000040020000E000000029C000-000000067F000040020000E00000002A0000__000000574B7FF240 000000067F000040020000E000000029C000-000000067F000040020000E00000002A0000__00000073AD3FE6B8 000000067F000040020000E000000029C000-000000067F000040020000E00000002A0000__000000914E3F38F0 000000067F000040020000E000000029C000-000000067F000040020000E00000002A0000__000000931B9A2710 000000067F000040020000E00000002A0000-000000067F000040020000E00000002A4000__000000574B7FF240 000000067F000040020000E00000002A0000-000000067F000040020000E00000002A4000__00000073AD3FE6B8 000000067F000040020000E00000002A0000-000000067F000040020000E00000002A4000__000000914E3F38F0 000000067F000040020000E00000002A0000-000000067F000040020000E00000002A4000__000000931B9A2710 
000000067F000040020000E00000002A0430-000000067F000040020000E00000002A8E24__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000002A4000-000000067F000040020000E00000002A8000__000000574B7FF240 000000067F000040020000E00000002A4000-000000067F000040020000E00000002A8000__00000073AD3FE6B8 000000067F000040020000E00000002A4000-000000067F000040020000E00000002A8000__000000914E3F38F0 000000067F000040020000E00000002A4000-000000067F000040020000E00000002A8000__000000931B9A2710 000000067F000040020000E00000002A8000-000000067F000040020000E00000002AC000__000000574B7FF240 000000067F000040020000E00000002A8000-000000067F000040020000E00000002AC000__00000073AD3FE6B8 000000067F000040020000E00000002A8000-000000067F000040020000E00000002AC000__000000914E3F38F0 000000067F000040020000E00000002A8000-000000067F000040020000E00000002AC000__000000931B9A2710 000000067F000040020000E00000002A8E24-000000067F000040020000E00000002B180A__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000002AC000-000000067F000040020000E00000002B0000__000000574B7FF240 000000067F000040020000E00000002AC000-000000067F000040020000E00000002B0000__00000073AD3FE6B8 000000067F000040020000E00000002AC000-000000067F000040020000E00000002B0000__000000914E3F38F0 000000067F000040020000E00000002AC000-000000067F000040020000E00000002B0000__000000931B9A2710 000000067F000040020000E00000002B0000-000000067F000040020000E00000002B4000__000000574B7FF240 000000067F000040020000E00000002B0000-000000067F000040020000E00000002B4000__00000073AD3FE6B8 000000067F000040020000E00000002B0000-000000067F000040020000E00000002B4000__000000914E3F38F0 000000067F000040020000E00000002B0000-000000067F000040020000E00000002B4000__000000931B9A2710 000000067F000040020000E00000002B180A-000000067F000040020000E00000002BA1E2__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000002B4000-000000067F000040020000E00000002B8000__000000574B7FF240 000000067F000040020000E00000002B4000-000000067F000040020000E00000002B8000__00000073AD3FE6B8 
000000067F000040020000E00000002B4000-000000067F000040020000E00000002B8000__000000914E3F38F0 000000067F000040020000E00000002B4000-000000067F000040020000E00000002B8000__000000931B9A2710 000000067F000040020000E00000002B8000-000000067F000040020000E00000002BC000__000000574B7FF240 000000067F000040020000E00000002B8000-000000067F000040020000E00000002BC000__00000073AD3FE6B8 000000067F000040020000E00000002B8000-000000067F000040020000E00000002BC000__000000914E3F38F0 000000067F000040020000E00000002B8000-000000067F000040020000E00000002BC000__000000931B9A2710 000000067F000040020000E00000002BA1E2-000000067F000040020000E00000002C2BB0__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000002BC000-000000067F000040020000E00000002C0000__000000574B7FF240 000000067F000040020000E00000002BC000-000000067F000040020000E00000002C0000__00000073AD3FE6B8 000000067F000040020000E00000002BC000-000000067F000040020000E00000002C0000__000000914E3F38F0 000000067F000040020000E00000002BC000-000000067F000040020000E00000002C0000__000000931B9A2710 000000067F000040020000E00000002C0000-000000067F000040020000E00000002C4000__000000574B7FF240 000000067F000040020000E00000002C0000-000000067F000040020000E00000002C4000__00000073AD3FE6B8 000000067F000040020000E00000002C0000-000000067F000040020000E00000002C4000__000000914E3F38F0 000000067F000040020000E00000002C0000-000000067F000040020000E00000002C4000__000000931B9A2710 000000067F000040020000E00000002C2BB0-000000067F000040020000E00000002CB579__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000002C4000-000000067F000040020000E00000002C8000__000000574B7FF240 000000067F000040020000E00000002C4000-000000067F000040020000E00000002C8000__00000073AD3FE6B8 000000067F000040020000E00000002C4000-000000067F000040020000E00000002C8000__000000914E3F38F0 000000067F000040020000E00000002C4000-000000067F000040020000E00000002C8000__000000931B9A2710 000000067F000040020000E00000002C8000-000000067F000040020000E00000002CC000__000000574B7FF240 
000000067F000040020000E00000002C8000-000000067F000040020000E00000002CC000__00000073AD3FE6B8 000000067F000040020000E00000002C8000-000000067F000040020000E00000002CC000__000000914E3F38F0 000000067F000040020000E00000002C8000-000000067F000040020000E00000002CC000__000000931B9A2710 000000067F000040020000E00000002CB579-000000067F000040020000E00000002D3F48__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000002CC000-000000067F000040020000E00000002D0000__000000574B7FF240 000000067F000040020000E00000002CC000-000000067F000040020000E00000002D0000__00000073AD3FE6B8 000000067F000040020000E00000002CC000-000000067F000040020000E00000002D0000__000000914E3F38F0 000000067F000040020000E00000002CC000-000000067F000040020000E00000002D0000__000000931B9A2710 000000067F000040020000E00000002D0000-000000067F000040020000E00000002D4000__000000574B7FF240 000000067F000040020000E00000002D0000-000000067F000040020000E00000002D4000__00000073AD3FE6B8 000000067F000040020000E00000002D0000-000000067F000040020000E00000002D4000__000000914E3F38F0 000000067F000040020000E00000002D0000-000000067F000040020000E00000002D4000__000000931B9A2710 000000067F000040020000E00000002D3F48-000000067F000040020000E00000002DC941__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000002D4000-000000067F000040020000E00000002D8000__000000574B7FF240 000000067F000040020000E00000002D4000-000000067F000040020000E00000002D8000__00000073AD3FE6B8 000000067F000040020000E00000002D4000-000000067F000040020000E00000002D8000__000000914E3F38F0 000000067F000040020000E00000002D4000-000000067F000040020000E00000002D8000__000000931B9A2710 000000067F000040020000E00000002D8000-000000067F000040020000E00000002DC000__000000574B7FF240 000000067F000040020000E00000002D8000-000000067F000040020000E00000002DC000__00000073AD3FE6B8 000000067F000040020000E00000002D8000-000000067F000040020000E00000002DC000__000000914E3F38F0 000000067F000040020000E00000002D8000-000000067F000040020000E00000002DC000__000000931B9A2710 
000000067F000040020000E00000002DC000-000000067F000040020000E00000002E0000__000000574B7FF240 000000067F000040020000E00000002DC000-000000067F000040020000E00000002E0000__00000073AD3FE6B8 000000067F000040020000E00000002DC000-000000067F000040020000E00000002E0000__000000914E3F38F0 000000067F000040020000E00000002DC000-000000067F000040020000E00000002E0000__000000931B9A2710 000000067F000040020000E00000002DC941-000000067F000040020000E00000002E532B__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000002E0000-000000067F000040020000E00000002E4000__000000574B7FF240 000000067F000040020000E00000002E0000-000000067F000040020000E00000002E4000__00000073AD3FE6B8 000000067F000040020000E00000002E0000-000000067F000040020000E00000002E4000__000000914E3F38F0 000000067F000040020000E00000002E0000-000000067F000040020000E00000002E4000__000000931B9A2710 000000067F000040020000E00000002E4000-000000067F000040020000E00000002E8000__000000574B7FF240 000000067F000040020000E00000002E4000-000000067F000040020000E00000002E8000__00000073AD3FE6B8 000000067F000040020000E00000002E4000-000000067F000040020000E00000002E8000__000000914E3F38F0 000000067F000040020000E00000002E4000-000000067F000040020000E00000002E8000__000000931B9A2710 000000067F000040020000E00000002E532B-000000067F000040020000E00000002EDD10__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000002E8000-000000067F000040020000E00000002EC000__000000574B7FF240 000000067F000040020000E00000002E8000-000000067F000040020000E00000002EC000__00000073AD3FE6B8 000000067F000040020000E00000002E8000-000000067F000040020000E00000002EC000__000000914E3F38F0 000000067F000040020000E00000002E8000-000000067F000040020000E00000002EC000__000000931B9A2710 000000067F000040020000E00000002EC000-000000067F000040020000E00000002F0000__000000574B7FF240 000000067F000040020000E00000002EC000-000000067F000040020000E00000002F0000__00000073AD3FE6B8 000000067F000040020000E00000002EC000-000000067F000040020000E00000002F0000__000000914E3F38F0 
000000067F000040020000E00000002EC000-000000067F000040020000E00000002F0000__000000931B9A2710 000000067F000040020000E00000002EDD10-000000067F000040020000E00000002F66E2__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000002F0000-000000067F000040020000E00000002F4000__000000574B7FF240 000000067F000040020000E00000002F0000-000000067F000040020000E00000002F4000__00000073AD3FE6B8 000000067F000040020000E00000002F0000-000000067F000040020000E00000002F4000__000000914E3F38F0 000000067F000040020000E00000002F0000-000000067F000040020000E00000002F4000__000000931B9A2710 000000067F000040020000E00000002F4000-000000067F000040020000E00000002F8000__000000574B7FF240 000000067F000040020000E00000002F4000-000000067F000040020000E00000002F8000__00000073AD3FE6B8 000000067F000040020000E00000002F4000-000000067F000040020000E00000002F8000__000000914E3F38F0 000000067F000040020000E00000002F4000-000000067F000040020000E00000002F8000__000000931B9A2710 000000067F000040020000E00000002F66E2-000000067F000040020000E00000002FF0B3__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000002F8000-000000067F000040020000E00000002FC000__000000574B7FF240 000000067F000040020000E00000002F8000-000000067F000040020000E00000002FC000__00000073AD3FE6B8 000000067F000040020000E00000002F8000-000000067F000040020000E00000002FC000__000000914E3F38F0 000000067F000040020000E00000002F8000-000000067F000040020000E00000002FC000__000000931B9A2710 000000067F000040020000E00000002FC000-000000067F000040020000E0000000300000__000000574B7FF240 000000067F000040020000E00000002FC000-000000067F000040020000E0000000300000__00000073AD3FE6B8 000000067F000040020000E00000002FC000-000000067F000040020000E0000000300000__000000914E3F38F0 000000067F000040020000E00000002FC000-000000067F000040020000E0000000300000__000000931B9A2710 000000067F000040020000E00000002FF0B3-000000067F000040020000E0000000307A76__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000300000-000000067F000040020000E0000000304000__000000574B7FF240 
000000067F000040020000E0000000300000-000000067F000040020000E0000000304000__00000073AD3FE6B8 000000067F000040020000E0000000300000-000000067F000040020000E0000000304000__000000914E3F38F0 000000067F000040020000E0000000300000-000000067F000040020000E0000000304000__000000931B9A2710 000000067F000040020000E0000000304000-000000067F000040020000E0000000308000__000000574B7FF240 000000067F000040020000E0000000304000-000000067F000040020000E0000000308000__00000073AD3FE6B8 000000067F000040020000E0000000304000-000000067F000040020000E0000000308000__000000914E3F38F0 000000067F000040020000E0000000304000-000000067F000040020000E0000000308000__000000931B9A2710 000000067F000040020000E0000000307A76-000000067F000040020000E0000000310449__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000308000-000000067F000040020000E000000030C000__000000574B7FF240 000000067F000040020000E0000000308000-000000067F000040020000E000000030C000__00000073AD3FE6B8 000000067F000040020000E0000000308000-000000067F000040020000E000000030C000__000000914E3F38F0 000000067F000040020000E0000000308000-000000067F000040020000E000000030C000__000000931B9A2710 000000067F000040020000E000000030C000-000000067F000040020000E0000000310000__000000574B7FF240 000000067F000040020000E000000030C000-000000067F000040020000E0000000310000__00000073AD3FE6B8 000000067F000040020000E000000030C000-000000067F000040020000E0000000310000__000000914E3F38F0 000000067F000040020000E000000030C000-000000067F000040020000E0000000310000__000000931B9A2710 000000067F000040020000E0000000310000-000000067F000040020000E0000000314000__000000574B7FF240 000000067F000040020000E0000000310000-000000067F000040020000E0000000314000__00000073AD3FE6B8 000000067F000040020000E0000000310000-000000067F000040020000E0000000314000__000000914E3F38F0 000000067F000040020000E0000000310000-000000067F000040020000E0000000314000__000000931B9A2710 000000067F000040020000E0000000310449-000000067F000040020000E0000000318E4F__0000003B6A0FFB09-00000047441DEA39 
000000067F000040020000E0000000314000-000000067F000040020000E0000000318000__000000574B7FF240 000000067F000040020000E0000000314000-000000067F000040020000E0000000318000__00000073AD3FE6B8 000000067F000040020000E0000000314000-000000067F000040020000E0000000318000__000000914E3F38F0 000000067F000040020000E0000000314000-000000067F000040020000E0000000318000__000000931B9A2710 000000067F000040020000E0000000318000-000000067F000040020000E000000031C000__000000574B7FF240 000000067F000040020000E0000000318000-000000067F000040020000E000000031C000__00000073AD3FE6B8 000000067F000040020000E0000000318000-000000067F000040020000E000000031C000__000000914E3F38F0 000000067F000040020000E0000000318000-000000067F000040020000E000000031C000__000000931B9A2710 000000067F000040020000E0000000318E4F-000000067F000040020000E0000000321836__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000031C000-000000067F000040020000E0000000320000__000000574B7FF240 000000067F000040020000E000000031C000-000000067F000040020000E0000000320000__00000073AD3FE6B8 000000067F000040020000E000000031C000-000000067F000040020000E0000000320000__000000914E3F38F0 000000067F000040020000E000000031C000-000000067F000040020000E0000000320000__000000931B9A2710 000000067F000040020000E0000000320000-000000067F000040020000E0000000324000__000000574B7FF240 000000067F000040020000E0000000320000-000000067F000040020000E0000000324000__00000073AD3FE6B8 000000067F000040020000E0000000320000-000000067F000040020000E0000000324000__000000914E3F38F0 000000067F000040020000E0000000320000-000000067F000040020000E0000000324000__000000931B9A2710 000000067F000040020000E0000000321836-000000067F000040020000E000000032A20E__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000324000-000000067F000040020000E0000000328000__000000574B7FF240 000000067F000040020000E0000000324000-000000067F000040020000E0000000328000__00000073AD3FE6B8 000000067F000040020000E0000000324000-000000067F000040020000E0000000328000__000000914E3F38F0 
000000067F000040020000E0000000324000-000000067F000040020000E0000000328000__000000931B9A2710 000000067F000040020000E0000000328000-000000067F000040020000E000000032C000__000000574B7FF240 000000067F000040020000E0000000328000-000000067F000040020000E000000032C000__00000073AD3FE6B8 000000067F000040020000E0000000328000-000000067F000040020000E000000032C000__000000914E3F38F0 000000067F000040020000E0000000328000-000000067F000040020000E000000032C000__000000931B9A2710 000000067F000040020000E000000032A20E-000000067F000040020000E0000000332BDA__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000032C000-000000067F000040020000E0000000330000__000000574B7FF240 000000067F000040020000E000000032C000-000000067F000040020000E0000000330000__00000073AD3FE6B8 000000067F000040020000E000000032C000-000000067F000040020000E0000000330000__000000914E3F38F0 000000067F000040020000E000000032C000-000000067F000040020000E0000000330000__000000931B9A2710 000000067F000040020000E0000000330000-000000067F000040020000E0000000334000__000000574B7FF240 000000067F000040020000E0000000330000-000000067F000040020000E0000000334000__00000073AD3FE6B8 000000067F000040020000E0000000330000-000000067F000040020000E0000000334000__000000914E3F38F0 000000067F000040020000E0000000330000-000000067F000040020000E0000000334000__000000931B9A2710 000000067F000040020000E0000000332BDA-000000067F000040020000E000000033B5AD__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000334000-000000067F000040020000E0000000338000__000000574B7FF240 000000067F000040020000E0000000334000-000000067F000040020000E0000000338000__00000073AD3FE6B8 000000067F000040020000E0000000334000-000000067F000040020000E0000000338000__000000914E3F38F0 000000067F000040020000E0000000334000-000000067F000040020000E0000000338000__000000931B9A2710 000000067F000040020000E0000000338000-000000067F000040020000E000000033C000__000000574B7FF240 000000067F000040020000E0000000338000-000000067F000040020000E000000033C000__00000073AD3FE6B8 
000000067F000040020000E0000000338000-000000067F000040020000E000000033C000__000000914E3F38F0 000000067F000040020000E0000000338000-000000067F000040020000E000000033C000__000000931B9A2710 000000067F000040020000E000000033B5AD-000000067F000040020000E0000000343F77__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000033C000-000000067F000040020000E0000000340000__000000574B7FF240 000000067F000040020000E000000033C000-000000067F000040020000E0000000340000__00000073AD3FE6B8 000000067F000040020000E000000033C000-000000067F000040020000E0000000340000__000000914E3F38F0 000000067F000040020000E000000033C000-000000067F000040020000E0000000340000__000000931B9A2710 000000067F000040020000E0000000340000-000000067F000040020000E0000000344000__000000574B7FF240 000000067F000040020000E0000000340000-000000067F000040020000E0000000344000__00000073AD3FE6B8 000000067F000040020000E0000000340000-000000067F000040020000E0000000344000__000000914E3F38F0 000000067F000040020000E0000000340000-000000067F000040020000E0000000344000__000000931B9A2710 000000067F000040020000E0000000343F77-000000067F000040020000E000000034C95A__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000344000-000000067F000040020000E0000000348000__000000574B7FF240 000000067F000040020000E0000000344000-000000067F000040020000E0000000348000__00000073AD3FE6B8 000000067F000040020000E0000000344000-000000067F000040020000E0000000348000__000000914E3F38F0 000000067F000040020000E0000000344000-000000067F000040020000E0000000348000__000000931B9A2710 000000067F000040020000E0000000348000-000000067F000040020000E000000034C000__000000574B7FF240 000000067F000040020000E0000000348000-000000067F000040020000E000000034C000__00000073AD3FE6B8 000000067F000040020000E0000000348000-000000067F000040020000E000000034C000__000000914E3F38F0 000000067F000040020000E0000000348000-000000067F000040020000E000000034C000__000000931B9A2710 000000067F000040020000E000000034C000-000000067F000040020000E0000000350000__000000574B7FF240 
000000067F000040020000E000000034C000-000000067F000040020000E0000000350000__00000073AD3FE6B8 000000067F000040020000E000000034C000-000000067F000040020000E0000000350000__000000914E3F38F0 000000067F000040020000E000000034C000-000000067F000040020000E0000000350000__000000931B9A2710 000000067F000040020000E000000034C95A-000000067F000040020000E0000000355348__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000350000-000000067F000040020000E0000000354000__000000574B7FF240 000000067F000040020000E0000000350000-000000067F000040020000E0000000354000__00000073AD3FE6B8 000000067F000040020000E0000000350000-000000067F000040020000E0000000354000__000000914E3F38F0 000000067F000040020000E0000000350000-000000067F000040020000E0000000354000__000000931B9A2710 000000067F000040020000E0000000354000-000000067F000040020000E0000000358000__000000574B7FF240 000000067F000040020000E0000000354000-000000067F000040020000E0000000358000__00000073AD3FE6B8 000000067F000040020000E0000000354000-000000067F000040020000E0000000358000__000000914E3F38F0 000000067F000040020000E0000000354000-000000067F000040020000E0000000358000__000000931B9A2710 000000067F000040020000E0000000355348-000000067F000040020000E000000035DD35__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000358000-000000067F000040020000E000000035C000__000000574B7FF240 000000067F000040020000E0000000358000-000000067F000040020000E000000035C000__00000073AD3FE6B8 000000067F000040020000E0000000358000-000000067F000040020000E000000035C000__000000914E3F38F0 000000067F000040020000E0000000358000-000000067F000040020000E000000035C000__000000931B9A2710 000000067F000040020000E000000035C000-000000067F000040020000E0000000360000__000000574B7FF240 000000067F000040020000E000000035C000-000000067F000040020000E0000000360000__00000073AD3FE6B8 000000067F000040020000E000000035C000-000000067F000040020000E0000000360000__000000914E3F38F0 000000067F000040020000E000000035C000-000000067F000040020000E0000000360000__000000931B9A2710 
000000067F000040020000E000000035DD35-000000067F000040020000E000000036671D__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000360000-000000067F000040020000E0000000364000__000000574B7FF240 000000067F000040020000E0000000360000-000000067F000040020000E0000000364000__00000073AD3FE6B8 000000067F000040020000E0000000360000-000000067F000040020000E0000000364000__000000914E3F38F0 000000067F000040020000E0000000360000-000000067F000040020000E0000000364000__000000931B9A2710 000000067F000040020000E0000000364000-000000067F000040020000E0000000368000__000000574B7FF240 000000067F000040020000E0000000364000-000000067F000040020000E0000000368000__00000073AD3FE6B8 000000067F000040020000E0000000364000-000000067F000040020000E0000000368000__000000914E3F38F0 000000067F000040020000E0000000364000-000000067F000040020000E0000000368000__000000931B9A2710 000000067F000040020000E000000036671D-000000067F000040020000E000000036F0F0__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000368000-000000067F000040020000E000000036C000__000000574B7FF240 000000067F000040020000E0000000368000-000000067F000040020000E000000036C000__00000073AD3FE6B8 000000067F000040020000E0000000368000-000000067F000040020000E000000036C000__000000914E3F38F0 000000067F000040020000E0000000368000-000000067F000040020000E000000036C000__000000931B9A2710 000000067F000040020000E000000036C000-000000067F000040020000E0000000370000__000000574B7FF240 000000067F000040020000E000000036C000-000000067F000040020000E0000000370000__00000073AD3FE6B8 000000067F000040020000E000000036C000-000000067F000040020000E0000000370000__000000914E3F38F0 000000067F000040020000E000000036C000-000000067F000040020000E0000000370000__000000931B9A2710 000000067F000040020000E000000036F0F0-000000067F000040020000E0000000377AB4__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000370000-000000067F000040020000E0000000374000__000000574B7FF240 000000067F000040020000E0000000370000-000000067F000040020000E0000000374000__00000073AD3FE6B8 
000000067F000040020000E0000000370000-000000067F000040020000E0000000374000__000000914E3F38F0 000000067F000040020000E0000000370000-000000067F000040020000E0000000374000__000000931B9A2710 000000067F000040020000E0000000374000-000000067F000040020000E0000000378000__000000574B7FF240 000000067F000040020000E0000000374000-000000067F000040020000E0000000378000__00000073AD3FE6B8 000000067F000040020000E0000000374000-000000067F000040020000E0000000378000__000000914E3F38F0 000000067F000040020000E0000000374000-000000067F000040020000E0000000378000__000000931B9A2710 000000067F000040020000E0000000377AB4-000000067F000040020000E000000038047C__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000378000-000000067F000040020000E000000037C000__000000574B7FF240 000000067F000040020000E0000000378000-000000067F000040020000E000000037C000__00000073AD3FE6B8 000000067F000040020000E0000000378000-000000067F000040020000E000000037C000__000000914E3F38F0 000000067F000040020000E0000000378000-000000067F000040020000E000000037C000__000000931B9A2710 000000067F000040020000E000000037C000-000000067F000040020000E0000000380000__000000574B7FF240 000000067F000040020000E000000037C000-000000067F000040020000E0000000380000__00000073AD3FE6B8 000000067F000040020000E000000037C000-000000067F000040020000E0000000380000__000000914E3F38F0 000000067F000040020000E000000037C000-000000067F000040020000E0000000380000__000000931B9A2710 000000067F000040020000E0000000380000-000000067F000040020000E0000000384000__000000574B7FF240 000000067F000040020000E0000000380000-000000067F000040020000E0000000384000__00000073AD3FE6B8 000000067F000040020000E0000000380000-000000067F000040020000E0000000384000__000000914E3F38F0 000000067F000040020000E0000000380000-000000067F000040020000E0000000384000__000000931B9A2710 000000067F000040020000E000000038047C-000000067F000040020000E0000000388E68__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000384000-000000067F000040020000E0000000388000__000000574B7FF240 
000000067F000040020000E0000000384000-000000067F000040020000E0000000388000__00000073AD3FE6B8 000000067F000040020000E0000000384000-000000067F000040020000E0000000388000__000000914E3F38F0 000000067F000040020000E0000000384000-000000067F000040020000E0000000388000__000000931B9A2710 000000067F000040020000E0000000388000-000000067F000040020000E000000038C000__000000574B7FF240 000000067F000040020000E0000000388000-000000067F000040020000E000000038C000__00000073AD3FE6B8 000000067F000040020000E0000000388000-000000067F000040020000E000000038C000__000000914E3F38F0 000000067F000040020000E0000000388000-000000067F000040020000E000000038C000__000000931B9A2710 000000067F000040020000E0000000388E68-000000067F000040020000E0000000391852__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000038C000-000000067F000040020000E0000000390000__000000574B7FF240 000000067F000040020000E000000038C000-000000067F000040020000E0000000390000__00000073AD3FE6B8 000000067F000040020000E000000038C000-000000067F000040020000E0000000390000__000000914E3F38F0 000000067F000040020000E000000038C000-000000067F000040020000E0000000390000__000000931B9A2710 000000067F000040020000E0000000390000-000000067F000040020000E0000000394000__000000574B7FF240 000000067F000040020000E0000000390000-000000067F000040020000E0000000394000__00000073AD3FE6B8 000000067F000040020000E0000000390000-000000067F000040020000E0000000394000__000000914E3F38F0 000000067F000040020000E0000000390000-000000067F000040020000E0000000394000__000000931B9A2710 000000067F000040020000E0000000391852-000000067F000040020000E000000039A23F__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000394000-000000067F000040020000E0000000398000__000000574B7FF240 000000067F000040020000E0000000394000-000000067F000040020000E0000000398000__00000073AD3FE6B8 000000067F000040020000E0000000394000-000000067F000040020000E0000000398000__000000914E3F38F0 000000067F000040020000E0000000394000-000000067F000040020000E0000000398000__000000931B9A2710 
000000067F000040020000E0000000398000-000000067F000040020000E000000039C000__000000574B7FF240 000000067F000040020000E0000000398000-000000067F000040020000E000000039C000__00000073AD3FE6B8 000000067F000040020000E0000000398000-000000067F000040020000E000000039C000__000000914E3F38F0 000000067F000040020000E0000000398000-000000067F000040020000E000000039C000__000000931B9A2710 000000067F000040020000E000000039A23F-000000067F000040020000E00000003A2C1E__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000039C000-000000067F000040020000E00000003A0000__000000574B7FF240 000000067F000040020000E000000039C000-000000067F000040020000E00000003A0000__00000073AD3FE6B8 000000067F000040020000E000000039C000-000000067F000040020000E00000003A0000__000000914E3F38F0 000000067F000040020000E000000039C000-000000067F000040020000E00000003A0000__000000931B9A2710 000000067F000040020000E00000003A0000-000000067F000040020000E00000003A4000__000000574B7FF240 000000067F000040020000E00000003A0000-000000067F000040020000E00000003A4000__00000073AD3FE6B8 000000067F000040020000E00000003A0000-000000067F000040020000E00000003A4000__000000914E3F38F0 000000067F000040020000E00000003A0000-000000067F000040020000E00000003A4000__000000931B9A2710 000000067F000040020000E00000003A2C1E-000000067F000040020000E00000003AB5EC__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000003A4000-000000067F000040020000E00000003A8000__000000574B7FF240 000000067F000040020000E00000003A4000-000000067F000040020000E00000003A8000__00000073AD3FE6B8 000000067F000040020000E00000003A4000-000000067F000040020000E00000003A8000__000000914E3F38F0 000000067F000040020000E00000003A4000-000000067F000040020000E00000003A8000__000000931B9A2710 000000067F000040020000E00000003A8000-000000067F000040020000E00000003AC000__000000574B7FF240 000000067F000040020000E00000003A8000-000000067F000040020000E00000003AC000__00000073AD3FE6B8 000000067F000040020000E00000003A8000-000000067F000040020000E00000003AC000__000000914E3F38F0 
000000067F000040020000E00000003A8000-000000067F000040020000E00000003AC000__000000931B9A2710 000000067F000040020000E00000003AB5EC-000000067F000040020000E00000003B3FB2__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000003AC000-000000067F000040020000E00000003B0000__000000574B7FF240 000000067F000040020000E00000003AC000-000000067F000040020000E00000003B0000__00000073AD3FE6B8 000000067F000040020000E00000003AC000-000000067F000040020000E00000003B0000__000000914E3F38F0 000000067F000040020000E00000003AC000-000000067F000040020000E00000003B0000__000000931B9A2710 000000067F000040020000E00000003B0000-000000067F000040020000E00000003B4000__000000574B7FF240 000000067F000040020000E00000003B0000-000000067F000040020000E00000003B4000__00000073AD3FE6B8 000000067F000040020000E00000003B0000-000000067F000040020000E00000003B4000__000000914E3F38F0 000000067F000040020000E00000003B0000-000000067F000040020000E00000003B4000__000000931B9A2710 000000067F000040020000E00000003B3FB2-000000067F000040020000E00000003BC972__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000003B4000-000000067F000040020000E00000003B8000__000000574B7FF240 000000067F000040020000E00000003B4000-000000067F000040020000E00000003B8000__00000073AD3FE6B8 000000067F000040020000E00000003B4000-000000067F000040020000E00000003B8000__000000914E3F38F0 000000067F000040020000E00000003B4000-000000067F000040020000E00000003B8000__000000931B9A2710 000000067F000040020000E00000003B8000-000000067F000040020000E00000003BC000__000000574B7FF240 000000067F000040020000E00000003B8000-000000067F000040020000E00000003BC000__00000073AD3FE6B8 000000067F000040020000E00000003B8000-000000067F000040020000E00000003BC000__000000914E3F38F0 000000067F000040020000E00000003B8000-000000067F000040020000E00000003BC000__000000931B9A2710 000000067F000040020000E00000003BC000-000000067F000040020000E00000003C0000__000000574B7FF240 000000067F000040020000E00000003BC000-000000067F000040020000E00000003C0000__00000073AD3FE6B8 
000000067F000040020000E00000003BC000-000000067F000040020000E00000003C0000__000000914E3F38F0 000000067F000040020000E00000003BC000-000000067F000040020000E00000003C0000__000000931B9A2710 000000067F000040020000E00000003BC972-000000067F000040020000E00000003C5369__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000003C0000-000000067F000040020000E00000003C4000__000000574B7FF240 000000067F000040020000E00000003C0000-000000067F000040020000E00000003C4000__00000073AD3FE6B8 000000067F000040020000E00000003C0000-000000067F000040020000E00000003C4000__000000914E3F38F0 000000067F000040020000E00000003C0000-000000067F000040020000E00000003C4000__000000931B9A2710 000000067F000040020000E00000003C4000-000000067F000040020000E00000003C8000__000000574B7FF240 000000067F000040020000E00000003C4000-000000067F000040020000E00000003C8000__00000073AD3FE6B8 000000067F000040020000E00000003C4000-000000067F000040020000E00000003C8000__000000914E3F38F0 000000067F000040020000E00000003C4000-000000067F000040020000E00000003C8000__000000931B9A2710 000000067F000040020000E00000003C5369-000000067F000040020000E00000003CDD67__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000003C8000-000000067F000040020000E00000003CC000__000000574B7FF240 000000067F000040020000E00000003C8000-000000067F000040020000E00000003CC000__00000073AD3FE6B8 000000067F000040020000E00000003C8000-000000067F000040020000E00000003CC000__000000914E3F38F0 000000067F000040020000E00000003C8000-000000067F000040020000E00000003CC000__000000931B9A2710 000000067F000040020000E00000003CC000-000000067F000040020000E00000003D0000__000000574B7FF240 000000067F000040020000E00000003CC000-000000067F000040020000E00000003D0000__00000073AD3FE6B8 000000067F000040020000E00000003CC000-000000067F000040020000E00000003D0000__000000914E3F38F0 000000067F000040020000E00000003CC000-000000067F000040020000E00000003D0000__000000931B9A2710 000000067F000040020000E00000003CDD67-000000067F000040020000E00000003D675B__0000003B6A0FFB09-00000047441DEA39 
000000067F000040020000E00000003D0000-000000067F000040020000E00000003D4000__000000574B7FF240 000000067F000040020000E00000003D0000-000000067F000040020000E00000003D4000__00000073AD3FE6B8 000000067F000040020000E00000003D0000-000000067F000040020000E00000003D4000__000000914E3F38F0 000000067F000040020000E00000003D0000-000000067F000040020000E00000003D4000__000000931B9A2710 000000067F000040020000E00000003D4000-000000067F000040020000E00000003D8000__000000574B7FF240 000000067F000040020000E00000003D4000-000000067F000040020000E00000003D8000__00000073AD3FE6B8 000000067F000040020000E00000003D4000-000000067F000040020000E00000003D8000__000000914E3F38F0 000000067F000040020000E00000003D4000-000000067F000040020000E00000003D8000__000000931B9A2710 000000067F000040020000E00000003D675B-000000067F000040020000E00000003DF132__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000003D8000-000000067F000040020000E00000003DC000__000000574B7FF240 000000067F000040020000E00000003D8000-000000067F000040020000E00000003DC000__00000073AD3FE6B8 000000067F000040020000E00000003D8000-000000067F000040020000E00000003DC000__000000914E3F38F0 000000067F000040020000E00000003D8000-000000067F000040020000E00000003DC000__000000931B9A2710 000000067F000040020000E00000003DC000-000000067F000040020000E00000003E0000__000000574B7FF240 000000067F000040020000E00000003DC000-000000067F000040020000E00000003E0000__00000073AD3FE6B8 000000067F000040020000E00000003DC000-000000067F000040020000E00000003E0000__000000914E3F38F0 000000067F000040020000E00000003DC000-000000067F000040020000E00000003E0000__000000931B9A2710 000000067F000040020000E00000003DF132-000000067F000040020000E00000003E7AFE__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000003E0000-000000067F000040020000E00000003E4000__000000574B7FF240 000000067F000040020000E00000003E0000-000000067F000040020000E00000003E4000__00000073AD3FE6B8 000000067F000040020000E00000003E0000-000000067F000040020000E00000003E4000__000000914E3F38F0 
000000067F000040020000E00000003E0000-000000067F000040020000E00000003E4000__000000931B9A2710 000000067F000040020000E00000003E4000-000000067F000040020000E00000003E8000__000000574B7FF240 000000067F000040020000E00000003E4000-000000067F000040020000E00000003E8000__00000073AD3FE6B8 000000067F000040020000E00000003E4000-000000067F000040020000E00000003E8000__000000914E3F38F0 000000067F000040020000E00000003E4000-000000067F000040020000E00000003E8000__000000931B9A2710 000000067F000040020000E00000003E7AFE-000000067F000040020000E00000003F04C7__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000003E8000-000000067F000040020000E00000003EC000__000000574B7FF240 000000067F000040020000E00000003E8000-000000067F000040020000E00000003EC000__00000073AD3FE6B8 000000067F000040020000E00000003E8000-000000067F000040020000E00000003EC000__000000914E3F38F0 000000067F000040020000E00000003E8000-000000067F000040020000E00000003EC000__000000931B9A2710 000000067F000040020000E00000003EC000-000000067F000040020000E00000003F0000__000000574B7FF240 000000067F000040020000E00000003EC000-000000067F000040020000E00000003F0000__00000073AD3FE6B8 000000067F000040020000E00000003EC000-000000067F000040020000E00000003F0000__000000914E3F38F0 000000067F000040020000E00000003EC000-000000067F000040020000E00000003F0000__000000931B9A2710 000000067F000040020000E00000003F0000-000000067F000040020000E00000003F4000__000000574B7FF240 000000067F000040020000E00000003F0000-000000067F000040020000E00000003F4000__00000073AD3FE6B8 000000067F000040020000E00000003F0000-000000067F000040020000E00000003F4000__000000914E3F38F0 000000067F000040020000E00000003F0000-000000067F000040020000E00000003F4000__000000931B9A2710 000000067F000040020000E00000003F04C7-000000067F000040020000E00000003F8E92__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000003F4000-000000067F000040020000E00000003F8000__000000574B7FF240 000000067F000040020000E00000003F4000-000000067F000040020000E00000003F8000__00000073AD3FE6B8 
000000067F000040020000E00000003F4000-000000067F000040020000E00000003F8000__000000914E3F38F0 000000067F000040020000E00000003F4000-000000067F000040020000E00000003F8000__000000931B9A2710 000000067F000040020000E00000003F8000-000000067F000040020000E00000003FC000__000000574B7FF240 000000067F000040020000E00000003F8000-000000067F000040020000E00000003FC000__00000073AD3FE6B8 000000067F000040020000E00000003F8000-000000067F000040020000E00000003FC000__000000914E3F38F0 000000067F000040020000E00000003F8000-000000067F000040020000E00000003FC000__000000931B9A2710 000000067F000040020000E00000003F8E92-000000067F000040020000E000000040188E__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000003FC000-000000067F000040020000E0000000400000__000000574B7FF240 000000067F000040020000E00000003FC000-000000067F000040020000E0000000400000__00000073AD3FE6B8 000000067F000040020000E00000003FC000-000000067F000040020000E0000000400000__000000914E3F38F0 000000067F000040020000E00000003FC000-000000067F000040020000E0000000400000__000000931B9A2710 000000067F000040020000E0000000400000-000000067F000040020000E0000000404000__000000574B7FF240 000000067F000040020000E0000000400000-000000067F000040020000E0000000404000__00000073AD3FE6B8 000000067F000040020000E0000000400000-000000067F000040020000E0000000404000__000000914E3F38F0 000000067F000040020000E0000000400000-000000067F000040020000E0000000404000__000000931B9A2710 000000067F000040020000E000000040188E-000000067F000040020000E000000040A288__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000404000-000000067F000040020000E0000000408000__000000574B7FF240 000000067F000040020000E0000000404000-000000067F000040020000E0000000408000__00000073AD3FE6B8 000000067F000040020000E0000000404000-000000067F000040020000E0000000408000__000000914E3F38F0 000000067F000040020000E0000000404000-000000067F000040020000E0000000408000__000000931B9A2710 000000067F000040020000E0000000408000-000000067F000040020000E000000040C000__000000574B7FF240 
000000067F000040020000E0000000408000-000000067F000040020000E000000040C000__00000073AD3FE6B8 000000067F000040020000E0000000408000-000000067F000040020000E000000040C000__000000914E3F38F0 000000067F000040020000E0000000408000-000000067F000040020000E000000040C000__000000931B9A2710 000000067F000040020000E000000040A288-000000067F000040020000E0000000412C77__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000040C000-000000067F000040020000E0000000410000__000000574B7FF240 000000067F000040020000E000000040C000-000000067F000040020000E0000000410000__00000073AD3FE6B8 000000067F000040020000E000000040C000-000000067F000040020000E0000000410000__000000914E3F38F0 000000067F000040020000E000000040C000-000000067F000040020000E0000000410000__000000931B9A2710 000000067F000040020000E0000000410000-000000067F000040020000E0000000414000__000000574B7FF240 000000067F000040020000E0000000410000-000000067F000040020000E0000000414000__00000073AD3FE6B8 000000067F000040020000E0000000410000-000000067F000040020000E0000000414000__000000914E3F38F0 000000067F000040020000E0000000410000-000000067F000040020000E0000000414000__000000931B9A2710 000000067F000040020000E0000000412C77-000000067F000040020000E000000041B646__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000414000-000000067F000040020000E0000000418000__000000574B7FF240 000000067F000040020000E0000000414000-000000067F000040020000E0000000418000__00000073AD3FE6B8 000000067F000040020000E0000000414000-000000067F000040020000E0000000418000__000000914E3F38F0 000000067F000040020000E0000000414000-000000067F000040020000E0000000418000__000000931B9A2710 000000067F000040020000E0000000418000-000000067F000040020000E000000041C000__000000574B7FF240 000000067F000040020000E0000000418000-000000067F000040020000E000000041C000__00000073AD3FE6B8 000000067F000040020000E0000000418000-000000067F000040020000E000000041C000__000000914E3F38F0 000000067F000040020000E0000000418000-000000067F000040020000E000000041C000__000000931B9A2710 
000000067F000040020000E000000041B646-000000067F000040020000E000000042400E__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000041C000-000000067F000040020000E0000000420000__000000574B7FF240 000000067F000040020000E000000041C000-000000067F000040020000E0000000420000__00000073AD3FE6B8 000000067F000040020000E000000041C000-000000067F000040020000E0000000420000__000000914E3F38F0 000000067F000040020000E000000041C000-000000067F000040020000E0000000420000__000000931B9A2710 000000067F000040020000E0000000420000-000000067F000040020000E0000000424000__000000574B7FF240 000000067F000040020000E0000000420000-000000067F000040020000E0000000424000__00000073AD3FE6B8 000000067F000040020000E0000000420000-000000067F000040020000E0000000424000__000000914E3F38F0 000000067F000040020000E0000000420000-000000067F000040020000E0000000424000__000000931B9A2710 000000067F000040020000E0000000424000-000000067F000040020000E0000000428000__000000574B7FF240 000000067F000040020000E0000000424000-000000067F000040020000E0000000428000__00000073AD3FE6B8 000000067F000040020000E0000000424000-000000067F000040020000E0000000428000__000000914E3F38F0 000000067F000040020000E0000000424000-000000067F000040020000E0000000428000__000000931B9A2710 000000067F000040020000E000000042400E-000000067F000040020000E000000042C9CC__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000428000-000000067F000040020000E000000042C000__000000574B7FF240 000000067F000040020000E0000000428000-000000067F000040020000E000000042C000__00000073AD3FE6B8 000000067F000040020000E0000000428000-000000067F000040020000E000000042C000__000000914E3F38F0 000000067F000040020000E0000000428000-000000067F000040020000E000000042C000__000000931B9A2710 000000067F000040020000E000000042C000-000000067F000040020000E0000000430000__000000574B7FF240 000000067F000040020000E000000042C000-000000067F000040020000E0000000430000__00000073AD3FE6B8 000000067F000040020000E000000042C000-000000067F000040020000E0000000430000__000000914E3F38F0 
000000067F000040020000E000000042C000-000000067F000040020000E0000000430000__000000931B9A2710 000000067F000040020000E000000042C9CC-000000067F000040020000E00000004353A5__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000430000-000000067F000040020000E0000000434000__000000574B7FF240 000000067F000040020000E0000000430000-000000067F000040020000E0000000434000__00000073AD3FE6B8 000000067F000040020000E0000000430000-000000067F000040020000E0000000434000__000000914E3F38F0 000000067F000040020000E0000000430000-000000067F000040020000E0000000434000__000000931B9A2710 000000067F000040020000E0000000434000-000000067F000040020000E0000000438000__000000574B7FF240 000000067F000040020000E0000000434000-000000067F000040020000E0000000438000__00000073AD3FE6B8 000000067F000040020000E0000000434000-000000067F000040020000E0000000438000__000000914E3F38F0 000000067F000040020000E0000000434000-000000067F000040020000E0000000438000__000000931B9A2710 000000067F000040020000E00000004353A5-000000067F000040020000E000000043DD9A__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000438000-000000067F000040020000E000000043C000__000000574B7FF240 000000067F000040020000E0000000438000-000000067F000040020000E000000043C000__00000073AD3FE6B8 000000067F000040020000E0000000438000-000000067F000040020000E000000043C000__000000914E3F38F0 000000067F000040020000E0000000438000-000000067F000040020000E000000043C000__000000931B9A2710 000000067F000040020000E000000043C000-000000067F000040020000E0000000440000__000000574B7FF240 000000067F000040020000E000000043C000-000000067F000040020000E0000000440000__00000073AD3FE6B8 000000067F000040020000E000000043C000-000000067F000040020000E0000000440000__000000914E3F38F0 000000067F000040020000E000000043C000-000000067F000040020000E0000000440000__000000931B9A2710 000000067F000040020000E000000043DD9A-000000067F000040020000E0000000446792__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000440000-000000067F000040020000E0000000444000__000000574B7FF240 
000000067F000040020000E0000000440000-000000067F000040020000E0000000444000__00000073AD3FE6B8 000000067F000040020000E0000000440000-000000067F000040020000E0000000444000__000000914E3F38F0 000000067F000040020000E0000000440000-000000067F000040020000E0000000444000__000000931B9A2710 000000067F000040020000E0000000444000-000000067F000040020000E0000000448000__000000574B7FF240 000000067F000040020000E0000000444000-000000067F000040020000E0000000448000__00000073AD3FE6B8 000000067F000040020000E0000000444000-000000067F000040020000E0000000448000__000000914E3F38F0 000000067F000040020000E0000000444000-000000067F000040020000E0000000448000__000000931B9A2710 000000067F000040020000E0000000446792-000000067F000040020000E000000044F178__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000448000-000000067F000040020000E000000044C000__000000574B7FF240 000000067F000040020000E0000000448000-000000067F000040020000E000000044C000__00000073AD3FE6B8 000000067F000040020000E0000000448000-000000067F000040020000E000000044C000__000000914E3F38F0 000000067F000040020000E0000000448000-000000067F000040020000E000000044C000__000000931B9A2710 000000067F000040020000E000000044C000-000000067F000040020000E0000000450000__000000574B7FF240 000000067F000040020000E000000044C000-000000067F000040020000E0000000450000__00000073AD3FE6B8 000000067F000040020000E000000044C000-000000067F000040020000E0000000450000__000000914E3F38F0 000000067F000040020000E000000044C000-000000067F000040020000E0000000450000__000000931B9A2710 000000067F000040020000E000000044F178-000000067F000040020000E0000000457B4D__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000450000-000000067F000040020000E0000000454000__000000574B7FF240 000000067F000040020000E0000000450000-000000067F000040020000E0000000454000__00000073AD3FE6B8 000000067F000040020000E0000000450000-000000067F000040020000E0000000454000__000000914E3F38F0 000000067F000040020000E0000000450000-000000067F000040020000E0000000454000__000000931B9A2710 
000000067F000040020000E0000000454000-000000067F000040020000E0000000458000__000000574B7FF240 000000067F000040020000E0000000454000-000000067F000040020000E0000000458000__00000073AD3FE6B8 000000067F000040020000E0000000454000-000000067F000040020000E0000000458000__000000914E3F38F0 000000067F000040020000E0000000454000-000000067F000040020000E0000000458000__000000931B9A2710 000000067F000040020000E0000000457B4D-000000067F000040020000E0000000460512__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000458000-000000067F000040020000E000000045C000__000000574B7FF240 000000067F000040020000E0000000458000-000000067F000040020000E000000045C000__00000073AD3FE6B8 000000067F000040020000E0000000458000-000000067F000040020000E000000045C000__000000914E3F38F0 000000067F000040020000E0000000458000-000000067F000040020000E000000045C000__000000931B9A2710 000000067F000040020000E000000045C000-000000067F000040020000E0000000460000__000000574B7FF240 000000067F000040020000E000000045C000-000000067F000040020000E0000000460000__00000073AD3FE6B8 000000067F000040020000E000000045C000-000000067F000040020000E0000000460000__000000914E3F38F0 000000067F000040020000E000000045C000-000000067F000040020000E0000000460000__000000931B9A2710 000000067F000040020000E0000000460000-000000067F000040020000E0000000464000__000000574B7FF240 000000067F000040020000E0000000460000-000000067F000040020000E0000000464000__00000073AD3FE6B8 000000067F000040020000E0000000460000-000000067F000040020000E0000000464000__000000914E3F38F0 000000067F000040020000E0000000460000-000000067F000040020000E0000000464000__000000931B9A2710 000000067F000040020000E0000000460512-000000067F000040020000E0000000468ECC__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000464000-000000067F000040020000E0000000468000__000000574B7FF240 000000067F000040020000E0000000464000-000000067F000040020000E0000000468000__00000073AD3FE6B8 000000067F000040020000E0000000464000-000000067F000040020000E0000000468000__000000914E3F38F0 
000000067F000040020000E0000000464000-000000067F000040020000E0000000468000__000000931B9A2710 000000067F000040020000E0000000468000-000000067F000040020000E000000046C000__000000574B7FF240 000000067F000040020000E0000000468000-000000067F000040020000E000000046C000__00000073AD3FE6B8 000000067F000040020000E0000000468000-000000067F000040020000E000000046C000__000000914E3F38F0 000000067F000040020000E0000000468000-000000067F000040020000E000000046C000__000000931B9A2710 000000067F000040020000E0000000468ECC-000000067F000040020000E00000004718AA__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000046C000-000000067F000040020000E0000000470000__000000574B7FF240 000000067F000040020000E000000046C000-000000067F000040020000E0000000470000__00000073AD3FE6B8 000000067F000040020000E000000046C000-000000067F000040020000E0000000470000__000000914E3F38F0 000000067F000040020000E000000046C000-000000067F000040020000E0000000470000__000000931B9A2710 000000067F000040020000E0000000470000-000000067F000040020000E0000000474000__000000574B7FF240 000000067F000040020000E0000000470000-000000067F000040020000E0000000474000__00000073AD3FE6B8 000000067F000040020000E0000000470000-000000067F000040020000E0000000474000__000000914E3F38F0 000000067F000040020000E0000000470000-000000067F000040020000E0000000474000__000000931B9A2710 000000067F000040020000E00000004718AA-000000067F000040020000E000000047A299__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000474000-000000067F000040020000E0000000478000__000000574B7FF240 000000067F000040020000E0000000474000-000000067F000040020000E0000000478000__00000073AD3FE6B8 000000067F000040020000E0000000474000-000000067F000040020000E0000000478000__000000914E3F38F0 000000067F000040020000E0000000474000-000000067F000040020000E0000000478000__000000931B9A2710 000000067F000040020000E0000000478000-000000067F000040020000E000000047C000__000000574B7FF240 000000067F000040020000E0000000478000-000000067F000040020000E000000047C000__00000073AD3FE6B8 
000000067F000040020000E0000000478000-000000067F000040020000E000000047C000__000000914E3F38F0 000000067F000040020000E0000000478000-000000067F000040020000E000000047C000__000000931B9A2710 000000067F000040020000E000000047A299-000000067F000040020000E0000000482C8C__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000047C000-000000067F000040020000E0000000480000__000000574B7FF240 000000067F000040020000E000000047C000-000000067F000040020000E0000000480000__00000073AD3FE6B8 000000067F000040020000E000000047C000-000000067F000040020000E0000000480000__000000914E3F38F0 000000067F000040020000E000000047C000-000000067F000040020000E0000000480000__000000931B9A2710 000000067F000040020000E0000000480000-000000067F000040020000E0000000484000__000000574B7FF240 000000067F000040020000E0000000480000-000000067F000040020000E0000000484000__00000073AD3FE6B8 000000067F000040020000E0000000480000-000000067F000040020000E0000000484000__000000914E3F38F0 000000067F000040020000E0000000480000-000000067F000040020000E0000000484000__000000931B9A2710 000000067F000040020000E0000000482C8C-000000067F000040020000E000000048B675__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000484000-000000067F000040020000E0000000488000__000000574B7FF240 000000067F000040020000E0000000484000-000000067F000040020000E0000000488000__00000073AD3FE6B8 000000067F000040020000E0000000484000-000000067F000040020000E0000000488000__000000914E3F38F0 000000067F000040020000E0000000484000-000000067F000040020000E0000000488000__000000931B9A2710 000000067F000040020000E0000000488000-000000067F000040020000E000000048C000__000000574B7FF240 000000067F000040020000E0000000488000-000000067F000040020000E000000048C000__00000073AD3FE6B8 000000067F000040020000E0000000488000-000000067F000040020000E000000048C000__000000914E3F38F0 000000067F000040020000E0000000488000-000000067F000040020000E000000048C000__000000931B9A2710 000000067F000040020000E000000048B675-000000067F000040020000E0000000494053__0000003B6A0FFB09-00000047441DEA39 
000000067F000040020000E000000048C000-000000067F000040020000E0000000490000__000000574B7FF240 000000067F000040020000E000000048C000-000000067F000040020000E0000000490000__00000073AD3FE6B8 000000067F000040020000E000000048C000-000000067F000040020000E0000000490000__000000914E3F38F0 000000067F000040020000E000000048C000-000000067F000040020000E0000000490000__000000931B9A2710 000000067F000040020000E0000000490000-000000067F000040020000E0000000494000__000000574B7FF240 000000067F000040020000E0000000490000-000000067F000040020000E0000000494000__00000073AD3FE6B8 000000067F000040020000E0000000490000-000000067F000040020000E0000000494000__000000914E3F38F0 000000067F000040020000E0000000490000-000000067F000040020000E0000000494000__000000931B9A2710 000000067F000040020000E0000000494000-000000067F000040020000E0000000498000__000000574B7FF240 000000067F000040020000E0000000494000-000000067F000040020000E0000000498000__00000073AD3FE6B8 000000067F000040020000E0000000494000-000000067F000040020000E0000000498000__000000914E3F38F0 000000067F000040020000E0000000494000-000000067F000040020000E0000000498000__000000931B9A2710 000000067F000040020000E0000000494053-000000067F000040020000E000000049CA16__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000498000-000000067F000040020000E000000049C000__000000574B7FF240 000000067F000040020000E0000000498000-000000067F000040020000E000000049C000__00000073AD3FE6B8 000000067F000040020000E0000000498000-000000067F000040020000E000000049C000__000000914E3F38F0 000000067F000040020000E0000000498000-000000067F000040020000E000000049C000__000000931B9A2710 000000067F000040020000E000000049C000-000000067F000040020000E00000004A0000__000000574B7FF240 000000067F000040020000E000000049C000-000000067F000040020000E00000004A0000__00000073AD3FE6B8 000000067F000040020000E000000049C000-000000067F000040020000E00000004A0000__000000914E3F38F0 000000067F000040020000E000000049C000-000000067F000040020000E00000004A0000__000000931B9A2710 
000000067F000040020000E000000049CA16-000000067F000040020000E00000004A53D6__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000004A0000-000000067F000040020000E00000004A4000__000000574B7FF240 000000067F000040020000E00000004A0000-000000067F000040020000E00000004A4000__00000073AD3FE6B8 000000067F000040020000E00000004A0000-000000067F000040020000E00000004A4000__000000914E3F38F0 000000067F000040020000E00000004A0000-000000067F000040020000E00000004A4000__000000931B9A2710 000000067F000040020000E00000004A4000-000000067F000040020000E00000004A8000__000000574B7FF240 000000067F000040020000E00000004A4000-000000067F000040020000E00000004A8000__00000073AD3FE6B8 000000067F000040020000E00000004A4000-000000067F000040020000E00000004A8000__000000914E3F38F0 000000067F000040020000E00000004A4000-000000067F000040020000E00000004A8000__000000931B9A2710 000000067F000040020000E00000004A53D6-000000067F000040020000E00000004ADDB9__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000004A8000-000000067F000040020000E00000004AC000__000000574B7FF240 000000067F000040020000E00000004A8000-000000067F000040020000E00000004AC000__00000073AD3FE6B8 000000067F000040020000E00000004A8000-000000067F000040020000E00000004AC000__000000914E3F38F0 000000067F000040020000E00000004A8000-000000067F000040020000E00000004AC000__000000931B9A2710 000000067F000040020000E00000004AC000-000000067F000040020000E00000004B0000__000000574B7FF240 000000067F000040020000E00000004AC000-000000067F000040020000E00000004B0000__00000073AD3FE6B8 000000067F000040020000E00000004AC000-000000067F000040020000E00000004B0000__000000914E3F38F0 000000067F000040020000E00000004AC000-000000067F000040020000E00000004B0000__000000931B9A2710 000000067F000040020000E00000004ADDB9-000000067F000040020000E00000004B67B7__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000004B0000-000000067F000040020000E00000004B4000__000000574B7FF240 000000067F000040020000E00000004B0000-000000067F000040020000E00000004B4000__00000073AD3FE6B8 
000000067F000040020000E00000004B0000-000000067F000040020000E00000004B4000__000000914E3F38F0 000000067F000040020000E00000004B0000-000000067F000040020000E00000004B4000__000000931B9A2710 000000067F000040020000E00000004B4000-000000067F000040020000E00000004B8000__000000574B7FF240 000000067F000040020000E00000004B4000-000000067F000040020000E00000004B8000__00000073AD3FE6B8 000000067F000040020000E00000004B4000-000000067F000040020000E00000004B8000__000000914E3F38F0 000000067F000040020000E00000004B4000-000000067F000040020000E00000004B8000__000000931B9A2710 000000067F000040020000E00000004B67B7-000000067F000040020000E00000004BF1AD__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000004B8000-000000067F000040020000E00000004BC000__000000574B7FF240 000000067F000040020000E00000004B8000-000000067F000040020000E00000004BC000__00000073AD3FE6B8 000000067F000040020000E00000004B8000-000000067F000040020000E00000004BC000__000000914E3F38F0 000000067F000040020000E00000004B8000-000000067F000040020000E00000004BC000__000000931B9A2710 000000067F000040020000E00000004BC000-000000067F000040020000E00000004C0000__000000574B7FF240 000000067F000040020000E00000004BC000-000000067F000040020000E00000004C0000__00000073AD3FE6B8 000000067F000040020000E00000004BC000-000000067F000040020000E00000004C0000__000000914E3F38F0 000000067F000040020000E00000004BC000-000000067F000040020000E00000004C0000__000000931B9A2710 000000067F000040020000E00000004BF1AD-000000067F000040020000E00000004C7B96__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000004C0000-000000067F000040020000E00000004C4000__000000574B7FF240 000000067F000040020000E00000004C0000-000000067F000040020000E00000004C4000__00000073AD3FE6B8 000000067F000040020000E00000004C0000-000000067F000040020000E00000004C4000__000000914E3F38F0 000000067F000040020000E00000004C0000-000000067F000040020000E00000004C4000__000000931B9A2710 000000067F000040020000E00000004C4000-000000067F000040020000E00000004C8000__000000574B7FF240 
000000067F000040020000E00000004C4000-000000067F000040020000E00000004C8000__00000073AD3FE6B8 000000067F000040020000E00000004C4000-000000067F000040020000E00000004C8000__000000914E3F38F0 000000067F000040020000E00000004C4000-000000067F000040020000E00000004C8000__000000931B9A2710 000000067F000040020000E00000004C7B96-000000067F000040020000E00000004D0568__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000004C8000-000000067F000040020000E00000004CC000__000000574B7FF240 000000067F000040020000E00000004C8000-000000067F000040020000E00000004CC000__00000073AD3FE6B8 000000067F000040020000E00000004C8000-000000067F000040020000E00000004CC000__000000914E3F38F0 000000067F000040020000E00000004C8000-000000067F000040020000E00000004CC000__000000931B9A2710 000000067F000040020000E00000004CC000-000000067F000040020000E00000004D0000__000000574B7FF240 000000067F000040020000E00000004CC000-000000067F000040020000E00000004D0000__00000073AD3FE6B8 000000067F000040020000E00000004CC000-000000067F000040020000E00000004D0000__000000914E3F38F0 000000067F000040020000E00000004CC000-000000067F000040020000E00000004D0000__000000931B9A2710 000000067F000040020000E00000004D0000-000000067F000040020000E00000004D4000__000000574B7FF240 000000067F000040020000E00000004D0000-000000067F000040020000E00000004D4000__00000073AD3FE6B8 000000067F000040020000E00000004D0000-000000067F000040020000E00000004D4000__000000914E3F38F0 000000067F000040020000E00000004D0000-000000067F000040020000E00000004D4000__000000931B9A2710 000000067F000040020000E00000004D0568-000000067F000040020000E00000004D8F2E__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000004D4000-000000067F000040020000E00000004D8000__000000574B7FF240 000000067F000040020000E00000004D4000-000000067F000040020000E00000004D8000__00000073AD3FE6B8 000000067F000040020000E00000004D4000-000000067F000040020000E00000004D8000__000000914E3F38F0 000000067F000040020000E00000004D4000-000000067F000040020000E00000004D8000__000000931B9A2710 
000000067F000040020000E00000004D8000-000000067F000040020000E00000004DC000__000000574B7FF240 000000067F000040020000E00000004D8000-000000067F000040020000E00000004DC000__00000073AD3FE6B8 000000067F000040020000E00000004D8000-000000067F000040020000E00000004DC000__000000914E3F38F0 000000067F000040020000E00000004D8000-000000067F000040020000E00000004DC000__000000931B9A2710 000000067F000040020000E00000004D8F2E-000000067F000040020000E00000004E18E6__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000004DC000-000000067F000040020000E00000004E0000__000000574B7FF240 000000067F000040020000E00000004DC000-000000067F000040020000E00000004E0000__00000073AD3FE6B8 000000067F000040020000E00000004DC000-000000067F000040020000E00000004E0000__000000914E3F38F0 000000067F000040020000E00000004DC000-000000067F000040020000E00000004E0000__000000931B9A2710 000000067F000040020000E00000004E0000-000000067F000040020000E00000004E4000__000000574B7FF240 000000067F000040020000E00000004E0000-000000067F000040020000E00000004E4000__00000073AD3FE6B8 000000067F000040020000E00000004E0000-000000067F000040020000E00000004E4000__000000914E3F38F0 000000067F000040020000E00000004E0000-000000067F000040020000E00000004E4000__000000931B9A2710 000000067F000040020000E00000004E18E6-000000067F000040020000E00000004EA2D3__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000004E4000-000000067F000040020000E00000004E8000__000000574B7FF240 000000067F000040020000E00000004E4000-000000067F000040020000E00000004E8000__00000073AD3FE6B8 000000067F000040020000E00000004E4000-000000067F000040020000E00000004E8000__000000914E3F38F0 000000067F000040020000E00000004E4000-000000067F000040020000E00000004E8000__000000931B9A2710 000000067F000040020000E00000004E8000-000000067F000040020000E00000004EC000__000000574B7FF240 000000067F000040020000E00000004E8000-000000067F000040020000E00000004EC000__00000073AD3FE6B8 000000067F000040020000E00000004E8000-000000067F000040020000E00000004EC000__000000914E3F38F0 
000000067F000040020000E00000004E8000-000000067F000040020000E00000004EC000__000000931B9A2710 000000067F000040020000E00000004EA2D3-000000067F000040020000E00000004F2CC7__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000004EC000-000000067F000040020000E00000004F0000__000000574B7FF240 000000067F000040020000E00000004EC000-000000067F000040020000E00000004F0000__00000073AD3FE6B8 000000067F000040020000E00000004EC000-000000067F000040020000E00000004F0000__000000914E3F38F0 000000067F000040020000E00000004EC000-000000067F000040020000E00000004F0000__000000931B9A2710 000000067F000040020000E00000004F0000-000000067F000040020000E00000004F4000__000000574B7FF240 000000067F000040020000E00000004F0000-000000067F000040020000E00000004F4000__00000073AD3FE6B8 000000067F000040020000E00000004F0000-000000067F000040020000E00000004F4000__000000914E3F38F0 000000067F000040020000E00000004F0000-000000067F000040020000E00000004F4000__000000931B9A2710 000000067F000040020000E00000004F2CC7-000000067F000040020000E00000004FB6B8__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000004F4000-000000067F000040020000E00000004F8000__000000574B7FF240 000000067F000040020000E00000004F4000-000000067F000040020000E00000004F8000__00000073AD3FE6B8 000000067F000040020000E00000004F4000-000000067F000040020000E00000004F8000__000000914E3F38F0 000000067F000040020000E00000004F4000-000000067F000040020000E00000004F8000__000000931B9A2710 000000067F000040020000E00000004F8000-000000067F000040020000E00000004FC000__000000574B7FF240 000000067F000040020000E00000004F8000-000000067F000040020000E00000004FC000__00000073AD3FE6B8 000000067F000040020000E00000004F8000-000000067F000040020000E00000004FC000__000000914E3F38F0 000000067F000040020000E00000004F8000-000000067F000040020000E00000004FC000__000000931B9A2710 000000067F000040020000E00000004FB6B8-000000067F000040020000E00000005040A3__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000004FC000-000000067F000040020000E0000000500000__000000574B7FF240 
000000067F000040020000E00000004FC000-000000067F000040020000E0000000500000__00000073AD3FE6B8 000000067F000040020000E00000004FC000-000000067F000040020000E0000000500000__000000914E3F38F0 000000067F000040020000E00000004FC000-000000067F000040020000E0000000500000__000000931B9A2710 000000067F000040020000E0000000500000-000000067F000040020000E0000000504000__000000574B7FF240 000000067F000040020000E0000000500000-000000067F000040020000E0000000504000__00000073AD3FE6B8 000000067F000040020000E0000000500000-000000067F000040020000E0000000504000__000000914E3F38F0 000000067F000040020000E0000000500000-000000067F000040020000E0000000504000__000000931B9A2710 000000067F000040020000E0000000504000-000000067F000040020000E0000000508000__000000574B7FF240 000000067F000040020000E0000000504000-000000067F000040020000E0000000508000__00000073AD3FE6B8 000000067F000040020000E0000000504000-000000067F000040020000E0000000508000__000000914E3F38F0 000000067F000040020000E0000000504000-000000067F000040020000E0000000508000__000000931B9A2710 000000067F000040020000E00000005040A3-000000067F000040020000E000000050CA7A__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000508000-000000067F000040020000E000000050C000__000000574B7FF240 000000067F000040020000E0000000508000-000000067F000040020000E000000050C000__00000073AD3FE6B8 000000067F000040020000E0000000508000-000000067F000040020000E000000050C000__000000914E3F38F0 000000067F000040020000E0000000508000-000000067F000040020000E000000050C000__000000931B9A2710 000000067F000040020000E000000050C000-000000067F000040020000E0000000510000__000000574B7FF240 000000067F000040020000E000000050C000-000000067F000040020000E0000000510000__00000073AD3FE6B8 000000067F000040020000E000000050C000-000000067F000040020000E0000000510000__000000914E3F38F0 000000067F000040020000E000000050C000-000000067F000040020000E0000000510000__000000931B9A2710 000000067F000040020000E000000050CA7A-000000067F000040020000E0000000515448__0000003B6A0FFB09-00000047441DEA39 
000000067F000040020000E0000000510000-000000067F000040020000E0000000514000__000000574B7FF240 000000067F000040020000E0000000510000-000000067F000040020000E0000000514000__00000073AD3FE6B8 000000067F000040020000E0000000510000-000000067F000040020000E0000000514000__000000914E3F38F0 000000067F000040020000E0000000510000-000000067F000040020000E0000000514000__000000931B9A2710 000000067F000040020000E0000000514000-000000067F000040020000E0000000518000__000000574B7FF240 000000067F000040020000E0000000514000-000000067F000040020000E0000000518000__00000073AD3FE6B8 000000067F000040020000E0000000514000-000000067F000040020000E0000000518000__000000914E3F38F0 000000067F000040020000E0000000514000-000000067F000040020000E0000000518000__000000931B9A2710 000000067F000040020000E0000000515448-000000067F000040020000E000000051DE01__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000518000-000000067F000040020000E000000051C000__000000574B7FF240 000000067F000040020000E0000000518000-000000067F000040020000E000000051C000__00000073AD3FE6B8 000000067F000040020000E0000000518000-000000067F000040020000E000000051C000__000000914E3F38F0 000000067F000040020000E0000000518000-000000067F000040020000E000000051C000__000000931B9A2710 000000067F000040020000E000000051C000-000000067F000040020000E0000000520000__000000574B7FF240 000000067F000040020000E000000051C000-000000067F000040020000E0000000520000__00000073AD3FE6B8 000000067F000040020000E000000051C000-000000067F000040020000E0000000520000__000000914E3F38F0 000000067F000040020000E000000051C000-000000067F000040020000E0000000520000__000000931B9A2710 000000067F000040020000E000000051DE01-000000067F000040020000E00000005267E4__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000520000-000000067F000040020000E0000000524000__000000574B7FF240 000000067F000040020000E0000000520000-000000067F000040020000E0000000524000__00000073AD3FE6B8 000000067F000040020000E0000000520000-000000067F000040020000E0000000524000__000000914E3F38F0 
000000067F000040020000E0000000520000-000000067F000040020000E0000000524000__000000931B9A2710 000000067F000040020000E0000000524000-000000067F000040020000E0000000528000__000000574B7FF240 000000067F000040020000E0000000524000-000000067F000040020000E0000000528000__00000073AD3FE6B8 000000067F000040020000E0000000524000-000000067F000040020000E0000000528000__000000914E3F38F0 000000067F000040020000E0000000524000-000000067F000040020000E0000000528000__000000931B9A2710 000000067F000040020000E00000005267E4-000000067F000040020000E000000052F1DD__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000528000-000000067F000040020000E000000052C000__000000574B7FF240 000000067F000040020000E0000000528000-000000067F000040020000E000000052C000__00000073AD3FE6B8 000000067F000040020000E0000000528000-000000067F000040020000E000000052C000__000000914E3F38F0 000000067F000040020000E0000000528000-000000067F000040020000E000000052C000__000000931B9A2710 000000067F000040020000E000000052C000-000000067F000040020000E0000000530000__000000574B7FF240 000000067F000040020000E000000052C000-000000067F000040020000E0000000530000__00000073AD3FE6B8 000000067F000040020000E000000052C000-000000067F000040020000E0000000530000__000000914E3F38F0 000000067F000040020000E000000052C000-000000067F000040020000E0000000530000__000000931B9A2710 000000067F000040020000E000000052F1DD-000000067F000040020000E0000000537BD3__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000530000-000000067F000040020000E0000000534000__000000574B7FF240 000000067F000040020000E0000000530000-000000067F000040020000E0000000534000__00000073AD3FE6B8 000000067F000040020000E0000000530000-000000067F000040020000E0000000534000__000000914E3F38F0 000000067F000040020000E0000000530000-000000067F000040020000E0000000534000__000000931B9A2710 000000067F000040020000E0000000534000-000000067F000040020000E0000000538000__000000574B7FF240 000000067F000040020000E0000000534000-000000067F000040020000E0000000538000__00000073AD3FE6B8 
000000067F000040020000E0000000534000-000000067F000040020000E0000000538000__000000914E3F38F0 000000067F000040020000E0000000534000-000000067F000040020000E0000000538000__000000931B9A2710 000000067F000040020000E0000000537BD3-000000067F000040020000E00000005405B7__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000538000-000000067F000040020000E000000053C000__000000574B7FF240 000000067F000040020000E0000000538000-000000067F000040020000E000000053C000__00000073AD3FE6B8 000000067F000040020000E0000000538000-000000067F000040020000E000000053C000__000000914E3F38F0 000000067F000040020000E0000000538000-000000067F000040020000E000000053C000__000000931B9A2710 000000067F000040020000E000000053C000-000000067F000040020000E0000000540000__000000574B7FF240 000000067F000040020000E000000053C000-000000067F000040020000E0000000540000__00000073AD3FE6B8 000000067F000040020000E000000053C000-000000067F000040020000E0000000540000__000000914E3F38F0 000000067F000040020000E000000053C000-000000067F000040020000E0000000540000__000000931B9A2710 000000067F000040020000E0000000540000-000000067F000040020000E0000000544000__000000574B7FF240 000000067F000040020000E0000000540000-000000067F000040020000E0000000544000__00000073AD3FE6B8 000000067F000040020000E0000000540000-000000067F000040020000E0000000544000__000000914E3F38F0 000000067F000040020000E0000000540000-000000067F000040020000E0000000544000__000000931B9A2710 000000067F000040020000E00000005405B7-000000067F000040020000E0000000548F92__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000544000-000000067F000040020000E0000000548000__000000574B7FF240 000000067F000040020000E0000000544000-000000067F000040020000E0000000548000__00000073AD3FE6B8 000000067F000040020000E0000000544000-000000067F000040020000E0000000548000__000000914E3F38F0 000000067F000040020000E0000000544000-000000067F000040020000E0000000548000__000000931B9A2710 000000067F000040020000E0000000548000-000000067F000040020000E000000054C000__000000574B7FF240 
000000067F000040020000E0000000548000-000000067F000040020000E000000054C000__00000073AD3FE6B8 000000067F000040020000E0000000548000-000000067F000040020000E000000054C000__000000914E3F38F0 000000067F000040020000E0000000548000-000000067F000040020000E000000054C000__000000931B9A2710 000000067F000040020000E0000000548F92-000000067F000040020000E000000055195C__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000054C000-000000067F000040020000E0000000550000__000000574B7FF240 000000067F000040020000E000000054C000-000000067F000040020000E0000000550000__00000073AD3FE6B8 000000067F000040020000E000000054C000-000000067F000040020000E0000000550000__000000914E3F38F0 000000067F000040020000E000000054C000-000000067F000040020000E0000000550000__000000931B9A2710 000000067F000040020000E0000000550000-000000067F000040020000E0000000554000__000000574B7FF240 000000067F000040020000E0000000550000-000000067F000040020000E0000000554000__00000073AD3FE6B8 000000067F000040020000E0000000550000-000000067F000040020000E0000000554000__000000914E3F38F0 000000067F000040020000E0000000550000-000000067F000040020000E0000000554000__000000931B9A2710 000000067F000040020000E000000055195C-000000067F000040020000E000000055A319__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000554000-000000067F000040020000E0000000558000__000000574B7FF240 000000067F000040020000E0000000554000-000000067F000040020000E0000000558000__00000073AD3FE6B8 000000067F000040020000E0000000554000-000000067F000040020000E0000000558000__000000914E3F38F0 000000067F000040020000E0000000554000-000000067F000040020000E0000000558000__000000931B9A2710 000000067F000040020000E0000000558000-000000067F000040020000E000000055C000__000000574B7FF240 000000067F000040020000E0000000558000-000000067F000040020000E000000055C000__00000073AD3FE6B8 000000067F000040020000E0000000558000-000000067F000040020000E000000055C000__000000914E3F38F0 000000067F000040020000E0000000558000-000000067F000040020000E000000055C000__000000931B9A2710 
000000067F000040020000E000000055A319-000000067F000040020000E0000000562D04__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000055C000-000000067F000040020000E0000000560000__000000574B7FF240 000000067F000040020000E000000055C000-000000067F000040020000E0000000560000__00000073AD3FE6B8 000000067F000040020000E000000055C000-000000067F000040020000E0000000560000__000000914E3F38F0 000000067F000040020000E000000055C000-000000067F000040020000E0000000560000__000000931B9A2710 000000067F000040020000E0000000560000-000000067F000040020000E0000000564000__000000574B7FF240 000000067F000040020000E0000000560000-000000067F000040020000E0000000564000__00000073AD3FE6B8 000000067F000040020000E0000000560000-000000067F000040020000E0000000564000__000000914E3F38F0 000000067F000040020000E0000000560000-000000067F000040020000E0000000564000__000000931B9A2710 000000067F000040020000E0000000562D04-000000067F000040020000E000000056B6E9__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000564000-000000067F000040020000E0000000568000__000000574B7FF240 000000067F000040020000E0000000564000-000000067F000040020000E0000000568000__00000073AD3FE6B8 000000067F000040020000E0000000564000-000000067F000040020000E0000000568000__000000914E3F38F0 000000067F000040020000E0000000564000-000000067F000040020000E0000000568000__000000931B9A2710 000000067F000040020000E0000000568000-000000067F000040020000E000000056C000__000000574B7FF240 000000067F000040020000E0000000568000-000000067F000040020000E000000056C000__00000073AD3FE6B8 000000067F000040020000E0000000568000-000000067F000040020000E000000056C000__000000914E3F38F0 000000067F000040020000E0000000568000-000000067F000040020000E000000056C000__000000931B9A2710 000000067F000040020000E000000056B6E9-000000067F000040020000E00000005740DF__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000056C000-000000067F000040020000E0000000570000__000000574B7FF240 000000067F000040020000E000000056C000-000000067F000040020000E0000000570000__00000073AD3FE6B8 
000000067F000040020000E000000056C000-000000067F000040020000E0000000570000__000000914E3F38F0 000000067F000040020000E000000056C000-000000067F000040020000E0000000570000__000000931B9A2710 000000067F000040020000E0000000570000-000000067F000040020000E0000000574000__000000574B7FF240 000000067F000040020000E0000000570000-000000067F000040020000E0000000574000__00000073AD3FE6B8 000000067F000040020000E0000000570000-000000067F000040020000E0000000574000__000000914E3F38F0 000000067F000040020000E0000000570000-000000067F000040020000E0000000574000__000000931B9A2710 000000067F000040020000E0000000574000-000000067F000040020000E0000000578000__000000574B7FF240 000000067F000040020000E0000000574000-000000067F000040020000E0000000578000__00000073AD3FE6B8 000000067F000040020000E0000000574000-000000067F000040020000E0000000578000__000000914E3F38F0 000000067F000040020000E0000000574000-000000067F000040020000E0000000578000__000000931B9A2710 000000067F000040020000E00000005740DF-000000067F000040020000E000000057CAB9__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000578000-000000067F000040020000E000000057C000__000000574B7FF240 000000067F000040020000E0000000578000-000000067F000040020000E000000057C000__00000073AD3FE6B8 000000067F000040020000E0000000578000-000000067F000040020000E000000057C000__000000914E3F38F0 000000067F000040020000E0000000578000-000000067F000040020000E000000057C000__000000931B9A2710 000000067F000040020000E000000057C000-000000067F000040020000E0000000580000__000000574B7FF240 000000067F000040020000E000000057C000-000000067F000040020000E0000000580000__00000073AD3FE6B8 000000067F000040020000E000000057C000-000000067F000040020000E0000000580000__000000914E3F38F0 000000067F000040020000E000000057C000-000000067F000040020000E0000000580000__000000931B9A2710 000000067F000040020000E000000057CAB9-000000067F000040020000E0000000585495__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000580000-000000067F000040020000E0000000584000__000000574B7FF240 
000000067F000040020000E0000000580000-000000067F000040020000E0000000584000__00000073AD3FE6B8 000000067F000040020000E0000000580000-000000067F000040020000E0000000584000__000000914E3F38F0 000000067F000040020000E0000000580000-000000067F000040020000E0000000584000__000000931B9A2710 000000067F000040020000E0000000584000-000000067F000040020000E0000000588000__000000574B7FF240 000000067F000040020000E0000000584000-000000067F000040020000E0000000588000__00000073AD3FE6B8 000000067F000040020000E0000000584000-000000067F000040020000E0000000588000__000000914E3F38F0 000000067F000040020000E0000000584000-000000067F000040020000E0000000588000__000000931B9A2710 000000067F000040020000E0000000585495-000000067F000040020000E000000058DE64__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000588000-000000067F000040020000E000000058C000__000000574B7FF240 000000067F000040020000E0000000588000-000000067F000040020000E000000058C000__00000073AD3FE6B8 000000067F000040020000E0000000588000-000000067F000040020000E000000058C000__000000914E3F38F0 000000067F000040020000E0000000588000-000000067F000040020000E000000058C000__000000931B9A2710 000000067F000040020000E000000058C000-000000067F000040020000E0000000590000__000000574B7FF240 000000067F000040020000E000000058C000-000000067F000040020000E0000000590000__00000073AD3FE6B8 000000067F000040020000E000000058C000-000000067F000040020000E0000000590000__000000914E3F38F0 000000067F000040020000E000000058C000-000000067F000040020000E0000000590000__000000931B9A2710 000000067F000040020000E000000058DE64-000000067F000040020000E000000059682F__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000590000-000000067F000040020000E0000000594000__000000574B7FF240 000000067F000040020000E0000000590000-000000067F000040020000E0000000594000__00000073AD3FE6B8 000000067F000040020000E0000000590000-000000067F000040020000E0000000594000__000000914E3F38F0 000000067F000040020000E0000000590000-000000067F000040020000E0000000594000__000000931B9A2710 
000000067F000040020000E0000000594000-000000067F000040020000E0000000598000__000000574B7FF240 000000067F000040020000E0000000594000-000000067F000040020000E0000000598000__00000073AD3FE6B8 000000067F000040020000E0000000594000-000000067F000040020000E0000000598000__000000914E3F38F0 000000067F000040020000E0000000594000-000000067F000040020000E0000000598000__000000931B9A2710 000000067F000040020000E000000059682F-000000067F000040020000E000000059F20F__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000598000-000000067F000040020000E000000059C000__000000574B7FF240 000000067F000040020000E0000000598000-000000067F000040020000E000000059C000__00000073AD3FE6B8 000000067F000040020000E0000000598000-000000067F000040020000E000000059C000__000000914E3F38F0 000000067F000040020000E0000000598000-000000067F000040020000E000000059C000__000000931B9A2710 000000067F000040020000E000000059C000-000000067F000040020000E00000005A0000__000000574B7FF240 000000067F000040020000E000000059C000-000000067F000040020000E00000005A0000__00000073AD3FE6B8 000000067F000040020000E000000059C000-000000067F000040020000E00000005A0000__000000914E3F38F0 000000067F000040020000E000000059C000-000000067F000040020000E00000005A0000__000000931B9A2710 000000067F000040020000E000000059F20F-000000067F000040020000E00000005A7BFC__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000005A0000-000000067F000040020000E00000005A4000__000000574B7FF240 000000067F000040020000E00000005A0000-000000067F000040020000E00000005A4000__00000073AD3FE6B8 000000067F000040020000E00000005A0000-000000067F000040020000E00000005A4000__000000914E3F38F0 000000067F000040020000E00000005A0000-000000067F000040020000E00000005A4000__000000931B9A2710 000000067F000040020000E00000005A4000-000000067F000040020000E00000005A8000__000000574B7FF240 000000067F000040020000E00000005A4000-000000067F000040020000E00000005A8000__00000073AD3FE6B8 000000067F000040020000E00000005A4000-000000067F000040020000E00000005A8000__000000914E3F38F0 
000000067F000040020000E00000005A4000-000000067F000040020000E00000005A8000__000000931B9A2710 000000067F000040020000E00000005A7BFC-000000067F000040020000E00000005B05EF__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000005A8000-000000067F000040020000E00000005AC000__000000574B7FF240 000000067F000040020000E00000005A8000-000000067F000040020000E00000005AC000__00000073AD3FE6B8 000000067F000040020000E00000005A8000-000000067F000040020000E00000005AC000__000000914E3F38F0 000000067F000040020000E00000005A8000-000000067F000040020000E00000005AC000__000000931B9A2710 000000067F000040020000E00000005AC000-000000067F000040020000E00000005B0000__000000574B7FF240 000000067F000040020000E00000005AC000-000000067F000040020000E00000005B0000__00000073AD3FE6B8 000000067F000040020000E00000005AC000-000000067F000040020000E00000005B0000__000000914E3F38F0 000000067F000040020000E00000005AC000-000000067F000040020000E00000005B0000__000000931B9A2710 000000067F000040020000E00000005B0000-000000067F000040020000E00000005B4000__000000574B7FF240 000000067F000040020000E00000005B0000-000000067F000040020000E00000005B4000__00000073AD3FE6B8 000000067F000040020000E00000005B0000-000000067F000040020000E00000005B4000__000000914E3F38F0 000000067F000040020000E00000005B0000-000000067F000040020000E00000005B4000__000000931B9A2710 000000067F000040020000E00000005B05EF-000000067F000040020000E00000005B8FCE__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000005B4000-000000067F000040020000E00000005B8000__000000574B7FF240 000000067F000040020000E00000005B4000-000000067F000040020000E00000005B8000__00000073AD3FE6B8 000000067F000040020000E00000005B4000-000000067F000040020000E00000005B8000__000000914E3F38F0 000000067F000040020000E00000005B4000-000000067F000040020000E00000005B8000__000000931B9A2710 000000067F000040020000E00000005B8000-000000067F000040020000E00000005BC000__000000574B7FF240 000000067F000040020000E00000005B8000-000000067F000040020000E00000005BC000__00000073AD3FE6B8 
000000067F000040020000E00000005B8000-000000067F000040020000E00000005BC000__000000914E3F38F0 000000067F000040020000E00000005B8000-000000067F000040020000E00000005BC000__000000931B9A2710 000000067F000040020000E00000005B8FCE-000000067F000040020000E00000005C19AA__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000005BC000-000000067F000040020000E00000005C0000__000000574B7FF240 000000067F000040020000E00000005BC000-000000067F000040020000E00000005C0000__00000073AD3FE6B8 000000067F000040020000E00000005BC000-000000067F000040020000E00000005C0000__000000914E3F38F0 000000067F000040020000E00000005BC000-000000067F000040020000E00000005C0000__000000931B9A2710 000000067F000040020000E00000005C0000-000000067F000040020000E00000005C4000__000000574B7FF240 000000067F000040020000E00000005C0000-000000067F000040020000E00000005C4000__00000073AD3FE6B8 000000067F000040020000E00000005C0000-000000067F000040020000E00000005C4000__000000914E3F38F0 000000067F000040020000E00000005C0000-000000067F000040020000E00000005C4000__000000931B9A2710 000000067F000040020000E00000005C19AA-000000067F000040020000E00000005CA378__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000005C4000-000000067F000040020000E00000005C8000__000000574B7FF240 000000067F000040020000E00000005C4000-000000067F000040020000E00000005C8000__00000073AD3FE6B8 000000067F000040020000E00000005C4000-000000067F000040020000E00000005C8000__000000914E3F38F0 000000067F000040020000E00000005C4000-000000067F000040020000E00000005C8000__000000931B9A2710 000000067F000040020000E00000005C8000-000000067F000040020000E00000005CC000__000000574B7FF240 000000067F000040020000E00000005C8000-000000067F000040020000E00000005CC000__00000073AD3FE6B8 000000067F000040020000E00000005C8000-000000067F000040020000E00000005CC000__000000914E3F38F0 000000067F000040020000E00000005C8000-000000067F000040020000E00000005CC000__000000931B9A2710 000000067F000040020000E00000005CA378-000000067F000040020000E00000005D2D45__0000003B6A0FFB09-00000047441DEA39 
000000067F000040020000E00000005CC000-000000067F000040020000E00000005D0000__000000574B7FF240 000000067F000040020000E00000005CC000-000000067F000040020000E00000005D0000__00000073AD3FE6B8 000000067F000040020000E00000005CC000-000000067F000040020000E00000005D0000__000000914E3F38F0 000000067F000040020000E00000005CC000-000000067F000040020000E00000005D0000__000000931B9A2710 000000067F000040020000E00000005D0000-000000067F000040020000E00000005D4000__000000574B7FF240 000000067F000040020000E00000005D0000-000000067F000040020000E00000005D4000__00000073AD3FE6B8 000000067F000040020000E00000005D0000-000000067F000040020000E00000005D4000__000000914E3F38F0 000000067F000040020000E00000005D0000-000000067F000040020000E00000005D4000__000000931B9A2710 000000067F000040020000E00000005D2D45-000000067F000040020000E00000005DB728__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000005D4000-000000067F000040020000E00000005D8000__000000574B7FF240 000000067F000040020000E00000005D4000-000000067F000040020000E00000005D8000__00000073AD3FE6B8 000000067F000040020000E00000005D4000-000000067F000040020000E00000005D8000__000000914E3F38F0 000000067F000040020000E00000005D4000-000000067F000040020000E00000005D8000__000000931B9A2710 000000067F000040020000E00000005D8000-000000067F000040020000E00000005DC000__000000574B7FF240 000000067F000040020000E00000005D8000-000000067F000040020000E00000005DC000__00000073AD3FE6B8 000000067F000040020000E00000005D8000-000000067F000040020000E00000005DC000__000000914E3F38F0 000000067F000040020000E00000005D8000-000000067F000040020000E00000005DC000__000000931B9A2710 000000067F000040020000E00000005DB728-000000067F000040020000E00000005E4114__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000005DC000-000000067F000040020000E00000005E0000__000000574B7FF240 000000067F000040020000E00000005DC000-000000067F000040020000E00000005E0000__00000073AD3FE6B8 000000067F000040020000E00000005DC000-000000067F000040020000E00000005E0000__000000914E3F38F0 
000000067F000040020000E00000005DC000-000000067F000040020000E00000005E0000__000000931B9A2710 000000067F000040020000E00000005E0000-000000067F000040020000E00000005E4000__000000574B7FF240 000000067F000040020000E00000005E0000-000000067F000040020000E00000005E4000__00000073AD3FE6B8 000000067F000040020000E00000005E0000-000000067F000040020000E00000005E4000__000000914E3F38F0 000000067F000040020000E00000005E0000-000000067F000040020000E00000005E4000__000000931B9A2710 000000067F000040020000E00000005E4000-000000067F000040020000E00000005E8000__000000574B7FF240 000000067F000040020000E00000005E4000-000000067F000040020000E00000005E8000__00000073AD3FE6B8 000000067F000040020000E00000005E4000-000000067F000040020000E00000005E8000__000000914E3F38F0 000000067F000040020000E00000005E4000-000000067F000040020000E00000005E8000__000000931B9A2710 000000067F000040020000E00000005E4114-000000067F000040020000E00000005ECAF0__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000005E8000-000000067F000040020000E00000005EC000__000000574B7FF240 000000067F000040020000E00000005E8000-000000067F000040020000E00000005EC000__00000073AD3FE6B8 000000067F000040020000E00000005E8000-000000067F000040020000E00000005EC000__000000914E3F38F0 000000067F000040020000E00000005E8000-000000067F000040020000E00000005EC000__000000931B9A2710 000000067F000040020000E00000005EC000-000000067F000040020000E00000005F0000__000000574B7FF240 000000067F000040020000E00000005EC000-000000067F000040020000E00000005F0000__00000073AD3FE6B8 000000067F000040020000E00000005EC000-000000067F000040020000E00000005F0000__000000914E3F38F0 000000067F000040020000E00000005EC000-000000067F000040020000E00000005F0000__000000931B9A2710 000000067F000040020000E00000005ECAF0-000000067F000040020000E00000005F54D3__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000005F0000-000000067F000040020000E00000005F4000__000000574B7FF240 000000067F000040020000E00000005F0000-000000067F000040020000E00000005F4000__00000073AD3FE6B8 
000000067F000040020000E00000005F0000-000000067F000040020000E00000005F4000__000000914E3F38F0 000000067F000040020000E00000005F0000-000000067F000040020000E00000005F4000__000000931B9A2710 000000067F000040020000E00000005F4000-000000067F000040020000E00000005F8000__000000574B7FF240 000000067F000040020000E00000005F4000-000000067F000040020000E00000005F8000__00000073AD3FE6B8 000000067F000040020000E00000005F4000-000000067F000040020000E00000005F8000__000000914E3F38F0 000000067F000040020000E00000005F4000-000000067F000040020000E00000005F8000__000000931B9A2710 000000067F000040020000E00000005F54D3-000000067F000040020000E00000005FDEAC__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000005F8000-000000067F000040020000E00000005FC000__000000574B7FF240 000000067F000040020000E00000005F8000-000000067F000040020000E00000005FC000__00000073AD3FE6B8 000000067F000040020000E00000005F8000-000000067F000040020000E00000005FC000__000000914E3F38F0 000000067F000040020000E00000005F8000-000000067F000040020000E00000005FC000__000000931B9A2710 000000067F000040020000E00000005FC000-000000067F000040020000E0000000600000__000000574B7FF240 000000067F000040020000E00000005FC000-000000067F000040020000E0000000600000__00000073AD3FE6B8 000000067F000040020000E00000005FC000-000000067F000040020000E0000000600000__000000914E3F38F0 000000067F000040020000E00000005FC000-000000067F000040020000E0000000600000__000000931B9A2710 000000067F000040020000E00000005FDEAC-000000067F000040020000E000000060687C__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000600000-000000067F000040020000E0000000604000__000000574B7FF240 000000067F000040020000E0000000600000-000000067F000040020000E0000000604000__00000073AD3FE6B8 000000067F000040020000E0000000600000-000000067F000040020000E0000000604000__000000914E3F38F0 000000067F000040020000E0000000600000-000000067F000040020000E0000000604000__000000931B9A2710 000000067F000040020000E0000000604000-000000067F000040020000E0000000608000__000000574B7FF240 
000000067F000040020000E0000000604000-000000067F000040020000E0000000608000__00000073AD3FE6B8 000000067F000040020000E0000000604000-000000067F000040020000E0000000608000__000000914E3F38F0 000000067F000040020000E0000000604000-000000067F000040020000E0000000608000__000000931B9A2710 000000067F000040020000E000000060687C-000000067F000040020000E000000060F25A__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000608000-000000067F000040020000E000000060C000__000000574B7FF240 000000067F000040020000E0000000608000-000000067F000040020000E000000060C000__00000073AD3FE6B8 000000067F000040020000E0000000608000-000000067F000040020000E000000060C000__000000914E3F38F0 000000067F000040020000E0000000608000-000000067F000040020000E000000060C000__000000931B9A2710 000000067F000040020000E000000060C000-000000067F000040020000E0000000610000__000000574B7FF240 000000067F000040020000E000000060C000-000000067F000040020000E0000000610000__00000073AD3FE6B8 000000067F000040020000E000000060C000-000000067F000040020000E0000000610000__000000914E3F38F0 000000067F000040020000E000000060C000-000000067F000040020000E0000000610000__000000931B9A2710 000000067F000040020000E000000060F25A-000000067F000040020000E0000000617C3B__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000610000-000000067F000040020000E0000000614000__000000574B7FF240 000000067F000040020000E0000000610000-000000067F000040020000E0000000614000__00000073AD3FE6B8 000000067F000040020000E0000000610000-000000067F000040020000E0000000614000__000000914E3F38F0 000000067F000040020000E0000000610000-000000067F000040020000E0000000614000__000000931B9A2710 000000067F000040020000E0000000614000-000000067F000040020000E0000000618000__000000574B7FF240 000000067F000040020000E0000000614000-000000067F000040020000E0000000618000__00000073AD3FE6B8 000000067F000040020000E0000000614000-000000067F000040020000E0000000618000__000000914E3F38F0 000000067F000040020000E0000000614000-000000067F000040020000E0000000618000__000000931B9A2710 
000000067F000040020000E0000000617C3B-000000067F000040020000E0000000620625__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000618000-000000067F000040020000E000000061C000__000000574B7FF240 000000067F000040020000E0000000618000-000000067F000040020000E000000061C000__00000073AD3FE6B8 000000067F000040020000E0000000618000-000000067F000040020000E000000061C000__000000914E3F38F0 000000067F000040020000E0000000618000-000000067F000040020000E000000061C000__000000931B9A2710 000000067F000040020000E000000061C000-000000067F000040020000E0000000620000__000000574B7FF240 000000067F000040020000E000000061C000-000000067F000040020000E0000000620000__00000073AD3FE6B8 000000067F000040020000E000000061C000-000000067F000040020000E0000000620000__000000914E3F38F0 000000067F000040020000E000000061C000-000000067F000040020000E0000000620000__000000931B9A2710 000000067F000040020000E0000000620000-000000067F000040020000E0000000624000__000000574B7FF240 000000067F000040020000E0000000620000-000000067F000040020000E0000000624000__00000073AD3FE6B8 000000067F000040020000E0000000620000-000000067F000040020000E0000000624000__000000914E3F38F0 000000067F000040020000E0000000620000-000000067F000040020000E0000000624000__000000931B9A2710 000000067F000040020000E0000000620625-000000067F000040020000E0000000628FFC__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000624000-000000067F000040020000E0000000628000__000000574B7FF240 000000067F000040020000E0000000624000-000000067F000040020000E0000000628000__00000073AD3FE6B8 000000067F000040020000E0000000624000-000000067F000040020000E0000000628000__000000914E3F38F0 000000067F000040020000E0000000624000-000000067F000040020000E0000000628000__000000931B9A2710 000000067F000040020000E0000000628000-000000067F000040020000E000000062C000__000000574B7FF240 000000067F000040020000E0000000628000-000000067F000040020000E000000062C000__00000073AD3FE6B8 000000067F000040020000E0000000628000-000000067F000040020000E000000062C000__000000914E3F38F0 
000000067F000040020000E0000000628000-000000067F000040020000E000000062C000__000000931B9A2710 000000067F000040020000E0000000628FFC-000000067F000040020000E00000006319E0__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000062C000-000000067F000040020000E0000000630000__000000574B7FF240 000000067F000040020000E000000062C000-000000067F000040020000E0000000630000__00000073AD3FE6B8 000000067F000040020000E000000062C000-000000067F000040020000E0000000630000__000000914E3F38F0 000000067F000040020000E000000062C000-000000067F000040020000E0000000630000__000000931B9A2710 000000067F000040020000E0000000630000-000000067F000040020000E0000000634000__000000574B7FF240 000000067F000040020000E0000000630000-000000067F000040020000E0000000634000__00000073AD3FE6B8 000000067F000040020000E0000000630000-000000067F000040020000E0000000634000__000000914E3F38F0 000000067F000040020000E0000000630000-000000067F000040020000E0000000634000__000000931B9A2710 000000067F000040020000E00000006319E0-000000067F000040020000E000000063A3B8__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000634000-000000067F000040020000E0000000638000__000000574B7FF240 000000067F000040020000E0000000634000-000000067F000040020000E0000000638000__00000073AD3FE6B8 000000067F000040020000E0000000634000-000000067F000040020000E0000000638000__000000914E3F38F0 000000067F000040020000E0000000634000-000000067F000040020000E0000000638000__000000931B9A2710 000000067F000040020000E0000000638000-000000067F000040020000E000000063C000__000000574B7FF240 000000067F000040020000E0000000638000-000000067F000040020000E000000063C000__00000073AD3FE6B8 000000067F000040020000E0000000638000-000000067F000040020000E000000063C000__000000914E3F38F0 000000067F000040020000E0000000638000-000000067F000040020000E000000063C000__000000931B9A2710 000000067F000040020000E000000063A3B8-000000067F000040020000E0000000642D80__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000063C000-000000067F000040020000E0000000640000__000000574B7FF240 
000000067F000040020000E000000063C000-000000067F000040020000E0000000640000__00000073AD3FE6B8 000000067F000040020000E000000063C000-000000067F000040020000E0000000640000__000000914E3F38F0 000000067F000040020000E000000063C000-000000067F000040020000E0000000640000__000000931B9A2710 000000067F000040020000E0000000640000-000000067F000040020000E0000000644000__000000574B7FF240 000000067F000040020000E0000000640000-000000067F000040020000E0000000644000__00000073AD3FE6B8 000000067F000040020000E0000000640000-000000067F000040020000E0000000644000__000000914E3F38F0 000000067F000040020000E0000000640000-000000067F000040020000E0000000644000__000000931B9A2710 000000067F000040020000E0000000642D80-000000067F000040020000E000000064B762__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000644000-000000067F000040020000E0000000648000__000000574B7FF240 000000067F000040020000E0000000644000-000000067F000040020000E0000000648000__00000073AD3FE6B8 000000067F000040020000E0000000644000-000000067F000040020000E0000000648000__000000914E3F38F0 000000067F000040020000E0000000644000-000000067F000040020000E0000000648000__000000931B9A2710 000000067F000040020000E0000000648000-000000067F000040020000E000000064C000__000000574B7FF240 000000067F000040020000E0000000648000-000000067F000040020000E000000064C000__00000073AD3FE6B8 000000067F000040020000E0000000648000-000000067F000040020000E000000064C000__000000914E3F38F0 000000067F000040020000E0000000648000-000000067F000040020000E000000064C000__000000931B9A2710 000000067F000040020000E000000064B762-000000067F000040020000E000000065415B__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000064C000-000000067F000040020000E0000000650000__000000574B7FF240 000000067F000040020000E000000064C000-000000067F000040020000E0000000650000__00000073AD3FE6B8 000000067F000040020000E000000064C000-000000067F000040020000E0000000650000__000000914E3F38F0 000000067F000040020000E000000064C000-000000067F000040020000E0000000650000__000000931B9A2710 
000000067F000040020000E0000000650000-000000067F000040020000E0000000654000__000000574B7FF240 000000067F000040020000E0000000650000-000000067F000040020000E0000000654000__00000073AD3FE6B8 000000067F000040020000E0000000650000-000000067F000040020000E0000000654000__000000914E3F38F0 000000067F000040020000E0000000650000-000000067F000040020000E0000000654000__000000931B9A2710 000000067F000040020000E0000000654000-000000067F000040020000E0000000658000__000000574B7FF240 000000067F000040020000E0000000654000-000000067F000040020000E0000000658000__00000073AD3FE6B8 000000067F000040020000E0000000654000-000000067F000040020000E0000000658000__000000914E3F38F0 000000067F000040020000E0000000654000-000000067F000040020000E0000000658000__000000931B9A2710 000000067F000040020000E000000065415B-000000067F000040020000E000000065CB43__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000658000-000000067F000040020000E000000065C000__000000574B7FF240 000000067F000040020000E0000000658000-000000067F000040020000E000000065C000__00000073AD3FE6B8 000000067F000040020000E0000000658000-000000067F000040020000E000000065C000__000000914E3F38F0 000000067F000040020000E0000000658000-000000067F000040020000E000000065C000__000000931B9A2710 000000067F000040020000E000000065C000-000000067F000040020000E0000000660000__000000574B7FF240 000000067F000040020000E000000065C000-000000067F000040020000E0000000660000__00000073AD3FE6B8 000000067F000040020000E000000065C000-000000067F000040020000E0000000660000__000000914E3F38F0 000000067F000040020000E000000065C000-000000067F000040020000E0000000660000__000000931B9A2710 000000067F000040020000E000000065CB43-000000067F000040020000E0000000665527__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000660000-000000067F000040020000E0000000664000__000000574B7FF240 000000067F000040020000E0000000660000-000000067F000040020000E0000000664000__00000073AD3FE6B8 000000067F000040020000E0000000660000-000000067F000040020000E0000000664000__000000914E3F38F0 
000000067F000040020000E0000000660000-000000067F000040020000E0000000664000__000000931B9A2710 000000067F000040020000E0000000664000-000000067F000040020000E0000000668000__000000574B7FF240 000000067F000040020000E0000000664000-000000067F000040020000E0000000668000__00000073AD3FE6B8 000000067F000040020000E0000000664000-000000067F000040020000E0000000668000__000000914E3F38F0 000000067F000040020000E0000000664000-000000067F000040020000E0000000668000__000000931B9A2710 000000067F000040020000E0000000665527-000000067F000040020000E000000066DEEE__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000668000-000000067F000040020000E000000066C000__000000574B7FF240 000000067F000040020000E0000000668000-000000067F000040020000E000000066C000__00000073AD3FE6B8 000000067F000040020000E0000000668000-000000067F000040020000E000000066C000__000000914E3F38F0 000000067F000040020000E0000000668000-000000067F000040020000E000000066C000__000000931B9A2710 000000067F000040020000E000000066C000-000000067F000040020000E0000000670000__000000574B7FF240 000000067F000040020000E000000066C000-000000067F000040020000E0000000670000__00000073AD3FE6B8 000000067F000040020000E000000066C000-000000067F000040020000E0000000670000__000000914E3F38F0 000000067F000040020000E000000066C000-000000067F000040020000E0000000670000__000000931B9A2710 000000067F000040020000E000000066DEEE-000000067F000040020000E00000006768C5__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000670000-000000067F000040020000E0000000674000__000000574B7FF240 000000067F000040020000E0000000670000-000000067F000040020000E0000000674000__00000073AD3FE6B8 000000067F000040020000E0000000670000-000000067F000040020000E0000000674000__000000914E3F38F0 000000067F000040020000E0000000670000-000000067F000040020000E0000000674000__000000931B9A2710 000000067F000040020000E0000000674000-000000067F000040020000E0000000678000__000000574B7FF240 000000067F000040020000E0000000674000-000000067F000040020000E0000000678000__00000073AD3FE6B8 
000000067F000040020000E0000000674000-000000067F000040020000E0000000678000__000000914E3F38F0 000000067F000040020000E0000000674000-000000067F000040020000E0000000678000__000000931B9A2710 000000067F000040020000E00000006768C5-000000067F000040020000E000000067F286__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000678000-000000067F000040020000E000000067C000__000000574B7FF240 000000067F000040020000E0000000678000-000000067F000040020000E000000067C000__00000073AD3FE6B8 000000067F000040020000E0000000678000-000000067F000040020000E000000067C000__000000914E3F38F0 000000067F000040020000E0000000678000-000000067F000040020000E000000067C000__000000931B9A2710 000000067F000040020000E000000067C000-000000067F000040020000E0000000680000__000000574B7FF240 000000067F000040020000E000000067C000-000000067F000040020000E0000000680000__00000073AD3FE6B8 000000067F000040020000E000000067C000-000000067F000040020000E0000000680000__000000914E3F38F0 000000067F000040020000E000000067C000-000000067F000040020000E0000000680000__000000931B9A2710 000000067F000040020000E000000067F286-000000067F000040020000E0000000687C67__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000680000-000000067F000040020000E0000000684000__000000574B7FF240 000000067F000040020000E0000000680000-000000067F000040020000E0000000684000__00000073AD3FE6B8 000000067F000040020000E0000000680000-000000067F000040020000E0000000684000__000000914E3F38F0 000000067F000040020000E0000000680000-000000067F000040020000E0000000684000__000000931B9A2710 000000067F000040020000E0000000684000-000000067F000040020000E0000000688000__000000574B7FF240 000000067F000040020000E0000000684000-000000067F000040020000E0000000688000__00000073AD3FE6B8 000000067F000040020000E0000000684000-000000067F000040020000E0000000688000__000000914E3F38F0 000000067F000040020000E0000000684000-000000067F000040020000E0000000688000__000000931B9A2710 000000067F000040020000E0000000687C67-000000067F000040020000E0000000690653__0000003B6A0FFB09-00000047441DEA39 
000000067F000040020000E0000000688000-000000067F000040020000E000000068C000__000000574B7FF240 000000067F000040020000E0000000688000-000000067F000040020000E000000068C000__00000073AD3FE6B8 000000067F000040020000E0000000688000-000000067F000040020000E000000068C000__000000914E3F38F0 000000067F000040020000E0000000688000-000000067F000040020000E000000068C000__000000931B9A2710 000000067F000040020000E000000068C000-000000067F000040020000E0000000690000__000000574B7FF240 000000067F000040020000E000000068C000-000000067F000040020000E0000000690000__00000073AD3FE6B8 000000067F000040020000E000000068C000-000000067F000040020000E0000000690000__000000914E3F38F0 000000067F000040020000E000000068C000-000000067F000040020000E0000000690000__000000931B9A2710 000000067F000040020000E0000000690000-000000067F000040020000E0000000694000__000000574B7FF240 000000067F000040020000E0000000690000-000000067F000040020000E0000000694000__00000073AD3FE6B8 000000067F000040020000E0000000690000-000000067F000040020000E0000000694000__000000914E3F38F0 000000067F000040020000E0000000690000-000000067F000040020000E0000000694000__000000931B9A2710 000000067F000040020000E0000000690653-000000067F000040020000E0000000699034__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E0000000694000-000000067F000040020000E0000000698000__000000574B7FF240 000000067F000040020000E0000000694000-000000067F000040020000E0000000698000__00000073AD3FE6B8 000000067F000040020000E0000000694000-000000067F000040020000E0000000698000__000000914E3F38F0 000000067F000040020000E0000000694000-000000067F000040020000E0000000698000__000000931B9A2710 000000067F000040020000E0000000698000-000000067F000040020000E000000069C000__000000574B7FF240 000000067F000040020000E0000000698000-000000067F000040020000E000000069C000__00000073AD3FE6B8 000000067F000040020000E0000000698000-000000067F000040020000E000000069C000__000000914E3F38F0 000000067F000040020000E0000000698000-000000067F000040020000E000000069C000__000000931B9A2710 
000000067F000040020000E0000000699034-000000067F000040020000E00000006A1A0D__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E000000069C000-000000067F000040020000E00000006A0000__000000574B7FF240 000000067F000040020000E000000069C000-000000067F000040020000E00000006A0000__00000073AD3FE6B8 000000067F000040020000E000000069C000-000000067F000040020000E00000006A0000__000000914E3F38F0 000000067F000040020000E000000069C000-000000067F000040020000E00000006A0000__000000931B9A2710 000000067F000040020000E00000006A0000-000000067F000040020000E00000006A4000__000000574B7FF240 000000067F000040020000E00000006A0000-000000067F000040020000E00000006A4000__00000073AD3FE6B8 000000067F000040020000E00000006A0000-000000067F000040020000E00000006A4000__000000914E3F38F0 000000067F000040020000E00000006A0000-000000067F000040020000E00000006A4000__000000931B9A2710 000000067F000040020000E00000006A1A0D-000000067F000040020000E00000006AA3D8__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000006A4000-000000067F000040020000E00000006A8000__000000574B7FF240 000000067F000040020000E00000006A4000-000000067F000040020000E00000006A8000__00000073AD3FE6B8 000000067F000040020000E00000006A4000-000000067F000040020000E00000006A8000__000000914E3F38F0 000000067F000040020000E00000006A4000-000000067F000040020000E00000006A8000__000000931B9A2710 000000067F000040020000E00000006A8000-000000067F000040020000E00000006AC000__000000574B7FF240 000000067F000040020000E00000006A8000-000000067F000040020000E00000006AC000__00000073AD3FE6B8 000000067F000040020000E00000006A8000-000000067F000040020000E00000006AC000__000000914E3F38F0 000000067F000040020000E00000006A8000-000000067F000040020000E00000006AC000__000000931B9A2710 000000067F000040020000E00000006AA3D8-000000067F000040020000E00000006B2DB1__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000006AC000-000000067F000040020000E00000006B0000__000000574B7FF240 000000067F000040020000E00000006AC000-000000067F000040020000E00000006B0000__00000073AD3FE6B8 
000000067F000040020000E00000006AC000-000000067F000040020000E00000006B0000__000000914E3F38F0 000000067F000040020000E00000006AC000-000000067F000040020000E00000006B0000__000000931B9A2710 000000067F000040020000E00000006B0000-000000067F000040020000E00000006B4000__000000574B7FF240 000000067F000040020000E00000006B0000-000000067F000040020000E00000006B4000__00000073AD3FE6B8 000000067F000040020000E00000006B0000-000000067F000040020000E00000006B4000__000000914E3F38F0 000000067F000040020000E00000006B0000-000000067F000040020000E00000006B4000__000000931B9A2710 000000067F000040020000E00000006B2DB1-000000067F000040020000E00000006BB77C__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000006B4000-000000067F000040020000E00000006B8000__000000574B7FF240 000000067F000040020000E00000006B4000-000000067F000040020000E00000006B8000__00000073AD3FE6B8 000000067F000040020000E00000006B4000-000000067F000040020000E00000006B8000__000000914E3F38F0 000000067F000040020000E00000006B4000-000000067F000040020000E00000006B8000__000000931B9A2710 000000067F000040020000E00000006B8000-000000067F000040020000E00000006BC000__000000574B7FF240 000000067F000040020000E00000006B8000-000000067F000040020000E00000006BC000__00000073AD3FE6B8 000000067F000040020000E00000006B8000-000000067F000040020000E00000006BC000__000000914E3F38F0 000000067F000040020000E00000006B8000-000000067F000040020000E00000006BC000__000000931B9A2710 000000067F000040020000E00000006BB77C-000000067F000040020000E00000006C416F__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000006BC000-000000067F000040020000E00000006C0000__000000574B7FF240 000000067F000040020000E00000006BC000-000000067F000040020000E00000006C0000__00000073AD3FE6B8 000000067F000040020000E00000006BC000-000000067F000040020000E00000006C0000__000000914E3F38F0 000000067F000040020000E00000006BC000-000000067F000040020000E00000006C0000__000000931B9A2710 000000067F000040020000E00000006C0000-000000067F000040020000E00000006C4000__000000574B7FF240 
000000067F000040020000E00000006C0000-000000067F000040020000E00000006C4000__00000073AD3FE6B8 000000067F000040020000E00000006C0000-000000067F000040020000E00000006C4000__000000914E3F38F0 000000067F000040020000E00000006C0000-000000067F000040020000E00000006C4000__000000931B9A2710 000000067F000040020000E00000006C4000-000000067F000040020000E00000006C8000__000000574B7FF240 000000067F000040020000E00000006C4000-000000067F000040020000E00000006C8000__00000073AD3FE6B8 000000067F000040020000E00000006C4000-000000067F000040020000E00000006C8000__000000914E3F38F0 000000067F000040020000E00000006C4000-000000067F000040020000E00000006C8000__000000931B9A2710 000000067F000040020000E00000006C416F-000000067F000040020000E00000006C76FF__0000003B6A0FFB09-00000047441DEA39 000000067F000040020000E00000006C76FF-000000067F000040020000E00000006D00F3__00000047441DEA39-0000004803BDE029 000000067F000040020000E00000006C8000-000000067F000040020000E00000006CC000__000000574B7FF240 000000067F000040020000E00000006C8000-000000067F000040020000E00000006CC000__00000073AD3FE6B8 000000067F000040020000E00000006C8000-000000067F000040020000E00000006CC000__000000914E3F38F0 000000067F000040020000E00000006C8000-000000067F000040020000E00000006CC000__000000931B9A2710 000000067F000040020000E00000006CC000-000000067F000040020000E00000006D0000__000000574B7FF240 000000067F000040020000E00000006CC000-000000067F000040020000E00000006D0000__00000073AD3FE6B8 000000067F000040020000E00000006CC000-000000067F000040020000E00000006D0000__000000914E3F38F0 000000067F000040020000E00000006CC000-000000067F000040020000E00000006D0000__000000931B9A2710 000000067F000040020000E00000006D0000-000000067F000040020000E00000006D4000__000000574B7FF240 000000067F000040020000E00000006D0000-000000067F000040020000E00000006D4000__00000073AD3FE6B8 000000067F000040020000E00000006D0000-000000067F000040020000E00000006D4000__000000914E3F38F0 000000067F000040020000E00000006D0000-000000067F000040020000E00000006D4000__000000931B9A2710 
000000067F000040020000E00000006D00F3-000000067F000040020000E00000006D8AD9__00000047441DEA39-0000004803BDE029 000000067F000040020000E00000006D4000-000000067F000040020000E00000006D8000__000000574B7FF240 000000067F000040020000E00000006D4000-000000067F000040020000E00000006D8000__00000073AD3FE6B8 000000067F000040020000E00000006D4000-000000067F000040020000E00000006D8000__000000914E3F38F0 000000067F000040020000E00000006D4000-000000067F000040020000E00000006D8000__000000931B9A2710 000000067F000040020000E00000006D8000-000000067F000040020000E00000006DC000__000000574B7FF240 000000067F000040020000E00000006D8000-000000067F000040020000E00000006DC000__00000073AD3FE6B8 000000067F000040020000E00000006D8000-000000067F000040020000E00000006DC000__000000914E3F38F0 000000067F000040020000E00000006D8000-000000067F000040020000E00000006DC000__000000931B9A2710 000000067F000040020000E00000006D8AD9-000000067F000040020000E00000006E14B0__00000047441DEA39-0000004803BDE029 000000067F000040020000E00000006DC000-000000067F000040020000E00000006E0000__000000574B7FF240 000000067F000040020000E00000006DC000-000000067F000040020000E00000006E0000__00000073AD3FE6B8 000000067F000040020000E00000006DC000-000000067F000040020000E00000006E0000__000000914E3F38F0 000000067F000040020000E00000006DC000-000000067F000040020000E00000006E0000__000000931B9A2710 000000067F000040020000E00000006E0000-000000067F000040020000E00000006E4000__000000574B7FF240 000000067F000040020000E00000006E0000-000000067F000040020000E00000006E4000__00000073AD3FE6B8 000000067F000040020000E00000006E0000-000000067F000040020000E00000006E4000__000000914E3F38F0 000000067F000040020000E00000006E0000-000000067F000040020000E00000006E4000__000000931B9A2710 000000067F000040020000E00000006E14B0-000000067F000040020000E00000006E9E91__00000047441DEA39-0000004803BDE029 000000067F000040020000E00000006E4000-000000067F000040020000E00000006E8000__000000574B7FF240 000000067F000040020000E00000006E4000-000000067F000040020000E00000006E8000__00000073AD3FE6B8 
000000067F000040020000E00000006E4000-000000067F000040020000E00000006E8000__000000914E3F38F0 000000067F000040020000E00000006E4000-000000067F000040020000E00000006E8000__000000931B9A2710 000000067F000040020000E00000006E8000-000000067F000040020000E00000006EC000__000000574B7FF240 000000067F000040020000E00000006E8000-000000067F000040020000E00000006EC000__00000073AD3FE6B8 000000067F000040020000E00000006E8000-000000067F000040020000E00000006EC000__000000914E3F38F0 000000067F000040020000E00000006E8000-000000067F000040020000E00000006EC000__000000931B9A2710 000000067F000040020000E00000006E9E91-000000067F000040020000E00000006F2877__00000047441DEA39-0000004803BDE029 000000067F000040020000E00000006EC000-000000067F000040020000E00000006F0000__000000574B7FF240 000000067F000040020000E00000006EC000-000000067F000040020000E00000006F0000__00000073AD3FE6B8 000000067F000040020000E00000006EC000-000000067F000040020000E00000006F0000__000000914E3F38F0 000000067F000040020000E00000006EC000-000000067F000040020000E00000006F0000__000000931B9A2710 000000067F000040020000E00000006F0000-000000067F000040020000E00000006F4000__000000574B7FF240 000000067F000040020000E00000006F0000-000000067F000040020000E00000006F4000__00000073AD3FE6B8 000000067F000040020000E00000006F0000-000000067F000040020000E00000006F4000__000000914E3F38F0 000000067F000040020000E00000006F0000-000000067F000040020000E00000006F4000__000000931B9A2710 000000067F000040020000E00000006F2877-000000067F000040020000E00000006FB252__00000047441DEA39-0000004803BDE029 000000067F000040020000E00000006F4000-000000067F000040020000E00000006F8000__000000574B7FF240 000000067F000040020000E00000006F4000-000000067F000040020000E00000006F8000__00000073AD3FE6B8 000000067F000040020000E00000006F4000-000000067F000040020000E00000006F8000__000000914E3F38F0 000000067F000040020000E00000006F4000-000000067F000040020000E00000006F8000__000000931B9A2710 000000067F000040020000E00000006F8000-000000067F000040020000E00000006FC000__000000574B7FF240 
000000067F000040020000E00000006F8000-000000067F000040020000E00000006FC000__00000073AD3FE6B8 000000067F000040020000E00000006F8000-000000067F000040020000E00000006FC000__000000914E3F38F0 000000067F000040020000E00000006F8000-000000067F000040020000E00000006FC000__000000931B9A2710 000000067F000040020000E00000006FB252-000000067F000040020000E0000000703C35__00000047441DEA39-0000004803BDE029 000000067F000040020000E00000006FC000-000000067F000040020000E0000000700000__000000574B7FF240 000000067F000040020000E00000006FC000-000000067F000040020000E0000000700000__00000073AD3FE6B8 000000067F000040020000E00000006FC000-000000067F000040020000E0000000700000__000000914E3F38F0 000000067F000040020000E00000006FC000-000000067F000040020000E0000000700000__000000931B9A2710 000000067F000040020000E0000000700000-000000067F000040020000E0000000704000__000000574B7FF240 000000067F000040020000E0000000700000-000000067F000040020000E0000000704000__00000073AD3FE6B8 000000067F000040020000E0000000700000-000000067F000040020000E0000000704000__000000914E3F38F0 000000067F000040020000E0000000700000-000000067F000040020000E0000000704000__000000931B9A2710 000000067F000040020000E0000000703C35-000000067F000040020000E000000070C617__00000047441DEA39-0000004803BDE029 000000067F000040020000E0000000704000-000000067F000040020000E0000000708000__000000574B7FF240 000000067F000040020000E0000000704000-000000067F000040020000E0000000708000__00000073AD3FE6B8 000000067F000040020000E0000000704000-000000067F000040020000E0000000708000__000000914E3F38F0 000000067F000040020000E0000000704000-000000067F000040020000E0000000708000__000000931B9A2710 000000067F000040020000E0000000708000-000000067F000040020000E000000070C000__000000574B7FF240 000000067F000040020000E0000000708000-000000067F000040020000E000000070C000__00000073AD3FE6B8 000000067F000040020000E0000000708000-000000067F000040020000E000000070C000__000000914E3F38F0 000000067F000040020000E0000000708000-000000067F000040020000E000000070C000__000000931B9A2710 
000000067F000040020000E000000070C000-000000067F000040020000E0000000710000__000000574B7FF240 000000067F000040020000E000000070C000-000000067F000040020000E0000000710000__00000073AD3FE6B8 000000067F000040020000E000000070C000-000000067F000040020000E0000000710000__000000914E3F38F0 000000067F000040020000E000000070C000-000000067F000040020000E0000000710000__000000931B9A2710 000000067F000040020000E000000070C617-000000067F000040020000E0000000714FEF__00000047441DEA39-0000004803BDE029 000000067F000040020000E0000000710000-000000067F000040020000E0000000714000__000000574B7FF240 000000067F000040020000E0000000710000-000000067F000040020000E0000000714000__00000073AD3FE6B8 000000067F000040020000E0000000710000-000000067F000040020000E0000000714000__000000914E3F38F0 000000067F000040020000E0000000710000-000000067F000040020000E0000000714000__000000931B9A2710 000000067F000040020000E0000000714000-000000067F000040020000E0000000718000__000000574B7FF240 000000067F000040020000E0000000714000-000000067F000040020000E0000000718000__00000073AD3FE6B8 000000067F000040020000E0000000714000-000000067F000040020000E0000000718000__000000914E3F38F0 000000067F000040020000E0000000714000-000000067F000040020000E0000000718000__000000931B9A2710 000000067F000040020000E0000000714FEF-000000067F000040020000E000000071D9D3__00000047441DEA39-0000004803BDE029 000000067F000040020000E0000000718000-000000067F000040020000E000000071C000__000000574B7FF240 000000067F000040020000E0000000718000-000000067F000040020000E000000071C000__00000073AD3FE6B8 000000067F000040020000E0000000718000-000000067F000040020000E000000071C000__000000914E3F38F0 000000067F000040020000E0000000718000-000000067F000040020000E000000071C000__000000931B9A2710 000000067F000040020000E000000071C000-000000067F000040020000E0000000720000__000000574B7FF240 000000067F000040020000E000000071C000-000000067F000040020000E0000000720000__00000073AD3FE6B8 000000067F000040020000E000000071C000-000000067F000040020000E0000000720000__000000914E3F38F0 
000000067F000040020000E000000071C000-000000067F000040020000E0000000720000__000000931B9A2710 000000067F000040020000E000000071D9D3-000000067F000040020000E00000007263A7__00000047441DEA39-0000004803BDE029 000000067F000040020000E0000000720000-000000067F000040020000E0000000724000__000000574B7FF240 000000067F000040020000E0000000720000-000000067F000040020000E0000000724000__00000073AD3FE6B8 000000067F000040020000E0000000720000-000000067F000040020000E0000000724000__000000914E3F38F0 000000067F000040020000E0000000720000-000000067F000040020000E0000000724000__000000931B9A2710 000000067F000040020000E0000000724000-000000067F000040020000E0000000728000__000000574B7FF240 000000067F000040020000E0000000724000-000000067F000040020000E0000000728000__00000073AD3FE6B8 000000067F000040020000E0000000724000-000000067F000040020000E0000000728000__000000914E3F38F0 000000067F000040020000E0000000724000-000000067F000040020000E0000000728000__000000931B9A2710 000000067F000040020000E00000007263A7-000000067F000040020000E000000072ED72__00000047441DEA39-0000004803BDE029 000000067F000040020000E0000000728000-000000067F000040020000E000000072C000__000000574B7FF240 000000067F000040020000E0000000728000-000000067F000040020000E000000072C000__00000073AD3FE6B8 000000067F000040020000E0000000728000-000000067F000040020000E000000072C000__000000914E3F38F0 000000067F000040020000E0000000728000-000000067F000040020000E000000072C000__000000931B9A2710 000000067F000040020000E000000072C000-000000067F000040020000E0000000730000__000000572A7A05D8 000000067F000040020000E000000072C000-000000067F000040020000E0000000730000__0000005D2FFFFB38 000000067F000040020000E000000072C000-000000067F000040020000E0000000730000__00000073AD3FE6B8 000000067F000040020000E000000072C000-000000067F000040020000E0000000730000__000000914E3F38F0 000000067F000040020000E000000072C000-000000067F000040020000E0000000730000__000000931B9A2710 000000067F000040020000E000000072ED72-000000067F000040020000E0000100000000__00000047441DEA39-0000004803BDE029 
000000067F000040020000E000000072F0A9-000000067F000040020000E0000000737A87__0000004803BDE029-00000048B365CD91 000000067F000040020000E0000000730000-000000067F000040020000E0000000734000__000000572A7A05D8 000000067F000040020000E0000000730000-000000067F000040020000E0000000734000__0000005D2FFFFB38 000000067F000040020000E0000000730000-000000067F000040020000E0000000734000__00000073AD3FE6B8 000000067F000040020000E0000000730000-000000067F000040020000E0000000734000__000000914E3F38F0 000000067F000040020000E0000000730000-000000067F000040020000E0000000734000__000000931B9A2710 000000067F000040020000E0000000734000-000000067F000040020000E0000000738000__000000572A7A05D8 000000067F000040020000E0000000734000-000000067F000040020000E0000000738000__0000005D2FFFFB38 000000067F000040020000E0000000734000-000000067F000040020000E0000000738000__00000073AD3FE6B8 000000067F000040020000E0000000734000-000000067F000040020000E0000000738000__000000914E3F38F0 000000067F000040020000E0000000734000-000000067F000040020000E0000000738000__000000931B9A2710 000000067F000040020000E0000000737A87-000000067F000040020000E000000074046F__0000004803BDE029-00000048B365CD91 000000067F000040020000E0000000738000-000000067F000040020000E000000073C000__000000572A7A05D8 000000067F000040020000E0000000738000-000000067F000040020000E000000073C000__0000005D2FFFFB38 000000067F000040020000E0000000738000-000000067F000040020000E000000073C000__00000073AD3FE6B8 000000067F000040020000E0000000738000-000000067F000040020000E000000073C000__000000914E3F38F0 000000067F000040020000E0000000738000-000000067F000040020000E000000073C000__000000931B9A2710 000000067F000040020000E000000073C000-000000067F000040020000E0000000740000__000000572A7A05D8 000000067F000040020000E000000073C000-000000067F000040020000E0000000740000__0000005D2FFFFB38 000000067F000040020000E000000073C000-000000067F000040020000E0000000740000__00000073AD3FE6B8 000000067F000040020000E000000073C000-000000067F000040020000E0000000740000__000000914E3F38F0 
000000067F000040020000E000000073C000-000000067F000040020000E0000000740000__000000931B9A2710 000000067F000040020000E0000000740000-000000067F000040020000E0000000744000__000000572A7A05D8 000000067F000040020000E0000000740000-000000067F000040020000E0000000744000__0000005D2FFFFB38 000000067F000040020000E0000000740000-000000067F000040020000E0000000744000__00000073AD3FE6B8 000000067F000040020000E0000000740000-000000067F000040020000E0000000744000__000000914E3F38F0 000000067F000040020000E0000000740000-000000067F000040020000E0000000744000__000000931B9A2710 000000067F000040020000E000000074046F-000000067F000040020000E0000000748E4A__0000004803BDE029-00000048B365CD91 000000067F000040020000E0000000744000-000000067F000040020000E0000000748000__000000574B7FF240 000000067F000040020000E0000000744000-000000067F000040020000E0000000748000__00000073AD3FE6B8 000000067F000040020000E0000000744000-000000067F000040020000E0000000748000__000000914E3F38F0 000000067F000040020000E0000000744000-000000067F000040020000E0000000748000__000000931B9A2710 000000067F000040020000E0000000744000-030000000000000000000000000000000002__000000482DBFED58 000000067F000040020000E0000000748000-000000067F000040020000E000000074C000__000000574B7FF240 000000067F000040020000E0000000748000-000000067F000040020000E000000074C000__00000073AD3FE6B8 000000067F000040020000E0000000748000-000000067F000040020000E000000074C000__000000914E3F38F0 000000067F000040020000E0000000748000-000000067F000040020000E000000074C000__000000931B9A2710 000000067F000040020000E0000000748E4A-000000067F000040020000E0000000751827__0000004803BDE029-00000048B365CD91 000000067F000040020000E000000074C000-000000067F000040020000E0000000750000__000000574B7FF240 000000067F000040020000E000000074C000-000000067F000040020000E0000000750000__00000073AD3FE6B8 000000067F000040020000E000000074C000-000000067F000040020000E0000000750000__000000914E3F38F0 000000067F000040020000E000000074C000-000000067F000040020000E0000000750000__000000931B9A2710 
000000067F000040020000E0000000750000-000000067F000040020000E0000000754000__000000574B7FF240 000000067F000040020000E0000000750000-000000067F000040020000E0000000754000__00000073AD3FE6B8 000000067F000040020000E0000000750000-000000067F000040020000E0000000754000__000000914E3F38F0 000000067F000040020000E0000000750000-000000067F000040020000E0000000754000__000000931B9A2710 000000067F000040020000E0000000751827-000000067F000040020000E000000075A1F6__0000004803BDE029-00000048B365CD91 000000067F000040020000E0000000754000-000000067F000040020000E0000000758000__000000574B7FF240 000000067F000040020000E0000000754000-000000067F000040020000E0000000758000__00000073AD3FE6B8 000000067F000040020000E0000000754000-000000067F000040020000E0000000758000__000000914E3F38F0 000000067F000040020000E0000000754000-000000067F000040020000E0000000758000__000000931B9A2710 000000067F000040020000E0000000758000-000000067F000040020000E000000075C000__000000574B7FF240 000000067F000040020000E0000000758000-000000067F000040020000E000000075C000__00000073AD3FE6B8 000000067F000040020000E0000000758000-000000067F000040020000E000000075C000__000000914E3F38F0 000000067F000040020000E0000000758000-000000067F000040020000E000000075C000__000000931B9A2710 000000067F000040020000E000000075A1F6-000000067F000040020000E0000000762BD3__0000004803BDE029-00000048B365CD91 000000067F000040020000E000000075C000-000000067F000040020000E0000000760000__000000574B7FF240 000000067F000040020000E000000075C000-000000067F000040020000E0000000760000__00000073AD3FE6B8 000000067F000040020000E000000075C000-000000067F000040020000E0000000760000__000000914E3F38F0 000000067F000040020000E000000075C000-000000067F000040020000E0000000760000__000000931B9A2710 000000067F000040020000E0000000760000-000000067F000040020000E0000000764000__000000574B7FF240 000000067F000040020000E0000000760000-000000067F000040020000E0000000764000__00000073AD3FE6B8 000000067F000040020000E0000000760000-000000067F000040020000E0000000764000__000000914E3F38F0 
000000067F000040020000E0000000760000-000000067F000040020000E0000000764000__000000931B9A2710 000000067F000040020000E0000000762BD3-000000067F000040020000E000000076B5AA__0000004803BDE029-00000048B365CD91 000000067F000040020000E0000000764000-000000067F000040020000E0000000768000__000000574B7FF240 000000067F000040020000E0000000764000-000000067F000040020000E0000000768000__00000073AD3FE6B8 000000067F000040020000E0000000764000-000000067F000040020000E0000000768000__000000914E3F38F0 000000067F000040020000E0000000764000-000000067F000040020000E0000000768000__000000931B9A2710 000000067F000040020000E0000000768000-000000067F000040020000E000000076C000__000000574B7FF240 000000067F000040020000E0000000768000-000000067F000040020000E000000076C000__00000073AD3FE6B8 000000067F000040020000E0000000768000-000000067F000040020000E000000076C000__000000914E3F38F0 000000067F000040020000E0000000768000-000000067F000040020000E000000076C000__000000931B9A2710 000000067F000040020000E000000076B5AA-000000067F000040020000E0000000773F85__0000004803BDE029-00000048B365CD91 000000067F000040020000E000000076C000-000000067F000040020000E0000000770000__000000574B7FF240 000000067F000040020000E000000076C000-000000067F000040020000E0000000770000__00000073AD3FE6B8 000000067F000040020000E000000076C000-000000067F000040020000E0000000770000__000000914E3F38F0 000000067F000040020000E000000076C000-000000067F000040020000E0000000770000__000000931B9A2710 000000067F000040020000E0000000770000-000000067F000040020000E0000000774000__000000574B7FF240 000000067F000040020000E0000000770000-000000067F000040020000E0000000774000__00000073AD3FE6B8 000000067F000040020000E0000000770000-000000067F000040020000E0000000774000__000000914E3F38F0 000000067F000040020000E0000000770000-000000067F000040020000E0000000774000__000000931B9A2710 000000067F000040020000E0000000773F85-000000067F000040020000E000000077C960__0000004803BDE029-00000048B365CD91 000000067F000040020000E0000000774000-000000067F000040020000E0000000778000__000000574B7FF240 
000000067F000040020000E0000000774000-000000067F000040020000E0000000778000__00000073AD3FE6B8 000000067F000040020000E0000000774000-000000067F000040020000E0000000778000__000000914E3F38F0 000000067F000040020000E0000000774000-000000067F000040020000E0000000778000__000000931B9A2710 000000067F000040020000E0000000778000-000000067F000040020000E000000077C000__000000574B7FF240 000000067F000040020000E0000000778000-000000067F000040020000E000000077C000__00000073AD3FE6B8 000000067F000040020000E0000000778000-000000067F000040020000E000000077C000__000000914E3F38F0 000000067F000040020000E0000000778000-000000067F000040020000E000000077C000__000000931B9A2710 000000067F000040020000E000000077C000-000000067F000040020000E0000000780000__000000574B7FF240 000000067F000040020000E000000077C000-000000067F000040020000E0000000780000__00000073AD3FE6B8 000000067F000040020000E000000077C000-000000067F000040020000E0000000780000__000000914E3F38F0 000000067F000040020000E000000077C000-000000067F000040020000E0000000780000__000000931B9A2710 000000067F000040020000E000000077C960-000000067F000040020000E0000000785337__0000004803BDE029-00000048B365CD91 000000067F000040020000E0000000780000-000000067F000040020000E0000000784000__000000574B7FF240 000000067F000040020000E0000000780000-000000067F000040020000E0000000784000__00000073AD3FE6B8 000000067F000040020000E0000000780000-000000067F000040020000E0000000784000__000000914E3F38F0 000000067F000040020000E0000000780000-000000067F000040020000E0000000784000__000000931B9A2710 000000067F000040020000E0000000784000-000000067F000040020000E0000000788000__000000574B7FF240 000000067F000040020000E0000000784000-000000067F000040020000E0000000788000__00000073AD3FE6B8 000000067F000040020000E0000000784000-000000067F000040020000E0000000788000__000000914E3F38F0 000000067F000040020000E0000000784000-000000067F000040020000E0000000788000__000000931B9A2710 000000067F000040020000E0000000785337-000000067F000040020000E000000078DD09__0000004803BDE029-00000048B365CD91 
000000067F000040020000E0000000788000-000000067F000040020000E000000078C000__000000574B7FF240 000000067F000040020000E0000000788000-000000067F000040020000E000000078C000__00000073AD3FE6B8 000000067F000040020000E0000000788000-000000067F000040020000E000000078C000__000000914E3F38F0 000000067F000040020000E0000000788000-000000067F000040020000E000000078C000__000000931B9A2710 000000067F000040020000E000000078C000-000000067F000040020000E0000000790000__000000572A7A05D8 000000067F000040020000E000000078C000-000000067F000040020000E0000000790000__0000005D2FFFFB38 000000067F000040020000E000000078C000-000000067F000040020000E0000000790000__00000073AD3FE6B8 000000067F000040020000E000000078C000-000000067F000040020000E0000000790000__000000914E3F38F0 000000067F000040020000E000000078C000-000000067F000040020000E0000000790000__000000931B9A2710 000000067F000040020000E000000078DD09-000000067F000040020000E0000100000000__0000004803BDE029-00000048B365CD91 000000067F000040020000E000000078E02B-000000067F000040020000E0000000796A04__00000048B365CD91-000000495313EB21 000000067F000040020000E0000000790000-000000067F000040020000E0000000794000__000000572A7A05D8 000000067F000040020000E0000000790000-000000067F000040020000E0000000794000__0000005D2FFFFB38 000000067F000040020000E0000000790000-000000067F000040020000E0000000794000__00000073AD3FE6B8 000000067F000040020000E0000000790000-000000067F000040020000E0000000794000__000000914E3F38F0 000000067F000040020000E0000000790000-000000067F000040020000E0000000794000__000000931B9A2710 000000067F000040020000E0000000794000-000000067F000040020000E0000000798000__000000572A7A05D8 000000067F000040020000E0000000794000-000000067F000040020000E0000000798000__0000005D2FFFFB38 000000067F000040020000E0000000794000-000000067F000040020000E0000000798000__00000073AD3FE6B8 000000067F000040020000E0000000794000-000000067F000040020000E0000000798000__000000914E3F38F0 000000067F000040020000E0000000794000-000000067F000040020000E0000000798000__000000931B9A2710 
000000067F000040020000E0000000796A04-000000067F000040020000E000000079F3DB__00000048B365CD91-000000495313EB21 000000067F000040020000E0000000798000-000000067F000040020000E000000079C000__000000572A7A05D8 000000067F000040020000E0000000798000-000000067F000040020000E000000079C000__0000005D2FFFFB38 000000067F000040020000E0000000798000-000000067F000040020000E000000079C000__00000073AD3FE6B8 000000067F000040020000E0000000798000-000000067F000040020000E000000079C000__000000914E3F38F0 000000067F000040020000E0000000798000-000000067F000040020000E000000079C000__000000931B9A2710 000000067F000040020000E000000079C000-000000067F000040020000E00000007A0000__000000572A7A05D8 000000067F000040020000E000000079C000-000000067F000040020000E00000007A0000__0000005D2FFFFB38 000000067F000040020000E000000079C000-000000067F000040020000E00000007A0000__00000073AD3FE6B8 000000067F000040020000E000000079C000-000000067F000040020000E00000007A0000__000000914E3F38F0 000000067F000040020000E000000079C000-000000067F000040020000E00000007A0000__000000931B9A2710 000000067F000040020000E000000079F3DB-000000067F000040020000E00000007A7DC0__00000048B365CD91-000000495313EB21 000000067F000040020000E00000007A0000-000000067F000040020000E00000007A4000__000000572A7A05D8 000000067F000040020000E00000007A0000-000000067F000040020000E00000007A4000__0000005D2FFFFB38 000000067F000040020000E00000007A0000-000000067F000040020000E00000007A4000__00000073AD3FE6B8 000000067F000040020000E00000007A0000-000000067F000040020000E00000007A4000__000000914E3F38F0 000000067F000040020000E00000007A0000-000000067F000040020000E00000007A4000__000000931B9A2710 000000067F000040020000E00000007A4000-000000067F000040020000E00000007A8000__000000572A7A05D8 000000067F000040020000E00000007A4000-000000067F000040020000E00000007A8000__0000005D2FFFFB38 000000067F000040020000E00000007A4000-000000067F000040020000E00000007A8000__00000073AD3FE6B8 000000067F000040020000E00000007A4000-000000067F000040020000E00000007A8000__000000914E3F38F0 
000000067F000040020000E00000007A4000-000000067F000040020000E00000007A8000__000000931B9A2710 000000067F000040020000E00000007A7DC0-000000067F000040020000E00000007B079C__00000048B365CD91-000000495313EB21 000000067F000040020000E00000007A8000-000000067F000040020000E00000007AC000__000000572A7A05D8 000000067F000040020000E00000007A8000-000000067F000040020000E00000007AC000__0000005D2FFFFB38 000000067F000040020000E00000007A8000-000000067F000040020000E00000007AC000__00000073AD3FE6B8 000000067F000040020000E00000007A8000-000000067F000040020000E00000007AC000__000000914E3F38F0 000000067F000040020000E00000007A8000-000000067F000040020000E00000007AC000__000000931B9A2710 000000067F000040020000E00000007AC000-000000067F000040020000E00000007B0000__000000572A7A05D8 000000067F000040020000E00000007AC000-000000067F000040020000E00000007B0000__0000005D2FFFFB38 000000067F000040020000E00000007AC000-000000067F000040020000E00000007B0000__00000073AD3FE6B8 000000067F000040020000E00000007AC000-000000067F000040020000E00000007B0000__000000914E3F38F0 000000067F000040020000E00000007AC000-000000067F000040020000E00000007B0000__000000931B9A2710 000000067F000040020000E00000007B0000-000000067F000040020000E00000007B4000__000000572A7A05D8 000000067F000040020000E00000007B0000-000000067F000040020000E00000007B4000__0000005D2FFFFB38 000000067F000040020000E00000007B0000-000000067F000040020000E00000007B4000__00000073AD3FE6B8 000000067F000040020000E00000007B0000-000000067F000040020000E00000007B4000__000000914E3F38F0 000000067F000040020000E00000007B0000-000000067F000040020000E00000007B4000__000000931B9A2710 000000067F000040020000E00000007B079C-000000067F000040020000E00000007B9183__00000048B365CD91-000000495313EB21 000000067F000040020000E00000007B4000-000000067F000040020000E00000007B8000__000000572A7A05D8 000000067F000040020000E00000007B4000-000000067F000040020000E00000007B8000__0000005D2FFFFB38 000000067F000040020000E00000007B4000-000000067F000040020000E00000007B8000__00000073AD3FE6B8 
000000067F000040020000E00000007B4000-000000067F000040020000E00000007B8000__000000914E3F38F0 000000067F000040020000E00000007B4000-000000067F000040020000E00000007B8000__000000931B9A2710 000000067F000040020000E00000007B8000-000000067F000040020000E00000007BC000__000000572A7A05D8 000000067F000040020000E00000007B8000-000000067F000040020000E00000007BC000__0000005D2FFFFB38 000000067F000040020000E00000007B8000-000000067F000040020000E00000007BC000__00000073AD3FE6B8 000000067F000040020000E00000007B8000-000000067F000040020000E00000007BC000__000000914E3F38F0 000000067F000040020000E00000007B8000-000000067F000040020000E00000007BC000__000000931B9A2710 000000067F000040020000E00000007B9183-000000067F000040020000E00000007C1B60__00000048B365CD91-000000495313EB21 000000067F000040020000E00000007BC000-000000067F000040020000E00000007C0000__000000572A7A05D8 000000067F000040020000E00000007BC000-000000067F000040020000E00000007C0000__0000005D2FFFFB38 000000067F000040020000E00000007BC000-000000067F000040020000E00000007C0000__00000073AD3FE6B8 000000067F000040020000E00000007BC000-000000067F000040020000E00000007C0000__000000914E3F38F0 000000067F000040020000E00000007BC000-000000067F000040020000E00000007C0000__000000931B9A2710 000000067F000040020000E00000007C0000-000000067F000040020000E00000007C4000__000000572A7A05D8 000000067F000040020000E00000007C0000-000000067F000040020000E00000007C4000__0000005D2FFFFB38 000000067F000040020000E00000007C0000-000000067F000040020000E00000007C4000__00000073AD3FE6B8 000000067F000040020000E00000007C0000-000000067F000040020000E00000007C4000__000000914E3F38F0 000000067F000040020000E00000007C0000-000000067F000040020000E00000007C4000__000000931B9A2710 000000067F000040020000E00000007C1B60-000000067F000040020000E00000007CA53A__00000048B365CD91-000000495313EB21 000000067F000040020000E00000007C4000-000000067F000040020000E00000007C8000__000000572A7A05D8 000000067F000040020000E00000007C4000-000000067F000040020000E00000007C8000__0000005D2FFFFB38 
000000067F000040020000E00000007C4000-000000067F000040020000E00000007C8000__00000073AD3FE6B8 000000067F000040020000E00000007C4000-000000067F000040020000E00000007C8000__000000914E3F38F0 000000067F000040020000E00000007C4000-000000067F000040020000E00000007C8000__000000931B9A2710 000000067F000040020000E00000007C8000-000000067F000040020000E00000007CC000__000000572A7A05D8 000000067F000040020000E00000007C8000-000000067F000040020000E00000007CC000__0000005D2FFFFB38 000000067F000040020000E00000007C8000-000000067F000040020000E00000007CC000__00000073AD3FE6B8 000000067F000040020000E00000007C8000-000000067F000040020000E00000007CC000__000000914E3F38F0 000000067F000040020000E00000007C8000-000000067F000040020000E00000007CC000__000000931B9A2710 000000067F000040020000E00000007CA53A-000000067F000040020000E00000007D2F02__00000048B365CD91-000000495313EB21 000000067F000040020000E00000007CC000-000000067F000040020000E00000007D0000__000000572A7A05D8 000000067F000040020000E00000007CC000-000000067F000040020000E00000007D0000__0000005D2FFFFB38 000000067F000040020000E00000007CC000-000000067F000040020000E00000007D0000__00000073AD3FE6B8 000000067F000040020000E00000007CC000-000000067F000040020000E00000007D0000__000000914E3F38F0 000000067F000040020000E00000007CC000-000000067F000040020000E00000007D0000__000000931B9A2710 000000067F000040020000E00000007D0000-000000067F000040020000E00000007D4000__000000572A7A05D8 000000067F000040020000E00000007D0000-000000067F000040020000E00000007D4000__0000005D2FFFFB38 000000067F000040020000E00000007D0000-000000067F000040020000E00000007D4000__00000073AD3FE6B8 000000067F000040020000E00000007D0000-000000067F000040020000E00000007D4000__000000914E3F38F0 000000067F000040020000E00000007D0000-000000067F000040020000E00000007D4000__000000931B9A2710 000000067F000040020000E00000007D2F02-000000067F000040020000E00000007DB8D5__00000048B365CD91-000000495313EB21 000000067F000040020000E00000007D4000-000000067F000040020000E00000007D8000__000000572A7A05D8 
000000067F000040020000E00000007D4000-000000067F000040020000E00000007D8000__0000005D2FFFFB38 000000067F000040020000E00000007D4000-000000067F000040020000E00000007D8000__00000073AD3FE6B8 000000067F000040020000E00000007D4000-000000067F000040020000E00000007D8000__000000914E3F38F0 000000067F000040020000E00000007D4000-000000067F000040020000E00000007D8000__000000931B9A2710 000000067F000040020000E00000007D8000-000000067F000040020000E00000007DC000__000000572A7A05D8 000000067F000040020000E00000007D8000-000000067F000040020000E00000007DC000__0000005D2FFFFB38 000000067F000040020000E00000007D8000-000000067F000040020000E00000007DC000__00000073AD3FE6B8 000000067F000040020000E00000007D8000-000000067F000040020000E00000007DC000__000000914E3F38F0 000000067F000040020000E00000007D8000-000000067F000040020000E00000007DC000__000000931B9A2710 000000067F000040020000E00000007DB8D5-000000067F000040020000E00000007E42BB__00000048B365CD91-000000495313EB21 000000067F000040020000E00000007DC000-000000067F000040020000E00000007E0000__000000572A7A05D8 000000067F000040020000E00000007DC000-000000067F000040020000E00000007E0000__0000005D2FFFFB38 000000067F000040020000E00000007DC000-000000067F000040020000E00000007E0000__00000073AD3FE6B8 000000067F000040020000E00000007DC000-000000067F000040020000E00000007E0000__000000914E3F38F0 000000067F000040020000E00000007DC000-000000067F000040020000E00000007E0000__000000931B9A2710 000000067F000040020000E00000007E0000-000000067F000040020000E00000007E4000__000000572A7A05D8 000000067F000040020000E00000007E0000-000000067F000040020000E00000007E4000__0000005D2FFFFB38 000000067F000040020000E00000007E0000-000000067F000040020000E00000007E4000__00000073AD3FE6B8 000000067F000040020000E00000007E0000-000000067F000040020000E00000007E4000__000000914E3F38F0 000000067F000040020000E00000007E0000-000000067F000040020000E00000007E4000__000000931B9A2710 000000067F000040020000E00000007E4000-000000067F000040020000E00000007E8000__0000004A297FFC38 
000000067F000040020000E00000007E4000-000000067F000040020000E00000007E8000__0000005D2FFFFB38 000000067F000040020000E00000007E4000-000000067F000040020000E00000007E8000__00000073AD3FE6B8 000000067F000040020000E00000007E4000-000000067F000040020000E00000007E8000__000000914E3F38F0 000000067F000040020000E00000007E4000-000000067F000040020000E00000007E8000__000000931B9A2710 000000067F000040020000E00000007E42BB-000000067F000040020000E0000100000000__00000048B365CD91-000000495313EB21 000000067F000040020000E00000007E458D-000000067F000040020000E00000007ECF68__000000495313EB21-0000004A02BBD6B1 000000067F000040020000E00000007E8000-000000067F000040020000E00000007EC000__0000004A297FFC38 000000067F000040020000E00000007E8000-000000067F000040020000E00000007EC000__0000005D2FFFFB38 000000067F000040020000E00000007E8000-000000067F000040020000E00000007EC000__00000073AD3FE6B8 000000067F000040020000E00000007E8000-000000067F000040020000E00000007EC000__000000914E3F38F0 000000067F000040020000E00000007E8000-000000067F000040020000E00000007EC000__000000931B9A2710 000000067F000040020000E00000007EC000-000000067F000040020000E00000007F0000__0000004A297FFC38 000000067F000040020000E00000007EC000-000000067F000040020000E00000007F0000__0000005D2FFFFB38 000000067F000040020000E00000007EC000-000000067F000040020000E00000007F0000__00000073AD3FE6B8 000000067F000040020000E00000007EC000-000000067F000040020000E00000007F0000__000000914E3F38F0 000000067F000040020000E00000007EC000-000000067F000040020000E00000007F0000__000000931B9A2710 000000067F000040020000E00000007ECF68-000000067F000040020000E00000007F594B__000000495313EB21-0000004A02BBD6B1 000000067F000040020000E00000007F0000-000000067F000040020000E00000007F4000__0000004A297FFC38 000000067F000040020000E00000007F0000-000000067F000040020000E00000007F4000__0000005D2FFFFB38 000000067F000040020000E00000007F0000-000000067F000040020000E00000007F4000__00000073AD3FE6B8 000000067F000040020000E00000007F0000-000000067F000040020000E00000007F4000__000000914E3F38F0 
000000067F000040020000E00000007F0000-000000067F000040020000E00000007F4000__000000931B9A2710 000000067F000040020000E00000007F4000-000000067F000040020000E00000007F8000__0000004A297FFC38 000000067F000040020000E00000007F4000-000000067F000040020000E00000007F8000__0000005D2FFFFB38 000000067F000040020000E00000007F4000-000000067F000040020000E00000007F8000__00000073AD3FE6B8 000000067F000040020000E00000007F4000-000000067F000040020000E00000007F8000__000000914E3F38F0 000000067F000040020000E00000007F4000-000000067F000040020000E00000007F8000__000000931B9A2710 000000067F000040020000E00000007F594B-000000067F000040020000E00000007FE326__000000495313EB21-0000004A02BBD6B1 000000067F000040020000E00000007F8000-000000067F000040020000E00000007FC000__0000004A297FFC38 000000067F000040020000E00000007F8000-000000067F000040020000E00000007FC000__0000005D2FFFFB38 000000067F000040020000E00000007F8000-000000067F000040020000E00000007FC000__00000073AD3FE6B8 000000067F000040020000E00000007F8000-000000067F000040020000E00000007FC000__000000914E3F38F0 000000067F000040020000E00000007F8000-000000067F000040020000E00000007FC000__000000931B9A2710 000000067F000040020000E00000007FC000-000000067F000040020000E0000000800000__0000004A297FFC38 000000067F000040020000E00000007FC000-000000067F000040020000E0000000800000__0000005D2FFFFB38 000000067F000040020000E00000007FC000-000000067F000040020000E0000000800000__00000073AD3FE6B8 000000067F000040020000E00000007FC000-000000067F000040020000E0000000800000__000000914E3F38F0 000000067F000040020000E00000007FC000-000000067F000040020000E0000000800000__000000931B9A2710 000000067F000040020000E00000007FE326-000000067F000040020000E0000000806CF5__000000495313EB21-0000004A02BBD6B1 000000067F000040020000E0000000800000-000000067F000040020000E0000000804000__0000004A297FFC38 000000067F000040020000E0000000800000-000000067F000040020000E0000000804000__0000005D2FFFFB38 000000067F000040020000E0000000800000-000000067F000040020000E0000000804000__00000073AD3FE6B8 
000000067F000040020000E0000000800000-000000067F000040020000E0000000804000__000000914E3F38F0 000000067F000040020000E0000000800000-000000067F000040020000E0000000804000__000000931B9A2710 000000067F000040020000E0000000804000-000000067F000040020000E0000000808000__0000004A297FFC38 000000067F000040020000E0000000804000-000000067F000040020000E0000000808000__0000005D2FFFFB38 000000067F000040020000E0000000804000-000000067F000040020000E0000000808000__00000073AD3FE6B8 000000067F000040020000E0000000804000-000000067F000040020000E0000000808000__000000914E3F38F0 000000067F000040020000E0000000804000-000000067F000040020000E0000000808000__000000931B9A2710 000000067F000040020000E0000000806CF5-000000067F000040020000E000000080F6D5__000000495313EB21-0000004A02BBD6B1 000000067F000040020000E0000000808000-000000067F000040020000E000000080C000__0000004A297FFC38 000000067F000040020000E0000000808000-000000067F000040020000E000000080C000__0000005D2FFFFB38 000000067F000040020000E0000000808000-000000067F000040020000E000000080C000__00000073AD3FE6B8 000000067F000040020000E0000000808000-000000067F000040020000E000000080C000__000000914E3F38F0 000000067F000040020000E0000000808000-000000067F000040020000E000000080C000__000000931B9A2710 000000067F000040020000E000000080C000-000000067F000040020000E0000000810000__0000004A297FFC38 000000067F000040020000E000000080C000-000000067F000040020000E0000000810000__0000005D2FFFFB38 000000067F000040020000E000000080C000-000000067F000040020000E0000000810000__00000073AD3FE6B8 000000067F000040020000E000000080C000-000000067F000040020000E0000000810000__000000914E3F38F0 000000067F000040020000E000000080C000-000000067F000040020000E0000000810000__000000931B9A2710 000000067F000040020000E000000080F6D5-000000067F000040020000E00000008180B1__000000495313EB21-0000004A02BBD6B1 000000067F000040020000E0000000810000-000000067F000040020000E0000000814000__0000004A297FFC38 000000067F000040020000E0000000810000-000000067F000040020000E0000000814000__0000005D2FFFFB38 
000000067F000040020000E0000000810000-000000067F000040020000E0000000814000__00000073AD3FE6B8 000000067F000040020000E0000000810000-000000067F000040020000E0000000814000__000000914E3F38F0 000000067F000040020000E0000000810000-000000067F000040020000E0000000814000__000000931B9A2710 000000067F000040020000E0000000814000-000000067F000040020000E0000000818000__0000004A297FFC38 000000067F000040020000E0000000814000-000000067F000040020000E0000000818000__0000005D2FFFFB38 000000067F000040020000E0000000814000-000000067F000040020000E0000000818000__00000073AD3FE6B8 000000067F000040020000E0000000814000-000000067F000040020000E0000000818000__000000914E3F38F0 000000067F000040020000E0000000814000-000000067F000040020000E0000000818000__000000931B9A2710 000000067F000040020000E0000000818000-000000067F000040020000E000000081C000__0000004A297FFC38 000000067F000040020000E0000000818000-000000067F000040020000E000000081C000__0000005D2FFFFB38 000000067F000040020000E0000000818000-000000067F000040020000E000000081C000__00000073AD3FE6B8 000000067F000040020000E0000000818000-000000067F000040020000E000000081C000__000000914E3F38F0 000000067F000040020000E0000000818000-000000067F000040020000E000000081C000__000000931B9A2710 000000067F000040020000E00000008180B1-000000067F000040020000E0000000820A9A__000000495313EB21-0000004A02BBD6B1 000000067F000040020000E000000081C000-000000067F000040020000E0000000820000__0000004A297FFC38 000000067F000040020000E000000081C000-000000067F000040020000E0000000820000__0000005D2FFFFB38 000000067F000040020000E000000081C000-000000067F000040020000E0000000820000__00000073AD3FE6B8 000000067F000040020000E000000081C000-000000067F000040020000E0000000820000__000000914E3F38F0 000000067F000040020000E000000081C000-000000067F000040020000E0000000820000__000000931B9A2710 000000067F000040020000E0000000820000-000000067F000040020000E0000000824000__0000004A297FFC38 000000067F000040020000E0000000820000-000000067F000040020000E0000000824000__0000005D2FFFFB38 
000000067F000040020000E0000000820000-000000067F000040020000E0000000824000__00000073AD3FE6B8 000000067F000040020000E0000000820000-000000067F000040020000E0000000824000__000000914E3F38F0 000000067F000040020000E0000000820000-000000067F000040020000E0000000824000__000000931B9A2710 000000067F000040020000E0000000820A9A-000000067F000040020000E000000082946F__000000495313EB21-0000004A02BBD6B1 000000067F000040020000E0000000824000-000000067F000040020000E0000000828000__0000004A297FFC38 000000067F000040020000E0000000824000-000000067F000040020000E0000000828000__0000005D2FFFFB38 000000067F000040020000E0000000824000-000000067F000040020000E0000000828000__00000073AD3FE6B8 000000067F000040020000E0000000824000-000000067F000040020000E0000000828000__000000914E3F38F0 000000067F000040020000E0000000824000-000000067F000040020000E0000000828000__000000931B9A2710 000000067F000040020000E0000000828000-000000067F000040020000E000000082C000__0000004A297FFC38 000000067F000040020000E0000000828000-000000067F000040020000E000000082C000__0000005D2FFFFB38 000000067F000040020000E0000000828000-000000067F000040020000E000000082C000__00000073AD3FE6B8 000000067F000040020000E0000000828000-000000067F000040020000E000000082C000__000000914E3F38F0 000000067F000040020000E0000000828000-000000067F000040020000E000000082C000__000000931B9A2710 000000067F000040020000E000000082946F-000000067F000040020000E0000000831E53__000000495313EB21-0000004A02BBD6B1 000000067F000040020000E000000082C000-000000067F000040020000E0000000830000__0000004A297FFC38 000000067F000040020000E000000082C000-000000067F000040020000E0000000830000__0000005D2FFFFB38 000000067F000040020000E000000082C000-000000067F000040020000E0000000830000__00000073AD3FE6B8 000000067F000040020000E000000082C000-000000067F000040020000E0000000830000__000000914E3F38F0 000000067F000040020000E000000082C000-000000067F000040020000E0000000830000__000000931B9A2710 000000067F000040020000E0000000830000-000000067F000040020000E0000000834000__0000004A297FFC38 
000000067F000040020000E0000000830000-000000067F000040020000E0000000834000__0000005D2FFFFB38 000000067F000040020000E0000000830000-000000067F000040020000E0000000834000__00000073AD3FE6B8 000000067F000040020000E0000000830000-000000067F000040020000E0000000834000__000000914E3F38F0 000000067F000040020000E0000000830000-000000067F000040020000E0000000834000__000000931B9A2710 000000067F000040020000E0000000831E53-000000067F000040020000E000000083A834__000000495313EB21-0000004A02BBD6B1 000000067F000040020000E0000000834000-000000067F000040020000E0000000838000__0000004A297FFC38 000000067F000040020000E0000000834000-000000067F000040020000E0000000838000__0000005D2FFFFB38 000000067F000040020000E0000000834000-000000067F000040020000E0000000838000__00000073AD3FE6B8 000000067F000040020000E0000000834000-000000067F000040020000E0000000838000__000000914E3F38F0 000000067F000040020000E0000000834000-000000067F000040020000E0000000838000__000000931B9A2710 000000067F000040020000E0000000838000-000000067F000040020000E000000083C000__0000004A297FFC38 000000067F000040020000E0000000838000-000000067F000040020000E000000083C000__0000005D2FFFFB38 000000067F000040020000E0000000838000-000000067F000040020000E000000083C000__00000073AD3FE6B8 000000067F000040020000E0000000838000-000000067F000040020000E000000083C000__000000914E3F38F0 000000067F000040020000E0000000838000-000000067F000040020000E000000083C000__000000931B9A2710 000000067F000040020000E000000083A834-000000067F000040020000E0000000843201__000000495313EB21-0000004A02BBD6B1 000000067F000040020000E000000083C000-000000067F000040020000E0000000840000__0000004A297FFC38 000000067F000040020000E000000083C000-000000067F000040020000E0000000840000__0000005D2FFFFB38 000000067F000040020000E000000083C000-000000067F000040020000E0000000840000__00000073AD3FE6B8 000000067F000040020000E000000083C000-000000067F000040020000E0000000840000__000000914E3F38F0 000000067F000040020000E000000083C000-000000067F000040020000E0000000840000__000000931B9A2710 
000000067F000040020000E0000000840000-000000067F000040020000E0000000844000__0000004A297FFC38 000000067F000040020000E0000000840000-000000067F000040020000E0000000844000__000000574B7FF240 000000067F000040020000E0000000840000-000000067F000040020000E0000000844000__00000073AD3FE6B8 000000067F000040020000E0000000840000-000000067F000040020000E0000000844000__000000914E3F38F0 000000067F000040020000E0000000840000-000000067F000040020000E0000000844000__000000931B9A2710 000000067F000040020000E0000000843201-000000067F000040020000E0000100000000__000000495313EB21-0000004A02BBD6B1 000000067F000040020000E0000000843529-000000067F000040020000E000000084BF01__0000004A02BBD6B1-0000004AA26BDB49 000000067F000040020000E0000000844000-000000067F000040020000E0000000848000__0000004A297FFC38 000000067F000040020000E0000000844000-000000067F000040020000E0000000848000__000000574B7FF240 000000067F000040020000E0000000844000-000000067F000040020000E0000000848000__00000073AD3FE6B8 000000067F000040020000E0000000844000-000000067F000040020000E0000000848000__000000914E3F38F0 000000067F000040020000E0000000844000-000000067F000040020000E0000000848000__000000931B9A2710 000000067F000040020000E0000000848000-000000067F000040020000E000000084C000__0000004A297FFC38 000000067F000040020000E0000000848000-000000067F000040020000E000000084C000__000000574B7FF240 000000067F000040020000E0000000848000-000000067F000040020000E000000084C000__00000073AD3FE6B8 000000067F000040020000E0000000848000-000000067F000040020000E000000084C000__000000914E3F38F0 000000067F000040020000E0000000848000-000000067F000040020000E000000084C000__000000931B9A2710 000000067F000040020000E000000084BF01-000000067F000040020000E00000008548D9__0000004A02BBD6B1-0000004AA26BDB49 000000067F000040020000E000000084C000-000000067F000040020000E0000000850000__0000004A297FFC38 000000067F000040020000E000000084C000-000000067F000040020000E0000000850000__000000574B7FF240 000000067F000040020000E000000084C000-000000067F000040020000E0000000850000__00000073AD3FE6B8 
000000067F000040020000E000000084C000-000000067F000040020000E0000000850000__000000914E3F38F0 000000067F000040020000E000000084C000-000000067F000040020000E0000000850000__000000931B9A2710 000000067F000040020000E0000000850000-000000067F000040020000E0000000854000__0000004A297FFC38 000000067F000040020000E0000000850000-000000067F000040020000E0000000854000__000000574B7FF240 000000067F000040020000E0000000850000-000000067F000040020000E0000000854000__00000073AD3FE6B8 000000067F000040020000E0000000850000-000000067F000040020000E0000000854000__000000914E3F38F0 000000067F000040020000E0000000850000-000000067F000040020000E0000000854000__000000931B9A2710 000000067F000040020000E0000000854000-000000067F000040020000E0000000858000__0000004A297FFC38 000000067F000040020000E0000000854000-000000067F000040020000E0000000858000__000000574B7FF240 000000067F000040020000E0000000854000-000000067F000040020000E0000000858000__00000073AD3FE6B8 000000067F000040020000E0000000854000-000000067F000040020000E0000000858000__000000914E3F38F0 000000067F000040020000E0000000854000-000000067F000040020000E0000000858000__000000931B9A2710 000000067F000040020000E00000008548D9-000000067F000040020000E000000085D2CA__0000004A02BBD6B1-0000004AA26BDB49 000000067F000040020000E0000000858000-000000067F000040020000E000000085C000__000000574B7FF240 000000067F000040020000E0000000858000-000000067F000040020000E000000085C000__00000073AD3FE6B8 000000067F000040020000E0000000858000-000000067F000040020000E000000085C000__000000914E3F38F0 000000067F000040020000E0000000858000-000000067F000040020000E000000085C000__000000931B9A2710 000000067F000040020000E0000000858000-030000000000000000000000000000000002__0000004A297FFC38 000000067F000040020000E000000085C000-000000067F000040020000E0000000860000__000000574B7FF240 000000067F000040020000E000000085C000-000000067F000040020000E0000000860000__00000073AD3FE6B8 000000067F000040020000E000000085C000-000000067F000040020000E0000000860000__000000914E3F38F0 
000000067F000040020000E000000085C000-000000067F000040020000E0000000860000__000000931B9A2710 000000067F000040020000E000000085D2CA-000000067F000040020000E0000000865CB1__0000004A02BBD6B1-0000004AA26BDB49 000000067F000040020000E0000000860000-000000067F000040020000E0000000864000__000000574B7FF240 000000067F000040020000E0000000860000-000000067F000040020000E0000000864000__00000073AD3FE6B8 000000067F000040020000E0000000860000-000000067F000040020000E0000000864000__000000914E3F38F0 000000067F000040020000E0000000860000-000000067F000040020000E0000000864000__000000931B9A2710 000000067F000040020000E0000000864000-000000067F000040020000E0000000868000__000000574B7FF240 000000067F000040020000E0000000864000-000000067F000040020000E0000000868000__00000073AD3FE6B8 000000067F000040020000E0000000864000-000000067F000040020000E0000000868000__000000914E3F38F0 000000067F000040020000E0000000864000-000000067F000040020000E0000000868000__000000931B9A2710 000000067F000040020000E0000000865CB1-000000067F000040020000E000000086E688__0000004A02BBD6B1-0000004AA26BDB49 000000067F000040020000E0000000868000-000000067F000040020000E000000086C000__000000574B7FF240 000000067F000040020000E0000000868000-000000067F000040020000E000000086C000__00000073AD3FE6B8 000000067F000040020000E0000000868000-000000067F000040020000E000000086C000__000000914E3F38F0 000000067F000040020000E0000000868000-000000067F000040020000E000000086C000__000000931B9A2710 000000067F000040020000E000000086C000-000000067F000040020000E0000000870000__000000574B7FF240 000000067F000040020000E000000086C000-000000067F000040020000E0000000870000__00000073AD3FE6B8 000000067F000040020000E000000086C000-000000067F000040020000E0000000870000__000000914E3F38F0 000000067F000040020000E000000086C000-000000067F000040020000E0000000870000__000000931B9A2710 000000067F000040020000E000000086E688-000000067F000040020000E0000000877067__0000004A02BBD6B1-0000004AA26BDB49 000000067F000040020000E0000000870000-000000067F000040020000E0000000874000__000000574B7FF240 
000000067F000040020000E0000000870000-000000067F000040020000E0000000874000__00000073AD3FE6B8 000000067F000040020000E0000000870000-000000067F000040020000E0000000874000__000000914E3F38F0 000000067F000040020000E0000000870000-000000067F000040020000E0000000874000__000000931B9A2710 000000067F000040020000E0000000874000-000000067F000040020000E0000000878000__000000574B7FF240 000000067F000040020000E0000000874000-000000067F000040020000E0000000878000__00000073AD3FE6B8 000000067F000040020000E0000000874000-000000067F000040020000E0000000878000__000000914E3F38F0 000000067F000040020000E0000000874000-000000067F000040020000E0000000878000__000000931B9A2710 000000067F000040020000E0000000877067-000000067F000040020000E000000087FA40__0000004A02BBD6B1-0000004AA26BDB49 000000067F000040020000E0000000878000-000000067F000040020000E000000087C000__000000574B7FF240 000000067F000040020000E0000000878000-000000067F000040020000E000000087C000__00000073AD3FE6B8 000000067F000040020000E0000000878000-000000067F000040020000E000000087C000__000000914E3F38F0 000000067F000040020000E0000000878000-000000067F000040020000E000000087C000__000000931B9A2710 000000067F000040020000E000000087C000-000000067F000040020000E0000000880000__000000574B7FF240 000000067F000040020000E000000087C000-000000067F000040020000E0000000880000__00000073AD3FE6B8 000000067F000040020000E000000087C000-000000067F000040020000E0000000880000__000000914E3F38F0 000000067F000040020000E000000087C000-000000067F000040020000E0000000880000__000000931B9A2710 000000067F000040020000E000000087FA40-000000067F000040020000E0000000888413__0000004A02BBD6B1-0000004AA26BDB49 000000067F000040020000E0000000880000-000000067F000040020000E0000000884000__000000574B7FF240 000000067F000040020000E0000000880000-000000067F000040020000E0000000884000__00000073AD3FE6B8 000000067F000040020000E0000000880000-000000067F000040020000E0000000884000__000000914E3F38F0 000000067F000040020000E0000000880000-000000067F000040020000E0000000884000__000000931B9A2710 
000000067F000040020000E0000000884000-000000067F000040020000E0000000888000__000000574B7FF240 000000067F000040020000E0000000884000-000000067F000040020000E0000000888000__00000073AD3FE6B8 000000067F000040020000E0000000884000-000000067F000040020000E0000000888000__000000914E3F38F0 000000067F000040020000E0000000884000-000000067F000040020000E0000000888000__000000931B9A2710 000000067F000040020000E0000000888000-000000067F000040020000E000000088C000__000000574B7FF240 000000067F000040020000E0000000888000-000000067F000040020000E000000088C000__00000073AD3FE6B8 000000067F000040020000E0000000888000-000000067F000040020000E000000088C000__000000914E3F38F0 000000067F000040020000E0000000888000-000000067F000040020000E000000088C000__000000931B9A2710 000000067F000040020000E0000000888413-000000067F000040020000E0000000890DE6__0000004A02BBD6B1-0000004AA26BDB49 000000067F000040020000E000000088C000-000000067F000040020000E0000000890000__000000574B7FF240 000000067F000040020000E000000088C000-000000067F000040020000E0000000890000__00000073AD3FE6B8 000000067F000040020000E000000088C000-000000067F000040020000E0000000890000__000000914E3F38F0 000000067F000040020000E000000088C000-000000067F000040020000E0000000890000__000000931B9A2710 000000067F000040020000E0000000890000-000000067F000040020000E0000000894000__000000574B7FF240 000000067F000040020000E0000000890000-000000067F000040020000E0000000894000__00000073AD3FE6B8 000000067F000040020000E0000000890000-000000067F000040020000E0000000894000__000000914E3F38F0 000000067F000040020000E0000000890000-000000067F000040020000E0000000894000__000000931B9A2710 000000067F000040020000E0000000890DE6-000000067F000040020000E00000008997D0__0000004A02BBD6B1-0000004AA26BDB49 000000067F000040020000E0000000894000-000000067F000040020000E0000000898000__000000574B7FF240 000000067F000040020000E0000000894000-000000067F000040020000E0000000898000__00000073AD3FE6B8 000000067F000040020000E0000000894000-000000067F000040020000E0000000898000__000000914E3F38F0 
000000067F000040020000E0000000894000-000000067F000040020000E0000000898000__000000931B9A2710 000000067F000040020000E0000000898000-000000067F000040020000E000000089C000__000000572A7A05D8 000000067F000040020000E0000000898000-000000067F000040020000E000000089C000__0000005D2FFFFB38 000000067F000040020000E0000000898000-000000067F000040020000E000000089C000__00000073AD3FE6B8 000000067F000040020000E0000000898000-000000067F000040020000E000000089C000__000000914E3F38F0 000000067F000040020000E0000000898000-000000067F000040020000E000000089C000__000000931B9A2710 000000067F000040020000E00000008997D0-000000067F000040020000E0000100000000__0000004A02BBD6B1-0000004AA26BDB49 000000067F000040020000E0000000899AAA-000000067F000040020000E00000008A248D__0000004AA26BDB49-0000004B421BFF39 000000067F000040020000E000000089C000-000000067F000040020000E00000008A0000__000000572A7A05D8 000000067F000040020000E000000089C000-000000067F000040020000E00000008A0000__0000005D2FFFFB38 000000067F000040020000E000000089C000-000000067F000040020000E00000008A0000__00000073AD3FE6B8 000000067F000040020000E000000089C000-000000067F000040020000E00000008A0000__000000914E3F38F0 000000067F000040020000E000000089C000-000000067F000040020000E00000008A0000__000000931B9A2710 000000067F000040020000E00000008A0000-000000067F000040020000E00000008A4000__000000572A7A05D8 000000067F000040020000E00000008A0000-000000067F000040020000E00000008A4000__0000005D2FFFFB38 000000067F000040020000E00000008A0000-000000067F000040020000E00000008A4000__00000073AD3FE6B8 000000067F000040020000E00000008A0000-000000067F000040020000E00000008A4000__000000914E3F38F0 000000067F000040020000E00000008A0000-000000067F000040020000E00000008A4000__000000931B9A2710 000000067F000040020000E00000008A248D-000000067F000040020000E00000008AAE5E__0000004AA26BDB49-0000004B421BFF39 000000067F000040020000E00000008A4000-000000067F000040020000E00000008A8000__000000572A7A05D8 000000067F000040020000E00000008A4000-000000067F000040020000E00000008A8000__0000005D2FFFFB38 
000000067F000040020000E00000008A4000-000000067F000040020000E00000008A8000__00000073AD3FE6B8 000000067F000040020000E00000008A4000-000000067F000040020000E00000008A8000__000000914E3F38F0 000000067F000040020000E00000008A4000-000000067F000040020000E00000008A8000__000000931B9A2710 000000067F000040020000E00000008A8000-000000067F000040020000E00000008AC000__000000572A7A05D8 000000067F000040020000E00000008A8000-000000067F000040020000E00000008AC000__0000005D2FFFFB38 000000067F000040020000E00000008A8000-000000067F000040020000E00000008AC000__00000073AD3FE6B8 000000067F000040020000E00000008A8000-000000067F000040020000E00000008AC000__000000914E3F38F0 000000067F000040020000E00000008A8000-000000067F000040020000E00000008AC000__000000931B9A2710 000000067F000040020000E00000008AAE5E-000000067F000040020000E00000008B383C__0000004AA26BDB49-0000004B421BFF39 000000067F000040020000E00000008AC000-000000067F000040020000E00000008B0000__000000572A7A05D8 000000067F000040020000E00000008AC000-000000067F000040020000E00000008B0000__0000005D2FFFFB38 000000067F000040020000E00000008AC000-000000067F000040020000E00000008B0000__00000073AD3FE6B8 000000067F000040020000E00000008AC000-000000067F000040020000E00000008B0000__000000914E3F38F0 000000067F000040020000E00000008AC000-000000067F000040020000E00000008B0000__000000931B9A2710 000000067F000040020000E00000008B0000-000000067F000040020000E00000008B4000__000000572A7A05D8 000000067F000040020000E00000008B0000-000000067F000040020000E00000008B4000__0000005D2FFFFB38 000000067F000040020000E00000008B0000-000000067F000040020000E00000008B4000__00000073AD3FE6B8 000000067F000040020000E00000008B0000-000000067F000040020000E00000008B4000__000000914E3F38F0 000000067F000040020000E00000008B0000-000000067F000040020000E00000008B4000__000000931B9A2710 000000067F000040020000E00000008B383C-000000067F000040020000E00000008BC219__0000004AA26BDB49-0000004B421BFF39 000000067F000040020000E00000008B4000-000000067F000040020000E00000008B8000__000000572A7A05D8 
000000067F000040020000E00000008B4000-000000067F000040020000E00000008B8000__0000005D2FFFFB38 000000067F000040020000E00000008B4000-000000067F000040020000E00000008B8000__00000073AD3FE6B8 000000067F000040020000E00000008B4000-000000067F000040020000E00000008B8000__000000914E3F38F0 000000067F000040020000E00000008B4000-000000067F000040020000E00000008B8000__000000931B9A2710 000000067F000040020000E00000008B8000-000000067F000040020000E00000008BC000__000000572A7A05D8 000000067F000040020000E00000008B8000-000000067F000040020000E00000008BC000__0000005D2FFFFB38 000000067F000040020000E00000008B8000-000000067F000040020000E00000008BC000__00000073AD3FE6B8 000000067F000040020000E00000008B8000-000000067F000040020000E00000008BC000__000000914E3F38F0 000000067F000040020000E00000008B8000-000000067F000040020000E00000008BC000__000000931B9A2710 000000067F000040020000E00000008BC000-000000067F000040020000E00000008C0000__000000572A7A05D8 000000067F000040020000E00000008BC000-000000067F000040020000E00000008C0000__0000005D2FFFFB38 000000067F000040020000E00000008BC000-000000067F000040020000E00000008C0000__00000073AD3FE6B8 000000067F000040020000E00000008BC000-000000067F000040020000E00000008C0000__000000914E3F38F0 000000067F000040020000E00000008BC000-000000067F000040020000E00000008C0000__000000931B9A2710 000000067F000040020000E00000008BC219-000000067F000040020000E00000008C4BE6__0000004AA26BDB49-0000004B421BFF39 000000067F000040020000E00000008C0000-000000067F000040020000E00000008C4000__000000572A7A05D8 000000067F000040020000E00000008C0000-000000067F000040020000E00000008C4000__0000005D2FFFFB38 000000067F000040020000E00000008C0000-000000067F000040020000E00000008C4000__00000073AD3FE6B8 000000067F000040020000E00000008C0000-000000067F000040020000E00000008C4000__000000914E3F38F0 000000067F000040020000E00000008C0000-000000067F000040020000E00000008C4000__000000931B9A2710 000000067F000040020000E00000008C4000-000000067F000040020000E00000008C8000__000000572A7A05D8 
000000067F000040020000E00000008C4000-000000067F000040020000E00000008C8000__0000005D2FFFFB38 000000067F000040020000E00000008C4000-000000067F000040020000E00000008C8000__00000073AD3FE6B8 000000067F000040020000E00000008C4000-000000067F000040020000E00000008C8000__000000914E3F38F0 000000067F000040020000E00000008C4000-000000067F000040020000E00000008C8000__000000931B9A2710 000000067F000040020000E00000008C4BE6-000000067F000040020000E00000008CD5D3__0000004AA26BDB49-0000004B421BFF39 000000067F000040020000E00000008C8000-000000067F000040020000E00000008CC000__000000572A7A05D8 000000067F000040020000E00000008C8000-000000067F000040020000E00000008CC000__0000005D2FFFFB38 000000067F000040020000E00000008C8000-000000067F000040020000E00000008CC000__00000073AD3FE6B8 000000067F000040020000E00000008C8000-000000067F000040020000E00000008CC000__000000914E3F38F0 000000067F000040020000E00000008C8000-000000067F000040020000E00000008CC000__000000931B9A2710 000000067F000040020000E00000008CC000-000000067F000040020000E00000008D0000__000000572A7A05D8 000000067F000040020000E00000008CC000-000000067F000040020000E00000008D0000__0000005D2FFFFB38 000000067F000040020000E00000008CC000-000000067F000040020000E00000008D0000__00000073AD3FE6B8 000000067F000040020000E00000008CC000-000000067F000040020000E00000008D0000__000000914E3F38F0 000000067F000040020000E00000008CC000-000000067F000040020000E00000008D0000__000000931B9A2710 000000067F000040020000E00000008CD5D3-000000067F000040020000E00000008D5FBE__0000004AA26BDB49-0000004B421BFF39 000000067F000040020000E00000008D0000-000000067F000040020000E00000008D4000__000000572A7A05D8 000000067F000040020000E00000008D0000-000000067F000040020000E00000008D4000__0000005D2FFFFB38 000000067F000040020000E00000008D0000-000000067F000040020000E00000008D4000__00000073AD3FE6B8 000000067F000040020000E00000008D0000-000000067F000040020000E00000008D4000__000000914E3F38F0 000000067F000040020000E00000008D0000-000000067F000040020000E00000008D4000__000000931B9A2710 
000000067F000040020000E00000008D4000-000000067F000040020000E00000008D8000__000000572A7A05D8 000000067F000040020000E00000008D4000-000000067F000040020000E00000008D8000__0000005D2FFFFB38 000000067F000040020000E00000008D4000-000000067F000040020000E00000008D8000__00000073AD3FE6B8 000000067F000040020000E00000008D4000-000000067F000040020000E00000008D8000__000000914E3F38F0 000000067F000040020000E00000008D4000-000000067F000040020000E00000008D8000__000000931B9A2710 000000067F000040020000E00000008D5FBE-000000067F000040020000E00000008DE9A8__0000004AA26BDB49-0000004B421BFF39 000000067F000040020000E00000008D8000-000000067F000040020000E00000008DC000__000000572A7A05D8 000000067F000040020000E00000008D8000-000000067F000040020000E00000008DC000__0000005D2FFFFB38 000000067F000040020000E00000008D8000-000000067F000040020000E00000008DC000__00000073AD3FE6B8 000000067F000040020000E00000008D8000-000000067F000040020000E00000008DC000__000000914E3F38F0 000000067F000040020000E00000008D8000-000000067F000040020000E00000008DC000__000000931B9A2710 000000067F000040020000E00000008DC000-000000067F000040020000E00000008E0000__000000572A7A05D8 000000067F000040020000E00000008DC000-000000067F000040020000E00000008E0000__0000005D2FFFFB38 000000067F000040020000E00000008DC000-000000067F000040020000E00000008E0000__00000073AD3FE6B8 000000067F000040020000E00000008DC000-000000067F000040020000E00000008E0000__000000914E3F38F0 000000067F000040020000E00000008DC000-000000067F000040020000E00000008E0000__000000931B9A2710 000000067F000040020000E00000008DE9A8-000000067F000040020000E00000008E737A__0000004AA26BDB49-0000004B421BFF39 000000067F000040020000E00000008E0000-000000067F000040020000E00000008E4000__000000572A7A05D8 000000067F000040020000E00000008E0000-000000067F000040020000E00000008E4000__0000005D2FFFFB38 000000067F000040020000E00000008E0000-000000067F000040020000E00000008E4000__00000073AD3FE6B8 000000067F000040020000E00000008E0000-000000067F000040020000E00000008E4000__000000914E3F38F0 
000000067F000040020000E00000008E0000-000000067F000040020000E00000008E4000__000000931B9A2710 000000067F000040020000E00000008E4000-000000067F000040020000E00000008E8000__000000572A7A05D8 000000067F000040020000E00000008E4000-000000067F000040020000E00000008E8000__0000005D2FFFFB38 000000067F000040020000E00000008E4000-000000067F000040020000E00000008E8000__00000073AD3FE6B8 000000067F000040020000E00000008E4000-000000067F000040020000E00000008E8000__000000914E3F38F0 000000067F000040020000E00000008E4000-000000067F000040020000E00000008E8000__000000931B9A2710 000000067F000040020000E00000008E737A-000000067F000040020000E00000008EFD57__0000004AA26BDB49-0000004B421BFF39 000000067F000040020000E00000008E8000-000000067F000040020000E00000008EC000__000000572A7A05D8 000000067F000040020000E00000008E8000-000000067F000040020000E00000008EC000__0000005D2FFFFB38 000000067F000040020000E00000008E8000-000000067F000040020000E00000008EC000__00000073AD3FE6B8 000000067F000040020000E00000008E8000-000000067F000040020000E00000008EC000__000000914E3F38F0 000000067F000040020000E00000008E8000-000000067F000040020000E00000008EC000__000000931B9A2710 000000067F000040020000E00000008EC000-000000067F000040020000E00000008F0000__000000572A7A05D8 000000067F000040020000E00000008EC000-000000067F000040020000E00000008F0000__0000005D2FFFFB38 000000067F000040020000E00000008EC000-000000067F000040020000E00000008F0000__00000073AD3FE6B8 000000067F000040020000E00000008EC000-000000067F000040020000E00000008F0000__000000914E3F38F0 000000067F000040020000E00000008EC000-000000067F000040020000E00000008F0000__000000931B9A2710 000000067F000040020000E00000008EFD57-000000067F000040020000E0000100000000__0000004AA26BDB49-0000004B421BFF39 000000067F000040020000E00000008F0000-000000067F000040020000E00000008F4000__0000004C0EBFF260 000000067F000040020000E00000008F0000-000000067F000040020000E00000008F4000__0000005D2FFFFB38 000000067F000040020000E00000008F0000-000000067F000040020000E00000008F4000__00000073AD3FE6B8 
000000067F000040020000E00000008F0000-000000067F000040020000E00000008F4000__000000914E3F38F0 000000067F000040020000E00000008F0000-000000067F000040020000E00000008F4000__000000931B9A2710 000000067F000040020000E00000008F0021-000000067F000040020000E00000008F89FC__0000004B421BFF39-0000004BE1CBD591 000000067F000040020000E00000008F4000-000000067F000040020000E00000008F8000__0000004C0EBFF260 000000067F000040020000E00000008F4000-000000067F000040020000E00000008F8000__0000005D2FFFFB38 000000067F000040020000E00000008F4000-000000067F000040020000E00000008F8000__00000073AD3FE6B8 000000067F000040020000E00000008F4000-000000067F000040020000E00000008F8000__000000914E3F38F0 000000067F000040020000E00000008F4000-000000067F000040020000E00000008F8000__000000931B9A2710 000000067F000040020000E00000008F8000-000000067F000040020000E00000008FC000__0000004C0EBFF260 000000067F000040020000E00000008F8000-000000067F000040020000E00000008FC000__0000005D2FFFFB38 000000067F000040020000E00000008F8000-000000067F000040020000E00000008FC000__00000073AD3FE6B8 000000067F000040020000E00000008F8000-000000067F000040020000E00000008FC000__000000914E3F38F0 000000067F000040020000E00000008F8000-000000067F000040020000E00000008FC000__000000931B9A2710 000000067F000040020000E00000008F89FC-000000067F000040020000E00000009013D5__0000004B421BFF39-0000004BE1CBD591 000000067F000040020000E00000008FC000-000000067F000040020000E0000000900000__0000004C0EBFF260 000000067F000040020000E00000008FC000-000000067F000040020000E0000000900000__0000005D2FFFFB38 000000067F000040020000E00000008FC000-000000067F000040020000E0000000900000__00000073AD3FE6B8 000000067F000040020000E00000008FC000-000000067F000040020000E0000000900000__000000914E3F38F0 000000067F000040020000E00000008FC000-000000067F000040020000E0000000900000__000000931B9A2710 000000067F000040020000E0000000900000-000000067F000040020000E0000000904000__0000004C0EBFF260 000000067F000040020000E0000000900000-000000067F000040020000E0000000904000__0000005D2FFFFB38 
000000067F000040020000E0000000900000-000000067F000040020000E0000000904000__00000073AD3FE6B8 000000067F000040020000E0000000900000-000000067F000040020000E0000000904000__000000914E3F38F0 000000067F000040020000E0000000900000-000000067F000040020000E0000000904000__000000931B9A2710 000000067F000040020000E00000009013D5-000000067F000040020000E0000000909DC8__0000004B421BFF39-0000004BE1CBD591 000000067F000040020000E0000000904000-000000067F000040020000E0000000908000__0000004C0EBFF260 000000067F000040020000E0000000904000-000000067F000040020000E0000000908000__0000005D2FFFFB38 000000067F000040020000E0000000904000-000000067F000040020000E0000000908000__00000073AD3FE6B8 000000067F000040020000E0000000904000-000000067F000040020000E0000000908000__000000914E3F38F0 000000067F000040020000E0000000904000-000000067F000040020000E0000000908000__000000931B9A2710 000000067F000040020000E0000000908000-000000067F000040020000E000000090C000__0000004C0EBFF260 000000067F000040020000E0000000908000-000000067F000040020000E000000090C000__0000005D2FFFFB38 000000067F000040020000E0000000908000-000000067F000040020000E000000090C000__00000073AD3FE6B8 000000067F000040020000E0000000908000-000000067F000040020000E000000090C000__000000914E3F38F0 000000067F000040020000E0000000908000-000000067F000040020000E000000090C000__000000931B9A2710 000000067F000040020000E0000000909DC8-000000067F000040020000E00000009127AB__0000004B421BFF39-0000004BE1CBD591 000000067F000040020000E000000090C000-000000067F000040020000E0000000910000__0000004C0EBFF260 000000067F000040020000E000000090C000-000000067F000040020000E0000000910000__0000005D2FFFFB38 000000067F000040020000E000000090C000-000000067F000040020000E0000000910000__00000073AD3FE6B8 000000067F000040020000E000000090C000-000000067F000040020000E0000000910000__000000914E3F38F0 000000067F000040020000E000000090C000-000000067F000040020000E0000000910000__000000931B9A2710 000000067F000040020000E0000000910000-000000067F000040020000E0000000914000__0000004C0EBFF260 
000000067F000040020000E0000000910000-000000067F000040020000E0000000914000__0000005D2FFFFB38 000000067F000040020000E0000000910000-000000067F000040020000E0000000914000__00000073AD3FE6B8 000000067F000040020000E0000000910000-000000067F000040020000E0000000914000__000000914E3F38F0 000000067F000040020000E0000000910000-000000067F000040020000E0000000914000__000000931B9A2710 000000067F000040020000E00000009127AB-000000067F000040020000E000000091B185__0000004B421BFF39-0000004BE1CBD591 000000067F000040020000E0000000914000-000000067F000040020000E0000000918000__0000004C0EBFF260 000000067F000040020000E0000000914000-000000067F000040020000E0000000918000__0000005D2FFFFB38 000000067F000040020000E0000000914000-000000067F000040020000E0000000918000__00000073AD3FE6B8 000000067F000040020000E0000000914000-000000067F000040020000E0000000918000__000000914E3F38F0 000000067F000040020000E0000000914000-000000067F000040020000E0000000918000__000000931B9A2710 000000067F000040020000E0000000918000-000000067F000040020000E000000091C000__0000004C0EBFF260 000000067F000040020000E0000000918000-000000067F000040020000E000000091C000__0000005D2FFFFB38 000000067F000040020000E0000000918000-000000067F000040020000E000000091C000__00000073AD3FE6B8 000000067F000040020000E0000000918000-000000067F000040020000E000000091C000__000000914E3F38F0 000000067F000040020000E0000000918000-000000067F000040020000E000000091C000__000000931B9A2710 000000067F000040020000E000000091B185-000000067F000040020000E0000000923B55__0000004B421BFF39-0000004BE1CBD591 000000067F000040020000E000000091C000-000000067F000040020000E0000000920000__0000004C0EBFF260 000000067F000040020000E000000091C000-000000067F000040020000E0000000920000__0000005D2FFFFB38 000000067F000040020000E000000091C000-000000067F000040020000E0000000920000__00000073AD3FE6B8 000000067F000040020000E000000091C000-000000067F000040020000E0000000920000__000000914E3F38F0 000000067F000040020000E000000091C000-000000067F000040020000E0000000920000__000000931B9A2710 
000000067F000040020000E0000000920000-000000067F000040020000E0000000924000__0000004C0EBFF260 000000067F000040020000E0000000920000-000000067F000040020000E0000000924000__0000005D2FFFFB38 000000067F000040020000E0000000920000-000000067F000040020000E0000000924000__00000073AD3FE6B8 000000067F000040020000E0000000920000-000000067F000040020000E0000000924000__000000914E3F38F0 000000067F000040020000E0000000920000-000000067F000040020000E0000000924000__000000931B9A2710 000000067F000040020000E0000000923B55-000000067F000040020000E000000092C536__0000004B421BFF39-0000004BE1CBD591 000000067F000040020000E0000000924000-000000067F000040020000E0000000928000__0000004C0EBFF260 000000067F000040020000E0000000924000-000000067F000040020000E0000000928000__0000005D2FFFFB38 000000067F000040020000E0000000924000-000000067F000040020000E0000000928000__00000073AD3FE6B8 000000067F000040020000E0000000924000-000000067F000040020000E0000000928000__000000914E3F38F0 000000067F000040020000E0000000924000-000000067F000040020000E0000000928000__000000931B9A2710 000000067F000040020000E0000000928000-000000067F000040020000E000000092C000__0000004C0EBFF260 000000067F000040020000E0000000928000-000000067F000040020000E000000092C000__0000005D2FFFFB38 000000067F000040020000E0000000928000-000000067F000040020000E000000092C000__00000073AD3FE6B8 000000067F000040020000E0000000928000-000000067F000040020000E000000092C000__000000914E3F38F0 000000067F000040020000E0000000928000-000000067F000040020000E000000092C000__000000931B9A2710 000000067F000040020000E000000092C000-000000067F000040020000E0000000930000__0000004C0EBFF260 000000067F000040020000E000000092C000-000000067F000040020000E0000000930000__0000005D2FFFFB38 000000067F000040020000E000000092C000-000000067F000040020000E0000000930000__00000073AD3FE6B8 000000067F000040020000E000000092C000-000000067F000040020000E0000000930000__000000914E3F38F0 000000067F000040020000E000000092C000-000000067F000040020000E0000000930000__000000931B9A2710 
000000067F000040020000E000000092C536-000000067F000040020000E0000000934F0F__0000004B421BFF39-0000004BE1CBD591 000000067F000040020000E0000000930000-000000067F000040020000E0000000934000__0000004C0EBFF260 000000067F000040020000E0000000930000-000000067F000040020000E0000000934000__0000005D2FFFFB38 000000067F000040020000E0000000930000-000000067F000040020000E0000000934000__00000073AD3FE6B8 000000067F000040020000E0000000930000-000000067F000040020000E0000000934000__000000914E3F38F0 000000067F000040020000E0000000930000-000000067F000040020000E0000000934000__000000931B9A2710 000000067F000040020000E0000000934000-000000067F000040020000E0000000938000__0000004C0EBFF260 000000067F000040020000E0000000934000-000000067F000040020000E0000000938000__0000005D2FFFFB38 000000067F000040020000E0000000934000-000000067F000040020000E0000000938000__00000073AD3FE6B8 000000067F000040020000E0000000934000-000000067F000040020000E0000000938000__000000914E3F38F0 000000067F000040020000E0000000934000-000000067F000040020000E0000000938000__000000931B9A2710 000000067F000040020000E0000000934F0F-000000067F000040020000E000000093D8E2__0000004B421BFF39-0000004BE1CBD591 000000067F000040020000E0000000938000-000000067F000040020000E000000093C000__0000004C0EBFF260 000000067F000040020000E0000000938000-000000067F000040020000E000000093C000__0000005D2FFFFB38 000000067F000040020000E0000000938000-000000067F000040020000E000000093C000__00000073AD3FE6B8 000000067F000040020000E0000000938000-000000067F000040020000E000000093C000__000000914E3F38F0 000000067F000040020000E0000000938000-000000067F000040020000E000000093C000__000000931B9A2710 000000067F000040020000E000000093C000-000000067F000040020000E0000000940000__0000004C0EBFF260 000000067F000040020000E000000093C000-000000067F000040020000E0000000940000__0000005D2FFFFB38 000000067F000040020000E000000093C000-000000067F000040020000E0000000940000__00000073AD3FE6B8 000000067F000040020000E000000093C000-000000067F000040020000E0000000940000__000000914E3F38F0 
000000067F000040020000E000000093C000-000000067F000040020000E0000000940000__000000931B9A2710 000000067F000040020000E000000093D8E2-000000067F000040020000E00000009462D1__0000004B421BFF39-0000004BE1CBD591 000000067F000040020000E0000000940000-000000067F000040020000E0000000944000__0000004C0EBFF260 000000067F000040020000E0000000940000-000000067F000040020000E0000000944000__0000005D2FFFFB38 000000067F000040020000E0000000940000-000000067F000040020000E0000000944000__00000073AD3FE6B8 000000067F000040020000E0000000940000-000000067F000040020000E0000000944000__000000914E3F38F0 000000067F000040020000E0000000940000-000000067F000040020000E0000000944000__000000931B9A2710 000000067F000040020000E0000000944000-000000067F000040020000E0000000948000__0000004C0EBFF260 000000067F000040020000E0000000944000-000000067F000040020000E0000000948000__000000574B7FF240 000000067F000040020000E0000000944000-000000067F000040020000E0000000948000__00000073AD3FE6B8 000000067F000040020000E0000000944000-000000067F000040020000E0000000948000__000000914E3F38F0 000000067F000040020000E0000000944000-000000067F000040020000E0000000948000__000000931B9A2710 000000067F000040020000E00000009462D1-000000067F000040020000E0000100000000__0000004B421BFF39-0000004BE1CBD591 000000067F000040020000E000000094659B-000000067F000040020000E000000094EF81__0000004BE1CBD591-0000004C9173DB81 000000067F000040020000E0000000948000-000000067F000040020000E000000094C000__0000004C0EBFF260 000000067F000040020000E0000000948000-000000067F000040020000E000000094C000__000000574B7FF240 000000067F000040020000E0000000948000-000000067F000040020000E000000094C000__00000073AD3FE6B8 000000067F000040020000E0000000948000-000000067F000040020000E000000094C000__000000914E3F38F0 000000067F000040020000E0000000948000-000000067F000040020000E000000094C000__000000931B9A2710 000000067F000040020000E000000094C000-000000067F000040020000E0000000950000__0000004C0EBFF260 000000067F000040020000E000000094C000-000000067F000040020000E0000000950000__000000574B7FF240 
000000067F000040020000E000000094C000-000000067F000040020000E0000000950000__00000073AD3FE6B8 000000067F000040020000E000000094C000-000000067F000040020000E0000000950000__000000914E3F38F0 000000067F000040020000E000000094C000-000000067F000040020000E0000000950000__000000931B9A2710 000000067F000040020000E000000094EF81-000000067F000040020000E000000095795E__0000004BE1CBD591-0000004C9173DB81 000000067F000040020000E0000000950000-000000067F000040020000E0000000954000__0000004C0EBFF260 000000067F000040020000E0000000950000-000000067F000040020000E0000000954000__000000574B7FF240 000000067F000040020000E0000000950000-000000067F000040020000E0000000954000__00000073AD3FE6B8 000000067F000040020000E0000000950000-000000067F000040020000E0000000954000__000000914E3F38F0 000000067F000040020000E0000000950000-000000067F000040020000E0000000954000__000000931B9A2710 000000067F000040020000E0000000954000-000000067F000040020000E0000000958000__0000004C0EBFF260 000000067F000040020000E0000000954000-000000067F000040020000E0000000958000__000000574B7FF240 000000067F000040020000E0000000954000-000000067F000040020000E0000000958000__00000073AD3FE6B8 000000067F000040020000E0000000954000-000000067F000040020000E0000000958000__000000914E3F38F0 000000067F000040020000E0000000954000-000000067F000040020000E0000000958000__000000931B9A2710 000000067F000040020000E000000095795E-000000067F000040020000E0000000960335__0000004BE1CBD591-0000004C9173DB81 000000067F000040020000E0000000958000-000000067F000040020000E000000095C000__0000004C0EBFF260 000000067F000040020000E0000000958000-000000067F000040020000E000000095C000__000000574B7FF240 000000067F000040020000E0000000958000-000000067F000040020000E000000095C000__00000073AD3FE6B8 000000067F000040020000E0000000958000-000000067F000040020000E000000095C000__000000914E3F38F0 000000067F000040020000E0000000958000-000000067F000040020000E000000095C000__000000931B9A2710 000000067F000040020000E000000095C000-000000067F000040020000E0000000960000__000000574B7FF240 
000000067F000040020000E000000095C000-000000067F000040020000E0000000960000__00000073AD3FE6B8 000000067F000040020000E000000095C000-000000067F000040020000E0000000960000__000000914E3F38F0 000000067F000040020000E000000095C000-000000067F000040020000E0000000960000__000000931B9A2710 000000067F000040020000E000000095C000-030000000000000000000000000000000002__0000004C0EBFF260 000000067F000040020000E0000000960000-000000067F000040020000E0000000964000__000000574B7FF240 000000067F000040020000E0000000960000-000000067F000040020000E0000000964000__00000073AD3FE6B8 000000067F000040020000E0000000960000-000000067F000040020000E0000000964000__000000914E3F38F0 000000067F000040020000E0000000960000-000000067F000040020000E0000000964000__000000931B9A2710 000000067F000040020000E0000000960335-000000067F000040020000E0000000968D05__0000004BE1CBD591-0000004C9173DB81 000000067F000040020000E0000000964000-000000067F000040020000E0000000968000__000000574B7FF240 000000067F000040020000E0000000964000-000000067F000040020000E0000000968000__00000073AD3FE6B8 000000067F000040020000E0000000964000-000000067F000040020000E0000000968000__000000914E3F38F0 000000067F000040020000E0000000964000-000000067F000040020000E0000000968000__000000931B9A2710 000000067F000040020000E0000000968000-000000067F000040020000E000000096C000__000000574B7FF240 000000067F000040020000E0000000968000-000000067F000040020000E000000096C000__00000073AD3FE6B8 000000067F000040020000E0000000968000-000000067F000040020000E000000096C000__000000914E3F38F0 000000067F000040020000E0000000968000-000000067F000040020000E000000096C000__000000931B9A2710 000000067F000040020000E0000000968D05-000000067F000040020000E00000009716D4__0000004BE1CBD591-0000004C9173DB81 000000067F000040020000E000000096C000-000000067F000040020000E0000000970000__000000574B7FF240 000000067F000040020000E000000096C000-000000067F000040020000E0000000970000__00000073AD3FE6B8 000000067F000040020000E000000096C000-000000067F000040020000E0000000970000__000000914E3F38F0 
000000067F000040020000E000000096C000-000000067F000040020000E0000000970000__000000931B9A2710 000000067F000040020000E0000000970000-000000067F000040020000E0000000974000__000000574B7FF240 000000067F000040020000E0000000970000-000000067F000040020000E0000000974000__00000073AD3FE6B8 000000067F000040020000E0000000970000-000000067F000040020000E0000000974000__000000914E3F38F0 000000067F000040020000E0000000970000-000000067F000040020000E0000000974000__000000931B9A2710 000000067F000040020000E00000009716D4-000000067F000040020000E000000097A0B2__0000004BE1CBD591-0000004C9173DB81 000000067F000040020000E0000000974000-000000067F000040020000E0000000978000__000000574B7FF240 000000067F000040020000E0000000974000-000000067F000040020000E0000000978000__00000073AD3FE6B8 000000067F000040020000E0000000974000-000000067F000040020000E0000000978000__000000914E3F38F0 000000067F000040020000E0000000974000-000000067F000040020000E0000000978000__000000931B9A2710 000000067F000040020000E0000000978000-000000067F000040020000E000000097C000__000000574B7FF240 000000067F000040020000E0000000978000-000000067F000040020000E000000097C000__00000073AD3FE6B8 000000067F000040020000E0000000978000-000000067F000040020000E000000097C000__000000914E3F38F0 000000067F000040020000E0000000978000-000000067F000040020000E000000097C000__000000931B9A2710 000000067F000040020000E000000097A0B2-000000067F000040020000E0000000982A9D__0000004BE1CBD591-0000004C9173DB81 000000067F000040020000E000000097C000-000000067F000040020000E0000000980000__000000574B7FF240 000000067F000040020000E000000097C000-000000067F000040020000E0000000980000__00000073AD3FE6B8 000000067F000040020000E000000097C000-000000067F000040020000E0000000980000__000000914E3F38F0 000000067F000040020000E000000097C000-000000067F000040020000E0000000980000__000000931B9A2710 000000067F000040020000E0000000980000-000000067F000040020000E0000000984000__000000574B7FF240 000000067F000040020000E0000000980000-000000067F000040020000E0000000984000__00000073AD3FE6B8 
000000067F000040020000E0000000980000-000000067F000040020000E0000000984000__000000914E3F38F0 000000067F000040020000E0000000980000-000000067F000040020000E0000000984000__000000931B9A2710 000000067F000040020000E0000000982A9D-000000067F000040020000E000000098B483__0000004BE1CBD591-0000004C9173DB81 000000067F000040020000E0000000984000-000000067F000040020000E0000000988000__000000574B7FF240 000000067F000040020000E0000000984000-000000067F000040020000E0000000988000__00000073AD3FE6B8 000000067F000040020000E0000000984000-000000067F000040020000E0000000988000__000000914E3F38F0 000000067F000040020000E0000000984000-000000067F000040020000E0000000988000__000000931B9A2710 000000067F000040020000E0000000988000-000000067F000040020000E000000098C000__000000574B7FF240 000000067F000040020000E0000000988000-000000067F000040020000E000000098C000__00000073AD3FE6B8 000000067F000040020000E0000000988000-000000067F000040020000E000000098C000__000000914E3F38F0 000000067F000040020000E0000000988000-000000067F000040020000E000000098C000__000000931B9A2710 000000067F000040020000E000000098B483-000000067F000040020000E0000000993E61__0000004BE1CBD591-0000004C9173DB81 000000067F000040020000E000000098C000-000000067F000040020000E0000000990000__000000574B7FF240 000000067F000040020000E000000098C000-000000067F000040020000E0000000990000__00000073AD3FE6B8 000000067F000040020000E000000098C000-000000067F000040020000E0000000990000__000000914E3F38F0 000000067F000040020000E000000098C000-000000067F000040020000E0000000990000__000000931B9A2710 000000067F000040020000E0000000990000-000000067F000040020000E0000000994000__000000574B7FF240 000000067F000040020000E0000000990000-000000067F000040020000E0000000994000__00000073AD3FE6B8 000000067F000040020000E0000000990000-000000067F000040020000E0000000994000__000000914E3F38F0 000000067F000040020000E0000000990000-000000067F000040020000E0000000994000__000000931B9A2710 000000067F000040020000E0000000993E61-000000067F000040020000E000000099C837__0000004BE1CBD591-0000004C9173DB81 
000000067F000040020000E0000000994000-000000067F000040020000E0000000998000__000000574B7FF240 000000067F000040020000E0000000994000-000000067F000040020000E0000000998000__00000073AD3FE6B8 000000067F000040020000E0000000994000-000000067F000040020000E0000000998000__000000914E3F38F0 000000067F000040020000E0000000994000-000000067F000040020000E0000000998000__000000931B9A2710 000000067F000040020000E0000000998000-000000067F000040020000E000000099C000__000000574B7FF240 000000067F000040020000E0000000998000-000000067F000040020000E000000099C000__00000073AD3FE6B8 000000067F000040020000E0000000998000-000000067F000040020000E000000099C000__000000914E3F38F0 000000067F000040020000E0000000998000-000000067F000040020000E000000099C000__000000931B9A2710 000000067F000040020000E000000099C000-000000067F000040020000E00000009A0000__000000574B7FF240 000000067F000040020000E000000099C000-000000067F000040020000E00000009A0000__00000073AD3FE6B8 000000067F000040020000E000000099C000-000000067F000040020000E00000009A0000__000000914E3F38F0 000000067F000040020000E000000099C000-000000067F000040020000E00000009A0000__000000931B9A2710 000000067F000040020000E000000099C837-000000067F000040020000E00000009A5205__0000004BE1CBD591-0000004C9173DB81 000000067F000040020000E00000009A0000-000000067F000040020000E00000009A4000__000000574B7FF240 000000067F000040020000E00000009A0000-000000067F000040020000E00000009A4000__00000073AD3FE6B8 000000067F000040020000E00000009A0000-000000067F000040020000E00000009A4000__000000914E3F38F0 000000067F000040020000E00000009A0000-000000067F000040020000E00000009A4000__000000931B9A2710 000000067F000040020000E00000009A4000-000000067F000040020000E00000009A8000__000000572A7A05D8 000000067F000040020000E00000009A4000-000000067F000040020000E00000009A8000__0000005D2FFFFB38 000000067F000040020000E00000009A4000-000000067F000040020000E00000009A8000__00000073AD3FE6B8 000000067F000040020000E00000009A4000-000000067F000040020000E00000009A8000__000000914E3F38F0 
000000067F000040020000E00000009A4000-000000067F000040020000E00000009A8000__000000931B9A2710 000000067F000040020000E00000009A5205-000000067F000040020000E0000100000000__0000004BE1CBD591-0000004C9173DB81 000000067F000040020000E00000009A552F-000000067F000040020000E00000009ADEFC__0000004C9173DB81-0000004D3123ED31 000000067F000040020000E00000009A8000-000000067F000040020000E00000009AC000__000000572A7A05D8 000000067F000040020000E00000009A8000-000000067F000040020000E00000009AC000__0000005D2FFFFB38 000000067F000040020000E00000009A8000-000000067F000040020000E00000009AC000__00000073AD3FE6B8 000000067F000040020000E00000009A8000-000000067F000040020000E00000009AC000__000000914E3F38F0 000000067F000040020000E00000009A8000-000000067F000040020000E00000009AC000__000000931B9A2710 000000067F000040020000E00000009AC000-000000067F000040020000E00000009B0000__000000572A7A05D8 000000067F000040020000E00000009AC000-000000067F000040020000E00000009B0000__0000005D2FFFFB38 000000067F000040020000E00000009AC000-000000067F000040020000E00000009B0000__00000073AD3FE6B8 000000067F000040020000E00000009AC000-000000067F000040020000E00000009B0000__000000914E3F38F0 000000067F000040020000E00000009AC000-000000067F000040020000E00000009B0000__000000931B9A2710 000000067F000040020000E00000009ADEFC-000000067F000040020000E00000009B68E7__0000004C9173DB81-0000004D3123ED31 000000067F000040020000E00000009B0000-000000067F000040020000E00000009B4000__000000572A7A05D8 000000067F000040020000E00000009B0000-000000067F000040020000E00000009B4000__0000005D2FFFFB38 000000067F000040020000E00000009B0000-000000067F000040020000E00000009B4000__00000073AD3FE6B8 000000067F000040020000E00000009B0000-000000067F000040020000E00000009B4000__000000914E3F38F0 000000067F000040020000E00000009B0000-000000067F000040020000E00000009B4000__000000931B9A2710 000000067F000040020000E00000009B4000-000000067F000040020000E00000009B8000__000000572A7A05D8 000000067F000040020000E00000009B4000-000000067F000040020000E00000009B8000__0000005D2FFFFB38 
000000067F000040020000E00000009B4000-000000067F000040020000E00000009B8000__00000073AD3FE6B8 000000067F000040020000E00000009B4000-000000067F000040020000E00000009B8000__000000914E3F38F0 000000067F000040020000E00000009B4000-000000067F000040020000E00000009B8000__000000931B9A2710 000000067F000040020000E00000009B68E7-000000067F000040020000E00000009BF2D2__0000004C9173DB81-0000004D3123ED31 000000067F000040020000E00000009B8000-000000067F000040020000E00000009BC000__000000572A7A05D8 000000067F000040020000E00000009B8000-000000067F000040020000E00000009BC000__0000005D2FFFFB38 000000067F000040020000E00000009B8000-000000067F000040020000E00000009BC000__00000073AD3FE6B8 000000067F000040020000E00000009B8000-000000067F000040020000E00000009BC000__000000914E3F38F0 000000067F000040020000E00000009B8000-000000067F000040020000E00000009BC000__000000931B9A2710 000000067F000040020000E00000009BC000-000000067F000040020000E00000009C0000__000000572A7A05D8 000000067F000040020000E00000009BC000-000000067F000040020000E00000009C0000__0000005D2FFFFB38 000000067F000040020000E00000009BC000-000000067F000040020000E00000009C0000__00000073AD3FE6B8 000000067F000040020000E00000009BC000-000000067F000040020000E00000009C0000__000000914E3F38F0 000000067F000040020000E00000009BC000-000000067F000040020000E00000009C0000__000000931B9A2710 000000067F000040020000E00000009BF2D2-000000067F000040020000E00000009C7CB7__0000004C9173DB81-0000004D3123ED31 000000067F000040020000E00000009C0000-000000067F000040020000E00000009C4000__000000572A7A05D8 000000067F000040020000E00000009C0000-000000067F000040020000E00000009C4000__0000005D2FFFFB38 000000067F000040020000E00000009C0000-000000067F000040020000E00000009C4000__00000073AD3FE6B8 000000067F000040020000E00000009C0000-000000067F000040020000E00000009C4000__000000914E3F38F0 000000067F000040020000E00000009C0000-000000067F000040020000E00000009C4000__000000931B9A2710 000000067F000040020000E00000009C4000-000000067F000040020000E00000009C8000__000000572A7A05D8 
000000067F000040020000E00000009C4000-000000067F000040020000E00000009C8000__0000005D2FFFFB38 000000067F000040020000E00000009C4000-000000067F000040020000E00000009C8000__00000073AD3FE6B8 000000067F000040020000E00000009C4000-000000067F000040020000E00000009C8000__000000914E3F38F0 000000067F000040020000E00000009C4000-000000067F000040020000E00000009C8000__000000931B9A2710 000000067F000040020000E00000009C7CB7-000000067F000040020000E00000009D0695__0000004C9173DB81-0000004D3123ED31 000000067F000040020000E00000009C8000-000000067F000040020000E00000009CC000__000000572A7A05D8 000000067F000040020000E00000009C8000-000000067F000040020000E00000009CC000__0000005D2FFFFB38 000000067F000040020000E00000009C8000-000000067F000040020000E00000009CC000__00000073AD3FE6B8 000000067F000040020000E00000009C8000-000000067F000040020000E00000009CC000__000000914E3F38F0 000000067F000040020000E00000009C8000-000000067F000040020000E00000009CC000__000000931B9A2710 000000067F000040020000E00000009CC000-000000067F000040020000E00000009D0000__000000572A7A05D8 000000067F000040020000E00000009CC000-000000067F000040020000E00000009D0000__0000005D2FFFFB38 000000067F000040020000E00000009CC000-000000067F000040020000E00000009D0000__00000073AD3FE6B8 000000067F000040020000E00000009CC000-000000067F000040020000E00000009D0000__000000914E3F38F0 000000067F000040020000E00000009CC000-000000067F000040020000E00000009D0000__000000931B9A2710 000000067F000040020000E00000009D0000-000000067F000040020000E00000009D4000__000000572A7A05D8 000000067F000040020000E00000009D0000-000000067F000040020000E00000009D4000__0000005D2FFFFB38 000000067F000040020000E00000009D0000-000000067F000040020000E00000009D4000__00000073AD3FE6B8 000000067F000040020000E00000009D0000-000000067F000040020000E00000009D4000__000000914E3F38F0 000000067F000040020000E00000009D0000-000000067F000040020000E00000009D4000__000000931B9A2710 000000067F000040020000E00000009D0695-000000067F000040020000E00000009D9071__0000004C9173DB81-0000004D3123ED31 
000000067F000040020000E00000009D4000-000000067F000040020000E00000009D8000__000000572A7A05D8 000000067F000040020000E00000009D4000-000000067F000040020000E00000009D8000__0000005D2FFFFB38 000000067F000040020000E00000009D4000-000000067F000040020000E00000009D8000__00000073AD3FE6B8 000000067F000040020000E00000009D4000-000000067F000040020000E00000009D8000__000000914E3F38F0 000000067F000040020000E00000009D4000-000000067F000040020000E00000009D8000__000000931B9A2710 000000067F000040020000E00000009D8000-000000067F000040020000E00000009DC000__000000572A7A05D8 000000067F000040020000E00000009D8000-000000067F000040020000E00000009DC000__0000005D2FFFFB38 000000067F000040020000E00000009D8000-000000067F000040020000E00000009DC000__00000073AD3FE6B8 000000067F000040020000E00000009D8000-000000067F000040020000E00000009DC000__000000914E3F38F0 000000067F000040020000E00000009D8000-000000067F000040020000E00000009DC000__000000931B9A2710 000000067F000040020000E00000009D9071-000000067F000040020000E00000009E1A46__0000004C9173DB81-0000004D3123ED31 000000067F000040020000E00000009DC000-000000067F000040020000E00000009E0000__000000572A7A05D8 000000067F000040020000E00000009DC000-000000067F000040020000E00000009E0000__0000005D2FFFFB38 000000067F000040020000E00000009DC000-000000067F000040020000E00000009E0000__00000073AD3FE6B8 000000067F000040020000E00000009DC000-000000067F000040020000E00000009E0000__000000914E3F38F0 000000067F000040020000E00000009DC000-000000067F000040020000E00000009E0000__000000931B9A2710 000000067F000040020000E00000009E0000-000000067F000040020000E00000009E4000__000000572A7A05D8 000000067F000040020000E00000009E0000-000000067F000040020000E00000009E4000__0000005D2FFFFB38 000000067F000040020000E00000009E0000-000000067F000040020000E00000009E4000__00000073AD3FE6B8 000000067F000040020000E00000009E0000-000000067F000040020000E00000009E4000__000000914E3F38F0 000000067F000040020000E00000009E0000-000000067F000040020000E00000009E4000__000000931B9A2710 
000000067F000040020000E00000009E1A46-000000067F000040020000E00000009EA421__0000004C9173DB81-0000004D3123ED31 000000067F000040020000E00000009E4000-000000067F000040020000E00000009E8000__000000572A7A05D8 000000067F000040020000E00000009E4000-000000067F000040020000E00000009E8000__0000005D2FFFFB38 000000067F000040020000E00000009E4000-000000067F000040020000E00000009E8000__00000073AD3FE6B8 000000067F000040020000E00000009E4000-000000067F000040020000E00000009E8000__000000914E3F38F0 000000067F000040020000E00000009E4000-000000067F000040020000E00000009E8000__000000931B9A2710 000000067F000040020000E00000009E8000-000000067F000040020000E00000009EC000__000000572A7A05D8 000000067F000040020000E00000009E8000-000000067F000040020000E00000009EC000__0000005D2FFFFB38 000000067F000040020000E00000009E8000-000000067F000040020000E00000009EC000__00000073AD3FE6B8 000000067F000040020000E00000009E8000-000000067F000040020000E00000009EC000__000000914E3F38F0 000000067F000040020000E00000009E8000-000000067F000040020000E00000009EC000__000000931B9A2710 000000067F000040020000E00000009EA421-000000067F000040020000E00000009F2DFA__0000004C9173DB81-0000004D3123ED31 000000067F000040020000E00000009EC000-000000067F000040020000E00000009F0000__000000572A7A05D8 000000067F000040020000E00000009EC000-000000067F000040020000E00000009F0000__0000005D2FFFFB38 000000067F000040020000E00000009EC000-000000067F000040020000E00000009F0000__00000073AD3FE6B8 000000067F000040020000E00000009EC000-000000067F000040020000E00000009F0000__000000914E3F38F0 000000067F000040020000E00000009EC000-000000067F000040020000E00000009F0000__000000931B9A2710 000000067F000040020000E00000009F0000-000000067F000040020000E00000009F4000__000000572A7A05D8 000000067F000040020000E00000009F0000-000000067F000040020000E00000009F4000__0000005D2FFFFB38 000000067F000040020000E00000009F0000-000000067F000040020000E00000009F4000__00000073AD3FE6B8 000000067F000040020000E00000009F0000-000000067F000040020000E00000009F4000__000000914E3F38F0 
000000067F000040020000E00000009F0000-000000067F000040020000E00000009F4000__000000931B9A2710 000000067F000040020000E00000009F2DFA-000000067F000040020000E00000009FB7E4__0000004C9173DB81-0000004D3123ED31 000000067F000040020000E00000009F4000-000000067F000040020000E00000009F8000__000000572A7A05D8 000000067F000040020000E00000009F4000-000000067F000040020000E00000009F8000__0000005D2FFFFB38 000000067F000040020000E00000009F4000-000000067F000040020000E00000009F8000__00000073AD3FE6B8 000000067F000040020000E00000009F4000-000000067F000040020000E00000009F8000__000000914E3F38F0 000000067F000040020000E00000009F4000-000000067F000040020000E00000009F8000__000000931B9A2710 000000067F000040020000E00000009F8000-000000067F000040020000E00000009FC000__0000004E11956660 000000067F000040020000E00000009F8000-000000067F000040020000E00000009FC000__0000005D2FFFFB38 000000067F000040020000E00000009F8000-000000067F000040020000E00000009FC000__00000073AD3FE6B8 000000067F000040020000E00000009F8000-000000067F000040020000E00000009FC000__000000914E3F38F0 000000067F000040020000E00000009F8000-000000067F000040020000E00000009FC000__000000931B9A2710 000000067F000040020000E00000009FB7E4-000000067F000040020000E0000100000000__0000004C9173DB81-0000004D3123ED31 000000067F000040020000E00000009FBAAD-000000067F000040020000E0000000A0449F__0000004D3123ED31-0000004DE0CBDCD1 000000067F000040020000E00000009FC000-000000067F000040020000E0000000A00000__0000004E11956660 000000067F000040020000E00000009FC000-000000067F000040020000E0000000A00000__0000005D2FFFFB38 000000067F000040020000E00000009FC000-000000067F000040020000E0000000A00000__00000073AD3FE6B8 000000067F000040020000E00000009FC000-000000067F000040020000E0000000A00000__000000914E3F38F0 000000067F000040020000E00000009FC000-000000067F000040020000E0000000A00000__000000931B9A2710 000000067F000040020000E0000000A00000-000000067F000040020000E0000000A04000__0000004E11956660 000000067F000040020000E0000000A00000-000000067F000040020000E0000000A04000__0000005D2FFFFB38 
000000067F000040020000E0000000A00000-000000067F000040020000E0000000A04000__00000073AD3FE6B8 000000067F000040020000E0000000A00000-000000067F000040020000E0000000A04000__000000914E3F38F0 000000067F000040020000E0000000A00000-000000067F000040020000E0000000A04000__000000931B9A2710 000000067F000040020000E0000000A04000-000000067F000040020000E0000000A08000__0000004E11956660 000000067F000040020000E0000000A04000-000000067F000040020000E0000000A08000__0000005D2FFFFB38 000000067F000040020000E0000000A04000-000000067F000040020000E0000000A08000__00000073AD3FE6B8 000000067F000040020000E0000000A04000-000000067F000040020000E0000000A08000__000000914E3F38F0 000000067F000040020000E0000000A04000-000000067F000040020000E0000000A08000__000000931B9A2710 000000067F000040020000E0000000A0449F-000000067F000040020000E0000000A0CE79__0000004D3123ED31-0000004DE0CBDCD1 000000067F000040020000E0000000A08000-000000067F000040020000E0000000A0C000__0000004E11956660 000000067F000040020000E0000000A08000-000000067F000040020000E0000000A0C000__0000005D2FFFFB38 000000067F000040020000E0000000A08000-000000067F000040020000E0000000A0C000__00000073AD3FE6B8 000000067F000040020000E0000000A08000-000000067F000040020000E0000000A0C000__000000914E3F38F0 000000067F000040020000E0000000A08000-000000067F000040020000E0000000A0C000__000000931B9A2710 000000067F000040020000E0000000A0C000-000000067F000040020000E0000000A10000__0000004E11956660 000000067F000040020000E0000000A0C000-000000067F000040020000E0000000A10000__0000005D2FFFFB38 000000067F000040020000E0000000A0C000-000000067F000040020000E0000000A10000__00000073AD3FE6B8 000000067F000040020000E0000000A0C000-000000067F000040020000E0000000A10000__000000914E3F38F0 000000067F000040020000E0000000A0C000-000000067F000040020000E0000000A10000__000000931B9A2710 000000067F000040020000E0000000A0CE79-000000067F000040020000E0000000A15852__0000004D3123ED31-0000004DE0CBDCD1 000000067F000040020000E0000000A10000-000000067F000040020000E0000000A14000__0000004E11956660 
000000067F000040020000E0000000A10000-000000067F000040020000E0000000A14000__0000005D2FFFFB38 000000067F000040020000E0000000A10000-000000067F000040020000E0000000A14000__00000073AD3FE6B8 000000067F000040020000E0000000A10000-000000067F000040020000E0000000A14000__000000914E3F38F0 000000067F000040020000E0000000A10000-000000067F000040020000E0000000A14000__000000931B9A2710 000000067F000040020000E0000000A14000-000000067F000040020000E0000000A18000__0000004E11956660 000000067F000040020000E0000000A14000-000000067F000040020000E0000000A18000__0000005D2FFFFB38 000000067F000040020000E0000000A14000-000000067F000040020000E0000000A18000__00000073AD3FE6B8 000000067F000040020000E0000000A14000-000000067F000040020000E0000000A18000__000000914E3F38F0 000000067F000040020000E0000000A14000-000000067F000040020000E0000000A18000__000000931B9A2710 000000067F000040020000E0000000A15852-000000067F000040020000E0000000A1E225__0000004D3123ED31-0000004DE0CBDCD1 000000067F000040020000E0000000A18000-000000067F000040020000E0000000A1C000__0000004E11956660 000000067F000040020000E0000000A18000-000000067F000040020000E0000000A1C000__0000005D2FFFFB38 000000067F000040020000E0000000A18000-000000067F000040020000E0000000A1C000__00000073AD3FE6B8 000000067F000040020000E0000000A18000-000000067F000040020000E0000000A1C000__000000914E3F38F0 000000067F000040020000E0000000A18000-000000067F000040020000E0000000A1C000__000000931B9A2710 000000067F000040020000E0000000A1C000-000000067F000040020000E0000000A20000__0000004E11956660 000000067F000040020000E0000000A1C000-000000067F000040020000E0000000A20000__0000005D2FFFFB38 000000067F000040020000E0000000A1C000-000000067F000040020000E0000000A20000__00000073AD3FE6B8 000000067F000040020000E0000000A1C000-000000067F000040020000E0000000A20000__000000914E3F38F0 000000067F000040020000E0000000A1C000-000000067F000040020000E0000000A20000__000000931B9A2710 000000067F000040020000E0000000A1E225-000000067F000040020000E0000000A26BF5__0000004D3123ED31-0000004DE0CBDCD1 
000000067F000040020000E0000000A20000-000000067F000040020000E0000000A24000__0000004E11956660 000000067F000040020000E0000000A20000-000000067F000040020000E0000000A24000__0000005D2FFFFB38 000000067F000040020000E0000000A20000-000000067F000040020000E0000000A24000__00000073AD3FE6B8 000000067F000040020000E0000000A20000-000000067F000040020000E0000000A24000__000000914E3F38F0 000000067F000040020000E0000000A20000-000000067F000040020000E0000000A24000__000000931B9A2710 000000067F000040020000E0000000A24000-000000067F000040020000E0000000A28000__0000004E11956660 000000067F000040020000E0000000A24000-000000067F000040020000E0000000A28000__0000005D2FFFFB38 000000067F000040020000E0000000A24000-000000067F000040020000E0000000A28000__00000073AD3FE6B8 000000067F000040020000E0000000A24000-000000067F000040020000E0000000A28000__000000914E3F38F0 000000067F000040020000E0000000A24000-000000067F000040020000E0000000A28000__000000931B9A2710 000000067F000040020000E0000000A26BF5-000000067F000040020000E0000000A2F5E1__0000004D3123ED31-0000004DE0CBDCD1 000000067F000040020000E0000000A28000-000000067F000040020000E0000000A2C000__0000004E11956660 000000067F000040020000E0000000A28000-000000067F000040020000E0000000A2C000__0000005D2FFFFB38 000000067F000040020000E0000000A28000-000000067F000040020000E0000000A2C000__00000073AD3FE6B8 000000067F000040020000E0000000A28000-000000067F000040020000E0000000A2C000__000000914E3F38F0 000000067F000040020000E0000000A28000-000000067F000040020000E0000000A2C000__000000931B9A2710 000000067F000040020000E0000000A2C000-000000067F000040020000E0000000A30000__0000004E11956660 000000067F000040020000E0000000A2C000-000000067F000040020000E0000000A30000__0000005D2FFFFB38 000000067F000040020000E0000000A2C000-000000067F000040020000E0000000A30000__00000073AD3FE6B8 000000067F000040020000E0000000A2C000-000000067F000040020000E0000000A30000__000000914E3F38F0 000000067F000040020000E0000000A2C000-000000067F000040020000E0000000A30000__000000931B9A2710 
000000067F000040020000E0000000A2F5E1-000000067F000040020000E0000000A37FC8__0000004D3123ED31-0000004DE0CBDCD1 000000067F000040020000E0000000A30000-000000067F000040020000E0000000A34000__0000004E11956660 000000067F000040020000E0000000A30000-000000067F000040020000E0000000A34000__0000005D2FFFFB38 000000067F000040020000E0000000A30000-000000067F000040020000E0000000A34000__00000073AD3FE6B8 000000067F000040020000E0000000A30000-000000067F000040020000E0000000A34000__000000914E3F38F0 000000067F000040020000E0000000A30000-000000067F000040020000E0000000A34000__000000931B9A2710 000000067F000040020000E0000000A34000-000000067F000040020000E0000000A38000__0000004E11956660 000000067F000040020000E0000000A34000-000000067F000040020000E0000000A38000__0000005D2FFFFB38 000000067F000040020000E0000000A34000-000000067F000040020000E0000000A38000__00000073AD3FE6B8 000000067F000040020000E0000000A34000-000000067F000040020000E0000000A38000__000000914E3F38F0 000000067F000040020000E0000000A34000-000000067F000040020000E0000000A38000__000000931B9A2710 000000067F000040020000E0000000A37FC8-000000067F000040020000E0000000A409C6__0000004D3123ED31-0000004DE0CBDCD1 000000067F000040020000E0000000A38000-000000067F000040020000E0000000A3C000__0000004E11956660 000000067F000040020000E0000000A38000-000000067F000040020000E0000000A3C000__0000005D2FFFFB38 000000067F000040020000E0000000A38000-000000067F000040020000E0000000A3C000__00000073AD3FE6B8 000000067F000040020000E0000000A38000-000000067F000040020000E0000000A3C000__000000914E3F38F0 000000067F000040020000E0000000A38000-000000067F000040020000E0000000A3C000__000000931B9A2710 000000067F000040020000E0000000A3C000-000000067F000040020000E0000000A40000__0000004E11956660 000000067F000040020000E0000000A3C000-000000067F000040020000E0000000A40000__0000005D2FFFFB38 000000067F000040020000E0000000A3C000-000000067F000040020000E0000000A40000__00000073AD3FE6B8 000000067F000040020000E0000000A3C000-000000067F000040020000E0000000A40000__000000914E3F38F0 
000000067F000040020000E0000000A3C000-000000067F000040020000E0000000A40000__000000931B9A2710 000000067F000040020000E0000000A40000-000000067F000040020000E0000000A44000__0000004E11956660 000000067F000040020000E0000000A40000-000000067F000040020000E0000000A44000__0000005D2FFFFB38 000000067F000040020000E0000000A40000-000000067F000040020000E0000000A44000__00000073AD3FE6B8 000000067F000040020000E0000000A40000-000000067F000040020000E0000000A44000__000000914E3F38F0 000000067F000040020000E0000000A40000-000000067F000040020000E0000000A44000__000000931B9A2710 000000067F000040020000E0000000A409C6-000000067F000040020000E0000000A4939A__0000004D3123ED31-0000004DE0CBDCD1 000000067F000040020000E0000000A44000-000000067F000040020000E0000000A48000__0000004E11956660 000000067F000040020000E0000000A44000-000000067F000040020000E0000000A48000__0000005D2FFFFB38 000000067F000040020000E0000000A44000-000000067F000040020000E0000000A48000__00000073AD3FE6B8 000000067F000040020000E0000000A44000-000000067F000040020000E0000000A48000__000000914E3F38F0 000000067F000040020000E0000000A44000-000000067F000040020000E0000000A48000__000000931B9A2710 000000067F000040020000E0000000A48000-000000067F000040020000E0000000A4C000__0000004E11956660 000000067F000040020000E0000000A48000-000000067F000040020000E0000000A4C000__0000005D2FFFFB38 000000067F000040020000E0000000A48000-000000067F000040020000E0000000A4C000__00000073AD3FE6B8 000000067F000040020000E0000000A48000-000000067F000040020000E0000000A4C000__000000914E3F38F0 000000067F000040020000E0000000A48000-000000067F000040020000E0000000A4C000__000000931B9A2710 000000067F000040020000E0000000A4939A-000000067F000040020000E0000000A51D74__0000004D3123ED31-0000004DE0CBDCD1 000000067F000040020000E0000000A4C000-000000067F000040020000E0000000A50000__0000004E11956660 000000067F000040020000E0000000A4C000-000000067F000040020000E0000000A50000__0000005D2FFFFB38 000000067F000040020000E0000000A4C000-000000067F000040020000E0000000A50000__00000073AD3FE6B8 
000000067F000040020000E0000000A4C000-000000067F000040020000E0000000A50000__000000914E3F38F0 000000067F000040020000E0000000A4C000-000000067F000040020000E0000000A50000__000000931B9A2710 000000067F000040020000E0000000A50000-000000067F000040020000E0000000A54000__0000004E11956660 000000067F000040020000E0000000A50000-000000067F000040020000E0000000A54000__0000005D2FFFFB38 000000067F000040020000E0000000A50000-000000067F000040020000E0000000A54000__00000073AD3FE6B8 000000067F000040020000E0000000A50000-000000067F000040020000E0000000A54000__000000914E3F38F0 000000067F000040020000E0000000A50000-000000067F000040020000E0000000A54000__000000931B9A2710 000000067F000040020000E0000000A51D74-000000067F000040020000E0000000A5A745__0000004D3123ED31-0000004DE0CBDCD1 000000067F000040020000E0000000A54000-000000067F000040020000E0000000A58000__0000004E11956660 000000067F000040020000E0000000A54000-000000067F000040020000E0000000A58000__0000005D2FFFFB38 000000067F000040020000E0000000A54000-000000067F000040020000E0000000A58000__00000073AD3FE6B8 000000067F000040020000E0000000A54000-000000067F000040020000E0000000A58000__000000914E3F38F0 000000067F000040020000E0000000A54000-000000067F000040020000E0000000A58000__000000931B9A2710 000000067F000040020000E0000000A58000-000000067F000040020000E0000000A5C000__0000004E11956660 000000067F000040020000E0000000A58000-000000067F000040020000E0000000A5C000__000000574B7FF240 000000067F000040020000E0000000A58000-000000067F000040020000E0000000A5C000__00000073AD3FE6B8 000000067F000040020000E0000000A58000-000000067F000040020000E0000000A5C000__000000914E3F38F0 000000067F000040020000E0000000A58000-000000067F000040020000E0000000A5C000__000000931B9A2710 000000067F000040020000E0000000A5A745-000000067F000040020000E0000100000000__0000004D3123ED31-0000004DE0CBDCD1 000000067F000040020000E0000000A5AA3B-000000067F000040020000E0000000A6340F__0000004DE0CBDCD1-0000004E807BE039 000000067F000040020000E0000000A5C000-000000067F000040020000E0000000A60000__0000004E11956660 
000000067F000040020000E0000000A5C000-000000067F000040020000E0000000A60000__000000574B7FF240 000000067F000040020000E0000000A5C000-000000067F000040020000E0000000A60000__00000073AD3FE6B8 000000067F000040020000E0000000A5C000-000000067F000040020000E0000000A60000__000000914E3F38F0 000000067F000040020000E0000000A5C000-000000067F000040020000E0000000A60000__000000931B9A2710 000000067F000040020000E0000000A60000-000000067F000040020000E0000000A64000__0000004E11956660 000000067F000040020000E0000000A60000-000000067F000040020000E0000000A64000__000000574B7FF240 000000067F000040020000E0000000A60000-000000067F000040020000E0000000A64000__00000073AD3FE6B8 000000067F000040020000E0000000A60000-000000067F000040020000E0000000A64000__000000914E3F38F0 000000067F000040020000E0000000A60000-000000067F000040020000E0000000A64000__000000931B9A2710 000000067F000040020000E0000000A6340F-000000067F000040020000E0000000A6BDF1__0000004DE0CBDCD1-0000004E807BE039 000000067F000040020000E0000000A64000-000000067F000040020000E0000000A68000__0000004E11956660 000000067F000040020000E0000000A64000-000000067F000040020000E0000000A68000__000000574B7FF240 000000067F000040020000E0000000A64000-000000067F000040020000E0000000A68000__00000073AD3FE6B8 000000067F000040020000E0000000A64000-000000067F000040020000E0000000A68000__000000914E3F38F0 000000067F000040020000E0000000A64000-000000067F000040020000E0000000A68000__000000931B9A2710 000000067F000040020000E0000000A68000-000000067F000040020000E0000000A6C000__0000004E11956660 000000067F000040020000E0000000A68000-000000067F000040020000E0000000A6C000__000000574B7FF240 000000067F000040020000E0000000A68000-000000067F000040020000E0000000A6C000__00000073AD3FE6B8 000000067F000040020000E0000000A68000-000000067F000040020000E0000000A6C000__000000914E3F38F0 000000067F000040020000E0000000A68000-000000067F000040020000E0000000A6C000__000000931B9A2710 000000067F000040020000E0000000A6BDF1-000000067F000040020000E0000000A747DF__0000004DE0CBDCD1-0000004E807BE039 
000000067F000040020000E0000000A6C000-000000067F000040020000E0000000A70000__0000004E11956660 000000067F000040020000E0000000A6C000-000000067F000040020000E0000000A70000__000000574B7FF240 000000067F000040020000E0000000A6C000-000000067F000040020000E0000000A70000__00000073AD3FE6B8 000000067F000040020000E0000000A6C000-000000067F000040020000E0000000A70000__000000914E3F38F0 000000067F000040020000E0000000A6C000-000000067F000040020000E0000000A70000__000000931B9A2710 000000067F000040020000E0000000A70000-000000067F000040020000E0000000A74000__0000004E11956660 000000067F000040020000E0000000A70000-000000067F000040020000E0000000A74000__000000574B7FF240 000000067F000040020000E0000000A70000-000000067F000040020000E0000000A74000__00000073AD3FE6B8 000000067F000040020000E0000000A70000-000000067F000040020000E0000000A74000__000000914E3F38F0 000000067F000040020000E0000000A70000-000000067F000040020000E0000000A74000__000000931B9A2710 000000067F000040020000E0000000A74000-000000067F000040020000E0000000A78000__000000574B7FF240 000000067F000040020000E0000000A74000-000000067F000040020000E0000000A78000__00000073AD3FE6B8 000000067F000040020000E0000000A74000-000000067F000040020000E0000000A78000__000000914E3F38F0 000000067F000040020000E0000000A74000-000000067F000040020000E0000000A78000__000000931B9A2710 000000067F000040020000E0000000A74000-030000000000000000000000000000000002__0000004E11956660 000000067F000040020000E0000000A747DF-000000067F000040020000E0000000A7D1C5__0000004DE0CBDCD1-0000004E807BE039 000000067F000040020000E0000000A78000-000000067F000040020000E0000000A7C000__000000574B7FF240 000000067F000040020000E0000000A78000-000000067F000040020000E0000000A7C000__00000073AD3FE6B8 000000067F000040020000E0000000A78000-000000067F000040020000E0000000A7C000__000000914E3F38F0 000000067F000040020000E0000000A78000-000000067F000040020000E0000000A7C000__000000931B9A2710 000000067F000040020000E0000000A7C000-000000067F000040020000E0000000A80000__000000574B7FF240 
000000067F000040020000E0000000A7C000-000000067F000040020000E0000000A80000__00000073AD3FE6B8 000000067F000040020000E0000000A7C000-000000067F000040020000E0000000A80000__000000914E3F38F0 000000067F000040020000E0000000A7C000-000000067F000040020000E0000000A80000__000000931B9A2710 000000067F000040020000E0000000A7D1C5-000000067F000040020000E0000000A85B9E__0000004DE0CBDCD1-0000004E807BE039 000000067F000040020000E0000000A80000-000000067F000040020000E0000000A84000__000000574B7FF240 000000067F000040020000E0000000A80000-000000067F000040020000E0000000A84000__00000073AD3FE6B8 000000067F000040020000E0000000A80000-000000067F000040020000E0000000A84000__000000914E3F38F0 000000067F000040020000E0000000A80000-000000067F000040020000E0000000A84000__000000931B9A2710 000000067F000040020000E0000000A84000-000000067F000040020000E0000000A88000__000000574B7FF240 000000067F000040020000E0000000A84000-000000067F000040020000E0000000A88000__00000073AD3FE6B8 000000067F000040020000E0000000A84000-000000067F000040020000E0000000A88000__000000914E3F38F0 000000067F000040020000E0000000A84000-000000067F000040020000E0000000A88000__000000931B9A2710 000000067F000040020000E0000000A85B9E-000000067F000040020000E0000000A8E573__0000004DE0CBDCD1-0000004E807BE039 000000067F000040020000E0000000A88000-000000067F000040020000E0000000A8C000__000000574B7FF240 000000067F000040020000E0000000A88000-000000067F000040020000E0000000A8C000__00000073AD3FE6B8 000000067F000040020000E0000000A88000-000000067F000040020000E0000000A8C000__000000914E3F38F0 000000067F000040020000E0000000A88000-000000067F000040020000E0000000A8C000__000000931B9A2710 000000067F000040020000E0000000A8C000-000000067F000040020000E0000000A90000__000000574B7FF240 000000067F000040020000E0000000A8C000-000000067F000040020000E0000000A90000__00000073AD3FE6B8 000000067F000040020000E0000000A8C000-000000067F000040020000E0000000A90000__000000914E3F38F0 000000067F000040020000E0000000A8C000-000000067F000040020000E0000000A90000__000000931B9A2710 
000000067F000040020000E0000000A8E573-000000067F000040020000E0000000A96F4D__0000004DE0CBDCD1-0000004E807BE039 000000067F000040020000E0000000A90000-000000067F000040020000E0000000A94000__000000574B7FF240 000000067F000040020000E0000000A90000-000000067F000040020000E0000000A94000__00000073AD3FE6B8 000000067F000040020000E0000000A90000-000000067F000040020000E0000000A94000__000000914E3F38F0 000000067F000040020000E0000000A90000-000000067F000040020000E0000000A94000__000000931B9A2710 000000067F000040020000E0000000A94000-000000067F000040020000E0000000A98000__000000574B7FF240 000000067F000040020000E0000000A94000-000000067F000040020000E0000000A98000__00000073AD3FE6B8 000000067F000040020000E0000000A94000-000000067F000040020000E0000000A98000__000000914E3F38F0 000000067F000040020000E0000000A94000-000000067F000040020000E0000000A98000__000000931B9A2710 000000067F000040020000E0000000A96F4D-000000067F000040020000E0000000A9F922__0000004DE0CBDCD1-0000004E807BE039 000000067F000040020000E0000000A98000-000000067F000040020000E0000000A9C000__000000574B7FF240 000000067F000040020000E0000000A98000-000000067F000040020000E0000000A9C000__00000073AD3FE6B8 000000067F000040020000E0000000A98000-000000067F000040020000E0000000A9C000__000000914E3F38F0 000000067F000040020000E0000000A98000-000000067F000040020000E0000000A9C000__000000931B9A2710 000000067F000040020000E0000000A9C000-000000067F000040020000E0000000AA0000__000000574B7FF240 000000067F000040020000E0000000A9C000-000000067F000040020000E0000000AA0000__00000073AD3FE6B8 000000067F000040020000E0000000A9C000-000000067F000040020000E0000000AA0000__000000914E3F38F0 000000067F000040020000E0000000A9C000-000000067F000040020000E0000000AA0000__000000931B9A2710 000000067F000040020000E0000000A9F922-000000067F000040020000E0000000AA8300__0000004DE0CBDCD1-0000004E807BE039 000000067F000040020000E0000000AA0000-000000067F000040020000E0000000AA4000__000000574B7FF240 000000067F000040020000E0000000AA0000-000000067F000040020000E0000000AA4000__00000073AD3FE6B8 
000000067F000040020000E0000000AA0000-000000067F000040020000E0000000AA4000__000000914E3F38F0 000000067F000040020000E0000000AA0000-000000067F000040020000E0000000AA4000__000000931B9A2710 000000067F000040020000E0000000AA4000-000000067F000040020000E0000000AA8000__000000574B7FF240 000000067F000040020000E0000000AA4000-000000067F000040020000E0000000AA8000__00000073AD3FE6B8 000000067F000040020000E0000000AA4000-000000067F000040020000E0000000AA8000__000000914E3F38F0 000000067F000040020000E0000000AA4000-000000067F000040020000E0000000AA8000__000000931B9A2710 000000067F000040020000E0000000AA8000-000000067F000040020000E0000000AAC000__000000574B7FF240 000000067F000040020000E0000000AA8000-000000067F000040020000E0000000AAC000__00000073AD3FE6B8 000000067F000040020000E0000000AA8000-000000067F000040020000E0000000AAC000__000000914E3F38F0 000000067F000040020000E0000000AA8000-000000067F000040020000E0000000AAC000__000000931B9A2710 000000067F000040020000E0000000AA8300-000000067F000040020000E0000000AB0CDB__0000004DE0CBDCD1-0000004E807BE039 000000067F000040020000E0000000AAC000-000000067F000040020000E0000000AB0000__000000574B7FF240 000000067F000040020000E0000000AAC000-000000067F000040020000E0000000AB0000__00000073AD3FE6B8 000000067F000040020000E0000000AAC000-000000067F000040020000E0000000AB0000__000000914E3F38F0 000000067F000040020000E0000000AAC000-000000067F000040020000E0000000AB0000__000000931B9A2710 000000067F000040020000E0000000AB0000-000000067F000040020000E0000000AB4000__000000572A7A05D8 000000067F000040020000E0000000AB0000-000000067F000040020000E0000000AB4000__0000005D2FFFFB38 000000067F000040020000E0000000AB0000-000000067F000040020000E0000000AB4000__00000073AD3FE6B8 000000067F000040020000E0000000AB0000-000000067F000040020000E0000000AB4000__000000914E3F38F0 000000067F000040020000E0000000AB0000-000000067F000040020000E0000000AB4000__000000931B9A2710 000000067F000040020000E0000000AB0CDB-000000067F000040020000E0000100000000__0000004DE0CBDCD1-0000004E807BE039 
000000067F000040020000E0000000AB0FBD-000000067F000040020000E0000000AB99A0__0000004E807BE039-0000004F2029EFA9 000000067F000040020000E0000000AB4000-000000067F000040020000E0000000AB8000__000000572A7A05D8 000000067F000040020000E0000000AB4000-000000067F000040020000E0000000AB8000__0000005D2FFFFB38 000000067F000040020000E0000000AB4000-000000067F000040020000E0000000AB8000__00000073AD3FE6B8 000000067F000040020000E0000000AB4000-000000067F000040020000E0000000AB8000__000000914E3F38F0 000000067F000040020000E0000000AB4000-000000067F000040020000E0000000AB8000__000000931B9A2710 000000067F000040020000E0000000AB8000-000000067F000040020000E0000000ABC000__000000572A7A05D8 000000067F000040020000E0000000AB8000-000000067F000040020000E0000000ABC000__0000005D2FFFFB38 000000067F000040020000E0000000AB8000-000000067F000040020000E0000000ABC000__00000073AD3FE6B8 000000067F000040020000E0000000AB8000-000000067F000040020000E0000000ABC000__000000914E3F38F0 000000067F000040020000E0000000AB8000-000000067F000040020000E0000000ABC000__000000931B9A2710 000000067F000040020000E0000000AB99A0-000000067F000040020000E0000000AC237B__0000004E807BE039-0000004F2029EFA9 000000067F000040020000E0000000ABC000-000000067F000040020000E0000000AC0000__000000572A7A05D8 000000067F000040020000E0000000ABC000-000000067F000040020000E0000000AC0000__0000005D2FFFFB38 000000067F000040020000E0000000ABC000-000000067F000040020000E0000000AC0000__00000073AD3FE6B8 000000067F000040020000E0000000ABC000-000000067F000040020000E0000000AC0000__000000914E3F38F0 000000067F000040020000E0000000ABC000-000000067F000040020000E0000000AC0000__000000931B9A2710 000000067F000040020000E0000000AC0000-000000067F000040020000E0000000AC4000__000000572A7A05D8 000000067F000040020000E0000000AC0000-000000067F000040020000E0000000AC4000__0000005D2FFFFB38 000000067F000040020000E0000000AC0000-000000067F000040020000E0000000AC4000__00000073AD3FE6B8 000000067F000040020000E0000000AC0000-000000067F000040020000E0000000AC4000__000000914E3F38F0 
000000067F000040020000E0000000AC0000-000000067F000040020000E0000000AC4000__000000931B9A2710 000000067F000040020000E0000000AC237B-000000067F000040020000E0000000ACAD51__0000004E807BE039-0000004F2029EFA9 000000067F000040020000E0000000AC4000-000000067F000040020000E0000000AC8000__000000572A7A05D8 000000067F000040020000E0000000AC4000-000000067F000040020000E0000000AC8000__0000005D2FFFFB38 000000067F000040020000E0000000AC4000-000000067F000040020000E0000000AC8000__00000073AD3FE6B8 000000067F000040020000E0000000AC4000-000000067F000040020000E0000000AC8000__000000914E3F38F0 000000067F000040020000E0000000AC4000-000000067F000040020000E0000000AC8000__000000931B9A2710 000000067F000040020000E0000000AC8000-000000067F000040020000E0000000ACC000__000000572A7A05D8 000000067F000040020000E0000000AC8000-000000067F000040020000E0000000ACC000__0000005D2FFFFB38 000000067F000040020000E0000000AC8000-000000067F000040020000E0000000ACC000__00000073AD3FE6B8 000000067F000040020000E0000000AC8000-000000067F000040020000E0000000ACC000__000000914E3F38F0 000000067F000040020000E0000000AC8000-000000067F000040020000E0000000ACC000__000000931B9A2710 000000067F000040020000E0000000ACAD51-000000067F000040020000E0000000AD372F__0000004E807BE039-0000004F2029EFA9 000000067F000040020000E0000000ACC000-000000067F000040020000E0000000AD0000__000000572A7A05D8 000000067F000040020000E0000000ACC000-000000067F000040020000E0000000AD0000__0000005D2FFFFB38 000000067F000040020000E0000000ACC000-000000067F000040020000E0000000AD0000__00000073AD3FE6B8 000000067F000040020000E0000000ACC000-000000067F000040020000E0000000AD0000__000000914E3F38F0 000000067F000040020000E0000000ACC000-000000067F000040020000E0000000AD0000__000000931B9A2710 000000067F000040020000E0000000AD0000-000000067F000040020000E0000000AD4000__000000572A7A05D8 000000067F000040020000E0000000AD0000-000000067F000040020000E0000000AD4000__0000005D2FFFFB38 000000067F000040020000E0000000AD0000-000000067F000040020000E0000000AD4000__00000073AD3FE6B8 
000000067F000040020000E0000000AD0000-000000067F000040020000E0000000AD4000__000000914E3F38F0 000000067F000040020000E0000000AD0000-000000067F000040020000E0000000AD4000__000000931B9A2710 000000067F000040020000E0000000AD372F-000000067F000040020000E0000000ADC0FD__0000004E807BE039-0000004F2029EFA9 000000067F000040020000E0000000AD4000-000000067F000040020000E0000000AD8000__000000572A7A05D8 000000067F000040020000E0000000AD4000-000000067F000040020000E0000000AD8000__0000005D2FFFFB38 000000067F000040020000E0000000AD4000-000000067F000040020000E0000000AD8000__00000073AD3FE6B8 000000067F000040020000E0000000AD4000-000000067F000040020000E0000000AD8000__000000914E3F38F0 000000067F000040020000E0000000AD4000-000000067F000040020000E0000000AD8000__000000931B9A2710 000000067F000040020000E0000000AD8000-000000067F000040020000E0000000ADC000__000000572A7A05D8 000000067F000040020000E0000000AD8000-000000067F000040020000E0000000ADC000__0000005D2FFFFB38 000000067F000040020000E0000000AD8000-000000067F000040020000E0000000ADC000__00000073AD3FE6B8 000000067F000040020000E0000000AD8000-000000067F000040020000E0000000ADC000__000000914E3F38F0 000000067F000040020000E0000000AD8000-000000067F000040020000E0000000ADC000__000000931B9A2710 000000067F000040020000E0000000ADC000-000000067F000040020000E0000000AE0000__000000572A7A05D8 000000067F000040020000E0000000ADC000-000000067F000040020000E0000000AE0000__0000005D2FFFFB38 000000067F000040020000E0000000ADC000-000000067F000040020000E0000000AE0000__00000073AD3FE6B8 000000067F000040020000E0000000ADC000-000000067F000040020000E0000000AE0000__000000914E3F38F0 000000067F000040020000E0000000ADC000-000000067F000040020000E0000000AE0000__000000931B9A2710 000000067F000040020000E0000000ADC0FD-000000067F000040020000E0000000AE4AE3__0000004E807BE039-0000004F2029EFA9 000000067F000040020000E0000000AE0000-000000067F000040020000E0000000AE4000__000000572A7A05D8 000000067F000040020000E0000000AE0000-000000067F000040020000E0000000AE4000__0000005D2FFFFB38 
000000067F000040020000E0000000AE0000-000000067F000040020000E0000000AE4000__00000073AD3FE6B8 000000067F000040020000E0000000AE0000-000000067F000040020000E0000000AE4000__000000914E3F38F0 000000067F000040020000E0000000AE0000-000000067F000040020000E0000000AE4000__000000931B9A2710 000000067F000040020000E0000000AE4000-000000067F000040020000E0000000AE8000__000000572A7A05D8 000000067F000040020000E0000000AE4000-000000067F000040020000E0000000AE8000__0000005D2FFFFB38 000000067F000040020000E0000000AE4000-000000067F000040020000E0000000AE8000__00000073AD3FE6B8 000000067F000040020000E0000000AE4000-000000067F000040020000E0000000AE8000__000000914E3F38F0 000000067F000040020000E0000000AE4000-000000067F000040020000E0000000AE8000__000000931B9A2710 000000067F000040020000E0000000AE4AE3-000000067F000040020000E0000000AED4D7__0000004E807BE039-0000004F2029EFA9 000000067F000040020000E0000000AE8000-000000067F000040020000E0000000AEC000__000000572A7A05D8 000000067F000040020000E0000000AE8000-000000067F000040020000E0000000AEC000__0000005D2FFFFB38 000000067F000040020000E0000000AE8000-000000067F000040020000E0000000AEC000__00000073AD3FE6B8 000000067F000040020000E0000000AE8000-000000067F000040020000E0000000AEC000__000000914E3F38F0 000000067F000040020000E0000000AE8000-000000067F000040020000E0000000AEC000__000000931B9A2710 000000067F000040020000E0000000AEC000-000000067F000040020000E0000000AF0000__000000572A7A05D8 000000067F000040020000E0000000AEC000-000000067F000040020000E0000000AF0000__0000005D2FFFFB38 000000067F000040020000E0000000AEC000-000000067F000040020000E0000000AF0000__00000073AD3FE6B8 000000067F000040020000E0000000AEC000-000000067F000040020000E0000000AF0000__000000914E3F38F0 000000067F000040020000E0000000AEC000-000000067F000040020000E0000000AF0000__000000931B9A2710 000000067F000040020000E0000000AED4D7-000000067F000040020000E0000000AF5EBA__0000004E807BE039-0000004F2029EFA9 000000067F000040020000E0000000AF0000-000000067F000040020000E0000000AF4000__000000572A7A05D8 
000000067F000040020000E0000000AF0000-000000067F000040020000E0000000AF4000__0000005D2FFFFB38 000000067F000040020000E0000000AF0000-000000067F000040020000E0000000AF4000__00000073AD3FE6B8 000000067F000040020000E0000000AF0000-000000067F000040020000E0000000AF4000__000000914E3F38F0 000000067F000040020000E0000000AF0000-000000067F000040020000E0000000AF4000__000000931B9A2710 000000067F000040020000E0000000AF4000-000000067F000040020000E0000000AF8000__000000572A7A05D8 000000067F000040020000E0000000AF4000-000000067F000040020000E0000000AF8000__0000005D2FFFFB38 000000067F000040020000E0000000AF4000-000000067F000040020000E0000000AF8000__00000073AD3FE6B8 000000067F000040020000E0000000AF4000-000000067F000040020000E0000000AF8000__000000914E3F38F0 000000067F000040020000E0000000AF4000-000000067F000040020000E0000000AF8000__000000931B9A2710 000000067F000040020000E0000000AF5EBA-000000067F000040020000E0000000AFE88E__0000004E807BE039-0000004F2029EFA9 000000067F000040020000E0000000AF8000-000000067F000040020000E0000000AFC000__000000572A7A05D8 000000067F000040020000E0000000AF8000-000000067F000040020000E0000000AFC000__0000005D2FFFFB38 000000067F000040020000E0000000AF8000-000000067F000040020000E0000000AFC000__00000073AD3FE6B8 000000067F000040020000E0000000AF8000-000000067F000040020000E0000000AFC000__000000914E3F38F0 000000067F000040020000E0000000AF8000-000000067F000040020000E0000000AFC000__000000931B9A2710 000000067F000040020000E0000000AFC000-000000067F000040020000E0000000B00000__000000572A7A05D8 000000067F000040020000E0000000AFC000-000000067F000040020000E0000000B00000__0000005D2FFFFB38 000000067F000040020000E0000000AFC000-000000067F000040020000E0000000B00000__00000073AD3FE6B8 000000067F000040020000E0000000AFC000-000000067F000040020000E0000000B00000__000000914E3F38F0 000000067F000040020000E0000000AFC000-000000067F000040020000E0000000B00000__000000931B9A2710 000000067F000040020000E0000000AFE88E-000000067F000040020000E0000000B07269__0000004E807BE039-0000004F2029EFA9 
000000067F000040020000E0000000B00000-000000067F000040020000E0000000B04000__000000572A7A05D8 000000067F000040020000E0000000B00000-000000067F000040020000E0000000B04000__0000005D2FFFFB38 000000067F000040020000E0000000B00000-000000067F000040020000E0000000B04000__00000073AD3FE6B8 000000067F000040020000E0000000B00000-000000067F000040020000E0000000B04000__000000914E3F38F0 000000067F000040020000E0000000B00000-000000067F000040020000E0000000B04000__000000931B9A2710 000000067F000040020000E0000000B04000-000000067F000040020000E0000000B08000__0000004FEAB6F890 000000067F000040020000E0000000B04000-000000067F000040020000E0000000B08000__0000005D2FFFFB38 000000067F000040020000E0000000B04000-000000067F000040020000E0000000B08000__00000073AD3FE6B8 000000067F000040020000E0000000B04000-000000067F000040020000E0000000B08000__000000914E3F38F0 000000067F000040020000E0000000B04000-000000067F000040020000E0000000B08000__000000931B9A2710 000000067F000040020000E0000000B07269-000000067F000040020000E0000100000000__0000004E807BE039-0000004F2029EFA9 000000067F000040020000E0000000B07515-000000067F000040020000E0000000B0FEE8__0000004F2029EFA9-0000004FBFD9F391 000000067F000040020000E0000000B08000-000000067F000040020000E0000000B0C000__0000004FEAB6F890 000000067F000040020000E0000000B08000-000000067F000040020000E0000000B0C000__0000005D2FFFFB38 000000067F000040020000E0000000B08000-000000067F000040020000E0000000B0C000__00000073AD3FE6B8 000000067F000040020000E0000000B08000-000000067F000040020000E0000000B0C000__000000914E3F38F0 000000067F000040020000E0000000B08000-000000067F000040020000E0000000B0C000__000000931B9A2710 000000067F000040020000E0000000B0C000-000000067F000040020000E0000000B10000__0000004FEAB6F890 000000067F000040020000E0000000B0C000-000000067F000040020000E0000000B10000__0000005D2FFFFB38 000000067F000040020000E0000000B0C000-000000067F000040020000E0000000B10000__00000073AD3FE6B8 000000067F000040020000E0000000B0C000-000000067F000040020000E0000000B10000__000000914E3F38F0 
000000067F000040020000E0000000B0C000-000000067F000040020000E0000000B10000__000000931B9A2710 000000067F000040020000E0000000B0FEE8-000000067F000040020000E0000000B188C0__0000004F2029EFA9-0000004FBFD9F391 000000067F000040020000E0000000B10000-000000067F000040020000E0000000B14000__0000004FEAB6F890 000000067F000040020000E0000000B10000-000000067F000040020000E0000000B14000__0000005D2FFFFB38 000000067F000040020000E0000000B10000-000000067F000040020000E0000000B14000__00000073AD3FE6B8 000000067F000040020000E0000000B10000-000000067F000040020000E0000000B14000__000000914E3F38F0 000000067F000040020000E0000000B10000-000000067F000040020000E0000000B14000__000000931B9A2710 000000067F000040020000E0000000B14000-000000067F000040020000E0000000B18000__0000004FEAB6F890 000000067F000040020000E0000000B14000-000000067F000040020000E0000000B18000__0000005D2FFFFB38 000000067F000040020000E0000000B14000-000000067F000040020000E0000000B18000__00000073AD3FE6B8 000000067F000040020000E0000000B14000-000000067F000040020000E0000000B18000__000000914E3F38F0 000000067F000040020000E0000000B14000-000000067F000040020000E0000000B18000__000000931B9A2710 000000067F000040020000E0000000B18000-000000067F000040020000E0000000B1C000__0000004FEAB6F890 000000067F000040020000E0000000B18000-000000067F000040020000E0000000B1C000__0000005D2FFFFB38 000000067F000040020000E0000000B18000-000000067F000040020000E0000000B1C000__00000073AD3FE6B8 000000067F000040020000E0000000B18000-000000067F000040020000E0000000B1C000__000000914E3F38F0 000000067F000040020000E0000000B18000-000000067F000040020000E0000000B1C000__000000931B9A2710 000000067F000040020000E0000000B188C0-000000067F000040020000E0000000B212A1__0000004F2029EFA9-0000004FBFD9F391 000000067F000040020000E0000000B1C000-000000067F000040020000E0000000B20000__0000004FEAB6F890 000000067F000040020000E0000000B1C000-000000067F000040020000E0000000B20000__0000005D2FFFFB38 000000067F000040020000E0000000B1C000-000000067F000040020000E0000000B20000__00000073AD3FE6B8 
000000067F000040020000E0000000B1C000-000000067F000040020000E0000000B20000__000000914E3F38F0 000000067F000040020000E0000000B1C000-000000067F000040020000E0000000B20000__000000931B9A2710 000000067F000040020000E0000000B20000-000000067F000040020000E0000000B24000__0000004FEAB6F890 000000067F000040020000E0000000B20000-000000067F000040020000E0000000B24000__0000005D2FFFFB38 000000067F000040020000E0000000B20000-000000067F000040020000E0000000B24000__00000073AD3FE6B8 000000067F000040020000E0000000B20000-000000067F000040020000E0000000B24000__000000914E3F38F0 000000067F000040020000E0000000B20000-000000067F000040020000E0000000B24000__000000931B9A2710 000000067F000040020000E0000000B212A1-000000067F000040020000E0000000B29C85__0000004F2029EFA9-0000004FBFD9F391 000000067F000040020000E0000000B24000-000000067F000040020000E0000000B28000__0000004FEAB6F890 000000067F000040020000E0000000B24000-000000067F000040020000E0000000B28000__0000005D2FFFFB38 000000067F000040020000E0000000B24000-000000067F000040020000E0000000B28000__00000073AD3FE6B8 000000067F000040020000E0000000B24000-000000067F000040020000E0000000B28000__000000914E3F38F0 000000067F000040020000E0000000B24000-000000067F000040020000E0000000B28000__000000931B9A2710 000000067F000040020000E0000000B28000-000000067F000040020000E0000000B2C000__0000004FEAB6F890 000000067F000040020000E0000000B28000-000000067F000040020000E0000000B2C000__0000005D2FFFFB38 000000067F000040020000E0000000B28000-000000067F000040020000E0000000B2C000__00000073AD3FE6B8 000000067F000040020000E0000000B28000-000000067F000040020000E0000000B2C000__000000914E3F38F0 000000067F000040020000E0000000B28000-000000067F000040020000E0000000B2C000__000000931B9A2710 000000067F000040020000E0000000B29C85-000000067F000040020000E0000000B3265D__0000004F2029EFA9-0000004FBFD9F391 000000067F000040020000E0000000B2C000-000000067F000040020000E0000000B30000__0000004FEAB6F890 000000067F000040020000E0000000B2C000-000000067F000040020000E0000000B30000__0000005D2FFFFB38 
000000067F000040020000E0000000B2C000-000000067F000040020000E0000000B30000__00000073AD3FE6B8 000000067F000040020000E0000000B2C000-000000067F000040020000E0000000B30000__000000914E3F38F0 000000067F000040020000E0000000B2C000-000000067F000040020000E0000000B30000__000000931B9A2710 000000067F000040020000E0000000B30000-000000067F000040020000E0000000B34000__0000004FEAB6F890 000000067F000040020000E0000000B30000-000000067F000040020000E0000000B34000__0000005D2FFFFB38 000000067F000040020000E0000000B30000-000000067F000040020000E0000000B34000__00000073AD3FE6B8 000000067F000040020000E0000000B30000-000000067F000040020000E0000000B34000__000000914E3F38F0 000000067F000040020000E0000000B30000-000000067F000040020000E0000000B34000__000000931B9A2710 000000067F000040020000E0000000B3265D-000000067F000040020000E0000000B3B036__0000004F2029EFA9-0000004FBFD9F391 000000067F000040020000E0000000B34000-000000067F000040020000E0000000B38000__0000004FEAB6F890 000000067F000040020000E0000000B34000-000000067F000040020000E0000000B38000__0000005D2FFFFB38 000000067F000040020000E0000000B34000-000000067F000040020000E0000000B38000__00000073AD3FE6B8 000000067F000040020000E0000000B34000-000000067F000040020000E0000000B38000__000000914E3F38F0 000000067F000040020000E0000000B34000-000000067F000040020000E0000000B38000__000000931B9A2710 000000067F000040020000E0000000B38000-000000067F000040020000E0000000B3C000__0000004FEAB6F890 000000067F000040020000E0000000B38000-000000067F000040020000E0000000B3C000__0000005D2FFFFB38 000000067F000040020000E0000000B38000-000000067F000040020000E0000000B3C000__00000073AD3FE6B8 000000067F000040020000E0000000B38000-000000067F000040020000E0000000B3C000__000000914E3F38F0 000000067F000040020000E0000000B38000-000000067F000040020000E0000000B3C000__000000931B9A2710 000000067F000040020000E0000000B3B036-000000067F000040020000E0000000B43A10__0000004F2029EFA9-0000004FBFD9F391 000000067F000040020000E0000000B3C000-000000067F000040020000E0000000B40000__0000004FEAB6F890 
000000067F000040020000E0000000B3C000-000000067F000040020000E0000000B40000__0000005D2FFFFB38 000000067F000040020000E0000000B3C000-000000067F000040020000E0000000B40000__00000073AD3FE6B8 000000067F000040020000E0000000B3C000-000000067F000040020000E0000000B40000__000000914E3F38F0 000000067F000040020000E0000000B3C000-000000067F000040020000E0000000B40000__000000931B9A2710 000000067F000040020000E0000000B40000-000000067F000040020000E0000000B44000__0000004FEAB6F890 000000067F000040020000E0000000B40000-000000067F000040020000E0000000B44000__0000005D2FFFFB38 000000067F000040020000E0000000B40000-000000067F000040020000E0000000B44000__00000073AD3FE6B8 000000067F000040020000E0000000B40000-000000067F000040020000E0000000B44000__000000914E3F38F0 000000067F000040020000E0000000B40000-000000067F000040020000E0000000B44000__000000931B9A2710 000000067F000040020000E0000000B43A10-000000067F000040020000E0000000B4C3DD__0000004F2029EFA9-0000004FBFD9F391 000000067F000040020000E0000000B44000-000000067F000040020000E0000000B48000__0000004FEAB6F890 000000067F000040020000E0000000B44000-000000067F000040020000E0000000B48000__0000005D2FFFFB38 000000067F000040020000E0000000B44000-000000067F000040020000E0000000B48000__00000073AD3FE6B8 000000067F000040020000E0000000B44000-000000067F000040020000E0000000B48000__000000914E3F38F0 000000067F000040020000E0000000B44000-000000067F000040020000E0000000B48000__000000931B9A2710 000000067F000040020000E0000000B48000-000000067F000040020000E0000000B4C000__0000004FEAB6F890 000000067F000040020000E0000000B48000-000000067F000040020000E0000000B4C000__0000005D2FFFFB38 000000067F000040020000E0000000B48000-000000067F000040020000E0000000B4C000__00000073AD3FE6B8 000000067F000040020000E0000000B48000-000000067F000040020000E0000000B4C000__000000914E3F38F0 000000067F000040020000E0000000B48000-000000067F000040020000E0000000B4C000__000000931B9A2710 000000067F000040020000E0000000B4C000-000000067F000040020000E0000000B50000__0000004FEAB6F890 
000000067F000040020000E0000000B4C000-000000067F000040020000E0000000B50000__0000005D2FFFFB38 000000067F000040020000E0000000B4C000-000000067F000040020000E0000000B50000__00000073AD3FE6B8 000000067F000040020000E0000000B4C000-000000067F000040020000E0000000B50000__000000914E3F38F0 000000067F000040020000E0000000B4C000-000000067F000040020000E0000000B50000__000000931B9A2710 000000067F000040020000E0000000B4C3DD-000000067F000040020000E0000000B54DC7__0000004F2029EFA9-0000004FBFD9F391 000000067F000040020000E0000000B50000-000000067F000040020000E0000000B54000__0000004FEAB6F890 000000067F000040020000E0000000B50000-000000067F000040020000E0000000B54000__0000005D2FFFFB38 000000067F000040020000E0000000B50000-000000067F000040020000E0000000B54000__00000073AD3FE6B8 000000067F000040020000E0000000B50000-000000067F000040020000E0000000B54000__000000914E3F38F0 000000067F000040020000E0000000B50000-000000067F000040020000E0000000B54000__000000931B9A2710 000000067F000040020000E0000000B54000-000000067F000040020000E0000000B58000__0000004FEAB6F890 000000067F000040020000E0000000B54000-000000067F000040020000E0000000B58000__0000005D2FFFFB38 000000067F000040020000E0000000B54000-000000067F000040020000E0000000B58000__00000073AD3FE6B8 000000067F000040020000E0000000B54000-000000067F000040020000E0000000B58000__000000914E3F38F0 000000067F000040020000E0000000B54000-000000067F000040020000E0000000B58000__000000931B9A2710 000000067F000040020000E0000000B54DC7-000000067F000040020000E0000000B5D7A4__0000004F2029EFA9-0000004FBFD9F391 000000067F000040020000E0000000B58000-000000067F000040020000E0000000B5C000__0000004FEAB6F890 000000067F000040020000E0000000B58000-000000067F000040020000E0000000B5C000__0000005D2FFFFB38 000000067F000040020000E0000000B58000-000000067F000040020000E0000000B5C000__00000073AD3FE6B8 000000067F000040020000E0000000B58000-000000067F000040020000E0000000B5C000__000000914E3F38F0 000000067F000040020000E0000000B58000-000000067F000040020000E0000000B5C000__000000931B9A2710 
000000067F000040020000E0000000B5C000-000000067F000040020000E0000000B60000__0000004FEAB6F890 000000067F000040020000E0000000B5C000-000000067F000040020000E0000000B60000__000000574B7FF240 000000067F000040020000E0000000B5C000-000000067F000040020000E0000000B60000__00000073AD3FE6B8 000000067F000040020000E0000000B5C000-000000067F000040020000E0000000B60000__000000914E3F38F0 000000067F000040020000E0000000B5C000-000000067F000040020000E0000000B60000__000000931B9A2710 000000067F000040020000E0000000B5D7A4-000000067F000040020000E0000100000000__0000004F2029EFA9-0000004FBFD9F391 000000067F000040020000E0000000B5DA84-000000067F000040020000E0000000B66465__0000004FBFD9F391-000000505F89E839 000000067F000040020000E0000000B60000-000000067F000040020000E0000000B64000__0000004FEAB6F890 000000067F000040020000E0000000B60000-000000067F000040020000E0000000B64000__000000574B7FF240 000000067F000040020000E0000000B60000-000000067F000040020000E0000000B64000__00000073AD3FE6B8 000000067F000040020000E0000000B60000-000000067F000040020000E0000000B64000__000000914E3F38F0 000000067F000040020000E0000000B60000-000000067F000040020000E0000000B64000__000000931B9A2710 000000067F000040020000E0000000B64000-000000067F000040020000E0000000B68000__0000004FEAB6F890 000000067F000040020000E0000000B64000-000000067F000040020000E0000000B68000__000000574B7FF240 000000067F000040020000E0000000B64000-000000067F000040020000E0000000B68000__00000073AD3FE6B8 000000067F000040020000E0000000B64000-000000067F000040020000E0000000B68000__000000914E3F38F0 000000067F000040020000E0000000B64000-000000067F000040020000E0000000B68000__000000931B9A2710 000000067F000040020000E0000000B66465-000000067F000040020000E0000000B6EE49__0000004FBFD9F391-000000505F89E839 000000067F000040020000E0000000B68000-000000067F000040020000E0000000B6C000__0000004FEAB6F890 000000067F000040020000E0000000B68000-000000067F000040020000E0000000B6C000__000000574B7FF240 000000067F000040020000E0000000B68000-000000067F000040020000E0000000B6C000__00000073AD3FE6B8 
000000067F000040020000E0000000B68000-000000067F000040020000E0000000B6C000__000000914E3F38F0 000000067F000040020000E0000000B68000-000000067F000040020000E0000000B6C000__000000931B9A2710 000000067F000040020000E0000000B6C000-000000067F000040020000E0000000B70000__0000004FEAB6F890 000000067F000040020000E0000000B6C000-000000067F000040020000E0000000B70000__000000574B7FF240 000000067F000040020000E0000000B6C000-000000067F000040020000E0000000B70000__00000073AD3FE6B8 000000067F000040020000E0000000B6C000-000000067F000040020000E0000000B70000__000000914E3F38F0 000000067F000040020000E0000000B6C000-000000067F000040020000E0000000B70000__000000931B9A2710 000000067F000040020000E0000000B6EE49-000000067F000040020000E0000000B77817__0000004FBFD9F391-000000505F89E839 000000067F000040020000E0000000B70000-000000067F000040020000E0000000B74000__0000004FEAB6F890 000000067F000040020000E0000000B70000-000000067F000040020000E0000000B74000__000000574B7FF240 000000067F000040020000E0000000B70000-000000067F000040020000E0000000B74000__00000073AD3FE6B8 000000067F000040020000E0000000B70000-000000067F000040020000E0000000B74000__000000914E3F38F0 000000067F000040020000E0000000B70000-000000067F000040020000E0000000B74000__000000931B9A2710 000000067F000040020000E0000000B74000-000000067F000040020000E0000000B78000__000000574B7FF240 000000067F000040020000E0000000B74000-000000067F000040020000E0000000B78000__00000073AD3FE6B8 000000067F000040020000E0000000B74000-000000067F000040020000E0000000B78000__000000914E3F38F0 000000067F000040020000E0000000B74000-000000067F000040020000E0000000B78000__000000931B9A2710 000000067F000040020000E0000000B74000-030000000000000000000000000000000002__0000004FEAB6F890 000000067F000040020000E0000000B77817-000000067F000040020000E0000000B801EA__0000004FBFD9F391-000000505F89E839 000000067F000040020000E0000000B78000-000000067F000040020000E0000000B7C000__000000574B7FF240 000000067F000040020000E0000000B78000-000000067F000040020000E0000000B7C000__00000073AD3FE6B8 
000000067F000040020000E0000000B78000-000000067F000040020000E0000000B7C000__000000914E3F38F0 000000067F000040020000E0000000B78000-000000067F000040020000E0000000B7C000__000000931B9A2710 000000067F000040020000E0000000B7C000-000000067F000040020000E0000000B80000__000000574B7FF240 000000067F000040020000E0000000B7C000-000000067F000040020000E0000000B80000__00000073AD3FE6B8 000000067F000040020000E0000000B7C000-000000067F000040020000E0000000B80000__000000914E3F38F0 000000067F000040020000E0000000B7C000-000000067F000040020000E0000000B80000__000000931B9A2710 000000067F000040020000E0000000B80000-000000067F000040020000E0000000B84000__000000574B7FF240 000000067F000040020000E0000000B80000-000000067F000040020000E0000000B84000__00000073AD3FE6B8 000000067F000040020000E0000000B80000-000000067F000040020000E0000000B84000__000000914E3F38F0 000000067F000040020000E0000000B80000-000000067F000040020000E0000000B84000__000000931B9A2710 000000067F000040020000E0000000B801EA-000000067F000040020000E0000000B88BCE__0000004FBFD9F391-000000505F89E839 000000067F000040020000E0000000B84000-000000067F000040020000E0000000B88000__000000574B7FF240 000000067F000040020000E0000000B84000-000000067F000040020000E0000000B88000__00000073AD3FE6B8 000000067F000040020000E0000000B84000-000000067F000040020000E0000000B88000__000000914E3F38F0 000000067F000040020000E0000000B84000-000000067F000040020000E0000000B88000__000000931B9A2710 000000067F000040020000E0000000B88000-000000067F000040020000E0000000B8C000__000000574B7FF240 000000067F000040020000E0000000B88000-000000067F000040020000E0000000B8C000__00000073AD3FE6B8 000000067F000040020000E0000000B88000-000000067F000040020000E0000000B8C000__000000914E3F38F0 000000067F000040020000E0000000B88000-000000067F000040020000E0000000B8C000__000000931B9A2710 000000067F000040020000E0000000B88BCE-000000067F000040020000E0000000B915B5__0000004FBFD9F391-000000505F89E839 000000067F000040020000E0000000B8C000-000000067F000040020000E0000000B90000__000000574B7FF240 
000000067F000040020000E0000000B8C000-000000067F000040020000E0000000B90000__00000073AD3FE6B8 000000067F000040020000E0000000B8C000-000000067F000040020000E0000000B90000__000000914E3F38F0 000000067F000040020000E0000000B8C000-000000067F000040020000E0000000B90000__000000931B9A2710 000000067F000040020000E0000000B90000-000000067F000040020000E0000000B94000__000000574B7FF240 000000067F000040020000E0000000B90000-000000067F000040020000E0000000B94000__00000073AD3FE6B8 000000067F000040020000E0000000B90000-000000067F000040020000E0000000B94000__000000914E3F38F0 000000067F000040020000E0000000B90000-000000067F000040020000E0000000B94000__000000931B9A2710 000000067F000040020000E0000000B915B5-000000067F000040020000E0000000B99F95__0000004FBFD9F391-000000505F89E839 000000067F000040020000E0000000B94000-000000067F000040020000E0000000B98000__000000574B7FF240 000000067F000040020000E0000000B94000-000000067F000040020000E0000000B98000__00000073AD3FE6B8 000000067F000040020000E0000000B94000-000000067F000040020000E0000000B98000__000000914E3F38F0 000000067F000040020000E0000000B94000-000000067F000040020000E0000000B98000__000000931B9A2710 000000067F000040020000E0000000B98000-000000067F000040020000E0000000B9C000__000000574B7FF240 000000067F000040020000E0000000B98000-000000067F000040020000E0000000B9C000__00000073AD3FE6B8 000000067F000040020000E0000000B98000-000000067F000040020000E0000000B9C000__000000914E3F38F0 000000067F000040020000E0000000B98000-000000067F000040020000E0000000B9C000__000000931B9A2710 000000067F000040020000E0000000B99F95-000000067F000040020000E0000000BA2971__0000004FBFD9F391-000000505F89E839 000000067F000040020000E0000000B9C000-000000067F000040020000E0000000BA0000__000000574B7FF240 000000067F000040020000E0000000B9C000-000000067F000040020000E0000000BA0000__00000073AD3FE6B8 000000067F000040020000E0000000B9C000-000000067F000040020000E0000000BA0000__000000914E3F38F0 000000067F000040020000E0000000B9C000-000000067F000040020000E0000000BA0000__000000931B9A2710 
000000067F000040020000E0000000BA0000-000000067F000040020000E0000000BA4000__000000574B7FF240 000000067F000040020000E0000000BA0000-000000067F000040020000E0000000BA4000__00000073AD3FE6B8 000000067F000040020000E0000000BA0000-000000067F000040020000E0000000BA4000__000000914E3F38F0 000000067F000040020000E0000000BA0000-000000067F000040020000E0000000BA4000__000000931B9A2710 000000067F000040020000E0000000BA2971-000000067F000040020000E0000000BAB356__0000004FBFD9F391-000000505F89E839 000000067F000040020000E0000000BA4000-000000067F000040020000E0000000BA8000__000000574B7FF240 000000067F000040020000E0000000BA4000-000000067F000040020000E0000000BA8000__00000073AD3FE6B8 000000067F000040020000E0000000BA4000-000000067F000040020000E0000000BA8000__000000914E3F38F0 000000067F000040020000E0000000BA4000-000000067F000040020000E0000000BA8000__000000931B9A2710 000000067F000040020000E0000000BA8000-000000067F000040020000E0000000BAC000__000000574B7FF240 000000067F000040020000E0000000BA8000-000000067F000040020000E0000000BAC000__00000073AD3FE6B8 000000067F000040020000E0000000BA8000-000000067F000040020000E0000000BAC000__000000914E3F38F0 000000067F000040020000E0000000BA8000-000000067F000040020000E0000000BAC000__000000931B9A2710 000000067F000040020000E0000000BAB356-000000067F000040020000E0000000BB3D2B__0000004FBFD9F391-000000505F89E839 000000067F000040020000E0000000BAC000-000000067F000040020000E0000000BB0000__000000574B7FF240 000000067F000040020000E0000000BAC000-000000067F000040020000E0000000BB0000__00000073AD3FE6B8 000000067F000040020000E0000000BAC000-000000067F000040020000E0000000BB0000__000000914E3F38F0 000000067F000040020000E0000000BAC000-000000067F000040020000E0000000BB0000__000000931B9A2710 000000067F000040020000E0000000BB0000-000000067F000040020000E0000000BB4000__000000574B7FF240 000000067F000040020000E0000000BB0000-000000067F000040020000E0000000BB4000__00000073AD3FE6B8 000000067F000040020000E0000000BB0000-000000067F000040020000E0000000BB4000__000000914E3F38F0 
000000067F000040020000E0000000BB0000-000000067F000040020000E0000000BB4000__000000931B9A2710 000000067F000040020000E0000000BB3D2B-000000067F000040020000E0000100000000__0000004FBFD9F391-000000505F89E839 000000067F000040020000E0000000BB4000-000000067F000040020000E0000000BB8000__000000572A7A05D8 000000067F000040020000E0000000BB4000-000000067F000040020000E0000000BB8000__0000005D2FFFFB38 000000067F000040020000E0000000BB4000-000000067F000040020000E0000000BB8000__00000073AD3FE6B8 000000067F000040020000E0000000BB4000-000000067F000040020000E0000000BB8000__000000914E3F38F0 000000067F000040020000E0000000BB4000-000000067F000040020000E0000000BB8000__000000931B9A2710 000000067F000040020000E0000000BB4003-000000067F000040020000E0000000BBC9D3__000000505F89E839-000000510F31FEA9 000000067F000040020000E0000000BB8000-000000067F000040020000E0000000BBC000__000000572A7A05D8 000000067F000040020000E0000000BB8000-000000067F000040020000E0000000BBC000__0000005D2FFFFB38 000000067F000040020000E0000000BB8000-000000067F000040020000E0000000BBC000__00000073AD3FE6B8 000000067F000040020000E0000000BB8000-000000067F000040020000E0000000BBC000__000000914E3F38F0 000000067F000040020000E0000000BB8000-000000067F000040020000E0000000BBC000__000000931B9A2710 000000067F000040020000E0000000BBC000-000000067F000040020000E0000000BC0000__000000572A7A05D8 000000067F000040020000E0000000BBC000-000000067F000040020000E0000000BC0000__0000005D2FFFFB38 000000067F000040020000E0000000BBC000-000000067F000040020000E0000000BC0000__00000073AD3FE6B8 000000067F000040020000E0000000BBC000-000000067F000040020000E0000000BC0000__000000914E3F38F0 000000067F000040020000E0000000BBC000-000000067F000040020000E0000000BC0000__000000931B9A2710 000000067F000040020000E0000000BBC9D3-000000067F000040020000E0000000BC53AC__000000505F89E839-000000510F31FEA9 000000067F000040020000E0000000BC0000-000000067F000040020000E0000000BC4000__000000572A7A05D8 000000067F000040020000E0000000BC0000-000000067F000040020000E0000000BC4000__0000005D2FFFFB38 
000000067F000040020000E0000000BC0000-000000067F000040020000E0000000BC4000__00000073AD3FE6B8 000000067F000040020000E0000000BC0000-000000067F000040020000E0000000BC4000__000000914E3F38F0 000000067F000040020000E0000000BC0000-000000067F000040020000E0000000BC4000__000000931B9A2710 000000067F000040020000E0000000BC4000-000000067F000040020000E0000000BC8000__000000572A7A05D8 000000067F000040020000E0000000BC4000-000000067F000040020000E0000000BC8000__0000005D2FFFFB38 000000067F000040020000E0000000BC4000-000000067F000040020000E0000000BC8000__00000073AD3FE6B8 000000067F000040020000E0000000BC4000-000000067F000040020000E0000000BC8000__000000914E3F38F0 000000067F000040020000E0000000BC4000-000000067F000040020000E0000000BC8000__000000931B9A2710 000000067F000040020000E0000000BC53AC-000000067F000040020000E0000000BCDD9B__000000505F89E839-000000510F31FEA9 000000067F000040020000E0000000BC8000-000000067F000040020000E0000000BCC000__000000572A7A05D8 000000067F000040020000E0000000BC8000-000000067F000040020000E0000000BCC000__0000005D2FFFFB38 000000067F000040020000E0000000BC8000-000000067F000040020000E0000000BCC000__00000073AD3FE6B8 000000067F000040020000E0000000BC8000-000000067F000040020000E0000000BCC000__000000914E3F38F0 000000067F000040020000E0000000BC8000-000000067F000040020000E0000000BCC000__000000931B9A2710 000000067F000040020000E0000000BCC000-000000067F000040020000E0000000BD0000__000000572A7A05D8 000000067F000040020000E0000000BCC000-000000067F000040020000E0000000BD0000__0000005D2FFFFB38 000000067F000040020000E0000000BCC000-000000067F000040020000E0000000BD0000__00000073AD3FE6B8 000000067F000040020000E0000000BCC000-000000067F000040020000E0000000BD0000__000000914E3F38F0 000000067F000040020000E0000000BCC000-000000067F000040020000E0000000BD0000__000000931B9A2710 000000067F000040020000E0000000BCDD9B-000000067F000040020000E0000000BD6777__000000505F89E839-000000510F31FEA9 000000067F000040020000E0000000BD0000-000000067F000040020000E0000000BD4000__000000572A7A05D8 
000000067F000040020000E0000000BD0000-000000067F000040020000E0000000BD4000__0000005D2FFFFB38 000000067F000040020000E0000000BD0000-000000067F000040020000E0000000BD4000__00000073AD3FE6B8 000000067F000040020000E0000000BD0000-000000067F000040020000E0000000BD4000__000000914E3F38F0 000000067F000040020000E0000000BD0000-000000067F000040020000E0000000BD4000__000000931B9A2710 000000067F000040020000E0000000BD4000-000000067F000040020000E0000000BD8000__000000572A7A05D8 000000067F000040020000E0000000BD4000-000000067F000040020000E0000000BD8000__0000005D2FFFFB38 000000067F000040020000E0000000BD4000-000000067F000040020000E0000000BD8000__00000073AD3FE6B8 000000067F000040020000E0000000BD4000-000000067F000040020000E0000000BD8000__000000914E3F38F0 000000067F000040020000E0000000BD4000-000000067F000040020000E0000000BD8000__000000931B9A2710 000000067F000040020000E0000000BD6777-000000067F000040020000E0000000BDF149__000000505F89E839-000000510F31FEA9 000000067F000040020000E0000000BD8000-000000067F000040020000E0000000BDC000__000000572A7A05D8 000000067F000040020000E0000000BD8000-000000067F000040020000E0000000BDC000__0000005D2FFFFB38 000000067F000040020000E0000000BD8000-000000067F000040020000E0000000BDC000__00000073AD3FE6B8 000000067F000040020000E0000000BD8000-000000067F000040020000E0000000BDC000__000000914E3F38F0 000000067F000040020000E0000000BD8000-000000067F000040020000E0000000BDC000__000000931B9A2710 000000067F000040020000E0000000BDC000-000000067F000040020000E0000000BE0000__000000572A7A05D8 000000067F000040020000E0000000BDC000-000000067F000040020000E0000000BE0000__0000005D2FFFFB38 000000067F000040020000E0000000BDC000-000000067F000040020000E0000000BE0000__00000073AD3FE6B8 000000067F000040020000E0000000BDC000-000000067F000040020000E0000000BE0000__000000914E3F38F0 000000067F000040020000E0000000BDC000-000000067F000040020000E0000000BE0000__000000931B9A2710 000000067F000040020000E0000000BDF149-000000067F000040020000E0000000BE7B28__000000505F89E839-000000510F31FEA9 
000000067F000040020000E0000000BE0000-000000067F000040020000E0000000BE4000__000000572A7A05D8 000000067F000040020000E0000000BE0000-000000067F000040020000E0000000BE4000__0000005D2FFFFB38 000000067F000040020000E0000000BE0000-000000067F000040020000E0000000BE4000__00000073AD3FE6B8 000000067F000040020000E0000000BE0000-000000067F000040020000E0000000BE4000__000000914E3F38F0 000000067F000040020000E0000000BE0000-000000067F000040020000E0000000BE4000__000000931B9A2710 000000067F000040020000E0000000BE4000-000000067F000040020000E0000000BE8000__000000572A7A05D8 000000067F000040020000E0000000BE4000-000000067F000040020000E0000000BE8000__0000005D2FFFFB38 000000067F000040020000E0000000BE4000-000000067F000040020000E0000000BE8000__00000073AD3FE6B8 000000067F000040020000E0000000BE4000-000000067F000040020000E0000000BE8000__000000914E3F38F0 000000067F000040020000E0000000BE4000-000000067F000040020000E0000000BE8000__000000931B9A2710 000000067F000040020000E0000000BE7B28-000000067F000040020000E0000000BF04FD__000000505F89E839-000000510F31FEA9 000000067F000040020000E0000000BE8000-000000067F000040020000E0000000BEC000__000000572A7A05D8 000000067F000040020000E0000000BE8000-000000067F000040020000E0000000BEC000__0000005D2FFFFB38 000000067F000040020000E0000000BE8000-000000067F000040020000E0000000BEC000__00000073AD3FE6B8 000000067F000040020000E0000000BE8000-000000067F000040020000E0000000BEC000__000000914E3F38F0 000000067F000040020000E0000000BE8000-000000067F000040020000E0000000BEC000__000000931B9A2710 000000067F000040020000E0000000BEC000-000000067F000040020000E0000000BF0000__000000572A7A05D8 000000067F000040020000E0000000BEC000-000000067F000040020000E0000000BF0000__0000005D2FFFFB38 000000067F000040020000E0000000BEC000-000000067F000040020000E0000000BF0000__00000073AD3FE6B8 000000067F000040020000E0000000BEC000-000000067F000040020000E0000000BF0000__000000914E3F38F0 000000067F000040020000E0000000BEC000-000000067F000040020000E0000000BF0000__000000931B9A2710 
000000067F000040020000E0000000BF0000-000000067F000040020000E0000000BF4000__000000572A7A05D8 000000067F000040020000E0000000BF0000-000000067F000040020000E0000000BF4000__0000005D2FFFFB38 000000067F000040020000E0000000BF0000-000000067F000040020000E0000000BF4000__00000073AD3FE6B8 000000067F000040020000E0000000BF0000-000000067F000040020000E0000000BF4000__000000914E3F38F0 000000067F000040020000E0000000BF0000-000000067F000040020000E0000000BF4000__000000931B9A2710 000000067F000040020000E0000000BF04FD-000000067F000040020000E0000000BF8ED4__000000505F89E839-000000510F31FEA9 000000067F000040020000E0000000BF4000-000000067F000040020000E0000000BF8000__000000572A7A05D8 000000067F000040020000E0000000BF4000-000000067F000040020000E0000000BF8000__0000005D2FFFFB38 000000067F000040020000E0000000BF4000-000000067F000040020000E0000000BF8000__00000073AD3FE6B8 000000067F000040020000E0000000BF4000-000000067F000040020000E0000000BF8000__000000914E3F38F0 000000067F000040020000E0000000BF4000-000000067F000040020000E0000000BF8000__000000931B9A2710 000000067F000040020000E0000000BF8000-000000067F000040020000E0000000BFC000__000000572A7A05D8 000000067F000040020000E0000000BF8000-000000067F000040020000E0000000BFC000__0000005D2FFFFB38 000000067F000040020000E0000000BF8000-000000067F000040020000E0000000BFC000__00000073AD3FE6B8 000000067F000040020000E0000000BF8000-000000067F000040020000E0000000BFC000__000000914E3F38F0 000000067F000040020000E0000000BF8000-000000067F000040020000E0000000BFC000__000000931B9A2710 000000067F000040020000E0000000BF8ED4-000000067F000040020000E0000000C018B5__000000505F89E839-000000510F31FEA9 000000067F000040020000E0000000BFC000-000000067F000040020000E0000000C00000__000000572A7A05D8 000000067F000040020000E0000000BFC000-000000067F000040020000E0000000C00000__0000005D2FFFFB38 000000067F000040020000E0000000BFC000-000000067F000040020000E0000000C00000__00000073AD3FE6B8 000000067F000040020000E0000000BFC000-000000067F000040020000E0000000C00000__000000914E3F38F0 
000000067F000040020000E0000000BFC000-000000067F000040020000E0000000C00000__000000931B9A2710 000000067F000040020000E0000000C00000-000000067F000040020000E0000000C04000__000000572A7A05D8 000000067F000040020000E0000000C00000-000000067F000040020000E0000000C04000__0000005D2FFFFB38 000000067F000040020000E0000000C00000-000000067F000040020000E0000000C04000__00000073AD3FE6B8 000000067F000040020000E0000000C00000-000000067F000040020000E0000000C04000__000000914E3F38F0 000000067F000040020000E0000000C00000-000000067F000040020000E0000000C04000__000000931B9A2710 000000067F000040020000E0000000C018B5-000000067F000040020000E0000000C0A2A7__000000505F89E839-000000510F31FEA9 000000067F000040020000E0000000C04000-000000067F000040020000E0000000C08000__000000572A7A05D8 000000067F000040020000E0000000C04000-000000067F000040020000E0000000C08000__0000005D2FFFFB38 000000067F000040020000E0000000C04000-000000067F000040020000E0000000C08000__00000073AD3FE6B8 000000067F000040020000E0000000C04000-000000067F000040020000E0000000C08000__000000914E3F38F0 000000067F000040020000E0000000C04000-000000067F000040020000E0000000C08000__000000931B9A2710 000000067F000040020000E0000000C08000-000000067F000040020000E0000000C0C000__000000572A7A05D8 000000067F000040020000E0000000C08000-000000067F000040020000E0000000C0C000__0000005D2FFFFB38 000000067F000040020000E0000000C08000-000000067F000040020000E0000000C0C000__00000073AD3FE6B8 000000067F000040020000E0000000C08000-000000067F000040020000E0000000C0C000__000000914E3F38F0 000000067F000040020000E0000000C08000-000000067F000040020000E0000000C0C000__000000931B9A2710 000000067F000040020000E0000000C0A2A7-000000067F000040020000E0000000C12C83__000000505F89E839-000000510F31FEA9 000000067F000040020000E0000000C0C000-000000067F000040020000E0000000C10000__000000572A7A05D8 000000067F000040020000E0000000C0C000-000000067F000040020000E0000000C10000__0000005D2FFFFB38 000000067F000040020000E0000000C0C000-000000067F000040020000E0000000C10000__00000073AD3FE6B8 
000000067F000040020000E0000000C0C000-000000067F000040020000E0000000C10000__000000914E3F38F0 000000067F000040020000E0000000C0C000-000000067F000040020000E0000000C10000__000000931B9A2710 000000067F000040020000E0000000C10000-000000067F000040020000E0000000C14000__00000051EEFFE900 000000067F000040020000E0000000C10000-000000067F000040020000E0000000C14000__0000005D2FFFFB38 000000067F000040020000E0000000C10000-000000067F000040020000E0000000C14000__00000073AD3FE6B8 000000067F000040020000E0000000C10000-000000067F000040020000E0000000C14000__000000914E3F38F0 000000067F000040020000E0000000C10000-000000067F000040020000E0000000C14000__000000931B9A2710 000000067F000040020000E0000000C12C83-000000067F000040020000E0000100000000__000000505F89E839-000000510F31FEA9 000000067F000040020000E0000000C12F96-000000067F000040020000E0000000C1B971__000000510F31FEA9-00000051BED9D7E1 000000067F000040020000E0000000C14000-000000067F000040020000E0000000C18000__00000051EEFFE900 000000067F000040020000E0000000C14000-000000067F000040020000E0000000C18000__0000005D2FFFFB38 000000067F000040020000E0000000C14000-000000067F000040020000E0000000C18000__00000073AD3FE6B8 000000067F000040020000E0000000C14000-000000067F000040020000E0000000C18000__000000914E3F38F0 000000067F000040020000E0000000C14000-000000067F000040020000E0000000C18000__000000931B9A2710 000000067F000040020000E0000000C18000-000000067F000040020000E0000000C1C000__00000051EEFFE900 000000067F000040020000E0000000C18000-000000067F000040020000E0000000C1C000__0000005D2FFFFB38 000000067F000040020000E0000000C18000-000000067F000040020000E0000000C1C000__00000073AD3FE6B8 000000067F000040020000E0000000C18000-000000067F000040020000E0000000C1C000__000000914E3F38F0 000000067F000040020000E0000000C18000-000000067F000040020000E0000000C1C000__000000931B9A2710 000000067F000040020000E0000000C1B971-000000067F000040020000E0000000C24348__000000510F31FEA9-00000051BED9D7E1 000000067F000040020000E0000000C1C000-000000067F000040020000E0000000C20000__00000051EEFFE900 
000000067F000040020000E0000000C1C000-000000067F000040020000E0000000C20000__0000005D2FFFFB38 000000067F000040020000E0000000C1C000-000000067F000040020000E0000000C20000__00000073AD3FE6B8 000000067F000040020000E0000000C1C000-000000067F000040020000E0000000C20000__000000914E3F38F0 000000067F000040020000E0000000C1C000-000000067F000040020000E0000000C20000__000000931B9A2710 000000067F000040020000E0000000C20000-000000067F000040020000E0000000C24000__00000051EEFFE900 000000067F000040020000E0000000C20000-000000067F000040020000E0000000C24000__0000005D2FFFFB38 000000067F000040020000E0000000C20000-000000067F000040020000E0000000C24000__00000073AD3FE6B8 000000067F000040020000E0000000C20000-000000067F000040020000E0000000C24000__000000914E3F38F0 000000067F000040020000E0000000C20000-000000067F000040020000E0000000C24000__000000931B9A2710 000000067F000040020000E0000000C24000-000000067F000040020000E0000000C28000__00000051EEFFE900 000000067F000040020000E0000000C24000-000000067F000040020000E0000000C28000__0000005D2FFFFB38 000000067F000040020000E0000000C24000-000000067F000040020000E0000000C28000__00000073AD3FE6B8 000000067F000040020000E0000000C24000-000000067F000040020000E0000000C28000__000000914E3F38F0 000000067F000040020000E0000000C24000-000000067F000040020000E0000000C28000__000000931B9A2710 000000067F000040020000E0000000C24348-000000067F000040020000E0000000C2CD23__000000510F31FEA9-00000051BED9D7E1 000000067F000040020000E0000000C28000-000000067F000040020000E0000000C2C000__00000051EEFFE900 000000067F000040020000E0000000C28000-000000067F000040020000E0000000C2C000__0000005D2FFFFB38 000000067F000040020000E0000000C28000-000000067F000040020000E0000000C2C000__00000073AD3FE6B8 000000067F000040020000E0000000C28000-000000067F000040020000E0000000C2C000__000000914E3F38F0 000000067F000040020000E0000000C28000-000000067F000040020000E0000000C2C000__000000931B9A2710 000000067F000040020000E0000000C2C000-000000067F000040020000E0000000C30000__00000051EEFFE900 
000000067F000040020000E0000000C2C000-000000067F000040020000E0000000C30000__0000005D2FFFFB38 000000067F000040020000E0000000C2C000-000000067F000040020000E0000000C30000__00000073AD3FE6B8 000000067F000040020000E0000000C2C000-000000067F000040020000E0000000C30000__000000914E3F38F0 000000067F000040020000E0000000C2C000-000000067F000040020000E0000000C30000__000000931B9A2710 000000067F000040020000E0000000C2CD23-000000067F000040020000E0000000C356F9__000000510F31FEA9-00000051BED9D7E1 000000067F000040020000E0000000C30000-000000067F000040020000E0000000C34000__00000051EEFFE900 000000067F000040020000E0000000C30000-000000067F000040020000E0000000C34000__0000005D2FFFFB38 000000067F000040020000E0000000C30000-000000067F000040020000E0000000C34000__00000073AD3FE6B8 000000067F000040020000E0000000C30000-000000067F000040020000E0000000C34000__000000914E3F38F0 000000067F000040020000E0000000C30000-000000067F000040020000E0000000C34000__000000931B9A2710 000000067F000040020000E0000000C34000-000000067F000040020000E0000000C38000__00000051EEFFE900 000000067F000040020000E0000000C34000-000000067F000040020000E0000000C38000__0000005D2FFFFB38 000000067F000040020000E0000000C34000-000000067F000040020000E0000000C38000__00000073AD3FE6B8 000000067F000040020000E0000000C34000-000000067F000040020000E0000000C38000__000000914E3F38F0 000000067F000040020000E0000000C34000-000000067F000040020000E0000000C38000__000000931B9A2710 000000067F000040020000E0000000C356F9-000000067F000040020000E0000000C3E0DC__000000510F31FEA9-00000051BED9D7E1 000000067F000040020000E0000000C38000-000000067F000040020000E0000000C3C000__00000051EEFFE900 000000067F000040020000E0000000C38000-000000067F000040020000E0000000C3C000__0000005D2FFFFB38 000000067F000040020000E0000000C38000-000000067F000040020000E0000000C3C000__00000073AD3FE6B8 000000067F000040020000E0000000C38000-000000067F000040020000E0000000C3C000__000000914E3F38F0 000000067F000040020000E0000000C38000-000000067F000040020000E0000000C3C000__000000931B9A2710 
000000067F000040020000E0000000C3C000-000000067F000040020000E0000000C40000__00000051EEFFE900 000000067F000040020000E0000000C3C000-000000067F000040020000E0000000C40000__0000005D2FFFFB38 000000067F000040020000E0000000C3C000-000000067F000040020000E0000000C40000__00000073AD3FE6B8 000000067F000040020000E0000000C3C000-000000067F000040020000E0000000C40000__000000914E3F38F0 000000067F000040020000E0000000C3C000-000000067F000040020000E0000000C40000__000000931B9A2710 000000067F000040020000E0000000C3E0DC-000000067F000040020000E0000000C46AC7__000000510F31FEA9-00000051BED9D7E1 000000067F000040020000E0000000C40000-000000067F000040020000E0000000C44000__00000051EEFFE900 000000067F000040020000E0000000C40000-000000067F000040020000E0000000C44000__0000005D2FFFFB38 000000067F000040020000E0000000C40000-000000067F000040020000E0000000C44000__00000073AD3FE6B8 000000067F000040020000E0000000C40000-000000067F000040020000E0000000C44000__000000914E3F38F0 000000067F000040020000E0000000C40000-000000067F000040020000E0000000C44000__000000931B9A2710 000000067F000040020000E0000000C44000-000000067F000040020000E0000000C48000__00000051EEFFE900 000000067F000040020000E0000000C44000-000000067F000040020000E0000000C48000__0000005D2FFFFB38 000000067F000040020000E0000000C44000-000000067F000040020000E0000000C48000__00000073AD3FE6B8 000000067F000040020000E0000000C44000-000000067F000040020000E0000000C48000__000000914E3F38F0 000000067F000040020000E0000000C44000-000000067F000040020000E0000000C48000__000000931B9A2710 000000067F000040020000E0000000C46AC7-000000067F000040020000E0000000C4F4A5__000000510F31FEA9-00000051BED9D7E1 000000067F000040020000E0000000C48000-000000067F000040020000E0000000C4C000__00000051EEFFE900 000000067F000040020000E0000000C48000-000000067F000040020000E0000000C4C000__0000005D2FFFFB38 000000067F000040020000E0000000C48000-000000067F000040020000E0000000C4C000__00000073AD3FE6B8 000000067F000040020000E0000000C48000-000000067F000040020000E0000000C4C000__000000914E3F38F0 
000000067F000040020000E0000000C48000-000000067F000040020000E0000000C4C000__000000931B9A2710 000000067F000040020000E0000000C4C000-000000067F000040020000E0000000C50000__00000051EEFFE900 000000067F000040020000E0000000C4C000-000000067F000040020000E0000000C50000__0000005D2FFFFB38 000000067F000040020000E0000000C4C000-000000067F000040020000E0000000C50000__00000073AD3FE6B8 000000067F000040020000E0000000C4C000-000000067F000040020000E0000000C50000__000000914E3F38F0 000000067F000040020000E0000000C4C000-000000067F000040020000E0000000C50000__000000931B9A2710 000000067F000040020000E0000000C4F4A5-000000067F000040020000E0000000C57E7D__000000510F31FEA9-00000051BED9D7E1 000000067F000040020000E0000000C50000-000000067F000040020000E0000000C54000__00000051EEFFE900 000000067F000040020000E0000000C50000-000000067F000040020000E0000000C54000__0000005D2FFFFB38 000000067F000040020000E0000000C50000-000000067F000040020000E0000000C54000__00000073AD3FE6B8 000000067F000040020000E0000000C50000-000000067F000040020000E0000000C54000__000000914E3F38F0 000000067F000040020000E0000000C50000-000000067F000040020000E0000000C54000__000000931B9A2710 000000067F000040020000E0000000C54000-000000067F000040020000E0000000C58000__00000051EEFFE900 000000067F000040020000E0000000C54000-000000067F000040020000E0000000C58000__0000005D2FFFFB38 000000067F000040020000E0000000C54000-000000067F000040020000E0000000C58000__00000073AD3FE6B8 000000067F000040020000E0000000C54000-000000067F000040020000E0000000C58000__000000914E3F38F0 000000067F000040020000E0000000C54000-000000067F000040020000E0000000C58000__000000931B9A2710 000000067F000040020000E0000000C57E7D-000000067F000040020000E0000000C60858__000000510F31FEA9-00000051BED9D7E1 000000067F000040020000E0000000C58000-000000067F000040020000E0000000C5C000__00000051EEFFE900 000000067F000040020000E0000000C58000-000000067F000040020000E0000000C5C000__0000005D2FFFFB38 000000067F000040020000E0000000C58000-000000067F000040020000E0000000C5C000__00000073AD3FE6B8 
000000067F000040020000E0000000C58000-000000067F000040020000E0000000C5C000__000000914E3F38F0 000000067F000040020000E0000000C58000-000000067F000040020000E0000000C5C000__000000931B9A2710 000000067F000040020000E0000000C5C000-000000067F000040020000E0000000C60000__00000051EEFFE900 000000067F000040020000E0000000C5C000-000000067F000040020000E0000000C60000__0000005D2FFFFB38 000000067F000040020000E0000000C5C000-000000067F000040020000E0000000C60000__00000073AD3FE6B8 000000067F000040020000E0000000C5C000-000000067F000040020000E0000000C60000__000000914E3F38F0 000000067F000040020000E0000000C5C000-000000067F000040020000E0000000C60000__000000931B9A2710 000000067F000040020000E0000000C60000-000000067F000040020000E0000000C64000__00000051EEFFE900 000000067F000040020000E0000000C60000-000000067F000040020000E0000000C64000__0000005D2FFFFB38 000000067F000040020000E0000000C60000-000000067F000040020000E0000000C64000__00000073AD3FE6B8 000000067F000040020000E0000000C60000-000000067F000040020000E0000000C64000__000000914E3F38F0 000000067F000040020000E0000000C60000-000000067F000040020000E0000000C64000__000000931B9A2710 000000067F000040020000E0000000C60858-000000067F000040020000E0000000C6922E__000000510F31FEA9-00000051BED9D7E1 000000067F000040020000E0000000C64000-000000067F000040020000E0000000C68000__00000051EEFFE900 000000067F000040020000E0000000C64000-000000067F000040020000E0000000C68000__0000005D2FFFFB38 000000067F000040020000E0000000C64000-000000067F000040020000E0000000C68000__00000073AD3FE6B8 000000067F000040020000E0000000C64000-000000067F000040020000E0000000C68000__000000914E3F38F0 000000067F000040020000E0000000C64000-000000067F000040020000E0000000C68000__000000931B9A2710 000000067F000040020000E0000000C68000-000000067F000040020000E0000000C6C000__00000051EEFFE900 000000067F000040020000E0000000C68000-000000067F000040020000E0000000C6C000__0000005D2FFFFB38 000000067F000040020000E0000000C68000-000000067F000040020000E0000000C6C000__00000073AD3FE6B8 
000000067F000040020000E0000000C68000-000000067F000040020000E0000000C6C000__000000914E3F38F0 000000067F000040020000E0000000C68000-000000067F000040020000E0000000C6C000__000000931B9A2710 000000067F000040020000E0000000C6922E-000000067F000040020000E0000000C71C02__000000510F31FEA9-00000051BED9D7E1 000000067F000040020000E0000000C6C000-000000067F000040020000E0000000C70000__00000051EEFFE900 000000067F000040020000E0000000C6C000-000000067F000040020000E0000000C70000__0000005D2FFFFB38 000000067F000040020000E0000000C6C000-000000067F000040020000E0000000C70000__00000073AD3FE6B8 000000067F000040020000E0000000C6C000-000000067F000040020000E0000000C70000__000000914E3F38F0 000000067F000040020000E0000000C6C000-000000067F000040020000E0000000C70000__000000931B9A2710 000000067F000040020000E0000000C70000-000000067F000040020000E0000000C74000__00000051EEFFE900 000000067F000040020000E0000000C70000-000000067F000040020000E0000000C74000__000000574B7FF240 000000067F000040020000E0000000C70000-000000067F000040020000E0000000C74000__00000073AD3FE6B8 000000067F000040020000E0000000C70000-000000067F000040020000E0000000C74000__000000914E3F38F0 000000067F000040020000E0000000C70000-000000067F000040020000E0000000C74000__000000931B9A2710 000000067F000040020000E0000000C71C02-000000067F000040020000E0000100000000__000000510F31FEA9-00000051BED9D7E1 000000067F000040020000E0000000C71F22-000000067F000040020000E0000000C7A8F1__00000051BED9D7E1-000000526E81F439 000000067F000040020000E0000000C74000-000000067F000040020000E0000000C78000__00000051EEFFE900 000000067F000040020000E0000000C74000-000000067F000040020000E0000000C78000__000000574B7FF240 000000067F000040020000E0000000C74000-000000067F000040020000E0000000C78000__00000073AD3FE6B8 000000067F000040020000E0000000C74000-000000067F000040020000E0000000C78000__000000914E3F38F0 000000067F000040020000E0000000C74000-000000067F000040020000E0000000C78000__000000931B9A2710 000000067F000040020000E0000000C78000-000000067F000040020000E0000000C7C000__00000051EEFFE900 
000000067F000040020000E0000000C78000-000000067F000040020000E0000000C7C000__000000574B7FF240 000000067F000040020000E0000000C78000-000000067F000040020000E0000000C7C000__00000073AD3FE6B8 000000067F000040020000E0000000C78000-000000067F000040020000E0000000C7C000__000000914E3F38F0 000000067F000040020000E0000000C78000-000000067F000040020000E0000000C7C000__000000931B9A2710 000000067F000040020000E0000000C7A8F1-000000067F000040020000E0000000C832D9__00000051BED9D7E1-000000526E81F439 000000067F000040020000E0000000C7C000-000000067F000040020000E0000000C80000__00000051EEFFE900 000000067F000040020000E0000000C7C000-000000067F000040020000E0000000C80000__000000574B7FF240 000000067F000040020000E0000000C7C000-000000067F000040020000E0000000C80000__00000073AD3FE6B8 000000067F000040020000E0000000C7C000-000000067F000040020000E0000000C80000__000000914E3F38F0 000000067F000040020000E0000000C7C000-000000067F000040020000E0000000C80000__000000931B9A2710 000000067F000040020000E0000000C80000-000000067F000040020000E0000000C84000__00000051EEFFE900 000000067F000040020000E0000000C80000-000000067F000040020000E0000000C84000__000000574B7FF240 000000067F000040020000E0000000C80000-000000067F000040020000E0000000C84000__00000073AD3FE6B8 000000067F000040020000E0000000C80000-000000067F000040020000E0000000C84000__000000914E3F38F0 000000067F000040020000E0000000C80000-000000067F000040020000E0000000C84000__000000931B9A2710 000000067F000040020000E0000000C832D9-000000067F000040020000E0000000C8BCBC__00000051BED9D7E1-000000526E81F439 000000067F000040020000E0000000C84000-000000067F000040020000E0000000C88000__00000051EEFFE900 000000067F000040020000E0000000C84000-000000067F000040020000E0000000C88000__000000574B7FF240 000000067F000040020000E0000000C84000-000000067F000040020000E0000000C88000__00000073AD3FE6B8 000000067F000040020000E0000000C84000-000000067F000040020000E0000000C88000__000000914E3F38F0 000000067F000040020000E0000000C84000-000000067F000040020000E0000000C88000__000000931B9A2710 
000000067F000040020000E0000000C88000-000000067F000040020000E0000000C8C000__000000574B7FF240 000000067F000040020000E0000000C88000-000000067F000040020000E0000000C8C000__00000073AD3FE6B8 000000067F000040020000E0000000C88000-000000067F000040020000E0000000C8C000__000000914E3F38F0 000000067F000040020000E0000000C88000-000000067F000040020000E0000000C8C000__000000931B9A2710 000000067F000040020000E0000000C88000-00000006800000000000000B1F0100000000__00000051EEFFE900 000000067F000040020000E0000000C8BCBC-000000067F000040020000E0000000C946A4__00000051BED9D7E1-000000526E81F439 000000067F000040020000E0000000C8C000-000000067F000040020000E0000000C90000__000000574B7FF240 000000067F000040020000E0000000C8C000-000000067F000040020000E0000000C90000__00000073AD3FE6B8 000000067F000040020000E0000000C8C000-000000067F000040020000E0000000C90000__000000914E3F38F0 000000067F000040020000E0000000C8C000-000000067F000040020000E0000000C90000__000000931B9A2710 000000067F000040020000E0000000C90000-000000067F000040020000E0000000C94000__000000574B7FF240 000000067F000040020000E0000000C90000-000000067F000040020000E0000000C94000__00000073AD3FE6B8 000000067F000040020000E0000000C90000-000000067F000040020000E0000000C94000__000000914E3F38F0 000000067F000040020000E0000000C90000-000000067F000040020000E0000000C94000__000000931B9A2710 000000067F000040020000E0000000C94000-000000067F000040020000E0000000C98000__000000574B7FF240 000000067F000040020000E0000000C94000-000000067F000040020000E0000000C98000__00000073AD3FE6B8 000000067F000040020000E0000000C94000-000000067F000040020000E0000000C98000__000000914E3F38F0 000000067F000040020000E0000000C94000-000000067F000040020000E0000000C98000__000000931B9A2710 000000067F000040020000E0000000C946A4-000000067F000040020000E0000000C9D07F__00000051BED9D7E1-000000526E81F439 000000067F000040020000E0000000C98000-000000067F000040020000E0000000C9C000__000000574B7FF240 000000067F000040020000E0000000C98000-000000067F000040020000E0000000C9C000__00000073AD3FE6B8 
000000067F000040020000E0000000C98000-000000067F000040020000E0000000C9C000__000000914E3F38F0 000000067F000040020000E0000000C98000-000000067F000040020000E0000000C9C000__000000931B9A2710 000000067F000040020000E0000000C9C000-000000067F000040020000E0000000CA0000__000000574B7FF240 000000067F000040020000E0000000C9C000-000000067F000040020000E0000000CA0000__00000073AD3FE6B8 000000067F000040020000E0000000C9C000-000000067F000040020000E0000000CA0000__000000914E3F38F0 000000067F000040020000E0000000C9C000-000000067F000040020000E0000000CA0000__000000931B9A2710 000000067F000040020000E0000000C9D07F-000000067F000040020000E0000000CA5A4E__00000051BED9D7E1-000000526E81F439 000000067F000040020000E0000000CA0000-000000067F000040020000E0000000CA4000__000000574B7FF240 000000067F000040020000E0000000CA0000-000000067F000040020000E0000000CA4000__00000073AD3FE6B8 000000067F000040020000E0000000CA0000-000000067F000040020000E0000000CA4000__000000914E3F38F0 000000067F000040020000E0000000CA0000-000000067F000040020000E0000000CA4000__000000931B9A2710 000000067F000040020000E0000000CA4000-000000067F000040020000E0000000CA8000__000000574B7FF240 000000067F000040020000E0000000CA4000-000000067F000040020000E0000000CA8000__00000073AD3FE6B8 000000067F000040020000E0000000CA4000-000000067F000040020000E0000000CA8000__000000914E3F38F0 000000067F000040020000E0000000CA4000-000000067F000040020000E0000000CA8000__000000931B9A2710 000000067F000040020000E0000000CA5A4E-000000067F000040020000E0000000CAE42F__00000051BED9D7E1-000000526E81F439 000000067F000040020000E0000000CA8000-000000067F000040020000E0000000CAC000__000000574B7FF240 000000067F000040020000E0000000CA8000-000000067F000040020000E0000000CAC000__00000073AD3FE6B8 000000067F000040020000E0000000CA8000-000000067F000040020000E0000000CAC000__000000914E3F38F0 000000067F000040020000E0000000CA8000-000000067F000040020000E0000000CAC000__000000931B9A2710 000000067F000040020000E0000000CAC000-000000067F000040020000E0000000CB0000__000000574B7FF240 
000000067F000040020000E0000000CAC000-000000067F000040020000E0000000CB0000__00000073AD3FE6B8 000000067F000040020000E0000000CAC000-000000067F000040020000E0000000CB0000__000000914E3F38F0 000000067F000040020000E0000000CAC000-000000067F000040020000E0000000CB0000__000000931B9A2710 000000067F000040020000E0000000CAE42F-000000067F000040020000E0000000CB6E04__00000051BED9D7E1-000000526E81F439 000000067F000040020000E0000000CB0000-000000067F000040020000E0000000CB4000__000000574B7FF240 000000067F000040020000E0000000CB0000-000000067F000040020000E0000000CB4000__00000073AD3FE6B8 000000067F000040020000E0000000CB0000-000000067F000040020000E0000000CB4000__000000914E3F38F0 000000067F000040020000E0000000CB0000-000000067F000040020000E0000000CB4000__000000931B9A2710 000000067F000040020000E0000000CB4000-000000067F000040020000E0000000CB8000__000000574B7FF240 000000067F000040020000E0000000CB4000-000000067F000040020000E0000000CB8000__00000073AD3FE6B8 000000067F000040020000E0000000CB4000-000000067F000040020000E0000000CB8000__000000914E3F38F0 000000067F000040020000E0000000CB4000-000000067F000040020000E0000000CB8000__000000931B9A2710 000000067F000040020000E0000000CB6E04-000000067F000040020000E0000000CBF7D9__00000051BED9D7E1-000000526E81F439 000000067F000040020000E0000000CB8000-000000067F000040020000E0000000CBC000__000000574B7FF240 000000067F000040020000E0000000CB8000-000000067F000040020000E0000000CBC000__00000073AD3FE6B8 000000067F000040020000E0000000CB8000-000000067F000040020000E0000000CBC000__000000914E3F38F0 000000067F000040020000E0000000CB8000-000000067F000040020000E0000000CBC000__000000931B9A2710 000000067F000040020000E0000000CBC000-000000067F000040020000E0000000CC0000__000000574B7FF240 000000067F000040020000E0000000CBC000-000000067F000040020000E0000000CC0000__00000073AD3FE6B8 000000067F000040020000E0000000CBC000-000000067F000040020000E0000000CC0000__000000914E3F38F0 000000067F000040020000E0000000CBC000-000000067F000040020000E0000000CC0000__000000931B9A2710 
000000067F000040020000E0000000CBF7D9-000000067F000040020000E0000000CC81BA__00000051BED9D7E1-000000526E81F439 000000067F000040020000E0000000CC0000-000000067F000040020000E0000000CC4000__000000574B7FF240 000000067F000040020000E0000000CC0000-000000067F000040020000E0000000CC4000__00000073AD3FE6B8 000000067F000040020000E0000000CC0000-000000067F000040020000E0000000CC4000__000000914E3F38F0 000000067F000040020000E0000000CC0000-000000067F000040020000E0000000CC4000__000000931B9A2710 000000067F000040020000E0000000CC4000-000000067F000040020000E0000000CC8000__000000574B7FF240 000000067F000040020000E0000000CC4000-000000067F000040020000E0000000CC8000__00000073AD3FE6B8 000000067F000040020000E0000000CC4000-000000067F000040020000E0000000CC8000__000000914E3F38F0 000000067F000040020000E0000000CC4000-000000067F000040020000E0000000CC8000__000000931B9A2710 000000067F000040020000E0000000CC8000-000000067F000040020000E0000000CCC000__000000574B7FF240 000000067F000040020000E0000000CC8000-000000067F000040020000E0000000CCC000__00000073AD3FE6B8 000000067F000040020000E0000000CC8000-000000067F000040020000E0000000CCC000__000000914E3F38F0 000000067F000040020000E0000000CC8000-000000067F000040020000E0000000CCC000__000000931B9A2710 000000067F000040020000E0000000CC81BA-000000067F000040020000E0000000CD0B9F__00000051BED9D7E1-000000526E81F439 000000067F000040020000E0000000CCC000-000000067F000040020000E0000000CD0000__000000574B7FF240 000000067F000040020000E0000000CCC000-000000067F000040020000E0000000CD0000__00000073AD3FE6B8 000000067F000040020000E0000000CCC000-000000067F000040020000E0000000CD0000__000000914E3F38F0 000000067F000040020000E0000000CCC000-000000067F000040020000E0000000CD0000__000000931B9A2710 000000067F000040020000E0000000CD0000-000000067F000040020000E0000000CD4000__000000572A7A05D8 000000067F000040020000E0000000CD0000-000000067F000040020000E0000000CD4000__0000005D2FFFFB38 000000067F000040020000E0000000CD0000-000000067F000040020000E0000000CD4000__00000073AD3FE6B8 
000000067F000040020000E0000000CD0000-000000067F000040020000E0000000CD4000__000000914E3F38F0 000000067F000040020000E0000000CD0000-000000067F000040020000E0000000CD4000__000000931B9A2710 000000067F000040020000E0000000CD0B9F-000000067F000040020000E0000100000000__00000051BED9D7E1-000000526E81F439 000000067F000040020000E0000000CD0EB9-000000067F000040020000E0000000CD9893__000000526E81F439-000000531E29F559 000000067F000040020000E0000000CD4000-000000067F000040020000E0000000CD8000__000000572A7A05D8 000000067F000040020000E0000000CD4000-000000067F000040020000E0000000CD8000__0000005D2FFFFB38 000000067F000040020000E0000000CD4000-000000067F000040020000E0000000CD8000__00000073AD3FE6B8 000000067F000040020000E0000000CD4000-000000067F000040020000E0000000CD8000__000000914E3F38F0 000000067F000040020000E0000000CD4000-000000067F000040020000E0000000CD8000__000000931B9A2710 000000067F000040020000E0000000CD8000-000000067F000040020000E0000000CDC000__000000572A7A05D8 000000067F000040020000E0000000CD8000-000000067F000040020000E0000000CDC000__0000005D2FFFFB38 000000067F000040020000E0000000CD8000-000000067F000040020000E0000000CDC000__00000073AD3FE6B8 000000067F000040020000E0000000CD8000-000000067F000040020000E0000000CDC000__000000914E3F38F0 000000067F000040020000E0000000CD8000-000000067F000040020000E0000000CDC000__000000931B9A2710 000000067F000040020000E0000000CD9893-000000067F000040020000E0000000CE226B__000000526E81F439-000000531E29F559 000000067F000040020000E0000000CDC000-000000067F000040020000E0000000CE0000__000000572A7A05D8 000000067F000040020000E0000000CDC000-000000067F000040020000E0000000CE0000__0000005D2FFFFB38 000000067F000040020000E0000000CDC000-000000067F000040020000E0000000CE0000__00000073AD3FE6B8 000000067F000040020000E0000000CDC000-000000067F000040020000E0000000CE0000__000000914E3F38F0 000000067F000040020000E0000000CDC000-000000067F000040020000E0000000CE0000__000000931B9A2710 000000067F000040020000E0000000CE0000-000000067F000040020000E0000000CE4000__000000572A7A05D8 
000000067F000040020000E0000000CE0000-000000067F000040020000E0000000CE4000__0000005D2FFFFB38 000000067F000040020000E0000000CE0000-000000067F000040020000E0000000CE4000__00000073AD3FE6B8 000000067F000040020000E0000000CE0000-000000067F000040020000E0000000CE4000__000000914E3F38F0 000000067F000040020000E0000000CE0000-000000067F000040020000E0000000CE4000__000000931B9A2710 000000067F000040020000E0000000CE226B-000000067F000040020000E0000000CEAC50__000000526E81F439-000000531E29F559 000000067F000040020000E0000000CE4000-000000067F000040020000E0000000CE8000__000000572A7A05D8 000000067F000040020000E0000000CE4000-000000067F000040020000E0000000CE8000__0000005D2FFFFB38 000000067F000040020000E0000000CE4000-000000067F000040020000E0000000CE8000__00000073AD3FE6B8 000000067F000040020000E0000000CE4000-000000067F000040020000E0000000CE8000__000000914E3F38F0 000000067F000040020000E0000000CE4000-000000067F000040020000E0000000CE8000__000000931B9A2710 000000067F000040020000E0000000CE8000-000000067F000040020000E0000000CEC000__000000572A7A05D8 000000067F000040020000E0000000CE8000-000000067F000040020000E0000000CEC000__0000005D2FFFFB38 000000067F000040020000E0000000CE8000-000000067F000040020000E0000000CEC000__00000073AD3FE6B8 000000067F000040020000E0000000CE8000-000000067F000040020000E0000000CEC000__000000914E3F38F0 000000067F000040020000E0000000CE8000-000000067F000040020000E0000000CEC000__000000931B9A2710 000000067F000040020000E0000000CEAC50-000000067F000040020000E0000000CF3627__000000526E81F439-000000531E29F559 000000067F000040020000E0000000CEC000-000000067F000040020000E0000000CF0000__000000572A7A05D8 000000067F000040020000E0000000CEC000-000000067F000040020000E0000000CF0000__0000005D2FFFFB38 000000067F000040020000E0000000CEC000-000000067F000040020000E0000000CF0000__00000073AD3FE6B8 000000067F000040020000E0000000CEC000-000000067F000040020000E0000000CF0000__000000914E3F38F0 000000067F000040020000E0000000CEC000-000000067F000040020000E0000000CF0000__000000931B9A2710 
000000067F000040020000E0000000CF0000-000000067F000040020000E0000000CF4000__000000572A7A05D8 000000067F000040020000E0000000CF0000-000000067F000040020000E0000000CF4000__0000005D2FFFFB38 000000067F000040020000E0000000CF0000-000000067F000040020000E0000000CF4000__00000073AD3FE6B8 000000067F000040020000E0000000CF0000-000000067F000040020000E0000000CF4000__000000914E3F38F0 000000067F000040020000E0000000CF0000-000000067F000040020000E0000000CF4000__000000931B9A2710 000000067F000040020000E0000000CF3627-000000067F000040020000E0000000CFBFFE__000000526E81F439-000000531E29F559 000000067F000040020000E0000000CF4000-000000067F000040020000E0000000CF8000__000000572A7A05D8 000000067F000040020000E0000000CF4000-000000067F000040020000E0000000CF8000__0000005D2FFFFB38 000000067F000040020000E0000000CF4000-000000067F000040020000E0000000CF8000__00000073AD3FE6B8 000000067F000040020000E0000000CF4000-000000067F000040020000E0000000CF8000__000000914E3F38F0 000000067F000040020000E0000000CF4000-000000067F000040020000E0000000CF8000__000000931B9A2710 000000067F000040020000E0000000CF8000-000000067F000040020000E0000000CFC000__000000572A7A05D8 000000067F000040020000E0000000CF8000-000000067F000040020000E0000000CFC000__0000005D2FFFFB38 000000067F000040020000E0000000CF8000-000000067F000040020000E0000000CFC000__00000073AD3FE6B8 000000067F000040020000E0000000CF8000-000000067F000040020000E0000000CFC000__000000914E3F38F0 000000067F000040020000E0000000CF8000-000000067F000040020000E0000000CFC000__000000931B9A2710 000000067F000040020000E0000000CFBFFE-000000067F000040020000E0000000D049E2__000000526E81F439-000000531E29F559 000000067F000040020000E0000000CFC000-000000067F000040020000E0000000D00000__000000572A7A05D8 000000067F000040020000E0000000CFC000-000000067F000040020000E0000000D00000__0000005D2FFFFB38 000000067F000040020000E0000000CFC000-000000067F000040020000E0000000D00000__00000073AD3FE6B8 000000067F000040020000E0000000CFC000-000000067F000040020000E0000000D00000__000000914E3F38F0 
000000067F000040020000E0000000CFC000-000000067F000040020000E0000000D00000__000000931B9A2710 000000067F000040020000E0000000D00000-000000067F000040020000E0000000D04000__000000572A7A05D8 000000067F000040020000E0000000D00000-000000067F000040020000E0000000D04000__0000005D2FFFFB38 000000067F000040020000E0000000D00000-000000067F000040020000E0000000D04000__00000073AD3FE6B8 000000067F000040020000E0000000D00000-000000067F000040020000E0000000D04000__000000914E3F38F0 000000067F000040020000E0000000D00000-000000067F000040020000E0000000D04000__000000931B9A2710 000000067F000040020000E0000000D04000-000000067F000040020000E0000000D08000__000000572A7A05D8 000000067F000040020000E0000000D04000-000000067F000040020000E0000000D08000__0000005D2FFFFB38 000000067F000040020000E0000000D04000-000000067F000040020000E0000000D08000__00000073AD3FE6B8 000000067F000040020000E0000000D04000-000000067F000040020000E0000000D08000__000000914E3F38F0 000000067F000040020000E0000000D04000-000000067F000040020000E0000000D08000__000000931B9A2710 000000067F000040020000E0000000D049E2-000000067F000040020000E0000000D0D3C4__000000526E81F439-000000531E29F559 000000067F000040020000E0000000D08000-000000067F000040020000E0000000D0C000__000000572A7A05D8 000000067F000040020000E0000000D08000-000000067F000040020000E0000000D0C000__0000005D2FFFFB38 000000067F000040020000E0000000D08000-000000067F000040020000E0000000D0C000__00000073AD3FE6B8 000000067F000040020000E0000000D08000-000000067F000040020000E0000000D0C000__000000914E3F38F0 000000067F000040020000E0000000D08000-000000067F000040020000E0000000D0C000__000000931B9A2710 000000067F000040020000E0000000D0C000-000000067F000040020000E0000000D10000__000000572A7A05D8 000000067F000040020000E0000000D0C000-000000067F000040020000E0000000D10000__0000005D2FFFFB38 000000067F000040020000E0000000D0C000-000000067F000040020000E0000000D10000__00000073AD3FE6B8 000000067F000040020000E0000000D0C000-000000067F000040020000E0000000D10000__000000914E3F38F0 
000000067F000040020000E0000000D0C000-000000067F000040020000E0000000D10000__000000931B9A2710 000000067F000040020000E0000000D0D3C4-000000067F000040020000E0000000D15DA8__000000526E81F439-000000531E29F559 000000067F000040020000E0000000D10000-000000067F000040020000E0000000D14000__000000572A7A05D8 000000067F000040020000E0000000D10000-000000067F000040020000E0000000D14000__0000005D2FFFFB38 000000067F000040020000E0000000D10000-000000067F000040020000E0000000D14000__00000073AD3FE6B8 000000067F000040020000E0000000D10000-000000067F000040020000E0000000D14000__000000914E3F38F0 000000067F000040020000E0000000D10000-000000067F000040020000E0000000D14000__000000931B9A2710 000000067F000040020000E0000000D14000-000000067F000040020000E0000000D18000__000000572A7A05D8 000000067F000040020000E0000000D14000-000000067F000040020000E0000000D18000__0000005D2FFFFB38 000000067F000040020000E0000000D14000-000000067F000040020000E0000000D18000__00000073AD3FE6B8 000000067F000040020000E0000000D14000-000000067F000040020000E0000000D18000__000000914E3F38F0 000000067F000040020000E0000000D14000-000000067F000040020000E0000000D18000__000000931B9A2710 000000067F000040020000E0000000D15DA8-000000067F000040020000E0000000D1E783__000000526E81F439-000000531E29F559 000000067F000040020000E0000000D18000-000000067F000040020000E0000000D1C000__000000572A7A05D8 000000067F000040020000E0000000D18000-000000067F000040020000E0000000D1C000__0000005D2FFFFB38 000000067F000040020000E0000000D18000-000000067F000040020000E0000000D1C000__00000073AD3FE6B8 000000067F000040020000E0000000D18000-000000067F000040020000E0000000D1C000__000000914E3F38F0 000000067F000040020000E0000000D18000-000000067F000040020000E0000000D1C000__000000931B9A2710 000000067F000040020000E0000000D1C000-000000067F000040020000E0000000D20000__000000572A7A05D8 000000067F000040020000E0000000D1C000-000000067F000040020000E0000000D20000__0000005D2FFFFB38 000000067F000040020000E0000000D1C000-000000067F000040020000E0000000D20000__00000073AD3FE6B8 
000000067F000040020000E0000000D1C000-000000067F000040020000E0000000D20000__000000914E3F38F0 000000067F000040020000E0000000D1C000-000000067F000040020000E0000000D20000__000000931B9A2710 000000067F000040020000E0000000D1E783-000000067F000040020000E0000000D27156__000000526E81F439-000000531E29F559 000000067F000040020000E0000000D20000-000000067F000040020000E0000000D24000__000000572A7A05D8 000000067F000040020000E0000000D20000-000000067F000040020000E0000000D24000__0000005D2FFFFB38 000000067F000040020000E0000000D20000-000000067F000040020000E0000000D24000__00000073AD3FE6B8 000000067F000040020000E0000000D20000-000000067F000040020000E0000000D24000__000000914E3F38F0 000000067F000040020000E0000000D20000-000000067F000040020000E0000000D24000__000000931B9A2710 000000067F000040020000E0000000D24000-000000067F000040020000E0000000D28000__000000572A7A05D8 000000067F000040020000E0000000D24000-000000067F000040020000E0000000D28000__0000005D2FFFFB38 000000067F000040020000E0000000D24000-000000067F000040020000E0000000D28000__00000073AD3FE6B8 000000067F000040020000E0000000D24000-000000067F000040020000E0000000D28000__000000914E3F38F0 000000067F000040020000E0000000D24000-000000067F000040020000E0000000D28000__000000931B9A2710 000000067F000040020000E0000000D27156-000000067F000040020000E0000000D2FB43__000000526E81F439-000000531E29F559 000000067F000040020000E0000000D28000-000000067F000040020000E0000000D2C000__000000572A7A05D8 000000067F000040020000E0000000D28000-000000067F000040020000E0000000D2C000__0000005D2FFFFB38 000000067F000040020000E0000000D28000-000000067F000040020000E0000000D2C000__00000073AD3FE6B8 000000067F000040020000E0000000D28000-000000067F000040020000E0000000D2C000__000000914E3F38F0 000000067F000040020000E0000000D28000-000000067F000040020000E0000000D2C000__000000931B9A2710 000000067F000040020000E0000000D2C000-000000067F000040020000E0000000D30000__00000053FAFFF9D8 000000067F000040020000E0000000D2C000-000000067F000040020000E0000000D30000__0000005D2FFFFB38 
000000067F000040020000E0000000D2C000-000000067F000040020000E0000000D30000__00000073AD3FE6B8 000000067F000040020000E0000000D2C000-000000067F000040020000E0000000D30000__000000914E3F38F0 000000067F000040020000E0000000D2C000-000000067F000040020000E0000000D30000__000000931B9A2710 000000067F000040020000E0000000D2FB43-000000067F000040020000E0000100000000__000000526E81F439-000000531E29F559 000000067F000040020000E0000000D2FE44-000000067F000040020000E0000000D3881B__000000531E29F559-00000053CDCFF331 000000067F000040020000E0000000D30000-000000067F000040020000E0000000D34000__00000053FAFFF9D8 000000067F000040020000E0000000D30000-000000067F000040020000E0000000D34000__0000005D2FFFFB38 000000067F000040020000E0000000D30000-000000067F000040020000E0000000D34000__00000073AD3FE6B8 000000067F000040020000E0000000D30000-000000067F000040020000E0000000D34000__000000914E3F38F0 000000067F000040020000E0000000D30000-000000067F000040020000E0000000D34000__000000931B9A2710 000000067F000040020000E0000000D34000-000000067F000040020000E0000000D38000__00000053FAFFF9D8 000000067F000040020000E0000000D34000-000000067F000040020000E0000000D38000__0000005D2FFFFB38 000000067F000040020000E0000000D34000-000000067F000040020000E0000000D38000__00000073AD3FE6B8 000000067F000040020000E0000000D34000-000000067F000040020000E0000000D38000__000000914E3F38F0 000000067F000040020000E0000000D34000-000000067F000040020000E0000000D38000__000000931B9A2710 000000067F000040020000E0000000D38000-000000067F000040020000E0000000D3C000__00000053FAFFF9D8 000000067F000040020000E0000000D38000-000000067F000040020000E0000000D3C000__0000005D2FFFFB38 000000067F000040020000E0000000D38000-000000067F000040020000E0000000D3C000__00000073AD3FE6B8 000000067F000040020000E0000000D38000-000000067F000040020000E0000000D3C000__000000914E3F38F0 000000067F000040020000E0000000D38000-000000067F000040020000E0000000D3C000__000000931B9A2710 000000067F000040020000E0000000D3881B-000000067F000040020000E0000000D411EF__000000531E29F559-00000053CDCFF331 
000000067F000040020000E0000000D3C000-000000067F000040020000E0000000D40000__00000053FAFFF9D8 000000067F000040020000E0000000D3C000-000000067F000040020000E0000000D40000__0000005D2FFFFB38 000000067F000040020000E0000000D3C000-000000067F000040020000E0000000D40000__00000073AD3FE6B8 000000067F000040020000E0000000D3C000-000000067F000040020000E0000000D40000__000000914E3F38F0 000000067F000040020000E0000000D3C000-000000067F000040020000E0000000D40000__000000931B9A2710 000000067F000040020000E0000000D40000-000000067F000040020000E0000000D44000__00000053FAFFF9D8 000000067F000040020000E0000000D40000-000000067F000040020000E0000000D44000__0000005D2FFFFB38 000000067F000040020000E0000000D40000-000000067F000040020000E0000000D44000__00000073AD3FE6B8 000000067F000040020000E0000000D40000-000000067F000040020000E0000000D44000__000000914E3F38F0 000000067F000040020000E0000000D40000-000000067F000040020000E0000000D44000__000000931B9A2710 000000067F000040020000E0000000D411EF-000000067F000040020000E0000000D49BD0__000000531E29F559-00000053CDCFF331 000000067F000040020000E0000000D44000-000000067F000040020000E0000000D48000__00000053FAFFF9D8 000000067F000040020000E0000000D44000-000000067F000040020000E0000000D48000__0000005D2FFFFB38 000000067F000040020000E0000000D44000-000000067F000040020000E0000000D48000__00000073AD3FE6B8 000000067F000040020000E0000000D44000-000000067F000040020000E0000000D48000__000000914E3F38F0 000000067F000040020000E0000000D44000-000000067F000040020000E0000000D48000__000000931B9A2710 000000067F000040020000E0000000D48000-000000067F000040020000E0000000D4C000__00000053FAFFF9D8 000000067F000040020000E0000000D48000-000000067F000040020000E0000000D4C000__0000005D2FFFFB38 000000067F000040020000E0000000D48000-000000067F000040020000E0000000D4C000__00000073AD3FE6B8 000000067F000040020000E0000000D48000-000000067F000040020000E0000000D4C000__000000914E3F38F0 000000067F000040020000E0000000D48000-000000067F000040020000E0000000D4C000__000000931B9A2710 
000000067F000040020000E0000000D49BD0-000000067F000040020000E0000000D525B0__000000531E29F559-00000053CDCFF331 000000067F000040020000E0000000D4C000-000000067F000040020000E0000000D50000__00000053FAFFF9D8 000000067F000040020000E0000000D4C000-000000067F000040020000E0000000D50000__0000005D2FFFFB38 000000067F000040020000E0000000D4C000-000000067F000040020000E0000000D50000__00000073AD3FE6B8 000000067F000040020000E0000000D4C000-000000067F000040020000E0000000D50000__000000914E3F38F0 000000067F000040020000E0000000D4C000-000000067F000040020000E0000000D50000__000000931B9A2710 000000067F000040020000E0000000D50000-000000067F000040020000E0000000D54000__00000053FAFFF9D8 000000067F000040020000E0000000D50000-000000067F000040020000E0000000D54000__0000005D2FFFFB38 000000067F000040020000E0000000D50000-000000067F000040020000E0000000D54000__00000073AD3FE6B8 000000067F000040020000E0000000D50000-000000067F000040020000E0000000D54000__000000914E3F38F0 000000067F000040020000E0000000D50000-000000067F000040020000E0000000D54000__000000931B9A2710 000000067F000040020000E0000000D525B0-000000067F000040020000E0000000D5AF8E__000000531E29F559-00000053CDCFF331 000000067F000040020000E0000000D54000-000000067F000040020000E0000000D58000__00000053FAFFF9D8 000000067F000040020000E0000000D54000-000000067F000040020000E0000000D58000__0000005D2FFFFB38 000000067F000040020000E0000000D54000-000000067F000040020000E0000000D58000__00000073AD3FE6B8 000000067F000040020000E0000000D54000-000000067F000040020000E0000000D58000__000000914E3F38F0 000000067F000040020000E0000000D54000-000000067F000040020000E0000000D58000__000000931B9A2710 000000067F000040020000E0000000D58000-000000067F000040020000E0000000D5C000__00000053FAFFF9D8 000000067F000040020000E0000000D58000-000000067F000040020000E0000000D5C000__0000005D2FFFFB38 000000067F000040020000E0000000D58000-000000067F000040020000E0000000D5C000__00000073AD3FE6B8 000000067F000040020000E0000000D58000-000000067F000040020000E0000000D5C000__000000914E3F38F0 
000000067F000040020000E0000000D58000-000000067F000040020000E0000000D5C000__000000931B9A2710 000000067F000040020000E0000000D5AF8E-000000067F000040020000E0000000D63966__000000531E29F559-00000053CDCFF331 000000067F000040020000E0000000D5C000-000000067F000040020000E0000000D60000__00000053FAFFF9D8 000000067F000040020000E0000000D5C000-000000067F000040020000E0000000D60000__0000005D2FFFFB38 000000067F000040020000E0000000D5C000-000000067F000040020000E0000000D60000__00000073AD3FE6B8 000000067F000040020000E0000000D5C000-000000067F000040020000E0000000D60000__000000914E3F38F0 000000067F000040020000E0000000D5C000-000000067F000040020000E0000000D60000__000000931B9A2710 000000067F000040020000E0000000D60000-000000067F000040020000E0000000D64000__00000053FAFFF9D8 000000067F000040020000E0000000D60000-000000067F000040020000E0000000D64000__0000005D2FFFFB38 000000067F000040020000E0000000D60000-000000067F000040020000E0000000D64000__00000073AD3FE6B8 000000067F000040020000E0000000D60000-000000067F000040020000E0000000D64000__000000914E3F38F0 000000067F000040020000E0000000D60000-000000067F000040020000E0000000D64000__000000931B9A2710 000000067F000040020000E0000000D63966-000000067F000040020000E0000000D6C344__000000531E29F559-00000053CDCFF331 000000067F000040020000E0000000D64000-000000067F000040020000E0000000D68000__00000053FAFFF9D8 000000067F000040020000E0000000D64000-000000067F000040020000E0000000D68000__0000005D2FFFFB38 000000067F000040020000E0000000D64000-000000067F000040020000E0000000D68000__00000073AD3FE6B8 000000067F000040020000E0000000D64000-000000067F000040020000E0000000D68000__000000914E3F38F0 000000067F000040020000E0000000D64000-000000067F000040020000E0000000D68000__000000931B9A2710 000000067F000040020000E0000000D68000-000000067F000040020000E0000000D6C000__00000053FAFFF9D8 000000067F000040020000E0000000D68000-000000067F000040020000E0000000D6C000__0000005D2FFFFB38 000000067F000040020000E0000000D68000-000000067F000040020000E0000000D6C000__00000073AD3FE6B8 
000000067F000040020000E0000000D68000-000000067F000040020000E0000000D6C000__000000914E3F38F0 000000067F000040020000E0000000D68000-000000067F000040020000E0000000D6C000__000000931B9A2710 000000067F000040020000E0000000D6C000-000000067F000040020000E0000000D70000__00000053FAFFF9D8 000000067F000040020000E0000000D6C000-000000067F000040020000E0000000D70000__0000005D2FFFFB38 000000067F000040020000E0000000D6C000-000000067F000040020000E0000000D70000__00000073AD3FE6B8 000000067F000040020000E0000000D6C000-000000067F000040020000E0000000D70000__000000914E3F38F0 000000067F000040020000E0000000D6C000-000000067F000040020000E0000000D70000__000000931B9A2710 000000067F000040020000E0000000D6C344-000000067F000040020000E0000000D74D26__000000531E29F559-00000053CDCFF331 000000067F000040020000E0000000D70000-000000067F000040020000E0000000D74000__00000053FAFFF9D8 000000067F000040020000E0000000D70000-000000067F000040020000E0000000D74000__0000005D2FFFFB38 000000067F000040020000E0000000D70000-000000067F000040020000E0000000D74000__00000073AD3FE6B8 000000067F000040020000E0000000D70000-000000067F000040020000E0000000D74000__000000914E3F38F0 000000067F000040020000E0000000D70000-000000067F000040020000E0000000D74000__000000931B9A2710 000000067F000040020000E0000000D74000-000000067F000040020000E0000000D78000__00000053FAFFF9D8 000000067F000040020000E0000000D74000-000000067F000040020000E0000000D78000__0000005D2FFFFB38 000000067F000040020000E0000000D74000-000000067F000040020000E0000000D78000__00000073AD3FE6B8 000000067F000040020000E0000000D74000-000000067F000040020000E0000000D78000__000000914E3F38F0 000000067F000040020000E0000000D74000-000000067F000040020000E0000000D78000__000000931B9A2710 000000067F000040020000E0000000D74D26-000000067F000040020000E0000000D7D701__000000531E29F559-00000053CDCFF331 000000067F000040020000E0000000D78000-000000067F000040020000E0000000D7C000__00000053FAFFF9D8 000000067F000040020000E0000000D78000-000000067F000040020000E0000000D7C000__0000005D2FFFFB38 
000000067F000040020000E0000000D78000-000000067F000040020000E0000000D7C000__00000073AD3FE6B8 000000067F000040020000E0000000D78000-000000067F000040020000E0000000D7C000__000000914E3F38F0 000000067F000040020000E0000000D78000-000000067F000040020000E0000000D7C000__000000931B9A2710 000000067F000040020000E0000000D7C000-000000067F000040020000E0000000D80000__00000053FAFFF9D8 000000067F000040020000E0000000D7C000-000000067F000040020000E0000000D80000__0000005D2FFFFB38 000000067F000040020000E0000000D7C000-000000067F000040020000E0000000D80000__00000073AD3FE6B8 000000067F000040020000E0000000D7C000-000000067F000040020000E0000000D80000__000000914E3F38F0 000000067F000040020000E0000000D7C000-000000067F000040020000E0000000D80000__000000931B9A2710 000000067F000040020000E0000000D7D701-000000067F000040020000E0000000D860CB__000000531E29F559-00000053CDCFF331 000000067F000040020000E0000000D80000-000000067F000040020000E0000000D84000__00000053FAFFF9D8 000000067F000040020000E0000000D80000-000000067F000040020000E0000000D84000__0000005D2FFFFB38 000000067F000040020000E0000000D80000-000000067F000040020000E0000000D84000__00000073AD3FE6B8 000000067F000040020000E0000000D80000-000000067F000040020000E0000000D84000__000000914E3F38F0 000000067F000040020000E0000000D80000-000000067F000040020000E0000000D84000__000000931B9A2710 000000067F000040020000E0000000D84000-000000067F000040020000E0000000D88000__00000053FAFFF9D8 000000067F000040020000E0000000D84000-000000067F000040020000E0000000D88000__0000005D2FFFFB38 000000067F000040020000E0000000D84000-000000067F000040020000E0000000D88000__00000073AD3FE6B8 000000067F000040020000E0000000D84000-000000067F000040020000E0000000D88000__000000914E3F38F0 000000067F000040020000E0000000D84000-000000067F000040020000E0000000D88000__000000931B9A2710 000000067F000040020000E0000000D860CB-000000067F000040020000E0000000D8EAB0__000000531E29F559-00000053CDCFF331 000000067F000040020000E0000000D88000-000000067F000040020000E0000000D8C000__00000053FAFFF9D8 
000000067F000040020000E0000000D88000-000000067F000040020000E0000000D8C000__0000005D2FFFFB38 000000067F000040020000E0000000D88000-000000067F000040020000E0000000D8C000__00000073AD3FE6B8 000000067F000040020000E0000000D88000-000000067F000040020000E0000000D8C000__000000914E3F38F0 000000067F000040020000E0000000D88000-000000067F000040020000E0000000D8C000__000000931B9A2710 000000067F000040020000E0000000D8C000-000000067F000040020000E0000000D90000__00000053FAFFF9D8 000000067F000040020000E0000000D8C000-000000067F000040020000E0000000D90000__000000574B7FF240 000000067F000040020000E0000000D8C000-000000067F000040020000E0000000D90000__00000073AD3FE6B8 000000067F000040020000E0000000D8C000-000000067F000040020000E0000000D90000__000000914E3F38F0 000000067F000040020000E0000000D8C000-000000067F000040020000E0000000D90000__000000931B9A2710 000000067F000040020000E0000000D8EAB0-000000067F000040020000E0000100000000__000000531E29F559-00000053CDCFF331 000000067F000040020000E0000000D8EDC6-000000067F000040020000E0000000D977A7__00000053CDCFF331-000000547D77D8A1 000000067F000040020000E0000000D90000-000000067F000040020000E0000000D94000__00000053FAFFF9D8 000000067F000040020000E0000000D90000-000000067F000040020000E0000000D94000__000000574B7FF240 000000067F000040020000E0000000D90000-000000067F000040020000E0000000D94000__00000073AD3FE6B8 000000067F000040020000E0000000D90000-000000067F000040020000E0000000D94000__000000914E3F38F0 000000067F000040020000E0000000D90000-000000067F000040020000E0000000D94000__000000931B9A2710 000000067F000040020000E0000000D94000-000000067F000040020000E0000000D98000__00000053FAFFF9D8 000000067F000040020000E0000000D94000-000000067F000040020000E0000000D98000__000000574B7FF240 000000067F000040020000E0000000D94000-000000067F000040020000E0000000D98000__00000073AD3FE6B8 000000067F000040020000E0000000D94000-000000067F000040020000E0000000D98000__000000914E3F38F0 000000067F000040020000E0000000D94000-000000067F000040020000E0000000D98000__000000931B9A2710 
000000067F000040020000E0000000D977A7-000000067F000040020000E0000000DA0176__00000053CDCFF331-000000547D77D8A1 000000067F000040020000E0000000D98000-000000067F000040020000E0000000D9C000__00000053FAFFF9D8 000000067F000040020000E0000000D98000-000000067F000040020000E0000000D9C000__000000574B7FF240 000000067F000040020000E0000000D98000-000000067F000040020000E0000000D9C000__00000073AD3FE6B8 000000067F000040020000E0000000D98000-000000067F000040020000E0000000D9C000__000000914E3F38F0 000000067F000040020000E0000000D98000-000000067F000040020000E0000000D9C000__000000931B9A2710 000000067F000040020000E0000000D9C000-000000067F000040020000E0000000DA0000__00000053FAFFF9D8 000000067F000040020000E0000000D9C000-000000067F000040020000E0000000DA0000__000000574B7FF240 000000067F000040020000E0000000D9C000-000000067F000040020000E0000000DA0000__00000073AD3FE6B8 000000067F000040020000E0000000D9C000-000000067F000040020000E0000000DA0000__000000914E3F38F0 000000067F000040020000E0000000D9C000-000000067F000040020000E0000000DA0000__000000931B9A2710 000000067F000040020000E0000000DA0000-000000067F000040020000E0000000DA4000__00000053FAFFF9D8 000000067F000040020000E0000000DA0000-000000067F000040020000E0000000DA4000__000000574B7FF240 000000067F000040020000E0000000DA0000-000000067F000040020000E0000000DA4000__00000073AD3FE6B8 000000067F000040020000E0000000DA0000-000000067F000040020000E0000000DA4000__000000914E3F38F0 000000067F000040020000E0000000DA0000-000000067F000040020000E0000000DA4000__000000931B9A2710 000000067F000040020000E0000000DA0176-000000067F000040020000E0000000DA8B58__00000053CDCFF331-000000547D77D8A1 000000067F000040020000E0000000DA4000-000000067F000040020000E0000000DA8000__000000574B7FF240 000000067F000040020000E0000000DA4000-000000067F000040020000E0000000DA8000__00000073AD3FE6B8 000000067F000040020000E0000000DA4000-000000067F000040020000E0000000DA8000__000000914E3F38F0 000000067F000040020000E0000000DA4000-000000067F000040020000E0000000DA8000__000000931B9A2710 
000000067F000040020000E0000000DA4000-030000000000000000000000000000000002__00000053FAFFF9D8 000000067F000040020000E0000000DA8000-000000067F000040020000E0000000DAC000__000000574B7FF240 000000067F000040020000E0000000DA8000-000000067F000040020000E0000000DAC000__00000073AD3FE6B8 000000067F000040020000E0000000DA8000-000000067F000040020000E0000000DAC000__000000914E3F38F0 000000067F000040020000E0000000DA8000-000000067F000040020000E0000000DAC000__000000931B9A2710 000000067F000040020000E0000000DA8B58-000000067F000040020000E0000000DB1534__00000053CDCFF331-000000547D77D8A1 000000067F000040020000E0000000DAC000-000000067F000040020000E0000000DB0000__000000574B7FF240 000000067F000040020000E0000000DAC000-000000067F000040020000E0000000DB0000__00000073AD3FE6B8 000000067F000040020000E0000000DAC000-000000067F000040020000E0000000DB0000__000000914E3F38F0 000000067F000040020000E0000000DAC000-000000067F000040020000E0000000DB0000__000000931B9A2710 000000067F000040020000E0000000DB0000-000000067F000040020000E0000000DB4000__000000574B7FF240 000000067F000040020000E0000000DB0000-000000067F000040020000E0000000DB4000__00000073AD3FE6B8 000000067F000040020000E0000000DB0000-000000067F000040020000E0000000DB4000__000000914E3F38F0 000000067F000040020000E0000000DB0000-000000067F000040020000E0000000DB4000__000000931B9A2710 000000067F000040020000E0000000DB1534-000000067F000040020000E0000000DB9F12__00000053CDCFF331-000000547D77D8A1 000000067F000040020000E0000000DB4000-000000067F000040020000E0000000DB8000__000000574B7FF240 000000067F000040020000E0000000DB4000-000000067F000040020000E0000000DB8000__00000073AD3FE6B8 000000067F000040020000E0000000DB4000-000000067F000040020000E0000000DB8000__000000914E3F38F0 000000067F000040020000E0000000DB4000-000000067F000040020000E0000000DB8000__000000931B9A2710 000000067F000040020000E0000000DB8000-000000067F000040020000E0000000DBC000__000000574B7FF240 000000067F000040020000E0000000DB8000-000000067F000040020000E0000000DBC000__00000073AD3FE6B8 
000000067F000040020000E0000000DB8000-000000067F000040020000E0000000DBC000__000000914E3F38F0 000000067F000040020000E0000000DB8000-000000067F000040020000E0000000DBC000__000000931B9A2710 000000067F000040020000E0000000DB9F12-000000067F000040020000E0000000DC28E0__00000053CDCFF331-000000547D77D8A1 000000067F000040020000E0000000DBC000-000000067F000040020000E0000000DC0000__000000574B7FF240 000000067F000040020000E0000000DBC000-000000067F000040020000E0000000DC0000__00000073AD3FE6B8 000000067F000040020000E0000000DBC000-000000067F000040020000E0000000DC0000__000000914E3F38F0 000000067F000040020000E0000000DBC000-000000067F000040020000E0000000DC0000__000000931B9A2710 000000067F000040020000E0000000DC0000-000000067F000040020000E0000000DC4000__000000574B7FF240 000000067F000040020000E0000000DC0000-000000067F000040020000E0000000DC4000__00000073AD3FE6B8 000000067F000040020000E0000000DC0000-000000067F000040020000E0000000DC4000__000000914E3F38F0 000000067F000040020000E0000000DC0000-000000067F000040020000E0000000DC4000__000000931B9A2710 000000067F000040020000E0000000DC28E0-000000067F000040020000E0000000DCB2CC__00000053CDCFF331-000000547D77D8A1 000000067F000040020000E0000000DC4000-000000067F000040020000E0000000DC8000__000000574B7FF240 000000067F000040020000E0000000DC4000-000000067F000040020000E0000000DC8000__00000073AD3FE6B8 000000067F000040020000E0000000DC4000-000000067F000040020000E0000000DC8000__000000914E3F38F0 000000067F000040020000E0000000DC4000-000000067F000040020000E0000000DC8000__000000931B9A2710 000000067F000040020000E0000000DC8000-000000067F000040020000E0000000DCC000__000000574B7FF240 000000067F000040020000E0000000DC8000-000000067F000040020000E0000000DCC000__00000073AD3FE6B8 000000067F000040020000E0000000DC8000-000000067F000040020000E0000000DCC000__000000914E3F38F0 000000067F000040020000E0000000DC8000-000000067F000040020000E0000000DCC000__000000931B9A2710 000000067F000040020000E0000000DCB2CC-000000067F000040020000E0000000DD3CB0__00000053CDCFF331-000000547D77D8A1 
000000067F000040020000E0000000DCC000-000000067F000040020000E0000000DD0000__000000574B7FF240 000000067F000040020000E0000000DCC000-000000067F000040020000E0000000DD0000__00000073AD3FE6B8 000000067F000040020000E0000000DCC000-000000067F000040020000E0000000DD0000__000000914E3F38F0 000000067F000040020000E0000000DCC000-000000067F000040020000E0000000DD0000__000000931B9A2710 000000067F000040020000E0000000DD0000-000000067F000040020000E0000000DD4000__000000574B7FF240 000000067F000040020000E0000000DD0000-000000067F000040020000E0000000DD4000__00000073AD3FE6B8 000000067F000040020000E0000000DD0000-000000067F000040020000E0000000DD4000__000000914E3F38F0 000000067F000040020000E0000000DD0000-000000067F000040020000E0000000DD4000__000000931B9A2710 000000067F000040020000E0000000DD3CB0-000000067F000040020000E0000000DDC69C__00000053CDCFF331-000000547D77D8A1 000000067F000040020000E0000000DD4000-000000067F000040020000E0000000DD8000__000000574B7FF240 000000067F000040020000E0000000DD4000-000000067F000040020000E0000000DD8000__00000073AD3FE6B8 000000067F000040020000E0000000DD4000-000000067F000040020000E0000000DD8000__000000914E3F38F0 000000067F000040020000E0000000DD4000-000000067F000040020000E0000000DD8000__000000931B9A2710 000000067F000040020000E0000000DD8000-000000067F000040020000E0000000DDC000__000000574B7FF240 000000067F000040020000E0000000DD8000-000000067F000040020000E0000000DDC000__00000073AD3FE6B8 000000067F000040020000E0000000DD8000-000000067F000040020000E0000000DDC000__000000914E3F38F0 000000067F000040020000E0000000DD8000-000000067F000040020000E0000000DDC000__000000931B9A2710 000000067F000040020000E0000000DDC000-000000067F000040020000E0000000DE0000__000000574B7FF240 000000067F000040020000E0000000DDC000-000000067F000040020000E0000000DE0000__00000073AD3FE6B8 000000067F000040020000E0000000DDC000-000000067F000040020000E0000000DE0000__000000914E3F38F0 000000067F000040020000E0000000DDC000-000000067F000040020000E0000000DE0000__000000931B9A2710 
000000067F000040020000E0000000DDC69C-000000067F000040020000E0000000DE5083__00000053CDCFF331-000000547D77D8A1 000000067F000040020000E0000000DE0000-000000067F000040020000E0000000DE4000__000000574B7FF240 000000067F000040020000E0000000DE0000-000000067F000040020000E0000000DE4000__00000073AD3FE6B8 000000067F000040020000E0000000DE0000-000000067F000040020000E0000000DE4000__000000914E3F38F0 000000067F000040020000E0000000DE0000-000000067F000040020000E0000000DE4000__000000931B9A2710 000000067F000040020000E0000000DE4000-000000067F000040020000E0000000DE8000__000000574B7FF240 000000067F000040020000E0000000DE4000-000000067F000040020000E0000000DE8000__00000073AD3FE6B8 000000067F000040020000E0000000DE4000-000000067F000040020000E0000000DE8000__000000914E3F38F0 000000067F000040020000E0000000DE4000-000000067F000040020000E0000000DE8000__000000931B9A2710 000000067F000040020000E0000000DE5083-000000067F000040020000E0000000DEDA64__00000053CDCFF331-000000547D77D8A1 000000067F000040020000E0000000DE8000-000000067F000040020000E0000000DEC000__000000574B7FF240 000000067F000040020000E0000000DE8000-000000067F000040020000E0000000DEC000__00000073AD3FE6B8 000000067F000040020000E0000000DE8000-000000067F000040020000E0000000DEC000__000000914E3F38F0 000000067F000040020000E0000000DE8000-000000067F000040020000E0000000DEC000__000000931B9A2710 000000067F000040020000E0000000DEC000-000000067F000040020000E0000000DF0000__000000572A7A05D8 000000067F000040020000E0000000DEC000-000000067F000040020000E0000000DF0000__0000005D2FFFFB38 000000067F000040020000E0000000DEC000-000000067F000040020000E0000000DF0000__00000073AD3FE6B8 000000067F000040020000E0000000DEC000-000000067F000040020000E0000000DF0000__000000914E3F38F0 000000067F000040020000E0000000DEC000-000000067F000040020000E0000000DF0000__000000931B9A2710 000000067F000040020000E0000000DEDA64-000000067F000040020000E0000100000000__00000053CDCFF331-000000547D77D8A1 000000067F000040020000E0000000DEDD69-000000067F000040020000E0000000DF6741__000000547D77D8A1-000000551D27ECC9 
000000067F000040020000E0000000DF0000-000000067F000040020000E0000000DF4000__000000572A7A05D8 000000067F000040020000E0000000DF0000-000000067F000040020000E0000000DF4000__0000005D2FFFFB38 000000067F000040020000E0000000DF0000-000000067F000040020000E0000000DF4000__00000073AD3FE6B8 000000067F000040020000E0000000DF0000-000000067F000040020000E0000000DF4000__000000914E3F38F0 000000067F000040020000E0000000DF0000-000000067F000040020000E0000000DF4000__000000931B9A2710 000000067F000040020000E0000000DF4000-000000067F000040020000E0000000DF8000__000000572A7A05D8 000000067F000040020000E0000000DF4000-000000067F000040020000E0000000DF8000__0000005D2FFFFB38 000000067F000040020000E0000000DF4000-000000067F000040020000E0000000DF8000__00000073AD3FE6B8 000000067F000040020000E0000000DF4000-000000067F000040020000E0000000DF8000__000000914E3F38F0 000000067F000040020000E0000000DF4000-000000067F000040020000E0000000DF8000__000000931B9A2710 000000067F000040020000E0000000DF6741-000000067F000040020000E0000000DFF11F__000000547D77D8A1-000000551D27ECC9 000000067F000040020000E0000000DF8000-000000067F000040020000E0000000DFC000__000000572A7A05D8 000000067F000040020000E0000000DF8000-000000067F000040020000E0000000DFC000__0000005D2FFFFB38 000000067F000040020000E0000000DF8000-000000067F000040020000E0000000DFC000__00000073AD3FE6B8 000000067F000040020000E0000000DF8000-000000067F000040020000E0000000DFC000__000000914E3F38F0 000000067F000040020000E0000000DF8000-000000067F000040020000E0000000DFC000__000000931B9A2710 000000067F000040020000E0000000DFC000-000000067F000040020000E0000000E00000__000000572A7A05D8 000000067F000040020000E0000000DFC000-000000067F000040020000E0000000E00000__0000005D2FFFFB38 000000067F000040020000E0000000DFC000-000000067F000040020000E0000000E00000__00000073AD3FE6B8 000000067F000040020000E0000000DFC000-000000067F000040020000E0000000E00000__000000914E3F38F0 000000067F000040020000E0000000DFC000-000000067F000040020000E0000000E00000__000000931B9A2710 
000000067F000040020000E0000000DFF11F-000000067F000040020000E0000000E07AED__000000547D77D8A1-000000551D27ECC9 000000067F000040020000E0000000E00000-000000067F000040020000E0000000E04000__000000572A7A05D8 000000067F000040020000E0000000E00000-000000067F000040020000E0000000E04000__0000005D2FFFFB38 000000067F000040020000E0000000E00000-000000067F000040020000E0000000E04000__00000073AD3FE6B8 000000067F000040020000E0000000E00000-000000067F000040020000E0000000E04000__000000914E3F38F0 000000067F000040020000E0000000E00000-000000067F000040020000E0000000E04000__000000931B9A2710 000000067F000040020000E0000000E04000-000000067F000040020000E0000000E08000__000000572A7A05D8 000000067F000040020000E0000000E04000-000000067F000040020000E0000000E08000__0000005D2FFFFB38 000000067F000040020000E0000000E04000-000000067F000040020000E0000000E08000__00000073AD3FE6B8 000000067F000040020000E0000000E04000-000000067F000040020000E0000000E08000__000000914E3F38F0 000000067F000040020000E0000000E04000-000000067F000040020000E0000000E08000__000000931B9A2710 000000067F000040020000E0000000E07AED-000000067F000040020000E0000000E104CE__000000547D77D8A1-000000551D27ECC9 000000067F000040020000E0000000E08000-000000067F000040020000E0000000E0C000__000000572A7A05D8 000000067F000040020000E0000000E08000-000000067F000040020000E0000000E0C000__0000005D2FFFFB38 000000067F000040020000E0000000E08000-000000067F000040020000E0000000E0C000__00000073AD3FE6B8 000000067F000040020000E0000000E08000-000000067F000040020000E0000000E0C000__000000914E3F38F0 000000067F000040020000E0000000E08000-000000067F000040020000E0000000E0C000__000000931B9A2710 000000067F000040020000E0000000E0C000-000000067F000040020000E0000000E10000__000000572A7A05D8 000000067F000040020000E0000000E0C000-000000067F000040020000E0000000E10000__0000005D2FFFFB38 000000067F000040020000E0000000E0C000-000000067F000040020000E0000000E10000__00000073AD3FE6B8 000000067F000040020000E0000000E0C000-000000067F000040020000E0000000E10000__000000914E3F38F0 
000000067F000040020000E0000000E0C000-000000067F000040020000E0000000E10000__000000931B9A2710 000000067F000040020000E0000000E10000-000000067F000040020000E0000000E14000__000000572A7A05D8 000000067F000040020000E0000000E10000-000000067F000040020000E0000000E14000__0000005D2FFFFB38 000000067F000040020000E0000000E10000-000000067F000040020000E0000000E14000__00000073AD3FE6B8 000000067F000040020000E0000000E10000-000000067F000040020000E0000000E14000__000000914E3F38F0 000000067F000040020000E0000000E10000-000000067F000040020000E0000000E14000__000000931B9A2710 000000067F000040020000E0000000E104CE-000000067F000040020000E0000000E18EAE__000000547D77D8A1-000000551D27ECC9 000000067F000040020000E0000000E14000-000000067F000040020000E0000000E18000__000000572A7A05D8 000000067F000040020000E0000000E14000-000000067F000040020000E0000000E18000__0000005D2FFFFB38 000000067F000040020000E0000000E14000-000000067F000040020000E0000000E18000__00000073AD3FE6B8 000000067F000040020000E0000000E14000-000000067F000040020000E0000000E18000__000000914E3F38F0 000000067F000040020000E0000000E14000-000000067F000040020000E0000000E18000__000000931B9A2710 000000067F000040020000E0000000E18000-000000067F000040020000E0000000E1C000__000000572A7A05D8 000000067F000040020000E0000000E18000-000000067F000040020000E0000000E1C000__0000005D2FFFFB38 000000067F000040020000E0000000E18000-000000067F000040020000E0000000E1C000__00000073AD3FE6B8 000000067F000040020000E0000000E18000-000000067F000040020000E0000000E1C000__000000914E3F38F0 000000067F000040020000E0000000E18000-000000067F000040020000E0000000E1C000__000000931B9A2710 000000067F000040020000E0000000E18EAE-000000067F000040020000E0000000E2188E__000000547D77D8A1-000000551D27ECC9 000000067F000040020000E0000000E1C000-000000067F000040020000E0000000E20000__000000572A7A05D8 000000067F000040020000E0000000E1C000-000000067F000040020000E0000000E20000__0000005D2FFFFB38 000000067F000040020000E0000000E1C000-000000067F000040020000E0000000E20000__00000073AD3FE6B8 
000000067F000040020000E0000000E1C000-000000067F000040020000E0000000E20000__000000914E3F38F0 000000067F000040020000E0000000E1C000-000000067F000040020000E0000000E20000__000000931B9A2710 000000067F000040020000E0000000E20000-000000067F000040020000E0000000E24000__000000572A7A05D8 000000067F000040020000E0000000E20000-000000067F000040020000E0000000E24000__0000005D2FFFFB38 000000067F000040020000E0000000E20000-000000067F000040020000E0000000E24000__00000073AD3FE6B8 000000067F000040020000E0000000E20000-000000067F000040020000E0000000E24000__000000914E3F38F0 000000067F000040020000E0000000E20000-000000067F000040020000E0000000E24000__000000931B9A2710 000000067F000040020000E0000000E2188E-000000067F000040020000E0000000E2A276__000000547D77D8A1-000000551D27ECC9 000000067F000040020000E0000000E24000-000000067F000040020000E0000000E28000__000000572A7A05D8 000000067F000040020000E0000000E24000-000000067F000040020000E0000000E28000__0000005D2FFFFB38 000000067F000040020000E0000000E24000-000000067F000040020000E0000000E28000__00000073AD3FE6B8 000000067F000040020000E0000000E24000-000000067F000040020000E0000000E28000__000000914E3F38F0 000000067F000040020000E0000000E24000-000000067F000040020000E0000000E28000__000000931B9A2710 000000067F000040020000E0000000E28000-000000067F000040020000E0000000E2C000__000000572A7A05D8 000000067F000040020000E0000000E28000-000000067F000040020000E0000000E2C000__0000005D2FFFFB38 000000067F000040020000E0000000E28000-000000067F000040020000E0000000E2C000__00000073AD3FE6B8 000000067F000040020000E0000000E28000-000000067F000040020000E0000000E2C000__000000914E3F38F0 000000067F000040020000E0000000E28000-000000067F000040020000E0000000E2C000__000000931B9A2710 000000067F000040020000E0000000E2A276-000000067F000040020000E0000000E32C4B__000000547D77D8A1-000000551D27ECC9 000000067F000040020000E0000000E2C000-000000067F000040020000E0000000E30000__000000572A7A05D8 000000067F000040020000E0000000E2C000-000000067F000040020000E0000000E30000__0000005D2FFFFB38 
000000067F000040020000E0000000E2C000-000000067F000040020000E0000000E30000__00000073AD3FE6B8 000000067F000040020000E0000000E2C000-000000067F000040020000E0000000E30000__000000914E3F38F0 000000067F000040020000E0000000E2C000-000000067F000040020000E0000000E30000__000000931B9A2710 000000067F000040020000E0000000E30000-000000067F000040020000E0000000E34000__000000572A7A05D8 000000067F000040020000E0000000E30000-000000067F000040020000E0000000E34000__0000005D2FFFFB38 000000067F000040020000E0000000E30000-000000067F000040020000E0000000E34000__00000073AD3FE6B8 000000067F000040020000E0000000E30000-000000067F000040020000E0000000E34000__000000914E3F38F0 000000067F000040020000E0000000E30000-000000067F000040020000E0000000E34000__000000931B9A2710 000000067F000040020000E0000000E32C4B-000000067F000040020000E0000000E3B629__000000547D77D8A1-000000551D27ECC9 000000067F000040020000E0000000E34000-000000067F000040020000E0000000E38000__000000572A7A05D8 000000067F000040020000E0000000E34000-000000067F000040020000E0000000E38000__0000005D2FFFFB38 000000067F000040020000E0000000E34000-000000067F000040020000E0000000E38000__00000073AD3FE6B8 000000067F000040020000E0000000E34000-000000067F000040020000E0000000E38000__000000914E3F38F0 000000067F000040020000E0000000E34000-000000067F000040020000E0000000E38000__000000931B9A2710 000000067F000040020000E0000000E38000-000000067F000040020000E0000000E3C000__000000572A7A05D8 000000067F000040020000E0000000E38000-000000067F000040020000E0000000E3C000__0000005D2FFFFB38 000000067F000040020000E0000000E38000-000000067F000040020000E0000000E3C000__00000073AD3FE6B8 000000067F000040020000E0000000E38000-000000067F000040020000E0000000E3C000__000000914E3F38F0 000000067F000040020000E0000000E38000-000000067F000040020000E0000000E3C000__000000931B9A2710 000000067F000040020000E0000000E3B629-000000067F000040020000E0000000E43FF6__000000547D77D8A1-000000551D27ECC9 000000067F000040020000E0000000E3C000-000000067F000040020000E0000000E40000__000000572A7A05D8 
000000067F000040020000E0000000E3C000-000000067F000040020000E0000000E40000__0000005D2FFFFB38 000000067F000040020000E0000000E3C000-000000067F000040020000E0000000E40000__00000073AD3FE6B8 000000067F000040020000E0000000E3C000-000000067F000040020000E0000000E40000__000000914E3F38F0 000000067F000040020000E0000000E3C000-000000067F000040020000E0000000E40000__000000931B9A2710 000000067F000040020000E0000000E40000-000000067F000040020000E0000000E44000__000000572A7A05D8 000000067F000040020000E0000000E40000-000000067F000040020000E0000000E44000__0000005D2FFFFB38 000000067F000040020000E0000000E40000-000000067F000040020000E0000000E44000__00000073AD3FE6B8 000000067F000040020000E0000000E40000-000000067F000040020000E0000000E44000__000000914E3F38F0 000000067F000040020000E0000000E40000-000000067F000040020000E0000000E44000__000000931B9A2710 000000067F000040020000E0000000E43FF6-000000067F000040020000E0000100000000__000000547D77D8A1-000000551D27ECC9 000000067F000040020000E0000000E44000-000000067F000040020000E0000000E48000__00000055ECBFFA00 000000067F000040020000E0000000E44000-000000067F000040020000E0000000E48000__0000005D2FFFFB38 000000067F000040020000E0000000E44000-000000067F000040020000E0000000E48000__00000073AD3FE6B8 000000067F000040020000E0000000E44000-000000067F000040020000E0000000E48000__000000914E3F38F0 000000067F000040020000E0000000E44000-000000067F000040020000E0000000E48000__000000931B9A2710 000000067F000040020000E0000000E442D1-000000067F000040020000E0000000E4CCA9__000000551D27ECC9-00000055BCD7D459 000000067F000040020000E0000000E48000-000000067F000040020000E0000000E4C000__00000055ECBFFA00 000000067F000040020000E0000000E48000-000000067F000040020000E0000000E4C000__0000005D2FFFFB38 000000067F000040020000E0000000E48000-000000067F000040020000E0000000E4C000__00000073AD3FE6B8 000000067F000040020000E0000000E48000-000000067F000040020000E0000000E4C000__000000914E3F38F0 000000067F000040020000E0000000E48000-000000067F000040020000E0000000E4C000__000000931B9A2710 
000000067F000040020000E0000000E4C000-000000067F000040020000E0000000E50000__00000055ECBFFA00 000000067F000040020000E0000000E4C000-000000067F000040020000E0000000E50000__0000005D2FFFFB38 000000067F000040020000E0000000E4C000-000000067F000040020000E0000000E50000__00000073AD3FE6B8 000000067F000040020000E0000000E4C000-000000067F000040020000E0000000E50000__000000914E3F38F0 000000067F000040020000E0000000E4C000-000000067F000040020000E0000000E50000__000000931B9A2710 000000067F000040020000E0000000E4CCA9-000000067F000040020000E0000000E55690__000000551D27ECC9-00000055BCD7D459 000000067F000040020000E0000000E50000-000000067F000040020000E0000000E54000__00000055ECBFFA00 000000067F000040020000E0000000E50000-000000067F000040020000E0000000E54000__0000005D2FFFFB38 000000067F000040020000E0000000E50000-000000067F000040020000E0000000E54000__00000073AD3FE6B8 000000067F000040020000E0000000E50000-000000067F000040020000E0000000E54000__000000914E3F38F0 000000067F000040020000E0000000E50000-000000067F000040020000E0000000E54000__000000931B9A2710 000000067F000040020000E0000000E54000-000000067F000040020000E0000000E58000__00000055ECBFFA00 000000067F000040020000E0000000E54000-000000067F000040020000E0000000E58000__0000005D2FFFFB38 000000067F000040020000E0000000E54000-000000067F000040020000E0000000E58000__00000073AD3FE6B8 000000067F000040020000E0000000E54000-000000067F000040020000E0000000E58000__000000914E3F38F0 000000067F000040020000E0000000E54000-000000067F000040020000E0000000E58000__000000931B9A2710 000000067F000040020000E0000000E55690-000000067F000040020000E0000000E5E072__000000551D27ECC9-00000055BCD7D459 000000067F000040020000E0000000E58000-000000067F000040020000E0000000E5C000__00000055ECBFFA00 000000067F000040020000E0000000E58000-000000067F000040020000E0000000E5C000__0000005D2FFFFB38 000000067F000040020000E0000000E58000-000000067F000040020000E0000000E5C000__00000073AD3FE6B8 000000067F000040020000E0000000E58000-000000067F000040020000E0000000E5C000__000000914E3F38F0 
000000067F000040020000E0000000E58000-000000067F000040020000E0000000E5C000__000000931B9A2710 000000067F000040020000E0000000E5C000-000000067F000040020000E0000000E60000__00000055ECBFFA00 000000067F000040020000E0000000E5C000-000000067F000040020000E0000000E60000__0000005D2FFFFB38 000000067F000040020000E0000000E5C000-000000067F000040020000E0000000E60000__00000073AD3FE6B8 000000067F000040020000E0000000E5C000-000000067F000040020000E0000000E60000__000000914E3F38F0 000000067F000040020000E0000000E5C000-000000067F000040020000E0000000E60000__000000931B9A2710 000000067F000040020000E0000000E5E072-000000067F000040020000E0000000E66A4D__000000551D27ECC9-00000055BCD7D459 000000067F000040020000E0000000E60000-000000067F000040020000E0000000E64000__00000055ECBFFA00 000000067F000040020000E0000000E60000-000000067F000040020000E0000000E64000__0000005D2FFFFB38 000000067F000040020000E0000000E60000-000000067F000040020000E0000000E64000__00000073AD3FE6B8 000000067F000040020000E0000000E60000-000000067F000040020000E0000000E64000__000000914E3F38F0 000000067F000040020000E0000000E60000-000000067F000040020000E0000000E64000__000000931B9A2710 000000067F000040020000E0000000E64000-000000067F000040020000E0000000E68000__00000055ECBFFA00 000000067F000040020000E0000000E64000-000000067F000040020000E0000000E68000__0000005D2FFFFB38 000000067F000040020000E0000000E64000-000000067F000040020000E0000000E68000__00000073AD3FE6B8 000000067F000040020000E0000000E64000-000000067F000040020000E0000000E68000__000000914E3F38F0 000000067F000040020000E0000000E64000-000000067F000040020000E0000000E68000__000000931B9A2710 000000067F000040020000E0000000E66A4D-000000067F000040020000E0000000E6F424__000000551D27ECC9-00000055BCD7D459 000000067F000040020000E0000000E68000-000000067F000040020000E0000000E6C000__00000055ECBFFA00 000000067F000040020000E0000000E68000-000000067F000040020000E0000000E6C000__0000005D2FFFFB38 000000067F000040020000E0000000E68000-000000067F000040020000E0000000E6C000__00000073AD3FE6B8 
000000067F000040020000E0000000E68000-000000067F000040020000E0000000E6C000__000000914E3F38F0 000000067F000040020000E0000000E68000-000000067F000040020000E0000000E6C000__000000931B9A2710 000000067F000040020000E0000000E6C000-000000067F000040020000E0000000E70000__00000055ECBFFA00 000000067F000040020000E0000000E6C000-000000067F000040020000E0000000E70000__0000005D2FFFFB38 000000067F000040020000E0000000E6C000-000000067F000040020000E0000000E70000__00000073AD3FE6B8 000000067F000040020000E0000000E6C000-000000067F000040020000E0000000E70000__000000914E3F38F0 000000067F000040020000E0000000E6C000-000000067F000040020000E0000000E70000__000000931B9A2710 000000067F000040020000E0000000E6F424-000000067F000040020000E0000000E77E01__000000551D27ECC9-00000055BCD7D459 000000067F000040020000E0000000E70000-000000067F000040020000E0000000E74000__00000055ECBFFA00 000000067F000040020000E0000000E70000-000000067F000040020000E0000000E74000__0000005D2FFFFB38 000000067F000040020000E0000000E70000-000000067F000040020000E0000000E74000__00000073AD3FE6B8 000000067F000040020000E0000000E70000-000000067F000040020000E0000000E74000__000000914E3F38F0 000000067F000040020000E0000000E70000-000000067F000040020000E0000000E74000__000000931B9A2710 000000067F000040020000E0000000E74000-000000067F000040020000E0000000E78000__00000055ECBFFA00 000000067F000040020000E0000000E74000-000000067F000040020000E0000000E78000__0000005D2FFFFB38 000000067F000040020000E0000000E74000-000000067F000040020000E0000000E78000__00000073AD3FE6B8 000000067F000040020000E0000000E74000-000000067F000040020000E0000000E78000__000000914E3F38F0 000000067F000040020000E0000000E74000-000000067F000040020000E0000000E78000__000000931B9A2710 000000067F000040020000E0000000E77E01-000000067F000040020000E0000000E807CF__000000551D27ECC9-00000055BCD7D459 000000067F000040020000E0000000E78000-000000067F000040020000E0000000E7C000__00000055ECBFFA00 000000067F000040020000E0000000E78000-000000067F000040020000E0000000E7C000__0000005D2FFFFB38 
000000067F000040020000E0000000E78000-000000067F000040020000E0000000E7C000__00000073AD3FE6B8 000000067F000040020000E0000000E78000-000000067F000040020000E0000000E7C000__000000914E3F38F0 000000067F000040020000E0000000E78000-000000067F000040020000E0000000E7C000__000000931B9A2710 000000067F000040020000E0000000E7C000-000000067F000040020000E0000000E80000__00000055ECBFFA00 000000067F000040020000E0000000E7C000-000000067F000040020000E0000000E80000__0000005D2FFFFB38 000000067F000040020000E0000000E7C000-000000067F000040020000E0000000E80000__00000073AD3FE6B8 000000067F000040020000E0000000E7C000-000000067F000040020000E0000000E80000__000000914E3F38F0 000000067F000040020000E0000000E7C000-000000067F000040020000E0000000E80000__000000931B9A2710 000000067F000040020000E0000000E80000-000000067F000040020000E0000000E84000__00000055ECBFFA00 000000067F000040020000E0000000E80000-000000067F000040020000E0000000E84000__0000005D2FFFFB38 000000067F000040020000E0000000E80000-000000067F000040020000E0000000E84000__00000073AD3FE6B8 000000067F000040020000E0000000E80000-000000067F000040020000E0000000E84000__000000914E3F38F0 000000067F000040020000E0000000E80000-000000067F000040020000E0000000E84000__000000931B9A2710 000000067F000040020000E0000000E807CF-000000067F000040020000E0000000E891B1__000000551D27ECC9-00000055BCD7D459 000000067F000040020000E0000000E84000-000000067F000040020000E0000000E88000__00000055ECBFFA00 000000067F000040020000E0000000E84000-000000067F000040020000E0000000E88000__0000005D2FFFFB38 000000067F000040020000E0000000E84000-000000067F000040020000E0000000E88000__00000073AD3FE6B8 000000067F000040020000E0000000E84000-000000067F000040020000E0000000E88000__000000914E3F38F0 000000067F000040020000E0000000E84000-000000067F000040020000E0000000E88000__000000931B9A2710 000000067F000040020000E0000000E88000-000000067F000040020000E0000000E8C000__00000055ECBFFA00 000000067F000040020000E0000000E88000-000000067F000040020000E0000000E8C000__0000005D2FFFFB38 
000000067F000040020000E0000000E88000-000000067F000040020000E0000000E8C000__00000073AD3FE6B8 000000067F000040020000E0000000E88000-000000067F000040020000E0000000E8C000__000000914E3F38F0 000000067F000040020000E0000000E88000-000000067F000040020000E0000000E8C000__000000931B9A2710 000000067F000040020000E0000000E891B1-000000067F000040020000E0000000E91B9A__000000551D27ECC9-00000055BCD7D459 000000067F000040020000E0000000E8C000-000000067F000040020000E0000000E90000__00000055ECBFFA00 000000067F000040020000E0000000E8C000-000000067F000040020000E0000000E90000__0000005D2FFFFB38 000000067F000040020000E0000000E8C000-000000067F000040020000E0000000E90000__00000073AD3FE6B8 000000067F000040020000E0000000E8C000-000000067F000040020000E0000000E90000__000000914E3F38F0 000000067F000040020000E0000000E8C000-000000067F000040020000E0000000E90000__000000931B9A2710 000000067F000040020000E0000000E90000-000000067F000040020000E0000000E94000__00000055ECBFFA00 000000067F000040020000E0000000E90000-000000067F000040020000E0000000E94000__0000005D2FFFFB38 000000067F000040020000E0000000E90000-000000067F000040020000E0000000E94000__00000073AD3FE6B8 000000067F000040020000E0000000E90000-000000067F000040020000E0000000E94000__000000914E3F38F0 000000067F000040020000E0000000E90000-000000067F000040020000E0000000E94000__000000931B9A2710 000000067F000040020000E0000000E91B9A-000000067F000040020000E0000000E9A57C__000000551D27ECC9-00000055BCD7D459 000000067F000040020000E0000000E94000-000000067F000040020000E0000000E98000__00000055ECBFFA00 000000067F000040020000E0000000E94000-000000067F000040020000E0000000E98000__0000005D2FFFFB38 000000067F000040020000E0000000E94000-000000067F000040020000E0000000E98000__00000073AD3FE6B8 000000067F000040020000E0000000E94000-000000067F000040020000E0000000E98000__000000914E3F38F0 000000067F000040020000E0000000E94000-000000067F000040020000E0000000E98000__000000931B9A2710 000000067F000040020000E0000000E98000-000000067F000040020000E0000000E9C000__00000055ECBFFA00 
000000067F000040020000E0000000E98000-000000067F000040020000E0000000E9C000__000000574B7FF240 000000067F000040020000E0000000E98000-000000067F000040020000E0000000E9C000__00000073AD3FE6B8 000000067F000040020000E0000000E98000-000000067F000040020000E0000000E9C000__000000914E3F38F0 000000067F000040020000E0000000E98000-000000067F000040020000E0000000E9C000__000000931B9A2710 000000067F000040020000E0000000E9A57C-000000067F000040020000E0000100000000__000000551D27ECC9-00000055BCD7D459 000000067F000040020000E0000000E9A850-000000067F000040020000E0000000EA322A__00000055BCD7D459-000000565C87E419 000000067F000040020000E0000000E9C000-000000067F000040020000E0000000EA0000__00000055ECBFFA00 000000067F000040020000E0000000E9C000-000000067F000040020000E0000000EA0000__000000574B7FF240 000000067F000040020000E0000000E9C000-000000067F000040020000E0000000EA0000__00000073AD3FE6B8 000000067F000040020000E0000000E9C000-000000067F000040020000E0000000EA0000__000000914E3F38F0 000000067F000040020000E0000000E9C000-000000067F000040020000E0000000EA0000__000000931B9A2710 000000067F000040020000E0000000EA0000-000000067F000040020000E0000000EA4000__00000055ECBFFA00 000000067F000040020000E0000000EA0000-000000067F000040020000E0000000EA4000__000000574B7FF240 000000067F000040020000E0000000EA0000-000000067F000040020000E0000000EA4000__00000073AD3FE6B8 000000067F000040020000E0000000EA0000-000000067F000040020000E0000000EA4000__000000914E3F38F0 000000067F000040020000E0000000EA0000-000000067F000040020000E0000000EA4000__000000931B9A2710 000000067F000040020000E0000000EA322A-000000067F000040020000E0000000EABBFA__00000055BCD7D459-000000565C87E419 000000067F000040020000E0000000EA4000-000000067F000040020000E0000000EA8000__00000055ECBFFA00 000000067F000040020000E0000000EA4000-000000067F000040020000E0000000EA8000__000000574B7FF240 000000067F000040020000E0000000EA4000-000000067F000040020000E0000000EA8000__00000073AD3FE6B8 000000067F000040020000E0000000EA4000-000000067F000040020000E0000000EA8000__000000914E3F38F0 
000000067F000040020000E0000000EA4000-000000067F000040020000E0000000EA8000__000000931B9A2710 000000067F000040020000E0000000EA8000-000000067F000040020000E0000000EAC000__00000055ECBFFA00 000000067F000040020000E0000000EA8000-000000067F000040020000E0000000EAC000__000000574B7FF240 000000067F000040020000E0000000EA8000-000000067F000040020000E0000000EAC000__00000073AD3FE6B8 000000067F000040020000E0000000EA8000-000000067F000040020000E0000000EAC000__000000914E3F38F0 000000067F000040020000E0000000EA8000-000000067F000040020000E0000000EAC000__000000931B9A2710 000000067F000040020000E0000000EABBFA-000000067F000040020000E0000000EB45E8__00000055BCD7D459-000000565C87E419 000000067F000040020000E0000000EAC000-000000067F000040020000E0000000EB0000__00000055ECBFFA00 000000067F000040020000E0000000EAC000-000000067F000040020000E0000000EB0000__000000574B7FF240 000000067F000040020000E0000000EAC000-000000067F000040020000E0000000EB0000__00000073AD3FE6B8 000000067F000040020000E0000000EAC000-000000067F000040020000E0000000EB0000__000000914E3F38F0 000000067F000040020000E0000000EAC000-000000067F000040020000E0000000EB0000__000000931B9A2710 000000067F000040020000E0000000EB0000-000000067F000040020000E0000000EB4000__00000055ECBFFA00 000000067F000040020000E0000000EB0000-000000067F000040020000E0000000EB4000__000000574B7FF240 000000067F000040020000E0000000EB0000-000000067F000040020000E0000000EB4000__00000073AD3FE6B8 000000067F000040020000E0000000EB0000-000000067F000040020000E0000000EB4000__000000914E3F38F0 000000067F000040020000E0000000EB0000-000000067F000040020000E0000000EB4000__000000931B9A2710 000000067F000040020000E0000000EB4000-000000067F000040020000E0000000EB8000__000000574B7FF240 000000067F000040020000E0000000EB4000-000000067F000040020000E0000000EB8000__00000073AD3FE6B8 000000067F000040020000E0000000EB4000-000000067F000040020000E0000000EB8000__000000914E3F38F0 000000067F000040020000E0000000EB4000-000000067F000040020000E0000000EB8000__000000931B9A2710 
000000067F000040020000E0000000EB4000-030000000000000000000000000000000002__00000055ECBFFA00 000000067F000040020000E0000000EB45E8-000000067F000040020000E0000000EBCFC3__00000055BCD7D459-000000565C87E419 000000067F000040020000E0000000EB8000-000000067F000040020000E0000000EBC000__000000574B7FF240 000000067F000040020000E0000000EB8000-000000067F000040020000E0000000EBC000__00000073AD3FE6B8 000000067F000040020000E0000000EB8000-000000067F000040020000E0000000EBC000__000000914E3F38F0 000000067F000040020000E0000000EB8000-000000067F000040020000E0000000EBC000__000000931B9A2710 000000067F000040020000E0000000EBC000-000000067F000040020000E0000000EC0000__000000574B7FF240 000000067F000040020000E0000000EBC000-000000067F000040020000E0000000EC0000__00000073AD3FE6B8 000000067F000040020000E0000000EBC000-000000067F000040020000E0000000EC0000__000000914E3F38F0 000000067F000040020000E0000000EBC000-000000067F000040020000E0000000EC0000__000000931B9A2710 000000067F000040020000E0000000EBCFC3-000000067F000040020000E0000000EC599B__00000055BCD7D459-000000565C87E419 000000067F000040020000E0000000EC0000-000000067F000040020000E0000000EC4000__000000574B7FF240 000000067F000040020000E0000000EC0000-000000067F000040020000E0000000EC4000__00000073AD3FE6B8 000000067F000040020000E0000000EC0000-000000067F000040020000E0000000EC4000__000000914E3F38F0 000000067F000040020000E0000000EC0000-000000067F000040020000E0000000EC4000__000000931B9A2710 000000067F000040020000E0000000EC4000-000000067F000040020000E0000000EC8000__000000574B7FF240 000000067F000040020000E0000000EC4000-000000067F000040020000E0000000EC8000__00000073AD3FE6B8 000000067F000040020000E0000000EC4000-000000067F000040020000E0000000EC8000__000000914E3F38F0 000000067F000040020000E0000000EC4000-000000067F000040020000E0000000EC8000__000000931B9A2710 000000067F000040020000E0000000EC599B-000000067F000040020000E0000000ECE381__00000055BCD7D459-000000565C87E419 000000067F000040020000E0000000EC8000-000000067F000040020000E0000000ECC000__000000574B7FF240 
000000067F000040020000E0000000EC8000-000000067F000040020000E0000000ECC000__00000073AD3FE6B8 000000067F000040020000E0000000EC8000-000000067F000040020000E0000000ECC000__000000914E3F38F0 000000067F000040020000E0000000EC8000-000000067F000040020000E0000000ECC000__000000931B9A2710 000000067F000040020000E0000000ECC000-000000067F000040020000E0000000ED0000__000000574B7FF240 000000067F000040020000E0000000ECC000-000000067F000040020000E0000000ED0000__00000073AD3FE6B8 000000067F000040020000E0000000ECC000-000000067F000040020000E0000000ED0000__000000914E3F38F0 000000067F000040020000E0000000ECC000-000000067F000040020000E0000000ED0000__000000931B9A2710 000000067F000040020000E0000000ECE381-000000067F000040020000E0000000ED6D60__00000055BCD7D459-000000565C87E419 000000067F000040020000E0000000ED0000-000000067F000040020000E0000000ED4000__000000574B7FF240 000000067F000040020000E0000000ED0000-000000067F000040020000E0000000ED4000__00000073AD3FE6B8 000000067F000040020000E0000000ED0000-000000067F000040020000E0000000ED4000__000000914E3F38F0 000000067F000040020000E0000000ED0000-000000067F000040020000E0000000ED4000__000000931B9A2710 000000067F000040020000E0000000ED4000-000000067F000040020000E0000000ED8000__000000574B7FF240 000000067F000040020000E0000000ED4000-000000067F000040020000E0000000ED8000__00000073AD3FE6B8 000000067F000040020000E0000000ED4000-000000067F000040020000E0000000ED8000__000000914E3F38F0 000000067F000040020000E0000000ED4000-000000067F000040020000E0000000ED8000__000000931B9A2710 000000067F000040020000E0000000ED6D60-000000067F000040020000E0000000EDF740__00000055BCD7D459-000000565C87E419 000000067F000040020000E0000000ED8000-000000067F000040020000E0000000EDC000__000000574B7FF240 000000067F000040020000E0000000ED8000-000000067F000040020000E0000000EDC000__00000073AD3FE6B8 000000067F000040020000E0000000ED8000-000000067F000040020000E0000000EDC000__000000914E3F38F0 000000067F000040020000E0000000ED8000-000000067F000040020000E0000000EDC000__000000931B9A2710 
000000067F000040020000E0000000EDC000-000000067F000040020000E0000000EE0000__000000574B7FF240 000000067F000040020000E0000000EDC000-000000067F000040020000E0000000EE0000__00000073AD3FE6B8 000000067F000040020000E0000000EDC000-000000067F000040020000E0000000EE0000__000000914E3F38F0 000000067F000040020000E0000000EDC000-000000067F000040020000E0000000EE0000__000000931B9A2710 000000067F000040020000E0000000EDF740-000000067F000040020000E0000000EE8111__00000055BCD7D459-000000565C87E419 000000067F000040020000E0000000EE0000-000000067F000040020000E0000000EE4000__000000574B7FF240 000000067F000040020000E0000000EE0000-000000067F000040020000E0000000EE4000__00000073AD3FE6B8 000000067F000040020000E0000000EE0000-000000067F000040020000E0000000EE4000__000000914E3F38F0 000000067F000040020000E0000000EE0000-000000067F000040020000E0000000EE4000__000000931B9A2710 000000067F000040020000E0000000EE4000-000000067F000040020000E0000000EE8000__000000574B7FF240 000000067F000040020000E0000000EE4000-000000067F000040020000E0000000EE8000__00000073AD3FE6B8 000000067F000040020000E0000000EE4000-000000067F000040020000E0000000EE8000__000000914E3F38F0 000000067F000040020000E0000000EE4000-000000067F000040020000E0000000EE8000__000000931B9A2710 000000067F000040020000E0000000EE8000-000000067F000040020000E0000000EEC000__000000574B7FF240 000000067F000040020000E0000000EE8000-000000067F000040020000E0000000EEC000__00000073AD3FE6B8 000000067F000040020000E0000000EE8000-000000067F000040020000E0000000EEC000__000000914E3F38F0 000000067F000040020000E0000000EE8000-000000067F000040020000E0000000EEC000__000000931B9A2710 000000067F000040020000E0000000EE8111-000000067F000040020000E0000000EF0AF9__00000055BCD7D459-000000565C87E419 000000067F000040020000E0000000EEC000-000000067F000040020000E0000000EF0000__000000574B7FF240 000000067F000040020000E0000000EEC000-000000067F000040020000E0000000EF0000__00000073AD3FE6B8 000000067F000040020000E0000000EEC000-000000067F000040020000E0000000EF0000__000000914E3F38F0 
000000067F000040020000E0000000EEC000-000000067F000040020000E0000000EF0000__000000931B9A2710 000000067F000040020000E0000000EF0000-000000067F000040020000E0000000EF4000__000000572A7A05D8 000000067F000040020000E0000000EF0000-000000067F000040020000E0000000EF4000__0000005D2FFFFB38 000000067F000040020000E0000000EF0000-000000067F000040020000E0000000EF4000__00000073AD3FE6B8 000000067F000040020000E0000000EF0000-000000067F000040020000E0000000EF4000__000000914E3F38F0 000000067F000040020000E0000000EF0000-000000067F000040020000E0000000EF4000__000000931B9A2710 000000067F000040020000E0000000EF0AF9-000000067F000040020000E0000100000000__00000055BCD7D459-000000565C87E419 000000067F000040020000E0000000EF0DBC-000000067F000040020000E0000000EF979A__000000565C87E419-00000056FC37F3D9 000000067F000040020000E0000000EF4000-000000067F000040020000E0000000EF8000__000000572A7A05D8 000000067F000040020000E0000000EF4000-000000067F000040020000E0000000EF8000__0000005D2FFFFB38 000000067F000040020000E0000000EF4000-000000067F000040020000E0000000EF8000__00000073AD3FE6B8 000000067F000040020000E0000000EF4000-000000067F000040020000E0000000EF8000__000000914E3F38F0 000000067F000040020000E0000000EF4000-000000067F000040020000E0000000EF8000__000000931B9A2710 000000067F000040020000E0000000EF8000-000000067F000040020000E0000000EFC000__000000572A7A05D8 000000067F000040020000E0000000EF8000-000000067F000040020000E0000000EFC000__0000005D2FFFFB38 000000067F000040020000E0000000EF8000-000000067F000040020000E0000000EFC000__00000073AD3FE6B8 000000067F000040020000E0000000EF8000-000000067F000040020000E0000000EFC000__000000914E3F38F0 000000067F000040020000E0000000EF8000-000000067F000040020000E0000000EFC000__000000931B9A2710 000000067F000040020000E0000000EF979A-000000067F000040020000E0000000F02175__000000565C87E419-00000056FC37F3D9 000000067F000040020000E0000000EFC000-000000067F000040020000E0000000F00000__000000572A7A05D8 000000067F000040020000E0000000EFC000-000000067F000040020000E0000000F00000__0000005D2FFFFB38 
000000067F000040020000E0000000EFC000-000000067F000040020000E0000000F00000__00000073AD3FE6B8 000000067F000040020000E0000000EFC000-000000067F000040020000E0000000F00000__000000914E3F38F0 000000067F000040020000E0000000EFC000-000000067F000040020000E0000000F00000__000000931B9A2710 000000067F000040020000E0000000F00000-000000067F000040020000E0000000F04000__000000572A7A05D8 000000067F000040020000E0000000F00000-000000067F000040020000E0000000F04000__0000005D2FFFFB38 000000067F000040020000E0000000F00000-000000067F000040020000E0000000F04000__00000073AD3FE6B8 000000067F000040020000E0000000F00000-000000067F000040020000E0000000F04000__000000914E3F38F0 000000067F000040020000E0000000F00000-000000067F000040020000E0000000F04000__000000931B9A2710 000000067F000040020000E0000000F02175-000000067F000040020000E0000000F0AB56__000000565C87E419-00000056FC37F3D9 000000067F000040020000E0000000F04000-000000067F000040020000E0000000F08000__000000572A7A05D8 000000067F000040020000E0000000F04000-000000067F000040020000E0000000F08000__0000005D2FFFFB38 000000067F000040020000E0000000F04000-000000067F000040020000E0000000F08000__00000073AD3FE6B8 000000067F000040020000E0000000F04000-000000067F000040020000E0000000F08000__000000914E3F38F0 000000067F000040020000E0000000F04000-000000067F000040020000E0000000F08000__000000931B9A2710 000000067F000040020000E0000000F08000-000000067F000040020000E0000000F0C000__000000572A7A05D8 000000067F000040020000E0000000F08000-000000067F000040020000E0000000F0C000__0000005D2FFFFB38 000000067F000040020000E0000000F08000-000000067F000040020000E0000000F0C000__00000073AD3FE6B8 000000067F000040020000E0000000F08000-000000067F000040020000E0000000F0C000__000000914E3F38F0 000000067F000040020000E0000000F08000-000000067F000040020000E0000000F0C000__000000931B9A2710 000000067F000040020000E0000000F0AB56-000000067F000040020000E0000000F1352C__000000565C87E419-00000056FC37F3D9 000000067F000040020000E0000000F0C000-000000067F000040020000E0000000F10000__000000572A7A05D8 
000000067F000040020000E0000000F0C000-000000067F000040020000E0000000F10000__0000005D2FFFFB38 000000067F000040020000E0000000F0C000-000000067F000040020000E0000000F10000__00000073AD3FE6B8 000000067F000040020000E0000000F0C000-000000067F000040020000E0000000F10000__000000914E3F38F0 000000067F000040020000E0000000F0C000-000000067F000040020000E0000000F10000__000000931B9A2710 000000067F000040020000E0000000F10000-000000067F000040020000E0000000F14000__000000572A7A05D8 000000067F000040020000E0000000F10000-000000067F000040020000E0000000F14000__0000005D2FFFFB38 000000067F000040020000E0000000F10000-000000067F000040020000E0000000F14000__00000073AD3FE6B8 000000067F000040020000E0000000F10000-000000067F000040020000E0000000F14000__000000914E3F38F0 000000067F000040020000E0000000F10000-000000067F000040020000E0000000F14000__000000931B9A2710 000000067F000040020000E0000000F1352C-000000067F000040020000E0000000F1BF05__000000565C87E419-00000056FC37F3D9 000000067F000040020000E0000000F14000-000000067F000040020000E0000000F18000__000000572A7A05D8 000000067F000040020000E0000000F14000-000000067F000040020000E0000000F18000__0000005D2FFFFB38 000000067F000040020000E0000000F14000-000000067F000040020000E0000000F18000__00000073AD3FE6B8 000000067F000040020000E0000000F14000-000000067F000040020000E0000000F18000__000000914E3F38F0 000000067F000040020000E0000000F14000-000000067F000040020000E0000000F18000__000000931B9A2710 000000067F000040020000E0000000F18000-000000067F000040020000E0000000F1C000__000000572A7A05D8 000000067F000040020000E0000000F18000-000000067F000040020000E0000000F1C000__0000005D2FFFFB38 000000067F000040020000E0000000F18000-000000067F000040020000E0000000F1C000__00000073AD3FE6B8 000000067F000040020000E0000000F18000-000000067F000040020000E0000000F1C000__000000914E3F38F0 000000067F000040020000E0000000F18000-000000067F000040020000E0000000F1C000__000000931B9A2710 000000067F000040020000E0000000F1BF05-000000067F000040020000E0000000F248DB__000000565C87E419-00000056FC37F3D9 
000000067F000040020000E0000000F1C000-000000067F000040020000E0000000F20000__000000572A7A05D8 000000067F000040020000E0000000F1C000-000000067F000040020000E0000000F20000__0000005D2FFFFB38 000000067F000040020000E0000000F1C000-000000067F000040020000E0000000F20000__00000073AD3FE6B8 000000067F000040020000E0000000F1C000-000000067F000040020000E0000000F20000__000000914E3F38F0 000000067F000040020000E0000000F1C000-000000067F000040020000E0000000F20000__000000931B9A2710 000000067F000040020000E0000000F20000-000000067F000040020000E0000000F24000__000000572A7A05D8 000000067F000040020000E0000000F20000-000000067F000040020000E0000000F24000__0000005D2FFFFB38 000000067F000040020000E0000000F20000-000000067F000040020000E0000000F24000__00000073AD3FE6B8 000000067F000040020000E0000000F20000-000000067F000040020000E0000000F24000__000000914E3F38F0 000000067F000040020000E0000000F20000-000000067F000040020000E0000000F24000__000000931B9A2710 000000067F000040020000E0000000F24000-000000067F000040020000E0000000F28000__000000572A7A05D8 000000067F000040020000E0000000F24000-000000067F000040020000E0000000F28000__0000005D2FFFFB38 000000067F000040020000E0000000F24000-000000067F000040020000E0000000F28000__00000073AD3FE6B8 000000067F000040020000E0000000F24000-000000067F000040020000E0000000F28000__000000914E3F38F0 000000067F000040020000E0000000F24000-000000067F000040020000E0000000F28000__000000931B9A2710 000000067F000040020000E0000000F248DB-000000067F000040020000E0000000F2D2BA__000000565C87E419-00000056FC37F3D9 000000067F000040020000E0000000F28000-000000067F000040020000E0000000F2C000__000000572A7A05D8 000000067F000040020000E0000000F28000-000000067F000040020000E0000000F2C000__0000005D2FFFFB38 000000067F000040020000E0000000F28000-000000067F000040020000E0000000F2C000__00000073AD3FE6B8 000000067F000040020000E0000000F28000-000000067F000040020000E0000000F2C000__000000914E3F38F0 000000067F000040020000E0000000F28000-000000067F000040020000E0000000F2C000__000000931B9A2710 
000000067F000040020000E0000000F2C000-000000067F000040020000E0000000F30000__000000572A7A05D8 000000067F000040020000E0000000F2C000-000000067F000040020000E0000000F30000__0000005D2FFFFB38 000000067F000040020000E0000000F2C000-000000067F000040020000E0000000F30000__00000073AD3FE6B8 000000067F000040020000E0000000F2C000-000000067F000040020000E0000000F30000__000000914E3F38F0 000000067F000040020000E0000000F2C000-000000067F000040020000E0000000F30000__000000931B9A2710 000000067F000040020000E0000000F2D2BA-000000067F000040020000E0000000F35CA3__000000565C87E419-00000056FC37F3D9 000000067F000040020000E0000000F30000-000000067F000040020000E0000000F34000__000000572A7A05D8 000000067F000040020000E0000000F30000-000000067F000040020000E0000000F34000__0000005D2FFFFB38 000000067F000040020000E0000000F30000-000000067F000040020000E0000000F34000__00000073AD3FE6B8 000000067F000040020000E0000000F30000-000000067F000040020000E0000000F34000__000000914E3F38F0 000000067F000040020000E0000000F30000-000000067F000040020000E0000000F34000__000000931B9A2710 000000067F000040020000E0000000F34000-000000067F000040020000E0000000F38000__000000572A7A05D8 000000067F000040020000E0000000F34000-000000067F000040020000E0000000F38000__0000005D2FFFFB38 000000067F000040020000E0000000F34000-000000067F000040020000E0000000F38000__00000073AD3FE6B8 000000067F000040020000E0000000F34000-000000067F000040020000E0000000F38000__000000914E3F38F0 000000067F000040020000E0000000F34000-000000067F000040020000E0000000F38000__000000931B9A2710 000000067F000040020000E0000000F35CA3-000000067F000040020000E0000000F3E680__000000565C87E419-00000056FC37F3D9 000000067F000040020000E0000000F38000-000000067F000040020000E0000000F3C000__000000572A7A05D8 000000067F000040020000E0000000F38000-000000067F000040020000E0000000F3C000__0000005D2FFFFB38 000000067F000040020000E0000000F38000-000000067F000040020000E0000000F3C000__00000073AD3FE6B8 000000067F000040020000E0000000F38000-000000067F000040020000E0000000F3C000__000000914E3F38F0 
000000067F000040020000E0000000F38000-000000067F000040020000E0000000F3C000__000000931B9A2710 000000067F000040020000E0000000F3C000-000000067F000040020000E0000000F40000__000000572A7A05D8 000000067F000040020000E0000000F3C000-000000067F000040020000E0000000F40000__0000005D2FFFFB38 000000067F000040020000E0000000F3C000-000000067F000040020000E0000000F40000__00000073AD3FE6B8 000000067F000040020000E0000000F3C000-000000067F000040020000E0000000F40000__000000914E3F38F0 000000067F000040020000E0000000F3C000-000000067F000040020000E0000000F40000__000000931B9A2710 000000067F000040020000E0000000F3E680-000000067F000040020000E0000000F4705B__000000565C87E419-00000056FC37F3D9 000000067F000040020000E0000000F40000-000000067F000040020000E0000000F44000__000000572A7A05D8 000000067F000040020000E0000000F40000-000000067F000040020000E0000000F44000__0000005D2FFFFB38 000000067F000040020000E0000000F40000-000000067F000040020000E0000000F44000__00000073AD3FE6B8 000000067F000040020000E0000000F40000-000000067F000040020000E0000000F44000__000000914E3F38F0 000000067F000040020000E0000000F40000-000000067F000040020000E0000000F44000__000000931B9A2710 000000067F000040020000E0000000F44000-000000067F000040020000E0000000F48000__000000572A7A05D8 000000067F000040020000E0000000F44000-000000067F000040020000E0000000F48000__0000005D2FFFFB38 000000067F000040020000E0000000F44000-000000067F000040020000E0000000F48000__00000073AD3FE6B8 000000067F000040020000E0000000F44000-000000067F000040020000E0000000F48000__000000914E3F38F0 000000067F000040020000E0000000F44000-000000067F000040020000E0000000F48000__000000931B9A2710 000000067F000040020000E0000000F4705B-000000067F000040020000E0000100000000__000000565C87E419-00000056FC37F3D9 000000067F000040020000E0000000F48000-000000067F000040020000E0000000F4C000__000000572A7A05D8 000000067F000040020000E0000000F48000-000000067F000040020000E0000000F4C000__0000005D2FFFFB38 000000067F000040020000E0000000F48000-000000067F000040020000E0000000F4C000__00000073AD3FE6B8 
000000067F000040020000E0000000F48000-000000067F000040020000E0000000F4C000__000000914E3F38F0 000000067F000040020000E0000000F48000-000000067F000040020000E0000000F4C000__000000931B9A2710 000000067F000040020000E0000000F4C000-000000067F000040020000E0000000F50000__000000572A7A05D8 000000067F000040020000E0000000F4C000-000000067F000040020000E0000000F50000__0000005D2FFFFB38 000000067F000040020000E0000000F4C000-000000067F000040020000E0000000F50000__00000073AD3FE6B8 000000067F000040020000E0000000F4C000-000000067F000040020000E0000000F50000__000000914E3F38F0 000000067F000040020000E0000000F4C000-000000067F000040020000E0000000F50000__000000931B9A2710 000000067F000040020000E0000000F4FCF9-000000067F000040020000E0000000F586CE__00000056FC37F3D9-000000572A7B4CD9 000000067F000040020000E0000000F50000-000000067F000040020000E0000000F54000__000000572A7A05D8 000000067F000040020000E0000000F50000-000000067F000040020000E0000000F54000__0000005D2FFFFB38 000000067F000040020000E0000000F50000-000000067F000040020000E0000000F54000__00000073AD3FE6B8 000000067F000040020000E0000000F50000-000000067F000040020000E0000000F54000__000000914E3F38F0 000000067F000040020000E0000000F50000-000000067F000040020000E0000000F54000__000000931B9A2710 000000067F000040020000E0000000F54000-000000067F000040020000E0000000F58000__000000572A7A05D8 000000067F000040020000E0000000F54000-000000067F000040020000E0000000F58000__0000005D2FFFFB38 000000067F000040020000E0000000F54000-000000067F000040020000E0000000F58000__00000073AD3FE6B8 000000067F000040020000E0000000F54000-000000067F000040020000E0000000F58000__000000914E3F38F0 000000067F000040020000E0000000F54000-000000067F000040020000E0000000F58000__000000931B9A2710 000000067F000040020000E0000000F58000-000000067F000040020000E0000000F5C000__000000572A7A05D8 000000067F000040020000E0000000F58000-000000067F000040020000E0000000F5C000__0000005D2FFFFB38 000000067F000040020000E0000000F58000-000000067F000040020000E0000000F5C000__00000073AD3FE6B8 
000000067F000040020000E0000000F58000-000000067F000040020000E0000000F5C000__000000914E3F38F0 000000067F000040020000E0000000F58000-000000067F000040020000E0000000F5C000__000000931B9A2710 000000067F000040020000E0000000F586CE-030000000000000000000000000000000002__00000056FC37F3D9-000000572A7B4CD9 000000067F000040020000E0000000F5C000-000000067F000040020000E0000000F60000__000000572A7A05D8 000000067F000040020000E0000000F5C000-000000067F000040020000E0000000F60000__0000005D2FFFFB38 000000067F000040020000E0000000F5C000-000000067F000040020000E0000000F60000__00000073AD3FE6B8 000000067F000040020000E0000000F5C000-000000067F000040020000E0000000F60000__000000914E3F38F0 000000067F000040020000E0000000F5C000-000000067F000040020000E0000000F60000__000000931B9A2710 000000067F000040020000E0000000F60000-000000067F000040020000E0050100000000__0000005D2FFFFB38 000000067F000040020000E0000000F60000-000000067F000040020000E0050100000000__00000073AD3FE6B8 000000067F000040020000E0000000F60000-000000067F000040020000E0050100000000__000000914E3F38F0 000000067F000040020000E0000000F60000-000000067F000040020000E0050100000000__000000931B9A2710 000000067F000040020000E0000000F60000-030000000000000000000000000000000002__000000572A7A05D8 000000067F000040020000E00000FFFFFFFF-000000067F000040020000E0000100000000__0000003B6A101880-00000043C5DDFE18 000000067F000040020000E00000FFFFFFFF-000000067F000040020000E0000100000000__00000043C5DDFE18-00000047441DEA39 000000067F00004002000100000000000000-000000067F00004002000100000000004000__0000005D2FFFFB38 000000067F00004002000100000000000000-000000067F00004002000100000000004000__00000073AD3FE6B8 000000067F00004002000100000000000000-000000067F00004002000100000000004000__000000914E3F38F0 000000067F00004002000100000000000000-000000067F00004002000100000000004000__000000931B9A2710 000000067F00004002000100000000004000-000000067F00004002000100000000008000__0000005D2FFFFB38 000000067F00004002000100000000004000-000000067F00004002000100000000008000__00000073AD3FE6B8 
000000067F00004002000100000000004000-000000067F00004002000100000000008000__000000914E3F38F0 000000067F00004002000100000000004000-000000067F00004002000100000000008000__000000931B9A2710 000000067F00004002000100000000008000-000000067F0000400200010000000000C000__0000005D2FFFFB38 000000067F00004002000100000000008000-000000067F0000400200010000000000C000__00000073AD3FE6B8 000000067F00004002000100000000008000-000000067F0000400200010000000000C000__000000914E3F38F0 000000067F00004002000100000000008000-000000067F0000400200010000000000C000__000000931B9A2710 000000067F0000400200010000000000899C-000000067F0000400200010000000001137C__000000572A7C74A1-0000005CA7BBD6F9 000000067F0000400200010000000000C000-000000067F00004002000100000000010000__0000005D2FFFFB38 000000067F0000400200010000000000C000-000000067F00004002000100000000010000__00000073AD3FE6B8 000000067F0000400200010000000000C000-000000067F00004002000100000000010000__000000914E3F38F0 000000067F0000400200010000000000C000-000000067F00004002000100000000010000__000000931B9A2710 000000067F00004002000100000000010000-000000067F00004002000100000000014000__0000005D2FFFFB38 000000067F00004002000100000000010000-000000067F00004002000100000000014000__00000073AD3FE6B8 000000067F00004002000100000000010000-000000067F00004002000100000000014000__000000914E3F38F0 000000067F00004002000100000000010000-000000067F00004002000100000000014000__000000931B9A2710 000000067F0000400200010000000001137C-000000067F00004002000100000000019D79__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000014000-000000067F00004002000100000000018000__0000005D2FFFFB38 000000067F00004002000100000000014000-000000067F00004002000100000000018000__00000073AD3FE6B8 000000067F00004002000100000000014000-000000067F00004002000100000000018000__000000914E3F38F0 000000067F00004002000100000000014000-000000067F00004002000100000000018000__000000931B9A2710 000000067F00004002000100000000018000-000000067F0000400200010000000001C000__0000005D2FFFFB38 
000000067F00004002000100000000018000-000000067F0000400200010000000001C000__00000073AD3FE6B8 000000067F00004002000100000000018000-000000067F0000400200010000000001C000__000000914E3F38F0 000000067F00004002000100000000018000-000000067F0000400200010000000001C000__000000931B9A2710 000000067F00004002000100000000019D79-000000067F00004002000100000000022776__000000572A7C74A1-0000005CA7BBD6F9 000000067F0000400200010000000001C000-000000067F00004002000100000000020000__0000005D2FFFFB38 000000067F0000400200010000000001C000-000000067F00004002000100000000020000__00000073AD3FE6B8 000000067F0000400200010000000001C000-000000067F00004002000100000000020000__000000914E3F38F0 000000067F0000400200010000000001C000-000000067F00004002000100000000020000__000000931B9A2710 000000067F00004002000100000000020000-000000067F00004002000100000000024000__0000005D2FFFFB38 000000067F00004002000100000000020000-000000067F00004002000100000000024000__00000073AD3FE6B8 000000067F00004002000100000000020000-000000067F00004002000100000000024000__000000914E3F38F0 000000067F00004002000100000000020000-000000067F00004002000100000000024000__000000931B9A2710 000000067F00004002000100000000022776-000000067F0000400200010000000002B15B__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000024000-000000067F00004002000100000000028000__0000005D2FFFFB38 000000067F00004002000100000000024000-000000067F00004002000100000000028000__00000073AD3FE6B8 000000067F00004002000100000000024000-000000067F00004002000100000000028000__000000914E3F38F0 000000067F00004002000100000000024000-000000067F00004002000100000000028000__000000931B9A2710 000000067F00004002000100000000028000-000000067F0000400200010000000002C000__0000005D2FFFFB38 000000067F00004002000100000000028000-000000067F0000400200010000000002C000__00000073AD3FE6B8 000000067F00004002000100000000028000-000000067F0000400200010000000002C000__000000914E3F38F0 000000067F00004002000100000000028000-000000067F0000400200010000000002C000__000000931B9A2710 
000000067F0000400200010000000002B15B-000000067F00004002000100000000033B2F__000000572A7C74A1-0000005CA7BBD6F9 000000067F0000400200010000000002C000-000000067F00004002000100000000030000__0000005D2FFFFB38 000000067F0000400200010000000002C000-000000067F00004002000100000000030000__00000073AD3FE6B8 000000067F0000400200010000000002C000-000000067F00004002000100000000030000__000000914E3F38F0 000000067F0000400200010000000002C000-000000067F00004002000100000000030000__000000931B9A2710 000000067F00004002000100000000030000-000000067F00004002000100000000034000__0000005D2FFFFB38 000000067F00004002000100000000030000-000000067F00004002000100000000034000__00000073AD3FE6B8 000000067F00004002000100000000030000-000000067F00004002000100000000034000__000000914E3F38F0 000000067F00004002000100000000030000-000000067F00004002000100000000034000__000000931B9A2710 000000067F00004002000100000000033B2F-000000067F0000400200010000000003C4EA__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000034000-000000067F00004002000100000000038000__0000005D2FFFFB38 000000067F00004002000100000000034000-000000067F00004002000100000000038000__00000073AD3FE6B8 000000067F00004002000100000000034000-000000067F00004002000100000000038000__000000914E3F38F0 000000067F00004002000100000000034000-000000067F00004002000100000000038000__000000931B9A2710 000000067F00004002000100000000038000-000000067F0000400200010000000003C000__0000005D2FFFFB38 000000067F00004002000100000000038000-000000067F0000400200010000000003C000__00000073AD3FE6B8 000000067F00004002000100000000038000-000000067F0000400200010000000003C000__000000914E3F38F0 000000067F00004002000100000000038000-000000067F0000400200010000000003C000__000000931B9A2710 000000067F0000400200010000000003C000-000000067F00004002000100000000040000__0000005D2FFFFB38 000000067F0000400200010000000003C000-000000067F00004002000100000000040000__00000073AD3FE6B8 000000067F0000400200010000000003C000-000000067F00004002000100000000040000__000000914E3F38F0 
000000067F0000400200010000000003C000-000000067F00004002000100000000040000__000000931B9A2710 000000067F0000400200010000000003C4EA-000000067F00004002000100000000044EA8__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000040000-000000067F00004002000100000000044000__0000005D2FFFFB38 000000067F00004002000100000000040000-000000067F00004002000100000000044000__00000073AD3FE6B8 000000067F00004002000100000000040000-000000067F00004002000100000000044000__000000914E3F38F0 000000067F00004002000100000000040000-000000067F00004002000100000000044000__000000931B9A2710 000000067F00004002000100000000044000-000000067F00004002000100000000048000__0000005D2FFFFB38 000000067F00004002000100000000044000-000000067F00004002000100000000048000__00000073AD3FE6B8 000000067F00004002000100000000044000-000000067F00004002000100000000048000__000000914E3F38F0 000000067F00004002000100000000044000-000000067F00004002000100000000048000__000000931B9A2710 000000067F00004002000100000000044EA8-000000067F0000400200010000000004D890__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000048000-000000067F0000400200010000000004C000__0000005D2FFFFB38 000000067F00004002000100000000048000-000000067F0000400200010000000004C000__00000073AD3FE6B8 000000067F00004002000100000000048000-000000067F0000400200010000000004C000__000000914E3F38F0 000000067F00004002000100000000048000-000000067F0000400200010000000004C000__000000931B9A2710 000000067F0000400200010000000004C000-000000067F00004002000100000000050000__0000005D2FFFFB38 000000067F0000400200010000000004C000-000000067F00004002000100000000050000__00000073AD3FE6B8 000000067F0000400200010000000004C000-000000067F00004002000100000000050000__000000914E3F38F0 000000067F0000400200010000000004C000-000000067F00004002000100000000050000__000000931B9A2710 000000067F0000400200010000000004D890-000000067F00004002000100000000056296__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000050000-000000067F00004002000100000000054000__0000005D2FFFFB38 
000000067F00004002000100000000050000-000000067F00004002000100000000054000__00000073AD3FE6B8 000000067F00004002000100000000050000-000000067F00004002000100000000054000__000000914E3F38F0 000000067F00004002000100000000050000-000000067F00004002000100000000054000__000000931B9A2710 000000067F00004002000100000000054000-000000067F00004002000100000000058000__0000005D2FFFFB38 000000067F00004002000100000000054000-000000067F00004002000100000000058000__00000073AD3FE6B8 000000067F00004002000100000000054000-000000067F00004002000100000000058000__000000914E3F38F0 000000067F00004002000100000000054000-000000067F00004002000100000000058000__000000931B9A2710 000000067F00004002000100000000056296-000000067F0000400200010000000005EC8C__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000058000-000000067F0000400200010000000005C000__0000005D2FFFFB38 000000067F00004002000100000000058000-000000067F0000400200010000000005C000__00000073AD3FE6B8 000000067F00004002000100000000058000-000000067F0000400200010000000005C000__000000914E3F38F0 000000067F00004002000100000000058000-000000067F0000400200010000000005C000__000000931B9A2710 000000067F0000400200010000000005C000-000000067F00004002000100000000060000__0000005D2FFFFB38 000000067F0000400200010000000005C000-000000067F00004002000100000000060000__00000073AD3FE6B8 000000067F0000400200010000000005C000-000000067F00004002000100000000060000__000000914E3F38F0 000000067F0000400200010000000005C000-000000067F00004002000100000000060000__000000931B9A2710 000000067F0000400200010000000005EC8C-000000067F00004002000100000000067682__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000060000-000000067F00004002000100000000064000__0000005D2FFFFB38 000000067F00004002000100000000060000-000000067F00004002000100000000064000__00000073AD3FE6B8 000000067F00004002000100000000060000-000000067F00004002000100000000064000__000000914E3F38F0 000000067F00004002000100000000060000-000000067F00004002000100000000064000__000000931B9A2710 
000000067F00004002000100000000064000-000000067F00004002000100000000068000__0000005D2FFFFB38 000000067F00004002000100000000064000-000000067F00004002000100000000068000__00000073AD3FE6B8 000000067F00004002000100000000064000-000000067F00004002000100000000068000__000000914E3F38F0 000000067F00004002000100000000064000-000000067F00004002000100000000068000__000000931B9A2710 000000067F00004002000100000000067682-000000067F00004002000100000000070046__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000068000-000000067F0000400200010000000006C000__0000005D2FFFFB38 000000067F00004002000100000000068000-000000067F0000400200010000000006C000__00000073AD3FE6B8 000000067F00004002000100000000068000-000000067F0000400200010000000006C000__000000914E3F38F0 000000067F00004002000100000000068000-000000067F0000400200010000000006C000__000000931B9A2710 000000067F0000400200010000000006C000-000000067F00004002000100000000070000__0000005D2FFFFB38 000000067F0000400200010000000006C000-000000067F00004002000100000000070000__00000073AD3FE6B8 000000067F0000400200010000000006C000-000000067F00004002000100000000070000__000000914E3F38F0 000000067F0000400200010000000006C000-000000067F00004002000100000000070000__000000931B9A2710 000000067F00004002000100000000070000-000000067F00004002000100000000074000__0000005D2FFFFB38 000000067F00004002000100000000070000-000000067F00004002000100000000074000__00000073AD3FE6B8 000000067F00004002000100000000070000-000000067F00004002000100000000074000__000000914E3F38F0 000000067F00004002000100000000070000-000000067F00004002000100000000074000__000000931B9A2710 000000067F00004002000100000000070046-000000067F00004002000100000000078A01__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000074000-000000067F00004002000100000000078000__0000005D2FFFFB38 000000067F00004002000100000000074000-000000067F00004002000100000000078000__00000073AD3FE6B8 000000067F00004002000100000000074000-000000067F00004002000100000000078000__000000914E3F38F0 
000000067F00004002000100000000074000-000000067F00004002000100000000078000__000000931B9A2710 000000067F00004002000100000000078000-000000067F0000400200010000000007C000__0000005D2FFFFB38 000000067F00004002000100000000078000-000000067F0000400200010000000007C000__00000073AD3FE6B8 000000067F00004002000100000000078000-000000067F0000400200010000000007C000__000000914E3F38F0 000000067F00004002000100000000078000-000000067F0000400200010000000007C000__000000931B9A2710 000000067F00004002000100000000078A01-000000067F000040020001000000000813B5__000000572A7C74A1-0000005CA7BBD6F9 000000067F0000400200010000000007C000-000000067F00004002000100000000080000__0000005D2FFFFB38 000000067F0000400200010000000007C000-000000067F00004002000100000000080000__00000073AD3FE6B8 000000067F0000400200010000000007C000-000000067F00004002000100000000080000__000000914E3F38F0 000000067F0000400200010000000007C000-000000067F00004002000100000000080000__000000931B9A2710 000000067F00004002000100000000080000-000000067F00004002000100000000084000__0000005D2FFFFB38 000000067F00004002000100000000080000-000000067F00004002000100000000084000__00000073AD3FE6B8 000000067F00004002000100000000080000-000000067F00004002000100000000084000__000000914E3F38F0 000000067F00004002000100000000080000-000000067F00004002000100000000084000__000000931B9A2710 000000067F000040020001000000000813B5-000000067F00004002000100000000089DAC__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000084000-000000067F00004002000100000000088000__0000005D2FFFFB38 000000067F00004002000100000000084000-000000067F00004002000100000000088000__00000073AD3FE6B8 000000067F00004002000100000000084000-000000067F00004002000100000000088000__000000914E3F38F0 000000067F00004002000100000000084000-000000067F00004002000100000000088000__000000931B9A2710 000000067F00004002000100000000088000-000000067F0000400200010000000008C000__0000005D2FFFFB38 000000067F00004002000100000000088000-000000067F0000400200010000000008C000__00000073AD3FE6B8 
000000067F00004002000100000000088000-000000067F0000400200010000000008C000__000000914E3F38F0 000000067F00004002000100000000088000-000000067F0000400200010000000008C000__000000931B9A2710 000000067F00004002000100000000089DAC-000000067F000040020001000000000927AD__000000572A7C74A1-0000005CA7BBD6F9 000000067F0000400200010000000008C000-000000067F00004002000100000000090000__0000005D2FFFFB38 000000067F0000400200010000000008C000-000000067F00004002000100000000090000__00000073AD3FE6B8 000000067F0000400200010000000008C000-000000067F00004002000100000000090000__000000914E3F38F0 000000067F0000400200010000000008C000-000000067F00004002000100000000090000__000000931B9A2710 000000067F00004002000100000000090000-000000067F00004002000100000000094000__0000005D2FFFFB38 000000067F00004002000100000000090000-000000067F00004002000100000000094000__00000073AD3FE6B8 000000067F00004002000100000000090000-000000067F00004002000100000000094000__000000914E3F38F0 000000067F00004002000100000000090000-000000067F00004002000100000000094000__000000931B9A2710 000000067F000040020001000000000927AD-000000067F0000400200010000000009B1A0__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000094000-000000067F00004002000100000000098000__0000005D2FFFFB38 000000067F00004002000100000000094000-000000067F00004002000100000000098000__00000073AD3FE6B8 000000067F00004002000100000000094000-000000067F00004002000100000000098000__000000914E3F38F0 000000067F00004002000100000000094000-000000067F00004002000100000000098000__000000931B9A2710 000000067F00004002000100000000098000-000000067F0000400200010000000009C000__0000005D2FFFFB38 000000067F00004002000100000000098000-000000067F0000400200010000000009C000__00000073AD3FE6B8 000000067F00004002000100000000098000-000000067F0000400200010000000009C000__000000914E3F38F0 000000067F00004002000100000000098000-000000067F0000400200010000000009C000__000000931B9A2710 000000067F0000400200010000000009B1A0-000000067F000040020001000000000A3B86__000000572A7C74A1-0000005CA7BBD6F9 
000000067F0000400200010000000009C000-000000067F000040020001000000000A0000__0000005D2FFFFB38 000000067F0000400200010000000009C000-000000067F000040020001000000000A0000__00000073AD3FE6B8 000000067F0000400200010000000009C000-000000067F000040020001000000000A0000__000000914E3F38F0 000000067F0000400200010000000009C000-000000067F000040020001000000000A0000__000000931B9A2710 000000067F000040020001000000000A0000-000000067F000040020001000000000A4000__0000005D2FFFFB38 000000067F000040020001000000000A0000-000000067F000040020001000000000A4000__00000073AD3FE6B8 000000067F000040020001000000000A0000-000000067F000040020001000000000A4000__000000914E3F38F0 000000067F000040020001000000000A0000-000000067F000040020001000000000A4000__000000931B9A2710 000000067F000040020001000000000A3B86-000000067F000040020001000000000AC549__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000000A4000-000000067F000040020001000000000A8000__0000005D2FFFFB38 000000067F000040020001000000000A4000-000000067F000040020001000000000A8000__00000073AD3FE6B8 000000067F000040020001000000000A4000-000000067F000040020001000000000A8000__000000914E3F38F0 000000067F000040020001000000000A4000-000000067F000040020001000000000A8000__000000931B9A2710 000000067F000040020001000000000A8000-000000067F000040020001000000000AC000__0000005D2FFFFB38 000000067F000040020001000000000A8000-000000067F000040020001000000000AC000__00000073AD3FE6B8 000000067F000040020001000000000A8000-000000067F000040020001000000000AC000__000000914E3F38F0 000000067F000040020001000000000A8000-000000067F000040020001000000000AC000__000000931B9A2710 000000067F000040020001000000000AC000-000000067F000040020001000000000B0000__0000005D2FFFFB38 000000067F000040020001000000000AC000-000000067F000040020001000000000B0000__00000073AD3FE6B8 000000067F000040020001000000000AC000-000000067F000040020001000000000B0000__000000914E3F38F0 000000067F000040020001000000000AC000-000000067F000040020001000000000B0000__000000931B9A2710 
000000067F000040020001000000000AC549-000000067F000040020001000000000B4F06__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000000B0000-000000067F000040020001000000000B4000__0000005D2FFFFB38 000000067F000040020001000000000B0000-000000067F000040020001000000000B4000__00000073AD3FE6B8 000000067F000040020001000000000B0000-000000067F000040020001000000000B4000__000000914E3F38F0 000000067F000040020001000000000B0000-000000067F000040020001000000000B4000__000000931B9A2710 000000067F000040020001000000000B4000-000000067F000040020001000000000B8000__0000005D2FFFFB38 000000067F000040020001000000000B4000-000000067F000040020001000000000B8000__00000073AD3FE6B8 000000067F000040020001000000000B4000-000000067F000040020001000000000B8000__000000914E3F38F0 000000067F000040020001000000000B4000-000000067F000040020001000000000B8000__000000931B9A2710 000000067F000040020001000000000B4F06-000000067F000040020001000000000BD8C7__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000000B8000-000000067F000040020001000000000BC000__0000005D2FFFFB38 000000067F000040020001000000000B8000-000000067F000040020001000000000BC000__00000073AD3FE6B8 000000067F000040020001000000000B8000-000000067F000040020001000000000BC000__000000914E3F38F0 000000067F000040020001000000000B8000-000000067F000040020001000000000BC000__000000931B9A2710 000000067F000040020001000000000BC000-000000067F000040020001000000000C0000__0000005D2FFFFB38 000000067F000040020001000000000BC000-000000067F000040020001000000000C0000__00000073AD3FE6B8 000000067F000040020001000000000BC000-000000067F000040020001000000000C0000__000000914E3F38F0 000000067F000040020001000000000BC000-000000067F000040020001000000000C0000__000000931B9A2710 000000067F000040020001000000000BD8C7-000000067F000040020001000000000C62CB__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000000C0000-000000067F000040020001000000000C4000__0000005D2FFFFB38 000000067F000040020001000000000C0000-000000067F000040020001000000000C4000__00000073AD3FE6B8 
000000067F000040020001000000000C0000-000000067F000040020001000000000C4000__000000914E3F38F0 000000067F000040020001000000000C0000-000000067F000040020001000000000C4000__000000931B9A2710 000000067F000040020001000000000C4000-000000067F000040020001000000000C8000__0000005D2FFFFB38 000000067F000040020001000000000C4000-000000067F000040020001000000000C8000__00000073AD3FE6B8 000000067F000040020001000000000C4000-000000067F000040020001000000000C8000__000000914E3F38F0 000000067F000040020001000000000C4000-000000067F000040020001000000000C8000__000000931B9A2710 000000067F000040020001000000000C62CB-000000067F000040020001000000000CECC9__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000000C8000-000000067F000040020001000000000CC000__0000005D2FFFFB38 000000067F000040020001000000000C8000-000000067F000040020001000000000CC000__00000073AD3FE6B8 000000067F000040020001000000000C8000-000000067F000040020001000000000CC000__000000914E3F38F0 000000067F000040020001000000000C8000-000000067F000040020001000000000CC000__000000931B9A2710 000000067F000040020001000000000CC000-000000067F000040020001000000000D0000__0000005D2FFFFB38 000000067F000040020001000000000CC000-000000067F000040020001000000000D0000__00000073AD3FE6B8 000000067F000040020001000000000CC000-000000067F000040020001000000000D0000__000000914E3F38F0 000000067F000040020001000000000CC000-000000067F000040020001000000000D0000__000000931B9A2710 000000067F000040020001000000000CECC9-000000067F000040020001000000000D76B8__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000000D0000-000000067F000040020001000000000D4000__0000005D2FFFFB38 000000067F000040020001000000000D0000-000000067F000040020001000000000D4000__00000073AD3FE6B8 000000067F000040020001000000000D0000-000000067F000040020001000000000D4000__000000914E3F38F0 000000067F000040020001000000000D0000-000000067F000040020001000000000D4000__000000931B9A2710 000000067F000040020001000000000D4000-000000067F000040020001000000000D8000__0000005D2FFFFB38 
000000067F000040020001000000000D4000-000000067F000040020001000000000D8000__00000073AD3FE6B8 000000067F000040020001000000000D4000-000000067F000040020001000000000D8000__000000914E3F38F0 000000067F000040020001000000000D4000-000000067F000040020001000000000D8000__000000931B9A2710 000000067F000040020001000000000D76B8-000000067F000040020001000000000E0094__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000000D8000-000000067F000040020001000000000DC000__0000005D2FFFFB38 000000067F000040020001000000000D8000-000000067F000040020001000000000DC000__00000073AD3FE6B8 000000067F000040020001000000000D8000-000000067F000040020001000000000DC000__000000914E3F38F0 000000067F000040020001000000000D8000-000000067F000040020001000000000DC000__000000931B9A2710 000000067F000040020001000000000DC000-000000067F000040020001000000000E0000__0000005D2FFFFB38 000000067F000040020001000000000DC000-000000067F000040020001000000000E0000__00000073AD3FE6B8 000000067F000040020001000000000DC000-000000067F000040020001000000000E0000__000000914E3F38F0 000000067F000040020001000000000DC000-000000067F000040020001000000000E0000__000000931B9A2710 000000067F000040020001000000000E0000-000000067F000040020001000000000E4000__0000005D2FFFFB38 000000067F000040020001000000000E0000-000000067F000040020001000000000E4000__00000073AD3FE6B8 000000067F000040020001000000000E0000-000000067F000040020001000000000E4000__000000914E3F38F0 000000067F000040020001000000000E0000-000000067F000040020001000000000E4000__000000931B9A2710 000000067F000040020001000000000E0094-000000067F000040020001000000000E8A61__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000000E4000-000000067F000040020001000000000E8000__0000005D2FFFFB38 000000067F000040020001000000000E4000-000000067F000040020001000000000E8000__00000073AD3FE6B8 000000067F000040020001000000000E4000-000000067F000040020001000000000E8000__000000914E3F38F0 000000067F000040020001000000000E4000-000000067F000040020001000000000E8000__000000931B9A2710 
000000067F000040020001000000000E8000-000000067F000040020001000000000EC000__0000005D2FFFFB38 000000067F000040020001000000000E8000-000000067F000040020001000000000EC000__00000073AD3FE6B8 000000067F000040020001000000000E8000-000000067F000040020001000000000EC000__000000914E3F38F0 000000067F000040020001000000000E8000-000000067F000040020001000000000EC000__000000931B9A2710 000000067F000040020001000000000E8A61-000000067F000040020001000000000F1423__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000000EC000-000000067F000040020001000000000F0000__0000005D2FFFFB38 000000067F000040020001000000000EC000-000000067F000040020001000000000F0000__00000073AD3FE6B8 000000067F000040020001000000000EC000-000000067F000040020001000000000F0000__000000914E3F38F0 000000067F000040020001000000000EC000-000000067F000040020001000000000F0000__000000931B9A2710 000000067F000040020001000000000F0000-000000067F000040020001000000000F4000__0000005D2FFFFB38 000000067F000040020001000000000F0000-000000067F000040020001000000000F4000__00000073AD3FE6B8 000000067F000040020001000000000F0000-000000067F000040020001000000000F4000__000000914E3F38F0 000000067F000040020001000000000F0000-000000067F000040020001000000000F4000__000000931B9A2710 000000067F000040020001000000000F1423-000000067F000040020001000000000F9DE5__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000000F4000-000000067F000040020001000000000F8000__0000005D2FFFFB38 000000067F000040020001000000000F4000-000000067F000040020001000000000F8000__00000073AD3FE6B8 000000067F000040020001000000000F4000-000000067F000040020001000000000F8000__000000914E3F38F0 000000067F000040020001000000000F4000-000000067F000040020001000000000F8000__000000931B9A2710 000000067F000040020001000000000F8000-000000067F000040020001000000000FC000__0000005D2FFFFB38 000000067F000040020001000000000F8000-000000067F000040020001000000000FC000__00000073AD3FE6B8 000000067F000040020001000000000F8000-000000067F000040020001000000000FC000__000000914E3F38F0 
000000067F000040020001000000000F8000-000000067F000040020001000000000FC000__000000931B9A2710 000000067F000040020001000000000F9DE5-000000067F000040020001000000001027EC__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000000FC000-000000067F00004002000100000000100000__0000005D2FFFFB38 000000067F000040020001000000000FC000-000000067F00004002000100000000100000__00000073AD3FE6B8 000000067F000040020001000000000FC000-000000067F00004002000100000000100000__000000914E3F38F0 000000067F000040020001000000000FC000-000000067F00004002000100000000100000__000000931B9A2710 000000067F00004002000100000000100000-000000067F00004002000100000000104000__0000005D2FFFFB38 000000067F00004002000100000000100000-000000067F00004002000100000000104000__00000073AD3FE6B8 000000067F00004002000100000000100000-000000067F00004002000100000000104000__000000914E3F38F0 000000067F00004002000100000000100000-000000067F00004002000100000000104000__000000931B9A2710 000000067F000040020001000000001027EC-000000067F0000400200010000000010B1E9__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000104000-000000067F00004002000100000000108000__0000005D2FFFFB38 000000067F00004002000100000000104000-000000067F00004002000100000000108000__00000073AD3FE6B8 000000067F00004002000100000000104000-000000067F00004002000100000000108000__000000914E3F38F0 000000067F00004002000100000000104000-000000067F00004002000100000000108000__000000931B9A2710 000000067F00004002000100000000108000-000000067F0000400200010000000010C000__0000005D2FFFFB38 000000067F00004002000100000000108000-000000067F0000400200010000000010C000__00000073AD3FE6B8 000000067F00004002000100000000108000-000000067F0000400200010000000010C000__000000914E3F38F0 000000067F00004002000100000000108000-000000067F0000400200010000000010C000__000000931B9A2710 000000067F0000400200010000000010B1E9-000000067F00004002000100000000113BDB__000000572A7C74A1-0000005CA7BBD6F9 000000067F0000400200010000000010C000-000000067F00004002000100000000110000__0000005D2FFFFB38 
000000067F0000400200010000000010C000-000000067F00004002000100000000110000__00000073AD3FE6B8 000000067F0000400200010000000010C000-000000067F00004002000100000000110000__000000914E3F38F0 000000067F0000400200010000000010C000-000000067F00004002000100000000110000__000000931B9A2710 000000067F00004002000100000000110000-000000067F00004002000100000000114000__0000005D2FFFFB38 000000067F00004002000100000000110000-000000067F00004002000100000000114000__00000073AD3FE6B8 000000067F00004002000100000000110000-000000067F00004002000100000000114000__000000914E3F38F0 000000067F00004002000100000000110000-000000067F00004002000100000000114000__000000931B9A2710 000000067F00004002000100000000113BDB-000000067F0000400200010000000011C5C3__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000114000-000000067F00004002000100000000118000__0000005D2FFFFB38 000000067F00004002000100000000114000-000000067F00004002000100000000118000__00000073AD3FE6B8 000000067F00004002000100000000114000-000000067F00004002000100000000118000__000000914E3F38F0 000000067F00004002000100000000114000-000000067F00004002000100000000118000__000000931B9A2710 000000067F00004002000100000000118000-000000067F0000400200010000000011C000__0000005D2FFFFB38 000000067F00004002000100000000118000-000000067F0000400200010000000011C000__00000073AD3FE6B8 000000067F00004002000100000000118000-000000067F0000400200010000000011C000__000000914E3F38F0 000000067F00004002000100000000118000-000000067F0000400200010000000011C000__000000931B9A2710 000000067F0000400200010000000011C000-000000067F00004002000100000000120000__0000005D2FFFFB38 000000067F0000400200010000000011C000-000000067F00004002000100000000120000__00000073AD3FE6B8 000000067F0000400200010000000011C000-000000067F00004002000100000000120000__000000914E3F38F0 000000067F0000400200010000000011C000-000000067F00004002000100000000120000__000000931B9A2710 000000067F0000400200010000000011C5C3-000000067F00004002000100000000124F94__000000572A7C74A1-0000005CA7BBD6F9 
000000067F00004002000100000000120000-000000067F00004002000100000000124000__0000005D2FFFFB38 000000067F00004002000100000000120000-000000067F00004002000100000000124000__00000073AD3FE6B8 000000067F00004002000100000000120000-000000067F00004002000100000000124000__000000914E3F38F0 000000067F00004002000100000000120000-000000067F00004002000100000000124000__000000931B9A2710 000000067F00004002000100000000124000-000000067F00004002000100000000128000__0000005D2FFFFB38 000000067F00004002000100000000124000-000000067F00004002000100000000128000__00000073AD3FE6B8 000000067F00004002000100000000124000-000000067F00004002000100000000128000__000000914E3F38F0 000000067F00004002000100000000124000-000000067F00004002000100000000128000__000000931B9A2710 000000067F00004002000100000000124F94-000000067F0000400200010000000012D94F__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000128000-000000067F0000400200010000000012C000__0000005D2FFFFB38 000000067F00004002000100000000128000-000000067F0000400200010000000012C000__00000073AD3FE6B8 000000067F00004002000100000000128000-000000067F0000400200010000000012C000__000000914E3F38F0 000000067F00004002000100000000128000-000000067F0000400200010000000012C000__000000931B9A2710 000000067F0000400200010000000012C000-000000067F00004002000100000000130000__0000005D2FFFFB38 000000067F0000400200010000000012C000-000000067F00004002000100000000130000__00000073AD3FE6B8 000000067F0000400200010000000012C000-000000067F00004002000100000000130000__000000914E3F38F0 000000067F0000400200010000000012C000-000000067F00004002000100000000130000__000000931B9A2710 000000067F0000400200010000000012D94F-000000067F00004002000100000000136318__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000130000-000000067F00004002000100000000134000__0000005D2FFFFB38 000000067F00004002000100000000130000-000000067F00004002000100000000134000__00000073AD3FE6B8 000000067F00004002000100000000130000-000000067F00004002000100000000134000__000000914E3F38F0 
000000067F00004002000100000000130000-000000067F00004002000100000000134000__000000931B9A2710 000000067F00004002000100000000134000-000000067F00004002000100000000138000__0000005D2FFFFB38 000000067F00004002000100000000134000-000000067F00004002000100000000138000__00000073AD3FE6B8 000000067F00004002000100000000134000-000000067F00004002000100000000138000__000000914E3F38F0 000000067F00004002000100000000134000-000000067F00004002000100000000138000__000000931B9A2710 000000067F00004002000100000000136318-000000067F0000400200010000000013ED01__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000138000-000000067F0000400200010000000013C000__0000005D2FFFFB38 000000067F00004002000100000000138000-000000067F0000400200010000000013C000__00000073AD3FE6B8 000000067F00004002000100000000138000-000000067F0000400200010000000013C000__000000914E3F38F0 000000067F00004002000100000000138000-000000067F0000400200010000000013C000__000000931B9A2710 000000067F0000400200010000000013C000-000000067F00004002000100000000140000__0000005D2FFFFB38 000000067F0000400200010000000013C000-000000067F00004002000100000000140000__00000073AD3FE6B8 000000067F0000400200010000000013C000-000000067F00004002000100000000140000__000000914E3F38F0 000000067F0000400200010000000013C000-000000067F00004002000100000000140000__000000931B9A2710 000000067F0000400200010000000013ED01-000000067F000040020001000000001476ED__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000140000-000000067F00004002000100000000144000__0000005D2FFFFB38 000000067F00004002000100000000140000-000000067F00004002000100000000144000__00000073AD3FE6B8 000000067F00004002000100000000140000-000000067F00004002000100000000144000__000000914E3F38F0 000000067F00004002000100000000140000-000000067F00004002000100000000144000__000000931B9A2710 000000067F00004002000100000000144000-000000067F00004002000100000000148000__0000005D2FFFFB38 000000067F00004002000100000000144000-000000067F00004002000100000000148000__00000073AD3FE6B8 
000000067F00004002000100000000144000-000000067F00004002000100000000148000__000000914E3F38F0 000000067F00004002000100000000144000-000000067F00004002000100000000148000__000000931B9A2710 000000067F000040020001000000001476ED-000000067F000040020001000000001500D7__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000148000-000000067F0000400200010000000014C000__0000005D2FFFFB38 000000067F00004002000100000000148000-000000067F0000400200010000000014C000__00000073AD3FE6B8 000000067F00004002000100000000148000-000000067F0000400200010000000014C000__000000914E3F38F0 000000067F00004002000100000000148000-000000067F0000400200010000000014C000__000000931B9A2710 000000067F0000400200010000000014C000-000000067F00004002000100000000150000__0000005D2FFFFB38 000000067F0000400200010000000014C000-000000067F00004002000100000000150000__00000073AD3FE6B8 000000067F0000400200010000000014C000-000000067F00004002000100000000150000__000000914E3F38F0 000000067F0000400200010000000014C000-000000067F00004002000100000000150000__000000931B9A2710 000000067F00004002000100000000150000-000000067F00004002000100000000154000__0000005D2FFFFB38 000000067F00004002000100000000150000-000000067F00004002000100000000154000__00000073AD3FE6B8 000000067F00004002000100000000150000-000000067F00004002000100000000154000__000000914E3F38F0 000000067F00004002000100000000150000-000000067F00004002000100000000154000__000000931B9A2710 000000067F000040020001000000001500D7-000000067F00004002000100000000158ABD__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000154000-000000067F00004002000100000000158000__0000005D2FFFFB38 000000067F00004002000100000000154000-000000067F00004002000100000000158000__00000073AD3FE6B8 000000067F00004002000100000000154000-000000067F00004002000100000000158000__000000914E3F38F0 000000067F00004002000100000000154000-000000067F00004002000100000000158000__000000931B9A2710 000000067F00004002000100000000158000-000000067F0000400200010000000015C000__0000005D2FFFFB38 
000000067F00004002000100000000158000-000000067F0000400200010000000015C000__00000073AD3FE6B8 000000067F00004002000100000000158000-000000067F0000400200010000000015C000__000000914E3F38F0 000000067F00004002000100000000158000-000000067F0000400200010000000015C000__000000931B9A2710 000000067F00004002000100000000158ABD-000000067F00004002000100000000161489__000000572A7C74A1-0000005CA7BBD6F9 000000067F0000400200010000000015C000-000000067F00004002000100000000160000__0000005D2FFFFB38 000000067F0000400200010000000015C000-000000067F00004002000100000000160000__00000073AD3FE6B8 000000067F0000400200010000000015C000-000000067F00004002000100000000160000__000000914E3F38F0 000000067F0000400200010000000015C000-000000067F00004002000100000000160000__000000931B9A2710 000000067F00004002000100000000160000-000000067F00004002000100000000164000__0000005D2FFFFB38 000000067F00004002000100000000160000-000000067F00004002000100000000164000__00000073AD3FE6B8 000000067F00004002000100000000160000-000000067F00004002000100000000164000__000000914E3F38F0 000000067F00004002000100000000160000-000000067F00004002000100000000164000__000000931B9A2710 000000067F00004002000100000000161489-000000067F00004002000100000000169E43__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000164000-000000067F00004002000100000000168000__0000005D2FFFFB38 000000067F00004002000100000000164000-000000067F00004002000100000000168000__00000073AD3FE6B8 000000067F00004002000100000000164000-000000067F00004002000100000000168000__000000914E3F38F0 000000067F00004002000100000000164000-000000067F00004002000100000000168000__000000931B9A2710 000000067F00004002000100000000168000-000000067F0000400200010000000016C000__0000005D2FFFFB38 000000067F00004002000100000000168000-000000067F0000400200010000000016C000__00000073AD3FE6B8 000000067F00004002000100000000168000-000000067F0000400200010000000016C000__000000914E3F38F0 000000067F00004002000100000000168000-000000067F0000400200010000000016C000__000000931B9A2710 
000000067F00004002000100000000169E43-000000067F00004002000100000000172829__000000572A7C74A1-0000005CA7BBD6F9 000000067F0000400200010000000016C000-000000067F00004002000100000000170000__0000005D2FFFFB38 000000067F0000400200010000000016C000-000000067F00004002000100000000170000__00000073AD3FE6B8 000000067F0000400200010000000016C000-000000067F00004002000100000000170000__000000914E3F38F0 000000067F0000400200010000000016C000-000000067F00004002000100000000170000__000000931B9A2710 000000067F00004002000100000000170000-000000067F00004002000100000000174000__0000005D2FFFFB38 000000067F00004002000100000000170000-000000067F00004002000100000000174000__00000073AD3FE6B8 000000067F00004002000100000000170000-000000067F00004002000100000000174000__000000914E3F38F0 000000067F00004002000100000000170000-000000067F00004002000100000000174000__000000931B9A2710 000000067F00004002000100000000172829-000000067F0000400200010000000017B215__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000174000-000000067F00004002000100000000178000__0000005D2FFFFB38 000000067F00004002000100000000174000-000000067F00004002000100000000178000__00000073AD3FE6B8 000000067F00004002000100000000174000-000000067F00004002000100000000178000__000000914E3F38F0 000000067F00004002000100000000174000-000000067F00004002000100000000178000__000000931B9A2710 000000067F00004002000100000000178000-000000067F0000400200010000000017C000__0000005D2FFFFB38 000000067F00004002000100000000178000-000000067F0000400200010000000017C000__00000073AD3FE6B8 000000067F00004002000100000000178000-000000067F0000400200010000000017C000__000000914E3F38F0 000000067F00004002000100000000178000-000000067F0000400200010000000017C000__000000931B9A2710 000000067F0000400200010000000017B215-000000067F00004002000100000000183C02__000000572A7C74A1-0000005CA7BBD6F9 000000067F0000400200010000000017C000-000000067F00004002000100000000180000__0000005D2FFFFB38 000000067F0000400200010000000017C000-000000067F00004002000100000000180000__00000073AD3FE6B8 
000000067F0000400200010000000017C000-000000067F00004002000100000000180000__000000914E3F38F0 000000067F0000400200010000000017C000-000000067F00004002000100000000180000__000000931B9A2710 000000067F00004002000100000000180000-000000067F00004002000100000000184000__0000005D2FFFFB38 000000067F00004002000100000000180000-000000067F00004002000100000000184000__00000073AD3FE6B8 000000067F00004002000100000000180000-000000067F00004002000100000000184000__000000914E3F38F0 000000067F00004002000100000000180000-000000067F00004002000100000000184000__000000931B9A2710 000000067F00004002000100000000183C02-000000067F0000400200010000000018C5E0__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000184000-000000067F00004002000100000000188000__0000005D2FFFFB38 000000067F00004002000100000000184000-000000067F00004002000100000000188000__00000073AD3FE6B8 000000067F00004002000100000000184000-000000067F00004002000100000000188000__000000914E3F38F0 000000067F00004002000100000000184000-000000067F00004002000100000000188000__000000931B9A2710 000000067F00004002000100000000188000-000000067F0000400200010000000018C000__0000005D2FFFFB38 000000067F00004002000100000000188000-000000067F0000400200010000000018C000__00000073AD3FE6B8 000000067F00004002000100000000188000-000000067F0000400200010000000018C000__000000914E3F38F0 000000067F00004002000100000000188000-000000067F0000400200010000000018C000__000000931B9A2710 000000067F0000400200010000000018C000-000000067F00004002000100000000190000__0000005D2FFFFB38 000000067F0000400200010000000018C000-000000067F00004002000100000000190000__00000073AD3FE6B8 000000067F0000400200010000000018C000-000000067F00004002000100000000190000__000000914E3F38F0 000000067F0000400200010000000018C000-000000067F00004002000100000000190000__000000931B9A2710 000000067F0000400200010000000018C5E0-000000067F00004002000100000000194FC7__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000190000-000000067F00004002000100000000194000__0000005D2FFFFB38 
000000067F00004002000100000000190000-000000067F00004002000100000000194000__00000073AD3FE6B8 000000067F00004002000100000000190000-000000067F00004002000100000000194000__000000914E3F38F0 000000067F00004002000100000000190000-000000067F00004002000100000000194000__000000931B9A2710 000000067F00004002000100000000194000-000000067F00004002000100000000198000__0000005D2FFFFB38 000000067F00004002000100000000194000-000000067F00004002000100000000198000__00000073AD3FE6B8 000000067F00004002000100000000194000-000000067F00004002000100000000198000__000000914E3F38F0 000000067F00004002000100000000194000-000000067F00004002000100000000198000__000000931B9A2710 000000067F00004002000100000000194FC7-000000067F0000400200010000000019D98D__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000198000-000000067F0000400200010000000019C000__0000005D2FFFFB38 000000067F00004002000100000000198000-000000067F0000400200010000000019C000__00000073AD3FE6B8 000000067F00004002000100000000198000-000000067F0000400200010000000019C000__000000914E3F38F0 000000067F00004002000100000000198000-000000067F0000400200010000000019C000__000000931B9A2710 000000067F0000400200010000000019C000-000000067F000040020001000000001A0000__0000005D2FFFFB38 000000067F0000400200010000000019C000-000000067F000040020001000000001A0000__00000073AD3FE6B8 000000067F0000400200010000000019C000-000000067F000040020001000000001A0000__000000914E3F38F0 000000067F0000400200010000000019C000-000000067F000040020001000000001A0000__000000931B9A2710 000000067F0000400200010000000019D98D-000000067F000040020001000000001A6347__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000001A0000-000000067F000040020001000000001A4000__0000005D2FFFFB38 000000067F000040020001000000001A0000-000000067F000040020001000000001A4000__00000073AD3FE6B8 000000067F000040020001000000001A0000-000000067F000040020001000000001A4000__000000914E3F38F0 000000067F000040020001000000001A0000-000000067F000040020001000000001A4000__000000931B9A2710 
000000067F000040020001000000001A4000-000000067F000040020001000000001A8000__0000005D2FFFFB38 000000067F000040020001000000001A4000-000000067F000040020001000000001A8000__00000073AD3FE6B8 000000067F000040020001000000001A4000-000000067F000040020001000000001A8000__000000914E3F38F0 000000067F000040020001000000001A4000-000000067F000040020001000000001A8000__000000931B9A2710 000000067F000040020001000000001A6347-000000067F000040020001000000001AED26__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000001A8000-000000067F000040020001000000001AC000__0000005D2FFFFB38 000000067F000040020001000000001A8000-000000067F000040020001000000001AC000__00000073AD3FE6B8 000000067F000040020001000000001A8000-000000067F000040020001000000001AC000__000000914E3F38F0 000000067F000040020001000000001A8000-000000067F000040020001000000001AC000__000000931B9A2710 000000067F000040020001000000001AC000-000000067F000040020001000000001B0000__0000005D2FFFFB38 000000067F000040020001000000001AC000-000000067F000040020001000000001B0000__00000073AD3FE6B8 000000067F000040020001000000001AC000-000000067F000040020001000000001B0000__000000914E3F38F0 000000067F000040020001000000001AC000-000000067F000040020001000000001B0000__000000931B9A2710 000000067F000040020001000000001AED26-000000067F000040020001000000001B770D__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000001B0000-000000067F000040020001000000001B4000__0000005D2FFFFB38 000000067F000040020001000000001B0000-000000067F000040020001000000001B4000__00000073AD3FE6B8 000000067F000040020001000000001B0000-000000067F000040020001000000001B4000__000000914E3F38F0 000000067F000040020001000000001B0000-000000067F000040020001000000001B4000__000000931B9A2710 000000067F000040020001000000001B4000-000000067F000040020001000000001B8000__0000005D2FFFFB38 000000067F000040020001000000001B4000-000000067F000040020001000000001B8000__00000073AD3FE6B8 000000067F000040020001000000001B4000-000000067F000040020001000000001B8000__000000914E3F38F0 
000000067F000040020001000000001B4000-000000067F000040020001000000001B8000__000000931B9A2710 000000067F000040020001000000001B770D-000000067F000040020001000000001C00F6__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000001B8000-000000067F000040020001000000001BC000__0000005D2FFFFB38 000000067F000040020001000000001B8000-000000067F000040020001000000001BC000__00000073AD3FE6B8 000000067F000040020001000000001B8000-000000067F000040020001000000001BC000__000000914E3F38F0 000000067F000040020001000000001B8000-000000067F000040020001000000001BC000__000000931B9A2710 000000067F000040020001000000001BC000-000000067F000040020001000000001C0000__0000005D2FFFFB38 000000067F000040020001000000001BC000-000000067F000040020001000000001C0000__00000073AD3FE6B8 000000067F000040020001000000001BC000-000000067F000040020001000000001C0000__000000914E3F38F0 000000067F000040020001000000001BC000-000000067F000040020001000000001C0000__000000931B9A2710 000000067F000040020001000000001C0000-000000067F000040020001000000001C4000__0000005D2FFFFB38 000000067F000040020001000000001C0000-000000067F000040020001000000001C4000__00000073AD3FE6B8 000000067F000040020001000000001C0000-000000067F000040020001000000001C4000__000000914E3F38F0 000000067F000040020001000000001C0000-000000067F000040020001000000001C4000__000000931B9A2710 000000067F000040020001000000001C00F6-000000067F000040020001000000001C8ADD__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000001C4000-000000067F000040020001000000001C8000__0000005D2FFFFB38 000000067F000040020001000000001C4000-000000067F000040020001000000001C8000__00000073AD3FE6B8 000000067F000040020001000000001C4000-000000067F000040020001000000001C8000__000000914E3F38F0 000000067F000040020001000000001C4000-000000067F000040020001000000001C8000__000000931B9A2710 000000067F000040020001000000001C8000-000000067F000040020001000000001CC000__0000005D2FFFFB38 000000067F000040020001000000001C8000-000000067F000040020001000000001CC000__00000073AD3FE6B8 
000000067F000040020001000000001C8000-000000067F000040020001000000001CC000__000000914E3F38F0 000000067F000040020001000000001C8000-000000067F000040020001000000001CC000__000000931B9A2710 000000067F000040020001000000001C8ADD-000000067F000040020001000000001D14BA__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000001CC000-000000067F000040020001000000001D0000__0000005D2FFFFB38 000000067F000040020001000000001CC000-000000067F000040020001000000001D0000__00000073AD3FE6B8 000000067F000040020001000000001CC000-000000067F000040020001000000001D0000__000000914E3F38F0 000000067F000040020001000000001CC000-000000067F000040020001000000001D0000__000000931B9A2710 000000067F000040020001000000001D0000-000000067F000040020001000000001D4000__0000005D2FFFFB38 000000067F000040020001000000001D0000-000000067F000040020001000000001D4000__00000073AD3FE6B8 000000067F000040020001000000001D0000-000000067F000040020001000000001D4000__000000914E3F38F0 000000067F000040020001000000001D0000-000000067F000040020001000000001D4000__000000931B9A2710 000000067F000040020001000000001D14BA-000000067F000040020001000000001D9E89__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000001D4000-000000067F000040020001000000001D8000__0000005D2FFFFB38 000000067F000040020001000000001D4000-000000067F000040020001000000001D8000__00000073AD3FE6B8 000000067F000040020001000000001D4000-000000067F000040020001000000001D8000__000000914E3F38F0 000000067F000040020001000000001D4000-000000067F000040020001000000001D8000__000000931B9A2710 000000067F000040020001000000001D8000-000000067F000040020001000000001DC000__0000005D2FFFFB38 000000067F000040020001000000001D8000-000000067F000040020001000000001DC000__00000073AD3FE6B8 000000067F000040020001000000001D8000-000000067F000040020001000000001DC000__000000914E3F38F0 000000067F000040020001000000001D8000-000000067F000040020001000000001DC000__000000931B9A2710 000000067F000040020001000000001D9E89-000000067F000040020001000000001E284E__000000572A7C74A1-0000005CA7BBD6F9 
000000067F000040020001000000001DC000-000000067F000040020001000000001E0000__0000005D2FFFFB38 000000067F000040020001000000001DC000-000000067F000040020001000000001E0000__00000073AD3FE6B8 000000067F000040020001000000001DC000-000000067F000040020001000000001E0000__000000914E3F38F0 000000067F000040020001000000001DC000-000000067F000040020001000000001E0000__000000931B9A2710 000000067F000040020001000000001E0000-000000067F000040020001000000001E4000__0000005D2FFFFB38 000000067F000040020001000000001E0000-000000067F000040020001000000001E4000__00000073AD3FE6B8 000000067F000040020001000000001E0000-000000067F000040020001000000001E4000__000000914E3F38F0 000000067F000040020001000000001E0000-000000067F000040020001000000001E4000__000000931B9A2710 000000067F000040020001000000001E284E-000000067F000040020001000000001EB231__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000001E4000-000000067F000040020001000000001E8000__0000005D2FFFFB38 000000067F000040020001000000001E4000-000000067F000040020001000000001E8000__00000073AD3FE6B8 000000067F000040020001000000001E4000-000000067F000040020001000000001E8000__000000914E3F38F0 000000067F000040020001000000001E4000-000000067F000040020001000000001E8000__000000931B9A2710 000000067F000040020001000000001E8000-000000067F000040020001000000001EC000__0000005D2FFFFB38 000000067F000040020001000000001E8000-000000067F000040020001000000001EC000__00000073AD3FE6B8 000000067F000040020001000000001E8000-000000067F000040020001000000001EC000__000000914E3F38F0 000000067F000040020001000000001E8000-000000067F000040020001000000001EC000__000000931B9A2710 000000067F000040020001000000001EB231-000000067F000040020001000000001F3C19__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000001EC000-000000067F000040020001000000001F0000__0000005D2FFFFB38 000000067F000040020001000000001EC000-000000067F000040020001000000001F0000__00000073AD3FE6B8 000000067F000040020001000000001EC000-000000067F000040020001000000001F0000__000000914E3F38F0 
000000067F000040020001000000001EC000-000000067F000040020001000000001F0000__000000931B9A2710 000000067F000040020001000000001F0000-000000067F000040020001000000001F4000__0000005D2FFFFB38 000000067F000040020001000000001F0000-000000067F000040020001000000001F4000__00000073AD3FE6B8 000000067F000040020001000000001F0000-000000067F000040020001000000001F4000__000000914E3F38F0 000000067F000040020001000000001F0000-000000067F000040020001000000001F4000__000000931B9A2710 000000067F000040020001000000001F3C19-000000067F000040020001000000001FC608__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000001F4000-000000067F000040020001000000001F8000__0000005D2FFFFB38 000000067F000040020001000000001F4000-000000067F000040020001000000001F8000__00000073AD3FE6B8 000000067F000040020001000000001F4000-000000067F000040020001000000001F8000__000000914E3F38F0 000000067F000040020001000000001F4000-000000067F000040020001000000001F8000__000000931B9A2710 000000067F000040020001000000001F8000-000000067F000040020001000000001FC000__0000005D2FFFFB38 000000067F000040020001000000001F8000-000000067F000040020001000000001FC000__00000073AD3FE6B8 000000067F000040020001000000001F8000-000000067F000040020001000000001FC000__000000914E3F38F0 000000067F000040020001000000001F8000-000000067F000040020001000000001FC000__000000931B9A2710 000000067F000040020001000000001FC000-000000067F00004002000100000000200000__0000005D2FFFFB38 000000067F000040020001000000001FC000-000000067F00004002000100000000200000__00000073AD3FE6B8 000000067F000040020001000000001FC000-000000067F00004002000100000000200000__000000914E3F38F0 000000067F000040020001000000001FC000-000000067F00004002000100000000200000__000000931B9A2710 000000067F000040020001000000001FC608-000000067F00004002000100000000204FDF__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000200000-000000067F00004002000100000000204000__0000005D2FFFFB38 000000067F00004002000100000000200000-000000067F00004002000100000000204000__00000073AD3FE6B8 
000000067F00004002000100000000200000-000000067F00004002000100000000204000__000000914E3F38F0 000000067F00004002000100000000200000-000000067F00004002000100000000204000__000000931B9A2710 000000067F00004002000100000000204000-000000067F00004002000100000000208000__0000005D2FFFFB38 000000067F00004002000100000000204000-000000067F00004002000100000000208000__00000073AD3FE6B8 000000067F00004002000100000000204000-000000067F00004002000100000000208000__000000914E3F38F0 000000067F00004002000100000000204000-000000067F00004002000100000000208000__000000931B9A2710 000000067F00004002000100000000204FDF-000000067F0000400200010000000020D9BC__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000208000-000000067F0000400200010000000020C000__0000005D2FFFFB38 000000067F00004002000100000000208000-000000067F0000400200010000000020C000__00000073AD3FE6B8 000000067F00004002000100000000208000-000000067F0000400200010000000020C000__000000914E3F38F0 000000067F00004002000100000000208000-000000067F0000400200010000000020C000__000000931B9A2710 000000067F0000400200010000000020C000-000000067F00004002000100000000210000__0000005D2FFFFB38 000000067F0000400200010000000020C000-000000067F00004002000100000000210000__00000073AD3FE6B8 000000067F0000400200010000000020C000-000000067F00004002000100000000210000__000000914E3F38F0 000000067F0000400200010000000020C000-000000067F00004002000100000000210000__000000931B9A2710 000000067F0000400200010000000020D9BC-000000067F0000400200010000000021638D__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000210000-000000067F00004002000100000000214000__0000005D2FFFFB38 000000067F00004002000100000000210000-000000067F00004002000100000000214000__00000073AD3FE6B8 000000067F00004002000100000000210000-000000067F00004002000100000000214000__000000914E3F38F0 000000067F00004002000100000000210000-000000067F00004002000100000000214000__000000931B9A2710 000000067F00004002000100000000214000-000000067F00004002000100000000218000__0000005D2FFFFB38 
000000067F00004002000100000000214000-000000067F00004002000100000000218000__00000073AD3FE6B8 000000067F00004002000100000000214000-000000067F00004002000100000000218000__000000914E3F38F0 000000067F00004002000100000000214000-000000067F00004002000100000000218000__000000931B9A2710 000000067F0000400200010000000021638D-000000067F0000400200010000000021ED51__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000218000-000000067F0000400200010000000021C000__0000005D2FFFFB38 000000067F00004002000100000000218000-000000067F0000400200010000000021C000__00000073AD3FE6B8 000000067F00004002000100000000218000-000000067F0000400200010000000021C000__000000914E3F38F0 000000067F00004002000100000000218000-000000067F0000400200010000000021C000__000000931B9A2710 000000067F0000400200010000000021C000-000000067F00004002000100000000220000__0000005D2FFFFB38 000000067F0000400200010000000021C000-000000067F00004002000100000000220000__00000073AD3FE6B8 000000067F0000400200010000000021C000-000000067F00004002000100000000220000__000000914E3F38F0 000000067F0000400200010000000021C000-000000067F00004002000100000000220000__000000931B9A2710 000000067F0000400200010000000021ED51-000000067F0000400200010000000022773E__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000220000-000000067F00004002000100000000224000__0000005D2FFFFB38 000000067F00004002000100000000220000-000000067F00004002000100000000224000__00000073AD3FE6B8 000000067F00004002000100000000220000-000000067F00004002000100000000224000__000000914E3F38F0 000000067F00004002000100000000220000-000000067F00004002000100000000224000__000000931B9A2710 000000067F00004002000100000000224000-000000067F00004002000100000000228000__0000005D2FFFFB38 000000067F00004002000100000000224000-000000067F00004002000100000000228000__00000073AD3FE6B8 000000067F00004002000100000000224000-000000067F00004002000100000000228000__000000914E3F38F0 000000067F00004002000100000000224000-000000067F00004002000100000000228000__000000931B9A2710 
000000067F0000400200010000000022773E-000000067F00004002000100000000230129__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000228000-000000067F0000400200010000000022C000__0000005D2FFFFB38 000000067F00004002000100000000228000-000000067F0000400200010000000022C000__00000073AD3FE6B8 000000067F00004002000100000000228000-000000067F0000400200010000000022C000__000000914E3F38F0 000000067F00004002000100000000228000-000000067F0000400200010000000022C000__000000931B9A2710 000000067F0000400200010000000022C000-000000067F00004002000100000000230000__0000005D2FFFFB38 000000067F0000400200010000000022C000-000000067F00004002000100000000230000__00000073AD3FE6B8 000000067F0000400200010000000022C000-000000067F00004002000100000000230000__000000914E3F38F0 000000067F0000400200010000000022C000-000000067F00004002000100000000230000__000000931B9A2710 000000067F00004002000100000000230000-000000067F00004002000100000000234000__0000005D2FFFFB38 000000067F00004002000100000000230000-000000067F00004002000100000000234000__00000073AD3FE6B8 000000067F00004002000100000000230000-000000067F00004002000100000000234000__000000914E3F38F0 000000067F00004002000100000000230000-000000067F00004002000100000000234000__000000931B9A2710 000000067F00004002000100000000230129-000000067F00004002000100000000238B15__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000234000-000000067F00004002000100000000238000__0000005D2FFFFB38 000000067F00004002000100000000234000-000000067F00004002000100000000238000__00000073AD3FE6B8 000000067F00004002000100000000234000-000000067F00004002000100000000238000__000000914E3F38F0 000000067F00004002000100000000234000-000000067F00004002000100000000238000__000000931B9A2710 000000067F00004002000100000000238000-000000067F0000400200010000000023C000__0000005D2FFFFB38 000000067F00004002000100000000238000-000000067F0000400200010000000023C000__00000073AD3FE6B8 000000067F00004002000100000000238000-000000067F0000400200010000000023C000__000000914E3F38F0 
000000067F00004002000100000000238000-000000067F0000400200010000000023C000__000000931B9A2710 000000067F00004002000100000000238B15-000000067F000040020001000000002414E7__000000572A7C74A1-0000005CA7BBD6F9 000000067F0000400200010000000023C000-000000067F00004002000100000000240000__0000005D2FFFFB38 000000067F0000400200010000000023C000-000000067F00004002000100000000240000__00000073AD3FE6B8 000000067F0000400200010000000023C000-000000067F00004002000100000000240000__000000914E3F38F0 000000067F0000400200010000000023C000-000000067F00004002000100000000240000__000000931B9A2710 000000067F00004002000100000000240000-000000067F00004002000100000000244000__0000005D2FFFFB38 000000067F00004002000100000000240000-000000067F00004002000100000000244000__00000073AD3FE6B8 000000067F00004002000100000000240000-000000067F00004002000100000000244000__000000914E3F38F0 000000067F00004002000100000000240000-000000067F00004002000100000000244000__000000931B9A2710 000000067F000040020001000000002414E7-000000067F00004002000100000000249EC9__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000244000-000000067F00004002000100000000248000__0000005D2FFFFB38 000000067F00004002000100000000244000-000000067F00004002000100000000248000__00000073AD3FE6B8 000000067F00004002000100000000244000-000000067F00004002000100000000248000__000000914E3F38F0 000000067F00004002000100000000244000-000000067F00004002000100000000248000__000000931B9A2710 000000067F00004002000100000000248000-000000067F0000400200010000000024C000__0000005D2FFFFB38 000000067F00004002000100000000248000-000000067F0000400200010000000024C000__00000073AD3FE6B8 000000067F00004002000100000000248000-000000067F0000400200010000000024C000__000000914E3F38F0 000000067F00004002000100000000248000-000000067F0000400200010000000024C000__000000931B9A2710 000000067F00004002000100000000249EC9-000000067F0000400200010000000025288A__000000572A7C74A1-0000005CA7BBD6F9 000000067F0000400200010000000024C000-000000067F00004002000100000000250000__0000005D2FFFFB38 
000000067F0000400200010000000024C000-000000067F00004002000100000000250000__00000073AD3FE6B8 000000067F0000400200010000000024C000-000000067F00004002000100000000250000__000000914E3F38F0 000000067F0000400200010000000024C000-000000067F00004002000100000000250000__000000931B9A2710 000000067F00004002000100000000250000-000000067F00004002000100000000254000__0000005D2FFFFB38 000000067F00004002000100000000250000-000000067F00004002000100000000254000__00000073AD3FE6B8 000000067F00004002000100000000250000-000000067F00004002000100000000254000__000000914E3F38F0 000000067F00004002000100000000250000-000000067F00004002000100000000254000__000000931B9A2710 000000067F0000400200010000000025288A-000000067F0000400200010000000025B24E__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000254000-000000067F00004002000100000000258000__0000005D2FFFFB38 000000067F00004002000100000000254000-000000067F00004002000100000000258000__00000073AD3FE6B8 000000067F00004002000100000000254000-000000067F00004002000100000000258000__000000914E3F38F0 000000067F00004002000100000000254000-000000067F00004002000100000000258000__000000931B9A2710 000000067F00004002000100000000258000-000000067F0000400200010000000025C000__0000005D2FFFFB38 000000067F00004002000100000000258000-000000067F0000400200010000000025C000__00000073AD3FE6B8 000000067F00004002000100000000258000-000000067F0000400200010000000025C000__000000914E3F38F0 000000067F00004002000100000000258000-000000067F0000400200010000000025C000__000000931B9A2710 000000067F0000400200010000000025B24E-000000067F00004002000100000000263C37__000000572A7C74A1-0000005CA7BBD6F9 000000067F0000400200010000000025C000-000000067F00004002000100000000260000__0000005D2FFFFB38 000000067F0000400200010000000025C000-000000067F00004002000100000000260000__00000073AD3FE6B8 000000067F0000400200010000000025C000-000000067F00004002000100000000260000__000000914E3F38F0 000000067F0000400200010000000025C000-000000067F00004002000100000000260000__000000931B9A2710 
000000067F00004002000100000000260000-000000067F00004002000100000000264000__0000005D2FFFFB38 000000067F00004002000100000000260000-000000067F00004002000100000000264000__00000073AD3FE6B8 000000067F00004002000100000000260000-000000067F00004002000100000000264000__000000914E3F38F0 000000067F00004002000100000000260000-000000067F00004002000100000000264000__000000931B9A2710 000000067F00004002000100000000263C37-000000067F0000400200010000000026C620__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000264000-000000067F00004002000100000000268000__0000005D2FFFFB38 000000067F00004002000100000000264000-000000067F00004002000100000000268000__00000073AD3FE6B8 000000067F00004002000100000000264000-000000067F00004002000100000000268000__000000914E3F38F0 000000067F00004002000100000000264000-000000067F00004002000100000000268000__000000931B9A2710 000000067F00004002000100000000268000-000000067F0000400200010000000026C000__0000005D2FFFFB38 000000067F00004002000100000000268000-000000067F0000400200010000000026C000__00000073AD3FE6B8 000000067F00004002000100000000268000-000000067F0000400200010000000026C000__000000914E3F38F0 000000067F00004002000100000000268000-000000067F0000400200010000000026C000__000000931B9A2710 000000067F0000400200010000000026C000-000000067F00004002000100000000270000__0000005D2FFFFB38 000000067F0000400200010000000026C000-000000067F00004002000100000000270000__00000073AD3FE6B8 000000067F0000400200010000000026C000-000000067F00004002000100000000270000__000000914E3F38F0 000000067F0000400200010000000026C000-000000067F00004002000100000000270000__000000931B9A2710 000000067F0000400200010000000026C620-000000067F00004002000100000000275003__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000270000-000000067F00004002000100000000274000__0000005D2FFFFB38 000000067F00004002000100000000270000-000000067F00004002000100000000274000__00000073AD3FE6B8 000000067F00004002000100000000270000-000000067F00004002000100000000274000__000000914E3F38F0 
000000067F00004002000100000000270000-000000067F00004002000100000000274000__000000931B9A2710 000000067F00004002000100000000274000-000000067F00004002000100000000278000__0000005D2FFFFB38 000000067F00004002000100000000274000-000000067F00004002000100000000278000__00000073AD3FE6B8 000000067F00004002000100000000274000-000000067F00004002000100000000278000__000000914E3F38F0 000000067F00004002000100000000274000-000000067F00004002000100000000278000__000000931B9A2710 000000067F00004002000100000000275003-000000067F0000400200010000000027D9DA__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000278000-000000067F0000400200010000000027C000__0000005D2FFFFB38 000000067F00004002000100000000278000-000000067F0000400200010000000027C000__00000073AD3FE6B8 000000067F00004002000100000000278000-000000067F0000400200010000000027C000__000000914E3F38F0 000000067F00004002000100000000278000-000000067F0000400200010000000027C000__000000931B9A2710 000000067F0000400200010000000027C000-000000067F00004002000100000000280000__0000005D2FFFFB38 000000067F0000400200010000000027C000-000000067F00004002000100000000280000__00000073AD3FE6B8 000000067F0000400200010000000027C000-000000067F00004002000100000000280000__000000914E3F38F0 000000067F0000400200010000000027C000-000000067F00004002000100000000280000__000000931B9A2710 000000067F0000400200010000000027D9DA-000000067F000040020001000000002863B3__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000280000-000000067F00004002000100000000284000__0000005D2FFFFB38 000000067F00004002000100000000280000-000000067F00004002000100000000284000__00000073AD3FE6B8 000000067F00004002000100000000280000-000000067F00004002000100000000284000__000000914E3F38F0 000000067F00004002000100000000280000-000000067F00004002000100000000284000__000000931B9A2710 000000067F00004002000100000000284000-000000067F00004002000100000000288000__0000005D2FFFFB38 000000067F00004002000100000000284000-000000067F00004002000100000000288000__00000073AD3FE6B8 
000000067F00004002000100000000284000-000000067F00004002000100000000288000__000000914E3F38F0 000000067F00004002000100000000284000-000000067F00004002000100000000288000__000000931B9A2710 000000067F000040020001000000002863B3-000000067F0000400200010000000028ED6E__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000288000-000000067F0000400200010000000028C000__0000005D2FFFFB38 000000067F00004002000100000000288000-000000067F0000400200010000000028C000__00000073AD3FE6B8 000000067F00004002000100000000288000-000000067F0000400200010000000028C000__000000914E3F38F0 000000067F00004002000100000000288000-000000067F0000400200010000000028C000__000000931B9A2710 000000067F0000400200010000000028C000-000000067F00004002000100000000290000__0000005D2FFFFB38 000000067F0000400200010000000028C000-000000067F00004002000100000000290000__00000073AD3FE6B8 000000067F0000400200010000000028C000-000000067F00004002000100000000290000__000000914E3F38F0 000000067F0000400200010000000028C000-000000067F00004002000100000000290000__000000931B9A2710 000000067F0000400200010000000028ED6E-000000067F00004002000100000000297734__000000572A7C74A1-0000005CA7BBD6F9 000000067F00004002000100000000290000-000000067F00004002000100000000294000__0000005D2FFFFB38 000000067F00004002000100000000290000-000000067F00004002000100000000294000__00000073AD3FE6B8 000000067F00004002000100000000290000-000000067F00004002000100000000294000__000000914E3F38F0 000000067F00004002000100000000290000-000000067F00004002000100000000294000__000000931B9A2710 000000067F00004002000100000000294000-000000067F00004002000100000000298000__0000005D2FFFFB38 000000067F00004002000100000000294000-000000067F00004002000100000000298000__00000073AD3FE6B8 000000067F00004002000100000000294000-000000067F00004002000100000000298000__000000914E3F38F0 000000067F00004002000100000000294000-000000067F00004002000100000000298000__000000931B9A2710 000000067F00004002000100000000297734-000000067F000040020001000000002A0126__000000572A7C74A1-0000005CA7BBD6F9 
000000067F00004002000100000000298000-000000067F0000400200010000000029C000__0000005D2FFFFB38 000000067F00004002000100000000298000-000000067F0000400200010000000029C000__00000073AD3FE6B8 000000067F00004002000100000000298000-000000067F0000400200010000000029C000__000000914E3F38F0 000000067F00004002000100000000298000-000000067F0000400200010000000029C000__000000931B9A2710 000000067F0000400200010000000029C000-000000067F000040020001000000002A0000__0000005D2FFFFB38 000000067F0000400200010000000029C000-000000067F000040020001000000002A0000__00000073AD3FE6B8 000000067F0000400200010000000029C000-000000067F000040020001000000002A0000__000000914E3F38F0 000000067F0000400200010000000029C000-000000067F000040020001000000002A0000__000000931B9A2710 000000067F000040020001000000002A0000-000000067F000040020001000000002A4000__0000005D2FFFFB38 000000067F000040020001000000002A0000-000000067F000040020001000000002A4000__00000073AD3FE6B8 000000067F000040020001000000002A0000-000000067F000040020001000000002A4000__000000914E3F38F0 000000067F000040020001000000002A0000-000000067F000040020001000000002A4000__000000931B9A2710 000000067F000040020001000000002A0126-000000067F000040020001000000002A8B19__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000002A4000-000000067F000040020001000000002A8000__0000005D2FFFFB38 000000067F000040020001000000002A4000-000000067F000040020001000000002A8000__00000073AD3FE6B8 000000067F000040020001000000002A4000-000000067F000040020001000000002A8000__000000914E3F38F0 000000067F000040020001000000002A4000-000000067F000040020001000000002A8000__000000931B9A2710 000000067F000040020001000000002A8000-000000067F000040020001000000002AC000__0000005D2FFFFB38 000000067F000040020001000000002A8000-000000067F000040020001000000002AC000__00000073AD3FE6B8 000000067F000040020001000000002A8000-000000067F000040020001000000002AC000__000000914E3F38F0 000000067F000040020001000000002A8000-000000067F000040020001000000002AC000__000000931B9A2710 
000000067F000040020001000000002A8B19-000000067F000040020001000000002B1501__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000002AC000-000000067F000040020001000000002B0000__0000005D2FFFFB38 000000067F000040020001000000002AC000-000000067F000040020001000000002B0000__00000073AD3FE6B8 000000067F000040020001000000002AC000-000000067F000040020001000000002B0000__000000914E3F38F0 000000067F000040020001000000002AC000-000000067F000040020001000000002B0000__000000931B9A2710 000000067F000040020001000000002B0000-000000067F000040020001000000002B4000__0000005D2FFFFB38 000000067F000040020001000000002B0000-000000067F000040020001000000002B4000__00000073AD3FE6B8 000000067F000040020001000000002B0000-000000067F000040020001000000002B4000__000000914E3F38F0 000000067F000040020001000000002B0000-000000067F000040020001000000002B4000__000000931B9A2710 000000067F000040020001000000002B1501-000000067F000040020001000000002B9EDA__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000002B4000-000000067F000040020001000000002B8000__0000005D2FFFFB38 000000067F000040020001000000002B4000-000000067F000040020001000000002B8000__00000073AD3FE6B8 000000067F000040020001000000002B4000-000000067F000040020001000000002B8000__000000914E3F38F0 000000067F000040020001000000002B4000-000000067F000040020001000000002B8000__000000931B9A2710 000000067F000040020001000000002B8000-000000067F000040020001000000002BC000__0000005D2FFFFB38 000000067F000040020001000000002B8000-000000067F000040020001000000002BC000__00000073AD3FE6B8 000000067F000040020001000000002B8000-000000067F000040020001000000002BC000__000000914E3F38F0 000000067F000040020001000000002B8000-000000067F000040020001000000002BC000__000000931B9A2710 000000067F000040020001000000002B9EDA-000000067F000040020001000000002C28A8__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000002BC000-000000067F000040020001000000002C0000__0000005D2FFFFB38 000000067F000040020001000000002BC000-000000067F000040020001000000002C0000__00000073AD3FE6B8 
000000067F000040020001000000002BC000-000000067F000040020001000000002C0000__000000914E3F38F0 000000067F000040020001000000002BC000-000000067F000040020001000000002C0000__000000931B9A2710 000000067F000040020001000000002C0000-000000067F000040020001000000002C4000__0000005D2FFFFB38 000000067F000040020001000000002C0000-000000067F000040020001000000002C4000__00000073AD3FE6B8 000000067F000040020001000000002C0000-000000067F000040020001000000002C4000__000000914E3F38F0 000000067F000040020001000000002C0000-000000067F000040020001000000002C4000__000000931B9A2710 000000067F000040020001000000002C28A8-000000067F000040020001000000002CB271__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000002C4000-000000067F000040020001000000002C8000__0000005D2FFFFB38 000000067F000040020001000000002C4000-000000067F000040020001000000002C8000__00000073AD3FE6B8 000000067F000040020001000000002C4000-000000067F000040020001000000002C8000__000000914E3F38F0 000000067F000040020001000000002C4000-000000067F000040020001000000002C8000__000000931B9A2710 000000067F000040020001000000002C8000-000000067F000040020001000000002CC000__0000005D2FFFFB38 000000067F000040020001000000002C8000-000000067F000040020001000000002CC000__00000073AD3FE6B8 000000067F000040020001000000002C8000-000000067F000040020001000000002CC000__000000914E3F38F0 000000067F000040020001000000002C8000-000000067F000040020001000000002CC000__000000931B9A2710 000000067F000040020001000000002CB271-000000067F000040020001000000002D3C3E__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000002CC000-000000067F000040020001000000002D0000__0000005D2FFFFB38 000000067F000040020001000000002CC000-000000067F000040020001000000002D0000__00000073AD3FE6B8 000000067F000040020001000000002CC000-000000067F000040020001000000002D0000__000000914E3F38F0 000000067F000040020001000000002CC000-000000067F000040020001000000002D0000__000000931B9A2710 000000067F000040020001000000002D0000-000000067F000040020001000000002D4000__0000005D2FFFFB38 
000000067F000040020001000000002D0000-000000067F000040020001000000002D4000__00000073AD3FE6B8 000000067F000040020001000000002D0000-000000067F000040020001000000002D4000__000000914E3F38F0 000000067F000040020001000000002D0000-000000067F000040020001000000002D4000__000000931B9A2710 000000067F000040020001000000002D3C3E-000000067F000040020001000000002DC636__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000002D4000-000000067F000040020001000000002D8000__0000005D2FFFFB38 000000067F000040020001000000002D4000-000000067F000040020001000000002D8000__00000073AD3FE6B8 000000067F000040020001000000002D4000-000000067F000040020001000000002D8000__000000914E3F38F0 000000067F000040020001000000002D4000-000000067F000040020001000000002D8000__000000931B9A2710 000000067F000040020001000000002D8000-000000067F000040020001000000002DC000__0000005D2FFFFB38 000000067F000040020001000000002D8000-000000067F000040020001000000002DC000__00000073AD3FE6B8 000000067F000040020001000000002D8000-000000067F000040020001000000002DC000__000000914E3F38F0 000000067F000040020001000000002D8000-000000067F000040020001000000002DC000__000000931B9A2710 000000067F000040020001000000002DC000-000000067F000040020001000000002E0000__0000005D2FFFFB38 000000067F000040020001000000002DC000-000000067F000040020001000000002E0000__00000073AD3FE6B8 000000067F000040020001000000002DC000-000000067F000040020001000000002E0000__000000914E3F38F0 000000067F000040020001000000002DC000-000000067F000040020001000000002E0000__000000931B9A2710 000000067F000040020001000000002DC636-000000067F000040020001000000002E5020__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000002E0000-000000067F000040020001000000002E4000__0000005D2FFFFB38 000000067F000040020001000000002E0000-000000067F000040020001000000002E4000__00000073AD3FE6B8 000000067F000040020001000000002E0000-000000067F000040020001000000002E4000__000000914E3F38F0 000000067F000040020001000000002E0000-000000067F000040020001000000002E4000__000000931B9A2710 
000000067F000040020001000000002E4000-000000067F000040020001000000002E8000__0000005D2FFFFB38 000000067F000040020001000000002E4000-000000067F000040020001000000002E8000__00000073AD3FE6B8 000000067F000040020001000000002E4000-000000067F000040020001000000002E8000__000000914E3F38F0 000000067F000040020001000000002E4000-000000067F000040020001000000002E8000__000000931B9A2710 000000067F000040020001000000002E5020-000000067F000040020001000000002EDA05__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000002E8000-000000067F000040020001000000002EC000__0000005D2FFFFB38 000000067F000040020001000000002E8000-000000067F000040020001000000002EC000__00000073AD3FE6B8 000000067F000040020001000000002E8000-000000067F000040020001000000002EC000__000000914E3F38F0 000000067F000040020001000000002E8000-000000067F000040020001000000002EC000__000000931B9A2710 000000067F000040020001000000002EC000-000000067F000040020001000000002F0000__0000005D2FFFFB38 000000067F000040020001000000002EC000-000000067F000040020001000000002F0000__00000073AD3FE6B8 000000067F000040020001000000002EC000-000000067F000040020001000000002F0000__000000914E3F38F0 000000067F000040020001000000002EC000-000000067F000040020001000000002F0000__000000931B9A2710 000000067F000040020001000000002EDA05-000000067F000040020001000000002F63D8__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000002F0000-000000067F000040020001000000002F4000__0000005D2FFFFB38 000000067F000040020001000000002F0000-000000067F000040020001000000002F4000__00000073AD3FE6B8 000000067F000040020001000000002F0000-000000067F000040020001000000002F4000__000000914E3F38F0 000000067F000040020001000000002F0000-000000067F000040020001000000002F4000__000000931B9A2710 000000067F000040020001000000002F4000-000000067F000040020001000000002F8000__0000005D2FFFFB38 000000067F000040020001000000002F4000-000000067F000040020001000000002F8000__00000073AD3FE6B8 000000067F000040020001000000002F4000-000000067F000040020001000000002F8000__000000914E3F38F0 
000000067F000040020001000000002F4000-000000067F000040020001000000002F8000__000000931B9A2710 000000067F000040020001000000002F63D8-030000000000000000000000000000000002__000000572A7C74A1-0000005CA7BBD6F9 000000067F000040020001000000002F8000-000000067F000040020001000000002FC000__0000005D2FFFFB38 000000067F000040020001000000002F8000-000000067F000040020001000000002FC000__00000073AD3FE6B8 000000067F000040020001000000002F8000-000000067F000040020001000000002FC000__000000914E3F38F0 000000067F000040020001000000002F8000-000000067F000040020001000000002FC000__000000931B9A2710 000000067F000040020001000000002FC000-000000067F00004002000100000000300000__0000005D2FFFFB38 000000067F000040020001000000002FC000-000000067F00004002000100000000300000__00000073AD3FE6B8 000000067F000040020001000000002FC000-000000067F00004002000100000000300000__000000914E3F38F0 000000067F000040020001000000002FC000-000000067F00004002000100000000300000__000000931B9A2710 000000067F00004002000100000000300000-000000067F00004002000100000000304000__0000005D2FFFFB38 000000067F00004002000100000000300000-000000067F00004002000100000000304000__00000073AD3FE6B8 000000067F00004002000100000000300000-000000067F00004002000100000000304000__000000914E3F38F0 000000067F00004002000100000000300000-000000067F00004002000100000000304000__000000931B9A2710 000000067F0000400200010000000030067A-000000067F0000400200010000000030903C__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000304000-000000067F00004002000100000000308000__0000005D2FFFFB38 000000067F00004002000100000000304000-000000067F00004002000100000000308000__00000073AD3FE6B8 000000067F00004002000100000000304000-000000067F00004002000100000000308000__000000914E3F38F0 000000067F00004002000100000000304000-000000067F00004002000100000000308000__000000931B9A2710 000000067F00004002000100000000308000-000000067F0000400200010000000030C000__0000005D2FFFFB38 000000067F00004002000100000000308000-000000067F0000400200010000000030C000__00000073AD3FE6B8 
000000067F00004002000100000000308000-000000067F0000400200010000000030C000__000000914E3F38F0 000000067F00004002000100000000308000-000000067F0000400200010000000030C000__000000931B9A2710 000000067F0000400200010000000030903C-000000067F00004002000100000000311A14__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000030C000-000000067F00004002000100000000310000__0000005D2FFFFB38 000000067F0000400200010000000030C000-000000067F00004002000100000000310000__00000073AD3FE6B8 000000067F0000400200010000000030C000-000000067F00004002000100000000310000__000000914E3F38F0 000000067F0000400200010000000030C000-000000067F00004002000100000000310000__000000931B9A2710 000000067F00004002000100000000310000-000000067F00004002000100000000314000__0000005D2FFFFB38 000000067F00004002000100000000310000-000000067F00004002000100000000314000__00000073AD3FE6B8 000000067F00004002000100000000310000-000000067F00004002000100000000314000__000000914E3F38F0 000000067F00004002000100000000310000-000000067F00004002000100000000314000__000000931B9A2710 000000067F00004002000100000000311A14-000000067F0000400200010000000031A404__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000314000-000000067F00004002000100000000318000__0000005D2FFFFB38 000000067F00004002000100000000314000-000000067F00004002000100000000318000__00000073AD3FE6B8 000000067F00004002000100000000314000-000000067F00004002000100000000318000__000000914E3F38F0 000000067F00004002000100000000314000-000000067F00004002000100000000318000__000000931B9A2710 000000067F00004002000100000000318000-000000067F0000400200010000000031C000__0000005D2FFFFB38 000000067F00004002000100000000318000-000000067F0000400200010000000031C000__00000073AD3FE6B8 000000067F00004002000100000000318000-000000067F0000400200010000000031C000__000000914E3F38F0 000000067F00004002000100000000318000-000000067F0000400200010000000031C000__000000931B9A2710 000000067F0000400200010000000031A404-000000067F00004002000100000000322DE1__0000005CA7BBD6F9-000000739A8D1299 
000000067F0000400200010000000031C000-000000067F00004002000100000000320000__0000005D2FFFFB38 000000067F0000400200010000000031C000-000000067F00004002000100000000320000__00000073AD3FE6B8 000000067F0000400200010000000031C000-000000067F00004002000100000000320000__000000914E3F38F0 000000067F0000400200010000000031C000-000000067F00004002000100000000320000__000000931B9A2710 000000067F00004002000100000000320000-000000067F00004002000100000000324000__0000005D2FFFFB38 000000067F00004002000100000000320000-000000067F00004002000100000000324000__00000073AD3FE6B8 000000067F00004002000100000000320000-000000067F00004002000100000000324000__000000914E3F38F0 000000067F00004002000100000000320000-000000067F00004002000100000000324000__000000931B9A2710 000000067F00004002000100000000322DE1-000000067F0000400200010000000032B7D4__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000324000-000000067F00004002000100000000328000__0000005D2FFFFB38 000000067F00004002000100000000324000-000000067F00004002000100000000328000__00000073AD3FE6B8 000000067F00004002000100000000324000-000000067F00004002000100000000328000__000000914E3F38F0 000000067F00004002000100000000324000-000000067F00004002000100000000328000__000000931B9A2710 000000067F00004002000100000000328000-000000067F0000400200010000000032C000__0000005D2FFFFB38 000000067F00004002000100000000328000-000000067F0000400200010000000032C000__00000073AD3FE6B8 000000067F00004002000100000000328000-000000067F0000400200010000000032C000__000000914E3F38F0 000000067F00004002000100000000328000-000000067F0000400200010000000032C000__000000931B9A2710 000000067F0000400200010000000032B7D4-000000067F000040020001000000003341AB__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000032C000-000000067F00004002000100000000330000__0000005D2FFFFB38 000000067F0000400200010000000032C000-000000067F00004002000100000000330000__00000073AD3FE6B8 000000067F0000400200010000000032C000-000000067F00004002000100000000330000__000000914E3F38F0 
000000067F0000400200010000000032C000-000000067F00004002000100000000330000__000000931B9A2710 000000067F00004002000100000000330000-000000067F00004002000100000000334000__0000005D2FFFFB38 000000067F00004002000100000000330000-000000067F00004002000100000000334000__00000073AD3FE6B8 000000067F00004002000100000000330000-000000067F00004002000100000000334000__000000914E3F38F0 000000067F00004002000100000000330000-000000067F00004002000100000000334000__000000931B9A2710 000000067F00004002000100000000334000-000000067F00004002000100000000338000__0000005D2FFFFB38 000000067F00004002000100000000334000-000000067F00004002000100000000338000__00000073AD3FE6B8 000000067F00004002000100000000334000-000000067F00004002000100000000338000__000000914E3F38F0 000000067F00004002000100000000334000-000000067F00004002000100000000338000__000000931B9A2710 000000067F000040020001000000003341AB-000000067F0000400200010000000033CB80__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000338000-000000067F0000400200010000000033C000__0000005D2FFFFB38 000000067F00004002000100000000338000-000000067F0000400200010000000033C000__00000073AD3FE6B8 000000067F00004002000100000000338000-000000067F0000400200010000000033C000__000000914E3F38F0 000000067F00004002000100000000338000-000000067F0000400200010000000033C000__000000931B9A2710 000000067F0000400200010000000033C000-000000067F00004002000100000000340000__0000005D2FFFFB38 000000067F0000400200010000000033C000-000000067F00004002000100000000340000__00000073AD3FE6B8 000000067F0000400200010000000033C000-000000067F00004002000100000000340000__000000914E3F38F0 000000067F0000400200010000000033C000-000000067F00004002000100000000340000__000000931B9A2710 000000067F0000400200010000000033CB80-000000067F0000400200010000000034554A__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000340000-000000067F00004002000100000000344000__00000073AD3FE6B8 000000067F00004002000100000000340000-000000067F00004002000100000000344000__000000914E3F38F0 
000000067F00004002000100000000340000-000000067F00004002000100000000344000__000000931B9A2710 000000067F00004002000100000000340000-030000000000000000000000000000000002__0000005D2FFFFB38 000000067F00004002000100000000344000-000000067F00004002000100000000348000__00000073AD3FE6B8 000000067F00004002000100000000344000-000000067F00004002000100000000348000__000000914E3F38F0 000000067F00004002000100000000344000-000000067F00004002000100000000348000__000000931B9A2710 000000067F0000400200010000000034554A-000000067F0000400200010000000034DF2D__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000348000-000000067F0000400200010000000034C000__00000073AD3FE6B8 000000067F00004002000100000000348000-000000067F0000400200010000000034C000__000000914E3F38F0 000000067F00004002000100000000348000-000000067F0000400200010000000034C000__000000931B9A2710 000000067F0000400200010000000034C000-000000067F00004002000100000000350000__00000073AD3FE6B8 000000067F0000400200010000000034C000-000000067F00004002000100000000350000__000000914E3F38F0 000000067F0000400200010000000034C000-000000067F00004002000100000000350000__000000931B9A2710 000000067F0000400200010000000034DF2D-000000067F00004002000100000000356917__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000350000-000000067F00004002000100000000354000__00000073AD3FE6B8 000000067F00004002000100000000350000-000000067F00004002000100000000354000__000000914E3F38F0 000000067F00004002000100000000350000-000000067F00004002000100000000354000__000000931B9A2710 000000067F00004002000100000000354000-000000067F00004002000100000000358000__00000073AD3FE6B8 000000067F00004002000100000000354000-000000067F00004002000100000000358000__000000914E3F38F0 000000067F00004002000100000000354000-000000067F00004002000100000000358000__000000931B9A2710 000000067F00004002000100000000356917-000000067F0000400200010000000035F303__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000358000-000000067F0000400200010000000035C000__00000073AD3FE6B8 
000000067F00004002000100000000358000-000000067F0000400200010000000035C000__000000914E3F38F0 000000067F00004002000100000000358000-000000067F0000400200010000000035C000__000000931B9A2710 000000067F0000400200010000000035C000-000000067F00004002000100000000360000__00000073AD3FE6B8 000000067F0000400200010000000035C000-000000067F00004002000100000000360000__000000914E3F38F0 000000067F0000400200010000000035C000-000000067F00004002000100000000360000__000000931B9A2710 000000067F0000400200010000000035F303-000000067F00004002000100000000367CE4__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000360000-000000067F00004002000100000000364000__00000073AD3FE6B8 000000067F00004002000100000000360000-000000067F00004002000100000000364000__000000914E3F38F0 000000067F00004002000100000000360000-000000067F00004002000100000000364000__000000931B9A2710 000000067F00004002000100000000364000-000000067F00004002000100000000368000__00000073AD3FE6B8 000000067F00004002000100000000364000-000000067F00004002000100000000368000__000000914E3F38F0 000000067F00004002000100000000364000-000000067F00004002000100000000368000__000000931B9A2710 000000067F00004002000100000000367CE4-000000067F000040020001000000003706C3__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000368000-000000067F0000400200010000000036C000__00000073AD3FE6B8 000000067F00004002000100000000368000-000000067F0000400200010000000036C000__000000914E3F38F0 000000067F00004002000100000000368000-000000067F0000400200010000000036C000__000000931B9A2710 000000067F0000400200010000000036C000-000000067F00004002000100000000370000__00000073AD3FE6B8 000000067F0000400200010000000036C000-000000067F00004002000100000000370000__000000914E3F38F0 000000067F0000400200010000000036C000-000000067F00004002000100000000370000__000000931B9A2710 000000067F00004002000100000000370000-000000067F00004002000100000000374000__00000073AD3FE6B8 000000067F00004002000100000000370000-000000067F00004002000100000000374000__000000914E3F38F0 
000000067F00004002000100000000370000-000000067F00004002000100000000374000__000000931B9A2710 000000067F000040020001000000003706C3-000000067F00004002000100000000379087__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000374000-000000067F00004002000100000000378000__00000073AD3FE6B8 000000067F00004002000100000000374000-000000067F00004002000100000000378000__000000914E3F38F0 000000067F00004002000100000000374000-000000067F00004002000100000000378000__000000931B9A2710 000000067F00004002000100000000378000-000000067F0000400200010000000037C000__00000073AD3FE6B8 000000067F00004002000100000000378000-000000067F0000400200010000000037C000__000000914E3F38F0 000000067F00004002000100000000378000-000000067F0000400200010000000037C000__000000931B9A2710 000000067F00004002000100000000379087-000000067F00004002000100000000381A53__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000037C000-000000067F00004002000100000000380000__00000073AD3FE6B8 000000067F0000400200010000000037C000-000000067F00004002000100000000380000__000000914E3F38F0 000000067F0000400200010000000037C000-000000067F00004002000100000000380000__000000931B9A2710 000000067F00004002000100000000380000-000000067F00004002000100000000384000__00000073AD3FE6B8 000000067F00004002000100000000380000-000000067F00004002000100000000384000__000000914E3F38F0 000000067F00004002000100000000380000-000000067F00004002000100000000384000__000000931B9A2710 000000067F00004002000100000000381A53-000000067F0000400200010000000038A43A__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000384000-000000067F00004002000100000000388000__00000073AD3FE6B8 000000067F00004002000100000000384000-000000067F00004002000100000000388000__000000914E3F38F0 000000067F00004002000100000000384000-000000067F00004002000100000000388000__000000931B9A2710 000000067F00004002000100000000388000-000000067F0000400200010000000038C000__00000073AD3FE6B8 000000067F00004002000100000000388000-000000067F0000400200010000000038C000__000000914E3F38F0 
000000067F00004002000100000000388000-000000067F0000400200010000000038C000__000000931B9A2710 000000067F0000400200010000000038A43A-000000067F00004002000100000000392E24__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000038C000-000000067F00004002000100000000390000__00000073AD3FE6B8 000000067F0000400200010000000038C000-000000067F00004002000100000000390000__000000914E3F38F0 000000067F0000400200010000000038C000-000000067F00004002000100000000390000__000000931B9A2710 000000067F00004002000100000000390000-000000067F00004002000100000000394000__00000073AD3FE6B8 000000067F00004002000100000000390000-000000067F00004002000100000000394000__000000914E3F38F0 000000067F00004002000100000000390000-000000067F00004002000100000000394000__000000931B9A2710 000000067F00004002000100000000392E24-000000067F0000400200010000000039B80E__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000394000-000000067F00004002000100000000398000__00000073AD3FE6B8 000000067F00004002000100000000394000-000000067F00004002000100000000398000__000000914E3F38F0 000000067F00004002000100000000394000-000000067F00004002000100000000398000__000000931B9A2710 000000067F00004002000100000000398000-000000067F0000400200010000000039C000__00000073AD3FE6B8 000000067F00004002000100000000398000-000000067F0000400200010000000039C000__000000914E3F38F0 000000067F00004002000100000000398000-000000067F0000400200010000000039C000__000000931B9A2710 000000067F0000400200010000000039B80E-000000067F000040020001000000003A41E4__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000039C000-000000067F000040020001000000003A0000__00000073AD3FE6B8 000000067F0000400200010000000039C000-000000067F000040020001000000003A0000__000000914E3F38F0 000000067F0000400200010000000039C000-000000067F000040020001000000003A0000__000000931B9A2710 000000067F000040020001000000003A0000-000000067F000040020001000000003A4000__00000073AD3FE6B8 000000067F000040020001000000003A0000-000000067F000040020001000000003A4000__000000914E3F38F0 
000000067F000040020001000000003A0000-000000067F000040020001000000003A4000__000000931B9A2710 000000067F000040020001000000003A4000-000000067F000040020001000000003A8000__00000073AD3FE6B8 000000067F000040020001000000003A4000-000000067F000040020001000000003A8000__000000914E3F38F0 000000067F000040020001000000003A4000-000000067F000040020001000000003A8000__000000931B9A2710 000000067F000040020001000000003A41E4-000000067F000040020001000000003ACBC0__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000003A8000-000000067F000040020001000000003AC000__00000073AD3FE6B8 000000067F000040020001000000003A8000-000000067F000040020001000000003AC000__000000914E3F38F0 000000067F000040020001000000003A8000-000000067F000040020001000000003AC000__000000931B9A2710 000000067F000040020001000000003AC000-000000067F000040020001000000003B0000__00000073AD3FE6B8 000000067F000040020001000000003AC000-000000067F000040020001000000003B0000__000000914E3F38F0 000000067F000040020001000000003AC000-000000067F000040020001000000003B0000__000000931B9A2710 000000067F000040020001000000003ACBC0-000000067F000040020001000000003B5581__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000003B0000-000000067F000040020001000000003B4000__00000073AD3FE6B8 000000067F000040020001000000003B0000-000000067F000040020001000000003B4000__000000914E3F38F0 000000067F000040020001000000003B0000-000000067F000040020001000000003B4000__000000931B9A2710 000000067F000040020001000000003B4000-000000067F000040020001000000003B8000__00000073AD3FE6B8 000000067F000040020001000000003B4000-000000067F000040020001000000003B8000__000000914E3F38F0 000000067F000040020001000000003B4000-000000067F000040020001000000003B8000__000000931B9A2710 000000067F000040020001000000003B5581-000000067F000040020001000000003BDF45__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000003B8000-000000067F000040020001000000003BC000__00000073AD3FE6B8 000000067F000040020001000000003B8000-000000067F000040020001000000003BC000__000000914E3F38F0 
000000067F000040020001000000003B8000-000000067F000040020001000000003BC000__000000931B9A2710 000000067F000040020001000000003BC000-000000067F000040020001000000003C0000__00000073AD3FE6B8 000000067F000040020001000000003BC000-000000067F000040020001000000003C0000__000000914E3F38F0 000000067F000040020001000000003BC000-000000067F000040020001000000003C0000__000000931B9A2710 000000067F000040020001000000003BDF45-000000067F000040020001000000003C694A__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000003C0000-000000067F000040020001000000003C4000__00000073AD3FE6B8 000000067F000040020001000000003C0000-000000067F000040020001000000003C4000__000000914E3F38F0 000000067F000040020001000000003C0000-000000067F000040020001000000003C4000__000000931B9A2710 000000067F000040020001000000003C4000-000000067F000040020001000000003C8000__00000073AD3FE6B8 000000067F000040020001000000003C4000-000000067F000040020001000000003C8000__000000914E3F38F0 000000067F000040020001000000003C4000-000000067F000040020001000000003C8000__000000931B9A2710 000000067F000040020001000000003C694A-000000067F000040020001000000003CF343__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000003C8000-000000067F000040020001000000003CC000__00000073AD3FE6B8 000000067F000040020001000000003C8000-000000067F000040020001000000003CC000__000000914E3F38F0 000000067F000040020001000000003C8000-000000067F000040020001000000003CC000__000000931B9A2710 000000067F000040020001000000003CC000-000000067F000040020001000000003D0000__00000073AD3FE6B8 000000067F000040020001000000003CC000-000000067F000040020001000000003D0000__000000914E3F38F0 000000067F000040020001000000003CC000-000000067F000040020001000000003D0000__000000931B9A2710 000000067F000040020001000000003CF343-000000067F000040020001000000003D7D31__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000003D0000-000000067F000040020001000000003D4000__00000073AD3FE6B8 000000067F000040020001000000003D0000-000000067F000040020001000000003D4000__000000914E3F38F0 
000000067F000040020001000000003D0000-000000067F000040020001000000003D4000__000000931B9A2710 000000067F000040020001000000003D4000-000000067F000040020001000000003D8000__00000073AD3FE6B8 000000067F000040020001000000003D4000-000000067F000040020001000000003D8000__000000914E3F38F0 000000067F000040020001000000003D4000-000000067F000040020001000000003D8000__000000931B9A2710 000000067F000040020001000000003D7D31-000000067F000040020001000000003E0701__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000003D8000-000000067F000040020001000000003DC000__00000073AD3FE6B8 000000067F000040020001000000003D8000-000000067F000040020001000000003DC000__000000914E3F38F0 000000067F000040020001000000003D8000-000000067F000040020001000000003DC000__000000931B9A2710 000000067F000040020001000000003DC000-000000067F000040020001000000003E0000__00000073AD3FE6B8 000000067F000040020001000000003DC000-000000067F000040020001000000003E0000__000000914E3F38F0 000000067F000040020001000000003DC000-000000067F000040020001000000003E0000__000000931B9A2710 000000067F000040020001000000003E0000-000000067F000040020001000000003E4000__00000073AD3FE6B8 000000067F000040020001000000003E0000-000000067F000040020001000000003E4000__000000914E3F38F0 000000067F000040020001000000003E0000-000000067F000040020001000000003E4000__000000931B9A2710 000000067F000040020001000000003E0701-000000067F000040020001000000003E90C9__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000003E4000-000000067F000040020001000000003E8000__00000073AD3FE6B8 000000067F000040020001000000003E4000-000000067F000040020001000000003E8000__000000914E3F38F0 000000067F000040020001000000003E4000-000000067F000040020001000000003E8000__000000931B9A2710 000000067F000040020001000000003E8000-000000067F000040020001000000003EC000__00000073AD3FE6B8 000000067F000040020001000000003E8000-000000067F000040020001000000003EC000__000000914E3F38F0 000000067F000040020001000000003E8000-000000067F000040020001000000003EC000__000000931B9A2710 
000000067F000040020001000000003E90C9-000000067F000040020001000000003F1A8D__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000003EC000-000000067F000040020001000000003F0000__00000073AD3FE6B8 000000067F000040020001000000003EC000-000000067F000040020001000000003F0000__000000914E3F38F0 000000067F000040020001000000003EC000-000000067F000040020001000000003F0000__000000931B9A2710 000000067F000040020001000000003F0000-000000067F000040020001000000003F4000__00000073AD3FE6B8 000000067F000040020001000000003F0000-000000067F000040020001000000003F4000__000000914E3F38F0 000000067F000040020001000000003F0000-000000067F000040020001000000003F4000__000000931B9A2710 000000067F000040020001000000003F1A8D-000000067F000040020001000000003FA45C__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000003F4000-000000067F000040020001000000003F8000__00000073AD3FE6B8 000000067F000040020001000000003F4000-000000067F000040020001000000003F8000__000000914E3F38F0 000000067F000040020001000000003F4000-000000067F000040020001000000003F8000__000000931B9A2710 000000067F000040020001000000003F8000-000000067F000040020001000000003FC000__00000073AD3FE6B8 000000067F000040020001000000003F8000-000000067F000040020001000000003FC000__000000914E3F38F0 000000067F000040020001000000003F8000-000000067F000040020001000000003FC000__000000931B9A2710 000000067F000040020001000000003FA45C-000000067F00004002000100000000402E54__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000003FC000-000000067F00004002000100000000400000__00000073AD3FE6B8 000000067F000040020001000000003FC000-000000067F00004002000100000000400000__000000914E3F38F0 000000067F000040020001000000003FC000-000000067F00004002000100000000400000__000000931B9A2710 000000067F00004002000100000000400000-000000067F00004002000100000000404000__00000073AD3FE6B8 000000067F00004002000100000000400000-000000067F00004002000100000000404000__000000914E3F38F0 000000067F00004002000100000000400000-000000067F00004002000100000000404000__000000931B9A2710 
000000067F00004002000100000000402E54-000000067F0000400200010000000040B84B__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000404000-000000067F00004002000100000000408000__00000073AD3FE6B8 000000067F00004002000100000000404000-000000067F00004002000100000000408000__000000914E3F38F0 000000067F00004002000100000000404000-000000067F00004002000100000000408000__000000931B9A2710 000000067F00004002000100000000408000-000000067F0000400200010000000040C000__00000073AD3FE6B8 000000067F00004002000100000000408000-000000067F0000400200010000000040C000__000000914E3F38F0 000000067F00004002000100000000408000-000000067F0000400200010000000040C000__000000931B9A2710 000000067F0000400200010000000040B84B-000000067F00004002000100000000414230__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000040C000-000000067F00004002000100000000410000__00000073AD3FE6B8 000000067F0000400200010000000040C000-000000067F00004002000100000000410000__000000914E3F38F0 000000067F0000400200010000000040C000-000000067F00004002000100000000410000__000000931B9A2710 000000067F00004002000100000000410000-000000067F00004002000100000000414000__00000073AD3FE6B8 000000067F00004002000100000000410000-000000067F00004002000100000000414000__000000914E3F38F0 000000067F00004002000100000000410000-000000067F00004002000100000000414000__000000931B9A2710 000000067F00004002000100000000414000-000000067F00004002000100000000418000__00000073AD3FE6B8 000000067F00004002000100000000414000-000000067F00004002000100000000418000__000000914E3F38F0 000000067F00004002000100000000414000-000000067F00004002000100000000418000__000000931B9A2710 000000067F00004002000100000000414230-000000067F0000400200010000000041CC01__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000418000-000000067F0000400200010000000041C000__00000073AD3FE6B8 000000067F00004002000100000000418000-000000067F0000400200010000000041C000__000000914E3F38F0 000000067F00004002000100000000418000-000000067F0000400200010000000041C000__000000931B9A2710 
000000067F0000400200010000000041C000-000000067F00004002000100000000420000__00000073AD3FE6B8 000000067F0000400200010000000041C000-000000067F00004002000100000000420000__000000914E3F38F0 000000067F0000400200010000000041C000-000000067F00004002000100000000420000__000000931B9A2710 000000067F0000400200010000000041CC01-000000067F000040020001000000004255BE__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000420000-000000067F00004002000100000000424000__00000073AD3FE6B8 000000067F00004002000100000000420000-000000067F00004002000100000000424000__000000914E3F38F0 000000067F00004002000100000000420000-000000067F00004002000100000000424000__000000931B9A2710 000000067F00004002000100000000424000-000000067F00004002000100000000428000__00000073AD3FE6B8 000000067F00004002000100000000424000-000000067F00004002000100000000428000__000000914E3F38F0 000000067F00004002000100000000424000-000000067F00004002000100000000428000__000000931B9A2710 000000067F000040020001000000004255BE-000000067F0000400200010000000042DF85__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000428000-000000067F0000400200010000000042C000__00000073AD3FE6B8 000000067F00004002000100000000428000-000000067F0000400200010000000042C000__000000914E3F38F0 000000067F00004002000100000000428000-000000067F0000400200010000000042C000__000000931B9A2710 000000067F0000400200010000000042C000-000000067F00004002000100000000430000__00000073AD3FE6B8 000000067F0000400200010000000042C000-000000067F00004002000100000000430000__000000914E3F38F0 000000067F0000400200010000000042C000-000000067F00004002000100000000430000__000000931B9A2710 000000067F0000400200010000000042DF85-000000067F00004002000100000000436961__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000430000-000000067F00004002000100000000434000__00000073AD3FE6B8 000000067F00004002000100000000430000-000000067F00004002000100000000434000__000000914E3F38F0 000000067F00004002000100000000430000-000000067F00004002000100000000434000__000000931B9A2710 
000000067F00004002000100000000434000-000000067F00004002000100000000438000__00000073AD3FE6B8 000000067F00004002000100000000434000-000000067F00004002000100000000438000__000000914E3F38F0 000000067F00004002000100000000434000-000000067F00004002000100000000438000__000000931B9A2710 000000067F00004002000100000000436961-000000067F0000400200010000000043F354__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000438000-000000067F0000400200010000000043C000__00000073AD3FE6B8 000000067F00004002000100000000438000-000000067F0000400200010000000043C000__000000914E3F38F0 000000067F00004002000100000000438000-000000067F0000400200010000000043C000__000000931B9A2710 000000067F0000400200010000000043C000-000000067F00004002000100000000440000__00000073AD3FE6B8 000000067F0000400200010000000043C000-000000067F00004002000100000000440000__000000914E3F38F0 000000067F0000400200010000000043C000-000000067F00004002000100000000440000__000000931B9A2710 000000067F0000400200010000000043F354-000000067F00004002000100000000447D42__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000440000-000000067F00004002000100000000444000__00000073AD3FE6B8 000000067F00004002000100000000440000-000000067F00004002000100000000444000__000000914E3F38F0 000000067F00004002000100000000440000-000000067F00004002000100000000444000__000000931B9A2710 000000067F00004002000100000000444000-000000067F00004002000100000000448000__00000073AD3FE6B8 000000067F00004002000100000000444000-000000067F00004002000100000000448000__000000914E3F38F0 000000067F00004002000100000000444000-000000067F00004002000100000000448000__000000931B9A2710 000000067F00004002000100000000447D42-000000067F00004002000100000000450730__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000448000-000000067F0000400200010000000044C000__00000073AD3FE6B8 000000067F00004002000100000000448000-000000067F0000400200010000000044C000__000000914E3F38F0 000000067F00004002000100000000448000-000000067F0000400200010000000044C000__000000931B9A2710 
000000067F0000400200010000000044C000-000000067F00004002000100000000450000__00000073AD3FE6B8 000000067F0000400200010000000044C000-000000067F00004002000100000000450000__000000914E3F38F0 000000067F0000400200010000000044C000-000000067F00004002000100000000450000__000000931B9A2710 000000067F00004002000100000000450000-000000067F00004002000100000000454000__00000073AD3FE6B8 000000067F00004002000100000000450000-000000067F00004002000100000000454000__000000914E3F38F0 000000067F00004002000100000000450000-000000067F00004002000100000000454000__000000931B9A2710 000000067F00004002000100000000450730-000000067F00004002000100000000459116__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000454000-000000067F00004002000100000000458000__00000073AD3FE6B8 000000067F00004002000100000000454000-000000067F00004002000100000000458000__000000914E3F38F0 000000067F00004002000100000000454000-000000067F00004002000100000000458000__000000931B9A2710 000000067F00004002000100000000458000-000000067F0000400200010000000045C000__00000073AD3FE6B8 000000067F00004002000100000000458000-000000067F0000400200010000000045C000__000000914E3F38F0 000000067F00004002000100000000458000-000000067F0000400200010000000045C000__000000931B9A2710 000000067F00004002000100000000459116-000000067F00004002000100000000461ACC__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000045C000-000000067F00004002000100000000460000__00000073AD3FE6B8 000000067F0000400200010000000045C000-000000067F00004002000100000000460000__000000914E3F38F0 000000067F0000400200010000000045C000-000000067F00004002000100000000460000__000000931B9A2710 000000067F00004002000100000000460000-000000067F00004002000100000000464000__00000073AD3FE6B8 000000067F00004002000100000000460000-000000067F00004002000100000000464000__000000914E3F38F0 000000067F00004002000100000000460000-000000067F00004002000100000000464000__000000931B9A2710 000000067F00004002000100000000461ACC-000000067F0000400200010000000046A495__0000005CA7BBD6F9-000000739A8D1299 
000000067F00004002000100000000464000-000000067F00004002000100000000468000__00000073AD3FE6B8 000000067F00004002000100000000464000-000000067F00004002000100000000468000__000000914E3F38F0 000000067F00004002000100000000464000-000000067F00004002000100000000468000__000000931B9A2710 000000067F00004002000100000000468000-000000067F0000400200010000000046C000__00000073AD3FE6B8 000000067F00004002000100000000468000-000000067F0000400200010000000046C000__000000914E3F38F0 000000067F00004002000100000000468000-000000067F0000400200010000000046C000__000000931B9A2710 000000067F0000400200010000000046A495-000000067F00004002000100000000472E71__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000046C000-000000067F00004002000100000000470000__00000073AD3FE6B8 000000067F0000400200010000000046C000-000000067F00004002000100000000470000__000000914E3F38F0 000000067F0000400200010000000046C000-000000067F00004002000100000000470000__000000931B9A2710 000000067F00004002000100000000470000-000000067F00004002000100000000474000__00000073AD3FE6B8 000000067F00004002000100000000470000-000000067F00004002000100000000474000__000000914E3F38F0 000000067F00004002000100000000470000-000000067F00004002000100000000474000__000000931B9A2710 000000067F00004002000100000000472E71-000000067F0000400200010000000047B85E__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000474000-000000067F00004002000100000000478000__00000073AD3FE6B8 000000067F00004002000100000000474000-000000067F00004002000100000000478000__000000914E3F38F0 000000067F00004002000100000000474000-000000067F00004002000100000000478000__000000931B9A2710 000000067F00004002000100000000478000-000000067F0000400200010000000047C000__00000073AD3FE6B8 000000067F00004002000100000000478000-000000067F0000400200010000000047C000__000000914E3F38F0 000000067F00004002000100000000478000-000000067F0000400200010000000047C000__000000931B9A2710 000000067F0000400200010000000047B85E-000000067F0000400200010000000048424F__0000005CA7BBD6F9-000000739A8D1299 
000000067F0000400200010000000047C000-000000067F00004002000100000000480000__00000073AD3FE6B8 000000067F0000400200010000000047C000-000000067F00004002000100000000480000__000000914E3F38F0 000000067F0000400200010000000047C000-000000067F00004002000100000000480000__000000931B9A2710 000000067F00004002000100000000480000-000000067F00004002000100000000484000__00000073AD3FE6B8 000000067F00004002000100000000480000-000000067F00004002000100000000484000__000000914E3F38F0 000000067F00004002000100000000480000-000000067F00004002000100000000484000__000000931B9A2710 000000067F00004002000100000000484000-000000067F00004002000100000000488000__00000073AD3FE6B8 000000067F00004002000100000000484000-000000067F00004002000100000000488000__000000914E3F38F0 000000067F00004002000100000000484000-000000067F00004002000100000000488000__000000931B9A2710 000000067F0000400200010000000048424F-000000067F0000400200010000000048CC2F__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000488000-000000067F0000400200010000000048C000__00000073AD3FE6B8 000000067F00004002000100000000488000-000000067F0000400200010000000048C000__000000914E3F38F0 000000067F00004002000100000000488000-000000067F0000400200010000000048C000__000000931B9A2710 000000067F0000400200010000000048C000-000000067F00004002000100000000490000__00000073AD3FE6B8 000000067F0000400200010000000048C000-000000067F00004002000100000000490000__000000914E3F38F0 000000067F0000400200010000000048C000-000000067F00004002000100000000490000__000000931B9A2710 000000067F0000400200010000000048CC2F-000000067F00004002000100000000495603__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000490000-000000067F00004002000100000000494000__00000073AD3FE6B8 000000067F00004002000100000000490000-000000067F00004002000100000000494000__000000914E3F38F0 000000067F00004002000100000000490000-000000067F00004002000100000000494000__000000931B9A2710 000000067F00004002000100000000494000-000000067F00004002000100000000498000__00000073AD3FE6B8 
000000067F00004002000100000000494000-000000067F00004002000100000000498000__000000914E3F38F0 000000067F00004002000100000000494000-000000067F00004002000100000000498000__000000931B9A2710 000000067F00004002000100000000495603-000000067F0000400200010000000049DFC0__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000498000-000000067F0000400200010000000049C000__00000073AD3FE6B8 000000067F00004002000100000000498000-000000067F0000400200010000000049C000__000000914E3F38F0 000000067F00004002000100000000498000-000000067F0000400200010000000049C000__000000931B9A2710 000000067F0000400200010000000049C000-000000067F000040020001000000004A0000__00000073AD3FE6B8 000000067F0000400200010000000049C000-000000067F000040020001000000004A0000__000000914E3F38F0 000000067F0000400200010000000049C000-000000067F000040020001000000004A0000__000000931B9A2710 000000067F0000400200010000000049DFC0-000000067F000040020001000000004A698B__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000004A0000-000000067F000040020001000000004A4000__00000073AD3FE6B8 000000067F000040020001000000004A0000-000000067F000040020001000000004A4000__000000914E3F38F0 000000067F000040020001000000004A0000-000000067F000040020001000000004A4000__000000931B9A2710 000000067F000040020001000000004A4000-000000067F000040020001000000004A8000__00000073AD3FE6B8 000000067F000040020001000000004A4000-000000067F000040020001000000004A8000__000000914E3F38F0 000000067F000040020001000000004A4000-000000067F000040020001000000004A8000__000000931B9A2710 000000067F000040020001000000004A698B-000000067F000040020001000000004AF374__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000004A8000-000000067F000040020001000000004AC000__00000073AD3FE6B8 000000067F000040020001000000004A8000-000000067F000040020001000000004AC000__000000914E3F38F0 000000067F000040020001000000004A8000-000000067F000040020001000000004AC000__000000931B9A2710 000000067F000040020001000000004AC000-000000067F000040020001000000004B0000__00000073AD3FE6B8 
000000067F000040020001000000004AC000-000000067F000040020001000000004B0000__000000914E3F38F0 000000067F000040020001000000004AC000-000000067F000040020001000000004B0000__000000931B9A2710 000000067F000040020001000000004AF374-000000067F000040020001000000004B7D7B__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000004B0000-000000067F000040020001000000004B4000__00000073AD3FE6B8 000000067F000040020001000000004B0000-000000067F000040020001000000004B4000__000000914E3F38F0 000000067F000040020001000000004B0000-000000067F000040020001000000004B4000__000000931B9A2710 000000067F000040020001000000004B4000-000000067F000040020001000000004B8000__00000073AD3FE6B8 000000067F000040020001000000004B4000-000000067F000040020001000000004B8000__000000914E3F38F0 000000067F000040020001000000004B4000-000000067F000040020001000000004B8000__000000931B9A2710 000000067F000040020001000000004B7D7B-000000067F000040020001000000004C0764__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000004B8000-000000067F000040020001000000004BC000__00000073AD3FE6B8 000000067F000040020001000000004B8000-000000067F000040020001000000004BC000__000000914E3F38F0 000000067F000040020001000000004B8000-000000067F000040020001000000004BC000__000000931B9A2710 000000067F000040020001000000004BC000-000000067F000040020001000000004C0000__00000073AD3FE6B8 000000067F000040020001000000004BC000-000000067F000040020001000000004C0000__000000914E3F38F0 000000067F000040020001000000004BC000-000000067F000040020001000000004C0000__000000931B9A2710 000000067F000040020001000000004C0000-000000067F000040020001000000004C4000__00000073AD3FE6B8 000000067F000040020001000000004C0000-000000067F000040020001000000004C4000__000000914E3F38F0 000000067F000040020001000000004C0000-000000067F000040020001000000004C4000__000000931B9A2710 000000067F000040020001000000004C0764-000000067F000040020001000000004C9146__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000004C4000-000000067F000040020001000000004C8000__00000073AD3FE6B8 
000000067F000040020001000000004C4000-000000067F000040020001000000004C8000__000000914E3F38F0 000000067F000040020001000000004C4000-000000067F000040020001000000004C8000__000000931B9A2710 000000067F000040020001000000004C8000-000000067F000040020001000000004CC000__00000073AD3FE6B8 000000067F000040020001000000004C8000-000000067F000040020001000000004CC000__000000914E3F38F0 000000067F000040020001000000004C8000-000000067F000040020001000000004CC000__000000931B9A2710 000000067F000040020001000000004C9146-000000067F000040020001000000004D1B16__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000004CC000-000000067F000040020001000000004D0000__00000073AD3FE6B8 000000067F000040020001000000004CC000-000000067F000040020001000000004D0000__000000914E3F38F0 000000067F000040020001000000004CC000-000000067F000040020001000000004D0000__000000931B9A2710 000000067F000040020001000000004D0000-000000067F000040020001000000004D4000__00000073AD3FE6B8 000000067F000040020001000000004D0000-000000067F000040020001000000004D4000__000000914E3F38F0 000000067F000040020001000000004D0000-000000067F000040020001000000004D4000__000000931B9A2710 000000067F000040020001000000004D1B16-000000067F000040020001000000004DA4D9__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000004D4000-000000067F000040020001000000004D8000__00000073AD3FE6B8 000000067F000040020001000000004D4000-000000067F000040020001000000004D8000__000000914E3F38F0 000000067F000040020001000000004D4000-000000067F000040020001000000004D8000__000000931B9A2710 000000067F000040020001000000004D8000-000000067F000040020001000000004DC000__00000073AD3FE6B8 000000067F000040020001000000004D8000-000000067F000040020001000000004DC000__000000914E3F38F0 000000067F000040020001000000004D8000-000000067F000040020001000000004DC000__000000931B9A2710 000000067F000040020001000000004DA4D9-000000067F000040020001000000004E2EAB__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000004DC000-000000067F000040020001000000004E0000__00000073AD3FE6B8 
000000067F000040020001000000004DC000-000000067F000040020001000000004E0000__000000914E3F38F0 000000067F000040020001000000004DC000-000000067F000040020001000000004E0000__000000931B9A2710 000000067F000040020001000000004E0000-000000067F000040020001000000004E4000__00000073AD3FE6B8 000000067F000040020001000000004E0000-000000067F000040020001000000004E4000__000000914E3F38F0 000000067F000040020001000000004E0000-000000067F000040020001000000004E4000__000000931B9A2710 000000067F000040020001000000004E2EAB-000000067F000040020001000000004EB89B__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000004E4000-000000067F000040020001000000004E8000__00000073AD3FE6B8 000000067F000040020001000000004E4000-000000067F000040020001000000004E8000__000000914E3F38F0 000000067F000040020001000000004E4000-000000067F000040020001000000004E8000__000000931B9A2710 000000067F000040020001000000004E8000-000000067F000040020001000000004EC000__00000073AD3FE6B8 000000067F000040020001000000004E8000-000000067F000040020001000000004EC000__000000914E3F38F0 000000067F000040020001000000004E8000-000000067F000040020001000000004EC000__000000931B9A2710 000000067F000040020001000000004EB89B-000000067F000040020001000000004F428A__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000004EC000-000000067F000040020001000000004F0000__00000073AD3FE6B8 000000067F000040020001000000004EC000-000000067F000040020001000000004F0000__000000914E3F38F0 000000067F000040020001000000004EC000-000000067F000040020001000000004F0000__000000931B9A2710 000000067F000040020001000000004F0000-000000067F000040020001000000004F4000__00000073AD3FE6B8 000000067F000040020001000000004F0000-000000067F000040020001000000004F4000__000000914E3F38F0 000000067F000040020001000000004F0000-000000067F000040020001000000004F4000__000000931B9A2710 000000067F000040020001000000004F4000-000000067F000040020001000000004F8000__00000073AD3FE6B8 000000067F000040020001000000004F4000-000000067F000040020001000000004F8000__000000914E3F38F0 
000000067F000040020001000000004F4000-000000067F000040020001000000004F8000__000000931B9A2710 000000067F000040020001000000004F428A-000000067F000040020001000000004FCC78__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000004F8000-000000067F000040020001000000004FC000__00000073AD3FE6B8 000000067F000040020001000000004F8000-000000067F000040020001000000004FC000__000000914E3F38F0 000000067F000040020001000000004F8000-000000067F000040020001000000004FC000__000000931B9A2710 000000067F000040020001000000004FC000-000000067F00004002000100000000500000__00000073AD3FE6B8 000000067F000040020001000000004FC000-000000067F00004002000100000000500000__000000914E3F38F0 000000067F000040020001000000004FC000-000000067F00004002000100000000500000__000000931B9A2710 000000067F000040020001000000004FCC78-000000067F00004002000100000000505659__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000500000-000000067F00004002000100000000504000__00000073AD3FE6B8 000000067F00004002000100000000500000-000000067F00004002000100000000504000__000000914E3F38F0 000000067F00004002000100000000500000-000000067F00004002000100000000504000__000000931B9A2710 000000067F00004002000100000000504000-000000067F00004002000100000000508000__00000073AD3FE6B8 000000067F00004002000100000000504000-000000067F00004002000100000000508000__000000914E3F38F0 000000067F00004002000100000000504000-000000067F00004002000100000000508000__000000931B9A2710 000000067F00004002000100000000505659-000000067F0000400200010000000050E02B__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000508000-000000067F0000400200010000000050C000__00000073AD3FE6B8 000000067F00004002000100000000508000-000000067F0000400200010000000050C000__000000914E3F38F0 000000067F00004002000100000000508000-000000067F0000400200010000000050C000__000000931B9A2710 000000067F0000400200010000000050C000-000000067F00004002000100000000510000__00000073AD3FE6B8 000000067F0000400200010000000050C000-000000067F00004002000100000000510000__000000914E3F38F0 
000000067F0000400200010000000050C000-000000067F00004002000100000000510000__000000931B9A2710 000000067F0000400200010000000050E02B-000000067F000040020001000000005169EF__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000510000-000000067F00004002000100000000514000__00000073AD3FE6B8 000000067F00004002000100000000510000-000000067F00004002000100000000514000__000000914E3F38F0 000000067F00004002000100000000510000-000000067F00004002000100000000514000__000000931B9A2710 000000067F00004002000100000000514000-000000067F00004002000100000000518000__00000073AD3FE6B8 000000067F00004002000100000000514000-000000067F00004002000100000000518000__000000914E3F38F0 000000067F00004002000100000000514000-000000067F00004002000100000000518000__000000931B9A2710 000000067F000040020001000000005169EF-000000067F0000400200010000000051F3BA__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000518000-000000067F0000400200010000000051C000__00000073AD3FE6B8 000000067F00004002000100000000518000-000000067F0000400200010000000051C000__000000914E3F38F0 000000067F00004002000100000000518000-000000067F0000400200010000000051C000__000000931B9A2710 000000067F0000400200010000000051C000-000000067F00004002000100000000520000__00000073AD3FE6B8 000000067F0000400200010000000051C000-000000067F00004002000100000000520000__000000914E3F38F0 000000067F0000400200010000000051C000-000000067F00004002000100000000520000__000000931B9A2710 000000067F0000400200010000000051F3BA-000000067F00004002000100000000527DAC__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000520000-000000067F00004002000100000000524000__00000073AD3FE6B8 000000067F00004002000100000000520000-000000067F00004002000100000000524000__000000914E3F38F0 000000067F00004002000100000000520000-000000067F00004002000100000000524000__000000931B9A2710 000000067F00004002000100000000524000-000000067F00004002000100000000528000__00000073AD3FE6B8 000000067F00004002000100000000524000-000000067F00004002000100000000528000__000000914E3F38F0 
000000067F00004002000100000000524000-000000067F00004002000100000000528000__000000931B9A2710 000000067F00004002000100000000527DAC-000000067F0000400200010000000053079E__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000528000-000000067F0000400200010000000052C000__00000073AD3FE6B8 000000067F00004002000100000000528000-000000067F0000400200010000000052C000__000000914E3F38F0 000000067F00004002000100000000528000-000000067F0000400200010000000052C000__000000931B9A2710 000000067F0000400200010000000052C000-000000067F00004002000100000000530000__00000073AD3FE6B8 000000067F0000400200010000000052C000-000000067F00004002000100000000530000__000000914E3F38F0 000000067F0000400200010000000052C000-000000067F00004002000100000000530000__000000931B9A2710 000000067F00004002000100000000530000-000000067F00004002000100000000534000__00000073AD3FE6B8 000000067F00004002000100000000530000-000000067F00004002000100000000534000__000000914E3F38F0 000000067F00004002000100000000530000-000000067F00004002000100000000534000__000000931B9A2710 000000067F0000400200010000000053079E-000000067F00004002000100000000539198__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000534000-000000067F00004002000100000000538000__00000073AD3FE6B8 000000067F00004002000100000000534000-000000067F00004002000100000000538000__000000914E3F38F0 000000067F00004002000100000000534000-000000067F00004002000100000000538000__000000931B9A2710 000000067F00004002000100000000538000-000000067F0000400200010000000053C000__00000073AD3FE6B8 000000067F00004002000100000000538000-000000067F0000400200010000000053C000__000000914E3F38F0 000000067F00004002000100000000538000-000000067F0000400200010000000053C000__000000931B9A2710 000000067F00004002000100000000539198-000000067F00004002000100000000541B6B__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000053C000-000000067F00004002000100000000540000__00000073AD3FE6B8 000000067F0000400200010000000053C000-000000067F00004002000100000000540000__000000914E3F38F0 
000000067F0000400200010000000053C000-000000067F00004002000100000000540000__000000931B9A2710 000000067F00004002000100000000540000-000000067F00004002000100000000544000__00000073AD3FE6B8 000000067F00004002000100000000540000-000000067F00004002000100000000544000__000000914E3F38F0 000000067F00004002000100000000540000-000000067F00004002000100000000544000__000000931B9A2710 000000067F00004002000100000000541B6B-000000067F0000400200010000000054A544__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000544000-000000067F00004002000100000000548000__00000073AD3FE6B8 000000067F00004002000100000000544000-000000067F00004002000100000000548000__000000914E3F38F0 000000067F00004002000100000000544000-000000067F00004002000100000000548000__000000931B9A2710 000000067F00004002000100000000548000-000000067F0000400200010000000054C000__00000073AD3FE6B8 000000067F00004002000100000000548000-000000067F0000400200010000000054C000__000000914E3F38F0 000000067F00004002000100000000548000-000000067F0000400200010000000054C000__000000931B9A2710 000000067F0000400200010000000054A544-000000067F00004002000100000000552F06__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000054C000-000000067F00004002000100000000550000__00000073AD3FE6B8 000000067F0000400200010000000054C000-000000067F00004002000100000000550000__000000914E3F38F0 000000067F0000400200010000000054C000-000000067F00004002000100000000550000__000000931B9A2710 000000067F00004002000100000000550000-000000067F00004002000100000000554000__00000073AD3FE6B8 000000067F00004002000100000000550000-000000067F00004002000100000000554000__000000914E3F38F0 000000067F00004002000100000000550000-000000067F00004002000100000000554000__000000931B9A2710 000000067F00004002000100000000552F06-000000067F0000400200010000000055B8C8__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000554000-000000067F00004002000100000000558000__00000073AD3FE6B8 000000067F00004002000100000000554000-000000067F00004002000100000000558000__000000914E3F38F0 
000000067F00004002000100000000554000-000000067F00004002000100000000558000__000000931B9A2710 000000067F00004002000100000000558000-000000067F0000400200010000000055C000__00000073AD3FE6B8 000000067F00004002000100000000558000-000000067F0000400200010000000055C000__000000914E3F38F0 000000067F00004002000100000000558000-000000067F0000400200010000000055C000__000000931B9A2710 000000067F0000400200010000000055B8C8-000000067F000040020001000000005642BF__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000055C000-000000067F00004002000100000000560000__00000073AD3FE6B8 000000067F0000400200010000000055C000-000000067F00004002000100000000560000__000000914E3F38F0 000000067F0000400200010000000055C000-000000067F00004002000100000000560000__000000931B9A2710 000000067F00004002000100000000560000-000000067F00004002000100000000564000__00000073AD3FE6B8 000000067F00004002000100000000560000-000000067F00004002000100000000564000__000000914E3F38F0 000000067F00004002000100000000560000-000000067F00004002000100000000564000__000000931B9A2710 000000067F00004002000100000000564000-000000067F00004002000100000000568000__00000073AD3FE6B8 000000067F00004002000100000000564000-000000067F00004002000100000000568000__000000914E3F38F0 000000067F00004002000100000000564000-000000067F00004002000100000000568000__000000931B9A2710 000000067F000040020001000000005642BF-000000067F0000400200010000000056CCB6__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000568000-000000067F0000400200010000000056C000__00000073AD3FE6B8 000000067F00004002000100000000568000-000000067F0000400200010000000056C000__000000914E3F38F0 000000067F00004002000100000000568000-000000067F0000400200010000000056C000__000000931B9A2710 000000067F0000400200010000000056C000-000000067F00004002000100000000570000__00000073AD3FE6B8 000000067F0000400200010000000056C000-000000067F00004002000100000000570000__000000914E3F38F0 000000067F0000400200010000000056C000-000000067F00004002000100000000570000__000000931B9A2710 
000000067F0000400200010000000056CCB6-000000067F000040020001000000005756A1__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000570000-000000067F00004002000100000000574000__00000073AD3FE6B8 000000067F00004002000100000000570000-000000067F00004002000100000000574000__000000914E3F38F0 000000067F00004002000100000000570000-000000067F00004002000100000000574000__000000931B9A2710 000000067F00004002000100000000574000-000000067F00004002000100000000578000__00000073AD3FE6B8 000000067F00004002000100000000574000-000000067F00004002000100000000578000__000000914E3F38F0 000000067F00004002000100000000574000-000000067F00004002000100000000578000__000000931B9A2710 000000067F000040020001000000005756A1-000000067F0000400200010000000057E077__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000578000-000000067F0000400200010000000057C000__00000073AD3FE6B8 000000067F00004002000100000000578000-000000067F0000400200010000000057C000__000000914E3F38F0 000000067F00004002000100000000578000-000000067F0000400200010000000057C000__000000931B9A2710 000000067F0000400200010000000057C000-000000067F00004002000100000000580000__00000073AD3FE6B8 000000067F0000400200010000000057C000-000000067F00004002000100000000580000__000000914E3F38F0 000000067F0000400200010000000057C000-000000067F00004002000100000000580000__000000931B9A2710 000000067F0000400200010000000057E077-000000067F00004002000100000000586A4E__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000580000-000000067F00004002000100000000584000__00000073AD3FE6B8 000000067F00004002000100000000580000-000000067F00004002000100000000584000__000000914E3F38F0 000000067F00004002000100000000580000-000000067F00004002000100000000584000__000000931B9A2710 000000067F00004002000100000000584000-000000067F00004002000100000000588000__00000073AD3FE6B8 000000067F00004002000100000000584000-000000067F00004002000100000000588000__000000914E3F38F0 000000067F00004002000100000000584000-000000067F00004002000100000000588000__000000931B9A2710 
000000067F00004002000100000000586A4E-000000067F0000400200010000000058F415__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000588000-000000067F0000400200010000000058C000__00000073AD3FE6B8 000000067F00004002000100000000588000-000000067F0000400200010000000058C000__000000914E3F38F0 000000067F00004002000100000000588000-000000067F0000400200010000000058C000__000000931B9A2710 000000067F0000400200010000000058C000-000000067F00004002000100000000590000__00000073AD3FE6B8 000000067F0000400200010000000058C000-000000067F00004002000100000000590000__000000914E3F38F0 000000067F0000400200010000000058C000-000000067F00004002000100000000590000__000000931B9A2710 000000067F0000400200010000000058F415-000000067F00004002000100000000597DDF__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000590000-000000067F00004002000100000000594000__00000073AD3FE6B8 000000067F00004002000100000000590000-000000067F00004002000100000000594000__000000914E3F38F0 000000067F00004002000100000000590000-000000067F00004002000100000000594000__000000931B9A2710 000000067F00004002000100000000594000-000000067F00004002000100000000598000__00000073AD3FE6B8 000000067F00004002000100000000594000-000000067F00004002000100000000598000__000000914E3F38F0 000000067F00004002000100000000594000-000000067F00004002000100000000598000__000000931B9A2710 000000067F00004002000100000000597DDF-000000067F000040020001000000005A07CE__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000598000-000000067F0000400200010000000059C000__00000073AD3FE6B8 000000067F00004002000100000000598000-000000067F0000400200010000000059C000__000000914E3F38F0 000000067F00004002000100000000598000-000000067F0000400200010000000059C000__000000931B9A2710 000000067F0000400200010000000059C000-000000067F000040020001000000005A0000__00000073AD3FE6B8 000000067F0000400200010000000059C000-000000067F000040020001000000005A0000__000000914E3F38F0 000000067F0000400200010000000059C000-000000067F000040020001000000005A0000__000000931B9A2710 
000000067F000040020001000000005A0000-000000067F000040020001000000005A4000__00000073AD3FE6B8 000000067F000040020001000000005A0000-000000067F000040020001000000005A4000__000000914E3F38F0 000000067F000040020001000000005A0000-000000067F000040020001000000005A4000__000000931B9A2710 000000067F000040020001000000005A07CE-000000067F000040020001000000005A91C9__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000005A4000-000000067F000040020001000000005A8000__00000073AD3FE6B8 000000067F000040020001000000005A4000-000000067F000040020001000000005A8000__000000914E3F38F0 000000067F000040020001000000005A4000-000000067F000040020001000000005A8000__000000931B9A2710 000000067F000040020001000000005A8000-000000067F000040020001000000005AC000__00000073AD3FE6B8 000000067F000040020001000000005A8000-000000067F000040020001000000005AC000__000000914E3F38F0 000000067F000040020001000000005A8000-000000067F000040020001000000005AC000__000000931B9A2710 000000067F000040020001000000005A91C9-000000067F000040020001000000005B1BB6__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000005AC000-000000067F000040020001000000005B0000__00000073AD3FE6B8 000000067F000040020001000000005AC000-000000067F000040020001000000005B0000__000000914E3F38F0 000000067F000040020001000000005AC000-000000067F000040020001000000005B0000__000000931B9A2710 000000067F000040020001000000005B0000-000000067F000040020001000000005B4000__00000073AD3FE6B8 000000067F000040020001000000005B0000-000000067F000040020001000000005B4000__000000914E3F38F0 000000067F000040020001000000005B0000-000000067F000040020001000000005B4000__000000931B9A2710 000000067F000040020001000000005B1BB6-000000067F000040020001000000005BA58F__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000005B4000-000000067F000040020001000000005B8000__00000073AD3FE6B8 000000067F000040020001000000005B4000-000000067F000040020001000000005B8000__000000914E3F38F0 000000067F000040020001000000005B4000-000000067F000040020001000000005B8000__000000931B9A2710 
000000067F000040020001000000005B8000-000000067F000040020001000000005BC000__00000073AD3FE6B8 000000067F000040020001000000005B8000-000000067F000040020001000000005BC000__000000914E3F38F0 000000067F000040020001000000005B8000-000000067F000040020001000000005BC000__000000931B9A2710 000000067F000040020001000000005BA58F-000000067F000040020001000000005C2F60__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000005BC000-000000067F000040020001000000005C0000__00000073AD3FE6B8 000000067F000040020001000000005BC000-000000067F000040020001000000005C0000__000000914E3F38F0 000000067F000040020001000000005BC000-000000067F000040020001000000005C0000__000000931B9A2710 000000067F000040020001000000005C0000-000000067F000040020001000000005C4000__00000073AD3FE6B8 000000067F000040020001000000005C0000-000000067F000040020001000000005C4000__000000914E3F38F0 000000067F000040020001000000005C0000-000000067F000040020001000000005C4000__000000931B9A2710 000000067F000040020001000000005C2F60-000000067F000040020001000000005CB925__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000005C4000-000000067F000040020001000000005C8000__00000073AD3FE6B8 000000067F000040020001000000005C4000-000000067F000040020001000000005C8000__000000914E3F38F0 000000067F000040020001000000005C4000-000000067F000040020001000000005C8000__000000931B9A2710 000000067F000040020001000000005C8000-000000067F000040020001000000005CC000__00000073AD3FE6B8 000000067F000040020001000000005C8000-000000067F000040020001000000005CC000__000000914E3F38F0 000000067F000040020001000000005C8000-000000067F000040020001000000005CC000__000000931B9A2710 000000067F000040020001000000005CB925-000000067F000040020001000000005D42F3__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000005CC000-000000067F000040020001000000005D0000__00000073AD3FE6B8 000000067F000040020001000000005CC000-000000067F000040020001000000005D0000__000000914E3F38F0 000000067F000040020001000000005CC000-000000067F000040020001000000005D0000__000000931B9A2710 
000000067F000040020001000000005D0000-000000067F000040020001000000005D4000__00000073AD3FE6B8 000000067F000040020001000000005D0000-000000067F000040020001000000005D4000__000000914E3F38F0 000000067F000040020001000000005D0000-000000067F000040020001000000005D4000__000000931B9A2710 000000067F000040020001000000005D4000-000000067F000040020001000000005D8000__00000073AD3FE6B8 000000067F000040020001000000005D4000-000000067F000040020001000000005D8000__000000914E3F38F0 000000067F000040020001000000005D4000-000000067F000040020001000000005D8000__000000931B9A2710 000000067F000040020001000000005D42F3-000000067F000040020001000000005DCCE4__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000005D8000-000000067F000040020001000000005DC000__00000073AD3FE6B8 000000067F000040020001000000005D8000-000000067F000040020001000000005DC000__000000914E3F38F0 000000067F000040020001000000005D8000-000000067F000040020001000000005DC000__000000931B9A2710 000000067F000040020001000000005DC000-000000067F000040020001000000005E0000__00000073AD3FE6B8 000000067F000040020001000000005DC000-000000067F000040020001000000005E0000__000000914E3F38F0 000000067F000040020001000000005DC000-000000067F000040020001000000005E0000__000000931B9A2710 000000067F000040020001000000005DCCE4-000000067F000040020001000000005E56DD__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000005E0000-000000067F000040020001000000005E4000__00000073AD3FE6B8 000000067F000040020001000000005E0000-000000067F000040020001000000005E4000__000000914E3F38F0 000000067F000040020001000000005E0000-000000067F000040020001000000005E4000__000000931B9A2710 000000067F000040020001000000005E4000-000000067F000040020001000000005E8000__00000073AD3FE6B8 000000067F000040020001000000005E4000-000000067F000040020001000000005E8000__000000914E3F38F0 000000067F000040020001000000005E4000-000000067F000040020001000000005E8000__000000931B9A2710 000000067F000040020001000000005E56DD-000000067F000040020001000000005EE0C5__0000005CA7BBD6F9-000000739A8D1299 
000000067F000040020001000000005E8000-000000067F000040020001000000005EC000__00000073AD3FE6B8 000000067F000040020001000000005E8000-000000067F000040020001000000005EC000__000000914E3F38F0 000000067F000040020001000000005E8000-000000067F000040020001000000005EC000__000000931B9A2710 000000067F000040020001000000005EC000-000000067F000040020001000000005F0000__00000073AD3FE6B8 000000067F000040020001000000005EC000-000000067F000040020001000000005F0000__000000914E3F38F0 000000067F000040020001000000005EC000-000000067F000040020001000000005F0000__000000931B9A2710 000000067F000040020001000000005EE0C5-000000067F000040020001000000005F6AA8__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000005F0000-000000067F000040020001000000005F4000__00000073AD3FE6B8 000000067F000040020001000000005F0000-000000067F000040020001000000005F4000__000000914E3F38F0 000000067F000040020001000000005F0000-000000067F000040020001000000005F4000__000000931B9A2710 000000067F000040020001000000005F4000-000000067F000040020001000000005F8000__00000073AD3FE6B8 000000067F000040020001000000005F4000-000000067F000040020001000000005F8000__000000914E3F38F0 000000067F000040020001000000005F4000-000000067F000040020001000000005F8000__000000931B9A2710 000000067F000040020001000000005F6AA8-000000067F000040020001000000005FF476__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000005F8000-000000067F000040020001000000005FC000__00000073AD3FE6B8 000000067F000040020001000000005F8000-000000067F000040020001000000005FC000__000000914E3F38F0 000000067F000040020001000000005F8000-000000067F000040020001000000005FC000__000000931B9A2710 000000067F000040020001000000005FC000-000000067F00004002000100000000600000__00000073AD3FE6B8 000000067F000040020001000000005FC000-000000067F00004002000100000000600000__000000914E3F38F0 000000067F000040020001000000005FC000-000000067F00004002000100000000600000__000000931B9A2710 000000067F000040020001000000005FF476-000000067F00004002000100000000607E40__0000005CA7BBD6F9-000000739A8D1299 
000000067F00004002000100000000600000-000000067F00004002000100000000604000__00000073AD3FE6B8 000000067F00004002000100000000600000-000000067F00004002000100000000604000__000000914E3F38F0 000000067F00004002000100000000600000-000000067F00004002000100000000604000__000000931B9A2710 000000067F00004002000100000000604000-000000067F00004002000100000000608000__00000073AD3FE6B8 000000067F00004002000100000000604000-000000067F00004002000100000000608000__000000914E3F38F0 000000067F00004002000100000000604000-000000067F00004002000100000000608000__000000931B9A2710 000000067F00004002000100000000607E40-000000067F0000400200010000000061081B__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000608000-000000067F0000400200010000000060C000__00000073AD3FE6B8 000000067F00004002000100000000608000-000000067F0000400200010000000060C000__000000914E3F38F0 000000067F00004002000100000000608000-000000067F0000400200010000000060C000__000000931B9A2710 000000067F0000400200010000000060C000-000000067F00004002000100000000610000__00000073AD3FE6B8 000000067F0000400200010000000060C000-000000067F00004002000100000000610000__000000914E3F38F0 000000067F0000400200010000000060C000-000000067F00004002000100000000610000__000000931B9A2710 000000067F00004002000100000000610000-000000067F00004002000100000000614000__00000073AD3FE6B8 000000067F00004002000100000000610000-000000067F00004002000100000000614000__000000914E3F38F0 000000067F00004002000100000000610000-000000067F00004002000100000000614000__000000931B9A2710 000000067F0000400200010000000061081B-000000067F000040020001000000006191FB__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000614000-000000067F00004002000100000000618000__00000073AD3FE6B8 000000067F00004002000100000000614000-000000067F00004002000100000000618000__000000914E3F38F0 000000067F00004002000100000000614000-000000067F00004002000100000000618000__000000931B9A2710 000000067F00004002000100000000618000-000000067F0000400200010000000061C000__00000073AD3FE6B8 
000000067F00004002000100000000618000-000000067F0000400200010000000061C000__000000914E3F38F0 000000067F00004002000100000000618000-000000067F0000400200010000000061C000__000000931B9A2710 000000067F000040020001000000006191FB-000000067F00004002000100000000621BEC__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000061C000-000000067F00004002000100000000620000__00000073AD3FE6B8 000000067F0000400200010000000061C000-000000067F00004002000100000000620000__000000914E3F38F0 000000067F0000400200010000000061C000-000000067F00004002000100000000620000__000000931B9A2710 000000067F00004002000100000000620000-000000067F00004002000100000000624000__00000073AD3FE6B8 000000067F00004002000100000000620000-000000067F00004002000100000000624000__000000914E3F38F0 000000067F00004002000100000000620000-000000067F00004002000100000000624000__000000931B9A2710 000000067F00004002000100000000621BEC-000000067F0000400200010000000062A5D2__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000624000-000000067F00004002000100000000628000__00000073AD3FE6B8 000000067F00004002000100000000624000-000000067F00004002000100000000628000__000000914E3F38F0 000000067F00004002000100000000624000-000000067F00004002000100000000628000__000000931B9A2710 000000067F00004002000100000000628000-000000067F0000400200010000000062C000__00000073AD3FE6B8 000000067F00004002000100000000628000-000000067F0000400200010000000062C000__000000914E3F38F0 000000067F00004002000100000000628000-000000067F0000400200010000000062C000__000000931B9A2710 000000067F0000400200010000000062A5D2-000000067F00004002000100000000632FB1__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000062C000-000000067F00004002000100000000630000__00000073AD3FE6B8 000000067F0000400200010000000062C000-000000067F00004002000100000000630000__000000914E3F38F0 000000067F0000400200010000000062C000-000000067F00004002000100000000630000__000000931B9A2710 000000067F00004002000100000000630000-000000067F00004002000100000000634000__00000073AD3FE6B8 
000000067F00004002000100000000630000-000000067F00004002000100000000634000__000000914E3F38F0 000000067F00004002000100000000630000-000000067F00004002000100000000634000__000000931B9A2710 000000067F00004002000100000000632FB1-000000067F0000400200010000000063B985__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000634000-000000067F00004002000100000000638000__00000073AD3FE6B8 000000067F00004002000100000000634000-000000067F00004002000100000000638000__000000914E3F38F0 000000067F00004002000100000000634000-000000067F00004002000100000000638000__000000931B9A2710 000000067F00004002000100000000638000-000000067F0000400200010000000063C000__00000073AD3FE6B8 000000067F00004002000100000000638000-000000067F0000400200010000000063C000__000000914E3F38F0 000000067F00004002000100000000638000-000000067F0000400200010000000063C000__000000931B9A2710 000000067F0000400200010000000063B985-000000067F00004002000100000000644349__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000063C000-000000067F00004002000100000000640000__00000073AD3FE6B8 000000067F0000400200010000000063C000-000000067F00004002000100000000640000__000000914E3F38F0 000000067F0000400200010000000063C000-000000067F00004002000100000000640000__000000931B9A2710 000000067F00004002000100000000640000-000000067F00004002000100000000644000__00000073AD3FE6B8 000000067F00004002000100000000640000-000000067F00004002000100000000644000__000000914E3F38F0 000000067F00004002000100000000640000-000000067F00004002000100000000644000__000000931B9A2710 000000067F00004002000100000000644000-000000067F00004002000100000000648000__00000073AD3FE6B8 000000067F00004002000100000000644000-000000067F00004002000100000000648000__000000914E3F38F0 000000067F00004002000100000000644000-000000067F00004002000100000000648000__000000931B9A2710 000000067F00004002000100000000644349-000000067F0000400200010000000064CD2B__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000648000-000000067F0000400200010000000064C000__00000073AD3FE6B8 
000000067F00004002000100000000648000-000000067F0000400200010000000064C000__000000914E3F38F0 000000067F00004002000100000000648000-000000067F0000400200010000000064C000__000000931B9A2710 000000067F0000400200010000000064C000-000000067F00004002000100000000650000__00000073AD3FE6B8 000000067F0000400200010000000064C000-000000067F00004002000100000000650000__000000914E3F38F0 000000067F0000400200010000000064C000-000000067F00004002000100000000650000__000000931B9A2710 000000067F0000400200010000000064CD2B-000000067F00004002000100000000655712__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000650000-000000067F00004002000100000000654000__00000073AD3FE6B8 000000067F00004002000100000000650000-000000067F00004002000100000000654000__000000914E3F38F0 000000067F00004002000100000000650000-000000067F00004002000100000000654000__000000931B9A2710 000000067F00004002000100000000654000-000000067F00004002000100000000658000__00000073AD3FE6B8 000000067F00004002000100000000654000-000000067F00004002000100000000658000__000000914E3F38F0 000000067F00004002000100000000654000-000000067F00004002000100000000658000__000000931B9A2710 000000067F00004002000100000000655712-000000067F0000400200010000000065E0F3__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000658000-000000067F0000400200010000000065C000__00000073AD3FE6B8 000000067F00004002000100000000658000-000000067F0000400200010000000065C000__000000914E3F38F0 000000067F00004002000100000000658000-000000067F0000400200010000000065C000__000000931B9A2710 000000067F0000400200010000000065C000-000000067F00004002000100000000660000__00000073AD3FE6B8 000000067F0000400200010000000065C000-000000067F00004002000100000000660000__000000914E3F38F0 000000067F0000400200010000000065C000-000000067F00004002000100000000660000__000000931B9A2710 000000067F0000400200010000000065E0F3-000000067F00004002000100000000666AE2__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000660000-000000067F00004002000100000000664000__00000073AD3FE6B8 
000000067F00004002000100000000660000-000000067F00004002000100000000664000__000000914E3F38F0 000000067F00004002000100000000660000-000000067F00004002000100000000664000__000000931B9A2710 000000067F00004002000100000000664000-000000067F00004002000100000000668000__00000073AD3FE6B8 000000067F00004002000100000000664000-000000067F00004002000100000000668000__000000914E3F38F0 000000067F00004002000100000000664000-000000067F00004002000100000000668000__000000931B9A2710 000000067F00004002000100000000666AE2-000000067F0000400200010000000066F4B5__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000668000-000000067F0000400200010000000066C000__00000073AD3FE6B8 000000067F00004002000100000000668000-000000067F0000400200010000000066C000__000000914E3F38F0 000000067F00004002000100000000668000-000000067F0000400200010000000066C000__000000931B9A2710 000000067F0000400200010000000066C000-000000067F00004002000100000000670000__00000073AD3FE6B8 000000067F0000400200010000000066C000-000000067F00004002000100000000670000__000000914E3F38F0 000000067F0000400200010000000066C000-000000067F00004002000100000000670000__000000931B9A2710 000000067F0000400200010000000066F4B5-000000067F00004002000100000000677E81__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000670000-000000067F00004002000100000000674000__00000073AD3FE6B8 000000067F00004002000100000000670000-000000067F00004002000100000000674000__000000914E3F38F0 000000067F00004002000100000000670000-000000067F00004002000100000000674000__000000931B9A2710 000000067F00004002000100000000674000-000000067F00004002000100000000678000__00000073AD3FE6B8 000000067F00004002000100000000674000-000000067F00004002000100000000678000__000000914E3F38F0 000000067F00004002000100000000674000-000000067F00004002000100000000678000__000000931B9A2710 000000067F00004002000100000000677E81-000000067F0000400200010000000068083C__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000678000-000000067F0000400200010000000067C000__00000073AD3FE6B8 
000000067F00004002000100000000678000-000000067F0000400200010000000067C000__000000914E3F38F0 000000067F00004002000100000000678000-000000067F0000400200010000000067C000__000000931B9A2710 000000067F0000400200010000000067C000-000000067F00004002000100000000680000__00000073AD3FE6B8 000000067F0000400200010000000067C000-000000067F00004002000100000000680000__000000914E3F38F0 000000067F0000400200010000000067C000-000000067F00004002000100000000680000__000000931B9A2710 000000067F00004002000100000000680000-000000067F00004002000100000000684000__00000073AD3FE6B8 000000067F00004002000100000000680000-000000067F00004002000100000000684000__000000914E3F38F0 000000067F00004002000100000000680000-000000067F00004002000100000000684000__000000931B9A2710 000000067F0000400200010000000068083C-000000067F00004002000100000000689223__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000684000-000000067F00004002000100000000688000__00000073AD3FE6B8 000000067F00004002000100000000684000-000000067F00004002000100000000688000__000000914E3F38F0 000000067F00004002000100000000684000-000000067F00004002000100000000688000__000000931B9A2710 000000067F00004002000100000000688000-000000067F0000400200010000000068C000__00000073AD3FE6B8 000000067F00004002000100000000688000-000000067F0000400200010000000068C000__000000914E3F38F0 000000067F00004002000100000000688000-000000067F0000400200010000000068C000__000000931B9A2710 000000067F00004002000100000000689223-000000067F00004002000100000000691C08__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000068C000-000000067F00004002000100000000690000__00000073AD3FE6B8 000000067F0000400200010000000068C000-000000067F00004002000100000000690000__000000914E3F38F0 000000067F0000400200010000000068C000-000000067F00004002000100000000690000__000000931B9A2710 000000067F00004002000100000000690000-000000067F00004002000100000000694000__00000073AD3FE6B8 000000067F00004002000100000000690000-000000067F00004002000100000000694000__000000914E3F38F0 
000000067F00004002000100000000690000-000000067F00004002000100000000694000__000000931B9A2710 000000067F00004002000100000000691C08-000000067F0000400200010000000069A5E4__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000694000-000000067F00004002000100000000698000__00000073AD3FE6B8 000000067F00004002000100000000694000-000000067F00004002000100000000698000__000000914E3F38F0 000000067F00004002000100000000694000-000000067F00004002000100000000698000__000000931B9A2710 000000067F00004002000100000000698000-000000067F0000400200010000000069C000__00000073AD3FE6B8 000000067F00004002000100000000698000-000000067F0000400200010000000069C000__000000914E3F38F0 000000067F00004002000100000000698000-000000067F0000400200010000000069C000__000000931B9A2710 000000067F0000400200010000000069A5E4-000000067F000040020001000000006A2FC5__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000069C000-000000067F000040020001000000006A0000__00000073AD3FE6B8 000000067F0000400200010000000069C000-000000067F000040020001000000006A0000__000000914E3F38F0 000000067F0000400200010000000069C000-000000067F000040020001000000006A0000__000000931B9A2710 000000067F000040020001000000006A0000-000000067F000040020001000000006A4000__00000073AD3FE6B8 000000067F000040020001000000006A0000-000000067F000040020001000000006A4000__000000914E3F38F0 000000067F000040020001000000006A0000-000000067F000040020001000000006A4000__000000931B9A2710 000000067F000040020001000000006A2FC5-000000067F000040020001000000006AB99F__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000006A4000-000000067F000040020001000000006A8000__00000073AD3FE6B8 000000067F000040020001000000006A4000-000000067F000040020001000000006A8000__000000914E3F38F0 000000067F000040020001000000006A4000-000000067F000040020001000000006A8000__000000931B9A2710 000000067F000040020001000000006A8000-000000067F000040020001000000006AC000__00000073AD3FE6B8 000000067F000040020001000000006A8000-000000067F000040020001000000006AC000__000000914E3F38F0 
000000067F000040020001000000006A8000-000000067F000040020001000000006AC000__000000931B9A2710 000000067F000040020001000000006AB99F-000000067F000040020001000000006B4375__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000006AC000-000000067F000040020001000000006B0000__00000073AD3FE6B8 000000067F000040020001000000006AC000-000000067F000040020001000000006B0000__000000914E3F38F0 000000067F000040020001000000006AC000-000000067F000040020001000000006B0000__000000931B9A2710 000000067F000040020001000000006B0000-000000067F000040020001000000006B4000__00000073AD3FE6B8 000000067F000040020001000000006B0000-000000067F000040020001000000006B4000__000000914E3F38F0 000000067F000040020001000000006B0000-000000067F000040020001000000006B4000__000000931B9A2710 000000067F000040020001000000006B4000-000000067F000040020001000000006B8000__00000073AD3FE6B8 000000067F000040020001000000006B4000-000000067F000040020001000000006B8000__000000914E3F38F0 000000067F000040020001000000006B4000-000000067F000040020001000000006B8000__000000931B9A2710 000000067F000040020001000000006B4375-000000067F000040020001000000006BCD3D__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000006B8000-000000067F000040020001000000006BC000__00000073AD3FE6B8 000000067F000040020001000000006B8000-000000067F000040020001000000006BC000__000000914E3F38F0 000000067F000040020001000000006B8000-000000067F000040020001000000006BC000__000000931B9A2710 000000067F000040020001000000006BC000-000000067F000040020001000000006C0000__00000073AD3FE6B8 000000067F000040020001000000006BC000-000000067F000040020001000000006C0000__000000914E3F38F0 000000067F000040020001000000006BC000-000000067F000040020001000000006C0000__000000931B9A2710 000000067F000040020001000000006BCD3D-000000067F000040020001000000006C571E__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000006C0000-000000067F000040020001000000006C4000__00000073AD3FE6B8 000000067F000040020001000000006C0000-000000067F000040020001000000006C4000__000000914E3F38F0 
000000067F000040020001000000006C0000-000000067F000040020001000000006C4000__000000931B9A2710 000000067F000040020001000000006C4000-000000067F000040020001000000006C8000__00000073AD3FE6B8 000000067F000040020001000000006C4000-000000067F000040020001000000006C8000__000000914E3F38F0 000000067F000040020001000000006C4000-000000067F000040020001000000006C8000__000000931B9A2710 000000067F000040020001000000006C571E-000000067F000040020001000000006CE101__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000006C8000-000000067F000040020001000000006CC000__00000073AD3FE6B8 000000067F000040020001000000006C8000-000000067F000040020001000000006CC000__000000914E3F38F0 000000067F000040020001000000006C8000-000000067F000040020001000000006CC000__000000931B9A2710 000000067F000040020001000000006CC000-000000067F000040020001000000006D0000__00000073AD3FE6B8 000000067F000040020001000000006CC000-000000067F000040020001000000006D0000__000000914E3F38F0 000000067F000040020001000000006CC000-000000067F000040020001000000006D0000__000000931B9A2710 000000067F000040020001000000006CE101-000000067F000040020001000000006D6AD7__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000006D0000-000000067F000040020001000000006D4000__00000073AD3FE6B8 000000067F000040020001000000006D0000-000000067F000040020001000000006D4000__000000914E3F38F0 000000067F000040020001000000006D0000-000000067F000040020001000000006D4000__000000931B9A2710 000000067F000040020001000000006D4000-000000067F000040020001000000006D8000__00000073AD3FE6B8 000000067F000040020001000000006D4000-000000067F000040020001000000006D8000__000000914E3F38F0 000000067F000040020001000000006D4000-000000067F000040020001000000006D8000__000000931B9A2710 000000067F000040020001000000006D6AD7-000000067F000040020001000000006DF4B3__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000006D8000-000000067F000040020001000000006DC000__00000073AD3FE6B8 000000067F000040020001000000006D8000-000000067F000040020001000000006DC000__000000914E3F38F0 
000000067F000040020001000000006D8000-000000067F000040020001000000006DC000__000000931B9A2710 000000067F000040020001000000006DC000-000000067F000040020001000000006E0000__00000073AD3FE6B8 000000067F000040020001000000006DC000-000000067F000040020001000000006E0000__000000914E3F38F0 000000067F000040020001000000006DC000-000000067F000040020001000000006E0000__000000931B9A2710 000000067F000040020001000000006DF4B3-000000067F000040020001000000006E7E8D__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000006E0000-000000067F000040020001000000006E4000__00000073AD3FE6B8 000000067F000040020001000000006E0000-000000067F000040020001000000006E4000__000000914E3F38F0 000000067F000040020001000000006E0000-000000067F000040020001000000006E4000__000000931B9A2710 000000067F000040020001000000006E4000-000000067F000040020001000000006E8000__00000073AD3FE6B8 000000067F000040020001000000006E4000-000000067F000040020001000000006E8000__000000914E3F38F0 000000067F000040020001000000006E4000-000000067F000040020001000000006E8000__000000931B9A2710 000000067F000040020001000000006E7E8D-000000067F000040020001000000006F0867__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000006E8000-000000067F000040020001000000006EC000__00000073AD3FE6B8 000000067F000040020001000000006E8000-000000067F000040020001000000006EC000__000000914E3F38F0 000000067F000040020001000000006E8000-000000067F000040020001000000006EC000__000000931B9A2710 000000067F000040020001000000006EC000-000000067F000040020001000000006F0000__00000073AD3FE6B8 000000067F000040020001000000006EC000-000000067F000040020001000000006F0000__000000914E3F38F0 000000067F000040020001000000006EC000-000000067F000040020001000000006F0000__000000931B9A2710 000000067F000040020001000000006F0000-000000067F000040020001000000006F4000__00000073AD3FE6B8 000000067F000040020001000000006F0000-000000067F000040020001000000006F4000__000000914E3F38F0 000000067F000040020001000000006F0000-000000067F000040020001000000006F4000__000000931B9A2710 
000000067F000040020001000000006F0867-000000067F000040020001000000006F923B__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000006F4000-000000067F000040020001000000006F8000__00000073AD3FE6B8 000000067F000040020001000000006F4000-000000067F000040020001000000006F8000__000000914E3F38F0 000000067F000040020001000000006F4000-000000067F000040020001000000006F8000__000000931B9A2710 000000067F000040020001000000006F8000-000000067F000040020001000000006FC000__00000073AD3FE6B8 000000067F000040020001000000006F8000-000000067F000040020001000000006FC000__000000914E3F38F0 000000067F000040020001000000006F8000-000000067F000040020001000000006FC000__000000931B9A2710 000000067F000040020001000000006F923B-000000067F00004002000100000000701C1C__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000006FC000-000000067F00004002000100000000700000__00000073AD3FE6B8 000000067F000040020001000000006FC000-000000067F00004002000100000000700000__000000914E3F38F0 000000067F000040020001000000006FC000-000000067F00004002000100000000700000__000000931B9A2710 000000067F00004002000100000000700000-000000067F00004002000100000000704000__00000073AD3FE6B8 000000067F00004002000100000000700000-000000067F00004002000100000000704000__000000914E3F38F0 000000067F00004002000100000000700000-000000067F00004002000100000000704000__000000931B9A2710 000000067F00004002000100000000701C1C-000000067F0000400200010000000070A601__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000704000-000000067F00004002000100000000708000__00000073AD3FE6B8 000000067F00004002000100000000704000-000000067F00004002000100000000708000__000000914E3F38F0 000000067F00004002000100000000704000-000000067F00004002000100000000708000__000000931B9A2710 000000067F00004002000100000000708000-000000067F0000400200010000000070C000__00000073AD3FE6B8 000000067F00004002000100000000708000-000000067F0000400200010000000070C000__000000914E3F38F0 000000067F00004002000100000000708000-000000067F0000400200010000000070C000__000000931B9A2710 
000000067F0000400200010000000070A601-000000067F00004002000100000000712FD4__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000070C000-000000067F00004002000100000000710000__00000073AD3FE6B8 000000067F0000400200010000000070C000-000000067F00004002000100000000710000__000000914E3F38F0 000000067F0000400200010000000070C000-000000067F00004002000100000000710000__000000931B9A2710 000000067F00004002000100000000710000-000000067F00004002000100000000714000__00000073AD3FE6B8 000000067F00004002000100000000710000-000000067F00004002000100000000714000__000000914E3F38F0 000000067F00004002000100000000710000-000000067F00004002000100000000714000__000000931B9A2710 000000067F00004002000100000000712FD4-000000067F0000400200010000000071B9B4__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000714000-000000067F00004002000100000000718000__00000073AD3FE6B8 000000067F00004002000100000000714000-000000067F00004002000100000000718000__000000914E3F38F0 000000067F00004002000100000000714000-000000067F00004002000100000000718000__000000931B9A2710 000000067F00004002000100000000718000-000000067F0000400200010000000071C000__00000073AD3FE6B8 000000067F00004002000100000000718000-000000067F0000400200010000000071C000__000000914E3F38F0 000000067F00004002000100000000718000-000000067F0000400200010000000071C000__000000931B9A2710 000000067F0000400200010000000071B9B4-000000067F00004002000100000000724391__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000071C000-000000067F00004002000100000000720000__00000073AD3FE6B8 000000067F0000400200010000000071C000-000000067F00004002000100000000720000__000000914E3F38F0 000000067F0000400200010000000071C000-000000067F00004002000100000000720000__000000931B9A2710 000000067F00004002000100000000720000-000000067F00004002000100000000724000__00000073AD3FE6B8 000000067F00004002000100000000720000-000000067F00004002000100000000724000__000000914E3F38F0 000000067F00004002000100000000720000-000000067F00004002000100000000724000__000000931B9A2710 
000000067F00004002000100000000724000-000000067F00004002000100000000728000__00000073AD3FE6B8 000000067F00004002000100000000724000-000000067F00004002000100000000728000__000000914E3F38F0 000000067F00004002000100000000724000-000000067F00004002000100000000728000__000000931B9A2710 000000067F00004002000100000000724391-000000067F0000400200010000000072CD55__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000728000-000000067F0000400200010000000072C000__00000073AD3FE6B8 000000067F00004002000100000000728000-000000067F0000400200010000000072C000__000000914E3F38F0 000000067F00004002000100000000728000-000000067F0000400200010000000072C000__000000931B9A2710 000000067F0000400200010000000072C000-000000067F00004002000100000000730000__00000073AD3FE6B8 000000067F0000400200010000000072C000-000000067F00004002000100000000730000__000000914E3F38F0 000000067F0000400200010000000072C000-000000067F00004002000100000000730000__000000931B9A2710 000000067F0000400200010000000072CD55-000000067F00004002000100000000735725__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000730000-000000067F00004002000100000000734000__00000073AD3FE6B8 000000067F00004002000100000000730000-000000067F00004002000100000000734000__000000914E3F38F0 000000067F00004002000100000000730000-000000067F00004002000100000000734000__000000931B9A2710 000000067F00004002000100000000734000-000000067F00004002000100000000738000__00000073AD3FE6B8 000000067F00004002000100000000734000-000000067F00004002000100000000738000__000000914E3F38F0 000000067F00004002000100000000734000-000000067F00004002000100000000738000__000000931B9A2710 000000067F00004002000100000000735725-000000067F0000400200010000000073E109__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000738000-000000067F0000400200010000000073C000__00000073AD3FE6B8 000000067F00004002000100000000738000-000000067F0000400200010000000073C000__000000914E3F38F0 000000067F00004002000100000000738000-000000067F0000400200010000000073C000__000000931B9A2710 
000000067F0000400200010000000073C000-000000067F00004002000100000000740000__00000073AD3FE6B8 000000067F0000400200010000000073C000-000000067F00004002000100000000740000__000000914E3F38F0 000000067F0000400200010000000073C000-000000067F00004002000100000000740000__000000931B9A2710 000000067F0000400200010000000073E109-000000067F00004002000100000000746AE4__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000740000-000000067F00004002000100000000744000__00000073AD3FE6B8 000000067F00004002000100000000740000-000000067F00004002000100000000744000__000000914E3F38F0 000000067F00004002000100000000740000-000000067F00004002000100000000744000__000000931B9A2710 000000067F00004002000100000000744000-000000067F00004002000100000000748000__00000073AD3FE6B8 000000067F00004002000100000000744000-000000067F00004002000100000000748000__000000914E3F38F0 000000067F00004002000100000000744000-000000067F00004002000100000000748000__000000931B9A2710 000000067F00004002000100000000746AE4-000000067F0000400200010000000074F4C9__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000748000-000000067F0000400200010000000074C000__00000073AD3FE6B8 000000067F00004002000100000000748000-000000067F0000400200010000000074C000__000000914E3F38F0 000000067F00004002000100000000748000-000000067F0000400200010000000074C000__000000931B9A2710 000000067F0000400200010000000074C000-000000067F00004002000100000000750000__00000073AD3FE6B8 000000067F0000400200010000000074C000-000000067F00004002000100000000750000__000000914E3F38F0 000000067F0000400200010000000074C000-000000067F00004002000100000000750000__000000931B9A2710 000000067F0000400200010000000074F4C9-000000067F00004002000100000000757E9F__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000750000-000000067F00004002000100000000754000__00000073AD3FE6B8 000000067F00004002000100000000750000-000000067F00004002000100000000754000__000000914E3F38F0 000000067F00004002000100000000750000-000000067F00004002000100000000754000__000000931B9A2710 
000000067F00004002000100000000754000-000000067F00004002000100000000758000__00000073AD3FE6B8 000000067F00004002000100000000754000-000000067F00004002000100000000758000__000000914E3F38F0 000000067F00004002000100000000754000-000000067F00004002000100000000758000__000000931B9A2710 000000067F00004002000100000000757E9F-000000067F00004002000100000000760874__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000758000-000000067F0000400200010000000075C000__00000073AD3FE6B8 000000067F00004002000100000000758000-000000067F0000400200010000000075C000__000000914E3F38F0 000000067F00004002000100000000758000-000000067F0000400200010000000075C000__000000931B9A2710 000000067F0000400200010000000075C000-000000067F00004002000100000000760000__00000073AD3FE6B8 000000067F0000400200010000000075C000-000000067F00004002000100000000760000__000000914E3F38F0 000000067F0000400200010000000075C000-000000067F00004002000100000000760000__000000931B9A2710 000000067F00004002000100000000760000-000000067F00004002000100000000764000__00000073AD3FE6B8 000000067F00004002000100000000760000-000000067F00004002000100000000764000__000000914E3F38F0 000000067F00004002000100000000760000-000000067F00004002000100000000764000__000000931B9A2710 000000067F00004002000100000000760874-000000067F0000400200010000000076924C__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000764000-000000067F00004002000100000000768000__00000073AD3FE6B8 000000067F00004002000100000000764000-000000067F00004002000100000000768000__000000914E3F38F0 000000067F00004002000100000000764000-000000067F00004002000100000000768000__000000931B9A2710 000000067F00004002000100000000768000-000000067F0000400200010000000076C000__00000073AD3FE6B8 000000067F00004002000100000000768000-000000067F0000400200010000000076C000__000000914E3F38F0 000000067F00004002000100000000768000-000000067F0000400200010000000076C000__000000931B9A2710 000000067F0000400200010000000076924C-000000067F00004002000100000000771C36__0000005CA7BBD6F9-000000739A8D1299 
000000067F0000400200010000000076C000-000000067F00004002000100000000770000__00000073AD3FE6B8 000000067F0000400200010000000076C000-000000067F00004002000100000000770000__000000914E3F38F0 000000067F0000400200010000000076C000-000000067F00004002000100000000770000__000000931B9A2710 000000067F00004002000100000000770000-000000067F00004002000100000000774000__00000073AD3FE6B8 000000067F00004002000100000000770000-000000067F00004002000100000000774000__000000914E3F38F0 000000067F00004002000100000000770000-000000067F00004002000100000000774000__000000931B9A2710 000000067F00004002000100000000771C36-000000067F0000400200010000000077A601__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000774000-000000067F00004002000100000000778000__00000073AD3FE6B8 000000067F00004002000100000000774000-000000067F00004002000100000000778000__000000914E3F38F0 000000067F00004002000100000000774000-000000067F00004002000100000000778000__000000931B9A2710 000000067F00004002000100000000778000-000000067F0000400200010000000077C000__00000073AD3FE6B8 000000067F00004002000100000000778000-000000067F0000400200010000000077C000__000000914E3F38F0 000000067F00004002000100000000778000-000000067F0000400200010000000077C000__000000931B9A2710 000000067F0000400200010000000077A601-000000067F00004002000100000000782FCF__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000077C000-000000067F00004002000100000000780000__00000073AD3FE6B8 000000067F0000400200010000000077C000-000000067F00004002000100000000780000__000000914E3F38F0 000000067F0000400200010000000077C000-000000067F00004002000100000000780000__000000931B9A2710 000000067F00004002000100000000780000-000000067F00004002000100000000784000__00000073AD3FE6B8 000000067F00004002000100000000780000-000000067F00004002000100000000784000__000000914E3F38F0 000000067F00004002000100000000780000-000000067F00004002000100000000784000__000000931B9A2710 000000067F00004002000100000000782FCF-000000067F0000400200010000000078B9BA__0000005CA7BBD6F9-000000739A8D1299 
000000067F00004002000100000000784000-000000067F00004002000100000000788000__00000073AD3FE6B8 000000067F00004002000100000000784000-000000067F00004002000100000000788000__000000914E3F38F0 000000067F00004002000100000000784000-000000067F00004002000100000000788000__000000931B9A2710 000000067F00004002000100000000788000-000000067F0000400200010000000078C000__00000073AD3FE6B8 000000067F00004002000100000000788000-000000067F0000400200010000000078C000__000000914E3F38F0 000000067F00004002000100000000788000-000000067F0000400200010000000078C000__000000931B9A2710 000000067F0000400200010000000078B9BA-000000067F0000400200010000000079439A__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000078C000-000000067F00004002000100000000790000__00000073AD3FE6B8 000000067F0000400200010000000078C000-000000067F00004002000100000000790000__000000914E3F38F0 000000067F0000400200010000000078C000-000000067F00004002000100000000790000__000000931B9A2710 000000067F00004002000100000000790000-000000067F00004002000100000000794000__00000073AD3FE6B8 000000067F00004002000100000000790000-000000067F00004002000100000000794000__000000914E3F38F0 000000067F00004002000100000000790000-000000067F00004002000100000000794000__000000931B9A2710 000000067F00004002000100000000794000-000000067F00004002000100000000798000__00000073AD3FE6B8 000000067F00004002000100000000794000-000000067F00004002000100000000798000__000000914E3F38F0 000000067F00004002000100000000794000-000000067F00004002000100000000798000__000000931B9A2710 000000067F0000400200010000000079439A-000000067F0000400200010000000079CD75__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000798000-000000067F0000400200010000000079C000__00000073AD3FE6B8 000000067F00004002000100000000798000-000000067F0000400200010000000079C000__000000914E3F38F0 000000067F00004002000100000000798000-000000067F0000400200010000000079C000__000000931B9A2710 000000067F0000400200010000000079C000-000000067F000040020001000000007A0000__00000073AD3FE6B8 
000000067F0000400200010000000079C000-000000067F000040020001000000007A0000__000000914E3F38F0 000000067F0000400200010000000079C000-000000067F000040020001000000007A0000__000000931B9A2710 000000067F0000400200010000000079CD75-000000067F000040020001000000007A5758__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000007A0000-000000067F000040020001000000007A4000__00000073AD3FE6B8 000000067F000040020001000000007A0000-000000067F000040020001000000007A4000__000000914E3F38F0 000000067F000040020001000000007A0000-000000067F000040020001000000007A4000__000000931B9A2710 000000067F000040020001000000007A4000-000000067F000040020001000000007A8000__00000073AD3FE6B8 000000067F000040020001000000007A4000-000000067F000040020001000000007A8000__000000914E3F38F0 000000067F000040020001000000007A4000-000000067F000040020001000000007A8000__000000931B9A2710 000000067F000040020001000000007A5758-000000067F000040020001000000007AE12F__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000007A8000-000000067F000040020001000000007AC000__00000073AD3FE6B8 000000067F000040020001000000007A8000-000000067F000040020001000000007AC000__000000914E3F38F0 000000067F000040020001000000007A8000-000000067F000040020001000000007AC000__000000931B9A2710 000000067F000040020001000000007AC000-000000067F000040020001000000007B0000__00000073AD3FE6B8 000000067F000040020001000000007AC000-000000067F000040020001000000007B0000__000000914E3F38F0 000000067F000040020001000000007AC000-000000067F000040020001000000007B0000__000000931B9A2710 000000067F000040020001000000007AE12F-000000067F000040020001000000007B6B09__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000007B0000-000000067F000040020001000000007B4000__00000073AD3FE6B8 000000067F000040020001000000007B0000-000000067F000040020001000000007B4000__000000914E3F38F0 000000067F000040020001000000007B0000-000000067F000040020001000000007B4000__000000931B9A2710 000000067F000040020001000000007B4000-000000067F000040020001000000007B8000__00000073AD3FE6B8 
000000067F000040020001000000007B4000-000000067F000040020001000000007B8000__000000914E3F38F0 000000067F000040020001000000007B4000-000000067F000040020001000000007B8000__000000931B9A2710 000000067F000040020001000000007B6B09-000000067F000040020001000000007BF4E1__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000007B8000-000000067F000040020001000000007BC000__00000073AD3FE6B8 000000067F000040020001000000007B8000-000000067F000040020001000000007BC000__000000914E3F38F0 000000067F000040020001000000007B8000-000000067F000040020001000000007BC000__000000931B9A2710 000000067F000040020001000000007BC000-000000067F000040020001000000007C0000__00000073AD3FE6B8 000000067F000040020001000000007BC000-000000067F000040020001000000007C0000__000000914E3F38F0 000000067F000040020001000000007BC000-000000067F000040020001000000007C0000__000000931B9A2710 000000067F000040020001000000007BF4E1-000000067F000040020001000000007C7EBE__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000007C0000-000000067F000040020001000000007C4000__00000073AD3FE6B8 000000067F000040020001000000007C0000-000000067F000040020001000000007C4000__000000914E3F38F0 000000067F000040020001000000007C0000-000000067F000040020001000000007C4000__000000931B9A2710 000000067F000040020001000000007C4000-000000067F000040020001000000007C8000__00000073AD3FE6B8 000000067F000040020001000000007C4000-000000067F000040020001000000007C8000__000000914E3F38F0 000000067F000040020001000000007C4000-000000067F000040020001000000007C8000__000000931B9A2710 000000067F000040020001000000007C7EBE-000000067F000040020001000000007D0891__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000007C8000-000000067F000040020001000000007CC000__00000073AD3FE6B8 000000067F000040020001000000007C8000-000000067F000040020001000000007CC000__000000914E3F38F0 000000067F000040020001000000007C8000-000000067F000040020001000000007CC000__000000931B9A2710 000000067F000040020001000000007CC000-000000067F000040020001000000007D0000__00000073AD3FE6B8 
000000067F000040020001000000007CC000-000000067F000040020001000000007D0000__000000914E3F38F0 000000067F000040020001000000007CC000-000000067F000040020001000000007D0000__000000931B9A2710 000000067F000040020001000000007D0000-000000067F000040020001000000007D4000__00000073AD3FE6B8 000000067F000040020001000000007D0000-000000067F000040020001000000007D4000__000000914E3F38F0 000000067F000040020001000000007D0000-000000067F000040020001000000007D4000__000000931B9A2710 000000067F000040020001000000007D0891-000000067F000040020001000000007D926D__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000007D4000-000000067F000040020001000000007D8000__00000073AD3FE6B8 000000067F000040020001000000007D4000-000000067F000040020001000000007D8000__000000914E3F38F0 000000067F000040020001000000007D4000-000000067F000040020001000000007D8000__000000931B9A2710 000000067F000040020001000000007D8000-000000067F000040020001000000007DC000__00000073AD3FE6B8 000000067F000040020001000000007D8000-000000067F000040020001000000007DC000__000000914E3F38F0 000000067F000040020001000000007D8000-000000067F000040020001000000007DC000__000000931B9A2710 000000067F000040020001000000007D926D-000000067F000040020001000000007E1C45__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000007DC000-000000067F000040020001000000007E0000__00000073AD3FE6B8 000000067F000040020001000000007DC000-000000067F000040020001000000007E0000__000000914E3F38F0 000000067F000040020001000000007DC000-000000067F000040020001000000007E0000__000000931B9A2710 000000067F000040020001000000007E0000-000000067F000040020001000000007E4000__00000073AD3FE6B8 000000067F000040020001000000007E0000-000000067F000040020001000000007E4000__000000914E3F38F0 000000067F000040020001000000007E0000-000000067F000040020001000000007E4000__000000931B9A2710 000000067F000040020001000000007E1C45-000000067F000040020001000000007EA622__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000007E4000-000000067F000040020001000000007E8000__00000073AD3FE6B8 
000000067F000040020001000000007E4000-000000067F000040020001000000007E8000__000000914E3F38F0 000000067F000040020001000000007E4000-000000067F000040020001000000007E8000__000000931B9A2710 000000067F000040020001000000007E8000-000000067F000040020001000000007EC000__00000073AD3FE6B8 000000067F000040020001000000007E8000-000000067F000040020001000000007EC000__000000914E3F38F0 000000067F000040020001000000007E8000-000000067F000040020001000000007EC000__000000931B9A2710 000000067F000040020001000000007EA622-000000067F000040020001000000007F2FFC__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000007EC000-000000067F000040020001000000007F0000__00000073AD3FE6B8 000000067F000040020001000000007EC000-000000067F000040020001000000007F0000__000000914E3F38F0 000000067F000040020001000000007EC000-000000067F000040020001000000007F0000__000000931B9A2710 000000067F000040020001000000007F0000-000000067F000040020001000000007F4000__00000073AD3FE6B8 000000067F000040020001000000007F0000-000000067F000040020001000000007F4000__000000914E3F38F0 000000067F000040020001000000007F0000-000000067F000040020001000000007F4000__000000931B9A2710 000000067F000040020001000000007F2FFC-000000067F000040020001000000007FB9E5__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000007F4000-000000067F000040020001000000007F8000__00000073AD3FE6B8 000000067F000040020001000000007F4000-000000067F000040020001000000007F8000__000000914E3F38F0 000000067F000040020001000000007F4000-000000067F000040020001000000007F8000__000000931B9A2710 000000067F000040020001000000007F8000-000000067F000040020001000000007FC000__00000073AD3FE6B8 000000067F000040020001000000007F8000-000000067F000040020001000000007FC000__000000914E3F38F0 000000067F000040020001000000007F8000-000000067F000040020001000000007FC000__000000931B9A2710 000000067F000040020001000000007FB9E5-000000067F000040020001000000008043C6__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000007FC000-000000067F00004002000100000000800000__00000073AD3FE6B8 
000000067F000040020001000000007FC000-000000067F00004002000100000000800000__000000914E3F38F0 000000067F000040020001000000007FC000-000000067F00004002000100000000800000__000000931B9A2710 000000067F00004002000100000000800000-000000067F00004002000100000000804000__00000073AD3FE6B8 000000067F00004002000100000000800000-000000067F00004002000100000000804000__000000914E3F38F0 000000067F00004002000100000000800000-000000067F00004002000100000000804000__000000931B9A2710 000000067F00004002000100000000804000-000000067F00004002000100000000808000__00000073AD3FE6B8 000000067F00004002000100000000804000-000000067F00004002000100000000808000__000000914E3F38F0 000000067F00004002000100000000804000-000000067F00004002000100000000808000__000000931B9A2710 000000067F000040020001000000008043C6-000000067F0000400200010000000080CD9F__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000808000-000000067F0000400200010000000080C000__00000073AD3FE6B8 000000067F00004002000100000000808000-000000067F0000400200010000000080C000__000000914E3F38F0 000000067F00004002000100000000808000-000000067F0000400200010000000080C000__000000931B9A2710 000000067F0000400200010000000080C000-000000067F00004002000100000000810000__00000073AD3FE6B8 000000067F0000400200010000000080C000-000000067F00004002000100000000810000__000000914E3F38F0 000000067F0000400200010000000080C000-000000067F00004002000100000000810000__000000931B9A2710 000000067F0000400200010000000080CD9F-000000067F00004002000100000000815785__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000810000-000000067F00004002000100000000814000__00000073AD3FE6B8 000000067F00004002000100000000810000-000000067F00004002000100000000814000__000000914E3F38F0 000000067F00004002000100000000810000-000000067F00004002000100000000814000__000000931B9A2710 000000067F00004002000100000000814000-000000067F00004002000100000000818000__00000073AD3FE6B8 000000067F00004002000100000000814000-000000067F00004002000100000000818000__000000914E3F38F0 
000000067F00004002000100000000814000-000000067F00004002000100000000818000__000000931B9A2710 000000067F00004002000100000000815785-000000067F0000400200010000000081E161__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000818000-000000067F0000400200010000000081C000__00000073AD3FE6B8 000000067F00004002000100000000818000-000000067F0000400200010000000081C000__000000914E3F38F0 000000067F00004002000100000000818000-000000067F0000400200010000000081C000__000000931B9A2710 000000067F0000400200010000000081C000-000000067F00004002000100000000820000__00000073AD3FE6B8 000000067F0000400200010000000081C000-000000067F00004002000100000000820000__000000914E3F38F0 000000067F0000400200010000000081C000-000000067F00004002000100000000820000__000000931B9A2710 000000067F0000400200010000000081E161-000000067F00004002000100000000826B3A__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000820000-000000067F00004002000100000000824000__00000073AD3FE6B8 000000067F00004002000100000000820000-000000067F00004002000100000000824000__000000914E3F38F0 000000067F00004002000100000000820000-000000067F00004002000100000000824000__000000931B9A2710 000000067F00004002000100000000824000-000000067F00004002000100000000828000__00000073AD3FE6B8 000000067F00004002000100000000824000-000000067F00004002000100000000828000__000000914E3F38F0 000000067F00004002000100000000824000-000000067F00004002000100000000828000__000000931B9A2710 000000067F00004002000100000000826B3A-000000067F0000400200010000000082F516__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000828000-000000067F0000400200010000000082C000__00000073AD3FE6B8 000000067F00004002000100000000828000-000000067F0000400200010000000082C000__000000914E3F38F0 000000067F00004002000100000000828000-000000067F0000400200010000000082C000__000000931B9A2710 000000067F0000400200010000000082C000-000000067F00004002000100000000830000__00000073AD3FE6B8 000000067F0000400200010000000082C000-000000067F00004002000100000000830000__000000914E3F38F0 
000000067F0000400200010000000082C000-000000067F00004002000100000000830000__000000931B9A2710 000000067F0000400200010000000082F516-000000067F00004002000100000000837EF5__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000830000-000000067F00004002000100000000834000__00000073AD3FE6B8 000000067F00004002000100000000830000-000000067F00004002000100000000834000__000000914E3F38F0 000000067F00004002000100000000830000-000000067F00004002000100000000834000__000000931B9A2710 000000067F00004002000100000000834000-000000067F00004002000100000000838000__00000073AD3FE6B8 000000067F00004002000100000000834000-000000067F00004002000100000000838000__000000914E3F38F0 000000067F00004002000100000000834000-000000067F00004002000100000000838000__000000931B9A2710 000000067F00004002000100000000837EF5-000000067F000040020001000000008408D5__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000838000-000000067F0000400200010000000083C000__00000073AD3FE6B8 000000067F00004002000100000000838000-000000067F0000400200010000000083C000__000000914E3F38F0 000000067F00004002000100000000838000-000000067F0000400200010000000083C000__000000931B9A2710 000000067F0000400200010000000083C000-000000067F00004002000100000000840000__00000073AD3FE6B8 000000067F0000400200010000000083C000-000000067F00004002000100000000840000__000000914E3F38F0 000000067F0000400200010000000083C000-000000067F00004002000100000000840000__000000931B9A2710 000000067F00004002000100000000840000-000000067F00004002000100000000844000__00000073AD3FE6B8 000000067F00004002000100000000840000-000000067F00004002000100000000844000__000000914E3F38F0 000000067F00004002000100000000840000-000000067F00004002000100000000844000__000000931B9A2710 000000067F000040020001000000008408D5-000000067F000040020001000000008492B9__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000844000-000000067F00004002000100000000848000__00000073AD3FE6B8 000000067F00004002000100000000844000-000000067F00004002000100000000848000__000000914E3F38F0 
000000067F00004002000100000000844000-000000067F00004002000100000000848000__000000931B9A2710 000000067F00004002000100000000848000-000000067F0000400200010000000084C000__00000073AD3FE6B8 000000067F00004002000100000000848000-000000067F0000400200010000000084C000__000000914E3F38F0 000000067F00004002000100000000848000-000000067F0000400200010000000084C000__000000931B9A2710 000000067F000040020001000000008492B9-000000067F00004002000100000000851C91__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000084C000-000000067F00004002000100000000850000__00000073AD3FE6B8 000000067F0000400200010000000084C000-000000067F00004002000100000000850000__000000914E3F38F0 000000067F0000400200010000000084C000-000000067F00004002000100000000850000__000000931B9A2710 000000067F00004002000100000000850000-000000067F00004002000100000000854000__00000073AD3FE6B8 000000067F00004002000100000000850000-000000067F00004002000100000000854000__000000914E3F38F0 000000067F00004002000100000000850000-000000067F00004002000100000000854000__000000931B9A2710 000000067F00004002000100000000851C91-000000067F0000400200010000000085A67F__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000854000-000000067F00004002000100000000858000__00000073AD3FE6B8 000000067F00004002000100000000854000-000000067F00004002000100000000858000__000000914E3F38F0 000000067F00004002000100000000854000-000000067F00004002000100000000858000__000000931B9A2710 000000067F00004002000100000000858000-000000067F0000400200010000000085C000__00000073AD3FE6B8 000000067F00004002000100000000858000-000000067F0000400200010000000085C000__000000914E3F38F0 000000067F00004002000100000000858000-000000067F0000400200010000000085C000__000000931B9A2710 000000067F0000400200010000000085A67F-000000067F00004002000100000000863061__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000085C000-000000067F00004002000100000000860000__00000073AD3FE6B8 000000067F0000400200010000000085C000-000000067F00004002000100000000860000__000000914E3F38F0 
000000067F0000400200010000000085C000-000000067F00004002000100000000860000__000000931B9A2710 000000067F00004002000100000000860000-000000067F00004002000100000000864000__00000073AD3FE6B8 000000067F00004002000100000000860000-000000067F00004002000100000000864000__000000914E3F38F0 000000067F00004002000100000000860000-000000067F00004002000100000000864000__000000931B9A2710 000000067F00004002000100000000863061-000000067F0000400200010000000086BA3E__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000864000-000000067F00004002000100000000868000__00000073AD3FE6B8 000000067F00004002000100000000864000-000000067F00004002000100000000868000__000000914E3F38F0 000000067F00004002000100000000864000-000000067F00004002000100000000868000__000000931B9A2710 000000067F00004002000100000000868000-000000067F0000400200010000000086C000__00000073AD3FE6B8 000000067F00004002000100000000868000-000000067F0000400200010000000086C000__000000914E3F38F0 000000067F00004002000100000000868000-000000067F0000400200010000000086C000__000000931B9A2710 000000067F0000400200010000000086BA3E-000000067F0000400200010000000087440C__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000086C000-000000067F00004002000100000000870000__00000073AD3FE6B8 000000067F0000400200010000000086C000-000000067F00004002000100000000870000__000000914E3F38F0 000000067F0000400200010000000086C000-000000067F00004002000100000000870000__000000931B9A2710 000000067F00004002000100000000870000-000000067F00004002000100000000874000__00000073AD3FE6B8 000000067F00004002000100000000870000-000000067F00004002000100000000874000__000000914E3F38F0 000000067F00004002000100000000870000-000000067F00004002000100000000874000__000000931B9A2710 000000067F00004002000100000000874000-000000067F00004002000100000000878000__00000073AD3FE6B8 000000067F00004002000100000000874000-000000067F00004002000100000000878000__000000914E3F38F0 000000067F00004002000100000000874000-000000067F00004002000100000000878000__000000931B9A2710 
000000067F0000400200010000000087440C-000000067F0000400200010000000087CDE0__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000878000-000000067F0000400200010000000087C000__00000073AD3FE6B8 000000067F00004002000100000000878000-000000067F0000400200010000000087C000__000000914E3F38F0 000000067F00004002000100000000878000-000000067F0000400200010000000087C000__000000931B9A2710 000000067F0000400200010000000087C000-000000067F00004002000100000000880000__00000073AD3FE6B8 000000067F0000400200010000000087C000-000000067F00004002000100000000880000__000000914E3F38F0 000000067F0000400200010000000087C000-000000067F00004002000100000000880000__000000931B9A2710 000000067F0000400200010000000087CDE0-000000067F000040020001000000008857BF__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000880000-000000067F00004002000100000000884000__00000073AD3FE6B8 000000067F00004002000100000000880000-000000067F00004002000100000000884000__000000914E3F38F0 000000067F00004002000100000000880000-000000067F00004002000100000000884000__000000931B9A2710 000000067F00004002000100000000884000-000000067F00004002000100000000888000__00000073AD3FE6B8 000000067F00004002000100000000884000-000000067F00004002000100000000888000__000000914E3F38F0 000000067F00004002000100000000884000-000000067F00004002000100000000888000__000000931B9A2710 000000067F000040020001000000008857BF-000000067F0000400200010000000088E19E__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000888000-000000067F0000400200010000000088C000__00000073AD3FE6B8 000000067F00004002000100000000888000-000000067F0000400200010000000088C000__000000914E3F38F0 000000067F00004002000100000000888000-000000067F0000400200010000000088C000__000000931B9A2710 000000067F0000400200010000000088C000-000000067F00004002000100000000890000__00000073AD3FE6B8 000000067F0000400200010000000088C000-000000067F00004002000100000000890000__000000914E3F38F0 000000067F0000400200010000000088C000-000000067F00004002000100000000890000__000000931B9A2710 
000000067F0000400200010000000088E19E-000000067F00004002000100000000896B7C__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000890000-000000067F00004002000100000000894000__00000073AD3FE6B8 000000067F00004002000100000000890000-000000067F00004002000100000000894000__000000914E3F38F0 000000067F00004002000100000000890000-000000067F00004002000100000000894000__000000931B9A2710 000000067F00004002000100000000894000-000000067F00004002000100000000898000__00000073AD3FE6B8 000000067F00004002000100000000894000-000000067F00004002000100000000898000__000000914E3F38F0 000000067F00004002000100000000894000-000000067F00004002000100000000898000__000000931B9A2710 000000067F00004002000100000000896B7C-000000067F0000400200010000000089F566__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000898000-000000067F0000400200010000000089C000__00000073AD3FE6B8 000000067F00004002000100000000898000-000000067F0000400200010000000089C000__000000914E3F38F0 000000067F00004002000100000000898000-000000067F0000400200010000000089C000__000000931B9A2710 000000067F0000400200010000000089C000-000000067F000040020001000000008A0000__00000073AD3FE6B8 000000067F0000400200010000000089C000-000000067F000040020001000000008A0000__000000914E3F38F0 000000067F0000400200010000000089C000-000000067F000040020001000000008A0000__000000931B9A2710 000000067F0000400200010000000089F566-000000067F000040020001000000008A7F45__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000008A0000-000000067F000040020001000000008A4000__00000073AD3FE6B8 000000067F000040020001000000008A0000-000000067F000040020001000000008A4000__000000914E3F38F0 000000067F000040020001000000008A0000-000000067F000040020001000000008A4000__000000931B9A2710 000000067F000040020001000000008A4000-000000067F000040020001000000008A8000__00000073AD3FE6B8 000000067F000040020001000000008A4000-000000067F000040020001000000008A8000__000000914E3F38F0 000000067F000040020001000000008A4000-000000067F000040020001000000008A8000__000000931B9A2710 
000000067F000040020001000000008A7F45-000000067F000040020001000000008B0918__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000008A8000-000000067F000040020001000000008AC000__00000073AD3FE6B8 000000067F000040020001000000008A8000-000000067F000040020001000000008AC000__000000914E3F38F0 000000067F000040020001000000008A8000-000000067F000040020001000000008AC000__000000931B9A2710 000000067F000040020001000000008AC000-000000067F000040020001000000008B0000__00000073AD3FE6B8 000000067F000040020001000000008AC000-000000067F000040020001000000008B0000__000000914E3F38F0 000000067F000040020001000000008AC000-000000067F000040020001000000008B0000__000000931B9A2710 000000067F000040020001000000008B0000-000000067F000040020001000000008B4000__00000073AD3FE6B8 000000067F000040020001000000008B0000-000000067F000040020001000000008B4000__000000914E3F38F0 000000067F000040020001000000008B0000-000000067F000040020001000000008B4000__000000931B9A2710 000000067F000040020001000000008B0918-000000067F000040020001000000008B92F6__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000008B4000-000000067F000040020001000000008B8000__00000073AD3FE6B8 000000067F000040020001000000008B4000-000000067F000040020001000000008B8000__000000914E3F38F0 000000067F000040020001000000008B4000-000000067F000040020001000000008B8000__000000931B9A2710 000000067F000040020001000000008B8000-000000067F000040020001000000008BC000__00000073AD3FE6B8 000000067F000040020001000000008B8000-000000067F000040020001000000008BC000__000000914E3F38F0 000000067F000040020001000000008B8000-000000067F000040020001000000008BC000__000000931B9A2710 000000067F000040020001000000008B92F6-000000067F000040020001000000008C1CD8__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000008BC000-000000067F000040020001000000008C0000__00000073AD3FE6B8 000000067F000040020001000000008BC000-000000067F000040020001000000008C0000__000000914E3F38F0 000000067F000040020001000000008BC000-000000067F000040020001000000008C0000__000000931B9A2710 
000000067F000040020001000000008C0000-000000067F000040020001000000008C4000__00000073AD3FE6B8 000000067F000040020001000000008C0000-000000067F000040020001000000008C4000__000000914E3F38F0 000000067F000040020001000000008C0000-000000067F000040020001000000008C4000__000000931B9A2710 000000067F000040020001000000008C1CD8-000000067F000040020001000000008CA6C0__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000008C4000-000000067F000040020001000000008C8000__00000073AD3FE6B8 000000067F000040020001000000008C4000-000000067F000040020001000000008C8000__000000914E3F38F0 000000067F000040020001000000008C4000-000000067F000040020001000000008C8000__000000931B9A2710 000000067F000040020001000000008C8000-000000067F000040020001000000008CC000__00000073AD3FE6B8 000000067F000040020001000000008C8000-000000067F000040020001000000008CC000__000000914E3F38F0 000000067F000040020001000000008C8000-000000067F000040020001000000008CC000__000000931B9A2710 000000067F000040020001000000008CA6C0-000000067F000040020001000000008D30A3__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000008CC000-000000067F000040020001000000008D0000__00000073AD3FE6B8 000000067F000040020001000000008CC000-000000067F000040020001000000008D0000__000000914E3F38F0 000000067F000040020001000000008CC000-000000067F000040020001000000008D0000__000000931B9A2710 000000067F000040020001000000008D0000-000000067F000040020001000000008D4000__00000073AD3FE6B8 000000067F000040020001000000008D0000-000000067F000040020001000000008D4000__000000914E3F38F0 000000067F000040020001000000008D0000-000000067F000040020001000000008D4000__000000931B9A2710 000000067F000040020001000000008D30A3-000000067F000040020001000000008DBA92__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000008D4000-000000067F000040020001000000008D8000__00000073AD3FE6B8 000000067F000040020001000000008D4000-000000067F000040020001000000008D8000__000000914E3F38F0 000000067F000040020001000000008D4000-000000067F000040020001000000008D8000__000000931B9A2710 
000000067F000040020001000000008D8000-000000067F000040020001000000008DC000__00000073AD3FE6B8 000000067F000040020001000000008D8000-000000067F000040020001000000008DC000__000000914E3F38F0 000000067F000040020001000000008D8000-000000067F000040020001000000008DC000__000000931B9A2710 000000067F000040020001000000008DBA92-000000067F000040020001000000008E4465__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000008DC000-000000067F000040020001000000008E0000__00000073AD3FE6B8 000000067F000040020001000000008DC000-000000067F000040020001000000008E0000__000000914E3F38F0 000000067F000040020001000000008DC000-000000067F000040020001000000008E0000__000000931B9A2710 000000067F000040020001000000008E0000-000000067F000040020001000000008E4000__00000073AD3FE6B8 000000067F000040020001000000008E0000-000000067F000040020001000000008E4000__000000914E3F38F0 000000067F000040020001000000008E0000-000000067F000040020001000000008E4000__000000931B9A2710 000000067F000040020001000000008E4000-000000067F000040020001000000008E8000__00000073AD3FE6B8 000000067F000040020001000000008E4000-000000067F000040020001000000008E8000__000000914E3F38F0 000000067F000040020001000000008E4000-000000067F000040020001000000008E8000__000000931B9A2710 000000067F000040020001000000008E4465-000000067F000040020001000000008ECE3E__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000008E8000-000000067F000040020001000000008EC000__00000073AD3FE6B8 000000067F000040020001000000008E8000-000000067F000040020001000000008EC000__000000914E3F38F0 000000067F000040020001000000008E8000-000000067F000040020001000000008EC000__000000931B9A2710 000000067F000040020001000000008EC000-000000067F000040020001000000008F0000__00000073AD3FE6B8 000000067F000040020001000000008EC000-000000067F000040020001000000008F0000__000000914E3F38F0 000000067F000040020001000000008EC000-000000067F000040020001000000008F0000__000000931B9A2710 000000067F000040020001000000008ECE3E-000000067F000040020001000000008F5814__0000005CA7BBD6F9-000000739A8D1299 
000000067F000040020001000000008F0000-000000067F000040020001000000008F4000__00000073AD3FE6B8 000000067F000040020001000000008F0000-000000067F000040020001000000008F4000__000000914E3F38F0 000000067F000040020001000000008F0000-000000067F000040020001000000008F4000__000000931B9A2710 000000067F000040020001000000008F4000-000000067F000040020001000000008F8000__00000073AD3FE6B8 000000067F000040020001000000008F4000-000000067F000040020001000000008F8000__000000914E3F38F0 000000067F000040020001000000008F4000-000000067F000040020001000000008F8000__000000931B9A2710 000000067F000040020001000000008F5814-000000067F000040020001000000008FE1EC__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000008F8000-000000067F000040020001000000008FC000__00000073AD3FE6B8 000000067F000040020001000000008F8000-000000067F000040020001000000008FC000__000000914E3F38F0 000000067F000040020001000000008F8000-000000067F000040020001000000008FC000__000000931B9A2710 000000067F000040020001000000008FC000-000000067F00004002000100000000900000__00000073AD3FE6B8 000000067F000040020001000000008FC000-000000067F00004002000100000000900000__000000914E3F38F0 000000067F000040020001000000008FC000-000000067F00004002000100000000900000__000000931B9A2710 000000067F000040020001000000008FE1EC-000000067F00004002000100000000906BDF__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000900000-000000067F00004002000100000000904000__00000073AD3FE6B8 000000067F00004002000100000000900000-000000067F00004002000100000000904000__000000914E3F38F0 000000067F00004002000100000000900000-000000067F00004002000100000000904000__000000931B9A2710 000000067F00004002000100000000904000-000000067F00004002000100000000908000__00000073AD3FE6B8 000000067F00004002000100000000904000-000000067F00004002000100000000908000__000000914E3F38F0 000000067F00004002000100000000904000-000000067F00004002000100000000908000__000000931B9A2710 000000067F00004002000100000000906BDF-000000067F0000400200010000000090F5CA__0000005CA7BBD6F9-000000739A8D1299 
000000067F00004002000100000000908000-000000067F0000400200010000000090C000__00000073AD3FE6B8 000000067F00004002000100000000908000-000000067F0000400200010000000090C000__000000914E3F38F0 000000067F00004002000100000000908000-000000067F0000400200010000000090C000__000000931B9A2710 000000067F0000400200010000000090C000-000000067F00004002000100000000910000__00000073AD3FE6B8 000000067F0000400200010000000090C000-000000067F00004002000100000000910000__000000914E3F38F0 000000067F0000400200010000000090C000-000000067F00004002000100000000910000__000000931B9A2710 000000067F0000400200010000000090F5CA-000000067F00004002000100000000917FAA__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000910000-000000067F00004002000100000000914000__00000073AD3FE6B8 000000067F00004002000100000000910000-000000067F00004002000100000000914000__000000914E3F38F0 000000067F00004002000100000000910000-000000067F00004002000100000000914000__000000931B9A2710 000000067F00004002000100000000914000-000000067F00004002000100000000918000__00000073AD3FE6B8 000000067F00004002000100000000914000-000000067F00004002000100000000918000__000000914E3F38F0 000000067F00004002000100000000914000-000000067F00004002000100000000918000__000000931B9A2710 000000067F00004002000100000000917FAA-000000067F0000400200010000000092097C__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000918000-000000067F0000400200010000000091C000__00000073AD3FE6B8 000000067F00004002000100000000918000-000000067F0000400200010000000091C000__000000914E3F38F0 000000067F00004002000100000000918000-000000067F0000400200010000000091C000__000000931B9A2710 000000067F0000400200010000000091C000-000000067F00004002000100000000920000__00000073AD3FE6B8 000000067F0000400200010000000091C000-000000067F00004002000100000000920000__000000914E3F38F0 000000067F0000400200010000000091C000-000000067F00004002000100000000920000__000000931B9A2710 000000067F00004002000100000000920000-000000067F00004002000100000000924000__00000073AD3FE6B8 
000000067F00004002000100000000920000-000000067F00004002000100000000924000__000000914E3F38F0 000000067F00004002000100000000920000-000000067F00004002000100000000924000__000000931B9A2710 000000067F0000400200010000000092097C-000000067F0000400200010000000092935B__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000924000-000000067F00004002000100000000928000__00000073AD3FE6B8 000000067F00004002000100000000924000-000000067F00004002000100000000928000__000000914E3F38F0 000000067F00004002000100000000924000-000000067F00004002000100000000928000__000000931B9A2710 000000067F00004002000100000000928000-000000067F0000400200010000000092C000__00000073AD3FE6B8 000000067F00004002000100000000928000-000000067F0000400200010000000092C000__000000914E3F38F0 000000067F00004002000100000000928000-000000067F0000400200010000000092C000__000000931B9A2710 000000067F0000400200010000000092935B-000000067F00004002000100000000931D2F__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000092C000-000000067F00004002000100000000930000__00000073AD3FE6B8 000000067F0000400200010000000092C000-000000067F00004002000100000000930000__000000914E3F38F0 000000067F0000400200010000000092C000-000000067F00004002000100000000930000__000000931B9A2710 000000067F00004002000100000000930000-000000067F00004002000100000000934000__00000073AD3FE6B8 000000067F00004002000100000000930000-000000067F00004002000100000000934000__000000914E3F38F0 000000067F00004002000100000000930000-000000067F00004002000100000000934000__000000931B9A2710 000000067F00004002000100000000931D2F-000000067F0000400200010000000093A709__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000934000-000000067F00004002000100000000938000__00000073AD3FE6B8 000000067F00004002000100000000934000-000000067F00004002000100000000938000__000000914E3F38F0 000000067F00004002000100000000934000-000000067F00004002000100000000938000__000000931B9A2710 000000067F00004002000100000000938000-000000067F0000400200010000000093C000__00000073AD3FE6B8 
000000067F00004002000100000000938000-000000067F0000400200010000000093C000__000000914E3F38F0 000000067F00004002000100000000938000-000000067F0000400200010000000093C000__000000931B9A2710 000000067F0000400200010000000093A709-000000067F000040020001000000009430E7__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000093C000-000000067F00004002000100000000940000__00000073AD3FE6B8 000000067F0000400200010000000093C000-000000067F00004002000100000000940000__000000914E3F38F0 000000067F0000400200010000000093C000-000000067F00004002000100000000940000__000000931B9A2710 000000067F00004002000100000000940000-000000067F00004002000100000000944000__00000073AD3FE6B8 000000067F00004002000100000000940000-000000067F00004002000100000000944000__000000914E3F38F0 000000067F00004002000100000000940000-000000067F00004002000100000000944000__000000931B9A2710 000000067F000040020001000000009430E7-000000067F0000400200010000000094BAD0__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000944000-000000067F00004002000100000000948000__00000073AD3FE6B8 000000067F00004002000100000000944000-000000067F00004002000100000000948000__000000914E3F38F0 000000067F00004002000100000000944000-000000067F00004002000100000000948000__000000931B9A2710 000000067F00004002000100000000948000-000000067F0000400200010000000094C000__00000073AD3FE6B8 000000067F00004002000100000000948000-000000067F0000400200010000000094C000__000000914E3F38F0 000000067F00004002000100000000948000-000000067F0000400200010000000094C000__000000931B9A2710 000000067F0000400200010000000094BAD0-000000067F000040020001000000009544BD__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000094C000-000000067F00004002000100000000950000__00000073AD3FE6B8 000000067F0000400200010000000094C000-000000067F00004002000100000000950000__000000914E3F38F0 000000067F0000400200010000000094C000-000000067F00004002000100000000950000__000000931B9A2710 000000067F00004002000100000000950000-000000067F00004002000100000000954000__00000073AD3FE6B8 
000000067F00004002000100000000950000-000000067F00004002000100000000954000__000000914E3F38F0 000000067F00004002000100000000950000-000000067F00004002000100000000954000__000000931B9A2710 000000067F00004002000100000000954000-000000067F00004002000100000000958000__00000073AD3FE6B8 000000067F00004002000100000000954000-000000067F00004002000100000000958000__000000914E3F38F0 000000067F00004002000100000000954000-000000067F00004002000100000000958000__000000931B9A2710 000000067F000040020001000000009544BD-000000067F0000400200010000000095CE95__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000958000-000000067F0000400200010000000095C000__00000073AD3FE6B8 000000067F00004002000100000000958000-000000067F0000400200010000000095C000__000000914E3F38F0 000000067F00004002000100000000958000-000000067F0000400200010000000095C000__000000931B9A2710 000000067F0000400200010000000095C000-000000067F00004002000100000000960000__00000073AD3FE6B8 000000067F0000400200010000000095C000-000000067F00004002000100000000960000__000000914E3F38F0 000000067F0000400200010000000095C000-000000067F00004002000100000000960000__000000931B9A2710 000000067F0000400200010000000095CE95-000000067F0000400200010000000096586F__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000960000-000000067F00004002000100000000964000__00000073AD3FE6B8 000000067F00004002000100000000960000-000000067F00004002000100000000964000__000000914E3F38F0 000000067F00004002000100000000960000-000000067F00004002000100000000964000__000000931B9A2710 000000067F00004002000100000000964000-000000067F00004002000100000000968000__00000073AD3FE6B8 000000067F00004002000100000000964000-000000067F00004002000100000000968000__000000914E3F38F0 000000067F00004002000100000000964000-000000067F00004002000100000000968000__000000931B9A2710 000000067F0000400200010000000096586F-000000067F0000400200010000000096E247__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000968000-000000067F0000400200010000000096C000__00000073AD3FE6B8 
000000067F00004002000100000000968000-000000067F0000400200010000000096C000__000000914E3F38F0 000000067F00004002000100000000968000-000000067F0000400200010000000096C000__000000931B9A2710 000000067F0000400200010000000096C000-000000067F00004002000100000000970000__00000073AD3FE6B8 000000067F0000400200010000000096C000-000000067F00004002000100000000970000__000000914E3F38F0 000000067F0000400200010000000096C000-000000067F00004002000100000000970000__000000931B9A2710 000000067F0000400200010000000096E247-000000067F00004002000100000000976C0F__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000970000-000000067F00004002000100000000974000__00000073AD3FE6B8 000000067F00004002000100000000970000-000000067F00004002000100000000974000__000000914E3F38F0 000000067F00004002000100000000970000-000000067F00004002000100000000974000__000000931B9A2710 000000067F00004002000100000000974000-000000067F00004002000100000000978000__00000073AD3FE6B8 000000067F00004002000100000000974000-000000067F00004002000100000000978000__000000914E3F38F0 000000067F00004002000100000000974000-000000067F00004002000100000000978000__000000931B9A2710 000000067F00004002000100000000976C0F-000000067F0000400200010000000097F5F4__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000978000-000000067F0000400200010000000097C000__00000073AD3FE6B8 000000067F00004002000100000000978000-000000067F0000400200010000000097C000__000000914E3F38F0 000000067F00004002000100000000978000-000000067F0000400200010000000097C000__000000931B9A2710 000000067F0000400200010000000097C000-000000067F00004002000100000000980000__00000073AD3FE6B8 000000067F0000400200010000000097C000-000000067F00004002000100000000980000__000000914E3F38F0 000000067F0000400200010000000097C000-000000067F00004002000100000000980000__000000931B9A2710 000000067F0000400200010000000097F5F4-000000067F00004002000100000000987FD8__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000980000-000000067F00004002000100000000984000__00000073AD3FE6B8 
000000067F00004002000100000000980000-000000067F00004002000100000000984000__000000914E3F38F0 000000067F00004002000100000000980000-000000067F00004002000100000000984000__000000931B9A2710 000000067F00004002000100000000984000-000000067F00004002000100000000988000__00000073AD3FE6B8 000000067F00004002000100000000984000-000000067F00004002000100000000988000__000000914E3F38F0 000000067F00004002000100000000984000-000000067F00004002000100000000988000__000000931B9A2710 000000067F00004002000100000000987FD8-000000067F000040020001000000009909C2__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000988000-000000067F0000400200010000000098C000__00000073AD3FE6B8 000000067F00004002000100000000988000-000000067F0000400200010000000098C000__000000914E3F38F0 000000067F00004002000100000000988000-000000067F0000400200010000000098C000__000000931B9A2710 000000067F0000400200010000000098C000-000000067F00004002000100000000990000__00000073AD3FE6B8 000000067F0000400200010000000098C000-000000067F00004002000100000000990000__000000914E3F38F0 000000067F0000400200010000000098C000-000000067F00004002000100000000990000__000000931B9A2710 000000067F00004002000100000000990000-000000067F00004002000100000000994000__00000073AD3FE6B8 000000067F00004002000100000000990000-000000067F00004002000100000000994000__000000914E3F38F0 000000067F00004002000100000000990000-000000067F00004002000100000000994000__000000931B9A2710 000000067F000040020001000000009909C2-000000067F000040020001000000009993A0__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000994000-000000067F00004002000100000000998000__00000073AD3FE6B8 000000067F00004002000100000000994000-000000067F00004002000100000000998000__000000914E3F38F0 000000067F00004002000100000000994000-000000067F00004002000100000000998000__000000931B9A2710 000000067F00004002000100000000998000-000000067F0000400200010000000099C000__00000073AD3FE6B8 000000067F00004002000100000000998000-000000067F0000400200010000000099C000__000000914E3F38F0 
000000067F00004002000100000000998000-000000067F0000400200010000000099C000__000000931B9A2710 000000067F000040020001000000009993A0-000000067F000040020001000000009A1D79__0000005CA7BBD6F9-000000739A8D1299 000000067F0000400200010000000099C000-000000067F000040020001000000009A0000__00000073AD3FE6B8 000000067F0000400200010000000099C000-000000067F000040020001000000009A0000__000000914E3F38F0 000000067F0000400200010000000099C000-000000067F000040020001000000009A0000__000000931B9A2710 000000067F000040020001000000009A0000-000000067F000040020001000000009A4000__00000073AD3FE6B8 000000067F000040020001000000009A0000-000000067F000040020001000000009A4000__000000914E3F38F0 000000067F000040020001000000009A0000-000000067F000040020001000000009A4000__000000931B9A2710 000000067F000040020001000000009A1D79-000000067F000040020001000000009AA74E__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000009A4000-000000067F000040020001000000009A8000__00000073AD3FE6B8 000000067F000040020001000000009A4000-000000067F000040020001000000009A8000__000000914E3F38F0 000000067F000040020001000000009A4000-000000067F000040020001000000009A8000__000000931B9A2710 000000067F000040020001000000009A8000-000000067F000040020001000000009AC000__00000073AD3FE6B8 000000067F000040020001000000009A8000-000000067F000040020001000000009AC000__000000914E3F38F0 000000067F000040020001000000009A8000-000000067F000040020001000000009AC000__000000931B9A2710 000000067F000040020001000000009AA74E-000000067F000040020001000000009B311D__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000009AC000-000000067F000040020001000000009B0000__00000073AD3FE6B8 000000067F000040020001000000009AC000-000000067F000040020001000000009B0000__000000914E3F38F0 000000067F000040020001000000009AC000-000000067F000040020001000000009B0000__000000931B9A2710 000000067F000040020001000000009B0000-000000067F000040020001000000009B4000__00000073AD3FE6B8 000000067F000040020001000000009B0000-000000067F000040020001000000009B4000__000000914E3F38F0 
000000067F000040020001000000009B0000-000000067F000040020001000000009B4000__000000931B9A2710 000000067F000040020001000000009B311D-000000067F000040020001000000009BBB01__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000009B4000-000000067F000040020001000000009B8000__00000073AD3FE6B8 000000067F000040020001000000009B4000-000000067F000040020001000000009B8000__000000914E3F38F0 000000067F000040020001000000009B4000-000000067F000040020001000000009B8000__000000931B9A2710 000000067F000040020001000000009B8000-000000067F000040020001000000009BC000__00000073AD3FE6B8 000000067F000040020001000000009B8000-000000067F000040020001000000009BC000__000000914E3F38F0 000000067F000040020001000000009B8000-000000067F000040020001000000009BC000__000000931B9A2710 000000067F000040020001000000009BBB01-000000067F000040020001000000009C44DD__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000009BC000-000000067F000040020001000000009C0000__00000073AD3FE6B8 000000067F000040020001000000009BC000-000000067F000040020001000000009C0000__000000914E3F38F0 000000067F000040020001000000009BC000-000000067F000040020001000000009C0000__000000931B9A2710 000000067F000040020001000000009C0000-000000067F000040020001000000009C4000__00000073AD3FE6B8 000000067F000040020001000000009C0000-000000067F000040020001000000009C4000__000000914E3F38F0 000000067F000040020001000000009C0000-000000067F000040020001000000009C4000__000000931B9A2710 000000067F000040020001000000009C4000-000000067F000040020001000000009C8000__00000073AD3FE6B8 000000067F000040020001000000009C4000-000000067F000040020001000000009C8000__000000914E3F38F0 000000067F000040020001000000009C4000-000000067F000040020001000000009C8000__000000931B9A2710 000000067F000040020001000000009C44DD-000000067F000040020001000000009CCEC8__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000009C8000-000000067F000040020001000000009CC000__00000073AD3FE6B8 000000067F000040020001000000009C8000-000000067F000040020001000000009CC000__000000914E3F38F0 
000000067F000040020001000000009C8000-000000067F000040020001000000009CC000__000000931B9A2710 000000067F000040020001000000009CC000-000000067F000040020001000000009D0000__00000073AD3FE6B8 000000067F000040020001000000009CC000-000000067F000040020001000000009D0000__000000914E3F38F0 000000067F000040020001000000009CC000-000000067F000040020001000000009D0000__000000931B9A2710 000000067F000040020001000000009CCEC8-000000067F000040020001000000009D58A3__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000009D0000-000000067F000040020001000000009D4000__00000073AD3FE6B8 000000067F000040020001000000009D0000-000000067F000040020001000000009D4000__000000914E3F38F0 000000067F000040020001000000009D0000-000000067F000040020001000000009D4000__000000931B9A2710 000000067F000040020001000000009D4000-000000067F000040020001000000009D8000__00000073AD3FE6B8 000000067F000040020001000000009D4000-000000067F000040020001000000009D8000__000000914E3F38F0 000000067F000040020001000000009D4000-000000067F000040020001000000009D8000__000000931B9A2710 000000067F000040020001000000009D58A3-000000067F000040020001000000009DE27F__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000009D8000-000000067F000040020001000000009DC000__00000073AD3FE6B8 000000067F000040020001000000009D8000-000000067F000040020001000000009DC000__000000914E3F38F0 000000067F000040020001000000009D8000-000000067F000040020001000000009DC000__000000931B9A2710 000000067F000040020001000000009DC000-000000067F000040020001000000009E0000__00000073AD3FE6B8 000000067F000040020001000000009DC000-000000067F000040020001000000009E0000__000000914E3F38F0 000000067F000040020001000000009DC000-000000067F000040020001000000009E0000__000000931B9A2710 000000067F000040020001000000009DE27F-000000067F000040020001000000009E6C5B__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000009E0000-000000067F000040020001000000009E4000__00000073AD3FE6B8 000000067F000040020001000000009E0000-000000067F000040020001000000009E4000__000000914E3F38F0 
000000067F000040020001000000009E0000-000000067F000040020001000000009E4000__000000931B9A2710 000000067F000040020001000000009E4000-000000067F000040020001000000009E8000__00000073AD3FE6B8 000000067F000040020001000000009E4000-000000067F000040020001000000009E8000__000000914E3F38F0 000000067F000040020001000000009E4000-000000067F000040020001000000009E8000__000000931B9A2710 000000067F000040020001000000009E6C5B-000000067F000040020001000000009EF631__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000009E8000-000000067F000040020001000000009EC000__00000073AD3FE6B8 000000067F000040020001000000009E8000-000000067F000040020001000000009EC000__000000914E3F38F0 000000067F000040020001000000009E8000-000000067F000040020001000000009EC000__000000931B9A2710 000000067F000040020001000000009EC000-000000067F000040020001000000009F0000__00000073AD3FE6B8 000000067F000040020001000000009EC000-000000067F000040020001000000009F0000__000000914E3F38F0 000000067F000040020001000000009EC000-000000067F000040020001000000009F0000__000000931B9A2710 000000067F000040020001000000009EF631-000000067F000040020001000000009F8011__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000009F0000-000000067F000040020001000000009F4000__00000073AD3FE6B8 000000067F000040020001000000009F0000-000000067F000040020001000000009F4000__000000914E3F38F0 000000067F000040020001000000009F0000-000000067F000040020001000000009F4000__000000931B9A2710 000000067F000040020001000000009F4000-000000067F000040020001000000009F8000__00000073AD3FE6B8 000000067F000040020001000000009F4000-000000067F000040020001000000009F8000__000000914E3F38F0 000000067F000040020001000000009F4000-000000067F000040020001000000009F8000__000000931B9A2710 000000067F000040020001000000009F8000-000000067F000040020001000000009FC000__00000073AD3FE6B8 000000067F000040020001000000009F8000-000000067F000040020001000000009FC000__000000914E3F38F0 000000067F000040020001000000009F8000-000000067F000040020001000000009FC000__000000931B9A2710 
000000067F000040020001000000009F8011-000000067F00004002000100000000A009F2__0000005CA7BBD6F9-000000739A8D1299 000000067F000040020001000000009FC000-000000067F00004002000100000000A00000__00000073AD3FE6B8 000000067F000040020001000000009FC000-000000067F00004002000100000000A00000__000000914E3F38F0 000000067F000040020001000000009FC000-000000067F00004002000100000000A00000__000000931B9A2710 000000067F00004002000100000000A00000-000000067F00004002000100000000A04000__00000073AD3FE6B8 000000067F00004002000100000000A00000-000000067F00004002000100000000A04000__000000914E3F38F0 000000067F00004002000100000000A00000-000000067F00004002000100000000A04000__000000931B9A2710 000000067F00004002000100000000A009F2-000000067F00004002000100000000A093E0__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000A04000-000000067F00004002000100000000A08000__00000073AD3FE6B8 000000067F00004002000100000000A04000-000000067F00004002000100000000A08000__000000914E3F38F0 000000067F00004002000100000000A04000-000000067F00004002000100000000A08000__000000931B9A2710 000000067F00004002000100000000A08000-000000067F00004002000100000000A0C000__00000073AD3FE6B8 000000067F00004002000100000000A08000-000000067F00004002000100000000A0C000__000000914E3F38F0 000000067F00004002000100000000A08000-000000067F00004002000100000000A0C000__000000931B9A2710 000000067F00004002000100000000A093E0-000000067F00004002000100000000A11DBB__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000A0C000-000000067F00004002000100000000A10000__00000073AD3FE6B8 000000067F00004002000100000000A0C000-000000067F00004002000100000000A10000__000000914E3F38F0 000000067F00004002000100000000A0C000-000000067F00004002000100000000A10000__000000931B9A2710 000000067F00004002000100000000A10000-000000067F00004002000100000000A14000__00000073AD3FE6B8 000000067F00004002000100000000A10000-000000067F00004002000100000000A14000__000000914E3F38F0 000000067F00004002000100000000A10000-000000067F00004002000100000000A14000__000000931B9A2710 
000000067F00004002000100000000A11DBB-000000067F00004002000100000000A1A795__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000A14000-000000067F00004002000100000000A18000__00000073AD3FE6B8 000000067F00004002000100000000A14000-000000067F00004002000100000000A18000__000000914E3F38F0 000000067F00004002000100000000A14000-000000067F00004002000100000000A18000__000000931B9A2710 000000067F00004002000100000000A18000-000000067F00004002000100000000A1C000__00000073AD3FE6B8 000000067F00004002000100000000A18000-000000067F00004002000100000000A1C000__000000914E3F38F0 000000067F00004002000100000000A18000-000000067F00004002000100000000A1C000__000000931B9A2710 000000067F00004002000100000000A1A795-000000067F00004002000100000000A23173__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000A1C000-000000067F00004002000100000000A20000__00000073AD3FE6B8 000000067F00004002000100000000A1C000-000000067F00004002000100000000A20000__000000914E3F38F0 000000067F00004002000100000000A1C000-000000067F00004002000100000000A20000__000000931B9A2710 000000067F00004002000100000000A20000-000000067F00004002000100000000A24000__00000073AD3FE6B8 000000067F00004002000100000000A20000-000000067F00004002000100000000A24000__000000914E3F38F0 000000067F00004002000100000000A20000-000000067F00004002000100000000A24000__000000931B9A2710 000000067F00004002000100000000A23173-000000067F00004002000100000000A2BB4B__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000A24000-000000067F00004002000100000000A28000__00000073AD3FE6B8 000000067F00004002000100000000A24000-000000067F00004002000100000000A28000__000000914E3F38F0 000000067F00004002000100000000A24000-000000067F00004002000100000000A28000__000000931B9A2710 000000067F00004002000100000000A28000-000000067F00004002000100000000A2C000__00000073AD3FE6B8 000000067F00004002000100000000A28000-000000067F00004002000100000000A2C000__000000914E3F38F0 000000067F00004002000100000000A28000-000000067F00004002000100000000A2C000__000000931B9A2710 
000000067F00004002000100000000A2BB4B-000000067F00004002000100000000A34529__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000A2C000-000000067F00004002000100000000A30000__00000073AD3FE6B8 000000067F00004002000100000000A2C000-000000067F00004002000100000000A30000__000000914E3F38F0 000000067F00004002000100000000A2C000-000000067F00004002000100000000A30000__000000931B9A2710 000000067F00004002000100000000A30000-000000067F00004002000100000000A34000__00000073AD3FE6B8 000000067F00004002000100000000A30000-000000067F00004002000100000000A34000__000000914E3F38F0 000000067F00004002000100000000A30000-000000067F00004002000100000000A34000__000000931B9A2710 000000067F00004002000100000000A34000-000000067F00004002000100000000A38000__00000073AD3FE6B8 000000067F00004002000100000000A34000-000000067F00004002000100000000A38000__000000914E3F38F0 000000067F00004002000100000000A34000-000000067F00004002000100000000A38000__000000931B9A2710 000000067F00004002000100000000A34529-000000067F00004002000100000000A3CF0D__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000A38000-000000067F00004002000100000000A3C000__00000073AD3FE6B8 000000067F00004002000100000000A38000-000000067F00004002000100000000A3C000__000000914E3F38F0 000000067F00004002000100000000A38000-000000067F00004002000100000000A3C000__000000931B9A2710 000000067F00004002000100000000A3C000-000000067F00004002000100000000A40000__00000073AD3FE6B8 000000067F00004002000100000000A3C000-000000067F00004002000100000000A40000__000000914E3F38F0 000000067F00004002000100000000A3C000-000000067F00004002000100000000A40000__000000931B9A2710 000000067F00004002000100000000A3CF0D-000000067F00004002000100000000A458E2__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000A40000-000000067F00004002000100000000A44000__00000073AD3FE6B8 000000067F00004002000100000000A40000-000000067F00004002000100000000A44000__000000914E3F38F0 000000067F00004002000100000000A40000-000000067F00004002000100000000A44000__000000931B9A2710 
000000067F00004002000100000000A44000-000000067F00004002000100000000A48000__00000073AD3FE6B8 000000067F00004002000100000000A44000-000000067F00004002000100000000A48000__000000914E3F38F0 000000067F00004002000100000000A44000-000000067F00004002000100000000A48000__000000931B9A2710 000000067F00004002000100000000A458E2-000000067F00004002000100000000A4E2BE__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000A48000-000000067F00004002000100000000A4C000__00000073AD3FE6B8 000000067F00004002000100000000A48000-000000067F00004002000100000000A4C000__000000914E3F38F0 000000067F00004002000100000000A48000-000000067F00004002000100000000A4C000__000000931B9A2710 000000067F00004002000100000000A4C000-000000067F00004002000100000000A50000__00000073AD3FE6B8 000000067F00004002000100000000A4C000-000000067F00004002000100000000A50000__000000914E3F38F0 000000067F00004002000100000000A4C000-000000067F00004002000100000000A50000__000000931B9A2710 000000067F00004002000100000000A4E2BE-000000067F00004002000100000000A56C93__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000A50000-000000067F00004002000100000000A54000__00000073AD3FE6B8 000000067F00004002000100000000A50000-000000067F00004002000100000000A54000__000000914E3F38F0 000000067F00004002000100000000A50000-000000067F00004002000100000000A54000__000000931B9A2710 000000067F00004002000100000000A54000-000000067F00004002000100000000A58000__00000073AD3FE6B8 000000067F00004002000100000000A54000-000000067F00004002000100000000A58000__000000914E3F38F0 000000067F00004002000100000000A54000-000000067F00004002000100000000A58000__000000931B9A2710 000000067F00004002000100000000A56C93-000000067F00004002000100000000A5F666__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000A58000-000000067F00004002000100000000A5C000__00000073AD3FE6B8 000000067F00004002000100000000A58000-000000067F00004002000100000000A5C000__000000914E3F38F0 000000067F00004002000100000000A58000-000000067F00004002000100000000A5C000__000000931B9A2710 
000000067F00004002000100000000A5C000-000000067F00004002000100000000A60000__00000073AD3FE6B8 000000067F00004002000100000000A5C000-000000067F00004002000100000000A60000__000000914E3F38F0 000000067F00004002000100000000A5C000-000000067F00004002000100000000A60000__000000931B9A2710 000000067F00004002000100000000A5F666-000000067F00004002000100000000A68049__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000A60000-000000067F00004002000100000000A64000__00000073AD3FE6B8 000000067F00004002000100000000A60000-000000067F00004002000100000000A64000__000000914E3F38F0 000000067F00004002000100000000A60000-000000067F00004002000100000000A64000__000000931B9A2710 000000067F00004002000100000000A64000-000000067F00004002000100000000A68000__00000073AD3FE6B8 000000067F00004002000100000000A64000-000000067F00004002000100000000A68000__000000914E3F38F0 000000067F00004002000100000000A64000-000000067F00004002000100000000A68000__000000931B9A2710 000000067F00004002000100000000A68000-000000067F00004002000100000000A6C000__00000073AD3FE6B8 000000067F00004002000100000000A68000-000000067F00004002000100000000A6C000__000000914E3F38F0 000000067F00004002000100000000A68000-000000067F00004002000100000000A6C000__000000931B9A2710 000000067F00004002000100000000A68049-000000067F00004002000100000000A70A2B__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000A6C000-000000067F00004002000100000000A70000__00000073AD3FE6B8 000000067F00004002000100000000A6C000-000000067F00004002000100000000A70000__000000914E3F38F0 000000067F00004002000100000000A6C000-000000067F00004002000100000000A70000__000000931B9A2710 000000067F00004002000100000000A70000-000000067F00004002000100000000A74000__00000073AD3FE6B8 000000067F00004002000100000000A70000-000000067F00004002000100000000A74000__000000914E3F38F0 000000067F00004002000100000000A70000-000000067F00004002000100000000A74000__000000931B9A2710 000000067F00004002000100000000A70A2B-000000067F00004002000100000000A7940C__0000005CA7BBD6F9-000000739A8D1299 
000000067F00004002000100000000A74000-000000067F00004002000100000000A78000__00000073AD3FE6B8 000000067F00004002000100000000A74000-000000067F00004002000100000000A78000__000000914E3F38F0 000000067F00004002000100000000A74000-000000067F00004002000100000000A78000__000000931B9A2710 000000067F00004002000100000000A78000-000000067F00004002000100000000A7C000__00000073AD3FE6B8 000000067F00004002000100000000A78000-000000067F00004002000100000000A7C000__000000914E3F38F0 000000067F00004002000100000000A78000-000000067F00004002000100000000A7C000__000000931B9A2710 000000067F00004002000100000000A7940C-000000067F00004002000100000000A81DD9__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000A7C000-000000067F00004002000100000000A80000__00000073AD3FE6B8 000000067F00004002000100000000A7C000-000000067F00004002000100000000A80000__000000914E3F38F0 000000067F00004002000100000000A7C000-000000067F00004002000100000000A80000__000000931B9A2710 000000067F00004002000100000000A80000-000000067F00004002000100000000A84000__00000073AD3FE6B8 000000067F00004002000100000000A80000-000000067F00004002000100000000A84000__000000914E3F38F0 000000067F00004002000100000000A80000-000000067F00004002000100000000A84000__000000931B9A2710 000000067F00004002000100000000A81DD9-000000067F00004002000100000000A8A7B8__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000A84000-000000067F00004002000100000000A88000__00000073AD3FE6B8 000000067F00004002000100000000A84000-000000067F00004002000100000000A88000__000000914E3F38F0 000000067F00004002000100000000A84000-000000067F00004002000100000000A88000__000000931B9A2710 000000067F00004002000100000000A88000-000000067F00004002000100000000A8C000__00000073AD3FE6B8 000000067F00004002000100000000A88000-000000067F00004002000100000000A8C000__000000914E3F38F0 000000067F00004002000100000000A88000-000000067F00004002000100000000A8C000__000000931B9A2710 000000067F00004002000100000000A8A7B8-000000067F00004002000100000000A9318F__0000005CA7BBD6F9-000000739A8D1299 
000000067F00004002000100000000A8C000-000000067F00004002000100000000A90000__00000073AD3FE6B8 000000067F00004002000100000000A8C000-000000067F00004002000100000000A90000__000000914E3F38F0 000000067F00004002000100000000A8C000-000000067F00004002000100000000A90000__000000931B9A2710 000000067F00004002000100000000A90000-000000067F00004002000100000000A94000__00000073AD3FE6B8 000000067F00004002000100000000A90000-000000067F00004002000100000000A94000__000000914E3F38F0 000000067F00004002000100000000A90000-000000067F00004002000100000000A94000__000000931B9A2710 000000067F00004002000100000000A9318F-000000067F00004002000100000000A9BB65__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000A94000-000000067F00004002000100000000A98000__00000073AD3FE6B8 000000067F00004002000100000000A94000-000000067F00004002000100000000A98000__000000914E3F38F0 000000067F00004002000100000000A94000-000000067F00004002000100000000A98000__000000931B9A2710 000000067F00004002000100000000A98000-000000067F00004002000100000000A9C000__00000073AD3FE6B8 000000067F00004002000100000000A98000-000000067F00004002000100000000A9C000__000000914E3F38F0 000000067F00004002000100000000A98000-000000067F00004002000100000000A9C000__000000931B9A2710 000000067F00004002000100000000A9BB65-000000067F00004002000100000000AA4546__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000A9C000-000000067F00004002000100000000AA0000__00000073AD3FE6B8 000000067F00004002000100000000A9C000-000000067F00004002000100000000AA0000__000000914E3F38F0 000000067F00004002000100000000A9C000-000000067F00004002000100000000AA0000__000000931B9A2710 000000067F00004002000100000000AA0000-000000067F00004002000100000000AA4000__00000073AD3FE6B8 000000067F00004002000100000000AA0000-000000067F00004002000100000000AA4000__000000914E3F38F0 000000067F00004002000100000000AA0000-000000067F00004002000100000000AA4000__000000931B9A2710 000000067F00004002000100000000AA4000-000000067F00004002000100000000AA8000__00000073AD3FE6B8 
000000067F00004002000100000000AA4000-000000067F00004002000100000000AA8000__000000914E3F38F0 000000067F00004002000100000000AA4000-000000067F00004002000100000000AA8000__000000931B9A2710 000000067F00004002000100000000AA4546-000000067F00004002000100000000AACF1E__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000AA8000-000000067F00004002000100000000AAC000__00000073AD3FE6B8 000000067F00004002000100000000AA8000-000000067F00004002000100000000AAC000__000000914E3F38F0 000000067F00004002000100000000AA8000-000000067F00004002000100000000AAC000__000000931B9A2710 000000067F00004002000100000000AAC000-000000067F00004002000100000000AB0000__00000073AD3FE6B8 000000067F00004002000100000000AAC000-000000067F00004002000100000000AB0000__000000914E3F38F0 000000067F00004002000100000000AAC000-000000067F00004002000100000000AB0000__000000931B9A2710 000000067F00004002000100000000AACF1E-000000067F00004002000100000000AB58FC__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000AB0000-000000067F00004002000100000000AB4000__00000073AD3FE6B8 000000067F00004002000100000000AB0000-000000067F00004002000100000000AB4000__000000914E3F38F0 000000067F00004002000100000000AB0000-000000067F00004002000100000000AB4000__000000931B9A2710 000000067F00004002000100000000AB4000-000000067F00004002000100000000AB8000__00000073AD3FE6B8 000000067F00004002000100000000AB4000-000000067F00004002000100000000AB8000__000000914E3F38F0 000000067F00004002000100000000AB4000-000000067F00004002000100000000AB8000__000000931B9A2710 000000067F00004002000100000000AB58FC-000000067F00004002000100000000ABE2E6__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000AB8000-000000067F00004002000100000000ABC000__00000073AD3FE6B8 000000067F00004002000100000000AB8000-000000067F00004002000100000000ABC000__000000914E3F38F0 000000067F00004002000100000000AB8000-000000067F00004002000100000000ABC000__000000931B9A2710 000000067F00004002000100000000ABC000-000000067F00004002000100000000AC0000__00000073AD3FE6B8 
000000067F00004002000100000000ABC000-000000067F00004002000100000000AC0000__000000914E3F38F0 000000067F00004002000100000000ABC000-000000067F00004002000100000000AC0000__000000931B9A2710 000000067F00004002000100000000ABE2E6-000000067F00004002000100000000AC6CC2__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000AC0000-000000067F00004002000100000000AC4000__00000073AD3FE6B8 000000067F00004002000100000000AC0000-000000067F00004002000100000000AC4000__000000914E3F38F0 000000067F00004002000100000000AC0000-000000067F00004002000100000000AC4000__000000931B9A2710 000000067F00004002000100000000AC4000-000000067F00004002000100000000AC8000__00000073AD3FE6B8 000000067F00004002000100000000AC4000-000000067F00004002000100000000AC8000__000000914E3F38F0 000000067F00004002000100000000AC4000-000000067F00004002000100000000AC8000__000000931B9A2710 000000067F00004002000100000000AC6CC2-000000067F00004002000100000000ACF6A1__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000AC8000-000000067F00004002000100000000ACC000__00000073AD3FE6B8 000000067F00004002000100000000AC8000-000000067F00004002000100000000ACC000__000000914E3F38F0 000000067F00004002000100000000AC8000-000000067F00004002000100000000ACC000__000000931B9A2710 000000067F00004002000100000000ACC000-000000067F00004002000100000000AD0000__00000073AD3FE6B8 000000067F00004002000100000000ACC000-000000067F00004002000100000000AD0000__000000914E3F38F0 000000067F00004002000100000000ACC000-000000067F00004002000100000000AD0000__000000931B9A2710 000000067F00004002000100000000ACF6A1-000000067F00004002000100000000AD8072__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000AD0000-000000067F00004002000100000000AD4000__00000073AD3FE6B8 000000067F00004002000100000000AD0000-000000067F00004002000100000000AD4000__000000914E3F38F0 000000067F00004002000100000000AD0000-000000067F00004002000100000000AD4000__000000931B9A2710 000000067F00004002000100000000AD4000-000000067F00004002000100000000AD8000__00000073AD3FE6B8 
000000067F00004002000100000000AD4000-000000067F00004002000100000000AD8000__000000914E3F38F0 000000067F00004002000100000000AD4000-000000067F00004002000100000000AD8000__000000931B9A2710 000000067F00004002000100000000AD8000-000000067F00004002000100000000ADC000__00000073AD3FE6B8 000000067F00004002000100000000AD8000-000000067F00004002000100000000ADC000__000000914E3F38F0 000000067F00004002000100000000AD8000-000000067F00004002000100000000ADC000__000000931B9A2710 000000067F00004002000100000000AD8072-000000067F00004002000100000000AE0A4E__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000ADC000-000000067F00004002000100000000AE0000__00000073AD3FE6B8 000000067F00004002000100000000ADC000-000000067F00004002000100000000AE0000__000000914E3F38F0 000000067F00004002000100000000ADC000-000000067F00004002000100000000AE0000__000000931B9A2710 000000067F00004002000100000000AE0000-000000067F00004002000100000000AE4000__00000073AD3FE6B8 000000067F00004002000100000000AE0000-000000067F00004002000100000000AE4000__000000914E3F38F0 000000067F00004002000100000000AE0000-000000067F00004002000100000000AE4000__000000931B9A2710 000000067F00004002000100000000AE0A4E-000000067F00004002000100000000AE942F__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000AE4000-000000067F00004002000100000000AE8000__00000073AD3FE6B8 000000067F00004002000100000000AE4000-000000067F00004002000100000000AE8000__000000914E3F38F0 000000067F00004002000100000000AE4000-000000067F00004002000100000000AE8000__000000931B9A2710 000000067F00004002000100000000AE8000-000000067F00004002000100000000AEC000__00000073AD3FE6B8 000000067F00004002000100000000AE8000-000000067F00004002000100000000AEC000__000000914E3F38F0 000000067F00004002000100000000AE8000-000000067F00004002000100000000AEC000__000000931B9A2710 000000067F00004002000100000000AE942F-000000067F00004002000100000000AF1E0F__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000AEC000-000000067F00004002000100000000AF0000__00000073AD3FE6B8 
000000067F00004002000100000000AEC000-000000067F00004002000100000000AF0000__000000914E3F38F0 000000067F00004002000100000000AEC000-000000067F00004002000100000000AF0000__000000931B9A2710 000000067F00004002000100000000AF0000-000000067F00004002000100000000AF4000__00000073AD3FE6B8 000000067F00004002000100000000AF0000-000000067F00004002000100000000AF4000__000000914E3F38F0 000000067F00004002000100000000AF0000-000000067F00004002000100000000AF4000__000000931B9A2710 000000067F00004002000100000000AF1E0F-000000067F00004002000100000000AFA7DD__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000AF4000-000000067F00004002000100000000AF8000__00000073AD3FE6B8 000000067F00004002000100000000AF4000-000000067F00004002000100000000AF8000__000000914E3F38F0 000000067F00004002000100000000AF4000-000000067F00004002000100000000AF8000__000000931B9A2710 000000067F00004002000100000000AF8000-000000067F00004002000100000000AFC000__00000073AD3FE6B8 000000067F00004002000100000000AF8000-000000067F00004002000100000000AFC000__000000914E3F38F0 000000067F00004002000100000000AF8000-000000067F00004002000100000000AFC000__000000931B9A2710 000000067F00004002000100000000AFA7DD-000000067F00004002000100000000B031B5__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000AFC000-000000067F00004002000100000000B00000__00000073AD3FE6B8 000000067F00004002000100000000AFC000-000000067F00004002000100000000B00000__000000914E3F38F0 000000067F00004002000100000000AFC000-000000067F00004002000100000000B00000__000000931B9A2710 000000067F00004002000100000000B00000-000000067F00004002000100000000B04000__00000073AD3FE6B8 000000067F00004002000100000000B00000-000000067F00004002000100000000B04000__000000914E3F38F0 000000067F00004002000100000000B00000-000000067F00004002000100000000B04000__000000931B9A2710 000000067F00004002000100000000B031B5-000000067F00004002000100000000B0BB95__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B04000-000000067F00004002000100000000B08000__00000073AD3FE6B8 
000000067F00004002000100000000B04000-000000067F00004002000100000000B08000__000000914E3F38F0 000000067F00004002000100000000B04000-000000067F00004002000100000000B08000__000000931B9A2710 000000067F00004002000100000000B08000-000000067F00004002000100000000B0C000__00000073AD3FE6B8 000000067F00004002000100000000B08000-000000067F00004002000100000000B0C000__000000914E3F38F0 000000067F00004002000100000000B08000-000000067F00004002000100000000B0C000__000000931B9A2710 000000067F00004002000100000000B0BB95-000000067F00004002000100000000B1456D__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B0C000-000000067F00004002000100000000B10000__00000073AD3FE6B8 000000067F00004002000100000000B0C000-000000067F00004002000100000000B10000__000000914E3F38F0 000000067F00004002000100000000B0C000-000000067F00004002000100000000B10000__000000931B9A2710 000000067F00004002000100000000B10000-000000067F00004002000100000000B14000__00000073AD3FE6B8 000000067F00004002000100000000B10000-000000067F00004002000100000000B14000__000000914E3F38F0 000000067F00004002000100000000B10000-000000067F00004002000100000000B14000__000000931B9A2710 000000067F00004002000100000000B14000-000000067F00004002000100000000B18000__00000073AD3FE6B8 000000067F00004002000100000000B14000-000000067F00004002000100000000B18000__000000914E3F38F0 000000067F00004002000100000000B14000-000000067F00004002000100000000B18000__000000931B9A2710 000000067F00004002000100000000B1456D-000000067F00004002000100000000B1CF4D__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B18000-000000067F00004002000100000000B1C000__00000073AD3FE6B8 000000067F00004002000100000000B18000-000000067F00004002000100000000B1C000__000000914E3F38F0 000000067F00004002000100000000B18000-000000067F00004002000100000000B1C000__000000931B9A2710 000000067F00004002000100000000B1C000-000000067F00004002000100000000B20000__00000073AD3FE6B8 000000067F00004002000100000000B1C000-000000067F00004002000100000000B20000__000000914E3F38F0 
000000067F00004002000100000000B1C000-000000067F00004002000100000000B20000__000000931B9A2710 000000067F00004002000100000000B1CF4D-000000067F00004002000100000000B2592E__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B20000-000000067F00004002000100000000B24000__00000073AD3FE6B8 000000067F00004002000100000000B20000-000000067F00004002000100000000B24000__000000914E3F38F0 000000067F00004002000100000000B20000-000000067F00004002000100000000B24000__000000931B9A2710 000000067F00004002000100000000B24000-000000067F00004002000100000000B28000__00000073AD3FE6B8 000000067F00004002000100000000B24000-000000067F00004002000100000000B28000__000000914E3F38F0 000000067F00004002000100000000B24000-000000067F00004002000100000000B28000__000000931B9A2710 000000067F00004002000100000000B2592E-000000067F00004002000100000000B2E310__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B28000-000000067F00004002000100000000B2C000__00000073AD3FE6B8 000000067F00004002000100000000B28000-000000067F00004002000100000000B2C000__000000914E3F38F0 000000067F00004002000100000000B28000-000000067F00004002000100000000B2C000__000000931B9A2710 000000067F00004002000100000000B2C000-000000067F00004002000100000000B30000__00000073AD3FE6B8 000000067F00004002000100000000B2C000-000000067F00004002000100000000B30000__000000914E3F38F0 000000067F00004002000100000000B2C000-000000067F00004002000100000000B30000__000000931B9A2710 000000067F00004002000100000000B2E310-000000067F00004002000100000000B36CE8__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B30000-000000067F00004002000100000000B34000__00000073AD3FE6B8 000000067F00004002000100000000B30000-000000067F00004002000100000000B34000__000000914E3F38F0 000000067F00004002000100000000B30000-000000067F00004002000100000000B34000__000000931B9A2710 000000067F00004002000100000000B34000-000000067F00004002000100000000B38000__00000073AD3FE6B8 000000067F00004002000100000000B34000-000000067F00004002000100000000B38000__000000914E3F38F0 
000000067F00004002000100000000B34000-000000067F00004002000100000000B38000__000000931B9A2710 000000067F00004002000100000000B36CE8-000000067F00004002000100000000B3F6C4__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B38000-000000067F00004002000100000000B3C000__00000073AD3FE6B8 000000067F00004002000100000000B38000-000000067F00004002000100000000B3C000__000000914E3F38F0 000000067F00004002000100000000B38000-000000067F00004002000100000000B3C000__000000931B9A2710 000000067F00004002000100000000B3C000-000000067F00004002000100000000B40000__00000073AD3FE6B8 000000067F00004002000100000000B3C000-000000067F00004002000100000000B40000__000000914E3F38F0 000000067F00004002000100000000B3C000-000000067F00004002000100000000B40000__000000931B9A2710 000000067F00004002000100000000B3F6C4-000000067F00004002000100000000B480A3__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B40000-000000067F00004002000100000000B44000__00000073AD3FE6B8 000000067F00004002000100000000B40000-000000067F00004002000100000000B44000__000000914E3F38F0 000000067F00004002000100000000B40000-000000067F00004002000100000000B44000__000000931B9A2710 000000067F00004002000100000000B44000-000000067F00004002000100000000B48000__00000073AD3FE6B8 000000067F00004002000100000000B44000-000000067F00004002000100000000B48000__000000914E3F38F0 000000067F00004002000100000000B44000-000000067F00004002000100000000B48000__000000931B9A2710 000000067F00004002000100000000B48000-000000067F00004002000100000000B4C000__00000073AD3FE6B8 000000067F00004002000100000000B48000-000000067F00004002000100000000B4C000__000000914E3F38F0 000000067F00004002000100000000B48000-000000067F00004002000100000000B4C000__000000931B9A2710 000000067F00004002000100000000B480A3-000000067F00004002000100000000B50A7D__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B4C000-000000067F00004002000100000000B50000__00000073AD3FE6B8 000000067F00004002000100000000B4C000-000000067F00004002000100000000B50000__000000914E3F38F0 
000000067F00004002000100000000B4C000-000000067F00004002000100000000B50000__000000931B9A2710 000000067F00004002000100000000B50000-000000067F00004002000100000000B54000__00000073AD3FE6B8 000000067F00004002000100000000B50000-000000067F00004002000100000000B54000__000000914E3F38F0 000000067F00004002000100000000B50000-000000067F00004002000100000000B54000__000000931B9A2710 000000067F00004002000100000000B50A7D-000000067F00004002000100000000B59456__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B54000-000000067F00004002000100000000B58000__00000073AD3FE6B8 000000067F00004002000100000000B54000-000000067F00004002000100000000B58000__000000914E3F38F0 000000067F00004002000100000000B54000-000000067F00004002000100000000B58000__000000931B9A2710 000000067F00004002000100000000B58000-000000067F00004002000100000000B5C000__00000073AD3FE6B8 000000067F00004002000100000000B58000-000000067F00004002000100000000B5C000__000000914E3F38F0 000000067F00004002000100000000B58000-000000067F00004002000100000000B5C000__000000931B9A2710 000000067F00004002000100000000B59456-000000067F00004002000100000000B61E31__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B5C000-000000067F00004002000100000000B60000__00000073AD3FE6B8 000000067F00004002000100000000B5C000-000000067F00004002000100000000B60000__000000914E3F38F0 000000067F00004002000100000000B5C000-000000067F00004002000100000000B60000__000000931B9A2710 000000067F00004002000100000000B60000-000000067F00004002000100000000B64000__00000073AD3FE6B8 000000067F00004002000100000000B60000-000000067F00004002000100000000B64000__000000914E3F38F0 000000067F00004002000100000000B60000-000000067F00004002000100000000B64000__000000931B9A2710 000000067F00004002000100000000B61E31-000000067F00004002000100000000B6A810__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B64000-000000067F00004002000100000000B68000__00000073AD3FE6B8 000000067F00004002000100000000B64000-000000067F00004002000100000000B68000__000000914E3F38F0 
000000067F00004002000100000000B64000-000000067F00004002000100000000B68000__000000931B9A2710 000000067F00004002000100000000B68000-000000067F00004002000100000000B6C000__00000073AD3FE6B8 000000067F00004002000100000000B68000-000000067F00004002000100000000B6C000__000000914E3F38F0 000000067F00004002000100000000B68000-000000067F00004002000100000000B6C000__000000931B9A2710 000000067F00004002000100000000B6A810-000000067F00004002000100000000B731E5__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B6C000-000000067F00004002000100000000B70000__00000073AD3FE6B8 000000067F00004002000100000000B6C000-000000067F00004002000100000000B70000__000000914E3F38F0 000000067F00004002000100000000B6C000-000000067F00004002000100000000B70000__000000931B9A2710 000000067F00004002000100000000B70000-000000067F00004002000100000000B74000__00000073AD3FE6B8 000000067F00004002000100000000B70000-000000067F00004002000100000000B74000__000000914E3F38F0 000000067F00004002000100000000B70000-000000067F00004002000100000000B74000__000000931B9A2710 000000067F00004002000100000000B731E5-000000067F00004002000100000000B7BBC4__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B74000-000000067F00004002000100000000B78000__00000073AD3FE6B8 000000067F00004002000100000000B74000-000000067F00004002000100000000B78000__000000914E3F38F0 000000067F00004002000100000000B74000-000000067F00004002000100000000B78000__000000931B9A2710 000000067F00004002000100000000B78000-000000067F00004002000100000000B7C000__00000073AD3FE6B8 000000067F00004002000100000000B78000-000000067F00004002000100000000B7C000__000000914E3F38F0 000000067F00004002000100000000B78000-000000067F00004002000100000000B7C000__000000931B9A2710 000000067F00004002000100000000B7BBC4-000000067F00004002000100000000B845A5__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B7C000-000000067F00004002000100000000B80000__00000073AD3FE6B8 000000067F00004002000100000000B7C000-000000067F00004002000100000000B80000__000000914E3F38F0 
000000067F00004002000100000000B7C000-000000067F00004002000100000000B80000__000000931B9A2710 000000067F00004002000100000000B80000-000000067F00004002000100000000B84000__00000073AD3FE6B8 000000067F00004002000100000000B80000-000000067F00004002000100000000B84000__000000914E3F38F0 000000067F00004002000100000000B80000-000000067F00004002000100000000B84000__000000931B9A2710 000000067F00004002000100000000B84000-000000067F00004002000100000000B88000__00000073AD3FE6B8 000000067F00004002000100000000B84000-000000067F00004002000100000000B88000__000000914E3F38F0 000000067F00004002000100000000B84000-000000067F00004002000100000000B88000__000000931B9A2710 000000067F00004002000100000000B845A5-000000067F00004002000100000000B8CF82__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B88000-000000067F00004002000100000000B8C000__00000073AD3FE6B8 000000067F00004002000100000000B88000-000000067F00004002000100000000B8C000__000000914E3F38F0 000000067F00004002000100000000B88000-000000067F00004002000100000000B8C000__000000931B9A2710 000000067F00004002000100000000B8C000-000000067F00004002000100000000B90000__00000073AD3FE6B8 000000067F00004002000100000000B8C000-000000067F00004002000100000000B90000__000000914E3F38F0 000000067F00004002000100000000B8C000-000000067F00004002000100000000B90000__000000931B9A2710 000000067F00004002000100000000B8CF82-000000067F00004002000100000000B95960__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B90000-000000067F00004002000100000000B94000__00000073AD3FE6B8 000000067F00004002000100000000B90000-000000067F00004002000100000000B94000__000000914E3F38F0 000000067F00004002000100000000B90000-000000067F00004002000100000000B94000__000000931B9A2710 000000067F00004002000100000000B94000-000000067F00004002000100000000B98000__00000073AD3FE6B8 000000067F00004002000100000000B94000-000000067F00004002000100000000B98000__000000914E3F38F0 000000067F00004002000100000000B94000-000000067F00004002000100000000B98000__000000931B9A2710 
000000067F00004002000100000000B95960-000000067F00004002000100000000B9E33F__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000B98000-000000067F00004002000100000000B9C000__00000073AD3FE6B8 000000067F00004002000100000000B98000-000000067F00004002000100000000B9C000__000000914E3F38F0 000000067F00004002000100000000B98000-000000067F00004002000100000000B9C000__000000931B9A2710 000000067F00004002000100000000B9C000-000000067F00004002000100000000BA0000__00000073AD3FE6B8 000000067F00004002000100000000B9C000-000000067F00004002000100000000BA0000__000000914E3F38F0 000000067F00004002000100000000B9C000-000000067F00004002000100000000BA0000__000000931B9A2710 000000067F00004002000100000000B9E33F-000000067F00004002000100000000BA6D14__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000BA0000-000000067F00004002000100000000BA4000__00000073AD3FE6B8 000000067F00004002000100000000BA0000-000000067F00004002000100000000BA4000__000000914E3F38F0 000000067F00004002000100000000BA0000-000000067F00004002000100000000BA4000__000000931B9A2710 000000067F00004002000100000000BA4000-000000067F00004002000100000000BA8000__00000073AD3FE6B8 000000067F00004002000100000000BA4000-000000067F00004002000100000000BA8000__000000914E3F38F0 000000067F00004002000100000000BA4000-000000067F00004002000100000000BA8000__000000931B9A2710 000000067F00004002000100000000BA6D14-000000067F00004002000100000000BAF6EE__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000BA8000-000000067F00004002000100000000BAC000__00000073AD3FE6B8 000000067F00004002000100000000BA8000-000000067F00004002000100000000BAC000__000000914E3F38F0 000000067F00004002000100000000BA8000-000000067F00004002000100000000BAC000__000000931B9A2710 000000067F00004002000100000000BAC000-000000067F00004002000100000000BB0000__00000073AD3FE6B8 000000067F00004002000100000000BAC000-000000067F00004002000100000000BB0000__000000914E3F38F0 000000067F00004002000100000000BAC000-000000067F00004002000100000000BB0000__000000931B9A2710 
000000067F00004002000100000000BAF6EE-000000067F00004002000100000000BB80C4__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000BB0000-000000067F00004002000100000000BB4000__00000073AD3FE6B8 000000067F00004002000100000000BB0000-000000067F00004002000100000000BB4000__000000914E3F38F0 000000067F00004002000100000000BB0000-000000067F00004002000100000000BB4000__000000931B9A2710 000000067F00004002000100000000BB4000-000000067F00004002000100000000BB8000__00000073AD3FE6B8 000000067F00004002000100000000BB4000-000000067F00004002000100000000BB8000__000000914E3F38F0 000000067F00004002000100000000BB4000-000000067F00004002000100000000BB8000__000000931B9A2710 000000067F00004002000100000000BB8000-000000067F00004002000100000000BBC000__00000073AD3FE6B8 000000067F00004002000100000000BB8000-000000067F00004002000100000000BBC000__000000914E3F38F0 000000067F00004002000100000000BB8000-000000067F00004002000100000000BBC000__000000931B9A2710 000000067F00004002000100000000BB80C4-000000067F00004002000100000000BC0A9B__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000BBC000-000000067F00004002000100000000BC0000__00000073AD3FE6B8 000000067F00004002000100000000BBC000-000000067F00004002000100000000BC0000__000000914E3F38F0 000000067F00004002000100000000BBC000-000000067F00004002000100000000BC0000__000000931B9A2710 000000067F00004002000100000000BC0000-000000067F00004002000100000000BC4000__00000073AD3FE6B8 000000067F00004002000100000000BC0000-000000067F00004002000100000000BC4000__000000914E3F38F0 000000067F00004002000100000000BC0000-000000067F00004002000100000000BC4000__000000931B9A2710 000000067F00004002000100000000BC0A9B-000000067F00004002000100000000BC9480__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000BC4000-000000067F00004002000100000000BC8000__00000073AD3FE6B8 000000067F00004002000100000000BC4000-000000067F00004002000100000000BC8000__000000914E3F38F0 000000067F00004002000100000000BC4000-000000067F00004002000100000000BC8000__000000931B9A2710 
000000067F00004002000100000000BC8000-000000067F00004002000100000000BCC000__00000073AD3FE6B8 000000067F00004002000100000000BC8000-000000067F00004002000100000000BCC000__000000914E3F38F0 000000067F00004002000100000000BC8000-000000067F00004002000100000000BCC000__000000931B9A2710 000000067F00004002000100000000BC9480-000000067F00004002000100000000BD1E68__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000BCC000-000000067F00004002000100000000BD0000__00000073AD3FE6B8 000000067F00004002000100000000BCC000-000000067F00004002000100000000BD0000__000000914E3F38F0 000000067F00004002000100000000BCC000-000000067F00004002000100000000BD0000__000000931B9A2710 000000067F00004002000100000000BD0000-000000067F00004002000100000000BD4000__00000073AD3FE6B8 000000067F00004002000100000000BD0000-000000067F00004002000100000000BD4000__000000914E3F38F0 000000067F00004002000100000000BD0000-000000067F00004002000100000000BD4000__000000931B9A2710 000000067F00004002000100000000BD1E68-000000067F00004002000100000000BDA835__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000BD4000-000000067F00004002000100000000BD8000__00000073AD3FE6B8 000000067F00004002000100000000BD4000-000000067F00004002000100000000BD8000__000000914E3F38F0 000000067F00004002000100000000BD4000-000000067F00004002000100000000BD8000__000000931B9A2710 000000067F00004002000100000000BD8000-000000067F00004002000100000000BDC000__00000073AD3FE6B8 000000067F00004002000100000000BD8000-000000067F00004002000100000000BDC000__000000914E3F38F0 000000067F00004002000100000000BD8000-000000067F00004002000100000000BDC000__000000931B9A2710 000000067F00004002000100000000BDA835-000000067F00004002000100000000BE320C__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000BDC000-000000067F00004002000100000000BE0000__00000073AD3FE6B8 000000067F00004002000100000000BDC000-000000067F00004002000100000000BE0000__000000914E3F38F0 000000067F00004002000100000000BDC000-000000067F00004002000100000000BE0000__000000931B9A2710 
000000067F00004002000100000000BE0000-000000067F00004002000100000000BE4000__00000073AD3FE6B8 000000067F00004002000100000000BE0000-000000067F00004002000100000000BE4000__000000914E3F38F0 000000067F00004002000100000000BE0000-000000067F00004002000100000000BE4000__000000931B9A2710 000000067F00004002000100000000BE320C-000000067F00004002000100000000BEBBE5__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000BE4000-000000067F00004002000100000000BE8000__00000073AD3FE6B8 000000067F00004002000100000000BE4000-000000067F00004002000100000000BE8000__000000914E3F38F0 000000067F00004002000100000000BE4000-000000067F00004002000100000000BE8000__000000931B9A2710 000000067F00004002000100000000BE8000-000000067F00004002000100000000BEC000__00000073AD3FE6B8 000000067F00004002000100000000BE8000-000000067F00004002000100000000BEC000__000000914E3F38F0 000000067F00004002000100000000BE8000-000000067F00004002000100000000BEC000__000000931B9A2710 000000067F00004002000100000000BEBBE5-000000067F00004002000100000000BF45C3__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000BEC000-000000067F00004002000100000000BF0000__00000073AD3FE6B8 000000067F00004002000100000000BEC000-000000067F00004002000100000000BF0000__000000914E3F38F0 000000067F00004002000100000000BEC000-000000067F00004002000100000000BF0000__000000931B9A2710 000000067F00004002000100000000BF0000-000000067F00004002000100000000BF4000__00000073AD3FE6B8 000000067F00004002000100000000BF0000-000000067F00004002000100000000BF4000__000000914E3F38F0 000000067F00004002000100000000BF0000-000000067F00004002000100000000BF4000__000000931B9A2710 000000067F00004002000100000000BF4000-000000067F00004002000100000000BF8000__00000073AD3FE6B8 000000067F00004002000100000000BF4000-000000067F00004002000100000000BF8000__000000914E3F38F0 000000067F00004002000100000000BF4000-000000067F00004002000100000000BF8000__000000931B9A2710 000000067F00004002000100000000BF45C3-000000067F00004002000100000000BFCF9A__0000005CA7BBD6F9-000000739A8D1299 
000000067F00004002000100000000BF8000-000000067F00004002000100000000BFC000__00000073AD3FE6B8 000000067F00004002000100000000BF8000-000000067F00004002000100000000BFC000__000000914E3F38F0 000000067F00004002000100000000BF8000-000000067F00004002000100000000BFC000__000000931B9A2710 000000067F00004002000100000000BFC000-000000067F00004002000100000000C00000__00000073AD3FE6B8 000000067F00004002000100000000BFC000-000000067F00004002000100000000C00000__000000914E3F38F0 000000067F00004002000100000000BFC000-000000067F00004002000100000000C00000__000000931B9A2710 000000067F00004002000100000000BFCF9A-000000067F00004002000100000000C0597F__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C00000-000000067F00004002000100000000C04000__00000073AD3FE6B8 000000067F00004002000100000000C00000-000000067F00004002000100000000C04000__000000914E3F38F0 000000067F00004002000100000000C00000-000000067F00004002000100000000C04000__000000931B9A2710 000000067F00004002000100000000C04000-000000067F00004002000100000000C08000__00000073AD3FE6B8 000000067F00004002000100000000C04000-000000067F00004002000100000000C08000__000000914E3F38F0 000000067F00004002000100000000C04000-000000067F00004002000100000000C08000__000000931B9A2710 000000067F00004002000100000000C0597F-000000067F00004002000100000000C0E366__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C08000-000000067F00004002000100000000C0C000__00000073AD3FE6B8 000000067F00004002000100000000C08000-000000067F00004002000100000000C0C000__000000914E3F38F0 000000067F00004002000100000000C08000-000000067F00004002000100000000C0C000__000000931B9A2710 000000067F00004002000100000000C0C000-000000067F00004002000100000000C10000__00000073AD3FE6B8 000000067F00004002000100000000C0C000-000000067F00004002000100000000C10000__000000914E3F38F0 000000067F00004002000100000000C0C000-000000067F00004002000100000000C10000__000000931B9A2710 000000067F00004002000100000000C0E366-000000067F00004002000100000000C16D38__0000005CA7BBD6F9-000000739A8D1299 
000000067F00004002000100000000C10000-000000067F00004002000100000000C14000__00000073AD3FE6B8 000000067F00004002000100000000C10000-000000067F00004002000100000000C14000__000000914E3F38F0 000000067F00004002000100000000C10000-000000067F00004002000100000000C14000__000000931B9A2710 000000067F00004002000100000000C14000-000000067F00004002000100000000C18000__00000073AD3FE6B8 000000067F00004002000100000000C14000-000000067F00004002000100000000C18000__000000914E3F38F0 000000067F00004002000100000000C14000-000000067F00004002000100000000C18000__000000931B9A2710 000000067F00004002000100000000C16D38-000000067F00004002000100000000C1F70B__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C18000-000000067F00004002000100000000C1C000__00000073AD3FE6B8 000000067F00004002000100000000C18000-000000067F00004002000100000000C1C000__000000914E3F38F0 000000067F00004002000100000000C18000-000000067F00004002000100000000C1C000__000000931B9A2710 000000067F00004002000100000000C1C000-000000067F00004002000100000000C20000__00000073AD3FE6B8 000000067F00004002000100000000C1C000-000000067F00004002000100000000C20000__000000914E3F38F0 000000067F00004002000100000000C1C000-000000067F00004002000100000000C20000__000000931B9A2710 000000067F00004002000100000000C1F70B-000000067F00004002000100000000C280E6__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C20000-000000067F00004002000100000000C24000__00000073AD3FE6B8 000000067F00004002000100000000C20000-000000067F00004002000100000000C24000__000000914E3F38F0 000000067F00004002000100000000C20000-000000067F00004002000100000000C24000__000000931B9A2710 000000067F00004002000100000000C24000-000000067F00004002000100000000C28000__00000073AD3FE6B8 000000067F00004002000100000000C24000-000000067F00004002000100000000C28000__000000914E3F38F0 000000067F00004002000100000000C24000-000000067F00004002000100000000C28000__000000931B9A2710 000000067F00004002000100000000C28000-000000067F00004002000100000000C2C000__00000073AD3FE6B8 
000000067F00004002000100000000C28000-000000067F00004002000100000000C2C000__000000914E3F38F0 000000067F00004002000100000000C28000-000000067F00004002000100000000C2C000__000000931B9A2710 000000067F00004002000100000000C280E6-000000067F00004002000100000000C30AC6__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C2C000-000000067F00004002000100000000C30000__00000073AD3FE6B8 000000067F00004002000100000000C2C000-000000067F00004002000100000000C30000__000000914E3F38F0 000000067F00004002000100000000C2C000-000000067F00004002000100000000C30000__000000931B9A2710 000000067F00004002000100000000C30000-000000067F00004002000100000000C34000__00000073AD3FE6B8 000000067F00004002000100000000C30000-000000067F00004002000100000000C34000__000000914E3F38F0 000000067F00004002000100000000C30000-000000067F00004002000100000000C34000__000000931B9A2710 000000067F00004002000100000000C30AC6-000000067F00004002000100000000C394A4__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C34000-000000067F00004002000100000000C38000__00000073AD3FE6B8 000000067F00004002000100000000C34000-000000067F00004002000100000000C38000__000000914E3F38F0 000000067F00004002000100000000C34000-000000067F00004002000100000000C38000__000000931B9A2710 000000067F00004002000100000000C38000-000000067F00004002000100000000C3C000__00000073AD3FE6B8 000000067F00004002000100000000C38000-000000067F00004002000100000000C3C000__000000914E3F38F0 000000067F00004002000100000000C38000-000000067F00004002000100000000C3C000__000000931B9A2710 000000067F00004002000100000000C394A4-000000067F00004002000100000000C41E88__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C3C000-000000067F00004002000100000000C40000__00000073AD3FE6B8 000000067F00004002000100000000C3C000-000000067F00004002000100000000C40000__000000914E3F38F0 000000067F00004002000100000000C3C000-000000067F00004002000100000000C40000__000000931B9A2710 000000067F00004002000100000000C40000-000000067F00004002000100000000C44000__00000073AD3FE6B8 
000000067F00004002000100000000C40000-000000067F00004002000100000000C44000__000000914E3F38F0 000000067F00004002000100000000C40000-000000067F00004002000100000000C44000__000000931B9A2710 000000067F00004002000100000000C41E88-000000067F00004002000100000000C4A868__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C44000-000000067F00004002000100000000C48000__00000073AD3FE6B8 000000067F00004002000100000000C44000-000000067F00004002000100000000C48000__000000914E3F38F0 000000067F00004002000100000000C44000-000000067F00004002000100000000C48000__000000931B9A2710 000000067F00004002000100000000C48000-000000067F00004002000100000000C4C000__00000073AD3FE6B8 000000067F00004002000100000000C48000-000000067F00004002000100000000C4C000__000000914E3F38F0 000000067F00004002000100000000C48000-000000067F00004002000100000000C4C000__000000931B9A2710 000000067F00004002000100000000C4A868-000000067F00004002000100000000C53243__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C4C000-000000067F00004002000100000000C50000__00000073AD3FE6B8 000000067F00004002000100000000C4C000-000000067F00004002000100000000C50000__000000914E3F38F0 000000067F00004002000100000000C4C000-000000067F00004002000100000000C50000__000000931B9A2710 000000067F00004002000100000000C50000-000000067F00004002000100000000C54000__00000073AD3FE6B8 000000067F00004002000100000000C50000-000000067F00004002000100000000C54000__000000914E3F38F0 000000067F00004002000100000000C50000-000000067F00004002000100000000C54000__000000931B9A2710 000000067F00004002000100000000C53243-000000067F00004002000100000000C5BC12__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C54000-000000067F00004002000100000000C58000__00000073AD3FE6B8 000000067F00004002000100000000C54000-000000067F00004002000100000000C58000__000000914E3F38F0 000000067F00004002000100000000C54000-000000067F00004002000100000000C58000__000000931B9A2710 000000067F00004002000100000000C58000-000000067F00004002000100000000C5C000__00000073AD3FE6B8 
000000067F00004002000100000000C58000-000000067F00004002000100000000C5C000__000000914E3F38F0 000000067F00004002000100000000C58000-000000067F00004002000100000000C5C000__000000931B9A2710 000000067F00004002000100000000C5BC12-000000067F00004002000100000000C645E7__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C5C000-000000067F00004002000100000000C60000__00000073AD3FE6B8 000000067F00004002000100000000C5C000-000000067F00004002000100000000C60000__000000914E3F38F0 000000067F00004002000100000000C5C000-000000067F00004002000100000000C60000__000000931B9A2710 000000067F00004002000100000000C60000-000000067F00004002000100000000C64000__00000073AD3FE6B8 000000067F00004002000100000000C60000-000000067F00004002000100000000C64000__000000914E3F38F0 000000067F00004002000100000000C60000-000000067F00004002000100000000C64000__000000931B9A2710 000000067F00004002000100000000C64000-000000067F00004002000100000000C68000__00000073AD3FE6B8 000000067F00004002000100000000C64000-000000067F00004002000100000000C68000__000000914E3F38F0 000000067F00004002000100000000C64000-000000067F00004002000100000000C68000__000000931B9A2710 000000067F00004002000100000000C645E7-000000067F00004002000100000000C6CFCD__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C68000-000000067F00004002000100000000C6C000__00000073AD3FE6B8 000000067F00004002000100000000C68000-000000067F00004002000100000000C6C000__000000914E3F38F0 000000067F00004002000100000000C68000-000000067F00004002000100000000C6C000__000000931B9A2710 000000067F00004002000100000000C6C000-000000067F00004002000100000000C70000__00000073AD3FE6B8 000000067F00004002000100000000C6C000-000000067F00004002000100000000C70000__000000914E3F38F0 000000067F00004002000100000000C6C000-000000067F00004002000100000000C70000__000000931B9A2710 000000067F00004002000100000000C6CFCD-000000067F00004002000100000000C759AB__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C70000-000000067F00004002000100000000C74000__00000073AD3FE6B8 
000000067F00004002000100000000C70000-000000067F00004002000100000000C74000__000000914E3F38F0 000000067F00004002000100000000C70000-000000067F00004002000100000000C74000__000000931B9A2710 000000067F00004002000100000000C74000-000000067F00004002000100000000C78000__00000073AD3FE6B8 000000067F00004002000100000000C74000-000000067F00004002000100000000C78000__000000914E3F38F0 000000067F00004002000100000000C74000-000000067F00004002000100000000C78000__000000931B9A2710 000000067F00004002000100000000C759AB-000000067F00004002000100000000C7E38B__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C78000-000000067F00004002000100000000C7C000__00000073AD3FE6B8 000000067F00004002000100000000C78000-000000067F00004002000100000000C7C000__000000914E3F38F0 000000067F00004002000100000000C78000-000000067F00004002000100000000C7C000__000000931B9A2710 000000067F00004002000100000000C7C000-000000067F00004002000100000000C80000__00000073AD3FE6B8 000000067F00004002000100000000C7C000-000000067F00004002000100000000C80000__000000914E3F38F0 000000067F00004002000100000000C7C000-000000067F00004002000100000000C80000__000000931B9A2710 000000067F00004002000100000000C7E38B-000000067F00004002000100000000C86D65__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C80000-000000067F00004002000100000000C84000__00000073AD3FE6B8 000000067F00004002000100000000C80000-000000067F00004002000100000000C84000__000000914E3F38F0 000000067F00004002000100000000C80000-000000067F00004002000100000000C84000__000000931B9A2710 000000067F00004002000100000000C84000-000000067F00004002000100000000C88000__00000073AD3FE6B8 000000067F00004002000100000000C84000-000000067F00004002000100000000C88000__000000914E3F38F0 000000067F00004002000100000000C84000-000000067F00004002000100000000C88000__000000931B9A2710 000000067F00004002000100000000C86D65-000000067F00004002000100000000C8F758__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C88000-000000067F00004002000100000000C8C000__00000073AD3FE6B8 
000000067F00004002000100000000C88000-000000067F00004002000100000000C8C000__000000914E3F38F0 000000067F00004002000100000000C88000-000000067F00004002000100000000C8C000__000000931B9A2710 000000067F00004002000100000000C8C000-000000067F00004002000100000000C90000__00000073AD3FE6B8 000000067F00004002000100000000C8C000-000000067F00004002000100000000C90000__000000914E3F38F0 000000067F00004002000100000000C8C000-000000067F00004002000100000000C90000__000000931B9A2710 000000067F00004002000100000000C8F758-000000067F00004002000100000000C98142__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C90000-000000067F00004002000100000000C94000__00000073AD3FE6B8 000000067F00004002000100000000C90000-000000067F00004002000100000000C94000__000000914E3F38F0 000000067F00004002000100000000C90000-000000067F00004002000100000000C94000__000000931B9A2710 000000067F00004002000100000000C94000-000000067F00004002000100000000C98000__00000073AD3FE6B8 000000067F00004002000100000000C94000-000000067F00004002000100000000C98000__000000914E3F38F0 000000067F00004002000100000000C94000-000000067F00004002000100000000C98000__000000931B9A2710 000000067F00004002000100000000C98000-000000067F00004002000100000000C9C000__00000073AD3FE6B8 000000067F00004002000100000000C98000-000000067F00004002000100000000C9C000__000000914E3F38F0 000000067F00004002000100000000C98000-000000067F00004002000100000000C9C000__000000931B9A2710 000000067F00004002000100000000C98142-000000067F00004002000100000000CA0B11__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000C9C000-000000067F00004002000100000000CA0000__00000073AD3FE6B8 000000067F00004002000100000000C9C000-000000067F00004002000100000000CA0000__000000914E3F38F0 000000067F00004002000100000000C9C000-000000067F00004002000100000000CA0000__000000931B9A2710 000000067F00004002000100000000CA0000-000000067F00004002000100000000CA4000__00000073AD3FE6B8 000000067F00004002000100000000CA0000-000000067F00004002000100000000CA4000__000000914E3F38F0 
000000067F00004002000100000000CA0000-000000067F00004002000100000000CA4000__000000931B9A2710 000000067F00004002000100000000CA0B11-000000067F00004002000100000000CA94E7__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000CA4000-000000067F00004002000100000000CA8000__00000073AD3FE6B8 000000067F00004002000100000000CA4000-000000067F00004002000100000000CA8000__000000914E3F38F0 000000067F00004002000100000000CA4000-000000067F00004002000100000000CA8000__000000931B9A2710 000000067F00004002000100000000CA8000-000000067F00004002000100000000CAC000__00000073AD3FE6B8 000000067F00004002000100000000CA8000-000000067F00004002000100000000CAC000__000000914E3F38F0 000000067F00004002000100000000CA8000-000000067F00004002000100000000CAC000__000000931B9A2710 000000067F00004002000100000000CA94E7-000000067F00004002000100000000CB1EC7__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000CAC000-000000067F00004002000100000000CB0000__00000073AD3FE6B8 000000067F00004002000100000000CAC000-000000067F00004002000100000000CB0000__000000914E3F38F0 000000067F00004002000100000000CAC000-000000067F00004002000100000000CB0000__000000931B9A2710 000000067F00004002000100000000CB0000-000000067F00004002000100000000CB4000__00000073AD3FE6B8 000000067F00004002000100000000CB0000-000000067F00004002000100000000CB4000__000000914E3F38F0 000000067F00004002000100000000CB0000-000000067F00004002000100000000CB4000__000000931B9A2710 000000067F00004002000100000000CB1EC7-000000067F00004002000100000000CBA8AE__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000CB4000-000000067F00004002000100000000CB8000__00000073AD3FE6B8 000000067F00004002000100000000CB4000-000000067F00004002000100000000CB8000__000000914E3F38F0 000000067F00004002000100000000CB4000-000000067F00004002000100000000CB8000__000000931B9A2710 000000067F00004002000100000000CB8000-000000067F00004002000100000000CBC000__00000073AD3FE6B8 000000067F00004002000100000000CB8000-000000067F00004002000100000000CBC000__000000914E3F38F0 
000000067F00004002000100000000CB8000-000000067F00004002000100000000CBC000__000000931B9A2710 000000067F00004002000100000000CBA8AE-000000067F00004002000100000000CC3288__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000CBC000-000000067F00004002000100000000CC0000__00000073AD3FE6B8 000000067F00004002000100000000CBC000-000000067F00004002000100000000CC0000__000000914E3F38F0 000000067F00004002000100000000CBC000-000000067F00004002000100000000CC0000__000000931B9A2710 000000067F00004002000100000000CC0000-000000067F00004002000100000000CC4000__00000073AD3FE6B8 000000067F00004002000100000000CC0000-000000067F00004002000100000000CC4000__000000914E3F38F0 000000067F00004002000100000000CC0000-000000067F00004002000100000000CC4000__000000931B9A2710 000000067F00004002000100000000CC3288-000000067F00004002000100000000CCBC6F__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000CC4000-000000067F00004002000100000000CC8000__00000073AD3FE6B8 000000067F00004002000100000000CC4000-000000067F00004002000100000000CC8000__000000914E3F38F0 000000067F00004002000100000000CC4000-000000067F00004002000100000000CC8000__000000931B9A2710 000000067F00004002000100000000CC8000-000000067F00004002000100000000CCC000__00000073AD3FE6B8 000000067F00004002000100000000CC8000-000000067F00004002000100000000CCC000__000000914E3F38F0 000000067F00004002000100000000CC8000-000000067F00004002000100000000CCC000__000000931B9A2710 000000067F00004002000100000000CCBC6F-000000067F00004002000100000000CD4644__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000CCC000-000000067F00004002000100000000CD0000__00000073AD3FE6B8 000000067F00004002000100000000CCC000-000000067F00004002000100000000CD0000__000000914E3F38F0 000000067F00004002000100000000CCC000-000000067F00004002000100000000CD0000__000000931B9A2710 000000067F00004002000100000000CD0000-000000067F00004002000100000000CD4000__00000073AD3FE6B8 000000067F00004002000100000000CD0000-000000067F00004002000100000000CD4000__000000914E3F38F0 
000000067F00004002000100000000CD0000-000000067F00004002000100000000CD4000__000000931B9A2710 000000067F00004002000100000000CD4000-000000067F00004002000100000000CD8000__00000073AD3FE6B8 000000067F00004002000100000000CD4000-000000067F00004002000100000000CD8000__000000914E3F38F0 000000067F00004002000100000000CD4000-000000067F00004002000100000000CD8000__000000931B9A2710 000000067F00004002000100000000CD4644-000000067F00004002000100000000CDD014__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000CD8000-000000067F00004002000100000000CDC000__00000073AD3FE6B8 000000067F00004002000100000000CD8000-000000067F00004002000100000000CDC000__000000914E3F38F0 000000067F00004002000100000000CD8000-000000067F00004002000100000000CDC000__000000931B9A2710 000000067F00004002000100000000CDC000-000000067F00004002000100000000CE0000__00000073AD3FE6B8 000000067F00004002000100000000CDC000-000000067F00004002000100000000CE0000__000000914E3F38F0 000000067F00004002000100000000CDC000-000000067F00004002000100000000CE0000__000000931B9A2710 000000067F00004002000100000000CDD014-000000067F00004002000100000000CE59EF__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000CE0000-000000067F00004002000100000000CE4000__00000073AD3FE6B8 000000067F00004002000100000000CE0000-000000067F00004002000100000000CE4000__000000914E3F38F0 000000067F00004002000100000000CE0000-000000067F00004002000100000000CE4000__000000931B9A2710 000000067F00004002000100000000CE4000-000000067F00004002000100000000CE8000__00000073AD3FE6B8 000000067F00004002000100000000CE4000-000000067F00004002000100000000CE8000__000000914E3F38F0 000000067F00004002000100000000CE4000-000000067F00004002000100000000CE8000__000000931B9A2710 000000067F00004002000100000000CE59EF-000000067F00004002000100000000CEE3D4__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000CE8000-000000067F00004002000100000000CEC000__00000073AD3FE6B8 000000067F00004002000100000000CE8000-000000067F00004002000100000000CEC000__000000914E3F38F0 
000000067F00004002000100000000CE8000-000000067F00004002000100000000CEC000__000000931B9A2710 000000067F00004002000100000000CEC000-000000067F00004002000100000000CF0000__00000073AD3FE6B8 000000067F00004002000100000000CEC000-000000067F00004002000100000000CF0000__000000914E3F38F0 000000067F00004002000100000000CEC000-000000067F00004002000100000000CF0000__000000931B9A2710 000000067F00004002000100000000CEE3D4-000000067F00004002000100000000CF6DB9__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000CF0000-000000067F00004002000100000000CF4000__00000073AD3FE6B8 000000067F00004002000100000000CF0000-000000067F00004002000100000000CF4000__000000914E3F38F0 000000067F00004002000100000000CF0000-000000067F00004002000100000000CF4000__000000931B9A2710 000000067F00004002000100000000CF4000-000000067F00004002000100000000CF8000__00000073AD3FE6B8 000000067F00004002000100000000CF4000-000000067F00004002000100000000CF8000__000000914E3F38F0 000000067F00004002000100000000CF4000-000000067F00004002000100000000CF8000__000000931B9A2710 000000067F00004002000100000000CF6DB9-000000067F00004002000100000000CFF798__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000CF8000-000000067F00004002000100000000CFC000__00000073AD3FE6B8 000000067F00004002000100000000CF8000-000000067F00004002000100000000CFC000__000000914E3F38F0 000000067F00004002000100000000CF8000-000000067F00004002000100000000CFC000__000000931B9A2710 000000067F00004002000100000000CFC000-000000067F00004002000100000000D00000__00000073AD3FE6B8 000000067F00004002000100000000CFC000-000000067F00004002000100000000D00000__000000914E3F38F0 000000067F00004002000100000000CFC000-000000067F00004002000100000000D00000__000000931B9A2710 000000067F00004002000100000000CFF798-000000067F00004002000100000000D08175__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000D00000-000000067F00004002000100000000D04000__00000073AD3FE6B8 000000067F00004002000100000000D00000-000000067F00004002000100000000D04000__000000914E3F38F0 
000000067F00004002000100000000D00000-000000067F00004002000100000000D04000__000000931B9A2710 000000067F00004002000100000000D04000-000000067F00004002000100000000D08000__00000073AD3FE6B8 000000067F00004002000100000000D04000-000000067F00004002000100000000D08000__000000914E3F38F0 000000067F00004002000100000000D04000-000000067F00004002000100000000D08000__000000931B9A2710 000000067F00004002000100000000D08000-000000067F00004002000100000000D0C000__00000073AD3FE6B8 000000067F00004002000100000000D08000-000000067F00004002000100000000D0C000__000000914E3F38F0 000000067F00004002000100000000D08000-000000067F00004002000100000000D0C000__000000931B9A2710 000000067F00004002000100000000D08175-000000067F00004002000100000000D10B4D__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000D0C000-000000067F00004002000100000000D10000__00000073AD3FE6B8 000000067F00004002000100000000D0C000-000000067F00004002000100000000D10000__000000914E3F38F0 000000067F00004002000100000000D0C000-000000067F00004002000100000000D10000__000000931B9A2710 000000067F00004002000100000000D10000-000000067F00004002000100000000D14000__00000073AD3FE6B8 000000067F00004002000100000000D10000-000000067F00004002000100000000D14000__000000914E3F38F0 000000067F00004002000100000000D10000-000000067F00004002000100000000D14000__000000931B9A2710 000000067F00004002000100000000D10B4D-000000067F00004002000100000000D19528__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000D14000-000000067F00004002000100000000D18000__00000073AD3FE6B8 000000067F00004002000100000000D14000-000000067F00004002000100000000D18000__000000914E3F38F0 000000067F00004002000100000000D14000-000000067F00004002000100000000D18000__000000931B9A2710 000000067F00004002000100000000D18000-000000067F00004002000100000000D1C000__00000073AD3FE6B8 000000067F00004002000100000000D18000-000000067F00004002000100000000D1C000__000000914E3F38F0 000000067F00004002000100000000D18000-000000067F00004002000100000000D1C000__000000931B9A2710 
000000067F00004002000100000000D19528-000000067F00004002000100000000D21EFC__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000D1C000-000000067F00004002000100000000D20000__00000073AD3FE6B8 000000067F00004002000100000000D1C000-000000067F00004002000100000000D20000__000000914E3F38F0 000000067F00004002000100000000D1C000-000000067F00004002000100000000D20000__000000931B9A2710 000000067F00004002000100000000D20000-000000067F00004002000100000000D24000__00000073AD3FE6B8 000000067F00004002000100000000D20000-000000067F00004002000100000000D24000__000000914E3F38F0 000000067F00004002000100000000D20000-000000067F00004002000100000000D24000__000000931B9A2710 000000067F00004002000100000000D21EFC-000000067F00004002000100000000D2A8DC__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000D24000-000000067F00004002000100000000D28000__00000073AD3FE6B8 000000067F00004002000100000000D24000-000000067F00004002000100000000D28000__000000914E3F38F0 000000067F00004002000100000000D24000-000000067F00004002000100000000D28000__000000931B9A2710 000000067F00004002000100000000D28000-000000067F00004002000100000000D2C000__00000073AD3FE6B8 000000067F00004002000100000000D28000-000000067F00004002000100000000D2C000__000000914E3F38F0 000000067F00004002000100000000D28000-000000067F00004002000100000000D2C000__000000931B9A2710 000000067F00004002000100000000D2A8DC-000000067F00004002000100000000D332BD__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000D2C000-000000067F00004002000100000000D30000__00000073AD3FE6B8 000000067F00004002000100000000D2C000-000000067F00004002000100000000D30000__000000914E3F38F0 000000067F00004002000100000000D2C000-000000067F00004002000100000000D30000__000000931B9A2710 000000067F00004002000100000000D30000-000000067F00004002000100000000D34000__00000073AD3FE6B8 000000067F00004002000100000000D30000-000000067F00004002000100000000D34000__000000914E3F38F0 000000067F00004002000100000000D30000-000000067F00004002000100000000D34000__000000931B9A2710 
000000067F00004002000100000000D332BD-000000067F00004002000100000000D3BC9F__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000D34000-000000067F00004002000100000000D38000__00000073AD3FE6B8 000000067F00004002000100000000D34000-000000067F00004002000100000000D38000__000000914E3F38F0 000000067F00004002000100000000D34000-000000067F00004002000100000000D38000__000000931B9A2710 000000067F00004002000100000000D38000-000000067F00004002000100000000D3C000__00000073AD3FE6B8 000000067F00004002000100000000D38000-000000067F00004002000100000000D3C000__000000914E3F38F0 000000067F00004002000100000000D38000-000000067F00004002000100000000D3C000__000000931B9A2710 000000067F00004002000100000000D3BC9F-000000067F00004002000100000000D4467B__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000D3C000-000000067F00004002000100000000D40000__00000073AD3FE6B8 000000067F00004002000100000000D3C000-000000067F00004002000100000000D40000__000000914E3F38F0 000000067F00004002000100000000D3C000-000000067F00004002000100000000D40000__000000931B9A2710 000000067F00004002000100000000D40000-000000067F00004002000100000000D44000__00000073AD3FE6B8 000000067F00004002000100000000D40000-000000067F00004002000100000000D44000__000000914E3F38F0 000000067F00004002000100000000D40000-000000067F00004002000100000000D44000__000000931B9A2710 000000067F00004002000100000000D44000-000000067F00004002000100000000D48000__00000073AD3FE6B8 000000067F00004002000100000000D44000-000000067F00004002000100000000D48000__000000914E3F38F0 000000067F00004002000100000000D44000-000000067F00004002000100000000D48000__000000931B9A2710 000000067F00004002000100000000D4467B-000000067F00004002000100000000D4D058__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000D48000-000000067F00004002000100000000D4C000__00000073AD3FE6B8 000000067F00004002000100000000D48000-000000067F00004002000100000000D4C000__000000914E3F38F0 000000067F00004002000100000000D48000-000000067F00004002000100000000D4C000__000000931B9A2710 
000000067F00004002000100000000D4C000-000000067F00004002000100000000D50000__00000073AD3FE6B8 000000067F00004002000100000000D4C000-000000067F00004002000100000000D50000__000000914E3F38F0 000000067F00004002000100000000D4C000-000000067F00004002000100000000D50000__000000931B9A2710 000000067F00004002000100000000D4D058-000000067F00004002000100000000D55A2B__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000D50000-000000067F00004002000100000000D54000__00000073AD3FE6B8 000000067F00004002000100000000D50000-000000067F00004002000100000000D54000__000000914E3F38F0 000000067F00004002000100000000D50000-000000067F00004002000100000000D54000__000000931B9A2710 000000067F00004002000100000000D54000-000000067F00004002000100000000D58000__00000073AD3FE6B8 000000067F00004002000100000000D54000-000000067F00004002000100000000D58000__000000914E3F38F0 000000067F00004002000100000000D54000-000000067F00004002000100000000D58000__000000931B9A2710 000000067F00004002000100000000D55A2B-000000067F00004002000100000000D5E400__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000D58000-000000067F00004002000100000000D5C000__00000073AD3FE6B8 000000067F00004002000100000000D58000-000000067F00004002000100000000D5C000__000000914E3F38F0 000000067F00004002000100000000D58000-000000067F00004002000100000000D5C000__000000931B9A2710 000000067F00004002000100000000D5C000-000000067F00004002000100000000D60000__00000073AD3FE6B8 000000067F00004002000100000000D5C000-000000067F00004002000100000000D60000__000000914E3F38F0 000000067F00004002000100000000D5C000-000000067F00004002000100000000D60000__000000931B9A2710 000000067F00004002000100000000D5E400-000000067F00004002000100000000D66DD2__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000D60000-000000067F00004002000100000000D64000__00000073AD3FE6B8 000000067F00004002000100000000D60000-000000067F00004002000100000000D64000__000000914E3F38F0 000000067F00004002000100000000D60000-000000067F00004002000100000000D64000__000000931B9A2710 
000000067F00004002000100000000D64000-000000067F00004002000100000000D68000__00000073AD3FE6B8 000000067F00004002000100000000D64000-000000067F00004002000100000000D68000__000000914E3F38F0 000000067F00004002000100000000D64000-000000067F00004002000100000000D68000__000000931B9A2710 000000067F00004002000100000000D66DD2-000000067F00004002000100000000D6F7B8__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000D68000-000000067F00004002000100000000D6C000__00000073AD3FE6B8 000000067F00004002000100000000D68000-000000067F00004002000100000000D6C000__000000914E3F38F0 000000067F00004002000100000000D68000-000000067F00004002000100000000D6C000__000000931B9A2710 000000067F00004002000100000000D6C000-000000067F00004002000100000000D70000__00000073AD3FE6B8 000000067F00004002000100000000D6C000-000000067F00004002000100000000D70000__000000914E3F38F0 000000067F00004002000100000000D6C000-000000067F00004002000100000000D70000__000000931B9A2710 000000067F00004002000100000000D6F7B8-000000067F00004002000100000000D7819E__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000D70000-000000067F00004002000100000000D74000__00000073AD3FE6B8 000000067F00004002000100000000D70000-000000067F00004002000100000000D74000__000000914E3F38F0 000000067F00004002000100000000D70000-000000067F00004002000100000000D74000__000000931B9A2710 000000067F00004002000100000000D74000-000000067F00004002000100000000D78000__00000073AD3FE6B8 000000067F00004002000100000000D74000-000000067F00004002000100000000D78000__000000914E3F38F0 000000067F00004002000100000000D74000-000000067F00004002000100000000D78000__000000931B9A2710 000000067F00004002000100000000D78000-000000067F00004002000100000000D7C000__00000073AD3FE6B8 000000067F00004002000100000000D78000-000000067F00004002000100000000D7C000__000000914E3F38F0 000000067F00004002000100000000D78000-000000067F00004002000100000000D7C000__000000931B9A2710 000000067F00004002000100000000D7819E-000000067F00004002000100000000D80B7F__0000005CA7BBD6F9-000000739A8D1299 
000000067F00004002000100000000D7C000-000000067F00004002000100000000D80000__00000073AD3FE6B8 000000067F00004002000100000000D7C000-000000067F00004002000100000000D80000__000000914E3F38F0 000000067F00004002000100000000D7C000-000000067F00004002000100000000D80000__000000931B9A2710 000000067F00004002000100000000D80000-000000067F00004002000100000000D84000__00000073AD3FE6B8 000000067F00004002000100000000D80000-000000067F00004002000100000000D84000__000000914E3F38F0 000000067F00004002000100000000D80000-000000067F00004002000100000000D84000__000000931B9A2710 000000067F00004002000100000000D80B7F-000000067F00004002000100000000D89552__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000D84000-000000067F00004002000100000000D88000__00000073AD3FE6B8 000000067F00004002000100000000D84000-000000067F00004002000100000000D88000__000000914E3F38F0 000000067F00004002000100000000D84000-000000067F00004002000100000000D88000__000000931B9A2710 000000067F00004002000100000000D88000-000000067F00004002000100000000D8C000__00000073AD3FE6B8 000000067F00004002000100000000D88000-000000067F00004002000100000000D8C000__000000914E3F38F0 000000067F00004002000100000000D88000-000000067F00004002000100000000D8C000__000000931B9A2710 000000067F00004002000100000000D89552-000000067F00004002000100000000D91F30__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000D8C000-000000067F00004002000100000000D90000__00000073AD3FE6B8 000000067F00004002000100000000D8C000-000000067F00004002000100000000D90000__000000914E3F38F0 000000067F00004002000100000000D8C000-000000067F00004002000100000000D90000__000000931B9A2710 000000067F00004002000100000000D90000-000000067F00004002000100000000D94000__00000073AD3FE6B8 000000067F00004002000100000000D90000-000000067F00004002000100000000D94000__000000914E3F38F0 000000067F00004002000100000000D90000-000000067F00004002000100000000D94000__000000931B9A2710 000000067F00004002000100000000D91F30-000000067F00004002000100000000D9A901__0000005CA7BBD6F9-000000739A8D1299 
000000067F00004002000100000000D94000-000000067F00004002000100000000D98000__00000073AD3FE6B8 000000067F00004002000100000000D94000-000000067F00004002000100000000D98000__000000914E3F38F0 000000067F00004002000100000000D94000-000000067F00004002000100000000D98000__000000931B9A2710 000000067F00004002000100000000D98000-000000067F00004002000100000000D9C000__00000073AD3FE6B8 000000067F00004002000100000000D98000-000000067F00004002000100000000D9C000__000000914E3F38F0 000000067F00004002000100000000D98000-000000067F00004002000100000000D9C000__000000931B9A2710 000000067F00004002000100000000D9A901-000000067F00004002000100000000DA32CC__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000D9C000-000000067F00004002000100000000DA0000__00000073AD3FE6B8 000000067F00004002000100000000D9C000-000000067F00004002000100000000DA0000__000000914E3F38F0 000000067F00004002000100000000D9C000-000000067F00004002000100000000DA0000__000000931B9A2710 000000067F00004002000100000000DA0000-000000067F00004002000100000000DA4000__00000073AD3FE6B8 000000067F00004002000100000000DA0000-000000067F00004002000100000000DA4000__000000914E3F38F0 000000067F00004002000100000000DA0000-000000067F00004002000100000000DA4000__000000931B9A2710 000000067F00004002000100000000DA32CC-000000067F00004002000100000000DABCB3__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000DA4000-000000067F00004002000100000000DA8000__00000073AD3FE6B8 000000067F00004002000100000000DA4000-000000067F00004002000100000000DA8000__000000914E3F38F0 000000067F00004002000100000000DA4000-000000067F00004002000100000000DA8000__000000931B9A2710 000000067F00004002000100000000DA8000-000000067F00004002000100000000DAC000__00000073AD3FE6B8 000000067F00004002000100000000DA8000-000000067F00004002000100000000DAC000__000000914E3F38F0 000000067F00004002000100000000DA8000-000000067F00004002000100000000DAC000__000000931B9A2710 000000067F00004002000100000000DABCB3-000000067F00004002000100000000DB469A__0000005CA7BBD6F9-000000739A8D1299 
000000067F00004002000100000000DAC000-000000067F00004002000100000000DB0000__00000073AD3FE6B8 000000067F00004002000100000000DAC000-000000067F00004002000100000000DB0000__000000914E3F38F0 000000067F00004002000100000000DAC000-000000067F00004002000100000000DB0000__000000931B9A2710 000000067F00004002000100000000DB0000-000000067F00004002000100000000DB4000__00000073AD3FE6B8 000000067F00004002000100000000DB0000-000000067F00004002000100000000DB4000__000000914E3F38F0 000000067F00004002000100000000DB0000-000000067F00004002000100000000DB4000__000000931B9A2710 000000067F00004002000100000000DB4000-000000067F00004002000100000000DB8000__00000073AD3FE6B8 000000067F00004002000100000000DB4000-000000067F00004002000100000000DB8000__000000914E3F38F0 000000067F00004002000100000000DB4000-000000067F00004002000100000000DB8000__000000931B9A2710 000000067F00004002000100000000DB469A-000000067F00004002000100000000DBD075__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000DB8000-000000067F00004002000100000000DBC000__00000073AD3FE6B8 000000067F00004002000100000000DB8000-000000067F00004002000100000000DBC000__000000914E3F38F0 000000067F00004002000100000000DB8000-000000067F00004002000100000000DBC000__000000931B9A2710 000000067F00004002000100000000DBC000-000000067F00004002000100000000DC0000__00000073AD3FE6B8 000000067F00004002000100000000DBC000-000000067F00004002000100000000DC0000__000000914E3F38F0 000000067F00004002000100000000DBC000-000000067F00004002000100000000DC0000__000000931B9A2710 000000067F00004002000100000000DBD075-000000067F00004002000100000000DC5A50__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000DC0000-000000067F00004002000100000000DC4000__00000073AD3FE6B8 000000067F00004002000100000000DC0000-000000067F00004002000100000000DC4000__000000914E3F38F0 000000067F00004002000100000000DC0000-000000067F00004002000100000000DC4000__000000931B9A2710 000000067F00004002000100000000DC4000-000000067F00004002000100000000DC8000__00000073AD3FE6B8 
000000067F00004002000100000000DC4000-000000067F00004002000100000000DC8000__000000914E3F38F0 000000067F00004002000100000000DC4000-000000067F00004002000100000000DC8000__000000931B9A2710 000000067F00004002000100000000DC5A50-000000067F00004002000100000000DCE430__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000DC8000-000000067F00004002000100000000DCC000__00000073AD3FE6B8 000000067F00004002000100000000DC8000-000000067F00004002000100000000DCC000__000000914E3F38F0 000000067F00004002000100000000DC8000-000000067F00004002000100000000DCC000__000000931B9A2710 000000067F00004002000100000000DCC000-000000067F00004002000100000000DD0000__00000073AD3FE6B8 000000067F00004002000100000000DCC000-000000067F00004002000100000000DD0000__000000914E3F38F0 000000067F00004002000100000000DCC000-000000067F00004002000100000000DD0000__000000931B9A2710 000000067F00004002000100000000DCE430-000000067F00004002000100000000DD6E06__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000DD0000-000000067F00004002000100000000DD4000__00000073AD3FE6B8 000000067F00004002000100000000DD0000-000000067F00004002000100000000DD4000__000000914E3F38F0 000000067F00004002000100000000DD0000-000000067F00004002000100000000DD4000__000000931B9A2710 000000067F00004002000100000000DD4000-000000067F00004002000100000000DD8000__00000073AD3FE6B8 000000067F00004002000100000000DD4000-000000067F00004002000100000000DD8000__000000914E3F38F0 000000067F00004002000100000000DD4000-000000067F00004002000100000000DD8000__000000931B9A2710 000000067F00004002000100000000DD6E06-000000067F00004002000100000000DDF7DB__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000DD8000-000000067F00004002000100000000DDC000__00000073AD3FE6B8 000000067F00004002000100000000DD8000-000000067F00004002000100000000DDC000__000000914E3F38F0 000000067F00004002000100000000DD8000-000000067F00004002000100000000DDC000__000000931B9A2710 000000067F00004002000100000000DDC000-000000067F00004002000100000000DE0000__00000073AD3FE6B8 
000000067F00004002000100000000DDC000-000000067F00004002000100000000DE0000__000000914E3F38F0 000000067F00004002000100000000DDC000-000000067F00004002000100000000DE0000__000000931B9A2710 000000067F00004002000100000000DDF7DB-000000067F00004002000100000000DE81C3__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000DE0000-000000067F00004002000100000000DE4000__00000073AD3FE6B8 000000067F00004002000100000000DE0000-000000067F00004002000100000000DE4000__000000914E3F38F0 000000067F00004002000100000000DE0000-000000067F00004002000100000000DE4000__000000931B9A2710 000000067F00004002000100000000DE4000-000000067F00004002000100000000DE8000__00000073AD3FE6B8 000000067F00004002000100000000DE4000-000000067F00004002000100000000DE8000__000000914E3F38F0 000000067F00004002000100000000DE4000-000000067F00004002000100000000DE8000__000000931B9A2710 000000067F00004002000100000000DE8000-000000067F00004002000100000000DEC000__00000073AD3FE6B8 000000067F00004002000100000000DE8000-000000067F00004002000100000000DEC000__000000914E3F38F0 000000067F00004002000100000000DE8000-000000067F00004002000100000000DEC000__000000931B9A2710 000000067F00004002000100000000DE81C3-000000067F00004002000100000000DF0B9F__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000DEC000-000000067F00004002000100000000DF0000__00000073AD3FE6B8 000000067F00004002000100000000DEC000-000000067F00004002000100000000DF0000__000000914E3F38F0 000000067F00004002000100000000DEC000-000000067F00004002000100000000DF0000__000000931B9A2710 000000067F00004002000100000000DF0000-000000067F00004002000100000000DF4000__00000073AD3FE6B8 000000067F00004002000100000000DF0000-000000067F00004002000100000000DF4000__000000914E3F38F0 000000067F00004002000100000000DF0000-000000067F00004002000100000000DF4000__000000931B9A2710 000000067F00004002000100000000DF0B9F-000000067F00004002000100000000DF9582__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000DF4000-000000067F00004002000100000000DF8000__00000073AD3FE6B8 
000000067F00004002000100000000DF4000-000000067F00004002000100000000DF8000__000000914E3F38F0 000000067F00004002000100000000DF4000-000000067F00004002000100000000DF8000__000000931B9A2710 000000067F00004002000100000000DF8000-000000067F00004002000100000000DFC000__00000073AD3FE6B8 000000067F00004002000100000000DF8000-000000067F00004002000100000000DFC000__000000914E3F38F0 000000067F00004002000100000000DF8000-000000067F00004002000100000000DFC000__000000931B9A2710 000000067F00004002000100000000DF9582-000000067F00004002000100000000E01F62__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000DFC000-000000067F00004002000100000000E00000__00000073AD3FE6B8 000000067F00004002000100000000DFC000-000000067F00004002000100000000E00000__000000914E3F38F0 000000067F00004002000100000000DFC000-000000067F00004002000100000000E00000__000000931B9A2710 000000067F00004002000100000000E00000-000000067F00004002000100000000E04000__00000073AD3FE6B8 000000067F00004002000100000000E00000-000000067F00004002000100000000E04000__000000914E3F38F0 000000067F00004002000100000000E00000-000000067F00004002000100000000E04000__000000931B9A2710 000000067F00004002000100000000E01F62-000000067F00004002000100000000E0A930__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E04000-000000067F00004002000100000000E08000__00000073AD3FE6B8 000000067F00004002000100000000E04000-000000067F00004002000100000000E08000__000000914E3F38F0 000000067F00004002000100000000E04000-000000067F00004002000100000000E08000__000000931B9A2710 000000067F00004002000100000000E08000-000000067F00004002000100000000E0C000__00000073AD3FE6B8 000000067F00004002000100000000E08000-000000067F00004002000100000000E0C000__000000914E3F38F0 000000067F00004002000100000000E08000-000000067F00004002000100000000E0C000__000000931B9A2710 000000067F00004002000100000000E0A930-000000067F00004002000100000000E13305__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E0C000-000000067F00004002000100000000E10000__00000073AD3FE6B8 
000000067F00004002000100000000E0C000-000000067F00004002000100000000E10000__000000914E3F38F0 000000067F00004002000100000000E0C000-000000067F00004002000100000000E10000__000000931B9A2710 000000067F00004002000100000000E10000-000000067F00004002000100000000E14000__00000073AD3FE6B8 000000067F00004002000100000000E10000-000000067F00004002000100000000E14000__000000914E3F38F0 000000067F00004002000100000000E10000-000000067F00004002000100000000E14000__000000931B9A2710 000000067F00004002000100000000E13305-000000067F00004002000100000000E1BCDD__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E14000-000000067F00004002000100000000E18000__00000073AD3FE6B8 000000067F00004002000100000000E14000-000000067F00004002000100000000E18000__000000914E3F38F0 000000067F00004002000100000000E14000-000000067F00004002000100000000E18000__000000931B9A2710 000000067F00004002000100000000E18000-000000067F00004002000100000000E1C000__00000073AD3FE6B8 000000067F00004002000100000000E18000-000000067F00004002000100000000E1C000__000000914E3F38F0 000000067F00004002000100000000E18000-000000067F00004002000100000000E1C000__000000931B9A2710 000000067F00004002000100000000E1BCDD-000000067F00004002000100000000E246C0__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E1C000-000000067F00004002000100000000E20000__00000073AD3FE6B8 000000067F00004002000100000000E1C000-000000067F00004002000100000000E20000__000000914E3F38F0 000000067F00004002000100000000E1C000-000000067F00004002000100000000E20000__000000931B9A2710 000000067F00004002000100000000E20000-000000067F00004002000100000000E24000__00000073AD3FE6B8 000000067F00004002000100000000E20000-000000067F00004002000100000000E24000__000000914E3F38F0 000000067F00004002000100000000E20000-000000067F00004002000100000000E24000__000000931B9A2710 000000067F00004002000100000000E24000-000000067F00004002000100000000E28000__00000073AD3FE6B8 000000067F00004002000100000000E24000-000000067F00004002000100000000E28000__000000914E3F38F0 
000000067F00004002000100000000E24000-000000067F00004002000100000000E28000__000000931B9A2710 000000067F00004002000100000000E246C0-000000067F00004002000100000000E2D0A2__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E28000-000000067F00004002000100000000E2C000__00000073AD3FE6B8 000000067F00004002000100000000E28000-000000067F00004002000100000000E2C000__000000914E3F38F0 000000067F00004002000100000000E28000-000000067F00004002000100000000E2C000__000000931B9A2710 000000067F00004002000100000000E2C000-000000067F00004002000100000000E30000__00000073AD3FE6B8 000000067F00004002000100000000E2C000-000000067F00004002000100000000E30000__000000914E3F38F0 000000067F00004002000100000000E2C000-000000067F00004002000100000000E30000__000000931B9A2710 000000067F00004002000100000000E2D0A2-000000067F00004002000100000000E35A83__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E30000-000000067F00004002000100000000E34000__00000073AD3FE6B8 000000067F00004002000100000000E30000-000000067F00004002000100000000E34000__000000914E3F38F0 000000067F00004002000100000000E30000-000000067F00004002000100000000E34000__000000931B9A2710 000000067F00004002000100000000E34000-000000067F00004002000100000000E38000__00000073AD3FE6B8 000000067F00004002000100000000E34000-000000067F00004002000100000000E38000__000000914E3F38F0 000000067F00004002000100000000E34000-000000067F00004002000100000000E38000__000000931B9A2710 000000067F00004002000100000000E35A83-000000067F00004002000100000000E3E45F__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E38000-000000067F00004002000100000000E3C000__00000073AD3FE6B8 000000067F00004002000100000000E38000-000000067F00004002000100000000E3C000__000000914E3F38F0 000000067F00004002000100000000E38000-000000067F00004002000100000000E3C000__000000931B9A2710 000000067F00004002000100000000E3C000-000000067F00004002000100000000E40000__00000073AD3FE6B8 000000067F00004002000100000000E3C000-000000067F00004002000100000000E40000__000000914E3F38F0 
000000067F00004002000100000000E3C000-000000067F00004002000100000000E40000__000000931B9A2710 000000067F00004002000100000000E3E45F-000000067F00004002000100000000E46E30__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E40000-000000067F00004002000100000000E44000__00000073AD3FE6B8 000000067F00004002000100000000E40000-000000067F00004002000100000000E44000__000000914E3F38F0 000000067F00004002000100000000E40000-000000067F00004002000100000000E44000__000000931B9A2710 000000067F00004002000100000000E44000-000000067F00004002000100000000E48000__00000073AD3FE6B8 000000067F00004002000100000000E44000-000000067F00004002000100000000E48000__000000914E3F38F0 000000067F00004002000100000000E44000-000000067F00004002000100000000E48000__000000931B9A2710 000000067F00004002000100000000E46E30-000000067F00004002000100000000E4F802__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E48000-000000067F00004002000100000000E4C000__00000073AD3FE6B8 000000067F00004002000100000000E48000-000000067F00004002000100000000E4C000__000000914E3F38F0 000000067F00004002000100000000E48000-000000067F00004002000100000000E4C000__000000931B9A2710 000000067F00004002000100000000E4C000-000000067F00004002000100000000E50000__00000073AD3FE6B8 000000067F00004002000100000000E4C000-000000067F00004002000100000000E50000__000000914E3F38F0 000000067F00004002000100000000E4C000-000000067F00004002000100000000E50000__000000931B9A2710 000000067F00004002000100000000E4F802-000000067F00004002000100000000E581E0__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E50000-000000067F00004002000100000000E54000__00000073AD3FE6B8 000000067F00004002000100000000E50000-000000067F00004002000100000000E54000__000000914E3F38F0 000000067F00004002000100000000E50000-000000067F00004002000100000000E54000__000000931B9A2710 000000067F00004002000100000000E54000-000000067F00004002000100000000E58000__00000073AD3FE6B8 000000067F00004002000100000000E54000-000000067F00004002000100000000E58000__000000914E3F38F0 
000000067F00004002000100000000E54000-000000067F00004002000100000000E58000__000000931B9A2710 000000067F00004002000100000000E58000-000000067F00004002000100000000E5C000__00000073AD3FE6B8 000000067F00004002000100000000E58000-000000067F00004002000100000000E5C000__000000914E3F38F0 000000067F00004002000100000000E58000-000000067F00004002000100000000E5C000__000000931B9A2710 000000067F00004002000100000000E581E0-000000067F00004002000100000000E60BC6__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E5C000-000000067F00004002000100000000E60000__00000073AD3FE6B8 000000067F00004002000100000000E5C000-000000067F00004002000100000000E60000__000000914E3F38F0 000000067F00004002000100000000E5C000-000000067F00004002000100000000E60000__000000931B9A2710 000000067F00004002000100000000E60000-000000067F00004002000100000000E64000__00000073AD3FE6B8 000000067F00004002000100000000E60000-000000067F00004002000100000000E64000__000000914E3F38F0 000000067F00004002000100000000E60000-000000067F00004002000100000000E64000__000000931B9A2710 000000067F00004002000100000000E60BC6-000000067F00004002000100000000E695A7__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E64000-000000067F00004002000100000000E68000__00000073AD3FE6B8 000000067F00004002000100000000E64000-000000067F00004002000100000000E68000__000000914E3F38F0 000000067F00004002000100000000E64000-000000067F00004002000100000000E68000__000000931B9A2710 000000067F00004002000100000000E68000-000000067F00004002000100000000E6C000__00000073AD3FE6B8 000000067F00004002000100000000E68000-000000067F00004002000100000000E6C000__000000914E3F38F0 000000067F00004002000100000000E68000-000000067F00004002000100000000E6C000__000000931B9A2710 000000067F00004002000100000000E695A7-000000067F00004002000100000000E71F86__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E6C000-000000067F00004002000100000000E70000__00000073AD3FE6B8 000000067F00004002000100000000E6C000-000000067F00004002000100000000E70000__000000914E3F38F0 
000000067F00004002000100000000E6C000-000000067F00004002000100000000E70000__000000931B9A2710 000000067F00004002000100000000E70000-000000067F00004002000100000000E74000__00000073AD3FE6B8 000000067F00004002000100000000E70000-000000067F00004002000100000000E74000__000000914E3F38F0 000000067F00004002000100000000E70000-000000067F00004002000100000000E74000__000000931B9A2710 000000067F00004002000100000000E71F86-000000067F00004002000100000000E7A966__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E74000-000000067F00004002000100000000E78000__00000073AD3FE6B8 000000067F00004002000100000000E74000-000000067F00004002000100000000E78000__000000914E3F38F0 000000067F00004002000100000000E74000-000000067F00004002000100000000E78000__000000931B9A2710 000000067F00004002000100000000E78000-000000067F00004002000100000000E7C000__00000073AD3FE6B8 000000067F00004002000100000000E78000-000000067F00004002000100000000E7C000__000000914E3F38F0 000000067F00004002000100000000E78000-000000067F00004002000100000000E7C000__000000931B9A2710 000000067F00004002000100000000E7A966-000000067F00004002000100000000E8333C__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E7C000-000000067F00004002000100000000E80000__00000073AD3FE6B8 000000067F00004002000100000000E7C000-000000067F00004002000100000000E80000__000000914E3F38F0 000000067F00004002000100000000E7C000-000000067F00004002000100000000E80000__000000931B9A2710 000000067F00004002000100000000E80000-000000067F00004002000100000000E84000__00000073AD3FE6B8 000000067F00004002000100000000E80000-000000067F00004002000100000000E84000__000000914E3F38F0 000000067F00004002000100000000E80000-000000067F00004002000100000000E84000__000000931B9A2710 000000067F00004002000100000000E8333C-000000067F00004002000100000000E8BD17__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E84000-000000067F00004002000100000000E88000__00000073AD3FE6B8 000000067F00004002000100000000E84000-000000067F00004002000100000000E88000__000000914E3F38F0 
000000067F00004002000100000000E84000-000000067F00004002000100000000E88000__000000931B9A2710 000000067F00004002000100000000E88000-000000067F00004002000100000000E8C000__00000073AD3FE6B8 000000067F00004002000100000000E88000-000000067F00004002000100000000E8C000__000000914E3F38F0 000000067F00004002000100000000E88000-000000067F00004002000100000000E8C000__000000931B9A2710 000000067F00004002000100000000E8BD17-000000067F00004002000100000000E946F5__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E8C000-000000067F00004002000100000000E90000__00000073AD3FE6B8 000000067F00004002000100000000E8C000-000000067F00004002000100000000E90000__000000914E3F38F0 000000067F00004002000100000000E8C000-000000067F00004002000100000000E90000__000000931B9A2710 000000067F00004002000100000000E90000-000000067F00004002000100000000E94000__00000073AD3FE6B8 000000067F00004002000100000000E90000-000000067F00004002000100000000E94000__000000914E3F38F0 000000067F00004002000100000000E90000-000000067F00004002000100000000E94000__000000931B9A2710 000000067F00004002000100000000E94000-000000067F00004002000100000000E98000__00000073AD3FE6B8 000000067F00004002000100000000E94000-000000067F00004002000100000000E98000__000000914E3F38F0 000000067F00004002000100000000E94000-000000067F00004002000100000000E98000__000000931B9A2710 000000067F00004002000100000000E946F5-000000067F00004002000100000000E9D0D7__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000E98000-000000067F00004002000100000000E9C000__00000073AD3FE6B8 000000067F00004002000100000000E98000-000000067F00004002000100000000E9C000__000000914E3F38F0 000000067F00004002000100000000E98000-000000067F00004002000100000000E9C000__000000931B9A2710 000000067F00004002000100000000E9C000-000000067F00004002000100000000EA0000__00000073AD3FE6B8 000000067F00004002000100000000E9C000-000000067F00004002000100000000EA0000__000000914E3F38F0 000000067F00004002000100000000E9C000-000000067F00004002000100000000EA0000__000000931B9A2710 
000000067F00004002000100000000E9D0D7-000000067F00004002000100000000EA5AB9__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000EA0000-000000067F00004002000100000000EA4000__00000073AD3FE6B8 000000067F00004002000100000000EA0000-000000067F00004002000100000000EA4000__000000914E3F38F0 000000067F00004002000100000000EA0000-000000067F00004002000100000000EA4000__000000931B9A2710 000000067F00004002000100000000EA4000-000000067F00004002000100000000EA8000__00000073AD3FE6B8 000000067F00004002000100000000EA4000-000000067F00004002000100000000EA8000__000000914E3F38F0 000000067F00004002000100000000EA4000-000000067F00004002000100000000EA8000__000000931B9A2710 000000067F00004002000100000000EA5AB9-000000067F00004002000100000000EAE49A__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000EA8000-000000067F00004002000100000000EAC000__00000073AD3FE6B8 000000067F00004002000100000000EA8000-000000067F00004002000100000000EAC000__000000914E3F38F0 000000067F00004002000100000000EA8000-000000067F00004002000100000000EAC000__000000931B9A2710 000000067F00004002000100000000EAC000-000000067F00004002000100000000EB0000__00000073AD3FE6B8 000000067F00004002000100000000EAC000-000000067F00004002000100000000EB0000__000000914E3F38F0 000000067F00004002000100000000EAC000-000000067F00004002000100000000EB0000__000000931B9A2710 000000067F00004002000100000000EAE49A-000000067F00004002000100000000EB6E78__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000EB0000-000000067F00004002000100000000EB4000__00000073AD3FE6B8 000000067F00004002000100000000EB0000-000000067F00004002000100000000EB4000__000000914E3F38F0 000000067F00004002000100000000EB0000-000000067F00004002000100000000EB4000__000000931B9A2710 000000067F00004002000100000000EB4000-000000067F00004002000100000000EB8000__00000073AD3FE6B8 000000067F00004002000100000000EB4000-000000067F00004002000100000000EB8000__000000914E3F38F0 000000067F00004002000100000000EB4000-000000067F00004002000100000000EB8000__000000931B9A2710 
000000067F00004002000100000000EB6E78-000000067F00004002000100000000EBF851__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000EB8000-000000067F00004002000100000000EBC000__00000073AD3FE6B8 000000067F00004002000100000000EB8000-000000067F00004002000100000000EBC000__000000914E3F38F0 000000067F00004002000100000000EB8000-000000067F00004002000100000000EBC000__000000931B9A2710 000000067F00004002000100000000EBC000-000000067F00004002000100000000EC0000__00000073AD3FE6B8 000000067F00004002000100000000EBC000-000000067F00004002000100000000EC0000__000000914E3F38F0 000000067F00004002000100000000EBC000-000000067F00004002000100000000EC0000__000000931B9A2710 000000067F00004002000100000000EBF851-000000067F00004002000100000000EC8221__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000EC0000-000000067F00004002000100000000EC4000__00000073AD3FE6B8 000000067F00004002000100000000EC0000-000000067F00004002000100000000EC4000__000000914E3F38F0 000000067F00004002000100000000EC0000-000000067F00004002000100000000EC4000__000000931B9A2710 000000067F00004002000100000000EC4000-000000067F00004002000100000000EC8000__00000073AD3FE6B8 000000067F00004002000100000000EC4000-000000067F00004002000100000000EC8000__000000914E3F38F0 000000067F00004002000100000000EC4000-000000067F00004002000100000000EC8000__000000931B9A2710 000000067F00004002000100000000EC8000-000000067F00004002000100000000ECC000__00000073AD3FE6B8 000000067F00004002000100000000EC8000-000000067F00004002000100000000ECC000__000000914E3F38F0 000000067F00004002000100000000EC8000-000000067F00004002000100000000ECC000__000000931B9A2710 000000067F00004002000100000000EC8221-000000067F00004002000100000000ED0BFD__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000ECC000-000000067F00004002000100000000ED0000__00000073AD3FE6B8 000000067F00004002000100000000ECC000-000000067F00004002000100000000ED0000__000000914E3F38F0 000000067F00004002000100000000ECC000-000000067F00004002000100000000ED0000__000000931B9A2710 
000000067F00004002000100000000ED0000-000000067F00004002000100000000ED4000__00000073AD3FE6B8 000000067F00004002000100000000ED0000-000000067F00004002000100000000ED4000__000000914E3F38F0 000000067F00004002000100000000ED0000-000000067F00004002000100000000ED4000__000000931B9A2710 000000067F00004002000100000000ED0BFD-000000067F00004002000100000000ED95E5__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000ED4000-000000067F00004002000100000000ED8000__00000073AD3FE6B8 000000067F00004002000100000000ED4000-000000067F00004002000100000000ED8000__000000914E3F38F0 000000067F00004002000100000000ED4000-000000067F00004002000100000000ED8000__000000931B9A2710 000000067F00004002000100000000ED8000-000000067F00004002000100000000EDC000__00000073AD3FE6B8 000000067F00004002000100000000ED8000-000000067F00004002000100000000EDC000__000000914E3F38F0 000000067F00004002000100000000ED8000-000000067F00004002000100000000EDC000__000000931B9A2710 000000067F00004002000100000000ED95E5-000000067F00004002000100000000EE1FCD__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000EDC000-000000067F00004002000100000000EE0000__00000073AD3FE6B8 000000067F00004002000100000000EDC000-000000067F00004002000100000000EE0000__000000914E3F38F0 000000067F00004002000100000000EDC000-000000067F00004002000100000000EE0000__000000931B9A2710 000000067F00004002000100000000EE0000-000000067F00004002000100000000EE4000__00000073AD3FE6B8 000000067F00004002000100000000EE0000-000000067F00004002000100000000EE4000__000000914E3F38F0 000000067F00004002000100000000EE0000-000000067F00004002000100000000EE4000__000000931B9A2710 000000067F00004002000100000000EE1FCD-000000067F00004002000100000000EEA9A7__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000EE4000-000000067F00004002000100000000EE8000__00000073AD3FE6B8 000000067F00004002000100000000EE4000-000000067F00004002000100000000EE8000__000000914E3F38F0 000000067F00004002000100000000EE4000-000000067F00004002000100000000EE8000__000000931B9A2710 
000000067F00004002000100000000EE8000-000000067F00004002000100000000EEC000__00000073AD3FE6B8 000000067F00004002000100000000EE8000-000000067F00004002000100000000EEC000__000000914E3F38F0 000000067F00004002000100000000EE8000-000000067F00004002000100000000EEC000__000000931B9A2710 000000067F00004002000100000000EEA9A7-000000067F00004002000100000000EF3387__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000EEC000-000000067F00004002000100000000EF0000__00000073AD3FE6B8 000000067F00004002000100000000EEC000-000000067F00004002000100000000EF0000__000000914E3F38F0 000000067F00004002000100000000EEC000-000000067F00004002000100000000EF0000__000000931B9A2710 000000067F00004002000100000000EF0000-000000067F00004002000100000000EF4000__00000073AD3FE6B8 000000067F00004002000100000000EF0000-000000067F00004002000100000000EF4000__000000914E3F38F0 000000067F00004002000100000000EF0000-000000067F00004002000100000000EF4000__000000931B9A2710 000000067F00004002000100000000EF3387-000000067F00004002000100000000EFBD62__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000EF4000-000000067F00004002000100000000EF8000__00000073AD3FE6B8 000000067F00004002000100000000EF4000-000000067F00004002000100000000EF8000__000000914E3F38F0 000000067F00004002000100000000EF4000-000000067F00004002000100000000EF8000__000000931B9A2710 000000067F00004002000100000000EF8000-000000067F00004002000100000000EFC000__00000073AD3FE6B8 000000067F00004002000100000000EF8000-000000067F00004002000100000000EFC000__000000914E3F38F0 000000067F00004002000100000000EF8000-000000067F00004002000100000000EFC000__000000931B9A2710 000000067F00004002000100000000EFBD62-000000067F00004002000100000000F0473E__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000EFC000-000000067F00004002000100000000F00000__00000073AD3FE6B8 000000067F00004002000100000000EFC000-000000067F00004002000100000000F00000__000000914E3F38F0 000000067F00004002000100000000EFC000-000000067F00004002000100000000F00000__000000931B9A2710 
000000067F00004002000100000000F00000-000000067F00004002000100000000F04000__00000073AD3FE6B8 000000067F00004002000100000000F00000-000000067F00004002000100000000F04000__000000914E3F38F0 000000067F00004002000100000000F00000-000000067F00004002000100000000F04000__000000931B9A2710 000000067F00004002000100000000F04000-000000067F00004002000100000000F08000__00000073AD3FE6B8 000000067F00004002000100000000F04000-000000067F00004002000100000000F08000__000000914E3F38F0 000000067F00004002000100000000F04000-000000067F00004002000100000000F08000__000000931B9A2710 000000067F00004002000100000000F0473E-000000067F00004002000100000000F0D116__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000F08000-000000067F00004002000100000000F0C000__00000073AD3FE6B8 000000067F00004002000100000000F08000-000000067F00004002000100000000F0C000__000000914E3F38F0 000000067F00004002000100000000F08000-000000067F00004002000100000000F0C000__000000931B9A2710 000000067F00004002000100000000F0C000-000000067F00004002000100000000F10000__00000073AD3FE6B8 000000067F00004002000100000000F0C000-000000067F00004002000100000000F10000__000000914E3F38F0 000000067F00004002000100000000F0C000-000000067F00004002000100000000F10000__000000931B9A2710 000000067F00004002000100000000F0D116-000000067F00004002000100000000F15AE9__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000F10000-000000067F00004002000100000000F14000__00000073AD3FE6B8 000000067F00004002000100000000F10000-000000067F00004002000100000000F14000__000000914E3F38F0 000000067F00004002000100000000F10000-000000067F00004002000100000000F14000__000000931B9A2710 000000067F00004002000100000000F14000-000000067F00004002000100000000F18000__00000073AD3FE6B8 000000067F00004002000100000000F14000-000000067F00004002000100000000F18000__000000914E3F38F0 000000067F00004002000100000000F14000-000000067F00004002000100000000F18000__000000931B9A2710 000000067F00004002000100000000F15AE9-000000067F00004002000100000000F1E4CB__0000005CA7BBD6F9-000000739A8D1299 
000000067F00004002000100000000F18000-000000067F00004002000100000000F1C000__00000073AD3FE6B8 000000067F00004002000100000000F18000-000000067F00004002000100000000F1C000__000000914E3F38F0 000000067F00004002000100000000F18000-000000067F00004002000100000000F1C000__000000931B9A2710 000000067F00004002000100000000F1C000-000000067F00004002000100000000F20000__00000073AD3FE6B8 000000067F00004002000100000000F1C000-000000067F00004002000100000000F20000__000000914E3F38F0 000000067F00004002000100000000F1C000-000000067F00004002000100000000F20000__000000931B9A2710 000000067F00004002000100000000F1E4CB-000000067F00004002000100000000F26EC1__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000F20000-000000067F00004002000100000000F24000__00000073AD3FE6B8 000000067F00004002000100000000F20000-000000067F00004002000100000000F24000__000000914E3F38F0 000000067F00004002000100000000F20000-000000067F00004002000100000000F24000__000000931B9A2710 000000067F00004002000100000000F24000-000000067F00004002000100000000F28000__00000073AD3FE6B8 000000067F00004002000100000000F24000-000000067F00004002000100000000F28000__000000914E3F38F0 000000067F00004002000100000000F24000-000000067F00004002000100000000F28000__000000931B9A2710 000000067F00004002000100000000F26EC1-000000067F00004002000100000000F2F8A1__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000F28000-000000067F00004002000100000000F2C000__00000073AD3FE6B8 000000067F00004002000100000000F28000-000000067F00004002000100000000F2C000__000000914E3F38F0 000000067F00004002000100000000F28000-000000067F00004002000100000000F2C000__000000931B9A2710 000000067F00004002000100000000F2C000-000000067F00004002000100000000F30000__00000073AD3FE6B8 000000067F00004002000100000000F2C000-000000067F00004002000100000000F30000__000000914E3F38F0 000000067F00004002000100000000F2C000-000000067F00004002000100000000F30000__000000931B9A2710 000000067F00004002000100000000F2F8A1-000000067F00004002000100000000F38278__0000005CA7BBD6F9-000000739A8D1299 
000000067F00004002000100000000F30000-000000067F00004002000100000000F34000__00000073AD3FE6B8 000000067F00004002000100000000F30000-000000067F00004002000100000000F34000__000000914E3F38F0 000000067F00004002000100000000F30000-000000067F00004002000100000000F34000__000000931B9A2710 000000067F00004002000100000000F34000-000000067F00004002000100000000F38000__00000073AD3FE6B8 000000067F00004002000100000000F34000-000000067F00004002000100000000F38000__000000914E3F38F0 000000067F00004002000100000000F34000-000000067F00004002000100000000F38000__000000931B9A2710 000000067F00004002000100000000F38000-000000067F00004002000100000000F3C000__00000073AD3FE6B8 000000067F00004002000100000000F38000-000000067F00004002000100000000F3C000__000000914E3F38F0 000000067F00004002000100000000F38000-000000067F00004002000100000000F3C000__000000931B9A2710 000000067F00004002000100000000F38278-000000067F00004002000100000000F40C57__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000F3C000-000000067F00004002000100000000F40000__00000073AD3FE6B8 000000067F00004002000100000000F3C000-000000067F00004002000100000000F40000__000000914E3F38F0 000000067F00004002000100000000F3C000-000000067F00004002000100000000F40000__000000931B9A2710 000000067F00004002000100000000F40000-000000067F00004002000100000000F44000__00000073AD3FE6B8 000000067F00004002000100000000F40000-000000067F00004002000100000000F44000__000000914E3F38F0 000000067F00004002000100000000F40000-000000067F00004002000100000000F44000__000000931B9A2710 000000067F00004002000100000000F40C57-000000067F00004002000100000000F49630__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000F44000-000000067F00004002000100000000F48000__00000073AD3FE6B8 000000067F00004002000100000000F44000-000000067F00004002000100000000F48000__000000914E3F38F0 000000067F00004002000100000000F44000-000000067F00004002000100000000F48000__000000931B9A2710 000000067F00004002000100000000F48000-000000067F00004002000100000000F4C000__00000073AD3FE6B8 
000000067F00004002000100000000F48000-000000067F00004002000100000000F4C000__000000914E3F38F0 000000067F00004002000100000000F48000-000000067F00004002000100000000F4C000__000000931B9A2710 000000067F00004002000100000000F49630-000000067F00004002000100000000F52007__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000F4C000-000000067F00004002000100000000F50000__00000073AD3FE6B8 000000067F00004002000100000000F4C000-000000067F00004002000100000000F50000__000000914E3F38F0 000000067F00004002000100000000F4C000-000000067F00004002000100000000F50000__000000931B9A2710 000000067F00004002000100000000F50000-000000067F00004002000100000000F54000__00000073AD3FE6B8 000000067F00004002000100000000F50000-000000067F00004002000100000000F54000__000000914E3F38F0 000000067F00004002000100000000F50000-000000067F00004002000100000000F54000__000000931B9A2710 000000067F00004002000100000000F52007-000000067F00004002000100000000F5A9DE__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000F54000-000000067F00004002000100000000F58000__00000073AD3FE6B8 000000067F00004002000100000000F54000-000000067F00004002000100000000F58000__000000914E3F38F0 000000067F00004002000100000000F54000-000000067F00004002000100000000F58000__000000931B9A2710 000000067F00004002000100000000F58000-000000067F00004002000100000000F5C000__00000073AD3FE6B8 000000067F00004002000100000000F58000-000000067F00004002000100000000F5C000__000000914E3F38F0 000000067F00004002000100000000F58000-000000067F00004002000100000000F5C000__000000931B9A2710 000000067F00004002000100000000F5A9DE-000000067F00004002000100000000F60351__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000100000000F5C000-000000067F00004002000100000000F60000__00000073AD3FE6B8 000000067F00004002000100000000F5C000-000000067F00004002000100000000F60000__000000914E3F38F0 000000067F00004002000100000000F5C000-000000067F00004002000100000000F60000__000000931B9A2710 000000067F00004002000100000000F60000-000000067F00004002000100050100000000__00000073AD3FE6B8 
000000067F00004002000100000000F60000-000000067F00004002000100050100000000__000000914E3F38F0 000000067F00004002000100000000F60000-000000067F00004002000100050100000000__000000931B9A2710 000000067F00004002000100000000F60000-030000000000000000000000000000000002__000000739A8D1298 000000067F000040020001000000FFFFFFFF-000000067F00004002000100000100000000__0000005CA7BBF4A0-00000064F391EC28 000000067F000040020001000000FFFFFFFF-000000067F00004002000100000100000000__00000064F391EC28-0000006D3F67EDA8 000000067F000040020001000000FFFFFFFF-000000067F00004002000100000100000000__0000006D3F67EDA8-000000739A8D1299 000000067F000040020001000500FFFFFFFF-000000067F00004002000100050100000000__0000005CA7BBD6F9-000000739A8D1299 000000067F00004002000140000000000000-000000067F00004002000140000000004000__00000073AD3FE6B8 000000067F00004002000140000000000000-000000067F00004002000140000000004000__000000914E3F38F0 000000067F00004002000140000000000000-000000067F00004002000140000000004000__000000931B9A2710 000000067F00004002000140000000004000-000000067F00004002000140000000008000__00000073AD3FE6B8 000000067F00004002000140000000004000-000000067F00004002000140000000008000__000000914E3F38F0 000000067F00004002000140000000004000-000000067F00004002000140000000008000__000000931B9A2710 000000067F00004002000140000000008000-000000067F0000400200014000000000C000__000000914E3F38F0 000000067F00004002000140000000008000-000000067F0000400200014000000000C000__000000931B9A2710 000000067F00004002000140000000008000-030000000000000000000000000000000002__00000073AD3FE6B8 000000067F00004002000140000000008988-000000067F00004002000140000000011367__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000000C000-000000067F00004002000140000000010000__000000914E3F38F0 000000067F0000400200014000000000C000-000000067F00004002000140000000010000__000000931B9A2710 000000067F00004002000140000000010000-000000067F00004002000140000000014000__000000914E3F38F0 
000000067F00004002000140000000010000-000000067F00004002000140000000014000__000000931B9A2710 000000067F00004002000140000000011367-000000067F00004002000140000000019D71__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000014000-000000067F00004002000140000000018000__000000914E3F38F0 000000067F00004002000140000000014000-000000067F00004002000140000000018000__000000931B9A2710 000000067F00004002000140000000018000-000000067F0000400200014000000001C000__000000914E3F38F0 000000067F00004002000140000000018000-000000067F0000400200014000000001C000__000000931B9A2710 000000067F00004002000140000000019D71-000000067F00004002000140000000022769__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000001C000-000000067F00004002000140000000020000__000000914E3F38F0 000000067F0000400200014000000001C000-000000067F00004002000140000000020000__000000931B9A2710 000000067F00004002000140000000020000-000000067F00004002000140000000024000__000000914E3F38F0 000000067F00004002000140000000020000-000000067F00004002000140000000024000__000000931B9A2710 000000067F00004002000140000000022769-000000067F0000400200014000000002B151__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000024000-000000067F00004002000140000000028000__000000914E3F38F0 000000067F00004002000140000000024000-000000067F00004002000140000000028000__000000931B9A2710 000000067F00004002000140000000028000-000000067F0000400200014000000002C000__000000914E3F38F0 000000067F00004002000140000000028000-000000067F0000400200014000000002C000__000000931B9A2710 000000067F0000400200014000000002B151-000000067F00004002000140000000033B28__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000002C000-000000067F00004002000140000000030000__000000914E3F38F0 000000067F0000400200014000000002C000-000000067F00004002000140000000030000__000000931B9A2710 000000067F00004002000140000000030000-000000067F00004002000140000000034000__000000914E3F38F0 000000067F00004002000140000000030000-000000067F00004002000140000000034000__000000931B9A2710 
000000067F00004002000140000000033B28-000000067F0000400200014000000003C4CB__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000034000-000000067F00004002000140000000038000__000000914E3F38F0 000000067F00004002000140000000034000-000000067F00004002000140000000038000__000000931B9A2710 000000067F00004002000140000000038000-000000067F0000400200014000000003C000__000000914E3F38F0 000000067F00004002000140000000038000-000000067F0000400200014000000003C000__000000931B9A2710 000000067F0000400200014000000003C000-000000067F00004002000140000000040000__000000914E3F38F0 000000067F0000400200014000000003C000-000000067F00004002000140000000040000__000000931B9A2710 000000067F0000400200014000000003C4CB-000000067F00004002000140000000044E80__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000040000-000000067F00004002000140000000044000__000000914E3F38F0 000000067F00004002000140000000040000-000000067F00004002000140000000044000__000000931B9A2710 000000067F00004002000140000000044000-000000067F00004002000140000000048000__000000914E3F38F0 000000067F00004002000140000000044000-000000067F00004002000140000000048000__000000931B9A2710 000000067F00004002000140000000044E80-000000067F0000400200014000000004D872__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000048000-000000067F0000400200014000000004C000__000000914E3F38F0 000000067F00004002000140000000048000-000000067F0000400200014000000004C000__000000931B9A2710 000000067F0000400200014000000004C000-000000067F00004002000140000000050000__000000914E3F38F0 000000067F0000400200014000000004C000-000000067F00004002000140000000050000__000000931B9A2710 000000067F0000400200014000000004D872-000000067F00004002000140000000056274__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000050000-000000067F00004002000140000000054000__000000914E3F38F0 000000067F00004002000140000000050000-000000067F00004002000140000000054000__000000931B9A2710 000000067F00004002000140000000054000-000000067F00004002000140000000058000__000000914E3F38F0 
000000067F00004002000140000000054000-000000067F00004002000140000000058000__000000931B9A2710 000000067F00004002000140000000056274-000000067F0000400200014000000005EC6A__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000058000-000000067F0000400200014000000005C000__000000914E3F38F0 000000067F00004002000140000000058000-000000067F0000400200014000000005C000__000000931B9A2710 000000067F0000400200014000000005C000-000000067F00004002000140000000060000__000000914E3F38F0 000000067F0000400200014000000005C000-000000067F00004002000140000000060000__000000931B9A2710 000000067F0000400200014000000005EC6A-000000067F0000400200014000000006764E__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000060000-000000067F00004002000140000000064000__000000914E3F38F0 000000067F00004002000140000000060000-000000067F00004002000140000000064000__000000931B9A2710 000000067F00004002000140000000064000-000000067F00004002000140000000068000__000000914E3F38F0 000000067F00004002000140000000064000-000000067F00004002000140000000068000__000000931B9A2710 000000067F0000400200014000000006764E-000000067F00004002000140000000070013__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000068000-000000067F0000400200014000000006C000__000000914E3F38F0 000000067F00004002000140000000068000-000000067F0000400200014000000006C000__000000931B9A2710 000000067F0000400200014000000006C000-000000067F00004002000140000000070000__000000914E3F38F0 000000067F0000400200014000000006C000-000000067F00004002000140000000070000__000000931B9A2710 000000067F00004002000140000000070000-000000067F00004002000140000000074000__000000914E3F38F0 000000067F00004002000140000000070000-000000067F00004002000140000000074000__000000931B9A2710 000000067F00004002000140000000070013-000000067F000040020001400000000789BA__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000074000-000000067F00004002000140000000078000__000000914E3F38F0 000000067F00004002000140000000074000-000000067F00004002000140000000078000__000000931B9A2710 
000000067F00004002000140000000078000-000000067F0000400200014000000007C000__000000914E3F38F0 000000067F00004002000140000000078000-000000067F0000400200014000000007C000__000000931B9A2710 000000067F000040020001400000000789BA-000000067F0000400200014000000008136D__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000007C000-000000067F00004002000140000000080000__000000914E3F38F0 000000067F0000400200014000000007C000-000000067F00004002000140000000080000__000000931B9A2710 000000067F00004002000140000000080000-000000067F00004002000140000000084000__000000914E3F38F0 000000067F00004002000140000000080000-000000067F00004002000140000000084000__000000931B9A2710 000000067F0000400200014000000008136D-000000067F00004002000140000000089D5F__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000084000-000000067F00004002000140000000088000__000000914E3F38F0 000000067F00004002000140000000084000-000000067F00004002000140000000088000__000000931B9A2710 000000067F00004002000140000000088000-000000067F0000400200014000000008C000__000000914E3F38F0 000000067F00004002000140000000088000-000000067F0000400200014000000008C000__000000931B9A2710 000000067F00004002000140000000089D5F-000000067F0000400200014000000009275F__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000008C000-000000067F00004002000140000000090000__000000914E3F38F0 000000067F0000400200014000000008C000-000000067F00004002000140000000090000__000000931B9A2710 000000067F00004002000140000000090000-000000067F00004002000140000000094000__000000914E3F38F0 000000067F00004002000140000000090000-000000067F00004002000140000000094000__000000931B9A2710 000000067F0000400200014000000009275F-000000067F0000400200014000000009B154__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000094000-000000067F00004002000140000000098000__000000914E3F38F0 000000067F00004002000140000000094000-000000067F00004002000140000000098000__000000931B9A2710 000000067F00004002000140000000098000-000000067F0000400200014000000009C000__000000914E3F38F0 
000000067F00004002000140000000098000-000000067F0000400200014000000009C000__000000931B9A2710 000000067F0000400200014000000009B154-000000067F000040020001400000000A3B2B__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000009C000-000000067F000040020001400000000A0000__000000914E3F38F0 000000067F0000400200014000000009C000-000000067F000040020001400000000A0000__000000931B9A2710 000000067F000040020001400000000A0000-000000067F000040020001400000000A4000__000000914E3F38F0 000000067F000040020001400000000A0000-000000067F000040020001400000000A4000__000000931B9A2710 000000067F000040020001400000000A3B2B-000000067F000040020001400000000AC4F0__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000000A4000-000000067F000040020001400000000A8000__000000914E3F38F0 000000067F000040020001400000000A4000-000000067F000040020001400000000A8000__000000931B9A2710 000000067F000040020001400000000A8000-000000067F000040020001400000000AC000__000000914E3F38F0 000000067F000040020001400000000A8000-000000067F000040020001400000000AC000__000000931B9A2710 000000067F000040020001400000000AC000-000000067F000040020001400000000B0000__000000914E3F38F0 000000067F000040020001400000000AC000-000000067F000040020001400000000B0000__000000931B9A2710 000000067F000040020001400000000AC4F0-000000067F000040020001400000000B4EAA__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000000B0000-000000067F000040020001400000000B4000__000000914E3F38F0 000000067F000040020001400000000B0000-000000067F000040020001400000000B4000__000000931B9A2710 000000067F000040020001400000000B4000-000000067F000040020001400000000B8000__000000914E3F38F0 000000067F000040020001400000000B4000-000000067F000040020001400000000B8000__000000931B9A2710 000000067F000040020001400000000B4EAA-000000067F000040020001400000000BD86C__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000000B8000-000000067F000040020001400000000BC000__000000914E3F38F0 000000067F000040020001400000000B8000-000000067F000040020001400000000BC000__000000931B9A2710 
000000067F000040020001400000000BC000-000000067F000040020001400000000C0000__000000914E3F38F0 000000067F000040020001400000000BC000-000000067F000040020001400000000C0000__000000931B9A2710 000000067F000040020001400000000BD86C-000000067F000040020001400000000C6268__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000000C0000-000000067F000040020001400000000C4000__000000914E3F38F0 000000067F000040020001400000000C0000-000000067F000040020001400000000C4000__000000931B9A2710 000000067F000040020001400000000C4000-000000067F000040020001400000000C8000__000000914E3F38F0 000000067F000040020001400000000C4000-000000067F000040020001400000000C8000__000000931B9A2710 000000067F000040020001400000000C6268-000000067F000040020001400000000CEC64__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000000C8000-000000067F000040020001400000000CC000__000000914E3F38F0 000000067F000040020001400000000C8000-000000067F000040020001400000000CC000__000000931B9A2710 000000067F000040020001400000000CC000-000000067F000040020001400000000D0000__000000914E3F38F0 000000067F000040020001400000000CC000-000000067F000040020001400000000D0000__000000931B9A2710 000000067F000040020001400000000CEC64-000000067F000040020001400000000D7659__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000000D0000-000000067F000040020001400000000D4000__000000914E3F38F0 000000067F000040020001400000000D0000-000000067F000040020001400000000D4000__000000931B9A2710 000000067F000040020001400000000D4000-000000067F000040020001400000000D8000__000000914E3F38F0 000000067F000040020001400000000D4000-000000067F000040020001400000000D8000__000000931B9A2710 000000067F000040020001400000000D7659-000000067F000040020001400000000E0026__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000000D8000-000000067F000040020001400000000DC000__000000914E3F38F0 000000067F000040020001400000000D8000-000000067F000040020001400000000DC000__000000931B9A2710 000000067F000040020001400000000DC000-000000067F000040020001400000000E0000__000000914E3F38F0 
000000067F000040020001400000000DC000-000000067F000040020001400000000E0000__000000931B9A2710 000000067F000040020001400000000E0000-000000067F000040020001400000000E4000__000000914E3F38F0 000000067F000040020001400000000E0000-000000067F000040020001400000000E4000__000000931B9A2710 000000067F000040020001400000000E0026-000000067F000040020001400000000E89F4__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000000E4000-000000067F000040020001400000000E8000__000000914E3F38F0 000000067F000040020001400000000E4000-000000067F000040020001400000000E8000__000000931B9A2710 000000067F000040020001400000000E8000-000000067F000040020001400000000EC000__000000914E3F38F0 000000067F000040020001400000000E8000-000000067F000040020001400000000EC000__000000931B9A2710 000000067F000040020001400000000E89F4-000000067F000040020001400000000F13B1__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000000EC000-000000067F000040020001400000000F0000__000000914E3F38F0 000000067F000040020001400000000EC000-000000067F000040020001400000000F0000__000000931B9A2710 000000067F000040020001400000000F0000-000000067F000040020001400000000F4000__000000914E3F38F0 000000067F000040020001400000000F0000-000000067F000040020001400000000F4000__000000931B9A2710 000000067F000040020001400000000F13B1-000000067F000040020001400000000F9D77__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000000F4000-000000067F000040020001400000000F8000__000000914E3F38F0 000000067F000040020001400000000F4000-000000067F000040020001400000000F8000__000000931B9A2710 000000067F000040020001400000000F8000-000000067F000040020001400000000FC000__000000914E3F38F0 000000067F000040020001400000000F8000-000000067F000040020001400000000FC000__000000931B9A2710 000000067F000040020001400000000F9D77-000000067F00004002000140000000102774__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000000FC000-000000067F00004002000140000000100000__000000914E3F38F0 000000067F000040020001400000000FC000-000000067F00004002000140000000100000__000000931B9A2710 
000000067F00004002000140000000100000-000000067F00004002000140000000104000__000000914E3F38F0 000000067F00004002000140000000100000-000000067F00004002000140000000104000__000000931B9A2710 000000067F00004002000140000000102774-000000067F0000400200014000000010B172__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000104000-000000067F00004002000140000000108000__000000914E3F38F0 000000067F00004002000140000000104000-000000067F00004002000140000000108000__000000931B9A2710 000000067F00004002000140000000108000-000000067F0000400200014000000010C000__000000914E3F38F0 000000067F00004002000140000000108000-000000067F0000400200014000000010C000__000000931B9A2710 000000067F0000400200014000000010B172-000000067F00004002000140000000113B64__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000010C000-000000067F00004002000140000000110000__000000914E3F38F0 000000067F0000400200014000000010C000-000000067F00004002000140000000110000__000000931B9A2710 000000067F00004002000140000000110000-000000067F00004002000140000000114000__000000914E3F38F0 000000067F00004002000140000000110000-000000067F00004002000140000000114000__000000931B9A2710 000000067F00004002000140000000113B64-000000067F0000400200014000000011C533__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000114000-000000067F00004002000140000000118000__000000914E3F38F0 000000067F00004002000140000000114000-000000067F00004002000140000000118000__000000931B9A2710 000000067F00004002000140000000118000-000000067F0000400200014000000011C000__000000914E3F38F0 000000067F00004002000140000000118000-000000067F0000400200014000000011C000__000000931B9A2710 000000067F0000400200014000000011C000-000000067F00004002000140000000120000__000000914E3F38F0 000000067F0000400200014000000011C000-000000067F00004002000140000000120000__000000931B9A2710 000000067F0000400200014000000011C533-000000067F00004002000140000000124EF8__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000120000-000000067F00004002000140000000124000__000000914E3F38F0 
000000067F00004002000140000000120000-000000067F00004002000140000000124000__000000931B9A2710 000000067F00004002000140000000124000-000000067F00004002000140000000128000__000000914E3F38F0 000000067F00004002000140000000124000-000000067F00004002000140000000128000__000000931B9A2710 000000067F00004002000140000000124EF8-000000067F0000400200014000000012D8AC__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000128000-000000067F0000400200014000000012C000__000000914E3F38F0 000000067F00004002000140000000128000-000000067F0000400200014000000012C000__000000931B9A2710 000000067F0000400200014000000012C000-000000067F00004002000140000000130000__000000914E3F38F0 000000067F0000400200014000000012C000-000000067F00004002000140000000130000__000000931B9A2710 000000067F0000400200014000000012D8AC-000000067F00004002000140000000136277__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000130000-000000067F00004002000140000000134000__000000914E3F38F0 000000067F00004002000140000000130000-000000067F00004002000140000000134000__000000931B9A2710 000000067F00004002000140000000134000-000000067F00004002000140000000138000__000000914E3F38F0 000000067F00004002000140000000134000-000000067F00004002000140000000138000__000000931B9A2710 000000067F00004002000140000000136277-000000067F0000400200014000000013EC72__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000138000-000000067F0000400200014000000013C000__000000914E3F38F0 000000067F00004002000140000000138000-000000067F0000400200014000000013C000__000000931B9A2710 000000067F0000400200014000000013C000-000000067F00004002000140000000140000__000000914E3F38F0 000000067F0000400200014000000013C000-000000067F00004002000140000000140000__000000931B9A2710 000000067F0000400200014000000013EC72-000000067F0000400200014000000014766F__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000140000-000000067F00004002000140000000144000__000000914E3F38F0 000000067F00004002000140000000140000-000000067F00004002000140000000144000__000000931B9A2710 
000000067F00004002000140000000144000-000000067F00004002000140000000148000__000000914E3F38F0 000000067F00004002000140000000144000-000000067F00004002000140000000148000__000000931B9A2710 000000067F0000400200014000000014766F-000000067F00004002000140000000150061__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000148000-000000067F0000400200014000000014C000__000000914E3F38F0 000000067F00004002000140000000148000-000000067F0000400200014000000014C000__000000931B9A2710 000000067F0000400200014000000014C000-000000067F00004002000140000000150000__000000914E3F38F0 000000067F0000400200014000000014C000-000000067F00004002000140000000150000__000000931B9A2710 000000067F00004002000140000000150000-000000067F00004002000140000000154000__000000914E3F38F0 000000067F00004002000140000000150000-000000067F00004002000140000000154000__000000931B9A2710 000000067F00004002000140000000150061-000000067F00004002000140000000158A3C__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000154000-000000067F00004002000140000000158000__000000914E3F38F0 000000067F00004002000140000000154000-000000067F00004002000140000000158000__000000931B9A2710 000000067F00004002000140000000158000-000000067F0000400200014000000015C000__000000914E3F38F0 000000067F00004002000140000000158000-000000067F0000400200014000000015C000__000000931B9A2710 000000067F00004002000140000000158A3C-000000067F000040020001400000001613FB__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000015C000-000000067F00004002000140000000160000__000000914E3F38F0 000000067F0000400200014000000015C000-000000067F00004002000140000000160000__000000931B9A2710 000000067F00004002000140000000160000-000000067F00004002000140000000164000__000000914E3F38F0 000000067F00004002000140000000160000-000000067F00004002000140000000164000__000000931B9A2710 000000067F000040020001400000001613FB-000000067F00004002000140000000169DB2__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000164000-000000067F00004002000140000000168000__000000914E3F38F0 
000000067F00004002000140000000164000-000000067F00004002000140000000168000__000000931B9A2710 000000067F00004002000140000000168000-000000067F0000400200014000000016C000__000000914E3F38F0 000000067F00004002000140000000168000-000000067F0000400200014000000016C000__000000931B9A2710 000000067F00004002000140000000169DB2-000000067F00004002000140000000172788__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000016C000-000000067F00004002000140000000170000__000000914E3F38F0 000000067F0000400200014000000016C000-000000067F00004002000140000000170000__000000931B9A2710 000000067F00004002000140000000170000-000000067F00004002000140000000174000__000000914E3F38F0 000000067F00004002000140000000170000-000000067F00004002000140000000174000__000000931B9A2710 000000067F00004002000140000000172788-000000067F0000400200014000000017B17E__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000174000-000000067F00004002000140000000178000__000000914E3F38F0 000000067F00004002000140000000174000-000000067F00004002000140000000178000__000000931B9A2710 000000067F00004002000140000000178000-000000067F0000400200014000000017C000__000000914E3F38F0 000000067F00004002000140000000178000-000000067F0000400200014000000017C000__000000931B9A2710 000000067F0000400200014000000017B17E-000000067F00004002000140000000183B77__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000017C000-000000067F00004002000140000000180000__000000914E3F38F0 000000067F0000400200014000000017C000-000000067F00004002000140000000180000__000000931B9A2710 000000067F00004002000140000000180000-000000067F00004002000140000000184000__000000914E3F38F0 000000067F00004002000140000000180000-000000067F00004002000140000000184000__000000931B9A2710 000000067F00004002000140000000183B77-000000067F0000400200014000000018C56B__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000184000-000000067F00004002000140000000188000__000000914E3F38F0 000000067F00004002000140000000184000-000000067F00004002000140000000188000__000000931B9A2710 
000000067F00004002000140000000188000-000000067F0000400200014000000018C000__000000914E3F38F0 000000067F00004002000140000000188000-000000067F0000400200014000000018C000__000000931B9A2710 000000067F0000400200014000000018C000-000000067F00004002000140000000190000__000000914E3F38F0 000000067F0000400200014000000018C000-000000067F00004002000140000000190000__000000931B9A2710 000000067F0000400200014000000018C56B-000000067F00004002000140000000194F47__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000190000-000000067F00004002000140000000194000__000000914E3F38F0 000000067F00004002000140000000190000-000000067F00004002000140000000194000__000000931B9A2710 000000067F00004002000140000000194000-000000067F00004002000140000000198000__000000914E3F38F0 000000067F00004002000140000000194000-000000067F00004002000140000000198000__000000931B9A2710 000000067F00004002000140000000194F47-000000067F0000400200014000000019D8FE__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000198000-000000067F0000400200014000000019C000__000000914E3F38F0 000000067F00004002000140000000198000-000000067F0000400200014000000019C000__000000931B9A2710 000000067F0000400200014000000019C000-000000067F000040020001400000001A0000__000000914E3F38F0 000000067F0000400200014000000019C000-000000067F000040020001400000001A0000__000000931B9A2710 000000067F0000400200014000000019D8FE-000000067F000040020001400000001A62B8__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000001A0000-000000067F000040020001400000001A4000__000000914E3F38F0 000000067F000040020001400000001A0000-000000067F000040020001400000001A4000__000000931B9A2710 000000067F000040020001400000001A4000-000000067F000040020001400000001A8000__000000914E3F38F0 000000067F000040020001400000001A4000-000000067F000040020001400000001A8000__000000931B9A2710 000000067F000040020001400000001A62B8-000000067F000040020001400000001AEC8F__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000001A8000-000000067F000040020001400000001AC000__000000914E3F38F0 
000000067F000040020001400000001A8000-000000067F000040020001400000001AC000__000000931B9A2710 000000067F000040020001400000001AC000-000000067F000040020001400000001B0000__000000914E3F38F0 000000067F000040020001400000001AC000-000000067F000040020001400000001B0000__000000931B9A2710 000000067F000040020001400000001AEC8F-000000067F000040020001400000001B7686__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000001B0000-000000067F000040020001400000001B4000__000000914E3F38F0 000000067F000040020001400000001B0000-000000067F000040020001400000001B4000__000000931B9A2710 000000067F000040020001400000001B4000-000000067F000040020001400000001B8000__000000914E3F38F0 000000067F000040020001400000001B4000-000000067F000040020001400000001B8000__000000931B9A2710 000000067F000040020001400000001B7686-000000067F000040020001400000001C0079__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000001B8000-000000067F000040020001400000001BC000__000000914E3F38F0 000000067F000040020001400000001B8000-000000067F000040020001400000001BC000__000000931B9A2710 000000067F000040020001400000001BC000-000000067F000040020001400000001C0000__000000914E3F38F0 000000067F000040020001400000001BC000-000000067F000040020001400000001C0000__000000931B9A2710 000000067F000040020001400000001C0000-000000067F000040020001400000001C4000__000000914E3F38F0 000000067F000040020001400000001C0000-000000067F000040020001400000001C4000__000000931B9A2710 000000067F000040020001400000001C0079-000000067F000040020001400000001C8A6F__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000001C4000-000000067F000040020001400000001C8000__000000914E3F38F0 000000067F000040020001400000001C4000-000000067F000040020001400000001C8000__000000931B9A2710 000000067F000040020001400000001C8000-000000067F000040020001400000001CC000__000000914E3F38F0 000000067F000040020001400000001C8000-000000067F000040020001400000001CC000__000000931B9A2710 000000067F000040020001400000001C8A6F-000000067F000040020001400000001D1442__000000739A920D71-0000008D2DB5E0C1 
000000067F000040020001400000001CC000-000000067F000040020001400000001D0000__000000914E3F38F0 000000067F000040020001400000001CC000-000000067F000040020001400000001D0000__000000931B9A2710 000000067F000040020001400000001D0000-000000067F000040020001400000001D4000__000000914E3F38F0 000000067F000040020001400000001D0000-000000067F000040020001400000001D4000__000000931B9A2710 000000067F000040020001400000001D1442-000000067F000040020001400000001D9DF3__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000001D4000-000000067F000040020001400000001D8000__000000914E3F38F0 000000067F000040020001400000001D4000-000000067F000040020001400000001D8000__000000931B9A2710 000000067F000040020001400000001D8000-000000067F000040020001400000001DC000__000000914E3F38F0 000000067F000040020001400000001D8000-000000067F000040020001400000001DC000__000000931B9A2710 000000067F000040020001400000001D9DF3-000000067F000040020001400000001E27AE__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000001DC000-000000067F000040020001400000001E0000__000000914E3F38F0 000000067F000040020001400000001DC000-000000067F000040020001400000001E0000__000000931B9A2710 000000067F000040020001400000001E0000-000000067F000040020001400000001E4000__000000914E3F38F0 000000067F000040020001400000001E0000-000000067F000040020001400000001E4000__000000931B9A2710 000000067F000040020001400000001E27AE-000000067F000040020001400000001EB193__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000001E4000-000000067F000040020001400000001E8000__000000914E3F38F0 000000067F000040020001400000001E4000-000000067F000040020001400000001E8000__000000931B9A2710 000000067F000040020001400000001E8000-000000067F000040020001400000001EC000__000000914E3F38F0 000000067F000040020001400000001E8000-000000067F000040020001400000001EC000__000000931B9A2710 000000067F000040020001400000001EB193-000000067F000040020001400000001F3B93__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000001EC000-000000067F000040020001400000001F0000__000000914E3F38F0 
000000067F000040020001400000001EC000-000000067F000040020001400000001F0000__000000931B9A2710 000000067F000040020001400000001F0000-000000067F000040020001400000001F4000__000000914E3F38F0 000000067F000040020001400000001F0000-000000067F000040020001400000001F4000__000000931B9A2710 000000067F000040020001400000001F3B93-000000067F000040020001400000001FC594__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000001F4000-000000067F000040020001400000001F8000__000000914E3F38F0 000000067F000040020001400000001F4000-000000067F000040020001400000001F8000__000000931B9A2710 000000067F000040020001400000001F8000-000000067F000040020001400000001FC000__000000914E3F38F0 000000067F000040020001400000001F8000-000000067F000040020001400000001FC000__000000931B9A2710 000000067F000040020001400000001FC000-000000067F00004002000140000000200000__000000914E3F38F0 000000067F000040020001400000001FC000-000000067F00004002000140000000200000__000000931B9A2710 000000067F000040020001400000001FC594-000000067F00004002000140000000204F82__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000200000-000000067F00004002000140000000204000__000000914E3F38F0 000000067F00004002000140000000200000-000000067F00004002000140000000204000__000000931B9A2710 000000067F00004002000140000000204000-000000067F00004002000140000000208000__000000914E3F38F0 000000067F00004002000140000000204000-000000067F00004002000140000000208000__000000931B9A2710 000000067F00004002000140000000204F82-000000067F0000400200014000000020D952__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000208000-000000067F0000400200014000000020C000__000000914E3F38F0 000000067F00004002000140000000208000-000000067F0000400200014000000020C000__000000931B9A2710 000000067F0000400200014000000020C000-000000067F00004002000140000000210000__000000914E3F38F0 000000067F0000400200014000000020C000-000000067F00004002000140000000210000__000000931B9A2710 000000067F0000400200014000000020D952-000000067F00004002000140000000216305__000000739A920D71-0000008D2DB5E0C1 
000000067F00004002000140000000210000-000000067F00004002000140000000214000__000000914E3F38F0 000000067F00004002000140000000210000-000000067F00004002000140000000214000__000000931B9A2710 000000067F00004002000140000000214000-000000067F00004002000140000000218000__000000914E3F38F0 000000067F00004002000140000000214000-000000067F00004002000140000000218000__000000931B9A2710 000000067F00004002000140000000216305-000000067F0000400200014000000021ECB6__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000218000-000000067F0000400200014000000021C000__000000914E3F38F0 000000067F00004002000140000000218000-000000067F0000400200014000000021C000__000000931B9A2710 000000067F0000400200014000000021C000-000000067F00004002000140000000220000__000000914E3F38F0 000000067F0000400200014000000021C000-000000067F00004002000140000000220000__000000931B9A2710 000000067F0000400200014000000021ECB6-000000067F000040020001400000002276A1__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000220000-000000067F00004002000140000000224000__000000914E3F38F0 000000067F00004002000140000000220000-000000067F00004002000140000000224000__000000931B9A2710 000000067F00004002000140000000224000-000000067F00004002000140000000228000__000000914E3F38F0 000000067F00004002000140000000224000-000000067F00004002000140000000228000__000000931B9A2710 000000067F000040020001400000002276A1-000000067F0000400200014000000023009D__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000228000-000000067F0000400200014000000022C000__000000914E3F38F0 000000067F00004002000140000000228000-000000067F0000400200014000000022C000__000000931B9A2710 000000067F0000400200014000000022C000-000000067F00004002000140000000230000__000000914E3F38F0 000000067F0000400200014000000022C000-000000067F00004002000140000000230000__000000931B9A2710 000000067F00004002000140000000230000-000000067F00004002000140000000234000__000000914E3F38F0 000000067F00004002000140000000230000-000000067F00004002000140000000234000__000000931B9A2710 
000000067F0000400200014000000023009D-000000067F00004002000140000000238AA0__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000234000-000000067F00004002000140000000238000__000000914E3F38F0 000000067F00004002000140000000234000-000000067F00004002000140000000238000__000000931B9A2710 000000067F00004002000140000000238000-000000067F0000400200014000000023C000__000000914E3F38F0 000000067F00004002000140000000238000-000000067F0000400200014000000023C000__000000931B9A2710 000000067F00004002000140000000238AA0-000000067F00004002000140000000241480__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000023C000-000000067F00004002000140000000240000__000000914E3F38F0 000000067F0000400200014000000023C000-000000067F00004002000140000000240000__000000931B9A2710 000000067F00004002000140000000240000-000000067F00004002000140000000244000__000000914E3F38F0 000000067F00004002000140000000240000-000000067F00004002000140000000244000__000000931B9A2710 000000067F00004002000140000000241480-000000067F00004002000140000000249E56__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000244000-000000067F00004002000140000000248000__000000914E3F38F0 000000067F00004002000140000000244000-000000067F00004002000140000000248000__000000931B9A2710 000000067F00004002000140000000248000-000000067F0000400200014000000024C000__000000914E3F38F0 000000067F00004002000140000000248000-000000067F0000400200014000000024C000__000000931B9A2710 000000067F00004002000140000000249E56-000000067F00004002000140000000252803__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000024C000-000000067F00004002000140000000250000__000000914E3F38F0 000000067F0000400200014000000024C000-000000067F00004002000140000000250000__000000931B9A2710 000000067F00004002000140000000250000-000000067F00004002000140000000254000__000000914E3F38F0 000000067F00004002000140000000250000-000000067F00004002000140000000254000__000000931B9A2710 
000000067F00004002000140000000252803-000000067F0000400200014000000025B1BA__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000254000-000000067F00004002000140000000258000__000000914E3F38F0 000000067F00004002000140000000254000-000000067F00004002000140000000258000__000000931B9A2710 000000067F00004002000140000000258000-000000067F0000400200014000000025C000__000000914E3F38F0 000000067F00004002000140000000258000-000000067F0000400200014000000025C000__000000931B9A2710 000000067F0000400200014000000025B1BA-000000067F00004002000140000000263BAA__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000025C000-000000067F00004002000140000000260000__000000914E3F38F0 000000067F0000400200014000000025C000-000000067F00004002000140000000260000__000000931B9A2710 000000067F00004002000140000000260000-000000067F00004002000140000000264000__000000914E3F38F0 000000067F00004002000140000000260000-000000067F00004002000140000000264000__000000931B9A2710 000000067F00004002000140000000263BAA-000000067F0000400200014000000026C5A8__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000264000-000000067F00004002000140000000268000__000000914E3F38F0 000000067F00004002000140000000264000-000000067F00004002000140000000268000__000000931B9A2710 000000067F00004002000140000000268000-000000067F0000400200014000000026C000__000000914E3F38F0 000000067F00004002000140000000268000-000000067F0000400200014000000026C000__000000931B9A2710 000000067F0000400200014000000026C000-000000067F00004002000140000000270000__000000914E3F38F0 000000067F0000400200014000000026C000-000000067F00004002000140000000270000__000000931B9A2710 000000067F0000400200014000000026C5A8-000000067F00004002000140000000274FA4__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000270000-000000067F00004002000140000000274000__000000914E3F38F0 000000067F00004002000140000000270000-000000067F00004002000140000000274000__000000931B9A2710 000000067F00004002000140000000274000-000000067F00004002000140000000278000__000000914E3F38F0 
000000067F00004002000140000000274000-000000067F00004002000140000000278000__000000931B9A2710 000000067F00004002000140000000274FA4-000000067F0000400200014000000027D982__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000278000-000000067F0000400200014000000027C000__000000914E3F38F0 000000067F00004002000140000000278000-000000067F0000400200014000000027C000__000000931B9A2710 000000067F0000400200014000000027C000-000000067F00004002000140000000280000__000000914E3F38F0 000000067F0000400200014000000027C000-000000067F00004002000140000000280000__000000931B9A2710 000000067F0000400200014000000027D982-000000067F0000400200014000000028634B__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000280000-000000067F00004002000140000000284000__000000914E3F38F0 000000067F00004002000140000000280000-000000067F00004002000140000000284000__000000931B9A2710 000000067F00004002000140000000284000-000000067F00004002000140000000288000__000000914E3F38F0 000000067F00004002000140000000284000-000000067F00004002000140000000288000__000000931B9A2710 000000067F0000400200014000000028634B-000000067F0000400200014000000028ED00__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000288000-000000067F0000400200014000000028C000__000000914E3F38F0 000000067F00004002000140000000288000-000000067F0000400200014000000028C000__000000931B9A2710 000000067F0000400200014000000028C000-000000067F00004002000140000000290000__000000914E3F38F0 000000067F0000400200014000000028C000-000000067F00004002000140000000290000__000000931B9A2710 000000067F0000400200014000000028ED00-000000067F000040020001400000002976BA__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000290000-000000067F00004002000140000000294000__000000914E3F38F0 000000067F00004002000140000000290000-000000067F00004002000140000000294000__000000931B9A2710 000000067F00004002000140000000294000-000000067F00004002000140000000298000__000000914E3F38F0 000000067F00004002000140000000294000-000000067F00004002000140000000298000__000000931B9A2710 
000000067F000040020001400000002976BA-000000067F000040020001400000002A00B5__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000298000-000000067F0000400200014000000029C000__000000914E3F38F0 000000067F00004002000140000000298000-000000067F0000400200014000000029C000__000000931B9A2710 000000067F0000400200014000000029C000-000000067F000040020001400000002A0000__000000914E3F38F0 000000067F0000400200014000000029C000-000000067F000040020001400000002A0000__000000931B9A2710 000000067F000040020001400000002A0000-000000067F000040020001400000002A4000__000000914E3F38F0 000000067F000040020001400000002A0000-000000067F000040020001400000002A4000__000000931B9A2710 000000067F000040020001400000002A00B5-000000067F000040020001400000002A8AB5__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000002A4000-000000067F000040020001400000002A8000__000000914E3F38F0 000000067F000040020001400000002A4000-000000067F000040020001400000002A8000__000000931B9A2710 000000067F000040020001400000002A8000-000000067F000040020001400000002AC000__000000914E3F38F0 000000067F000040020001400000002A8000-000000067F000040020001400000002AC000__000000931B9A2710 000000067F000040020001400000002A8AB5-000000067F000040020001400000002B14B0__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000002AC000-000000067F000040020001400000002B0000__000000914E3F38F0 000000067F000040020001400000002AC000-000000067F000040020001400000002B0000__000000931B9A2710 000000067F000040020001400000002B0000-000000067F000040020001400000002B4000__000000914E3F38F0 000000067F000040020001400000002B0000-000000067F000040020001400000002B4000__000000931B9A2710 000000067F000040020001400000002B14B0-000000067F000040020001400000002B9E90__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000002B4000-000000067F000040020001400000002B8000__000000914E3F38F0 000000067F000040020001400000002B4000-000000067F000040020001400000002B8000__000000931B9A2710 000000067F000040020001400000002B8000-000000067F000040020001400000002BC000__000000914E3F38F0 
000000067F000040020001400000002B8000-000000067F000040020001400000002BC000__000000931B9A2710 000000067F000040020001400000002B9E90-000000067F000040020001400000002C2852__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000002BC000-000000067F000040020001400000002C0000__000000914E3F38F0 000000067F000040020001400000002BC000-000000067F000040020001400000002C0000__000000931B9A2710 000000067F000040020001400000002C0000-000000067F000040020001400000002C4000__000000914E3F38F0 000000067F000040020001400000002C0000-000000067F000040020001400000002C4000__000000931B9A2710 000000067F000040020001400000002C2852-000000067F000040020001400000002CB205__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000002C4000-000000067F000040020001400000002C8000__000000914E3F38F0 000000067F000040020001400000002C4000-000000067F000040020001400000002C8000__000000931B9A2710 000000067F000040020001400000002C8000-000000067F000040020001400000002CC000__000000914E3F38F0 000000067F000040020001400000002C8000-000000067F000040020001400000002CC000__000000931B9A2710 000000067F000040020001400000002CB205-000000067F000040020001400000002D3BC7__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000002CC000-000000067F000040020001400000002D0000__000000914E3F38F0 000000067F000040020001400000002CC000-000000067F000040020001400000002D0000__000000931B9A2710 000000067F000040020001400000002D0000-000000067F000040020001400000002D4000__000000914E3F38F0 000000067F000040020001400000002D0000-000000067F000040020001400000002D4000__000000931B9A2710 000000067F000040020001400000002D3BC7-000000067F000040020001400000002DC5BB__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000002D4000-000000067F000040020001400000002D8000__000000914E3F38F0 000000067F000040020001400000002D4000-000000067F000040020001400000002D8000__000000931B9A2710 000000067F000040020001400000002D8000-000000067F000040020001400000002DC000__000000914E3F38F0 000000067F000040020001400000002D8000-000000067F000040020001400000002DC000__000000931B9A2710 
000000067F000040020001400000002DC000-000000067F000040020001400000002E0000__000000914E3F38F0 000000067F000040020001400000002DC000-000000067F000040020001400000002E0000__000000931B9A2710 000000067F000040020001400000002DC5BB-000000067F000040020001400000002E4FBB__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000002E0000-000000067F000040020001400000002E4000__000000914E3F38F0 000000067F000040020001400000002E0000-000000067F000040020001400000002E4000__000000931B9A2710 000000067F000040020001400000002E4000-000000067F000040020001400000002E8000__000000914E3F38F0 000000067F000040020001400000002E4000-000000067F000040020001400000002E8000__000000931B9A2710 000000067F000040020001400000002E4FBB-000000067F000040020001400000002ED9B4__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000002E8000-000000067F000040020001400000002EC000__000000914E3F38F0 000000067F000040020001400000002E8000-000000067F000040020001400000002EC000__000000931B9A2710 000000067F000040020001400000002EC000-000000067F000040020001400000002F0000__000000914E3F38F0 000000067F000040020001400000002EC000-000000067F000040020001400000002F0000__000000931B9A2710 000000067F000040020001400000002ED9B4-000000067F000040020001400000002F6390__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000002F0000-000000067F000040020001400000002F4000__000000914E3F38F0 000000067F000040020001400000002F0000-000000067F000040020001400000002F4000__000000931B9A2710 000000067F000040020001400000002F4000-000000067F000040020001400000002F8000__000000914E3F38F0 000000067F000040020001400000002F4000-000000067F000040020001400000002F8000__000000931B9A2710 000000067F000040020001400000002F6390-000000067F000040020001400000002FED51__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000002F8000-000000067F000040020001400000002FC000__000000914E3F38F0 000000067F000040020001400000002F8000-000000067F000040020001400000002FC000__000000931B9A2710 000000067F000040020001400000002FC000-000000067F00004002000140000000300000__000000914E3F38F0 
000000067F000040020001400000002FC000-000000067F00004002000140000000300000__000000931B9A2710 000000067F000040020001400000002FED51-000000067F00004002000140000000307706__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000300000-000000067F00004002000140000000304000__000000914E3F38F0 000000067F00004002000140000000300000-000000067F00004002000140000000304000__000000931B9A2710 000000067F00004002000140000000304000-000000067F00004002000140000000308000__000000914E3F38F0 000000067F00004002000140000000304000-000000067F00004002000140000000308000__000000931B9A2710 000000067F00004002000140000000307706-000000067F000040020001400000003100CD__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000308000-000000067F0000400200014000000030C000__000000914E3F38F0 000000067F00004002000140000000308000-000000067F0000400200014000000030C000__000000931B9A2710 000000067F0000400200014000000030C000-000000067F00004002000140000000310000__000000914E3F38F0 000000067F0000400200014000000030C000-000000067F00004002000140000000310000__000000931B9A2710 000000067F00004002000140000000310000-000000067F00004002000140000000314000__000000914E3F38F0 000000067F00004002000140000000310000-000000067F00004002000140000000314000__000000931B9A2710 000000067F000040020001400000003100CD-000000067F00004002000140000000318AC7__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000314000-000000067F00004002000140000000318000__000000914E3F38F0 000000067F00004002000140000000314000-000000067F00004002000140000000318000__000000931B9A2710 000000067F00004002000140000000318000-000000067F0000400200014000000031C000__000000914E3F38F0 000000067F00004002000140000000318000-000000067F0000400200014000000031C000__000000931B9A2710 000000067F00004002000140000000318AC7-000000067F000040020001400000003214C9__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000031C000-000000067F00004002000140000000320000__000000914E3F38F0 000000067F0000400200014000000031C000-000000067F00004002000140000000320000__000000931B9A2710 
000000067F00004002000140000000320000-000000067F00004002000140000000324000__000000914E3F38F0 000000067F00004002000140000000320000-000000067F00004002000140000000324000__000000931B9A2710 000000067F000040020001400000003214C9-000000067F00004002000140000000329EC1__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000324000-000000067F00004002000140000000328000__000000914E3F38F0 000000067F00004002000140000000324000-000000067F00004002000140000000328000__000000931B9A2710 000000067F00004002000140000000328000-000000067F0000400200014000000032C000__000000914E3F38F0 000000067F00004002000140000000328000-000000067F0000400200014000000032C000__000000931B9A2710 000000067F00004002000140000000329EC1-000000067F0000400200014000000033289A__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000032C000-000000067F00004002000140000000330000__000000914E3F38F0 000000067F0000400200014000000032C000-000000067F00004002000140000000330000__000000931B9A2710 000000067F00004002000140000000330000-000000067F00004002000140000000334000__000000914E3F38F0 000000067F00004002000140000000330000-000000067F00004002000140000000334000__000000931B9A2710 000000067F0000400200014000000033289A-000000067F0000400200014000000033B25C__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000334000-000000067F00004002000140000000338000__000000914E3F38F0 000000067F00004002000140000000334000-000000067F00004002000140000000338000__000000931B9A2710 000000067F00004002000140000000338000-000000067F0000400200014000000033C000__000000914E3F38F0 000000067F00004002000140000000338000-000000067F0000400200014000000033C000__000000931B9A2710 000000067F0000400200014000000033B25C-000000067F00004002000140000000343C39__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000033C000-000000067F00004002000140000000340000__000000914E3F38F0 000000067F0000400200014000000033C000-000000067F00004002000140000000340000__000000931B9A2710 000000067F00004002000140000000340000-000000067F00004002000140000000344000__000000914E3F38F0 
000000067F00004002000140000000340000-000000067F00004002000140000000344000__000000931B9A2710 000000067F00004002000140000000343C39-000000067F0000400200014000000034C60F__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000344000-000000067F00004002000140000000348000__000000914E3F38F0 000000067F00004002000140000000344000-000000067F00004002000140000000348000__000000931B9A2710 000000067F00004002000140000000348000-000000067F0000400200014000000034C000__000000914E3F38F0 000000067F00004002000140000000348000-000000067F0000400200014000000034C000__000000931B9A2710 000000067F0000400200014000000034C000-000000067F00004002000140000000350000__000000914E3F38F0 000000067F0000400200014000000034C000-000000067F00004002000140000000350000__000000931B9A2710 000000067F0000400200014000000034C60F-000000067F00004002000140000000354FEE__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000350000-000000067F00004002000140000000354000__000000914E3F38F0 000000067F00004002000140000000350000-000000067F00004002000140000000354000__000000931B9A2710 000000067F00004002000140000000354000-000000067F00004002000140000000358000__000000914E3F38F0 000000067F00004002000140000000354000-000000067F00004002000140000000358000__000000931B9A2710 000000067F00004002000140000000354FEE-000000067F0000400200014000000035D9E2__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000358000-000000067F0000400200014000000035C000__000000914E3F38F0 000000067F00004002000140000000358000-000000067F0000400200014000000035C000__000000931B9A2710 000000067F0000400200014000000035C000-000000067F00004002000140000000360000__000000914E3F38F0 000000067F0000400200014000000035C000-000000067F00004002000140000000360000__000000931B9A2710 000000067F0000400200014000000035D9E2-000000067F000040020001400000003663D8__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000360000-000000067F00004002000140000000364000__000000914E3F38F0 000000067F00004002000140000000360000-000000067F00004002000140000000364000__000000931B9A2710 
000000067F00004002000140000000364000-000000067F00004002000140000000368000__000000914E3F38F0 000000067F00004002000140000000364000-000000067F00004002000140000000368000__000000931B9A2710 000000067F000040020001400000003663D8-000000067F0000400200014000000036EDB9__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000368000-000000067F0000400200014000000036C000__000000914E3F38F0 000000067F00004002000140000000368000-000000067F0000400200014000000036C000__000000931B9A2710 000000067F0000400200014000000036C000-000000067F00004002000140000000370000__000000914E3F38F0 000000067F0000400200014000000036C000-000000067F00004002000140000000370000__000000931B9A2710 000000067F0000400200014000000036EDB9-000000067F00004002000140000000377794__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000370000-000000067F00004002000140000000374000__000000914E3F38F0 000000067F00004002000140000000370000-000000067F00004002000140000000374000__000000931B9A2710 000000067F00004002000140000000374000-000000067F00004002000140000000378000__000000914E3F38F0 000000067F00004002000140000000374000-000000067F00004002000140000000378000__000000931B9A2710 000000067F00004002000140000000377794-000000067F00004002000140000000380157__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000378000-000000067F0000400200014000000037C000__000000914E3F38F0 000000067F00004002000140000000378000-000000067F0000400200014000000037C000__000000931B9A2710 000000067F0000400200014000000037C000-000000067F00004002000140000000380000__000000914E3F38F0 000000067F0000400200014000000037C000-000000067F00004002000140000000380000__000000931B9A2710 000000067F00004002000140000000380000-000000067F00004002000140000000384000__000000914E3F38F0 000000067F00004002000140000000380000-000000067F00004002000140000000384000__000000931B9A2710 000000067F00004002000140000000380157-000000067F00004002000140000000388B37__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000384000-000000067F00004002000140000000388000__000000914E3F38F0 
000000067F00004002000140000000384000-000000067F00004002000140000000388000__000000931B9A2710 000000067F00004002000140000000388000-000000067F0000400200014000000038C000__000000914E3F38F0 000000067F00004002000140000000388000-000000067F0000400200014000000038C000__000000931B9A2710 000000067F00004002000140000000388B37-000000067F0000400200014000000039151E__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000038C000-000000067F00004002000140000000390000__000000914E3F38F0 000000067F0000400200014000000038C000-000000067F00004002000140000000390000__000000931B9A2710 000000067F00004002000140000000390000-000000067F00004002000140000000394000__000000914E3F38F0 000000067F00004002000140000000390000-000000067F00004002000140000000394000__000000931B9A2710 000000067F0000400200014000000039151E-000000067F00004002000140000000399F01__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000394000-000000067F00004002000140000000398000__000000914E3F38F0 000000067F00004002000140000000394000-000000067F00004002000140000000398000__000000931B9A2710 000000067F00004002000140000000398000-000000067F0000400200014000000039C000__000000914E3F38F0 000000067F00004002000140000000398000-000000067F0000400200014000000039C000__000000931B9A2710 000000067F00004002000140000000399F01-000000067F000040020001400000003A28D5__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000039C000-000000067F000040020001400000003A0000__000000914E3F38F0 000000067F0000400200014000000039C000-000000067F000040020001400000003A0000__000000931B9A2710 000000067F000040020001400000003A0000-000000067F000040020001400000003A4000__000000914E3F38F0 000000067F000040020001400000003A0000-000000067F000040020001400000003A4000__000000931B9A2710 000000067F000040020001400000003A28D5-000000067F000040020001400000003AB2B1__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000003A4000-000000067F000040020001400000003A8000__000000914E3F38F0 000000067F000040020001400000003A4000-000000067F000040020001400000003A8000__000000931B9A2710 
000000067F000040020001400000003A8000-000000067F000040020001400000003AC000__000000914E3F38F0 000000067F000040020001400000003A8000-000000067F000040020001400000003AC000__000000931B9A2710 000000067F000040020001400000003AB2B1-000000067F000040020001400000003B3C78__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000003AC000-000000067F000040020001400000003B0000__000000914E3F38F0 000000067F000040020001400000003AC000-000000067F000040020001400000003B0000__000000931B9A2710 000000067F000040020001400000003B0000-000000067F000040020001400000003B4000__000000914E3F38F0 000000067F000040020001400000003B0000-000000067F000040020001400000003B4000__000000931B9A2710 000000067F000040020001400000003B3C78-000000067F000040020001400000003BC640__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000003B4000-000000067F000040020001400000003B8000__000000914E3F38F0 000000067F000040020001400000003B4000-000000067F000040020001400000003B8000__000000931B9A2710 000000067F000040020001400000003B8000-000000067F000040020001400000003BC000__000000914E3F38F0 000000067F000040020001400000003B8000-000000067F000040020001400000003BC000__000000931B9A2710 000000067F000040020001400000003BC000-000000067F000040020001400000003C0000__000000914E3F38F0 000000067F000040020001400000003BC000-000000067F000040020001400000003C0000__000000931B9A2710 000000067F000040020001400000003BC640-000000067F000040020001400000003C5027__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000003C0000-000000067F000040020001400000003C4000__000000914E3F38F0 000000067F000040020001400000003C0000-000000067F000040020001400000003C4000__000000931B9A2710 000000067F000040020001400000003C4000-000000067F000040020001400000003C8000__000000914E3F38F0 000000067F000040020001400000003C4000-000000067F000040020001400000003C8000__000000931B9A2710 000000067F000040020001400000003C5027-000000067F000040020001400000003CDA16__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000003C8000-000000067F000040020001400000003CC000__000000914E3F38F0 
000000067F000040020001400000003C8000-000000067F000040020001400000003CC000__000000931B9A2710 000000067F000040020001400000003CC000-000000067F000040020001400000003D0000__000000914E3F38F0 000000067F000040020001400000003CC000-000000067F000040020001400000003D0000__000000931B9A2710 000000067F000040020001400000003CDA16-000000067F000040020001400000003D6401__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000003D0000-000000067F000040020001400000003D4000__000000914E3F38F0 000000067F000040020001400000003D0000-000000067F000040020001400000003D4000__000000931B9A2710 000000067F000040020001400000003D4000-000000067F000040020001400000003D8000__000000914E3F38F0 000000067F000040020001400000003D4000-000000067F000040020001400000003D8000__000000931B9A2710 000000067F000040020001400000003D6401-000000067F000040020001400000003DEDD4__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000003D8000-000000067F000040020001400000003DC000__000000914E3F38F0 000000067F000040020001400000003D8000-000000067F000040020001400000003DC000__000000931B9A2710 000000067F000040020001400000003DC000-000000067F000040020001400000003E0000__000000914E3F38F0 000000067F000040020001400000003DC000-000000067F000040020001400000003E0000__000000931B9A2710 000000067F000040020001400000003DEDD4-000000067F000040020001400000003E77A4__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000003E0000-000000067F000040020001400000003E4000__000000914E3F38F0 000000067F000040020001400000003E0000-000000067F000040020001400000003E4000__000000931B9A2710 000000067F000040020001400000003E4000-000000067F000040020001400000003E8000__000000914E3F38F0 000000067F000040020001400000003E4000-000000067F000040020001400000003E8000__000000931B9A2710 000000067F000040020001400000003E77A4-000000067F000040020001400000003F016A__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000003E8000-000000067F000040020001400000003EC000__000000914E3F38F0 000000067F000040020001400000003E8000-000000067F000040020001400000003EC000__000000931B9A2710 
000000067F000040020001400000003EC000-000000067F000040020001400000003F0000__000000914E3F38F0 000000067F000040020001400000003EC000-000000067F000040020001400000003F0000__000000931B9A2710 000000067F000040020001400000003F0000-000000067F000040020001400000003F4000__000000914E3F38F0 000000067F000040020001400000003F0000-000000067F000040020001400000003F4000__000000931B9A2710 000000067F000040020001400000003F016A-000000067F000040020001400000003F8B44__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000003F4000-000000067F000040020001400000003F8000__000000914E3F38F0 000000067F000040020001400000003F4000-000000067F000040020001400000003F8000__000000931B9A2710 000000067F000040020001400000003F8000-000000067F000040020001400000003FC000__000000914E3F38F0 000000067F000040020001400000003F8000-000000067F000040020001400000003FC000__000000931B9A2710 000000067F000040020001400000003F8B44-000000067F0000400200014000000040152F__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000003FC000-000000067F00004002000140000000400000__000000914E3F38F0 000000067F000040020001400000003FC000-000000067F00004002000140000000400000__000000931B9A2710 000000067F00004002000140000000400000-000000067F00004002000140000000404000__000000914E3F38F0 000000067F00004002000140000000400000-000000067F00004002000140000000404000__000000931B9A2710 000000067F0000400200014000000040152F-000000067F00004002000140000000409F1B__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000404000-000000067F00004002000140000000408000__000000914E3F38F0 000000067F00004002000140000000404000-000000067F00004002000140000000408000__000000931B9A2710 000000067F00004002000140000000408000-000000067F0000400200014000000040C000__000000914E3F38F0 000000067F00004002000140000000408000-000000067F0000400200014000000040C000__000000931B9A2710 000000067F00004002000140000000409F1B-000000067F000040020001400000004128FB__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000040C000-000000067F00004002000140000000410000__000000914E3F38F0 
000000067F0000400200014000000040C000-000000067F00004002000140000000410000__000000931B9A2710 000000067F00004002000140000000410000-000000067F00004002000140000000414000__000000914E3F38F0 000000067F00004002000140000000410000-000000067F00004002000140000000414000__000000931B9A2710 000000067F000040020001400000004128FB-000000067F0000400200014000000041B2E2__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000414000-000000067F00004002000140000000418000__000000914E3F38F0 000000067F00004002000140000000414000-000000067F00004002000140000000418000__000000931B9A2710 000000067F00004002000140000000418000-000000067F0000400200014000000041C000__000000914E3F38F0 000000067F00004002000140000000418000-000000067F0000400200014000000041C000__000000931B9A2710 000000067F0000400200014000000041B2E2-000000067F00004002000140000000423CB0__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000041C000-000000067F00004002000140000000420000__000000914E3F38F0 000000067F0000400200014000000041C000-000000067F00004002000140000000420000__000000931B9A2710 000000067F00004002000140000000420000-000000067F00004002000140000000424000__000000914E3F38F0 000000067F00004002000140000000420000-000000067F00004002000140000000424000__000000931B9A2710 000000067F00004002000140000000423CB0-000000067F0000400200014000000042C674__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000424000-000000067F00004002000140000000428000__000000914E3F38F0 000000067F00004002000140000000424000-000000067F00004002000140000000428000__000000931B9A2710 000000067F00004002000140000000428000-000000067F0000400200014000000042C000__000000914E3F38F0 000000067F00004002000140000000428000-000000067F0000400200014000000042C000__000000931B9A2710 000000067F0000400200014000000042C000-000000067F00004002000140000000430000__000000914E3F38F0 000000067F0000400200014000000042C000-000000067F00004002000140000000430000__000000931B9A2710 000000067F0000400200014000000042C674-000000067F00004002000140000000435044__000000739A920D71-0000008D2DB5E0C1 
000000067F00004002000140000000430000-000000067F00004002000140000000434000__000000914E3F38F0 000000067F00004002000140000000430000-000000067F00004002000140000000434000__000000931B9A2710 000000067F00004002000140000000434000-000000067F00004002000140000000438000__000000914E3F38F0 000000067F00004002000140000000434000-000000067F00004002000140000000438000__000000931B9A2710 000000067F00004002000140000000435044-000000067F0000400200014000000043DA33__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000438000-000000067F0000400200014000000043C000__000000914E3F38F0 000000067F00004002000140000000438000-000000067F0000400200014000000043C000__000000931B9A2710 000000067F0000400200014000000043C000-000000067F00004002000140000000440000__000000914E3F38F0 000000067F0000400200014000000043C000-000000067F00004002000140000000440000__000000931B9A2710 000000067F0000400200014000000043DA33-000000067F0000400200014000000044641A__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000440000-000000067F00004002000140000000444000__000000914E3F38F0 000000067F00004002000140000000440000-000000067F00004002000140000000444000__000000931B9A2710 000000067F00004002000140000000444000-000000067F00004002000140000000448000__000000914E3F38F0 000000067F00004002000140000000444000-000000067F00004002000140000000448000__000000931B9A2710 000000067F0000400200014000000044641A-000000067F0000400200014000000044EDF8__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000448000-000000067F0000400200014000000044C000__000000914E3F38F0 000000067F00004002000140000000448000-000000067F0000400200014000000044C000__000000931B9A2710 000000067F0000400200014000000044C000-000000067F00004002000140000000450000__000000914E3F38F0 000000067F0000400200014000000044C000-000000067F00004002000140000000450000__000000931B9A2710 000000067F0000400200014000000044EDF8-000000067F000040020001400000004577D5__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000450000-000000067F00004002000140000000454000__000000914E3F38F0 
000000067F00004002000140000000450000-000000067F00004002000140000000454000__000000931B9A2710 000000067F00004002000140000000454000-000000067F00004002000140000000458000__000000914E3F38F0 000000067F00004002000140000000454000-000000067F00004002000140000000458000__000000931B9A2710 000000067F000040020001400000004577D5-000000067F000040020001400000004601A6__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000458000-000000067F0000400200014000000045C000__000000914E3F38F0 000000067F00004002000140000000458000-000000067F0000400200014000000045C000__000000931B9A2710 000000067F0000400200014000000045C000-000000067F00004002000140000000460000__000000914E3F38F0 000000067F0000400200014000000045C000-000000067F00004002000140000000460000__000000931B9A2710 000000067F00004002000140000000460000-000000067F00004002000140000000464000__000000914E3F38F0 000000067F00004002000140000000460000-000000067F00004002000140000000464000__000000931B9A2710 000000067F000040020001400000004601A6-000000067F00004002000140000000468B73__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000464000-000000067F00004002000140000000468000__000000914E3F38F0 000000067F00004002000140000000464000-000000067F00004002000140000000468000__000000931B9A2710 000000067F00004002000140000000468000-000000067F0000400200014000000046C000__000000914E3F38F0 000000067F00004002000140000000468000-000000067F0000400200014000000046C000__000000931B9A2710 000000067F00004002000140000000468B73-000000067F00004002000140000000471550__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000046C000-000000067F00004002000140000000470000__000000914E3F38F0 000000067F0000400200014000000046C000-000000067F00004002000140000000470000__000000931B9A2710 000000067F00004002000140000000470000-000000067F00004002000140000000474000__000000914E3F38F0 000000067F00004002000140000000470000-000000067F00004002000140000000474000__000000931B9A2710 000000067F00004002000140000000471550-000000067F00004002000140000000479F3B__000000739A920D71-0000008D2DB5E0C1 
000000067F00004002000140000000474000-000000067F00004002000140000000478000__000000914E3F38F0 000000067F00004002000140000000474000-000000067F00004002000140000000478000__000000931B9A2710 000000067F00004002000140000000478000-000000067F0000400200014000000047C000__000000914E3F38F0 000000067F00004002000140000000478000-000000067F0000400200014000000047C000__000000931B9A2710 000000067F00004002000140000000479F3B-000000067F00004002000140000000482925__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000047C000-000000067F00004002000140000000480000__000000914E3F38F0 000000067F0000400200014000000047C000-000000067F00004002000140000000480000__000000931B9A2710 000000067F00004002000140000000480000-000000067F00004002000140000000484000__000000914E3F38F0 000000067F00004002000140000000480000-000000067F00004002000140000000484000__000000931B9A2710 000000067F00004002000140000000482925-000000067F0000400200014000000048B308__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000484000-000000067F00004002000140000000488000__000000914E3F38F0 000000067F00004002000140000000484000-000000067F00004002000140000000488000__000000931B9A2710 000000067F00004002000140000000488000-000000067F0000400200014000000048C000__000000914E3F38F0 000000067F00004002000140000000488000-000000067F0000400200014000000048C000__000000931B9A2710 000000067F0000400200014000000048B308-000000067F00004002000140000000493CD0__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000048C000-000000067F00004002000140000000490000__000000914E3F38F0 000000067F0000400200014000000048C000-000000067F00004002000140000000490000__000000931B9A2710 000000067F00004002000140000000490000-000000067F00004002000140000000494000__000000914E3F38F0 000000067F00004002000140000000490000-000000067F00004002000140000000494000__000000931B9A2710 000000067F00004002000140000000493CD0-000000067F0000400200014000000049C6A6__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000494000-000000067F00004002000140000000498000__000000914E3F38F0 
000000067F00004002000140000000494000-000000067F00004002000140000000498000__000000931B9A2710 000000067F00004002000140000000498000-000000067F0000400200014000000049C000__000000914E3F38F0 000000067F00004002000140000000498000-000000067F0000400200014000000049C000__000000931B9A2710 000000067F0000400200014000000049C000-000000067F000040020001400000004A0000__000000914E3F38F0 000000067F0000400200014000000049C000-000000067F000040020001400000004A0000__000000931B9A2710 000000067F0000400200014000000049C6A6-000000067F000040020001400000004A506F__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000004A0000-000000067F000040020001400000004A4000__000000914E3F38F0 000000067F000040020001400000004A0000-000000067F000040020001400000004A4000__000000931B9A2710 000000067F000040020001400000004A4000-000000067F000040020001400000004A8000__000000914E3F38F0 000000067F000040020001400000004A4000-000000067F000040020001400000004A8000__000000931B9A2710 000000067F000040020001400000004A506F-000000067F000040020001400000004ADA52__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000004A8000-000000067F000040020001400000004AC000__000000914E3F38F0 000000067F000040020001400000004A8000-000000067F000040020001400000004AC000__000000931B9A2710 000000067F000040020001400000004AC000-000000067F000040020001400000004B0000__000000914E3F38F0 000000067F000040020001400000004AC000-000000067F000040020001400000004B0000__000000931B9A2710 000000067F000040020001400000004ADA52-000000067F000040020001400000004B6437__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000004B0000-000000067F000040020001400000004B4000__000000914E3F38F0 000000067F000040020001400000004B0000-000000067F000040020001400000004B4000__000000931B9A2710 000000067F000040020001400000004B4000-000000067F000040020001400000004B8000__000000914E3F38F0 000000067F000040020001400000004B4000-000000067F000040020001400000004B8000__000000931B9A2710 000000067F000040020001400000004B6437-000000067F000040020001400000004BEE1E__000000739A920D71-0000008D2DB5E0C1 
000000067F000040020001400000004B8000-000000067F000040020001400000004BC000__000000914E3F38F0 000000067F000040020001400000004B8000-000000067F000040020001400000004BC000__000000931B9A2710 000000067F000040020001400000004BC000-000000067F000040020001400000004C0000__000000914E3F38F0 000000067F000040020001400000004BC000-000000067F000040020001400000004C0000__000000931B9A2710 000000067F000040020001400000004BEE1E-000000067F000040020001400000004C77FB__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000004C0000-000000067F000040020001400000004C4000__000000914E3F38F0 000000067F000040020001400000004C0000-000000067F000040020001400000004C4000__000000931B9A2710 000000067F000040020001400000004C4000-000000067F000040020001400000004C8000__000000914E3F38F0 000000067F000040020001400000004C4000-000000067F000040020001400000004C8000__000000931B9A2710 000000067F000040020001400000004C77FB-000000067F000040020001400000004D01CF__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000004C8000-000000067F000040020001400000004CC000__000000914E3F38F0 000000067F000040020001400000004C8000-000000067F000040020001400000004CC000__000000931B9A2710 000000067F000040020001400000004CC000-000000067F000040020001400000004D0000__000000914E3F38F0 000000067F000040020001400000004CC000-000000067F000040020001400000004D0000__000000931B9A2710 000000067F000040020001400000004D0000-000000067F000040020001400000004D4000__000000914E3F38F0 000000067F000040020001400000004D0000-000000067F000040020001400000004D4000__000000931B9A2710 000000067F000040020001400000004D01CF-000000067F000040020001400000004D8B9F__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000004D4000-000000067F000040020001400000004D8000__000000914E3F38F0 000000067F000040020001400000004D4000-000000067F000040020001400000004D8000__000000931B9A2710 000000067F000040020001400000004D8000-000000067F000040020001400000004DC000__000000914E3F38F0 000000067F000040020001400000004D8000-000000067F000040020001400000004DC000__000000931B9A2710 
000000067F000040020001400000004D8B9F-000000067F000040020001400000004E1565__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000004DC000-000000067F000040020001400000004E0000__000000914E3F38F0 000000067F000040020001400000004DC000-000000067F000040020001400000004E0000__000000931B9A2710 000000067F000040020001400000004E0000-000000067F000040020001400000004E4000__000000914E3F38F0 000000067F000040020001400000004E0000-000000067F000040020001400000004E4000__000000931B9A2710 000000067F000040020001400000004E1565-000000067F000040020001400000004E9F47__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000004E4000-000000067F000040020001400000004E8000__000000914E3F38F0 000000067F000040020001400000004E4000-000000067F000040020001400000004E8000__000000931B9A2710 000000067F000040020001400000004E8000-000000067F000040020001400000004EC000__000000914E3F38F0 000000067F000040020001400000004E8000-000000067F000040020001400000004EC000__000000931B9A2710 000000067F000040020001400000004E9F47-000000067F000040020001400000004F2937__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000004EC000-000000067F000040020001400000004F0000__000000914E3F38F0 000000067F000040020001400000004EC000-000000067F000040020001400000004F0000__000000931B9A2710 000000067F000040020001400000004F0000-000000067F000040020001400000004F4000__000000914E3F38F0 000000067F000040020001400000004F0000-000000067F000040020001400000004F4000__000000931B9A2710 000000067F000040020001400000004F2937-000000067F000040020001400000004FB31B__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000004F4000-000000067F000040020001400000004F8000__000000914E3F38F0 000000067F000040020001400000004F4000-000000067F000040020001400000004F8000__000000931B9A2710 000000067F000040020001400000004F8000-000000067F000040020001400000004FC000__000000914E3F38F0 000000067F000040020001400000004F8000-000000067F000040020001400000004FC000__000000931B9A2710 
000000067F000040020001400000004FB31B-000000067F00004002000140000000503CF8__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000004FC000-000000067F00004002000140000000500000__000000914E3F38F0 000000067F000040020001400000004FC000-000000067F00004002000140000000500000__000000931B9A2710 000000067F00004002000140000000500000-000000067F00004002000140000000504000__000000914E3F38F0 000000067F00004002000140000000500000-000000067F00004002000140000000504000__000000931B9A2710 000000067F00004002000140000000503CF8-000000067F0000400200014000000050C6D3__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000504000-000000067F00004002000140000000508000__000000914E3F38F0 000000067F00004002000140000000504000-000000067F00004002000140000000508000__000000931B9A2710 000000067F00004002000140000000508000-000000067F0000400200014000000050C000__000000914E3F38F0 000000067F00004002000140000000508000-000000067F0000400200014000000050C000__000000931B9A2710 000000067F0000400200014000000050C000-000000067F00004002000140000000510000__000000914E3F38F0 000000067F0000400200014000000050C000-000000067F00004002000140000000510000__000000931B9A2710 000000067F0000400200014000000050C6D3-000000067F000040020001400000005150A6__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000510000-000000067F00004002000140000000514000__000000914E3F38F0 000000067F00004002000140000000510000-000000067F00004002000140000000514000__000000931B9A2710 000000067F00004002000140000000514000-000000067F00004002000140000000518000__000000914E3F38F0 000000067F00004002000140000000514000-000000067F00004002000140000000518000__000000931B9A2710 000000067F000040020001400000005150A6-000000067F0000400200014000000051DA77__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000518000-000000067F0000400200014000000051C000__000000914E3F38F0 000000067F00004002000140000000518000-000000067F0000400200014000000051C000__000000931B9A2710 000000067F0000400200014000000051C000-000000067F00004002000140000000520000__000000914E3F38F0 
000000067F0000400200014000000051C000-000000067F00004002000140000000520000__000000931B9A2710 000000067F0000400200014000000051DA77-000000067F0000400200014000000052645E__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000520000-000000067F00004002000140000000524000__000000914E3F38F0 000000067F00004002000140000000520000-000000067F00004002000140000000524000__000000931B9A2710 000000067F00004002000140000000524000-000000067F00004002000140000000528000__000000914E3F38F0 000000067F00004002000140000000524000-000000067F00004002000140000000528000__000000931B9A2710 000000067F0000400200014000000052645E-000000067F0000400200014000000052EE48__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000528000-000000067F0000400200014000000052C000__000000914E3F38F0 000000067F00004002000140000000528000-000000067F0000400200014000000052C000__000000931B9A2710 000000067F0000400200014000000052C000-000000067F00004002000140000000530000__000000914E3F38F0 000000067F0000400200014000000052C000-000000067F00004002000140000000530000__000000931B9A2710 000000067F0000400200014000000052EE48-000000067F00004002000140000000537826__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000530000-000000067F00004002000140000000534000__000000914E3F38F0 000000067F00004002000140000000530000-000000067F00004002000140000000534000__000000931B9A2710 000000067F00004002000140000000534000-000000067F00004002000140000000538000__000000914E3F38F0 000000067F00004002000140000000534000-000000067F00004002000140000000538000__000000931B9A2710 000000067F00004002000140000000537826-000000067F00004002000140000000540201__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000538000-000000067F0000400200014000000053C000__000000914E3F38F0 000000067F00004002000140000000538000-000000067F0000400200014000000053C000__000000931B9A2710 000000067F0000400200014000000053C000-000000067F00004002000140000000540000__000000914E3F38F0 000000067F0000400200014000000053C000-000000067F00004002000140000000540000__000000931B9A2710 
000000067F00004002000140000000540000-000000067F00004002000140000000544000__000000914E3F38F0 000000067F00004002000140000000540000-000000067F00004002000140000000544000__000000931B9A2710 000000067F00004002000140000000540201-000000067F00004002000140000000548BCA__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000544000-000000067F00004002000140000000548000__000000914E3F38F0 000000067F00004002000140000000544000-000000067F00004002000140000000548000__000000931B9A2710 000000067F00004002000140000000548000-000000067F0000400200014000000054C000__000000914E3F38F0 000000067F00004002000140000000548000-000000067F0000400200014000000054C000__000000931B9A2710 000000067F00004002000140000000548BCA-000000067F0000400200014000000055159D__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000054C000-000000067F00004002000140000000550000__000000914E3F38F0 000000067F0000400200014000000054C000-000000067F00004002000140000000550000__000000931B9A2710 000000067F00004002000140000000550000-000000067F00004002000140000000554000__000000914E3F38F0 000000067F00004002000140000000550000-000000067F00004002000140000000554000__000000931B9A2710 000000067F0000400200014000000055159D-000000067F00004002000140000000559F6D__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000554000-000000067F00004002000140000000558000__000000914E3F38F0 000000067F00004002000140000000554000-000000067F00004002000140000000558000__000000931B9A2710 000000067F00004002000140000000558000-000000067F0000400200014000000055C000__000000914E3F38F0 000000067F00004002000140000000558000-000000067F0000400200014000000055C000__000000931B9A2710 000000067F00004002000140000000559F6D-000000067F00004002000140000000562956__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000055C000-000000067F00004002000140000000560000__000000914E3F38F0 000000067F0000400200014000000055C000-000000067F00004002000140000000560000__000000931B9A2710 000000067F00004002000140000000560000-000000067F00004002000140000000564000__000000914E3F38F0 
000000067F00004002000140000000560000-000000067F00004002000140000000564000__000000931B9A2710 000000067F00004002000140000000562956-000000067F0000400200014000000056B340__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000564000-000000067F00004002000140000000568000__000000914E3F38F0 000000067F00004002000140000000564000-000000067F00004002000140000000568000__000000931B9A2710 000000067F00004002000140000000568000-000000067F0000400200014000000056C000__000000914E3F38F0 000000067F00004002000140000000568000-000000067F0000400200014000000056C000__000000931B9A2710 000000067F0000400200014000000056B340-000000067F00004002000140000000573D1E__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000056C000-000000067F00004002000140000000570000__000000914E3F38F0 000000067F0000400200014000000056C000-000000067F00004002000140000000570000__000000931B9A2710 000000067F00004002000140000000570000-000000067F00004002000140000000574000__000000914E3F38F0 000000067F00004002000140000000570000-000000067F00004002000140000000574000__000000931B9A2710 000000067F00004002000140000000573D1E-000000067F0000400200014000000057C6F0__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000574000-000000067F00004002000140000000578000__000000914E3F38F0 000000067F00004002000140000000574000-000000067F00004002000140000000578000__000000931B9A2710 000000067F00004002000140000000578000-000000067F0000400200014000000057C000__000000914E3F38F0 000000067F00004002000140000000578000-000000067F0000400200014000000057C000__000000931B9A2710 000000067F0000400200014000000057C000-000000067F00004002000140000000580000__000000914E3F38F0 000000067F0000400200014000000057C000-000000067F00004002000140000000580000__000000931B9A2710 000000067F0000400200014000000057C6F0-000000067F000040020001400000005850C8__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000580000-000000067F00004002000140000000584000__000000914E3F38F0 000000067F00004002000140000000580000-000000067F00004002000140000000584000__000000931B9A2710 
000000067F00004002000140000000584000-000000067F00004002000140000000588000__000000914E3F38F0 000000067F00004002000140000000584000-000000067F00004002000140000000588000__000000931B9A2710 000000067F000040020001400000005850C8-000000067F0000400200014000000058DA94__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000588000-000000067F0000400200014000000058C000__000000914E3F38F0 000000067F00004002000140000000588000-000000067F0000400200014000000058C000__000000931B9A2710 000000067F0000400200014000000058C000-000000067F00004002000140000000590000__000000914E3F38F0 000000067F0000400200014000000058C000-000000067F00004002000140000000590000__000000931B9A2710 000000067F0000400200014000000058DA94-000000067F00004002000140000000596465__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000590000-000000067F00004002000140000000594000__000000914E3F38F0 000000067F00004002000140000000590000-000000067F00004002000140000000594000__000000931B9A2710 000000067F00004002000140000000594000-000000067F00004002000140000000598000__000000914E3F38F0 000000067F00004002000140000000594000-000000067F00004002000140000000598000__000000931B9A2710 000000067F00004002000140000000596465-000000067F0000400200014000000059EE53__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000598000-000000067F0000400200014000000059C000__000000914E3F38F0 000000067F00004002000140000000598000-000000067F0000400200014000000059C000__000000931B9A2710 000000067F0000400200014000000059C000-000000067F000040020001400000005A0000__000000914E3F38F0 000000067F0000400200014000000059C000-000000067F000040020001400000005A0000__000000931B9A2710 000000067F0000400200014000000059EE53-000000067F000040020001400000005A783C__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000005A0000-000000067F000040020001400000005A4000__000000914E3F38F0 000000067F000040020001400000005A0000-000000067F000040020001400000005A4000__000000931B9A2710 000000067F000040020001400000005A4000-000000067F000040020001400000005A8000__000000914E3F38F0 
000000067F000040020001400000005A4000-000000067F000040020001400000005A8000__000000931B9A2710 000000067F000040020001400000005A783C-000000067F000040020001400000005B0217__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000005A8000-000000067F000040020001400000005AC000__000000914E3F38F0 000000067F000040020001400000005A8000-000000067F000040020001400000005AC000__000000931B9A2710 000000067F000040020001400000005AC000-000000067F000040020001400000005B0000__000000914E3F38F0 000000067F000040020001400000005AC000-000000067F000040020001400000005B0000__000000931B9A2710 000000067F000040020001400000005B0000-000000067F000040020001400000005B4000__000000914E3F38F0 000000067F000040020001400000005B0000-000000067F000040020001400000005B4000__000000931B9A2710 000000067F000040020001400000005B0217-000000067F000040020001400000005B8BF1__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000005B4000-000000067F000040020001400000005B8000__000000914E3F38F0 000000067F000040020001400000005B4000-000000067F000040020001400000005B8000__000000931B9A2710 000000067F000040020001400000005B8000-000000067F000040020001400000005BC000__000000914E3F38F0 000000067F000040020001400000005B8000-000000067F000040020001400000005BC000__000000931B9A2710 000000067F000040020001400000005B8BF1-000000067F000040020001400000005C15C5__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000005BC000-000000067F000040020001400000005C0000__000000914E3F38F0 000000067F000040020001400000005BC000-000000067F000040020001400000005C0000__000000931B9A2710 000000067F000040020001400000005C0000-000000067F000040020001400000005C4000__000000914E3F38F0 000000067F000040020001400000005C0000-000000067F000040020001400000005C4000__000000931B9A2710 000000067F000040020001400000005C15C5-000000067F000040020001400000005C9F94__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000005C4000-000000067F000040020001400000005C8000__000000914E3F38F0 000000067F000040020001400000005C4000-000000067F000040020001400000005C8000__000000931B9A2710 
000000067F000040020001400000005C8000-000000067F000040020001400000005CC000__000000914E3F38F0 000000067F000040020001400000005C8000-000000067F000040020001400000005CC000__000000931B9A2710 000000067F000040020001400000005C9F94-000000067F000040020001400000005D2970__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000005CC000-000000067F000040020001400000005D0000__000000914E3F38F0 000000067F000040020001400000005CC000-000000067F000040020001400000005D0000__000000931B9A2710 000000067F000040020001400000005D0000-000000067F000040020001400000005D4000__000000914E3F38F0 000000067F000040020001400000005D0000-000000067F000040020001400000005D4000__000000931B9A2710 000000067F000040020001400000005D2970-000000067F000040020001400000005DB35D__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000005D4000-000000067F000040020001400000005D8000__000000914E3F38F0 000000067F000040020001400000005D4000-000000067F000040020001400000005D8000__000000931B9A2710 000000067F000040020001400000005D8000-000000067F000040020001400000005DC000__000000914E3F38F0 000000067F000040020001400000005D8000-000000067F000040020001400000005DC000__000000931B9A2710 000000067F000040020001400000005DB35D-000000067F000040020001400000005E3D3C__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000005DC000-000000067F000040020001400000005E0000__000000914E3F38F0 000000067F000040020001400000005DC000-000000067F000040020001400000005E0000__000000931B9A2710 000000067F000040020001400000005E0000-000000067F000040020001400000005E4000__000000914E3F38F0 000000067F000040020001400000005E0000-000000067F000040020001400000005E4000__000000931B9A2710 000000067F000040020001400000005E3D3C-000000067F000040020001400000005EC713__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000005E4000-000000067F000040020001400000005E8000__000000914E3F38F0 000000067F000040020001400000005E4000-000000067F000040020001400000005E8000__000000931B9A2710 000000067F000040020001400000005E8000-000000067F000040020001400000005EC000__000000914E3F38F0 
000000067F000040020001400000005E8000-000000067F000040020001400000005EC000__000000931B9A2710 000000067F000040020001400000005EC000-000000067F000040020001400000005F0000__000000914E3F38F0 000000067F000040020001400000005EC000-000000067F000040020001400000005F0000__000000931B9A2710 000000067F000040020001400000005EC713-000000067F000040020001400000005F50E5__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000005F0000-000000067F000040020001400000005F4000__000000914E3F38F0 000000067F000040020001400000005F0000-000000067F000040020001400000005F4000__000000931B9A2710 000000067F000040020001400000005F4000-000000067F000040020001400000005F8000__000000914E3F38F0 000000067F000040020001400000005F4000-000000067F000040020001400000005F8000__000000931B9A2710 000000067F000040020001400000005F50E5-000000067F000040020001400000005FDAC2__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000005F8000-000000067F000040020001400000005FC000__000000914E3F38F0 000000067F000040020001400000005F8000-000000067F000040020001400000005FC000__000000931B9A2710 000000067F000040020001400000005FC000-000000067F00004002000140000000600000__000000914E3F38F0 000000067F000040020001400000005FC000-000000067F00004002000140000000600000__000000931B9A2710 000000067F000040020001400000005FDAC2-000000067F0000400200014000000060648F__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000600000-000000067F00004002000140000000604000__000000914E3F38F0 000000067F00004002000140000000600000-000000067F00004002000140000000604000__000000931B9A2710 000000067F00004002000140000000604000-000000067F00004002000140000000608000__000000914E3F38F0 000000067F00004002000140000000604000-000000067F00004002000140000000608000__000000931B9A2710 000000067F0000400200014000000060648F-000000067F0000400200014000000060EE6E__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000608000-000000067F0000400200014000000060C000__000000914E3F38F0 000000067F00004002000140000000608000-000000067F0000400200014000000060C000__000000931B9A2710 
000000067F0000400200014000000060C000-000000067F00004002000140000000610000__000000914E3F38F0 000000067F0000400200014000000060C000-000000067F00004002000140000000610000__000000931B9A2710 000000067F0000400200014000000060EE6E-000000067F00004002000140000000617862__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000610000-000000067F00004002000140000000614000__000000914E3F38F0 000000067F00004002000140000000610000-000000067F00004002000140000000614000__000000931B9A2710 000000067F00004002000140000000614000-000000067F00004002000140000000618000__000000914E3F38F0 000000067F00004002000140000000614000-000000067F00004002000140000000618000__000000931B9A2710 000000067F00004002000140000000617862-000000067F0000400200014000000062024A__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000618000-000000067F0000400200014000000061C000__000000914E3F38F0 000000067F00004002000140000000618000-000000067F0000400200014000000061C000__000000931B9A2710 000000067F0000400200014000000061C000-000000067F00004002000140000000620000__000000914E3F38F0 000000067F0000400200014000000061C000-000000067F00004002000140000000620000__000000931B9A2710 000000067F00004002000140000000620000-000000067F00004002000140000000624000__000000914E3F38F0 000000067F00004002000140000000620000-000000067F00004002000140000000624000__000000931B9A2710 000000067F0000400200014000000062024A-000000067F00004002000140000000628C1D__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000624000-000000067F00004002000140000000628000__000000914E3F38F0 000000067F00004002000140000000624000-000000067F00004002000140000000628000__000000931B9A2710 000000067F00004002000140000000628000-000000067F0000400200014000000062C000__000000914E3F38F0 000000067F00004002000140000000628000-000000067F0000400200014000000062C000__000000931B9A2710 000000067F00004002000140000000628C1D-000000067F000040020001400000006315E2__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000062C000-000000067F00004002000140000000630000__000000914E3F38F0 
000000067F0000400200014000000062C000-000000067F00004002000140000000630000__000000931B9A2710 000000067F00004002000140000000630000-000000067F00004002000140000000634000__000000914E3F38F0 000000067F00004002000140000000630000-000000067F00004002000140000000634000__000000931B9A2710 000000067F000040020001400000006315E2-000000067F00004002000140000000639FBE__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000634000-000000067F00004002000140000000638000__000000914E3F38F0 000000067F00004002000140000000634000-000000067F00004002000140000000638000__000000931B9A2710 000000067F00004002000140000000638000-000000067F0000400200014000000063C000__000000914E3F38F0 000000067F00004002000140000000638000-000000067F0000400200014000000063C000__000000931B9A2710 000000067F00004002000140000000639FBE-000000067F00004002000140000000642995__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000063C000-000000067F00004002000140000000640000__000000914E3F38F0 000000067F0000400200014000000063C000-000000067F00004002000140000000640000__000000931B9A2710 000000067F00004002000140000000640000-000000067F00004002000140000000644000__000000914E3F38F0 000000067F00004002000140000000640000-000000067F00004002000140000000644000__000000931B9A2710 000000067F00004002000140000000642995-000000067F0000400200014000000064B370__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000644000-000000067F00004002000140000000648000__000000914E3F38F0 000000067F00004002000140000000644000-000000067F00004002000140000000648000__000000931B9A2710 000000067F00004002000140000000648000-000000067F0000400200014000000064C000__000000914E3F38F0 000000067F00004002000140000000648000-000000067F0000400200014000000064C000__000000931B9A2710 000000067F0000400200014000000064B370-000000067F00004002000140000000653D64__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000064C000-000000067F00004002000140000000650000__000000914E3F38F0 000000067F0000400200014000000064C000-000000067F00004002000140000000650000__000000931B9A2710 
000000067F00004002000140000000650000-000000067F00004002000140000000654000__000000914E3F38F0 000000067F00004002000140000000650000-000000067F00004002000140000000654000__000000931B9A2710 000000067F00004002000140000000653D64-000000067F0000400200014000000065C74F__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000654000-000000067F00004002000140000000658000__000000914E3F38F0 000000067F00004002000140000000654000-000000067F00004002000140000000658000__000000931B9A2710 000000067F00004002000140000000658000-000000067F0000400200014000000065C000__000000914E3F38F0 000000067F00004002000140000000658000-000000067F0000400200014000000065C000__000000931B9A2710 000000067F0000400200014000000065C000-000000067F00004002000140000000660000__000000914E3F38F0 000000067F0000400200014000000065C000-000000067F00004002000140000000660000__000000931B9A2710 000000067F0000400200014000000065C74F-000000067F00004002000140000000665130__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000660000-000000067F00004002000140000000664000__000000914E3F38F0 000000067F00004002000140000000660000-000000067F00004002000140000000664000__000000931B9A2710 000000067F00004002000140000000664000-000000067F00004002000140000000668000__000000914E3F38F0 000000067F00004002000140000000664000-000000067F00004002000140000000668000__000000931B9A2710 000000067F00004002000140000000665130-000000067F0000400200014000000066DAFC__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000668000-000000067F0000400200014000000066C000__000000914E3F38F0 000000067F00004002000140000000668000-000000067F0000400200014000000066C000__000000931B9A2710 000000067F0000400200014000000066C000-000000067F00004002000140000000670000__000000914E3F38F0 000000067F0000400200014000000066C000-000000067F00004002000140000000670000__000000931B9A2710 000000067F0000400200014000000066DAFC-000000067F000040020001400000006764CD__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000670000-000000067F00004002000140000000674000__000000914E3F38F0 
000000067F00004002000140000000670000-000000067F00004002000140000000674000__000000931B9A2710 000000067F00004002000140000000674000-000000067F00004002000140000000678000__000000914E3F38F0 000000067F00004002000140000000674000-000000067F00004002000140000000678000__000000931B9A2710 000000067F000040020001400000006764CD-000000067F0000400200014000000067EEA5__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000678000-000000067F0000400200014000000067C000__000000914E3F38F0 000000067F00004002000140000000678000-000000067F0000400200014000000067C000__000000931B9A2710 000000067F0000400200014000000067C000-000000067F00004002000140000000680000__000000914E3F38F0 000000067F0000400200014000000067C000-000000067F00004002000140000000680000__000000931B9A2710 000000067F0000400200014000000067EEA5-000000067F0000400200014000000068788B__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000680000-000000067F00004002000140000000684000__000000914E3F38F0 000000067F00004002000140000000680000-000000067F00004002000140000000684000__000000931B9A2710 000000067F00004002000140000000684000-000000067F00004002000140000000688000__000000914E3F38F0 000000067F00004002000140000000684000-000000067F00004002000140000000688000__000000931B9A2710 000000067F0000400200014000000068788B-000000067F0000400200014000000069026F__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000688000-000000067F0000400200014000000068C000__000000914E3F38F0 000000067F00004002000140000000688000-000000067F0000400200014000000068C000__000000931B9A2710 000000067F0000400200014000000068C000-000000067F00004002000140000000690000__000000914E3F38F0 000000067F0000400200014000000068C000-000000067F00004002000140000000690000__000000931B9A2710 000000067F00004002000140000000690000-000000067F00004002000140000000694000__000000914E3F38F0 000000067F00004002000140000000690000-000000067F00004002000140000000694000__000000931B9A2710 000000067F0000400200014000000069026F-000000067F00004002000140000000698C51__000000739A920D71-0000008D2DB5E0C1 
000000067F00004002000140000000694000-000000067F00004002000140000000698000__000000914E3F38F0 000000067F00004002000140000000694000-000000067F00004002000140000000698000__000000931B9A2710 000000067F00004002000140000000698000-000000067F0000400200014000000069C000__000000914E3F38F0 000000067F00004002000140000000698000-000000067F0000400200014000000069C000__000000931B9A2710 000000067F00004002000140000000698C51-000000067F000040020001400000006A1635__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000069C000-000000067F000040020001400000006A0000__000000914E3F38F0 000000067F0000400200014000000069C000-000000067F000040020001400000006A0000__000000931B9A2710 000000067F000040020001400000006A0000-000000067F000040020001400000006A4000__000000914E3F38F0 000000067F000040020001400000006A0000-000000067F000040020001400000006A4000__000000931B9A2710 000000067F000040020001400000006A1635-000000067F000040020001400000006AA005__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000006A4000-000000067F000040020001400000006A8000__000000914E3F38F0 000000067F000040020001400000006A4000-000000067F000040020001400000006A8000__000000931B9A2710 000000067F000040020001400000006A8000-000000067F000040020001400000006AC000__000000914E3F38F0 000000067F000040020001400000006A8000-000000067F000040020001400000006AC000__000000931B9A2710 000000067F000040020001400000006AA005-000000067F000040020001400000006B29BB__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000006AC000-000000067F000040020001400000006B0000__000000914E3F38F0 000000067F000040020001400000006AC000-000000067F000040020001400000006B0000__000000931B9A2710 000000067F000040020001400000006B0000-000000067F000040020001400000006B4000__000000914E3F38F0 000000067F000040020001400000006B0000-000000067F000040020001400000006B4000__000000931B9A2710 000000067F000040020001400000006B29BB-000000067F000040020001400000006BB38D__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000006B4000-000000067F000040020001400000006B8000__000000914E3F38F0 
000000067F000040020001400000006B4000-000000067F000040020001400000006B8000__000000931B9A2710 000000067F000040020001400000006B8000-000000067F000040020001400000006BC000__000000914E3F38F0 000000067F000040020001400000006B8000-000000067F000040020001400000006BC000__000000931B9A2710 000000067F000040020001400000006BB38D-000000067F000040020001400000006C3D79__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000006BC000-000000067F000040020001400000006C0000__000000914E3F38F0 000000067F000040020001400000006BC000-000000067F000040020001400000006C0000__000000931B9A2710 000000067F000040020001400000006C0000-000000067F000040020001400000006C4000__000000914E3F38F0 000000067F000040020001400000006C0000-000000067F000040020001400000006C4000__000000931B9A2710 000000067F000040020001400000006C3D79-000000067F000040020001400000006CC765__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000006C4000-000000067F000040020001400000006C8000__000000914E3F38F0 000000067F000040020001400000006C4000-000000067F000040020001400000006C8000__000000931B9A2710 000000067F000040020001400000006C8000-000000067F000040020001400000006CC000__000000914E3F38F0 000000067F000040020001400000006C8000-000000067F000040020001400000006CC000__000000931B9A2710 000000067F000040020001400000006CC000-000000067F000040020001400000006D0000__000000914E3F38F0 000000067F000040020001400000006CC000-000000067F000040020001400000006D0000__000000931B9A2710 000000067F000040020001400000006CC765-000000067F000040020001400000006D514B__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000006D0000-000000067F000040020001400000006D4000__000000914E3F38F0 000000067F000040020001400000006D0000-000000067F000040020001400000006D4000__000000931B9A2710 000000067F000040020001400000006D4000-000000067F000040020001400000006D8000__000000914E3F38F0 000000067F000040020001400000006D4000-000000067F000040020001400000006D8000__000000931B9A2710 000000067F000040020001400000006D514B-000000067F000040020001400000006DDB2A__000000739A920D71-0000008D2DB5E0C1 
000000067F000040020001400000006D8000-000000067F000040020001400000006DC000__000000914E3F38F0 000000067F000040020001400000006D8000-000000067F000040020001400000006DC000__000000931B9A2710 000000067F000040020001400000006DC000-000000067F000040020001400000006E0000__000000914E3F38F0 000000067F000040020001400000006DC000-000000067F000040020001400000006E0000__000000931B9A2710 000000067F000040020001400000006DDB2A-000000067F000040020001400000006E64F5__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000006E0000-000000067F000040020001400000006E4000__000000914E3F38F0 000000067F000040020001400000006E0000-000000067F000040020001400000006E4000__000000931B9A2710 000000067F000040020001400000006E4000-000000067F000040020001400000006E8000__000000914E3F38F0 000000067F000040020001400000006E4000-000000067F000040020001400000006E8000__000000931B9A2710 000000067F000040020001400000006E64F5-000000067F000040020001400000006EEEC0__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000006E8000-000000067F000040020001400000006EC000__000000914E3F38F0 000000067F000040020001400000006E8000-000000067F000040020001400000006EC000__000000931B9A2710 000000067F000040020001400000006EC000-000000067F000040020001400000006F0000__000000914E3F38F0 000000067F000040020001400000006EC000-000000067F000040020001400000006F0000__000000931B9A2710 000000067F000040020001400000006EEEC0-000000067F000040020001400000006F7891__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000006F0000-000000067F000040020001400000006F4000__000000914E3F38F0 000000067F000040020001400000006F0000-000000067F000040020001400000006F4000__000000931B9A2710 000000067F000040020001400000006F4000-000000067F000040020001400000006F8000__000000914E3F38F0 000000067F000040020001400000006F4000-000000067F000040020001400000006F8000__000000931B9A2710 000000067F000040020001400000006F7891-000000067F00004002000140000000700279__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000006F8000-000000067F000040020001400000006FC000__000000914E3F38F0 
000000067F000040020001400000006F8000-000000067F000040020001400000006FC000__000000931B9A2710 000000067F000040020001400000006FC000-000000067F00004002000140000000700000__000000914E3F38F0 000000067F000040020001400000006FC000-000000067F00004002000140000000700000__000000931B9A2710 000000067F00004002000140000000700000-000000067F00004002000140000000704000__000000914E3F38F0 000000067F00004002000140000000700000-000000067F00004002000140000000704000__000000931B9A2710 000000067F00004002000140000000700279-000000067F00004002000140000000708C68__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000704000-000000067F00004002000140000000708000__000000914E3F38F0 000000067F00004002000140000000704000-000000067F00004002000140000000708000__000000931B9A2710 000000067F00004002000140000000708000-000000067F0000400200014000000070C000__000000914E3F38F0 000000067F00004002000140000000708000-000000067F0000400200014000000070C000__000000931B9A2710 000000067F00004002000140000000708C68-000000067F00004002000140000000711656__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000070C000-000000067F00004002000140000000710000__000000914E3F38F0 000000067F0000400200014000000070C000-000000067F00004002000140000000710000__000000931B9A2710 000000067F00004002000140000000710000-000000067F00004002000140000000714000__000000914E3F38F0 000000067F00004002000140000000710000-000000067F00004002000140000000714000__000000931B9A2710 000000067F00004002000140000000711656-000000067F0000400200014000000071A02C__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000714000-000000067F00004002000140000000718000__000000914E3F38F0 000000067F00004002000140000000714000-000000067F00004002000140000000718000__000000931B9A2710 000000067F00004002000140000000718000-000000067F0000400200014000000071C000__000000914E3F38F0 000000067F00004002000140000000718000-000000067F0000400200014000000071C000__000000931B9A2710 000000067F0000400200014000000071A02C-000000067F000040020001400000007229FA__000000739A920D71-0000008D2DB5E0C1 
000000067F0000400200014000000071C000-000000067F00004002000140000000720000__000000914E3F38F0 000000067F0000400200014000000071C000-000000067F00004002000140000000720000__000000931B9A2710 000000067F00004002000140000000720000-000000067F00004002000140000000724000__000000914E3F38F0 000000067F00004002000140000000720000-000000067F00004002000140000000724000__000000931B9A2710 000000067F000040020001400000007229FA-000000067F0000400200014000000072B3C9__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000724000-000000067F00004002000140000000728000__000000914E3F38F0 000000067F00004002000140000000724000-000000067F00004002000140000000728000__000000931B9A2710 000000067F00004002000140000000728000-000000067F0000400200014000000072C000__000000914E3F38F0 000000067F00004002000140000000728000-000000067F0000400200014000000072C000__000000931B9A2710 000000067F0000400200014000000072B3C9-000000067F00004002000140000000733D9B__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000072C000-000000067F00004002000140000000730000__000000914E3F38F0 000000067F0000400200014000000072C000-000000067F00004002000140000000730000__000000931B9A2710 000000067F00004002000140000000730000-000000067F00004002000140000000734000__000000914E3F38F0 000000067F00004002000140000000730000-000000067F00004002000140000000734000__000000931B9A2710 000000067F00004002000140000000733D9B-000000067F0000400200014000000073C77C__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000734000-000000067F00004002000140000000738000__000000914E3F38F0 000000067F00004002000140000000734000-000000067F00004002000140000000738000__000000931B9A2710 000000067F00004002000140000000738000-000000067F0000400200014000000073C000__000000914E3F38F0 000000067F00004002000140000000738000-000000067F0000400200014000000073C000__000000931B9A2710 000000067F0000400200014000000073C000-000000067F00004002000140000000740000__000000914E3F38F0 000000067F0000400200014000000073C000-000000067F00004002000140000000740000__000000931B9A2710 
000000067F0000400200014000000073C77C-000000067F00004002000140000000745169__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000740000-000000067F00004002000140000000744000__000000914E3F38F0 000000067F00004002000140000000740000-000000067F00004002000140000000744000__000000931B9A2710 000000067F00004002000140000000744000-000000067F00004002000140000000748000__000000914E3F38F0 000000067F00004002000140000000744000-000000067F00004002000140000000748000__000000931B9A2710 000000067F00004002000140000000745169-000000067F0000400200014000000074DB4D__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000748000-000000067F0000400200014000000074C000__000000914E3F38F0 000000067F00004002000140000000748000-000000067F0000400200014000000074C000__000000931B9A2710 000000067F0000400200014000000074C000-000000067F00004002000140000000750000__000000914E3F38F0 000000067F0000400200014000000074C000-000000067F00004002000140000000750000__000000931B9A2710 000000067F0000400200014000000074DB4D-000000067F00004002000140000000756529__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000750000-000000067F00004002000140000000754000__000000914E3F38F0 000000067F00004002000140000000750000-000000067F00004002000140000000754000__000000931B9A2710 000000067F00004002000140000000754000-000000067F00004002000140000000758000__000000914E3F38F0 000000067F00004002000140000000754000-000000067F00004002000140000000758000__000000931B9A2710 000000067F00004002000140000000756529-000000067F0000400200014000000075EEF6__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000758000-000000067F0000400200014000000075C000__000000914E3F38F0 000000067F00004002000140000000758000-000000067F0000400200014000000075C000__000000931B9A2710 000000067F0000400200014000000075C000-000000067F00004002000140000000760000__000000914E3F38F0 000000067F0000400200014000000075C000-000000067F00004002000140000000760000__000000931B9A2710 
000000067F0000400200014000000075EEF6-000000067F000040020001400000007678CA__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000760000-000000067F00004002000140000000764000__000000914E3F38F0 000000067F00004002000140000000760000-000000067F00004002000140000000764000__000000931B9A2710 000000067F00004002000140000000764000-000000067F00004002000140000000768000__000000914E3F38F0 000000067F00004002000140000000764000-000000067F00004002000140000000768000__000000931B9A2710 000000067F000040020001400000007678CA-000000067F000040020001400000007702AD__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000768000-000000067F0000400200014000000076C000__000000914E3F38F0 000000067F00004002000140000000768000-000000067F0000400200014000000076C000__000000931B9A2710 000000067F0000400200014000000076C000-000000067F00004002000140000000770000__000000914E3F38F0 000000067F0000400200014000000076C000-000000067F00004002000140000000770000__000000931B9A2710 000000067F00004002000140000000770000-000000067F00004002000140000000774000__000000914E3F38F0 000000067F00004002000140000000770000-000000067F00004002000140000000774000__000000931B9A2710 000000067F000040020001400000007702AD-000000067F00004002000140000000778C92__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000774000-000000067F00004002000140000000778000__000000914E3F38F0 000000067F00004002000140000000774000-000000067F00004002000140000000778000__000000931B9A2710 000000067F00004002000140000000778000-000000067F0000400200014000000077C000__000000914E3F38F0 000000067F00004002000140000000778000-000000067F0000400200014000000077C000__000000931B9A2710 000000067F00004002000140000000778C92-000000067F0000400200014000000078166A__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000077C000-000000067F00004002000140000000780000__000000914E3F38F0 000000067F0000400200014000000077C000-000000067F00004002000140000000780000__000000931B9A2710 000000067F00004002000140000000780000-000000067F00004002000140000000784000__000000914E3F38F0 
000000067F00004002000140000000780000-000000067F00004002000140000000784000__000000931B9A2710 000000067F0000400200014000000078166A-000000067F0000400200014000000078A042__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000784000-000000067F00004002000140000000788000__000000914E3F38F0 000000067F00004002000140000000784000-000000067F00004002000140000000788000__000000931B9A2710 000000067F00004002000140000000788000-000000067F0000400200014000000078C000__000000914E3F38F0 000000067F00004002000140000000788000-000000067F0000400200014000000078C000__000000931B9A2710 000000067F0000400200014000000078A042-000000067F00004002000140000000792A24__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000078C000-000000067F00004002000140000000790000__000000914E3F38F0 000000067F0000400200014000000078C000-000000067F00004002000140000000790000__000000931B9A2710 000000067F00004002000140000000790000-000000067F00004002000140000000794000__000000914E3F38F0 000000067F00004002000140000000790000-000000067F00004002000140000000794000__000000931B9A2710 000000067F00004002000140000000792A24-000000067F0000400200014000000079B3FE__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000794000-000000067F00004002000140000000798000__000000914E3F38F0 000000067F00004002000140000000794000-000000067F00004002000140000000798000__000000931B9A2710 000000067F00004002000140000000798000-000000067F0000400200014000000079C000__000000914E3F38F0 000000067F00004002000140000000798000-000000067F0000400200014000000079C000__000000931B9A2710 000000067F0000400200014000000079B3FE-000000067F000040020001400000007A3DE6__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000079C000-000000067F000040020001400000007A0000__000000914E3F38F0 000000067F0000400200014000000079C000-000000067F000040020001400000007A0000__000000931B9A2710 000000067F000040020001400000007A0000-000000067F000040020001400000007A4000__000000914E3F38F0 000000067F000040020001400000007A0000-000000067F000040020001400000007A4000__000000931B9A2710 
000000067F000040020001400000007A3DE6-000000067F000040020001400000007AC7C4__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000007A4000-000000067F000040020001400000007A8000__000000914E3F38F0 000000067F000040020001400000007A4000-000000067F000040020001400000007A8000__000000931B9A2710 000000067F000040020001400000007A8000-000000067F000040020001400000007AC000__000000914E3F38F0 000000067F000040020001400000007A8000-000000067F000040020001400000007AC000__000000931B9A2710 000000067F000040020001400000007AC000-000000067F000040020001400000007B0000__000000914E3F38F0 000000067F000040020001400000007AC000-000000067F000040020001400000007B0000__000000931B9A2710 000000067F000040020001400000007AC7C4-000000067F000040020001400000007B51A6__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000007B0000-000000067F000040020001400000007B4000__000000914E3F38F0 000000067F000040020001400000007B0000-000000067F000040020001400000007B4000__000000931B9A2710 000000067F000040020001400000007B4000-000000067F000040020001400000007B8000__000000914E3F38F0 000000067F000040020001400000007B4000-000000067F000040020001400000007B8000__000000931B9A2710 000000067F000040020001400000007B51A6-000000067F000040020001400000007BDB7E__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000007B8000-000000067F000040020001400000007BC000__000000914E3F38F0 000000067F000040020001400000007B8000-000000067F000040020001400000007BC000__000000931B9A2710 000000067F000040020001400000007BC000-000000067F000040020001400000007C0000__000000914E3F38F0 000000067F000040020001400000007BC000-000000067F000040020001400000007C0000__000000931B9A2710 000000067F000040020001400000007BDB7E-000000067F000040020001400000007C6558__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000007C0000-000000067F000040020001400000007C4000__000000914E3F38F0 000000067F000040020001400000007C0000-000000067F000040020001400000007C4000__000000931B9A2710 000000067F000040020001400000007C4000-000000067F000040020001400000007C8000__000000914E3F38F0 
000000067F000040020001400000007C4000-000000067F000040020001400000007C8000__000000931B9A2710 000000067F000040020001400000007C6558-000000067F000040020001400000007CEF2A__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000007C8000-000000067F000040020001400000007CC000__000000914E3F38F0 000000067F000040020001400000007C8000-000000067F000040020001400000007CC000__000000931B9A2710 000000067F000040020001400000007CC000-000000067F000040020001400000007D0000__000000914E3F38F0 000000067F000040020001400000007CC000-000000067F000040020001400000007D0000__000000931B9A2710 000000067F000040020001400000007CEF2A-000000067F000040020001400000007D7903__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000007D0000-000000067F000040020001400000007D4000__000000914E3F38F0 000000067F000040020001400000007D0000-000000067F000040020001400000007D4000__000000931B9A2710 000000067F000040020001400000007D4000-000000067F000040020001400000007D8000__000000914E3F38F0 000000067F000040020001400000007D4000-000000067F000040020001400000007D8000__000000931B9A2710 000000067F000040020001400000007D7903-000000067F000040020001400000007E02D9__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000007D8000-000000067F000040020001400000007DC000__000000914E3F38F0 000000067F000040020001400000007D8000-000000067F000040020001400000007DC000__000000931B9A2710 000000067F000040020001400000007DC000-000000067F000040020001400000007E0000__000000914E3F38F0 000000067F000040020001400000007DC000-000000067F000040020001400000007E0000__000000931B9A2710 000000067F000040020001400000007E0000-000000067F000040020001400000007E4000__000000914E3F38F0 000000067F000040020001400000007E0000-000000067F000040020001400000007E4000__000000931B9A2710 000000067F000040020001400000007E02D9-000000067F000040020001400000007E8CAF__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000007E4000-000000067F000040020001400000007E8000__000000914E3F38F0 000000067F000040020001400000007E4000-000000067F000040020001400000007E8000__000000931B9A2710 
000000067F000040020001400000007E8000-000000067F000040020001400000007EC000__000000914E3F38F0 000000067F000040020001400000007E8000-000000067F000040020001400000007EC000__000000931B9A2710 000000067F000040020001400000007E8CAF-000000067F000040020001400000007F1692__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000007EC000-000000067F000040020001400000007F0000__000000914E3F38F0 000000067F000040020001400000007EC000-000000067F000040020001400000007F0000__000000931B9A2710 000000067F000040020001400000007F0000-000000067F000040020001400000007F4000__000000914E3F38F0 000000067F000040020001400000007F0000-000000067F000040020001400000007F4000__000000931B9A2710 000000067F000040020001400000007F1692-000000067F000040020001400000007FA06B__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000007F4000-000000067F000040020001400000007F8000__000000914E3F38F0 000000067F000040020001400000007F4000-000000067F000040020001400000007F8000__000000931B9A2710 000000067F000040020001400000007F8000-000000067F000040020001400000007FC000__000000914E3F38F0 000000067F000040020001400000007F8000-000000067F000040020001400000007FC000__000000931B9A2710 000000067F000040020001400000007FA06B-000000067F00004002000140000000802A45__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000007FC000-000000067F00004002000140000000800000__000000914E3F38F0 000000067F000040020001400000007FC000-000000067F00004002000140000000800000__000000931B9A2710 000000067F00004002000140000000800000-000000067F00004002000140000000804000__000000914E3F38F0 000000067F00004002000140000000800000-000000067F00004002000140000000804000__000000931B9A2710 000000067F00004002000140000000802A45-000000067F0000400200014000000080B41D__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000804000-000000067F00004002000140000000808000__000000914E3F38F0 000000067F00004002000140000000804000-000000067F00004002000140000000808000__000000931B9A2710 000000067F00004002000140000000808000-000000067F0000400200014000000080C000__000000914E3F38F0 
000000067F00004002000140000000808000-000000067F0000400200014000000080C000__000000931B9A2710 000000067F0000400200014000000080B41D-000000067F00004002000140000000813DF8__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000080C000-000000067F00004002000140000000810000__000000914E3F38F0 000000067F0000400200014000000080C000-000000067F00004002000140000000810000__000000931B9A2710 000000067F00004002000140000000810000-000000067F00004002000140000000814000__000000914E3F38F0 000000067F00004002000140000000810000-000000067F00004002000140000000814000__000000931B9A2710 000000067F00004002000140000000813DF8-000000067F0000400200014000000081C7DB__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000814000-000000067F00004002000140000000818000__000000914E3F38F0 000000067F00004002000140000000814000-000000067F00004002000140000000818000__000000931B9A2710 000000067F00004002000140000000818000-000000067F0000400200014000000081C000__000000914E3F38F0 000000067F00004002000140000000818000-000000067F0000400200014000000081C000__000000931B9A2710 000000067F0000400200014000000081C000-000000067F00004002000140000000820000__000000914E3F38F0 000000067F0000400200014000000081C000-000000067F00004002000140000000820000__000000931B9A2710 000000067F0000400200014000000081C7DB-000000067F000040020001400000008251B6__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000820000-000000067F00004002000140000000824000__000000914E3F38F0 000000067F00004002000140000000820000-000000067F00004002000140000000824000__000000931B9A2710 000000067F00004002000140000000824000-000000067F00004002000140000000828000__000000914E3F38F0 000000067F00004002000140000000824000-000000067F00004002000140000000828000__000000931B9A2710 000000067F000040020001400000008251B6-000000067F0000400200014000000082DB9B__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000828000-000000067F0000400200014000000082C000__000000914E3F38F0 000000067F00004002000140000000828000-000000067F0000400200014000000082C000__000000931B9A2710 
000000067F0000400200014000000082C000-000000067F00004002000140000000830000__000000914E3F38F0 000000067F0000400200014000000082C000-000000067F00004002000140000000830000__000000931B9A2710 000000067F0000400200014000000082DB9B-000000067F00004002000140000000836584__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000830000-000000067F00004002000140000000834000__000000914E3F38F0 000000067F00004002000140000000830000-000000067F00004002000140000000834000__000000931B9A2710 000000067F00004002000140000000834000-000000067F00004002000140000000838000__000000914E3F38F0 000000067F00004002000140000000834000-000000067F00004002000140000000838000__000000931B9A2710 000000067F00004002000140000000836584-000000067F0000400200014000000083EF61__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000838000-000000067F0000400200014000000083C000__000000914E3F38F0 000000067F00004002000140000000838000-000000067F0000400200014000000083C000__000000931B9A2710 000000067F0000400200014000000083C000-000000067F00004002000140000000840000__000000914E3F38F0 000000067F0000400200014000000083C000-000000067F00004002000140000000840000__000000931B9A2710 000000067F0000400200014000000083EF61-000000067F00004002000140000000847939__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000840000-000000067F00004002000140000000844000__000000914E3F38F0 000000067F00004002000140000000840000-000000067F00004002000140000000844000__000000931B9A2710 000000067F00004002000140000000844000-000000067F00004002000140000000848000__000000914E3F38F0 000000067F00004002000140000000844000-000000067F00004002000140000000848000__000000931B9A2710 000000067F00004002000140000000847939-000000067F00004002000140000000850319__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000848000-000000067F0000400200014000000084C000__000000914E3F38F0 000000067F00004002000140000000848000-000000067F0000400200014000000084C000__000000931B9A2710 000000067F0000400200014000000084C000-000000067F00004002000140000000850000__000000914E3F38F0 
000000067F0000400200014000000084C000-000000067F00004002000140000000850000__000000931B9A2710 000000067F00004002000140000000850000-000000067F00004002000140000000854000__000000914E3F38F0 000000067F00004002000140000000850000-000000067F00004002000140000000854000__000000931B9A2710 000000067F00004002000140000000850319-000000067F00004002000140000000858CEC__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000854000-000000067F00004002000140000000858000__000000914E3F38F0 000000067F00004002000140000000854000-000000067F00004002000140000000858000__000000931B9A2710 000000067F00004002000140000000858000-000000067F0000400200014000000085C000__000000914E3F38F0 000000067F00004002000140000000858000-000000067F0000400200014000000085C000__000000931B9A2710 000000067F00004002000140000000858CEC-000000067F000040020001400000008616C0__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000085C000-000000067F00004002000140000000860000__000000914E3F38F0 000000067F0000400200014000000085C000-000000067F00004002000140000000860000__000000931B9A2710 000000067F00004002000140000000860000-000000067F00004002000140000000864000__000000914E3F38F0 000000067F00004002000140000000860000-000000067F00004002000140000000864000__000000931B9A2710 000000067F000040020001400000008616C0-000000067F0000400200014000000086A0A7__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000864000-000000067F00004002000140000000868000__000000914E3F38F0 000000067F00004002000140000000864000-000000067F00004002000140000000868000__000000931B9A2710 000000067F00004002000140000000868000-000000067F0000400200014000000086C000__000000914E3F38F0 000000067F00004002000140000000868000-000000067F0000400200014000000086C000__000000931B9A2710 000000067F0000400200014000000086A0A7-000000067F00004002000140000000872A82__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000086C000-000000067F00004002000140000000870000__000000914E3F38F0 000000067F0000400200014000000086C000-000000067F00004002000140000000870000__000000931B9A2710 
000000067F00004002000140000000870000-000000067F00004002000140000000874000__000000914E3F38F0 000000067F00004002000140000000870000-000000067F00004002000140000000874000__000000931B9A2710 000000067F00004002000140000000872A82-000000067F0000400200014000000087B45F__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000874000-000000067F00004002000140000000878000__000000914E3F38F0 000000067F00004002000140000000874000-000000067F00004002000140000000878000__000000931B9A2710 000000067F00004002000140000000878000-000000067F0000400200014000000087C000__000000914E3F38F0 000000067F00004002000140000000878000-000000067F0000400200014000000087C000__000000931B9A2710 000000067F0000400200014000000087B45F-000000067F00004002000140000000883E35__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000087C000-000000067F00004002000140000000880000__000000914E3F38F0 000000067F0000400200014000000087C000-000000067F00004002000140000000880000__000000931B9A2710 000000067F00004002000140000000880000-000000067F00004002000140000000884000__000000914E3F38F0 000000067F00004002000140000000880000-000000067F00004002000140000000884000__000000931B9A2710 000000067F00004002000140000000883E35-000000067F0000400200014000000088C812__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000884000-000000067F00004002000140000000888000__000000914E3F38F0 000000067F00004002000140000000884000-000000067F00004002000140000000888000__000000931B9A2710 000000067F00004002000140000000888000-000000067F0000400200014000000088C000__000000914E3F38F0 000000067F00004002000140000000888000-000000067F0000400200014000000088C000__000000931B9A2710 000000067F0000400200014000000088C000-000000067F00004002000140000000890000__000000914E3F38F0 000000067F0000400200014000000088C000-000000067F00004002000140000000890000__000000931B9A2710 000000067F0000400200014000000088C812-000000067F000040020001400000008951E8__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000890000-000000067F00004002000140000000894000__000000914E3F38F0 
000000067F00004002000140000000890000-000000067F00004002000140000000894000__000000931B9A2710 000000067F00004002000140000000894000-000000067F00004002000140000000898000__000000914E3F38F0 000000067F00004002000140000000894000-000000067F00004002000140000000898000__000000931B9A2710 000000067F000040020001400000008951E8-000000067F0000400200014000000089DBC5__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000898000-000000067F0000400200014000000089C000__000000914E3F38F0 000000067F00004002000140000000898000-000000067F0000400200014000000089C000__000000931B9A2710 000000067F0000400200014000000089C000-000000067F000040020001400000008A0000__000000914E3F38F0 000000067F0000400200014000000089C000-000000067F000040020001400000008A0000__000000931B9A2710 000000067F0000400200014000000089DBC5-000000067F000040020001400000008A65A8__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000008A0000-000000067F000040020001400000008A4000__000000914E3F38F0 000000067F000040020001400000008A0000-000000067F000040020001400000008A4000__000000931B9A2710 000000067F000040020001400000008A4000-000000067F000040020001400000008A8000__000000914E3F38F0 000000067F000040020001400000008A4000-000000067F000040020001400000008A8000__000000931B9A2710 000000067F000040020001400000008A65A8-000000067F000040020001400000008AEF88__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000008A8000-000000067F000040020001400000008AC000__000000914E3F38F0 000000067F000040020001400000008A8000-000000067F000040020001400000008AC000__000000931B9A2710 000000067F000040020001400000008AC000-000000067F000040020001400000008B0000__000000914E3F38F0 000000067F000040020001400000008AC000-000000067F000040020001400000008B0000__000000931B9A2710 000000067F000040020001400000008AEF88-000000067F000040020001400000008B7971__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000008B0000-000000067F000040020001400000008B4000__000000914E3F38F0 000000067F000040020001400000008B0000-000000067F000040020001400000008B4000__000000931B9A2710 
000000067F000040020001400000008B4000-000000067F000040020001400000008B8000__000000914E3F38F0 000000067F000040020001400000008B4000-000000067F000040020001400000008B8000__000000931B9A2710 000000067F000040020001400000008B7971-000000067F000040020001400000008C034C__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000008B8000-000000067F000040020001400000008BC000__000000914E3F38F0 000000067F000040020001400000008B8000-000000067F000040020001400000008BC000__000000931B9A2710 000000067F000040020001400000008BC000-000000067F000040020001400000008C0000__000000914E3F38F0 000000067F000040020001400000008BC000-000000067F000040020001400000008C0000__000000931B9A2710 000000067F000040020001400000008C0000-000000067F000040020001400000008C4000__000000914E3F38F0 000000067F000040020001400000008C0000-000000067F000040020001400000008C4000__000000931B9A2710 000000067F000040020001400000008C034C-000000067F000040020001400000008C8D24__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000008C4000-000000067F000040020001400000008C8000__000000914E3F38F0 000000067F000040020001400000008C4000-000000067F000040020001400000008C8000__000000931B9A2710 000000067F000040020001400000008C8000-000000067F000040020001400000008CC000__000000914E3F38F0 000000067F000040020001400000008C8000-000000067F000040020001400000008CC000__000000931B9A2710 000000067F000040020001400000008C8D24-000000067F000040020001400000008D16F9__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000008CC000-000000067F000040020001400000008D0000__000000914E3F38F0 000000067F000040020001400000008CC000-000000067F000040020001400000008D0000__000000931B9A2710 000000067F000040020001400000008D0000-000000067F000040020001400000008D4000__000000914E3F38F0 000000067F000040020001400000008D0000-000000067F000040020001400000008D4000__000000931B9A2710 000000067F000040020001400000008D16F9-000000067F000040020001400000008DA0DC__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000008D4000-000000067F000040020001400000008D8000__000000914E3F38F0 
000000067F000040020001400000008D4000-000000067F000040020001400000008D8000__000000931B9A2710 000000067F000040020001400000008D8000-000000067F000040020001400000008DC000__000000914E3F38F0 000000067F000040020001400000008D8000-000000067F000040020001400000008DC000__000000931B9A2710 000000067F000040020001400000008DA0DC-000000067F000040020001400000008E2AC6__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000008DC000-000000067F000040020001400000008E0000__000000914E3F38F0 000000067F000040020001400000008DC000-000000067F000040020001400000008E0000__000000931B9A2710 000000067F000040020001400000008E0000-000000067F000040020001400000008E4000__000000914E3F38F0 000000067F000040020001400000008E0000-000000067F000040020001400000008E4000__000000931B9A2710 000000067F000040020001400000008E2AC6-000000067F000040020001400000008EB4A1__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000008E4000-000000067F000040020001400000008E8000__000000914E3F38F0 000000067F000040020001400000008E4000-000000067F000040020001400000008E8000__000000931B9A2710 000000067F000040020001400000008E8000-000000067F000040020001400000008EC000__000000914E3F38F0 000000067F000040020001400000008E8000-000000067F000040020001400000008EC000__000000931B9A2710 000000067F000040020001400000008EB4A1-000000067F000040020001400000008F3E7F__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000008EC000-000000067F000040020001400000008F0000__000000914E3F38F0 000000067F000040020001400000008EC000-000000067F000040020001400000008F0000__000000931B9A2710 000000067F000040020001400000008F0000-000000067F000040020001400000008F4000__000000914E3F38F0 000000067F000040020001400000008F0000-000000067F000040020001400000008F4000__000000931B9A2710 000000067F000040020001400000008F3E7F-000000067F000040020001400000008FC85E__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000008F4000-000000067F000040020001400000008F8000__000000914E3F38F0 000000067F000040020001400000008F4000-000000067F000040020001400000008F8000__000000931B9A2710 
000000067F000040020001400000008F8000-000000067F000040020001400000008FC000__000000914E3F38F0 000000067F000040020001400000008F8000-000000067F000040020001400000008FC000__000000931B9A2710 000000067F000040020001400000008FC000-000000067F00004002000140000000900000__000000914E3F38F0 000000067F000040020001400000008FC000-000000067F00004002000140000000900000__000000931B9A2710 000000067F000040020001400000008FC85E-000000067F0000400200014000000090523C__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000900000-000000067F00004002000140000000904000__000000914E3F38F0 000000067F00004002000140000000900000-000000067F00004002000140000000904000__000000931B9A2710 000000067F00004002000140000000904000-000000067F00004002000140000000908000__000000914E3F38F0 000000067F00004002000140000000904000-000000067F00004002000140000000908000__000000931B9A2710 000000067F0000400200014000000090523C-000000067F0000400200014000000090DC13__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000908000-000000067F0000400200014000000090C000__000000914E3F38F0 000000067F00004002000140000000908000-000000067F0000400200014000000090C000__000000931B9A2710 000000067F0000400200014000000090C000-000000067F00004002000140000000910000__000000914E3F38F0 000000067F0000400200014000000090C000-000000067F00004002000140000000910000__000000931B9A2710 000000067F0000400200014000000090DC13-000000067F000040020001400000009165D8__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000910000-000000067F00004002000140000000914000__000000914E3F38F0 000000067F00004002000140000000910000-000000067F00004002000140000000914000__000000931B9A2710 000000067F00004002000140000000914000-000000067F00004002000140000000918000__000000914E3F38F0 000000067F00004002000140000000914000-000000067F00004002000140000000918000__000000931B9A2710 000000067F000040020001400000009165D8-000000067F0000400200014000000091EFC0__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000918000-000000067F0000400200014000000091C000__000000914E3F38F0 
000000067F00004002000140000000918000-000000067F0000400200014000000091C000__000000931B9A2710 000000067F0000400200014000000091C000-000000067F00004002000140000000920000__000000914E3F38F0 000000067F0000400200014000000091C000-000000067F00004002000140000000920000__000000931B9A2710 000000067F0000400200014000000091EFC0-000000067F000040020001400000009279A0__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000920000-000000067F00004002000140000000924000__000000914E3F38F0 000000067F00004002000140000000920000-000000067F00004002000140000000924000__000000931B9A2710 000000067F00004002000140000000924000-000000067F00004002000140000000928000__000000914E3F38F0 000000067F00004002000140000000924000-000000067F00004002000140000000928000__000000931B9A2710 000000067F000040020001400000009279A0-000000067F0000400200014000000093037A__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000928000-000000067F0000400200014000000092C000__000000914E3F38F0 000000067F00004002000140000000928000-000000067F0000400200014000000092C000__000000931B9A2710 000000067F0000400200014000000092C000-000000067F00004002000140000000930000__000000914E3F38F0 000000067F0000400200014000000092C000-000000067F00004002000140000000930000__000000931B9A2710 000000067F00004002000140000000930000-000000067F00004002000140000000934000__000000914E3F38F0 000000067F00004002000140000000930000-000000067F00004002000140000000934000__000000931B9A2710 000000067F0000400200014000000093037A-000000067F00004002000140000000938D5F__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000934000-000000067F00004002000140000000938000__000000914E3F38F0 000000067F00004002000140000000934000-000000067F00004002000140000000938000__000000931B9A2710 000000067F00004002000140000000938000-000000067F0000400200014000000093C000__000000914E3F38F0 000000067F00004002000140000000938000-000000067F0000400200014000000093C000__000000931B9A2710 000000067F00004002000140000000938D5F-000000067F00004002000140000000941744__000000739A920D71-0000008D2DB5E0C1 
000000067F0000400200014000000093C000-000000067F00004002000140000000940000__000000914E3F38F0 000000067F0000400200014000000093C000-000000067F00004002000140000000940000__000000931B9A2710 000000067F00004002000140000000940000-000000067F00004002000140000000944000__000000914E3F38F0 000000067F00004002000140000000940000-000000067F00004002000140000000944000__000000931B9A2710 000000067F00004002000140000000941744-000000067F0000400200014000000094A116__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000944000-000000067F00004002000140000000948000__000000914E3F38F0 000000067F00004002000140000000944000-000000067F00004002000140000000948000__000000931B9A2710 000000067F00004002000140000000948000-000000067F0000400200014000000094C000__000000914E3F38F0 000000067F00004002000140000000948000-000000067F0000400200014000000094C000__000000931B9A2710 000000067F0000400200014000000094A116-000000067F00004002000140000000952AE4__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000094C000-000000067F00004002000140000000950000__000000914E3F38F0 000000067F0000400200014000000094C000-000000067F00004002000140000000950000__000000931B9A2710 000000067F00004002000140000000950000-000000067F00004002000140000000954000__000000914E3F38F0 000000067F00004002000140000000950000-000000067F00004002000140000000954000__000000931B9A2710 000000067F00004002000140000000952AE4-000000067F0000400200014000000095B4CF__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000954000-000000067F00004002000140000000958000__000000914E3F38F0 000000067F00004002000140000000954000-000000067F00004002000140000000958000__000000931B9A2710 000000067F00004002000140000000958000-000000067F0000400200014000000095C000__000000914E3F38F0 000000067F00004002000140000000958000-000000067F0000400200014000000095C000__000000931B9A2710 000000067F0000400200014000000095B4CF-000000067F00004002000140000000963EB4__000000739A920D71-0000008D2DB5E0C1 000000067F0000400200014000000095C000-000000067F00004002000140000000960000__000000914E3F38F0 
000000067F0000400200014000000095C000-000000067F00004002000140000000960000__000000931B9A2710 000000067F00004002000140000000960000-000000067F00004002000140000000964000__000000914E3F38F0 000000067F00004002000140000000960000-000000067F00004002000140000000964000__000000931B9A2710 000000067F00004002000140000000963EB4-000000067F0000400200014000000096C887__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000964000-000000067F00004002000140000000968000__000000914E3F38F0 000000067F00004002000140000000964000-000000067F00004002000140000000968000__000000931B9A2710 000000067F00004002000140000000968000-000000067F0000400200014000000096C000__000000914E3F38F0 000000067F00004002000140000000968000-000000067F0000400200014000000096C000__000000931B9A2710 000000067F0000400200014000000096C000-000000067F00004002000140000000970000__000000914E3F38F0 000000067F0000400200014000000096C000-000000067F00004002000140000000970000__000000931B9A2710 000000067F0000400200014000000096C887-000000067F0000400200014000000097527C__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000970000-000000067F00004002000140000000974000__000000914E3F38F0 000000067F00004002000140000000970000-000000067F00004002000140000000974000__000000931B9A2710 000000067F00004002000140000000974000-000000067F00004002000140000000978000__000000914E3F38F0 000000067F00004002000140000000974000-000000067F00004002000140000000978000__000000931B9A2710 000000067F0000400200014000000097527C-000000067F0000400200014000000097DC5A__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000978000-000000067F0000400200014000000097C000__000000914E3F38F0 000000067F00004002000140000000978000-000000067F0000400200014000000097C000__000000931B9A2710 000000067F0000400200014000000097C000-000000067F00004002000140000000980000__000000914E3F38F0 000000067F0000400200014000000097C000-000000067F00004002000140000000980000__000000931B9A2710 000000067F0000400200014000000097DC5A-000000067F00004002000140000000986635__000000739A920D71-0000008D2DB5E0C1 
000000067F00004002000140000000980000-000000067F00004002000140000000984000__000000914E3F38F0 000000067F00004002000140000000980000-000000067F00004002000140000000984000__000000931B9A2710 000000067F00004002000140000000984000-000000067F00004002000140000000988000__000000914E3F38F0 000000067F00004002000140000000984000-000000067F00004002000140000000988000__000000931B9A2710 000000067F00004002000140000000986635-000000067F0000400200014000000098F001__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000988000-000000067F0000400200014000000098C000__000000914E3F38F0 000000067F00004002000140000000988000-000000067F0000400200014000000098C000__000000931B9A2710 000000067F0000400200014000000098C000-000000067F00004002000140000000990000__000000914E3F38F0 000000067F0000400200014000000098C000-000000067F00004002000140000000990000__000000931B9A2710 000000067F0000400200014000000098F001-000000067F000040020001400000009979DC__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000990000-000000067F00004002000140000000994000__000000914E3F38F0 000000067F00004002000140000000990000-000000067F00004002000140000000994000__000000931B9A2710 000000067F00004002000140000000994000-000000067F00004002000140000000998000__000000914E3F38F0 000000067F00004002000140000000994000-000000067F00004002000140000000998000__000000931B9A2710 000000067F000040020001400000009979DC-000000067F000040020001400000009A03BB__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000998000-000000067F0000400200014000000099C000__000000914E3F38F0 000000067F00004002000140000000998000-000000067F0000400200014000000099C000__000000931B9A2710 000000067F0000400200014000000099C000-000000067F000040020001400000009A0000__000000914E3F38F0 000000067F0000400200014000000099C000-000000067F000040020001400000009A0000__000000931B9A2710 000000067F000040020001400000009A0000-000000067F000040020001400000009A4000__000000914E3F38F0 000000067F000040020001400000009A0000-000000067F000040020001400000009A4000__000000931B9A2710 
000000067F000040020001400000009A03BB-000000067F000040020001400000009A8D9C__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000009A4000-000000067F000040020001400000009A8000__000000914E3F38F0 000000067F000040020001400000009A4000-000000067F000040020001400000009A8000__000000931B9A2710 000000067F000040020001400000009A8000-000000067F000040020001400000009AC000__000000914E3F38F0 000000067F000040020001400000009A8000-000000067F000040020001400000009AC000__000000931B9A2710 000000067F000040020001400000009A8D9C-000000067F000040020001400000009B1778__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000009AC000-000000067F000040020001400000009B0000__000000914E3F38F0 000000067F000040020001400000009AC000-000000067F000040020001400000009B0000__000000931B9A2710 000000067F000040020001400000009B0000-000000067F000040020001400000009B4000__000000914E3F38F0 000000067F000040020001400000009B0000-000000067F000040020001400000009B4000__000000931B9A2710 000000067F000040020001400000009B1778-000000067F000040020001400000009BA15D__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000009B4000-000000067F000040020001400000009B8000__000000914E3F38F0 000000067F000040020001400000009B4000-000000067F000040020001400000009B8000__000000931B9A2710 000000067F000040020001400000009B8000-000000067F000040020001400000009BC000__000000914E3F38F0 000000067F000040020001400000009B8000-000000067F000040020001400000009BC000__000000931B9A2710 000000067F000040020001400000009BA15D-000000067F000040020001400000009C2B39__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000009BC000-000000067F000040020001400000009C0000__000000914E3F38F0 000000067F000040020001400000009BC000-000000067F000040020001400000009C0000__000000931B9A2710 000000067F000040020001400000009C0000-000000067F000040020001400000009C4000__000000914E3F38F0 000000067F000040020001400000009C0000-000000067F000040020001400000009C4000__000000931B9A2710 
000000067F000040020001400000009C2B39-000000067F000040020001400000009CB50E__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000009C4000-000000067F000040020001400000009C8000__000000914E3F38F0 000000067F000040020001400000009C4000-000000067F000040020001400000009C8000__000000931B9A2710 000000067F000040020001400000009C8000-000000067F000040020001400000009CC000__000000914E3F38F0 000000067F000040020001400000009C8000-000000067F000040020001400000009CC000__000000931B9A2710 000000067F000040020001400000009CB50E-000000067F000040020001400000009D3EE6__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000009CC000-000000067F000040020001400000009D0000__000000914E3F38F0 000000067F000040020001400000009CC000-000000067F000040020001400000009D0000__000000931B9A2710 000000067F000040020001400000009D0000-000000067F000040020001400000009D4000__000000914E3F38F0 000000067F000040020001400000009D0000-000000067F000040020001400000009D4000__000000931B9A2710 000000067F000040020001400000009D3EE6-000000067F000040020001400000009DC8C6__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000009D4000-000000067F000040020001400000009D8000__000000914E3F38F0 000000067F000040020001400000009D4000-000000067F000040020001400000009D8000__000000931B9A2710 000000067F000040020001400000009D8000-000000067F000040020001400000009DC000__000000914E3F38F0 000000067F000040020001400000009D8000-000000067F000040020001400000009DC000__000000931B9A2710 000000067F000040020001400000009DC000-000000067F000040020001400000009E0000__000000914E3F38F0 000000067F000040020001400000009DC000-000000067F000040020001400000009E0000__000000931B9A2710 000000067F000040020001400000009DC8C6-000000067F000040020001400000009E52AA__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000009E0000-000000067F000040020001400000009E4000__000000914E3F38F0 000000067F000040020001400000009E0000-000000067F000040020001400000009E4000__000000931B9A2710 000000067F000040020001400000009E4000-000000067F000040020001400000009E8000__000000914E3F38F0 
000000067F000040020001400000009E4000-000000067F000040020001400000009E8000__000000931B9A2710 000000067F000040020001400000009E52AA-000000067F000040020001400000009EDC8B__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000009E8000-000000067F000040020001400000009EC000__000000914E3F38F0 000000067F000040020001400000009E8000-000000067F000040020001400000009EC000__000000931B9A2710 000000067F000040020001400000009EC000-000000067F000040020001400000009F0000__000000914E3F38F0 000000067F000040020001400000009EC000-000000067F000040020001400000009F0000__000000931B9A2710 000000067F000040020001400000009EDC8B-000000067F000040020001400000009F666E__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000009F0000-000000067F000040020001400000009F4000__000000914E3F38F0 000000067F000040020001400000009F0000-000000067F000040020001400000009F4000__000000931B9A2710 000000067F000040020001400000009F4000-000000067F000040020001400000009F8000__000000914E3F38F0 000000067F000040020001400000009F4000-000000067F000040020001400000009F8000__000000931B9A2710 000000067F000040020001400000009F666E-000000067F000040020001400000009FF04D__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400000009F8000-000000067F000040020001400000009FC000__000000914E3F38F0 000000067F000040020001400000009F8000-000000067F000040020001400000009FC000__000000931B9A2710 000000067F000040020001400000009FC000-000000067F00004002000140000000A00000__000000914E3F38F0 000000067F000040020001400000009FC000-000000067F00004002000140000000A00000__000000931B9A2710 000000067F000040020001400000009FF04D-000000067F00004002000140000000A07A27__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A00000-000000067F00004002000140000000A04000__000000914E3F38F0 000000067F00004002000140000000A00000-000000067F00004002000140000000A04000__000000931B9A2710 000000067F00004002000140000000A04000-000000067F00004002000140000000A08000__000000914E3F38F0 000000067F00004002000140000000A04000-000000067F00004002000140000000A08000__000000931B9A2710 
000000067F00004002000140000000A07A27-000000067F00004002000140000000A103FD__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A08000-000000067F00004002000140000000A0C000__000000914E3F38F0 000000067F00004002000140000000A08000-000000067F00004002000140000000A0C000__000000931B9A2710 000000067F00004002000140000000A0C000-000000067F00004002000140000000A10000__000000914E3F38F0 000000067F00004002000140000000A0C000-000000067F00004002000140000000A10000__000000931B9A2710 000000067F00004002000140000000A10000-000000067F00004002000140000000A14000__000000914E3F38F0 000000067F00004002000140000000A10000-000000067F00004002000140000000A14000__000000931B9A2710 000000067F00004002000140000000A103FD-000000067F00004002000140000000A18DD8__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A14000-000000067F00004002000140000000A18000__000000914E3F38F0 000000067F00004002000140000000A14000-000000067F00004002000140000000A18000__000000931B9A2710 000000067F00004002000140000000A18000-000000067F00004002000140000000A1C000__000000914E3F38F0 000000067F00004002000140000000A18000-000000067F00004002000140000000A1C000__000000931B9A2710 000000067F00004002000140000000A18DD8-000000067F00004002000140000000A217BD__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A1C000-000000067F00004002000140000000A20000__000000914E3F38F0 000000067F00004002000140000000A1C000-000000067F00004002000140000000A20000__000000931B9A2710 000000067F00004002000140000000A20000-000000067F00004002000140000000A24000__000000914E3F38F0 000000067F00004002000140000000A20000-000000067F00004002000140000000A24000__000000931B9A2710 000000067F00004002000140000000A217BD-000000067F00004002000140000000A2A192__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A24000-000000067F00004002000140000000A28000__000000914E3F38F0 000000067F00004002000140000000A24000-000000067F00004002000140000000A28000__000000931B9A2710 000000067F00004002000140000000A28000-000000067F00004002000140000000A2C000__000000914E3F38F0 
000000067F00004002000140000000A28000-000000067F00004002000140000000A2C000__000000931B9A2710 000000067F00004002000140000000A2A192-000000067F00004002000140000000A32B76__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A2C000-000000067F00004002000140000000A30000__000000914E3F38F0 000000067F00004002000140000000A2C000-000000067F00004002000140000000A30000__000000931B9A2710 000000067F00004002000140000000A30000-000000067F00004002000140000000A34000__000000914E3F38F0 000000067F00004002000140000000A30000-000000067F00004002000140000000A34000__000000931B9A2710 000000067F00004002000140000000A32B76-000000067F00004002000140000000A3B553__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A34000-000000067F00004002000140000000A38000__000000914E3F38F0 000000067F00004002000140000000A34000-000000067F00004002000140000000A38000__000000931B9A2710 000000067F00004002000140000000A38000-000000067F00004002000140000000A3C000__000000914E3F38F0 000000067F00004002000140000000A38000-000000067F00004002000140000000A3C000__000000931B9A2710 000000067F00004002000140000000A3B553-000000067F00004002000140000000A43F22__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A3C000-000000067F00004002000140000000A40000__000000914E3F38F0 000000067F00004002000140000000A3C000-000000067F00004002000140000000A40000__000000931B9A2710 000000067F00004002000140000000A40000-000000067F00004002000140000000A44000__000000914E3F38F0 000000067F00004002000140000000A40000-000000067F00004002000140000000A44000__000000931B9A2710 000000067F00004002000140000000A43F22-000000067F00004002000140000000A4C8FF__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A44000-000000067F00004002000140000000A48000__000000914E3F38F0 000000067F00004002000140000000A44000-000000067F00004002000140000000A48000__000000931B9A2710 000000067F00004002000140000000A48000-000000067F00004002000140000000A4C000__000000914E3F38F0 000000067F00004002000140000000A48000-000000067F00004002000140000000A4C000__000000931B9A2710 
000000067F00004002000140000000A4C000-000000067F00004002000140000000A50000__000000914E3F38F0 000000067F00004002000140000000A4C000-000000067F00004002000140000000A50000__000000931B9A2710 000000067F00004002000140000000A4C8FF-000000067F00004002000140000000A552E5__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A50000-000000067F00004002000140000000A54000__000000914E3F38F0 000000067F00004002000140000000A50000-000000067F00004002000140000000A54000__000000931B9A2710 000000067F00004002000140000000A54000-000000067F00004002000140000000A58000__000000914E3F38F0 000000067F00004002000140000000A54000-000000067F00004002000140000000A58000__000000931B9A2710 000000067F00004002000140000000A552E5-000000067F00004002000140000000A5DCCD__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A58000-000000067F00004002000140000000A5C000__000000914E3F38F0 000000067F00004002000140000000A58000-000000067F00004002000140000000A5C000__000000931B9A2710 000000067F00004002000140000000A5C000-000000067F00004002000140000000A60000__000000914E3F38F0 000000067F00004002000140000000A5C000-000000067F00004002000140000000A60000__000000931B9A2710 000000067F00004002000140000000A5DCCD-000000067F00004002000140000000A666AB__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A60000-000000067F00004002000140000000A64000__000000914E3F38F0 000000067F00004002000140000000A60000-000000067F00004002000140000000A64000__000000931B9A2710 000000067F00004002000140000000A64000-000000067F00004002000140000000A68000__000000914E3F38F0 000000067F00004002000140000000A64000-000000067F00004002000140000000A68000__000000931B9A2710 000000067F00004002000140000000A666AB-000000067F00004002000140000000A6F093__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A68000-000000067F00004002000140000000A6C000__000000914E3F38F0 000000067F00004002000140000000A68000-000000067F00004002000140000000A6C000__000000931B9A2710 000000067F00004002000140000000A6C000-000000067F00004002000140000000A70000__000000914E3F38F0 
000000067F00004002000140000000A6C000-000000067F00004002000140000000A70000__000000931B9A2710 000000067F00004002000140000000A6F093-000000067F00004002000140000000A77A6F__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A70000-000000067F00004002000140000000A74000__000000914E3F38F0 000000067F00004002000140000000A70000-000000067F00004002000140000000A74000__000000931B9A2710 000000067F00004002000140000000A74000-000000067F00004002000140000000A78000__000000914E3F38F0 000000067F00004002000140000000A74000-000000067F00004002000140000000A78000__000000931B9A2710 000000067F00004002000140000000A77A6F-000000067F00004002000140000000A80445__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A78000-000000067F00004002000140000000A7C000__000000914E3F38F0 000000067F00004002000140000000A78000-000000067F00004002000140000000A7C000__000000931B9A2710 000000067F00004002000140000000A7C000-000000067F00004002000140000000A80000__000000914E3F38F0 000000067F00004002000140000000A7C000-000000067F00004002000140000000A80000__000000931B9A2710 000000067F00004002000140000000A80000-000000067F00004002000140000000A84000__000000914E3F38F0 000000067F00004002000140000000A80000-000000067F00004002000140000000A84000__000000931B9A2710 000000067F00004002000140000000A80445-000000067F00004002000140000000A88E32__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A84000-000000067F00004002000140000000A88000__000000914E3F38F0 000000067F00004002000140000000A84000-000000067F00004002000140000000A88000__000000931B9A2710 000000067F00004002000140000000A88000-000000067F00004002000140000000A8C000__000000914E3F38F0 000000067F00004002000140000000A88000-000000067F00004002000140000000A8C000__000000931B9A2710 000000067F00004002000140000000A88E32-000000067F00004002000140000000A91804__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A8C000-000000067F00004002000140000000A90000__000000914E3F38F0 000000067F00004002000140000000A8C000-000000067F00004002000140000000A90000__000000931B9A2710 
000000067F00004002000140000000A90000-000000067F00004002000140000000A94000__000000914E3F38F0 000000067F00004002000140000000A90000-000000067F00004002000140000000A94000__000000931B9A2710 000000067F00004002000140000000A91804-000000067F00004002000140000000A9A1D9__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A94000-000000067F00004002000140000000A98000__000000914E3F38F0 000000067F00004002000140000000A94000-000000067F00004002000140000000A98000__000000931B9A2710 000000067F00004002000140000000A98000-000000067F00004002000140000000A9C000__000000914E3F38F0 000000067F00004002000140000000A98000-000000067F00004002000140000000A9C000__000000931B9A2710 000000067F00004002000140000000A9A1D9-000000067F00004002000140000000AA2BBC__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000A9C000-000000067F00004002000140000000AA0000__000000914E3F38F0 000000067F00004002000140000000A9C000-000000067F00004002000140000000AA0000__000000931B9A2710 000000067F00004002000140000000AA0000-000000067F00004002000140000000AA4000__000000914E3F38F0 000000067F00004002000140000000AA0000-000000067F00004002000140000000AA4000__000000931B9A2710 000000067F00004002000140000000AA2BBC-000000067F00004002000140000000AAB5A0__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000AA4000-000000067F00004002000140000000AA8000__000000914E3F38F0 000000067F00004002000140000000AA4000-000000067F00004002000140000000AA8000__000000931B9A2710 000000067F00004002000140000000AA8000-000000067F00004002000140000000AAC000__000000914E3F38F0 000000067F00004002000140000000AA8000-000000067F00004002000140000000AAC000__000000931B9A2710 000000067F00004002000140000000AAB5A0-000000067F00004002000140000000AB3F74__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000AAC000-000000067F00004002000140000000AB0000__000000914E3F38F0 000000067F00004002000140000000AAC000-000000067F00004002000140000000AB0000__000000931B9A2710 000000067F00004002000140000000AB0000-000000067F00004002000140000000AB4000__000000914E3F38F0 
000000067F00004002000140000000AB0000-000000067F00004002000140000000AB4000__000000931B9A2710 000000067F00004002000140000000AB3F74-000000067F00004002000140000000ABC949__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000AB4000-000000067F00004002000140000000AB8000__000000914E3F38F0 000000067F00004002000140000000AB4000-000000067F00004002000140000000AB8000__000000931B9A2710 000000067F00004002000140000000AB8000-000000067F00004002000140000000ABC000__000000914E3F38F0 000000067F00004002000140000000AB8000-000000067F00004002000140000000ABC000__000000931B9A2710 000000067F00004002000140000000ABC000-000000067F00004002000140000000AC0000__000000914E3F38F0 000000067F00004002000140000000ABC000-000000067F00004002000140000000AC0000__000000931B9A2710 000000067F00004002000140000000ABC949-000000067F00004002000140000000AC5324__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000AC0000-000000067F00004002000140000000AC4000__000000914E3F38F0 000000067F00004002000140000000AC0000-000000067F00004002000140000000AC4000__000000931B9A2710 000000067F00004002000140000000AC4000-000000067F00004002000140000000AC8000__000000914E3F38F0 000000067F00004002000140000000AC4000-000000067F00004002000140000000AC8000__000000931B9A2710 000000067F00004002000140000000AC5324-000000067F00004002000140000000ACDCFB__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000AC8000-000000067F00004002000140000000ACC000__000000914E3F38F0 000000067F00004002000140000000AC8000-000000067F00004002000140000000ACC000__000000931B9A2710 000000067F00004002000140000000ACC000-000000067F00004002000140000000AD0000__000000914E3F38F0 000000067F00004002000140000000ACC000-000000067F00004002000140000000AD0000__000000931B9A2710 000000067F00004002000140000000ACDCFB-000000067F00004002000140000000AD66E4__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000AD0000-000000067F00004002000140000000AD4000__000000914E3F38F0 000000067F00004002000140000000AD0000-000000067F00004002000140000000AD4000__000000931B9A2710 
000000067F00004002000140000000AD4000-000000067F00004002000140000000AD8000__000000914E3F38F0 000000067F00004002000140000000AD4000-000000067F00004002000140000000AD8000__000000931B9A2710 000000067F00004002000140000000AD66E4-000000067F00004002000140000000ADF0C3__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000AD8000-000000067F00004002000140000000ADC000__000000914E3F38F0 000000067F00004002000140000000AD8000-000000067F00004002000140000000ADC000__000000931B9A2710 000000067F00004002000140000000ADC000-000000067F00004002000140000000AE0000__000000914E3F38F0 000000067F00004002000140000000ADC000-000000067F00004002000140000000AE0000__000000931B9A2710 000000067F00004002000140000000ADF0C3-000000067F00004002000140000000AE7AA6__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000AE0000-000000067F00004002000140000000AE4000__000000914E3F38F0 000000067F00004002000140000000AE0000-000000067F00004002000140000000AE4000__000000931B9A2710 000000067F00004002000140000000AE4000-000000067F00004002000140000000AE8000__000000914E3F38F0 000000067F00004002000140000000AE4000-000000067F00004002000140000000AE8000__000000931B9A2710 000000067F00004002000140000000AE7AA6-000000067F00004002000140000000AF047C__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000AE8000-000000067F00004002000140000000AEC000__000000914E3F38F0 000000067F00004002000140000000AE8000-000000067F00004002000140000000AEC000__000000931B9A2710 000000067F00004002000140000000AEC000-000000067F00004002000140000000AF0000__000000914E3F38F0 000000067F00004002000140000000AEC000-000000067F00004002000140000000AF0000__000000931B9A2710 000000067F00004002000140000000AF0000-000000067F00004002000140000000AF4000__000000914E3F38F0 000000067F00004002000140000000AF0000-000000067F00004002000140000000AF4000__000000931B9A2710 000000067F00004002000140000000AF047C-000000067F00004002000140000000AF8E55__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000AF4000-000000067F00004002000140000000AF8000__000000914E3F38F0 
000000067F00004002000140000000AF4000-000000067F00004002000140000000AF8000__000000931B9A2710 000000067F00004002000140000000AF8000-000000067F00004002000140000000AFC000__000000914E3F38F0 000000067F00004002000140000000AF8000-000000067F00004002000140000000AFC000__000000931B9A2710 000000067F00004002000140000000AF8E55-000000067F00004002000140000000B0182C__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000AFC000-000000067F00004002000140000000B00000__000000914E3F38F0 000000067F00004002000140000000AFC000-000000067F00004002000140000000B00000__000000931B9A2710 000000067F00004002000140000000B00000-000000067F00004002000140000000B04000__000000914E3F38F0 000000067F00004002000140000000B00000-000000067F00004002000140000000B04000__000000931B9A2710 000000067F00004002000140000000B0182C-000000067F00004002000140000000B0A1FF__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B04000-000000067F00004002000140000000B08000__000000914E3F38F0 000000067F00004002000140000000B04000-000000067F00004002000140000000B08000__000000931B9A2710 000000067F00004002000140000000B08000-000000067F00004002000140000000B0C000__000000914E3F38F0 000000067F00004002000140000000B08000-000000067F00004002000140000000B0C000__000000931B9A2710 000000067F00004002000140000000B0A1FF-000000067F00004002000140000000B12BEC__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B0C000-000000067F00004002000140000000B10000__000000914E3F38F0 000000067F00004002000140000000B0C000-000000067F00004002000140000000B10000__000000931B9A2710 000000067F00004002000140000000B10000-000000067F00004002000140000000B14000__000000914E3F38F0 000000067F00004002000140000000B10000-000000067F00004002000140000000B14000__000000931B9A2710 000000067F00004002000140000000B12BEC-000000067F00004002000140000000B1B5CE__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B14000-000000067F00004002000140000000B18000__000000914E3F38F0 000000067F00004002000140000000B14000-000000067F00004002000140000000B18000__000000931B9A2710 
000000067F00004002000140000000B18000-000000067F00004002000140000000B1C000__000000914E3F38F0 000000067F00004002000140000000B18000-000000067F00004002000140000000B1C000__000000931B9A2710 000000067F00004002000140000000B1B5CE-000000067F00004002000140000000B23FC0__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B1C000-000000067F00004002000140000000B20000__000000914E3F38F0 000000067F00004002000140000000B1C000-000000067F00004002000140000000B20000__000000931B9A2710 000000067F00004002000140000000B20000-000000067F00004002000140000000B24000__000000914E3F38F0 000000067F00004002000140000000B20000-000000067F00004002000140000000B24000__000000931B9A2710 000000067F00004002000140000000B23FC0-000000067F00004002000140000000B2C997__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B24000-000000067F00004002000140000000B28000__000000914E3F38F0 000000067F00004002000140000000B24000-000000067F00004002000140000000B28000__000000931B9A2710 000000067F00004002000140000000B28000-000000067F00004002000140000000B2C000__000000914E3F38F0 000000067F00004002000140000000B28000-000000067F00004002000140000000B2C000__000000931B9A2710 000000067F00004002000140000000B2C000-000000067F00004002000140000000B30000__000000914E3F38F0 000000067F00004002000140000000B2C000-000000067F00004002000140000000B30000__000000931B9A2710 000000067F00004002000140000000B2C997-000000067F00004002000140000000B35371__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B30000-000000067F00004002000140000000B34000__000000914E3F38F0 000000067F00004002000140000000B30000-000000067F00004002000140000000B34000__000000931B9A2710 000000067F00004002000140000000B34000-000000067F00004002000140000000B38000__000000914E3F38F0 000000067F00004002000140000000B34000-000000067F00004002000140000000B38000__000000931B9A2710 000000067F00004002000140000000B35371-000000067F00004002000140000000B3DD41__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B38000-000000067F00004002000140000000B3C000__000000914E3F38F0 
000000067F00004002000140000000B38000-000000067F00004002000140000000B3C000__000000931B9A2710 000000067F00004002000140000000B3C000-000000067F00004002000140000000B40000__000000914E3F38F0 000000067F00004002000140000000B3C000-000000067F00004002000140000000B40000__000000931B9A2710 000000067F00004002000140000000B3DD41-000000067F00004002000140000000B46710__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B40000-000000067F00004002000140000000B44000__000000914E3F38F0 000000067F00004002000140000000B40000-000000067F00004002000140000000B44000__000000931B9A2710 000000067F00004002000140000000B44000-000000067F00004002000140000000B48000__000000914E3F38F0 000000067F00004002000140000000B44000-000000067F00004002000140000000B48000__000000931B9A2710 000000067F00004002000140000000B46710-000000067F00004002000140000000B4F0EB__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B48000-000000067F00004002000140000000B4C000__000000914E3F38F0 000000067F00004002000140000000B48000-000000067F00004002000140000000B4C000__000000931B9A2710 000000067F00004002000140000000B4C000-000000067F00004002000140000000B50000__000000914E3F38F0 000000067F00004002000140000000B4C000-000000067F00004002000140000000B50000__000000931B9A2710 000000067F00004002000140000000B4F0EB-000000067F00004002000140000000B57ACA__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B50000-000000067F00004002000140000000B54000__000000914E3F38F0 000000067F00004002000140000000B50000-000000067F00004002000140000000B54000__000000931B9A2710 000000067F00004002000140000000B54000-000000067F00004002000140000000B58000__000000914E3F38F0 000000067F00004002000140000000B54000-000000067F00004002000140000000B58000__000000931B9A2710 000000067F00004002000140000000B57ACA-000000067F00004002000140000000B604B4__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B58000-000000067F00004002000140000000B5C000__000000914E3F38F0 000000067F00004002000140000000B58000-000000067F00004002000140000000B5C000__000000931B9A2710 
000000067F00004002000140000000B5C000-000000067F00004002000140000000B60000__000000914E3F38F0 000000067F00004002000140000000B5C000-000000067F00004002000140000000B60000__000000931B9A2710 000000067F00004002000140000000B60000-000000067F00004002000140000000B64000__000000914E3F38F0 000000067F00004002000140000000B60000-000000067F00004002000140000000B64000__000000931B9A2710 000000067F00004002000140000000B604B4-000000067F00004002000140000000B68E85__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B64000-000000067F00004002000140000000B68000__000000914E3F38F0 000000067F00004002000140000000B64000-000000067F00004002000140000000B68000__000000931B9A2710 000000067F00004002000140000000B68000-000000067F00004002000140000000B6C000__000000914E3F38F0 000000067F00004002000140000000B68000-000000067F00004002000140000000B6C000__000000931B9A2710 000000067F00004002000140000000B68E85-000000067F00004002000140000000B71863__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B6C000-000000067F00004002000140000000B70000__000000914E3F38F0 000000067F00004002000140000000B6C000-000000067F00004002000140000000B70000__000000931B9A2710 000000067F00004002000140000000B70000-000000067F00004002000140000000B74000__000000914E3F38F0 000000067F00004002000140000000B70000-000000067F00004002000140000000B74000__000000931B9A2710 000000067F00004002000140000000B71863-000000067F00004002000140000000B7A239__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B74000-000000067F00004002000140000000B78000__000000914E3F38F0 000000067F00004002000140000000B74000-000000067F00004002000140000000B78000__000000931B9A2710 000000067F00004002000140000000B78000-000000067F00004002000140000000B7C000__000000914E3F38F0 000000067F00004002000140000000B78000-000000067F00004002000140000000B7C000__000000931B9A2710 000000067F00004002000140000000B7A239-000000067F00004002000140000000B82C06__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B7C000-000000067F00004002000140000000B80000__000000914E3F38F0 
000000067F00004002000140000000B7C000-000000067F00004002000140000000B80000__000000931B9A2710 000000067F00004002000140000000B80000-000000067F00004002000140000000B84000__000000914E3F38F0 000000067F00004002000140000000B80000-000000067F00004002000140000000B84000__000000931B9A2710 000000067F00004002000140000000B82C06-000000067F00004002000140000000B8B5E5__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B84000-000000067F00004002000140000000B88000__000000914E3F38F0 000000067F00004002000140000000B84000-000000067F00004002000140000000B88000__000000931B9A2710 000000067F00004002000140000000B88000-000000067F00004002000140000000B8C000__000000914E3F38F0 000000067F00004002000140000000B88000-000000067F00004002000140000000B8C000__000000931B9A2710 000000067F00004002000140000000B8B5E5-000000067F00004002000140000000B93FD3__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B8C000-000000067F00004002000140000000B90000__000000914E3F38F0 000000067F00004002000140000000B8C000-000000067F00004002000140000000B90000__000000931B9A2710 000000067F00004002000140000000B90000-000000067F00004002000140000000B94000__000000914E3F38F0 000000067F00004002000140000000B90000-000000067F00004002000140000000B94000__000000931B9A2710 000000067F00004002000140000000B93FD3-000000067F00004002000140000000B9C9B8__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000B94000-000000067F00004002000140000000B98000__000000914E3F38F0 000000067F00004002000140000000B94000-000000067F00004002000140000000B98000__000000931B9A2710 000000067F00004002000140000000B98000-000000067F00004002000140000000B9C000__000000914E3F38F0 000000067F00004002000140000000B98000-000000067F00004002000140000000B9C000__000000931B9A2710 000000067F00004002000140000000B9C000-000000067F00004002000140000000BA0000__000000914E3F38F0 000000067F00004002000140000000B9C000-000000067F00004002000140000000BA0000__000000931B9A2710 000000067F00004002000140000000B9C9B8-000000067F00004002000140000000BA538E__000000739A920D71-0000008D2DB5E0C1 
000000067F00004002000140000000BA0000-000000067F00004002000140000000BA4000__000000914E3F38F0 000000067F00004002000140000000BA0000-000000067F00004002000140000000BA4000__000000931B9A2710 000000067F00004002000140000000BA4000-000000067F00004002000140000000BA8000__000000914E3F38F0 000000067F00004002000140000000BA4000-000000067F00004002000140000000BA8000__000000931B9A2710 000000067F00004002000140000000BA538E-000000067F00004002000140000000BADD73__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000BA8000-000000067F00004002000140000000BAC000__000000914E3F38F0 000000067F00004002000140000000BA8000-000000067F00004002000140000000BAC000__000000931B9A2710 000000067F00004002000140000000BAC000-000000067F00004002000140000000BB0000__000000914E3F38F0 000000067F00004002000140000000BAC000-000000067F00004002000140000000BB0000__000000931B9A2710 000000067F00004002000140000000BADD73-000000067F00004002000140000000BB674C__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000BB0000-000000067F00004002000140000000BB4000__000000914E3F38F0 000000067F00004002000140000000BB0000-000000067F00004002000140000000BB4000__000000931B9A2710 000000067F00004002000140000000BB4000-000000067F00004002000140000000BB8000__000000914E3F38F0 000000067F00004002000140000000BB4000-000000067F00004002000140000000BB8000__000000931B9A2710 000000067F00004002000140000000BB674C-000000067F00004002000140000000BBF113__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000BB8000-000000067F00004002000140000000BBC000__000000914E3F38F0 000000067F00004002000140000000BB8000-000000067F00004002000140000000BBC000__000000931B9A2710 000000067F00004002000140000000BBC000-000000067F00004002000140000000BC0000__000000914E3F38F0 000000067F00004002000140000000BBC000-000000067F00004002000140000000BC0000__000000931B9A2710 000000067F00004002000140000000BBF113-000000067F00004002000140000000BC7AEE__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000BC0000-000000067F00004002000140000000BC4000__000000914E3F38F0 
000000067F00004002000140000000BC0000-000000067F00004002000140000000BC4000__000000931B9A2710 000000067F00004002000140000000BC4000-000000067F00004002000140000000BC8000__000000914E3F38F0 000000067F00004002000140000000BC4000-000000067F00004002000140000000BC8000__000000931B9A2710 000000067F00004002000140000000BC7AEE-000000067F00004002000140000000BD04E2__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000BC8000-000000067F00004002000140000000BCC000__000000914E3F38F0 000000067F00004002000140000000BC8000-000000067F00004002000140000000BCC000__000000931B9A2710 000000067F00004002000140000000BCC000-000000067F00004002000140000000BD0000__000000914E3F38F0 000000067F00004002000140000000BCC000-000000067F00004002000140000000BD0000__000000931B9A2710 000000067F00004002000140000000BD0000-000000067F00004002000140000000BD4000__000000914E3F38F0 000000067F00004002000140000000BD0000-000000067F00004002000140000000BD4000__000000931B9A2710 000000067F00004002000140000000BD04E2-000000067F00004002000140000000BD8EC2__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000BD4000-000000067F00004002000140000000BD8000__000000914E3F38F0 000000067F00004002000140000000BD4000-000000067F00004002000140000000BD8000__000000931B9A2710 000000067F00004002000140000000BD8000-000000067F00004002000140000000BDC000__000000914E3F38F0 000000067F00004002000140000000BD8000-000000067F00004002000140000000BDC000__000000931B9A2710 000000067F00004002000140000000BD8EC2-000000067F00004002000140000000BE18A8__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000BDC000-000000067F00004002000140000000BE0000__000000914E3F38F0 000000067F00004002000140000000BDC000-000000067F00004002000140000000BE0000__000000931B9A2710 000000067F00004002000140000000BE0000-000000067F00004002000140000000BE4000__000000914E3F38F0 000000067F00004002000140000000BE0000-000000067F00004002000140000000BE4000__000000931B9A2710 000000067F00004002000140000000BE18A8-000000067F00004002000140000000BEA27B__000000739A920D71-0000008D2DB5E0C1 
000000067F00004002000140000000BE4000-000000067F00004002000140000000BE8000__000000914E3F38F0 000000067F00004002000140000000BE4000-000000067F00004002000140000000BE8000__000000931B9A2710 000000067F00004002000140000000BE8000-000000067F00004002000140000000BEC000__000000914E3F38F0 000000067F00004002000140000000BE8000-000000067F00004002000140000000BEC000__000000931B9A2710 000000067F00004002000140000000BEA27B-000000067F00004002000140000000BF2C4E__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000BEC000-000000067F00004002000140000000BF0000__000000914E3F38F0 000000067F00004002000140000000BEC000-000000067F00004002000140000000BF0000__000000931B9A2710 000000067F00004002000140000000BF0000-000000067F00004002000140000000BF4000__000000914E3F38F0 000000067F00004002000140000000BF0000-000000067F00004002000140000000BF4000__000000931B9A2710 000000067F00004002000140000000BF2C4E-000000067F00004002000140000000BFB624__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000BF4000-000000067F00004002000140000000BF8000__000000914E3F38F0 000000067F00004002000140000000BF4000-000000067F00004002000140000000BF8000__000000931B9A2710 000000067F00004002000140000000BF8000-000000067F00004002000140000000BFC000__000000914E3F38F0 000000067F00004002000140000000BF8000-000000067F00004002000140000000BFC000__000000931B9A2710 000000067F00004002000140000000BFB624-000000067F00004002000140000000C04004__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000BFC000-000000067F00004002000140000000C00000__000000914E3F38F0 000000067F00004002000140000000BFC000-000000067F00004002000140000000C00000__000000931B9A2710 000000067F00004002000140000000C00000-000000067F00004002000140000000C04000__000000914E3F38F0 000000067F00004002000140000000C00000-000000067F00004002000140000000C04000__000000931B9A2710 000000067F00004002000140000000C04000-000000067F00004002000140000000C08000__000000914E3F38F0 000000067F00004002000140000000C04000-000000067F00004002000140000000C08000__000000931B9A2710 
000000067F00004002000140000000C04004-000000067F00004002000140000000C0C9DC__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C08000-000000067F00004002000140000000C0C000__000000914E3F38F0 000000067F00004002000140000000C08000-000000067F00004002000140000000C0C000__000000931B9A2710 000000067F00004002000140000000C0C000-000000067F00004002000140000000C10000__000000914E3F38F0 000000067F00004002000140000000C0C000-000000067F00004002000140000000C10000__000000931B9A2710 000000067F00004002000140000000C0C9DC-000000067F00004002000140000000C153C5__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C10000-000000067F00004002000140000000C14000__000000914E3F38F0 000000067F00004002000140000000C10000-000000067F00004002000140000000C14000__000000931B9A2710 000000067F00004002000140000000C14000-000000067F00004002000140000000C18000__000000914E3F38F0 000000067F00004002000140000000C14000-000000067F00004002000140000000C18000__000000931B9A2710 000000067F00004002000140000000C153C5-000000067F00004002000140000000C1DDA3__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C18000-000000067F00004002000140000000C1C000__000000914E3F38F0 000000067F00004002000140000000C18000-000000067F00004002000140000000C1C000__000000931B9A2710 000000067F00004002000140000000C1C000-000000067F00004002000140000000C20000__000000914E3F38F0 000000067F00004002000140000000C1C000-000000067F00004002000140000000C20000__000000931B9A2710 000000067F00004002000140000000C1DDA3-000000067F00004002000140000000C2677D__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C20000-000000067F00004002000140000000C24000__000000914E3F38F0 000000067F00004002000140000000C20000-000000067F00004002000140000000C24000__000000931B9A2710 000000067F00004002000140000000C24000-000000067F00004002000140000000C28000__000000914E3F38F0 000000067F00004002000140000000C24000-000000067F00004002000140000000C28000__000000931B9A2710 
000000067F00004002000140000000C2677D-000000067F00004002000140000000C2F155__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C28000-000000067F00004002000140000000C2C000__000000914E3F38F0 000000067F00004002000140000000C28000-000000067F00004002000140000000C2C000__000000931B9A2710 000000067F00004002000140000000C2C000-000000067F00004002000140000000C30000__000000914E3F38F0 000000067F00004002000140000000C2C000-000000067F00004002000140000000C30000__000000931B9A2710 000000067F00004002000140000000C2F155-000000067F00004002000140000000C37B30__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C30000-000000067F00004002000140000000C34000__000000914E3F38F0 000000067F00004002000140000000C30000-000000067F00004002000140000000C34000__000000931B9A2710 000000067F00004002000140000000C34000-000000067F00004002000140000000C38000__000000914E3F38F0 000000067F00004002000140000000C34000-000000067F00004002000140000000C38000__000000931B9A2710 000000067F00004002000140000000C37B30-000000067F00004002000140000000C4050D__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C38000-000000067F00004002000140000000C3C000__000000914E3F38F0 000000067F00004002000140000000C38000-000000067F00004002000140000000C3C000__000000931B9A2710 000000067F00004002000140000000C3C000-000000067F00004002000140000000C40000__000000914E3F38F0 000000067F00004002000140000000C3C000-000000067F00004002000140000000C40000__000000931B9A2710 000000067F00004002000140000000C40000-000000067F00004002000140000000C44000__000000914E3F38F0 000000067F00004002000140000000C40000-000000067F00004002000140000000C44000__000000931B9A2710 000000067F00004002000140000000C4050D-000000067F00004002000140000000C48EEF__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C44000-000000067F00004002000140000000C48000__000000914E3F38F0 000000067F00004002000140000000C44000-000000067F00004002000140000000C48000__000000931B9A2710 000000067F00004002000140000000C48000-000000067F00004002000140000000C4C000__000000914E3F38F0 
000000067F00004002000140000000C48000-000000067F00004002000140000000C4C000__000000931B9A2710 000000067F00004002000140000000C48EEF-000000067F00004002000140000000C518D3__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C4C000-000000067F00004002000140000000C50000__000000914E3F38F0 000000067F00004002000140000000C4C000-000000067F00004002000140000000C50000__000000931B9A2710 000000067F00004002000140000000C50000-000000067F00004002000140000000C54000__000000914E3F38F0 000000067F00004002000140000000C50000-000000067F00004002000140000000C54000__000000931B9A2710 000000067F00004002000140000000C518D3-000000067F00004002000140000000C5A2AB__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C54000-000000067F00004002000140000000C58000__000000914E3F38F0 000000067F00004002000140000000C54000-000000067F00004002000140000000C58000__000000931B9A2710 000000067F00004002000140000000C58000-000000067F00004002000140000000C5C000__000000914E3F38F0 000000067F00004002000140000000C58000-000000067F00004002000140000000C5C000__000000931B9A2710 000000067F00004002000140000000C5A2AB-000000067F00004002000140000000C62C8E__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C5C000-000000067F00004002000140000000C60000__000000914E3F38F0 000000067F00004002000140000000C5C000-000000067F00004002000140000000C60000__000000931B9A2710 000000067F00004002000140000000C60000-000000067F00004002000140000000C64000__000000914E3F38F0 000000067F00004002000140000000C60000-000000067F00004002000140000000C64000__000000931B9A2710 000000067F00004002000140000000C62C8E-000000067F00004002000140000000C6B65C__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C64000-000000067F00004002000140000000C68000__000000914E3F38F0 000000067F00004002000140000000C64000-000000067F00004002000140000000C68000__000000931B9A2710 000000067F00004002000140000000C68000-000000067F00004002000140000000C6C000__000000914E3F38F0 000000067F00004002000140000000C68000-000000067F00004002000140000000C6C000__000000931B9A2710 
000000067F00004002000140000000C6B65C-000000067F00004002000140000000C74040__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C6C000-000000067F00004002000140000000C70000__000000914E3F38F0 000000067F00004002000140000000C6C000-000000067F00004002000140000000C70000__000000931B9A2710 000000067F00004002000140000000C70000-000000067F00004002000140000000C74000__000000914E3F38F0 000000067F00004002000140000000C70000-000000067F00004002000140000000C74000__000000931B9A2710 000000067F00004002000140000000C74000-000000067F00004002000140000000C78000__000000914E3F38F0 000000067F00004002000140000000C74000-000000067F00004002000140000000C78000__000000931B9A2710 000000067F00004002000140000000C74040-000000067F00004002000140000000C7CA16__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C78000-000000067F00004002000140000000C7C000__000000914E3F38F0 000000067F00004002000140000000C78000-000000067F00004002000140000000C7C000__000000931B9A2710 000000067F00004002000140000000C7C000-000000067F00004002000140000000C80000__000000914E3F38F0 000000067F00004002000140000000C7C000-000000067F00004002000140000000C80000__000000931B9A2710 000000067F00004002000140000000C7CA16-000000067F00004002000140000000C853EF__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C80000-000000067F00004002000140000000C84000__000000914E3F38F0 000000067F00004002000140000000C80000-000000067F00004002000140000000C84000__000000931B9A2710 000000067F00004002000140000000C84000-000000067F00004002000140000000C88000__000000914E3F38F0 000000067F00004002000140000000C84000-000000067F00004002000140000000C88000__000000931B9A2710 000000067F00004002000140000000C853EF-000000067F00004002000140000000C8DDCD__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C88000-000000067F00004002000140000000C8C000__000000914E3F38F0 000000067F00004002000140000000C88000-000000067F00004002000140000000C8C000__000000931B9A2710 000000067F00004002000140000000C8C000-000000067F00004002000140000000C90000__000000914E3F38F0 
000000067F00004002000140000000C8C000-000000067F00004002000140000000C90000__000000931B9A2710 000000067F00004002000140000000C8DDCD-000000067F00004002000140000000C967AD__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C90000-000000067F00004002000140000000C94000__000000914E3F38F0 000000067F00004002000140000000C90000-000000067F00004002000140000000C94000__000000931B9A2710 000000067F00004002000140000000C94000-000000067F00004002000140000000C98000__000000914E3F38F0 000000067F00004002000140000000C94000-000000067F00004002000140000000C98000__000000931B9A2710 000000067F00004002000140000000C967AD-000000067F00004002000140000000C9F189__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000C98000-000000067F00004002000140000000C9C000__000000914E3F38F0 000000067F00004002000140000000C98000-000000067F00004002000140000000C9C000__000000931B9A2710 000000067F00004002000140000000C9C000-000000067F00004002000140000000CA0000__000000914E3F38F0 000000067F00004002000140000000C9C000-000000067F00004002000140000000CA0000__000000931B9A2710 000000067F00004002000140000000C9F189-000000067F00004002000140000000CA7B70__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000CA0000-000000067F00004002000140000000CA4000__000000914E3F38F0 000000067F00004002000140000000CA0000-000000067F00004002000140000000CA4000__000000931B9A2710 000000067F00004002000140000000CA4000-000000067F00004002000140000000CA8000__000000914E3F38F0 000000067F00004002000140000000CA4000-000000067F00004002000140000000CA8000__000000931B9A2710 000000067F00004002000140000000CA7B70-000000067F00004002000140000000CB0544__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000CA8000-000000067F00004002000140000000CAC000__000000914E3F38F0 000000067F00004002000140000000CA8000-000000067F00004002000140000000CAC000__000000931B9A2710 000000067F00004002000140000000CAC000-000000067F00004002000140000000CB0000__000000914E3F38F0 000000067F00004002000140000000CAC000-000000067F00004002000140000000CB0000__000000931B9A2710 
000000067F00004002000140000000CB0000-000000067F00004002000140000000CB4000__000000914E3F38F0 000000067F00004002000140000000CB0000-000000067F00004002000140000000CB4000__000000931B9A2710 000000067F00004002000140000000CB0544-000000067F00004002000140000000CB8F24__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000CB4000-000000067F00004002000140000000CB8000__000000914E3F38F0 000000067F00004002000140000000CB4000-000000067F00004002000140000000CB8000__000000931B9A2710 000000067F00004002000140000000CB8000-000000067F00004002000140000000CBC000__000000914E3F38F0 000000067F00004002000140000000CB8000-000000067F00004002000140000000CBC000__000000931B9A2710 000000067F00004002000140000000CB8F24-000000067F00004002000140000000CC1904__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000CBC000-000000067F00004002000140000000CC0000__000000914E3F38F0 000000067F00004002000140000000CBC000-000000067F00004002000140000000CC0000__000000931B9A2710 000000067F00004002000140000000CC0000-000000067F00004002000140000000CC4000__000000914E3F38F0 000000067F00004002000140000000CC0000-000000067F00004002000140000000CC4000__000000931B9A2710 000000067F00004002000140000000CC1904-000000067F00004002000140000000CCA2D5__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000CC4000-000000067F00004002000140000000CC8000__000000914E3F38F0 000000067F00004002000140000000CC4000-000000067F00004002000140000000CC8000__000000931B9A2710 000000067F00004002000140000000CC8000-000000067F00004002000140000000CCC000__000000914E3F38F0 000000067F00004002000140000000CC8000-000000067F00004002000140000000CCC000__000000931B9A2710 000000067F00004002000140000000CCA2D5-000000067F00004002000140000000CD2CB2__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000CCC000-000000067F00004002000140000000CD0000__000000914E3F38F0 000000067F00004002000140000000CCC000-000000067F00004002000140000000CD0000__000000931B9A2710 000000067F00004002000140000000CD0000-000000067F00004002000140000000CD4000__000000914E3F38F0 
000000067F00004002000140000000CD0000-000000067F00004002000140000000CD4000__000000931B9A2710 000000067F00004002000140000000CD2CB2-000000067F00004002000140000000CDB695__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000CD4000-000000067F00004002000140000000CD8000__000000914E3F38F0 000000067F00004002000140000000CD4000-000000067F00004002000140000000CD8000__000000931B9A2710 000000067F00004002000140000000CD8000-000000067F00004002000140000000CDC000__000000914E3F38F0 000000067F00004002000140000000CD8000-000000067F00004002000140000000CDC000__000000931B9A2710 000000067F00004002000140000000CDB695-000000067F00004002000140000000CE4071__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000CDC000-000000067F00004002000140000000CE0000__000000914E3F38F0 000000067F00004002000140000000CDC000-000000067F00004002000140000000CE0000__000000931B9A2710 000000067F00004002000140000000CE0000-000000067F00004002000140000000CE4000__000000914E3F38F0 000000067F00004002000140000000CE0000-000000067F00004002000140000000CE4000__000000931B9A2710 000000067F00004002000140000000CE4000-000000067F00004002000140000000CE8000__000000914E3F38F0 000000067F00004002000140000000CE4000-000000067F00004002000140000000CE8000__000000931B9A2710 000000067F00004002000140000000CE4071-000000067F00004002000140000000CECA49__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000CE8000-000000067F00004002000140000000CEC000__000000914E3F38F0 000000067F00004002000140000000CE8000-000000067F00004002000140000000CEC000__000000931B9A2710 000000067F00004002000140000000CEC000-000000067F00004002000140000000CF0000__000000914E3F38F0 000000067F00004002000140000000CEC000-000000067F00004002000140000000CF0000__000000931B9A2710 000000067F00004002000140000000CECA49-000000067F00004002000140000000CF5427__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000CF0000-000000067F00004002000140000000CF4000__000000914E3F38F0 000000067F00004002000140000000CF0000-000000067F00004002000140000000CF4000__000000931B9A2710 
000000067F00004002000140000000CF4000-000000067F00004002000140000000CF8000__000000914E3F38F0 000000067F00004002000140000000CF4000-000000067F00004002000140000000CF8000__000000931B9A2710 000000067F00004002000140000000CF5427-000000067F00004002000140000000CFDE02__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000CF8000-000000067F00004002000140000000CFC000__000000914E3F38F0 000000067F00004002000140000000CF8000-000000067F00004002000140000000CFC000__000000931B9A2710 000000067F00004002000140000000CFC000-000000067F00004002000140000000D00000__000000914E3F38F0 000000067F00004002000140000000CFC000-000000067F00004002000140000000D00000__000000931B9A2710 000000067F00004002000140000000CFDE02-000000067F00004002000140000000D067CC__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D00000-000000067F00004002000140000000D04000__000000914E3F38F0 000000067F00004002000140000000D00000-000000067F00004002000140000000D04000__000000931B9A2710 000000067F00004002000140000000D04000-000000067F00004002000140000000D08000__000000914E3F38F0 000000067F00004002000140000000D04000-000000067F00004002000140000000D08000__000000931B9A2710 000000067F00004002000140000000D067CC-000000067F00004002000140000000D0F1C4__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D08000-000000067F00004002000140000000D0C000__000000914E3F38F0 000000067F00004002000140000000D08000-000000067F00004002000140000000D0C000__000000931B9A2710 000000067F00004002000140000000D0C000-000000067F00004002000140000000D10000__000000914E3F38F0 000000067F00004002000140000000D0C000-000000067F00004002000140000000D10000__000000931B9A2710 000000067F00004002000140000000D0F1C4-000000067F00004002000140000000D17B9F__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D10000-000000067F00004002000140000000D14000__000000914E3F38F0 000000067F00004002000140000000D10000-000000067F00004002000140000000D14000__000000931B9A2710 000000067F00004002000140000000D14000-000000067F00004002000140000000D18000__000000914E3F38F0 
000000067F00004002000140000000D14000-000000067F00004002000140000000D18000__000000931B9A2710 000000067F00004002000140000000D17B9F-000000067F00004002000140000000D2057B__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D18000-000000067F00004002000140000000D1C000__000000914E3F38F0 000000067F00004002000140000000D18000-000000067F00004002000140000000D1C000__000000931B9A2710 000000067F00004002000140000000D1C000-000000067F00004002000140000000D20000__000000914E3F38F0 000000067F00004002000140000000D1C000-000000067F00004002000140000000D20000__000000931B9A2710 000000067F00004002000140000000D20000-000000067F00004002000140000000D24000__000000914E3F38F0 000000067F00004002000140000000D20000-000000067F00004002000140000000D24000__000000931B9A2710 000000067F00004002000140000000D2057B-000000067F00004002000140000000D28F4A__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D24000-000000067F00004002000140000000D28000__000000914E3F38F0 000000067F00004002000140000000D24000-000000067F00004002000140000000D28000__000000931B9A2710 000000067F00004002000140000000D28000-000000067F00004002000140000000D2C000__000000914E3F38F0 000000067F00004002000140000000D28000-000000067F00004002000140000000D2C000__000000931B9A2710 000000067F00004002000140000000D28F4A-000000067F00004002000140000000D31928__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D2C000-000000067F00004002000140000000D30000__000000914E3F38F0 000000067F00004002000140000000D2C000-000000067F00004002000140000000D30000__000000931B9A2710 000000067F00004002000140000000D30000-000000067F00004002000140000000D34000__000000914E3F38F0 000000067F00004002000140000000D30000-000000067F00004002000140000000D34000__000000931B9A2710 000000067F00004002000140000000D31928-000000067F00004002000140000000D3A302__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D34000-000000067F00004002000140000000D38000__000000914E3F38F0 000000067F00004002000140000000D34000-000000067F00004002000140000000D38000__000000931B9A2710 
000000067F00004002000140000000D38000-000000067F00004002000140000000D3C000__000000914E3F38F0 000000067F00004002000140000000D38000-000000067F00004002000140000000D3C000__000000931B9A2710 000000067F00004002000140000000D3A302-000000067F00004002000140000000D42CCC__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D3C000-000000067F00004002000140000000D40000__000000914E3F38F0 000000067F00004002000140000000D3C000-000000067F00004002000140000000D40000__000000931B9A2710 000000067F00004002000140000000D40000-000000067F00004002000140000000D44000__000000914E3F38F0 000000067F00004002000140000000D40000-000000067F00004002000140000000D44000__000000931B9A2710 000000067F00004002000140000000D42CCC-000000067F00004002000140000000D4B6AE__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D44000-000000067F00004002000140000000D48000__000000914E3F38F0 000000067F00004002000140000000D44000-000000067F00004002000140000000D48000__000000931B9A2710 000000067F00004002000140000000D48000-000000067F00004002000140000000D4C000__000000914E3F38F0 000000067F00004002000140000000D48000-000000067F00004002000140000000D4C000__000000931B9A2710 000000067F00004002000140000000D4B6AE-000000067F00004002000140000000D5408F__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D4C000-000000067F00004002000140000000D50000__000000914E3F38F0 000000067F00004002000140000000D4C000-000000067F00004002000140000000D50000__000000931B9A2710 000000067F00004002000140000000D50000-000000067F00004002000140000000D54000__000000914E3F38F0 000000067F00004002000140000000D50000-000000067F00004002000140000000D54000__000000931B9A2710 000000067F00004002000140000000D54000-000000067F00004002000140000000D58000__000000914E3F38F0 000000067F00004002000140000000D54000-000000067F00004002000140000000D58000__000000931B9A2710 000000067F00004002000140000000D5408F-000000067F00004002000140000000D5CA69__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D58000-000000067F00004002000140000000D5C000__000000914E3F38F0 
000000067F00004002000140000000D58000-000000067F00004002000140000000D5C000__000000931B9A2710 000000067F00004002000140000000D5C000-000000067F00004002000140000000D60000__000000914E3F38F0 000000067F00004002000140000000D5C000-000000067F00004002000140000000D60000__000000931B9A2710 000000067F00004002000140000000D5CA69-000000067F00004002000140000000D6543E__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D60000-000000067F00004002000140000000D64000__000000914E3F38F0 000000067F00004002000140000000D60000-000000067F00004002000140000000D64000__000000931B9A2710 000000067F00004002000140000000D64000-000000067F00004002000140000000D68000__000000914E3F38F0 000000067F00004002000140000000D64000-000000067F00004002000140000000D68000__000000931B9A2710 000000067F00004002000140000000D6543E-000000067F00004002000140000000D6DE1B__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D68000-000000067F00004002000140000000D6C000__000000914E3F38F0 000000067F00004002000140000000D68000-000000067F00004002000140000000D6C000__000000931B9A2710 000000067F00004002000140000000D6C000-000000067F00004002000140000000D70000__000000914E3F38F0 000000067F00004002000140000000D6C000-000000067F00004002000140000000D70000__000000931B9A2710 000000067F00004002000140000000D6DE1B-000000067F00004002000140000000D767FA__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D70000-000000067F00004002000140000000D74000__000000914E3F38F0 000000067F00004002000140000000D70000-000000067F00004002000140000000D74000__000000931B9A2710 000000067F00004002000140000000D74000-000000067F00004002000140000000D78000__000000914E3F38F0 000000067F00004002000140000000D74000-000000067F00004002000140000000D78000__000000931B9A2710 000000067F00004002000140000000D767FA-000000067F00004002000140000000D7F1DD__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D78000-000000067F00004002000140000000D7C000__000000914E3F38F0 000000067F00004002000140000000D78000-000000067F00004002000140000000D7C000__000000931B9A2710 
000000067F00004002000140000000D7C000-000000067F00004002000140000000D80000__000000914E3F38F0 000000067F00004002000140000000D7C000-000000067F00004002000140000000D80000__000000931B9A2710 000000067F00004002000140000000D7F1DD-000000067F00004002000140000000D87BBA__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D80000-000000067F00004002000140000000D84000__000000914E3F38F0 000000067F00004002000140000000D80000-000000067F00004002000140000000D84000__000000931B9A2710 000000067F00004002000140000000D84000-000000067F00004002000140000000D88000__000000914E3F38F0 000000067F00004002000140000000D84000-000000067F00004002000140000000D88000__000000931B9A2710 000000067F00004002000140000000D87BBA-000000067F00004002000140000000D9059C__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D88000-000000067F00004002000140000000D8C000__000000914E3F38F0 000000067F00004002000140000000D88000-000000067F00004002000140000000D8C000__000000931B9A2710 000000067F00004002000140000000D8C000-000000067F00004002000140000000D90000__000000914E3F38F0 000000067F00004002000140000000D8C000-000000067F00004002000140000000D90000__000000931B9A2710 000000067F00004002000140000000D90000-000000067F00004002000140000000D94000__000000914E3F38F0 000000067F00004002000140000000D90000-000000067F00004002000140000000D94000__000000931B9A2710 000000067F00004002000140000000D9059C-000000067F00004002000140000000D98F7F__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D94000-000000067F00004002000140000000D98000__000000914E3F38F0 000000067F00004002000140000000D94000-000000067F00004002000140000000D98000__000000931B9A2710 000000067F00004002000140000000D98000-000000067F00004002000140000000D9C000__000000914E3F38F0 000000067F00004002000140000000D98000-000000067F00004002000140000000D9C000__000000931B9A2710 000000067F00004002000140000000D98F7F-000000067F00004002000140000000DA1953__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000D9C000-000000067F00004002000140000000DA0000__000000914E3F38F0 
000000067F00004002000140000000D9C000-000000067F00004002000140000000DA0000__000000931B9A2710 000000067F00004002000140000000DA0000-000000067F00004002000140000000DA4000__000000914E3F38F0 000000067F00004002000140000000DA0000-000000067F00004002000140000000DA4000__000000931B9A2710 000000067F00004002000140000000DA1953-000000067F00004002000140000000DAA32D__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000DA4000-000000067F00004002000140000000DA8000__000000914E3F38F0 000000067F00004002000140000000DA4000-000000067F00004002000140000000DA8000__000000931B9A2710 000000067F00004002000140000000DA8000-000000067F00004002000140000000DAC000__000000914E3F38F0 000000067F00004002000140000000DA8000-000000067F00004002000140000000DAC000__000000931B9A2710 000000067F00004002000140000000DAA32D-000000067F00004002000140000000DB2D0E__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000DAC000-000000067F00004002000140000000DB0000__000000914E3F38F0 000000067F00004002000140000000DAC000-000000067F00004002000140000000DB0000__000000931B9A2710 000000067F00004002000140000000DB0000-000000067F00004002000140000000DB4000__000000914E3F38F0 000000067F00004002000140000000DB0000-000000067F00004002000140000000DB4000__000000931B9A2710 000000067F00004002000140000000DB2D0E-000000067F00004002000140000000DBB6DF__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000DB4000-000000067F00004002000140000000DB8000__000000914E3F38F0 000000067F00004002000140000000DB4000-000000067F00004002000140000000DB8000__000000931B9A2710 000000067F00004002000140000000DB8000-000000067F00004002000140000000DBC000__000000914E3F38F0 000000067F00004002000140000000DB8000-000000067F00004002000140000000DBC000__000000931B9A2710 000000067F00004002000140000000DBB6DF-000000067F00004002000140000000DC40C3__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000DBC000-000000067F00004002000140000000DC0000__000000914E3F38F0 000000067F00004002000140000000DBC000-000000067F00004002000140000000DC0000__000000931B9A2710 
000000067F00004002000140000000DC0000-000000067F00004002000140000000DC4000__000000914E3F38F0 000000067F00004002000140000000DC0000-000000067F00004002000140000000DC4000__000000931B9A2710 000000067F00004002000140000000DC4000-000000067F00004002000140000000DC8000__000000914E3F38F0 000000067F00004002000140000000DC4000-000000067F00004002000140000000DC8000__000000931B9A2710 000000067F00004002000140000000DC40C3-000000067F00004002000140000000DCCAA7__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000DC8000-000000067F00004002000140000000DCC000__000000914E3F38F0 000000067F00004002000140000000DC8000-000000067F00004002000140000000DCC000__000000931B9A2710 000000067F00004002000140000000DCC000-000000067F00004002000140000000DD0000__000000914E3F38F0 000000067F00004002000140000000DCC000-000000067F00004002000140000000DD0000__000000931B9A2710 000000067F00004002000140000000DCCAA7-000000067F00004002000140000000DD2050__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000000DD0000-000000067F00004002000140000000DD4000__000000914E3F38F0 000000067F00004002000140000000DD0000-000000067F00004002000140000000DD4000__000000931B9A2710 000000067F00004002000140000000DD2050-000000067F00004002000140000000DDAA27__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000DD4000-000000067F00004002000140000000DD8000__000000914E3F38F0 000000067F00004002000140000000DD4000-000000067F00004002000140000000DD8000__000000931B9A2710 000000067F00004002000140000000DD8000-000000067F00004002000140000000DDC000__000000914E3F38F0 000000067F00004002000140000000DD8000-000000067F00004002000140000000DDC000__000000931B9A2710 000000067F00004002000140000000DDAA27-000000067F00004002000140000000DE3401__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000DDC000-000000067F00004002000140000000DE0000__000000914E3F38F0 000000067F00004002000140000000DDC000-000000067F00004002000140000000DE0000__000000931B9A2710 000000067F00004002000140000000DE0000-000000067F00004002000140000000DE4000__000000914E3F38F0 
000000067F00004002000140000000DE0000-000000067F00004002000140000000DE4000__000000931B9A2710 000000067F00004002000140000000DE3401-000000067F00004002000140000000DEBDCD__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000DE4000-000000067F00004002000140000000DE8000__000000914E3F38F0 000000067F00004002000140000000DE4000-000000067F00004002000140000000DE8000__000000931B9A2710 000000067F00004002000140000000DE8000-000000067F00004002000140000000DEC000__000000914E3F38F0 000000067F00004002000140000000DE8000-000000067F00004002000140000000DEC000__000000931B9A2710 000000067F00004002000140000000DEBDCD-000000067F00004002000140000000DF47AF__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000DEC000-000000067F00004002000140000000DF0000__000000914E3F38F0 000000067F00004002000140000000DEC000-000000067F00004002000140000000DF0000__000000931B9A2710 000000067F00004002000140000000DF0000-000000067F00004002000140000000DF4000__000000914E3F38F0 000000067F00004002000140000000DF0000-000000067F00004002000140000000DF4000__000000931B9A2710 000000067F00004002000140000000DF4000-000000067F00004002000140000000DF8000__000000914E3F38F0 000000067F00004002000140000000DF4000-000000067F00004002000140000000DF8000__000000931B9A2710 000000067F00004002000140000000DF47AF-000000067F00004002000140000000DFD196__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000DF8000-000000067F00004002000140000000DFC000__000000914E3F38F0 000000067F00004002000140000000DF8000-000000067F00004002000140000000DFC000__000000931B9A2710 000000067F00004002000140000000DFC000-000000067F00004002000140000000E00000__000000914E3F38F0 000000067F00004002000140000000DFC000-000000067F00004002000140000000E00000__000000931B9A2710 000000067F00004002000140000000DFD196-000000067F00004002000140000000E05B74__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000E00000-000000067F00004002000140000000E04000__000000914E3F38F0 000000067F00004002000140000000E00000-000000067F00004002000140000000E04000__000000931B9A2710 
000000067F00004002000140000000E04000-000000067F00004002000140000000E08000__000000914E3F38F0 000000067F00004002000140000000E04000-000000067F00004002000140000000E08000__000000931B9A2710 000000067F00004002000140000000E05B74-000000067F00004002000140000000E0E54D__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000E08000-000000067F00004002000140000000E0C000__000000914E3F38F0 000000067F00004002000140000000E08000-000000067F00004002000140000000E0C000__000000931B9A2710 000000067F00004002000140000000E0C000-000000067F00004002000140000000E10000__000000914E3F38F0 000000067F00004002000140000000E0C000-000000067F00004002000140000000E10000__000000931B9A2710 000000067F00004002000140000000E0E54D-000000067F00004002000140000000E16F24__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000E10000-000000067F00004002000140000000E14000__000000914E3F38F0 000000067F00004002000140000000E10000-000000067F00004002000140000000E14000__000000931B9A2710 000000067F00004002000140000000E14000-000000067F00004002000140000000E18000__000000914E3F38F0 000000067F00004002000140000000E14000-000000067F00004002000140000000E18000__000000931B9A2710 000000067F00004002000140000000E16F24-000000067F00004002000140000000E1F8FB__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000E18000-000000067F00004002000140000000E1C000__000000914E3F38F0 000000067F00004002000140000000E18000-000000067F00004002000140000000E1C000__000000931B9A2710 000000067F00004002000140000000E1C000-000000067F00004002000140000000E20000__000000914E3F38F0 000000067F00004002000140000000E1C000-000000067F00004002000140000000E20000__000000931B9A2710 000000067F00004002000140000000E1F8FB-000000067F00004002000140000000E282CC__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000E20000-000000067F00004002000140000000E24000__000000914E3F38F0 000000067F00004002000140000000E20000-000000067F00004002000140000000E24000__000000931B9A2710 000000067F00004002000140000000E24000-000000067F00004002000140000000E28000__000000914E3F38F0 
000000067F00004002000140000000E24000-000000067F00004002000140000000E28000__000000931B9A2710 000000067F00004002000140000000E28000-000000067F00004002000140000000E2C000__000000914E3F38F0 000000067F00004002000140000000E28000-000000067F00004002000140000000E2C000__000000931B9A2710 000000067F00004002000140000000E282CC-000000067F00004002000140000000E30CB1__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000E2C000-000000067F00004002000140000000E30000__000000914E3F38F0 000000067F00004002000140000000E2C000-000000067F00004002000140000000E30000__000000931B9A2710 000000067F00004002000140000000E30000-000000067F00004002000140000000E34000__000000914E3F38F0 000000067F00004002000140000000E30000-000000067F00004002000140000000E34000__000000931B9A2710 000000067F00004002000140000000E30CB1-000000067F00004002000140000000E39694__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000E34000-000000067F00004002000140000000E38000__000000914E3F38F0 000000067F00004002000140000000E34000-000000067F00004002000140000000E38000__000000931B9A2710 000000067F00004002000140000000E38000-000000067F00004002000140000000E3C000__000000914E3F38F0 000000067F00004002000140000000E38000-000000067F00004002000140000000E3C000__000000931B9A2710 000000067F00004002000140000000E39694-000000067F00004002000140000000E42072__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000E3C000-000000067F00004002000140000000E40000__000000914E3F38F0 000000067F00004002000140000000E3C000-000000067F00004002000140000000E40000__000000931B9A2710 000000067F00004002000140000000E40000-000000067F00004002000140000000E44000__000000914E3F38F0 000000067F00004002000140000000E40000-000000067F00004002000140000000E44000__000000931B9A2710 000000067F00004002000140000000E42072-000000067F00004002000140000000E4AA53__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000E44000-000000067F00004002000140000000E48000__000000914E3F38F0 000000067F00004002000140000000E44000-000000067F00004002000140000000E48000__000000931B9A2710 
000000067F00004002000140000000E48000-000000067F00004002000140000000E4C000__000000914E3F38F0 000000067F00004002000140000000E48000-000000067F00004002000140000000E4C000__000000931B9A2710 000000067F00004002000140000000E4AA53-000000067F00004002000140000000E53428__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000E4C000-000000067F00004002000140000000E50000__000000914E3F38F0 000000067F00004002000140000000E4C000-000000067F00004002000140000000E50000__000000931B9A2710 000000067F00004002000140000000E50000-000000067F00004002000140000000E54000__000000914E3F38F0 000000067F00004002000140000000E50000-000000067F00004002000140000000E54000__000000931B9A2710 000000067F00004002000140000000E53428-000000067F00004002000140000000E5BDF8__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000E54000-000000067F00004002000140000000E58000__000000914E3F38F0 000000067F00004002000140000000E54000-000000067F00004002000140000000E58000__000000931B9A2710 000000067F00004002000140000000E58000-000000067F00004002000140000000E5C000__000000914E3F38F0 000000067F00004002000140000000E58000-000000067F00004002000140000000E5C000__000000931B9A2710 000000067F00004002000140000000E5BDF8-000000067F00004002000140000000E647D2__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000E5C000-000000067F00004002000140000000E60000__000000914E3F38F0 000000067F00004002000140000000E5C000-000000067F00004002000140000000E60000__000000931B9A2710 000000067F00004002000140000000E60000-000000067F00004002000140000000E64000__000000914E3F38F0 000000067F00004002000140000000E60000-000000067F00004002000140000000E64000__000000931B9A2710 000000067F00004002000140000000E64000-000000067F00004002000140000000E68000__000000914E3F38F0 000000067F00004002000140000000E64000-000000067F00004002000140000000E68000__000000931B9A2710 000000067F00004002000140000000E647D2-000000067F00004002000140000000E6D1B1__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000E68000-000000067F00004002000140000000E6C000__000000914E3F38F0 
000000067F00004002000140000000E68000-000000067F00004002000140000000E6C000__000000931B9A2710 000000067F00004002000140000000E6C000-000000067F00004002000140000000E70000__000000914E3F38F0 000000067F00004002000140000000E6C000-000000067F00004002000140000000E70000__000000931B9A2710 000000067F00004002000140000000E6D1B1-000000067F00004002000140000000E75B9C__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000E70000-000000067F00004002000140000000E74000__000000914E3F38F0 000000067F00004002000140000000E70000-000000067F00004002000140000000E74000__000000931B9A2710 000000067F00004002000140000000E74000-000000067F00004002000140000000E78000__000000914E3F38F0 000000067F00004002000140000000E74000-000000067F00004002000140000000E78000__000000931B9A2710 000000067F00004002000140000000E75B9C-000000067F00004002000140000000E7E573__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000E78000-000000067F00004002000140000000E7C000__000000914E3F38F0 000000067F00004002000140000000E78000-000000067F00004002000140000000E7C000__000000931B9A2710 000000067F00004002000140000000E7C000-000000067F00004002000140000000E80000__000000900A539398 000000067F00004002000140000000E7C000-000000067F00004002000140000000E80000__000000914E3F38F0 000000067F00004002000140000000E7C000-000000067F00004002000140000000E80000__000000931B9A2710 000000067F00004002000140000000E7E573-000000067F00004002000140000200000000__0000008D2DB5E0C1-0000008E6D15F1F1 000000067F00004002000140000000E7E99B-000000067F00004002000140000000E87389__0000008E6D15F1F1-0000008F0CC5C6B1 000000067F00004002000140000000E80000-000000067F00004002000140000000E84000__000000900A539398 000000067F00004002000140000000E80000-000000067F00004002000140000000E84000__000000914E3F38F0 000000067F00004002000140000000E80000-000000067F00004002000140000000E84000__000000931B9A2710 000000067F00004002000140000000E84000-000000067F00004002000140000000E88000__000000900A539398 000000067F00004002000140000000E84000-000000067F00004002000140000000E88000__000000914E3F38F0 
000000067F00004002000140000000E84000-000000067F00004002000140000000E88000__000000931B9A2710 000000067F00004002000140000000E87389-000000067F00004002000140000000E8FD63__0000008E6D15F1F1-0000008F0CC5C6B1 000000067F00004002000140000000E88000-000000067F00004002000140000000E8C000__000000900A539398 000000067F00004002000140000000E88000-000000067F00004002000140000000E8C000__000000914E3F38F0 000000067F00004002000140000000E88000-000000067F00004002000140000000E8C000__000000931B9A2710 000000067F00004002000140000000E8C000-000000067F00004002000140000000E90000__000000900A539398 000000067F00004002000140000000E8C000-000000067F00004002000140000000E90000__000000914E3F38F0 000000067F00004002000140000000E8C000-000000067F00004002000140000000E90000__000000931B9A2710 000000067F00004002000140000000E8FD63-000000067F00004002000140000000E98735__0000008E6D15F1F1-0000008F0CC5C6B1 000000067F00004002000140000000E90000-000000067F00004002000140000000E94000__000000900A539398 000000067F00004002000140000000E90000-000000067F00004002000140000000E94000__000000914E3F38F0 000000067F00004002000140000000E90000-000000067F00004002000140000000E94000__000000931B9A2710 000000067F00004002000140000000E94000-000000067F00004002000140000000E98000__000000900A539398 000000067F00004002000140000000E94000-000000067F00004002000140000000E98000__000000914E3F38F0 000000067F00004002000140000000E94000-000000067F00004002000140000000E98000__000000931B9A2710 000000067F00004002000140000000E98000-000000067F00004002000140000000E9C000__000000900A539398 000000067F00004002000140000000E98000-000000067F00004002000140000000E9C000__000000914E3F38F0 000000067F00004002000140000000E98000-000000067F00004002000140000000E9C000__000000931B9A2710 000000067F00004002000140000000E98735-000000067F00004002000140000000EA1109__0000008E6D15F1F1-0000008F0CC5C6B1 000000067F00004002000140000000E9C000-000000067F00004002000140000000EA0000__000000900A539398 000000067F00004002000140000000E9C000-000000067F00004002000140000000EA0000__000000914E3F38F0 
000000067F00004002000140000000E9C000-000000067F00004002000140000000EA0000__000000931B9A2710 000000067F00004002000140000000EA0000-000000067F00004002000140000000EA4000__000000900A539398 000000067F00004002000140000000EA0000-000000067F00004002000140000000EA4000__000000914E3F38F0 000000067F00004002000140000000EA0000-000000067F00004002000140000000EA4000__000000931B9A2710 000000067F00004002000140000000EA1109-000000067F00004002000140000000EA9AE3__0000008E6D15F1F1-0000008F0CC5C6B1 000000067F00004002000140000000EA4000-000000067F00004002000140000000EA8000__000000900A539398 000000067F00004002000140000000EA4000-000000067F00004002000140000000EA8000__000000914E3F38F0 000000067F00004002000140000000EA4000-000000067F00004002000140000000EA8000__000000931B9A2710 000000067F00004002000140000000EA8000-000000067F00004002000140000000EAC000__000000914E3F38F0 000000067F00004002000140000000EA8000-000000067F00004002000140000000EAC000__000000931B9A2710 000000067F00004002000140000000EA8000-030000000000000000000000000000000002__0000008EBDA82990 000000067F00004002000140000000EA9AE3-000000067F00004002000140000000EB24C6__0000008E6D15F1F1-0000008F0CC5C6B1 000000067F00004002000140000000EAC000-000000067F00004002000140000000EB0000__000000914E3F38F0 000000067F00004002000140000000EAC000-000000067F00004002000140000000EB0000__000000931B9A2710 000000067F00004002000140000000EB0000-000000067F00004002000140000000EB4000__000000914E3F38F0 000000067F00004002000140000000EB0000-000000067F00004002000140000000EB4000__000000931B9A2710 000000067F00004002000140000000EB24C6-000000067F00004002000140000000EBAEA6__0000008E6D15F1F1-0000008F0CC5C6B1 000000067F00004002000140000000EB4000-000000067F00004002000140000000EB8000__000000914E3F38F0 000000067F00004002000140000000EB4000-000000067F00004002000140000000EB8000__000000931B9A2710 000000067F00004002000140000000EB8000-000000067F00004002000140000000EBC000__000000914E3F38F0 000000067F00004002000140000000EB8000-000000067F00004002000140000000EBC000__000000931B9A2710 
000000067F00004002000140000000EBAEA6-000000067F00004002000140000000EC3890__0000008E6D15F1F1-0000008F0CC5C6B1 000000067F00004002000140000000EBC000-000000067F00004002000140000000EC0000__000000914E3F38F0 000000067F00004002000140000000EBC000-000000067F00004002000140000000EC0000__000000931B9A2710 000000067F00004002000140000000EC0000-000000067F00004002000140000000EC4000__000000914E3F38F0 000000067F00004002000140000000EC0000-000000067F00004002000140000000EC4000__000000931B9A2710 000000067F00004002000140000000EC3890-000000067F00004002000140000000ECC269__0000008E6D15F1F1-0000008F0CC5C6B1 000000067F00004002000140000000EC4000-000000067F00004002000140000000EC8000__000000914E3F38F0 000000067F00004002000140000000EC4000-000000067F00004002000140000000EC8000__000000931B9A2710 000000067F00004002000140000000EC8000-000000067F00004002000140000000ECC000__000000914E3F38F0 000000067F00004002000140000000EC8000-000000067F00004002000140000000ECC000__000000931B9A2710 000000067F00004002000140000000ECC000-000000067F00004002000140000000ED0000__000000914E3F38F0 000000067F00004002000140000000ECC000-000000067F00004002000140000000ED0000__000000931B9A2710 000000067F00004002000140000000ECC269-000000067F00004002000140000000ED4C46__0000008E6D15F1F1-0000008F0CC5C6B1 000000067F00004002000140000000ED0000-000000067F00004002000140000000ED4000__000000914E3F38F0 000000067F00004002000140000000ED0000-000000067F00004002000140000000ED4000__000000931B9A2710 000000067F00004002000140000000ED4000-000000067F00004002000140000000ED8000__000000900A539398 000000067F00004002000140000000ED4000-000000067F00004002000140000000ED8000__000000914E3F38F0 000000067F00004002000140000000ED4000-000000067F00004002000140000000ED8000__000000931B9A2710 000000067F00004002000140000000ED4C46-000000067F00004002000140000200000000__0000008E6D15F1F1-0000008F0CC5C6B1 000000067F00004002000140000000ED4EBC-000000067F00004002000140000000EDD899__0000008F0CC5C6B1-0000008FAC75E259 
000000067F00004002000140000000ED8000-000000067F00004002000140000000EDC000__000000900A539398 000000067F00004002000140000000ED8000-000000067F00004002000140000000EDC000__000000914E3F38F0 000000067F00004002000140000000ED8000-000000067F00004002000140000000EDC000__000000931B9A2710 000000067F00004002000140000000EDC000-000000067F00004002000140000000EE0000__000000900A539398 000000067F00004002000140000000EDC000-000000067F00004002000140000000EE0000__000000914E3F38F0 000000067F00004002000140000000EDC000-000000067F00004002000140000000EE0000__000000931B9A2710 000000067F00004002000140000000EDD899-000000067F00004002000140000000EE6278__0000008F0CC5C6B1-0000008FAC75E259 000000067F00004002000140000000EE0000-000000067F00004002000140000000EE4000__000000900A539398 000000067F00004002000140000000EE0000-000000067F00004002000140000000EE4000__000000914E3F38F0 000000067F00004002000140000000EE0000-000000067F00004002000140000000EE4000__000000931B9A2710 000000067F00004002000140000000EE4000-000000067F00004002000140000000EE8000__000000900A539398 000000067F00004002000140000000EE4000-000000067F00004002000140000000EE8000__000000914E3F38F0 000000067F00004002000140000000EE4000-000000067F00004002000140000000EE8000__000000931B9A2710 000000067F00004002000140000000EE6278-000000067F00004002000140000000EEEC50__0000008F0CC5C6B1-0000008FAC75E259 000000067F00004002000140000000EE8000-000000067F00004002000140000000EEC000__000000900A539398 000000067F00004002000140000000EE8000-000000067F00004002000140000000EEC000__000000914E3F38F0 000000067F00004002000140000000EE8000-000000067F00004002000140000000EEC000__000000931B9A2710 000000067F00004002000140000000EEC000-000000067F00004002000140000000EF0000__000000900A539398 000000067F00004002000140000000EEC000-000000067F00004002000140000000EF0000__000000914E3F38F0 000000067F00004002000140000000EEC000-000000067F00004002000140000000EF0000__000000931B9A2710 000000067F00004002000140000000EEEC50-000000067F00004002000140000000EF7623__0000008F0CC5C6B1-0000008FAC75E259 
000000067F00004002000140000000EF0000-000000067F00004002000140000000EF4000__000000900A539398 000000067F00004002000140000000EF0000-000000067F00004002000140000000EF4000__000000914E3F38F0 000000067F00004002000140000000EF0000-000000067F00004002000140000000EF4000__000000931B9A2710 000000067F00004002000140000000EF4000-000000067F00004002000140000000EF8000__000000900A539398 000000067F00004002000140000000EF4000-000000067F00004002000140000000EF8000__000000914E3F38F0 000000067F00004002000140000000EF4000-000000067F00004002000140000000EF8000__000000931B9A2710 000000067F00004002000140000000EF7623-000000067F00004002000140000000EFFFFA__0000008F0CC5C6B1-0000008FAC75E259 000000067F00004002000140000000EF8000-000000067F00004002000140000000EFC000__000000900A539398 000000067F00004002000140000000EF8000-000000067F00004002000140000000EFC000__000000914E3F38F0 000000067F00004002000140000000EF8000-000000067F00004002000140000000EFC000__000000931B9A2710 000000067F00004002000140000000EFC000-000000067F00004002000140000000F00000__000000900A539398 000000067F00004002000140000000EFC000-000000067F00004002000140000000F00000__000000914E3F38F0 000000067F00004002000140000000EFC000-000000067F00004002000140000000F00000__000000931B9A2710 000000067F00004002000140000000EFFFFA-000000067F00004002000140000000F089E5__0000008F0CC5C6B1-0000008FAC75E259 000000067F00004002000140000000F00000-000000067F00004002000140000000F04000__000000900A539398 000000067F00004002000140000000F00000-000000067F00004002000140000000F04000__000000914E3F38F0 000000067F00004002000140000000F00000-000000067F00004002000140000000F04000__000000931B9A2710 000000067F00004002000140000000F04000-000000067F00004002000140000000F08000__000000900A539398 000000067F00004002000140000000F04000-000000067F00004002000140000000F08000__000000914E3F38F0 000000067F00004002000140000000F04000-000000067F00004002000140000000F08000__000000931B9A2710 000000067F00004002000140000000F08000-000000067F00004002000140000000F0C000__000000900A539398 
000000067F00004002000140000000F08000-000000067F00004002000140000000F0C000__000000914E3F38F0 000000067F00004002000140000000F08000-000000067F00004002000140000000F0C000__000000931B9A2710 000000067F00004002000140000000F089E5-000000067F00004002000140000000F113CD__0000008F0CC5C6B1-0000008FAC75E259 000000067F00004002000140000000F0C000-000000067F00004002000140000000F10000__000000900A539398 000000067F00004002000140000000F0C000-000000067F00004002000140000000F10000__000000914E3F38F0 000000067F00004002000140000000F0C000-000000067F00004002000140000000F10000__000000931B9A2710 000000067F00004002000140000000F10000-000000067F00004002000140000000F14000__000000900A539398 000000067F00004002000140000000F10000-000000067F00004002000140000000F14000__000000914E3F38F0 000000067F00004002000140000000F10000-000000067F00004002000140000000F14000__000000931B9A2710 000000067F00004002000140000000F113CD-000000067F00004002000140000000F19DA8__0000008F0CC5C6B1-0000008FAC75E259 000000067F00004002000140000000F14000-000000067F00004002000140000000F18000__000000900A539398 000000067F00004002000140000000F14000-000000067F00004002000140000000F18000__000000914E3F38F0 000000067F00004002000140000000F14000-000000067F00004002000140000000F18000__000000931B9A2710 000000067F00004002000140000000F18000-000000067F00004002000140000000F1C000__000000900A539398 000000067F00004002000140000000F18000-000000067F00004002000140000000F1C000__000000914E3F38F0 000000067F00004002000140000000F18000-000000067F00004002000140000000F1C000__000000931B9A2710 000000067F00004002000140000000F19DA8-000000067F00004002000140000000F22786__0000008F0CC5C6B1-0000008FAC75E259 000000067F00004002000140000000F1C000-000000067F00004002000140000000F20000__000000900A539398 000000067F00004002000140000000F1C000-000000067F00004002000140000000F20000__000000914E3F38F0 000000067F00004002000140000000F1C000-000000067F00004002000140000000F20000__000000931B9A2710 000000067F00004002000140000000F20000-000000067F00004002000140000000F24000__000000900A539398 
000000067F00004002000140000000F20000-000000067F00004002000140000000F24000__000000914E3F38F0 000000067F00004002000140000000F20000-000000067F00004002000140000000F24000__000000931B9A2710 000000067F00004002000140000000F22786-000000067F00004002000140000000F2B162__0000008F0CC5C6B1-0000008FAC75E259 000000067F00004002000140000000F24000-000000067F00004002000140000000F28000__000000900A539398 000000067F00004002000140000000F24000-000000067F00004002000140000000F28000__000000914E3F38F0 000000067F00004002000140000000F24000-000000067F00004002000140000000F28000__000000931B9A2710 000000067F00004002000140000000F28000-000000067F00004002000140000000F2C000__000000900A539398 000000067F00004002000140000000F28000-000000067F00004002000140000000F2C000__000000914E3F38F0 000000067F00004002000140000000F28000-000000067F00004002000140000000F2C000__000000931B9A2710 000000067F00004002000140000000F2B162-000000067F00004002000140000200000000__0000008F0CC5C6B1-0000008FAC75E259 000000067F00004002000140000000F2C000-000000067F00004002000140000000F30000__000000900A539398 000000067F00004002000140000000F2C000-000000067F00004002000140000000F30000__000000914E3F38F0 000000067F00004002000140000000F2C000-000000067F00004002000140000000F30000__000000931B9A2710 000000067F00004002000140000000F30000-000000067F00004002000140000000F34000__000000900A539398 000000067F00004002000140000000F30000-000000067F00004002000140000000F34000__000000914E3F38F0 000000067F00004002000140000000F30000-000000067F00004002000140000000F34000__000000931B9A2710 000000067F00004002000140000000F32D01-000000067F00004002000140000000F3B6CF__0000008FAC75E259-000000900BB52179 000000067F00004002000140000000F34000-000000067F00004002000140000000F38000__000000900A539398 000000067F00004002000140000000F34000-000000067F00004002000140000000F38000__000000914E3F38F0 000000067F00004002000140000000F34000-000000067F00004002000140000000F38000__000000931B9A2710 000000067F00004002000140000000F38000-000000067F00004002000140000000F3C000__000000900A539398 
000000067F00004002000140000000F38000-000000067F00004002000140000000F3C000__000000914E3F38F0 000000067F00004002000140000000F38000-000000067F00004002000140000000F3C000__000000931B9A2710 000000067F00004002000140000000F3B6CF-000000067F00004002000140000000F440B8__0000008FAC75E259-000000900BB52179 000000067F00004002000140000000F3C000-000000067F00004002000140000000F40000__000000900A539398 000000067F00004002000140000000F3C000-000000067F00004002000140000000F40000__000000914E3F38F0 000000067F00004002000140000000F3C000-000000067F00004002000140000000F40000__000000931B9A2710 000000067F00004002000140000000F40000-000000067F00004002000140000000F44000__000000900A539398 000000067F00004002000140000000F40000-000000067F00004002000140000000F44000__000000914E3F38F0 000000067F00004002000140000000F40000-000000067F00004002000140000000F44000__000000931B9A2710 000000067F00004002000140000000F44000-000000067F00004002000140000000F48000__000000900A539398 000000067F00004002000140000000F44000-000000067F00004002000140000000F48000__000000914E3F38F0 000000067F00004002000140000000F44000-000000067F00004002000140000000F48000__000000931B9A2710 000000067F00004002000140000000F440B8-000000067F00004002000140000000F4CA9B__0000008FAC75E259-000000900BB52179 000000067F00004002000140000000F48000-000000067F00004002000140000000F4C000__000000900A539398 000000067F00004002000140000000F48000-000000067F00004002000140000000F4C000__000000914E3F38F0 000000067F00004002000140000000F48000-000000067F00004002000140000000F4C000__000000931B9A2710 000000067F00004002000140000000F4C000-000000067F00004002000140000000F50000__000000900A539398 000000067F00004002000140000000F4C000-000000067F00004002000140000000F50000__000000914E3F38F0 000000067F00004002000140000000F4C000-000000067F00004002000140000000F50000__000000931B9A2710 000000067F00004002000140000000F4CA9B-000000067F00004002000140000000F55479__0000008FAC75E259-000000900BB52179 000000067F00004002000140000000F50000-000000067F00004002000140000000F54000__000000900A539398 
000000067F00004002000140000000F50000-000000067F00004002000140000000F54000__000000914E3F38F0 000000067F00004002000140000000F50000-000000067F00004002000140000000F54000__000000931B9A2710 000000067F00004002000140000000F54000-000000067F00004002000140000000F58000__000000900A539398 000000067F00004002000140000000F54000-000000067F00004002000140000000F58000__000000914E3F38F0 000000067F00004002000140000000F54000-000000067F00004002000140000000F58000__000000931B9A2710 000000067F00004002000140000000F55479-000000067F00004002000140000000F5DE56__0000008FAC75E259-000000900BB52179 000000067F00004002000140000000F58000-000000067F00004002000140000000F5C000__000000900A539398 000000067F00004002000140000000F58000-000000067F00004002000140000000F5C000__000000914E3F38F0 000000067F00004002000140000000F58000-000000067F00004002000140000000F5C000__000000931B9A2710 000000067F00004002000140000000F5C000-000000067F00004002000140040100000000__000000914E3F38F0 000000067F00004002000140000000F5C000-000000067F00004002000140040100000000__000000931B9A2710 000000067F00004002000140000000F5C000-030000000000000000000000000000000002__000000900A539398 000000067F00004002000140000000F5DE56-030000000000000000000000000000000002__0000008FAC75E259-000000900BB52179 000000067F000040020001400000FFFFFFFF-000000067F00004002000140000100000000__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000140000100000000-000000067F00004002000140000100000DEC__000000739A920D71-0000008D2DB5E0C1 000000067F000040020001400001FFFFFFFF-000000067F00004002000140000200000000__000000739A962E10-0000007F75893CE8 000000067F000040020001400001FFFFFFFF-000000067F00004002000140000200000000__0000007F75893CE8-0000008B40129080 000000067F000040020001400001FFFFFFFF-000000067F00004002000140000200000000__0000008B40129080-0000008D2DB5E0C1 000000067F000040020001400300FFFFFFFF-030000000000000000000000000000000002__000000739A920D71-0000008D2DB5E0C1 000000067F00004002000160000000000000-000000067F00004002000160000000004000__000000914E3F38F0 
000000067F00004002000160000000000000-000000067F00004002000160000000004000__000000931B9A2710 000000067F00004002000160000000004000-000000067F00004002000160000000008000__000000914E3F38F0 000000067F00004002000160000000004000-000000067F00004002000160000000008000__000000931B9A2710 000000067F00004002000160000000007F7A-000000067F0000400200016000000000FEFD__000000900BB52179-0000009046EDA719 000000067F00004002000160000000008000-000000067F0000400200016000000000C000__000000914E3F38F0 000000067F00004002000160000000008000-000000067F0000400200016000000000C000__000000931B9A2710 000000067F0000400200016000000000C000-000000067F00004002000160000000010000__000000914E3F38F0 000000067F0000400200016000000000C000-000000067F00004002000160000000010000__000000931B9A2710 000000067F0000400200016000000000FEFD-000000067F00004002000160000000017E80__000000900BB52179-0000009046EDA719 000000067F00004002000160000000010000-000000067F00004002000160000000014000__000000914E3F38F0 000000067F00004002000160000000010000-000000067F00004002000160000000014000__000000931B9A2710 000000067F00004002000160000000014000-000000067F00004002000160000000018000__000000914E3F38F0 000000067F00004002000160000000014000-000000067F00004002000160000000018000__000000931B9A2710 000000067F00004002000160000000017E80-000000067F0000400200016000000001FE03__000000900BB52179-0000009046EDA719 000000067F00004002000160000000018000-000000067F0000400200016000000001C000__000000914E3F38F0 000000067F00004002000160000000018000-000000067F0000400200016000000001C000__000000931B9A2710 000000067F0000400200016000000001C000-000000067F00004002000160000000020000__000000914E3F38F0 000000067F0000400200016000000001C000-000000067F00004002000160000000020000__000000931B9A2710 000000067F0000400200016000000001FE03-000000067F00004002000160000000027D86__000000900BB52179-0000009046EDA719 000000067F00004002000160000000020000-000000067F00004002000160000000024000__000000914E3F38F0 000000067F00004002000160000000020000-000000067F00004002000160000000024000__000000931B9A2710 
000000067F00004002000160000000024000-000000067F00004002000160000000028000__000000914E3F38F0 000000067F00004002000160000000024000-000000067F00004002000160000000028000__000000931B9A2710 000000067F00004002000160000000027D86-000000067F0000400200016000000002FD09__000000900BB52179-0000009046EDA719 000000067F00004002000160000000028000-000000067F0000400200016000000002C000__000000914E3F38F0 000000067F00004002000160000000028000-000000067F0000400200016000000002C000__000000931B9A2710 000000067F0000400200016000000002C000-000000067F00004002000160000000030000__000000914E3F38F0 000000067F0000400200016000000002C000-000000067F00004002000160000000030000__000000931B9A2710 000000067F0000400200016000000002FD09-030000000000000000000000000000000002__000000900BB52179-0000009046EDA719 000000067F00004002000160000000030000-000000067F00004002000160000000034000__000000914E3F38F0 000000067F00004002000160000000030000-000000067F00004002000160000000034000__000000931B9A2710 000000067F00004002000160000000034000-000000067F00004002000160000000038000__000000914E3F38F0 000000067F00004002000160000000034000-000000067F00004002000160000000038000__000000931B9A2710 000000067F00004002000160000000037E1D-000000067F0000400200016000000003FDA0__0000009046EDA719-000000914E3FE031 000000067F00004002000160000000038000-000000067F0000400200016000000003C000__000000914E3F38F0 000000067F00004002000160000000038000-000000067F0000400200016000000003C000__000000931B9A2710 000000067F0000400200016000000003C000-000000067F00004002000160000000040000__000000914E3F38F0 000000067F0000400200016000000003C000-000000067F00004002000160000000040000__000000931B9A2710 000000067F0000400200016000000003FDA0-000000067F00004002000160000000047D23__0000009046EDA719-000000914E3FE031 000000067F00004002000160000000040000-000000067F00004002000160000000044000__000000914E3F38F0 000000067F00004002000160000000040000-000000067F00004002000160000000044000__000000931B9A2710 000000067F00004002000160000000044000-000000067F00004002000160000000048000__000000914E3F38F0 
000000067F00004002000160000000044000-000000067F00004002000160000000048000__000000931B9A2710 000000067F00004002000160000000047D23-000000067F0000400200016000000004FCA6__0000009046EDA719-000000914E3FE031 000000067F00004002000160000000048000-000000067F0000400200016000000004C000__000000914E3F38F0 000000067F00004002000160000000048000-000000067F0000400200016000000004C000__000000931B9A2710 000000067F0000400200016000000004C000-000000067F00004002000160000000050000__000000914E3F38F0 000000067F0000400200016000000004C000-000000067F00004002000160000000050000__000000931B9A2710 000000067F0000400200016000000004FCA6-000000067F00004002000160000000057C29__0000009046EDA719-000000914E3FE031 000000067F00004002000160000000050000-000000067F00004002000160000000054000__000000914E3F38F0 000000067F00004002000160000000050000-000000067F00004002000160000000054000__000000931B9A2710 000000067F00004002000160000000054000-000000067F00004002000160000000058000__000000914E3F38F0 000000067F00004002000160000000054000-000000067F00004002000160000000058000__000000931B9A2710 000000067F00004002000160000000057C29-000000067F0000400200016000000005FBAC__0000009046EDA719-000000914E3FE031 000000067F00004002000160000000058000-000000067F0000400200016000000005C000__000000914E3F38F0 000000067F00004002000160000000058000-000000067F0000400200016000000005C000__000000931B9A2710 000000067F0000400200016000000005C000-000000067F00004002000160000000060000__000000914E3F38F0 000000067F0000400200016000000005C000-000000067F00004002000160000000060000__000000931B9A2710 000000067F0000400200016000000005FBAC-000000067F00004002000160000000067B2F__0000009046EDA719-000000914E3FE031 000000067F00004002000160000000060000-000000067F00004002000160000000064000__000000914E3F38F0 000000067F00004002000160000000060000-000000067F00004002000160000000064000__000000931B9A2710 000000067F00004002000160000000064000-000000067F00004002000160000000068000__000000914E3F38F0 000000067F00004002000160000000064000-000000067F00004002000160000000068000__000000931B9A2710 
000000067F00004002000160000000067B2F-000000067F0000400200016000000006FAB2__0000009046EDA719-000000914E3FE031 000000067F00004002000160000000068000-000000067F0000400200016000000006C000__000000914E3F38F0 000000067F00004002000160000000068000-000000067F0000400200016000000006C000__000000931B9A2710 000000067F0000400200016000000006C000-000000067F00004002000160000000070000__000000914E3F38F0 000000067F0000400200016000000006C000-000000067F00004002000160000000070000__000000931B9A2710 000000067F0000400200016000000006FAB2-000000067F00004002000160000000077A35__0000009046EDA719-000000914E3FE031 000000067F00004002000160000000070000-000000067F00004002000160000000074000__000000914E3F38F0 000000067F00004002000160000000070000-000000067F00004002000160000000074000__000000931B9A2710 000000067F00004002000160000000074000-000000067F00004002000160000000078000__000000914E3F38F0 000000067F00004002000160000000074000-000000067F00004002000160000000078000__000000931B9A2710 000000067F00004002000160000000077A35-000000067F0000400200016000000007F9B8__0000009046EDA719-000000914E3FE031 000000067F00004002000160000000078000-000000067F0000400200016000000007C000__000000914E3F38F0 000000067F00004002000160000000078000-000000067F0000400200016000000007C000__000000931B9A2710 000000067F0000400200016000000007C000-000000067F00004002000160000000080000__000000914E3F38F0 000000067F0000400200016000000007C000-000000067F00004002000160000000080000__000000931B9A2710 000000067F0000400200016000000007F9B8-000000067F0000400200016000000008793B__0000009046EDA719-000000914E3FE031 000000067F00004002000160000000080000-000000067F00004002000160000000084000__000000914E3F38F0 000000067F00004002000160000000080000-000000067F00004002000160000000084000__000000931B9A2710 000000067F00004002000160000000084000-000000067F00004002000160000000088000__000000914E3F38F0 000000067F00004002000160000000084000-000000067F00004002000160000000088000__000000931B9A2710 
000000067F0000400200016000000008793B-000000067F0000400200016000000008F8BE__0000009046EDA719-000000914E3FE031 000000067F00004002000160000000088000-000000067F0000400200016000000008C000__000000914E3F38F0 000000067F00004002000160000000088000-000000067F0000400200016000000008C000__000000931B9A2710 000000067F0000400200016000000008C000-000000067F00004002000160000000090000__000000914E3F38F0 000000067F0000400200016000000008C000-000000067F00004002000160000000090000__000000931B9A2710 000000067F0000400200016000000008F8BE-000000067F00004002000160000000097841__0000009046EDA719-000000914E3FE031 000000067F00004002000160000000090000-000000067F00004002000160000000094000__000000914E3F38F0 000000067F00004002000160000000090000-000000067F00004002000160000000094000__000000931B9A2710 000000067F00004002000160000000094000-000000067F00004002000160000000098000__000000914E3F38F0 000000067F00004002000160000000094000-000000067F00004002000160000000098000__000000931B9A2710 000000067F00004002000160000000097841-000000067F0000400200016000000009F7C4__0000009046EDA719-000000914E3FE031 000000067F00004002000160000000098000-000000067F0000400200016000000009C000__000000914E3F38F0 000000067F00004002000160000000098000-000000067F0000400200016000000009C000__000000931B9A2710 000000067F0000400200016000000009C000-000000067F000040020001600000000A0000__000000914E3F38F0 000000067F0000400200016000000009C000-000000067F000040020001600000000A0000__000000931B9A2710 000000067F0000400200016000000009F7C4-000000067F000040020001600000000A7747__0000009046EDA719-000000914E3FE031 000000067F000040020001600000000A0000-000000067F000040020001600000000A4000__000000914E3F38F0 000000067F000040020001600000000A0000-000000067F000040020001600000000A4000__000000931B9A2710 000000067F000040020001600000000A4000-000000067F000040020001600000000A8000__000000914E3F38F0 000000067F000040020001600000000A4000-000000067F000040020001600000000A8000__000000931B9A2710 
000000067F000040020001600000000A7747-000000067F000040020001600000000AF6CA__0000009046EDA719-000000914E3FE031 000000067F000040020001600000000A8000-000000067F000040020001600000000AC000__000000914E3F38F0 000000067F000040020001600000000A8000-000000067F000040020001600000000AC000__000000931B9A2710 000000067F000040020001600000000AC000-000000067F000040020001600000000B0000__000000914E3F38F0 000000067F000040020001600000000AC000-000000067F000040020001600000000B0000__000000931B9A2710 000000067F000040020001600000000AF6CA-000000067F000040020001600000000B764D__0000009046EDA719-000000914E3FE031 000000067F000040020001600000000B0000-000000067F000040020001600000000B4000__000000914E3F38F0 000000067F000040020001600000000B0000-000000067F000040020001600000000B4000__000000931B9A2710 000000067F000040020001600000000B4000-000000067F000040020001600000000B8000__000000914E3F38F0 000000067F000040020001600000000B4000-000000067F000040020001600000000B8000__000000931B9A2710 000000067F000040020001600000000B764D-000000067F000040020001600000000BF5D0__0000009046EDA719-000000914E3FE031 000000067F000040020001600000000B8000-000000067F000040020001600000000BC000__000000914E3F38F0 000000067F000040020001600000000B8000-000000067F000040020001600000000BC000__000000931B9A2710 000000067F000040020001600000000BC000-000000067F000040020001600000000C0000__000000914E3F38F0 000000067F000040020001600000000BC000-000000067F000040020001600000000C0000__000000931B9A2710 000000067F000040020001600000000BF5D0-000000067F000040020001600000000C7553__0000009046EDA719-000000914E3FE031 000000067F000040020001600000000C0000-000000067F000040020001600000000C4000__000000914E3F38F0 000000067F000040020001600000000C0000-000000067F000040020001600000000C4000__000000931B9A2710 000000067F000040020001600000000C4000-000000067F000040020001600000000C8000__000000914E3F38F0 000000067F000040020001600000000C4000-000000067F000040020001600000000C8000__000000931B9A2710 
000000067F000040020001600000000C7553-000000067F000040020001600000000CF4D6__0000009046EDA719-000000914E3FE031 000000067F000040020001600000000C8000-000000067F000040020001600000000CC000__000000914E3F38F0 000000067F000040020001600000000C8000-000000067F000040020001600000000CC000__000000931B9A2710 000000067F000040020001600000000CC000-000000067F000040020001600000000D0000__000000914E3F38F0 000000067F000040020001600000000CC000-000000067F000040020001600000000D0000__000000931B9A2710 000000067F000040020001600000000CF4D6-000000067F000040020001600000000D7459__0000009046EDA719-000000914E3FE031 000000067F000040020001600000000D0000-000000067F000040020001600000000D4000__000000914E3F38F0 000000067F000040020001600000000D0000-000000067F000040020001600000000D4000__000000931B9A2710 000000067F000040020001600000000D4000-000000067F000040020001600000000D8000__000000914E3F38F0 000000067F000040020001600000000D4000-000000067F000040020001600000000D8000__000000931B9A2710 000000067F000040020001600000000D7459-000000067F000040020001600000000DF3DC__0000009046EDA719-000000914E3FE031 000000067F000040020001600000000D8000-000000067F000040020001600000000DC000__000000914E3F38F0 000000067F000040020001600000000D8000-000000067F000040020001600000000DC000__000000931B9A2710 000000067F000040020001600000000DC000-000000067F000040020001600000000E0000__000000914E3F38F0 000000067F000040020001600000000DC000-000000067F000040020001600000000E0000__000000931B9A2710 000000067F000040020001600000000DF3DC-000000067F000040020001600000000E735F__0000009046EDA719-000000914E3FE031 000000067F000040020001600000000E0000-000000067F000040020001600000000E4000__000000914E3F38F0 000000067F000040020001600000000E0000-000000067F000040020001600000000E4000__000000931B9A2710 000000067F000040020001600000000E4000-000000067F000040020001600000000E8000__000000914E3F38F0 000000067F000040020001600000000E4000-000000067F000040020001600000000E8000__000000931B9A2710 
000000067F000040020001600000000E735F-000000067F000040020001600000000EF2E2__0000009046EDA719-000000914E3FE031 000000067F000040020001600000000E8000-000000067F000040020001600000000EC000__000000914E3F38F0 000000067F000040020001600000000E8000-000000067F000040020001600000000EC000__000000931B9A2710 000000067F000040020001600000000EC000-000000067F000040020001600000000F0000__000000914E3F38F0 000000067F000040020001600000000EC000-000000067F000040020001600000000F0000__000000931B9A2710 000000067F000040020001600000000EF2E2-000000067F000040020001600000000F7265__0000009046EDA719-000000914E3FE031 000000067F000040020001600000000F0000-000000067F000040020001600000000F4000__000000914E3F38F0 000000067F000040020001600000000F0000-000000067F000040020001600000000F4000__000000931B9A2710 000000067F000040020001600000000F4000-000000067F000040020001600000000F8000__000000914E3F38F0 000000067F000040020001600000000F4000-000000067F000040020001600000000F8000__000000931B9A2710 000000067F000040020001600000000F7265-000000067F000040020001600000000FF1E8__0000009046EDA719-000000914E3FE031 000000067F000040020001600000000F8000-000000067F000040020001600000000FC000__000000914E3F38F0 000000067F000040020001600000000F8000-000000067F000040020001600000000FC000__000000931B9A2710 000000067F000040020001600000000FC000-000000067F00004002000160000000100000__000000914E3F38F0 000000067F000040020001600000000FC000-000000067F00004002000160000000100000__000000931B9A2710 000000067F000040020001600000000FF1E8-000000067F0000400200016000000010716B__0000009046EDA719-000000914E3FE031 000000067F00004002000160000000100000-000000067F00004002000160000000104000__000000914E3F38F0 000000067F00004002000160000000100000-000000067F00004002000160000000104000__000000931B9A2710 000000067F00004002000160000000104000-000000067F00004002000160000000108000__000000914E3F38F0 000000067F00004002000160000000104000-000000067F00004002000160000000108000__000000931B9A2710 
000000067F0000400200016000000010716B-030000000000000000000000000000000002__0000009046EDA719-000000914E3FE031 000000067F00004002000160000000108000-000000067F0000400200016000000010C000__000000914E3F38F0 000000067F00004002000160000000108000-000000067F0000400200016000000010C000__000000931B9A2710 000000067F0000400200016000000010C000-000000067F00004002000160000100000000__000000931B9A2710 000000067F0000400200016000000010C000-030000000000000000000000000000000002__000000914E3F38F0 000000067F00004002000180000000000000-000000067F00004002000180000000004000__000000931B9A2710 000000067F00004002000180000000004000-000000067F00004002000180000000008000__000000931B9A2710 000000067F00004002000180000000007F7A-000000067F0000400200018000000000FEFD__000000914E3FE031-000000919CCE8B21 000000067F00004002000180000000008000-000000067F0000400200018000000000C000__000000931B9A2710 000000067F0000400200018000000000C000-000000067F00004002000180000000010000__000000931B9A2710 000000067F0000400200018000000000FEFD-000000067F00004002000180000000017E80__000000914E3FE031-000000919CCE8B21 000000067F00004002000180000000010000-000000067F00004002000180000000014000__000000931B9A2710 000000067F00004002000180000000014000-000000067F00004002000180000000018000__000000931B9A2710 000000067F00004002000180000000017E80-000000067F0000400200018000000001FE03__000000914E3FE031-000000919CCE8B21 000000067F00004002000180000000018000-000000067F0000400200018000000001C000__000000931B9A2710 000000067F0000400200018000000001C000-000000067F00004002000180000000020000__000000931B9A2710 000000067F0000400200018000000001FE03-000000067F00004002000180000000027D86__000000914E3FE031-000000919CCE8B21 000000067F00004002000180000000020000-000000067F00004002000180000000024000__000000931B9A2710 000000067F00004002000180000000024000-000000067F00004002000180000000028000__000000931B9A2710 000000067F00004002000180000000027D86-000000067F0000400200018000000002FD09__000000914E3FE031-000000919CCE8B21 
000000067F00004002000180000000028000-000000067F0000400200018000000002C000__000000931B9A2710 000000067F0000400200018000000002C000-000000067F00004002000180000000030000__000000931B9A2710 000000067F0000400200018000000002FD09-000000067F00004002000180000000037C8C__000000914E3FE031-000000919CCE8B21 000000067F00004002000180000000030000-000000067F00004002000180000000034000__000000931B9A2710 000000067F00004002000180000000034000-000000067F00004002000180000000038000__000000931B9A2710 000000067F00004002000180000000037C8C-000000067F0000400200018000000003FC0F__000000914E3FE031-000000919CCE8B21 000000067F00004002000180000000038000-000000067F0000400200018000000003C000__000000931B9A2710 000000067F0000400200018000000003C000-000000067F00004002000180000000040000__000000926240EF70 000000067F0000400200018000000003C000-000000067F00004002000180000000040000__000000931B9AFDF8 000000067F0000400200018000000003FC0F-030000000000000000000000000000000002__000000914E3FE031-000000919CCE8B21 000000067F0000400200018000000003FE20-000000067F00004002000180000000047DA3__000000919CCE8B21-000000921B6384B9 000000067F00004002000180000000040000-000000067F00004002000180000000044000__000000926240EF70 000000067F00004002000180000000040000-000000067F00004002000180000000044000__000000931B9AFDF8 000000067F00004002000180000000044000-000000067F00004002000180000000048000__000000926240EF70 000000067F00004002000180000000044000-000000067F00004002000180000000048000__000000931B9AFDF8 000000067F00004002000180000000047DA3-000000067F0000400200018000000004FD26__000000919CCE8B21-000000921B6384B9 000000067F00004002000180000000048000-000000067F0000400200018000000004C000__000000926240EF70 000000067F00004002000180000000048000-000000067F0000400200018000000004C000__000000931B9AFDF8 000000067F0000400200018000000004C000-000000067F00004002000180000000050000__000000926240EF70 000000067F0000400200018000000004C000-000000067F00004002000180000000050000__000000931B9AFDF8 
000000067F0000400200018000000004FD26-000000067F00004002000180000000057CA9__000000919CCE8B21-000000921B6384B9 000000067F00004002000180000000050000-000000067F00004002000180000000054000__000000926240EF70 000000067F00004002000180000000050000-000000067F00004002000180000000054000__000000931B9AFDF8 000000067F00004002000180000000054000-000000067F00004002000180000000058000__000000926240EF70 000000067F00004002000180000000054000-000000067F00004002000180000000058000__000000931B9AFDF8 000000067F00004002000180000000057CA9-000000067F0000400200018000000005FC2C__000000919CCE8B21-000000921B6384B9 000000067F00004002000180000000058000-000000067F0000400200018000000005C000__000000926240EF70 000000067F00004002000180000000058000-000000067F0000400200018000000005C000__000000931B9AFDF8 000000067F0000400200018000000005C000-000000067F00004002000180000000060000__000000926240EF70 000000067F0000400200018000000005C000-000000067F00004002000180000000060000__000000931B9AFDF8 000000067F0000400200018000000005FC2C-000000067F00004002000180000000067BAF__000000919CCE8B21-000000921B6384B9 000000067F00004002000180000000060000-000000067F00004002000180000000064000__000000926240EF70 000000067F00004002000180000000060000-000000067F00004002000180000000064000__000000931B9AFDF8 000000067F00004002000180000000064000-000000067F00004002000180000000068000__000000926240EF70 000000067F00004002000180000000064000-000000067F00004002000180000000068000__000000931B9AFDF8 000000067F00004002000180000000067BAF-000000067F0000400200018000000006FB32__000000919CCE8B21-000000921B6384B9 000000067F00004002000180000000068000-000000067F0000400200018000000006C000__000000926240EF70 000000067F00004002000180000000068000-000000067F0000400200018000000006C000__000000931B9AFDF8 000000067F0000400200018000000006C000-000000067F00004002000180000000070000__000000926240EF70 000000067F0000400200018000000006C000-000000067F00004002000180000000070000__000000931B9AFDF8 
000000067F0000400200018000000006FB32-000000067F00004002000180000000077AB5__000000919CCE8B21-000000921B6384B9 000000067F00004002000180000000070000-000000067F00004002000180000000074000__000000926240EF70 000000067F00004002000180000000070000-000000067F00004002000180000000074000__000000931B9AFDF8 000000067F00004002000180000000074000-000000067F00004002000180000000078000__000000926240EF70 000000067F00004002000180000000074000-000000067F00004002000180000000078000__000000931B9AFDF8 000000067F00004002000180000000077AB5-000000067F0000400200018000000007FA38__000000919CCE8B21-000000921B6384B9 000000067F00004002000180000000078000-000000067F0000400200018000000007C000__000000926240EF70 000000067F00004002000180000000078000-000000067F0000400200018000000007C000__000000931B9AFDF8 000000067F0000400200018000000007C000-000000067F00004002000180000000080000__000000926240EF70 000000067F0000400200018000000007C000-000000067F00004002000180000000080000__000000931B9AFDF8 000000067F0000400200018000000007FA38-000000067F000040020001800000000879BB__000000919CCE8B21-000000921B6384B9 000000067F00004002000180000000080000-000000067F00004002000180000000084000__000000926240EF70 000000067F00004002000180000000080000-000000067F00004002000180000000084000__000000931B9AFDF8 000000067F00004002000180000000084000-000000067F00004002000180000000088000__000000926240EF70 000000067F00004002000180000000084000-000000067F00004002000180000000088000__000000931B9AFDF8 000000067F000040020001800000000879BB-000000067F0000400200018000000008F93E__000000919CCE8B21-000000921B6384B9 000000067F00004002000180000000088000-000000067F0000400200018000000008C000__000000926240EF70 000000067F00004002000180000000088000-000000067F0000400200018000000008C000__000000931B9AFDF8 000000067F0000400200018000000008C000-000000067F00004002000180000000090000__000000926240EF70 000000067F0000400200018000000008C000-000000067F00004002000180000000090000__000000931B9AFDF8 
000000067F0000400200018000000008F93E-000000067F000040020001800000000978C1__000000919CCE8B21-000000921B6384B9 000000067F00004002000180000000090000-000000067F00004002000180000000094000__000000926240EF70 000000067F00004002000180000000090000-000000067F00004002000180000000094000__000000931B9AFDF8 000000067F00004002000180000000094000-000000067F00004002000180000000098000__000000926240EF70 000000067F00004002000180000000094000-000000067F00004002000180000000098000__000000931B9AFDF8 000000067F000040020001800000000978C1-000000067F0000400200018000000009F844__000000919CCE8B21-000000921B6384B9 000000067F00004002000180000000098000-000000067F0000400200018000000009C000__000000926240EF70 000000067F00004002000180000000098000-000000067F0000400200018000000009C000__000000931B9AFDF8 000000067F0000400200018000000009C000-000000067F000040020001800000000A0000__000000926240EF70 000000067F0000400200018000000009C000-000000067F000040020001800000000A0000__000000931B9AFDF8 000000067F0000400200018000000009F844-000000067F000040020001800000000A77C7__000000919CCE8B21-000000921B6384B9 000000067F000040020001800000000A0000-000000067F000040020001800000000A4000__000000926240EF70 000000067F000040020001800000000A0000-000000067F000040020001800000000A4000__000000931B9AFDF8 000000067F000040020001800000000A4000-000000067F000040020001800000000A8000__000000926240EF70 000000067F000040020001800000000A4000-000000067F000040020001800000000A8000__000000931B9A2710 000000067F000040020001800000000A77C7-000000067F00004002000180000100000000__000000919CCE8B21-000000921B6384B9 000000067F000040020001800000000A7AE0-000000067F000040020001800000000AFA63__000000921B6384B9-00000092D346E5E9 000000067F000040020001800000000A8000-000000067F000040020001800000000AC000__000000926240EF70 000000067F000040020001800000000A8000-000000067F000040020001800000000AC000__000000931B9A2710 000000067F000040020001800000000AC000-000000067F000040020001800000000B0000__000000926240EF70 
000000067F000040020001800000000AC000-000000067F000040020001800000000B0000__000000931B9A2710 000000067F000040020001800000000AFA63-000000067F000040020001800000000B79E6__000000921B6384B9-00000092D346E5E9 000000067F000040020001800000000B0000-000000067F000040020001800000000B4000__000000926240EF70 000000067F000040020001800000000B0000-000000067F000040020001800000000B4000__000000931B9A2710 000000067F000040020001800000000B4000-000000067F000040020001800000000B8000__000000926240EF70 000000067F000040020001800000000B4000-000000067F000040020001800000000B8000__000000931B9A2710 000000067F000040020001800000000B79E6-000000067F000040020001800000000BF969__000000921B6384B9-00000092D346E5E9 000000067F000040020001800000000B8000-000000067F000040020001800000000BC000__000000926240EF70 000000067F000040020001800000000B8000-000000067F000040020001800000000BC000__000000931B9A2710 000000067F000040020001800000000BC000-000000067F000040020001800000000C0000__000000926240EF70 000000067F000040020001800000000BC000-000000067F000040020001800000000C0000__000000931B9A2710 000000067F000040020001800000000BF969-000000067F000040020001800000000C78EC__000000921B6384B9-00000092D346E5E9 000000067F000040020001800000000C0000-000000067F000040020001800000000C4000__000000926240EF70 000000067F000040020001800000000C0000-000000067F000040020001800000000C4000__000000931B9A2710 000000067F000040020001800000000C4000-000000067F000040020001800000000C8000__000000926240EF70 000000067F000040020001800000000C4000-000000067F000040020001800000000C8000__000000931B9A2710 000000067F000040020001800000000C78EC-000000067F000040020001800000000CF86F__000000921B6384B9-00000092D346E5E9 000000067F000040020001800000000C8000-000000067F000040020001800000000CC000__000000926240EF70 000000067F000040020001800000000C8000-000000067F000040020001800000000CC000__000000931B9A2710 000000067F000040020001800000000CC000-000000067F000040020001800000000D0000__000000926240EF70 000000067F000040020001800000000CC000-000000067F000040020001800000000D0000__000000931B9A2710 
000000067F000040020001800000000CF86F-000000067F000040020001800000000D77F2__000000921B6384B9-00000092D346E5E9 000000067F000040020001800000000D0000-000000067F000040020001800000000D4000__000000926240EF70 000000067F000040020001800000000D0000-000000067F000040020001800000000D4000__000000931B9A2710 000000067F000040020001800000000D4000-000000067F000040020001800000000D8000__000000926240EF70 000000067F000040020001800000000D4000-000000067F000040020001800000000D8000__000000931B9A2710 000000067F000040020001800000000D77F2-000000067F000040020001800000000DF775__000000921B6384B9-00000092D346E5E9 000000067F000040020001800000000D8000-000000067F000040020001800000000DC000__000000926240EF70 000000067F000040020001800000000D8000-000000067F000040020001800000000DC000__000000931B9A2710 000000067F000040020001800000000DC000-000000067F000040020001800000000E0000__000000926240EF70 000000067F000040020001800000000DC000-000000067F000040020001800000000E0000__000000931B9A2710 000000067F000040020001800000000DF775-000000067F000040020001800000000E76F8__000000921B6384B9-00000092D346E5E9 000000067F000040020001800000000E0000-000000067F000040020001800000000E4000__000000926240EF70 000000067F000040020001800000000E0000-000000067F000040020001800000000E4000__000000931B9A2710 000000067F000040020001800000000E4000-000000067F000040020001800000000E8000__000000931B9A2710 000000067F000040020001800000000E4000-030000000000000000000000000000000002__000000926240EF70 000000067F000040020001800000000E76F8-000000067F000040020001800000000EF67B__000000921B6384B9-00000092D346E5E9 000000067F000040020001800000000E8000-000000067F000040020001800000000EC000__000000931B9A2710 000000067F000040020001800000000EC000-000000067F000040020001800000000F0000__000000931B9A2710 000000067F000040020001800000000EF67B-000000067F000040020001800000000F75FE__000000921B6384B9-00000092D346E5E9 000000067F000040020001800000000F0000-000000067F000040020001800000000F4000__000000931B9A2710 
000000067F000040020001800000000F4000-000000067F000040020001800000000F8000__000000931B9A2710 000000067F000040020001800000000F75FE-000000067F000040020001800000000FF581__000000921B6384B9-00000092D346E5E9 000000067F000040020001800000000F8000-000000067F000040020001800000000FC000__000000931B9A2710 000000067F000040020001800000000FC000-000000067F00004002000180000000100000__000000931B9A2710 000000067F000040020001800000000FF581-000000067F00004002000180000000107504__000000921B6384B9-00000092D346E5E9 000000067F00004002000180000000100000-000000067F00004002000180000000104000__000000931B9A2710 000000067F00004002000180000000104000-000000067F00004002000180000000108000__000000931B9A2710 000000067F00004002000180000000107504-000000067F0000400200018000000010F487__000000921B6384B9-00000092D346E5E9 000000067F00004002000180000000108000-000000067F0000400200018000000010C000__000000931B9A2710 000000067F0000400200018000000010C000-000000067F00004002000180000000110000__000000931B9A2710 000000067F0000400200018000000010F487-000000067F0000400200018000000011740A__000000921B6384B9-00000092D346E5E9 000000067F00004002000180000000110000-000000067F00004002000180000000114000__000000931B9A2710 000000067F00004002000180000000114000-000000067F00004002000180000000118000__000000931B9A2710 000000067F0000400200018000000011740A-000000067F0000400200018000000011F38D__000000921B6384B9-00000092D346E5E9 000000067F00004002000180000000118000-000000067F0000400200018000000011C000__000000931B9A2710 000000067F0000400200018000000011C000-000000067F00004002000180000000120000__000000931B9A2710 000000067F0000400200018000000011F38D-000000067F00004002000180000000127310__000000921B6384B9-00000092D346E5E9 000000067F00004002000180000000120000-000000067F00004002000180000000124000__000000931B9A2710 000000067F00004002000180000000124000-000000067F00004002000180000000128000__000000931B9A2710 000000067F00004002000180000000127310-000000067F0000400200018000000012F293__000000921B6384B9-00000092D346E5E9 
000000067F00004002000180000000128000-000000067F0000400200018000000012C000__000000931B9A2710 000000067F0000400200018000000012C000-000000067F00004002000180000000130000__000000931B9A2710 000000067F0000400200018000000012F293-000000067F00004002000180000000137216__000000921B6384B9-00000092D346E5E9 000000067F00004002000180000000130000-000000067F00004002000180000000134000__000000931B9A2710 000000067F00004002000180000000134000-000000067F00004002000180000000138000__000000931B9A2710 000000067F00004002000180000000137216-000000067F0000400200018000000013F199__000000921B6384B9-00000092D346E5E9 000000067F00004002000180000000138000-000000067F0000400200018000000013C000__000000931B9A2710 000000067F0000400200018000000013C000-000000067F00004002000180000000140000__000000931B9A2710 000000067F0000400200018000000013F199-000000067F0000400200018000000014711C__000000921B6384B9-00000092D346E5E9 000000067F00004002000180000000140000-000000067F00004002000180000000144000__000000931B9A2710 000000067F00004002000180000000144000-000000067F00004002000180000000148000__000000931B9A2710 000000067F0000400200018000000014711C-000000067F00004002000180000100000000__000000921B6384B9-00000092D346E5E9 000000067F00004002000180000000148000-000000067F0000400200018000000014C000__000000931B9A2710 000000067F0000400200018000000014C000-000000067F00004002000180000000150000__000000931B9A2710 000000067F0000400200018000000014F52F-000000067F000040020001800000001574B2__00000092D346E5E9-000000931B991E09 000000067F00004002000180000000150000-000000067F00004002000180000000154000__000000931B9A2710 000000067F00004002000180000000154000-000000067F00004002000180000000158000__000000931B9A2710 000000067F000040020001800000001574B2-000000067F0000400200018000000015F435__00000092D346E5E9-000000931B991E09 000000067F00004002000180000000158000-000000067F0000400200018000000015C000__000000931B9A2710 000000067F0000400200018000000015C000-000000067F00004002000180000000160000__000000931B9A2710 
000000067F0000400200018000000015F435-000000067F000040020001800000001673B8__00000092D346E5E9-000000931B991E09 000000067F00004002000180000000160000-000000067F00004002000180000000164000__000000931B9A2710 000000067F00004002000180000000164000-000000067F00004002000180000000168000__000000931B9A2710 000000067F000040020001800000001673B8-000000067F0000400200018000000016F33B__00000092D346E5E9-000000931B991E09 000000067F00004002000180000000168000-000000067F0000400200018000000016C000__000000931B9A2710 000000067F0000400200018000000016C000-000000067F00004002000180000000170000__000000931B9A2710 000000067F0000400200018000000016F33B-000000067F000040020001800000001772BE__00000092D346E5E9-000000931B991E09 000000067F00004002000180000000170000-000000067F00004002000180000000174000__000000931B9A2710 000000067F00004002000180000000174000-000000067F00004002000180000000178000__000000931B9A2710 000000067F000040020001800000001772BE-000000067F0000400200018000000017F241__00000092D346E5E9-000000931B991E09 000000067F00004002000180000000178000-000000067F0000400200018000000017C000__000000931B9A2710 000000067F0000400200018000000017C000-000000067F00004002000180000000180000__000000931B9A2710 000000067F0000400200018000000017F241-030000000000000000000000000000000002__00000092D346E5E9-000000931B991E09 000000067F00004002000180000000180000-000000067F00004002000180000000184000__000000931B9A2710 000000067F00004002000180000000184000-030000000000000000000000000000000002__000000931B9A2710 ================================================ FILE: pageserver/benches/upload_queue.rs ================================================ //! Upload queue benchmarks. 
use std::str::FromStr as _; use std::sync::Arc; use std::sync::atomic::AtomicU32; use criterion::{Bencher, Criterion, criterion_group, criterion_main}; use pageserver::tenant::IndexPart; use pageserver::tenant::metadata::TimelineMetadata; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask}; use pprof::criterion::{Output, PProfProfiler}; use utils::generation::Generation; use utils::shard::{ShardCount, ShardIndex, ShardNumber}; // Register benchmarks with Criterion. criterion_group!( name = benches; config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); targets = bench_upload_queue_next_ready, ); criterion_main!(benches); /// Benchmarks the cost of UploadQueue::next_ready() with the given number of in-progress tasks /// (which is equivalent to tasks ahead of it in the queue). This has linear cost, and the upload /// queue as a whole is thus quadratic. /// /// UploadOp::UploadLayer requires an entire tenant and timeline to construct, so we just test /// Delete and UploadMetadata instead. This is incidentally the most expensive case. fn bench_upload_queue_next_ready(c: &mut Criterion) { let mut g = c.benchmark_group("upload_queue_next_ready"); for inprogress in [0, 1, 10, 100, 1_000, 10_000, 100_000, 1_000_000] { g.bench_function(format!("inprogress={inprogress}"), |b| { run_bench(b, inprogress).unwrap() }); } fn run_bench(b: &mut Bencher, inprogress: usize) -> anyhow::Result<()> { // Construct two layers. layer0 is in the indexes, layer1 will be deleted. 
let layer0 = LayerName::from_str("000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name"); let layer1 = LayerName::from_str("100000000000000000000000000000000001-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name"); let metadata = LayerFileMetadata { shard: ShardIndex::new(ShardNumber(1), ShardCount(2)), generation: Generation::Valid(1), file_size: 0, }; // Construct the (initial and uploaded) index with layer0. let mut index = IndexPart::empty(TimelineMetadata::example()); index.layer_metadata.insert(layer0, metadata.clone()); // Construct the queue. let mut queue = UploadQueue::Uninitialized; let queue = queue.initialize_with_current_remote_index_part(&index, 0)?; // Populate inprogress_tasks with a bunch of layer1 deletions. let delete = UploadOp::Delete(Delete { layers: vec![(layer1, metadata)], }); for task_id in 0..(inprogress as u64) { queue.inprogress_tasks.insert( task_id, Arc::new(UploadTask { task_id, retries: AtomicU32::new(0), op: delete.clone(), coalesced_ops: Vec::new(), }), ); } // Benchmark index upload scheduling. 
let index_upload = UploadOp::UploadMetadata { uploaded: Box::new(index), }; b.iter(|| { queue.queued_operations.push_front(index_upload.clone()); assert!(queue.next_ready().is_some()); }); Ok(()) } } ================================================ FILE: pageserver/client/Cargo.toml ================================================ [package] name = "pageserver_client" version = "0.1.0" edition.workspace = true license.workspace = true [features] testing = [ "pageserver_api/testing" ] [dependencies] pageserver_api.workspace = true thiserror.workspace = true reqwest = { workspace = true, features = [ "stream" ] } http-utils.workspace = true utils.workspace = true serde.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } tokio-postgres.workspace = true tokio-stream.workspace = true tokio.workspace = true postgres_versioninfo.workspace = true futures.workspace = true tokio-util.workspace = true anyhow.workspace = true bytes.workspace = true ================================================ FILE: pageserver/client/src/lib.rs ================================================ pub mod mgmt_api; pub mod page_service; /// For timeline_block_unblock_gc, distinguish the two different operations. This could be a bool. // If file structure is per-kind not per-feature then where to put this? #[derive(Clone, Copy)] pub enum BlockUnblock { Block, Unblock, } impl std::fmt::Display for BlockUnblock { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let s = match self { BlockUnblock::Block => "block", BlockUnblock::Unblock => "unblock", }; f.write_str(s) } } ================================================ FILE: pageserver/client/src/mgmt_api/util.rs ================================================ //! Helpers to do common higher-level tasks with the [`Client`]. 
use std::sync::Arc; use pageserver_api::shard::TenantShardId; use tokio::task::JoinSet; use utils::id::{TenantId, TenantTimelineId}; use super::Client; /// Retrieve a list of all of the pageserver's timelines. /// /// Fails if there are sharded tenants present on the pageserver. pub async fn get_pageserver_tenant_timelines_unsharded( api_client: &Arc, ) -> anyhow::Result> { let mut timelines: Vec = Vec::new(); let mut tenants: Vec = Vec::new(); for ti in api_client.list_tenants().await? { if !ti.id.is_unsharded() { anyhow::bail!( "only unsharded tenants are supported at this time: {}", ti.id ); } tenants.push(ti.id.tenant_id) } let mut js = JoinSet::new(); for tenant_id in tenants { js.spawn({ let mgmt_api_client = Arc::clone(api_client); async move { ( tenant_id, mgmt_api_client .tenant_details(TenantShardId::unsharded(tenant_id)) .await .unwrap(), ) } }); } while let Some(res) = js.join_next().await { let (tenant_id, details) = res.unwrap(); for timeline_id in details.timelines { timelines.push(TenantTimelineId { tenant_id, timeline_id, }); } } Ok(timelines) } ================================================ FILE: pageserver/client/src/mgmt_api.rs ================================================ use std::collections::{BTreeMap, HashMap}; use std::error::Error as _; use std::time::Duration; use bytes::Bytes; use detach_ancestor::AncestorDetached; use http_utils::error::HttpErrorBody; use pageserver_api::models::*; use pageserver_api::shard::TenantShardId; use postgres_versioninfo::PgMajorVersion; pub use reqwest::Body as ReqwestBody; use reqwest::{IntoUrl, Method, StatusCode, Url}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use crate::BlockUnblock; pub mod util; #[derive(Debug, Clone)] pub struct Client { mgmt_api_endpoint: String, authorization_header: Option, client: reqwest::Client, } #[derive(thiserror::Error, Debug)] pub enum Error { #[error("send request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] 
SendRequest(reqwest::Error), #[error("receive body: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] ReceiveBody(reqwest::Error), #[error("receive error body: {0}")] ReceiveErrorBody(String), #[error("pageserver API: {1}")] ApiError(StatusCode, String), #[error("Cancelled")] Cancelled, #[error("request timed out: {0}")] Timeout(String), } pub type Result = std::result::Result; pub trait ResponseErrorMessageExt: Sized { fn error_from_body(self) -> impl std::future::Future> + Send; } impl ResponseErrorMessageExt for reqwest::Response { async fn error_from_body(self) -> Result { let status = self.status(); if !(status.is_client_error() || status.is_server_error()) { return Ok(self); } let url = self.url().to_owned(); Err(match self.json::().await { Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), Err(_) => { Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), url)) } }) } } pub enum ForceAwaitLogicalSize { Yes, No, } impl Client { pub fn new(client: reqwest::Client, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self { Self { mgmt_api_endpoint, authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")), client, } } pub async fn list_tenants(&self) -> Result> { let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint); let resp = self.get(&uri).await?; resp.json().await.map_err(Error::ReceiveBody) } /// Send an HTTP request to an arbitrary path with a desired HTTP method and returning a streaming /// Response. This function is suitable for pass-through/proxy use cases where we don't care /// what the response content looks like. /// /// Use/add one of the properly typed methods below if you know you aren't proxying, and /// know what kind of response you expect. // NOTE(review): `op_raw` reports `send()` failures as `Error::ReceiveBody`, not `Error::SendRequest` -- confirm this is intentional.
pub async fn op_raw(&self, method: Method, path: String) -> Result { debug_assert!(path.starts_with('/')); let uri = format!("{}{}", self.mgmt_api_endpoint, path); let mut req = self.client.request(method, uri); if let Some(value) = &self.authorization_header { req = req.header(reqwest::header::AUTHORIZATION, value); } req.send().await.map_err(Error::ReceiveBody) } pub async fn tenant_details( &self, tenant_shard_id: TenantShardId, ) -> Result { let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint); self.get(uri) .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn list_timelines( &self, tenant_shard_id: TenantShardId, ) -> Result> { let uri = format!( "{}/v1/tenant/{tenant_shard_id}/timeline", self.mgmt_api_endpoint ); self.get(&uri) .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn timeline_info( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, force_await_logical_size: ForceAwaitLogicalSize, ) -> Result { let uri = format!( "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}", self.mgmt_api_endpoint ); let uri = match force_await_logical_size { ForceAwaitLogicalSize::Yes => format!("{}?force-await-logical-size={}", uri, true), ForceAwaitLogicalSize::No => uri, }; self.get(&uri) .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn keyspace( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result { let uri = format!( "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace", self.mgmt_api_endpoint ); self.get(&uri) .await? 
.json() .await .map_err(Error::ReceiveBody) } async fn get(&self, uri: U) -> Result { self.request(Method::GET, uri, ()).await } fn start_request( &self, method: Method, uri: U, ) -> reqwest::RequestBuilder { let req = self.client.request(method, uri); if let Some(value) = &self.authorization_header { req.header(reqwest::header::AUTHORIZATION, value) } else { req } } async fn request_noerror( &self, method: Method, uri: U, body: B, ) -> Result { self.start_request(method, uri) .json(&body) .send() .await .map_err(Error::ReceiveBody) } async fn request( &self, method: Method, uri: U, body: B, ) -> Result { let res = self.request_noerror(method, uri, body).await?; let response = res.error_from_body().await?; Ok(response) } pub async fn status(&self) -> Result<()> { let uri = format!("{}/v1/status", self.mgmt_api_endpoint); self.get(&uri).await?; Ok(()) } /// The tenant deletion API can return 202 if deletion is incomplete, or /// 404 if it is complete. Callers are responsible for checking the status /// code and retrying. Error codes other than 404 will return Err(). 
pub async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result { let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint); match self.request(Method::DELETE, &uri, ()).await { Err(Error::ApiError(status_code, msg)) => { if status_code == StatusCode::NOT_FOUND { Ok(StatusCode::NOT_FOUND) } else { Err(Error::ApiError(status_code, msg)) } } Err(e) => Err(e), Ok(response) => Ok(response.status()), } } pub async fn tenant_time_travel_remote_storage( &self, tenant_shard_id: TenantShardId, timestamp: &str, done_if_after: &str, ) -> Result<()> { let uri = format!( "{}/v1/tenant/{tenant_shard_id}/time_travel_remote_storage?travel_to={timestamp}&done_if_after={done_if_after}", self.mgmt_api_endpoint ); self.request(Method::PUT, &uri, ()).await?; Ok(()) } pub async fn tenant_timeline_compact( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, force_image_layer_creation: bool, must_force_image_layer_creation: bool, scheduled: bool, wait_until_done: bool, ) -> Result<()> { let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/compact", self.mgmt_api_endpoint )) .expect("Cannot build URL"); if force_image_layer_creation { path.query_pairs_mut() .append_pair("force_image_layer_creation", "true"); } if must_force_image_layer_creation { path.query_pairs_mut() .append_pair("must_force_image_layer_creation", "true"); } if scheduled { path.query_pairs_mut().append_pair("scheduled", "true"); } if wait_until_done { path.query_pairs_mut() .append_pair("wait_until_scheduled_compaction_done", "true"); path.query_pairs_mut() .append_pair("wait_until_uploaded", "true"); } self.request(Method::PUT, path, ()).await?; Ok(()) } /* BEGIN_HADRON */ pub async fn tenant_timeline_describe( &self, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, ) -> Result { let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}", self.mgmt_api_endpoint )) .expect("Cannot 
build URL"); path.query_pairs_mut() .append_pair("include-image-consistent-lsn", "true"); let response: reqwest::Response = self.request(Method::GET, path, ()).await?; let body = response.json().await.map_err(Error::ReceiveBody)?; Ok(body) } pub async fn list_tenant_visible_size(&self) -> Result> { let uri = format!("{}/v1/list_tenant_visible_size", self.mgmt_api_endpoint); let resp = self.get(&uri).await?; resp.json().await.map_err(Error::ReceiveBody) } /* END_HADRON */ pub async fn tenant_scan_remote_storage( &self, tenant_id: TenantId, ) -> Result { let uri = format!( "{}/v1/tenant/{tenant_id}/scan_remote_storage", self.mgmt_api_endpoint ); let response = self.request(Method::GET, &uri, ()).await?; let body = response.json().await.map_err(Error::ReceiveBody)?; Ok(body) } pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); self.request(Method::PUT, &uri, req).await?; Ok(()) } pub async fn patch_tenant_config(&self, req: &TenantConfigPatchRequest) -> Result<()> { let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); self.request(Method::PATCH, &uri, req).await?; Ok(()) } pub async fn tenant_secondary_download( &self, tenant_id: TenantShardId, wait: Option, ) -> Result<(StatusCode, SecondaryProgress)> { let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/secondary/download", self.mgmt_api_endpoint, tenant_id )) .expect("Cannot build URL"); if let Some(wait) = wait { path.query_pairs_mut() .append_pair("wait_ms", &format!("{}", wait.as_millis())); } let response = self.request(Method::POST, path, ()).await?; let status = response.status(); let progress: SecondaryProgress = response.json().await.map_err(Error::ReceiveBody)?; Ok((status, progress)) } pub async fn tenant_secondary_status( &self, tenant_shard_id: TenantShardId, ) -> Result { let path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/secondary/status", self.mgmt_api_endpoint, 
tenant_shard_id )) .expect("Cannot build URL"); self.request(Method::GET, path, ()) .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> { let path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/heatmap_upload", self.mgmt_api_endpoint, tenant_id )) .expect("Cannot build URL"); self.request(Method::POST, path, ()).await?; Ok(()) } pub async fn location_config( &self, tenant_shard_id: TenantShardId, config: LocationConfig, flush_ms: Option, lazy: bool, ) -> Result<()> { let req_body = TenantLocationConfigRequest { config }; let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/location_config", self.mgmt_api_endpoint, tenant_shard_id )) // Should always work: mgmt_api_endpoint is configuration, not user input. .expect("Cannot build URL"); if lazy { path.query_pairs_mut().append_pair("lazy", "true"); } if let Some(flush_ms) = flush_ms { path.query_pairs_mut() .append_pair("flush_ms", &format!("{}", flush_ms.as_millis())); } self.request(Method::PUT, path, &req_body).await?; Ok(()) } pub async fn list_location_config(&self) -> Result { let path = format!("{}/v1/location_config", self.mgmt_api_endpoint); self.request(Method::GET, &path, ()) .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn get_location_config( &self, tenant_shard_id: TenantShardId, ) -> Result> { let path = format!( "{}/v1/location_config/{tenant_shard_id}", self.mgmt_api_endpoint ); self.request(Method::GET, &path, ()) .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn timeline_create( &self, tenant_shard_id: TenantShardId, req: &TimelineCreateRequest, ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline", self.mgmt_api_endpoint, tenant_shard_id ); self.request(Method::POST, &uri, req) .await? .json() .await .map_err(Error::ReceiveBody) } /// The timeline deletion API can return 202 if deletion is incomplete, or /// 404 if it is complete.
Callers are responsible for checking the status /// code and retrying. Error codes other than 404 will return Err(). // NOTE(review): 404 matches the NOT_FOUND check below; the 202 in-progress status is assumed by analogy with tenant deletion -- confirm against the pageserver handler. pub async fn timeline_delete( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result { let uri = format!( "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}", self.mgmt_api_endpoint ); match self.request(Method::DELETE, &uri, ()).await { Err(Error::ApiError(status_code, msg)) => { if status_code == StatusCode::NOT_FOUND { Ok(StatusCode::NOT_FOUND) } else { Err(Error::ApiError(status_code, msg)) } } Err(e) => Err(e), Ok(response) => Ok(response.status()), } } pub async fn timeline_detail( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result { let uri = format!( "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}", self.mgmt_api_endpoint ); self.request(Method::GET, &uri, ()) .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn timeline_archival_config( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, req: &TimelineArchivalConfigRequest, ) -> Result<()> { let uri = format!( "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config", self.mgmt_api_endpoint ); self.request(Method::PUT, &uri, req) .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn timeline_detach_ancestor( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, behavior: Option, ) -> Result { let uri = format!( "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor", self.mgmt_api_endpoint ); let mut uri = Url::parse(&uri) .map_err(|e| Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}")))?; if let Some(behavior) = behavior { uri.query_pairs_mut() .append_pair("detach_behavior", &behavior.to_string()); } self.request(Method::PUT, uri, ()) .await?
.json() .await .map_err(Error::ReceiveBody) } pub async fn timeline_block_unblock_gc( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, dir: BlockUnblock, ) -> Result<()> { let uri = format!( "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/{dir}_gc", self.mgmt_api_endpoint, ); self.request(Method::POST, &uri, ()).await.map(|_| ()) } pub async fn timeline_download_heatmap_layers( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, concurrency: Option, recurse: bool, ) -> Result<()> { let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/timeline/{}/download_heatmap_layers", self.mgmt_api_endpoint, tenant_shard_id, timeline_id )) .expect("Cannot build URL"); path.query_pairs_mut() .append_pair("recurse", &format!("{recurse}")); if let Some(concurrency) = concurrency { path.query_pairs_mut() .append_pair("concurrency", &format!("{concurrency}")); } self.request(Method::POST, path, ()).await.map(|_| ()) } pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> { let uri = format!( "{}/v1/tenant/{}/reset", self.mgmt_api_endpoint, tenant_shard_id ); self.request(Method::POST, &uri, ()) .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn tenant_shard_split( &self, tenant_shard_id: TenantShardId, req: TenantShardSplitRequest, ) -> Result { let uri = format!( "{}/v1/tenant/{}/shard_split", self.mgmt_api_endpoint, tenant_shard_id ); self.request(Method::PUT, &uri, req) .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn timeline_list( &self, tenant_shard_id: &TenantShardId, ) -> Result> { let uri = format!( "{}/v1/tenant/{}/timeline", self.mgmt_api_endpoint, tenant_shard_id ); self.get(&uri) .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn tenant_synthetic_size( &self, tenant_shard_id: TenantShardId, ) -> Result { let uri = format!( "{}/v1/tenant/{}/synthetic_size", self.mgmt_api_endpoint, tenant_shard_id ); self.get(&uri) .await? 
.json() .await .map_err(Error::ReceiveBody) } pub async fn put_io_engine( &self, engine: &pageserver_api::models::virtual_file::IoEngineKind, ) -> Result<()> { let uri = format!("{}/v1/io_engine", self.mgmt_api_endpoint); self.request(Method::PUT, uri, engine) .await? .json() .await .map_err(Error::ReceiveBody) } /// Configs io mode at runtime. pub async fn put_io_mode( &self, mode: &pageserver_api::models::virtual_file::IoMode, ) -> Result<()> { let uri = format!("{}/v1/io_mode", self.mgmt_api_endpoint); self.request(Method::PUT, uri, mode) .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn get_utilization(&self) -> Result { let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint); self.get(uri) .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn top_tenant_shards( &self, request: TopTenantShardsRequest, ) -> Result { let uri = format!("{}/v1/top_tenants", self.mgmt_api_endpoint); self.request(Method::POST, uri, request) .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn layer_map_info( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline/{}/layer", self.mgmt_api_endpoint, tenant_shard_id, timeline_id, ); self.get(&uri) .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn layer_evict( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, layer_file_name: &str, ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline/{}/layer/{}", self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name ); let resp = self.request_noerror(Method::DELETE, &uri, ()).await?; match resp.status() { StatusCode::OK => Ok(true), StatusCode::NOT_MODIFIED => Ok(false), // TODO: dedupe this pattern / introduce separate error variant? 
status => Err(match resp.json::().await { Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), Err(_) => { Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) } }), } } pub async fn layer_ondemand_download( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, layer_file_name: &str, ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline/{}/layer/{}", self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name ); let resp = self.request_noerror(Method::GET, &uri, ()).await?; match resp.status() { StatusCode::OK => Ok(true), StatusCode::NOT_MODIFIED => Ok(false), // TODO: dedupe this pattern / introduce separate error variant? status => Err(match resp.json::().await { Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), Err(_) => { Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) } }), } } pub async fn ingest_aux_files( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, aux_files: HashMap, ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline/{}/ingest_aux_files", self.mgmt_api_endpoint, tenant_shard_id, timeline_id ); let resp = self .request_noerror(Method::POST, &uri, IngestAuxFilesRequest { aux_files }) .await?; match resp.status() { StatusCode::OK => Ok(true), status => Err(match resp.json::().await { Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), Err(_) => { Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) } }), } } pub async fn list_aux_files( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, lsn: Lsn, ) -> Result> { let uri = format!( "{}/v1/tenant/{}/timeline/{}/list_aux_files", self.mgmt_api_endpoint, tenant_shard_id, timeline_id ); let resp = self .request_noerror(Method::POST, &uri, ListAuxFilesRequest { lsn }) .await?; match resp.status() { StatusCode::OK => { let resp: HashMap = resp.json().await.map_err(|e| { Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}")) })?; 
Ok(resp) } status => Err(match resp.json::().await { Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), Err(_) => { Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) } }), } } pub async fn import_basebackup( &self, tenant_id: TenantId, timeline_id: TimelineId, base_lsn: Lsn, end_lsn: Lsn, pg_version: PgMajorVersion, basebackup_tarball: ReqwestBody, ) -> Result<()> { let pg_version = pg_version.major_version_num(); let uri = format!( "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}", self.mgmt_api_endpoint, ); self.start_request(Method::PUT, uri) .body(basebackup_tarball) .send() .await .map_err(Error::SendRequest)? .error_from_body() .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn import_wal( &self, tenant_id: TenantId, timeline_id: TimelineId, start_lsn: Lsn, end_lsn: Lsn, wal_tarball: ReqwestBody, ) -> Result<()> { let uri = format!( "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_wal?start_lsn={start_lsn}&end_lsn={end_lsn}", self.mgmt_api_endpoint, ); self.start_request(Method::PUT, uri) .body(wal_tarball) .send() .await .map_err(Error::SendRequest)? .error_from_body() .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn timeline_init_lsn_lease( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, lsn: Lsn, ) -> Result { let uri = format!( "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease", self.mgmt_api_endpoint, ); self.request(Method::POST, &uri, LsnLeaseRequest { lsn }) .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn reset_alert_gauges(&self) -> Result<()> { let uri = format!( "{}/hadron-internal/reset_alert_gauges", self.mgmt_api_endpoint ); self.start_request(Method::POST, uri) .send() .await .map_err(Error::SendRequest)? .error_from_body() .await? 
.json() .await .map_err(Error::ReceiveBody) } pub async fn wait_lsn( &self, tenant_shard_id: TenantShardId, request: TenantWaitLsnRequest, ) -> Result { let uri = format!( "{}/v1/tenant/{tenant_shard_id}/wait_lsn", self.mgmt_api_endpoint, ); self.request_noerror(Method::POST, uri, request) .await .map(|resp| resp.status()) } pub async fn activate_post_import( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, activate_timeline_timeout: Duration, ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline/{}/activate_post_import?timeline_activate_timeout_ms={}", self.mgmt_api_endpoint, tenant_shard_id, timeline_id, activate_timeline_timeout.as_millis() ); self.request(Method::PUT, uri, ()) .await? .json() .await .map_err(Error::ReceiveBody) } pub async fn update_feature_flag_spec(&self, spec: String) -> Result<()> { let uri = format!("{}/v1/feature_flag_spec", self.mgmt_api_endpoint); self.request(Method::POST, uri, spec) .await? .json() .await .map_err(Error::ReceiveBody) } } ================================================ FILE: pageserver/client/src/page_service.rs ================================================ use std::sync::{Arc, Mutex}; use futures::stream::{SplitSink, SplitStream}; use futures::{SinkExt, StreamExt}; use pageserver_api::pagestream_api::{ PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, }; use pageserver_api::reltag::RelTag; use tokio::task::JoinHandle; use tokio_postgres::CopyOutStream; use tokio_util::sync::CancellationToken; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; pub struct Client { client: tokio_postgres::Client, cancel_on_client_drop: Option, conn_task: JoinHandle<()>, } pub struct BasebackupRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub lsn: Option, pub gzip: bool, } impl Client { pub async fn new(connstring: String) -> anyhow::Result { let (client, connection) = tokio_postgres::connect(&connstring, tokio_postgres::NoTls).await?; let 
conn_task_cancel = CancellationToken::new(); let conn_task = tokio::spawn({ let conn_task_cancel = conn_task_cancel.clone(); async move { tokio::select! { _ = conn_task_cancel.cancelled() => { } res = connection => { res.unwrap(); } } } }); Ok(Self { cancel_on_client_drop: Some(conn_task_cancel.drop_guard()), conn_task, client, }) } pub async fn pagestream( self, tenant_id: TenantId, timeline_id: TimelineId, ) -> anyhow::Result { let copy_both: tokio_postgres::CopyBothDuplex = self .client .copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}")) .await?; let (sink, stream) = copy_both.split(); // TODO: actually support splitting of the CopyBothDuplex so the lock inside this split adaptor goes away. let Client { cancel_on_client_drop, conn_task, client: _, } = self; let shared = Arc::new(Mutex::new(PagestreamShared::ConnTaskRunning( ConnTaskRunning { cancel_on_client_drop, conn_task, }, ))); Ok(PagestreamClient { sink: PagestreamSender { shared: shared.clone(), sink, }, stream: PagestreamReceiver { shared: shared.clone(), stream, }, shared, }) } pub async fn basebackup(&self, req: &BasebackupRequest) -> anyhow::Result { let BasebackupRequest { tenant_id, timeline_id, lsn, gzip, } = req; let mut args = Vec::with_capacity(5); args.push("basebackup".to_string()); args.push(format!("{tenant_id}")); args.push(format!("{timeline_id}")); if let Some(lsn) = lsn { args.push(format!("{lsn}")); } if *gzip { args.push("--gzip".to_string()) } Ok(self.client.copy_out(&args.join(" ")).await?) } } /// Create using [`Client::pagestream`]. 
pub struct PagestreamClient { shared: Arc>, sink: PagestreamSender, stream: PagestreamReceiver, } pub struct PagestreamSender { #[allow(dead_code)] shared: Arc>, sink: SplitSink, bytes::Bytes>, } pub struct PagestreamReceiver { #[allow(dead_code)] shared: Arc>, stream: SplitStream>, } enum PagestreamShared { ConnTaskRunning(ConnTaskRunning), ConnTaskCancelledJoinHandleReturnedOrDropped, } struct ConnTaskRunning { cancel_on_client_drop: Option, conn_task: JoinHandle<()>, } pub struct RelTagBlockNo { pub rel_tag: RelTag, pub block_no: u32, } impl PagestreamClient { pub async fn shutdown(self) { let Self { shared, sink, stream, } = { self }; // The `copy_both` split into `sink` and `stream` contains an internal channel sender, the receiver of which is polled by `conn_task`. // When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection. // (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56). // // If we drop(copy_both) first, but then immediately drop the `cancel_on_client_drop`, // the CopyFail message only makes it to the socket sometimes (i.e., it's a race). // // Further, the pageserver makes a lot of noise when it receives CopyFail. // Computes don't send it in practice, they just hard-close the connection. // // So, let's behave like the computes and suppress the CopyFail as follows: // kill the socket first, then drop copy_both. // // See also: https://www.postgresql.org/docs/current/protocol-flow.html#PROTOCOL-COPY // // NB: page_service doesn't have a use case to exit the `pagestream` mode currently.
// => https://github.com/neondatabase/neon/issues/6390 let ConnTaskRunning { cancel_on_client_drop, conn_task, } = { let mut guard = shared.lock().unwrap(); match std::mem::replace( &mut *guard, PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped, ) { PagestreamShared::ConnTaskRunning(conn_task_running) => conn_task_running, PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped => unreachable!(), } }; let _ = cancel_on_client_drop.unwrap(); conn_task.await.unwrap(); // Now drop the split copy_both. drop(sink); drop(stream); } pub fn split(self) -> (PagestreamSender, PagestreamReceiver) { let Self { shared: _, sink, stream, } = self; (sink, stream) } pub async fn getpage( &mut self, req: PagestreamGetPageRequest, ) -> anyhow::Result { self.getpage_send(req).await?; self.getpage_recv().await } pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { self.sink.getpage_send(req).await } pub async fn getpage_recv(&mut self) -> anyhow::Result { self.stream.getpage_recv().await } } impl PagestreamSender { // TODO: maybe make this impl Sink instead for better composability? pub async fn send(&mut self, msg: PagestreamFeMessage) -> anyhow::Result<()> { let msg = msg.serialize(); self.sink.send_all(&mut tokio_stream::once(Ok(msg))).await?; Ok(()) } pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { self.send(PagestreamFeMessage::GetPage(req)).await } } impl PagestreamReceiver { // TODO: maybe make this impl Stream instead for better composability? 
pub async fn recv(&mut self) -> anyhow::Result { let next: Option> = self.stream.next().await; let next: bytes::Bytes = next.unwrap()?; PagestreamBeMessage::deserialize(next) } pub async fn getpage_recv(&mut self) -> anyhow::Result { let next: PagestreamBeMessage = self.recv().await?; match next { PagestreamBeMessage::GetPage(p) => Ok(p), PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e), PagestreamBeMessage::Exists(_) | PagestreamBeMessage::Nblocks(_) | PagestreamBeMessage::DbSize(_) | PagestreamBeMessage::GetSlruSegment(_) => { anyhow::bail!( "unexpected be message kind in response to getpage request: {}", next.kind() ) } #[cfg(feature = "testing")] PagestreamBeMessage::Test(_) => { anyhow::bail!( "unexpected be message kind in response to getpage request: {}", next.kind() ) } } } } ================================================ FILE: pageserver/client_grpc/Cargo.toml ================================================ [package] name = "pageserver_client_grpc" version = "0.1.0" edition.workspace = true license.workspace = true [features] testing = ["pageserver_api/testing"] [dependencies] anyhow.workspace = true arc-swap.workspace = true bytes.workspace = true compute_api.workspace = true futures.workspace = true pageserver_api.workspace = true pageserver_page_api.workspace = true tokio.workspace = true tokio-stream.workspace = true tokio-util.workspace = true tonic.workspace = true tracing.workspace = true utils.workspace = true workspace_hack.workspace = true ================================================ FILE: pageserver/client_grpc/src/client.rs ================================================ use std::collections::HashMap; use std::num::NonZero; use std::pin::pin; use std::sync::Arc; use std::time::{Duration, Instant}; use anyhow::anyhow; use arc_swap::ArcSwap; use futures::stream::FuturesUnordered; use futures::{FutureExt as _, StreamExt as _}; use tonic::codec::CompressionEncoding; use tracing::{debug, instrument}; use 
utils::logging::warn_slow; use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool}; use crate::retry::Retry; use compute_api::spec::PageserverProtocol; use pageserver_page_api as page_api; use pageserver_page_api::GetPageSplitter; use utils::id::{TenantId, TimelineId}; use utils::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize}; /// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up /// when full. /// /// Normal requests are small, and we don't pipeline them, so we can afford a large number of /// streams per connection. /// /// TODO: tune all of these constants, and consider making them configurable. const MAX_CLIENTS_PER_CHANNEL: NonZero = NonZero::new(64).unwrap(); /// Max number of concurrent bulk GetPage streams per channel (i.e. TCP connection). These use a /// dedicated channel pool with a lower client limit, to avoid TCP-level head-of-line blocking and /// transmission delays. This also concentrates large window sizes on a smaller set of /// streams/connections, presumably reducing memory use. const MAX_BULK_CLIENTS_PER_CHANNEL: NonZero = NonZero::new(16).unwrap(); /// The batch size threshold at which a GetPage request will use the bulk stream pool. /// /// The gRPC initial window size is 64 KB. Each page is 8 KB, so let's avoid increasing the window /// size for the normal stream pool, and route requests for >= 5 pages (>32 KB) to the bulk pool. const BULK_THRESHOLD_BATCH_SIZE: usize = 5; /// The overall request call timeout, including retries and pool acquisition. /// TODO: should we retry forever? Should the caller decide? const CALL_TIMEOUT: Duration = Duration::from_secs(60); /// The per-request (retry attempt) timeout, including any lazy connection establishment. const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); /// The initial request retry backoff duration. The first retry does not back off. 
/// TODO: use a different backoff for ResourceExhausted (rate limiting)? Needs server support. const BASE_BACKOFF: Duration = Duration::from_millis(5); /// The maximum request retry backoff duration. const MAX_BACKOFF: Duration = Duration::from_secs(5); /// Threshold and interval for warning about slow operation. const SLOW_THRESHOLD: Duration = Duration::from_secs(3); /// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the /// basic `page_api::Client` gRPC client, and supports: /// /// * Sharded tenants across multiple Pageservers. /// * Pooling of connections, clients, and streams for efficient resource use. /// * Concurrent use by many callers. /// * Internal handling of GetPage bidirectional streams. /// * Automatic retries. /// * Observability. /// /// The client has dedicated connection/client/stream pools per shard, for resource reuse. These /// pools are unbounded: we allow scaling out as many concurrent streams as needed to serve all /// concurrent callers, which mostly eliminates head-of-line blocking. Idle streams are fairly /// cheap: the server task currently uses 26 KB of memory, so we can comfortably fit 100,000 /// concurrent idle streams (2.5 GB memory). The worst case degenerates to the old libpq case with /// one stream per backend, but without the TCP connection overhead. In the common case we expect /// significantly lower stream counts due to stream sharing, driven e.g. by idle backends, LFC hits, /// read coalescing, sharding (backends typically only talk to one shard at a time), etc. /// /// TODO: this client does not support base backups or LSN leases, as these are only used by /// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards. pub struct PageserverClient { /// The tenant ID. tenant_id: TenantId, /// The timeline ID. timeline_id: TimelineId, /// The JWT auth token for this tenant, if any. auth_token: Option, /// The compression to use, if any. 
compression: Option, /// The shards for this tenant. shards: ArcSwap, } impl PageserverClient { /// Creates a new Pageserver client for a given tenant and timeline. Uses the Pageservers given /// in the shard spec, which must be complete and must use gRPC URLs. pub fn new( tenant_id: TenantId, timeline_id: TimelineId, shard_spec: ShardSpec, auth_token: Option, compression: Option, ) -> anyhow::Result { let shards = Shards::new( tenant_id, timeline_id, shard_spec, auth_token.clone(), compression, )?; Ok(Self { tenant_id, timeline_id, auth_token, compression, shards: ArcSwap::new(Arc::new(shards)), }) } /// Updates the shards from the given shard spec. In-flight requests will complete using the /// existing shards, but may retry with the new shards if they fail. /// /// TODO: verify that in-flight requests are allowed to complete, and that the old pools are /// properly spun down and dropped afterwards. pub fn update_shards(&self, shard_spec: ShardSpec) -> anyhow::Result<()> { // Validate the shard spec. We should really use `ArcSwap::rcu` for this, to avoid races // with concurrent updates, but that involves creating a new `Shards` on every attempt, // which spins up a bunch of Tokio tasks and such. These should already be checked elsewhere // in the stack, and if they're violated then we already have problems elsewhere, so a // best-effort but possibly-racy check is okay here. 
let old = self.shards.load_full();
        if shard_spec.count < old.count {
            return Err(anyhow!(
                "can't reduce shard count from {} to {}",
                old.count,
                shard_spec.count
            ));
        }
        if !old.count.is_unsharded() && shard_spec.stripe_size != old.stripe_size {
            return Err(anyhow!(
                "can't change stripe size from {} to {}",
                old.stripe_size.expect("always Some when sharded"),
                shard_spec.stripe_size.expect("always Some when sharded")
            ));
        }

        let shards = Shards::new(
            self.tenant_id,
            self.timeline_id,
            shard_spec,
            self.auth_token.clone(),
            self.compression,
        )?;
        self.shards.store(Arc::new(shards));
        Ok(())
    }

    /// Returns the total size of a database, as # of bytes.
    #[instrument(skip_all, fields(db_oid=%req.db_oid, lsn=%req.read_lsn))]
    pub async fn get_db_size(
        &self,
        req: page_api::GetDbSizeRequest,
    ) -> tonic::Result<page_api::GetDbSizeResponse> {
        debug!("sending request: {req:?}");
        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
            // Relation metadata is only available on shard 0.
            let mut client = self.shards.load_full().get_zero().client().await?;
            Self::with_timeout(REQUEST_TIMEOUT, client.get_db_size(req)).await
        })
        .await?;
        debug!("received response: {resp:?}");
        Ok(resp)
    }

    /// Fetches pages. The `request_id` must be unique across all in-flight requests, and the
    /// `attempt` must be 0 (incremented on retry). Automatically splits requests that straddle
    /// shard boundaries, and assembles the responses.
    ///
    /// Returns `InvalidArgument` if the request has no block numbers or a non-zero attempt.
    ///
    /// Unlike `page_api::Client`, this automatically converts `status_code` into `tonic::Status`
    /// errors. All responses will have `GetPageStatusCode::Ok`.
    //
    // NB: span fields are evaluated before the function body runs, so `blkno` must not index into
    // `block_numbers` directly: an empty request would panic before the validation below could
    // reject it. An empty request records `blkno = 0` and `blks = 0`.
    #[instrument(skip_all, fields(
        req_id = %req.request_id,
        class = %req.request_class,
        rel = %req.rel,
        blkno = %req.block_numbers.first().copied().unwrap_or_default(),
        blks = %req.block_numbers.len(),
        lsn = %req.read_lsn,
    ))]
    pub async fn get_page(
        &self,
        req: page_api::GetPageRequest,
    ) -> tonic::Result<page_api::GetPageResponse> {
        // Make sure we have at least one page.
        if req.block_numbers.is_empty() {
            return Err(tonic::Status::invalid_argument("no block number"));
        }
        // The request attempt must be 0. The client will increment it internally.
        if req.request_id.attempt != 0 {
            return Err(tonic::Status::invalid_argument("request attempt must be 0"));
        }

        debug!("sending request: {req:?}");

        // The shards may change while we're fetching pages. We execute the request using a stable
        // view of the shards (especially important for requests that span shards), but retry the
        // top-level (pre-split) request to pick up shard changes. This can lead to unnecessary
        // retries and re-splits in some cases where requests span shards, but these are expected to
        // be rare.
        //
        // TODO: the gRPC server and client don't yet properly support shard splits. Revisit this
        // once we figure out how to handle these.
        let resp = Self::with_retries(CALL_TIMEOUT, async |attempt| {
            let mut req = req.clone();
            req.request_id.attempt = attempt as u32;
            let shards = self.shards.load_full();
            Self::with_timeout(REQUEST_TIMEOUT, Self::get_page_with_shards(req, &shards)).await
        })
        .await?;

        debug!("received response: {resp:?}");
        Ok(resp)
    }

    /// Fetches pages using the given shards. This uses a stable view of the shards, regardless of
    /// concurrent shard updates. Does not retry internally, but is retried by `get_page()`.
    async fn get_page_with_shards(
        req: page_api::GetPageRequest,
        shards: &Shards,
    ) -> tonic::Result<page_api::GetPageResponse> {
        // Fast path: request is for a single shard.
        if let Some(shard_id) =
            GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size)?
        {
            return Self::get_page_with_shard(req, shards.get(shard_id)?).await;
        }

        // Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and
        // reassemble the responses.
        let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size)?;

        let mut shard_requests = FuturesUnordered::new();
        for (shard_id, shard_req) in splitter.drain_requests() {
            let future = Self::get_page_with_shard(shard_req, shards.get(shard_id)?)
                .map(move |result| result.map(|resp| (shard_id, resp)));
            shard_requests.push(future);
        }

        while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? {
            splitter.add_response(shard_id, shard_response)?;
        }

        Ok(splitter.collect_response()?)
    }

    /// Fetches pages on the given shard. Does not retry internally.
    async fn get_page_with_shard(
        req: page_api::GetPageRequest,
        shard: &Shard,
    ) -> tonic::Result<page_api::GetPageResponse> {
        let mut stream = shard.stream(Self::is_bulk(&req)).await?;
        let resp = stream.send(req.clone()).await?;

        // Convert per-request errors into a tonic::Status.
        if resp.status_code != page_api::GetPageStatusCode::Ok {
            return Err(tonic::Status::new(
                resp.status_code.into(),
                resp.reason.unwrap_or_else(|| String::from("unknown error")),
            ));
        }

        // Check that we received the expected pages.
        if req.rel != resp.rel {
            return Err(tonic::Status::internal(format!(
                "shard {} returned wrong relation, expected {} got {}",
                shard.id, req.rel, resp.rel
            )));
        }
        if !req
            .block_numbers
            .iter()
            .copied()
            .eq(resp.pages.iter().map(|p| p.block_number))
        {
            return Err(tonic::Status::internal(format!(
                "shard {} returned wrong pages, expected {:?} got {:?}",
                shard.id,
                req.block_numbers,
                resp.pages
                    .iter()
                    .map(|page| page.block_number)
                    .collect::<Vec<_>>()
            )));
        }

        Ok(resp)
    }

    /// Returns the size of a relation, as # of blocks.
    #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))]
    pub async fn get_rel_size(
        &self,
        req: page_api::GetRelSizeRequest,
    ) -> tonic::Result<page_api::GetRelSizeResponse> {
        debug!("sending request: {req:?}");
        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
            // Relation metadata is only available on shard 0.
            let mut client = self.shards.load_full().get_zero().client().await?;
            Self::with_timeout(REQUEST_TIMEOUT, client.get_rel_size(req)).await
        })
        .await?;
        debug!("received response: {resp:?}");
        Ok(resp)
    }

    /// Fetches an SLRU segment.
#[instrument(skip_all, fields(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn))] pub async fn get_slru_segment( &self, req: page_api::GetSlruSegmentRequest, ) -> tonic::Result { debug!("sending request: {req:?}"); let resp = Self::with_retries(CALL_TIMEOUT, async |_| { // SLRU segments are only available on shard 0. let mut client = self.shards.load_full().get_zero().client().await?; Self::with_timeout(REQUEST_TIMEOUT, client.get_slru_segment(req)).await }) .await?; debug!("received response: {resp:?}"); Ok(resp) } /// Runs the given async closure with retries up to the given timeout. Only certain gRPC status /// codes are retried, see [`Retry::should_retry`]. Returns `DeadlineExceeded` on timeout. async fn with_retries(timeout: Duration, f: F) -> tonic::Result where F: FnMut(usize) -> O, // pass attempt number, starting at 0 O: Future>, { Retry { timeout: Some(timeout), base_backoff: BASE_BACKOFF, max_backoff: MAX_BACKOFF, } .with(f) .await } /// Runs the given future with a timeout. Returns `DeadlineExceeded` on timeout. async fn with_timeout( timeout: Duration, f: impl Future>, ) -> tonic::Result { let started = Instant::now(); tokio::time::timeout(timeout, f).await.map_err(|_| { tonic::Status::deadline_exceeded(format!( "request timed out after {:.3}s", started.elapsed().as_secs_f64() )) })? } /// Returns true if the request is considered a bulk request and should use the bulk pool. fn is_bulk(req: &page_api::GetPageRequest) -> bool { req.block_numbers.len() >= BULK_THRESHOLD_BATCH_SIZE } } /// Shard specification for a PageserverClient. pub struct ShardSpec { /// Maps shard indices to gRPC URLs. /// /// INVARIANT: every shard 0..count is present, and shard 0 is always present. /// INVARIANT: every URL is valid and uses grpc:// scheme. urls: HashMap, /// The shard count. /// /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention. count: ShardCount, /// The stripe size for these shards. 
/// /// INVARIANT: None for unsharded tenants, Some for sharded. stripe_size: Option, } impl ShardSpec { /// Creates a new shard spec with the given URLs and stripe size. All shards must be given. /// The stripe size must be Some for sharded tenants, or None for unsharded tenants. pub fn new( urls: HashMap, stripe_size: Option, ) -> anyhow::Result { // Compute the shard count. let count = match urls.len() { 0 => return Err(anyhow!("no shards provided")), 1 => ShardCount::new(0), // NB: unsharded tenants use 0, like `ShardIndex::unsharded()` n if n > u8::MAX as usize => return Err(anyhow!("too many shards: {n}")), n => ShardCount::new(n as u8), }; // Validate the stripe size. if stripe_size.is_none() && !count.is_unsharded() { return Err(anyhow!("stripe size must be given for sharded tenants")); } if stripe_size.is_some() && count.is_unsharded() { return Err(anyhow!("stripe size can't be given for unsharded tenants")); } // Validate the shard spec. for (shard_id, url) in &urls { // The shard index must match the computed shard count, even for unsharded tenants. if shard_id.shard_count != count { return Err(anyhow!("invalid shard index {shard_id}, expected {count}")); } // The shard index' number and count must be consistent. if !shard_id.is_unsharded() && shard_id.shard_number.0 >= shard_id.shard_count.0 { return Err(anyhow!("invalid shard index {shard_id}")); } // The above conditions guarantee that we have all shards 0..count: len() matches count, // shard number < count, and numbers are unique (via hashmap). // Validate the URL. if PageserverProtocol::from_connstring(url)? != PageserverProtocol::Grpc { return Err(anyhow!("invalid shard URL {url}: must use gRPC")); } } Ok(Self { urls, count, stripe_size, }) } } /// Tracks the tenant's shards. struct Shards { /// Shards by shard index. /// /// INVARIANT: every shard 0..count is present. /// INVARIANT: shard 0 is always present. by_index: HashMap, /// The shard count. 
/// /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention. count: ShardCount, /// The stripe size. /// /// INVARIANT: None for unsharded tenants, Some for sharded. stripe_size: Option, } impl Shards { /// Creates a new set of shards based on a shard spec. fn new( tenant_id: TenantId, timeline_id: TimelineId, shard_spec: ShardSpec, auth_token: Option, compression: Option, ) -> anyhow::Result { // NB: the shard spec has already been validated when constructed. let mut shards = HashMap::with_capacity(shard_spec.urls.len()); for (shard_id, url) in shard_spec.urls { shards.insert( shard_id, Shard::new( url, tenant_id, timeline_id, shard_id, auth_token.clone(), compression, )?, ); } Ok(Self { by_index: shards, count: shard_spec.count, stripe_size: shard_spec.stripe_size, }) } /// Looks up the given shard. #[allow(clippy::result_large_err)] // TODO: check perf impact fn get(&self, shard_id: ShardIndex) -> tonic::Result<&Shard> { self.by_index .get(&shard_id) .ok_or_else(|| tonic::Status::not_found(format!("unknown shard {shard_id}"))) } /// Returns shard 0. fn get_zero(&self) -> &Shard { self.get(ShardIndex::new(ShardNumber(0), self.count)) .expect("always present") } } /// A single shard. Has dedicated resource pools with the following structure: /// /// * Channel pool: MAX_CLIENTS_PER_CHANNEL. /// * Client pool: unbounded. /// * Stream pool: unbounded. /// * Bulk channel pool: MAX_BULK_CLIENTS_PER_CHANNEL. /// * Bulk client pool: unbounded. /// * Bulk stream pool: unbounded. /// /// We use a separate bulk channel pool with a lower concurrency limit for large batch requests. /// This avoids TCP-level head-of-line blocking, and also concentrates large window sizes on a /// smaller set of streams/connections, which presumably reduces memory use. Neither of these pools /// are bounded, nor do they pipeline requests, so the latency characteristics should be mostly /// similar (except for TCP transmission time). 
/// /// TODO: since we never use bounded pools, we could consider removing the pool limiters. However, /// the code is fairly trivial, so we may as well keep them around for now in case we need them. struct Shard { /// The shard ID. id: ShardIndex, /// Unary gRPC client pool. client_pool: Arc, /// GetPage stream pool. stream_pool: Arc, /// GetPage stream pool for bulk requests. bulk_stream_pool: Arc, } impl Shard { /// Creates a new shard. It has its own dedicated resource pools. fn new( url: String, tenant_id: TenantId, timeline_id: TimelineId, shard_id: ShardIndex, auth_token: Option, compression: Option, ) -> anyhow::Result { // Shard pools for unary requests and non-bulk GetPage requests. let client_pool = ClientPool::new( ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?, tenant_id, timeline_id, shard_id, auth_token.clone(), compression, None, // unbounded ); let stream_pool = StreamPool::new(client_pool.clone(), None); // unbounded // Bulk GetPage stream pool for large batches (prefetches, sequential scans, vacuum, etc.). let bulk_stream_pool = StreamPool::new( ClientPool::new( ChannelPool::new(url, MAX_BULK_CLIENTS_PER_CHANNEL)?, tenant_id, timeline_id, shard_id, auth_token, compression, None, // unbounded, ), None, // unbounded ); Ok(Self { id: shard_id, client_pool, stream_pool, bulk_stream_pool, }) } /// Returns a pooled client for this shard. #[instrument(skip_all)] async fn client(&self) -> tonic::Result { warn_slow( "client pool acquisition", SLOW_THRESHOLD, pin!(self.client_pool.get()), ) .await } /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk pool. 
#[instrument(skip_all, fields(bulk))]
    async fn stream(&self, bulk: bool) -> tonic::Result<StreamGuard> {
        // Bulk requests go to the dedicated bulk pool, everything else to the regular one.
        let pool = if bulk {
            &self.bulk_stream_pool
        } else {
            &self.stream_pool
        };
        let acquire = pin!(pool.get());
        warn_slow("stream pool acquisition", SLOW_THRESHOLD, acquire).await
    }
}

================================================
FILE: pageserver/client_grpc/src/lib.rs
================================================
mod client;
mod pool;
mod retry;

pub use client::{PageserverClient, ShardSpec};

================================================
FILE: pageserver/client_grpc/src/pool.rs
================================================
//! This module provides various Pageserver gRPC client resource pools.
//!
//! These pools are designed to reuse gRPC resources (connections, clients, and streams) across
//! multiple concurrent callers (i.e. Postgres backends). This avoids the resource cost and latency
//! of creating dedicated TCP connections and server tasks for every Postgres backend.
//!
//! Each resource has its own, nested pool. The pools are custom-built for the properties of each
//! resource -- they are different enough that a generic pool isn't suitable.
//!
//! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients
//!   can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a
//!   per-channel client limit. Channels are closed immediately when empty, and indirectly rely on
//!   client/stream idle timeouts.
//!
//! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared)
//!   channel from the ChannelPool for the client's lifetime. A client can only be acquired by a
//!   single caller at a time, and is returned to the pool when dropped. Idle clients are removed
//!   from the pool after a while to free up resources.
//!
//! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the
//!   ClientPool for the stream's lifetime.
A stream can only be acquired by a single caller at a //! time, and is returned to the pool when dropped. Idle streams are removed from the pool after //! a while to free up resources. //! //! The stream only supports sending a single, synchronous request at a time, and does not support //! pipelining multiple requests from different callers onto the same stream -- instead, we scale //! out concurrent streams to improve throughput. There are many reasons for this design choice: //! //! * It (mostly) eliminates head-of-line blocking. A single stream is processed sequentially by //! a single server task, which may block e.g. on layer downloads, LSN waits, etc. //! //! * Cancellation becomes trivial, by closing the stream. Otherwise, if a caller goes away //! (e.g. because of a timeout), the request would still be processed by the server and block //! requests behind it in the stream. It might even block its own timeout retry. //! //! * Stream scheduling becomes significantly simpler and cheaper. //! //! * Individual callers can still use client-side batching for pipelining. //! //! * Idle streams are cheap. Benchmarks show that an idle GetPage stream takes up about 26 KB //! per stream (2.5 GB for 100,000 streams), so we can afford to scale out. //! //! Each channel corresponds to one TCP connection. Each client unary request and each stream //! corresponds to one HTTP/2 stream and server task. //! //! TODO: error handling (including custom error types). //! TODO: observability. 
use std::collections::BTreeMap; use std::num::NonZero; use std::ops::{Deref, DerefMut}; use std::pin::Pin; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex, Weak}; use std::time::{Duration, Instant}; use futures::{Stream, StreamExt as _}; use tokio::sync::{OwnedSemaphorePermit, Semaphore, watch}; use tokio_stream::wrappers::WatchStream; use tokio_util::sync::CancellationToken; use tonic::codec::CompressionEncoding; use tonic::transport::{Channel, Endpoint}; use pageserver_page_api as page_api; use utils::id::{TenantId, TimelineId}; use utils::shard::ShardIndex; /// Reap clients/streams that have been idle for this long. Channels are reaped immediately when /// empty, and indirectly rely on the client/stream idle timeouts. /// /// A stream's client will be reaped after 2x the idle threshold (first stream the client), but /// that's okay -- if the stream closes abruptly (e.g. due to timeout or cancellation), we want to /// keep its client around in the pool for a while. const REAP_IDLE_THRESHOLD: Duration = match cfg!(any(test, feature = "testing")) { false => Duration::from_secs(180), true => Duration::from_secs(1), // exercise reaping in tests }; /// Reap idle resources with this interval. const REAP_IDLE_INTERVAL: Duration = match cfg!(any(test, feature = "testing")) { false => Duration::from_secs(10), true => Duration::from_secs(1), // exercise reaping in tests }; /// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2 /// stream multiplexing), up to `clients_per_channel` -- a new channel will be spun up beyond this. /// The pool does not limit the number of channels, and instead relies on `ClientPool` or /// `StreamPool` to limit the number of concurrent clients. /// /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads. /// /// TODO: consider prewarming a set of channels, to avoid initial connection latency. 
/// TODO: consider adding a circuit breaker for errors and fail fast. pub struct ChannelPool { /// Pageserver endpoint to connect to. endpoint: Endpoint, /// Max number of clients per channel. Beyond this, a new channel will be created. max_clients_per_channel: NonZero, /// Open channels. channels: Mutex>, /// Channel ID generator. next_channel_id: AtomicUsize, } type ChannelID = usize; struct ChannelEntry { /// The gRPC channel (i.e. TCP connection). Shared by multiple clients. channel: Channel, /// Number of clients using this channel. clients: usize, } impl ChannelPool { /// Creates a new channel pool for the given Pageserver endpoint. pub fn new(endpoint: E, max_clients_per_channel: NonZero) -> anyhow::Result> where E: TryInto + Send + Sync + 'static, >::Error: std::error::Error + Send + Sync, { Ok(Arc::new(Self { endpoint: endpoint.try_into()?, max_clients_per_channel, channels: Mutex::default(), next_channel_id: AtomicUsize::default(), })) } /// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel. /// /// This never blocks (except for mutex acquisition). The channel is connected lazily on first /// use, and the `ChannelPool` does not have a channel limit. Channels will be re-established /// automatically on failure (TODO: verify). /// /// Callers should not clone the returned channel, and must hold onto the returned guard as long /// as the channel is in use. It is unfortunately not possible to enforce this: the Protobuf /// client requires an owned `Channel` and we don't have access to the channel's internal /// refcount. /// /// This is not performance-sensitive. It is only called when creating a new client, and clients /// are pooled and reused by `ClientPool`. The total number of channels will also be small. O(n) /// performance is therefore okay. pub fn get(self: &Arc) -> ChannelGuard { let mut channels = self.channels.lock().unwrap(); // Try to find an existing channel with available capacity. 
We check entries in BTreeMap // order, to fill up the lower-ordered channels first. The client/stream pools also prefer // clients with lower-ordered channel IDs first. This will cluster clients in lower-ordered // channels, and free up higher-ordered channels such that they can be reaped. for (&id, entry) in channels.iter_mut() { assert!( entry.clients <= self.max_clients_per_channel.get(), "channel overflow" ); assert_ne!(entry.clients, 0, "empty channel not reaped"); if entry.clients < self.max_clients_per_channel.get() { entry.clients += 1; return ChannelGuard { pool: Arc::downgrade(self), id, channel: Some(entry.channel.clone()), }; } } // Create a new channel. We connect lazily on first use, such that we don't block here and // other clients can join onto the same channel while it's connecting. let channel = self.endpoint.connect_lazy(); let id = self.next_channel_id.fetch_add(1, Ordering::Relaxed); let entry = ChannelEntry { channel: channel.clone(), clients: 1, // account for the guard below }; channels.insert(id, entry); ChannelGuard { pool: Arc::downgrade(self), id, channel: Some(channel), } } } /// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`, /// since the gRPC client requires an owned `Channel`. pub struct ChannelGuard { pool: Weak, id: ChannelID, channel: Option, } impl ChannelGuard { /// Returns the inner owned channel. Panics if called more than once. The caller must hold onto /// the guard as long as the channel is in use, and should not clone it. pub fn take(&mut self) -> Channel { self.channel.take().expect("channel already taken") } } /// Returns the channel to the pool. The channel is closed when empty. 
impl Drop for ChannelGuard { fn drop(&mut self) { let Some(pool) = self.pool.upgrade() else { return; // pool was dropped }; let mut channels = pool.channels.lock().unwrap(); let entry = channels.get_mut(&self.id).expect("unknown channel"); assert!(entry.clients > 0, "channel underflow"); entry.clients -= 1; // Reap empty channels immediately. if entry.clients == 0 { channels.remove(&self.id); } } } /// A pool of gRPC clients for a single tenant shard. Each client acquires a channel from the inner /// `ChannelPool`. A client is only given out to single caller at a time. The pool limits the total /// number of concurrent clients to `max_clients` via semaphore. /// /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads. pub struct ClientPool { /// Tenant ID. tenant_id: TenantId, /// Timeline ID. timeline_id: TimelineId, /// Shard ID. shard_id: ShardIndex, /// Authentication token, if any. auth_token: Option, /// Compression to use. compression: Option, /// Channel pool to acquire channels from. channel_pool: Arc, /// Limits the max number of concurrent clients for this pool. None if the pool is unbounded. limiter: Option>, /// Idle pooled clients. Acquired clients are removed from here and returned on drop. /// /// The first client in the map will be acquired next. The map is sorted by client ID, which in /// turn is sorted by its channel ID, such that we prefer acquiring idle clients from /// lower-ordered channels. This allows us to free up and reap higher-ordered channels. idle: Mutex>, /// Reaps idle clients. idle_reaper: Reaper, /// Unique client ID generator. next_client_id: AtomicUsize, } type ClientID = (ChannelID, usize); struct ClientEntry { /// The pooled gRPC client. client: page_api::Client, /// The channel guard for the channel used by the client. channel_guard: ChannelGuard, /// The client has been idle since this time. 
All clients in `ClientPool::idle` are idle by /// definition, so this is the time when it was added back to the pool. idle_since: Instant, } impl ClientPool { /// Creates a new client pool for the given tenant shard. Channels are acquired from the given /// `ChannelPool`, which must point to a Pageserver that hosts the tenant shard. Allows up to /// `max_clients` concurrent clients, or unbounded if None. pub fn new( channel_pool: Arc, tenant_id: TenantId, timeline_id: TimelineId, shard_id: ShardIndex, auth_token: Option, compression: Option, max_clients: Option>, ) -> Arc { let pool = Arc::new(Self { tenant_id, timeline_id, shard_id, auth_token, compression, channel_pool, idle: Mutex::default(), idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL), limiter: max_clients.map(|max| Arc::new(Semaphore::new(max.get()))), next_client_id: AtomicUsize::default(), }); pool.idle_reaper.spawn(&pool); pool } /// Gets a client from the pool, or creates a new one if necessary. Connections are established /// lazily and do not block, but this call can block if the pool is at `max_clients`. The client /// is returned to the pool when the guard is dropped. /// /// This is moderately performance-sensitive. It is called for every unary request, but these /// establish a new gRPC stream per request so they're already expensive. GetPage requests use /// the `StreamPool` instead. pub async fn get(self: &Arc) -> tonic::Result { // Acquire a permit if the pool is bounded. let mut permit = None; if let Some(limiter) = self.limiter.clone() { permit = Some(limiter.acquire_owned().await.expect("never closed")); } // Fast path: acquire an idle client from the pool. if let Some((id, entry)) = self.idle.lock().unwrap().pop_first() { return Ok(ClientGuard { pool: Arc::downgrade(self), id, client: Some(entry.client), channel_guard: Some(entry.channel_guard), permit, }); } // Construct a new client. 
let mut channel_guard = self.channel_pool.get(); let client = page_api::Client::new( channel_guard.take(), self.tenant_id, self.timeline_id, self.shard_id, self.auth_token.clone(), self.compression, ) .map_err(|err| tonic::Status::internal(format!("failed to create client: {err}")))?; Ok(ClientGuard { pool: Arc::downgrade(self), id: ( channel_guard.id, self.next_client_id.fetch_add(1, Ordering::Relaxed), ), client: Some(client), channel_guard: Some(channel_guard), permit, }) } } impl Reapable for ClientPool { /// Reaps clients that have been idle since before the cutoff. fn reap_idle(&self, cutoff: Instant) { self.idle .lock() .unwrap() .retain(|_, entry| entry.idle_since >= cutoff) } } /// A client acquired from the pool. The inner client can be accessed via Deref. The client is /// returned to the pool when dropped. pub struct ClientGuard { pool: Weak, id: ClientID, client: Option, // Some until dropped channel_guard: Option, // Some until dropped permit: Option, // None if pool is unbounded } impl Deref for ClientGuard { type Target = page_api::Client; fn deref(&self) -> &Self::Target { self.client.as_ref().expect("not dropped") } } impl DerefMut for ClientGuard { fn deref_mut(&mut self) -> &mut Self::Target { self.client.as_mut().expect("not dropped") } } /// Returns the client to the pool. impl Drop for ClientGuard { fn drop(&mut self) { let Some(pool) = self.pool.upgrade() else { return; // pool was dropped }; let entry = ClientEntry { client: self.client.take().expect("dropped once"), channel_guard: self.channel_guard.take().expect("dropped once"), idle_since: Instant::now(), }; pool.idle.lock().unwrap().insert(self.id, entry); _ = self.permit; // returned on drop, referenced for visibility } } /// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream /// acquires a client from the inner `ClientPool` for the stream's lifetime. 
/// /// Individual streams only send a single request at a time, and do not pipeline multiple callers /// onto the same stream. Instead, we scale out the number of concurrent streams. This is primarily /// to eliminate head-of-line blocking. See the module documentation for more details. /// /// TODO: consider making this generic over request and response types; not currently needed. pub struct StreamPool { /// The client pool to acquire clients from. Must be unbounded. client_pool: Arc, /// Idle pooled streams. Acquired streams are removed from here and returned on drop. /// /// The first stream in the map will be acquired next. The map is sorted by stream ID, which is /// equivalent to the client ID and in turn sorted by its channel ID. This way we prefer /// acquiring idle streams from lower-ordered channels, which allows us to free up and reap /// higher-ordered channels. idle: Mutex>, /// Limits the max number of concurrent streams. None if the pool is unbounded. limiter: Option>, /// Reaps idle streams. idle_reaper: Reaper, } /// The stream ID. Reuses the inner client ID. type StreamID = ClientID; /// A pooled stream. struct StreamEntry { /// The bidirectional stream. stream: BiStream, /// The time when this stream was last used, i.e. when it was put back into `StreamPool::idle`. idle_since: Instant, } /// A bidirectional GetPage stream and its client. Can send requests and receive responses. struct BiStream { /// The owning client. Holds onto the channel slot while the stream is alive. client: ClientGuard, /// Stream for sending requests. Uses a watch channel, so it can only send a single request at a /// time, and the caller must await the response before sending another request. This is /// enforced by `StreamGuard::send`. sender: watch::Sender, /// Stream for receiving responses. receiver: Pin> + Send>>, } impl StreamPool { /// Creates a new stream pool, using the given client pool. It will use up to `max_streams` /// concurrent streams. 
///
    /// The client pool must be unbounded. The stream pool will enforce its own limits, and because
    /// streams are long-lived they can cause persistent starvation if they exhaust the client pool.
    /// The stream pool should generally have its own dedicated client pool (but it can share a
    /// channel pool with others since these are always unbounded).
    ///
    /// Panics if `client_pool` has a limiter. Also spawns the idle reaper task for the new pool.
    // NOTE(review): the type arguments of `Arc` and `Option` in the signatures below appear to
    // have been stripped from this copy of the file; confirm against the upstream source.
    pub fn new(client_pool: Arc, max_streams: Option>) -> Arc {
        assert!(client_pool.limiter.is_none(), "bounded client pool");
        let pool = Arc::new(Self {
            client_pool,
            idle: Mutex::default(),
            limiter: max_streams.map(|max_streams| Arc::new(Semaphore::new(max_streams.get()))),
            idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
        });
        pool.idle_reaper.spawn(&pool);
        pool
    }

    /// Acquires an available stream from the pool, or spins up a new stream if all streams are
    /// full. Returns a guard that can be used to send requests and await the responses. Blocks if
    /// the pool is full.
    ///
    /// This is very performance-sensitive, as it is on the GetPage hot path.
    ///
    /// TODO: is a `Mutex` performant enough? Will it become too contended? We can't
    /// trivially use e.g. DashMap or sharding, because we want to pop lower-ordered streams first
    /// to free up higher-ordered channels.
    pub async fn get(self: &Arc) -> tonic::Result {
        // Acquire a permit if the pool is bounded.
        let mut permit = None;
        if let Some(limiter) = self.limiter.clone() {
            permit = Some(limiter.acquire_owned().await.expect("never closed"));
        }

        // Fast path: acquire an idle stream from the pool. `pop_first` takes the entry with the
        // lowest stream ID.
        if let Some((_, entry)) = self.idle.lock().unwrap().pop_first() {
            return Ok(StreamGuard {
                pool: Arc::downgrade(self),
                stream: Some(entry.stream),
                can_reuse: true,
                permit,
            });
        }

        // Spin up a new stream. Uses a watch channel to send a single request at a time, since
        // `StreamGuard::send` enforces this anyway and it avoids unnecessary channel overhead.
// (Continuation of `StreamPool::get`: slow path that opens a new stream on a freshly acquired client.)
        let mut client = self.client_pool.get().await?;

        let (req_tx, req_rx) = watch::channel(page_api::GetPageRequest::default());
        let req_stream = WatchStream::from_changes(req_rx);
        let resp_stream = client.get_pages(req_stream).await?;

        Ok(StreamGuard {
            pool: Arc::downgrade(self),
            stream: Some(BiStream {
                client,
                sender: req_tx,
                receiver: Box::pin(resp_stream),
            }),
            can_reuse: true,
            permit,
        })
    }
}

impl Reapable for StreamPool {
    /// Reaps streams that have been idle since before the cutoff.
    ///
    /// Entries whose `idle_since` is at or after `cutoff` are retained; all others are removed
    /// from the idle map.
    fn reap_idle(&self, cutoff: Instant) {
        self.idle
            .lock()
            .unwrap()
            .retain(|_, entry| entry.idle_since >= cutoff);
    }
}

/// A stream acquired from the pool. Returned to the pool when dropped, unless there are still
/// in-flight requests on the stream, or the stream failed.
// NOTE(review): the type arguments of `Weak`, `Option` and `tonic::Result` below appear to have
// been stripped from this copy of the file; confirm against the upstream source.
pub struct StreamGuard {
    pool: Weak,
    stream: Option,  // Some until dropped
    can_reuse: bool, // returned to pool if true
    permit: Option,  // None if pool is unbounded
}

impl StreamGuard {
    /// Sends a request on the stream and awaits the response. If the future is dropped before it
    /// resolves (e.g. due to a timeout or cancellation), the stream will be closed to cancel the
    /// request and is not returned to the pool. The same is true if the stream errors, in which
    /// case the caller can't send further requests on the stream.
    ///
    /// We only support sending a single request at a time, to eliminate head-of-line blocking. See
    /// module documentation for details.
    ///
    /// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status`
    /// to avoid tearing down the stream for per-request errors. Callers must check this.
    pub async fn send(
        &mut self,
        req: page_api::GetPageRequest,
    ) -> tonic::Result {
        let req_id = req.request_id;
        let stream = self.stream.as_mut().expect("not dropped");

        // Mark the stream as not reusable while the request is in flight. We can't return the
        // stream to the pool until we receive the response, to avoid head-of-line blocking and
        // stale responses.
// Failed streams can't be reused either.
        if !self.can_reuse {
            return Err(tonic::Status::internal("stream can't be reused"));
        }
        // Cleared here and only set again on success, so an early return (or a dropped future)
        // leaves the guard marked as non-reusable.
        self.can_reuse = false;

        // Send the request and receive the response.
        //
        // NB: this uses a watch channel, so it's unsafe to change this code to pipeline requests.
        stream
            .sender
            .send(req)
            .map_err(|_| tonic::Status::unavailable("stream closed"))?;

        // The first `?` handles the stream ending, the second a per-item error status.
        let resp = stream
            .receiver
            .next()
            .await
            .ok_or_else(|| tonic::Status::unavailable("stream closed"))??;

        if resp.request_id != req_id {
            return Err(tonic::Status::internal(format!(
                "response ID {} does not match request ID {}",
                resp.request_id, req_id
            )));
        }

        // Success, mark the stream as reusable.
        self.can_reuse = true;

        Ok(resp)
    }
}

/// Returns the stream to the pool's idle map if it is still reusable.
impl Drop for StreamGuard {
    fn drop(&mut self) {
        let Some(pool) = self.pool.upgrade() else {
            return; // pool was dropped
        };

        // If the stream isn't reusable, it can't be returned to the pool.
        if !self.can_reuse {
            return;
        }

        // Place the idle stream back into the pool, keyed by its client's ID.
        let entry = StreamEntry {
            stream: self.stream.take().expect("dropped once"),
            idle_since: Instant::now(),
        };
        pool.idle
            .lock()
            .unwrap()
            .insert(entry.stream.client.id, entry);

        _ = self.permit; // returned on drop, referenced for visibility
    }
}

/// Periodically reaps idle resources from a pool.
struct Reaper {
    /// The task check interval.
    interval: Duration,
    /// The threshold for reaping idle resources.
    threshold: Duration,
    /// Cancels the reaper task. Cancelled when the reaper is dropped.
    cancel: CancellationToken,
}

impl Reaper {
    /// Creates a new reaper.
    ///
    /// Note the argument order: `threshold` first, then `interval`.
    pub fn new(threshold: Duration, interval: Duration) -> Self {
        Self {
            cancel: CancellationToken::new(),
            threshold,
            interval,
        }
    }

    /// Spawns a task to periodically reap idle resources from the given task pool. The task is
    /// cancelled when the reaper is dropped.
    // NOTE(review): the generic parameter list of `spawn` and the type argument of `Arc` appear
    // to have been stripped from this copy of the file; confirm against the upstream source.
    pub fn spawn(&self, pool: &Arc) {
        // NB: hold a weak pool reference, otherwise the task will prevent dropping the pool.
// (Continuation of `Reaper::spawn`.)
        let pool = Arc::downgrade(pool);
        let cancel = self.cancel.clone();
        let (interval, threshold) = (self.interval, self.threshold);

        tokio::spawn(async move {
            loop {
                tokio::select! {
                    _ = tokio::time::sleep(interval) => {
                        let Some(pool) = pool.upgrade() else {
                            return; // pool was dropped
                        };
                        // NOTE(review): `Instant - Duration` may panic if the result is not
                        // representable; confirm whether `checked_sub` is warranted here.
                        pool.reap_idle(Instant::now() - threshold);
                    }

                    _ = cancel.cancelled() => return,
                }
            }
        });
    }
}

impl Drop for Reaper {
    fn drop(&mut self) {
        self.cancel.cancel(); // cancel reaper task
    }
}

/// A reapable resource pool.
trait Reapable: Send + Sync + 'static {
    /// Reaps resources that have been idle since before the given cutoff.
    fn reap_idle(&self, cutoff: Instant);
}

================================================
FILE: pageserver/client_grpc/src/retry.rs
================================================
use std::time::Duration;

use futures::future::pending;
use tokio::time::Instant;
use tracing::{error, info, warn};

use utils::backoff::exponential_backoff_duration;

/// A retry handler for Pageserver gRPC requests.
///
/// This is used instead of backoff::retry for better control and observability.
// NOTE(review): the type argument of `Option` on `timeout`, and the generic parameters of `with`
// below, appear to have been stripped from this copy of the file; confirm against upstream.
pub struct Retry {
    /// Timeout across all retry attempts. If None, retries forever.
    pub timeout: Option,
    /// The initial backoff duration. The first retry does not use a backoff.
    pub base_backoff: Duration,
    /// The maximum backoff duration.
    pub max_backoff: Duration,
}

impl Retry {
    /// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors,
    /// using the current tracing span for context.
    ///
    /// Only certain gRPC status codes are retried, see [`Self::should_retry`].
    pub async fn with(&self, mut f: F) -> tonic::Result
    where
        F: FnMut(usize) -> O, // pass attempt number, starting at 0
        O: Future>,
    {
        let started = Instant::now();
        // A single deadline covers all attempts, including backoff sleeps.
        let deadline = self.timeout.map(|timeout| started + timeout);
        let mut last_error = None;
        let mut retries = 0;
        loop {
            // Set up a future to wait for the backoff, if any, and run the closure.
// (Continuation of the retry loop in `Retry::with`.)
            let backoff_and_try = async {
                // NB: sleep() always sleeps 1ms, even when given a 0 argument. See:
                // https://github.com/tokio-rs/tokio/issues/6866
                if let Some(backoff) = self.backoff_duration(retries) {
                    tokio::time::sleep(backoff).await;
                }

                f(retries).await
            };

            // Set up a future for the timeout, if any. Without a deadline this never resolves.
            let timeout = async {
                match deadline {
                    Some(deadline) => tokio::time::sleep_until(deadline).await,
                    None => pending().await,
                }
            };

            // Wait for the backoff and request, or bail out if the timeout is exceeded.
            let result = tokio::select! {
                result = backoff_and_try => result,

                _ = timeout => {
                    // Prefer the last error seen from an attempt; only synthesize a
                    // deadline-exceeded status if no attempt has failed yet.
                    let last_error = last_error.unwrap_or_else(|| {
                        tonic::Status::deadline_exceeded(format!(
                            "request timed out after {:.3}s",
                            started.elapsed().as_secs_f64()
                        ))
                    });
                    error!(
                        "giving up after {:.3}s and {retries} retries, last error {:?}: {}",
                        started.elapsed().as_secs_f64(), last_error.code(), last_error.message(),
                    );
                    return Err(last_error);
                }
            };

            match result {
                // Success, return the result.
                Ok(result) => {
                    if retries > 0 {
                        info!(
                            "request succeeded after {retries} retries in {:.3}s",
                            started.elapsed().as_secs_f64(),
                        );
                    }

                    return Ok(result);
                }

                // Error, retry or bail out.
                Err(status) => {
                    let (code, message) = (status.code(), status.message());
                    let attempt = retries + 1;

                    if !Self::should_retry(code) {
                        // NB: include the attempt here too. This isn't necessarily the first
                        // attempt, because the error may change between attempts.
                        error!(
                            "request failed with {code:?}: {message}, not retrying (attempt {attempt})"
                        );
                        return Err(status);
                    }

                    warn!("request failed with {code:?}: {message}, retrying (attempt {attempt})");

                    retries += 1;
                    last_error = Some(status);
                }
            }
        }
    }

    /// Returns the backoff duration for the given retry attempt, or None for no backoff. The first
    /// attempt and first retry never backs off, so this returns None for 0 and 1 retries.
// NOTE(review): the type argument of `Option` in the signature below appears to have been
// stripped from this copy of the file; confirm against the upstream source.
    fn backoff_duration(&self, retries: usize) -> Option {
        let backoff = exponential_backoff_duration(
            (retries as u32).saturating_sub(1), // first retry does not back off
            self.base_backoff.as_secs_f64(),
            self.max_backoff.as_secs_f64(),
        );
        // A zero duration is reported as "no backoff".
        (!backoff.is_zero()).then_some(backoff)
    }

    /// Returns true if the given status code should be retried.
    ///
    /// Panics if given `tonic::Code::Ok`, which is not an error status.
    fn should_retry(code: tonic::Code) -> bool {
        match code {
            tonic::Code::Ok => panic!("unexpected Ok status code"),

            // These codes are transient, so retry them.
            tonic::Code::Aborted => true,
            tonic::Code::Cancelled => true,
            tonic::Code::DeadlineExceeded => true, // maybe transient slowness
            tonic::Code::ResourceExhausted => true,
            tonic::Code::Unavailable => true,

            // The following codes will likely continue to fail, so don't retry.
            tonic::Code::AlreadyExists => false,
            tonic::Code::DataLoss => false,
            tonic::Code::FailedPrecondition => false,
            // NB: don't retry Internal. It is intended for serious errors such as invariant
            // violations, and is also used for client-side invariant checks that would otherwise
            // result in retry loops.
            tonic::Code::Internal => false,
            tonic::Code::InvalidArgument => false,
            tonic::Code::NotFound => false,
            tonic::Code::OutOfRange => false,
            tonic::Code::PermissionDenied => false,
            tonic::Code::Unauthenticated => false,
            tonic::Code::Unimplemented => false,
            tonic::Code::Unknown => false,
        }
    }
}

================================================
FILE: pageserver/compaction/Cargo.toml
================================================
[package]
name = "pageserver_compaction"
version = "0.1.0"
edition.workspace = true
license.workspace = true

[features]
default = []

[dependencies]
anyhow.workspace = true
async-stream.workspace = true
clap = { workspace = true, features = ["string"] }
futures.workspace = true
itertools.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
pin-project-lite.workspace = true
rand.workspace = true
svg_fmt.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
tracing.workspace = true
tracing-subscriber.workspace = true
utils.workspace = true
workspace_hack.workspace = true

[dev-dependencies]
criterion.workspace = true
hex-literal.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }

================================================
FILE: pageserver/compaction/TODO.md
================================================
# TODO

- If the key space can be perfectly partitioned at some key, perform planning on each
  partition separately. For example, if we are compacting a level with layers like this:

  ```
                 :
    +--+ +----+  :  +------+
    |  | |    |  :  |      |
    +--+ +----+  :  +------+
                 :
    +-----+ +-+  :  +--------+
    |     | | |  :  |        |
    +-----+ +-+  :  +--------+
                 :
  ```

  At the dotted line, there is a natural split in the key space, such that all layers are
  either on the left or the right of it. We can compact the partitions separately. We could
  choose to create image layers for one partition but not the other one, for example.
- All the layers don't have to be exactly the same size; we can choose to cut a layer short or
  stretch it a little larger than the target size, if it helps the overall system. We can help
  perfect partitions (see previous bullet point) to happen more frequently, by choosing the cut
  points wisely. For example, try to cut layers at boundaries of underlying image layers. And
  "snap to grid", i.e. don't cut layers at any key, but e.g. only when key % 10000 = 0.

- Avoid rewriting layers when we'd just create an identical layer to an input layer.

- Parallelism. The code is already split up into planning and execution, so that we first split
  up the compaction work into "Jobs", and then execute them. It would be straightforward to
  execute multiple jobs in parallel.

- Materialize extra pages in delta layers during compaction. This would reduce read
  amplification. There has been the idea of partial image layers. Materializing extra pages in
  the delta layers achieves the same goal, without introducing a new concept.

## Simulator

- Expand the simulator for more workloads

- Automate a test suite that runs the simulator with different workloads and spits out a table
  of results

- Model read amplification

- More sanity checking. One idea is to keep a reference count of each MockRecord, i.e. use
  `Arc<MockRecord>` instead of plain MockRecord, and panic if a MockRecord that is newer than
  PITR horizon is completely dropped. That would indicate that the record was lost.
================================================ FILE: pageserver/compaction/src/bin/compaction-simulator.rs ================================================ use std::io::Write; use std::path::{Path, PathBuf}; use std::sync::OnceLock; use clap::{Parser, Subcommand}; use pageserver_compaction::helpers::PAGE_SZ; use pageserver_compaction::simulator::MockTimeline; use rand::Rng; use utils::project_git_version; project_git_version!(GIT_VERSION); #[derive(Parser)] #[command( version = GIT_VERSION, about = "Neon Pageserver compaction simulator", long_about = "A developer tool to visualize and test compaction" )] #[command(propagate_version = true)] struct CliOpts { #[command(subcommand)] command: Commands, } #[derive(Subcommand)] enum Commands { RunSuite, Simulate(SimulateCmd), } #[derive(Clone, clap::ValueEnum)] enum Distribution { Uniform, HotCold, } /// Read and update pageserver metadata file #[derive(Parser)] struct SimulateCmd { distribution: Distribution, /// Number of records to digest num_records: u64, /// Record length record_len: u64, // Logical database size in MB logical_size: u64, } async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> { let mut executor = MockTimeline::new(); // Convert the logical size in MB into a key range. 
let key_range = 0..((cmd.logical_size * 1024 * 1024) / PAGE_SZ); //let key_range = u64::MIN..u64::MAX; println!( "starting simulation with key range {:016X}-{:016X}", key_range.start, key_range.end ); // helper function to print progress indicator let print_progress = |i| -> anyhow::Result<()> { if i == 0 || (i + 1) % 10000 == 0 || i == cmd.num_records - 1 { print!( "\ringested {} / {} records, {} MiB / {} MiB...", i + 1, cmd.num_records, (i + 1) * cmd.record_len / (1_000_000), cmd.num_records * cmd.record_len / (1_000_000), ); std::io::stdout().flush()?; } Ok(()) }; match cmd.distribution { Distribution::Uniform => { for i in 0..cmd.num_records { executor.ingest_uniform(1, cmd.record_len, &key_range)?; executor.compact_if_needed().await?; print_progress(i)?; } } Distribution::HotCold => { let splitpoint = key_range.start + (key_range.end - key_range.start) / 10; let hot_key_range = 0..splitpoint; let cold_key_range = splitpoint..key_range.end; for i in 0..cmd.num_records { let chosen_range = if rand::rng().random_bool(0.9) { &hot_key_range } else { &cold_key_range }; executor.ingest_uniform(1, cmd.record_len, chosen_range)?; executor.compact_if_needed().await?; print_progress(i)?; } } } println!("done!"); executor.flush_l0(); executor.compact_if_needed().await?; let stats = executor.stats()?; // Print the stats to stdout, and also to a file print!("{stats}"); std::fs::write(results_path.join("stats.txt"), stats)?; let animation_path = results_path.join("compaction-animation.html"); executor.draw_history(std::fs::File::create(&animation_path)?)?; println!( "animation: file://{}", animation_path.canonicalize()?.display() ); Ok(()) } async fn run_suite_cmd(results_path: &Path, workload: &SimulateCmd) -> anyhow::Result<()> { std::fs::create_dir(results_path)?; set_log_file(File::create(results_path.join("log"))?); let result = simulate(workload, results_path).await; set_log_stdout(); result } async fn run_suite() -> anyhow::Result<()> { let top_results_path = 
PathBuf::from(format!( "compaction-suite-results.{}", std::time::SystemTime::UNIX_EPOCH.elapsed()?.as_secs() )); std::fs::create_dir(&top_results_path)?; let workload = SimulateCmd { distribution: Distribution::Uniform, // Generate 20 GB of WAL record_len: 1_000, num_records: 20_000_000, // Logical size 5 GB logical_size: 5_000, }; run_suite_cmd(&top_results_path.join("uniform-20GB-5GB"), &workload).await?; println!( "All tests finished. Results in {}", top_results_path.display() ); Ok(()) } use std::fs::File; use std::io::Stdout; use std::sync::Mutex; use tracing_subscriber::fmt::MakeWriter; use tracing_subscriber::fmt::writer::EitherWriter; static LOG_FILE: OnceLock>> = OnceLock::new(); fn get_log_output() -> &'static Mutex> { LOG_FILE.get_or_init(|| std::sync::Mutex::new(EitherWriter::B(std::io::stdout()))) } fn set_log_file(f: File) { *get_log_output().lock().unwrap() = EitherWriter::A(f); } fn set_log_stdout() { *get_log_output().lock().unwrap() = EitherWriter::B(std::io::stdout()); } fn init_logging() -> anyhow::Result<()> { // We fall back to printing all spans at info-level or above if // the RUST_LOG environment variable is not set. let rust_log_env_filter = || { tracing_subscriber::EnvFilter::try_from_default_env() .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")) }; // NB: the order of the with() calls does not matter. 
// See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering use tracing_subscriber::prelude::*; tracing_subscriber::registry() .with({ let log_layer = tracing_subscriber::fmt::layer() .with_target(false) .with_ansi(false) .with_writer(|| get_log_output().make_writer()); log_layer.with_filter(rust_log_env_filter()) }) .init(); Ok(()) } #[tokio::main] async fn main() -> anyhow::Result<()> { let cli = CliOpts::parse(); init_logging()?; match cli.command { Commands::Simulate(cmd) => { simulate(&cmd, &PathBuf::from("/tmp/compactions.html")).await?; } Commands::RunSuite => { run_suite().await?; } }; Ok(()) } ================================================ FILE: pageserver/compaction/src/compact_tiered.rs ================================================ //! # Tiered compaction algorithm. //! //! Read all the input delta files, and write a new set of delta files that //! include all the input WAL records. See retile_deltas(). //! //! In a "normal" LSM tree, you get to remove any values that are overwritten by //! later values, but in our system, we keep all the history. So the reshuffling //! doesn't remove any garbage, it just reshuffles the records to reduce read //! amplification, i.e. the number of files that you need to access to find the //! WAL records for a given key. //! //! If the new delta files would be very "narrow", i.e. each file would cover //! only a narrow key range, then we create a new set of image files //! instead. The current threshold is that if the estimated total size of the //! image layers is smaller than the size of the deltas, then we create image //! layers. That amounts to 2x storage amplification, and it means that the //! distance of image layers in LSN dimension is roughly equal to the logical //! database size. For example, if the logical database size is 10 GB, we would //! generate new image layers every 10 GB of WAL. 
use std::collections::{HashSet, VecDeque};
use std::ops::Range;

use futures::StreamExt;
use pageserver_api::shard::ShardIdentity;
use tracing::{debug, info};
use utils::lsn::Lsn;

use crate::helpers::{
    PAGE_SZ, accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with,
};
use crate::identify_levels::identify_level;
use crate::interface::*;

/// Main entry point to compaction.
///
/// The starting point is a cutoff LSN (`end_lsn`). The compaction is run on
/// everything below that point, that needs compaction. The cutoff LSN must
/// partition the layers so that there are no layers that span across that
/// LSN. To start compaction at the top of the tree, pass the end LSN of the
/// last written L0 layer.
///
/// Levels are processed starting at L0. The loop stops at the first level that
/// cannot be identified or whose depth is below `fanout`. Panics if `fanout` is 0.
// NOTE(review): the generic parameter list of this function appears to have been stripped from
// this copy of the file (`E` is used but not declared here); confirm against upstream.
pub async fn compact_tiered(
    executor: &mut E,
    end_lsn: Lsn,
    target_file_size: u64,
    fanout: u64,
    ctx: &E::RequestContext,
) -> anyhow::Result<()> {
    assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}");
    // The per-level height multiplier is at least 2, even when `fanout` is 1.
    let exp_base = fanout.max(2);
    // Start at L0
    let mut current_level_no = 0;
    let mut current_level_target_height = target_file_size;
    loop {
        // end LSN +1 to include possible image layers exactly at 'end_lsn'.
        let all_layers = executor
            .get_layers(
                &(E::Key::MIN..E::Key::MAX),
                &(Lsn(u64::MIN)..end_lsn + 1),
                ctx,
            )
            .await?;
        info!(
            "Compacting L{}, total # of layers: {}",
            current_level_no,
            all_layers.len()
        );

        // Identify the range of LSNs that belong to this level. We assume that
        // each file in this level spans an LSN range up to 1.75x target file
        // size. That should give us enough slop that if we created a slightly
        // oversized L0 layer, e.g. because flushing the in-memory layer was
        // delayed for some reason, we don't consider the oversized layer to
        // belong to L1. But not too much slop, that we don't accidentally
        // "skip" levels.
        let max_height = (current_level_target_height as f64 * 1.75) as u64;
        let Some(level) = identify_level(all_layers, end_lsn, max_height).await? else {
            break;
        };

        // Calculate the height of this level.
// If the # of tiers exceeds the fanout parameter, it's time to compact it.
        let depth = level.depth();
        info!(
            "Level {} identified as LSN range {}-{}: depth {}",
            current_level_no, level.lsn_range.start, level.lsn_range.end, depth
        );
        for l in &level.layers {
            debug!("LEVEL {} layer: {}", current_level_no, l.short_id());
        }
        if depth < fanout {
            debug!(
                level = current_level_no,
                depth = depth,
                fanout,
                "too few deltas to compact"
            );
            break;
        }

        compact_level(
            &level.lsn_range,
            &level.layers,
            executor,
            target_file_size,
            ctx,
        )
        .await?;
        if current_level_target_height == u64::MAX {
            // our target height includes all possible lsns
            info!(
                level = current_level_no,
                depth = depth,
                "compaction loop reached max current_level_target_height"
            );
            break;
        }
        current_level_no += 1;
        // Saturates at u64::MAX, which the check above uses as the stop condition.
        current_level_target_height = current_level_target_height.saturating_mul(exp_base);
    }
    Ok(())
}

/// Compacts a single level: seeds the job queue with one `Divide` job covering the whole key
/// range and all given layers, then runs the queue to completion.
///
/// Returns the final value of `next_level`.
// NOTE(review): generic parameters and type arguments (on this function, `Range`, `Vec`,
// `HashSet`, `anyhow::Result`, `LayerFragment`) appear to have been stripped from this copy of
// the file; confirm against the upstream source.
async fn compact_level(
    lsn_range: &Range,
    layers: &[E::Layer],
    executor: &mut E,
    target_file_size: u64,
    ctx: &E::RequestContext,
) -> anyhow::Result {
    let mut layer_fragments = Vec::new();
    for l in layers {
        layer_fragments.push(LayerFragment::new(l.clone()));
    }

    let mut state = LevelCompactionState {
        shard_identity: *executor.get_shard_identity(),
        target_file_size,
        _lsn_range: lsn_range.clone(),
        layers: layer_fragments,
        jobs: Vec::new(),
        job_queue: Vec::new(),
        next_level: false,
        executor,
    };

    // The first job refers to every layer by its index in `state.layers`.
    let first_job = CompactionJob {
        key_range: E::Key::MIN..E::Key::MAX,
        lsn_range: lsn_range.clone(),
        strategy: CompactionStrategy::Divide,
        input_layers: state
            .layers
            .iter()
            .enumerate()
            .map(|i| LayerId(i.0))
            .collect(),
        completed: false,
    };

    state.jobs.push(first_job);
    state.job_queue.push(JobId(0));
    state.execute(ctx).await?;

    info!(
        "compaction completed! Need to process next level: {}",
        state.next_level
    );
    Ok(state.next_level)
}

/// Blackboard that keeps track of the state of all the jobs and work remaining
struct LevelCompactionState<'a, E>
where
    E: CompactionJobExecutor,
{
    shard_identity: ShardIdentity,

    // parameters
    target_file_size: u64,

    _lsn_range: Range,
    layers: Vec>,

    // job queue
    jobs: Vec>,
    job_queue: Vec,

    /// If false, no need to compact levels below this
    next_level: bool,

    /// Interface to the outside world
    executor: &'a mut E,
}

/// Index into `LevelCompactionState::layers`.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
struct LayerId(usize);

/// Index into `LevelCompactionState::jobs`.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
struct JobId(usize);

/// A set of jobs split into those still pending and those already completed.
struct PendingJobSet {
    pending: HashSet,
    completed: HashSet,
}

impl PendingJobSet {
    /// Creates an empty set.
    fn new() -> Self {
        PendingJobSet {
            pending: HashSet::new(),
            completed: HashSet::new(),
        }
    }

    /// Moves `job_id` from the pending set to the completed set.
    fn complete_job(&mut self, job_id: JobId) {
        self.pending.remove(&job_id);
        self.completed.insert(job_id);
    }

    /// Returns true when no jobs are pending.
    fn all_completed(&self) -> bool {
        self.pending.is_empty()
    }
}

// When we decide to rewrite a set of layers, LayerFragment is used to keep
// track which new layers supersede an old layer. When all the stakeholder jobs
// have completed, this layer can be deleted.
struct LayerFragment
where
    E: CompactionJobExecutor,
{
    layer: E::Layer,

    // If we will write new layers to replace this one, this keeps track of the
    // jobs that need to complete before this layer can be deleted. As the jobs
    // complete, they are moved from 'pending' to 'completed' set. Once the
    // 'pending' set becomes empty, the layer can be deleted.
    //
    // If None, this layer is not rewritten and must not be deleted.
// (Remaining fields of `LayerFragment`.)
    deletable_after: Option,

    deleted: bool,
}

// NOTE(review): generic parameters and type arguments (on the `impl` headers, `Range`, `Vec`,
// `CompactionJob`) appear to have been stripped from this copy of the file; confirm against the
// upstream source.
impl LayerFragment
where
    E: CompactionJobExecutor,
{
    /// Wraps a layer that is, initially, neither scheduled for rewriting nor deleted.
    fn new(layer: E::Layer) -> Self {
        LayerFragment {
            layer,
            deletable_after: None,
            deleted: false,
        }
    }
}

/// What a job does when executed; see `execute_job`.
#[derive(PartialEq)]
enum CompactionStrategy {
    Divide,
    CreateDelta,
    CreateImage,
}

/// One unit of compaction work over a key/LSN rectangle.
struct CompactionJob {
    key_range: Range,
    lsn_range: Range,

    strategy: CompactionStrategy,

    // Indexes into `LevelCompactionState::layers`.
    input_layers: Vec,

    completed: bool,
}

impl LevelCompactionState<'_, E>
where
    E: CompactionJobExecutor,
{
    /// Main loop of the executor.
    ///
    /// In each iteration, we take the next job from the queue, and execute it.
    /// The execution might add new jobs to the queue. Keep going until the
    /// queue is empty.
    ///
    /// Initially, the job queue consists of one Divide job over the whole
    /// level. On first call, it is divided into smaller jobs.
    async fn execute(&mut self, ctx: &E::RequestContext) -> anyhow::Result<()> {
        // TODO: this would be pretty straightforward to parallelize with FuturesUnordered
        // NB: `pop` takes the most recently pushed job, i.e. the queue is processed LIFO.
        while let Some(next_job_id) = self.job_queue.pop() {
            info!("executing job {}", next_job_id.0);
            self.execute_job(next_job_id, ctx).await?;
        }

        // all done!
        Ok(())
    }

    /// Executes a single job according to its strategy.
    async fn execute_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
        let job = &self.jobs[job_id.0];
        match job.strategy {
            CompactionStrategy::Divide => {
                self.divide_job(job_id, ctx).await?;
                Ok(())
            }
            CompactionStrategy::CreateDelta => {
                // Collect the input layers that are delta layers; others are ignored here.
                let mut deltas: Vec = Vec::new();
                let mut layer_ids: Vec = Vec::new();
                for layer_id in &job.input_layers {
                    let layer = &self.layers[layer_id.0].layer;
                    if let Some(dl) = self.executor.downcast_delta_layer(layer, ctx).await? {
                        deltas.push(dl.clone());
                        layer_ids.push(*layer_id);
                    }
                }

                self.executor
                    .create_delta(&job.lsn_range, &job.key_range, &deltas, ctx)
                    .await?;
                self.jobs[job_id.0].completed = true;

                // did we complete any fragments?
// (Continuation of the `CreateDelta` arm of `execute_job`.)
                for layer_id in layer_ids {
                    let l = &mut self.layers[layer_id.0];
                    if let Some(deletable_after) = l.deletable_after.as_mut() {
                        deletable_after.complete_job(job_id);
                        // Delete the old layer once every job it was waiting on has completed.
                        if deletable_after.all_completed() {
                            self.executor.delete_layer(&l.layer, ctx).await?;
                            l.deleted = true;
                        }
                    }
                }

                self.next_level = true;

                Ok(())
            }
            CompactionStrategy::CreateImage => {
                self.executor
                    .create_image(job.lsn_range.end, &job.key_range, ctx)
                    .await?;
                self.jobs[job_id.0].completed = true;

                // TODO: we could check if any layers < PITR horizon became deletable
                Ok(())
            }
        }
    }

    /// Registers `job` and queues it for execution. Returns its ID, which is its index in
    /// `self.jobs`.
    // NOTE(review): the type argument of `CompactionJob` appears to have been stripped from this
    // copy of the file; confirm against the upstream source.
    fn push_job(&mut self, job: CompactionJob) -> JobId {
        let job_id = JobId(self.jobs.len());
        self.jobs.push(job);
        self.job_queue.push(job_id);
        job_id
    }

    /// Take a partition of the key space, and decide how to compact it.
    ///
    /// TODO: Currently, this is called exactly once for the level, and we
    /// decide whether to create new image layers to cover the whole level, or
    /// write a new set of deltas. In the future, this should try to partition
    /// the key space, and make the decision separately for each partition.
    async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
        let job = &self.jobs[job_id.0];
        assert!(job.strategy == CompactionStrategy::Divide);

        // Check for dummy cases
        if job.input_layers.is_empty() {
            return Ok(());
        }

        // NOTE(review): this lookup and assertion repeat the two statements at the top of the
        // function.
        let job = &self.jobs[job_id.0];
        assert!(job.strategy == CompactionStrategy::Divide);

        // Would it be better to create images for this partition?
// Decide based on the average density of the level let keyspace_size = keyspace_total_size( &self .executor .get_keyspace(&job.key_range, job.lsn_range.end, ctx) .await?, &self.shard_identity, ) * PAGE_SZ; let wal_size = job .input_layers .iter() .filter(|layer_id| self.layers[layer_id.0].layer.is_delta()) .map(|layer_id| self.layers[layer_id.0].layer.file_size()) .sum::(); if keyspace_size < wal_size { // seems worth it info!( "covering with images, because keyspace_size is {}, size of deltas between {}-{} is {}", keyspace_size, job.lsn_range.start, job.lsn_range.end, wal_size ); self.cover_with_images(job_id, ctx).await } else { // do deltas info!( "coverage not worth it, keyspace_size {}, wal_size {}", keyspace_size, wal_size ); self.retile_deltas(job_id, ctx).await } } // LSN // ^ // | // | ###|###|##### // | +--+-----+--+ +--+-----+--+ // | | | | | | | | | // | +--+--+--+--+ +--+--+--+--+ // | | | | | | | // | +---+-+-+---+ ==> +---+-+-+---+ // | | | | | | | | | // | +---+-+-++--+ +---+-+-++--+ // | | | | | | | | | // | +-----+--+--+ +-----+--+--+ // | // +--------------> key // async fn cover_with_images( &mut self, job_id: JobId, ctx: &E::RequestContext, ) -> anyhow::Result<()> { let job = &self.jobs[job_id.0]; assert!(job.strategy == CompactionStrategy::Divide); // XXX: do we still need the "holes" stuff? let mut new_jobs = Vec::new(); // Slide a window through the keyspace let keyspace = self .executor .get_keyspace(&job.key_range, job.lsn_range.end, ctx) .await?; let mut window = KeyspaceWindow::new( E::Key::MIN..E::Key::MAX, keyspace, self.target_file_size / PAGE_SZ, ); while let Some(key_range) = window.choose_next_image(&self.shard_identity) { new_jobs.push(CompactionJob:: { key_range, lsn_range: job.lsn_range.clone(), strategy: CompactionStrategy::CreateImage, input_layers: Vec::new(), // XXX: Is it OK for this to be empty for image layer? 
// (Continuation of `cover_with_images`: remaining job field, then queueing of the new jobs.)
                completed: false,
            });
        }

        // Push in reverse: `execute` pops from the back of the queue, so the jobs run in the
        // order they were generated.
        for j in new_jobs.into_iter().rev() {
            let _job_id = self.push_job(j);

            // TODO: image layers don't let us delete anything. unless < PITR horizon
            //let j = &self.jobs[job_id.0];
            // for layer_id in j.input_layers.iter() {
            //    self.layers[layer_id.0].pending_stakeholders.insert(job_id);
            //}
        }

        Ok(())
    }

    // Merge the contents of all the input delta layers into a new set
    // of delta layers, based on the current partitioning.
    //
    // We split the new delta layers on the key dimension. We iterate through
    // the key space, and for each key, check if including the next key to the
    // current output layer we're building would cause the layer to become too
    // large. If so, dump the current output layer and start new one. It's
    // possible that there is a single key with so many page versions that
    // storing all of them in a single layer file would be too large. In that
    // case, we also split on the LSN dimension.
    //
    // NOTE(review): the column alignment of the two diagrams below was lost in this copy of the
    // file and has been re-laid-out by hand; confirm against the upstream source.
    //
    // LSN
    //  ^
    //  |
    //  | +-----------+            +--+--+--+--+
    //  | |           |            |  |  |  |  |
    //  | +-----------+            |  |  |  |  |
    //  | |           |            |  |  |  |  |
    //  | +-----------+     ==>    |  |  |  |  |
    //  | |           |            |  |  |  |  |
    //  | +-----------+            |  |  |  |  |
    //  | |           |            |  |  |  |  |
    //  | +-----------+            +--+--+--+--+
    //  |
    //  +--------------> key
    //
    //
    // If one key (X) has a lot of page versions:
    //
    // LSN
    //  ^
    //  |                                (X)
    //  | +-----------+            +--+--+--+--+
    //  | |           |            |  |  |  |  |
    //  | +-----------+            |  |  +--+  |
    //  | |           |            |  |  |  |  |
    //  | +-----------+     ==>    |  |  |  |  |
    //  | |           |            |  |  +--+  |
    //  | +-----------+            |  |  |  |  |
    //  | |           |            |  |  |  |  |
    //  | +-----------+            +--+--+--+--+
    //  |
    //  +--------------> key
    //
    // TODO: this actually divides the layers into fixed-size chunks, not
    // based on the partitioning.
    //
    // TODO: we should also opportunistically materialize and
    // garbage collect what we can.
async fn retile_deltas( &mut self, job_id: JobId, ctx: &E::RequestContext, ) -> anyhow::Result<()> { let job = &self.jobs[job_id.0]; assert!(job.strategy == CompactionStrategy::Divide); // Sweep the key space left to right, running an estimate of how much // disk size and keyspace we have accumulated // // Once the disk size reaches the target threshold, stop and think. // If we have accumulated only a narrow band of keyspace, create an // image layer. Otherwise write a delta layer. // FIXME: we are ignoring images here. Did we already divide the work // so that we won't encounter them here? let mut deltas: Vec = Vec::new(); for layer_id in &job.input_layers { let l = &self.layers[layer_id.0]; if let Some(dl) = self.executor.downcast_delta_layer(&l.layer, ctx).await? { deltas.push(dl.clone()); } } // Open stream let key_value_stream = std::pin::pin!( merge_delta_keys_buffered::(deltas.as_slice(), ctx) .await? .map(Result::<_, anyhow::Error>::Ok) ); let mut new_jobs = Vec::new(); // Slide a window through the keyspace let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream, self.target_file_size)); let mut all_in_window: bool = false; let mut window = Window::new(); // Helper function to create a job for a new delta layer with given key-lsn // rectangle. let create_delta_job = |key_range, lsn_range: &Range, new_jobs: &mut Vec<_>| { // The inputs for the job are all the input layers of the original job that // overlap with the rectangle. let batch_layers: Vec = job .input_layers .iter() .filter(|layer_id| { overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range) }) .cloned() .collect(); assert!(!batch_layers.is_empty()); new_jobs.push(CompactionJob { key_range, lsn_range: lsn_range.clone(), strategy: CompactionStrategy::CreateDelta, input_layers: batch_layers, completed: false, }); }; loop { if all_in_window && window.is_empty() { // All done! 
break; } // If we now have enough keyspace for next delta layer in the window, create a // new delta layer if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window) { create_delta_job(key_range, &job.lsn_range, &mut new_jobs); continue; } assert!(!all_in_window); // Process next key in the key space match key_accum.next().await.transpose()? { None => { all_in_window = true; } Some(next_key) if next_key.partition_lsns.is_empty() => { // Normal case: extend the window by the key window.feed(next_key.key, next_key.size); } Some(next_key) => { // A key with too large size impact for a single delta layer. This // case occurs if you make a huge number of updates for a single key. // // Drain the window with has_more = false to make a clean cut before // the key, and then make dedicated delta layers for the single key. // // We cannot cluster the key with the others, because we don't want // layer files to overlap with each other in the lsn,key space (no // overlaps for the rectangles). let key = next_key.key; debug!("key {key} with size impact larger than the layer size"); while !window.is_empty() { let has_more = false; let key_range = window.choose_next_delta(self.target_file_size, has_more) .expect("with has_more==false, choose_next_delta always returns something for a non-empty Window"); create_delta_job(key_range, &job.lsn_range, &mut new_jobs); } // Not really required: but here for future resilience: // We make a "gap" here, so any structure the window holds should // probably be reset. window = Window::new(); let mut prior_lsn = job.lsn_range.start; let mut lsn_ranges = Vec::new(); for (lsn, _size) in next_key.partition_lsns.iter() { lsn_ranges.push(prior_lsn..*lsn); prior_lsn = *lsn; } lsn_ranges.push(prior_lsn..job.lsn_range.end); for lsn_range in lsn_ranges { let key_range = key..key.next(); create_delta_job(key_range, &lsn_range, &mut new_jobs); } } } } // All the input files are rewritten. 
Set up the tracking for when they can // be deleted. for layer_id in job.input_layers.iter() { let l = &mut self.layers[layer_id.0]; assert!(l.deletable_after.is_none()); l.deletable_after = Some(PendingJobSet::new()); } for j in new_jobs.into_iter().rev() { let job_id = self.push_job(j); let j = &self.jobs[job_id.0]; for layer_id in j.input_layers.iter() { self.layers[layer_id.0] .deletable_after .as_mut() .unwrap() .pending .insert(job_id); } } Ok(()) } } /// Sliding window through keyspace and values for image layer /// This is used by [`LevelCompactionState::cover_with_images`] to decide on good split points struct KeyspaceWindow { head: KeyspaceWindowHead, start_pos: KeyspaceWindowPos, } struct KeyspaceWindowHead { // overall key range to cover key_range: Range, keyspace: Vec>, target_keysize: u64, } #[derive(Clone)] struct KeyspaceWindowPos { end_key: K, keyspace_idx: usize, accum_keysize: u64, } impl KeyspaceWindowPos { fn reached_end(&self, w: &KeyspaceWindowHead) -> bool { self.keyspace_idx == w.keyspace.len() } // Advance the cursor until it reaches 'target_keysize'. fn advance_until_size( &mut self, w: &KeyspaceWindowHead, max_size: u64, shard_identity: &ShardIdentity, ) { while self.accum_keysize < max_size && !self.reached_end(w) { let curr_range = &w.keyspace[self.keyspace_idx]; if self.end_key < curr_range.start { // skip over any unused space self.end_key = curr_range.start; } // We're now within 'curr_range'. Can we advance past it completely? 
let distance = K::key_range_size(&(self.end_key..curr_range.end), shard_identity); if (self.accum_keysize + distance as u64) < max_size { // oh yeah, it fits self.end_key = curr_range.end; self.keyspace_idx += 1; self.accum_keysize += distance as u64; } else { // advance within the range let skip_key = self.end_key.skip_some(); let distance = K::key_range_size(&(self.end_key..skip_key), shard_identity); if (self.accum_keysize + distance as u64) < max_size { self.end_key = skip_key; self.accum_keysize += distance as u64; } else { self.end_key = self.end_key.next(); self.accum_keysize += 1; } } } } } impl KeyspaceWindow where K: CompactionKey, { fn new(key_range: Range, keyspace: CompactionKeySpace, target_keysize: u64) -> Self { assert!(keyspace.first().unwrap().start >= key_range.start); let start_key = key_range.start; let start_pos = KeyspaceWindowPos:: { end_key: start_key, keyspace_idx: 0, accum_keysize: 0, }; Self { head: KeyspaceWindowHead:: { key_range, keyspace, target_keysize, }, start_pos, } } fn choose_next_image(&mut self, shard_identity: &ShardIdentity) -> Option> { if self.start_pos.keyspace_idx == self.head.keyspace.len() { // we've reached the end return None; } let mut next_pos = self.start_pos.clone(); next_pos.advance_until_size( &self.head, self.start_pos.accum_keysize + self.head.target_keysize, shard_identity, ); // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to // 1.25x target size let mut end_pos = next_pos.clone(); end_pos.advance_until_size( &self.head, self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4), shard_identity, ); if end_pos.reached_end(&self.head) { // gobble up any unused keyspace between the last used key and end of the range assert!(end_pos.end_key <= self.head.key_range.end); end_pos.end_key = self.head.key_range.end; next_pos = end_pos; } let start_key = self.start_pos.end_key; self.start_pos = next_pos; Some(start_key..self.start_pos.end_key) } } // Take previous 
partitioning, based on the image layers below. // // Candidate is at the front: // // Consider stretching an image layer to next divider? If it's close enough, // that's the image candidate // // If it's too far, consider splitting at a reasonable point // // Is the image candidate smaller than the equivalent delta? If so, // split off the image. Otherwise, split off one delta. // Try to snap off the delta at a reasonable point struct WindowElement { start_key: K, // inclusive last_key: K, // inclusive accum_size: u64, } /// Sliding window through keyspace and values for delta layer tiling /// /// This is used to decide which delta layer to write next. struct Window { elems: VecDeque>, // last key that was split off, inclusive splitoff_key: Option, splitoff_size: u64, } impl Window where K: CompactionKey, { fn new() -> Self { Self { elems: VecDeque::new(), splitoff_key: None, splitoff_size: 0, } } fn feed(&mut self, key: K, size: u64) { let last_size; if let Some(last) = self.elems.back_mut() { // We require the keys to be strictly increasing for the window. // Keys should already have been deduplicated by `accum_key_values` assert!( last.last_key < key, "last_key(={}) >= key(={key})", last.last_key ); last_size = last.accum_size; } else { last_size = 0; } // This is a new key. 
let elem = WindowElement { start_key: key, last_key: key, accum_size: last_size + size, }; self.elems.push_back(elem); } fn remain_size(&self) -> u64 { self.elems.back().unwrap().accum_size - self.splitoff_size } fn peek_size(&self) -> u64 { self.elems.front().unwrap().accum_size - self.splitoff_size } fn is_empty(&self) -> bool { self.elems.is_empty() } fn commit_upto(&mut self, mut upto: usize) { while upto > 1 { let popped = self.elems.pop_front().unwrap(); self.elems.front_mut().unwrap().start_key = popped.start_key; upto -= 1; } } fn find_size_split(&self, target_size: u64) -> usize { self.elems .partition_point(|elem| elem.accum_size - self.splitoff_size < target_size) } fn pop(&mut self) { let first = self.elems.pop_front().unwrap(); self.splitoff_size = first.accum_size; self.splitoff_key = Some(first.last_key); } // the difference between delta and image is that an image covers // any unused keyspace before and after, while a delta tries to // minimize that. TODO: difference not implemented fn pop_delta(&mut self) -> Range { let first = self.elems.front().unwrap(); let key_range = first.start_key..first.last_key.next(); self.pop(); key_range } // Prerequisite: we have enough input in the window // // On return None, the caller should feed more data and call again fn choose_next_delta(&mut self, target_size: u64, has_more: bool) -> Option> { if has_more && self.elems.is_empty() { // Starting up return None; } // If we still have an undersized candidate, just keep going while self.peek_size() < target_size { if self.elems.len() > 1 { self.commit_upto(2); } else if has_more { return None; } else { break; } } // Ensure we have enough input in the window to make a good decision if has_more && self.remain_size() < target_size * 5 / 4 { return None; } // The candidate on the front is now large enough, for a delta. // And we have enough data in the window to decide. 
// If we're willing to stretch it up to 1.25 target size, could we // gobble up the rest of the work? This avoids creating very small // "tail" layers at the end of the keyspace if !has_more && self.remain_size() < target_size * 5 / 4 { self.commit_upto(self.elems.len()); } else { let delta_split_at = self.find_size_split(target_size); self.commit_upto(delta_split_at); // If it's still not large enough, request the caller to fill the window if self.elems.len() == 1 && has_more { return None; } } Some(self.pop_delta()) } } ================================================ FILE: pageserver/compaction/src/helpers.rs ================================================ //! This file contains generic utility functions over the interface types, //! which could be handy for any compaction implementation. use std::collections::{BinaryHeap, VecDeque}; use std::fmt::Display; use std::future::Future; use std::ops::{DerefMut, Range}; use std::pin::Pin; use std::task::{Poll, ready}; use futures::future::BoxFuture; use futures::{Stream, StreamExt}; use itertools::Itertools; use pageserver_api::shard::ShardIdentity; use pin_project_lite::pin_project; use utils::lsn::Lsn; use crate::interface::*; pub const PAGE_SZ: u64 = 8192; pub fn keyspace_total_size( keyspace: &CompactionKeySpace, shard_identity: &ShardIdentity, ) -> u64 where K: CompactionKey, { keyspace .iter() .map(|r| K::key_range_size(r, shard_identity) as u64) .sum() } pub fn overlaps_with(a: &Range, b: &Range) -> bool { !(a.end <= b.start || b.end <= a.start) } /// Whether a fully contains b, example as below /// ```plain /// | a | /// | b | /// ``` pub fn fully_contains(a: &Range, b: &Range) -> bool { a.start <= b.start && a.end >= b.end } pub fn union_to_keyspace(a: &mut CompactionKeySpace, b: CompactionKeySpace) { let x = std::mem::take(a); let mut all_ranges_iter = [x.into_iter(), b.into_iter()] .into_iter() .kmerge_by(|a, b| a.start < b.start); let mut ranges = Vec::new(); if let Some(first) = all_ranges_iter.next() { 
let (mut start, mut end) = (first.start, first.end); for r in all_ranges_iter { assert!(r.start >= start); if r.start > end { ranges.push(start..end); start = r.start; end = r.end; } else if r.end > end { end = r.end; } } ranges.push(start..end); } *a = ranges } pub fn intersect_keyspace( a: &CompactionKeySpace, r: &Range, ) -> CompactionKeySpace { let mut ranges: Vec> = Vec::new(); for x in a.iter() { if x.end <= r.start { continue; } if x.start >= r.end { break; } ranges.push(x.clone()) } // trim the ends if let Some(first) = ranges.first_mut() { first.start = std::cmp::max(first.start, r.start); } if let Some(last) = ranges.last_mut() { last.end = std::cmp::min(last.end, r.end); } ranges } /// Create a stream that iterates through all DeltaEntrys among all input /// layers, in key-lsn order. /// /// This is public because the create_delta() implementation likely wants to use this too /// TODO: move to a more shared place pub fn merge_delta_keys<'a, E: CompactionJobExecutor>( layers: &'a [E::DeltaLayer], ctx: &'a E::RequestContext, ) -> MergeDeltaKeys<'a, E> { // Use a binary heap to merge the layers. Each input layer is initially // represented by a LazyLoadLayer::Unloaded element, which uses the start of // the layer's key range as the key. The first time a layer reaches the top // of the heap, all the keys of the layer are loaded into a sorted vector. // // This helps to keep the memory usage reasonable: we only need to hold in // memory the DeltaEntrys of the layers that overlap with the "current" key. let mut heap: BinaryHeap> = BinaryHeap::new(); for l in layers { heap.push(LazyLoadLayer::Unloaded(l)); } MergeDeltaKeys { heap, ctx, load_future: None, } } pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>( layers: &'a [E::DeltaLayer], ctx: &'a E::RequestContext, ) -> anyhow::Result>::DeltaEntry<'a>>> { let mut keys = Vec::new(); for l in layers { // Boxing and casting to LoadFuture is required to obtain the right Sync bound. 
// If we do l.load_keys(ctx).await? directly, there is a compilation error. let load_future: LoadFuture<'a, _> = Box::pin(l.load_keys(ctx)); keys.extend(load_future.await?.into_iter()); } keys.sort_by_key(|k| (k.key(), k.lsn())); let stream = futures::stream::iter(keys.into_iter()); Ok(stream) } enum LazyLoadLayer<'a, E: CompactionJobExecutor> { Loaded(VecDeque<>::DeltaEntry<'a>>), Unloaded(&'a E::DeltaLayer), } impl LazyLoadLayer<'_, E> { fn min_key(&self) -> E::Key { match self { Self::Loaded(entries) => entries.front().unwrap().key(), Self::Unloaded(dl) => dl.key_range().start, } } fn min_lsn(&self) -> Lsn { match self { Self::Loaded(entries) => entries.front().unwrap().lsn(), Self::Unloaded(dl) => dl.lsn_range().start, } } } impl PartialOrd for LazyLoadLayer<'_, E> { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl Ord for LazyLoadLayer<'_, E> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { // reverse order so that we get a min-heap (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn())) } } impl PartialEq for LazyLoadLayer<'_, E> { fn eq(&self, other: &Self) -> bool { self.cmp(other) == std::cmp::Ordering::Equal } } impl Eq for LazyLoadLayer<'_, E> {} type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result>>; // Stream returned by `merge_delta_keys` pin_project! 
{ #[allow(clippy::type_complexity)] pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> { heap: BinaryHeap>, #[pin] load_future: Option>::DeltaEntry<'a>>>, ctx: &'a E::RequestContext, } } impl<'a, E> Stream for MergeDeltaKeys<'a, E> where E: CompactionJobExecutor + 'a, { type Item = anyhow::Result<>::DeltaEntry<'a>>; fn poll_next( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, ) -> Poll::Item>> { let mut this = self.project(); loop { if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() { // We are waiting for loading the keys to finish match ready!(load_future.as_mut().poll(cx)) { Ok(entries) => { this.load_future.set(None); *this.heap.peek_mut().unwrap() = LazyLoadLayer::Loaded(VecDeque::from(entries)); } Err(e) => { return Poll::Ready(Some(Err(e))); } } } // If the topmost layer in the heap hasn't been loaded yet, start // loading it. Otherwise return the next entry from it and update // the layer's position in the heap (this decreaseKey operation is // performed implicitly when `top` is dropped). if let Some(mut top) = this.heap.peek_mut() { match top.deref_mut() { LazyLoadLayer::Unloaded(l) => { let fut = l.load_keys(this.ctx); this.load_future.set(Some(Box::pin(fut))); continue; } LazyLoadLayer::Loaded(entries) => { let result = entries.pop_front().unwrap(); if entries.is_empty() { std::collections::binary_heap::PeekMut::pop(top); } return Poll::Ready(Some(Ok(result))); } } } else { return Poll::Ready(None); } } } } // Accumulate values at key boundaries pub struct KeySize { pub key: K, pub num_values: u64, pub size: u64, /// The lsns to partition at (if empty then no per-lsn partitioning) pub partition_lsns: Vec<(Lsn, u64)>, } pub fn accum_key_values<'a, I, K, D, E>( input: I, target_size: u64, ) -> impl Stream, E>> where K: Eq + PartialOrd + Display + Copy, I: Stream>, D: CompactionDeltaEntry<'a, K>, { async_stream::try_stream! 
{ // Initialize the state from the first value let mut input = std::pin::pin!(input); if let Some(first) = input.next().await { let first = first?; let mut part_size = first.size(); let mut accum: KeySize = KeySize { key: first.key(), num_values: 1, size: part_size, partition_lsns: Vec::new(), }; let mut last_key = accum.key; while let Some(this) = input.next().await { let this = this?; if this.key() == accum.key { let add_size = this.size(); if part_size + add_size > target_size { accum.partition_lsns.push((this.lsn(), part_size)); part_size = 0; } part_size += add_size; accum.size += add_size; accum.num_values += 1; } else { assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); last_key = accum.key; yield accum; part_size = this.size(); accum = KeySize { key: this.key(), num_values: 1, size: part_size, partition_lsns: Vec::new(), }; } } assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); yield accum; } } } ================================================ FILE: pageserver/compaction/src/identify_levels.rs ================================================ //! An LSM tree consists of multiple levels, each exponentially larger than the //! previous level. And each level consists of multiple "tiers". With tiered //! compaction, a level is compacted when it has accumulated more than N tiers, //! forming one tier on the next level. //! //! In the pageserver, we don't explicitly track the levels and tiers. Instead, //! we identify them by looking at the shapes of the layers. It's an easy task //! for a human, but it's not straightforward to come up with the exact //! rules. Especially if there are cases like interrupted, half-finished //! compactions, or highly skewed data distributions that have let us "skip" //! some levels. It's not critical to classify all cases correctly; at worst we //! delay some compaction work, and suffer from more read amplification, or we //! perform some unnecessary compaction work. //! 
//! `identify_level` performs that shape-matching. //! //! It returns a Level struct, which has `depth()` function to count the number //! of "tiers" in the level. The tier count is the max depth of stacked layers //! within the level. That's a good measure, because the point of compacting is //! to reduce read amplification, and the depth is what determines that. //! //! One interesting effect of this is that if we generate very small delta //! layers at L0, e.g. because the L0 layers are flushed by timeout rather than //! because they reach the target size, the L0 compaction will combine them to //! one larger file. But if the combined file is still smaller than the target //! file size, the file will still be considered to be part of L0 at the next //! iteration. use std::collections::BTreeSet; use std::ops::Range; use anyhow::bail; use tracing::{info, trace}; use utils::lsn::Lsn; use crate::interface::*; pub struct Level { pub lsn_range: Range, pub layers: Vec, } /// Identify an LSN > `end_lsn` that partitions the LSN space, so that there are /// no layers that cross the boundary LSN. /// /// A further restriction is that all layers in the returned partition cover at /// most 'lsn_max_size' LSN bytes. pub async fn identify_level( all_layers: Vec, end_lsn: Lsn, lsn_max_size: u64, ) -> anyhow::Result>> where K: CompactionKey, L: CompactionLayer + Clone, { // filter out layers that are above the `end_lsn`, they are completely irrelevant. let mut layers = Vec::new(); for l in all_layers { if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn { // shouldn't happen. Indicates that the caller passed a bogus // end_lsn. bail!( "identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id() ); } // include image layers sitting exacty at `end_lsn`. 
let is_image = !l.is_delta(); if (is_image && l.lsn_range().start > end_lsn) || (!is_image && l.lsn_range().start >= end_lsn) { continue; } layers.push(l); } // All the remaining layers either belong to this level, or are below it. info!( "identify level at {}, size {}, num layers below: {}", end_lsn, lsn_max_size, layers.len() ); if layers.is_empty() { return Ok(None); } // Walk the ranges in LSN order. // // ----- end_lsn // | // | // v // layers.sort_by_key(|l| l.lsn_range().end); let mut candidate_start_lsn = end_lsn; let mut candidate_layers: Vec = Vec::new(); let mut current_best_start_lsn = end_lsn; let mut current_best_layers: Vec = Vec::new(); let mut iter = layers.into_iter(); loop { let Some(l) = iter.next_back() else { // Reached end. Accept the last candidate current_best_start_lsn = candidate_start_lsn; current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers)); break; }; trace!( "inspecting {} for candidate {}, current best {}", l.short_id(), candidate_start_lsn, current_best_start_lsn ); let r = l.lsn_range(); // Image layers don't restrict our choice of cutoff LSN if l.is_delta() { // Is this candidate workable? In other words, are there any // delta layers that span across this LSN // // Valid: Not valid: // + + // | | + // + <- candidate + | <- candidate // + + // | // + if r.end <= candidate_start_lsn { // Hooray, there are no crossing LSNs. And we have visited // through all the layers within candidate..end_lsn. The // current candidate can be accepted. current_best_start_lsn = r.end; current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers)); candidate_start_lsn = r.start; } // Is it small enough to be considered part of this level? if r.end.0 - r.start.0 > lsn_max_size { // Too large, this layer belongs to next level. Stop. trace!( "too large {}, size {} vs {}", l.short_id(), r.end.0 - r.start.0, lsn_max_size ); break; } // If this crosses the candidate lsn, push it down. 
if r.start < candidate_start_lsn { trace!( "layer {} prevents from stopping at {}", l.short_id(), candidate_start_lsn ); candidate_start_lsn = r.start; } } // Include this layer in our candidate candidate_layers.push(l); } Ok(if current_best_start_lsn == end_lsn { // empty level None } else { Some(Level { lsn_range: current_best_start_lsn..end_lsn, layers: current_best_layers, }) }) } impl Level { /// Count the number of deltas stacked on each other. pub fn depth(&self) -> u64 where K: CompactionKey, L: CompactionLayer, { struct Event { key: K, layer_idx: usize, start: bool, } let mut events: Vec> = Vec::new(); for (idx, l) in self.layers.iter().enumerate() { let key_range = l.key_range(); if key_range.end == key_range.start.next() && l.is_delta() { // Ignore single-key delta layers as they can be stacked on top of each other // as that is the only way to cut further. continue; } events.push(Event { key: l.key_range().start, layer_idx: idx, start: true, }); events.push(Event { key: l.key_range().end, layer_idx: idx, start: false, }); } events.sort_by_key(|e| (e.key, e.start)); // Sweep the key space left to right. Stop at each distinct key, and // count the number of deltas on top of the highest image at that key. // // This is a little inefficient, as we walk through the active_set on // every key. We could increment/decrement a counter on each step // instead, but that'd require a bit more complex bookkeeping. 
let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new(); let mut max_depth = 0; let mut events_iter = events.iter().peekable(); while let Some(e) = events_iter.next() { let l = &self.layers[e.layer_idx]; let is_image = !l.is_delta(); // update the active set if e.start { active_set.insert((l.lsn_range().end, is_image, e.layer_idx)); } else { active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx)); } // recalculate depth if this was the last event at this point let more_events_at_this_key = events_iter.peek().is_some_and(|next_e| next_e.key == e.key); if !more_events_at_this_key { let mut active_depth = 0; for (_end_lsn, is_image, _idx) in active_set.iter().rev() { if *is_image { break; } active_depth += 1; } if active_depth > max_depth { max_depth = active_depth; } } } debug_assert_eq!(active_set, BTreeSet::new()); max_depth } } #[cfg(test)] mod tests { use std::sync::{Arc, Mutex}; use super::*; use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer}; fn delta(key_range: Range, lsn_range: Range) -> MockLayer { MockLayer::Delta(Arc::new(MockDeltaLayer { key_range, lsn_range, // identify_level() doesn't pay attention to the rest of the fields file_size: 0, deleted: Mutex::new(false), records: vec![], })) } fn image(key_range: Range, lsn: Lsn) -> MockLayer { MockLayer::Image(Arc::new(MockImageLayer { key_range, lsn_range: lsn..(lsn + 1), // identify_level() doesn't pay attention to the rest of the fields file_size: 0, deleted: Mutex::new(false), })) } #[tokio::test] async fn test_identify_level() -> anyhow::Result<()> { let layers = vec![ delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)), delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)), delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)), delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)), ]; // All layers fit in the max file size let level = identify_level(layers.clone(), 
Lsn(0x10000), 0x2000) .await? .unwrap(); assert_eq!(level.depth(), 6); // Same LSN with smaller max file size. The second layer from the top is larger // and belongs to next level. let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000) .await? .unwrap(); assert_eq!(level.depth(), 1); // Call with a smaller LSN let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000) .await? .unwrap(); assert_eq!(level.depth(), 2); // Call with an LSN that doesn't partition the space let result = identify_level(layers, Lsn(0x6000), 0x1000).await; assert!(result.is_err()); Ok(()) } #[tokio::test] async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> { // The files LSN ranges overlap, so even though there are more files that // fit under the file size, they are not included in the level because they // overlap so that we'd need to include the oldest file, too, which is // larger let layers = vec![ delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)), delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger ]; let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000) .await? .unwrap(); assert_eq!(level.depth(), 1); Ok(()) } #[tokio::test] async fn test_depth_nonoverlapping() -> anyhow::Result<()> { // The key ranges don't overlap, so depth is only 1. let layers = vec![ delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)), delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)), delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)), ]; let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) .await? .unwrap(); assert_eq!(level.layers.len(), 3); assert_eq!(level.depth(), 1); // Staggered. The 1st and 3rd layer don't overlap with each other. 
let layers = vec![ delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)), delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)), delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)), ]; let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) .await? .unwrap(); assert_eq!(level.layers.len(), 3); assert_eq!(level.depth(), 2); Ok(()) } #[tokio::test] async fn test_depth_images() -> anyhow::Result<()> { let layers: Vec = vec![ delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)), delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)), delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)), // This covers the same key range as the 2nd delta layer. The depth // in that key range is therefore 0. image(1500..2500, Lsn(0x9000)), ]; let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) .await? .unwrap(); assert_eq!(level.layers.len(), 4); assert_eq!(level.depth(), 1); Ok(()) } } ================================================ FILE: pageserver/compaction/src/interface.rs ================================================ //! This is what the compaction implementation needs to know about //! layers, keyspace etc. //! //! All the heavy lifting is done by the create_image and create_delta //! functions that the implementor provides. use std::ops::Range; use futures::Future; use pageserver_api::key::Key; use pageserver_api::keyspace::ShardedRange; use pageserver_api::shard::ShardIdentity; use utils::lsn::Lsn; /// Public interface. This is the main thing that the implementor needs to provide pub trait CompactionJobExecutor { // Type system. // // We assume that there are two kinds of layers, deltas and images. The // compaction doesn't distinguish whether they are stored locally or // remotely. // // The keyspace is defined by the CompactionKey trait. type Key: CompactionKey; type Layer: CompactionLayer + Clone; type DeltaLayer: CompactionDeltaLayer + Clone; type ImageLayer: CompactionImageLayer + Clone; // This is passed through to all the interface functions. 
The compaction // implementation doesn't do anything with it, but it might be useful for // the interface implementation. type RequestContext: CompactionRequestContext; // ---- // Functions that the planner uses to support its decisions // ---- fn get_shard_identity(&self) -> &ShardIdentity; /// Return all layers that overlap the given bounding box. fn get_layers( &mut self, key_range: &Range, lsn_range: &Range, ctx: &Self::RequestContext, ) -> impl Future>> + Send; fn get_keyspace( &mut self, key_range: &Range, lsn: Lsn, ctx: &Self::RequestContext, ) -> impl Future>> + Send; /// NB: This is a pretty expensive operation. In the real pageserver /// implementation, it downloads the layer, and keeps it resident /// until the DeltaLayer is dropped. fn downcast_delta_layer( &self, layer: &Self::Layer, ctx: &Self::RequestContext, ) -> impl Future>> + Send; // ---- // Functions to execute the plan // ---- /// Create a new image layer, materializing all the values in the key range, /// at given 'lsn'. fn create_image( &mut self, lsn: Lsn, key_range: &Range, ctx: &Self::RequestContext, ) -> impl Future> + Send; /// Create a new delta layer, containing all the values from 'input_layers' /// in the given key and LSN range. fn create_delta( &mut self, lsn_range: &Range, key_range: &Range, input_layers: &[Self::DeltaLayer], ctx: &Self::RequestContext, ) -> impl Future> + Send; /// Delete a layer. The compaction implementation will call this only after /// all the create_image() or create_delta() calls that deletion of this /// layer depends on have finished. But if the implementor has extra lazy /// background tasks, like uploading the index json file to remote storage. /// it is the implementation's responsibility to track those. 
fn delete_layer( &mut self, layer: &Self::Layer, ctx: &Self::RequestContext, ) -> impl Future> + Send; } pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display { const MIN: Self; const MAX: Self; /// Calculate distance between key_range.start and key_range.end. /// /// This returns u32, for compatibility with Repository::key. If the /// distance is larger, return u32::MAX. fn key_range_size(key_range: &Range, shard_identity: &ShardIdentity) -> u32; // return "self + 1" fn next(&self) -> Self; // return "self + ". The amount to skip // is left to the implementation. // FIXME: why not just "add(u32)" ? This is hard to use fn skip_some(&self) -> Self; } impl CompactionKey for Key { const MIN: Self = Self::MIN; const MAX: Self = Self::MAX; fn key_range_size(r: &std::ops::Range, shard_identity: &ShardIdentity) -> u32 { ShardedRange::new(r.clone(), shard_identity).page_count() } fn next(&self) -> Key { (self as &Key).next() } fn skip_some(&self) -> Key { self.add(128) } } /// Contiguous ranges of keys that belong to the key space. In key order, and /// with no overlap. pub type CompactionKeySpace = Vec>; /// Functions needed from all layers. pub trait CompactionLayer { fn key_range(&self) -> &Range; fn lsn_range(&self) -> &Range; fn file_size(&self) -> u64; /// For debugging, short human-readable representation of the layer. E.g. filename. fn short_id(&self) -> String; fn is_delta(&self) -> bool; } pub trait CompactionDeltaLayer: CompactionLayer { type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key> where Self: 'a; /// Return all keys in this delta layer. 
fn load_keys( &self, ctx: &E::RequestContext, ) -> impl Future>>> + Send; } pub trait CompactionImageLayer: CompactionLayer {} pub trait CompactionDeltaEntry<'a, K> { fn key(&self) -> K; fn lsn(&self) -> Lsn; fn size(&self) -> u64; } pub trait CompactionRequestContext {} ================================================ FILE: pageserver/compaction/src/lib.rs ================================================ // The main module implementing the compaction algorithm pub mod compact_tiered; pub(crate) mod identify_levels; // Traits that the caller of the compaction needs to implement pub mod interface; // Utility functions, useful for the implementation pub mod helpers; // A simulator with mock implementations of 'interface' pub mod simulator; ================================================ FILE: pageserver/compaction/src/simulator/draw.rs ================================================ use std::cmp::Ordering; use std::collections::{BTreeMap, BTreeSet, HashSet}; use std::fmt::Write; use std::ops::Range; use anyhow::Result; use svg_fmt::{BeginSvg, EndSvg, Fill, Stroke, Style, rgb}; use utils::lsn::Lsn; use super::Key; // Map values to their compressed coordinate - the index the value // would have in a sorted and deduplicated list of all values. struct CoordinateMap { map: BTreeMap, stretch: f32, } impl CoordinateMap { fn new(coords: Vec, stretch: f32) -> Self { let set: BTreeSet = coords.into_iter().collect(); let mut map: BTreeMap = BTreeMap::new(); for (i, e) in set.iter().enumerate() { map.insert(*e, i); } Self { map, stretch } } // This assumes that the map contains an exact point for this. 
// Use map_inexact for values inbetween fn map(&self, val: T) -> f32 { *self.map.get(&val).unwrap() as f32 * self.stretch } // the value is still assumed to be within the min/max bounds // (this is currently unused) fn _map_inexact(&self, val: T) -> f32 { let prev = *self.map.range(..=val).next().unwrap().1; let next = *self.map.range(val..).next().unwrap().1; // interpolate (prev as f32 + (next - prev) as f32) * self.stretch } fn max(&self) -> f32 { self.map.len() as f32 * self.stretch } } #[derive(PartialEq, Hash, Eq)] pub enum LayerTraceOp { Flush, CreateDelta, CreateImage, Delete, } impl std::fmt::Display for LayerTraceOp { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { let op_str = match self { LayerTraceOp::Flush => "flush", LayerTraceOp::CreateDelta => "create_delta", LayerTraceOp::CreateImage => "create_image", LayerTraceOp::Delete => "delete", }; f.write_str(op_str) } } #[derive(PartialEq, Hash, Eq, Clone)] pub struct LayerTraceFile { pub filename: String, pub key_range: Range, pub lsn_range: Range, } impl LayerTraceFile { fn is_image(&self) -> bool { self.lsn_range.end == self.lsn_range.start } } pub struct LayerTraceEvent { pub time_rel: u64, pub op: LayerTraceOp, pub file: LayerTraceFile, } pub fn draw_history(history: &[LayerTraceEvent], mut output: W) -> Result<()> { let mut files: Vec = Vec::new(); for event in history { files.push(event.file.clone()); } let last_time_rel = history.last().unwrap().time_rel; // Collect all coordinates let mut keys: Vec = vec![]; let mut lsns: Vec = vec![]; for f in files.iter() { keys.push(f.key_range.start); keys.push(f.key_range.end); lsns.push(f.lsn_range.start); lsns.push(f.lsn_range.end); } // Analyze let key_map = CoordinateMap::new(keys, 2.0); // Stretch out vertically for better visibility let lsn_map = CoordinateMap::new(lsns, 3.0); let mut svg = String::new(); // Draw writeln!( svg, "{}", BeginSvg { w: key_map.max(), h: lsn_map.max(), } )?; let lsn_max = lsn_map.max(); // 
Sort the files by LSN, but so that image layers go after all delta layers // The SVG is painted in the order the elements appear, and we want to draw // image layers on top of the delta layers if they overlap // // (This could also be implemented via z coordinates: image layers get one z // coord, delta layers get another z coord.) let mut files_sorted: Vec = files.into_iter().collect(); files_sorted.sort_by(|a, b| { if a.is_image() && !b.is_image() { Ordering::Greater } else if !a.is_image() && b.is_image() { Ordering::Less } else { a.lsn_range.end.cmp(&b.lsn_range.end) } }); writeln!(svg, "")?; let mut files_seen = HashSet::new(); for f in files_sorted { if files_seen.contains(&f) { continue; } let key_start = key_map.map(f.key_range.start); let key_end = key_map.map(f.key_range.end); let key_diff = key_end - key_start; if key_start >= key_end { panic!("Invalid key range {key_start}-{key_end}"); } let lsn_start = lsn_map.map(f.lsn_range.start); let lsn_end = lsn_map.map(f.lsn_range.end); // Fill in and thicken rectangle if it's an // image layer so that we can see it. 
let mut style = Style { fill: Fill::Color(rgb(0x80, 0x80, 0x80)), stroke: Stroke::Color(rgb(0, 0, 0), 0.5), opacity: 1.0, stroke_opacity: 1.0, }; let y_start = lsn_max - lsn_start; let y_end = lsn_max - lsn_end; let x_margin = 0.25; let y_margin = 0.5; match f.lsn_range.start.cmp(&f.lsn_range.end) { Ordering::Less => { write!( svg, r#" "#, f.filename, key_start + x_margin, y_end + y_margin, key_diff - x_margin * 2.0, y_start - y_end - y_margin * 2.0, 1.0, // border_radius, style, )?; write!(svg, "{}", f.filename)?; writeln!(svg, "")?; } Ordering::Equal => { //lsn_diff = 0.3; //lsn_offset = -lsn_diff / 2.0; //margin = 0.05; style.fill = Fill::Color(rgb(0x80, 0, 0x80)); style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0); write!( svg, r#" "#, f.filename, key_start + x_margin, y_end, key_end - x_margin, y_end, style, )?; write!( svg, "{}<br>{} - {}", f.filename, lsn_end, y_end )?; writeln!(svg, "")?; } Ordering::Greater => panic!("Invalid lsn range {lsn_start}-{lsn_end}"), } files_seen.insert(f); } writeln!(svg, "{EndSvg}")?; let mut layer_events_str = String::new(); let mut first = true; for e in history { if !first { writeln!(layer_events_str, ",")?; } write!( layer_events_str, r#" {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#, e.time_rel, e.file.filename, e.op )?; first = false; } writeln!(layer_events_str)?; writeln!( output, r#"

{svg}
"# )?; Ok(()) } ================================================ FILE: pageserver/compaction/src/simulator.rs ================================================ mod draw; use std::fmt::Write; use std::ops::Range; use std::sync::{Arc, Mutex}; use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; use futures::StreamExt; use pageserver_api::shard::ShardIdentity; use rand::Rng; use tracing::info; use utils::lsn::Lsn; use crate::helpers::{PAGE_SZ, merge_delta_keys, overlaps_with}; use crate::interface; use crate::interface::CompactionLayer; // // Implementation for the CompactionExecutor interface // pub struct MockTimeline { // Parameters for the compaction algorithm pub target_file_size: u64, tiers_per_level: u64, num_l0_flushes: u64, last_compact_at_flush: u64, last_flush_lsn: Lsn, // In-memory layer records: Vec, total_len: u64, start_lsn: Lsn, end_lsn: Lsn, // Current keyspace at `end_lsn`. This is updated on every ingested record. keyspace: KeySpace, // historic keyspaces old_keyspaces: Vec<(Lsn, KeySpace)>, // "on-disk" layers pub live_layers: Vec, num_deleted_layers: u64, // Statistics wal_ingested: u64, bytes_written: u64, bytes_deleted: u64, layers_created: u64, layers_deleted: u64, // All the events - creation and deletion of files - are collected // in 'history'. It is used to draw the SVG animation at the end. 
time: u64, history: Vec, } type KeySpace = interface::CompactionKeySpace; pub struct MockRequestContext {} impl interface::CompactionRequestContext for MockRequestContext {} pub type Key = u64; impl interface::CompactionKey for Key { const MIN: Self = u64::MIN; const MAX: Self = u64::MAX; fn key_range_size(key_range: &Range, _shard_identity: &ShardIdentity) -> u32 { std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32 } fn next(&self) -> Self { self + 1 } fn skip_some(&self) -> Self { // round up to next xx self + 100 } } #[derive(Clone)] pub struct MockRecord { lsn: Lsn, key: Key, len: u64, } impl interface::CompactionDeltaEntry<'_, Key> for MockRecord { fn key(&self) -> Key { self.key } fn lsn(&self) -> Lsn { self.lsn } fn size(&self) -> u64 { self.len } } pub struct MockDeltaLayer { pub key_range: Range, pub lsn_range: Range, pub file_size: u64, pub deleted: Mutex, pub records: Vec, } impl interface::CompactionLayer for Arc { fn key_range(&self) -> &Range { &self.key_range } fn lsn_range(&self) -> &Range { &self.lsn_range } fn file_size(&self) -> u64 { self.file_size } fn short_id(&self) -> String { format!( "{:016X}-{:016X}__{:08X}-{:08X}", self.key_range.start, self.key_range.end, self.lsn_range.start.0, self.lsn_range.end.0 ) } fn is_delta(&self) -> bool { true } } impl interface::CompactionDeltaLayer for Arc { type DeltaEntry<'a> = MockRecord; async fn load_keys(&self, _ctx: &MockRequestContext) -> anyhow::Result> { Ok(self.records.clone()) } } pub struct MockImageLayer { pub key_range: Range, pub lsn_range: Range, pub file_size: u64, pub deleted: Mutex, } impl interface::CompactionImageLayer for Arc {} impl interface::CompactionLayer for Arc { fn key_range(&self) -> &Range { &self.key_range } fn lsn_range(&self) -> &Range { &self.lsn_range } fn file_size(&self) -> u64 { self.file_size } fn short_id(&self) -> String { format!( "{:016X}-{:016X}__{:08X}", self.key_range.start, self.key_range.end, self.lsn_range.start.0, ) } fn is_delta(&self) 
-> bool { false } } impl MockTimeline { pub fn new() -> Self { MockTimeline { target_file_size: 256 * 1024 * 1024, tiers_per_level: 4, num_l0_flushes: 0, last_compact_at_flush: 0, last_flush_lsn: Lsn(0), records: Vec::new(), total_len: 0, start_lsn: Lsn(1000), end_lsn: Lsn(1000), keyspace: KeySpace::new(), old_keyspaces: vec![], live_layers: vec![], num_deleted_layers: 0, wal_ingested: 0, bytes_written: 0, bytes_deleted: 0, layers_created: 0, layers_deleted: 0, time: 0, history: Vec::new(), } } pub async fn compact(&mut self) -> anyhow::Result<()> { let ctx = MockRequestContext {}; crate::compact_tiered::compact_tiered( self, self.last_flush_lsn, self.target_file_size, self.tiers_per_level, &ctx, ) .await?; Ok(()) } // Ingest one record to the timeline pub fn ingest_record(&mut self, key: Key, len: u64) { self.records.push(MockRecord { lsn: self.end_lsn, key, len, }); self.total_len += len; self.end_lsn += len; if self.total_len > self.target_file_size { self.flush_l0(); } } pub async fn compact_if_needed(&mut self) -> anyhow::Result<()> { if self.num_l0_flushes - self.last_compact_at_flush >= self.tiers_per_level { self.compact().await?; self.last_compact_at_flush = self.num_l0_flushes; } Ok(()) } pub fn flush_l0(&mut self) { if self.records.is_empty() { return; } let mut records = std::mem::take(&mut self.records); records.sort_by_key(|rec| rec.key); let lsn_range = self.start_lsn..self.end_lsn; let new_layer = Arc::new(MockDeltaLayer { key_range: Key::MIN..Key::MAX, lsn_range: lsn_range.clone(), file_size: self.total_len, records, deleted: Mutex::new(false), }); info!("flushed L0 layer {}", new_layer.short_id()); self.live_layers.push(MockLayer::from(&new_layer)); // reset L0 self.start_lsn = self.end_lsn; self.total_len = 0; self.records = Vec::new(); self.layers_created += 1; self.bytes_written += new_layer.file_size; self.time += 1; self.history.push(LayerTraceEvent { time_rel: self.time, op: LayerTraceOp::Flush, file: LayerTraceFile { filename: 
new_layer.short_id(), key_range: new_layer.key_range.clone(), lsn_range: new_layer.lsn_range.clone(), }, }); self.num_l0_flushes += 1; self.last_flush_lsn = self.end_lsn; } // Ingest `num_records' records to the timeline, with random keys // uniformly distributed in `key_range` pub fn ingest_uniform( &mut self, num_records: u64, len: u64, key_range: &Range, ) -> anyhow::Result<()> { crate::helpers::union_to_keyspace(&mut self.keyspace, vec![key_range.clone()]); let mut rng = rand::rng(); for _ in 0..num_records { self.ingest_record(rng.random_range(key_range.clone()), len); self.wal_ingested += len; } Ok(()) } pub fn stats(&self) -> anyhow::Result { let mut s = String::new(); writeln!(s, "STATISTICS:")?; writeln!( s, "WAL ingested: {:>10} MB", self.wal_ingested / (1024 * 1024) )?; writeln!( s, "size created: {:>10} MB", self.bytes_written / (1024 * 1024) )?; writeln!( s, "size deleted: {:>10} MB", self.bytes_deleted / (1024 * 1024) )?; writeln!(s, "files created: {:>10}", self.layers_created)?; writeln!(s, "files deleted: {:>10}", self.layers_deleted)?; writeln!( s, "write amp: {:>10.2}", self.bytes_written as f64 / self.wal_ingested as f64 )?; writeln!( s, "storage amp: {:>10.2}", (self.bytes_written - self.bytes_deleted) as f64 / self.wal_ingested as f64 )?; Ok(s) } pub fn draw_history(&self, output: W) -> anyhow::Result<()> { draw::draw_history(&self.history, output) } } impl Default for MockTimeline { fn default() -> Self { Self::new() } } #[derive(Clone)] pub enum MockLayer { Delta(Arc), Image(Arc), } impl interface::CompactionLayer for MockLayer { fn key_range(&self) -> &Range { match self { MockLayer::Delta(this) => this.key_range(), MockLayer::Image(this) => this.key_range(), } } fn lsn_range(&self) -> &Range { match self { MockLayer::Delta(this) => this.lsn_range(), MockLayer::Image(this) => this.lsn_range(), } } fn file_size(&self) -> u64 { match self { MockLayer::Delta(this) => this.file_size, MockLayer::Image(this) => this.file_size, } } fn 
short_id(&self) -> String { match self { MockLayer::Delta(this) => this.short_id(), MockLayer::Image(this) => this.short_id(), } } fn is_delta(&self) -> bool { match self { MockLayer::Delta(_) => true, MockLayer::Image(_) => false, } } } impl MockLayer { fn is_deleted(&self) -> bool { let guard = match self { MockLayer::Delta(this) => this.deleted.lock().unwrap(), MockLayer::Image(this) => this.deleted.lock().unwrap(), }; *guard } fn mark_deleted(&self) { let mut deleted_guard = match self { MockLayer::Delta(this) => this.deleted.lock().unwrap(), MockLayer::Image(this) => this.deleted.lock().unwrap(), }; assert!(!*deleted_guard, "layer already deleted"); *deleted_guard = true; } } impl From<&Arc> for MockLayer { fn from(l: &Arc) -> Self { MockLayer::Delta(l.clone()) } } impl From<&Arc> for MockLayer { fn from(l: &Arc) -> Self { MockLayer::Image(l.clone()) } } impl interface::CompactionJobExecutor for MockTimeline { type Key = Key; type Layer = MockLayer; type DeltaLayer = Arc; type ImageLayer = Arc; type RequestContext = MockRequestContext; fn get_shard_identity(&self) -> &ShardIdentity { static IDENTITY: ShardIdentity = ShardIdentity::unsharded(); &IDENTITY } async fn get_layers( &mut self, key_range: &Range, lsn_range: &Range, _ctx: &Self::RequestContext, ) -> anyhow::Result> { // Clear any deleted layers from our vec self.live_layers.retain(|l| !l.is_deleted()); let layers: Vec = self .live_layers .iter() .filter(|l| { overlaps_with(l.lsn_range(), lsn_range) && overlaps_with(l.key_range(), key_range) }) .cloned() .collect(); Ok(layers) } async fn get_keyspace( &mut self, key_range: &Range, _lsn: Lsn, _ctx: &Self::RequestContext, ) -> anyhow::Result> { // find it in the levels if self.old_keyspaces.is_empty() { Ok(crate::helpers::intersect_keyspace( &self.keyspace, key_range, )) } else { // not implemented // The mock implementation only allows requesting the // keyspace at the level's end LSN. That's all that the // current implementation needs. 
panic!("keyspace not available for requested lsn"); } } async fn downcast_delta_layer( &self, layer: &MockLayer, _ctx: &MockRequestContext, ) -> anyhow::Result>> { Ok(match layer { MockLayer::Delta(l) => Some(l.clone()), MockLayer::Image(_) => None, }) } async fn create_image( &mut self, lsn: Lsn, key_range: &Range, ctx: &MockRequestContext, ) -> anyhow::Result<()> { let keyspace = self.get_keyspace(key_range, lsn, ctx).await?; let mut accum_size: u64 = 0; for r in keyspace { accum_size += r.end - r.start; } let new_layer = Arc::new(MockImageLayer { key_range: key_range.clone(), lsn_range: lsn..lsn, file_size: accum_size * PAGE_SZ, deleted: Mutex::new(false), }); info!( "created image layer, size {}: {}", new_layer.file_size, new_layer.short_id() ); self.live_layers.push(MockLayer::Image(new_layer.clone())); // update stats self.bytes_written += new_layer.file_size; self.layers_created += 1; self.time += 1; self.history.push(LayerTraceEvent { time_rel: self.time, op: LayerTraceOp::CreateImage, file: LayerTraceFile { filename: new_layer.short_id(), key_range: new_layer.key_range.clone(), lsn_range: new_layer.lsn_range.clone(), }, }); Ok(()) } async fn create_delta( &mut self, lsn_range: &Range, key_range: &Range, input_layers: &[Arc], ctx: &MockRequestContext, ) -> anyhow::Result<()> { let mut key_value_stream = std::pin::pin!(merge_delta_keys::(input_layers, ctx)); let mut records: Vec = Vec::new(); let mut total_len = 2; while let Some(delta_entry) = key_value_stream.next().await { let delta_entry: MockRecord = delta_entry?; if key_range.contains(&delta_entry.key) && lsn_range.contains(&delta_entry.lsn) { total_len += delta_entry.len; records.push(delta_entry); } } let total_records = records.len(); let new_layer = Arc::new(MockDeltaLayer { key_range: key_range.clone(), lsn_range: lsn_range.clone(), file_size: total_len, records, deleted: Mutex::new(false), }); info!( "created delta layer, recs {}, size {}: {}", total_records, total_len, new_layer.short_id() ); 
self.live_layers.push(MockLayer::Delta(new_layer.clone())); // update stats self.bytes_written += total_len; self.layers_created += 1; self.time += 1; self.history.push(LayerTraceEvent { time_rel: self.time, op: LayerTraceOp::CreateDelta, file: LayerTraceFile { filename: new_layer.short_id(), key_range: new_layer.key_range.clone(), lsn_range: new_layer.lsn_range.clone(), }, }); Ok(()) } async fn delete_layer( &mut self, layer: &Self::Layer, _ctx: &MockRequestContext, ) -> anyhow::Result<()> { let layer = std::pin::pin!(layer); info!("deleting layer: {}", layer.short_id()); self.num_deleted_layers += 1; self.bytes_deleted += layer.file_size(); layer.mark_deleted(); self.time += 1; self.history.push(LayerTraceEvent { time_rel: self.time, op: LayerTraceOp::Delete, file: LayerTraceFile { filename: layer.short_id(), key_range: layer.key_range().clone(), lsn_range: layer.lsn_range().clone(), }, }); Ok(()) } } ================================================ FILE: pageserver/compaction/tests/tests.rs ================================================ use once_cell::sync::OnceCell; use pageserver_compaction::interface::CompactionLayer; use pageserver_compaction::simulator::MockTimeline; use utils::logging; static LOG_HANDLE: OnceCell<()> = OnceCell::new(); pub(crate) fn setup_logging() { LOG_HANDLE.get_or_init(|| { logging::init( logging::LogFormat::Test, logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, logging::Output::Stdout, ) .expect("Failed to init test logging"); }); } /// Test the extreme case that there are so many updates for a single key that /// even if we produce an extremely narrow delta layer, spanning just that one /// key, we still too many records to fit in the target file size. We need to /// split in the LSN dimension too in that case. #[tokio::test] async fn test_many_updates_for_single_key() { setup_logging(); let mut executor = MockTimeline::new(); executor.target_file_size = 1_000_000; // 1 MB // Ingest 10 MB of updates to a single key. 
for _ in 1..1000 { executor.ingest_uniform(100, 10, &(0..100_000)).unwrap(); executor.ingest_uniform(1000, 10, &(0..1)).unwrap(); executor.compact().await.unwrap(); } // Check that all the layers are smaller than the target size (with some slop) for l in executor.live_layers.iter() { println!("layer {}: {}", l.short_id(), l.file_size()); } for l in executor.live_layers.iter() { assert!(l.file_size() < executor.target_file_size * 2); // Sanity check that none of the delta layers are empty either. if l.is_delta() { assert!(l.file_size() > 0); } } } #[tokio::test] async fn test_simple_updates() { setup_logging(); let mut executor = MockTimeline::new(); executor.target_file_size = 500_000; // 500 KB // Ingest some traffic. for _ in 1..400 { executor.ingest_uniform(100, 500, &(0..100_000)).unwrap(); } for l in executor.live_layers.iter() { println!("layer {}: {}", l.short_id(), l.file_size()); } println!("Running compaction..."); executor.compact().await.unwrap(); for l in executor.live_layers.iter() { println!("layer {}: {}", l.short_id(), l.file_size()); } } ================================================ FILE: pageserver/ctl/Cargo.toml ================================================ [package] name = "pagectl" version = "0.1.0" edition.workspace = true license.workspace = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] anyhow.workspace = true bincode.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } humantime.workspace = true itertools.workspace = true pageserver = { path = ".." 
} pageserver_api.workspace = true remote_storage = { path = "../../libs/remote_storage" } postgres_ffi.workspace = true serde.workspace = true thiserror.workspace = true tokio.workspace = true tokio-util.workspace = true utils.workspace = true svg_fmt.workspace = true workspace_hack.workspace = true serde_json.workspace = true ================================================ FILE: pageserver/ctl/src/download_remote_object.rs ================================================ use camino::Utf8PathBuf; use clap::Parser; use tokio_util::sync::CancellationToken; /// Download a specific object from remote storage to a local file. /// /// The remote storage configuration is supplied via the `REMOTE_STORAGE_CONFIG` environment /// variable, in the same TOML format that the pageserver itself understands. This allows the /// command to work with any cloud supported by the `remote_storage` crate (currently AWS S3, /// Azure Blob Storage and local files), as long as the credentials are available via the /// standard environment variables expected by the underlying SDKs. /// /// Examples for setting the environment variable: /// /// ```bash /// # AWS S3 (region can also be provided via AWS_REGION) /// export REMOTE_STORAGE_CONFIG='remote_storage = { bucket_name = "my-bucket", bucket_region = "us-east-2" }' /// /// # Azure Blob Storage (account key picked up from AZURE_STORAGE_ACCOUNT_KEY) /// export REMOTE_STORAGE_CONFIG='remote_storage = { container = "my-container", account = "my-account" }' /// ``` #[derive(Parser)] pub(crate) struct DownloadRemoteObjectCmd { /// Key / path of the object to download (relative to the remote storage prefix). /// /// Examples: /// "wal/3aa8f.../00000001000000000000000A" /// "pageserver/v1/tenants//timelines//layer_12345" pub remote_path: String, /// Path of the local file to create. Existing file will be overwritten. 
/// /// Examples: /// "./segment" /// "/tmp/layer_12345.parquet" pub output_file: Utf8PathBuf, } pub(crate) async fn main(cmd: &DownloadRemoteObjectCmd) -> anyhow::Result<()> { use remote_storage::{DownloadOpts, GenericRemoteStorage, RemotePath, RemoteStorageConfig}; // Fetch remote storage configuration from the environment let config_str = std::env::var("REMOTE_STORAGE_CONFIG").map_err(|_| { anyhow::anyhow!( "'REMOTE_STORAGE_CONFIG' environment variable must be set to a valid remote storage TOML config" ) })?; let config = RemoteStorageConfig::from_toml_str(&config_str)?; // Initialise remote storage client let storage = GenericRemoteStorage::from_config(&config).await?; // RemotePath must be relative – leading slashes confuse the parser. let remote_path_str = cmd.remote_path.trim_start_matches('/'); let remote_path = RemotePath::from_string(remote_path_str)?; let cancel = CancellationToken::new(); println!( "Downloading '{remote_path}' from remote storage bucket {:?} ...", config.storage.bucket_name() ); // Start the actual download let download = storage .download(&remote_path, &DownloadOpts::default(), &cancel) .await?; // Stream to file let mut reader = tokio_util::io::StreamReader::new(download.download_stream); let tmp_path = cmd.output_file.with_extension("tmp"); let mut file = tokio::fs::File::create(&tmp_path).await?; tokio::io::copy(&mut reader, &mut file).await?; file.sync_all().await?; // Atomically move into place tokio::fs::rename(&tmp_path, &cmd.output_file).await?; println!( "Downloaded to '{}'. Last modified: {:?}, etag: {}", cmd.output_file, download.last_modified, download.etag ); Ok(()) } ================================================ FILE: pageserver/ctl/src/draw_timeline_dir.rs ================================================ //! A tool for visualizing the arrangement of layerfiles within a timeline. //! //! It reads filenames from stdin and prints a svg on stdout. The image is a plot in //! 
page-lsn space, where every delta layer is a rectangle and every image layer is a //! thick line. Legend: //! - The x axis (left to right) represents page index. //! - The y axis represents LSN, growing upwards. //! //! Coordinates in both axes are compressed for better readability. //! (see `build_coordinate_compression_map` below) //! //! The plain text API was chosen so that we can easily work with filenames from various //! sources; see the Usage section below for examples. //! //! # Usage //! //! ## Producing the SVG //! //! ```bash //! //! # local timeline dir //! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ //! grep "__" | cargo run --release --bin pagectl draw-timeline > out.svg //! //! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer` //! (jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg //! //! # From an `index_part.json` in S3 //! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg //! //! # enrich with lines for gc_cutoff and a child branch point //! cat <(jq -r '.historic_layers[] | .layer_file_name' < layers.json) <(echo -e 'gc_cutoff:0000001CE3FE32C9\nbranch:0000001DE3FE32C9') | cargo run --bin pagectl draw-timeline >| out.svg //! ``` //! //! ## Viewing //! //! **Inkscape** is better than the built-in viewers in browsers. //! //! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X) //! to see the layer file name in the comment field. //! //! ```bash //! //! # Linux //! inkscape out.svg //! //! # macOS //! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg //! //! ``` //!
use std::cmp::Ordering;
use std::collections::{BTreeMap, BTreeSet};
use std::io::{self, BufRead};
use std::ops::Range;
use std::path::PathBuf;
use std::str::FromStr;

use anyhow::{Context, Result};
use pageserver_api::key::Key;
use svg_fmt::{BeginSvg, EndSvg, Fill, Stroke, rectangle, rgb};
use utils::lsn::Lsn;
use utils::project_git_version;

project_git_version!(GIT_VERSION);

/// Map values to their compressed coordinate - the index the value
/// would have in a sorted and deduplicated list of all values.
///
/// Duplicates in `coords` collapse to a single entry, so the returned map has
/// one key per distinct input value and its values are `0..map.len()`.
fn build_coordinate_compression_map<T: Ord>(coords: Vec<T>) -> BTreeMap<T, usize> {
    // The BTreeSet both sorts and deduplicates; enumerating it in order
    // assigns each distinct value its rank.
    let set: BTreeSet<T> = coords.into_iter().collect();

    set.into_iter()
        .enumerate()
        .map(|(rank, value)| (value, rank))
        .collect()
}

/// Parse a layer file name into its key range and LSN range.
///
/// The name is expected to look like `<key_start>-<key_end>__<lsn part>`,
/// where the LSN part is either a single LSN or `<lsn_start>-<lsn_end>`,
/// optionally followed by `-v<N>` and/or an 8-character generation suffix,
/// and optionally by a `.`-separated temporary-file extension.
///
/// When only one LSN is present it is used for both ends of the returned
/// range, i.e. the range is empty.
///
/// # Panics
///
/// Panics if the name has no `__` separator, if the key part has no `-`
/// separator, or if any key / LSN component is not valid hex.
fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
    let split: Vec<&str> = name.split("__").collect();
    let keys: Vec<&str> = split[0].split('-').collect();

    // Remove the temporary file extension, e.g., remove the `.d20a.___temp` part from the following filename:
    // 000000067F000040490000404A00441B0000-000000067F000040490000404A00441B4000__000043483A34CE00.d20a.___temp
    let lsns = split[1].split('.').collect::<Vec<&str>>()[0];
    let mut lsns: Vec<&str> = lsns.split('-').collect();

    // The current format of the layer file name: 000000067F0000000400000B150100000000-000000067F0000000400000D350100000000__00000000014B7AC8-v1-00000001

    // Handle generation number `-00000001` part
    if lsns.last().expect("should").len() == 8 {
        lsns.pop();
    }

    // Handle version number `-v1` part
    if lsns.last().expect("should").starts_with('v') {
        lsns.pop();
    }

    // A single LSN means start == end; duplicate it so that the range
    // construction below can index both ends uniformly.
    if lsns.len() == 1 {
        lsns.push(lsns[0]);
    }

    let keys = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap();
    let lsns = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap();
    (keys, lsns)
}

/// Kind of an extra horizontal marker line that can be supplied on stdin
/// in addition to layer file names.
#[derive(Clone, Copy)]
enum LineKind {
    GcCutoff,
    Branch,
}

/// Pick the colour used to draw each kind of marker line:
/// red for `GcCutoff`, green for `Branch`.
impl From<LineKind> for Fill {
    fn from(value: LineKind) -> Self {
        match value {
            LineKind::GcCutoff => Fill::Color(rgb(255, 0, 0)),
            LineKind::Branch => Fill::Color(rgb(0, 255, 0)),
        }
    }
}
impl FromStr for LineKind { type Err = anyhow::Error; fn from_str(s: &str) -> std::prelude::v1::Result { Ok(match s { "gc_cutoff" => LineKind::GcCutoff, "branch" => LineKind::Branch, _ => anyhow::bail!("unsupported linekind: {s}"), }) } } pub fn main() -> Result<()> { // Parse layer filenames from stdin struct Layer { filename: String, key_range: Range, lsn_range: Range, } let mut files: Vec = vec![]; let stdin = io::stdin(); let mut lines: Vec<(Lsn, LineKind)> = vec![]; for (lineno, line) in stdin.lock().lines().enumerate() { let lineno = lineno + 1; let line = line.unwrap(); if let Some((kind, lsn)) = line.split_once(':') { let (kind, lsn) = LineKind::from_str(kind) .context("parse kind") .and_then(|kind| { if lsn.contains('/') { Lsn::from_str(lsn) } else { Lsn::from_hex(lsn) } .map(|lsn| (kind, lsn)) .context("parse lsn") }) .with_context(|| format!("parse {line:?} on {lineno}"))?; lines.push((lsn, kind)); continue; } let line = PathBuf::from_str(&line).unwrap(); let filename = line.file_name().unwrap(); let filename = filename.to_str().unwrap(); let (key_range, lsn_range) = parse_filename(filename); files.push(Layer { filename: filename.to_owned(), key_range, lsn_range, }); } // Collect all coordinates let mut keys: Vec = Vec::with_capacity(files.len()); let mut lsns: Vec = Vec::with_capacity(files.len() + lines.len()); for Layer { key_range: keyr, lsn_range: lsnr, .. 
} in &files { keys.push(keyr.start); keys.push(keyr.end); lsns.push(lsnr.start); lsns.push(lsnr.end); } lsns.extend(lines.iter().map(|(lsn, _)| *lsn)); // Analyze let key_map = build_coordinate_compression_map(keys); let lsn_map = build_coordinate_compression_map(lsns); // Initialize stats let mut num_deltas = 0; let mut num_images = 0; // Draw let stretch = 3.0; // Stretch out vertically for better visibility println!( "{}", BeginSvg { w: (key_map.len() + 10) as f32, h: stretch * lsn_map.len() as f32 } ); let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas for Layer { filename, key_range: keyr, lsn_range: lsnr, } in &files { let key_start = *key_map.get(&keyr.start).unwrap(); let key_end = *key_map.get(&keyr.end).unwrap(); let key_diff = key_end - key_start; let lsn_max = lsn_map.len(); if key_start >= key_end { panic!("Invalid key range {key_start}-{key_end}"); } let lsn_start = *lsn_map.get(&lsnr.start).unwrap(); let lsn_end = *lsn_map.get(&lsnr.end).unwrap(); let mut lsn_diff = (lsn_end - lsn_start) as f32; let mut fill = Fill::None; let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas let mut lsn_offset = 0.0; // Fill in and thicken rectangle if it's an // image layer so that we can see it. 
match lsn_start.cmp(&lsn_end) { Ordering::Less => num_deltas += 1, Ordering::Equal => { num_images += 1; lsn_diff = 0.3; lsn_offset = -lsn_diff / 2.0; ymargin = 0.05; fill = Fill::Color(rgb(0, 0, 0)); } Ordering::Greater => panic!("Invalid lsn range {lsn_start}-{lsn_end}"), } println!( " {}", rectangle( 5.0 + key_start as f32 + stretch * xmargin, stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)), key_diff as f32 - stretch * 2.0 * xmargin, stretch * (lsn_diff - 2.0 * ymargin) ) .fill(fill) .stroke(Stroke::Color(rgb(0, 0, 0), 0.1)) .border_radius(0.4) .comment(filename) ); } for (lsn, kind) in lines { let lsn_start = *lsn_map.get(&lsn).unwrap(); let lsn_end = lsn_start; let stretch = 2.0; let lsn_diff = 0.3; let lsn_offset = -lsn_diff / 2.0; let ymargin = 0.05; println!( "{}", rectangle( 0.0f32 + stretch * xmargin, stretch * (lsn_map.len() as f32 - (lsn_end as f32 - ymargin - lsn_offset)), (key_map.len() + 10) as f32, stretch * (lsn_diff - 2.0 * ymargin) ) .fill(kind) ); } println!("{EndSvg}"); eprintln!("num_images: {num_images}"); eprintln!("num_deltas: {num_deltas}"); Ok(()) } ================================================ FILE: pageserver/ctl/src/index_part.rs ================================================ use std::str::FromStr; use anyhow::{Context, Ok}; use camino::Utf8PathBuf; use pageserver::tenant::{ IndexPart, layer_map::{LayerMap, SearchResult}, remote_timeline_client::{index::LayerFileMetadata, remote_layer_path}, storage_layer::{LayerName, LayerVisibilityHint, PersistentLayerDesc, ReadableLayerWeak}, }; use pageserver_api::key::Key; use serde::Serialize; use std::collections::BTreeMap; use utils::{ id::{TenantId, TimelineId}, lsn::Lsn, shard::TenantShardId, }; #[derive(clap::Subcommand)] pub(crate) enum IndexPartCmd { Dump { path: Utf8PathBuf, }, /// Find all layers that need to be searched to construct the given page at the given LSN. 
Search { #[arg(long)] tenant_id: String, #[arg(long)] timeline_id: String, #[arg(long)] path: Utf8PathBuf, #[arg(long)] key: String, #[arg(long)] lsn: String, }, /// List all visible delta and image layers at the latest LSN. ListVisibleLayers { #[arg(long)] path: Utf8PathBuf, }, } fn create_layer_map_from_index_part( index_part: &IndexPart, tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> LayerMap { let mut layer_map = LayerMap::default(); { let mut updates = layer_map.batch_update(); for (key, value) in index_part.layer_metadata.iter() { updates.insert_historic(PersistentLayerDesc::from_filename( tenant_shard_id, timeline_id, key.clone(), value.file_size, )); } } layer_map } async fn search_layers( tenant_id: &str, timeline_id: &str, path: &Utf8PathBuf, key: &str, lsn: &str, ) -> anyhow::Result<()> { let tenant_id = TenantId::from_str(tenant_id).unwrap(); let tenant_shard_id = TenantShardId::unsharded(tenant_id); let timeline_id = TimelineId::from_str(timeline_id).unwrap(); let index_json = { let bytes = tokio::fs::read(path).await?; IndexPart::from_json_bytes(&bytes).unwrap() }; let layer_map = create_layer_map_from_index_part(&index_json, tenant_shard_id, timeline_id); let key = Key::from_hex(key)?; let lsn = Lsn::from_str(lsn).unwrap(); let mut end_lsn = lsn; loop { let result = layer_map.search(key, end_lsn); match result { Some(SearchResult { layer, lsn_floor }) => { let disk_layer = match layer { ReadableLayerWeak::PersistentLayer(layer) => layer, ReadableLayerWeak::InMemoryLayer(_) => { anyhow::bail!("unexpected in-memory layer") } }; let metadata = index_json .layer_metadata .get(&disk_layer.layer_name()) .unwrap(); println!( "{}", remote_layer_path( &tenant_id, &timeline_id, metadata.shard, &disk_layer.layer_name(), metadata.generation ) ); end_lsn = lsn_floor; } None => break, } } Ok(()) } #[derive(Debug, Clone, Serialize)] struct VisibleLayers { pub total_images: u64, pub total_image_bytes: u64, pub total_deltas: u64, pub total_delta_bytes: 
u64, pub layer_metadata: BTreeMap, } impl VisibleLayers { pub fn new() -> Self { Self { layer_metadata: BTreeMap::new(), total_images: 0, total_image_bytes: 0, total_deltas: 0, total_delta_bytes: 0, } } pub fn add_layer(&mut self, name: LayerName, layer: LayerFileMetadata) { match name { LayerName::Image(_) => { self.total_images += 1; self.total_image_bytes += layer.file_size; } LayerName::Delta(_) => { self.total_deltas += 1; self.total_delta_bytes += layer.file_size; } } self.layer_metadata.insert(name, layer); } } async fn list_visible_layers(path: &Utf8PathBuf) -> anyhow::Result<()> { let tenant_id = TenantId::generate(); let tenant_shard_id = TenantShardId::unsharded(tenant_id); let timeline_id = TimelineId::generate(); let bytes = tokio::fs::read(path).await.context("read file")?; let index_part = IndexPart::from_json_bytes(&bytes).context("deserialize")?; let layer_map = create_layer_map_from_index_part(&index_part, tenant_shard_id, timeline_id); let mut visible_layers = VisibleLayers::new(); let (layers, _key_space) = layer_map.get_visibility(Vec::new()); for (layer, visibility) in layers { if visibility == LayerVisibilityHint::Visible { visible_layers.add_layer( layer.layer_name(), index_part .layer_metadata .get(&layer.layer_name()) .unwrap() .clone(), ); } } let output = serde_json::to_string_pretty(&visible_layers).context("serialize output")?; println!("{output}"); Ok(()) } pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { match cmd { IndexPartCmd::Dump { path } => { let bytes = tokio::fs::read(path).await.context("read file")?; let des: IndexPart = IndexPart::from_json_bytes(&bytes).context("deserialize")?; let output = serde_json::to_string_pretty(&des).context("serialize output")?; println!("{output}"); Ok(()) } IndexPartCmd::Search { tenant_id, timeline_id, path, key, lsn, } => search_layers(tenant_id, timeline_id, path, key, lsn).await, IndexPartCmd::ListVisibleLayers { path } => list_visible_layers(path).await, } } 
================================================ FILE: pageserver/ctl/src/key.rs ================================================ use std::str::FromStr; use anyhow::Context; use clap::Parser; use pageserver_api::key::Key; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize}; #[derive(Parser)] pub(super) struct DescribeKeyCommand { /// Key material in one of the forms: hex, span attributes captured from log, reltag blocknum input: Vec, /// The number of shards to calculate what Keys placement would be. #[arg(long)] shard_count: Option, /// The sharding stripe size. /// /// The default is hardcoded. It makes no sense to provide this without providing /// `--shard-count`. #[arg(long, requires = "shard_count")] stripe_size: Option, } /// Sharded shard count without unsharded count, which the actual ShardCount supports. #[derive(Clone, Copy)] pub(super) struct CustomShardCount(std::num::NonZeroU8); #[derive(Debug, thiserror::Error)] pub(super) enum InvalidShardCount { #[error(transparent)] ParsingFailed(#[from] std::num::ParseIntError), #[error("too few shards")] TooFewShards, } impl FromStr for CustomShardCount { type Err = InvalidShardCount; fn from_str(s: &str) -> Result { let inner: std::num::NonZeroU8 = s.parse()?; if inner.get() < 2 { Err(InvalidShardCount::TooFewShards) } else { Ok(CustomShardCount(inner)) } } } impl From for ShardCount { fn from(value: CustomShardCount) -> Self { ShardCount::new(value.0.get()) } } impl DescribeKeyCommand { pub(super) fn execute(self) { let DescribeKeyCommand { input, shard_count, stripe_size, } = self; let material = KeyMaterial::try_from(input.as_slice()).unwrap(); let kind = material.kind(); let key = Key::from(material); println!("parsed from {kind}: {key}:"); println!(); println!("{key:?}"); macro_rules! 
kind_query { ([$($name:ident),*$(,)?]) => {{[$(kind_query!($name)),*]}}; ($name:ident) => {{ let s: &'static str = stringify!($name); let s = s.strip_prefix("is_").unwrap_or(s); let s = s.strip_suffix("_key").unwrap_or(s); #[allow(clippy::needless_borrow)] (s, key.$name()) }}; } // the current characterization is a mess of these boolean queries and separate // "recognization". I think it accurately represents how strictly we model the Key // right now, but could of course be made less confusing. let queries = kind_query!([ is_rel_block_key, is_rel_vm_block_key, is_rel_fsm_block_key, is_slru_block_key, is_inherited_key, is_rel_size_key, is_slru_segment_size_key, ]); let recognized_kind = "recognized kind"; let metadata_key = "metadata key"; let shard_placement = "shard placement"; let longest = queries .iter() .map(|t| t.0) .chain([recognized_kind, metadata_key, shard_placement]) .map(|s| s.len()) .max() .unwrap(); let colon = 1; let padding = 1; for (name, is) in queries { let width = longest - name.len() + colon + padding; println!("{}{:width$}{}", name, ":", is); } let width = longest - recognized_kind.len() + colon + padding; println!( "{}{:width$}{:?}", recognized_kind, ":", RecognizedKeyKind::new(key), ); if let Some(shard_count) = shard_count { // seeing the sharding placement might be confusing, so leave it out unless shard // count was given. let stripe_size = stripe_size .map(ShardStripeSize) .unwrap_or(DEFAULT_STRIPE_SIZE); println!( "# placement with shard_count: {} and stripe_size: {}:", shard_count.0, stripe_size.0 ); let width = longest - shard_placement.len() + colon + padding; println!( "{}{:width$}{:?}", shard_placement, ":", pageserver_api::shard::describe(&key, shard_count.into(), stripe_size) ); } } } /// Hand-wavy "inputs we accept" for a key. 
#[derive(Debug)] pub(super) enum KeyMaterial { Hex(Key), String(SpanAttributesFromLogs), Split(RelTag, BlockNumber), } impl KeyMaterial { fn kind(&self) -> &'static str { match self { KeyMaterial::Hex(_) => "hex", KeyMaterial::String(_) | KeyMaterial::Split(_, _) => "split", } } } impl From for Key { fn from(value: KeyMaterial) -> Self { match value { KeyMaterial::Hex(key) => key, KeyMaterial::String(SpanAttributesFromLogs(rt, blocknum)) | KeyMaterial::Split(rt, blocknum) => { pageserver_api::key::rel_block_to_key(rt, blocknum) } } } } impl> TryFrom<&[S]> for KeyMaterial { type Error = anyhow::Error; fn try_from(value: &[S]) -> Result { match value { [] => anyhow::bail!( "need 1..N positional arguments describing the key, try hex or a log line" ), [one] => { let one = one.as_ref(); let key = Key::from_hex(one).map(KeyMaterial::Hex); let attrs = SpanAttributesFromLogs::from_str(one).map(KeyMaterial::String); match (key, attrs) { (Ok(key), _) => Ok(key), (_, Ok(s)) => Ok(s), (Err(e1), Err(e2)) => anyhow::bail!( "failed to parse {one:?} as hex or span attributes:\n- {e1:#}\n- {e2:#}" ), } } more => { // assume going left to right one of these is a reltag and then we find a blocknum // this works, because we don't have plain numbers at least right after reltag in // logs. for some definition of "works". 
let Some((reltag_at, reltag)) = more .iter() .map(AsRef::as_ref) .enumerate() .find_map(|(i, s)| { s.split_once("rel=") .map(|(_garbage, actual)| actual) .unwrap_or(s) .parse::() .ok() .map(|rt| (i, rt)) }) else { anyhow::bail!("found no RelTag in arguments"); }; let Some(blocknum) = more .iter() .map(AsRef::as_ref) .skip(reltag_at) .find_map(|s| { s.split_once("blkno=") .map(|(_garbage, actual)| actual) .unwrap_or(s) .parse::() .ok() }) else { anyhow::bail!("found no blocknum in arguments"); }; Ok(KeyMaterial::Split(reltag, blocknum)) } } } } #[derive(Debug)] pub(super) struct SpanAttributesFromLogs(RelTag, BlockNumber); impl std::str::FromStr for SpanAttributesFromLogs { type Err = anyhow::Error; fn from_str(s: &str) -> Result { // accept the span separator but do not require or fail if either is missing // "whatever{rel=1663/16389/24615 blkno=1052204 req_lsn=FFFFFFFF/FFFFFFFF}" let (_, reltag) = s .split_once("rel=") .ok_or_else(|| anyhow::anyhow!("cannot find 'rel='"))?; let reltag = reltag.split_whitespace().next().unwrap(); let (_, blocknum) = s .split_once("blkno=") .ok_or_else(|| anyhow::anyhow!("cannot find 'blkno='"))?; let blocknum = blocknum.split_whitespace().next().unwrap(); let reltag = reltag .parse() .with_context(|| format!("parse reltag from {reltag:?}"))?; let blocknum = blocknum .parse() .with_context(|| format!("parse blocknum from {blocknum:?}"))?; Ok(Self(reltag, blocknum)) } } #[derive(Debug)] #[allow(dead_code)] // debug print is used enum RecognizedKeyKind { DbDir, ControlFile, Checkpoint, AuxFilesV1, SlruDir(Result), RelMap(RelTagish<2>), RelDir(RelTagish<2>), AuxFileV2(Result>), } #[derive(Debug, PartialEq)] #[allow(unused)] enum AuxFileV2 { Recognized(&'static str, utils::Hex<[u8; 13]>), OtherWithPrefix(&'static str, utils::Hex<[u8; 13]>), Other(utils::Hex<[u8; 13]>), } impl RecognizedKeyKind { fn new(key: Key) -> Option { use RecognizedKeyKind::{ AuxFilesV1, Checkpoint, ControlFile, DbDir, RelDir, RelMap, SlruDir, }; let slru_dir_kind 
= pageserver_api::key::slru_dir_kind(&key); Some(match key { pageserver_api::key::DBDIR_KEY => DbDir, pageserver_api::key::CONTROLFILE_KEY => ControlFile, pageserver_api::key::CHECKPOINT_KEY => Checkpoint, pageserver_api::key::AUX_FILES_KEY => AuxFilesV1, _ if slru_dir_kind.is_some() => SlruDir(slru_dir_kind.unwrap()), _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 0 => { RelMap([key.field2, key.field3].into()) } _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 1 => { RelDir([key.field2, key.field3].into()) } _ if key.is_metadata_key() => RecognizedKeyKind::AuxFileV2( AuxFileV2::new(key).ok_or_else(|| utils::Hex(key.to_i128().to_be_bytes())), ), _ => return None, }) } } impl AuxFileV2 { fn new(key: Key) -> Option { const EMPTY_HASH: [u8; 13] = { let mut out = [0u8; 13]; let hash = pageserver::aux_file::fnv_hash(b"").to_be_bytes(); let mut i = 3; while i < 16 { out[i - 3] = hash[i]; i += 1; } out }; let bytes = key.to_i128().to_be_bytes(); let hash = utils::Hex(<[u8; 13]>::try_from(&bytes[3..]).unwrap()); assert_eq!(EMPTY_HASH.len(), hash.0.len()); // TODO: we could probably find the preimages for the hashes Some(match (bytes[1], bytes[2]) { (1, 1) => AuxFileV2::Recognized("pg_logical/mappings/", hash), (1, 2) => AuxFileV2::Recognized("pg_logical/snapshots/", hash), (1, 3) if hash.0 == EMPTY_HASH => { AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash) } (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash), (3, 1) => AuxFileV2::Recognized("pg_stat/pgstat.stat", hash), (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash), (0xff, 0xff) => AuxFileV2::Other(hash), _ => return None, }) } } /// Prefix of RelTag, currently only known use cases are the two item versions. /// /// Renders like a reltag with `/`, nothing else. 
struct RelTagish([u32; N]); impl From<[u32; N]> for RelTagish { fn from(val: [u32; N]) -> Self { RelTagish(val) } } impl std::fmt::Debug for RelTagish { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use std::fmt::Write as _; let mut first = true; self.0.iter().try_for_each(|x| { if !first { f.write_char('/')?; } first = false; write!(f, "{x}") }) } } #[cfg(test)] mod tests { use pageserver::aux_file::encode_aux_file_key; use super::*; #[test] fn hex_is_key_material() { let m = KeyMaterial::try_from(&["000000067F0000400200DF927900FFFFFFFF"][..]).unwrap(); assert!(matches!(m, KeyMaterial::Hex(_)), "{m:?}"); } #[test] fn single_positional_spanalike_is_key_material() { // why is this needed? if you are checking many, then copypaste starts to appeal let strings = [ ( line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0", ), (line!(), "rel=1663/208101/2620_fsm blkno=2"), (line!(), "rel=1663/208101/2620.1 blkno=2"), ]; let mut first: Option = None; for (line, example) in strings { let m = KeyMaterial::try_from(&[example][..]) .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}")); let key = Key::from(m); if let Some(first) = first { assert_eq!(first, key); } else { first = Some(key); } } // not supporting this is rather accidential, but I think the input parsing is lenient // enough already KeyMaterial::try_from(&["1663/208101/2620_fsm 2"][..]).unwrap_err(); } #[test] fn multiple_spanlike_args() { let strings = [ ( line!(), &[ "process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", 
"req_lsn=0/238D98C8}", ][..], ), (line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]), (line!(), &["1663/208101/2620_fsm", "2"][..]), ]; let mut first: Option = None; for (line, example) in strings { let m = KeyMaterial::try_from(example) .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}")); let key = Key::from(m); if let Some(first) = first { assert_eq!(first, key); } else { first = Some(key); } } } #[test] fn recognized_auxfiles() { use AuxFileV2::*; let empty = [ 0x2e, 0x07, 0xbb, 0x01, 0x42, 0x62, 0xb8, 0x21, 0x75, 0x62, 0x95, 0xc5, 0x8d, ]; let foobar = [ 0x62, 0x79, 0x3c, 0x64, 0xbf, 0x6f, 0x0d, 0x35, 0x97, 0xba, 0x44, 0x6f, 0x18, ]; #[rustfmt::skip] let examples = [ (line!(), "pg_logical/mappings/foobar", Recognized("pg_logical/mappings/", utils::Hex(foobar))), (line!(), "pg_logical/snapshots/foobar", Recognized("pg_logical/snapshots/", utils::Hex(foobar))), (line!(), "pg_logical/replorigin_checkpoint", Recognized("pg_logical/replorigin_checkpoint", utils::Hex(empty))), (line!(), "pg_logical/foobar", OtherWithPrefix("pg_logical/", utils::Hex(foobar))), (line!(), "pg_replslot/foobar", Recognized("pg_replslot/", utils::Hex(foobar))), (line!(), "foobar", Other(utils::Hex(foobar))), ]; for (line, path, expected) in examples { let key = encode_aux_file_key(path); let recognized = AuxFileV2::new(key).unwrap_or_else(|| panic!("line {line} example failed")); assert_eq!(recognized, expected); } assert_eq!( AuxFileV2::new(Key::from_hex("600000102000000000000000000000000000").unwrap()), None, "example key has one too few 0 after 6 before 1" ); } } ================================================ FILE: pageserver/ctl/src/layer_map_analyzer.rs ================================================ //! Tool for extracting content-dependent metadata about layers. Useful for scanning real project layer files and evaluating the effectiveness of different heuristics on them. //! //! 
Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data. use std::cmp::Ordering; use std::collections::BinaryHeap; use std::ops::Range; use std::str::FromStr; use std::{fs, str}; use anyhow::{Result, anyhow}; use camino::{Utf8Path, Utf8PathBuf}; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::page_cache::{self, PAGE_SZ}; use pageserver::task_mgr::TaskKind; use pageserver::tenant::block_io::FileBlockReader; use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection}; use pageserver::tenant::storage_layer::delta_layer::{DELTA_KEY_SIZE, Summary}; use pageserver::tenant::storage_layer::{LayerName, range_overlaps}; use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::virtual_file::api::IoMode; use pageserver::virtual_file::{self, VirtualFile}; use pageserver_api::key::{KEY_SIZE, Key}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; use crate::AnalyzeLayerMapCmd; const MIN_HOLE_LENGTH: i128 = (128 * 1024 * 1024 / PAGE_SZ) as i128; const DEFAULT_MAX_HOLES: usize = 10; /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap #[derive(PartialEq, Eq)] pub struct Hole(Range); impl Ord for Hole { fn cmp(&self, other: &Self) -> Ordering { let other_len = other.0.end.to_i128() - other.0.start.to_i128(); let self_len = self.0.end.to_i128() - self.0.start.to_i128(); other_len.cmp(&self_len) } } impl PartialOrd for Hole { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } pub(crate) struct LayerFile { pub key_range: Range, pub lsn_range: Range, pub is_delta: bool, pub holes: Vec, } impl LayerFile { fn skips(&self, key_range: &Range) -> bool { if !range_overlaps(&self.key_range, key_range) { return false; } let start = match self .holes .binary_search_by_key(&key_range.start, |hole| hole.0.start) { 
Ok(index) => index, Err(index) => { if index == 0 { return false; } index - 1 } }; self.holes[start].0.end >= key_range.end } } pub(crate) fn parse_filename(name: &str) -> anyhow::Result { let layer_name = LayerName::from_str(name).map_err(|e| anyhow!("failed to parse layer name: {e}"))?; let holes = Vec::new(); Ok(LayerFile { key_range: layer_name.key_range().clone(), lsn_range: layer_name.lsn_as_range(), is_delta: layer_name.is_delta(), holes, }) } // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH" async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result> { let file = VirtualFile::open(path, ctx).await?; let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( actual_summary.index_start_blk, actual_summary.index_root_blk, block_reader, ); // min-heap (reserve space for one more element added before eviction) let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); let mut prev_key: Option = None; tree_reader .visit( &[0u8; DELTA_KEY_SIZE], VisitDirection::Forwards, |key, _value| { let curr = Key::from_slice(&key[..KEY_SIZE]); if let Some(prev) = prev_key { if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH { heap.push(Hole(prev..curr)); if heap.len() > max_holes { heap.pop(); // remove smallest hole } } } prev_key = Some(curr.next()); true }, ctx, ) .await?; let mut holes = heap.into_vec(); holes.sort_by_key(|hole| hole.0.start); Ok(holes) } pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let storage_path = &cmd.path; let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error).with_scope_debug_tools(); // Initialize virtual_file (file desriptor cache) and page cache 
which are needed to access layer persistent B-Tree. pageserver::virtual_file::init( 10, virtual_file::api::IoEngineKind::StdFs, IoMode::preferred(), virtual_file::SyncMode::Sync, ); pageserver::page_cache::init(100); let mut total_delta_layers = 0usize; let mut total_image_layers = 0usize; let mut total_excess_layers = 0usize; for tenant in fs::read_dir(storage_path.join(TENANTS_SEGMENT_NAME))? { let tenant = tenant?; if !tenant.file_type()?.is_dir() { continue; } for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? { let timeline = timeline?; if !timeline.file_type()?.is_dir() { continue; } // Collect sorted vec of layers and count deltas let mut layers = Vec::new(); let mut n_deltas = 0usize; for layer in fs::read_dir(timeline.path())? { let layer = layer?; if let Ok(mut layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) { if layer_file.is_delta { let layer_path = Utf8PathBuf::from_path_buf(layer.path()).expect("non-Unicode path"); layer_file.holes = get_holes(&layer_path, max_holes, &ctx).await?; n_deltas += 1; } layers.push(layer_file); } } layers.sort_by_key(|layer| layer.lsn_range.end); // Count the number of holes and number of excess layers. // Excess layer is image layer generated when holes in delta layers are not considered. let mut n_excess_layers = 0usize; let mut n_holes = 0usize; for i in 0..layers.len() { if !layers[i].is_delta { let mut n_deltas_since_last_image = 0usize; let mut n_skipped = 0usize; let img_key_range = &layers[i].key_range; for j in (0..i).rev() { if range_overlaps(img_key_range, &layers[j].key_range) { if layers[j].is_delta { n_deltas_since_last_image += 1; if layers[j].skips(img_key_range) { n_skipped += 1; } } else { // Image layer is always dense, despite to the fact that it doesn't contain all possible // key values in the specified range: there are may be no keys in the storage belonging // to the image layer range but not present in the image layer. 
break; } } } if n_deltas_since_last_image >= 3 && n_deltas_since_last_image - n_skipped < 3 { // It is just approximation: it doesn't take in account all image coverage. // Moreover the new layer map doesn't count total deltas, but the max stack of overlapping deltas. n_excess_layers += 1; } n_holes += n_skipped; } } println!( "Tenant {} timeline {} delta layers {} image layers {} excess layers {} holes {}", tenant.file_name().into_string().unwrap(), timeline.file_name().into_string().unwrap(), n_deltas, layers.len() - n_deltas, n_excess_layers, n_holes ); total_delta_layers += n_deltas; total_image_layers += layers.len() - n_deltas; total_excess_layers += n_excess_layers; } } println!( "Total delta layers {total_delta_layers} image layers {total_image_layers} excess layers {total_excess_layers}" ); Ok(()) } ================================================ FILE: pageserver/ctl/src/layers.rs ================================================ use std::fs::{self, File}; use std::path::{Path, PathBuf}; use anyhow::Result; use camino::{Utf8Path, Utf8PathBuf}; use clap::Subcommand; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer, delta_layer, image_layer}; use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::virtual_file::api::IoMode; use pageserver::{page_cache, virtual_file}; use pageserver_api::key::Key; use utils::id::{TenantId, TimelineId}; use crate::layer_map_analyzer::{LayerFile, parse_filename}; #[derive(Subcommand)] pub(crate) enum LayerCmd { /// List all tenants and timelines under the pageserver path /// /// Example: `cargo run --bin pagectl layer list .neon/` List { path: PathBuf }, /// List all layers of a given tenant and timeline /// /// Example: `cargo run --bin pagectl layer list .neon/` ListLayer { path: PathBuf, tenant: String, timeline: String, key: Option, }, /// Dump all information of a layer file 
DumpLayer { path: PathBuf, tenant: String, timeline: String, /// The id from list-layer command id: usize, }, /// Dump all information of a layer file locally DumpLayerLocal { path: PathBuf }, RewriteSummary { layer_file_path: Utf8PathBuf, #[clap(long)] new_tenant_id: Option, #[clap(long)] new_timeline_id: Option, }, } async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { virtual_file::init( 10, virtual_file::api::IoEngineKind::StdFs, IoMode::preferred(), virtual_file::SyncMode::Sync, ); page_cache::init(100); let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); let file = File::open(path)?; let delta_layer = DeltaLayer::new_for_path(path, file)?; delta_layer.dump(true, ctx).await?; Ok(()) } async fn read_image_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { virtual_file::init( 10, virtual_file::api::IoEngineKind::StdFs, IoMode::preferred(), virtual_file::SyncMode::Sync, ); page_cache::init(100); let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); let file = File::open(path)?; let image_layer = ImageLayer::new_for_path(path, file)?; image_layer.dump(true, ctx).await?; Ok(()) } pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error).with_scope_debug_tools(); match cmd { LayerCmd::List { path } => { for tenant in fs::read_dir(path.join(TENANTS_SEGMENT_NAME))? { let tenant = tenant?; if !tenant.file_type()?.is_dir() { continue; } println!("tenant {}", tenant.file_name().to_string_lossy()); for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? 
{ let timeline = timeline?; if !timeline.file_type()?.is_dir() { continue; } println!("- timeline {}", timeline.file_name().to_string_lossy()); } } Ok(()) } LayerCmd::ListLayer { path, tenant, timeline, key, } => { let timeline_path = path .join(TENANTS_SEGMENT_NAME) .join(tenant) .join(TIMELINES_SEGMENT_NAME) .join(timeline); let mut idx = 0; let mut to_print = Vec::default(); for layer in fs::read_dir(timeline_path)? { let layer = layer?; if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) { if let Some(key) = key { if layer_file.key_range.start <= *key && *key < layer_file.key_range.end { to_print.push((idx, layer_file)); } } else { to_print.push((idx, layer_file)); } idx += 1; } } if key.is_some() { to_print .sort_by_key(|(_idx, layer_file)| std::cmp::Reverse(layer_file.lsn_range.end)); } for (idx, layer_file) in to_print { print_layer_file(idx, &layer_file); } Ok(()) } LayerCmd::DumpLayer { path, tenant, timeline, id, } => { let timeline_path = path .join("tenants") .join(tenant) .join("timelines") .join(timeline); let mut idx = 0; for layer in fs::read_dir(timeline_path)? 
{ let layer = layer?; if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) { if *id == idx { print_layer_file(idx, &layer_file); if layer_file.is_delta { read_delta_file(layer.path(), &ctx).await?; } else { read_image_file(layer.path(), &ctx).await?; } break; } idx += 1; } } Ok(()) } LayerCmd::DumpLayerLocal { path } => { if let Ok(layer_file) = parse_filename(path.file_name().unwrap().to_str().unwrap()) { print_layer_file(0, &layer_file); if layer_file.is_delta { read_delta_file(path, &ctx).await?; } else { read_image_file(path, &ctx).await?; } } Ok(()) } LayerCmd::RewriteSummary { layer_file_path, new_tenant_id, new_timeline_id, } => { pageserver::virtual_file::init( 10, virtual_file::api::IoEngineKind::StdFs, IoMode::preferred(), virtual_file::SyncMode::Sync, ); pageserver::page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error) .with_scope_debug_tools(); macro_rules! rewrite_closure { ($($summary_ty:tt)*) => {{ |summary| $($summary_ty)* { tenant_id: new_tenant_id.unwrap_or(summary.tenant_id), timeline_id: new_timeline_id.unwrap_or(summary.timeline_id), ..summary } }}; } let res = ImageLayer::rewrite_summary( layer_file_path, rewrite_closure!(image_layer::Summary), &ctx, ) .await; match res { Ok(()) => { println!("Successfully rewrote summary of image layer {layer_file_path}"); return Ok(()); } Err(image_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough Err(image_layer::RewriteSummaryError::Other(e)) => { return Err(e); } } let res = DeltaLayer::rewrite_summary( layer_file_path, rewrite_closure!(delta_layer::Summary), &ctx, ) .await; match res { Ok(()) => { println!("Successfully rewrote summary of delta layer {layer_file_path}"); return Ok(()); } Err(delta_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough Err(delta_layer::RewriteSummaryError::Other(e)) => { return Err(e); } } anyhow::bail!("not an image or delta layer: {layer_file_path}"); } } } fn 
print_layer_file(idx: usize, layer_file: &LayerFile) { println!( "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", idx, layer_file.key_range.start, layer_file.key_range.end, layer_file.lsn_range.start, layer_file.lsn_range.end, layer_file.is_delta, ); } ================================================ FILE: pageserver/ctl/src/main.rs ================================================ //! A helper tool to manage pageserver binary files. //! Accepts a file as an argument, attempts to parse it with all ways possible //! and prints its interpreted context. //! //! Separate, `metadata` subcommand allows to print and update pageserver's metadata file. mod download_remote_object; mod draw_timeline_dir; mod index_part; mod key; mod layer_map_analyzer; mod layers; mod page_trace; use std::str::FromStr; use std::time::{Duration, SystemTime}; use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; use download_remote_object::DownloadRemoteObjectCmd; use index_part::IndexPartCmd; use layers::LayerCmd; use page_trace::PageTraceCmd; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::page_cache; use pageserver::task_mgr::TaskKind; use pageserver::tenant::dump_layerfile_from_path; use pageserver::tenant::metadata::TimelineMetadata; use pageserver::virtual_file::api::IoMode; use pageserver::virtual_file::{self}; use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; use remote_storage::{RemotePath, RemoteStorageConfig}; use tokio_util::sync::CancellationToken; use utils::id::TimelineId; use utils::logging::{self, LogFormat, TracingErrorLayerEnablement}; use utils::lsn::Lsn; use utils::project_git_version; project_git_version!(GIT_VERSION); #[derive(Parser)] #[command( version = GIT_VERSION, about = "Neon Pageserver binutils", long_about = "Reads pageserver (and related) binary files management utility" )] #[command(propagate_version = true)] struct CliOpts { #[command(subcommand)] command: Commands, } #[derive(Subcommand)] enum 
Commands { Metadata(MetadataCmd), #[command(subcommand)] IndexPart(IndexPartCmd), PrintLayerFile(PrintLayerFileCmd), TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd), DrawTimeline {}, AnalyzeLayerMap(AnalyzeLayerMapCmd), #[command(subcommand)] Layer(LayerCmd), /// Debug print a hex key found from logs Key(key::DescribeKeyCommand), PageTrace(PageTraceCmd), DownloadRemoteObject(DownloadRemoteObjectCmd), } /// Read and update pageserver metadata file #[derive(Parser)] struct MetadataCmd { /// Input metadata file path metadata_path: Utf8PathBuf, /// Replace disk consistent Lsn disk_consistent_lsn: Option, /// Replace previous record Lsn prev_record_lsn: Option, /// Replace latest gc cuttoff latest_gc_cuttoff: Option, } #[derive(Parser)] struct PrintLayerFileCmd { /// Pageserver data path path: Utf8PathBuf, } /// Roll back the time for the specified prefix using S3 history. /// /// The command is fairly low level and powerful. Validation is only very light, /// so it is more powerful, and thus potentially more dangerous. #[derive(Parser)] struct TimeTravelRemotePrefixCmd { /// A configuration string for the remote_storage configuration. /// /// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }` config_toml_str: String, /// remote prefix to time travel recover. For safety reasons, we require it to contain /// a timeline or tenant ID in the prefix. prefix: String, /// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy. travel_to: String, /// Timestamp of the start of the operation, must be after any changes we want to roll back and after. /// You can use a few seconds before invoking the command. Same format as `travel_to`. 
done_if_after: Option, } #[derive(Parser)] struct AnalyzeLayerMapCmd { /// Pageserver data path path: Utf8PathBuf, /// Max holes max_holes: Option, } #[tokio::main] async fn main() -> anyhow::Result<()> { logging::init( LogFormat::Plain, TracingErrorLayerEnablement::EnableWithRustLogFilter, logging::Output::Stdout, )?; logging::replace_panic_hook_with_tracing_panic_hook().forget(); let cli = CliOpts::parse(); match cli.command { Commands::Layer(cmd) => { layers::main(&cmd).await?; } Commands::Metadata(cmd) => { handle_metadata(&cmd)?; } Commands::IndexPart(cmd) => { index_part::main(&cmd).await?; } Commands::DrawTimeline {} => { draw_timeline_dir::main()?; } Commands::AnalyzeLayerMap(cmd) => { layer_map_analyzer::main(&cmd).await?; } Commands::PrintLayerFile(cmd) => { if let Err(e) = read_pg_control_file(&cmd.path) { println!( "Failed to read input file as a pg control one: {e:#}\n\ Attempting to read it as layer file" ); print_layerfile(&cmd.path).await?; } } Commands::TimeTravelRemotePrefix(cmd) => { let timestamp = humantime::parse_rfc3339(&cmd.travel_to) .map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?; let done_if_after = if let Some(done_if_after) = &cmd.done_if_after { humantime::parse_rfc3339(done_if_after).map_err(|_e| { anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after) })? 
} else { const SAFETY_MARGIN: Duration = Duration::from_secs(3); tokio::time::sleep(SAFETY_MARGIN).await; // Convert to string representation and back to get rid of sub-second values let done_if_after = SystemTime::now(); tokio::time::sleep(SAFETY_MARGIN).await; done_if_after }; let timestamp = strip_subsecond(timestamp); let done_if_after = strip_subsecond(done_if_after); let Some(prefix) = validate_prefix(&cmd.prefix) else { println!("specified prefix '{}' failed validation", cmd.prefix); return Ok(()); }; let config = RemoteStorageConfig::from_toml_str(&cmd.config_toml_str)?; let storage = remote_storage::GenericRemoteStorage::from_config(&config).await; let cancel = CancellationToken::new(); // Complexity limit: as we are running this command locally, we should have a lot of memory available, and we do not // need to limit the number of versions we are going to delete. storage .unwrap() .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel, None) .await?; } Commands::Key(dkc) => dkc.execute(), Commands::PageTrace(cmd) => page_trace::main(&cmd)?, Commands::DownloadRemoteObject(cmd) => { download_remote_object::main(&cmd).await?; } }; Ok(()) } fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> { let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?; println!("{control_file:?}"); let control_file_initdb = Lsn(control_file.checkPoint); println!( "pg_initdb_lsn: {}, aligned: {}", control_file_initdb, control_file_initdb.align() ); Ok(()) } async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { // Basic initialization of things that don't change after startup virtual_file::init( 10, virtual_file::api::IoEngineKind::StdFs, IoMode::preferred(), virtual_file::SyncMode::Sync, ); page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error).with_scope_debug_tools(); dump_layerfile_from_path(path, true, &ctx).await } fn handle_metadata( MetadataCmd { 
metadata_path: path, disk_consistent_lsn, prev_record_lsn, latest_gc_cuttoff, }: &MetadataCmd, ) -> Result<(), anyhow::Error> { let metadata_bytes = std::fs::read(path)?; let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; println!("Current metadata:\n{meta:?}"); let mut update_meta = false; // TODO: simplify this part if let Some(disk_consistent_lsn) = disk_consistent_lsn { meta = TimelineMetadata::new( *disk_consistent_lsn, meta.prev_record_lsn(), meta.ancestor_timeline(), meta.ancestor_lsn(), meta.latest_gc_cutoff_lsn(), meta.initdb_lsn(), meta.pg_version(), ); update_meta = true; } if let Some(prev_record_lsn) = prev_record_lsn { meta = TimelineMetadata::new( meta.disk_consistent_lsn(), Some(*prev_record_lsn), meta.ancestor_timeline(), meta.ancestor_lsn(), meta.latest_gc_cutoff_lsn(), meta.initdb_lsn(), meta.pg_version(), ); update_meta = true; } if let Some(latest_gc_cuttoff) = latest_gc_cuttoff { meta = TimelineMetadata::new( meta.disk_consistent_lsn(), meta.prev_record_lsn(), meta.ancestor_timeline(), meta.ancestor_lsn(), *latest_gc_cuttoff, meta.initdb_lsn(), meta.pg_version(), ); update_meta = true; } if update_meta { let metadata_bytes = meta.to_bytes()?; std::fs::write(path, metadata_bytes)?; } Ok(()) } /// Ensures that the given S3 prefix is sufficiently constrained. /// The command is very risky already and we don't want to expose something /// that allows usually unintentional and quite catastrophic time travel of /// an entire bucket, which would be a major catastrophe and is only a /// one-character change away (similar to "rm -r /home /username/foobar"). 
fn validate_prefix(prefix: &str) -> Option { if prefix.is_empty() { // Empty prefix means we want to specify the *whole* bucket return None; } let components = prefix.split('/').collect::>(); let (last, components) = { let last = components.last()?; if last.is_empty() { ( components.iter().nth_back(1)?, &components[..(components.len() - 1)], ) } else { (last, &components[..]) } }; 'valid: { if let Ok(_timeline_id) = TimelineId::from_str(last) { // Ends in either a tenant or timeline ID break 'valid; } if *last == "timelines" { if let Some(before_last) = components.iter().nth_back(1) { if let Ok(_tenant_id) = TenantShardId::from_str(before_last) { // Has a valid tenant id break 'valid; } } } return None; } RemotePath::from_string(prefix).ok() } fn strip_subsecond(timestamp: SystemTime) -> SystemTime { let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string(); humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp") } #[cfg(test)] mod tests { use super::*; #[test] fn test_validate_prefix() { assert_eq!(validate_prefix(""), None); assert_eq!(validate_prefix("/"), None); #[track_caller] fn assert_valid(prefix: &str) { let remote_path = RemotePath::from_string(prefix).unwrap(); assert_eq!(validate_prefix(prefix), Some(remote_path)); } assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"); // Path is not relative but absolute assert_eq!( validate_prefix( "/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/" ), None ); assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/"); // Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None); assert_eq!(validate_prefix("wal"), None); assert_eq!(validate_prefix("/wal/"), None); assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001"); // Partial tenant ID assert_eq!( validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"), None ); 
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines"); assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines"); assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/"); assert_valid( "pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683", ); assert_eq!(validate_prefix("pageserver/v1/tenants/"), None); } } ================================================ FILE: pageserver/ctl/src/page_trace.rs ================================================ use std::collections::HashMap; use std::io::BufReader; use camino::Utf8PathBuf; use clap::Parser; use itertools::Itertools as _; use pageserver_api::key::{CompactKey, Key}; use pageserver_api::models::PageTraceEvent; use pageserver_api::reltag::RelTag; /// Parses a page trace (as emitted by the `page_trace` timeline API), and outputs stats. #[derive(Parser)] pub(crate) struct PageTraceCmd { /// Trace input file. path: Utf8PathBuf, } pub(crate) fn main(cmd: &PageTraceCmd) -> anyhow::Result<()> { let mut file = BufReader::new(std::fs::OpenOptions::new().read(true).open(&cmd.path)?); let mut events: Vec = Vec::new(); loop { match bincode::deserialize_from(&mut file) { Ok(event) => events.push(event), Err(err) => { if let bincode::ErrorKind::Io(ref err) = *err { if err.kind() == std::io::ErrorKind::UnexpectedEof { break; } } return Err(err.into()); } } } let mut reads_by_relation: HashMap = HashMap::new(); let mut reads_by_key: HashMap = HashMap::new(); for event in events { let key = Key::from_compact(event.key); let reltag = RelTag { spcnode: key.field2, dbnode: key.field3, relnode: key.field4, forknum: key.field5, }; *reads_by_relation.entry(reltag).or_default() += 1; *reads_by_key.entry(event.key).or_default() += 1; } let multi_read_keys = reads_by_key .into_iter() .filter(|(_, count)| *count > 1) .sorted_by_key(|(key, count)| (-*count, *key)) .collect_vec(); println!("Multi-read keys: {}", 
multi_read_keys.len()); for (key, count) in multi_read_keys { println!(" {key}: {count}"); } let reads_by_relation = reads_by_relation .into_iter() .sorted_by_key(|(rel, count)| (-*count, *rel)) .collect_vec(); println!("Reads by relation:"); for (reltag, count) in reads_by_relation { println!(" {reltag}: {count}"); } Ok(()) } ================================================ FILE: pageserver/page_api/Cargo.toml ================================================ [package] name = "pageserver_page_api" version = "0.1.0" edition.workspace = true license.workspace = true [dependencies] anyhow.workspace = true bytes.workspace = true futures.workspace = true pageserver_api.workspace = true postgres_ffi_types.workspace = true prost.workspace = true prost-types.workspace = true strum.workspace = true strum_macros.workspace = true thiserror.workspace = true tokio.workspace = true tokio-util.workspace = true tonic.workspace = true utils.workspace = true workspace_hack.workspace = true [build-dependencies] tonic-build.workspace = true ================================================ FILE: pageserver/page_api/build.rs ================================================ use std::env; use std::path::PathBuf; /// Generates Rust code from .proto Protobuf schemas, along with a binary file /// descriptor set for Protobuf schema reflection. fn main() -> Result<(), Box> { let out_dir = PathBuf::from(env::var("OUT_DIR")?); tonic_build::configure() .bytes(["."]) .file_descriptor_set_path(out_dir.join("page_api_descriptor.bin")) .compile_protos(&["proto/page_service.proto"], &["proto"]) .map_err(|err| err.into()) } ================================================ FILE: pageserver/page_api/proto/page_service.proto ================================================ // Page service, presented by pageservers for computes. // // This is the compute read path. It primarily serves page versions at given // LSNs, but also base backups, SLRU segments, and relation metadata. 
// // EXPERIMENTAL: this is still under development and subject to change. // // Request metadata headers: // - authorization: JWT token ("Bearer "), if auth is enabled // - neon-tenant-id: tenant ID ("7c4a1f9e3bd6470c8f3e21a65bd2e980") // - neon-shard-id: shard ID, as in hex ("0b10" = shard 11 of 16, 0-based) // - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e") // // The service can be accessed via e.g. grpcurl: // // ``` // grpcurl \ // -plaintext \ // -H "neon-tenant-id: 7c4a1f9e3bd6470c8f3e21a65bd2e980" \ // -H "neon-shard-id: 0000" \ // -H "neon-timeline-id: f08c4e9a2d5f76b1e3a7c2d8910f4b3e" \ // -H "authorization: Bearer $JWT" \ // -d '{"read_lsn": {"request_lsn": 100000000, "not_modified_since_lsn": 1}, "db_oid": 1}' \ // localhost:51051 page_api.PageService/GetDbSize // ``` // // TODO: consider adding neon-compute-mode ("primary", "static", "replica"). // However, this will require reconnecting when changing modes. // // TODO: write implementation guidance on // - Health checks // - Tracing, OpenTelemetry // - Compression syntax = "proto3"; package page_api; import "google/protobuf/timestamp.proto"; service PageService { // NB: unlike libpq, there is no CheckRelExists in gRPC, at the compute team's request. Instead, // use GetRelSize with allow_missing=true to check existence. // Fetches a base backup. rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk); // Returns the total size of a database, as # of bytes. rpc GetDbSize (GetDbSizeRequest) returns (GetDbSizeResponse); // Fetches pages. // // This is implemented as a bidirectional streaming RPC for performance. Unary // requests incur costs for e.g. HTTP/2 stream setup, header parsing, // authentication, and so on -- with streaming, we only pay these costs during // the initial stream setup. This ~doubles throughput in benchmarks. 
Other // RPCs use regular unary requests, since they are not as frequent and // performance-critical, and this simplifies implementation. // // NB: a gRPC status response (e.g. errors) will terminate the stream. The // stream may be shared by multiple Postgres backends, so we avoid this by // sending them as GetPageResponse.status_code instead. rpc GetPages (stream GetPageRequest) returns (stream GetPageResponse); // Returns the size of a relation, as # of blocks. rpc GetRelSize (GetRelSizeRequest) returns (GetRelSizeResponse); // Fetches an SLRU segment. rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse); // Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage // collect the LSN until the lease expires. Must be acquired on all relevant shards. rpc LeaseLsn (LeaseLsnRequest) returns (LeaseLsnResponse); } // The LSN a request should read at. message ReadLsn { // The request's read LSN. Required. uint64 request_lsn = 1; // If given, the caller guarantees that the page has not been modified since // this LSN. Must be smaller than or equal to request_lsn. This allows the // Pageserver to serve an old page without waiting for the request LSN to // arrive. Valid for all request types. // // It is undefined behaviour to make a request such that the page was, in // fact, modified between request_lsn and not_modified_since_lsn. The // Pageserver might detect it and return an error, or it might return the old // page version or the new page version. Setting not_modified_since_lsn equal // to request_lsn is always safe, but can lead to unnecessary waiting. uint64 not_modified_since_lsn = 2; } // A relation identifier. message RelTag { uint32 spc_oid = 1; uint32 db_oid = 2; uint32 rel_number = 3; uint32 fork_number = 4; } // Requests a base backup. message GetBaseBackupRequest { // The LSN to fetch the base backup at. 0 or absent means the latest LSN known to the Pageserver. 
uint64 lsn = 1; // If true, logical replication slots will not be created. bool replica = 2; // If true, include relation files in the base backup. Mainly for debugging and tests. bool full = 3; // Compression algorithm to use. Base backups send a compressed payload instead of using gRPC // compression, so that we can cache compressed backups on the server. BaseBackupCompression compression = 4; } // Base backup compression algorithms. enum BaseBackupCompression { // Unknown algorithm. Used when clients send an unsupported algorithm. BASE_BACKUP_COMPRESSION_UNKNOWN = 0; // No compression. BASE_BACKUP_COMPRESSION_NONE = 1; // GZIP compression. BASE_BACKUP_COMPRESSION_GZIP = 2; } // Base backup response chunk, returned as an ordered stream. message GetBaseBackupResponseChunk { // A basebackup data chunk. The size is undefined, but bounded by the 4 MB // gRPC message size limit. bytes chunk = 1; } // Requests the size of a database, as # of bytes. Only valid on shard 0, other // shards will error. message GetDbSizeRequest { ReadLsn read_lsn = 1; uint32 db_oid = 2; } message GetDbSizeResponse { uint64 num_bytes = 1; } // Requests one or more pages. message GetPageRequest { // A request ID. Will be included in the response. Should be unique for // in-flight requests on the stream. RequestID request_id = 1; // The request class. GetPageClass request_class = 2; // The LSN to read at. ReadLsn read_lsn = 3; // The relation to read from. RelTag rel = 4; // Page numbers to read. Must belong to the remote shard. // // Multiple pages will be executed as a single batch by the Pageserver, // amortizing layer access costs and parallelizing them. This may increase the // latency of any individual request, but improves the overall latency and // throughput of the batch as a whole. // // TODO: this causes an allocation in the common single-block case. The sender // can use a SmallVec to stack-allocate it, but Prost will always deserialize // into a heap-allocated Vec. 
Consider optimizing this. // // TODO: we might be able to avoid a sort or something if we mandate that these // are always in order. But we can't currently rely on this on the server, because // of compatibility with the libpq protocol handler. repeated uint32 block_number = 5; } // A Request ID. Should be unique for in-flight requests on a stream. Included in the response. message RequestID { // The base request ID. uint64 id = 1; // The request attempt. Starts at 0, incremented on each retry. uint32 attempt = 2; } // A GetPageRequest class. Primarily intended for observability, but may also be // used for prioritization in the future. enum GetPageClass { // Unknown class. For backwards compatibility: used when an older client version sends a class // that a newer server version has removed. GET_PAGE_CLASS_UNKNOWN = 0; // A normal request. This is the default. GET_PAGE_CLASS_NORMAL = 1; // A prefetch request. NB: can only be classified on pg < 18. GET_PAGE_CLASS_PREFETCH = 2; // A background request (e.g. vacuum). GET_PAGE_CLASS_BACKGROUND = 3; } // A GetPage response. // // A batch response will contain all of the requested pages. We could eagerly // emit individual pages as soon as they are ready, but on a readv() Postgres // holds buffer pool locks on all pages in the batch and we'll only return once // the entire batch is ready, so no one can make use of the individual pages. message GetPageResponse { // The original request's ID. RequestID request_id = 1; // The response status code. If not OK, the rel and page fields will be empty. GetPageStatusCode status_code = 2; // A string describing the status, if any. string reason = 3; // The relation that the pages belong to. RelTag rel = 4; // The page(s), in the same order as the request. repeated Page page = 5; } // A page. // // TODO: it would be slightly more efficient (but less convenient) to have separate arrays of block // numbers and images, but given the 8KB page size it's probably negligible. 
Benchmark it anyway. message Page { // The page number. uint32 block_number = 1; // The materialized page image, as an 8KB byte vector. bytes image = 2; } // A GetPageResponse status code. // // These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream // (potentially shared by many backends), and a gRPC status response would terminate the stream so // we send GetPageResponse messages with these codes instead. enum GetPageStatusCode { // Unknown status. For forwards compatibility: used when an older client version receives a new // status code from a newer server version. GET_PAGE_STATUS_CODE_UNKNOWN = 0; // The request was successful. GET_PAGE_STATUS_CODE_OK = 1; // The page did not exist. The tenant/timeline/shard has already been // validated during stream setup. GET_PAGE_STATUS_CODE_NOT_FOUND = 2; // The request was invalid. GET_PAGE_STATUS_CODE_INVALID_REQUEST = 3; // The request failed due to an internal server error. GET_PAGE_STATUS_CODE_INTERNAL_ERROR = 4; // The tenant is rate limited. Slow down and retry later. GET_PAGE_STATUS_CODE_SLOW_DOWN = 5; // NB: shutdown errors are emitted as a gRPC Unavailable status. // // TODO: consider adding a GET_PAGE_STATUS_CODE_LAYER_DOWNLOAD in the case of a layer download. // This could free up the server task to process other requests while the download is in progress. } // Fetches the size of a relation at a given LSN, as # of blocks. Only valid on // shard 0, other shards will error. message GetRelSizeRequest { ReadLsn read_lsn = 1; RelTag rel = 2; // If true, return missing=true for missing relations instead of a NotFound error. bool allow_missing = 3; } message GetRelSizeResponse { // The number of blocks in the relation. uint32 num_blocks = 1; // If allow_missing=true, this is true for missing relations. bool missing = 2; } // Requests an SLRU segment. Only valid on shard 0, other shards will error. 
message GetSlruSegmentRequest { ReadLsn read_lsn = 1; uint32 kind = 2; uint32 segno = 3; } // Returns an SLRU segment. // // These are up to 32 pages (256 KB), so we can send them as a single response. message GetSlruSegmentResponse { bytes segment = 1; } // Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage // collect the LSN until the lease expires. Must be acquired on all relevant shards. message LeaseLsnRequest { // The LSN to lease. Can't be 0 or below the current GC cutoff. uint64 lsn = 1; } // Lease acquisition response. If the lease could not be granted because the LSN has already been // garbage collected, a FailedPrecondition status will be returned instead. message LeaseLsnResponse { // The lease expiration time. google.protobuf.Timestamp expires = 1; } ================================================ FILE: pageserver/page_api/src/client.rs ================================================ use anyhow::Context as _; use futures::future::ready; use futures::{Stream, StreamExt as _, TryStreamExt as _}; use tokio::io::AsyncRead; use tokio_util::io::StreamReader; use tonic::codec::CompressionEncoding; use tonic::metadata::AsciiMetadataValue; use tonic::service::Interceptor; use tonic::service::interceptor::InterceptedService; use tonic::transport::{Channel, Endpoint}; use utils::id::{TenantId, TimelineId}; use utils::shard::ShardIndex; use crate::model::*; use crate::proto; /// A basic Pageserver gRPC client, for a single tenant shard. This API uses native Rust domain /// types from `model` rather than generated Protobuf types. pub struct Client { inner: proto::PageServiceClient>, } impl Client { /// Connects to the given gRPC endpoint. 
pub async fn connect( endpoint: E, tenant_id: TenantId, timeline_id: TimelineId, shard_id: ShardIndex, auth_token: Option, compression: Option, ) -> anyhow::Result where E: TryInto + Send + Sync + 'static, >::Error: std::error::Error + Send + Sync, { let endpoint: Endpoint = endpoint.try_into().context("invalid endpoint")?; let channel = endpoint.connect().await?; Self::new( channel, tenant_id, timeline_id, shard_id, auth_token, compression, ) } /// Creates a new client using the given gRPC channel. pub fn new( channel: Channel, tenant_id: TenantId, timeline_id: TimelineId, shard_id: ShardIndex, auth_token: Option, compression: Option, ) -> anyhow::Result { let auth = AuthInterceptor::new(tenant_id, timeline_id, shard_id, auth_token)?; let mut inner = proto::PageServiceClient::with_interceptor(channel, auth); if let Some(compression) = compression { // TODO: benchmark this (including network latency). inner = inner .accept_compressed(compression) .send_compressed(compression); } Ok(Self { inner }) } /// Fetches a base backup. pub async fn get_base_backup( &mut self, req: GetBaseBackupRequest, ) -> tonic::Result> { let req = proto::GetBaseBackupRequest::from(req); let chunks = self.inner.get_base_backup(req).await?.into_inner(); Ok(StreamReader::new( chunks .map_ok(|resp| resp.chunk) .map_err(std::io::Error::other), )) } /// Returns the total size of a database, as # of bytes. pub async fn get_db_size(&mut self, req: GetDbSizeRequest) -> tonic::Result { let req = proto::GetDbSizeRequest::from(req); let resp = self.inner.get_db_size(req).await?.into_inner(); Ok(resp.into()) } /// Fetches pages. /// /// This is implemented as a bidirectional streaming RPC for performance. Per-request errors are /// typically returned as status_code instead of errors, to avoid tearing down the entire stream /// via a tonic::Status error. 
pub async fn get_pages( &mut self, reqs: impl Stream + Send + 'static, ) -> tonic::Result> + Send + 'static> { let reqs = reqs.map(proto::GetPageRequest::from); let resps = self.inner.get_pages(reqs).await?.into_inner(); Ok(resps.and_then(|resp| ready(GetPageResponse::try_from(resp).map_err(|err| err.into())))) } /// Returns the size of a relation as # of blocks, or None if allow_missing=true and the /// relation does not exist. pub async fn get_rel_size( &mut self, req: GetRelSizeRequest, ) -> tonic::Result { let req = proto::GetRelSizeRequest::from(req); let resp = self.inner.get_rel_size(req).await?.into_inner(); Ok(resp.into()) } /// Fetches an SLRU segment. pub async fn get_slru_segment( &mut self, req: GetSlruSegmentRequest, ) -> tonic::Result { let req = proto::GetSlruSegmentRequest::from(req); let resp = self.inner.get_slru_segment(req).await?.into_inner(); Ok(resp.try_into()?) } /// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't /// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards. /// /// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be /// acquired because the LSN has already been garbage collected. pub async fn lease_lsn(&mut self, req: LeaseLsnRequest) -> tonic::Result { let req = proto::LeaseLsnRequest::from(req); let resp = self.inner.lease_lsn(req).await?.into_inner(); Ok(resp.try_into()?) } } /// Adds authentication metadata to gRPC requests. 
#[derive(Clone)]
struct AuthInterceptor {
    tenant_id: AsciiMetadataValue,
    timeline_id: AsciiMetadataValue,
    shard_id: AsciiMetadataValue,
    auth_header: Option, // including "Bearer " prefix
}

impl AuthInterceptor {
    /// Builds an interceptor from the given IDs and optional auth token. Each value is
    /// converted to an ASCII metadata value up front; an error is returned if any of the
    /// conversions fails. The token, if given, is stored with a "Bearer " prefix.
    fn new(
        tenant_id: TenantId,
        timeline_id: TimelineId,
        shard_id: ShardIndex,
        auth_token: Option,
    ) -> anyhow::Result {
        Ok(Self {
            tenant_id: tenant_id.to_string().try_into()?,
            timeline_id: timeline_id.to_string().try_into()?,
            shard_id: shard_id.to_string().try_into()?,
            // Option of Result -> Result of Option, so that a bad token fails construction.
            auth_header: auth_token
                .map(|token| format!("Bearer {token}").try_into())
                .transpose()?,
        })
    }
}

impl Interceptor for AuthInterceptor {
    /// Inserts the `neon-tenant-id`, `neon-timeline-id` and `neon-shard-id` metadata entries
    /// into the request, plus an `authorization` entry when an auth header is configured.
    fn call(&mut self, mut req: tonic::Request<()>) -> tonic::Result> {
        let metadata = req.metadata_mut();
        metadata.insert("neon-tenant-id", self.tenant_id.clone());
        metadata.insert("neon-timeline-id", self.timeline_id.clone());
        metadata.insert("neon-shard-id", self.shard_id.clone());
        if let Some(ref auth_header) = self.auth_header {
            metadata.insert("authorization", auth_header.clone());
        }
        Ok(req)
    }
}

================================================
FILE: pageserver/page_api/src/lib.rs
================================================
//! This crate provides the Pageserver's page API. It contains:
//!
//! * proto/page_service.proto: the Protobuf schema for the page API.
//! * proto: auto-generated Protobuf types for gRPC.
//!
//! This crate is used by both the client and the server. Try to keep it slim.

// Code generated by protobuf.
pub mod proto {
    tonic::include_proto!("page_api");

    /// File descriptor set for Protobuf schema reflection. This allows using
    /// e.g. grpcurl with the API.
    pub const FILE_DESCRIPTOR_SET: &[u8] =
        tonic::include_file_descriptor_set!("page_api_descriptor");

    // Re-export the generated client and server types directly from `proto`.
    pub use page_service_client::PageServiceClient;
    pub use page_service_server::{PageService, PageServiceServer};
}

mod client;
mod model;
mod split;

// The submodules are private; their public items are re-exported at the crate root.
pub use client::Client;
pub use model::*;
pub use split::{GetPageSplitter, SplitError};

================================================
FILE: pageserver/page_api/src/model.rs
================================================
//! Structs representing the canonical page service API.
//!
//! These mirror the autogenerated Protobuf types. The differences are:
//!
//! - Types that are in fact required by the API are not Options. The protobuf "required"
//!   attribute is deprecated and 'prost' marks a lot of members as optional because of that.
//!   (See for a gripe on this)
//!
//! - Use more precise datatypes, e.g. Lsn and uints shorter than 32 bits.
//!
//! - Validate protocol invariants, via try_from() and try_into().
//!
//! Validation only happens on the receiver side, i.e. when converting from Protobuf to domain
//! types. This is where it matters -- the Protobuf types are less strict than the domain types, and
//! receivers should expect all sorts of junk from senders. This also allows the sender to use e.g.
//! stream combinators without dealing with errors, and avoids validating the same message twice.

use std::fmt::Display;
use std::time::{Duration, SystemTime, UNIX_EPOCH};

use bytes::Bytes;
use postgres_ffi_types::Oid;
// TODO: split out Lsn, RelTag, SlruKind and other basic types to a separate crate, to avoid
// pulling in all of their other crate dependencies when building the client.
use utils::lsn::Lsn;

use crate::proto;

/// A protocol error. Typically returned via try_from() or try_into().
#[derive(thiserror::Error, Clone, Debug)]
pub enum ProtocolError {
    /// A field held a value that failed validation. Carries the field name and a string
    /// rendering of the offending value.
    #[error("field '{0}' has invalid value '{1}'")]
    Invalid(&'static str, String),
    /// A required field was absent. Carries the field name.
    #[error("required field '{0}' is missing")]
    Missing(&'static str),
}

impl ProtocolError {
    /// Helper to generate a new ProtocolError::Invalid for the given field and value.
    /// The value is rendered with its Debug representation.
    pub fn invalid(field: &'static str, value: impl std::fmt::Debug) -> Self {
        Self::Invalid(field, format!("{value:?}"))
    }
}

/// Protocol errors are reported as gRPC InvalidArgument statuses carrying the error message.
impl From for tonic::Status {
    fn from(err: ProtocolError) -> Self {
        tonic::Status::invalid_argument(format!("{err}"))
    }
}

/// The LSN a request should read at.
#[derive(Clone, Copy, Debug, Default)]
pub struct ReadLsn {
    /// The request's read LSN.
    pub request_lsn: Lsn,
    /// If given, the caller guarantees that the page has not been modified since this LSN. Must be
    /// smaller than or equal to request_lsn. This allows the Pageserver to serve an old page
    /// without waiting for the request LSN to arrive. If not given, the request will read at the
    /// request_lsn and wait for it to arrive if necessary. Valid for all request types.
    ///
    /// It is undefined behaviour to make a request such that the page was, in fact, modified
    /// between request_lsn and not_modified_since_lsn. The Pageserver might detect it and return an
    /// error, or it might return the old page version or the new page version. Setting
    /// not_modified_since_lsn equal to request_lsn is always safe, but can lead to unnecessary
    /// waiting.
    pub not_modified_since_lsn: Option,
}

/// Formats as `<request_lsn>>=<not_modified_since_lsn>` when the latter is set, otherwise as
/// the request LSN alone.
impl Display for ReadLsn {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let req_lsn = self.request_lsn;
        if let Some(mod_lsn) = self.not_modified_since_lsn {
            write!(f, "{req_lsn}>={mod_lsn}")
        } else {
            req_lsn.fmt(f)
        }
    }
}

/// Validates a Protobuf ReadLsn: request_lsn must be non-zero, and not_modified_since_lsn must
/// not exceed request_lsn. A not_modified_since_lsn of 0 is interpreted as "not given".
impl TryFrom for ReadLsn {
    type Error = ProtocolError;

    fn try_from(pb: proto::ReadLsn) -> Result {
        if pb.request_lsn == 0 {
            return Err(ProtocolError::invalid("request_lsn", pb.request_lsn));
        }
        if pb.not_modified_since_lsn > pb.request_lsn {
            return Err(ProtocolError::invalid(
                "not_modified_since_lsn",
                pb.not_modified_since_lsn,
            ));
        }
        Ok(Self {
            request_lsn: Lsn(pb.request_lsn),
            not_modified_since_lsn: match pb.not_modified_since_lsn {
                0 => None,
                lsn => Some(Lsn(lsn)),
            },
        })
    }
}

/// Encodes a missing not_modified_since_lsn as the default Lsn value on the wire.
impl From for proto::ReadLsn {
    fn from(read_lsn: ReadLsn) -> Self {
        Self {
            request_lsn: read_lsn.request_lsn.0,
            not_modified_since_lsn: read_lsn.not_modified_since_lsn.unwrap_or_default().0,
        }
    }
}

// RelTag is defined in pageserver_api::reltag.
pub type RelTag = pageserver_api::reltag::RelTag;

/// Converts a Protobuf RelTag. The fork number is converted with try_into(); a value that does
/// not fit is rejected as invalid.
impl TryFrom for RelTag {
    type Error = ProtocolError;

    fn try_from(pb: proto::RelTag) -> Result {
        Ok(Self {
            spcnode: pb.spc_oid,
            dbnode: pb.db_oid,
            relnode: pb.rel_number,
            forknum: pb
                .fork_number
                .try_into()
                .map_err(|_| ProtocolError::invalid("fork_number", pb.fork_number))?,
        })
    }
}

impl From for proto::RelTag {
    fn from(rel_tag: RelTag) -> Self {
        Self {
            spc_oid: rel_tag.spcnode,
            db_oid: rel_tag.dbnode,
            rel_number: rel_tag.relnode,
            fork_number: rel_tag.forknum as u32,
        }
    }
}

/// Requests a base backup.
#[derive(Clone, Copy, Debug)]
pub struct GetBaseBackupRequest {
    /// The LSN to fetch a base backup at. If None, uses the latest LSN known to the Pageserver.
    pub lsn: Option,
    /// If true, logical replication slots will not be created.
    pub replica: bool,
    /// If true, include relation files in the base backup. Mainly for debugging and tests.
    pub full: bool,
    /// Compression algorithm to use. Base backups send a compressed payload instead of using gRPC
    /// compression, so that we can cache compressed backups on the server.
    pub compression: BaseBackupCompression,
}

/// Converts a Protobuf base backup request. An lsn of 0 is interpreted as "not given"; an
/// unknown or unrecognized compression value is rejected.
impl TryFrom for GetBaseBackupRequest {
    type Error = ProtocolError;

    fn try_from(pb: proto::GetBaseBackupRequest) -> Result {
        Ok(Self {
            lsn: (pb.lsn != 0).then_some(Lsn(pb.lsn)),
            replica: pb.replica,
            full: pb.full,
            compression: pb.compression.try_into()?,
        })
    }
}

/// Encodes a missing lsn as the default Lsn value on the wire.
impl From for proto::GetBaseBackupRequest {
    fn from(request: GetBaseBackupRequest) -> Self {
        Self {
            lsn: request.lsn.unwrap_or_default().0,
            replica: request.replica,
            full: request.full,
            compression: request.compression.into(),
        }
    }
}

/// Base backup compression algorithm.
#[derive(Clone, Copy, Debug)]
pub enum BaseBackupCompression {
    None,
    Gzip,
}

/// The Protobuf `Unknown` variant has no domain equivalent and is rejected as invalid.
impl TryFrom for BaseBackupCompression {
    type Error = ProtocolError;

    fn try_from(pb: proto::BaseBackupCompression) -> Result {
        match pb {
            proto::BaseBackupCompression::Unknown => Err(ProtocolError::invalid("compression", pb)),
            proto::BaseBackupCompression::None => Ok(Self::None),
            proto::BaseBackupCompression::Gzip => Ok(Self::Gzip),
        }
    }
}

/// Converts the raw integer enum value: first into the Protobuf enum (rejecting integers that
/// don't correspond to a variant), then into the domain type.
impl TryFrom for BaseBackupCompression {
    type Error = ProtocolError;

    fn try_from(compression: i32) -> Result {
        proto::BaseBackupCompression::try_from(compression)
            .map_err(|_| ProtocolError::invalid("compression", compression))
            .and_then(Self::try_from)
    }
}

impl From for proto::BaseBackupCompression {
    fn from(compression: BaseBackupCompression) -> Self {
        match compression {
            BaseBackupCompression::None => Self::None,
            BaseBackupCompression::Gzip => Self::Gzip,
        }
    }
}

impl From for i32 {
    fn from(compression: BaseBackupCompression) -> Self {
        proto::BaseBackupCompression::from(compression).into()
    }
}

pub type GetBaseBackupResponseChunk = Bytes;

/// An empty chunk is rejected as a missing field.
impl TryFrom for GetBaseBackupResponseChunk {
    type Error = ProtocolError;

    fn try_from(pb: proto::GetBaseBackupResponseChunk) -> Result {
        if pb.chunk.is_empty() {
            return Err(ProtocolError::Missing("chunk"));
        }
        Ok(pb.chunk)
    }
}
impl From for proto::GetBaseBackupResponseChunk {
    fn from(chunk: GetBaseBackupResponseChunk) -> Self {
        Self { chunk }
    }
}

/// Requests the size of a database, as # of bytes. Only valid on shard 0, other shards will error.
#[derive(Clone, Copy, Debug)]
pub struct GetDbSizeRequest {
    pub read_lsn: ReadLsn,
    pub db_oid: Oid,
}

/// The read_lsn field is required and is itself validated.
impl TryFrom for GetDbSizeRequest {
    type Error = ProtocolError;

    fn try_from(pb: proto::GetDbSizeRequest) -> Result {
        Ok(Self {
            read_lsn: pb
                .read_lsn
                .ok_or(ProtocolError::Missing("read_lsn"))?
                .try_into()?,
            db_oid: pb.db_oid,
        })
    }
}

impl From for proto::GetDbSizeRequest {
    fn from(request: GetDbSizeRequest) -> Self {
        Self {
            read_lsn: Some(request.read_lsn.into()),
            db_oid: request.db_oid,
        }
    }
}

pub type GetDbSizeResponse = u64;

impl From for GetDbSizeResponse {
    fn from(pb: proto::GetDbSizeResponse) -> Self {
        pb.num_bytes
    }
}

impl From for proto::GetDbSizeResponse {
    fn from(num_bytes: GetDbSizeResponse) -> Self {
        Self { num_bytes }
    }
}

/// Requests one or more pages.
#[derive(Clone, Debug, Default)]
pub struct GetPageRequest {
    /// A request ID. Will be included in the response. Should be unique for in-flight requests on
    /// the stream.
    pub request_id: RequestID,
    /// The request class.
    pub request_class: GetPageClass,
    /// The LSN to read at.
    pub read_lsn: ReadLsn,
    /// The relation to read from.
    pub rel: RelTag,
    /// Page numbers to read. Must belong to the remote shard.
    ///
    /// Multiple pages will be executed as a single batch by the Pageserver, amortizing layer access
    /// costs and parallelizing them. This may increase the latency of any individual request, but
    /// improves the overall latency and throughput of the batch as a whole.
    pub block_numbers: Vec,
}

/// Validates a Protobuf GetPageRequest: at least one block number must be given, and the
/// request_id, read_lsn and rel fields are required.
impl TryFrom for GetPageRequest {
    type Error = ProtocolError;

    fn try_from(pb: proto::GetPageRequest) -> Result {
        if pb.block_number.is_empty() {
            return Err(ProtocolError::Missing("block_number"));
        }
        Ok(Self {
            request_id: pb
                .request_id
                .ok_or(ProtocolError::Missing("request_id"))?
                .into(),
            request_class: pb.request_class.into(),
            read_lsn: pb
                .read_lsn
                .ok_or(ProtocolError::Missing("read_lsn"))?
                .try_into()?,
            rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?,
            block_numbers: pb.block_number,
        })
    }
}

impl From for proto::GetPageRequest {
    fn from(request: GetPageRequest) -> Self {
        Self {
            request_id: Some(request.request_id.into()),
            request_class: request.request_class.into(),
            read_lsn: Some(request.read_lsn.into()),
            rel: Some(request.rel.into()),
            block_number: request.block_numbers,
        }
    }
}

/// A GetPage request ID and retry attempt. Should be unique for in-flight requests on a stream.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct RequestID {
    /// The base request ID.
    pub id: u64,
    /// The request attempt. Starts at 0, incremented on each retry.
    pub attempt: u32,
}

impl RequestID {
    /// Creates a new RequestID with the given ID and an initial attempt of 0.
    pub fn new(id: u64) -> Self {
        Self { id, attempt: 0 }
    }
}

/// Formats as `<id>.<attempt>`.
impl Display for RequestID {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}.{}", self.id, self.attempt)
    }
}

impl From for RequestID {
    fn from(pb: proto::RequestId) -> Self {
        Self {
            id: pb.id,
            attempt: pb.attempt,
        }
    }
}

/// Builds a RequestID from a bare ID, with attempt 0.
impl From for RequestID {
    fn from(id: u64) -> Self {
        Self::new(id)
    }
}

impl From for proto::RequestId {
    fn from(request_id: RequestID) -> Self {
        Self {
            id: request_id.id,
            attempt: request_id.attempt,
        }
    }
}

/// A GetPage request class.
#[derive(Clone, Copy, Debug, Default, strum_macros::Display)]
pub enum GetPageClass {
    /// Unknown class. For backwards compatibility: used when an older client version sends a class
    /// that a newer server version has removed.
    Unknown,
    /// A normal request. This is the default.
    #[default]
    Normal,
    /// A prefetch request. NB: can only be classified on pg < 18.
    Prefetch,
    /// A background request (e.g. vacuum).
    Background,
}

impl From for GetPageClass {
    fn from(pb: proto::GetPageClass) -> Self {
        match pb {
            proto::GetPageClass::Unknown => Self::Unknown,
            proto::GetPageClass::Normal => Self::Normal,
            proto::GetPageClass::Prefetch => Self::Prefetch,
            proto::GetPageClass::Background => Self::Background,
        }
    }
}

/// Converts the raw integer enum value. Integers that don't correspond to a Protobuf variant
/// fall back to Unknown rather than failing.
impl From for GetPageClass {
    fn from(class: i32) -> Self {
        proto::GetPageClass::try_from(class)
            .unwrap_or(proto::GetPageClass::Unknown)
            .into()
    }
}

impl From for proto::GetPageClass {
    fn from(class: GetPageClass) -> Self {
        match class {
            GetPageClass::Unknown => Self::Unknown,
            GetPageClass::Normal => Self::Normal,
            GetPageClass::Prefetch => Self::Prefetch,
            GetPageClass::Background => Self::Background,
        }
    }
}

impl From for i32 {
    fn from(class: GetPageClass) -> Self {
        proto::GetPageClass::from(class).into()
    }
}

/// A GetPage response.
///
/// A batch response will contain all of the requested pages. We could eagerly emit individual pages
/// as soon as they are ready, but on a readv() Postgres holds buffer pool locks on all pages in the
/// batch and we'll only return once the entire batch is ready, so no one can make use of the
/// individual pages.
#[derive(Clone, Debug)]
pub struct GetPageResponse {
    /// The original request's ID.
    pub request_id: RequestID,
    /// The response status code. If not OK, the `rel` and `pages` fields will be empty.
    pub status_code: GetPageStatusCode,
    /// A string describing the status, if any.
    pub reason: Option,
    /// The relation that the pages belong to.
    pub rel: RelTag,
    /// The page(s), in the same order as the request.
    pub pages: Vec,
}

/// Converts a Protobuf GetPageResponse. The request_id and rel fields are required; an empty
/// reason string is interpreted as "no reason".
impl TryFrom for GetPageResponse {
    type Error = ProtocolError;

    fn try_from(pb: proto::GetPageResponse) -> Result {
        Ok(Self {
            request_id: pb
                .request_id
                .ok_or(ProtocolError::Missing("request_id"))?
                .into(),
            status_code: pb.status_code.into(),
            reason: Some(pb.reason).filter(|r| !r.is_empty()),
            rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?,
            pages: pb.page.into_iter().map(Page::from).collect(),
        })
    }
}

/// Encodes a missing reason as an empty string on the wire.
impl From for proto::GetPageResponse {
    fn from(response: GetPageResponse) -> Self {
        Self {
            request_id: Some(response.request_id.into()),
            status_code: response.status_code.into(),
            reason: response.reason.unwrap_or_default(),
            rel: Some(response.rel.into()),
            page: response.pages.into_iter().map(proto::Page::from).collect(),
        }
    }
}

impl GetPageResponse {
    /// Attempts to represent a tonic::Status as a GetPageResponse if appropriate. Returning a
    /// tonic::Status will terminate the GetPage stream, so per-request errors are emitted as a
    /// GetPageResponse with a non-OK status code instead.
    #[allow(clippy::result_large_err)]
    pub fn try_from_status(
        status: tonic::Status,
        request_id: RequestID,
    ) -> Result {
        // We shouldn't see an OK status here, because we're emitting an error.
        debug_assert_ne!(status.code(), tonic::Code::Ok);
        // Without debug assertions the check above is compiled out, so report an internal
        // error instead.
        if status.code() == tonic::Code::Ok {
            return Err(tonic::Status::internal(format!(
                "unexpected OK status: {status:?}",
            )));
        }

        // If we can't convert the tonic::Code to a GetPageStatusCode, this is not a per-request
        // error and we should return a tonic::Status to terminate the stream.
        let Ok(status_code) = status.code().try_into() else {
            return Err(status);
        };

        // Return a GetPageResponse for the status.
        Ok(Self {
            request_id,
            status_code,
            reason: Some(status.message().to_string()),
            rel: RelTag::default(),
            pages: Vec::new(),
        })
    }
}

/// A page.
#[derive(Clone, Debug)]
pub struct Page {
    /// The page number.
    pub block_number: u32,
    /// The materialized page image, as an 8KB byte vector.
    pub image: Bytes,
}

impl From for Page {
    fn from(pb: proto::Page) -> Self {
        Self {
            block_number: pb.block_number,
            image: pb.image,
        }
    }
}

impl From for proto::Page {
    fn from(page: Page) -> Self {
        Self {
            block_number: page.block_number,
            image: page.image,
        }
    }
}

/// A GetPage response status code.
///
/// These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream
/// (potentially shared by many backends), and a gRPC status response would terminate the stream so
/// we send GetPageResponse messages with these codes instead.
#[derive(Clone, Copy, Debug, PartialEq, strum_macros::Display)]
pub enum GetPageStatusCode {
    /// Unknown status. For forwards compatibility: used when an older client version receives a new
    /// status code from a newer server version.
    Unknown,
    /// The request was successful.
    Ok,
    /// The page did not exist. The tenant/timeline/shard has already been validated during stream
    /// setup.
    NotFound,
    /// The request was invalid.
    InvalidRequest,
    /// The request failed due to an internal server error.
    InternalError,
    /// The tenant is rate limited. Slow down and retry later.
    SlowDown,
}

impl From for GetPageStatusCode {
    fn from(pb: proto::GetPageStatusCode) -> Self {
        match pb {
            proto::GetPageStatusCode::Unknown => Self::Unknown,
            proto::GetPageStatusCode::Ok => Self::Ok,
            proto::GetPageStatusCode::NotFound => Self::NotFound,
            proto::GetPageStatusCode::InvalidRequest => Self::InvalidRequest,
            proto::GetPageStatusCode::InternalError => Self::InternalError,
            proto::GetPageStatusCode::SlowDown => Self::SlowDown,
        }
    }
}

/// Converts the raw integer enum value. Integers that don't correspond to a Protobuf variant
/// fall back to Unknown rather than failing.
impl From for GetPageStatusCode {
    fn from(status_code: i32) -> Self {
        proto::GetPageStatusCode::try_from(status_code)
            .unwrap_or(proto::GetPageStatusCode::Unknown)
            .into()
    }
}

impl From for proto::GetPageStatusCode {
    fn from(status_code: GetPageStatusCode) -> Self {
        match status_code {
            GetPageStatusCode::Unknown => Self::Unknown,
            GetPageStatusCode::Ok => Self::Ok,
            GetPageStatusCode::NotFound => Self::NotFound,
            GetPageStatusCode::InvalidRequest => Self::InvalidRequest,
            GetPageStatusCode::InternalError => Self::InternalError,
            GetPageStatusCode::SlowDown => Self::SlowDown,
        }
    }
}

impl From for i32 {
    fn from(status_code: GetPageStatusCode) -> Self {
        proto::GetPageStatusCode::from(status_code).into()
    }
}

/// Maps a tonic::Code to a per-request status code. Codes that should terminate the stream
/// are handed back unchanged as the error value.
impl TryFrom for GetPageStatusCode {
    type Error = tonic::Code;

    fn try_from(code: tonic::Code) -> Result {
        use tonic::Code;

        let status_code = match code {
            Code::Ok => Self::Ok,

            // These are per-request errors, which should be returned as GetPageResponses.
            Code::AlreadyExists => Self::InvalidRequest,
            Code::DataLoss => Self::InternalError,
            Code::FailedPrecondition => Self::InvalidRequest,
            Code::InvalidArgument => Self::InvalidRequest,
            Code::Internal => Self::InternalError,
            Code::NotFound => Self::NotFound,
            Code::OutOfRange => Self::InvalidRequest,
            Code::ResourceExhausted => Self::SlowDown,

            // These should terminate the stream by returning a tonic::Status.
            Code::Aborted
            | Code::Cancelled
            | Code::DeadlineExceeded
            | Code::PermissionDenied
            | Code::Unauthenticated
            | Code::Unavailable
            | Code::Unimplemented
            | Code::Unknown => return Err(code),
        };
        Ok(status_code)
    }
}

/// Maps a status code back to a tonic::Code. This is not an exact inverse of the conversion
/// above, which folds several tonic codes into one status code: e.g. InvalidRequest always
/// maps to InvalidArgument.
impl From for tonic::Code {
    fn from(status_code: GetPageStatusCode) -> Self {
        use tonic::Code;

        match status_code {
            GetPageStatusCode::Unknown => Code::Unknown,
            GetPageStatusCode::Ok => Code::Ok,
            GetPageStatusCode::NotFound => Code::NotFound,
            GetPageStatusCode::InvalidRequest => Code::InvalidArgument,
            GetPageStatusCode::InternalError => Code::Internal,
            GetPageStatusCode::SlowDown => Code::ResourceExhausted,
        }
    }
}

/// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on shard 0, other
/// shards will error.
#[derive(Clone, Copy, Debug)]
pub struct GetRelSizeRequest {
    pub read_lsn: ReadLsn,
    pub rel: RelTag,
    /// If true, return missing=true for missing relations instead of a NotFound error.
    pub allow_missing: bool,
}

/// The read_lsn and rel fields are required and are themselves validated.
impl TryFrom for GetRelSizeRequest {
    type Error = ProtocolError;

    fn try_from(proto: proto::GetRelSizeRequest) -> Result {
        Ok(Self {
            read_lsn: proto
                .read_lsn
                .ok_or(ProtocolError::Missing("read_lsn"))?
                .try_into()?,
            rel: proto.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?,
            allow_missing: proto.allow_missing,
        })
    }
}

impl From for proto::GetRelSizeRequest {
    fn from(request: GetRelSizeRequest) -> Self {
        Self {
            read_lsn: Some(request.read_lsn.into()),
            rel: Some(request.rel.into()),
            allow_missing: request.allow_missing,
        }
    }
}

/// The size of a relation as number of blocks, or None if `allow_missing=true` and the relation
/// does not exist.
///
/// INVARIANT: never None if `allow_missing=false` (returns `NotFound` error instead).
pub type GetRelSizeResponse = Option; impl From for GetRelSizeResponse { fn from(pb: proto::GetRelSizeResponse) -> Self { (!pb.missing).then_some(pb.num_blocks) } } impl From for proto::GetRelSizeResponse { fn from(resp: GetRelSizeResponse) -> Self { Self { num_blocks: resp.unwrap_or_default(), missing: resp.is_none(), } } } /// Requests an SLRU segment. Only valid on shard 0, other shards will error. #[derive(Clone, Copy, Debug)] pub struct GetSlruSegmentRequest { pub read_lsn: ReadLsn, pub kind: SlruKind, pub segno: u32, } impl TryFrom for GetSlruSegmentRequest { type Error = ProtocolError; fn try_from(pb: proto::GetSlruSegmentRequest) -> Result { Ok(Self { read_lsn: pb .read_lsn .ok_or(ProtocolError::Missing("read_lsn"))? .try_into()?, kind: u8::try_from(pb.kind) .ok() .and_then(SlruKind::from_repr) .ok_or_else(|| ProtocolError::invalid("slru_kind", pb.kind))?, segno: pb.segno, }) } } impl From for proto::GetSlruSegmentRequest { fn from(request: GetSlruSegmentRequest) -> Self { Self { read_lsn: Some(request.read_lsn.into()), kind: request.kind as u32, segno: request.segno, } } } pub type GetSlruSegmentResponse = Bytes; impl TryFrom for GetSlruSegmentResponse { type Error = ProtocolError; fn try_from(pb: proto::GetSlruSegmentResponse) -> Result { if pb.segment.is_empty() { return Err(ProtocolError::Missing("segment")); } Ok(pb.segment) } } impl From for proto::GetSlruSegmentResponse { fn from(segment: GetSlruSegmentResponse) -> Self { Self { segment } } } // SlruKind is defined in pageserver_api::reltag. pub type SlruKind = pageserver_api::reltag::SlruKind; /// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage /// collect the LSN until the lease expires. pub struct LeaseLsnRequest { /// The LSN to lease. 
pub lsn: Lsn, } impl TryFrom for LeaseLsnRequest { type Error = ProtocolError; fn try_from(pb: proto::LeaseLsnRequest) -> Result { if pb.lsn == 0 { return Err(ProtocolError::Missing("lsn")); } Ok(Self { lsn: Lsn(pb.lsn) }) } } impl From for proto::LeaseLsnRequest { fn from(request: LeaseLsnRequest) -> Self { Self { lsn: request.lsn.0 } } } /// Lease expiration time. If the lease could not be granted because the LSN has already been /// garbage collected, a FailedPrecondition status will be returned instead. pub type LeaseLsnResponse = SystemTime; impl TryFrom for LeaseLsnResponse { type Error = ProtocolError; fn try_from(pb: proto::LeaseLsnResponse) -> Result { let expires = pb.expires.ok_or(ProtocolError::Missing("expires"))?; UNIX_EPOCH .checked_add(Duration::new(expires.seconds as u64, expires.nanos as u32)) .ok_or_else(|| ProtocolError::invalid("expires", expires)) } } impl From for proto::LeaseLsnResponse { fn from(response: LeaseLsnResponse) -> Self { let expires = response.duration_since(UNIX_EPOCH).unwrap_or_default(); Self { expires: Some(prost_types::Timestamp { seconds: expires.as_secs() as i64, nanos: expires.subsec_nanos() as i32, }), } } } ================================================ FILE: pageserver/page_api/src/split.rs ================================================ use std::collections::HashMap; use bytes::Bytes; use crate::model::*; use pageserver_api::key::rel_block_to_key; use pageserver_api::shard::key_to_shard_number; use utils::shard::{ShardCount, ShardIndex, ShardStripeSize}; /// Splits GetPageRequests that straddle shard boundaries and assembles the responses. /// TODO: add tests for this. pub struct GetPageSplitter { /// Split requests by shard index. requests: HashMap, /// The response being assembled. Preallocated with empty pages, to be filled in. response: GetPageResponse, /// Maps the offset in `request.block_numbers` and `response.pages` to the owning shard. 
/// Used to assemble the response pages in the same order as the original request.
    block_shards: Vec,
}

impl GetPageSplitter {
    /// Checks if the given request only touches a single shard, and returns the shard ID. This is
    /// the common case, so we check first in order to avoid unnecessary allocations and overhead.
    ///
    /// Returns None if the request's pages map to more than one shard. Returns an error if the
    /// tenant is sharded and either no stripe size is given or the request has no block numbers.
    pub fn for_single_shard(
        req: &GetPageRequest,
        count: ShardCount,
        stripe_size: Option,
    ) -> Result, SplitError> {
        // Fast path: unsharded tenant.
        // NB: this returns before the request's block numbers are inspected.
        if count.is_unsharded() {
            return Ok(Some(ShardIndex::unsharded()));
        }

        let Some(stripe_size) = stripe_size else {
            return Err("stripe size must be given for sharded tenants".into());
        };

        // Find the first page's shard, for comparison.
        let Some(&first_page) = req.block_numbers.first() else {
            return Err("no block numbers in request".into());
        };
        let key = rel_block_to_key(req.rel, first_page);
        let shard_number = key_to_shard_number(count, stripe_size, &key);

        // The request is single-shard iff every remaining page maps to the same shard number.
        Ok(req
            .block_numbers
            .iter()
            .skip(1) // computed above
            .all(|&blkno| {
                let key = rel_block_to_key(req.rel, blkno);
                key_to_shard_number(count, stripe_size, &key) == shard_number
            })
            .then_some(ShardIndex::new(shard_number, count)))
    }

    /// Splits the given request.
    ///
    /// Returns an error if the tenant is unsharded, or if no stripe size is given.
    pub fn split(
        req: GetPageRequest,
        count: ShardCount,
        stripe_size: Option,
    ) -> Result {
        // The caller should make sure we don't split requests unnecessarily.
        debug_assert!(
            Self::for_single_shard(&req, count, stripe_size)?.is_none(),
            "unnecessary request split"
        );

        if count.is_unsharded() {
            return Err("unsharded tenant, no point in splitting request".into());
        }
        let Some(stripe_size) = stripe_size else {
            return Err("stripe size must be given for sharded tenants".into());
        };

        // Split the requests by shard index.
let mut requests = HashMap::with_capacity(2); // common case let mut block_shards = Vec::with_capacity(req.block_numbers.len()); for &blkno in &req.block_numbers { let key = rel_block_to_key(req.rel, blkno); let shard_number = key_to_shard_number(count, stripe_size, &key); let shard_id = ShardIndex::new(shard_number, count); requests .entry(shard_id) .or_insert_with(|| GetPageRequest { request_id: req.request_id, request_class: req.request_class, rel: req.rel, read_lsn: req.read_lsn, block_numbers: Vec::new(), }) .block_numbers .push(blkno); block_shards.push(shard_id); } // Construct a response to be populated by shard responses. Preallocate empty page slots // with the expected block numbers. let response = GetPageResponse { request_id: req.request_id, status_code: GetPageStatusCode::Ok, reason: None, rel: req.rel, pages: req .block_numbers .into_iter() .map(|block_number| { Page { block_number, image: Bytes::new(), // empty page slot to be filled in } }) .collect(), }; Ok(Self { requests, response, block_shards, }) } /// Drains the per-shard requests, moving them out of the splitter to avoid extra allocations. pub fn drain_requests(&mut self) -> impl Iterator { self.requests.drain() } /// Adds a response from the given shard. The response must match the request ID and have an OK /// status code. A response must not already exist for the given shard ID. pub fn add_response( &mut self, shard_id: ShardIndex, response: GetPageResponse, ) -> Result<(), SplitError> { // The caller should already have converted status codes into tonic::Status. 
if response.status_code != GetPageStatusCode::Ok { return Err(SplitError(format!( "unexpected non-OK response for shard {shard_id}: {} {}", response.status_code, response.reason.unwrap_or_default() ))); } if response.request_id != self.response.request_id { return Err(SplitError(format!( "response ID mismatch for shard {shard_id}: expected {}, got {}", self.response.request_id, response.request_id ))); } if response.request_id != self.response.request_id { return Err(SplitError(format!( "response ID mismatch for shard {shard_id}: expected {}, got {}", self.response.request_id, response.request_id ))); } // Place the shard response pages into the assembled response, in request order. let mut pages = response.pages.into_iter(); for (i, &s) in self.block_shards.iter().enumerate() { if shard_id != s { continue; } let Some(slot) = self.response.pages.get_mut(i) else { return Err(SplitError(format!( "no block_shards slot {i} for shard {shard_id}" ))); }; let Some(page) = pages.next() else { return Err(SplitError(format!( "missing page {} in shard {shard_id} response", slot.block_number ))); }; if page.block_number != slot.block_number { return Err(SplitError(format!( "shard {shard_id} returned wrong page at index {i}, expected {} got {}", slot.block_number, page.block_number ))); } if !slot.image.is_empty() { return Err(SplitError(format!( "shard {shard_id} returned duplicate page {} at index {i}", slot.block_number ))); } *slot = page; } // Make sure we've consumed all pages from the shard response. if let Some(extra_page) = pages.next() { return Err(SplitError(format!( "shard {shard_id} returned extra page: {}", extra_page.block_number ))); } Ok(()) } /// Collects the final, assembled response. pub fn collect_response(self) -> Result { // Check that the response is complete. 
for (i, page) in self.response.pages.iter().enumerate() {
            // A slot whose image is still empty has not been filled in.
            if page.image.is_empty() {
                return Err(SplitError(format!(
                    "missing page {} for shard {}",
                    page.block_number,
                    self.block_shards
                        .get(i)
                        .map(|s| s.to_string())
                        .unwrap_or_else(|| "?".to_string())
                )));
            }
        }

        Ok(self.response)
    }
}

/// A GetPageSplitter error.
#[derive(Debug, thiserror::Error)]
#[error("{0}")]
pub struct SplitError(String);

impl From<&str> for SplitError {
    fn from(err: &str) -> Self {
        SplitError(err.to_string())
    }
}

impl From for SplitError {
    fn from(err: String) -> Self {
        SplitError(err)
    }
}

/// Split errors are reported as gRPC Internal statuses carrying the error message.
impl From for tonic::Status {
    fn from(err: SplitError) -> Self {
        tonic::Status::internal(err.0)
    }
}

================================================
FILE: pageserver/pagebench/Cargo.toml
================================================
[package]
name = "pagebench"
version = "0.1.0"
edition.workspace = true
license.workspace = true

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow.workspace = true
async-trait.workspace = true
bytes.workspace = true
camino.workspace = true
clap.workspace = true
futures.workspace = true
hdrhistogram.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
pprof.workspace = true
rand.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json.workspace = true
tracing.workspace = true
tokio.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
tonic.workspace = true
url.workspace = true

pageserver_api.workspace = true
pageserver_client.workspace = true
pageserver_client_grpc.workspace = true
pageserver_page_api.workspace = true
utils = { path = "../../libs/utils/" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

================================================
FILE: pageserver/pagebench/src/cmd/aux_files.rs
================================================
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Instant;

use
pageserver_api::models::{TenantConfig, TenantConfigRequest}; use pageserver_api::shard::TenantShardId; use utils::id::TenantTimelineId; use utils::lsn::Lsn; /// Ingest aux files into the pageserver. #[derive(clap::Parser)] pub(crate) struct Args { #[clap(long, default_value = "http://localhost:9898")] mgmt_api_endpoint: String, #[clap(long, default_value = "postgres://postgres@localhost:64000")] page_service_connstring: String, #[clap(long)] pageserver_jwt: Option, targets: Option>, } pub(crate) fn main(args: Args) -> anyhow::Result<()> { let rt = tokio::runtime::Builder::new_multi_thread() .enable_all() .build() .unwrap(); let main_task = rt.spawn(main_impl(args)); rt.block_on(main_task).unwrap() } async fn main_impl(args: Args) -> anyhow::Result<()> { let args: &'static Args = Box::leak(Box::new(args)); let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( reqwest::Client::new(), // TODO: support ssl_ca_file for https APIs in pagebench. args.mgmt_api_endpoint.clone(), args.pageserver_jwt.as_deref(), )); // discover targets let timelines: Vec = crate::util::cli::targets::discover( &mgmt_api_client, crate::util::cli::targets::Spec { limit_to_first_n_targets: None, targets: { if let Some(targets) = &args.targets { if targets.len() != 1 { anyhow::bail!("must specify exactly one target"); } Some(targets.clone()) } else { None } }, }, ) .await?; let timeline = timelines[0]; let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id); let timeline_id = timeline.timeline_id; println!("operating on timeline {timeline}"); mgmt_api_client .set_tenant_config(&TenantConfigRequest { tenant_id: timeline.tenant_id, config: TenantConfig::default(), }) .await?; for batch in 0..100 { let items = (0..100) .map(|id| { ( format!("pg_logical/mappings/{batch:03}.{id:03}"), format!("{id:08}"), ) }) .collect::>(); let file_cnt = items.len(); mgmt_api_client .ingest_aux_files(tenant_shard_id, timeline_id, items) .await?; println!("ingested {file_cnt} files"); } 
for _ in 0..100 {
        // Time each listing of the aux files at the LSN just below `Lsn::MAX`.
        let start = Instant::now();
        let files = mgmt_api_client
            .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
            .await?;
        println!(
            "{} files found in {}s",
            files.len(),
            start.elapsed().as_secs_f64()
        );
    }

    anyhow::Ok(())
}

================================================
FILE: pageserver/pagebench/src/cmd/basebackup.rs
================================================
use std::collections::HashMap;
use std::num::NonZeroUsize;
use std::ops::Range;
use std::pin::Pin;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Instant;

use anyhow::anyhow;
use futures::TryStreamExt as _;
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
use pageserver_client::page_service::BasebackupRequest;
use pageserver_page_api as page_api;
use rand::prelude::*;
use tokio::io::AsyncRead;
use tokio::sync::Barrier;
use tokio::task::JoinSet;
use tokio_util::compat::{TokioAsyncReadCompatExt as _, TokioAsyncWriteCompatExt as _};
use tokio_util::io::StreamReader;
use tonic::async_trait;
use tracing::{info, instrument};
use url::Url;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
use utils::shard::ShardIndex;

use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
use crate::util::{request_stats, tokio_thread_local_stats};

// NOTE: `///` comments on this clap struct and its fields double as CLI help text.
/// basebackup@LatestLSN
#[derive(clap::Parser)]
pub(crate) struct Args {
    #[clap(long, default_value = "http://localhost:9898")]
    mgmt_api_endpoint: String,
    /// The Pageserver to connect to. Use postgresql:// for libpq, or grpc:// for gRPC.
#[clap(long, default_value = "postgresql://postgres@localhost:64000")]
    page_service_connstring: String,
    // NOTE(review): the generic parameters of the `Option` fields below appear to have been
    // stripped from this dump; restore them from the repository before compiling.
    #[clap(long)]
    pageserver_jwt: Option,
    #[clap(long, default_value = "1")]
    num_clients: NonZeroUsize,
    #[clap(long)]
    no_compression: bool,
    #[clap(long)]
    runtime: Option,
    #[clap(long)]
    limit_to_first_n_targets: Option,
    targets: Option>,
}

/// Counter of completed requests, sampled and reset by the periodic stats task.
#[derive(Debug, Default)]
struct LiveStats {
    completed_requests: AtomicU64,
}

impl LiveStats {
    /// Records one completed request.
    fn inc(&self) {
        self.completed_requests.fetch_add(1, Ordering::Relaxed);
    }
}

/// A timeline to request basebackups for, with the LSN range to draw request LSNs from.
struct Target {
    timeline: TenantTimelineId,
    lsn_range: Option>,
}

/// JSON-serialized benchmark result.
#[derive(serde::Serialize)]
struct Output {
    total: request_stats::Output,
}

tokio_thread_local_stats::declare!(STATS: request_stats::Stats);

/// Entry point: runs `main_impl` through the thread-local-stats `main!` macro.
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
    tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
        main_impl(args, thread_local_stats)
    })
}

/// Discovers targets, spawns one worker per timeline and feeds them randomly chosen work items.
async fn main_impl(
    args: Args,
    all_thread_local_stats: AllThreadLocalStats,
) -> anyhow::Result<()> {
    // Leak the arguments to obtain a `'static` reference.
    let args: &'static Args = Box::leak(Box::new(args));

    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
        reqwest::Client::new(), // TODO: support ssl_ca_file for https APIs in pagebench.
args.mgmt_api_endpoint.clone(),
        args.pageserver_jwt.as_deref(),
    ));

    // discover targets
    let timelines: Vec = crate::util::cli::targets::discover(
        &mgmt_api_client,
        crate::util::cli::targets::Spec {
            limit_to_first_n_targets: args.limit_to_first_n_targets,
            targets: args.targets.clone(),
        },
    )
    .await?;
    let mut js = JoinSet::new();
    for timeline in &timelines {
        js.spawn({
            let timeline = *timeline;
            // Run the lookup inside the spawned task so that all timelines are queried
            // concurrently. Awaiting it here, before the `async move` block, would serialize the
            // requests and make the `JoinSet` pointless.
            let mgmt_api_client = Arc::clone(&mgmt_api_client);
            async move {
                let info = mgmt_api_client
                    .timeline_info(
                        TenantShardId::unsharded(timeline.tenant_id),
                        timeline.timeline_id,
                        ForceAwaitLogicalSize::No,
                    )
                    .await?;
                anyhow::Ok(Target {
                    timeline,
                    // TODO: support lsn_range != latest LSN
                    lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)),
                })
            }
        });
    }
    // Collect the targets; a failed lookup or a panicked task aborts the benchmark here.
    let mut all_targets: Vec = Vec::new();
    while let Some(res) = js.join_next().await {
        all_targets.push(res.unwrap().unwrap());
    }

    let live_stats = Arc::new(LiveStats::default());

    // One worker task is spawned per timeline (see the loop below).
    let num_client_tasks = timelines.len();
    let num_live_stats_dump = 1;
    let num_work_sender_tasks = 1;

    // Everyone (workers, stats dumper, work sender) starts at the same time.
    let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
        num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
    ));
    let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));

    // Background task: log the request rate roughly once per second, resetting the counter.
    tokio::spawn({
        let stats = Arc::clone(&live_stats);
        let start_work_barrier = Arc::clone(&start_work_barrier);
        async move {
            start_work_barrier.wait().await;
            loop {
                let start = std::time::Instant::now();
                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
                let elapsed = start.elapsed();
                info!(
                    "RPS: {:.0}",
                    completed_requests as f64 / elapsed.as_secs_f64()
                );
            }
        }
    });

    let mut work_senders = HashMap::new();
    let mut tasks = Vec::new();

    // A connstring without a URL scheme is treated as a libpq connstring.
    let scheme = match Url::parse(&args.page_service_connstring) {
        Ok(url) => url.scheme().to_lowercase().to_string(),
        Err(url::ParseError::RelativeUrlWithoutBase) => "postgresql".to_string(),
        Err(err) => return Err(anyhow!("invalid connstring: {err}")),
    };

    for &tl in &timelines
{
        let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
        work_senders.insert(tl, sender);

        // Pick the client implementation from the connstring scheme.
        let client: Box = match scheme.as_str() {
            "postgresql" | "postgres" => Box::new(
                LibpqClient::new(&args.page_service_connstring, tl, !args.no_compression).await?,
            ),
            "grpc" => Box::new(
                GrpcClient::new(&args.page_service_connstring, tl, !args.no_compression).await?,
            ),
            scheme => return Err(anyhow!("invalid scheme {scheme}")),
        };

        tasks.push(tokio::spawn(run_worker(
            client,
            Arc::clone(&start_work_barrier),
            receiver,
            Arc::clone(&all_work_done_barrier),
            Arc::clone(&live_stats),
        )));
    }

    // Endless producer: picks a random target and LSN and hands the work item to that timeline's
    // worker. Panics if `all_targets` is empty or if the receiving worker has gone away.
    let work_sender = async move {
        start_work_barrier.wait().await;
        loop {
            let (timeline, work) = {
                let mut rng = rand::rng();
                let target = all_targets.choose(&mut rng).unwrap();
                let lsn = target.lsn_range.clone().map(|r| rng.random_range(r));
                (target.timeline, Work { lsn })
            };
            let sender = work_senders.get(&timeline).unwrap();
            // TODO: what if this blocks?
            sender.send(work).await.ok().unwrap();
        }
    };

    if let Some(runtime) = args.runtime {
        match tokio::time::timeout(runtime.into(), work_sender).await {
            Ok(()) => unreachable!("work sender never terminates"),
            Err(_timeout) => {
                // this implicitly drops the work_senders, making all the clients exit
            }
        }
    } else {
        work_sender.await;
        unreachable!("work sender never terminates");
    }

    for t in tasks {
        t.await.unwrap();
    }

    // Merge the per-thread stats into a single result and print it as JSON.
    let output = Output {
        total: {
            let mut agg_stats = request_stats::Stats::new();
            for stats in all_thread_local_stats.lock().unwrap().iter() {
                let stats = stats.lock().unwrap();
                agg_stats.add(&stats);
            }
            agg_stats.output()
        },
    };

    let output = serde_json::to_string_pretty(&output).unwrap();
    println!("{output}");

    anyhow::Ok(())
}

/// A single basebackup request to issue.
// NOTE(review): the generic parameters of `Option`, `Box`, `Arc` and `Receiver` below appear to
// have been stripped from this dump.
#[derive(Copy, Clone)]
struct Work {
    lsn: Option,
}

/// Worker loop: waits for the start barrier, then serves work items until the channel closes.
#[instrument(skip_all)]
async fn run_worker(
    mut client: Box,
    start_work_barrier: Arc,
    mut work: tokio::sync::mpsc::Receiver,
    all_work_done_barrier: Arc,
    live_stats: Arc,
) {
    start_work_barrier.wait().await;

    while let Some(Work
{ lsn }) = work.recv().await {
        let start = Instant::now();
        let stream = client.basebackup(lsn).await.unwrap();

        // Drain the basebackup into a sink; only the byte count is used.
        let size = futures::io::copy(stream.compat(), &mut tokio::io::sink().compat_write())
            .await
            .unwrap();
        info!("basebackup size is {size} bytes");
        let elapsed = start.elapsed();
        live_stats.inc();
        STATS.with(|stats| {
            stats.borrow().lock().unwrap().observe(elapsed).unwrap();
        });
    }

    // The work channel was closed; wait for the other workers to finish too.
    all_work_done_barrier.wait().await;
}

/// A basebackup client. This allows switching out the client protocol implementation.
// NOTE(review): the `Option` and `anyhow::Result>>` types below have lost their generic
// parameters in this dump.
#[async_trait]
trait Client: Send {
    /// Requests a basebackup at `lsn` and returns a reader over its contents.
    async fn basebackup(
        &mut self,
        lsn: Option,
    ) -> anyhow::Result>>;
}

/// A libpq-based Pageserver client.
struct LibpqClient {
    inner: pageserver_client::page_service::Client,
    ttid: TenantTimelineId,
    compression: bool,
}

impl LibpqClient {
    /// Connects to the page service at `connstring` for the given tenant/timeline.
    async fn new(
        connstring: &str,
        ttid: TenantTimelineId,
        compression: bool,
    ) -> anyhow::Result {
        Ok(Self {
            inner: pageserver_client::page_service::Client::new(connstring.to_string()).await?,
            ttid,
            compression,
        })
    }
}

#[async_trait]
impl Client for LibpqClient {
    async fn basebackup(
        &mut self,
        lsn: Option,
    ) -> anyhow::Result>> {
        let req = BasebackupRequest {
            tenant_id: self.ttid.tenant_id,
            timeline_id: self.ttid.timeline_id,
            lsn,
            gzip: self.compression,
        };
        let stream = self.inner.basebackup(&req).await?;
        // Adapt the response stream into a reader, mapping stream errors to `io::Error`.
        Ok(Box::pin(StreamReader::new(
            stream.map_err(std::io::Error::other),
        )))
    }
}

/// A gRPC Pageserver client.
struct GrpcClient { inner: page_api::Client, compression: page_api::BaseBackupCompression, } impl GrpcClient { async fn new( connstring: &str, ttid: TenantTimelineId, compression: bool, ) -> anyhow::Result { let inner = page_api::Client::connect( connstring.to_string(), ttid.tenant_id, ttid.timeline_id, ShardIndex::unsharded(), None, None, // NB: uses payload compression ) .await?; let compression = match compression { true => page_api::BaseBackupCompression::Gzip, false => page_api::BaseBackupCompression::None, }; Ok(Self { inner, compression }) } } #[async_trait] impl Client for GrpcClient { async fn basebackup( &mut self, lsn: Option, ) -> anyhow::Result>> { let req = page_api::GetBaseBackupRequest { lsn, replica: false, full: false, compression: self.compression, }; Ok(Box::pin(self.inner.get_base_backup(req).await?)) } } ================================================ FILE: pageserver/pagebench/src/cmd/getpage_latest_lsn.rs ================================================ use std::collections::{HashMap, HashSet, VecDeque}; use std::future::Future; use std::num::NonZeroUsize; use std::pin::Pin; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; use anyhow::Context; use async_trait::async_trait; use bytes::Bytes; use camino::Utf8PathBuf; use futures::stream::FuturesUnordered; use futures::{Stream, StreamExt as _}; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpaceAccum; use pageserver_api::pagestream_api::{PagestreamGetPageRequest, PagestreamRequest}; use pageserver_api::reltag::RelTag; use pageserver_api::shard::TenantShardId; use pageserver_client_grpc::{self as client_grpc, ShardSpec}; use pageserver_page_api as page_api; use rand::prelude::*; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::info; use url::Url; use utils::id::TenantTimelineId; use utils::lsn::Lsn; use utils::shard::ShardIndex; use 
crate::util::tokio_thread_local_stats::AllThreadLocalStats;
use crate::util::{request_stats, tokio_thread_local_stats};

// NOTE: `///` comments on this clap struct and its fields double as CLI help text, so they are
// left untouched.
// NOTE(review): the generic parameters of the `Option` fields appear to have been stripped from
// this dump; restore them from the repository before compiling.
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
#[derive(clap::Parser)]
pub(crate) struct Args {
    #[clap(long, default_value = "http://localhost:9898")]
    mgmt_api_endpoint: String,
    /// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
    #[clap(long, default_value = "postgres://postgres@localhost:64000")]
    page_service_connstring: String,
    /// Use the rich gRPC Pageserver client `client_grpc::PageserverClient`, rather than the basic
    /// no-frills `page_api::Client`. Only valid with grpc:// connstrings.
    #[clap(long)]
    rich_client: bool,
    #[clap(long)]
    pageserver_jwt: Option,
    #[clap(long, default_value = "1")]
    num_clients: NonZeroUsize,
    #[clap(long)]
    runtime: Option,
    /// If true, enable compression (only for gRPC).
    #[clap(long)]
    compression: bool,
    /// Each client sends requests at the given rate.
    ///
    /// If a request takes too long and we should be issuing a new request already,
    /// we skip that request and account it as `MISSED`.
    #[clap(long)]
    per_client_rate: Option,
    /// Probability for sending `latest=true` in the request (uniform distribution).
    #[clap(long, default_value = "1")]
    req_latest_probability: f64,
    #[clap(long)]
    limit_to_first_n_targets: Option,
    /// For large pageserver installations, enumerating the keyspace takes a lot of time.
    /// If specified, the specified path is used to maintain a cache of the keyspace enumeration result.
    /// The cache is tagged and auto-invalided by the tenant/timeline ids only.
    /// It doesn't get invalidated if the keyspace changes under the hood, e.g., due to new ingested data or compaction.
    #[clap(long)]
    keyspace_cache: Option,
    /// Before starting the benchmark, live-reconfigure the pageserver to use the given
    /// [`pageserver_api::models::virtual_file::IoEngineKind`].
#[clap(long)]
    set_io_engine: Option,

    /// Before starting the benchmark, live-reconfigure the pageserver to use specified io mode (buffered vs. direct).
    #[clap(long)]
    set_io_mode: Option,

    /// Queue depth generated in each client.
    #[clap(long, default_value = "1")]
    queue_depth: NonZeroUsize,

    /// Batch size of contiguous pages generated by each client. This is equivalent to how Postgres
    /// will request page batches (e.g. prefetches or vectored reads). A batch counts as 1 RPS and
    /// 1 queue depth.
    ///
    /// The libpq protocol does not support client-side batching, and will submit batches as many
    /// individual requests, in the hope that the server will batch them. Each batch still counts as
    /// 1 RPS and 1 queue depth.
    #[clap(long, default_value = "1")]
    batch_size: NonZeroUsize,

    #[clap(long)]
    only_relnode: Option,

    targets: Option>,
}

/// State shared by all clients
#[derive(Debug)]
struct SharedState {
    start_work_barrier: tokio::sync::Barrier,
    live_stats: LiveStats,
}

/// Live counters, sampled and reset by the periodic stats task.
#[derive(Debug, Default)]
struct LiveStats {
    completed_requests: AtomicU64,
    missed: AtomicU64,
}

impl LiveStats {
    /// Records one completed request.
    fn request_done(&self) {
        self.completed_requests.fetch_add(1, Ordering::Relaxed);
    }
    /// Records `n` missed request slots.
    fn missed(&self, n: u64) {
        self.missed.fetch_add(n, Ordering::Relaxed);
    }
}

/// A half-open key range `[start, end)` of a timeline, with keys in their `i128` form.
#[derive(Clone, serde::Serialize, serde::Deserialize)]
struct KeyRange {
    timeline: TenantTimelineId,
    timeline_lsn: Lsn,
    start: i128,
    end: i128,
}

impl KeyRange {
    /// Number of keys in the range.
    fn len(&self) -> i128 {
        self.end - self.start
    }
}

/// Identifies one client task of one timeline.
#[derive(PartialEq, Eq, Hash, Copy, Clone)]
struct WorkerId {
    timeline: TenantTimelineId,
    num_client: usize, // from 0..args.num_clients
}

/// JSON-serialized benchmark result.
#[derive(serde::Serialize)]
struct Output {
    total: request_stats::Output,
}

tokio_thread_local_stats::declare!(STATS: request_stats::Stats);

/// Entry point: runs `main_impl` through the thread-local-stats `main!` macro.
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
    tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
        main_impl(args, thread_local_stats)
    })
}

async fn main_impl(
    args: Args,
    all_thread_local_stats: AllThreadLocalStats,
) ->
anyhow::Result<()> {
    // Leak the arguments to obtain a `'static` reference that worker tasks can share.
    let args: &'static Args = Box::leak(Box::new(args));

    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
        reqwest::Client::new(), // TODO: support ssl_ca_file for https APIs in pagebench.
        args.mgmt_api_endpoint.clone(),
        args.pageserver_jwt.as_deref(),
    ));

    // Optionally reconfigure the pageserver before the benchmark starts.
    if let Some(engine_str) = &args.set_io_engine {
        mgmt_api_client.put_io_engine(engine_str).await?;
    }

    if let Some(mode) = &args.set_io_mode {
        mgmt_api_client.put_io_mode(mode).await?;
    }

    // discover targets
    let timelines: Vec = crate::util::cli::targets::discover(
        &mgmt_api_client,
        crate::util::cli::targets::Spec {
            limit_to_first_n_targets: args.limit_to_first_n_targets,
            targets: args.targets.clone(),
        },
    )
    .await?;

    // On-disk cache format. Separate owned (read) and borrowed (write) shapes avoid cloning the
    // data when writing.
    // NOTE(review): the `Vec` element types and the `HashSet::::from_iter` turbofish below have
    // lost their generic parameters in this dump.
    #[derive(serde::Deserialize)]
    struct KeyspaceCacheDe {
        tag: Vec,
        data: Vec,
    }
    #[derive(serde::Serialize)]
    struct KeyspaceCacheSer<'a> {
        tag: &'a [TenantTimelineId],
        data: &'a [KeyRange],
    }
    // Load the cache if one was requested. A missing file is not an error; a cache whose tag does
    // not match the discovered set of timelines is ignored.
    let cache = args
        .keyspace_cache
        .as_ref()
        .map(|keyspace_cache_file| {
            let contents = match std::fs::read(keyspace_cache_file) {
                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                    return anyhow::Ok(None);
                }
                x => x.context("read keyspace cache file")?,
            };
            let cache: KeyspaceCacheDe =
                serde_json::from_slice(&contents).context("deserialize cache file")?;
            // Compared as sets, so the order of timelines does not matter.
            let tag_ok = HashSet::::from_iter(cache.tag.into_iter())
                == HashSet::from_iter(timelines.iter().cloned());
            info!("keyspace cache file matches tag: {tag_ok}");
            anyhow::Ok(if tag_ok { Some(cache.data) } else { None })
        })
        .transpose()?
.flatten();
    // Use the cached ranges if available, otherwise enumerate and filter each timeline's keyspace.
    let all_ranges: Vec = if let Some(cached) = cache {
        info!("using keyspace cache file");
        cached
    } else {
        // One task per timeline; each fetches the keyspace and keeps only the rel block keys.
        let mut js = JoinSet::new();
        for timeline in &timelines {
            js.spawn({
                let mgmt_api_client = Arc::clone(&mgmt_api_client);
                let timeline = *timeline;
                async move {
                    let partitioning = mgmt_api_client
                        .keyspace(
                            TenantShardId::unsharded(timeline.tenant_id),
                            timeline.timeline_id,
                        )
                        .await?;
                    let lsn = partitioning.at_lsn;
                    let start = Instant::now();
                    let mut filtered = KeySpaceAccum::new();
                    // let's hope this is inlined and vectorized...
                    // TODO: turn this loop into a is_rel_block_range() function.
                    for r in partitioning.keys.ranges.iter() {
                        let mut i = r.start;
                        while i != r.end {
                            let mut include = true;
                            include &= i.is_rel_block_key();
                            if let Some(only_relnode) = args.only_relnode {
                                include &= i.is_rel_block_of_rel(only_relnode);
                            }
                            if include {
                                filtered.add_key(i);
                            }
                            i = i.next();
                        }
                    }
                    let filtered = filtered.to_keyspace();
                    let filter_duration = start.elapsed();

                    anyhow::Ok((
                        filter_duration,
                        filtered.ranges.into_iter().map(move |r| KeyRange {
                            timeline,
                            timeline_lsn: lsn,
                            start: r.start.to_i128(),
                            end: r.end.to_i128(),
                        }),
                    ))
                }
            });
        }
        // Gather the results; a failed request or a panicked task aborts the benchmark here.
        let mut total_filter_duration = Duration::from_secs(0);
        let mut all_ranges: Vec = Vec::new();
        while let Some(res) = js.join_next().await {
            let (filter_duration, range) = res.unwrap().unwrap();
            all_ranges.extend(range);
            total_filter_duration += filter_duration;
        }
        info!("filter duration: {}", total_filter_duration.as_secs_f64());
        // Persist the freshly enumerated ranges, tagged with the discovered timelines.
        if let Some(cachefile) = args.keyspace_cache.as_ref() {
            let cache = KeyspaceCacheSer {
                tag: &timelines,
                data: &all_ranges,
            };
            let bytes = serde_json::to_vec(&cache).context("serialize keyspace for cache file")?;
            std::fs::write(cachefile, bytes).context("write keyspace cache file to disk")?;
            info!("successfully wrote keyspace cache file");
        }
        all_ranges
    };

    // Participants of the start barrier: the stats dumper, every client task, and this function.
    let num_live_stats_dump = 1;
    let num_work_sender_tasks = args.num_clients.get() * timelines.len();
    let num_main_impl = 1;

    let shared_state =
Arc::new(SharedState {
        start_work_barrier: tokio::sync::Barrier::new(
            num_live_stats_dump + num_work_sender_tasks + num_main_impl,
        ),
        live_stats: LiveStats::default(),
    });
    let cancel = CancellationToken::new();

    // Background task: log completed and missed request rates roughly once per second, resetting
    // both counters each time.
    let ss = shared_state.clone();
    tokio::spawn({
        async move {
            ss.start_work_barrier.wait().await;
            loop {
                let start = std::time::Instant::now();
                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
                let stats = &ss.live_stats;
                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
                let missed = stats.missed.swap(0, Ordering::Relaxed);
                let elapsed = start.elapsed();
                info!(
                    "RPS: {:.0} MISSED: {:.0}",
                    completed_requests as f64 / elapsed.as_secs_f64(),
                    missed as f64 / elapsed.as_secs_f64()
                );
            }
        }
    });

    // Interval between request slots of one client, if a per-client rate limit was given.
    let rps_period = args
        .per_client_rate
        .map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
    // Builds the future for one worker: selects the ranges of the worker's timeline, weights them
    // by length, connects a client matching the connstring scheme and runs the request loop.
    // NOTE(review): `Pin>>`, `Vec` and `Box` below have lost their generic parameters in this
    // dump.
    let make_worker: &dyn Fn(WorkerId) -> Pin>> = &|worker_id| {
        let ss = shared_state.clone();
        let cancel = cancel.clone();
        let ranges: Vec = all_ranges
            .iter()
            .filter(|r| r.timeline == worker_id.timeline)
            .cloned()
            .collect();
        // Panics if the weights are rejected, e.g. when the timeline has no ranges.
        let weights =
            rand::distr::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len())).unwrap();

        Box::pin(async move {
            // A connstring without a URL scheme is treated as a libpq connstring.
            let scheme = match Url::parse(&args.page_service_connstring) {
                Ok(url) => url.scheme().to_lowercase().to_string(),
                Err(url::ParseError::RelativeUrlWithoutBase) => "postgresql".to_string(),
                Err(err) => panic!("invalid connstring: {err}"),
            };
            let client: Box = match scheme.as_str() {
                "postgresql" | "postgres" => {
                    assert!(!args.compression, "libpq does not support compression");
                    assert!(!args.rich_client, "rich client requires grpc://");
                    Box::new(
                        LibpqClient::new(&args.page_service_connstring, worker_id.timeline)
                            .await
                            .unwrap(),
                    )
                }

                "grpc" if args.rich_client => Box::new(
                    RichGrpcClient::new(
                        &args.page_service_connstring,
                        worker_id.timeline,
                        args.compression,
                    )
                    .await
                    .unwrap(),
                ),

                "grpc" => Box::new(
                    GrpcClient::new(
                        &args.page_service_connstring,
                        worker_id.timeline,
                        args.compression,
                    )
.await
                    .unwrap(),
                ),

                scheme => panic!("unsupported scheme {scheme}"),
            };
            run_worker(args, client, ss, cancel, rps_period, ranges, weights).await
        })
    };

    info!("spawning workers");
    let mut workers = JoinSet::new();
    for timeline in timelines.iter().cloned() {
        for num_client in 0..args.num_clients.get() {
            let worker_id = WorkerId {
                timeline,
                num_client,
            };
            workers.spawn(make_worker(worker_id));
        }
    }
    // Future that resolves once every worker has exited; panics if any worker panicked.
    let workers = async move {
        while let Some(res) = workers.join_next().await {
            res.unwrap();
        }
    };

    info!("waiting for everything to become ready");
    shared_state.start_work_barrier.wait().await;
    info!("work started");
    if let Some(runtime) = args.runtime {
        tokio::time::sleep(runtime.into()).await;
        info!("runtime over, signalling cancellation");
        cancel.cancel();
        workers.await;
        info!("work sender exited");
    } else {
        workers.await;
        unreachable!("work sender never terminates");
    }

    // Merge the per-thread stats into a single result and print it as JSON.
    let output = Output {
        total: {
            let mut agg_stats = request_stats::Stats::new();
            for stats in all_thread_local_stats.lock().unwrap().iter() {
                let stats = stats.lock().unwrap();
                agg_stats.add(&stats);
            }
            agg_stats.output()
        },
    };

    let output = serde_json::to_string_pretty(&output).unwrap();
    println!("{output}");

    anyhow::Ok(())
}

/// Request loop of a single client: keeps up to `args.queue_depth` batches in flight, records the
/// latency of each response and, if `rps_period` is set, paces itself to that rate.
// NOTE(review): `Box`, `Arc`, `Option`, `Vec` and `WeightedIndex` in the signature have lost
// their generic parameters in this dump.
async fn run_worker(
    args: &Args,
    mut client: Box,
    shared_state: Arc,
    cancel: CancellationToken,
    rps_period: Option,
    ranges: Vec,
    weights: rand::distr::weighted::WeightedIndex,
) {
    shared_state.start_work_barrier.wait().await;
    let client_start = Instant::now();
    let mut ticks_processed = 0;
    let mut req_id = 0;
    let batch_size: usize = args.batch_size.into();

    // Track inflight requests by request ID and start time. This times the request duration, and
    // ensures responses match requests. We don't expect responses back in any particular order.
    //
    // NB: this does not check that all requests received a response, because we don't wait for the
    // inflight requests to complete when the duration elapses.
let mut inflight: HashMap = HashMap::new();

    while !cancel.is_cancelled() {
        // Detect if a request took longer than the RPS rate
        if let Some(period) = &rps_period {
            let periods_passed_until_now =
                usize::try_from(client_start.elapsed().as_micros() / period.as_micros()).unwrap();

            // Every period that elapsed without a processed tick counts as a missed request.
            if periods_passed_until_now > ticks_processed {
                shared_state
                    .live_stats
                    .missed((periods_passed_until_now - ticks_processed) as u64);
            }
            ticks_processed = periods_passed_until_now;
        }

        // Top up the pipeline until `queue_depth` batches are in flight.
        while inflight.len() < args.queue_depth.get() {
            req_id += 1;
            let start = Instant::now();
            let (req_lsn, mod_lsn, rel, blks) = {
                /// Converts a compact i128 key to a relation tag and block number.
                fn key_to_block(key: i128) -> (RelTag, u32) {
                    let key = Key::from_i128(key);
                    assert!(key.is_rel_block_key());
                    key.to_rel_block()
                        .expect("we filter non-rel-block keys out above")
                }

                // Pick a random page from a random relation.
                let mut rng = rand::rng();
                let r = &ranges[weights.sample(&mut rng)];
                let key: i128 = rng.random_range(r.start..r.end);
                let (rel_tag, block_no) = key_to_block(key);

                let mut blks = VecDeque::with_capacity(batch_size);
                blks.push_back(block_no);

                // If requested, populate a batch of sequential pages. This is how Postgres will
                // request page batches (e.g. prefetches). If we hit the end of the relation, we
                // grow the batch towards the start too.
                for i in 1..batch_size {
                    let (r, b) = key_to_block(key + i as i128);
                    if r != rel_tag {
                        break; // went outside relation
                    }
                    blks.push_back(b)
                }

                if blks.len() < batch_size {
                    // Grow batch backwards if needed, but only by the number of blocks still
                    // missing. Stepping back up to `batch_size - 1` times regardless of how many
                    // blocks the forward pass found could overshoot `batch_size` and trip the
                    // batch-size assertion that follows.
                    let missing = batch_size - blks.len();
                    for i in 1..=missing {
                        let (r, b) = key_to_block(key - i as i128);
                        if r != rel_tag {
                            break; // went outside relation
                        }
                        blks.push_front(b)
                    }
                }

                // We assume that the entire batch can fit within the relation.
assert_eq!(blks.len(), batch_size, "incomplete batch");

                // With probability `req_latest_probability` request at `Lsn::MAX`, otherwise at
                // the LSN the keyspace was captured at.
                let req_lsn = if rng.random_bool(args.req_latest_probability) {
                    Lsn::MAX
                } else {
                    r.timeline_lsn
                };
                (req_lsn, r.timeline_lsn, rel_tag, blks.into())
            };
            client
                .send_get_page(req_id, req_lsn, mod_lsn, rel, blks)
                .await
                .unwrap();
            let old = inflight.insert(req_id, start);
            assert!(old.is_none(), "duplicate request ID {req_id}");
        }

        // Wait for one response and validate it against the request bookkeeping.
        let (req_id, pages) = client.recv_get_page().await.unwrap();
        assert_eq!(pages.len(), batch_size, "unexpected page count");
        assert!(pages.iter().all(|p| !p.is_empty()), "empty page");
        let start = inflight
            .remove(&req_id)
            .expect("response for unknown request ID");
        let end = Instant::now();
        shared_state.live_stats.request_done();
        ticks_processed += 1;
        STATS.with(|stats| {
            stats
                .borrow()
                .lock()
                .unwrap()
                .observe(end.duration_since(start))
                .unwrap();
        });

        // When rate-limited, sleep until the start of the next request slot.
        if let Some(period) = &rps_period {
            let next_at = client_start
                + Duration::from_micros(
                    (ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
                );
            tokio::time::sleep_until(next_at.into()).await;
        }
    }
}

/// A benchmark client, to allow switching out the transport protocol.
///
/// For simplicity, this just uses separate asynchronous send/recv methods. The send method could
/// return a future that resolves when the response is received, but we don't really need it.
// NOTE(review): the `Vec` types in the signatures below have lost their element types in this
// dump.
#[async_trait]
trait Client: Send {
    /// Sends an asynchronous GetPage request to the pageserver.
    async fn send_get_page(
        &mut self,
        req_id: u64,
        req_lsn: Lsn,
        mod_lsn: Lsn,
        rel: RelTag,
        blks: Vec,
    ) -> anyhow::Result<()>;

    /// Receives the next GetPage response from the pageserver.
    async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec)>;
}

/// A libpq-based Pageserver client.
struct LibpqClient {
    inner: pageserver_client::page_service::PagestreamClient,
    // Track sent batches, so we know how many responses to expect.
batch_sizes: VecDeque,
}

impl LibpqClient {
    /// Connects to the page service at `connstring` and opens a pagestream for the timeline.
    async fn new(connstring: &str, ttid: TenantTimelineId) -> anyhow::Result {
        let inner = pageserver_client::page_service::Client::new(connstring.to_string())
            .await?
            .pagestream(ttid.tenant_id, ttid.timeline_id)
            .await?;
        Ok(Self {
            inner,
            batch_sizes: VecDeque::new(),
        })
    }
}

#[async_trait]
impl Client for LibpqClient {
    async fn send_get_page(
        &mut self,
        req_id: u64,
        req_lsn: Lsn,
        mod_lsn: Lsn,
        rel: RelTag,
        blks: Vec,
    ) -> anyhow::Result<()> {
        // libpq doesn't support client-side batches, so we send a bunch of individual requests
        // instead in the hope that the server will batch them for us. We use the same request ID
        // for all, because we'll return a single batch response.
        self.batch_sizes.push_back(blks.len());
        for blkno in blks {
            let req = PagestreamGetPageRequest {
                hdr: PagestreamRequest {
                    reqid: req_id,
                    request_lsn: req_lsn,
                    not_modified_since: mod_lsn,
                },
                rel,
                blkno,
            };
            self.inner.getpage_send(req).await?;
        }
        Ok(())
    }

    async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec)> {
        // Reassemble the oldest outstanding batch: read as many responses as were sent for it and
        // check they all carry the same request ID. Panics if no batch is outstanding.
        let batch_size = self.batch_sizes.pop_front().unwrap();
        let mut batch = Vec::with_capacity(batch_size);
        let mut req_id = None;
        for _ in 0..batch_size {
            let resp = self.inner.getpage_recv().await?;
            if req_id.is_none() {
                req_id = Some(resp.req.hdr.reqid);
            }
            assert_eq!(req_id, Some(resp.req.hdr.reqid), "request ID mismatch");
            batch.push(resp.page);
        }
        Ok((req_id.unwrap(), batch))
    }
}

/// A gRPC Pageserver client.
// NOTE(review): the `Sender`, `Pin> + Send>>` and `anyhow::Result` types below have lost their
// generic parameters in this dump.
struct GrpcClient {
    req_tx: tokio::sync::mpsc::Sender,
    resp_rx: Pin> + Send>>,
}

impl GrpcClient {
    /// Connects to the unsharded Pageserver at `connstring` and opens a GetPage stream; Zstd
    /// compression is enabled when `compression` is set.
    async fn new(
        connstring: &str,
        ttid: TenantTimelineId,
        compression: bool,
    ) -> anyhow::Result {
        let mut client = page_api::Client::connect(
            connstring.to_string(),
            ttid.tenant_id,
            ttid.timeline_id,
            ShardIndex::unsharded(),
            None,
            compression.then_some(tonic::codec::CompressionEncoding::Zstd),
        )
        .await?;

        // The channel has a buffer size of 1, since 0 is not allowed.
It does not matter, since the // benchmark will control the queue depth (i.e. in-flight requests) anyway, and requests are // buffered by Tonic and the OS too. let (req_tx, req_rx) = tokio::sync::mpsc::channel(1); let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx); let resp_rx = Box::pin(client.get_pages(req_stream).await?); Ok(Self { req_tx, resp_rx }) } } #[async_trait] impl Client for GrpcClient { async fn send_get_page( &mut self, req_id: u64, req_lsn: Lsn, mod_lsn: Lsn, rel: RelTag, blks: Vec, ) -> anyhow::Result<()> { let req = page_api::GetPageRequest { request_id: req_id.into(), request_class: page_api::GetPageClass::Normal, read_lsn: page_api::ReadLsn { request_lsn: req_lsn, not_modified_since_lsn: Some(mod_lsn), }, rel, block_numbers: blks, }; self.req_tx.send(req).await?; Ok(()) } async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec)> { let resp = self.resp_rx.next().await.unwrap().unwrap(); anyhow::ensure!( resp.status_code == page_api::GetPageStatusCode::Ok, "unexpected status code: {}", resp.status_code, ); Ok(( resp.request_id.id, resp.pages.into_iter().map(|p| p.image).collect(), )) } } /// A rich gRPC Pageserver client. 
struct RichGrpcClient { inner: Arc, requests: FuturesUnordered< Pin> + Send>>, >, } impl RichGrpcClient { async fn new( connstring: &str, ttid: TenantTimelineId, compression: bool, ) -> anyhow::Result { let inner = Arc::new(client_grpc::PageserverClient::new( ttid.tenant_id, ttid.timeline_id, ShardSpec::new( [(ShardIndex::unsharded(), connstring.to_string())].into(), None, )?, None, compression.then_some(tonic::codec::CompressionEncoding::Zstd), )?); Ok(Self { inner, requests: FuturesUnordered::new(), }) } } #[async_trait] impl Client for RichGrpcClient { async fn send_get_page( &mut self, req_id: u64, req_lsn: Lsn, mod_lsn: Lsn, rel: RelTag, blks: Vec, ) -> anyhow::Result<()> { let req = page_api::GetPageRequest { request_id: req_id.into(), request_class: page_api::GetPageClass::Normal, read_lsn: page_api::ReadLsn { request_lsn: req_lsn, not_modified_since_lsn: Some(mod_lsn), }, rel, block_numbers: blks, }; let inner = self.inner.clone(); self.requests.push(Box::pin(async move { inner .get_page(req) .await .map_err(|err| anyhow::anyhow!("{err}")) })); Ok(()) } async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec)> { let resp = self.requests.next().await.unwrap()?; Ok(( resp.request_id.id, resp.pages.into_iter().map(|p| p.image).collect(), )) } } ================================================ FILE: pageserver/pagebench/src/cmd/idle_streams.rs ================================================ use std::sync::Arc; use anyhow::anyhow; use futures::StreamExt; use tonic::transport::Endpoint; use tracing::info; use pageserver_page_api::{GetPageClass, GetPageRequest, GetPageStatusCode, ReadLsn, RelTag}; use utils::id::TenantTimelineId; use utils::lsn::Lsn; use utils::shard::ShardIndex; /// Starts a large number of idle gRPC GetPage streams. #[derive(clap::Parser)] pub(crate) struct Args { /// The Pageserver to connect to. Must use grpc://. #[clap(long, default_value = "grpc://localhost:51051")] server: String, /// The Pageserver HTTP API. 
#[clap(long, default_value = "http://localhost:9898")] http_server: String, /// The number of streams to open. #[clap(long, default_value = "100000")] count: usize, /// Number of streams per connection. #[clap(long, default_value = "100")] per_connection: usize, /// Send a single GetPage request on each stream. #[clap(long, default_value_t = false)] send_request: bool, } pub(crate) fn main(args: Args) -> anyhow::Result<()> { let rt = tokio::runtime::Builder::new_multi_thread() .enable_all() .build()?; rt.block_on(main_impl(args)) } async fn main_impl(args: Args) -> anyhow::Result<()> { // Discover a tenant and timeline to use. let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( reqwest::Client::new(), args.http_server.clone(), None, )); let timelines: Vec = crate::util::cli::targets::discover( &mgmt_api_client, crate::util::cli::targets::Spec { limit_to_first_n_targets: Some(1), targets: None, }, ) .await?; let ttid = timelines .first() .ok_or_else(|| anyhow!("no timelines found"))?; // Set up the initial client. let endpoint = Endpoint::from_shared(args.server.clone())?; let connect = async || { pageserver_page_api::Client::new( endpoint.connect().await?, ttid.tenant_id, ttid.timeline_id, ShardIndex::unsharded(), None, None, ) }; let mut client = connect().await?; let mut streams = Vec::with_capacity(args.count); // Create streams. for i in 0..args.count { if i % 100 == 0 { info!("opened {}/{} streams", i, args.count); } if i % args.per_connection == 0 && i > 0 { client = connect().await?; } let (req_tx, req_rx) = tokio::sync::mpsc::unbounded_channel(); let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx); let mut resp_stream = client.get_pages(req_stream).await?; // Send request if specified. 
if args.send_request { req_tx.send(GetPageRequest { request_id: 1.into(), request_class: GetPageClass::Normal, read_lsn: ReadLsn { request_lsn: Lsn::MAX, not_modified_since_lsn: Some(Lsn(1)), }, rel: RelTag { spcnode: 1664, // pg_global dbnode: 0, // shared database relnode: 1262, // pg_authid forknum: 0, // init }, block_numbers: vec![0], })?; let resp = resp_stream .next() .await .transpose()? .ok_or_else(|| anyhow!("no response"))?; if resp.status_code != GetPageStatusCode::Ok { return Err(anyhow!("{} response", resp.status_code)); } } // Hold onto streams to avoid closing them. streams.push((req_tx, resp_stream)); } info!("opened {} streams, sleeping", args.count); // Block forever, to hold the idle streams open for inspection. futures::future::pending::<()>().await; Ok(()) } ================================================ FILE: pageserver/pagebench/src/cmd/ondemand_download_churn.rs ================================================ use std::f64; use std::num::NonZeroUsize; use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; use std::time::{Duration, Instant}; use pageserver_api::models::HistoricLayerInfo; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use rand::seq::IndexedMutRandom; use tokio::sync::{OwnedSemaphorePermit, mpsc}; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::{debug, info}; use utils::id::{TenantTimelineId, TimelineId}; /// Evict & on-demand download random layers. #[derive(clap::Parser)] pub(crate) struct Args { #[clap(long, default_value = "http://localhost:9898")] mgmt_api_endpoint: String, #[clap(long)] pageserver_jwt: Option, #[clap(long)] runtime: Option, #[clap(long, default_value = "1")] tasks_per_target: NonZeroUsize, #[clap(long, default_value = "1")] concurrency_per_target: NonZeroUsize, /// Probability for sending `latest=true` in the request (uniform distribution). 
#[clap(long)] limit_to_first_n_targets: Option, /// Before starting the benchmark, live-reconfigure the pageserver to use the given /// [`pageserver_api::models::virtual_file::IoEngineKind`]. #[clap(long)] set_io_engine: Option, targets: Option>, } pub(crate) fn main(args: Args) -> anyhow::Result<()> { let rt = tokio::runtime::Builder::new_multi_thread() .enable_all() .build()?; let task = rt.spawn(main_impl(args)); rt.block_on(task).unwrap().unwrap(); Ok(()) } #[derive(serde::Serialize)] struct Output { downloads_count: u64, downloads_bytes: u64, evictions_count: u64, timeline_restarts: u64, #[serde(with = "humantime_serde")] runtime: Duration, } #[derive(Debug, Default)] struct LiveStats { evictions_count: AtomicU64, downloads_count: AtomicU64, downloads_bytes: AtomicU64, timeline_restarts: AtomicU64, } impl LiveStats { fn eviction_done(&self) { self.evictions_count.fetch_add(1, Ordering::Relaxed); } fn download_done(&self, size: u64) { self.downloads_count.fetch_add(1, Ordering::Relaxed); self.downloads_bytes.fetch_add(size, Ordering::Relaxed); } fn timeline_restart_done(&self) { self.timeline_restarts.fetch_add(1, Ordering::Relaxed); } } async fn main_impl(args: Args) -> anyhow::Result<()> { let args: &'static Args = Box::leak(Box::new(args)); let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( reqwest::Client::new(), // TODO: support ssl_ca_file for https APIs in pagebench. 
args.mgmt_api_endpoint.clone(), args.pageserver_jwt.as_deref(), )); if let Some(engine_str) = &args.set_io_engine { mgmt_api_client.put_io_engine(engine_str).await?; } // discover targets let timelines: Vec = crate::util::cli::targets::discover( &mgmt_api_client, crate::util::cli::targets::Spec { limit_to_first_n_targets: args.limit_to_first_n_targets, targets: args.targets.clone(), }, ) .await?; let token = CancellationToken::new(); let mut tasks = JoinSet::new(); let periodic_stats = Arc::new(LiveStats::default()); let total_stats = Arc::new(LiveStats::default()); let start = Instant::now(); tasks.spawn({ let periodic_stats = Arc::clone(&periodic_stats); let total_stats = Arc::clone(&total_stats); let cloned_token = token.clone(); async move { let mut last_at = Instant::now(); loop { if cloned_token.is_cancelled() { return; } tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await; let now = Instant::now(); let delta: Duration = now - last_at; last_at = now; let LiveStats { evictions_count, downloads_count, downloads_bytes, timeline_restarts, } = &*periodic_stats; let evictions_count = evictions_count.swap(0, Ordering::Relaxed); let downloads_count = downloads_count.swap(0, Ordering::Relaxed); let downloads_bytes = downloads_bytes.swap(0, Ordering::Relaxed); let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed); total_stats.evictions_count.fetch_add(evictions_count, Ordering::Relaxed); total_stats.downloads_count.fetch_add(downloads_count, Ordering::Relaxed); total_stats.downloads_bytes.fetch_add(downloads_bytes, Ordering::Relaxed); total_stats.timeline_restarts.fetch_add(timeline_restarts, Ordering::Relaxed); let evictions_per_s = evictions_count as f64 / delta.as_secs_f64(); let downloads_per_s = downloads_count as f64 / delta.as_secs_f64(); let downloads_mibs_per_s = downloads_bytes as f64 / delta.as_secs_f64() / ((1 << 20) as f64); info!("evictions={evictions_per_s:.2}/s downloads={downloads_per_s:.2}/s 
download_bytes={downloads_mibs_per_s:.2}MiB/s timeline_restarts={timeline_restarts}"); } } }); for tl in timelines { for _ in 0..args.tasks_per_target.get() { tasks.spawn(timeline_actor( args, Arc::clone(&mgmt_api_client), tl, Arc::clone(&periodic_stats), token.clone(), )); } } if let Some(runtime) = args.runtime { tokio::spawn(async move { tokio::time::sleep(runtime.into()).await; token.cancel(); }); } while let Some(res) = tasks.join_next().await { res.unwrap(); } let end = Instant::now(); let duration: Duration = end - start; let output = { let LiveStats { evictions_count, downloads_count, downloads_bytes, timeline_restarts, } = &*total_stats; Output { downloads_count: downloads_count.load(Ordering::Relaxed), downloads_bytes: downloads_bytes.load(Ordering::Relaxed), evictions_count: evictions_count.load(Ordering::Relaxed), timeline_restarts: timeline_restarts.load(Ordering::Relaxed), runtime: duration, } }; let output = serde_json::to_string_pretty(&output).unwrap(); println!("{output}"); Ok(()) } async fn timeline_actor( args: &'static Args, mgmt_api_client: Arc, timeline: TenantTimelineId, live_stats: Arc, token: CancellationToken, ) { // TODO: support sharding let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id); struct Timeline { joinset: JoinSet<()>, layers: Vec>, concurrency: Arc, } while !token.is_cancelled() { debug!("restarting timeline"); let layer_map_info = mgmt_api_client .layer_map_info(tenant_shard_id, timeline.timeline_id) .await .unwrap(); let concurrency = Arc::new(tokio::sync::Semaphore::new( args.concurrency_per_target.get(), )); let mut joinset = JoinSet::new(); let layers = layer_map_info .historic_layers .into_iter() .map(|historic_layer| { let (tx, rx) = mpsc::channel(1); joinset.spawn(layer_actor( tenant_shard_id, timeline.timeline_id, historic_layer, rx, Arc::clone(&mgmt_api_client), Arc::clone(&live_stats), )); tx }) .collect::>(); let mut timeline = Timeline { joinset, layers, concurrency, }; 
live_stats.timeline_restart_done(); while !token.is_cancelled() { assert!(!timeline.joinset.is_empty()); if let Some(res) = timeline.joinset.try_join_next() { debug!(?res, "a layer actor exited, should not happen"); timeline.joinset.shutdown().await; break; } let mut permit = Some( Arc::clone(&timeline.concurrency) .acquire_owned() .await .unwrap(), ); loop { let layer_tx = { let mut rng = rand::rng(); timeline.layers.choose_mut(&mut rng).expect("no layers") }; match layer_tx.try_send(permit.take().unwrap()) { Ok(_) => break, Err(e) => match e { mpsc::error::TrySendError::Full(back) => { // TODO: retrying introduces bias away from slow downloaders permit.replace(back); } mpsc::error::TrySendError::Closed(_) => panic!(), }, } } } } } async fn layer_actor( tenant_shard_id: TenantShardId, timeline_id: TimelineId, mut layer: HistoricLayerInfo, mut rx: mpsc::Receiver, mgmt_api_client: Arc, live_stats: Arc, ) { #[derive(Clone, Copy)] enum Action { Evict, OnDemandDownload, } while let Some(_permit) = rx.recv().await { let action = if layer.is_remote() { Action::OnDemandDownload } else { Action::Evict }; let did_it = match action { Action::Evict => { let did_it = mgmt_api_client .layer_evict(tenant_shard_id, timeline_id, layer.layer_file_name()) .await .unwrap(); live_stats.eviction_done(); did_it } Action::OnDemandDownload => { let did_it = mgmt_api_client .layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name()) .await .unwrap(); live_stats.download_done(layer.layer_file_size()); did_it } }; if !did_it { debug!("local copy of layer map appears out of sync, re-downloading"); return; } debug!("did it"); layer.set_remote(match action { Action::Evict => true, Action::OnDemandDownload => false, }); } } ================================================ FILE: pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs ================================================ use std::sync::Arc; use humantime::Duration; use pageserver_api::shard::TenantShardId; 
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
use tokio::task::JoinSet;
use utils::id::TenantTimelineId;

#[derive(clap::Parser)]
pub(crate) struct Args {
    #[clap(long, default_value = "http://localhost:9898")]
    mgmt_api_endpoint: String,
    #[clap(long, default_value = "localhost:64000")]
    page_service_host_port: String,
    #[clap(long)]
    pageserver_jwt: Option,
    #[clap(
        long,
        help = "if specified, poll mgmt api to check whether init logical size calculation has completed"
    )]
    poll_for_completion: Option,
    #[clap(long)]
    limit_to_first_n_targets: Option,
    targets: Option>,
}

// Runs `main_impl` on a fresh multi-threaded runtime and returns its result.
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
    let rt = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .build()
        .unwrap();

    let main_task = rt.spawn(main_impl(args));
    rt.block_on(main_task).unwrap()
}

// Requests timeline info with `ForceAwaitLogicalSize::Yes` for every target, one task per
// timeline, optionally polling until the reported logical size is accurate.
async fn main_impl(args: Args) -> anyhow::Result<()> {
    // Leaked so that the spawned tasks can borrow the arguments for 'static.
    let args: &'static Args = Box::leak(Box::new(args));

    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
        reqwest::Client::new(), // TODO: support ssl_ca_file for https APIs in pagebench.
        args.mgmt_api_endpoint.clone(),
        args.pageserver_jwt.as_deref(),
    ));

    // discover targets
    let timelines: Vec = crate::util::cli::targets::discover(
        &mgmt_api_client,
        crate::util::cli::targets::Spec {
            limit_to_first_n_targets: args.limit_to_first_n_targets,
            targets: args.targets.clone(),
        },
    )
    .await?;

    // kick it off

    let mut js = JoinSet::new();
    for tl in timelines {
        let mgmt_api_client = Arc::clone(&mgmt_api_client);
        js.spawn(async move {
            let info = mgmt_api_client
                .timeline_info(
                    TenantShardId::unsharded(tl.tenant_id),
                    tl.timeline_id,
                    ForceAwaitLogicalSize::Yes,
                )
                .await
                .unwrap();

            // Polling should not be strictly required here since we await
            // for the initial logical size, however it's possible for the request
            // to land before the timeline is initialised. This results in an approximate
            // logical size.
            if let Some(period) = args.poll_for_completion {
                let mut ticker = tokio::time::interval(period.into());
                ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
                let mut info = info;
                while !info.current_logical_size_is_accurate {
                    ticker.tick().await;
                    info = mgmt_api_client
                        .timeline_info(
                            TenantShardId::unsharded(tl.tenant_id),
                            tl.timeline_id,
                            ForceAwaitLogicalSize::Yes,
                        )
                        .await
                        .unwrap();
                }
            }
        });
    }
    // Wait for every task; a panic in any of them is re-raised here.
    while let Some(res) = js.join_next().await {
        let _: () = res.unwrap();
    }
    Ok(())
}

================================================
FILE: pageserver/pagebench/src/main.rs
================================================
use std::fs::File;

use clap::Parser;
use tracing::info;
use utils::logging;

/// Re-usable pieces of code that aren't CLI-specific.
mod util {
    pub(crate) mod request_stats;
    #[macro_use]
    pub(crate) mod tokio_thread_local_stats;
    /// Re-usable pieces of CLI-specific code.
    pub(crate) mod cli {
        pub(crate) mod targets;
    }
}

/// The pagebench CLI sub-commands, dispatched in [`main`] below.
mod cmd {
    pub(super) mod aux_files;
    pub(super) mod basebackup;
    pub(super) mod getpage_latest_lsn;
    pub(super) mod idle_streams;
    pub(super) mod ondemand_download_churn;
    pub(super) mod trigger_initial_size_calculation;
}

/// Component-level performance test for pageserver.
#[derive(clap::Parser)]
struct Args {
    /// Takes a client CPU profile into profile.svg. The benchmark must exit cleanly before it's
    /// written, e.g. via --runtime.
    #[arg(long)]
    profile: bool,

    #[command(subcommand)]
    subcommand: Subcommand,
}

#[derive(clap::Subcommand)]
enum Subcommand {
    Basebackup(cmd::basebackup::Args),
    GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
    TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
    OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
    AuxFiles(cmd::aux_files::Args),
    IdleStreams(cmd::idle_streams::Args),
}

// Entry point: initialises logging, optionally starts a CPU profiler, runs the selected
// subcommand and, if profiling was requested, writes the flamegraph afterwards.
fn main() -> anyhow::Result<()> {
    logging::init(
        logging::LogFormat::Plain,
        logging::TracingErrorLayerEnablement::Disabled,
        logging::Output::Stderr,
    )?;
    logging::replace_panic_hook_with_tracing_panic_hook().forget();

    let args = Args::parse();

    // Start a CPU profile if requested.
    let mut profiler = None;
    if args.profile {
        profiler = Some(
            pprof::ProfilerGuardBuilder::default()
                .frequency(1000)
                .blocklist(&["libc", "libgcc", "pthread", "vdso"])
                .build()?,
        );
    }

    // A failing subcommand returns early here, so no profile is written in that case.
    match args.subcommand {
        Subcommand::Basebackup(args) => cmd::basebackup::main(args),
        Subcommand::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
        Subcommand::TriggerInitialSizeCalculation(args) => {
            cmd::trigger_initial_size_calculation::main(args)
        }
        Subcommand::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
        Subcommand::AuxFiles(args) => cmd::aux_files::main(args),
        Subcommand::IdleStreams(args) => cmd::idle_streams::main(args),
    }?;

    // Generate a CPU flamegraph if requested.
    if let Some(profiler) = profiler {
        let report = profiler.report().build()?;
        drop(profiler); // stop profiling
        let file = File::create("profile.svg")?;
        report.flamegraph(file)?;
        info!("wrote CPU profile flamegraph to profile.svg")
    }

    Ok(())
}

================================================
FILE: pageserver/pagebench/src/util/cli/targets.rs
================================================
use std::sync::Arc;

use pageserver_client::mgmt_api;
use tracing::info;
use utils::id::TenantTimelineId;

// Which timelines a benchmark should run against.
pub(crate) struct Spec {
    pub(crate) limit_to_first_n_targets: Option,
    pub(crate) targets: Option>,
}

// Returns the explicitly given targets, or asks the pageserver for its timelines when none
// were given. With a limit, the list is sorted and truncated, and it is an error if fewer
// than `limit` timelines remain.
pub(crate) async fn discover(
    api_client: &Arc,
    spec: Spec,
) -> anyhow::Result> {
    let mut timelines = if let Some(targets) = spec.targets {
        targets
    } else {
        mgmt_api::util::get_pageserver_tenant_timelines_unsharded(api_client).await?
    };

    if let Some(limit) = spec.limit_to_first_n_targets {
        timelines.sort(); // for determinism
        timelines.truncate(limit);
        if timelines.len() < limit {
            anyhow::bail!("pageserver has less than limit_to_first_n_targets={limit} tenants");
        }
    }

    info!("timelines:\n{:?}", timelines);
    info!("number of timelines:\n{:?}", timelines.len());

    Ok(timelines)
}

================================================
FILE: pageserver/pagebench/src/util/request_stats.rs
================================================
use std::time::Duration;

use anyhow::Context;

// Latency statistics, recorded in microseconds in a fixed-bounds histogram.
pub(crate) struct Stats {
    latency_histo: hdrhistogram::Histogram,
}

impl Stats {
    pub(crate) fn new() -> Self {
        Self {
            // Initialize with fixed bounds so that we panic at runtime instead of resizing the histogram,
            // which would skew the benchmark results.
            latency_histo: hdrhistogram::Histogram::new_with_bounds(1, 1_000_000_000, 3).unwrap(),
        }
    }
    // Records one latency sample; fails if it does not fit in a u64 of microseconds or if the
    // histogram rejects the value.
    pub(crate) fn observe(&mut self, latency: Duration) -> anyhow::Result<()> {
        let micros: u64 = latency
            .as_micros()
            .try_into()
            .context("latency greater than u64")?;
        self.latency_histo
            .record(micros)
            .context("add to histogram")?;
        Ok(())
    }
    // Summarises the samples: count, mean and the percentiles in `LATENCY_PERCENTILES`.
    pub(crate) fn output(&self) -> Output {
        let latency_percentiles = std::array::from_fn(|idx| {
            let micros = self
                .latency_histo
                .value_at_percentile(LATENCY_PERCENTILES[idx]);
            Duration::from_micros(micros)
        });
        Output {
            request_count: self.latency_histo.len(),
            latency_mean: Duration::from_micros(self.latency_histo.mean() as u64),
            latency_percentiles: LatencyPercentiles {
                latency_percentiles,
            },
        }
    }
    // Merges the samples of `other` into `self`.
    pub(crate) fn add(&mut self, other: &Self) {
        let Self { latency_histo } = self;
        latency_histo.add(&other.latency_histo).unwrap();
    }
}

impl Default for Stats {
    fn default() -> Self {
        Self::new()
    }
}

const LATENCY_PERCENTILES: [f64; 4] = [95.0, 99.00, 99.90, 99.99];

// Percentile values, in the same order as `LATENCY_PERCENTILES`.
struct LatencyPercentiles {
    latency_percentiles: [Duration; 4],
}

// Serialized as a map from "p<percentile>" to a humantime-formatted duration.
impl serde::Serialize for LatencyPercentiles {
    fn serialize(&self, serializer: S) -> Result
    where
        S: serde::Serializer,
    {
        use serde::ser::SerializeMap;
        let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?;
        for (p, v) in LATENCY_PERCENTILES.iter().zip(&self.latency_percentiles) {
            ser.serialize_entry(
                &format!("p{p}"),
                &format!("{}", humantime::format_duration(*v)),
            )?;
        }
        ser.end()
    }
}

#[derive(serde::Serialize)]
pub(crate) struct Output {
    request_count: u64,
    #[serde(with = "humantime_serde")]
    latency_mean: Duration,
    latency_percentiles: LatencyPercentiles,
}

================================================
FILE: pageserver/pagebench/src/util/tokio_thread_local_stats.rs
================================================
pub(crate) type ThreadLocalStats = Arc>;
pub(crate) type AllThreadLocalStats = Arc>>>;

// Declares a thread-local holding a default-initialised, mutex-protected stats value.
macro_rules! declare {
    ($THREAD_LOCAL_NAME:ident: $T:ty) => {
        thread_local! {
            pub static $THREAD_LOCAL_NAME: std::cell::RefCell> = std::cell::RefCell::new(
                std::sync::Arc::new(std::sync::Mutex::new(Default::default()))
            );
        }
    };
}

use std::sync::{Arc, Mutex};

pub(crate) use declare;

// Builds a multi-threaded runtime whose worker threads register their thread-local stats in
// a shared list at start-up, then runs `$main_impl` with that list.
macro_rules! main {
    ($THREAD_LOCAL_NAME:ident, $main_impl:expr) => {{
        let main_impl = $main_impl;
        let all = Arc::new(Mutex::new(Vec::new()));

        let rt = tokio::runtime::Builder::new_multi_thread()
            .on_thread_start({
                let all = Arc::clone(&all);
                move || {
                    // pre-initialize the thread local stats by accessing them
                    // (some stats like requests_stats::Stats are quite costly to initialize,
                    // we don't want to pay that cost during the measurement period)
                    $THREAD_LOCAL_NAME.with(|stats| {
                        let stats: Arc<_> = Arc::clone(&*stats.borrow());
                        all.lock().unwrap().push(stats);
                    });
                }
            })
            .enable_all()
            .build()
            .unwrap();

        let main_task = rt.spawn(main_impl(all));
        rt.block_on(main_task).unwrap()
    }};
}

pub(crate) use main;

================================================
FILE: pageserver/src/assert_u64_eq_usize.rs
================================================
//! `u64` and `usize` aren't guaranteed to be identical in Rust, but life is much simpler if that's the case.
pub(crate) const _ASSERT_U64_EQ_USIZE: () = { if std::mem::size_of::() != std::mem::size_of::() { panic!( "the traits defined in this module assume that usize and u64 can be converted to each other without loss of information" ); } }; pub(crate) trait U64IsUsize { fn into_usize(self) -> usize; } impl U64IsUsize for u64 { #[inline(always)] fn into_usize(self) -> usize { #[allow(clippy::let_unit_value)] let _ = _ASSERT_U64_EQ_USIZE; self as usize } } pub(crate) trait UsizeIsU64 { fn into_u64(self) -> u64; } impl UsizeIsU64 for usize { #[inline(always)] fn into_u64(self) -> u64 { #[allow(clippy::let_unit_value)] let _ = _ASSERT_U64_EQ_USIZE; self as u64 } } pub const fn u64_to_usize(x: u64) -> usize { #[allow(clippy::let_unit_value)] let _ = _ASSERT_U64_EQ_USIZE; x as usize } ================================================ FILE: pageserver/src/auth.rs ================================================ use utils::auth::{AuthError, Claims, Scope}; use utils::id::TenantId; pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result<(), AuthError> { match (&claims.scope, tenant_id) { (Scope::Tenant, None) => Err(AuthError( "Attempt to access management api with tenant scope. Permission denied".into(), )), (Scope::Tenant, Some(tenant_id)) => { if claims.tenant_id.unwrap() != tenant_id { return Err(AuthError("Tenant id mismatch. 
Permission denied".into())); } Ok(()) } (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope ( Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Infra | Scope::Scrubber | Scope::ControllerPeer | Scope::TenantEndpoint, _, ) => Err(AuthError( format!( "JWT scope '{:?}' is ineligible for Pageserver auth", claims.scope ) .into(), )), } } ================================================ FILE: pageserver/src/aux_file.rs ================================================ use std::sync::Arc; use ::metrics::IntGauge; use bytes::{Buf, BufMut, Bytes}; use pageserver_api::key::{AUX_KEY_PREFIX, Key, METADATA_KEY_SIZE}; use tracing::warn; // BEGIN Copyright (c) 2017 Servo Contributors /// Const version of FNV hash. #[inline] #[must_use] pub const fn fnv_hash(bytes: &[u8]) -> u128 { const INITIAL_STATE: u128 = 0x6c62272e07bb014262b821756295c58d; const PRIME: u128 = 0x0000000001000000000000000000013B; let mut hash = INITIAL_STATE; let mut i = 0; while i < bytes.len() { hash ^= bytes[i] as u128; hash = hash.wrapping_mul(PRIME); i += 1; } hash } // END Copyright (c) 2017 Servo Contributors /// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, least significant 13B of FNV hash]. fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key { let mut key: [u8; 16] = [0; METADATA_KEY_SIZE]; let hash = fnv_hash(data).to_be_bytes(); key[0] = AUX_KEY_PREFIX; key[1] = dir_level1; key[2] = dir_level2; key[3..16].copy_from_slice(&hash[3..16]); Key::from_metadata_key_fixed_size(&key) } const AUX_DIR_PG_LOGICAL: u8 = 0x01; const AUX_DIR_PG_REPLSLOT: u8 = 0x02; const AUX_DIR_PG_STAT: u8 = 0x03; const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; /// Encode the aux file into a fixed-size key. /// /// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type. 
/// We have one-to-one mapping for each of the aux file that we support. We hash the remaining part of the path /// (usually a single file name, or several components) into 13-byte hash. The way we determine the 2-byte prefix /// is roughly based on the first two components of the path, one unique number for one component. /// /// * pg_logical/mappings -> 0x0101 /// * pg_logical/snapshots -> 0x0102 /// * pg_logical/replorigin_checkpoint -> 0x0103 /// * pg_logical/others -> 0x01FF /// * pg_replslot/ -> 0x0201 /// * pg_stat/pgstat.stat -> 0x0301 /// * others -> 0xFFFF /// /// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`. /// The new file type must have never been written to the storage before. Otherwise, there could be data /// corruptions as the new file belongs to a new prefix but it might have been stored under the `others` prefix. pub fn encode_aux_file_key(path: &str) -> Key { if let Some(fname) = path.strip_prefix("pg_logical/mappings/") { aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes()) } else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") { aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes()) } else if path == "pg_logical/replorigin_checkpoint" { aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"") } else if let Some(fname) = path.strip_prefix("pg_logical/") { if cfg!(debug_assertions) { warn!( "unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning", path ); } aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes()) } else if let Some(fname) = path.strip_prefix("pg_replslot/") { aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes()) } else if let Some(fname) = path.strip_prefix("pg_stat/") { aux_hash_to_metadata_key(AUX_DIR_PG_STAT, 0x01, fname.as_bytes()) } else { if cfg!(debug_assertions) { warn!( "unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning", path ); } 
aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes()) } } const AUX_FILE_ENCODING_VERSION: u8 = 0x01; pub fn decode_file_value(val: &[u8]) -> anyhow::Result> { let mut ptr = val; if ptr.is_empty() { // empty value = no files return Ok(Vec::new()); } assert_eq!( ptr.get_u8(), AUX_FILE_ENCODING_VERSION, "unsupported aux file value" ); let mut files = vec![]; while ptr.has_remaining() { let key_len = ptr.get_u32() as usize; let key = &ptr[..key_len]; ptr.advance(key_len); let val_len = ptr.get_u32() as usize; let content = &ptr[..val_len]; ptr.advance(val_len); let path = std::str::from_utf8(key)?; files.push((path, content)); } Ok(files) } /// Decode an aux file key-value pair into a list of files. The returned `Bytes` contains reference /// to the original value slice. Be cautious about memory consumption. pub fn decode_file_value_bytes(val: &Bytes) -> anyhow::Result> { let mut ptr = val.clone(); if ptr.is_empty() { // empty value = no files return Ok(Vec::new()); } assert_eq!( ptr.get_u8(), AUX_FILE_ENCODING_VERSION, "unsupported aux file value" ); let mut files = vec![]; while ptr.has_remaining() { let key_len = ptr.get_u32() as usize; let key = ptr.slice(..key_len); ptr.advance(key_len); let val_len = ptr.get_u32() as usize; let content = ptr.slice(..val_len); ptr.advance(val_len); let path = std::str::from_utf8(&key)?.to_string(); files.push((path, content)); } Ok(files) } pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result> { if files.is_empty() { // no files = empty value return Ok(Vec::new()); } let mut encoded = vec![]; encoded.put_u8(AUX_FILE_ENCODING_VERSION); for (path, content) in files { if path.len() > u32::MAX as usize { anyhow::bail!("{} exceeds path size limit", path); } encoded.put_u32(path.len() as u32); encoded.put_slice(path.as_bytes()); if content.len() > u32::MAX as usize { anyhow::bail!("{} exceeds content size limit", path); } encoded.put_u32(content.len() as u32); encoded.put_slice(content); } Ok(encoded) } /// 
An estimation of the size of aux files. pub struct AuxFileSizeEstimator { aux_file_size_gauge: IntGauge, size: Arc>>, } impl AuxFileSizeEstimator { pub fn new(aux_file_size_gauge: IntGauge) -> Self { Self { aux_file_size_gauge, size: Arc::new(std::sync::Mutex::new(None)), } } /// When generating base backup or doing initial logical size calculation pub fn on_initial(&self, new_size: usize) { let mut guard = self.size.lock().unwrap(); *guard = Some(new_size as isize); self.report(new_size as isize); } pub fn on_add(&self, file_size: usize) { let mut guard = self.size.lock().unwrap(); if let Some(size) = &mut *guard { *size += file_size as isize; self.report(*size); } } pub fn on_remove(&self, file_size: usize) { let mut guard = self.size.lock().unwrap(); if let Some(size) = &mut *guard { *size -= file_size as isize; self.report(*size); } } pub fn on_update(&self, old_size: usize, new_size: usize) { let mut guard = self.size.lock().unwrap(); if let Some(size) = &mut *guard { *size += new_size as isize - old_size as isize; self.report(*size); } } pub fn report(&self, size: isize) { self.aux_file_size_gauge.set(size as i64); } } #[cfg(test)] mod tests { use super::*; #[test] fn test_hash_portable() { // AUX file encoding requires the hash to be portable across all platforms. This test case checks // if the algorithm produces the same hash across different environments. assert_eq!( 265160408618497461376862998434862070044, super::fnv_hash("test1".as_bytes()) ); assert_eq!( 295486155126299629456360817749600553988, super::fnv_hash("test/test2".as_bytes()) ); assert_eq!( 144066263297769815596495629667062367629, super::fnv_hash("".as_bytes()) ); } #[test] fn test_encoding_portable() { // To correct retrieve AUX files, the generated keys for the same file must be the same for all versions // of the page server. 
assert_eq!( "62000001017F8B83D94F7081693471ABF91C", encode_aux_file_key("pg_logical/mappings/test1").to_string(), ); assert_eq!( "62000001027F8E83D94F7081693471ABFCCD", encode_aux_file_key("pg_logical/snapshots/test2").to_string(), ); assert_eq!( "62000001032E07BB014262B821756295C58D", encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string(), ); assert_eq!( "62000001FF4F38E1C74754E7D03C1A660178", encode_aux_file_key("pg_logical/unsupported").to_string(), ); assert_eq!( "62000002017F8D83D94F7081693471ABFB92", encode_aux_file_key("pg_replslot/test3").to_string() ); assert_eq!( "620000FFFF2B6ECC8AEF93F643DC44F15E03", encode_aux_file_key("other_file_not_supported").to_string(), ); } #[test] fn test_value_encoding() { let files = vec![ ("pg_logical/1.file", "1111".as_bytes()), ("pg_logical/2.file", "2222".as_bytes()), ]; assert_eq!( files, decode_file_value(&encode_file_value(&files).unwrap()).unwrap() ); let files = vec![]; assert_eq!( files, decode_file_value(&encode_file_value(&files).unwrap()).unwrap() ); } } ================================================ FILE: pageserver/src/basebackup.rs ================================================ //! //! Generate a tarball with files needed to bootstrap ComputeNode. //! //! TODO: this module has nothing to do with PostgreSQL pg_basebackup. //! It could use a better name. //! //! Stateless Postgres compute node is launched by sending a tarball //! which contains non-relational data (multixacts, clog, filenodemaps, twophase files), //! generated pg_control and dummy segment of WAL. //! This module is responsible for creation of such tarball //! from data stored in object storage. //! 
use std::fmt::Write as FmtWrite; use std::sync::Arc; use std::time::{Instant, SystemTime}; use anyhow::{Context, anyhow}; use async_compression::tokio::write::GzipEncoder; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; use pageserver_api::key::{Key, rel_block_to_key}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{PG_HBA, PGDATA_SPECIAL_FILES}; use postgres_ffi::{ BLCKSZ, PG_TLI, PgMajorVersion, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName, dispatch_pgversion, pg_constants, }; use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; use postgres_ffi_types::forknum::{INIT_FORKNUM, MAIN_FORKNUM}; use tokio::io::{self, AsyncWrite, AsyncWriteExt as _}; use tokio_tar::{Builder, EntryType, Header}; use tracing::*; use utils::lsn::Lsn; use crate::context::RequestContext; use crate::pgdatadir_mapping::Version; use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery}; use crate::tenant::{PageReconstructError, Timeline}; #[derive(Debug, thiserror::Error)] pub enum BasebackupError { #[error("basebackup pageserver error {0:#}")] Server(#[from] anyhow::Error), #[error("basebackup client error {0:#} when {1}")] Client(#[source] io::Error, &'static str), #[error("basebackup during shutdown")] Shutdown, } impl From for BasebackupError { fn from(value: PageReconstructError) -> Self { match value { PageReconstructError::Cancelled => BasebackupError::Shutdown, err => BasebackupError::Server(err.into()), } } } impl From for BasebackupError { fn from(value: GetVectoredError) -> Self { match value { GetVectoredError::Cancelled => BasebackupError::Shutdown, err => BasebackupError::Server(err.into()), } } } impl From for postgres_backend::QueryError { fn from(err: BasebackupError) -> Self { use postgres_backend::QueryError; use pq_proto::framed::ConnectionError; match err { BasebackupError::Client(err, _) => 
QueryError::Disconnected(ConnectionError::Io(err)), BasebackupError::Server(err) => QueryError::Other(err), BasebackupError::Shutdown => QueryError::Shutdown, } } } impl From for tonic::Status { fn from(err: BasebackupError) -> Self { use tonic::Code; let code = match &err { BasebackupError::Client(_, _) => Code::Cancelled, BasebackupError::Server(_) => Code::Internal, BasebackupError::Shutdown => Code::Unavailable, }; tonic::Status::new(code, err.to_string()) } } /// Create basebackup with non-rel data in it. /// Only include relational data if 'full_backup' is true. /// /// Currently we use empty 'req_lsn' in two cases: /// * During the basebackup right after timeline creation /// * When working without safekeepers. In this situation it is important to match the lsn /// we are taking basebackup on with the lsn that is used in pageserver's walreceiver /// to start the replication. #[allow(clippy::too_many_arguments)] pub async fn send_basebackup_tarball<'a, W>( write: &'a mut W, timeline: &'a Timeline, req_lsn: Option, prev_lsn: Option, full_backup: bool, replica: bool, gzip_level: Option, ctx: &'a RequestContext, ) -> Result<(), BasebackupError> where W: AsyncWrite + Send + Sync + Unpin, { // Compute postgres doesn't have any previous WAL files, but the first // record that it's going to write needs to include the LSN of the // previous record (xl_prev). We include prev_record_lsn in the // "neon.signal" file, so that postgres can read it during startup. // // We don't keep full history of record boundaries in the page server, // however, only the predecessor of the latest record on each // timeline. So we can only provide prev_record_lsn when you take a // base backup at the end of the timeline, i.e. at last_record_lsn. // Even at the end of the timeline, we sometimes don't have a valid // prev_lsn value; that happens if the timeline was just branched from // an old LSN and it doesn't have any WAL of its own yet. 
We will set // prev_lsn to Lsn(0) if we cannot provide the correct value. let (backup_prev, lsn) = if let Some(req_lsn) = req_lsn { // Backup was requested at a particular LSN. The caller should've // already checked that it's a valid LSN. // If the requested point is the end of the timeline, we can // provide prev_lsn. (get_last_record_rlsn() might return it as // zero, though, if no WAL has been generated on this timeline // yet.) let end_of_timeline = timeline.get_last_record_rlsn(); if req_lsn == end_of_timeline.last { (end_of_timeline.prev, req_lsn) } else { (Lsn(0), req_lsn) } } else { // Backup was requested at end of the timeline. let end_of_timeline = timeline.get_last_record_rlsn(); (end_of_timeline.prev, end_of_timeline.last) }; // Consolidate the derived and the provided prev_lsn values let prev_record_lsn = if let Some(provided_prev_lsn) = prev_lsn { if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn { return Err(BasebackupError::Server(anyhow!( "backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}" ))); } provided_prev_lsn } else { backup_prev }; info!( "taking basebackup lsn={lsn}, prev_lsn={prev_record_lsn} \ (full_backup={full_backup}, replica={replica}, gzip={gzip_level:?})", ); let span = info_span!("send_tarball", backup_lsn=%lsn); let io_concurrency = IoConcurrency::spawn_from_conf( timeline.conf.get_vectored_concurrent_io, timeline .gate .enter() .map_err(|_| BasebackupError::Shutdown)?, ); if let Some(gzip_level) = gzip_level { let mut encoder = GzipEncoder::with_quality(write, gzip_level); Basebackup { ar: Builder::new_non_terminated(&mut encoder), timeline, lsn, prev_record_lsn, full_backup, replica, ctx, io_concurrency, } .send_tarball() .instrument(span) .await?; encoder .shutdown() .await .map_err(|err| BasebackupError::Client(err, "gzip"))?; } else { Basebackup { ar: Builder::new_non_terminated(write), timeline, lsn, prev_record_lsn, full_backup, replica, ctx, io_concurrency, } .send_tarball() .instrument(span) 
.await?; } Ok(()) } /// This is short-living object only for the time of tarball creation, /// created mostly to avoid passing a lot of parameters between various functions /// used for constructing tarball. struct Basebackup<'a, W> where W: AsyncWrite + Send + Sync + Unpin, { ar: Builder<&'a mut W>, timeline: &'a Timeline, lsn: Lsn, prev_record_lsn: Lsn, full_backup: bool, replica: bool, ctx: &'a RequestContext, io_concurrency: IoConcurrency, } /// A sink that accepts SLRU blocks ordered by key and forwards /// full segments to the archive. struct SlruSegmentsBuilder<'a, 'b, W> where W: AsyncWrite + Send + Sync + Unpin, { ar: &'a mut Builder<&'b mut W>, buf: Vec, current_segment: Option<(SlruKind, u32)>, total_blocks: usize, } impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W> where W: AsyncWrite + Send + Sync + Unpin, { fn new(ar: &'a mut Builder<&'b mut W>) -> Self { Self { ar, buf: Vec::new(), current_segment: None, total_blocks: 0, } } async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> { let (kind, segno, _) = key.to_slru_block()?; match kind { SlruKind::Clog => { if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) { return Err(BasebackupError::Server(anyhow!( "invalid SlruKind::Clog record: block.len()={}", block.len() ))); } } SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => { if block.len() != BLCKSZ as usize { return Err(BasebackupError::Server(anyhow!( "invalid {:?} record: block.len()={}", kind, block.len() ))); } } } let segment = (kind, segno); match self.current_segment { None => { self.current_segment = Some(segment); self.buf .extend_from_slice(block.slice(..BLCKSZ as usize).as_ref()); } Some(current_seg) if current_seg == segment => { self.buf .extend_from_slice(block.slice(..BLCKSZ as usize).as_ref()); } Some(_) => { self.flush().await?; self.current_segment = Some(segment); self.buf .extend_from_slice(block.slice(..BLCKSZ as usize).as_ref()); } } Ok(()) } async fn flush(&mut self) -> 
Result<(), BasebackupError> { let nblocks = self.buf.len() / BLCKSZ as usize; let (kind, segno) = self.current_segment.take().unwrap(); let segname = format!("{kind}/{segno:>04X}"); let header = new_tar_header(&segname, self.buf.len() as u64)?; self.ar .append(&header, self.buf.as_slice()) .await .map_err(|e| BasebackupError::Client(e, "flush"))?; self.total_blocks += nblocks; debug!("Added to basebackup slru {} relsize {}", segname, nblocks); self.buf.clear(); Ok(()) } async fn finish(mut self) -> Result<(), BasebackupError> { let res = if self.current_segment.is_none() || self.buf.is_empty() { Ok(()) } else { self.flush().await }; info!("Collected {} SLRU blocks", self.total_blocks); res } } impl Basebackup<'_, W> where W: AsyncWrite + Send + Sync + Unpin, { async fn send_tarball(mut self) -> Result<(), BasebackupError> { // TODO include checksum // Construct the pg_control file from the persisted checkpoint and pg_control // information. But we only add this to the tarball at the end, so that if the // writing is interrupted half-way through, the resulting incomplete tarball will // be missing the pg_control file, which prevents PostgreSQL from starting up on // it. With proper error handling, you should never try to start up from an // incomplete basebackup in the first place, of course, but this is a nice little // extra safety measure. 
let checkpoint_bytes = self .timeline .get_checkpoint(self.lsn, self.ctx) .await .context("failed to get checkpoint bytes")?; let pg_control_bytes = self .timeline .get_control_file(self.lsn, self.ctx) .await .context("failed to get control bytes")?; let (pg_control_bytes, system_identifier, was_shutdown) = postgres_ffi::generate_pg_control( &pg_control_bytes, &checkpoint_bytes, self.lsn, self.timeline.pg_version, )?; let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; let pgversion = self.timeline.pg_version; let subdirs = dispatch_pgversion!(pgversion, &pgv::bindings::PGDATA_SUBDIRS[..]); // Create pgdata subdirs structure for dir in subdirs.iter() { let header = new_tar_header_dir(dir)?; self.ar .append(&header, io::empty()) .await .map_err(|e| BasebackupError::Client(e, "send_tarball"))?; } // Send config files. for filepath in PGDATA_SPECIAL_FILES.iter() { if *filepath == "pg_hba.conf" { let data = PG_HBA.as_bytes(); let header = new_tar_header(filepath, data.len() as u64)?; self.ar .append(&header, data) .await .map_err(|e| BasebackupError::Client(e, "send_tarball,pg_hba.conf"))?; } else { let header = new_tar_header(filepath, 0)?; self.ar .append(&header, io::empty()) .await .map_err(|e| BasebackupError::Client(e, "send_tarball,add_config_file"))?; } } if !lazy_slru_download { // Gather non-relational files from object storage pages. let slru_partitions = self .timeline .get_slru_keyspace(Version::at(self.lsn), self.ctx) .await? 
.partition( self.timeline.get_shard_identity(), self.timeline.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64, BLCKSZ as u64, ); let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); for part in slru_partitions.parts { let query = VersionedKeySpaceQuery::uniform(part, self.lsn); let blocks = self .timeline .get_vectored(query, self.io_concurrency.clone(), self.ctx) .await?; for (key, block) in blocks { let block = block?; slru_builder.add_block(&key, block).await?; } } slru_builder.finish().await?; } let mut min_restart_lsn: Lsn = Lsn::MAX; let mut dbdir_cnt = 0; let mut rel_cnt = 0; // Create tablespace directories for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn, self.ctx).await? { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; dbdir_cnt += 1; // If full backup is requested, include all relation files. // Otherwise only include init forks of unlogged relations. let rels = self .timeline .list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx) .await?; for &rel in rels.iter() { rel_cnt += 1; // Send init fork as main fork to provide well formed empty // contents of UNLOGGED relations. Postgres copies it in // `reinit.c` during recovery. if rel.forknum == INIT_FORKNUM { // I doubt we need _init fork itself, but having it at least // serves as a marker relation is unlogged. 
self.add_rel(rel, rel).await?; self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?; continue; } if self.full_backup { if rel.forknum == MAIN_FORKNUM && rels.contains(&rel.with_forknum(INIT_FORKNUM)) { // skip this, will include it when we reach the init fork continue; } self.add_rel(rel, rel).await?; } } } self.timeline .db_rel_count .store(Some(Arc::new((dbdir_cnt, rel_cnt)))); let start_time = Instant::now(); let aux_files = self .timeline .list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone()) .await?; let aux_scan_time = start_time.elapsed(); let aux_estimated_size = aux_files .values() .map(|content| content.len()) .sum::(); info!( "Scanned {} aux files in {}ms, aux file content size = {}", aux_files.len(), aux_scan_time.as_millis(), aux_estimated_size ); for (path, content) in aux_files { if path.starts_with("pg_replslot") { // Do not create LR slots at standby because they are not used but prevent WAL truncation if self.replica { continue; } let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; let restart_lsn = Lsn(u64::from_le_bytes( content[offs..offs + 8].try_into().unwrap(), )); info!("Replication slot {} restart LSN={}", path, restart_lsn); min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn); } else if path == "pg_logical/replorigin_checkpoint" { // replorigin_checkoint is written only on compute shutdown, so it contains // deteriorated values. So we generate our own version of this file for the particular LSN // based on information about replorigins extracted from transaction commit records. // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, // but now we should handle (skip) it for backward compatibility. continue; } else if path == "pg_stat/pgstat.stat" && !was_shutdown { // Drop statistic in case of abnormal termination, i.e. if we're not starting from the exact LSN // of a shutdown checkpoint. 
continue; } let header = new_tar_header(&path, content.len() as u64)?; self.ar .append(&header, &*content) .await .map_err(|e| BasebackupError::Client(e, "send_tarball,add_aux_file"))?; } if min_restart_lsn != Lsn::MAX { info!( "Min restart LSN for logical replication is {}", min_restart_lsn ); let data = min_restart_lsn.0.to_le_bytes(); let header = new_tar_header("restart.lsn", data.len() as u64)?; self.ar .append(&header, &data[..]) .await .map_err(|e| BasebackupError::Client(e, "send_tarball,restart.lsn"))?; } for xid in self .timeline .list_twophase_files(self.lsn, self.ctx) .await? { self.add_twophase_file(xid).await?; } let repl_origins = self .timeline .get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone()) .await?; let n_origins = repl_origins.len(); if n_origins != 0 { // // Construct "pg_logical/replorigin_checkpoint" file based on information about replication origins // extracted from transaction commit record. We are using this file to pass information about replication // origins to compute to allow logical replication to restart from proper point. // let mut content = Vec::with_capacity(n_origins * 16 + 8); content.extend_from_slice(&pg_constants::REPLICATION_STATE_MAGIC.to_le_bytes()); for (origin_id, origin_lsn) in repl_origins { content.extend_from_slice(&origin_id.to_le_bytes()); content.extend_from_slice(&[0u8; 6]); // align to 8 bytes content.extend_from_slice(&origin_lsn.0.to_le_bytes()); } let crc32 = crc32c::crc32c(&content); content.extend_from_slice(&crc32.to_le_bytes()); let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?; self.ar.append(&header, &*content).await.map_err(|e| { BasebackupError::Client(e, "send_tarball,pg_logical/replorigin_checkpoint") })?; } fail_point!("basebackup-before-control-file", |_| { Err(BasebackupError::Server(anyhow!( "failpoint basebackup-before-control-file" ))) }); // Last, add the pg_control file and bootstrap WAL segment. 
self.add_pgcontrol_file(pg_control_bytes, system_identifier) .await?; self.ar .finish() .await .map_err(|e| BasebackupError::Client(e, "send_tarball,finish"))?; debug!("all tarred up!"); Ok(()) } /// Add contents of relfilenode `src`, naming it as `dst`. async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> { let nblocks = self .timeline .get_rel_size(src, Version::at(self.lsn), self.ctx) .await?; // If the relation is empty, create an empty file if nblocks == 0 { let file_name = dst.to_segfile_name(0); let header = new_tar_header(&file_name, 0)?; self.ar .append(&header, io::empty()) .await .map_err(|e| BasebackupError::Client(e, "add_rel,empty"))?; return Ok(()); } // Add a file for each chunk of blocks (aka segment) let mut startblk = 0; let mut seg = 0; while startblk < nblocks { let endblk = std::cmp::min(startblk + RELSEG_SIZE, nblocks); let mut segment_data: Vec = vec![]; for blknum in startblk..endblk { let img = self .timeline // TODO: investigate using get_vectored for the entire startblk..endblk range. // But this code path is not on the critical path for most basebackups (?). .get(rel_block_to_key(src, blknum), self.lsn, self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } let file_name = dst.to_segfile_name(seg as u32); let header = new_tar_header(&file_name, segment_data.len() as u64)?; self.ar .append(&header, segment_data.as_slice()) .await .map_err(|e| BasebackupError::Client(e, "add_rel,segment"))?; seg += 1; startblk = endblk; } Ok(()) } // // Include database/tablespace directories. // // Each directory contains a PG_VERSION file, and the default database // directories also contain pg_filenode.map files. 
// async fn add_dbdir( &mut self, spcnode: u32, dbnode: u32, has_relmap_file: bool, ) -> Result<(), BasebackupError> { let relmap_img = if has_relmap_file { let img = self .timeline .get_relmap_file(spcnode, dbnode, Version::at(self.lsn), self.ctx) .await?; if img.len() != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE) { return Err(BasebackupError::Server(anyhow!( "img.len() != SIZE_OF_RELMAPFILE, img.len()={}", img.len(), ))); } Some(img) } else { None }; if spcnode == GLOBALTABLESPACE_OID { let pg_version_str = self.timeline.pg_version.versionfile_string(); let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; self.ar .append(&header, pg_version_str.as_bytes()) .await .map_err(|e| BasebackupError::Client(e, "add_dbdir,PG_VERSION"))?; info!("timeline.pg_version {}", self.timeline.pg_version); if let Some(img) = relmap_img { // filenode map for global tablespace let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?; self.ar .append(&header, &img[..]) .await .map_err(|e| BasebackupError::Client(e, "add_dbdir,global/pg_filenode.map"))?; } else { warn!("global/pg_filenode.map is missing"); } } else { // User defined tablespaces are not supported. However, as // a special case, if a tablespace/db directory is // completely empty, we can leave it out altogether. This // makes taking a base backup after the 'tablespace' // regression test pass, because the test drops the // created tablespaces after the tests. // // FIXME: this wouldn't be necessary, if we handled // XLOG_TBLSPC_DROP records. But we probably should just // throw an error on CREATE TABLESPACE in the first place. if !has_relmap_file && self .timeline .list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx) .await? 
.is_empty() { return Ok(()); } // User defined tablespaces are not supported if spcnode != DEFAULTTABLESPACE_OID { return Err(BasebackupError::Server(anyhow!( "spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}" ))); } // Append dir path for each database let path = format!("base/{dbnode}"); let header = new_tar_header_dir(&path)?; self.ar .append(&header, io::empty()) .await .map_err(|e| BasebackupError::Client(e, "add_dbdir,base"))?; if let Some(img) = relmap_img { let dst_path = format!("base/{dbnode}/PG_VERSION"); let pg_version_str = self.timeline.pg_version.versionfile_string(); let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; self.ar .append(&header, pg_version_str.as_bytes()) .await .map_err(|e| BasebackupError::Client(e, "add_dbdir,base/PG_VERSION"))?; let relmap_path = format!("base/{dbnode}/pg_filenode.map"); let header = new_tar_header(&relmap_path, img.len() as u64)?; self.ar .append(&header, &img[..]) .await .map_err(|e| BasebackupError::Client(e, "add_dbdir,base/pg_filenode.map"))?; } }; Ok(()) } // // Extract twophase state files // async fn add_twophase_file(&mut self, xid: u64) -> Result<(), BasebackupError> { let img = self .timeline .get_twophase_file(xid, self.lsn, self.ctx) .await?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); let crc = crc32c::crc32c(&img[..]); buf.put_u32_le(crc); let path = if self.timeline.pg_version < PgMajorVersion::PG17 { format!("pg_twophase/{xid:>08X}") } else { format!("pg_twophase/{xid:>016X}") }; let header = new_tar_header(&path, buf.len() as u64)?; self.ar .append(&header, &buf[..]) .await .map_err(|e| BasebackupError::Client(e, "add_twophase_file"))?; Ok(()) } // // Add generated pg_control file and bootstrap WAL segment. // Also send neon.signal and zenith.signal file with extra bootstrap data. 
// async fn add_pgcontrol_file( &mut self, pg_control_bytes: Bytes, system_identifier: u64, ) -> Result<(), BasebackupError> { // add neon.signal file let mut neon_signal = String::new(); if self.prev_record_lsn == Lsn(0) { if self.timeline.is_ancestor_lsn(self.lsn) { write!(neon_signal, "PREV LSN: none") .map_err(|e| BasebackupError::Server(e.into()))?; } else { write!(neon_signal, "PREV LSN: invalid") .map_err(|e| BasebackupError::Server(e.into()))?; } } else { write!(neon_signal, "PREV LSN: {}", self.prev_record_lsn) .map_err(|e| BasebackupError::Server(e.into()))?; } // TODO: Remove zenith.signal once all historical computes have been replaced // ... and thus support the neon.signal file. for signalfilename in ["neon.signal", "zenith.signal"] { self.ar .append( &new_tar_header(signalfilename, neon_signal.len() as u64)?, neon_signal.as_bytes(), ) .await .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,neon.signal"))?; } //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; self.ar .append(&header, &pg_control_bytes[..]) .await .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,pg_control"))?; //send wal segment let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); let wal_file_name = XLogFileName(PG_TLI, segno, WAL_SEGMENT_SIZE); let wal_file_path = format!("pg_wal/{wal_file_name}"); let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?; let wal_seg = postgres_ffi::generate_wal_segment( segno, system_identifier, self.timeline.pg_version, self.lsn, ) .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; if wal_seg.len() != WAL_SEGMENT_SIZE { return Err(BasebackupError::Server(anyhow!( "wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}", wal_seg.len() ))); } self.ar .append(&header, &wal_seg[..]) .await .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,wal_segment"))?; Ok(()) } } // // Create new tarball entry header // fn new_tar_header(path: &str, size: 
u64) -> anyhow::Result
{ let mut header = Header::new_gnu(); header.set_size(size); header.set_path(path)?; header.set_mode(0b110000000); // -rw------- header.set_mtime( // use currenttime as last modified time SystemTime::now() .duration_since(SystemTime::UNIX_EPOCH) .unwrap() .as_secs(), ); header.set_cksum(); Ok(header) } fn new_tar_header_dir(path: &str) -> anyhow::Result
{ let mut header = Header::new_gnu(); header.set_size(0); header.set_path(path)?; header.set_mode(0o755); // -rw------- header.set_entry_type(EntryType::dir()); header.set_mtime( // use currenttime as last modified time SystemTime::now() .duration_since(SystemTime::UNIX_EPOCH) .unwrap() .as_secs(), ); header.set_cksum(); Ok(header) } ================================================ FILE: pageserver/src/basebackup_cache.rs ================================================ use std::{collections::HashMap, sync::Arc}; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use metrics::core::{AtomicU64, GenericCounter}; use pageserver_api::{config::BasebackupCacheConfig, models::TenantState}; use tokio::{ io::{AsyncWriteExt, BufWriter}, sync::mpsc::{Receiver, Sender, error::TrySendError}, }; use tokio_util::sync::CancellationToken; use utils::{ id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, shard::TenantShardId, }; use crate::{ basebackup::send_basebackup_tarball, context::{DownloadBehavior, RequestContext}, metrics::{ BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE, BASEBACKUP_CACHE_READ, BASEBACKUP_CACHE_SIZE, }, task_mgr::TaskKind, tenant::{ Timeline, mgr::{TenantManager, TenantSlot}, }, }; pub struct BasebackupPrepareRequest { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, pub lsn: Lsn, } pub type BasebackupPrepareSender = Sender; pub type BasebackupPrepareReceiver = Receiver; #[derive(Clone)] struct CacheEntry { /// LSN at which the basebackup was taken. lsn: Lsn, /// Size of the basebackup archive in bytes. size_bytes: u64, } /// BasebackupCache stores cached basebackup archives for timelines on local disk. /// /// The main purpose of this cache is to speed up the startup process of compute nodes /// after scaling to zero. /// Thus, the basebackup is stored only for the latest LSN of the timeline and with /// fixed set of parameters (gzip=true, full_backup=false, replica=false, prev_lsn=none). 
/// /// The cache receives prepare requests through the `BasebackupPrepareSender` channel, /// generates a basebackup from the timeline in the background, and stores it on disk. /// /// Basebackup requests are pretty rare. We expect ~thousands of entries in the cache /// and ~1 RPS for get requests. pub struct BasebackupCache { data_dir: Utf8PathBuf, config: Option, entries: std::sync::Mutex>, prepare_sender: BasebackupPrepareSender, read_hit_count: GenericCounter, read_miss_count: GenericCounter, read_err_count: GenericCounter, prepare_skip_count: GenericCounter, } impl BasebackupCache { /// Create a new BasebackupCache instance. /// Also returns a BasebackupPrepareReceiver which is needed to start /// the background task. /// The cache is initialized from the data_dir in the background task. /// The cache will return `None` for any get requests until the initialization is complete. /// The background task is spawned separately using [`Self::spawn_background_task`] /// to avoid a circular dependency between the cache and the tenant manager. pub fn new( data_dir: Utf8PathBuf, config: Option, ) -> (Arc, BasebackupPrepareReceiver) { let chan_size = config.as_ref().map(|c| c.max_size_entries).unwrap_or(1); let (prepare_sender, prepare_receiver) = tokio::sync::mpsc::channel(chan_size); let cache = Arc::new(BasebackupCache { data_dir, config, entries: std::sync::Mutex::new(HashMap::new()), prepare_sender, read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]), read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]), read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]), prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]), }); (cache, prepare_receiver) } /// Spawns the background task. /// The background task initializes the cache from the disk, /// processes prepare requests, and cleans up outdated cache entries. /// Noop if the cache is disabled (config is None). 
pub fn spawn_background_task( self: Arc, runtime_handle: &tokio::runtime::Handle, prepare_receiver: BasebackupPrepareReceiver, tenant_manager: Arc, cancel: CancellationToken, ) { if let Some(config) = self.config.clone() { let background = BackgroundTask { c: self, config, tenant_manager, cancel, entry_count: 0, total_size_bytes: 0, prepare_ok_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]), prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]), prepare_err_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["error"]), }; runtime_handle.spawn(background.run(prepare_receiver)); } } /// Send a basebackup prepare request to the background task. /// The basebackup will be prepared asynchronously, it does not block the caller. /// The request will be skipped if any cache limits are exceeded. pub fn send_prepare(&self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, lsn: Lsn) { let req = BasebackupPrepareRequest { tenant_shard_id, timeline_id, lsn, }; BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.inc(); let res = self.prepare_sender.try_send(req); if let Err(e) = res { BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec(); self.prepare_skip_count.inc(); match e { TrySendError::Full(_) => { // Basebackup prepares are pretty rare, normally we should not hit this. tracing::info!( tenant_id = %tenant_shard_id.tenant_id, %timeline_id, %lsn, "Basebackup prepare channel is full, skipping the request" ); } TrySendError::Closed(_) => { // Normal during shutdown, not critical. tracing::info!( tenant_id = %tenant_shard_id.tenant_id, %timeline_id, %lsn, "Basebackup prepare channel is closed, skipping the request" ); } } } } /// Gets a basebackup entry from the cache. /// If the entry is found, opens a file with the basebackup archive and returns it. /// The open file descriptor will prevent the file system from deleting the file /// even if the entry is removed from the cache in the background. 
pub async fn get( &self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn, ) -> Option { if !self.is_enabled() { return None; } // Fast path. Check if the entry exists using the in-memory state. let tti = TenantTimelineId::new(tenant_id, timeline_id); if self.entries.lock().unwrap().get(&tti).map(|e| e.lsn) != Some(lsn) { self.read_miss_count.inc(); return None; } let path = self.entry_path(tenant_id, timeline_id, lsn); match tokio::fs::File::open(path).await { Ok(file) => { self.read_hit_count.inc(); Some(file) } Err(e) => { if e.kind() == std::io::ErrorKind::NotFound { // We may end up here if the basebackup was concurrently removed by the cleanup task. self.read_miss_count.inc(); } else { self.read_err_count.inc(); tracing::warn!("Unexpected error opening basebackup cache file: {:?}", e); } None } } } pub fn is_enabled(&self) -> bool { self.config.is_some() } // Private methods. fn entry_filename(tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> String { // The default format for LSN is 0/ABCDEF. // The backslash is not filename friendly, so serialize it as plain hex. let lsn = lsn.0; format!("basebackup_{tenant_id}_{timeline_id}_{lsn:016X}.tar.gz") } fn entry_path(&self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> Utf8PathBuf { self.data_dir .join(Self::entry_filename(tenant_id, timeline_id, lsn)) } } /// The background task that does the job to prepare basebackups /// and manage the cache entries on disk. /// It is a separate struct from BasebackupCache to allow holding /// a mutable reference to this state without a mutex lock, /// while BasebackupCache is referenced by the clients. struct BackgroundTask { c: Arc, config: BasebackupCacheConfig, tenant_manager: Arc, cancel: CancellationToken, /// Number of the entries in the cache. /// This counter is used for metrics and applying cache limits. 
/// It generally should be equal to c.entries.len(), but it's calculated /// pessimistically for abnormal situations: if we encountered some errors /// during removing the entry from disk, we won't decrement this counter to /// make sure that we don't exceed the limit with "trashed" files on the disk. /// It will also count files in the data_dir that are not valid cache entries. entry_count: usize, /// Total size of all the entries on the disk. /// This counter is used for metrics and applying cache limits. /// Similar to entry_count, it is calculated pessimistically for abnormal situations. total_size_bytes: u64, prepare_ok_count: GenericCounter, prepare_skip_count: GenericCounter, prepare_err_count: GenericCounter, } impl BackgroundTask { fn tmp_dir(&self) -> Utf8PathBuf { self.c.data_dir.join("tmp") } fn entry_tmp_path( &self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn, ) -> Utf8PathBuf { self.tmp_dir() .join(BasebackupCache::entry_filename(tenant_id, timeline_id, lsn)) } fn parse_entry_filename(filename: &str) -> Option<(TenantId, TimelineId, Lsn)> { let parts: Vec<&str> = filename .strip_prefix("basebackup_")? .strip_suffix(".tar.gz")? .split('_') .collect(); if parts.len() != 3 { return None; } let tenant_id = parts[0].parse::().ok()?; let timeline_id = parts[1].parse::().ok()?; let lsn = Lsn(u64::from_str_radix(parts[2], 16).ok()?); Some((tenant_id, timeline_id, lsn)) } // Recreate the tmp directory to clear all files in it. async fn clean_tmp_dir(&self) -> anyhow::Result<()> { let tmp_dir = self.tmp_dir(); if tmp_dir.exists() { tokio::fs::remove_dir_all(&tmp_dir).await?; } tokio::fs::create_dir_all(&tmp_dir).await?; Ok(()) } async fn cleanup(&mut self) -> anyhow::Result<()> { self.clean_tmp_dir().await?; // Leave only up-to-date entries. 
let entries_old = self.c.entries.lock().unwrap().clone(); let mut entries_new = HashMap::new(); for (tenant_shard_id, tenant_slot) in self.tenant_manager.list() { if !tenant_shard_id.is_shard_zero() { continue; } let TenantSlot::Attached(tenant) = tenant_slot else { continue; }; let tenant_id = tenant_shard_id.tenant_id; for timeline in tenant.list_timelines() { let tti = TenantTimelineId::new(tenant_id, timeline.timeline_id); if let Some(entry) = entries_old.get(&tti) { if timeline.get_last_record_lsn() <= entry.lsn { entries_new.insert(tti, entry.clone()); } } } } // Try to remove all entries that are not up-to-date. for (&tti, entry) in entries_old.iter() { if !entries_new.contains_key(&tti) { self.try_remove_entry(tti.tenant_id, tti.timeline_id, entry) .await; } } // Note: BackgroundTask is the only writer for self.c.entries, // so it couldn't have been modified concurrently. *self.c.entries.lock().unwrap() = entries_new; Ok(()) } async fn on_startup(&mut self) -> anyhow::Result<()> { // Create data_dir if it does not exist. tokio::fs::create_dir_all(&self.c.data_dir) .await .context("Failed to create basebackup cache data directory")?; self.clean_tmp_dir() .await .context("Failed to clean tmp directory")?; // Read existing entries from the data_dir and add them to in-memory state. let mut entries = HashMap::::new(); let mut dir = tokio::fs::read_dir(&self.c.data_dir).await?; while let Some(dir_entry) = dir.next_entry().await? { let filename = dir_entry.file_name(); if filename == "tmp" { // Skip the tmp directory. continue; } let size_bytes = dir_entry .metadata() .await .map_err(|e| { anyhow::anyhow!("Failed to read metadata for file {:?}: {:?}", filename, e) })? 
.len(); self.entry_count += 1; BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64); self.total_size_bytes += size_bytes; BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes); let parsed = Self::parse_entry_filename(filename.to_string_lossy().as_ref()); let Some((tenant_id, timeline_id, lsn)) = parsed else { tracing::warn!("Invalid basebackup cache file name: {:?}", filename); continue; }; let cur_entry = CacheEntry { lsn, size_bytes }; let tti = TenantTimelineId::new(tenant_id, timeline_id); use std::collections::hash_map::Entry::*; match entries.entry(tti) { Occupied(mut entry) => { let found_entry = entry.get(); // Leave only the latest entry, remove the old one. if cur_entry.lsn < found_entry.lsn { self.try_remove_entry(tenant_id, timeline_id, &cur_entry) .await; } else if cur_entry.lsn > found_entry.lsn { self.try_remove_entry(tenant_id, timeline_id, found_entry) .await; entry.insert(cur_entry); } else { // Two different filenames parsed to the same timline_id and LSN. // Should never happen. return Err(anyhow::anyhow!( "Duplicate basebackup cache entry with the same LSN: {:?}", filename )); } } Vacant(entry) => { entry.insert(cur_entry); } } } *self.c.entries.lock().unwrap() = entries; Ok(()) } async fn run(mut self, mut prepare_receiver: BasebackupPrepareReceiver) { // Panic in the background is a safe fallback. // It will drop receivers and the cache will be effectively disabled. self.on_startup() .await .expect("Failed to initialize basebackup cache"); let mut cleanup_ticker = tokio::time::interval(self.config.cleanup_period); cleanup_ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); loop { tokio::select! 
{ Some(req) = prepare_receiver.recv() => { BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec(); if let Err(err) = self.prepare_basebackup( req.tenant_shard_id, req.timeline_id, req.lsn, ).await { tracing::info!("Failed to prepare basebackup: {:#}", err); self.prepare_err_count.inc(); continue; } } _ = cleanup_ticker.tick() => { self.cleanup().await.unwrap_or_else(|e| { tracing::warn!("Failed to clean up basebackup cache: {:#}", e); }); } _ = self.cancel.cancelled() => { tracing::info!("BasebackupCache background task cancelled"); break; } } } } /// Try to remove an entry from disk. /// The caller is responsible for removing the entry from the in-memory state. /// Updates size counters and corresponding metrics. /// Ignores the filesystem errors as not-so-important, but the size counters /// are not decremented in this case, so the file will continue to be counted /// towards the size limits. async fn try_remove_entry( &mut self, tenant_id: TenantId, timeline_id: TimelineId, entry: &CacheEntry, ) { let entry_path = self.c.entry_path(tenant_id, timeline_id, entry.lsn); match tokio::fs::remove_file(&entry_path).await { Ok(_) => {} Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} Err(e) => { tracing::warn!( "Failed to remove basebackup cache file for tenant {} timeline {} LSN {}: {:#}", tenant_id, timeline_id, entry.lsn, e ); return; } } self.entry_count -= 1; BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64); self.total_size_bytes -= entry.size_bytes; BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes); } /// Insert the cache entry into in-memory state and update the size counters. /// Assumes that the file for the entry already exists on disk. /// If the entry already exists with previous LSN, it will be removed. 
async fn upsert_entry( &mut self, tenant_id: TenantId, timeline_id: TimelineId, entry: CacheEntry, ) { let tti = TenantTimelineId::new(tenant_id, timeline_id); self.entry_count += 1; BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64); self.total_size_bytes += entry.size_bytes; BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes); let old_entry = self.c.entries.lock().unwrap().insert(tti, entry); if let Some(old_entry) = old_entry { self.try_remove_entry(tenant_id, timeline_id, &old_entry) .await; } } /// Prepare a basebackup for the given timeline. /// /// If the basebackup already exists with a higher LSN or the timeline already /// has a higher last_record_lsn, skip the preparation. /// /// The basebackup is prepared in a temporary directory and then moved to the final /// location to make the operation atomic. async fn prepare_basebackup( &mut self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, req_lsn: Lsn, ) -> anyhow::Result<()> { tracing::info!( tenant_id = %tenant_shard_id.tenant_id, %timeline_id, %req_lsn, "Preparing basebackup for timeline", ); let tti = TenantTimelineId::new(tenant_shard_id.tenant_id, timeline_id); // TODO(diko): I don't think we will hit the limit, // but if we do, it makes sense to try to evict oldest entries. 
here if self.entry_count >= self.config.max_size_entries { tracing::info!( %tenant_shard_id, %timeline_id, %req_lsn, "Basebackup cache is full (max_size_entries), skipping basebackup", ); self.prepare_skip_count.inc(); return Ok(()); } if self.total_size_bytes >= self.config.max_total_size_bytes { tracing::info!( %tenant_shard_id, %timeline_id, %req_lsn, "Basebackup cache is full (max_total_size_bytes), skipping basebackup", ); self.prepare_skip_count.inc(); return Ok(()); } { let entries = self.c.entries.lock().unwrap(); if let Some(entry) = entries.get(&tti) { if entry.lsn >= req_lsn { tracing::info!( %timeline_id, %req_lsn, %entry.lsn, "Basebackup entry already exists for timeline with higher LSN, skipping basebackup", ); self.prepare_skip_count.inc(); return Ok(()); } } } let tenant = self .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; let tenant_state = tenant.current_state(); if tenant_state != TenantState::Active { anyhow::bail!( "Tenant {} is not active, current state: {:?}", tenant_shard_id.tenant_id, tenant_state ) } let timeline = tenant.get_timeline(timeline_id, true)?; let last_record_lsn = timeline.get_last_record_lsn(); if last_record_lsn > req_lsn { tracing::info!( %timeline_id, %req_lsn, %last_record_lsn, "Timeline has a higher LSN than the requested one, skipping basebackup", ); self.prepare_skip_count.inc(); return Ok(()); } let entry_tmp_path = self.entry_tmp_path(tenant_shard_id.tenant_id, timeline_id, req_lsn); let res = self .prepare_basebackup_tmp(&entry_tmp_path, &timeline, req_lsn) .await; let entry = match res { Ok(entry) => entry, Err(err) => { tracing::info!("Failed to prepare basebackup tmp file: {:#}", err); // Try to clean up tmp file. If we fail, the background clean up task will take care of it. 
match tokio::fs::remove_file(&entry_tmp_path).await { Ok(_) => {} Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} Err(e) => { tracing::info!("Failed to remove basebackup tmp file: {:?}", e); } } return Err(err); } }; // Move the tmp file to the final location atomically. // The tmp file is fsynced, so it's guaranteed that we will not have a partial file // in the main directory. // It's not necessary to fsync the inode after renaming, because the worst case is that // the rename operation will be rolled back on the disk failure, the entry will disappear // from the main directory, and the entry access will cause a cache miss. let entry_path = self .c .entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn); tokio::fs::rename(&entry_tmp_path, &entry_path).await?; self.upsert_entry(tenant_shard_id.tenant_id, timeline_id, entry) .await; self.prepare_ok_count.inc(); Ok(()) } /// Prepares a basebackup in a temporary file. /// Guarantees that the tmp file is fsynced before returning. async fn prepare_basebackup_tmp( &self, entry_tmp_path: &Utf8Path, timeline: &Arc, req_lsn: Lsn, ) -> anyhow::Result { let ctx = RequestContext::new(TaskKind::BasebackupCache, DownloadBehavior::Download); let ctx = ctx.with_scope_timeline(timeline); let file = tokio::fs::File::create(entry_tmp_path).await?; let mut writer = BufWriter::new(file); // We may receive a request before the WAL record is applied to the timeline. // Wait for the requested LSN to be applied. timeline .wait_lsn( req_lsn, crate::tenant::timeline::WaitLsnWaiter::BaseBackupCache, crate::tenant::timeline::WaitLsnTimeout::Default, &ctx, ) .await?; send_basebackup_tarball( &mut writer, timeline, Some(req_lsn), None, false, false, // Level::Best because compression is not on the hot path of basebackup requests. // The decompression is almost not affected by the compression level. 
Some(async_compression::Level::Best), &ctx, ) .await?; writer.flush().await?; writer.into_inner().sync_all().await?; // TODO(diko): we can count it via Writer wrapper instead of a syscall. let size_bytes = tokio::fs::metadata(entry_tmp_path).await?.len(); Ok(CacheEntry { lsn: req_lsn, size_bytes, }) } } ================================================ FILE: pageserver/src/bin/pageserver.rs ================================================ #![recursion_limit = "300"] //! Main entry point for the Page Server executable. use std::env; use std::env::{VarError, var}; use std::io::Read; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use anyhow::{Context, anyhow}; use camino::Utf8Path; use clap::{Arg, ArgAction, Command}; use http_utils::tls_certs::ReloadingCertificateResolver; use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric}; use metrics::set_build_info_metric; use nix::sys::socket::{setsockopt, sockopt}; use pageserver::basebackup_cache::BasebackupCache; use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields}; use pageserver::controller_upcall_client::StorageControllerUpcallClient; use pageserver::deletion_queue::DeletionQueue; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::feature_resolver::FeatureResolver; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::page_service::GrpcPageServiceHandler; use pageserver::task_mgr::{ BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, }; use pageserver::tenant::{TenantSharedResources, mgr, secondary}; use pageserver::{ CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, MetricsCollectionTask, http, page_cache, page_service, task_mgr, virtual_file, }; use postgres_backend::AuthType; use remote_storage::GenericRemoteStorage; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::*; 
use tracing_utils::OtelGuard; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::crashsafe::syncfs; use utils::logging::TracingErrorLayerEnablement; use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR}; use utils::sentry_init::init_sentry; use utils::{failpoint_support, logging, project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; /// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). /// This adds roughly 3% overhead for allocations on average, which is acceptable considering /// performance-sensitive code will avoid allocations as far as possible anyway. #[allow(non_upper_case_globals)] #[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "pageserver.pid"; const FEATURES: &[&str] = &[ #[cfg(feature = "testing")] "testing", ]; fn version() -> String { format!( "{GIT_VERSION} failpoints: {}, features: {:?}", fail::has_failpoints(), FEATURES, ) } fn main() -> anyhow::Result<()> { let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate())); let arg_matches = cli().get_matches(); if arg_matches.get_flag("enabled-features") { println!("{{\"features\": {FEATURES:?} }}"); return Ok(()); } // Initialize up failpoints support let scenario = failpoint_support::init(); let workdir = arg_matches .get_one::("workdir") .map(Utf8Path::new) .unwrap_or_else(|| Utf8Path::new(".neon")); let workdir = workdir .canonicalize_utf8() .with_context(|| format!("Error opening workdir '{workdir}'"))?; let cfg_file_path = workdir.join("pageserver.toml"); let identity_file_path = workdir.join("identity.toml"); // Set CWD to workdir for non-daemon modes env::set_current_dir(&workdir) .with_context(|| format!("Failed to set application's current dir to 
'{workdir}'"))?; let (conf, ignored) = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?; // Initialize logging. // // It must be initialized before the custom panic hook is installed below. // // Regarding tracing_error enablement: at this time, we only use the // tracing_error crate to debug_assert that log spans contain tenant and timeline ids. // See `debug_assert_current_span_has_tenant_and_timeline_id` in the timeline module let tracing_error_layer_enablement = if cfg!(debug_assertions) { TracingErrorLayerEnablement::EnableWithRustLogFilter } else { TracingErrorLayerEnablement::Disabled }; logging::init( conf.log_format, tracing_error_layer_enablement, logging::Output::Stdout, )?; let otel_enablement = match &conf.tracing { Some(cfg) => tracing_utils::OtelEnablement::Enabled { service_name: "pageserver".to_string(), export_config: (&cfg.export_config).into(), }, None => tracing_utils::OtelEnablement::Disabled, }; let otel_guard = tracing_utils::init_performance_tracing(otel_enablement); if otel_guard.is_some() { info!(?conf.tracing, "starting with OTEL tracing enabled"); } // mind the order required here: 1. logging, 2. panic_hook, 3. sentry. // disarming this hook on pageserver, because we never tear down tracing. logging::replace_panic_hook_with_tracing_panic_hook().forget(); // initialize sentry if SENTRY_DSN is provided let _sentry_guard = init_sentry( Some(GIT_VERSION.into()), &[("node_id", &conf.id.to_string())], ); // Warn about ignored config items; see pageserver_api::config::ConfigToml // doc comment for rationale why we prefer this over serde(deny_unknown_fields). { let ignored_fields::Paths { paths } = &ignored; for path in paths { warn!(?path, "ignoring unknown configuration item"); } } // Log configuration items for feature-flag-like config // (maybe we should automate this with a visitor?). 
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); info!(?conf.validate_wal_contiguity, "starting with WAL contiguity validation"); info!(?conf.page_service_pipelining, "starting with page service pipelining config"); info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config"); // The tenants directory contains all the pageserver local disk state. // Create if not exists and make sure all the contents are durable before proceeding. // Ensuring durability eliminates a whole bug class where we come up after an unclean shutdown. // After unclea shutdown, we don't know if all the filesystem content we can read via syscalls is actually durable or not. // Examples for that: OOM kill, systemd killing us during shutdown, self abort due to unrecoverable IO error. let tenants_path = conf.tenants_path(); { let open = || { nix::dir::Dir::open( tenants_path.as_std_path(), nix::fcntl::OFlag::O_DIRECTORY | nix::fcntl::OFlag::O_RDONLY, nix::sys::stat::Mode::empty(), ) }; let dirfd = match open() { Ok(dirfd) => dirfd, Err(e) => match e { nix::errno::Errno::ENOENT => { utils::crashsafe::create_dir_all(&tenants_path).with_context(|| { format!("Failed to create tenants root dir at '{tenants_path}'") })?; open().context("open tenants dir after creating it")? 
} e => anyhow::bail!(e), }, }; if conf.no_sync { info!("Skipping syncfs on startup"); } else { let started = Instant::now(); syncfs(dirfd)?; let elapsed = started.elapsed(); info!( elapsed_ms = elapsed.as_millis(), "made tenant directory contents durable" ); } } // Basic initialization of things that don't change after startup tracing::info!("Initializing virtual_file..."); virtual_file::init( conf.max_file_descriptors, conf.virtual_file_io_engine, conf.virtual_file_io_mode, if conf.no_sync { virtual_file::SyncMode::UnsafeNoSync } else { virtual_file::SyncMode::Sync }, ); tracing::info!("Initializing page_cache..."); page_cache::init(conf.page_cache_size); start_pageserver(launch_ts, conf, ignored, otel_guard).context("Failed to start pageserver")?; scenario.teardown(); Ok(()) } fn initialize_config( identity_file_path: &Utf8Path, cfg_file_path: &Utf8Path, workdir: &Utf8Path, ) -> anyhow::Result<(&'static PageServerConf, ignored_fields::Paths)> { // The deployment orchestrator writes out an indentity file containing the node id // for all pageservers. This file is the source of truth for the node id. In order // to allow for rolling back pageserver releases, the node id is also included in // the pageserver config that the deployment orchestrator writes to disk for the pageserver. // A rolled back version of the pageserver will get the node id from the pageserver.toml // config file. let identity = match std::fs::File::open(identity_file_path) { Ok(mut f) => { let md = f.metadata().context("stat config file")?; if !md.is_file() { anyhow::bail!( "Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ..." ); } let mut s = String::new(); f.read_to_string(&mut s).context("read identity file")?; toml_edit::de::from_str::(&s)? } Err(e) => { anyhow::bail!( "Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ..." 
); } }; let config_file_contents = std::fs::read_to_string(cfg_file_path).context("read config file from filesystem")?; // Deserialize the config file contents into a ConfigToml. let config_toml: pageserver_api::config::ConfigToml = { let deserializer = toml_edit::de::Deserializer::from_str(&config_file_contents) .context("build toml deserializer")?; let mut path_to_error_track = serde_path_to_error::Track::new(); let deserializer = serde_path_to_error::Deserializer::new(deserializer, &mut path_to_error_track); serde::Deserialize::deserialize(deserializer).context("deserialize config toml")? }; // Find unknown fields by re-serializing the parsed ConfigToml and comparing it to the on-disk file. // Any fields that are only in the on-disk version are unknown. // (The assumption here is that the ConfigToml doesn't to skip_serializing_if.) // (Make sure to read the ConfigToml doc comment on why we only want to warn about, but not fail startup, on unknown fields). let ignored = { let ondisk_toml = config_file_contents .parse::() .context("parse original config as toml document")?; let parsed_toml = toml_edit::ser::to_document(&config_toml) .context("re-serialize config to toml document")?; pageserver::config::ignored_fields::find(ondisk_toml, parsed_toml) }; // Construct the runtime god object (it's called PageServerConf but actually is just global shared state). let conf = PageServerConf::parse_and_validate(identity.id, config_toml, workdir) .context("runtime-validation of config toml")?; let conf = Box::leak(Box::new(conf)); Ok((conf, ignored)) } struct WaitForPhaseResult { timeout_remaining: Duration, skipped: Option, } /// During startup, we apply a timeout to our waits for readiness, to avoid /// stalling the whole service if one Tenant experiences some problem. Each /// phase may consume some of the timeout: this function returns the updated /// timeout for use in the next call. 
async fn wait_for_phase<F>(phase: &str, mut fut: F, timeout: Duration) -> WaitForPhaseResult<F>
where
    F: std::future::Future + Unpin,
{
    let initial_t = Instant::now();
    // Poll the future by mutable reference, so that on timeout it can be
    // handed back to the caller and driven to completion later.
    let skipped = match tokio::time::timeout(timeout, &mut fut).await {
        Ok(_) => None,
        Err(_) => {
            tracing::info!(
                timeout_millis = timeout.as_millis(),
                %phase,
                "Startup phase timed out, proceeding anyway"
            );
            Some(fut)
        }
    };

    WaitForPhaseResult {
        // Clamp at zero if the phase used up the whole budget.
        timeout_remaining: timeout.saturating_sub(initial_t.elapsed()),
        skipped,
    }
}

/// Records the time elapsed since `started_at` in the `STARTUP_DURATION`
/// metric under the `phase` label, and logs it with the `human_phase` text.
fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
    let elapsed = started_at.elapsed();
    let secs = elapsed.as_secs_f64();
    STARTUP_DURATION.with_label_values(&[phase]).set(secs);

    info!(
        elapsed_ms = elapsed.as_millis(),
        "{human_phase} ({secs:.3}s since start)"
    )
}

fn start_pageserver(
    launch_ts: &'static LaunchTimestamp,
    conf: &'static PageServerConf,
    ignored: ignored_fields::Paths,
    otel_guard: Option<OtelGuard>,
) -> anyhow::Result<()> {
    // Monotonic time for later calculating startup duration
    let started_startup_at = Instant::now();

    // Print version and launch timestamp to the log,
    // and expose them as prometheus metrics.
    // A changed version string indicates changed software.
    // A changed launch timestamp indicates a pageserver restart.
info!( "version: {} launch_timestamp: {} build_tag: {}", version(), launch_ts.to_string(), BUILD_TAG, ); info!( "IO buffer alignment: {} bytes", pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT ); set_build_info_metric(GIT_VERSION, BUILD_TAG); set_launch_timestamp_metric(launch_ts); #[cfg(target_os = "linux")] metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap(); metrics::register_internal(Box::new( pageserver::metrics::tokio_epoll_uring::Collector::new(), )) .unwrap(); pageserver::preinitialize_metrics(conf, ignored); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes let failpoints = fail::list(); if !failpoints.is_empty() { info!( "started with failpoints: {}", failpoints .iter() .map(|(name, actions)| format!("{name}={actions}")) .collect::>() .join(";") ) } // Create and lock PID file. This ensures that there cannot be more than one // pageserver process running at the same time. let lock_file_path = conf.workdir.join(PID_FILE_NAME); info!("Claiming pid file at {lock_file_path:?}..."); let lock_file = utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; info!("Claimed pid file at {lock_file_path:?}"); // Ensure that the lock file is held even if the main thread of the process panics. // We need to release the lock file only when the process exits. std::mem::forget(lock_file); // Bind the HTTP, libpq, and gRPC ports early, to error out if they are // already in use. info!( "Starting pageserver http handler on {} with auth {:#?}", conf.listen_http_addr, conf.http_auth_type ); let http_listener = tcp_listener::bind(&conf.listen_http_addr)?; let https_listener = match conf.listen_https_addr.as_ref() { Some(https_addr) => { info!( "Starting pageserver https handler on {https_addr} with auth {:#?}", conf.http_auth_type ); Some(tcp_listener::bind(https_addr)?) 
} None => None, }; info!( "Starting pageserver pg protocol handler on {} with auth {:#?}", conf.listen_pg_addr, conf.pg_auth_type, ); let pageserver_listener = tcp_listener::bind(&conf.listen_pg_addr)?; // Enable SO_KEEPALIVE on the socket, to detect dead connections faster. // These are configured via net.ipv4.tcp_keepalive_* sysctls. // // TODO: also set this on the walreceiver socket, but tokio-postgres doesn't // support enabling keepalives while using the default OS sysctls. setsockopt(&pageserver_listener, sockopt::KeepAlive, &true)?; let mut grpc_listener = None; if let Some(grpc_addr) = &conf.listen_grpc_addr { info!( "Starting pageserver gRPC handler on {grpc_addr} with auth {:#?}", conf.grpc_auth_type ); grpc_listener = Some(tcp_listener::bind(grpc_addr).map_err(|e| anyhow!("{e}"))?); } // Launch broker client // The storage_broker::connect call needs to happen inside a tokio runtime thread. let broker_client = WALRECEIVER_RUNTIME .block_on(async { let tls_config = storage_broker::ClientTlsConfig::new().ca_certificates( conf.ssl_ca_certs .iter() .map(pem::encode) .map(storage_broker::Certificate::from_pem), ); // Note: we do not attempt connecting here (but validate endpoints sanity). 
storage_broker::connect( conf.broker_endpoint.clone(), conf.broker_keepalive_interval, tls_config, ) }) .with_context(|| { format!( "create broker client for uri={:?} keepalive_interval={:?}", &conf.broker_endpoint, conf.broker_keepalive_interval, ) })?; // Initialize authentication for incoming connections let http_auth; let pg_auth; let grpc_auth; if [conf.http_auth_type, conf.pg_auth_type, conf.grpc_auth_type].contains(&AuthType::NeonJWT) { // unwrap is ok because check is performed when creating config, so path is set and exists let key_path = conf.auth_validation_public_key_path.as_ref().unwrap(); info!("Loading public key(s) for verifying JWT tokens from {key_path:?}"); let jwt_auth = JwtAuth::from_key_path(key_path)?; let auth: Arc = Arc::new(SwappableJwtAuth::new(jwt_auth)); http_auth = match conf.http_auth_type { AuthType::Trust => None, AuthType::NeonJWT => Some(auth.clone()), }; pg_auth = match conf.pg_auth_type { AuthType::Trust => None, AuthType::NeonJWT => Some(auth.clone()), }; grpc_auth = match conf.grpc_auth_type { AuthType::Trust => None, AuthType::NeonJWT => Some(auth), }; } else { http_auth = None; pg_auth = None; grpc_auth = None; } let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api { let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new( "main", &conf.ssl_key_file, &conf.ssl_cert_file, conf.ssl_cert_reload_period, ))?; let server_config = rustls::ServerConfig::builder() .with_no_client_auth() .with_cert_resolver(resolver); Some(Arc::new(server_config)) } else { None }; match var("NEON_AUTH_TOKEN") { Ok(v) => { info!("Loaded JWT token for authentication with Safekeeper"); pageserver::config::SAFEKEEPER_AUTH_TOKEN .set(Arc::new(v)) .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?; } Err(VarError::NotPresent) => { info!("No JWT token for authentication with Safekeeper detected"); } Err(e) => return Err(e).with_context( || "Failed to either load to detect 
non-present NEON_AUTH_TOKEN environment variable", ), }; // Top-level cancellation token for the process let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); // Set up remote storage client let remote_storage = BACKGROUND_RUNTIME.block_on(create_remote_storage_client(conf))?; let feature_resolver = create_feature_resolver( conf, shutdown_pageserver.clone(), BACKGROUND_RUNTIME.handle(), )?; // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( remote_storage.clone(), StorageControllerUpcallClient::new(conf, &shutdown_pageserver), conf, ); deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle()); // Up to this point no significant I/O has been done: this should have been fast. Record // duration prior to starting I/O intensive phase of startup. startup_checkpoint(started_startup_at, "initial", "Starting loading tenants"); STARTUP_IS_LOADING.set(1); // Startup staging or optimizing: // // We want to minimize downtime for `page_service` connections, and trying not to overload // BACKGROUND_RUNTIME by doing initial compactions and initial logical sizes at the same time. // // init_done_rx will notify when all initial load operations have completed. // // background_jobs_can_start (same name used to hold off background jobs from starting at // consumer side) will be dropped once we can start the background jobs. Currently it is behind // completing all initial logical size calculations (init_logical_size_done_rx) and a timeout // (background_task_maximum_delay). 
let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel(); let (init_done_tx, init_done_rx) = utils::completion::channel(); let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel(); let order = pageserver::InitializationOrder { initial_tenant_load_remote: Some(init_done_tx), initial_tenant_load: Some(init_remote_done_tx), background_jobs_can_start: background_jobs_barrier.clone(), }; info!(config=?conf.l0_flush, "using l0_flush config"); let l0_flush_global_state = pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone()); // Scan the local 'tenants/' directory and start loading the tenants let (basebackup_cache, basebackup_prepare_receiver) = BasebackupCache::new( conf.basebackup_cache_dir(), conf.basebackup_cache_config.clone(), ); let deletion_queue_client = deletion_queue.new_client(); let background_purges = mgr::BackgroundPurges::default(); let tenant_manager = mgr::init( conf, background_purges.clone(), TenantSharedResources { broker_client: broker_client.clone(), remote_storage: remote_storage.clone(), deletion_queue_client, l0_flush_global_state, basebackup_cache: Arc::clone(&basebackup_cache), feature_resolver: feature_resolver.clone(), }, shutdown_pageserver.clone(), ); let tenant_manager = Arc::new(tenant_manager); BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(tenant_manager.clone(), order))?; basebackup_cache.spawn_background_task( BACKGROUND_RUNTIME.handle(), basebackup_prepare_receiver, Arc::clone(&tenant_manager), shutdown_pageserver.child_token(), ); BACKGROUND_RUNTIME.spawn({ let shutdown_pageserver = shutdown_pageserver.clone(); let drive_init = async move { // NOTE: unlike many futures in pageserver, this one is cancellation-safe let guard = scopeguard::guard_on_success((), |_| { tracing::info!("Cancelled before initial load completed") }); let timeout = conf.background_task_maximum_delay; let init_remote_done = std::pin::pin!(async { init_remote_done_rx.wait().await; 
startup_checkpoint( started_startup_at, "initial_tenant_load_remote", "Remote part of initial load completed", ); }); let WaitForPhaseResult { timeout_remaining: timeout, skipped: init_remote_skipped, } = wait_for_phase("initial_tenant_load_remote", init_remote_done, timeout).await; let init_load_done = std::pin::pin!(async { init_done_rx.wait().await; startup_checkpoint( started_startup_at, "initial_tenant_load", "Initial load completed", ); STARTUP_IS_LOADING.set(0); }); let WaitForPhaseResult { timeout_remaining: _timeout, skipped: init_load_skipped, } = wait_for_phase("initial_tenant_load", init_load_done, timeout).await; // initial logical sizes can now start, as they were waiting on init_done_rx. scopeguard::ScopeGuard::into_inner(guard); // allow background jobs to start: we either completed prior stages, or they reached timeout // and were skipped. It is important that we do not let them block background jobs indefinitely, // because things like consumption metrics for billing are blocked by this barrier. drop(background_jobs_can_start); startup_checkpoint( started_startup_at, "background_jobs_can_start", "Starting background jobs", ); // We are done. If we skipped any phases due to timeout, run them to completion here so that // they will eventually update their startup_checkpoint, and so that we do not declare the // 'complete' stage until all the other stages are really done. let guard = scopeguard::guard_on_success((), |_| { tracing::info!("Cancelled before waiting for skipped phases done") }); if let Some(f) = init_remote_skipped { f.await; } if let Some(f) = init_load_skipped { f.await; } scopeguard::ScopeGuard::into_inner(guard); startup_checkpoint(started_startup_at, "complete", "Startup complete"); }; async move { let mut drive_init = std::pin::pin!(drive_init); // just race these tasks tokio::select! 
{ _ = shutdown_pageserver.cancelled() => {}, _ = &mut drive_init => {}, } } }); let (secondary_controller, secondary_controller_tasks) = secondary::spawn_tasks( tenant_manager.clone(), remote_storage.clone(), background_jobs_barrier.clone(), shutdown_pageserver.clone(), ); // shared state between the disk-usage backed eviction background task and the http endpoint // that allows triggering disk-usage based eviction manually. note that the http endpoint // is still accessible even if background task is not configured as long as remote storage has // been configured. let disk_usage_eviction_state: Arc = Arc::default(); let disk_usage_eviction_task = launch_disk_usage_global_eviction_task( conf, remote_storage.clone(), disk_usage_eviction_state.clone(), tenant_manager.clone(), background_jobs_barrier.clone(), ); // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. let (http_endpoint_listener, https_endpoint_listener) = { let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper let router_state = Arc::new( http::routes::State::new( conf, tenant_manager.clone(), http_auth.clone(), remote_storage.clone(), broker_client.clone(), disk_usage_eviction_state, deletion_queue.new_client(), secondary_controller, feature_resolver.clone(), ) .context("Failed to initialize router state")?, ); let router = http::make_router(router_state, launch_ts, http_auth.clone())? 
.build() .map_err(|err| anyhow!(err))?; let service = Arc::new(http_utils::RequestServiceBuilder::new(router).map_err(|err| anyhow!(err))?); let http_task = { let server = http_utils::server::Server::new(Arc::clone(&service), http_listener, None)?; let cancel = CancellationToken::new(); let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "http endpoint listener", server.serve(cancel.clone()), )); HttpEndpointListener(CancellableTask { task, cancel }) }; let https_task = match https_listener { Some(https_listener) => { let tls_server_config = tls_server_config .clone() .expect("tls_server_config is set earlier if https is enabled"); let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config); let server = http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?; let cancel = CancellationToken::new(); let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "https endpoint listener", server.serve(cancel.clone()), )); Some(HttpsEndpointListener(CancellableTask { task, cancel })) } None => None, }; (http_task, https_task) }; /* BEGIN_HADRON */ let metrics_collection_task = { let cancel = shutdown_pageserver.child_token(); let task = crate::BACKGROUND_RUNTIME.spawn({ let cancel = cancel.clone(); let background_jobs_barrier = background_jobs_barrier.clone(); async move { if conf.force_metric_collection_on_scrape { return; } // first wait until background jobs are cleared to launch. tokio::select! { _ = cancel.cancelled() => { return; }, _ = background_jobs_barrier.wait() => {} }; let mut interval = tokio::time::interval(METRICS_COLLECTION_INTERVAL); loop { tokio::select! 
{ _ = cancel.cancelled() => { tracing::info!("cancelled metrics collection task, exiting..."); break; }, _ = interval.tick() => {} } tokio::task::spawn_blocking(|| { METRICS_COLLECTOR.run_once(true); }); } } }); MetricsCollectionTask(CancellableTask { task, cancel }) }; /* END_HADRON */ let consumption_metrics_tasks = { let cancel = shutdown_pageserver.child_token(); let task = crate::BACKGROUND_RUNTIME.spawn({ let tenant_manager = tenant_manager.clone(); let cancel = cancel.clone(); async move { // first wait until background jobs are cleared to launch. // // this is because we only process active tenants and timelines, and the // Timeline::get_current_logical_size will spawn the logical size calculation, // which will not be rate-limited. tokio::select! { _ = cancel.cancelled() => { return; }, _ = background_jobs_barrier.wait() => {} }; pageserver::consumption_metrics::run(conf, tenant_manager, cancel).await; } }); ConsumptionMetricsTasks(CancellableTask { task, cancel }) }; // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. We created the listener earlier already. let perf_trace_dispatch = otel_guard.as_ref().map(|g| g.dispatch.clone()); let page_service = page_service::spawn( conf, tenant_manager.clone(), pg_auth, perf_trace_dispatch, { let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it pageserver_listener .set_nonblocking(true) .context("set listener to nonblocking")?; tokio::net::TcpListener::from_std(pageserver_listener) .context("create tokio listener")? }, if conf.enable_tls_page_service_api { tls_server_config } else { None }, feature_resolver.clone(), ); // Spawn a Pageserver gRPC server task. It will spawn separate tasks for each request/stream. // It uses a separate compute request Tokio runtime (COMPUTE_REQUEST_RUNTIME). // // NB: this port is exposed to computes. It should only provide services that we're okay with // computes accessing. 
// Internal services should use a separate port.
    let mut page_service_grpc = None;
    if let Some(grpc_listener) = grpc_listener {
        page_service_grpc = Some(GrpcPageServiceHandler::spawn(
            tenant_manager.clone(),
            grpc_auth,
            otel_guard.as_ref().map(|g| g.dispatch.clone()),
            conf.get_vectored_concurrent_io,
            grpc_listener,
        )?);
    }

    // All started up! Now just sit and wait for shutdown signal.
    BACKGROUND_RUNTIME.block_on(async move {
        let signal_token = CancellationToken::new();
        let signal_cancel = signal_token.child_token();

        tokio::spawn(utils::signals::signal_handler(signal_token));

        // Wait for cancellation signal and shut down the pageserver.
        //
        // This cancels the `shutdown_pageserver` cancellation tree. Right now that tree doesn't
        // reach very far, and `task_mgr` is used instead. The plan is to change that over time.
        signal_cancel.cancelled().await;

        shutdown_pageserver.cancel();
        pageserver::shutdown_pageserver(
            http_endpoint_listener,
            https_endpoint_listener,
            page_service,
            page_service_grpc,
            metrics_collection_task,
            consumption_metrics_tasks,
            disk_usage_eviction_task,
            &tenant_manager,
            background_purges,
            deletion_queue.clone(),
            secondary_controller_tasks,
            0,
        )
        .await;
        unreachable!();
    })
}

/// Creates the feature resolver used by the pageserver.
///
/// Thin wrapper: `conf`, the top-level shutdown token and the runtime `handle` are forwarded
/// unchanged to `FeatureResolver::spawn`, and its result is returned as-is.
fn create_feature_resolver(
    conf: &'static PageServerConf,
    shutdown_pageserver: CancellationToken,
    handle: &tokio::runtime::Handle,
) -> anyhow::Result<FeatureResolver> {
    FeatureResolver::spawn(conf, shutdown_pageserver, handle)
}

/// Builds the remote storage client described by `conf.remote_storage_config`.
///
/// # Errors
///
/// Fails if no remote storage is configured, or if `GenericRemoteStorage::from_config`
/// returns an error.
///
/// When `conf.test_remote_failures` is non-zero the returned client is wrapped with
/// `GenericRemoteStorage::unreliable_wrapper`, which simulates failures.
async fn create_remote_storage_client(
    conf: &'static PageServerConf,
) -> anyhow::Result<GenericRemoteStorage> {
    let Some(config) = &conf.remote_storage_config else {
        anyhow::bail!("no remote storage configured, this is a deprecated configuration");
    };

    // Create the client
    let mut remote_storage = GenericRemoteStorage::from_config(config).await?;

    // If `test_remote_failures` is non-zero, wrap the client with a
    // wrapper that simulates failures.
    if conf.test_remote_failures > 0 {
        info!(
            "Simulating remote failures for first {} attempts of each op",
            conf.test_remote_failures
        );
        remote_storage = GenericRemoteStorage::unreliable_wrapper(
            remote_storage,
            conf.test_remote_failures,
            conf.test_remote_failures_probability,
        );
    }

    Ok(remote_storage)
}

/// Defines the command line interface of the pageserver binary:
/// `-D`/`--workdir` and the `--enabled-features` flag.
fn cli() -> Command {
    Command::new("Neon page server")
        .about("Materializes WAL stream to pages and serves them to the postgres")
        .version(version())
        .arg(
            Arg::new("workdir")
                .short('D')
                .long("workdir")
                .help("Working directory for the pageserver"),
        )
        .arg(
            Arg::new("enabled-features")
                .long("enabled-features")
                .action(ArgAction::SetTrue)
                .help("Show enabled compile time features"),
        )
}

/// Lets clap validate the [`cli`] definition.
#[test]
fn verify_cli() {
    cli().debug_assert();
}

================================================
FILE: pageserver/src/bin/test_helper_slow_client_reads.rs
================================================
use std::io::{Read, Write, stdin, stdout};
use std::time::Duration;

use clap::Parser;
use pageserver_api::pagestream_api::{
    PagestreamFeMessage, PagestreamRequest, PagestreamTestRequest,
};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;

/// Positional command line arguments of this test helper.
#[derive(clap::Parser)]
struct Args {
    connstr: String,
    tenant_id: TenantId,
    timeline_id: TimelineId,
}

/// Opens a pagestream to the pageserver and keeps sending `Test` requests without ever
/// reading from the receiving half, until a single send does not complete within 10 seconds.
///
/// It then writes a single `R` byte to stdout and blocks until one byte arrives on stdin
/// before exiting.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let Args {
        connstr,
        tenant_id,
        timeline_id,
    } = Args::parse();

    let client = pageserver_client::page_service::Client::new(connstr).await?;
    let client = client.pagestream(tenant_id, timeline_id).await?;
    // The receiving half is kept alive but deliberately never read from.
    let (mut sender, _receiver) = client.split();

    eprintln!("filling the pipe");
    let mut msg = 0;
    loop {
        msg += 1;
        let fut = sender.send(PagestreamFeMessage::Test(PagestreamTestRequest {
            hdr: PagestreamRequest {
                reqid: 0,
                request_lsn: Lsn(23),
                not_modified_since: Lsn(23),
            },
            batch_key: 42,
            message: format!("message {msg}"),
        }));
        // A send that cannot finish within the timeout is taken as "the pipe is full".
        let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else {
            eprintln!("pipe seems full");
            break;
        };
        let _: () = res?;
    }

    // Tell whoever reads our stdout that the pipe has been filled.
    let n = stdout().write(b"R")?;
    assert_eq!(n, 1);
    stdout().flush()?;

    eprintln!("waiting for signal to tell us to exit");

    let mut buf = [0u8; 1];
    stdin().read_exact(&mut buf)?;

    eprintln!("termination signal received, exiting");

    anyhow::Ok(())
}

================================================
FILE: pageserver/src/config/ignored_fields.rs
================================================
//! Check for fields in the on-disk config file that were ignored when
//! deserializing [`pageserver_api::config::ConfigToml`].
//!
//! This could have been part of the [`pageserver_api::config`] module,
//! but the way we identify unused fields in this module
//! is specific to the format (TOML) and the implementation of the
//! deserialization for that format ([`toml_edit`]).

use std::collections::HashSet;

use itertools::Itertools;

/// Pass in the user-specified config and the re-serialized [`pageserver_api::config::ConfigToml`].
/// The returned [`Paths`] contains the paths to the fields that were ignored by deserialization
/// of the [`pageserver_api::config::ConfigToml`].
pub fn find(user_specified: toml_edit::DocumentMut, reserialized: toml_edit::DocumentMut) -> Paths { let user_specified = paths(user_specified); let reserialized = paths(reserialized); fn paths(doc: toml_edit::DocumentMut) -> HashSet { let mut out = Vec::new(); let mut visitor = PathsVisitor::new(&mut out); visitor.visit_table_like(doc.as_table()); HashSet::from_iter(out) } let mut ignored = HashSet::new(); // O(n) because of HashSet for path in user_specified { if !reserialized.contains(&path) { ignored.insert(path); } } Paths { paths: ignored .into_iter() // sort lexicographically for deterministic output .sorted() .collect(), } } pub struct Paths { pub paths: Vec, } struct PathsVisitor<'a> { stack: Vec, out: &'a mut Vec, } impl<'a> PathsVisitor<'a> { fn new(out: &'a mut Vec) -> Self { Self { stack: Vec::new(), out, } } fn visit_table_like(&mut self, table_like: &dyn toml_edit::TableLike) { for (entry, item) in table_like.iter() { self.stack.push(entry.to_string()); self.visit_item(item); self.stack.pop(); } } fn visit_item(&mut self, item: &toml_edit::Item) { match item { toml_edit::Item::None => (), toml_edit::Item::Value(value) => self.visit_value(value), toml_edit::Item::Table(table) => { self.visit_table_like(table); } toml_edit::Item::ArrayOfTables(array_of_tables) => { for (i, table) in array_of_tables.iter().enumerate() { self.stack.push(format!("[{i}]")); self.visit_table_like(table); self.stack.pop(); } } } } fn visit_value(&mut self, value: &toml_edit::Value) { match value { toml_edit::Value::String(_) | toml_edit::Value::Integer(_) | toml_edit::Value::Float(_) | toml_edit::Value::Boolean(_) | toml_edit::Value::Datetime(_) => self.out.push(self.stack.join(".")), toml_edit::Value::Array(array) => { for (i, value) in array.iter().enumerate() { self.stack.push(format!("[{i}]")); self.visit_value(value); self.stack.pop(); } } toml_edit::Value::InlineTable(inline_table) => self.visit_table_like(inline_table), } } } #[cfg(test)] pub(crate) mod tests { fn 
test_impl(original: &str, parsed: &str, expect: [&str; 1]) { let original: toml_edit::DocumentMut = original.parse().expect("parse original config"); let parsed: toml_edit::DocumentMut = parsed.parse().expect("parse re-serialized config"); let super::Paths { paths: actual } = super::find(original, parsed); assert_eq!(actual, &expect); } #[test] fn top_level() { test_impl( r#" [a] b = 1 c = 2 d = 3 "#, r#" [a] b = 1 c = 2 "#, ["a.d"], ); } #[test] fn nested() { test_impl( r#" [a.b.c] d = 23 "#, r#" [a] e = 42 "#, ["a.b.c.d"], ); } #[test] fn array_of_tables() { test_impl( r#" [[a]] b = 1 c = 2 d = 3 "#, r#" [[a]] b = 1 c = 2 "#, ["a.[0].d"], ); } #[test] fn array() { test_impl( r#" foo = [ {bar = 23} ] "#, r#" foo = [ { blup = 42 }] "#, ["foo.[0].bar"], ); } } ================================================ FILE: pageserver/src/config.rs ================================================ //! Functions for handling page server configuration options //! //! Configuration options can be set in the pageserver.toml configuration //! file, or on the command line. //! See also `settings.md` for better description on every parameter. 
pub mod ignored_fields; use std::env; use std::num::NonZeroUsize; use std::sync::Arc; use std::time::Duration; use anyhow::{Context, ensure}; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use pageserver_api::config::{ DiskUsageEvictionTaskConfig, MaxGetVectoredKeys, MaxVectoredReadBytes, PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PostHogConfig, }; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; use pem::Pem; use postgres_backend::AuthType; use postgres_ffi::PgMajorVersion; use remote_storage::{RemotePath, RemoteStorageConfig}; use reqwest::Url; use storage_broker::Uri; use utils::id::{NodeId, TimelineId}; use utils::logging::{LogFormat, SecretString}; use crate::tenant::storage_layer::inmemory_layer::IndexEntry; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use crate::virtual_file::io_engine; use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, virtual_file}; /// Global state of pageserver. /// /// It's mostly immutable configuration, but some semaphores and the /// like crept in over time and the name stuck. /// /// Instantiated by deserializing `pageserver.toml` into [`pageserver_api::config::ConfigToml`] /// and passing that to [`PageServerConf::parse_and_validate`]. /// /// # Adding a New Field /// /// 1. Add the field to `pageserver_api::config::ConfigToml`. /// 2. Fix compiler errors (exhaustive destructuring will guide you). /// /// For fields that require additional validation or filling in of defaults at runtime, /// check for examples in the [`PageServerConf::parse_and_validate`] method. 
#[derive(Debug, Clone)]
pub struct PageServerConf {
    // NOTE(review): several field types below read as a bare `Option` / `Vec` (and the static
    // after this struct as `OnceCell>`); their generic arguments appear to have been lost when
    // this text was extracted. Restore them from the upstream file before compiling.

    // Identifier of that particular pageserver so e g safekeepers
    // can safely distinguish different pageservers
    pub id: NodeId,

    /// Example (default): 127.0.0.1:64000
    pub listen_pg_addr: String,
    /// Example (default): 127.0.0.1:9898
    pub listen_http_addr: String,
    /// Example: 127.0.0.1:9899
    pub listen_https_addr: Option,
    /// If set, expose a gRPC API on this address.
    /// Example: 127.0.0.1:51051
    ///
    /// EXPERIMENTAL: this protocol is unstable and under active development.
    pub listen_grpc_addr: Option,

    /// Path to a file with certificate's private key for https and gRPC API.
    /// Default: server.key
    pub ssl_key_file: Utf8PathBuf,
    /// Path to a file with a X509 certificate for https and gRPC API.
    /// Default: server.crt
    pub ssl_cert_file: Utf8PathBuf,
    /// Period to reload certificate and private key from files.
    /// Default: 60s.
    pub ssl_cert_reload_period: Duration,
    /// Trusted root CA certificates to use in https APIs in PEM format.
    pub ssl_ca_certs: Vec,

    /// Current availability zone. Used for traffic metrics.
    pub availability_zone: Option,

    // Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
    pub wait_lsn_timeout: Duration,
    // How long to wait for WAL redo to complete.
    pub wal_redo_timeout: Duration,

    pub superuser: String,
    pub locale: String,

    pub page_cache_size: usize,
    pub max_file_descriptors: usize,

    // Repository directory, relative to current working directory.
    // Normally, the page server changes the current working directory
    // to the repository, and 'workdir' is always '.'. But we don't do
    // that during unit testing, because the current directory is global
    // to the process but different unit tests work on different
    // repositories.
    pub workdir: Utf8PathBuf,

    pub pg_distrib_dir: Utf8PathBuf,

    // Authentication
    /// authentication method for the HTTP mgmt API
    pub http_auth_type: AuthType,
    /// authentication method for libpq connections from compute
    pub pg_auth_type: AuthType,
    /// authentication method for gRPC connections from compute
    pub grpc_auth_type: AuthType,
    /// Path to a file or directory containing public key(s) for verifying JWT tokens.
    /// Used for both mgmt and compute auth, if enabled.
    pub auth_validation_public_key_path: Option,

    pub remote_storage_config: Option,

    pub default_tenant_conf: pageserver_api::config::TenantConfigToml,

    /// Storage broker endpoints to connect to.
    pub broker_endpoint: Uri,
    pub broker_keepalive_interval: Duration,

    pub log_format: LogFormat,

    /// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach.
    ///
    /// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
    pub concurrent_tenant_warmup: ConfigurableSemaphore,

    /// Number of concurrent [`TenantShard::gather_size_inputs`](crate::tenant::TenantShard::gather_size_inputs) allowed.
    pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
    /// Limit of concurrent [`TenantShard::gather_size_inputs`] issued by module `eviction_task`.
    /// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
    /// See the comment in `eviction_task` for details.
    ///
    /// [`TenantShard::gather_size_inputs`]: crate::tenant::TenantShard::gather_size_inputs
    pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,

    // How often to collect metrics and send them to the metrics endpoint.
    pub metric_collection_interval: Duration,
    // Endpoint that the collected metrics are sent to.
    // NOTE(review): presumably `None` disables the upload; confirm in `consumption_metrics`.
    pub metric_collection_endpoint: Option,
    pub metric_collection_bucket: Option,
    pub synthetic_size_calculation_interval: Duration,

    pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,

    // The number of allowed failures in remote storage operations.
    pub test_remote_failures: u64,
    // The probability of failure in remote storage operations. Only takes effect when
    // test_remote_failures > 0, because the failure-simulating wrapper is only installed then.
    // Use 100 for 100% failure, 0 for no failure.
    pub test_remote_failures_probability: u64,

    pub ondemand_download_behavior_treat_error_as_warn: bool,

    /// How long will background tasks be delayed at most after initial load of tenants.
    ///
    /// Our largest initialization completions are in the range of 100-200s, so perhaps 10s works
    /// as we now isolate initial loading, initial logical size calculation and background tasks.
    /// Smaller nodes will have background tasks "not running" for this long unless every timeline
    /// has its initial logical size calculated. Not running background tasks for some seconds is
    /// not terrible.
    pub background_task_maximum_delay: Duration,

    pub control_plane_api: Url,

    /// JWT token for use with the control plane API.
    pub control_plane_api_token: Option,

    pub import_pgdata_upcall_api: Option,
    pub import_pgdata_upcall_api_token: Option,
    pub import_pgdata_aws_endpoint_url: Option,

    /// If true, pageserver will make best-effort to operate without a control plane: only
    /// for use in major incidents.
    pub control_plane_emergency_mode: bool,

    /// How many heatmap uploads may be done concurrently: lower values implicitly deprioritize
    /// heatmap uploads vs. other remote storage operations.
    pub heatmap_upload_concurrency: usize,

    /// How many remote storage downloads may be done for secondary tenants concurrently. Implicitly
    /// deprioritises secondary downloads vs. remote storage operations for attached tenants.
    pub secondary_download_concurrency: usize,

    /// Maximum number of WAL records to be ingested and committed at the same time
    pub ingest_batch_size: u64,

    pub virtual_file_io_engine: virtual_file::IoEngineKind,

    pub max_vectored_read_bytes: MaxVectoredReadBytes,

    /// Maximum number of keys to be read in a single get_vectored call.
    pub max_get_vectored_keys: MaxGetVectoredKeys,

    pub image_compression: ImageCompressionAlgorithm,

    /// Whether to offload archived timelines automatically
    pub timeline_offloading: bool,

    /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
    /// is exceeded, we start proactively closing ephemeral layers to limit the total amount
    /// of ephemeral data.
    ///
    /// Setting this to zero disables limits on total ephemeral layer size.
    pub ephemeral_bytes_per_memory_kb: usize,

    pub l0_flush: crate::l0_flush::L0FlushConfig,

    /// Direct IO settings
    pub virtual_file_io_mode: virtual_file::IoMode,

    /// Optionally disable disk syncs (unsafe!)
    pub no_sync: bool,

    pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig,

    pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo,

    /// Enable read path debugging. If enabled, read key errors will print a backtrace of the layer
    /// files read.
    pub enable_read_path_debugging: bool,

    /// Interpreted protocol feature: if enabled, validate that the logical WAL received from
    /// safekeepers does not have gaps.
    pub validate_wal_contiguity: bool,

    /// When set, the previously written to disk heatmap is loaded on tenant attach and used
    /// to avoid clobbering the heatmap from new, cold, attached locations.
    pub load_previous_heatmap: bool,

    /// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline.
    pub generate_unarchival_heatmap: bool,

    pub tracing: Option,

    /// Enable TLS in page service API.
    /// Does not force TLS: the client negotiates TLS usage during the handshake.
    /// Uses key and certificate from ssl_key_file/ssl_cert_file.
    pub enable_tls_page_service_api: bool,

    /// Run in development mode, which disables certain safety checks
    /// such as authentication requirements for HTTP and PostgreSQL APIs.
    /// This is insecure and should only be used in development environments.
    pub dev_mode: bool,

    /// PostHog integration config.
    pub posthog_config: Option,

    pub timeline_import_config: pageserver_api::config::TimelineImportConfig,

    pub basebackup_cache_config: Option,

    /// Defines what is a big tenant for the purpose of image layer generation.
    /// See Timeline::should_check_if_image_layers_required
    pub image_layer_generation_large_timeline_threshold: Option,

    /// Controls whether to collect all metrics on each scrape or to return potentially stale
    /// results.
    pub force_metric_collection_on_scrape: bool,
}

/// Token for authentication to safekeepers
///
/// We do not want to store this in a PageServerConf because the latter may be logged
/// and/or serialized at a whim, while the token is secret. Currently this token is the
/// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in
/// the future, more tokens and auth may arrive for storage broker, completely changing the logic.
/// Hence, we resort to a global variable for now instead of passing the token from the
/// startup code to the connection code through a dozen layers.
///
/// The pageserver startup code sets this once, from the `NEON_AUTH_TOKEN` environment
/// variable, when that variable is present.
pub static SAFEKEEPER_AUTH_TOKEN: OnceCell> = OnceCell::new();

impl PageServerConf {
    //
    // Repository paths, relative to workdir.
// pub fn tenants_path(&self) -> Utf8PathBuf { self.workdir.join(TENANTS_SEGMENT_NAME) } pub fn deletion_prefix(&self) -> Utf8PathBuf { self.workdir.join("deletion") } pub fn metadata_path(&self) -> Utf8PathBuf { self.workdir.join("metadata.json") } pub fn basebackup_cache_dir(&self) -> Utf8PathBuf { self.workdir.join("basebackup_cache") } pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf { // Encode a version in the filename, so that if we ever switch away from JSON we can // increment this. const VERSION: u8 = 1; self.deletion_prefix() .join(format!("{sequence:016x}-{VERSION:02x}.list")) } pub fn deletion_header_path(&self) -> Utf8PathBuf { // Encode a version in the filename, so that if we ever switch away from JSON we can // increment this. const VERSION: u8 = 1; self.deletion_prefix().join(format!("header-{VERSION:02x}")) } pub fn tenant_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { self.tenants_path().join(tenant_shard_id.to_string()) } /// Points to a place in pageserver's local directory, /// where certain tenant's LocationConf be stored. pub(crate) fn tenant_location_config_path( &self, tenant_shard_id: &TenantShardId, ) -> Utf8PathBuf { self.tenant_path(tenant_shard_id) .join(TENANT_LOCATION_CONFIG_NAME) } pub(crate) fn tenant_heatmap_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { self.tenant_path(tenant_shard_id) .join(TENANT_HEATMAP_BASENAME) } pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { self.tenant_path(tenant_shard_id) .join(TIMELINES_SEGMENT_NAME) } pub fn timeline_path( &self, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, ) -> Utf8PathBuf { self.timelines_path(tenant_shard_id) .join(timeline_id.to_string()) } /// Turns storage remote path of a file into its local path. 
pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf { remote_path.with_base(&self.workdir) } // // Postgres distribution paths // pub fn pg_distrib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result { let path = self.pg_distrib_dir.clone(); Ok(path.join(pg_version.v_str())) } pub fn pg_bin_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result { Ok(self.pg_distrib_dir(pg_version)?.join("bin")) } pub fn pg_lib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result { Ok(self.pg_distrib_dir(pg_version)?.join("lib")) } /// Parse a configuration file (pageserver.toml) into a PageServerConf struct, /// validating the input and failing on errors. /// /// This leaves any options not present in the file in the built-in defaults. pub fn parse_and_validate( id: NodeId, config_toml: pageserver_api::config::ConfigToml, workdir: &Utf8Path, ) -> anyhow::Result { let pageserver_api::config::ConfigToml { listen_pg_addr, listen_http_addr, listen_https_addr, listen_grpc_addr, ssl_key_file, ssl_cert_file, ssl_cert_reload_period, ssl_ca_file, availability_zone, wait_lsn_timeout, wal_redo_timeout, superuser, locale, page_cache_size, max_file_descriptors, pg_distrib_dir, http_auth_type, pg_auth_type, grpc_auth_type, auth_validation_public_key_path, remote_storage, broker_endpoint, broker_keepalive_interval, log_format, metric_collection_interval, metric_collection_endpoint, metric_collection_bucket, synthetic_size_calculation_interval, disk_usage_based_eviction, test_remote_failures, test_remote_failures_probability, ondemand_download_behavior_treat_error_as_warn, background_task_maximum_delay, control_plane_api, control_plane_api_token, control_plane_emergency_mode, import_pgdata_upcall_api, import_pgdata_upcall_api_token, import_pgdata_aws_endpoint_url, heatmap_upload_concurrency, secondary_download_concurrency, ingest_batch_size, max_vectored_read_bytes, max_get_vectored_keys, image_compression, timeline_offloading, ephemeral_bytes_per_memory_kb, l0_flush, 
virtual_file_io_mode,
            concurrent_tenant_warmup,
            concurrent_tenant_size_logical_size_queries,
            virtual_file_io_engine,
            tenant_config,
            no_sync,
            page_service_pipelining,
            get_vectored_concurrent_io,
            enable_read_path_debugging,
            validate_wal_contiguity,
            load_previous_heatmap,
            generate_unarchival_heatmap,
            tracing,
            enable_tls_page_service_api,
            dev_mode,
            posthog_config,
            timeline_import_config,
            basebackup_cache_config,
            image_layer_generation_large_timeline_threshold,
            force_metric_collection_on_scrape,
        } = config_toml;

        let mut conf = PageServerConf {
            // ------------------------------------------------------------
            // fields that are already fully validated by the ConfigToml Deserialize impl
            // ------------------------------------------------------------
            listen_pg_addr,
            listen_http_addr,
            listen_https_addr,
            listen_grpc_addr,
            ssl_key_file,
            ssl_cert_file,
            ssl_cert_reload_period,
            availability_zone,
            wait_lsn_timeout,
            wal_redo_timeout,
            superuser,
            locale,
            page_cache_size,
            max_file_descriptors,
            http_auth_type,
            pg_auth_type,
            grpc_auth_type,
            auth_validation_public_key_path,
            remote_storage_config: remote_storage,
            broker_endpoint,
            broker_keepalive_interval,
            log_format,
            metric_collection_interval,
            metric_collection_endpoint,
            metric_collection_bucket,
            synthetic_size_calculation_interval,
            disk_usage_based_eviction,
            test_remote_failures,
            test_remote_failures_probability,
            ondemand_download_behavior_treat_error_as_warn,
            background_task_maximum_delay,
            // Mandatory setting: return an error when it is absent from the config.
            control_plane_api: control_plane_api
                .ok_or_else(|| anyhow::anyhow!("`control_plane_api` must be set"))?,
            control_plane_emergency_mode,
            heatmap_upload_concurrency,
            secondary_download_concurrency,
            ingest_batch_size,
            max_vectored_read_bytes,
            max_get_vectored_keys,
            image_compression,
            timeline_offloading,
            ephemeral_bytes_per_memory_kb,
            import_pgdata_upcall_api,
            // Wrap the token in `SecretString` instead of keeping it as a plain value.
            import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from),
            import_pgdata_aws_endpoint_url,
            page_service_pipelining,
            get_vectored_concurrent_io,
            tracing,
            enable_tls_page_service_api,
            dev_mode,
            timeline_import_config,
            basebackup_cache_config,
            image_layer_generation_large_timeline_threshold,
            force_metric_collection_on_scrape,

            // ------------------------------------------------------------
            // fields that require additional validation or custom handling
            // ------------------------------------------------------------
            workdir: workdir.to_owned(),
            // Falls back to the process' current directory when not configured. This panics
            // (rather than returning an error) if the current directory cannot be determined
            // or is not valid UTF-8.
            pg_distrib_dir: pg_distrib_dir.unwrap_or_else(|| {
                std::env::current_dir()
                    .expect("current_dir() failed")
                    .try_into()
                    .expect("current_dir() is not a valid Utf8Path")
            }),
            control_plane_api_token: control_plane_api_token.map(SecretString::from),
            id,
            default_tenant_conf: tenant_config,
            concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup),
            concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
                concurrent_tenant_size_logical_size_queries,
            ),
            eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
                // re-use `concurrent_tenant_size_logical_size_queries`
                concurrent_tenant_size_logical_size_queries,
            ),
            // An explicitly configured engine wins; otherwise run the feature test and use the
            // engine it reports.
            virtual_file_io_engine: match virtual_file_io_engine {
                Some(v) => v,
                None => match crate::virtual_file::io_engine_feature_test()
                    .context("auto-detect virtual_file_io_engine")?
                {
                    io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise
                    io_engine::FeatureTestResult::Worse { engine, remark } => {
                        // TODO: bubble this up to the caller so we can tracing::warn! it.
                        eprintln!(
                            "auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"
                        );
                        engine
                    }
                },
            },
            l0_flush: l0_flush
                .map(crate::l0_flush::L0FlushConfig::from)
                .unwrap_or_default(),
            virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
            // Optional toggles, with the value used when they are absent from the config.
            no_sync: no_sync.unwrap_or(false),
            enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
            validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false),
            load_previous_heatmap: load_previous_heatmap.unwrap_or(true),
            generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(true),
            // Read the CA file and keep only the PEM blocks tagged "CERTIFICATE"; without a
            // configured file the list is empty. Read and parse errors are propagated as-is.
            ssl_ca_certs: match ssl_ca_file {
                Some(ssl_ca_file) => {
                    let buf = std::fs::read(ssl_ca_file)?;
                    pem::parse_many(&buf)?
                        .into_iter()
                        .filter(|pem| pem.tag() == "CERTIFICATE")
                        .collect()
                }
                None => Vec::new(),
            },
            posthog_config,
        };

        // ------------------------------------------------------------
        // custom validation code that covers more than one field in isolation
        // ------------------------------------------------------------

        // If any of the three APIs uses NeonJWT auth, the public key file must exist. When no
        // path was configured it defaults to `<workdir>/auth_public_key.pem`, and that default
        // is stored back into `conf`.
        if [conf.http_auth_type, conf.pg_auth_type, conf.grpc_auth_type]
            .contains(&AuthType::NeonJWT)
        {
            let auth_validation_public_key_path = conf
                .auth_validation_public_key_path
                .get_or_insert_with(|| workdir.join("auth_public_key.pem"));
            ensure!(
                auth_validation_public_key_path.exists(),
                format!(
                    "Can't find auth_validation_public_key at '{auth_validation_public_key_path}'",
                )
            );
        }

        if let Some(tracing_config) = conf.tracing.as_ref() {
            // The ratio is checked first: non-zero denominator, and numerator <= denominator.
            let ratio = &tracing_config.sampling_ratio;
            ensure!(
                ratio.denominator != 0 && ratio.denominator >= ratio.numerator,
                format!(
                    "Invalid sampling ratio: {}/{}",
                    ratio.numerator, ratio.denominator
                )
            );

            // Then the endpoint: it must parse as a URL and use the http or https scheme.
            let url = Url::parse(&tracing_config.export_config.endpoint)
                .map_err(anyhow::Error::msg)
                .with_context(|| {
                    format!(
                        "tracing endpoint URL is invalid : {}",
                        tracing_config.export_config.endpoint
                    )
                })?;

            ensure!(
                url.scheme() == "http" || url.scheme() == "https",
                format!(
                    "tracing endpoint URL must start with http:// or https://: {}",
                    tracing_config.export_config.endpoint
                )
            );
        }

        // Reject a default tenant config whose checkpoint distance `IndexEntry` does not accept.
        IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
            .map_err(anyhow::Error::msg)
            .with_context(|| {
                format!(
                    "effective checkpoint distance is unsupported: {}",
                    conf.default_tenant_conf.checkpoint_distance
                )
            })?;

        // In pipelined mode, the batch size must not exceed `max_get_vectored_keys`.
        if let PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
            max_batch_size,
            ..
        }) = conf.page_service_pipelining
        {
            if max_batch_size.get() > conf.max_get_vectored_keys.get() {
                return Err(anyhow::anyhow!(
                    "`max_batch_size` ({max_batch_size}) must be less than or equal to `max_get_vectored_keys` ({})",
                    conf.max_get_vectored_keys.get()
                ));
            }
        };

        Ok(conf)
    }

    /// Builds a unique directory path for a test: `<TEST_OUTPUT>/test_<test_name>_<uuid>`, where
    /// `TEST_OUTPUT` is read from the environment and defaults to `../tmp_check`.
    ///
    /// Only the path is computed; nothing is created on disk here.
    #[cfg(test)]
    pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
        let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());

        let test_id = uuid::Uuid::new_v4();
        Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}_{test_id}"))
    }

    /// Builds a `PageServerConf` rooted at `repo_dir` from `ConfigToml` defaults plus the
    /// overrides below, by running it through `parse_and_validate` with `NodeId(0)`.
    ///
    /// Panics if validation fails.
    pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
        // Postgres binaries are looked up in `../pg_install` relative to this crate's manifest.
        let pg_distrib_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");

        let mut config_toml = pageserver_api::config::ConfigToml {
            wait_lsn_timeout: Duration::from_secs(60),
            wal_redo_timeout: Duration::from_secs(60),
            pg_distrib_dir: Some(pg_distrib_dir),
            metric_collection_interval: Duration::from_secs(60),
            synthetic_size_calculation_interval: Duration::from_secs(60),
            background_task_maximum_delay: Duration::ZERO,
            load_previous_heatmap: Some(true),
            generate_unarchival_heatmap: Some(true),
            control_plane_api: Some(Url::parse("http://localhost:6666").unwrap()),
            ..Default::default()
        };

        // Test authors tend to forget about the default 10min initial lease deadline
        // when writing tests, which turns their immediate gc requests via mgmt API
        // into no-ops. Override the binary default here, such that there is no initial
        // lease deadline by default in tests. Tests that care can always override it
        // themselves.
// Cf https://databricks.atlassian.net/browse/LKB-92?focusedCommentId=6722329
        config_toml.tenant_config.lsn_lease_length = Duration::from_secs(0);

        PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
    }
}

/// Serializable record that carries only the pageserver's node id.
#[derive(serde::Deserialize, serde::Serialize)]
pub struct PageserverIdentity {
    pub id: NodeId,
}

/// Configurable semaphore permits setting.
///
/// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty
/// semaphore cannot be distinguished, leading any feature using these to await forever (or until
/// new permits are added).
#[derive(Debug, Clone)]
pub struct ConfigurableSemaphore {
    // The permit count this semaphore was created with; this is what equality compares.
    initial_permits: NonZeroUsize,
    // The live semaphore. Clones of `ConfigurableSemaphore` share it through the `Arc`.
    inner: std::sync::Arc<tokio::sync::Semaphore>,
}

impl ConfigurableSemaphore {
    /// Initialize using a non-zero amount of permits.
    ///
    /// Requires non-zero initial permits, because using permits == 0 is a crude way to disable a
    /// feature such as [`TenantShard::gather_size_inputs`]. Otherwise any future using the
    /// semaphore will behave like [`futures::future::pending`], just waiting until new permits
    /// are added.
    ///
    /// [`TenantShard::gather_size_inputs`]: crate::tenant::TenantShard::gather_size_inputs
    pub fn new(initial_permits: NonZeroUsize) -> Self {
        ConfigurableSemaphore {
            initial_permits,
            inner: std::sync::Arc::new(tokio::sync::Semaphore::new(initial_permits.get())),
        }
    }

    /// Returns the configured amount of permits.
    pub fn initial_permits(&self) -> NonZeroUsize {
        self.initial_permits
    }

    /// Returns the shared semaphore itself.
    pub fn inner(&self) -> &std::sync::Arc<tokio::sync::Semaphore> {
        &self.inner
    }
}

impl PartialEq for ConfigurableSemaphore {
    fn eq(&self, other: &Self) -> bool {
        // the number of permits can be increased at runtime, so we cannot really fulfill the
        // PartialEq value equality otherwise
        self.initial_permits == other.initial_permits
    }
}

impl Eq for ConfigurableSemaphore {}

#[cfg(test)]
mod tests {
    use std::time::Duration;

    use camino::Utf8PathBuf;
    use pageserver_api::config::{DiskUsageEvictionTaskConfig, EvictionOrder};
    use rstest::rstest;
    use utils::{id::NodeId, serde_percent::Percent};

    use super::PageServerConf;

    #[test]
    fn test_minimal_config_toml_is_valid() {
        // The minimal valid config for running a pageserver:
        // - control_plane_api is mandatory, as pageservers cannot run in isolation
        // - we use Default impl of everything else in this situation
        let input = r#"
            control_plane_api = "http://localhost:6666"
        "#;
        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
            .expect("empty config is valid");
        let workdir = Utf8PathBuf::from("/nonexistent");
        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
            .expect("parse_and_validate");
    }

    #[test]
    fn test_config_tracing_endpoint_is_invalid() {
        // The sampling ratio is deliberately valid: `parse_and_validate` checks the ratio before
        // the endpoint, so an invalid ratio would be rejected first and this test would pass
        // without the endpoint check ever running.
        let input = r#"
            control_plane_api = "http://localhost:6666"

            [tracing]

            sampling_ratio = { numerator = 1, denominator = 10 }

            [tracing.export_config]
            endpoint = "localhost:4317"
            protocol = "http-binary"
            timeout = "1ms"
        "#;
        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
            .expect("config has valid fields");
        let workdir = Utf8PathBuf::from("/nonexistent");
        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
            .expect_err("parse_and_validate should fail for endpoint without scheme");
    }

    #[test]
    fn test_config_tracing_sampling_ratio_is_invalid() {
        // Counterpart of the endpoint test: the endpoint is valid, the zero denominator is not.
        let input = r#"
            control_plane_api = "http://localhost:6666"

            [tracing]

            sampling_ratio = { numerator = 1, denominator = 0 }

            [tracing.export_config]
            endpoint = "http://localhost:4317"
            protocol = "http-binary"
            timeout = "1ms"
        "#;
        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
            .expect("config has valid fields");
        let workdir = Utf8PathBuf::from("/nonexistent");
        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
            .expect_err("parse_and_validate should fail for a zero denominator");
    }

    #[rstest]
    #[case(32, 32, true)]
    #[case(64, 32, false)]
    #[case(64, 64, true)]
    #[case(128, 128, true)]
    fn test_config_max_batch_size_is_valid(
        #[case] max_batch_size: usize,
        #[case] max_get_vectored_keys:
usize,
        #[case] is_valid: bool,
    ) {
        // Pipelined mode with the given batch size; validation must accept the config exactly
        // when `max_batch_size <= max_get_vectored_keys` (see the `#[case]` table).
        let input = format!(
            r#"
            control_plane_api = "http://localhost:6666"
            max_get_vectored_keys = {max_get_vectored_keys}
            page_service_pipelining = {{ mode="pipelined", execution="concurrent-futures", max_batch_size={max_batch_size}, batching="uniform-lsn" }}
        "#,
        );
        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(&input)
            .expect("config has valid fields");
        let workdir = Utf8PathBuf::from("/nonexistent");
        let result = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir);
        assert_eq!(result.is_ok(), is_valid);
    }

    #[test]
    fn test_config_posthog_config_is_valid() {
        // A `[posthog_config]` table with every key set must deserialize and validate.
        let input = r#"
            control_plane_api = "http://localhost:6666"

            [posthog_config]
            server_api_key = "phs_AAA"
            client_api_key = "phc_BBB"
            project_id = "000"
            private_api_url = "https://us.posthog.com"
            public_api_url = "https://us.i.posthog.com"
        "#;
        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
            .expect("posthogconfig is valid");
        let workdir = Utf8PathBuf::from("/nonexistent");
        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
            .expect("parse_and_validate");
    }

    #[test]
    fn test_config_posthog_incomplete_config_is_valid() {
        // Same as above but without `client_api_key` and `project_id`; this must still be
        // accepted.
        let input = r#"
            control_plane_api = "http://localhost:6666"

            [posthog_config]
            server_api_key = "phs_AAA"
            private_api_url = "https://us.posthog.com"
            public_api_url = "https://us.i.posthog.com"
        "#;
        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
            .expect("posthogconfig is valid");
        let workdir = Utf8PathBuf::from("/nonexistent");
        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
            .expect("parse_and_validate");
    }

    #[rstest]
    #[
        case::omit_the_whole_config(
            DiskUsageEvictionTaskConfig {
                max_usage_pct: Percent::new(80).unwrap(),
                min_avail_bytes: 2_000_000_000,
                period: Duration::from_secs(60),
                eviction_order: Default::default(),
                #[cfg(feature = "testing")]
                mock_statvfs: None,
                enabled: true,
            },
            r#"
                control_plane_api = "http://localhost:6666"
            "#,
        )]
    #[
        case::omit_enabled_field(
            DiskUsageEvictionTaskConfig {
                max_usage_pct:
Percent::new(80).unwrap(),
                min_avail_bytes: 1_000_000_000,
                period: Duration::from_secs(60),
                eviction_order: EvictionOrder::RelativeAccessed {
                    highest_layer_count_loses_first: true,
                },
                #[cfg(feature = "testing")]
                mock_statvfs: None,
                enabled: true,
            },
            r#"
                control_plane_api = "http://localhost:6666"
                disk_usage_based_eviction = { max_usage_pct = 80, min_avail_bytes = 1000000000, period = "60s" }
            "#,
        )]
    #[case::disabled(
            DiskUsageEvictionTaskConfig {
                max_usage_pct: Percent::new(80).unwrap(),
                min_avail_bytes: 2_000_000_000,
                period: Duration::from_secs(60),
                eviction_order: EvictionOrder::RelativeAccessed {
                    highest_layer_count_loses_first: true,
                },
                #[cfg(feature = "testing")]
                mock_statvfs: None,
                enabled: false,
            },
            r#"
                control_plane_api = "http://localhost:6666"
                disk_usage_based_eviction = { enabled = false }
            "#
        )]
    fn test_config_disk_usage_based_eviction_is_valid(
        #[case] expected_disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
        #[case] input: &str,
    ) {
        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
            .expect("disk_usage_based_eviction is valid");
        let workdir = Utf8PathBuf::from("/nonexistent");
        let config = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir).unwrap();
        let disk_usage_based_eviction = config.disk_usage_based_eviction;
        assert_eq!(
            expected_disk_usage_based_eviction,
            disk_usage_based_eviction
        );
    }
}

================================================
FILE: pageserver/src/consumption_metrics/disk_cache.rs
================================================
use std::sync::Arc;

use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};

use super::{NewMetricsRoot, NewRawMetric, RawMetric};
use crate::consumption_metrics::NewMetricsRefRoot;

/// Decodes cached metrics from an already-parsed JSON document.
///
/// Two layouts are accepted. If `NewMetricsRoot::is_v2_metrics` recognises the value, it is
/// decoded as a `NewMetricsRoot` and its `metrics` are returned. Otherwise it is decoded as a
/// list of `RawMetric` tuples `(key, (event_type, value))`, each converted to a `NewRawMetric`.
///
/// Returns the deserialization error if the value matches neither layout.
pub(super) fn read_metrics_from_serde_value(
    json_value: serde_json::Value,
) -> anyhow::Result<Vec<NewRawMetric>> {
    if NewMetricsRoot::is_v2_metrics(&json_value) {
        let root = serde_json::from_value::<NewMetricsRoot>(json_value)?;
        Ok(root.metrics)
    } else {
        let all_metrics = serde_json::from_value::<Vec<RawMetric>>(json_value)?;
        let all_metrics = all_metrics
            .into_iter()
            .map(|(key, (event_type, value))| NewRawMetric {
                key,
                kind: event_type,
                value,
            })
            .collect();
        Ok(all_metrics)
    }
}

/// Reads the cached metrics stored at `path`, on a blocking thread.
///
/// Before reading, leftover files in the same directory whose name starts with the file name of
/// `path` are removed on a best-effort basis; a failure there is only logged.
///
/// Errors from opening, parsing or decoding the file are returned without extra context, as is
/// a failure to join the blocking task (with the context "read metrics join error").
pub(super) async fn read_metrics_from_disk(
    path: Arc<Utf8PathBuf>,
) -> anyhow::Result<Vec<NewRawMetric>> {
    // do not add context to each error, callsite will log with full path
    let span = tracing::Span::current();
    tokio::task::spawn_blocking(move || {
        let _e = span.entered();

        if let Some(parent) = path.parent() {
            if let Err(e) = scan_and_delete_with_same_prefix(&path) {
                tracing::info!("failed to cleanup temporary files in {parent:?}: {e:#}");
            }
        }

        let file = std::fs::File::open(&*path)?;
        let reader = std::io::BufReader::new(file);
        let json_value = serde_json::from_reader::<_, serde_json::Value>(reader)?;
        read_metrics_from_serde_value(json_value)
    })
    .await
    .context("read metrics join error")
    // flatten: the outer layer is the join result, the inner one the result of the read itself
    .and_then(|x| x)
}

/// Removes every regular file in the parent directory of `path` whose name starts with the file
/// name of `path`, except `path` itself.
///
/// A failure to remove an individual file is logged and does not stop the scan. Errors from
/// listing the directory or reading an entry's metadata are returned. A `path` without a parent
/// or without a file name yields an `InvalidInput` error instead of a panic, because not every
/// caller has checked both.
fn scan_and_delete_with_same_prefix(path: &Utf8Path) -> std::io::Result<()> {
    let (parent, prefix) = match (path.parent(), path.file_name()) {
        (Some(parent), Some(file_name)) => (parent, file_name),
        _ => {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                format!("path must have a parent and a file name: {path:?}"),
            ));
        }
    };

    for entry in std::fs::read_dir(parent)? {
        let entry = entry?;
        if !entry.metadata()?.is_file() {
            continue;
        }

        let os_name = entry.file_name();

        if prefix == os_name {
            // do not remove our actual file
            continue;
        }

        let file_name = os_name.to_string_lossy();

        if !file_name.starts_with(prefix) {
            continue;
        }

        if let Err(e) = std::fs::remove_file(entry.path()) {
            tracing::warn!("cleaning up old tempfile {file_name:?} failed: {e:#}");
        } else {
            tracing::info!("cleaned up old tempfile {file_name:?}");
        }
    }
    Ok(())
}

/// Writes `current_metrics` to `path` as JSON, on a blocking thread.
///
/// The data goes to a temporary file in the same directory (named with the target's file name
/// as prefix and a `.tmp` suffix), is synced, and is then persisted over `path`. Finally the
/// parent directory is synced.
///
/// Fails if `path` has no parent or no file name, or if any step of writing fails; errors carry
/// the target path as context.
pub(super) async fn flush_metrics_to_disk(
    current_metrics: &Arc<Vec<NewRawMetric>>,
    path: &Arc<Utf8PathBuf>,
) -> anyhow::Result<()> {
    use std::io::Write;

    anyhow::ensure!(path.parent().is_some(), "path must have parent: {path:?}");
    anyhow::ensure!(
        path.file_name().is_some(),
        "path must have filename: {path:?}"
    );

    let span = tracing::Span::current();
    tokio::task::spawn_blocking({
        let current_metrics = current_metrics.clone();
        let path = path.clone();
        move || {
            let _e = span.entered();

            let parent = path.parent().expect("existence checked");
            let file_name = path.file_name().expect("existence checked");
            let mut tempfile = camino_tempfile::Builder::new()
                .prefix(file_name)
                .suffix(".tmp")
                .tempfile_in(parent)?;

            tracing::debug!("using tempfile {:?}", tempfile.path());

            // write out all of the raw metrics, to be read out later on restart as cached values
            {
                let mut writer = std::io::BufWriter::new(&mut tempfile);
                serde_json::to_writer(
                    &mut writer,
                    &NewMetricsRefRoot::new(current_metrics.as_ref()),
                )
                .context("serialize metrics")?;
                // `into_inner` flushes the buffer; keep the I/O error that made the flush fail
                // as the cause instead of throwing it away.
                writer.into_inner().map_err(|e| {
                    anyhow::Error::new(e.into_error()).context("flushing metrics failed")
                })?;
            }

            tempfile.flush()?;
            tempfile.as_file().sync_all()?;

            fail::fail_point!("before-persist-last-metrics-collected");

            drop(tempfile.persist(&*path).map_err(|e| e.error)?);

            // sync the directory as well, so that the new directory entry is flushed too
            let f = std::fs::File::open(parent)?;
            f.sync_all()?;

            anyhow::Ok(())
        }
    })
    .await
    .with_context(|| format!("write metrics to {path:?} join error"))
    .and_then(|x| x.with_context(|| format!("write metrics to {path:?}")))
}

================================================
FILE: pageserver/src/consumption_metrics/metrics/tests.rs
================================================
use std::collections::HashMap;

use super::*;
use crate::consumption_metrics::RawMetric;

#[test]
fn startup_collected_timeline_metrics_before_advancing() {
    let tenant_id = TenantId::generate();
    let timeline_id = TimelineId::generate();

    let mut metrics = Vec::new();
    let cache = HashMap::new();

    let initdb_lsn = Lsn(0x10000);
    let pitr_cutoff = Lsn(0x11000);
    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
    let logical_size = 0x42000;

    let snap = TimelineSnapshot {
        loaded_at: (disk_consistent_lsn, SystemTime::now()),
        last_record_lsn: disk_consistent_lsn,
        ancestor_lsn: Lsn(0),
        current_exact_logical_size: Some(logical_size),
        pitr_enabled: true,
        pitr_cutoff: Some(pitr_cutoff),
    };

    let now = DateTime::::from(SystemTime::now());

    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics,
&cache);

    // With an empty cache, the delta event starts at the snapshot's `loaded_at` time.
    assert_eq!(
        metrics,
        &[
            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
                snap.loaded_at.1.into(),
                now,
                0
            ),
            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
            MetricsKey::written_size_since_parent(tenant_id, timeline_id)
                .at(now, disk_consistent_lsn.0),
            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id)
                .at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
        ]
    );
}

// Second collection round: the cache holds a `written_size` recorded at `before`, and the
// expected delta event runs from `before` to `now`.
//
// NOTE(review): `DateTime::::from` below (and in the tests that follow) is missing its type
// argument in this copy of the file; it looks like it was lost in extraction. Restore it from
// the canonical source.
#[test]
fn startup_collected_timeline_metrics_second_round() {
    let tenant_id = TenantId::generate();
    let timeline_id = TimelineId::generate();

    let [now, before, init] = time_backwards();

    let now = DateTime::::from(now);
    let before = DateTime::::from(before);

    let initdb_lsn = Lsn(0x10000);
    let pitr_cutoff = Lsn(0x11000);
    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
    let logical_size = 0x42000;

    let mut metrics = Vec::new();
    let cache = HashMap::from([MetricsKey::written_size(tenant_id, timeline_id)
        .at(before, disk_consistent_lsn.0)
        .to_kv_pair()]);

    let snap = TimelineSnapshot {
        loaded_at: (disk_consistent_lsn, init),
        last_record_lsn: disk_consistent_lsn,
        ancestor_lsn: Lsn(0),
        current_exact_logical_size: Some(logical_size),
        pitr_enabled: true,
        pitr_cutoff: Some(pitr_cutoff),
    };

    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);

    assert_eq!(
        metrics,
        &[
            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
            MetricsKey::written_size_since_parent(tenant_id, timeline_id)
                .at(now, disk_consistent_lsn.0),
            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id)
                .at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
        ]
    );
}

// Later collection round at an unchanged LSN: the cache also holds a previous delta event that
// ended at `just_before`, and the expected new delta event starts there.
#[test]
fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
    let tenant_id = TenantId::generate();
    let timeline_id = TimelineId::generate();

    let [now, just_before, before, init] = time_backwards();

    let now = DateTime::::from(now);
    let just_before = DateTime::::from(just_before);
    let before = DateTime::::from(before);

    let initdb_lsn = Lsn(0x10000);
    let pitr_cutoff = Lsn(0x11000);
    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
    let logical_size = 0x42000;

    let mut metrics = Vec::new();
    let cache = HashMap::from([
        // at t=before was the last time the last_record_lsn changed
        MetricsKey::written_size(tenant_id, timeline_id)
            .at(before, disk_consistent_lsn.0)
            .to_kv_pair(),
        // end time of this event is used for the next ones
        MetricsKey::written_size_delta(tenant_id, timeline_id)
            .from_until(before, just_before, 0)
            .to_kv_pair(),
    ]);

    let snap = TimelineSnapshot {
        loaded_at: (disk_consistent_lsn, init),
        last_record_lsn: disk_consistent_lsn,
        ancestor_lsn: Lsn(0),
        current_exact_logical_size: Some(logical_size),
        pitr_enabled: true,
        pitr_cutoff: Some(pitr_cutoff),
    };

    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);

    assert_eq!(
        metrics,
        &[
            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0),
            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
            MetricsKey::written_size_since_parent(tenant_id, timeline_id)
                .at(now, disk_consistent_lsn.0),
            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id)
                .at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
        ]
    );
}

/// Tests that written sizes do not regress across restarts.
///
/// The cache holds a written size of 100 from before the restart while the snapshot's
/// `last_record_lsn` is only 50; the expected events keep reporting 100.
#[test]
fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
    let tenant_id = TenantId::generate();
    let timeline_id = TimelineId::generate();

    let [later, now, at_restart] = time_backwards();

    // FIXME: tests would be so much easier if we did not need to juggle back and forth
    // SystemTime and DateTime:: ... Could do the conversion only at upload time?
    let now = DateTime::::from(now);
    let later = DateTime::::from(later);
    let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
    let way_before = before_restart - std::time::Duration::from_secs(10 * 60);
    let before_restart = DateTime::::from(before_restart);
    let way_before = DateTime::::from(way_before);

    let snap = TimelineSnapshot {
        loaded_at: (Lsn(50), at_restart),
        last_record_lsn: Lsn(50),
        ancestor_lsn: Lsn(0),
        current_exact_logical_size: None,
        pitr_enabled: true,
        pitr_cutoff: Some(Lsn(20)),
    };

    let mut cache = HashMap::from([
        MetricsKey::written_size(tenant_id, timeline_id)
            .at(before_restart, 100)
            .to_kv_pair(),
        MetricsKey::written_size_delta(tenant_id, timeline_id)
            .from_until(
                way_before,
                before_restart,
                // not taken into account, but the timestamps are important
                999_999_999,
            )
            .to_kv_pair(),
    ]);

    let mut metrics = Vec::new();
    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);

    // No logical size event is expected, as `current_exact_logical_size` is `None`.
    assert_eq!(
        metrics,
        &[
            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
                before_restart,
                now,
                0
            ),
            MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
            MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 100),
            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 80),
        ]
    );

    // now if we cache these metrics, and re-run while "still in recovery"
    cache.extend(metrics.drain(..).map(|x| x.to_kv_pair()));

    // "still in recovery", because our snapshot did not change
    snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache);

    assert_eq!(
        metrics,
        &[
            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
            MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
            MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 100),
            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 80),
        ]
    );
}

/// Tests that written sizes do not regress across restarts, even on child branches.
#[test] fn post_restart_written_sizes_with_rolled_back_last_record_lsn_and_ancestor_lsn() { let tenant_id = TenantId::generate(); let timeline_id = TimelineId::generate(); let [later, now, at_restart] = time_backwards(); // FIXME: tests would be so much easier if we did not need to juggle back and forth // SystemTime and DateTime:: ... Could do the conversion only at upload time? let now = DateTime::::from(now); let later = DateTime::::from(later); let before_restart = at_restart - std::time::Duration::from_secs(5 * 60); let way_before = before_restart - std::time::Duration::from_secs(10 * 60); let before_restart = DateTime::::from(before_restart); let way_before = DateTime::::from(way_before); let snap = TimelineSnapshot { loaded_at: (Lsn(50), at_restart), last_record_lsn: Lsn(50), ancestor_lsn: Lsn(40), current_exact_logical_size: None, pitr_enabled: true, pitr_cutoff: Some(Lsn(20)), }; let mut cache = HashMap::from([ MetricsKey::written_size(tenant_id, timeline_id) .at(before_restart, 100) .to_kv_pair(), MetricsKey::written_size_delta(tenant_id, timeline_id) .from_until( way_before, before_restart, // not taken into account, but the timestamps are important 999_999_999, ) .to_kv_pair(), ]); let mut metrics = Vec::new(); snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); assert_eq!( metrics, &[ MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( before_restart, now, 0 ), MetricsKey::written_size(tenant_id, timeline_id).at(now, 100), MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 60), MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 60), ] ); // now if we cache these metrics, and re-run while "still in recovery" cache.extend(metrics.drain(..).map(|x| x.to_kv_pair())); // "still in recovery", because our snapshot did not change snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache); assert_eq!( metrics, &[ MetricsKey::written_size_delta(tenant_id, 
timeline_id).from_until(now, later, 0), MetricsKey::written_size(tenant_id, timeline_id).at(later, 100), MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 60), MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 60), ] ); } /// Tests that written sizes do not regress across restarts, even on child branches and /// with a PITR cutoff after the branch point. #[test] fn post_restart_written_sizes_with_rolled_back_last_record_lsn_and_ancestor_lsn_and_pitr_cutoff() { let tenant_id = TenantId::generate(); let timeline_id = TimelineId::generate(); let [later, now, at_restart] = time_backwards(); // FIXME: tests would be so much easier if we did not need to juggle back and forth // SystemTime and DateTime:: ... Could do the conversion only at upload time? let now = DateTime::::from(now); let later = DateTime::::from(later); let before_restart = at_restart - std::time::Duration::from_secs(5 * 60); let way_before = before_restart - std::time::Duration::from_secs(10 * 60); let before_restart = DateTime::::from(before_restart); let way_before = DateTime::::from(way_before); let snap = TimelineSnapshot { loaded_at: (Lsn(50), at_restart), last_record_lsn: Lsn(50), ancestor_lsn: Lsn(30), current_exact_logical_size: None, pitr_enabled: true, pitr_cutoff: Some(Lsn(40)), }; let mut cache = HashMap::from([ MetricsKey::written_size(tenant_id, timeline_id) .at(before_restart, 100) .to_kv_pair(), MetricsKey::written_size_delta(tenant_id, timeline_id) .from_until( way_before, before_restart, // not taken into account, but the timestamps are important 999_999_999, ) .to_kv_pair(), ]); let mut metrics = Vec::new(); snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); assert_eq!( metrics, &[ MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( before_restart, now, 0 ), MetricsKey::written_size(tenant_id, timeline_id).at(now, 100), MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 70), 
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 60), ] ); // now if we cache these metrics, and re-run while "still in recovery" cache.extend(metrics.drain(..).map(|x| x.to_kv_pair())); // "still in recovery", because our snapshot did not change snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache); assert_eq!( metrics, &[ MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0), MetricsKey::written_size(tenant_id, timeline_id).at(later, 100), MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 70), MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 60), ] ); } #[test] fn post_restart_current_exact_logical_size_uses_cached() { let tenant_id = TenantId::generate(); let timeline_id = TimelineId::generate(); let [now, at_restart] = time_backwards(); let now = DateTime::::from(now); let before_restart = at_restart - std::time::Duration::from_secs(5 * 60); let before_restart = DateTime::::from(before_restart); let snap = TimelineSnapshot { loaded_at: (Lsn(50), at_restart), last_record_lsn: Lsn(50), ancestor_lsn: Lsn(0), current_exact_logical_size: None, pitr_enabled: true, pitr_cutoff: None, }; let cache = HashMap::from([MetricsKey::timeline_logical_size(tenant_id, timeline_id) .at(before_restart, 100) .to_kv_pair()]); let mut metrics = Vec::new(); snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); metrics.retain(|item| item.key.metric == Name::LogicalSize); assert_eq!( metrics, &[MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 100)] ); } #[test] fn post_restart_synthetic_size_uses_cached_if_available() { let tenant_id = TenantId::generate(); let ts = TenantSnapshot { remote_size: 1000, // not yet calculated synthetic_size: 0, }; let now = SystemTime::now(); let before_restart = DateTime::::from(now - std::time::Duration::from_secs(5 * 60)); let now = DateTime::::from(now); let cached = 
HashMap::from([MetricsKey::synthetic_size(tenant_id) .at(before_restart, 1000) .to_kv_pair()]); let mut metrics = Vec::new(); ts.to_metrics(tenant_id, now, &cached, &mut metrics); assert_eq!( metrics, &[ MetricsKey::remote_storage_size(tenant_id).at(now, 1000), MetricsKey::synthetic_size(tenant_id).at(now, 1000), ] ); } #[test] fn post_restart_synthetic_size_is_not_sent_when_not_cached() { let tenant_id = TenantId::generate(); let ts = TenantSnapshot { remote_size: 1000, // not yet calculated synthetic_size: 0, }; let now = SystemTime::now(); let now = DateTime::::from(now); let cached = HashMap::new(); let mut metrics = Vec::new(); ts.to_metrics(tenant_id, now, &cached, &mut metrics); assert_eq!( metrics, &[ MetricsKey::remote_storage_size(tenant_id).at(now, 1000), // no synthetic size here ] ); } fn time_backwards() -> [std::time::SystemTime; N] { let mut times = [std::time::SystemTime::UNIX_EPOCH; N]; times[0] = std::time::SystemTime::now(); for behind in 1..N { times[behind] = times[0] - std::time::Duration::from_secs(behind as u64); } times } /// Tests that disabled PITR history does not yield any history size, even when the PITR cutoff /// indicates otherwise. 
#[test] fn pitr_disabled_yields_no_history_size() { let tenant_id = TenantId::generate(); let timeline_id = TimelineId::generate(); let mut metrics = Vec::new(); let cache = HashMap::new(); let initdb_lsn = Lsn(0x10000); let pitr_cutoff = Lsn(0x11000); let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); let snap = TimelineSnapshot { loaded_at: (disk_consistent_lsn, SystemTime::now()), last_record_lsn: disk_consistent_lsn, ancestor_lsn: Lsn(0), current_exact_logical_size: None, pitr_enabled: false, pitr_cutoff: Some(pitr_cutoff), }; let now = DateTime::::from(SystemTime::now()); snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); assert_eq!( metrics, &[ MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( snap.loaded_at.1.into(), now, 0 ), MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), MetricsKey::written_size_since_parent(tenant_id, timeline_id) .at(now, disk_consistent_lsn.0), MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 0), ] ); } /// Tests that uninitialized PITR cutoff does not emit any history size metric at all. 
#[test] fn pitr_uninitialized_does_not_emit_history_size() { let tenant_id = TenantId::generate(); let timeline_id = TimelineId::generate(); let mut metrics = Vec::new(); let cache = HashMap::new(); let initdb_lsn = Lsn(0x10000); let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); let snap = TimelineSnapshot { loaded_at: (disk_consistent_lsn, SystemTime::now()), last_record_lsn: disk_consistent_lsn, ancestor_lsn: Lsn(0), current_exact_logical_size: None, pitr_enabled: true, pitr_cutoff: None, }; let now = DateTime::::from(SystemTime::now()); snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); assert_eq!( metrics, &[ MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( snap.loaded_at.1.into(), now, 0 ), MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), MetricsKey::written_size_since_parent(tenant_id, timeline_id) .at(now, disk_consistent_lsn.0), ] ); } pub(crate) const fn metric_examples_old( tenant_id: TenantId, timeline_id: TimelineId, now: DateTime, before: DateTime, ) -> [RawMetric; 7] { [ MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id) .from_until_old_format(before, now, 0), MetricsKey::written_size_since_parent(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0), MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1), ] } pub(crate) const fn metric_examples( tenant_id: TenantId, timeline_id: TimelineId, now: DateTime, before: DateTime, ) -> [NewRawMetric; 7] { [ MetricsKey::written_size(tenant_id, timeline_id).at(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0), MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 0), 
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0), MetricsKey::remote_storage_size(tenant_id).at(now, 0), MetricsKey::synthetic_size(tenant_id).at(now, 1), ] } ================================================ FILE: pageserver/src/consumption_metrics/metrics.rs ================================================ use std::sync::Arc; use std::time::SystemTime; use chrono::{DateTime, Utc}; use consumption_metrics::EventType; use futures::stream::StreamExt; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use super::{Cache, NewRawMetric}; use crate::context::RequestContext; use crate::tenant::mgr::TenantManager; use crate::tenant::timeline::logical_size::CurrentLogicalSize; /// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events` /// instead of static str. // Do not rename any of these without first consulting with data team and partner // management. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)] pub(super) enum Name { /// Timeline last_record_lsn, absolute. #[serde(rename = "written_size")] WrittenSize, /// Timeline last_record_lsn, incremental #[serde(rename = "written_data_bytes_delta")] WrittenSizeDelta, /// Written bytes only on this timeline (not including ancestors): /// written_size - ancestor_lsn /// /// On the root branch, this is equivalent to `written_size`. #[serde(rename = "written_size_since_parent")] WrittenSizeSinceParent, /// PITR history size only on this timeline (not including ancestors): /// last_record_lsn - max(pitr_cutoff, ancestor_lsn). /// /// On the root branch, this is its entire PITR history size. Not emitted if GC hasn't computed /// the PITR cutoff yet. 0 if PITR is disabled. 
#[serde(rename = "pitr_history_size_since_parent")] PitrHistorySizeSinceParent, /// Timeline logical size #[serde(rename = "timeline_logical_size")] LogicalSize, /// Tenant remote size #[serde(rename = "remote_storage_size")] RemoteSize, /// Tenant synthetic size #[serde(rename = "synthetic_storage_size")] SyntheticSize, } /// Key that uniquely identifies the object this metric describes. /// /// This is a denormalization done at the MetricsKey const methods; these should not be constructed /// elsewhere. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)] pub(crate) struct MetricsKey { pub(super) tenant_id: TenantId, #[serde(skip_serializing_if = "Option::is_none")] pub(super) timeline_id: Option, pub(super) metric: Name, } impl MetricsKey { const fn absolute_values(self) -> AbsoluteValueFactory { AbsoluteValueFactory(self) } const fn incremental_values(self) -> IncrementalValueFactory { IncrementalValueFactory(self) } } /// Helper type which each individual metric kind can return to produce only absolute values. struct AbsoluteValueFactory(MetricsKey); impl AbsoluteValueFactory { #[cfg(test)] const fn at_old_format(self, time: DateTime, val: u64) -> super::RawMetric { let key = self.0; (key, (EventType::Absolute { time }, val)) } const fn at(self, time: DateTime, val: u64) -> NewRawMetric { let key = self.0; NewRawMetric { key, kind: EventType::Absolute { time }, value: val, } } fn key(&self) -> &MetricsKey { &self.0 } } /// Helper type which each individual metric kind can return to produce only incremental values. 
struct IncrementalValueFactory(MetricsKey); impl IncrementalValueFactory { #[allow(clippy::wrong_self_convention)] const fn from_until( self, prev_end: DateTime, up_to: DateTime, val: u64, ) -> NewRawMetric { let key = self.0; // cannot assert prev_end < up_to because these are realtime clock based let when = EventType::Incremental { start_time: prev_end, stop_time: up_to, }; NewRawMetric { key, kind: when, value: val, } } #[allow(clippy::wrong_self_convention)] #[cfg(test)] const fn from_until_old_format( self, prev_end: DateTime, up_to: DateTime, val: u64, ) -> super::RawMetric { let key = self.0; // cannot assert prev_end < up_to because these are realtime clock based let when = EventType::Incremental { start_time: prev_end, stop_time: up_to, }; (key, (when, val)) } fn key(&self) -> &MetricsKey { &self.0 } } // the static part of a MetricsKey impl MetricsKey { /// Absolute value of [`Timeline::get_last_record_lsn`]. /// /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory { MetricsKey { tenant_id, timeline_id: Some(timeline_id), metric: Name::WrittenSize, } .absolute_values() } /// Values will be the difference of the latest [`MetricsKey::written_size`] to what we /// previously sent, starting from the previously sent incremental time range ending at the /// latest absolute measurement. const fn written_size_delta( tenant_id: TenantId, timeline_id: TimelineId, ) -> IncrementalValueFactory { MetricsKey { tenant_id, timeline_id: Some(timeline_id), metric: Name::WrittenSizeDelta, } .incremental_values() } /// `written_size` - `ancestor_lsn`. const fn written_size_since_parent( tenant_id: TenantId, timeline_id: TimelineId, ) -> AbsoluteValueFactory { MetricsKey { tenant_id, timeline_id: Some(timeline_id), metric: Name::WrittenSizeSinceParent, } .absolute_values() } /// `written_size` - max(`pitr_cutoff`, `ancestor_lsn`). 
const fn pitr_history_size_since_parent( tenant_id: TenantId, timeline_id: TimelineId, ) -> AbsoluteValueFactory { MetricsKey { tenant_id, timeline_id: Some(timeline_id), metric: Name::PitrHistorySizeSinceParent, } .absolute_values() } /// Exact [`Timeline::get_current_logical_size`]. /// /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size const fn timeline_logical_size( tenant_id: TenantId, timeline_id: TimelineId, ) -> AbsoluteValueFactory { MetricsKey { tenant_id, timeline_id: Some(timeline_id), metric: Name::LogicalSize, } .absolute_values() } /// [`TenantShard::remote_size`] /// /// [`TenantShard::remote_size`]: crate::tenant::TenantShard::remote_size const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory { MetricsKey { tenant_id, timeline_id: None, metric: Name::RemoteSize, } .absolute_values() } /// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`]. /// /// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size /// [`calculate_synthetic_size_worker`]: super::calculate_synthetic_size_worker const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory { MetricsKey { tenant_id, timeline_id: None, metric: Name::SyntheticSize, } .absolute_values() } } pub(super) async fn collect_all_metrics( tenant_manager: &Arc, cached_metrics: &Cache, ctx: &RequestContext, ) -> Vec { use pageserver_api::models::TenantState; let started_at = std::time::Instant::now(); let tenants = match tenant_manager.list_tenants() { Ok(tenants) => tenants, Err(err) => { tracing::error!("failed to list tenants: {:?}", err); return vec![]; } }; let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move { if state != TenantState::Active || !id.is_shard_zero() { None } else { tenant_manager .get_attached_tenant_shard(id) .ok() .map(|tenant| (id.tenant_id, tenant)) } }); let res = collect(tenants, cached_metrics, ctx).await; 
tracing::info!( elapsed_ms = started_at.elapsed().as_millis(), total = res.len(), "collected metrics" ); res } async fn collect(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec where S: futures::stream::Stream)>, { let mut current_metrics: Vec = Vec::new(); let mut tenants = std::pin::pin!(tenants); while let Some((tenant_id, tenant)) = tenants.next().await { let timelines = tenant.list_timelines(); for timeline in timelines { let timeline_id = timeline.timeline_id; match TimelineSnapshot::collect(&timeline, ctx) { Ok(Some(snap)) => { snap.to_metrics( tenant_id, timeline_id, Utc::now(), &mut current_metrics, cache, ); } Ok(None) => {} Err(e) => { tracing::error!( "failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}", timeline.timeline_id ); continue; } } } let snap = TenantSnapshot::collect(&tenant); snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics); } current_metrics } /// In-between abstraction to allow testing metrics without actual Tenants. struct TenantSnapshot { remote_size: u64, synthetic_size: u64, } impl TenantSnapshot { /// Collect tenant status to have metrics created out of it. 
fn collect(t: &Arc) -> Self { TenantSnapshot { remote_size: t.remote_size(), // Note that this metric is calculated in a separate bgworker // Here we only use cached value, which may lag behind the real latest one synthetic_size: t.cached_synthetic_size(), } } fn to_metrics( &self, tenant_id: TenantId, now: DateTime, cached: &Cache, metrics: &mut Vec, ) { let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size); let synthetic_size = { let factory = MetricsKey::synthetic_size(tenant_id); let mut synthetic_size = self.synthetic_size; if synthetic_size == 0 { if let Some(item) = cached.get(factory.key()) { // use the latest value from previous session, TODO: check generation number synthetic_size = item.value; } } if synthetic_size != 0 { // only send non-zeroes because otherwise these show up as errors in logs Some(factory.at(now, synthetic_size)) } else { None } }; metrics.extend([Some(remote_size), synthetic_size].into_iter().flatten()); } } /// Internal type to make timeline metric production testable. /// /// As this value type contains all of the information needed from a timeline to produce the /// metrics, it can easily be created with different values in test. struct TimelineSnapshot { loaded_at: (Lsn, SystemTime), last_record_lsn: Lsn, ancestor_lsn: Lsn, current_exact_logical_size: Option, /// Whether PITR is enabled (pitr_interval > 0). pitr_enabled: bool, /// The PITR cutoff LSN. None if not yet initialized. If PITR is disabled, this is approximately /// Some(last_record_lsn), but may lag behind it since it's computed periodically. pitr_cutoff: Option, } impl TimelineSnapshot { /// Collect the metrics from an actual timeline. /// /// Fails currently only when [`Timeline::get_current_logical_size`] fails. 
/// /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size fn collect( t: &Arc, ctx: &RequestContext, ) -> anyhow::Result> { if !t.is_active() { // no collection for broken or stopping needed, we will still keep the cached values // though at the caller. Ok(None) } else { let loaded_at = t.loaded_at; let last_record_lsn = t.get_last_record_lsn(); let ancestor_lsn = t.get_ancestor_lsn(); let pitr_enabled = !t.get_pitr_interval().is_zero(); let pitr_cutoff = t.gc_info.read().unwrap().cutoffs.time; let current_exact_logical_size = { let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id); let size = span.in_scope(|| { t.get_current_logical_size( crate::tenant::timeline::GetLogicalSizePriority::Background, ctx, ) }); match size { // Only send timeline logical size when it is fully calculated. CurrentLogicalSize::Exact(ref size) => Some(size.into()), CurrentLogicalSize::Approximate(_) => None, } }; Ok(Some(TimelineSnapshot { loaded_at, last_record_lsn, ancestor_lsn, current_exact_logical_size, pitr_enabled, pitr_cutoff, })) } } /// Produce the timeline consumption metrics into the `metrics` argument. fn to_metrics( &self, tenant_id: TenantId, timeline_id: TimelineId, now: DateTime, metrics: &mut Vec, cache: &Cache, ) { let timeline_written_size = u64::from(self.last_record_lsn); let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id); let last_stop_time = cache.get(written_size_delta_key.key()).map(|item| { item.kind .incremental_timerange() .expect("never create EventType::Absolute for written_size_delta") .end }); let written_size_now = MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size); // by default, use the last sent written_size as the basis for // calculating the delta. if we don't yet have one, use the load time value. 
let prev: (DateTime, u64) = cache .get(&written_size_now.key) .map(|item| { // use the prev time from our last incremental update, or default to latest // absolute update on the first round. let prev_at = item .kind .absolute_time() .expect("never create EventType::Incremental for written_size"); let prev_at = last_stop_time.unwrap_or(prev_at); (*prev_at, item.value) }) .unwrap_or_else(|| { // if we don't have a previous point of comparison, compare to the load time // lsn. let (disk_consistent_lsn, loaded_at) = &self.loaded_at; (DateTime::from(*loaded_at), disk_consistent_lsn.0) }); let up_to = now; let written_size_last = written_size_now.value.max(prev.1); // don't regress if let Some(delta) = written_size_now.value.checked_sub(prev.1) { let key_value = written_size_delta_key.from_until(prev.0, up_to, delta); // written_size_delta metrics.push(key_value); // written_size metrics.push(written_size_now); } else { // the cached value was ahead of us, report zero until we've caught up metrics.push(written_size_delta_key.from_until(prev.0, up_to, 0)); // the cached value was ahead of us, report the same until we've caught up metrics.push(NewRawMetric { key: written_size_now.key, kind: written_size_now.kind, value: prev.1, }); } // Compute the branch-local written size. let written_size_since_parent_key = MetricsKey::written_size_since_parent(tenant_id, timeline_id); metrics.push( written_size_since_parent_key .at(now, written_size_last.saturating_sub(self.ancestor_lsn.0)), ); // Compute the branch-local PITR history size. Not emitted if GC hasn't yet computed the // PITR cutoff. 0 if PITR is disabled. 
let pitr_history_size_since_parent_key = MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id); if !self.pitr_enabled { metrics.push(pitr_history_size_since_parent_key.at(now, 0)); } else if let Some(pitr_cutoff) = self.pitr_cutoff { metrics.push(pitr_history_size_since_parent_key.at( now, written_size_last.saturating_sub(pitr_cutoff.max(self.ancestor_lsn).0), )); } { let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id); let current_or_previous = self .current_exact_logical_size .or_else(|| cache.get(factory.key()).map(|item| item.value)); if let Some(size) = current_or_previous { metrics.push(factory.at(now, size)); } } } } #[cfg(test)] mod tests; #[cfg(test)] pub(crate) use tests::{metric_examples, metric_examples_old}; ================================================ FILE: pageserver/src/consumption_metrics/upload.rs ================================================ use std::error::Error as _; use std::time::SystemTime; use chrono::{DateTime, Utc}; use consumption_metrics::{CHUNK_SIZE, Event, EventChunk, IdempotencyKey}; use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::id::{TenantId, TimelineId}; use super::metrics::Name; use super::{Cache, MetricsKey, NewRawMetric, RawMetric}; /// How the metrics from pageserver are identified. 
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)] struct Ids { pub(super) tenant_id: TenantId, #[serde(skip_serializing_if = "Option::is_none")] pub(super) timeline_id: Option, } /// Serialize and write metrics to an HTTP endpoint #[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))] pub(super) async fn upload_metrics_http( client: &reqwest::Client, metric_collection_endpoint: &reqwest::Url, cancel: &CancellationToken, metrics: &[NewRawMetric], cached_metrics: &mut Cache, idempotency_keys: &[IdempotencyKey<'_>], ) -> anyhow::Result<()> { let mut uploaded = 0; let mut failed = 0; let started_at = std::time::Instant::now(); let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys); while let Some(res) = iter.next() { let (chunk, body) = res?; let event_bytes = body.len(); let is_last = iter.len() == 0; let res = upload(client, metric_collection_endpoint, body, cancel, is_last) .instrument(tracing::info_span!( "upload", %event_bytes, uploaded, total = metrics.len(), )) .await; match res { Ok(()) => { for item in chunk { cached_metrics.insert(item.key, item.clone()); } uploaded += chunk.len(); } Err(_) => { // failure(s) have already been logged // // however this is an inconsistency: if we crash here, we will start with the // values as uploaded. in practice, the rejections no longer happen. 
failed += chunk.len(); } } } let elapsed = started_at.elapsed(); tracing::info!( uploaded, failed, elapsed_ms = elapsed.as_millis(), "done sending metrics" ); Ok(()) } /// Serialize and write metrics to a remote storage object #[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))] pub(super) async fn upload_metrics_bucket( client: &GenericRemoteStorage, cancel: &CancellationToken, node_id: &str, metrics: &[NewRawMetric], idempotency_keys: &[IdempotencyKey<'_>], ) -> anyhow::Result<()> { if metrics.is_empty() { // Skip uploads if we have no metrics, so that readers don't have to handle the edge case // of an empty object. return Ok(()); } // Compose object path let datetime: DateTime = SystemTime::now().into(); let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/hour=%H/%H:%M:%SZ"); let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?; // Set up a gzip writer into a buffer let mut compressed_bytes: Vec = Vec::new(); let compressed_writer = std::io::Cursor::new(&mut compressed_bytes); let mut gzip_writer = async_compression::tokio::write::GzipEncoder::new(compressed_writer); // Serialize and write into compressed buffer let started_at = std::time::Instant::now(); for res in serialize_in_chunks_ndjson(CHUNK_SIZE, metrics, idempotency_keys) { let (_chunk, body) = res?; gzip_writer.write_all(&body).await?; } gzip_writer.flush().await?; gzip_writer.shutdown().await?; let compressed_length = compressed_bytes.len(); // Write to remote storage client .upload_storage_object( futures::stream::once(futures::future::ready(Ok(compressed_bytes.into()))), compressed_length, &path, cancel, ) .await?; let elapsed = started_at.elapsed(); tracing::info!( compressed_length, elapsed_ms = elapsed.as_millis(), "write metrics bucket at {path}", ); Ok(()) } /// Serializes the input metrics as JSON in chunks of chunk_size. 
The provided /// idempotency keys are injected into the corresponding metric events (reused /// across different metrics sinks), and must have the same length as input. fn serialize_in_chunks<'a>( chunk_size: usize, input: &'a [NewRawMetric], idempotency_keys: &'a [IdempotencyKey<'a>], ) -> impl ExactSizeIterator> + 'a { use bytes::BufMut; assert_eq!(input.len(), idempotency_keys.len()); struct Iter<'a> { inner: std::slice::Chunks<'a, NewRawMetric>, idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>, chunk_size: usize, // write to a BytesMut so that we can cheaply clone the frozen Bytes for retries buffer: bytes::BytesMut, // chunk amount of events are reused to produce the serialized document scratch: Vec>, } impl<'a> Iterator for Iter<'a> { type Item = Result<(&'a [NewRawMetric], bytes::Bytes), serde_json::Error>; fn next(&mut self) -> Option { let chunk = self.inner.next()?; if self.scratch.is_empty() { // first round: create events with N strings self.scratch.extend( chunk .iter() .zip(&mut self.idempotency_keys) .map(|(raw_metric, key)| raw_metric.as_event(key)), ); } else { // next rounds: update_in_place to reuse allocations assert_eq!(self.scratch.len(), self.chunk_size); itertools::izip!(self.scratch.iter_mut(), chunk, &mut self.idempotency_keys) .for_each(|(slot, raw_metric, key)| raw_metric.update_in_place(slot, key)); } let res = serde_json::to_writer( (&mut self.buffer).writer(), &EventChunk { events: (&self.scratch[..chunk.len()]).into(), }, ); match res { Ok(()) => Some(Ok((chunk, self.buffer.split().freeze()))), Err(e) => Some(Err(e)), } } fn size_hint(&self) -> (usize, Option) { self.inner.size_hint() } } impl ExactSizeIterator for Iter<'_> {} let buffer = bytes::BytesMut::new(); let inner = input.chunks(chunk_size); let idempotency_keys = idempotency_keys.iter(); let scratch = Vec::new(); Iter { inner, idempotency_keys, chunk_size, buffer, scratch, } } /// Serializes the input metrics as NDJSON in chunks of chunk_size. 
Each event /// is serialized as a separate JSON object on its own line. The provided /// idempotency keys are injected into the corresponding metric events (reused /// across different metrics sinks), and must have the same length as input. fn serialize_in_chunks_ndjson<'a>( chunk_size: usize, input: &'a [NewRawMetric], idempotency_keys: &'a [IdempotencyKey<'a>], ) -> impl ExactSizeIterator> + 'a { use bytes::BufMut; assert_eq!(input.len(), idempotency_keys.len()); struct Iter<'a> { inner: std::slice::Chunks<'a, NewRawMetric>, idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>, chunk_size: usize, // write to a BytesMut so that we can cheaply clone the frozen Bytes for retries buffer: bytes::BytesMut, // chunk amount of events are reused to produce the serialized document scratch: Vec>, } impl<'a> Iterator for Iter<'a> { type Item = Result<(&'a [NewRawMetric], bytes::Bytes), serde_json::Error>; fn next(&mut self) -> Option { let chunk = self.inner.next()?; if self.scratch.is_empty() { // first round: create events with N strings self.scratch.extend( chunk .iter() .zip(&mut self.idempotency_keys) .map(|(raw_metric, key)| raw_metric.as_event(key)), ); } else { // next rounds: update_in_place to reuse allocations assert_eq!(self.scratch.len(), self.chunk_size); itertools::izip!(self.scratch.iter_mut(), chunk, &mut self.idempotency_keys) .for_each(|(slot, raw_metric, key)| raw_metric.update_in_place(slot, key)); } // Serialize each event as NDJSON (one JSON object per line) for event in self.scratch[..chunk.len()].iter() { let res = serde_json::to_writer((&mut self.buffer).writer(), event); if let Err(e) = res { return Some(Err(e)); } // Add newline after each event to follow NDJSON format self.buffer.put_u8(b'\n'); } Some(Ok((chunk, self.buffer.split().freeze()))) } fn size_hint(&self) -> (usize, Option) { self.inner.size_hint() } } impl ExactSizeIterator for Iter<'_> {} let buffer = bytes::BytesMut::new(); let inner = input.chunks(chunk_size); let 
idempotency_keys = idempotency_keys.iter(); let scratch = Vec::new(); Iter { inner, idempotency_keys, chunk_size, buffer, scratch, } } trait RawMetricExt { fn as_event(&self, key: &IdempotencyKey<'_>) -> Event; fn update_in_place(&self, event: &mut Event, key: &IdempotencyKey<'_>); } impl RawMetricExt for RawMetric { fn as_event(&self, key: &IdempotencyKey<'_>) -> Event { let MetricsKey { metric, tenant_id, timeline_id, } = self.0; let (kind, value) = self.1; Event { kind, metric, idempotency_key: key.to_string(), value, extra: Ids { tenant_id, timeline_id, }, } } fn update_in_place(&self, event: &mut Event, key: &IdempotencyKey<'_>) { use std::fmt::Write; let MetricsKey { metric, tenant_id, timeline_id, } = self.0; let (kind, value) = self.1; *event = Event { kind, metric, idempotency_key: { event.idempotency_key.clear(); write!(event.idempotency_key, "{key}").unwrap(); std::mem::take(&mut event.idempotency_key) }, value, extra: Ids { tenant_id, timeline_id, }, }; } } impl RawMetricExt for NewRawMetric { fn as_event(&self, key: &IdempotencyKey<'_>) -> Event { let MetricsKey { metric, tenant_id, timeline_id, } = self.key; let kind = self.kind; let value = self.value; Event { kind, metric, idempotency_key: key.to_string(), value, extra: Ids { tenant_id, timeline_id, }, } } fn update_in_place(&self, event: &mut Event, key: &IdempotencyKey<'_>) { use std::fmt::Write; let MetricsKey { metric, tenant_id, timeline_id, } = self.key; let kind = self.kind; let value = self.value; *event = Event { kind, metric, idempotency_key: { event.idempotency_key.clear(); write!(event.idempotency_key, "{key}").unwrap(); std::mem::take(&mut event.idempotency_key) }, value, extra: Ids { tenant_id, timeline_id, }, }; } } pub(crate) trait KeyGen<'a> { fn generate(&self) -> IdempotencyKey<'a>; } impl<'a> KeyGen<'a> for &'a str { fn generate(&self) -> IdempotencyKey<'a> { IdempotencyKey::generate(self) } } enum UploadError { Rejected(reqwest::StatusCode), Reqwest(reqwest::Error), Cancelled, } 
impl std::fmt::Debug for UploadError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // use same impl because backoff::retry will log this using both
        std::fmt::Display::fmt(self, f)
    }
}

impl std::fmt::Display for UploadError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        use UploadError::*;

        match self {
            Rejected(code) => write!(f, "server rejected the metrics with {code}"),
            // Append the immediate source of the reqwest error, if there is one.
            Reqwest(e) => write!(
                f,
                "request failed: {e}{}",
                e.source().map(|e| format!(": {e}")).unwrap_or_default()
            ),
            Cancelled => write!(f, "cancelled"),
        }
    }
}

impl UploadError {
    /// True for `Rejected`; `upload` passes this to `backoff::retry` as its second argument.
    fn is_reject(&self) -> bool {
        matches!(self, UploadError::Rejected(_))
    }
}

// this is consumed by the test verifiers
static LAST_IN_BATCH: reqwest::header::HeaderName =
    reqwest::header::HeaderName::from_static("pageserver-metrics-last-upload-in-batch");

/// POST one serialized chunk (`body`) to `metric_collection_endpoint`, going through
/// `utils::backoff::retry`.
///
/// A response with a 4xx status is mapped to `UploadError::Rejected`; every other request
/// failure becomes `UploadError::Reqwest`. When `backoff::retry` yields `None` the result is
/// `UploadError::Cancelled`.
///
/// `is_last` only controls the value of the `LAST_IN_BATCH` header.
async fn upload(
    client: &reqwest::Client,
    metric_collection_endpoint: &reqwest::Url,
    body: bytes::Bytes,
    cancel: &CancellationToken,
    is_last: bool,
) -> Result<(), UploadError> {
    let warn_after = 3;
    let max_attempts = 10;

    // this is used only with tests so far
    let last_value = if is_last { "true" } else { "false" };

    let res = utils::backoff::retry(
        || async {
            let res = client
                .post(metric_collection_endpoint.clone())
                .header(reqwest::header::CONTENT_TYPE, "application/json")
                .header(LAST_IN_BATCH.clone(), last_value)
                // `body` is cloned per attempt so the same payload can be re-sent
                .body(body.clone())
                .send()
                .await;

            let res = res.and_then(|res| res.error_for_status());

            // 10 redirects are normally allowed, so we don't need worry about 3xx
            match res {
                Ok(_response) => Ok(()),
                Err(e) => {
                    let status = e.status().filter(|s| s.is_client_error());
                    if let Some(status) = status {
                        // rejection used to be a thing when the server could reject a
                        // whole batch of metrics if one metric was bad.
                        Err(UploadError::Rejected(status))
                    } else {
                        Err(UploadError::Reqwest(e))
                    }
                }
            }
        },
        UploadError::is_reject,
        warn_after,
        max_attempts,
        "upload consumption_metrics",
        cancel,
    )
    .await
    .ok_or_else(|| UploadError::Cancelled)
    .and_then(|x| x);

    match &res {
        Ok(_) => {}
        Err(e) if e.is_reject() => {
            // permanent errors currently do not get logged by backoff::retry
            // display alternate has no effect, but keeping it here for easier pattern matching.
            tracing::error!("failed to upload metrics: {e:#}");
        }
        Err(_) => {
            // these have been logged already
        }
    }

    res
}

// NOTE(review): several generic argument lists in this test module (`collect::>()`,
// `Vec>`, `from_slice::(..)`, `Lazy>`, `chrono::DateTime`) look truncated in this copy of
// the file -- confirm against the original before compiling.
#[cfg(test)]
mod tests {
    use chrono::{DateTime, Utc};
    use once_cell::sync::Lazy;

    use super::*;
    use crate::consumption_metrics::NewMetricsRefRoot;
    use crate::consumption_metrics::disk_cache::read_metrics_from_serde_value;

    /// Serializing with any smaller chunk size must yield the same events, in the same
    /// order, as serializing everything in a single chunk.
    #[test]
    fn chunked_serialization() {
        let examples = metric_samples();
        assert!(examples.len() > 1);

        let now = Utc::now();
        let idempotency_keys = (0..examples.len())
            .map(|i| FixedGen::new(now, "1", i as u16).generate())
            .collect::>();

        // need to use Event here because serde_json::Value uses default hashmap, not linked
        // hashmap
        #[derive(serde::Deserialize)]
        struct EventChunk {
            events: Vec>,
        }

        // Reference result: all examples in one chunk.
        let correct = serialize_in_chunks(examples.len(), &examples, &idempotency_keys)
            .map(|res| res.unwrap().1)
            .flat_map(|body| serde_json::from_slice::(&body).unwrap().events)
            .collect::>();

        for chunk_size in 1..examples.len() {
            let actual = serialize_in_chunks(chunk_size, &examples, &idempotency_keys)
                .map(|res| res.unwrap().1)
                .flat_map(|body| serde_json::from_slice::(&body).unwrap().events)
                .collect::>();

            // if these are equal, it means that multi-chunking version works as well
            assert_eq!(correct, actual);
        }
    }

    /// Same check as `chunked_serialization`, for the NDJSON serializer.
    #[test]
    fn chunked_serialization_ndjson() {
        let examples = metric_samples();
        assert!(examples.len() > 1);

        let now = Utc::now();
        let idempotency_keys = (0..examples.len())
            .map(|i| FixedGen::new(now, "1", i as u16).generate())
            .collect::>();

        // Parse NDJSON format - each line is a separate JSON object
        let parse_ndjson = |body: &[u8]| -> Vec> {
            let body_str = std::str::from_utf8(body).unwrap();
            body_str
                .trim_end_matches('\n')
                .lines()
                .filter(|line| !line.is_empty())
                .map(|line| serde_json::from_str::>(line).unwrap())
                .collect()
        };

        // Reference result: all examples in one chunk.
        let correct = serialize_in_chunks_ndjson(examples.len(), &examples, &idempotency_keys)
            .map(|res| res.unwrap().1)
            .flat_map(|body| parse_ndjson(&body))
            .collect::>();

        for chunk_size in 1..examples.len() {
            let actual = serialize_in_chunks_ndjson(chunk_size, &examples, &idempotency_keys)
                .map(|res| res.unwrap().1)
                .flat_map(|body| parse_ndjson(&body))
                .collect::>();

            // if these are equal, it means that multi-chunking version works as well
            assert_eq!(correct, actual);
        }
    }

    /// Key generator with fixed inputs (timestamp, node id, nonce) for reproducible tests.
    #[derive(Clone, Copy)]
    struct FixedGen<'a>(chrono::DateTime, &'a str, u16);

    impl<'a> FixedGen<'a> {
        fn new(now: chrono::DateTime, node_id: &'a str, nonce: u16) -> Self {
            FixedGen(now, node_id, nonce)
        }
    }

    impl<'a> KeyGen<'a> for FixedGen<'a> {
        fn generate(&self) -> IdempotencyKey<'a> {
            IdempotencyKey::for_tests(self.0, self.1, self.2)
        }
    }

    // Fixed "now" shared by the sample metrics and the golden strings below.
    static SAMPLES_NOW: Lazy> = Lazy::new(|| {
        DateTime::parse_from_rfc3339("2023-09-15T00:00:00.123456789Z")
            .unwrap()
            .into()
    });

    /// Pins the exact JSON produced for each sample metric.
    #[test]
    fn metric_image_stability() {
        // it is important that these strings stay as they are

        let examples = [
            (
                line!(),
                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"written_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
            ),
            (
                line!(),
                r#"{"type":"incremental","start_time":"2023-09-14T00:00:00.123456789Z","stop_time":"2023-09-15T00:00:00.123456789Z","metric":"written_data_bytes_delta","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
            ),
            (
                line!(),
                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"written_size_since_parent","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
            ),
            (
                line!(),
                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"pitr_history_size_since_parent","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
            ),
            (
                line!(),
                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"timeline_logical_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
            ),
            (
                line!(),
                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
            ),
            (
                line!(),
                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#,
            ),
        ];

        let idempotency_key = consumption_metrics::IdempotencyKey::for_tests(*SAMPLES_NOW, "1", 0);
        // Pair each golden string with the sample metric at the same index.
        let examples = examples.into_iter().zip(metric_samples());

        for ((line, expected), item) in examples {
            let e = consumption_metrics::Event {
                kind: item.kind,
                metric: item.key.metric,
                idempotency_key: idempotency_key.to_string(),
                value: item.value,
                extra: Ids {
                    tenant_id: item.key.tenant_id,
                    timeline_id: item.key.timeline_id,
                },
            };
            let actual = serde_json::to_string(&e).unwrap();
            assert_eq!(
                expected, actual,
                "example for {:?} from line {line}",
                item.kind
            );
        }
    }

    /// Old-format and new-format samples must read back as equal metrics.
    #[test]
    fn disk_format_upgrade() {
        let old_samples_json = serde_json::to_value(metric_samples_old()).unwrap();
        let new_samples =
            serde_json::to_value(NewMetricsRefRoot::new(metric_samples().as_ref())).unwrap();
        let upgraded_samples = read_metrics_from_serde_value(old_samples_json).unwrap();
        let new_samples = read_metrics_from_serde_value(new_samples).unwrap();
        assert_eq!(upgraded_samples, new_samples);
    }

    // Sample metrics in the old tuple representation, built from fixed ids and timestamps.
    fn metric_samples_old() -> [RawMetric; 7] {
        let tenant_id = TenantId::from_array([0; 16]);
        let timeline_id = TimelineId::from_array([0xff; 16]);

        let before = DateTime::parse_from_rfc3339("2023-09-14T00:00:00.123456789Z")
            .unwrap()
            .into();
        let [now, before] = [*SAMPLES_NOW, before];

        super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before)
    }

    // Sample metrics in the new struct representation, built from the same fixed inputs.
    fn metric_samples() -> [NewRawMetric; 7] {
        let tenant_id = TenantId::from_array([0; 16]);
        let timeline_id = TimelineId::from_array([0xff; 16]);

        let before = DateTime::parse_from_rfc3339("2023-09-14T00:00:00.123456789Z")
            .unwrap()
            .into();
        let [now, before] = [*SAMPLES_NOW, before];

        super::super::metrics::metric_examples(tenant_id, timeline_id, now, before)
    }
}

================================================
FILE: pageserver/src/consumption_metrics.rs
================================================
//! Periodically collect consumption metrics for all active tenants
//! and push them to a HTTP endpoint.
use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, SystemTime}; use camino::Utf8PathBuf; use consumption_metrics::EventType; use itertools::Itertools as _; use pageserver_api::models::TenantState; use remote_storage::{GenericRemoteStorage, RemoteStorageConfig}; use reqwest::Url; use serde::{Deserialize, Serialize}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::NodeId; use crate::config::PageServerConf; use crate::consumption_metrics::metrics::MetricsKey; use crate::consumption_metrics::upload::KeyGen as _; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; use crate::tenant::mgr::TenantManager; use crate::tenant::size::CalculateSyntheticSizeError; use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::{LogicalSizeCalculationCause, TenantShard}; mod disk_cache; mod metrics; mod upload; const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); /// Basically a key-value pair, but usually in a Vec except for [`Cache`]. /// /// This is as opposed to `consumption_metrics::Event` which is the externally communicated form. /// Difference is basically the missing idempotency key, which lives only for the duration of /// upload attempts. 
type RawMetric = (MetricsKey, (EventType, u64)); /// The new serializable metrics format #[derive(Serialize, Deserialize)] struct NewMetricsRoot { version: usize, metrics: Vec, } impl NewMetricsRoot { pub fn is_v2_metrics(json_value: &serde_json::Value) -> bool { if let Some(ver) = json_value.get("version") { if let Some(2) = ver.as_u64() { return true; } } false } } /// The new serializable metrics format #[derive(Serialize)] struct NewMetricsRefRoot<'a> { version: usize, metrics: &'a [NewRawMetric], } impl<'a> NewMetricsRefRoot<'a> { fn new(metrics: &'a [NewRawMetric]) -> Self { Self { version: 2, metrics, } } } /// The new serializable metrics format #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] struct NewRawMetric { key: MetricsKey, kind: EventType, value: u64, // TODO: add generation field and check against generations } impl NewRawMetric { #[cfg(test)] fn to_kv_pair(&self) -> (MetricsKey, NewRawMetric) { (self.key, self.clone()) } } /// Caches the [`RawMetric`]s /// /// In practice, during startup, last sent values are stored here to be used in calculating new /// ones. After successful uploading, the cached values are updated to cache. This used to be used /// for deduplication, but that is no longer needed. type Cache = HashMap; pub async fn run( conf: &'static PageServerConf, tenant_manager: Arc, cancel: CancellationToken, ) { let Some(metric_collection_endpoint) = conf.metric_collection_endpoint.as_ref() else { return; }; let local_disk_storage = conf.workdir.join("last_consumption_metrics.json"); let metrics_ctx = RequestContext::todo_child( TaskKind::MetricsCollection, // This task itself shouldn't download anything. // The actual size calculation does need downloads, and // creates a child context with the right DownloadBehavior. 
DownloadBehavior::Error, ); let collect_metrics = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "consumption metrics collection", collect_metrics( tenant_manager.clone(), metric_collection_endpoint, &conf.metric_collection_bucket, conf.metric_collection_interval, conf.id, local_disk_storage, cancel.clone(), metrics_ctx, ) .instrument(info_span!("metrics_collection")), )); let worker_ctx = RequestContext::todo_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); let synthetic_size_worker = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "synthetic size calculation", calculate_synthetic_size_worker( tenant_manager.clone(), conf.synthetic_size_calculation_interval, cancel.clone(), worker_ctx, ) .instrument(info_span!("synthetic_size_worker")), )); let (collect_metrics, synthetic_size_worker) = futures::future::join(collect_metrics, synthetic_size_worker).await; collect_metrics .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process"); synthetic_size_worker .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process"); } /// Main thread that serves metrics collection #[allow(clippy::too_many_arguments)] async fn collect_metrics( tenant_manager: Arc, metric_collection_endpoint: &Url, metric_collection_bucket: &Option, metric_collection_interval: Duration, node_id: NodeId, local_disk_storage: Utf8PathBuf, cancel: CancellationToken, ctx: RequestContext, ) -> anyhow::Result<()> { let path: Arc = Arc::new(local_disk_storage); let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval); let mut cached_metrics = tokio::select! 
{ _ = cancel.cancelled() => return Ok(()), ret = restore_and_reschedule => ret, }; // define client here to reuse it for all requests let client = reqwest::ClientBuilder::new() .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT) .build() .expect("Failed to create http client with timeout"); let bucket_client = if let Some(bucket_config) = metric_collection_bucket { match GenericRemoteStorage::from_config(bucket_config).await { Ok(client) => Some(client), Err(e) => { // Non-fatal error: if we were given an invalid config, we will proceed // with sending metrics over the network, but not to S3. tracing::warn!("Invalid configuration for metric_collection_bucket: {e}"); None } } } else { None }; let node_id = node_id.to_string(); loop { let started_at = Instant::now(); // these are point in time, with variable "now" let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await; // Pre-generate event idempotency keys, to reuse them across the bucket // and HTTP sinks. let idempotency_keys = std::iter::repeat_with(|| node_id.as_str().generate()) .take(metrics.len()) .collect_vec(); let metrics = Arc::new(metrics); // why not race cancellation here? because we are one of the last tasks, and if we are // already here, better to try to flush the new values. 
let flush = async { match disk_cache::flush_metrics_to_disk(&metrics, &path).await { Ok(()) => { tracing::debug!("flushed metrics to disk"); } Err(e) => { // idea here is that if someone creates a directory as our path, then they // might notice it from the logs before shutdown and remove it tracing::error!("failed to persist metrics to {path:?}: {e:#}"); } } if let Some(bucket_client) = &bucket_client { let res = upload::upload_metrics_bucket( bucket_client, &cancel, &node_id, &metrics, &idempotency_keys, ) .await; if let Err(e) = res { tracing::error!("failed to upload to remote storage: {e:#}"); } } }; let upload = async { let res = upload::upload_metrics_http( &client, metric_collection_endpoint, &cancel, &metrics, &mut cached_metrics, &idempotency_keys, ) .await; if let Err(e) = res { // serialization error which should never happen tracing::error!("failed to upload via HTTP due to {e:#}"); } }; // let these run concurrently let (_, _) = tokio::join!(flush, upload); crate::tenant::tasks::warn_when_period_overrun( started_at.elapsed(), metric_collection_interval, BackgroundLoopKind::ConsumptionMetricsCollectMetrics, ); let res = tokio::time::timeout_at(started_at + metric_collection_interval, cancel.cancelled()) .await; if res.is_ok() { return Ok(()); } } } /// Called on the first iteration in an attempt to join the metric uploading schedule from previous /// pageserver session. Pageserver is supposed to upload at intervals regardless of restarts. /// /// Cancellation safe. 
async fn restore_and_reschedule( path: &Arc, metric_collection_interval: Duration, ) -> Cache { let (cached, earlier_metric_at) = match disk_cache::read_metrics_from_disk(path.clone()).await { Ok(found_some) => { // there is no min needed because we write these sequentially in // collect_all_metrics let earlier_metric_at = found_some .iter() .map(|item| item.kind.recorded_at()) .copied() .next(); let cached = found_some .into_iter() .map(|item| (item.key, item)) .collect::(); (cached, earlier_metric_at) } Err(e) => { use std::io::{Error, ErrorKind}; let root = e.root_cause(); let maybe_ioerr = root.downcast_ref::(); let is_not_found = maybe_ioerr.is_some_and(|e| e.kind() == ErrorKind::NotFound); if !is_not_found { tracing::info!("failed to read any previous metrics from {path:?}: {e:#}"); } (HashMap::new(), None) } }; if let Some(earlier_metric_at) = earlier_metric_at { let earlier_metric_at: SystemTime = earlier_metric_at.into(); let error = reschedule(earlier_metric_at, metric_collection_interval).await; if let Some(error) = error { if error.as_secs() >= 60 { tracing::info!( error_ms = error.as_millis(), "startup scheduling error due to restart" ) } } } cached } async fn reschedule( earlier_metric_at: SystemTime, metric_collection_interval: Duration, ) -> Option { let now = SystemTime::now(); match now.duration_since(earlier_metric_at) { Ok(from_last_send) if from_last_send < metric_collection_interval => { let sleep_for = metric_collection_interval - from_last_send; let deadline = std::time::Instant::now() + sleep_for; tokio::time::sleep_until(deadline.into()).await; let now = std::time::Instant::now(); // executor threads might be busy, add extra measurements Some(if now < deadline { deadline - now } else { now - deadline }) } Ok(from_last_send) => Some(from_last_send.saturating_sub(metric_collection_interval)), Err(_) => { tracing::warn!( ?now, ?earlier_metric_at, "oldest recorded metric is in future; first values will come out with inconsistent timestamps" ); 
earlier_metric_at.duration_since(now).ok() } } } /// Caclculate synthetic size for each active tenant async fn calculate_synthetic_size_worker( tenant_manager: Arc, synthetic_size_calculation_interval: Duration, cancel: CancellationToken, ctx: RequestContext, ) -> anyhow::Result<()> { info!("starting calculate_synthetic_size_worker"); scopeguard::defer! { info!("calculate_synthetic_size_worker stopped"); }; loop { let started_at = Instant::now(); let tenants = match tenant_manager.list_tenants() { Ok(tenants) => tenants, Err(e) => { warn!("cannot get tenant list: {e:#}"); continue; } }; for (tenant_shard_id, tenant_state, _gen) in tenants { if tenant_state != TenantState::Active { continue; } if !tenant_shard_id.is_shard_zero() { // We only send consumption metrics from shard 0, so don't waste time calculating // synthetic size on other shards. continue; } let Ok(tenant) = tenant_manager.get_attached_tenant_shard(tenant_shard_id) else { continue; }; if !tenant.is_active() { continue; } // there is never any reason to exit calculate_synthetic_size_worker following any // return value -- we don't need to care about shutdown because no tenant is found when // pageserver is shut down. calculate_and_log(&tenant, &cancel, &ctx).await; } crate::tenant::tasks::warn_when_period_overrun( started_at.elapsed(), synthetic_size_calculation_interval, BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker, ); let res = tokio::time::timeout_at( started_at + synthetic_size_calculation_interval, cancel.cancelled(), ) .await; if res.is_ok() { return Ok(()); } } } async fn calculate_and_log(tenant: &TenantShard, cancel: &CancellationToken, ctx: &RequestContext) { const CAUSE: LogicalSizeCalculationCause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize; // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks? // We can put in some prioritization for consumption metrics. // Same for the loop that fetches computed metrics. 
// By using the same limiter, we centralize metrics collection for "start" and "finished" counters, // which turns out is really handy to understand the system. match tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await { Ok(_) => {} Err(CalculateSyntheticSizeError::Cancelled) => {} Err(e) => { let tenant_shard_id = tenant.tenant_shard_id(); error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"); } } } ================================================ FILE: pageserver/src/context.rs ================================================ //! Defines [`RequestContext`]. //! //! It is a structure that we use throughout the pageserver to propagate //! high-level context from places that _originate_ activity down to the //! shared code paths at the heart of the pageserver. It's inspired by //! Golang's `context.Context`. //! //! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions: //! 1. What high-level activity ([`TaskKind`]) needs this page? //! We need that information as a categorical dimension for page access //! statistics, which we, in turn, need to guide layer eviction policy design. //! 2. How should we behave if, to produce the page image, we need to //! on-demand download a layer file ([`DownloadBehavior`]). //! //! [`RequestContext`] satisfies those needs. //! The current implementation is a small `struct` that is passed through //! the call chain by reference. //! //! ### Future Work //! //! However, we do not intend to stop here, since there are other needs that //! require carrying information from high to low levels of the app. //! //! Most importantly, **cancellation signaling** in response to //! 1. timeouts (page_service max response time) and //! 2. lifecycle requests (detach tenant, delete timeline). //! //! Related to that, there is sometimes a need to ensure that all tokio tasks spawned //! by the transitive callees of a request have finished. The keyword here //! 
is **Structured Concurrency**, and right now, we use `task_mgr` in most places, //! `TaskHandle` in some places, and careful code review around `FuturesUnordered` //! or `JoinSet` in other places. //! //! We do not yet have a systematic cancellation story in pageserver, and it is //! pretty clear that [`RequestContext`] will be responsible for that. //! So, the API already prepares for this role through the //! [`RequestContext::detached_child`] and [`RequestContext::attached_child`] methods. //! See their doc comments for details on how we will use them in the future. //! //! It is not clear whether or how we will enforce Structured Concurrency, and //! what role [`RequestContext`] will play there. //! So, the API doesn't prepare us for this topic. //! //! Other future uses of `RequestContext`: //! - Communicate compute & IO priorities (user-initiated request vs. background-loop) //! - Request IDs for distributed tracing //! - Request/Timeline/Tenant-scoped log levels //! //! RequestContext might look quite different once it supports those features. //! Likely, it will have a shape similar to Golang's `context.Context`. //! //! ### Why A Struct Instead Of Method Parameters //! //! What's typical about such information is that it needs to be passed down //! along the call chain from high level to low level, but few of the functions //! in the middle need to understand it. //! Further, it is to be expected that we will need to propagate more data //! in the future (see the earlier section on future work). //! Hence, for functions in the middle of the call chain, we have the following //! requirements: //! 1. It should be easy to forward the context to callees. //! 2. To propagate more data from high-level to low-level code, the functions in //! the middle should not need to be modified. //! //! The solution is to have a container structure ([`RequestContext`]) that //! carries the information. Functions that don't care about what's in it //! 
pass it along to callees. //! //! ### Why Not Task-Local Variables //! //! One could use task-local variables (the equivalent of thread-local variables) //! to address the immediate needs outlined above. //! However, we reject task-local variables because: //! 1. they are implicit, thereby making it harder to trace the data flow in code //! reviews and during debugging, //! 2. they can be mutable, which enables implicit return data flow, //! 3. they are restrictive in that code which fans out into multiple tasks, //! or even threads, needs to carefully propagate the state. //! //! In contrast, information flow with [`RequestContext`] is //! 1. always explicit, //! 2. strictly uni-directional because RequestContext is immutable, //! 3. tangible because a [`RequestContext`] is just a value. //! When creating child activities, regardless of whether it's a task, //! thread, or even an RPC to another service, the value can //! be used like any other argument. //! //! The solution is that all code paths are infected with precisely one //! [`RequestContext`] argument. Functions in the middle of the call chain //! only need to pass it on. use std::{sync::Arc, time::Duration}; use once_cell::sync::Lazy; use tracing::warn; use utils::{id::TimelineId, shard::TenantShardId}; use crate::{ metrics::{StorageIoSizeMetrics, TimelineMetrics}, task_mgr::TaskKind, tenant::Timeline, }; use futures::FutureExt; use futures::future::BoxFuture; use std::future::Future; use tracing_utils::perf_span::{PerfInstrument, PerfSpan}; use tracing::{Dispatch, Span}; // The main structure of this module, see module-level comment. 
/// Context value that is passed explicitly down the call chain; see the
/// module-level comment.
///
/// All fields are private. Values are created through [`RequestContextBuilder`]
/// or the child-creating methods of this type, and read through accessor methods.
pub struct RequestContext {
    /// The kind of task this context was created for.
    task_kind: TaskKind,
    /// Desired behavior if an operation requires an on-demand download.
    download_behavior: DownloadBehavior,
    /// Whether accesses made with this context update layer access times.
    access_stats_behavior: AccessStatsBehavior,
    /// The kind of access to the page cache.
    page_content_kind: PageContentKind,
    /// Flag configured through `RequestContextBuilder::read_path_debug`.
    read_path_debug: bool,
    /// Selects the metrics objects that work done under this context reports to.
    scope: Scope,
    /// Performance span of this context; `None` unless the builder created one.
    perf_span: Option<PerfSpan>,
    /// Dispatcher under which performance spans are created.
    perf_span_dispatch: Option<Dispatch>,
}

/// The scope a [`RequestContext`] operates in. Each variant carries (a reference to)
/// the metrics that are handed out by `RequestContext::io_size_metrics`.
#[derive(Clone)]
pub(crate) enum Scope {
    Global {
        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
    },
    SecondaryTenant {
        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
    },
    SecondaryTimeline {
        io_size_metrics: crate::metrics::StorageIoSizeMetrics,
    },
    Timeline {
        // We wrap the `Arc<TimelineMetrics>`s inside another Arc to avoid child
        // context creation contending for the ref counters of the Arc<TimelineMetrics>,
        // which are shared among all tasks that operate on the timeline, especially
        // concurrent page_service connections.
        #[allow(clippy::redundant_allocation)]
        arc_arc: Arc<Arc<TimelineMetrics>>,
    },
    #[cfg(test)]
    UnitTest {
        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
    },
    DebugTools {
        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
    },
}

/// Lazily-initialized metrics instance labelled `("*", "*", "*")`; shared by every
/// scope that has no metrics of its own.
static GLOBAL_IO_SIZE_METRICS: Lazy<crate::metrics::StorageIoSizeMetrics> =
    Lazy::new(|| crate::metrics::StorageIoSizeMetrics::new("*", "*", "*"));

impl Scope {
    /// Scope backed by [`GLOBAL_IO_SIZE_METRICS`].
    pub(crate) fn new_global() -> Self {
        Scope::Global {
            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
        }
    }

    /// NB: this allocates, so, use only at relatively long-lived roots, e.g., at start
    /// of a compaction iteration.
    pub(crate) fn new_timeline(timeline: &Timeline) -> Self {
        Scope::Timeline {
            arc_arc: Arc::new(Arc::clone(&timeline.metrics)),
        }
    }

    /// Timeline scope that clones the (already double-wrapped) `metrics` of the
    /// handle instead of allocating a new outer `Arc`.
    pub(crate) fn new_page_service_pagestream(
        timeline_handle: &crate::tenant::timeline::handle::Handle<
            crate::page_service::TenantManagerTypes,
        >,
    ) -> Self {
        Scope::Timeline {
            arc_arc: Arc::clone(&timeline_handle.metrics),
        }
    }

    /// Scope for a secondary timeline; constructs a fresh `StorageIoSizeMetrics`
    /// from the tenant id, shard slug and timeline id on every call.
    pub(crate) fn new_secondary_timeline(
        tenant_shard_id: &TenantShardId,
        timeline_id: &TimelineId,
    ) -> Self {
        // TODO(https://github.com/neondatabase/neon/issues/11156): secondary timelines have no infrastructure for metrics lifecycle.

        let tenant_id = tenant_shard_id.tenant_id.to_string();
        let shard_id = tenant_shard_id.shard_slug().to_string();
        let timeline_id = timeline_id.to_string();

        let io_size_metrics =
            crate::metrics::StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
        Scope::SecondaryTimeline { io_size_metrics }
    }

    /// Scope for secondary-tenant level work; the tenant shard id is currently unused.
    pub(crate) fn new_secondary_tenant(_tenant_shard_id: &TenantShardId) -> Self {
        // Before propagating metrics via RequestContext, the labels were inferred from file path.
        // The only user of VirtualFile at tenant scope is the heatmap download & read.
        // The inferred labels for the path of the heatmap file on local disk were that of the global metric (*,*,*).
        // Thus, we do the same here, and extend that for anything secondary-tenant scoped.
        //
        // If we want to have (tenant_id, shard_id, '*') labels for secondary tenants in the future,
        // we will need to think about the metric lifecycle, i.e., remove them during secondary tenant shutdown,
        // like we do for attached timelines. (We don't have attached-tenant-scoped usage of VirtualFile
        // at this point, so, we were able to completely side-step tenant-scoped stuff there).
        Scope::SecondaryTenant {
            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
        }
    }

    /// Scope for unit tests, backed by [`GLOBAL_IO_SIZE_METRICS`].
    #[cfg(test)]
    pub(crate) fn new_unit_test() -> Self {
        Scope::UnitTest {
            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
        }
    }

    /// Scope for debug tools, backed by [`GLOBAL_IO_SIZE_METRICS`].
    pub(crate) fn new_debug_tools() -> Self {
        Scope::DebugTools {
            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
        }
    }
}

/// The kind of access to the page cache.
#[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
pub enum PageContentKind {
    Unknown,
    DeltaLayerSummary,
    DeltaLayerBtreeNode,
    DeltaLayerValue,
    ImageLayerSummary,
    ImageLayerBtreeNode,
    ImageLayerValue,
    InMemoryLayer,
}

/// Desired behavior if the operation requires an on-demand download
/// to proceed.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum DownloadBehavior {
    /// Download the layer file. It can take a while.
    Download,

    /// Download the layer file, but print a warning to the log. This should be used
    /// in code where the layer file is expected to already exist locally.
    Warn,

    /// Return a PageReconstructError::NeedsDownload error
    Error,
}

/// Whether this request should update access times used in LRU eviction
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub(crate) enum AccessStatsBehavior {
    /// Update access times: this request's access to data should be taken
    /// as a hint that the accessed layer is likely to be accessed again
    Update,

    /// Do not update access times: this request is accessing the layer
    /// but does not want to indicate that the layer should be retained in cache,
    /// perhaps because the requestor is a compaction routine that will soon cover
    /// this layer with another.
    Skip,
}

/// Builder for [`RequestContext`]: holds a context by value, lets callers
/// override individual settings, and hands the context out at the end.
pub struct RequestContextBuilder {
    inner: RequestContext,
}

impl RequestContextBuilder {
    /// A new builder with default settings
    ///
    /// Defaults: `DownloadBehavior::Download`, `AccessStatsBehavior::Update`,
    /// `PageContentKind::Unknown`, `read_path_debug = false`, global scope,
    /// no perf span and no perf span dispatcher.
    pub fn new(task_kind: TaskKind) -> Self {
        Self {
            inner: RequestContext {
                task_kind,
                download_behavior: DownloadBehavior::Download,
                access_stats_behavior: AccessStatsBehavior::Update,
                page_content_kind: PageContentKind::Unknown,
                read_path_debug: false,
                scope: Scope::new_global(),
                perf_span: None,
                perf_span_dispatch: None,
            },
        }
    }

    /// A builder that starts out as a copy of `original`.
    pub fn from(original: &RequestContext) -> Self {
        Self {
            inner: original.clone(),
        }
    }

    /// Override the task kind of the context.
    pub fn task_kind(mut self, k: TaskKind) -> Self {
        self.inner.task_kind = k;
        self
    }

    /// Configure the DownloadBehavior of the context: whether to
    /// download missing layers, and/or warn on the download.
    pub fn download_behavior(mut self, b: DownloadBehavior) -> Self {
        self.inner.download_behavior = b;
        self
    }

    /// Configure the AccessStatsBehavior of the context: whether layer
    /// accesses should update the access time of the layer.
pub(crate) fn access_stats_behavior(mut self, b: AccessStatsBehavior) -> Self {
        self.inner.access_stats_behavior = b;
        self
    }

    /// Configure the page content kind recorded in the context.
    pub(crate) fn page_content_kind(mut self, k: PageContentKind) -> Self {
        self.inner.page_content_kind = k;
        self
    }

    /// Configure the `read_path_debug` flag of the context.
    pub(crate) fn read_path_debug(mut self, b: bool) -> Self {
        self.inner.read_path_debug = b;
        self
    }

    /// Replace the scope of the context.
    pub(crate) fn scope(mut self, s: Scope) -> Self {
        self.inner.scope = s;
        self
    }

    /// Set (or clear) the dispatcher under which perf spans are created.
    pub(crate) fn perf_span_dispatch(mut self, dispatch: Option<Dispatch>) -> Self {
        self.inner.perf_span_dispatch = dispatch;
        self
    }

    /// Create the root perf span of this context.
    ///
    /// `make_span` is invoked with the perf span dispatcher installed as the
    /// default dispatcher.
    ///
    /// # Panics
    ///
    /// Panics if the context already has a perf span, or if no perf span
    /// dispatcher has been configured.
    pub fn root_perf_span<Fn>(mut self, make_span: Fn) -> Self
    where
        Fn: FnOnce() -> Span,
    {
        assert!(self.inner.perf_span.is_none());
        assert!(self.inner.perf_span_dispatch.is_some());

        let dispatcher = self.inner.perf_span_dispatch.as_ref().unwrap();
        let new_span = tracing::dispatcher::with_default(dispatcher, make_span);

        self.inner.perf_span = Some(PerfSpan::new(new_span, dispatcher.clone()));
        self
    }

    /// Replace the perf span of this context with one derived from the current span.
    ///
    /// `make_span` receives the current span and runs with the perf span
    /// dispatcher installed as the default dispatcher. This is a no-op when the
    /// context has no perf span.
    ///
    /// # Panics
    ///
    /// Panics if a perf span exists but no perf span dispatcher is configured.
    pub fn perf_span<Fn>(mut self, make_span: Fn) -> Self
    where
        Fn: FnOnce(&Span) -> Span,
    {
        if let Some(ref perf_span) = self.inner.perf_span {
            assert!(self.inner.perf_span_dispatch.is_some());
            let dispatcher = self.inner.perf_span_dispatch.as_ref().unwrap();

            let new_span =
                tracing::dispatcher::with_default(dispatcher, || make_span(perf_span.inner()));

            self.inner.perf_span = Some(PerfSpan::new(new_span, dispatcher.clone()));
        }

        self
    }

    // The three finishers below currently all just hand out the built context.

    /// Finish building a context that has no parent.
    pub fn root(self) -> RequestContext {
        self.inner
    }

    /// Finish building an attached child context.
    pub fn attached_child(self) -> RequestContext {
        self.inner
    }

    /// Finish building a detached child context.
    pub fn detached_child(self) -> RequestContext {
        self.inner
    }
}

impl RequestContext {
    /// Private clone implementation
    ///
    /// Callers should use the [`RequestContextBuilder`] or child spawning APIs of
    /// [`RequestContext`].
fn clone(&self) -> Self {
        // Field-by-field copy. `scope`, `perf_span` and `perf_span_dispatch` go
        // through their own `clone()`; the remaining fields are copied by value.
        Self {
            task_kind: self.task_kind,
            download_behavior: self.download_behavior,
            access_stats_behavior: self.access_stats_behavior,
            page_content_kind: self.page_content_kind,
            read_path_debug: self.read_path_debug,
            scope: self.scope.clone(),
            perf_span: self.perf_span.clone(),
            perf_span_dispatch: self.perf_span_dispatch.clone(),
        }
    }

    /// Create a new RequestContext that has no parent.
    ///
    /// The function is called `new` because, once we add children
    /// to it using `detached_child` or `attached_child`, the contexts
    /// form a tree (not implemented yet since cancellation will be
    /// the first feature that requires a tree).
    ///
    /// All settings other than `task_kind` and `download_behavior` take the
    /// defaults of `RequestContextBuilder::new`.
    ///
    /// # Future: Cancellation
    ///
    /// The only reason why a context like this one can be canceled is
    /// because someone explicitly canceled it.
    /// It has no parent, so it cannot inherit cancellation from there.
    pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
        RequestContextBuilder::new(task_kind)
            .download_behavior(download_behavior)
            .root()
    }

    /// Create a detached child context for a task that may outlive `self`.
    ///
    /// Use this when spawning new background activity that should complete
    /// even if the current request is canceled.
    ///
    /// The child starts as a copy of `self` with `task_kind` and
    /// `download_behavior` overridden.
    ///
    /// # Future: Cancellation
    ///
    /// Cancellation of `self` will not propagate to the child context returned
    /// by this method.
    ///
    /// # Future: Structured Concurrency
    ///
    /// We could add the Future as a parameter to this function, spawn it as a task,
    /// and pass to the new task the child context as an argument.
    /// That would be an ergonomic improvement.
    ///
    /// We could make new calls to this function fail if `self` is already canceled.
    pub fn detached_child(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
        RequestContextBuilder::from(self)
            .task_kind(task_kind)
            .download_behavior(download_behavior)
            .detached_child()
    }

    /// Create a child of context `self` for a task that shall not outlive `self`.
///
    /// Use this when fanning-out work to other async tasks.
    ///
    /// # Future: Cancellation
    ///
    /// Cancelling a context will propagate to its attached children.
    ///
    /// # Future: Structured Concurrency
    ///
    /// We could add the Future as a parameter to this function, spawn it as a task,
    /// and track its `JoinHandle` inside the `RequestContext`.
    ///
    /// We could then provide another method to allow waiting for all child tasks
    /// to finish.
    ///
    /// We could make new calls to this function fail if `self` is already canceled.
    /// Alternatively, we could allow the creation but not spawn the task.
    /// The method to wait for child tasks would return an error, indicating
    /// that the child task was not started because the context was canceled.
    pub fn attached_child(&self) -> Self {
        // The child is built from a copy of `self` with no settings overridden.
        RequestContextBuilder::from(self).attached_child()
    }

    /// Use this function when you should be creating a child context using
    /// [`attached_child`] or [`detached_child`], but your caller doesn't provide
    /// a context and you are unwilling to change all callers to provide one.
    ///
    /// Before we add cancellation, we should get rid of this method.
///
    /// [`attached_child`]: Self::attached_child
    /// [`detached_child`]: Self::detached_child
    pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
        // There is no parent to derive from, so this is simply a new root context.
        Self::new(task_kind, download_behavior)
    }

    /// Child context scoped to `timeline`. Allocates; see `Scope::new_timeline`.
    pub fn with_scope_timeline(&self, timeline: &Arc<Timeline>) -> Self {
        RequestContextBuilder::from(self)
            .scope(Scope::new_timeline(timeline))
            .attached_child()
    }

    /// Child context scoped to the timeline behind a page_service timeline handle.
    pub(crate) fn with_scope_page_service_pagestream(
        &self,
        timeline_handle: &crate::tenant::timeline::handle::Handle<
            crate::page_service::TenantManagerTypes,
        >,
    ) -> Self {
        RequestContextBuilder::from(self)
            .scope(Scope::new_page_service_pagestream(timeline_handle))
            .attached_child()
    }

    /// Child context scoped to a secondary timeline.
    pub fn with_scope_secondary_timeline(
        &self,
        tenant_shard_id: &TenantShardId,
        timeline_id: &TimelineId,
    ) -> Self {
        RequestContextBuilder::from(self)
            .scope(Scope::new_secondary_timeline(tenant_shard_id, timeline_id))
            .attached_child()
    }

    /// Child context scoped to a secondary tenant.
    pub fn with_scope_secondary_tenant(&self, tenant_shard_id: &TenantShardId) -> Self {
        RequestContextBuilder::from(self)
            .scope(Scope::new_secondary_tenant(tenant_shard_id))
            .attached_child()
    }

    /// Child context for unit tests; also sets the task kind to `TaskKind::UnitTest`.
    #[cfg(test)]
    pub fn with_scope_unit_test(&self) -> Self {
        RequestContextBuilder::from(self)
            .task_kind(TaskKind::UnitTest)
            .scope(Scope::new_unit_test())
            .attached_child()
    }

    /// Child context for debug tools; also sets the task kind to `TaskKind::DebugTool`.
    pub fn with_scope_debug_tools(&self) -> Self {
        RequestContextBuilder::from(self)
            .task_kind(TaskKind::DebugTool)
            .scope(Scope::new_debug_tools())
            .attached_child()
    }

    pub fn task_kind(&self) -> TaskKind {
        self.task_kind
    }

    pub fn download_behavior(&self) -> DownloadBehavior {
        self.download_behavior
    }

    pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior {
        self.access_stats_behavior
    }

    pub(crate) fn page_content_kind(&self) -> PageContentKind {
        self.page_content_kind
    }

    pub(crate) fn read_path_debug(&self) -> bool {
        self.read_path_debug
    }

    /// The I/O size metrics that belong to this context's scope.
    ///
    /// # Panics
    ///
    /// In unit tests and in builds with the `testing` feature, panics if the
    /// context is still in the global scope. Other builds log a rate-limited
    /// warning with a backtrace instead and fall back to the global metrics.
    pub(crate) fn io_size_metrics(&self) -> &StorageIoSizeMetrics {
        match &self.scope {
            Scope::Global { io_size_metrics } => {
                let is_unit_test = cfg!(test);
                let is_regress_test_build = cfg!(feature = "testing");
                if is_unit_test || is_regress_test_build {
                    panic!("all VirtualFile instances are timeline-scoped");
                } else {
                    use once_cell::sync::Lazy;
                    use std::sync::Mutex;
                    use std::time::Duration;
                    use utils::rate_limit::RateLimit;
                    // One rate limiter per call site, constructed with a 1 second interval.
                    static LIMIT: Lazy<Mutex<RateLimit>> =
                        Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1))));
                    let mut guard = LIMIT.lock().unwrap();
                    guard.call2(|rate_limit_stats| {
                        warn!(
                            %rate_limit_stats,
                            backtrace=%std::backtrace::Backtrace::force_capture(),
                            "all VirtualFile instances are timeline-scoped",
                        );
                    });

                    io_size_metrics
                }
            }
            Scope::Timeline { arc_arc } => &arc_arc.storage_io_size,
            Scope::SecondaryTimeline { io_size_metrics } => io_size_metrics,
            Scope::SecondaryTenant { io_size_metrics } => io_size_metrics,
            #[cfg(test)]
            Scope::UnitTest { io_size_metrics } => io_size_metrics,
            Scope::DebugTools { io_size_metrics } => io_size_metrics,
        }
    }

    /// Record the time spent waiting for an on-demand download.
    ///
    /// A zero duration is ignored. In timeline scope the duration is observed
    /// on the timeline's `wait_ondemand_download_time` metric together with the
    /// task kind; in any other scope only a rate-limited warning is logged.
    pub(crate) fn ondemand_download_wait_observe(&self, duration: Duration) {
        if duration == Duration::ZERO {
            return;
        }

        match &self.scope {
            Scope::Timeline { arc_arc } => arc_arc
                .wait_ondemand_download_time
                .observe(self.task_kind, duration),
            _ => {
                use once_cell::sync::Lazy;
                use std::sync::Mutex;
                use std::time::Duration;
                use utils::rate_limit::RateLimit;
                static LIMIT: Lazy<Mutex<RateLimit>> =
                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1))));
                let mut guard = LIMIT.lock().unwrap();
                guard.call2(|rate_limit_stats| {
                    warn!(
                        %rate_limit_stats,
                        backtrace=%std::backtrace::Backtrace::force_capture(),
                        "ondemand downloads should always happen within timeline scope",
                    );
                });
            }
        }
    }

    /// Mark this context's perf span as following from the perf span of `from`.
    /// Does nothing unless both contexts have a perf span.
    pub(crate) fn perf_follows_from(&self, from: &RequestContext) {
        if let (Some(span), Some(from_span)) = (&self.perf_span, &from.perf_span) {
            span.inner().follows_from(from_span.inner());
        }
    }

    /// Whether this context carries a perf span.
    pub(crate) fn has_perf_span(&self) -> bool {
        self.perf_span.is_some()
    }
}

/// [`Future`] extension trait that allows for creating performance
/// spans on sampled requests
pub(crate) trait PerfInstrumentFutureExt<'a>: Future + Send {
    /// Instrument this future with a new performance span when the
    /// provided request context indicates the originator request
    /// was sampled. Otherwise, just box the future and return it as is.
    ///
    /// `make_span` receives the context's current perf span and runs with the
    /// perf span dispatcher installed as the default dispatcher.
    ///
    /// # Panics
    ///
    /// Panics if the context has a perf span but no perf span dispatcher.
    fn maybe_perf_instrument<Fn>(
        self,
        ctx: &RequestContext,
        make_span: Fn,
    ) -> BoxFuture<'a, Self::Output>
    where
        Self: Sized + 'a,
        Fn: FnOnce(&Span) -> Span,
    {
        match &ctx.perf_span {
            Some(perf_span) => {
                assert!(ctx.perf_span_dispatch.is_some());
                let dispatcher = ctx.perf_span_dispatch.as_ref().unwrap();

                let new_span =
                    tracing::dispatcher::with_default(dispatcher, || make_span(perf_span.inner()));

                let new_perf_span = PerfSpan::new(new_span, dispatcher.clone());
                self.instrument(new_perf_span).boxed()
            }
            None => self.boxed(),
        }
    }
}

// Implement the trait for all types that satisfy the trait bounds
impl<'a, T: Future + Send + 'a> PerfInstrumentFutureExt<'a> for T {}

================================================
FILE: pageserver/src/controller_upcall_client.rs
================================================
use std::collections::HashMap;
use std::net::IpAddr;

use futures::Future;
use pageserver_api::config::NodeMetadata;
use pageserver_api::controller_api::{AvailabilityZone, NodeRegisterRequest};
use pageserver_api::models::ShardImportStatus;
use pageserver_api::shard::TenantShardId;
use pageserver_api::upcall_api::{
    PutTimelineImportStatusRequest, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant,
    TimelineImportStatusRequest, ValidateRequest, ValidateRequestTenant, ValidateResponse,
};
use reqwest::Certificate;
use serde::Serialize;
use serde::de::DeserializeOwned;
use tokio_util::sync::CancellationToken;
use url::Url;
use utils::generation::Generation;
use utils::id::{NodeId, TimelineId};
use utils::{backoff, failpoint_support, ip_address};

use crate::config::PageServerConf;
use crate::virtual_file::on_fatal_io_error;

/// The Pageserver's client for using the storage controller upcall API: this is a small API
/// for dealing with generations (see docs/rfcs/025-generation-numbers.md).
pub struct StorageControllerUpcallClient { http_client: reqwest::Client, base_url: Url, node_id: NodeId, node_ip_addr: Option, cancel: CancellationToken, } /// Represent operations which internally retry on all errors other than /// cancellation token firing: the only way they can fail is ShuttingDown. pub enum RetryForeverError { ShuttingDown, } pub trait StorageControllerUpcallApi { fn re_attach( &self, conf: &PageServerConf, empty_local_disk: bool, ) -> impl Future< Output = Result, RetryForeverError>, > + Send; fn validate( &self, tenants: Vec<(TenantShardId, Generation)>, ) -> impl Future, RetryForeverError>> + Send; fn put_timeline_import_status( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, status: ShardImportStatus, ) -> impl Future> + Send; fn get_timeline_import_status( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, ) -> impl Future> + Send; } impl StorageControllerUpcallClient { /// A None return value indicates that the input `conf` object does not have control /// plane API enabled. pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Self { let mut url = conf.control_plane_api.clone(); if let Ok(mut segs) = url.path_segments_mut() { // This ensures that `url` ends with a slash if it doesn't already. // That way, we can subsequently use join() to safely attach extra path elements. segs.pop_if_empty().push(""); } let mut client = reqwest::ClientBuilder::new(); if let Some(jwt) = &conf.control_plane_api_token { let mut headers = reqwest::header::HeaderMap::new(); headers.insert( "Authorization", format!("Bearer {}", jwt.get_contents()).parse().unwrap(), ); client = client.default_headers(headers); } for cert in &conf.ssl_ca_certs { client = client.add_root_certificate( Certificate::from_der(cert.contents()).expect("Invalid certificate in config"), ); } // Intentionally panics if we encountered any errors parsing or reading the IP address. 
// Note that if the required environment variable is not set, `read_node_ip_addr_from_env` returns `Ok(None)` // instead of an error. let node_ip_addr = ip_address::read_node_ip_addr_from_env().expect("Error reading node IP address."); Self { http_client: client.build().expect("Failed to construct HTTP client"), base_url: url, node_id: conf.id, cancel: cancel.clone(), node_ip_addr, } } #[tracing::instrument(skip_all)] async fn retry_http_forever( &self, url: &url::Url, request: R, method: reqwest::Method, ) -> Result where R: Serialize, T: DeserializeOwned, { let res = backoff::retry( || async { let response = self .http_client .request(method.clone(), url.clone()) .json(&request) .send() .await?; response.error_for_status_ref()?; response.json::().await }, |_| false, 3, u32::MAX, "storage controller upcall", &self.cancel, ) .await .ok_or(RetryForeverError::ShuttingDown)? .expect("We retry forever, this should never be reached"); Ok(res) } pub(crate) fn base_url(&self) -> &Url { &self.base_url } } impl StorageControllerUpcallApi for StorageControllerUpcallClient { /// Block until we get a successful response, or error out if we are shut down #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context async fn re_attach( &self, conf: &PageServerConf, empty_local_disk: bool, ) -> Result, RetryForeverError> { let url = self .base_url .join("re-attach") .expect("Failed to build re-attach path"); // Include registration content in the re-attach request if a metadata file is readable let metadata_path = conf.metadata_path(); let register = match tokio::fs::read_to_string(&metadata_path).await { Ok(metadata_str) => match serde_json::from_str::(&metadata_str) { Ok(m) => { // Since we run one time at startup, be generous in our logging and // dump all metadata. 
tracing::info!("Loaded node metadata: {m}"); let az_id = { let az_id_from_metadata = m .other .get("availability_zone_id") .and_then(|jv| jv.as_str().map(|str| str.to_owned())); match az_id_from_metadata { Some(az_id) => Some(AvailabilityZone(az_id)), None => { tracing::warn!( "metadata.json does not contain an 'availability_zone_id' field" ); conf.availability_zone.clone().map(AvailabilityZone) } } }; if az_id.is_none() { panic!( "Availablity zone id could not be inferred from metadata.json or pageserver config" ); } Some(NodeRegisterRequest { node_id: conf.id, listen_pg_addr: m.postgres_host, listen_pg_port: m.postgres_port, listen_grpc_addr: m.grpc_host, listen_grpc_port: m.grpc_port, listen_http_addr: m.http_host, listen_http_port: m.http_port, listen_https_port: m.https_port, node_ip_addr: self.node_ip_addr, availability_zone_id: az_id.expect("Checked above"), }) } Err(e) => { tracing::error!("Unreadable metadata in {metadata_path}: {e}"); None } }, Err(e) => { if e.kind() == std::io::ErrorKind::NotFound { // This is legal: we may have been deployed with some external script // doing registration for us. 
tracing::info!("Metadata file not found at {metadata_path}"); } else { on_fatal_io_error(&e, &format!("Loading metadata at {metadata_path}")) } None } }; let request = ReAttachRequest { node_id: self.node_id, register: register.clone(), empty_local_disk: Some(empty_local_disk), }; let response: ReAttachResponse = self .retry_http_forever(&url, request, reqwest::Method::POST) .await?; tracing::info!( "Received re-attach response with {} tenants (node {}, register: {:?})", response.tenants.len(), self.node_id, register, ); failpoint_support::sleep_millis_async!("control-plane-client-re-attach"); Ok(response .tenants .into_iter() .map(|rart| (rart.id, rart)) .collect::>()) } /// Block until we get a successful response, or error out if we are shut down #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context async fn validate( &self, tenants: Vec<(TenantShardId, Generation)>, ) -> Result, RetryForeverError> { let url = self .base_url .join("validate") .expect("Failed to build validate path"); // When sending validate requests, break them up into chunks so that we // avoid possible edge cases of generating any HTTP requests that // require database I/O across many thousands of tenants. 
let mut result: HashMap = HashMap::with_capacity(tenants.len()); for tenant_chunk in (tenants).chunks(128) { let request = ValidateRequest { tenants: tenant_chunk .iter() .map(|(id, generation)| ValidateRequestTenant { id: *id, r#gen: (*generation).into().expect( "Generation should always be valid for a Tenant doing deletions", ), }) .collect(), }; failpoint_support::sleep_millis_async!( "control-plane-client-validate-sleep", &self.cancel ); if self.cancel.is_cancelled() { return Err(RetryForeverError::ShuttingDown); } let response: ValidateResponse = self .retry_http_forever(&url, request, reqwest::Method::POST) .await?; for rt in response.tenants { result.insert(rt.id, rt.valid); } } Ok(result.into_iter().collect()) } /// Send a shard import status to the storage controller /// /// The implementation must have at-least-once delivery semantics. /// To this end, we retry the request until it succeeds. If the pageserver /// restarts or crashes, the shard import will start again from the beggining. 
#[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context async fn put_timeline_import_status( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, status: ShardImportStatus, ) -> Result<(), RetryForeverError> { let url = self .base_url .join("timeline_import_status") .expect("Failed to build path"); let request = PutTimelineImportStatusRequest { tenant_shard_id, timeline_id, generation, status, }; self.retry_http_forever(&url, request, reqwest::Method::POST) .await } #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context async fn get_timeline_import_status( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, ) -> Result { let url = self .base_url .join("timeline_import_status") .expect("Failed to build path"); let request = TimelineImportStatusRequest { tenant_shard_id, timeline_id, generation, }; let response: ShardImportStatus = self .retry_http_forever(&url, request, reqwest::Method::GET) .await?; Ok(response) } } ================================================ FILE: pageserver/src/deletion_queue/deleter.rs ================================================ //! The deleter is the final stage in the deletion queue. It accumulates remote //! paths to delete, and periodically executes them in batches of up to 1000 //! using the DeleteObjects request. //! //! Its purpose is to increase efficiency of remote storage I/O by issuing a smaller //! number of full-sized DeleteObjects requests, rather than a larger number of //! smaller requests. 
use std::time::Duration; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use utils::{backoff, pausable_failpoint}; use super::{DeletionQueueError, FlushOp}; use crate::metrics; const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10); pub(super) enum DeleterMessage { Delete(Vec), Flush(FlushOp), } /// Non-persistent deletion queue, for coalescing multiple object deletes into /// larger DeleteObjects requests. pub(super) struct Deleter { // Accumulate up to 1000 keys for the next deletion operation accumulator: Vec, rx: tokio::sync::mpsc::Receiver, cancel: CancellationToken, remote_storage: GenericRemoteStorage, } impl Deleter { pub(super) fn new( remote_storage: GenericRemoteStorage, rx: tokio::sync::mpsc::Receiver, cancel: CancellationToken, ) -> Self { Self { remote_storage, rx, cancel, accumulator: Vec::new(), } } /// Wrap the remote `delete_objects` with a failpoint async fn remote_delete(&self) -> Result<(), anyhow::Error> { // A backoff::retry is used here for two reasons: // - To provide a backoff rather than busy-polling the API on errors // - To absorb transient 429/503 conditions without hitting our error // logging path for issues deleting objects. 
backoff::retry( || async { fail::fail_point!("deletion-queue-before-execute", |_| { info!("Skipping execution, failpoint set"); metrics::DELETION_QUEUE .remote_errors .with_label_values(&["failpoint"]) .inc(); Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute")) }); self.remote_storage .delete_objects(&self.accumulator, &self.cancel) .await }, TimeoutOrCancel::caused_by_cancel, 3, 10, "executing deletion batch", &self.cancel, ) .await .ok_or_else(|| anyhow::anyhow!("Shutting down")) .and_then(|x| x) } /// Block until everything in accumulator has been executed async fn flush(&mut self) -> Result<(), DeletionQueueError> { while !self.accumulator.is_empty() && !self.cancel.is_cancelled() { pausable_failpoint!("deletion-queue-before-execute-pause"); match self.remote_delete().await { Ok(()) => { // Note: we assume that the remote storage layer returns Ok(()) if some // or all of the deleted objects were already gone. metrics::DELETION_QUEUE .keys_executed .inc_by(self.accumulator.len() as u64); info!( "Executed deletion batch {}..{}", self.accumulator .first() .expect("accumulator should be non-empty"), self.accumulator .last() .expect("accumulator should be non-empty"), ); self.accumulator.clear(); } Err(e) => { if self.cancel.is_cancelled() { return Err(DeletionQueueError::ShuttingDown); } warn!("DeleteObjects request failed: {e:#}, will continue trying"); metrics::DELETION_QUEUE .remote_errors .with_label_values(&["execute"]) .inc(); } }; } if self.cancel.is_cancelled() { // Expose an error because we may not have actually flushed everything Err(DeletionQueueError::ShuttingDown) } else { Ok(()) } } pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> { let max_keys_per_delete = self.remote_storage.max_keys_per_delete(); self.accumulator.reserve(max_keys_per_delete); loop { if self.cancel.is_cancelled() { return Err(DeletionQueueError::ShuttingDown); } let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await { 
Ok(Some(m)) => m, Ok(None) => { // All queue senders closed info!("Shutting down"); return Err(DeletionQueueError::ShuttingDown); } Err(_) => { // Timeout, we hit deadline to execute whatever we have in hand. These functions will // return immediately if no work is pending self.flush().await?; continue; } }; match msg { DeleterMessage::Delete(mut list) => { while !list.is_empty() || self.accumulator.len() == max_keys_per_delete { if self.accumulator.len() == max_keys_per_delete { self.flush().await?; // If we have received this number of keys, proceed with attempting to execute assert_eq!(self.accumulator.len(), 0); } let available_slots = max_keys_per_delete - self.accumulator.len(); let take_count = std::cmp::min(available_slots, list.len()); for path in list.drain(list.len() - take_count..) { self.accumulator.push(path); } } } DeleterMessage::Flush(flush_op) => { // If flush() errors, we drop the flush_op and the caller will get // an error recv()'ing their oneshot channel. self.flush().await?; flush_op.notify(); } } } } } ================================================ FILE: pageserver/src/deletion_queue/list_writer.rs ================================================ //! The list writer is the first stage in the deletion queue. It accumulates //! layers to delete, and periodically writes out these layers into a persistent //! DeletionList. //! //! The purpose of writing DeletionLists is to decouple the decision to //! delete an object from the validation required to execute it: even if //! validation is not possible, e.g. due to a control plane outage, we can //! still persist our intent to delete an object, in a way that would //! survive a restart. //! //! DeletionLists are passed onwards to the Validator. 
use std::collections::HashMap; use std::fs::create_dir_all; use std::time::Duration; use pageserver_api::shard::TenantShardId; use regex::Regex; use remote_storage::RemotePath; use tokio_util::sync::CancellationToken; use tracing::{debug, info, warn}; use utils::generation::Generation; use utils::id::TimelineId; use super::{DeletionHeader, DeletionList, FlushOp, ValidatorQueueMessage}; use crate::config::PageServerConf; use crate::deletion_queue::TEMP_SUFFIX; use crate::metrics; use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_layer_path}; use crate::tenant::storage_layer::LayerName; use crate::virtual_file::{MaybeFatalIo, on_fatal_io_error}; // The number of keys in a DeletionList before we will proactively persist it // (without reaching a flush deadline). This aims to deliver objects of the order // of magnitude 1MB when we are under heavy delete load. const DELETION_LIST_TARGET_SIZE: usize = 16384; // Ordinarily, we only flush to DeletionList periodically, to bound the window during // which we might leak objects from not flushing a DeletionList after // the objects are already unlinked from timeline metadata. const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000); // If someone is waiting for a flush to DeletionList, only delay a little to accumulate // more objects before doing the flush. const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100); #[derive(Debug)] pub(super) struct DeletionOp { pub(super) tenant_shard_id: TenantShardId, pub(super) timeline_id: TimelineId, // `layers` and `objects` are both just lists of objects. `layers` is used if you do not // have a config object handy to project it to a remote key, and need the consuming worker // to do it for you. pub(super) layers: Vec<(LayerName, LayerFileMetadata)>, pub(super) objects: Vec, /// The _current_ generation of the Tenant shard attachment in which we are enqueuing /// this deletion. 
pub(super) generation: Generation, } #[derive(Debug)] pub(super) struct RecoverOp { pub(super) attached_tenants: HashMap, } #[derive(Debug)] pub(super) enum ListWriterQueueMessage { Delete(DeletionOp), // Wait until all prior deletions make it into a persistent DeletionList Flush(FlushOp), // Wait until all prior deletions have been executed (i.e. objects are actually deleted) FlushExecute(FlushOp), // Call once after re-attaching to control plane, to notify the deletion queue about // latest attached generations & load any saved deletion lists from disk. Recover(RecoverOp), } pub(super) struct ListWriter { conf: &'static PageServerConf, // Incoming frontend requests to delete some keys rx: tokio::sync::mpsc::UnboundedReceiver, // Outbound requests to the backend to execute deletion lists we have composed. tx: tokio::sync::mpsc::Sender, // The list we are currently building, contains a buffer of keys to delete // and our next sequence number pending: DeletionList, // These FlushOps should notify the next time we flush pending_flushes: Vec, // Worker loop is torn down when this fires. cancel: CancellationToken, // Safety guard to do recovery exactly once recovered: bool, } impl ListWriter { // Initially DeletionHeader.validated_sequence is zero. The place we start our // sequence numbers must be higher than that. const BASE_SEQUENCE: u64 = 1; pub(super) fn new( conf: &'static PageServerConf, rx: tokio::sync::mpsc::UnboundedReceiver, tx: tokio::sync::mpsc::Sender, cancel: CancellationToken, ) -> Self { Self { pending: DeletionList::new(Self::BASE_SEQUENCE), conf, rx, tx, pending_flushes: Vec::new(), cancel, recovered: false, } } /// Try to flush `list` to persistent storage /// /// This does not return errors, because on failure to flush we do not lose /// any state: flushing will be retried implicitly on the next deadline async fn flush(&mut self) { if self.pending.is_empty() { for f in self.pending_flushes.drain(..) 
{ f.notify(); } return; } match self.pending.save(self.conf).await { Ok(_) => { info!(sequence = self.pending.sequence, "Stored deletion list"); for f in self.pending_flushes.drain(..) { f.notify(); } // Take the list we've accumulated, replace it with a fresh list for the next sequence let next_list = DeletionList::new(self.pending.sequence + 1); let list = std::mem::replace(&mut self.pending, next_list); if let Err(e) = self.tx.send(ValidatorQueueMessage::Delete(list)).await { // This is allowed to fail: it will only happen if the backend worker is shut down, // so we can just drop this on the floor. info!("Deletion list dropped, this is normal during shutdown ({e:#})"); } } Err(e) => { metrics::DELETION_QUEUE.unexpected_errors.inc(); warn!( sequence = self.pending.sequence, "Failed to write deletion list, will retry later ({e:#})" ); } } } /// Load the header, to learn the sequence number up to which deletions /// have been validated. We will apply validated=true to DeletionLists /// <= this sequence when loading them. /// /// It is not an error for the header to not exist: we return None, and /// the caller should act as if validated_sequence is 0 async fn load_validated_sequence(&self) -> Result, anyhow::Error> { let header_path = self.conf.deletion_header_path(); match tokio::fs::read(&header_path).await { Ok(header_bytes) => { match serde_json::from_slice::(&header_bytes) { Ok(h) => Ok(Some(h.validated_sequence)), Err(e) => { warn!( "Failed to deserialize deletion header, ignoring {header_path}: {e:#}", ); // This should never happen unless we make a mistake with our serialization. // Ignoring a deletion header is not consequential for correctnes because all deletions // are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up. 
metrics::DELETION_QUEUE.unexpected_errors.inc(); Ok(None) } } } Err(e) => { if e.kind() == std::io::ErrorKind::NotFound { debug!("Deletion header {header_path} not found, first start?"); Ok(None) } else { on_fatal_io_error(&e, "reading deletion header"); } } } } async fn recover( &mut self, attached_tenants: HashMap, ) -> Result<(), anyhow::Error> { debug!( "recovering with {} attached tenants", attached_tenants.len() ); // Load the header let validated_sequence = self.load_validated_sequence().await?.unwrap_or(0); self.pending.sequence = validated_sequence + 1; let deletion_directory = self.conf.deletion_prefix(); let mut dir = tokio::fs::read_dir(&deletion_directory) .await .fatal_err("read deletion directory"); let list_name_pattern = Regex::new("(?[a-zA-Z0-9]{16})-(?[a-zA-Z0-9]{2}).list").unwrap(); let temp_extension = format!(".{TEMP_SUFFIX}"); let header_path = self.conf.deletion_header_path(); let mut seqs: Vec = Vec::new(); while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") { let file_name = dentry.file_name(); let dentry_str = file_name.to_string_lossy(); if file_name == header_path.file_name().unwrap_or("") { // Don't try and parse the header's name like a list continue; } if dentry_str.ends_with(&temp_extension) { info!("Cleaning up temporary file {dentry_str}"); let absolute_path = deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path")); tokio::fs::remove_file(&absolute_path) .await .fatal_err("delete temp file"); continue; } let file_name = dentry.file_name().to_owned(); let basename = file_name.to_string_lossy(); let seq_part = if let Some(m) = list_name_pattern.captures(&basename) { m.name("sequence") .expect("Non optional group should be present") .as_str() } else { warn!("Unexpected key in deletion queue: {basename}"); metrics::DELETION_QUEUE.unexpected_errors.inc(); continue; }; let seq: u64 = match u64::from_str_radix(seq_part, 16) { Ok(s) => s, Err(e) => { warn!("Malformed key '{basename}': 
{e}"); metrics::DELETION_QUEUE.unexpected_errors.inc(); continue; } }; seqs.push(seq); } seqs.sort(); // Start our next deletion list from after the last location validated by // previous process lifetime, or after the last location found (it is updated // below after enumerating the deletion lists) self.pending.sequence = validated_sequence + 1; if let Some(max_list_seq) = seqs.last() { self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1); } for s in seqs { let list_path = self.conf.deletion_list_path(s); let list_bytes = tokio::fs::read(&list_path) .await .fatal_err("read deletion list"); let mut deletion_list = match serde_json::from_slice::(&list_bytes) { Ok(l) => l, Err(e) => { // Drop the list on the floor: any objects it referenced will be left behind // for scrubbing to clean up. This should never happen unless we have a serialization bug. warn!(sequence = s, "Failed to deserialize deletion list: {e}"); metrics::DELETION_QUEUE.unexpected_errors.inc(); continue; } }; if deletion_list.sequence <= validated_sequence { // If the deletion list falls below valid_seq, we may assume that it was // already validated the last time this pageserver ran. Otherwise, we still // load it, as it may still contain content valid in this generation. deletion_list.validated = true; } else { // Special case optimization: if a tenant is still attached, and no other // generation was issued to another node in the interval while we restarted, // then we may treat deletion lists from the previous generation as if they // belong to our currently attached generation, and proceed to validate & execute. 
for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants { if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) { if attached_gen.previous() == tenant_list.generation { info!( seq=%s, tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), old_gen=?tenant_list.generation, new_gen=?attached_gen, "Updating gen on recovered list"); tenant_list.generation = *attached_gen; } else { info!( seq=%s, tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), old_gen=?tenant_list.generation, new_gen=?attached_gen, "Encountered stale generation on recovered list"); } } } } info!( validated = deletion_list.validated, sequence = deletion_list.sequence, "Recovered deletion list" ); // We will drop out of recovery if this fails: it indicates that we are shutting down // or the backend has panicked metrics::DELETION_QUEUE .keys_submitted .inc_by(deletion_list.len() as u64); self.tx .send(ValidatorQueueMessage::Delete(deletion_list)) .await?; } info!(next_sequence = self.pending.sequence, "Replay complete"); Ok(()) } /// This is the front-end ingest, where we bundle up deletion requests into DeletionList /// and write them out, for later validation by the backend and execution by the executor. 
pub(super) async fn background(&mut self) { info!("Started deletion frontend worker"); // Synchronous, but we only do it once per process lifetime so it's tolerable if let Err(e) = create_dir_all(self.conf.deletion_prefix()) { tracing::error!( "Failed to create deletion list directory {}, deletions will not be executed ({e})", self.conf.deletion_prefix(), ); metrics::DELETION_QUEUE.unexpected_errors.inc(); return; } while !self.cancel.is_cancelled() { let timeout = if self.pending_flushes.is_empty() { FRONTEND_DEFAULT_TIMEOUT } else { FRONTEND_FLUSHING_TIMEOUT }; let msg = match tokio::time::timeout(timeout, self.rx.recv()).await { Ok(Some(msg)) => msg, Ok(None) => { // Queue sender destroyed, shutting down break; } Err(_) => { // Hit deadline, flush. self.flush().await; continue; } }; match msg { ListWriterQueueMessage::Delete(op) => { assert!( self.recovered, "Cannot process deletions before recovery. This is a bug." ); debug!( "Delete: ingesting {} layers, {} other objects", op.layers.len(), op.objects.len() ); let mut layer_paths = Vec::new(); for (layer, meta) in op.layers { layer_paths.push(remote_layer_path( &op.tenant_shard_id.tenant_id, &op.timeline_id, meta.shard, &layer, meta.generation, )); } layer_paths.extend(op.objects); if !self.pending.push( &op.tenant_shard_id, &op.timeline_id, op.generation, &mut layer_paths, ) { self.flush().await; let retry_succeeded = self.pending.push( &op.tenant_shard_id, &op.timeline_id, op.generation, &mut layer_paths, ); if !retry_succeeded { // Unexpected: after we flush, we should have // drained self.pending, so a conflict on // generation numbers should be impossible. tracing::error!( "Failed to enqueue deletions, leaking objects. This is a bug." 
); metrics::DELETION_QUEUE.unexpected_errors.inc(); } } } ListWriterQueueMessage::Flush(op) => { if self.pending.is_empty() { // Execute immediately debug!("Flush: No pending objects, flushing immediately"); op.notify() } else { // Execute next time we flush debug!("Flush: adding to pending flush list for next deadline flush"); self.pending_flushes.push(op); } } ListWriterQueueMessage::FlushExecute(op) => { debug!("FlushExecute: passing through to backend"); // We do not flush to a deletion list here: the client sends a Flush before the FlushExecute if let Err(e) = self.tx.send(ValidatorQueueMessage::Flush(op)).await { info!("Can't flush, shutting down ({e})"); // Caller will get error when their oneshot sender was dropped. } } ListWriterQueueMessage::Recover(op) => { if self.recovered { tracing::error!( "Deletion queue recovery called more than once. This is a bug." ); metrics::DELETION_QUEUE.unexpected_errors.inc(); // Non-fatal: although this is a bug, since we did recovery at least once we may proceed. continue; } if let Err(e) = self.recover(op.attached_tenants).await { // This should only happen in truly unrecoverable cases, like the recovery finding that the backend // queue receiver has been dropped, or something is critically broken with // the local filesystem holding deletion lists. info!( "Deletion queue recover aborted, deletion queue will not proceed ({e})" ); metrics::DELETION_QUEUE.unexpected_errors.inc(); return; } else { self.recovered = true; } } } if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() { self.flush().await; } } info!("Deletion queue shut down."); } } ================================================ FILE: pageserver/src/deletion_queue/validator.rs ================================================ //! The validator is responsible for validating DeletionLists for execution, //! based on whether the generation in the DeletionList is still the latest //! generation for a tenant. //! //! 
The purpose of validation is to ensure split-brain safety in the cluster //! of pageservers: a deletion may only be executed if the tenant generation //! that originated it is still current. See docs/rfcs/025-generation-numbers.md //! The purpose of accumulating lists before validating them is to reduce load //! on the control plane API by issuing fewer, larger requests. //! //! In addition to validating DeletionLists, the validator validates updates to remote_consistent_lsn //! for timelines: these are logically deletions because the safekeepers use remote_consistent_lsn //! to decide when old //! //! Deletions are passed onward to the Deleter. use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; use camino::Utf8PathBuf; use tokio_util::sync::CancellationToken; use tracing::{debug, info, warn}; use super::deleter::DeleterMessage; use super::{DeletionHeader, DeletionList, DeletionQueueError, FlushOp, VisibleLsnUpdates}; use crate::config::PageServerConf; use crate::controller_upcall_client::{RetryForeverError, StorageControllerUpcallApi}; use crate::metrics; use crate::virtual_file::MaybeFatalIo; // After this length of time, do any validation work that is pending, // even if we haven't accumulated many keys to delete. // // This also causes updates to remote_consistent_lsn to be validated, even // if there were no deletions enqueued. const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10); // If we have received this number of keys, proceed with attempting to execute const AUTOFLUSH_KEY_COUNT: usize = 16384; #[derive(Debug)] pub(super) enum ValidatorQueueMessage { Delete(DeletionList), Flush(FlushOp), } pub(super) struct Validator where C: StorageControllerUpcallApi, { conf: &'static PageServerConf, rx: tokio::sync::mpsc::Receiver, tx: tokio::sync::mpsc::Sender, // Client for calling into control plane API for validation of deletes controller_upcall_client: C, // DeletionLists which are waiting generation validation. 
Not safe to // execute until [`validate`] has processed them. pending_lists: Vec, // DeletionLists which have passed validation and are ready to execute. validated_lists: Vec, // Sum of all the lengths of lists in pending_lists pending_key_count: usize, // Lsn validation state: we read projected LSNs and write back visible LSNs // after validation. This is the LSN equivalent of `pending_validation_lists`: // it is drained in [`validate`] lsn_table: Arc>, // If we failed to rewrite a deletion list due to local filesystem I/O failure, // we must remember that and refuse to advance our persistent validated sequence // number past the failure. list_write_failed: Option, cancel: CancellationToken, } impl Validator where C: StorageControllerUpcallApi, { pub(super) fn new( conf: &'static PageServerConf, rx: tokio::sync::mpsc::Receiver, tx: tokio::sync::mpsc::Sender, controller_upcall_client: C, lsn_table: Arc>, cancel: CancellationToken, ) -> Self { Self { conf, rx, tx, controller_upcall_client, lsn_table, pending_lists: Vec::new(), validated_lists: Vec::new(), pending_key_count: 0, list_write_failed: None, cancel, } } /// Process any outstanding validations of generations of pending LSN updates or pending /// DeletionLists. /// /// Valid LSN updates propagate back to Timelines immediately, valid DeletionLists /// go into the queue of ready-to-execute lists. async fn validate(&mut self) -> Result<(), DeletionQueueError> { let mut tenant_generations = HashMap::new(); for list in &self.pending_lists { for (tenant_id, tenant_list) in &list.tenants { // Note: DeletionLists are in logical time order, so generation always // goes up. By doing a simple insert() we will always end up with // the latest generation seen for a tenant. 
tenant_generations.insert(*tenant_id, tenant_list.generation); } } let pending_lsn_updates = { let mut lsn_table = self.lsn_table.write().expect("Lock should not be poisoned"); std::mem::take(&mut *lsn_table) }; for (tenant_id, update) in &pending_lsn_updates.tenants { let entry = tenant_generations .entry(*tenant_id) .or_insert(update.generation); if update.generation > *entry { *entry = update.generation; } } if tenant_generations.is_empty() { // No work to do return Ok(()); } let tenants_valid = match self .controller_upcall_client .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect()) .await { Ok(tenants) => tenants, Err(RetryForeverError::ShuttingDown) => { // The only way a validation call returns an error is when the cancellation token fires return Err(DeletionQueueError::ShuttingDown); } }; let mut validated_sequence: Option = None; // Apply the validation results to the pending LSN updates for (tenant_id, tenant_lsn_state) in pending_lsn_updates.tenants { let validated_generation = tenant_generations .get(&tenant_id) .expect("Map was built from the same keys we're reading"); let valid = tenants_valid .get(&tenant_id) .copied() // If the tenant was missing from the validation response, it has been deleted. // The Timeline that requested the LSN update is probably already torn down, // or will be torn down soon. In this case, drop the update by setting valid=false. 
.unwrap_or(false); if valid && *validated_generation == tenant_lsn_state.generation { for (timeline_id, pending_lsn) in tenant_lsn_state.timelines { tracing::debug!( %tenant_id, %timeline_id, current = %pending_lsn.result_slot.load(), projected = %pending_lsn.projected, "advancing validated remote_consistent_lsn", ); pending_lsn.result_slot.store(pending_lsn.projected); } } else { // If we failed validation, then do not apply any of the projected updates info!( "Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation ); metrics::DELETION_QUEUE.dropped_lsn_updates.inc(); } } // Apply the validation results to the pending deletion lists for list in &mut self.pending_lists { // Filter the list based on whether the server responded valid: true. // If a tenant is omitted in the response, it has been deleted, and we should // proceed with deletion. let mut mutated = false; list.tenants.retain(|tenant_id, tenant| { let validated_generation = tenant_generations .get(tenant_id) .expect("Map was built from the same keys we're reading"); // If the tenant was missing from the validation response, it has been deleted. // This means that a deletion is valid, but also redundant since the tenant's // objects should have already been deleted. Treat it as invalid to drop the // redundant deletion. let valid = tenants_valid.get(tenant_id).copied().unwrap_or(false); // A list is valid if it comes from the current _or previous_ generation. // - The previous generation case is permitted due to how we store deletion lists locally: // if we see the immediately previous generation in a locally stored deletion list, // it proves that this node's disk was used for both current & previous generations, // and therefore no other node was involved in between: the two generations may be // logically treated as the same. 
// - In that previous generation case, we rewrote it to the current generation // in recover(), so the comparison here is simply an equality. let this_list_valid = valid && (tenant.generation == *validated_generation); if !this_list_valid { info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64); mutated = true; } else { metrics::DELETION_QUEUE.keys_validated.inc_by(tenant.len() as u64); } this_list_valid }); list.validated = true; if mutated { // Save the deletion list if we had to make changes due to stale generations. The // saved list is valid for execution. if let Err(e) = list.save(self.conf).await { // Highly unexpected. Could happen if e.g. disk full. // If we didn't save the trimmed list, it is _not_ valid to execute. warn!("Failed to save modified deletion list {list}: {e:#}"); metrics::DELETION_QUEUE.unexpected_errors.inc(); // Rather than have a complex retry process, just drop it and leak the objects, // scrubber will clean up eventually. list.tenants.clear(); // Result is a valid-but-empty list, which is a no-op for execution. // We must remember this failure, to prevent later writing out a header that // would imply the unwritable list was valid on disk. if self.list_write_failed.is_none() { self.list_write_failed = Some(list.sequence); } } } validated_sequence = Some(list.sequence); } if let Some(validated_sequence) = validated_sequence { if let Some(list_write_failed) = self.list_write_failed { // Rare error case: we failed to write out a deletion list to excise invalid // entries, so we cannot advance the header's valid sequence number past that point. // // In this state we will continue to validate, execute and delete deletion lists, // we just cannot update the header. It should be noticed and fixed by a human due to // the nonzero value of our unexpected_errors metric. 
warn!( sequence_number = list_write_failed, "Cannot write header because writing a deletion list failed earlier", ); } else { // Write the queue header to record how far validation progressed. This avoids having // to rewrite each DeletionList to set validated=true in it. let header = DeletionHeader::new(validated_sequence); // Drop result because the validated_sequence is an optimization. If we fail to save it, // then restart, we will drop some deletion lists, creating work for scrubber. // The save() function logs a warning on error. if let Err(e) = header.save(self.conf).await { warn!("Failed to write deletion queue header: {e:#}"); metrics::DELETION_QUEUE.unexpected_errors.inc(); } } } // Transfer the validated lists to the validated queue, for eventual execution self.validated_lists.append(&mut self.pending_lists); Ok(()) } async fn cleanup_lists(&mut self, list_paths: Vec) { for list_path in list_paths { debug!("Removing deletion list {list_path}"); tokio::fs::remove_file(&list_path) .await .fatal_err("remove deletion list"); } } async fn flush(&mut self) -> Result<(), DeletionQueueError> { tracing::debug!("Flushing with {} pending lists", self.pending_lists.len()); // Issue any required generation validation calls to the control plane self.validate().await?; // After successful validation, nothing is pending: any lists that // made it through validation will be in validated_lists. assert!(self.pending_lists.is_empty()); self.pending_key_count = 0; tracing::debug!( "Validation complete, have {} validated lists", self.validated_lists.len() ); // Return quickly if we have no validated lists to execute. This avoids flushing the // executor when an idle backend hits its autoflush interval if self.validated_lists.is_empty() { return Ok(()); } // Drain `validated_lists` into the executor let mut executing_lists = Vec::new(); for list in self.validated_lists.drain(..) 
{ let list_path = self.conf.deletion_list_path(list.sequence); let objects = list.into_remote_paths(); self.tx .send(DeleterMessage::Delete(objects)) .await .map_err(|_| DeletionQueueError::ShuttingDown)?; executing_lists.push(list_path); } self.flush_executor().await?; // Erase the deletion lists whose keys have all be deleted from remote storage self.cleanup_lists(executing_lists).await; Ok(()) } async fn flush_executor(&mut self) -> Result<(), DeletionQueueError> { // Flush the executor, so that all the keys referenced by these deletion lists // are actually removed from remote storage. This is a precondition to deleting // the deletion lists themselves. let (flush_op, rx) = FlushOp::new(); self.tx .send(DeleterMessage::Flush(flush_op)) .await .map_err(|_| DeletionQueueError::ShuttingDown)?; rx.await.map_err(|_| DeletionQueueError::ShuttingDown) } pub(super) async fn background(&mut self) { tracing::info!("Started deletion backend worker"); while !self.cancel.is_cancelled() { let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await { Ok(Some(m)) => m, Ok(None) => { // All queue senders closed info!("Shutting down"); break; } Err(_) => { // Timeout, we hit deadline to execute whatever we have in hand. These functions will // return immediately if no work is pending. 
match self.flush().await { Ok(()) => {} Err(DeletionQueueError::ShuttingDown) => { // If we are shutting down, then auto-flush can safely be skipped } } continue; } }; match msg { ValidatorQueueMessage::Delete(list) => { if list.validated { // A pre-validated list may only be seen during recovery, if we are recovering // a DeletionList whose on-disk state has validated=true self.validated_lists.push(list) } else { self.pending_key_count += list.len(); self.pending_lists.push(list); } if self.pending_key_count > AUTOFLUSH_KEY_COUNT { match self.flush().await { Ok(()) => {} Err(DeletionQueueError::ShuttingDown) => { // If we are shutting down, then auto-flush can safely be skipped } } } } ValidatorQueueMessage::Flush(op) => { match self.flush().await { Ok(()) => { op.notify(); } Err(DeletionQueueError::ShuttingDown) => { // If we fail due to shutting down, we will just drop `op` to propagate that status. } } } } } } } ================================================ FILE: pageserver/src/deletion_queue.rs ================================================ mod deleter; mod list_writer; mod validator; use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; use anyhow::Context; use camino::Utf8PathBuf; use deleter::DeleterMessage; use list_writer::ListWriterQueueMessage; use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, RemotePath}; use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error}; use utils::crashsafe::path_with_suffix_extension; use utils::generation::Generation; use utils::id::TimelineId; use utils::lsn::{AtomicLsn, Lsn}; use validator::ValidatorQueueMessage; use self::deleter::Deleter; use self::list_writer::{DeletionOp, ListWriter, RecoverOp}; use self::validator::Validator; use crate::config::PageServerConf; use crate::controller_upcall_client::StorageControllerUpcallApi; use crate::metrics; use 
crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_timeline_path}; use crate::tenant::storage_layer::LayerName; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; // TODO: configurable for how long to wait before executing deletions /// We aggregate object deletions from many tenants in one place, for several reasons: /// - Coalesce deletions into fewer DeleteObjects calls /// - Enable Tenant/Timeline lifetimes to be shorter than the time it takes /// to flush any outstanding deletions. /// - Globally control throughput of deletions, as these are a low priority task: do /// not compete with the same S3 clients/connections used for higher priority uploads. /// - Enable gating deletions on validation of a tenant's generation number, to make /// it safe to multi-attach tenants (see docs/rfcs/025-generation-numbers.md) /// /// There are two kinds of deletion: deferred and immediate. A deferred deletion /// may be intentionally delayed to protect passive readers of S3 data, and is /// subject to a generation number validation step. An immediate deletion is /// ready to execute immediately, and is only queued up so that it can be coalesced /// with other deletions in flight. /// /// Deferred deletions pass through three steps: /// - ListWriter: accumulate deletion requests from Timelines, and batch them up into /// DeletionLists, which are persisted to disk. /// - Validator: accumulate deletion lists, and validate them en-masse prior to passing /// the keys in the list onward for actual deletion. Also validate remote_consistent_lsn /// updates for running timelines. /// - Deleter: accumulate object keys that the validator has validated, and execute them in /// batches of 1000 keys via DeleteObjects. /// /// Non-deferred deletions, such as during timeline deletion, bypass the first /// two stages and are passed straight into the Deleter. /// /// Internally, each stage is joined by a channel to the next. 
On disk, there is only /// one queue (of DeletionLists), which is written by the frontend and consumed /// by the backend. #[derive(Clone)] pub struct DeletionQueue { client: DeletionQueueClient, // Parent cancellation token for the tokens passed into background workers cancel: CancellationToken, } /// Opaque wrapper around individual worker tasks, to avoid making the /// worker objects themselves public pub struct DeletionQueueWorkers where C: StorageControllerUpcallApi + Send + Sync, { frontend: ListWriter, backend: Validator, executor: Deleter, } impl DeletionQueueWorkers where C: StorageControllerUpcallApi + Send + Sync + 'static, { pub fn spawn_with(mut self, runtime: &tokio::runtime::Handle) -> tokio::task::JoinHandle<()> { let jh_frontend = runtime.spawn(async move { self.frontend .background() .instrument(tracing::info_span!(parent:None, "deletion frontend")) .await }); let jh_backend = runtime.spawn(async move { self.backend .background() .instrument(tracing::info_span!(parent:None, "deletion backend")) .await }); let jh_executor = runtime.spawn(async move { self.executor .background() .instrument(tracing::info_span!(parent:None, "deletion executor")) .await }); runtime.spawn({ async move { jh_frontend.await.expect("error joining frontend worker"); jh_backend.await.expect("error joining backend worker"); drop(jh_executor.await.expect("error joining executor worker")); } }) } } /// A FlushOp is just a oneshot channel, where we send the transmit side down /// another channel, and the receive side will receive a message when the channel /// we're flushing has reached the FlushOp we sent into it. /// /// The only extra behavior beyond the channel is that the notify() method does not /// return an error when the receive side has been dropped, because in this use case /// it is harmless (the code that initiated the flush no longer cares about the result). 
#[derive(Debug)] struct FlushOp { tx: tokio::sync::oneshot::Sender<()>, } impl FlushOp { fn new() -> (Self, tokio::sync::oneshot::Receiver<()>) { let (tx, rx) = tokio::sync::oneshot::channel::<()>(); (Self { tx }, rx) } fn notify(self) { if self.tx.send(()).is_err() { // oneshot channel closed. This is legal: a client could be destroyed while waiting for a flush. debug!("deletion queue flush from dropped client"); }; } } #[derive(Clone, Debug)] pub struct DeletionQueueClient { tx: tokio::sync::mpsc::UnboundedSender, executor_tx: tokio::sync::mpsc::Sender, lsn_table: Arc>, } #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] struct TenantDeletionList { /// For each Timeline, a list of key fragments to append to the timeline remote path /// when reconstructing a full key timelines: HashMap>, /// The generation in which this deletion was emitted: note that this may not be the /// same as the generation of any layers being deleted. The generation of the layer /// has already been absorbed into the keys in `objects` generation: Generation, } impl TenantDeletionList { pub(crate) fn len(&self) -> usize { self.timelines.values().map(|v| v.len()).sum() } } /// Files ending with this suffix will be ignored and erased /// during recovery as startup. const TEMP_SUFFIX: &str = "tmp"; #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] struct DeletionList { /// Serialization version, for future use version: u8, /// Used for constructing a unique key for each deletion list we write out. sequence: u64, /// To avoid repeating tenant/timeline IDs in every key, we store keys in /// nested HashMaps by TenantTimelineID. Each Tenant only appears once /// with one unique generation ID: if someone tries to push a second generation /// ID for the same tenant, we will start a new DeletionList. 
tenants: HashMap, /// Avoid having to walk `tenants` to calculate the number of keys in /// the nested deletion lists size: usize, /// Set to true when the list has undergone validation with the control /// plane and the remaining contents of `tenants` are valid. A list may /// also be implicitly marked valid by DeletionHeader.validated_sequence /// advancing to >= DeletionList.sequence #[serde(default)] #[serde(skip_serializing_if = "std::ops::Not::not")] validated: bool, } #[derive(Debug, Serialize, Deserialize)] struct DeletionHeader { /// Serialization version, for future use version: u8, /// The highest sequence number (inclusive) that has been validated. All deletion /// lists on disk with a sequence <= this value are safe to execute. validated_sequence: u64, } impl DeletionHeader { const VERSION_LATEST: u8 = 1; fn new(validated_sequence: u64) -> Self { Self { version: Self::VERSION_LATEST, validated_sequence, } } async fn save(&self, conf: &'static PageServerConf) -> anyhow::Result<()> { debug!("Saving deletion list header {:?}", self); let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?; let header_path = conf.deletion_header_path(); let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX); VirtualFile::crashsafe_overwrite(header_path, temp_path, header_bytes) .await .maybe_fatal_err("save deletion header")?; Ok(()) } } impl DeletionList { const VERSION_LATEST: u8 = 1; fn new(sequence: u64) -> Self { Self { version: Self::VERSION_LATEST, sequence, tenants: HashMap::new(), size: 0, validated: false, } } fn is_empty(&self) -> bool { self.tenants.is_empty() } fn len(&self) -> usize { self.size } /// Returns true if the push was accepted, false if the caller must start a new /// deletion list. 
fn push(
        &mut self,
        tenant: &TenantShardId,
        timeline: &TimelineId,
        generation: Generation,
        objects: &mut Vec,
    ) -> bool {
        if objects.is_empty() {
            // Avoid inserting an empty TimelineDeletionList: this preserves the property
            // that if we have no keys, then self.tenants is empty (used in Self::is_empty)
            return true;
        }

        let tenant_entry = self
            .tenants
            .entry(*tenant)
            .or_insert_with(|| TenantDeletionList {
                timelines: HashMap::new(),
                generation,
            });

        if tenant_entry.generation != generation {
            // Only one generation per tenant per list: signal to
            // caller to start a new list.
            return false;
        }

        let timeline_entry = tenant_entry.timelines.entry(*timeline).or_default();

        let timeline_remote_path = remote_timeline_path(tenant, timeline);

        // Count before draining: `objects` is left empty by the `drain(..)` below.
        self.size += objects.len();
        // Store each path relative to the timeline's remote prefix; the prefix is
        // re-attached in `into_remote_paths`.
        timeline_entry.extend(objects.drain(..).map(|p| {
            p.strip_prefix(&timeline_remote_path)
                .expect("Timeline paths always start with the timeline prefix")
                .to_string()
        }));
        true
    }

    /// Consume the list and rebuild the full remote path of every recorded key by joining
    /// each stored fragment onto its timeline's remote path.
    fn into_remote_paths(self) -> Vec {
        let mut result = Vec::new();
        for (tenant, tenant_deletions) in self.tenants.into_iter() {
            for (timeline, timeline_layers) in tenant_deletions.timelines.into_iter() {
                let timeline_remote_path = remote_timeline_path(&tenant, &timeline);
                result.extend(
                    timeline_layers
                        .into_iter()
                        .map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))),
                );
            }
        }

        result
    }

    /// Serialize the list as JSON and write it to `conf.deletion_list_path(self.sequence)`
    /// through `VirtualFile::crashsafe_overwrite`, staging the write in a sibling path that
    /// carries `TEMP_SUFFIX`.
    ///
    /// Panics if serialization fails; write failures are returned as errors.
    async fn save(&self, conf: &'static PageServerConf) -> anyhow::Result<()> {
        let path = conf.deletion_list_path(self.sequence);
        let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX);

        let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");

        VirtualFile::crashsafe_overwrite(path, temp_path, bytes)
            .await
            .maybe_fatal_err("save deletion list")
            .map_err(Into::into)
    }
}

impl std::fmt::Display for DeletionList {
    /// Short summary built from the sequence number, the tenant count and the key count.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "DeletionList",
            self.sequence,
            self.tenants.len(),
            self.size
        )
    }
}

/// A remote_consistent_lsn update that has been published for a timeline.
struct PendingLsn {
    // The LSN passed to `update_remote_consistent_lsn`.
    projected: Lsn,
    // Slot supplied by the same caller; presumably where the LSN is made visible once
    // it has been validated -- TODO confirm against the validator.
    result_slot: Arc,
}

/// Pending LSN updates for one tenant, all proposed in the same generation.
struct TenantLsnState
{
    // Pending update per timeline; cleared whenever the tenant's generation changes.
    timelines: HashMap,

    // In what generation was the most recent update proposed?
    generation: Generation,
}

/// Table of pending LSN updates, grouped by tenant.
#[derive(Default)]
struct VisibleLsnUpdates {
    tenants: HashMap,
}

impl VisibleLsnUpdates {
    /// Create an empty table.
    fn new() -> Self {
        Self {
            tenants: HashMap::new(),
        }
    }
}

impl std::fmt::Debug for VisibleLsnUpdates {
    /// Prints only the number of tenants rather than the full table.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "VisibleLsnUpdates({} tenants)", self.tenants.len())
    }
}

/// Errors returned to users of `DeletionQueueClient`.
#[derive(Error, Debug)]
pub enum DeletionQueueError {
    #[error("Deletion queue unavailable during shutdown")]
    ShuttingDown,
}

impl DeletionQueueClient {
    /// Push `msg` onto `queue`. This is a plain synchronous function: sending on an
    /// unbounded channel does not wait, so there is no point at which a caller could
    /// cancel a half-finished push. In the context of the deletion queue that would not
    /// matter anyway: once we decide to do a deletion the decision is always final.
    ///
    /// Returns `ShuttingDown` if the receiving side of the channel has gone away.
    fn do_push(
        &self,
        queue: &tokio::sync::mpsc::UnboundedSender,
        msg: T,
    ) -> Result<(), DeletionQueueError> {
        match queue.send(msg) {
            Ok(_) => Ok(()),
            Err(e) => {
                // This shouldn't happen, we should shut down all tenants before
                // we shut down the global delete queue. If we encounter a bug like this,
                // we may leak objects as deletions won't be processed.
                error!("Deletion queue closed while pushing, shutting down? ({e})");
                Err(DeletionQueueError::ShuttingDown)
            }
        }
    }

    /// Send a `Recover` message carrying the set of attached tenants and their
    /// generations. The tests in this file send it once, right after creating a client.
    pub(crate) fn recover(
        &self,
        attached_tenants: HashMap,
    ) -> Result<(), DeletionQueueError> {
        self.do_push(
            &self.tx,
            ListWriterQueueMessage::Recover(RecoverOp { attached_tenants }),
        )
    }

    /// When a Timeline wishes to update the remote_consistent_lsn that it exposes to the outside
    /// world, it must validate its generation number before doing so. Rather than do this synchronously,
    /// we allow the timeline to publish updates at will via this API, and then read back what LSN was most
    /// recently validated separately.
    ///
    /// In this function we publish the LSN to the `projected` field of the timeline's entry in the VisibleLsnUpdates.
/// The backend will later wake up and notice that the tenant's generation requires validation.
    ///
    /// The body never awaits, so the std `RwLock` write guard taken here is not held
    /// across a suspension point.
    pub(crate) async fn update_remote_consistent_lsn(
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        current_generation: Generation,
        lsn: Lsn,
        result_slot: Arc,
    ) {
        let mut locked = self
            .lsn_table
            .write()
            .expect("Lock should never be poisoned");

        let tenant_entry = locked
            .tenants
            .entry(tenant_shard_id)
            .or_insert(TenantLsnState {
                timelines: HashMap::new(),
                generation: current_generation,
            });

        if tenant_entry.generation != current_generation {
            // Generation might have changed if we were detached and then re-attached: in this case,
            // state from the previous generation cannot be trusted.
            tenant_entry.timelines.clear();
            tenant_entry.generation = current_generation;
        }

        // Replaces any update previously published for this timeline.
        tenant_entry.timelines.insert(
            timeline_id,
            PendingLsn {
                projected: lsn,
                result_slot,
            },
        );
    }

    /// Submit a list of layers for deletion: this function will return before the deletion is
    /// persistent, but it may be executed at any time after this function enters: do not push
    /// layers until you're sure they can be deleted safely (i.e. remote metadata no longer
    /// references them).
    ///
    /// The `current_generation` is the generation of this pageserver's current attachment. The
    /// generations in `layers` are the generations in which those layers were written.
    ///
    /// Panics if `current_generation` is a none generation.
    pub(crate) fn push_layers(
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        current_generation: Generation,
        layers: Vec<(LayerName, LayerFileMetadata)>,
    ) -> Result<(), DeletionQueueError> {
        // None generations are not valid for attached tenants: they must always be attached in
        // a known generation. None generations are still permitted for layers in the index because
        // they may be historical.
assert!(!current_generation.is_none());

        // Counted at submission time, before the message is handed to the queue.
        metrics::DELETION_QUEUE
            .keys_submitted
            .inc_by(layers.len() as u64);
        self.do_push(
            &self.tx,
            ListWriterQueueMessage::Delete(DeletionOp {
                tenant_shard_id,
                timeline_id,
                layers,
                generation: current_generation,
                objects: Vec::new(),
            }),
        )
    }

    /// Push `msg` onto `queue`, then wait for `rx` to resolve. `rx` is expected to be the
    /// receiver paired with the `FlushOp` carried inside `msg`.
    ///
    /// This is cancel-safe. If you drop the future the flush may still happen in the background.
    async fn do_flush(
        &self,
        queue: &tokio::sync::mpsc::UnboundedSender,
        msg: T,
        rx: tokio::sync::oneshot::Receiver<()>,
    ) -> Result<(), DeletionQueueError> {
        self.do_push(queue, msg)?;
        if rx.await.is_err() {
            // This shouldn't happen if tenants are shut down before deletion queue. If we
            // encounter a bug like this, then a flusher will incorrectly believe it has flushed
            // when it hasn't, possibly leading to leaking objects.
            error!("Deletion queue dropped flush op while client was still waiting");
            Err(DeletionQueueError::ShuttingDown)
        } else {
            Ok(())
        }
    }

    /// Wait until all previous deletions are persistent (either executed, or written to a DeletionList)
    ///
    /// This is cancel-safe. If you drop the future the flush may still happen in the background.
    pub async fn flush(&self) -> Result<(), DeletionQueueError> {
        let (flush_op, rx) = FlushOp::new();
        self.do_flush(&self.tx, ListWriterQueueMessage::Flush(flush_op), rx)
            .await
    }

    /// Issue a flush without waiting for it to complete. This is useful on advisory flushes where
    /// the caller wants to avoid the risk of waiting for lots of enqueued work, such as on tenant
    /// detach where flushing is nice but not necessary.
    ///
    /// This function provides no guarantees of work being done.
    pub fn flush_advisory(&self) {
        // The receiver half is dropped immediately: nobody waits for the acknowledgement.
        let (flush_op, _) = FlushOp::new();

        // Transmit the flush message, ignoring any result (such as a closed channel during shutdown).
drop(self.tx.send(ListWriterQueueMessage::FlushExecute(flush_op)));
    }

    /// Wait until all previous deletions are executed.
    ///
    /// This runs three flushes in sequence and returns the first error encountered:
    /// `flush`, then a `FlushExecute` message on the same channel, then `flush_immediate`.
    pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> {
        debug!("flush_execute: flushing to deletion lists...");
        // Flush any buffered work to deletion lists
        self.flush().await?;

        // Flush the backend into the executor of deletion lists
        let (flush_op, rx) = FlushOp::new();
        debug!("flush_execute: flushing backend...");
        self.do_flush(&self.tx, ListWriterQueueMessage::FlushExecute(flush_op), rx)
            .await?;
        debug!("flush_execute: finished flushing backend...");

        // Flush any immediate-mode deletions (the above backend flush will only flush
        // the executor if deletions had flowed through the backend)
        debug!("flush_execute: flushing execution...");
        self.flush_immediate().await?;
        debug!("flush_execute: finished flushing execution...");
        Ok(())
    }

    /// This interface bypasses the persistent deletion queue, and any validation
    /// that this pageserver is still eligible to execute the deletions. It is for
    /// use in timeline deletions, where the control plane is telling us we may
    /// delete everything in the timeline.
    ///
    /// DO NOT USE THIS FROM GC OR COMPACTION CODE. Use the regular `push_layers`.
    ///
    /// Returns `ShuttingDown` if the executor channel is closed.
    pub(crate) async fn push_immediate(
        &self,
        objects: Vec,
    ) -> Result<(), DeletionQueueError> {
        metrics::DELETION_QUEUE
            .keys_submitted
            .inc_by(objects.len() as u64);
        self.executor_tx
            .send(DeleterMessage::Delete(objects))
            .await
            .map_err(|_| DeletionQueueError::ShuttingDown)
    }

    /// Companion to push_immediate. When this returns Ok, all prior objects sent
    /// into push_immediate have been deleted from remote storage.
pub(crate) async fn flush_immediate(&self) -> Result<(), DeletionQueueError> {
        let (flush_op, rx) = FlushOp::new();
        // Both a closed executor channel and a dropped flush op are reported as ShuttingDown.
        self.executor_tx
            .send(DeleterMessage::Flush(flush_op))
            .await
            .map_err(|_| DeletionQueueError::ShuttingDown)?;

        rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
    }
}

impl DeletionQueue {
    /// Hand out another client; this is a clone of the queue's own client and shares
    /// its channels and LSN table.
    pub fn new_client(&self) -> DeletionQueueClient {
        self.client.clone()
    }

    /// Caller may use the returned object to construct clients with new_client.
    /// Caller should tokio::spawn the background() members of the worker objects returned:
    /// we don't spawn those inside new() so that the caller can use their runtime/spans of choice.
    pub fn new(
        remote_storage: GenericRemoteStorage,
        controller_upcall_client: C,
        conf: &'static PageServerConf,
    ) -> (Self, DeletionQueueWorkers)
    where
        C: StorageControllerUpcallApi + Send + Sync,
    {
        // Unbounded channel: enables non-async functions to submit deletions. The actual length is
        // constrained by how promptly the ListWriter wakes up and drains it, which should be frequent
        // enough to avoid this taking pathologically large amount of memory.
        let (tx, rx) = tokio::sync::mpsc::unbounded_channel();

        // Shallow channel: it carries DeletionLists which each contain up to thousands of deletions
        let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16);

        // Shallow channel: it carries lists of paths, and we expect the main queueing to
        // happen in the backend (persistent), not in this queue.
        let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16);

        let lsn_table = Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new()));

        // The deletion queue has an independent cancellation token to
        // the general pageserver shutdown token, because it stays alive a bit
        // longer to flush after Tenants have all been torn down.
let cancel = CancellationToken::new();

        (
            Self {
                // The client gets its own sender to the executor so that immediate
                // deletions can bypass the frontend and backend.
                client: DeletionQueueClient {
                    tx,
                    executor_tx: executor_tx.clone(),
                    lsn_table: lsn_table.clone(),
                },
                cancel: cancel.clone(),
            },
            // Pipeline wiring: frontend (rx -> backend_tx), backend (backend_rx -> executor_tx),
            // executor (executor_rx). All three share the same cancellation token.
            DeletionQueueWorkers {
                frontend: ListWriter::new(conf, rx, backend_tx, cancel.clone()),
                backend: Validator::new(
                    conf,
                    backend_rx,
                    executor_tx,
                    controller_upcall_client,
                    lsn_table.clone(),
                    cancel.clone(),
                ),
                executor: Deleter::new(remote_storage, executor_rx, cancel.clone()),
            },
        )
    }

    /// Flush the queue, waiting at most `timeout`, log the outcome, and then cancel the
    /// queue's cancellation token. The token is cancelled whatever the flush outcome was.
    pub async fn shutdown(&mut self, timeout: Duration) {
        match tokio::time::timeout(timeout, self.client.flush()).await {
            Ok(Ok(())) => {
                tracing::info!("Deletion queue flushed successfully on shutdown")
            }
            Ok(Err(DeletionQueueError::ShuttingDown)) => {
                // This is not harmful for correctness, but is unexpected: the deletion
                // queue's workers should stay alive as long as there are any client handles instantiated.
                tracing::warn!("Deletion queue stopped prematurely");
            }
            Err(_timeout) => {
                tracing::warn!("Timed out flushing deletion queue on shutdown")
            }
        }

        // We only cancel _after_ flushing: otherwise we would be shutting down the
        // components that do the flush.
self.cancel.cancel();
    }
}

#[cfg(test)]
mod test {
    use std::io::ErrorKind;
    use std::time::Duration;

    use camino::Utf8Path;
    use hex_literal::hex;
    use pageserver_api::key::Key;
    use pageserver_api::models::ShardImportStatus;
    use pageserver_api::shard::ShardIndex;
    use pageserver_api::upcall_api::ReAttachResponseTenant;
    use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
    use tokio::task::JoinHandle;
    use tracing::info;

    use super::*;
    use crate::controller_upcall_client::RetryForeverError;
    use crate::tenant::harness::TenantHarness;
    use crate::tenant::storage_layer::DeltaLayerName;

    // Fixed timeline ID shared by every test in this module.
    pub const TIMELINE_ID: TimelineId =
        TimelineId::from_array(hex!("11223344556677881122334455667788"));

    // Example delta layer name used as the deletion victim in the tests.
    pub const EXAMPLE_LAYER_NAME: LayerName = LayerName::Delta(DeltaLayerName {
        key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
        lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51),
    });

    // When you need a second layer in a test.
    pub const EXAMPLE_LAYER_NAME_ALT: LayerName = LayerName::Delta(DeltaLayerName {
        key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
        lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61),
    });

    /// Everything a test needs: the tenant harness (for its config), a local-filesystem
    /// remote storage, the mock control plane, and a running deletion queue.
    struct TestSetup {
        harness: TenantHarness,
        remote_fs_dir: Utf8PathBuf,
        storage: GenericRemoteStorage,
        mock_control_plane: MockStorageController,
        deletion_queue: DeletionQueue,
        worker_join: JoinHandle<()>,
    }

    impl TestSetup {
        /// Simulate a pageserver restart by destroying and recreating the deletion queue
        async fn restart(&mut self) {
            let (deletion_queue, workers) = DeletionQueue::new(
                self.storage.clone(),
                self.mock_control_plane.clone(),
                self.harness.conf,
            );

            tracing::debug!("Spawning worker for new queue queue");
            let worker_join = workers.spawn_with(&tokio::runtime::Handle::current());

            // Swap the new queue in first, then tear down the old one.
            let old_worker_join = std::mem::replace(&mut self.worker_join, worker_join);
            let old_deletion_queue = std::mem::replace(&mut self.deletion_queue, deletion_queue);

            tracing::debug!("Joining worker from previous queue");
old_deletion_queue.cancel.cancel();
            old_worker_join
                .await
                .expect("Failed to join workers for previous deletion queue");
        }

        /// Record `gen_` in the mock control plane as the latest generation of the
        /// harness tenant.
        fn set_latest_generation(&self, gen_: Generation) {
            let tenant_shard_id = self.harness.tenant_shard_id;
            self.mock_control_plane
                .latest_generation
                .lock()
                .unwrap()
                .insert(tenant_shard_id, gen_);
        }

        /// Returns remote layer file name, suitable for use in assert_remote_files
        ///
        /// Writes a placeholder file named `file_name` plus the suffix of `gen_` under
        /// the remote path of `TIMELINE_ID`, creating the directory if needed.
        fn write_remote_layer(
            &self,
            file_name: LayerName,
            gen_: Generation,
        ) -> anyhow::Result {
            let tenant_shard_id = self.harness.tenant_shard_id;
            let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
            let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path());
            std::fs::create_dir_all(&remote_timeline_path)?;
            let remote_layer_file_name = format!("{}{}", file_name, gen_.get_suffix());

            let content: Vec = format!("placeholder contents of {file_name}").into();

            std::fs::write(
                remote_timeline_path.join(remote_layer_file_name.clone()),
                content,
            )?;

            Ok(remote_layer_file_name)
        }
    }

    /// Stand-in for the storage controller: only `validate` is implemented, backed by a
    /// shared map of the latest generation per tenant.
    #[derive(Debug, Clone)]
    struct MockStorageController {
        pub latest_generation: std::sync::Arc>>,
    }

    impl MockStorageController {
        /// Create a mock with no generations recorded.
        fn new() -> Self {
            Self {
                latest_generation: Arc::default(),
            }
        }
    }

    impl StorageControllerUpcallApi for MockStorageController {
        // Not exercised by the deletion queue tests.
        async fn re_attach(
            &self,
            _conf: &PageServerConf,
            _empty_local_disk: bool,
        ) -> Result, RetryForeverError> {
            unimplemented!()
        }

        // Reports a tenant as valid only when the requested generation equals the latest
        // one recorded for it. Tenants with no recorded generation are left out of the result.
        async fn validate(
            &self,
            tenants: Vec<(TenantShardId, Generation)>,
        ) -> Result, RetryForeverError> {
            let mut result = HashMap::new();

            let latest_generation = self.latest_generation.lock().unwrap();

            for (tenant_shard_id, generation) in tenants {
                if let Some(latest) = latest_generation.get(&tenant_shard_id) {
                    result.insert(tenant_shard_id, *latest == generation);
                }
            }

            Ok(result)
        }

        // Not exercised by the deletion queue tests.
        async fn put_timeline_import_status(
            &self,
            _tenant_shard_id: TenantShardId,
            _timeline_id: TimelineId,
            _generation: Generation,
            _status: pageserver_api::models::ShardImportStatus,
        ) -> Result<(),
RetryForeverError> {
            unimplemented!()
        }

        // Not exercised by the deletion queue tests.
        async fn get_timeline_import_status(
            &self,
            _tenant_shard_id: TenantShardId,
            _timeline_id: TimelineId,
            _generation: Generation,
        ) -> Result {
            unimplemented!()
        }
    }

    /// Build a `TestSetup`: a tenant harness named after the test, a local-filesystem
    /// remote storage rooted at `<workdir>/remote_fs`, a mock control plane, and a
    /// deletion queue whose workers are spawned on the current runtime.
    async fn setup(test_name: &str) -> anyhow::Result {
        // The name is deliberately leaked: `Box::leak` turns the formatted String into a
        // reference that lives for the rest of the process.
        let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}")));
        let harness = TenantHarness::create(test_name).await?;

        // We do not load() the harness: we only need its config and remote_storage

        // Set up a GenericRemoteStorage targeting a directory
        let remote_fs_dir = harness.conf.workdir.join("remote_fs");
        std::fs::create_dir_all(remote_fs_dir)?;
        // Canonicalize only after the directory has been created.
        let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
        let storage_config = RemoteStorageConfig {
            storage: RemoteStorageKind::LocalFs {
                local_path: remote_fs_dir.clone(),
            },
            timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
            small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
        };
        let storage = GenericRemoteStorage::from_config(&storage_config)
            .await
            .unwrap();

        let mock_control_plane = MockStorageController::new();

        let (deletion_queue, worker) =
            DeletionQueue::new(storage.clone(), mock_control_plane.clone(), harness.conf);

        let worker_join = worker.spawn_with(&tokio::runtime::Handle::current());

        Ok(TestSetup {
            harness,
            remote_fs_dir,
            storage,
            mock_control_plane,
            deletion_queue,
            worker_join,
        })
    }

    // TODO: put this in a common location so that we can share with remote_timeline_client's tests
    //
    // Asserts that the file names directly under `remote_path` are exactly `expected`
    // (order-insensitive). A missing directory is accepted only when `expected` is empty.
    fn assert_remote_files(expected: &[&str], remote_path: &Utf8Path) {
        let mut expected: Vec = expected.iter().map(|x| String::from(*x)).collect();
        expected.sort();

        let mut found: Vec = Vec::new();
        let dir = match std::fs::read_dir(remote_path) {
            Ok(d) => d,
            Err(e) => {
                if e.kind() == ErrorKind::NotFound {
                    if expected.is_empty() {
                        // We are asserting prefix is empty: it is expected that the dir is missing
                        return;
                    } else {
                        // `expected` is non-empty here, so this assertion always fails and
                        // reports the missing files.
                        assert_eq!(expected, Vec::::new());
                        unreachable!();
                    }
                } else {
                    panic!("Unexpected error listing {remote_path}: {e}");
                }
            }
        };

        for
entry in dir.flatten() {
            let entry_name = entry.file_name();
            let fname = entry_name.to_str().unwrap();
            found.push(String::from(fname));
        }
        found.sort();

        assert_eq!(expected, found);
    }

    /// Assert that the file names directly under `directory` are exactly `expected`.
    /// A directory that cannot be read at all is treated as empty. Unlike
    /// `assert_remote_files`, `expected` is not sorted here, so callers must pass names
    /// in sorted order.
    fn assert_local_files(expected: &[&str], directory: &Utf8Path) {
        let dir = match std::fs::read_dir(directory) {
            Ok(d) => d,
            Err(_) => {
                assert_eq!(expected, &Vec::::new());
                return;
            }
        };
        let mut found = Vec::new();
        for dentry in dir {
            let dentry = dentry.unwrap();
            let file_name = dentry.file_name();
            let file_name_str = file_name.to_string_lossy();
            found.push(file_name_str.to_string());
        }
        found.sort();
        assert_eq!(expected, found);
    }

    #[tokio::test]
    async fn deletion_queue_smoke() -> anyhow::Result<()> {
        // Basic test that the deletion queue processes the deletions we pass into it
        let ctx = setup("deletion_queue_smoke")
            .await
            .expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
        client.recover(HashMap::new())?;

        let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
        let tenant_shard_id = ctx.harness.tenant_shard_id;

        let content: Vec = "victim1 contents".into();
        let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
        let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
        let deletion_prefix = ctx.harness.conf.deletion_prefix();

        // Exercise the distinction between the generation of the layers
        // we delete, and the generation of the running Tenant.
let layer_generation = Generation::new(0xdeadbeef);
        let now_generation = Generation::new(0xfeedbeef);
        let layer_metadata =
            LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());

        // The remote object name carries the suffix of the generation the layer was written in.
        let remote_layer_file_name_1 =
            format!("{}{}", layer_file_name_1, layer_generation.get_suffix());

        // Set mock control plane state to valid for our generation
        ctx.set_latest_generation(now_generation);

        // Inject a victim file to remote storage
        info!("Writing");
        std::fs::create_dir_all(&remote_timeline_path)?;
        std::fs::write(
            remote_timeline_path.join(remote_layer_file_name_1.clone()),
            content,
        )?;
        assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);

        // File should still be there after we push it to the queue (we haven't pushed enough to flush anything)
        info!("Pushing");
        client.push_layers(
            tenant_shard_id,
            TIMELINE_ID,
            now_generation,
            [(layer_file_name_1.clone(), layer_metadata)].to_vec(),
        )?;
        assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);

        assert_local_files(&[], &deletion_prefix);

        // File should still be there after we write a deletion list (we haven't pushed enough to execute anything)
        info!("Flushing");
        client.flush().await?;
        assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
        assert_local_files(&["0000000000000001-01.list"], &deletion_prefix);

        // File should go away when we execute
        info!("Flush-executing");
        client.flush_execute().await?;
        assert_remote_files(&[], &remote_timeline_path);
        // Only the header remains locally: the executed list file is gone.
        assert_local_files(&["header-01"], &deletion_prefix);

        // Flushing on an empty queue should succeed immediately, and not write any lists
        info!("Flush-executing on empty");
        client.flush_execute().await?;
        assert_local_files(&["header-01"], &deletion_prefix);

        Ok(())
    }

    // Deletions pushed in a stale generation must be dropped by validation, while the
    // same deletion pushed in the latest generation must be executed.
    #[tokio::test]
    async fn deletion_queue_validation() -> anyhow::Result<()> {
        let ctx = setup("deletion_queue_validation")
            .await
            .expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
        client.recover(HashMap::new())?;

        // Generation that the
// control plane thinks is current
        let latest_generation = Generation::new(0xdeadbeef);
        // Generation that our DeletionQueue thinks the tenant is running with
        let stale_generation = latest_generation.previous();
        // Generation that our example layer file was written with
        let layer_generation = stale_generation.previous();
        let layer_metadata =
            LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());

        ctx.set_latest_generation(latest_generation);

        let tenant_shard_id = ctx.harness.tenant_shard_id;
        let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
        let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());

        // Initial state: a remote layer exists
        let remote_layer_name = ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
        assert_remote_files(&[&remote_layer_name], &remote_timeline_path);

        tracing::debug!("Pushing...");
        client.push_layers(
            tenant_shard_id,
            TIMELINE_ID,
            stale_generation,
            [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
        )?;

        // We enqueued the operation in a stale generation: it should have failed validation
        tracing::debug!("Flushing...");
        // The timeout turns a hung flush into a test failure instead of a stuck test run.
        tokio::time::timeout(Duration::from_secs(5), client.flush_execute()).await??;
        assert_remote_files(&[&remote_layer_name], &remote_timeline_path);

        tracing::debug!("Pushing...");
        client.push_layers(
            tenant_shard_id,
            TIMELINE_ID,
            latest_generation,
            [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
        )?;

        // We enqueued the operation in a fresh generation: it should have passed validation
        tracing::debug!("Flushing...");
        tokio::time::timeout(Duration::from_secs(5), client.flush_execute()).await??;
        assert_remote_files(&[], &remote_timeline_path);

        Ok(())
    }

    #[tokio::test]
    async fn deletion_queue_recovery() -> anyhow::Result<()> {
        // Test that deletion lists written before a restart are picked up again afterwards,
        // and that only deletions from the immediately previous generation are executed.
        let mut ctx = setup("deletion_queue_recovery")
            .await
            .expect("Failed test setup");
        let client =
ctx.deletion_queue.new_client();
        client.recover(HashMap::new())?;

        let tenant_shard_id = ctx.harness.tenant_shard_id;

        let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
        let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
        let deletion_prefix = ctx.harness.conf.deletion_prefix();

        let layer_generation = Generation::new(0xdeadbeef);
        let now_generation = Generation::new(0xfeedbeef);
        let layer_metadata =
            LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());

        // Inject a deletion in the generation before `now_generation`: once the restart
        // further down advances the generation, this one is two generations old and
        // should _not_ get executed (only the immediately previous generation gets
        // that treatment)
        let remote_layer_file_name_historical =
            ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
        client.push_layers(
            tenant_shard_id,
            TIMELINE_ID,
            now_generation.previous(),
            [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
        )?;

        // Inject a deletion in `now_generation` itself: after the restart this is the
        // immediately previous generation, so this deletion should get executed, because
        // we execute deletions in the immediately previous generation on the same node.
        let remote_layer_file_name_previous =
            ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
        client.push_layers(
            tenant_shard_id,
            TIMELINE_ID,
            now_generation,
            [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
        )?;

        // Persist both deletions to lists on disk; nothing is executed yet.
        client.flush().await?;
        assert_remote_files(
            &[
                &remote_layer_file_name_historical,
                &remote_layer_file_name_previous,
            ],
            &remote_timeline_path,
        );

        // Different generations for the same tenant will cause two separate
        // deletion lists to be emitted.
assert_local_files(
            &["0000000000000001-01.list", "0000000000000002-01.list"],
            &deletion_prefix,
        );

        // Simulate a node restart: the latest generation advances
        let now_generation = now_generation.next();
        ctx.set_latest_generation(now_generation);

        // Restart the deletion queue
        drop(client);
        ctx.restart().await;
        let client = ctx.deletion_queue.new_client();
        client.recover(HashMap::from([(tenant_shard_id, now_generation)]))?;

        info!("Flush-executing");
        client.flush_execute().await?;
        // The deletion from immediately prior generation was executed, the one from
        // an older generation was not.
        assert_remote_files(&[&remote_layer_file_name_historical], &remote_timeline_path);
        Ok(())
    }
}

/// A lightweight queue which can issue ordinary DeletionQueueClient objects, but doesn't do any persistence
/// or coalescing, and doesn't actually execute any deletions unless you call pump() to kick it.
#[cfg(test)]
pub(crate) mod mock {
    use std::sync::atomic::{AtomicUsize, Ordering};

    use tracing::info;

    use super::*;
    use crate::tenant::remote_timeline_client::remote_layer_path;

    /// Receiving side of the mock queue: owns both receivers and a counter of deletions
    /// attempted so far.
    pub struct ConsumerState {
        rx: tokio::sync::mpsc::UnboundedReceiver,
        executor_rx: tokio::sync::mpsc::Receiver,
        cancel: CancellationToken,
        executed: Arc,
    }

    impl ConsumerState {
        /// Drain both channels until either of them is closed, deleting every submitted
        /// object from `remote_storage` inline and acknowledging flush messages.
        async fn consume(&mut self, remote_storage: &GenericRemoteStorage) {
            info!("Executing all pending deletions");

            // Transform all executor messages to generic frontend messages
            loop {
                use either::Either;
                let msg = tokio::select! {
                    left = self.executor_rx.recv() => Either::Left(left),
                    right = self.rx.recv() => Either::Right(right),
                };
                match msg {
                    // Either channel closing ends the loop.
                    Either::Left(None) => break,
                    Either::Right(None) => break,
                    Either::Left(Some(DeleterMessage::Delete(objects))) => {
                        for path in objects {
                            match remote_storage.delete(&path, &self.cancel).await {
                                Ok(_) => {
                                    debug!("Deleted {path}");
                                }
                                Err(e) => {
                                    error!("Failed to delete {path}, leaking object!
({e})");
                                }
                            }
                            // Counted whether or not the delete succeeded.
                            self.executed.fetch_add(1, Ordering::Relaxed);
                        }
                    }
                    Either::Left(Some(DeleterMessage::Flush(flush_op))) => {
                        flush_op.notify();
                    }
                    Either::Right(Some(ListWriterQueueMessage::Delete(op))) => {
                        // Start from any pre-built object paths, then add one remote path
                        // per layer.
                        let mut objects = op.objects;
                        for (layer, meta) in op.layers {
                            objects.push(remote_layer_path(
                                &op.tenant_shard_id.tenant_id,
                                &op.timeline_id,
                                meta.shard,
                                &layer,
                                meta.generation,
                            ));
                        }

                        for path in objects {
                            info!("Executing deletion {path}");
                            match remote_storage.delete(&path, &self.cancel).await {
                                Ok(_) => {
                                    debug!("Deleted {path}");
                                }
                                Err(e) => {
                                    error!("Failed to delete {path}, leaking object! ({e})");
                                }
                            }
                            // Counted whether or not the delete succeeded.
                            self.executed.fetch_add(1, Ordering::Relaxed);
                        }
                    }
                    Either::Right(Some(ListWriterQueueMessage::Flush(op))) => {
                        op.notify();
                    }
                    Either::Right(Some(ListWriterQueueMessage::FlushExecute(op))) => {
                        // We have already executed all prior deletions because mock does them inline
                        op.notify();
                    }
                    Either::Right(Some(ListWriterQueueMessage::Recover(_))) => {
                        // no-op in mock
                    }
                }
            }
        }
    }

    /// Sending side of the mock queue; hands out ordinary `DeletionQueueClient`s.
    pub struct MockDeletionQueue {
        tx: tokio::sync::mpsc::UnboundedSender,
        executor_tx: tokio::sync::mpsc::Sender,
        lsn_table: Arc>,
    }

    impl MockDeletionQueue {
        /// Create the queue and spawn its consumer task on the current Tokio runtime.
        ///
        /// When `remote_storage` is `None` the spawned task returns immediately, which
        /// drops the consumer together with both receivers.
        pub fn new(remote_storage: Option) -> Self {
            let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
            let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16384);

            let executed = Arc::new(AtomicUsize::new(0));

            let mut consumer = ConsumerState {
                rx,
                executor_rx,
                cancel: CancellationToken::new(),
                executed: executed.clone(),
            };

            tokio::spawn(async move {
                if let Some(remote_storage) = &remote_storage {
                    consumer.consume(remote_storage).await;
                }
            });

            Self {
                tx,
                executor_tx,
                lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())),
            }
        }

        /// Send a flush message on the executor channel and wait for it to be acknowledged.
        ///
        /// Panics if the executor channel is closed; a dropped acknowledgement is ignored.
        #[allow(clippy::await_holding_lock)]
        pub async fn pump(&self) {
            let (tx, rx) = tokio::sync::oneshot::channel();
            self.executor_tx
                .send(DeleterMessage::Flush(FlushOp { tx }))
                .await
                .expect("Failed to send flush message");
            rx.await.ok();
        }

        /// Build a client wired to this mock's channels and LSN table.
        pub(crate) fn new_client(&self) -> DeletionQueueClient {
DeletionQueueClient {
                tx: self.tx.clone(),
                executor_tx: self.executor_tx.clone(),
                lsn_table: self.lsn_table.clone(),
            }
        }
    }

    /// Test round-trip serialization/deserialization, and test stability of the format
    /// vs. a static expected string for the serialized version.
    #[test]
    fn deletion_list_serialization() -> anyhow::Result<()> {
        let tenant_id = "ad6c1a56f5680419d3a16ff55d97ec3c"
            .to_string()
            .parse::()?;
        let timeline_id = "be322c834ed9e709e63b5c9698691910"
            .to_string()
            .parse::()?;
        let generation = Generation::new(123);

        // The object path must start with the timeline's remote prefix: `push` strips that
        // prefix and panics if it is absent.
        let object =
            RemotePath::from_string(&format!("tenants/{tenant_id}/timelines/{timeline_id}/foo"))?;
        let mut objects = [object].to_vec();

        let mut example = DeletionList::new(1);
        example.push(&tenant_id, &timeline_id, generation, &mut objects);

        let encoded = serde_json::to_string(&example)?;

        // `validated` is false here and is skipped on serialization, so it does not appear below.
        let expected = "{\"version\":1,\"sequence\":1,\"tenants\":{\"ad6c1a56f5680419d3a16ff55d97ec3c\":{\"timelines\":{\"be322c834ed9e709e63b5c9698691910\":[\"foo\"]},\"generation\":123}},\"size\":1}".to_string();
        assert_eq!(encoded, expected);

        let decoded = serde_json::from_str::(&encoded)?;
        assert_eq!(example, decoded);

        Ok(())
    }
}

================================================
FILE: pageserver/src/disk_usage_eviction_task.rs
================================================
//! This module implements the pageserver-global disk-usage-based layer eviction task.
//!
//! # Mechanics
//!
//! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background
//! loop that evicts layers in response to a shortage of available bytes
//! in the $repo/tenants directory's filesystem.
//!
//! The loop runs periodically at a configurable `period`.
//!
//! Each loop iteration uses `statvfs` to determine filesystem-level space usage.
//! It compares the returned usage data against two different types of thresholds.
//! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds.
//!
//! We cross-check this internal accounting with the real world by making another `statvfs` call at the end of the iteration.
//! We're good if that second `statvfs` shows that we're _actually_ below the configured thresholds.
//! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further.
//!
//! # Eviction Policy
//!
//! There are two thresholds:
//! `max_usage_pct` is the maximum relative used space, expressed in percent of the total filesystem space.
//! If the actual usage is higher, the threshold is exceeded.
//! `min_avail_bytes` is the minimum absolute available space in bytes.
//! If the actual available space is lower, the threshold is exceeded.
//! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and eviction
//! is performed on the next iteration, to release disk space and bring the usage below the thresholds again.
//! The iteration evicts layers in LRU fashion, but with a weak reservation per tenant.
//! The reservation is to keep the most recently accessed X bytes per tenant resident.
//! If we cannot relieve pressure by evicting layers outside of the reservation, we
//! start evicting layers that are part of the reservation, LRU first.
//!
//! The value for the per-tenant reservation is referred to as `tenant_min_resident_size`
//! throughout the code, but no actual variable carries that name.
//! The per-tenant default value is the `max(tenant's layer file sizes, regardless of local or remote)`.
//! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress
//! during page reconstruction.
//! An alternative default for all tenants can be specified in the `tenant_config` section of the config.
//! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`).
// Implementation notes: // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl // reading these fields. We use the Debug impl for semi-structured logging, though. use std::sync::Arc; use std::time::SystemTime; use anyhow::Context; use pageserver_api::config::DiskUsageEvictionTaskConfig; use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; use serde::Serialize; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, instrument, warn}; use utils::completion; use utils::id::TimelineId; use crate::config::PageServerConf; use crate::metrics::disk_usage_based_eviction::METRICS; use crate::task_mgr::{self, BACKGROUND_RUNTIME}; use crate::tenant::mgr::TenantManager; use crate::tenant::remote_timeline_client::LayerFileMetadata; use crate::tenant::secondary::SecondaryTenant; use crate::tenant::storage_layer::{ AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint, }; use crate::tenant::tasks::sleep_random; use crate::{CancellableTask, DiskUsageEvictionTask}; /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` /// partitioning. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum EvictionOrder { /// Order the layers to be evicted by how recently they have been accessed relatively within /// the set of resident layers of a tenant. RelativeAccessed { /// Determines if the tenant with most layers should lose first. /// /// Having this enabled is currently the only reasonable option, because the order in which /// we read tenants is deterministic. If we find the need to use this as `false`, we need /// to ensure nondeterminism by adding in a random number to break the /// `relative_last_activity==0.0` ties. 
highest_layer_count_loses_first: bool, }, } impl From for EvictionOrder { fn from(value: pageserver_api::config::EvictionOrder) -> Self { match value { pageserver_api::config::EvictionOrder::RelativeAccessed { highest_layer_count_loses_first, } => Self::RelativeAccessed { highest_layer_count_loses_first, }, } } } impl EvictionOrder { fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) { use EvictionOrder::*; match self { RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| { (*partition, candidate.relative_last_activity) }), } } /// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating tenants /// layers in **most** recently used order. fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 { use EvictionOrder::*; match self { RelativeAccessed { highest_layer_count_loses_first, } => { // keeping the -1 or not decides if every tenant should lose their least recently accessed // layer OR if this should happen in the order of having highest layer count: let fudge = if *highest_layer_count_loses_first { // relative_last_activity vs. tenant layer count: // - 0.1..=1.0 (10 layers) // - 0.01..=1.0 (100 layers) // - 0.001..=1.0 (1000 layers) // // leading to evicting less of the smallest tenants. 0 } else { // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could // be that less than 10k layer evictions is enough, so we would not need to evict from // all tenants. // // as the tenant ordering is now deterministic this could hit the same tenants // disproportionetly on multiple invocations. alternative could be to remember how many // layers did we evict last time from this tenant, and inject that as an additional // fudge here. 
1 }; let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1); let divider = total as f32; // most recently used is always (total - 0) / divider == 1.0 // least recently used depends on the fudge: // - (total - 1) - (total - 1) / total => 0 / total // - total - (total - 1) / total => 1 / total let distance = (total - index) as f32; finite_f32::FiniteF32::try_from_normalized(distance / divider) .unwrap_or_else(|val| { tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}"); finite_f32::FiniteF32::ZERO }) } } } } #[derive(Default)] pub struct State { /// Exclude http requests and background task from running at the same time. mutex: tokio::sync::Mutex<()>, } pub fn launch_disk_usage_global_eviction_task( conf: &'static PageServerConf, storage: GenericRemoteStorage, state: Arc, tenant_manager: Arc, background_jobs_barrier: completion::Barrier, ) -> Option { let task_config = &conf.disk_usage_based_eviction; if !task_config.enabled { info!("disk usage based eviction task not configured"); return None; }; info!("launching disk usage based eviction task"); let cancel = CancellationToken::new(); let task = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "disk usage based eviction", { let cancel = cancel.clone(); async move { // wait until initial load is complete, because we cannot evict from loading tenants. tokio::select! { _ = cancel.cancelled() => { return anyhow::Ok(()); }, _ = background_jobs_barrier.wait() => { } }; disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel) .await; anyhow::Ok(()) } }, )); Some(DiskUsageEvictionTask(CancellableTask { cancel, task })) } #[instrument(skip_all)] async fn disk_usage_eviction_task( state: &State, task_config: &DiskUsageEvictionTaskConfig, storage: &GenericRemoteStorage, tenant_manager: Arc, cancel: CancellationToken, ) { scopeguard::defer! 
{ info!("disk usage based eviction task finishing"); }; if sleep_random(task_config.period, &cancel).await.is_err() { return; } let mut iteration_no = 0; loop { iteration_no += 1; let start = Instant::now(); async { let res = disk_usage_eviction_task_iteration( state, task_config, storage, &tenant_manager, &cancel, ) .await; match res { Ok(()) => {} Err(e) => { // these stat failures are expected to be very rare warn!("iteration failed, unexpected error: {e:#}"); } } } .instrument(tracing::info_span!("iteration", iteration_no)) .await; let sleep_until = start + task_config.period; if tokio::time::timeout_at(sleep_until, cancel.cancelled()) .await .is_ok() { break; } } } pub trait Usage: Clone + Copy + std::fmt::Debug { fn has_pressure(&self) -> bool; fn add_available_bytes(&mut self, bytes: u64); } async fn disk_usage_eviction_task_iteration( state: &State, task_config: &DiskUsageEvictionTaskConfig, storage: &GenericRemoteStorage, tenant_manager: &Arc, cancel: &CancellationToken, ) -> anyhow::Result<()> { let tenants_dir = tenant_manager.get_conf().tenants_path(); let usage_pre = filesystem_level_usage::get(&tenants_dir, task_config) .context("get filesystem-level disk usage before evictions")?; let res = disk_usage_eviction_task_iteration_impl( state, storage, usage_pre, tenant_manager, task_config.eviction_order.into(), cancel, ) .await; match res { Ok(outcome) => { debug!(?outcome, "disk_usage_eviction_iteration finished"); match outcome { IterationOutcome::NoPressure | IterationOutcome::Cancelled => { // nothing to do, select statement below will handle things } IterationOutcome::Finished(outcome) => { // Verify with statvfs whether we made any real progress let after = filesystem_level_usage::get(&tenants_dir, task_config) // It's quite unlikely to hit the error here. Keep the code simple and bail out. 
.context("get filesystem-level disk usage after evictions")?; debug!(?after, "disk usage"); if after.has_pressure() { // Don't bother doing an out-of-order iteration here now. // In practice, the task period is set to a value in the tens-of-seconds range, // which will cause another iteration to happen soon enough. // TODO: deltas between the three different usages would be helpful, // consider MiB, GiB, TiB warn!(?outcome, ?after, "disk usage still high"); } else { info!(?outcome, ?after, "disk usage pressure relieved"); } } } } Err(e) => { error!("disk_usage_eviction_iteration failed: {:#}", e); } } Ok(()) } #[derive(Debug, Serialize)] #[allow(clippy::large_enum_variant)] pub enum IterationOutcome { NoPressure, Cancelled, Finished(IterationOutcomeFinished), } #[derive(Debug, Serialize)] pub struct IterationOutcomeFinished { /// The actual usage observed before we started the iteration. before: U, /// The expected value for `after`, according to internal accounting, after phase 1. planned: PlannedUsage, /// The outcome of phase 2, where we actually do the evictions. /// /// If all layers that phase 1 planned to evict _can_ actually get evicted, this will /// be the same as `planned`. assumed: AssumedUsage, } #[derive(Debug, Serialize)] struct AssumedUsage { /// The expected value for `after`, after phase 2. projected_after: U, /// The layers we failed to evict during phase 2. 
failed: LayerCount, } #[derive(Debug, Serialize)] struct PlannedUsage { respecting_tenant_min_resident_size: U, fallback_to_global_lru: Option, } #[derive(Debug, Default, Serialize)] struct LayerCount { file_sizes: u64, count: usize, } pub(crate) async fn disk_usage_eviction_task_iteration_impl( state: &State, _storage: &GenericRemoteStorage, usage_pre: U, tenant_manager: &Arc, eviction_order: EvictionOrder, cancel: &CancellationToken, ) -> anyhow::Result> { // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex) let _g = state .mutex .try_lock() .map_err(|_| anyhow::anyhow!("iteration is already executing"))?; debug!(?usage_pre, "disk usage"); if !usage_pre.has_pressure() { return Ok(IterationOutcome::NoPressure); } warn!( ?usage_pre, "running disk usage based eviction due to pressure" ); let (candidates, collection_time) = { let started_at = std::time::Instant::now(); match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? { EvictionCandidates::Cancelled => { return Ok(IterationOutcome::Cancelled); } EvictionCandidates::Finished(partitioned) => (partitioned, started_at.elapsed()), } }; METRICS.layers_collected.inc_by(candidates.len() as u64); tracing::info!( elapsed_ms = collection_time.as_millis(), total_layers = candidates.len(), "collection completed" ); // Debug-log the list of candidates let now = SystemTime::now(); for (i, (partition, candidate)) in candidates.iter().enumerate() { let nth = i + 1; let total_candidates = candidates.len(); let size = candidate.layer.get_file_size(); let rel = candidate.relative_last_activity; debug!( "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}", now.duration_since(candidate.last_activity_ts) .unwrap() .as_micros(), candidate.layer.get_tenant_shard_id(), candidate.layer.get_timeline_id(), candidate.layer.get_name(), ); } // phase1: select victims to relieve pressure // // Walk through the list of candidates, 
until we have accumulated enough layers to get // us back under the pressure threshold. 'usage_planned' is updated so that it tracks // how much disk space would be used after evicting all the layers up to the current // point in the list. // // If we get far enough in the list that we start to evict layers that are below // the tenant's min-resident-size threshold, print a warning, and memorize the disk // usage at that point, in 'usage_planned_min_resident_size_respecting'. let (evicted_amount, usage_planned) = select_victims(&candidates, usage_pre).into_amount_and_planned(); METRICS.layers_selected.inc_by(evicted_amount as u64); // phase2: evict layers let mut js = tokio::task::JoinSet::new(); let limit = 1000; let mut evicted = candidates.into_iter().take(evicted_amount).fuse(); let mut consumed_all = false; // After the evictions, `usage_assumed` is the post-eviction usage, // according to internal accounting. let mut usage_assumed = usage_pre; let mut evictions_failed = LayerCount::default(); let evict_layers = async move { loop { let next = if js.len() >= limit || consumed_all { js.join_next().await } else if !js.is_empty() { // opportunistically consume ready result, one per each new evicted futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x) } else { None }; if let Some(next) = next { match next { Ok(Ok(file_size)) => { METRICS.layers_evicted.inc(); /*BEGIN_HADRON */ METRICS.bytes_evicted.inc_by(file_size); /*END_HADRON */ usage_assumed.add_available_bytes(file_size); } Ok(Err(( file_size, EvictionError::NotFound | EvictionError::Downloaded | EvictionError::Timeout, ))) => { evictions_failed.file_sizes += file_size; evictions_failed.count += 1; } Err(je) if je.is_cancelled() => unreachable!("not used"), Err(je) if je.is_panic() => { /* already logged */ } Err(je) => tracing::error!("unknown JoinError: {je:?}"), } } if consumed_all && js.is_empty() { break; } // calling again when consumed_all is fine as evicted is fused. 
let Some((_partition, candidate)) = evicted.next() else { if !consumed_all { tracing::info!("all evictions started, waiting"); consumed_all = true; } continue; }; match candidate.layer { EvictionLayer::Attached(layer) => { let file_size = layer.layer_desc().file_size; js.spawn(async move { // have a low eviction waiting timeout because our LRU calculations go stale fast; // also individual layer evictions could hang because of bugs and we do not want to // pause disk_usage_based_eviction for such. let timeout = std::time::Duration::from_secs(5); match layer.evict_and_wait(timeout).await { Ok(()) => Ok(file_size), Err(e) => Err((file_size, e)), } }); } EvictionLayer::Secondary(layer) => { let file_size = layer.metadata.file_size; js.spawn(async move { layer .secondary_tenant .evict_layer(layer.timeline_id, layer.name) .await; Ok(file_size) }); } } tokio::task::yield_now().await; } (usage_assumed, evictions_failed) }; let started_at = std::time::Instant::now(); let evict_layers = async move { let mut evict_layers = std::pin::pin!(evict_layers); let maximum_expected = std::time::Duration::from_secs(10); let res = tokio::time::timeout(maximum_expected, &mut evict_layers).await; let tuple = if let Ok(tuple) = res { tuple } else { let elapsed = started_at.elapsed(); tracing::info!(elapsed_ms = elapsed.as_millis(), "still ongoing"); evict_layers.await }; let elapsed = started_at.elapsed(); tracing::info!(elapsed_ms = elapsed.as_millis(), "completed"); tuple }; let evict_layers = evict_layers.instrument(tracing::info_span!("evict_layers", layers=%evicted_amount)); let (usage_assumed, evictions_failed) = tokio::select! 
{ tuple = evict_layers => { tuple }, _ = cancel.cancelled() => { // dropping joinset will abort all pending evict_and_waits and that is fine, our // requests will still stand return Ok(IterationOutcome::Cancelled); } }; Ok(IterationOutcome::Finished(IterationOutcomeFinished { before: usage_pre, planned: usage_planned, assumed: AssumedUsage { projected_after: usage_assumed, failed: evictions_failed, }, })) } #[derive(Clone)] pub(crate) struct EvictionSecondaryLayer { pub(crate) secondary_tenant: Arc, pub(crate) timeline_id: TimelineId, pub(crate) name: LayerName, pub(crate) metadata: LayerFileMetadata, } /// Full [`Layer`] objects are specific to tenants in attached mode. This type is a layer /// of indirection to store either a `Layer`, or a reference to a secondary tenant and a layer name. #[derive(Clone)] pub(crate) enum EvictionLayer { Attached(Layer), Secondary(EvictionSecondaryLayer), } impl From for EvictionLayer { fn from(value: Layer) -> Self { Self::Attached(value) } } impl EvictionLayer { pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId { match self { Self::Attached(l) => &l.layer_desc().tenant_shard_id, Self::Secondary(sl) => sl.secondary_tenant.get_tenant_shard_id(), } } pub(crate) fn get_timeline_id(&self) -> &TimelineId { match self { Self::Attached(l) => &l.layer_desc().timeline_id, Self::Secondary(sl) => &sl.timeline_id, } } pub(crate) fn get_name(&self) -> LayerName { match self { Self::Attached(l) => l.layer_desc().layer_name(), Self::Secondary(sl) => sl.name.clone(), } } pub(crate) fn get_file_size(&self) -> u64 { match self { Self::Attached(l) => l.layer_desc().file_size, Self::Secondary(sl) => sl.metadata.file_size, } } } #[derive(Clone)] pub(crate) struct EvictionCandidate { pub(crate) layer: EvictionLayer, pub(crate) last_activity_ts: SystemTime, pub(crate) relative_last_activity: finite_f32::FiniteF32, pub(crate) visibility: LayerVisibilityHint, } impl std::fmt::Display for EvictionLayer { fn fmt(&self, f: &mut std::fmt::Formatter) 
/// Coarse eviction priority class of a layer.
///
/// `Ord` is derived, so the declaration order of the variants is the eviction order:
/// `EvictNow < Above < Below`. Candidates are sorted by this partition first and by their
/// relative last activity second, so do not reorder the variants without revisiting the sort.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum EvictionPartition {
    // A layer that is un-wanted by the tenant: evict all these first, before considering
    // any other layers
    EvictNow,

    // Above the minimum size threshold: this layer is a candidate for eviction.
    Above,

    // Below the minimum size threshold: this layer should only be evicted if all the
    // tenants' layers above the minimum size threshold have already been considered.
    Below,
}
/// /// # Example with EvictionOrder::AbsoluteAccessed /// /// Imagine that there are two tenants, A and B, with five layers each, a-e. /// Each layer has size 100, and both tenants' min_resident_size is 150. /// The eviction order would be /// /// ```text /// partition last_activity_ts tenant/layer /// Above 18:30 A/c /// Above 19:00 A/b /// Above 18:29 B/c /// Above 19:05 B/b /// Above 20:00 B/a /// Above 20:03 A/a /// Below 20:30 A/d /// Below 20:40 B/d /// Below 20:45 B/e /// Below 20:58 A/e /// ``` /// /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`. /// They are all in the `Above` partition, so, we respected each tenant's min_resident_size. /// /// But, if we need to evict 900 bytes to relieve pressure, we'd evict /// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition /// after exhausting the `Above` partition. /// So, we did not respect each tenant's min_resident_size. /// /// # Example with EvictionOrder::RelativeAccessed /// /// ```text /// partition relative_age last_activity_ts tenant/layer /// Above 0/4 18:30 A/c /// Above 0/4 18:29 B/c /// Above 1/4 19:00 A/b /// Above 1/4 19:05 B/b /// Above 2/4 20:00 B/a /// Above 2/4 20:03 A/a /// Below 3/4 20:30 A/d /// Below 3/4 20:40 B/d /// Below 4/4 20:45 B/e /// Below 4/4 20:58 A/e /// ``` /// /// With tenants having the same number of layers the picture does not change much. The same with /// A having many more layers **resident** (not all of them listed): /// /// ```text /// Above 0/100 18:30 A/c /// Above 0/4 18:29 B/c /// Above 1/100 19:00 A/b /// Above 2/100 20:03 A/a /// Above 3/100 20:03 A/nth_3 /// Above 4/100 20:03 A/nth_4 /// ... /// Above 1/4 19:05 B/b /// Above 25/100 20:04 A/nth_25 /// ... /// Above 2/4 20:00 B/a /// Above 50/100 20:10 A/nth_50 /// ...
/// Below   3/4      20:40   B/d
/// Below   99/100   20:30   A/nth_99
/// Below   4/4      20:45   B/e
/// Below   100/100  20:58   A/nth_100
/// ```
///
/// Now it's easier to see that because A has grown fast it has more layers to get evicted. What is
/// difficult to see is what happens on the next round, assuming that evicting 23 layers from the
/// above list relieves the pressure (22 A layers gone, 1 B layer gone) but a new fast growing
/// tenant C has appeared:
///
/// ```text
/// Above   0/87     20:04   A/nth_23
/// Above   0/3      19:05   B/b
/// Above   0/50     20:59   C/nth_0
/// Above   1/87     20:04   A/nth_24
/// Above   1/50     21:00   C/nth_1
/// Above   2/87     20:04   A/nth_25
/// ...
/// Above   16/50    21:02   C/nth_16
/// Above   1/3      20:00   B/a
/// Above   27/87    20:10   A/nth_50
/// ...
/// Below   2/3      20:40   B/d
/// Below   49/50    21:05   C/nth_49
/// Below   86/87    20:30   A/nth_99
/// Below   3/3      20:45   B/e
/// Below   50/50    21:05   C/nth_50
/// Below   87/87    20:58   A/nth_100
/// ```
///
/// Now relieving pressure with 23 layers would cost:
/// - tenant A 14 layers
/// - tenant B 1 layer
/// - tenant C 8 layers
async fn collect_eviction_candidates(
    tenant_manager: &Arc,
    eviction_order: EvictionOrder,
    cancel: &CancellationToken,
) -> anyhow::Result {
    // Per-tenant collection slower than this is reported at info level.
    const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);

    // get a snapshot of the list of tenants
    let tenants = tenant_manager
        .list_tenants()
        .context("get list of tenants")?;

    // TODO: avoid listing every layer in every tenant: this loop can block the executor,
    // and the resulting data structure can be huge.
    // (https://github.com/neondatabase/neon/issues/6224)
    let mut candidates = Vec::new();

    // Part 1: attached tenant shards.
    for (tenant_id, _state, _gen) in tenants {
        if cancel.is_cancelled() {
            return Ok(EvictionCandidates::Cancelled);
        }
        let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) {
            Ok(tenant) if tenant.is_active() => tenant,
            Ok(_) => {
                debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active");
                continue;
            }
            Err(e) => {
                // this can happen if tenant has lifecycle transition after we fetched it
                debug!("failed to get tenant: {e:#}");
                continue;
            }
        };

        if tenant.cancel.is_cancelled() {
            info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
            continue;
        }

        let started_at = std::time::Instant::now();

        // collect layers from all timelines in this tenant
        //
        // If one of the timelines becomes `!is_active()` during the iteration,
        // for example because we're shutting down, then `max_layer_size` can be too small.
        // That's OK. This code only runs under a disk pressure situation, and being
        // a little unfair to tenants during shutdown in such a situation is tolerable.
        let mut tenant_candidates = Vec::new();
        let mut max_layer_size = 0;
        for tl in tenant.list_timelines() {
            if !tl.is_active() {
                continue;
            }
            let info = tl.get_local_layers_for_disk_usage_eviction().await;
            debug!(
                tenant_id=%tl.tenant_shard_id.tenant_id,
                shard_id=%tl.tenant_shard_id.shard_slug(),
                timeline_id=%tl.timeline_id,
                "timeline resident layers count: {}", info.resident_layers.len()
            );

            tenant_candidates.extend(info.resident_layers.into_iter());
            max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));

            if cancel.is_cancelled() {
                return Ok(EvictionCandidates::Cancelled);
            }
        }

        // Also consider layers of timelines being imported for eviction
        // (unlike the loop above, there is no `is_active()` filter here)
        for tl in tenant.list_importing_timelines() {
            let info = tl.timeline.get_local_layers_for_disk_usage_eviction().await;
            debug!(
                tenant_id=%tl.timeline.tenant_shard_id.tenant_id,
                shard_id=%tl.timeline.tenant_shard_id.shard_slug(),
                timeline_id=%tl.timeline.timeline_id,
                "timeline resident layers count: {}", info.resident_layers.len()
            );

            tenant_candidates.extend(info.resident_layers.into_iter());
            max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));

            if cancel.is_cancelled() {
                return Ok(EvictionCandidates::Cancelled);
            }
        }

        // `min_resident_size` defaults to maximum layer file size of the tenant.
        // This ensures that each tenant can have at least one layer resident at a given time,
        // ensuring forward progress for a single Timeline::get in that tenant.
        // It's a questionable heuristic since, usually, there are many Timeline::get
        // requests going on for a tenant, and, at least in Neon prod, the median
        // layer file size is much smaller than the compaction target size.
        // We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
        // That's what's typically used by the various background loops.
        //
        // The default can be overridden with a fixed value in the tenant conf.
        // A default override can be put in the default tenant conf in the pageserver.toml.
        let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
            debug!(
                tenant_id=%tenant.tenant_shard_id().tenant_id,
                shard_id=%tenant.tenant_shard_id().shard_slug(),
                overridden_size=s,
                "using overridden min resident size for tenant"
            );
            s
        } else {
            debug!(
                tenant_id=%tenant.tenant_shard_id().tenant_id,
                shard_id=%tenant.tenant_shard_id().shard_slug(),
                max_layer_size,
                "using max layer size as min_resident_size for tenant",
            );
            max_layer_size
        };

        // Sort layers most-recently-used first, then calculate [`EvictionPartition`] for each layer,
        // where the inputs are:
        // - whether the layer is visible
        // - whether the layer is above/below the min_resident_size cutline
        tenant_candidates
            .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
        // Running sum of the file sizes of the visible layers seen so far (most recent first).
        let mut cumsum: i128 = 0;

        let total = tenant_candidates.len();

        // NB: this is a lazy iterator; the closure (and the `cumsum` updates) only run when
        // `candidates.extend(..)` below consumes it.
        let tenant_candidates =
            tenant_candidates
                .into_iter()
                .enumerate()
                .map(|(i, mut candidate)| {
                    // as we iterate this reverse sorted list, the most recently accessed layer will always
                    // be 1.0; this is for us to evict it last.
                    candidate.relative_last_activity =
                        eviction_order.relative_last_activity(total, i);

                    let partition = match candidate.visibility {
                        LayerVisibilityHint::Covered => {
                            // Covered layers are evicted first
                            // (they do not count towards `cumsum`)
                            EvictionPartition::EvictNow
                        }
                        LayerVisibilityHint::Visible => {
                            cumsum += i128::from(candidate.layer.get_file_size());
                            if cumsum > min_resident_size as i128 {
                                EvictionPartition::Above
                            } else {
                                // The most recent layers below the min_resident_size threshold
                                // are the last to be evicted.
                                EvictionPartition::Below
                            }
                        }
                    };

                    (partition, candidate)
                });

        // `len()` is the number of items the not-yet-consumed iterator will yield.
        METRICS
            .tenant_layer_count
            .observe(tenant_candidates.len() as f64);

        candidates.extend(tenant_candidates);

        let elapsed = started_at.elapsed();
        METRICS
            .tenant_collection_time
            .observe(elapsed.as_secs_f64());

        if elapsed > LOG_DURATION_THRESHOLD {
            tracing::info!(
                tenant_id=%tenant.tenant_shard_id().tenant_id,
                shard_id=%tenant.tenant_shard_id().shard_slug(),
                elapsed_ms = elapsed.as_millis(),
                "collection took longer than threshold"
            );
        }
    }

    // Part 2: secondary tenant locations.
    //
    // Note: the same tenant ID might be hit twice, if it transitions from attached to
    // secondary while we run.  That is okay: when we eventually try and run the eviction,
    // the `Gate` on the object will ensure that whichever one has already been shut down
    // will not delete anything.

    // Snapshot the secondary tenants first; the per-tenant work below runs outside the callback.
    let mut secondary_tenants = Vec::new();
    tenant_manager.foreach_secondary_tenants(
        |_tenant_shard_id: &TenantShardId, state: &Arc| {
            secondary_tenants.push(state.clone());
        },
    );

    for tenant in secondary_tenants {
        // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
        // to prevent repeated disk usage based evictions from completely draining less often
        // updating secondaries.
        let (mut layer_info, total_layers) = tenant.get_layers_for_eviction();

        debug_assert!(
            total_layers >= layer_info.resident_layers.len(),
            "total_layers ({total_layers}) must be at least the resident_layers.len() ({})",
            layer_info.resident_layers.len()
        );

        let started_at = std::time::Instant::now();

        // Most recently used first, same as for attached tenants.
        layer_info
            .resident_layers
            .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));

        let tenant_candidates =
            layer_info
                .resident_layers
                .into_iter()
                .enumerate()
                .map(|(i, mut candidate)| {
                    candidate.relative_last_activity =
                        eviction_order.relative_last_activity(total_layers, i);
                    (
                        // Secondary locations' layers are always considered above the min resident size,
                        // i.e. secondary locations are permitted to be trimmed to zero layers if all
                        // the layers have sufficiently old access times.
                        EvictionPartition::Above,
                        candidate,
                    )
                });

        METRICS
            .tenant_layer_count
            .observe(tenant_candidates.len() as f64);
        candidates.extend(tenant_candidates);

        // Give other tasks a chance to run between secondary tenants.
        tokio::task::yield_now().await;

        let elapsed = started_at.elapsed();

        METRICS
            .tenant_collection_time
            .observe(elapsed.as_secs_f64());

        if elapsed > LOG_DURATION_THRESHOLD {
            tracing::info!(
                tenant_id=%tenant.tenant_shard_id().tenant_id,
                shard_id=%tenant.tenant_shard_id().shard_slug(),
                elapsed_ms = elapsed.as_millis(),
                "collection took longer than threshold"
            );
        }
    }

    // The final sort relies on the derived ordering `EvictNow < Above < Below`.
    debug_assert!(
        EvictionPartition::Above < EvictionPartition::Below,
        "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"
    );
    debug_assert!(
        EvictionPartition::EvictNow < EvictionPartition::Above,
        "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"
    );

    eviction_order.sort(&mut candidates);

    Ok(EvictionCandidates::Finished(candidates))
}
/// A totally ordered f32 subset we can use with sorting functions.
pub(crate) mod finite_f32 {

    /// A totally ordered f32 subset we can use with sorting functions.
    ///
    /// Invariant: the wrapped value is finite and never negative zero. Without the second
    /// part the derived `PartialEq` (where `-0.0 == 0.0`) would disagree with the `Ord`
    /// implementation based on `f32::total_cmp` (where `-0.0 < 0.0`).
    #[derive(Clone, Copy, PartialEq)]
    pub struct FiniteF32(f32);

    impl std::fmt::Debug for FiniteF32 {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            std::fmt::Debug::fmt(&self.0, f)
        }
    }

    impl std::fmt::Display for FiniteF32 {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            std::fmt::Display::fmt(&self.0, f)
        }
    }

    // Sound because NaN is never stored, so equality is reflexive.
    impl std::cmp::Eq for FiniteF32 {}

    impl std::cmp::PartialOrd for FiniteF32 {
        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
            Some(self.cmp(other))
        }
    }

    impl std::cmp::Ord for FiniteF32 {
        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
            self.0.total_cmp(&other.0)
        }
    }

    impl TryFrom<f32> for FiniteF32 {
        type Error = f32;

        /// Wraps `value` if it is finite; NaN and infinities are handed back as the error.
        ///
        /// Negative zero is stored as positive zero so that `Eq` and `Ord` stay consistent.
        fn try_from(value: f32) -> Result<Self, Self::Error> {
            if value.is_finite() {
                // `value == 0.0` is true for both zeroes; always store the positive one.
                let value = if value == 0.0 { 0.0 } else { value };
                Ok(FiniteF32(value))
            } else {
                Err(value)
            }
        }
    }

    impl From<FiniteF32> for f32 {
        fn from(value: FiniteF32) -> f32 {
            value.0
        }
    }

    impl FiniteF32 {
        pub const ZERO: FiniteF32 = FiniteF32(0.0);

        /// Wraps `value` if it lies within `0.0..=1.0`; any other value (including NaN) is
        /// handed back as the error.
        pub fn try_from_normalized(value: f32) -> Result<Self, f32> {
            if (0.0..=1.0).contains(&value) {
                // -0.0 is within the range, make sure it is assumed 0.0..=1.0
                let value = value.abs();
                Ok(FiniteF32(value))
            } else {
                Err(value)
            }
        }

        /// Returns the wrapped `f32`.
        pub fn into_inner(self) -> f32 {
            self.into()
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Relative last activity of every layer index for a tenant with `len` layers, listed
    /// from the most to the least recently used.
    fn relative_activities(highest_layer_count_loses_first: bool, len: usize) -> Vec<f32> {
        let order = EvictionOrder::RelativeAccessed {
            highest_layer_count_loses_first,
        };

        let mut activities = Vec::with_capacity(len);
        for index in 0..len {
            activities.push(order.relative_last_activity(len, index).into_inner());
        }
        activities
    }

    /// Asserts that each value is strictly greater than its successor.
    fn assert_strictly_decreasing(values: &[f32]) {
        assert!(values.windows(2).all(|pair| pair[0] > pair[1]));
    }

    /// Without the layer-count bias the full `0.0..=1.0` range is used.
    #[test]
    fn relative_equal_bounds() {
        let v = relative_activities(false, 10);

        assert_eq!(v.first().copied(), Some(1.0));
        assert_eq!(v.last().copied(), Some(0.0));
        assert_strictly_decreasing(&v);
    }

    /// With the layer-count bias the least recently used of 10 layers ends up at `0.1`.
    #[test]
    fn relative_spare_bounds() {
        let v = relative_activities(true, 10);

        assert_eq!(v.first().copied(), Some(1.0));
        assert_eq!(v.last().copied(), Some(0.1));
        assert_strictly_decreasing(&v);
    }
}
if let Some(posthog_config) = &conf.posthog_config { let posthog_client_config = match posthog_config.clone().try_into_posthog_config() { Ok(config) => config, Err(e) => { tracing::warn!( "invalid posthog config, skipping posthog integration: {}", e ); return Ok(FeatureResolver { inner: None, internal_properties: None, force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new( HashMap::new(), ))), }); } }; let inner = FeatureResolverBackgroundLoop::new(posthog_client_config, shutdown_pageserver); let inner = Arc::new(inner); // The properties shared by all tenants on this pageserver. let internal_properties = { let mut properties = HashMap::new(); properties.insert( "pageserver_id".to_string(), PostHogFlagFilterPropertyValue::String(conf.id.to_string()), ); if let Some(availability_zone) = &conf.availability_zone { properties.insert( "availability_zone".to_string(), PostHogFlagFilterPropertyValue::String(availability_zone.clone()), ); } // Infer region based on the remote storage config. if let Some(remote_storage) = &conf.remote_storage_config { match &remote_storage.storage { RemoteStorageKind::AwsS3(config) => { properties.insert( "region".to_string(), PostHogFlagFilterPropertyValue::String(format!( "aws-{}", config.bucket_region )), ); } RemoteStorageKind::AzureContainer(config) => { properties.insert( "region".to_string(), PostHogFlagFilterPropertyValue::String(format!( "azure-{}", config.container_region )), ); } RemoteStorageKind::LocalFs { .. } => { properties.insert( "region".to_string(), PostHogFlagFilterPropertyValue::String("local".to_string()), ); } RemoteStorageKind::GCS { .. 
} => { properties.insert( "region".to_string(), PostHogFlagFilterPropertyValue::String("local".to_string()), ); } } } // TODO: move this to a background task so that we don't block startup in case of slow disk let metadata_path = conf.metadata_path(); match std::fs::read_to_string(&metadata_path) { Ok(metadata_str) => match serde_json::from_str::(&metadata_str) { Ok(metadata) => { properties.insert( "hostname".to_string(), PostHogFlagFilterPropertyValue::String(metadata.http_host), ); if let Some(cplane_region) = metadata.other.get("region_id") { if let Some(cplane_region) = cplane_region.as_str() { // This region contains the cell number properties.insert( "neon_region".to_string(), PostHogFlagFilterPropertyValue::String( cplane_region.to_string(), ), ); } } } Err(e) => { tracing::warn!("Failed to parse metadata.json: {}", e); } }, Err(e) => { tracing::warn!("Failed to read metadata.json: {}", e); } } Arc::new(properties) }; let fake_tenants = { let mut tenants = Vec::new(); for i in 0..10 { let distinct_id = format!( "fake_tenant_{}_{}_{}", conf.availability_zone.as_deref().unwrap_or_default(), conf.id, i ); let tenant_properties = PerTenantProperties { remote_size_mb: Some(rand::rng().random_range(100.0..1000000.00)), db_count_max: Some(rand::rng().random_range(1..1000)), rel_count_max: Some(rand::rng().random_range(1..1000)), } .into_posthog_properties(); let properties = Self::collect_properties_inner( distinct_id.clone(), Some(&internal_properties), &tenant_properties, ); tenants.push(CaptureEvent { event: "initial_tenant_report".to_string(), distinct_id, properties: json!({ "$set": properties }), // use `$set` to set the person properties instead of the event properties }); } tenants }; inner.clone().spawn( handle, posthog_config .refresh_interval .unwrap_or(DEFAULT_POSTHOG_REFRESH_INTERVAL), fake_tenants, ); Ok(FeatureResolver { inner: Some(inner), internal_properties: Some(internal_properties), force_overrides_for_testing: 
Arc::new(ArcSwap::new(Arc::new(HashMap::new()))), }) } else { Ok(FeatureResolver { inner: None, internal_properties: None, force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))), }) } } fn collect_properties_inner( tenant_id: String, internal_properties: Option<&HashMap>, tenant_properties: &HashMap, ) -> HashMap { let mut properties = HashMap::new(); if let Some(internal_properties) = internal_properties { for (key, value) in internal_properties.iter() { properties.insert(key.clone(), value.clone()); } } properties.insert( "tenant_id".to_string(), PostHogFlagFilterPropertyValue::String(tenant_id), ); for (key, value) in tenant_properties.iter() { properties.insert(key.clone(), value.clone()); } properties } /// Collect all properties availble for the feature flag evaluation. pub(crate) fn collect_properties( &self, tenant_id: TenantId, tenant_properties: &HashMap, ) -> HashMap { Self::collect_properties_inner( tenant_id.to_string(), self.internal_properties.as_deref(), tenant_properties, ) } /// Evaluate a multivariate feature flag. Currently, we do not support any properties. /// /// Error handling: the caller should inspect the error and decide the behavior when a feature flag /// cannot be evaluated (i.e., default to false if it cannot be resolved). The error should *not* be /// propagated beyond where the feature flag gets resolved. 
pub fn evaluate_multivariate( &self, flag_key: &str, tenant_id: TenantId, tenant_properties: &HashMap, ) -> Result { let force_overrides = self.force_overrides_for_testing.load(); if let Some(value) = force_overrides.get(flag_key) { return Ok(value.clone()); } if let Some(inner) = &self.inner { let res = inner.feature_store().evaluate_multivariate( flag_key, &tenant_id.to_string(), &self.collect_properties(tenant_id, tenant_properties), ); match &res { Ok(value) => { FEATURE_FLAG_EVALUATION .with_label_values(&[flag_key, "ok", value]) .inc(); } Err(e) => { FEATURE_FLAG_EVALUATION .with_label_values(&[flag_key, "error", e.as_variant_str()]) .inc(); } } res } else { Err(PostHogEvaluationError::NotAvailable( "PostHog integration is not enabled".to_string(), )) } } /// Evaluate a boolean feature flag. Currently, we do not support any properties. /// /// Returns `Ok(())` if the flag is evaluated to true, otherwise returns an error. /// /// Error handling: the caller should inspect the error and decide the behavior when a feature flag /// cannot be evaluated (i.e., default to false if it cannot be resolved). The error should *not* be /// propagated beyond where the feature flag gets resolved. 
pub fn evaluate_boolean(
    &self,
    flag_key: &str,
    tenant_id: TenantId,
    tenant_properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
) -> Result<(), PostHogEvaluationError> {
    // A forced test override wins over PostHog. Only the exact string "true" counts
    // as enabled; any other forced value is reported as "no condition group matched".
    let force_overrides = self.force_overrides_for_testing.load();
    if let Some(value) = force_overrides.get(flag_key) {
        return if value == "true" {
            Ok(())
        } else {
            Err(PostHogEvaluationError::NoConditionGroupMatched)
        };
    }

    let inner = match &self.inner {
        Some(inner) => inner,
        None => {
            return Err(PostHogEvaluationError::NotAvailable(
                "PostHog integration is not enabled".to_string(),
            ));
        }
    };

    let res = inner.feature_store().evaluate_boolean(
        flag_key,
        &tenant_id.to_string(),
        &self.collect_properties(tenant_id, tenant_properties),
    );

    // Count every evaluation, labelled by flag, outcome, and result / error kind.
    match &res {
        Ok(()) => {
            FEATURE_FLAG_EVALUATION
                .with_label_values(&[flag_key, "ok", "true"])
                .inc();
        }
        Err(e) => {
            FEATURE_FLAG_EVALUATION
                .with_label_values(&[flag_key, "error", e.as_variant_str()])
                .inc();
        }
    }
    res
}

/// Ask the feature store whether `flag_key` is a boolean flag.
///
/// Returns `NotAvailable` when the PostHog integration is disabled, because the flag
/// type cannot be determined without the feature store.
pub fn is_feature_flag_boolean(&self, flag_key: &str) -> Result<bool, PostHogEvaluationError> {
    match &self.inner {
        Some(inner) => inner.feature_store().is_feature_flag_boolean(flag_key),
        None => Err(PostHogEvaluationError::NotAvailable(
            "PostHog integration is not enabled, cannot auto-determine the flag type"
                .to_string(),
        )),
    }
}

/// Force override a feature flag for testing. This is only for testing purposes. Assume the caller only calls it
/// from a single thread so it won't race.
pub fn force_override_for_testing(&self, flag_key: &str, value: Option<&str>) {
    // Clone the current map, edit the clone, then swap it in. Two concurrent callers
    // could lose an update, hence the single-thread assumption documented above.
    let mut force_overrides = self.force_overrides_for_testing.load().as_ref().clone();
    match value {
        Some(value) => {
            force_overrides.insert(flag_key.to_string(), value.to_string());
        }
        None => {
            force_overrides.remove(flag_key);
        }
    }
    self.force_overrides_for_testing
        .store(Arc::new(force_overrides));
}
} // end of `impl FeatureResolver`

/// Per-tenant inputs to flag evaluation. A `None` field is simply left out of the
/// generated property map.
struct PerTenantProperties {
    pub remote_size_mb: Option<f64>,
    // NOTE(review): the integer type of the two counts was lost in extraction; `usize`
    // is assumed here. Confirm against the element type of `Timeline::db_rel_count`.
    pub db_count_max: Option<usize>,
    pub rel_count_max: Option<usize>,
}

impl PerTenantProperties {
    /// Convert the populated fields into PostHog numeric properties keyed by
    /// `tenant_remote_size_mb`, `tenant_db_count_max` and `tenant_rel_count_max`.
    pub fn into_posthog_properties(self) -> HashMap<String, PostHogFlagFilterPropertyValue> {
        let mut properties = HashMap::new();
        if let Some(remote_size_mb) = self.remote_size_mb {
            properties.insert(
                "tenant_remote_size_mb".to_string(),
                PostHogFlagFilterPropertyValue::Number(remote_size_mb),
            );
        }
        if let Some(db_count) = self.db_count_max {
            properties.insert(
                "tenant_db_count_max".to_string(),
                PostHogFlagFilterPropertyValue::Number(db_count as f64),
            );
        }
        if let Some(rel_count) = self.rel_count_max {
            properties.insert(
                "tenant_rel_count_max".to_string(),
                PostHogFlagFilterPropertyValue::Number(rel_count as f64),
            );
        }
        properties
    }
}

/// A `FeatureResolver` bound to one tenant, with that tenant's properties cached so
/// flag evaluation does not need to recompute them.
pub struct TenantFeatureResolver {
    inner: FeatureResolver,
    tenant_id: TenantId,
    /// Replaced wholesale by `refresh_properties_and_flags`; empty until the first refresh.
    cached_tenant_properties: ArcSwap<HashMap<String, PostHogFlagFilterPropertyValue>>,

    // Add feature flag on the critical path below.
    //
    // If a feature flag will be used on the critical path, we will update it in the tenant housekeeping loop instead of
    // resolving directly by calling `evaluate_multivariate` or `evaluate_boolean`. Remember to update the flag in the
    // housekeeping loop. The user should directly read this atomic flag instead of using the set of evaluate functions.
    pub feature_test_remote_size_flag: AtomicBool,
}

impl TenantFeatureResolver {
    /// Wrap `inner` for `tenant_id` with an empty property cache and all critical-path
    /// flags off.
    pub fn new(inner: FeatureResolver, tenant_id: TenantId) -> Self {
        Self {
            inner,
            tenant_id,
            cached_tenant_properties: ArcSwap::new(Arc::new(HashMap::new())),
            feature_test_remote_size_flag: AtomicBool::new(false),
        }
    }

    /// Evaluate a multivariate flag for this tenant using the cached properties.
    pub fn evaluate_multivariate(&self, flag_key: &str) -> Result<String, PostHogEvaluationError> {
        self.inner.evaluate_multivariate(
            flag_key,
            self.tenant_id,
            &self.cached_tenant_properties.load(),
        )
    }

    /// Evaluate a boolean flag for this tenant using the cached properties.
    /// `Ok(())` means the flag evaluated to true.
    pub fn evaluate_boolean(&self, flag_key: &str) -> Result<(), PostHogEvaluationError> {
        self.inner.evaluate_boolean(
            flag_key,
            self.tenant_id,
            &self.cached_tenant_properties.load(),
        )
    }

    /// Return the full property map that flag evaluation for this tenant would use.
    pub fn collect_properties(&self) -> HashMap<String, PostHogFlagFilterPropertyValue> {
        self.inner
            .collect_properties(self.tenant_id, &self.cached_tenant_properties.load())
    }

    /// Ask the underlying resolver whether `flag_key` is a boolean flag.
    pub fn is_feature_flag_boolean(&self, flag_key: &str) -> Result<bool, PostHogEvaluationError> {
        self.inner.is_feature_flag_boolean(flag_key)
    }

    /// Refresh the cached properties and flags on the critical path.
    ///
    /// A timeline reporting size 0 makes `remote_size_mb` unavailable, but the scan
    /// continues so that the db/rel counts of every timeline are still considered.
    pub fn refresh_properties_and_flags(&self, tenant_shard: &TenantShard) {
        // Any of the remote size is none => this property is none.
        // NOTE(review): the value is summed from `resident_physical_size_get()`, which by
        // its name looks like resident rather than remote size -- confirm the intent.
        let mut remote_size_mb = Some(0.0);
        // Any of the db or rel count is available => this property is available.
        let mut db_count_max = None;
        let mut rel_count_max = None;

        for timeline in tenant_shard.list_timelines() {
            let size = timeline.metrics.resident_physical_size_get();
            if size == 0 {
                // Do not stop here: the counts below must still be collected for this
                // and all remaining timelines.
                remote_size_mb = None;
            } else if let Some(total_mb) = remote_size_mb.as_mut() {
                *total_mb += size as f64 / 1024.0 / 1024.0;
            }

            if let Some(data) = timeline.db_rel_count.load_full() {
                let (db_count, rel_count) = *data.as_ref();
                db_count_max = Some(match db_count_max {
                    Some(max) => std::cmp::max(max, db_count),
                    None => db_count,
                });
                rel_count_max = Some(match rel_count_max {
                    Some(max) => std::cmp::max(max, rel_count),
                    None => rel_count,
                });
            }
        }

        self.cached_tenant_properties.store(Arc::new(
            PerTenantProperties {
                remote_size_mb,
                db_count_max,
                rel_count_max,
            }
            .into_posthog_properties(),
        ));

        // BEGIN: Update the feature flag on the critical path.
        self.feature_test_remote_size_flag.store(
            self.evaluate_boolean("test-remote-size-flag").is_ok(),
            std::sync::atomic::Ordering::Relaxed,
        );
        // END: Update the feature flag on the critical path.
    }
}

================================================ FILE: pageserver/src/http/mod.rs ================================================ pub mod routes; pub use routes::make_router; ================================================ FILE: pageserver/src/http/openapi_spec.yml ================================================ openapi: "3.0.2" info: title: Page Server API description: Neon Pageserver API version: "1.0" license: name: "Apache" url: https://github.com/neondatabase/neon/blob/main/LICENSE servers: - url: "" paths: /v1/status: description: Healthcheck endpoint get: description: Healthcheck security: [] responses: "200": description: OK content: application/json: schema: type: object required: - id properties: id: type: integer /v1/disk_usage_eviction/run: put: description: Do an iteration of disk-usage-based eviction to evict a given amount of disk space.
security: [] requestBody: content: application/json: schema: type: object required: - evict_bytes properties: evict_bytes: type: integer responses: "200": description: | The run completed. This does not necessarily mean that we actually evicted `evict_bytes`. Examine the returned object for detail, or, just watch the actual effect of the call using `du` or `df`. content: application/json: schema: type: object /v1/reload_auth_validation_keys: post: description: Reloads the JWT public keys from their pre-configured location on disk. responses: "200": description: The reload completed successfully. /v1/tenant/{tenant_id}: parameters: - name: tenant_id in: path required: true schema: type: string get: description: Get tenant status responses: "200": description: Currently returns the flag whether the tenant has inprogress timeline downloads content: application/json: schema: $ref: "#/components/schemas/TenantInfo" delete: description: | Attempts to delete specified tenant. 500, 503 and 409 errors should be retried. Deleting a non-existent tenant is considered successful (returns 200). responses: "200": description: Tenant was successfully deleted, or was already not found. 
"503": description: Service is unavailable, or tenant is already being modified (perhaps concurrently deleted) /v1/tenant/{tenant_id}/time_travel_remote_storage: parameters: - name: tenant_id in: path required: true schema: type: string - name: travel_to in: query required: true schema: type: string format: date-time - name: done_if_after in: query required: true schema: type: string format: date-time put: description: Time travel the tenant's remote storage responses: "200": description: OK content: application/json: schema: type: string /v1/tenant/{tenant_id}/timeline/{timeline_id}: parameters: - name: tenant_id in: path required: true schema: type: string - name: timeline_id in: path required: true schema: type: string format: hex get: description: Get info about the timeline responses: "200": description: TimelineInfo content: application/json: schema: $ref: "#/components/schemas/TimelineInfo" delete: description: "Attempts to delete specified timeline. 500 and 409 errors should be retried" responses: "404": description: Timeline not found. This is the success path. 
content: application/json: schema: $ref: "#/components/schemas/NotFoundError" "409": description: Deletion is already in progress, continue polling content: application/json: schema: $ref: "#/components/schemas/ConflictError" "412": description: Tenant is missing, or timeline has children content: application/json: schema: $ref: "#/components/schemas/PreconditionFailedError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn: parameters: - name: tenant_id in: path required: true schema: type: string - name: timeline_id in: path required: true schema: type: string format: hex get: description: Get timestamp for a given LSN parameters: - name: lsn in: query required: true schema: type: string format: hex description: A LSN to get the timestamp responses: "200": description: OK content: application/json: schema: type: string format: date-time "412": description: No timestamp is found for given LSN, e.g. if there had been no commits till LSN content: application/json: schema: $ref: "#/components/schemas/PreconditionFailedError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp: parameters: - name: tenant_id in: path required: true schema: type: string - name: timeline_id in: path required: true schema: type: string format: hex get: description: Get LSN by a timestamp parameters: - name: timestamp in: query required: true schema: type: string format: date-time description: A timestamp to get the LSN - name: with_lease in: query required: false schema: type: boolean description: Whether to grant a lease to the corresponding LSN. Default to false. responses: "200": description: OK content: application/json: schema: $ref: "#/components/schemas/LsnByTimestampResponse" /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease: parameters: - name: tenant_shard_id in: path required: true schema: type: string - name: timeline_id in: path required: true schema: type: string format: hex post: description: Obtains a lease for the given LSN. 
requestBody: content: application/json: schema: type: object required: - lsn properties: lsn: description: A LSN to obtain the lease for. type: string format: hex responses: "200": description: OK content: application/json: schema: $ref: "#/components/schemas/LsnLease" /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc: parameters: - name: tenant_id in: path required: true schema: type: string - name: timeline_id in: path required: true schema: type: string format: hex put: description: Garbage collect given timeline responses: "200": description: OK content: application/json: schema: type: string /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc: parameters: - name: tenant_shard_id in: path required: true schema: type: string - name: timeline_id in: path required: true schema: type: string format: hex post: description: Persistently add a gc blocking at the tenant level because of this timeline responses: "200": description: OK /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc: parameters: - name: tenant_shard_id in: path required: true schema: type: string - name: timeline_id in: path required: true schema: type: string format: hex post: description: Persistently remove a tenant level gc blocking for this timeline responses: "200": description: OK /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/mark_invisible: parameters: - name: tenant_shard_id in: path required: true schema: type: string - name: timeline_id in: path required: true schema: type: string format: hex put: requestBody: content: application/json: schema: type: object properties: is_visible: type: boolean default: false responses: "200": description: OK /v1/tenant/{tenant_shard_id}/location_config: parameters: - name: tenant_shard_id in: path required: true schema: type: string - name: flush_ms in: query required: false schema: type: integer - name: lazy in: query required: false schema: type: boolean description: Set to true for attaches to queue up until activated by 
compute. Eager (false) is the default. put: description: | Configures a _tenant location_, that is how a particular pageserver handles a particular tenant. This includes _attached_ tenants, i.e. those ingesting WAL and page service requests, and _secondary_ tenants, i.e. those which are just keeping a warm cache in anticipation of transitioning to attached state in the future. This is a declarative, idempotent API: there are not separate endpoints for different tenant location configurations. Rather, this single endpoint accepts a description of the desired location configuration, and makes whatever changes are required to reach that state. In imperative terms, this API is used to attach and detach tenants, and to transition tenants to and from secondary mode. This is a synchronous API: there is no 202 response. State transitions should always be fast (milliseconds), with the exception of requests setting `flush_ms`, in which case the caller controls the runtime of the request. In some state transitions, it makes sense to flush dirty data to remote storage: this includes transitions to AttachedStale and Detached. Flushing is never necessary for correctness, but is an important optimization when doing migrations. The `flush_ms` parameter controls whether flushing should be attempted, and how much time is allowed for flushing. If the time limit expires, the requested transition will continue without waiting for any outstanding data to flush. Callers should use a duration which is substantially less than their HTTP client's request timeout. It is safe to supply flush_ms irrespective of the request body: in state transitions where flushing doesn't make sense, the server will ignore it. It is safe to retry requests, but if one receives a 409 or 503 response, it is not useful to retry aggressively: there is probably an existing request still ongoing. 
requestBody: required: false content: application/json: schema: $ref: "#/components/schemas/TenantLocationConfigRequest" responses: "200": description: Tenant is now in requested state content: application/json: schema: $ref: "#/components/schemas/TenantLocationConfigResponse" "409": description: | The tenant is already being modified, perhaps by a concurrent call to this API content: application/json: schema: $ref: "#/components/schemas/ConflictError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive: parameters: - name: tenant_id in: path required: true schema: type: string - name: timeline_id in: path required: true schema: type: string post: description: | Marks the initdb archive for preservation upon deletion of the timeline or tenant. This is meant to be part of the disaster recovery process. responses: "202": description: Tenant scheduled to load successfully /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config: parameters: - name: tenant_shard_id in: path required: true schema: type: string - name: timeline_id in: path required: true schema: type: string put: description: | Either archives or unarchives the given timeline. An archived timeline may not have any non-archived children. requestBody: required: true content: application/json: schema: $ref: "#/components/schemas/ArchivalConfigRequest" responses: "200": description: Timeline (un)archived successfully "409": description: | The tenant/timeline is already being modified, perhaps by a concurrent call to this API content: application/json: schema: $ref: "#/components/schemas/ConflictError" "500": description: Generic operation error content: application/json: schema: $ref: "#/components/schemas/Error" "503": description: Temporarily unavailable, please retry. 
content: application/json: schema: $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/synthetic_size: parameters: - name: tenant_id in: path required: true schema: type: string - name: inputs_only in: query required: false schema: type: boolean description: | When true, skip calculation and only provide the model inputs (for debugging). Defaults to false. - name: retention_period in: query required: false schema: type: integer description: | Override the default retention period (in bytes) used for size calculation. get: description: | Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes). responses: "200": description: OK, content: application/json: schema: $ref: "#/components/schemas/SyntheticSizeResponse" text/html: schema: type: string description: SVG representation of the tenant and its timelines. "401": description: Unauthorized Error content: application/json: schema: $ref: "#/components/schemas/UnauthorizedError" "403": description: Forbidden Error content: application/json: schema: $ref: "#/components/schemas/ForbiddenError" "500": description: Generic operation error content: application/json: schema: $ref: "#/components/schemas/Error" "503": description: Temporarily unavailable, please retry. 
content: application/json: schema: $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_shard_id}/heatmap_upload: parameters: - name: tenant_shard_id in: path required: true schema: type: string post: description: | If the location is in an attached mode, upload the current state to the remote heatmap responses: "200": description: Success /v1/tenant/{tenant_shard_id}/secondary/download: parameters: - name: tenant_shard_id in: path required: true schema: type: string - name: wait_ms description: If set, we will wait this long for download to complete, and if it isn't complete then return 202 in: query required: false schema: type: integer post: description: | If the location is in secondary mode, download latest heatmap and layers responses: "200": description: Success content: application/json: schema: $ref: "#/components/schemas/SecondaryProgress" "202": description: Download has started but not yet finished content: application/json: schema: $ref: "#/components/schemas/SecondaryProgress" /v1/tenant/{tenant_id}/timeline: parameters: - name: tenant_id in: path required: true schema: type: string post: description: | Create a timeline. Returns new timeline id on success. Recreating the same timeline will succeed if the parameters match the existing timeline. If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver. To ensure durability, the caller must retry the creation until success. Just because the timeline is visible via other endpoints does not mean it is durable. Future versions may stop showing timelines that are not yet durable. 
requestBody: content: application/json: schema: type: object required: - new_timeline_id properties: new_timeline_id: type: string format: hex ancestor_timeline_id: type: string format: hex ancestor_start_lsn: type: string format: hex pg_version: type: integer read_only: type: boolean existing_initdb_timeline_id: type: string format: hex import_pgdata: $ref: "#/components/schemas/TimelineCreateRequestImportPgdata" responses: "201": description: Timeline was created, or already existed with matching parameters content: application/json: schema: $ref: "#/components/schemas/TimelineInfo" "406": description: Permanently unsatisfiable request, don't retry. content: application/json: schema: $ref: "#/components/schemas/Error" "409": description: Timeline already exists, with different parameters. Creation cannot proceed. content: application/json: schema: $ref: "#/components/schemas/ConflictError" "429": description: A creation request was sent for the same Timeline Id while a creation was already in progress. Back off and retry. content: application/json: schema: $ref: "#/components/schemas/Error" get: description: Get timelines for tenant responses: "200": description: TimelineInfo content: application/json: schema: type: array items: $ref: "#/components/schemas/TimelineInfo" /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor: parameters: - name: tenant_shard_id in: path required: true schema: type: string - name: timeline_id in: path required: true schema: type: string put: description: | Detach a timeline from its ancestor and reparent all ancestors timelines with lower `ancestor_lsn`. Current implementation might not be retryable across failure cases, but will be enhanced in future. Detaching should be expected to be expensive operation. Timeouts should be retried. 
parameters: - name: detach_behavior in: query required: false schema: description: Currently valid values are `v1`, `v2` type: string responses: "200": description: | The timeline has been detached from its ancestor (now or earlier), and at least the returned timelines have been reparented. If any timelines were deleted after reparenting, they might not be on this list. content: application/json: schema: $ref: "#/components/schemas/AncestorDetached" "400": description: | Number of early checks meaning the timeline cannot be detached now: - the ancestor of timeline has an ancestor: not supported, see RFC content: application/json: schema: $ref: "#/components/schemas/Error" "404": description: Tenant or timeline not found. content: application/json: schema: $ref: "#/components/schemas/NotFoundError" "409": description: | The timeline can never be detached: - timeline has no ancestor, implying that the timeline has never had an ancestor content: application/json: schema: $ref: "#/components/schemas/ConflictError" "500": description: | Transient error, for example, pageserver shutdown happened while processing the request but we were unable to distinguish that. Must be retried. content: application/json: schema: $ref: "#/components/schemas/Error" "503": description: | Temporarily unavailable, please retry. Possible reasons: - another timeline detach for the same tenant is underway, please retry later - detected shutdown error content: application/json: schema: $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant: get: description: Get tenants list responses: "200": description: TenantInfo content: application/json: schema: type: array items: $ref: "#/components/schemas/TenantInfo" post: description: | Create a tenant. Returns new tenant id on success. If no new tenant id is specified in parameters, it would be generated. It's an error to recreate the same tenant. Invalid fields in the tenant config will cause the request to be rejected with status 400.
requestBody: content: application/json: schema: $ref: "#/components/schemas/TenantCreateRequest" responses: "201": description: New tenant created successfully content: application/json: schema: type: string "409": description: Tenant already exists, creation skipped content: application/json: schema: $ref: "#/components/schemas/ConflictError" /v1/tenant/config: put: description: | Update tenant's config by setting it to the provided value Invalid fields in the tenant config will cause the request to be rejected with status 400. requestBody: content: application/json: schema: $ref: "#/components/schemas/TenantConfigRequest" responses: "200": description: OK content: application/json: schema: type: array items: $ref: "#/components/schemas/TenantInfo" patch: description: | Update tenant's config additively by patching the updated fields provided. Null values unset the field and non-null values upsert it. Invalid fields in the tenant config will cause the request to be rejected with status 400. requestBody: content: application/json: schema: $ref: "#/components/schemas/TenantConfigRequest" responses: "200": description: OK content: application/json: schema: type: array items: $ref: "#/components/schemas/TenantInfo" /v1/tenant/{tenant_id}/config: parameters: - name: tenant_id in: path required: true schema: type: string get: description: | Returns tenant's config description: specific config overrides a tenant has and the effective config. 
responses: "200": description: Tenant config, specific and effective content: application/json: schema: $ref: "#/components/schemas/TenantConfigResponse" /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers: parameters: - name: tenant_shard_id in: path required: true schema: type: string - name: timeline_id in: path required: true schema: type: string - name: concurrency description: Maximum number of concurrent downloads (capped at remote storage concurrency) in: query required: false schema: type: integer - name: recurse description: When set, will recurse with the downloads into ancestor timelines in: query required: false schema: type: boolean post: description: | Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter may be used to target all shards of a tenant when the unsharded form is used, or a specific tenant shard with the sharded form. responses: "200": description: Success delete: description: Stop any on-going background downloads of heatmap layers for the specified timeline. responses: "200": description: Success /v1/utilization: get: description: | Returns the pageservers current utilization and fitness score for new tenants. responses: "200": description: Pageserver utilization and fitness score content: application/json: schema: $ref: "#/components/schemas/PageserverUtilization" components: securitySchemes: JWT: type: http scheme: bearer bearerFormat: JWT schemas: TenantInfo: type: object required: - id - attachment_status properties: id: type: string current_physical_size: type: integer attachment_status: description: | Status of this tenant's attachment to this pageserver. - `maybe` means almost nothing, don't read anything into it except for the fact that the pageserver _might_ be already writing to the tenant's S3 state, so, DO NOT ATTACH the tenant to any other pageserver, or we risk split-brain. 
- `attached` means that the attach operation has completed successfully - `failed` means that attach has failed. For the reason, check the corresponding `reason` field. `failed` is the terminal state, retrying attach call won't resolve the issue. For example this can be caused by s3 being unreachable. The retry may be implemented with a call to detach, though it would be better to not automate it and inspect the failed state manually before proceeding with a retry. type: object required: - slug - data properties: slug: type: string enum: [ "maybe", "attached", "failed" ] data: type: object properties: reason: type: string gc_blocking: type: string TenantCreateRequest: allOf: - $ref: '#/components/schemas/TenantConfig' - $ref: '#/components/schemas/TenantLoadRequest' - type: object required: - new_tenant_id properties: new_tenant_id: type: string TenantLoadRequest: type: object properties: generation: type: integer description: Attachment generation number. TenantConfigRequest: allOf: - $ref: '#/components/schemas/TenantConfig' - type: object required: - tenant_id properties: tenant_id: type: string TenantLocationConfigRequest: type: object required: - mode properties: mode: type: string enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"] description: Mode of functionality that this pageserver will run in for this tenant. generation: type: integer description: Attachment generation number, mandatory when `mode` is an attached state secondary_conf: $ref: '#/components/schemas/SecondaryConfig' tenant_conf: $ref: '#/components/schemas/TenantConfig' TenantLocationConfigResponse: type: object required: - shards properties: shards: description: Pageservers where this tenant's shards are attached. Not populated for secondary locations. type: array items: $ref: "#/components/schemas/TenantShardLocation" stripe_size: description: If multiple shards are present, this field contains the sharding stripe size, else it is null.
type: integer nullable: true TenantShardLocation: type: object required: - node_id - shard_id properties: node_id: description: Pageserver node ID where this shard is attached type: integer shard_id: description: Tenant shard ID of the shard type: string SecondaryConfig: type: object properties: warm: type: boolean description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything. ArchivalConfigRequest: type: object required: - state properties: state: description: The archival state of a timeline type: string enum: ["Archived", "Unarchived"] TenantConfig: type: object properties: gc_period: type: string gc_horizon: type: integer pitr_interval: type: string checkpoint_distance: type: integer checkpoint_timeout: type: string compaction_target_size: type: integer compaction_period: type: string compaction_threshold: type: string compaction_upper_limit: type: string image_creation_threshold: type: integer walreceiver_connect_timeout: type: string lagging_wal_timeout: type: string max_lsn_wal_lag: type: integer heatmap_period: type: string TenantConfigResponse: type: object properties: tenant_specific_overrides: $ref: "#/components/schemas/TenantConfig" effective_config: $ref: "#/components/schemas/TenantConfig" TimelineCreateRequestImportPgdata: type: object required: - location - idempotency_key properties: idempotency_key: type: string location: $ref: "#/components/schemas/TimelineCreateRequestImportPgdataLocation" TimelineCreateRequestImportPgdataLocation: type: object properties: AwsS3: $ref: "#/components/schemas/TimelineCreateRequestImportPgdataLocationAwsS3" TimelineCreateRequestImportPgdataLocationAwsS3: type: object properties: region: type: string bucket: type: string key: type: string required: - region - bucket - key TimelineInfo: type: object required: - timeline_id - tenant_id - last_record_lsn - disk_consistent_lsn - state - min_readable_lsn properties: timeline_id: type: string format: hex 
tenant_id: type: string last_record_lsn: type: string format: hex disk_consistent_lsn: type: string format: hex remote_consistent_lsn: type: string format: hex remote_consistent_lsn_visible: type: string format: hex ancestor_timeline_id: type: string format: hex ancestor_lsn: type: string format: hex prev_record_lsn: type: string format: hex current_logical_size: type: integer current_physical_size: type: integer wal_source_connstr: type: string last_received_msg_lsn: type: string format: hex last_received_msg_ts: type: integer state: type: string min_readable_lsn: type: string format: hex applied_gc_cutoff_lsn: type: string format: hex safekeepers: $ref: "#/components/schemas/TimelineSafekeepersInfo" TimelineSafekeepersInfo: type: object required: - tenant_id - timeline_id - generation - safekeepers properties: tenant_id: type: string format: hex timeline_id: type: string format: hex generation: type: integer safekeepers: type: array items: $ref: "#/components/schemas/TimelineSafekeeperInfo" TimelineSafekeeperInfo: type: object required: - id - hostname properties: id: type: integer hostname: type: string SyntheticSizeResponse: type: object required: - id - size - segment_sizes - inputs properties: id: type: string format: hex size: type: integer nullable: true description: | Size metric in bytes or null if inputs_only=true was given. 
segment_sizes: type: array items: $ref: "#/components/schemas/SegmentSize" inputs: type: object properties: segments: type: array items: $ref: "#/components/schemas/SegmentData" timeline_inputs: type: array items: $ref: "#/components/schemas/TimelineInput" SegmentSize: type: object required: - method - accum_size properties: method: type: string accum_size: type: integer SegmentData: type: object required: - segment properties: segment: type: object required: - lsn properties: parent: type: integer lsn: type: integer size: type: integer needed: type: boolean timeline_id: type: string format: hex kind: type: string TimelineInput: type: object required: - timeline_id properties: ancestor_id: type: string ancestor_lsn: type: string timeline_id: type: string format: hex LsnByTimestampResponse: type: object required: - lsn - kind properties: lsn: type: string format: hex kind: type: string enum: [past, present, future, nodata] valid_until: type: string format: date-time description: The expiration time of the granted lease. LsnLease: type: object required: - valid_until properties: valid_until: type: string format: date-time PageserverUtilization: type: object required: - disk_usage_bytes - free_space_bytes - utilization_score properties: disk_usage_bytes: type: integer format: int64 minimum: 0 description: The amount of disk space currently used. free_space_bytes: type: integer format: int64 minimum: 0 description: The amount of usable disk space left. utilization_score: type: integer format: int64 minimum: 0 maximum: 9223372036854775807 default: 9223372036854775807 description: | Lower is better score for how good this pageserver would be for the next tenant. The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated. 
SecondaryProgress: type: object required: - heatmap_mtime - layers_downloaded - layers_total - bytes_downloaded - bytes_total properties: heatmap_mtime: type: string format: date-time description: Modification time of the most recently downloaded layer heatmap (RFC 3339 format) layers_downloaded: type: integer format: int64 description: How many layers from the latest layer heatmap are present on disk bytes_downloaded: type: integer format: int64 description: How many bytes of layer content from the latest layer heatmap are present on disk layers_total: type: integer format: int64 description: How many layers were in the latest layer heatmap bytes_total: type: integer format: int64 description: How many bytes of layer content were in the latest layer heatmap AncestorDetached: type: object required: - reparented_timelines properties: reparented_timelines: type: array description: Set of reparented timeline ids items: type: string format: hex description: TimelineId Error: type: object required: - msg properties: msg: type: string UnauthorizedError: type: object required: - msg properties: msg: type: string ForbiddenError: type: object required: - msg properties: msg: type: string ServiceUnavailableError: type: object required: - msg properties: msg: type: string NotFoundError: type: object required: - msg properties: msg: type: string ConflictError: type: object required: - msg properties: msg: type: string PreconditionFailedError: type: object required: - msg properties: msg: type: string security: - JWT: [] ================================================ FILE: pageserver/src/http/routes.rs ================================================ //! //! Management HTTP API //! 
use std::cmp::Reverse; use std::collections::BTreeMap; use std::collections::BinaryHeap; use std::collections::HashMap; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Result, anyhow}; use bytes::Bytes; use enumset::EnumSet; use futures::future::join_all; use futures::{StreamExt, TryFutureExt}; use http_utils::endpoint::{ self, attach_openapi_ui, auth_middleware, check_permission_with, profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, }; use http_utils::error::{ApiError, HttpErrorBody}; use http_utils::failpoints::failpoints_handler; use http_utils::json::{json_request, json_request_maybe, json_response}; use http_utils::request::{ get_request_param, must_get_query_param, must_parse_query_param, parse_query_param, parse_request_param, }; use http_utils::{RequestExt, RouterBuilder}; use humantime::format_rfc3339; use hyper::{Body, Request, Response, StatusCode, Uri, header}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::virtual_file::IoMode; use pageserver_api::models::{ DetachBehavior, DownloadRemoteLayersTaskSpawnRequest, IngestAuxFilesRequest, ListAuxFilesRequest, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, LsnLeaseRequest, OffloadedTimelineInfo, PageTraceEvent, ShardParameters, StatusResponse, TenantConfigPatchRequest, TenantConfigRequest, TenantDetails, TenantInfo, TenantLocationConfigRequest, TenantLocationConfigResponse, TenantScanRemoteStorageResponse, TenantScanRemoteStorageShard, TenantShardLocation, TenantShardSplitRequest, TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode, TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo, TimelinePatchIndexPartRequest, TimelineVisibilityState, TimelinesInfoAndOffloaded, TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse, }; use 
pageserver_api::shard::{ShardCount, TenantShardId}; use postgres_ffi::PgMajorVersion; use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError}; use scopeguard::defer; use serde::{Deserialize, Serialize}; use serde_json::json; use tenant_size_model::svg::SvgBranchKind; use tenant_size_model::{SizeResult, StorageModel}; use tokio::time::Instant; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::SwappableJwtAuth; use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use wal_decoder::models::record::NeonWalRecord; use crate::config::PageServerConf; use crate::context; use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; use crate::deletion_queue::DeletionQueueClient; use crate::feature_resolver::FeatureResolver; use crate::metrics::LOCAL_DATA_LOSS_SUSPECTED; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::LocationConf; use crate::tenant::mgr::{ GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError, }; use crate::tenant::remote_timeline_client::index::GcCompactionState; use crate::tenant::remote_timeline_client::{ download_index_part, download_tenant_manifest, list_remote_tenant_shards, list_remote_timelines, }; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::ValuesReconstructState; use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName}; use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; use crate::tenant::timeline::offload::{OffloadError, offload_timeline}; use crate::tenant::timeline::{ CompactFlags, CompactOptions, CompactRequest, MarkInvisibleRequest, Timeline, WaitLsnTimeout, WaitLsnWaiter, import_pgdata, }; use 
crate::tenant::{ GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError, remote_timeline_client, }; use crate::{DEFAULT_PG_VERSION, disk_usage_eviction_task, tenant}; // For APIs that require an Active tenant, how long should we block waiting for that state? // This is not functionally necessary (clients will retry), but avoids generating a lot of // failed API calls while tenants are activating. #[cfg(not(feature = "testing"))] pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); // Tests run on slow/oversubscribed nodes, and may need to wait much longer for tenants to // finish attaching, if calls to remote storage are slow. #[cfg(feature = "testing")] pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); pub struct State { conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, allowlist_routes: &'static [&'static str], remote_storage: GenericRemoteStorage, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, secondary_controller: SecondaryController, latest_utilization: tokio::sync::Mutex>, feature_resolver: FeatureResolver, } impl State { #[allow(clippy::too_many_arguments)] pub fn new( conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, remote_storage: GenericRemoteStorage, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, secondary_controller: SecondaryController, feature_resolver: FeatureResolver, ) -> anyhow::Result { let allowlist_routes = &[ "/v1/status", "/v1/doc", "/swagger.yml", "/metrics", "/profile/cpu", "/profile/heap", ]; Ok(Self { conf, tenant_manager, auth, allowlist_routes, remote_storage, broker_client, disk_usage_eviction_state, deletion_queue_client, secondary_controller, latest_utilization: Default::default(), feature_resolver, }) } } #[inline(always)] fn get_state(request: 
&Request) -> &State { request .data::>() .expect("unknown state type") .as_ref() } #[inline(always)] fn get_config(request: &Request) -> &'static PageServerConf { get_state(request).conf } /// Check that the requester is authorized to operate on given tenant fn check_permission(request: &Request, tenant_id: Option) -> Result<(), ApiError> { check_permission_with(request, |claims| { crate::auth::check_permission(claims, tenant_id) }) } impl From for ApiError { fn from(pre: PageReconstructError) -> ApiError { match pre { PageReconstructError::Other(other) => ApiError::InternalServerError(other), PageReconstructError::MissingKey(e) => ApiError::InternalServerError(e.into()), PageReconstructError::Cancelled => ApiError::Cancelled, PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()), PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre), } } } impl From for ApiError { fn from(tmie: TenantMapInsertError) -> ApiError { match tmie { TenantMapInsertError::SlotError(e) => e.into(), TenantMapInsertError::SlotUpsertError(e) => e.into(), TenantMapInsertError::Other(e) => ApiError::InternalServerError(e), } } } impl From for ApiError { fn from(e: TenantSlotError) -> ApiError { use TenantSlotError::*; match e { NotFound(tenant_id) => { ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into()) } InProgress => { ApiError::ResourceUnavailable("Tenant is being modified concurrently".into()) } MapState(e) => e.into(), } } } impl From for ApiError { fn from(e: TenantSlotUpsertError) -> ApiError { use TenantSlotUpsertError::*; match e { InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")), MapState(e) => e.into(), ShuttingDown(_) => ApiError::ShuttingDown, } } } impl From for ApiError { fn from(e: UpsertLocationError) -> ApiError { use UpsertLocationError::*; match e { BadRequest(e) => ApiError::BadRequest(e), Unavailable(_) => ApiError::ShuttingDown, e @ InProgress => 
ApiError::Conflict(format!("{e}")), Flush(e) | InternalError(e) => ApiError::InternalServerError(e), } } } impl From for ApiError { fn from(e: TenantMapError) -> ApiError { use TenantMapError::*; match e { StillInitializing | ShuttingDown => { ApiError::ResourceUnavailable(format!("{e}").into()) } } } } impl From for ApiError { fn from(tse: TenantStateError) -> ApiError { match tse { TenantStateError::IsStopping(_) => { ApiError::ResourceUnavailable("Tenant is stopping".into()) } TenantStateError::SlotError(e) => e.into(), TenantStateError::SlotUpsertError(e) => e.into(), TenantStateError::Other(e) => ApiError::InternalServerError(anyhow!(e)), } } } impl From for ApiError { fn from(tse: GetTenantError) -> ApiError { match tse { GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {tid}").into()), GetTenantError::ShardNotFound(tid) => { ApiError::NotFound(anyhow!("tenant {tid}").into()) } GetTenantError::NotActive(_) => { // Why is this not `ApiError::NotFound`? // Because we must be careful to never return 404 for a tenant if it does // in fact exist locally. If we did, the caller could draw the conclusion // that it can attach the tenant to another PS and we'd be in split-brain. ApiError::ResourceUnavailable("Tenant not yet active".into()) } GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()), } } } impl From for ApiError { fn from(gte: GetTimelineError) -> Self { // Rationale: tenant is activated only after eligible timelines activate ApiError::NotFound(gte.into()) } } impl From for ApiError { fn from(e: GetActiveTenantError) -> ApiError { match e { GetActiveTenantError::Broken(reason) => { ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason)) } GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { ..
}) => { ApiError::ShuttingDown } GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{e}")), GetActiveTenantError::Cancelled => ApiError::ShuttingDown, GetActiveTenantError::NotFound(gte) => gte.into(), GetActiveTenantError::WaitForActiveTimeout { .. } => { ApiError::ResourceUnavailable(format!("{e}").into()) } GetActiveTenantError::SwitchedTenant => { // in our HTTP handlers, this error doesn't happen // TODO: separate error types ApiError::ResourceUnavailable("switched tenant".into()) } } } } impl From for ApiError { fn from(value: crate::tenant::DeleteTimelineError) -> Self { use crate::tenant::DeleteTimelineError::*; match value { NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()), HasChildren(children) => ApiError::PreconditionFailed( format!("Cannot delete timeline which has child timelines: {children:?}") .into_boxed_str(), ), a @ AlreadyInProgress(_) => ApiError::Conflict(a.to_string()), Cancelled => ApiError::ResourceUnavailable("shutting down".into()), Other(e) => ApiError::InternalServerError(e), } } } impl From for ApiError { fn from(value: crate::tenant::TimelineArchivalError) -> Self { use crate::tenant::TimelineArchivalError::*; match value { NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()), Timeout => ApiError::Timeout("hit pageserver internal timeout".into()), Cancelled => ApiError::ShuttingDown, e @ HasArchivedParent(_) => { ApiError::PreconditionFailed(e.to_string().into_boxed_str()) } HasUnarchivedChildren(children) => ApiError::PreconditionFailed( format!( "Cannot archive timeline which has non-archived child timelines: {children:?}" ) .into_boxed_str(), ), a @ AlreadyInProgress => ApiError::Conflict(a.to_string()), Other(e) => ApiError::InternalServerError(e), } } } impl From for ApiError { fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self { use crate::tenant::mgr::DeleteTimelineError::*; match value { // Report Precondition failed so client can 
distinguish between // "tenant is missing" case from "timeline is missing" Tenant(GetTenantError::NotFound(..)) => ApiError::PreconditionFailed( "Requested tenant is missing".to_owned().into_boxed_str(), ), Tenant(t) => ApiError::from(t), Timeline(t) => ApiError::from(t), } } } impl From for ApiError { fn from(value: crate::tenant::mgr::DeleteTenantError) -> Self { use crate::tenant::mgr::DeleteTenantError::*; match value { SlotError(e) => e.into(), Other(o) => ApiError::InternalServerError(o), Cancelled => ApiError::ShuttingDown, } } } impl From for ApiError { fn from(ste: crate::tenant::secondary::SecondaryTenantError) -> ApiError { use crate::tenant::secondary::SecondaryTenantError; match ste { SecondaryTenantError::GetTenant(gte) => gte.into(), SecondaryTenantError::ShuttingDown => ApiError::ShuttingDown, } } } impl From for ApiError { fn from(err: crate::tenant::FinalizeTimelineImportError) -> ApiError { use crate::tenant::FinalizeTimelineImportError::*; match err { ImportTaskStillRunning => { ApiError::ResourceUnavailable("Import task still running".into()) } ShuttingDown => ApiError::ShuttingDown, } } } // Helper function to construct a TimelineInfo struct for a timeline async fn build_timeline_info( timeline: &Arc, include_non_incremental_logical_size: bool, force_await_initial_logical_size: bool, include_image_consistent_lsn: bool, ctx: &RequestContext, ) -> anyhow::Result { crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); if force_await_initial_logical_size { timeline.clone().await_initial_logical_size().await } let mut info = build_timeline_info_common( timeline, ctx, tenant::timeline::GetLogicalSizePriority::Background, ) .await?; if include_non_incremental_logical_size { // XXX we should be using spawn_ondemand_logical_size_calculation here. // Otherwise, if someone deletes the timeline / detaches the tenant while // we're executing this function, we will outlive the timeline on-disk state. 
info.current_logical_size_non_incremental = Some( timeline .get_current_logical_size_non_incremental(info.last_record_lsn, ctx) .await?, ); } // HADRON if include_image_consistent_lsn { info.image_consistent_lsn = Some(timeline.compute_image_consistent_lsn().await?); } Ok(info) } async fn build_timeline_info_common( timeline: &Arc, ctx: &RequestContext, logical_size_task_priority: tenant::timeline::GetLogicalSizePriority, ) -> anyhow::Result { crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); let initdb_lsn = timeline.initdb_lsn; let last_record_lsn = timeline.get_last_record_lsn(); let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = { let guard = timeline.last_received_wal.lock().unwrap(); if let Some(info) = guard.as_ref() { ( Some(format!("{}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only. Some(info.last_received_msg_lsn), Some(info.last_received_msg_ts), ) } else { (None, None, None) } }; let ancestor_timeline_id = timeline.get_ancestor_timeline_id(); let ancestor_lsn = match timeline.get_ancestor_lsn() { Lsn(0) => None, lsn @ Lsn(_) => Some(lsn), }; let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx); let current_physical_size = Some(timeline.layer_size_sum().await); let state = timeline.current_state(); // Report is_archived = false if the timeline is still loading let is_archived = timeline.is_archived().unwrap_or(false); let remote_consistent_lsn_projected = timeline .get_remote_consistent_lsn_projected() .unwrap_or(Lsn(0)); let remote_consistent_lsn_visible = timeline .get_remote_consistent_lsn_visible() .unwrap_or(Lsn(0)); let is_invisible = timeline.remote_client.is_invisible().unwrap_or(false); let walreceiver_status = timeline.walreceiver_status(); let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats(); // Externally, expose the lowest LSN that can be used to create a branch. 
// Internally we distinguish between the planned GC cutoff (PITR point) and the "applied" GC cutoff (where we // actually trimmed data to), which can pass each other when PITR is changed. let min_readable_lsn = std::cmp::max( timeline.get_gc_cutoff_lsn().unwrap_or_default(), *timeline.get_applied_gc_cutoff_lsn(), ); let (rel_size_migration, rel_size_migrated_at) = timeline.get_rel_size_v2_status(); let info = TimelineInfo { tenant_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, ancestor_timeline_id, ancestor_lsn, disk_consistent_lsn: timeline.get_disk_consistent_lsn(), remote_consistent_lsn: remote_consistent_lsn_projected, remote_consistent_lsn_visible, initdb_lsn, last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), min_readable_lsn, applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(), current_logical_size: current_logical_size.size_dont_care_about_accuracy(), current_logical_size_is_accurate: match current_logical_size.accuracy() { tenant::timeline::logical_size::Accuracy::Approximate => false, tenant::timeline::logical_size::Accuracy::Exact => true, }, directory_entries_counts: timeline.get_directory_metrics().to_vec(), current_physical_size, current_logical_size_non_incremental: None, pitr_history_size, within_ancestor_pitr, timeline_dir_layer_file_size_sum: None, wal_source_connstr, last_received_msg_lsn, last_received_msg_ts, pg_version: timeline.pg_version, state, is_archived: Some(is_archived), rel_size_migration: Some(rel_size_migration), rel_size_migrated_at, is_invisible: Some(is_invisible), walreceiver_status, // HADRON image_consistent_lsn: None, }; Ok(info) } fn build_timeline_offloaded_info(offloaded: &Arc) -> OffloadedTimelineInfo { let &OffloadedTimeline { tenant_shard_id, timeline_id, ancestor_retain_lsn, ancestor_timeline_id, archived_at, .. 
} = offloaded.as_ref(); OffloadedTimelineInfo { tenant_id: tenant_shard_id, timeline_id, ancestor_retain_lsn, ancestor_timeline_id, archived_at: archived_at.and_utc(), } } // healthcheck handler async fn status_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { check_permission(&request, None)?; let config = get_config(&request); json_response(StatusCode::OK, StatusResponse { id: config.id }) } async fn reload_auth_validation_keys_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { check_permission(&request, None)?; let config = get_config(&request); let state = get_state(&request); let Some(shared_auth) = &state.auth else { return json_response(StatusCode::BAD_REQUEST, ()); }; // unwrap is ok because check is performed when creating config, so path is set and exists let key_path = config.auth_validation_public_key_path.as_ref().unwrap(); info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}"); match utils::auth::JwtAuth::from_key_path(key_path) { Ok(new_auth) => { shared_auth.swap(new_auth); json_response(StatusCode::OK, ()) } Err(e) => { let err_msg = "Error reloading public keys"; warn!("Error reloading public keys from {key_path:?}: {e:}"); json_response( StatusCode::INTERNAL_SERVER_ERROR, HttpErrorBody::from_msg(err_msg.to_string()), ) } } } async fn timeline_create_handler( mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let request_data: TimelineCreateRequest = json_request(&mut request).await?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let new_timeline_id = request_data.new_timeline_id; // fill in the default pg_version if not provided & convert request into domain model let params: tenant::CreateTimelineParams = match request_data.mode { TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id, pg_version, } => 
tenant::CreateTimelineParams::Bootstrap(tenant::CreateTimelineParamsBootstrap { new_timeline_id, existing_initdb_timeline_id, pg_version: pg_version.unwrap_or(DEFAULT_PG_VERSION), }), TimelineCreateRequestMode::Branch { ancestor_timeline_id, ancestor_start_lsn, read_only: _, pg_version: _, } => tenant::CreateTimelineParams::Branch(tenant::CreateTimelineParamsBranch { new_timeline_id, ancestor_timeline_id, ancestor_start_lsn, }), TimelineCreateRequestMode::ImportPgdata { import_pgdata: TimelineCreateRequestModeImportPgdata { location, idempotency_key, }, } => tenant::CreateTimelineParams::ImportPgdata(tenant::CreateTimelineParamsImportPgdata { idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new( idempotency_key.0, ), new_timeline_id, location: { use import_pgdata::index_part_format::Location; use pageserver_api::models::ImportPgdataLocation; match location { #[cfg(feature = "testing")] ImportPgdataLocation::LocalFs { path } => Location::LocalFs { path }, ImportPgdataLocation::AwsS3 { region, bucket, key, } => Location::AwsS3 { region, bucket, key, }, } }, }), }; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error); let state = get_state(&request); async { let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; // earlier versions of the code had pg_version and ancestor_lsn in the span // => continue to provide that information, but, through a log message that doesn't require us to destructure tracing::info!(?params, "creating timeline"); match tenant .create_timeline(params, state.broker_client.clone(), &ctx) .await { Ok(new_timeline) => { // Created. Construct a TimelineInfo for it. 
let timeline_info = build_timeline_info_common( &new_timeline, &ctx, tenant::timeline::GetLogicalSizePriority::User, ) .await .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } Err(_) if tenant.cancel.is_cancelled() => { // In case we get some ugly error type during shutdown, cast it into a clean 503. json_response( StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string()), ) } Err(e @ tenant::CreateTimelineError::Conflict) => { json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())) } Err(e @ tenant::CreateTimelineError::AlreadyCreating) => json_response( StatusCode::TOO_MANY_REQUESTS, HttpErrorBody::from_msg(e.to_string()), ), Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response( StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(format!("{err:#}")), ), Err(e @ tenant::CreateTimelineError::AncestorNotActive) => json_response( StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()), ), Err(e @ tenant::CreateTimelineError::AncestorArchived) => json_response( StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(e.to_string()), ), Err(tenant::CreateTimelineError::ShuttingDown) => json_response( StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("tenant shutting down".to_string()), ), Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)), } } .instrument(info_span!("timeline_create", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %new_timeline_id, )) .await } async fn timeline_list_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let include_non_incremental_logical_size: Option = parse_query_param(&request, "include-non-incremental-logical-size")?; let force_await_initial_logical_size: Option = parse_query_param(&request, 
"force-await-initial-logical-size")?; let include_image_consistent_lsn: Option = parse_query_param(&request, "include-image-consistent-lsn")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let response_data = async { let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; let timelines = tenant.list_timelines(); let mut response_data = Vec::with_capacity(timelines.len()); for timeline in timelines { let timeline_info = build_timeline_info( &timeline, include_non_incremental_logical_size.unwrap_or(false), force_await_initial_logical_size.unwrap_or(false), include_image_consistent_lsn.unwrap_or(false), &ctx, ) .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id)) .await .context("Failed to build timeline info") .map_err(ApiError::InternalServerError)?; response_data.push(timeline_info); } Ok::, ApiError>(response_data) } .instrument(info_span!("timeline_list", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await?; json_response(StatusCode::OK, response_data) } async fn timeline_and_offloaded_list_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let include_non_incremental_logical_size: Option = parse_query_param(&request, "include-non-incremental-logical-size")?; let force_await_initial_logical_size: Option = parse_query_param(&request, "force-await-initial-logical-size")?; let include_image_consistent_lsn: Option = parse_query_param(&request, "include-image-consistent-lsn")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let response_data = 
async { let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; let (timelines, offloadeds) = tenant.list_timelines_and_offloaded(); let mut timeline_infos = Vec::with_capacity(timelines.len()); for timeline in timelines { let timeline_info = build_timeline_info( &timeline, include_non_incremental_logical_size.unwrap_or(false), force_await_initial_logical_size.unwrap_or(false), include_image_consistent_lsn.unwrap_or(false), &ctx, ) .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id)) .await .context("Failed to build timeline info") .map_err(ApiError::InternalServerError)?; timeline_infos.push(timeline_info); } let offloaded_infos = offloadeds .into_iter() .map(|offloaded| build_timeline_offloaded_info(&offloaded)) .collect::>(); let res = TimelinesInfoAndOffloaded { timelines: timeline_infos, offloaded: offloaded_infos, }; Ok::(res) } .instrument(info_span!("timeline_and_offloaded_list", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await?; json_response(StatusCode::OK, response_data) } async fn timeline_preserve_initdb_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); // Part of the process for disaster recovery from safekeeper-stored WAL: // If we don't recover into a new timeline but want to keep the timeline ID, // then the initdb archive is deleted. This endpoint copies it to a different // location where timeline recreation cand find it. 
async { let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; let timeline = tenant.get_timeline(timeline_id, false)?; timeline .preserve_initdb_archive() .await .context("preserving initdb archive") .map_err(ApiError::InternalServerError)?; Ok::<_, ApiError>(()) } .instrument(info_span!("timeline_preserve_initdb_archive", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await?; json_response(StatusCode::OK, ()) } async fn timeline_archival_config_handler( mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); let request_data: TimelineArchivalConfigRequest = json_request(&mut request).await?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); async { let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; tenant .apply_timeline_archival_config( timeline_id, request_data.state, state.broker_client.clone(), ctx, ) .await?; Ok::<_, ApiError>(()) } .instrument(info_span!("timeline_archival_config", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), state = ?request_data.state, %timeline_id)) .await?; json_response(StatusCode::OK, ()) } /// This API is used to patch the index part of a timeline. You must ensure such patches are safe to apply. Use this API as an emergency /// measure only. /// /// Some examples of safe patches: /// - Increase the gc_cutoff and gc_compaction_cutoff to a larger value in case of a bug that didn't bump the cutoff and cause read errors. /// - Force set the index part to use reldir v2 (migrating/migrated). 
/// /// Some examples of unsafe patches: /// - Force set the index part from v2 to v1 (legacy). This will cause the code path to ignore anything written to the new keyspace and cause /// errors. /// - Decrease the gc_cutoff without validating the data really exists. It will cause read errors in the background. async fn timeline_patch_index_part_handler( mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let request_data: TimelinePatchIndexPartRequest = json_request(&mut request).await?; check_permission(&request, None)?; // require global permission for this request let state = get_state(&request); async { let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; if request_data.rel_size_migration.is_none() && request_data.rel_size_migrated_at.is_some() { return Err(ApiError::BadRequest(anyhow!( "updating rel_size_migrated_at without rel_size_migration is not allowed" ))); } if let Some(rel_size_migration) = request_data.rel_size_migration { timeline .update_rel_size_v2_status(rel_size_migration, request_data.rel_size_migrated_at) .map_err(ApiError::InternalServerError)?; } if let Some(gc_compaction_last_completed_lsn) = request_data.gc_compaction_last_completed_lsn { timeline .update_gc_compaction_state(GcCompactionState { last_completed_lsn: gc_compaction_last_completed_lsn, }) .map_err(ApiError::InternalServerError)?; } if let Some(applied_gc_cutoff_lsn) = request_data.applied_gc_cutoff_lsn { { let guard = timeline.applied_gc_cutoff_lsn.lock_for_write(); guard.store_and_unlock(applied_gc_cutoff_lsn); } } if request_data.force_index_update { timeline .remote_client .force_schedule_index_upload() .context("force schedule index upload") .map_err(ApiError::InternalServerError)?; } Ok::<_, ApiError>(()) } 
.instrument(info_span!("timeline_patch_index_part", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await?; json_response(StatusCode::OK, ()) } async fn timeline_detail_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let include_non_incremental_logical_size: Option = parse_query_param(&request, "include-non-incremental-logical-size")?; let force_await_initial_logical_size: Option = parse_query_param(&request, "force-await-initial-logical-size")?; // HADRON let include_image_consistent_lsn: Option = parse_query_param(&request, "include-image-consistent-lsn")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; // Logical size calculation needs downloading. let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let state = get_state(&request); let timeline_info = async { let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; let timeline = tenant.get_timeline(timeline_id, false)?; let ctx = &ctx.with_scope_timeline(&timeline); let timeline_info = build_timeline_info( &timeline, include_non_incremental_logical_size.unwrap_or(false), force_await_initial_logical_size.unwrap_or(false), include_image_consistent_lsn.unwrap_or(false), ctx, ) .await .context("get local timeline info") .map_err(ApiError::InternalServerError)?; Ok::<_, ApiError>(timeline_info) } .instrument(info_span!("timeline_detail", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await?; json_response(StatusCode::OK, timeline_info) } async fn get_lsn_by_timestamp_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = 
parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Lsn calculations by timestamp are only available on shard zero" ))); } let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let timestamp_raw = must_get_query_param(&request, "timestamp")?; let timestamp = humantime::parse_rfc3339(×tamp_raw) .with_context(|| format!("Invalid time: {timestamp_raw:?}")) .map_err(ApiError::BadRequest)?; let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp); let with_lease = parse_query_param(&request, "with_lease")?.unwrap_or(false); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) .with_scope_timeline(&timeline); let result = timeline .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx) .await?; #[derive(serde::Serialize, Debug)] struct Result { lsn: Lsn, kind: &'static str, #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] #[serde(flatten)] lease: Option, } let (lsn, kind) = match result { LsnForTimestamp::Present(lsn) => (lsn, "present"), LsnForTimestamp::Future(lsn) => (lsn, "future"), LsnForTimestamp::Past(lsn) => (lsn, "past"), LsnForTimestamp::NoData(lsn) => (lsn, "nodata"), }; let lease = if with_lease { timeline .init_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx) .inspect_err(|_| { warn!("fail to grant a lease to {}", lsn); }) .ok() } else { None }; let result = Result { lsn, kind, lease }; let valid_until = result .lease .as_ref() .map(|l| humantime::format_rfc3339_millis(l.valid_until).to_string()); tracing::info!( lsn=?result.lsn, kind=%result.kind, timestamp=%timestamp_raw, valid_until=?valid_until, "lsn_by_timestamp finished" ); 
json_response(StatusCode::OK, result) } async fn get_timestamp_of_lsn_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Timestamp calculations by lsn are only available on shard zero" ))); } let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let lsn_str = must_get_query_param(&request, "lsn")?; let lsn = Lsn::from_str(&lsn_str) .with_context(|| format!("Invalid LSN: {lsn_str:?}")) .map_err(ApiError::BadRequest)?; let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) .with_scope_timeline(&timeline); let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?; match result { Some(time) => { let time = format_rfc3339( postgres_ffi::try_from_pg_timestamp(time).map_err(ApiError::InternalServerError)?, ) .to_string(); json_response(StatusCode::OK, time) } None => Err(ApiError::PreconditionFailed( format!("Timestamp for lsn {lsn} not found").into(), )), } } async fn timeline_delete_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id) .map_err(|e| { match e { // GetTenantError has a built-in conversion to ApiError, but in this context we don't // want to treat missing tenants as 404, to avoid ambiguity 
with successful deletions. GetTenantError::NotFound(_) | GetTenantError::ShardNotFound(_) => { ApiError::PreconditionFailed( "Requested tenant is missing".to_string().into_boxed_str(), ) } e => e.into(), } })?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id)) .await?; json_response(StatusCode::ACCEPTED, ()) } async fn tenant_reset_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let drop_cache: Option = parse_query_param(&request, "drop_cache")?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); let state = get_state(&request); state .tenant_manager .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), &ctx) .await .map_err(ApiError::InternalServerError)?; json_response(StatusCode::OK, ()) } async fn tenant_list_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { check_permission(&request, None)?; let state = get_state(&request); let response_data = state .tenant_manager .list_tenants() .map_err(|_| { ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into()) })? 
.iter() .map(|(id, state, gen_)| TenantInfo { id: *id, state: state.clone(), current_physical_size: None, attachment_status: state.attachment_status(), generation: (*gen_) .into() .expect("Tenants are always attached with a generation"), gc_blocking: None, }) .collect::>(); json_response(StatusCode::OK, response_data) } async fn tenant_status( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); // In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting. let activate = true; #[cfg(feature = "testing")] let activate = parse_query_param(&request, "activate")?.unwrap_or(activate); let tenant_info = async { let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; if activate { // This is advisory: we prefer to let the tenant activate on-demand when this function is // called, but it is still valid to return 200 and describe the current state of the tenant // if it doesn't make it into an active state. 
tenant .wait_to_become_active(ACTIVE_TENANT_TIMEOUT) .await .ok(); } // Calculate total physical size of all timelines let mut current_physical_size = 0; for timeline in tenant.list_timelines().iter() { current_physical_size += timeline.layer_size_sum().await; } let state = tenant.current_state(); Result::<_, ApiError>::Ok(TenantDetails { tenant_info: TenantInfo { id: tenant_shard_id, state: state.clone(), current_physical_size: Some(current_physical_size), attachment_status: state.attachment_status(), generation: tenant .generation() .into() .expect("Tenants are always attached with a generation"), gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")), }, walredo: tenant.wal_redo_manager_status(), timelines: tenant.list_timeline_ids(), }) } .instrument(info_span!("tenant_status_handler", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await?; json_response(StatusCode::OK, tenant_info) } async fn tenant_delete_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { // TODO openapi spec let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); state .tenant_manager .delete_tenant(tenant_shard_id) .instrument(info_span!("tenant_delete_handler", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug() )) .await?; json_response(StatusCode::OK, ()) } /// HTTP endpoint to query the current tenant_size of a tenant. /// /// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used /// to debug any of the calculations. Requires `tenant_id` request parameter, supports /// `inputs_only=true|false` (default false) which supports debugging failure to calculate model /// values. /// /// 'retention_period' query parameter overrides the cutoff that is used to calculate the size /// (only if it is shorter than the real cutoff). 
/// /// Note: we don't update the cached size and prometheus metric here. /// The retention period might be different, and it's nice to have a method to just calculate it /// without modifying anything anyway. async fn tenant_size_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let inputs_only: Option = parse_query_param(&request, "inputs_only")?; let retention_period: Option = parse_query_param(&request, "retention_period")?; let headers = request.headers(); let state = get_state(&request); if !tenant_shard_id.is_shard_zero() { return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" ))); } let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; // this can be long operation let inputs = tenant .gather_size_inputs( retention_period, LogicalSizeCalculationCause::TenantSizeHandler, &cancel, &ctx, ) .await .map_err(|e| match e { crate::tenant::size::CalculateSyntheticSizeError::Cancelled => ApiError::ShuttingDown, other => ApiError::InternalServerError(anyhow::anyhow!(other)), })?; let mut sizes = None; let accepts_html = headers .get(header::ACCEPT) .map(|v| v == "text/html") .unwrap_or_default(); if !inputs_only.unwrap_or(false) { let storage_model = inputs.calculate_model(); let size = storage_model.calculate(); // If request header expects html, return html if accepts_html { return synthetic_size_html_response(inputs, storage_model, size); } sizes = Some(size); } else if accepts_html { return Err(ApiError::BadRequest(anyhow!( "inputs_only parameter is incompatible with html output request" ))); } /// The type resides in the pageserver not to expose `ModelInputs`. 
#[derive(serde::Serialize)] struct TenantHistorySize { id: TenantId, /// Size is a mixture of WAL and logical size, so the unit is bytes. /// /// Will be none if `?inputs_only=true` was given. size: Option, /// Size of each segment used in the model. /// Will be null if `?inputs_only=true` was given. segment_sizes: Option>, inputs: crate::tenant::size::ModelInputs, } json_response( StatusCode::OK, TenantHistorySize { id: tenant_shard_id.tenant_id, size: sizes.as_ref().map(|x| x.total_size), segment_sizes: sizes.map(|x| x.segments), inputs, }, ) } async fn tenant_shard_split_handler( mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let req: TenantShardSplitRequest = json_request(&mut request).await?; let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let state = get_state(&request); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; let new_shards = state .tenant_manager .shard_split( tenant, ShardCount::new(req.new_shard_count), req.new_stripe_size, &ctx, ) .await .map_err(ApiError::InternalServerError)?; json_response(StatusCode::OK, TenantShardSplitResponse { new_shards }) } async fn layer_map_info_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let reset: LayerAccessStatsReset = parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset); let state = get_state(&request); check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let layer_map_info = timeline .layer_map_info(reset) .await .map_err(|_shutdown| 
ApiError::ShuttingDown)?; json_response(StatusCode::OK, layer_map_info) } #[instrument(skip_all, fields(tenant_id, shard_id, timeline_id, layer_name))] async fn timeline_layer_scan_disposable_keys( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_name: LayerName = parse_request_param(&request, "layer_name")?; tracing::Span::current().record( "tenant_id", tracing::field::display(&tenant_shard_id.tenant_id), ); tracing::Span::current().record( "shard_id", tracing::field::display(tenant_shard_id.shard_slug()), ); tracing::Span::current().record("timeline_id", tracing::field::display(&timeline_id)); tracing::Span::current().record("layer_name", tracing::field::display(&layer_name)); let state = get_state(&request); check_permission(&request, Some(tenant_shard_id.tenant_id))?; // technically the timeline need not be active for this scan to complete let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) .with_scope_timeline(&timeline); let guard = timeline .layers .read(LayerManagerLockHolder::GetLayerMapInfo) .await; let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else { return Err(ApiError::NotFound( anyhow::anyhow!("Layer {tenant_shard_id}/{timeline_id}/{layer_name} not found").into(), )); }; let resident_layer = layer .download_and_keep_resident(&ctx) .await .map_err(|err| match err { tenant::storage_layer::layer::DownloadError::TimelineShutdown | tenant::storage_layer::layer::DownloadError::DownloadCancelled => { ApiError::ShuttingDown } tenant::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads | tenant::storage_layer::layer::DownloadError::DownloadRequired | 
tenant::storage_layer::layer::DownloadError::NotFile(_) | tenant::storage_layer::layer::DownloadError::DownloadFailed | tenant::storage_layer::layer::DownloadError::PreStatFailed(_) => { ApiError::InternalServerError(err.into()) } #[cfg(test)] tenant::storage_layer::layer::DownloadError::Failpoint(_) => { ApiError::InternalServerError(err.into()) } })?; let keys = resident_layer .load_keys(&ctx) .await .map_err(ApiError::InternalServerError)?; let shard_identity = timeline.get_shard_identity(); let mut disposable_count = 0; let mut not_disposable_count = 0; let cancel = cancel.clone(); for (i, key) in keys.into_iter().enumerate() { if shard_identity.is_key_disposable(&key) { disposable_count += 1; tracing::debug!(key = %key, key.dbg=?key, "disposable key"); } else { not_disposable_count += 1; } #[allow(clippy::collapsible_if)] if i % 10000 == 0 { if cancel.is_cancelled() || timeline.cancel.is_cancelled() || timeline.is_stopping() { return Err(ApiError::ShuttingDown); } } } json_response( StatusCode::OK, pageserver_api::models::ScanDisposableKeysResponse { disposable_count, not_disposable_count, }, ) } async fn timeline_download_heatmap_layers_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { // Only used in the case where remote storage is not configured. const DEFAULT_MAX_CONCURRENCY: usize = 100; // A conservative default. 
const DEFAULT_CONCURRENCY: usize = 16; let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let desired_concurrency = parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY); let recurse = parse_query_param(&request, "recurse")?.unwrap_or(false); check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) .with_scope_timeline(&timeline); let max_concurrency = get_config(&request) .remote_storage_config .as_ref() .map(|c| c.concurrency_limit()) .unwrap_or(DEFAULT_MAX_CONCURRENCY); let concurrency = std::cmp::min(max_concurrency, desired_concurrency); timeline.start_heatmap_layers_download(concurrency, recurse, &ctx)?; json_response(StatusCode::ACCEPTED, ()) } async fn timeline_shutdown_download_heatmap_layers_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; timeline.stop_and_drain_heatmap_layers_download().await; json_response(StatusCode::OK, ()) } async fn layer_download_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; check_permission(&request, 
Some(tenant_shard_id.tenant_id))?; let layer_name = LayerName::from_str(layer_file_name) .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; let state = get_state(&request); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) .with_scope_timeline(&timeline); let downloaded = timeline .download_layer(&layer_name, &ctx) .await .map_err(|e| match e { tenant::storage_layer::layer::DownloadError::TimelineShutdown | tenant::storage_layer::layer::DownloadError::DownloadCancelled => { ApiError::ShuttingDown } other => ApiError::InternalServerError(other.into()), })?; match downloaded { Some(true) => json_response(StatusCode::OK, ()), Some(false) => json_response(StatusCode::NOT_MODIFIED, ()), None => json_response( StatusCode::BAD_REQUEST, format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"), ), } } async fn evict_timeline_layer_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; let state = get_state(&request); let layer_name = LayerName::from_str(layer_file_name) .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let evicted = timeline .evict_layer(&layer_name) .await .map_err(ApiError::InternalServerError)?; match evicted { Some(true) => json_response(StatusCode::OK, ()), Some(false) => json_response(StatusCode::NOT_MODIFIED, ()), None => json_response( StatusCode::BAD_REQUEST, format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"), ), } } async fn 
timeline_gc_blocking_handler(
    request: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    // `true` selects the "block" direction of the shared implementation.
    block_or_unblock_gc(request, true).await
}

/// Counterpart of the GC blocking handler: delegates to [`block_or_unblock_gc`] with
/// `block = false`.
async fn timeline_gc_unblocking_handler(
    request: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    block_or_unblock_gc(request, false).await
}

/// Traces GetPage@LSN requests for a timeline, and emits metadata in an efficient binary encoding.
/// Use the `pagectl page-trace` command to decode and analyze the output.
///
/// Query parameters:
/// - `size_limit_bytes` (default 1 MiB): collection stops once the response body has reached
///   this many bytes. The check happens before each event is appended.
/// - `time_limit_secs` (default 5): collection stops once this much time has elapsed.
///
/// Requires global permission (`check_permission` with `None`). Only one trace can be
/// installed per timeline at a time; a concurrent request gets `ApiError::Conflict`. If the
/// request's cancellation token fires while collecting, `ApiError::Cancelled` is returned.
/// The trace is uninstalled again on every return path.
async fn timeline_page_trace_handler(
    request: Request<hyper::Body>,
    cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let state = get_state(&request);
    check_permission(&request, None)?;

    // NOTE(review): `size_limit_bytes` has no upper bound; it is used directly as the
    // capacity of the response buffer below.
    let size_limit: usize = parse_query_param(&request, "size_limit_bytes")?.unwrap_or(1024 * 1024);
    let time_limit_secs: u64 = parse_query_param(&request, "time_limit_secs")?.unwrap_or(5);

    // Convert size limit to event limit based on the serialized size of an event. The event size is
    // fixed, as the default bincode serializer uses fixed-width integer encoding.
    let event_size = bincode::serialize(&PageTraceEvent::default())
        .map_err(|err| ApiError::InternalServerError(err.into()))?
        .len();
    // `tokio::sync::mpsc::channel` panics on a zero capacity, which the plain division yields
    // whenever `size_limit_bytes` is smaller than one serialized event (including 0). Always
    // leave room for at least one event; the collection loop below still checks `size_limit`
    // before appending each event.
    let event_limit = (size_limit / event_size).max(1);

    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;

    // Install a page trace, unless one is already in progress. We just use a buffered channel,
    // which may 2x the memory usage in the worst case, but it's still bounded.
    let (trace_tx, mut trace_rx) = tokio::sync::mpsc::channel(event_limit);
    let cur = timeline.page_trace.load();
    let installed = cur.is_none()
        && timeline
            .page_trace
            .compare_and_swap(cur, Some(Arc::new(trace_tx)))
            .is_none();
    if !installed {
        return Err(ApiError::Conflict("page trace already active".to_string()));
    }
    defer!(timeline.page_trace.store(None)); // uninstall on return

    // Collect the trace and return it to the client. We could stream the response, but this is
    // simple and fine.
    let mut body = Vec::with_capacity(size_limit);
    let deadline = Instant::now() + Duration::from_secs(time_limit_secs);

    while body.len() < size_limit {
        tokio::select! {
            event = trace_rx.recv() => {
                let Some(event) = event else {
                    break; // shouldn't happen (sender doesn't close, unless timeline dropped)
                };
                bincode::serialize_into(&mut body, &event)
                    .map_err(|err| ApiError::InternalServerError(err.into()))?;
            }
            _ = tokio::time::sleep_until(deadline) => break, // time limit reached
            _ = cancel.cancelled() => return Err(ApiError::Cancelled),
        }
    }

    Ok(Response::builder()
        .status(StatusCode::OK)
        .header(header::CONTENT_TYPE, "application/octet-stream")
        .body(hyper::Body::from(body))
        .unwrap())
}

/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
///
/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
async fn block_or_unblock_gc( request: Request, block: bool, ) -> Result, ApiError> { use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::upload_queue::NotInitialized; let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let state = get_state(&request); let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; let timeline = tenant.get_timeline(timeline_id, true)?; let fut = async { if block { timeline.block_gc(&tenant).await.map(|_| ()) } else { timeline.unblock_gc(&tenant).await } }; let span = tracing::info_span!( "block_or_unblock_gc", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id, block = block, ); let res = fut.instrument(span).await; res.map_err(|e| { if e.is::() || e.is::() { ApiError::ShuttingDown } else { ApiError::InternalServerError(e) } })?; json_response(StatusCode::OK, ()) } /// Get tenant_size SVG graph along with the JSON data. fn synthetic_size_html_response( inputs: ModelInputs, storage_model: StorageModel, sizes: SizeResult, ) -> Result, ApiError> { let mut timeline_ids: Vec = Vec::new(); let mut timeline_map: HashMap = HashMap::new(); for (index, ti) in inputs.timeline_inputs.iter().enumerate() { timeline_map.insert(ti.timeline_id, index); timeline_ids.push(ti.timeline_id.to_string()); } let seg_to_branch: Vec<(usize, SvgBranchKind)> = inputs .segments .iter() .map(|seg| { ( *timeline_map.get(&seg.timeline_id).unwrap(), seg.kind.into(), ) }) .collect(); let svg = tenant_size_model::svg::draw_svg(&storage_model, &timeline_ids, &seg_to_branch, &sizes) .map_err(ApiError::InternalServerError)?; let mut response = String::new(); use std::fmt::Write; write!(response, "\n\n").unwrap(); write!(response, "
\n{svg}\n
").unwrap(); writeln!(response, "Project size: {}", sizes.total_size).unwrap(); writeln!(response, "
").unwrap();
    writeln!(
        response,
        "{}",
        serde_json::to_string_pretty(&inputs).unwrap()
    )
    .unwrap();
    writeln!(
        response,
        "{}",
        serde_json::to_string_pretty(&sizes.segments).unwrap()
    )
    .unwrap();
    writeln!(response, "
").unwrap(); write!(response, "\n\n").unwrap(); html_response(StatusCode::OK, response) } pub fn html_response(status: StatusCode, data: String) -> Result, ApiError> { let response = Response::builder() .status(status) .header(header::CONTENT_TYPE, "text/html") .body(Body::from(data.as_bytes().to_vec())) .map_err(|e| ApiError::InternalServerError(e.into()))?; Ok(response) } async fn get_tenant_config_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; let response = HashMap::from([ ( "tenant_specific_overrides", serde_json::to_value(tenant.tenant_specific_overrides()) .context("serializing tenant specific overrides") .map_err(ApiError::InternalServerError)?, ), ( "effective_config", serde_json::to_value(tenant.effective_config()) .context("serializing effective config") .map_err(ApiError::InternalServerError)?, ), ]); json_response(StatusCode::OK, response) } async fn update_tenant_config_handler( mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let request_data: TenantConfigRequest = json_request(&mut request).await?; let tenant_id = request_data.tenant_id; check_permission(&request, Some(tenant_id))?; let new_tenant_conf = request_data.config; let state = get_state(&request); let tenant_shard_id = TenantShardId::unsharded(tenant_id); let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; // This is a legacy API that only operates on attached tenants: the preferred // API to use is the location_config/ endpoint, which lets the caller provide // the full LocationConf. 
let location_conf = LocationConf::attached_single( new_tenant_conf.clone(), tenant.get_generation(), ShardParameters::from(tenant.get_shard_identity()), ); tenant .get_shard_identity() .assert_equal(location_conf.shard); // not strictly necessary since we construct it above crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; let _ = tenant .update_tenant_config(|_crnt| Ok(new_tenant_conf.clone())) .expect("Closure returns Ok()"); json_response(StatusCode::OK, ()) } async fn patch_tenant_config_handler( mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let request_data: TenantConfigPatchRequest = json_request(&mut request).await?; let tenant_id = request_data.tenant_id; check_permission(&request, Some(tenant_id))?; let state = get_state(&request); let tenant_shard_id = TenantShardId::unsharded(tenant_id); let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; let updated = tenant .update_tenant_config(|crnt| { crnt.apply_patch(request_data.config.clone()) .map_err(anyhow::Error::new) }) .map_err(ApiError::BadRequest)?; // This is a legacy API that only operates on attached tenants: the preferred // API to use is the location_config/ endpoint, which lets the caller provide // the full LocationConf. 
let location_conf = LocationConf::attached_single( updated, tenant.get_generation(), ShardParameters::from(tenant.get_shard_identity()), ); tenant .get_shard_identity() .assert_equal(location_conf.shard); // not strictly necessary since we construct it above crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; json_response(StatusCode::OK, ()) } async fn put_tenant_location_config_handler( mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let request_data: TenantLocationConfigRequest = json_request(&mut request).await?; let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis); let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false); check_permission(&request, Some(tenant_shard_id.tenant_id))?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); let state = get_state(&request); let conf = state.conf; fail::fail_point!("put-location-conf-handler", |_| { Err(ApiError::ResourceUnavailable("failpoint".into())) }); // The `Detached` state is special, it doesn't upsert a tenant, it removes // its local disk content and drops it from memory. if let LocationConfigMode::Detached = request_data.config.mode { if let Err(e) = state .tenant_manager .detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client) .instrument(info_span!("tenant_detach", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug() )) .await { match e { TenantStateError::SlotError(TenantSlotError::NotFound(_)) => { // This API is idempotent: a NotFound on a detach is fine. 
} _ => return Err(e.into()), } } return json_response(StatusCode::OK, ()); } let location_conf = LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?; // lazy==true queues up for activation or jumps the queue like normal when a compute connects, // similar to at startup ordering. let spawn_mode = if lazy { tenant::SpawnMode::Lazy } else { tenant::SpawnMode::Eager }; let tenant = state .tenant_manager .upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx) .await?; let stripe_size = tenant.as_ref().map(|t| t.get_shard_stripe_size()); let attached = tenant.is_some(); if let Some(_flush_ms) = flush { match state .secondary_controller .upload_tenant(tenant_shard_id) .await { Ok(()) => { tracing::info!("Uploaded heatmap during flush"); } Err(e) => { tracing::warn!("Failed to flush heatmap: {e}"); } } } else { tracing::info!("No flush requested when configuring"); } // This API returns a vector of pageservers where the tenant is attached: this is // primarily for use in the sharding service. For compatibilty, we also return this // when called directly on a pageserver, but the payload is always zero or one shards. 
let mut response = TenantLocationConfigResponse { shards: Vec::new(), stripe_size: None, }; if attached { response.shards.push(TenantShardLocation { shard_id: tenant_shard_id, node_id: state.conf.id, }); if tenant_shard_id.shard_count.count() > 1 { // Stripe size should be set if we are attached debug_assert!(stripe_size.is_some()); response.stripe_size = stripe_size; } } json_response(StatusCode::OK, response) } async fn list_location_config_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let state = get_state(&request); let slots = state.tenant_manager.list(); let result = LocationConfigListResponse { tenant_shards: slots .into_iter() .map(|(tenant_shard_id, slot)| { let v = match slot { TenantSlot::Attached(t) => Some(t.get_location_conf()), TenantSlot::Secondary(s) => Some(s.get_location_conf()), TenantSlot::InProgress(_) => None, }; (tenant_shard_id, v) }) .collect(), }; json_response(StatusCode::OK, result) } async fn get_location_config_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let state = get_state(&request); let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let slot = state.tenant_manager.get(tenant_shard_id); let Some(slot) = slot else { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant shard not found").into(), )); }; let result: Option = match slot { TenantSlot::Attached(t) => Some(t.get_location_conf()), TenantSlot::Secondary(s) => Some(s.get_location_conf()), TenantSlot::InProgress(_) => None, }; json_response(StatusCode::OK, result) } // Do a time travel recovery on the given tenant/tenant shard. Tenant needs to be detached // (from all pageservers) as it invalidates consistency assumptions. 
async fn tenant_time_travel_remote_storage_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timestamp_raw = must_get_query_param(&request, "travel_to")?; let timestamp = humantime::parse_rfc3339(×tamp_raw) .with_context(|| format!("Invalid time for travel_to: {timestamp_raw:?}")) .map_err(ApiError::BadRequest)?; let done_if_after_raw = must_get_query_param(&request, "done_if_after")?; let done_if_after = humantime::parse_rfc3339(&done_if_after_raw) .with_context(|| format!("Invalid time for done_if_after: {done_if_after_raw:?}")) .map_err(ApiError::BadRequest)?; // This is just a sanity check to fend off naive wrong usages of the API: // the tenant needs to be detached *everywhere* let state = get_state(&request); let we_manage_tenant = state.tenant_manager.manages_tenant_shard(tenant_shard_id); if we_manage_tenant { return Err(ApiError::BadRequest(anyhow!( "Tenant {tenant_shard_id} is already attached at this pageserver" ))); } if timestamp > done_if_after { return Err(ApiError::BadRequest(anyhow!( "The done_if_after timestamp comes before the timestamp to recover to" ))); } tracing::info!( "Issuing time travel request internally. 
timestamp={timestamp_raw}, done_if_after={done_if_after_raw}" ); remote_timeline_client::upload::time_travel_recover_tenant( &state.remote_storage, &tenant_shard_id, timestamp, done_if_after, &cancel, ) .await .map_err(|e| match e { TimeTravelError::BadInput(e) => { warn!("bad input error: {e}"); ApiError::BadRequest(anyhow!("bad input error")) } TimeTravelError::Unimplemented => { ApiError::BadRequest(anyhow!("unimplemented for the configured remote storage")) } TimeTravelError::Cancelled => ApiError::InternalServerError(anyhow!("cancelled")), TimeTravelError::TooManyVersions => { ApiError::InternalServerError(anyhow!("too many versions in remote storage")) } TimeTravelError::Other(e) => { warn!("internal error: {e}"); ApiError::InternalServerError(anyhow!("internal error")) } })?; json_response(StatusCode::OK, ()) } /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`]. async fn handle_tenant_break( r: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?; let state = get_state(&r); state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)? .set_broken("broken from test".to_owned()) .await; json_response(StatusCode::OK, ()) } // Obtains an lsn lease on the given timeline. 
async fn lsn_lease_handler( mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let lsn = json_request::(&mut request).await?.lsn; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let state = get_state(&request); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let result = async { timeline .init_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx) .map_err(|e| { ApiError::InternalServerError( e.context(format!("invalid lsn lease request at {lsn}")), ) }) } .instrument(info_span!("init_lsn_lease", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await?; json_response(StatusCode::OK, result) } // Run GC immediately on given timeline. 
async fn timeline_gc_handler( mut request: Request, cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let gc_req: TimelineGcRequest = json_request(&mut request).await?; let state = get_state(&request); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let gc_result = state .tenant_manager .immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx) .await?; json_response(StatusCode::OK, gc_result) } // Cancel scheduled compaction tasks async fn timeline_cancel_compact_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); async { let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; tenant.cancel_scheduled_compaction(timeline_id); json_response(StatusCode::OK, ()) } .instrument(info_span!("timeline_cancel_compact", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } // Get compact info of a timeline async fn timeline_compact_info_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); async { let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; let resp = tenant.get_scheduled_compaction_tasks(timeline_id); json_response(StatusCode::OK, 
resp) } .instrument(info_span!("timeline_compact_info", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } // Run compaction immediately on given timeline. async fn timeline_compact_handler( mut request: Request, cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let compact_request = json_request_maybe::>(&mut request).await?; let state = get_state(&request); let mut flags = EnumSet::empty(); if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? { flags |= CompactFlags::ForceL0Compaction; } if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { flags |= CompactFlags::ForceImageLayerCreation; } if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? { flags |= CompactFlags::EnhancedGcBottomMostCompaction; } if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? { flags |= CompactFlags::DryRun; } // Manual compaction does not yield for L0. let wait_until_uploaded = parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); let wait_until_scheduled_compaction_done = parse_query_param::<_, bool>(&request, "wait_until_scheduled_compaction_done")? 
.unwrap_or(false); let sub_compaction = compact_request .as_ref() .map(|r| r.sub_compaction) .unwrap_or(false); let sub_compaction_max_job_size_mb = compact_request .as_ref() .and_then(|r| r.sub_compaction_max_job_size_mb); let options = CompactOptions { compact_key_range: compact_request .as_ref() .and_then(|r| r.compact_key_range.clone()), compact_lsn_range: compact_request .as_ref() .and_then(|r| r.compact_lsn_range.clone()), flags, sub_compaction, sub_compaction_max_job_size_mb, gc_compaction_do_metadata_compaction: false, }; let scheduled = compact_request .as_ref() .map(|r| r.scheduled) .unwrap_or(false); async { let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline); if scheduled { let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; let rx = tenant.schedule_compaction(timeline_id, options).await.map_err(ApiError::InternalServerError)?; if wait_until_scheduled_compaction_done { // It is possible that this will take a long time, dropping the HTTP request will not cancel the compaction. 
rx.await.ok(); } } else { timeline .compact_with_options(&cancel, options, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; if wait_until_uploaded { timeline.remote_client.wait_completion().await // XXX map to correct ApiError for the cases where it's due to shutdown .context("wait completion").map_err(ApiError::InternalServerError)?; } } json_response(StatusCode::OK, ()) } .instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } async fn timeline_mark_invisible_handler( mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let compact_request = json_request_maybe::>(&mut request).await?; let state = get_state(&request); let visibility = match compact_request { Some(req) => match req.is_visible { Some(true) => TimelineVisibilityState::Visible, Some(false) | None => TimelineVisibilityState::Invisible, }, None => TimelineVisibilityState::Invisible, }; async { let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; let timeline = tenant.get_timeline(timeline_id, true)?; timeline.remote_client.schedule_index_upload_for_timeline_invisible_state(visibility).map_err(ApiError::InternalServerError)?; json_response(StatusCode::OK, ()) } .instrument(info_span!("manual_timeline_mark_invisible", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } // Run offload immediately on given timeline. 
async fn timeline_offload_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); async { let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; if tenant.get_offloaded_timeline(timeline_id).is_ok() { return json_response(StatusCode::OK, ()); } let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; if !tenant.timeline_has_no_attached_children(timeline_id) { return Err(ApiError::PreconditionFailed( "timeline has attached children".into(), )); } if let (false, reason) = timeline.can_offload() { return Err(ApiError::PreconditionFailed( format!("Timeline::can_offload() check failed: {reason}") .into(), )); } offload_timeline(&tenant, &timeline) .await .map_err(|e| { match e { OffloadError::Cancelled => ApiError::ResourceUnavailable("Timeline shutting down".into()), OffloadError::AlreadyInProgress => ApiError::Conflict("Timeline already being offloaded or deleted".into()), _ => ApiError::InternalServerError(anyhow!(e)) } })?; json_response(StatusCode::OK, ()) } .instrument(info_span!("manual_timeline_offload", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } // Run checkpoint immediately on given timeline. 
async fn timeline_checkpoint_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); let mut flags = EnumSet::empty(); if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? { flags |= CompactFlags::ForceL0Compaction; } if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { flags |= CompactFlags::ForceImageLayerCreation; } // By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload. let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true); let wait_until_flushed: bool = parse_query_param(&request, "wait_until_flushed")?.unwrap_or(true); let wait_until_uploaded = parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); async { let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline); if wait_until_flushed { timeline.freeze_and_flush().await } else { timeline.freeze().await.and(Ok(())) }.map_err(|e| { match e { tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown, other => ApiError::InternalServerError(other.into()), } })?; if compact { timeline .compact(&cancel, flags, &ctx) .await .map_err(|e| if e.is_cancel() { ApiError::ShuttingDown } else { ApiError::InternalServerError(e.into_anyhow()) } )?; } if wait_until_uploaded { tracing::info!("Waiting for uploads to complete..."); 
timeline.remote_client.wait_completion().await // XXX map to correct ApiError for the cases where it's due to shutdown .context("wait completion").map_err(ApiError::InternalServerError)?; tracing::info!("Uploads completed up to {}", timeline.get_remote_consistent_lsn_projected().unwrap_or(Lsn(0))); } json_response(StatusCode::OK, ()) } .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } async fn timeline_download_remote_layers_handler_post( mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) .with_scope_timeline(&timeline); match timeline.spawn_download_all_remote_layers(body, &ctx).await { Ok(st) => json_response(StatusCode::ACCEPTED, st), Err(st) => json_response(StatusCode::CONFLICT, st), } } async fn timeline_download_remote_layers_handler_get( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let state = get_state(&request); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let info = timeline .get_download_all_remote_layers_task_info() .context("task never started since last pageserver process start") .map_err(|e| 
ApiError::NotFound(e.into()))?; json_response(StatusCode::OK, info) } async fn timeline_detach_ancestor_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { use pageserver_api::models::detach_ancestor::AncestorDetached; use crate::tenant::timeline::detach_ancestor; let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let behavior: Option = parse_query_param(&request, "detach_behavior")?; let behavior = behavior.unwrap_or_default(); let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); async move { let mut options = detach_ancestor::Options::default(); let rewrite_concurrency = parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?; let copy_concurrency = parse_query_param::<_, std::num::NonZeroUsize>(&request, "copy_concurrency")?; [ (&mut options.rewrite_concurrency, rewrite_concurrency), (&mut options.copy_concurrency, copy_concurrency), ] .into_iter() .filter_map(|(target, val)| val.map(|val| (target, val))) .for_each(|(target, val)| *target = val); let state = get_state(&request); let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download); let ctx = &ctx; // Flush the upload queues of all timelines before detaching ancestor. We do the same thing again // during shutdown. This early upload ensures the pageserver does not need to upload too many // things and creates downtime during timeline reloads. 
for timeline in tenant.list_timelines() { timeline .remote_client .wait_completion() .await .map_err(|e| { ApiError::PreconditionFailed(format!("cannot drain upload queue: {e}").into()) })?; } tracing::info!("all timeline upload queues are drained"); let timeline = tenant.get_timeline(timeline_id, true)?; let ctx = &ctx.with_scope_timeline(&timeline); let progress = timeline .prepare_to_detach_from_ancestor(&tenant, options, behavior, ctx) .await?; // uncomment to allow early as possible Tenant::drop // drop(tenant); let resp = match progress { detach_ancestor::Progress::Prepared(attempt, prepared) => { // it would be great to tag the guard on to the tenant activation future let reparented_timelines = state .tenant_manager .complete_detaching_timeline_ancestor( tenant_shard_id, timeline_id, prepared, behavior, attempt, ctx, ) .await?; AncestorDetached { reparented_timelines, } } detach_ancestor::Progress::Done(resp) => resp, }; json_response(StatusCode::OK, resp) } .instrument(span) .await } async fn deletion_queue_flush( r: Request, cancel: CancellationToken, ) -> Result, ApiError> { let state = get_state(&r); let execute = parse_query_param(&r, "execute")?.unwrap_or(false); let flush = async { if execute { state.deletion_queue_client.flush_execute().await } else { state.deletion_queue_client.flush().await } } // DeletionQueueError's only case is shutting down. .map_err(|_| ApiError::ShuttingDown); tokio::select! { res = flush => { res.map(|()| json_response(StatusCode::OK, ()))? } _ = cancel.cancelled() => { Err(ApiError::ShuttingDown) } } } /// Try if `GetPage@Lsn` is successful, useful for manual debugging. 
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] struct GetPageResponse { pub page: Bytes, pub layers_visited: u32, pub delta_layers_visited: u32, pub records: Vec<(Lsn, NeonWalRecord)>, pub img: Option<(Lsn, Bytes)>, } async fn getpage_at_lsn_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { getpage_at_lsn_handler_inner(false, request, cancel).await } async fn touchpage_at_lsn_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { getpage_at_lsn_handler_inner(true, request, cancel).await } /// Try if `GetPage@Lsn` is successful, useful for manual debugging. async fn getpage_at_lsn_handler_inner( touch: bool, request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; // Require pageserver admin permission for this API instead of only tenant-level token. check_permission(&request, None)?; let state = get_state(&request); struct Key(pageserver_api::key::Key); impl std::str::FromStr for Key { type Err = anyhow::Error; fn from_str(s: &str) -> std::result::Result { pageserver_api::key::Key::from_hex(s).map(Key) } } let key: Key = parse_query_param(&request, "key")? 
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'key' query parameter")))?; let lsn: Option = parse_query_param(&request, "lsn")?; async { let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; let ctx = RequestContextBuilder::new(TaskKind::MgmtRequest) .download_behavior(DownloadBehavior::Download) .scope(context::Scope::new_timeline(&timeline)) .read_path_debug(true) .root(); // Use last_record_lsn if no lsn is provided let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); if touch { json_response(StatusCode::OK, ()) } else { let mut reconstruct_state = ValuesReconstructState::new_with_debug(IoConcurrency::sequential()); let page = timeline.debug_get(key.0, lsn, &ctx, &mut reconstruct_state).await?; let response = GetPageResponse { page, layers_visited: reconstruct_state.get_layers_visited(), delta_layers_visited: reconstruct_state.get_delta_layers_visited(), records: reconstruct_state.debug_state.records.clone(), img: reconstruct_state.debug_state.img.clone(), }; json_response(StatusCode::OK, response) } } .instrument(info_span!("timeline_debug_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } async fn timeline_collect_keyspace( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); let at_lsn: Option = parse_query_param(&request, "at_lsn")?; async { let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline); let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); let (dense_ks, 
sparse_ks) = timeline .collect_keyspace(at_lsn, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; // This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace. // Therefore, we split dense/sparse keys in this API. let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn }; json_response(StatusCode::OK, res) } .instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } async fn active_timeline_of_active_tenant( tenant_manager: &TenantManager, tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result, ApiError> { let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; Ok(tenant.get_timeline(timeline_id, true)?) } async fn always_panic_handler( req: Request, _cancel: CancellationToken, ) -> Result, ApiError> { // Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook(). // For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it. // Use catch_unwind to ensure that tokio nor hyper are distracted by our panic. let query = req.uri().query(); let _ = std::panic::catch_unwind(|| { panic!("unconditional panic for testing panic hook integration; request query: {query:?}") }); json_response(StatusCode::NO_CONTENT, ()) } async fn disk_usage_eviction_run( mut r: Request, cancel: CancellationToken, ) -> Result, ApiError> { check_permission(&r, None)?; #[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)] struct Config { /// How many bytes to evict before reporting that pressure is relieved. 
evict_bytes: u64, #[serde(default)] eviction_order: pageserver_api::config::EvictionOrder, } #[derive(Debug, Clone, Copy, serde::Serialize)] struct Usage { // remains unchanged after instantiation of the struct evict_bytes: u64, // updated by `add_available_bytes` freed_bytes: u64, } impl crate::disk_usage_eviction_task::Usage for Usage { fn has_pressure(&self) -> bool { self.evict_bytes > self.freed_bytes } fn add_available_bytes(&mut self, bytes: u64) { self.freed_bytes += bytes; } } let config = json_request::(&mut r).await?; let usage = Usage { evict_bytes: config.evict_bytes, freed_bytes: 0, }; let state = get_state(&r); let eviction_state = state.disk_usage_eviction_state.clone(); let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( &eviction_state, &state.remote_storage, usage, &state.tenant_manager, config.eviction_order.into(), &cancel, ) .await; info!(?res, "disk_usage_eviction_task_iteration_impl finished"); let res = res.map_err(ApiError::InternalServerError)?; json_response(StatusCode::OK, res) } async fn secondary_upload_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let state = get_state(&request); let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; state .secondary_controller .upload_tenant(tenant_shard_id) .await?; json_response(StatusCode::OK, ()) } async fn tenant_scan_remote_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { let state = get_state(&request); let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let mut response = TenantScanRemoteStorageResponse::default(); let (shards, _other_keys) = list_remote_tenant_shards(&state.remote_storage, tenant_id, cancel.clone()) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; for tenant_shard_id in shards { let (timeline_ids, _other_keys) = list_remote_timelines(&state.remote_storage, tenant_shard_id, cancel.clone()) .await 
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; let mut generation = Generation::none(); for timeline_id in timeline_ids { match download_index_part( &state.remote_storage, &tenant_shard_id, &timeline_id, Generation::MAX, &cancel, ) .instrument(info_span!("download_index_part", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id)) .await { Ok((index_part, index_generation, _index_mtime)) => { tracing::info!( "Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn() ); generation = std::cmp::max(generation, index_generation); } Err(DownloadError::NotFound) => { // This is normal for tenants that were created with multiple shards: they have an unsharded path // containing the timeline's initdb tarball but no index. Otherwise it is a bit strange. tracing::info!( "Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping" ); continue; } Err(e) => { return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); } }; } let result = download_tenant_manifest(&state.remote_storage, &tenant_shard_id, generation, &cancel) .instrument(info_span!("download_tenant_manifest", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())) .await; let stripe_size = match result { Ok((manifest, _, _)) => manifest.stripe_size, Err(DownloadError::NotFound) => None, Err(err) => return Err(ApiError::InternalServerError(anyhow!(err))), }; response.shards.push(TenantScanRemoteStorageShard { tenant_shard_id, generation: generation.into(), stripe_size, }); } if response.shards.is_empty() { return Err(ApiError::NotFound( anyhow::anyhow!("No shards found for tenant ID {tenant_id}").into(), )); } json_response(StatusCode::OK, response) } async fn secondary_download_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let 
state = get_state(&request); let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let wait = parse_query_param(&request, "wait_ms")?.map(Duration::from_millis); // We don't need this to issue the download request, but: // - it enables us to cleanly return 404 if we get a request for an absent shard // - we will use this to provide status feedback in the response let Some(secondary_tenant) = state .tenant_manager .get_secondary_tenant_shard(tenant_shard_id) else { return Err(ApiError::NotFound( anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(), )); }; let timeout = wait.unwrap_or(Duration::MAX); let result = tokio::time::timeout( timeout, state.secondary_controller.download_tenant(tenant_shard_id), ) .await; let progress = secondary_tenant.progress.lock().unwrap().clone(); let status = match result { Ok(Ok(())) => { if progress.layers_downloaded >= progress.layers_total { // Download job ran to completion StatusCode::OK } else { // Download dropped out without errors because it ran out of time budget StatusCode::ACCEPTED } } // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered // okay. We could get an error here in the unlikely edge case that the tenant // was detached between our check above and executing the download job. Ok(Err(e)) => return Err(e.into()), // A timeout is not an error: we have started the download, we're just not done // yet. The caller will get a response body indicating status. 
Err(_) => StatusCode::ACCEPTED,
    };

    json_response(status, progress)
}

/// Wait until the timelines named in the request body have reached the requested LSNs.
///
/// Timelines of the tenant that are not mentioned in the request are ignored. Responds
/// `404 Not Found` when none of the requested timelines exist on this shard, `200 OK` when
/// every wait succeeded and `202 Accepted` when at least one wait failed (e.g. timed out).
async fn wait_lsn_handler(
    mut request: Request<hyper::Body>,
    cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let wait_lsn_request: TenantWaitLsnRequest = json_request(&mut request).await?;

    let state = get_state(&request);
    let tenant = state
        .tenant_manager
        .get_attached_tenant_shard(tenant_shard_id)?;

    let mut wait_futures = Vec::default();
    for timeline in tenant.list_timelines() {
        let Some(lsn) = wait_lsn_request.timelines.get(&timeline.timeline_id) else {
            continue;
        };

        let fut = {
            let timeline = timeline.clone();
            let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
            async move {
                timeline
                    .wait_lsn(
                        *lsn,
                        WaitLsnWaiter::HttpEndpoint,
                        WaitLsnTimeout::Custom(wait_lsn_request.timeout),
                        &ctx,
                    )
                    .await
            }
        };
        wait_futures.push(fut);
    }

    if wait_futures.is_empty() {
        return json_response(StatusCode::NOT_FOUND, ());
    }

    // Wait for all timelines concurrently, but give up as soon as the request is cancelled.
    let all_done = tokio::select! {
        results = join_all(wait_futures) => {
            results.iter().all(|res| res.is_ok())
        },
        _ = cancel.cancelled() => {
            return Err(ApiError::Cancelled);
        }
    };

    let status = if all_done {
        StatusCode::OK
    } else {
        StatusCode::ACCEPTED
    };

    json_response(status, ())
}

/// Return the current download progress of a secondary tenant shard, or `404` if this
/// pageserver has no secondary location for the shard.
async fn secondary_status_handler(
    request: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    let state = get_state(&request);
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;

    let Some(secondary_tenant) = state
        .tenant_manager
        .get_secondary_tenant_shard(tenant_shard_id)
    else {
        return Err(ApiError::NotFound(
            anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(),
        ));
    };

    let progress = secondary_tenant.progress.lock().unwrap().clone();

    json_response(StatusCode::OK, progress)
}

/// Fallback handler: answers `404 Not Found` with a JSON error body.
async fn handler_404(_: Request<hyper::Body>) -> Result<Response<hyper::Body>, ApiError> {
    json_response(
        StatusCode::NOT_FOUND,
        HttpErrorBody::from_msg("page not found".to_owned()),
    )
}

/// Emit a tracing event with the level and message given in the JSON request body.
///
/// Any body that fails to parse is reported as `400 Bad Request` ("invalid JSON body").
async fn post_tracing_event_handler(
    mut r: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    #[derive(Debug, serde::Deserialize)]
    #[serde(rename_all = "lowercase")]
    enum Level {
        Error,
        Warn,
        Info,
        Debug,
        Trace,
    }
    // Shadows hyper's `Request` for the rest of this function body only.
    #[derive(Debug, serde::Deserialize)]
    struct Request {
        level: Level,
        message: String,
    }
    let body: Request = json_request(&mut r)
        .await
        .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;

    match body.level {
        Level::Error => tracing::error!(?body.message),
        Level::Warn => tracing::warn!(?body.message),
        Level::Info => tracing::info!(?body.message),
        Level::Debug => tracing::debug!(?body.message),
        Level::Trace => tracing::trace!(?body.message),
    }

    json_response(StatusCode::OK, ())
}

/// Switch the virtual file IO engine to the kind given in the JSON body.
/// Requires pageserver-wide permission.
async fn put_io_engine_handler(
    mut r: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    check_permission(&r, None)?;
    let kind: crate::virtual_file::IoEngineKind = json_request(&mut r).await?;
    crate::virtual_file::io_engine::set(kind);
    json_response(StatusCode::OK, ())
}

/// Switch the virtual file IO mode to the one given in the JSON body.
/// Requires pageserver-wide permission.
async fn put_io_mode_handler(
    mut r: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    check_permission(&r, None)?;
    let mode: IoMode = json_request(&mut r).await?;
    crate::virtual_file::set_io_mode(mode);
    json_response(StatusCode::OK, ())
}

/// Polled by control plane.
///
/// See [`crate::utilization`].
async fn get_utilization(
    r: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    fail::fail_point!("get-utilization-http-handler", |_| {
        Err(ApiError::ResourceUnavailable("failpoint".into()))
    });

    // this probably could be completely public, but lets make that change later.
    check_permission(&r, None)?;

    let state = get_state(&r);
    // The lock is held across regeneration, so concurrent pollers share one regenerated document.
    let mut g = state.latest_utilization.lock().await;

    let regenerate_every = Duration::from_secs(1);
    let still_valid = g
        .as_ref()
        .is_some_and(|(captured_at, _)| captured_at.elapsed() < regenerate_every);

    // avoid needless statvfs calls even though those should be non-blocking fast.
    // regenerate at most 1Hz to allow polling at any rate.
    if !still_valid {
        let path = state.conf.tenants_path();
        let doc =
            crate::utilization::regenerate(state.conf, path.as_std_path(), &state.tenant_manager)
                .map_err(ApiError::InternalServerError)?;

        let mut buf = Vec::new();
        serde_json::to_writer(&mut buf, &doc)
            .context("serialize")
            .map_err(ApiError::InternalServerError)?;

        let body = bytes::Bytes::from(buf);

        *g = Some((std::time::Instant::now(), body));
    }

    // hyper 0.14 doesn't yet have Response::clone so this is a bit of extra legwork
    let cached = g.as_ref().expect("just set").1.clone();

    Response::builder()
        .header(hyper::http::header::CONTENT_TYPE, "application/json")
        // thought of using http date header, but that is second precision which does not give any
        // debugging aid
        .status(StatusCode::OK)
        .body(hyper::Body::from(cached))
        .context("build response")
        .map_err(ApiError::InternalServerError)
}

/// HADRON
///
/// Report the visible size of every attached tenant shard on this pageserver.
async fn list_tenant_visible_size_handler(
    request: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    check_permission(&request, None)?;
    let state = get_state(&request);
let mut map = BTreeMap::new();
    for (tenant_shard_id, slot) in state.tenant_manager.list() {
        match slot {
            TenantSlot::Attached(tenant) => {
                let visible_size = tenant.get_visible_size();
                map.insert(tenant_shard_id, visible_size);
            }
            // Only attached shards are reported.
            TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => {
                continue;
            }
        }
    }

    json_response(StatusCode::OK, map)
}

/// List the aux files of a timeline at the LSN given in the JSON request body.
async fn list_aux_files(
    mut request: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    // Authorize before reading and parsing the request body.
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let body: ListAuxFilesRequest = json_request(&mut request).await?;

    let state = get_state(&request);

    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;

    let io_concurrency = IoConcurrency::spawn_from_conf(
        state.conf.get_vectored_concurrent_io,
        // Entering the gate fails once the timeline is shutting down.
        timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
    );

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
        .with_scope_timeline(&timeline);
    let files = timeline
        .list_aux_files(body.lsn, &ctx, io_concurrency)
        .await?;
    json_response(StatusCode::OK, files)
}

/// Return the result of `Timeline::perf_info()` for an active timeline.
async fn perf_info(
    request: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let state = get_state(&request);

    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;

    let result = timeline.perf_info().await;

    json_response(StatusCode::OK, result)
}

/// Write the aux files from the JSON request body into a timeline, in a single modification
/// placed 8 bytes past the timeline's current last record LSN.
async fn ingest_aux_files(
    mut request: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    // Authorize before reading and parsing the request body.
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let body: IngestAuxFilesRequest = json_request(&mut request).await?;

    let state = get_state(&request);

    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;

    let mut modification = timeline.begin_modification(
        Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */
    );
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
    for (fname, content) in body.aux_files {
        modification
            .put_file(&fname, content.as_bytes(), &ctx)
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;
    }
    modification
        .commit(&ctx)
        .await
        .map_err(ApiError::InternalServerError)?;

    json_response(StatusCode::OK, ())
}

/// Report on the largest tenants on this pageserver, for the storage controller to identify
/// candidates for splitting
async fn post_top_tenants(
    mut r: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    check_permission(&r, None)?;
    let request: TopTenantShardsRequest = json_request(&mut r).await?;
    let state = get_state(&r);

    // Pick the size that the request asked us to order by.
    fn get_size_metric(sizes: &TopTenantShardItem, order_by: &TenantSorting) -> u64 {
        match order_by {
            TenantSorting::ResidentSize => sizes.resident_size,
            TenantSorting::MaxLogicalSize => sizes.max_logical_size,
            TenantSorting::MaxLogicalSizePerShard => sizes.max_logical_size_per_shard,
        }
    }

    #[derive(Eq, PartialEq)]
    struct HeapItem {
        metric: u64,
        sizes: TopTenantShardItem,
    }

    impl PartialOrd for HeapItem {
        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
            Some(self.cmp(other))
        }
    }

    /// Heap items have reverse ordering on their metric: this enables using BinaryHeap, which
    /// supports popping the greatest item but not the smallest.
    impl Ord for HeapItem {
        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
            Reverse(self.metric).cmp(&Reverse(other.metric))
        }
    }

    let mut top_n: BinaryHeap<HeapItem> = BinaryHeap::with_capacity(request.limit);

    // FIXME: this is a lot of clones to take this tenant list
    for (tenant_shard_id, tenant_slot) in state.tenant_manager.list() {
        if let Some(shards_lt) = request.where_shards_lt {
            // Ignore tenants which already have >= this many shards
            if tenant_shard_id.shard_count >= shards_lt {
                continue;
            }
        }

        let sizes = match tenant_slot {
            TenantSlot::Attached(tenant) => tenant.get_sizes(),
            TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => {
                continue;
            }
        };
        let metric = get_size_metric(&sizes, &request.order_by);

        if let Some(gt) = request.where_gt {
            // Ignore tenants whose metric is <= the lower size threshold, to do less sorting work
            if metric <= gt {
                continue;
            }
        };

        // Because of the reversed ordering, `peek()` yields the item with the smallest metric.
        match top_n.peek() {
            None => {
                // Top N list is empty: candidate becomes first member
                top_n.push(HeapItem { metric, sizes });
            }
            Some(i) if i.metric > metric && top_n.len() < request.limit => {
                // Lowest item in list is greater than our candidate, but we aren't at limit yet: push to end
                top_n.push(HeapItem { metric, sizes });
            }
            Some(i) if i.metric > metric => {
                // List is at limit and lowest value is greater than our candidate, drop it.
}
            Some(_) => top_n.push(HeapItem { metric, sizes }),
        }

        // Trim back to the requested size: popping removes the smallest metric first.
        while top_n.len() > request.limit {
            top_n.pop();
        }
    }

    json_response(
        StatusCode::OK,
        TopTenantShardsResponse {
            shards: top_n.into_iter().map(|i| i.sizes).collect(),
        },
    )
}

/// Create an empty timeline at `base_lsn` and fill it from a basebackup tarball sent as the
/// request body.
///
/// Required query parameters: `base_lsn`, `end_lsn`, `pg_version`. The tenant is addressed as
/// unsharded.
async fn put_tenant_timeline_import_basebackup(
    request: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?;
    let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
    let pg_version: PgMajorVersion = must_parse_query_param(&request, "pg_version")?;

    check_permission(&request, Some(tenant_id))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

    let tenant_shard_id = TenantShardId::unsharded(tenant_id);

    let span = info_span!("import_basebackup",
        tenant_id=%tenant_id, timeline_id=%timeline_id, shard_id=%tenant_shard_id.shard_slug(),
        base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version);
    async move {
        let state = get_state(&request);
        let tenant = state
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id)?;

        let broker_client = state.broker_client.clone();

        // Adapt the hyper body stream into an `AsyncRead` for the tar reader.
        let mut body = StreamReader::new(
            request
                .into_body()
                .map(|res| res.map_err(|error| std::io::Error::other(anyhow::anyhow!(error)))),
        );

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

        let (timeline, timeline_ctx) = tenant
            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
            .map_err(ApiError::InternalServerError)
            .await?;

        // TODO mark timeline as not ready until it reaches end_lsn.
        // We might have some wal to import as well, and we should prevent compute
        // from connecting before that and writing conflicting wal.
        //
        // This is not relevant for pageserver->pageserver migrations, since there's
        // no wal to import. But should be fixed if we want to import from postgres.

        // TODO leave clean state on error. For now you can use detach to clean
        // up broken state from a failed import.

        // Import basebackup provided via CopyData
        info!("importing basebackup");

        timeline
            .import_basebackup_from_tar(
                tenant.clone(),
                &mut body,
                base_lsn,
                broker_client,
                &timeline_ctx,
            )
            .await
            .map_err(ApiError::InternalServerError)?;

        // Read the end of the tar archive.
        read_tar_eof(body)
            .await
            .map_err(ApiError::InternalServerError)?;

        // TODO check checksum
        // Meanwhile you can verify client-side by taking fullbackup
        // and checking that it matches in size with what was imported.
        // It wouldn't work if base came from vanilla postgres though,
        // since we discard some log files.

        info!("done");
        json_response(StatusCode::OK, ())
    }
    .instrument(span)
    .await
}

/// Import WAL from a tarball sent as the request body into an existing timeline.
///
/// Required query parameters: `start_lsn` (must equal the timeline's current last record LSN)
/// and `end_lsn` (the timeline must have reached at least this LSN once the import is done).
/// The imported data is flushed to disk before responding.
async fn put_tenant_timeline_import_wal(
    request: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let start_lsn: Lsn = must_parse_query_param(&request, "start_lsn")?;
    let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;

    check_permission(&request, Some(tenant_id))?;

    let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn);
    async move {
        let state = get_state(&request);

        let timeline = active_timeline_of_active_tenant(
            &state.tenant_manager,
            TenantShardId::unsharded(tenant_id),
            timeline_id,
        )
        .await?;
        let ctx = RequestContextBuilder::new(TaskKind::MgmtRequest)
            .download_behavior(DownloadBehavior::Warn)
            .scope(context::Scope::new_timeline(&timeline))
            .root();

        let mut body = StreamReader::new(
            request
                .into_body()
                .map(|res| res.map_err(|error| std::io::Error::other(anyhow::anyhow!(error)))),
        );

        let last_record_lsn = timeline.get_last_record_lsn();
        if last_record_lsn != start_lsn {
            return Err(ApiError::InternalServerError(anyhow::anyhow!(
                "Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"
            )));
        }

        // TODO leave clean state on error. For now you can use detach to clean
        // up broken state from a failed import.

        // Import wal provided via CopyData
        info!("importing wal");
        crate::import_datadir::import_wal_from_tar(&timeline, &mut body, start_lsn, end_lsn, &ctx)
            .await
            .map_err(ApiError::InternalServerError)?;
        info!("wal import complete");

        // Read the end of the tar archive.
        read_tar_eof(body)
            .await
            .map_err(ApiError::InternalServerError)?;

        // TODO Does it make sense to overshoot?
        // Report the LSN the timeline actually reached: the start-LSN message used before was a
        // copy-paste of the precondition check above and described the wrong failure.
        let reached_lsn = timeline.get_last_record_lsn();
        if reached_lsn < end_lsn {
            return Err(ApiError::InternalServerError(anyhow::anyhow!(
                "WAL import from Lsn {start_lsn} ended at Lsn {reached_lsn}, before the requested end Lsn {end_lsn}"
            )));
        }

        // Flush data to disk, then upload to s3. No need for a forced checkpoint.
        // We only want to persist the data, and it doesn't matter if it's in the
        // shape of deltas or images.
        info!("flushing layers");
        timeline.freeze_and_flush().await.map_err(|e| match e {
            tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
            other => ApiError::InternalServerError(anyhow::anyhow!(other)),
        })?;

        info!("done");

        json_response(StatusCode::OK, ())
    }
    .instrument(span)
    .await
}

/// Activate a timeline after its import has completed
///
/// The endpoint is idempotent and callers are expected to retry all
/// errors until a successful response.
async fn activate_post_import_handler(
    request: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    const DEFAULT_ACTIVATE_TIMEOUT: Duration = Duration::from_secs(1);
    let activate_timeout = parse_query_param(&request, "timeline_activate_timeout_ms")?
.map(Duration::from_millis) .unwrap_or(DEFAULT_ACTIVATE_TIMEOUT); let span = info_span!( "activate_post_import_handler", tenant_id=%tenant_shard_id.tenant_id, timeline_id=%timeline_id, shard_id=%tenant_shard_id.shard_slug() ); async move { let state = get_state(&request); let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; tenant.finalize_importing_timeline(timeline_id).await?; match tenant.get_timeline(timeline_id, false) { Ok(_timeline) => { // Timeline is already visible. Reset not required: fall through. } Err(GetTimelineError::NotFound { .. }) => { // This is crude: we reset the whole tenant such that the new timeline is detected // and activated. We can come up with something more granular in the future. // // Note that we only reset the tenant if required: when the timeline is // not present in [`Tenant::timelines`]. let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); state .tenant_manager .reset_tenant(tenant_shard_id, false, &ctx) .await .map_err(ApiError::InternalServerError)?; } Err(GetTimelineError::ShuttingDown) => { return Err(ApiError::ShuttingDown); } Err(GetTimelineError::NotActive { .. 
}) => { unreachable!("Called get_timeline with active_only=false"); } } let timeline = tenant.get_timeline(timeline_id, false)?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn) .with_scope_timeline(&timeline); let result = tokio::time::timeout(activate_timeout, timeline.wait_to_become_active(&ctx)).await; match result { Ok(Ok(())) => { // fallthrough } // Timeline reached some other state that's not active // TODO(vlad): if the tenant is broken, return a permananet error Ok(Err(_timeline_state)) => { return Err(ApiError::InternalServerError(anyhow::anyhow!( "Timeline activation failed" ))); } // Activation timed out Err(_) => { return Err(ApiError::Timeout("Timeline activation timed out".into())); } } let timeline_info = build_timeline_info( &timeline, false, // include_non_incremental_logical_size, false, // force_await_initial_logical_size false, // include_image_consistent_lsn &ctx, ) .await .context("get local timeline info") .map_err(ApiError::InternalServerError)?; json_response(StatusCode::OK, timeline_info) } .instrument(span) .await } // [Hadron] Reset gauge metrics that are used to raised alerts. We need this API as a stop-gap measure to reset alerts // after we manually rectify situations such as local SSD data loss. We will eventually automate this. async fn hadron_reset_alert_gauges( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { check_permission(&request, None)?; LOCAL_DATA_LOSS_SUSPECTED.set(0); json_response(StatusCode::OK, ()) } /// Read the end of a tar archive. /// /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. /// `tokio_tar` already read the first such block. Read the second all-zeros block, /// and check that there is no more data after the EOF marker. /// /// 'tar' command can also write extra blocks of zeros, up to a record /// size, controlled by the --record-size argument. Ignore them too. 
async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
    use tokio::io::AsyncReadExt;
    let mut buf = [0u8; 512];

    // Read the all-zeros block, and verify it
    let mut total_bytes = 0;
    while total_bytes < 512 {
        let nbytes = reader.read(&mut buf[total_bytes..]).await?;
        total_bytes += nbytes;
        if nbytes == 0 {
            break;
        }
    }
    if total_bytes < 512 {
        anyhow::bail!("incomplete or invalid tar EOF marker");
    }
    if !buf.iter().all(|&x| x == 0) {
        anyhow::bail!("invalid tar EOF marker");
    }

    // Drain any extra zero-blocks after the EOF marker
    let mut trailing_bytes = 0;
    let mut seen_nonzero_bytes = false;
    loop {
        let nbytes = reader.read(&mut buf).await?;
        if nbytes == 0 {
            break;
        }
        trailing_bytes += nbytes;
        // Only the first `nbytes` bytes were written by this read; the rest of the buffer
        // still holds data from earlier reads that has already been checked.
        if !buf[..nbytes].iter().all(|&x| x == 0) {
            seen_nonzero_bytes = true;
        }
    }
    if seen_nonzero_bytes {
        anyhow::bail!("unexpected non-zero bytes after the tar archive");
    }
    if trailing_bytes % 512 != 0 {
        anyhow::bail!(
            "unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"
        );
    }
    Ok(())
}

/// Ask the tenant's feature resolver to refresh its properties and flags.
async fn force_refresh_feature_flag(
    request: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let state = get_state(&request);
    let tenant = state
        .tenant_manager
        .get_attached_tenant_shard(tenant_shard_id)?;
    tenant
        .feature_resolver
        .refresh_properties_and_flags(&tenant);
    json_response(StatusCode::OK, ())
}

/// Evaluate a feature flag for a tenant and return the result together with the properties
/// collected from the tenant's feature resolver.
///
/// The optional `as` query parameter forces the evaluation type (`boolean` or
/// `multivariate`); any other value, or none, lets the resolver decide whether the flag is
/// boolean. Evaluation errors are returned inside the `200 OK` body, not as HTTP errors.
async fn tenant_evaluate_feature_flag(
    request: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let flag: String = parse_request_param(&request, "flag_key")?;
    let as_type: Option<String> = parse_query_param(&request, "as")?;

    let state = get_state(&request);

    async {
        let tenant = state
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id)?;
        // TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s)
        // and we don't need to worry about it for now.
        let properties = tenant.feature_resolver.collect_properties();

        let is_boolean = match as_type.as_deref() {
            Some("boolean") => true,
            Some("multivariate") => false,
            // Auto infer the type of the feature flag.
            _ => tenant
                .feature_resolver
                .is_feature_flag_boolean(&flag)
                .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!("{e}")))?,
        };

        if is_boolean {
            let result = tenant.feature_resolver.evaluate_boolean(&flag);
            let result = result.map(|_| true).map_err(|e| e.to_string());
            json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
        } else {
            let result = tenant
                .feature_resolver
                .evaluate_multivariate(&flag)
                .map_err(|e| e.to_string());
            json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
        }
    }
    .instrument(info_span!("tenant_evaluate_feature_flag", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
    .await
}

/// Testing aid: force the flag `flag_key` to the value of the required `value` query parameter.
async fn force_override_feature_flag_for_testing_put(
    request: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    check_permission(&request, None)?;

    let flag: String = parse_request_param(&request, "flag_key")?;
    let value: String = must_parse_query_param(&request, "value")?;
    let state = get_state(&request);
    state
        .feature_resolver
        .force_override_for_testing(&flag, Some(&value));
    json_response(StatusCode::OK, ())
}

/// Testing aid: remove a forced override for the flag `flag_key`.
async fn force_override_feature_flag_for_testing_delete(
    request: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    check_permission(&request, None)?;

    let flag: String = parse_request_param(&request, "flag_key")?;
    let state = get_state(&request);
    state
        .feature_resolver
        .force_override_for_testing(&flag, None);
    json_response(StatusCode::OK, ())
}

/// Replace the feature flag spec of the pageserver-wide feature resolver with the JSON body.
async fn update_feature_flag_spec(
    mut request: Request<hyper::Body>,
    _cancel: CancellationToken,
) -> Result<Response<hyper::Body>, ApiError> {
    check_permission(&request, None)?;
    let body = json_request(&mut request).await?;
    let state = get_state(&request);
    state
        .feature_resolver
        .update(body)
        .map_err(ApiError::InternalServerError)?;
    json_response(StatusCode::OK, ())
}

/// Common functionality of all the HTTP API handlers.
///
/// - Adds a tracing span to each request (by `request_span`)
/// - Logs the request depending on the request method (by `request_span`)
/// - Logs the response if it was not successful (by `request_span`)
/// - Shields the handler function from async cancellations. Hyper can drop the handler
///   Future if the connection to the client is lost, but most of the pageserver code is
///   not async cancellation safe. This converts the dropped future into a graceful cancellation
///   request with a CancellationToken.
async fn api_handler<R, H>(
    request: Request<hyper::Body>,
    handler: H,
) -> Result<Response<hyper::Body>, ApiError>
where
    R: std::future::Future<Output = Result<Response<hyper::Body>, ApiError>> + Send + 'static,
    H: FnOnce(Request<hyper::Body>, CancellationToken) -> R + Send + Sync + 'static,
{
    // The failpoint management endpoint itself must stay reachable, otherwise an armed
    // "api-503"/"api-500" failpoint could never be disabled again.
    if request.uri() != &"/v1/failpoints".parse::<hyper::Uri>().unwrap() {
        fail::fail_point!("api-503", |_| Err(ApiError::ResourceUnavailable(
            "failpoint".into()
        )));

        fail::fail_point!("api-500", |_| Err(ApiError::InternalServerError(
            anyhow::anyhow!("failpoint")
        )));
    }

    // Spawn a new task to handle the request, to protect the handler from unexpected
    // async cancellations. Most pageserver functions are not async cancellation safe.
    // We arm a drop-guard, so that if Hyper drops the Future, we signal the task
    // with the cancellation token.
let token = CancellationToken::new(); let cancel_guard = token.clone().drop_guard(); let result = request_span(request, move |r| async { let handle = tokio::spawn( async { let token_cloned = token.clone(); let result = handler(r, token).await; if token_cloned.is_cancelled() { // dropguard has executed: we will never turn this result into response. // // at least temporarily do {:?} logging; these failures are rare enough but // could hide difficult errors. match &result { Ok(response) => { let status = response.status(); info!(%status, "Cancelled request finished successfully") } Err(e) => match e { ApiError::ShuttingDown | ApiError::ResourceUnavailable(_) => { // Don't log this at error severity: they are normal during lifecycle of tenants/process info!("Cancelled request aborted for shutdown") } _ => { // Log these in a highly visible way, because we have no client to send the response to, but // would like to know that something went wrong. error!("Cancelled request finished with an error: {e:?}") } }, } } // only logging for cancelled panicked request handlers is the tracing_panic_hook, // which should suffice. // // there is still a chance to lose the result due to race between // returning from here and the actual connection closing happening // before outer task gets to execute. leaving that up for #5815. result } .in_current_span(), ); match handle.await { // TODO: never actually return Err from here, always Ok(...) so that we can log // spanned errors. Call api_error_handler instead and return appropriate Body. Ok(result) => result, Err(e) => { // The handler task panicked. We have a global panic handler that logs the // panic with its backtrace, so no need to log that here. Only log a brief // message to make it clear that we returned the error to the client. error!("HTTP request handler task panicked: {e:#}"); // Don't return an Error here, because then fallback error handler that was // installed in make_router() will print the error. 
Instead, construct the // HTTP error response and return that. Ok( ApiError::InternalServerError(anyhow!("HTTP request handler task panicked")) .into_response(), ) } } }) .await; cancel_guard.disarm(); result } /// Like api_handler, but returns an error response if the server is built without /// the 'testing' feature. async fn testing_api_handler( desc: &str, request: Request, handler: H, ) -> Result, ApiError> where R: std::future::Future, ApiError>> + Send + 'static, H: FnOnce(Request, CancellationToken) -> R + Send + Sync + 'static, { if cfg!(feature = "testing") { api_handler(request, handler).await } else { std::future::ready(Err(ApiError::BadRequest(anyhow!( "Cannot {desc} because pageserver was compiled without testing APIs", )))) .await } } pub fn make_router( state: Arc, launch_ts: &'static LaunchTimestamp, auth: Option>, ) -> anyhow::Result> { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); if auth.is_some() { router = router.middleware(auth_middleware(|request| { let state = get_state(request); if state.allowlist_routes.contains(&request.uri().path()) { None } else { state.auth.as_deref() } })) } router = router.middleware( endpoint::add_response_header_middleware( "PAGESERVER_LAUNCH_TIMESTAMP", &launch_ts.to_string(), ) .expect("construct launch timestamp header middleware"), ); let force_metric_collection_on_scrape = state.conf.force_metric_collection_on_scrape; let prometheus_metrics_handler_wrapper = move |req| prometheus_metrics_handler(req, force_metric_collection_on_scrape); Ok(router .data(state) .get("/metrics", move |r| request_span(r, prometheus_metrics_handler_wrapper)) .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) .get("/profile/heap", |r| request_span(r, profile_heap_handler)) .get("/v1/status", |r| api_handler(r, status_handler)) .put("/v1/failpoints", |r| { testing_api_handler("manage failpoints", r, failpoints_handler) }) 
.post("/v1/reload_auth_validation_keys", |r| { api_handler(r, reload_auth_validation_keys_handler) }) .get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) .get("/v1/tenant/:tenant_shard_id", |r| { api_handler(r, tenant_status) }) .delete("/v1/tenant/:tenant_shard_id", |r| { api_handler(r, tenant_delete_handler) }) .get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| { api_handler(r, tenant_size_handler) }) .patch("/v1/tenant/config", |r| { api_handler(r, patch_tenant_config_handler) }) .put("/v1/tenant/config", |r| { api_handler(r, update_tenant_config_handler) }) .put("/v1/tenant/:tenant_shard_id/shard_split", |r| { api_handler(r, tenant_shard_split_handler) }) .get("/v1/tenant/:tenant_shard_id/config", |r| { api_handler(r, get_tenant_config_handler) }) .put("/v1/tenant/:tenant_shard_id/location_config", |r| { api_handler(r, put_tenant_location_config_handler) }) .get("/v1/location_config", |r| { api_handler(r, list_location_config_handler) }) .get("/v1/location_config/:tenant_shard_id", |r| { api_handler(r, get_location_config_handler) }) .put( "/v1/tenant/:tenant_shard_id/time_travel_remote_storage", |r| api_handler(r, tenant_time_travel_remote_storage_handler), ) .get("/v1/tenant/:tenant_shard_id/timeline", |r| { api_handler(r, timeline_list_handler) }) .get("/v1/tenant/:tenant_shard_id/timeline_and_offloaded", |r| { api_handler(r, timeline_and_offloaded_list_handler) }) .post("/v1/tenant/:tenant_shard_id/timeline", |r| { api_handler(r, timeline_create_handler) }) .post("/v1/tenant/:tenant_shard_id/reset", |r| { api_handler(r, tenant_reset_handler) }) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive", |r| api_handler(r, timeline_preserve_initdb_handler), ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config", |r| api_handler(r, timeline_archival_config_handler), ) .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_detail_handler) }) .get( 
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_lsn_by_timestamp", |r| api_handler(r, get_lsn_by_timestamp_handler), ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn", |r| api_handler(r, get_timestamp_of_lsn_handler), ) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/patch_index_part", |r| api_handler(r, timeline_patch_index_part_handler), ) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease", |r| api_handler(r, lsn_lease_handler), ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc", |r| api_handler(r, timeline_gc_handler), ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", |r| api_handler(r, timeline_compact_info_handler), ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", |r| api_handler(r, timeline_compact_handler), ) .delete( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", |r| api_handler(r, timeline_cancel_compact_handler), ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload", |r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler), ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/mark_invisible", |r| api_handler( r, timeline_mark_invisible_handler), ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint", |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler), ) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers", |r| api_handler(r, timeline_download_remote_layers_handler_post), ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers", |r| api_handler(r, timeline_download_remote_layers_handler_get), ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/detach_ancestor", |r| api_handler(r, timeline_detach_ancestor_handler), ) .delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_delete_handler) }) .get( 
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer", |r| api_handler(r, layer_map_info_handler), ) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", |r| api_handler(r, timeline_download_heatmap_layers_handler), ) .delete( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", |r| api_handler(r, timeline_shutdown_download_heatmap_layers_handler), ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, layer_download_handler), ) .delete( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, evict_timeline_layer_handler), ) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_name/scan_disposable_keys", |r| testing_api_handler("timeline_layer_scan_disposable_keys", r, timeline_layer_scan_disposable_keys), ) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc", |r| api_handler(r, timeline_gc_blocking_handler), ) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc", |r| api_handler(r, timeline_gc_unblocking_handler), ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/page_trace", |r| api_handler(r, timeline_page_trace_handler), ) .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { api_handler(r, secondary_upload_handler) }) .get("/v1/tenant/:tenant_id/scan_remote_storage", |r| { api_handler(r, tenant_scan_remote_handler) }) .put("/v1/disk_usage_eviction/run", |r| { api_handler(r, disk_usage_eviction_run) }) .put("/v1/deletion_queue/flush", |r| { api_handler(r, deletion_queue_flush) }) .get("/v1/tenant/:tenant_shard_id/secondary/status", |r| { api_handler(r, secondary_status_handler) }) .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| { api_handler(r, secondary_download_handler) }) .post("/v1/tenant/:tenant_shard_id/wait_lsn", |r| { api_handler(r, wait_lsn_handler) }) .put("/v1/tenant/:tenant_shard_id/break", |r| { testing_api_handler("set tenant 
state to broken", r, handle_tenant_break) }) .get("/v1/panic", |r| api_handler(r, always_panic_handler)) .post("/v1/tracing/event", |r| { testing_api_handler("emit a tracing event", r, post_tracing_event_handler) }) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage", |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler), ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage", |r| api_handler(r, touchpage_at_lsn_handler), ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace", |r| api_handler(r, timeline_collect_keyspace), ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler)) .get("/v1/utilization", |r| api_handler(r, get_utilization)) .get("/v1/list_tenant_visible_size", |r| api_handler(r, list_tenant_visible_size_handler)) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files", |r| testing_api_handler("ingest_aux_files", r, ingest_aux_files), ) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/list_aux_files", |r| testing_api_handler("list_aux_files", r, list_aux_files), ) .post("/v1/top_tenants", |r| api_handler(r, post_top_tenants)) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info", |r| testing_api_handler("perf_info", r, perf_info), ) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/import_basebackup", |r| api_handler(r, put_tenant_timeline_import_basebackup), ) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal", |r| api_handler(r, put_tenant_timeline_import_wal), ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/activate_post_import", |r| api_handler(r, activate_post_import_handler), ) .get("/v1/tenant/:tenant_shard_id/feature_flag/:flag_key", |r| { api_handler(r, tenant_evaluate_feature_flag) }) .post("/v1/tenant/:tenant_shard_id/force_refresh_feature_flag", |r| { api_handler(r, force_refresh_feature_flag) }) .put("/v1/feature_flag/:flag_key", |r| { 
testing_api_handler("force override feature flag - put", r, force_override_feature_flag_for_testing_put) }) .delete("/v1/feature_flag/:flag_key", |r| { testing_api_handler("force override feature flag - delete", r, force_override_feature_flag_for_testing_delete) }) .post("/v1/feature_flag_spec", |r| { api_handler(r, update_feature_flag_spec) }) .post("/hadron-internal/reset_alert_gauges", |r| { api_handler(r, hadron_reset_alert_gauges) }) .any(handler_404)) } ================================================ FILE: pageserver/src/import_datadir.rs ================================================ //! //! Import data and WAL from a PostgreSQL data directory and WAL segments into //! a neon Timeline. //! use std::path::{Path, PathBuf}; use anyhow::{Context, Result, bail, ensure}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::relfile_utils::*; use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::{ BLCKSZ, ControlFileData, DBState_DB_SHUTDOWNED, Oid, WAL_SEGMENT_SIZE, XLogFileName, pg_constants, }; use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_tar::Archive; use tracing::*; use utils::lsn::Lsn; use wal_decoder::models::InterpretedWalRecord; use walkdir::WalkDir; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; use crate::walingest::{WalIngest, WalIngestErrorKind}; // Returns checkpoint LSN from controlfile pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result { // Read control file to extract the LSN let controlfile_path = path.join("global").join("pg_control"); let controlfile_buf = std::fs::read(&controlfile_path) .with_context(|| format!("reading controlfile: {controlfile_path}"))?; let controlfile = ControlFileData::decode(&controlfile_buf)?; let lsn = controlfile.checkPoint; Ok(Lsn(lsn)) } /// /// Import all relation data pages from local 
disk into the repository. /// /// This is currently only used to import a cluster freshly created by initdb. /// The code that deals with the checkpoint would not work right if the /// cluster was not shut down cleanly. pub async fn import_timeline_from_postgres_datadir( tline: &Timeline, pgdata_path: &Utf8Path, pgdata_lsn: Lsn, ctx: &RequestContext, ) -> Result<()> { let mut pg_control: Option = None; // TODO this shoud be start_lsn, which is not necessarily equal to end_lsn (aka lsn) // Then fishing out pg_control would be unnecessary let mut modification = tline.begin_modification_for_import(pgdata_lsn); modification.init_empty()?; // Import all but pg_wal let all_but_wal = WalkDir::new(pgdata_path) .into_iter() .filter_entry(|entry| !entry.path().ends_with("pg_wal")); for entry in all_but_wal { let entry = entry?; let metadata = entry.metadata().expect("error getting dir entry metadata"); if metadata.is_file() { let absolute_path = entry.path(); let relative_path = absolute_path.strip_prefix(pgdata_path)?; let mut file = tokio::fs::File::open(absolute_path).await?; let len = metadata.len() as usize; if let Some(control_file) = import_file(&mut modification, relative_path, &mut file, len, ctx).await? { pg_control = Some(control_file); } modification.flush(ctx).await?; } } // We're done importing all the data files. modification.commit(ctx).await?; // We expect the Postgres server to be shut down cleanly. let pg_control = pg_control.context("pg_control file not found")?; ensure!( pg_control.state == DBState_DB_SHUTDOWNED, "Postgres cluster was not shut down cleanly" ); ensure!( pg_control.checkPointCopy.redo == pgdata_lsn.0, "unexpected checkpoint REDO pointer" ); // Import WAL. This is needed even when starting from a shutdown checkpoint, because // this reads the checkpoint record itself, advancing the tip of the timeline to // *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'. 
import_wal( &pgdata_path.join("pg_wal"), tline, Lsn(pg_control.checkPointCopy.redo), pgdata_lsn, ctx, ) .await?; Ok(()) } // subroutine of import_timeline_from_postgres_datadir(), to load one relation file. async fn import_rel( modification: &mut DatadirModification<'_>, path: &Path, spcoid: Oid, dboid: Oid, reader: &mut (impl AsyncRead + Unpin), len: usize, ctx: &RequestContext, ) -> anyhow::Result<()> { // Does it look like a relation file? trace!("importing rel file {}", path.display()); let filename = &path .file_name() .expect("missing rel filename") .to_string_lossy(); let (relnode, forknum, segno) = parse_relfilename(filename).map_err(|e| { warn!("unrecognized file in postgres datadir: {:?} ({})", path, e); e })?; let mut buf: [u8; 8192] = [0u8; 8192]; ensure!(len % BLCKSZ as usize == 0); let nblocks = len / BLCKSZ as usize; let rel = RelTag { spcnode: spcoid, dbnode: dboid, relnode, forknum, }; let mut blknum: u32 = segno * (1024 * 1024 * 1024 / BLCKSZ as u32); // Call put_rel_creation for every segment of the relation, // because there is no guarantee about the order in which we are processing segments. // ignore "relation already exists" error // // FIXME: Keep track of which relations we've already created? // https://github.com/neondatabase/neon/issues/3309 if let Err(e) = modification .put_rel_creation(rel, nblocks as u32, ctx) .await { match e.kind { WalIngestErrorKind::RelationAlreadyExists(rel) => { debug!("Relation {rel} already exists. We must be extending it.") } _ => return Err(e.into()), } } loop { let r = reader.read_exact(&mut buf).await; match r { Ok(_) => { let key = rel_block_to_key(rel, blknum); if modification.tline.get_shard_identity().is_key_local(&key) { modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; } } // TODO: UnexpectedEof is expected Err(err) => match err.kind() { std::io::ErrorKind::UnexpectedEof => { // reached EOF. That's expected. 
let relative_blknum = blknum - segno * (1024 * 1024 * 1024 / BLCKSZ as u32); ensure!(relative_blknum == nblocks as u32, "unexpected EOF"); break; } _ => { bail!("error reading file {}: {:#}", path.display(), err); } }, }; blknum += 1; } // Update relation size // // If we process rel segments out of order, // put_rel_extend will skip the update. modification.put_rel_extend(rel, blknum, ctx).await?; Ok(()) } /// Import an SLRU segment file /// async fn import_slru( modification: &mut DatadirModification<'_>, slru: SlruKind, path: &Path, reader: &mut (impl AsyncRead + Unpin), len: usize, ctx: &RequestContext, ) -> anyhow::Result<()> { info!("importing slru file {path:?}"); let mut buf: [u8; 8192] = [0u8; 8192]; let filename = &path .file_name() .with_context(|| format!("missing slru filename for path {path:?}"))? .to_string_lossy(); let segno = u32::from_str_radix(filename, 16)?; ensure!(len % BLCKSZ as usize == 0); // we assume SLRU block size is the same as BLCKSZ let nblocks = len / BLCKSZ as usize; ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize); modification .put_slru_segment_creation(slru, segno, nblocks as u32, ctx) .await?; let mut rpageno = 0; loop { let r = reader.read_exact(&mut buf).await; match r { Ok(_) => { modification.put_slru_page_image( slru, segno, rpageno, Bytes::copy_from_slice(&buf), )?; } // TODO: UnexpectedEof is expected Err(err) => match err.kind() { std::io::ErrorKind::UnexpectedEof => { // reached EOF. That's expected. ensure!(rpageno == nblocks as u32, "unexpected EOF"); break; } _ => { bail!("error reading file {}: {:#}", path.display(), err); } }, }; rpageno += 1; } Ok(()) } /// Scan PostgreSQL WAL files in given directory and load all records between /// 'startpoint' and 'endpoint' into the repository. 
async fn import_wal( walpath: &Utf8Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn, ctx: &RequestContext, ) -> anyhow::Result<()> { let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version); let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE); let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = startpoint; let mut walingest = WalIngest::new(tline, startpoint, ctx).await?; let shard = vec![*tline.get_shard_identity()]; while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now let filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE); let mut buf = Vec::new(); // Read local file let mut path = walpath.join(&filename); // It could be as .partial if !PathBuf::from(&path).exists() { path = walpath.join(filename + ".partial"); } // Slurp the WAL file let mut file = std::fs::File::open(&path)?; if offset > 0 { use std::io::Seek; file.seek(std::io::SeekFrom::Start(offset as u64))?; } use std::io::Read; let nread = file.read_to_end(&mut buf)?; if nread != WAL_SEGMENT_SIZE - offset { // Maybe allow this for .partial files? error!("read only {} bytes from WAL file", nread); } waldecoder.feed_bytes(&buf); let mut nrecords = 0; let mut modification = tline.begin_modification_for_import(last_lsn); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, &shard, lsn, tline.pg_version, )? 
.remove(tline.get_shard_identity()) .unwrap(); walingest .ingest_record(interpreted, &mut modification, ctx) .await?; WAL_INGEST.records_committed.inc(); modification.commit(ctx).await?; last_lsn = lsn; nrecords += 1; trace!("imported record at {} (end {})", lsn, endpoint); } } debug!("imported {} records up to {}", nrecords, last_lsn); segno += 1; offset = 0; } if last_lsn != startpoint { info!("reached end of WAL at {}", last_lsn); } else { info!("no WAL to import at {}", last_lsn); } Ok(()) } pub async fn import_basebackup_from_tar( tline: &Timeline, reader: &mut (impl AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, ctx: &RequestContext, ) -> Result<()> { info!("importing base at {base_lsn}"); let mut modification = tline.begin_modification_for_import(base_lsn); modification.init_empty()?; let mut pg_control: Option = None; // Import base let mut entries = Archive::new(reader).entries()?; while let Some(base_tar_entry) = entries.next().await { let mut entry = base_tar_entry?; let header = entry.header(); let len = header.entry_size()? as usize; let file_path = header.path()?.into_owned(); match header.entry_type() { tokio_tar::EntryType::Regular => { if let Some(res) = import_file(&mut modification, file_path.as_ref(), &mut entry, len, ctx).await? { // We found the pg_control file. 
pg_control = Some(res); } modification.flush(ctx).await?; } tokio_tar::EntryType::Directory => { debug!("directory {:?}", file_path); } _ => { bail!( "entry {} in backup tar archive is of unexpected type: {:?}", file_path.display(), header.entry_type() ); } } } // sanity check: ensure that pg_control is loaded let _pg_control = pg_control.context("pg_control file not found")?; modification.commit(ctx).await?; Ok(()) } pub async fn import_wal_from_tar( tline: &Timeline, reader: &mut (impl AsyncRead + Send + Sync + Unpin), start_lsn: Lsn, end_lsn: Lsn, ctx: &RequestContext, ) -> Result<()> { // Set up walingest mutable state let mut waldecoder = WalStreamDecoder::new(start_lsn, tline.pg_version); let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE); let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; let mut walingest = WalIngest::new(tline, start_lsn, ctx).await?; let shard = vec![*tline.get_shard_identity()]; // Ingest wal until end_lsn info!("importing wal until {}", end_lsn); let mut pg_wal_tar = Archive::new(reader); let mut pg_wal_entries = pg_wal_tar.entries()?; while last_lsn <= end_lsn { let bytes = { let mut entry = pg_wal_entries .next() .await .ok_or_else(|| anyhow::anyhow!("expected more wal"))??; let header = entry.header(); let file_path = header.path()?.into_owned(); match header.entry_type() { tokio_tar::EntryType::Regular => { // FIXME: assume postgresql tli 1 for now let expected_filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE); let file_name = file_path .file_name() .expect("missing wal filename") .to_string_lossy(); ensure!(expected_filename == file_name); debug!("processing wal file {:?}", file_path); read_all_bytes(&mut entry).await? 
} tokio_tar::EntryType::Directory => { debug!("directory {:?}", file_path); continue; } _ => { bail!( "entry {} in WAL tar archive is of unexpected type: {:?}", file_path.display(), header.entry_type() ); } } }; waldecoder.feed_bytes(&bytes[offset..]); let mut modification = tline.begin_modification_for_import(last_lsn); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, &shard, lsn, tline.pg_version, )? .remove(tline.get_shard_identity()) .unwrap(); walingest .ingest_record(interpreted, &mut modification, ctx) .await?; modification.commit(ctx).await?; last_lsn = lsn; debug!("imported record at {} (end {})", lsn, end_lsn); } } debug!("imported records up to {}", last_lsn); segno += 1; offset = 0; } if last_lsn != start_lsn { info!("reached end of WAL at {}", last_lsn); } else { info!("there was no WAL to import at {}", last_lsn); } // Log any extra unused files while let Some(e) = pg_wal_entries.next().await { let entry = e?; let header = entry.header(); let file_path = header.path()?.into_owned(); info!("skipping {:?}", file_path); } Ok(()) } async fn import_file( modification: &mut DatadirModification<'_>, file_path: &Path, reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, ctx: &RequestContext, ) -> Result> { let file_name = match file_path.file_name() { Some(name) => name.to_string_lossy(), None => return Ok(None), }; if file_name.starts_with('.') { // tar archives on macOs, created without COPYFILE_DISABLE=1 env var // will contain "fork files", skip them. return Ok(None); } if file_path.starts_with("global") { let spcnode = postgres_ffi_types::constants::GLOBALTABLESPACE_OID; let dbnode = 0; match file_name.as_ref() { "pg_control" => { let bytes = read_all_bytes(reader).await?; // Extract the checkpoint record and import it separately. 
let pg_control = ControlFileData::decode(&bytes[..])?; let checkpoint_bytes = pg_control.checkPointCopy.encode()?; modification.put_checkpoint(checkpoint_bytes)?; debug!("imported control file"); // Import it as ControlFile modification.put_control_file(bytes)?; return Ok(Some(pg_control)); } "pg_filenode.map" => { let bytes = read_all_bytes(reader).await?; modification .put_relmap_file(spcnode, dbnode, bytes, ctx) .await?; debug!("imported relmap file") } "PG_VERSION" => { debug!("ignored PG_VERSION file"); } _ => { import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?; debug!("imported rel creation"); } } } else if file_path.starts_with("base") { let spcnode = postgres_ffi_types::constants::DEFAULTTABLESPACE_OID; let dbnode: u32 = file_path .iter() .nth(1) .expect("invalid file path, expected dbnode") .to_string_lossy() .parse()?; match file_name.as_ref() { "pg_filenode.map" => { let bytes = read_all_bytes(reader).await?; modification .put_relmap_file(spcnode, dbnode, bytes, ctx) .await?; debug!("imported relmap file") } "PG_VERSION" => { debug!("ignored PG_VERSION file"); } _ => { import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?; debug!("imported rel creation"); } } } else if file_path.starts_with("pg_xact") { let slru = SlruKind::Clog; if modification.tline.tenant_shard_id.is_shard_zero() { import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported clog slru"); } } else if file_path.starts_with("pg_multixact/offsets") { let slru = SlruKind::MultiXactOffsets; if modification.tline.tenant_shard_id.is_shard_zero() { import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported multixact offsets slru"); } } else if file_path.starts_with("pg_multixact/members") { let slru = SlruKind::MultiXactMembers; if modification.tline.tenant_shard_id.is_shard_zero() { import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported multixact members 
slru"); } } else if file_path.starts_with("pg_twophase") { let bytes = read_all_bytes(reader).await?; // In PostgreSQL v17, this is a 64-bit FullTransactionid. In previous versions, // it's a 32-bit TransactionId, which fits in u64 anyway. let xid = u64::from_str_radix(file_name.as_ref(), 16)?; modification .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx) .await?; debug!("imported twophase file"); } else if file_path.starts_with("pg_wal") { debug!("found wal file in base section. ignore it"); } else if file_path.starts_with("zenith.signal") || file_path.starts_with("neon.signal") { // Parse zenith signal file to set correct previous LSN let bytes = read_all_bytes(reader).await?; // neon.signal format is "PREV LSN: prev_lsn" // TODO write serialization and deserialization in the same place. let neon_signal = std::str::from_utf8(&bytes)?.trim(); let prev_lsn = match neon_signal { "PREV LSN: none" => Lsn(0), "PREV LSN: invalid" => Lsn(0), other => { let split = other.split(':').collect::>(); split[1] .trim() .parse::() .context("can't parse neon.signal")? } }; // neon.signal is not necessarily the last file, that we handle // but it is ok to call `finish_write()`, because final `modification.commit()` // will update lsn once more to the final one. let writer = modification.tline.writer().await; writer.finish_write(prev_lsn); debug!("imported neon signal {}", prev_lsn); } else if file_path.starts_with("pg_tblspc") { // TODO Backups exported from neon won't have pg_tblspc, but we will need // this to import arbitrary postgres databases. 
bail!("Importing pg_tblspc is not implemented"); } else { debug!( "ignoring unrecognized file \"{}\" in tar archive", file_path.display() ); } Ok(None) } async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result { let mut buf: Vec = vec![]; reader.read_to_end(&mut buf).await?; Ok(Bytes::from(buf)) } ================================================ FILE: pageserver/src/l0_flush.rs ================================================ use std::num::NonZeroUsize; use std::sync::Arc; #[derive(Debug, PartialEq, Eq, Clone)] pub enum L0FlushConfig { Direct { max_concurrency: NonZeroUsize }, } impl Default for L0FlushConfig { fn default() -> Self { Self::Direct { // TODO: using num_cpus results in different peak memory usage on different instance types. max_concurrency: NonZeroUsize::new(usize::max(1, num_cpus::get())).unwrap(), } } } impl From for L0FlushConfig { fn from(config: pageserver_api::models::L0FlushConfig) -> Self { match config { pageserver_api::models::L0FlushConfig::Direct { max_concurrency } => { Self::Direct { max_concurrency } } } } } #[derive(Clone)] pub struct L0FlushGlobalState(Arc); pub enum Inner { Direct { semaphore: tokio::sync::Semaphore }, } impl L0FlushGlobalState { pub fn new(config: L0FlushConfig) -> Self { match config { L0FlushConfig::Direct { max_concurrency } => { let semaphore = tokio::sync::Semaphore::new(max_concurrency.get()); Self(Arc::new(Inner::Direct { semaphore })) } } } pub fn inner(&self) -> &Arc { &self.0 } } ================================================ FILE: pageserver/src/lib.rs ================================================ #![recursion_limit = "300"] #![deny(clippy::undocumented_unsafe_blocks)] mod auth; pub mod basebackup; pub mod basebackup_cache; pub mod config; pub mod consumption_metrics; pub mod context; pub mod controller_upcall_client; pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod feature_resolver; pub mod http; pub mod import_datadir; pub mod l0_flush; extern crate hyper0 as 
hyper; use futures::StreamExt; use futures::stream::FuturesUnordered; pub use pageserver_api::keyspace; use tokio_util::sync::CancellationToken; mod assert_u64_eq_usize; pub mod aux_file; pub mod metrics; pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; pub mod span; pub(crate) mod statvfs; pub mod task_mgr; pub mod tenant; pub mod utilization; pub mod virtual_file; pub mod walingest; pub mod walredo; use camino::Utf8Path; use deletion_queue::DeletionQueue; use postgres_ffi::PgMajorVersion; use tenant::mgr::{BackgroundPurges, TenantManager}; use tenant::secondary; use tracing::{info, info_span}; /// Current storage format version /// /// This is embedded in the header of all the layer files. /// If you make any backwards-incompatible changes to the storage /// format, bump this! /// Note that TimelineMetadata uses its own version number to track /// backwards-compatible changes to the metadata format. pub const STORAGE_FORMAT_VERSION: u16 = 3; pub const DEFAULT_PG_VERSION: PgMajorVersion = PgMajorVersion::PG17; // Magic constants used to identify different kinds of files pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; pub const DELTA_FILE_MAGIC: u16 = 0x5A61; // Target used for performance traces. 
pub const PERF_TRACE_TARGET: &str = "P"; static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; pub struct CancellableTask { pub task: tokio::task::JoinHandle<()>, pub cancel: CancellationToken, } pub struct HttpEndpointListener(pub CancellableTask); pub struct HttpsEndpointListener(pub CancellableTask); pub struct ConsumptionMetricsTasks(pub CancellableTask); pub struct DiskUsageEvictionTask(pub CancellableTask); // HADRON pub struct MetricsCollectionTask(pub CancellableTask); impl CancellableTask { pub async fn shutdown(self) { self.cancel.cancel(); self.task.await.unwrap(); } } #[tracing::instrument(skip_all, fields(%exit_code))] #[allow(clippy::too_many_arguments)] pub async fn shutdown_pageserver( http_listener: HttpEndpointListener, https_listener: Option, page_service: page_service::Listener, grpc_task: Option, metrics_collection_task: MetricsCollectionTask, consumption_metrics_worker: ConsumptionMetricsTasks, disk_usage_eviction_task: Option, tenant_manager: &TenantManager, background_purges: BackgroundPurges, mut deletion_queue: DeletionQueue, secondary_controller_tasks: secondary::GlobalTasks, exit_code: i32, ) { use std::time::Duration; let started_at = std::time::Instant::now(); // If the orderly shutdown below takes too long, we still want to make // sure that all walredo processes are killed and wait()ed on by us, not systemd. // // (Leftover walredo processes are the hypothesized trigger for the systemd freezes // that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387. // // We use a thread instead of a tokio task because the background runtime is likely busy // with the final flushing / uploads. This activity here has priority, and due to lack // of scheduling priority feature sin the tokio scheduler, using a separate thread is // an effective priority booster. 
let walredo_extraordinary_shutdown_thread_span = { let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread"); span.follows_from(tracing::Span::current()); span }; let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new(); let walredo_extraordinary_shutdown_thread = std::thread::spawn({ let walredo_extraordinary_shutdown_thread_cancel = walredo_extraordinary_shutdown_thread_cancel.clone(); move || { let rt = tokio::runtime::Builder::new_current_thread() .enable_all() .build() .unwrap(); let _entered = rt.enter(); let _entered = walredo_extraordinary_shutdown_thread_span.enter(); if let Ok(()) = rt.block_on(tokio::time::timeout( Duration::from_secs(8), walredo_extraordinary_shutdown_thread_cancel.cancelled(), )) { info!("cancellation requested"); return; } let managers = tenant::WALREDO_MANAGERS .lock() .unwrap() // prevents new walredo managers from being inserted .take() .expect("only we take()"); // Use FuturesUnordered to get in queue early for each manager's // heavier_once_cell semaphore wait list. // Also, for idle tenants that for some reason haven't // shut down yet, it's quite likely that we're not going // to get Poll::Pending once. let mut futs: FuturesUnordered<_> = managers .into_iter() .filter_map(|(_, mgr)| mgr.upgrade()) .map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await }) .collect(); info!(count=%futs.len(), "built FuturesUnordered"); let mut last_log_at = std::time::Instant::now(); #[derive(Debug, Default)] struct Results { initiated: u64, already: u64, } let mut results = Results::default(); while let Some(we_initiated) = rt.block_on(futs.next()) { if we_initiated { results.initiated += 1; } else { results.already += 1; } if last_log_at.elapsed() > Duration::from_millis(100) { info!(remaining=%futs.len(), ?results, "progress"); last_log_at = std::time::Instant::now(); } } info!(?results, "done"); } }); // Shut down the libpq endpoint task. 
This prevents new connections from // being accepted. let remaining_connections = timed( page_service.stop_accepting(), "shutdown LibpqEndpointListener", Duration::from_secs(1), ) .await; // Shut down the gRPC server task, including request handlers. if let Some(grpc_task) = grpc_task { timed( grpc_task.shutdown(), "shutdown gRPC PageRequestHandler", Duration::from_secs(3), ) .await; } // Shut down all the tenants. This flushes everything to disk and kills // the checkpoint and GC tasks. timed( tenant_manager.shutdown(), "shutdown all tenants", Duration::from_secs(5), ) .await; // Shut down any page service tasks: any in-progress work for particular timelines or tenants // should already have been cancelled via mgr::shutdown_all_tenants timed( remaining_connections.shutdown(), "shutdown PageRequestHandlers", Duration::from_secs(1), ) .await; // Best effort to persist any outstanding deletions, to avoid leaking objects deletion_queue.shutdown(Duration::from_secs(5)).await; // HADRON timed( metrics_collection_task.0.shutdown(), "shutdown metrics collections metrics", Duration::from_secs(1), ) .await; timed( consumption_metrics_worker.0.shutdown(), "shutdown consumption metrics", Duration::from_secs(1), ) .await; timed( futures::future::OptionFuture::from(disk_usage_eviction_task.map(|t| t.0.shutdown())), "shutdown disk usage eviction", Duration::from_secs(1), ) .await; timed( background_purges.shutdown(), "shutdown background purges", Duration::from_secs(1), ) .await; if let Some(https_listener) = https_listener { timed( https_listener.0.shutdown(), "shutdown https", Duration::from_secs(1), ) .await; } // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. // FIXME: We should probably stop accepting commands like attach/detach earlier.
timed( http_listener.0.shutdown(), "shutdown http", Duration::from_secs(1), ) .await; timed( secondary_controller_tasks.wait(), // cancellation happened in caller "secondary controller wait", Duration::from_secs(1), ) .await; // There should be nothing left, but let's be sure timed( task_mgr::shutdown_tasks(None, None, None), "shutdown leftovers", Duration::from_secs(1), ) .await; info!("cancel & join walredo_extraordinary_shutdown_thread"); walredo_extraordinary_shutdown_thread_cancel.cancel(); walredo_extraordinary_shutdown_thread.join().unwrap(); info!("walredo_extraordinary_shutdown_thread done"); info!( elapsed_ms = started_at.elapsed().as_millis(), "Shut down successfully completed" ); std::process::exit(exit_code); } /// Per-tenant configuration file. /// Full path: `tenants/<tenant_id>/config-v1`. pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1"; /// Per-tenant copy of their remote heatmap, downloaded into the local /// tenant path while in secondary mode. pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; /// A suffix used for various temporary files. Any temporary files found in the /// data directory at pageserver startup can be automatically removed. pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp"; /** Returns true if the final component of `path` ends with [`TEMP_FILE_SUFFIX`]; returns false otherwise, including when `path` has no file name. */ pub fn is_temporary(path: &Utf8Path) -> bool { match path.file_name() { Some(name) => name.ends_with(TEMP_FILE_SUFFIX), None => false, } } /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by /// blocking. /// /// The instances of this value exist only during startup, otherwise `None` is provided, meaning no /// delaying is needed. #[derive(Clone)] pub struct InitializationOrder { /// Each initial tenant load task carries this until it is done loading timelines from remote storage pub initial_tenant_load_remote: Option, /// Each initial tenant load task carries this until completion. pub initial_tenant_load: Option, /// Barrier for when we can start any background jobs.
/// /// This can be broken up later on, but right now there is just one class of background job. pub background_jobs_can_start: utils::completion::Barrier, } /// Time the future with a warning when it exceeds a threshold. /// /// `fut` is always awaited to completion and its output is returned; `warn_at` only affects logging. /// If `fut` finishes within `warn_at`, a single info event is emitted. Otherwise an info event is /// emitted once `warn_at` has elapsed and a warn event is emitted when `fut` finally completes. async fn timed( fut: Fut, name: &str, warn_at: std::time::Duration, ) -> ::Output { let started = std::time::Instant::now(); let mut fut = std::pin::pin!(fut); match tokio::time::timeout(warn_at, &mut fut).await { Ok(ret) => { tracing::info!( stage = name, elapsed_ms = started.elapsed().as_millis(), "completed" ); ret } Err(_) => { tracing::info!( stage = name, elapsed_ms = started.elapsed().as_millis(), "still waiting, taking longer than expected..." ); let ret = fut.await; // this has a global allowed_errors tracing::warn!( stage = name, elapsed_ms = started.elapsed().as_millis(), "completed, took longer than expected" ); ret } } } /// Like [`timed`], but the warning timeout only starts after `cancel` has been cancelled. async fn timed_after_cancellation( fut: Fut, name: &str, warn_at: std::time::Duration, cancel: &CancellationToken, ) -> ::Output { let mut fut = std::pin::pin!(fut); tokio::select!
{ _ = cancel.cancelled() => { timed(fut, name, warn_at).await } ret = &mut fut => { ret } } } #[cfg(test)] mod timed_tests { use std::time::Duration; use super::timed; #[tokio::test] async fn timed_completes_when_inner_future_completes() { // A future that completes on time should have its result returned let r1 = timed( async move { tokio::time::sleep(Duration::from_millis(10)).await; 123 }, "test 1", Duration::from_millis(50), ) .await; assert_eq!(r1, 123); // A future that completes too slowly should also have its result returned let r1 = timed( async move { tokio::time::sleep(Duration::from_millis(50)).await; 456 }, "test 1", Duration::from_millis(10), ) .await; assert_eq!(r1, 456); } } ================================================ FILE: pageserver/src/metrics.rs ================================================ use std::cell::Cell; use std::collections::HashMap; use std::num::NonZeroUsize; use std::os::fd::RawFd; use std::sync::atomic::AtomicU64; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; use enum_map::{Enum as _, EnumMap}; use futures::Future; use metrics::{ Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, }; use once_cell::sync::Lazy; use pageserver_api::config::defaults::DEFAULT_MAX_GET_VECTORED_KEYS; use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy, }; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; use postgres_backend::{QueryError, is_expected_io_error}; use 
pq_proto::framed::ConnectionError; use strum::{EnumCount, IntoEnumIterator as _, VariantNames}; use strum_macros::{IntoStaticStr, VariantNames}; use utils::id::TimelineId; use crate::config; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext}; use crate::pgdatadir_mapping::DatadirModificationStats; use crate::task_mgr::TaskKind; use crate::tenant::layer_map::LayerMap; use crate::tenant::mgr::TenantSlot; use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc}; use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::throttle::ThrottleResult; /// Prometheus histogram buckets (in seconds) for operations in the critical /// path. In other words, operations that directly affect the latency of user /// queries. /// /// The buckets capture the majority of latencies in the microsecond and /// millisecond range but also extend far enough up to distinguish "bad" from /// "really bad". const CRITICAL_OP_BUCKETS: &[f64] = &[ 0.000_001, 0.000_010, 0.000_100, // 1 us, 10 us, 100 us 0.001_000, 0.010_000, 0.100_000, // 1 ms, 10 ms, 100 ms 1.0, 10.0, 100.0, // 1 s, 10 s, 100 s ]; // Metrics collected on operations on the storage repository.
#[derive(Debug, VariantNames, IntoStaticStr)] #[strum(serialize_all = "kebab_case")] pub(crate) enum StorageTimeOperation { #[strum(serialize = "layer flush")] LayerFlush, #[strum(serialize = "layer flush delay")] LayerFlushDelay, #[strum(serialize = "compact")] Compact, #[strum(serialize = "create images")] CreateImages, #[strum(serialize = "logical size")] LogicalSize, #[strum(serialize = "imitate logical size")] ImitateLogicalSize, #[strum(serialize = "load layer map")] LoadLayerMap, #[strum(serialize = "gc")] Gc, #[strum(serialize = "find gc cutoffs")] FindGcCutoffs, } pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { register_counter_vec!( "pageserver_storage_operations_seconds_sum", "Total time spent on storage operations with operation, tenant and timeline dimensions", &["operation", "tenant_id", "shard_id", "timeline_id"], ) .expect("failed to define a metric") }); pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_storage_operations_seconds_count", "Count of storage operations with operation, tenant and timeline dimensions", &["operation", "tenant_id", "shard_id", "timeline_id"], ) .expect("failed to define a metric") }); /* BEGIN_HADRON */ pub(crate) static STORAGE_ACTIVE_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_active_storage_operations_count", "Count of active storage operations with operation, tenant and timeline dimensions", &["operation", "tenant_id", "shard_id", "timeline_id"], ) .expect("failed to define a metric") }); /*END_HADRON */ // Buckets for background operations like compaction, GC, size calculation const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0]; pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_storage_operations_seconds_global", "Time spent on storage operations", &["operation"], STORAGE_OP_BUCKETS.into(), ) .expect("failed to define a 
metric") }); /// Measures layers visited per read (i.e. read amplification). /// /// NB: for a batch, we count all visited layers towards each read. While the cost of layer visits /// are amortized across the batch, and some layers may not intersect with a given key, each visited /// layer contributes directly to the observed latency for every read in the batch, which is what we /// care about. pub(crate) static LAYERS_PER_READ: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_layers_per_read", "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.", &["tenant_id", "shard_id", "timeline_id"], // Low resolution to reduce cardinality. vec![4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0], ) .expect("failed to define a metric") }); pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { register_histogram!( "pageserver_layers_per_read_global", "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.", vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], ) .expect("failed to define a metric") }); pub(crate) static LAYERS_PER_READ_BATCH_GLOBAL: Lazy = Lazy::new(|| { register_histogram!( "pageserver_layers_per_read_batch_global", "Layers visited to serve a single read batch (read amplification), regardless of number of reads.", vec![ 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0 ], ) .expect("failed to define a metric") }); pub(crate) static LAYERS_PER_READ_AMORTIZED_GLOBAL: Lazy = Lazy::new(|| { register_histogram!( "pageserver_layers_per_read_amortized_global", "Layers visited to serve a single read (read amplification). 
Amortized across a batch: \ all visited layers are divided by number of reads.", vec![ 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0 ], ) .expect("failed to define a metric") }); pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { // We expect this to be low because of Postgres checkpoints. Let's see if that holds. register_histogram!( "pageserver_deltas_per_read_global", "Number of delta pages applied to image page per read", vec![0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0], ) .expect("failed to define a metric") }); pub(crate) static CONCURRENT_INITDBS: Lazy = Lazy::new(|| { register_uint_gauge!( "pageserver_concurrent_initdb", "Number of initdb processes running" ) .expect("failed to define a metric") }); pub(crate) static INITDB_SEMAPHORE_ACQUISITION_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_initdb_semaphore_seconds_global", "Time spent getting a permit from the global initdb semaphore", STORAGE_OP_BUCKETS.into() ) .expect("failed to define metric") }); pub(crate) static INITDB_RUN_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_initdb_seconds_global", "Time spent performing initdb", STORAGE_OP_BUCKETS.into() ) .expect("failed to define metric") }); pub(crate) struct GetVectoredLatency { map: EnumMap>, } #[allow(dead_code)] pub(crate) struct ScanLatency { map: EnumMap>, } impl GetVectoredLatency { // Only these task types perform vectored gets. Filter all other tasks out to reduce total // cardinality of the metric. const TRACKED_TASK_KINDS: [TaskKind; 2] = [TaskKind::Compaction, TaskKind::PageRequestHandler]; pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> { self.map[task_kind].as_ref() } } impl ScanLatency { // Only these task types perform vectored gets. Filter all other tasks out to reduce total // cardinality of the metric. 
const TRACKED_TASK_KINDS: [TaskKind; 1] = [TaskKind::PageRequestHandler]; pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> { self.map[task_kind].as_ref() } } pub(crate) struct ScanLatencyOngoingRecording<'a> { parent: &'a Histogram, start: std::time::Instant, } impl<'a> ScanLatencyOngoingRecording<'a> { pub(crate) fn start_recording(parent: &'a Histogram) -> ScanLatencyOngoingRecording<'a> { let start = Instant::now(); ScanLatencyOngoingRecording { parent, start } } pub(crate) fn observe(self) { let elapsed = self.start.elapsed(); self.parent.observe(elapsed.as_secs_f64()); } } pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_get_vectored_seconds", "Time spent in get_vectored.", &["task_kind"], CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric"); GetVectoredLatency { map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { let task_kind = TaskKind::from_usize(task_kind_idx); if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) { let task_kind = task_kind.into(); Some(inner.with_label_values(&[task_kind])) } else { None } })), } }); pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_scan_seconds", "Time spent in scan.", &["task_kind"], CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric"); ScanLatency { map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { let task_kind = TaskKind::from_usize(task_kind_idx); if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) { let task_kind = task_kind.into(); Some(inner.with_label_values(&[task_kind])) } else { None } })), } }); pub(crate) struct PageCacheMetricsForTaskKind { pub read_accesses_immutable: IntCounter, pub read_hits_immutable: IntCounter, } pub(crate) struct PageCacheMetrics { map: EnumMap>, } static PAGE_CACHE_READ_HITS: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_page_cache_read_hits_total", "Number of 
read accesses to the page cache that hit", &["task_kind", "key_kind", "content_kind", "hit_kind"] ) .expect("failed to define a metric") }); static PAGE_CACHE_READ_ACCESSES: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_page_cache_read_accesses_total", "Number of read accesses to the page cache", &["task_kind", "key_kind", "content_kind"] ) .expect("failed to define a metric") }); pub(crate) static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMetrics { map: EnumMap::from_array(std::array::from_fn(|task_kind| { let task_kind = TaskKind::from_usize(task_kind); let task_kind: &'static str = task_kind.into(); EnumMap::from_array(std::array::from_fn(|content_kind| { let content_kind = PageContentKind::from_usize(content_kind); let content_kind: &'static str = content_kind.into(); PageCacheMetricsForTaskKind { read_accesses_immutable: { PAGE_CACHE_READ_ACCESSES .get_metric_with_label_values(&[task_kind, "immutable", content_kind]) .unwrap() }, read_hits_immutable: { PAGE_CACHE_READ_HITS .get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"]) .unwrap() }, } })) })), }); impl PageCacheMetrics { pub(crate) fn for_ctx(&self, ctx: &RequestContext) -> &PageCacheMetricsForTaskKind { &self.map[ctx.task_kind()][ctx.page_content_kind()] } } pub(crate) struct PageCacheSizeMetrics { pub max_bytes: UIntGauge, pub current_bytes_immutable: UIntGauge, } static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_page_cache_size_current_bytes", "Current size of the page cache in bytes, by key kind", &["key_kind"] ) .expect("failed to define a metric") }); pub(crate) static PAGE_CACHE_SIZE: Lazy = Lazy::new(|| PageCacheSizeMetrics { max_bytes: { register_uint_gauge!( "pageserver_page_cache_size_max_bytes", "Maximum size of the page cache in bytes" ) .expect("failed to define a metric") }, current_bytes_immutable: { PAGE_CACHE_SIZE_CURRENT_BYTES .get_metric_with_label_values(&["immutable"]) .unwrap() }, }); pub(crate) mod 
page_cache_eviction_metrics { use std::num::NonZeroUsize; use metrics::{IntCounter, IntCounterVec, register_int_counter_vec}; use once_cell::sync::Lazy; #[derive(Clone, Copy)] pub(crate) enum Outcome { FoundSlotUnused { iters: NonZeroUsize }, FoundSlotEvicted { iters: NonZeroUsize }, ItersExceeded { iters: NonZeroUsize }, } static ITERS_TOTAL_VEC: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_page_cache_find_victim_iters_total", "Counter for the number of iterations in the find_victim loop", &["outcome"], ) .expect("failed to define a metric") }); static CALLS_VEC: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_page_cache_find_victim_calls", "Incremented at the end of each find_victim() call.\ Filter by outcome to get e.g., eviction rate.", &["outcome"] ) .unwrap() }); pub(crate) fn observe(outcome: Outcome) { macro_rules! dry { ($label:literal, $iters:expr) => {{ static LABEL: &'static str = $label; static ITERS_TOTAL: Lazy = Lazy::new(|| ITERS_TOTAL_VEC.with_label_values(&[LABEL])); static CALLS: Lazy = Lazy::new(|| CALLS_VEC.with_label_values(&[LABEL])); ITERS_TOTAL.inc_by(($iters.get()) as u64); CALLS.inc(); }}; } match outcome { Outcome::FoundSlotUnused { iters } => dry!("found_empty", iters), Outcome::FoundSlotEvicted { iters } => { dry!("found_evicted", iters) } Outcome::ItersExceeded { iters } => { dry!("err_iters_exceeded", iters); super::page_cache_errors_inc(super::PageCacheErrorKind::EvictIterLimit); } } } } static PAGE_CACHE_ERRORS: Lazy = Lazy::new(|| { register_int_counter_vec!( "page_cache_errors_total", "Number of timeouts while acquiring a pinned slot in the page cache", &["error_kind"] ) .expect("failed to define a metric") }); pub(crate) static FEATURE_FLAG_EVALUATION: Lazy = Lazy::new(|| { register_counter_vec!( "pageserver_feature_flag_evaluation", "Number of times a feature flag is evaluated", &["flag_key", "status", "value"], ) .unwrap() }); #[derive(IntoStaticStr)] #[strum(serialize_all = "kebab_case")] 
pub(crate) enum PageCacheErrorKind { AcquirePinnedSlotTimeout, EvictIterLimit, } pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) { PAGE_CACHE_ERRORS .get_metric_with_label_values(&[error_kind.into()]) .unwrap() .inc(); } pub(crate) static WAIT_LSN_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wait_lsn_seconds", "Time spent waiting for WAL to arrive. Updated on completion of the wait_lsn operation.", CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric") }); pub(crate) static WAIT_LSN_START_FINISH_COUNTERPAIR: Lazy = Lazy::new(|| { register_int_counter_pair_vec!( "pageserver_wait_lsn_started_count", "Number of wait_lsn operations started.", "pageserver_wait_lsn_finished_count", "Number of wait_lsn operations finished.", &["tenant_id", "shard_id", "timeline_id"], ) .expect("failed to define a metric") }); pub(crate) static WAIT_LSN_IN_PROGRESS_MICROS: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_wait_lsn_in_progress_micros", "Time spent waiting for WAL to arrive, by timeline_id. Updated periodically while waiting.", &["tenant_id", "shard_id", "timeline_id"], ) .expect("failed to define a metric") }); pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_wait_lsn_in_progress_micros_global", "Time spent waiting for WAL to arrive, globally. Updated periodically while waiting." 
) .expect("failed to define a metric") }); pub(crate) static ONDEMAND_DOWNLOAD_BYTES: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_ondemand_download_bytes_total", "Total bytes of layers on-demand downloaded", &["task_kind"] ) .expect("failed to define a metric") }); pub(crate) static ONDEMAND_DOWNLOAD_COUNT: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_ondemand_download_count", "Total count of layers on-demand downloaded", &["task_kind"] ) .expect("failed to define a metric") }); pub(crate) mod wait_ondemand_download_time { use super::*; const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[ 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, // 10 ms - 100ms 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, // 100ms to 1s 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, // 1s to 10s 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, // 10s to 1m ]; /// The task kinds for which we want to track wait times for on-demand downloads. /// Other task kinds' wait times are accumulated in label value `unknown`. pub(crate) const WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS: [TaskKind; 2] = [ TaskKind::PageRequestHandler, TaskKind::WalReceiverConnectionHandler, ]; pub(crate) static WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL: Lazy> = Lazy::new(|| { let histo = register_histogram_vec!( "pageserver_wait_ondemand_download_seconds_global", "Observations are individual tasks' wait times for on-demand downloads. 
\ If N tasks coalesce on an on-demand download, and it takes 10s, than we observe N * 10s.", &["task_kind"], WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS.into(), ) .expect("failed to define a metric"); WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS .iter() .map(|task_kind| histo.with_label_values(&[task_kind.into()])) .collect::>() }); pub(crate) static WAIT_ONDEMAND_DOWNLOAD_TIME_SUM: Lazy = Lazy::new(|| { register_counter_vec!( // use a name that _could_ be evolved into a per-timeline histogram later "pageserver_wait_ondemand_download_seconds_sum", "Like `pageserver_wait_ondemand_download_seconds_global` but per timeline", &["tenant_id", "shard_id", "timeline_id", "task_kind"], ) .unwrap() }); pub struct WaitOndemandDownloadTimeSum { counters: [Counter; WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS.len()], } impl WaitOndemandDownloadTimeSum { pub(crate) fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self { let counters = WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS .iter() .map(|task_kind| { WAIT_ONDEMAND_DOWNLOAD_TIME_SUM .get_metric_with_label_values(&[ tenant_id, shard_id, timeline_id, task_kind.into(), ]) .unwrap() }) .collect::>(); Self { counters: counters.try_into().unwrap(), } } pub(crate) fn observe(&self, task_kind: TaskKind, duration: Duration) { let maybe = WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS .iter() .enumerate() .find(|(_, kind)| **kind == task_kind); let Some((idx, _)) = maybe else { return; }; WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL[idx].observe(duration.as_secs_f64()); let counter = &self.counters[idx]; counter.inc_by(duration.as_secs_f64()); } } pub(crate) fn shutdown_timeline(tenant_id: &str, shard_id: &str, timeline_id: &str) { for task_kind in WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS { let _ = WAIT_ONDEMAND_DOWNLOAD_TIME_SUM.remove_label_values(&[ tenant_id, shard_id, timeline_id, task_kind.into(), ]); } } pub(crate) fn preinitialize_global_metrics() { Lazy::force(&WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL); } } static LAST_RECORD_LSN: Lazy = Lazy::new(|| { 
register_int_gauge_vec!( "pageserver_last_record_lsn", "Last record LSN grouped by timeline", &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") }); static DISK_CONSISTENT_LSN: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_disk_consistent_lsn", "Disk consistent LSN grouped by timeline", &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") }); pub(crate) static PROJECTED_REMOTE_CONSISTENT_LSN: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_projected_remote_consistent_lsn", "Projected remote consistent LSN grouped by timeline", &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") }); static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_pitr_history_size", "Data written since PITR cutoff on this timeline", &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") }); #[derive( strum_macros::EnumIter, strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr, )] #[strum(serialize_all = "kebab_case")] pub(crate) enum LayerKind { Delta, Image, } #[derive( strum_macros::EnumIter, strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr, )] #[strum(serialize_all = "kebab_case")] pub(crate) enum LayerLevel { // We don't track the currently open ephemeral layer, since there's always exactly 1 and its // size changes. See `TIMELINE_EPHEMERAL_BYTES`. 
Frozen, L0, L1, } static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_layer_bytes", "Sum of frozen, L0, and L1 layer physical sizes in bytes (excluding the open ephemeral layer)", &["tenant_id", "shard_id", "timeline_id", "level", "kind"] ) .expect("failed to define a metric") }); static TIMELINE_LAYER_COUNT: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_layer_count", "Number of frozen, L0, and L1 layers (excluding the open ephemeral layer)", &["tenant_id", "shard_id", "timeline_id", "level", "kind"] ) .expect("failed to define a metric") }); static TIMELINE_ARCHIVE_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_archive_size", "Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero", &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") }); static STANDBY_HORIZON: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_standby_horizon", "Standby apply LSN for which GC is hold off, by timeline.", &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") }); static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_resident_physical_size", "The size of the layer files present in the pageserver's filesystem, for attached locations.", &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") }); static VISIBLE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_visible_physical_size", "The size of the layer files present in the pageserver's filesystem.", &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") }); pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy = Lazy::new(|| { register_uint_gauge!( "pageserver_resident_physical_size_global", "Like `pageserver_resident_physical_size`, but without tenant/timeline dimensions." 
) .expect("failed to define a metric") }); static REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_remote_physical_size", "The size of the layer files present in the remote storage that are listed in the remote index_part.json.", // Corollary: If any files are missing from the index part, they won't be included here. &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") }); static REMOTE_PHYSICAL_SIZE_GLOBAL: Lazy = Lazy::new(|| { register_uint_gauge!( "pageserver_remote_physical_size_global", "Like `pageserver_remote_physical_size`, but without tenant/timeline dimensions." ) .expect("failed to define a metric") }); pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_remote_ondemand_downloaded_layers_total", "Total on-demand downloaded layers" ) .unwrap() }); pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_remote_ondemand_downloaded_bytes_total", "Total bytes of layers on-demand downloaded", ) .unwrap() }); static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_current_logical_size", "Current logical size grouped by timeline", &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define current logical size metric") }); static AUX_FILE_SIZE: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_aux_file_estimated_size", "The size of all aux files for a timeline in aux file v2 store.", &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") }); static VALID_LSN_LEASE_COUNT: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_valid_lsn_lease_count", "The number of valid leases after refreshing gc info.", &["tenant_id", "shard_id", "timeline_id"], ) .expect("failed to define a metric") }); pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_circuit_breaker_broken", 
"How many times a circuit breaker has broken" ) .expect("failed to define a metric") }); pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_circuit_breaker_unbroken", "How many times a circuit breaker has been un-broken (recovered)" ) .expect("failed to define a metric") }); pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_compression_image_in_bytes_total", "Size of data written into image layers before compression" ) .expect("failed to define a metric") }); pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_compression_image_in_bytes_considered", "Size of potentially compressible data written into image layers before compression" ) .expect("failed to define a metric") }); pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_compression_image_in_bytes_chosen", "Size of data whose compressed form was written into image layers" ) .expect("failed to define a metric") }); pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_compression_image_out_bytes_total", "Size of compressed image layer written" ) .expect("failed to define a metric") }); pub(crate) static RELSIZE_LATEST_CACHE_ENTRIES: Lazy = Lazy::new(|| { register_uint_gauge!( "pageserver_relsize_latest_cache_entries", "Number of entries in the latest relation size cache", ) .expect("failed to define a metric") }); pub(crate) static RELSIZE_LATEST_CACHE_HITS: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_relsize_latest_cache_hits", "Latest relation size cache hits", ) .expect("failed to define a metric") }); pub(crate) static RELSIZE_LATEST_CACHE_MISSES: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_relsize_latest_cache_misses", "Relation size latest cache misses", ) .expect("failed to define a metric") }); 
pub(crate) static RELSIZE_SNAPSHOT_CACHE_ENTRIES: Lazy = Lazy::new(|| { register_uint_gauge!( "pageserver_relsize_snapshot_cache_entries", "Number of entries in the pitr relation size cache", ) .expect("failed to define a metric") }); pub(crate) static RELSIZE_SNAPSHOT_CACHE_HITS: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_relsize_snapshot_cache_hits", "Pitr relation size cache hits", ) .expect("failed to define a metric") }); pub(crate) static RELSIZE_SNAPSHOT_CACHE_MISSES: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_relsize_snapshot_cache_misses", "Relation size snapshot cache misses", ) .expect("failed to define a metric") }); pub(crate) static RELSIZE_CACHE_MISSES_OLD: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_relsize_cache_misses_old", "Relation size cache misses where the lookup LSN is older than the last relation update" ) .expect("failed to define a metric") }); pub(crate) mod initial_logical_size { use metrics::{IntCounter, IntCounterVec, register_int_counter, register_int_counter_vec}; use once_cell::sync::Lazy; pub(crate) struct StartCalculation(IntCounterVec); pub(crate) static START_CALCULATION: Lazy = Lazy::new(|| { StartCalculation( register_int_counter_vec!( "pageserver_initial_logical_size_start_calculation", "Incremented each time we start an initial logical size calculation attempt. 
\ The `circumstances` label provides some additional details.", &["attempt", "circumstances"] ) .unwrap(), ) }); struct DropCalculation { first: IntCounter, retry: IntCounter, } static DROP_CALCULATION: Lazy = Lazy::new(|| { let vec = register_int_counter_vec!( "pageserver_initial_logical_size_drop_calculation", "Incremented each time we abort a started size calculation attmpt.", &["attempt"] ) .unwrap(); DropCalculation { first: vec.with_label_values(&["first"]), retry: vec.with_label_values(&["retry"]), } }); pub(crate) struct Calculated { pub(crate) births: IntCounter, pub(crate) deaths: IntCounter, } pub(crate) static CALCULATED: Lazy = Lazy::new(|| Calculated { births: register_int_counter!( "pageserver_initial_logical_size_finish_calculation", "Incremented every time we finish calculation of initial logical size.\ If everything is working well, this should happen at most once per Timeline object." ) .unwrap(), deaths: register_int_counter!( "pageserver_initial_logical_size_drop_finished_calculation", "Incremented when we drop a finished initial logical size calculation result.\ Mainly useful to turn pageserver_initial_logical_size_finish_calculation into a gauge." 
) .unwrap(), }); pub(crate) struct OngoingCalculationGuard { inc_drop_calculation: Option, } #[derive(strum_macros::IntoStaticStr)] pub(crate) enum StartCircumstances { EmptyInitial, SkippedConcurrencyLimiter, AfterBackgroundTasksRateLimit, } impl StartCalculation { pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard { let circumstances_label: &'static str = circumstances.into(); self.0 .with_label_values(&["first", circumstances_label]) .inc(); OngoingCalculationGuard { inc_drop_calculation: Some(DROP_CALCULATION.first.clone()), } } pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard { let circumstances_label: &'static str = circumstances.into(); self.0 .with_label_values(&["retry", circumstances_label]) .inc(); OngoingCalculationGuard { inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()), } } } impl Drop for OngoingCalculationGuard { fn drop(&mut self) { if let Some(counter) = self.inc_drop_calculation.take() { counter.inc(); } } } impl OngoingCalculationGuard { pub(crate) fn calculation_result_saved(mut self) -> FinishedCalculationGuard { drop(self.inc_drop_calculation.take()); CALCULATED.births.inc(); FinishedCalculationGuard { inc_on_drop: CALCULATED.deaths.clone(), } } } pub(crate) struct FinishedCalculationGuard { inc_on_drop: IntCounter, } impl Drop for FinishedCalculationGuard { fn drop(&mut self) { self.inc_on_drop.inc(); } } // context: https://github.com/neondatabase/neon/issues/5963 pub(crate) static TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_initial_logical_size_timelines_where_walreceiver_got_approximate_size", "Counter for the following event: walreceiver calls\ Timeline::get_current_logical_size() and it returns `Approximate` for the first time." 
) .unwrap() }); } static DIRECTORY_ENTRIES_COUNT: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_directory_entries_count", "Sum of the entries in pageserver-stored directory listings", &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") }); pub(crate) static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_tenant_states_count", "Count of tenants per state", &["state"] ) .expect("Failed to register pageserver_tenant_states_count metric") }); pub(crate) static TIMELINE_STATE_METRIC: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_timeline_states_count", "Count of timelines per state", &["state"] ) .expect("Failed to register pageserver_timeline_states_count metric") }); /// A set of broken tenants. /// /// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken /// tenant. pub(crate) static BROKEN_TENANTS_SET: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_broken_tenants_count", "Set of broken tenants", &["tenant_id", "shard_id"] ) .expect("Failed to register pageserver_tenant_states_count metric") }); pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_tenant_synthetic_cached_size_bytes", "Synthetic size of each tenant in bytes", &["tenant_id"] ) .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric") }); pub(crate) static TENANT_OFFLOADED_TIMELINES: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_tenant_offloaded_timelines", "Number of offloaded timelines of a tenant", &["tenant_id", "shard_id"] ) .expect("Failed to register pageserver_tenant_offloaded_timelines metric") }); pub(crate) static EVICTION_ITERATION_DURATION: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_eviction_iteration_duration_seconds_global", "Time spent on a single eviction iteration", &["period_secs", "threshold_secs"], STORAGE_OP_BUCKETS.into(), ) 
.expect("failed to define a metric") });

/// Per-timeline counter of evicted layers.
static EVICTIONS: Lazy = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_evictions",
        "Number of layers evicted from the pageserver",
        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

/// Per-timeline counter of evictions of layers that were resident for less than a threshold.
/// The data source and the threshold (in seconds) are part of the label set.
static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_evictions_with_low_residence_duration",
        "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
         Residence duration is determined using the `residence_duration_data_source`.",
        &["tenant_id", "shard_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
    )
    .expect("failed to define a metric")
});

pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy = Lazy::new(|| {
    register_int_counter!(
        "pageserver_unexpected_ondemand_downloads_count",
        "Number of unexpected on-demand downloads. \
         We log more context for each increment, so, forgo any labels in this metric.",
    )
    .expect("failed to define a metric")
});

/// How long did we take to start up?  Broken down by labels to describe
/// different phases of startup.
pub static STARTUP_DURATION: Lazy = Lazy::new(|| {
    register_gauge_vec!(
        "pageserver_startup_duration_seconds",
        "Time taken by phases of pageserver startup, in seconds",
        &["phase"]
    )
    .expect("Failed to register pageserver_startup_duration_seconds metric")
});

pub static STARTUP_IS_LOADING: Lazy = Lazy::new(|| {
    register_uint_gauge!(
        "pageserver_startup_is_loading",
        "1 while in initial startup load of tenants, 0 at other times"
    )
    .expect("Failed to register pageserver_startup_is_loading")
});

pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy = Lazy::new(|| {
    register_uint_gauge!(
        "pageserver_timeline_ephemeral_bytes",
        "Total number of bytes in ephemeral layers, summed for all timelines.  Approximate, lazily updated."
    )
    .expect("Failed to register metric")
});

/// Metrics related to the lifecycle of a [`crate::tenant::TenantShard`] object: things
/// like how long it took to load.
///
/// Note that these are process-global metrics, _not_ per-tenant metrics.  Per-tenant
/// metrics are rather expensive, and usually fine grained stuff makes more sense
/// at a timeline level than tenant level.
pub(crate) struct TenantMetrics {
    /// How long did tenants take to go from construction to active state?
    pub(crate) activation: Histogram,
    /// Time taken to load remote metadata on startup/attach.
    pub(crate) preload: Histogram,
    /// Time taken to initialize after remote metadata is already loaded.
    pub(crate) attach: Histogram,

    /// How many tenants are included in the initial startup of the pageserver?
    pub(crate) startup_scheduled: IntCounter,
    /// How many of those have completed warm-up or were activated on demand.
    pub(crate) startup_complete: IntCounter,
}

pub(crate) static TENANT: Lazy = Lazy::new(|| {
    TenantMetrics {
        activation: register_histogram!(
            "pageserver_tenant_activation_seconds",
            "Time taken by tenants to activate, in seconds",
            CRITICAL_OP_BUCKETS.into()
        )
        .expect("Failed to register metric"),
        preload: register_histogram!(
            "pageserver_tenant_preload_seconds",
            "Time taken by tenants to load remote metadata on startup/attach, in seconds",
            CRITICAL_OP_BUCKETS.into()
        )
        .expect("Failed to register metric"),
        attach: register_histogram!(
            "pageserver_tenant_attach_seconds",
            "Time taken by tenants to initialize, after remote metadata is already loaded",
            CRITICAL_OP_BUCKETS.into()
        )
        .expect("Failed to register metric"),
        startup_scheduled: register_int_counter!(
            "pageserver_tenant_startup_scheduled",
            "Number of tenants included in pageserver startup (doesn't count tenants attached later)"
        ).expect("Failed to register metric"),
        // The help text refers to the sibling counter by the exact name it is registered under
        // above (there is no `_total` suffix on it).
        startup_complete: register_int_counter!(
            "pageserver_tenant_startup_complete",
            "Number of tenants that have completed warm-up, or activated on-demand during initial startup: \
             should eventually reach `pageserver_tenant_startup_scheduled`.  Does not include broken \
             tenants: such cases will lead to this metric never reaching the scheduled count."
).expect("Failed to register metric"), } });

/// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
///
/// Holds the counter child for one `(tenant, shard, timeline, data_source, threshold)` label
/// combination. `counter` becomes `None` once the series has been removed from the vec.
#[derive(Debug)]
pub(crate) struct EvictionsWithLowResidenceDuration {
    data_source: &'static str,
    threshold: Duration,
    counter: Option,
}

/// Carries the label values that are known before the tenant/shard/timeline ids are.
pub(crate) struct EvictionsWithLowResidenceDurationBuilder {
    data_source: &'static str,
    threshold: Duration,
}

impl EvictionsWithLowResidenceDurationBuilder {
    pub fn new(data_source: &'static str, threshold: Duration) -> Self {
        Self {
            data_source,
            threshold,
        }
    }

    /// Resolves the counter child for the given ids; panics if the label lookup fails.
    fn build(
        &self,
        tenant_id: &str,
        shard_id: &str,
        timeline_id: &str,
    ) -> EvictionsWithLowResidenceDuration {
        let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
            .get_metric_with_label_values(&[
                tenant_id,
                shard_id,
                timeline_id,
                self.data_source,
                &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
            ])
            .unwrap();
        EvictionsWithLowResidenceDuration {
            data_source: self.data_source,
            threshold: self.threshold,
            counter: Some(counter),
        }
    }
}

impl EvictionsWithLowResidenceDuration {
    /// Label value for a threshold: its whole seconds, in decimal.
    fn threshold_label_value(threshold: Duration) -> String {
        threshold.as_secs().to_string()
    }

    /// Increments the counter if `observed_value` is strictly below the threshold.
    ///
    /// Panics if called after the series was removed.
    pub fn observe(&self, observed_value: Duration) {
        if observed_value < self.threshold {
            self.counter
                .as_ref()
                .expect("nobody calls this function after `remove_from_vec`")
                .inc();
        }
    }

    /// Switches to the series for `new_threshold` and removes the series of the old threshold.
    /// No-op if the threshold is unchanged.
    pub fn change_threshold(
        &mut self,
        tenant_id: &str,
        shard_id: &str,
        timeline_id: &str,
        new_threshold: Duration,
    ) {
        if new_threshold == self.threshold {
            return;
        }
        let mut with_new = EvictionsWithLowResidenceDurationBuilder::new(
            self.data_source,
            new_threshold,
        )
        .build(tenant_id, shard_id, timeline_id);
        // After the swap `self` holds the new series and `with_new` holds the old one.
        std::mem::swap(self, &mut with_new);
        with_new.remove(tenant_id, shard_id, timeline_id);
    }

    // This could be a `Drop` impl, but, we need the `tenant_id`, `shard_id` and `timeline_id`.
    fn remove(&mut self, tenant_id: &str, shard_id: &str, timeline_id: &str) {
        let Some(_counter) = self.counter.take() else {
            return;
        };

        let threshold = Self::threshold_label_value(self.threshold);

        let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
            tenant_id,
            shard_id,
            timeline_id,
            self.data_source,
            &threshold,
        ]);

        match removed {
            Err(e) => {
                // this has been hit in staging, but we don't know how.
                // because we can be in the drop path already, don't risk:
                // - "double-panic => illegal instruction" or
                // - future "drop panic => abort"
                //
                // so just nag: (the error has the labels)
                tracing::warn!(
                    "failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}"
                );
            }
            Ok(()) => {
                // to help identify cases where we double-remove the same values, let's log all
                // deletions? Log every label value of the removed series, including the shard.
                tracing::info!(
                    "removed EvictionsWithLowResidenceDuration with {tenant_id}, {shard_id}, {timeline_id}, {}, {threshold}",
                    self.data_source
                );
            }
        }
    }
}

// Metrics collected on disk IO operations
//
// Roughly logarithmic scale.
const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
    0.00005,  // 50us
    0.00006,  // 60us
    0.00007,  // 70us
    0.00008,  // 80us
    0.00009,  // 90us
    0.0001,   // 100us
    0.000110, // 110us
    0.000120, // 120us
    0.000130, // 130us
    0.000140, // 140us
    0.000150, // 150us
    0.000160, // 160us
    0.000170, // 170us
    0.000180, // 180us
    0.000190, // 190us
    0.000200, // 200us
    0.000210, // 210us
    0.000220, // 220us
    0.000230, // 230us
    0.000240, // 240us
    0.000250, // 250us
    0.000300, // 300us
    0.000350, // 350us
    0.000400, // 400us
    0.000450, // 450us
    0.000500, // 500us
    0.000600, // 600us
    0.000700, // 700us
    0.000800, // 800us
    0.000900, // 900us
    0.001000, // 1ms
    0.002000, // 2ms
    0.003000, // 3ms
    0.004000, // 4ms
    0.005000, // 5ms
    0.01000,  // 10ms
    0.02000,  // 20ms
    0.05000,  // 50ms
];

/// VirtualFile fs operation variants.
///
/// Operations:
/// - open ([`std::fs::OpenOptions::open`])
/// - close (dropping [`crate::virtual_file::VirtualFile`])
/// - close-by-replace (close by replacement algorithm)
/// - read (`read_at`)
/// - write (`write_at`)
/// - seek (modify internal position or file length query)
/// - fsync ([`std::fs::File::sync_all`])
/// - metadata ([`std::fs::File::metadata`])
#[derive(
    Debug, Clone, Copy, strum_macros::EnumCount, strum_macros::EnumIter, strum_macros::FromRepr,
)]
pub(crate) enum StorageIoOperation {
    Open,
    OpenAfterReplace,
    Close,
    CloseByReplace,
    Read,
    Write,
    Seek,
    Fsync,
    Metadata,
    SetLen,
}

impl StorageIoOperation {
    /// Returns the fixed string associated with each variant.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Open => "open",
            Self::OpenAfterReplace => "open-after-replace",
            Self::Close => "close",
            Self::CloseByReplace => "close-by-replace",
            Self::Read => "read",
            Self::Write => "write",
            Self::Seek => "seek",
            Self::Fsync => "fsync",
            Self::Metadata => "metadata",
            Self::SetLen => "set_len",
        }
    }
}

/// Tracks time taken by fs operations near VirtualFile.
#[derive(Debug)]
pub(crate) struct StorageIoTime {
    // One histogram per `StorageIoOperation` variant, indexed by the variant's discriminant.
    metrics: [Histogram; StorageIoOperation::COUNT],
}

impl StorageIoTime {
    /// Registers the `pageserver_io_operations_seconds` histogram vec and resolves one child
    /// per operation up front. Panics if registration or any label lookup fails.
    fn new() -> Self {
        let storage_io_histogram_vec = register_histogram_vec!(
            "pageserver_io_operations_seconds",
            "Time spent in IO operations",
            &["operation"],
            STORAGE_IO_TIME_BUCKETS.into()
        )
        .expect("failed to define a metric");
        let metrics = std::array::from_fn(|i| {
            let op = StorageIoOperation::from_repr(i).unwrap();
            storage_io_histogram_vec
                .get_metric_with_label_values(&[op.as_str()])
                .unwrap()
        });
        Self { metrics }
    }

    /// Returns the pre-resolved histogram for `op`.
    pub(crate) fn get(&self, op: StorageIoOperation) -> &Histogram {
        &self.metrics[op as usize]
    }
}

pub(crate) static STORAGE_IO_TIME_METRIC: Lazy = Lazy::new(StorageIoTime::new);

#[derive(Clone, Copy)]
#[repr(usize)]
pub(crate) enum StorageIoSizeOperation {
    Read,
    Write,
}

impl StorageIoSizeOperation {
    // Must stay in the same order as the enum variants: `as_str` indexes it by discriminant.
    pub(crate) const VARIANTS: &'static [&'static str] = &["read", "write"];

    fn as_str(&self) -> &'static str {
        Self::VARIANTS[*self as usize]
    }
}

// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
pub(crate) static STORAGE_IO_SIZE: Lazy = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_io_operations_bytes_total",
        "Total amount of bytes read/written in IO operations",
        &["operation", "tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

/// The `read` and `write` children of [`STORAGE_IO_SIZE`] for one timeline.
#[derive(Clone, Debug)]
pub(crate) struct StorageIoSizeMetrics {
    pub read: UIntGauge,
    pub write: UIntGauge,
}

impl StorageIoSizeMetrics {
    /// Resolves both children; panics if a label lookup fails.
    pub(crate) fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self {
        let read = STORAGE_IO_SIZE
            .get_metric_with_label_values(&[
                StorageIoSizeOperation::Read.as_str(),
                tenant_id,
                shard_id,
                timeline_id,
            ])
            .unwrap();
        let write = STORAGE_IO_SIZE
            .get_metric_with_label_values(&[
                StorageIoSizeOperation::Write.as_str(),
                tenant_id,
                shard_id,
                timeline_id,
            ])
            .unwrap();
        Self { read, write }
    }
}

#[cfg(not(test))]
pub(crate) mod virtual_file_descriptor_cache {
    use super::*;

    pub(crate) static SIZE_MAX: Lazy = Lazy::new(|| {
        register_uint_gauge!(
            "pageserver_virtual_file_descriptor_cache_size_max",
            "Maximum number of open file descriptors in the cache."
        )
        .unwrap()
    });

    // SIZE_CURRENT: derive it like so:
    // ```
    // sum(pageserver_io_operations_seconds_count{operation=~"^(open|open-after-replace)$"})
    // - ignoring(operation)
    // sum(pageserver_io_operations_seconds_count{operation=~"^(close|close-by-replace)$"})
    // ```
}

#[cfg(not(test))]
pub(crate) mod virtual_file_io_engine {
    use super::*;

    pub(crate) static KIND: Lazy = Lazy::new(|| {
        register_uint_gauge_vec!(
            "pageserver_virtual_file_io_engine_kind",
            "The configured io engine for VirtualFile",
            &["kind"],
        )
        .unwrap()
    });
}

/// Per-request timer. The inner value is `Some` until `observe_execution_end` takes it;
/// afterwards every `observe_*` method returns early.
pub(crate) struct SmgrOpTimer(Option);

pub(crate) struct SmgrOpTimerInner {
    global_execution_latency_histo: Histogram,
    per_timeline_execution_latency_histo: Option,

    global_batch_wait_time: Histogram,
    per_timeline_batch_wait_time: Histogram,

    global_flush_in_progress_micros: IntCounter,
    per_timeline_flush_in_progress_micros: IntCounter,

    throttling: Arc,

    timings: SmgrOpTimerState,
}

/// The stages of request processing are represented by the enum variants.
/// Used as part of [`SmgrOpTimerInner::timings`].
///
/// Request processing calls into the `SmgrOpTimer::observe_*` methods at the
/// transition points.
/// These methods bump relevant counters and then update [`SmgrOpTimerInner::timings`]
/// to the next state.
///
/// Each request goes through every stage, in all configurations.
///
#[derive(Debug)]
enum SmgrOpTimerState {
    Received {
        // In the future, we may want to track the full time the request spent
        // inside pageserver process (time spent in kernel buffers can't be tracked).
        // `received_at` would be used for that.
        #[allow(dead_code)]
        received_at: Instant,
    },
    Throttling {
        throttle_started_at: Instant,
    },
    Batching {
        throttle_done_at: Instant,
    },
    Executing {
        execution_started_at: Instant,
    },
    Flushing,
    // NB: when adding observation points, remember to update the Drop impl.
}

// NB: when adding observation points, remember to update the Drop impl.
impl SmgrOpTimer {
    /// See [`SmgrOpTimerState`] for more context.
    ///
    /// Transition `Received` -> `Throttling`. No-op in any other state.
    pub(crate) fn observe_throttle_start(&mut self, at: Instant) {
        let Some(inner) = self.0.as_mut() else {
            return;
        };
        let SmgrOpTimerState::Received { received_at: _ } = &mut inner.timings else {
            return;
        };
        inner.throttling.count_accounted_start.inc();
        inner.timings = SmgrOpTimerState::Throttling {
            throttle_started_at: at,
        };
    }

    /// See [`SmgrOpTimerState`] for more context.
    ///
    /// Transition `Throttling` -> `Batching`. No-op in any other state.
    /// The throttled count and wait time are only updated for `ThrottleResult::Throttled`.
    pub(crate) fn observe_throttle_done(&mut self, throttle: ThrottleResult) {
        let Some(inner) = self.0.as_mut() else {
            return;
        };
        let SmgrOpTimerState::Throttling {
            throttle_started_at,
        } = &inner.timings
        else {
            return;
        };
        inner.throttling.count_accounted_finish.inc();
        match throttle {
            ThrottleResult::NotThrottled { end } => {
                inner.timings = SmgrOpTimerState::Batching {
                    throttle_done_at: end,
                };
            }
            ThrottleResult::Throttled { end } => {
                // update metrics
                inner.throttling.count_throttled.inc();
                inner
                    .throttling
                    .wait_time
                    .inc_by((end - *throttle_started_at).as_micros().try_into().unwrap());
                // state transition
                inner.timings = SmgrOpTimerState::Batching {
                    throttle_done_at: end,
                };
            }
        }
    }

    /// See [`SmgrOpTimerState`] for more context.
    ///
    /// Transition `Batching` -> `Executing`. No-op in any other state.
    /// Observes `at - throttle_done_at` as the batch wait time, globally and per timeline.
    pub(crate) fn observe_execution_start(&mut self, at: Instant) {
        let Some(inner) = self.0.as_mut() else {
            return;
        };
        let SmgrOpTimerState::Batching { throttle_done_at } = &inner.timings else {
            return;
        };
        // update metrics
        let batch = at - *throttle_done_at;
        inner.global_batch_wait_time.observe(batch.as_secs_f64());
        inner
            .per_timeline_batch_wait_time
            .observe(batch.as_secs_f64());
        // state transition
        inner.timings = SmgrOpTimerState::Executing {
            execution_started_at: at,
        }
    }

    /// For all but the first caller, this is a no-op.
    /// The first caller receives Some, subsequent ones None.
    ///
    /// See [`SmgrOpTimerState`] for more context.
pub(crate) fn observe_execution_end(&mut self, at: Instant) -> Option {
        // NB: unlike the other observe_* methods, this one take()s.
        // The timer is therefore emptied even when the state check below fails.
        #[allow(clippy::question_mark)] // maintain similar code pattern.
        let Some(mut inner) = self.0.take() else {
            return None;
        };
        let SmgrOpTimerState::Executing {
            execution_started_at,
        } = &inner.timings
        else {
            return None;
        };
        // update metrics
        let execution = at - *execution_started_at;
        inner
            .global_execution_latency_histo
            .observe(execution.as_secs_f64());
        if let Some(per_timeline_execution_latency_histo) =
            &inner.per_timeline_execution_latency_histo
        {
            per_timeline_execution_latency_histo.observe(execution.as_secs_f64());
        }

        // state transition
        inner.timings = SmgrOpTimerState::Flushing;

        // return the flush in progress object which
        // will do the remaining metrics updates
        let SmgrOpTimerInner {
            global_flush_in_progress_micros,
            per_timeline_flush_in_progress_micros,
            ..
        } = inner;
        Some(SmgrOpFlushInProgress {
            global_micros: global_flush_in_progress_micros,
            per_timeline_micros: per_timeline_flush_in_progress_micros,
        })
    }
}

/// The last stage of request processing is serializing and flushing the request
/// into the TCP connection. We want to make slow flushes observable
/// _while they are occurring_, so this struct provides a wrapper method [`Self::measure`]
/// to periodically bump the metric.
///
/// If in the future we decide that we're not interested in live updates, we can
/// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there,
/// and remove this struct from the code base.
pub(crate) struct SmgrOpFlushInProgress {
    global_micros: IntCounter,
    per_timeline_micros: IntCounter,
}

impl Drop for SmgrOpTimer {
    fn drop(&mut self) {
        // In case of early drop, update any of the remaining metrics with
        // observations so that (started,finished) counter pairs balance out
        // and all counters on the latency path have the same number of
        // observations.
        // It's technically lying and it would be better if each metric had
        // a separate label or similar for cancelled requests.
        // But we don't have that right now and counter pairs balancing
        // out is useful when using the metrics in panels and whatnot.
        //
        // Each observe_* call below only acts if the timer is in the matching state,
        // so this replays exactly the transitions that have not happened yet.
        let now = Instant::now();
        self.observe_throttle_start(now);
        self.observe_throttle_done(ThrottleResult::NotThrottled { end: now });
        self.observe_execution_start(now);
        let maybe_flush_timer = self.observe_execution_end(now);
        drop(maybe_flush_timer);
    }
}

impl SmgrOpFlushInProgress {
    /// Drives `fut` to completion and returns its output, adding the elapsed microseconds to
    /// both counters every 10 seconds while it is pending and once more when this function's
    /// scope is left.
    ///
    /// A log line is emitted on every 10 second timeout, and once more on exit if at least
    /// one timeout was hit.
    ///
    /// The caller must guarantee that `socket_fd` outlives this function.
    pub(crate) async fn measure(self, started_at: Instant, fut: Fut, socket_fd: RawFd) -> O
    where
        Fut: std::future::Future,
    {
        let mut fut = std::pin::pin!(fut);

        let mut logged = false;
        let mut last_counter_increment_at = started_at;
        // The guarded value is the observer closure; the second closure is the guard's drop
        // handler and performs the final `observe(false)`.
        let mut observe_guard = scopeguard::guard(
            |is_timeout| {
                let now = Instant::now();

                // Increment counter
                {
                    let elapsed_since_last_observe = now - last_counter_increment_at;
                    self.global_micros
                        .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
                    self.per_timeline_micros
                        .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
                    last_counter_increment_at = now;
                }

                // Log something on every timeout, and on completion but only if we hit a timeout.
                if is_timeout || logged {
                    logged = true;
                    let elapsed_total = now - started_at;
                    let msg = if is_timeout {
                        "slow flush ongoing"
                    } else {
                        "slow flush completed or cancelled"
                    };

                    // -2 is reported when the ioctl fails, -1 on non-Linux targets.
                    let (inq, outq) = {
                        // SAFETY: caller guarantees that `socket_fd` outlives this function.
                        #[cfg(target_os = "linux")]
                        unsafe {
                            (
                                utils::linux_socket_ioctl::inq(socket_fd).unwrap_or(-2),
                                utils::linux_socket_ioctl::outq(socket_fd).unwrap_or(-2),
                            )
                        }
                        #[cfg(not(target_os = "linux"))]
                        {
                            _ = socket_fd; // appease unused lint on macOS
                            (-1, -1)
                        }
                    };

                    let elapsed_total_secs = format!("{:.6}", elapsed_total.as_secs_f64());
                    tracing::info!(elapsed_total_secs, inq, outq, msg);
                }
            },
            |mut observe| {
                observe(false);
            },
        );

        loop {
            match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
                Ok(v) => return v,
                Err(_timeout) => {
                    (*observe_guard)(true);
                }
            }
        }
    }
}

/// Kinds of smgr queries; the snake_case variant name is used as the `smgr_query_type` label.
#[derive(
    Debug,
    Clone,
    Copy,
    IntoStaticStr,
    strum_macros::EnumCount,
    strum_macros::EnumIter,
    strum_macros::FromRepr,
    enum_map::Enum,
)]
#[strum(serialize_all = "snake_case")]
pub enum SmgrQueryType {
    GetRelExists,
    GetRelSize,
    GetPageAtLsn,
    GetDbSize,
    GetSlruSegment,
    #[cfg(feature = "testing")]
    Test,
}

/// Why a batch of get page requests was broken; the snake_case variant name is used as the
/// `reason` label.
#[derive(
    Debug,
    Clone,
    Copy,
    IntoStaticStr,
    strum_macros::EnumCount,
    strum_macros::EnumIter,
    strum_macros::FromRepr,
    enum_map::Enum,
)]
#[strum(serialize_all = "snake_case")]
pub enum GetPageBatchBreakReason {
    BatchFull,
    NonBatchableRequest,
    NonUniformLsn,
    SamePageAtDifferentLsn,
    NonUniformTimeline,
    ExecutorSteal,
    #[cfg(feature = "testing")]
    NonUniformKey,
}

/// Pre-resolved global and per-timeline metric handles used on the smgr query path.
pub(crate) struct SmgrQueryTimePerTimeline {
    global_started: [IntCounter; SmgrQueryType::COUNT],
    global_latency: [Histogram; SmgrQueryType::COUNT],
    per_timeline_getpage_started: IntCounter,
    per_timeline_getpage_latency: Histogram,
    global_batch_size: Histogram,
    per_timeline_batch_size: Histogram,
    global_flush_in_progress_micros: IntCounter,
    per_timeline_flush_in_progress_micros: IntCounter,
    global_batch_wait_time: Histogram,
    per_timeline_batch_wait_time: Histogram,
    global_batch_break_reason: [IntCounter; GetPageBatchBreakReason::COUNT],
    per_timeline_batch_break_reason: GetPageBatchBreakReasonTimelineMetrics,
    throttling: Arc,
}

static SMGR_QUERY_STARTED_GLOBAL: Lazy = Lazy::new(|| {
    register_int_counter_vec!(
        // it's a counter, but, name is prepared to extend it to a histogram of queue depth
        "pageserver_smgr_query_started_global_count",
        "Number of smgr queries started, aggregated by query type.",
        &["smgr_query_type"],
    )
    .expect("failed to define a metric")
});

static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| {
    register_int_counter_vec!(
        // it's a counter, but, name is prepared to extend it to a histogram of queue depth
        "pageserver_smgr_query_started_count",
        "Number of smgr queries started, aggregated by query type and tenant/timeline.",
        &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
    )
    .expect("failed to define a metric")
});

/// Per-timeline smgr histogram buckets should be the same as the compute buckets, such that the
/// metrics are comparable across compute and Pageserver.
static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] =
    &[0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.1, 1.0, 3.0];

static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds",
        "Time spent _executing_ smgr query handling, excluding batch and throttle delays.",
        &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
        SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
    )
    .expect("failed to define a metric")
});

// Bucket bounds are written in microseconds and converted to fractional seconds.
static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy> = Lazy::new(|| {
    [
        1,
        10,
        20,
        40,
        60,
        80,
        100,
        200,
        300,
        400,
        500,
        600,
        700,
        800,
        900,
        1_000, // 1ms
        2_000,
        4_000,
        6_000,
        8_000,
        10_000, // 10ms
        20_000,
        40_000,
        60_000,
        80_000,
        100_000,
        200_000,
        400_000,
        600_000,
        800_000,
        1_000_000, // 1s
        2_000_000,
        4_000_000,
        6_000_000,
        8_000_000,
        10_000_000, // 10s
        20_000_000,
        50_000_000,
        100_000_000,
        200_000_000,
        1_000_000_000, // 1000s
    ]
    .into_iter()
    .map(Duration::from_micros)
    .map(|d| d.as_secs_f64())
    .collect()
});

static SMGR_QUERY_TIME_GLOBAL: Lazy = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds_global",
        "Like pageserver_smgr_query_seconds, but aggregated to instance level.",
        &["smgr_query_type"],
        SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(),
    )
    .expect("failed to define a metric")
});

// One bucket per integer batch size from 1 up to `DEFAULT_MAX_GET_VECTORED_KEYS`.
static PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL: Lazy> = Lazy::new(|| {
    (1..=u32::try_from(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap())
        .map(|v| v.into())
        .collect()
});

static PAGE_SERVICE_BATCH_SIZE_GLOBAL: Lazy = Lazy::new(|| {
    register_histogram!(
        "pageserver_page_service_batch_size_global",
        "Batch size of pageserver page service requests",
        PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL.clone(),
    )
    .expect("failed to define a metric")
});

// Powers of two (1, 2, 4, ...) that do not exceed `DEFAULT_MAX_GET_VECTORED_KEYS`.
static PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE: Lazy> = Lazy::new(|| {
    let mut buckets = Vec::new();
    for i in 0.. {
        let bucket = 1 << i;
        if bucket > u32::try_from(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap() {
            break;
        }
        buckets.push(bucket.into());
    }
    buckets
});

static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_page_service_batch_size",
        "Batch size of pageserver page service requests",
        &["tenant_id", "shard_id", "timeline_id"],
        PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE.clone()
    )
    .expect("failed to define a metric")
});

static PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL: Lazy = Lazy::new(|| {
    register_int_counter_vec!(
        // it's a counter, but, name is prepared to extend it to a histogram of queue depth
        // NOTE(review): this remark is identical to the one on the smgr "started" counters and
        // looks copy-pasted; confirm whether it applies to this metric.
        "pageserver_page_service_batch_break_reason_global",
        "Reason for breaking batches of get page requests",
        &["reason"],
    )
    .expect("failed to define a metric")
});

/// One per-timeline counter per [`GetPageBatchBreakReason`], resolved at construction time.
struct GetPageBatchBreakReasonTimelineMetrics {
    map: EnumMap,
}

impl GetPageBatchBreakReasonTimelineMetrics {
    fn new(tenant_id: &str, shard_slug: &str, timeline_id: &str) -> Self {
        GetPageBatchBreakReasonTimelineMetrics {
            map: EnumMap::from_array(std::array::from_fn(|reason_idx| {
                let reason = GetPageBatchBreakReason::from_usize(reason_idx);
                PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.with_label_values(&[
                    tenant_id,
                    shard_slug,
                    timeline_id,
                    reason.into(),
                ])
            })),
        }
    }

    /// Increments the counter for `reason`.
    fn inc(&self, reason: GetPageBatchBreakReason) {
        self.map[reason].inc()
    }
}
static PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_page_service_batch_break_reason", "Reason for breaking batches of get page requests", &["tenant_id", "shard_id", "timeline_id", "reason"], ) .expect("failed to define a metric") }); pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_page_service_config_max_batch_size", "Configured maximum batch size for the server-side batching functionality of page_service. \ Labels expose more of the configuration parameters.", &["mode", "execution", "batching"] ) .expect("failed to define a metric") }); fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) { PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset(); let (label_values, value) = match conf { PageServicePipeliningConfig::Serial => (["serial", "-", "-"], 1), PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { max_batch_size, execution, batching, }) => { let mode = "pipelined"; let execution = match execution { PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures => { "concurrent-futures" } PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks", }; let batching = match batching { PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => "uniform-lsn", PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => "scattered-lsn", }; ([mode, execution, batching], max_batch_size.get()) } }; PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE .with_label_values(&label_values) .set(value.try_into().unwrap()); } static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_page_service_pagestream_flush_in_progress_micros", "Counter that sums up the microseconds that a pagestream response was being flushed into the TCP connection. 
\ If the flush is particularly slow, this counter will be updated periodically to make slow flushes \ easily discoverable in monitoring. \ Hence, this is NOT a completion latency historgram.", &["tenant_id", "shard_id", "timeline_id"], ) .expect("failed to define a metric") }); static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_page_service_pagestream_flush_in_progress_micros_global", "Like pageserver_page_service_pagestream_flush_in_progress_seconds, but instance-wide.", ) .expect("failed to define a metric") }); static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_page_service_pagestream_batch_wait_time_seconds", "Time a request spent waiting in its batch until the batch moved to throttle&execution.", &["tenant_id", "shard_id", "timeline_id"], SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(), ) .expect("failed to define a metric") }); static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy = Lazy::new(|| { register_histogram!( "pageserver_page_service_pagestream_batch_wait_time_seconds_global", "Like pageserver_page_service_pagestream_batch_wait_time_seconds, but aggregated to instance level.", SMGR_QUERY_TIME_GLOBAL_BUCKETS.to_vec(), ) .expect("failed to define a metric") }); impl SmgrQueryTimePerTimeline { pub(crate) fn new( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, pagestream_throttle_metrics: Arc, ) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_slug = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); let global_started = std::array::from_fn(|i| { let op = SmgrQueryType::from_repr(i).unwrap(); SMGR_QUERY_STARTED_GLOBAL .get_metric_with_label_values(&[op.into()]) .unwrap() }); let global_latency = std::array::from_fn(|i| { let op = SmgrQueryType::from_repr(i).unwrap(); SMGR_QUERY_TIME_GLOBAL .get_metric_with_label_values(&[op.into()]) .unwrap() }); let 
per_timeline_getpage_started = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE .get_metric_with_label_values(&[ SmgrQueryType::GetPageAtLsn.into(), &tenant_id, &shard_slug, &timeline_id, ]) .unwrap(); let per_timeline_getpage_latency = SMGR_QUERY_TIME_PER_TENANT_TIMELINE .get_metric_with_label_values(&[ SmgrQueryType::GetPageAtLsn.into(), &tenant_id, &shard_slug, &timeline_id, ]) .unwrap(); let global_batch_size = PAGE_SERVICE_BATCH_SIZE_GLOBAL.clone(); let per_timeline_batch_size = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) .unwrap(); let global_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL.clone(); let per_timeline_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) .unwrap(); let global_batch_break_reason = std::array::from_fn(|i| { let reason = GetPageBatchBreakReason::from_usize(i); PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL .get_metric_with_label_values(&[reason.into()]) .unwrap() }); let per_timeline_batch_break_reason = GetPageBatchBreakReasonTimelineMetrics::new(&tenant_id, &shard_slug, &timeline_id); let global_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone(); let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) .unwrap(); Self { global_started, global_latency, per_timeline_getpage_latency, per_timeline_getpage_started, global_batch_size, per_timeline_batch_size, global_flush_in_progress_micros, per_timeline_flush_in_progress_micros, global_batch_wait_time, per_timeline_batch_wait_time, global_batch_break_reason, per_timeline_batch_break_reason, throttling: pagestream_throttle_metrics, } } pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer { self.global_started[op as usize].inc(); let per_timeline_latency_histo = if matches!(op, 
SmgrQueryType::GetPageAtLsn) { self.per_timeline_getpage_started.inc(); Some(self.per_timeline_getpage_latency.clone()) } else { None }; SmgrOpTimer(Some(SmgrOpTimerInner { global_execution_latency_histo: self.global_latency[op as usize].clone(), per_timeline_execution_latency_histo: per_timeline_latency_histo, global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(), per_timeline_flush_in_progress_micros: self .per_timeline_flush_in_progress_micros .clone(), global_batch_wait_time: self.global_batch_wait_time.clone(), per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(), throttling: self.throttling.clone(), timings: SmgrOpTimerState::Received { received_at }, })) } /// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer pub(crate) fn observe_getpage_batch_start( &self, batch_size: usize, break_reason: GetPageBatchBreakReason, ) { self.global_batch_size.observe(batch_size as f64); self.per_timeline_batch_size.observe(batch_size as f64); self.global_batch_break_reason[break_reason.into_usize()].inc(); self.per_timeline_batch_break_reason.inc(break_reason); } } // keep in sync with control plane Go code so that we can validate // compute's basebackup_ms metric with our perspective in the context of SLI/SLO. static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| { // Go code uses milliseconds. 
Variable is called `computeStartupBuckets` [ 5, 10, 20, 30, 50, 70, 100, 120, 150, 200, 250, 300, 350, 400, 450, 500, 600, 800, 1000, 1500, 2000, 2500, 3000, 5000, 10000, 20000, 40000, 60000, ] .map(|ms| (ms as f64) / 1000.0) }); pub(crate) struct BasebackupQueryTime { ok: Histogram, error: Histogram, client_error: Histogram, } pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { let vec = register_histogram_vec!( "pageserver_basebackup_query_seconds", "Histogram of basebackup queries durations, by result type", &["result"], COMPUTE_STARTUP_BUCKETS.to_vec(), ) .expect("failed to define a metric"); BasebackupQueryTime { ok: vec.get_metric_with_label_values(&["ok"]).unwrap(), error: vec.get_metric_with_label_values(&["error"]).unwrap(), client_error: vec.get_metric_with_label_values(&["client_error"]).unwrap(), } }); pub(crate) struct BasebackupQueryTimeOngoingRecording<'a> { parent: &'a BasebackupQueryTime, start: std::time::Instant, } impl BasebackupQueryTime { pub(crate) fn start_recording(&self) -> BasebackupQueryTimeOngoingRecording<'_> { let start = Instant::now(); BasebackupQueryTimeOngoingRecording { parent: self, start, } } } impl BasebackupQueryTimeOngoingRecording<'_> { pub(crate) fn observe(self, res: &Result) { let elapsed = self.start.elapsed().as_secs_f64(); // If you want to change categorize of a specific error, also change it in `log_query_error`. let metric = match res { Ok(_) => &self.parent.ok, Err(QueryError::Shutdown) | Err(QueryError::Reconnect) => { // Do not observe ok/err for shutdown/reconnect. // Reconnect error might be raised when the operation is waiting for LSN and the tenant shutdown interrupts // the operation. A reconnect error will be issued and the client will retry. 
return; } Err(QueryError::Disconnected(ConnectionError::Io(io_error))) if is_expected_io_error(io_error) => { &self.parent.client_error } Err(_) => &self.parent.error, }; metric.observe(elapsed); } } pub(crate) static LIVE_CONNECTIONS: Lazy = Lazy::new(|| { register_int_counter_pair_vec!( "pageserver_live_connections_started", "Number of network connections that we started handling", "pageserver_live_connections_finished", "Number of network connections that we finished handling", &["pageserver_connection_kind"] ) .expect("failed to define a metric") }); #[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)] pub(crate) enum ComputeCommandKind { PageStreamV3, PageStreamV2, Basebackup, Fullbackup, LeaseLsn, } pub(crate) struct ComputeCommandCounters { map: EnumMap, } pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy = Lazy::new(|| { let inner = register_int_counter_vec!( "pageserver_compute_commands", "Number of compute -> pageserver commands processed", &["command"] ) .expect("failed to define a metric"); ComputeCommandCounters { map: EnumMap::from_array(std::array::from_fn(|i| { let command = ComputeCommandKind::from_usize(i); let command_str: &'static str = command.into(); inner.with_label_values(&[command_str]) })), } }); impl ComputeCommandCounters { pub(crate) fn for_command(&self, command: ComputeCommandKind) -> &IntCounter { &self.map[command] } } // remote storage metrics static REMOTE_TIMELINE_CLIENT_CALLS: Lazy = Lazy::new(|| { register_int_counter_pair_vec!( "pageserver_remote_timeline_client_calls_started", "Number of started calls to remote timeline client.", "pageserver_remote_timeline_client_calls_finished", "Number of finshed calls to remote timeline client.", &[ "tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind" ], ) .unwrap() }); static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_remote_timeline_client_bytes_started", "Incremented by the number of bytes associated with a 
remote timeline client operation. \ The increment happens when the operation is scheduled.", &["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"], ) .expect("failed to define a metric") }); static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_remote_timeline_client_bytes_finished", "Incremented by the number of bytes associated with a remote timeline client operation. \ The increment happens when the operation finishes (regardless of success/failure/shutdown).", &["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"], ) .expect("failed to define a metric") }); pub(crate) struct TenantManagerMetrics { tenant_slots_attached: UIntGauge, tenant_slots_secondary: UIntGauge, tenant_slots_inprogress: UIntGauge, pub(crate) tenant_slot_writes: IntCounter, pub(crate) unexpected_errors: IntCounter, } impl TenantManagerMetrics { /// Helpers for tracking slots. Note that these do not track the lifetime of TenantSlot objects /// exactly: they track the lifetime of the slots _in the tenant map_. 
pub(crate) fn slot_inserted(&self, slot: &TenantSlot) { match slot { TenantSlot::Attached(_) => { self.tenant_slots_attached.inc(); } TenantSlot::Secondary(_) => { self.tenant_slots_secondary.inc(); } TenantSlot::InProgress(_) => { self.tenant_slots_inprogress.inc(); } } } pub(crate) fn slot_removed(&self, slot: &TenantSlot) { match slot { TenantSlot::Attached(_) => { self.tenant_slots_attached.dec(); } TenantSlot::Secondary(_) => { self.tenant_slots_secondary.dec(); } TenantSlot::InProgress(_) => { self.tenant_slots_inprogress.dec(); } } } #[cfg(all(debug_assertions, not(test)))] pub(crate) fn slots_total(&self) -> u64 { self.tenant_slots_attached.get() + self.tenant_slots_secondary.get() + self.tenant_slots_inprogress.get() } } pub(crate) static TENANT_MANAGER: Lazy = Lazy::new(|| { let tenant_slots = register_uint_gauge_vec!( "pageserver_tenant_manager_slots", "How many slots currently exist, including all attached, secondary and in-progress operations", &["mode"] ) .expect("failed to define a metric"); TenantManagerMetrics { tenant_slots_attached: tenant_slots .get_metric_with_label_values(&["attached"]) .unwrap(), tenant_slots_secondary: tenant_slots .get_metric_with_label_values(&["secondary"]) .unwrap(), tenant_slots_inprogress: tenant_slots .get_metric_with_label_values(&["inprogress"]) .unwrap(), tenant_slot_writes: register_int_counter!( "pageserver_tenant_manager_slot_writes", "Writes to a tenant slot, including all of create/attach/detach/delete" ) .expect("failed to define a metric"), unexpected_errors: register_int_counter!( "pageserver_tenant_manager_unexpected_errors_total", "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug." 
) .expect("failed to define a metric"), } }); pub(crate) struct DeletionQueueMetrics { pub(crate) keys_submitted: IntCounter, pub(crate) keys_dropped: IntCounter, pub(crate) keys_executed: IntCounter, pub(crate) keys_validated: IntCounter, pub(crate) dropped_lsn_updates: IntCounter, pub(crate) unexpected_errors: IntCounter, pub(crate) remote_errors: IntCounterVec, } pub(crate) static DELETION_QUEUE: Lazy = Lazy::new(|| { DeletionQueueMetrics{ keys_submitted: register_int_counter!( "pageserver_deletion_queue_submitted_total", "Number of objects submitted for deletion" ) .expect("failed to define a metric"), keys_dropped: register_int_counter!( "pageserver_deletion_queue_dropped_total", "Number of object deletions dropped due to stale generation." ) .expect("failed to define a metric"), keys_executed: register_int_counter!( "pageserver_deletion_queue_executed_total", "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed to completion" ) .expect("failed to define a metric"), keys_validated: register_int_counter!( "pageserver_deletion_queue_validated_total", "Number of keys validated for deletion. Sum with pageserver_deletion_queue_dropped_total for the total number of keys that have passed through the validation stage." ) .expect("failed to define a metric"), dropped_lsn_updates: register_int_counter!( "pageserver_deletion_queue_dropped_lsn_updates_total", "Updates to remote_consistent_lsn dropped due to stale generation number." ) .expect("failed to define a metric"), unexpected_errors: register_int_counter!( "pageserver_deletion_queue_unexpected_errors_total", "Number of unexpected condiions that may stall the queue: any value above zero is unexpected." 
) .expect("failed to define a metric"), remote_errors: register_int_counter_vec!( "pageserver_deletion_queue_remote_errors_total", "Retryable remote I/O errors while executing deletions, for example 503 responses to DeleteObjects", &["op_kind"], ) .expect("failed to define a metric") } }); pub(crate) struct SecondaryModeMetrics { pub(crate) upload_heatmap: IntCounter, pub(crate) upload_heatmap_errors: IntCounter, pub(crate) upload_heatmap_duration: Histogram, pub(crate) download_heatmap: IntCounter, pub(crate) download_layer: IntCounter, } pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| { SecondaryModeMetrics { upload_heatmap: register_int_counter!( "pageserver_secondary_upload_heatmap", "Number of heatmaps written to remote storage by attached tenants" ) .expect("failed to define a metric"), upload_heatmap_errors: register_int_counter!( "pageserver_secondary_upload_heatmap_errors", "Failures writing heatmap to remote storage" ) .expect("failed to define a metric"), upload_heatmap_duration: register_histogram!( "pageserver_secondary_upload_heatmap_duration", "Time to build and upload a heatmap, including any waiting inside the remote storage client" ) .expect("failed to define a metric"), download_heatmap: register_int_counter!( "pageserver_secondary_download_heatmap", "Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed" ) .expect("failed to define a metric"), download_layer: register_int_counter!( "pageserver_secondary_download_layer", "Number of downloads of layers by secondary mode locations" ) .expect("failed to define a metric"), } }); pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_secondary_resident_physical_size", "The size of the layer files present in the pageserver's filesystem, for secondary locations.", &["tenant_id", "shard_id"] ) .expect("failed to define a metric") }); pub(crate) static NODE_UTILIZATION_SCORE: Lazy = Lazy::new(|| { 
register_uint_gauge!( "pageserver_utilization_score", "The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded", ) .expect("failed to define a metric") }); pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_secondary_heatmap_total_size", "The total size in bytes of all layers in the most recently downloaded heatmap.", &["tenant_id", "shard_id"] ) .expect("failed to define a metric") }); #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { Upload, Download, Delete, } impl RemoteOpKind { pub fn as_str(&self) -> &'static str { match self { Self::Upload => "upload", Self::Download => "download", Self::Delete => "delete", } } } #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] pub enum RemoteOpFileKind { Layer, Index, } impl RemoteOpFileKind { pub fn as_str(&self) -> &'static str { match self { Self::Layer => "layer", Self::Index => "index", } } } pub(crate) static REMOTE_TIMELINE_CLIENT_COMPLETION_LATENCY: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_remote_timeline_client_seconds_global", "Time spent on remote timeline client operations. \ Grouped by task_kind, file_kind, operation_kind and status. \ The task_kind is \ - for layer downloads, populated from RequestContext (primary objective of having the label) \ - for index downloads, set to 'unknown' \ - for any upload operation, set to 'RemoteUploadTask' \ This keeps dimensionality at bay. 
\ Does not account for time spent waiting in remote timeline client's queues.", &["task_kind", "file_kind", "op_kind", "status"] ) .expect("failed to define a metric") }); pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_tenant_task_events", "Number of task start/stop/fail events.", &["event"], ) .expect("Failed to register tenant_task_events metric") }); pub struct BackgroundLoopSemaphoreMetrics { counters: EnumMap, durations: EnumMap, waiting_tasks: EnumMap, running_tasks: EnumMap, } pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy = Lazy::new(|| { let counters = register_int_counter_pair_vec!( "pageserver_background_loop_semaphore_wait_start_count", "Counter for background loop concurrency-limiting semaphore acquire calls started", "pageserver_background_loop_semaphore_wait_finish_count", "Counter for background loop concurrency-limiting semaphore acquire calls finished", &["task"], ) .unwrap(); let durations = register_histogram_vec!( "pageserver_background_loop_semaphore_wait_seconds", "Seconds spent waiting on background loop semaphore acquisition", &["task"], vec![0.01, 1.0, 5.0, 10.0, 30.0, 60.0, 180.0, 300.0, 600.0], ) .unwrap(); let waiting_tasks = register_int_gauge_vec!( "pageserver_background_loop_semaphore_waiting_tasks", "Number of background loop tasks waiting for semaphore", &["task"], ) .unwrap(); let running_tasks = register_int_gauge_vec!( "pageserver_background_loop_semaphore_running_tasks", "Number of background loop tasks running concurrently", &["task"], ) .unwrap(); BackgroundLoopSemaphoreMetrics { counters: EnumMap::from_array(std::array::from_fn(|i| { let kind = BackgroundLoopKind::from_usize(i); counters.with_label_values(&[kind.into()]) })), durations: EnumMap::from_array(std::array::from_fn(|i| { let kind = BackgroundLoopKind::from_usize(i); durations.with_label_values(&[kind.into()]) })), waiting_tasks: EnumMap::from_array(std::array::from_fn(|i| { let kind = 
BackgroundLoopKind::from_usize(i); waiting_tasks.with_label_values(&[kind.into()]) })), running_tasks: EnumMap::from_array(std::array::from_fn(|i| { let kind = BackgroundLoopKind::from_usize(i); running_tasks.with_label_values(&[kind.into()]) })), } }); impl BackgroundLoopSemaphoreMetrics { /// Starts recording semaphore metrics. Call `acquired()` on the returned recorder when the /// semaphore is acquired, and drop it when the task completes or is cancelled. pub(crate) fn record( &self, task: BackgroundLoopKind, ) -> BackgroundLoopSemaphoreMetricsRecorder { BackgroundLoopSemaphoreMetricsRecorder::start(self, task) } } /// Records metrics for a background task. pub struct BackgroundLoopSemaphoreMetricsRecorder<'a> { metrics: &'a BackgroundLoopSemaphoreMetrics, task: BackgroundLoopKind, start: Instant, wait_counter_guard: Option, } impl<'a> BackgroundLoopSemaphoreMetricsRecorder<'a> { /// Starts recording semaphore metrics, by recording wait time and incrementing /// `wait_start_count` and `waiting_tasks`. fn start(metrics: &'a BackgroundLoopSemaphoreMetrics, task: BackgroundLoopKind) -> Self { metrics.waiting_tasks[task].inc(); Self { metrics, task, start: Instant::now(), wait_counter_guard: Some(metrics.counters[task].guard()), } } /// Signals that the semaphore has been acquired, and updates relevant metrics. pub fn acquired(&mut self) -> Duration { let waited = self.start.elapsed(); self.wait_counter_guard.take().expect("already acquired"); self.metrics.durations[self.task].observe(waited.as_secs_f64()); self.metrics.waiting_tasks[self.task].dec(); self.metrics.running_tasks[self.task].inc(); waited } } impl Drop for BackgroundLoopSemaphoreMetricsRecorder<'_> { /// The task either completed or was cancelled. fn drop(&mut self) { if self.wait_counter_guard.take().is_some() { // Waiting. self.metrics.durations[self.task].observe(self.start.elapsed().as_secs_f64()); self.metrics.waiting_tasks[self.task].dec(); } else { // Running. 
self.metrics.running_tasks[self.task].dec(); } } } pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_background_loop_period_overrun_count", "Incremented whenever warn_when_period_overrun() logs a warning.", &["task", "period"], ) .expect("failed to define a metric") }); // walreceiver metrics pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_walreceiver_started_connections_total", "Number of started walreceiver connections" ) .expect("failed to define a metric") }); pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy = Lazy::new(|| { register_int_gauge!( "pageserver_walreceiver_active_managers", "Number of active walreceiver managers" ) .expect("failed to define a metric") }); pub(crate) static WALRECEIVER_SWITCHES: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_walreceiver_switches_total", "Number of walreceiver manager change_connection calls", &["reason"] ) .expect("failed to define a metric") }); pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_walreceiver_broker_updates_total", "Number of received broker updates in walreceiver" ) .expect("failed to define a metric") }); pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_walreceiver_candidates_events_total", "Number of walreceiver candidate events", &["event"] ) .expect("failed to define a metric") }); pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy = Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"])); pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy = Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"])); pub(crate) static LOCAL_DATA_LOSS_SUSPECTED: Lazy = Lazy::new(|| { register_int_gauge!( "pageserver_local_data_loss_suspected", "Non-zero value indicates that pageserver local data loss is suspected (and 
highly likely)." ) .expect("failed to define a metric") }); // Counter keeping track of misrouted PageStream requests. Spelling out PageStream requests here to distinguish // it from other types of reqeusts (SK wal replication, http requests, etc.). PageStream requests are used by // Postgres compute to fetch data from pageservers. // A misrouted PageStream request is registered if the pageserver cannot find the tenant identified in the // request, or if the pageserver is not the "primary" serving the tenant shard. These error almost always identify // issues with compute configuration, caused by either the compute node itself being stuck in the wrong // configuration or Storage Controller reconciliation bugs. Misrouted requests are expected during tenant migration // and/or during recovery following a pageserver failure, but persistently high rates of misrouted requests // are indicative of bugs (and unavailability). pub(crate) static MISROUTED_PAGESTREAM_REQUESTS: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_misrouted_pagestream_requests_total", "Number of pageserver pagestream requests that were routed to the wrong pageserver" ) .expect("failed to define a metric") }); // Global counter for PageStream request results by outcome. Outcomes are divided into 3 categories: // - success // - internal_error: errors that indicate bugs in the storage cluster (e.g. page reconstruction errors, misrouted requests, LSN timeout errors) // - other_error: transient error conditions that are expected in normal operation or indicate bugs with other parts of the system (e.g. error due to pageserver shutdown, malformed requests etc.) 
pub(crate) static PAGESTREAM_HANDLER_RESULTS_TOTAL: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_pagestream_handler_results_total", "Number of pageserver pagestream handler results by outcome (success, internal_error, other_error)", &["outcome"] ) .expect("failed to define a metric") }); // Constants for pageserver_pagestream_handler_results_total's outcome labels pub(crate) const PAGESTREAM_HANDLER_OUTCOME_SUCCESS: &str = "success"; pub(crate) const PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR: &str = "internal_error"; pub(crate) const PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR: &str = "other_error"; // Metrics collected on WAL redo operations // // We collect the time spent in actual WAL redo ('redo'), and time waiting // for access to the postgres process ('wait') since there is only one for // each tenant. /// Time buckets are small because we want to be able to measure the /// smallest redo processing times. These buckets allow us to measure down /// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec. /// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec. /// /// Values up to 1s are recorded because metrics show that we have redo /// durations and lock times larger than 0.250s. macro_rules! redo_histogram_time_buckets { () => { vec![ 0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000, 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, 0.500_000, 1.000_000, ] }; } /// While we're at it, also measure the amount of records replayed in each /// operation. We have a global 'total replayed' counter, but that's not /// as useful as 'what is the skew for how many records we replay in one /// operation'. macro_rules! redo_histogram_count_buckets { () => { vec![0.0, 1.0, 2.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0] }; } macro_rules! 
redo_bytes_histogram_count_buckets { () => { // powers of (2^.5), from 2^4.5 to 2^15 (22 buckets) // rounded up to the next multiple of 8 to capture any MAXALIGNed record of that size, too. vec![ 24.0, 32.0, 48.0, 64.0, 96.0, 128.0, 184.0, 256.0, 368.0, 512.0, 728.0, 1024.0, 1456.0, 2048.0, 2904.0, 4096.0, 5800.0, 8192.0, 11592.0, 16384.0, 23176.0, 32768.0, ] }; } pub(crate) struct WalIngestMetrics { pub(crate) bytes_received: IntCounter, pub(crate) records_received: IntCounter, pub(crate) records_observed: IntCounter, pub(crate) records_committed: IntCounter, pub(crate) values_committed_metadata_images: IntCounter, pub(crate) values_committed_metadata_deltas: IntCounter, pub(crate) values_committed_data_images: IntCounter, pub(crate) values_committed_data_deltas: IntCounter, pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter, } impl WalIngestMetrics { pub(crate) fn inc_values_committed(&self, stats: &DatadirModificationStats) { if stats.metadata_images > 0 { self.values_committed_metadata_images .inc_by(stats.metadata_images); } if stats.metadata_deltas > 0 { self.values_committed_metadata_deltas .inc_by(stats.metadata_deltas); } if stats.data_images > 0 { self.values_committed_data_images.inc_by(stats.data_images); } if stats.data_deltas > 0 { self.values_committed_data_deltas.inc_by(stats.data_deltas); } } } pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { let values_committed = register_int_counter_vec!( "pageserver_wal_ingest_values_committed", "Number of values committed to pageserver storage from WAL records", &["class", "kind"], ) .expect("failed to define a metric"); WalIngestMetrics { bytes_received: register_int_counter!( "pageserver_wal_ingest_bytes_received", "Bytes of WAL ingested from safekeepers", ) .unwrap(), records_received: register_int_counter!( "pageserver_wal_ingest_records_received", "Number of WAL records received from safekeepers" ) .expect("failed to define a metric"), records_observed: register_int_counter!( 
"pageserver_wal_ingest_records_observed", "Number of WAL records observed from safekeepers. These are metadata only records for shard 0." ) .expect("failed to define a metric"), records_committed: register_int_counter!( "pageserver_wal_ingest_records_committed", "Number of WAL records which resulted in writes to pageserver storage" ) .expect("failed to define a metric"), values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]), values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]), values_committed_data_images: values_committed.with_label_values(&["data", "image"]), values_committed_data_deltas: values_committed.with_label_values(&["data", "delta"]), gap_blocks_zeroed_on_rel_extend: register_int_counter!( "pageserver_gap_blocks_zeroed_on_rel_extend", "Total number of zero gap blocks written on relation extends" ) .expect("failed to define a metric"), } }); pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_timeline_wal_records_received", "Number of WAL records received per shard", &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") }); pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_seconds", "Time spent on WAL redo", redo_histogram_time_buckets!() ) .expect("failed to define a metric") }); pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_records_histogram", "Histogram of number of records replayed per redo in the Postgres WAL redo process", redo_histogram_count_buckets!(), ) .expect("failed to define a metric") }); pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_bytes_histogram", "Histogram of number of records replayed per redo sent to Postgres", redo_bytes_histogram_count_buckets!(), ) .expect("failed to define a metric") 
}); // FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count? pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_replayed_wal_records_total", "Number of WAL records replayed in WAL redo process" ) .unwrap() }); #[rustfmt::skip] pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_process_launch_duration", "Histogram of the duration of successful WalRedoProcess::launch calls", vec![ 0.0002, 0.0004, 0.0006, 0.0008, 0.0010, 0.0020, 0.0040, 0.0060, 0.0080, 0.0100, 0.0200, 0.0400, 0.0600, 0.0800, 0.1000, 0.2000, 0.4000, 0.6000, 0.8000, 1.0000, 1.5000, 2.0000, 2.5000, 3.0000, 4.0000, 10.0000 ], ) .expect("failed to define a metric") }); pub(crate) struct WalRedoProcessCounters { pub(crate) started: IntCounter, pub(crate) killed_by_cause: EnumMap, pub(crate) active_stderr_logger_tasks_started: IntCounter, pub(crate) active_stderr_logger_tasks_finished: IntCounter, } #[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)] pub(crate) enum WalRedoKillCause { WalRedoProcessDrop, NoLeakChildDrop, Startup, } impl Default for WalRedoProcessCounters { fn default() -> Self { let started = register_int_counter!( "pageserver_wal_redo_process_started_total", "Number of WAL redo processes started", ) .unwrap(); let killed = register_int_counter_vec!( "pageserver_wal_redo_process_stopped_total", "Number of WAL redo processes stopped", &["cause"], ) .unwrap(); let active_stderr_logger_tasks_started = register_int_counter!( "pageserver_walredo_stderr_logger_tasks_started_total", "Number of active walredo stderr logger tasks that have started", ) .unwrap(); let active_stderr_logger_tasks_finished = register_int_counter!( "pageserver_walredo_stderr_logger_tasks_finished_total", "Number of active walredo stderr logger tasks that have finished", ) .unwrap(); Self { started, killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| { 
let cause = WalRedoKillCause::from_usize(i); let cause_str: &'static str = cause.into(); killed.with_label_values(&[cause_str]) })), active_stderr_logger_tasks_started, active_stderr_logger_tasks_finished, } } } pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = Lazy::new(WalRedoProcessCounters::default); /// Similar to `prometheus::HistogramTimer` but does not record on drop. pub(crate) struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, start: Instant, stopped: Cell, } impl StorageTimeMetricsTimer { fn new(metrics: StorageTimeMetrics) -> Self { /*BEGIN_HADRON */ // record the active operation as the timer starts metrics.timeline_active_count.inc(); /*END_HADRON */ Self { metrics, start: Instant::now(), stopped: Cell::new(false), } } /// Returns the elapsed duration of the timer. pub fn elapsed(&self) -> Duration { self.start.elapsed() } /// Record the time from creation to now and return it. pub fn stop_and_record(self) -> Duration { let duration = self.elapsed(); let seconds = duration.as_secs_f64(); self.metrics.timeline_sum.inc_by(seconds); self.metrics.timeline_count.inc(); self.metrics.global_histogram.observe(seconds); /* BEGIN_HADRON*/ self.stopped.set(true); self.metrics.timeline_active_count.dec(); /*END_HADRON */ duration } /// Turns this timer into a timer, which will always record -- usually this means recording /// regardless an early `?` path was taken in a function. 
pub(crate) fn record_on_drop(self) -> AlwaysRecordingStorageTimeMetricsTimer { AlwaysRecordingStorageTimeMetricsTimer(Some(self)) } } /*BEGIN_HADRON */ impl Drop for StorageTimeMetricsTimer { fn drop(&mut self) { if !self.stopped.get() { self.metrics.timeline_active_count.dec(); } } } /*END_HADRON */ pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option); impl Drop for AlwaysRecordingStorageTimeMetricsTimer { fn drop(&mut self) { if let Some(inner) = self.0.take() { inner.stop_and_record(); } } } impl AlwaysRecordingStorageTimeMetricsTimer { /// Returns the elapsed duration of the timer. pub fn elapsed(&self) -> Duration { self.0.as_ref().expect("not dropped yet").elapsed() } } /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and /// timeline total sum and count. #[derive(Clone, Debug)] pub(crate) struct StorageTimeMetrics { /// Sum of f64 seconds, per operation, tenant_id and timeline_id timeline_sum: Counter, /// Number of oeprations, per operation, tenant_id and timeline_id timeline_count: IntCounter, /*BEGIN_HADRON */ /// Number of active operations per operation, tenant_id, and timeline_id timeline_active_count: IntGauge, /*END_HADRON */ /// Global histogram having only the "operation" label. 
global_histogram: Histogram, } impl StorageTimeMetrics { pub fn new( operation: StorageTimeOperation, tenant_id: &str, shard_id: &str, timeline_id: &str, ) -> Self { let operation: &'static str = operation.into(); let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id]) .unwrap(); let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id]) .unwrap(); /*BEGIN_HADRON */ let timeline_active_count = STORAGE_ACTIVE_COUNT_PER_TIMELINE .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id]) .unwrap(); /*END_HADRON */ let global_histogram = STORAGE_TIME_GLOBAL .get_metric_with_label_values(&[operation]) .unwrap(); StorageTimeMetrics { timeline_sum, timeline_count, timeline_active_count, global_histogram, } } /// Starts timing a new operation. /// /// Note: unlike `prometheus::HistogramTimer` the returned timer does not record on drop. 
pub fn start_timer(&self) -> StorageTimeMetricsTimer { StorageTimeMetricsTimer::new(self.clone()) } } pub(crate) struct TimelineMetrics { tenant_id: String, shard_id: String, timeline_id: String, pub flush_time_histo: StorageTimeMetrics, pub flush_delay_histo: StorageTimeMetrics, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, pub imitate_logical_size_histo: StorageTimeMetrics, pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_lsn_gauge: IntGauge, pub disk_consistent_lsn_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, pub layers_per_read: Histogram, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, pub visible_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub aux_file_size_gauge: IntGauge, pub directory_entries_count_gauge: Lazy UIntGauge>>, pub evictions: IntCounter, pub evictions_with_low_residence_duration: std::sync::RwLock, /// Number of valid LSN leases. 
pub valid_lsn_lease_count_gauge: UIntGauge, pub wal_records_received: IntCounter, pub storage_io_size: StorageIoSizeMetrics, pub wait_lsn_in_progress_micros: GlobalAndPerTenantIntCounter, pub wait_lsn_start_finish_counterpair: IntCounterPair, pub wait_ondemand_download_time: wait_ondemand_download_time::WaitOndemandDownloadTimeSum, shutdown: std::sync::atomic::AtomicBool, } impl TimelineMetrics { pub fn new( tenant_shard_id: &TenantShardId, timeline_id_raw: &TimelineId, evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder, ) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_id = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id_raw.to_string(); let flush_time_histo = StorageTimeMetrics::new( StorageTimeOperation::LayerFlush, &tenant_id, &shard_id, &timeline_id, ); let flush_delay_histo = StorageTimeMetrics::new( StorageTimeOperation::LayerFlushDelay, &tenant_id, &shard_id, &timeline_id, ); let compact_time_histo = StorageTimeMetrics::new( StorageTimeOperation::Compact, &tenant_id, &shard_id, &timeline_id, ); let create_images_time_histo = StorageTimeMetrics::new( StorageTimeOperation::CreateImages, &tenant_id, &shard_id, &timeline_id, ); let logical_size_histo = StorageTimeMetrics::new( StorageTimeOperation::LogicalSize, &tenant_id, &shard_id, &timeline_id, ); let imitate_logical_size_histo = StorageTimeMetrics::new( StorageTimeOperation::ImitateLogicalSize, &tenant_id, &shard_id, &timeline_id, ); let load_layer_map_histo = StorageTimeMetrics::new( StorageTimeOperation::LoadLayerMap, &tenant_id, &shard_id, &timeline_id, ); let garbage_collect_histo = StorageTimeMetrics::new( StorageTimeOperation::Gc, &tenant_id, &shard_id, &timeline_id, ); let find_gc_cutoffs_histo = StorageTimeMetrics::new( StorageTimeOperation::FindGcCutoffs, &tenant_id, &shard_id, &timeline_id, ); let last_record_lsn_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) 
.unwrap(); let disk_consistent_lsn_gauge = DISK_CONSISTENT_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); let pitr_history_size = PITR_HISTORY_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); let archival_size = TIMELINE_ARCHIVE_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); let layers_per_read = LAYERS_PER_READ .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); // TODO: we shouldn't expose this metric let current_logical_size_gauge = CURRENT_LOGICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); let aux_file_size_gauge = AUX_FILE_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); // TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065 let directory_entries_count_gauge_closure = { let tenant_shard_id = *tenant_shard_id; let timeline_id_raw = *timeline_id_raw; move || { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_id = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id_raw.to_string(); let gauge: UIntGauge = DIRECTORY_ENTRIES_COUNT .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); gauge } }; let directory_entries_count_gauge: Lazy UIntGauge>> = Lazy::new(Box::new(directory_entries_count_gauge_closure)); let evictions = EVICTIONS .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); let 
evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder .build(&tenant_id, &shard_id, &timeline_id); let valid_lsn_lease_count_gauge = VALID_LSN_LEASE_COUNT .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); let wal_records_received = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); let storage_io_size = StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id); let wait_lsn_in_progress_micros = GlobalAndPerTenantIntCounter { global: WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS.clone(), per_tenant: WAIT_LSN_IN_PROGRESS_MICROS .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(), }; let wait_lsn_start_finish_counterpair = WAIT_LSN_START_FINISH_COUNTERPAIR .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); let wait_ondemand_download_time = wait_ondemand_download_time::WaitOndemandDownloadTimeSum::new( &tenant_id, &shard_id, &timeline_id, ); TIMELINE_STATE_METRIC.with_label_values(&["active"]).inc(); TimelineMetrics { tenant_id, shard_id, timeline_id, flush_time_histo, flush_delay_histo, compact_time_histo, create_images_time_histo, logical_size_histo, imitate_logical_size_histo, garbage_collect_histo, find_gc_cutoffs_histo, load_layer_map_histo, last_record_lsn_gauge, disk_consistent_lsn_gauge, pitr_history_size, archival_size, layers_per_read, standby_horizon_gauge, resident_physical_size_gauge, visible_physical_size_gauge, current_logical_size_gauge, aux_file_size_gauge, directory_entries_count_gauge, evictions, evictions_with_low_residence_duration: std::sync::RwLock::new( evictions_with_low_residence_duration, ), storage_io_size, valid_lsn_lease_count_gauge, wal_records_received, wait_lsn_in_progress_micros, wait_lsn_start_finish_counterpair, wait_ondemand_download_time, shutdown: std::sync::atomic::AtomicBool::default(), } } pub(crate) fn record_new_file_metrics(&self, sz: u64) { 
self.resident_physical_size_add(sz); } pub(crate) fn resident_physical_size_sub(&self, sz: u64) { self.resident_physical_size_gauge.sub(sz); crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz); } pub(crate) fn resident_physical_size_add(&self, sz: u64) { self.resident_physical_size_gauge.add(sz); crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz); } pub(crate) fn resident_physical_size_get(&self) -> u64 { self.resident_physical_size_gauge.get() } /// Generates TIMELINE_LAYER labels for a persistent layer. fn make_layer_labels(&self, layer_desc: &PersistentLayerDesc) -> [&str; 5] { let level = match LayerMap::is_l0(&layer_desc.key_range, layer_desc.is_delta()) { true => LayerLevel::L0, false => LayerLevel::L1, }; let kind = match layer_desc.is_delta() { true => LayerKind::Delta, false => LayerKind::Image, }; [ &self.tenant_id, &self.shard_id, &self.timeline_id, level.into(), kind.into(), ] } /// Generates TIMELINE_LAYER labels for a frozen ephemeral layer. fn make_frozen_layer_labels(&self, _layer: &InMemoryLayer) -> [&str; 5] { [ &self.tenant_id, &self.shard_id, &self.timeline_id, LayerLevel::Frozen.into(), LayerKind::Delta.into(), // by definition ] } /// Removes a frozen ephemeral layer to TIMELINE_LAYER metrics. pub fn dec_frozen_layer(&self, layer: &InMemoryLayer) { assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. })); let labels = self.make_frozen_layer_labels(layer); let size = layer.len(); TIMELINE_LAYER_COUNT .get_metric_with_label_values(&labels) .unwrap() .dec(); TIMELINE_LAYER_SIZE .get_metric_with_label_values(&labels) .unwrap() .sub(size); } /// Adds a frozen ephemeral layer to TIMELINE_LAYER metrics. pub fn inc_frozen_layer(&self, layer: &InMemoryLayer) { assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. 
})); let labels = self.make_frozen_layer_labels(layer); let size = layer.len(); TIMELINE_LAYER_COUNT .get_metric_with_label_values(&labels) .unwrap() .inc(); TIMELINE_LAYER_SIZE .get_metric_with_label_values(&labels) .unwrap() .add(size); } /// Removes a persistent layer from TIMELINE_LAYER metrics. pub fn dec_layer(&self, layer_desc: &PersistentLayerDesc) { let labels = self.make_layer_labels(layer_desc); TIMELINE_LAYER_COUNT .get_metric_with_label_values(&labels) .unwrap() .dec(); TIMELINE_LAYER_SIZE .get_metric_with_label_values(&labels) .unwrap() .sub(layer_desc.file_size); } /// Adds a persistent layer to TIMELINE_LAYER metrics. pub fn inc_layer(&self, layer_desc: &PersistentLayerDesc) { let labels = self.make_layer_labels(layer_desc); TIMELINE_LAYER_COUNT .get_metric_with_label_values(&labels) .unwrap() .inc(); TIMELINE_LAYER_SIZE .get_metric_with_label_values(&labels) .unwrap() .add(layer_desc.file_size); } pub(crate) fn shutdown(&self) { let was_shutdown = self .shutdown .swap(true, std::sync::atomic::Ordering::Relaxed); if was_shutdown { // this happens on tenant deletion because tenant first shuts down timelines, then // invokes timeline deletion which first shuts down the timeline again. 
// TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 return; } TIMELINE_STATE_METRIC.with_label_values(&["active"]).dec(); let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); } let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); } let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); for ref level in LayerLevel::iter() { for ref kind in LayerKind::iter() { let labels: [&str; 5] = [tenant_id, shard_id, timeline_id, level.into(), kind.into()]; let _ = TIMELINE_LAYER_SIZE.remove_label_values(&labels); let _ = TIMELINE_LAYER_COUNT.remove_label_values(&labels); } } let _ = LAYERS_PER_READ.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); self.evictions_with_low_residence_duration .write() .unwrap() .remove(tenant_id, shard_id, timeline_id); // The following metrics are born outside of the TimelineMetrics lifecycle but still // removed at the end of it. 
The idea is to have the metrics outlive the // entity during which they're observed, e.g., the smgr metrics shall // outlive an individual smgr connection, but not the timeline. for op in StorageTimeOperation::VARIANTS { let _ = STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[ op, tenant_id, shard_id, timeline_id, ]); let _ = STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[ op, tenant_id, shard_id, timeline_id, ]); /* BEGIN_HADRON */ let _ = STORAGE_ACTIVE_COUNT_PER_TIMELINE.remove_label_values(&[ op, tenant_id, shard_id, timeline_id, ]); /*END_HADRON */ } for op in StorageIoSizeOperation::VARIANTS { let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]); } let _ = WAIT_LSN_IN_PROGRESS_MICROS.remove_label_values(&[tenant_id, shard_id, timeline_id]); { let mut res = [Ok(()), Ok(())]; WAIT_LSN_START_FINISH_COUNTERPAIR .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id]); } wait_ondemand_download_time::shutdown_timeline(tenant_id, shard_id, timeline_id); let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[ SmgrQueryType::GetPageAtLsn.into(), tenant_id, shard_id, timeline_id, ]); let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ SmgrQueryType::GetPageAtLsn.into(), tenant_id, shard_id, timeline_id, ]); let _ = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE.remove_label_values(&[ tenant_id, shard_id, timeline_id, ]); let _ = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED.remove_label_values(&[ tenant_id, shard_id, timeline_id, ]); let _ = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS.remove_label_values(&[ tenant_id, shard_id, timeline_id, ]); let _ = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME.remove_label_values(&[ tenant_id, shard_id, timeline_id, ]); for reason in GetPageBatchBreakReason::iter() { let _ = PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.remove_label_values(&[ tenant_id, shard_id, timeline_id, reason.into(), ]); } } } pub(crate) fn remove_tenant_metrics(tenant_shard_id: 
&TenantShardId) { let tid = tenant_shard_id.tenant_id.to_string(); let shard_id = tenant_shard_id.shard_slug().to_string(); // Only shard zero deals in synthetic sizes if tenant_shard_id.is_shard_zero() { let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); } let _ = TENANT_OFFLOADED_TIMELINES.remove_label_values(&[&tid, &shard_id]); tenant_throttling::remove_tenant_metrics(tenant_shard_id); // we leave the BROKEN_TENANTS_SET entry if any } /// Maintain a per timeline gauge in addition to the global gauge. pub(crate) struct PerTimelineRemotePhysicalSizeGauge { last_set: AtomicU64, gauge: UIntGauge, } impl PerTimelineRemotePhysicalSizeGauge { fn new(per_timeline_gauge: UIntGauge) -> Self { Self { last_set: AtomicU64::new(0), gauge: per_timeline_gauge, } } pub(crate) fn set(&self, sz: u64) { self.gauge.set(sz); let prev = self.last_set.swap(sz, std::sync::atomic::Ordering::Relaxed); if sz < prev { REMOTE_PHYSICAL_SIZE_GLOBAL.sub(prev - sz); } else { REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - prev); }; } pub(crate) fn get(&self) -> u64 { self.gauge.get() } } impl Drop for PerTimelineRemotePhysicalSizeGauge { fn drop(&mut self) { REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set.load(std::sync::atomic::Ordering::Relaxed)); } } pub(crate) struct RemoteTimelineClientMetrics { tenant_id: String, shard_id: String, timeline_id: String, pub(crate) remote_physical_size_gauge: PerTimelineRemotePhysicalSizeGauge, calls: Mutex>, bytes_started_counter: Mutex>, bytes_finished_counter: Mutex>, pub(crate) projected_remote_consistent_lsn_gauge: UIntGauge, } impl RemoteTimelineClientMetrics { pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { let tenant_id_str = tenant_shard_id.tenant_id.to_string(); let shard_id_str = format!("{}", tenant_shard_id.shard_slug()); let timeline_id_str = timeline_id.to_string(); let remote_physical_size_gauge = PerTimelineRemotePhysicalSizeGauge::new( REMOTE_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id_str, 
&shard_id_str, &timeline_id_str]) .unwrap(), ); let projected_remote_consistent_lsn_gauge = PROJECTED_REMOTE_CONSISTENT_LSN .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str]) .unwrap(); RemoteTimelineClientMetrics { tenant_id: tenant_id_str, shard_id: shard_id_str, timeline_id: timeline_id_str, calls: Mutex::new(HashMap::default()), bytes_started_counter: Mutex::new(HashMap::default()), bytes_finished_counter: Mutex::new(HashMap::default()), remote_physical_size_gauge, projected_remote_consistent_lsn_gauge, } } pub fn remote_operation_time( &self, task_kind: Option, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, status: &'static str, ) -> Histogram { REMOTE_TIMELINE_CLIENT_COMPLETION_LATENCY .get_metric_with_label_values(&[ task_kind.as_ref().map(|tk| tk.into()).unwrap_or("unknown"), file_kind.as_str(), op_kind.as_str(), status, ]) .unwrap() } fn calls_counter_pair( &self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, ) -> IntCounterPair { let mut guard = self.calls.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); let metric = guard.entry(key).or_insert_with(move || { REMOTE_TIMELINE_CLIENT_CALLS .get_metric_with_label_values(&[ &self.tenant_id, &self.shard_id, &self.timeline_id, key.0, key.1, ]) .unwrap() }); metric.clone() } fn bytes_started_counter( &self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, ) -> IntCounter { let mut guard = self.bytes_started_counter.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); let metric = guard.entry(key).or_insert_with(move || { REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER .get_metric_with_label_values(&[ &self.tenant_id, &self.shard_id, &self.timeline_id, key.0, key.1, ]) .unwrap() }); metric.clone() } fn bytes_finished_counter( &self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, ) -> IntCounter { let mut guard = self.bytes_finished_counter.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); let metric = 
guard.entry(key).or_insert_with(move || { REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER .get_metric_with_label_values(&[ &self.tenant_id, &self.shard_id, &self.timeline_id, key.0, key.1, ]) .unwrap() }); metric.clone() } } #[cfg(test)] impl RemoteTimelineClientMetrics { pub fn get_bytes_started_counter_value( &self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, ) -> Option { let guard = self.bytes_started_counter.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); guard.get(&key).map(|counter| counter.get()) } pub fn get_bytes_finished_counter_value( &self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, ) -> Option { let guard = self.bytes_finished_counter.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); guard.get(&key).map(|counter| counter.get()) } } /// See [`RemoteTimelineClientMetrics::call_begin`]. #[must_use] pub(crate) struct RemoteTimelineClientCallMetricGuard { /// Decremented on drop. calls_counter_pair: Option, /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop. bytes_finished: Option<(IntCounter, u64)>, } impl RemoteTimelineClientCallMetricGuard { /// Consume this guard object without performing the metric updates it would do on `drop()`. /// The caller vouches to do the metric updates manually. pub fn will_decrement_manually(mut self) { let RemoteTimelineClientCallMetricGuard { calls_counter_pair, bytes_finished, } = &mut self; calls_counter_pair.take(); bytes_finished.take(); } } impl Drop for RemoteTimelineClientCallMetricGuard { fn drop(&mut self) { let RemoteTimelineClientCallMetricGuard { calls_counter_pair, bytes_finished, } = self; if let Some(guard) = calls_counter_pair.take() { guard.dec(); } if let Some((bytes_finished_metric, value)) = bytes_finished { bytes_finished_metric.inc_by(*value); } } } /// The enum variants communicate to the [`RemoteTimelineClientMetrics`] whether to /// track the byte size of this call in applicable metric(s). 
pub(crate) enum RemoteTimelineClientMetricsCallTrackSize { /// Do not account for this call's byte size in any metrics. /// The `reason` field is there to make the call sites self-documenting /// about why they don't need the metric. DontTrackSize { reason: &'static str }, /// Track the byte size of the call in applicable metric(s). Bytes(u64), } impl RemoteTimelineClientMetrics { /// Update the metrics that change when a call to the remote timeline client instance starts. /// /// Drop the returned guard object once the operation is finished to updates corresponding metrics that track completions. /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that /// is more suitable. /// Never do both. pub(crate) fn call_begin( &self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, size: RemoteTimelineClientMetricsCallTrackSize, ) -> RemoteTimelineClientCallMetricGuard { let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind); calls_counter_pair.inc(); let bytes_finished = match size { RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => { // nothing to do None } RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => { self.bytes_started_counter(file_kind, op_kind).inc_by(size); let finished_counter = self.bytes_finished_counter(file_kind, op_kind); Some((finished_counter, size)) } }; RemoteTimelineClientCallMetricGuard { calls_counter_pair: Some(calls_counter_pair), bytes_finished, } } /// Manually udpate the metrics that track completions, instead of using the guard object. /// Using the guard object is generally preferable. /// See [`call_begin`](Self::call_begin) for more context. 
pub(crate) fn call_end( &self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, size: RemoteTimelineClientMetricsCallTrackSize, ) { let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind); calls_counter_pair.dec(); match size { RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {} RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => { self.bytes_finished_counter(file_kind, op_kind).inc_by(size); } } } } impl Drop for RemoteTimelineClientMetrics { fn drop(&mut self) { let RemoteTimelineClientMetrics { tenant_id, shard_id, timeline_id, remote_physical_size_gauge, calls, bytes_started_counter, bytes_finished_counter, projected_remote_consistent_lsn_gauge, } = self; for ((a, b), _) in calls.get_mut().unwrap().drain() { let mut res = [Ok(()), Ok(())]; REMOTE_TIMELINE_CLIENT_CALLS .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id, a, b]); // don't care about results } for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() { let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[ tenant_id, shard_id, timeline_id, a, b, ]); } for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() { let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[ tenant_id, shard_id, timeline_id, a, b, ]); } { let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); } { let _ = projected_remote_consistent_lsn_gauge; let _ = PROJECTED_REMOTE_CONSISTENT_LSN.remove_label_values(&[ tenant_id, shard_id, timeline_id, ]); } } } /// Wrapper future that measures the time spent by a remote storage operation, /// and records the time and success/failure as a prometheus metric. 
pub(crate) trait MeasureRemoteOp: Sized + Future> { async fn measure_remote_op( self, task_kind: Option, // not all caller contexts have a RequestContext / TaskKind handy file_kind: RemoteOpFileKind, op: RemoteOpKind, metrics: Arc, ) -> Result { let start = Instant::now(); let res = self.await; let duration = start.elapsed(); let status = if res.is_ok() { &"success" } else { &"failure" }; metrics .remote_operation_time(task_kind, &file_kind, &op, status) .observe(duration.as_secs_f64()); res } } impl MeasureRemoteOp for Fut where Fut: Sized + Future> {} pub mod tokio_epoll_uring { use std::collections::HashMap; use std::sync::{Arc, Mutex}; use metrics::{Histogram, LocalHistogram, UIntGauge, register_histogram, register_int_counter}; use once_cell::sync::Lazy; /// Shared storage for tokio-epoll-uring thread local metrics. pub(crate) static THREAD_LOCAL_METRICS_STORAGE: Lazy = Lazy::new(|| { let slots_submission_queue_depth = register_histogram!( "pageserver_tokio_epoll_uring_slots_submission_queue_depth", "The slots waiters queue depth of each tokio_epoll_uring system", vec![ 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0 ], ) .expect("failed to define a metric"); ThreadLocalMetricsStorage { observers: Mutex::new(HashMap::new()), slots_submission_queue_depth, } }); pub struct ThreadLocalMetricsStorage { /// List of thread local metrics observers. observers: Mutex>>, /// A histogram shared between all thread local systems /// for collecting slots submission queue depth. slots_submission_queue_depth: Histogram, } /// Each thread-local [`tokio_epoll_uring::System`] gets one of these as its /// [`tokio_epoll_uring::metrics::PerSystemMetrics`] generic. /// /// The System makes observations into [`Self`] and periodically, the collector /// comes along and flushes [`Self`] into the shared storage [`THREAD_LOCAL_METRICS_STORAGE`]. /// /// [`LocalHistogram`] is `!Send`, so, we need to put it behind a [`Mutex`]. 
/// But except for the periodic flush, the lock is uncontended so there's no waiting /// for cache coherence protocol to get an exclusive cache line. pub struct ThreadLocalMetrics { /// Local observer of thread local tokio-epoll-uring system's slots waiters queue depth. slots_submission_queue_depth: Mutex, } impl ThreadLocalMetricsStorage { /// Registers a new thread local system. Returns a thread local metrics observer. pub fn register_system(&self, id: u64) -> Arc { let per_system_metrics = Arc::new(ThreadLocalMetrics::new( self.slots_submission_queue_depth.local(), )); let mut g = self.observers.lock().unwrap(); g.insert(id, Arc::clone(&per_system_metrics)); per_system_metrics } /// Removes metrics observer for a thread local system. /// This should be called before dropping a thread local system. pub fn remove_system(&self, id: u64) { let mut g = self.observers.lock().unwrap(); g.remove(&id); } /// Flush all thread local metrics to the shared storage. pub fn flush_thread_local_metrics(&self) { let g = self.observers.lock().unwrap(); g.values().for_each(|local| { local.flush(); }); } } impl ThreadLocalMetrics { pub fn new(slots_submission_queue_depth: LocalHistogram) -> Self { ThreadLocalMetrics { slots_submission_queue_depth: Mutex::new(slots_submission_queue_depth), } } /// Flushes the thread local metrics to shared aggregator. 
pub fn flush(&self) { let Self { slots_submission_queue_depth, } = self; slots_submission_queue_depth.lock().unwrap().flush(); } } impl tokio_epoll_uring::metrics::PerSystemMetrics for ThreadLocalMetrics { fn observe_slots_submission_queue_depth(&self, queue_depth: u64) { let Self { slots_submission_queue_depth, } = self; slots_submission_queue_depth .lock() .unwrap() .observe(queue_depth as f64); } } pub struct Collector { descs: Vec, systems_created: UIntGauge, systems_destroyed: UIntGauge, thread_local_metrics_storage: &'static ThreadLocalMetricsStorage, } impl metrics::core::Collector for Collector { fn desc(&self) -> Vec<&metrics::core::Desc> { self.descs.iter().collect() } fn collect(&self) -> Vec { let mut mfs = Vec::with_capacity(Self::NMETRICS); let tokio_epoll_uring::metrics::GlobalMetrics { systems_created, systems_destroyed, } = tokio_epoll_uring::metrics::global(); self.systems_created.set(systems_created); mfs.extend(self.systems_created.collect()); self.systems_destroyed.set(systems_destroyed); mfs.extend(self.systems_destroyed.collect()); self.thread_local_metrics_storage .flush_thread_local_metrics(); mfs.extend( self.thread_local_metrics_storage .slots_submission_queue_depth .collect(), ); mfs } } impl Collector { const NMETRICS: usize = 3; #[allow(clippy::new_without_default)] pub fn new() -> Self { let mut descs = Vec::new(); let systems_created = UIntGauge::new( "pageserver_tokio_epoll_uring_systems_created", "counter of tokio-epoll-uring systems that were created", ) .unwrap(); descs.extend( metrics::core::Collector::desc(&systems_created) .into_iter() .cloned(), ); let systems_destroyed = UIntGauge::new( "pageserver_tokio_epoll_uring_systems_destroyed", "counter of tokio-epoll-uring systems that were destroyed", ) .unwrap(); descs.extend( metrics::core::Collector::desc(&systems_destroyed) .into_iter() .cloned(), ); Self { descs, systems_created, systems_destroyed, thread_local_metrics_storage: &THREAD_LOCAL_METRICS_STORAGE, } } } pub(crate) 
static THREAD_LOCAL_LAUNCH_SUCCESSES: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_success_count", "Number of times where thread_local_system creation spanned multiple executor threads", ) .unwrap() }); pub(crate) static THREAD_LOCAL_LAUNCH_FAILURES: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_failures_count", "Number of times thread_local_system creation failed and was retried after back-off.", ) .unwrap() }); } pub(crate) struct GlobalAndPerTenantIntCounter { global: IntCounter, per_tenant: IntCounter, } impl GlobalAndPerTenantIntCounter { #[inline(always)] pub(crate) fn inc(&self) { self.inc_by(1) } #[inline(always)] pub(crate) fn inc_by(&self, n: u64) { self.global.inc_by(n); self.per_tenant.inc_by(n); } } pub(crate) mod tenant_throttling { use metrics::register_int_counter_vec; use once_cell::sync::Lazy; use utils::shard::TenantShardId; use super::GlobalAndPerTenantIntCounter; pub(crate) struct Metrics { pub(super) count_accounted_start: GlobalAndPerTenantIntCounter, pub(super) count_accounted_finish: GlobalAndPerTenantIntCounter, pub(super) wait_time: GlobalAndPerTenantIntCounter, pub(super) count_throttled: GlobalAndPerTenantIntCounter, } static COUNT_ACCOUNTED_START: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_tenant_throttling_count_accounted_start_global", "Count of tenant throttling starts, by kind of throttle.", &["kind"] ) .unwrap() }); static COUNT_ACCOUNTED_START_PER_TENANT: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_tenant_throttling_count_accounted_start", "Count of tenant throttling starts, by kind of throttle.", &["kind", "tenant_id", "shard_id"] ) .unwrap() }); static COUNT_ACCOUNTED_FINISH: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_tenant_throttling_count_accounted_finish_global", "Count of tenant throttling finishes, by kind of throttle.", &["kind"] ) .unwrap() }); 
static COUNT_ACCOUNTED_FINISH_PER_TENANT: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_tenant_throttling_count_accounted_finish", "Count of tenant throttling finishes, by kind of throttle.", &["kind", "tenant_id", "shard_id"] ) .unwrap() }); static WAIT_USECS: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_tenant_throttling_wait_usecs_sum_global", "Sum of microseconds that spent waiting throttle by kind of throttle.", &["kind"] ) .unwrap() }); static WAIT_USECS_PER_TENANT: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_tenant_throttling_wait_usecs_sum", "Sum of microseconds that spent waiting throttle by kind of throttle.", &["kind", "tenant_id", "shard_id"] ) .unwrap() }); static WAIT_COUNT: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_tenant_throttling_count_global", "Count of tenant throttlings, by kind of throttle.", &["kind"] ) .unwrap() }); static WAIT_COUNT_PER_TENANT: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_tenant_throttling_count", "Count of tenant throttlings, by kind of throttle.", &["kind", "tenant_id", "shard_id"] ) .unwrap() }); const KINDS: &[&str] = &["pagestream"]; pub type Pagestream = Metrics<0>; impl Metrics { pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self { let per_tenant_label_values = &[ KINDS[KIND], &tenant_shard_id.tenant_id.to_string(), &tenant_shard_id.shard_slug().to_string(), ]; Metrics { count_accounted_start: { GlobalAndPerTenantIntCounter { global: COUNT_ACCOUNTED_START.with_label_values(&[KINDS[KIND]]), per_tenant: COUNT_ACCOUNTED_START_PER_TENANT .with_label_values(per_tenant_label_values), } }, count_accounted_finish: { GlobalAndPerTenantIntCounter { global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KINDS[KIND]]), per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT .with_label_values(per_tenant_label_values), } }, wait_time: { GlobalAndPerTenantIntCounter { global: WAIT_USECS.with_label_values(&[KINDS[KIND]]), per_tenant: WAIT_USECS_PER_TENANT 
.with_label_values(per_tenant_label_values), } }, count_throttled: { GlobalAndPerTenantIntCounter { global: WAIT_COUNT.with_label_values(&[KINDS[KIND]]), per_tenant: WAIT_COUNT_PER_TENANT .with_label_values(per_tenant_label_values), } }, } } } pub(crate) fn preinitialize_global_metrics() { Lazy::force(&COUNT_ACCOUNTED_START); Lazy::force(&COUNT_ACCOUNTED_FINISH); Lazy::force(&WAIT_USECS); Lazy::force(&WAIT_COUNT); } pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { for m in &[ &COUNT_ACCOUNTED_START_PER_TENANT, &COUNT_ACCOUNTED_FINISH_PER_TENANT, &WAIT_USECS_PER_TENANT, &WAIT_COUNT_PER_TENANT, ] { for kind in KINDS { let _ = m.remove_label_values(&[ kind, &tenant_shard_id.tenant_id.to_string(), &tenant_shard_id.shard_slug().to_string(), ]); } } } } pub(crate) mod disk_usage_based_eviction { use super::*; pub(crate) struct Metrics { pub(crate) tenant_collection_time: Histogram, pub(crate) tenant_layer_count: Histogram, pub(crate) layers_collected: IntCounter, pub(crate) layers_selected: IntCounter, pub(crate) layers_evicted: IntCounter, /*BEGIN_HADRON */ pub(crate) bytes_evicted: IntCounter, /*END_HADRON */ } impl Default for Metrics { fn default() -> Self { let tenant_collection_time = register_histogram!( "pageserver_disk_usage_based_eviction_tenant_collection_seconds", "Time spent collecting layers from a tenant -- not normalized by collected layer amount", vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0] ) .unwrap(); let tenant_layer_count = register_histogram!( "pageserver_disk_usage_based_eviction_tenant_collected_layers", "Amount of layers gathered from a tenant", vec![5.0, 50.0, 500.0, 5000.0, 50000.0] ) .unwrap(); let layers_collected = register_int_counter!( "pageserver_disk_usage_based_eviction_collected_layers_total", "Amount of layers collected" ) .unwrap(); let layers_selected = register_int_counter!( "pageserver_disk_usage_based_eviction_select_layers_total", "Amount of layers selected" ) .unwrap(); let layers_evicted = 
register_int_counter!( "pageserver_disk_usage_based_eviction_evicted_layers_total", "Amount of layers successfully evicted" ) .unwrap(); /*BEGIN_HADRON */ let bytes_evicted = register_int_counter!( "pageserver_disk_usage_based_eviction_evicted_bytes_total", "Amount of bytes successfully evicted" ) .unwrap(); /*END_HADRON */ Self { tenant_collection_time, tenant_layer_count, layers_collected, layers_selected, layers_evicted, bytes_evicted, } } } pub(crate) static METRICS: Lazy = Lazy::new(Metrics::default); } static TOKIO_EXECUTOR_THREAD_COUNT: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_tokio_executor_thread_configured_count", "Total number of configued tokio executor threads in the process. The `setup` label denotes whether we're running with multiple runtimes or a single runtime.", &["setup"], ) .unwrap() }); pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) { static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(()); let _guard = SERIALIZE.lock().unwrap(); TOKIO_EXECUTOR_THREAD_COUNT.reset(); TOKIO_EXECUTOR_THREAD_COUNT .get_metric_with_label_values(&[setup]) .unwrap() .set(u64::try_from(num_threads.get()).unwrap()); } pub(crate) static BASEBACKUP_CACHE_READ: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_basebackup_cache_read_total", "Number of read accesses to the basebackup cache grouped by hit/miss/error", &["result"] ) .expect("failed to define a metric") }); pub(crate) static BASEBACKUP_CACHE_PREPARE: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_basebackup_cache_prepare_total", "Number of prepare requests processed by the basebackup cache grouped by ok/skip/error", &["result"] ) .expect("failed to define a metric") }); pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy = Lazy::new(|| { register_uint_gauge!( "pageserver_basebackup_cache_entries_total", "Number of entries in the basebackup cache" ) .expect("failed to define a metric") }); pub(crate) static BASEBACKUP_CACHE_SIZE: 
Lazy = Lazy::new(|| { register_uint_gauge!( "pageserver_basebackup_cache_size_bytes", "Total size of all basebackup cache entries on disk in bytes" ) .expect("failed to define a metric") }); pub(crate) static BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE: Lazy = Lazy::new(|| { register_uint_gauge!( "pageserver_basebackup_cache_prepare_queue_size", "Number of requests in the basebackup prepare channel" ) .expect("failed to define a metric") }); static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_config_ignored_items", "TOML items present in the on-disk configuration file but ignored by the pageserver config parser.\ The `item` label is the dot-separated path of the ignored item in the on-disk configuration file.\ The value for an unknown config item is always 1.\ There is a special label value \"\", which is 0, so that there is always a metric exposed (simplifies dashboards).", &["item"] ) .unwrap() }); pub fn preinitialize_metrics( conf: &'static PageServerConf, ignored: config::ignored_fields::Paths, ) { set_page_service_config_max_batch_size(&conf.page_service_pipelining); PAGESERVER_CONFIG_IGNORED_ITEMS .with_label_values(&[""]) .set(0); for path in &ignored.paths { PAGESERVER_CONFIG_IGNORED_ITEMS .with_label_values(&[path]) .set(1); } // Python tests need these and on some we do alerting. // // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of // order: // - global metrics reside in a Lazy // - access via crate::metrics::PS_METRICS.some_metric.inc() // - could move the statics into TimelineMetrics::new()? 
// counters [ &UNEXPECTED_ONDEMAND_DOWNLOADS, &WALRECEIVER_STARTED_CONNECTIONS, &WALRECEIVER_BROKER_UPDATES, &WALRECEIVER_CANDIDATES_ADDED, &WALRECEIVER_CANDIDATES_REMOVED, &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES, &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES, &REMOTE_ONDEMAND_DOWNLOADED_LAYERS, &REMOTE_ONDEMAND_DOWNLOADED_BYTES, &CIRCUIT_BREAKERS_BROKEN, &CIRCUIT_BREAKERS_UNBROKEN, &PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL, &WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS, &MISROUTED_PAGESTREAM_REQUESTS, ] .into_iter() .for_each(|c| { Lazy::force(c); }); // Deletion queue stats Lazy::force(&DELETION_QUEUE); // Tenant stats Lazy::force(&TENANT); // Tenant manager stats Lazy::force(&TENANT_MANAGER); Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS); Lazy::force(&disk_usage_based_eviction::METRICS); for state_name in pageserver_api::models::TenantState::VARIANTS { // initialize the metric for all gauges, otherwise the time series might seemingly show // values from last restart. 
TENANT_STATE_METRIC.with_label_values(&[state_name]).set(0); } // countervecs [ &BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT, &SMGR_QUERY_STARTED_GLOBAL, &PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL, ] .into_iter() .for_each(|c| { Lazy::force(c); }); // gauges WALRECEIVER_ACTIVE_MANAGERS.get(); LOCAL_DATA_LOSS_SUSPECTED.get(); // histograms [ &LAYERS_PER_READ_GLOBAL, &LAYERS_PER_READ_BATCH_GLOBAL, &LAYERS_PER_READ_AMORTIZED_GLOBAL, &DELTAS_PER_READ_GLOBAL, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, &WAL_REDO_BYTES_HISTOGRAM, &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, &PAGE_SERVICE_BATCH_SIZE_GLOBAL, &PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL, ] .into_iter() .for_each(|h| { Lazy::force(h); }); // Custom Lazy::force(&BASEBACKUP_QUERY_TIME); Lazy::force(&COMPUTE_COMMANDS_COUNTERS); Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE); tenant_throttling::preinitialize_global_metrics(); wait_ondemand_download_time::preinitialize_global_metrics(); } ================================================ FILE: pageserver/src/page_cache.rs ================================================ //! //! Global page cache //! //! The page cache uses up most of the memory in the page server. It is shared //! by all tenants, and it is used to store different kinds of pages. Sharing //! the cache allows memory to be dynamically allocated where it's needed the //! most. //! //! The page cache consists of fixed-size buffers, 8 kB each to match the //! PostgreSQL buffer size, and a Slot struct for each buffer to contain //! information about what's stored in the buffer. //! //! # Types Of Pages //! //! [`PageCache`] only supports immutable pages. //! Hence there is no need to worry about coherency. //! //! Two types of pages are supported: //! //! * **Immutable File pages**, filled & used by [`crate::tenant::block_io`] and [`crate::tenant::ephemeral_file`]. //! //! Note that [`crate::tenant::ephemeral_file::EphemeralFile`] is generally mutable, but, it's append-only. //! 
It uses the page cache only for the blocks that are already fully written and immutable. //! //! # Filling The Page Cache //! //! Page cache maps from a cache key to a buffer slot. //! The cache key uniquely identifies the piece of data that is being cached. //! //! The cache key for **immutable file** pages is [`FileId`] and a block number. //! Users of page cache that wish to page-cache an arbitrary (immutable!) on-disk file do the following: //! * Have a mechanism to deterministically associate the on-disk file with a [`FileId`]. //! * Get a [`FileId`] using [`next_file_id`]. //! * Use the mechanism to associate the on-disk file with the returned [`FileId`]. //! * Use [`PageCache::read_immutable_buf`] to get a [`ReadBufResult`]. //! * If the page was already cached, it'll be the [`ReadBufResult::Found`] variant that contains //! a read guard for the page. Just use it. //! * If the page was not cached, it'll be the [`ReadBufResult::NotFound`] variant that contains //! a write guard for the page. Fill the page with the contents of the on-disk file. //! Then call [`PageWriteGuard::mark_valid`] to mark the page as valid. //! This downgrades the write guard and returns a [`PageReadGuard`] for the now-valid page, //! so the slot stays locked for reading throughout and there is no need to call //! [`PageCache::read_immutable_buf`] again. //! //! # Locking //! //! There are two levels of locking involved: There's one lock for the "mapping" //! from cache key ([`FileId`] and block number) to the buffer //! slot, and a separate lock on each slot. To read or write the contents of a //! slot, you must hold the lock on the slot in read or write mode, //! respectively. To change the mapping of a slot, i.e. to evict a page or to //! assign a buffer for a page, you must hold the mapping lock and the lock on //! the slot at the same time. //! //! Whenever you need to hold both locks simultaneously, the slot lock must be //! acquired first.
This consistent ordering avoids deadlocks. To look up a page //! in the cache, you would first look up the mapping, while holding the mapping //! lock, and then lock the slot. You must release the mapping lock in between, //! to obey the lock ordering and avoid deadlock. //! //! A slot can momentarily have invalid contents, even if it's already been //! inserted to the mapping, but you must hold the write-lock on the slot until //! the contents are valid. If you need to release the lock without initializing //! the contents, you must remove the mapping first. We make that easy for the //! callers with PageWriteGuard: the caller must explicitly call guard.mark_valid() after it has //! initialized it. If the guard is dropped without calling mark_valid(), the //! mapping is automatically removed and the slot is marked free. //! use std::collections::HashMap; use std::collections::hash_map::Entry; use std::sync::atomic::{AtomicU8, AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; use std::time::Duration; use anyhow::Context; use once_cell::sync::OnceCell; use crate::context::RequestContext; use crate::metrics::{PageCacheSizeMetrics, page_cache_eviction_metrics}; use crate::virtual_file::{IoBufferMut, IoPageSlice}; static PAGE_CACHE: OnceCell = OnceCell::new(); const TEST_PAGE_CACHE_SIZE: usize = 50; /// /// Initialize the page cache. This must be called once at page server startup. /// pub fn init(size: usize) { if PAGE_CACHE.set(PageCache::new(size)).is_err() { panic!("page cache already initialized"); } } /// /// Get a handle to the page cache. /// pub fn get() -> &'static PageCache { // // In unit tests, page server startup doesn't happen and no one calls // page_cache::init(). Initialize it here with a tiny cache, so that the // page cache is usable in unit tests. 
// if cfg!(test) { PAGE_CACHE.get_or_init(|| PageCache::new(TEST_PAGE_CACHE_SIZE)) } else { PAGE_CACHE.get().expect("page cache not initialized") } } pub const PAGE_SZ: usize = postgres_ffi::BLCKSZ as usize; const MAX_USAGE_COUNT: u8 = 5; /// See module-level comment. #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] pub struct FileId(u64); static NEXT_ID: AtomicU64 = AtomicU64::new(1); /// See module-level comment. pub fn next_file_id() -> FileId { FileId(NEXT_ID.fetch_add(1, Ordering::Relaxed)) } /// /// CacheKey uniquely identifies a "thing" to cache in the page cache. /// #[derive(Debug, PartialEq, Eq, Clone)] #[allow(clippy::enum_variant_names)] enum CacheKey { ImmutableFilePage { file_id: FileId, blkno: u32 }, } struct Slot { inner: tokio::sync::RwLock, usage_count: AtomicU8, } struct SlotInner { key: Option, // for `coalesce_readers_permit` permit: std::sync::Mutex>, buf: IoPageSlice<'static>, } impl Slot { /// Increment usage count on the buffer, with ceiling at MAX_USAGE_COUNT. fn inc_usage_count(&self) { let _ = self .usage_count .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| { if val == MAX_USAGE_COUNT { None } else { Some(val + 1) } }); } /// Decrement usage count on the buffer, unless it's already zero. Returns /// the old usage count. fn dec_usage_count(&self) -> u8 { let count_res = self.usage_count .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| { if val == 0 { None } else { Some(val - 1) } }); match count_res { Ok(usage_count) => usage_count, Err(usage_count) => usage_count, } } /// Sets the usage count to a specific value. fn set_usage_count(&self, count: u8) { self.usage_count.store(count, Ordering::Relaxed); } } impl SlotInner { /// If there is already a reader, drop our permit and share its permit, just like we share read access.
fn coalesce_readers_permit(&self, permit: PinnedSlotsPermit) -> Arc { let mut guard = self.permit.lock().unwrap(); if let Some(existing_permit) = guard.upgrade() { drop(guard); drop(permit); existing_permit } else { let permit = Arc::new(permit); *guard = Arc::downgrade(&permit); permit } } } pub struct PageCache { immutable_page_map: std::sync::RwLock>, /// The actual buffers with their metadata. slots: Box<[Slot]>, pinned_slots: Arc, /// Index of the next candidate to evict, for the Clock replacement algorithm. /// This is interpreted modulo the page cache size. next_evict_slot: AtomicUsize, size_metrics: &'static PageCacheSizeMetrics, } struct PinnedSlotsPermit { _permit: tokio::sync::OwnedSemaphorePermit, } /// /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked /// until the guard is dropped. /// pub struct PageReadGuard<'i> { _permit: Arc, slot_guard: tokio::sync::RwLockReadGuard<'i, SlotInner>, } impl std::ops::Deref for PageReadGuard<'_> { type Target = [u8; PAGE_SZ]; fn deref(&self) -> &Self::Target { self.slot_guard.buf.deref() } } impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> { fn as_ref(&self) -> &[u8; PAGE_SZ] { self.slot_guard.buf.as_ref() } } /// /// PageWriteGuard is a lease on a buffer for modifying it. The page is kept locked /// until the guard is dropped. /// /// Counterintuitively, this is used even for a read, if the requested page is not /// currently found in the page cache. In that case, the caller of lock_for_read() /// is expected to fill in the page contents and call mark_valid(). 
pub struct PageWriteGuard<'i> { state: PageWriteGuardState<'i>, } enum PageWriteGuardState<'i> { Invalid { inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>, _permit: PinnedSlotsPermit, }, Downgraded, } impl std::ops::DerefMut for PageWriteGuard<'_> { fn deref_mut(&mut self) -> &mut Self::Target { match &mut self.state { PageWriteGuardState::Invalid { inner, _permit } => inner.buf.deref_mut(), PageWriteGuardState::Downgraded => unreachable!(), } } } impl std::ops::Deref for PageWriteGuard<'_> { type Target = [u8; PAGE_SZ]; fn deref(&self) -> &Self::Target { match &self.state { PageWriteGuardState::Invalid { inner, _permit } => inner.buf.deref(), PageWriteGuardState::Downgraded => unreachable!(), } } } impl<'a> PageWriteGuard<'a> { /// Mark that the buffer contents are now valid. #[must_use] pub fn mark_valid(mut self) -> PageReadGuard<'a> { let prev = std::mem::replace(&mut self.state, PageWriteGuardState::Downgraded); match prev { PageWriteGuardState::Invalid { inner, _permit } => { assert!(inner.key.is_some()); PageReadGuard { _permit: Arc::new(_permit), slot_guard: inner.downgrade(), } } PageWriteGuardState::Downgraded => unreachable!(), } } } impl Drop for PageWriteGuard<'_> { /// /// If the buffer was allocated for a page that was not already in the /// cache, but the lock_for_read/write() caller dropped the buffer without /// initializing it, remove the mapping from the page cache. 
/// fn drop(&mut self) { match &mut self.state { PageWriteGuardState::Invalid { inner, _permit } => { assert!(inner.key.is_some()); let self_key = inner.key.as_ref().unwrap(); PAGE_CACHE.get().unwrap().remove_mapping(self_key); inner.key = None; } PageWriteGuardState::Downgraded => {} } } } /// lock_for_read() return value pub enum ReadBufResult<'a> { Found(PageReadGuard<'a>), NotFound(PageWriteGuard<'a>), } impl PageCache { pub async fn read_immutable_buf( &self, file_id: FileId, blkno: u32, ctx: &RequestContext, ) -> anyhow::Result { self.lock_for_read(&(CacheKey::ImmutableFilePage { file_id, blkno }), ctx) .await } // // Section 2: Internal interface functions for lookup/update. // // To add support for a new kind of "thing" to cache, you will need // to add public interface routines above, and code to deal with the // "mappings" after this section. But the routines in this section should // not require changes. async fn try_get_pinned_slot_permit(&self) -> anyhow::Result { match tokio::time::timeout( // Choose small timeout, neon_smgr does its own retries. // https://neondb.slack.com/archives/C04DGM6SMTM/p1694786876476869 Duration::from_secs(10), Arc::clone(&self.pinned_slots).acquire_owned(), ) .await { Ok(res) => Ok(PinnedSlotsPermit { _permit: res.expect("this semaphore is never closed"), }), Err(_timeout) => { crate::metrics::page_cache_errors_inc( crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout, ); anyhow::bail!("timeout: there were page guards alive for all page cache slots") } } } /// Look up a page in the cache. /// async fn try_lock_for_read( &self, cache_key: &CacheKey, permit: &mut Option, ) -> Option { if let Some(slot_idx) = self.search_mapping(cache_key) { // The page was found in the mapping. 
Lock the slot, and re-check // that it's still what we expected (because we released the mapping // lock already, another thread could have evicted the page) let slot = &self.slots[slot_idx]; let inner = slot.inner.read().await; if inner.key.as_ref() == Some(cache_key) { slot.inc_usage_count(); return Some(PageReadGuard { _permit: inner.coalesce_readers_permit(permit.take().unwrap()), slot_guard: inner, }); } } None } /// Return a locked buffer for given block. /// /// Like try_lock_for_read(), if the search criteria is not exact and the /// page is already found in the cache, *cache_key is updated. /// /// If the page is not found in the cache, this allocates a new buffer for /// it. The caller may then initialize the buffer with the contents, and /// call mark_valid(). /// /// Example usage: /// /// ```ignore /// let cache = page_cache::get(); /// /// match cache.lock_for_read(&key) { /// ReadBufResult::Found(read_guard) => { /// // The page was found in cache. Use it /// }, /// ReadBufResult::NotFound(write_guard) => { /// // The page was not found in cache. Read it from disk into the /// // buffer. /// //read_my_page_from_disk(write_guard); /// /// // The buffer contents are now valid. Tell the page cache. /// write_guard.mark_valid(); /// }, /// } /// ``` /// async fn lock_for_read( &self, cache_key: &CacheKey, ctx: &RequestContext, ) -> anyhow::Result { let mut permit = Some(self.try_get_pinned_slot_permit().await?); let (read_access, hit) = match cache_key { CacheKey::ImmutableFilePage { .. } => ( &crate::metrics::PAGE_CACHE .for_ctx(ctx) .read_accesses_immutable, &crate::metrics::PAGE_CACHE.for_ctx(ctx).read_hits_immutable, ), }; read_access.inc(); let mut is_first_iteration = true; loop { // First check if the key already exists in the cache. 
if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await { debug_assert!(permit.is_none()); if is_first_iteration { hit.inc(); } return Ok(ReadBufResult::Found(read_guard)); } debug_assert!(permit.is_some()); is_first_iteration = false; // Not found. Find a victim buffer let (slot_idx, mut inner) = self .find_victim(permit.as_ref().unwrap()) .await .context("Failed to find evict victim")?; // Insert mapping for this. At this point, we may find that another // thread did the same thing concurrently. In that case, we evicted // our victim buffer unnecessarily. Put it into the free list and // continue with the slot that the other thread chose. if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) { // TODO: put to free list // We now just loop back to start from beginning. This is not // optimal, we'll perform the lookup in the mapping again, which // is not really necessary because we already got // 'existing_slot_idx'. But this shouldn't happen often enough // to matter much. continue; } // Make the slot ready let slot = &self.slots[slot_idx]; inner.key = Some(cache_key.clone()); slot.set_usage_count(1); debug_assert!( { let guard = inner.permit.lock().unwrap(); guard.upgrade().is_none() }, "we hold a write lock, so, no one else should have a permit" ); return Ok(ReadBufResult::NotFound(PageWriteGuard { state: PageWriteGuardState::Invalid { _permit: permit.take().unwrap(), inner, }, })); } } // // Section 3: Mapping functions // /// Search for a page in the cache using the given search key. /// /// Returns the slot index, if any. /// /// NOTE: We don't hold any lock on the mapping on return, so the slot might /// get recycled for an unrelated page immediately after this function /// returns. The caller is responsible for re-checking that the slot still /// contains the page with the same key before using it. 
/// fn search_mapping(&self, cache_key: &CacheKey) -> Option { match cache_key { CacheKey::ImmutableFilePage { file_id, blkno } => { let map = self.immutable_page_map.read().unwrap(); Some(*map.get(&(*file_id, *blkno))?) } } } /// /// Remove mapping for given key. /// fn remove_mapping(&self, old_key: &CacheKey) { match old_key { CacheKey::ImmutableFilePage { file_id, blkno } => { let mut map = self.immutable_page_map.write().unwrap(); map.remove(&(*file_id, *blkno)) .expect("could not find old key in mapping"); self.size_metrics.current_bytes_immutable.sub_page_sz(1); } } } /// /// Insert mapping for given key. /// /// If a mapping already existed for the given key, returns the slot index /// of the existing mapping and leaves it untouched. fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option { match new_key { CacheKey::ImmutableFilePage { file_id, blkno } => { let mut map = self.immutable_page_map.write().unwrap(); match map.entry((*file_id, *blkno)) { Entry::Occupied(entry) => Some(*entry.get()), Entry::Vacant(entry) => { entry.insert(slot_idx); self.size_metrics.current_bytes_immutable.add_page_sz(1); None } } } } } // // Section 4: Misc internal helpers // /// Find a slot to evict. /// /// On return, the slot is empty and write-locked. async fn find_victim( &self, _permit_witness: &PinnedSlotsPermit, ) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard)> { let iter_limit = self.slots.len() * 10; let mut iters = 0; loop { iters += 1; let slot_idx = self.next_evict_slot.fetch_add(1, Ordering::Relaxed) % self.slots.len(); let slot = &self.slots[slot_idx]; if slot.dec_usage_count() == 0 { let mut inner = match slot.inner.try_write() { Ok(inner) => inner, Err(_err) => { if iters > iter_limit { // NB: Even with the permits, there's no hard guarantee that we will find a slot with // any particular number of iterations: other threads might race ahead and acquire and // release pins just as we're scanning the array. 
// // Imagine that nslots is 2, and as starting point, usage_count==1 on all // slots. There are two threads running concurrently, A and B. A has just // acquired the permit from the semaphore. // // A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search // B: Acquire permit. // B: Look at slot 2, decrement its usage_count to zero and continue the search // B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1. // B: Release pin and permit again // B: Acquire permit. // B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1. // B: Release pin and permit again // // Now we're back in the starting situation that both slots have // usage_count 1, but A has now been through one iteration of the // find_victim() loop. This can repeat indefinitely and on each // iteration, A's iteration count increases by one. // // So, even though the semaphore for the permits is fair, the victim search // itself happens in parallel and is not fair. // Hence even with a permit, a task can theoretically be starved. // To avoid this, we'd need tokio to give priority to tasks that are holding // permits for longer. // Note that just yielding to tokio during iteration without such // priority boosting is likely counter-productive. We'd just give more opportunities // for B to bump usage count, further starving A. 
page_cache_eviction_metrics::observe( page_cache_eviction_metrics::Outcome::ItersExceeded { iters: iters.try_into().unwrap(), }, ); anyhow::bail!("exceeded evict iter limit"); } continue; } }; if let Some(old_key) = &inner.key { // remove mapping for old buffer self.remove_mapping(old_key); inner.key = None; page_cache_eviction_metrics::observe( page_cache_eviction_metrics::Outcome::FoundSlotEvicted { iters: iters.try_into().unwrap(), }, ); } else { page_cache_eviction_metrics::observe( page_cache_eviction_metrics::Outcome::FoundSlotUnused { iters: iters.try_into().unwrap(), }, ); } return Ok((slot_idx, inner)); } } } /// Initialize a new page cache /// /// This should be called only once at page server startup. fn new(num_pages: usize) -> Self { assert!(num_pages > 0, "page cache size must be > 0"); // We could use Vec::leak here, but that potentially also leaks // uninitialized reserved capacity. With into_boxed_slice and Box::leak // this is avoided. let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak(); let size_metrics = &crate::metrics::PAGE_CACHE_SIZE; size_metrics.max_bytes.set_page_sz(num_pages); size_metrics.current_bytes_immutable.set_page_sz(0); let slots = page_buffer .chunks_exact_mut(PAGE_SZ) .map(|chunk| { // SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned. 
let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) }; Slot { inner: tokio::sync::RwLock::new(SlotInner { key: None, buf, permit: std::sync::Mutex::new(Weak::new()), }), usage_count: AtomicU8::new(0), } }) .collect(); Self { immutable_page_map: Default::default(), slots, next_evict_slot: AtomicUsize::new(0), size_metrics, pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)), } } } trait PageSzBytesMetric { fn set_page_sz(&self, count: usize); fn add_page_sz(&self, count: usize); fn sub_page_sz(&self, count: usize); } #[inline(always)] fn count_times_page_sz(count: usize) -> u64 { u64::try_from(count).unwrap() * u64::try_from(PAGE_SZ).unwrap() } impl PageSzBytesMetric for metrics::UIntGauge { fn set_page_sz(&self, count: usize) { self.set(count_times_page_sz(count)); } fn add_page_sz(&self, count: usize) { self.add(count_times_page_sz(count)); } fn sub_page_sz(&self, count: usize) { self.sub(count_times_page_sz(count)); } } ================================================ FILE: pageserver/src/page_service.rs ================================================ //! The Page Service listens for client connections and serves their GetPage@LSN //! requests. 
use std::any::Any; use std::borrow::Cow; use std::num::NonZeroUsize; use std::os::fd::AsRawFd; use std::pin::Pin; use std::str::FromStr; use std::sync::Arc; use std::task::{Context, Poll}; use std::time::{Duration, Instant, SystemTime}; use std::{io, str}; use anyhow::{Context as _, bail}; use bytes::{Buf as _, BufMut as _, BytesMut}; use chrono::Utc; use futures::future::BoxFuture; use futures::stream::FuturesUnordered; use futures::{FutureExt, Stream, StreamExt as _}; use itertools::Itertools; use jsonwebtoken::TokenData; use once_cell::sync::OnceCell; use pageserver_api::config::{ GetVectoredConcurrentIo, PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy, }; use pageserver_api::key::rel_block_to_key; use pageserver_api::models::{PageTraceEvent, TenantState}; use pageserver_api::pagestream_api::{ self, PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, PagestreamProtocolVersion, PagestreamRequest, }; use pageserver_api::reltag::SlruKind; use pageserver_api::shard::TenantShardId; use pageserver_page_api::proto; use pageserver_page_api::{self as page_api, GetPageSplitter}; use postgres_backend::{ AuthType, PostgresBackend, PostgresBackendReader, QueryError, is_expected_io_error, }; use postgres_ffi::BLCKSZ; use postgres_ffi_types::constants::DEFAULTTABLESPACE_OID; use pq_proto::framed::ConnectionError; use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor}; use smallvec::{SmallVec, smallvec}; use strum_macros::IntoStaticStr; use tokio::io::{AsyncRead, AsyncReadExt as _, AsyncWrite, AsyncWriteExt as _, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use 
tonic::service::Interceptor as _; use tonic::transport::server::TcpConnectInfo; use tracing::*; use utils::auth::{Claims, Scope, SwappableJwtAuth}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::logging::log_slow; use utils::lsn::Lsn; use utils::shard::ShardIndex; use utils::simple_rcu::RcuReadGuard; use utils::sync::gate::{Gate, GateGuard}; use utils::sync::spsc_fold; use utils::{failpoint_support, span_record}; use crate::auth::check_permission; use crate::basebackup::{self, BasebackupError}; use crate::config::PageServerConf; use crate::context::{ DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, }; use crate::feature_resolver::FeatureResolver; use crate::metrics::{ self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS, MISROUTED_PAGESTREAM_REQUESTS, PAGESTREAM_HANDLER_RESULTS_TOTAL, SmgrOpTimer, TimelineMetrics, }; use crate::pgdatadir_mapping::{LsnRange, Version}; use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id, }; use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME, TaskKind}; use crate::tenant::mgr::{ GetActiveTenantError, GetTenantError, ShardResolveResult, ShardSelector, TenantManager, }; use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::handle::{Handle, HandleUpgradeError, WeakHandle}; use crate::tenant::timeline::{self, WaitLsnError, WaitLsnTimeout, WaitLsnWaiter}; use crate::tenant::{GetTimelineError, PageReconstructError, Timeline}; use crate::{CancellableTask, PERF_TRACE_TARGET, timed_after_cancellation}; /// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::TenantShard`] which /// is not yet in state [`TenantState::Active`]. /// /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. /// HADRON: reduced timeout and we will retry in Cache::get(). 
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); /// Threshold at which to log slow GetPage requests. const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30); /// The idle time before sending TCP keepalive probes for gRPC connections. The /// interval and timeout between each probe is configured via sysctl. This /// allows detecting dead connections sooner. const GRPC_TCP_KEEPALIVE_TIME: Duration = Duration::from_secs(60); /// Whether to enable TCP nodelay for gRPC connections. This disables Nagle's /// algorithm, which can cause latency spikes for small messages. const GRPC_TCP_NODELAY: bool = true; /// The interval between HTTP2 keepalive pings. This allows shutting down server /// tasks when clients are unresponsive. const GRPC_HTTP2_KEEPALIVE_INTERVAL: Duration = Duration::from_secs(30); /// The timeout for HTTP2 keepalive pings. Should be <= GRPC_KEEPALIVE_INTERVAL. const GRPC_HTTP2_KEEPALIVE_TIMEOUT: Duration = Duration::from_secs(20); /// Number of concurrent gRPC streams per TCP connection. We expect something /// like 8 GetPage streams per connections, plus any unary requests. const GRPC_MAX_CONCURRENT_STREAMS: u32 = 256; /////////////////////////////////////////////////////////////////////////////// pub struct Listener { cancel: CancellationToken, /// Cancel the listener task through `listen_cancel` to shut down the listener /// and get a handle on the existing connections. task: JoinHandle, } pub struct Connections { cancel: CancellationToken, tasks: tokio::task::JoinSet, gate: Gate, } pub fn spawn( conf: &'static PageServerConf, tenant_manager: Arc, pg_auth: Option>, perf_trace_dispatch: Option, tcp_listener: tokio::net::TcpListener, tls_config: Option>, feature_resolver: FeatureResolver, ) -> Listener { let cancel = CancellationToken::new(); let libpq_ctx = RequestContext::todo_child( TaskKind::LibpqEndpointListener, // listener task shouldn't need to download anything. 
(We will // create a separate sub-contexts for each connection, with their // own download behavior. This context is used only to listen and // accept connections.) DownloadBehavior::Error, ); let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "libpq listener", libpq_listener_main( conf, tenant_manager, pg_auth, perf_trace_dispatch, tcp_listener, conf.pg_auth_type, tls_config, conf.page_service_pipelining.clone(), feature_resolver, libpq_ctx, cancel.clone(), ) .map(anyhow::Ok), )); Listener { cancel, task } } impl Listener { pub async fn stop_accepting(self) -> Connections { self.cancel.cancel(); self.task .await .expect("unreachable: we wrap the listener task in task_mgr::exit_on_panic_or_error") } } impl Connections { pub(crate) async fn shutdown(self) { let Self { cancel, mut tasks, gate, } = self; cancel.cancel(); while let Some(res) = tasks.join_next().await { Self::handle_connection_completion(res); } gate.close().await; } fn handle_connection_completion(res: Result, tokio::task::JoinError>) { match res { Ok(Ok(())) => {} Ok(Err(e)) => error!("error in page_service connection task: {:?}", e), Err(e) => error!("page_service connection task panicked: {:?}", e), } } } /// /// Main loop of the page service. /// /// Listens for connections, and launches a new handler task for each. /// /// Returns Ok(()) upon cancellation via `cancel`, returning the set of /// open connections. 
/// #[allow(clippy::too_many_arguments)] pub async fn libpq_listener_main( conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, perf_trace_dispatch: Option, listener: tokio::net::TcpListener, auth_type: AuthType, tls_config: Option>, pipelining_config: PageServicePipeliningConfig, feature_resolver: FeatureResolver, listener_ctx: RequestContext, listener_cancel: CancellationToken, ) -> Connections { let connections_cancel = CancellationToken::new(); let connections_gate = Gate::default(); let mut connection_handler_tasks = tokio::task::JoinSet::default(); loop { let gate_guard = match connections_gate.enter() { Ok(guard) => guard, Err(_) => break, }; let accepted = tokio::select! { biased; _ = listener_cancel.cancelled() => break, next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => { let res = next.expect("we dont poll while empty"); Connections::handle_connection_completion(res); continue; } accepted = listener.accept() => accepted, }; match accepted { Ok((socket, peer_addr)) => { // Connection established. Spawn a new task to handle it. debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); let connection_ctx = RequestContextBuilder::from(&listener_ctx) .task_kind(TaskKind::PageRequestHandler) .download_behavior(DownloadBehavior::Download) .perf_span_dispatch(perf_trace_dispatch.clone()) .detached_child(); connection_handler_tasks.spawn(page_service_conn_main( conf, tenant_manager.clone(), local_auth, socket, auth_type, tls_config.clone(), pipelining_config.clone(), feature_resolver.clone(), connection_ctx, connections_cancel.child_token(), gate_guard, )); } Err(err) => { // accept() failed. Log the error, and loop back to retry on next connection. 
error!("accept() failed: {:?}", err); } } } debug!("page_service listener loop terminated"); Connections { cancel: connections_cancel, tasks: connection_handler_tasks, gate: connections_gate, } } type ConnectionHandlerResult = anyhow::Result<()>; /// Perf root spans start at the per-request level, after shard routing. /// This struct carries connection-level information to the root perf span definition. #[derive(Clone, Default)] struct ConnectionPerfSpanFields { peer_addr: String, application_name: Option, compute_mode: Option, } #[instrument(skip_all, fields(peer_addr, application_name, compute_mode))] #[allow(clippy::too_many_arguments)] async fn page_service_conn_main( conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, tls_config: Option>, pipelining_config: PageServicePipeliningConfig, feature_resolver: FeatureResolver, connection_ctx: RequestContext, cancel: CancellationToken, gate_guard: GateGuard, ) -> ConnectionHandlerResult { let _guard = LIVE_CONNECTIONS .with_label_values(&["page_service"]) .guard(); socket .set_nodelay(true) .context("could not set TCP_NODELAY")?; let socket_fd = socket.as_raw_fd(); let peer_addr = socket.peer_addr().context("get peer address")?; let perf_span_fields = ConnectionPerfSpanFields { peer_addr: peer_addr.to_string(), application_name: None, // filled in later compute_mode: None, // filled in later }; tracing::Span::current().record("peer_addr", field::display(peer_addr)); // setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements: // - long enough for most valid compute connections // - less than infinite to stop us from "leaking" connections to long-gone computes // // no write timeout is used, because the kernel is assumed to error writes after some time. 
let mut socket = tokio_io_timeout::TimeoutReader::new(socket); let default_timeout_ms = 10 * 60 * 1000; // 10 minutes by default let socket_timeout_ms = (|| { fail::fail_point!("simulated-bad-compute-connection", |avg_timeout_ms| { // Exponential distribution for simulating // poor network conditions, expect about avg_timeout_ms to be around 15 // in tests if let Some(avg_timeout_ms) = avg_timeout_ms { let avg = avg_timeout_ms.parse::().unwrap() as f32; let u = rand::random::(); ((1.0 - u).ln() / (-avg)) as u64 } else { default_timeout_ms } }); default_timeout_ms })(); // A timeout here does not mean the client died, it can happen if it's just idle for // a while: we will tear down this PageServerHandler and instantiate a new one if/when // they reconnect. socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms))); let socket = Box::pin(socket); fail::fail_point!("ps::connection-start::pre-login"); // XXX: pgbackend.run() should take the connection_ctx, // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. 
let mut conn_handler = PageServerHandler::new( tenant_manager, auth, pipelining_config, conf.get_vectored_concurrent_io, perf_span_fields, connection_ctx, cancel.clone(), feature_resolver.clone(), gate_guard, ); let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?; match pgbackend.run(&mut conn_handler, &cancel).await { Ok(()) => { // we've been requested to shut down Ok(()) } Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => { if is_expected_io_error(&io_error) { info!("Postgres client disconnected ({io_error})"); Ok(()) } else { let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id(); Err(io_error).context(format!( "Postgres connection error for tenant_id={tenant_id:?} client at peer_addr={peer_addr}" )) } } other => { let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id(); other.context(format!( "Postgres query error for tenant_id={tenant_id:?} client peer_addr={peer_addr}" )) } } } /// Page service connection handler. struct PageServerHandler { auth: Option>, claims: Option, /// The context created for the lifetime of the connection /// services by this PageServerHandler. /// For each query received over the connection, /// `process_query` creates a child context from this one. connection_ctx: RequestContext, perf_span_fields: ConnectionPerfSpanFields, cancel: CancellationToken, /// None only while pagestream protocol is being processed. timeline_handles: Option, pipelining_config: PageServicePipeliningConfig, get_vectored_concurrent_io: GetVectoredConcurrentIo, feature_resolver: FeatureResolver, gate_guard: GateGuard, } struct TimelineHandles { wrapper: TenantManagerWrapper, /// Note on size: the typical size of this map is 1. The largest size we expect /// to see is the number of shards divided by the number of pageservers (typically < 2), /// or the ratio used when splitting shards (i.e. 
how many children created from one) /// parent shard, where a "large" number might be ~8. handles: timeline::handle::Cache, } impl TimelineHandles { fn new(tenant_manager: Arc) -> Self { Self { wrapper: TenantManagerWrapper { tenant_manager, tenant_id: OnceCell::new(), }, handles: Default::default(), } } async fn get( &mut self, tenant_id: TenantId, timeline_id: TimelineId, shard_selector: ShardSelector, ) -> Result, GetActiveTimelineError> { if *self.wrapper.tenant_id.get_or_init(|| tenant_id) != tenant_id { return Err(GetActiveTimelineError::Tenant( GetActiveTenantError::SwitchedTenant, )); } self.handles .get(timeline_id, shard_selector, &self.wrapper) .await } fn tenant_id(&self) -> Option { self.wrapper.tenant_id.get().copied() } } pub(crate) struct TenantManagerWrapper { tenant_manager: Arc, // We do not support switching tenant_id on a connection at this point. // We can can add support for this later if needed without changing // the protocol. tenant_id: once_cell::sync::OnceCell, } pub(crate) struct TenantManagerTypes; impl timeline::handle::Types for TenantManagerTypes { type TenantManager = TenantManagerWrapper; type Timeline = TenantManagerCacheItem; } pub(crate) struct TenantManagerCacheItem { pub(crate) timeline: Arc, // allow() for cheap propagation through RequestContext inside a task #[allow(clippy::redundant_allocation)] pub(crate) metrics: Arc>, #[allow(dead_code)] // we store it to keep the gate open pub(crate) gate_guard: GateGuard, } impl std::ops::Deref for TenantManagerCacheItem { type Target = Arc; fn deref(&self) -> &Self::Target { &self.timeline } } impl timeline::handle::Timeline for TenantManagerCacheItem { fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId { Timeline::shard_timeline_id(&self.timeline) } fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState { &self.timeline.handles } fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity { Timeline::get_shard_identity(&self.timeline) } } impl 
timeline::handle::TenantManager for TenantManagerWrapper { async fn resolve( &self, timeline_id: TimelineId, shard_selector: ShardSelector, ) -> Result { let tenant_id = self.tenant_id.get().expect("we set this in get()"); let timeout = ACTIVE_TENANT_TIMEOUT; let wait_start = Instant::now(); let deadline = wait_start + timeout; let tenant_shard = loop { let resolved = self .tenant_manager .resolve_attached_shard(tenant_id, shard_selector); match resolved { ShardResolveResult::Found(tenant_shard) => break tenant_shard, ShardResolveResult::NotFound => { MISROUTED_PAGESTREAM_REQUESTS.inc(); return Err(GetActiveTimelineError::Tenant( GetActiveTenantError::NotFound(GetTenantError::NotFound(*tenant_id)), )); } ShardResolveResult::InProgress(barrier) => { // We can't authoritatively answer right now: wait for InProgress state // to end, then try again tokio::select! { _ = barrier.wait() => { // The barrier completed: proceed around the loop to try looking up again }, _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { return Err(GetActiveTimelineError::Tenant(GetActiveTenantError::WaitForActiveTimeout { latest_state: None, wait_time: timeout, })); } } } }; }; tracing::debug!("Waiting for tenant to enter active state..."); tenant_shard .wait_to_become_active(deadline.duration_since(Instant::now())) .await .map_err(GetActiveTimelineError::Tenant)?; let timeline = tenant_shard .get_timeline(timeline_id, true) .map_err(GetActiveTimelineError::Timeline)?; let gate_guard = match timeline.gate.enter() { Ok(guard) => guard, Err(_) => { return Err(GetActiveTimelineError::Timeline( GetTimelineError::ShuttingDown, )); } }; let metrics = Arc::new(Arc::clone(&timeline.metrics)); Ok(TenantManagerCacheItem { timeline, metrics, gate_guard, }) } } /// Whether to hold the applied GC cutoff guard when processing GetPage requests. /// This is determined once at the start of pagestream subprotocol handling based on /// feature flags, configuration, and test conditions. 
#[derive(Debug, Clone, Copy)] enum HoldAppliedGcCutoffGuard { Yes, No, } #[derive(thiserror::Error, Debug)] enum PageStreamError { /// We encountered an error that should prompt the client to reconnect: /// in practice this means we drop the connection without sending a response. #[error("Reconnect required: {0}")] Reconnect(Cow<'static, str>), /// We were instructed to shutdown while processing the query #[error("Shutting down")] Shutdown, /// Something went wrong reading a page: this likely indicates a pageserver bug #[error("Read error")] Read(#[source] PageReconstructError), /// Ran out of time waiting for an LSN #[error("LSN timeout: {0}")] LsnTimeout(WaitLsnError), /// The entity required to serve the request (tenant or timeline) is not found, /// or is not found in a suitable state to serve a request. #[error("Not found: {0}")] NotFound(Cow<'static, str>), /// Request asked for something that doesn't make sense, like an invalid LSN #[error("Bad request: {0}")] BadRequest(Cow<'static, str>), } impl From for tonic::Status { fn from(err: PageStreamError) -> Self { use tonic::Code; let message = err.to_string(); let code = match err { PageStreamError::Reconnect(_) => Code::Unavailable, PageStreamError::Shutdown => Code::Unavailable, PageStreamError::Read(err) => match err { PageReconstructError::Cancelled => Code::Unavailable, PageReconstructError::MissingKey(_) => Code::NotFound, PageReconstructError::AncestorLsnTimeout(err) => tonic::Status::from(err).code(), PageReconstructError::Other(_) => Code::Internal, PageReconstructError::WalRedo(_) => Code::Internal, }, PageStreamError::LsnTimeout(err) => tonic::Status::from(err).code(), PageStreamError::NotFound(_) => Code::NotFound, PageStreamError::BadRequest(_) => Code::InvalidArgument, }; tonic::Status::new(code, message) } } impl From for PageStreamError { fn from(value: PageReconstructError) -> Self { match value { PageReconstructError::Cancelled => Self::Shutdown, e => Self::Read(e), } } } impl From for 
PageStreamError { fn from(value: GetActiveTimelineError) -> Self { match value { GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) | GetActiveTimelineError::Tenant(GetActiveTenantError::WillNotBecomeActive( TenantState::Stopping { .. }, )) | GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) => Self::Shutdown, GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()), GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()), } } } impl From for PageStreamError { fn from(value: WaitLsnError) -> Self { match value { e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e), WaitLsnError::Shutdown => Self::Shutdown, e @ WaitLsnError::BadState { .. } => Self::Reconnect(format!("{e}").into()), } } } impl From for QueryError { fn from(value: WaitLsnError) -> Self { match value { e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)), WaitLsnError::Shutdown => Self::Shutdown, WaitLsnError::BadState { .. } => Self::Reconnect, } } } #[derive(thiserror::Error, Debug)] struct BatchedPageStreamError { req: PagestreamRequest, err: PageStreamError, } impl std::fmt::Display for BatchedPageStreamError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.err.fmt(f) } } struct BatchedGetPageRequest { req: PagestreamGetPageRequest, timer: SmgrOpTimer, lsn_range: LsnRange, ctx: RequestContext, // If the request is perf enabled, this contains a context // with a perf span tracking the time spent waiting for the executor. batch_wait_ctx: Option, } #[cfg(feature = "testing")] struct BatchedTestRequest { req: pagestream_api::PagestreamTestRequest, timer: SmgrOpTimer, } /// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum, /// so that we don't keep the [`Timeline::gate`] open while the batch /// is being built up inside the [`spsc_fold`] (pagestream pipelining). 
#[derive(IntoStaticStr)] #[allow(clippy::large_enum_variant)] enum BatchedFeMessage { Exists { span: Span, timer: SmgrOpTimer, shard: WeakHandle, req: PagestreamExistsRequest, }, Nblocks { span: Span, timer: SmgrOpTimer, shard: WeakHandle, req: PagestreamNblocksRequest, }, GetPage { span: Span, shard: WeakHandle, applied_gc_cutoff_guard: Option>, pages: SmallVec<[BatchedGetPageRequest; 1]>, batch_break_reason: GetPageBatchBreakReason, }, DbSize { span: Span, timer: SmgrOpTimer, shard: WeakHandle, req: PagestreamDbSizeRequest, }, GetSlruSegment { span: Span, timer: SmgrOpTimer, shard: WeakHandle, req: PagestreamGetSlruSegmentRequest, }, #[cfg(feature = "testing")] Test { span: Span, shard: WeakHandle, requests: Vec, }, RespondError { span: Span, error: BatchedPageStreamError, }, } impl BatchedFeMessage { fn as_static_str(&self) -> &'static str { self.into() } fn observe_execution_start(&mut self, at: Instant) { match self { BatchedFeMessage::Exists { timer, .. } | BatchedFeMessage::Nblocks { timer, .. } | BatchedFeMessage::DbSize { timer, .. } | BatchedFeMessage::GetSlruSegment { timer, .. } => { timer.observe_execution_start(at); } BatchedFeMessage::GetPage { pages, .. } => { for page in pages { page.timer.observe_execution_start(at); } } #[cfg(feature = "testing")] BatchedFeMessage::Test { requests, .. } => { for req in requests { req.timer.observe_execution_start(at); } } BatchedFeMessage::RespondError { .. } => {} } } fn should_break_batch( &self, other: &BatchedFeMessage, max_batch_size: NonZeroUsize, batching_strategy: PageServiceProtocolPipelinedBatchingStrategy, ) -> Option { match (self, other) { ( BatchedFeMessage::GetPage { shard: accum_shard, pages: accum_pages, .. }, BatchedFeMessage::GetPage { shard: this_shard, pages: this_pages, .. 
}, ) => { assert_eq!(this_pages.len(), 1); if accum_pages.len() >= max_batch_size.get() { trace!(%max_batch_size, "stopping batching because of batch size"); assert_eq!(accum_pages.len(), max_batch_size.get()); return Some(GetPageBatchBreakReason::BatchFull); } if !accum_shard.is_same_handle_as(this_shard) { trace!("stopping batching because timeline object mismatch"); // TODO: we _could_ batch & execute each shard seperately (and in parallel). // But the current logic for keeping responses in order does not support that. return Some(GetPageBatchBreakReason::NonUniformTimeline); } match batching_strategy { PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => { if let Some(last_in_batch) = accum_pages.last() { if last_in_batch.lsn_range.effective_lsn != this_pages[0].lsn_range.effective_lsn { trace!( accum_lsn = %last_in_batch.lsn_range.effective_lsn, this_lsn = %this_pages[0].lsn_range.effective_lsn, "stopping batching because LSN changed" ); return Some(GetPageBatchBreakReason::NonUniformLsn); } } } PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => { // The read path doesn't curently support serving the same page at different LSNs. // While technically possible, it's uncertain if the complexity is worth it. // Break the batch if such a case is encountered. let same_page_different_lsn = accum_pages.iter().any(|batched| { batched.req.rel == this_pages[0].req.rel && batched.req.blkno == this_pages[0].req.blkno && batched.lsn_range.effective_lsn != this_pages[0].lsn_range.effective_lsn }); if same_page_different_lsn { trace!( rel=%this_pages[0].req.rel, blkno=%this_pages[0].req.blkno, lsn=%this_pages[0].lsn_range.effective_lsn, "stopping batching because same page was requested at different LSNs" ); return Some(GetPageBatchBreakReason::SamePageAtDifferentLsn); } } } None } #[cfg(feature = "testing")] ( BatchedFeMessage::Test { shard: accum_shard, requests: accum_requests, .. }, BatchedFeMessage::Test { shard: this_shard, requests: this_requests, .. 
}, ) => { assert!(this_requests.len() == 1); if accum_requests.len() >= max_batch_size.get() { trace!(%max_batch_size, "stopping batching because of batch size"); assert_eq!(accum_requests.len(), max_batch_size.get()); return Some(GetPageBatchBreakReason::BatchFull); } if !accum_shard.is_same_handle_as(this_shard) { trace!("stopping batching because timeline object mismatch"); // TODO: we _could_ batch & execute each shard seperately (and in parallel). // But the current logic for keeping responses in order does not support that. return Some(GetPageBatchBreakReason::NonUniformTimeline); } let this_batch_key = this_requests[0].req.batch_key; let accum_batch_key = accum_requests[0].req.batch_key; if this_requests[0].req.batch_key != accum_requests[0].req.batch_key { trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed"); return Some(GetPageBatchBreakReason::NonUniformKey); } None } (_, _) => Some(GetPageBatchBreakReason::NonBatchableRequest), } } } impl PageServerHandler { #[allow(clippy::too_many_arguments)] pub fn new( tenant_manager: Arc, auth: Option>, pipelining_config: PageServicePipeliningConfig, get_vectored_concurrent_io: GetVectoredConcurrentIo, perf_span_fields: ConnectionPerfSpanFields, connection_ctx: RequestContext, cancel: CancellationToken, feature_resolver: FeatureResolver, gate_guard: GateGuard, ) -> Self { PageServerHandler { auth, claims: None, connection_ctx, perf_span_fields, timeline_handles: Some(TimelineHandles::new(tenant_manager)), cancel, pipelining_config, get_vectored_concurrent_io, feature_resolver, gate_guard, } } /// This function always respects cancellation of any timeline in `[Self::shard_timelines]`. Pass in /// a cancellation token at the next scope up (such as a tenant cancellation token) to ensure we respect /// cancellation if there aren't any timelines in the cache. 
/// /// If calling from a function that doesn't use the `[Self::shard_timelines]` cache, then pass in the /// timeline cancellation token. async fn flush_cancellable( &self, pgb: &mut PostgresBackend, cancel: &CancellationToken, ) -> Result<(), QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { tokio::select!( flush_r = pgb.flush() => { Ok(flush_r?) }, _ = cancel.cancelled() => { Err(QueryError::Shutdown) } ) } #[allow(clippy::too_many_arguments)] async fn pagestream_read_message( pgb: &mut PostgresBackendReader, tenant_id: TenantId, timeline_id: TimelineId, timeline_handles: &mut TimelineHandles, conn_perf_span_fields: &ConnectionPerfSpanFields, cancel: &CancellationToken, ctx: &RequestContext, protocol_version: PagestreamProtocolVersion, parent_span: Span, hold_gc_cutoff_guard: HoldAppliedGcCutoffGuard, ) -> Result, QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, { let msg = tokio::select! { biased; _ = cancel.cancelled() => { return Err(QueryError::Shutdown) } msg = pgb.read_message() => { msg } }; let received_at = Instant::now(); let copy_data_bytes = match msg? 
{ Some(FeMessage::CopyData(bytes)) => bytes, Some(FeMessage::Terminate) => { return Ok(None); } Some(m) => { return Err(QueryError::Other(anyhow::anyhow!( "unexpected message: {m:?} during COPY" ))); } None => { return Ok(None); } // client disconnected }; trace!("query: {copy_data_bytes:?}"); fail::fail_point!("ps::handle-pagerequest-message"); // parse request let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; let batched_msg = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); let span = tracing::info_span!(parent: &parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); let timer = Self::record_op_start_and_throttle( &shard, metrics::SmgrQueryType::GetRelExists, received_at, ) .await?; BatchedFeMessage::Exists { span, timer, shard: shard.downgrade(), req, } } PagestreamFeMessage::Nblocks(req) => { let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; let span = tracing::info_span!(parent: &parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); let timer = Self::record_op_start_and_throttle( &shard, metrics::SmgrQueryType::GetRelSize, received_at, ) .await?; BatchedFeMessage::Nblocks { span, timer, shard: shard.downgrade(), req, } } PagestreamFeMessage::DbSize(req) => { let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; let span = tracing::info_span!(parent: &parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); let timer = Self::record_op_start_and_throttle( &shard, metrics::SmgrQueryType::GetDbSize, received_at, ) .await?; 
BatchedFeMessage::DbSize { span, timer, shard: shard.downgrade(), req, } } PagestreamFeMessage::GetSlruSegment(req) => { let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; let span = tracing::info_span!(parent: &parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); let timer = Self::record_op_start_and_throttle( &shard, metrics::SmgrQueryType::GetSlruSegment, received_at, ) .await?; BatchedFeMessage::GetSlruSegment { span, timer, shard: shard.downgrade(), req, } } PagestreamFeMessage::GetPage(req) => { // avoid a somewhat costly Span::record() by constructing the entire span in one go. macro_rules! mkspan { (before shard routing) => {{ tracing::info_span!( parent: &parent_span, "handle_get_page_request", request_id = %req.hdr.reqid, rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, not_modified_since_lsn = %req.hdr.not_modified_since, ) }}; ($shard_id:expr) => {{ tracing::info_span!( parent: &parent_span, "handle_get_page_request", request_id = %req.hdr.reqid, rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, not_modified_since_lsn = %req.hdr.not_modified_since, shard_id = %$shard_id, ) }}; } macro_rules! respond_error { ($span:expr, $error:expr) => {{ let error = BatchedFeMessage::RespondError { span: $span, error: BatchedPageStreamError { req: req.hdr, err: $error, }, }; Ok(Some(error)) }}; } let key = rel_block_to_key(req.rel, req.blkno); let res = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Page(key)) .await; let shard = match res { Ok(tl) => tl, Err(e) => { let span = mkspan!(before shard routing); match e { GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_)) => { // We already know this tenant exists in general, because we resolved it at // start of connection. 
Getting a NotFound here indicates that the shard containing // the requested page is not present on this node: the client's knowledge of shard->pageserver // mapping is out of date. // // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration // and talk to a different pageserver. MISROUTED_PAGESTREAM_REQUESTS.inc(); return respond_error!( span, PageStreamError::Reconnect( "getpage@lsn request routed to wrong shard".into() ) ); } e => { return respond_error!(span, e.into()); } } } }; let ctx = if shard.is_get_page_request_sampled() { RequestContextBuilder::from(ctx) .root_perf_span(|| { info_span!( target: PERF_TRACE_TARGET, "GET_PAGE", peer_addr = conn_perf_span_fields.peer_addr, application_name = conn_perf_span_fields.application_name, compute_mode = conn_perf_span_fields.compute_mode, tenant_id = %tenant_id, shard_id = %shard.get_shard_identity().shard_slug(), timeline_id = %timeline_id, lsn = %req.hdr.request_lsn, not_modified_since_lsn = %req.hdr.not_modified_since, request_id = %req.hdr.reqid, key = %key, ) }) .attached_child() } else { ctx.attached_child() }; // This ctx travels as part of the BatchedFeMessage through // batching into the request handler. // The request handler needs to do some per-request work // (relsize check) before dispatching the batch as a single // get_vectored call to the Timeline. // This ctx will be used for the reslize check, whereas the // get_vectored call will be a different ctx with separate // perf span. let ctx = ctx.with_scope_page_service_pagestream(&shard); // Similar game for this `span`: we funnel it through so that // request handler log messages contain the request-specific fields. 
let span = mkspan!(shard.tenant_shard_id.shard_slug()); let timer = Self::record_op_start_and_throttle( &shard, metrics::SmgrQueryType::GetPageAtLsn, received_at, ) .maybe_perf_instrument(&ctx, |current_perf_span| { info_span!( target: PERF_TRACE_TARGET, parent: current_perf_span, "THROTTLE", ) }) .await?; let applied_gc_cutoff_guard = shard.get_applied_gc_cutoff_lsn(); // hold guard // We're holding the Handle let effective_lsn = match Self::effective_request_lsn( &shard, shard.get_last_record_lsn(), req.hdr.request_lsn, req.hdr.not_modified_since, &applied_gc_cutoff_guard, ) { Ok(lsn) => lsn, Err(e) => { return respond_error!(span, e); } }; let applied_gc_cutoff_guard = match hold_gc_cutoff_guard { HoldAppliedGcCutoffGuard::Yes => Some(applied_gc_cutoff_guard), HoldAppliedGcCutoffGuard::No => { drop(applied_gc_cutoff_guard); None } }; let batch_wait_ctx = if ctx.has_perf_span() { Some( RequestContextBuilder::from(&ctx) .perf_span(|crnt_perf_span| { info_span!( target: PERF_TRACE_TARGET, parent: crnt_perf_span, "WAIT_EXECUTOR", ) }) .attached_child(), ) } else { None }; BatchedFeMessage::GetPage { span, shard: shard.downgrade(), applied_gc_cutoff_guard, pages: smallvec![BatchedGetPageRequest { req, timer, lsn_range: LsnRange { effective_lsn, request_lsn: req.hdr.request_lsn }, ctx, batch_wait_ctx, }], // The executor grabs the batch when it becomes idle. // Hence, [`GetPageBatchBreakReason::ExecutorSteal`] is the // default reason for breaking the batch. 
batch_break_reason: GetPageBatchBreakReason::ExecutorSteal, } } #[cfg(feature = "testing")] PagestreamFeMessage::Test(req) => { let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; let span = tracing::info_span!(parent: &parent_span, "handle_test_request", shard_id = %shard.tenant_shard_id.shard_slug()); let timer = Self::record_op_start_and_throttle( &shard, metrics::SmgrQueryType::Test, received_at, ) .await?; BatchedFeMessage::Test { span, shard: shard.downgrade(), requests: vec![BatchedTestRequest { req, timer }], } } }; Ok(Some(batched_msg)) } /// Starts a SmgrOpTimer at received_at and throttles the request. async fn record_op_start_and_throttle( shard: &Handle, op: metrics::SmgrQueryType, received_at: Instant, ) -> Result { // It's important to start the smgr op metric recorder as early as possible // so that the _started counters are incremented before we do // any serious waiting, e.g., for throttle, batching, or actual request handling. let mut timer = shard.query_metrics.start_smgr_op(op, received_at); let now = Instant::now(); timer.observe_throttle_start(now); let throttled = tokio::select! 
{ res = shard.pagestream_throttle.throttle(1, now) => res, _ = shard.cancel.cancelled() => return Err(QueryError::Shutdown), }; timer.observe_throttle_done(throttled); Ok(timer) } /// Post-condition: `batch` is Some() #[instrument(skip_all, level = tracing::Level::TRACE)] #[allow(clippy::boxed_local)] fn pagestream_do_batch( batching_strategy: PageServiceProtocolPipelinedBatchingStrategy, max_batch_size: NonZeroUsize, batch: &mut Result, this_msg: Result, ) -> Result<(), Result> { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); let this_msg = match this_msg { Ok(this_msg) => this_msg, Err(e) => return Err(Err(e)), }; let eligible_batch = match batch { Ok(b) => b, Err(_) => { return Err(Ok(this_msg)); } }; let batch_break = eligible_batch.should_break_batch(&this_msg, max_batch_size, batching_strategy); match batch_break { Some(reason) => { if let BatchedFeMessage::GetPage { batch_break_reason, .. } = eligible_batch { *batch_break_reason = reason; } Err(Ok(this_msg)) } None => { // ok to batch match (eligible_batch, this_msg) { ( BatchedFeMessage::GetPage { pages: accum_pages, applied_gc_cutoff_guard: accum_applied_gc_cutoff_guard, .. }, BatchedFeMessage::GetPage { pages: this_pages, applied_gc_cutoff_guard: this_applied_gc_cutoff_guard, .. }, ) => { accum_pages.extend(this_pages); // the minimum of the two guards will keep data for both alive match (&accum_applied_gc_cutoff_guard, this_applied_gc_cutoff_guard) { (None, None) => (), (None, Some(this)) => *accum_applied_gc_cutoff_guard = Some(this), (Some(_), None) => (), (Some(accum), Some(this)) => { if **accum > *this { *accum_applied_gc_cutoff_guard = Some(this); } } }; Ok(()) } #[cfg(feature = "testing")] ( BatchedFeMessage::Test { requests: accum_requests, .. }, BatchedFeMessage::Test { requests: this_requests, .. 
}, ) => { accum_requests.extend(this_requests); Ok(()) } // Shape guaranteed by [`BatchedFeMessage::should_break_batch`] _ => unreachable!(), } } } } #[instrument(level = tracing::Level::DEBUG, skip_all)] async fn pagestream_handle_batched_message( &mut self, pgb_writer: &mut PostgresBackend, batch: BatchedFeMessage, io_concurrency: IoConcurrency, cancel: &CancellationToken, protocol_version: PagestreamProtocolVersion, ctx: &RequestContext, ) -> Result<(), QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { let started_at = Instant::now(); let batch = { let mut batch = batch; batch.observe_execution_start(started_at); batch }; // Dispatch the batch to the appropriate request handler. let log_slow_name = batch.as_static_str(); let (mut handler_results, span) = { // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and // won't fit on the stack. let mut boxpinned = Box::pin(Self::pagestream_dispatch_batched_message( batch, io_concurrency, ctx, )); log_slow( log_slow_name, LOG_SLOW_GETPAGE_THRESHOLD, boxpinned.as_mut(), ) .await? }; // We purposefully don't count flush time into the smgr operation timer. // // The reason is that current compute client will not perform protocol processing // if the postgres backend process is doing things other than `->smgr_read()`. // This is especially the case for prefetch. // // If the compute doesn't read from the connection, eventually TCP will backpressure // all the way into our flush call below. // // The timer's underlying metric is used for a storage-internal latency SLO and // we don't want to include latency in it that we can't control. // And as pointed out above, in this case, we don't control the time that flush will take. 
// // We put each response in the batch onto the wire in a separate pgb_writer.flush() // call, which (all unmeasured) adds syscall overhead but reduces time to first byte // and avoids building up a "giant" contiguous userspace buffer to hold the entire response. // TODO: vectored socket IO would be great, but pgb_writer doesn't support that. let flush_timers = { let flushing_start_time = Instant::now(); let mut flush_timers = Vec::with_capacity(handler_results.len()); for handler_result in &mut handler_results { let flush_timer = match handler_result { Ok((_response, timer, _ctx)) => Some( timer .observe_execution_end(flushing_start_time) .expect("we are the first caller"), ), Err(_) => { // TODO: measure errors None } }; flush_timers.push(flush_timer); } assert_eq!(flush_timers.len(), handler_results.len()); flush_timers }; // Map handler result to protocol behavior. // Some handler errors cause exit from pagestream protocol. // Other handler errors are sent back as an error message and we stay in pagestream protocol. for (handler_result, flushing_timer) in handler_results.into_iter().zip(flush_timers) { let (response_msg, ctx) = match handler_result { Err(e) => match &e.err { PageStreamError::Shutdown => { // BEGIN HADRON PAGESTREAM_HANDLER_RESULTS_TOTAL .with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR]) .inc(); // END HADRON // If we fail to fulfil a request during shutdown, which may be _because_ of // shutdown, then do not send the error to the client. Instead just drop the // connection. span.in_scope(|| info!("dropping connection due to shutdown")); return Err(QueryError::Shutdown); } PageStreamError::Reconnect(_reason) => { span.in_scope(|| { // BEGIN HADRON // We can get here because the compute node is pointing at the wrong PS. We // already have a metric to keep track of this so suppressing this log to // reduce log spam. 
The information in this log message is not going to be that // helpful given the volume of logs that can be generated. // info!("handler requested reconnect: {reason}") // END HADRON }); // BEGIN HADRON PAGESTREAM_HANDLER_RESULTS_TOTAL .with_label_values(&[ metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR, ]) .inc(); // END HADRON return Err(QueryError::Reconnect); } PageStreamError::Read(_) | PageStreamError::LsnTimeout(_) | PageStreamError::NotFound(_) | PageStreamError::BadRequest(_) => { // BEGIN HADRON if let PageStreamError::Read(_) | PageStreamError::LsnTimeout(_) = &e.err { PAGESTREAM_HANDLER_RESULTS_TOTAL .with_label_values(&[ metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR, ]) .inc(); } else { PAGESTREAM_HANDLER_RESULTS_TOTAL .with_label_values(&[ metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR, ]) .inc(); } // END HADRON // print the all details to the log with {:#}, but for the client the // error message is enough. Do not log if shutting down, as the anyhow::Error // here includes cancellation which is not an error. 
let full = utils::error::report_compact_sources(&e.err); span.in_scope(|| { error!("error reading relation or page version: {full:#}") }); ( PagestreamBeMessage::Error(PagestreamErrorResponse { req: e.req, message: e.err.to_string(), }), None, ) } }, Ok((response_msg, _op_timer_already_observed, ctx)) => { // BEGIN HADRON PAGESTREAM_HANDLER_RESULTS_TOTAL .with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_SUCCESS]) .inc(); // END HADRON (response_msg, Some(ctx)) } }; let ctx = ctx.map(|req_ctx| { RequestContextBuilder::from(&req_ctx) .perf_span(|crnt_perf_span| { info_span!( target: PERF_TRACE_TARGET, parent: crnt_perf_span, "FLUSH_RESPONSE", ) }) .attached_child() }); // // marshal & transmit response message // pgb_writer.write_message_noflush(&BeMessage::CopyData( &response_msg.serialize(protocol_version), ))?; failpoint_support::sleep_millis_async!("before-pagestream-msg-flush", cancel); // what we want to do let socket_fd = pgb_writer.socket_fd; let flush_fut = pgb_writer.flush(); // metric for how long flushing takes let flush_fut = match flushing_timer { Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure( Instant::now(), flush_fut, socket_fd, )), None => futures::future::Either::Right(flush_fut), }; let flush_fut = if let Some(req_ctx) = ctx.as_ref() { futures::future::Either::Left( flush_fut.maybe_perf_instrument(req_ctx, |current_perf_span| { current_perf_span.clone() }), ) } else { futures::future::Either::Right(flush_fut) }; // do it while respecting cancellation let _: () = async move { tokio::select! { biased; _ = cancel.cancelled() => { // We were requested to shut down. info!("shutdown request received in page handler"); return Err(QueryError::Shutdown) } res = flush_fut => { res?; } } Ok(()) } .await?; } Ok(()) } /// Helper which dispatches a batched message to the appropriate handler. /// Returns a vec of results, along with the extracted trace span. 
async fn pagestream_dispatch_batched_message( batch: BatchedFeMessage, io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> Result< ( Vec>, Span, ), QueryError, > { macro_rules! upgrade_handle_and_set_context { ($shard:ident) => {{ let weak_handle = &$shard; let handle = weak_handle.upgrade()?; let ctx = ctx.with_scope_page_service_pagestream(&handle); (handle, ctx) }}; } Ok(match batch { BatchedFeMessage::Exists { span, timer, shard, req, } => { fail::fail_point!("ps::handle-pagerequest-message::exists"); let (shard, ctx) = upgrade_handle_and_set_context!(shard); ( vec![ Self::handle_get_rel_exists_request(&shard, &req, &ctx) .instrument(span.clone()) .await .map(|msg| (PagestreamBeMessage::Exists(msg), timer, ctx)) .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), ], span, ) } BatchedFeMessage::Nblocks { span, timer, shard, req, } => { fail::fail_point!("ps::handle-pagerequest-message::nblocks"); let (shard, ctx) = upgrade_handle_and_set_context!(shard); ( vec![ Self::handle_get_nblocks_request(&shard, &req, false, &ctx) .instrument(span.clone()) .await .map(|msg| msg.expect("allow_missing=false")) .map(|msg| (PagestreamBeMessage::Nblocks(msg), timer, ctx)) .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), ], span, ) } BatchedFeMessage::GetPage { span, shard, applied_gc_cutoff_guard, pages, batch_break_reason, } => { fail::fail_point!("ps::handle-pagerequest-message::getpage"); let (shard, ctx) = upgrade_handle_and_set_context!(shard); ( { let npages = pages.len(); trace!(npages, "handling getpage request"); let res = Self::handle_get_page_at_lsn_request_batched( &shard, pages, io_concurrency, batch_break_reason, &ctx, ) .instrument(span.clone()) .await; assert_eq!(res.len(), npages); drop(applied_gc_cutoff_guard); res }, span, ) } BatchedFeMessage::DbSize { span, timer, shard, req, } => { fail::fail_point!("ps::handle-pagerequest-message::dbsize"); let (shard, ctx) = upgrade_handle_and_set_context!(shard); ( vec![ 
Self::handle_db_size_request(&shard, &req, &ctx) .instrument(span.clone()) .await .map(|msg| (PagestreamBeMessage::DbSize(msg), timer, ctx)) .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), ], span, ) } BatchedFeMessage::GetSlruSegment { span, timer, shard, req, } => { fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); let (shard, ctx) = upgrade_handle_and_set_context!(shard); ( vec![ Self::handle_get_slru_segment_request(&shard, &req, &ctx) .instrument(span.clone()) .await .map(|msg| (PagestreamBeMessage::GetSlruSegment(msg), timer, ctx)) .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), ], span, ) } #[cfg(feature = "testing")] BatchedFeMessage::Test { span, shard, requests, } => { fail::fail_point!("ps::handle-pagerequest-message::test"); let (shard, ctx) = upgrade_handle_and_set_context!(shard); ( { let npages = requests.len(); trace!(npages, "handling getpage request"); let res = Self::handle_test_request_batch(&shard, requests, &ctx) .instrument(span.clone()) .await; assert_eq!(res.len(), npages); res }, span, ) } BatchedFeMessage::RespondError { span, error } => { // We've already decided to respond with an error, so we don't need to // call the handler. (vec![Err(error)], span) } }) } /// Pagestream sub-protocol handler. /// /// It is a simple request-response protocol inside a COPYBOTH session. /// /// # Coding Discipline /// /// Coding discipline within this function: all interaction with the `pgb` connection /// needs to be sensitive to connection shutdown, currently signalled via [`Self::cancel`]. /// This is so that we can shutdown page_service quickly. 
#[instrument(skip_all, fields(hold_gc_cutoff_guard))]
async fn handle_pagerequests(
    &mut self,
    pgb: &mut PostgresBackend,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    protocol_version: PagestreamProtocolVersion,
    ctx: RequestContext,
) -> Result<(), QueryError>
where
    IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
{
    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();

    // switch client to COPYBOTH
    pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
    // `biased` makes the select check cancellation before polling the flush.
    tokio::select! {
        biased;
        _ = self.cancel.cancelled() => {
            return Err(QueryError::Shutdown)
        }
        res = pgb.flush() => {
            res?;
        }
    }

    // Failing to clone the gate guard is treated as a shutdown in progress.
    let io_concurrency = IoConcurrency::spawn_from_conf(
        self.get_vectored_concurrent_io,
        match self.gate_guard.try_clone() {
            Ok(guard) => guard,
            Err(_) => {
                info!("shutdown request received in page handler");
                return Err(QueryError::Shutdown);
            }
        },
    );

    let pgb_reader = pgb
        .split()
        .context("implementation error: split pgb into reader and writer")?;

    // The handles are moved out of `self` for the duration of the subprotocol and put
    // back at the end of this function.
    let timeline_handles = self
        .timeline_handles
        .take()
        .expect("implementation error: timeline_handles should not be locked");

    // Evaluate the expensive feature resolver check once per pagestream subprotocol handling
    // instead of once per GetPage request. This is shared between pipelined and serial paths.
    let hold_gc_cutoff_guard = if cfg!(test) || cfg!(feature = "testing") {
        HoldAppliedGcCutoffGuard::Yes
    } else {
        // Use the global feature resolver with the tenant ID directly, avoiding the need
        // to get a timeline/shard which might not be available on this pageserver node.
        let empty_properties = std::collections::HashMap::new();
        match self.feature_resolver.evaluate_boolean(
            "page-service-getpage-hold-applied-gc-cutoff-guard",
            tenant_id,
            &empty_properties,
        ) {
            Ok(()) => HoldAppliedGcCutoffGuard::Yes,
            Err(_) => HoldAppliedGcCutoffGuard::No,
        }
    };
    // Record it in the span of handle_pagerequests so that both the request_span
    // and the pipeline implementation spans contain the field.
    Span::current().record(
        "hold_gc_cutoff_guard",
        tracing::field::debug(&hold_gc_cutoff_guard),
    );

    let request_span = info_span!("request");
    // Both implementations hand the reader and the timeline handles back, together
    // with the overall result, so that they can be restored below even on error.
    let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() {
        PageServicePipeliningConfig::Pipelined(pipelining_config) => {
            self.handle_pagerequests_pipelined(
                pgb,
                pgb_reader,
                tenant_id,
                timeline_id,
                timeline_handles,
                request_span,
                pipelining_config,
                protocol_version,
                io_concurrency,
                hold_gc_cutoff_guard,
                &ctx,
            )
            .await
        }
        PageServicePipeliningConfig::Serial => {
            self.handle_pagerequests_serial(
                pgb,
                pgb_reader,
                tenant_id,
                timeline_id,
                timeline_handles,
                request_span,
                protocol_version,
                io_concurrency,
                hold_gc_cutoff_guard,
                &ctx,
            )
            .await
        }
    };

    // NOTE(review): this is logged unconditionally, i.e. also when `result` is an Err.
    debug!("pagestream subprotocol shut down cleanly");

    pgb.unsplit(pgb_reader)
        .context("implementation error: unsplit pgb")?;

    let replaced = self.timeline_handles.replace(timeline_handles);
    assert!(replaced.is_none());

    result
}

/// Serial pagestream handling: read one message, handle it, repeat.
///
/// Returns the reader and the timeline handles together with the outcome: `Ok(())`
/// once the end of the subprotocol is observed (no further message), otherwise the
/// first error from either reading or handling a message.
#[allow(clippy::too_many_arguments)]
async fn handle_pagerequests_serial(
    &mut self,
    pgb_writer: &mut PostgresBackend,
    mut pgb_reader: PostgresBackendReader,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    mut timeline_handles: TimelineHandles,
    request_span: Span,
    protocol_version: PagestreamProtocolVersion,
    io_concurrency: IoConcurrency,
    hold_gc_cutoff_guard: HoldAppliedGcCutoffGuard,
    ctx: &RequestContext,
) -> (
    (PostgresBackendReader, TimelineHandles),
    Result<(), QueryError>,
)
where
    IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
{
    let cancel = self.cancel.clone();

    // The loop only `break`s with an error; the clean exit is the early `return` below.
    let err = loop {
        let msg = Self::pagestream_read_message(
            &mut pgb_reader,
            tenant_id,
            timeline_id,
            &mut timeline_handles,
            &self.perf_span_fields,
            &cancel,
            ctx,
            protocol_version,
            request_span.clone(),
            hold_gc_cutoff_guard,
        )
        .await;
        let msg = match msg {
            Ok(msg) => msg,
            Err(e) => break e,
        };
        let msg = match msg {
            Some(msg) => msg,
            None => {
                debug!("pagestream subprotocol end observed");
                return ((pgb_reader, timeline_handles), Ok(()));
            }
        };

        let result = self
            .pagestream_handle_batched_message(
                pgb_writer,
                msg,
                io_concurrency.clone(),
                &cancel,
                protocol_version,
                ctx,
            )
            .await;
        match result {
            Ok(()) => {}
            Err(e) => break e,
        }
    };
    ((pgb_reader, timeline_handles), Err(err))
}

/// # Cancel-Safety
///
/// May leak tokio tasks if not polled to completion.
#[allow(clippy::too_many_arguments)]
async fn handle_pagerequests_pipelined(
    &mut self,
    pgb_writer: &mut PostgresBackend,
    pgb_reader: PostgresBackendReader,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    mut timeline_handles: TimelineHandles,
    request_span: Span,
    pipelining_config: PageServicePipeliningConfigPipelined,
    protocol_version: PagestreamProtocolVersion,
    io_concurrency: IoConcurrency,
    hold_gc_cutoff_guard: HoldAppliedGcCutoffGuard,
    ctx: &RequestContext,
) -> (
    (PostgresBackendReader, TimelineHandles),
    Result<(), QueryError>,
)
where
    IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
{
    //
    // Pipelined pagestream handling consists of
    // - a Batcher that reads requests off the wire
    //   and batches them if possible,
    // - an Executor that processes the batched requests.
    //
    // The batch is built up inside an `spsc_fold` channel,
    // shared between Batcher (Sender) and Executor (Receiver).
    //
    // The Batcher continuously folds client requests into the batch,
    // while the Executor can at any time take out what's in the batch
    // in order to process it.
    // This means the next batch builds up while the Executor
    // executes the last batch.
    //
    // CANCELLATION
    //
    // We run both Batcher and Executor futures to completion before
    // returning from this function.
    //
    // If Executor exits first, it signals cancellation to the Batcher
    // via a CancellationToken that is child of `self.cancel`.
    // If Batcher exits first, it signals cancellation to the Executor
    // by dropping the spsc_fold channel Sender.
    //
    // CLEAN SHUTDOWN
    //
    // Clean shutdown means that the client ends the COPYBOTH session.
    // In response to such a client message, the Batcher exits.
    // The Executor continues to run, draining the spsc_fold channel.
    // Once drained, the spsc_fold recv will fail with a distinct error
    // indicating that the sender disconnected.
    // The Executor exits with Ok(()) in response to that error.
    //
    // Server initiated shutdown is not clean shutdown, but instead
    // is an error Err(QueryError::Shutdown) that is propagated through
    // error propagation.
    //
    // ERROR PROPAGATION
    //
    // When the Batcher encounters an error, it sends it as a value
    // through the spsc_fold channel and exits afterwards.
    // When the Executor observes such an error in the channel,
    // it exits returning that error value.
    //
    // This design ensures that the Executor stage will still process
    // the batch that was in flight when the Batcher encountered an error,
    // thereby behaving identically to a serial implementation.

    let PageServicePipeliningConfigPipelined {
        max_batch_size,
        execution,
        batching: batching_strategy,
    } = pipelining_config;

    // Macro to _define_ a pipeline stage.
    //
    // `$make_fut` is called with a clone of the stage's cancellation token and must
    // return the stage future. The resulting future logs "exiting" when it is dropped
    // or completes, and runs inside an info span named `$name`.
    macro_rules! pipeline_stage {
        ($name:literal, $cancel:expr, $make_fut:expr) => {{
            let cancel: CancellationToken = $cancel;
            let stage_fut = $make_fut(cancel.clone());
            async move {
                scopeguard::defer! {
                    debug!("exiting");
                }
                timed_after_cancellation(stage_fut, $name, Duration::from_millis(100), &cancel)
                    .await
            }
            .instrument(tracing::info_span!($name))
        }};
    }

    //
    // Batcher
    //

    let perf_span_fields = self.perf_span_fields.clone();

    // Child token: cancelled together with `self.cancel`, and additionally by the
    // Executor's drop guard below.
    let cancel_batcher = self.cancel.child_token();
    let (mut batch_tx, mut batch_rx) = spsc_fold::channel();
    let batcher = pipeline_stage!("batcher", cancel_batcher.clone(), move |cancel_batcher| {
        let ctx = ctx.attached_child();
        async move {
            let mut pgb_reader = pgb_reader;
            let mut exit = false;
            while !exit {
                let read_res = Self::pagestream_read_message(
                    &mut pgb_reader,
                    tenant_id,
                    timeline_id,
                    &mut timeline_handles,
                    &perf_span_fields,
                    &cancel_batcher,
                    &ctx,
                    protocol_version,
                    request_span.clone(),
                    hold_gc_cutoff_guard,
                )
                .await;
                // `Ok(None)` (no further message) becomes `None` here and ends the loop
                // without sending anything.
                let Some(read_res) = read_res.transpose() else {
                    debug!("client-initiated shutdown");
                    break;
                };
                // A read error is still sent through the channel (see ERROR PROPAGATION
                // above); the loop exits after this iteration.
                exit |= read_res.is_err();
                let could_send = batch_tx
                    .send(read_res, |batch, res| {
                        Self::pagestream_do_batch(batching_strategy, max_batch_size, batch, res)
                    })
                    .await;
                exit |= could_send.is_err();
            }
            (pgb_reader, timeline_handles)
        }
    });

    //
    // Executor
    //

    let executor = pipeline_stage!("executor", self.cancel.clone(), move |cancel| {
        let ctx = ctx.attached_child();
        async move {
            // Cancels the Batcher's token when the Executor future is dropped or returns.
            let _cancel_batcher = cancel_batcher.drop_guard();
            loop {
                let maybe_batch = batch_rx.recv().await;
                let batch = match maybe_batch {
                    Ok(batch) => batch,
                    Err(spsc_fold::RecvError::SenderGone) => {
                        debug!("upstream gone");
                        return Ok(());
                    }
                };
                let mut batch = match batch {
                    Ok(batch) => batch,
                    Err(e) => {
                        return Err(e);
                    }
                };

                // Take (and thereby drop) the per-request `batch_wait_ctx` before the
                // batch is handled.
                // NOTE(review): presumably this ends the perf span that covers waiting
                // for the executor -- confirm.
                if let BatchedFeMessage::GetPage {
                    pages,
                    span: _,
                    shard: _,
                    applied_gc_cutoff_guard: _,
                    batch_break_reason: _,
                } = &mut batch
                {
                    for req in pages {
                        req.batch_wait_ctx.take();
                    }
                }

                self.pagestream_handle_batched_message(
                    pgb_writer,
                    batch,
                    io_concurrency.clone(),
                    &cancel,
                    protocol_version,
                    &ctx,
                )
                .await?;
            }
        }
    });

    //
    // Execute the stages.
    //

    match execution {
        PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures => {
            tokio::join!(batcher, executor)
        }
        PageServiceProtocolPipelinedExecutionStrategy::Tasks => {
            // These tasks are not tracked anywhere.
            let read_messages_task = tokio::spawn(batcher);
            let (read_messages_task_res, executor_res_) =
                tokio::join!(read_messages_task, executor,);
            (
                read_messages_task_res.expect("propagated panic from read_messages"),
                executor_res_,
            )
        }
    }
}

/// Helper function to handle the LSN from client request.
///
/// Each GetPage (and Exists and Nblocks) request includes information about
/// which version of the page is being requested. The primary compute node
/// will always request the latest page version, by setting 'request_lsn' to
/// the last inserted or flushed WAL position, while a standby will request
/// a version at the LSN that it's currently caught up to.
///
/// In either case, if the page server hasn't received the WAL up to the
/// requested LSN yet, we will wait for it to arrive. The return value is
/// the LSN that should be used to look up the page versions.
///
/// In addition to the request LSN, each request carries another LSN,
/// 'not_modified_since', which is a hint to the pageserver that the client
/// knows that the page has not been modified between 'not_modified_since'
/// and the request LSN. This allows skipping the wait, as long as the WAL
/// up to 'not_modified_since' has arrived. If the client doesn't have any
/// information about when the page was modified, it will use
/// not_modified_since == lsn. If the client lies and sends a too low
/// 'not_modified_since' such that there are in fact later page versions, the
/// behavior is undefined: the pageserver may return any of the page versions
/// or an error.
async fn wait_or_get_last_lsn( timeline: &Timeline, request_lsn: Lsn, not_modified_since: Lsn, latest_gc_cutoff_lsn: &RcuReadGuard, ctx: &RequestContext, ) -> Result { let last_record_lsn = timeline.get_last_record_lsn(); let effective_request_lsn = Self::effective_request_lsn( timeline, last_record_lsn, request_lsn, not_modified_since, latest_gc_cutoff_lsn, )?; if effective_request_lsn > last_record_lsn { timeline .wait_lsn( not_modified_since, crate::tenant::timeline::WaitLsnWaiter::PageService, timeline::WaitLsnTimeout::Default, ctx, ) .await?; // Since we waited for 'effective_request_lsn' to arrive, that is now the last // record LSN. (Or close enough for our purposes; the last-record LSN can // advance immediately after we return anyway) } Ok(effective_request_lsn) } fn effective_request_lsn( timeline: &Timeline, last_record_lsn: Lsn, request_lsn: Lsn, not_modified_since: Lsn, latest_gc_cutoff_lsn: &RcuReadGuard, ) -> Result { // Sanity check the request if request_lsn < not_modified_since { return Err(PageStreamError::BadRequest( format!( "invalid request with request LSN {request_lsn} and not_modified_since {not_modified_since}", ) .into(), )); } // Check explicitly for INVALID just to get a less scary error message if the request is obviously bogus if request_lsn == Lsn::INVALID { return Err(PageStreamError::BadRequest( "invalid LSN(0) in request".into(), )); } // Clients should only read from recent LSNs on their timeline, or from locations holding an LSN lease. // // We may have older data available, but we make a best effort to detect this case and return an error, // to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN). 
if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() { let gc_info = &timeline.gc_info.read().unwrap(); if !gc_info.lsn_covered_by_lease(request_lsn) { return Err( PageStreamError::BadRequest(format!( "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", request_lsn, **latest_gc_cutoff_lsn ).into()) ); } } if not_modified_since > last_record_lsn { Ok(not_modified_since) } else { // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn) // here instead. That would give the same result, since we know that there // haven't been any modifications since 'not_modified_since'. Using an older // LSN might be faster, because that could allow skipping recent layers when // finding the page. However, we have historically used 'last_record_lsn', so // stick to that for now. Ok(std::cmp::min(last_record_lsn, request_lsn)) } } /// Handles the lsn lease request. /// If a lease cannot be obtained, the client will receive NULL. 
///
/// The reply consists of a row description with a single `valid_until` text column
/// followed by one data row. The value is the lease expiry in milliseconds since the
/// Unix epoch, formatted as a decimal string. Both messages are written without
/// flushing.
///
/// Failing to resolve the timeline is returned as an error; failing to renew the
/// lease is only logged and results in the NULL reply.
#[instrument(skip_all, fields(shard_id, %lsn))]
async fn handle_make_lsn_lease(
    &mut self,
    pgb: &mut PostgresBackend,
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    lsn: Lsn,
    ctx: &RequestContext,
) -> Result<(), QueryError>
where
    IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
    let timeline = self
        .timeline_handles
        .as_mut()
        .unwrap()
        .get(
            tenant_shard_id.tenant_id,
            timeline_id,
            ShardSelector::Known(tenant_shard_id.to_index()),
        )
        .await?;
    set_tracing_field_shard_id(&timeline);

    // A failed renewal is logged as a warning and turned into `None`.
    let lease = timeline
        .renew_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
        .inspect_err(|e| {
            warn!("{e}");
        })
        .ok();
    let valid_until_str = lease.map(|l| {
        l.valid_until
            .duration_since(SystemTime::UNIX_EPOCH)
            .expect("valid_until is earlier than UNIX_EPOCH")
            .as_millis()
            .to_string()
    });

    match valid_until_str.as_deref() {
        Some(valid_until) => info!("acquired lease for {} until {}", lsn, valid_until),
        // The cause was already reported by the `warn!` above. Do not claim that a
        // lease was acquired when it was not.
        None => info!("failed to acquire lease for {}", lsn),
    }

    // `None` is sent as a NULL column value.
    let bytes = valid_until_str.as_ref().map(|x| x.as_bytes());

    pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
        b"valid_until",
    )]))?
    .write_message_noflush(&BeMessage::DataRow(&[bytes]))?;

    Ok(())
}

/// Handles a relation-exists request.
///
/// Resolves the LSN to read at via `wait_or_get_last_lsn` (which may wait for WAL to
/// arrive and validates the request LSNs), then asks the timeline whether `req.rel`
/// exists at that LSN. The response echoes the request.
#[instrument(skip_all, fields(shard_id))]
async fn handle_get_rel_exists_request(
    timeline: &Timeline,
    req: &PagestreamExistsRequest,
    ctx: &RequestContext,
) -> Result {
    let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
    let lsn = Self::wait_or_get_last_lsn(
        timeline,
        req.hdr.request_lsn,
        req.hdr.not_modified_since,
        &latest_gc_cutoff_lsn,
        ctx,
    )
    .await?;

    let exists = timeline
        .get_rel_exists(
            req.rel,
            Version::LsnRange(LsnRange {
                effective_lsn: lsn,
                request_lsn: req.hdr.request_lsn,
            }),
            ctx,
        )
        .await?;

    Ok(PagestreamExistsResponse { req: *req, exists })
}

/// If `allow_missing` is true, returns None instead of Err on missing relations. Otherwise,
/// never returns None. It is only supported by the gRPC protocol, so we pass it separately to
/// avoid changing the libpq protocol types.
#[instrument(skip_all, fields(shard_id))]
async fn handle_get_nblocks_request(
    timeline: &Timeline,
    req: &PagestreamNblocksRequest,
    allow_missing: bool,
    ctx: &RequestContext,
) -> Result, PageStreamError> {
    let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
    // Validates the request LSNs and, if necessary, waits for WAL to arrive.
    let lsn = Self::wait_or_get_last_lsn(
        timeline,
        req.hdr.request_lsn,
        req.hdr.not_modified_since,
        &latest_gc_cutoff_lsn,
        ctx,
    )
    .await?;

    let n_blocks = timeline
        .get_rel_size_in_reldir(
            req.rel,
            Version::LsnRange(LsnRange {
                effective_lsn: lsn,
                request_lsn: req.hdr.request_lsn,
            }),
            None,
            allow_missing,
            ctx,
        )
        .await?;
    // A missing size is passed on to the caller as `Ok(None)`.
    let Some(n_blocks) = n_blocks else {
        return Ok(None);
    };

    Ok(Some(PagestreamNblocksResponse {
        req: *req,
        n_blocks,
    }))
}

/// Handles a database-size request.
///
/// Resolves the LSN to read at via `wait_or_get_last_lsn`, fetches the number of
/// blocks of database `req.dbnode` in the default tablespace and reports it in bytes
/// (block count multiplied by `BLCKSZ`).
#[instrument(skip_all, fields(shard_id))]
async fn handle_db_size_request(
    timeline: &Timeline,
    req: &PagestreamDbSizeRequest,
    ctx: &RequestContext,
) -> Result {
    let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
    let lsn = Self::wait_or_get_last_lsn(
        timeline,
        req.hdr.request_lsn,
        req.hdr.not_modified_since,
        &latest_gc_cutoff_lsn,
        ctx,
    )
    .await?;

    let total_blocks = timeline
        .get_db_size(
            DEFAULTTABLESPACE_OID,
            req.dbnode,
            Version::LsnRange(LsnRange {
                effective_lsn: lsn,
                request_lsn: req.hdr.request_lsn,
            }),
            ctx,
        )
        .await?;
    let db_size = total_blocks as i64 * BLCKSZ as i64;

    Ok(PagestreamDbSizeResponse { req: *req, db_size })
}

/// Handles a batch of getpage requests with a single batched read on the timeline.
///
/// Returns exactly one result per request, in request order. If waiting for the
/// highest effective LSN of the batch fails, every request in the batch gets that
/// error. The batch must not be empty.
#[instrument(skip_all)]
async fn handle_get_page_at_lsn_request_batched(
    timeline: &Timeline,
    requests: SmallVec<[BatchedGetPageRequest; 1]>,
    io_concurrency: IoConcurrency,
    batch_break_reason: GetPageBatchBreakReason,
    ctx: &RequestContext,
) -> Vec> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    timeline
        .query_metrics
        .observe_getpage_batch_start(requests.len(), batch_break_reason);

    // If a page trace is running, submit an event for each request in the batch.
    // (`batch` below is a single request of the batch.)
    if let Some(page_trace) = timeline.page_trace.load().as_ref() {
        let time = SystemTime::now();
        for batch in &requests {
            let key = rel_block_to_key(batch.req.rel, batch.req.blkno).to_compact();
            // Ignore error (trace buffer may be full or tracer may have disconnected).
            _ = page_trace.try_send(PageTraceEvent {
                key,
                effective_lsn: batch.lsn_range.effective_lsn,
                time,
            });
        }
    }

    // If any request in the batch needs to wait for LSN, then do so now.
    // While computing the maximum effective LSN, also note whether any request carries
    // a perf span, in which case the batch as a whole gets one too.
    let mut perf_instrument = false;
    let max_effective_lsn = requests
        .iter()
        .map(|req| {
            if req.ctx.has_perf_span() {
                perf_instrument = true;
            }

            req.lsn_range.effective_lsn
        })
        .max()
        .expect("batch is never empty");

    // This shadows the caller's `ctx` for the rest of the function.
    let ctx = match perf_instrument {
        true => RequestContextBuilder::from(ctx)
            .root_perf_span(|| {
                info_span!(
                    target: PERF_TRACE_TARGET,
                    "GET_VECTORED",
                    tenant_id = %timeline.tenant_shard_id.tenant_id,
                    timeline_id = %timeline.timeline_id,
                    shard = %timeline.tenant_shard_id.shard_slug(),
                    %max_effective_lsn
                )
            })
            .attached_child(),
        false => ctx.attached_child(),
    };

    let last_record_lsn = timeline.get_last_record_lsn();
    if max_effective_lsn > last_record_lsn {
        if let Err(e) = timeline
            .wait_lsn(
                max_effective_lsn,
                crate::tenant::timeline::WaitLsnWaiter::PageService,
                timeline::WaitLsnTimeout::Default,
                &ctx,
            )
            .maybe_perf_instrument(&ctx, |current_perf_span| {
                info_span!(
                    target: PERF_TRACE_TARGET,
                    parent: current_perf_span,
                    "WAIT_LSN",
                )
            })
            .await
        {
            // The wait failed: respond to every request in the batch with a copy of
            // the error.
            return Vec::from_iter(requests.into_iter().map(|req| {
                Err(BatchedPageStreamError {
                    err: PageStreamError::from(e.clone()),
                    req: req.req.hdr,
                })
            }));
        }
    }

    // Each page is read with a child of its own request's context; the batch-level
    // `ctx` is passed separately.
    let results = timeline
        .get_rel_page_at_lsn_batched(
            requests.iter().map(|p| {
                (
                    &p.req.rel,
                    &p.req.blkno,
                    p.lsn_range,
                    p.ctx.attached_child(),
                )
            }),
            io_concurrency,
            &ctx,
        )
        .await;
    assert_eq!(results.len(), requests.len());

    // Pair each request with its result; the request's timer and context travel along
    // with a successful response.
    // TODO: avoid creating the new Vec here
    Vec::from_iter(
        requests
            .into_iter()
            .zip(results.into_iter())
            .map(|(req, res)| {
                res.map(|page| {
                    (
                        PagestreamBeMessage::GetPage(
                            pagestream_api::PagestreamGetPageResponse { req: req.req, page },
                        ),
                        req.timer,
                        req.ctx,
                    )
                })
                .map_err(|e| BatchedPageStreamError {
                    err: PageStreamError::from(e),
                    req: req.req.hdr,
                })
            }),
    )
}

/// Handles an SLRU segment request.
///
/// Resolves the LSN to read at via `wait_or_get_last_lsn`, then reads segment
/// `req.segno` of the SLRU identified by `req.kind`. An unknown `req.kind` is
/// rejected with `PageStreamError::BadRequest`; note that this check only happens
/// after the LSN has been resolved.
#[instrument(skip_all, fields(shard_id))]
async fn handle_get_slru_segment_request(
    timeline: &Timeline,
    req: &PagestreamGetSlruSegmentRequest,
    ctx: &RequestContext,
) -> Result {
    let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
    let lsn = Self::wait_or_get_last_lsn(
        timeline,
        req.hdr.request_lsn,
        req.hdr.not_modified_since,
        &latest_gc_cutoff_lsn,
        ctx,
    )
    .await?;

    let kind = SlruKind::from_repr(req.kind)
        .ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?;
    let segment = timeline.get_slru_segment(kind, req.segno, lsn, ctx).await?;

    Ok(PagestreamGetSlruSegmentResponse { req: *req, segment })
}

// NB: this impl mimics what we do for batched getpage requests.
//
// Produces one result per request, in request order: an error if the timeline's
// cancellation token is cancelled when the request is visited, otherwise a response
// that echoes the request.
#[cfg(feature = "testing")]
#[instrument(skip_all, fields(shard_id))]
async fn handle_test_request_batch(
    timeline: &Timeline,
    requests: Vec,
    _ctx: &RequestContext,
) -> Vec> {
    // real requests would do something with the timeline
    let mut results = Vec::with_capacity(requests.len());
    for _req in requests.iter() {
        // Yield to the runtime once per request.
        tokio::task::yield_now().await;

        results.push({
            if timeline.cancel.is_cancelled() {
                Err(PageReconstructError::Cancelled)
            } else {
                Ok(())
            }
        });
    }

    // TODO: avoid creating the new Vec here
    Vec::from_iter(
        requests
            .into_iter()
            .zip(results.into_iter())
            .map(|(req, res)| {
                res.map(|()| {
                    (
                        PagestreamBeMessage::Test(pagestream_api::PagestreamTestResponse {
                            req: req.req.clone(),
                        }),
                        req.timer,
                        // A fresh context is created for the response instead of
                        // reusing `_ctx`.
                        RequestContext::new(
                            TaskKind::PageRequestHandler,
                            DownloadBehavior::Warn,
                        ),
                    )
                })
                .map_err(|e| BatchedPageStreamError {
                    err: PageStreamError::from(e),
                    req: req.req.hdr,
                })
            }),
    )
}

/// Note on "fullbackup":
/// Full basebackups should only be used for debugging purposes.
/// Originally, it was introduced to enable breaking storage format changes,
/// but that is not applicable anymore.
///
    /// # Coding Discipline
    ///
    /// Coding discipline within this function: all interaction with the `pgb` connection
    /// needs to be sensitive to connection shutdown, currently signalled via [`Self::cancel`].
    /// This is so that we can shutdown page_service quickly.
    ///
    /// TODO: wrap the pgb that we pass to the basebackup handler so that it's sensitive
    /// to connection cancellation.
    ///
    /// # Overview
    ///
    /// Resolves shard zero of the timeline, rejects archived timelines with
    /// [`QueryError::NotFound`], optionally waits for `lsn` to arrive and checks it against the
    /// applied GC cutoff, then switches the client to COPY OUT and streams the tarball. For
    /// non-full backups, a cached basebackup is sent instead when
    /// `get_cached_basebackup_if_enabled` returns one. On success, the LSN wait time and the
    /// remaining basebackup time are logged.
    #[allow(clippy::too_many_arguments)]
    #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))]
    async fn handle_basebackup_request(
        &mut self,
        pgb: &mut PostgresBackend,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        lsn: Option,
        prev_lsn: Option,
        full_backup: bool,
        gzip: bool,
        replica: bool,
        ctx: &RequestContext,
    ) -> Result<(), QueryError>
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
        let started = std::time::Instant::now();

        let timeline = self
            .timeline_handles
            .as_mut()
            .unwrap()
            .get(tenant_id, timeline_id, ShardSelector::Zero)
            .await?;
        set_tracing_field_shard_id(&timeline);
        let ctx = ctx.with_scope_timeline(&timeline);

        if timeline.is_archived() == Some(true) {
            tracing::info!(
                "timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it."
            );
            return Err(QueryError::NotFound("timeline is archived".into()));
        }

        // The cutoff is read before waiting for the LSN and used for the scope check afterwards.
        let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
        if let Some(lsn) = lsn {
            // Backup was requested at a particular LSN. Wait for it to arrive.
            info!("waiting for {}", lsn);
            timeline
                .wait_lsn(
                    lsn,
                    crate::tenant::timeline::WaitLsnWaiter::PageService,
                    crate::tenant::timeline::WaitLsnTimeout::Default,
                    &ctx,
                )
                .await?;
            timeline
                .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
                .context("invalid basebackup lsn")?;
        }

        // Everything up to here is accounted as "LSN wait" in the completion log line.
        let lsn_awaited_after = started.elapsed();

        // switch client to COPYOUT
        pgb.write_message_noflush(&BeMessage::CopyOutResponse)
            .map_err(QueryError::Disconnected)?;
        self.flush_cancellable(pgb, &self.cancel).await?;

        let mut from_cache = false;

        // Send a tarball of the latest layer on the timeline. Compress if not
        // fullbackup. TODO Compress in that case too (tests need to be updated)
        if full_backup {
            // Full backups are written straight to the COPY OUT writer, without compression.
            let mut writer = pgb.copyout_writer();
            basebackup::send_basebackup_tarball(
                &mut writer,
                &timeline,
                lsn,
                prev_lsn,
                full_backup,
                replica,
                None,
                &ctx,
            )
            .await?;
        } else {
            let mut writer = BufWriter::new(pgb.copyout_writer());

            let cached = timeline
                .get_cached_basebackup_if_enabled(lsn, prev_lsn, full_backup, replica, gzip)
                .await;

            if let Some(mut cached) = cached {
                from_cache = true;
                tokio::io::copy(&mut cached, &mut writer)
                    .await
                    .map_err(|err| {
                        BasebackupError::Client(err, "handle_basebackup_request,cached,copy")
                    })?;
            } else {
                basebackup::send_basebackup_tarball(
                    &mut writer,
                    &timeline,
                    lsn,
                    prev_lsn,
                    full_backup,
                    replica,
                    // NB: using fast compression because it's on the critical path for compute
                    // startup. For an empty database, we get <100KB with this method. The
                    // Level::Best compression method gives us <20KB, but maybe we should add
                    // basebackup caching on compute shutdown first.
                    gzip.then_some(async_compression::Level::Fastest),
                    &ctx,
                )
                .await?;
            }
            // The BufWriter must be flushed explicitly before CopyDone is queued.
            writer
                .flush()
                .await
                .map_err(|err| BasebackupError::Client(err, "handle_basebackup_request,flush"))?;
        }

        pgb.write_message_noflush(&BeMessage::CopyDone)
            .map_err(QueryError::Disconnected)?;
        // NOTE(review): this flush uses `timeline.cancel` while the CopyOutResponse flush above
        // uses `self.cancel` -- confirm the difference is intentional.
        self.flush_cancellable(pgb, &timeline.cancel).await?;

        let basebackup_after = started
            .elapsed()
            .checked_sub(lsn_awaited_after)
            .unwrap_or(Duration::ZERO);

        info!(
            lsn_await_millis = lsn_awaited_after.as_millis(),
            basebackup_millis = basebackup_after.as_millis(),
            %from_cache,
            "basebackup complete"
        );

        Ok(())
    }

    // when accessing management api supply None as an argument
    // when using to authorize tenant pass corresponding tenant id
    //
    // Returns Ok(()) without any check when no auth is configured; otherwise delegates to the
    // free function `check_permission` with the stored claims and maps its error to
    // `QueryError::Unauthorized`.
    fn check_permission(&self, tenant_id: Option) -> Result<(), QueryError> {
        if self.auth.is_none() {
            // auth is set to Trust, nothing to check so just return ok
            return Ok(());
        }
        // auth is some, just checked above, when auth is some
        // then claims are always present because of checks during connection init
        // so this expect won't trigger
        let claims = self
            .claims
            .as_ref()
            .expect("claims presence already checked");
        check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0))
    }
}

/// `basebackup tenant timeline [lsn] [--gzip] [--replica]`
#[derive(Debug, Clone, Eq, PartialEq)]
struct BaseBackupCmd {
    tenant_id: TenantId,
    timeline_id: TimelineId,
    // `None` when the LSN argument is omitted or given as `latest`.
    lsn: Option,
    gzip: bool,
    replica: bool,
}

/// `fullbackup tenant timeline [lsn] [prev_lsn]`
#[derive(Debug, Clone, Eq, PartialEq)]
struct FullBackupCmd {
    tenant_id: TenantId,
    timeline_id: TimelineId,
    lsn: Option,
    prev_lsn: Option,
}

/// `pagestream_v2 tenant timeline`
#[derive(Debug, Clone, Eq, PartialEq)]
struct PageStreamCmd {
    tenant_id: TenantId,
    timeline_id: TimelineId,
    // Derived from the command name (`pagestream_v2` / `pagestream_v3`), not from an argument.
    protocol_version: PagestreamProtocolVersion,
}

/// `lease lsn tenant timeline lsn`
#[derive(Debug, Clone, Eq, PartialEq)]
struct LeaseLsnCmd {
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    lsn: Lsn,
}

/// A parsed page service query; see `PageServiceCmd::parse` for the accepted command names.
#[derive(Debug, Clone, Eq, PartialEq)]
enum
PageServiceCmd { Set, PageStream(PageStreamCmd), BaseBackup(BaseBackupCmd), FullBackup(FullBackupCmd), LeaseLsn(LeaseLsnCmd), } impl PageStreamCmd { fn parse(query: &str, protocol_version: PagestreamProtocolVersion) -> anyhow::Result { let parameters = query.split_whitespace().collect_vec(); if parameters.len() != 2 { bail!( "invalid number of parameters for pagestream command: {}", query ); } let tenant_id = TenantId::from_str(parameters[0]) .with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?; let timeline_id = TimelineId::from_str(parameters[1]) .with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?; Ok(Self { tenant_id, timeline_id, protocol_version, }) } } impl FullBackupCmd { fn parse(query: &str) -> anyhow::Result { let parameters = query.split_whitespace().collect_vec(); if parameters.len() < 2 || parameters.len() > 4 { bail!( "invalid number of parameters for basebackup command: {}", query ); } let tenant_id = TenantId::from_str(parameters[0]) .with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?; let timeline_id = TimelineId::from_str(parameters[1]) .with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?; // The caller is responsible for providing correct lsn and prev_lsn. 
let lsn = if let Some(lsn_str) = parameters.get(2) { Some( Lsn::from_str(lsn_str) .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?, ) } else { None }; let prev_lsn = if let Some(prev_lsn_str) = parameters.get(3) { Some( Lsn::from_str(prev_lsn_str) .with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?, ) } else { None }; Ok(Self { tenant_id, timeline_id, lsn, prev_lsn, }) } } impl BaseBackupCmd { fn parse(query: &str) -> anyhow::Result { let parameters = query.split_whitespace().collect_vec(); if parameters.len() < 2 { bail!( "invalid number of parameters for basebackup command: {}", query ); } let tenant_id = TenantId::from_str(parameters[0]) .with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?; let timeline_id = TimelineId::from_str(parameters[1]) .with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?; let lsn; let flags_parse_from; if let Some(maybe_lsn) = parameters.get(2) { if *maybe_lsn == "latest" { lsn = None; flags_parse_from = 3; } else if maybe_lsn.starts_with("--") { lsn = None; flags_parse_from = 2; } else { lsn = Some( Lsn::from_str(maybe_lsn) .with_context(|| format!("Failed to parse lsn from {maybe_lsn}"))?, ); flags_parse_from = 3; } } else { lsn = None; flags_parse_from = 2; } let mut gzip = false; let mut replica = false; for ¶m in ¶meters[flags_parse_from..] 
{ match param { "--gzip" => { if gzip { bail!("duplicate parameter for basebackup command: {param}") } gzip = true } "--replica" => { if replica { bail!("duplicate parameter for basebackup command: {param}") } replica = true } _ => bail!("invalid parameter for basebackup command: {param}"), } } Ok(Self { tenant_id, timeline_id, lsn, gzip, replica, }) } } impl LeaseLsnCmd { fn parse(query: &str) -> anyhow::Result { let parameters = query.split_whitespace().collect_vec(); if parameters.len() != 3 { bail!( "invalid number of parameters for lease lsn command: {}", query ); } let tenant_shard_id = TenantShardId::from_str(parameters[0]) .with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?; let timeline_id = TimelineId::from_str(parameters[1]) .with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?; let lsn = Lsn::from_str(parameters[2]) .with_context(|| format!("Failed to parse lsn from {}", parameters[2]))?; Ok(Self { tenant_shard_id, timeline_id, lsn, }) } } impl PageServiceCmd { fn parse(query: &str) -> anyhow::Result { let query = query.trim(); let Some((cmd, other)) = query.split_once(' ') else { bail!("cannot parse query: {query}") }; match cmd.to_ascii_lowercase().as_str() { "pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse( other, PagestreamProtocolVersion::V2, )?)), "pagestream_v3" => Ok(Self::PageStream(PageStreamCmd::parse( other, PagestreamProtocolVersion::V3, )?)), "basebackup" => Ok(Self::BaseBackup(BaseBackupCmd::parse(other)?)), "fullbackup" => Ok(Self::FullBackup(FullBackupCmd::parse(other)?)), "lease" => { let Some((cmd2, other)) = other.split_once(' ') else { bail!("invalid lease command: {cmd}"); }; let cmd2 = cmd2.to_ascii_lowercase(); if cmd2 == "lsn" { Ok(Self::LeaseLsn(LeaseLsnCmd::parse(other)?)) } else { bail!("invalid lease command: {cmd}"); } } "set" => Ok(Self::Set), _ => Err(anyhow::anyhow!("unsupported command {cmd} in {query}")), } } } /// Parse the startup options from the 
postgres wire protocol startup packet. /// /// It takes a sequence of `-c option=X` or `-coption=X`. It parses the options string /// by best effort and returns all the options parsed (key-value pairs) and a bool indicating /// whether all options are successfully parsed. There could be duplicates in the options /// if the caller passed such parameters. fn parse_options(options: &str) -> (Vec<(String, String)>, bool) { let mut parsing_config = false; let mut has_error = false; let mut config = Vec::new(); for item in options.split_whitespace() { if item == "-c" { if !parsing_config { parsing_config = true; } else { // "-c" followed with another "-c" tracing::warn!("failed to parse the startup options: {options}"); has_error = true; break; } } else if item.starts_with("-c") || parsing_config { let Some((mut key, value)) = item.split_once('=') else { // "-c" followed with an invalid option tracing::warn!("failed to parse the startup options: {options}"); has_error = true; break; }; if !parsing_config { // Parse "-coptions=X" let Some(stripped_key) = key.strip_prefix("-c") else { tracing::warn!("failed to parse the startup options: {options}"); has_error = true; break; }; key = stripped_key; } config.push((key.to_string(), value.to_string())); parsing_config = false; } else { tracing::warn!("failed to parse the startup options: {options}"); has_error = true; break; } } if parsing_config { // "-c" without the option tracing::warn!("failed to parse the startup options: {options}"); has_error = true; } (config, has_error) } impl postgres_backend::Handler for PageServerHandler where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, { fn check_auth_jwt( &mut self, _pgb: &mut PostgresBackend, jwt_response: &[u8], ) -> Result<(), QueryError> { // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT // which requires auth to be present let data: TokenData = self .auth .as_ref() .unwrap() 
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)
            .map_err(|e| QueryError::Unauthorized(e.0))?;

        // A tenant-scoped token is only usable if it names the tenant it is scoped to.
        if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
            return Err(QueryError::Unauthorized(
                "jwt token scope is Tenant, but tenant id is missing".into(),
            ));
        }

        debug!(
            "jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
            data.claims.scope, data.claims.tenant_id,
        );

        // Stored for later per-command checks in `check_permission`.
        self.claims = Some(data.claims);
        Ok(())
    }

    // Handles the startup packet: records `application_name` and, from the `options` parameter,
    // `neon.compute_mode` into the perf span fields and the current tracing span. Option parse
    // errors are ignored here (the error flag returned by `parse_options` is discarded).
    fn startup(
        &mut self,
        _pgb: &mut PostgresBackend,
        sm: &FeStartupPacket,
    ) -> Result<(), QueryError> {
        fail::fail_point!("ps::connection-start::startup-packet");

        if let FeStartupPacket::StartupMessage { params, .. } = sm {
            if let Some(app_name) = params.get("application_name") {
                self.perf_span_fields.application_name = Some(app_name.to_string());
                Span::current().record("application_name", field::display(app_name));
            }
            if let Some(options) = params.get("options") {
                let (config, _) = parse_options(options);
                for (key, value) in config {
                    if key == "neon.compute_mode" {
                        self.perf_span_fields.compute_mode = Some(value.clone());
                        Span::current().record("compute_mode", field::display(value));
                    }
                }
            }
        };

        Ok(())
    }

    // Parses `query_string` into a `PageServiceCmd` and dispatches it. Every tenant-addressed
    // command records tenant/timeline on the span, checks permission for the tenant, and bumps
    // the per-command counter before doing any work.
    #[instrument(skip_all, fields(tenant_id, timeline_id))]
    async fn process_query(
        &mut self,
        pgb: &mut PostgresBackend,
        query_string: &str,
    ) -> Result<(), QueryError> {
        fail::fail_point!("simulated-bad-compute-connection", |_| {
            info!("Hit failpoint for bad connection");
            Err(QueryError::SimulatedConnectionError)
        });

        fail::fail_point!("ps::connection-start::process-query");

        let ctx = self.connection_ctx.attached_child();
        debug!("process query {query_string}");
        let query = PageServiceCmd::parse(query_string)?;
        match query {
            PageServiceCmd::PageStream(PageStreamCmd {
                tenant_id,
                timeline_id,
                protocol_version,
            }) => {
                tracing::Span::current()
                    .record("tenant_id", field::display(tenant_id))
                    .record("timeline_id", field::display(timeline_id));

                self.check_permission(Some(tenant_id))?;
                let command_kind = match protocol_version {
                    PagestreamProtocolVersion::V2 => ComputeCommandKind::PageStreamV2,
                    PagestreamProtocolVersion::V3 => ComputeCommandKind::PageStreamV3,
                };
                COMPUTE_COMMANDS_COUNTERS.for_command(command_kind).inc();

                self.handle_pagerequests(pgb, tenant_id, timeline_id, protocol_version, ctx)
                    .await?;
            }
            PageServiceCmd::BaseBackup(BaseBackupCmd {
                tenant_id,
                timeline_id,
                lsn,
                gzip,
                replica,
            }) => {
                tracing::Span::current()
                    .record("tenant_id", field::display(tenant_id))
                    .record("timeline_id", field::display(timeline_id));

                self.check_permission(Some(tenant_id))?;

                COMPUTE_COMMANDS_COUNTERS
                    .for_command(ComputeCommandKind::Basebackup)
                    .inc();
                let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording();
                // Run request + CommandComplete as one unit so the metric observes the overall
                // result before any error is propagated.
                let res = async {
                    self.handle_basebackup_request(
                        pgb,
                        tenant_id,
                        timeline_id,
                        lsn,
                        None,
                        false,
                        gzip,
                        replica,
                        &ctx,
                    )
                    .await?;
                    pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
                    Result::<(), QueryError>::Ok(())
                }
                .await;
                metric_recording.observe(&res);
                res?;
            }
            // same as basebackup, but result includes relational data as well
            PageServiceCmd::FullBackup(FullBackupCmd {
                tenant_id,
                timeline_id,
                lsn,
                prev_lsn,
            }) => {
                tracing::Span::current()
                    .record("tenant_id", field::display(tenant_id))
                    .record("timeline_id", field::display(timeline_id));

                self.check_permission(Some(tenant_id))?;

                COMPUTE_COMMANDS_COUNTERS
                    .for_command(ComputeCommandKind::Fullbackup)
                    .inc();

                // Run the basebackup handler in full-backup mode (full_backup = true, no gzip,
                // not a replica).
                self.handle_basebackup_request(
                    pgb,
                    tenant_id,
                    timeline_id,
                    lsn,
                    prev_lsn,
                    true,
                    false,
                    false,
                    &ctx,
                )
                .await?;
                pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
            }
            PageServiceCmd::Set => {
                // important because psycopg2 executes "SET datestyle TO 'ISO'"
                // on connect
                // TODO: allow setting options, i.e., application_name/compute_mode via SET commands
                pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
            }
            PageServiceCmd::LeaseLsn(LeaseLsnCmd {
                tenant_shard_id,
                timeline_id,
                lsn,
            }) => {
                tracing::Span::current()
                    .record("tenant_id", field::display(tenant_shard_id))
                    .record("timeline_id", field::display(timeline_id));

                self.check_permission(Some(tenant_shard_id.tenant_id))?;

                COMPUTE_COMMANDS_COUNTERS
                    .for_command(ComputeCommandKind::LeaseLsn)
                    .inc();

                // A lease failure is reported to the client as an ErrorResponse; it does not
                // fail the query handler itself.
                match self
                    .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
                    .await
                {
                    Ok(()) => {
                        pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?
                    }
                    Err(e) => {
                        error!("error obtaining lsn lease for {lsn}: {e:?}");
                        pgb.write_message_noflush(&BeMessage::ErrorResponse(
                            &e.to_string(),
                            Some(e.pg_error_code()),
                        ))?
                    }
                };
            }
        }

        Ok(())
    }
}

/// Serves the page service over gRPC. Dispatches to PageServerHandler for request processing.
///
/// TODO: rename to PageServiceHandler when libpq impl is removed.
pub struct GrpcPageServiceHandler {
    tenant_manager: Arc,
    ctx: RequestContext,

    /// Cancelled to shut down the server. Tonic will shut down in response to this, but wait for
    /// in-flight requests to complete. Any tasks we spawn ourselves must respect this token.
    cancel: CancellationToken,

    /// Any tasks we spawn ourselves should clone this gate guard, so that we can wait for them to
    /// complete during shutdown. Request handlers implicitly hold this guard already.
    gate_guard: GateGuard,

    /// `get_vectored` concurrency setting.
    get_vectored_concurrent_io: GetVectoredConcurrentIo,
}

impl GrpcPageServiceHandler {
    /// Spawns a gRPC server for the page service.
    ///
    /// Returns a `CancellableTask` handle that can be used to shut down the server. It waits for
    /// any in-flight requests and tasks to complete first.
    ///
    /// TODO: this doesn't support TLS. We need TLS reloading via ReloadingCertificateResolver, so we
    /// need to reimplement the TCP+TLS accept loop ourselves.
pub fn spawn(
        tenant_manager: Arc,
        auth: Option>,
        perf_trace_dispatch: Option,
        get_vectored_concurrent_io: GetVectoredConcurrentIo,
        listener: std::net::TcpListener,
    ) -> anyhow::Result {
        // Set up a cancellation token for shutting down the server, and a gate to wait for all
        // requests and spawned tasks to complete.
        let cancel = CancellationToken::new();
        let gate = Gate::default();

        // Root request context handed to the handler; per-request contexts derive from it.
        let ctx = RequestContextBuilder::new(TaskKind::PageRequestHandler)
            .download_behavior(DownloadBehavior::Download)
            .perf_span_dispatch(perf_trace_dispatch)
            .detached_child();

        // Set up the TCP socket. We take a preconfigured TcpListener to bind the
        // port early during startup.
        let incoming = {
            let _runtime = COMPUTE_REQUEST_RUNTIME.enter(); // required by TcpListener::from_std
            listener.set_nonblocking(true)?;
            tonic::transport::server::TcpIncoming::from(tokio::net::TcpListener::from_std(
                listener,
            )?)
            .with_nodelay(Some(GRPC_TCP_NODELAY))
            .with_keepalive(Some(GRPC_TCP_KEEPALIVE_TIME))
        };

        // Set up the gRPC server.
        //
        // TODO: consider tuning window sizes.
        let mut server = tonic::transport::Server::builder()
            .http2_keepalive_interval(Some(GRPC_HTTP2_KEEPALIVE_INTERVAL))
            .http2_keepalive_timeout(Some(GRPC_HTTP2_KEEPALIVE_TIMEOUT))
            .max_concurrent_streams(Some(GRPC_MAX_CONCURRENT_STREAMS));

        // Main page service stack. Uses a mix of Tonic interceptors and Tower layers:
        //
        // * Interceptors: can inspect and modify the gRPC request. Sync code only, runs before service.
        //
        // * Layers: allow async code, can run code after the service response. However, only has access
        //   to the raw HTTP request/response, not the gRPC types.
        let page_service_handler = GrpcPageServiceHandler {
            tenant_manager,
            ctx,
            cancel: cancel.clone(),
            gate_guard: gate.enter().expect("gate was just created"),
            get_vectored_concurrent_io,
        };

        let observability_layer = ObservabilityLayer;
        let mut tenant_interceptor = TenantMetadataInterceptor;
        let mut auth_interceptor = TenantAuthInterceptor::new(auth);

        let page_service = tower::ServiceBuilder::new()
            // Create tracing span and record request start time.
            .layer(observability_layer)
            // Intercept gRPC requests.
            .layer(tonic::service::InterceptorLayer::new(move |mut req| {
                // Extract tenant metadata.
                req = tenant_interceptor.call(req)?;
                // Authenticate tenant JWT token.
                req = auth_interceptor.call(req)?;
                Ok(req)
            }))
            // Run the page service.
            .service(
                proto::PageServiceServer::new(page_service_handler)
                    // Support both gzip and zstd compression. The client decides what to use.
                    .accept_compressed(tonic::codec::CompressionEncoding::Gzip)
                    .accept_compressed(tonic::codec::CompressionEncoding::Zstd)
                    .send_compressed(tonic::codec::CompressionEncoding::Gzip)
                    .send_compressed(tonic::codec::CompressionEncoding::Zstd),
            );
        let server = server.add_service(page_service);

        // Reflection service for use with e.g. grpcurl.
        let reflection_service = tonic_reflection::server::Builder::configure()
            .register_encoded_file_descriptor_set(proto::FILE_DESCRIPTOR_SET)
            .build_v1()?;
        let server = server.add_service(reflection_service);

        // Spawn server task. It runs until the cancellation token fires and in-flight requests and
        // tasks complete. The `CancellableTask` will wait for the task's join handle, which
        // implicitly waits for the gate to close.
        let task_cancel = cancel.clone();
        let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
            "grpc pageservice listener",
            async move {
                server
                    .serve_with_incoming_shutdown(incoming, task_cancel.cancelled())
                    .await?;

                // Server exited cleanly. All requests should have completed by now. Wait for any
                // spawned tasks to complete as well (e.g. IoConcurrency sidecars) via the gate.
                gate.close().await;

                anyhow::Ok(())
            },
        ));

        Ok(CancellableTask { task, cancel })
    }

    /// Generates a PagestreamRequest header from a ReadLsn and request ID.
    ///
    /// A missing request ID yields the default `reqid`; a missing `not_modified_since_lsn`
    /// falls back to the request LSN.
    fn make_hdr(
        read_lsn: page_api::ReadLsn,
        req_id: Option,
    ) -> PagestreamRequest {
        PagestreamRequest {
            reqid: req_id.map(|r| r.id).unwrap_or_default(),
            request_lsn: read_lsn.request_lsn,
            not_modified_since: read_lsn
                .not_modified_since_lsn
                .unwrap_or(read_lsn.request_lsn),
        }
    }

    /// Acquires a timeline handle for the given request. The shard index must match a local shard.
    ///
    /// The tenant/timeline IDs and the shard index are read from the request via `extract`.
    ///
    /// NB: this will fail during shard splits, see comment on [`Self::maybe_split_get_page`].
    async fn get_request_timeline(
        &self,
        req: &tonic::Request,
    ) -> Result, GetActiveTimelineError> {
        let TenantTimelineId {
            tenant_id,
            timeline_id,
        } = *extract::(req);
        let shard_index = *extract::(req);

        // TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to
        // avoid the unnecessary overhead.
        TimelineHandles::new(self.tenant_manager.clone())
            .get(tenant_id, timeline_id, ShardSelector::Known(shard_index))
            .await
    }

    /// Acquires a timeline handle for the given request, which must be for shard zero. Most
    /// metadata requests are only valid on shard zero.
    ///
    /// NB: during an ongoing shard split, the compute will keep talking to the parent shard until
    /// the split is committed, but the parent shard may have been removed in the meanwhile. In that
    /// case, we reroute the request to the new child shard. See [`Self::maybe_split_get_page`].
    ///
    /// TODO: revamp the split protocol to avoid this child routing.
async fn get_request_timeline_shard_zero(
        &self,
        req: &tonic::Request,
    ) -> Result, tonic::Status> {
        let TenantTimelineId {
            tenant_id,
            timeline_id,
        } = *extract::(req);
        let shard_index = *extract::(req);

        // Reject requests addressed to any shard other than number 0 up front.
        if shard_index.shard_number.0 != 0 {
            return Err(tonic::Status::invalid_argument(format!(
                "request only valid on shard zero (requested shard {shard_index})",
            )));
        }

        // TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to
        // avoid the unnecessary overhead.
        let mut handles = TimelineHandles::new(self.tenant_manager.clone());
        match handles
            .get(tenant_id, timeline_id, ShardSelector::Known(shard_index))
            .await
        {
            Ok(timeline) => Ok(timeline),
            Err(err) => {
                // We may be in the middle of a shard split. Try to find a child shard 0.
                // Only a shard with a strictly higher shard count than the requested one is
                // accepted; otherwise the original lookup error is returned.
                if let Ok(timeline) = handles
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await
                    && timeline.get_shard_index().shard_count > shard_index.shard_count
                {
                    return Ok(timeline);
                }
                Err(err.into())
            }
        }
    }

    /// Starts a SmgrOpTimer at received_at, throttles the request, and records execution start.
    /// Only errors if the timeline is shutting down.
    ///
    /// Delegates to `PageServerHandler::record_op_start_and_throttle` and maps its error to a
    /// gRPC status: `Shutdown` becomes `unavailable`, anything else becomes `internal`.
    ///
    /// TODO: move timer construction to ObservabilityLayer (see TODO there).
    /// TODO: decouple rate limiting (middleware?), and return SlowDown errors instead.
    async fn record_op_start_and_throttle(
        timeline: &Handle,
        op: metrics::SmgrQueryType,
        received_at: Instant,
    ) -> Result {
        let mut timer = PageServerHandler::record_op_start_and_throttle(timeline, op, received_at)
            .await
            .map_err(|err| match err {
                // record_op_start_and_throttle() only returns Shutdown.
                QueryError::Shutdown => tonic::Status::unavailable(format!("{err}")),
                err => tonic::Status::internal(format!("unexpected error: {err}")),
            })?;
        // Execution is considered started as soon as throttling has returned.
        timer.observe_execution_start(Instant::now());
        Ok(timer)
    }

    /// Processes a GetPage batch request, via the GetPages bidirectional streaming RPC.
///
    /// NB: errors returned from here are intercepted in get_pages(), and may be converted to a
    /// GetPageResponse with an appropriate status code to avoid terminating the stream.
    ///
    /// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
    /// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
    /// split them up in the client or server.
    ///
    /// Steps: verify that every requested block is local to this shard, compute one effective
    /// LSN for the whole request, build one batched request (with its own timer) per block,
    /// run the batch through `PageServerHandler::handle_get_page_at_lsn_request_batched`, and
    /// collect the pages into a single response. The first failed page fails the whole request.
    #[instrument(skip_all, fields(
        req_id = %req.request_id,
        rel = %req.rel,
        blkno = %req.block_numbers[0],
        blks = %req.block_numbers.len(),
        lsn = %req.read_lsn,
    ))]
    async fn get_page(
        ctx: &RequestContext,
        timeline: Handle,
        req: page_api::GetPageRequest,
        io_concurrency: IoConcurrency,
        received_at: Instant,
    ) -> Result {
        let ctx = ctx.with_scope_page_service_pagestream(&timeline);

        // Every block must map to a key owned by this shard; otherwise report which shard
        // actually owns it.
        for &blkno in &req.block_numbers {
            let shard = timeline.get_shard_identity();
            let key = rel_block_to_key(req.rel, blkno);
            if !shard.is_key_local(&key) {
                return Err(tonic::Status::invalid_argument(format!(
                    "block {blkno} of relation {} requested on wrong shard {} (is on {})",
                    req.rel,
                    timeline.get_shard_index(),
                    ShardIndex::new(shard.get_shard_number(&key), shard.count),
                )));
            }
        }

        let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); // hold guard
        let effective_lsn = PageServerHandler::effective_request_lsn(
            &timeline,
            timeline.get_last_record_lsn(),
            req.read_lsn.request_lsn,
            req.read_lsn
                .not_modified_since_lsn
                .unwrap_or(req.read_lsn.request_lsn),
            &latest_gc_cutoff_lsn,
        )?;

        let mut batch = SmallVec::with_capacity(req.block_numbers.len());
        for blkno in req.block_numbers {
            // TODO: this creates one timer per page and throttles it. We should have a timer for
            // the entire batch, and throttle only the batch, but this is equivalent to what
            // PageServerHandler does already so we keep it for now.
            let timer = Self::record_op_start_and_throttle(
                &timeline,
                metrics::SmgrQueryType::GetPageAtLsn,
                received_at,
            )
            .await?;

            batch.push(BatchedGetPageRequest {
                req: PagestreamGetPageRequest {
                    hdr: Self::make_hdr(req.read_lsn, Some(req.request_id)),
                    rel: req.rel,
                    blkno,
                },
                lsn_range: LsnRange {
                    effective_lsn,
                    request_lsn: req.read_lsn.request_lsn,
                },
                timer,
                ctx: ctx.attached_child(),
                batch_wait_ctx: None, // TODO: add tracing
            });
        }

        // TODO: this does a relation size query for every page in the batch. Since this batch is
        // all for one relation, we could do this only once. However, this is not the case for the
        // libpq implementation.
        let results = PageServerHandler::handle_get_page_at_lsn_request_batched(
            &timeline,
            batch,
            io_concurrency,
            GetPageBatchBreakReason::BatchFull, // TODO: not relevant for gRPC batches
            &ctx,
        )
        .await;

        let mut resp = page_api::GetPageResponse {
            request_id: req.request_id,
            status_code: page_api::GetPageStatusCode::Ok,
            reason: None,
            rel: req.rel,
            pages: Vec::with_capacity(results.len()),
        };

        // Pages are appended in result order; any non-GetPage message or error aborts the
        // whole request.
        for result in results {
            match result {
                Ok((PagestreamBeMessage::GetPage(r), _, _)) => resp.pages.push(page_api::Page {
                    block_number: r.req.blkno,
                    image: r.page,
                }),
                Ok((resp, _, _)) => {
                    return Err(tonic::Status::internal(format!(
                        "unexpected response: {resp:?}"
                    )));
                }
                Err(err) => return Err(err.err.into()),
            };
        }

        Ok(resp)
    }

    /// Processes a GetPage request when there is a potential shard split in progress. We have to
    /// reroute the request to any local child shards, and split batch requests that straddle
    /// multiple child shards.
    ///
    /// Parent shards are split and removed incrementally (there may be many parent shards when
    /// splitting an already-sharded tenant), but the compute is only notified once the overall
    /// split commits, which can take several minutes. In the meanwhile, the compute will be sending
    /// requests to the parent shards.
///
    /// TODO: add test infrastructure to provoke this situation frequently and for long periods of
    /// time, to properly exercise it.
    ///
    /// TODO: revamp the split protocol to avoid this, e.g.:
    /// * Keep the parent shard until the split commits and the compute is notified.
    /// * Notify the compute about each subsplit.
    /// * Return an error that updates the compute's shard map.
    #[instrument(skip_all)]
    #[allow(clippy::too_many_arguments)]
    async fn maybe_split_get_page(
        ctx: &RequestContext,
        handles: &mut TimelineHandles,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        parent: ShardIndex,
        req: page_api::GetPageRequest,
        io_concurrency: IoConcurrency,
        received_at: Instant,
    ) -> Result {
        // Check the first page to see if we have any child shards at all. Otherwise, the compute is
        // just talking to the wrong Pageserver. If the parent has been split, the shard now owning
        // the page must have a higher shard count.
        let timeline = handles
            .get(
                tenant_id,
                timeline_id,
                ShardSelector::Page(rel_block_to_key(req.rel, req.block_numbers[0])),
            )
            .await?;

        let shard_id = timeline.get_shard_identity();
        if shard_id.count <= parent.shard_count {
            return Err(HandleUpgradeError::ShutDown.into()); // emulate original error
        }

        // Fast path: the request fits in a single shard.
        if let Some(shard_index) =
            GetPageSplitter::for_single_shard(&req, shard_id.count, Some(shard_id.stripe_size))?
        {
            // We got the shard ID from the first page, so these must be equal.
            assert_eq!(shard_index.shard_number, shard_id.number);
            assert_eq!(shard_index.shard_count, shard_id.count);
            return Self::get_page(ctx, timeline, req, io_concurrency, received_at).await;
        }

        // The request spans multiple shards; split it and dispatch parallel requests. All pages
        // were originally in the parent shard, and during a split all children are local, so we
        // expect to find local shards for all pages.
        let mut splitter = GetPageSplitter::split(req, shard_id.count, Some(shard_id.stripe_size))?;

        let mut shard_requests = FuturesUnordered::new();
        for (shard_index, shard_req) in splitter.drain_requests() {
            let timeline = handles
                .get(tenant_id, timeline_id, ShardSelector::Known(shard_index))
                .await?;
            // Tag each per-shard response with its shard index so the splitter can place it.
            let future = Self::get_page(
                ctx,
                timeline,
                shard_req,
                io_concurrency.clone(),
                received_at,
            )
            .map(move |result| result.map(|resp| (shard_index, resp)));
            shard_requests.push(future);
        }

        // Responses are consumed in completion order; the first per-shard error aborts the
        // whole request.
        while let Some((shard_index, shard_response)) = shard_requests.next().await.transpose()? {
            splitter.add_response(shard_index, shard_response)?;
        }

        Ok(splitter.collect_response()?)
    }
}

/// Implements the gRPC page service.
///
/// On client disconnect (e.g. timeout or client shutdown), Tonic will drop the request handler
/// futures, so the read path must be cancellation-safe. On server shutdown, Tonic will wait for
/// in-flight requests to complete.
///
/// TODO: when the libpq impl is removed, remove the Pagestream types and inline the handler code.
#[tonic::async_trait]
impl proto::PageService for GrpcPageServiceHandler {
    type GetBaseBackupStream = Pin<
        Box> + Send>,
    >;

    type GetPagesStream =
        Pin> + Send>>;

    #[instrument(skip_all, fields(lsn))]
    async fn get_base_backup(
        &self,
        req: tonic::Request,
    ) -> Result, tonic::Status> {
        // Send chunks of 256 KB to avoid large memory allocations. pagebench basebackup shows this
        // to be the sweet spot where throughput is saturated.
        const CHUNK_SIZE: usize = 256 * 1024;

        let timeline = self.get_request_timeline_shard_zero(&req).await?;
        let ctx = self.ctx.with_scope_timeline(&timeline);

        // Validate the request and decorate the span.
        if timeline.is_archived() == Some(true) {
            return Err(tonic::Status::failed_precondition("timeline is archived"));
        }
        let req: page_api::GetBaseBackupRequest = req.into_inner().try_into()?;

        span_record!(lsn=?req.lsn);

        // Wait for the LSN to arrive, if given.
if let Some(lsn) = req.lsn { let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); timeline .wait_lsn( lsn, WaitLsnWaiter::PageService, WaitLsnTimeout::Default, &ctx, ) .await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .map_err(|err| { tonic::Status::invalid_argument(format!("invalid basebackup LSN: {err}")) })?; } // Spawn a task to run the basebackup. let span = Span::current(); let gate_guard = self .gate_guard .try_clone() .map_err(|_| tonic::Status::unavailable("shutting down"))?; let (mut simplex_read, mut simplex_write) = tokio::io::simplex(CHUNK_SIZE); let jh = tokio::spawn(async move { let _gate_guard = gate_guard; // keep gate open until task completes let gzip_level = match req.compression { page_api::BaseBackupCompression::None => None, // NB: using fast compression because it's on the critical path for compute // startup. For an empty database, we get <100KB with this method. The // Level::Best compression method gives us <20KB, but maybe we should add // basebackup caching on compute shutdown first. page_api::BaseBackupCompression::Gzip => Some(async_compression::Level::Fastest), }; // Check for a cached basebackup. let cached = timeline .get_cached_basebackup_if_enabled( req.lsn, None, req.full, req.replica, gzip_level.is_some(), ) .await; let result = if let Some(mut cached) = cached { // If we have a cached basebackup, send it. tokio::io::copy(&mut cached, &mut simplex_write) .await .map(|_| ()) .map_err(|err| BasebackupError::Client(err, "cached,copy")) } else { basebackup::send_basebackup_tarball( &mut simplex_write, &timeline, req.lsn, None, req.full, req.replica, gzip_level, &ctx, ) .instrument(span) // propagate request span .await }; simplex_write .shutdown() .await .map_err(|err| BasebackupError::Client(err, "simplex_write"))?; result }); // Emit chunks of size CHUNK_SIZE. let chunks = async_stream::try_stream! 
{ loop { let mut chunk = BytesMut::with_capacity(CHUNK_SIZE).limit(CHUNK_SIZE); loop { let n = simplex_read.read_buf(&mut chunk).await.map_err(|err| { tonic::Status::internal(format!("failed to read basebackup chunk: {err}")) })?; if n == 0 { break; // full chunk or closed stream } } let chunk = chunk.into_inner().freeze(); if chunk.is_empty() { break; } yield proto::GetBaseBackupResponseChunk::from(chunk); } // Wait for the basebackup task to exit and check for errors. jh.await.map_err(|err| { tonic::Status::internal(format!("basebackup failed: {err}")) })??; }; Ok(tonic::Response::new(Box::pin(chunks))) } #[instrument(skip_all, fields(db_oid, lsn))] async fn get_db_size( &self, req: tonic::Request, ) -> Result, tonic::Status> { let received_at = extract::(&req).0; let timeline = self.get_request_timeline_shard_zero(&req).await?; let ctx = self.ctx.with_scope_page_service_pagestream(&timeline); // Validate the request, decorate the span, and convert it to a Pagestream request. let req: page_api::GetDbSizeRequest = req.into_inner().try_into()?; span_record!(db_oid=%req.db_oid, lsn=%req.read_lsn); let req = PagestreamDbSizeRequest { hdr: Self::make_hdr(req.read_lsn, None), dbnode: req.db_oid, }; // Execute the request and convert the response. let _timer = Self::record_op_start_and_throttle( &timeline, metrics::SmgrQueryType::GetDbSize, received_at, ) .await?; let resp = PageServerHandler::handle_db_size_request(&timeline, &req, &ctx).await?; let resp = resp.db_size as page_api::GetDbSizeResponse; Ok(tonic::Response::new(resp.into())) } // NB: don't instrument this, instrument each streamed request. async fn get_pages( &self, req: tonic::Request>, ) -> Result, tonic::Status> { // Extract the timeline from the request and check that it exists. // // NB: during shard splits, the compute may still send requests to the parent shard. 
We'll // reroute requests to the child shards below, but we also detect the common cases here // where either the shard exists or no shards exist at all. If we have a child shard, we // can't acquire a weak handle because we don't know which child shard to use yet. let TenantTimelineId { tenant_id, timeline_id, } = *extract::(&req); let shard_index = *extract::(&req); let mut handles = TimelineHandles::new(self.tenant_manager.clone()); let timeline = match handles .get(tenant_id, timeline_id, ShardSelector::Known(shard_index)) .await { // The timeline shard exists. Keep a weak handle to reuse for each request. Ok(timeline) => Some(timeline.downgrade()), // The shard doesn't exist, but a child shard does. We'll reroute requests later. Err(_) if self.tenant_manager.has_child_shard(tenant_id, shard_index) => None, // Failed to fetch the timeline, and no child shard exists. Error out. Err(err) => return Err(err.into()), }; // Spawn an IoConcurrency sidecar, if enabled. let gate_guard = self .gate_guard .try_clone() .map_err(|_| tonic::Status::unavailable("shutting down"))?; let io_concurrency = IoConcurrency::spawn_from_conf(self.get_vectored_concurrent_io, gate_guard); // Construct the GetPageRequest stream handler. let span = Span::current(); let ctx = self.ctx.attached_child(); let cancel = self.cancel.clone(); let mut reqs = req.into_inner(); let resps = async_stream::try_stream! { loop { // Wait for the next client request. // // NB: Tonic considers the entire stream to be an in-flight request and will wait // for it to complete before shutting down. React to cancellation between requests. let req = tokio::select! 
{ biased; _ = cancel.cancelled() => Err(tonic::Status::unavailable("shutting down")), result = reqs.message() => match result { Ok(Some(req)) => Ok(req), Ok(None) => break, // client closed the stream Err(err) => Err(err), }, }?; let received_at = Instant::now(); let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default(); // Process the request, using a closure to capture errors. let process_request = async || { let req = page_api::GetPageRequest::try_from(req)?; // Fast path: use the pre-acquired timeline handle. if let Some(Ok(timeline)) = timeline.as_ref().map(|t| t.upgrade()) { return Self::get_page(&ctx, timeline, req, io_concurrency.clone(), received_at) .instrument(span.clone()) // propagate request span .await } // The timeline handle is stale. During shard splits, the compute may still be // sending requests to the parent shard. Try to re-route requests to the child // shards, and split any batch requests that straddle multiple child shards. Self::maybe_split_get_page( &ctx, &mut handles, tenant_id, timeline_id, shard_index, req, io_concurrency.clone(), received_at, ) .instrument(span.clone()) // propagate request span .await }; // Return the response. Convert per-request errors to GetPageResponses if // appropriate, or terminate the stream with a tonic::Status. yield match process_request().await { Ok(resp) => resp.into(), Err(status) => { // Log the error, since ObservabilityLayer won't see stream errors. // TODO: it would be nice if we could propagate the get_page() fields here. 
span.in_scope(|| { warn!("request failed with {:?}: {}", status.code(), status.message()); }); page_api::GetPageResponse::try_from_status(status, req_id)?.into() } } } }; Ok(tonic::Response::new(Box::pin(resps))) } #[instrument(skip_all, fields(rel, lsn, allow_missing))] async fn get_rel_size( &self, req: tonic::Request, ) -> Result, tonic::Status> { let received_at = extract::(&req).0; let timeline = self.get_request_timeline_shard_zero(&req).await?; let ctx = self.ctx.with_scope_page_service_pagestream(&timeline); // Validate the request, decorate the span, and convert it to a Pagestream request. let req: page_api::GetRelSizeRequest = req.into_inner().try_into()?; let allow_missing = req.allow_missing; span_record!(rel=%req.rel, lsn=%req.read_lsn, allow_missing=%req.allow_missing); let req = PagestreamNblocksRequest { hdr: Self::make_hdr(req.read_lsn, None), rel: req.rel, }; // Execute the request and convert the response. let _timer = Self::record_op_start_and_throttle( &timeline, metrics::SmgrQueryType::GetRelSize, received_at, ) .await?; let resp = PageServerHandler::handle_get_nblocks_request(&timeline, &req, allow_missing, &ctx) .await?; let resp: page_api::GetRelSizeResponse = resp.map(|resp| resp.n_blocks); Ok(tonic::Response::new(resp.into())) } #[instrument(skip_all, fields(kind, segno, lsn))] async fn get_slru_segment( &self, req: tonic::Request, ) -> Result, tonic::Status> { let received_at = extract::(&req).0; let timeline = self.get_request_timeline_shard_zero(&req).await?; let ctx = self.ctx.with_scope_page_service_pagestream(&timeline); // Validate the request, decorate the span, and convert it to a Pagestream request. let req: page_api::GetSlruSegmentRequest = req.into_inner().try_into()?; span_record!(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn); let req = PagestreamGetSlruSegmentRequest { hdr: Self::make_hdr(req.read_lsn, None), kind: req.kind as u8, segno: req.segno, }; // Execute the request and convert the response. 
let _timer = Self::record_op_start_and_throttle( &timeline, metrics::SmgrQueryType::GetSlruSegment, received_at, ) .await?; let resp = PageServerHandler::handle_get_slru_segment_request(&timeline, &req, &ctx).await?; let resp: page_api::GetSlruSegmentResponse = resp.segment; Ok(tonic::Response::new(resp.into())) } #[instrument(skip_all, fields(lsn))] async fn lease_lsn( &self, req: tonic::Request, ) -> Result, tonic::Status> { // TODO: this won't work during shard splits, as the request is directed at a specific shard // but the parent shard is removed before the split commits and the compute is notified // (which can take several minutes for large tenants). That's also the case for the libpq // implementation, so we keep the behavior for now. let timeline = self.get_request_timeline(&req).await?; let ctx = self.ctx.with_scope_timeline(&timeline); // Validate and convert the request, and decorate the span. let req: page_api::LeaseLsnRequest = req.into_inner().try_into()?; span_record!(lsn=%req.lsn); // Attempt to acquire a lease. Return FailedPrecondition if the lease could not be granted. let lease_length = timeline.get_lsn_lease_length(); let expires = match timeline.renew_lsn_lease(req.lsn, lease_length, &ctx) { Ok(lease) => lease.valid_until, Err(err) => return Err(tonic::Status::failed_precondition(format!("{err}"))), }; // TODO: is this spammy? Move it compute-side? info!( "acquired lease for {} until {}", req.lsn, chrono::DateTime::::from(expires).to_rfc3339() ); Ok(tonic::Response::new(expires.into())) } } /// gRPC middleware layer that handles observability concerns: /// /// * Creates and enters a tracing span. /// * Records the request start time as a ReceivedAt request extension. /// /// TODO: add perf tracing. /// TODO: add timing and metrics. /// TODO: add logging. 
#[derive(Clone)]
struct ObservabilityLayer;

impl tower::Layer for ObservabilityLayer {
    type Service = ObservabilityLayerService;

    /// Wraps `inner` in an `ObservabilityLayerService`.
    fn layer(&self, inner: S) -> Self::Service {
        Self::Service { inner }
    }
}

/// The service produced by `ObservabilityLayer`; wraps the inner gRPC service.
#[derive(Clone)]
struct ObservabilityLayerService {
    inner: S,
}

/// Request extension recording when `ObservabilityLayerService::call` received the request.
#[derive(Clone, Copy)]
struct ReceivedAt(Instant);

impl tonic::server::NamedService for ObservabilityLayerService {
    const NAME: &'static str = S::NAME; // propagate inner service name
}

impl tower::Service> for ObservabilityLayerService
where
    S: tower::Service, Response = http::Response> + Send,
    S::Future: Send + 'static,
{
    type Response = S::Response;
    type Error = S::Error;
    type Future = BoxFuture<'static, Result>;

    /// Stamps the request with `ReceivedAt`, creates the `grpc:pageservice` span, calls the inner
    /// service inside that span, and logs a warning if the response headers carry a non-OK gRPC
    /// status.
    fn call(&mut self, mut req: http::Request) -> Self::Future {
        // Record the request start time as a request extension.
        //
        // TODO: we should start a timer here instead, but it currently requires a timeline handle
        // and SmgrQueryType, which we don't have yet. Refactor it to provide it later.
        req.extensions_mut().insert(ReceivedAt(Instant::now()));

        // Extract the peer address and gRPC method.
        let peer = req
            .extensions()
            .get::()
            .and_then(|info| info.remote_addr())
            .map(|addr| addr.to_string())
            .unwrap_or_default();

        // The method is the third '/'-separated component of the URI path; fall back to the
        // whole path if there is no such component.
        let method = req
            .uri()
            .path()
            .split('/')
            .nth(2)
            .unwrap_or(req.uri().path())
            .to_string();

        // Create a basic tracing span.
        //
        // Enter the span for the current thread and instrument the future. It is not sufficient to
        // only instrument the future, since it only takes effect after the future is returned and
        // polled, not when the inner service is called below (e.g. during interceptor execution).
        let span = info_span!(
            "grpc:pageservice",
            // These will be populated by TenantMetadataInterceptor.
            tenant_id = field::Empty,
            timeline_id = field::Empty,
            shard_id = field::Empty,
            // NB: empty fields must be listed first above. Otherwise, the field names will be
            // clobbered when the empty fields are populated. They will be output last regardless.
            %peer,
            %method,
        );
        let _guard = span.enter();

        // Construct a future for calling the inner service, but don't await it. This avoids having
        // to clone the inner service into the future below.
        let call = self.inner.call(req);

        async move {
            // Await the inner service call.
            let result = call.await;

            // Log gRPC error statuses. This won't include request info from handler spans, but it
            // will catch all errors (even those emitted before handler spans are constructed). Only
            // unary request errors are logged here, not streaming response errors.
            if let Ok(ref resp) = result
                && let Some(status) = tonic::Status::from_header_map(resp.headers())
                && status.code() != tonic::Code::Ok
            {
                // TODO: it would be nice if we could propagate the handler span's request fields
                // here. This could e.g. be done by attaching the request fields to
                // tonic::Status::metadata via a proc macro.
                warn!(
                    "request failed with {:?}: {}",
                    status.code(),
                    status.message()
                );
            }

            result
        }
        .instrument(span.clone())
        .boxed()
    }

    /// Delegates readiness to the inner service.
    fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> {
        self.inner.poll_ready(cx)
    }
}

/// gRPC interceptor that decodes tenant metadata and stores it as request extensions of type
/// TenantTimelineId and ShardIndex.
#[derive(Clone)]
struct TenantMetadataInterceptor;

impl tonic::service::Interceptor for TenantMetadataInterceptor {
    /// Parses the `neon-tenant-id`, `neon-timeline-id` and `neon-shard-id` metadata entries.
    /// A missing or unparseable entry is rejected with `invalid_argument`.
    fn call(&mut self, mut req: tonic::Request<()>) -> Result, tonic::Status> {
        // Decode the tenant ID.
        let tenant_id = req
            .metadata()
            .get("neon-tenant-id")
            .ok_or_else(|| tonic::Status::invalid_argument("missing neon-tenant-id"))?
            .to_str()
            .map_err(|_| tonic::Status::invalid_argument("invalid neon-tenant-id"))?;
        let tenant_id = TenantId::from_str(tenant_id)
            .map_err(|_| tonic::Status::invalid_argument("invalid neon-tenant-id"))?;

        // Decode the timeline ID.
        let timeline_id = req
            .metadata()
            .get("neon-timeline-id")
            .ok_or_else(|| tonic::Status::invalid_argument("missing neon-timeline-id"))?
            .to_str()
            .map_err(|_| tonic::Status::invalid_argument("invalid neon-timeline-id"))?;
        let timeline_id = TimelineId::from_str(timeline_id)
            .map_err(|_| tonic::Status::invalid_argument("invalid neon-timeline-id"))?;

        // Decode the shard ID.
        let shard_id = req
            .metadata()
            .get("neon-shard-id")
            .ok_or_else(|| tonic::Status::invalid_argument("missing neon-shard-id"))?
            .to_str()
            .map_err(|_| tonic::Status::invalid_argument("invalid neon-shard-id"))?;
        let shard_id = ShardIndex::from_str(shard_id)
            .map_err(|_| tonic::Status::invalid_argument("invalid neon-shard-id"))?;

        // Stash them in the request.
        let extensions = req.extensions_mut();
        extensions.insert(TenantTimelineId::new(tenant_id, timeline_id));
        extensions.insert(shard_id);

        // Decorate the tracing span.
        span_record!(%tenant_id, %timeline_id, %shard_id);

        Ok(req)
    }
}

/// Authenticates gRPC page service requests.
#[derive(Clone)]
struct TenantAuthInterceptor {
    // When `None`, authentication is disabled and every request is let through.
    auth: Option>,
}

impl TenantAuthInterceptor {
    /// Creates the interceptor; pass `None` to disable authentication.
    fn new(auth: Option>) -> Self {
        Self { auth }
    }
}

impl tonic::service::Interceptor for TenantAuthInterceptor {
    /// Validates the `authorization: Bearer <jwt>` metadata entry against the request's tenant.
    ///
    /// A missing header yields `unauthenticated`, a malformed header or undecodable token yields
    /// `invalid_argument`, and a token that fails the permission check yields
    /// `permission_denied`.
    fn call(&mut self, req: tonic::Request<()>) -> Result, tonic::Status> {
        // Do nothing if auth is disabled.
        let Some(auth) = self.auth.as_ref() else {
            return Ok(req);
        };

        // Fetch the tenant ID from the request extensions (set by TenantMetadataInterceptor).
        let TenantTimelineId { tenant_id, .. } = *extract::(&req);

        // Fetch and decode the JWT token.
        let jwt = req
            .metadata()
            .get("authorization")
            .ok_or_else(|| tonic::Status::unauthenticated("no authorization header"))?
            .to_str()
            .map_err(|_| tonic::Status::invalid_argument("invalid authorization header"))?
            .strip_prefix("Bearer ")
            .ok_or_else(|| tonic::Status::invalid_argument("invalid authorization header"))?
            .trim();
        let jwtdata: TokenData = auth
            .decode(jwt)
            .map_err(|err| tonic::Status::invalid_argument(format!("invalid JWT token: {err}")))?;
        let claims = jwtdata.claims;

        // Check if the token is valid for this tenant.
        check_permission(&claims, Some(tenant_id))
            .map_err(|err| tonic::Status::permission_denied(err.to_string()))?;

        // TODO: consider stashing the claims in the request extensions, if needed.

        Ok(req)
    }
}

/// Extracts the given type from the request extensions, or panics if it is missing.
fn extract(req: &tonic::Request) -> &T {
    extract_from(req.extensions())
}

/// Extracts the given type from the request extensions, or panics if it is missing. This variant
/// can extract both from a tonic::Request and http::Request.
fn extract_from(ext: &http::Extensions) -> &T {
    let Some(value) = ext.get::() else {
        let name = std::any::type_name::();
        panic!("extension {name} should be set by middleware");
    };
    value
}

/// Error returned when looking up an active timeline: either the tenant lookup or the timeline
/// lookup failed.
#[derive(Debug, thiserror::Error)]
pub(crate) enum GetActiveTimelineError {
    #[error(transparent)]
    Tenant(GetActiveTenantError),
    #[error(transparent)]
    Timeline(#[from] GetTimelineError),
}

impl From for QueryError {
    /// Maps tenant cancellation to `Shutdown` and timeline lookup errors to `NotFound`; other
    /// tenant errors use the `GetActiveTenantError` conversion.
    fn from(e: GetActiveTimelineError) -> Self {
        match e {
            GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => QueryError::Shutdown,
            GetActiveTimelineError::Tenant(e) => e.into(),
            GetActiveTimelineError::Timeline(e) => QueryError::NotFound(format!("{e}").into()),
        }
    }
}

impl From for tonic::Status {
    /// Uses the status code of the wrapped error, with this error's own display string as the
    /// message.
    fn from(err: GetActiveTimelineError) -> Self {
        let message = err.to_string();
        let code = match err {
            GetActiveTimelineError::Tenant(err) => tonic::Status::from(err).code(),
            GetActiveTimelineError::Timeline(err) => tonic::Status::from(err).code(),
        };
        tonic::Status::new(code, message)
    }
}

impl From for tonic::Status {
    /// `NotFound` maps to the NotFound code; inactive or shutting-down timelines map to
    /// Unavailable.
    fn from(err: GetTimelineError) -> Self {
        use tonic::Code;
        let code = match &err {
            GetTimelineError::NotFound { .. } => Code::NotFound,
            GetTimelineError::NotActive { .. } => Code::Unavailable,
            GetTimelineError::ShuttingDown => Code::Unavailable,
        };
        tonic::Status::new(code, err.to_string())
    }
}

impl From for QueryError {
    /// Maps activation timeouts to a timed-out I/O disconnect, cancellation and stopping tenants
    /// to `Shutdown`, missing tenants to `NotFound`, and everything else to `Other`.
    fn from(e: GetActiveTenantError) -> Self {
        match e {
            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
            ),
            GetActiveTenantError::Cancelled
            | GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
                QueryError::Shutdown
            }
            e @ GetActiveTenantError::NotFound(_) => QueryError::NotFound(format!("{e}").into()),
            e => QueryError::Other(anyhow::anyhow!(e)),
        }
    }
}

impl From for tonic::Status {
    /// `Broken` maps to Internal and `NotFound` to NotFound; all other variants map to
    /// Unavailable.
    fn from(err: GetActiveTenantError) -> Self {
        use tonic::Code;
        let code = match &err {
            GetActiveTenantError::Broken(_) => Code::Internal,
            GetActiveTenantError::Cancelled => Code::Unavailable,
            GetActiveTenantError::NotFound(_) => Code::NotFound,
            GetActiveTenantError::SwitchedTenant => Code::Unavailable,
            GetActiveTenantError::WaitForActiveTimeout { .. } => Code::Unavailable,
            GetActiveTenantError::WillNotBecomeActive(_) => Code::Unavailable,
        };
        tonic::Status::new(code, err.to_string())
    }
}

impl From for QueryError {
    fn from(e: HandleUpgradeError) -> Self {
        match e {
            HandleUpgradeError::ShutDown => QueryError::Shutdown,
        }
    }
}

impl From for tonic::Status {
    fn from(err: HandleUpgradeError) -> Self {
        match err {
            HandleUpgradeError::ShutDown => tonic::Status::unavailable("timeline is shutting down"),
        }
    }
}

/// Records the timeline's shard slug in the `shard_id` field of the current tracing span.
///
/// In debug builds, asserts that the span has tenant and timeline IDs but no shard ID before the
/// field is recorded, and that it has the expected fields afterwards.
fn set_tracing_field_shard_id(timeline: &Timeline) {
    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
    tracing::Span::current().record(
        "shard_id",
        tracing::field::display(timeline.tenant_shard_id.shard_slug()),
    );
    debug_assert_current_span_has_tenant_and_timeline_id();
}

/// Newtype around an `Lsn`; going by the name, it marks an LSN that has already been waited for.
struct WaitedForLsn(Lsn);
impl From for Lsn {
    fn from(WaitedForLsn(lsn): WaitedForLsn) -> Self {
        lsn
    }
}

#[cfg(test)]
mod tests {
    use utils::shard::ShardCount;

    use super::*;

    /// Checks that well-formed command strings parse into the expected `PageServiceCmd` values.
    #[test]
    fn pageservice_cmd_parse() {
        let tenant_id = TenantId::generate();
        let timeline_id = TimelineId::generate();
        let cmd =
            PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id} {timeline_id}")).unwrap();
        assert_eq!(
            cmd,
            PageServiceCmd::PageStream(PageStreamCmd {
                tenant_id,
                timeline_id,
                protocol_version: PagestreamProtocolVersion::V2,
            })
        );
        let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id}")).unwrap();
        assert_eq!(
            cmd,
            PageServiceCmd::BaseBackup(BaseBackupCmd {
                tenant_id,
                timeline_id,
                lsn: None,
                gzip: false,
                replica: false
            })
        );
        let cmd =
            PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} --gzip")).unwrap();
        assert_eq!(
            cmd,
            PageServiceCmd::BaseBackup(BaseBackupCmd {
                tenant_id,
                timeline_id,
                lsn: None,
                gzip: true,
                replica: false
            })
        );
        // "latest" in the LSN position parses to `lsn: None`.
        let cmd =
            PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} latest")).unwrap();
        assert_eq!(
            cmd,
            PageServiceCmd::BaseBackup(BaseBackupCmd {
                tenant_id,
                timeline_id,
                lsn: None,
                gzip: false,
                replica: false
            })
        );
        let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} 0/16ABCDE"))
            .unwrap();
        assert_eq!(
            cmd,
            PageServiceCmd::BaseBackup(BaseBackupCmd {
                tenant_id,
                timeline_id,
                lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
                gzip: false,
                replica: false
            })
        );
        let cmd = PageServiceCmd::parse(&format!(
            "basebackup {tenant_id} {timeline_id} --replica --gzip"
        ))
        .unwrap();
        assert_eq!(
            cmd,
            PageServiceCmd::BaseBackup(BaseBackupCmd {
                tenant_id,
                timeline_id,
                lsn: None,
                gzip: true,
                replica: true
            })
        );
        let cmd = PageServiceCmd::parse(&format!(
            "basebackup {tenant_id} {timeline_id} 0/16ABCDE --replica --gzip"
        ))
        .unwrap();
        assert_eq!(
            cmd,
            PageServiceCmd::BaseBackup(BaseBackupCmd {
                tenant_id,
                timeline_id,
                lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
                gzip: true,
                replica: true
            })
        );
        let cmd = PageServiceCmd::parse(&format!("fullbackup {tenant_id} {timeline_id}")).unwrap();
        assert_eq!(
            cmd,
            PageServiceCmd::FullBackup(FullBackupCmd {
                tenant_id,
                timeline_id,
                lsn: None,
                prev_lsn: None
            })
        );
        let cmd = PageServiceCmd::parse(&format!(
            "fullbackup {tenant_id} {timeline_id} 0/16ABCDE 0/16ABCDF"
        ))
        .unwrap();
        assert_eq!(
            cmd,
            PageServiceCmd::FullBackup(FullBackupCmd {
                tenant_id,
                timeline_id,
                lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
                prev_lsn: Some(Lsn::from_str("0/16ABCDF").unwrap()),
            })
        );
        // Lease commands take a tenant shard ID; cover both the unsharded and a sharded form.
        let tenant_shard_id = TenantShardId::unsharded(tenant_id);
        let cmd = PageServiceCmd::parse(&format!(
            "lease lsn {tenant_shard_id} {timeline_id} 0/16ABCDE"
        ))
        .unwrap();
        assert_eq!(
            cmd,
            PageServiceCmd::LeaseLsn(LeaseLsnCmd {
                tenant_shard_id,
                timeline_id,
                lsn: Lsn::from_str("0/16ABCDE").unwrap(),
            })
        );
        let tenant_shard_id = TenantShardId::split(&tenant_shard_id, ShardCount(8))[1];
        let cmd = PageServiceCmd::parse(&format!(
            "lease lsn {tenant_shard_id} {timeline_id} 0/16ABCDE"
        ))
        .unwrap();
        assert_eq!(
            cmd,
            PageServiceCmd::LeaseLsn(LeaseLsnCmd {
                tenant_shard_id,
                timeline_id,
                lsn: Lsn::from_str("0/16ABCDE").unwrap(),
            })
        );
        let cmd = PageServiceCmd::parse("set a = b").unwrap();
        assert_eq!(cmd, PageServiceCmd::Set);
        let cmd = PageServiceCmd::parse("SET foo").unwrap();
        assert_eq!(cmd, PageServiceCmd::Set);
    }

    /// Checks that malformed command strings are rejected with an error.
    #[test]
    fn pageservice_cmd_err_handling() {
        let tenant_id = TenantId::generate();
        let timeline_id = TimelineId::generate();
        let cmd = PageServiceCmd::parse("unknown_command");
        assert!(cmd.is_err());
        let cmd = PageServiceCmd::parse("pagestream_v2");
        assert!(cmd.is_err());
        let cmd = PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id}xxx"));
        assert!(cmd.is_err());
        let cmd = PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id}xxx {timeline_id}xxx"));
        assert!(cmd.is_err());
        let cmd = PageServiceCmd::parse(&format!(
            "basebackup {tenant_id} {timeline_id} --gzip --gzip"
        ));
        assert!(cmd.is_err());
        let cmd = PageServiceCmd::parse(&format!(
            "basebackup {tenant_id} {timeline_id} --gzip --unknown"
        ));
        assert!(cmd.is_err());
        let cmd = PageServiceCmd::parse(&format!(
            "basebackup {tenant_id} {timeline_id} --gzip 0/16ABCDE"
        ));
        assert!(cmd.is_err());
        let cmd = PageServiceCmd::parse(&format!("lease {tenant_id} {timeline_id} gzip 0/16ABCDE"));
        assert!(cmd.is_err());
    }

    /// Checks `parse_options` on valid `-c key=value` option strings and on malformed ones.
    #[test]
    fn test_parse_options() {
        let (config, has_error) = parse_options(" -c neon.compute_mode=primary ");
        assert!(!has_error);
        assert_eq!(
            config,
            vec![("neon.compute_mode".to_string(), "primary".to_string())]
        );

        let (config, has_error) = parse_options(" -c neon.compute_mode=primary -c foo=bar ");
        assert!(!has_error);
        assert_eq!(
            config,
            vec![
                ("neon.compute_mode".to_string(), "primary".to_string()),
                ("foo".to_string(), "bar".to_string()),
            ]
        );

        let (config, has_error) = parse_options(" -c neon.compute_mode=primary -cfoo=bar");
        assert!(!has_error);
        assert_eq!(
            config,
            vec![
                ("neon.compute_mode".to_string(), "primary".to_string()),
                ("foo".to_string(), "bar".to_string()),
            ]
        );

        let (_, has_error) = parse_options("-c");
        assert!(has_error);

        let (_, has_error) = parse_options("-c foo=bar -c -c");
        assert!(has_error);

        let (_, has_error) = parse_options(" ");
        assert!(!has_error);

        let (_, has_error) = parse_options(" -c neon.compute_mode");
        assert!(has_error);
    }
}

================================================
FILE: pageserver/src/pgdatadir_mapping.rs
================================================
//!
//! This provides an abstraction to store PostgreSQL relations and other files
//! in the key-value store that implements the Repository interface.
//!
//! (TODO: The line between PUT-functions here and walingest.rs is a bit blurry, as
//! walingest.rs handles a few things like implicit relation creation and extension.
//! Clarify that)
//!
use std::collections::{BTreeSet, HashMap, HashSet, hash_map};
use std::ops::{ControlFlow, Range};
use std::sync::Arc;

use crate::walingest::{WalIngestError, WalIngestErrorKind};
use crate::{PERF_TRACE_TARGET, ensure_walingest};
use anyhow::Context;
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
use pageserver_api::key::{
    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists,
    TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range,
    rel_size_to_key, rel_tag_sparse_key, rel_tag_sparse_key_range, relmap_file_key,
    repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
};
use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace};
use pageserver_api::models::RelSizeMigration;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::{BLCKSZ, PgMajorVersion, TransactionId};
use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi_types::{Oid, RepOriginId, TimestampTz};
use serde::{Deserialize, Serialize};
use strum::IntoEnumIterator;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, info_span, trace, warn};
use utils::bin_ser::{BeSer, DeserializeError};
use utils::lsn::Lsn;
use utils::pausable_failpoint;
use wal_decoder::models::record::NeonWalRecord;
use wal_decoder::models::value::Value;
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};

use super::tenant::{PageReconstructError, Timeline};
use crate::aux_file;
use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::metrics::{
    RELSIZE_CACHE_MISSES_OLD, RELSIZE_LATEST_CACHE_ENTRIES, RELSIZE_LATEST_CACHE_HITS,
    RELSIZE_LATEST_CACHE_MISSES, RELSIZE_SNAPSHOT_CACHE_ENTRIES, RELSIZE_SNAPSHOT_CACHE_HITS,
    RELSIZE_SNAPSHOT_CACHE_MISSES,
};
use crate::span::{
    debug_assert_current_span_has_tenant_and_timeline_id,
    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
};
use crate::tenant::storage_layer::IoConcurrency;
use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};

/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
pub const MAX_AUX_FILE_DELTAS: usize = 1024;

/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached.
pub const MAX_AUX_FILE_V2_DELTAS: usize = 16;

/// Outcome of resolving a timestamp to an LSN. Every variant carries an LSN; what that LSN
/// means relative to the queried timestamp differs per variant, as described below.
#[derive(Debug)]
pub enum LsnForTimestamp {
    /// Found commits both before and after the given timestamp
    Present(Lsn),

    /// Found no commits after the given timestamp, this means
    /// that the newest data in the branch is older than the given
    /// timestamp.
    ///
    /// All commits <= LSN happened before the given timestamp
    Future(Lsn),

    /// The queried timestamp is past our horizon we look back at (PITR)
    ///
    /// All commits > LSN happened after the given timestamp,
    /// but any commits < LSN might have happened before or after
    /// the given timestamp. We don't know because no data before
    /// the given lsn is available.
    Past(Lsn),

    /// We have found no commit with a timestamp,
    /// so we can't return anything meaningful.
    ///
    /// The associated LSN is the lower bound value we can safely
    /// create branches on, but no statement is made if it is
    /// older or newer than the timestamp.
    ///
    /// This variant can e.g. be returned right after a
    /// cluster import.
    NoData(Lsn),
}

/// Each request to the page server contains an LSN range: `not_modified_since..request_lsn`.
/// See the comments in libs/pageserver_api/src/models.rs.
/// Based on this range and `last_record_lsn`, the PS calculates `effective_lsn`.
/// But to distinguish requests from primaries and replicas we also need to pass `request_lsn`.
#[derive(Debug, Clone, Copy, Default)] pub struct LsnRange { pub effective_lsn: Lsn, pub request_lsn: Lsn, } impl LsnRange { pub fn at(lsn: Lsn) -> LsnRange { LsnRange { effective_lsn: lsn, request_lsn: lsn, } } pub fn is_latest(&self) -> bool { self.request_lsn == Lsn::MAX } } #[derive(Debug, thiserror::Error)] pub(crate) enum CalculateLogicalSizeError { #[error("cancelled")] Cancelled, /// Something went wrong while reading the metadata we use to calculate logical size /// Note that cancellation variants of `PageReconstructError` are transformed to [`Self::Cancelled`] /// in the `From` implementation for this variant. #[error(transparent)] PageRead(PageReconstructError), /// Something went wrong deserializing metadata that we read to calculate logical size #[error("decode error: {0}")] Decode(#[from] DeserializeError), } #[derive(Debug, thiserror::Error)] pub(crate) enum CollectKeySpaceError { #[error(transparent)] Decode(#[from] DeserializeError), #[error(transparent)] PageRead(PageReconstructError), #[error("cancelled")] Cancelled, } impl CollectKeySpaceError { pub(crate) fn is_cancel(&self) -> bool { match self { CollectKeySpaceError::Decode(_) => false, CollectKeySpaceError::PageRead(e) => e.is_cancel(), CollectKeySpaceError::Cancelled => true, } } pub(crate) fn into_anyhow(self) -> anyhow::Error { match self { CollectKeySpaceError::Decode(e) => anyhow::Error::new(e), CollectKeySpaceError::PageRead(e) => anyhow::Error::new(e), CollectKeySpaceError::Cancelled => anyhow::Error::new(self), } } } impl From for CollectKeySpaceError { fn from(err: PageReconstructError) -> Self { match err { PageReconstructError::Cancelled => Self::Cancelled, err => Self::PageRead(err), } } } impl From for CalculateLogicalSizeError { fn from(pre: PageReconstructError) -> Self { match pre { PageReconstructError::Cancelled => Self::Cancelled, _ => Self::PageRead(pre), } } } #[derive(Debug, thiserror::Error)] pub enum RelationError { #[error("invalid relnode")] InvalidRelnode, } /// 
/// This impl provides all the functionality to store PostgreSQL relations, SLRUs, /// and other special kinds of files, in a versioned key-value store. The /// Timeline struct provides the key-value store. /// /// This is a separate impl, so that we can easily include all these functions in a Timeline /// implementation, and might be moved into a separate struct later. impl Timeline { /// Start ingesting a WAL record, or other atomic modification of /// the timeline. /// /// This provides a transaction-like interface to perform a bunch /// of modifications atomically. /// /// To ingest a WAL record, call begin_modification(lsn) to get a /// DatadirModification object. Use the functions in the object to /// modify the repository state, updating all the pages and metadata /// that the WAL record affects. When you're done, call commit() to /// commit the changes. /// /// Lsn stored in modification is advanced by `ingest_record` and /// is used by `commit()` to update `last_record_lsn`. /// /// Calling commit() will flush all the changes and reset the state, /// so the `DatadirModification` struct can be reused to perform the next modification. /// /// Note that any pending modifications you make through the /// modification object won't be visible to calls to the 'get' and list /// functions of the timeline until you finish! And if you update the /// same page twice, the last update wins. 
///
/// Both `begin_modification` variants start from the same empty pending state and
/// differ only in the `is_importing_pgdata` flag.
pub fn begin_modification(&self, lsn: Lsn) -> DatadirModification
where
    Self: Sized,
{
    self.begin_modification_inner(lsn, false)
}

/// Like [`Self::begin_modification`], but the returned modification has
/// `is_importing_pgdata` set to `true`.
pub fn begin_modification_for_import(&self, lsn: Lsn) -> DatadirModification
where
    Self: Sized,
{
    self.begin_modification_inner(lsn, true)
}

/// Build an empty [`DatadirModification`] positioned at `lsn`.
fn begin_modification_inner(&self, lsn: Lsn, is_importing_pgdata: bool) -> DatadirModification {
    DatadirModification {
        tline: self,
        pending_lsns: Vec::new(),
        pending_metadata_pages: HashMap::new(),
        pending_data_batch: None,
        pending_deletions: Vec::new(),
        pending_nblocks: 0,
        pending_directory_entries: Vec::new(),
        pending_metadata_bytes: 0,
        is_importing_pgdata,
        lsn,
    }
}

//------------------------------------------------------------------------------
// Public GET functions
//------------------------------------------------------------------------------

/// Look up given page version.
///
/// For `Version::LsnRange` the request is served as a batch of one through
/// [`Self::get_rel_page_at_lsn_batched`]. For `Version::Modified` the page is read
/// through the modification; a block number at or past the relation size yields an
/// all-zeros page.
///
/// Returns `RelationError::InvalidRelnode` (wrapped in `PageReconstructError::Other`)
/// if `tag.relnode == 0`.
pub(crate) async fn get_rel_page_at_lsn(
    &self,
    tag: RelTag,
    blknum: BlockNumber,
    version: Version<'_>,
    ctx: &RequestContext,
    io_concurrency: IoConcurrency,
) -> Result<Bytes, PageReconstructError> {
    match version {
        Version::LsnRange(lsns) => {
            let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
            let res = self
                .get_rel_page_at_lsn_batched(
                    pages
                        .iter()
                        .map(|(tag, blknum)| (tag, blknum, lsns, ctx.attached_child())),
                    io_concurrency.clone(),
                    ctx,
                )
                .await;
            // One page in, one result out.
            assert_eq!(res.len(), 1);
            res.into_iter().next().unwrap()
        }
        Version::Modified(modification) => {
            if tag.relnode == 0 {
                return Err(PageReconstructError::Other(
                    RelationError::InvalidRelnode.into(),
                ));
            }

            let nblocks = self.get_rel_size(tag, version, ctx).await?;
            if blknum >= nblocks {
                debug!(
                    "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
                    tag,
                    blknum,
                    version.get_lsn(),
                    nblocks
                );
                return Ok(ZERO_PAGE.clone());
            }

            let key = rel_block_to_key(tag, blknum);
            modification.get(key, ctx).await
        }
    }
}

/// Like [`Self::get_rel_page_at_lsn`], but returns
/// a batch of pages.
///
/// The ordering of the returned vec corresponds to the ordering of `pages`.
///
/// NB: the read path must be cancellation-safe. The Tonic gRPC service will drop the future
/// if the client goes away (e.g. due to timeout or cancellation).
/// TODO: verify that it actually is cancellation-safe.
///
/// Each element of `pages` is destructured below as `(tag, blknum, lsns, ctx)`. Requests that can
/// be answered without a page read (invalid relnode, rel size lookup failure, read beyond EOF) are
/// answered immediately; the remaining keys are grouped by effective LSN and fetched with a single
/// `get_vectored` call.
// NOTE(review): several generic argument lists in this function (the `ExactSizeIterator` item type,
// the return type, the two `HashMap`s and the `sum` turbofish) appear to have been lost from this
// copy of the file -- confirm against the upstream source before compiling.
pub(crate) async fn get_rel_page_at_lsn_batched(
    &self,
    pages: impl ExactSizeIterator,
    io_concurrency: IoConcurrency,
    ctx: &RequestContext,
) -> Vec> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    // Number of entries of `result_slots` that have been written so far. Must equal
    // `page_count` before `set_len` is called at the bottom.
    let mut slots_filled = 0;
    let page_count = pages.len();

    // Would be nice to use smallvec here but it doesn't provide the spare_capacity_mut() API.
    let mut result = Vec::with_capacity(pages.len());
    let result_slots = result.spare_capacity_mut();

    // For every key that needs a page read: the response slots (and their request contexts)
    // waiting for that key.
    let mut keys_slots: HashMap> = HashMap::with_capacity(pages.len());

    // Keys to read, accumulated per effective LSN.
    let mut req_keyspaces: HashMap = HashMap::with_capacity(pages.len());

    for (response_slot_idx, (tag, blknum, lsns, ctx)) in pages.enumerate() {
        if tag.relnode == 0 {
            result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
                RelationError::InvalidRelnode.into(),
            )));

            slots_filled += 1;
            continue;
        }
        let lsn = lsns.effective_lsn;
        let nblocks = {
            let ctx = RequestContextBuilder::from(&ctx)
                .perf_span(|crnt_perf_span| {
                    info_span!(
                        target: PERF_TRACE_TARGET,
                        parent: crnt_perf_span,
                        "GET_REL_SIZE",
                        reltag=%tag,
                        lsn=%lsn,
                    )
                })
                .attached_child();

            match self
                .get_rel_size(*tag, Version::LsnRange(lsns), &ctx)
                .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                .await
            {
                Ok(nblocks) => nblocks,
                Err(err) => {
                    // A failed size lookup only fails this one request, not the batch.
                    result_slots[response_slot_idx].write(Err(err));
                    slots_filled += 1;
                    continue;
                }
            }
        };

        if *blknum >= nblocks {
            debug!(
                "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
                tag, blknum, lsn, nblocks
            );
            result_slots[response_slot_idx].write(Ok(ZERO_PAGE.clone()));
            slots_filled += 1;
            continue;
        }

        let key = rel_block_to_key(*tag, *blknum);

        let ctx = RequestContextBuilder::from(&ctx)
            .perf_span(|crnt_perf_span| {
                info_span!(
                    target: PERF_TRACE_TARGET,
                    parent: crnt_perf_span,
                    "GET_BATCH",
                    batch_size = %page_count,
                )
            })
            .attached_child();

        let key_slots = keys_slots.entry(key).or_default();
        key_slots.push((response_slot_idx, ctx));

        let acc = req_keyspaces.entry(lsn).or_default();
        acc.add_key(key);
    }

    let query: Vec<(Lsn, KeySpace)> = req_keyspaces
        .into_iter()
        .map(|(lsn, acc)| (lsn, acc.to_keyspace()))
        .collect();

    let query = VersionedKeySpaceQuery::scattered(query);
    let res = self
        .get_vectored(query, io_concurrency, ctx)
        .maybe_perf_instrument(ctx, |current_perf_span| current_perf_span.clone())
        .await;

    match res {
        Ok(results) => {
            for (key, res) in results {
                let mut key_slots = keys_slots.remove(&key).unwrap().into_iter();
                // The first waiter receives the original result by value (written after the
                // loop below); every additional waiter for the same key receives a clone.
                let (first_slot, first_req_ctx) = key_slots.next().unwrap();

                for (slot, req_ctx) in key_slots {
                    let clone = match &res {
                        Ok(buf) => Ok(buf.clone()),
                        Err(err) => Err(match err {
                            PageReconstructError::Cancelled => PageReconstructError::Cancelled,

                            x @ PageReconstructError::Other(_)
                            | x @ PageReconstructError::AncestorLsnTimeout(_)
                            | x @ PageReconstructError::WalRedo(_)
                            | x @ PageReconstructError::MissingKey(_) => {
                                PageReconstructError::Other(anyhow::anyhow!(
                                    "there was more than one request for this key in the batch, error logged once: {x:?}"
                                ))
                            }
                        }),
                    };

                    result_slots[slot].write(clone);
                    // There is no standardized way to express that the batched span followed from N request spans.
                    // So, abuse the system and mark the request contexts as follows_from the batch span, so we get
                    // some linkage in our trace viewer. It allows us to answer: which GET_VECTORED did this GET_PAGE wait for.
                    req_ctx.perf_follows_from(ctx);
                    slots_filled += 1;
                }

                result_slots[first_slot].write(res);
                first_req_ctx.perf_follows_from(ctx);
                slots_filled += 1;
            }
        }
        Err(err) => {
            // this cannot really happen because get_vectored only errors globally on invalid LSN or too large batch size
            // (We enforce the max batch size outside of this function, in the code that constructs the batch request.)
            for (slot, req_ctx) in keys_slots.values().flatten() {
                // this whole `match` is a lot like `From<GetVectoredError> for PageReconstructError`
                // but without taking ownership of the GetVectoredError
                let err = match &err {
                    GetVectoredError::Cancelled => Err(PageReconstructError::Cancelled),
                    // TODO: restructure get_vectored API to make this error per-key
                    GetVectoredError::MissingKey(err) => {
                        Err(PageReconstructError::Other(anyhow::anyhow!(
                            "whole vectored get request failed because one or more of the requested keys were missing: {err:?}"
                        )))
                    }
                    // TODO: restructure get_vectored API to make this error per-key
                    GetVectoredError::GetReadyAncestorError(err) => {
                        Err(PageReconstructError::Other(anyhow::anyhow!(
                            "whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}"
                        )))
                    }
                    // TODO: restructure get_vectored API to make this error per-key
                    GetVectoredError::Other(err) => Err(PageReconstructError::Other(
                        anyhow::anyhow!("whole vectored get request failed: {err:?}"),
                    )),
                    // TODO: we can prevent this error class by moving this check into the type system
                    GetVectoredError::InvalidLsn(e) => {
                        Err(anyhow::anyhow!("invalid LSN: {e:?}").into())
                    }
                    // NB: this should never happen in practice because we limit batch size to be smaller than max_get_vectored_keys
                    // TODO: we can prevent this error class by moving this check into the type system
                    GetVectoredError::Oversized(err, max) => {
                        Err(anyhow::anyhow!("batching oversized: {err} > {max}").into())
                    }
                };

                req_ctx.perf_follows_from(ctx);
                result_slots[*slot].write(err);
            }

            // Every slot still registered in `keys_slots` was written by the loop above.
            slots_filled += keys_slots.values().map(|slots| slots.len()).sum::();
        }
    };

    // Guard for the `set_len` below: every one of the `page_count` slots must have been written.
    assert_eq!(slots_filled, page_count);
    // SAFETY:
    // 1. `result` and any of its uninit members are not read from until this point
    // 2. The length below is tracked at run-time and matches the number of requested pages.
    unsafe {
        result.set_len(page_count);
    }

    result
}

/// Get size of a database in blocks. This is only accurate on shard 0. It will undercount on
/// other shards, by only accounting for relations the shard has pages for, and only accounting
/// for pages up to the highest page number it has stored.
///
/// Returns 0 without reading the rel directory if the database has no relations.
pub(crate) async fn get_db_size(
    &self,
    spcnode: Oid,
    dbnode: Oid,
    version: Version<'_>,
    ctx: &RequestContext,
) -> Result {
    let mut total_blocks = 0;

    let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;

    if rels.is_empty() {
        return Ok(0);
    }

    // Pre-deserialize the rel directory to avoid duplicated work in `get_relsize_cached`.
    let reldir_key = rel_dir_to_key(spcnode, dbnode);
    let buf = version.get(self, reldir_key, ctx).await?;
    let reldir = RelDirectory::des(&buf)?;

    for rel in rels {
        let n_blocks = self
            .get_rel_size_in_reldir(rel, version, Some((reldir_key, &reldir)), false, ctx)
            .await?
            // `get_rel_size_in_reldir` documents that it never returns None when allow_missing=false.
            .expect("allow_missing=false");
        total_blocks += n_blocks as usize;
    }
    Ok(total_blocks)
}

/// Get size of a relation file. The relation must exist, otherwise an error is returned.
///
/// This is only accurate on shard 0. On other shards, it will return the size up to the highest
/// page number stored in the shard.
pub(crate) async fn get_rel_size(
    &self,
    tag: RelTag,
    version: Version<'_>,
    ctx: &RequestContext,
) -> Result {
    Ok(self
        .get_rel_size_in_reldir(tag, version, None, false, ctx)
        .await?
        // `get_rel_size_in_reldir` documents that it never returns None when allow_missing=false.
        .expect("allow_missing=false"))
}

/// Get size of a relation file. If `allow_missing` is true, returns None for missing relations,
/// otherwise errors.
///
/// INVARIANT: never returns None if `allow_missing=false`.
///
/// See [`Self::get_rel_exists_in_reldir`] on why we need `deserialized_reldir_v1`.
pub(crate) async fn get_rel_size_in_reldir( &self, tag: RelTag, version: Version<'_>, deserialized_reldir_v1: Option<(Key, &RelDirectory)>, allow_missing: bool, ctx: &RequestContext, ) -> Result, PageReconstructError> { if tag.relnode == 0 { return Err(PageReconstructError::Other( RelationError::InvalidRelnode.into(), )); } if let Some(nblocks) = self.get_cached_rel_size(&tag, version) { return Ok(Some(nblocks)); } if allow_missing && !self .get_rel_exists_in_reldir(tag, version, deserialized_reldir_v1, ctx) .await? { return Ok(None); } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) && !self .get_rel_exists_in_reldir(tag, version, deserialized_reldir_v1, ctx) .await? { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, // without extending it. Tolerate that by claiming that // any non-existent FSM fork has size 0. return Ok(Some(0)); } let key = rel_size_to_key(tag); let mut buf = version.get(self, key, ctx).await?; let nblocks = buf.get_u32_le(); self.update_cached_rel_size(tag, version, nblocks); Ok(Some(nblocks)) } /// Does the relation exist? /// /// Only shard 0 has a full view of the relations. Other shards only know about relations that /// the shard stores pages for. 
///
pub(crate) async fn get_rel_exists(
    &self,
    tag: RelTag,
    version: Version<'_>,
    ctx: &RequestContext,
) -> Result<bool, PageReconstructError> {
    // No pre-deserialized rel directory is available on this path.
    self.get_rel_exists_in_reldir(tag, version, None, ctx).await
}

/// Check for the relation in the v1 rel directory of its database.
///
/// If `deserialized_reldir_v1` is provided and its key matches the rel directory key of `tag`,
/// it is used directly. On a key mismatch this panics in test/testing builds, and otherwise
/// logs a warning and falls back to reading the directory.
async fn get_rel_exists_in_reldir_v1(
    &self,
    tag: RelTag,
    version: Version<'_>,
    deserialized_reldir_v1: Option<(Key, &RelDirectory)>,
    ctx: &RequestContext,
) -> Result<bool, PageReconstructError> {
    let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
    if let Some((cached_key, dir)) = deserialized_reldir_v1 {
        if cached_key == key {
            return Ok(dir.rels.contains(&(tag.relnode, tag.forknum)));
        } else if cfg!(test) || cfg!(feature = "testing") {
            panic!("cached reldir key mismatch: {cached_key} != {key}");
        } else {
            warn!("cached reldir key mismatch: {cached_key} != {key}");
        }
        // Fallback to reading the directory from the datadir.
    }

    let buf = version.get(self, key, ctx).await?;

    let dir = RelDirectory::des(&buf)?;
    Ok(dir.rels.contains(&(tag.relnode, tag.forknum)))
}

/// Check for the relation in the v2 (sparse keyspace) rel directory.
///
/// The relation exists iff the value stored under its sparse key decodes to
/// `RelDirExists::Exists`. A value that fails to decode is reported as
/// `PageReconstructError::Other`.
async fn get_rel_exists_in_reldir_v2(
    &self,
    tag: RelTag,
    version: Version<'_>,
    ctx: &RequestContext,
) -> Result<bool, PageReconstructError> {
    let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum);
    let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?).map_err(
        |_| {
            PageReconstructError::Other(anyhow::anyhow!(
                "invalid reldir key: decode failed, {}",
                key
            ))
        },
    )?;
    let exists_v2 = buf == RelDirExists::Exists;
    Ok(exists_v2)
}

/// Does the relation exist? With a cached deserialized `RelDirectory`.
///
/// There are some cases where the caller loops across all relations. In that specific case,
/// the caller should obtain the deserialized `RelDirectory` first and then call this function
/// to avoid duplicated work of deserialization. This is a hack and should be removed by introducing
/// a new API (e.g., `get_rel_exists_batched`).
// NOTE(review): several generic argument lists in the signatures and local type annotations below
// (e.g. `Result`, `HashSet`, `Vec`, `Option`, `ControlFlow`) appear to have been lost from this
// copy of the file -- confirm against the upstream source before compiling.
pub(crate) async fn get_rel_exists_in_reldir(
    &self,
    tag: RelTag,
    version: Version<'_>,
    deserialized_reldir_v1: Option<(Key, &RelDirectory)>,
    ctx: &RequestContext,
) -> Result {
    if tag.relnode == 0 {
        return Err(PageReconstructError::Other(
            RelationError::InvalidRelnode.into(),
        ));
    }

    // first try to lookup relation in cache
    if let Some(_nblocks) = self.get_cached_rel_size(&tag, version) {
        return Ok(true);
    }
    // then check if the database was already initialized.
    // get_rel_exists can be called before dbdir is created.
    let buf = version.get(self, DBDIR_KEY, ctx).await?;
    let dbdirs = DbDirectory::des(&buf)?.dbdirs;
    if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) {
        return Ok(false);
    }

    let (v2_status, migrated_lsn) = self.get_rel_size_v2_status();

    match v2_status {
        RelSizeMigration::Legacy => {
            let v1_exists = self
                .get_rel_exists_in_reldir_v1(tag, version, deserialized_reldir_v1, ctx)
                .await?;
            Ok(v1_exists)
        }
        RelSizeMigration::Migrating | RelSizeMigration::Migrated
            if version.get_lsn() < migrated_lsn.unwrap_or(Lsn(0)) =>
        {
            // For requests below the migrated LSN, we still use the v1 read path.
            let v1_exists = self
                .get_rel_exists_in_reldir_v1(tag, version, deserialized_reldir_v1, ctx)
                .await?;
            Ok(v1_exists)
        }
        RelSizeMigration::Migrating => {
            // While migrating, v1 stays authoritative; v2 is read only to log discrepancies.
            let v1_exists = self
                .get_rel_exists_in_reldir_v1(tag, version, deserialized_reldir_v1, ctx)
                .await?;
            let v2_exists_res = self.get_rel_exists_in_reldir_v2(tag, version, ctx).await;
            match v2_exists_res {
                Ok(v2_exists) if v1_exists == v2_exists => {}
                Ok(v2_exists) => {
                    tracing::warn!(
                        "inconsistent v1/v2 reldir keyspace for rel {}: v1_exists={}, v2_exists={}",
                        tag,
                        v1_exists,
                        v2_exists
                    );
                }
                Err(e) => {
                    tracing::warn!("failed to get rel exists in v2: {e}");
                }
            }
            Ok(v1_exists)
        }
        RelSizeMigration::Migrated => {
            let v2_exists = self.get_rel_exists_in_reldir_v2(tag, version, ctx).await?;
            Ok(v2_exists)
        }
    }
}

/// List the relations of a database from its v1 rel directory.
async fn list_rels_v1(
    &self,
    spcnode: Oid,
    dbnode: Oid,
    version: Version<'_>,
    ctx: &RequestContext,
) -> Result, PageReconstructError> {
    let key = rel_dir_to_key(spcnode, dbnode);
    let buf = version.get(self, key, ctx).await?;
    let dir = RelDirectory::des(&buf)?;
    let rels_v1: HashSet =
        HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
            spcnode,
            dbnode,
            relnode: *relnode,
            forknum: *forknum,
        }));
    Ok(rels_v1)
}

/// List the relations of a database by scanning its v2 (sparse keyspace) rel directory range.
///
/// Every scanned key is validated against `spcnode`/`dbnode`; entries decoding to
/// `RelDirExists::Removed` are skipped.
async fn list_rels_v2(
    &self,
    spcnode: Oid,
    dbnode: Oid,
    version: Version<'_>,
    ctx: &RequestContext,
) -> Result, PageReconstructError> {
    let key_range = rel_tag_sparse_key_range(spcnode, dbnode);
    let io_concurrency = IoConcurrency::spawn_from_conf(
        self.conf.get_vectored_concurrent_io,
        self.gate
            .enter()
            .map_err(|_| PageReconstructError::Cancelled)?,
    );
    let results = self
        .scan(
            KeySpace::single(key_range),
            version.get_lsn(),
            ctx,
            io_concurrency,
        )
        .await?;
    let mut rels = HashSet::new();
    for (key, val) in results {
        let val = RelDirExists::decode(&val?).map_err(|_| {
            PageReconstructError::Other(anyhow::anyhow!(
                "invalid reldir key: decode failed, {}",
                key
            ))
        })?;
        if key.field6 != 1 {
            return Err(PageReconstructError::Other(anyhow::anyhow!(
                "invalid reldir key: field6 != 1, {}",
                key
            )));
        }
        if key.field2 != spcnode {
            return Err(PageReconstructError::Other(anyhow::anyhow!(
                "invalid reldir key: field2 != spcnode, {}",
                key
            )));
        }
        if key.field3 != dbnode {
            return Err(PageReconstructError::Other(anyhow::anyhow!(
                "invalid reldir key: field3 != dbnode, {}",
                key
            )));
        }
        let tag = RelTag {
            spcnode,
            dbnode,
            relnode: key.field4,
            forknum: key.field5,
        };
        if val == RelDirExists::Removed {
            debug_assert!(!rels.contains(&tag), "removed reltag in v2");
            continue;
        }
        let did_not_contain = rels.insert(tag);
        debug_assert!(did_not_contain, "duplicate reltag in v2");
    }
    Ok(rels)
}

/// Get a list of all existing relations in given tablespace and database.
///
/// Only shard 0 has a full view of the relations. Other shards only know about relations that
/// the shard stores pages for.
///
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub(crate) async fn list_rels(
    &self,
    spcnode: Oid,
    dbnode: Oid,
    version: Version<'_>,
    ctx: &RequestContext,
) -> Result, PageReconstructError> {
    let (v2_status, migrated_lsn) = self.get_rel_size_v2_status();

    match v2_status {
        RelSizeMigration::Legacy => {
            let rels_v1 = self.list_rels_v1(spcnode, dbnode, version, ctx).await?;
            Ok(rels_v1)
        }
        RelSizeMigration::Migrating | RelSizeMigration::Migrated
            if version.get_lsn() < migrated_lsn.unwrap_or(Lsn(0)) =>
        {
            // For requests below the migrated LSN, we still use the v1 read path.
            let rels_v1 = self.list_rels_v1(spcnode, dbnode, version, ctx).await?;
            Ok(rels_v1)
        }
        RelSizeMigration::Migrating => {
            // While migrating, v1 stays authoritative; v2 is read only to log discrepancies.
            let rels_v1 = self.list_rels_v1(spcnode, dbnode, version, ctx).await?;
            let rels_v2_res = self.list_rels_v2(spcnode, dbnode, version, ctx).await;
            match rels_v2_res {
                Ok(rels_v2) if rels_v1 == rels_v2 => {}
                Ok(rels_v2) => {
                    tracing::warn!(
                        "inconsistent v1/v2 reldir keyspace for db {} {}: v1_rels.len()={}, v2_rels.len()={}",
                        spcnode,
                        dbnode,
                        rels_v1.len(),
                        rels_v2.len()
                    );
                }
                Err(e) => {
                    tracing::warn!("failed to list rels in v2: {e}");
                }
            }
            Ok(rels_v1)
        }
        RelSizeMigration::Migrated => {
            let rels_v2 = self.list_rels_v2(spcnode, dbnode, version, ctx).await?;
            Ok(rels_v2)
        }
    }
}

/// Get the whole SLRU segment
///
/// Asserts that this is shard zero. The segment's blocks are read in batches and the first
/// `BLCKSZ` bytes of each block are concatenated.
pub(crate) async fn get_slru_segment(
    &self,
    kind: SlruKind,
    segno: u32,
    lsn: Lsn,
    ctx: &RequestContext,
) -> Result {
    assert!(self.tenant_shard_id.is_shard_zero());
    let n_blocks = self
        .get_slru_segment_size(kind, segno, Version::at(lsn), ctx)
        .await?;

    let keyspace = KeySpace::single(
        slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, n_blocks),
    );

    let batches = keyspace.partition(
        self.get_shard_identity(),
        self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64,
        BLCKSZ as u64,
    );

    let io_concurrency = IoConcurrency::spawn_from_conf(
        self.conf.get_vectored_concurrent_io,
        self.gate
            .enter()
            .map_err(|_| PageReconstructError::Cancelled)?,
    );

    let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
    for batch in batches.parts {
        let query = VersionedKeySpaceQuery::uniform(batch, lsn);
        let blocks = self
            .get_vectored(query, io_concurrency.clone(), ctx)
            .await?;

        for (_key, block) in blocks {
            let block = block?;
            // Only the first BLCKSZ bytes of each stored block are copied into the segment.
            segment.extend_from_slice(&block[..BLCKSZ as usize]);
        }
    }

    Ok(segment.freeze())
}

/// Get size of an SLRU segment
pub(crate) async fn get_slru_segment_size(
    &self,
    kind: SlruKind,
    segno: u32,
    version: Version<'_>,
    ctx: &RequestContext,
) -> Result {
    assert!(self.tenant_shard_id.is_shard_zero());
    let key = slru_segment_size_to_key(kind, segno);
    let mut buf = version.get(self, key, ctx).await?;
    Ok(buf.get_u32_le())
}

/// Does the slru segment exist?
pub(crate) async fn get_slru_segment_exists(
    &self,
    kind: SlruKind,
    segno: u32,
    version: Version<'_>,
    ctx: &RequestContext,
) -> Result {
    assert!(self.tenant_shard_id.is_shard_zero());
    // fetch directory listing
    let key = slru_dir_to_key(kind);
    let buf = version.get(self, key, ctx).await?;

    let dir = SlruSegmentDirectory::des(&buf)?;
    Ok(dir.segments.contains(&segno))
}

/// Locate LSN, such that all transactions that committed before
/// 'search_timestamp' are visible, but nothing newer is.
///
/// This is not exact. Commit timestamps are not guaranteed to be ordered,
/// so it's not well defined which LSN you get if there were multiple commits
/// "in flight" at that point in time.
///
pub(crate) async fn find_lsn_for_timestamp(
    &self,
    search_timestamp: TimestampTz,
    cancel: &CancellationToken,
    ctx: &RequestContext,
) -> Result {
    pausable_failpoint!("find-lsn-for-timestamp-pausable");

    let gc_cutoff_lsn_guard = self.get_applied_gc_cutoff_lsn();
    let gc_cutoff_planned = {
        let gc_info = self.gc_info.read().unwrap();
        info!(cutoffs=?gc_info.cutoffs, applied_cutoff=%*gc_cutoff_lsn_guard, "starting find_lsn_for_timestamp");
        gc_info.min_cutoff()
    };
    // Usually the planned cutoff is newer than the cutoff of the last gc run,
    // but let's be defensive.
    let gc_cutoff = gc_cutoff_planned.max(*gc_cutoff_lsn_guard);
    // We use this method to figure out the branching LSN for the new branch, but the
    // GC cutoff could be before the branching point and we cannot create a new branch
    // with LSN < `ancestor_lsn`. Thus, pick the maximum of these two to be
    // on the safe side.
    let min_lsn = std::cmp::max(gc_cutoff, self.get_ancestor_lsn());
    let max_lsn = self.get_last_record_lsn();

    // LSNs are always 8-byte aligned. low/mid/high represent the
    // LSN divided by 8.
    let mut low = min_lsn.0 / 8;
    let mut high = max_lsn.0 / 8 + 1;

    let mut found_smaller = false;
    let mut found_larger = false;

    // Binary search over [low, high) for the first probe LSN at which a commit with
    // timestamp >= search_timestamp is visible.
    while low < high {
        if cancel.is_cancelled() {
            return Err(PageReconstructError::Cancelled);
        }
        // cannot overflow, high and low are both smaller than u64::MAX / 2
        let mid = (high + low) / 2;

        let cmp = match self
            .is_latest_commit_timestamp_ge_than(
                search_timestamp,
                Lsn(mid * 8),
                &mut found_smaller,
                &mut found_larger,
                ctx,
            )
            .await
        {
            Ok(res) => res,
            Err(PageReconstructError::MissingKey(e)) => {
                warn!(
                    "Missing key while find_lsn_for_timestamp. Either we might have already garbage-collected that data or the key is really missing. Last error: {:#}",
                    e
                );
                // Return that we didn't find any requests smaller than the LSN, and logging the error.
                return Ok(LsnForTimestamp::Past(min_lsn));
            }
            Err(e) => return Err(e),
        };

        if cmp {
            high = mid;
        } else {
            low = mid + 1;
        }
    }

    // If `found_smaller == true`, `low = t + 1` where `t` is the target LSN,
    // so the LSN of the last commit record before or at `search_timestamp`.
    // Remove one from `low` to get `t`.
    //
    // FIXME: it would be better to get the LSN of the previous commit.
    // Otherwise, if you restore to the returned LSN, the database will
    // include physical changes from later commits that will be marked
    // as aborted, and will need to be vacuumed away.
    // NOTE(review): `low - 1` would underflow if `low` were 0 here -- presumably min_lsn is
    // never below 8 in practice; confirm.
    let commit_lsn = Lsn((low - 1) * 8);
    match (found_smaller, found_larger) {
        (false, false) => {
            // This can happen if no commit records have been processed yet, e.g.
            // just after importing a cluster.
            Ok(LsnForTimestamp::NoData(min_lsn))
        }
        (false, true) => {
            // Didn't find any commit timestamps smaller than the request
            Ok(LsnForTimestamp::Past(min_lsn))
        }
        (true, _) if commit_lsn < min_lsn => {
            // the search above did set found_smaller to true but it never increased the lsn.
            // Then, low is still the old min_lsn, and the subtraction above gave a value
            // below the min_lsn. We should never do that.
            Ok(LsnForTimestamp::Past(min_lsn))
        }
        (true, false) => {
            // Only found commits with timestamps smaller than the request.
            // It's still a valid case for branch creation, return it.
            // And `update_gc_info()` ignores LSN for a `LsnForTimestamp::Future`
            // case, anyway.
            Ok(LsnForTimestamp::Future(commit_lsn))
        }
        (true, true) => Ok(LsnForTimestamp::Present(commit_lsn)),
    }
}

/// Subroutine of find_lsn_for_timestamp(). Returns true, if there are any
/// commits that committed after 'search_timestamp', at LSN 'probe_lsn'.
///
/// Additionally, sets 'found_smaller'/'found_larger', if it encounters any commits
/// with a smaller/larger timestamp.
///
pub(crate) async fn is_latest_commit_timestamp_ge_than(
    &self,
    search_timestamp: TimestampTz,
    probe_lsn: Lsn,
    found_smaller: &mut bool,
    found_larger: &mut bool,
    ctx: &RequestContext,
) -> Result {
    self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
        if timestamp >= search_timestamp {
            *found_larger = true;
            // Stop at the first timestamp that is >= the searched one.
            return ControlFlow::Break(true);
        } else {
            *found_smaller = true;
        }
        ControlFlow::Continue(())
    })
    .await
}

/// Obtain the timestamp for the given lsn.
///
/// If the lsn has no timestamps (e.g. no commits), returns None.
/// Otherwise returns the maximum of the timestamps visited.
pub(crate) async fn get_timestamp_for_lsn(
    &self,
    probe_lsn: Lsn,
    ctx: &RequestContext,
) -> Result, PageReconstructError> {
    let mut max: Option = None;
    self.map_all_timestamps::<()>(probe_lsn, ctx, |timestamp| {
        if let Some(max_prev) = max {
            max = Some(max_prev.max(timestamp));
        } else {
            max = Some(timestamp);
        }
        ControlFlow::Continue(())
    })
    .await?;

    Ok(max)
}

/// Runs the given function on all the timestamps for a given lsn
///
/// The return value is either given by the closure, or set to the `Default`
/// impl's output.
async fn map_all_timestamps(
    &self,
    probe_lsn: Lsn,
    ctx: &RequestContext,
    mut f: impl FnMut(TimestampTz) -> ControlFlow,
) -> Result {
    for segno in self
        .list_slru_segments(SlruKind::Clog, Version::at(probe_lsn), ctx)
        .await?
    {
        let nblocks = self
            .get_slru_segment_size(SlruKind::Clog, segno, Version::at(probe_lsn), ctx)
            .await?;

        let keyspace = KeySpace::single(
            slru_block_to_key(SlruKind::Clog, segno, 0)
                ..slru_block_to_key(SlruKind::Clog, segno, nblocks),
        );

        let batches = keyspace.partition(
            self.get_shard_identity(),
            self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64,
            BLCKSZ as u64,
        );

        let io_concurrency = IoConcurrency::spawn_from_conf(
            self.conf.get_vectored_concurrent_io,
            self.gate
                .enter()
                .map_err(|_| PageReconstructError::Cancelled)?,
        );

        // Batches, and the pages within each batch, are visited in reverse order.
        for batch in batches.parts.into_iter().rev() {
            let query = VersionedKeySpaceQuery::uniform(batch, probe_lsn);
            let blocks = self
                .get_vectored(query, io_concurrency.clone(), ctx)
                .await?;

            for (_key, clog_page) in blocks.into_iter().rev() {
                let clog_page = clog_page?;

                // Only pages that carry exactly 8 extra bytes after the block are decoded;
                // those bytes are read as a big-endian timestamp.
                if clog_page.len() == BLCKSZ as usize + 8 {
                    let mut timestamp_bytes = [0u8; 8];
                    timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
                    let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);

                    match f(timestamp) {
                        ControlFlow::Break(b) => return Ok(b),
                        ControlFlow::Continue(()) => (),
                    }
                }
            }
        }
    }
    Ok(Default::default())
}

/// Build the keyspace covering all blocks of all segments of every SLRU kind at `version`.
pub(crate) async fn get_slru_keyspace(
    &self,
    version: Version<'_>,
    ctx: &RequestContext,
) -> Result {
    let mut accum = KeySpaceAccum::new();

    for kind in SlruKind::iter() {
        let mut segments: Vec = self
            .list_slru_segments(kind, version, ctx)
            .await?
            .into_iter()
            .collect();
        segments.sort_unstable();

        for seg in segments {
            let block_count = self.get_slru_segment_size(kind, seg, version, ctx).await?;

            accum.add_range(
                slru_block_to_key(kind, seg, 0)..slru_block_to_key(kind, seg, block_count),
            );
        }
    }

    Ok(accum.to_keyspace())
}

/// Get a list of SLRU segments
pub(crate) async fn list_slru_segments(
    &self,
    kind: SlruKind,
    version: Version<'_>,
    ctx: &RequestContext,
) -> Result, PageReconstructError> {
    // fetch directory entry
    let key = slru_dir_to_key(kind);

    let buf = version.get(self, key, ctx).await?;
    Ok(SlruSegmentDirectory::des(&buf)?.segments)
}

/// Read the value stored under the relmap file key of the given tablespace and database.
pub(crate) async fn get_relmap_file(
    &self,
    spcnode: Oid,
    dbnode: Oid,
    version: Version<'_>,
    ctx: &RequestContext,
) -> Result {
    let key = relmap_file_key(spcnode, dbnode);

    let buf = version.get(self, key, ctx).await?;
    Ok(buf)
}

/// Read and deserialize the database directory at `lsn`, returning its `dbdirs` map.
pub(crate) async fn list_dbdirs(
    &self,
    lsn: Lsn,
    ctx: &RequestContext,
) -> Result, PageReconstructError> {
    // fetch directory entry
    let buf = self.get(DBDIR_KEY, lsn, ctx).await?;

    Ok(DbDirectory::des(&buf)?.dbdirs)
}

/// Read the value stored under the two-phase file key of `xid` at `lsn`.
pub(crate) async fn get_twophase_file(
    &self,
    xid: u64,
    lsn: Lsn,
    ctx: &RequestContext,
) -> Result {
    let key = twophase_file_key(xid);
    let buf = self.get(key, lsn, ctx).await?;
    Ok(buf)
}

/// List the xids recorded in the two-phase directory at `lsn`.
///
/// The directory is decoded as `TwoPhaseDirectoryV17` for PG17 and later; for older versions
/// it is decoded as `TwoPhaseDirectory` and each xid is widened to `u64`.
pub(crate) async fn list_twophase_files(
    &self,
    lsn: Lsn,
    ctx: &RequestContext,
) -> Result, PageReconstructError> {
    // fetch directory entry
    let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;

    if self.pg_version >= PgMajorVersion::PG17 {
        Ok(TwoPhaseDirectoryV17::des(&buf)?.xids)
    } else {
        Ok(TwoPhaseDirectory::des(&buf)?
            .xids
            .iter()
            .map(|x| u64::from(*x))
            .collect())
    }
}

/// Read the value stored under `CONTROLFILE_KEY` at `lsn`.
pub(crate) async fn get_control_file(
    &self,
    lsn: Lsn,
    ctx: &RequestContext,
) -> Result {
    self.get(CONTROLFILE_KEY, lsn, ctx).await
}

/// Read the value stored under `CHECKPOINT_KEY` at `lsn`.
pub(crate) async fn get_checkpoint(
    &self,
    lsn: Lsn,
    ctx: &RequestContext,
) -> Result {
    self.get(CHECKPOINT_KEY, lsn, ctx).await
}

/// Scan the aux-file metadata key range at `lsn` and return the decoded files by name.
///
/// As a side effect, reports the summed name and content lengths to
/// `aux_file_size_estimator.on_initial`.
async fn list_aux_files_v2(
    &self,
    lsn: Lsn,
    ctx: &RequestContext,
    io_concurrency: IoConcurrency,
) -> Result, PageReconstructError> {
    let kv = self
        .scan(
            KeySpace::single(Key::metadata_aux_key_range()),
            lsn,
            ctx,
            io_concurrency,
        )
        .await?;
    let mut result = HashMap::new();
    let mut sz = 0;
    for (_, v) in kv {
        let v = v?;
        let v = aux_file::decode_file_value_bytes(&v)
            .context("value decode")
            .map_err(PageReconstructError::Other)?;
        for (fname, content) in v {
            sz += fname.len();
            sz += content.len();
            result.insert(fname, content);
        }
    }
    self.aux_file_size_estimator.on_initial(sz);
    Ok(result)
}

/// Run [`Self::list_aux_files_v2`] only for its size-estimator side effect, discarding the listing.
pub(crate) async fn trigger_aux_file_size_computation(
    &self,
    lsn: Lsn,
    ctx: &RequestContext,
    io_concurrency: IoConcurrency,
) -> Result<(), PageReconstructError> {
    self.list_aux_files_v2(lsn, ctx, io_concurrency).await?;
    Ok(())
}

/// List aux files at `lsn`; delegates to [`Self::list_aux_files_v2`].
pub(crate) async fn list_aux_files(
    &self,
    lsn: Lsn,
    ctx: &RequestContext,
    io_concurrency: IoConcurrency,
) -> Result, PageReconstructError> {
    self.list_aux_files_v2(lsn, ctx, io_concurrency).await
}

/// Scan the replication origin key range at `lsn` and return origin id -> LSN.
///
/// Entries with an empty value or with `Lsn::INVALID` are treated as tombstones and omitted.
pub(crate) async fn get_replorigins(
    &self,
    lsn: Lsn,
    ctx: &RequestContext,
    io_concurrency: IoConcurrency,
) -> Result, PageReconstructError> {
    let kv = self
        .scan(
            KeySpace::single(repl_origin_key_range()),
            lsn,
            ctx,
            io_concurrency,
        )
        .await?;
    let mut result = HashMap::new();
    for (k, v) in kv {
        let v = v?;
        if v.is_empty() {
            // This is a tombstone -- we can skip it.
            // Originally, the replorigin code uses `Lsn::INVALID` to represent a tombstone. However, as it part of
            // the sparse keyspace and the sparse keyspace uses an empty image to universally represent a tombstone,
            // we also need to consider that. Such tombstones might be written on the detach ancestor code path to
            // avoid the value going into the child branch. (See [`crate::tenant::timeline::detach_ancestor::generate_tombstone_image_layer`] for more details.)
            continue;
        }
        let origin_id = k.field6 as RepOriginId;
        let origin_lsn = Lsn::des(&v)
            .with_context(|| format!("decode replorigin value for {origin_id}: {v:?}"))?;
        if origin_lsn != Lsn::INVALID {
            result.insert(origin_id, origin_lsn);
        }
    }
    Ok(result)
}

/// Does the same as get_current_logical_size but counted on demand.
/// Used to initialize the logical size tracking on startup.
///
/// Only relation blocks are counted currently. That excludes metadata,
/// SLRUs, twophase files etc.
///
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub(crate) async fn get_current_logical_size_non_incremental(
    &self,
    lsn: Lsn,
    ctx: &RequestContext,
) -> Result {
    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();

    fail::fail_point!("skip-logical-size-calculation", |_| { Ok(0) });

    // Fetch list of database dirs and iterate them
    let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
    let dbdir = DbDirectory::des(&buf)?;

    let mut total_size: u64 = 0;
    let mut dbdir_cnt = 0;
    let mut rel_cnt = 0;

    for &(spcnode, dbnode) in dbdir.dbdirs.keys() {
        dbdir_cnt += 1;
        for rel in self
            .list_rels(spcnode, dbnode, Version::at(lsn), ctx)
            .await?
        {
            rel_cnt += 1;
            // Cancellation is checked once per relation.
            if self.cancel.is_cancelled() {
                return Err(CalculateLogicalSizeError::Cancelled);
            }
            let relsize_key = rel_size_to_key(rel);
            let mut buf = self.get(relsize_key, lsn, ctx).await?;
            let relsize = buf.get_u32_le();

            total_size += relsize as u64;
        }
    }

    // Publish the (database count, relation count) observed during this walk.
    self.db_rel_count
        .store(Some(Arc::new((dbdir_cnt, rel_cnt))));

    // `total_size` is in blocks; convert to bytes.
    Ok(total_size * BLCKSZ as u64)
}

/// Get a KeySpace that covers all the Keys that are in use at AND below the given LSN. This is only used
/// for gc-compaction.
/// /// gc-compaction cannot use the same `collect_keyspace` function as the legacy compaction because it /// processes data at multiple LSNs and needs to be aware of the fact that some key ranges might need to /// be kept only for a specific range of LSN. /// /// Consider the case that the user created branches at LSN 10 and 20, where the user created a table A at /// LSN 10 and dropped that table at LSN 20. `collect_keyspace` at LSN 10 will return the key range /// corresponding to that table, while LSN 20 won't. The keyspace info at a single LSN is not enough to /// determine which keys to retain/drop for gc-compaction. /// /// For now, it only drops AUX-v1 keys. But in the future, the function will be extended to return the keyspace /// to be retained for each of the branch LSN. /// /// The return value is (dense keyspace, sparse keyspace). pub(crate) async fn collect_gc_compaction_keyspace( &self, ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> { let metadata_key_begin = Key::metadata_key_range().start; let aux_v1_key = AUX_FILES_KEY; let dense_keyspace = KeySpace { ranges: vec![Key::MIN..aux_v1_key, aux_v1_key.next()..metadata_key_begin], }; Ok(( dense_keyspace, SparseKeySpace(KeySpace::single(Key::metadata_key_range())), )) } /// /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). /// /// The return value is (dense keyspace, sparse keyspace). 
///
/// The dense keyspace is accumulated from the directory metadata read at `lsn`
/// (database dirs, relations, SLRU segments on shard zero, two-phase files,
/// control file and checkpoint). The sparse keyspace is a fixed list of ranges.
pub(crate) async fn collect_keyspace(
    &self,
    lsn: Lsn,
    ctx: &RequestContext,
) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
    // Iterate through key ranges, greedily packing them into partitions
    let mut result = KeySpaceAccum::new();

    // The dbdir metadata always exists
    result.add_key(DBDIR_KEY);

    // Fetch list of database dirs and iterate them
    let dbdir = self.list_dbdirs(lsn, ctx).await?;
    let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect();

    // Visit databases in (spcnode, dbnode) order.
    // NOTE(review): presumably KeySpaceAccum expects keys in ascending order — confirm.
    dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b));
    for ((spcnode, dbnode), has_relmap_file) in dbs {
        if has_relmap_file {
            result.add_key(relmap_file_key(spcnode, dbnode));
        }
        result.add_key(rel_dir_to_key(spcnode, dbnode));

        let mut rels: Vec = self
            .list_rels(spcnode, dbnode, Version::at(lsn), ctx)
            .await?
            .into_iter()
            .collect();
        rels.sort_unstable();
        for rel in rels {
            // The relation size (in blocks) is stored as a little-endian u32.
            let relsize_key = rel_size_to_key(rel);
            let mut buf = self.get(relsize_key, lsn, ctx).await?;
            let relsize = buf.get_u32_le();

            // All blocks [0, relsize) of the relation, followed by its size key.
            result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize));
            result.add_key(relsize_key);
        }
    }

    // Iterate SLRUs next
    // (only on shard zero; other shards add no SLRU keys here)
    if self.tenant_shard_id.is_shard_zero() {
        for kind in [
            SlruKind::Clog,
            SlruKind::MultiXactMembers,
            SlruKind::MultiXactOffsets,
        ] {
            let slrudir_key = slru_dir_to_key(kind);
            result.add_key(slrudir_key);
            let buf = self.get(slrudir_key, lsn, ctx).await?;
            let dir = SlruSegmentDirectory::des(&buf)?;
            let mut segments: Vec = dir.segments.iter().cloned().collect();
            segments.sort_unstable();
            for segno in segments {
                // The segment size (in blocks) is stored as a little-endian u32.
                let segsize_key = slru_segment_size_to_key(kind, segno);
                let mut buf = self.get(segsize_key, lsn, ctx).await?;
                let segsize = buf.get_u32_le();

                result.add_range(
                    slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize),
                );
                result.add_key(segsize_key);
            }
        }
    }

    // Then pg_twophase
    result.add_key(TWOPHASEDIR_KEY);

    let mut xids: Vec = self
        .list_twophase_files(lsn, ctx)
        .await?
        .iter()
        .cloned()
        .collect();
    xids.sort_unstable();
    for xid in xids {
        result.add_key(twophase_file_key(xid));
    }

    result.add_key(CONTROLFILE_KEY);
    result.add_key(CHECKPOINT_KEY);

    // Add extra keyspaces in the test cases. Some test cases write keys into the storage without
    // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace`
    // and the keys will not be garbage-collected.
    #[cfg(test)]
    {
        let guard = self.extra_test_dense_keyspace.load();
        for kr in &guard.ranges {
            result.add_range(kr.clone());
        }
    }

    let dense_keyspace = result.to_keyspace();
    let sparse_keyspace = SparseKeySpace(KeySpace {
        ranges: vec![
            Key::metadata_aux_key_range(),
            repl_origin_key_range(),
            Key::rel_dir_sparse_key_range(),
        ],
    });

    if cfg!(debug_assertions) {
        // Verify if the sparse keyspaces are ordered and non-overlapping.

        // We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each
        // category of sparse keys are split into their own image/delta files. If there
        // are overlapping keyspaces, they will be automatically merged by keyspace accum,
        // and we want the developer to keep the keyspaces separated.

        let ranges = &sparse_keyspace.0.ranges;

        // TODO: use a single overlaps_with across the codebase
        // Two half-open ranges overlap unless one ends at or before the start of the other.
        fn overlaps_with(a: &Range, b: &Range) -> bool {
            !(a.end <= b.start || b.end <= a.start)
        }
        // Pairwise overlap check over every (i, j) with j < i.
        for i in 0..ranges.len() {
            for j in 0..i {
                if overlaps_with(&ranges[i], &ranges[j]) {
                    panic!(
                        "overlapping sparse keyspace: {}..{} and {}..{}",
                        ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end
                    );
                }
            }
        }
        // Adjacent ranges must be listed in ascending order.
        for i in 1..ranges.len() {
            assert!(
                ranges[i - 1].end <= ranges[i].start,
                "unordered sparse keyspace: {}..{} and {}..{}",
                ranges[i - 1].start,
                ranges[i - 1].end,
                ranges[i].start,
                ranges[i].end
            );
        }
    }

    Ok((dense_keyspace, sparse_keyspace))
}

/// Get cached size of relation. There are two caches: one for primary updates, which captures the latest state
/// of the timeline, and a snapshot cache, whose key includes the LSN and so can be used by replicas to get the
/// relation size at a particular LSN (snapshot).
///
/// The latest cache is consulted first, then the snapshot cache. Returns `None` if neither has a usable entry.
pub fn get_cached_rel_size(&self, tag: &RelTag, version: Version<'_>) -> Option {
    let lsn = version.get_lsn();
    {
        let rel_size_cache = self.rel_size_latest_cache.read().unwrap();
        if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
            // The entry is only usable for requests at or after the LSN it was recorded at.
            if lsn >= *cached_lsn {
                RELSIZE_LATEST_CACHE_HITS.inc();
                return Some(*nblocks);
            }
            // An entry exists but it is newer than the requested LSN.
            RELSIZE_CACHE_MISSES_OLD.inc();
        }
    }
    {
        // The snapshot cache is keyed by the exact (lsn, tag) pair.
        let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
        if let Some(nblock) = rel_size_cache.get(&(lsn, *tag)) {
            RELSIZE_SNAPSHOT_CACHE_HITS.inc();
            return Some(*nblock);
        }
    }
    // Attribute the miss to whichever cache this kind of request would populate.
    if version.is_latest() {
        RELSIZE_LATEST_CACHE_MISSES.inc();
    } else {
        RELSIZE_SNAPSHOT_CACHE_MISSES.inc();
    }
    None
}

/// Update cached relation size if there is no more recent update
///
/// Latest-version requests update the latest cache; all other requests insert into the
/// snapshot cache under the key `(lsn, tag)`.
pub fn update_cached_rel_size(&self, tag: RelTag, version: Version<'_>, nblocks: BlockNumber) {
    let lsn = version.get_lsn();
    if version.is_latest() {
        let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
        match rel_size_cache.entry(tag) {
            hash_map::Entry::Occupied(mut entry) => {
                let cached_lsn = entry.get_mut();
                // Never replace an entry recorded at a later LSN with an older one.
                if lsn >= cached_lsn.0 {
                    *cached_lsn = (lsn, nblocks);
                }
            }
            hash_map::Entry::Vacant(entry) => {
                entry.insert((lsn, nblocks));
                RELSIZE_LATEST_CACHE_ENTRIES.inc();
            }
        }
    } else {
        let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
        // NOTE(review): a zero-capacity snapshot cache presumably means it is disabled — confirm.
        if rel_size_cache.capacity() != 0 {
            rel_size_cache.insert((lsn, tag), nblocks);
            RELSIZE_SNAPSHOT_CACHE_ENTRIES.set(rel_size_cache.len() as u64);
        }
    }
}

/// Store cached relation size
///
/// Unconditionally overwrites the latest-cache entry for `tag`; the entries gauge is only
/// bumped when no entry existed before.
pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
    let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
    if rel_size_cache.insert(tag, (lsn, nblocks)).is_none() {
        RELSIZE_LATEST_CACHE_ENTRIES.inc();
    }
}

/// Remove cached relation size
///
/// Only the latest cache is touched; the entries gauge is decremented if an entry was removed.
pub fn remove_cached_rel_size(&self, tag: &RelTag) {
    let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
    if rel_size_cache.remove(tag).is_some() {
        RELSIZE_LATEST_CACHE_ENTRIES.dec();
    }
}
}

/// DatadirModification represents an operation to ingest an atomic set of
/// updates to the repository.
///
/// It is created by the 'begin_record' function. It is called for each WAL
/// record, so that all the modifications by one WAL record appear atomic.
pub struct DatadirModification<'a> {
    /// The timeline this modification applies to. You can access this to
    /// read the state, but note that any pending updates are *not* reflected
    /// in the state in 'tline' yet.
    pub tline: &'a Timeline,

    /// Current LSN of the modification
    lsn: Lsn,

    // The modifications are not applied directly to the underlying key-value store.
    // The put-functions add the modifications here, and they are flushed to the
    // underlying key-value store by the 'finish' function.
    pending_lsns: Vec,
    pending_deletions: Vec<(Range, Lsn)>,
    // Net change in block count accumulated by this modification (may be negative).
    pending_nblocks: i64,

    /// Metadata writes, indexed by key so that they can be read from not-yet-committed modifications
    /// while ingesting subsequent records. See [`Self::is_data_key`] for the definition of 'metadata'.
    pending_metadata_pages: HashMap>,

    /// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for
    /// which keys are stored here.
    pending_data_batch: Option,

    /// For special "directory" keys that store key-value maps, track the size of the map
    /// if it was updated in this modification.
    pending_directory_entries: Vec<(DirectoryKind, MetricsUpdate)>,

    /// An **approximation** of how many metadata bytes will be written to the EphemeralFile.
    pending_metadata_bytes: usize,

    /// Whether we are importing a pgdata directory.
    is_importing_pgdata: bool,
}

/// How a directory-size metric should be changed by a pending directory update.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MetricsUpdate {
    /// Set the metrics to this value
    Set(u64),
    /// Increment the metrics by this value
    Add(u64),
    /// Decrement the metrics by this value
    Sub(u64),
}

/// Controls the behavior of the reldir keyspace.
pub struct RelDirMode {
    // Whether we can read the v2 keyspace or not.
    current_status: RelSizeMigration,
    // Whether we should initialize the v2 keyspace or not.
    initialize: bool,
}

impl DatadirModification<'_> {
    // When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can
    // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
    // additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
    pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;

    /// Get the current lsn
    pub(crate) fn get_lsn(&self) -> Lsn {
        self.lsn
    }

    /// Approximate number of pending bytes: the data batch's buffer size (zero if there is
    /// no batch) plus the approximate pending metadata bytes.
    pub(crate) fn approx_pending_bytes(&self) -> usize {
        self.pending_data_batch
            .as_ref()
            .map_or(0, |b| b.buffer_size())
            + self.pending_metadata_bytes
    }

    /// Whether a pending data batch exists and reports that it has data.
    /// Pending metadata writes are not considered here.
    pub(crate) fn has_dirty_data(&self) -> bool {
        self.pending_data_batch
            .as_ref()
            .is_some_and(|b| b.has_data())
    }

    /// Returns statistics about the currently pending modifications.
///
/// Metadata values are counted from `pending_metadata_pages` and data values from
/// `pending_data_batch`. A WAL record that reports `will_init` is counted as an image.
pub(crate) fn stats(&self) -> DatadirModificationStats {
    let mut counters = DatadirModificationStats::default();

    // Metadata side: every pending value is either an image or a delta.
    for (_, _, value) in self.pending_metadata_pages.values().flatten() {
        let counts_as_image = match value {
            Value::Image(_) => true,
            Value::WalRecord(record) => record.will_init(),
        };
        if counts_as_image {
            counters.metadata_images += 1;
        } else {
            counters.metadata_deltas += 1;
        }
    }

    // Data side: only serialized entries are counted, observed ones are ignored.
    if let Some(batch) = self.pending_data_batch.as_ref() {
        for meta in &batch.metadata {
            match meta {
                ValueMeta::Serialized(serialized) => {
                    if serialized.will_init {
                        counters.data_images += 1;
                    } else {
                        counters.data_deltas += 1;
                    }
                }
                ValueMeta::Observed(_) => {}
            }
        }
    }

    counters
}

/// Advance the modification to `lsn`.
///
/// Moving backwards is rejected with an error. When the LSN actually advances, the LSN
/// that was current until now is appended to `pending_lsns`. Setting the same LSN again
/// is a no-op.
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> {
    ensure_walingest!(
        lsn >= self.lsn,
        "setting an older lsn {} than {} is not allowed",
        lsn,
        self.lsn
    );

    let advances = lsn > self.lsn;
    if advances {
        // Swap in the new LSN and remember the one it replaced.
        let previous = std::mem::replace(&mut self.lsn, lsn);
        self.pending_lsns.push(previous);
    }
    Ok(())
}

/// In this context, 'metadata' means keys that are only read by the pageserver internally, and 'data' means
/// keys that represent literal blocks that postgres can read. So data includes relation blocks and
/// SLRU blocks, which are read directly by postgres, and everything else is considered metadata.
///
/// The distinction is important because data keys are handled on a fast path where dirty writes are
/// not readable until this modification is committed, whereas metadata keys are visible for read
/// via [`Self::get`] as soon as their record has been ingested.
fn is_data_key(key: &Key) -> bool {
    if key.is_rel_block_key() {
        return true;
    }
    key.is_slru_block_key()
}

/// Initialize a completely new repository.
///
/// This inserts the directory metadata entries that are assumed to
/// always exist.
pub fn init_empty(&mut self) -> anyhow::Result<()> { let buf = DbDirectory::ser(&DbDirectory { dbdirs: HashMap::new(), })?; self.pending_directory_entries .push((DirectoryKind::Db, MetricsUpdate::Set(0))); self.put(DBDIR_KEY, Value::Image(buf.into())); let buf = if self.tline.pg_version >= PgMajorVersion::PG17 { TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 { xids: HashSet::new(), }) } else { TwoPhaseDirectory::ser(&TwoPhaseDirectory { xids: HashSet::new(), }) }?; self.pending_directory_entries .push((DirectoryKind::TwoPhase, MetricsUpdate::Set(0))); self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into(); let empty_dir = Value::Image(buf); // Initialize SLRUs on shard 0 only: creating these on other shards would be // harmless but they'd just be dropped on later compaction. if self.tline.tenant_shard_id.is_shard_zero() { self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); self.pending_directory_entries.push(( DirectoryKind::SlruSegment(SlruKind::Clog), MetricsUpdate::Set(0), )); self.put( slru_dir_to_key(SlruKind::MultiXactMembers), empty_dir.clone(), ); self.pending_directory_entries.push(( DirectoryKind::SlruSegment(SlruKind::Clog), MetricsUpdate::Set(0), )); self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); self.pending_directory_entries.push(( DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), MetricsUpdate::Set(0), )); } Ok(()) } #[cfg(test)] pub fn init_empty_test_timeline(&mut self) -> anyhow::Result<()> { self.init_empty()?; self.put_control_file(bytes::Bytes::from_static( b"control_file contents do not matter", )) .context("put_control_file")?; self.put_checkpoint(bytes::Bytes::from_static( b"checkpoint_file contents do not matter", )) .context("put_checkpoint_file")?; Ok(()) } /// Creates a relation if it is not already present. 
/// Returns the current size of the relation
///
/// A relation that does not exist yet is created with size 0, and 0 is returned.
pub(crate) async fn create_relation_if_required(
    &mut self,
    rel: RelTag,
    ctx: &RequestContext,
) -> Result {
    // Fast path: consult the relation size cache up front.
    //
    // NOTE: `get_rel_exists` and `get_rel_size` would look at the cache as well, but
    // checking it eagerly here means less work overall and was measured to be 10%
    // faster. A cache miss costs a little more this way, but misses are rare.
    let cached = self
        .tline
        .get_cached_rel_size(&rel, Version::Modified(self));
    if let Some(nblocks) = cached {
        return Ok(nblocks);
    }

    let exists = self
        .tline
        .get_rel_exists(rel, Version::Modified(self), ctx)
        .await?;
    if exists {
        let nblocks = self
            .tline
            .get_rel_size(rel, Version::Modified(self), ctx)
            .await?;
        return Ok(nblocks);
    }

    // Not present yet: create it empty. Extending it is left to the caller.
    self.put_rel_creation(rel, 0, ctx).await?;
    Ok(0)
}

/// Given a block number for a relation (which represents a newly written block),
/// the previous block count of the relation, and the shard info, find the gaps
/// that were created by the newly written block if any.
fn find_gaps( rel: RelTag, blkno: u32, previous_nblocks: u32, shard: &ShardIdentity, ) -> Option { let mut key = rel_block_to_key(rel, blkno); let mut gap_accum = None; for gap_blkno in previous_nblocks..blkno { key.field6 = gap_blkno; if shard.get_shard_number(&key) != shard.number { continue; } gap_accum .get_or_insert_with(KeySpaceAccum::new) .add_key(key); } gap_accum.map(|accum| accum.to_keyspace()) } pub async fn ingest_batch( &mut self, mut batch: SerializedValueBatch, // TODO(vlad): remove this argument and replace the shard check with is_key_local shard: &ShardIdentity, ctx: &RequestContext, ) -> Result<(), WalIngestError> { let mut gaps_at_lsns = Vec::default(); for meta in batch.metadata.iter() { let key = Key::from_compact(meta.key()); let (rel, blkno) = key .to_rel_block() .map_err(|_| WalIngestErrorKind::InvalidKey(key, meta.lsn()))?; let new_nblocks = blkno + 1; let old_nblocks = self.create_relation_if_required(rel, ctx).await?; if new_nblocks > old_nblocks { self.put_rel_extend(rel, new_nblocks, ctx).await?; } if let Some(gaps) = Self::find_gaps(rel, blkno, old_nblocks, shard) { gaps_at_lsns.push((gaps, meta.lsn())); } } if !gaps_at_lsns.is_empty() { batch.zero_gaps(gaps_at_lsns); } match self.pending_data_batch.as_mut() { Some(pending_batch) => { pending_batch.extend(batch); } None if batch.has_data() => { self.pending_data_batch = Some(batch); } None => { // Nothing to initialize the batch with } } Ok(()) } /// Put a new page version that can be constructed from a WAL record /// /// NOTE: this will *not* implicitly extend the relation, if the page is beyond the /// current end-of-file. It's up to the caller to check that the relation size /// matches the blocks inserted! 
pub fn put_rel_wal_record( &mut self, rel: RelTag, blknum: BlockNumber, rec: NeonWalRecord, ) -> Result<(), WalIngestError> { ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec)); Ok(()) } // Same, but for an SLRU. pub fn put_slru_wal_record( &mut self, kind: SlruKind, segno: u32, blknum: BlockNumber, rec: NeonWalRecord, ) -> Result<(), WalIngestError> { if !self.tline.tenant_shard_id.is_shard_zero() { return Ok(()); } self.put( slru_block_to_key(kind, segno, blknum), Value::WalRecord(rec), ); Ok(()) } /// Like put_wal_record, but with ready-made image of the page. pub fn put_rel_page_image( &mut self, rel: RelTag, blknum: BlockNumber, img: Bytes, ) -> Result<(), WalIngestError> { ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); let key = rel_block_to_key(rel, blknum); if !key.is_valid_key_on_write_path() { Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } self.put(rel_block_to_key(rel, blknum), Value::Image(img)); Ok(()) } pub fn put_slru_page_image( &mut self, kind: SlruKind, segno: u32, blknum: BlockNumber, img: Bytes, ) -> Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } self.put(key, Value::Image(img)); Ok(()) } pub(crate) fn put_rel_page_image_zero( &mut self, rel: RelTag, blknum: BlockNumber, ) -> Result<(), WalIngestError> { ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); let key = rel_block_to_key(rel, blknum); if !key.is_valid_key_on_write_path() { Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } let batch = self .pending_data_batch .get_or_insert_with(SerializedValueBatch::default); batch.put(key.to_compact(), Value::Image(ZERO_PAGE.clone()), self.lsn); Ok(()) } pub(crate) fn put_slru_page_image_zero( &mut self, kind: SlruKind, segno: u32, blknum: 
BlockNumber, ) -> Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } let batch = self .pending_data_batch .get_or_insert_with(SerializedValueBatch::default); batch.put(key.to_compact(), Value::Image(ZERO_PAGE.clone()), self.lsn); Ok(()) } /// Returns `true` if the rel_size_v2 write path is enabled. If it is the first time that /// we enable it, we also need to persist it in `index_part.json` (initialize is true). /// /// As this function is only used on the write path, we do not need to read the migrated_at /// field. pub fn maybe_enable_rel_size_v2(&mut self, is_create: bool) -> anyhow::Result { // TODO: define the behavior of the tenant-level config flag and use feature flag to enable this feature let (status, _) = self.tline.get_rel_size_v2_status(); let config = self.tline.get_rel_size_v2_enabled(); match (config, status) { (false, RelSizeMigration::Legacy) => { // tenant config didn't enable it and we didn't write any reldir_v2 key yet Ok(RelDirMode { current_status: RelSizeMigration::Legacy, initialize: false, }) } (false, status @ RelSizeMigration::Migrating | status @ RelSizeMigration::Migrated) => { // index_part already persisted that the timeline has enabled rel_size_v2 Ok(RelDirMode { current_status: status, initialize: false, }) } (true, RelSizeMigration::Legacy) => { // The first time we enable it, we need to persist it in `index_part.json` // The caller should update the reldir status once the initialization is done. // // Only initialize the v2 keyspace on new relation creation. No initialization // during `timeline_create` (TODO: fix this, we should allow, but currently it // hits consistency issues). 
Ok(RelDirMode { current_status: RelSizeMigration::Legacy, initialize: is_create && !self.is_importing_pgdata, }) } (true, status @ RelSizeMigration::Migrating | status @ RelSizeMigration::Migrated) => { // index_part already persisted that the timeline has enabled rel_size_v2 // and we don't need to do anything Ok(RelDirMode { current_status: status, initialize: false, }) } } } /// Store a relmapper file (pg_filenode.map) in the repository pub async fn put_relmap_file( &mut self, spcnode: Oid, dbnode: Oid, img: Bytes, ctx: &RequestContext, ) -> Result<(), WalIngestError> { let v2_mode = self .maybe_enable_rel_size_v2(false) .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; // Add it to the directory (if it doesn't exist already) let buf = self.get(DBDIR_KEY, ctx).await?; let mut dbdir = DbDirectory::des(&buf)?; let r = dbdir.dbdirs.insert((spcnode, dbnode), true); if r.is_none() || r == Some(false) { // The dbdir entry didn't exist, or it contained a // 'false'. The 'insert' call already updated it with // 'true', now write the updated 'dbdirs' map back. let buf = DbDirectory::ser(&dbdir)?; self.put(DBDIR_KEY, Value::Image(buf.into())); } if r.is_none() { if v2_mode.current_status != RelSizeMigration::Legacy { self.pending_directory_entries .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); } // Create RelDirectory in v1 keyspace. TODO: if we have fully migrated to v2, no need to create this directory. // Some code path relies on this directory to be present. We should remove it once we starts to set tenants to // `RelSizeMigration::Migrated` state (currently we don't, all tenants will have `RelSizeMigration::Migrating`). 
let buf = RelDirectory::ser(&RelDirectory { rels: HashSet::new(), })?; self.pending_directory_entries .push((DirectoryKind::Rel, MetricsUpdate::Set(0))); self.put( rel_dir_to_key(spcnode, dbnode), Value::Image(Bytes::from(buf)), ); } self.put(relmap_file_key(spcnode, dbnode), Value::Image(img)); Ok(()) } pub async fn put_twophase_file( &mut self, xid: u64, img: Bytes, ctx: &RequestContext, ) -> Result<(), WalIngestError> { // Add it to the directory entry let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?; let newdirbuf = if self.tline.pg_version >= PgMajorVersion::PG17 { let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?; if !dir.xids.insert(xid) { Err(WalIngestErrorKind::FileAlreadyExists(xid))?; } self.pending_directory_entries.push(( DirectoryKind::TwoPhase, MetricsUpdate::Set(dir.xids.len() as u64), )); Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) } else { let xid = xid as u32; let mut dir = TwoPhaseDirectory::des(&dirbuf)?; if !dir.xids.insert(xid) { Err(WalIngestErrorKind::FileAlreadyExists(xid.into()))?; } self.pending_directory_entries.push(( DirectoryKind::TwoPhase, MetricsUpdate::Set(dir.xids.len() as u64), )); Bytes::from(TwoPhaseDirectory::ser(&dir)?) 
}; self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); self.put(twophase_file_key(xid), Value::Image(img)); Ok(()) } pub async fn set_replorigin( &mut self, origin_id: RepOriginId, origin_lsn: Lsn, ) -> Result<(), WalIngestError> { let key = repl_origin_key(origin_id); self.put(key, Value::Image(origin_lsn.ser().unwrap().into())); Ok(()) } pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> Result<(), WalIngestError> { self.set_replorigin(origin_id, Lsn::INVALID).await } pub fn put_control_file(&mut self, img: Bytes) -> Result<(), WalIngestError> { self.put(CONTROLFILE_KEY, Value::Image(img)); Ok(()) } pub fn put_checkpoint(&mut self, img: Bytes) -> Result<(), WalIngestError> { self.put(CHECKPOINT_KEY, Value::Image(img)); Ok(()) } pub async fn drop_dbdir( &mut self, spcnode: Oid, dbnode: Oid, ctx: &RequestContext, ) -> Result<(), WalIngestError> { let total_blocks = self .tline .get_db_size(spcnode, dbnode, Version::Modified(self), ctx) .await?; // Remove entry from dbdir let buf = self.get(DBDIR_KEY, ctx).await?; let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; self.pending_directory_entries.push(( DirectoryKind::Db, MetricsUpdate::Set(dir.dbdirs.len() as u64), )); self.put(DBDIR_KEY, Value::Image(buf.into())); } else { warn!( "dropped dbdir for spcnode {} dbnode {} did not exist in db directory", spcnode, dbnode ); } // Update logical database size. self.pending_nblocks -= total_blocks as i64; // Delete all relations and metadata files for the spcnode/dnode self.delete(dbdir_key_range(spcnode, dbnode)); Ok(()) } async fn initialize_rel_size_v2_keyspace( &mut self, ctx: &RequestContext, dbdir: &DbDirectory, ) -> Result<(), WalIngestError> { // Copy everything from relv1 to relv2; TODO: check if there's any key in the v2 keyspace, if so, abort. 
tracing::info!("initializing rel_size_v2 keyspace"); let mut rel_cnt = 0; // relmap_exists (the value of dbdirs hashmap) does not affect the migration: we need to copy things over anyways for &(spcnode, dbnode) in dbdir.dbdirs.keys() { let rel_dir_key = rel_dir_to_key(spcnode, dbnode); let rel_dir = RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?; for (relnode, forknum) in rel_dir.rels { let sparse_rel_dir_key = rel_tag_sparse_key(spcnode, dbnode, relnode, forknum); self.put( sparse_rel_dir_key, Value::Image(RelDirExists::Exists.encode()), ); tracing::info!( "migrated rel_size_v2: {}", RelTag { spcnode, dbnode, relnode, forknum } ); rel_cnt += 1; } } tracing::info!( "initialized rel_size_v2 keyspace at lsn {}: migrated {} relations", self.lsn, rel_cnt ); self.tline .update_rel_size_v2_status(RelSizeMigration::Migrating, Some(self.lsn)) .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; Ok::<_, WalIngestError>(()) } async fn put_rel_creation_v1( &mut self, rel: RelTag, dbdir_exists: bool, ctx: &RequestContext, ) -> Result<(), WalIngestError> { // Reldir v1 write path let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); let mut rel_dir = if !dbdir_exists { // Create the RelDirectory RelDirectory::default() } else { // reldir already exists, fetch it RelDirectory::des(&self.get(rel_dir_key, ctx).await?)? 
}; // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { Err(WalIngestErrorKind::RelationAlreadyExists(rel))?; } if !dbdir_exists { self.pending_directory_entries .push((DirectoryKind::Rel, MetricsUpdate::Set(0))) } self.pending_directory_entries .push((DirectoryKind::Rel, MetricsUpdate::Add(1))); self.put( rel_dir_key, Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)), ); Ok(()) } async fn put_rel_creation_v2( &mut self, rel: RelTag, dbdir_exists: bool, ctx: &RequestContext, ) -> Result<(), WalIngestError> { // Reldir v2 write path let sparse_rel_dir_key = rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum); // check if the rel_dir_key exists in v2 let val = self.sparse_get(sparse_rel_dir_key, ctx).await?; let val = RelDirExists::decode_option(val) .map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?; if val == RelDirExists::Exists { Err(WalIngestErrorKind::RelationAlreadyExists(rel))?; } self.put( sparse_rel_dir_key, Value::Image(RelDirExists::Exists.encode()), ); if !dbdir_exists { self.pending_directory_entries .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); } self.pending_directory_entries .push((DirectoryKind::RelV2, MetricsUpdate::Add(1))); Ok(()) } /// Create a relation fork. /// /// 'nblocks' is the initial size. pub async fn put_rel_creation( &mut self, rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, ) -> Result<(), WalIngestError> { if rel.relnode == 0 { Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!( "invalid relnode" )))?; } // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?; let dbdir_exists = if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { // Didn't exist. 
Update dbdir e.insert(false); let buf = DbDirectory::ser(&dbdir)?; self.pending_directory_entries.push(( DirectoryKind::Db, MetricsUpdate::Set(dbdir.dbdirs.len() as u64), )); self.put(DBDIR_KEY, Value::Image(buf.into())); false } else { true }; let mut v2_mode = self .maybe_enable_rel_size_v2(true) .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; if v2_mode.initialize { if let Err(e) = self.initialize_rel_size_v2_keyspace(ctx, &dbdir).await { tracing::warn!("error initializing rel_size_v2 keyspace: {}", e); // TODO: circuit breaker so that it won't retry forever } else { v2_mode.current_status = RelSizeMigration::Migrating; } } if v2_mode.current_status != RelSizeMigration::Migrated { self.put_rel_creation_v1(rel, dbdir_exists, ctx).await?; } if v2_mode.current_status != RelSizeMigration::Legacy { let write_v2_res = self.put_rel_creation_v2(rel, dbdir_exists, ctx).await; if let Err(e) = write_v2_res { if v2_mode.current_status == RelSizeMigration::Migrated { return Err(e); } tracing::warn!("error writing rel_size_v2 keyspace: {}", e); } } // Put size let size_key = rel_size_to_key(rel); let buf = nblocks.to_le_bytes(); self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); self.pending_nblocks += nblocks as i64; // Update relation size cache self.tline.set_cached_rel_size(rel, self.lsn, nblocks); // Even if nblocks > 0, we don't insert any actual blocks here. That's up to the // caller. Ok(()) } /// Truncate relation pub async fn put_rel_truncation( &mut self, rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, ) -> Result<(), WalIngestError> { ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); if self .tline .get_rel_exists(rel, Version::Modified(self), ctx) .await? { let size_key = rel_size_to_key(rel); // Fetch the old size first let old_size = self.get(size_key, ctx).await?.get_u32_le(); // Update the entry with the new size. 
let buf = nblocks.to_le_bytes(); self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); // Update relation size cache self.tline.set_cached_rel_size(rel, self.lsn, nblocks); // Update logical database size. self.pending_nblocks -= old_size as i64 - nblocks as i64; } Ok(()) } /// Extend relation /// If new size is smaller, do nothing. pub async fn put_rel_extend( &mut self, rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, ) -> Result<(), WalIngestError> { ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); // Put size let size_key = rel_size_to_key(rel); let old_size = self.get(size_key, ctx).await?.get_u32_le(); // only extend relation here. never decrease the size if nblocks > old_size { let buf = nblocks.to_le_bytes(); self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); // Update relation size cache self.tline.set_cached_rel_size(rel, self.lsn, nblocks); self.pending_nblocks += nblocks as i64 - old_size as i64; } Ok(()) } async fn put_rel_drop_v1( &mut self, drop_relations: HashMap<(u32, u32), Vec>, ctx: &RequestContext, ) -> Result, WalIngestError> { let mut dropped_rels = BTreeSet::new(); for ((spc_node, db_node), rel_tags) in drop_relations { let dir_key = rel_dir_to_key(spc_node, db_node); let buf = self.get(dir_key, ctx).await?; let mut dir = RelDirectory::des(&buf)?; let mut dirty = false; for rel_tag in rel_tags { let found = if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) { self.pending_directory_entries .push((DirectoryKind::Rel, MetricsUpdate::Sub(1))); dirty = true; dropped_rels.insert(rel_tag); true } else { false }; if found { // update logical size let size_key = rel_size_to_key(rel_tag); let old_size = self.get(size_key, ctx).await?.get_u32_le(); self.pending_nblocks -= old_size as i64; // Remove entry from relation size cache self.tline.remove_cached_rel_size(&rel_tag); // Delete size entry, as well as all blocks; this is currently a no-op because we haven't implemented tombstones in storage. 
self.delete(rel_key_range(rel_tag)); } } if dirty { self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?))); } } Ok(dropped_rels) } async fn put_rel_drop_v2( &mut self, drop_relations: HashMap<(u32, u32), Vec>, ctx: &RequestContext, ) -> Result, WalIngestError> { let mut dropped_rels = BTreeSet::new(); for ((spc_node, db_node), rel_tags) in drop_relations { for rel_tag in rel_tags { let key = rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum); let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?) .map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?; if val == RelDirExists::Exists { dropped_rels.insert(rel_tag); self.pending_directory_entries .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1))); // put tombstone self.put(key, Value::Image(RelDirExists::Removed.encode())); } } } Ok(dropped_rels) } /// Drop some relations pub(crate) async fn put_rel_drops( &mut self, drop_relations: HashMap<(u32, u32), Vec>, ctx: &RequestContext, ) -> Result<(), WalIngestError> { let v2_mode = self .maybe_enable_rel_size_v2(false) .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; match v2_mode.current_status { RelSizeMigration::Legacy => { self.put_rel_drop_v1(drop_relations, ctx).await?; } RelSizeMigration::Migrating => { let dropped_rels_v1 = self.put_rel_drop_v1(drop_relations.clone(), ctx).await?; let dropped_rels_v2_res = self.put_rel_drop_v2(drop_relations, ctx).await; match dropped_rels_v2_res { Ok(dropped_rels_v2) => { if dropped_rels_v1 != dropped_rels_v2 { tracing::warn!( "inconsistent v1/v2 rel drop: dropped_rels_v1.len()={}, dropped_rels_v2.len()={}", dropped_rels_v1.len(), dropped_rels_v2.len() ); } } Err(e) => { tracing::warn!("error dropping rels: {}", e); } } } RelSizeMigration::Migrated => { self.put_rel_drop_v2(drop_relations, ctx).await?; } } Ok(()) } pub async fn put_slru_segment_creation( &mut self, kind: SlruKind, segno: u32, nblocks: BlockNumber, ctx: &RequestContext, ) -> Result<(), WalIngestError> 
{ assert!(self.tline.tenant_shard_id.is_shard_zero()); // Add it to the directory entry let dir_key = slru_dir_to_key(kind); let buf = self.get(dir_key, ctx).await?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.insert(segno) { Err(WalIngestErrorKind::SlruAlreadyExists(kind, segno))?; } self.pending_directory_entries.push(( DirectoryKind::SlruSegment(kind), MetricsUpdate::Set(dir.segments.len() as u64), )); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), ); // Put size let size_key = slru_segment_size_to_key(kind, segno); let buf = nblocks.to_le_bytes(); self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); // even if nblocks > 0, we don't insert any actual blocks here Ok(()) } /// Extend SLRU segment pub fn put_slru_extend( &mut self, kind: SlruKind, segno: u32, nblocks: BlockNumber, ) -> Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); // Put size let size_key = slru_segment_size_to_key(kind, segno); let buf = nblocks.to_le_bytes(); self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); Ok(()) } /// This method is used for marking truncated SLRU files pub async fn drop_slru_segment( &mut self, kind: SlruKind, segno: u32, ctx: &RequestContext, ) -> Result<(), WalIngestError> { // Remove it from the directory entry let dir_key = slru_dir_to_key(kind); let buf = self.get(dir_key, ctx).await?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.remove(&segno) { warn!("slru segment {:?}/{} does not exist", kind, segno); } self.pending_directory_entries.push(( DirectoryKind::SlruSegment(kind), MetricsUpdate::Set(dir.segments.len() as u64), )); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), ); // Delete size entry, as well as all blocks self.delete(slru_segment_key_range(kind, segno)); Ok(()) } /// Drop a relmapper file (pg_filenode.map) pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<(), WalIngestError> 
{ // TODO Ok(()) } /// Drop a two-phase state file: remove its xid from the twophase directory and delete its key range pub async fn drop_twophase_file( &mut self, xid: u64, ctx: &RequestContext, ) -> Result<(), WalIngestError> { // Remove it from the directory entry let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; let newdirbuf = if self.tline.pg_version >= PgMajorVersion::PG17 { let mut dir = TwoPhaseDirectoryV17::des(&buf)?; if !dir.xids.remove(&xid) { warn!("twophase file for xid {} does not exist", xid); } self.pending_directory_entries.push(( DirectoryKind::TwoPhase, MetricsUpdate::Set(dir.xids.len() as u64), )); Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) } else { let xid: u32 = u32::try_from(xid) .map_err(|e| WalIngestErrorKind::LogicalError(anyhow::Error::from(e)))?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.remove(&xid) { warn!("twophase file for xid {} does not exist", xid); } self.pending_directory_entries.push(( DirectoryKind::TwoPhase, MetricsUpdate::Set(dir.xids.len() as u64), )); Bytes::from(TwoPhaseDirectory::ser(&dir)?) }; self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); // Delete it self.delete(twophase_key_range(xid)); Ok(()) } pub async fn put_file( &mut self, path: &str, content: &[u8], ctx: &RequestContext, ) -> Result<(), WalIngestError> { let key = aux_file::encode_aux_file_key(path); // retrieve the key from the engine let old_val = match self.get(key, ctx).await { Ok(val) => Some(val), Err(PageReconstructError::MissingKey(_)) => None, Err(e) => return Err(e.into()), }; let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val { aux_file::decode_file_value(old_val).map_err(WalIngestErrorKind::EncodeAuxFileError)?
} else { Vec::new() }; let mut other_files = Vec::with_capacity(files.len()); let mut modifying_file = None; for file @ (p, content) in files { if path == p { assert!( modifying_file.is_none(), "duplicated entries found for {path}" ); modifying_file = Some(content); } else { other_files.push(file); } } let mut new_files = other_files; match (modifying_file, content.is_empty()) { (Some(old_content), false) => { self.tline .aux_file_size_estimator .on_update(old_content.len(), content.len()); new_files.push((path, content)); } (Some(old_content), true) => { self.tline .aux_file_size_estimator .on_remove(old_content.len()); // not adding the file key to the final `new_files` vec. } (None, false) => { self.tline.aux_file_size_estimator.on_add(content.len()); new_files.push((path, content)); } // Compute may request delete of old version of pgstat AUX file if new one exceeds size limit. // Compute doesn't know if previous version of this file exists or not, so // attempt to delete non-existing file can cause this message. // To avoid false alarms, log it as info rather than warning. (None, true) if path.starts_with("pg_stat/") => { info!("removing non-existing pg_stat file: {}", path) } (None, true) => warn!("removing non-existing aux file: {}", path), } let new_val = aux_file::encode_file_value(&new_files) .map_err(WalIngestErrorKind::EncodeAuxFileError)?; self.put(key, Value::Image(new_val.into())); Ok(()) } /// /// Flush changes accumulated so far to the underlying repository. /// /// Usually, changes made in DatadirModification are atomic, but this allows /// you to flush them to the underlying repository before the final `commit`. /// That allows to free up the memory used to hold the pending changes. /// /// Currently only used during bulk import of a data directory. In that /// context, breaking the atomicity is OK. If the import is interrupted, the /// whole import fails and the timeline will be deleted anyway. 
/// (Or to be precise, it will be left behind for debugging purposes and /// ignored, see ) /// /// Note: A consequence of flushing the pending operations is that they /// won't be visible to subsequent operations until `commit`. The function /// retains all the metadata, but data pages are flushed. That's again OK /// for bulk import, where you are just loading data pages and won't try to /// modify the same pages twice. pub(crate) async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { // Unless we have accumulated a decent amount of changes, it's not worth it // to scan through the pending_updates list. let pending_nblocks = self.pending_nblocks; if pending_nblocks < 10000 { return Ok(()); } let mut writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. if let Some(batch) = self.pending_data_batch.take() { tracing::debug!( "Flushing batch with max_lsn={}. Last record LSN is {}", batch.max_lsn, self.tline.get_last_record_lsn() ); // This bails out on first error without modifying pending_updates. // That's Ok, cf this function's doc comment. writer.put_batch(batch, ctx).await?; } if pending_nblocks != 0 { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); self.pending_nblocks = 0; } for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { writer.update_directory_entries_count(kind, count); } Ok(()) } /// /// Finish this atomic update, writing all the updated keys to the /// underlying timeline. /// All the modifications in this atomic update are stamped by the specified LSN. /// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { let mut writer = self.tline.writer().await; let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; // Ordering: the items in this batch do not need to be in any global order, but values for // a particular Key must be in Lsn order relative to one another. 
InMemoryLayer relies on // this to do efficient updates to its index. See [`wal_decoder::serialized_batch`] for // more details. let metadata_batch = { let pending_meta = self .pending_metadata_pages .drain() .flat_map(|(key, values)| { values .into_iter() .map(move |(lsn, value_size, value)| (key, lsn, value_size, value)) }) .collect::>(); if pending_meta.is_empty() { None } else { Some(SerializedValueBatch::from_values(pending_meta)) } }; let data_batch = self.pending_data_batch.take(); let maybe_batch = match (data_batch, metadata_batch) { (Some(mut data), Some(metadata)) => { data.extend(metadata); Some(data) } (Some(data), None) => Some(data), (None, Some(metadata)) => Some(metadata), (None, None) => None, }; if let Some(batch) = maybe_batch { tracing::debug!( "Flushing batch with max_lsn={}. Last record LSN is {}", batch.max_lsn, self.tline.get_last_record_lsn() ); // This bails out on first error without modifying pending_updates. // That's Ok, cf this function's doc comment. writer.put_batch(batch, ctx).await?; } if !self.pending_deletions.is_empty() { writer.delete_batch(&self.pending_deletions, ctx).await?; self.pending_deletions.clear(); } self.pending_lsns.push(self.lsn); for pending_lsn in self.pending_lsns.drain(..) { // TODO(vlad): pretty sure the comment below is not valid anymore // and we can call finish write with the latest LSN // // Ideally, we should be able to call writer.finish_write() only once // with the highest LSN. However, the last_record_lsn variable in the // timeline keeps track of the latest LSN and the immediate previous LSN // so we need to record every LSN to not leave a gap between them. 
writer.finish_write(pending_lsn); } if pending_nblocks != 0 { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); } for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { writer.update_directory_entries_count(kind, count); } self.pending_metadata_bytes = 0; Ok(()) } pub(crate) fn len(&self) -> usize { self.pending_metadata_pages.len() + self.pending_data_batch.as_ref().map_or(0, |b| b.len()) + self.pending_deletions.len() } /// Read a page from the Timeline we are writing to. For metadata pages, this passes through /// a cache in Self, which makes writes earlier in this modification visible to WAL records later /// in the modification. /// /// For data pages, reads pass directly to the owning Timeline: any ingest code which reads a data /// page must ensure that the pages they read are already committed in Timeline, for example /// DB create operations are always preceded by a call to commit(). This is special cased because /// it's rare: all the 'normal' WAL operations will only read metadata pages such as relation sizes, /// and not data pages. async fn get(&self, key: Key, ctx: &RequestContext) -> Result { if !Self::is_data_key(&key) { // Have we already updated the same key? Read the latest pending updated // version in that case. // // Note: we don't check pending_deletions. It is an error to request a // value that has been removed, deletion only avoids leaking storage. if let Some(values) = self.pending_metadata_pages.get(&key.to_compact()) { if let Some((_, _, value)) = values.last() { return if let Value::Image(img) = value { Ok(img.clone()) } else { // Currently, we never need to read back a WAL record that we // inserted in the same "transaction". All the metadata updates // work directly with Images, and we never need to read actual // data pages. We could handle this if we had to, by calling // the walredo manager, but let's keep it simple for now. 
Err(PageReconstructError::Other(anyhow::anyhow!( "unexpected pending WAL record" ))) }; } } } else { // This is an expensive check, so we only do it in debug mode. If reading a data key, // this key should never be present in pending_data_pages. We ensure this by committing // modifications before ingesting DB create operations, which are the only kind that reads // data pages during ingest. if cfg!(debug_assertions) { assert!( !self .pending_data_batch .as_ref() .is_some_and(|b| b.updates_key(&key)) ); } } // Metadata page cache miss, or we're reading a data page. let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); self.tline.get(key, lsn, ctx).await } /// Get a key from the sparse keyspace. Automatically converts the missing key error /// and the empty value into None. async fn sparse_get( &self, key: Key, ctx: &RequestContext, ) -> Result, PageReconstructError> { let val = self.get(key, ctx).await; match val { Ok(val) if val.is_empty() => Ok(None), Ok(val) => Ok(Some(val)), Err(PageReconstructError::MissingKey(_)) => Ok(None), Err(e) => Err(e), } } #[cfg(test)] pub fn put_for_unit_test(&mut self, key: Key, val: Value) { self.put(key, val); } fn put(&mut self, key: Key, val: Value) { if Self::is_data_key(&key) { self.put_data(key.to_compact(), val) } else { self.put_metadata(key.to_compact(), val) } } fn put_data(&mut self, key: CompactKey, val: Value) { let batch = self .pending_data_batch .get_or_insert_with(SerializedValueBatch::default); batch.put(key, val, self.lsn); } fn put_metadata(&mut self, key: CompactKey, val: Value) { let values = self.pending_metadata_pages.entry(key).or_default(); // Replace the previous value if it exists at the same lsn if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() { if *last_lsn == self.lsn { // Update the pending_metadata_bytes contribution from this entry, and update the serialized size in place self.pending_metadata_bytes -= *last_value_ser_size; *last_value_ser_size = 
val.serialized_size().unwrap() as usize; self.pending_metadata_bytes += *last_value_ser_size; // Use the latest value, this replaces any earlier write to the same (key,lsn), such as might // have been generated by synthesized zero page writes prior to the first real write to a page. *last_value = val; return; } } let val_serialized_size = val.serialized_size().unwrap() as usize; self.pending_metadata_bytes += val_serialized_size; values.push((self.lsn, val_serialized_size, val)); if key == CHECKPOINT_KEY.to_compact() { tracing::debug!("Checkpoint key added to pending with size {val_serialized_size}"); } } fn delete(&mut self, key_range: Range) { trace!("DELETE {}-{}", key_range.start, key_range.end); self.pending_deletions.push((key_range, self.lsn)); } } /// Statistics for a DatadirModification. #[derive(Default)] pub struct DatadirModificationStats { pub metadata_images: u64, pub metadata_deltas: u64, pub data_images: u64, pub data_deltas: u64, } /// This struct facilitates accessing either a committed key from the timeline at a /// specific LSN, or the latest uncommitted key from a pending modification. /// /// During WAL ingestion, the records from multiple LSNs may be batched in the same /// modification before being flushed to the timeline. Hence, the routines in WalIngest /// need to look up the keys in the modification first before looking them up in the /// timeline to not miss the latest updates. #[derive(Clone, Copy)] pub enum Version<'a> { LsnRange(LsnRange), Modified(&'a DatadirModification<'a>), } impl Version<'_> { async fn get( &self, timeline: &Timeline, key: Key, ctx: &RequestContext, ) -> Result { match self { Version::LsnRange(lsns) => timeline.get(key, lsns.effective_lsn, ctx).await, Version::Modified(modification) => modification.get(key, ctx).await, } } /// Get a key from the sparse keyspace. Automatically converts the missing key error /// and the empty value into None.
async fn sparse_get( &self, timeline: &Timeline, key: Key, ctx: &RequestContext, ) -> Result, PageReconstructError> { let val = self.get(timeline, key, ctx).await; match val { Ok(val) if val.is_empty() => Ok(None), Ok(val) => Ok(Some(val)), Err(PageReconstructError::MissingKey(_)) => Ok(None), Err(e) => Err(e), } } pub fn is_latest(&self) -> bool { match self { Version::LsnRange(lsns) => lsns.is_latest(), Version::Modified(_) => true, } } pub fn get_lsn(&self) -> Lsn { match self { Version::LsnRange(lsns) => lsns.effective_lsn, Version::Modified(modification) => modification.lsn, } } pub fn at(lsn: Lsn) -> Self { Version::LsnRange(LsnRange { effective_lsn: lsn, request_lsn: lsn, }) } } //--- Metadata structs stored in key-value pairs in the repository. #[derive(Debug, Serialize, Deserialize)] pub(crate) struct DbDirectory { // (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist) pub(crate) dbdirs: HashMap<(Oid, Oid), bool>, } // The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of // pg_twophase files were expanded from 32-bit XIDs to 64-bit XIDs. Previously, the files // were named like "pg_twophase/000002E5", now they're like // "pg_twophase/0000000A000002E4". #[derive(Debug, Serialize, Deserialize)] pub(crate) struct TwoPhaseDirectory { pub(crate) xids: HashSet, } #[derive(Debug, Serialize, Deserialize)] struct TwoPhaseDirectoryV17 { xids: HashSet, } #[derive(Debug, Serialize, Deserialize, Default)] pub(crate) struct RelDirectory { // Set of relations that exist. (relfilenode, forknum) // // TODO: Store it as a btree or radix tree or something else that spans multiple // key-value pairs, if you have a lot of relations pub(crate) rels: HashSet<(Oid, u8)>, } #[derive(Debug, Serialize, Deserialize)] struct RelSizeEntry { nblocks: u32, } #[derive(Debug, Serialize, Deserialize, Default)] pub(crate) struct SlruSegmentDirectory { // Set of SLRU segments that exist.
pub(crate) segments: HashSet, } #[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)] #[repr(u8)] pub(crate) enum DirectoryKind { Db, TwoPhase, Rel, AuxFiles, SlruSegment(SlruKind), RelV2, } impl DirectoryKind { pub(crate) const KINDS_NUM: usize = ::LENGTH; pub(crate) fn offset(&self) -> usize { self.into_usize() } } static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { use hex_literal::hex; use pageserver_api::models::ShardParameters; use utils::id::TimelineId; use utils::shard::{ShardCount, ShardNumber, ShardStripeSize}; use super::*; use crate::DEFAULT_PG_VERSION; use crate::tenant::harness::TenantHarness; /// Test a round trip of aux file updates, from DatadirModification to reading back from the Timeline #[tokio::test] async fn aux_files_round_trip() -> anyhow::Result<()> { let name = "aux_files_round_trip"; let harness = TenantHarness::create(name).await?; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); let (tenant, ctx) = harness.load().await; let (tline, ctx) = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; let tline = tline.raw_timeline().unwrap(); // First modification: insert two keys let mut modification = tline.begin_modification(Lsn(0x1000)); modification.put_file("foo/bar1", b"content1", &ctx).await?; modification.set_lsn(Lsn(0x1008))?; modification.put_file("foo/bar2", b"content2", &ctx).await?; modification.commit(&ctx).await?; let expect_1008 = HashMap::from([ ("foo/bar1".to_string(), Bytes::from_static(b"content1")), ("foo/bar2".to_string(), Bytes::from_static(b"content2")), ]); let io_concurrency = IoConcurrency::spawn_for_test(); let readback = tline .list_aux_files(Lsn(0x1008), &ctx, io_concurrency.clone()) .await?; assert_eq!(readback, expect_1008); // Second modification: update one key, remove the other let mut modification = 
tline.begin_modification(Lsn(0x2000)); modification.put_file("foo/bar1", b"content3", &ctx).await?; modification.set_lsn(Lsn(0x2008))?; modification.put_file("foo/bar2", b"", &ctx).await?; modification.commit(&ctx).await?; let expect_2008 = HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]); let readback = tline .list_aux_files(Lsn(0x2008), &ctx, io_concurrency.clone()) .await?; assert_eq!(readback, expect_2008); // Reading back in time works let readback = tline .list_aux_files(Lsn(0x1008), &ctx, io_concurrency.clone()) .await?; assert_eq!(readback, expect_1008); Ok(()) } #[test] fn gap_finding() { let rel = RelTag { spcnode: 1663, dbnode: 208101, relnode: 2620, forknum: 0, }; let base_blkno = 1; let base_key = rel_block_to_key(rel, base_blkno); let before_base_key = rel_block_to_key(rel, base_blkno - 1); let shard = ShardIdentity::unsharded(); let mut previous_nblocks = 0; for i in 0..10 { let crnt_blkno = base_blkno + i; let gaps = DatadirModification::find_gaps(rel, crnt_blkno, previous_nblocks, &shard); previous_nblocks = crnt_blkno + 1; if i == 0 { // The first block we write is 1, so we should find the gap. assert_eq!(gaps.unwrap(), KeySpace::single(before_base_key..base_key)); } else { assert!(gaps.is_none()); } } // This is an update to an already existing block. No gaps here. let update_blkno = 5; let gaps = DatadirModification::find_gaps(rel, update_blkno, previous_nblocks, &shard); assert!(gaps.is_none()); // This is an update past the current end block. 
let after_gap_blkno = 20; let gaps = DatadirModification::find_gaps(rel, after_gap_blkno, previous_nblocks, &shard); let gap_start_key = rel_block_to_key(rel, previous_nblocks); let after_gap_key = rel_block_to_key(rel, after_gap_blkno); assert_eq!( gaps.unwrap(), KeySpace::single(gap_start_key..after_gap_key) ); } #[test] fn sharded_gap_finding() { let rel = RelTag { spcnode: 1663, dbnode: 208101, relnode: 2620, forknum: 0, }; let first_blkno = 6; // This shard will get the even blocks let shard = ShardIdentity::from_params( ShardNumber(0), ShardParameters { count: ShardCount(2), stripe_size: ShardStripeSize(1), }, ); // Only keys belonging to this shard are considered as gaps. let mut previous_nblocks = 0; let gaps = DatadirModification::find_gaps(rel, first_blkno, previous_nblocks, &shard).unwrap(); assert!(!gaps.ranges.is_empty()); for gap_range in gaps.ranges { let mut k = gap_range.start; while k != gap_range.end { assert_eq!(shard.get_shard_number(&k), shard.number); k = k.next(); } } previous_nblocks = first_blkno; let update_blkno = 2; let gaps = DatadirModification::find_gaps(rel, update_blkno, previous_nblocks, &shard); assert!(gaps.is_none()); } /* fn assert_current_logical_size(timeline: &DatadirTimeline, lsn: Lsn) { let incremental = timeline.get_current_logical_size(); let non_incremental = timeline .get_current_logical_size_non_incremental(lsn) .unwrap(); assert_eq!(incremental, non_incremental); } */ /* /// /// Test list_rels() function, with branches and dropped relations /// #[test] fn test_list_rels_drop() -> Result<()> { let repo = RepoHarness::create("test_list_rels_drop")?.load(); let tline = create_empty_timeline(repo, TIMELINE_ID)?; const TESTDB: u32 = 111; // Import initial dummy checkpoint record, otherwise the get_timeline() call // after branching fails below let mut writer = tline.begin_record(Lsn(0x10)); writer.put_checkpoint(ZERO_CHECKPOINT.clone())?; writer.finish()?; // Create a relation on the timeline let mut writer = 
tline.begin_record(Lsn(0x20)); writer.put_rel_page_image(TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; writer.finish()?; let writer = tline.begin_record(Lsn(0x00)); writer.finish()?; // Check that list_rels() lists it after LSN 2, but no before it assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&TESTREL_A)); assert!(tline.list_rels(0, TESTDB, Lsn(0x20))?.contains(&TESTREL_A)); assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A)); // Create a branch, check that the relation is visible there repo.branch_timeline(&tline, NEW_TIMELINE_ID, Lsn(0x30))?; let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { Some(timeline) => timeline, None => panic!("Should have a local timeline"), }; let newtline = DatadirTimelineImpl::new(newtline); assert!(newtline .list_rels(0, TESTDB, Lsn(0x30))? .contains(&TESTREL_A)); // Drop it on the branch let mut new_writer = newtline.begin_record(Lsn(0x40)); new_writer.drop_relation(TESTREL_A)?; new_writer.finish()?; // Check that it's no longer listed on the branch after the point where it was dropped assert!(newtline .list_rels(0, TESTDB, Lsn(0x30))? .contains(&TESTREL_A)); assert!(!newtline .list_rels(0, TESTDB, Lsn(0x40))? .contains(&TESTREL_A)); // Run checkpoint and garbage collection and check that it's still not visible newtline.checkpoint(CheckpointConfig::Forced)?; repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?; assert!(!newtline .list_rels(0, TESTDB, Lsn(0x40))? .contains(&TESTREL_A)); Ok(()) } */ /* #[test] fn test_read_beyond_eof() -> Result<()> { let repo = RepoHarness::create("test_read_beyond_eof")?.load(); let tline = create_test_timeline(repo, TIMELINE_ID)?; make_some_layers(&tline, Lsn(0x20))?; let mut writer = tline.begin_record(Lsn(0x60)); walingest.put_rel_page_image( &mut writer, TESTREL_A, 0, TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x60))), )?; writer.finish()?; // Test read before rel creation. Should error out. 
assert!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x10), false).is_err()); // Read block beyond end of relation at different points in time. // These reads should fall into different delta, image, and in-memory layers. assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x20), false)?, ZERO_PAGE); assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x25), false)?, ZERO_PAGE); assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x30), false)?, ZERO_PAGE); assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x35), false)?, ZERO_PAGE); assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)?, ZERO_PAGE); assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x45), false)?, ZERO_PAGE); assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)?, ZERO_PAGE); assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x55), false)?, ZERO_PAGE); assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)?, ZERO_PAGE); // Test on an in-memory layer with no preceding layer let mut writer = tline.begin_record(Lsn(0x70)); walingest.put_rel_page_image( &mut writer, TESTREL_B, 0, TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x70))), )?; writer.finish()?; assert_eq!(tline.get_rel_page_at_lsn(TESTREL_B, 1, Lsn(0x70), false)?6, ZERO_PAGE); Ok(()) } */ } ================================================ FILE: pageserver/src/span.rs ================================================ use utils::tracing_span_assert::check_fields_present; mod extractors { use utils::tracing_span_assert::ConstExtractor; pub(super) const TENANT_ID: ConstExtractor = ConstExtractor::new("tenant_id"); pub(super) const SHARD_ID: ConstExtractor = ConstExtractor::new("shard_id"); pub(super) const TIMELINE_ID: ConstExtractor = ConstExtractor::new("timeline_id"); } #[track_caller] pub(crate) fn debug_assert_current_span_has_tenant_id() { if cfg!(debug_assertions) { if let Err(missing) = check_fields_present!([&extractors::TENANT_ID, &extractors::SHARD_ID]) { panic!("missing 
extractors: {missing:?}") } } } #[track_caller] pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { if cfg!(debug_assertions) { if let Err(missing) = check_fields_present!([ &extractors::TENANT_ID, &extractors::SHARD_ID, &extractors::TIMELINE_ID, ]) { panic!("missing extractors: {missing:?}") } } } #[track_caller] pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id() { if cfg!(debug_assertions) { if let Err(missing) = check_fields_present!([&extractors::TENANT_ID, &extractors::TIMELINE_ID,]) { panic!("missing extractors: {missing:?}") } } } ================================================ FILE: pageserver/src/statvfs.rs ================================================ //! Wrapper around nix::sys::statvfs::Statvfs that allows for mocking. use camino::Utf8Path; pub enum Statvfs { Real(nix::sys::statvfs::Statvfs), Mock(mock::Statvfs), } // NB: on macOS, the block count type of struct statvfs is u32. // The workaround seems to be to use the non-standard statfs64 call. // Since it should only be a problem on > 2TiB disks, let's ignore // the problem for now and upcast to u64. impl Statvfs { pub fn get(tenants_dir: &Utf8Path, mocked: Option<&mock::Behavior>) -> nix::Result { if let Some(mocked) = mocked { Ok(Statvfs::Mock(mock::get(tenants_dir, mocked)?)) } else { Ok(Statvfs::Real(nix::sys::statvfs::statvfs( tenants_dir.as_std_path(), )?)) } } // NB: allow() because the block count type is u32 on macOS.
/// Mocked statvfs backend.
///
/// Instead of asking the operating system, the numbers are derived from walking a
/// directory tree and comparing the bytes found there with a configured total size.
pub mod mock {
    use camino::Utf8Path;
    pub use pageserver_api::config::statvfs::mock::Behavior;
    use regex::Regex;
    use tracing::log::info;

    /// Produces a mocked [`Statvfs`] for `tenants_dir` according to `behavior`.
    ///
    /// * `Behavior::Success`: "used" space is the summed length of the regular files
    ///   found under `tenants_dir` (restricted to file names matching `name_filter`
    ///   when one is configured), rounded up to whole blocks of `blocksize`. The
    ///   available blocks are `total_blocks` minus the used blocks, and both
    ///   `fragment_size` and `block_size` are reported as `blocksize`.
    /// * `Behavior::Failure` (only compiled with the `testing` feature): returns
    ///   `mocked_error`, converted with `.into()`.
    ///
    /// # Panics
    ///
    /// Panics if walking the directory fails (the result is `unwrap()`ed) or if the
    /// used block count exceeds `total_blocks`.
    // NOTE(review): the generic argument of the return type appears to have been lost
    // when this text was extracted (presumably `nix::Result<Statvfs>`) - confirm
    // against the real file.
    pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result {
        info!("running mocked statvfs");

        match behavior {
            Behavior::Success {
                blocksize,
                total_blocks,
                name_filter,
            } => {
                let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap();

                // round it up to the nearest block multiple
                let used_blocks = used_bytes.div_ceil(*blocksize);

                // A mock that is "over-full" is treated as a configuration mistake
                // rather than being clamped to zero available blocks.
                if used_blocks > *total_blocks {
                    panic!(
                        "mocking error: used_blocks > total_blocks: {used_blocks} > {total_blocks}"
                    );
                }

                // Cannot underflow: used_blocks <= total_blocks was checked just above.
                let avail_blocks = total_blocks - used_blocks;

                Ok(Statvfs {
                    blocks: *total_blocks,
                    blocks_available: avail_blocks,
                    fragment_size: *blocksize,
                    block_size: *blocksize,
                })
            }
            #[cfg(feature = "testing")]
            Behavior::Failure { mocked_error } => Err((*mocked_error).into()),
        }
    }

    /// Sums `metadata().len()` over every entry below `path` that is reported as a
    /// regular file, optionally keeping only entries whose file name matches
    /// `name_filter`.
    ///
    /// Entries whose metadata lookup fails with a "not found" I/O error are skipped.
    /// Any other metadata error, and any error yielded by the directory walk itself,
    /// is returned to the caller.
    ///
    /// # Panics
    ///
    /// When a filter is given, panics if a file name is not valid UTF-8
    /// (`to_str().unwrap()`).
    // NOTE(review): the generic argument of the return type appears to have been lost
    // in extraction (presumably `anyhow::Result<u64>`, since `total` accumulates
    // `Metadata::len()`) - confirm against the real file.
    fn walk_dir_disk_usage(path: &Utf8Path, name_filter: Option<&Regex>) -> anyhow::Result {
        let mut total = 0;
        for entry in walkdir::WalkDir::new(path) {
            let entry = entry?;
            // Only regular files contribute to the total.
            if !entry.file_type().is_file() {
                continue;
            }
            // No filter means "count everything"; with a filter, skip non-matching names.
            if !name_filter
                .as_ref()
                .map(|filter| filter.is_match(entry.file_name().to_str().unwrap()))
                .unwrap_or(true)
            {
                continue;
            }
            let m = match entry.metadata() {
                Ok(m) => m,
                Err(e) if is_not_found(&e) => {
                    // some temp file which got removed right as we are walking
                    continue;
                }
                Err(e) => {
                    return Err(anyhow::Error::new(e)
                        .context(format!("get metadata of {:?}", entry.path())));
                }
            };
            total += m.len();
        }
        Ok(total)
    }

    /// Returns `true` only if `e` carries an underlying I/O error whose kind is
    /// `std::io::ErrorKind::NotFound`.
    fn is_not_found(e: &walkdir::Error) -> bool {
        // walkdir errors without an I/O cause are never treated as "not found".
        let Some(io_error) = e.io_error() else {
            return false;
        };
        let kind = io_error.kind();
        matches!(kind, std::io::ErrorKind::NotFound)
    }

    /// Plain-data result of the mock, filled in by [`get`].
    pub struct Statvfs {
        // Total number of blocks (`total_blocks` from the configured behavior).
        pub blocks: u64,
        // Blocks left after subtracting the blocks occupied by the walked files.
        pub blocks_available: u64,
        // Set to the configured `blocksize`.
        pub fragment_size: u64,
        // Set to the configured `blocksize`.
        pub block_size: u64,
    }
}
You can use that with //! Tokio select!(). //! //! TODO: This would be a good place to also handle panics in a somewhat sane way. //! Depending on what task panics, we might want to kill the whole server, or //! only a single tenant or timeline. //! use std::collections::HashMap; use std::fmt; use std::future::Future; use std::num::NonZeroUsize; use std::panic::AssertUnwindSafe; use std::str::FromStr; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use std::time::Duration; use futures::FutureExt; use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; use tokio::task::JoinHandle; use tokio::task_local; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, warn}; use utils::env; use utils::id::TimelineId; use crate::metrics::set_tokio_runtime_setup; // // There are four runtimes: // // Compute request runtime // - used to handle connections from compute nodes. Any tasks related to satisfying // GetPage requests, base backups, import, and other such compute node operations // are handled by the Compute request runtime // - page_service.rs // - this includes layer downloads from remote storage, if a layer is needed to // satisfy a GetPage request // // Management request runtime // - used to handle HTTP API requests // // WAL receiver runtime: // - used to handle WAL receiver connections. // - and to receive updates from storage_broker // // Background runtime // - layer flushing // - garbage collection // - compaction // - remote storage uploads // - initial tenant loading // // Everything runs in a tokio task. If you spawn new tasks, spawn them using the correct // runtime. // // There might be situations when one task needs to wait for a task running in another // Runtime to finish. For example, if a background operation needs a layer from remote // storage, it will start to download it.
If a background operation needs a remote layer, // and the download was already initiated by a GetPage request, the background task // will wait for the download - running in the Page server runtime - to finish. // Another example: the initial tenant loading tasks are launched in the background ops // runtime. If a GetPage request comes in before the load of a tenant has finished, the // GetPage request will wait for the tenant load to finish. // // The core Timeline code is synchronous, and uses a bunch of std Mutexes and RWLocks to // protect data structures. Let's keep it that way. Synchronous code is easier to debug // and analyze, and there's a lot of hairy, low-level, performance critical code there. // // It's nice to have different runtimes, so that you can quickly eyeball how much CPU // time each class of operations is taking, with 'top -H' or similar. // // It's also good to avoid hogging all threads that would be needed to process // other operations, if the upload tasks e.g. get blocked on locks. It shouldn't // happen, but still. // pub(crate) static TOKIO_WORKER_THREADS: Lazy = Lazy::new(|| { // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly // tokio would had already panicked for parsing errors or NotUnicode // // this will be wrong if any of the runtimes gets their worker threads configured to something // else, but that has not been needed in a long time. 
NonZeroUsize::new( std::env::var("TOKIO_WORKER_THREADS") .map(|s| s.parse::().unwrap()) .unwrap_or_else(|_e| usize::max(2, num_cpus::get())), ) .expect("the max() ensures that this is not zero") }); enum TokioRuntimeMode { SingleThreaded, MultiThreaded { num_workers: NonZeroUsize }, } impl FromStr for TokioRuntimeMode { type Err = String; fn from_str(s: &str) -> Result { match s { "current_thread" => Ok(TokioRuntimeMode::SingleThreaded), s => match s.strip_prefix("multi_thread:") { Some("default") => Ok(TokioRuntimeMode::MultiThreaded { num_workers: *TOKIO_WORKER_THREADS, }), Some(suffix) => { let num_workers = suffix.parse::().map_err(|e| { format!( "invalid number of multi-threaded runtime workers ({suffix:?}): {e}", ) })?; Ok(TokioRuntimeMode::MultiThreaded { num_workers }) } None => Err(format!("invalid runtime config: {s:?}")), }, } } } static TOKIO_THREAD_STACK_SIZE: Lazy = Lazy::new(|| { env::var("NEON_PAGESERVER_TOKIO_THREAD_STACK_SIZE") // the default 2MiB are insufficent, especially in debug mode .unwrap_or_else(|| NonZeroUsize::new(4 * 1024 * 1024).unwrap()) }); static ONE_RUNTIME: Lazy> = Lazy::new(|| { let thread_name = "pageserver-tokio"; let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else { // If the env var is not set, leave this static as None. 
set_tokio_runtime_setup( "multiple-runtimes", NUM_MULTIPLE_RUNTIMES .checked_mul(*TOKIO_WORKER_THREADS) .unwrap(), ); return None; }; Some(match mode { TokioRuntimeMode::SingleThreaded => { set_tokio_runtime_setup("one-runtime-single-threaded", NonZeroUsize::new(1).unwrap()); tokio::runtime::Builder::new_current_thread() .thread_name(thread_name) .enable_all() .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get()) .build() .expect("failed to create one single runtime") } TokioRuntimeMode::MultiThreaded { num_workers } => { set_tokio_runtime_setup("one-runtime-multi-threaded", num_workers); tokio::runtime::Builder::new_multi_thread() .thread_name(thread_name) .enable_all() .worker_threads(num_workers.get()) .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get()) .build() .expect("failed to create one multi-threaded runtime") } }) }); /// Declare a lazy static variable named `$varname` that will resolve /// to a tokio runtime handle. If the env var `NEON_PAGESERVER_USE_ONE_RUNTIME` /// is set, this will resolve to `ONE_RUNTIME`. Otherwise, the macro invocation /// declares a separate runtime and the lazy static variable `$varname` /// will resolve to that separate runtime. /// /// The result is is that `$varname.spawn()` will use `ONE_RUNTIME` if /// `NEON_PAGESERVER_USE_ONE_RUNTIME` is set, and will use the separate runtime /// otherwise. macro_rules! 
/// Identifier handed out for each task tracked by this module.
///
/// It wraps a plain `u64` and is displayed as that number.
#[derive(Debug, Clone, Copy)]
pub struct PageserverTaskId(u64);

impl fmt::Display for PageserverTaskId {
    /// Delegates to the inner integer's `Display` implementation, so formatter flags
    /// such as width or zero-padding apply to the number itself.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self(raw) = self;
        fmt::Display::fmt(raw, f)
    }
}
/// #[derive( Debug, // NB: enumset::EnumSetType derives PartialEq, Eq, Clone, Copy enumset::EnumSetType, enum_map::Enum, serde::Serialize, serde::Deserialize, strum_macros::IntoStaticStr, strum_macros::EnumString, )] pub enum TaskKind { // Pageserver startup, i.e., `main` Startup, // libpq listener task. It just accepts connection and spawns a // PageRequestHandler task for each connection. LibpqEndpointListener, // HTTP endpoint listener. HttpEndpointListener, /// Task that handles a single page service connection. A PageRequestHandler /// task starts detached from any particular tenant or timeline, but it can /// be associated with one later, after receiving a command from the client. /// Also used for the gRPC page service API, including the main server task. PageRequestHandler, /// Manages the WAL receiver connection for one timeline. /// It subscribes to events from storage_broker and decides which safekeeper to connect to. /// Once the decision has been made, it establishes the connection using the `tokio-postgres` library. /// There is at most one connection at any given time. /// /// That `tokio-postgres` library represents a connection as two objects: a `Client` and a `Connection`. /// The `Client` object is what library users use to make requests & get responses. /// Internally, `Client` hands over requests to the `Connection` object. /// The `Connection` object is responsible for speaking the wire protocol. /// /// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task. /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind. /// /// Once the connection is established, the `TaskHandle` task spawns a /// [`WalReceiverConnectionPoller`] task that is responsible for polling /// the `Connection` object. 
/// A `CancellationToken` created by the `TaskHandle` task ensures /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped. /// /// [`WalReceiverConnectionHandler`]: Self::WalReceiverConnectionHandler /// [`WalReceiverConnectionPoller`]: Self::WalReceiverConnectionPoller WalReceiverManager, /// The `TaskHandle` task that executes `handle_walreceiver_connection`. /// See the comment on [`WalReceiverManager`]. /// /// [`WalReceiverManager`]: Self::WalReceiverManager WalReceiverConnectionHandler, /// The task that polls the `tokio-postgres::Connection` object. /// Spawned by task [`WalReceiverConnectionHandler`](Self::WalReceiverConnectionHandler). /// See the comment on [`WalReceiverManager`](Self::WalReceiverManager). WalReceiverConnectionPoller, // Garbage collection worker. One per tenant GarbageCollector, // Compaction. One per tenant. Compaction, // Eviction. One per timeline. Eviction, // Tenant housekeeping (flush idle ephemeral layers, shut down idle walredo, etc.). TenantHousekeeping, /// See [`crate::disk_usage_eviction_task`]. DiskUsageEviction, /// See [`crate::tenant::secondary`]. SecondaryDownloads, /// See [`crate::tenant::secondary`]. SecondaryUploads, // Initial logical size calculation InitialLogicalSizeCalculation, OndemandLogicalSizeCalculation, // Task that flushes frozen in-memory layers to disk LayerFlushTask, // Task that uploads a file to remote storage RemoteUploadTask, // task that handles the initial downloading of all tenants InitialLoad, // task that handles attaching a tenant Attach, // Used mostly for background deletion from s3 TimelineDeletionWorker, // task that handhes metrics collection MetricsCollection, // task that drives downloading layers DownloadAllRemoteLayers, // Task that calculates synthetis size for all active tenants CalculateSyntheticSize, // A request that comes in via the pageserver HTTP API. 
/// Launch a new task on `runtime` and register it in the global task registry.
///
/// A fresh `CancellationToken` and the next value of `NEXT_TASK_ID` are allocated for
/// the task, a `PageServerTask` entry is inserted into `TASKS`, and `future` is spawned
/// wrapped in `task_wrapper`. The returned id is the one under which the task is
/// registered.
///
/// `kind`, `tenant_shard_id`, `timeline_id` and `name` are only recorded in the
/// registry entry; they do not influence how the future is run.
///
/// NOTE(review): an earlier version of this comment referred to a
/// `shutdown_process_on_error` flag. This function takes no such parameter.
// NOTE(review): the generic parameter list and several type arguments (`Option`, the
// `Future` bound) appear to have been stripped when this text was extracted - confirm
// against the real file.
pub fn spawn(
    runtime: &tokio::runtime::Handle,
    kind: TaskKind,
    tenant_shard_id: TenantShardId,
    timeline_id: Option,
    name: &str,
    future: F,
) -> PageserverTaskId
where
    F: Future> + Send + 'static,
{
    let cancel = CancellationToken::new();
    let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed);
    let task = Arc::new(PageServerTask {
        task_id: PageserverTaskId(task_id),
        kind,
        name: name.to_string(),
        cancel: cancel.clone(),
        tenant_shard_id,
        timeline_id,
        mutable: Mutex::new(MutableTaskState { join_handle: None }),
    });

    // Register before spawning, so the entry exists by the time the wrapper runs.
    TASKS.lock().unwrap().insert(task_id, Arc::clone(&task));

    // Hold the per-task lock across `runtime.spawn` until the join handle is stored:
    // anyone who locks `mutable` afterwards sees the handle rather than `None`.
    let mut task_mut = task.mutable.lock().unwrap();

    let task_name = name.to_string();
    let task_cloned = Arc::clone(&task);
    let join_handle = runtime.spawn(task_wrapper(
        task_name,
        task_id,
        task_cloned,
        cancel,
        future,
    ));
    task_mut.join_handle = Some(join_handle);
    drop(task_mut);

    // The task is now running. Nothing more to do here
    PageserverTaskId(task_id)
}
async fn task_wrapper(
    task_name: String,
    task_id: u64,
    task: Arc,
    shutdown_token: CancellationToken,
    future: F,
) where
    F: Future> + Send + 'static,
{
    // Runs `future` to completion with the `CURRENT_TASK` and `SHUTDOWN_TOKEN`
    // task-locals set, logs how it ended, and finally removes `task_id` from `TASKS`.
    //
    // NOTE(review): the generic parameter list and the type arguments of `Arc` and of
    // the `Future` bound appear to have been stripped when this text was extracted -
    // confirm against the real file.
    debug!("Starting task '{}'", task_name);

    // wrap the future so we log panics and errors
    // (the ids are copied out first because `task` itself is moved into the
    // `CURRENT_TASK` scope below)
    let tenant_shard_id = task.tenant_shard_id;
    let timeline_id = task.timeline_id;
    let fut = async move {
        // We use AssertUnwindSafe here so that the payload function
        // doesn't need to be UnwindSafe. We don't do anything after the
        // unwinding that would expose us to unwind-unsafe behavior.
        let result = AssertUnwindSafe(future).catch_unwind().await;
        match result {
            // Outer `Ok`: no panic. Inner `Ok(())`: the payload itself succeeded.
            Ok(Ok(())) => {
                debug!("Task '{}' exited normally", task_name);
            }
            // The payload returned an error: it is logged, not propagated.
            Ok(Err(err)) => {
                error!(
                    "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
                    task_name, tenant_shard_id, timeline_id, err
                );
            }
            // The payload panicked; `catch_unwind` handed us the panic payload.
            Err(err) => {
                error!(
                    "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
                    task_name, tenant_shard_id, timeline_id, err
                );
            }
        }
    };

    // add the task-locals
    let fut = CURRENT_TASK.scope(task, fut);
    let fut = SHUTDOWN_TOKEN.scope(shutdown_token, fut);

    // poll future to completion
    fut.await;

    // Remove our entry from the global hashmap.
    // Panics if the entry is already gone.
    TASKS
        .lock()
        .unwrap()
        .remove(&task_id)
        .expect("no task in registry");
}
/// Signal and wait for tasks to shut down.
///
/// The arguments select the tasks to stop; any `None` argument is ignored (it matches
/// every task). For example, to shut down all WAL receiver manager tasks:
///
/// `shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None)`
///
/// Or to shut down all tasks for a given timeline:
///
/// `shutdown_tasks(None, Some(tenant_shard_id), Some(timeline_id))`
///
/// The function works in two phases:
/// 1. With the registry lock held, every matching task's cancellation token is
///    cancelled and the task is remembered as a victim.
/// 2. With the registry lock released, each victim's join handle is taken and awaited.
///    If a task needs longer than `INITIAL_COMPLAIN_TIMEOUT`, an info line is logged,
///    then another one every `PERIODIC_COMPLAIN_TIMEOUT` until the task finishes.
///
/// When all three arguments are `None`, every task that still has a join handle is
/// additionally reported with a "stopping left-over" warning.
// NOTE(review): the type arguments of the three `Option` parameters appear to have been
// stripped when this text was extracted - confirm against the real file.
pub async fn shutdown_tasks(
    kind: Option,
    tenant_shard_id: Option,
    timeline_id: Option,
) {
    let mut victim_tasks = Vec::new();

    {
        // Phase 1: the lock guard lives only inside this block, so it is not held
        // across any of the `.await`s below.
        let tasks = TASKS.lock().unwrap();
        for task in tasks.values() {
            if (kind.is_none() || Some(task.kind) == kind)
                && (tenant_shard_id.is_none() || Some(task.tenant_shard_id) == tenant_shard_id)
                && (timeline_id.is_none() || task.timeline_id == timeline_id)
            {
                task.cancel.cancel();
                victim_tasks.push((
                    Arc::clone(task),
                    task.kind,
                    task.tenant_shard_id,
                    task.timeline_id,
                ));
            }
        }
    }

    let log_all = kind.is_none() && tenant_shard_id.is_none() && timeline_id.is_none();

    // Phase 2: wait for the victims one after another.
    for (task, task_kind, tenant_shard_id, timeline_id) in victim_tasks {
        // `take()` leaves `None` behind, so a concurrent `shutdown_tasks` call will not
        // await the same handle a second time.
        let join_handle = {
            let mut task_mut = task.mutable.lock().unwrap();
            task_mut.join_handle.take()
        };
        if let Some(mut join_handle) = join_handle {
            if log_all {
                // warn to catch these in tests; there shouldn't be any
                warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
            }
            const INITIAL_COMPLAIN_TIMEOUT: Duration = Duration::from_secs(1);
            const PERIODIC_COMPLAIN_TIMEOUT: Duration = Duration::from_secs(60);
            // The handle is polled by reference so it can be polled again in the loop
            // below if the first wait times out.
            if tokio::time::timeout(INITIAL_COMPLAIN_TIMEOUT, &mut join_handle)
                .await
                .is_err()
            {
                // allow some time to elapse before logging to cut down the number of log
                // lines.
                info!("waiting for task {} to shut down", task.name);
                loop {
                    tokio::select! {
                        // we never handled this return value, but:
                        // - we don't deschedule which would lead to is_cancelled
                        // - panics are already logged (is_panicked)
                        // - task errors are already logged in the wrapper
                        _ = &mut join_handle => break,
                        _ = tokio::time::sleep(PERIODIC_COMPLAIN_TIMEOUT) => info!("still waiting for task {} to shut down", task.name),
                    }
                }
                info!("task {} completed", task.name);
            }
        } else {
            // Possibly one of:
            //  * The task had not even fully started yet.
            //  * It was shut down concurrently and already exited
        }
    }
}
pub fn is_shutdown_requested() -> bool { if let Ok(true_or_false) = SHUTDOWN_TOKEN.try_with(|t| t.is_cancelled()) { true_or_false } else { if !cfg!(test) { warn!("is_shutdown_requested() called in an unexpected task or thread"); } false } } ================================================ FILE: pageserver/src/tenant/blob_io.rs ================================================ //! //! Functions for reading and writing variable-sized "blobs". //! //! Each blob begins with a 1- or 4-byte length field, followed by the //! actual data. If the length is smaller than 128 bytes, the length //! is written as a one byte. If it's larger than that, the length //! is written as a four-byte integer, in big-endian, with the high //! bit set. This way, we can detect whether it's 1- or 4-byte header //! by peeking at the first byte. For blobs larger than 128 bits, //! we also specify three reserved bits, only one of the three bit //! patterns is currently in use (0b011) and signifies compression //! with zstd. //! //! len < 128: 0XXXXXXX //! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! use std::cmp::min; use anyhow::Context; use async_compression::Level; use bytes::{BufMut, BytesMut}; use pageserver_api::models::ImageCompressionAlgorithm; use tokio::io::AsyncWriteExt; use tokio_epoll_uring::IoBuf; use tokio_util::sync::CancellationToken; use tracing::warn; use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; use crate::tenant::block_io::BlockCursor; use crate::virtual_file::IoBufferMut; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::owned_buffers_io::write::{BufferedWriter, FlushTaskError}; use crate::virtual_file::owned_buffers_io::write::{BufferedWriterShutdownMode, OwnedAsyncWriter}; #[derive(Copy, Clone, Debug)] pub struct CompressionInfo { pub written_compressed: bool, pub compressed_size: Option, } /// A blob header, with header+data length and compression info. 
/// /// TODO: use this more widely, and add an encode() method too. /// TODO: document the header format. #[derive(Clone, Copy, Default)] pub struct Header { pub header_len: usize, pub data_len: usize, pub compression_bits: u8, } impl Header { /// Decodes a header from a byte slice. pub fn decode(bytes: &[u8]) -> anyhow::Result { let Some(&first_header_byte) = bytes.first() else { anyhow::bail!("zero-length blob header"); }; // If the first bit is 0, this is just a 1-byte length prefix up to 128 bytes. if first_header_byte < 0x80 { return Ok(Self { header_len: 1, // by definition data_len: first_header_byte as usize, compression_bits: BYTE_UNCOMPRESSED, }); } // Otherwise, this is a 4-byte header containing compression information and length. const HEADER_LEN: usize = 4; let mut header_buf: [u8; HEADER_LEN] = bytes[0..HEADER_LEN] .try_into() .map_err(|_| anyhow::anyhow!("blob header too short: {bytes:?}"))?; // TODO: verify the compression bits and convert to an enum. let compression_bits = header_buf[0] & LEN_COMPRESSION_BIT_MASK; header_buf[0] &= !LEN_COMPRESSION_BIT_MASK; let data_len = u32::from_be_bytes(header_buf) as usize; Ok(Self { header_len: HEADER_LEN, data_len, compression_bits, }) } /// Returns the total header+data length. pub fn total_len(&self) -> usize { self.header_len + self.data_len } } #[derive(Debug, thiserror::Error)] pub enum WriteBlobError { #[error(transparent)] Flush(FlushTaskError), #[error(transparent)] Other(anyhow::Error), } impl WriteBlobError { pub fn is_cancel(&self) -> bool { match self { WriteBlobError::Flush(e) => e.is_cancel(), WriteBlobError::Other(_) => false, } } pub fn into_anyhow(self) -> anyhow::Error { match self { WriteBlobError::Flush(e) => e.into_anyhow(), WriteBlobError::Other(e) => e, } } } impl BlockCursor<'_> { /// Read a blob into a new buffer. 
pub async fn read_blob( &self, offset: u64, ctx: &RequestContext, ) -> Result, std::io::Error> { let mut buf = Vec::new(); self.read_blob_into_buf(offset, &mut buf, ctx).await?; Ok(buf) } /// Read blob into the given buffer. Any previous contents in the buffer /// are overwritten. pub async fn read_blob_into_buf( &self, offset: u64, dstbuf: &mut Vec, ctx: &RequestContext, ) -> Result<(), std::io::Error> { let mut blknum = (offset / PAGE_SZ as u64) as u32; let mut off = (offset % PAGE_SZ as u64) as usize; let mut buf = self.read_blk(blknum, ctx).await?; // peek at the first byte, to determine if it's a 1- or 4-byte length let first_len_byte = buf[off]; let len: usize = if first_len_byte < 0x80 { // 1-byte length header off += 1; first_len_byte as usize } else { // 4-byte length header let mut len_buf = [0u8; 4]; let thislen = PAGE_SZ - off; if thislen < 4 { // it is split across two pages len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]); blknum += 1; buf = self.read_blk(blknum, ctx).await?; len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]); off = 4 - thislen; } else { len_buf.copy_from_slice(&buf[off..off + 4]); off += 4; } let bit_mask = if self.read_compressed { !LEN_COMPRESSION_BIT_MASK } else { 0x7f }; len_buf[0] &= bit_mask; u32::from_be_bytes(len_buf) as usize }; let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK; let mut tmp_buf = Vec::new(); let buf_to_write; let compression = if compression_bits <= BYTE_UNCOMPRESSED || !self.read_compressed { if compression_bits > BYTE_UNCOMPRESSED { warn!("reading key above future limit ({len} bytes)"); } buf_to_write = dstbuf; None } else if compression_bits == BYTE_ZSTD { buf_to_write = &mut tmp_buf; Some(dstbuf) } else { let error = std::io::Error::new( std::io::ErrorKind::InvalidData, format!("invalid compression byte {compression_bits:x}"), ); return Err(error); }; buf_to_write.clear(); buf_to_write.reserve(len); // Read the payload let mut remain = len; while remain > 0 { let mut 
page_remain = PAGE_SZ - off; if page_remain == 0 { // continue on next page blknum += 1; buf = self.read_blk(blknum, ctx).await?; off = 0; page_remain = PAGE_SZ; } let this_blk_len = min(remain, page_remain); buf_to_write.extend_from_slice(&buf[off..off + this_blk_len]); remain -= this_blk_len; off += this_blk_len; } if let Some(dstbuf) = compression { if compression_bits == BYTE_ZSTD { let mut decoder = async_compression::tokio::write::ZstdDecoder::new(dstbuf); decoder.write_all(buf_to_write).await?; decoder.flush().await?; } else { unreachable!("already checked above") } } Ok(()) } } /// Reserved bits for length and compression pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; /// The maximum size of blobs we support. The highest few bits /// are reserved for compression and other further uses. pub(crate) const MAX_SUPPORTED_BLOB_LEN: usize = 0x0fff_ffff; pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80; pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10; /// A wrapper of `VirtualFile` that allows users to write blobs. pub struct BlobWriter { /// We do tiny writes for the length headers; they need to be in an owned buffer; io_buf: Option, writer: BufferedWriter, offset: u64, } impl BlobWriter where W: OwnedAsyncWriter + std::fmt::Debug + Send + Sync + 'static, { /// See [`BufferedWriter`] struct-level doc comment for semantics of `start_offset`. pub fn new( file: W, start_offset: u64, gate: &utils::sync::gate::Gate, cancel: CancellationToken, ctx: &RequestContext, flush_task_span: tracing::Span, ) -> anyhow::Result { Ok(Self { io_buf: Some(BytesMut::new()), writer: BufferedWriter::new( file, start_offset, || IoBufferMut::with_capacity(Self::CAPACITY), gate.enter()?, cancel, ctx, flush_task_span, ), offset: start_offset, }) } pub fn size(&self) -> u64 { self.offset } const CAPACITY: usize = 64 * 1024; /// Writes `src_buf` to the file at the current offset. 
// NOTE(review): the generic parameter list of the methods below (and the type argument
// of `FullSlice` / some `Result`s) appears to be missing in this copy of the file;
// TODO confirm against the upstream source before relying on these signatures.
    async fn write_all(
        &mut self,
        src_buf: FullSlice,
        ctx: &RequestContext,
    ) -> (FullSlice, Result<(), FlushTaskError>) {
        let res = self
            .writer
            // TODO: why are we taking a FullSlice if we're going to pass a borrow downstack?
            // Can remove all the complexity around owned buffers upstack
            .write_buffered_borrowed(&src_buf, ctx)
            .await
            .map(|len| {
                // Advance the logical offset only when the buffered write succeeded.
                self.offset += len as u64;
            });

        // The buffer is always handed back to the caller, success or not.
        (src_buf, res)
    }

    /// Write a blob of data. Returns the offset that it was written to,
    /// which can be used to retrieve the data later.
    ///
    /// Equivalent to [`Self::write_blob_maybe_compressed`] with compression disabled,
    /// discarding the compression info.
    pub async fn write_blob(
        &mut self,
        srcbuf: FullSlice,
        ctx: &RequestContext,
    ) -> (FullSlice, Result) {
        let (buf, res) = self
            .write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
            .await;
        (buf, res.map(|(off, _compression_info)| off))
    }

    /// Write a blob of data. Returns the offset that it was written to,
    /// which can be used to retrieve the data later.
    ///
    /// Blobs shorter than 128 bytes get a 1-byte length header and are never
    /// compressed. Longer blobs get a 4-byte big-endian header whose high bits
    /// hold the compression tag; the compressed form is only written when it is
    /// strictly smaller than the input. Blobs larger than
    /// `MAX_SUPPORTED_BLOB_LEN` are rejected with `WriteBlobError::Other`.
    ///
    /// `srcbuf` is returned to the caller in all cases.
    pub(crate) async fn write_blob_maybe_compressed(
        &mut self,
        srcbuf: FullSlice,
        ctx: &RequestContext,
        algorithm: ImageCompressionAlgorithm,
    ) -> (
        FullSlice,
        Result<(u64, CompressionInfo), WriteBlobError>,
    ) {
        // The blob's address is the offset of its header.
        let offset = self.offset;
        let mut compression_info = CompressionInfo {
            written_compressed: false,
            compressed_size: None,
        };

        let len = srcbuf.len();

        let mut io_buf = self.io_buf.take().expect("we always put it back below");
        io_buf.clear();
        let mut compressed_buf = None;
        // Header phase: builds and writes the length header, and (for zstd) produces
        // `compressed_buf` as a side effect.
        let ((io_buf_slice, hdr_res), srcbuf) = async {
            if len < 128 {
                // Short blob. Write a 1-byte length header
                io_buf.put_u8(len as u8);
                let (slice, res) = self.write_all(io_buf.slice_len(), ctx).await;
                let res = res.map_err(WriteBlobError::Flush);
                ((slice, res), srcbuf)
            } else {
                // Write a 4-byte length header
                if len > MAX_SUPPORTED_BLOB_LEN {
                    return (
                        (
                            io_buf.slice_len(),
                            Err(WriteBlobError::Other(anyhow::anyhow!(
                                "blob too large ({len} bytes)"
                            ))),
                        ),
                        srcbuf,
                    );
                }
                let (high_bit_mask, len_written, srcbuf) = match algorithm {
                    ImageCompressionAlgorithm::Zstd { level } => {
                        let mut encoder = if let Some(level) = level {
                            async_compression::tokio::write::ZstdEncoder::with_quality(
                                Vec::new(),
                                Level::Precise(level.into()),
                            )
                        } else {
                            async_compression::tokio::write::ZstdEncoder::new(Vec::new())
                        };
                        encoder.write_all(&srcbuf[..]).await.unwrap();
                        encoder.shutdown().await.unwrap();
                        let compressed = encoder.into_inner();
                        // Recorded even if we end up storing the uncompressed form.
                        compression_info.compressed_size = Some(compressed.len());
                        if compressed.len() < len {
                            compression_info.written_compressed = true;
                            let compressed_len = compressed.len();
                            compressed_buf = Some(compressed);
                            (BYTE_ZSTD, compressed_len, srcbuf)
                        } else {
                            (BYTE_UNCOMPRESSED, len, srcbuf)
                        }
                    }
                    ImageCompressionAlgorithm::Disabled => (BYTE_UNCOMPRESSED, len, srcbuf),
                };
                let mut len_buf = (len_written as u32).to_be_bytes();
                // The top four bits must be free for the tag.
                assert_eq!(len_buf[0] & 0xf0, 0);
                len_buf[0] |= high_bit_mask;
                io_buf.extend_from_slice(&len_buf[..]);
                let (slice, res) = self.write_all(io_buf.slice_len(), ctx).await;
                let res = res.map_err(WriteBlobError::Flush);
                ((slice, res), srcbuf)
            }
        }
        .await;
        // Put the header scratch buffer back for the next call.
        self.io_buf = Some(io_buf_slice.into_raw_slice().into_inner());
        match hdr_res {
            Ok(_) => (),
            Err(e) => return (srcbuf, Err(e)),
        }
        // Payload phase: write the compressed bytes if we have them, else the input.
        let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf {
            let (_buf, res) = self.write_all(compressed_buf.slice_len(), ctx).await;
            (srcbuf, res)
        } else {
            self.write_all(srcbuf, ctx).await
        };
        let res = res.map_err(WriteBlobError::Flush);
        (srcbuf, res.map(|_| (offset, compression_info)))
    }

    /// Writes a raw blob containing both header and data,
/// returning its offset.
    ///
    /// The buffer is rejected with `WriteBlobError::Other` if its header cannot be
    /// decoded or if the header's total length differs from the buffer length.
    // NOTE(review): the generic parameter list of this method (and the type arguments
    // of `FullSlice` / `Result`) appears to be missing in this copy of the file;
    // TODO confirm against the upstream source.
    pub(crate) async fn write_blob_raw(
        &mut self,
        raw_with_header: FullSlice,
        ctx: &RequestContext,
    ) -> (FullSlice, Result) {
        // Verify the header, to ensure we don't write invalid/corrupt data.
        let header = match Header::decode(&raw_with_header)
            .context("decoding blob header")
            .map_err(WriteBlobError::Other)
        {
            Ok(header) => header,
            Err(err) => return (raw_with_header, Err(err)),
        };
        if raw_with_header.len() != header.total_len() {
            let header_total_len = header.total_len();
            let raw_len = raw_with_header.len();
            return (
                raw_with_header,
                Err(WriteBlobError::Other(anyhow::anyhow!(
                    "header length mismatch: {header_total_len} != {raw_len}"
                ))),
            );
        }

        // Capture the offset before writing: it is the address of the blob.
        let offset = self.offset;
        let (raw_with_header, result) = self.write_all(raw_with_header, ctx).await;
        let result = result.map_err(WriteBlobError::Flush);
        (raw_with_header, result.map(|_| offset))
    }

    /// Finish this blob writer and return the underlying `W`.
    pub async fn shutdown(
        self,
        mode: BufferedWriterShutdownMode,
        ctx: &RequestContext,
    ) -> Result {
        let (_, file) = self.writer.shutdown(mode, ctx).await?;
        Ok(file)
    }
}

#[cfg(test)]
pub(crate) mod tests {
    use camino::Utf8PathBuf;
    use camino_tempfile::Utf8TempDir;
    use rand::{Rng, SeedableRng};
    use tracing::info_span;

    use super::*;
    use crate::context::DownloadBehavior;
    use crate::task_mgr::TaskKind;
    use crate::tenant::block_io::BlockReaderRef;
    use crate::virtual_file;
    use crate::virtual_file::TempVirtualFile;
    use crate::virtual_file::VirtualFile;

    /// Round-trips `blobs` through a writer and reader with compression disabled.
    async fn round_trip_test(blobs: &[Vec]) -> anyhow::Result<()> {
        round_trip_test_compressed(blobs, false).await
    }

    /// Writes `blobs` to a fresh temporary file, optionally with zstd level 1, and
    /// returns the temp dir (to keep it alive), the file path and the blob offsets.
    pub(crate) async fn write_maybe_compressed(
        blobs: &[Vec],
        compression: bool,
        ctx: &RequestContext,
    ) -> anyhow::Result<(Utf8TempDir, Utf8PathBuf, Vec)> {
        let temp_dir = camino_tempfile::tempdir()?;
        let pathbuf = temp_dir.path().join("file");
        let gate = utils::sync::gate::Gate::default();
        let cancel = CancellationToken::new();

        // Write part (in block to drop the file)
        let mut offsets = Vec::new();
        {
            let file = TempVirtualFile::new(
                VirtualFile::open_with_options_v2(
                    pathbuf.as_path(),
                    virtual_file::OpenOptions::new()
                        .create_new(true)
                        .write(true),
                    ctx,
                )
                .await?,
                gate.enter()?,
            );
            let mut wtr =
                BlobWriter::new(file, 0, &gate, cancel.clone(), ctx, info_span!("test")).unwrap();
            for blob in blobs.iter() {
                let (_, res) = if compression {
                    let res = wtr
                        .write_blob_maybe_compressed(
                            blob.clone().slice_len(),
                            ctx,
                            ImageCompressionAlgorithm::Zstd { level: Some(1) },
                        )
                        .await;
                    (res.0, res.1.map(|(off, _)| off))
                } else {
                    wtr.write_blob(blob.clone().slice_len(), ctx).await
                };
                let offs = res?;
                offsets.push(offs);
            }
            // Pad to a page multiple so the reader can fetch whole pages.
            let file = wtr
                .shutdown(
                    BufferedWriterShutdownMode::ZeroPadToNextMultiple(PAGE_SZ),
                    ctx,
                )
                .await?;
            file.disarm_into_inner()
        };
        Ok((temp_dir, pathbuf, offsets))
    }

    /// Writes `blobs`, reads each one back at its recorded offset and asserts equality.
    async fn round_trip_test_compressed(
        blobs: &[Vec],
        compression: bool,
    ) -> anyhow::Result<()> {
        let ctx =
            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
        let (_temp_dir, pathbuf, offsets) =
            write_maybe_compressed(blobs, compression, &ctx).await?;

        println!("Done writing!");
        let file = VirtualFile::open_v2(pathbuf, &ctx).await?;
        let rdr = BlockReaderRef::VirtualFile(&file);
        let rdr = BlockCursor::new_with_compression(rdr, compression);
        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
            let blob_read = rdr.read_blob(*offset, &ctx).await?;
            assert_eq!(
                blob, &blob_read,
                "mismatch for idx={idx} at offset={offset}"
            );
        }
        Ok(())
    }

    /// Returns `len` random values drawn from the thread-local RNG.
    pub(crate) fn random_array(len: usize) -> Vec {
        let mut rng = rand::rng();
        (0..len).map(|_| rng.random()).collect::<_>()
    }

    #[tokio::test]
    async fn test_one() -> anyhow::Result<()> {
        let blobs = &[vec![12, 21, 22]];
        round_trip_test(blobs).await?;
        Ok(())
    }

    #[tokio::test]
    async fn test_hello_simple() -> anyhow::Result<()> {
        let blobs = &[
            vec![0, 1, 2, 3],
            b"Hello, World!".to_vec(),
            Vec::new(),
            b"foobar".to_vec(),
        ];
        round_trip_test(blobs).await?;
        round_trip_test_compressed(blobs, true).await?;
        Ok(())
    }

    #[tokio::test]
async fn test_really_big_array() -> anyhow::Result<()> { let blobs = &[ b"test".to_vec(), random_array(10 * PAGE_SZ), b"hello".to_vec(), random_array(66 * PAGE_SZ), vec![0xf3; 24 * PAGE_SZ], b"foobar".to_vec(), ]; round_trip_test(blobs).await?; round_trip_test_compressed(blobs, true).await?; Ok(()) } #[tokio::test] async fn test_arrays_inc() -> anyhow::Result<()> { let blobs = (0..PAGE_SZ / 8) .map(|v| random_array(v * 16)) .collect::>(); round_trip_test(&blobs).await?; Ok(()) } #[tokio::test] async fn test_arrays_random_size() -> anyhow::Result<()> { let mut rng = rand::rngs::StdRng::seed_from_u64(42); let blobs = (0..1024) .map(|_| { let mut sz: u16 = rng.random(); // Make 50% of the arrays small if rng.random() { sz &= 63; } random_array(sz.into()) }) .collect::>(); round_trip_test(&blobs).await?; Ok(()) } #[tokio::test] async fn test_arrays_page_boundary() -> anyhow::Result<()> { let blobs = &[ random_array(PAGE_SZ - 4), random_array(PAGE_SZ - 4), random_array(PAGE_SZ - 4), ]; round_trip_test(blobs).await?; Ok(()) } } ================================================ FILE: pageserver/src/tenant/block_io.rs ================================================ //! //! Low-level Block-oriented I/O functions //! use std::ops::Deref; use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; use crate::page_cache::{self, FileId, PAGE_SZ, PageReadGuard, PageWriteGuard, ReadBufResult}; #[cfg(test)] use crate::virtual_file::IoBufferMut; use crate::virtual_file::{IoBuffer, VirtualFile}; /// This is implemented by anything that can read 8 kB (PAGE_SZ) /// blocks, using the page cache /// /// There are currently two implementations: EphemeralFile, and FileBlockReader /// below. pub trait BlockReader { /// /// Create a new "cursor" for reading from this reader. /// /// A cursor caches the last accessed page, allowing for faster /// access if the same block is accessed repeatedly. 
fn block_cursor(&self) -> BlockCursor<'_>; } impl BlockReader for &B where B: BlockReader, { fn block_cursor(&self) -> BlockCursor<'_> { (*self).block_cursor() } } /// Reference to an in-memory copy of an immutable on-disk block. pub enum BlockLease<'a> { PageReadGuard(PageReadGuard<'static>), EphemeralFileMutableTail(&'a [u8; PAGE_SZ]), Slice(&'a [u8; PAGE_SZ]), #[cfg(test)] Arc(std::sync::Arc<[u8; PAGE_SZ]>), #[cfg(test)] IoBufferMut(IoBufferMut), } impl From> for BlockLease<'static> { fn from(value: PageReadGuard<'static>) -> BlockLease<'static> { BlockLease::PageReadGuard(value) } } #[cfg(test)] impl From> for BlockLease<'_> { fn from(value: std::sync::Arc<[u8; PAGE_SZ]>) -> Self { BlockLease::Arc(value) } } impl Deref for BlockLease<'_> { type Target = [u8; PAGE_SZ]; fn deref(&self) -> &Self::Target { match self { BlockLease::PageReadGuard(v) => v.deref(), BlockLease::EphemeralFileMutableTail(v) => v, BlockLease::Slice(v) => v, #[cfg(test)] BlockLease::Arc(v) => v.deref(), #[cfg(test)] BlockLease::IoBufferMut(v) => { TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ") } } } } /// Provides the ability to read blocks from different sources, /// similar to using traits for this purpose. /// /// Unlike traits, we also support the read function to be async though. 
pub(crate) enum BlockReaderRef<'a> { FileBlockReader(&'a FileBlockReader<'a>), Adapter(Adapter<&'a DeltaLayerInner>), #[cfg(test)] TestDisk(&'a super::disk_btree::tests::TestDisk), #[cfg(test)] VirtualFile(&'a VirtualFile), } impl BlockReaderRef<'_> { #[inline(always)] async fn read_blk( &self, blknum: u32, ctx: &RequestContext, ) -> Result { use BlockReaderRef::*; match self { FileBlockReader(r) => r.read_blk(blknum, ctx).await, Adapter(r) => r.read_blk(blknum, ctx).await, #[cfg(test)] TestDisk(r) => r.read_blk(blknum), #[cfg(test)] VirtualFile(r) => r.read_blk(blknum, ctx).await, } } } /// /// A "cursor" for efficiently reading multiple pages from a BlockReader /// /// You can access the last page with `*cursor`. 'read_blk' returns 'self', so /// that in many cases you can use a BlockCursor as a drop-in replacement for /// the underlying BlockReader. For example: /// /// ```no_run /// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader}; /// # use pageserver::context::RequestContext; /// # let reader: FileBlockReader = unimplemented!("stub"); /// # let ctx: RequestContext = unimplemented!("stub"); /// let cursor = reader.block_cursor(); /// let buf = cursor.read_blk(1, &ctx); /// // do stuff with 'buf' /// let buf = cursor.read_blk(2, &ctx); /// // do stuff with 'buf' /// ``` /// pub struct BlockCursor<'a> { pub(super) read_compressed: bool, reader: BlockReaderRef<'a>, } impl<'a> BlockCursor<'a> { pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self { Self::new_with_compression(reader, false) } pub(crate) fn new_with_compression(reader: BlockReaderRef<'a>, read_compressed: bool) -> Self { BlockCursor { read_compressed, reader, } } // Needed by cli pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self { BlockCursor { read_compressed: false, reader: BlockReaderRef::FileBlockReader(reader), } } /// Read a block. /// /// Returns a "lease" object that can be used to /// access to the contents of the page. 
(For the page cache, the /// lease object represents a lock on the buffer.) #[inline(always)] pub async fn read_blk( &self, blknum: u32, ctx: &RequestContext, ) -> Result { self.reader.read_blk(blknum, ctx).await } } /// An adapter for reading a (virtual) file using the page cache. /// /// The file is assumed to be immutable. This doesn't provide any functions /// for modifying the file, nor for invalidating the cache if it is modified. #[derive(Clone)] pub struct FileBlockReader<'a> { pub file: &'a VirtualFile, /// Unique ID of this file, used as key in the page cache. file_id: page_cache::FileId, compressed_reads: bool, } impl<'a> FileBlockReader<'a> { pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self { FileBlockReader { file_id, file, compressed_reads: true, } } /// Read a page from the underlying file into given buffer. async fn fill_buffer( &self, buf: PageWriteGuard<'static>, blkno: u32, ctx: &RequestContext, ) -> Result, std::io::Error> { assert!(buf.len() == PAGE_SZ); self.file .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64, ctx) .await } /// Read a block. /// /// Returns a "lease" object that can be used to /// access to the contents of the page. (For the page cache, the /// lease object represents a lock on the buffer.) pub async fn read_blk<'b>( &self, blknum: u32, ctx: &RequestContext, ) -> Result, std::io::Error> { let cache = page_cache::get(); match cache .read_immutable_buf(self.file_id, blknum, ctx) .await .map_err(|e| std::io::Error::other(format!("Failed to read immutable buf: {e:#}")))? 
{ ReadBufResult::Found(guard) => Ok(guard.into()), ReadBufResult::NotFound(write_guard) => { // Read the page from disk into the buffer let write_guard = self.fill_buffer(write_guard, blknum, ctx).await?; Ok(write_guard.mark_valid().into()) } } } } impl BlockReader for FileBlockReader<'_> { fn block_cursor(&self) -> BlockCursor<'_> { BlockCursor::new_with_compression( BlockReaderRef::FileBlockReader(self), self.compressed_reads, ) } } /// /// Trait for block-oriented output /// pub trait BlockWriter { /// /// Write a page to the underlying storage. /// /// 'buf' must be of size PAGE_SZ. Returns the block number the page was /// written to. /// fn write_blk(&mut self, buf: IoBuffer) -> Result; } /// /// A simple in-memory buffer of blocks. /// pub struct BlockBuf { pub blocks: Vec, } impl BlockWriter for BlockBuf { fn write_blk(&mut self, buf: IoBuffer) -> Result { assert!(buf.len() == PAGE_SZ); let blknum = self.blocks.len(); self.blocks.push(buf); Ok(blknum as u32) } } impl BlockBuf { pub fn new() -> Self { BlockBuf { blocks: Vec::new() } } pub fn size(&self) -> u64 { (self.blocks.len() * PAGE_SZ) as u64 } } impl Default for BlockBuf { fn default() -> Self { Self::new() } } ================================================ FILE: pageserver/src/tenant/checks.rs ================================================ use std::collections::BTreeSet; use itertools::Itertools; use pageserver_compaction::helpers::overlaps_with; use super::storage_layer::LayerName; /// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong). /// /// The function implements a fast path check and a slow path check. /// /// The fast path checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, /// /// ```plain /// | | | | /// | 1 | | 2 | | 3 | /// | | | | | | /// ``` /// /// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 
1 and 2 should have /// the same LSN range. /// /// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example, /// /// ```plain /// | | | 2 | | | /// | 1 | |-------| | 3 | /// | | | 4 | | | /// /// If layer 2 and 4 contain the same single key, this is also a valid layer map. /// /// However, if a partial compaction is still going on, it is possible that we get a layer map not satisfying the above condition. /// Therefore, we fallback to simply check if any of the two delta layers overlap. (See "A slow path...") pub fn check_valid_layermap(metadata: &[LayerName]) -> Option { let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) let mut all_delta_layers = Vec::new(); for name in metadata { if let LayerName::Delta(layer) = name { all_delta_layers.push(layer.clone()); } } for layer in &all_delta_layers { if layer.key_range.start.next() != layer.key_range.end { let lsn_range = &layer.lsn_range; lsn_split_point.insert(lsn_range.start); lsn_split_point.insert(lsn_range.end); } } for (idx, layer) in all_delta_layers.iter().enumerate() { if layer.key_range.start.next() == layer.key_range.end { continue; } let lsn_range = layer.lsn_range.clone(); let intersects = lsn_split_point.range(lsn_range).collect_vec(); if intersects.len() > 1 { // A slow path to check if the layer intersects with any other delta layer. for (other_idx, other_layer) in all_delta_layers.iter().enumerate() { if other_idx == idx { // do not check self intersects with self continue; } if overlaps_with(&layer.lsn_range, &other_layer.lsn_range) && overlaps_with(&layer.key_range, &other_layer.key_range) { let err = format!( "layer violates the layer map LSN split assumption: layer {layer} intersects with layer {other_layer}" ); return Some(err); } } } } None } ================================================ FILE: pageserver/src/tenant/config.rs ================================================ //! 
Functions for handling per-tenant configuration options //! //! If tenant is created with --config option, //! the tenant-specific config will be stored in tenant's directory. //! Otherwise, global pageserver's config is used. //! //! If the tenant config file is corrupted, the tenant will be disabled. //! We cannot use global or default config instead, because wrong settings //! may lead to a data loss. //! use pageserver_api::models; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::{Deserialize, Serialize}; use utils::critical; use utils::generation::Generation; #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) enum AttachmentMode { /// Our generation is current as far as we know, and as far as we know we are the only attached /// pageserver. This is the "normal" attachment mode. Single, /// Our generation number is current as far as we know, but we are advised that another /// pageserver is still attached, and therefore to avoid executing deletions. This is /// the attachment mode of a pagesever that is the destination of a migration. Multi, /// Our generation number is superseded, or about to be superseded. We are advised /// to avoid remote storage writes if possible, and to avoid sending billing data. This /// is the attachment mode of a pageserver that is the origin of a migration. Stale, } #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) struct AttachedLocationConfig { pub(crate) generation: Generation, pub(crate) attach_mode: AttachmentMode, // TODO: add a flag to override AttachmentMode's policies under // disk pressure (i.e. 
unblock uploads under disk pressure in Stale // state, unblock deletions after timeout in Multi state) } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) struct SecondaryLocationConfig { /// If true, keep the local cache warm by polling remote storage pub(crate) warm: bool, } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) enum LocationMode { Attached(AttachedLocationConfig), Secondary(SecondaryLocationConfig), } /// Per-tenant, per-pageserver configuration. All pageservers use the same TenantConf, /// but have distinct LocationConf. #[derive(Clone, PartialEq, Eq, Serialize, Deserialize)] pub(crate) struct LocationConf { /// The location-specific part of the configuration, describes the operating /// mode of this pageserver for this tenant. pub(crate) mode: LocationMode, /// The detailed shard identity. This structure is already scoped within /// a TenantShardId, but we need the full ShardIdentity to enable calculating /// key->shard mappings. /// /// NB: we store this even for unsharded tenants, so that we agree with storcon on the intended /// stripe size. Otherwise, a split request that does not specify a stripe size may use a /// different default than storcon, which can lead to incorrect stripe sizes and corruption. pub(crate) shard: ShardIdentity, /// The pan-cluster tenant configuration, the same on all locations pub(crate) tenant_conf: pageserver_api::models::TenantConfig, } impl std::fmt::Debug for LocationConf { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match &self.mode { LocationMode::Attached(conf) => { write!( f, "Attached {:?}, gen={:?}", conf.attach_mode, conf.generation ) } LocationMode::Secondary(conf) => { write!(f, "Secondary, warm={}", conf.warm) } } } } impl AttachedLocationConfig { /// Consult attachment mode to determine whether we are currently permitted /// to delete layers. This is only advisory, not required for data safety. 
/// See [`AttachmentMode`] for more context. pub(crate) fn may_delete_layers_hint(&self) -> bool { // TODO: add an override for disk pressure in AttachedLocationConfig, // and respect it here. match &self.attach_mode { AttachmentMode::Single => true, AttachmentMode::Multi | AttachmentMode::Stale => { // In Multi mode we avoid doing deletions because some other // attached pageserver might get 404 while trying to read // a layer we delete which is still referenced in their metadata. // // In Stale mode, we avoid doing deletions because we expect // that they would ultimately fail validation in the deletion // queue due to our stale generation. false } } } /// Whether we are currently hinted that it is worthwhile to upload layers. /// This is only advisory, not required for data safety. /// See [`AttachmentMode`] for more context. pub(crate) fn may_upload_layers_hint(&self) -> bool { // TODO: add an override for disk pressure in AttachedLocationConfig, // and respect it here. match &self.attach_mode { AttachmentMode::Single | AttachmentMode::Multi => true, AttachmentMode::Stale => { // In Stale mode, we avoid doing uploads because we expect that // our replacement pageserver will already have started its own // IndexPart that will never reference layers we upload: it is // wasteful. false } } } } impl LocationConf { /// For use when loading from a legacy configuration: presence of a tenant /// implies it is in AttachmentMode::Single, which used to be the only /// possible state. This function should eventually be removed. pub(crate) fn attached_single( tenant_conf: pageserver_api::models::TenantConfig, generation: Generation, shard_params: models::ShardParameters, ) -> Self { Self { mode: LocationMode::Attached(AttachedLocationConfig { generation, attach_mode: AttachmentMode::Single, }), shard: ShardIdentity::from_params(ShardNumber(0), shard_params), tenant_conf, } } /// For use when attaching/re-attaching: update the generation stored in this /// structure. 
If we were in a secondary state, promote to attached (posession /// of a fresh generation implies this). pub(crate) fn attach_in_generation( &mut self, mode: AttachmentMode, generation: Generation, stripe_size: ShardStripeSize, ) { match &mut self.mode { LocationMode::Attached(attach_conf) => { attach_conf.generation = generation; attach_conf.attach_mode = mode; } LocationMode::Secondary(_) => { // We are promoted to attached by the control plane's re-attach response self.mode = LocationMode::Attached(AttachedLocationConfig { generation, attach_mode: mode, }) } } // This should never happen. // TODO: turn this into a proper assertion. if stripe_size != self.shard.stripe_size { critical!( "stripe size mismatch: {} != {}", self.shard.stripe_size, stripe_size, ); } self.shard.stripe_size = stripe_size; } pub(crate) fn try_from(conf: &'_ models::LocationConfig) -> anyhow::Result { let tenant_conf = conf.tenant_conf.clone(); fn get_generation(conf: &'_ models::LocationConfig) -> Result { conf.generation .map(Generation::new) .ok_or_else(|| anyhow::anyhow!("Generation must be set when attaching")) } let mode = match &conf.mode { models::LocationConfigMode::AttachedMulti => { LocationMode::Attached(AttachedLocationConfig { generation: get_generation(conf)?, attach_mode: AttachmentMode::Multi, }) } models::LocationConfigMode::AttachedSingle => { LocationMode::Attached(AttachedLocationConfig { generation: get_generation(conf)?, attach_mode: AttachmentMode::Single, }) } models::LocationConfigMode::AttachedStale => { LocationMode::Attached(AttachedLocationConfig { generation: get_generation(conf)?, attach_mode: AttachmentMode::Stale, }) } models::LocationConfigMode::Secondary => { anyhow::ensure!(conf.generation.is_none()); let warm = conf .secondary_conf .as_ref() .map(|c| c.warm) .unwrap_or(false); LocationMode::Secondary(SecondaryLocationConfig { warm }) } models::LocationConfigMode::Detached => { // Should not have been called: API code should translate this mode // into 
a detach rather than trying to decode it as a LocationConf return Err(anyhow::anyhow!("Cannot decode a Detached configuration")); } }; let shard = if conf.shard_count == 0 { // NB: carry over the persisted stripe size instead of using the default. This doesn't // matter for most practical purposes, since unsharded tenants don't use the stripe // size, but can cause inconsistencies between storcon and Pageserver and cause manual // splits without `new_stripe_size` to use an unintended stripe size. ShardIdentity::unsharded_with_stripe_size(ShardStripeSize(conf.shard_stripe_size)) } else { ShardIdentity::new( ShardNumber(conf.shard_number), ShardCount::new(conf.shard_count), ShardStripeSize(conf.shard_stripe_size), )? }; Ok(Self { shard, mode, tenant_conf, }) } } impl Default for LocationConf { // TODO: this should be removed once tenant loading can guarantee that we are never // loading from a directory without a configuration. // => tech debt since https://github.com/neondatabase/neon/issues/1555 fn default() -> Self { Self { mode: LocationMode::Attached(AttachedLocationConfig { generation: Generation::none(), attach_mode: AttachmentMode::Single, }), tenant_conf: pageserver_api::models::TenantConfig::default(), shard: ShardIdentity::unsharded(), } } } #[cfg(test)] mod tests { #[test] fn serde_roundtrip_tenant_conf_opt() { let small_conf = pageserver_api::models::TenantConfig { gc_horizon: Some(42), ..Default::default() }; let toml_form = toml_edit::ser::to_string(&small_conf).unwrap(); assert_eq!(toml_form, "gc_horizon = 42\n"); assert_eq!(small_conf, toml_edit::de::from_str(&toml_form).unwrap()); let json_form = serde_json::to_string(&small_conf).unwrap(); assert_eq!(json_form, "{\"gc_horizon\":42}"); assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap()); } } ================================================ FILE: pageserver/src/tenant/debug.rs ================================================ use std::{ops::Range, str::FromStr, sync::Arc}; use 
crate::walredo::RedoAttemptType; use base64::{Engine as _, engine::general_purpose::STANDARD}; use bytes::{Bytes, BytesMut}; use camino::Utf8PathBuf; use clap::Parser; use itertools::Itertools; use pageserver_api::{ key::Key, keyspace::KeySpace, shard::{ShardIdentity, ShardStripeSize}, }; use postgres_ffi::PgMajorVersion; use postgres_ffi::{BLCKSZ, page_is_new, page_set_lsn}; use tracing::Instrument; use utils::{ generation::Generation, id::{TenantId, TimelineId}, lsn::Lsn, shard::{ShardCount, ShardIndex, ShardNumber}, }; use wal_decoder::models::record::NeonWalRecord; use crate::{ context::{DownloadBehavior, RequestContext}, task_mgr::TaskKind, tenant::storage_layer::ValueReconstructState, walredo::harness::RedoHarness, }; use super::{ WalRedoManager, WalredoManagerId, harness::TenantHarness, remote_timeline_client::LayerFileMetadata, storage_layer::{AsLayerDesc, IoConcurrency, Layer, LayerName, ValuesReconstructState}, }; fn process_page_image(next_record_lsn: Lsn, is_fpw: bool, img_bytes: Bytes) -> Bytes { // To match the logic in libs/wal_decoder/src/serialized_batch.rs let mut new_image: BytesMut = img_bytes.into(); if is_fpw && !page_is_new(&new_image) { page_set_lsn(&mut new_image, next_record_lsn); } assert_eq!(new_image.len(), BLCKSZ as usize); new_image.freeze() } async fn redo_wals(input: &str, key: Key) -> anyhow::Result<()> { let tenant_id = TenantId::generate(); let timeline_id = TimelineId::generate(); let redo_harness = RedoHarness::new()?; let span = redo_harness.span(); let tenant_conf = pageserver_api::models::TenantConfig { ..Default::default() }; let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); let tenant = TenantHarness::create_custom( "search_key", tenant_conf, tenant_id, ShardIdentity::unsharded(), Generation::new(1), ) .await? 
.do_try_load_with_redo( Arc::new(WalRedoManager::Prod( WalredoManagerId::next(), redo_harness.manager, )), &ctx, ) .await .unwrap(); let timeline = tenant .create_test_timeline(timeline_id, Lsn(0x10), PgMajorVersion::PG16, &ctx) .await?; let contents = tokio::fs::read_to_string(input) .await .map_err(|e| anyhow::Error::msg(format!("Failed to read input file {input}: {e}"))) .unwrap(); let lines = contents.lines(); let mut last_wal_lsn: Option = None; let state = { let mut state = ValueReconstructState::default(); let mut is_fpw = false; let mut is_first_line = true; for line in lines { if is_first_line { is_first_line = false; if line.trim() == "FPW" { is_fpw = true; } continue; // Skip the first line. } // Each input line is in the "," format. let (lsn_str, payload_b64) = line .split_once(',') .expect("Invalid input format: expected ','"); // Parse the LSN and decode the payload. let lsn = Lsn::from_str(lsn_str.trim()).expect("Invalid LSN format"); let bytes = Bytes::from( STANDARD .decode(payload_b64.trim()) .expect("Invalid base64 payload"), ); // The first line is considered the base image, the rest are WAL records. 
if state.img.is_none() {
                state.img = Some((lsn, process_page_image(lsn, is_fpw, bytes)));
            } else {
                let wal_record = NeonWalRecord::Postgres {
                    will_init: false,
                    rec: bytes,
                };
                state.records.push((lsn, wal_record));
                last_wal_lsn.replace(lsn);
            }
        }
        state
    };

    assert!(state.img.is_some(), "No base image found");
    assert!(!state.records.is_empty(), "No WAL records found");
    let result = timeline
        .reconstruct_value(key, last_wal_lsn.unwrap(), state, RedoAttemptType::ReadPage)
        .instrument(span.clone())
        .await?;

    eprintln!("final image: {:?}", STANDARD.encode(result));

    Ok(())
}

/// Reconstruct the page `key` at `lsn` from the layer files found in `dir`.
///
/// A throw-away tenant (shard 0 of 4, stripe size 32768) and timeline are created through
/// the test harness with a production WAL-redo manager. Every regular file in `dir` whose
/// name parses as a layer name is opened as a resident layer; at most one image layer is
/// accepted. The base image and every WAL record found for `key` are printed to stderr
/// (base64-encoded), followed by the reconstructed page.
///
/// Returns an error if the directory cannot be read, if more than one image layer is found,
/// or if reading the layers or the WAL redo fails.
async fn search_key(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    dir: String,
    key: Key,
    lsn: Lsn,
) -> anyhow::Result<()> {
    let shard_index = ShardIndex {
        shard_number: ShardNumber(0),
        shard_count: ShardCount(4),
    };

    let redo_harness = RedoHarness::new()?;
    let span = redo_harness.span();
    let tenant_conf = pageserver_api::models::TenantConfig {
        ..Default::default()
    };
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
    let tenant = TenantHarness::create_custom(
        "search_key",
        tenant_conf,
        tenant_id,
        ShardIdentity::new(
            shard_index.shard_number,
            shard_index.shard_count,
            ShardStripeSize(32768),
        )
        .unwrap(),
        Generation::new(1),
    )
    .await?
    .do_try_load_with_redo(
        Arc::new(WalRedoManager::Prod(
            WalredoManagerId::next(),
            redo_harness.manager,
        )),
        &ctx,
    )
    .await
    .unwrap();
    let timeline = tenant
        .create_test_timeline(timeline_id, Lsn(0x10), PgMajorVersion::PG16, &ctx)
        .await?;

    let mut delta_layers: Vec<Layer> = Vec::new();
    let mut img_layer: Option<Layer> = Option::None;
    let mut dir = tokio::fs::read_dir(dir).await?;
    while let Some(entry) = dir.next_entry().await? {
        // Skip sub-directories and other non-regular entries instead of ending the scan:
        // stopping here would silently ignore every layer file listed after them.
        if !entry.file_type().await?.is_file() {
            continue;
        }
        let path = Utf8PathBuf::from_path_buf(entry.path()).unwrap();
        let layer_name = match LayerName::from_str(path.file_name().unwrap()) {
            Ok(name) => name,
            Err(_) => {
                eprintln!("Skipped invalid layer: {path}");
                continue;
            }
        };
        let layer = Layer::for_resident(
            tenant.conf,
            &timeline,
            path.clone(),
            layer_name,
            LayerFileMetadata::new(
                tokio::fs::metadata(path.clone()).await?.len(),
                Generation::new(1),
                shard_index,
            ),
        );
        if layer.layer_desc().is_delta() {
            delta_layers.push(layer.into());
        } else if img_layer.is_none() {
            img_layer = Some(layer.into());
        } else {
            anyhow::bail!("Found multiple image layers");
        }
    }
    // sort delta layers based on the descending order of LSN
    delta_layers.sort_by(|a, b| {
        b.layer_desc()
            .get_lsn_range()
            .start
            .cmp(&a.layer_desc().get_lsn_range().start)
    });

    let mut state = ValuesReconstructState::new(IoConcurrency::Sequential);

    let key_space = KeySpace::single(Range {
        start: key,
        end: key.next(),
    });
    // Without an image layer the search starts from LSN 0.
    let lsn_range = Range {
        start: img_layer
            .as_ref()
            .map_or(Lsn(0x00), |img| img.layer_desc().image_layer_lsn()),
        end: lsn,
    };
    for delta_layer in delta_layers.iter() {
        delta_layer
            .get_values_reconstruct_data(key_space.clone(), lsn_range.clone(), &mut state, &ctx)
            .await?;
    }

    // The image layer is optional (see `lsn_range` above), so do not panic when the
    // directory only contains delta layers.
    if let Some(img_layer) = img_layer.as_ref() {
        img_layer
            .get_values_reconstruct_data(key_space.clone(), lsn_range.clone(), &mut state, &ctx)
            .await?;
    }

    for (_key, result) in std::mem::take(&mut state.keys) {
        let value_state = result.collect_pending_ios().await?;
        if let Some((img_lsn, img)) = value_state.img.as_ref() {
            eprintln!(
                "image: {}: {:x?}",
                img_lsn,
                STANDARD.encode(img.clone())
            );
        }
        for delta in value_state.records.iter() {
            match &delta.1 {
                NeonWalRecord::Postgres { will_init, rec } => {
                    eprintln!(
                        "delta: {}: will_init: {}, {:x?}",
                        delta.0,
                        will_init,
                        STANDARD.encode(rec)
                    );
                }
                _ => {
                    eprintln!("delta: {}: {:x?}", delta.0, delta.1);
                }
            }
        }

        let result = timeline
            .reconstruct_value(key, lsn_range.end, value_state, RedoAttemptType::ReadPage)
            .instrument(span.clone())
            .await?;
        eprintln!("final image: {lsn} : {result:?}");
    }

    Ok(())
}

/// Redo all WALs against the base image in the input file. Return the base64 encoded final image.
/// Each line in the input file must be in the form "<lsn>,<base64>" where:
///   * `<lsn>` is a PostgreSQL LSN in hexadecimal notation, e.g. `0/16ABCDE`.
///   * `<base64>` is the base64‐encoded page image (first line) or WAL record (subsequent lines).
///
/// The first line provides the base image of a page. The LSN is the LSN of "next record" following
/// the record containing the FPI. For example, if the FPI was extracted from a WAL record occupying
/// [0/1, 0/200) in the WAL stream, the LSN appearing along side the page image here should be 0/200.
///
/// The subsequent lines are WAL records, ordered from the oldest to the newest. The LSN is the
/// record LSN of the WAL record, not the "next record" LSN. For example, if the WAL record here
/// occupies [0/1, 0/200) in the WAL stream, the LSN appearing along side the WAL record here should
/// be 0/1.
#[derive(Parser)] struct RedoWalsCmd { #[clap(long)] input: String, #[clap(long)] key: String, } #[tokio::test] async fn test_redo_wals() -> anyhow::Result<()> { let args = std::env::args().collect_vec(); let pos = args .iter() .position(|arg| arg == "--") .unwrap_or(args.len()); let slice = &args[pos..args.len()]; let cmd = match RedoWalsCmd::try_parse_from(slice) { Ok(cmd) => cmd, Err(err) => { eprintln!("{err}"); return Ok(()); } }; let key = Key::from_hex(&cmd.key).unwrap(); redo_wals(&cmd.input, key).await?; Ok(()) } /// Search for a page at the given LSN in all layers of the data_dir. /// Return the base64-encoded image and all WAL records, as well as the final reconstructed image. #[derive(Parser)] struct SearchKeyCmd { #[clap(long)] tenant_id: String, #[clap(long)] timeline_id: String, #[clap(long)] data_dir: String, #[clap(long)] key: String, #[clap(long)] lsn: String, } #[tokio::test] async fn test_search_key() -> anyhow::Result<()> { let args = std::env::args().collect_vec(); let pos = args .iter() .position(|arg| arg == "--") .unwrap_or(args.len()); let slice = &args[pos..args.len()]; let cmd = match SearchKeyCmd::try_parse_from(slice) { Ok(cmd) => cmd, Err(err) => { eprintln!("{err}"); return Ok(()); } }; let tenant_id = TenantId::from_str(&cmd.tenant_id).unwrap(); let timeline_id = TimelineId::from_str(&cmd.timeline_id).unwrap(); let key = Key::from_hex(&cmd.key).unwrap(); let lsn = Lsn::from_str(&cmd.lsn).unwrap(); search_key(tenant_id, timeline_id, cmd.data_dir, key, lsn).await?; Ok(()) } ================================================ FILE: pageserver/src/tenant/disk_btree.rs ================================================ //! //! Simple on-disk B-tree implementation //! //! This is used as the index structure within image and delta layers //! //! Features: //! - Fixed-width keys //! - Fixed-width values (VALUE_SZ) //! - The tree is created in a bulk operation. Insert/deletion after creation //! is not supported //! - page-oriented //! //! 
TODO: //! - maybe something like an Adaptive Radix Tree would be more efficient? //! - the values stored by image and delta layers are offsets into the file, //! and they are in monotonically increasing order. Prefix compression would //! be very useful for them, too. //! - An Iterator interface would be more convenient for the callers than the //! 'visit' function //! use std::cmp::Ordering; use std::iter::Rev; use std::ops::{Range, RangeInclusive}; use std::{io, result}; use async_stream::try_stream; use byteorder::{BE, ReadBytesExt}; use bytes::BufMut; use either::Either; use futures::{Stream, StreamExt}; use hex; use thiserror::Error; use tracing::error; use crate::context::RequestContext; use crate::tenant::block_io::{BlockReader, BlockWriter}; use crate::virtual_file::{IoBuffer, IoBufferMut, owned_buffers_io::write::Buffer}; // The maximum size of a value stored in the B-tree. 5 bytes is enough currently. pub const VALUE_SZ: usize = 5; pub const MAX_VALUE: u64 = 0x007f_ffff_ffff; pub const PAGE_SZ: usize = 8192; #[derive(Clone, Copy, Debug)] struct Value([u8; VALUE_SZ]); impl Value { fn from_slice(slice: &[u8]) -> Value { let mut b = [0u8; VALUE_SZ]; b.copy_from_slice(slice); Value(b) } fn from_u64(x: u64) -> Value { assert!(x <= 0x007f_ffff_ffff); Value([ (x >> 32) as u8, (x >> 24) as u8, (x >> 16) as u8, (x >> 8) as u8, x as u8, ]) } fn from_blknum(x: u32) -> Value { Value([ 0x80, (x >> 24) as u8, (x >> 16) as u8, (x >> 8) as u8, x as u8, ]) } #[allow(dead_code)] fn is_offset(self) -> bool { self.0[0] & 0x80 != 0 } fn to_u64(self) -> u64 { let b = &self.0; ((b[0] as u64) << 32) | ((b[1] as u64) << 24) | ((b[2] as u64) << 16) | ((b[3] as u64) << 8) | b[4] as u64 } fn to_blknum(self) -> u32 { let b = &self.0; assert!(b[0] == 0x80); ((b[1] as u32) << 24) | ((b[2] as u32) << 16) | ((b[3] as u32) << 8) | b[4] as u32 } } #[derive(Error, Debug)] pub enum DiskBtreeError { #[error("Attempt to append a value that is too large {0} > {}", MAX_VALUE)] 
AppendOverflow(u64),
    #[error("Unsorted input: key {key:?} is <= last_key {last_key:?}")]
    UnsortedInput { key: Box<[u8]>, last_key: Box<[u8]> },
    #[error("Could not push to new leaf node")]
    FailedToPushToNewLeafNode,
    #[error("IoError: {0}")]
    Io(#[from] io::Error),
}

/// Module-local result alias whose error type is [`DiskBtreeError`].
pub type Result<T> = result::Result<T, DiskBtreeError>;

/// This is the on-disk representation.
struct OnDiskNode<'a, const L: usize> {
    // Fixed-width fields
    num_children: u16,
    level: u8,
    prefix_len: u8,
    suffix_len: u8,

    // Variable-length fields. These are stored on-disk after the fixed-width
    // fields, in this order. In the in-memory representation, these point to
    // the right parts in the page buffer.
    prefix: &'a [u8],
    keys: &'a [u8],
    values: &'a [u8],
}

impl<const L: usize> OnDiskNode<'_, L> {
    ///
    /// Interpret a PAGE_SZ page as a node.
    ///
    fn deparse(buf: &[u8]) -> Result<OnDiskNode<'_, L>> {
        // Fixed-width header: child count, level, prefix length, suffix length.
        let mut header = std::io::Cursor::new(buf);
        let num_children = header.read_u16::<BE>()?;
        let level = header.read_u8()?;
        let prefix_len = header.read_u8()?;
        let suffix_len = header.read_u8()?;

        // The three variable-length sections follow the header back to back.
        let prefix_start = header.position() as usize;
        let keys_start = prefix_start + prefix_len as usize;
        let keys_len = num_children as usize * suffix_len as usize;
        let values_start = keys_start + keys_len;
        let values_len = num_children as usize * VALUE_SZ;

        Ok(OnDiskNode {
            num_children,
            level,
            prefix_len,
            suffix_len,
            prefix: &buf[prefix_start..keys_start],
            keys: &buf[keys_start..values_start],
            values: &buf[values_start..values_start + values_len],
        })
    }

    ///
    /// Read a value at 'idx'
    ///
    fn value(&self, idx: usize) -> Value {
        let start = idx * VALUE_SZ;
        Value::from_slice(&self.values[start..start + VALUE_SZ])
    }

    /// Binary-search the node for `search_key`.
    ///
    /// `keybuf` must already hold the node's prefix in its first `prefix_len` bytes; the
    /// remainder is overwritten with each probed suffix. Returns `Ok(idx)` on an exact
    /// match, otherwise `Err(idx)` where `idx` is the position at which `search_key`
    /// would have to be inserted.
    fn binary_search(
        &self,
        search_key: &[u8; L],
        keybuf: &mut [u8],
    ) -> result::Result<usize, usize> {
        let prefix_len = self.prefix_len as usize;
        let suffix_len = self.suffix_len as usize;

        let mut lo = 0;
        let mut hi = self.num_children as usize;
        while lo < hi {
            let probe = lo + (hi - lo) / 2;

            // Assemble the full key of the probed entry and compare.
            let start = probe * suffix_len;
            keybuf[prefix_len..].copy_from_slice(&self.keys[start..start + suffix_len]);
            match keybuf[..].cmp(search_key) {
                Ordering::Less => lo = probe + 1,
                Ordering::Greater => hi = probe,
                Ordering::Equal => return Ok(probe),
            }
        }
        Err(lo)
    }
}

///
/// Public reader object, to search the tree.
///
#[derive(Clone)]
pub struct DiskBtreeReader<R, const L: usize>
where
    R: BlockReader,
{
    start_blk: u32,
    root_blk: u32,
    reader: R,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum VisitDirection {
    Forwards,
    Backwards,
}

impl<R, const L: usize> DiskBtreeReader<R, L>
where
    R: BlockReader,
{
    /// Create a reader. `root_blk` is the root block number as returned by the builder's
    /// `finish`; every block number stored in the tree is offset by `start_blk` when read.
    pub fn new(start_blk: u32, root_blk: u32, reader: R) -> Self {
        DiskBtreeReader {
            start_blk,
            root_blk,
            reader,
        }
    }

    ///
    /// Read the value for given key. Returns the value, or None if it doesn't exist.
    ///
    pub async fn get(&self, search_key: &[u8; L], ctx: &RequestContext) -> Result<Option<u64>> {
        let mut found: Option<u64> = None;
        self.visit(
            search_key,
            VisitDirection::Forwards,
            |key, value| {
                // Only the first key >= search_key is inspected; stop right after it.
                if key == search_key {
                    found = Some(value);
                }
                false
            },
            ctx,
        )
        .await?;
        Ok(found)
    }

    /// Consume the reader and wrap [`Self::into_stream`] in a [`DiskBtreeIterator`].
    pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a>
    where
        R: 'a + Send,
    {
        DiskBtreeIterator {
            stream: Box::pin(self.into_stream(start_key, ctx)),
        }
    }

    /// Return a stream which yields all key, value pairs from the index
    /// starting from the first key greater or equal to `start_key`.
    ///
    /// Note 1: that this is a copy of [`Self::visit`].
    /// TODO: Once the sequential read path is removed this will become
    /// the only index traversal method.
    ///
    /// Note 2: this function used to take `&self` but it now consumes `self`. This is due to
    /// the lifetime constraints of the reader and the stream / iterator it creates. Using `&self`
    /// requires the reader to be present when the stream is used, and this creates a lifetime
    /// dependency between the reader and the stream. Now if we want to create an iterator that
    /// holds the stream, someone will need to keep a reference to the reader, which is inconvenient
    /// to use from the image/delta layer APIs.
    ///
    /// Feel free to add the `&self` variant back if it's necessary.
    pub fn into_stream<'a>(
        self,
        start_key: &'a [u8; L],
        ctx: &'a RequestContext,
    ) -> impl Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a
    where
        R: 'a,
    {
        try_stream! {
            // Each entry is a node to visit plus, for a node we are returning to,
            // the iterator over its remaining children.
            let mut pending = Vec::new();
            pending.push((self.root_blk, None));
            let block_cursor = self.reader.block_cursor();
            let mut node_buf = [0_u8; PAGE_SZ];
            while let Some((node_blknum, resume)) = pending.pop() {
                // Read the node, through the PS PageCache, into local variable `node_buf`.
                // We could keep the page cache read guard alive, but, at the time of writing,
                // we run quite small PS PageCache s => can't risk running out of
                // PageCache space because this stream isn't consumed fast enough.
                let page_read_guard = block_cursor
                    .read_blk(self.start_blk + node_blknum, ctx)
                    .await?;
                node_buf.copy_from_slice(page_read_guard.as_ref());
                drop(page_read_guard); // drop page cache read guard early

                let node = OnDiskNode::deparse(&node_buf)?;
                let prefix_len = node.prefix_len as usize;
                let suffix_len = node.suffix_len as usize;

                assert!(node.num_children > 0);

                // Scratch buffer for full keys: the shared prefix followed by room
                // for one suffix.
                let mut keybuf = node.prefix.to_vec();
                keybuf.resize(prefix_len + suffix_len, 0);

                let mut iter: Either<Range<usize>, Rev<RangeInclusive<usize>>> = match resume {
                    Some(iter) => iter,
                    None => {
                        // Locate the first match
                        let first = match node.binary_search(start_key, keybuf.as_mut_slice()) {
                            Ok(exact) => exact,
                            // Imagine that the node contains the following keys:
                            //
                            // 1
                            // 3  <-- idx
                            // 5
                            //
                            // If the search key is '2' and there is no exact match,
                            // the binary search would return the index of key
                            // '3'. That's cool, '3' is the first key to return.
                            Err(insert_at) if node.level == 0 => insert_at,
                            // This is an internal page, so each key represents a lower
                            // bound for what's in the child page. If there is no exact
                            // match, we have to return the *previous* entry.
                            //
                            // 1  <-- return this
                            // 3  <-- idx
                            // 5
                            Err(insert_at) => insert_at.saturating_sub(1),
                        };
                        Either::Left(first..node.num_children.into())
                    }
                };

                // idx points to the first match now. Keep going from there
                while let Some(idx) = iter.next() {
                    let key_off = idx * suffix_len;
                    keybuf[prefix_len..].copy_from_slice(&node.keys[key_off..key_off + suffix_len]);
                    let value = node.value(idx);
                    #[allow(clippy::collapsible_if)]
                    if node.level == 0 {
                        // leaf
                        yield (keybuf.clone(), value.to_u64());
                    } else {
                        // Come back to this node later, after the child has been walked.
                        pending.push((node_blknum, Some(iter)));
                        pending.push((value.to_blknum(), None));
                        break;
                    }
                }
            }
        }
    }

    ///
    /// Scan the tree, starting from 'search_key', in the given direction. 'visitor'
    /// will be called for every key >= 'search_key' (or <= 'search_key', if scanning
    /// backwards)
    ///
    /// Returns `Ok(false)` if the visitor stopped the scan (or a backwards scan found no
    /// starting entry in a node), and `Ok(true)` if the scan ran to completion.
    ///
    pub async fn visit<V>(
        &self,
        search_key: &[u8; L],
        dir: VisitDirection,
        mut visitor: V,
        ctx: &RequestContext,
    ) -> Result<bool>
    where
        V: FnMut(&[u8], u64) -> bool,
    {
        // Each entry is a node to visit plus, for a node we are returning to,
        // the iterator over its remaining children.
        let mut pending = Vec::new();
        pending.push((self.root_blk, None));
        let block_cursor = self.reader.block_cursor();
        while let Some((node_blknum, resume)) = pending.pop() {
            // Locate the node.
            let node_buf = block_cursor
                .read_blk(self.start_blk + node_blknum, ctx)
                .await?;

            let node = OnDiskNode::deparse(node_buf.as_ref())?;
            let prefix_len = node.prefix_len as usize;
            let suffix_len = node.suffix_len as usize;

            assert!(node.num_children > 0);

            // Scratch buffer for full keys: the shared prefix followed by room
            // for one suffix.
            let mut keybuf = node.prefix.to_vec();
            keybuf.resize(prefix_len + suffix_len, 0);

            let mut iter = if let Some(iter) = resume {
                iter
            } else if dir == VisitDirection::Forwards {
                // Locate the first match
                let first = match node.binary_search(search_key, keybuf.as_mut_slice()) {
                    Ok(exact) => exact,
                    // Imagine that the node contains the following keys:
                    //
                    // 1
                    // 3  <-- idx
                    // 5
                    //
                    // If the search key is '2' and there is no exact match,
                    // the binary search would return the index of key
                    // '3'. That's cool, '3' is the first key to return.
                    Err(insert_at) if node.level == 0 => insert_at,
                    // This is an internal page, so each key represents a lower
                    // bound for what's in the child page. If there is no exact
                    // match, we have to return the *previous* entry.
                    //
                    // 1  <-- return this
                    // 3  <-- idx
                    // 5
                    Err(insert_at) => insert_at.saturating_sub(1),
                };
                Either::Left(first..node.num_children.into())
            } else {
                let last = match node.binary_search(search_key, keybuf.as_mut_slice()) {
                    // Exact match. That's the first entry to return, and walk
                    // backwards from there.
                    Ok(exact) => exact,
                    // No exact match. The binary search returned the index of the
                    // first key that's > search_key. Back off by one, and walk
                    // backwards from there.
                    Err(insert_at) => match insert_at.checked_sub(1) {
                        Some(prev) => prev,
                        None => return Ok(false),
                    },
                };
                Either::Right((0..=last).rev())
            };

            // idx points to the first match now. Keep going from there
            while let Some(idx) = iter.next() {
                let key_off = idx * suffix_len;
                keybuf[prefix_len..].copy_from_slice(&node.keys[key_off..key_off + suffix_len]);
                let value = node.value(idx);
                #[allow(clippy::collapsible_if)]
                if node.level == 0 {
                    // leaf
                    if !visitor(&keybuf, value.to_u64()) {
                        return Ok(false);
                    }
                } else {
                    // Come back to this node later, after the child has been walked.
                    pending.push((node_blknum, Some(iter)));
                    pending.push((value.to_blknum(), None));
                    break;
                }
            }
        }
        Ok(true)
    }

    /// Print the whole tree to stdout, one line per block header and one line per entry,
    /// indented by depth. Debugging aid.
    #[allow(dead_code)]
    pub async fn dump(&self, ctx: &RequestContext) -> Result<()> {
        let mut stack = Vec::new();

        stack.push((self.root_blk, String::new(), 0, 0, 0));

        let block_cursor = self.reader.block_cursor();

        while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
            let blk = block_cursor.read_blk(self.start_blk + blknum, ctx).await?;
            let buf: &[u8] = blk.as_ref();
            let node = OnDiskNode::<L>::deparse(buf)?;

            // Print the block header only the first time the block is visited.
            if child_idx == 0 {
                print!("{:indent$}", "", indent = depth * 2);
                let path_prefix = stack
                    .iter()
                    .map(|(_blknum, path, ..)| path.as_str())
                    .collect::<String>();
                println!(
                    "blk #{blknum}: path {path_prefix}{path}: prefix {}, suffix_len {}",
                    hex::encode(node.prefix),
                    node.suffix_len
                );
            }
if child_idx + 1 < node.num_children {
                // Revisit this block later for its next child.
                let key_off = key_off + node.suffix_len as usize;
                stack.push((blknum, path.clone(), depth, child_idx + 1, key_off));
            }
            let key = &node.keys[key_off..key_off + node.suffix_len as usize];
            let val = node.value(child_idx as usize);

            print!("{:indent$}", "", indent = depth * 2 + 2);
            println!("{}: {}", hex::encode(key), hex::encode(val.0));

            if node.level > 0 {
                // Internal node: descend into the child this entry points to.
                stack.push((val.to_blknum(), hex::encode(node.prefix), depth + 1, 0, 0));
            }
        }
        Ok(())
    }
}

/// Owning wrapper around the boxed stream produced by `DiskBtreeReader::iter`.
pub struct DiskBtreeIterator<'a> {
    #[allow(clippy::type_complexity)]
    stream: std::pin::Pin<
        Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a + Send>,
    >,
}

impl DiskBtreeIterator<'_> {
    /// Fetch the next `(key, value)` pair, or `None` once the stream is exhausted.
    pub async fn next(&mut self) -> Option<std::result::Result<(Vec<u8>, u64), DiskBtreeError>> {
        self.stream.next().await
    }
}

///
/// Public builder object, for creating a new tree.
///
/// Usage: Create a builder object by calling 'new', load all the data into the
/// tree by calling 'append' for each key-value pair, and then call 'finish'
///
/// 'L' is the key length in bytes
pub struct DiskBtreeBuilder<W, const L: usize>
where
    W: BlockWriter,
{
    writer: W,

    ///
    /// `stack[0]` is the current root page, `stack.last()` is the leaf.
    ///
    /// We maintain the length of the stack to be always greater than zero.
    /// Two exceptions are:
    /// 1. `Self::flush_node`. The method will push the new node if it extracted the last one.
    ///    So because other methods cannot see the intermediate state invariant still holds.
    /// 2. `Self::finish`. It consumes self and does not return it back,
    ///    which means that this is where the structure is destroyed.
    ///    Thus stack of zero length cannot be observed by other methods.
    stack: Vec<BuildNode<L>>,

    /// Last key that was appended to the tree. Used to sanity check that append
    /// is called in increasing key order.
    last_key: Option<[u8; L]>,
}

impl<W, const L: usize> DiskBtreeBuilder<W, L>
where
    W: BlockWriter,
{
    /// Start an empty tree consisting of a single empty level-0 node.
    pub fn new(writer: W) -> Self {
        DiskBtreeBuilder {
            writer,
            last_key: None,
            stack: vec![BuildNode::new(0)],
        }
    }

    /// Append one key-value pair. Keys must arrive in strictly increasing order and
    /// `value` must not exceed `MAX_VALUE`; otherwise an error is returned and nothing
    /// is appended.
    pub fn append(&mut self, key: &[u8; L], value: u64) -> Result<()> {
        if value > MAX_VALUE {
            return Err(DiskBtreeError::AppendOverflow(value));
        }
        match &self.last_key {
            Some(last_key) if key <= last_key => {
                return Err(DiskBtreeError::UnsortedInput {
                    key: key.as_slice().into(),
                    last_key: last_key.as_slice().into(),
                });
            }
            _ => {}
        }
        self.last_key = Some(*key);

        self.append_internal(key, Value::from_u64(value))
    }

    /// Append `key`/`value` to the bottommost node on the stack, flushing that node and
    /// starting a fresh one at the same level if the entry does not fit.
    fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<()> {
        let bottom = self
            .stack
            .last_mut()
            .expect("should always have at least one item");
        let level = bottom.level;

        // Try to append as-is; if that fails, try once more after prefix-compressing
        // the node, which may have made some room.
        if bottom.push(key, value) || (bottom.compress() && bottom.push(key, value)) {
            return Ok(());
        }

        // Could not append to the current node. Flush it and create a new one.
        self.flush_node()?;

        // Replace the node we flushed with an empty one and append the new
        // key to it.
        let mut fresh = BuildNode::new(level);
        if !fresh.push(key, value) {
            return Err(DiskBtreeError::FailedToPushToNewLeafNode);
        }

        self.stack.push(fresh);

        Ok(())
    }

    /// Flush the bottommost node in the stack to disk. Appends a downlink to its parent,
    /// and recursively flushes the parent too, if it becomes full. If the root page becomes full,
    /// creates a new root page, increasing the height of the tree.
    fn flush_node(&mut self) -> Result<()> {
        // Get the current bottommost node in the stack and flush it to disk.
        let flushed = self
            .stack
            .pop()
            .expect("should always have at least one item");
        let buf = flushed.pack();
        let downlink_key = flushed.first_key();
        let downlink_ptr = self.writer.write_blk(buf)?;

        // Append the downlink to the parent. If there is no parent, ie. this was the root page,
        // create a new root page, increasing the height of the tree.
        if self.stack.is_empty() {
            self.stack.push(BuildNode::new(flushed.level + 1));
        }
        self.append_internal(&downlink_key, Value::from_blknum(downlink_ptr))
    }

    ///
    /// Flushes everything to disk, and returns the block number of the root page.
    /// The caller must store the root block number "out-of-band", and pass it
    /// to the DiskBtreeReader::new() when you want to read the tree again.
    /// (In the image and delta layers, it is stored in the beginning of the file,
    /// in the summary header)
    ///
    pub fn finish(mut self) -> Result<(u32, W)> {
        // flush all levels, except the root.
        while self.stack.len() > 1 {
            self.flush_node()?;
        }

        let root = self
            .stack
            .first()
            .expect("by the check above we left one item there");
        let root_blknum = self.writer.write_blk(root.pack())?;

        Ok((root_blknum, self.writer))
    }

    /// Shared access to the underlying block writer.
    pub fn borrow_writer(&self) -> &W {
        &self.writer
    }
}

///
/// BuildNode represents an incomplete page that we are appending to.
///
#[derive(Clone, Debug)]
struct BuildNode<const L: usize> {
    num_children: u16,
    level: u8,
    prefix: Vec<u8>,
    suffix_len: usize,

    keys: Vec<u8>,
    values: Vec<u8>,

    size: usize, // physical size of this node, if it was written to disk like this
}

const NODE_SIZE: usize = PAGE_SZ;

// num_children (2 bytes) + level + prefix_len + suffix_len (1 byte each)
const NODE_HDR_SIZE: usize = 2 + 1 + 1 + 1;

impl<const L: usize> BuildNode<L> {
    /// Create an empty node for the given tree level; only the header counts towards
    /// its size so far.
    fn new(level: u8) -> Self {
        BuildNode {
            num_children: 0,
            level,
            prefix: Vec::new(),
            suffix_len: 0,
            keys: Vec::new(),
            values: Vec::new(),
            size: NODE_HDR_SIZE,
        }
    }

    /// Try to append a key-value pair to this node. Returns 'true' on
    /// success, 'false' if the page was full or the key was
    /// incompatible with the prefix of the existing keys.
    fn push(&mut self, key: &[u8; L], value: Value) -> bool {
        // If we have already performed prefix-compression on the page,
        // check that the incoming key has the same prefix.
        if self.num_children > 0 {
            // does the prefix allow it?
if !key.starts_with(&self.prefix) {
                return false;
            }
        } else {
            // First key of an empty node: nothing is compressed yet, so the
            // whole key is the suffix.
            self.suffix_len = key.len();
        }

        // Is the node too full?
        let entry_size = self.suffix_len + VALUE_SZ;
        if self.size + entry_size >= NODE_SIZE {
            return false;
        }

        // All clear
        self.num_children += 1;
        self.keys.extend(&key[self.prefix.len()..]);
        self.values.extend(value.0);

        assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
        assert!(self.values.len() == self.num_children as usize * VALUE_SZ);

        self.size += entry_size;

        true
    }

    ///
    /// Perform prefix-compression.
    ///
    /// Returns 'true' on success, 'false' if no compression was possible.
    ///
    fn compress(&mut self) -> bool {
        // Find the common prefix among all keys. Comparing the first and the last
        // suffix is enough because the keys are appended in sorted order.
        let prefix_len = self
            .first_suffix()
            .iter()
            .zip(self.last_suffix())
            .take_while(|(first, last)| first == last)
            .count();
        if prefix_len == 0 {
            return false;
        }

        // Can compress. Rewrite the keys without the common prefix.
        self.prefix.extend(&self.keys[..prefix_len]);

        // `suffix_len` is non-zero here, because `prefix_len` is at most `suffix_len`.
        let mut stripped_keys = Vec::new();
        for old_suffix in self.keys.chunks_exact(self.suffix_len) {
            stripped_keys.extend(&old_suffix[prefix_len..]);
        }
        self.keys = stripped_keys;
        self.suffix_len -= prefix_len;

        // Every key shrank by `prefix_len`, and the prefix itself is stored once.
        self.size -= prefix_len * self.num_children as usize;
        self.size += prefix_len;

        assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
        assert!(self.values.len() == self.num_children as usize * VALUE_SZ);

        true
    }

    ///
    /// Serialize the node to on-disk format.
    ///
    fn pack(&self) -> IoBuffer {
        assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
        assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
        assert!(self.num_children > 0);

        let mut page = IoBufferMut::with_capacity(PAGE_SZ);

        // Fixed-width header ...
        page.put_u16(self.num_children);
        page.put_u8(self.level);
        page.put_u8(self.prefix.len() as u8);
        page.put_u8(self.suffix_len as u8);
        // ... followed by the variable-length sections.
        page.put(&self.prefix[..]);
        page.put(&self.keys[..]);
        page.put(&self.values[..]);

        assert!(page.len() == self.size);

        // Zero-pad up to a full page.
        assert!(page.len() <= PAGE_SZ);
        page.extend_with(0, PAGE_SZ - page.len());
        page.freeze()
    }

    /// Suffix of the first key stored in this node.
    fn first_suffix(&self) -> &[u8] {
        &self.keys[..self.suffix_len]
    }

    /// Suffix of the last key stored in this node.
    fn last_suffix(&self) -> &[u8] {
        let start = self.keys.len() - self.suffix_len;
        &self.keys[start..]
    }

    /// Return the full first key of the page, including the prefix
    fn first_key(&self) -> [u8; L] {
        let split = self.prefix.len();
        let mut key = [0u8; L];
        key[..split].copy_from_slice(&self.prefix);
        key[split..].copy_from_slice(self.first_suffix());
        key
    }
}

#[cfg(test)]
pub(crate) mod tests {
    use std::collections::BTreeMap;
    use std::sync::atomic::{AtomicUsize, Ordering};

    use rand::Rng;

    use super::*;
    use crate::context::DownloadBehavior;
    use crate::task_mgr::TaskKind;
    use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};

    /// In-memory stand-in for a file: a growable list of page-sized blocks.
    #[derive(Clone, Default)]
    pub(crate) struct TestDisk {
        blocks: Vec<IoBuffer>,
    }

    impl TestDisk {
        fn new() -> Self {
            Self::default()
        }

        /// Return a copy of block `blknum`. Panics if the block does not exist.
        pub(crate) fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
            let mut page = [0u8; PAGE_SZ];
            page.copy_from_slice(&self.blocks[blknum as usize]);
            Ok(std::sync::Arc::new(page).into())
        }
    }

    impl BlockReader for TestDisk {
        fn block_cursor(&self) -> BlockCursor<'_> {
            BlockCursor::new(BlockReaderRef::TestDisk(self))
        }
    }

    impl BlockWriter for &mut TestDisk {
        /// Append `buf` as a new block and return its block number.
        fn write_blk(&mut self, buf: IoBuffer) -> io::Result<u32> {
            let blknum = self.blocks.len();
            self.blocks.push(buf);
            Ok(blknum as u32)
        }
    }

    #[tokio::test]
    async fn basic() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_,
6>::new(&mut disk); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let all_keys: Vec<&[u8; 6]> = vec![ b"xaaaaa", b"xaaaba", b"xaaaca", b"xabaaa", b"xababa", b"xabaca", b"xabada", b"xabadb", ]; let all_data: Vec<(&[u8; 6], u64)> = all_keys .iter() .enumerate() .map(|(idx, key)| (*key, idx as u64)) .collect(); for (key, val) in all_data.iter() { writer.append(key, *val)?; } let (root_offset, _writer) = writer.finish()?; let reader = DiskBtreeReader::new(0, root_offset, disk); reader.dump(&ctx).await?; // Test the `get` function on all the keys. for (key, val) in all_data.iter() { assert_eq!(reader.get(key, &ctx).await?, Some(*val)); } // And on some keys that don't exist assert_eq!(reader.get(b"aaaaaa", &ctx).await?, None); assert_eq!(reader.get(b"zzzzzz", &ctx).await?, None); assert_eq!(reader.get(b"xaaabx", &ctx).await?, None); // Test search with `visit` function let search_key = b"xabaaa"; let expected: Vec<(Vec, u64)> = all_data .iter() .filter(|(key, _value)| key[..] >= search_key[..]) .map(|(key, value)| (key.to_vec(), *value)) .collect(); let mut data = Vec::new(); reader .visit( search_key, VisitDirection::Forwards, |key, value| { data.push((key.to_vec(), value)); true }, &ctx, ) .await?; assert_eq!(data, expected); // Test a backwards scan let mut expected: Vec<(Vec, u64)> = all_data .iter() .filter(|(key, _value)| key[..] 
<= search_key[..]) .map(|(key, value)| (key.to_vec(), *value)) .collect(); expected.reverse(); let mut data = Vec::new(); reader .visit( search_key, VisitDirection::Backwards, |key, value| { data.push((key.to_vec(), value)); true }, &ctx, ) .await?; assert_eq!(data, expected); // Backward scan where nothing matches reader .visit( b"aaaaaa", VisitDirection::Backwards, |key, value| { panic!("found unexpected key {}: {}", hex::encode(key), value); }, &ctx, ) .await?; // Full scan let expected: Vec<(Vec, u64)> = all_data .iter() .map(|(key, value)| (key.to_vec(), *value)) .collect(); let mut data = Vec::new(); reader .visit( &[0u8; 6], VisitDirection::Forwards, |key, value| { data.push((key.to_vec(), value)); true }, &ctx, ) .await?; assert_eq!(data, expected); Ok(()) } #[tokio::test] async fn lots_of_keys() -> Result<()> { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); const NUM_KEYS: u64 = 1000; let mut all_data: BTreeMap = BTreeMap::new(); for idx in 0..NUM_KEYS { let key_int: u64 = 1 + idx * 2; let key = u64::to_be_bytes(key_int); writer.append(&key, idx)?; all_data.insert(key_int, idx); } let (root_offset, _writer) = writer.finish()?; let reader = DiskBtreeReader::new(0, root_offset, disk); reader.dump(&ctx).await?; use std::sync::Mutex; let result = Mutex::new(Vec::new()); let limit: AtomicUsize = AtomicUsize::new(10); let take_ten = |key: &[u8], value: u64| { let mut keybuf = [0u8; 8]; keybuf.copy_from_slice(key); let key_int = u64::from_be_bytes(keybuf); let mut result = result.lock().unwrap(); result.push((key_int, value)); // keep going until we have 10 matches result.len() < limit.load(Ordering::Relaxed) }; for search_key_int in 0..(NUM_KEYS * 2 + 10) { let search_key = u64::to_be_bytes(search_key_int); assert_eq!( reader.get(&search_key, &ctx).await?, all_data.get(&search_key_int).cloned() ); // Test a forward scan 
starting with this key result.lock().unwrap().clear(); reader .visit(&search_key, VisitDirection::Forwards, take_ten, &ctx) .await?; let expected = all_data .range(search_key_int..) .take(10) .map(|(&key, &val)| (key, val)) .collect::>(); assert_eq!(*result.lock().unwrap(), expected); // And a backwards scan result.lock().unwrap().clear(); reader .visit(&search_key, VisitDirection::Backwards, take_ten, &ctx) .await?; let expected = all_data .range(..=search_key_int) .rev() .take(10) .map(|(&key, &val)| (key, val)) .collect::>(); assert_eq!(*result.lock().unwrap(), expected); } // full scan let search_key = u64::to_be_bytes(0); limit.store(usize::MAX, Ordering::Relaxed); result.lock().unwrap().clear(); reader .visit(&search_key, VisitDirection::Forwards, take_ten, &ctx) .await?; let expected = all_data .iter() .map(|(&key, &val)| (key, val)) .collect::>(); assert_eq!(*result.lock().unwrap(), expected); // full scan let search_key = u64::to_be_bytes(u64::MAX); limit.store(usize::MAX, Ordering::Relaxed); result.lock().unwrap().clear(); reader .visit(&search_key, VisitDirection::Backwards, take_ten, &ctx) .await?; let expected = all_data .iter() .rev() .map(|(&key, &val)| (key, val)) .collect::>(); assert_eq!(*result.lock().unwrap(), expected); Ok(()) } #[tokio::test] async fn random_data() -> Result<()> { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); // Generate random keys with exponential distribution, to // exercise the prefix compression const NUM_KEYS: usize = 100000; let mut all_data: BTreeMap = BTreeMap::new(); for idx in 0..NUM_KEYS { let u: f64 = rand::rng().random_range(0.0..1.0); let t = -(f64::ln(u)); let key_int = (t * 1000000.0) as u128; all_data.insert(key_int, idx as u64); } // Build a tree from it let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 16>::new(&mut disk); for (&key, &val) in all_data.iter() { writer.append(&u128::to_be_bytes(key), val)?; } let (root_offset, _writer) = writer.finish()?; let 
reader = DiskBtreeReader::new(0, root_offset, disk); // Test get() operation on all the keys for (&key, &val) in all_data.iter() { let search_key = u128::to_be_bytes(key); assert_eq!(reader.get(&search_key, &ctx).await?, Some(val)); } // Test get() operations on random keys, most of which will not exist for _ in 0..100000 { let key_int = rand::rng().random::(); let search_key = u128::to_be_bytes(key_int); assert!(reader.get(&search_key, &ctx).await? == all_data.get(&key_int).cloned()); } // Test boundary cases assert!( reader.get(&u128::to_be_bytes(u128::MIN), &ctx).await? == all_data.get(&u128::MIN).cloned() ); assert!( reader.get(&u128::to_be_bytes(u128::MAX), &ctx).await? == all_data.get(&u128::MAX).cloned() ); // Test iterator and get_stream API let mut iter = reader.iter(&[0; 16], &ctx); let mut cnt = 0; while let Some(res) = iter.next().await { let (key, val) = res?; let key = u128::from_be_bytes(key.as_slice().try_into().unwrap()); assert_eq!(val, *all_data.get(&key).unwrap()); cnt += 1; } assert_eq!(cnt, all_data.len()); Ok(()) } #[test] fn unsorted_input() { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 2>::new(&mut disk); let _ = writer.append(b"ba", 1); let _ = writer.append(b"bb", 2); let err = writer.append(b"aa", 3).expect_err("should've failed"); match err { DiskBtreeError::UnsortedInput { key, last_key } => { assert_eq!(key.as_ref(), b"aa".as_slice()); assert_eq!(last_key.as_ref(), b"bb".as_slice()); } _ => panic!("unexpected error variant, expected DiskBtreeError::UnsortedInput"), } } /// /// This test contains a particular data set, see disk_btree_test_data.rs /// #[tokio::test] async fn particular_data() -> Result<()> { // Build a tree from it let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); for (key, val) in disk_btree_test_data::TEST_DATA { writer.append(&key, val)?; } let 
(root_offset, writer) = writer.finish()?; println!("SIZE: {} blocks", writer.blocks.len()); let reader = DiskBtreeReader::new(0, root_offset, disk); // Test get() operation on all the keys for (key, val) in disk_btree_test_data::TEST_DATA { assert_eq!(reader.get(&key, &ctx).await?, Some(val)); } // Test full scan let mut count = 0; reader .visit( &[0u8; 26], VisitDirection::Forwards, |_key, _value| { count += 1; true }, &ctx, ) .await?; assert_eq!(count, disk_btree_test_data::TEST_DATA.len()); reader.dump(&ctx).await?; Ok(()) } } #[cfg(test)] #[path = "disk_btree_test_data.rs"] mod disk_btree_test_data; ================================================ FILE: pageserver/src/tenant/disk_btree_test_data.rs ================================================ use hex_literal::hex; /// Test data set for the 'particular_data' test in disk_btree.rs /// /// This test contains a particular data set, representing all the keys /// generated by the 'test_random_updates' unit test. I extracted this while /// trying to debug a failure in that test. The bug turned out to be /// elsewhere, and I'm not sure if this is still useful, but keeping it for /// now... Maybe it's a useful data set to show the typical key-values used /// by a delta layer, for evaluating how well the prefix compression works. 
#[rustfmt::skip] pub static TEST_DATA: [([u8; 26], u64); 2000] = [ (hex!("0100000000333333334444444455000000000000000000000010"), 0x004001), (hex!("0100000000333333334444444455000000000000000000007cb0"), 0x0040a1), (hex!("0100000000333333334444444455000000010000000000000020"), 0x004141), (hex!("0100000000333333334444444455000000020000000000000030"), 0x0041e1), (hex!("01000000003333333344444444550000000200000000000051a0"), 0x004281), (hex!("0100000000333333334444444455000000030000000000000040"), 0x004321), (hex!("0100000000333333334444444455000000030000000000006cf0"), 0x0043c1), (hex!("0100000000333333334444444455000000030000000000007140"), 0x004461), (hex!("0100000000333333334444444455000000040000000000000050"), 0x004501), (hex!("01000000003333333344444444550000000400000000000047f0"), 0x0045a1), (hex!("01000000003333333344444444550000000400000000000072b0"), 0x004641), (hex!("0100000000333333334444444455000000050000000000000060"), 0x0046e1), (hex!("0100000000333333334444444455000000050000000000005550"), 0x004781), (hex!("0100000000333333334444444455000000060000000000000070"), 0x004821), (hex!("01000000003333333344444444550000000600000000000044a0"), 0x0048c1), (hex!("0100000000333333334444444455000000060000000000006870"), 0x004961), (hex!("0100000000333333334444444455000000070000000000000080"), 0x004a01), (hex!("0100000000333333334444444455000000080000000000000090"), 0x004aa1), (hex!("0100000000333333334444444455000000080000000000004150"), 0x004b41), (hex!("01000000003333333344444444550000000900000000000000a0"), 0x004be1), (hex!("01000000003333333344444444550000000a00000000000000b0"), 0x004c81), (hex!("01000000003333333344444444550000000a0000000000006680"), 0x004d21), (hex!("01000000003333333344444444550000000b00000000000000c0"), 0x004dc1), (hex!("01000000003333333344444444550000000b0000000000006230"), 0x004e61), (hex!("01000000003333333344444444550000000c00000000000000d0"), 0x004f01), (hex!("01000000003333333344444444550000000d00000000000000e0"), 0x004fa1), 
(hex!("01000000003333333344444444550000000e00000000000000f0"), 0x005041), (hex!("01000000003333333344444444550000000e0000000000006000"), 0x0050e1), (hex!("01000000003333333344444444550000000f0000000000000100"), 0x005181), (hex!("01000000003333333344444444550000000f00000000000053c0"), 0x005221), (hex!("01000000003333333344444444550000000f0000000000006580"), 0x0052c1), (hex!("0100000000333333334444444455000000100000000000000110"), 0x005361), (hex!("01000000003333333344444444550000001000000000000046c0"), 0x005401), (hex!("0100000000333333334444444455000000100000000000004e40"), 0x0054a1), (hex!("0100000000333333334444444455000000110000000000000120"), 0x005541), (hex!("0100000000333333334444444455000000120000000000000130"), 0x0055e1), (hex!("01000000003333333344444444550000001200000000000066d0"), 0x005681), (hex!("0100000000333333334444444455000000130000000000000140"), 0x005721), (hex!("0100000000333333334444444455000000130000000000007710"), 0x0057c1), (hex!("0100000000333333334444444455000000140000000000000150"), 0x005861), (hex!("0100000000333333334444444455000000140000000000006c40"), 0x005901), (hex!("0100000000333333334444444455000000150000000000000160"), 0x0059a1), (hex!("0100000000333333334444444455000000150000000000005990"), 0x005a41), (hex!("0100000000333333334444444455000000160000000000000170"), 0x005ae1), (hex!("0100000000333333334444444455000000160000000000005530"), 0x005b81), (hex!("0100000000333333334444444455000000170000000000000180"), 0x005c21), (hex!("0100000000333333334444444455000000170000000000004290"), 0x005cc1), (hex!("0100000000333333334444444455000000180000000000000190"), 0x005d61), (hex!("01000000003333333344444444550000001800000000000051c0"), 0x005e01), (hex!("01000000003333333344444444550000001900000000000001a0"), 0x005ea1), (hex!("0100000000333333334444444455000000190000000000005420"), 0x005f41), (hex!("0100000000333333334444444455000000190000000000005770"), 0x005fe1), (hex!("01000000003333333344444444550000001900000000000079d0"), 0x006081), 
(hex!("01000000003333333344444444550000001a00000000000001b0"), 0x006121), (hex!("01000000003333333344444444550000001a0000000000006f70"), 0x0061c1), (hex!("01000000003333333344444444550000001a0000000000007150"), 0x006261), (hex!("01000000003333333344444444550000001b00000000000001c0"), 0x006301), (hex!("01000000003333333344444444550000001b0000000000005070"), 0x0063a1), (hex!("01000000003333333344444444550000001c00000000000001d0"), 0x006441), (hex!("01000000003333333344444444550000001d00000000000001e0"), 0x0064e1), (hex!("01000000003333333344444444550000001e00000000000001f0"), 0x006581), (hex!("01000000003333333344444444550000001e0000000000005650"), 0x006621), (hex!("01000000003333333344444444550000001f0000000000000200"), 0x0066c1), (hex!("01000000003333333344444444550000001f0000000000006ca0"), 0x006761), (hex!("0100000000333333334444444455000000200000000000000210"), 0x006801), (hex!("0100000000333333334444444455000000200000000000005fc0"), 0x0068a1), (hex!("0100000000333333334444444455000000210000000000000220"), 0x006941), (hex!("0100000000333333334444444455000000210000000000006430"), 0x0069e1), (hex!("0100000000333333334444444455000000220000000000000230"), 0x006a81), (hex!("01000000003333333344444444550000002200000000000040e0"), 0x006b21), (hex!("0100000000333333334444444455000000230000000000000240"), 0x006bc1), (hex!("01000000003333333344444444550000002300000000000042d0"), 0x006c61), (hex!("0100000000333333334444444455000000240000000000000250"), 0x006d01), (hex!("0100000000333333334444444455000000250000000000000260"), 0x006da1), (hex!("01000000003333333344444444550000002500000000000058c0"), 0x006e41), (hex!("0100000000333333334444444455000000260000000000000270"), 0x006ee1), (hex!("0100000000333333334444444455000000260000000000004020"), 0x006f81), (hex!("0100000000333333334444444455000000270000000000000280"), 0x007021), (hex!("0100000000333333334444444455000000280000000000000290"), 0x0070c1), (hex!("0100000000333333334444444455000000280000000000007c00"), 0x007161), 
(hex!("01000000003333333344444444550000002900000000000002a0"), 0x007201), (hex!("01000000003333333344444444550000002a00000000000002b0"), 0x0072a1), (hex!("01000000003333333344444444550000002b00000000000002c0"), 0x007341), (hex!("01000000003333333344444444550000002c00000000000002d0"), 0x0073e1), (hex!("01000000003333333344444444550000002c00000000000041b0"), 0x007481), (hex!("01000000003333333344444444550000002c0000000000004c30"), 0x007521), (hex!("01000000003333333344444444550000002d00000000000002e0"), 0x0075c1), (hex!("01000000003333333344444444550000002d0000000000005e40"), 0x007661), (hex!("01000000003333333344444444550000002d0000000000006990"), 0x007701), (hex!("01000000003333333344444444550000002e00000000000002f0"), 0x0077a1), (hex!("01000000003333333344444444550000002f0000000000000300"), 0x007841), (hex!("01000000003333333344444444550000002f0000000000004a70"), 0x0078e1), (hex!("01000000003333333344444444550000002f0000000000006b40"), 0x007981), (hex!("0100000000333333334444444455000000300000000000000310"), 0x007a21), (hex!("0100000000333333334444444455000000310000000000000320"), 0x007ac1), (hex!("0100000000333333334444444455000000320000000000000330"), 0x007b61), (hex!("01000000003333333344444444550000003200000000000041a0"), 0x007c01), (hex!("0100000000333333334444444455000000320000000000007340"), 0x007ca1), (hex!("0100000000333333334444444455000000320000000000007730"), 0x007d41), (hex!("0100000000333333334444444455000000330000000000000340"), 0x007de1), (hex!("01000000003333333344444444550000003300000000000055a0"), 0x007e81), (hex!("0100000000333333334444444455000000340000000000000350"), 0x007f21), (hex!("0100000000333333334444444455000000350000000000000360"), 0x007fc1), (hex!("01000000003333333344444444550000003500000000000077a0"), 0x008061), (hex!("0100000000333333334444444455000000360000000000000370"), 0x008101), (hex!("0100000000333333334444444455000000370000000000000380"), 0x0081a1), (hex!("0100000000333333334444444455000000380000000000000390"), 0x008241), 
(hex!("01000000003333333344444444550000003900000000000003a0"), 0x0082e1), (hex!("01000000003333333344444444550000003a00000000000003b0"), 0x008381), (hex!("01000000003333333344444444550000003a00000000000071c0"), 0x008421), (hex!("01000000003333333344444444550000003b00000000000003c0"), 0x0084c1), (hex!("01000000003333333344444444550000003c00000000000003d0"), 0x008561), (hex!("01000000003333333344444444550000003d00000000000003e0"), 0x008601), (hex!("01000000003333333344444444550000003e00000000000003f0"), 0x0086a1), (hex!("01000000003333333344444444550000003e00000000000062e0"), 0x008741), (hex!("01000000003333333344444444550000003f0000000000000400"), 0x0087e1), (hex!("0100000000333333334444444455000000400000000000000410"), 0x008881), (hex!("0100000000333333334444444455000000400000000000004460"), 0x008921), (hex!("0100000000333333334444444455000000400000000000005b90"), 0x0089c1), (hex!("01000000003333333344444444550000004000000000000079b0"), 0x008a61), (hex!("0100000000333333334444444455000000410000000000000420"), 0x008b01), (hex!("0100000000333333334444444455000000420000000000000430"), 0x008ba1), (hex!("0100000000333333334444444455000000420000000000005640"), 0x008c41), (hex!("0100000000333333334444444455000000430000000000000440"), 0x008ce1), (hex!("01000000003333333344444444550000004300000000000072a0"), 0x008d81), (hex!("0100000000333333334444444455000000440000000000000450"), 0x008e21), (hex!("0100000000333333334444444455000000450000000000000460"), 0x008ec1), (hex!("0100000000333333334444444455000000450000000000005750"), 0x008f61), (hex!("01000000003333333344444444550000004500000000000077b0"), 0x009001), (hex!("0100000000333333334444444455000000460000000000000470"), 0x0090a1), (hex!("0100000000333333334444444455000000470000000000000480"), 0x009141), (hex!("0100000000333333334444444455000000480000000000000490"), 0x0091e1), (hex!("01000000003333333344444444550000004800000000000069e0"), 0x009281), (hex!("01000000003333333344444444550000004900000000000004a0"), 0x009321), 
(hex!("0100000000333333334444444455000000490000000000007370"), 0x0093c1), (hex!("01000000003333333344444444550000004a00000000000004b0"), 0x009461), (hex!("01000000003333333344444444550000004a0000000000005cb0"), 0x009501), (hex!("01000000003333333344444444550000004b00000000000004c0"), 0x0095a1), (hex!("01000000003333333344444444550000004c00000000000004d0"), 0x009641), (hex!("01000000003333333344444444550000004c0000000000004880"), 0x0096e1), (hex!("01000000003333333344444444550000004c0000000000007a40"), 0x009781), (hex!("01000000003333333344444444550000004d00000000000004e0"), 0x009821), (hex!("01000000003333333344444444550000004d0000000000006390"), 0x0098c1), (hex!("01000000003333333344444444550000004e00000000000004f0"), 0x009961), (hex!("01000000003333333344444444550000004e0000000000004db0"), 0x009a01), (hex!("01000000003333333344444444550000004f0000000000000500"), 0x009aa1), (hex!("0100000000333333334444444455000000500000000000000510"), 0x009b41), (hex!("0100000000333333334444444455000000510000000000000520"), 0x009be1), (hex!("01000000003333333344444444550000005100000000000069c0"), 0x009c81), (hex!("0100000000333333334444444455000000520000000000000530"), 0x009d21), (hex!("0100000000333333334444444455000000520000000000006e60"), 0x009dc1), (hex!("01000000003333333344444444550000005200000000000070c0"), 0x009e61), (hex!("0100000000333333334444444455000000530000000000000540"), 0x009f01), (hex!("0100000000333333334444444455000000530000000000005840"), 0x009fa1), (hex!("0100000000333333334444444455000000540000000000000550"), 0x00a041), (hex!("01000000003333333344444444550000005400000000000043e0"), 0x00a0e1), (hex!("01000000003333333344444444550000005400000000000074e0"), 0x00a181), (hex!("0100000000333333334444444455000000550000000000000560"), 0x00a221), (hex!("0100000000333333334444444455000000550000000000003ee0"), 0x00a2c1), (hex!("0100000000333333334444444455000000560000000000000570"), 0x00a361), (hex!("0100000000333333334444444455000000570000000000000580"), 0x00a401), 
(hex!("0100000000333333334444444455000000570000000000007030"), 0x00a4a1), (hex!("0100000000333333334444444455000000580000000000000590"), 0x00a541), (hex!("0100000000333333334444444455000000580000000000005340"), 0x00a5e1), (hex!("01000000003333333344444444550000005800000000000059f0"), 0x00a681), (hex!("0100000000333333334444444455000000580000000000006930"), 0x00a721), (hex!("01000000003333333344444444550000005900000000000005a0"), 0x00a7c1), (hex!("0100000000333333334444444455000000590000000000003f90"), 0x00a861), (hex!("01000000003333333344444444550000005a00000000000005b0"), 0x00a901), (hex!("01000000003333333344444444550000005b00000000000005c0"), 0x00a9a1), (hex!("01000000003333333344444444550000005b00000000000062c0"), 0x00aa41), (hex!("01000000003333333344444444550000005c00000000000005d0"), 0x00aae1), (hex!("01000000003333333344444444550000005c0000000000005a70"), 0x00ab81), (hex!("01000000003333333344444444550000005c0000000000005dd0"), 0x00ac21), (hex!("01000000003333333344444444550000005d00000000000005e0"), 0x00acc1), (hex!("01000000003333333344444444550000005d0000000000005730"), 0x00ad61), (hex!("01000000003333333344444444550000005e00000000000005f0"), 0x00ae01), (hex!("01000000003333333344444444550000005e0000000000004f40"), 0x00aea1), (hex!("01000000003333333344444444550000005f0000000000000600"), 0x00af41), (hex!("0100000000333333334444444455000000600000000000000610"), 0x00afe1), (hex!("0100000000333333334444444455000000600000000000007c40"), 0x00b081), (hex!("0100000000333333334444444455000000610000000000000620"), 0x00b121), (hex!("0100000000333333334444444455000000610000000000007860"), 0x00b1c1), (hex!("0100000000333333334444444455000000620000000000000630"), 0x00b261), (hex!("0100000000333333334444444455000000620000000000005050"), 0x00b301), (hex!("0100000000333333334444444455000000630000000000000640"), 0x00b3a1), (hex!("0100000000333333334444444455000000640000000000000650"), 0x00b441), (hex!("0100000000333333334444444455000000650000000000000660"), 0x00b4e1), 
(hex!("0100000000333333334444444455000000650000000000005330"), 0x00b581), (hex!("0100000000333333334444444455000000660000000000000670"), 0x00b621), (hex!("0100000000333333334444444455000000660000000000004e20"), 0x00b6c1), (hex!("0100000000333333334444444455000000660000000000005ee0"), 0x00b761), (hex!("0100000000333333334444444455000000660000000000006360"), 0x00b801), (hex!("0100000000333333334444444455000000670000000000000680"), 0x00b8a1), (hex!("0100000000333333334444444455000000670000000000004040"), 0x00b941), (hex!("0100000000333333334444444455000000680000000000000690"), 0x00b9e1), (hex!("0100000000333333334444444455000000680000000000003f80"), 0x00ba81), (hex!("01000000003333333344444444550000006800000000000041e0"), 0x00bb21), (hex!("01000000003333333344444444550000006900000000000006a0"), 0x00bbc1), (hex!("0100000000333333334444444455000000690000000000006080"), 0x00bc61), (hex!("01000000003333333344444444550000006a00000000000006b0"), 0x00bd01), (hex!("01000000003333333344444444550000006a00000000000042f0"), 0x00bda1), (hex!("01000000003333333344444444550000006b00000000000006c0"), 0x00be41), (hex!("01000000003333333344444444550000006b00000000000052f0"), 0x00bee1), (hex!("01000000003333333344444444550000006b0000000000005980"), 0x00bf81), (hex!("01000000003333333344444444550000006b0000000000006170"), 0x00c021), (hex!("01000000003333333344444444550000006c00000000000006d0"), 0x00c0c1), (hex!("01000000003333333344444444550000006d00000000000006e0"), 0x00c161), (hex!("01000000003333333344444444550000006d0000000000006fb0"), 0x00c201), (hex!("01000000003333333344444444550000006e00000000000006f0"), 0x00c2a1), (hex!("01000000003333333344444444550000006e00000000000065b0"), 0x00c341), (hex!("01000000003333333344444444550000006e0000000000007970"), 0x00c3e1), (hex!("01000000003333333344444444550000006f0000000000000700"), 0x00c481), (hex!("01000000003333333344444444550000006f0000000000005900"), 0x00c521), (hex!("01000000003333333344444444550000006f0000000000006d90"), 0x00c5c1), 
(hex!("0100000000333333334444444455000000700000000000000710"), 0x00c661), (hex!("01000000003333333344444444550000007000000000000045c0"), 0x00c701), (hex!("0100000000333333334444444455000000700000000000004d40"), 0x00c7a1), (hex!("0100000000333333334444444455000000710000000000000720"), 0x00c841), (hex!("0100000000333333334444444455000000710000000000004dc0"), 0x00c8e1), (hex!("0100000000333333334444444455000000710000000000007550"), 0x00c981), (hex!("0100000000333333334444444455000000720000000000000730"), 0x00ca21), (hex!("0100000000333333334444444455000000720000000000003ec0"), 0x00cac1), (hex!("01000000003333333344444444550000007200000000000045a0"), 0x00cb61), (hex!("0100000000333333334444444455000000720000000000006770"), 0x00cc01), (hex!("0100000000333333334444444455000000720000000000006bc0"), 0x00cca1), (hex!("0100000000333333334444444455000000730000000000000740"), 0x00cd41), (hex!("0100000000333333334444444455000000730000000000005250"), 0x00cde1), (hex!("01000000003333333344444444550000007300000000000075f0"), 0x00ce81), (hex!("0100000000333333334444444455000000740000000000000750"), 0x00cf21), (hex!("0100000000333333334444444455000000740000000000003ff0"), 0x00cfc1), (hex!("01000000003333333344444444550000007400000000000079e0"), 0x00d061), (hex!("0100000000333333334444444455000000750000000000000760"), 0x00d101), (hex!("0100000000333333334444444455000000750000000000004310"), 0x00d1a1), (hex!("0100000000333333334444444455000000760000000000000770"), 0x00d241), (hex!("0100000000333333334444444455000000770000000000000780"), 0x00d2e1), (hex!("01000000003333333344444444550000007700000000000062f0"), 0x00d381), (hex!("0100000000333333334444444455000000770000000000006940"), 0x00d421), (hex!("0100000000333333334444444455000000780000000000000790"), 0x00d4c1), (hex!("01000000003333333344444444550000007900000000000007a0"), 0x00d561), (hex!("0100000000333333334444444455000000790000000000007af0"), 0x00d601), (hex!("01000000003333333344444444550000007a00000000000007b0"), 0x00d6a1), 
(hex!("01000000003333333344444444550000007b00000000000007c0"), 0x00d741), (hex!("01000000003333333344444444550000007b00000000000067e0"), 0x00d7e1), (hex!("01000000003333333344444444550000007b0000000000007890"), 0x00d881), (hex!("01000000003333333344444444550000007c00000000000007d0"), 0x00d921), (hex!("01000000003333333344444444550000007d00000000000007e0"), 0x00d9c1), (hex!("01000000003333333344444444550000007e00000000000007f0"), 0x00da61), (hex!("01000000003333333344444444550000007f0000000000000800"), 0x00db01), (hex!("01000000003333333344444444550000007f0000000000005be0"), 0x00dba1), (hex!("0100000000333333334444444455000000800000000000000810"), 0x00dc41), (hex!("0100000000333333334444444455000000810000000000000820"), 0x00dce1), (hex!("0100000000333333334444444455000000810000000000007190"), 0x00dd81), (hex!("0100000000333333334444444455000000820000000000000830"), 0x00de21), (hex!("0100000000333333334444444455000000820000000000004ab0"), 0x00dec1), (hex!("0100000000333333334444444455000000830000000000000840"), 0x00df61), (hex!("0100000000333333334444444455000000830000000000006720"), 0x00e001), (hex!("0100000000333333334444444455000000840000000000000850"), 0x00e0a1), (hex!("0100000000333333334444444455000000850000000000000860"), 0x00e141), (hex!("01000000003333333344444444550000008500000000000054f0"), 0x00e1e1), (hex!("0100000000333333334444444455000000850000000000007920"), 0x00e281), (hex!("0100000000333333334444444455000000860000000000000870"), 0x00e321), (hex!("01000000003333333344444444550000008600000000000060e0"), 0x00e3c1), (hex!("0100000000333333334444444455000000860000000000006be0"), 0x00e461), (hex!("0100000000333333334444444455000000870000000000000880"), 0x00e501), (hex!("0100000000333333334444444455000000870000000000006820"), 0x00e5a1), (hex!("0100000000333333334444444455000000880000000000000890"), 0x00e641), (hex!("01000000003333333344444444550000008900000000000008a0"), 0x00e6e1), (hex!("0100000000333333334444444455000000890000000000007c30"), 0x00e781), 
(hex!("01000000003333333344444444550000008a00000000000008b0"), 0x00e821), (hex!("01000000003333333344444444550000008b00000000000008c0"), 0x00e8c1), (hex!("01000000003333333344444444550000008b0000000000005910"), 0x00e961), (hex!("01000000003333333344444444550000008b0000000000006fe0"), 0x00ea01), (hex!("01000000003333333344444444550000008c00000000000008d0"), 0x00eaa1), (hex!("01000000003333333344444444550000008c0000000000006800"), 0x00eb41), (hex!("01000000003333333344444444550000008d00000000000008e0"), 0x00ebe1), (hex!("01000000003333333344444444550000008d0000000000005810"), 0x00ec81), (hex!("01000000003333333344444444550000008d0000000000007c90"), 0x00ed21), (hex!("01000000003333333344444444550000008e00000000000008f0"), 0x00edc1), (hex!("01000000003333333344444444550000008e00000000000058f0"), 0x00ee61), (hex!("01000000003333333344444444550000008f0000000000000900"), 0x00ef01), (hex!("01000000003333333344444444550000008f0000000000005a30"), 0x00efa1), (hex!("0100000000333333334444444455000000900000000000000910"), 0x00f041), (hex!("0100000000333333334444444455000000900000000000006130"), 0x00f0e1), (hex!("0100000000333333334444444455000000900000000000006550"), 0x00f181), (hex!("0100000000333333334444444455000000910000000000000920"), 0x00f221), (hex!("01000000003333333344444444550000009100000000000079f0"), 0x00f2c1), (hex!("0100000000333333334444444455000000920000000000000930"), 0x00f361), (hex!("0100000000333333334444444455000000920000000000005620"), 0x00f401), (hex!("0100000000333333334444444455000000920000000000005e90"), 0x00f4a1), (hex!("01000000003333333344444444550000009200000000000063d0"), 0x00f541), (hex!("01000000003333333344444444550000009200000000000076c0"), 0x00f5e1), (hex!("0100000000333333334444444455000000930000000000000940"), 0x00f681), (hex!("01000000003333333344444444550000009300000000000044e0"), 0x00f721), (hex!("0100000000333333334444444455000000940000000000000950"), 0x00f7c1), (hex!("0100000000333333334444444455000000940000000000007a30"), 0x00f861), 
(hex!("0100000000333333334444444455000000950000000000000960"), 0x00f901), (hex!("0100000000333333334444444455000000950000000000007a70"), 0x00f9a1), (hex!("0100000000333333334444444455000000960000000000000970"), 0x00fa41), (hex!("0100000000333333334444444455000000970000000000000980"), 0x00fae1), (hex!("0100000000333333334444444455000000970000000000007330"), 0x00fb81), (hex!("0100000000333333334444444455000000980000000000000990"), 0x00fc21), (hex!("0100000000333333334444444455000000980000000000005af0"), 0x00fcc1), (hex!("0100000000333333334444444455000000980000000000007ae0"), 0x00fd61), (hex!("01000000003333333344444444550000009900000000000009a0"), 0x00fe01), (hex!("0100000000333333334444444455000000990000000000005160"), 0x00fea1), (hex!("0100000000333333334444444455000000990000000000006850"), 0x00ff41), (hex!("01000000003333333344444444550000009a00000000000009b0"), 0x00ffe1), (hex!("01000000003333333344444444550000009b00000000000009c0"), 0x010081), (hex!("01000000003333333344444444550000009b0000000000005010"), 0x010121), (hex!("01000000003333333344444444550000009c00000000000009d0"), 0x0101c1), (hex!("01000000003333333344444444550000009c00000000000042e0"), 0x010261), (hex!("01000000003333333344444444550000009d00000000000009e0"), 0x010301), (hex!("01000000003333333344444444550000009d00000000000057f0"), 0x0103a1), (hex!("01000000003333333344444444550000009e00000000000009f0"), 0x010441), (hex!("01000000003333333344444444550000009e0000000000004ef0"), 0x0104e1), (hex!("01000000003333333344444444550000009f0000000000000a00"), 0x010581), (hex!("01000000003333333344444444550000009f0000000000006110"), 0x010621), (hex!("0100000000333333334444444455000000a00000000000000a10"), 0x0106c1), (hex!("0100000000333333334444444455000000a10000000000000a20"), 0x010761), (hex!("0100000000333333334444444455000000a100000000000040d0"), 0x010801), (hex!("0100000000333333334444444455000000a10000000000007670"), 0x0108a1), (hex!("0100000000333333334444444455000000a20000000000000a30"), 0x010941), 
(hex!("0100000000333333334444444455000000a200000000000074d0"), 0x0109e1), (hex!("0100000000333333334444444455000000a30000000000000a40"), 0x010a81), (hex!("0100000000333333334444444455000000a30000000000004c90"), 0x010b21), (hex!("0100000000333333334444444455000000a40000000000000a50"), 0x010bc1), (hex!("0100000000333333334444444455000000a50000000000000a60"), 0x010c61), (hex!("0100000000333333334444444455000000a60000000000000a70"), 0x010d01), (hex!("0100000000333333334444444455000000a60000000000006d80"), 0x010da1), (hex!("0100000000333333334444444455000000a60000000000007830"), 0x010e41), (hex!("0100000000333333334444444455000000a70000000000000a80"), 0x010ee1), (hex!("0100000000333333334444444455000000a700000000000064f0"), 0x010f81), (hex!("0100000000333333334444444455000000a80000000000000a90"), 0x011021), (hex!("0100000000333333334444444455000000a90000000000000aa0"), 0x0110c1), (hex!("0100000000333333334444444455000000a90000000000005e30"), 0x011161), (hex!("0100000000333333334444444455000000aa0000000000000ab0"), 0x011201), (hex!("0100000000333333334444444455000000ab0000000000000ac0"), 0x0112a1), (hex!("0100000000333333334444444455000000ac0000000000000ad0"), 0x011341), (hex!("0100000000333333334444444455000000ac0000000000006d20"), 0x0113e1), (hex!("0100000000333333334444444455000000ac0000000000007000"), 0x011481), (hex!("0100000000333333334444444455000000ad0000000000000ae0"), 0x011521), (hex!("0100000000333333334444444455000000ae0000000000000af0"), 0x0115c1), (hex!("0100000000333333334444444455000000ae0000000000004a10"), 0x011661), (hex!("0100000000333333334444444455000000af0000000000000b00"), 0x011701), (hex!("0100000000333333334444444455000000af0000000000004e10"), 0x0117a1), (hex!("0100000000333333334444444455000000b00000000000000b10"), 0x011841), (hex!("0100000000333333334444444455000000b00000000000004280"), 0x0118e1), (hex!("0100000000333333334444444455000000b000000000000077e0"), 0x011981), (hex!("0100000000333333334444444455000000b10000000000000b20"), 0x011a21), 
(hex!("0100000000333333334444444455000000b20000000000000b30"), 0x011ac1), (hex!("0100000000333333334444444455000000b30000000000000b40"), 0x011b61), (hex!("0100000000333333334444444455000000b30000000000004bc0"), 0x011c01), (hex!("0100000000333333334444444455000000b40000000000000b50"), 0x011ca1), (hex!("0100000000333333334444444455000000b50000000000000b60"), 0x011d41), (hex!("0100000000333333334444444455000000b50000000000004fa0"), 0x011de1), (hex!("0100000000333333334444444455000000b50000000000006a60"), 0x011e81), (hex!("0100000000333333334444444455000000b60000000000000b70"), 0x011f21), (hex!("0100000000333333334444444455000000b60000000000005630"), 0x011fc1), (hex!("0100000000333333334444444455000000b70000000000000b80"), 0x012061), (hex!("0100000000333333334444444455000000b80000000000000b90"), 0x012101), (hex!("0100000000333333334444444455000000b80000000000006f80"), 0x0121a1), (hex!("0100000000333333334444444455000000b90000000000000ba0"), 0x012241), (hex!("0100000000333333334444444455000000ba0000000000000bb0"), 0x0122e1), (hex!("0100000000333333334444444455000000bb0000000000000bc0"), 0x012381), (hex!("0100000000333333334444444455000000bb00000000000047c0"), 0x012421), (hex!("0100000000333333334444444455000000bb0000000000006060"), 0x0124c1), (hex!("0100000000333333334444444455000000bc0000000000000bd0"), 0x012561), (hex!("0100000000333333334444444455000000bd0000000000000be0"), 0x012601), (hex!("0100000000333333334444444455000000bd0000000000004e80"), 0x0126a1), (hex!("0100000000333333334444444455000000be0000000000000bf0"), 0x012741), (hex!("0100000000333333334444444455000000bf0000000000000c00"), 0x0127e1), (hex!("0100000000333333334444444455000000bf00000000000047a0"), 0x012881), (hex!("0100000000333333334444444455000000bf0000000000006da0"), 0x012921), (hex!("0100000000333333334444444455000000c00000000000000c10"), 0x0129c1), (hex!("0100000000333333334444444455000000c10000000000000c20"), 0x012a61), (hex!("0100000000333333334444444455000000c20000000000000c30"), 0x012b01), 
(hex!("0100000000333333334444444455000000c20000000000004bd0"), 0x012ba1), (hex!("0100000000333333334444444455000000c20000000000006ac0"), 0x012c41), (hex!("0100000000333333334444444455000000c30000000000000c40"), 0x012ce1), (hex!("0100000000333333334444444455000000c30000000000004660"), 0x012d81), (hex!("0100000000333333334444444455000000c40000000000000c50"), 0x012e21), (hex!("0100000000333333334444444455000000c50000000000000c60"), 0x012ec1), (hex!("0100000000333333334444444455000000c60000000000000c70"), 0x012f61), (hex!("0100000000333333334444444455000000c60000000000005880"), 0x013001), (hex!("0100000000333333334444444455000000c60000000000006b70"), 0x0130a1), (hex!("0100000000333333334444444455000000c70000000000000c80"), 0x013141), (hex!("0100000000333333334444444455000000c80000000000000c90"), 0x0131e1), (hex!("0100000000333333334444444455000000c80000000000005310"), 0x013281), (hex!("0100000000333333334444444455000000c80000000000005db0"), 0x013321), (hex!("0100000000333333334444444455000000c80000000000007040"), 0x0133c1), (hex!("0100000000333333334444444455000000c80000000000007290"), 0x013461), (hex!("0100000000333333334444444455000000c90000000000000ca0"), 0x013501), (hex!("0100000000333333334444444455000000c90000000000004fe0"), 0x0135a1), (hex!("0100000000333333334444444455000000ca0000000000000cb0"), 0x013641), (hex!("0100000000333333334444444455000000ca0000000000006140"), 0x0136e1), (hex!("0100000000333333334444444455000000ca0000000000007700"), 0x013781), (hex!("0100000000333333334444444455000000cb0000000000000cc0"), 0x013821), (hex!("0100000000333333334444444455000000cc0000000000000cd0"), 0x0138c1), (hex!("0100000000333333334444444455000000cd0000000000000ce0"), 0x013961), (hex!("0100000000333333334444444455000000cd0000000000003f20"), 0x013a01), (hex!("0100000000333333334444444455000000cd00000000000040f0"), 0x013aa1), (hex!("0100000000333333334444444455000000cd0000000000004ec0"), 0x013b41), (hex!("0100000000333333334444444455000000ce0000000000000cf0"), 0x013be1), 
(hex!("0100000000333333334444444455000000ce0000000000007200"), 0x013c81), (hex!("0100000000333333334444444455000000cf0000000000000d00"), 0x013d21), (hex!("0100000000333333334444444455000000cf00000000000046a0"), 0x013dc1), (hex!("0100000000333333334444444455000000cf0000000000005960"), 0x013e61), (hex!("0100000000333333334444444455000000d00000000000000d10"), 0x013f01), (hex!("0100000000333333334444444455000000d00000000000005f30"), 0x013fa1), (hex!("0100000000333333334444444455000000d10000000000000d20"), 0x014041), (hex!("0100000000333333334444444455000000d10000000000007a00"), 0x0140e1), (hex!("0100000000333333334444444455000000d20000000000000d30"), 0x014181), (hex!("0100000000333333334444444455000000d30000000000000d40"), 0x014221), (hex!("0100000000333333334444444455000000d40000000000000d50"), 0x0142c1), (hex!("0100000000333333334444444455000000d50000000000000d60"), 0x014361), (hex!("0100000000333333334444444455000000d50000000000004960"), 0x014401), (hex!("0100000000333333334444444455000000d500000000000055d0"), 0x0144a1), (hex!("0100000000333333334444444455000000d500000000000067d0"), 0x014541), (hex!("0100000000333333334444444455000000d60000000000000d70"), 0x0145e1), (hex!("0100000000333333334444444455000000d70000000000000d80"), 0x014681), (hex!("0100000000333333334444444455000000d80000000000000d90"), 0x014721), (hex!("0100000000333333334444444455000000d800000000000065f0"), 0x0147c1), (hex!("0100000000333333334444444455000000d90000000000000da0"), 0x014861), (hex!("0100000000333333334444444455000000d90000000000004980"), 0x014901), (hex!("0100000000333333334444444455000000da0000000000000db0"), 0x0149a1), (hex!("0100000000333333334444444455000000da00000000000048c0"), 0x014a41), (hex!("0100000000333333334444444455000000da00000000000072c0"), 0x014ae1), (hex!("0100000000333333334444444455000000da00000000000076b0"), 0x014b81), (hex!("0100000000333333334444444455000000db0000000000000dc0"), 0x014c21), (hex!("0100000000333333334444444455000000dc0000000000000dd0"), 0x014cc1), 
(hex!("0100000000333333334444444455000000dc00000000000040a0"), 0x014d61), (hex!("0100000000333333334444444455000000dc00000000000074c0"), 0x014e01), (hex!("0100000000333333334444444455000000dd0000000000000de0"), 0x014ea1), (hex!("0100000000333333334444444455000000dd0000000000004e50"), 0x014f41), (hex!("0100000000333333334444444455000000dd0000000000007270"), 0x014fe1), (hex!("0100000000333333334444444455000000de0000000000000df0"), 0x015081), (hex!("0100000000333333334444444455000000de00000000000078d0"), 0x015121), (hex!("0100000000333333334444444455000000df0000000000000e00"), 0x0151c1), (hex!("0100000000333333334444444455000000df0000000000004d30"), 0x015261), (hex!("0100000000333333334444444455000000df0000000000006c30"), 0x015301), (hex!("0100000000333333334444444455000000e00000000000000e10"), 0x0153a1), (hex!("0100000000333333334444444455000000e00000000000005d30"), 0x015441), (hex!("0100000000333333334444444455000000e10000000000000e20"), 0x0154e1), (hex!("0100000000333333334444444455000000e10000000000004610"), 0x015581), (hex!("0100000000333333334444444455000000e100000000000051d0"), 0x015621), (hex!("0100000000333333334444444455000000e10000000000005f10"), 0x0156c1), (hex!("0100000000333333334444444455000000e20000000000000e30"), 0x015761), (hex!("0100000000333333334444444455000000e20000000000007a90"), 0x015801), (hex!("0100000000333333334444444455000000e30000000000000e40"), 0x0158a1), (hex!("0100000000333333334444444455000000e30000000000005ae0"), 0x015941), (hex!("0100000000333333334444444455000000e40000000000000e50"), 0x0159e1), (hex!("0100000000333333334444444455000000e50000000000000e60"), 0x015a81), (hex!("0100000000333333334444444455000000e50000000000004700"), 0x015b21), (hex!("0100000000333333334444444455000000e500000000000065d0"), 0x015bc1), (hex!("0100000000333333334444444455000000e60000000000000e70"), 0x015c61), (hex!("0100000000333333334444444455000000e60000000000004fd0"), 0x015d01), (hex!("0100000000333333334444444455000000e70000000000000e80"), 0x015da1), 
(hex!("0100000000333333334444444455000000e70000000000005150"), 0x015e41), (hex!("0100000000333333334444444455000000e70000000000005920"), 0x015ee1), (hex!("0100000000333333334444444455000000e80000000000000e90"), 0x015f81), (hex!("0100000000333333334444444455000000e80000000000004320"), 0x016021), (hex!("0100000000333333334444444455000000e80000000000005ec0"), 0x0160c1), (hex!("0100000000333333334444444455000000e90000000000000ea0"), 0x016161), (hex!("0100000000333333334444444455000000e900000000000043b0"), 0x016201), (hex!("0100000000333333334444444455000000ea0000000000000eb0"), 0x0162a1), (hex!("0100000000333333334444444455000000ea0000000000003ea0"), 0x016341), (hex!("0100000000333333334444444455000000ea0000000000004f50"), 0x0163e1), (hex!("0100000000333333334444444455000000ea0000000000007520"), 0x016481), (hex!("0100000000333333334444444455000000eb0000000000000ec0"), 0x016521), (hex!("0100000000333333334444444455000000ec0000000000000ed0"), 0x0165c1), (hex!("0100000000333333334444444455000000ec0000000000006670"), 0x016661), (hex!("0100000000333333334444444455000000ed0000000000000ee0"), 0x016701), (hex!("0100000000333333334444444455000000ee0000000000000ef0"), 0x0167a1), (hex!("0100000000333333334444444455000000ee0000000000004d10"), 0x016841), (hex!("0100000000333333334444444455000000ef0000000000000f00"), 0x0168e1), (hex!("0100000000333333334444444455000000f00000000000000f10"), 0x016981), (hex!("0100000000333333334444444455000000f00000000000007220"), 0x016a21), (hex!("0100000000333333334444444455000000f00000000000007540"), 0x016ac1), (hex!("0100000000333333334444444455000000f10000000000000f20"), 0x016b61), (hex!("0100000000333333334444444455000000f100000000000066f0"), 0x016c01), (hex!("0100000000333333334444444455000000f20000000000000f30"), 0x016ca1), (hex!("0100000000333333334444444455000000f20000000000007810"), 0x016d41), (hex!("0100000000333333334444444455000000f30000000000000f40"), 0x016de1), (hex!("0100000000333333334444444455000000f30000000000007b70"), 0x016e81), 
(hex!("0100000000333333334444444455000000f40000000000000f50"), 0x016f21), (hex!("0100000000333333334444444455000000f400000000000059c0"), 0x016fc1), (hex!("0100000000333333334444444455000000f50000000000000f60"), 0x017061), (hex!("0100000000333333334444444455000000f50000000000003fb0"), 0x017101), (hex!("0100000000333333334444444455000000f50000000000005740"), 0x0171a1), (hex!("0100000000333333334444444455000000f500000000000064d0"), 0x017241), (hex!("0100000000333333334444444455000000f50000000000006960"), 0x0172e1), (hex!("0100000000333333334444444455000000f60000000000000f70"), 0x017381), (hex!("0100000000333333334444444455000000f60000000000006d00"), 0x017421), (hex!("0100000000333333334444444455000000f70000000000000f80"), 0x0174c1), (hex!("0100000000333333334444444455000000f80000000000000f90"), 0x017561), (hex!("0100000000333333334444444455000000f90000000000000fa0"), 0x017601), (hex!("0100000000333333334444444455000000fa0000000000000fb0"), 0x0176a1), (hex!("0100000000333333334444444455000000fa00000000000067b0"), 0x017741), (hex!("0100000000333333334444444455000000fb0000000000000fc0"), 0x0177e1), (hex!("0100000000333333334444444455000000fb0000000000004eb0"), 0x017881), (hex!("0100000000333333334444444455000000fb0000000000006ef0"), 0x017921), (hex!("0100000000333333334444444455000000fc0000000000000fd0"), 0x0179c1), (hex!("0100000000333333334444444455000000fc0000000000004470"), 0x017a61), (hex!("0100000000333333334444444455000000fc0000000000005940"), 0x017b01), (hex!("0100000000333333334444444455000000fd0000000000000fe0"), 0x017ba1), (hex!("0100000000333333334444444455000000fe0000000000000ff0"), 0x017c41), (hex!("0100000000333333334444444455000000ff0000000000001000"), 0x017ce1), (hex!("0100000000333333334444444455000000ff0000000000005690"), 0x017d81), (hex!("0100000000333333334444444455000001000000000000001010"), 0x017e21), (hex!("0100000000333333334444444455000001000000000000005210"), 0x017ec1), (hex!("01000000003333333344444444550000010000000000000070a0"), 0x017f61), 
(hex!("0100000000333333334444444455000001010000000000001020"), 0x018001), (hex!("0100000000333333334444444455000001010000000000006b80"), 0x0180a1), (hex!("0100000000333333334444444455000001020000000000001030"), 0x018141), (hex!("0100000000333333334444444455000001030000000000001040"), 0x0181e1), (hex!("0100000000333333334444444455000001030000000000004c80"), 0x018281), (hex!("0100000000333333334444444455000001040000000000001050"), 0x018321), (hex!("0100000000333333334444444455000001040000000000004850"), 0x0183c1), (hex!("01000000003333333344444444550000010400000000000057b0"), 0x018461), (hex!("0100000000333333334444444455000001050000000000001060"), 0x018501), (hex!("01000000003333333344444444550000010500000000000048d0"), 0x0185a1), (hex!("0100000000333333334444444455000001050000000000007870"), 0x018641), (hex!("0100000000333333334444444455000001060000000000001070"), 0x0186e1), (hex!("0100000000333333334444444455000001060000000000004f90"), 0x018781), (hex!("0100000000333333334444444455000001060000000000006270"), 0x018821), (hex!("0100000000333333334444444455000001070000000000001080"), 0x0188c1), (hex!("01000000003333333344444444550000010700000000000063b0"), 0x018961), (hex!("0100000000333333334444444455000001080000000000001090"), 0x018a01), (hex!("01000000003333333344444444550000010900000000000010a0"), 0x018aa1), (hex!("0100000000333333334444444455000001090000000000006f40"), 0x018b41), (hex!("01000000003333333344444444550000010a00000000000010b0"), 0x018be1), (hex!("01000000003333333344444444550000010a0000000000006640"), 0x018c81), (hex!("01000000003333333344444444550000010b00000000000010c0"), 0x018d21), (hex!("01000000003333333344444444550000010c00000000000010d0"), 0x018dc1), (hex!("01000000003333333344444444550000010d00000000000010e0"), 0x018e61), (hex!("01000000003333333344444444550000010e00000000000010f0"), 0x018f01), (hex!("01000000003333333344444444550000010e0000000000005c40"), 0x018fa1), (hex!("01000000003333333344444444550000010e0000000000007ba0"), 0x019041), 
(hex!("01000000003333333344444444550000010f0000000000001100"), 0x0190e1), (hex!("01000000003333333344444444550000010f0000000000005c30"), 0x019181), (hex!("0100000000333333334444444455000001100000000000001110"), 0x019221), (hex!("0100000000333333334444444455000001100000000000007640"), 0x0192c1), (hex!("0100000000333333334444444455000001110000000000001120"), 0x019361), (hex!("01000000003333333344444444550000011100000000000052c0"), 0x019401), (hex!("0100000000333333334444444455000001110000000000005710"), 0x0194a1), (hex!("0100000000333333334444444455000001110000000000006a00"), 0x019541), (hex!("0100000000333333334444444455000001120000000000001130"), 0x0195e1), (hex!("0100000000333333334444444455000001130000000000001140"), 0x019681), (hex!("0100000000333333334444444455000001140000000000001150"), 0x019721), (hex!("0100000000333333334444444455000001140000000000003fa0"), 0x0197c1), (hex!("01000000003333333344444444550000011400000000000054b0"), 0x019861), (hex!("0100000000333333334444444455000001140000000000006070"), 0x019901), (hex!("0100000000333333334444444455000001150000000000001160"), 0x0199a1), (hex!("0100000000333333334444444455000001150000000000005320"), 0x019a41), (hex!("0100000000333333334444444455000001150000000000006600"), 0x019ae1), (hex!("0100000000333333334444444455000001150000000000006df0"), 0x019b81), (hex!("01000000003333333344444444550000011500000000000079c0"), 0x019c21), (hex!("0100000000333333334444444455000001160000000000001170"), 0x019cc1), (hex!("0100000000333333334444444455000001170000000000001180"), 0x019d61), (hex!("0100000000333333334444444455000001170000000000004a60"), 0x019e01), (hex!("01000000003333333344444444550000011700000000000063c0"), 0x019ea1), (hex!("0100000000333333334444444455000001180000000000001190"), 0x019f41), (hex!("0100000000333333334444444455000001180000000000004530"), 0x019fe1), (hex!("01000000003333333344444444550000011800000000000077c0"), 0x01a081), (hex!("01000000003333333344444444550000011900000000000011a0"), 0x01a121), 
(hex!("01000000003333333344444444550000011a00000000000011b0"), 0x01a1c1), (hex!("01000000003333333344444444550000011a00000000000041c0"), 0x01a261), (hex!("01000000003333333344444444550000011a00000000000061e0"), 0x01a301), (hex!("01000000003333333344444444550000011b00000000000011c0"), 0x01a3a1), (hex!("01000000003333333344444444550000011c00000000000011d0"), 0x01a441), (hex!("01000000003333333344444444550000011c0000000000005f90"), 0x01a4e1), (hex!("01000000003333333344444444550000011d00000000000011e0"), 0x01a581), (hex!("01000000003333333344444444550000011d0000000000004160"), 0x01a621), (hex!("01000000003333333344444444550000011e00000000000011f0"), 0x01a6c1), (hex!("01000000003333333344444444550000011e00000000000056d0"), 0x01a761), (hex!("01000000003333333344444444550000011f0000000000001200"), 0x01a801), (hex!("01000000003333333344444444550000011f0000000000004510"), 0x01a8a1), (hex!("0100000000333333334444444455000001200000000000001210"), 0x01a941), (hex!("0100000000333333334444444455000001210000000000001220"), 0x01a9e1), (hex!("0100000000333333334444444455000001210000000000005140"), 0x01aa81), (hex!("0100000000333333334444444455000001210000000000006710"), 0x01ab21), (hex!("0100000000333333334444444455000001210000000000006f50"), 0x01abc1), (hex!("0100000000333333334444444455000001220000000000001230"), 0x01ac61), (hex!("0100000000333333334444444455000001220000000000005570"), 0x01ad01), (hex!("0100000000333333334444444455000001220000000000007ac0"), 0x01ada1), (hex!("0100000000333333334444444455000001230000000000001240"), 0x01ae41), (hex!("0100000000333333334444444455000001240000000000001250"), 0x01aee1), (hex!("0100000000333333334444444455000001240000000000006cd0"), 0x01af81), (hex!("0100000000333333334444444455000001250000000000001260"), 0x01b021), (hex!("01000000003333333344444444550000012500000000000046b0"), 0x01b0c1), (hex!("0100000000333333334444444455000001250000000000005eb0"), 0x01b161), (hex!("0100000000333333334444444455000001260000000000001270"), 0x01b201), 
(hex!("0100000000333333334444444455000001260000000000004630"), 0x01b2a1), (hex!("0100000000333333334444444455000001270000000000001280"), 0x01b341), (hex!("0100000000333333334444444455000001270000000000004ff0"), 0x01b3e1), (hex!("0100000000333333334444444455000001270000000000006ec0"), 0x01b481), (hex!("0100000000333333334444444455000001280000000000001290"), 0x01b521), (hex!("01000000003333333344444444550000012900000000000012a0"), 0x01b5c1), (hex!("0100000000333333334444444455000001290000000000005f60"), 0x01b661), (hex!("01000000003333333344444444550000012a00000000000012b0"), 0x01b701), (hex!("01000000003333333344444444550000012a0000000000005480"), 0x01b7a1), (hex!("01000000003333333344444444550000012b00000000000012c0"), 0x01b841), (hex!("01000000003333333344444444550000012b00000000000065a0"), 0x01b8e1), (hex!("01000000003333333344444444550000012b00000000000066c0"), 0x01b981), (hex!("01000000003333333344444444550000012c00000000000012d0"), 0x01ba21), (hex!("01000000003333333344444444550000012c00000000000064b0"), 0x01bac1), (hex!("01000000003333333344444444550000012d00000000000012e0"), 0x01bb61), (hex!("01000000003333333344444444550000012d00000000000049c0"), 0x01bc01), (hex!("01000000003333333344444444550000012d0000000000004bf0"), 0x01bca1), (hex!("01000000003333333344444444550000012e00000000000012f0"), 0x01bd41), (hex!("01000000003333333344444444550000012e0000000000005ed0"), 0x01bde1), (hex!("01000000003333333344444444550000012f0000000000001300"), 0x01be81), (hex!("01000000003333333344444444550000012f00000000000049a0"), 0x01bf21), (hex!("0100000000333333334444444455000001300000000000001310"), 0x01bfc1), (hex!("0100000000333333334444444455000001300000000000007840"), 0x01c061), (hex!("0100000000333333334444444455000001310000000000001320"), 0x01c101), (hex!("0100000000333333334444444455000001310000000000005f70"), 0x01c1a1), (hex!("0100000000333333334444444455000001320000000000001330"), 0x01c241), (hex!("0100000000333333334444444455000001320000000000005a00"), 0x01c2e1), 
(hex!("0100000000333333334444444455000001330000000000001340"), 0x01c381), (hex!("0100000000333333334444444455000001330000000000006c70"), 0x01c421), (hex!("0100000000333333334444444455000001340000000000001350"), 0x01c4c1), (hex!("0100000000333333334444444455000001340000000000005c60"), 0x01c561), (hex!("0100000000333333334444444455000001350000000000001360"), 0x01c601), (hex!("0100000000333333334444444455000001350000000000004f10"), 0x01c6a1), (hex!("0100000000333333334444444455000001360000000000001370"), 0x01c741), (hex!("0100000000333333334444444455000001360000000000004c60"), 0x01c7e1), (hex!("0100000000333333334444444455000001370000000000001380"), 0x01c881), (hex!("0100000000333333334444444455000001380000000000001390"), 0x01c921), (hex!("01000000003333333344444444550000013900000000000013a0"), 0x01c9c1), (hex!("0100000000333333334444444455000001390000000000004ea0"), 0x01ca61), (hex!("01000000003333333344444444550000013a00000000000013b0"), 0x01cb01), (hex!("01000000003333333344444444550000013a0000000000007350"), 0x01cba1), (hex!("01000000003333333344444444550000013b00000000000013c0"), 0x01cc41), (hex!("01000000003333333344444444550000013c00000000000013d0"), 0x01cce1), (hex!("01000000003333333344444444550000013c0000000000007050"), 0x01cd81), (hex!("01000000003333333344444444550000013d00000000000013e0"), 0x01ce21), (hex!("01000000003333333344444444550000013d0000000000006bd0"), 0x01cec1), (hex!("01000000003333333344444444550000013e00000000000013f0"), 0x01cf61), (hex!("01000000003333333344444444550000013e00000000000058e0"), 0x01d001), (hex!("01000000003333333344444444550000013f0000000000001400"), 0x01d0a1), (hex!("01000000003333333344444444550000013f0000000000004740"), 0x01d141), (hex!("0100000000333333334444444455000001400000000000001410"), 0x01d1e1), (hex!("0100000000333333334444444455000001400000000000003f10"), 0x01d281), (hex!("0100000000333333334444444455000001400000000000006d40"), 0x01d321), (hex!("01000000003333333344444444550000014000000000000072d0"), 0x01d3c1), 
(hex!("0100000000333333334444444455000001410000000000001420"), 0x01d461), (hex!("0100000000333333334444444455000001420000000000001430"), 0x01d501), (hex!("0100000000333333334444444455000001430000000000001440"), 0x01d5a1), (hex!("0100000000333333334444444455000001440000000000001450"), 0x01d641), (hex!("0100000000333333334444444455000001450000000000001460"), 0x01d6e1), (hex!("0100000000333333334444444455000001460000000000001470"), 0x01d781), (hex!("01000000003333333344444444550000014600000000000055c0"), 0x01d821), (hex!("0100000000333333334444444455000001470000000000001480"), 0x01d8c1), (hex!("0100000000333333334444444455000001470000000000004570"), 0x01d961), (hex!("0100000000333333334444444455000001470000000000004be0"), 0x01da01), (hex!("0100000000333333334444444455000001480000000000001490"), 0x01daa1), (hex!("0100000000333333334444444455000001480000000000005360"), 0x01db41), (hex!("01000000003333333344444444550000014900000000000014a0"), 0x01dbe1), (hex!("01000000003333333344444444550000014a00000000000014b0"), 0x01dc81), (hex!("01000000003333333344444444550000014a00000000000053d0"), 0x01dd21), (hex!("01000000003333333344444444550000014b00000000000014c0"), 0x01ddc1), (hex!("01000000003333333344444444550000014b0000000000005950"), 0x01de61), (hex!("01000000003333333344444444550000014c00000000000014d0"), 0x01df01), (hex!("01000000003333333344444444550000014c0000000000004f60"), 0x01dfa1), (hex!("01000000003333333344444444550000014d00000000000014e0"), 0x01e041), (hex!("01000000003333333344444444550000014d0000000000004520"), 0x01e0e1), (hex!("01000000003333333344444444550000014d0000000000005200"), 0x01e181), (hex!("01000000003333333344444444550000014e00000000000014f0"), 0x01e221), (hex!("01000000003333333344444444550000014e0000000000005bd0"), 0x01e2c1), (hex!("01000000003333333344444444550000014f0000000000001500"), 0x01e361), (hex!("01000000003333333344444444550000014f00000000000060d0"), 0x01e401), (hex!("0100000000333333334444444455000001500000000000001510"), 0x01e4a1), 
(hex!("01000000003333333344444444550000015000000000000075e0"), 0x01e541), (hex!("0100000000333333334444444455000001510000000000001520"), 0x01e5e1), (hex!("0100000000333333334444444455000001510000000000005c00"), 0x01e681), (hex!("0100000000333333334444444455000001510000000000006af0"), 0x01e721), (hex!("0100000000333333334444444455000001510000000000007b80"), 0x01e7c1), (hex!("0100000000333333334444444455000001520000000000001530"), 0x01e861), (hex!("0100000000333333334444444455000001520000000000004c70"), 0x01e901), (hex!("0100000000333333334444444455000001530000000000001540"), 0x01e9a1), (hex!("0100000000333333334444444455000001540000000000001550"), 0x01ea41), (hex!("0100000000333333334444444455000001540000000000007cd0"), 0x01eae1), (hex!("0100000000333333334444444455000001550000000000001560"), 0x01eb81), (hex!("0100000000333333334444444455000001550000000000004ae0"), 0x01ec21), (hex!("01000000003333333344444444550000015500000000000068c0"), 0x01ecc1), (hex!("0100000000333333334444444455000001560000000000001570"), 0x01ed61), (hex!("01000000003333333344444444550000015600000000000064a0"), 0x01ee01), (hex!("0100000000333333334444444455000001570000000000001580"), 0x01eea1), (hex!("0100000000333333334444444455000001580000000000001590"), 0x01ef41), (hex!("0100000000333333334444444455000001580000000000006d30"), 0x01efe1), (hex!("01000000003333333344444444550000015800000000000074f0"), 0x01f081), (hex!("01000000003333333344444444550000015900000000000015a0"), 0x01f121), (hex!("01000000003333333344444444550000015900000000000053a0"), 0x01f1c1), (hex!("01000000003333333344444444550000015900000000000055e0"), 0x01f261), (hex!("0100000000333333334444444455000001590000000000006210"), 0x01f301), (hex!("01000000003333333344444444550000015900000000000067c0"), 0x01f3a1), (hex!("01000000003333333344444444550000015a00000000000015b0"), 0x01f441), (hex!("01000000003333333344444444550000015b00000000000015c0"), 0x01f4e1), (hex!("01000000003333333344444444550000015c00000000000015d0"), 0x01f581), 
(hex!("01000000003333333344444444550000015c0000000000004d80"), 0x01f621), (hex!("01000000003333333344444444550000015c00000000000073f0"), 0x01f6c1), (hex!("01000000003333333344444444550000015d00000000000015e0"), 0x01f761), (hex!("01000000003333333344444444550000015e00000000000015f0"), 0x01f801), (hex!("01000000003333333344444444550000015e0000000000004120"), 0x01f8a1), (hex!("01000000003333333344444444550000015e0000000000004350"), 0x01f941), (hex!("01000000003333333344444444550000015e0000000000007c50"), 0x01f9e1), (hex!("01000000003333333344444444550000015f0000000000001600"), 0x01fa81), (hex!("0100000000333333334444444455000001600000000000001610"), 0x01fb21), (hex!("0100000000333333334444444455000001600000000000004840"), 0x01fbc1), (hex!("0100000000333333334444444455000001600000000000004b10"), 0x01fc61), (hex!("0100000000333333334444444455000001600000000000007060"), 0x01fd01), (hex!("0100000000333333334444444455000001610000000000001620"), 0x01fda1), (hex!("0100000000333333334444444455000001610000000000005300"), 0x01fe41), (hex!("0100000000333333334444444455000001620000000000001630"), 0x01fee1), (hex!("0100000000333333334444444455000001620000000000006530"), 0x01ff81), (hex!("0100000000333333334444444455000001630000000000001640"), 0x020021), (hex!("0100000000333333334444444455000001640000000000001650"), 0x0200c1), (hex!("0100000000333333334444444455000001650000000000001660"), 0x020161), (hex!("0100000000333333334444444455000001660000000000001670"), 0x020201), (hex!("0100000000333333334444444455000001670000000000001680"), 0x0202a1), (hex!("0100000000333333334444444455000001670000000000007310"), 0x020341), (hex!("0100000000333333334444444455000001680000000000001690"), 0x0203e1), (hex!("0100000000333333334444444455000001680000000000007b50"), 0x020481), (hex!("01000000003333333344444444550000016900000000000016a0"), 0x020521), (hex!("01000000003333333344444444550000016900000000000049d0"), 0x0205c1), (hex!("01000000003333333344444444550000016a00000000000016b0"), 0x020661), 
(hex!("01000000003333333344444444550000016a00000000000078b0"), 0x020701), (hex!("01000000003333333344444444550000016b00000000000016c0"), 0x0207a1), (hex!("01000000003333333344444444550000016b0000000000004100"), 0x020841), (hex!("01000000003333333344444444550000016c00000000000016d0"), 0x0208e1), (hex!("01000000003333333344444444550000016c0000000000006e00"), 0x020981), (hex!("01000000003333333344444444550000016d00000000000016e0"), 0x020a21), (hex!("01000000003333333344444444550000016e00000000000016f0"), 0x020ac1), (hex!("01000000003333333344444444550000016e0000000000004ac0"), 0x020b61), (hex!("01000000003333333344444444550000016e0000000000007820"), 0x020c01), (hex!("01000000003333333344444444550000016f0000000000001700"), 0x020ca1), (hex!("0100000000333333334444444455000001700000000000001710"), 0x020d41), (hex!("0100000000333333334444444455000001700000000000005830"), 0x020de1), (hex!("0100000000333333334444444455000001710000000000001720"), 0x020e81), (hex!("01000000003333333344444444550000017100000000000072f0"), 0x020f21), (hex!("0100000000333333334444444455000001720000000000001730"), 0x020fc1), (hex!("0100000000333333334444444455000001720000000000004870"), 0x021061), (hex!("01000000003333333344444444550000017200000000000070b0"), 0x021101), (hex!("0100000000333333334444444455000001730000000000001740"), 0x0211a1), (hex!("0100000000333333334444444455000001740000000000001750"), 0x021241), (hex!("0100000000333333334444444455000001750000000000001760"), 0x0212e1), (hex!("0100000000333333334444444455000001750000000000005670"), 0x021381), (hex!("0100000000333333334444444455000001750000000000005870"), 0x021421), (hex!("0100000000333333334444444455000001760000000000001770"), 0x0214c1), (hex!("0100000000333333334444444455000001770000000000001780"), 0x021561), (hex!("0100000000333333334444444455000001770000000000005000"), 0x021601), (hex!("0100000000333333334444444455000001770000000000007090"), 0x0216a1), (hex!("0100000000333333334444444455000001780000000000001790"), 0x021741), 
(hex!("01000000003333333344444444550000017800000000000048a0"), 0x0217e1), (hex!("0100000000333333334444444455000001780000000000006bf0"), 0x021881), (hex!("01000000003333333344444444550000017900000000000017a0"), 0x021921), (hex!("01000000003333333344444444550000017900000000000057d0"), 0x0219c1), (hex!("0100000000333333334444444455000001790000000000006660"), 0x021a61), (hex!("01000000003333333344444444550000017a00000000000017b0"), 0x021b01), (hex!("01000000003333333344444444550000017a0000000000004970"), 0x021ba1), (hex!("01000000003333333344444444550000017a0000000000005dc0"), 0x021c41), (hex!("01000000003333333344444444550000017b00000000000017c0"), 0x021ce1), (hex!("01000000003333333344444444550000017b0000000000004ee0"), 0x021d81), (hex!("01000000003333333344444444550000017b00000000000054c0"), 0x021e21), (hex!("01000000003333333344444444550000017c00000000000017d0"), 0x021ec1), (hex!("01000000003333333344444444550000017c0000000000003fc0"), 0x021f61), (hex!("01000000003333333344444444550000017c00000000000063e0"), 0x022001), (hex!("01000000003333333344444444550000017c0000000000006520"), 0x0220a1), (hex!("01000000003333333344444444550000017d00000000000017e0"), 0x022141), (hex!("01000000003333333344444444550000017d0000000000006220"), 0x0221e1), (hex!("01000000003333333344444444550000017d0000000000007120"), 0x022281), (hex!("01000000003333333344444444550000017e00000000000017f0"), 0x022321), (hex!("01000000003333333344444444550000017f0000000000001800"), 0x0223c1), (hex!("0100000000333333334444444455000001800000000000001810"), 0x022461), (hex!("0100000000333333334444444455000001810000000000001820"), 0x022501), (hex!("01000000003333333344444444550000018100000000000041f0"), 0x0225a1), (hex!("0100000000333333334444444455000001810000000000007590"), 0x022641), (hex!("0100000000333333334444444455000001820000000000001830"), 0x0226e1), (hex!("0100000000333333334444444455000001820000000000004ce0"), 0x022781), (hex!("0100000000333333334444444455000001830000000000001840"), 0x022821), 
(hex!("01000000003333333344444444550000018300000000000042c0"), 0x0228c1), (hex!("0100000000333333334444444455000001840000000000001850"), 0x022961), (hex!("0100000000333333334444444455000001840000000000004f70"), 0x022a01), (hex!("0100000000333333334444444455000001850000000000001860"), 0x022aa1), (hex!("0100000000333333334444444455000001850000000000006470"), 0x022b41), (hex!("0100000000333333334444444455000001850000000000007500"), 0x022be1), (hex!("0100000000333333334444444455000001860000000000001870"), 0x022c81), (hex!("0100000000333333334444444455000001860000000000004770"), 0x022d21), (hex!("0100000000333333334444444455000001870000000000001880"), 0x022dc1), (hex!("0100000000333333334444444455000001870000000000006a30"), 0x022e61), (hex!("0100000000333333334444444455000001880000000000001890"), 0x022f01), (hex!("0100000000333333334444444455000001880000000000007410"), 0x022fa1), (hex!("01000000003333333344444444550000018900000000000018a0"), 0x023041), (hex!("01000000003333333344444444550000018900000000000044d0"), 0x0230e1), (hex!("0100000000333333334444444455000001890000000000005ac0"), 0x023181), (hex!("01000000003333333344444444550000018a00000000000018b0"), 0x023221), (hex!("01000000003333333344444444550000018a0000000000006260"), 0x0232c1), (hex!("01000000003333333344444444550000018a0000000000006d70"), 0x023361), (hex!("01000000003333333344444444550000018b00000000000018c0"), 0x023401), (hex!("01000000003333333344444444550000018b0000000000004aa0"), 0x0234a1), (hex!("01000000003333333344444444550000018b0000000000006fd0"), 0x023541), (hex!("01000000003333333344444444550000018c00000000000018d0"), 0x0235e1), (hex!("01000000003333333344444444550000018c00000000000051b0"), 0x023681), (hex!("01000000003333333344444444550000018c0000000000006650"), 0x023721), (hex!("01000000003333333344444444550000018d00000000000018e0"), 0x0237c1), (hex!("01000000003333333344444444550000018e00000000000018f0"), 0x023861), (hex!("01000000003333333344444444550000018e00000000000041d0"), 0x023901), 
(hex!("01000000003333333344444444550000018f0000000000001900"), 0x0239a1), (hex!("01000000003333333344444444550000018f0000000000007600"), 0x023a41), (hex!("0100000000333333334444444455000001900000000000001910"), 0x023ae1), (hex!("0100000000333333334444444455000001900000000000005410"), 0x023b81), (hex!("0100000000333333334444444455000001900000000000006760"), 0x023c21), (hex!("0100000000333333334444444455000001910000000000001920"), 0x023cc1), (hex!("0100000000333333334444444455000001920000000000001930"), 0x023d61), (hex!("0100000000333333334444444455000001920000000000004ca0"), 0x023e01), (hex!("0100000000333333334444444455000001920000000000005d80"), 0x023ea1), (hex!("0100000000333333334444444455000001920000000000005fd0"), 0x023f41), (hex!("01000000003333333344444444550000019200000000000070d0"), 0x023fe1), (hex!("0100000000333333334444444455000001930000000000001940"), 0x024081), (hex!("0100000000333333334444444455000001930000000000004010"), 0x024121), (hex!("0100000000333333334444444455000001930000000000007ca0"), 0x0241c1), (hex!("0100000000333333334444444455000001940000000000001950"), 0x024261), (hex!("0100000000333333334444444455000001950000000000001960"), 0x024301), (hex!("0100000000333333334444444455000001950000000000005380"), 0x0243a1), (hex!("0100000000333333334444444455000001960000000000001970"), 0x024441), (hex!("0100000000333333334444444455000001960000000000006de0"), 0x0244e1), (hex!("0100000000333333334444444455000001970000000000001980"), 0x024581), (hex!("01000000003333333344444444550000019700000000000048f0"), 0x024621), (hex!("0100000000333333334444444455000001980000000000001990"), 0x0246c1), (hex!("0100000000333333334444444455000001980000000000006510"), 0x024761), (hex!("01000000003333333344444444550000019900000000000019a0"), 0x024801), (hex!("0100000000333333334444444455000001990000000000007570"), 0x0248a1), (hex!("0100000000333333334444444455000001990000000000007580"), 0x024941), (hex!("01000000003333333344444444550000019a00000000000019b0"), 0x0249e1), 
(hex!("01000000003333333344444444550000019a0000000000004050"), 0x024a81), (hex!("01000000003333333344444444550000019a0000000000004ba0"), 0x024b21), (hex!("01000000003333333344444444550000019a0000000000005540"), 0x024bc1), (hex!("01000000003333333344444444550000019a00000000000061c0"), 0x024c61), (hex!("01000000003333333344444444550000019a0000000000007c60"), 0x024d01), (hex!("01000000003333333344444444550000019b00000000000019c0"), 0x024da1), (hex!("01000000003333333344444444550000019b0000000000006240"), 0x024e41), (hex!("01000000003333333344444444550000019c00000000000019d0"), 0x024ee1), (hex!("01000000003333333344444444550000019d00000000000019e0"), 0x024f81), (hex!("01000000003333333344444444550000019d0000000000004640"), 0x025021), (hex!("01000000003333333344444444550000019d00000000000052a0"), 0x0250c1), (hex!("01000000003333333344444444550000019d00000000000052b0"), 0x025161), (hex!("01000000003333333344444444550000019e00000000000019f0"), 0x025201), (hex!("01000000003333333344444444550000019f0000000000001a00"), 0x0252a1), (hex!("01000000003333333344444444550000019f0000000000006b20"), 0x025341), (hex!("0100000000333333334444444455000001a00000000000001a10"), 0x0253e1), (hex!("0100000000333333334444444455000001a10000000000001a20"), 0x025481), (hex!("0100000000333333334444444455000001a10000000000005460"), 0x025521), (hex!("0100000000333333334444444455000001a10000000000005d20"), 0x0255c1), (hex!("0100000000333333334444444455000001a100000000000068f0"), 0x025661), (hex!("0100000000333333334444444455000001a20000000000001a30"), 0x025701), (hex!("0100000000333333334444444455000001a20000000000007170"), 0x0257a1), (hex!("0100000000333333334444444455000001a30000000000001a40"), 0x025841), (hex!("0100000000333333334444444455000001a40000000000001a50"), 0x0258e1), (hex!("0100000000333333334444444455000001a50000000000001a60"), 0x025981), (hex!("0100000000333333334444444455000001a60000000000001a70"), 0x025a21), (hex!("0100000000333333334444444455000001a70000000000001a80"), 0x025ac1), 
(hex!("0100000000333333334444444455000001a70000000000005a90"), 0x025b61), (hex!("0100000000333333334444444455000001a70000000000006440"), 0x025c01), (hex!("0100000000333333334444444455000001a80000000000001a90"), 0x025ca1), (hex!("0100000000333333334444444455000001a80000000000004800"), 0x025d41), (hex!("0100000000333333334444444455000001a90000000000001aa0"), 0x025de1), (hex!("0100000000333333334444444455000001aa0000000000001ab0"), 0x025e81), (hex!("0100000000333333334444444455000001aa0000000000005b60"), 0x025f21), (hex!("0100000000333333334444444455000001ab0000000000001ac0"), 0x025fc1), (hex!("0100000000333333334444444455000001ab0000000000006700"), 0x026061), (hex!("0100000000333333334444444455000001ab00000000000071d0"), 0x026101), (hex!("0100000000333333334444444455000001ac0000000000001ad0"), 0x0261a1), (hex!("0100000000333333334444444455000001ac0000000000007380"), 0x026241), (hex!("0100000000333333334444444455000001ad0000000000001ae0"), 0x0262e1), (hex!("0100000000333333334444444455000001ad0000000000006350"), 0x026381), (hex!("0100000000333333334444444455000001ae0000000000001af0"), 0x026421), (hex!("0100000000333333334444444455000001af0000000000001b00"), 0x0264c1), (hex!("0100000000333333334444444455000001af0000000000007390"), 0x026561), (hex!("0100000000333333334444444455000001b00000000000001b10"), 0x026601), (hex!("0100000000333333334444444455000001b10000000000001b20"), 0x0266a1), (hex!("0100000000333333334444444455000001b10000000000005cc0"), 0x026741), (hex!("0100000000333333334444444455000001b20000000000001b30"), 0x0267e1), (hex!("0100000000333333334444444455000001b20000000000004fb0"), 0x026881), (hex!("0100000000333333334444444455000001b30000000000001b40"), 0x026921), (hex!("0100000000333333334444444455000001b40000000000001b50"), 0x0269c1), (hex!("0100000000333333334444444455000001b50000000000001b60"), 0x026a61), (hex!("0100000000333333334444444455000001b60000000000001b70"), 0x026b01), (hex!("0100000000333333334444444455000001b600000000000048e0"), 0x026ba1), 
(hex!("0100000000333333334444444455000001b70000000000001b80"), 0x026c41), (hex!("0100000000333333334444444455000001b70000000000005ca0"), 0x026ce1), (hex!("0100000000333333334444444455000001b70000000000007900"), 0x026d81), (hex!("0100000000333333334444444455000001b80000000000001b90"), 0x026e21), (hex!("0100000000333333334444444455000001b80000000000004d90"), 0x026ec1), (hex!("0100000000333333334444444455000001b90000000000001ba0"), 0x026f61), (hex!("0100000000333333334444444455000001b90000000000003f40"), 0x027001), (hex!("0100000000333333334444444455000001ba0000000000001bb0"), 0x0270a1), (hex!("0100000000333333334444444455000001ba00000000000042a0"), 0x027141), (hex!("0100000000333333334444444455000001ba00000000000067f0"), 0x0271e1), (hex!("0100000000333333334444444455000001ba00000000000073a0"), 0x027281), (hex!("0100000000333333334444444455000001bb0000000000001bc0"), 0x027321), (hex!("0100000000333333334444444455000001bb0000000000004a00"), 0x0273c1), (hex!("0100000000333333334444444455000001bb0000000000005e00"), 0x027461), (hex!("0100000000333333334444444455000001bc0000000000001bd0"), 0x027501), (hex!("0100000000333333334444444455000001bc0000000000004230"), 0x0275a1), (hex!("0100000000333333334444444455000001bc0000000000005860"), 0x027641), (hex!("0100000000333333334444444455000001bd0000000000001be0"), 0x0276e1), (hex!("0100000000333333334444444455000001bd0000000000007c70"), 0x027781), (hex!("0100000000333333334444444455000001be0000000000001bf0"), 0x027821), (hex!("0100000000333333334444444455000001be0000000000007770"), 0x0278c1), (hex!("0100000000333333334444444455000001be0000000000007cf0"), 0x027961), (hex!("0100000000333333334444444455000001bf0000000000001c00"), 0x027a01), (hex!("0100000000333333334444444455000001bf0000000000006490"), 0x027aa1), (hex!("0100000000333333334444444455000001c00000000000001c10"), 0x027b41), (hex!("0100000000333333334444444455000001c10000000000001c20"), 0x027be1), (hex!("0100000000333333334444444455000001c10000000000004600"), 0x027c81), 
(hex!("0100000000333333334444444455000001c20000000000001c30"), 0x027d21), (hex!("0100000000333333334444444455000001c20000000000006e30"), 0x027dc1), (hex!("0100000000333333334444444455000001c30000000000001c40"), 0x027e61), (hex!("0100000000333333334444444455000001c40000000000001c50"), 0x027f01), (hex!("0100000000333333334444444455000001c50000000000001c60"), 0x027fa1), (hex!("0100000000333333334444444455000001c60000000000001c70"), 0x028041), (hex!("0100000000333333334444444455000001c60000000000004240"), 0x0280e1), (hex!("0100000000333333334444444455000001c60000000000005bb0"), 0x028181), (hex!("0100000000333333334444444455000001c70000000000001c80"), 0x028221), (hex!("0100000000333333334444444455000001c80000000000001c90"), 0x0282c1), (hex!("0100000000333333334444444455000001c90000000000001ca0"), 0x028361), (hex!("0100000000333333334444444455000001c90000000000006730"), 0x028401), (hex!("0100000000333333334444444455000001ca0000000000001cb0"), 0x0284a1), (hex!("0100000000333333334444444455000001ca00000000000070f0"), 0x028541), (hex!("0100000000333333334444444455000001cb0000000000001cc0"), 0x0285e1), (hex!("0100000000333333334444444455000001cb00000000000071a0"), 0x028681), (hex!("0100000000333333334444444455000001cc0000000000001cd0"), 0x028721), (hex!("0100000000333333334444444455000001cc0000000000005280"), 0x0287c1), (hex!("0100000000333333334444444455000001cc0000000000005d90"), 0x028861), (hex!("0100000000333333334444444455000001cd0000000000001ce0"), 0x028901), (hex!("0100000000333333334444444455000001cd00000000000069b0"), 0x0289a1), (hex!("0100000000333333334444444455000001ce0000000000001cf0"), 0x028a41), (hex!("0100000000333333334444444455000001ce0000000000004540"), 0x028ae1), (hex!("0100000000333333334444444455000001cf0000000000001d00"), 0x028b81), (hex!("0100000000333333334444444455000001cf00000000000076a0"), 0x028c21), (hex!("0100000000333333334444444455000001d00000000000001d10"), 0x028cc1), (hex!("0100000000333333334444444455000001d000000000000060a0"), 0x028d61), 
(hex!("0100000000333333334444444455000001d10000000000001d20"), 0x028e01), (hex!("0100000000333333334444444455000001d20000000000001d30"), 0x028ea1), (hex!("0100000000333333334444444455000001d30000000000001d40"), 0x028f41), (hex!("0100000000333333334444444455000001d30000000000004000"), 0x028fe1), (hex!("0100000000333333334444444455000001d30000000000004140"), 0x029081), (hex!("0100000000333333334444444455000001d30000000000006790"), 0x029121), (hex!("0100000000333333334444444455000001d40000000000001d50"), 0x0291c1), (hex!("0100000000333333334444444455000001d50000000000001d60"), 0x029261), (hex!("0100000000333333334444444455000001d60000000000001d70"), 0x029301), (hex!("0100000000333333334444444455000001d60000000000004b50"), 0x0293a1), (hex!("0100000000333333334444444455000001d60000000000007430"), 0x029441), (hex!("0100000000333333334444444455000001d70000000000001d80"), 0x0294e1), (hex!("0100000000333333334444444455000001d70000000000006920"), 0x029581), (hex!("0100000000333333334444444455000001d80000000000001d90"), 0x029621), (hex!("0100000000333333334444444455000001d80000000000005b30"), 0x0296c1), (hex!("0100000000333333334444444455000001d90000000000001da0"), 0x029761), (hex!("0100000000333333334444444455000001da0000000000001db0"), 0x029801), (hex!("0100000000333333334444444455000001da0000000000004af0"), 0x0298a1), (hex!("0100000000333333334444444455000001da0000000000007240"), 0x029941), (hex!("0100000000333333334444444455000001da0000000000007470"), 0x0299e1), (hex!("0100000000333333334444444455000001db0000000000001dc0"), 0x029a81), (hex!("0100000000333333334444444455000001db00000000000045d0"), 0x029b21), (hex!("0100000000333333334444444455000001dc0000000000001dd0"), 0x029bc1), (hex!("0100000000333333334444444455000001dd0000000000001de0"), 0x029c61), (hex!("0100000000333333334444444455000001dd0000000000004bb0"), 0x029d01), (hex!("0100000000333333334444444455000001dd0000000000004cd0"), 0x029da1), (hex!("0100000000333333334444444455000001dd0000000000006100"), 0x029e41), 
(hex!("0100000000333333334444444455000001dd0000000000007bb0"), 0x029ee1), (hex!("0100000000333333334444444455000001de0000000000001df0"), 0x029f81), (hex!("0100000000333333334444444455000001de0000000000004260"), 0x02a021), (hex!("0100000000333333334444444455000001de0000000000006040"), 0x02a0c1), (hex!("0100000000333333334444444455000001df0000000000001e00"), 0x02a161), (hex!("0100000000333333334444444455000001df0000000000005fa0"), 0x02a201), (hex!("0100000000333333334444444455000001df0000000000006a70"), 0x02a2a1), (hex!("0100000000333333334444444455000001df0000000000006dc0"), 0x02a341), (hex!("0100000000333333334444444455000001e00000000000001e10"), 0x02a3e1), (hex!("0100000000333333334444444455000001e00000000000007010"), 0x02a481), (hex!("0100000000333333334444444455000001e10000000000001e20"), 0x02a521), (hex!("0100000000333333334444444455000001e10000000000005720"), 0x02a5c1), (hex!("0100000000333333334444444455000001e10000000000006830"), 0x02a661), (hex!("0100000000333333334444444455000001e20000000000001e30"), 0x02a701), (hex!("0100000000333333334444444455000001e20000000000005100"), 0x02a7a1), (hex!("0100000000333333334444444455000001e30000000000001e40"), 0x02a841), (hex!("0100000000333333334444444455000001e40000000000001e50"), 0x02a8e1), (hex!("0100000000333333334444444455000001e40000000000003f30"), 0x02a981), (hex!("0100000000333333334444444455000001e40000000000005220"), 0x02aa21), (hex!("0100000000333333334444444455000001e50000000000001e60"), 0x02aac1), (hex!("0100000000333333334444444455000001e50000000000006f60"), 0x02ab61), (hex!("0100000000333333334444444455000001e60000000000001e70"), 0x02ac01), (hex!("0100000000333333334444444455000001e60000000000006c80"), 0x02aca1), (hex!("0100000000333333334444444455000001e70000000000001e80"), 0x02ad41), (hex!("0100000000333333334444444455000001e80000000000001e90"), 0x02ade1), (hex!("0100000000333333334444444455000001e80000000000004e30"), 0x02ae81), (hex!("0100000000333333334444444455000001e90000000000001ea0"), 0x02af21), 
(hex!("0100000000333333334444444455000001e90000000000005470"), 0x02afc1), (hex!("0100000000333333334444444455000001ea0000000000001eb0"), 0x02b061), (hex!("0100000000333333334444444455000001ea0000000000007980"), 0x02b101), (hex!("0100000000333333334444444455000001eb0000000000001ec0"), 0x02b1a1), (hex!("0100000000333333334444444455000001eb0000000000004390"), 0x02b241), (hex!("0100000000333333334444444455000001eb0000000000005970"), 0x02b2e1), (hex!("0100000000333333334444444455000001ec0000000000001ed0"), 0x02b381), (hex!("0100000000333333334444444455000001ec0000000000005d50"), 0x02b421), (hex!("0100000000333333334444444455000001ec00000000000076e0"), 0x02b4c1), (hex!("0100000000333333334444444455000001ed0000000000001ee0"), 0x02b561), (hex!("0100000000333333334444444455000001ed0000000000006190"), 0x02b601), (hex!("0100000000333333334444444455000001ee0000000000001ef0"), 0x02b6a1), (hex!("0100000000333333334444444455000001ee0000000000004900"), 0x02b741), (hex!("0100000000333333334444444455000001ef0000000000001f00"), 0x02b7e1), (hex!("0100000000333333334444444455000001ef0000000000006c60"), 0x02b881), (hex!("0100000000333333334444444455000001f00000000000001f10"), 0x02b921), (hex!("0100000000333333334444444455000001f00000000000006950"), 0x02b9c1), (hex!("0100000000333333334444444455000001f10000000000001f20"), 0x02ba61), (hex!("0100000000333333334444444455000001f10000000000006400"), 0x02bb01), (hex!("0100000000333333334444444455000001f20000000000001f30"), 0x02bba1), (hex!("0100000000333333334444444455000001f20000000000006f00"), 0x02bc41), (hex!("0100000000333333334444444455000001f20000000000007b10"), 0x02bce1), (hex!("0100000000333333334444444455000001f30000000000001f40"), 0x02bd81), (hex!("0100000000333333334444444455000001f40000000000001f50"), 0x02be21), (hex!("0100000000333333334444444455000001f50000000000001f60"), 0x02bec1), (hex!("0100000000333333334444444455000001f500000000000044f0"), 0x02bf61), (hex!("0100000000333333334444444455000001f60000000000001f70"), 0x02c001), 
(hex!("0100000000333333334444444455000001f70000000000001f80"), 0x02c0a1), (hex!("0100000000333333334444444455000001f70000000000004ad0"), 0x02c141), (hex!("0100000000333333334444444455000001f80000000000001f90"), 0x02c1e1), (hex!("0100000000333333334444444455000001f90000000000001fa0"), 0x02c281), (hex!("0100000000333333334444444455000001f90000000000003f60"), 0x02c321), (hex!("0100000000333333334444444455000001f90000000000004a80"), 0x02c3c1), (hex!("0100000000333333334444444455000001fa0000000000001fb0"), 0x02c461), (hex!("0100000000333333334444444455000001fa0000000000006f90"), 0x02c501), (hex!("0100000000333333334444444455000001fb0000000000001fc0"), 0x02c5a1), (hex!("0100000000333333334444444455000001fc0000000000001fd0"), 0x02c641), (hex!("0100000000333333334444444455000001fc0000000000004a90"), 0x02c6e1), (hex!("0100000000333333334444444455000001fd0000000000001fe0"), 0x02c781), (hex!("0100000000333333334444444455000001fd0000000000005f50"), 0x02c821), (hex!("0100000000333333334444444455000001fe0000000000001ff0"), 0x02c8c1), (hex!("0100000000333333334444444455000001ff0000000000002000"), 0x02c961), (hex!("0100000000333333334444444455000002000000000000002010"), 0x02ca01), (hex!("0100000000333333334444444455000002000000000000005f00"), 0x02caa1), (hex!("0100000000333333334444444455000002000000000000006840"), 0x02cb41), (hex!("0100000000333333334444444455000002010000000000002020"), 0x02cbe1), (hex!("0100000000333333334444444455000002020000000000002030"), 0x02cc81), (hex!("0100000000333333334444444455000002030000000000002040"), 0x02cd21), (hex!("0100000000333333334444444455000002040000000000002050"), 0x02cdc1), (hex!("01000000003333333344444444550000020400000000000051f0"), 0x02ce61), (hex!("0100000000333333334444444455000002050000000000002060"), 0x02cf01), (hex!("0100000000333333334444444455000002060000000000002070"), 0x02cfa1), (hex!("0100000000333333334444444455000002060000000000005c80"), 0x02d041), (hex!("01000000003333333344444444550000020600000000000061d0"), 0x02d0e1), 
(hex!("01000000003333333344444444550000020600000000000078c0"), 0x02d181), (hex!("0100000000333333334444444455000002070000000000002080"), 0x02d221), (hex!("0100000000333333334444444455000002070000000000006ba0"), 0x02d2c1), (hex!("0100000000333333334444444455000002080000000000002090"), 0x02d361), (hex!("01000000003333333344444444550000020900000000000020a0"), 0x02d401), (hex!("01000000003333333344444444550000020900000000000067a0"), 0x02d4a1), (hex!("01000000003333333344444444550000020a00000000000020b0"), 0x02d541), (hex!("01000000003333333344444444550000020a0000000000004950"), 0x02d5e1), (hex!("01000000003333333344444444550000020a0000000000004de0"), 0x02d681), (hex!("01000000003333333344444444550000020b00000000000020c0"), 0x02d721), (hex!("01000000003333333344444444550000020b0000000000004b00"), 0x02d7c1), (hex!("01000000003333333344444444550000020c00000000000020d0"), 0x02d861), (hex!("01000000003333333344444444550000020d00000000000020e0"), 0x02d901), (hex!("01000000003333333344444444550000020e00000000000020f0"), 0x02d9a1), (hex!("01000000003333333344444444550000020f0000000000002100"), 0x02da41), (hex!("0100000000333333334444444455000002100000000000002110"), 0x02dae1), (hex!("0100000000333333334444444455000002110000000000002120"), 0x02db81), (hex!("0100000000333333334444444455000002110000000000004490"), 0x02dc21), (hex!("0100000000333333334444444455000002120000000000002130"), 0x02dcc1), (hex!("0100000000333333334444444455000002130000000000002140"), 0x02dd61), (hex!("01000000003333333344444444550000021300000000000046d0"), 0x02de01), (hex!("01000000003333333344444444550000021300000000000046e0"), 0x02dea1), (hex!("0100000000333333334444444455000002130000000000004b70"), 0x02df41), (hex!("0100000000333333334444444455000002140000000000002150"), 0x02dfe1), (hex!("0100000000333333334444444455000002140000000000006c50"), 0x02e081), (hex!("0100000000333333334444444455000002150000000000002160"), 0x02e121), (hex!("01000000003333333344444444550000021500000000000043c0"), 0x02e1c1), 
(hex!("0100000000333333334444444455000002160000000000002170"), 0x02e261), (hex!("01000000003333333344444444550000021600000000000055b0"), 0x02e301), (hex!("0100000000333333334444444455000002160000000000006150"), 0x02e3a1), (hex!("0100000000333333334444444455000002170000000000002180"), 0x02e441), (hex!("01000000003333333344444444550000021700000000000053b0"), 0x02e4e1), (hex!("0100000000333333334444444455000002170000000000007460"), 0x02e581), (hex!("0100000000333333334444444455000002180000000000002190"), 0x02e621), (hex!("01000000003333333344444444550000021900000000000021a0"), 0x02e6c1), (hex!("01000000003333333344444444550000021a00000000000021b0"), 0x02e761), (hex!("01000000003333333344444444550000021a0000000000007650"), 0x02e801), (hex!("01000000003333333344444444550000021b00000000000021c0"), 0x02e8a1), (hex!("01000000003333333344444444550000021b0000000000004b20"), 0x02e941), (hex!("01000000003333333344444444550000021c00000000000021d0"), 0x02e9e1), (hex!("01000000003333333344444444550000021c0000000000007610"), 0x02ea81), (hex!("01000000003333333344444444550000021d00000000000021e0"), 0x02eb21), (hex!("01000000003333333344444444550000021d0000000000005f40"), 0x02ebc1), (hex!("01000000003333333344444444550000021e00000000000021f0"), 0x02ec61), (hex!("01000000003333333344444444550000021e0000000000005a50"), 0x02ed01), (hex!("01000000003333333344444444550000021e0000000000005ff0"), 0x02eda1), (hex!("01000000003333333344444444550000021f0000000000002200"), 0x02ee41), (hex!("01000000003333333344444444550000021f00000000000043a0"), 0x02eee1), (hex!("01000000003333333344444444550000021f0000000000004cb0"), 0x02ef81), (hex!("01000000003333333344444444550000021f0000000000004e00"), 0x02f021), (hex!("0100000000333333334444444455000002200000000000002210"), 0x02f0c1), (hex!("0100000000333333334444444455000002210000000000002220"), 0x02f161), (hex!("0100000000333333334444444455000002210000000000006290"), 0x02f201), (hex!("0100000000333333334444444455000002210000000000007230"), 0x02f2a1), 
(hex!("0100000000333333334444444455000002220000000000002230"), 0x02f341), (hex!("0100000000333333334444444455000002220000000000006ea0"), 0x02f3e1), (hex!("0100000000333333334444444455000002230000000000002240"), 0x02f481), (hex!("0100000000333333334444444455000002230000000000004710"), 0x02f521), (hex!("0100000000333333334444444455000002240000000000002250"), 0x02f5c1), (hex!("0100000000333333334444444455000002250000000000002260"), 0x02f661), (hex!("0100000000333333334444444455000002260000000000002270"), 0x02f701), (hex!("0100000000333333334444444455000002260000000000005b40"), 0x02f7a1), (hex!("0100000000333333334444444455000002260000000000006300"), 0x02f841), (hex!("0100000000333333334444444455000002270000000000002280"), 0x02f8e1), (hex!("0100000000333333334444444455000002270000000000005b80"), 0x02f981), (hex!("0100000000333333334444444455000002280000000000002290"), 0x02fa21), (hex!("0100000000333333334444444455000002280000000000003ed0"), 0x02fac1), (hex!("0100000000333333334444444455000002280000000000004550"), 0x02fb61), (hex!("01000000003333333344444444550000022800000000000077d0"), 0x02fc01), (hex!("01000000003333333344444444550000022900000000000022a0"), 0x02fca1), (hex!("0100000000333333334444444455000002290000000000006480"), 0x02fd41), (hex!("01000000003333333344444444550000022a00000000000022b0"), 0x02fde1), (hex!("01000000003333333344444444550000022a0000000000005450"), 0x02fe81), (hex!("01000000003333333344444444550000022b00000000000022c0"), 0x02ff21), (hex!("01000000003333333344444444550000022b0000000000006dd0"), 0x02ffc1), (hex!("01000000003333333344444444550000022c00000000000022d0"), 0x030061), (hex!("01000000003333333344444444550000022c0000000000006890"), 0x030101), (hex!("01000000003333333344444444550000022d00000000000022e0"), 0x0301a1), (hex!("01000000003333333344444444550000022e00000000000022f0"), 0x030241), (hex!("01000000003333333344444444550000022e0000000000004f20"), 0x0302e1), (hex!("01000000003333333344444444550000022f0000000000002300"), 0x030381), 
(hex!("01000000003333333344444444550000022f0000000000005260"), 0x030421), (hex!("01000000003333333344444444550000022f00000000000053f0"), 0x0304c1), (hex!("0100000000333333334444444455000002300000000000002310"), 0x030561), (hex!("01000000003333333344444444550000023000000000000050e0"), 0x030601), (hex!("0100000000333333334444444455000002310000000000002320"), 0x0306a1), (hex!("0100000000333333334444444455000002310000000000007800"), 0x030741), (hex!("0100000000333333334444444455000002320000000000002330"), 0x0307e1), (hex!("0100000000333333334444444455000002330000000000002340"), 0x030881), (hex!("0100000000333333334444444455000002330000000000004d70"), 0x030921), (hex!("0100000000333333334444444455000002330000000000005cf0"), 0x0309c1), (hex!("0100000000333333334444444455000002340000000000002350"), 0x030a61), (hex!("0100000000333333334444444455000002350000000000002360"), 0x030b01), (hex!("0100000000333333334444444455000002350000000000006970"), 0x030ba1), (hex!("0100000000333333334444444455000002360000000000002370"), 0x030c41), (hex!("0100000000333333334444444455000002360000000000005270"), 0x030ce1), (hex!("0100000000333333334444444455000002370000000000002380"), 0x030d81), (hex!("0100000000333333334444444455000002370000000000005d70"), 0x030e21), (hex!("0100000000333333334444444455000002380000000000002390"), 0x030ec1), (hex!("01000000003333333344444444550000023800000000000069a0"), 0x030f61), (hex!("01000000003333333344444444550000023900000000000023a0"), 0x031001), (hex!("01000000003333333344444444550000023900000000000052e0"), 0x0310a1), (hex!("0100000000333333334444444455000002390000000000005a10"), 0x031141), (hex!("0100000000333333334444444455000002390000000000007440"), 0x0311e1), (hex!("01000000003333333344444444550000023a00000000000023b0"), 0x031281), (hex!("01000000003333333344444444550000023a0000000000003f00"), 0x031321), (hex!("01000000003333333344444444550000023a0000000000004430"), 0x0313c1), (hex!("01000000003333333344444444550000023a0000000000007070"), 0x031461), 
(hex!("01000000003333333344444444550000023a00000000000074a0"), 0x031501), (hex!("01000000003333333344444444550000023b00000000000023c0"), 0x0315a1), (hex!("01000000003333333344444444550000023b0000000000004730"), 0x031641), (hex!("01000000003333333344444444550000023b00000000000068b0"), 0x0316e1), (hex!("01000000003333333344444444550000023c00000000000023d0"), 0x031781), (hex!("01000000003333333344444444550000023c0000000000004680"), 0x031821), (hex!("01000000003333333344444444550000023d00000000000023e0"), 0x0318c1), (hex!("01000000003333333344444444550000023d00000000000059a0"), 0x031961), (hex!("01000000003333333344444444550000023e00000000000023f0"), 0x031a01), (hex!("01000000003333333344444444550000023f0000000000002400"), 0x031aa1), (hex!("0100000000333333334444444455000002400000000000002410"), 0x031b41), (hex!("0100000000333333334444444455000002400000000000004920"), 0x031be1), (hex!("01000000003333333344444444550000024000000000000066e0"), 0x031c81), (hex!("01000000003333333344444444550000024000000000000076f0"), 0x031d21), (hex!("01000000003333333344444444550000024000000000000078e0"), 0x031dc1), (hex!("0100000000333333334444444455000002410000000000002420"), 0x031e61), (hex!("0100000000333333334444444455000002420000000000002430"), 0x031f01), (hex!("0100000000333333334444444455000002420000000000006590"), 0x031fa1), (hex!("0100000000333333334444444455000002430000000000002440"), 0x032041), (hex!("0100000000333333334444444455000002430000000000004d00"), 0x0320e1), (hex!("0100000000333333334444444455000002440000000000002450"), 0x032181), (hex!("0100000000333333334444444455000002440000000000005f80"), 0x032221), (hex!("0100000000333333334444444455000002450000000000002460"), 0x0322c1), (hex!("0100000000333333334444444455000002450000000000004940"), 0x032361), (hex!("0100000000333333334444444455000002460000000000002470"), 0x032401), (hex!("0100000000333333334444444455000002470000000000002480"), 0x0324a1), (hex!("0100000000333333334444444455000002470000000000004dd0"), 0x032541), 
(hex!("0100000000333333334444444455000002470000000000005930"), 0x0325e1), (hex!("01000000003333333344444444550000024700000000000061b0"), 0x032681), (hex!("0100000000333333334444444455000002470000000000007740"), 0x032721), (hex!("0100000000333333334444444455000002480000000000002490"), 0x0327c1), (hex!("0100000000333333334444444455000002480000000000004890"), 0x032861), (hex!("01000000003333333344444444550000024900000000000024a0"), 0x032901), (hex!("01000000003333333344444444550000024a00000000000024b0"), 0x0329a1), (hex!("01000000003333333344444444550000024b00000000000024c0"), 0x032a41), (hex!("01000000003333333344444444550000024c00000000000024d0"), 0x032ae1), (hex!("01000000003333333344444444550000024d00000000000024e0"), 0x032b81), (hex!("01000000003333333344444444550000024d0000000000004070"), 0x032c21), (hex!("01000000003333333344444444550000024e00000000000024f0"), 0x032cc1), (hex!("01000000003333333344444444550000024e00000000000066a0"), 0x032d61), (hex!("01000000003333333344444444550000024e0000000000006ab0"), 0x032e01), (hex!("01000000003333333344444444550000024f0000000000002500"), 0x032ea1), (hex!("0100000000333333334444444455000002500000000000002510"), 0x032f41), (hex!("0100000000333333334444444455000002510000000000002520"), 0x032fe1), (hex!("0100000000333333334444444455000002510000000000007320"), 0x033081), (hex!("0100000000333333334444444455000002520000000000002530"), 0x033121), (hex!("0100000000333333334444444455000002520000000000006410"), 0x0331c1), (hex!("0100000000333333334444444455000002530000000000002540"), 0x033261), (hex!("0100000000333333334444444455000002530000000000005110"), 0x033301), (hex!("0100000000333333334444444455000002540000000000002550"), 0x0333a1), (hex!("01000000003333333344444444550000025400000000000040c0"), 0x033441), (hex!("0100000000333333334444444455000002540000000000006a40"), 0x0334e1), (hex!("0100000000333333334444444455000002550000000000002560"), 0x033581), (hex!("0100000000333333334444444455000002550000000000005190"), 0x033621), 
(hex!("0100000000333333334444444455000002560000000000002570"), 0x0336c1), (hex!("01000000003333333344444444550000025600000000000061f0"), 0x033761), (hex!("0100000000333333334444444455000002570000000000002580"), 0x033801), (hex!("0100000000333333334444444455000002580000000000002590"), 0x0338a1), (hex!("01000000003333333344444444550000025800000000000043d0"), 0x033941), (hex!("01000000003333333344444444550000025900000000000025a0"), 0x0339e1), (hex!("0100000000333333334444444455000002590000000000006bb0"), 0x033a81), (hex!("01000000003333333344444444550000025a00000000000025b0"), 0x033b21), (hex!("01000000003333333344444444550000025a0000000000005fb0"), 0x033bc1), (hex!("01000000003333333344444444550000025a00000000000064c0"), 0x033c61), (hex!("01000000003333333344444444550000025b00000000000025c0"), 0x033d01), (hex!("01000000003333333344444444550000025b0000000000005c10"), 0x033da1), (hex!("01000000003333333344444444550000025c00000000000025d0"), 0x033e41), (hex!("01000000003333333344444444550000025c0000000000007d00"), 0x033ee1), (hex!("01000000003333333344444444550000025d00000000000025e0"), 0x033f81), (hex!("01000000003333333344444444550000025e00000000000025f0"), 0x034021), (hex!("01000000003333333344444444550000025e00000000000045e0"), 0x0340c1), (hex!("01000000003333333344444444550000025e0000000000006ee0"), 0x034161), (hex!("01000000003333333344444444550000025f0000000000002600"), 0x034201), (hex!("01000000003333333344444444550000025f00000000000050b0"), 0x0342a1), (hex!("01000000003333333344444444550000025f0000000000007690"), 0x034341), (hex!("0100000000333333334444444455000002600000000000002610"), 0x0343e1), (hex!("0100000000333333334444444455000002600000000000007b60"), 0x034481), (hex!("0100000000333333334444444455000002610000000000002620"), 0x034521), (hex!("0100000000333333334444444455000002620000000000002630"), 0x0345c1), (hex!("0100000000333333334444444455000002630000000000002640"), 0x034661), (hex!("0100000000333333334444444455000002640000000000002650"), 0x034701), 
(hex!("0100000000333333334444444455000002650000000000002660"), 0x0347a1), (hex!("0100000000333333334444444455000002650000000000006180"), 0x034841), (hex!("0100000000333333334444444455000002660000000000002670"), 0x0348e1), (hex!("0100000000333333334444444455000002660000000000005430"), 0x034981), (hex!("0100000000333333334444444455000002660000000000007a60"), 0x034a21), (hex!("0100000000333333334444444455000002670000000000002680"), 0x034ac1), (hex!("01000000003333333344444444550000026700000000000077f0"), 0x034b61), (hex!("0100000000333333334444444455000002680000000000002690"), 0x034c01), (hex!("01000000003333333344444444550000026900000000000026a0"), 0x034ca1), (hex!("01000000003333333344444444550000026a00000000000026b0"), 0x034d41), (hex!("01000000003333333344444444550000026a0000000000007530"), 0x034de1), (hex!("01000000003333333344444444550000026b00000000000026c0"), 0x034e81), (hex!("01000000003333333344444444550000026b00000000000058b0"), 0x034f21), (hex!("01000000003333333344444444550000026b00000000000066b0"), 0x034fc1), (hex!("01000000003333333344444444550000026b0000000000006b10"), 0x035061), (hex!("01000000003333333344444444550000026c00000000000026d0"), 0x035101), (hex!("01000000003333333344444444550000026d00000000000026e0"), 0x0351a1), (hex!("01000000003333333344444444550000026d0000000000004210"), 0x035241), (hex!("01000000003333333344444444550000026d0000000000005490"), 0x0352e1), (hex!("01000000003333333344444444550000026d0000000000005e60"), 0x035381), (hex!("01000000003333333344444444550000026d00000000000068e0"), 0x035421), (hex!("01000000003333333344444444550000026d0000000000007020"), 0x0354c1), (hex!("01000000003333333344444444550000026d0000000000007300"), 0x035561), (hex!("01000000003333333344444444550000026e00000000000026f0"), 0x035601), (hex!("01000000003333333344444444550000026f0000000000002700"), 0x0356a1), (hex!("01000000003333333344444444550000026f0000000000004910"), 0x035741), (hex!("0100000000333333334444444455000002700000000000002710"), 0x0357e1), 
(hex!("0100000000333333334444444455000002710000000000002720"), 0x035881), (hex!("01000000003333333344444444550000027100000000000050c0"), 0x035921), (hex!("0100000000333333334444444455000002720000000000002730"), 0x0359c1), (hex!("0100000000333333334444444455000002730000000000002740"), 0x035a61), (hex!("0100000000333333334444444455000002740000000000002750"), 0x035b01), (hex!("0100000000333333334444444455000002740000000000007490"), 0x035ba1), (hex!("0100000000333333334444444455000002750000000000002760"), 0x035c41), (hex!("0100000000333333334444444455000002760000000000002770"), 0x035ce1), (hex!("0100000000333333334444444455000002760000000000004790"), 0x035d81), (hex!("0100000000333333334444444455000002770000000000002780"), 0x035e21), (hex!("01000000003333333344444444550000027700000000000050a0"), 0x035ec1), (hex!("0100000000333333334444444455000002780000000000002790"), 0x035f61), (hex!("0100000000333333334444444455000002780000000000004330"), 0x036001), (hex!("0100000000333333334444444455000002780000000000006b00"), 0x0360a1), (hex!("01000000003333333344444444550000027900000000000027a0"), 0x036141), (hex!("01000000003333333344444444550000027a00000000000027b0"), 0x0361e1), (hex!("01000000003333333344444444550000027b00000000000027c0"), 0x036281), (hex!("01000000003333333344444444550000027b0000000000004930"), 0x036321), (hex!("01000000003333333344444444550000027b0000000000006250"), 0x0363c1), (hex!("01000000003333333344444444550000027c00000000000027d0"), 0x036461), (hex!("01000000003333333344444444550000027d00000000000027e0"), 0x036501), (hex!("01000000003333333344444444550000027d0000000000005ce0"), 0x0365a1), (hex!("01000000003333333344444444550000027d0000000000005fe0"), 0x036641), (hex!("01000000003333333344444444550000027e00000000000027f0"), 0x0366e1), (hex!("01000000003333333344444444550000027f0000000000002800"), 0x036781), (hex!("01000000003333333344444444550000027f0000000000003e90"), 0x036821), (hex!("01000000003333333344444444550000027f0000000000007910"), 0x0368c1), 
(hex!("0100000000333333334444444455000002800000000000002810"), 0x036961), (hex!("0100000000333333334444444455000002800000000000004990"), 0x036a01), (hex!("0100000000333333334444444455000002800000000000006160"), 0x036aa1), (hex!("0100000000333333334444444455000002800000000000006740"), 0x036b41), (hex!("0100000000333333334444444455000002810000000000002820"), 0x036be1), (hex!("0100000000333333334444444455000002820000000000002830"), 0x036c81), (hex!("0100000000333333334444444455000002820000000000005170"), 0x036d21), (hex!("0100000000333333334444444455000002830000000000002840"), 0x036dc1), (hex!("0100000000333333334444444455000002840000000000002850"), 0x036e61), (hex!("0100000000333333334444444455000002840000000000004810"), 0x036f01), (hex!("0100000000333333334444444455000002840000000000006aa0"), 0x036fa1), (hex!("0100000000333333334444444455000002850000000000002860"), 0x037041), (hex!("0100000000333333334444444455000002860000000000002870"), 0x0370e1), (hex!("0100000000333333334444444455000002860000000000005080"), 0x037181), (hex!("0100000000333333334444444455000002870000000000002880"), 0x037221), (hex!("0100000000333333334444444455000002870000000000004e60"), 0x0372c1), (hex!("0100000000333333334444444455000002880000000000002890"), 0x037361), (hex!("0100000000333333334444444455000002880000000000005060"), 0x037401), (hex!("0100000000333333334444444455000002880000000000006f20"), 0x0374a1), (hex!("01000000003333333344444444550000028900000000000028a0"), 0x037541), (hex!("01000000003333333344444444550000028900000000000047e0"), 0x0375e1), (hex!("01000000003333333344444444550000028a00000000000028b0"), 0x037681), (hex!("01000000003333333344444444550000028a0000000000005ab0"), 0x037721), (hex!("01000000003333333344444444550000028a0000000000007130"), 0x0377c1), (hex!("01000000003333333344444444550000028a0000000000007660"), 0x037861), (hex!("01000000003333333344444444550000028b00000000000028c0"), 0x037901), (hex!("01000000003333333344444444550000028b00000000000054e0"), 0x0379a1), 
(hex!("01000000003333333344444444550000028c00000000000028d0"), 0x037a41), (hex!("01000000003333333344444444550000028c00000000000046f0"), 0x037ae1), (hex!("01000000003333333344444444550000028c00000000000061a0"), 0x037b81), (hex!("01000000003333333344444444550000028d00000000000028e0"), 0x037c21), (hex!("01000000003333333344444444550000028e00000000000028f0"), 0x037cc1), (hex!("01000000003333333344444444550000028e0000000000004130"), 0x037d61), (hex!("01000000003333333344444444550000028f0000000000002900"), 0x037e01), (hex!("01000000003333333344444444550000028f0000000000007510"), 0x037ea1), (hex!("0100000000333333334444444455000002900000000000002910"), 0x037f41), (hex!("0100000000333333334444444455000002900000000000004a40"), 0x037fe1), (hex!("0100000000333333334444444455000002910000000000002920"), 0x038081), (hex!("0100000000333333334444444455000002920000000000002930"), 0x038121), (hex!("0100000000333333334444444455000002920000000000004e90"), 0x0381c1), (hex!("0100000000333333334444444455000002930000000000002940"), 0x038261), (hex!("0100000000333333334444444455000002930000000000006880"), 0x038301), (hex!("0100000000333333334444444455000002940000000000002950"), 0x0383a1), (hex!("0100000000333333334444444455000002940000000000007bc0"), 0x038441), (hex!("0100000000333333334444444455000002950000000000002960"), 0x0384e1), (hex!("0100000000333333334444444455000002960000000000002970"), 0x038581), (hex!("01000000003333333344444444550000029600000000000059d0"), 0x038621), (hex!("0100000000333333334444444455000002970000000000002980"), 0x0386c1), (hex!("0100000000333333334444444455000002970000000000004a50"), 0x038761), (hex!("0100000000333333334444444455000002970000000000005f20"), 0x038801), (hex!("01000000003333333344444444550000029700000000000068d0"), 0x0388a1), (hex!("0100000000333333334444444455000002980000000000002990"), 0x038941), (hex!("0100000000333333334444444455000002980000000000004370"), 0x0389e1), (hex!("0100000000333333334444444455000002980000000000004420"), 0x038a81), 
(hex!("01000000003333333344444444550000029900000000000029a0"), 0x038b21), (hex!("01000000003333333344444444550000029a00000000000029b0"), 0x038bc1), (hex!("01000000003333333344444444550000029a0000000000006010"), 0x038c61), (hex!("01000000003333333344444444550000029a0000000000006980"), 0x038d01), (hex!("01000000003333333344444444550000029b00000000000029c0"), 0x038da1), (hex!("01000000003333333344444444550000029c00000000000029d0"), 0x038e41), (hex!("01000000003333333344444444550000029c0000000000007480"), 0x038ee1), (hex!("01000000003333333344444444550000029d00000000000029e0"), 0x038f81), (hex!("01000000003333333344444444550000029d0000000000005030"), 0x039021), (hex!("01000000003333333344444444550000029d0000000000007780"), 0x0390c1), (hex!("01000000003333333344444444550000029d0000000000007a50"), 0x039161), (hex!("01000000003333333344444444550000029e00000000000029f0"), 0x039201), (hex!("01000000003333333344444444550000029e00000000000074b0"), 0x0392a1), (hex!("01000000003333333344444444550000029f0000000000002a00"), 0x039341), (hex!("0100000000333333334444444455000002a00000000000002a10"), 0x0393e1), (hex!("0100000000333333334444444455000002a10000000000002a20"), 0x039481), (hex!("0100000000333333334444444455000002a20000000000002a30"), 0x039521), (hex!("0100000000333333334444444455000002a20000000000004c50"), 0x0395c1), (hex!("0100000000333333334444444455000002a20000000000006f10"), 0x039661), (hex!("0100000000333333334444444455000002a30000000000002a40"), 0x039701), (hex!("0100000000333333334444444455000002a40000000000002a50"), 0x0397a1), (hex!("0100000000333333334444444455000002a40000000000005d60"), 0x039841), (hex!("0100000000333333334444444455000002a50000000000002a60"), 0x0398e1), (hex!("0100000000333333334444444455000002a50000000000005440"), 0x039981), (hex!("0100000000333333334444444455000002a50000000000005890"), 0x039a21), (hex!("0100000000333333334444444455000002a60000000000002a70"), 0x039ac1), (hex!("0100000000333333334444444455000002a70000000000002a80"), 0x039b61), 
(hex!("0100000000333333334444444455000002a700000000000054a0"), 0x039c01), (hex!("0100000000333333334444444455000002a70000000000007280"), 0x039ca1), (hex!("0100000000333333334444444455000002a80000000000002a90"), 0x039d41), (hex!("0100000000333333334444444455000002a90000000000002aa0"), 0x039de1), (hex!("0100000000333333334444444455000002aa0000000000002ab0"), 0x039e81), (hex!("0100000000333333334444444455000002ab0000000000002ac0"), 0x039f21), (hex!("0100000000333333334444444455000002ab0000000000006c90"), 0x039fc1), (hex!("0100000000333333334444444455000002ac0000000000002ad0"), 0x03a061), (hex!("0100000000333333334444444455000002ac0000000000006db0"), 0x03a101), (hex!("0100000000333333334444444455000002ad0000000000002ae0"), 0x03a1a1), (hex!("0100000000333333334444444455000002ad00000000000065e0"), 0x03a241), (hex!("0100000000333333334444444455000002ad0000000000007b40"), 0x03a2e1), (hex!("0100000000333333334444444455000002ae0000000000002af0"), 0x03a381), (hex!("0100000000333333334444444455000002ae0000000000004d20"), 0x03a421), (hex!("0100000000333333334444444455000002ae0000000000006f30"), 0x03a4c1), (hex!("0100000000333333334444444455000002af0000000000002b00"), 0x03a561), (hex!("0100000000333333334444444455000002b00000000000002b10"), 0x03a601), (hex!("0100000000333333334444444455000002b00000000000004560"), 0x03a6a1), (hex!("0100000000333333334444444455000002b00000000000005800"), 0x03a741), (hex!("0100000000333333334444444455000002b00000000000005a60"), 0x03a7e1), (hex!("0100000000333333334444444455000002b10000000000002b20"), 0x03a881), (hex!("0100000000333333334444444455000002b10000000000007b30"), 0x03a921), (hex!("0100000000333333334444444455000002b20000000000002b30"), 0x03a9c1), (hex!("0100000000333333334444444455000002b20000000000004440"), 0x03aa61), (hex!("0100000000333333334444444455000002b20000000000004f80"), 0x03ab01), (hex!("0100000000333333334444444455000002b20000000000005020"), 0x03aba1), (hex!("0100000000333333334444444455000002b30000000000002b40"), 0x03ac41), 
(hex!("0100000000333333334444444455000002b40000000000002b50"), 0x03ace1), (hex!("0100000000333333334444444455000002b50000000000002b60"), 0x03ad81), (hex!("0100000000333333334444444455000002b500000000000059e0"), 0x03ae21), (hex!("0100000000333333334444444455000002b60000000000002b70"), 0x03aec1), (hex!("0100000000333333334444444455000002b70000000000002b80"), 0x03af61), (hex!("0100000000333333334444444455000002b80000000000002b90"), 0x03b001), (hex!("0100000000333333334444444455000002b80000000000004590"), 0x03b0a1), (hex!("0100000000333333334444444455000002b800000000000047d0"), 0x03b141), (hex!("0100000000333333334444444455000002b80000000000006030"), 0x03b1e1), (hex!("0100000000333333334444444455000002b80000000000006a20"), 0x03b281), (hex!("0100000000333333334444444455000002b80000000000006a90"), 0x03b321), (hex!("0100000000333333334444444455000002b90000000000002ba0"), 0x03b3c1), (hex!("0100000000333333334444444455000002ba0000000000002bb0"), 0x03b461), (hex!("0100000000333333334444444455000002ba0000000000006e80"), 0x03b501), (hex!("0100000000333333334444444455000002bb0000000000002bc0"), 0x03b5a1), (hex!("0100000000333333334444444455000002bc0000000000002bd0"), 0x03b641), (hex!("0100000000333333334444444455000002bc0000000000004b30"), 0x03b6e1), (hex!("0100000000333333334444444455000002bd0000000000002be0"), 0x03b781), (hex!("0100000000333333334444444455000002bd0000000000005e10"), 0x03b821), (hex!("0100000000333333334444444455000002be0000000000002bf0"), 0x03b8c1), (hex!("0100000000333333334444444455000002bf0000000000002c00"), 0x03b961), (hex!("0100000000333333334444444455000002c00000000000002c10"), 0x03ba01), (hex!("0100000000333333334444444455000002c10000000000002c20"), 0x03baa1), (hex!("0100000000333333334444444455000002c10000000000003ef0"), 0x03bb41), (hex!("0100000000333333334444444455000002c20000000000002c30"), 0x03bbe1), (hex!("0100000000333333334444444455000002c200000000000056e0"), 0x03bc81), (hex!("0100000000333333334444444455000002c30000000000002c40"), 0x03bd21), 
(hex!("0100000000333333334444444455000002c30000000000004b60"), 0x03bdc1), (hex!("0100000000333333334444444455000002c40000000000002c50"), 0x03be61), (hex!("0100000000333333334444444455000002c400000000000045f0"), 0x03bf01), (hex!("0100000000333333334444444455000002c40000000000005290"), 0x03bfa1), (hex!("0100000000333333334444444455000002c50000000000002c60"), 0x03c041), (hex!("0100000000333333334444444455000002c60000000000002c70"), 0x03c0e1), (hex!("0100000000333333334444444455000002c60000000000006ae0"), 0x03c181), (hex!("0100000000333333334444444455000002c70000000000002c80"), 0x03c221), (hex!("0100000000333333334444444455000002c70000000000005680"), 0x03c2c1), (hex!("0100000000333333334444444455000002c70000000000006e10"), 0x03c361), (hex!("0100000000333333334444444455000002c80000000000002c90"), 0x03c401), (hex!("0100000000333333334444444455000002c90000000000002ca0"), 0x03c4a1), (hex!("0100000000333333334444444455000002ca0000000000002cb0"), 0x03c541), (hex!("0100000000333333334444444455000002cb0000000000002cc0"), 0x03c5e1), (hex!("0100000000333333334444444455000002cc0000000000002cd0"), 0x03c681), (hex!("0100000000333333334444444455000002cc0000000000005b50"), 0x03c721), (hex!("0100000000333333334444444455000002cd0000000000002ce0"), 0x03c7c1), (hex!("0100000000333333334444444455000002ce0000000000002cf0"), 0x03c861), (hex!("0100000000333333334444444455000002ce00000000000043f0"), 0x03c901), (hex!("0100000000333333334444444455000002ce0000000000006420"), 0x03c9a1), (hex!("0100000000333333334444444455000002cf0000000000002d00"), 0x03ca41), (hex!("0100000000333333334444444455000002d00000000000002d10"), 0x03cae1), (hex!("0100000000333333334444444455000002d10000000000002d20"), 0x03cb81), (hex!("0100000000333333334444444455000002d10000000000005370"), 0x03cc21), (hex!("0100000000333333334444444455000002d20000000000002d30"), 0x03ccc1), (hex!("0100000000333333334444444455000002d20000000000005ef0"), 0x03cd61), (hex!("0100000000333333334444444455000002d20000000000006570"), 0x03ce01), 
(hex!("0100000000333333334444444455000002d30000000000002d40"), 0x03cea1), (hex!("0100000000333333334444444455000002d30000000000007360"), 0x03cf41), (hex!("0100000000333333334444444455000002d40000000000002d50"), 0x03cfe1), (hex!("0100000000333333334444444455000002d400000000000079a0"), 0x03d081), (hex!("0100000000333333334444444455000002d50000000000002d60"), 0x03d121), (hex!("0100000000333333334444444455000002d50000000000004250"), 0x03d1c1), (hex!("0100000000333333334444444455000002d50000000000006050"), 0x03d261), (hex!("0100000000333333334444444455000002d60000000000002d70"), 0x03d301), (hex!("0100000000333333334444444455000002d60000000000007080"), 0x03d3a1), (hex!("0100000000333333334444444455000002d70000000000002d80"), 0x03d441), (hex!("0100000000333333334444444455000002d80000000000002d90"), 0x03d4e1), (hex!("0100000000333333334444444455000002d80000000000007110"), 0x03d581), (hex!("0100000000333333334444444455000002d800000000000073c0"), 0x03d621), (hex!("0100000000333333334444444455000002d800000000000075a0"), 0x03d6c1), (hex!("0100000000333333334444444455000002d90000000000002da0"), 0x03d761), (hex!("0100000000333333334444444455000002d90000000000004860"), 0x03d801), (hex!("0100000000333333334444444455000002d90000000000006b60"), 0x03d8a1), (hex!("0100000000333333334444444455000002da0000000000002db0"), 0x03d941), (hex!("0100000000333333334444444455000002da0000000000006630"), 0x03d9e1), (hex!("0100000000333333334444444455000002db0000000000002dc0"), 0x03da81), (hex!("0100000000333333334444444455000002dc0000000000002dd0"), 0x03db21), (hex!("0100000000333333334444444455000002dc0000000000004830"), 0x03dbc1), (hex!("0100000000333333334444444455000002dd0000000000002de0"), 0x03dc61), (hex!("0100000000333333334444444455000002de0000000000002df0"), 0x03dd01), (hex!("0100000000333333334444444455000002de0000000000004f00"), 0x03dda1), (hex!("0100000000333333334444444455000002df0000000000002e00"), 0x03de41), (hex!("0100000000333333334444444455000002e00000000000002e10"), 0x03dee1), 
(hex!("0100000000333333334444444455000002e10000000000002e20"), 0x03df81), (hex!("0100000000333333334444444455000002e10000000000006e90"), 0x03e021), (hex!("0100000000333333334444444455000002e20000000000002e30"), 0x03e0c1), (hex!("0100000000333333334444444455000002e200000000000053e0"), 0x03e161), (hex!("0100000000333333334444444455000002e30000000000002e40"), 0x03e201), (hex!("0100000000333333334444444455000002e30000000000006020"), 0x03e2a1), (hex!("0100000000333333334444444455000002e30000000000006540"), 0x03e341), (hex!("0100000000333333334444444455000002e40000000000002e50"), 0x03e3e1), (hex!("0100000000333333334444444455000002e50000000000002e60"), 0x03e481), (hex!("0100000000333333334444444455000002e50000000000005180"), 0x03e521), (hex!("0100000000333333334444444455000002e50000000000007bf0"), 0x03e5c1), (hex!("0100000000333333334444444455000002e60000000000002e70"), 0x03e661), (hex!("0100000000333333334444444455000002e60000000000005350"), 0x03e701), (hex!("0100000000333333334444444455000002e60000000000007960"), 0x03e7a1), (hex!("0100000000333333334444444455000002e70000000000002e80"), 0x03e841), (hex!("0100000000333333334444444455000002e80000000000002e90"), 0x03e8e1), (hex!("0100000000333333334444444455000002e90000000000002ea0"), 0x03e981), (hex!("0100000000333333334444444455000002ea0000000000002eb0"), 0x03ea21), (hex!("0100000000333333334444444455000002eb0000000000002ec0"), 0x03eac1), (hex!("0100000000333333334444444455000002ec0000000000002ed0"), 0x03eb61), (hex!("0100000000333333334444444455000002ec0000000000006c10"), 0x03ec01), (hex!("0100000000333333334444444455000002ed0000000000002ee0"), 0x03eca1), (hex!("0100000000333333334444444455000002ed0000000000005590"), 0x03ed41), (hex!("0100000000333333334444444455000002ed0000000000005cd0"), 0x03ede1), (hex!("0100000000333333334444444455000002ed0000000000006910"), 0x03ee81), (hex!("0100000000333333334444444455000002ee0000000000002ef0"), 0x03ef21), (hex!("0100000000333333334444444455000002ef0000000000002f00"), 0x03efc1), 
(hex!("0100000000333333334444444455000002ef0000000000004ed0"), 0x03f061), (hex!("0100000000333333334444444455000002f00000000000002f10"), 0x03f101), (hex!("0100000000333333334444444455000002f00000000000004cf0"), 0x03f1a1), (hex!("0100000000333333334444444455000002f00000000000005d10"), 0x03f241), (hex!("0100000000333333334444444455000002f00000000000006860"), 0x03f2e1), (hex!("0100000000333333334444444455000002f00000000000006b50"), 0x03f381), (hex!("0100000000333333334444444455000002f00000000000007100"), 0x03f421), (hex!("0100000000333333334444444455000002f00000000000007aa0"), 0x03f4c1), (hex!("0100000000333333334444444455000002f10000000000002f20"), 0x03f561), (hex!("0100000000333333334444444455000002f20000000000002f30"), 0x03f601), (hex!("0100000000333333334444444455000002f200000000000044b0"), 0x03f6a1), (hex!("0100000000333333334444444455000002f30000000000002f40"), 0x03f741), (hex!("0100000000333333334444444455000002f300000000000075b0"), 0x03f7e1), (hex!("0100000000333333334444444455000002f40000000000002f50"), 0x03f881), (hex!("0100000000333333334444444455000002f400000000000060f0"), 0x03f921), (hex!("0100000000333333334444444455000002f50000000000002f60"), 0x03f9c1), (hex!("0100000000333333334444444455000002f50000000000007210"), 0x03fa61), (hex!("0100000000333333334444444455000002f60000000000002f70"), 0x03fb01), (hex!("0100000000333333334444444455000002f60000000000006610"), 0x03fba1), (hex!("0100000000333333334444444455000002f70000000000002f80"), 0x03fc41), (hex!("0100000000333333334444444455000002f70000000000007560"), 0x03fce1), (hex!("0100000000333333334444444455000002f80000000000002f90"), 0x03fd81), (hex!("0100000000333333334444444455000002f80000000000006320"), 0x03fe21), (hex!("0100000000333333334444444455000002f90000000000002fa0"), 0x03fec1), (hex!("0100000000333333334444444455000002f90000000000006e50"), 0x03ff61), (hex!("0100000000333333334444444455000002fa0000000000002fb0"), 0x040001), (hex!("0100000000333333334444444455000002fb0000000000002fc0"), 0x0400a1), 
(hex!("0100000000333333334444444455000002fb0000000000004780"), 0x040141), (hex!("0100000000333333334444444455000002fc0000000000002fd0"), 0x0401e1), (hex!("0100000000333333334444444455000002fd0000000000002fe0"), 0x040281), (hex!("0100000000333333334444444455000002fd0000000000005600"), 0x040321), (hex!("0100000000333333334444444455000002fd0000000000006c00"), 0x0403c1), (hex!("0100000000333333334444444455000002fe0000000000002ff0"), 0x040461), (hex!("0100000000333333334444444455000002ff0000000000003000"), 0x040501), (hex!("0100000000333333334444444455000003000000000000003010"), 0x0405a1), (hex!("0100000000333333334444444455000003000000000000004080"), 0x040641), (hex!("0100000000333333334444444455000003010000000000003020"), 0x0406e1), (hex!("0100000000333333334444444455000003010000000000006340"), 0x040781), (hex!("0100000000333333334444444455000003020000000000003030"), 0x040821), (hex!("0100000000333333334444444455000003020000000000005b00"), 0x0408c1), (hex!("0100000000333333334444444455000003020000000000007b20"), 0x040961), (hex!("0100000000333333334444444455000003030000000000003040"), 0x040a01), (hex!("01000000003333333344444444550000030300000000000056b0"), 0x040aa1), (hex!("0100000000333333334444444455000003030000000000006280"), 0x040b41), (hex!("0100000000333333334444444455000003030000000000007ad0"), 0x040be1), (hex!("0100000000333333334444444455000003040000000000003050"), 0x040c81), (hex!("0100000000333333334444444455000003040000000000005c50"), 0x040d21), (hex!("0100000000333333334444444455000003050000000000003060"), 0x040dc1), (hex!("01000000003333333344444444550000030500000000000072e0"), 0x040e61), (hex!("0100000000333333334444444455000003060000000000003070"), 0x040f01), (hex!("0100000000333333334444444455000003060000000000004360"), 0x040fa1), (hex!("0100000000333333334444444455000003060000000000004380"), 0x041041), (hex!("0100000000333333334444444455000003060000000000004820"), 0x0410e1), (hex!("0100000000333333334444444455000003060000000000006d10"), 0x041181), 
(hex!("0100000000333333334444444455000003070000000000003080"), 0x041221), (hex!("0100000000333333334444444455000003070000000000004450"), 0x0412c1), (hex!("0100000000333333334444444455000003080000000000003090"), 0x041361), (hex!("0100000000333333334444444455000003080000000000005ad0"), 0x041401), (hex!("01000000003333333344444444550000030900000000000030a0"), 0x0414a1), (hex!("01000000003333333344444444550000030a00000000000030b0"), 0x041541), (hex!("01000000003333333344444444550000030a0000000000007760"), 0x0415e1), (hex!("01000000003333333344444444550000030b00000000000030c0"), 0x041681), (hex!("01000000003333333344444444550000030b0000000000007a80"), 0x041721), (hex!("01000000003333333344444444550000030c00000000000030d0"), 0x0417c1), (hex!("01000000003333333344444444550000030d00000000000030e0"), 0x041861), (hex!("01000000003333333344444444550000030d0000000000003eb0"), 0x041901), (hex!("01000000003333333344444444550000030e00000000000030f0"), 0x0419a1), (hex!("01000000003333333344444444550000030f0000000000003100"), 0x041a41), (hex!("01000000003333333344444444550000030f0000000000004690"), 0x041ae1), (hex!("01000000003333333344444444550000030f0000000000006900"), 0x041b81), (hex!("0100000000333333334444444455000003100000000000003110"), 0x041c21), (hex!("01000000003333333344444444550000031000000000000058a0"), 0x041cc1), (hex!("0100000000333333334444444455000003110000000000003120"), 0x041d61), (hex!("0100000000333333334444444455000003110000000000004200"), 0x041e01), (hex!("0100000000333333334444444455000003120000000000003130"), 0x041ea1), (hex!("0100000000333333334444444455000003130000000000003140"), 0x041f41), (hex!("0100000000333333334444444455000003130000000000004d50"), 0x041fe1), (hex!("0100000000333333334444444455000003130000000000005400"), 0x042081), (hex!("0100000000333333334444444455000003130000000000005520"), 0x042121), (hex!("0100000000333333334444444455000003140000000000003150"), 0x0421c1), (hex!("0100000000333333334444444455000003140000000000006450"), 0x042261), 
(hex!("0100000000333333334444444455000003150000000000003160"), 0x042301), (hex!("01000000003333333344444444550000031500000000000062d0"), 0x0423a1), (hex!("0100000000333333334444444455000003160000000000003170"), 0x042441), (hex!("0100000000333333334444444455000003160000000000004c40"), 0x0424e1), (hex!("0100000000333333334444444455000003160000000000007c80"), 0x042581), (hex!("0100000000333333334444444455000003170000000000003180"), 0x042621), (hex!("0100000000333333334444444455000003170000000000004400"), 0x0426c1), (hex!("0100000000333333334444444455000003170000000000005090"), 0x042761), (hex!("0100000000333333334444444455000003170000000000006cb0"), 0x042801), (hex!("0100000000333333334444444455000003180000000000003190"), 0x0428a1), (hex!("0100000000333333334444444455000003180000000000006560"), 0x042941), (hex!("01000000003333333344444444550000031900000000000031a0"), 0x0429e1), (hex!("01000000003333333344444444550000031900000000000052d0"), 0x042a81), (hex!("01000000003333333344444444550000031900000000000057e0"), 0x042b21), (hex!("01000000003333333344444444550000031a00000000000031b0"), 0x042bc1), (hex!("01000000003333333344444444550000031a00000000000071e0"), 0x042c61), (hex!("01000000003333333344444444550000031b00000000000031c0"), 0x042d01), (hex!("01000000003333333344444444550000031c00000000000031d0"), 0x042da1), (hex!("01000000003333333344444444550000031c0000000000004480"), 0x042e41), (hex!("01000000003333333344444444550000031c0000000000005790"), 0x042ee1), (hex!("01000000003333333344444444550000031c0000000000007be0"), 0x042f81), (hex!("01000000003333333344444444550000031d00000000000031e0"), 0x043021), (hex!("01000000003333333344444444550000031d0000000000005560"), 0x0430c1), (hex!("01000000003333333344444444550000031e00000000000031f0"), 0x043161), (hex!("01000000003333333344444444550000031f0000000000003200"), 0x043201), (hex!("01000000003333333344444444550000031f0000000000004190"), 0x0432a1), (hex!("0100000000333333334444444455000003200000000000003210"), 0x043341), 
(hex!("0100000000333333334444444455000003210000000000003220"), 0x0433e1), (hex!("0100000000333333334444444455000003220000000000003230"), 0x043481), (hex!("0100000000333333334444444455000003230000000000003240"), 0x043521), (hex!("01000000003333333344444444550000032300000000000069d0"), 0x0435c1), (hex!("0100000000333333334444444455000003240000000000003250"), 0x043661), (hex!("0100000000333333334444444455000003250000000000003260"), 0x043701), (hex!("01000000003333333344444444550000032500000000000042b0"), 0x0437a1), (hex!("01000000003333333344444444550000032500000000000064e0"), 0x043841), (hex!("0100000000333333334444444455000003260000000000003270"), 0x0438e1), (hex!("0100000000333333334444444455000003270000000000003280"), 0x043981), (hex!("0100000000333333334444444455000003270000000000005b20"), 0x043a21), (hex!("0100000000333333334444444455000003270000000000006330"), 0x043ac1), (hex!("0100000000333333334444444455000003270000000000006810"), 0x043b61), (hex!("0100000000333333334444444455000003280000000000003290"), 0x043c01), (hex!("01000000003333333344444444550000032900000000000032a0"), 0x043ca1), (hex!("01000000003333333344444444550000032900000000000056f0"), 0x043d41), (hex!("0100000000333333334444444455000003290000000000005e20"), 0x043de1), (hex!("0100000000333333334444444455000003290000000000005e70"), 0x043e81), (hex!("01000000003333333344444444550000032a00000000000032b0"), 0x043f21), (hex!("01000000003333333344444444550000032b00000000000032c0"), 0x043fc1), (hex!("01000000003333333344444444550000032b0000000000005500"), 0x044061), (hex!("01000000003333333344444444550000032b0000000000005a20"), 0x044101), (hex!("01000000003333333344444444550000032c00000000000032d0"), 0x0441a1), (hex!("01000000003333333344444444550000032c0000000000004060"), 0x044241), (hex!("01000000003333333344444444550000032c0000000000004760"), 0x0442e1), (hex!("01000000003333333344444444550000032d00000000000032e0"), 0x044381), (hex!("01000000003333333344444444550000032d00000000000068a0"), 0x044421), 
(hex!("01000000003333333344444444550000032e00000000000032f0"), 0x0444c1), (hex!("01000000003333333344444444550000032f0000000000003300"), 0x044561), (hex!("0100000000333333334444444455000003300000000000003310"), 0x044601), (hex!("0100000000333333334444444455000003300000000000006e40"), 0x0446a1), (hex!("0100000000333333334444444455000003310000000000003320"), 0x044741), (hex!("0100000000333333334444444455000003310000000000004620"), 0x0447e1), (hex!("0100000000333333334444444455000003320000000000003330"), 0x044881), (hex!("0100000000333333334444444455000003330000000000003340"), 0x044921), (hex!("0100000000333333334444444455000003330000000000004b80"), 0x0449c1), (hex!("0100000000333333334444444455000003340000000000003350"), 0x044a61), (hex!("0100000000333333334444444455000003350000000000003360"), 0x044b01), (hex!("0100000000333333334444444455000003360000000000003370"), 0x044ba1), (hex!("0100000000333333334444444455000003370000000000003380"), 0x044c41), (hex!("0100000000333333334444444455000003380000000000003390"), 0x044ce1), (hex!("01000000003333333344444444550000033900000000000033a0"), 0x044d81), (hex!("0100000000333333334444444455000003390000000000006b90"), 0x044e21), (hex!("01000000003333333344444444550000033a00000000000033b0"), 0x044ec1), (hex!("01000000003333333344444444550000033a0000000000007420"), 0x044f61), (hex!("01000000003333333344444444550000033b00000000000033c0"), 0x045001), (hex!("01000000003333333344444444550000033b0000000000007620"), 0x0450a1), (hex!("01000000003333333344444444550000033c00000000000033d0"), 0x045141), (hex!("01000000003333333344444444550000033c0000000000006b30"), 0x0451e1), (hex!("01000000003333333344444444550000033d00000000000033e0"), 0x045281), (hex!("01000000003333333344444444550000033e00000000000033f0"), 0x045321), (hex!("01000000003333333344444444550000033e00000000000048b0"), 0x0453c1), (hex!("01000000003333333344444444550000033e0000000000004e70"), 0x045461), (hex!("01000000003333333344444444550000033f0000000000003400"), 0x045501), 
(hex!("01000000003333333344444444550000033f0000000000006380"), 0x0455a1), (hex!("0100000000333333334444444455000003400000000000003410"), 0x045641), (hex!("0100000000333333334444444455000003410000000000003420"), 0x0456e1), (hex!("0100000000333333334444444455000003410000000000006090"), 0x045781), (hex!("0100000000333333334444444455000003420000000000003430"), 0x045821), (hex!("01000000003333333344444444550000034200000000000073d0"), 0x0458c1), (hex!("0100000000333333334444444455000003430000000000003440"), 0x045961), (hex!("0100000000333333334444444455000003430000000000006370"), 0x045a01), (hex!("01000000003333333344444444550000034300000000000075c0"), 0x045aa1), (hex!("0100000000333333334444444455000003440000000000003450"), 0x045b41), (hex!("0100000000333333334444444455000003450000000000003460"), 0x045be1), (hex!("0100000000333333334444444455000003460000000000003470"), 0x045c81), (hex!("01000000003333333344444444550000034600000000000055f0"), 0x045d21), (hex!("0100000000333333334444444455000003470000000000003480"), 0x045dc1), (hex!("0100000000333333334444444455000003470000000000003fe0"), 0x045e61), (hex!("0100000000333333334444444455000003480000000000003490"), 0x045f01), (hex!("0100000000333333334444444455000003480000000000007990"), 0x045fa1), (hex!("01000000003333333344444444550000034900000000000034a0"), 0x046041), (hex!("0100000000333333334444444455000003490000000000004410"), 0x0460e1), (hex!("01000000003333333344444444550000034a00000000000034b0"), 0x046181), (hex!("01000000003333333344444444550000034a00000000000062a0"), 0x046221), (hex!("01000000003333333344444444550000034a0000000000007260"), 0x0462c1), (hex!("01000000003333333344444444550000034b00000000000034c0"), 0x046361), (hex!("01000000003333333344444444550000034b0000000000005760"), 0x046401), (hex!("01000000003333333344444444550000034b0000000000006200"), 0x0464a1), (hex!("01000000003333333344444444550000034c00000000000034d0"), 0x046541), (hex!("01000000003333333344444444550000034d00000000000034e0"), 0x0465e1), 
(hex!("01000000003333333344444444550000034e00000000000034f0"), 0x046681), (hex!("01000000003333333344444444550000034e0000000000007790"), 0x046721), (hex!("01000000003333333344444444550000034f0000000000003500"), 0x0467c1), (hex!("0100000000333333334444444455000003500000000000003510"), 0x046861), (hex!("0100000000333333334444444455000003510000000000003520"), 0x046901), (hex!("0100000000333333334444444455000003520000000000003530"), 0x0469a1), (hex!("01000000003333333344444444550000035200000000000056a0"), 0x046a41), (hex!("0100000000333333334444444455000003530000000000003540"), 0x046ae1), (hex!("0100000000333333334444444455000003540000000000003550"), 0x046b81), (hex!("01000000003333333344444444550000035400000000000047b0"), 0x046c21), (hex!("0100000000333333334444444455000003550000000000003560"), 0x046cc1), (hex!("0100000000333333334444444455000003550000000000004500"), 0x046d61), (hex!("0100000000333333334444444455000003560000000000003570"), 0x046e01), (hex!("0100000000333333334444444455000003560000000000004fc0"), 0x046ea1), (hex!("0100000000333333334444444455000003560000000000007160"), 0x046f41), (hex!("0100000000333333334444444455000003560000000000007400"), 0x046fe1), (hex!("0100000000333333334444444455000003570000000000003580"), 0x047081), (hex!("0100000000333333334444444455000003580000000000003590"), 0x047121), (hex!("0100000000333333334444444455000003580000000000005a80"), 0x0471c1), (hex!("01000000003333333344444444550000035900000000000035a0"), 0x047261), (hex!("01000000003333333344444444550000035900000000000073b0"), 0x047301), (hex!("01000000003333333344444444550000035a00000000000035b0"), 0x0473a1), (hex!("01000000003333333344444444550000035a0000000000004c20"), 0x047441), (hex!("01000000003333333344444444550000035b00000000000035c0"), 0x0474e1), (hex!("01000000003333333344444444550000035b0000000000005120"), 0x047581), (hex!("01000000003333333344444444550000035c00000000000035d0"), 0x047621), (hex!("01000000003333333344444444550000035c0000000000004300"), 0x0476c1), 
(hex!("01000000003333333344444444550000035c0000000000005a40"), 0x047761), (hex!("01000000003333333344444444550000035c0000000000006620"), 0x047801), (hex!("01000000003333333344444444550000035c0000000000006ed0"), 0x0478a1), (hex!("01000000003333333344444444550000035d00000000000035e0"), 0x047941), (hex!("01000000003333333344444444550000035d0000000000005df0"), 0x0479e1), (hex!("01000000003333333344444444550000035e00000000000035f0"), 0x047a81), (hex!("01000000003333333344444444550000035f0000000000003600"), 0x047b21), (hex!("01000000003333333344444444550000035f00000000000058d0"), 0x047bc1), (hex!("0100000000333333334444444455000003600000000000003610"), 0x047c61), (hex!("0100000000333333334444444455000003600000000000007b90"), 0x047d01), (hex!("0100000000333333334444444455000003610000000000003620"), 0x047da1), (hex!("0100000000333333334444444455000003610000000000006ad0"), 0x047e41), (hex!("0100000000333333334444444455000003620000000000003630"), 0x047ee1), (hex!("01000000003333333344444444550000036200000000000063a0"), 0x047f81), (hex!("0100000000333333334444444455000003630000000000003640"), 0x048021), (hex!("0100000000333333334444444455000003630000000000007250"), 0x0480c1), (hex!("0100000000333333334444444455000003640000000000003650"), 0x048161), (hex!("0100000000333333334444444455000003640000000000005510"), 0x048201), (hex!("0100000000333333334444444455000003640000000000007850"), 0x0482a1), (hex!("0100000000333333334444444455000003650000000000003660"), 0x048341), (hex!("0100000000333333334444444455000003660000000000003670"), 0x0483e1), (hex!("0100000000333333334444444455000003660000000000004650"), 0x048481), (hex!("01000000003333333344444444550000036600000000000050d0"), 0x048521), (hex!("0100000000333333334444444455000003660000000000006eb0"), 0x0485c1), (hex!("0100000000333333334444444455000003670000000000003680"), 0x048661), (hex!("01000000003333333344444444550000036700000000000071f0"), 0x048701), (hex!("0100000000333333334444444455000003680000000000003690"), 0x0487a1), 
(hex!("01000000003333333344444444550000036900000000000036a0"), 0x048841), (hex!("0100000000333333334444444455000003690000000000005c70"), 0x0488e1), (hex!("01000000003333333344444444550000036a00000000000036b0"), 0x048981), (hex!("01000000003333333344444444550000036a00000000000071b0"), 0x048a21), (hex!("01000000003333333344444444550000036b00000000000036c0"), 0x048ac1), (hex!("01000000003333333344444444550000036b0000000000004670"), 0x048b61), (hex!("01000000003333333344444444550000036c00000000000036d0"), 0x048c01), (hex!("01000000003333333344444444550000036c0000000000004750"), 0x048ca1), (hex!("01000000003333333344444444550000036c0000000000006fa0"), 0x048d41), (hex!("01000000003333333344444444550000036d00000000000036e0"), 0x048de1), (hex!("01000000003333333344444444550000036d0000000000003f70"), 0x048e81), (hex!("01000000003333333344444444550000036d0000000000004b90"), 0x048f21), (hex!("01000000003333333344444444550000036d00000000000057a0"), 0x048fc1), (hex!("01000000003333333344444444550000036e00000000000036f0"), 0x049061), (hex!("01000000003333333344444444550000036e00000000000075d0"), 0x049101), (hex!("01000000003333333344444444550000036f0000000000003700"), 0x0491a1), (hex!("0100000000333333334444444455000003700000000000003710"), 0x049241), (hex!("0100000000333333334444444455000003700000000000005aa0"), 0x0492e1), (hex!("0100000000333333334444444455000003710000000000003720"), 0x049381), (hex!("0100000000333333334444444455000003710000000000005130"), 0x049421), (hex!("0100000000333333334444444455000003710000000000006fc0"), 0x0494c1), (hex!("0100000000333333334444444455000003710000000000007b00"), 0x049561), (hex!("0100000000333333334444444455000003720000000000003730"), 0x049601), (hex!("01000000003333333344444444550000037200000000000054d0"), 0x0496a1), (hex!("0100000000333333334444444455000003730000000000003740"), 0x049741), (hex!("0100000000333333334444444455000003730000000000004220"), 0x0497e1), (hex!("0100000000333333334444444455000003740000000000003750"), 0x049881), 
(hex!("0100000000333333334444444455000003740000000000004720"), 0x049921), (hex!("0100000000333333334444444455000003750000000000003760"), 0x0499c1), (hex!("0100000000333333334444444455000003750000000000004110"), 0x049a61), (hex!("0100000000333333334444444455000003760000000000003770"), 0x049b01), (hex!("0100000000333333334444444455000003770000000000003780"), 0x049ba1), (hex!("0100000000333333334444444455000003780000000000003790"), 0x049c41), (hex!("0100000000333333334444444455000003780000000000004b40"), 0x049ce1), (hex!("0100000000333333334444444455000003780000000000005660"), 0x049d81), (hex!("0100000000333333334444444455000003780000000000005ea0"), 0x049e21), (hex!("01000000003333333344444444550000037900000000000037a0"), 0x049ec1), (hex!("01000000003333333344444444550000037a00000000000037b0"), 0x049f61), (hex!("01000000003333333344444444550000037b00000000000037c0"), 0x04a001), (hex!("01000000003333333344444444550000037c00000000000037d0"), 0x04a0a1), (hex!("01000000003333333344444444550000037c0000000000004340"), 0x04a141), (hex!("01000000003333333344444444550000037c0000000000005230"), 0x04a1e1), (hex!("01000000003333333344444444550000037d00000000000037e0"), 0x04a281), (hex!("01000000003333333344444444550000037d00000000000051e0"), 0x04a321), (hex!("01000000003333333344444444550000037e00000000000037f0"), 0x04a3c1), (hex!("01000000003333333344444444550000037e0000000000004090"), 0x04a461), (hex!("01000000003333333344444444550000037e0000000000005c20"), 0x04a501), (hex!("01000000003333333344444444550000037f0000000000003800"), 0x04a5a1), (hex!("0100000000333333334444444455000003800000000000003810"), 0x04a641), (hex!("0100000000333333334444444455000003800000000000007630"), 0x04a6e1), (hex!("0100000000333333334444444455000003810000000000003820"), 0x04a781), (hex!("0100000000333333334444444455000003820000000000003830"), 0x04a821), (hex!("0100000000333333334444444455000003820000000000004170"), 0x04a8c1), (hex!("0100000000333333334444444455000003830000000000003840"), 0x04a961), 
(hex!("0100000000333333334444444455000003840000000000003850"), 0x04aa01), (hex!("0100000000333333334444444455000003850000000000003860"), 0x04aaa1), (hex!("0100000000333333334444444455000003850000000000004180"), 0x04ab41), (hex!("0100000000333333334444444455000003850000000000005c90"), 0x04abe1), (hex!("0100000000333333334444444455000003850000000000005da0"), 0x04ac81), (hex!("0100000000333333334444444455000003850000000000006ff0"), 0x04ad21), (hex!("0100000000333333334444444455000003860000000000003870"), 0x04adc1), (hex!("01000000003333333344444444550000038600000000000065c0"), 0x04ae61), (hex!("0100000000333333334444444455000003870000000000003880"), 0x04af01), (hex!("0100000000333333334444444455000003870000000000007cc0"), 0x04afa1), (hex!("0100000000333333334444444455000003880000000000003890"), 0x04b041), (hex!("01000000003333333344444444550000038900000000000038a0"), 0x04b0e1), (hex!("01000000003333333344444444550000038a00000000000038b0"), 0x04b181), (hex!("01000000003333333344444444550000038a00000000000073e0"), 0x04b221), (hex!("01000000003333333344444444550000038b00000000000038c0"), 0x04b2c1), (hex!("01000000003333333344444444550000038c00000000000038d0"), 0x04b361), (hex!("01000000003333333344444444550000038d00000000000038e0"), 0x04b401), (hex!("01000000003333333344444444550000038d00000000000069f0"), 0x04b4a1), (hex!("01000000003333333344444444550000038d0000000000007680"), 0x04b541), (hex!("01000000003333333344444444550000038e00000000000038f0"), 0x04b5e1), (hex!("01000000003333333344444444550000038f0000000000003900"), 0x04b681), (hex!("01000000003333333344444444550000038f00000000000045b0"), 0x04b721), (hex!("01000000003333333344444444550000038f0000000000007180"), 0x04b7c1), (hex!("0100000000333333334444444455000003900000000000003910"), 0x04b861), (hex!("0100000000333333334444444455000003910000000000003920"), 0x04b901), (hex!("0100000000333333334444444455000003910000000000004a20"), 0x04b9a1), (hex!("0100000000333333334444444455000003920000000000003930"), 0x04ba41), 
(hex!("01000000003333333344444444550000039200000000000059b0"), 0x04bae1), (hex!("0100000000333333334444444455000003930000000000003940"), 0x04bb81), (hex!("0100000000333333334444444455000003930000000000006cc0"), 0x04bc21), (hex!("0100000000333333334444444455000003940000000000003950"), 0x04bcc1), (hex!("01000000003333333344444444550000039400000000000056c0"), 0x04bd61), (hex!("0100000000333333334444444455000003950000000000003960"), 0x04be01), (hex!("0100000000333333334444444455000003950000000000004cc0"), 0x04bea1), (hex!("0100000000333333334444444455000003950000000000007720"), 0x04bf41), (hex!("0100000000333333334444444455000003960000000000003970"), 0x04bfe1), (hex!("0100000000333333334444444455000003960000000000004da0"), 0x04c081), (hex!("0100000000333333334444444455000003960000000000004df0"), 0x04c121), (hex!("0100000000333333334444444455000003960000000000004f30"), 0x04c1c1), (hex!("01000000003333333344444444550000039600000000000050f0"), 0x04c261), (hex!("0100000000333333334444444455000003960000000000007940"), 0x04c301), (hex!("0100000000333333334444444455000003970000000000003980"), 0x04c3a1), (hex!("0100000000333333334444444455000003970000000000005850"), 0x04c441), (hex!("0100000000333333334444444455000003970000000000007bd0"), 0x04c4e1), (hex!("0100000000333333334444444455000003980000000000003990"), 0x04c581), (hex!("0100000000333333334444444455000003980000000000004c00"), 0x04c621), (hex!("0100000000333333334444444455000003980000000000005580"), 0x04c6c1), (hex!("01000000003333333344444444550000039900000000000039a0"), 0x04c761), (hex!("0100000000333333334444444455000003990000000000005820"), 0x04c801), (hex!("01000000003333333344444444550000039a00000000000039b0"), 0x04c8a1), (hex!("01000000003333333344444444550000039b00000000000039c0"), 0x04c941), (hex!("01000000003333333344444444550000039b0000000000004c10"), 0x04c9e1), (hex!("01000000003333333344444444550000039b0000000000006460"), 0x04ca81), (hex!("01000000003333333344444444550000039c00000000000039d0"), 0x04cb21), 
(hex!("01000000003333333344444444550000039d00000000000039e0"), 0x04cbc1), (hex!("01000000003333333344444444550000039d00000000000044c0"), 0x04cc61), (hex!("01000000003333333344444444550000039d00000000000049e0"), 0x04cd01), (hex!("01000000003333333344444444550000039e00000000000039f0"), 0x04cda1), (hex!("01000000003333333344444444550000039f0000000000003a00"), 0x04ce41), (hex!("0100000000333333334444444455000003a00000000000003a10"), 0x04cee1), (hex!("0100000000333333334444444455000003a10000000000003a20"), 0x04cf81), (hex!("0100000000333333334444444455000003a10000000000006a80"), 0x04d021), (hex!("0100000000333333334444444455000003a20000000000003a30"), 0x04d0c1), (hex!("0100000000333333334444444455000003a200000000000062b0"), 0x04d161), (hex!("0100000000333333334444444455000003a30000000000003a40"), 0x04d201), (hex!("0100000000333333334444444455000003a30000000000006ce0"), 0x04d2a1), (hex!("0100000000333333334444444455000003a40000000000003a50"), 0x04d341), (hex!("0100000000333333334444444455000003a50000000000003a60"), 0x04d3e1), (hex!("0100000000333333334444444455000003a60000000000003a70"), 0x04d481), (hex!("0100000000333333334444444455000003a60000000000007750"), 0x04d521), (hex!("0100000000333333334444444455000003a70000000000003a80"), 0x04d5c1), (hex!("0100000000333333334444444455000003a70000000000005b10"), 0x04d661), (hex!("0100000000333333334444444455000003a80000000000003a90"), 0x04d701), (hex!("0100000000333333334444444455000003a80000000000006c20"), 0x04d7a1), (hex!("0100000000333333334444444455000003a90000000000003aa0"), 0x04d841), (hex!("0100000000333333334444444455000003a90000000000005b70"), 0x04d8e1), (hex!("0100000000333333334444444455000003a900000000000070e0"), 0x04d981), (hex!("0100000000333333334444444455000003aa0000000000003ab0"), 0x04da21), (hex!("0100000000333333334444444455000003aa00000000000049f0"), 0x04dac1), (hex!("0100000000333333334444444455000003aa0000000000004d60"), 0x04db61), (hex!("0100000000333333334444444455000003ab0000000000003ac0"), 0x04dc01), 
(hex!("0100000000333333334444444455000003ac0000000000003ad0"), 0x04dca1), (hex!("0100000000333333334444444455000003ac0000000000004580"), 0x04dd41), (hex!("0100000000333333334444444455000003ad0000000000003ae0"), 0x04dde1), (hex!("0100000000333333334444444455000003ae0000000000003af0"), 0x04de81), (hex!("0100000000333333334444444455000003af0000000000003b00"), 0x04df21), (hex!("0100000000333333334444444455000003b00000000000003b10"), 0x04dfc1), (hex!("0100000000333333334444444455000003b10000000000003b20"), 0x04e061), (hex!("0100000000333333334444444455000003b10000000000003fd0"), 0x04e101), (hex!("0100000000333333334444444455000003b20000000000003b30"), 0x04e1a1), (hex!("0100000000333333334444444455000003b30000000000003b40"), 0x04e241), (hex!("0100000000333333334444444455000003b40000000000003b50"), 0x04e2e1), (hex!("0100000000333333334444444455000003b40000000000007450"), 0x04e381), (hex!("0100000000333333334444444455000003b50000000000003b60"), 0x04e421), (hex!("0100000000333333334444444455000003b60000000000003b70"), 0x04e4c1), (hex!("0100000000333333334444444455000003b70000000000003b80"), 0x04e561), (hex!("0100000000333333334444444455000003b70000000000006d50"), 0x04e601), (hex!("0100000000333333334444444455000003b80000000000003b90"), 0x04e6a1), (hex!("0100000000333333334444444455000003b800000000000057c0"), 0x04e741), (hex!("0100000000333333334444444455000003b800000000000078a0"), 0x04e7e1), (hex!("0100000000333333334444444455000003b90000000000003ba0"), 0x04e881), (hex!("0100000000333333334444444455000003b90000000000006750"), 0x04e921), (hex!("0100000000333333334444444455000003ba0000000000003bb0"), 0x04e9c1), (hex!("0100000000333333334444444455000003ba0000000000007a10"), 0x04ea61), (hex!("0100000000333333334444444455000003ba0000000000007a20"), 0x04eb01), (hex!("0100000000333333334444444455000003bb0000000000003bc0"), 0x04eba1), (hex!("0100000000333333334444444455000003bb0000000000005bc0"), 0x04ec41), (hex!("0100000000333333334444444455000003bc0000000000003bd0"), 0x04ece1), 
(hex!("0100000000333333334444444455000003bc0000000000005e80"), 0x04ed81), (hex!("0100000000333333334444444455000003bc0000000000007ab0"), 0x04ee21), (hex!("0100000000333333334444444455000003bd0000000000003be0"), 0x04eec1), (hex!("0100000000333333334444444455000003bd00000000000049b0"), 0x04ef61), (hex!("0100000000333333334444444455000003be0000000000003bf0"), 0x04f001), (hex!("0100000000333333334444444455000003be0000000000005780"), 0x04f0a1), (hex!("0100000000333333334444444455000003be0000000000007930"), 0x04f141), (hex!("0100000000333333334444444455000003bf0000000000003c00"), 0x04f1e1), (hex!("0100000000333333334444444455000003bf0000000000005de0"), 0x04f281), (hex!("0100000000333333334444444455000003bf00000000000060b0"), 0x04f321), (hex!("0100000000333333334444444455000003bf00000000000060c0"), 0x04f3c1), (hex!("0100000000333333334444444455000003bf0000000000006a50"), 0x04f461), (hex!("0100000000333333334444444455000003c00000000000003c10"), 0x04f501), (hex!("0100000000333333334444444455000003c00000000000004030"), 0x04f5a1), (hex!("0100000000333333334444444455000003c10000000000003c20"), 0x04f641), (hex!("0100000000333333334444444455000003c20000000000003c30"), 0x04f6e1), (hex!("0100000000333333334444444455000003c200000000000040b0"), 0x04f781), (hex!("0100000000333333334444444455000003c30000000000003c40"), 0x04f821), (hex!("0100000000333333334444444455000003c40000000000003c50"), 0x04f8c1), (hex!("0100000000333333334444444455000003c40000000000005ba0"), 0x04f961), (hex!("0100000000333333334444444455000003c50000000000003c60"), 0x04fa01), (hex!("0100000000333333334444444455000003c60000000000003c70"), 0x04faa1), (hex!("0100000000333333334444444455000003c70000000000003c80"), 0x04fb41), (hex!("0100000000333333334444444455000003c70000000000004270"), 0x04fbe1), (hex!("0100000000333333334444444455000003c80000000000003c90"), 0x04fc81), (hex!("0100000000333333334444444455000003c80000000000006e70"), 0x04fd21), (hex!("0100000000333333334444444455000003c90000000000003ca0"), 0x04fdc1), 
(hex!("0100000000333333334444444455000003ca0000000000003cb0"), 0x04fe61), (hex!("0100000000333333334444444455000003ca0000000000006e20"), 0x04ff01), (hex!("0100000000333333334444444455000003ca0000000000007c20"), 0x04ffa1), (hex!("0100000000333333334444444455000003cb0000000000003cc0"), 0x050041), (hex!("0100000000333333334444444455000003cc0000000000003cd0"), 0x0500e1), (hex!("0100000000333333334444444455000003cc0000000000006120"), 0x050181), (hex!("0100000000333333334444444455000003cc0000000000007950"), 0x050221), (hex!("0100000000333333334444444455000003cd0000000000003ce0"), 0x0502c1), (hex!("0100000000333333334444444455000003ce0000000000003cf0"), 0x050361), (hex!("0100000000333333334444444455000003cf0000000000003d00"), 0x050401), (hex!("0100000000333333334444444455000003d00000000000003d10"), 0x0504a1), (hex!("0100000000333333334444444455000003d10000000000003d20"), 0x050541), (hex!("0100000000333333334444444455000003d10000000000005e50"), 0x0505e1), (hex!("0100000000333333334444444455000003d10000000000007880"), 0x050681), (hex!("0100000000333333334444444455000003d20000000000003d30"), 0x050721), (hex!("0100000000333333334444444455000003d20000000000005d00"), 0x0507c1), (hex!("0100000000333333334444444455000003d30000000000003d40"), 0x050861), (hex!("0100000000333333334444444455000003d30000000000005d40"), 0x050901), (hex!("0100000000333333334444444455000003d300000000000063f0"), 0x0509a1), (hex!("0100000000333333334444444455000003d40000000000003d50"), 0x050a41), (hex!("0100000000333333334444444455000003d40000000000005700"), 0x050ae1), (hex!("0100000000333333334444444455000003d400000000000078f0"), 0x050b81), (hex!("0100000000333333334444444455000003d50000000000003d60"), 0x050c21), (hex!("0100000000333333334444444455000003d60000000000003d70"), 0x050cc1), (hex!("0100000000333333334444444455000003d70000000000003d80"), 0x050d61), (hex!("0100000000333333334444444455000003d80000000000003d90"), 0x050e01), (hex!("0100000000333333334444444455000003d80000000000006690"), 0x050ea1), 
(hex!("0100000000333333334444444455000003d90000000000003da0"), 0x050f41), (hex!("0100000000333333334444444455000003d900000000000076d0"), 0x050fe1), (hex!("0100000000333333334444444455000003da0000000000003db0"), 0x051081), (hex!("0100000000333333334444444455000003db0000000000003dc0"), 0x051121), (hex!("0100000000333333334444444455000003db0000000000004a30"), 0x0511c1), (hex!("0100000000333333334444444455000003db0000000000005390"), 0x051261), (hex!("0100000000333333334444444455000003dc0000000000003dd0"), 0x051301), (hex!("0100000000333333334444444455000003dc0000000000006d60"), 0x0513a1), (hex!("0100000000333333334444444455000003dd0000000000003de0"), 0x051441), (hex!("0100000000333333334444444455000003de0000000000003df0"), 0x0514e1), (hex!("0100000000333333334444444455000003df0000000000003e00"), 0x051581), (hex!("0100000000333333334444444455000003df0000000000005240"), 0x051621), (hex!("0100000000333333334444444455000003df0000000000005610"), 0x0516c1), (hex!("0100000000333333334444444455000003e00000000000003e10"), 0x051761), (hex!("0100000000333333334444444455000003e00000000000006500"), 0x051801), (hex!("0100000000333333334444444455000003e10000000000003e20"), 0x0518a1), (hex!("0100000000333333334444444455000003e10000000000006a10"), 0x051941), (hex!("0100000000333333334444444455000003e10000000000007c10"), 0x0519e1), (hex!("0100000000333333334444444455000003e20000000000003e30"), 0x051a81), (hex!("0100000000333333334444444455000003e20000000000006310"), 0x051b21), (hex!("0100000000333333334444444455000003e30000000000003e40"), 0x051bc1), (hex!("0100000000333333334444444455000003e40000000000003e50"), 0x051c61), (hex!("0100000000333333334444444455000003e40000000000006780"), 0x051d01), (hex!("0100000000333333334444444455000003e40000000000007ce0"), 0x051da1), (hex!("0100000000333333334444444455000003e50000000000003e60"), 0x051e41), (hex!("0100000000333333334444444455000003e60000000000003e70"), 0x051ee1), (hex!("0100000000333333334444444455000003e60000000000005040"), 0x051f81), 
(hex!("0100000000333333334444444455000003e60000000000005bf0"), 0x052021), (hex!("0100000000333333334444444455000003e70000000000003e80"), 0x0520c1), (hex!("0100000000333333334444444455000003e70000000000003f50"), 0x052161), ]; ================================================ FILE: pageserver/src/tenant/ephemeral_file.rs ================================================ //! Implementation of append-only file data structure //! used to keep in-memory layers spilled on disk. use std::io; use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; use camino::Utf8PathBuf; use num_traits::Num; use pageserver_api::shard::TenantShardId; use tokio_epoll_uring::{BoundedBuf, Slice}; use tokio_util::sync::CancellationToken; use tracing::{error, info_span}; use utils::id::TimelineId; use utils::sync::gate::GateGuard; use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::page_cache; use crate::tenant::storage_layer::inmemory_layer::GlobalResourceUnits; use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; use crate::virtual_file::owned_buffers_io::write::{Buffer, FlushTaskError}; use crate::virtual_file::{self, IoBufferMut, TempVirtualFile, VirtualFile, owned_buffers_io}; use self::owned_buffers_io::write::OwnedAsyncWriter; pub struct EphemeralFile { _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, page_cache_file_id: page_cache::FileId, file: TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter, buffered_writer: tokio::sync::RwLock, bytes_written: AtomicU64, resource_units: std::sync::Mutex, } type BufferedWriter = owned_buffers_io::write::BufferedWriter< IoBufferMut, TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter, >; /// A TempVirtualFile that is co-owned by the [`EphemeralFile`] and [`BufferedWriter`]. 
/// /// (Actually [`BufferedWriter`] internally is just a client to a background flush task. /// The co-ownership is between [`EphemeralFile`] and that flush task.) /// /// Co-ownership allows us to serve reads for data that has already been flushed by the [`BufferedWriter`]. #[derive(Debug, Clone)] struct TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter { inner: Arc, } const TAIL_SZ: usize = 64 * 1024; impl EphemeralFile { pub async fn create( conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, gate: &utils::sync::gate::Gate, cancel: &CancellationToken, ctx: &RequestContext, ) -> anyhow::Result { // TempVirtualFile requires us to never reuse a filename while an old // instance of TempVirtualFile created with that filename is not done dropping yet. // So, we use a monotonic counter to disambiguate the filenames. static NEXT_TEMP_DISAMBIGUATOR: AtomicU64 = AtomicU64::new(1); let filename_disambiguator = NEXT_TEMP_DISAMBIGUATOR.fetch_add(1, std::sync::atomic::Ordering::Relaxed); let filename = conf .timeline_path(&tenant_shard_id, &timeline_id) .join(Utf8PathBuf::from(format!( "ephemeral-{filename_disambiguator}" ))); let file = TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter::new( VirtualFile::open_with_options_v2( &filename, virtual_file::OpenOptions::new() .create_new(true) .read(true) .write(true), ctx, ) .await?, gate.enter()?, ); let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore Ok(EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, page_cache_file_id, file: file.clone(), buffered_writer: tokio::sync::RwLock::new(BufferedWriter::new( file, 0, || IoBufferMut::with_capacity(TAIL_SZ), gate.enter()?, cancel.child_token(), ctx, info_span!(parent: None, "ephemeral_file_buffered_writer", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id, path = %filename), )), bytes_written: AtomicU64::new(0), 
resource_units: std::sync::Mutex::new(GlobalResourceUnits::new()), }) } } impl TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter { fn new(file: VirtualFile, gate_guard: GateGuard) -> Self { Self { inner: Arc::new(TempVirtualFile::new(file, gate_guard)), } } } impl OwnedAsyncWriter for TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter { fn write_all_at( &self, buf: owned_buffers_io::io_buf_ext::FullSlice, offset: u64, ctx: &RequestContext, ) -> impl std::future::Future< Output = ( owned_buffers_io::io_buf_ext::FullSlice, std::io::Result<()>, ), > + Send { self.inner.write_all_at(buf, offset, ctx) } fn set_len( &self, len: u64, ctx: &RequestContext, ) -> impl Future> + Send { self.inner.set_len(len, ctx) } } impl std::ops::Deref for TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter { type Target = VirtualFile; fn deref(&self) -> &Self::Target { &self.inner } } #[derive(Debug, thiserror::Error)] pub(crate) enum EphemeralFileWriteError { #[error("cancelled")] Cancelled, } impl EphemeralFile { pub(crate) fn len(&self) -> u64 { // TODO(vlad): The value returned here is not always correct if // we have more than one concurrent writer. Writes are always // sequenced, but we could grab the buffered writer lock if we wanted // to. self.bytes_written.load(Ordering::Acquire) } pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId { self.page_cache_file_id } pub(crate) async fn load_to_io_buf( &self, ctx: &RequestContext, ) -> Result { let size = self.len().into_usize(); let buf = IoBufferMut::with_capacity(size); let (slice, nread) = self.read_exact_at_eof_ok(0, buf.slice_full(), ctx).await?; assert_eq!(nread, size); let buf = slice.into_inner(); assert_eq!(buf.len(), nread); assert_eq!(buf.capacity(), size, "we shouldn't be reallocating"); Ok(buf) } /// Returns the offset at which the first byte of the input was written, for use /// in constructing indices over the written value. 
/// /// Panics if the write is short because there's no way we can recover from that. /// TODO: make upstack handle this as an error. pub(crate) async fn write_raw( &self, srcbuf: &[u8], ctx: &RequestContext, ) -> Result { let (pos, control) = self.write_raw_controlled(srcbuf, ctx).await?; if let Some(control) = control { control.release().await; } Ok(pos) } async fn write_raw_controlled( &self, srcbuf: &[u8], ctx: &RequestContext, ) -> Result<(u64, Option), EphemeralFileWriteError> { let mut writer = self.buffered_writer.write().await; let (nwritten, control) = writer .write_buffered_borrowed_controlled(srcbuf, ctx) .await .map_err(|e| match e { FlushTaskError::Cancelled => EphemeralFileWriteError::Cancelled, })?; assert_eq!( nwritten, srcbuf.len(), "buffered writer has no short writes" ); // There's no realistic risk of overflow here. We won't have exabytes sized files on disk. let pos = self .bytes_written .fetch_add(srcbuf.len().into_u64(), Ordering::AcqRel); let mut resource_units = self.resource_units.lock().unwrap(); resource_units.maybe_publish_size(self.bytes_written.load(Ordering::Relaxed)); Ok((pos, control)) } pub(crate) fn tick(&self) -> Option { let mut resource_units = self.resource_units.lock().unwrap(); let len = self.bytes_written.load(Ordering::Relaxed); resource_units.publish_size(len) } } impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile { async fn read_exact_at_eof_ok( &self, start: u64, mut dst: tokio_epoll_uring::Slice, ctx: &RequestContext, ) -> std::io::Result<(tokio_epoll_uring::Slice, usize)> { // We will fill the slice in back to front. Hence, we need // the slice to be fully initialized. // TODO(vlad): Is there a nicer way of doing this? dst.as_mut_rust_slice_full_zeroed(); let writer = self.buffered_writer.read().await; // Read bytes written while under lock. This is a hack to deal with concurrent // writes updating the number of bytes written. 
`bytes_written` is not DIO aligned // but we may end the read there. // // TODO(vlad): Feels like there's a nicer path where we align the end if it // shoots over the end of the file. let bytes_written = self.bytes_written.load(Ordering::Acquire); let dst_cap = dst.bytes_total().into_u64(); let end = { // saturating_add is correct here because the max file size is u64::MAX, so, // if start + dst.len() > u64::MAX, then we know it will be a short read let mut end: u64 = start.saturating_add(dst_cap); if end > bytes_written { end = bytes_written; } end }; let submitted_offset = writer.bytes_submitted(); let maybe_flushed = writer.inspect_maybe_flushed(); let mutable = match writer.inspect_mutable() { Some(mutable) => &mutable[0..mutable.pending()], None => { // Timeline::cancel and hence buffered writer flush was cancelled. // Remain read-available while timeline is shutting down. &[] } }; // inclusive, exclusive #[derive(Debug)] struct Range(N, N); impl Range { fn len(&self) -> N { if self.0 > self.1 { N::zero() } else { self.1 - self.0 } } } let (written_range, maybe_flushed_range) = { if maybe_flushed.is_some() { // [ written ][ maybe_flushed ][ mutable ] // ^ // `submitted_offset` // <++++++ on disk +++++++????????????????> ( Range( start, std::cmp::min(end, submitted_offset.saturating_sub(TAIL_SZ as u64)), ), Range( std::cmp::max(start, submitted_offset.saturating_sub(TAIL_SZ as u64)), std::cmp::min(end, submitted_offset), ), ) } else { // [ written ][ mutable ] // ^ // `submitted_offset` // <++++++ on disk +++++++++++++++++++++++> ( Range(start, std::cmp::min(end, submitted_offset)), // zero len Range(submitted_offset, u64::MIN), ) } }; let mutable_range = Range(std::cmp::max(start, submitted_offset), end); // There are three sources from which we might have to read data: // 1. The file itself // 2. The buffer which contains changes currently being flushed // 3. 
The buffer which contains changes yet to be flushed // // For better concurrency, we do them in reverse order: perform the in-memory // reads while holding the writer lock, drop the writer lock and read from the // file if required. let dst = if mutable_range.len() > 0 { let offset_in_buffer = mutable_range .0 .checked_sub(submitted_offset) .unwrap() .into_usize(); let to_copy = &mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())]; let bounds = dst.bounds(); let mut view = dst.slice({ let start = written_range.len().into_usize() + maybe_flushed_range.len().into_usize(); let end = start.checked_add(mutable_range.len().into_usize()).unwrap(); start..end }); view.as_mut_rust_slice_full_zeroed() .copy_from_slice(to_copy); Slice::from_buf_bounds(Slice::into_inner(view), bounds) } else { dst }; let dst = if maybe_flushed_range.len() > 0 { let offset_in_buffer = maybe_flushed_range .0 .checked_sub(submitted_offset.saturating_sub(TAIL_SZ as u64)) .unwrap() .into_usize(); // Checked previously the buffer is Some. let maybe_flushed = maybe_flushed.unwrap(); let to_copy = &maybe_flushed [offset_in_buffer..(offset_in_buffer + maybe_flushed_range.len().into_usize())]; let bounds = dst.bounds(); let mut view = dst.slice({ let start = written_range.len().into_usize(); let end = start .checked_add(maybe_flushed_range.len().into_usize()) .unwrap(); start..end }); view.as_mut_rust_slice_full_zeroed() .copy_from_slice(to_copy); Slice::from_buf_bounds(Slice::into_inner(view), bounds) } else { dst }; drop(writer); let dst = if written_range.len() > 0 { let bounds = dst.bounds(); let slice = self .file .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx) .await?; Slice::from_buf_bounds(Slice::into_inner(slice), bounds) } else { dst }; // TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs Ok((dst, (end - start).into_usize())) } } /// Does the given filename look like an ephemeral file? 
pub fn is_ephemeral_file(filename: &str) -> bool { if let Some(rest) = filename.strip_prefix("ephemeral-") { rest.parse::().is_ok() } else { false } } #[cfg(test)] mod tests { use std::fs; use std::str::FromStr; use rand::Rng; use super::*; use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; fn harness( test_name: &str, ) -> Result< ( &'static PageServerConf, TenantShardId, TimelineId, RequestContext, ), io::Error, > { let repo_dir = PageServerConf::test_repo_dir(test_name); let _ = fs::remove_dir_all(&repo_dir); let conf = PageServerConf::dummy_conf(repo_dir); // Make a static copy of the config. This can never be free'd, but that's // OK in a test. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); let tenant_shard_id = TenantShardId::from_str("11000000000000000000000000000000").unwrap(); let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id))?; let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); Ok((conf, tenant_shard_id, timeline_id, ctx)) } #[tokio::test] async fn ephemeral_file_holds_gate_open() { const FOREVER: std::time::Duration = std::time::Duration::from_secs(5); let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_file_holds_gate_open").unwrap(); let gate = utils::sync::gate::Gate::default(); let cancel = CancellationToken::new(); let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx) .await .unwrap(); let mut closing = tokio::task::spawn(async move { gate.close().await; }); // gate is entered until the ephemeral file is dropped // do not start paused tokio-epoll-uring has a sleep loop tokio::time::pause(); tokio::time::timeout(FOREVER, &mut closing) .await .expect_err("closing cannot complete before dropping"); // this is a requirement of the reset_tenant functionality: we have to be able to restart a // tenant fast, and for that, we need all 
tenant_dir operations be guarded by entering a gate drop(file); tokio::time::timeout(FOREVER, &mut closing) .await .expect("closing completes right away") .expect("closing does not panic"); } #[tokio::test] async fn test_ephemeral_file_basics() { let (conf, tenant_id, timeline_id, ctx) = harness("test_ephemeral_file_basics").unwrap(); let gate = utils::sync::gate::Gate::default(); let cancel = CancellationToken::new(); let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx) .await .unwrap(); let writer = file.buffered_writer.read().await; let mutable = writer.mutable(); let cap = mutable.capacity(); let align = mutable.align(); drop(writer); let write_nbytes = cap * 2 + cap / 2; let content: Vec = rand::rng() .sample_iter(rand::distr::StandardUniform) .take(write_nbytes) .collect(); let mut value_offsets = Vec::new(); for range in (0..write_nbytes) .step_by(align) .map(|start| start..(start + align).min(write_nbytes)) { let off = file.write_raw(&content[range], &ctx).await.unwrap(); value_offsets.push(off); } assert_eq!(file.len() as usize, write_nbytes); for (i, range) in (0..write_nbytes) .step_by(align) .map(|start| start..(start + align).min(write_nbytes)) .enumerate() { assert_eq!(value_offsets[i], range.start.into_u64()); let buf = IoBufferMut::with_capacity(range.len()); let (buf_slice, nread) = file .read_exact_at_eof_ok(range.start.into_u64(), buf.slice_full(), &ctx) .await .unwrap(); let buf = buf_slice.into_inner(); assert_eq!(nread, range.len()); assert_eq!(&buf, &content[range]); } let file_contents = std::fs::read(file.file.path()).unwrap(); assert!(file_contents == content[0..cap * 2]); let writer = file.buffered_writer.read().await; let maybe_flushed_buffer_contents = writer.inspect_maybe_flushed().unwrap(); assert_eq!(&maybe_flushed_buffer_contents[..], &content[cap..cap * 2]); let mutable_buffer_contents = writer.mutable(); assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]); } #[tokio::test] async fn 
test_flushes_do_happen() { let (conf, tenant_id, timeline_id, ctx) = harness("test_flushes_do_happen").unwrap(); let gate = utils::sync::gate::Gate::default(); let cancel = CancellationToken::new(); let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx) .await .unwrap(); // mutable buffer and maybe_flushed buffer each has `cap` bytes. let writer = file.buffered_writer.read().await; let cap = writer.mutable().capacity(); drop(writer); let content: Vec = rand::rng() .sample_iter(rand::distr::StandardUniform) .take(cap * 2 + cap / 2) .collect(); file.write_raw(&content, &ctx).await.unwrap(); // assert the state is as this test expects it to be let load_io_buf_res = file.load_to_io_buf(&ctx).await.unwrap(); assert_eq!(&load_io_buf_res[..], &content[0..cap * 2 + cap / 2]); let md = file.file.path().metadata().unwrap(); assert_eq!( md.len(), 2 * cap.into_u64(), "buffered writer requires one write to be flushed if we write 2.5x buffer capacity" ); let writer = file.buffered_writer.read().await; assert_eq!( &writer.inspect_maybe_flushed().unwrap()[0..cap], &content[cap..cap * 2] ); assert_eq!( &writer.mutable()[0..cap / 2], &content[cap * 2..cap * 2 + cap / 2] ); } #[tokio::test] async fn test_read_split_across_file_and_buffer() { // This test exercises the logic on the read path that splits the logical read // into a read from the flushed part (= the file) and a copy from the buffered writer's buffer. 
// // This test builds on the assertions in test_flushes_do_happen let (conf, tenant_id, timeline_id, ctx) = harness("test_read_split_across_file_and_buffer").unwrap(); let gate = utils::sync::gate::Gate::default(); let cancel = CancellationToken::new(); let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx) .await .unwrap(); let writer = file.buffered_writer.read().await; let mutable = writer.mutable(); let cap = mutable.capacity(); let align = mutable.align(); drop(writer); let content: Vec = rand::rng() .sample_iter(rand::distr::StandardUniform) .take(cap * 2 + cap / 2) .collect(); let (_, control) = file.write_raw_controlled(&content, &ctx).await.unwrap(); let test_read = |start: usize, len: usize| { let file = &file; let ctx = &ctx; let content = &content; async move { let (buf, nread) = file .read_exact_at_eof_ok( start.into_u64(), IoBufferMut::with_capacity(len).slice_full(), ctx, ) .await .unwrap(); assert_eq!(nread, len); assert_eq!(&buf.into_inner(), &content[start..(start + len)]); } }; let test_read_all_offset_combinations = || { async move { test_read(align, align).await; // border onto edge of file test_read(cap - align, align).await; // read across file and buffer test_read(cap - align, 2 * align).await; // stay from start of maybe flushed buffer test_read(cap, align).await; // completely within maybe flushed buffer test_read(cap + align, align).await; // border onto edge of maybe flushed buffer. test_read(cap * 2 - align, align).await; // read across maybe flushed and mutable buffer test_read(cap * 2 - align, 2 * align).await; // read across three segments test_read(cap - align, cap + 2 * align).await; // completely within mutable buffer test_read(cap * 2 + align, align).await; } }; // completely within the file range assert!(align < cap, "test assumption"); assert!(cap % align == 0); // test reads at different flush stages. 
let not_started = control.unwrap().into_not_started(); test_read_all_offset_combinations().await; let in_progress = not_started.ready_to_flush(); test_read_all_offset_combinations().await; in_progress.wait_until_flush_is_done().await; test_read_all_offset_combinations().await; } } ================================================ FILE: pageserver/src/tenant/gc_block.rs ================================================ use std::collections::HashMap; use std::sync::Arc; use utils::id::TimelineId; use super::remote_timeline_client::index::GcBlockingReason; type Storage = HashMap>; /// GcBlock provides persistent (per-timeline) gc blocking. #[derive(Default)] pub(crate) struct GcBlock { /// The timelines which have current reasons to block gc. /// /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done /// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`. reasons: std::sync::Mutex, /// GC background task or manually run `Tenant::gc_iteration` holds a lock on this. /// /// Do not add any more features taking and forbidding taking this lock. It should be /// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`] /// synchronizes with gc attempts by locking and unlocking this mutex. blocking: Arc>, } impl GcBlock { /// Start another gc iteration. /// /// Returns a guard to be held for the duration of gc iteration to allow synchronizing with /// it's ending, or if not currently possible, a value describing the reasons why not. /// /// Cancellation safe. pub(super) async fn start(&self) -> Result { let reasons = { let g = self.reasons.lock().unwrap(); // TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in // tests, we use everything. we should warn if the gc has been consecutively blocked // for more than 1h (within single tenant session?). 
BlockingReasons::clean_and_summarize(g) }; if let Some(reasons) = reasons { Err(reasons) } else { Ok(Guard { _inner: self.blocking.clone().lock_owned().await, }) } } /// Describe the current gc blocking reasons. /// /// TODO: make this json serializable. pub(crate) fn summary(&self) -> Option { let g = self.reasons.lock().unwrap(); BlockingReasons::summarize(&g) } /// Start blocking gc for this one timeline for the given reason. /// /// This is not a guard based API but instead it mimics set API. The returned future will not /// resolve until an existing gc round has completed. /// /// Returns true if this block was new, false if gc was already blocked for this reason. /// /// Cancellation safe: cancelling after first poll will keep the reason to block gc, but will /// keep the gc blocking reason. pub(crate) async fn insert( &self, timeline: &super::Timeline, reason: GcBlockingReason, ) -> anyhow::Result { let (added, uploaded) = { let mut g = self.reasons.lock().unwrap(); let set = g.entry(timeline.timeline_id).or_default(); let added = set.insert(reason); // LOCK ORDER: intentionally hold the lock, see self.reasons. let uploaded = timeline .remote_client .schedule_insert_gc_block_reason(reason)?; (added, uploaded) }; uploaded.await?; // ensure that any ongoing gc iteration has completed drop(self.blocking.lock().await); Ok(added) } /// Remove blocking gc for this one timeline and the given reason. 
pub(crate) async fn remove( &self, timeline: &super::Timeline, reason: GcBlockingReason, ) -> anyhow::Result<()> { use std::collections::hash_map::Entry; super::span::debug_assert_current_span_has_tenant_and_timeline_id(); let (remaining_blocks, uploaded) = { let mut g = self.reasons.lock().unwrap(); match g.entry(timeline.timeline_id) { Entry::Occupied(mut oe) => { let set = oe.get_mut(); set.remove(reason); if set.is_empty() { oe.remove(); } } Entry::Vacant(_) => { // we must still do the index_part.json update regardless, in case we had earlier // been cancelled } } let remaining_blocks = g.len(); // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons let uploaded = timeline .remote_client .schedule_remove_gc_block_reason(reason)?; (remaining_blocks, uploaded) }; uploaded.await?; // no need to synchronize with gc iteration again if remaining_blocks > 0 { tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked"); } else { tracing::info!("gc is now unblocked for the tenant"); } Ok(()) } pub(crate) fn before_delete(&self, timeline_id: &super::TimelineId) { let unblocked = { let mut g = self.reasons.lock().unwrap(); if g.is_empty() { return; } g.remove(timeline_id); BlockingReasons::clean_and_summarize(g).is_none() }; if unblocked { tracing::info!("gc is now unblocked following deletion"); } } /// Initialize with the non-deleted timelines of this tenant. 
pub(crate) fn set_scanned(&self, scanned: Storage) { let mut g = self.reasons.lock().unwrap(); assert!(g.is_empty()); g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty())); if let Some(reasons) = BlockingReasons::clean_and_summarize(g) { tracing::info!(summary=?reasons, "initialized with gc blocked"); } } } pub(crate) struct Guard { _inner: tokio::sync::OwnedMutexGuard<()>, } #[derive(Debug)] pub(crate) struct BlockingReasons { timelines: usize, reasons: enumset::EnumSet, } impl std::fmt::Display for BlockingReasons { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, "{} timelines block for {:?}", self.timelines, self.reasons ) } } impl BlockingReasons { fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option { let mut reasons = enumset::EnumSet::empty(); g.retain(|_key, value| { reasons = reasons.union(*value); !value.is_empty() }); if !g.is_empty() { Some(BlockingReasons { timelines: g.len(), reasons, }) } else { None } } fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option { if g.is_empty() { None } else { let reasons = g .values() .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next)); Some(BlockingReasons { timelines: g.len(), reasons, }) } } } ================================================ FILE: pageserver/src/tenant/gc_result.rs ================================================ use std::ops::AddAssign; use std::time::Duration; use anyhow::Result; use serde::Serialize; /// /// Result of performing GC /// #[derive(Default, Serialize, Debug)] pub struct GcResult { pub layers_total: u64, pub layers_needed_by_cutoff: u64, pub layers_needed_by_pitr: u64, pub layers_needed_by_branches: u64, pub layers_needed_by_leases: u64, pub layers_not_updated: u64, pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. #[serde(serialize_with = "serialize_duration_as_millis")] pub elapsed: Duration, /// The layers which were garbage collected. 
/// /// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be /// dropped in tests. #[cfg(feature = "testing")] #[serde(skip)] pub(crate) doomed_layers: Vec, } // helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds fn serialize_duration_as_millis(d: &Duration, serializer: S) -> Result where S: serde::Serializer, { d.as_millis().serialize(serializer) } impl AddAssign for GcResult { fn add_assign(&mut self, other: Self) { self.layers_total += other.layers_total; self.layers_needed_by_pitr += other.layers_needed_by_pitr; self.layers_needed_by_cutoff += other.layers_needed_by_cutoff; self.layers_needed_by_branches += other.layers_needed_by_branches; self.layers_needed_by_leases += other.layers_needed_by_leases; self.layers_not_updated += other.layers_not_updated; self.layers_removed += other.layers_removed; self.elapsed += other.elapsed; #[cfg(feature = "testing")] { let mut other = other; self.doomed_layers.append(&mut other.doomed_layers); } } } ================================================ FILE: pageserver/src/tenant/layer_map/historic_layer_coverage.rs ================================================ use std::collections::BTreeMap; use std::ops::Range; use tracing::info; use super::layer_coverage::LayerCoverageTuple; use crate::tenant::storage_layer::PersistentLayerDesc; /// Layers in this module are identified and indexed by this data. /// /// This is a helper struct to enable sorting layers by lsn.start. /// /// These three values are enough to uniquely identify a layer, since /// a layer is obligated to contain all contents within range, so two /// deltas (or images) with the same range have identical content. #[derive(Debug, PartialEq, Eq, Clone)] pub struct LayerKey { // TODO I use i128 and u64 because it was easy for prototyping, // testing, and benchmarking. If we can use the Lsn and Key // types without overhead that would be preferable. 
pub key: Range, pub lsn: Range, pub is_image: bool, } impl PartialOrd for LayerKey { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl Ord for LayerKey { fn cmp(&self, other: &Self) -> std::cmp::Ordering { // NOTE we really care about comparing by lsn.start first self.lsn .start .cmp(&other.lsn.start) .then(self.lsn.end.cmp(&other.lsn.end)) .then(self.key.start.cmp(&other.key.start)) .then(self.key.end.cmp(&other.key.end)) .then(self.is_image.cmp(&other.is_image)) } } impl From<&PersistentLayerDesc> for LayerKey { fn from(layer: &PersistentLayerDesc) -> Self { let kr = layer.get_key_range(); let lr = layer.get_lsn_range(); LayerKey { key: kr.start.to_i128()..kr.end.to_i128(), lsn: lr.start.0..lr.end.0, is_image: !layer.is_incremental(), } } } /// Efficiently queryable layer coverage for each LSN. /// /// Allows answering layer map queries very efficiently, /// but doesn't allow retroactive insertion, which is /// sometimes necessary. See BufferedHistoricLayerCoverage. pub struct HistoricLayerCoverage { /// The latest state head: LayerCoverageTuple, /// TODO: this could be an ordered vec using binary search. /// We push into this map everytime we add a layer, so might see some benefit /// All previous states historic: BTreeMap>, } impl Default for HistoricLayerCoverage { fn default() -> Self { Self::new() } } impl HistoricLayerCoverage { pub fn new() -> Self { Self { head: LayerCoverageTuple::default(), historic: BTreeMap::default(), } } /// Add a layer /// /// Panics if new layer has older lsn.start than an existing layer. /// See BufferedHistoricLayerCoverage for a more general insertion method. 
pub fn insert(&mut self, layer_key: LayerKey, value: Value) { // It's only a persistent map, not a retroactive one if let Some(last_entry) = self.historic.iter().next_back() { let last_lsn = last_entry.0; if layer_key.lsn.start < *last_lsn { panic!("unexpected retroactive insert"); } } // Insert into data structure let target = if layer_key.is_image { &mut self.head.image_coverage } else { &mut self.head.delta_coverage }; target.insert(layer_key.key, layer_key.lsn.clone(), value); // Remember history. Clone is O(1) self.historic.insert(layer_key.lsn.start, self.head.clone()); } /// Query at a particular LSN, inclusive pub fn get_version(&self, lsn: u64) -> Option<&LayerCoverageTuple> { match self.historic.range(..=lsn).next_back() { Some((_, v)) => Some(v), None => None, } } /// Remove all entries after a certain LSN (inclusive) pub fn trim(&mut self, begin: &u64) { self.historic.split_off(begin); self.head = self .historic .iter() .next_back() .map(|(_, v)| v.clone()) .unwrap_or_default(); } } /// This is the most basic test that demonstrates intended usage. /// All layers in this test have height 1. 
#[test] fn test_persistent_simple() { let mut map = HistoricLayerCoverage::::new(); map.insert( LayerKey { key: 0..5, lsn: 100..101, is_image: true, }, "Layer 1".to_string(), ); map.insert( LayerKey { key: 3..9, lsn: 110..111, is_image: true, }, "Layer 2".to_string(), ); map.insert( LayerKey { key: 5..6, lsn: 120..121, is_image: true, }, "Layer 3".to_string(), ); // After Layer 1 insertion let version = map.get_version(105).unwrap(); assert_eq!(version.image_coverage.query(1), Some("Layer 1".to_string())); assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); // After Layer 2 insertion let version = map.get_version(115).unwrap(); assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); assert_eq!(version.image_coverage.query(8), Some("Layer 2".to_string())); assert_eq!(version.image_coverage.query(11), None); // After Layer 3 insertion let version = map.get_version(125).unwrap(); assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); assert_eq!(version.image_coverage.query(5), Some("Layer 3".to_string())); assert_eq!(version.image_coverage.query(7), Some("Layer 2".to_string())); } /// Cover simple off-by-one edge cases #[test] fn test_off_by_one() { let mut map = HistoricLayerCoverage::::new(); map.insert( LayerKey { key: 3..5, lsn: 100..110, is_image: true, }, "Layer 1".to_string(), ); // Check different LSNs let version = map.get_version(99); assert!(version.is_none()); let version = map.get_version(100).unwrap(); assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); let version = map.get_version(110).unwrap(); assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); // Check different keys let version = map.get_version(105).unwrap(); assert_eq!(version.image_coverage.query(2), None); assert_eq!(version.image_coverage.query(3), Some("Layer 1".to_string())); assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); 
assert_eq!(version.image_coverage.query(5), None); } /// White-box regression test, checking for incorrect removal of node at key.end #[test] fn test_regression() { let mut map = HistoricLayerCoverage::::new(); map.insert( LayerKey { key: 0..5, lsn: 0..5, is_image: false, }, "Layer 1".to_string(), ); map.insert( LayerKey { key: 0..5, lsn: 1..2, is_image: false, }, "Layer 2".to_string(), ); // If an insertion operation improperly deletes the endpoint of a previous layer // (which is more likely to happen with layers that collide on key.end), we will // end up with an infinite layer, covering the entire keyspace. Here we assert // that there's no layer at key 100 because we didn't insert any layer there. let version = map.get_version(100).unwrap(); assert_eq!(version.delta_coverage.query(100), None); } /// Cover edge cases where layers begin or end on the same key #[test] fn test_key_collision() { let mut map = HistoricLayerCoverage::::new(); map.insert( LayerKey { key: 3..5, lsn: 100..110, is_image: true, }, "Layer 10".to_string(), ); map.insert( LayerKey { key: 5..8, lsn: 100..110, is_image: true, }, "Layer 11".to_string(), ); map.insert( LayerKey { key: 3..4, lsn: 200..210, is_image: true, }, "Layer 20".to_string(), ); // Check after layer 11 let version = map.get_version(105).unwrap(); assert_eq!(version.image_coverage.query(2), None); assert_eq!( version.image_coverage.query(3), Some("Layer 10".to_string()) ); assert_eq!( version.image_coverage.query(5), Some("Layer 11".to_string()) ); assert_eq!( version.image_coverage.query(7), Some("Layer 11".to_string()) ); assert_eq!(version.image_coverage.query(8), None); // Check after layer 20 let version = map.get_version(205).unwrap(); assert_eq!(version.image_coverage.query(2), None); assert_eq!( version.image_coverage.query(3), Some("Layer 20".to_string()) ); assert_eq!( version.image_coverage.query(5), Some("Layer 11".to_string()) ); assert_eq!( version.image_coverage.query(7), Some("Layer 11".to_string()) ); 
assert_eq!(version.image_coverage.query(8), None); } /// Test when rectangles have nontrivial height and possibly overlap #[test] fn test_persistent_overlapping() { let mut map = HistoricLayerCoverage::::new(); // Add 3 key-disjoint layers with varying LSN ranges map.insert( LayerKey { key: 1..2, lsn: 100..200, is_image: true, }, "Layer 1".to_string(), ); map.insert( LayerKey { key: 4..5, lsn: 110..200, is_image: true, }, "Layer 2".to_string(), ); map.insert( LayerKey { key: 7..8, lsn: 120..300, is_image: true, }, "Layer 3".to_string(), ); // Add wide and short layer map.insert( LayerKey { key: 0..9, lsn: 130..199, is_image: true, }, "Layer 4".to_string(), ); // Add wide layer taller than some map.insert( LayerKey { key: 0..9, lsn: 140..201, is_image: true, }, "Layer 5".to_string(), ); // Add wide layer taller than all map.insert( LayerKey { key: 0..9, lsn: 150..301, is_image: true, }, "Layer 6".to_string(), ); // After layer 4 insertion let version = map.get_version(135).unwrap(); assert_eq!(version.image_coverage.query(0), Some("Layer 4".to_string())); assert_eq!(version.image_coverage.query(1), Some("Layer 1".to_string())); assert_eq!(version.image_coverage.query(2), Some("Layer 4".to_string())); assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); assert_eq!(version.image_coverage.query(5), Some("Layer 4".to_string())); assert_eq!(version.image_coverage.query(7), Some("Layer 3".to_string())); assert_eq!(version.image_coverage.query(8), Some("Layer 4".to_string())); // After layer 5 insertion let version = map.get_version(145).unwrap(); assert_eq!(version.image_coverage.query(0), Some("Layer 5".to_string())); assert_eq!(version.image_coverage.query(1), Some("Layer 5".to_string())); assert_eq!(version.image_coverage.query(2), Some("Layer 5".to_string())); assert_eq!(version.image_coverage.query(4), Some("Layer 5".to_string())); assert_eq!(version.image_coverage.query(5), Some("Layer 5".to_string())); 
assert_eq!(version.image_coverage.query(7), Some("Layer 3".to_string())); assert_eq!(version.image_coverage.query(8), Some("Layer 5".to_string())); // After layer 6 insertion let version = map.get_version(155).unwrap(); assert_eq!(version.image_coverage.query(0), Some("Layer 6".to_string())); assert_eq!(version.image_coverage.query(1), Some("Layer 6".to_string())); assert_eq!(version.image_coverage.query(2), Some("Layer 6".to_string())); assert_eq!(version.image_coverage.query(4), Some("Layer 6".to_string())); assert_eq!(version.image_coverage.query(5), Some("Layer 6".to_string())); assert_eq!(version.image_coverage.query(7), Some("Layer 6".to_string())); assert_eq!(version.image_coverage.query(8), Some("Layer 6".to_string())); } /// Wrapper for HistoricLayerCoverage that allows us to hack around the lack /// of support for retroactive insertion by rebuilding the map since the /// change. /// /// Why is this needed? We most often insert new layers with newer LSNs, /// but during compaction we create layers with non-latest LSN, and during /// GC we delete historic layers. /// /// Even though rebuilding is an expensive (N log N) solution to the problem, /// it's not critical since we do something equally expensive just to decide /// whether or not to create new image layers. /// TODO It's not expensive but it's not great to hold a layer map write lock /// for that long. /// /// If this becomes an actual bottleneck, one solution would be to build a /// segment tree that holds PersistentLayerMaps. Though this would mean that /// we take an additional log(N) performance hit for queries, which will probably /// still be more critical. /// /// See this for more on persistent and retroactive techniques: /// pub struct BufferedHistoricLayerCoverage { /// A persistent layer map that we rebuild when we need to retroactively update historic_coverage: HistoricLayerCoverage, /// We buffer insertion into the PersistentLayerMap to decrease the number of rebuilds. 
buffer: BTreeMap>, /// All current layers. This is not used for search. Only to make rebuilds easier. // TODO: This map is never cleared. Rebuilds could use the post-trim last entry of // [`Self::historic_coverage`] instead of doubling memory usage. // [`Self::len`]: can require rebuild and serve from latest historic // [`Self::iter`]: already requires rebuild => can serve from latest historic layers: BTreeMap, } impl std::fmt::Debug for BufferedHistoricLayerCoverage { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("RetroactiveLayerMap") .field("buffer", &self.buffer) .field("layers", &self.layers) .finish() } } impl Default for BufferedHistoricLayerCoverage { fn default() -> Self { Self::new() } } impl BufferedHistoricLayerCoverage { pub fn new() -> Self { Self { historic_coverage: HistoricLayerCoverage::::new(), buffer: BTreeMap::new(), layers: BTreeMap::new(), } } pub fn insert(&mut self, layer_key: LayerKey, value: Value) { self.buffer.insert(layer_key, Some(value)); } pub fn remove(&mut self, layer_key: LayerKey) { self.buffer.insert(layer_key, None); } pub fn rebuild(&mut self) { // Find the first LSN that needs to be rebuilt let rebuild_since: u64 = match self.buffer.iter().next() { Some((LayerKey { lsn, .. }, _)) => lsn.start, None => return, // No need to rebuild if buffer is empty }; // Apply buffered updates to self.layers let num_updates = self.buffer.len(); self.buffer.retain(|layer_key, layer| { match layer { Some(l) => { self.layers.insert(layer_key.clone(), l.clone()); } None => { self.layers.remove(layer_key); } }; false }); // Rebuild let mut num_inserted = 0; self.historic_coverage.trim(&rebuild_since); for (layer_key, layer) in self.layers.range( LayerKey { lsn: rebuild_since..0, key: 0..0, is_image: false, }.., ) { self.historic_coverage .insert(layer_key.clone(), layer.clone()); num_inserted += 1; } // TODO maybe only warn if ratio is at least 10 info!( "Rebuilt layer map. 
Did {} insertions to process a batch of {} updates.", num_inserted, num_updates, ) } /// Iterate all the layers pub fn iter(&self) -> impl ExactSizeIterator { // NOTE we can actually perform this without rebuilding, // but it's not necessary for now. if !self.buffer.is_empty() { panic!("rebuild pls") } self.layers.values().cloned() } /// Return a reference to a queryable map, assuming all updates /// have already been processed using self.rebuild() pub fn get(&self) -> anyhow::Result<&HistoricLayerCoverage> { // NOTE we error here instead of implicitly rebuilding because // rebuilding is somewhat expensive. // TODO maybe implicitly rebuild and log/sentry an error? if !self.buffer.is_empty() { anyhow::bail!("rebuild required") } Ok(&self.historic_coverage) } pub(crate) fn len(&self) -> usize { self.layers.len() } } #[test] fn test_retroactive_regression_1() { let mut map = BufferedHistoricLayerCoverage::new(); map.insert( LayerKey { key: 0..21267647932558653966460912964485513215, lsn: 23761336..23761457, is_image: false, }, "sdfsdfs".to_string(), ); map.rebuild(); let version = map.get().unwrap().get_version(23761457).unwrap(); assert_eq!( version.delta_coverage.query(100), Some("sdfsdfs".to_string()) ); } #[test] fn test_retroactive_simple() { let mut map = BufferedHistoricLayerCoverage::new(); // Append some images in increasing LSN order map.insert( LayerKey { key: 0..5, lsn: 100..101, is_image: true, }, "Image 1".to_string(), ); map.insert( LayerKey { key: 3..9, lsn: 110..111, is_image: true, }, "Image 2".to_string(), ); map.insert( LayerKey { key: 4..6, lsn: 120..121, is_image: true, }, "Image 3".to_string(), ); map.insert( LayerKey { key: 8..9, lsn: 120..121, is_image: true, }, "Image 4".to_string(), ); // Add a delta layer out of order map.insert( LayerKey { key: 2..5, lsn: 105..106, is_image: false, }, "Delta 1".to_string(), ); // Rebuild so we can start querying map.rebuild(); { let map = map.get().expect("rebuilt"); let version = map.get_version(90); 
assert!(version.is_none()); let version = map.get_version(102).unwrap(); assert_eq!(version.image_coverage.query(4), Some("Image 1".to_string())); let version = map.get_version(107).unwrap(); assert_eq!(version.image_coverage.query(4), Some("Image 1".to_string())); assert_eq!(version.delta_coverage.query(4), Some("Delta 1".to_string())); let version = map.get_version(115).unwrap(); assert_eq!(version.image_coverage.query(4), Some("Image 2".to_string())); let version = map.get_version(125).unwrap(); assert_eq!(version.image_coverage.query(4), Some("Image 3".to_string())); } // Remove Image 3 map.remove(LayerKey { key: 4..6, lsn: 120..121, is_image: true, }); map.rebuild(); { // Check deletion worked let map = map.get().expect("rebuilt"); let version = map.get_version(125).unwrap(); assert_eq!(version.image_coverage.query(4), Some("Image 2".to_string())); assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string())); } } ================================================ FILE: pageserver/src/tenant/layer_map/layer_coverage.rs ================================================ use std::ops::Range; // NOTE the `im` crate has 20x more downloads and also has // persistent/immutable BTree. But it's bugged so rpds is a // better choice use rpds::RedBlackTreeMapSync; /// Data structure that can efficiently: /// - find the latest layer by lsn.end at a given key /// - iterate the latest layers in a key range /// - insert layers in non-decreasing lsn.start order /// /// For a detailed explanation and justification of this approach, see: /// /// /// NOTE The struct is parameterized over Value for easier /// testing, but in practice it's some sort of layer. pub struct LayerCoverage { /// For every change in coverage (as we sweep the key space) /// we store (lsn.end, value). /// /// NOTE We use an immutable/persistent tree so that we can keep historic /// versions of this coverage without cloning the whole thing and /// incurring quadratic memory cost. 
See HistoricLayerCoverage. /// /// NOTE We use the Sync version of the map because we want Self to /// be Sync. Using nonsync might be faster, if we can work with /// that. nodes: RedBlackTreeMapSync>, } impl Default for LayerCoverage { fn default() -> Self { Self::new() } } impl LayerCoverage { pub fn new() -> Self { Self { nodes: RedBlackTreeMapSync::default(), } } /// Helper function to subdivide the key range without changing any values /// /// This operation has no semantic effect by itself. It only helps us pin in /// place the part of the coverage we don't want to change when inserting. /// /// As an analogy, think of a polygon. If you add a vertex along one of the /// segments, the polygon is still the same, but it behaves differently when /// we move or delete one of the other points. /// /// Complexity: O(log N) fn add_node(&mut self, key: i128) { let value = match self.nodes.range(..=key).next_back() { Some((_, Some(v))) => Some(v.clone()), Some((_, None)) => None, None => None, }; self.nodes.insert_mut(key, value); } /// Insert a layer. /// /// Complexity: worst case O(N), in practice O(log N). See NOTE in implementation. pub fn insert(&mut self, key: Range, lsn: Range, value: Value) { // Add nodes at endpoints // // NOTE The order of lines is important. We add nodes at the start // and end of the key range **before updating any nodes** in order // to pin down the current coverage outside of the relevant key range. // Only the coverage inside the layer's key range should change. self.add_node(key.start); self.add_node(key.end); // Raise the height where necessary // // NOTE This loop is worst case O(N), but amortized O(log N) in the special // case when rectangles have no height. In practice I don't think we'll see // the kind of layer intersections needed to trigger O(N) behavior. The worst // case is N/2 horizontal layers overlapped with N/2 vertical layers in a // grid pattern. 
let mut to_update = Vec::new(); let mut to_remove = Vec::new(); let mut prev_covered = false; for (k, node) in self.nodes.range(key) { let needs_cover = match node { None => true, Some((h, _)) => h < &lsn.end, }; if needs_cover { match prev_covered { true => to_remove.push(*k), false => to_update.push(*k), } } prev_covered = needs_cover; } // TODO check if the nodes inserted at key.start and key.end are safe // to remove. It's fine to keep them but they could be redundant. for k in to_update { self.nodes.insert_mut(k, Some((lsn.end, value.clone()))); } for k in to_remove { self.nodes.remove_mut(&k); } } /// Get the latest (by lsn.end) layer at a given key /// /// Complexity: O(log N) pub fn query(&self, key: i128) -> Option { self.nodes .range(..=key) .next_back()? .1 .as_ref() .map(|(_, v)| v.clone()) } /// Iterate the changes in layer coverage in a given range. You will likely /// want to start with self.query(key.start), and then follow up with self.range /// /// Complexity: O(log N + result_size) pub fn range(&self, key: Range) -> impl '_ + Iterator)> { self.nodes .range(key) .map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone()))) } /// Returns an iterator which includes all coverage changes for layers that intersect /// with the provided range. pub fn range_overlaps( &self, key_range: &Range, ) -> impl Iterator)> + '_ where Value: Eq, { let first_change = self.query(key_range.start); match first_change { Some(change) => { // If the start of the range is covered, we have to deal with two cases: // 1. Start of the range is aligned with the start of a layer. // In this case the return of `self.range` will contain the layer which aligns with the start of the key range. // We advance said iterator to avoid duplicating the first change. // 2. Start of the range is not aligned with the start of a layer. 
let range = key_range.start..key_range.end; let mut range_coverage = self.range(range).peekable(); if range_coverage .peek() .is_some_and(|c| c.1.as_ref() == Some(&change)) { range_coverage.next(); } itertools::Either::Left( std::iter::once((key_range.start, Some(change))).chain(range_coverage), ) } None => { let range = key_range.start..key_range.end; let coverage = self.range(range); itertools::Either::Right(coverage) } } } /// O(1) clone pub fn clone(&self) -> Self { Self { nodes: self.nodes.clone(), } } } /// Image and delta coverage at a specific LSN. pub struct LayerCoverageTuple { pub image_coverage: LayerCoverage, pub delta_coverage: LayerCoverage, } impl Default for LayerCoverageTuple { fn default() -> Self { Self { image_coverage: LayerCoverage::default(), delta_coverage: LayerCoverage::default(), } } } impl LayerCoverageTuple { pub fn clone(&self) -> Self { Self { image_coverage: self.image_coverage.clone(), delta_coverage: self.delta_coverage.clone(), } } } ================================================ FILE: pageserver/src/tenant/layer_map.rs ================================================ //! //! The layer map tracks what layers exist in a timeline. //! //! When the timeline is first accessed, the server lists of all layer files //! in the timelines/ directory, and populates this map with //! ImageLayer and DeltaLayer structs corresponding to each file. When the first //! new WAL record is received, we create an InMemoryLayer to hold the incoming //! records. Now and then, in the checkpoint() function, the in-memory layer is //! are frozen, and it is split up into new image and delta layers and the //! corresponding files are written to disk. //! //! Design overview: //! //! The `search` method of the layer map is on the read critical path, so we've //! built an efficient data structure for fast reads, stored in `LayerMap::historic`. //! Other read methods are less critical but still impact performance of background tasks. //! //! 
This data structure relies on a persistent/immutable binary search tree. See the //! following lecture for an introduction //! Summary: A persistent/immutable BST (and persistent data structures in general) allows //! you to modify the tree in such a way that each modification creates a new "version" //! of the tree. When you modify it, you get a new version, but all previous versions are //! still accessible too. So if someone is still holding a reference to an older version, //! they continue to see the tree as it was then. The persistent BST stores all the //! different versions in an efficient way. //! //! Our persistent BST maintains a map of which layer file "covers" each key. It has only //! one dimension, the key. See `layer_coverage.rs`. We use the persistent/immutable property //! to handle the LSN dimension. //! //! To build the layer map, we insert each layer into the persistent BST in LSN.start order, //! starting from the oldest one. After each insertion, we grab a reference to that "version" //! of the tree, and store it in another tree, a BTreeMap keyed by the LSN. See //! `historic_layer_coverage.rs`. //! //! To search for a particular key-LSN pair, you first look up the right "version" in the //! BTreeMap. Then you search that version of the BST with the key. //! //! The persistent BST keeps all the versions, but there is no way to change the old versions //! afterwards. We can add layers as long as they have larger LSNs than any previous layer in //! the map, but if we need to remove a layer, or insert anything with an older LSN, we need //! to throw away most of the persistent BST and build a new one, starting from the oldest //! LSN. See [`LayerMap::flush_updates()`]. //!
mod historic_layer_coverage; mod layer_coverage; use std::collections::{BTreeMap, HashMap, VecDeque}; use std::iter::Peekable; use std::ops::Range; use std::sync::Arc; use std::time::Instant; use anyhow::Result; use historic_layer_coverage::BufferedHistoricLayerCoverage; pub use historic_layer_coverage::LayerKey; use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; use tokio::sync::watch; use utils::lsn::Lsn; use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc}; use crate::context::RequestContext; use crate::tenant::storage_layer::{InMemoryLayer, ReadableLayerWeak}; /// /// LayerMap tracks what layers exist on a timeline. /// pub struct LayerMap { // // 'open_layer' holds the current InMemoryLayer that is accepting new // records. If it is None, 'next_open_layer_at' will be set instead, indicating // where the start LSN of the next InMemoryLayer that is to be created. // pub open_layer: Option>, pub next_open_layer_at: Option, /// /// Frozen layers, if any. Frozen layers are in-memory layers that /// are no longer added to, but haven't been written out to disk /// yet. They contain WAL older than the current 'open_layer' or /// 'next_open_layer_at', but newer than any historic layer. /// The frozen layers are in order from oldest to newest, so that /// the newest one is in the 'back' of the VecDeque, and the oldest /// in the 'front'. /// pub frozen_layers: VecDeque>, /// Index of the historic layers optimized for search historic: BufferedHistoricLayerCoverage>, /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. /// /// NB: make sure to notify `watch_l0_deltas` on changes. l0_delta_layers: Vec>, /// Notifies about L0 delta layer changes, sending the current number of L0 layers. 
watch_l0_deltas: watch::Sender, } impl Default for LayerMap { fn default() -> Self { Self { open_layer: Default::default(), next_open_layer_at: Default::default(), frozen_layers: Default::default(), historic: Default::default(), l0_delta_layers: Default::default(), watch_l0_deltas: watch::channel(0).0, } } } /// The primary update API for the layer map. /// /// Batching historic layer insertions and removals is good for /// performance and this struct helps us do that correctly. #[must_use] pub struct BatchedUpdates<'a> { // While we hold this exclusive reference to the layer map the type checker // will prevent us from accidentally reading any unflushed updates. layer_map: &'a mut LayerMap, } /// Provide ability to batch more updates while hiding the read /// API so we don't accidentally read without flushing. impl BatchedUpdates<'_> { /// /// Insert an on-disk layer. /// // TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap` pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc) { self.layer_map.insert_historic_noflush(layer_desc) } /// /// Remove an on-disk layer from the map. /// /// This should be called when the corresponding file on disk has been deleted. /// pub fn remove_historic(&mut self, layer_desc: &PersistentLayerDesc) { self.layer_map.remove_historic_noflush(layer_desc) } // We will flush on drop anyway, but this method makes it // more explicit that there is some work being done. /// Apply all updates pub fn flush(self) { // Flush happens on drop } } // Ideally the flush() method should be called explicitly for more // controlled execution. But if we forget we'd rather flush on drop // than panic later or read without flushing. 
// // TODO maybe warn if flush hasn't explicitly been called impl Drop for BatchedUpdates<'_> { fn drop(&mut self) { self.layer_map.flush_updates(); } } /// Return value of LayerMap::search #[derive(Eq, PartialEq, Debug, Hash)] pub struct SearchResult { pub layer: ReadableLayerWeak, pub lsn_floor: Lsn, } /// Return value of [`LayerMap::range_search`] /// /// Contains a mapping from a layer description to a keyspace /// accumulator that contains all the keys which intersect the layer /// from the original search space. #[derive(Debug)] pub struct RangeSearchResult { pub found: HashMap, } impl RangeSearchResult { fn new() -> Self { Self { found: HashMap::new(), } } fn map_to_in_memory_layer( in_memory_layer: Option, range: Range, ) -> RangeSearchResult { match in_memory_layer { Some(inmem) => { let search_result = SearchResult { lsn_floor: inmem.get_lsn_range().start, layer: ReadableLayerWeak::InMemoryLayer(inmem), }; let mut accum = KeySpaceAccum::new(); accum.add_range(range); RangeSearchResult { found: HashMap::from([(search_result, accum)]), } } None => RangeSearchResult::new(), } } } /// Collector for results of range search queries on the LayerMap. /// It should be provided with two iterators for the delta and image coverage /// that contain all the changes for layers which intersect the range. 
struct RangeSearchCollector where Iter: Iterator>)>, { in_memory_layer: Option, delta_coverage: Peekable, image_coverage: Peekable, key_range: Range, end_lsn: Lsn, current_delta: Option>, current_image: Option>, result: RangeSearchResult, } #[derive(Debug)] enum NextLayerType { Delta(i128), Image(i128), Both(i128), } impl NextLayerType { fn next_change_at_key(&self) -> Key { match self { NextLayerType::Delta(at) => Key::from_i128(*at), NextLayerType::Image(at) => Key::from_i128(*at), NextLayerType::Both(at) => Key::from_i128(*at), } } } impl RangeSearchCollector where Iter: Iterator>)>, { fn new( key_range: Range, end_lsn: Lsn, in_memory_layer: Option, delta_coverage: Iter, image_coverage: Iter, ) -> Self { Self { in_memory_layer, delta_coverage: delta_coverage.peekable(), image_coverage: image_coverage.peekable(), key_range, end_lsn, current_delta: None, current_image: None, result: RangeSearchResult::new(), } } /// Run the collector. Collection is implemented via a two pointer algorithm. /// One pointer tracks the start of the current range and the other tracks /// the beginning of the next range which will overlap with the next change /// in coverage across both image and delta. fn collect(mut self) -> RangeSearchResult { let next_layer_type = self.choose_next_layer_type(); let mut current_range_start = match next_layer_type { None => { // No changes for the range self.pad_range(self.key_range.clone()); return self.result; } Some(layer_type) if self.key_range.end <= layer_type.next_change_at_key() => { // Changes only after the end of the range self.pad_range(self.key_range.clone()); return self.result; } Some(layer_type) => { // Changes for the range exist. 
let coverage_start = layer_type.next_change_at_key(); let range_before = self.key_range.start..coverage_start; self.pad_range(range_before); self.advance(&layer_type); coverage_start } }; while current_range_start < self.key_range.end { let next_layer_type = self.choose_next_layer_type(); match next_layer_type { Some(t) => { let current_range_end = t.next_change_at_key(); self.add_range(current_range_start..current_range_end); current_range_start = current_range_end; self.advance(&t); } None => { self.add_range(current_range_start..self.key_range.end); current_range_start = self.key_range.end; } } } self.result } /// Map a range which does not intersect any persistent layers to /// the in-memory layer candidate. fn pad_range(&mut self, key_range: Range) { if !key_range.is_empty() { if let Some(ref inmem) = self.in_memory_layer { let search_result = SearchResult { layer: ReadableLayerWeak::InMemoryLayer(inmem.clone()), lsn_floor: inmem.get_lsn_range().start, }; self.result .found .entry(search_result) .or_default() .add_range(key_range); } } } /// Select the appropiate layer for the given range and update /// the collector. fn add_range(&mut self, covered_range: Range) { let selected = LayerMap::select_layer( self.current_delta.clone(), self.current_image.clone(), self.in_memory_layer.clone(), self.end_lsn, ); match selected { Some(search_result) => self .result .found .entry(search_result) .or_default() .add_range(covered_range), None => self.pad_range(covered_range), } } /// Move to the next coverage change. 
fn advance(&mut self, layer_type: &NextLayerType) { match layer_type { NextLayerType::Delta(_) => { let (_, layer) = self.delta_coverage.next().unwrap(); self.current_delta = layer; } NextLayerType::Image(_) => { let (_, layer) = self.image_coverage.next().unwrap(); self.current_image = layer; } NextLayerType::Both(_) => { let (_, image_layer) = self.image_coverage.next().unwrap(); let (_, delta_layer) = self.delta_coverage.next().unwrap(); self.current_image = image_layer; self.current_delta = delta_layer; } } } /// Pick the next coverage change: the one at the lesser key or both if they're alligned. fn choose_next_layer_type(&mut self) -> Option { let next_delta_at = self.delta_coverage.peek().map(|(key, _)| key); let next_image_at = self.image_coverage.peek().map(|(key, _)| key); match (next_delta_at, next_image_at) { (None, None) => None, (Some(next_delta_at), None) => Some(NextLayerType::Delta(*next_delta_at)), (None, Some(next_image_at)) => Some(NextLayerType::Image(*next_image_at)), (Some(next_delta_at), Some(next_image_at)) if next_image_at < next_delta_at => { Some(NextLayerType::Image(*next_image_at)) } (Some(next_delta_at), Some(next_image_at)) if next_delta_at < next_image_at => { Some(NextLayerType::Delta(*next_delta_at)) } (Some(next_delta_at), Some(_)) => Some(NextLayerType::Both(*next_delta_at)), } } } #[derive(Debug, PartialEq, Eq, Clone, Hash)] pub struct InMemoryLayerDesc { handle: InMemoryLayerHandle, lsn_range: Range, } impl InMemoryLayerDesc { pub(crate) fn get_lsn_range(&self) -> Range { self.lsn_range.clone() } } #[derive(Debug, PartialEq, Eq, Clone, Hash)] enum InMemoryLayerHandle { Open, Frozen(usize), } impl LayerMap { /// /// Find the latest layer (by lsn.end) that covers the given /// 'key', with lsn.start < 'end_lsn'. /// /// The caller of this function is the page reconstruction /// algorithm looking for the next relevant delta layer, or /// the terminal image layer. 
The caller will pass the lsn_floor /// value as end_lsn in the next call to search. /// /// If there's an image layer exactly below the given end_lsn, /// search should return that layer regardless if there are /// overlapping deltas. /// /// If the latest layer is a delta and there is an overlapping /// image with it below, the lsn_floor returned should be right /// above that image so we don't skip it in the search. Otherwise /// the lsn_floor returned should be the bottom of the delta layer /// because we should make as much progress down the lsn axis /// as possible. It's fine if this way we skip some overlapping /// deltas, because the delta we returned would contain the same /// wal content. /// /// TODO: This API is convoluted and inefficient. If the caller /// makes N search calls, we'll end up finding the same latest /// image layer N times. We should either cache the latest image /// layer result, or simplify the api to `get_latest_image` and /// `get_latest_delta`, and only call `get_latest_image` once. /// pub fn search(&self, key: Key, end_lsn: Lsn) -> Option { let in_memory_layer = self.search_in_memory_layer(end_lsn); let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) { Some(version) => version, None => { return in_memory_layer.map(|desc| SearchResult { lsn_floor: desc.get_lsn_range().start, layer: ReadableLayerWeak::InMemoryLayer(desc), }); } }; let latest_delta = version.delta_coverage.query(key.to_i128()); let latest_image = version.image_coverage.query(key.to_i128()); Self::select_layer(latest_delta, latest_image, in_memory_layer, end_lsn) } /// Select a layer from three potential candidates (in-memory, delta and image layer). /// The candidates represent the first layer of each type which intersect a key range. /// /// Layer types have an in implicit priority (image > delta > in-memory). For instance, /// if we have the option of reading an LSN range from both an image and a delta, we /// should read from the image. 
fn select_layer( delta_layer: Option>, image_layer: Option>, in_memory_layer: Option, end_lsn: Lsn, ) -> Option { assert!(delta_layer.as_ref().is_none_or(|l| l.is_delta())); assert!(image_layer.as_ref().is_none_or(|l| !l.is_delta())); match (delta_layer, image_layer, in_memory_layer) { (None, None, None) => None, (None, Some(image), None) => { let lsn_floor = image.get_lsn_range().start; Some(SearchResult { layer: ReadableLayerWeak::PersistentLayer(image), lsn_floor, }) } (Some(delta), None, None) => { let lsn_floor = delta.get_lsn_range().start; Some(SearchResult { layer: ReadableLayerWeak::PersistentLayer(delta), lsn_floor, }) } (Some(delta), Some(image), None) => { let img_lsn = image.get_lsn_range().start; let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end; let image_exact_match = img_lsn + 1 == end_lsn; if image_is_newer || image_exact_match { Some(SearchResult { layer: ReadableLayerWeak::PersistentLayer(image), lsn_floor: img_lsn, }) } else { // If the delta overlaps with the image in the LSN dimension, do a partial // up to the image layer. let lsn_floor = std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); Some(SearchResult { layer: ReadableLayerWeak::PersistentLayer(delta), lsn_floor, }) } } (None, None, Some(inmem)) => { let lsn_floor = inmem.get_lsn_range().start; Some(SearchResult { layer: ReadableLayerWeak::InMemoryLayer(inmem), lsn_floor, }) } (None, Some(image), Some(inmem)) => { // If the in-memory layer overlaps with the image in the LSN dimension, do a partial // up to the image layer. 
let img_lsn = image.get_lsn_range().start; let image_is_newer = image.get_lsn_range().end >= inmem.get_lsn_range().end; let image_exact_match = img_lsn + 1 == end_lsn; if image_is_newer || image_exact_match { Some(SearchResult { layer: ReadableLayerWeak::PersistentLayer(image), lsn_floor: img_lsn, }) } else { let lsn_floor = std::cmp::max(inmem.get_lsn_range().start, image.get_lsn_range().start + 1); Some(SearchResult { layer: ReadableLayerWeak::InMemoryLayer(inmem), lsn_floor, }) } } (Some(delta), None, Some(inmem)) => { // Overlaps between delta and in-memory layers are not a valid // state, but we handle them here for completeness. let delta_end = delta.get_lsn_range().end; let delta_is_newer = delta_end >= inmem.get_lsn_range().end; let delta_exact_match = delta_end == end_lsn; if delta_is_newer || delta_exact_match { Some(SearchResult { lsn_floor: delta.get_lsn_range().start, layer: ReadableLayerWeak::PersistentLayer(delta), }) } else { // If the in-memory layer overlaps with the delta in the LSN dimension, do a partial // up to the delta layer. let lsn_floor = std::cmp::max(inmem.get_lsn_range().start, delta.get_lsn_range().end); Some(SearchResult { layer: ReadableLayerWeak::InMemoryLayer(inmem), lsn_floor, }) } } (Some(delta), Some(image), Some(inmem)) => { // Determine the preferred persistent layer without taking the in-memory layer // into consideration. let persistent_res = Self::select_layer(Some(delta.clone()), Some(image.clone()), None, end_lsn) .unwrap(); let persistent_l = match persistent_res.layer { ReadableLayerWeak::PersistentLayer(l) => l, ReadableLayerWeak::InMemoryLayer(_) => unreachable!(), }; // Now handle the in-memory layer overlaps. 
let inmem_res = if persistent_l.is_delta() { Self::select_layer(Some(persistent_l), None, Some(inmem.clone()), end_lsn) .unwrap() } else { Self::select_layer(None, Some(persistent_l), Some(inmem.clone()), end_lsn) .unwrap() }; Some(SearchResult { layer: inmem_res.layer, // Use the more restrictive LSN floor lsn_floor: std::cmp::max(persistent_res.lsn_floor, inmem_res.lsn_floor), }) } } } pub fn range_search(&self, key_range: Range, end_lsn: Lsn) -> RangeSearchResult { let in_memory_layer = self.search_in_memory_layer(end_lsn); let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) { Some(version) => version, None => { return RangeSearchResult::map_to_in_memory_layer(in_memory_layer, key_range); } }; let raw_range = key_range.start.to_i128()..key_range.end.to_i128(); let delta_changes = version.delta_coverage.range_overlaps(&raw_range); let image_changes = version.image_coverage.range_overlaps(&raw_range); let collector = RangeSearchCollector::new( key_range, end_lsn, in_memory_layer, delta_changes, image_changes, ); collector.collect() } /// Start a batch of updates, applied on drop pub fn batch_update(&mut self) -> BatchedUpdates<'_> { BatchedUpdates { layer_map: self } } /// /// Insert an on-disk layer /// /// Helper function for BatchedUpdates::insert_historic /// /// TODO(chi): remove L generic so that we do not need to pass layer object. pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) { // TODO: See #3869, resulting #4088, attempted fix and repro #4094 if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) { self.l0_delta_layers.push(layer_desc.clone().into()); self.watch_l0_deltas .send_replace(self.l0_delta_layers.len()); } self.historic.insert( historic_layer_coverage::LayerKey::from(&layer_desc), layer_desc.into(), ); } /// /// Remove an on-disk layer from the map. 
/// /// Helper function for BatchedUpdates::remove_historic /// pub fn remove_historic_noflush(&mut self, layer_desc: &PersistentLayerDesc) { self.historic .remove(historic_layer_coverage::LayerKey::from(layer_desc)); let layer_key = layer_desc.key(); if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) { let len_before = self.l0_delta_layers.len(); let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers); l0_delta_layers.retain(|other| other.key() != layer_key); self.l0_delta_layers = l0_delta_layers; self.watch_l0_deltas .send_replace(self.l0_delta_layers.len()); // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers, // there's a chance that the comparison fails at runtime due to it comparing (pointer, // vtable) pairs. assert_eq!( self.l0_delta_layers.len(), len_before - 1, "failed to locate removed historic layer from l0_delta_layers" ); } } /// Helper function for BatchedUpdates::drop. pub(self) fn flush_updates(&mut self) { self.historic.rebuild(); } /// Is there a newer image layer for given key- and LSN-range? Or a set /// of image layers within the specified lsn range that cover the entire /// specified key range? /// /// This is used for garbage collection, to determine if an old layer can /// be deleted. pub fn image_layer_exists(&self, key: &Range, lsn: &Range) -> bool { if key.is_empty() { // Vacuously true. There's a newer image for all 0 of the kerys in the range. 
return true; } let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) { Some(v) => v, None => return false, }; let start = key.start.to_i128(); let end = key.end.to_i128(); let layer_covers = |layer: Option>| match layer { Some(layer) => layer.get_lsn_range().start >= lsn.start, None => false, }; // Check the start is covered if !layer_covers(version.image_coverage.query(start)) { return false; } // Check after all changes of coverage for (_, change_val) in version.image_coverage.range(start..end) { if !layer_covers(change_val) { return false; } } true } pub fn iter_historic_layers(&self) -> impl ExactSizeIterator> { self.historic.iter() } /// Get a ref counted pointer for the first in memory layer that matches the provided predicate. pub(crate) fn search_in_memory_layer(&self, below: Lsn) -> Option { let is_below = |l: &Arc| { let start_lsn = l.get_lsn_range().start; below > start_lsn }; if let Some(open) = &self.open_layer { if is_below(open) { return Some(InMemoryLayerDesc { handle: InMemoryLayerHandle::Open, lsn_range: open.get_lsn_range(), }); } } self.frozen_layers .iter() .enumerate() .rfind(|(_idx, l)| is_below(l)) .map(|(idx, l)| InMemoryLayerDesc { handle: InMemoryLayerHandle::Frozen(idx), lsn_range: l.get_lsn_range(), }) } pub(crate) fn in_memory_layer(&self, desc: &InMemoryLayerDesc) -> Arc { match desc.handle { InMemoryLayerHandle::Open => self.open_layer.as_ref().unwrap().clone(), InMemoryLayerHandle::Frozen(idx) => self.frozen_layers[idx].clone(), } } /// /// Divide the whole given range of keys into sub-ranges based on the latest /// image layer that covers each range at the specified lsn (inclusive). /// This is used when creating new image layers. 
pub fn image_coverage( &self, key_range: &Range, lsn: Lsn, ) -> Vec<(Range, Option>)> { let version = match self.historic.get().unwrap().get_version(lsn.0) { Some(v) => v, None => return vec![], }; let start = key_range.start.to_i128(); let end = key_range.end.to_i128(); // Initialize loop variables let mut coverage: Vec<(Range, Option>)> = vec![]; let mut current_key = start; let mut current_val = version.image_coverage.query(start); // Loop through the change events and push intervals for (change_key, change_val) in version.image_coverage.range(start..end) { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); coverage.push((kr, current_val.take())); current_key = change_key; current_val.clone_from(&change_val); } // Add the final interval let kr = Key::from_i128(current_key)..Key::from_i128(end); coverage.push((kr, current_val.take())); coverage } /// Check if the key range resembles that of an L0 layer. pub fn is_l0(key_range: &Range, is_delta_layer: bool) -> bool { is_delta_layer && key_range == &(Key::MIN..Key::MAX) } /// This function determines which layers are counted in `count_deltas`: /// layers that should count towards deciding whether or not to reimage /// a certain partition range. /// /// There are two kinds of layers we currently consider reimage-worthy: /// /// Case 1: Non-L0 layers are currently reimage-worthy by default. /// TODO Some of these layers are very sparse and cover the entire key /// range. Replacing 256MB of data (or less!) with terabytes of /// images doesn't seem wise. 
We need a better heuristic, possibly /// based on some of these factors: /// a) whether this layer has any wal in this partition range /// b) the size of the layer /// c) the number of images needed to cover it /// d) the estimated time until we'll have to reimage over it for GC /// /// Case 2: Since L0 layers by definition cover the entire key space, we consider /// them reimage-worthy only when the entire key space can be covered by very few /// images (currently 1). /// TODO The optimal number should probably be slightly higher than 1, but to /// implement that we need to plumb a lot more context into this function /// than just the current partition_range. pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range) -> bool { // Case 1 if !Self::is_l0(&layer.key_range, layer.is_delta) { return true; } // Case 2 if partition_range == &(Key::MIN..Key::MAX) { return true; } false } /// Count the height of the tallest stack of reimage-worthy deltas /// in this 2d region. /// /// If `limit` is provided we don't try to count above that number. /// /// This number is used to compute the largest number of deltas that /// we'll need to visit for any page reconstruction in this region. /// We use this heuristic to decide whether to create an image layer. pub fn count_deltas(&self, key: &Range, lsn: &Range, limit: Option) -> usize { // We get the delta coverage of the region, and for each part of the coverage // we recurse right underneath the delta. The recursion depth is limited by // the largest result this function could return, which is in practice between // 3 and 10 (since we usually try to create an image when the number gets larger). 
if lsn.is_empty() || key.is_empty() || limit == Some(0) { return 0; } let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) { Some(v) => v, None => return 0, }; let start = key.start.to_i128(); let end = key.end.to_i128(); // Initialize loop variables let mut max_stacked_deltas = 0; let mut current_key = start; let mut current_val = version.delta_coverage.query(start); // Loop through the delta coverage and recurse on each part for (change_key, change_val) in version.delta_coverage.range(start..end) { // If there's a relevant delta in this part, add 1 and recurse down if let Some(val) = ¤t_val { if val.get_lsn_range().end > lsn.start { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { let base_count = Self::is_reimage_worthy(val, key) as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( max_stacked_deltas, base_count + max_stacked_deltas_underneath, ); } } } current_key = change_key; current_val.clone_from(&change_val); } // Consider the last part if let Some(val) = ¤t_val { if val.get_lsn_range().end > lsn.start { let kr = Key::from_i128(current_key)..Key::from_i128(end); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { let base_count = Self::is_reimage_worthy(val, key) as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( max_stacked_deltas, base_count + max_stacked_deltas_underneath, ); } } } max_stacked_deltas } /* BEGIN_HADRON */ /** * Compute the image consistent LSN, the largest LSN below which all pages have been redone successfully. * It works by first finding the latest image layers and store them into a map. 
Then for each delta layer, * find all overlapping image layers in order to potentially increase the image LSN in case there are gaps * (e.g., if an image is created at LSN 100 but the delta layer spans LSN [150, 200], then we can increase * image LSN to 150 because there is no WAL record in between). * Finally, the image consistent LSN is computed by taking the minimum of all image layers. */ pub fn compute_image_consistent_lsn(&self, disk_consistent_lsn: Lsn) -> Lsn { struct ImageLayerInfo { // creation LSN of the image layer image_lsn: Lsn, // the current minimum LSN of newer delta layers with overlapping key ranges min_delta_lsn: Lsn, } let started_at = Instant::now(); let min_l0_deltas_lsn = { let l0_deltas = self.level0_deltas(); l0_deltas .iter() .map(|layer| layer.get_lsn_range().start) .min() .unwrap_or(disk_consistent_lsn) }; let global_key_range = Key::MIN..Key::MAX; // step 1: collect all most recent image layers into a map // map: end key to image_layer_info let mut image_map: BTreeMap = BTreeMap::new(); for (img_range, img) in self.image_coverage(&global_key_range, disk_consistent_lsn) { let img_lsn = img.map(|layer| layer.get_lsn_range().end).unwrap_or(Lsn(0)); image_map.insert( img_range.end, ImageLayerInfo { image_lsn: img_lsn, min_delta_lsn: min_l0_deltas_lsn, }, ); } // step 2: go through all delta layers, and update the image layer info with overlapping // key ranges for layer in self.historic.iter() { if !layer.is_delta { continue; } let delta_key_range = layer.get_key_range(); let delta_lsn_range = layer.get_lsn_range(); for (img_end_key, img_info) in image_map.range_mut(delta_key_range.start..Key::MAX) { debug_assert!(img_end_key >= &delta_key_range.start); if delta_lsn_range.end > img_info.image_lsn { // the delta layer includes WAL records after the image // it's possibel that the delta layer's start LSN < image LSN, which will be simply ignored by step 3 img_info.min_delta_lsn = std::cmp::min(img_info.min_delta_lsn, delta_lsn_range.start); 
} if img_end_key >= &delta_key_range.end { // we have fully processed all overlapping image layers break; } } } // step 3, go through all image layers and find the image consistent LSN let mut img_consistent_lsn = min_l0_deltas_lsn.checked_sub(Lsn(1)).unwrap(); let mut prev_key = Key::MIN; for (img_key, img_info) in image_map { tracing::debug!( "Image layer {:?}:{} has min delta lsn {}", Range { start: prev_key, end: img_key, }, img_info.image_lsn, img_info.min_delta_lsn, ); let image_lsn = std::cmp::max( img_info.image_lsn, img_info.min_delta_lsn.checked_sub(Lsn(1)).unwrap_or(Lsn(0)), ); img_consistent_lsn = std::cmp::min(img_consistent_lsn, image_lsn); prev_key = img_key; } tracing::info!( "computed image_consistent_lsn {} for disk_consistent_lsn {} in {}ms. Processed {} layrs in total.", img_consistent_lsn, disk_consistent_lsn, started_at.elapsed().as_millis(), self.historic.len() ); img_consistent_lsn } /* END_HADRON */ /// Return all L0 delta layers pub fn level0_deltas(&self) -> &Vec> { &self.l0_delta_layers } /// Subscribes to L0 delta layer changes, sending the current number of L0 delta layers. pub fn watch_level0_deltas(&self) -> watch::Receiver { self.watch_l0_deltas.subscribe() } /// debugging function to print out the contents of the layer map #[allow(unused)] pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { println!("Begin dump LayerMap"); println!("open_layer:"); if let Some(open_layer) = &self.open_layer { open_layer.dump(verbose, ctx).await?; } println!("frozen_layers:"); for frozen_layer in self.frozen_layers.iter() { frozen_layer.dump(verbose, ctx).await?; } println!("historic_layers:"); for desc in self.iter_historic_layers() { desc.dump(); } println!("End dump LayerMap"); Ok(()) } /// `read_points` represent the tip of a timeline and any branch points, i.e. the places /// where we expect to serve reads. /// /// This function is O(N) and should be called infrequently. 
/// The caller is responsible for
    /// looking up and updating the Layer objects for these layer descriptors.
    ///
    /// Returns one `(layer descriptor, visibility hint)` pair per historic layer, plus the key
    /// space that is covered by image layers once the sweep has finished.
    pub fn get_visibility(
        &self,
        mut read_points: Vec<Lsn>,
    ) -> (
        Vec<(Arc<PersistentLayerDesc>, LayerVisibilityHint)>,
        KeySpace,
    ) {
        // This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas
        // KeySpace is intended to be composed statically and iterated over.
        struct KeyShadow {
            // Map of range start to range end
            inner: RangeSetBlaze<i128>,
        }

        impl KeyShadow {
            fn new() -> Self {
                Self {
                    inner: Default::default(),
                }
            }

            /// Return true if every key in `range` is already covered by self.
            fn contains(&self, range: Range<Key>) -> bool {
                // RangeSetBlaze works on inclusive ranges, hence the `- 1` on the exclusive end.
                let range_incl = range.start.to_i128()..=range.end.to_i128() - 1;
                self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint(
                    CheckSortedDisjoint::from([range_incl]),
                ))
            }

            /// Add the input range to the keys covered by self.
            ///
            /// Return true if inserting this range covered some keys that were previously not covered
            fn cover(&mut self, insert: Range<Key>) -> bool {
                let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1;
                self.inner.ranges_insert(range_incl)
            }

            /// Forget all covered keys.
            fn reset(&mut self) {
                self.inner = Default::default();
            }

            /// Convert the covered ranges back into a `KeySpace` of half-open key ranges.
            fn to_keyspace(&self) -> KeySpace {
                let mut accum = KeySpaceAccum::new();
                for range_incl in self.inner.ranges() {
                    let range = Range {
                        start: Key::from_i128(*range_incl.start()),
                        end: Key::from_i128(range_incl.end() + 1),
                    };
                    accum.add_range(range)
                }

                accum.to_keyspace()
            }
        }

        // The 'shadow' is updated as we sweep through the layers from high LSN to low: an image
        // layer adds its key range to the shadow (layers below it in that range are covered),
        // and a ReadPoint resets the shadow to empty.
        read_points.sort_by_key(|rp| rp.0);
        let mut shadow = KeyShadow::new();

        // We will interleave all our read points and layers into a sorted collection
        enum Item {
            ReadPoint { lsn: Lsn },
            Layer(Arc<PersistentLayerDesc>),
        }

        let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
        items.extend(self.iter_historic_layers().map(Item::Layer));
        items.extend(
            read_points
                .into_iter()
                .map(|rp| Item::ReadPoint { lsn: rp }),
        );

        // Ordering: we want to iterate like this:
        // 1. Highest LSNs first
        // 2. Consider images before deltas if they end at the same LSNs (images cover deltas)
        // 3. Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible)
        items.sort_by_key(|item| {
            std::cmp::Reverse(match item {
                Item::Layer(layer) => {
                    if layer.is_delta() {
                        // Delta LSN ranges are half-open: sort by the last LSN actually contained.
                        (Lsn(layer.get_lsn_range().end.0 - 1), 0)
                    } else {
                        (layer.image_layer_lsn(), 1)
                    }
                }
                Item::ReadPoint { lsn } => (*lsn, 2),
            })
        });

        let mut results = Vec::with_capacity(self.historic.len());

        // Deltas whose key range was fully shadowed when first seen; their final verdict is
        // deferred until the sweep has passed their start LSN.
        let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();

        for item in items {
            let (reached_lsn, is_readpoint) = match &item {
                Item::ReadPoint { lsn } => (lsn, true),
                Item::Layer(layer) => (&layer.lsn_range.start, false),
            };
            maybe_covered_deltas.retain(|d| {
                if *reached_lsn >= d.lsn_range.start && is_readpoint {
                    // We encountered a readpoint within the delta layer: it is visible
                    results.push((d.clone(), LayerVisibilityHint::Visible));
                    false
                } else if *reached_lsn < d.lsn_range.start {
                    // We passed the layer's range without encountering a read point: it is not visible
                    results.push((d.clone(), LayerVisibilityHint::Covered));
                    false
                } else {
                    // We're still in the delta layer: continue iterating
                    true
                }
            });

            match item {
                Item::ReadPoint { lsn: _lsn } => {
                    // TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have
                    // to assume that the whole key range is visible at the branch point.
                    shadow.reset();
                }
                Item::Layer(layer) => {
                    let visibility = if layer.is_delta() {
                        if shadow.contains(layer.get_key_range()) {
                            // If a layer isn't visible based on current state, we must defer deciding whether
                            // it is truly not visible until we have advanced past the delta's range: we might
                            // encounter another branch point within this delta layer's LSN range.
                            maybe_covered_deltas.push(layer);
                            continue;
                        } else {
                            LayerVisibilityHint::Visible
                        }
                    } else {
                        let modified = shadow.cover(layer.get_key_range());
                        if modified {
                            // An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered
                            LayerVisibilityHint::Visible
                        } else {
                            // An image layer in a region that was already covered
                            LayerVisibilityHint::Covered
                        }
                    };

                    results.push((layer, visibility));
                }
            }
        }

        // Drain any remaining maybe_covered deltas
        results.extend(
            maybe_covered_deltas
                .into_iter()
                .map(|d| (d, LayerVisibilityHint::Covered)),
        );

        (results, shadow.to_keyspace())
    }
}

#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::path::PathBuf;

    use crate::{
        DEFAULT_PG_VERSION,
        tenant::{harness::TenantHarness, storage_layer::LayerName},
    };
    use pageserver_api::key::DBDIR_KEY;
    use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
    use tokio_util::sync::CancellationToken;
    use utils::id::{TenantId, TimelineId};
    use utils::shard::TenantShardId;

    use super::*;
    use crate::tenant::IndexPart;

    // Minimal description of a persistent layer used to build test layer maps.
    #[derive(Clone)]
    struct LayerDesc {
        key_range: Range<Key>,
        lsn_range: Range<Lsn>,
        is_delta: bool,
    }

    // Build a LayerMap containing one historic layer per entry of `layers`.
    fn create_layer_map(layers: Vec<LayerDesc>) -> LayerMap {
        let mut layer_map = LayerMap::default();

        for layer in layers {
            layer_map.insert_historic_noflush(PersistentLayerDesc::new_test(
                layer.key_range,
                layer.lsn_range,
                layer.is_delta,
            ));
        }

        layer_map.flush_updates();
        layer_map
    }

    // Compare two range search results after normalising their accumulators into key spaces.
    fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) {
        let lhs: HashMap<SearchResult, KeySpace> = lhs
            .found
            .into_iter()
            .map(|(search_result, accum)| (search_result, accum.to_keyspace()))
            .collect();
        let rhs: HashMap<SearchResult, KeySpace> = rhs
            .found
            .into_iter()
            .map(|(search_result, accum)| (search_result, accum.to_keyspace()))
            .collect();

        assert_eq!(lhs, rhs);
    }

    #[cfg(test)]
    fn brute_force_range_search(
        layer_map: &LayerMap,
        key_range: Range<Key>,
        end_lsn: Lsn,
    ) -> RangeSearchResult {
        let mut range_search_result = RangeSearchResult::new();

        let mut key = key_range.start;
        while key !=
key_range.end { let res = layer_map.search(key, end_lsn); if let Some(res) = res { range_search_result .found .entry(res) .or_default() .add_key(key); } key = key.next(); } range_search_result } #[test] fn ranged_search_on_empty_layer_map() { let layer_map = LayerMap::default(); let range = Key::from_i128(100)..Key::from_i128(200); let res = layer_map.range_search(range.clone(), Lsn(100)); assert_range_search_result_eq(res, RangeSearchResult::new()); } #[tokio::test] async fn ranged_search() { let harness = TenantHarness::create("ranged_search").await.unwrap(); let (tenant, ctx) = harness.load().await; let cancel = CancellationToken::new(); let timeline_id = TimelineId::generate(); // Create the timeline such that the in-memory layers can be written // to the timeline directory. tenant .create_test_timeline(timeline_id, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await .unwrap(); let gate = utils::sync::gate::Gate::default(); let add_in_memory_layer = async |layer_map: &mut LayerMap, lsn_range: Range| { let layer = InMemoryLayer::create( harness.conf, timeline_id, harness.tenant_shard_id, lsn_range.start, &gate, &cancel, &ctx, ) .await .unwrap(); layer.freeze(lsn_range.end).await; layer_map.frozen_layers.push_back(Arc::new(layer)); }; let in_memory_layer_configurations = [ vec![], // Overlaps with the top-most image vec![Lsn(35)..Lsn(50)], ]; let layers = vec![ LayerDesc { key_range: Key::from_i128(15)..Key::from_i128(50), lsn_range: Lsn(5)..Lsn(6), is_delta: false, }, LayerDesc { key_range: Key::from_i128(10)..Key::from_i128(20), lsn_range: Lsn(5)..Lsn(20), is_delta: true, }, LayerDesc { key_range: Key::from_i128(15)..Key::from_i128(25), lsn_range: Lsn(20)..Lsn(30), is_delta: true, }, LayerDesc { key_range: Key::from_i128(35)..Key::from_i128(40), lsn_range: Lsn(25)..Lsn(35), is_delta: true, }, LayerDesc { key_range: Key::from_i128(35)..Key::from_i128(40), lsn_range: Lsn(40)..Lsn(41), is_delta: false, }, ]; let mut layer_map = create_layer_map(layers.clone()); for 
in_memory_layers in in_memory_layer_configurations { for in_mem_layer_range in in_memory_layers { add_in_memory_layer(&mut layer_map, in_mem_layer_range).await; } for start in 0..60 { for end in (start + 1)..60 { let range = Key::from_i128(start)..Key::from_i128(end); let result = layer_map.range_search(range.clone(), Lsn(100)); let expected = brute_force_range_search(&layer_map, range, Lsn(100)); eprintln!("{start}..{end}: {result:?}"); assert_range_search_result_eq(result, expected); } } } } #[test] fn layer_visibility_basic() { // A simple synthetic input, as a smoke test. let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); let timeline_id = TimelineId::generate(); let mut layer_map = LayerMap::default(); let mut updates = layer_map.batch_update(); const FAKE_LAYER_SIZE: u64 = 1024; let inject_delta = |updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn_start: u64, lsn_end: u64| { let desc = PersistentLayerDesc::new_delta( tenant_shard_id, timeline_id, Range { start: Key::from_i128(key_start), end: Key::from_i128(key_end), }, Range { start: Lsn(lsn_start), end: Lsn(lsn_end), }, 1024, ); updates.insert_historic(desc.clone()); desc }; let inject_image = |updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn: u64| { let desc = PersistentLayerDesc::new_img( tenant_shard_id, timeline_id, Range { start: Key::from_i128(key_start), end: Key::from_i128(key_end), }, Lsn(lsn), FAKE_LAYER_SIZE, ); updates.insert_historic(desc.clone()); desc }; // // Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios // we expect to handle. You can follow these examples through in the same order as they would be processed // by the function under test. 
// let mut read_points = vec![Lsn(1000)]; // A delta ahead of any image layer let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110); // An image layer is visible and covers some layers beneath itself let visible_covering_img = inject_image(&mut updates, 5, 25, 99); // A delta layer covered by the image layer: should be covered let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100); // A delta layer partially covered by an image layer: should be visible let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100); // A delta layer not covered by an image layer: should be visible let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100); // An image layer covered by the image layer above: should be covered let covered_image = inject_image(&mut updates, 10, 20, 89); // An image layer partially covered by an image layer: should be visible let partially_covered_image = inject_image(&mut updates, 1, 7, 89); // An image layer not covered by an image layer: should be visible let not_covered_image = inject_image(&mut updates, 1, 4, 89); // A read point: this will make subsequent layers below here visible, even if there are // more recent layers covering them. 
read_points.push(Lsn(80));

        // A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer
        let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79);

        // A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range:
        // the read point should make it visible, even though its end LSN is covered
        let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69);
        let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69);
        read_points.push(Lsn(65));
        let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69);

        let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65);

        updates.flush();

        let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
        // Index the results by layer descriptor so each scenario can be looked up directly.
        let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();

        assert_eq!(
            layer_visibilities.get(&ahead_layer),
            Some(&LayerVisibilityHint::Visible)
        );
        assert_eq!(
            layer_visibilities.get(&visible_covering_img),
            Some(&LayerVisibilityHint::Visible)
        );
        assert_eq!(
            layer_visibilities.get(&covered_delta),
            Some(&LayerVisibilityHint::Covered)
        );
        assert_eq!(
            layer_visibilities.get(&partially_covered_delta),
            Some(&LayerVisibilityHint::Visible)
        );
        assert_eq!(
            layer_visibilities.get(&not_covered_delta),
            Some(&LayerVisibilityHint::Visible)
        );
        assert_eq!(
            layer_visibilities.get(&covered_image),
            Some(&LayerVisibilityHint::Covered)
        );
        assert_eq!(
            layer_visibilities.get(&partially_covered_image),
            Some(&LayerVisibilityHint::Visible)
        );
        assert_eq!(
            layer_visibilities.get(&not_covered_image),
            Some(&LayerVisibilityHint::Visible)
        );
        assert_eq!(
            layer_visibilities.get(&covered_delta_below_read_point),
            Some(&LayerVisibilityHint::Visible)
        );
        assert_eq!(
            layer_visibilities.get(&covering_img_between_read_points),
            Some(&LayerVisibilityHint::Visible)
        );
        assert_eq!(
            layer_visibilities.get(&covered_delta_between_read_points),
            Some(&LayerVisibilityHint::Covered)
        );
assert_eq!( layer_visibilities.get(&covered_delta_intersects_read_point), Some(&LayerVisibilityHint::Visible) ); assert_eq!( layer_visibilities.get(&visible_img_after_last_read_point), Some(&LayerVisibilityHint::Visible) ); // Shadow should include all the images below the last read point let expected_shadow = KeySpace { ranges: vec![Key::from_i128(10)..Key::from_i128(20)], }; assert_eq!(shadow, expected_shadow); } fn fixture_path(relative: &str) -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative) } #[test] fn layer_visibility_realistic() { // Load a large example layermap let index_raw = std::fs::read_to_string(fixture_path( "test_data/indices/mixed_workload/index_part.json", )) .unwrap(); let index: IndexPart = serde_json::from_str::(&index_raw).unwrap(); let tenant_id = TenantId::generate(); let tenant_shard_id = TenantShardId::unsharded(tenant_id); let timeline_id = TimelineId::generate(); let mut layer_map = LayerMap::default(); let mut updates = layer_map.batch_update(); for (layer_name, layer_metadata) in index.layer_metadata { let layer_desc = match layer_name { LayerName::Image(layer_name) => PersistentLayerDesc { key_range: layer_name.key_range.clone(), lsn_range: layer_name.lsn_as_range(), tenant_shard_id, timeline_id, is_delta: false, file_size: layer_metadata.file_size, }, LayerName::Delta(layer_name) => PersistentLayerDesc { key_range: layer_name.key_range, lsn_range: layer_name.lsn_range, tenant_shard_id, timeline_id, is_delta: true, file_size: layer_metadata.file_size, }, }; updates.insert_historic(layer_desc); } updates.flush(); let read_points = vec![index.metadata.disk_consistent_lsn()]; let (layer_visibilities, shadow) = layer_map.get_visibility(read_points); for (layer_desc, visibility) in &layer_visibilities { tracing::info!("{layer_desc:?}: {visibility:?}"); eprintln!("{layer_desc:?}: {visibility:?}"); } // The shadow should be non-empty, since there were some image layers assert!(!shadow.ranges.is_empty()); // At least some 
layers should be marked covered assert!( layer_visibilities .iter() .any(|i| matches!(i.1, LayerVisibilityHint::Covered)) ); let layer_visibilities = layer_visibilities.into_iter().collect::>(); // Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it for (layer_desc, visible) in &layer_visibilities { let mut coverage = KeySpaceRandomAccum::new(); let mut covered_by = Vec::new(); for other_layer in layer_map.iter_historic_layers() { if &other_layer == layer_desc { continue; } if !other_layer.is_delta() && other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1) && other_layer.key_range.start <= layer_desc.key_range.end && layer_desc.key_range.start <= other_layer.key_range.end { coverage.add_range(other_layer.get_key_range()); covered_by.push((*other_layer).clone()); } } let coverage = coverage.to_keyspace(); let expect_visible = if coverage.ranges.len() == 1 && coverage.contains(&layer_desc.key_range.start) && coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1)) { LayerVisibilityHint::Covered } else { LayerVisibilityHint::Visible }; if expect_visible != *visible { eprintln!( "Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}", layer_desc.key_range.start, layer_desc.key_range.end, layer_desc.lsn_range.start, layer_desc.lsn_range.end, layer_desc.is_delta() ); if expect_visible == LayerVisibilityHint::Covered { eprintln!("Covered by:"); for other in covered_by { eprintln!( " {}..{} @ {}", other.get_key_range().start, other.get_key_range().end, other.image_layer_lsn() ); } if let Some(range) = coverage.ranges.first() { eprintln!( "Total coverage from contributing layers: {}..{}", range.start, range.end ); } else { eprintln!( "Total coverage from contributing layers: {:?}", coverage.ranges ); } } } assert_eq!(expect_visible, *visible); } // Sanity: the layer that holds latest data for the DBDIR key should always be visible // 
(just using this key as a key that will always exist for any layermap fixture) let dbdir_layer = { let readable_layer = layer_map .search(DBDIR_KEY, index.metadata.disk_consistent_lsn()) .unwrap(); match readable_layer.layer { ReadableLayerWeak::PersistentLayer(desc) => desc, ReadableLayerWeak::InMemoryLayer(_) => unreachable!(""), } }; assert!(matches!( layer_visibilities.get(&dbdir_layer).unwrap(), LayerVisibilityHint::Visible )); } /* BEGIN_HADRON */ #[test] fn test_compute_image_consistent_lsn() { let mut layer_map = LayerMap::default(); let disk_consistent_lsn = Lsn(1000); // case 1: empty layer map let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); assert_eq!( disk_consistent_lsn.checked_sub(Lsn(1)).unwrap(), image_consistent_lsn ); // case 2: only L0 delta layer { let mut updates = layer_map.batch_update(); updates.insert_historic(PersistentLayerDesc::new_test( Key::from_i128(0)..Key::from_i128(100), Lsn(900)..Lsn(990), true, )); updates.insert_historic(PersistentLayerDesc::new_test( Key::from_i128(0)..Key::from_i128(100), Lsn(850)..Lsn(899), true, )); } // should use min L0 delta LSN - 1 as image consistent LSN let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); assert_eq!(Lsn(849), image_consistent_lsn); // case 3: 3 images, no L1 delta { let mut updates = layer_map.batch_update(); updates.insert_historic(PersistentLayerDesc::new_test( Key::from_i128(0)..Key::from_i128(40), Lsn(100)..Lsn(100), false, )); updates.insert_historic(PersistentLayerDesc::new_test( Key::from_i128(40)..Key::from_i128(70), Lsn(200)..Lsn(200), false, )); updates.insert_historic(PersistentLayerDesc::new_test( Key::from_i128(70)..Key::from_i128(100), Lsn(150)..Lsn(150), false, )); } // should use min L0 delta LSN - 1 as image consistent LSN let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); assert_eq!(Lsn(849), image_consistent_lsn); // case 4: 3 images with 1 L1 delta { let 
mut updates = layer_map.batch_update(); updates.insert_historic(PersistentLayerDesc::new_test( Key::from_i128(0)..Key::from_i128(50), Lsn(300)..Lsn(350), true, )); } let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); assert_eq!(Lsn(299), image_consistent_lsn); // case 5: 3 images with 1 more L1 delta with smaller LSN { let mut updates = layer_map.batch_update(); updates.insert_historic(PersistentLayerDesc::new_test( Key::from_i128(50)..Key::from_i128(72), Lsn(200)..Lsn(300), true, )); } let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); assert_eq!(Lsn(199), image_consistent_lsn); // case 6: 3 images with more newer L1 deltas (no impact on final results) { let mut updates = layer_map.batch_update(); updates.insert_historic(PersistentLayerDesc::new_test( Key::from_i128(0)..Key::from_i128(30), Lsn(400)..Lsn(500), true, )); updates.insert_historic(PersistentLayerDesc::new_test( Key::from_i128(35)..Key::from_i128(100), Lsn(450)..Lsn(600), true, )); } let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); assert_eq!(Lsn(199), image_consistent_lsn); // case 7: 3 images with more older L1 deltas (no impact on final results) { let mut updates = layer_map.batch_update(); updates.insert_historic(PersistentLayerDesc::new_test( Key::from_i128(0)..Key::from_i128(40), Lsn(0)..Lsn(50), true, )); updates.insert_historic(PersistentLayerDesc::new_test( Key::from_i128(50)..Key::from_i128(100), Lsn(10)..Lsn(60), true, )); } let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); assert_eq!(Lsn(199), image_consistent_lsn); // case 8: 3 images with one more L1 delta with overlapping LSN range { let mut updates = layer_map.batch_update(); updates.insert_historic(PersistentLayerDesc::new_test( Key::from_i128(0)..Key::from_i128(50), Lsn(50)..Lsn(250), true, )); } let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); 
assert_eq!(Lsn(100), image_consistent_lsn); } /* END_HADRON */ } #[cfg(test)] mod select_layer_tests { use super::*; fn create_persistent_layer( start_lsn: u64, end_lsn: u64, is_delta: bool, ) -> Arc { if !is_delta { assert_eq!(end_lsn, start_lsn + 1); } Arc::new(PersistentLayerDesc::new_test( Key::MIN..Key::MAX, Lsn(start_lsn)..Lsn(end_lsn), is_delta, )) } fn create_inmem_layer(start_lsn: u64, end_lsn: u64) -> InMemoryLayerDesc { InMemoryLayerDesc { handle: InMemoryLayerHandle::Open, lsn_range: Lsn(start_lsn)..Lsn(end_lsn), } } #[test] fn test_select_layer_empty() { assert!(LayerMap::select_layer(None, None, None, Lsn(100)).is_none()); } #[test] fn test_select_layer_only_delta() { let delta = create_persistent_layer(10, 20, true); let result = LayerMap::select_layer(Some(delta.clone()), None, None, Lsn(100)).unwrap(); assert_eq!(result.lsn_floor, Lsn(10)); assert!( matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) ); } #[test] fn test_select_layer_only_image() { let image = create_persistent_layer(10, 11, false); let result = LayerMap::select_layer(None, Some(image.clone()), None, Lsn(100)).unwrap(); assert_eq!(result.lsn_floor, Lsn(10)); assert!( matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) ); } #[test] fn test_select_layer_only_inmem() { let inmem = create_inmem_layer(10, 20); let result = LayerMap::select_layer(None, None, Some(inmem.clone()), Lsn(100)).unwrap(); assert_eq!(result.lsn_floor, Lsn(10)); assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); } #[test] fn test_select_layer_image_inside_delta() { let delta = create_persistent_layer(10, 20, true); let image = create_persistent_layer(15, 16, false); let result = LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(100)) .unwrap(); assert_eq!(result.lsn_floor, Lsn(16)); assert!( matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) ); 
let result = LayerMap::select_layer( Some(delta.clone()), Some(image.clone()), None, result.lsn_floor, ) .unwrap(); assert_eq!(result.lsn_floor, Lsn(15)); assert!( matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) ); } #[test] fn test_select_layer_newer_image() { let delta = create_persistent_layer(10, 20, true); let image = create_persistent_layer(25, 26, false); let result = LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(30)) .unwrap(); assert_eq!(result.lsn_floor, Lsn(25)); assert!( matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) ); let result = LayerMap::select_layer(Some(delta.clone()), None, None, result.lsn_floor).unwrap(); assert_eq!(result.lsn_floor, Lsn(10)); assert!( matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) ); } #[test] fn test_select_layer_delta_with_older_image() { let delta = create_persistent_layer(15, 25, true); let image = create_persistent_layer(10, 11, false); let result = LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(30)) .unwrap(); assert_eq!(result.lsn_floor, Lsn(15)); assert!( matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) ); let result = LayerMap::select_layer(None, Some(image.clone()), None, result.lsn_floor).unwrap(); assert_eq!(result.lsn_floor, Lsn(10)); assert!( matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) ); } #[test] fn test_select_layer_image_inside_inmem() { let image = create_persistent_layer(15, 16, false); let inmem = create_inmem_layer(10, 25); let result = LayerMap::select_layer(None, Some(image.clone()), Some(inmem.clone()), Lsn(30)) .unwrap(); assert_eq!(result.lsn_floor, Lsn(16)); assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); let result = LayerMap::select_layer( None, Some(image.clone()), Some(inmem.clone()), 
result.lsn_floor, ) .unwrap(); assert_eq!(result.lsn_floor, Lsn(15)); assert!( matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) ); let result = LayerMap::select_layer(None, None, Some(inmem.clone()), result.lsn_floor).unwrap(); assert_eq!(result.lsn_floor, Lsn(10)); assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); } #[test] fn test_select_layer_delta_inside_inmem() { let delta_top = create_persistent_layer(15, 20, true); let delta_bottom = create_persistent_layer(10, 15, true); let inmem = create_inmem_layer(15, 25); let result = LayerMap::select_layer(Some(delta_top.clone()), None, Some(inmem.clone()), Lsn(30)) .unwrap(); assert_eq!(result.lsn_floor, Lsn(20)); assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); let result = LayerMap::select_layer( Some(delta_top.clone()), None, Some(inmem.clone()), result.lsn_floor, ) .unwrap(); assert_eq!(result.lsn_floor, Lsn(15)); assert!( matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta_top)) ); let result = LayerMap::select_layer( Some(delta_bottom.clone()), None, Some(inmem.clone()), result.lsn_floor, ) .unwrap(); assert_eq!(result.lsn_floor, Lsn(10)); assert!( matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta_bottom)) ); } #[test] fn test_select_layer_all_overlap_1() { let inmem = create_inmem_layer(10, 30); let delta = create_persistent_layer(15, 25, true); let image = create_persistent_layer(20, 21, false); let result = LayerMap::select_layer( Some(delta.clone()), Some(image.clone()), Some(inmem.clone()), Lsn(50), ) .unwrap(); assert_eq!(result.lsn_floor, Lsn(25)); assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); let result = LayerMap::select_layer( Some(delta.clone()), Some(image.clone()), Some(inmem.clone()), result.lsn_floor, ) .unwrap(); assert_eq!(result.lsn_floor, Lsn(21)); assert!( 
matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) ); let result = LayerMap::select_layer( Some(delta.clone()), Some(image.clone()), Some(inmem.clone()), result.lsn_floor, ) .unwrap(); assert_eq!(result.lsn_floor, Lsn(20)); assert!( matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) ); } #[test] fn test_select_layer_all_overlap_2() { let inmem = create_inmem_layer(20, 30); let delta = create_persistent_layer(10, 40, true); let image = create_persistent_layer(25, 26, false); let result = LayerMap::select_layer( Some(delta.clone()), Some(image.clone()), Some(inmem.clone()), Lsn(50), ) .unwrap(); assert_eq!(result.lsn_floor, Lsn(26)); assert!( matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) ); let result = LayerMap::select_layer( Some(delta.clone()), Some(image.clone()), Some(inmem.clone()), result.lsn_floor, ) .unwrap(); assert_eq!(result.lsn_floor, Lsn(25)); assert!( matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) ); } #[test] fn test_select_layer_all_overlap_3() { let inmem = create_inmem_layer(30, 40); let delta = create_persistent_layer(10, 30, true); let image = create_persistent_layer(20, 21, false); let result = LayerMap::select_layer( Some(delta.clone()), Some(image.clone()), Some(inmem.clone()), Lsn(50), ) .unwrap(); assert_eq!(result.lsn_floor, Lsn(30)); assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); let result = LayerMap::select_layer( Some(delta.clone()), Some(image.clone()), None, result.lsn_floor, ) .unwrap(); assert_eq!(result.lsn_floor, Lsn(21)); assert!( matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) ); let result = LayerMap::select_layer( Some(delta.clone()), Some(image.clone()), None, result.lsn_floor, ) .unwrap(); assert_eq!(result.lsn_floor, Lsn(20)); assert!( matches!(result.layer, 
ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) ); } } ================================================ FILE: pageserver/src/tenant/metadata.rs ================================================ //! Describes the legacy now hopefully no longer modified per-timeline metadata. //! //! It is stored in `index_part.json` managed by [`remote_timeline_client`]. For many tenants and //! their timelines, this struct and its original serialization format is still needed because //! they were written a long time ago. //! //! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json //! versioning. //! //! To clean up this module we need to migrate all index_part.json files to a later version. //! While doing this, we need to be mindful about s3 based recovery as well, so it might take //! however long we keep the old versions to be able to delete the old code. After that, we can //! remove everything else than [`TimelineMetadataBodyV2`], rename it as `TimelineMetadata` and //! move it to `index.rs`. Before doing all of this, we need to keep the structures for backwards //! compatibility. //! //! [`remote_timeline_client`]: super::remote_timeline_client //! [`IndexPart`]: super::remote_timeline_client::index::IndexPart use anyhow::ensure; use postgres_ffi::PgMajorVersion; use serde::{Deserialize, Serialize}; use utils::bin_ser::{BeSer, SerializeError}; use utils::id::TimelineId; use utils::lsn::Lsn; /// Use special format number to enable backward compatibility. const METADATA_FORMAT_VERSION: u16 = 4; /// Previous supported format versions. /// /// In practice, none of these should remain, all are [`METADATA_FORMAT_VERSION`], but confirming /// that requires a scrubber run which is yet to be done. const METADATA_OLD_FORMAT_VERSION: u16 = 3; /// When the file existed on disk we assumed that a write of up to METADATA_MAX_SIZE bytes is atomic. 
/// /// This is the same assumption that PostgreSQL makes with the control file, /// /// see PG_CONTROL_MAX_SAFE_SIZE const METADATA_MAX_SIZE: usize = 512; /// Legacy metadata stored as a component of `index_part.json` per timeline. /// /// Do not make new changes to this type or the module. In production, we have two different kinds /// of serializations of this type: bincode and json. Bincode version reflects what used to be /// stored on disk in earlier versions and does internal crc32 checksumming. /// /// This type should not implement `serde::Serialize` or `serde::Deserialize` because there would /// be a confusion whether you want the old version ([`TimelineMetadata::from_bytes`]) or the modern /// as-exists in `index_part.json` ([`self::modern_serde`]). /// /// ```compile_fail /// #[derive(serde::Serialize)] /// struct DoNotDoThis(pageserver::tenant::metadata::TimelineMetadata); /// ``` /// /// ```compile_fail /// #[derive(serde::Deserialize)] /// struct NeitherDoThis(pageserver::tenant::metadata::TimelineMetadata); /// ``` #[derive(Debug, Clone, PartialEq, Eq)] pub struct TimelineMetadata { hdr: TimelineMetadataHeader, body: TimelineMetadataBodyV2, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] struct TimelineMetadataHeader { checksum: u32, // CRC of serialized metadata body size: u16, // size of serialized metadata format_version: u16, // metadata format version (used for compatibility checks) } impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader { type Error = Crc32CalculationFailed; fn try_from(value: &TimelineMetadataBodyV2) -> Result { #[derive(Default)] struct Crc32Sink { crc: u32, count: usize, } impl std::io::Write for Crc32Sink { fn write(&mut self, buf: &[u8]) -> std::io::Result { self.crc = crc32c::crc32c_append(self.crc, buf); self.count += buf.len(); Ok(buf.len()) } fn flush(&mut self) -> std::io::Result<()> { Ok(()) } } // jump through hoops to calculate the crc32 so that TimelineMetadata::ne works // across 
serialization versions let mut sink = Crc32Sink::default(); ::ser_into(value, &mut sink) .map_err(Crc32CalculationFailed)?; let size = METADATA_HDR_SIZE + sink.count; Ok(TimelineMetadataHeader { checksum: sink.crc, size: size as u16, format_version: METADATA_FORMAT_VERSION, }) } } #[derive(thiserror::Error, Debug)] #[error("re-serializing for crc32 failed")] struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError); const METADATA_HDR_SIZE: usize = size_of::(); #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] struct TimelineMetadataBodyV2 { disk_consistent_lsn: Lsn, // This is only set if we know it. We track it in memory when the page // server is running, but we only track the value corresponding to // 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a // lot. We only store it in the metadata file when we flush *all* the // in-memory data so that 'last_record_lsn' is the same as // 'disk_consistent_lsn'. That's OK, because after page server restart, as // soon as we reprocess at least one record, we will have a valid // 'prev_record_lsn' value in memory again. This is only really needed when // doing a clean shutdown, so that there is no more WAL beyond // 'disk_consistent_lsn' prev_record_lsn: Option, ancestor_timeline: Option, ancestor_lsn: Lsn, // The LSN at which GC was last executed. Synonym of [`Timeline::applied_gc_cutoff_lsn`]. latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, pg_version: PgMajorVersion, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] struct TimelineMetadataBodyV1 { disk_consistent_lsn: Lsn, // This is only set if we know it. We track it in memory when the page // server is running, but we only track the value corresponding to // 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a // lot. We only store it in the metadata file when we flush *all* the // in-memory data so that 'last_record_lsn' is the same as // 'disk_consistent_lsn'. 
That's OK, because after page server restart, as // soon as we reprocess at least one record, we will have a valid // 'prev_record_lsn' value in memory again. This is only really needed when // doing a clean shutdown, so that there is no more WAL beyond // 'disk_consistent_lsn' prev_record_lsn: Option, ancestor_timeline: Option, ancestor_lsn: Lsn, latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, } impl TimelineMetadata { pub fn new( disk_consistent_lsn: Lsn, prev_record_lsn: Option, ancestor_timeline: Option, ancestor_lsn: Lsn, latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, pg_version: PgMajorVersion, ) -> Self { Self { hdr: TimelineMetadataHeader { checksum: 0, size: 0, format_version: METADATA_FORMAT_VERSION, }, body: TimelineMetadataBodyV2 { disk_consistent_lsn, prev_record_lsn, ancestor_timeline, ancestor_lsn, latest_gc_cutoff_lsn, initdb_lsn, pg_version, }, } } #[cfg(test)] pub(crate) fn with_recalculated_checksum(mut self) -> anyhow::Result { self.hdr = TimelineMetadataHeader::try_from(&self.body)?; Ok(self) } fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result { let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?; // backward compatible only up to this version ensure!( hdr.format_version == METADATA_OLD_FORMAT_VERSION, "unsupported metadata format version {}", hdr.format_version ); let metadata_size = hdr.size as usize; let body: TimelineMetadataBodyV1 = TimelineMetadataBodyV1::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; let body = TimelineMetadataBodyV2 { disk_consistent_lsn: body.disk_consistent_lsn, prev_record_lsn: body.prev_record_lsn, ancestor_timeline: body.ancestor_timeline, ancestor_lsn: body.ancestor_lsn, latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn, initdb_lsn: body.initdb_lsn, pg_version: PgMajorVersion::PG14, // All timelines created before this version had pg_version 14 }; hdr.format_version = METADATA_FORMAT_VERSION; Ok(Self { hdr, body }) } pub fn from_bytes(metadata_bytes: &[u8]) -> 
anyhow::Result { ensure!( metadata_bytes.len() == METADATA_MAX_SIZE, "metadata bytes size is wrong" ); let hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?; let metadata_size = hdr.size as usize; ensure!( metadata_size <= METADATA_MAX_SIZE, "corrupted metadata file" ); let calculated_checksum = crc32c::crc32c(&metadata_bytes[METADATA_HDR_SIZE..metadata_size]); ensure!( hdr.checksum == calculated_checksum, "metadata checksum mismatch" ); if hdr.format_version != METADATA_FORMAT_VERSION { // If metadata has the old format, // upgrade it and return the result TimelineMetadata::upgrade_timeline_metadata(metadata_bytes) } else { let body = TimelineMetadataBodyV2::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; ensure!( body.disk_consistent_lsn.is_aligned(), "disk_consistent_lsn is not aligned" ); Ok(TimelineMetadata { hdr, body }) } } pub fn to_bytes(&self) -> Result, SerializeError> { let body_bytes = self.body.ser()?; let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); let hdr = TimelineMetadataHeader { size: metadata_size as u16, format_version: METADATA_FORMAT_VERSION, checksum: crc32c::crc32c(&body_bytes), }; let hdr_bytes = hdr.ser()?; let mut metadata_bytes = vec![0u8; METADATA_MAX_SIZE]; metadata_bytes[0..METADATA_HDR_SIZE].copy_from_slice(&hdr_bytes); metadata_bytes[METADATA_HDR_SIZE..metadata_size].copy_from_slice(&body_bytes); Ok(metadata_bytes) } /// [`Lsn`] that corresponds to the corresponding timeline directory /// contents, stored locally in the pageserver workdir. pub fn disk_consistent_lsn(&self) -> Lsn { self.body.disk_consistent_lsn } pub fn prev_record_lsn(&self) -> Option { self.body.prev_record_lsn } pub fn ancestor_timeline(&self) -> Option { self.body.ancestor_timeline } pub fn ancestor_lsn(&self) -> Lsn { self.body.ancestor_lsn } /// When reparenting, the `ancestor_lsn` does not change. /// /// Returns true if anything was changed. 
pub fn reparent(&mut self, timeline: &TimelineId) { assert!(self.body.ancestor_timeline.is_some()); // no assertion for redoing this: it's fine, we may have to repeat this multiple times over self.body.ancestor_timeline = Some(*timeline); } /// Returns true if anything was changed pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) { // Detaching from ancestor now doesn't always detach directly to the direct ancestor, but we // ensure the LSN is the same. So we don't check the timeline ID. if self.body.ancestor_lsn != Lsn(0) { assert_eq!(self.body.ancestor_lsn, branchpoint.1); } self.body.ancestor_timeline = None; self.body.ancestor_lsn = Lsn(0); } pub fn latest_gc_cutoff_lsn(&self) -> Lsn { self.body.latest_gc_cutoff_lsn } pub fn initdb_lsn(&self) -> Lsn { self.body.initdb_lsn } pub fn pg_version(&self) -> PgMajorVersion { self.body.pg_version } // Checksums make it awkward to build a valid instance by hand. This helper // provides a TimelineMetadata with a valid checksum in its header. pub fn example() -> Self { let instance = Self::new( "0/16960E8".parse::().unwrap(), None, None, Lsn::from_hex("00000000").unwrap(), Lsn::from_hex("00000000").unwrap(), Lsn::from_hex("00000000").unwrap(), PgMajorVersion::PG14, ); let bytes = instance.to_bytes().unwrap(); Self::from_bytes(&bytes).unwrap() } pub(crate) fn apply(&mut self, update: &MetadataUpdate) { self.body.disk_consistent_lsn = update.disk_consistent_lsn; self.body.prev_record_lsn = update.prev_record_lsn; self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn; } } pub(crate) mod modern_serde { use serde::{Deserialize, Serialize}; use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader}; pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result where D: serde::de::Deserializer<'de>, { // for legacy reasons versions 1-5 had TimelineMetadata serialized as a Vec field with // BeSer. 
struct Visitor; impl<'d> serde::de::Visitor<'d> for Visitor { type Value = TimelineMetadata; fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { f.write_str("BeSer bytes or json structure") } fn visit_seq
(self, seq: A) -> Result where A: serde::de::SeqAccess<'d>, { use serde::de::Error; let de = serde::de::value::SeqAccessDeserializer::new(seq); Vec::::deserialize(de) .map(|v| TimelineMetadata::from_bytes(&v).map_err(A::Error::custom))? } fn visit_map(self, map: A) -> Result where A: serde::de::MapAccess<'d>, { use serde::de::Error; let de = serde::de::value::MapAccessDeserializer::new(map); let body = TimelineMetadataBodyV2::deserialize(de)?; let hdr = TimelineMetadataHeader::try_from(&body).map_err(A::Error::custom)?; Ok(TimelineMetadata { hdr, body }) } } deserializer.deserialize_any(Visitor) } pub(crate) fn serialize( metadata: &TimelineMetadata, serializer: S, ) -> Result where S: serde::Serializer, { // header is not needed, upon reading we've upgraded all v1 to v2 metadata.body.serialize(serializer) } #[test] fn deserializes_bytes_as_well_as_equivalent_body_v2() { #[derive(serde::Deserialize, serde::Serialize)] struct Wrapper( #[serde(deserialize_with = "deserialize", serialize_with = "serialize")] TimelineMetadata, ); let too_many_bytes = 
"[216,111,252,208,0,54,0,4,0,0,0,0,1,73,253,144,1,0,0,0,0,1,73,253,24,0,0,0,0,0,0,0,0,0,0,0,0,0,1,73,253,24,0,0,0,0,1,73,253,24,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]"; let wrapper_from_bytes = serde_json::from_str::(too_many_bytes).unwrap(); let serialized = serde_json::to_value(&wrapper_from_bytes).unwrap(); assert_eq!( serialized, serde_json::json! {{ "disk_consistent_lsn": "0/149FD90", "prev_record_lsn": "0/149FD18", "ancestor_timeline": null, "ancestor_lsn": "0/0", "latest_gc_cutoff_lsn": "0/149FD18", "initdb_lsn": "0/149FD18", "pg_version": 15 }} ); let wrapper_from_json = serde_json::value::from_value::(serialized).unwrap(); assert_eq!(wrapper_from_bytes.0, wrapper_from_json.0); } } /// Parts of the metadata which are regularly modified. 
pub(crate) struct MetadataUpdate {
    disk_consistent_lsn: Lsn,
    prev_record_lsn: Option<Lsn>,
    latest_gc_cutoff_lsn: Lsn,
}

impl MetadataUpdate {
    /// Bundle the three values that [`TimelineMetadata::apply`] copies into the metadata body.
    pub(crate) fn new(
        disk_consistent_lsn: Lsn,
        prev_record_lsn: Option<Lsn>,
        latest_gc_cutoff_lsn: Lsn,
    ) -> Self {
        Self {
            disk_consistent_lsn,
            prev_record_lsn,
            latest_gc_cutoff_lsn,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tenant::harness::TIMELINE_ID;

    #[test]
    fn metadata_serializes_correctly() {
        let original_metadata = TimelineMetadata::new(
            Lsn(0x200),
            Some(Lsn(0x100)),
            Some(TIMELINE_ID),
            Lsn(0),
            Lsn(0),
            Lsn(0),
            // Any version will do here, so use the default
            crate::DEFAULT_PG_VERSION,
        );

        let metadata_bytes = original_metadata
            .to_bytes()
            .expect("Should serialize correct metadata to bytes");

        let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes)
            .expect("Should deserialize its own bytes");

        // Only the bodies are compared: `new` leaves the header checksum and size at zero,
        // while the deserialized header carries the values written by `to_bytes`.
        assert_eq!(
            deserialized_metadata.body, original_metadata.body,
            "Metadata that was serialized to bytes and deserialized back should not change"
        );
    }

    // Generate old version metadata and read it with current code.
// Ensure that it is upgraded correctly #[test] fn test_metadata_upgrade() { #[derive(Debug, Clone, PartialEq, Eq)] struct TimelineMetadataV1 { hdr: TimelineMetadataHeader, body: TimelineMetadataBodyV1, } let metadata_v1 = TimelineMetadataV1 { hdr: TimelineMetadataHeader { checksum: 0, size: 0, format_version: METADATA_OLD_FORMAT_VERSION, }, body: TimelineMetadataBodyV1 { disk_consistent_lsn: Lsn(0x200), prev_record_lsn: Some(Lsn(0x100)), ancestor_timeline: Some(TIMELINE_ID), ancestor_lsn: Lsn(0), latest_gc_cutoff_lsn: Lsn(0), initdb_lsn: Lsn(0), }, }; impl TimelineMetadataV1 { pub fn to_bytes(&self) -> anyhow::Result> { let body_bytes = self.body.ser()?; let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); let hdr = TimelineMetadataHeader { size: metadata_size as u16, format_version: METADATA_OLD_FORMAT_VERSION, checksum: crc32c::crc32c(&body_bytes), }; let hdr_bytes = hdr.ser()?; let mut metadata_bytes = vec![0u8; METADATA_MAX_SIZE]; metadata_bytes[0..METADATA_HDR_SIZE].copy_from_slice(&hdr_bytes); metadata_bytes[METADATA_HDR_SIZE..metadata_size].copy_from_slice(&body_bytes); Ok(metadata_bytes) } } let metadata_bytes = metadata_v1 .to_bytes() .expect("Should serialize correct metadata to bytes"); // This should deserialize to the latest version format let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes) .expect("Should deserialize its own bytes"); let expected_metadata = TimelineMetadata::new( Lsn(0x200), Some(Lsn(0x100)), Some(TIMELINE_ID), Lsn(0), Lsn(0), Lsn(0), PgMajorVersion::PG14, // All timelines created before this version had pg_version 14 ); assert_eq!( deserialized_metadata.body, expected_metadata.body, "Metadata of the old version {METADATA_OLD_FORMAT_VERSION} should be upgraded to the latest version {METADATA_FORMAT_VERSION}" ); } #[test] fn test_metadata_bincode_serde_ensure_roundtrip() { let original_metadata = TimelineMetadata::new( Lsn(0x200), Some(Lsn(0x100)), Some(TIMELINE_ID), Lsn(0), Lsn(0), Lsn(0), // Updating this 
version to 17 will cause the test to fail at the // next assert_eq!(). PgMajorVersion::PG16, ); let expected_bytes = vec![ /* TimelineMetadataHeader */ 74, 104, 158, 105, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2) /* TimelineMetadataBodyV2 */ 0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes) 1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes) 1, 17, 34, 51, 68, 85, 102, 119, 136, 17, 34, 51, 68, 85, 102, 119, 136, // ancestor_timeline (17 bytes) 0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes) 0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes) 0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes) 0, 0, 0, 16, // pg_version (4 bytes) /* padding bytes */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, ]; let metadata_ser_bytes = original_metadata.to_bytes().unwrap(); assert_eq!(metadata_ser_bytes, expected_bytes); let expected_metadata = { let mut temp_metadata = original_metadata; let body_bytes = temp_metadata .body .ser() .expect("Cannot serialize the metadata body"); let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); let hdr = TimelineMetadataHeader { size: metadata_size as u16, format_version: METADATA_FORMAT_VERSION, checksum: crc32c::crc32c(&body_bytes), }; temp_metadata.hdr = hdr; temp_metadata }; let des_metadata = TimelineMetadata::from_bytes(&metadata_ser_bytes).unwrap(); assert_eq!(des_metadata, expected_metadata); } } ================================================ FILE: pageserver/src/tenant/mgr.rs ================================================ //! This module acts as a switchboard to access different repositories managed by this //! page server. use std::borrow::Cow; use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap, HashSet}; use std::ops::Deref; use std::sync::Arc; use std::time::Duration; use anyhow::Context; use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::models::{DetachBehavior, LocationConfigMode}; use pageserver_api::shard::{ ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId, }; use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::Rng; use rand::distr::Alphanumeric; use remote_storage::TimeoutOrCancel; use sysinfo::SystemExt; use tokio::fs; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; use utils::crashsafe::path_with_suffix_extension; use utils::fs_ext::PathExt; use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; use utils::{backoff, completion, crashsafe}; use super::remote_timeline_client::remote_tenant_path; use super::secondary::SecondaryTenant; use super::timeline::detach_ancestor::{self, 
PreparedTimelineDetach}; use super::{GlobalShutDown, TenantSharedResources}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::controller_upcall_client::{ RetryForeverError, StorageControllerUpcallApi, StorageControllerUpcallClient, }; use crate::deletion_queue::DeletionQueueClient; use crate::http::routes::ACTIVE_TENANT_TIMEOUT; use crate::metrics::{LOCAL_DATA_LOSS_SUSPECTED, TENANT, TENANT_MANAGER as METRICS}; use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, }; use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::timeline::ShutdownMode; use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; use crate::tenant::{ AttachedTenantConf, GcError, LoadConfigError, SpawnMode, TenantShard, TenantState, }; use crate::virtual_file::MaybeFatalIo; use crate::{InitializationOrder, TEMP_FILE_SUFFIX}; /// For a tenant that appears in TenantsMap, it may either be /// - `Attached`: has a full Tenant object, is elegible to service /// reads and ingest WAL. /// - `Secondary`: is only keeping a local cache warm. /// /// Secondary is a totally distinct state rather than being a mode of a `Tenant`, because /// that way we avoid having to carefully switch a tenant's ingestion etc on and off during /// its lifetime, and we can preserve some important safety invariants like `Tenant` always /// having a properly acquired generation (Secondary doesn't need a generation) #[derive(Clone)] pub(crate) enum TenantSlot { Attached(Arc), Secondary(Arc), /// In this state, other administrative operations acting on the TenantId should /// block, or return a retry indicator equivalent to HTTP 503. 
InProgress(utils::completion::Barrier), } impl std::fmt::Debug for TenantSlot { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Attached(tenant) => write!(f, "Attached({})", tenant.current_state()), Self::Secondary(_) => write!(f, "Secondary"), Self::InProgress(_) => write!(f, "InProgress"), } } } impl TenantSlot { /// Return the `Tenant` in this slot if attached, else None fn get_attached(&self) -> Option<&Arc> { match self { Self::Attached(t) => Some(t), Self::Secondary(_) => None, Self::InProgress(_) => None, } } } /// The tenants known to the pageserver. /// The enum variants are used to distinguish the different states that the pageserver can be in. pub(crate) enum TenantsMap { /// [`init_tenant_mgr`] is not done yet. Initializing, /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded. /// New tenants can be added using [`TenantManager::tenant_map_acquire_slot`]. Open(BTreeMap), /// The pageserver has entered shutdown mode via [`TenantManager::shutdown`]. /// Existing tenants are still accessible, but no new tenants can be created. ShuttingDown(BTreeMap), } /// When resolving a TenantId to a shard, we may be looking for the 0th /// shard, or we might be looking for whichever shard holds a particular page. #[derive(Copy, Clone)] pub(crate) enum ShardSelector { /// Only return the 0th shard, if it is present. If a non-0th shard is present, /// ignore it. Zero, /// Pick the shard that holds this key Page(Key), /// The shard ID is known: pick the given shard Known(ShardIndex), } /// A convenience for use with the re_attach ControllerUpcallClient function: rather /// than the serializable struct, we build this enum that encapsulates /// the invariant that attached tenants always have generations. /// /// This represents the subset of a LocationConfig that we receive during re-attach. 
pub(crate) enum TenantStartupMode {
    Attached((AttachmentMode, Generation, ShardStripeSize)),
    Secondary,
}

impl TenantStartupMode {
    /// Return the generation & mode that should be used when starting
    /// this tenant.
    ///
    /// If this returns None, the re-attach struct is in an invalid state and
    /// should be ignored in the response.
    fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option<Self> {
        match (rart.mode, rart.r#gen) {
            (LocationConfigMode::Detached, _) => None,
            (LocationConfigMode::Secondary, _) => Some(Self::Secondary),
            (LocationConfigMode::AttachedMulti, Some(g)) => Some(Self::Attached((
                AttachmentMode::Multi,
                Generation::new(g),
                rart.stripe_size,
            ))),
            (LocationConfigMode::AttachedSingle, Some(g)) => Some(Self::Attached((
                AttachmentMode::Single,
                Generation::new(g),
                rart.stripe_size,
            ))),
            (LocationConfigMode::AttachedStale, Some(g)) => Some(Self::Attached((
                AttachmentMode::Stale,
                Generation::new(g),
                rart.stripe_size,
            ))),
            // Remaining combinations: an attached mode without a generation.
            _ => {
                tracing::warn!(
                    "Received invalid re-attach state for tenant {}: {rart:?}",
                    rart.id
                );
                None
            }
        }
    }
}

/// Result type for looking up a TenantId to a specific shard
pub(crate) enum ShardResolveResult {
    NotFound,
    Found(Arc<TenantShard>),
    // Wait for this barrier, then query again
    InProgress(utils::completion::Barrier),
}

impl TenantsMap {
    /// Convenience function for typical usage, where we want to get a `Tenant` object, for
    /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
    /// None is returned.
    pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<TenantShard>> {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
                m.get(tenant_shard_id).and_then(|slot| slot.get_attached())
            }
        }
    }

    /// Number of slots in the map; zero while still `Initializing`.
    #[cfg(all(debug_assertions, not(test)))]
    pub(crate) fn len(&self) -> usize {
        match self {
            TenantsMap::Initializing => 0,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.len(),
        }
    }
}

/// Precursor to deletion of a tenant dir: we do a fast rename to a tmp path, and then
/// the slower actual deletion in the background.
///
/// This is "safe" in that it won't leave behind a partially deleted directory
/// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting
/// the contents.
///
/// This is pageserver-specific, as it relies on future processes after a crash to check
/// for TEMP_FILE_SUFFIX when loading things.
async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<Utf8PathBuf> {
    let parent = path
        .as_ref()
        .parent()
        // It is invalid to call this function with a relative path.  Tenant directories
        // should always have a parent.
        .ok_or_else(|| {
            std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                "Path must be absolute",
            )
        })?;
    // Eight random alphanumeric characters followed by TEMP_FILE_SUFFIX.
    let rand_suffix = rand::rng()
        .sample_iter(&Alphanumeric)
        .take(8)
        .map(char::from)
        .collect::<String>()
        + TEMP_FILE_SUFFIX;
    let tmp_path = path_with_suffix_extension(&path, &rand_suffix);
    fs::rename(path.as_ref(), &tmp_path).await?;
    // fsync the parent directory after the rename.
    fs::File::open(parent)
        .await?
        .sync_all()
        .await
        .maybe_fatal_err("safe_rename_tenant_dir")?;
    Ok(tmp_path)
}

/// See [`Self::spawn`].
#[derive(Clone, Default)]
pub struct BackgroundPurges(tokio_util::task::TaskTracker);

impl BackgroundPurges {
    /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
    /// the background, and thereby avoid blocking any API requests on this deletion completing.
    ///
    /// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
    /// Thus the [`BackgroundPurges`] type to keep track of these tasks.
    pub fn spawn(&self, tmp_path: Utf8PathBuf) {
        // because on shutdown we close and wait, we are misusing TaskTracker a bit.
        //
        // so first acquire a token, then check if the tracker has been closed. the tracker might get closed
        // right after, but at least the shutdown will wait for what we are spawning next.
        let token = self.0.token();

        if self.0.is_closed() {
            warn!(
                %tmp_path,
                "trying to spawn background purge during shutdown, ignoring"
            );
            return;
        }

        let span = info_span!(parent: None, "background_purge", %tmp_path);

        let task = move || {
            // Hold the tracker token until the closure finishes.
            let _token = token;
            let _entered = span.entered();
            if let Err(error) = std::fs::remove_dir_all(tmp_path.as_path()) {
                // should we fatal_io_error here?
                warn!(%error, "failed to purge tenant directory");
            }
        };

        BACKGROUND_RUNTIME.spawn_blocking(task);
    }

    /// When this future completes, all background purges have completed.
    /// The first poll of the future will already lock out new background purges spawned via [`Self::spawn`].
    ///
    /// Concurrent calls will coalesce.
    ///
    /// # Cancellation-Safety
    ///
    /// If this future is dropped before polled to completion, concurrent and subsequent
    /// instances of this future will continue to be correct.
    #[instrument(skip_all)]
    pub async fn shutdown(&self) {
        // forbid new tasks (can be called many times)
        self.0.close();
        self.0.wait().await;
    }
}

/// Responsible for storing and mutating the collection of all tenants
/// that this pageserver has state for.
///
/// Every Tenant and SecondaryTenant instance lives inside the TenantManager.
///
/// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach
/// the same tenant twice concurrently, or trying to configure the same tenant into secondary
/// and attached modes concurrently.
pub struct TenantManager { conf: &'static PageServerConf, tenants: std::sync::RwLock, resources: TenantSharedResources, // Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token. // This is for edge cases like tenant deletion. In normal cases (within a Tenant lifetime), // tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or // when the tenant detaches. cancel: CancellationToken, background_purges: BackgroundPurges, } fn emergency_generations( tenant_confs: &HashMap>, ) -> HashMap { tenant_confs .iter() .filter_map(|(tid, lc)| { let lc = match lc { Ok(lc) => lc, Err(_) => return None, }; Some(( *tid, match &lc.mode { LocationMode::Attached(alc) => TenantStartupMode::Attached(( alc.attach_mode, alc.generation, lc.shard.stripe_size, )), LocationMode::Secondary(_) => TenantStartupMode::Secondary, }, )) }) .collect() } async fn init_load_generations( conf: &'static PageServerConf, tenant_confs: &HashMap>, resources: &TenantSharedResources, cancel: &CancellationToken, ) -> anyhow::Result>> { let generations = if conf.control_plane_emergency_mode { error!( "Emergency mode! Tenants will be attached unsafely using their last known generation" ); emergency_generations(tenant_confs) } else { let client = StorageControllerUpcallClient::new(conf, cancel); info!("Calling {} API to re-attach tenants", client.base_url()); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. 
let empty_local_disk = tenant_confs.is_empty(); match client.re_attach(conf, empty_local_disk).await { Ok(tenants) => tenants .into_iter() .flat_map(|(id, rart)| { TenantStartupMode::from_reattach_tenant(rart).map(|tsm| (id, tsm)) }) .collect(), Err(RetryForeverError::ShuttingDown) => { anyhow::bail!("Shut down while waiting for control plane re-attach response") } } }; // The deletion queue needs to know about the startup attachment state to decide which (if any) stored // deletion list entries may still be valid. We provide that by pushing a recovery operation into // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions // are processed, even though we don't block on recovery completing here. let attached_tenants = generations .iter() .flat_map(|(id, start_mode)| { match start_mode { TenantStartupMode::Attached((_mode, generation, _stripe_size)) => Some(generation), TenantStartupMode::Secondary => None, } .map(|gen_| (*id, *gen_)) }) .collect(); resources.deletion_queue_client.recover(attached_tenants)?; Ok(Some(generations)) } /// Given a directory discovered in the pageserver's tenants/ directory, attempt /// to load a tenant config from it. /// /// If we cleaned up something expected (like an empty dir or a temp dir), return None. 
fn load_tenant_config( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, dentry: Utf8DirEntry, ) -> Option> { let tenant_dir_path = dentry.path().to_path_buf(); if crate::is_temporary(&tenant_dir_path) { info!("Found temporary tenant directory, removing: {tenant_dir_path}"); // No need to use safe_remove_tenant_dir_all because this is already // a temporary path std::fs::remove_dir_all(&tenant_dir_path).fatal_err("delete temporary tenant dir"); return None; } // This case happens if we crash during attachment before writing a config into the dir let is_empty = tenant_dir_path .is_empty_dir() .fatal_err("Checking for empty tenant dir"); if is_empty { info!("removing empty tenant directory {tenant_dir_path:?}"); std::fs::remove_dir(&tenant_dir_path).fatal_err("delete empty tenant dir"); return None; } Some(TenantShard::load_tenant_config(conf, &tenant_shard_id)) } /// Initial stage of load: walk the local tenants directory, clean up any temp files, /// and load configurations for the tenants we found. /// /// Do this in parallel, because we expect 10k+ tenants, so serial execution can take /// seconds even on reasonably fast drives. 
async fn init_load_tenant_configs( conf: &'static PageServerConf, ) -> HashMap> { let tenants_dir = conf.tenants_path(); let dentries = tokio::task::spawn_blocking(move || -> Vec { let context = format!("read tenants dir {tenants_dir}"); let dir_entries = tenants_dir.read_dir_utf8().fatal_err(&context); dir_entries .collect::, std::io::Error>>() .fatal_err(&context) }) .await .expect("Config load task panicked"); let mut configs = HashMap::new(); let mut join_set = JoinSet::new(); for dentry in dentries { let tenant_shard_id = match dentry.file_name().parse::() { Ok(id) => id, Err(_) => { warn!( "Invalid tenant path (garbage in our repo directory?): '{}'", dentry.file_name() ); continue; } }; join_set.spawn_blocking(move || { ( tenant_shard_id, load_tenant_config(conf, tenant_shard_id, dentry), ) }); } while let Some(r) = join_set.join_next().await { let (tenant_shard_id, tenant_config) = r.expect("Panic in config load task"); if let Some(tenant_config) = tenant_config { configs.insert(tenant_shard_id, tenant_config); } } configs } #[derive(Debug, thiserror::Error)] pub(crate) enum DeleteTenantError { #[error("Tenant map slot error {0}")] SlotError(#[from] TenantSlotError), #[error("Cancelled")] Cancelled, #[error(transparent)] Other(#[from] anyhow::Error), } /// Initialize repositories at `Initializing` state. pub fn init( conf: &'static PageServerConf, background_purges: BackgroundPurges, resources: TenantSharedResources, cancel: CancellationToken, ) -> TenantManager { TenantManager { conf, tenants: std::sync::RwLock::new(TenantsMap::Initializing), resources, cancel, background_purges, } } /// Transition repositories from `Initializing` state to `Open` state with locally available timelines. /// Timelines that are only partially available locally (remote storage has more data than this pageserver) /// are scheduled for download and added to the tenant once download is completed. 
#[instrument(skip_all)] pub async fn init_tenant_mgr( tenant_manager: Arc, init_order: InitializationOrder, ) -> anyhow::Result<()> { debug_assert!(matches!( *tenant_manager.tenants.read().unwrap(), TenantsMap::Initializing )); let mut tenants = BTreeMap::new(); let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn); let conf = tenant_manager.conf; let resources = &tenant_manager.resources; let cancel = &tenant_manager.cancel; let background_purges = &tenant_manager.background_purges; // Initialize dynamic limits that depend on system resources let system_memory = sysinfo::System::new_with_specifics(sysinfo::RefreshKind::new().with_memory()) .total_memory(); let max_ephemeral_layer_bytes = conf.ephemeral_bytes_per_memory_kb as u64 * (system_memory / 1024); tracing::info!( "Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory" ); inmemory_layer::GLOBAL_RESOURCES.max_dirty_bytes.store( max_ephemeral_layer_bytes, std::sync::atomic::Ordering::Relaxed, ); // Scan local filesystem for attached tenants let tenant_configs = init_load_tenant_configs(conf).await; // Determine which tenants are to be secondary or attached, and in which generation let tenant_modes = init_load_generations(conf, &tenant_configs, resources, cancel).await?; // Hadron local SSD check: Raise an alert if our local filesystem does not contain any tenants but the re-attach request returned tenants. // This can happen if the PS suffered a Kubernetes node failure resulting in loss of all local data, but recovered quickly on another node // so the Storage Controller has not had the time to move tenants out. 
let data_loss_suspected = if let Some(tenant_modes) = &tenant_modes { tenant_configs.is_empty() && !tenant_modes.is_empty() } else { false }; if data_loss_suspected { tracing::error!( "Local data loss suspected: no tenants found on local filesystem, but re-attach request returned tenants" ); } LOCAL_DATA_LOSS_SUSPECTED.set(if data_loss_suspected { 1 } else { 0 }); tracing::info!( "Attaching {} tenants at startup, warming up {} at a time", tenant_configs.len(), conf.concurrent_tenant_warmup.initial_permits() ); TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64); // Accumulate futures for writing tenant configs, so that we can execute in parallel let mut config_write_futs = Vec::new(); // Update the location configs according to the re-attach response and persist them to disk tracing::info!("Updating {} location configs", tenant_configs.len()); for (tenant_shard_id, location_conf) in tenant_configs { let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let mut location_conf = match location_conf { Ok(l) => l, Err(e) => { // This should only happen in the case of a serialization bug or critical local I/O error: we cannot load this tenant error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to load tenant config, failed to {e:#}"); continue; } }; // FIXME: if we were attached, and get demoted to secondary on re-attach, we // don't have a place to get a config. // (https://github.com/neondatabase/neon/issues/5377) const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig = SecondaryLocationConfig { warm: true }; if let Some(tenant_modes) = &tenant_modes { // We have a generation map: treat it as the authority for whether // this tenant is really attached. 
match tenant_modes.get(&tenant_shard_id) { None => { info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response"); match safe_rename_tenant_dir(&tenant_dir_path).await { Ok(tmp_path) => { background_purges.spawn(tmp_path); } Err(e) => { error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to move detached tenant directory '{tenant_dir_path}': {e:?}"); } }; // We deleted local content: move on to next tenant, don't try and spawn this one. continue; } Some(TenantStartupMode::Secondary) => { if !matches!(location_conf.mode, LocationMode::Secondary(_)) { location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF); } } Some(TenantStartupMode::Attached((attach_mode, generation, stripe_size))) => { let old_gen_higher = match &location_conf.mode { LocationMode::Attached(AttachedLocationConfig { generation: old_generation, attach_mode: _attach_mode, }) => { if old_generation > generation { Some(old_generation) } else { None } } _ => None, }; if let Some(old_generation) = old_gen_higher { tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Control plane gave decreasing generation ({generation:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary", old_generation ); // We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away // local disk content: demote to secondary rather than detaching. 
location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF); } else { location_conf.attach_in_generation(*attach_mode, *generation, *stripe_size); } } } } else { // Legacy mode: no generation information, any tenant present // on local disk may activate info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",); }; // Presence of a generation number implies attachment: attach the tenant // if it wasn't already, and apply the generation number. config_write_futs.push(async move { let r = TenantShard::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await; (tenant_shard_id, location_conf, r) }); } // Execute config writes with concurrency, to avoid bottlenecking on local FS write latency tracing::info!( "Writing {} location config files...", config_write_futs.len() ); let config_write_results = futures::stream::iter(config_write_futs) .buffer_unordered(16) .collect::>() .await; tracing::info!( "Spawning {} tenant shard locations...", config_write_results.len() ); // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running for (tenant_shard_id, location_conf, config_write_result) in config_write_results { // Writing a config to local disk is foundational to startup up tenants: panic if we can't. 
config_write_result.fatal_err("write tenant shard config file"); let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; let slot = match location_conf.mode { LocationMode::Attached(attached_conf) => TenantSlot::Attached( tenant_spawn( conf, tenant_shard_id, &tenant_dir_path, resources.clone(), AttachedTenantConf::new(conf, location_conf.tenant_conf, attached_conf), shard_identity, Some(init_order.clone()), SpawnMode::Lazy, &ctx, ) .expect("global shutdown during init_tenant_mgr cannot happen"), ), LocationMode::Secondary(secondary_conf) => { info!( tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), "Starting secondary tenant" ); TenantSlot::Secondary(SecondaryTenant::new( tenant_shard_id, shard_identity, location_conf.tenant_conf, &secondary_conf, )) } }; METRICS.slot_inserted(&slot); tenants.insert(tenant_shard_id, slot); } info!("Processed {} local tenants at startup", tenants.len()); let mut tenant_map = tenant_manager.tenants.write().unwrap(); *tenant_map = TenantsMap::Open(tenants); Ok(()) } /// Wrapper for Tenant::spawn that checks invariants before running #[allow(clippy::too_many_arguments)] fn tenant_spawn( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, tenant_path: &Utf8Path, resources: TenantSharedResources, location_conf: AttachedTenantConf, shard_identity: ShardIdentity, init_order: Option, mode: SpawnMode, ctx: &RequestContext, ) -> Result, GlobalShutDown> { // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed // path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode // to avoid impacting prod runtime performance. 
assert!(!crate::is_temporary(tenant_path)); debug_assert!(tenant_path.is_dir()); debug_assert!( conf.tenant_location_config_path(&tenant_shard_id) .try_exists() .unwrap() ); TenantShard::spawn( conf, tenant_shard_id, resources, location_conf, shard_identity, init_order, mode, ctx, ) } #[derive(thiserror::Error, Debug)] pub(crate) enum UpsertLocationError { #[error("Bad config request: {0}")] BadRequest(anyhow::Error), #[error("Cannot change config in this state: {0}")] Unavailable(#[from] TenantMapError), #[error("Tenant is already being modified")] InProgress, #[error("Failed to flush: {0}")] Flush(anyhow::Error), /// This error variant is for unexpected situations (soft assertions) where the system is in an unexpected state. #[error("Internal error: {0}")] InternalError(anyhow::Error), } impl TenantManager { /// Convenience function so that anyone with a TenantManager can get at the global configuration, without /// having to pass it around everywhere as a separate object. pub(crate) fn get_conf(&self) -> &'static PageServerConf { self.conf } /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently /// undergoing a state change (i.e. slot is InProgress). /// /// The return TenantShard is not guaranteed to be active: check its status after obtaing it, or /// use [`TenantShard::wait_to_become_active`] before using it if you will do I/O on it. 
pub(crate) fn get_attached_tenant_shard( &self, tenant_shard_id: TenantShardId, ) -> Result, GetTenantError> { let locked = self.tenants.read().unwrap(); let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?; match peek_slot { Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)), Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)), None | Some(TenantSlot::Secondary(_)) => { Err(GetTenantError::ShardNotFound(tenant_shard_id)) } } } pub(crate) fn get_secondary_tenant_shard( &self, tenant_shard_id: TenantShardId, ) -> Option> { let locked = self.tenants.read().unwrap(); let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) .ok() .flatten(); match peek_slot { Some(TenantSlot::Secondary(s)) => Some(s.clone()), _ => None, } } /// Whether the `TenantManager` is responsible for the tenant shard pub(crate) fn manages_tenant_shard(&self, tenant_shard_id: TenantShardId) -> bool { let locked = self.tenants.read().unwrap(); let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) .ok() .flatten(); peek_slot.is_some() } /// Returns whether a local shard exists that's a child of the given tenant shard. Note that /// this just checks for any shard with a larger shard count, and it may not be a direct child /// of the given shard (their keyspace may not overlap). 
pub(crate) fn has_child_shard(&self, tenant_id: TenantId, shard_index: ShardIndex) -> bool { match &*self.tenants.read().unwrap() { TenantsMap::Initializing => false, TenantsMap::Open(slots) | TenantsMap::ShuttingDown(slots) => slots .range(TenantShardId::tenant_range(tenant_id)) .any(|(tsid, _)| tsid.shard_count > shard_index.shard_count), } } #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(crate) async fn upsert_location( &self, tenant_shard_id: TenantShardId, new_location_config: LocationConf, flush: Option, mut spawn_mode: SpawnMode, ctx: &RequestContext, ) -> Result>, UpsertLocationError> { debug_assert_current_span_has_tenant_id(); info!("configuring tenant location to state {new_location_config:?}"); enum FastPathModified { Attached(Arc), Secondary(Arc), } // Special case fast-path for updates to existing slots: if our upsert is only updating configuration, // then we do not need to set the slot to InProgress, we can just call into the // existng tenant. let fast_path_taken = { let locked = self.tenants.read().unwrap(); let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?; match (&new_location_config.mode, peek_slot) { (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => { match attach_conf.generation.cmp(&tenant.generation) { Ordering::Equal => { // A transition from Attached to Attached in the same generation, we may // take our fast path and just provide the updated configuration // to the tenant. 
tenant.set_new_location_config( AttachedTenantConf::try_from( self.conf, new_location_config.clone(), ) .map_err(UpsertLocationError::BadRequest)?, ); Some(FastPathModified::Attached(tenant.clone())) } Ordering::Less => { return Err(UpsertLocationError::BadRequest(anyhow::anyhow!( "Generation {:?} is less than existing {:?}", attach_conf.generation, tenant.generation ))); } Ordering::Greater => { // Generation advanced, fall through to general case of replacing `Tenant` object None } } } ( LocationMode::Secondary(secondary_conf), Some(TenantSlot::Secondary(secondary_tenant)), ) => { secondary_tenant.set_config(secondary_conf); secondary_tenant.set_tenant_conf(&new_location_config.tenant_conf); Some(FastPathModified::Secondary(secondary_tenant.clone())) } _ => { // Not an Attached->Attached transition, fall through to general case None } } }; // Fast-path continued: having dropped out of the self.tenants lock, do the async // phase of writing config and/or waiting for flush, before returning. match fast_path_taken { Some(FastPathModified::Attached(tenant)) => { tenant .shard_identity .assert_equal(new_location_config.shard); TenantShard::persist_tenant_config( self.conf, &tenant_shard_id, &new_location_config, ) .await .fatal_err("write tenant shard config"); // Transition to AttachedStale means we may well hold a valid generation // still, and have been requested to go stale as part of a migration. If // the caller set `flush`, then flush to remote storage. if let LocationMode::Attached(AttachedLocationConfig { generation: _, attach_mode: AttachmentMode::Stale, }) = &new_location_config.mode { if let Some(flush_timeout) = flush { match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await { Ok(Err(e)) => { return Err(UpsertLocationError::Flush(e)); } Ok(Ok(_)) => return Ok(Some(tenant)), Err(_) => { tracing::warn!( timeout_ms = flush_timeout.as_millis(), "Timed out waiting for flush to remote storage, proceeding anyway." 
) } } } } return Ok(Some(tenant)); } Some(FastPathModified::Secondary(secondary_tenant)) => { secondary_tenant .shard_identity .assert_equal(new_location_config.shard); TenantShard::persist_tenant_config( self.conf, &tenant_shard_id, &new_location_config, ) .await .fatal_err("write tenant shard config"); return Ok(None); } None => { // Proceed with the general case procedure, where we will shutdown & remove any existing // slot contents and replace with a fresh one } }; // General case for upserts to TenantsMap, excluding the case above: we will substitute an // InProgress value to the slot while we make whatever changes are required. The state for // the tenant is inaccessible to the outside world while we are doing this, but that is sensible: // the state is ill-defined while we're in transition. Transitions are async, but fast: we do // not do significant I/O, and shutdowns should be prompt via cancellation tokens. let mut slot_guard = self .tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any) .map_err(|e| match e { TenantSlotError::NotFound(_) => { unreachable!("Called with mode Any") } TenantSlotError::InProgress => UpsertLocationError::InProgress, TenantSlotError::MapState(s) => UpsertLocationError::Unavailable(s), })?; match slot_guard.get_old_value() { Some(TenantSlot::Attached(tenant)) => { tenant .shard_identity .assert_equal(new_location_config.shard); // The case where we keep a Tenant alive was covered above in the special case // for Attached->Attached transitions in the same generation. By this point, // if we see an attached tenant we know it will be discarded and should be // shut down. let (_guard, progress) = utils::completion::channel(); match tenant.get_attach_mode() { AttachmentMode::Single | AttachmentMode::Multi => { // Before we leave our state as the presumed holder of the latest generation, // flush any outstanding deletions to reduce the risk of leaking objects. 
self.resources.deletion_queue_client.flush_advisory() } AttachmentMode::Stale => { // If we're stale there's not point trying to flush deletions } }; info!("Shutting down attached tenant"); match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => {} Err(barrier) => { info!("Shutdown already in progress, waiting for it to complete"); barrier.wait().await; } } slot_guard.drop_old_value().expect("We just shut it down"); // Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then // the caller thinks they're creating but the tenant already existed. We must switch to // Eager mode so that when starting this Tenant we properly probe remote storage for timelines, // rather than assuming it to be empty. spawn_mode = SpawnMode::Eager; } Some(TenantSlot::Secondary(secondary_tenant)) => { secondary_tenant .shard_identity .assert_equal(new_location_config.shard); info!("Shutting down secondary tenant"); secondary_tenant.shutdown().await; } Some(TenantSlot::InProgress(_)) => { // This should never happen: acquire_slot should error out // if the contents of a slot were InProgress. return Err(UpsertLocationError::InternalError(anyhow::anyhow!( "Acquired an InProgress slot, this is a bug." ))); } None => { // Slot was vacant, nothing needs shutting down. } } let tenant_path = self.conf.tenant_path(&tenant_shard_id); let timelines_path = self.conf.timelines_path(&tenant_shard_id); // Directory structure is the same for attached and secondary modes: // create it if it doesn't exist. Timeline load/creation expects the // timelines/ subdir to already exist. // // Does not need to be fsync'd because local storage is just a cache. tokio::fs::create_dir_all(&timelines_path) .await .fatal_err("create timelines/ dir"); // Before activating either secondary or attached mode, persist the // configuration, so that on restart we will re-attach (or re-start // secondary) on the tenant. 
TenantShard::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) .await .fatal_err("write tenant shard config"); let new_slot = match &new_location_config.mode { LocationMode::Secondary(secondary_config) => { let shard_identity = new_location_config.shard; TenantSlot::Secondary(SecondaryTenant::new( tenant_shard_id, shard_identity, new_location_config.tenant_conf, secondary_config, )) } LocationMode::Attached(_attach_config) => { let shard_identity = new_location_config.shard; // Testing hack: if we are configured with no control plane, then drop the generation // from upserts. This enables creating generation-less tenants even though neon_local // always uses generations when calling the location conf API. let attached_conf = AttachedTenantConf::try_from(self.conf, new_location_config) .map_err(UpsertLocationError::BadRequest)?; let tenant = tenant_spawn( self.conf, tenant_shard_id, &tenant_path, self.resources.clone(), attached_conf, shard_identity, None, spawn_mode, ctx, ) .map_err(|_: GlobalShutDown| { UpsertLocationError::Unavailable(TenantMapError::ShuttingDown) })?; TenantSlot::Attached(tenant) } }; let attached_tenant = if let TenantSlot::Attached(tenant) = &new_slot { Some(tenant.clone()) } else { None }; match slot_guard.upsert(new_slot) { Err(TenantSlotUpsertError::InternalError(e)) => { Err(UpsertLocationError::InternalError(anyhow::anyhow!(e))) } Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)), Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => { // If we just called tenant_spawn() on a new tenant, and can't insert it into our map, then // we must not leak it: this would violate the invariant that after shutdown_all_tenants, all tenants // are shutdown. // // We must shut it down inline here. 
match new_slot { TenantSlot::InProgress(_) => { // Unreachable because we never insert an InProgress unreachable!() } TenantSlot::Attached(tenant) => { let (_guard, progress) = utils::completion::channel(); info!( "Shutting down just-spawned tenant, because tenant manager is shut down" ); match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { info!("Finished shutting down just-spawned tenant"); } Err(barrier) => { info!("Shutdown already in progress, waiting for it to complete"); barrier.wait().await; } } } TenantSlot::Secondary(secondary_tenant) => { secondary_tenant.shutdown().await; } } Err(UpsertLocationError::Unavailable( TenantMapError::ShuttingDown, )) } Ok(()) => Ok(attached_tenant), } } fn tenant_map_acquire_slot( &self, tenant_shard_id: &TenantShardId, mode: TenantSlotAcquireMode, ) -> Result { use TenantSlotAcquireMode::*; METRICS.tenant_slot_writes.inc(); let mut locked = self.tenants.write().unwrap(); let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()); let _guard = span.enter(); let m = match &mut *locked { TenantsMap::Initializing => return Err(TenantMapError::StillInitializing.into()), TenantsMap::ShuttingDown(_) => return Err(TenantMapError::ShuttingDown.into()), TenantsMap::Open(m) => m, }; use std::collections::btree_map::Entry; let entry = m.entry(*tenant_shard_id); match entry { Entry::Vacant(v) => match mode { MustExist => { tracing::debug!("Vacant && MustExist: return NotFound"); Err(TenantSlotError::NotFound(*tenant_shard_id)) } _ => { let (completion, barrier) = utils::completion::channel(); let inserting = TenantSlot::InProgress(barrier); METRICS.slot_inserted(&inserting); v.insert(inserting); tracing::debug!("Vacant, inserted InProgress"); Ok(SlotGuard::new( *tenant_shard_id, None, completion, &self.tenants, )) } }, Entry::Occupied(mut o) => { // Apply mode-driven checks match (o.get(), mode) { (TenantSlot::InProgress(_), _) => { 
tracing::debug!("Occupied, failing for InProgress"); Err(TenantSlotError::InProgress) } _ => { // Happy case: the slot was not in any state that violated our mode let (completion, barrier) = utils::completion::channel(); let in_progress = TenantSlot::InProgress(barrier); METRICS.slot_inserted(&in_progress); let old_value = o.insert(in_progress); METRICS.slot_removed(&old_value); tracing::debug!("Occupied, replaced with InProgress"); Ok(SlotGuard::new( *tenant_shard_id, Some(old_value), completion, &self.tenants, )) } } } } } /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same /// LocationConf that was last used to attach it. Optionally, the local file cache may be /// dropped before re-attaching. /// /// This is not part of a tenant's normal lifecycle: it is used for debug/support, in situations /// where an issue is identified that would go away with a restart of the tenant. /// /// This does not have any special "force" shutdown of a tenant: it relies on the tenant's tasks /// to respect the cancellation tokens used in normal shutdown(). 
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %drop_cache))] pub(crate) async fn reset_tenant( &self, tenant_shard_id: TenantShardId, drop_cache: bool, ctx: &RequestContext, ) -> anyhow::Result<()> { let mut slot_guard = self.tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; let Some(old_slot) = slot_guard.get_old_value() else { anyhow::bail!("Tenant not found when trying to reset"); }; let Some(tenant) = old_slot.get_attached() else { slot_guard.revert(); anyhow::bail!("Tenant is not in attached state"); }; let (_guard, progress) = utils::completion::channel(); match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { slot_guard.drop_old_value()?; } Err(_barrier) => { slot_guard.revert(); anyhow::bail!("Cannot reset Tenant, already shutting down"); } } let tenant_path = self.conf.tenant_path(&tenant_shard_id); let timelines_path = self.conf.timelines_path(&tenant_shard_id); let config = TenantShard::load_tenant_config(self.conf, &tenant_shard_id)?; if drop_cache { tracing::info!("Dropping local file cache"); match tokio::fs::read_dir(&timelines_path).await { Err(e) => { tracing::warn!("Failed to list timelines while dropping cache: {}", e); } Ok(mut entries) => { while let Some(entry) = entries.next_entry().await? 
{ tokio::fs::remove_dir_all(entry.path()).await?; } } } } let shard_identity = config.shard; let tenant = tenant_spawn( self.conf, tenant_shard_id, &tenant_path, self.resources.clone(), AttachedTenantConf::try_from(self.conf, config)?, shard_identity, None, SpawnMode::Eager, ctx, )?; slot_guard.upsert(TenantSlot::Attached(tenant))?; Ok(()) } pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec> { let locked = self.tenants.read().unwrap(); match &*locked { TenantsMap::Initializing => Vec::new(), TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => map .values() .filter_map(|slot| { slot.get_attached() .and_then(|t| if t.is_active() { Some(t.clone()) } else { None }) }) .collect(), } } // Do some synchronous work for all tenant slots in Secondary state. The provided // callback should be small and fast, as it will be called inside the global // TenantsMap lock. pub(crate) fn foreach_secondary_tenants(&self, mut func: F) where // TODO: let the callback return a hint to drop out of the loop early F: FnMut(&TenantShardId, &Arc), { let locked = self.tenants.read().unwrap(); let map = match &*locked { TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return, TenantsMap::Open(m) => m, }; for (tenant_id, slot) in map { if let TenantSlot::Secondary(state) = slot { // Only expose secondary tenants that are not currently shutting down if !state.cancel.is_cancelled() { func(tenant_id, state) } } } } /// Total list of all tenant slots: this includes attached, secondary, and InProgress. 
pub(crate) fn list(&self) -> Vec<(TenantShardId, TenantSlot)> {
        let locked = self.tenants.read().unwrap();
        match &*locked {
            // Nothing to report until the map has been opened.
            TenantsMap::Initializing => Vec::new(),
            TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => map
                .iter()
                .map(|(tenant_shard_id, slot)| (*tenant_shard_id, slot.clone()))
                .collect(),
        }
    }

    /// Returns a clone of the slot for `tenant_shard_id`, whatever state it is in, or `None`
    /// if there is no such slot or the map is still `Initializing`.
    pub(crate) fn get(&self, tenant_shard_id: TenantShardId) -> Option<TenantSlot> {
        let locked = self.tenants.read().unwrap();
        match &*locked {
            TenantsMap::Initializing => None,
            TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => {
                map.get(&tenant_shard_id).cloned()
            }
        }
    }

    /// If a tenant is attached, detach it. Then remove its data from remote storage.
    ///
    /// A tenant is considered deleted once it is gone from remote storage. It is the caller's
    /// responsibility to avoid trying to attach the tenant again or use it any way once deletion
    /// has started: this operation is not atomic, and must be retried until it succeeds.
    ///
    /// As a special case, if an unsharded tenant ID is given for a sharded tenant, it will remove
    /// all tenant shards in remote storage (removing all paths with the tenant prefix). The storage
    /// controller uses this to purge all remote tenant data, including any stale parent shards that
    /// may remain after splits. Ideally, this special case would be handled elsewhere. See:
    /// .
pub(crate) async fn delete_tenant( &self, tenant_shard_id: TenantShardId, ) -> Result<(), DeleteTenantError> { super::span::debug_assert_current_span_has_tenant_id(); async fn delete_local( conf: &PageServerConf, background_purges: &BackgroundPurges, tenant_shard_id: &TenantShardId, ) -> anyhow::Result<()> { let local_tenant_directory = conf.tenant_path(tenant_shard_id); let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory) .await .with_context(|| { format!("local tenant directory {local_tenant_directory:?} rename") })?; background_purges.spawn(tmp_dir); Ok(()) } let slot_guard = self.tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; match &slot_guard.old_value { Some(TenantSlot::Attached(tenant)) => { // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and // deletion will be resumed across restarts. let tenant = tenant.clone(); let (_guard, progress) = utils::completion::channel(); match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => {} Err(barrier) => { info!("Shutdown already in progress, waiting for it to complete"); barrier.wait().await; } } delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?; } Some(TenantSlot::Secondary(secondary_tenant)) => { secondary_tenant.shutdown().await; delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?; } Some(TenantSlot::InProgress(_)) => unreachable!(), None => {} }; // Fall through: local state for this tenant is no longer present, proceed with remote delete. // - We use a retry wrapper here so that common transient S3 errors (e.g. 503, 429) do not result // in 500 responses to delete requests. // - We keep the `SlotGuard` during this I/O, so that if a concurrent delete request comes in, it will // 503/retry, rather than kicking off a wasteful concurrent deletion. // NB: this also deletes partial prefixes, i.e. a path will delete all // _/* objects. See method comment for why. 
backoff::retry( || async move { self.resources .remote_storage .delete_prefix(&remote_tenant_path(&tenant_shard_id), &self.cancel) .await }, |_| false, // backoff::retry handles cancellation 1, 3, &format!("delete_tenant[tenant_shard_id={tenant_shard_id}]"), &self.cancel, ) .await .unwrap_or(Err(TimeoutOrCancel::Cancel.into())) .map_err(|err| { if TimeoutOrCancel::caused_by_cancel(&err) { return DeleteTenantError::Cancelled; } DeleteTenantError::Other(err) }) } #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))] pub(crate) async fn shard_split( &self, tenant: Arc, new_shard_count: ShardCount, new_stripe_size: Option, ctx: &RequestContext, ) -> anyhow::Result> { let tenant_shard_id = *tenant.get_tenant_shard_id(); let r = self .do_shard_split(tenant, new_shard_count, new_stripe_size, ctx) .await; if r.is_err() { // Shard splitting might have left the original shard in a partially shut down state (it // stops the shard's remote timeline client). Reset it to ensure we leave things in // a working state. if self.get(tenant_shard_id).is_some() { tracing::warn!("Resetting after shard split failure"); if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await { // Log this error because our return value will still be the original error, not this one. This is // a severe error: if this happens, we might be leaving behind a tenant that is not fully functional // (e.g. has uploads disabled). We can't do anything else: if reset fails then shutting the tenant down or // setting it broken probably won't help either. 
tracing::error!("Failed to reset: {e}");
            }
        }
    }

    r
}

/// Performs the actual shard split of `tenant` into `new_shard_count` shards and returns the
/// ids of the child shards that were created on this pageserver.
///
/// The work proceeds in phases: validate the request, write the children's remote index files
/// (`split_prepare`), take the parent's slot (making it InProgress), hard-link resident layers
/// into the children, spawn the children, wait (best effort) for them to catch up with the
/// parent's last record LSNs, shut the parent down and finally release its slot.
///
/// Errors are returned as `anyhow::Error`; [`Self::shard_split`] is responsible for resetting
/// the parent shard if this function fails part-way through.
// NOTE(review): the generic arguments of `Arc`, `Option` and `anyhow::Result` are missing from
// this copy of the signature; restore them from the upstream source.
pub(crate) async fn do_shard_split(
    &self,
    tenant: Arc,
    new_shard_count: ShardCount,
    new_stripe_size: Option,
    ctx: &RequestContext,
) -> anyhow::Result> {
    let tenant_shard_id = *tenant.get_tenant_shard_id();

    // Validate the incoming request
    let old_count = tenant_shard_id.shard_count.count();
    let new_count = new_shard_count.count();
    if new_count <= old_count {
        anyhow::bail!("Requested shard count is not an increase");
    }
    // Integer division truncates: without this check a request such as 2 -> 3 shards would
    // produce an expansion factor of 1, which `is_power_of_two()` accepts. Require the new
    // count to be an exact multiple of the old one before looking at the factor.
    if new_count % old_count != 0 {
        anyhow::bail!("Requested split is not a power of two");
    }
    let expansion_factor = new_count / old_count;
    if !expansion_factor.is_power_of_two() {
        anyhow::bail!("Requested split is not a power of two");
    }

    if let Some(new_stripe_size) = new_stripe_size {
        if tenant.get_shard_stripe_size() != new_stripe_size && old_count > 1 {
            // This tenant already has multiple shards, it is illegal to try and change its stripe size
            anyhow::bail!(
                "Shard stripe size may not be modified once tenant has multiple shards"
            );
        }
    }

    // Plan: identify what the new child shards will be
    let child_shards = tenant_shard_id.split(new_shard_count);
    tracing::info!(
        "Shard {} splits into: {}",
        tenant_shard_id.to_index(),
        child_shards
            .iter()
            .map(|id| format!("{}", id.to_index()))
            .join(",")
    );

    fail::fail_point!("shard-split-pre-prepare", |_| Err(anyhow::anyhow!(
        "failpoint"
    )));

    // Capture everything we need from the parent now: the `tenant` handle is dropped in phase 2.
    let parent_shard_identity = tenant.shard_identity;
    let parent_tenant_conf = tenant.get_tenant_conf();
    let parent_generation = tenant.generation;

    // Phase 1: Write out child shards' remote index files, in the parent tenant's current generation
    if let Err(e) = tenant.split_prepare(&child_shards).await {
        // If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
        // have been left in a partially-shut-down state.
        tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning");
        return Err(e);
    }

    fail::fail_point!("shard-split-post-prepare", |_| Err(anyhow::anyhow!(
        "failpoint"
    )));

    self.resources.deletion_queue_client.flush_advisory();

    // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
    //
    // TODO: keeping the parent as InProgress while spawning the children causes read
    // unavailability, as we can't acquire a new timeline handle for it (existing handles appear
    // to still work though, even downgraded ones). The parent should be available for reads
    // until the children are ready -- potentially until *all* subsplits across all parent
    // shards are complete and the compute has been notified. See:
    // .
    // NOTE(review): the link that followed "See:" is missing in this copy of the file.
    drop(tenant);
    let mut parent_slot_guard =
        self.tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
    let parent = match parent_slot_guard.get_old_value() {
        Some(TenantSlot::Attached(t)) => t,
        Some(TenantSlot::Secondary(_)) => anyhow::bail!("Tenant location in secondary mode"),
        Some(TenantSlot::InProgress(_)) => {
            // tenant_map_acquire_slot never returns InProgress, if a slot was InProgress
            // it would return an error.
            unreachable!()
        }
        None => {
            // We don't actually need the parent shard to still be attached to do our work, but it's
            // a weird enough situation that the caller probably didn't want us to continue working
            // if they had detached the tenant they requested the split on.
            anyhow::bail!("Detached parent shard in the middle of split!")
        }
    };
    fail::fail_point!("shard-split-pre-hardlink", |_| Err(anyhow::anyhow!(
        "failpoint"
    )));
    // Optimization: hardlink layers from the parent into the children, so that they don't have to
    // re-download & duplicate the data referenced in their initial IndexPart
    self.shard_split_hardlink(parent, child_shards.clone())
        .await?;
    fail::fail_point!("shard-split-post-hardlink", |_| Err(anyhow::anyhow!(
        "failpoint"
    )));

    // Take a snapshot of where the parent's WAL ingest had got to: we will wait for
    // child shards to reach this point.
    let mut target_lsns = HashMap::new();
    for timeline in parent.timelines.lock().unwrap().clone().values() {
        target_lsns.insert(timeline.timeline_id, timeline.get_last_record_lsn());
    }

    // TODO: we should have the parent shard stop its WAL ingest here, it's a waste of resources
    // and could slow down the children trying to catch up.

    // Phase 3: Spawn the child shards
    for child_shard in &child_shards {
        // Children inherit the parent's identity, except for count/number and (optionally)
        // the stripe size.
        let mut child_shard_identity = parent_shard_identity;
        if let Some(new_stripe_size) = new_stripe_size {
            child_shard_identity.stripe_size = new_stripe_size;
        }
        child_shard_identity.count = child_shard.shard_count;
        child_shard_identity.number = child_shard.shard_number;

        let child_location_conf = LocationConf {
            mode: LocationMode::Attached(AttachedLocationConfig {
                generation: parent_generation,
                attach_mode: AttachmentMode::Single,
            }),
            shard: child_shard_identity,
            tenant_conf: parent_tenant_conf.clone(),
        };

        self.upsert_location(
            *child_shard,
            child_location_conf,
            None,
            SpawnMode::Eager,
            ctx,
        )
        .await?;
    }

    fail::fail_point!("shard-split-post-child-conf", |_| Err(anyhow::anyhow!(
        "failpoint"
    )));

    // Phase 4: wait for child shards' WAL ingest to catch up to target LSN
    for child_shard_id in &child_shards {
        let child_shard_id = *child_shard_id;
        let child_shard = {
            // Scope the read lock so that it is released before any `.await` below.
            let locked = self.tenants.read().unwrap();
            let peek_slot =
                tenant_map_peek_slot(&locked, &child_shard_id, TenantSlotPeekMode::Read)?;
            peek_slot.and_then(|s| s.get_attached()).cloned()
        };
        if let Some(t) = child_shard {
            // Wait for the child shard to become active: this should be very quick because it only
            // has to download the index_part that we just uploaded when creating it.
            if let Err(e) = t.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await {
                // This is not fatal: we have durably created the child shard.  It just makes the
                // split operation less seamless for clients, as we may detach the parent
                // shard before the child shards are fully ready to serve requests.
                tracing::warn!("Failed to wait for shard {child_shard_id} to activate: {e}");
                continue;
            }

            let timelines = t.timelines.lock().unwrap().clone();
            for timeline in timelines.values() {
                // Timelines the parent did not have at snapshot time have no target to reach.
                let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else {
                    continue;
                };

                tracing::info!(
                    "Waiting for child shard {}/{} to reach target lsn {}...",
                    child_shard_id,
                    timeline.timeline_id,
                    target_lsn
                );

                fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
                    "failpoint"
                )));
                if let Err(e) = timeline
                    .wait_lsn(
                        *target_lsn,
                        crate::tenant::timeline::WaitLsnWaiter::Tenant,
                        crate::tenant::timeline::WaitLsnTimeout::Default,
                        ctx,
                    )
                    .await
                {
                    // Failure here might mean shutdown, in any case this part is an optimization
                    // and we shouldn't hold up the split operation.
                    tracing::warn!(
                        "Failed to wait for timeline {} to reach lsn {target_lsn}: {e}",
                        timeline.timeline_id
                    );
                } else {
                    tracing::info!(
                        "Child shard {}/{} reached target lsn {}",
                        child_shard_id,
                        timeline.timeline_id,
                        target_lsn
                    );
                }
            }
        }
    }

    // Phase 5: Shut down the parent shard. We leave it on disk in case the split fails and we
    // have to roll back to the parent shard, avoiding a cold start. It will be cleaned up once
    // the storage controller commits the split, or if all else fails, on the next restart.
    //
    // TODO: We don't flush the ephemeral layer here, because the split is likely to succeed and
    // catching up the parent should be reasonably quick. Consider using FreezeAndFlush instead.
    let (_guard, progress) = completion::channel();
    match parent.shutdown(progress, ShutdownMode::Hard).await {
        Ok(()) => {}
        Err(other) => {
            // A shutdown is already in flight elsewhere: wait for it to finish.
            other.wait().await;
        }
    }

    fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
        "failpoint"
    )));

    parent_slot_guard.drop_old_value()?;

    // Phase 6: Release the InProgress on the parent shard
    drop(parent_slot_guard);

    utils::pausable_failpoint!("shard-split-post-finish-pause");

    Ok(child_shards)
}

/// Part of [`Self::shard_split`]: hard link parent shard layers into child shards, as an optimization
/// to avoid the children downloading them again.
///
/// For each resident layer in the parent shard, we will hard link it into all of the child shards.
// NOTE(review): the element type of `Vec` and the turbofish of `collect` below are missing in
// this copy of the file; restore them from the upstream source.
async fn shard_split_hardlink(
    &self,
    parent_shard: &TenantShard,
    child_shards: Vec,
) -> anyhow::Result<()> {
    debug_assert_current_span_has_tenant_id();

    let parent_path = self.conf.tenant_path(parent_shard.get_tenant_shard_id());
    let (parent_timelines, parent_layers) = {
        let mut parent_layers = Vec::new();
        let timelines = parent_shard.timelines.lock().unwrap().clone();
        let parent_timelines = timelines.keys().cloned().collect::>();
        for timeline in timelines.values() {
            tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink");
            let layers = timeline
                .layers
                .read(LayerManagerLockHolder::GetLayerMapInfo)
                .await;

            for layer in layers.likely_resident_layers() {
                // Store paths relative to the parent's tenant directory, so they can be
                // re-rooted under each child's directory later.
                let relative_path = layer
                    .local_path()
                    .strip_prefix(&parent_path)
                    .context("Removing prefix from parent layer path")?;
                parent_layers.push(relative_path.to_owned());
            }
        }

        if parent_layers.is_empty() {
            tracing::info!("Ancestor shard has no resident layer to hard link");
        }

        (parent_timelines, parent_layers)
    };

    let mut child_prefixes = Vec::new();
    let mut create_dirs = Vec::new();

    for child in child_shards {
        let child_prefix = self.conf.tenant_path(&child);
        create_dirs.push(child_prefix.clone());
        create_dirs.extend(
            parent_timelines
                .iter()
                .map(|t|
self.conf.timeline_path(&child, t)),
        );
        child_prefixes.push(child_prefix);
    }

    // Since we will do a large number of small filesystem metadata operations, batch them into
    // spawn_blocking calls rather than doing each one as a tokio::fs round-trip.
    let span = tracing::Span::current();
    // NOTE(review): the success type of `anyhow::Result` in the closure signature is missing in
    // this copy of the file (the closure returns `parent_layers.len()`); restore it from upstream.
    let jh = tokio::task::spawn_blocking(move || -> anyhow::Result {
        // Run this synchronous code in the same log context as the outer function that spawned it.
        let _span = span.enter();

        // Step 1: make sure every child tenant/timeline directory exists.
        tracing::info!("Creating {} directories", create_dirs.len());
        for dir in &create_dirs {
            if let Err(e) = std::fs::create_dir_all(dir) {
                // Ignore AlreadyExists errors, drop out on all other errors
                match e.kind() {
                    std::io::ErrorKind::AlreadyExists => {}
                    _ => {
                        return Err(anyhow::anyhow!(e).context(format!("Creating {dir}")));
                    }
                }
            }
        }

        // Step 2: link every parent layer into every child directory.
        for child_prefix in child_prefixes {
            tracing::info!(
                "Hard-linking {} parent layers into child path {}",
                parent_layers.len(),
                child_prefix
            );
            for relative_layer in &parent_layers {
                let parent_path = parent_path.join(relative_layer);
                let child_path = child_prefix.join(relative_layer);
                if let Err(e) = std::fs::hard_link(&parent_path, &child_path) {
                    match e.kind() {
                        // Link already present: nothing to do.
                        std::io::ErrorKind::AlreadyExists => {}
                        // Source layer disappeared since we listed it: skip it.
                        std::io::ErrorKind::NotFound => {
                            tracing::info!(
                                "Layer {} not found during hard-linking, evicted during split?",
                                relative_layer
                            );
                        }
                        _ => {
                            return Err(anyhow::anyhow!(e).context(format!(
                                "Hard linking {relative_layer} into {child_prefix}"
                            )));
                        }
                    }
                }
            }
        }

        // Durability is not required for correctness, but if we crashed during split and
        // then restarted with empty timeline dirs, it would be very inefficient to
        // re-populate from remote storage.
        tracing::info!("fsyncing {} directories", create_dirs.len());
        for dir in create_dirs {
            if let Err(e) = crashsafe::fsync(&dir) {
                // Something removed a newly created timeline dir out from underneath us?  Extremely
                // unexpected, but not worth panic'ing over as this whole function is just an
                // optimization.
                tracing::warn!("Failed to fsync directory {dir}: {e}")
            }
        }

        Ok(parent_layers.len())
    });

    // Outer `Result` is the join result of the blocking task, inner one is the closure's result.
    match jh.await {
        Ok(Ok(layer_count)) => {
            tracing::info!(count = layer_count, "Hard linked layers into child shards");
        }
        Ok(Err(e)) => {
            // This is an optimization, so we tolerate failure.
            tracing::warn!("Error hard-linking layers, proceeding anyway: {e}")
        }
        Err(e) => {
            // This is something totally unexpected like a panic, so bail out.
            anyhow::bail!("Error joining hard linking task: {e}");
        }
    }

    Ok(())
}

///
/// Shut down all tenants. This runs as part of pageserver shutdown.
///
/// NB: We leave the tenants in the map, so that they remain accessible through
/// the management API until we shut it down. If we removed the shut-down tenants
/// from the tenants map, the management API would return 404 for these tenants,
/// because TenantsMap::get() now returns `None`.
/// That could be easily misinterpreted by control plane, the consumer of the
/// management API. For example, it could attach the tenant on a different pageserver.
/// We would then be in split-brain once this pageserver restarts.
#[instrument(skip_all)]
pub(crate) async fn shutdown(&self) {
    // Fire the manager-wide cancellation token first, then shut the tenants down.
    self.cancel.cancel();

    self.shutdown_all_tenants0().await
}

/// Implementation of [`Self::shutdown`]: replaces the tenants map with its `ShuttingDown`
/// variant, spawns one shutdown task per attached tenant (and one waiter per InProgress slot),
/// then waits for all of those tasks while periodically logging progress.
async fn shutdown_all_tenants0(&self) {
    let mut join_set = JoinSet::new();

    #[cfg(all(debug_assertions, not(test)))]
    {
        // Check that our metrics properly tracked the size of the tenants map.  This is a convenient location to check,
        // as it happens implicitly at the end of tests etc.
        let m = self.tenants.read().unwrap();
        debug_assert_eq!(METRICS.slots_total(), m.len() as u64);
    }

    // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
let (total_in_progress, total_attached) = {
        // The write lock is held for this whole block, which is what makes the map swap and
        // the task creation atomic with respect to other users of the map.
        let mut m = self.tenants.write().unwrap();
        match &mut *m {
            TenantsMap::Initializing => {
                *m = TenantsMap::ShuttingDown(BTreeMap::default());
                info!("tenants map is empty");
                return;
            }
            TenantsMap::Open(tenants) => {
                let mut shutdown_state = BTreeMap::new();
                let mut total_in_progress = 0;
                let mut total_attached = 0;

                // `mem::take` empties the Open map; the slots we want to keep visible are
                // re-inserted into `shutdown_state`.
                for (tenant_shard_id, v) in std::mem::take(tenants).into_iter() {
                    match v {
                        TenantSlot::Attached(t) => {
                            shutdown_state
                                .insert(tenant_shard_id, TenantSlot::Attached(t.clone()));
                            join_set.spawn(
                                async move {
                                    let res = {
                                        let (_guard, shutdown_progress) = completion::channel();
                                        t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await
                                    };

                                    if let Err(other_progress) = res {
                                        // join the other shutdown that is already in progress
                                        other_progress.wait().await;
                                    }

                                    // we cannot afford per tenant logging here, because if s3 is degraded, we are
                                    // going to log too many lines
                                    debug!("tenant successfully stopped");
                                }
                                .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
                            );

                            total_attached += 1;
                        }
                        TenantSlot::Secondary(state) => {
                            // We don't need to wait for this individually per-tenant: the
                            // downloader task will be waited on eventually, this cancel
                            // is just to encourage it to drop out if it is doing work
                            // for this tenant right now.
                            state.cancel.cancel();

                            shutdown_state
                                .insert(tenant_shard_id, TenantSlot::Secondary(state));
                        }
                        TenantSlot::InProgress(notify) => {
                            // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
                            // wait for their notifications to fire in this function.
                            join_set.spawn(async move {
                                notify.wait().await;
                            });

                            total_in_progress += 1;
                        }
                    }
                }
                *m = TenantsMap::ShuttingDown(shutdown_state);
                (total_in_progress, total_attached)
            }
            TenantsMap::ShuttingDown(_) => {
                error!(
                    "already shutting down, this function isn't supposed to be called more than once"
                );
                return;
            }
        }
    };

    let started_at = std::time::Instant::now();

    info!(
        "Waiting for {} InProgress tenants and {} Attached tenants to shut down",
        total_in_progress, total_attached
    );

    let total = join_set.len();
    let mut panicked = 0;
    // `buffering` gates the timer branch of the select below: while it is true, the next
    // progress log line is pending on the `buffered` sleep.
    let mut buffering = true;
    const BUFFER_FOR: std::time::Duration = std::time::Duration::from_millis(500);
    let mut buffered = std::pin::pin!(tokio::time::sleep(BUFFER_FOR));

    while !join_set.is_empty() {
        tokio::select! {
            Some(joined) = join_set.join_next() => {
                match joined {
                    Ok(()) => {},
                    Err(join_error) if join_error.is_cancelled() => {
                        unreachable!("we are not cancelling any of the tasks");
                    }
                    Err(join_error) if join_error.is_panic() => {
                        // cannot really do anything, as this panic is likely a bug
                        panicked += 1;
                    }
                    Err(join_error) => {
                        warn!("unknown kind of JoinError: {join_error}");
                    }
                }
                if !buffering {
                    // buffer so that every 500ms since the first update (or starting) we'll log
                    // how far away we are; this is because we will get SIGKILL'd at 10s, and we
                    // are not able to log *then*.
                    buffering = true;
                    buffered.as_mut().reset(tokio::time::Instant::now() + BUFFER_FOR);
                }
            },
            _ = &mut buffered, if buffering => {
                buffering = false;
                info!(remaining = join_set.len(), total, elapsed_ms = started_at.elapsed().as_millis(), "waiting for tenants to shutdown");
            }
        }
    }

    if panicked > 0 {
        warn!(
            panicked,
            total, "observed panicks while shutting down tenants"
        );
    }

    // caller will log how long we took
}

/// Detaches a tenant, and removes its local files asynchronously.
///
/// File removal is idempotent: even if the tenant has already been removed, this will still
/// remove any local files. This is used during shard splits, where we leave the parent shard's
/// files around in case we have to roll back the split.
pub(crate) async fn detach_tenant(
    &self,
    conf: &'static PageServerConf,
    tenant_shard_id: TenantShardId,
    deletion_queue_client: &DeletionQueueClient,
) -> Result<(), TenantStateError> {
    // `detach_tenant0` returns the renamed directory, if there was one: queue it for
    // background removal.
    if let Some(tmp_path) = self
        .detach_tenant0(conf, tenant_shard_id, deletion_queue_client)
        .await?
    {
        self.background_purges.spawn(tmp_path);
    }

    Ok(())
}

/// Detaches a tenant. This renames the tenant directory to a temporary path and returns it,
/// allowing the caller to delete it asynchronously. Returns None if the dir is already removed.
// NOTE(review): the success type of the returned `Result` is missing in this copy of the
// signature (the body produces an optional temporary path); restore it from upstream.
async fn detach_tenant0(
    &self,
    conf: &'static PageServerConf,
    tenant_shard_id: TenantShardId,
    deletion_queue_client: &DeletionQueueClient,
) -> Result, TenantStateError> {
    // Written as a closure returning a future so that it can be invoked twice: once as the
    // cleanup hook of `remove_tenant_from_memory`, and once more in the not-found fallback.
    let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
        if !tokio::fs::try_exists(&local_tenant_directory).await? {
            // If the tenant directory doesn't exist, it's already cleaned up.
            return Ok(None);
        }
        safe_rename_tenant_dir(&local_tenant_directory)
            .await
            .with_context(|| {
                format!("local tenant directory {local_tenant_directory:?} rename")
            })
            .map(Some)
    };

    let mut removal_result = self
        .remove_tenant_from_memory(
            tenant_shard_id,
            tenant_dir_rename_operation(tenant_shard_id),
        )
        .await;

    // If the tenant was not found, it was likely already removed. Attempt to remove the tenant
    // directory on disk anyway. For example, during shard splits, we shut down and remove the
    // parent shard, but leave its directory on disk in case we have to roll back the split.
    //
    // TODO: it would be better to leave the parent shard attached until the split is committed.
    // This will be needed by the gRPC page service too, such that a compute can continue to
    // read from the parent shard until it's notified about the new child shards. See:
    // .
    // NOTE(review): the link that followed "See:" is missing in this copy of the file.
if let Err(TenantStateError::SlotError(TenantSlotError::NotFound(_))) = removal_result {
        removal_result = tenant_dir_rename_operation(tenant_shard_id)
            .await
            .map_err(TenantStateError::Other);
    }

    // Flush pending deletions, so that they have a good chance of passing validation
    // before this tenant is potentially re-attached elsewhere.
    deletion_queue_client.flush_advisory();

    removal_result
}

/// Lists the attached tenant shards in the map as `(id, current state, generation)` tuples.
///
/// Secondary and InProgress slots are skipped. Fails with
/// [`TenantMapListError::Initializing`] while the map is still initializing; a map that is
/// shutting down is still listed.
// NOTE(review): the success type of the returned `Result` is missing in this copy of the
// signature; restore it from the upstream source.
pub(crate) fn list_tenants(
    &self,
) -> Result, TenantMapListError> {
    let tenants = self.tenants.read().unwrap();
    let m = match &*tenants {
        TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
        TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
    };
    Ok(m.iter()
        .filter_map(|(id, tenant)| match tenant {
            TenantSlot::Attached(tenant) => {
                Some((*id, tenant.current_state(), tenant.generation()))
            }
            TenantSlot::Secondary(_) => None,
            TenantSlot::InProgress(_) => None,
        })
        .collect())
}

/// Completes an earlier prepared timeline detach ancestor.
///
/// Holds the tenant's slot for the duration of the call. If the reparenting step reports that
/// a tenant reset is required, the tenant is shut down and spawned again from its on-disk
/// config before the detach is completed on the new instance.
// NOTE(review): the success type of the returned `Result` is missing in this copy of the
// signature; restore it from the upstream source.
pub(crate) async fn complete_detaching_timeline_ancestor(
    &self,
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    prepared: PreparedTimelineDetach,
    behavior: DetachBehavior,
    mut attempt: detach_ancestor::Attempt,
    ctx: &RequestContext,
) -> Result, detach_ancestor::Error> {
    use detach_ancestor::Error;

    let slot_guard = self
        .tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)
        .map_err(|e| {
            use TenantSlotError::*;

            match e {
                MapState(TenantMapError::ShuttingDown) => Error::ShuttingDown,
                NotFound(_) | InProgress | MapState(_) => Error::DetachReparent(e.into()),
            }
        })?;

    // Only an attached and active tenant can proceed.
    let tenant = {
        let old_slot = slot_guard
            .get_old_value()
            .as_ref()
            .expect("requested MustExist");

        let Some(tenant) = old_slot.get_attached() else {
            return Err(Error::DetachReparent(anyhow::anyhow!(
                "Tenant is not in attached state"
            )));
        };

        if !tenant.is_active() {
            return Err(Error::DetachReparent(anyhow::anyhow!(
                "Tenant is not active"
            )));
        }

        tenant.clone()
    };

    let timeline = tenant
        .get_timeline(timeline_id, true)
        .map_err(Error::NotFound)?;

    let resp = timeline
        .detach_from_ancestor_and_reparent(
            &tenant,
            prepared,
            attempt.ancestor_timeline_id,
            attempt.ancestor_lsn,
            behavior,
            ctx,
        )
        .await?;

    // Rebind as mutable: the guard is only modified from here on.
    let mut slot_guard = slot_guard;

    let tenant = if resp.reset_tenant_required() {
        attempt.before_reset_tenant();

        let (_guard, progress) = utils::completion::channel();
        match tenant.shutdown(progress, ShutdownMode::Reload).await {
            Ok(()) => {
                slot_guard.drop_old_value().expect("it was just shutdown");
            }
            Err(_barrier) => {
                slot_guard.revert();
                // this really should not happen, at all, unless a shutdown without acquiring
                // tenant slot was already going? regardless, on restart the attempt tracking
                // will reset to retryable.
                return Err(Error::ShuttingDown);
            }
        }

        // Re-create the tenant object from its on-disk configuration.
        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
        let config = TenantShard::load_tenant_config(self.conf, &tenant_shard_id)
            .map_err(|e| Error::DetachReparent(e.into()))?;

        let shard_identity = config.shard;
        let tenant = tenant_spawn(
            self.conf,
            tenant_shard_id,
            &tenant_path,
            self.resources.clone(),
            AttachedTenantConf::try_from(self.conf, config).map_err(Error::DetachReparent)?,
            shard_identity,
            None,
            SpawnMode::Eager,
            ctx,
        )
        .map_err(|_| Error::ShuttingDown)?;

        {
            // Carry the ongoing attempt over to the freshly spawned tenant.
            let mut g = tenant.ongoing_timeline_detach.lock().unwrap();
            assert!(
                g.is_none(),
                "there cannot be any new timeline detach ancestor on newly created tenant"
            );
            *g = Some((attempt.timeline_id, attempt.new_barrier()));
        }

        // if we bail out here, we will not allow a new attempt, which should be fine.
        // pageserver should be shutting down regardless? tenant_reset would help, unless it
        // runs into the same problem.
        slot_guard
            .upsert(TenantSlot::Attached(tenant.clone()))
            .map_err(|e| match e {
                TenantSlotUpsertError::ShuttingDown(_) => Error::ShuttingDown,
                other => Error::DetachReparent(other.into()),
            })?;
        tenant
    } else {
        tracing::info!("skipping tenant_reset as no changes made required it");
        tenant
    };

    if let Some(reparented) = resp.completed() {
        // finally ask the restarted tenant to complete the detach
        //
        // rationale for 9999s: we don't really have a timetable here; if retried, the caller
        // will get a 503.
        tenant
            .wait_to_become_active(std::time::Duration::from_secs(9999))
            .await
            .map_err(|e| {
                use GetActiveTenantError::{Cancelled, WillNotBecomeActive};
                use pageserver_api::models::TenantState;
                match e {
                    Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) => {
                        Error::ShuttingDown
                    }
                    other => Error::Complete(other.into()),
                }
            })?;

        utils::pausable_failpoint!(
            "timeline-detach-ancestor::after_activating_before_finding-pausable"
        );

        // Look the timeline up again: after a reset, `tenant` is a new object.
        let timeline = tenant
            .get_timeline(attempt.timeline_id, true)
            .map_err(Error::NotFound)?;

        timeline
            .complete_detaching_timeline_ancestor(&tenant, attempt, ctx)
            .await
            .map(|()| reparented)
    } else {
        // at least the latest versions have now been downloaded and refreshed; be ready to
        // retry another time.
        Err(Error::FailedToReparentAll)
    }
}

/// A page service client sends a TenantId, and to look up the correct Tenant we must
/// resolve this to a fully qualified TenantShardId.
///
/// During shard splits: we shall see parent shards in InProgress state and skip them, and
/// instead match on child shards which should appear in Attached state.  Very early in a shard
/// split, or in other cases where a shard is InProgress, we will return our own InProgress result
/// to instruct the caller to wait for that to finish before querying again.
pub(crate) fn resolve_attached_shard(
    &self,
    tenant_id: &TenantId,
    selector: ShardSelector,
) -> ShardResolveResult {
    let tenants = self.tenants.read().unwrap();
    // Shard index that the requested page key maps to; only used by `ShardSelector::Page`.
    // NOTE(review): the type argument of `Option` is missing in this copy of the file (the
    // value stored is a `ShardIndex`); restore it from the upstream source.
    let mut want_shard: Option = None;
    // Barrier of the last InProgress slot seen for this tenant, if any.
    let mut any_in_progress = None;

    match &*tenants {
        TenantsMap::Initializing => ShardResolveResult::NotFound,
        TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
            // Only visit the slots that belong to this tenant id.
            for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
                // Ignore all slots that don't contain an attached tenant
                let tenant = match &slot.1 {
                    TenantSlot::Attached(t) => t,
                    TenantSlot::InProgress(barrier) => {
                        // We might still find a usable shard, but in case we don't, remember that
                        // we saw at least one InProgress slot, so that we can distinguish this case
                        // from a simple NotFound in our return value.
                        any_in_progress = Some(barrier.clone());
                        continue;
                    }
                    _ => continue,
                };

                match selector {
                    ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
                        return ShardResolveResult::Found(tenant.clone());
                    }
                    ShardSelector::Page(key) => {
                        // Each time we find an attached slot with a different shard count,
                        // recompute the expected shard number: during shard splits we might
                        // have multiple shards with the old shard count.
                        if want_shard.is_none()
                            || want_shard.unwrap().shard_count != tenant.shard_identity.count
                        {
                            want_shard = Some(ShardIndex {
                                shard_number: tenant.shard_identity.get_shard_number(&key),
                                shard_count: tenant.shard_identity.count,
                            });
                        }

                        // Match only if this slot is exactly the shard the key maps to.
                        if Some(ShardIndex {
                            shard_number: tenant.shard_identity.number,
                            shard_count: tenant.shard_identity.count,
                        }) == want_shard
                        {
                            return ShardResolveResult::Found(tenant.clone());
                        }
                    }
                    ShardSelector::Known(shard)
                        if tenant.shard_identity.shard_index() == shard =>
                    {
                        return ShardResolveResult::Found(tenant.clone());
                    }
                    _ => continue,
                }
            }

            // Fall through: we didn't find a slot that was in Attached state & matched our selector.  If
            // we found one or more InProgress slot, indicate to caller that they should retry later.  Otherwise
            // this requested shard simply isn't found.
            if let Some(barrier) = any_in_progress {
                ShardResolveResult::InProgress(barrier)
            } else {
                ShardResolveResult::NotFound
            }
        }
    }
}

/// Calculate the tenant shards' contributions to this pageserver's utilization metrics.  The
/// returned values are:
///  - the number of bytes of local disk space this pageserver's shards are requesting, i.e.
///    how much space they would use if not impacted by disk usage eviction.
///  - the number of tenant shards currently on this pageserver, including attached
///    and secondary.
///
/// This function is quite expensive: callers are expected to cache the result and
/// limit how often they call it.
pub(crate) fn calculate_utilization(&self) -> Result<(u64, u32), TenantMapListError> {
    let tenants = self.tenants.read().unwrap();
    let m = match &*tenants {
        TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
        TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
    };
    // This is the number of slots in the map, so InProgress slots are counted as well.
    let shard_count = m.len();
    let mut wanted_bytes = 0;

    for tenant_slot in m.values() {
        match tenant_slot {
            TenantSlot::InProgress(_barrier) => {
                // While a slot is being changed, we can't know how much storage it wants.  This
                // means this function's output can fluctuate if a lot of changes are going on
                // (such as transitions from secondary to attached).
                //
                // We could wait for the barrier and retry, but it's important that the utilization
                // API is responsive, and the data quality impact is not very significant.
                continue;
            }
            TenantSlot::Attached(tenant) => {
                wanted_bytes += tenant.local_storage_wanted();
            }
            TenantSlot::Secondary(secondary) => {
                let progress = secondary.progress.lock().unwrap();
                wanted_bytes += if progress.heatmap_mtime.is_some() {
                    // If we have heatmap info, then we will 'want' the sum
                    // of the size of layers in the heatmap: this is how much space
                    // we would use if not doing any eviction.
                    progress.bytes_total
                } else {
                    // In the absence of heatmap info, assume that the secondary location simply
                    // needs as much space as it is currently using.
secondary.resident_size_metric.get()
                }
            }
        }
    }

    Ok((wanted_bytes, shard_count as u32))
}

/// Runs one GC iteration for `timeline_id` of the given tenant shard and returns its result.
///
/// The GC horizon comes from the request if set, otherwise from the tenant; the PITR interval
/// always comes from the tenant. With the `testing` feature enabled, the call additionally
/// waits for the doomed layers to be dropped and for the timeline's remote client to finish
/// its scheduled work before returning.
// NOTE(review): the type arguments of the returned `Result` are missing in this copy of the
// signature (errors are mapped to `ApiError` below); restore them from the upstream source.
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))]
pub(crate) async fn immediate_gc(
    &self,
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    gc_req: TimelineGcRequest,
    cancel: CancellationToken,
    ctx: &RequestContext,
) -> Result {
    let tenant = {
        // Clone the tenant handle out so the read lock is not held across awaits.
        let guard = self.tenants.read().unwrap();
        guard
            .get(&tenant_shard_id)
            .cloned()
            .with_context(|| format!("tenant {tenant_shard_id}"))
            .map_err(|e| ApiError::NotFound(e.into()))?
    };

    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
    // Use tenant's pitr setting
    let pitr = tenant.get_pitr_interval();

    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

    // Run in task_mgr to avoid race with tenant_detach operation
    // NOTE(review): no task_mgr task is spawned in this function; the comment above may be
    // stale. What is visible here is that the tenant gate guard is held for the GC run.
    let ctx: RequestContext =
        ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);

    let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?;

    fail::fail_point!("immediate_gc_task_pre");

    #[allow(unused_mut)]
    let mut result = tenant
        .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
        .await;
    // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
    // better once the types support it.

    #[cfg(feature = "testing")]
    {
        // we need to synchronize with drop completion for python tests without polling for
        // log messages
        if let Ok(result) = result.as_mut() {
            let mut js = tokio::task::JoinSet::new();
            // The doomed layers are taken out of the result, so the caller receives it
            // without them.
            for layer in std::mem::take(&mut result.doomed_layers) {
                js.spawn(layer.wait_drop());
            }
            tracing::info!(
                total = js.len(),
                "starting to wait for the gc'd layers to be dropped"
            );
            while let Some(res) = js.join_next().await {
                res.expect("wait_drop should not panic");
            }
        }

        let timeline = tenant.get_timeline(timeline_id, false).ok();
        let rtc = timeline.as_ref().map(|x| &x.remote_client);

        if let Some(rtc) = rtc {
            // layer drops schedule actions on remote timeline client to actually do the
            // deletions; don't care about the shutdown error, just exit fast
            drop(rtc.wait_completion().await);
        }
    }

    result.map_err(|e| match e {
        GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
        GcError::TimelineNotFound => {
            ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
        }
        other => ApiError::InternalServerError(anyhow::anyhow!(other)),
    })
}

/// Stops and removes the tenant from memory, if it's not [`TenantState::Stopping`] already, bails otherwise.
/// Allows to remove other tenant resources manually, via `tenant_cleanup`.
/// If the cleanup fails, an attached tenant is set to [`TenantState::Broken`], the slot is
/// reverted so that the tenant stays in memory, and the cleanup error is returned.
// NOTE(review): the generic parameter list of this function, the type arguments of the returned
// `Result` and the `Output` of the `Future` bound are missing in this copy of the signature;
// restore them from the upstream source.
async fn remove_tenant_from_memory(
    &self,
    tenant_shard_id: TenantShardId,
    tenant_cleanup: F,
) -> Result
where
    F: std::future::Future>,
{
    let mut slot_guard =
        self.tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;

    // allow pageserver shutdown to await for our completion
    let (_guard, progress) = completion::channel();

    // The SlotGuard allows us to manipulate the Tenant object without fear of some
    // concurrent API request doing something else for the same tenant ID.
let attached_tenant = match slot_guard.get_old_value() { Some(TenantSlot::Attached(tenant)) => { // whenever we remove a tenant from memory, we don't want to flush and wait for upload let shutdown_mode = ShutdownMode::Hard; // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so // that we can continue safely to cleanup. match tenant.shutdown(progress, shutdown_mode).await { Ok(()) => {} Err(_other) => { // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to // wait for it but return an error right away because these are distinct requests. slot_guard.revert(); return Err(TenantStateError::IsStopping(tenant_shard_id)); } } Some(tenant) } Some(TenantSlot::Secondary(secondary_state)) => { tracing::info!("Shutting down in secondary mode"); secondary_state.shutdown().await; None } Some(TenantSlot::InProgress(_)) => { // Acquiring a slot guarantees its old value was not InProgress unreachable!(); } None => None, }; match tenant_cleanup .await .with_context(|| format!("Failed to run cleanup for tenant {tenant_shard_id}")) { Ok(hook_value) => { // Success: drop the old TenantSlot::Attached. slot_guard .drop_old_value() .expect("We just called shutdown"); Ok(hook_value) } Err(e) => { // If we had a Tenant, set it to Broken and put it back in the TenantsMap if let Some(attached_tenant) = attached_tenant { attached_tenant.set_broken(e.to_string()).await; } // Leave the broken tenant in the map slot_guard.revert(); Err(TenantStateError::Other(e)) } } } } #[derive(Debug, thiserror::Error)] pub(crate) enum GetTenantError { /// NotFound is a TenantId rather than TenantShardId, because this error type is used from /// getters that use a TenantId and a ShardSelector, not just getters that target a specific shard. 
#[error("Tenant {0} not found")] NotFound(TenantId), #[error("Tenant {0} not found")] ShardNotFound(TenantShardId), #[error("Tenant {0} is not active")] NotActive(TenantShardId), // Initializing or shutting down: cannot authoritatively say whether we have this tenant #[error("Tenant map is not available: {0}")] MapState(#[from] TenantMapError), } #[derive(thiserror::Error, Debug)] pub(crate) enum GetActiveTenantError { /// We may time out either while TenantSlot is InProgress, or while the Tenant /// is in a non-Active state #[error( "Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}" )] WaitForActiveTimeout { latest_state: Option, wait_time: Duration, }, /// The TenantSlot is absent, or in secondary mode #[error(transparent)] NotFound(#[from] GetTenantError), /// Cancellation token fired while we were waiting #[error("cancelled")] Cancelled, /// Tenant exists, but is in a state that cannot become active (e.g. Stopping, Broken) #[error("will not become active. Current state: {0}")] WillNotBecomeActive(TenantState), /// Broken is logically a subset of WillNotBecomeActive, but a distinct error is useful as /// WillNotBecomeActive is a permitted error under some circumstances, whereas broken should /// never happen. 
#[error("Tenant is broken: {0}")] Broken(String), #[error("reconnect to switch tenant id")] SwitchedTenant, } #[derive(Debug, thiserror::Error)] pub(crate) enum DeleteTimelineError { #[error("Tenant {0}")] Tenant(#[from] GetTenantError), #[error("Timeline {0}")] Timeline(#[from] crate::tenant::DeleteTimelineError), } #[derive(Debug, thiserror::Error)] pub(crate) enum TenantStateError { #[error("Tenant {0} is stopping")] IsStopping(TenantShardId), #[error(transparent)] SlotError(#[from] TenantSlotError), #[error(transparent)] SlotUpsertError(#[from] TenantSlotUpsertError), #[error(transparent)] Other(#[from] anyhow::Error), } #[derive(Debug, thiserror::Error)] pub(crate) enum TenantMapListError { #[error("tenant map is still initiailizing")] Initializing, } #[derive(Debug, thiserror::Error)] pub(crate) enum TenantMapInsertError { #[error(transparent)] SlotError(#[from] TenantSlotError), #[error(transparent)] SlotUpsertError(#[from] TenantSlotUpsertError), #[error(transparent)] Other(#[from] anyhow::Error), } /// Superset of TenantMapError: issues that can occur when acquiring a slot /// for a particular tenant ID. #[derive(Debug, thiserror::Error)] pub(crate) enum TenantSlotError { /// When acquiring a slot with the expectation that the tenant already exists. #[error("Tenant {0} not found")] NotFound(TenantShardId), // Tried to read a slot that is currently being mutated by another administrative // operation. #[error("tenant has a state change in progress, try again later")] InProgress, #[error(transparent)] MapState(#[from] TenantMapError), } /// Superset of TenantMapError: issues that can occur when using a SlotGuard /// to insert a new value. 
#[derive(thiserror::Error)]
pub(crate) enum TenantSlotUpsertError {
    /// An error where the slot is in an unexpected state, indicating a code bug
    #[error("Internal error updating Tenant")]
    InternalError(Cow<'static, str>),

    #[error(transparent)]
    MapState(TenantMapError),

    // If we encounter TenantManager shutdown during upsert, we must carry the Completion
    // from the SlotGuard, so that the caller can hold it while they clean up: otherwise
    // TenantManager shutdown might race ahead before we're done cleaning up any Tenant that
    // was protected by the SlotGuard.
    #[error("Shutting down")]
    ShuttingDown((TenantSlot, utils::completion::Completion)),
}

/// Hand-written `Debug`: the payload carried by `ShuttingDown` is deliberately not printed,
/// and `InternalError` prints its reason string.
impl std::fmt::Debug for TenantSlotUpsertError {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            Self::InternalError(reason) => write!(f, "Internal Error {reason}"),
            Self::MapState(map_error) => write!(f, "Tenant map state: {map_error:?}"),
            Self::ShuttingDown(_completion) => write!(f, "Tenant map shutting down"),
        }
    }
}

/// Error returned by `SlotGuard::drop_old_value`.
#[derive(Debug, thiserror::Error)]
enum TenantSlotDropError {
    /// It is only legal to drop a TenantSlot if its contents are fully shut down
    #[error("Tenant was not shut down")]
    NotShutdown,
}

/// Errors that can happen any time we are walking the tenant map to try and acquire
/// the TenantSlot for a particular tenant.
#[derive(Debug, thiserror::Error)]
pub(crate) enum TenantMapError {
    // Tried to read while initializing
    #[error("tenant map is still initializing")]
    StillInitializing,

    // Tried to read while shutting down
    #[error("tenant map is shutting down")]
    ShuttingDown,
}

/// Guards a particular tenant_id's content in the TenantsMap.
///
/// While this structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
/// for this tenant, which acts as a marker for any operations targeting
/// this tenant to retry later, or wait for the InProgress state to end.
///
/// This structure enforces the important invariant that we do not have overlapping
/// tasks that will try to use local storage for the same tenant ID: we enforce that
/// the previous contents of a slot have been shut down before the slot can be
/// left empty or used for something else
///
/// Holders of a SlotGuard should explicitly dispose of it, using either `upsert`
/// to provide a new value, or `revert` to put the slot back into its initial
/// state.  If the SlotGuard is dropped without calling either of these, then
/// we will leave the slot empty if our `old_value` is already shut down, else
/// we will replace the slot with `old_value` (equivalent to doing a revert).
///
/// The `old_value` may be dropped before the SlotGuard is dropped, by calling
/// `drop_old_value`.  It is an error to call this without shutting down
/// the contents of `old_value`.
pub(crate) struct SlotGuard<'a> {
    tenant_shard_id: TenantShardId,
    /// What the slot held before this guard took it over; `None` if it was empty or the
    /// value has since been taken out of the guard.
    old_value: Option<TenantSlot>,
    /// Set once `upsert` has written a new value into the map; `Drop` then does nothing.
    upserted: bool,

    /// [`TenantSlot::InProgress`] carries the corresponding Barrier: it will
    /// release any waiters as soon as this SlotGuard is dropped.
    completion: utils::completion::Completion,

    tenants: &'a std::sync::RwLock<TenantsMap>,
}

impl<'a> SlotGuard<'a> {
    /// Build a guard for `tenant_shard_id`. The guard starts out not upserted.
    fn new(
        tenant_shard_id: TenantShardId,
        old_value: Option<TenantSlot>,
        completion: utils::completion::Completion,
        tenants: &'a std::sync::RwLock<TenantsMap>,
    ) -> Self {
        Self {
            tenant_shard_id,
            old_value,
            upserted: false,
            completion,
            tenants,
        }
    }

    /// Get any value that was present in the slot before we acquired ownership
    /// of it: in state transitions, this will be the old state.
    ///
    // FIXME: get_ prefix
    // FIXME: this should be .as_ref() -- unsure why no clippy
    fn get_old_value(&self) -> &Option<TenantSlot> {
        &self.old_value
    }

    /// Emplace a new value in the slot.  This consumes the guard, and after
    /// returning, the slot is no longer protected from concurrent changes.
fn upsert(mut self, new_value: TenantSlot) -> Result<(), TenantSlotUpsertError> { if !self.old_value_is_shutdown() { // This is a bug: callers should never try to drop an old value without // shutting it down return Err(TenantSlotUpsertError::InternalError( "Old TenantSlot value not shut down".into(), )); } let replaced: Option = { let mut locked = self.tenants.write().unwrap(); if let TenantSlot::InProgress(_) = new_value { // It is never expected to try and upsert InProgress via this path: it should // only be written via the tenant_map_acquire_slot path. If we hit this it's a bug. return Err(TenantSlotUpsertError::InternalError( "Attempt to upsert an InProgress state".into(), )); } let m = match &mut *locked { TenantsMap::Initializing => { return Err(TenantSlotUpsertError::MapState( TenantMapError::StillInitializing, )); } TenantsMap::ShuttingDown(_) => { return Err(TenantSlotUpsertError::ShuttingDown(( new_value, self.completion.clone(), ))); } TenantsMap::Open(m) => m, }; METRICS.slot_inserted(&new_value); let replaced = m.insert(self.tenant_shard_id, new_value); self.upserted = true; if let Some(replaced) = replaced.as_ref() { METRICS.slot_removed(replaced); } replaced }; // Sanity check: on an upsert we should always be replacing an InProgress marker match replaced { Some(TenantSlot::InProgress(_)) => { // Expected case: we find our InProgress in the map: nothing should have // replaced it because the code that acquires slots will not grant another // one for the same TenantId. Ok(()) } None => { METRICS.unexpected_errors.inc(); error!( tenant_shard_id = %self.tenant_shard_id, "Missing InProgress marker during tenant upsert, this is a bug." ); Err(TenantSlotUpsertError::InternalError( "Missing InProgress marker during tenant upsert".into(), )) } Some(slot) => { METRICS.unexpected_errors.inc(); error!(tenant_shard_id=%self.tenant_shard_id, "Unexpected contents of TenantSlot during upsert, this is a bug. 
Contents: {:?}", slot); Err(TenantSlotUpsertError::InternalError( "Unexpected contents of TenantSlot".into(), )) } } } /// Replace the InProgress slot with whatever was in the guard when we started fn revert(mut self) { if let Some(value) = self.old_value.take() { match self.upsert(value) { Err(TenantSlotUpsertError::InternalError(_)) => { // We already logged the error, nothing else we can do. } Err( TenantSlotUpsertError::MapState(_) | TenantSlotUpsertError::ShuttingDown(_), ) => { // If the map is shutting down, we need not replace anything } Ok(()) => {} } } } /// We may never drop our old value until it is cleanly shut down: otherwise we might leave /// rogue background tasks that would write to the local tenant directory that this guard /// is responsible for protecting fn old_value_is_shutdown(&self) -> bool { match self.old_value.as_ref() { Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(), Some(TenantSlot::Secondary(secondary_tenant)) => secondary_tenant.gate.close_complete(), Some(TenantSlot::InProgress(_)) => { // A SlotGuard cannot be constructed for a slot that was already InProgress unreachable!() } None => true, } } /// The guard holder is done with the old value of the slot: they are obliged to already /// shut it down before we reach this point. fn drop_old_value(&mut self) -> Result<(), TenantSlotDropError> { if !self.old_value_is_shutdown() { Err(TenantSlotDropError::NotShutdown) } else { self.old_value.take(); Ok(()) } } } impl<'a> Drop for SlotGuard<'a> { fn drop(&mut self) { if self.upserted { return; } // Our old value is already shutdown, or it never existed: it is safe // for us to fully release the TenantSlot back into an empty state let mut locked = self.tenants.write().unwrap(); let m = match &mut *locked { TenantsMap::Initializing => { // There is no map, this should never happen. 
return; } TenantsMap::ShuttingDown(_) => { // When we transition to shutdown, InProgress elements are removed // from the map, so we do not need to clean up our Inprogress marker. // See [`shutdown_all_tenants0`] return; } TenantsMap::Open(m) => m, }; use std::collections::btree_map::Entry; match m.entry(self.tenant_shard_id) { Entry::Occupied(mut entry) => { if !matches!(entry.get(), TenantSlot::InProgress(_)) { METRICS.unexpected_errors.inc(); error!(tenant_shard_id=%self.tenant_shard_id, "Unexpected contents of TenantSlot during drop, this is a bug. Contents: {:?}", entry.get()); } if self.old_value_is_shutdown() { METRICS.slot_removed(entry.get()); entry.remove(); } else { let inserting = self.old_value.take().unwrap(); METRICS.slot_inserted(&inserting); let replaced = entry.insert(inserting); METRICS.slot_removed(&replaced); } } Entry::Vacant(_) => { METRICS.unexpected_errors.inc(); error!( tenant_shard_id = %self.tenant_shard_id, "Missing InProgress marker during SlotGuard drop, this is a bug." ); } } } } enum TenantSlotPeekMode { /// In Read mode, peek will be permitted to see the slots even if the pageserver is shutting down Read, /// In Write mode, trying to peek at a slot while the pageserver is shutting down is an error Write, } fn tenant_map_peek_slot<'a>( tenants: &'a std::sync::RwLockReadGuard<'a, TenantsMap>, tenant_shard_id: &TenantShardId, mode: TenantSlotPeekMode, ) -> Result, TenantMapError> { match tenants.deref() { TenantsMap::Initializing => Err(TenantMapError::StillInitializing), TenantsMap::ShuttingDown(m) => match mode { TenantSlotPeekMode::Read => Ok(Some( // When reading in ShuttingDown state, we must translate None results // into a ShuttingDown error, because absence of a tenant shard ID in the map // isn't a reliable indicator of the tenant being gone: it might have been // InProgress when shutdown started, and cleaned up from that state such // that it's now no longer in the map. 
Callers will have to wait until // we next start up to get a proper answer. This avoids incorrect 404 API responses. m.get(tenant_shard_id).ok_or(TenantMapError::ShuttingDown)?, )), TenantSlotPeekMode::Write => Err(TenantMapError::ShuttingDown), }, TenantsMap::Open(m) => Ok(m.get(tenant_shard_id)), } } enum TenantSlotAcquireMode { /// Acquire the slot irrespective of current state, or whether it already exists Any, /// Return an error if trying to acquire a slot and it doesn't already exist MustExist, } use http_utils::error::ApiError; use pageserver_api::models::TimelineGcRequest; use crate::tenant::gc_result::GcResult; #[cfg(test)] mod tests { use std::collections::BTreeMap; use std::sync::Arc; use camino::Utf8PathBuf; use storage_broker::BrokerClientChannel; use tracing::Instrument; use super::super::harness::TenantHarness; use super::TenantsMap; use crate::{ basebackup_cache::BasebackupCache, tenant::{ TenantSharedResources, mgr::{BackgroundPurges, TenantManager, TenantSlot}, }, }; #[tokio::test(start_paused = true)] async fn shutdown_awaits_in_progress_tenant() { // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully // wait for it to complete before proceeding. 
let h = TenantHarness::create("shutdown_awaits_in_progress_tenant")
            .await
            .unwrap();
        let (t, _ctx) = h.load().await;

        // harness loads it to active, which is forced and nothing is running on the tenant

        let id = t.tenant_shard_id();

        // tenant harness configures the logging and we cannot escape it
        let span = h.span();
        let _e = span.enter();

        // The map under test contains exactly one attached tenant.
        let tenants = BTreeMap::from([(id, TenantSlot::Attached(t.clone()))]);

        // Invoke remove_tenant_from_memory with a cleanup hook that blocks until we manually
        // permit it to proceed: that will stick the tenant in InProgress

        let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None);

        // Build the manager by hand around the prepared map and the harness' resources.
        let tenant_manager = TenantManager {
            tenants: std::sync::RwLock::new(TenantsMap::Open(tenants)),
            conf: h.conf,
            resources: TenantSharedResources {
                broker_client: BrokerClientChannel::connect_lazy("foobar.com")
                    .await
                    .unwrap(),
                remote_storage: h.remote_storage.clone(),
                deletion_queue_client: h.deletion_queue.new_client(),
                l0_flush_global_state: crate::l0_flush::L0FlushGlobalState::new(
                    h.conf.l0_flush.clone(),
                ),
                basebackup_cache,
                feature_resolver: crate::feature_resolver::FeatureResolver::new_disabled(),
            },
            cancel: tokio_util::sync::CancellationToken::new(),
            background_purges: BackgroundPurges::default(),
        };
        let tenant_manager = Arc::new(tenant_manager);

        // Two completion channels, used as one-shot signals between the test and the cleanup
        // hook: dropping the first half is what the test relies on to wake `wait()` on the
        // second half.
        let (until_cleanup_completed, can_complete_cleanup) = utils::completion::channel();
        let (until_cleanup_started, cleanup_started) = utils::completion::channel();
        let mut remove_tenant_from_memory_task = {
            let tenant_manager = tenant_manager.clone();
            let jh = tokio::spawn({
                async move {
                    let cleanup = async move {
                        // Signal that the cleanup hook is running, then block until released.
                        drop(until_cleanup_started);
                        can_complete_cleanup.wait().await;
                        anyhow::Ok(())
                    };
                    tenant_manager.remove_tenant_from_memory(id, cleanup).await
                }
                .instrument(h.span())
            });

            // now the long cleanup should be in place, with the stopping state
            cleanup_started.wait().await;
            jh
        };

        let mut shutdown_task = {
            let (until_shutdown_started, shutdown_started) = utils::completion::channel();

            let tenant_manager = tenant_manager.clone();

            let shutdown_task = tokio::spawn(async move {
                drop(until_shutdown_started);
                tenant_manager.shutdown_all_tenants0().await;
            });

            // Only continue once the shutdown task has actually started running.
            shutdown_started.wait().await;
            shutdown_task
        };

        // Neither task may finish while the cleanup hook is still blocked; the test runs with
        // `start_paused = true`, so this sleep is on tokio's paused test clock.
        let long_time = std::time::Duration::from_secs(15);
        tokio::select! {
            _ = &mut shutdown_task => unreachable!("shutdown should block on remove_tenant_from_memory completing"),
            _ = &mut remove_tenant_from_memory_task => unreachable!("remove_tenant_from_memory_task should not complete until explicitly unblocked"),
            _ = tokio::time::sleep(long_time) => {},
        }

        // Release the cleanup hook.
        drop(until_cleanup_completed);

        // Now that we allow it to proceed, shutdown should complete immediately
        remove_tenant_from_memory_task.await.unwrap().unwrap();
        shutdown_task.await.unwrap();
    }
}

================================================
FILE: pageserver/src/tenant/remote_timeline_client/download.rs
================================================
//! Helper functions to download files from remote storage with a RemoteStorage
//!
//! The functions in this module retry failed operations automatically, according
//! to the FAILED_DOWNLOAD_RETRIES constant.
use std::collections::HashSet; use std::future::Future; use std::str::FromStr; use std::sync::atomic::AtomicU64; use std::time::SystemTime; use anyhow::{Context, anyhow}; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::TenantShardId; use remote_storage::{ DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, }; use tokio::fs::{self, File, OpenOptions}; use tokio::io::AsyncSeekExt; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::warn; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; use utils::{backoff, pausable_failpoint}; use super::index::{IndexPart, LayerFileMetadata}; use super::manifest::TenantManifest; use super::{ FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, parse_remote_index_path, parse_remote_tenant_manifest_path, remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path, remote_tenant_manifest_path, remote_tenant_manifest_prefix, remote_tenant_path, }; use crate::TEMP_FILE_SUFFIX; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_id, }; use crate::tenant::Generation; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; use crate::virtual_file; use crate::virtual_file::owned_buffers_io::write::FlushTaskError; use crate::virtual_file::{IoBufferMut, MaybeFatalIo, VirtualFile}; use crate::virtual_file::{TempVirtualFile, owned_buffers_io}; /// /// If 'metadata' is given, we will validate that the downloaded file's size matches that /// in the metadata. (In the future, we might do more cross-checks, like CRC validation) /// /// Returns the size of the downloaded file. 
#[allow(clippy::too_many_arguments)] pub async fn download_layer_file<'a>( conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, tenant_shard_id: TenantShardId, timeline_id: TimelineId, layer_file_name: &'a LayerName, layer_metadata: &'a LayerFileMetadata, local_path: &Utf8Path, gate: &utils::sync::gate::Gate, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id); let remote_path = remote_layer_path( &tenant_shard_id.tenant_id, &timeline_id, layer_metadata.shard, layer_file_name, layer_metadata.generation, ); let (bytes_amount, temp_file) = download_retry( || async { // TempVirtualFile requires us to never reuse a filename while an old // instance of TempVirtualFile created with that filename is not done dropping yet. // So, we use a monotonic counter to disambiguate the filenames. static NEXT_TEMP_DISAMBIGUATOR: AtomicU64 = AtomicU64::new(1); let filename_disambiguator = NEXT_TEMP_DISAMBIGUATOR.fetch_add(1, std::sync::atomic::Ordering::Relaxed); let temp_file_path = path_with_suffix_extension( local_path, &format!("{filename_disambiguator:x}.{TEMP_DOWNLOAD_EXTENSION}"), ); let temp_file = TempVirtualFile::new( VirtualFile::open_with_options_v2( &temp_file_path, virtual_file::OpenOptions::new() .create_new(true) .write(true), ctx, ) .await .with_context(|| format!("create a temp file for layer download: {temp_file_path}")) .map_err(DownloadError::Other)?, gate.enter().map_err(|_| DownloadError::Cancelled)?, ); download_object(storage, &remote_path, temp_file, gate, cancel, ctx).await }, &format!("download {remote_path:?}"), cancel, ) .await?; let expected = layer_metadata.file_size; if expected != bytes_amount { return Err(DownloadError::Other(anyhow!( "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {:?}", temp_file.path() ))); } 
fail::fail_point!("remote-storage-download-pre-rename", |_| { Err(DownloadError::Other(anyhow!( "remote-storage-download-pre-rename failpoint triggered" ))) }); // Try rename before disarming the temp file. // That way, if rename fails for whatever reason, we clean up the temp file on the return path. fs::rename(temp_file.path(), &local_path) .await .with_context(|| format!("rename download layer file to {local_path}")) .map_err(DownloadError::Other)?; // The temp file's VirtualFile points to the temp_file_path which we moved above. // Drop it immediately, it's invalid. // This will get better in https://github.com/neondatabase/neon/issues/11692 let _: VirtualFile = temp_file.disarm_into_inner(); // NB: The gate guard that was stored in `temp_file` is dropped but we continue // to operate on it and on the parent timeline directory. // Those operations are safe to do because higher-level code is holding another gate guard: // - attached mode: the download task spawned by struct Layer is holding the gate guard // - secondary mode: The TenantDownloader::download holds the gate open // The rename above is not durable yet. // It doesn't matter for crash consistency because pageserver startup deletes temp // files and we'll re-download on demand if necessary. // We use fatal_err() below because the after the rename above, // the in-memory state of the filesystem already has the layer file in its final place, // and subsequent pageserver code could think it's durable while it really isn't. 
let work = { let ctx = ctx.detached_child(ctx.task_kind(), ctx.download_behavior()); async move { let timeline_dir = VirtualFile::open(&timeline_path, &ctx) .await .fatal_err("VirtualFile::open for timeline dir fsync"); timeline_dir .sync_all() .await .fatal_err("VirtualFile::sync_all timeline dir"); } }; crate::virtual_file::io_engine::get() .spawn_blocking_and_block_on_if_std(work) .await; tracing::debug!("download complete: {local_path}"); Ok(bytes_amount) } /// Download the object `src_path` in the remote `storage` to local path `dst_path`. /// /// If Ok() is returned, the download succeeded and the inode & data have been made durable. /// (Note that the directory entry for the inode is not made durable.) /// The file size in bytes is returned. /// /// If Err() is returned, there was some error. The file at `dst_path` has been unlinked. /// The unlinking has _not_ been made durable. async fn download_object( storage: &GenericRemoteStorage, src_path: &RemotePath, destination_file: TempVirtualFile, gate: &utils::sync::gate::Gate, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result<(u64, TempVirtualFile), DownloadError> { let mut download = storage .download(src_path, &DownloadOpts::default(), cancel) .await?; pausable_failpoint!("before-downloading-layer-stream-pausable"); let dst_path = destination_file.path().to_owned(); let mut buffered = owned_buffers_io::write::BufferedWriter::::new( destination_file, 0, || IoBufferMut::with_capacity(super::BUFFER_SIZE), gate.enter().map_err(|_| DownloadError::Cancelled)?, cancel.child_token(), ctx, tracing::info_span!(parent: None, "download_object_buffered_writer", %dst_path), ); // TODO: use vectored write (writev) once supported by tokio-epoll-uring. // There's chunks_vectored() on the stream. 
let (bytes_amount, destination_file) = async { while let Some(res) = futures::StreamExt::next(&mut download.download_stream).await { let chunk = match res { Ok(chunk) => chunk, Err(e) => return Err(DownloadError::from(e)), }; buffered .write_buffered_borrowed(&chunk, ctx) .await .map_err(|e| match e { FlushTaskError::Cancelled => DownloadError::Cancelled, })?; } buffered .shutdown( owned_buffers_io::write::BufferedWriterShutdownMode::PadThenTruncate, ctx, ) .await .map_err(|e| match e { FlushTaskError::Cancelled => DownloadError::Cancelled, }) } .await?; // not using sync_data because it can lose file size update destination_file .sync_all() .await .maybe_fatal_err("download_object sync_all") .with_context(|| format!("failed to fsync source file at {dst_path}")) .map_err(DownloadError::Other)?; Ok((bytes_amount, destination_file)) } const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool { let extension = path.extension(); match extension { Some(TEMP_DOWNLOAD_EXTENSION) => true, Some(_) => false, None => false, } } async fn list_identifiers( storage: &GenericRemoteStorage, prefix: RemotePath, cancel: CancellationToken, ) -> anyhow::Result<(HashSet, HashSet)> where T: FromStr + Eq + std::hash::Hash, { let listing = download_retry_forever( || storage.list(Some(&prefix), ListingMode::WithDelimiter, None, &cancel), &format!("list identifiers in prefix {prefix}"), &cancel, ) .await?; let mut parsed_ids = HashSet::new(); let mut other_prefixes = HashSet::new(); for id_remote_storage_key in listing.prefixes { let object_name = id_remote_storage_key.object_name().ok_or_else(|| { anyhow::anyhow!("failed to get object name for key {id_remote_storage_key}") })?; match object_name.parse::() { Ok(t) => parsed_ids.insert(t), Err(_) => other_prefixes.insert(object_name.to_string()), }; } for object in listing.keys { let object_name = object .key .object_name() .ok_or_else(|| anyhow::anyhow!("object name for key {}", 
object.key))?; other_prefixes.insert(object_name.to_string()); } Ok((parsed_ids, other_prefixes)) } /// List shards of given tenant in remote storage pub(crate) async fn list_remote_tenant_shards( storage: &GenericRemoteStorage, tenant_id: TenantId, cancel: CancellationToken, ) -> anyhow::Result<(HashSet, HashSet)> { let remote_path = remote_tenant_path(&TenantShardId::unsharded(tenant_id)); list_identifiers::(storage, remote_path, cancel).await } /// List timelines of given tenant shard in remote storage pub async fn list_remote_timelines( storage: &GenericRemoteStorage, tenant_shard_id: TenantShardId, cancel: CancellationToken, ) -> anyhow::Result<(HashSet, HashSet)> { fail::fail_point!("storage-sync-list-remote-timelines", |_| { anyhow::bail!("storage-sync-list-remote-timelines"); }); let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash(); list_identifiers::(storage, remote_path, cancel).await } async fn do_download_remote_path_retry_forever( storage: &GenericRemoteStorage, remote_path: &RemotePath, download_opts: DownloadOpts, cancel: &CancellationToken, ) -> Result<(Vec, SystemTime), DownloadError> { download_retry_forever( || async { let download = storage .download(remote_path, &download_opts, cancel) .await?; let mut bytes = Vec::new(); let stream = download.download_stream; let mut stream = StreamReader::new(stream); tokio::io::copy_buf(&mut stream, &mut bytes).await?; Ok((bytes, download.last_modified)) }, &format!("download {remote_path:?}"), cancel, ) .await } async fn do_download_tenant_manifest( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, _timeline_id: Option<&TimelineId>, generation: Generation, cancel: &CancellationToken, ) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> { let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation); let download_opts = DownloadOpts { kind: DownloadKind::Small, ..Default::default() }; let (manifest_bytes, manifest_bytes_mtime) = 
do_download_remote_path_retry_forever(storage, &remote_path, download_opts, cancel).await?; let tenant_manifest = TenantManifest::from_json_bytes(&manifest_bytes) .with_context(|| format!("deserialize tenant manifest file at {remote_path:?}")) .map_err(DownloadError::Other)?; Ok((tenant_manifest, generation, manifest_bytes_mtime)) } async fn do_download_index_part( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: Option<&TimelineId>, index_generation: Generation, cancel: &CancellationToken, ) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { let timeline_id = timeline_id.expect("A timeline ID is always provided when downloading an index"); let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); let download_opts = DownloadOpts { kind: DownloadKind::Small, ..Default::default() }; let (index_part_bytes, index_part_mtime) = do_download_remote_path_retry_forever(storage, &remote_path, download_opts, cancel).await?; let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) .with_context(|| format!("deserialize index part file at {remote_path:?}")) .map_err(DownloadError::Other)?; Ok((index_part, index_generation, index_part_mtime)) } /// Metadata objects are "generationed", meaning that they include a generation suffix. This /// function downloads the object with the highest generation <= `my_generation`. /// /// Data objects (layer files) also include a generation in their path, but there is no equivalent /// search process, because their reference from an index includes the generation. /// /// An expensive object listing operation is only done if necessary: the typical fast path is to issue two /// GET operations, one to our own generation (stale attachment case), and one to the immediately preceding /// generation (normal case when migrating/restarting). Only if both of these return 404 do we fall back /// to listing objects. 
///
/// * `my_generation`: the value of `[crate::tenant::TenantShard::generation]`
/// * `what`: for logging, what object are we downloading
/// * `prefix`: when listing objects, use this prefix (i.e. the part of the object path before the generation)
/// * `do_download`: a GET of the object in a particular generation, which should **retry indefinitely** unless
///   `cancel` has fired.  This function does not do its own retries of GET operations, and relies
///   on the function passed in to do so.
/// * `parse_path`: parse a fully qualified remote storage path to get the generation of the object.
#[allow(clippy::too_many_arguments)]
#[tracing::instrument(skip_all, fields(generation=?my_generation))]
pub(crate) async fn download_generation_object<'a, T, DF, DFF, PF>(
    storage: &'a GenericRemoteStorage,
    tenant_shard_id: &'a TenantShardId,
    timeline_id: Option<&'a TimelineId>,
    my_generation: Generation,
    what: &str,
    prefix: RemotePath,
    do_download: DF,
    parse_path: PF,
    cancel: &'a CancellationToken,
) -> Result<(T, Generation, SystemTime), DownloadError>
where
    DF: Fn(
        &'a GenericRemoteStorage,
        &'a TenantShardId,
        Option<&'a TimelineId>,
        Generation,
        &'a CancellationToken,
    ) -> DFF,
    DFF: Future<Output = Result<(T, Generation, SystemTime), DownloadError>>,
    PF: Fn(RemotePath) -> Option<Generation>,
    T: 'static,
{
    debug_assert_current_span_has_tenant_id();

    if my_generation.is_none() {
        // Operating without generations: just fetch the generation-less path
        return do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
    }

    // Stale case: If we were intentionally attached in a stale generation, the remote object may already
    // exist in our generation.
    //
    // This is an optimization to avoid doing the listing for the general case below.
    let res = do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
    match res {
        Ok(decoded) => {
            tracing::debug!("Found {what} from current generation (this is a stale attachment)");
            return Ok(decoded);
        }
        // Only "not found" falls through to the next probe; any other error is final.
        Err(DownloadError::NotFound) => {}
        Err(e) => return Err(e),
    };

    // Typical case: the previous generation of this tenant was running healthily, and had uploaded the object
    // we are seeking in that generation.  We may safely start from this index without doing a listing, because:
    //  - We checked for current generation case above
    //  - generations > my_generation are to be ignored
    //  - any other objects that exist would have an older generation than `previous_gen`, and
    //    we want to find the most recent object from a previous generation.
    //
    // This is an optimization to avoid doing the listing for the general case below.
    let res = do_download(
        storage,
        tenant_shard_id,
        timeline_id,
        my_generation.previous(),
        cancel,
    )
    .await;
    match res {
        Ok(decoded) => {
            tracing::debug!("Found {what} from previous generation");
            return Ok(decoded);
        }
        Err(DownloadError::NotFound) => {
            tracing::debug!("No {what} found from previous generation, falling back to listing");
        }
        Err(e) => {
            return Err(e);
        }
    }

    // General case/fallback: if there is no object at my_generation or prev_generation, then list all
    // objects under `prefix`, and select the highest one with a generation <= my_generation.  Constructing the
    // prefix is equivalent to constructing a full object path with no generation, because the generation is a suffix.
    let paths = download_retry(
        || async {
            storage
                .list(Some(&prefix), ListingMode::NoDelimiter, None, cancel)
                .await
        },
        // Name the object kind being listed: this function also serves tenant manifests.
        &format!("list {what} files"),
        cancel,
    )
    .await?
    .keys;

    // General case logic for which index to use: the latest index whose generation
    // is <= our own.  See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
    let max_previous_generation = paths
        .into_iter()
        .filter_map(|o| parse_path(o.key))
        .filter(|g| g <= &my_generation)
        .max();

    match max_previous_generation {
        Some(g) => {
            tracing::debug!("Found {what} in generation {g:?}");
            do_download(storage, tenant_shard_id, timeline_id, g, cancel).await
        }
        None => {
            // Migration from legacy pre-generation state: we have a generation but no prior
            // attached pageservers did.  Try to load from a no-generation path.
            tracing::debug!("No {what}* found");
            do_download(
                storage,
                tenant_shard_id,
                timeline_id,
                Generation::none(),
                cancel,
            )
            .await
        }
    }
}

/// index_part.json objects are suffixed with a generation number, so we cannot
/// directly GET the latest index part without doing some probing.
///
/// In this function we probe for the most recent index in a generation <= our current generation.
/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
pub(crate) async fn download_index_part(
    storage: &GenericRemoteStorage,
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    my_generation: Generation,
    cancel: &CancellationToken,
) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    // The generation-less index path doubles as the listing prefix.
    let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
    download_generation_object(
        storage,
        tenant_shard_id,
        Some(timeline_id),
        my_generation,
        "index_part",
        index_prefix,
        do_download_index_part,
        parse_remote_index_path,
        cancel,
    )
    .await
}

pub(crate) async fn download_tenant_manifest(
    storage: &GenericRemoteStorage,
    tenant_shard_id: &TenantShardId,
    my_generation: Generation,
    cancel: &CancellationToken,
) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> {
    let manifest_prefix = remote_tenant_manifest_prefix(tenant_shard_id);

    download_generation_object(
        storage,
        tenant_shard_id,
        None,
        my_generation,
        "tenant-manifest",
manifest_prefix, do_download_tenant_manifest, parse_remote_tenant_manifest_path, cancel, ) .await }

/// Download the timeline's initdb archive into a temporary file under the tenant's timelines
/// directory.
///
/// The regular archive path is tried first; if that returns [`DownloadError::NotFound`], the
/// "preserved" archive path is tried instead. The whole attempt (re-creating and truncating the
/// temp file included) is retried through [`download_retry`].
///
/// On success returns the temp file's path together with the open file, rewound to offset 0.
/// On failure a best-effort attempt is made to delete the temp file before the error is returned.
pub(crate) async fn download_initdb_tar_zst(
    conf: &'static PageServerConf,
    storage: &GenericRemoteStorage,
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    cancel: &CancellationToken,
) -> Result<(Utf8PathBuf, File), DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id);

    let remote_preserved_path =
        remote_initdb_preserved_archive_path(&tenant_shard_id.tenant_id, timeline_id);

    let timeline_path = conf.timelines_path(tenant_shard_id);

    if !timeline_path.exists() {
        tokio::fs::create_dir_all(&timeline_path)
            .await
            .with_context(|| format!("timeline dir creation {timeline_path}"))
            .map_err(DownloadError::Other)?;
    }
    let temp_path = timeline_path.join(format!(
        "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
    ));

    let file = download_retry(
        || async {
            // Each attempt starts from an empty temp file (`truncate(true)`).
            let file = OpenOptions::new()
                .create(true)
                .truncate(true)
                .read(true)
                .write(true)
                .open(&temp_path)
                .await
                .with_context(|| format!("tempfile creation {temp_path}"))
                .map_err(DownloadError::Other)?;

            let download = match storage
                .download(&remote_path, &DownloadOpts::default(), cancel)
                .await
            {
                Ok(dl) => dl,
                Err(DownloadError::NotFound) => {
                    // Fall back to the preserved copy of the archive.
                    storage
                        .download(&remote_preserved_path, &DownloadOpts::default(), cancel)
                        .await?
                }
                Err(other) => return Err(other),
            };
            let mut download = tokio_util::io::StreamReader::new(download.download_stream);
            let mut writer = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, file);

            tokio::io::copy_buf(&mut download, &mut writer).await?;

            let mut file = writer.into_inner();

            // Hand the file back positioned at the start so the caller can read it directly.
            file.seek(std::io::SeekFrom::Start(0))
                .await
                .with_context(|| format!("rewinding initdb.tar.zst at: {remote_path:?}"))
                .map_err(DownloadError::Other)?;
            Ok(file)
        },
        &format!("download {remote_path}"),
        cancel,
    )
    .await
    .inspect_err(|_e| {
        // Do a best-effort attempt at deleting the temporary file upon encountering an error.
        // We don't have async here nor do we want to pile on any extra errors.
        if let Err(e) = std::fs::remove_file(&temp_path) {
            if e.kind() != std::io::ErrorKind::NotFound {
                warn!("error deleting temporary file {temp_path}: {e}");
            }
        }
    })?;

    Ok((temp_path, file))
}

/// Helper function to handle retries for a download operation.
///
/// Remote operations can fail due to rate limits (S3), spurious network
/// problems, or other external reasons. Retry up to `FAILED_REMOTE_OP_RETRIES` times,
/// with backoff; errors for which [`DownloadError::is_permanent`] holds are not retried.
///
/// A `None` from `backoff::retry` is reported as [`DownloadError::Cancelled`].
///
/// (See similar logic for uploads in `perform_upload_task`)
pub(super) async fn download_retry<T, O, F>(
    op: O,
    description: &str,
    cancel: &CancellationToken,
) -> Result<T, DownloadError>
where
    O: FnMut() -> F,
    F: Future<Output = Result<T, DownloadError>>,
{
    backoff::retry(
        op,
        DownloadError::is_permanent,
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        description,
        cancel,
    )
    .await
    .unwrap_or(Err(DownloadError::Cancelled))
}

/// Same as [`download_retry`], but with a retry budget of `u32::MAX` instead of
/// `FAILED_REMOTE_OP_RETRIES`.
///
/// A `None` from `backoff::retry` is reported as [`DownloadError::Cancelled`].
pub(crate) async fn download_retry_forever<T, O, F>(
    op: O,
    description: &str,
    cancel: &CancellationToken,
) -> Result<T, DownloadError>
where
    O: FnMut() -> F,
    F: Future<Output = Result<T, DownloadError>>,
{
    backoff::retry(
        op,
        DownloadError::is_permanent,
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        u32::MAX,
        description,
        cancel,
    )
    .await
    .unwrap_or(Err(DownloadError::Cancelled))
}

================================================ FILE: pageserver/src/tenant/remote_timeline_client/index.rs ================================================ //!
//! In-memory index to track the tenant files on the remote storage.
//!
//! Able to restore itself from the storage index parts, which are located in every timeline's
//! remote directory and contain all data about the remote timeline layers and their metadata.

use std::collections::HashMap;

use chrono::NaiveDateTime;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::RelSizeMigration;
use pageserver_api::shard::ShardIndex;
use serde::{Deserialize, Serialize};
use utils::id::TimelineId;
use utils::lsn::Lsn;

use super::is_same_remote_layer_path;
use crate::tenant::Generation;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::timeline::import_pgdata;

/// In-memory representation of an `index_part.json` file
///
/// Contains the data about all files in the timeline that are present remotely, and the
/// timeline's metadata.
///
/// This type needs to be backwards and forwards compatible. When changing the fields,
/// remember to add a test case for the changed version.
// NOTE(review): several fields below read `Option,` / `HashMap,` with no type arguments. The
// arguments appear to have been lost from this copy of the file (other generic types nearby are
// intact); restore them from the canonical source before compiling.
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
pub struct IndexPart {
    /// Debugging aid describing the version of this type.
    #[serde(default)]
    version: usize,

    /// Added in version 2. Omitted from the serialized form when `None`.
    // NOTE(review): presumably the time at which the timeline was marked deleted -- confirm
    // against the code that sets it.
    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub deleted_at: Option,

    /// Added in version 8. Omitted from the serialized form when `None`.
    // NOTE(review): presumably the time at which the timeline was archived -- confirm against
    // the code that sets it.
    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub archived_at: Option,

    /// This field supports import-from-pgdata ("fast imports" platform feature).
    /// We don't currently use fast imports, so, this field is None for all production timelines.
    /// (The reference that originally followed "See ... for more information" is missing from
    /// this copy of the file.)
    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub import_pgdata: Option,

    /// Layer filenames and metadata. For an index persisted in remote storage, all layers must
    /// exist in remote storage.
    pub layer_metadata: HashMap,

    /// Because of the trouble of eyeballing the legacy "metadata" field, we copied the
    /// "disk_consistent_lsn" out. After version 7 this is no longer needed, but the name cannot be
    /// reused.
    pub(super) disk_consistent_lsn: Lsn,

    // TODO: rename as "metadata" next week, keep the alias = "metadata_bytes", bump version.
    // Adding the "alias = metadata" was forgotten in #7693, so we have to use
    // "rename = metadata_bytes" for backwards compatibility.
    #[serde(
        rename = "metadata_bytes",
        alias = "metadata",
        with = "crate::tenant::metadata::modern_serde"
    )]
    pub metadata: TimelineMetadata,

    /// Reparenting / detach history of the timeline, see [`Lineage`]. Added in version 5;
    /// defaults to an empty lineage when absent.
    #[serde(default)]
    pub(crate) lineage: Lineage,

    /// Added in version 9. Omitted from the serialized form when `None`.
    // NOTE(review): presumably `Some` while garbage collection is blocked for this timeline
    // (see `GcBlocking`) -- confirm against the users of this field.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub(crate) gc_blocking: Option,

    /// Describes the kind of aux files stored in the timeline.
    ///
    /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
    /// A V1 setting after V2 files have been committed is not accepted.
    ///
    /// None means no aux files have been written to the storage before the point
    /// when this flag is introduced.
    ///
    /// This flag is not used any more as all tenants have been transitioned to the new aux file policy.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub(crate) last_aux_file_policy: Option,

    /// Added in version 11. Omitted from the serialized form when `None`.
    // NOTE(review): presumably the state of the rel size migration that `rel_size_migrated_at`
    // refers to -- confirm against the users of this field.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub(crate) rel_size_migration: Option,

    /// Not used anymore -- kept here for backwards compatibility. Merged into the `gc_compaction` field.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    l2_lsn: Option,

    /// State for the garbage-collecting compaction pass.
    ///
    /// Garbage-collecting compaction (gc-compaction) prunes `Value`s that are outside
    /// the PITR window and not needed by child timelines.
    ///
    /// A commonly used synonym for this compaction pass is
    /// "bottommost-compaction"  because the affected LSN range
    /// is the "bottom" of the (key,lsn) map.
    ///
    /// Gc-compaction is a quite expensive operation; that's why we use
    /// trigger condition.
    /// This field here holds the state pertaining to that trigger condition
    /// and (in future) to the progress of the gc-compaction, so that it's
    /// resumable across restarts & migrations.
    ///
    /// Note that the underlying algorithm is _also_ called `gc-compaction`
    /// in most places & design docs; but in fact it is more flexible than
    /// just the specific use case here; it needs a new name.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub(crate) gc_compaction: Option,

    /// The timestamp when the timeline was marked invisible in synthetic size calculations.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub(crate) marked_invisible_at: Option,

    /// The LSN at which we started the rel size migration. Accesses below this LSN should be
    /// processed with the v1 read path. Usually this LSN should be set together with `rel_size_migration`.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub(crate) rel_size_migrated_at: Option,
}

/// Persisted state of gc-compaction, stored in [`IndexPart::gc_compaction`].
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
pub struct GcCompactionState {
    /// The upper bound of the last completed garbage-collecting compaction, aka. L2 LSN.
    pub(crate) last_completed_lsn: Lsn,
}

impl IndexPart {
    /// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
    /// used to understand later versions.
    ///
    /// Version is currently informative only.
    /// Version history
    /// - 2: added `deleted_at`
    /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
    ///   is always generated from the keys of `layer_metadata`)
    /// - 4: timeline_layers is fully removed.
    /// - 5: lineage was added
    /// - 6: last_aux_file_policy is added.
    /// - 7: metadata_bytes is no longer written, but still read
    /// - 8: added `archived_at`
    /// - 9: +gc_blocking
    /// - 10: +import_pgdata
    /// - 11: +rel_size_migration
    /// - 12: +l2_lsn
    /// - 13: +gc_compaction
    /// - 14: +marked_invisible_at
    /// - 15: +rel_size_migrated_at
    const LATEST_VERSION: usize = 15;

    // Versions we may see when reading from a bucket.
    // Keep in sync with `LATEST_VERSION`: the last entry is the latest version.
    pub const KNOWN_VERSIONS: &'static [usize] =
        &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];

    /// Name used for the serialized index object.
    pub const FILE_NAME: &'static str = "index_part.json";

    /// Build an index with no layers for the given metadata.
    ///
    /// The version is set to `LATEST_VERSION`, `disk_consistent_lsn` is copied out of
    /// `metadata`, and every optional field starts out as `None` / its default.
    pub fn empty(metadata: TimelineMetadata) -> Self {
        IndexPart {
            version: Self::LATEST_VERSION,
            layer_metadata: Default::default(),
            disk_consistent_lsn: metadata.disk_consistent_lsn(),
            metadata,
            deleted_at: None,
            archived_at: None,
            lineage: Default::default(),
            gc_blocking: None,
            last_aux_file_policy: None,
            import_pgdata: None,
            rel_size_migration: None,
            l2_lsn: None,
            gc_compaction: None,
            marked_invisible_at: None,
            rel_size_migrated_at: None,
        }
    }

    /// The format version recorded in this index (informative only, see `LATEST_VERSION`).
    pub fn version(&self) -> usize {
        self.version
    }

    /// If you want this under normal operations, read it from self.metadata:
    /// this method is just for the scrubber to use when validating an index.
    pub fn duplicated_disk_consistent_lsn(&self) -> Lsn {
        self.disk_consistent_lsn
    }

    /// Deserialize an index from the JSON bytes of an `index_part.json` object.
    // NOTE(review): the return type and the turbofish below have lost their type arguments in
    // this copy of the file; restore them from the canonical source.
    pub fn from_json_bytes(bytes: &[u8]) -> Result {
        serde_json::from_slice::(bytes)
    }

    /// Serialize this index to JSON bytes.
    // NOTE(review): the return type below has lost part of its type arguments in this copy of
    // the file; restore it from the canonical source.
    pub fn to_json_bytes(&self) -> serde_json::Result> {
        serde_json::to_vec(self)
    }

    /// Test fixture: an empty index built from `TimelineMetadata::example()`.
    #[cfg(test)]
    pub(crate) fn example() -> Self {
        Self::empty(TimelineMetadata::example())
    }

    /// Returns true if the index contains a reference to the given layer (i.e. file path).
    ///
    /// The layer is looked up by `name`; if present, the passed-in `metadata` and the index's
    /// own metadata for that name are compared with `is_same_remote_layer_path`.
    ///
    /// TODO: there should be a variant of LayerName for the physical remote path that contains
    /// information about the shard and generation, to avoid passing in metadata.
    pub fn references(&self, name: &LayerName, metadata: &LayerFileMetadata) -> bool {
        let Some(index_metadata) = self.layer_metadata.get(name) else {
            return false;
        };
        is_same_remote_layer_path(name, metadata, name, index_metadata)
    }

    /// Check for invariants in the index: this is useful when uploading an index to ensure that if
    /// we encounter a bug, we do not persist buggy metadata.
    ///
    /// Returns `Err` with a human-readable description when an invariant is violated.
    pub(crate) fn validate(&self) -> Result<(), String> {
        if self.import_pgdata.is_none()
            && self.metadata.ancestor_timeline().is_none()
            && self.layer_metadata.is_empty()
        {
            // Unless we're in the middle of a raw pgdata import, or this is a child timeline, the index must
            // always have at least one layer.
            return Err("Index has no ancestor and no layers".to_string());
        }

        Ok(())
    }
}

/// Metadata gathered for each of the layer files.
///
/// Remote [`IndexPart`]s can be from different versions, which might have less or more metadata
/// depending on whether we are upgrading or rolling back an upgrade. For that reason `generation`
/// and `shard` fall back to a default when absent from the input, and are left out of the
/// serialized form while they hold that default.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct LayerFileMetadata {
    /// Size of the layer file.
    pub file_size: u64,

    /// Generation recorded for the layer; `Generation::none()` when absent from the input.
    #[serde(default = "Generation::none")]
    #[serde(skip_serializing_if = "Generation::is_none")]
    pub generation: Generation,

    /// Shard recorded for the layer; `ShardIndex::unsharded()` when absent from the input.
    #[serde(default = "ShardIndex::unsharded")]
    #[serde(skip_serializing_if = "ShardIndex::is_unsharded")]
    pub shard: ShardIndex,
}

impl LayerFileMetadata {
    /// Plain constructor; stores the three values as given.
    pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
        LayerFileMetadata {
            file_size,
            generation,
            shard,
        }
    }
    /// Helper to get both generation and file size in a tuple
    pub fn generation_file_size(&self) -> (Generation, u64) {
        (self.generation, self.file_size)
    }
}

/// Limited history of earlier ancestors.
///
/// A timeline can have more than 1 earlier ancestor, in the rare case that it was repeatedly
/// reparented by having a later timeline be detached from its ancestor.
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
pub(crate) struct Lineage {
    /// Has the `reparenting_history` been truncated to [`Lineage::REMEMBER_AT_MOST`].
    #[serde(skip_serializing_if = "is_false", default)]
    reparenting_history_truncated: bool,

    /// Earlier ancestors, truncated when [`Self::reparenting_history_truncated`]
    ///
    /// These are stored in case we want to support WAL based DR on the timeline. There can be many
    /// of these and at most one [`Self::original_ancestor`]. There cannot be more reparentings
    /// after [`Self::original_ancestor`] has been set.
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    reparenting_history: Vec<TimelineId>,

    /// The ancestor from which this timeline has been detached, and when.
    ///
    /// If you are adding support for detaching from a hierarchy, consider changing the ancestry
    /// into a `Vec<(TimelineId, Lsn)>` to be a path instead.
    // FIXME: this is insufficient even for path of two timelines for future wal recovery
    // purposes:
    //
    // assuming a "old main" which has received most of the WAL, and has a branch "new main",
    // starting a bit before "old main" last_record_lsn. the current version works fine,
    // because we will know to replay wal and branch at the recorded Lsn to do wal recovery.
    //
    // then assuming "new main" would similarly receive a branch right before its last_record_lsn,
    // "new new main". the current implementation would just store ("new main", ancestor_lsn, _)
    // here. however, we cannot recover from WAL using only that information, we would need the
    // whole ancestry here:
    //
    // ```json
    // [
    //   ["old main", ancestor_lsn("new main"), _],
    //   ["new main", ancestor_lsn("new new main"), _]
    // ]
    // ```
    #[serde(skip_serializing_if = "Option::is_none", default)]
    original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>,
}

/// serde `skip_serializing_if` helper: true when the flag is unset.
fn is_false(b: &bool) -> bool {
    !b
}

impl Lineage {
    /// Upper bound on the number of entries kept in `reparenting_history`.
    const REMEMBER_AT_MOST: usize = 100;

    /// Record `old_ancestor` as a previous ancestor of this timeline.
    ///
    /// Returns true if anything changed, i.e. false when `old_ancestor` is already the most
    /// recently recorded entry.
    ///
    /// When the history already holds [`Self::REMEMBER_AT_MOST`] entries, the oldest one is
    /// dropped to make room and `reparenting_history_truncated` is set.
    ///
    /// # Panics
    ///
    /// With the `testing` feature enabled, panics if `old_ancestor` is already present anywhere
    /// in the history.
    pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) -> bool {
        if self.reparenting_history.last() == Some(old_ancestor) {
            // do not re-record it
            return false;
        }

        #[cfg(feature = "testing")]
        {
            let existing = self
                .reparenting_history
                .iter()
                .position(|x| x == old_ancestor);
            assert_eq!(
                existing, None,
                "we cannot reparent onto and off and onto the same timeline twice"
            );
        }

        // Evict only when the history is already full, so that exactly REMEMBER_AT_MOST entries
        // can be retained. (Comparing `len() + 1 >= REMEMBER_AT_MOST` evicted one push too
        // early and capped the history at REMEMBER_AT_MOST - 1.)
        let drop_oldest = self.reparenting_history.len() >= Self::REMEMBER_AT_MOST;

        self.reparenting_history_truncated |= drop_oldest;
        if drop_oldest {
            self.reparenting_history.remove(0);
        }
        self.reparenting_history.push(*old_ancestor);
        true
    }

    /// Record that this timeline was detached from its ancestor at `branchpoint`, stamping the
    /// current UTC time.
    ///
    /// Returns true if anything changed.
    ///
    /// # Panics
    ///
    /// Panics if a detach has already been recorded for a different branchpoint.
    pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) -> bool {
        if let Some((id, lsn, _)) = self.original_ancestor {
            assert_eq!(
                &(id, lsn),
                branchpoint,
                "detaching attempt has to be for the same ancestor we are already detached from"
            );
            false
        } else {
            self.original_ancestor =
                Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
            true
        }
    }

    /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed
    /// to start a read/write primary at this lsn".
    ///
    /// Returns true if the Lsn was previously our branch point.
    pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
        self.original_ancestor
            .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
    }

    /// Returns true if the timeline originally had an ancestor, and no longer has one.
    pub(crate) fn is_detached_from_ancestor(&self) -> bool {
        self.original_ancestor.is_some()
    }

    /// Returns original ancestor timeline id and lsn that this timeline has been detached from.
    pub(crate) fn detached_previous_ancestor(&self) -> Option<(TimelineId, Lsn)> {
        self.original_ancestor.map(|(id, lsn, _)| (id, lsn))
    }

    /// Returns true if at least one previous ancestor has been recorded.
    pub(crate) fn is_reparented(&self) -> bool {
        !self.reparenting_history.is_empty()
    }
}

/// The set of reasons for which gc is currently blocked, and when the blocking started.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub(crate) struct GcBlocking {
    pub(crate) started_at: NaiveDateTime,
    pub(crate) reasons: enumset::EnumSet<GcBlockingReason>,
}

/// A reason for blocking gc; serialized as a list when held in an `EnumSet`.
#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)]
#[enumset(serialize_repr = "list")]
pub(crate) enum GcBlockingReason {
    Manual,
    DetachAncestor,
}

impl GcBlocking {
    pub(super) fn started_now_for(reason: GcBlockingReason) -> Self {
        GcBlocking {
            started_at: chrono::Utc::now().naive_utc(),
            reasons: enumset::EnumSet::only(reason),
        }
    }

    /// Returns true if the given reason is one of the reasons why the gc is blocked.
    pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool {
        self.reasons.contains(reason)
    }

    /// Returns a version of self with the given reason.
    pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self {
        assert!(!self.blocked_by(reason));
        let mut reasons = self.reasons;
        reasons.insert(reason);

        Self {
            started_at: self.started_at,
            reasons,
        }
    }

    /// Returns a version of self without the given reason. Assumption is that if
    /// there are no more reasons, we can unblock the gc by returning `None`.
pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option { assert!(self.blocked_by(reason)); if self.reasons.len() == 1 { None } else { let mut reasons = self.reasons; assert!(reasons.remove(reason)); assert!(!reasons.is_empty()); Some(Self { started_at: self.started_at, reasons, }) } } } #[cfg(test)] mod tests { use postgres_ffi::PgMajorVersion; use std::str::FromStr; use utils::id::TimelineId; use super::*; #[test] fn v1_indexpart_is_parsed() { let example = r#"{ "version":1, "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } }, "disk_consistent_lsn":"0/16960E8", "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] }"#; let expected = IndexPart { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 1, layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: 9007199254741001, generation: Generation::none(), shard: ShardIndex::unsharded() }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, 
archived_at: None, lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, l2_lsn: None, gc_compaction: None, marked_invisible_at: None, rel_size_migrated_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } #[test] fn v1_indexpart_is_parsed_with_optional_missing_layers() { let example = r#"{ "version":1, "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], "missing_layers":["This shouldn't fail deserialization"], "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } }, "disk_consistent_lsn":"0/16960E8", "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] }"#; let expected = IndexPart { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 1, layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: 9007199254741001, generation: Generation::none(), shard: ShardIndex::unsharded() }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, archived_at: 
None, lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, l2_lsn: None, gc_compaction: None, marked_invisible_at: None, rel_size_migrated_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } #[test] fn v2_indexpart_is_parsed_with_deleted_at() { let example = r#"{ "version":2, "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], "missing_layers":["This shouldn't fail deserialization"], "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } }, "disk_consistent_lsn":"0/16960E8", "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0], "deleted_at": "2023-07-31T09:00:00.123" }"#; let expected = IndexPart { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 2, layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: 9007199254741001, generation: Generation::none(), shard: ShardIndex::unsharded() }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: 
Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: None, lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, l2_lsn: None, gc_compaction: None, marked_invisible_at: None, rel_size_migrated_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } #[test] fn empty_layers_are_parsed() { let empty_layers_json = r#"{ "version":1, "timeline_layers":[], "layer_metadata":{}, "disk_consistent_lsn":"0/2532648", "metadata_bytes":[136,151,49,208,0,70,0,4,0,0,0,0,2,83,38,72,1,0,0,0,0,2,83,38,32,1,87,198,240,135,97,119,45,125,38,29,155,161,140,141,255,210,0,0,0,0,2,83,38,72,0,0,0,0,1,73,240,192,0,0,0,0,1,73,240,192,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] }"#; let expected = IndexPart { version: 1, layer_metadata: HashMap::new(), disk_consistent_lsn: "0/2532648".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[ 136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83, 38, 32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255, 210, 0, 0, 0, 
0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]) .unwrap(), deleted_at: None, archived_at: None, lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, l2_lsn: None, gc_compaction: None, marked_invisible_at: None, rel_size_migrated_at: None, }; let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap(); assert_eq!(empty_layers_parsed, expected); } #[test] fn v4_indexpart_is_parsed() { let example = r#"{ "version":4, "layer_metadata":{ 
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } }, "disk_consistent_lsn":"0/16960E8", "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0], "deleted_at": "2023-07-31T09:00:00.123" }"#; let expected = IndexPart { version: 4, layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. 
file_size: 9007199254741001, generation: Generation::none(), shard: ShardIndex::unsharded() }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: None, lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, l2_lsn: None, gc_compaction: None, marked_invisible_at: None, rel_size_migrated_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } #[test] fn v5_indexpart_is_parsed() { let example = r#"{ "version":5, "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499":{"file_size":23289856,"generation":1}, 
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619":{"file_size":1015808,"generation":1}}, "disk_consistent_lsn":"0/15A7618", "metadata_bytes":[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0], "lineage":{ "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"], "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"] } }"#; let expected = IndexPart { version: 5, layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), LayerFileMetadata { file_size: 23289856, generation: Generation::new(1), shard: ShardIndex::unsharded(), }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), LayerFileMetadata { file_size: 1015808, generation: Generation::new(1), shard: ShardIndex::unsharded(), }) ]), 
disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(), metadata: TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, archived_at: None, lineage: Lineage { reparenting_history_truncated: false, reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), }, gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, l2_lsn: None, gc_compaction: None, marked_invisible_at: None, rel_size_migrated_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } #[test] fn v6_indexpart_is_parsed() { let example = r#"{ "version":6, "layer_metadata":{ 
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } }, "disk_consistent_lsn":"0/16960E8", "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0], "deleted_at": "2023-07-31T09:00:00.123", "lineage":{ "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"], "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"] }, "last_aux_file_policy": "V2" }"#; let expected = IndexPart { version: 6, layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), 
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: 9007199254741001, generation: Generation::none(), shard: ShardIndex::unsharded() }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: None, lineage: Lineage { reparenting_history_truncated: false, reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), }, gc_blocking: None, last_aux_file_policy: Some(AuxFilePolicy::V2), 
import_pgdata: None, rel_size_migration: None, l2_lsn: None, gc_compaction: None, marked_invisible_at: None, rel_size_migrated_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); }

/// Checks that a `"version": 7` index part deserializes into the expected `IndexPart`.
///
/// In this fixture the timeline metadata is given as a JSON object under
/// `"metadata"`, and `deleted_at` is set.
#[test]
fn v7_indexpart_is_parsed() {
    // The fixture literal is kept byte-for-byte.
    let example = r#"{ "version": 7, "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } }, "disk_consistent_lsn":"0/16960E8", "metadata": { "disk_consistent_lsn": "0/16960E8", "prev_record_lsn": "0/1696070", "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", "ancestor_lsn": "0/0", "latest_gc_cutoff_lsn": "0/1696070", "initdb_lsn": "0/1696070", "pg_version": 14 }, "deleted_at": "2023-07-31T09:00:00.123" }"#;

    let expected = IndexPart {
        version: 7,
        layer_metadata: HashMap::from([
            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                file_size: 25600000,
                generation: Generation::none(),
                shard: ShardIndex::unsharded()
            }),
            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                // Larger than 2^53, so this size is not exactly representable as an f64.
                file_size: 9007199254741001,
                generation: Generation::none(),
                shard: ShardIndex::unsharded()
            })
        ]),
        // Spelled with `Lsn::from_str`, like the metadata arguments below.
        disk_consistent_lsn: Lsn::from_str("0/16960E8").unwrap(),
        // Same values, in the same order, as the "metadata" object of the JSON
        // fixture; its "ancestor_lsn" of "0/0" presumably equals Lsn::INVALID.
        metadata: TimelineMetadata::new(
            Lsn::from_str("0/16960E8").unwrap(),
            Some(Lsn::from_str("0/1696070").unwrap()),
            Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
            Lsn::INVALID,
            Lsn::from_str("0/1696070").unwrap(),
            Lsn::from_str("0/1696070").unwrap(),
            PgMajorVersion::PG14,
        ).with_recalculated_checksum().unwrap(),
        // The JSON gives milliseconds; the same instant is written here with nanoseconds.
        deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
        archived_at: None,
        lineage: Default::default(),
        gc_blocking: None,
        last_aux_file_policy: Default::default(),
        import_pgdata: None,
        rel_size_migration: None,
        l2_lsn: None,
        gc_compaction: None,
        marked_invisible_at: None,
        rel_size_migrated_at: None,
    };

    let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
    assert_eq!(part, expected);
}

#[test] fn v8_indexpart_is_parsed() { let example = r#"{ "version": 8, "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } }, "disk_consistent_lsn":"0/16960E8", "metadata": { "disk_consistent_lsn": "0/16960E8", "prev_record_lsn": "0/1696070", "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", "ancestor_lsn": "0/0", "latest_gc_cutoff_lsn": "0/1696070", "initdb_lsn": "0/1696070", "pg_version": 14 }, "deleted_at": "2023-07-31T09:00:00.123", "archived_at": "2023-04-29T09:00:00.123" }"#; let expected = IndexPart { version: 8, layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { file_size: 9007199254741001, generation: Generation::none(), shard: ShardIndex::unsharded() }) ]), disk_consistent_lsn: Lsn::from_str("0/16960E8").unwrap(), metadata: TimelineMetadata::new( Lsn::from_str("0/16960E8").unwrap(), Some(Lsn::from_str("0/1696070").unwrap()), Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), PgMajorVersion::PG14,
).with_recalculated_checksum().unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")), lineage: Default::default(), gc_blocking: None, last_aux_file_policy: Default::default(), import_pgdata: None, rel_size_migration: None, l2_lsn: None, gc_compaction: None, marked_invisible_at: None, rel_size_migrated_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); }

/// Checks that a `"version": 9` index part deserializes into the expected `IndexPart`.
///
/// The fixture carries a `"gc_blocking"` object (a start time plus a list of
/// reasons) and no `deleted_at` / `archived_at`.
#[test]
fn v9_indexpart_is_parsed() {
    // The fixture literal is kept byte-for-byte.
    let example = r#"{ "version": 9, "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } }, "disk_consistent_lsn":"0/16960E8", "metadata": { "disk_consistent_lsn": "0/16960E8", "prev_record_lsn": "0/1696070", "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", "ancestor_lsn": "0/0", "latest_gc_cutoff_lsn": "0/1696070", "initdb_lsn": "0/1696070", "pg_version": 14 }, "gc_blocking": { "started_at": "2024-07-19T09:00:00.123", "reasons": ["DetachAncestor"] } }"#;

    let expected = IndexPart {
        version: 9,
        layer_metadata: HashMap::from([
            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                file_size: 25600000,
                generation: Generation::none(),
                shard: ShardIndex::unsharded()
            }),
            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                // Larger than 2^53, so this size is not exactly representable as an f64.
                file_size: 9007199254741001,
                generation: Generation::none(),
                shard: ShardIndex::unsharded()
            })
        ]),
        // Spelled with `Lsn::from_str`, like the metadata arguments below.
        disk_consistent_lsn: Lsn::from_str("0/16960E8").unwrap(),
        // Same values, in the same order, as the "metadata" object of the JSON
        // fixture; its "ancestor_lsn" of "0/0" presumably equals Lsn::INVALID.
        metadata: TimelineMetadata::new(
            Lsn::from_str("0/16960E8").unwrap(),
            Some(Lsn::from_str("0/1696070").unwrap()),
            Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
            Lsn::INVALID,
            Lsn::from_str("0/1696070").unwrap(),
            Lsn::from_str("0/1696070").unwrap(),
            PgMajorVersion::PG14,
        ).with_recalculated_checksum().unwrap(),
        deleted_at: None,
        lineage: Default::default(),
        // The JSON gives milliseconds; the same instant is written here with nanoseconds.
        gc_blocking: Some(GcBlocking {
            started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
            reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
        }),
        last_aux_file_policy: Default::default(),
        archived_at: None,
        import_pgdata: None,
        rel_size_migration: None,
        l2_lsn: None,
        gc_compaction: None,
        marked_invisible_at: None,
        rel_size_migrated_at: None,
    };

    let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
    assert_eq!(part, expected);
}

#[test] fn v10_importpgdata_is_parsed() { let example = r#"{ "version": 10, "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } }, "disk_consistent_lsn":"0/16960E8", "metadata": { "disk_consistent_lsn": "0/16960E8", "prev_record_lsn": "0/1696070", "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", "ancestor_lsn": "0/0", "latest_gc_cutoff_lsn": "0/1696070", "initdb_lsn": "0/1696070", "pg_version": 14 }, "gc_blocking": { "started_at": "2024-07-19T09:00:00.123", "reasons": ["DetachAncestor"] }, "import_pgdata": { "V1": { "Done": { "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5", "started_at": "2024-11-13T09:23:42.123", "finished_at": "2024-11-13T09:42:23.123" } } } }"#; let expected = IndexPart { version: 10, layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard:
ShardIndex::unsharded() }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { file_size: 9007199254741001, generation: Generation::none(), shard: ShardIndex::unsharded() }) ]), disk_consistent_lsn: Lsn::from_str("0/16960E8").unwrap(), metadata: TimelineMetadata::new( Lsn::from_str("0/16960E8").unwrap(), Some(Lsn::from_str("0/1696070").unwrap()), Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), PgMajorVersion::PG14, ).with_recalculated_checksum().unwrap(), deleted_at: None, lineage: Default::default(), gc_blocking: Some(GcBlocking { started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), }), last_aux_file_policy: Default::default(), archived_at: None, import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{ started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"), finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"), idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), }))), rel_size_migration: None, l2_lsn: None, gc_compaction: None, marked_invisible_at: None, rel_size_migrated_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); }

/// Checks that a `"version": 11` index part deserializes into the expected `IndexPart`.
///
/// On top of `gc_blocking` and a finished `import_pgdata` record, the fixture
/// sets `"rel_size_migration": "legacy"`, expected to parse as
/// `RelSizeMigration::Legacy`.
#[test]
fn v11_rel_size_migration_is_parsed() {
    // The fixture literal is kept byte-for-byte, including the line break inside it.
    let example = r#"{ "version": 11, "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } },
"disk_consistent_lsn":"0/16960E8", "metadata": { "disk_consistent_lsn": "0/16960E8", "prev_record_lsn": "0/1696070", "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", "ancestor_lsn": "0/0", "latest_gc_cutoff_lsn": "0/1696070", "initdb_lsn": "0/1696070", "pg_version": 14 }, "gc_blocking": { "started_at": "2024-07-19T09:00:00.123", "reasons": ["DetachAncestor"] }, "import_pgdata": { "V1": { "Done": { "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5", "started_at": "2024-11-13T09:23:42.123", "finished_at": "2024-11-13T09:42:23.123" } } }, "rel_size_migration": "legacy" }"#;

    let expected = IndexPart {
        version: 11,
        layer_metadata: HashMap::from([
            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                file_size: 25600000,
                generation: Generation::none(),
                shard: ShardIndex::unsharded()
            }),
            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                // Larger than 2^53, so this size is not exactly representable as an f64.
                file_size: 9007199254741001,
                generation: Generation::none(),
                shard: ShardIndex::unsharded()
            })
        ]),
        // Spelled with `Lsn::from_str`, like the metadata arguments below.
        disk_consistent_lsn: Lsn::from_str("0/16960E8").unwrap(),
        // Same values, in the same order, as the "metadata" object of the JSON
        // fixture; its "ancestor_lsn" of "0/0" presumably equals Lsn::INVALID.
        metadata: TimelineMetadata::new(
            Lsn::from_str("0/16960E8").unwrap(),
            Some(Lsn::from_str("0/1696070").unwrap()),
            Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
            Lsn::INVALID,
            Lsn::from_str("0/1696070").unwrap(),
            Lsn::from_str("0/1696070").unwrap(),
            PgMajorVersion::PG14,
        ).with_recalculated_checksum().unwrap(),
        deleted_at: None,
        lineage: Default::default(),
        // The JSON timestamps give milliseconds; the expected values spell the
        // same instants with nanoseconds.
        gc_blocking: Some(GcBlocking {
            started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
            reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
        }),
        last_aux_file_policy: Default::default(),
        archived_at: None,
        import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{
            started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"),
            finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"),
            idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()),
        }))),
        rel_size_migration: Some(RelSizeMigration::Legacy),
        l2_lsn: None,
        gc_compaction: None,
        marked_invisible_at: None,
        rel_size_migrated_at: None,
    };

    let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
    assert_eq!(part, expected);
}

#[test] fn v12_v13_l2_gc_ompaction_is_parsed() { let example = r#"{ "version": 13, "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } }, "disk_consistent_lsn":"0/16960E8", "metadata": { "disk_consistent_lsn": "0/16960E8", "prev_record_lsn": "0/1696070", "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", "ancestor_lsn": "0/0", "latest_gc_cutoff_lsn": "0/1696070", "initdb_lsn": "0/1696070", "pg_version": 14 }, "gc_blocking": { "started_at": "2024-07-19T09:00:00.123", "reasons": ["DetachAncestor"] }, "import_pgdata": { "V1": { "Done": { "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5", "started_at": "2024-11-13T09:23:42.123", "finished_at": "2024-11-13T09:42:23.123" } } }, "rel_size_migration": "legacy", "l2_lsn": "0/16960E8", "gc_compaction": { "last_completed_lsn": "0/16960E8" } }"#; let expected = IndexPart { version: 13, layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { file_size: 9007199254741001, generation: Generation::none(), shard: ShardIndex::unsharded() }) ]), disk_consistent_lsn: Lsn::from_str("0/16960E8").unwrap(), metadata: TimelineMetadata::new( Lsn::from_str("0/16960E8").unwrap(), Some(Lsn::from_str("0/1696070").unwrap()), Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), PgMajorVersion::PG14, ).with_recalculated_checksum().unwrap(), deleted_at: None, lineage: Default::default(), gc_blocking: Some(GcBlocking { started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), }), last_aux_file_policy: Default::default(), archived_at: None, import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{ started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"), finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"), idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), }))), rel_size_migration: Some(RelSizeMigration::Legacy), l2_lsn: Some(Lsn::from_str("0/16960E8").unwrap()), gc_compaction: Some(GcCompactionState { last_completed_lsn: Lsn::from_str("0/16960E8").unwrap(), }), marked_invisible_at: None, rel_size_migrated_at: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); }

/// Checks that a `"version": 14` index part deserializes into the expected `IndexPart`.
///
/// The fixture sets `"marked_invisible_at"` in addition to `gc_blocking`,
/// `import_pgdata`, `rel_size_migration`, `l2_lsn` and `gc_compaction`.
#[test]
fn v14_marked_invisible_at_is_parsed() {
    // The fixture literal is kept byte-for-byte, including the line break inside it.
    let example = r#"{ "version": 14, "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } }, "disk_consistent_lsn":"0/16960E8", "metadata": { "disk_consistent_lsn": "0/16960E8", "prev_record_lsn": "0/1696070", "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", "ancestor_lsn": "0/0", "latest_gc_cutoff_lsn": "0/1696070", "initdb_lsn": "0/1696070", "pg_version": 14 }, "gc_blocking": { "started_at": "2024-07-19T09:00:00.123", "reasons": ["DetachAncestor"] }, "import_pgdata": { "V1": { "Done": { "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5", "started_at": "2024-11-13T09:23:42.123", "finished_at": "2024-11-13T09:42:23.123" } } }, "rel_size_migration": "legacy", "l2_lsn": "0/16960E8", "gc_compaction": { "last_completed_lsn": "0/16960E8" }, "marked_invisible_at": "2023-07-31T09:00:00.123" }"#;

    let expected = IndexPart {
        version: 14,
        layer_metadata: HashMap::from([
            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                file_size: 25600000,
                generation: Generation::none(),
                shard: ShardIndex::unsharded()
            }),
            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                // Larger than 2^53, so this size is not exactly representable as an f64.
                file_size: 9007199254741001,
                generation: Generation::none(),
                shard: ShardIndex::unsharded()
            })
        ]),
        // Spelled with `Lsn::from_str`, like the metadata arguments below.
        disk_consistent_lsn: Lsn::from_str("0/16960E8").unwrap(),
        // Same values, in the same order, as the "metadata" object of the JSON
        // fixture; its "ancestor_lsn" of "0/0" presumably equals Lsn::INVALID.
        metadata: TimelineMetadata::new(
            Lsn::from_str("0/16960E8").unwrap(),
            Some(Lsn::from_str("0/1696070").unwrap()),
            Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
            Lsn::INVALID,
            Lsn::from_str("0/1696070").unwrap(),
            Lsn::from_str("0/1696070").unwrap(),
            PgMajorVersion::PG14,
        ).with_recalculated_checksum().unwrap(),
        deleted_at: None,
        lineage: Default::default(),
        // The JSON timestamps give milliseconds; the expected values spell the
        // same instants with nanoseconds.
        gc_blocking: Some(GcBlocking {
            started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
            reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
        }),
        last_aux_file_policy: Default::default(),
        archived_at: None,
        import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{
            started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"),
            finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"),
            idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()),
        }))),
        rel_size_migration: Some(RelSizeMigration::Legacy),
        l2_lsn: Some(Lsn::from_str("0/16960E8").unwrap()),
        gc_compaction: Some(GcCompactionState {
            last_completed_lsn: Lsn::from_str("0/16960E8").unwrap(),
        }),
        marked_invisible_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
        rel_size_migrated_at: None,
    };

    let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
    assert_eq!(part, expected);
}

#[test] fn v15_rel_size_migrated_at_is_parsed() { let example = r#"{ "version": 15, "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } }, "disk_consistent_lsn":"0/16960E8", "metadata": { "disk_consistent_lsn": "0/16960E8", "prev_record_lsn": "0/1696070", "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", "ancestor_lsn": "0/0", "latest_gc_cutoff_lsn": "0/1696070", "initdb_lsn": "0/1696070", "pg_version": 14 }, "gc_blocking": { "started_at": "2024-07-19T09:00:00.123", "reasons": ["DetachAncestor"] }, "import_pgdata": { "V1": { "Done": { "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5", "started_at": "2024-11-13T09:23:42.123", "finished_at": "2024-11-13T09:42:23.123" } } }, "rel_size_migration": "legacy", "l2_lsn": "0/16960E8",
"gc_compaction": { "last_completed_lsn": "0/16960E8" }, "marked_invisible_at": "2023-07-31T09:00:00.123", "rel_size_migrated_at": "0/16960E8" }"#; let expected = IndexPart { version: 15, layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { file_size: 9007199254741001, generation: Generation::none(), shard: ShardIndex::unsharded() }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::new( Lsn::from_str("0/16960E8").unwrap(), Some(Lsn::from_str("0/1696070").unwrap()), Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), PgMajorVersion::PG14, ).with_recalculated_checksum().unwrap(), deleted_at: None, lineage: Default::default(), gc_blocking: Some(GcBlocking { started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), }), last_aux_file_policy: Default::default(), archived_at: None, import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{ started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"), finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"), idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), }))), rel_size_migration: Some(RelSizeMigration::Legacy), l2_lsn: Some("0/16960E8".parse::().unwrap()), gc_compaction: Some(GcCompactionState { last_completed_lsn: 
"0/16960E8".parse::().unwrap(),
            }),
            marked_invisible_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            rel_size_migrated_at: Some("0/16960E8".parse::().unwrap()),
        };

        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

    /// Test helper: parses a `%Y-%m-%dT%H:%M:%S.%f` timestamp, panicking on malformed input.
    fn parse_naive_datetime(s: &str) -> NaiveDateTime {
        chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
    }
}

================================================
FILE: pageserver/src/tenant/remote_timeline_client/manifest.rs
================================================
use chrono::NaiveDateTime;
use serde::{Deserialize, Serialize};
use utils::id::TimelineId;
use utils::lsn::Lsn;
use utils::shard::ShardStripeSize;

/// Tenant shard manifest, stored in remote storage. Contains offloaded timelines and other tenant
/// shard-wide information that must be persisted in remote storage.
///
/// The manifest is always updated on tenant attach, and as needed.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct TenantManifest {
    /// The manifest version. Incremented on manifest format changes, even non-breaking ones.
    /// Manifests must generally always be backwards and forwards compatible for one release, to
    /// allow release rollbacks.
    pub version: usize,

    /// This tenant's stripe size. This is only advisory, and used to recover tenant data from
    /// remote storage. The authoritative source is the storage controller. If None, assume the
    /// original default value of 32768 blocks (256 MB).
    ///
    /// Omitted from the serialized JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub stripe_size: Option<ShardStripeSize>,

    /// The list of offloaded timelines together with enough information
    /// to not have to actually load them.
    ///
    /// Note: the timelines mentioned in this list might be deleted, i.e.
    /// we don't hold an invariant that the references aren't dangling.
    /// Existence of index-part.json is the actual indicator of timeline existence.
#[serde(default)]
pub offloaded_timelines: Vec<OffloadedTimelineManifest>,
}

/// The remote level representation of an offloaded timeline.
///
/// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`],
/// but the two datastructures serve different needs, this is for a persistent disk format
/// that must be backwards compatible, while the other is only for informative purposes.
#[derive(Clone, Debug, Serialize, Deserialize, Copy, PartialEq, Eq)]
pub struct OffloadedTimelineManifest {
    pub timeline_id: TimelineId,
    /// Whether the timeline has a parent it has been branched off from or not
    pub ancestor_timeline_id: Option<TimelineId>,
    /// Whether to retain the branch lsn at the ancestor or not
    pub ancestor_retain_lsn: Option<Lsn>,
    /// The time point when the timeline was archived
    pub archived_at: NaiveDateTime,
}

/// The newest manifest version. This should be incremented on changes, even non-breaking ones. We
/// do not use deny_unknown_fields, so new fields are not breaking.
///
/// 1: initial version
/// 2: +stripe_size
///
/// When adding new versions, also add a parse_vX test case below.
pub const LATEST_TENANT_MANIFEST_VERSION: usize = 2;

impl TenantManifest {
    /// Returns true if the manifests are equal, ignoring the version number. This avoids
    /// re-uploading all manifests just because the version number is bumped.
    pub fn eq_ignoring_version(&self, other: &Self) -> bool {
        // Fast path: if the version is equal, just compare directly.
        if self.version == other.version {
            return self == other;
        }

        // We could alternatively just clone and modify the version here.
        // The exhaustive destructuring makes this fail to compile when a field is added,
        // so new fields cannot be silently left out of the comparison.
        let Self {
            version: _, // ignore version
            stripe_size,
            offloaded_timelines,
        } = self;

        stripe_size == &other.stripe_size && offloaded_timelines == &other.offloaded_timelines
    }

    /// Decodes a manifest from JSON.
    ///
    /// Returns the `serde_json` error if `bytes` is not a valid manifest document.
    pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
        serde_json::from_slice(bytes)
    }

    /// Encodes a manifest as JSON.
    pub fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
        serde_json::to_vec(self)
    }
}

#[cfg(test)]
mod tests {
    use std::str::FromStr;

    use utils::id::TimelineId;

    use super::*;

    /// Empty manifests should be parsed. Version is required.
    #[test]
    fn parse_empty() -> anyhow::Result<()> {
        let json = r#"{
            "version": 0
        }"#;
        let expected = TenantManifest {
            version: 0,
            stripe_size: None,
            offloaded_timelines: Vec::new(),
        };
        assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
        Ok(())
    }

    /// Unknown fields should be ignored, for forwards compatibility.
    #[test]
    fn parse_unknown_fields() -> anyhow::Result<()> {
        let json = r#"{
            "version": 1,
            "foo": "bar"
        }"#;
        let expected = TenantManifest {
            version: 1,
            stripe_size: None,
            offloaded_timelines: Vec::new(),
        };
        assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
        Ok(())
    }

    /// v1 manifests should be parsed, for backwards compatibility.
    #[test]
    fn parse_v1() -> anyhow::Result<()> {
        let json = r#"{
            "version": 1,
            "offloaded_timelines": [
                {
                    "timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
                    "archived_at": "2025-03-07T11:07:11.373105434"
                },
                {
                    "timeline_id": "f3def5823ad7080d2ea538d8e12163fa",
                    "ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
                    "ancestor_retain_lsn": "0/1F79038",
                    "archived_at": "2025-03-05T11:10:22.257901390"
                }
            ]
        }"#;
        let expected = TenantManifest {
            version: 1,
            stripe_size: None,
            offloaded_timelines: vec![
                OffloadedTimelineManifest {
                    timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
                    ancestor_timeline_id: None,
                    ancestor_retain_lsn: None,
                    archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?,
                },
                OffloadedTimelineManifest {
                    timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?,
                    ancestor_timeline_id: Some(TimelineId::from_str(
                        "5c4df612fd159e63c1b7853fe94d97da",
                    )?),
                    ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?),
                    archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?,
                },
            ],
        };
        assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
        Ok(())
    }

    /// v2 manifests should be parsed, for backwards compatibility.
    #[test]
    fn parse_v2() -> anyhow::Result<()> {
        let json = r#"{
            "version": 2,
            "stripe_size": 32768,
            "offloaded_timelines": [
                {
                    "timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
                    "archived_at": "2025-03-07T11:07:11.373105434"
                },
                {
                    "timeline_id": "f3def5823ad7080d2ea538d8e12163fa",
                    "ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
                    "ancestor_retain_lsn": "0/1F79038",
                    "archived_at": "2025-03-05T11:10:22.257901390"
                }
            ]
        }"#;
        let expected = TenantManifest {
            version: 2,
            stripe_size: Some(ShardStripeSize(32768)),
            offloaded_timelines: vec![
                OffloadedTimelineManifest {
                    timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
                    ancestor_timeline_id: None,
                    ancestor_retain_lsn: None,
                    archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?,
                },
                OffloadedTimelineManifest {
                    timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?,
                    ancestor_timeline_id: Some(TimelineId::from_str(
                        "5c4df612fd159e63c1b7853fe94d97da",
                    )?),
                    ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?),
                    archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?,
                },
            ],
        };
        assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
        Ok(())
    }
}

================================================
FILE: pageserver/src/tenant/remote_timeline_client/upload.rs
================================================
//!
Helper functions to upload files to remote storage with a RemoteStorage use std::io::{ErrorKind, SeekFrom}; use std::num::NonZeroU32; use std::time::SystemTime; use anyhow::{Context, bail}; use bytes::Bytes; use camino::Utf8Path; use fail::fail_point; use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; use tokio::fs::{self, File}; use tokio::io::AsyncSeekExt; use tokio_util::sync::CancellationToken; use tracing::info; use utils::id::{TenantId, TimelineId}; use utils::{backoff, pausable_failpoint}; use super::Generation; use super::index::IndexPart; use super::manifest::TenantManifest; use crate::tenant::remote_timeline_client::{ remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path, remote_tenant_manifest_path, }; /// Serializes and uploads the given index part data to the remote storage. pub(crate) async fn upload_index_part( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, generation: Generation, index_part: &IndexPart, cancel: &CancellationToken, ) -> anyhow::Result<()> { tracing::trace!("uploading new index part"); fail_point!("before-upload-index", |_| { bail!("failpoint before-upload-index") }); pausable_failpoint!("before-upload-index-pausable"); // Safety: refuse to persist invalid index metadata, to mitigate the impact of any bug that produces this // (this should never happen) index_part.validate().map_err(|e| anyhow::anyhow!(e))?; // FIXME: this error comes too late let serialized = index_part.to_json_bytes()?; let serialized = Bytes::from(serialized); let index_part_size = serialized.len(); let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation); storage .upload_storage_object( futures::stream::once(futures::future::ready(Ok(serialized))), index_part_size, &remote_path, cancel, ) .await .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) } /// Serializes and 
uploads the given tenant manifest data to the remote storage. pub(crate) async fn upload_tenant_manifest( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, generation: Generation, tenant_manifest: &TenantManifest, cancel: &CancellationToken, ) -> anyhow::Result<()> { tracing::trace!("uploading new tenant manifest"); fail_point!("before-upload-manifest", |_| { bail!("failpoint before-upload-manifest") }); pausable_failpoint!("before-upload-manifest-pausable"); let serialized = Bytes::from(tenant_manifest.to_json_bytes()?); let tenant_manifest_size = serialized.len(); let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation); storage .upload_storage_object( futures::stream::once(futures::future::ready(Ok(serialized))), tenant_manifest_size, &remote_path, cancel, ) .await .with_context(|| format!("upload tenant manifest for '{tenant_shard_id}'")) } /// Attempts to upload given layer files. /// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload. /// /// On an error, bumps the retries count and reschedules the entire task. pub(super) async fn upload_timeline_layer<'a>( storage: &'a GenericRemoteStorage, local_path: &'a Utf8Path, remote_path: &'a RemotePath, metadata_size: u64, cancel: &CancellationToken, ) -> anyhow::Result<()> { fail_point!("before-upload-layer", |_| { bail!("failpoint before-upload-layer") }); pausable_failpoint!("before-upload-layer-pausable"); let source_file_res = fs::File::open(&local_path).await; let source_file = match source_file_res { Ok(source_file) => source_file, Err(e) if e.kind() == ErrorKind::NotFound => { // If we encounter this arm, it wasn't intended, but it's also not // a big problem, if it's because the file was deleted before an // upload. However, a nonexistent file can also be indicative of // something worse, like when a file is scheduled for upload before // it has been written to disk yet. 
// // This is tested against `test_compaction_delete_before_upload` info!(path = %local_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."); return Ok(()); } Err(e) => Err(e).with_context(|| format!("open a source file for layer {local_path:?}"))?, }; let fs_size = source_file .metadata() .await .with_context(|| format!("get the source file metadata for layer {local_path:?}"))? .len(); if metadata_size != fs_size { bail!( "File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}" ); } let fs_size = usize::try_from(fs_size) .with_context(|| format!("convert {local_path:?} size {fs_size} usize"))?; /* BEGIN_HADRON */ let mut metadata = None; match storage { // Pass the file path as a storage metadata to minimize changes to neon. // Otherwise, we need to change the upload interface. GenericRemoteStorage::AzureBlob(s) => { let block_size_mb = s.put_block_size_mb.unwrap_or(0); if block_size_mb > 0 && fs_size > block_size_mb * 1024 * 1024 { metadata = Some(remote_storage::StorageMetadata::from([( "databricks_azure_put_block", local_path.as_str(), )])); } } GenericRemoteStorage::LocalFs(_) => {} GenericRemoteStorage::AwsS3(_) => {} GenericRemoteStorage::Unreliable(_) => {} GenericRemoteStorage::GCS(_) => {} }; /* END_HADRON */ let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE); storage .upload(reader, fs_size, remote_path, metadata, cancel) .await .with_context(|| format!("upload layer from local path '{local_path}'")) } pub(super) async fn copy_timeline_layer( storage: &GenericRemoteStorage, source_path: &RemotePath, target_path: &RemotePath, cancel: &CancellationToken, ) -> anyhow::Result<()> { fail_point!("before-copy-layer", |_| { bail!("failpoint before-copy-layer") }); pausable_failpoint!("before-copy-layer-pausable"); storage .copy_object(source_path, target_path, cancel) .await .with_context(|| format!("copy layer 
{source_path} to {target_path}")) } /// Uploads the given `initdb` data to the remote storage. pub(crate) async fn upload_initdb_dir( storage: &GenericRemoteStorage, tenant_id: &TenantId, timeline_id: &TimelineId, mut initdb_tar_zst: File, size: u64, cancel: &CancellationToken, ) -> anyhow::Result<()> { tracing::trace!("uploading initdb dir"); // We might have read somewhat into the file already in the prior retry attempt initdb_tar_zst.seek(SeekFrom::Start(0)).await?; let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE); let remote_path = remote_initdb_archive_path(tenant_id, timeline_id); storage .upload_storage_object(file, size as usize, &remote_path, cancel) .await .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'")) } pub(crate) async fn preserve_initdb_archive( storage: &GenericRemoteStorage, tenant_id: &TenantId, timeline_id: &TimelineId, cancel: &CancellationToken, ) -> anyhow::Result<()> { let source_path = remote_initdb_archive_path(tenant_id, timeline_id); let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id); storage .copy_object(&source_path, &dest_path, cancel) .await .with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'")) } pub(crate) async fn time_travel_recover_tenant( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, timestamp: SystemTime, done_if_after: SystemTime, cancel: &CancellationToken, ) -> Result<(), TimeTravelError> { let warn_after = 3; let max_attempts = 10; let mut prefixes = Vec::with_capacity(2); if tenant_shard_id.is_shard_zero() { // Also recover the unsharded prefix for a shard of zero: // - if the tenant is totally unsharded, the unsharded prefix contains all the data // - if the tenant is sharded, we still want to recover the initdb data, but we only // want to do it once, so let's do it on the 0 shard let timelines_path_unsharded = 
super::remote_timelines_path_unsharded(&tenant_shard_id.tenant_id); prefixes.push(timelines_path_unsharded); } if !tenant_shard_id.is_unsharded() { // If the tenant is sharded, we need to recover the sharded prefix let timelines_path = super::remote_timelines_path(tenant_shard_id); prefixes.push(timelines_path); } // Limit the number of versions deletions, mostly so that we don't // keep requesting forever if the list is too long, as we'd put the // list in RAM. // Building a list of 100k entries that reaches the limit roughly takes // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size. const COMPLEXITY_LIMIT: Option = NonZeroU32::new(100_000); for prefix in &prefixes { backoff::retry( || async { storage .time_travel_recover( Some(prefix), timestamp, done_if_after, cancel, COMPLEXITY_LIMIT, ) .await }, |e| !matches!(e, TimeTravelError::Other(_)), warn_after, max_attempts, "time travel recovery of tenant prefix", cancel, ) .await .ok_or_else(|| TimeTravelError::Cancelled) .and_then(|x| x)?; } Ok(()) } ================================================ FILE: pageserver/src/tenant/remote_timeline_client.rs ================================================ //! This module manages synchronizing local FS with remote storage. //! //! # Overview //! //! * [`RemoteTimelineClient`] provides functions related to upload/download of a particular timeline. //! It contains a queue of pending uploads, and manages the queue, performing uploads in parallel //! when it's safe to do so. //! //! * Stand-alone function, [`list_remote_timelines`], to get list of timelines of a tenant. //! //! These functions use the low-level remote storage client, [`remote_storage::RemoteStorage`]. //! //! # APIs & How To Use Them //! //! There is a [RemoteTimelineClient] for each [Timeline][`crate::tenant::Timeline`] in the system, //! unless the pageserver is configured without remote storage. //! //! We allocate the client instance in [Timeline][`crate::tenant::Timeline`], i.e., //! 
either in [`crate::tenant::mgr`] during startup or when creating a new //! timeline. //! However, the client does not become ready for use until we've initialized its upload queue: //! //! - For timelines that already have some state on the remote storage, we use //! [`RemoteTimelineClient::init_upload_queue`]. //! - For newly created timelines, we use //! [`RemoteTimelineClient::init_upload_queue_for_empty_remote`]. //! //! The former takes the remote's [`IndexPart`] as an argument, possibly retrieved //! using [`list_remote_timelines`]. We'll elaborate on [`IndexPart`] in the next section. //! //! Whenever we've created/updated/deleted a file in a timeline directory, we schedule //! the corresponding remote operation with the timeline's [`RemoteTimelineClient`]: //! //! - [`RemoteTimelineClient::schedule_layer_file_upload`] when we've created a new layer file. //! - [`RemoteTimelineClient::schedule_index_upload_for_metadata_update`] when we've updated the timeline metadata file. //! - [`RemoteTimelineClient::schedule_index_upload_for_file_changes`] to upload an updated index file, after we've scheduled file uploads //! - [`RemoteTimelineClient::schedule_layer_file_deletion`] when we've deleted one or more layer files. //! //! Internally, these functions create [`UploadOp`]s and put them in a queue. //! //! There are also APIs for downloading files. //! These are not part of the aforementioned queuing and will not be discussed //! further here, except in the section covering tenant attach. //! //! # Remote Storage Structure & [`IndexPart`] Index File //! //! The "directory structure" in the remote storage mirrors the local directory structure, with paths //! like `tenants/<tenant_id>/timelines/<timeline_id>/<layer filename>`. //! Yet instead of keeping the `metadata` file remotely, we wrap it with more //! data in an "index file" aka [`IndexPart`], containing the list of **all** remote //! files for a given timeline. //!
If a file is not referenced from [`IndexPart`], it's not part of the remote storage state. //! //! Having the `IndexPart` also avoids expensive and slow `S3 list` commands. //! //! # Consistency //! //! To have a consistent remote structure, it's important that uploads and //! deletions are performed in the right order. For example, the index file //! contains a list of layer files, so it must not be uploaded until all the //! layer files that are in its list have been successfully uploaded. //! //! The contract between client and its user is that the user is responsible of //! scheduling operations in an order that keeps the remote consistent as //! described above. //! //! From the user's perspective, the operations are executed sequentially. //! Internally, the client knows which operations can be performed in parallel, //! and which operations act like a "barrier" that require preceding operations //! to finish. The calling code just needs to call the schedule-functions in the //! correct order, and the client will parallelize the operations in a way that //! is safe. For more details, see `UploadOp::can_bypass`. //! //! All of this relies on the following invariants: //! //! - We rely on read-after write consistency in the remote storage. //! - Layer files are immutable. //! //! NB: Pageserver assumes that it has exclusive write access to the tenant in remote //! storage. Different tenants can be attached to different pageservers, but if the //! same tenant is attached to two pageservers at the same time, they will overwrite //! each other's index file updates, and confusion will ensue. There's no interlock or //! mechanism to detect that in the pageserver, we rely on the control plane to ensure //! that that doesn't happen. //! //! ## Implementation Note //! //! The *actual* remote state lags behind the *desired* remote state while //! there are in-flight operations. //! We keep track of the desired remote state in [`UploadQueueInitialized::dirty`]. //! 
It is initialized based on the [`IndexPart`] that was passed during init //! and updated with every `schedule_*` function call. //! All this is necessary to compute the future [`IndexPart`]s //! when scheduling an operation while other operations that also affect the //! remote [`IndexPart`] are in flight. //! //! # Retries & Error Handling //! //! The client retries operations indefinitely, using exponential back-off. //! There is no way to force a retry, i.e., interrupt the back-off. //! This could be built easily. //! //! # Cancellation //! //! The operations execute as plain [`task_mgr`] tasks, scoped to //! the client's tenant and timeline. //! Dropping the client will drop queued operations but not executing operations. //! These will complete unless the `task_mgr` tasks are cancelled using `task_mgr` //! APIs, e.g., during pageserver shutdown, timeline delete, or tenant detach. //! //! # Completion //! //! Once an operation has completed, we update [`UploadQueueInitialized::clean`] immediately, //! and submit a request through the DeletionQueue to update //! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has //! validated that our generation is not stale. It is this visible value //! that is advertised to safekeepers as a signal that they can //! delete the WAL up to that LSN. //! //! The [`RemoteTimelineClient::wait_completion`] method can be used to wait //! for all pending operations to complete. It does not prevent more //! operations from getting scheduled. //! //! # Crash Consistency //! //! We do not persist the upload queue state. //! If we drop the client, or crash, all unfinished operations are lost. //! //! To recover, the following steps need to be taken: //! - Retrieve the current remote [`IndexPart`]. This gives us a //! consistent remote state, assuming the user scheduled the operations in //! the correct order. //! - Initiate upload queue with that [`IndexPart`]. //!
- Reschedule all lost operations by comparing the local filesystem state //! and remote state as per [`IndexPart`]. This is done in //! [`TenantShard::timeline_init_and_sync`]. //! //! Note that if we crash during file deletion between the index update //! that removes the file from the list of files, and deleting the remote file, //! the file is leaked in the remote storage. Similarly, if a new file is created //! and uploaded, but the pageserver dies permanently before updating the //! remote index file, the new file is leaked in remote storage. We accept and //! tolerate that for now. //! Note further that we cannot easily fix this by scheduling deletes for every //! file that is present only on the remote, because we cannot distinguish the //! following two cases: //! - (1) We had the file locally, deleted it locally, scheduled a remote delete, //! but crashed before it finished remotely. //! - (2) We never had the file locally because we haven't on-demand downloaded //! it yet. //! //! # Downloads //! //! In addition to the upload queue, [`RemoteTimelineClient`] has functions for //! downloading files from the remote storage. Downloads are performed immediately //! against the `RemoteStorage`, independently of the upload queue. //! //! When we attach a tenant, we perform the following steps: //! - create `Tenant` object in `TenantState::Attaching` state //! - List timelines that are present in remote storage, and for each: //! - download their remote [`IndexPart`]s //! - create `Timeline` struct and a `RemoteTimelineClient` //! - initialize the client's upload queue with its `IndexPart` //! - schedule uploads for layers that are only present locally. //! - After the above is done for each timeline, open the tenant for business by //! transitioning it from `TenantState::Attaching` to `TenantState::Active` state. //! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops. //! //! # Operating Without Remote Storage //! //! 
If no remote storage configuration is provided, the [`RemoteTimelineClient`] is //! not created and the uploads are skipped. //! //! [`TenantShard::timeline_init_and_sync`]: super::TenantShard::timeline_init_and_sync //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map pub(crate) mod download; pub mod index; pub mod manifest; pub(crate) mod upload; use std::collections::{HashMap, HashSet, VecDeque}; use std::ops::DerefMut; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex, OnceLock}; use std::time::Duration; use anyhow::Context; use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; pub(crate) use download::{ download_index_part, download_initdb_tar_zst, download_tenant_manifest, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines, }; use index::GcCompactionState; pub(crate) use index::LayerFileMetadata; use pageserver_api::models::{RelSizeMigration, TimelineArchivalState, TimelineVisibilityState}; use pageserver_api::shard::{ShardIndex, TenantShardId}; use regex::Regex; use remote_storage::{ DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel, }; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; pub(crate) use upload::upload_initdb_dir; use utils::backoff::{ self, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff, }; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use utils::pausable_failpoint; use utils::shard::ShardNumber; use self::index::IndexPart; use super::config::AttachedLocationConfig; use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerName, ResidentLayer}; use super::timeline::import_pgdata; use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::{DeleteTimelineError, Generation}; use crate::config::PageServerConf; use crate::context::RequestContext; use 
crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
use crate::metrics::{
    MeasureRemoteOp, REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
    RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
    RemoteTimelineClientMetricsCallTrackSize,
};
use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind, shutdown_token};
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::remote_timeline_client::download::download_retry;
use crate::tenant::storage_layer::AsLayerDesc;
use crate::tenant::upload_queue::{
    Delete, OpType, UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped,
    UploadQueueStoppedDeletable, UploadTask,
};
use crate::tenant::{TIMELINES_SEGMENT_NAME, debug_assert_current_span_has_tenant_and_timeline_id};
use crate::{TENANT_HEATMAP_BASENAME, task_mgr};

// Occasional network issues and such can cause remote operations to fail, and
// that's expected. If a download fails, we log it at info-level, and retry.
// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
// level instead, as repeated failures can mean a more serious problem. If it
// fails more than FAILED_REMOTE_OP_RETRIES times, we give up.
pub(crate) const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10;

// Similarly log failed uploads and deletions at WARN level, after this many
// retries. Uploads and deletions are retried forever, though.
pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

// Object names of the initdb archive and of its preserved copy.
pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";

pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";

/// Default buffer size when interfacing with [`tokio::fs::File`].
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;

/// Doing non-essential flushes of deletion queue is subject to this timeout, after
/// which we warn and skip.
const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10);

/// Result of downloading a timeline's index: the same `IndexPart` payload,
/// tagged by whether its `deleted_at` field was set.
pub enum MaybeDeletedIndexPart {
    /// The index has no `deleted_at` marker.
    IndexPart(IndexPart),
    /// The index carries a `deleted_at` marker.
    Deleted(IndexPart),
}

/// Errors from persisting the index part with the deleted flag set.
#[derive(Debug, thiserror::Error)]
pub enum PersistIndexPartWithDeletedFlagError {
    #[error("another task is already setting the deleted_flag, started at {0:?}")]
    AlreadyInProgress(NaiveDateTime),
    #[error("the deleted_flag was already set, value is {0:?}")]
    AlreadyDeleted(NaiveDateTime),
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

/// Errors from waiting for queued upload operations to complete.
#[derive(Debug, thiserror::Error)]
pub enum WaitCompletionError {
    #[error(transparent)]
    NotInitialized(NotInitialized),
    #[error("wait_completion aborted because upload queue was stopped")]
    UploadQueueShutDownOrStopped,
}

/// Returned when the upload queue is not in the state required by the caller.
#[derive(Debug, thiserror::Error)]
#[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")]
pub struct UploadQueueNotReadyError;

/// Errors from `RemoteTimelineClient::shutdown_if_archived`.
#[derive(Debug, thiserror::Error)]
pub enum ShutdownIfArchivedError {
    #[error(transparent)]
    NotInitialized(NotInitialized),
    #[error("timeline is not archived")]
    NotArchived,
}

/// Behavioral modes that enable seamless live migration.
///
/// See docs/rfcs/028-pageserver-migration.md to understand how these fit in.
struct RemoteTimelineClientConfig {
    /// If this is false, then updates to remote_consistent_lsn are dropped rather
    /// than being submitted to DeletionQueue for validation. This behavior is
    /// used when a tenant attachment is known to have a stale generation number,
    /// such that validation attempts will always fail. This is not necessary
    /// for correctness, but avoids spamming error statistics with failed validations
    /// when doing migrations of tenants.
    process_remote_consistent_lsn_updates: bool,

    /// If this is true, then object deletions are held in a buffer in RemoteTimelineClient
    /// rather than being submitted to the DeletionQueue. This behavior is used when a tenant
    /// is known to be multi-attached, in order to avoid disrupting other attached tenants
    /// whose generations' metadata refers to the deleted objects.
    block_deletions: bool,
}

/// RemoteTimelineClientConfig's state is entirely driven by LocationConf, but we do
/// not carry the entire LocationConf structure: it's much more than we need. The From
/// impl extracts the subset of the LocationConf that is interesting to RemoteTimelineClient.
impl From<&AttachedLocationConfig> for RemoteTimelineClientConfig {
    fn from(lc: &AttachedLocationConfig) -> Self {
        Self {
            // Deletions are blocked exactly when the location may not delete layers.
            block_deletions: !lc.may_delete_layers_hint(),
            process_remote_consistent_lsn_updates: lc.may_upload_layers_hint(),
        }
    }
}

/// A client for accessing a timeline's data in remote storage.
///
/// This takes care of managing the number of connections, and balancing them
/// across tenants. This also handles retries of failed uploads.
///
/// Upload and delete requests are ordered so that before a deletion is
/// performed, we wait for all preceding uploads to finish. This ensures
/// that if you perform a compaction operation that reshuffles data in layer
/// files, we don't have a transient state where the old files have already been
/// deleted, but new files have not yet been uploaded.
///
/// Similarly, this enforces an order between index-file uploads, and layer
/// uploads. Before an index-file upload is performed, all preceding layer
/// uploads must be finished.
///
/// This also maintains a list of remote files, and automatically includes that
/// in the index part file, whenever timeline metadata is uploaded.
///
/// Downloads are not queued, they are performed immediately.
pub(crate) struct RemoteTimelineClient { conf: &'static PageServerConf, runtime: tokio::runtime::Handle, tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, upload_queue: Mutex, pub(crate) metrics: Arc, storage_impl: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, /// Subset of tenant configuration used to control upload behaviors during migrations config: std::sync::RwLock, cancel: CancellationToken, } impl Drop for RemoteTimelineClient { fn drop(&mut self) { debug!("dropping RemoteTimelineClient"); } } impl RemoteTimelineClient { /// /// Create a remote storage client for given timeline /// /// Note: the caller must initialize the upload queue before any uploads can be scheduled, /// by calling init_upload_queue. /// pub(crate) fn new( remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, conf: &'static PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, location_conf: &AttachedLocationConfig, ) -> RemoteTimelineClient { RemoteTimelineClient { conf, runtime: if cfg!(test) { // remote_timeline_client.rs tests rely on current-thread runtime tokio::runtime::Handle::current() } else { BACKGROUND_RUNTIME.handle().clone() }, tenant_shard_id, timeline_id, generation, storage_impl: remote_storage, deletion_queue_client, upload_queue: Mutex::new(UploadQueue::Uninitialized), metrics: Arc::new(RemoteTimelineClientMetrics::new( &tenant_shard_id, &timeline_id, )), config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(location_conf)), cancel: CancellationToken::new(), } } /// Initialize the upload queue for a remote storage that already received /// an index file upload, i.e., it's not empty. /// The given `index_part` must be the one on the remote. pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> { // Set the maximum number of inprogress tasks to the remote storage concurrency. 
There's // certainly no point in starting more upload tasks than this. let inprogress_limit = self .conf .remote_storage_config .as_ref() .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; self.update_remote_physical_size_gauge(Some(index_part)); info!( "initialized upload queue from remote index with {} layer files", index_part.layer_metadata.len() ); Ok(()) } /// Initialize the upload queue for the case where the remote storage is empty, /// i.e., it doesn't have an `IndexPart`. /// /// `rel_size_v2_status` needs to be carried over during branching, and that's why /// it's passed in here. pub fn init_upload_queue_for_empty_remote( &self, local_metadata: &TimelineMetadata, rel_size_v2_migration: Option, rel_size_migrated_at: Option, ) -> anyhow::Result<()> { // Set the maximum number of inprogress tasks to the remote storage concurrency. There's // certainly no point in starting more upload tasks than this. let inprogress_limit = self .conf .remote_storage_config .as_ref() .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); let initialized_queue = upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?; initialized_queue.dirty.rel_size_migration = rel_size_v2_migration; initialized_queue.dirty.rel_size_migrated_at = rel_size_migrated_at; self.update_remote_physical_size_gauge(None); info!("initialized upload queue as empty"); Ok(()) } /// Initialize the queue in stopped state. Used in startup path /// to continue deletion operation interrupted by pageserver crash or restart. pub fn init_upload_queue_stopped_to_continue_deletion( &self, index_part: &IndexPart, ) -> anyhow::Result<()> { // FIXME: consider newtype for DeletedIndexPart. 
let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!( "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted" ))?; let inprogress_limit = self .conf .remote_storage_config .as_ref() .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; self.update_remote_physical_size_gauge(Some(index_part)); self.stop_impl(&mut upload_queue); upload_queue .stopped_mut() .expect("stopped above") .deleted_at = SetDeletedFlagProgress::Successful(deleted_at); Ok(()) } /// Notify this client of a change to its parent tenant's config, as this may cause us to /// take action (unblocking deletions when transitioning from AttachedMulti to AttachedSingle) pub(super) fn update_config(&self, location_conf: &AttachedLocationConfig) { let new_conf = RemoteTimelineClientConfig::from(location_conf); let unblocked = !new_conf.block_deletions; // Update config before draining deletions, so that we don't race with more being // inserted. This can result in deletions happening our of order, but that does not // violate any invariants: deletions only need to be ordered relative to upload of the index // that dereferences the deleted objects, and we are not changing that order. *self.config.write().unwrap() = new_conf; if unblocked { // If we may now delete layers, drain any that were blocked in our old // configuration state let mut queue_locked = self.upload_queue.lock().unwrap(); if let Ok(queue) = queue_locked.initialized_mut() { let blocked_deletions = std::mem::take(&mut queue.blocked_deletions); for d in blocked_deletions { if let Err(e) = self.deletion_queue_client.push_layers( self.tenant_shard_id, self.timeline_id, self.generation, d.layers, ) { // This could happen if the pageserver is shut down while a tenant // is transitioning from a deletion-blocked state: we will leak some // S3 objects in this case. 
warn!("Failed to drain blocked deletions: {}", e); break; } } } } } /// Returns `None` if nothing is yet uplodaded, `Some(disk_consistent_lsn)` otherwise. pub fn remote_consistent_lsn_projected(&self) -> Option { match &mut *self.upload_queue.lock().unwrap() { UploadQueue::Uninitialized => None, UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(), UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None, UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => q .upload_queue_for_deletion .get_last_remote_consistent_lsn_projected(), } } pub fn remote_consistent_lsn_visible(&self) -> Option { match &mut *self.upload_queue.lock().unwrap() { UploadQueue::Uninitialized => None, UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()), UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None, UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => Some( q.upload_queue_for_deletion .get_last_remote_consistent_lsn_visible(), ), } } /// Returns true if this timeline was previously detached at this Lsn and the remote timeline /// client is currently initialized. pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { self.upload_queue .lock() .unwrap() .initialized_mut() .map(|uq| uq.clean.0.lineage.is_previous_ancestor_lsn(lsn)) .unwrap_or(false) } /// Returns whether the timeline is archived. /// Return None if the remote index_part hasn't been downloaded yet. pub(crate) fn is_archived(&self) -> Option { self.upload_queue .lock() .unwrap() .initialized_mut() .map(|q| q.clean.0.archived_at.is_some()) .ok() } /// Returns true if the timeline is invisible in synthetic size calculations. pub(crate) fn is_invisible(&self) -> Option { self.upload_queue .lock() .unwrap() .initialized_mut() .map(|q| q.clean.0.marked_invisible_at.is_some()) .ok() } /// Returns `Ok(Some(timestamp))` if the timeline has been archived, `Ok(None)` if the timeline hasn't been archived. 
///
/// Return Err(_) if the remote index_part hasn't been downloaded yet, or the timeline hasn't been stopped yet.
pub(crate) fn archived_at_stopped_queue(
    &self,
) -> Result, UploadQueueNotReadyError> {
    self.upload_queue
        .lock()
        .unwrap()
        .stopped_mut()
        .map(|q| q.upload_queue_for_deletion.clean.0.archived_at)
        .map_err(|_| UploadQueueNotReadyError)
}

// Sets the remote physical size gauge to the sum of `file_size` over all layers
// listed in the given index part, or to 0 when no index part is given.
fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
    let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
        current_remote_index_part
            .layer_metadata
            .values()
            .map(|ilmd| ilmd.file_size)
            .sum()
    } else {
        0
    };
    self.metrics.remote_physical_size_gauge.set(size);
}

// Returns the current value of the remote physical size gauge.
pub fn get_remote_physical_size(&self) -> u64 {
    self.metrics.remote_physical_size_gauge.get()
}

//
// Download operations.
//
// These don't use the per-timeline queue. They do use the global semaphore in
// S3Bucket, to limit the total number of concurrent operations, though.
//

/// Download index file
///
/// Downloads the index for this client's generation. If that index is older than
/// `INDEX_AGE_CHECKS_THRESHOLD`, a second lookup with `Generation::MAX` is made and
/// the call fails with `DownloadError::Fatal` when a higher-generation index exists.
/// On success the index is wrapped in `MaybeDeletedIndexPart::Deleted` if its
/// `deleted_at` is set, otherwise in `MaybeDeletedIndexPart::IndexPart`.
pub async fn download_index_file(
    &self,
    cancel: &CancellationToken,
) -> Result {
    let _unfinished_gauge_guard = self.metrics.call_begin(
        &RemoteOpFileKind::Index,
        &RemoteOpKind::Download,
        crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
            reason: "no need for a downloads gauge",
        },
    );

    let (index_part, index_generation, index_last_modified) = download::download_index_part(
        &self.storage_impl,
        &self.tenant_shard_id,
        &self.timeline_id,
        self.generation,
        cancel,
    )
    .measure_remote_op(
        Option::::None,
        RemoteOpFileKind::Index,
        RemoteOpKind::Download,
        Arc::clone(&self.metrics),
    )
    .await?;

    // Defense in depth: monotonicity of generation numbers is an important correctness guarantee, so when we see a very
    // old index, we do extra checks in case this is the result of backward time-travel of the generation number (e.g.
    // in case of a bug in the service that issues generation numbers). Indices are allowed to be old, but we expect that
    // when we load an old index we are loading the _latest_ index: if we are asked to load an old index and there is
    // also a newer index available, that is surprising.
    const INDEX_AGE_CHECKS_THRESHOLD: Duration = Duration::from_secs(14 * 24 * 3600);
    // A modification time in the future is treated as age zero, so the extra check is skipped.
    let index_age = index_last_modified.elapsed().unwrap_or_else(|e| {
        if e.duration() > Duration::from_secs(5) {
            // We only warn if the S3 clock and our local clock are >5s out: because this is a low resolution
            // timestamp, it is common to be out by at least 1 second.
            tracing::warn!("Index has modification time in the future: {e}");
        }
        Duration::ZERO
    });
    if index_age > INDEX_AGE_CHECKS_THRESHOLD {
        tracing::info!(
            ?index_generation,
            age = index_age.as_secs_f64(),
            "Loaded an old index, checking for other indices..."
        );

        // Find the highest-generation index
        let (_latest_index_part, latest_index_generation, latest_index_mtime) =
            download::download_index_part(
                &self.storage_impl,
                &self.tenant_shard_id,
                &self.timeline_id,
                Generation::MAX,
                cancel,
            )
            .await?;

        if latest_index_generation > index_generation {
            // Unexpected! Why are we loading such an old index if a more recent one exists?
            // We will refuse to proceed, as there is no reasonable scenario where this should happen, but
            // there _is_ a clear bug/corruption scenario where it would happen (controller sets the generation
            // backwards).
            tracing::error!(
                ?index_generation,
                ?latest_index_generation,
                ?latest_index_mtime,
                "Found a newer index while loading an old one"
            );
            return Err(DownloadError::Fatal(
                "Index age exceeds threshold and a newer index exists".into(),
            ));
        }
    }

    if index_part.deleted_at.is_some() {
        Ok(MaybeDeletedIndexPart::Deleted(index_part))
    } else {
        Ok(MaybeDeletedIndexPart::IndexPart(index_part))
    }
}

/// Download a (layer) file from `path`, into local filesystem.
///
/// 'layer_metadata' is the metadata from the remote index file.
///
/// On success, returns the size of the downloaded file.
pub async fn download_layer_file( &self, layer_file_name: &LayerName, layer_metadata: &LayerFileMetadata, local_path: &Utf8Path, gate: &utils::sync::gate::Gate, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( &RemoteOpFileKind::Layer, &RemoteOpKind::Download, crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: "no need for a downloads gauge", }, ); download::download_layer_file( self.conf, &self.storage_impl, self.tenant_shard_id, self.timeline_id, layer_file_name, layer_metadata, local_path, gate, cancel, ctx, ) .measure_remote_op( Some(ctx.task_kind()), RemoteOpFileKind::Layer, RemoteOpKind::Download, Arc::clone(&self.metrics), ) .await? }; REMOTE_ONDEMAND_DOWNLOADED_LAYERS.inc(); REMOTE_ONDEMAND_DOWNLOADED_BYTES.inc_by(downloaded_size); Ok(downloaded_size) } // // Upload operations. // /// Launch an index-file upload operation in the background, with /// fully updated metadata. /// /// This should only be used to upload initial metadata to remote storage. /// /// The upload will be added to the queue immediately, but it /// won't be performed until all previously scheduled layer file /// upload operations have completed successfully. This is to /// ensure that when the index file claims that layers X, Y and Z /// exist in remote storage, they really do. To wait for the upload /// to complete, use `wait_completion`. /// /// If there were any changes to the list of files, i.e. if any /// layer file uploads were scheduled, since the last index file /// upload, those will be included too. pub fn schedule_index_upload_for_full_metadata_update( self: &Arc, metadata: &TimelineMetadata, ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; // As documented in the struct definition, it's ok for latest_metadata to be // ahead of what's _actually_ on the remote during index upload. 
upload_queue.dirty.metadata = metadata.clone(); self.schedule_index_upload(upload_queue); Ok(()) } /// Launch an index-file upload operation in the background, with only parts of the metadata /// updated. /// /// This is the regular way of updating metadata on layer flushes or Gc. /// /// Using this lighter update mechanism allows for reparenting and detaching without changes to /// `index_part.json`, while being more clear on what values update regularly. pub(crate) fn schedule_index_upload_for_metadata_update( self: &Arc, update: &MetadataUpdate, ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; upload_queue.dirty.metadata.apply(update); // Defense in depth: if we somehow generated invalid metadata, do not persist it. upload_queue .dirty .validate() .map_err(|e| anyhow::anyhow!(e))?; self.schedule_index_upload(upload_queue); Ok(()) } /// Launch an index-file upload operation in the background, with only the `archived_at` field updated. /// /// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded, /// so either if the change is already sitting in the queue, but not commited yet, or the change has not /// been in the queue yet. pub(crate) fn schedule_index_upload_for_timeline_archival_state( self: &Arc, state: TimelineArchivalState, ) -> anyhow::Result { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; /// Returns Some(_) if a change is needed, and Some(true) if it's a /// change needed to set archived_at. 
fn need_change( archived_at: &Option, state: TimelineArchivalState, ) -> Option { match (archived_at, state) { (Some(_), TimelineArchivalState::Archived) | (None, TimelineArchivalState::Unarchived) => { // Nothing to do tracing::info!("intended state matches present state"); None } (None, TimelineArchivalState::Archived) => Some(true), (Some(_), TimelineArchivalState::Unarchived) => Some(false), } } let need_upload_scheduled = need_change(&upload_queue.dirty.archived_at, state); if let Some(archived_at_set) = need_upload_scheduled { let intended_archived_at = archived_at_set.then(|| Utc::now().naive_utc()); upload_queue.dirty.archived_at = intended_archived_at; self.schedule_index_upload(upload_queue); } let need_wait = need_change(&upload_queue.clean.0.archived_at, state).is_some(); Ok(need_wait) } pub(crate) fn schedule_index_upload_for_timeline_invisible_state( self: &Arc, state: TimelineVisibilityState, ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; fn need_change( marked_invisible_at: &Option, state: TimelineVisibilityState, ) -> Option { match (marked_invisible_at, state) { (Some(_), TimelineVisibilityState::Invisible) => Some(false), (None, TimelineVisibilityState::Invisible) => Some(true), (Some(_), TimelineVisibilityState::Visible) => Some(false), (None, TimelineVisibilityState::Visible) => Some(true), } } let need_upload_scheduled = need_change(&upload_queue.dirty.marked_invisible_at, state); if let Some(marked_invisible_at_set) = need_upload_scheduled { let intended_marked_invisible_at = marked_invisible_at_set.then(|| Utc::now().naive_utc()); upload_queue.dirty.marked_invisible_at = intended_marked_invisible_at; self.schedule_index_upload(upload_queue); } Ok(()) } /// Shuts the timeline client down, but only if the timeline is archived. 
///
/// This function and [`Self::schedule_index_upload_for_timeline_archival_state`] use the
/// same lock to prevent races between unarchival and offloading: unarchival requires the
/// upload queue to be initialized, and leaves behind an upload queue where either dirty
/// or clean has archived_at of `None`. offloading leaves behind an uninitialized upload
/// queue.
///
/// Returns `ShutdownIfArchivedError::NotInitialized` if the queue is not initialized and
/// `ShutdownIfArchivedError::NotArchived` unless both the dirty and the clean index have
/// `archived_at` set.
pub(crate) async fn shutdown_if_archived(
    self: &Arc,
) -> Result<(), ShutdownIfArchivedError> {
    {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard
            .initialized_mut()
            .map_err(ShutdownIfArchivedError::NotInitialized)?;

        // Tuple is (dirty is NOT archived, clean is NOT archived).
        match (
            upload_queue.dirty.archived_at.is_none(),
            upload_queue.clean.0.archived_at.is_none(),
        ) {
            // The expected case: the timeline is archived and we don't want to unarchive
            (false, false) => {}
            (true, false) => {
                tracing::info!("can't shut down timeline: timeline slated for unarchival");
                return Err(ShutdownIfArchivedError::NotArchived);
            }
            // NOTE(review): the bound value is `dirty.archived_at.is_none()`, i.e. true when
            // dirty is NOT archived, although the log field is named `dirty_archived`.
            (dirty_archived, true) => {
                tracing::info!(%dirty_archived, "can't shut down timeline: timeline not archived in remote storage");
                return Err(ShutdownIfArchivedError::NotArchived);
            }
        }

        // Set the shutting_down flag while the guard from the archival check is held.
        // This prevents a race with unarchival, as initialized_mut will not return
        // an upload queue from this point.
        // Also launch the queued tasks like shutdown() does.
        if !upload_queue.shutting_down {
            upload_queue.shutting_down = true;
            upload_queue.queued_operations.push_back(UploadOp::Shutdown);
            // this operation is not counted similar to Barrier
            self.launch_queued_tasks(upload_queue);
        }
    }

    // The mutex guard is released at the end of the scope above, before awaiting.
    self.shutdown().await;

    Ok(())
}

/// Launch an index-file upload operation in the background, setting `import_pgdata` field.
pub(crate) fn schedule_index_upload_for_import_pgdata_state_update(
    self: &Arc,
    state: Option,
) -> anyhow::Result<()> {
    let mut guard = self.upload_queue.lock().unwrap();
    let upload_queue = guard.initialized_mut()?;
    // Unconditionally overwrite the field and schedule an upload, even if unchanged.
    upload_queue.dirty.import_pgdata = state;
    self.schedule_index_upload(upload_queue);
    Ok(())
}

/// If the `import_pgdata` field marks the timeline as having an import in progress,
/// launch an index-file upload operation that transitions it to done in the background
///
/// Does nothing (and schedules no upload) when the field is unset or already done.
pub(crate) fn schedule_index_upload_for_import_pgdata_finalize(
    self: &Arc,
) -> anyhow::Result<()> {
    use import_pgdata::index_part_format;

    let mut guard = self.upload_queue.lock().unwrap();
    let upload_queue = guard.initialized_mut()?;
    let to_update = match &upload_queue.dirty.import_pgdata {
        Some(import) if !import.is_done() => Some(import),
        Some(_) | None => None,
    };

    if let Some(old) = to_update {
        // Carry over the idempotency key and start time; stamp the finish time now.
        let new =
            index_part_format::Root::V1(index_part_format::V1::Done(index_part_format::Done {
                idempotency_key: old.idempotency_key().clone(),
                started_at: *old.started_at(),
                finished_at: chrono::Utc::now().naive_utc(),
            }));

        upload_queue.dirty.import_pgdata = Some(new);
        self.schedule_index_upload(upload_queue);
    }

    Ok(())
}

/// Launch an index-file upload operation in the background, setting `gc_compaction_state` field.
pub(crate) fn schedule_index_upload_for_gc_compaction_state_update(
    self: &Arc,
    gc_compaction_state: GcCompactionState,
) -> anyhow::Result<()> {
    let mut guard = self.upload_queue.lock().unwrap();
    let upload_queue = guard.initialized_mut()?;
    upload_queue.dirty.gc_compaction = Some(gc_compaction_state);
    self.schedule_index_upload(upload_queue);
    Ok(())
}

/// Launch an index-file upload operation in the background, setting `rel_size_v2_status` field.
pub(crate) fn schedule_index_upload_for_rel_size_v2_status_update(
    self: &Arc,
    rel_size_v2_status: RelSizeMigration,
    rel_size_migrated_at: Option,
) -> anyhow::Result<()> {
    // NOTE: despite the function name, no index upload is scheduled here at the
    // moment: only the in-memory (dirty) index is modified, see the TODO below.
    let mut guard = self.upload_queue.lock().unwrap();
    let upload_queue = guard.initialized_mut()?;
    upload_queue.dirty.rel_size_migration = Some(rel_size_v2_status);
    upload_queue.dirty.rel_size_migrated_at = rel_size_migrated_at;
    // TODO: allow this operation to bypass the validation check because we might upload the index part
    // with no layers but the flag updated. For now, we just modify the index part in memory and the next
    // upload will include the flag.
    // self.schedule_index_upload(upload_queue);
    Ok(())
}

///
/// Launch an index-file upload operation in the background, if necessary.
///
/// Use this function to schedule the update of the index file after
/// scheduling file uploads or deletions. If no file uploads or deletions
/// have been scheduled since the last index file upload, this does
/// nothing.
///
/// Like schedule_index_upload_for_metadata_update(), this merely adds
/// the upload to the upload queue and returns quickly.
pub fn schedule_index_upload_for_file_changes(self: &Arc) -> Result<(), NotInitialized> {
    let mut guard = self.upload_queue.lock().unwrap();
    let upload_queue = guard.initialized_mut()?;

    // The counter is reset to 0 by schedule_index_upload each time an upload is queued.
    if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
        self.schedule_index_upload(upload_queue);
    }

    Ok(())
}

/// Only used in the `patch_index_part` HTTP API to force trigger an index upload.
pub fn force_schedule_index_upload(self: &Arc) -> Result<(), NotInitialized> {
    let mut guard = self.upload_queue.lock().unwrap();
    let upload_queue = guard.initialized_mut()?;
    // Unlike schedule_index_upload_for_file_changes, this does not check for pending changes.
    self.schedule_index_upload(upload_queue);
    Ok(())
}

/// Launch an index-file upload operation in the background (internal function)
///
/// Queues a snapshot (clone) of the current dirty index as an `UploadOp::UploadMetadata`,
/// resets the changed-files counter and tries to launch queued tasks.
///
/// Panics if the dirty index cannot be serialized to JSON.
fn schedule_index_upload(self: &Arc, upload_queue: &mut UploadQueueInitialized) {
    let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn();
    // fix up the duplicated field
    upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn;

    // make sure it serializes before doing it in perform_upload_task so that it doesn't
    // look like a retryable error
    let void = std::io::sink();
    serde_json::to_writer(void, &upload_queue.dirty).expect("serialize index_part.json");

    let index_part = &upload_queue.dirty;

    info!(
        "scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
        index_part.layer_metadata.len(),
        upload_queue.latest_files_changes_since_metadata_upload_scheduled,
    );

    let op = UploadOp::UploadMetadata {
        uploaded: Box::new(index_part.clone()),
    };
    self.metric_begin(&op);
    upload_queue.queued_operations.push_back(op);
    upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;

    // Launch the task immediately, if possible
    self.launch_queued_tasks(upload_queue);
}

/// Reparent this timeline to a new parent.
///
/// A retryable step of timeline ancestor detach.
pub(crate) async fn schedule_reparenting_and_wait(
    self: &Arc,
    new_parent: &TimelineId,
) -> anyhow::Result<()> {
    // The mutex guard lives only inside this block, so it is released before the await below.
    let receiver = {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

        let Some(prev) = upload_queue.dirty.metadata.ancestor_timeline() else {
            return Err(anyhow::anyhow!(
                "cannot reparent without a current ancestor"
            ));
        };

        let uploaded = &upload_queue.clean.0.metadata;

        if uploaded.ancestor_timeline().is_none() && !uploaded.ancestor_lsn().is_valid() {
            // nothing to do
            None
        } else {
            upload_queue.dirty.metadata.reparent(new_parent);
            upload_queue.dirty.lineage.record_previous_ancestor(&prev);

            // Queue the index upload first, then a barrier behind it to wait on.
            self.schedule_index_upload(upload_queue);

            Some(self.schedule_barrier0(upload_queue))
        }
    };

    if let Some(receiver) = receiver {
        Self::wait_completion0(receiver).await?;
    }
    Ok(())
}

/// Schedules uploading a new version of `index_part.json` with the given layers added,
/// detaching from ancestor and waits for it to complete.
///
/// This is used with `Timeline::detach_ancestor` functionality.
///
/// Panics if one of `layers` is already present in the dirty index.
pub(crate) async fn schedule_adding_existing_layers_to_index_detach_and_wait(
    self: &Arc,
    layers: &[Layer],
    adopted: (TimelineId, Lsn),
) -> anyhow::Result<()> {
    let barrier = {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

        // Already recorded in the clean index: nothing to schedule, nothing to wait for.
        if upload_queue.clean.0.lineage.detached_previous_ancestor() == Some(adopted) {
            None
        } else {
            upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
            upload_queue.dirty.lineage.record_detaching(&adopted);

            for layer in layers {
                let prev = upload_queue
                    .dirty
                    .layer_metadata
                    .insert(layer.layer_desc().layer_name(), layer.metadata());
                assert!(prev.is_none(), "copied layer existed already {layer}");
            }

            self.schedule_index_upload(upload_queue);

            Some(self.schedule_barrier0(upload_queue))
        }
    };

    if let Some(barrier) = barrier {
        Self::wait_completion0(barrier).await?;
    }
    Ok(())
}

/// Adds a gc blocking reason for this timeline if one does not exist already.
///
/// A retryable step of timeline detach ancestor.
///
/// Returns a future which waits until the completion of the upload.
///
/// Panics if `reason` is `DetachAncestor` and the dirty metadata has no ancestor timeline;
/// the queue lock is released before panicking.
pub(crate) fn schedule_insert_gc_block_reason(
    self: &Arc<Self>,
    reason: index::GcBlockingReason,
) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
{
    let maybe_barrier = {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

        if let index::GcBlockingReason::DetachAncestor = reason {
            if upload_queue.dirty.metadata.ancestor_timeline().is_none() {
                drop(guard);
                panic!("cannot start detach ancestor if there is nothing to detach from");
            }
        }

        // "wanted" here means: the given gc_blocking value already contains `reason`.
        let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason));

        let current = upload_queue.dirty.gc_blocking.as_ref();
        let uploaded = upload_queue.clean.0.gc_blocking.as_ref();

        match (current, uploaded) {
            // Already set locally and in the last completed upload: nothing to wait for.
            (x, y) if wanted(x) && wanted(y) => None,
            // Already set locally but not in the last completed upload: only queue a barrier.
            (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
            // Usual case: !wanted(x) && !wanted(y)
            //
            // Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to
            // turn on and off some reason.
            (x, y) => {
                if !wanted(x) && wanted(y) {
                    // this could be avoided by having external in-memory synchronization, like
                    // timeline detach ancestor
                    warn!(
                        ?reason,
                        op = "insert",
                        "unexpected: two racing processes to enable and disable a gc blocking reason"
                    );
                }

                // at this point, the metadata must always show that there is a parent
                upload_queue.dirty.gc_blocking = current
                    .map(|x| x.with_reason(reason))
                    .or_else(|| Some(index::GcBlocking::started_now_for(reason)));
                self.schedule_index_upload(upload_queue);
                Some(self.schedule_barrier0(upload_queue))
            }
        }
    };

    Ok(async move {
        if let Some(barrier) = maybe_barrier {
            Self::wait_completion0(barrier).await?;
        }
        Ok(())
    })
}

/// Removes a gc blocking reason for this timeline if one exists.
///
/// A retryable step of timeline detach ancestor.
///
/// Returns a future which waits until the completion of the upload.
pub(crate) fn schedule_remove_gc_block_reason( self: &Arc, reason: index::GcBlockingReason, ) -> Result>, NotInitialized> { let maybe_barrier = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; if let index::GcBlockingReason::DetachAncestor = reason { if !upload_queue.clean.0.lineage.is_detached_from_ancestor() { drop(guard); panic!("cannot complete timeline_ancestor_detach while not detached"); } } let wanted = |x: Option<&index::GcBlocking>| { x.is_none() || x.is_some_and(|b| !b.blocked_by(reason)) }; let current = upload_queue.dirty.gc_blocking.as_ref(); let uploaded = upload_queue.clean.0.gc_blocking.as_ref(); match (current, uploaded) { (x, y) if wanted(x) && wanted(y) => None, (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)), (x, y) => { if !wanted(x) && wanted(y) { warn!( ?reason, op = "remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)" ); } upload_queue.dirty.gc_blocking = current.as_ref().and_then(|x| x.without_reason(reason)); assert!(wanted(upload_queue.dirty.gc_blocking.as_ref())); self.schedule_index_upload(upload_queue); Some(self.schedule_barrier0(upload_queue)) } } }; Ok(async move { if let Some(barrier) = maybe_barrier { Self::wait_completion0(barrier).await?; } Ok(()) }) } /// Launch an upload operation in the background; the file is added to be included in next /// `index_part.json` upload. 
pub(crate) fn schedule_layer_file_upload(
    self: &Arc<Self>,
    layer: ResidentLayer,
) -> Result<(), NotInitialized> {
    let mut guard = self.upload_queue.lock().unwrap();
    let upload_queue = guard.initialized_mut()?;

    self.schedule_layer_file_upload0(upload_queue, layer);
    self.launch_queued_tasks(upload_queue);
    Ok(())
}

/// Queues a layer upload on an already-locked, initialized upload queue.
///
/// Records the layer in the dirty index, bumps the "files changed since last scheduled
/// index upload" counter and pushes an `UploadOp::UploadLayer`. Does not launch queued
/// tasks and does not schedule an index upload; callers do that.
fn schedule_layer_file_upload0(
    self: &Arc<Self>,
    upload_queue: &mut UploadQueueInitialized,
    layer: ResidentLayer,
) {
    let metadata = layer.metadata();

    upload_queue
        .dirty
        .layer_metadata
        .insert(layer.layer_desc().layer_name(), metadata.clone());
    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;

    info!(
        gen=?metadata.generation,
        shard=?metadata.shard,
        "scheduled layer file upload {layer}",
    );

    // The third field is filled in later by `launch_queued_tasks`.
    let op = UploadOp::UploadLayer(layer, metadata, None);
    self.metric_begin(&op);
    upload_queue.queued_operations.push_back(op);
}

/// Launch a delete operation in the background.
///
/// The operation does not modify local filesystem state.
///
/// Note: This schedules an index file upload before the deletions. The
/// deletion won't actually be performed, until all previously scheduled
/// upload operations, and the index file upload, have completed
/// successfully.
pub fn schedule_layer_file_deletion(
    self: &Arc<Self>,
    names: &[LayerName],
) -> anyhow::Result<()> {
    let mut guard = self.upload_queue.lock().unwrap();
    let upload_queue = guard.initialized_mut()?;

    let with_metadata =
        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());

    self.schedule_deletion_of_unlinked0(upload_queue, with_metadata);

    // Launch the tasks immediately, if possible
    self.launch_queued_tasks(upload_queue);
    Ok(())
}

/// Unlinks the layer files from `index_part.json` but does not yet schedule deletion for the
/// layer files, leaving them dangling.
///
/// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`]
/// is invoked on them.
pub(crate) fn schedule_gc_update( self: &Arc, gc_layers: &[Layer], ) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; // just forget the return value; after uploading the next index_part.json, we can consider // the layer files as "dangling". this is fine, at worst case we create work for the // scrubber. let names = gc_layers.iter().map(|x| x.layer_desc().layer_name()); self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); self.launch_queued_tasks(upload_queue); Ok(()) } pub(crate) fn schedule_unlinking_of_layers_from_index_part( self: &Arc, names: I, ) -> Result<(), NotInitialized> where I: IntoIterator, { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); Ok(()) } /// Update the remote index file, removing the to-be-deleted files from the index, /// allowing scheduling of actual deletions later. fn schedule_unlinking_of_layers_from_index_part0( self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, ) -> Vec<(LayerName, LayerFileMetadata)> where I: IntoIterator, { // Decorate our list of names with each name's metadata, dropping // names that are unexpectedly missing from our metadata. This metadata // is later used when physically deleting layers, to construct key paths. let with_metadata: Vec<_> = names .into_iter() .filter_map(|name| { let meta = upload_queue.dirty.layer_metadata.remove(&name); if let Some(meta) = meta { upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; Some((name, meta)) } else { // This can only happen if we forgot to to schedule the file upload // before scheduling the delete. Log it because it is a rare/strange // situation, and in case something is misbehaving, we'd like to know which // layers experienced this. 
info!("Deleting layer {name} not found in latest_files list, never uploaded?"); None } }) .collect(); #[cfg(feature = "testing")] for (name, metadata) in &with_metadata { let gen_ = metadata.generation; if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen_) { if unexpected == gen_ { tracing::error!("{name} was unlinked twice with same generation"); } else { tracing::error!( "{name} was unlinked twice with different generations {gen_:?} and {unexpected:?}" ); } } } // after unlinking files from the upload_queue.latest_files we must always schedule an // index_part update, because that needs to be uploaded before we can actually delete the // files. if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { self.schedule_index_upload(upload_queue); } with_metadata } /// Schedules deletion for layer files which have previously been unlinked from the /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`]. pub(crate) fn schedule_deletion_of_unlinked( self: &Arc, layers: Vec<(LayerName, LayerFileMetadata)>, ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; self.schedule_deletion_of_unlinked0(upload_queue, layers); self.launch_queued_tasks(upload_queue); Ok(()) } fn schedule_deletion_of_unlinked0( self: &Arc, upload_queue: &mut UploadQueueInitialized, mut with_metadata: Vec<(LayerName, LayerFileMetadata)>, ) { // Filter out any layers which were not created by this tenant shard. These are // layers that originate from some ancestor shard after a split, and may still // be referenced by other shards. We are free to delete them locally and remove // them from our index (and would have already done so when we reach this point // in the code), but we may not delete them remotely. 
with_metadata.retain(|(name, meta)| { let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number && meta.shard.shard_count == self.tenant_shard_id.shard_count; if !retain { tracing::debug!( "Skipping deletion of ancestor-shard layer {name}, from shard {}", meta.shard ); } retain }); for (name, meta) in &with_metadata { info!( "scheduling deletion of layer {}{} (shard {})", name, meta.generation.get_suffix(), meta.shard ); } #[cfg(feature = "testing")] for (name, meta) in &with_metadata { let gen_ = meta.generation; match upload_queue.dangling_files.remove(name) { Some(same) if same == gen_ => { /* expected */ } Some(other) => { tracing::error!("{name} was unlinked with {other:?} but deleted with {gen_:?}"); } None => { tracing::error!("{name} was unlinked but was not dangling"); } } } // schedule the actual deletions if with_metadata.is_empty() { // avoid scheduling the op & bumping the metric return; } let op = UploadOp::Delete(Delete { layers: with_metadata, }); self.metric_begin(&op); upload_queue.queued_operations.push_back(op); } /// Schedules a compaction update to the remote `index_part.json`. /// /// `compacted_from` represent the L0 names which have been `compacted_to` L1 layers. 
pub(crate) fn schedule_compaction_update(
    self: &Arc<Self>,
    compacted_from: &[Layer],
    compacted_to: &[ResidentLayer],
) -> Result<(), NotInitialized> {
    let mut guard = self.upload_queue.lock().unwrap();
    let upload_queue = guard.initialized_mut()?;

    // Queue uploads of the new layers first, then unlink the old ones from the index.
    for layer in compacted_to {
        self.schedule_layer_file_upload0(upload_queue, layer.clone());
    }

    let names = compacted_from.iter().map(|x| x.layer_desc().layer_name());

    self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
    self.launch_queued_tasks(upload_queue);

    Ok(())
}

/// Wait for all previously scheduled uploads/deletions to complete
pub(crate) async fn wait_completion(self: &Arc<Self>) -> Result<(), WaitCompletionError> {
    // The queue lock is released at the end of this block, before awaiting.
    let receiver = {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard
            .initialized_mut()
            .map_err(WaitCompletionError::NotInitialized)?;
        self.schedule_barrier0(upload_queue)
    };

    Self::wait_completion0(receiver).await
}

/// Waits for the barrier's watch channel to signal.
///
/// Returns `UploadQueueShutDownOrStopped` if the sender side was dropped without
/// signalling.
async fn wait_completion0(
    mut receiver: tokio::sync::watch::Receiver<()>,
) -> Result<(), WaitCompletionError> {
    if receiver.changed().await.is_err() {
        return Err(WaitCompletionError::UploadQueueShutDownOrStopped);
    }

    Ok(())
}

/// Queues a barrier operation without waiting for it; the receiver is discarded.
pub(crate) fn schedule_barrier(self: &Arc<Self>) -> anyhow::Result<()> {
    let mut guard = self.upload_queue.lock().unwrap();
    let upload_queue = guard.initialized_mut()?;
    self.schedule_barrier0(upload_queue);
    Ok(())
}

/// Queues an `UploadOp::Barrier` on an already-locked, initialized upload queue and
/// returns the receiving half of its watch channel.
fn schedule_barrier0(
    self: &Arc<Self>,
    upload_queue: &mut UploadQueueInitialized,
) -> tokio::sync::watch::Receiver<()> {
    let (sender, receiver) = tokio::sync::watch::channel(());
    let barrier_op = UploadOp::Barrier(sender);

    upload_queue.queued_operations.push_back(barrier_op);
    // Don't count this kind of operation!

    // Launch the task immediately, if possible
    self.launch_queued_tasks(upload_queue);

    receiver
}

/// Wait for all previously scheduled operations to complete, and then stop.
///
/// Not cancellation safe
pub(crate) async fn shutdown(self: &Arc<Self>) {
    // On cancellation the queue is left in awkward state of refusing new operations but
    // proper stop is yet to be called. On cancel the original or some later task must call
    // `stop` or `shutdown`.
    //
    // The guard logs an error if it is dropped without being disarmed, i.e. if this future
    // is dropped before reaching one of the `into_inner` calls below.
    let sg = scopeguard::guard((), |_| {
        tracing::error!(
            "RemoteTimelineClient::shutdown was cancelled; this should not happen, do not make this into an allowed_error"
        )
    });

    let fut = {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = match &mut *guard {
            UploadQueue::Stopped(_) => {
                scopeguard::ScopeGuard::into_inner(sg);
                return;
            }
            UploadQueue::Uninitialized => {
                // transition into Stopped state
                self.stop_impl(&mut guard);
                scopeguard::ScopeGuard::into_inner(sg);
                return;
            }
            UploadQueue::Initialized(init) => init,
        };

        // if the queue is already stuck due to a shutdown operation which was cancelled, then
        // just don't add more of these as they would never complete.
        //
        // TODO: if launch_queued_tasks were to be refactored to accept a &mut UploadQueue
        // in every place we would not have to jump through this hoop, and this method could be
        // made cancellable.
        if !upload_queue.shutting_down {
            upload_queue.shutting_down = true;
            upload_queue.queued_operations.push_back(UploadOp::Shutdown);
            // this operation is not counted similar to Barrier

            self.launch_queued_tasks(upload_queue);
        }

        upload_queue.shutdown_ready.clone().acquire_owned()
    };

    let res = fut.await;

    scopeguard::ScopeGuard::into_inner(sg);

    match res {
        Ok(_permit) => unreachable!("shutdown_ready should not have been added permits"),
        Err(_closed) => {
            // expected
        }
    }

    self.stop();
}

/// Set the deleted_at field in the remote index file.
///
/// This fails if the upload queue has not been `stop()`ed.
///
/// The caller is responsible for calling `stop()` AND for waiting
/// for any ongoing upload tasks to finish after `stop()` has succeeded.
/// Check method [`RemoteTimelineClient::stop`] for details.
#[instrument(skip_all)] pub(crate) async fn persist_index_part_with_deleted_flag( self: &Arc, ) -> Result<(), PersistIndexPartWithDeletedFlagError> { let index_part_with_deleted_at = { let mut locked = self.upload_queue.lock().unwrap(); // We must be in stopped state because otherwise // we can have inprogress index part upload that can overwrite the file // with missing is_deleted flag that we going to set below let stopped = locked.stopped_mut()?; match stopped.deleted_at { SetDeletedFlagProgress::NotRunning => (), // proceed SetDeletedFlagProgress::InProgress(at) => { return Err(PersistIndexPartWithDeletedFlagError::AlreadyInProgress(at)); } SetDeletedFlagProgress::Successful(at) => { return Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(at)); } }; let deleted_at = Utc::now().naive_utc(); stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at); let mut index_part = stopped.upload_queue_for_deletion.dirty.clone(); index_part.deleted_at = Some(deleted_at); index_part }; let undo_deleted_at = scopeguard::guard(Arc::clone(self), |self_clone| { let mut locked = self_clone.upload_queue.lock().unwrap(); let stopped = locked .stopped_mut() .expect("there's no way out of Stopping, and we checked it's Stopping above"); stopped.deleted_at = SetDeletedFlagProgress::NotRunning; }); pausable_failpoint!("persist_deleted_index_part"); backoff::retry( || { upload::upload_index_part( &self.storage_impl, &self.tenant_shard_id, &self.timeline_id, self.generation, &index_part_with_deleted_at, &self.cancel, ) }, |_e| false, 1, // have just a couple of attempts // when executed as part of timeline deletion this happens in context of api call // when executed as part of tenant deletion this happens in the background 2, "persist_index_part_with_deleted_flag", &self.cancel, ) .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x)?; // all good, disarm the guard and mark as success ScopeGuard::into_inner(undo_deleted_at); { let mut locked 
= self.upload_queue.lock().unwrap(); let stopped = locked .stopped_mut() .expect("there's no way out of Stopping, and we checked it's Stopping above"); stopped.deleted_at = SetDeletedFlagProgress::Successful( index_part_with_deleted_at .deleted_at .expect("we set it above"), ); } Ok(()) } pub(crate) fn is_deleting(&self) -> bool { let mut locked = self.upload_queue.lock().unwrap(); locked.stopped_mut().is_ok() } pub(crate) async fn preserve_initdb_archive( self: &Arc, tenant_id: &TenantId, timeline_id: &TimelineId, cancel: &CancellationToken, ) -> anyhow::Result<()> { backoff::retry( || async { upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel) .await }, TimeoutOrCancel::caused_by_cancel, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "preserve_initdb_tar_zst", &cancel.clone(), ) .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) .context("backing up initdb archive")?; Ok(()) } /// Uploads the given layer **without** adding it to be part of a future `index_part.json` upload. /// /// This is not normally needed. pub(crate) async fn upload_layer_file( self: &Arc, uploaded: &ResidentLayer, cancel: &CancellationToken, ) -> anyhow::Result<()> { let remote_path = remote_layer_path( &self.tenant_shard_id.tenant_id, &self.timeline_id, uploaded.metadata().shard, &uploaded.layer_desc().layer_name(), uploaded.metadata().generation, ); backoff::retry( || async { upload::upload_timeline_layer( &self.storage_impl, uploaded.local_path(), &remote_path, uploaded.metadata().file_size, cancel, ) .await }, TimeoutOrCancel::caused_by_cancel, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "upload a layer without adding it to latest files", cancel, ) .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) .context("upload a layer without adding it to latest files") } /// Copies the `adopted` remote existing layer to the remote path of `adopted_as`. 
The layer is /// not added to be part of a future `index_part.json` upload. pub(crate) async fn copy_timeline_layer( self: &Arc, adopted: &Layer, adopted_as: &Layer, cancel: &CancellationToken, ) -> anyhow::Result<()> { let source_remote_path = remote_layer_path( &self.tenant_shard_id.tenant_id, &adopted .get_timeline_id() .expect("Source timeline should be alive"), adopted.metadata().shard, &adopted.layer_desc().layer_name(), adopted.metadata().generation, ); let target_remote_path = remote_layer_path( &self.tenant_shard_id.tenant_id, &self.timeline_id, adopted_as.metadata().shard, &adopted_as.layer_desc().layer_name(), adopted_as.metadata().generation, ); backoff::retry( || async { upload::copy_timeline_layer( &self.storage_impl, &source_remote_path, &target_remote_path, cancel, ) .await }, TimeoutOrCancel::caused_by_cancel, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "copy timeline layer", cancel, ) .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) .context("remote copy timeline layer") } async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> { match tokio::time::timeout( DELETION_QUEUE_FLUSH_TIMEOUT, self.deletion_queue_client.flush_immediate(), ) .await { Ok(result) => result, Err(_timeout) => { // Flushing remote deletions is not mandatory: we flush here to make the system easier to test, and // to ensure that _usually_ objects are really gone after a DELETE is acked. However, in case of deletion // queue issues (https://github.com/neondatabase/neon/issues/6440), we don't want to wait indefinitely here. tracing::warn!( "Timed out waiting for deletion queue flush, acking deletion anyway" ); Ok(()) } } } /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set. /// The function deletes layer files one by one, then lists the prefix to see if we leaked something /// deletes leaked files if any and proceeds with deletion of index file at the end. 
pub(crate) async fn delete_all(self: &Arc) -> Result<(), DeleteTimelineError> { debug_assert_current_span_has_tenant_and_timeline_id(); let layers: Vec = { let mut locked = self.upload_queue.lock().unwrap(); let stopped = locked.stopped_mut().map_err(DeleteTimelineError::Other)?; if !matches!(stopped.deleted_at, SetDeletedFlagProgress::Successful(_)) { return Err(DeleteTimelineError::Other(anyhow::anyhow!( "deleted_at is not set" ))); } debug_assert!(stopped.upload_queue_for_deletion.no_pending_work()); stopped .upload_queue_for_deletion .dirty .layer_metadata .drain() .filter(|(_file_name, meta)| { // Filter out layers that belonged to an ancestor shard. Since we are deleting the whole timeline from // all shards anyway, we _could_ delete these, but // - it creates a potential race if other shards are still // using the layers while this shard deletes them. // - it means that if we rolled back the shard split, the ancestor shards would be in a state where // these timelines are present but corrupt (their index exists but some layers don't) // // These layers will eventually be cleaned up by the scrubber when it does physical GC. meta.shard.shard_number == self.tenant_shard_id.shard_number && meta.shard.shard_count == self.tenant_shard_id.shard_count }) .map(|(file_name, meta)| { remote_layer_path( &self.tenant_shard_id.tenant_id, &self.timeline_id, meta.shard, &file_name, meta.generation, ) }) .collect() }; let layer_deletion_count = layers.len(); self.deletion_queue_client .push_immediate(layers) .await .map_err(|_| DeleteTimelineError::Cancelled)?; // Delete the initdb.tar.zst, which is not always present, but deletion attempts of // inexistant objects are not considered errors. let initdb_path = remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &self.timeline_id); self.deletion_queue_client .push_immediate(vec![initdb_path]) .await .map_err(|_| DeleteTimelineError::Cancelled)?; // Do not delete index part yet, it is needed for possible retry. 
If we remove it first // and retry will arrive to different pageserver there wont be any traces of it on remote storage let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id); // Execute all pending deletions, so that when we proceed to do a listing below, we aren't // taking the burden of listing all the layers that we already know we should delete. self.flush_deletion_queue() .await .map_err(|_| DeleteTimelineError::Cancelled)?; let cancel = shutdown_token(); let remaining = download_retry( || async { self.storage_impl .list( Some(&timeline_storage_path), ListingMode::NoDelimiter, None, &cancel, ) .await }, "list remaining files", &cancel, ) .await .context("list files remaining files")? .keys; // We will delete the current index_part object last, since it acts as a deletion // marker via its deleted_at attribute let latest_index = remaining .iter() .filter(|o| { o.key .object_name() .map(|n| n.starts_with(IndexPart::FILE_NAME)) .unwrap_or(false) }) .filter_map(|o| { parse_remote_index_path(o.key.clone()).map(|gen_| (o.key.clone(), gen_)) }) .max_by_key(|i| i.1) .map(|i| i.0.clone()) .unwrap_or( // No generation-suffixed indices, assume we are dealing with // a legacy index. 
remote_index_path(&self.tenant_shard_id, &self.timeline_id, Generation::none()), ); let remaining_layers: Vec = remaining .into_iter() .filter_map(|o| { if o.key == latest_index || o.key.object_name() == Some(INITDB_PRESERVED_PATH) { None } else { Some(o.key) } }) .inspect(|path| { if let Some(name) = path.object_name() { info!(%name, "deleting a file not referenced from index_part.json"); } else { warn!(%path, "deleting a nameless or non-utf8 object not referenced from index_part.json"); } }) .collect(); let not_referenced_count = remaining_layers.len(); if !remaining_layers.is_empty() { self.deletion_queue_client .push_immediate(remaining_layers) .await .map_err(|_| DeleteTimelineError::Cancelled)?; } fail::fail_point!("timeline-delete-before-index-delete", |_| { Err(DeleteTimelineError::Other(anyhow::anyhow!( "failpoint: timeline-delete-before-index-delete" )))? }); debug!("enqueuing index part deletion"); self.deletion_queue_client .push_immediate([latest_index].to_vec()) .await .map_err(|_| DeleteTimelineError::Cancelled)?; // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait // for a flush to a persistent deletion list so that we may be sure deletion will occur. self.flush_deletion_queue() .await .map_err(|_| DeleteTimelineError::Cancelled)?; fail::fail_point!("timeline-delete-after-index-delete", |_| { Err(DeleteTimelineError::Other(anyhow::anyhow!( "failpoint: timeline-delete-after-index-delete" )))? }); info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json"); Ok(()) } /// Pick next tasks from the queue, and start as many of them as possible without violating /// the ordering constraints. /// /// The number of inprogress tasks is limited by `Self::inprogress_tasks`, see `next_ready`. 
fn launch_queued_tasks(self: &Arc, upload_queue: &mut UploadQueueInitialized) { while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() { debug!("starting op: {next_op}"); // Prepare upload. match &mut next_op { UploadOp::UploadLayer(layer, meta, mode) => { if upload_queue .recently_deleted .remove(&(layer.layer_desc().layer_name().clone(), meta.generation)) { *mode = Some(OpType::FlushDeletion); } else { *mode = Some(OpType::MayReorder) } } UploadOp::UploadMetadata { .. } => {} UploadOp::Delete(Delete { layers }) => { for (name, meta) in layers { upload_queue .recently_deleted .insert((name.clone(), meta.generation)); } } UploadOp::Barrier(sender) => { sender.send_replace(()); continue; } UploadOp::Shutdown => unreachable!("shutdown is intentionally never popped off"), }; // Assign unique ID to this task upload_queue.task_counter += 1; let upload_task_id = upload_queue.task_counter; // Add it to the in-progress map let task = Arc::new(UploadTask { task_id: upload_task_id, op: next_op, coalesced_ops, retries: AtomicU32::new(0), }); upload_queue .inprogress_tasks .insert(task.task_id, Arc::clone(&task)); // Spawn task to perform the task let self_rc = Arc::clone(self); let tenant_shard_id = self.tenant_shard_id; let timeline_id = self.timeline_id; task_mgr::spawn( &self.runtime, TaskKind::RemoteUploadTask, self.tenant_shard_id, Some(self.timeline_id), "remote upload", async move { self_rc.perform_upload_task(task).await; Ok(()) } .instrument(info_span!(parent: None, "remote_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, %upload_task_id)), ); // Loop back to process next task } } /// /// Perform an upload task. /// /// The task is in the `inprogress_tasks` list. This function will try to /// execute it, retrying forever. On successful completion, the task is /// removed it from the `inprogress_tasks` list, and any next task(s) in the /// queue that were waiting by the completion are launched. 
/// /// The task can be shut down, however. That leads to stopping the whole /// queue. /// async fn perform_upload_task(self: &Arc, task: Arc) { let cancel = shutdown_token(); // Loop to retry until it completes. loop { // If we're requested to shut down, close up shop and exit. // // Note: We only check for the shutdown requests between retries, so // if a shutdown request arrives while we're busy uploading, in the // upload::upload:*() call below, we will wait not exit until it has // finished. We probably could cancel the upload by simply dropping // the Future, but we're not 100% sure if the remote storage library // is cancellation safe, so we don't dare to do that. Hopefully, the // upload finishes or times out soon enough. if cancel.is_cancelled() { info!("upload task cancelled by shutdown request"); self.stop(); return; } // Assert that we don't modify a layer that's referenced by the current index. if cfg!(debug_assertions) { let modified = match &task.op { UploadOp::UploadLayer(layer, layer_metadata, _) => { vec![(layer.layer_desc().layer_name(), layer_metadata)] } UploadOp::Delete(delete) => { delete.layers.iter().map(|(n, m)| (n.clone(), m)).collect() } // These don't modify layers. UploadOp::UploadMetadata { .. } => Vec::new(), UploadOp::Barrier(_) => Vec::new(), UploadOp::Shutdown => Vec::new(), }; if let Ok(queue) = self.upload_queue.lock().unwrap().initialized_mut() { for (ref name, metadata) in modified { debug_assert!( !queue.clean.0.references(name, metadata), "layer {name} modified while referenced by index", ); } } } let upload_result: anyhow::Result<()> = match &task.op { UploadOp::UploadLayer(layer, layer_metadata, mode) => { // TODO: check if this mechanism can be removed now that can_bypass() performs // conflict checks during scheduling. if let Some(OpType::FlushDeletion) = mode { if self.config.read().unwrap().block_deletions { // Of course, this is not efficient... but usually the queue should be empty. 
let mut queue_locked = self.upload_queue.lock().unwrap(); let mut detected = false; if let Ok(queue) = queue_locked.initialized_mut() { for list in queue.blocked_deletions.iter_mut() { list.layers.retain(|(name, meta)| { if name == &layer.layer_desc().layer_name() && meta.generation == layer_metadata.generation { detected = true; // remove the layer from deletion queue false } else { // keep the layer true } }); } } if detected { info!( "cancelled blocked deletion of layer {} at gen {:?}", layer.layer_desc().layer_name(), layer_metadata.generation ); } } else { // TODO: we did not guarantee that upload task starts after deletion task, so there could be possibly race conditions // that we still get the layer deleted. But this only happens if someone creates a layer immediately after it's deleted, // which is not possible in the current system. info!( "waiting for deletion queue flush to complete before uploading layer {} at gen {:?}", layer.layer_desc().layer_name(), layer_metadata.generation ); { // We are going to flush, we can clean up the recently deleted list. let mut queue_locked = self.upload_queue.lock().unwrap(); if let Ok(queue) = queue_locked.initialized_mut() { queue.recently_deleted.clear(); } } if let Err(e) = self.deletion_queue_client.flush_execute().await { warn!( "failed to flush the deletion queue before uploading layer {} at gen {:?}, still proceeding to upload: {e:#} ", layer.layer_desc().layer_name(), layer_metadata.generation ); } else { info!( "done flushing deletion queue before uploading layer {} at gen {:?}", layer.layer_desc().layer_name(), layer_metadata.generation ); } } } let local_path = layer.local_path(); // We should only be uploading layers created by this `Tenant`'s lifetime, so // the metadata in the upload should always match our current generation. 
assert_eq!(layer_metadata.generation, self.generation); let remote_path = remote_layer_path( &self.tenant_shard_id.tenant_id, &self.timeline_id, layer_metadata.shard, &layer.layer_desc().layer_name(), layer_metadata.generation, ); upload::upload_timeline_layer( &self.storage_impl, local_path, &remote_path, layer_metadata.file_size, &self.cancel, ) .measure_remote_op( Some(TaskKind::RemoteUploadTask), RemoteOpFileKind::Layer, RemoteOpKind::Upload, Arc::clone(&self.metrics), ) .await } UploadOp::UploadMetadata { uploaded } => { let res = upload::upload_index_part( &self.storage_impl, &self.tenant_shard_id, &self.timeline_id, self.generation, uploaded, &self.cancel, ) .measure_remote_op( Some(TaskKind::RemoteUploadTask), RemoteOpFileKind::Index, RemoteOpKind::Upload, Arc::clone(&self.metrics), ) .await; if res.is_ok() { self.update_remote_physical_size_gauge(Some(uploaded)); let mention_having_future_layers = if cfg!(feature = "testing") { uploaded .layer_metadata .keys() .any(|x| x.is_in_future(uploaded.metadata.disk_consistent_lsn())) } else { false }; if mention_having_future_layers { // find rationale near crate::tenant::timeline::init::cleanup_future_layer tracing::info!( disk_consistent_lsn = %uploaded.metadata.disk_consistent_lsn(), "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup" ); } } res } // TODO: this should wait for the deletion to be executed by the deletion queue. // Otherwise, the deletion may race with an upload and wrongfully delete a newer // file. Some of the above logic attempts to work around this, it should be replaced // by the upload queue ordering guarantees (see `can_bypass`). See: // . 
UploadOp::Delete(delete) => { if self.config.read().unwrap().block_deletions { let mut queue_locked = self.upload_queue.lock().unwrap(); if let Ok(queue) = queue_locked.initialized_mut() { queue.blocked_deletions.push(delete.clone()); } Ok(()) } else { pausable_failpoint!("before-delete-layer-pausable"); self.deletion_queue_client .push_layers( self.tenant_shard_id, self.timeline_id, self.generation, delete.layers.clone(), ) .map_err(|e| anyhow::anyhow!(e)) } } unexpected @ UploadOp::Barrier(_) | unexpected @ UploadOp::Shutdown => { // unreachable. Barrier operations are handled synchronously in // launch_queued_tasks warn!("unexpected {unexpected:?} operation in perform_upload_task"); break; } }; match upload_result { Ok(()) => { break; } Err(e) if TimeoutOrCancel::caused_by_cancel(&e) => { // loop around to do the proper stopping continue; } Err(e) => { let retries = task.retries.fetch_add(1, Ordering::SeqCst); // Uploads can fail due to rate limits (IAM, S3), spurious network problems, // or other external reasons. Such issues are relatively regular, so log them // at info level at first, and only WARN if the operation fails repeatedly. // // (See similar logic for downloads in `download::download_retry`) if retries < FAILED_UPLOAD_WARN_THRESHOLD { info!( "failed to perform remote task {}, will retry (attempt {}): {:#}", task.op, retries, e ); } else { warn!( "failed to perform remote task {}, will retry (attempt {}): {:?}", task.op, retries, e ); } // sleep until it's time to retry, or we're cancelled exponential_backoff( retries, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, &cancel, ) .await; } } } let retries = task.retries.load(Ordering::SeqCst); if retries > 0 { info!( "remote task {} completed successfully after {} retries", task.op, retries ); } else { debug!("remote task {} completed successfully", task.op); } // The task has completed successfully. Remove it from the in-progress list. 
let lsn_update = { let mut upload_queue_guard = self.upload_queue.lock().unwrap(); let upload_queue = match upload_queue_guard.deref_mut() { UploadQueue::Uninitialized => panic!( "callers are responsible for ensuring this is only called on an initialized queue" ), UploadQueue::Stopped(_stopped) => None, UploadQueue::Initialized(qi) => Some(qi), }; let upload_queue = match upload_queue { Some(upload_queue) => upload_queue, None => { info!("another concurrent task already stopped the queue"); return; } }; upload_queue.inprogress_tasks.remove(&task.task_id); let lsn_update = match task.op { UploadOp::UploadLayer(_, _, _) => None, UploadOp::UploadMetadata { ref uploaded } => { // the task id is reused as a monotonicity check for storing the "clean" // IndexPart. let last_updater = upload_queue.clean.1; let is_later = last_updater.is_some_and(|task_id| task_id < task.task_id); let monotone = is_later || last_updater.is_none(); assert!( monotone, "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", task.task_id ); // not taking ownership is wasteful upload_queue.clean.0.clone_from(uploaded); upload_queue.clean.1 = Some(task.task_id); let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn(); self.metrics .projected_remote_consistent_lsn_gauge .set(lsn.0); if self.generation.is_none() { // Legacy mode: skip validating generation upload_queue.visible_remote_consistent_lsn.store(lsn); None } else if self .config .read() .unwrap() .process_remote_consistent_lsn_updates { Some((lsn, upload_queue.visible_remote_consistent_lsn.clone())) } else { // Our config disables remote_consistent_lsn updates: drop it. None } } UploadOp::Delete(_) => None, UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(), }; // Launch any queued tasks that were unblocked by this one. 
self.launch_queued_tasks(upload_queue); lsn_update }; if let Some((lsn, slot)) = lsn_update { // Updates to the remote_consistent_lsn we advertise to pageservers // are all routed through the DeletionQueue, to enforce important // data safety guarantees (see docs/rfcs/025-generation-numbers.md) self.deletion_queue_client .update_remote_consistent_lsn( self.tenant_shard_id, self.timeline_id, self.generation, lsn, slot, ) .await; } self.metric_end(&task.op); for coalesced_op in &task.coalesced_ops { self.metric_end(coalesced_op); } } fn metric_impl( &self, op: &UploadOp, ) -> Option<( RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetricsCallTrackSize, )> { use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize; let res = match op { UploadOp::UploadLayer(_, m, _) => ( RemoteOpFileKind::Layer, RemoteOpKind::Upload, RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size), ), UploadOp::UploadMetadata { .. } => ( RemoteOpFileKind::Index, RemoteOpKind::Upload, DontTrackSize { reason: "metadata uploads are tiny", }, ), UploadOp::Delete(_delete) => ( RemoteOpFileKind::Layer, RemoteOpKind::Delete, DontTrackSize { reason: "should we track deletes? positive or negative sign?", }, ), UploadOp::Barrier(..) | UploadOp::Shutdown => { // we do not account these return None; } }; Some(res) } fn metric_begin(&self, op: &UploadOp) { let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) { Some(x) => x, None => return, }; let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes); guard.will_decrement_manually(); // in metric_end(), see right below } fn metric_end(&self, op: &UploadOp) { let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) { Some(x) => x, None => return, }; self.metrics.call_end(&file_kind, &op_kind, track_bytes); } /// Close the upload queue for new operations and cancel queued operations. /// /// Use [`RemoteTimelineClient::shutdown`] for graceful stop. 
/// /// In-progress operations will still be running after this function returns. /// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))` /// to wait for them to complete, after calling this function. pub(crate) fn stop(&self) { // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue // into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet. // The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business. let mut guard = self.upload_queue.lock().unwrap(); self.stop_impl(&mut guard); } fn stop_impl(&self, guard: &mut std::sync::MutexGuard) { match &mut **guard { UploadQueue::Uninitialized => { info!("UploadQueue is in state Uninitialized, nothing to do"); **guard = UploadQueue::Stopped(UploadQueueStopped::Uninitialized); } UploadQueue::Stopped(_) => { // nothing to do info!("another concurrent task already shut down the queue"); } UploadQueue::Initialized(initialized) => { info!("shutting down upload queue"); // Replace the queue with the Stopped state, taking ownership of the old // Initialized queue. We will do some checks on it, and then drop it. let qi = { // Here we preserve working version of the upload queue for possible use during deletions. // In-place replace of Initialized to Stopped can be done with the help of https://github.com/Sgeo/take_mut // but for this use case it doesnt really makes sense to bring unsafe code only for this usage point. // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it. 
let upload_queue_for_deletion = UploadQueueInitialized { inprogress_limit: initialized.inprogress_limit, task_counter: 0, dirty: initialized.dirty.clone(), clean: initialized.clean.clone(), latest_files_changes_since_metadata_upload_scheduled: 0, visible_remote_consistent_lsn: initialized .visible_remote_consistent_lsn .clone(), inprogress_tasks: HashMap::default(), queued_operations: VecDeque::default(), #[cfg(feature = "testing")] dangling_files: HashMap::default(), blocked_deletions: Vec::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), recently_deleted: HashSet::new(), }; let upload_queue = std::mem::replace( &mut **guard, UploadQueue::Stopped(UploadQueueStopped::Deletable( UploadQueueStoppedDeletable { upload_queue_for_deletion, deleted_at: SetDeletedFlagProgress::NotRunning, }, )), ); if let UploadQueue::Initialized(qi) = upload_queue { qi } else { unreachable!("we checked in the match above that it is Initialized"); } }; // We don't need to do anything here for in-progress tasks. They will finish // on their own, decrement the unfinished-task counter themselves, and observe // that the queue is Stopped. drop(qi.inprogress_tasks); // Tear down queued ops for op in qi.queued_operations.into_iter() { self.metric_end(&op); // Dropping UploadOp::Barrier() here will make wait_completion() return with an Err() // which is exactly what we want to happen. drop(op); } } } } /// Returns an accessor which will hold the UploadQueue mutex for accessing the upload queue /// externally to RemoteTimelineClient. 
pub(crate) fn initialized_upload_queue( &self, ) -> Result, NotInitialized> { let mut inner = self.upload_queue.lock().unwrap(); inner.initialized_mut()?; Ok(UploadQueueAccessor { inner }) } pub(crate) fn no_pending_work(&self) -> bool { let inner = self.upload_queue.lock().unwrap(); match &*inner { UploadQueue::Uninitialized | UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => true, UploadQueue::Stopped(UploadQueueStopped::Deletable(x)) => { x.upload_queue_for_deletion.no_pending_work() } UploadQueue::Initialized(x) => x.no_pending_work(), } } /// 'foreign' in the sense that it does not belong to this tenant shard. This method /// is used during GC for other shards to get the index of shard zero. pub(crate) async fn download_foreign_index( &self, shard_number: ShardNumber, cancel: &CancellationToken, ) -> Result<(IndexPart, Generation, std::time::SystemTime), DownloadError> { let foreign_shard_id = TenantShardId { shard_number, shard_count: self.tenant_shard_id.shard_count, tenant_id: self.tenant_shard_id.tenant_id, }; download_index_part( &self.storage_impl, &foreign_shard_id, &self.timeline_id, Generation::MAX, cancel, ) .await } } pub(crate) struct UploadQueueAccessor<'a> { inner: std::sync::MutexGuard<'a, UploadQueue>, } impl UploadQueueAccessor<'_> { pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart { match &*self.inner { UploadQueue::Initialized(x) => &x.clean.0, UploadQueue::Uninitialized | UploadQueue::Stopped(_) => { unreachable!("checked before constructing") } } } } pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath { let path = format!("tenants/{tenant_shard_id}"); RemotePath::from_string(&path).expect("Failed to construct path") } pub fn remote_tenant_manifest_path( tenant_shard_id: &TenantShardId, generation: Generation, ) -> RemotePath { let path = format!( "tenants/{tenant_shard_id}/tenant-manifest{}.json", generation.get_suffix() ); RemotePath::from_string(&path).expect("Failed to construct path") } /// 
Prefix to all generations' manifest objects in a tenant shard pub fn remote_tenant_manifest_prefix(tenant_shard_id: &TenantShardId) -> RemotePath { let path = format!("tenants/{tenant_shard_id}/tenant-manifest",); RemotePath::from_string(&path).expect("Failed to construct path") } pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}"); RemotePath::from_string(&path).expect("Failed to construct path") } fn remote_timelines_path_unsharded(tenant_id: &TenantId) -> RemotePath { let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}"); RemotePath::from_string(&path).expect("Failed to construct path") } pub fn remote_timeline_path( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, ) -> RemotePath { remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string())) } /// Obtains the path of the given Layer in the remote /// /// Note that the shard component of a remote layer path is _not_ always the same /// as in the TenantShardId of the caller: tenants may reference layers from a different /// ShardIndex. Use the ShardIndex from the layer's metadata. pub fn remote_layer_path( tenant_id: &TenantId, timeline_id: &TimelineId, shard: ShardIndex, layer_file_name: &LayerName, generation: Generation, ) -> RemotePath { // Generation-aware key format let path = format!( "tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}", shard.get_suffix(), layer_file_name, generation.get_suffix() ); RemotePath::from_string(&path).expect("Failed to construct path") } /// Returns true if a and b have the same layer path within a tenant/timeline. This is essentially /// remote_layer_path(a) == remote_layer_path(b) without the string allocations. /// /// TODO: there should be a variant of LayerName for the physical path that contains information /// about the shard and generation, such that this could be replaced by a simple comparison. 
pub fn is_same_remote_layer_path(
    aname: &LayerName,
    ameta: &LayerFileMetadata,
    bname: &LayerName,
    bmeta: &LayerFileMetadata,
) -> bool {
    // NB: don't assert remote_layer_path(a) == remote_layer_path(b); too expensive even for debug.
    aname == bname && ameta.shard == bmeta.shard && ameta.generation == bmeta.generation
}

/// Remote path of a timeline's initdb archive. The key uses the bare tenant id (no shard suffix).
pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
    RemotePath::from_string(&format!(
        "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PATH}"
    ))
    .expect("Failed to construct path")
}

/// Remote path of a timeline's preserved initdb archive. The key uses the bare tenant id
/// (no shard suffix).
pub fn remote_initdb_preserved_archive_path(
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
) -> RemotePath {
    RemotePath::from_string(&format!(
        "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PRESERVED_PATH}"
    ))
    .expect("Failed to construct path")
}

/// Remote path of a timeline's index file as written in `generation`:
/// `IndexPart::FILE_NAME` followed by the generation suffix.
pub fn remote_index_path(
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    generation: Generation,
) -> RemotePath {
    RemotePath::from_string(&format!(
        "tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
        IndexPart::FILE_NAME,
        generation.get_suffix()
    ))
    .expect("Failed to construct path")
}

/// Remote path of a tenant shard's heatmap object.
pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath {
    RemotePath::from_string(&format!(
        "tenants/{tenant_shard_id}/{TENANT_HEATMAP_BASENAME}"
    ))
    .expect("Failed to construct path")
}

/// Given the key of an index, parse out the generation part of the name
///
/// Returns `None` if the path has no file name (logged as a warning), if the file name
/// contains no `-`, or if `Generation::parse_suffix` rejects the text after the first `-`.
pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
    let file_name = match path.get_path().file_name() {
        Some(f) => f,
        None => {
            // Unexpected: we should be seeing index_part.json paths only
            tracing::warn!("Malformed index key {}", path);
            return None;
        }
    };

    file_name
        .split_once('-')
        .and_then(|(_, gen_suffix)| Generation::parse_suffix(gen_suffix))
}

/// Given the key of a tenant manifest, parse out the generation number
///
/// Returns `None` when the key does not contain `tenant-manifest-<8 lowercase hex digits>.json`
/// or when `Generation::parse_suffix` rejects the digits.
pub fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option<Generation> {
    static RE: OnceLock<Regex> = OnceLock::new();
    // The dot before `json` is escaped so that it only matches a literal '.', not any character.
    let re = RE.get_or_init(|| Regex::new(r".*tenant-manifest-([0-9a-f]{8})\.json").unwrap());
    re.captures(path.get_path().as_str())
        .and_then(|c| c.get(1))
        .and_then(|m| Generation::parse_suffix(m.as_str()))
}

#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use super::*;
    use crate::DEFAULT_PG_VERSION;
    use crate::context::RequestContext;
    use crate::tenant::config::AttachmentMode;
    use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
    use crate::tenant::storage_layer::layer::local_layer_path;
    use crate::tenant::{TenantShard, Timeline};

    /// Deterministic file contents for a dummy layer: the bytes of `"contents for {name}"`.
    pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
        format!("contents for {name}").into()
    }

    /// Builds a `TimelineMetadata` with the given `disk_consistent_lsn` and otherwise
    /// fixed dummy values.
    pub(super) fn dummy_metadata(disk_consistent_lsn: Lsn) -> TimelineMetadata {
        let metadata = TimelineMetadata::new(
            disk_consistent_lsn,
            None,
            None,
            Lsn(0),
            Lsn(0),
            Lsn(0),
            // Any version will do
            // but it should be consistent with the one in the tests
            crate::DEFAULT_PG_VERSION,
        );

        // go through serialize + deserialize to fix the header, including checksum
        TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap()
    }

    /// Asserts that the string forms of the layer names in `a` equal `b`, ignoring order.
    fn assert_file_list(a: &HashSet<LayerName>, b: &[&str]) {
        let mut avec: Vec<String> = a.iter().map(|x| x.to_string()).collect();
        avec.sort();

        let mut bvec = b.to_vec();
        bvec.sort_unstable();

        assert_eq!(avec, bvec);
    }

    /// Asserts that the entries of directory `remote_path` are exactly the `expected` names,
    /// each with the suffix of `generation` appended, ignoring order.
    fn assert_remote_files(expected: &[&str], remote_path: &Utf8Path, generation: Generation) {
        let mut expected: Vec<String> = expected
            .iter()
            .map(|x| format!("{}{}", x, generation.get_suffix()))
            .collect();
        expected.sort();

        let mut found: Vec<String> = Vec::new();
        for entry in std::fs::read_dir(remote_path).unwrap().flatten() {
            let entry_name = entry.file_name();
            let fname = entry_name.to_str().unwrap();
            found.push(String::from(fname));
        }
        found.sort();

        assert_eq!(found, expected);
    }

    /// Harness, tenant, test timeline and request context shared by the tests in this module.
    struct TestSetup {
        harness: TenantHarness,
        tenant: Arc<TenantShard>,
        timeline: Arc<Timeline>,
        tenant_ctx: RequestContext,
    }

    impl TestSetup {
        async fn new(test_name: &str) -> anyhow::Result<Self> {
            let test_name =
Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
            let harness = TenantHarness::create(test_name).await?;
            let (tenant, ctx) = harness.load().await;

            let timeline = tenant
                .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
                .await?;

            Ok(Self {
                harness,
                tenant,
                timeline,
                tenant_ctx: ctx,
            })
        }

        /// Construct a RemoteTimelineClient in an arbitrary generation
        ///
        /// The client is wired to the harness' remote storage and deletion queue and starts
        /// with an `Uninitialized` upload queue.
        fn build_client(&self, generation: Generation) -> Arc<RemoteTimelineClient> {
            let location_conf = AttachedLocationConfig {
                generation,
                attach_mode: AttachmentMode::Single,
            };
            Arc::new(RemoteTimelineClient {
                conf: self.harness.conf,
                runtime: tokio::runtime::Handle::current(),
                tenant_shard_id: self.harness.tenant_shard_id,
                timeline_id: TIMELINE_ID,
                generation,
                storage_impl: self.harness.remote_storage.clone(),
                deletion_queue_client: self.harness.deletion_queue.new_client(),
                upload_queue: Mutex::new(UploadQueue::Uninitialized),
                metrics: Arc::new(RemoteTimelineClientMetrics::new(
                    &self.harness.tenant_shard_id,
                    &TIMELINE_ID,
                )),
                config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(&location_conf)),
                cancel: CancellationToken::new(),
            })
        }

        /// A tracing::Span that satisfies remote_timeline_client methods that assert tenant_id
        /// and timeline_id are present.
        fn span(&self) -> tracing::Span {
            tracing::info_span!(
                "test",
                tenant_id = %self.harness.tenant_shard_id.tenant_id,
                shard_id = %self.harness.tenant_shard_id.shard_slug(),
                timeline_id = %TIMELINE_ID
            )
        }
    }

    // Test scheduling
    /// Checks upload queue ordering: layer uploads start immediately, an index upload scheduled
    /// behind them is queued, and a deletion scheduled behind a layer upload stays queued until
    /// the queue has drained.
    #[tokio::test]
    async fn upload_scheduling() {
        // Test outline:
        //
        // Schedule upload of a bunch of layers. Check that they are started immediately, not queued
        // Schedule upload of index. Check that it is queued
        // let the layer file uploads finish. Check that the index-upload is now started
        // let the index-upload finish.
        //
        // Download back the index.json. Check that the list of files is correct
        //
        // Schedule upload. Schedule deletion. Check that the deletion is queued
        // let upload finish. Check that deletion is now started
        // Schedule another deletion. Check that it's launched immediately.
        // Schedule index upload. Check that it's queued

        let test_setup = TestSetup::new("upload_scheduling").await.unwrap();
        let span = test_setup.span();
        let _guard = span.enter();

        let TestSetup {
            harness,
            tenant: _tenant,
            timeline,
            tenant_ctx: _tenant_ctx,
        } = test_setup;

        let client = &timeline.remote_client;

        // Download back the index.json, and check that the list of files is correct
        let initial_index_part = match client
            .download_index_file(&CancellationToken::new())
            .await
            .unwrap()
        {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };
        let initial_layers = initial_index_part
            .layer_metadata
            .keys()
            .map(|f| f.to_owned())
            .collect::<HashSet<LayerName>>();
        let initial_layer = {
            assert!(initial_layers.len() == 1);
            initial_layers.into_iter().next().unwrap()
        };

        let timeline_path = harness.timeline_path(&TIMELINE_ID);

        println!("workdir: {}", harness.conf.workdir);

        // The remote directory mirrors the timeline's path relative to the workdir.
        let remote_timeline_dir = harness
            .remote_fs_dir
            .join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
        println!("remote_timeline_dir: {remote_timeline_dir}");

        let generation = harness.generation;
        let shard = harness.shard;

        // Create a couple of dummy files, schedule upload for them

        let layers = [
            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), dummy_contents("foo")),
            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap(), dummy_contents("bar")),
            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz"))
        ]
        .into_iter()
        .map(|(name, contents): (LayerName, Vec<u8>)| {
            let local_path = local_layer_path(
                harness.conf,
                &timeline.tenant_shard_id,
                &timeline.timeline_id,
                &name,
                &generation,
            );
            std::fs::write(&local_path, &contents).unwrap();

            Layer::for_resident(
                harness.conf,
                &timeline,
                local_path,
                name,
                LayerFileMetadata::new(contents.len() as u64, generation, shard),
            )
        }).collect::<Vec<_>>();

        client
            .schedule_layer_file_upload(layers[0].clone())
            .unwrap();
        client
            .schedule_layer_file_upload(layers[1].clone())
            .unwrap();

        // Check that they are started immediately, not queued
        //
        // this works because we running within block_on, so any futures are now queued up until
        // our next await point.
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
            assert!(upload_queue.queued_operations.is_empty());
            assert_eq!(upload_queue.inprogress_tasks.len(), 2);
            assert_eq!(upload_queue.num_inprogress_layer_uploads(), 2);

            // also check that `latest_file_changes` was updated
            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2);
        }

        // Schedule upload of index. Check that it is queued
        let metadata = dummy_metadata(Lsn(0x20));
        client
            .schedule_index_upload_for_full_metadata_update(&metadata)
            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
            assert!(upload_queue.queued_operations.len() == 1);
            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
        }

        // Wait for the uploads to finish
        client.wait_completion().await.unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();

            assert!(upload_queue.queued_operations.is_empty());
            assert!(upload_queue.inprogress_tasks.is_empty());
        }

        // Download back the index.json, and check that the list of files is correct
        let index_part = match client
            .download_index_file(&CancellationToken::new())
            .await
            .unwrap()
        {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };

        assert_file_list(
            &index_part
                .layer_metadata
                .keys()
                .map(|f| f.to_owned())
                .collect(),
            &[
                &initial_layer.to_string(),
                &layers[0].layer_desc().layer_name().to_string(),
                &layers[1].layer_desc().layer_name().to_string(),
            ],
        );
        assert_eq!(index_part.metadata, metadata);

        // Schedule upload and then a deletion. Check that the deletion is queued
        client
            .schedule_layer_file_upload(layers[2].clone())
            .unwrap();

        // this is no longer consistent with how deletion works with Layer::drop, but in this test
        // keep using schedule_layer_file_deletion because we don't have a way to wait for the
        // spawn_blocking started by the drop.
        client
            .schedule_layer_file_deletion(&[layers[0].layer_desc().layer_name()])
            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();

            // Deletion schedules upload of the index file, and the file deletion itself
            assert_eq!(upload_queue.queued_operations.len(), 2);
            assert_eq!(upload_queue.inprogress_tasks.len(), 1);
            assert_eq!(upload_queue.num_inprogress_layer_uploads(), 1);
            assert_eq!(upload_queue.num_inprogress_deletions(), 0);
            assert_eq!(
                upload_queue.latest_files_changes_since_metadata_upload_scheduled,
                0
            );
        }
        // Nothing scheduled above has reached remote storage yet: layers[0] is still
        // there and layers[2] is not.
        assert_remote_files(
            &[
                &initial_layer.to_string(),
                &layers[0].layer_desc().layer_name().to_string(),
                &layers[1].layer_desc().layer_name().to_string(),
                "index_part.json",
            ],
            &remote_timeline_dir,
            generation,
        );

        // Finish them
        client.wait_completion().await.unwrap();
        harness.deletion_queue.pump().await;

        assert_remote_files(
            &[
                &initial_layer.to_string(),
                &layers[1].layer_desc().layer_name().to_string(),
                &layers[2].layer_desc().layer_name().to_string(),
                "index_part.json",
            ],
            &remote_timeline_dir,
            generation,
        );
    }

    /// Checks that the bytes-started and bytes-finished counters for layer uploads advance by
    /// the layer's size when the upload is scheduled and when it completes, respectively.
    #[tokio::test]
    async fn bytes_unfinished_gauge_for_layer_file_uploads() {
        // Setup

        let TestSetup {
            harness,
            tenant: _tenant,
            timeline,
            ..
        } = TestSetup::new("metrics").await.unwrap();
        let client = &timeline.remote_client;

        let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
        let local_path = local_layer_path(
            harness.conf,
            &timeline.tenant_shard_id,
            &timeline.timeline_id,
            &layer_file_name_1,
            &harness.generation,
        );
        let content_1 = dummy_contents("foo");
        std::fs::write(&local_path, &content_1).unwrap();

        let layer_file_1 = Layer::for_resident(
            harness.conf,
            &timeline,
            local_path,
            layer_file_name_1.clone(),
            LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard),
        );

        // Snapshot of the two counters; `None` means the counter does not exist (yet).
        #[derive(Debug, PartialEq, Clone, Copy)]
        struct BytesStartedFinished {
            started: Option<usize>,
            finished: Option<usize>,
        }
        impl std::ops::Add for BytesStartedFinished {
            type Output = Self;
            // A `None` on the left stays `None`; a `None` on the right counts as 0.
            fn add(self, rhs: Self) -> Self::Output {
                Self {
                    started: self.started.map(|v| v + rhs.started.unwrap_or(0)),
                    finished: self.finished.map(|v| v + rhs.finished.unwrap_or(0)),
                }
            }
        }
        let get_bytes_started_stopped = || {
            let started = client
                .metrics
                .get_bytes_started_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
                .map(|v| v.try_into().unwrap());
            let stopped = client
                .metrics
                .get_bytes_finished_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
                .map(|v| v.try_into().unwrap());
            BytesStartedFinished {
                started,
                finished: stopped,
            }
        };

        // Test
        tracing::info!("now doing actual test");

        let actual_a = get_bytes_started_stopped();

        client
            .schedule_layer_file_upload(layer_file_1.clone())
            .unwrap();

        let actual_b = get_bytes_started_stopped();

        client.wait_completion().await.unwrap();

        let actual_c = get_bytes_started_stopped();

        // Validate

        let expected_b = actual_a
            + BytesStartedFinished {
                started: Some(content_1.len()),
                // assert that the _finished metric is created eagerly so that subtractions work on first sample
                finished: Some(0),
            };
        assert_eq!(actual_b, expected_b);

        let expected_c = actual_a
            + BytesStartedFinished {
                started: Some(content_1.len()),
                finished: Some(content_1.len()),
            };
        assert_eq!(actual_c, expected_c);
    }

    /// Writes a serialized `IndexPart::example()` into the harness' `remote_fs_dir`, at the
    /// index path for `generation`, and returns the `IndexPart` that was written.
    async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
        // An empty IndexPart, just sufficient to ensure deserialization will succeed
        let example_index_part = IndexPart::example();

        let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();

        let index_path = test_state.harness.remote_fs_dir.join(
            remote_index_path(
                &test_state.harness.tenant_shard_id,
                &TIMELINE_ID,
                generation,
            )
            .get_path(),
        );

        std::fs::create_dir_all(index_path.parent().unwrap())
            .expect("creating test dir should work");

        eprintln!("Writing {index_path}");
        std::fs::write(&index_path, index_part_bytes).unwrap();
        example_index_part
    }

    /// Assert that when a RemoteTimelineClient in generation `get_generation` fetches its
    /// index, the IndexPart returned is equal to `expected`
    async fn assert_got_index_part(
        test_state: &TestSetup,
        get_generation: Generation,
        expected: &IndexPart,
    ) {
        let client = test_state.build_client(get_generation);

        let download_r = client
            .download_index_file(&CancellationToken::new())
            .await
            .expect("download should always succeed");
        assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
        match download_r {
            MaybeDeletedIndexPart::IndexPart(index_part) => {
                assert_eq!(&index_part, expected);
            }
            MaybeDeletedIndexPart::Deleted(_index_part) => panic!("Test doesn't set deleted_at"),
        }
    }

    /// A client in generation N finds the index written in generation N - 1.
    #[tokio::test]
    async fn index_part_download_simple() -> anyhow::Result<()> {
        let test_state = TestSetup::new("index_part_download_simple").await.unwrap();
        let span = test_state.span();
        let _guard = span.enter();

        // Simple case: we are in generation N, load the index from generation N - 1
        let generation_n = 5;
        let injected = inject_index_part(&test_state, Generation::new(generation_n - 1)).await;

        assert_got_index_part(&test_state, Generation::new(generation_n), &injected).await;

        Ok(())
    }

    /// Checks which index a client in generation N picks when indices from several
    /// generations exist side by side.
    #[tokio::test]
    async fn index_part_download_ordering() -> anyhow::Result<()> {
        let test_state = TestSetup::new("index_part_download_ordering")
            .await
            .unwrap();

        let span = test_state.span();
        let _guard = span.enter();

        // A generation-less IndexPart exists in the bucket, we should find it
        let generation_n = 5;
        let injected_none = inject_index_part(&test_state, Generation::none()).await;
        assert_got_index_part(&test_state, Generation::new(generation_n), &injected_none).await;

        // If a more recent-than-none generation exists, we should prefer to load that
        let injected_1 = inject_index_part(&test_state, Generation::new(1)).await;
        assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await;

        // If a more-recent-than-me generation exists, we should ignore it.
        let _injected_10 = inject_index_part(&test_state, Generation::new(10)).await;
        assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await;

        // If a directly previous generation exists, _and_ an index exists in my own
        // generation, I should prefer my own generation.
let _injected_prev = inject_index_part(&test_state, Generation::new(generation_n - 1)).await; let injected_current = inject_index_part(&test_state, Generation::new(generation_n)).await; assert_got_index_part( &test_state, Generation::new(generation_n), &injected_current, ) .await; Ok(()) } } ================================================ FILE: pageserver/src/tenant/secondary/downloader.rs ================================================ use std::collections::{HashMap, HashSet}; use std::pin::Pin; use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant, SystemTime}; use crate::metrics::{STORAGE_IO_SIZE, StorageIoSizeOperation}; use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; use metrics::UIntGauge; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; use remote_storage::{DownloadError, DownloadKind, DownloadOpts, Etag, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; use tracing::{Instrument, info_span, instrument, warn}; use utils::completion::Barrier; use utils::crashsafe::path_with_suffix_extension; use utils::id::TimelineId; use utils::{backoff, failpoint_support, fs_ext, pausable_failpoint, serde_system_time}; use super::heatmap::{HeatMapLayer, HeatMapTenant, HeatMapTimeline}; use super::scheduler::{ self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs, period_jitter, period_warmup, }; use super::{ CommandRequest, DownloadCommand, GetTenantError, SecondaryTenant, SecondaryTenantError, }; use crate::TEMP_FILE_SUFFIX; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::disk_usage_eviction_task::{ DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, finite_f32, }; use crate::metrics::SECONDARY_MODE; use crate::tenant::config::SecondaryLocationConfig; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use 
crate::tenant::ephemeral_file::is_ephemeral_file; use crate::tenant::mgr::TenantManager; use crate::tenant::remote_timeline_client::download::download_layer_file; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; use crate::tenant::remote_timeline_client::{ FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, is_temp_download_file, remote_heatmap_path, }; use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::layer::local_layer_path; use crate::tenant::storage_layer::{LayerName, LayerVisibilityHint}; use crate::tenant::tasks::{BackgroundLoopKind, warn_when_period_overrun}; use crate::virtual_file::{MaybeFatalIo, VirtualFile, on_fatal_io_error}; /// For each tenant, default period for how long must have passed since the last download_tenant call before /// calling it again. This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first /// download, if the uploader populated it. const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000); pub(super) async fn downloader_task( tenant_manager: Arc, remote_storage: GenericRemoteStorage, command_queue: tokio::sync::mpsc::Receiver>, background_jobs_can_start: Barrier, cancel: CancellationToken, root_ctx: RequestContext, ) { let concurrency = tenant_manager.get_conf().secondary_download_concurrency; let generator = SecondaryDownloader { tenant_manager, remote_storage, root_ctx, }; let mut scheduler = Scheduler::new(generator, concurrency); scheduler .run(command_queue, background_jobs_can_start, cancel) .instrument(info_span!("secondary_download_scheduler")) .await } struct SecondaryDownloader { tenant_manager: Arc, remote_storage: GenericRemoteStorage, root_ctx: RequestContext, } #[derive(Debug, Clone)] pub(super) struct OnDiskState { metadata: LayerFileMetadata, access_time: SystemTime, local_path: Utf8PathBuf, } impl OnDiskState { fn new( _conf: &'static PageServerConf, _tenant_shard_id: &TenantShardId, 
_imeline_id: &TimelineId, _ame: LayerName, metadata: LayerFileMetadata, access_time: SystemTime, local_path: Utf8PathBuf, ) -> Self { Self { metadata, access_time, local_path, } } // This is infallible, because all errors are either acceptable (ENOENT), or totally // unexpected (fatal). pub(super) fn remove_blocking(&self) { // We tolerate ENOENT, because between planning eviction and executing // it, the secondary downloader could have seen an updated heatmap that // resulted in a layer being deleted. // Other local I/O errors are process-fatal: these should never happen. std::fs::remove_file(&self.local_path) .or_else(fs_ext::ignore_not_found) .fatal_err("Deleting secondary layer") } pub(crate) fn file_size(&self) -> u64 { self.metadata.file_size } } pub(super) struct SecondaryDetailTimeline { on_disk_layers: HashMap, /// We remember when layers were evicted, to prevent re-downloading them. pub(super) evicted_at: HashMap, ctx: RequestContext, } impl Clone for SecondaryDetailTimeline { fn clone(&self) -> Self { Self { on_disk_layers: self.on_disk_layers.clone(), evicted_at: self.evicted_at.clone(), // This is a bit awkward. The downloader code operates on a snapshot // of the secondary list to avoid locking it for extended periods of time. // No particularly strong reason to chose [`RequestContext::detached_child`], // but makes more sense than [`RequestContext::attached_child`]. 
ctx: self .ctx .detached_child(self.ctx.task_kind(), self.ctx.download_behavior()), } } } impl std::fmt::Debug for SecondaryDetailTimeline { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("SecondaryDetailTimeline") .field("on_disk_layers", &self.on_disk_layers) .field("evicted_at", &self.evicted_at) .finish() } } impl SecondaryDetailTimeline { pub(super) fn empty(ctx: RequestContext) -> Self { SecondaryDetailTimeline { on_disk_layers: Default::default(), evicted_at: Default::default(), ctx, } } pub(super) fn context(&self) -> &RequestContext { &self.ctx } pub(super) fn remove_layer( &mut self, name: &LayerName, resident_metric: &UIntGauge, ) -> Option { let removed = self.on_disk_layers.remove(name); if let Some(removed) = &removed { resident_metric.sub(removed.file_size()); } removed } /// `local_path` fn touch_layer( &mut self, conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, touched: &HeatMapLayer, resident_metric: &UIntGauge, local_path: F, ) where F: FnOnce() -> Utf8PathBuf, { use std::collections::hash_map::Entry; match self.on_disk_layers.entry(touched.name.clone()) { Entry::Occupied(mut v) => { v.get_mut().access_time = touched.access_time; } Entry::Vacant(e) => { e.insert(OnDiskState::new( conf, tenant_shard_id, timeline_id, touched.name.clone(), touched.metadata.clone(), touched.access_time, local_path(), )); resident_metric.add(touched.metadata.file_size); } } } } // Aspects of a heatmap that we remember after downloading it #[derive(Clone, Debug)] struct DownloadSummary { etag: Etag, #[allow(unused)] mtime: SystemTime, upload_period: Duration, } /// This state is written by the secondary downloader, it is opaque /// to TenantManager #[derive(Debug)] pub(super) struct SecondaryDetail { pub(super) config: SecondaryLocationConfig, last_download: Option, next_download: Option, timelines: HashMap, } /// Helper for logging SystemTime fn strftime(t: &'_ SystemTime) -> DelayedFormat> 
{ let datetime: chrono::DateTime = (*t).into(); datetime.format("%d/%m/%Y %T") } /// Information returned from download function when it detects the heatmap has changed struct HeatMapModified { etag: Etag, last_modified: SystemTime, bytes: Vec, } enum HeatMapDownload { // The heatmap's etag has changed: return the new etag, mtime and the body bytes Modified(HeatMapModified), // The heatmap's etag is unchanged Unmodified, } impl SecondaryDetail { pub(super) fn new(config: SecondaryLocationConfig) -> Self { Self { config, last_download: None, next_download: None, timelines: HashMap::new(), } } #[cfg(feature = "testing")] pub(crate) fn total_resident_size(&self) -> u64 { self.timelines .values() .map(|tl| { tl.on_disk_layers .values() .map(|v| v.metadata.file_size) .sum::() }) .sum::() } pub(super) fn evict_layer( &mut self, name: LayerName, timeline_id: &TimelineId, now: SystemTime, resident_metric: &UIntGauge, ) -> Option { let timeline = self.timelines.get_mut(timeline_id)?; let removed = timeline.remove_layer(&name, resident_metric); if removed.is_some() { timeline.evicted_at.insert(name, now); } removed } pub(super) fn remove_timeline( &mut self, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, resident_metric: &UIntGauge, ) { let removed = self.timelines.remove(timeline_id); if let Some(removed) = removed { Self::clear_timeline_metrics(tenant_shard_id, timeline_id, removed, resident_metric); } } pub(super) fn drain_timelines( &mut self, tenant_shard_id: &TenantShardId, resident_metric: &UIntGauge, ) { for (timeline_id, removed) in self.timelines.drain() { Self::clear_timeline_metrics(tenant_shard_id, &timeline_id, removed, resident_metric); } } fn clear_timeline_metrics( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, detail: SecondaryDetailTimeline, resident_metric: &UIntGauge, ) { resident_metric.sub( detail .on_disk_layers .values() .map(|l| l.metadata.file_size) .sum(), ); let shard_id = format!("{}", tenant_shard_id.shard_slug()); let 
tenant_id = tenant_shard_id.tenant_id.to_string(); let timeline_id = timeline_id.to_string(); for op in StorageIoSizeOperation::VARIANTS { let _ = STORAGE_IO_SIZE.remove_label_values(&[ op, tenant_id.as_str(), shard_id.as_str(), timeline_id.as_str(), ]); } } /// Additionally returns the total number of layers, used for more stable relative access time /// based eviction. pub(super) fn get_layers_for_eviction( &self, parent: &Arc, ) -> (DiskUsageEvictionInfo, usize) { let mut result = DiskUsageEvictionInfo::default(); let mut total_layers = 0; for (timeline_id, timeline_detail) in &self.timelines { result .resident_layers .extend(timeline_detail.on_disk_layers.iter().map(|(name, ods)| { EvictionCandidate { layer: EvictionLayer::Secondary(EvictionSecondaryLayer { secondary_tenant: parent.clone(), timeline_id: *timeline_id, name: name.clone(), metadata: ods.metadata.clone(), }), last_activity_ts: ods.access_time, relative_last_activity: finite_f32::FiniteF32::ZERO, // Secondary location layers are presumed visible, because Covered layers // are excluded from the heatmap visibility: LayerVisibilityHint::Visible, } })); // total might be missing currently downloading layers, but as a lower than actual // value it is good enough approximation. 
total_layers += timeline_detail.on_disk_layers.len() + timeline_detail.evicted_at.len(); } result.max_layer_size = result .resident_layers .iter() .map(|l| l.layer.get_file_size()) .max(); tracing::debug!( "eviction: secondary tenant {} found {} timelines, {} layers", parent.get_tenant_shard_id(), self.timelines.len(), result.resident_layers.len() ); (result, total_layers) } } struct PendingDownload { secondary_state: Arc, last_download: Option, target_time: Option, } impl scheduler::PendingJob for PendingDownload { fn get_tenant_shard_id(&self) -> &TenantShardId { self.secondary_state.get_tenant_shard_id() } } struct RunningDownload { barrier: Barrier, } impl scheduler::RunningJob for RunningDownload { fn get_barrier(&self) -> Barrier { self.barrier.clone() } } struct CompleteDownload { secondary_state: Arc, completed_at: Instant, result: Result<(), UpdateError>, } impl scheduler::Completion for CompleteDownload { fn get_tenant_shard_id(&self) -> &TenantShardId { self.secondary_state.get_tenant_shard_id() } } type Scheduler = TenantBackgroundJobs< SecondaryDownloader, PendingDownload, RunningDownload, CompleteDownload, DownloadCommand, >; impl JobGenerator for SecondaryDownloader { #[instrument(skip_all, fields(tenant_id=%completion.get_tenant_shard_id().tenant_id, shard_id=%completion.get_tenant_shard_id().shard_slug()))] fn on_completion(&mut self, completion: CompleteDownload) { let CompleteDownload { secondary_state, completed_at: _completed_at, result, } = completion; tracing::debug!("Secondary tenant download completed"); let mut detail = secondary_state.detail.lock().unwrap(); match result { Err(UpdateError::Restart) => { // Start downloading again as soon as we can. This will involve waiting for the scheduler's // scheduling interval. 
This slightly reduces the peak download speed of tenants that hit their // deadline and keep restarting, but that also helps give other tenants a chance to execute rather // that letting one big tenant dominate for a long time. detail.next_download = Some(Instant::now()); } _ => { let period = detail .last_download .as_ref() .map(|d| d.upload_period) .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL); // We advance next_download irrespective of errors: we don't want error cases to result in // expensive busy-polling. detail.next_download = Some(Instant::now() + period_jitter(period, 5)); } } } async fn schedule(&mut self) -> SchedulingResult { let mut result = SchedulingResult { jobs: Vec::new(), want_interval: None, }; // Step 1: identify some tenants that we may work on let mut tenants: Vec> = Vec::new(); self.tenant_manager .foreach_secondary_tenants(|_id, secondary_state| { tenants.push(secondary_state.clone()); }); // Step 2: filter out tenants which are not yet elegible to run let now = Instant::now(); result.jobs = tenants .into_iter() .filter_map(|secondary_tenant| { let (last_download, next_download) = { let mut detail = secondary_tenant.detail.lock().unwrap(); if !detail.config.warm { // Downloads are disabled for this tenant detail.next_download = None; return None; } if detail.next_download.is_none() { // Initialize randomly in the range from 0 to our interval: this uniformly spreads the start times. Subsequent // rounds will use a smaller jitter to avoid accidentally synchronizing later. detail.next_download = Some(now.checked_add(period_warmup(DEFAULT_DOWNLOAD_INTERVAL)).expect( "Using our constant, which is known to be small compared with clock range", )); } (detail.last_download.clone(), detail.next_download.unwrap()) }; if now > next_download { Some(PendingDownload { secondary_state: secondary_tenant, last_download, target_time: Some(next_download), }) } else { None } }) .collect(); // Step 3: sort by target execution time to run most urgent first. 
result.jobs.sort_by_key(|j| j.target_time); result } fn on_command( &mut self, command: DownloadCommand, ) -> Result { let tenant_shard_id = command.get_tenant_shard_id(); let tenant = self .tenant_manager .get_secondary_tenant_shard(*tenant_shard_id) .ok_or(GetTenantError::ShardNotFound(*tenant_shard_id))?; Ok(PendingDownload { target_time: None, last_download: None, secondary_state: tenant, }) } fn spawn( &mut self, job: PendingDownload, ) -> ( RunningDownload, Pin + Send>>, ) { let PendingDownload { secondary_state, last_download, target_time, } = job; let (completion, barrier) = utils::completion::channel(); let remote_storage = self.remote_storage.clone(); let conf = self.tenant_manager.get_conf(); let tenant_shard_id = *secondary_state.get_tenant_shard_id(); let download_ctx = self .root_ctx .attached_child() .with_scope_secondary_tenant(&tenant_shard_id); (RunningDownload { barrier }, Box::pin(async move { let _completion = completion; let result = TenantDownloader::new(conf, &remote_storage, &secondary_state) .download(&download_ctx) .await; match &result { Err(UpdateError::NoData) => { tracing::info!("No heatmap found for tenant. This is fine if it is new."); }, Err(UpdateError::NoSpace) => { tracing::warn!("Insufficient space while downloading. Will retry later."); } Err(UpdateError::Cancelled) => { tracing::info!("Shut down while downloading"); }, Err(UpdateError::Deserialize(e)) => { tracing::error!("Corrupt content while downloading tenant: {e}"); }, Err(e @ (UpdateError::DownloadError(_) | UpdateError::Other(_))) => { tracing::error!("Error while downloading tenant: {e}"); }, Err(UpdateError::Restart) => { tracing::info!("Download reached deadline & will restart to update heatmap") } Ok(()) => {} }; // Irrespective of the result, we will reschedule ourselves to run after our usual period. // If the job had a target execution time, we may check our final execution // time against that for observability purposes. 
if let (Some(target_time), Some(last_download)) = (target_time, last_download) { // Elapsed time includes any scheduling lag as well as the execution of the job let elapsed = Instant::now().duration_since(target_time); warn_when_period_overrun( elapsed, last_download.upload_period, BackgroundLoopKind::SecondaryDownload, ); } CompleteDownload { secondary_state, completed_at: Instant::now(), result } }.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) } } enum LayerAction { Download, NoAction, Skip, Touch, } /// This type is a convenience to group together the various functions involved in /// freshening a secondary tenant. struct TenantDownloader<'a> { conf: &'static PageServerConf, remote_storage: &'a GenericRemoteStorage, secondary_state: &'a SecondaryTenant, } /// Errors that may be encountered while updating a tenant #[derive(thiserror::Error, Debug)] enum UpdateError { /// This is not a true failure, but it's how a download indicates that it would like to be restarted by /// the scheduler, to pick up the latest heatmap #[error("Reached deadline, restarting downloads")] Restart, #[error("No remote data found")] NoData, #[error("Insufficient local storage space")] NoSpace, #[error("Failed to download: {0}")] DownloadError(DownloadError), #[error(transparent)] Deserialize(#[from] serde_json::Error), #[error("Cancelled")] Cancelled, #[error(transparent)] Other(#[from] anyhow::Error), } impl From for UpdateError { fn from(value: DownloadError) -> Self { match &value { DownloadError::Cancelled => Self::Cancelled, DownloadError::NotFound => Self::NoData, _ => Self::DownloadError(value), } } } impl From for UpdateError { fn from(value: std::io::Error) -> Self { if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::Errno::from_raw) { UpdateError::NoSpace } else if value .get_ref() .and_then(|x| x.downcast_ref::()) .is_some() { 
UpdateError::from(DownloadError::from(value)) } else { // An I/O error from e.g. tokio::io::copy_buf is most likely a remote storage issue UpdateError::Other(anyhow::anyhow!(value)) } } } impl<'a> TenantDownloader<'a> { fn new( conf: &'static PageServerConf, remote_storage: &'a GenericRemoteStorage, secondary_state: &'a SecondaryTenant, ) -> Self { Self { conf, remote_storage, secondary_state, } } async fn download(&self, ctx: &RequestContext) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_id(); // For the duration of a download, we must hold the SecondaryTenant::gate, to ensure // cover our access to local storage. let Ok(_guard) = self.secondary_state.gate.enter() else { // Shutting down return Err(UpdateError::Cancelled); }; let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); // We will use the etag from last successful download to make the download conditional on changes let last_download = self .secondary_state .detail .lock() .unwrap() .last_download .clone(); // Download the tenant's heatmap let HeatMapModified { last_modified: heatmap_mtime, etag: heatmap_etag, bytes: heatmap_bytes, } = match tokio::select!( bytes = self.download_heatmap(last_download.as_ref().map(|d| &d.etag)) => {bytes?}, _ = self.secondary_state.cancel.cancelled() => return Ok(()) ) { HeatMapDownload::Unmodified => { tracing::info!("Heatmap unchanged since last successful download"); return Ok(()); } HeatMapDownload::Modified(m) => m, }; // Heatmap storage location let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id); let last_heatmap = if last_download.is_none() { match load_heatmap(&heatmap_path, ctx).await { Ok(htm) => htm, Err(e) => { tracing::warn!("Couldn't load heatmap from {heatmap_path}: {e:?}"); None } } } else { None }; let last_heatmap_timelines = last_heatmap.as_ref().map(|htm| { htm.timelines .iter() .map(|tl| (tl.timeline_id, tl)) .collect::>() }); let heatmap = serde_json::from_slice::(&heatmap_bytes)?; let temp_path = 
path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}"); let heatmap_path_bg = heatmap_path.clone(); VirtualFile::crashsafe_overwrite(heatmap_path_bg, temp_path, heatmap_bytes) .await .maybe_fatal_err(&context_msg)?; tracing::debug!( "Wrote local heatmap to {}, with {} timelines", heatmap_path, heatmap.timelines.len() ); // Get or initialize the local disk state for the timelines we will update let mut timeline_states = HashMap::new(); for timeline in &heatmap.timelines { let timeline_state = self .secondary_state .detail .lock() .unwrap() .timelines .get(&timeline.timeline_id) .cloned(); let timeline_state = match timeline_state { Some(t) => t, None => { let last_heatmap = last_heatmap_timelines .as_ref() .and_then(|last_heatmap_timelines| { last_heatmap_timelines.get(&timeline.timeline_id).copied() }); // We have no existing state: need to scan local disk for layers first. let timeline_state = init_timeline_state( self.conf, tenant_shard_id, last_heatmap, timeline, &self.secondary_state.resident_size_metric, ctx, ) .await; // Re-acquire detail lock now that we're done with async load from local FS self.secondary_state .detail .lock() .unwrap() .timelines .insert(timeline.timeline_id, timeline_state.clone()); timeline_state } }; timeline_states.insert(timeline.timeline_id, timeline_state); } // Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general // principle that deletions should be done before writes wherever possible, and so that we can use this // phase to initialize our SecondaryProgress. { *self.secondary_state.progress.lock().unwrap() = self.prepare_timelines(&heatmap, heatmap_mtime).await?; } // Calculate a deadline for downloads: if downloading takes longer than this, it is useful to drop out and start again, // so that we are always using reasonably a fresh heatmap. 
Otherwise, if we had really huge content to download, we might // spend 10s of minutes downloading layers we don't need. // (see https://github.com/neondatabase/neon/issues/8182) let deadline = { let period = self .secondary_state .detail .lock() .unwrap() .last_download .as_ref() .map(|d| d.upload_period) .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL); // Use double the period: we are not promising to complete within the period, this is just a heuristic // to keep using a "reasonably fresh" heatmap. Instant::now() + period * 2 }; // Download the layers in the heatmap for timeline in heatmap.timelines { let timeline_state = timeline_states .remove(&timeline.timeline_id) .expect("Just populated above"); if self.secondary_state.cancel.is_cancelled() { tracing::debug!( "Cancelled before downloading timeline {}", timeline.timeline_id ); return Ok(()); } let timeline_id = timeline.timeline_id; self.download_timeline(timeline, timeline_state, deadline, ctx) .instrument(tracing::info_span!( "secondary_download_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id )) .await?; } // Metrics consistency check in testing builds self.secondary_state.validate_metrics(); // Only update last_etag after a full successful download: this way will not skip // the next download, even if the heatmap's actual etag is unchanged. self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary { etag: heatmap_etag, mtime: heatmap_mtime, upload_period: heatmap .upload_period_ms .map(|ms| Duration::from_millis(ms as u64)) .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL), }); // Robustness: we should have updated progress properly, but in case we didn't, make sure // we don't leave the tenant in a state where we claim to have successfully downloaded // everything, but our progress is incomplete. 
The invariant here should be that if // we have set `last_download` to this heatmap's etag, then the next time we see that // etag we can safely do no work (i.e. we must be complete). let mut progress = self.secondary_state.progress.lock().unwrap(); debug_assert!(progress.layers_downloaded == progress.layers_total); debug_assert!(progress.bytes_downloaded == progress.bytes_total); if progress.layers_downloaded != progress.layers_total || progress.bytes_downloaded != progress.bytes_total { tracing::warn!("Correcting drift in progress stats ({progress:?})"); progress.layers_downloaded = progress.layers_total; progress.bytes_downloaded = progress.bytes_total; } Ok(()) } /// Do any fast local cleanup that comes before the much slower process of downloading /// layers from remote storage. In the process, initialize the SecondaryProgress object /// that will later be updated incrementally as we download layers. async fn prepare_timelines( &self, heatmap: &HeatMapTenant, heatmap_mtime: SystemTime, ) -> Result { let heatmap_stats = heatmap.get_stats(); // We will construct a progress object, and then populate its initial "downloaded" numbers // while iterating through local layer state in [`Self::prepare_timelines`] let mut progress = SecondaryProgress { layers_total: heatmap_stats.layers, bytes_total: heatmap_stats.bytes, heatmap_mtime: Some(serde_system_time::SystemTime(heatmap_mtime)), layers_downloaded: 0, bytes_downloaded: 0, }; // Also expose heatmap bytes_total as a metric self.secondary_state .heatmap_total_size_metric .set(heatmap_stats.bytes); // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock let mut delete_layers = Vec::new(); let mut delete_timelines = Vec::new(); { let mut detail = self.secondary_state.detail.lock().unwrap(); for (timeline_id, timeline_state) in &mut detail.timelines { let Some(heatmap_timeline_index) = heatmap .timelines .iter() .position(|t| t.timeline_id == *timeline_id) else { // 
This timeline is no longer referenced in the heatmap: delete it locally delete_timelines.push(*timeline_id); continue; }; let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap(); let layers_in_heatmap = heatmap_timeline .hot_layers() .map(|l| (&l.name, l.metadata.generation)) .collect::>(); let layers_on_disk = timeline_state .on_disk_layers .iter() .map(|l| (l.0, l.1.metadata.generation)) .collect::>(); let mut layer_count = layers_on_disk.len(); let mut layer_byte_count: u64 = timeline_state .on_disk_layers .values() .map(|l| l.metadata.file_size) .sum(); // Remove on-disk layers that are no longer present in heatmap for (layer_file_name, generation) in layers_on_disk.difference(&layers_in_heatmap) { layer_count -= 1; layer_byte_count -= timeline_state .on_disk_layers .get(layer_file_name) .unwrap() .metadata .file_size; let local_path = local_layer_path( self.conf, self.secondary_state.get_tenant_shard_id(), timeline_id, layer_file_name, generation, ); delete_layers.push((*timeline_id, (*layer_file_name).clone(), local_path)); } progress.bytes_downloaded += layer_byte_count; progress.layers_downloaded += layer_count; } for delete_timeline in &delete_timelines { // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal // from disk fails that will be a fatal error. 
detail.remove_timeline( self.secondary_state.get_tenant_shard_id(), delete_timeline, &self.secondary_state.resident_size_metric, ); } } // Execute accumulated deletions for (timeline_id, layer_name, local_path) in delete_layers { tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",); tokio::fs::remove_file(&local_path) .await .or_else(fs_ext::ignore_not_found) .maybe_fatal_err("Removing secondary layer")?; // Update in-memory housekeeping to reflect the absence of the deleted layer let mut detail = self.secondary_state.detail.lock().unwrap(); let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else { continue; }; timeline_state.remove_layer(&layer_name, &self.secondary_state.resident_size_metric); } for timeline_id in delete_timelines { let timeline_path = self .conf .timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id); tracing::info!(timeline_id=%timeline_id, "Timeline no longer in heatmap, removing from secondary location" ); tokio::fs::remove_dir_all(&timeline_path) .await .or_else(fs_ext::ignore_not_found) .maybe_fatal_err("Removing secondary timeline")?; } Ok(progress) } /// Returns downloaded bytes if the etag differs from `prev_etag`, or None if the object /// still matches `prev_etag`. 
///
/// More precisely, the outcome is reported as a [`HeatMapDownload`]:
/// - `HeatMapDownload::Unmodified` when remote storage answers `DownloadError::Unmodified`
///   for the conditional request built from `prev_etag`.
/// - `HeatMapDownload::Modified` carrying the new etag, the last-modified time and the
///   fully buffered body otherwise.
///
/// Errors:
/// - `UpdateError::Cancelled` if `backoff::retry` yields no result at all
///   (presumably because the tenant's cancellation token fired -- confirm against
///   `backoff::retry`).
/// - Any other `UpdateError` produced by the download attempt itself. `NoData` and
///   `Cancelled` are the variants singled out by the predicate handed to
///   `backoff::retry` (presumably meaning "permanent, do not retry").
///
/// On success (modified or not) the `SECONDARY_MODE.download_heatmap` counter is incremented.
async fn download_heatmap(
    &self,
    prev_etag: Option<&Etag>,
) -> Result<HeatMapDownload, UpdateError> {
    debug_assert_current_span_has_tenant_id();
    let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
    tracing::debug!("Downloading heatmap for secondary tenant",);

    let heatmap_path = remote_heatmap_path(tenant_shard_id);
    let cancel = &self.secondary_state.cancel;

    // Passing the previous etag makes the request conditional: if the remote object is
    // unchanged we get `DownloadError::Unmodified` back instead of a body.
    let opts = DownloadOpts {
        etag: prev_etag.cloned(),
        kind: DownloadKind::Small,
        ..Default::default()
    };

    let outcome: Option<Result<HeatMapDownload, UpdateError>> = backoff::retry(
        || async {
            let download = match self
                .remote_storage
                .download(&heatmap_path, &opts, cancel)
                .await
            {
                Ok(download) => download,
                // Not a failure: the heatmap simply has not changed since `prev_etag`.
                Err(DownloadError::Unmodified) => return Ok(HeatMapDownload::Unmodified),
                Err(err) => return Err(err.into()),
            };

            // Drain the streamed body into memory; I/O errors raised while streaming are
            // converted into `UpdateError` by `?`.
            let mut heatmap_bytes = Vec::new();
            let mut body = tokio_util::io::StreamReader::new(download.download_stream);
            let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;

            Ok(HeatMapDownload::Modified(HeatMapModified {
                etag: download.etag,
                last_modified: download.last_modified,
                bytes: heatmap_bytes,
            }))
        },
        |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        "download heatmap",
        cancel,
    )
    .await;

    match outcome {
        Some(Ok(heatmap)) => {
            // Count only attempts that produced an answer from remote storage.
            SECONDARY_MODE.download_heatmap.inc();
            Ok(heatmap)
        }
        Some(Err(e)) => Err(e),
        // The retry helper gave up without producing a result.
        None => Err(UpdateError::Cancelled),
    }
}

/// Download heatmap layers that are not present on local disk, or update their
/// access time if they are already present.
async fn download_timeline_layers(
    &self,
    tenant_shard_id: &TenantShardId,
    timeline: HeatMapTimeline,
    timeline_state: SecondaryDetailTimeline,
    deadline: Instant,
) -> (Result<(), UpdateError>, Vec) {
    // Accumulate updates to the state
    let mut touched = Vec::new();

    let timeline_id = timeline.timeline_id;
    // Only hot (non-cold) layers of the heatmap are considered.
    for layer in timeline.into_hot_layers() {
        if self.secondary_state.cancel.is_cancelled() {
            tracing::debug!("Cancelled -- dropping out of layer loop");
            return (Err(UpdateError::Cancelled), touched);
        }

        if Instant::now() > deadline {
            // We've been running downloads for a while, restart to download latest heatmap.
            return (Err(UpdateError::Restart), touched);
        }

        // Decide what to do with this layer; only `Download` falls through to the
        // download below, every other action moves on to the next layer.
        match self.layer_action(&timeline_state, &layer).await {
            LayerAction::Download => (),
            LayerAction::NoAction => continue,
            LayerAction::Skip => {
                self.skip_layer(layer);
                continue;
            }
            LayerAction::Touch => {
                touched.push(layer);
                continue;
            }
        }

        match self
            .download_layer(
                tenant_shard_id,
                &timeline_id,
                layer,
                timeline_state.context(),
            )
            .await
        {
            Ok(Some(layer)) => touched.push(layer),
            Ok(None) => {
                // Not an error but we didn't download it: remote layer is missing. Don't add it to the list of
                // things to consider touched.
            }
            Err(e) => {
                // Hand back what was touched so far together with the error.
                return (Err(e), touched);
            }
        }
    }

    (Ok(()), touched)
}

// Classify a heatmap layer against the local state of its timeline: download it,
// touch it (refresh recorded metadata/access time), skip it, or do nothing.
async fn layer_action(
    &self,
    timeline_state: &SecondaryDetailTimeline,
    layer: &HeatMapLayer,
) -> LayerAction {
    // Existing on-disk layers: just update their access time.
    if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
        tracing::debug!("Layer {} is already on disk", layer.name);

        if cfg!(debug_assertions) {
            // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
            // are already present on disk are really there.
            match tokio::fs::metadata(&on_disk.local_path).await {
                Ok(meta) => {
                    tracing::debug!(
                        "Layer {} present at {}, size {}",
                        layer.name,
                        on_disk.local_path,
                        meta.len(),
                    );
                }
                Err(e) => {
                    tracing::warn!(
                        "Layer {} not found at {} ({})",
                        layer.name,
                        on_disk.local_path,
                        e
                    );
                    debug_assert!(false);
                }
            }
        }

        // A different generation or file size means the local copy is not the layer
        // the heatmap refers to: fetch it again.
        if on_disk.metadata.generation_file_size() != layer.metadata.generation_file_size() {
            tracing::info!(
                "Re-downloading layer {} with changed size or generation: {:?}->{:?}",
                layer.name,
                on_disk.metadata.generation_file_size(),
                layer.metadata.generation_file_size()
            );
            return LayerAction::Download;
        }
        if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time {
            // We already have this layer on disk. Update its access time.
            tracing::debug!(
                "Access time updated for layer {}: {} -> {}",
                layer.name,
                strftime(&on_disk.access_time),
                strftime(&layer.access_time)
            );
            return LayerAction::Touch;
        }
        return LayerAction::NoAction;
    } else {
        tracing::debug!("Layer {} not present on disk yet", layer.name);
    }

    // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more
    // recently than it was evicted.
    if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) {
        if &layer.access_time > evicted_at {
            tracing::info!(
                "Re-downloading evicted layer {}, accessed at {}, evicted at {}",
                layer.name,
                strftime(&layer.access_time),
                strftime(evicted_at)
            );
        } else {
            tracing::trace!(
                "Not re-downloading evicted layer {}, accessed at {}, evicted at {}",
                layer.name,
                strftime(&layer.access_time),
                strftime(evicted_at)
            );
            return LayerAction::Skip;
        }
    }
    LayerAction::Download
}

// Run the layer downloads for one timeline, then record every touched layer in the
// shared secondary state. The result of the download pass is returned unchanged.
async fn download_timeline(
    &self,
    timeline: HeatMapTimeline,
    timeline_state: SecondaryDetailTimeline,
    deadline: Instant,
    ctx: &RequestContext,
) -> Result<(), UpdateError> {
    debug_assert_current_span_has_tenant_and_timeline_id();
    let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
    let timeline_id = timeline.timeline_id;

    tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.hot_layers().count());

    let (result, touched) = self
        .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline)
        .await;

    // Write updates to state to record layers we just downloaded or touched, irrespective of whether the overall result was successful
    {
        let mut detail = self.secondary_state.detail.lock().unwrap();
        // Create an empty per-timeline entry if this timeline is not tracked yet.
        let timeline_detail = detail.timelines.entry(timeline_id).or_insert_with(|| {
            let ctx = ctx.with_scope_secondary_timeline(tenant_shard_id, &timeline_id);
            SecondaryDetailTimeline::empty(ctx)
        });

        tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
        touched.into_iter().for_each(|t| {
            timeline_detail.touch_layer(
                self.conf,
                tenant_shard_id,
                &timeline_id,
                &t,
                &self.secondary_state.resident_size_metric,
                // Local path is supplied lazily through this closure.
                || {
                    local_layer_path(
                        self.conf,
                        tenant_shard_id,
                        &timeline_id,
                        &t.name,
                        &t.metadata.generation,
                    )
                },
            )
        });
    }

    result
}

/// Call this during timeline download if a layer will _not_ be downloaded, to update progress statistics
fn skip_layer(&self, layer: HeatMapLayer) {
    let mut progress = self.secondary_state.progress.lock().unwrap();
    // Saturating arithmetic: the totals never go below zero.
    progress.layers_total = progress.layers_total.saturating_sub(1);
    progress.bytes_total = progress
        .bytes_total
        .saturating_sub(layer.metadata.file_size);
}

// Download one layer file to its local path.
//
// Returns `Ok(None)` when remote storage reports the layer as not found, and
// `Ok(Some(layer))` otherwise; other download errors are converted and returned.
async fn download_layer(
    &self,
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    layer: HeatMapLayer,
    ctx: &RequestContext,
) -> Result, UpdateError> {
    // Failpoints for simulating slow remote storage
    failpoint_support::sleep_millis_async!(
        "secondary-layer-download-sleep",
        &self.secondary_state.cancel
    );

    pausable_failpoint!("secondary-layer-download-pausable");

    let local_path = local_layer_path(
        self.conf,
        tenant_shard_id,
        timeline_id,
        &layer.name,
        &layer.metadata.generation,
    );

    // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
    tracing::info!(
        "Starting download of layer {}, size {}",
        layer.name,
        layer.metadata.file_size
    );
    let downloaded_bytes = download_layer_file(
        self.conf,
        self.remote_storage,
        *tenant_shard_id,
        *timeline_id,
        &layer.name,
        &layer.metadata,
        &local_path,
        &self.secondary_state.gate,
        &self.secondary_state.cancel,
        ctx,
    )
    .await;

    let downloaded_bytes = match downloaded_bytes {
        Ok(bytes) => bytes,
        Err(DownloadError::NotFound) => {
            // A heatmap might be out of date and refer to a layer that doesn't exist any more.
            // This is harmless: continue to download the next layer. It is expected during compaction
            // GC.
            tracing::debug!(
                "Skipped downloading missing layer {}, raced with compaction/gc?",
                layer.name
            );
            self.skip_layer(layer);

            return Ok(None);
        }
        Err(e) => return Err(e.into()),
    };

    if downloaded_bytes != layer.metadata.file_size {
        let local_path = local_layer_path(
            self.conf,
            tenant_shard_id,
            timeline_id,
            &layer.name,
            &layer.metadata.generation,
        );

        tracing::warn!(
            "Downloaded layer {} with unexpected size {} != {}. 
Removing download.",
            layer.name,
            downloaded_bytes,
            layer.metadata.file_size
        );

        tokio::fs::remove_file(&local_path)
            .await
            .or_else(fs_ext::ignore_not_found)?;
        // NOTE(review): the file was just removed, yet we still fall through to
        // `Ok(Some(layer))` below, and download_timeline_layers pushes that layer onto
        // `touched`. Confirm that recording a removed layer as touched is intended.
    } else {
        tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
        let mut progress = self.secondary_state.progress.lock().unwrap();
        progress.bytes_downloaded += downloaded_bytes;
        progress.layers_downloaded += 1;
    }

    // Counted for both branches above (size match and size mismatch).
    SECONDARY_MODE.download_layer.inc();

    Ok(Some(layer))
}
}

/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
async fn init_timeline_state(
    conf: &'static PageServerConf,
    tenant_shard_id: &TenantShardId,
    last_heatmap: Option<&HeatMapTimeline>,
    heatmap: &HeatMapTimeline,
    resident_metric: &UIntGauge,
    ctx: &RequestContext,
) -> SecondaryDetailTimeline {
    let ctx = ctx.with_scope_secondary_timeline(tenant_shard_id, &heatmap.timeline_id);
    let mut detail = SecondaryDetailTimeline::empty(ctx);

    let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
    let mut dir = match tokio::fs::read_dir(&timeline_path).await {
        Ok(d) => d,
        Err(e) => {
            if e.kind() == std::io::ErrorKind::NotFound {
                let context = format!("Creating timeline directory {timeline_path}");
                tracing::info!("{}", context);
                tokio::fs::create_dir_all(&timeline_path)
                    .await
                    .fatal_err(&context);

                // No entries to report: drop out.
                return detail;
            } else {
                on_fatal_io_error(&e, &format!("Reading timeline dir {timeline_path}"));
            }
        }
    };

    // As we iterate through layers found on disk, we will look up their metadata from this map.
    // Layers not present in metadata will be discarded.
    let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
        heatmap.hot_layers().map(|l| (&l.name, l)).collect();

    // Same lookup for the previous heatmap, when one was provided; empty otherwise.
    let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
        if let Some(last_heatmap) = last_heatmap {
            last_heatmap.hot_layers().map(|l| (&l.name, l)).collect()
        } else {
            HashMap::new()
        };

    while let Some(dentry) = dir
        .next_entry()
        .await
        .fatal_err(&format!("Listing {timeline_path}"))
    {
        let Ok(file_path) = Utf8PathBuf::from_path_buf(dentry.path()) else {
            tracing::warn!("Malformed filename at {}", dentry.path().to_string_lossy());
            continue;
        };
        let local_meta = dentry
            .metadata()
            .await
            .fatal_err(&format!("Read metadata on {file_path}"));

        let file_name = file_path.file_name().expect("created it from the dentry");
        if crate::is_temporary(&file_path)
            || is_temp_download_file(&file_path)
            || is_ephemeral_file(file_name)
        {
            // Temporary files are frequently left behind from restarting during downloads
            tracing::info!("Cleaning up temporary file {file_path}");
            if let Err(e) = tokio::fs::remove_file(&file_path)
                .await
                .or_else(fs_ext::ignore_not_found)
            {
                // Best effort: a failed cleanup is logged, not fatal.
                tracing::error!("Failed to remove temporary file {file_path}: {e}");
            }
            continue;
        }

        match LayerName::from_str(file_name) {
            Ok(name) => {
                let remote_meta = heatmap_metadata.get(&name);
                let last_meta = last_heatmap_metadata.get(&name);
                let mut remove = false;
                match remote_meta {
                    Some(remote_meta) => {
                        // With no entry in the previous heatmap, fall back to the current
                        // heatmap's value so the comparison below is a no-op.
                        let last_meta_generation_file_size = last_meta
                            .map(|m| m.metadata.generation_file_size())
                            .unwrap_or(remote_meta.metadata.generation_file_size());
                        // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784)
                        if remote_meta.metadata.generation_file_size()
                            != last_meta_generation_file_size
                        {
                            tracing::info!(
                                "Removing local layer {name} as on-disk json metadata has different generation or file size from remote: {:?} -> {:?}",
                                last_meta_generation_file_size,
                                remote_meta.metadata.generation_file_size()
                            );
                            remove = true;
                        } else if local_meta.len() != remote_meta.metadata.file_size {
                            // This can happen in the presence of race conditions: the remote and on-disk metadata have changed, but we haven't had
                            // the chance yet to download the new layer to disk, before the process restarted.
                            tracing::info!(
                                "Removing local layer {name} with unexpected local size {} != {}",
                                local_meta.len(),
                                remote_meta.metadata.file_size
                            );
                            remove = true;
                        } else {
                            // We expect the access time to be initialized immediately afterwards, when
                            // the latest heatmap is applied to the state.
                            detail.touch_layer(
                                conf,
                                tenant_shard_id,
                                &heatmap.timeline_id,
                                remote_meta,
                                resident_metric,
                                || file_path,
                            );
                        }
                    }
                    None => {
                        // FIXME: consider some optimization when transitioning from attached to secondary: maybe
                        // wait until we have seen a heatmap that is more recent than the most recent on-disk state? Otherwise
                        // we will end up deleting any layers which were created+uploaded more recently than the heatmap.
                        tracing::info!(
                            "Removing secondary local layer {} because it's absent in heatmap",
                            name
                        );
                        remove = true;
                    }
                }
                if remove {
                    tokio::fs::remove_file(&dentry.path())
                        .await
                        .or_else(fs_ext::ignore_not_found)
                        .fatal_err(&format!(
                            "Removing layer {}",
                            dentry.path().to_string_lossy()
                        ));
                }
            }
            Err(_) => {
                // Ignore it.
                tracing::warn!("Unexpected file in timeline directory: {file_name}");
            }
        }
    }

    detail
}

/// Loads a json-encoded heatmap file from the provided on-disk path
///
/// Returns `Ok(None)` when the file does not exist; other read errors and JSON
/// parse errors are returned as `Err`.
async fn load_heatmap(
    path: &Utf8PathBuf,
    ctx: &RequestContext,
) -> Result, anyhow::Error> {
    let st = match VirtualFile::read_to_string(path, ctx).await {
        Ok(st) => st,
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None),
        Err(e) => Err(e)?,
    };
    let htm = serde_json::from_str(&st)?;
    Ok(Some(htm))
}

================================================
FILE: pageserver/src/tenant/secondary/heatmap.rs
================================================
use std::collections::HashMap;
use std::time::SystemTime;

use serde::{Deserialize, Serialize};
use serde_with::{DisplayFromStr, TimestampSeconds, serde_as};
use utils::generation::Generation;
use utils::id::TimelineId;

use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::storage_layer::LayerName;

#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapTenant {
    /// Generation of the attached location that uploaded the heatmap: this is not required
    /// for correctness, but acts as a hint to secondary locations in order to detect thrashing
    /// in the unlikely event that two attached locations are both uploading conflicting heatmaps.
    pub(super) generation: Generation,

    pub(super) timelines: Vec,

    /// Uploaders provide their own upload period in the heatmap, as a hint to downloaders
    /// of how frequently it is worthwhile to check for updates.
    ///
    /// This is optional for backward compat, and because we sometimes might upload
    /// a heatmap explicitly via API for a tenant that has no periodic upload configured.
    #[serde(default)]
    pub(super) upload_period_ms: Option,
}

impl HeatMapTenant {
    /// Consume the heatmap and index its timelines by timeline id.
    pub(crate) fn into_timelines_index(self) -> HashMap {
        self.timelines
            .into_iter()
            .map(|htl| (htl.timeline_id, htl))
            .collect()
    }
}

#[serde_as]
#[derive(Serialize, Deserialize, Clone)]
pub(crate) struct HeatMapTimeline {
    #[serde_as(as = "DisplayFromStr")]
    pub(crate) timeline_id: TimelineId,

    layers: Vec,
}

#[serde_as]
#[derive(Serialize, Deserialize, Clone)]
pub(crate) struct HeatMapLayer {
    pub(crate) name: LayerName,
    pub(crate) metadata: LayerFileMetadata,

    #[serde_as(as = "TimestampSeconds")]
    pub(crate) access_time: SystemTime,

    // Defaults to false when the field is missing from the serialized form.
    #[serde(default)]
    pub(crate) cold: bool, // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
                           // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
}

impl HeatMapLayer {
    pub(crate) fn new(
        name: LayerName,
        metadata: LayerFileMetadata,
        access_time: SystemTime,
        cold: bool,
    ) -> Self {
        Self {
            name,
            metadata,
            access_time,
            cold,
        }
    }
}

impl HeatMapTimeline {
    pub(crate) fn new(timeline_id: TimelineId, layers: Vec) -> Self {
        Self {
            timeline_id,
            layers,
        }
    }

    /// Consume the timeline, yielding only layers that are not marked cold.
    pub(crate) fn into_hot_layers(self) -> impl Iterator {
        self.layers.into_iter().filter(|l| !l.cold)
    }

    /// Borrowing variant: iterate over layers that are not marked cold.
    pub(crate) fn hot_layers(&self) -> impl Iterator {
        self.layers.iter().filter(|l| !l.cold)
    }

    /// Iterate over every layer, hot and cold.
    pub(crate) fn all_layers(&self) -> impl Iterator {
        self.layers.iter()
    }
}

/// Totals computed by [`HeatMapTenant::get_stats`].
pub(crate) struct HeatMapStats {
    pub(crate) bytes: u64,
    pub(crate) layers: usize,
}

impl HeatMapTenant {
    /// Count the hot layers across all timelines and sum their `file_size`.
    pub(crate) fn get_stats(&self) -> HeatMapStats {
        let mut stats = HeatMapStats {
            bytes: 0,
            layers: 0,
        };
        for timeline in &self.timelines {
            for layer in timeline.hot_layers() {
                stats.layers += 1;
                stats.bytes += layer.metadata.file_size;
            }
        }

        stats
    }

    /// Return the same heatmap with every layer's access time reset to the unix epoch.
    pub(crate) fn strip_atimes(self) -> Self {
        Self {
            timelines: self
                .timelines
                .into_iter()
                .map(|mut tl| {
                    for layer in &mut tl.layers {
                        layer.access_time = SystemTime::UNIX_EPOCH;
                    }
                    tl
                })
                .collect(),
            generation: self.generation,
            upload_period_ms: self.upload_period_ms,
        }
    }
}

================================================
FILE: pageserver/src/tenant/secondary/heatmap_uploader.rs
================================================
use std::collections::HashMap;
use std::pin::Pin;
use std::sync::{Arc, Weak};
use std::time::{Duration, Instant};

use futures::Future;
use pageserver_api::shard::TenantShardId;
use remote_storage::{GenericRemoteStorage, TimeoutOrCancel};
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, info_span, instrument};
use utils::backoff;
use utils::completion::Barrier;
use utils::crashsafe::path_with_suffix_extension;
use utils::yielding_loop::yielding_loop;

use super::heatmap::HeatMapTenant;
use super::scheduler::{
    self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs, period_jitter,
    period_warmup,
};
use super::{CommandRequest, SecondaryTenantError, UploadCommand};
use crate::TEMP_FILE_SUFFIX;
use crate::metrics::SECONDARY_MODE;
use crate::tenant::TenantShard;
use crate::tenant::config::AttachmentMode;
use crate::tenant::mgr::{GetTenantError, TenantManager};
use crate::tenant::remote_timeline_client::remote_heatmap_path;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::tasks::{BackgroundLoopKind, warn_when_period_overrun};
use crate::virtual_file::VirtualFile;

// Entry point of the heatmap upload background task: builds a `HeatmapUploader`
// job generator and runs the scheduler over it with the configured concurrency.
pub(super) async fn heatmap_uploader_task(
    tenant_manager: Arc,
    remote_storage: GenericRemoteStorage,
    command_queue: tokio::sync::mpsc::Receiver>,
    background_jobs_can_start: Barrier,
    cancel: CancellationToken,
) {
    let concurrency = tenant_manager.get_conf().heatmap_upload_concurrency;

    let generator = HeatmapUploader {
        tenant_manager,
        remote_storage,
        cancel: cancel.clone(),
        tenants: HashMap::new(),
    };
    let mut scheduler = Scheduler::new(generator, concurrency);

    scheduler
        .run(command_queue, background_jobs_can_start, cancel)
        .instrument(info_span!("heatmap_upload_scheduler"))
        .await
}

/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
/// handling loop and mutates it as needed: there are no locks here, because that event loop
/// can hold &mut references to this type throughout.
struct HeatmapUploader {
    tenant_manager: Arc,
    remote_storage: GenericRemoteStorage,
    cancel: CancellationToken,

    tenants: HashMap,
}

// Handle for a spawned upload job; exposes the job's barrier.
struct WriteInProgress {
    barrier: Barrier,
}

impl RunningJob for WriteInProgress {
    fn get_barrier(&self) -> Barrier {
        self.barrier.clone()
    }
}

// Descriptor of an upload that is ready to run. `target_time` and `period` are
// `None` for uploads requested via command (see `on_command`).
struct UploadPending {
    tenant: Arc,
    last_upload: Option,
    target_time: Option,
    period: Option,
}

impl scheduler::PendingJob for UploadPending {
    fn get_tenant_shard_id(&self) -> &TenantShardId {
        self.tenant.get_tenant_shard_id()
    }
}

// Result reported by a finished upload job and consumed by `on_completion`.
struct WriteComplete {
    tenant_shard_id: TenantShardId,
    completed_at: Instant,
    uploaded: Option,
    next_upload: Option,
}

impl scheduler::Completion for WriteComplete {
    fn get_tenant_shard_id(&self) -> &TenantShardId {
        &self.tenant_shard_id
    }
}

/// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember
/// when we last did a write. We only populate this after doing at least one
/// write for a tenant -- this avoids holding state for tenants that have
/// uploads disabled.
struct UploaderTenantState {
    // This Weak only exists to enable culling idle instances of this type
    // when the Tenant has been deallocated.
    tenant: Weak,

    /// Digest of the serialized heatmap that we last successfully uploaded
    last_upload_state: Option,

    /// When the last upload attempt completed (may have been successful or failed)
    last_upload: Option,

    /// When should we next do an upload? None means never.
    next_upload: Option,
}

type Scheduler = TenantBackgroundJobs<
    HeatmapUploader,
    UploadPending,
    WriteInProgress,
    WriteComplete,
    UploadCommand,
>;

impl JobGenerator for HeatmapUploader {
    // Produce the list of uploads that are due now, plus a hint for how soon
    // scheduling should run again.
    async fn schedule(&mut self) -> SchedulingResult {
        // Cull any entries in self.tenants whose Arc is gone
        self.tenants
            .retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some());

        let now = Instant::now();

        let mut result = SchedulingResult {
            jobs: Vec::new(),
            want_interval: None,
        };

        let tenants = self.tenant_manager.get_attached_active_tenant_shards();

        yielding_loop(1000, &self.cancel, tenants.into_iter(), |tenant| {
            let period = match tenant.get_heatmap_period() {
                None => {
                    // Heatmaps are disabled for this tenant
                    return;
                }
                Some(period) => {
                    // If any tenant has asked for uploads more frequent than our scheduling interval,
                    // reduce it to match so that we can keep up. This is mainly useful in testing, where
                    // we may set rather short intervals.
                    result.want_interval = match result.want_interval {
                        None => Some(period),
                        Some(existing) => Some(std::cmp::min(period, existing)),
                    };

                    period
                }
            };

            // Stale attachments do not upload anything: if we are in this state, there is probably some
            // other attachment in mode Single or Multi running on another pageserver, and we don't
            // want to thrash and overwrite their heatmap uploads.
            if tenant.get_attach_mode() == AttachmentMode::Stale {
                return;
            }

            // Create an entry in self.tenants if one doesn't already exist: this will later be updated
            // with the completion time in on_completion.
            let state = self
                .tenants
                .entry(*tenant.get_tenant_shard_id())
                .or_insert_with(|| UploaderTenantState {
                    tenant: Arc::downgrade(&tenant),
                    last_upload: None,
                    // First upload is delayed by a random warmup; falls back to `now`
                    // if the addition overflows.
                    next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)),
                    last_upload_state: None,
                });

            // Decline to do the upload if insufficient time has passed
            if state.next_upload.map(|nu| nu > now).unwrap_or(false) {
                return;
            }

            let last_upload = state.last_upload_state.clone();
            result.jobs.push(UploadPending {
                tenant,
                last_upload,
                target_time: state.next_upload,
                period: Some(period),
            });
        })
        .await
        // The loop's result is deliberately discarded; whatever was collected is returned.
        .ok();

        result
    }

    // Turn a pending upload into a running job: returns the in-progress handle and
    // the future that performs the upload and yields a `WriteComplete`.
    fn spawn(
        &mut self,
        job: UploadPending,
    ) -> (
        WriteInProgress,
        Pin + Send>>,
    ) {
        let UploadPending {
            tenant,
            last_upload,
            target_time,
            period,
        } = job;

        let remote_storage = self.remote_storage.clone();
        let (completion, barrier) = utils::completion::channel();
        let tenant_shard_id = *tenant.get_tenant_shard_id();
        (WriteInProgress { barrier }, Box::pin(async move {
            // Guard for the barrier in [`WriteInProgress`]
            let _completion = completion;

            let started_at = Instant::now();
            // On anything other than a fresh upload, carry the previous upload state forward.
            let uploaded = match upload_tenant_heatmap(remote_storage, &tenant, last_upload.clone()).await {
                Ok(UploadHeatmapOutcome::Uploaded(uploaded)) => {
                    let duration = Instant::now().duration_since(started_at);
                    SECONDARY_MODE
                        .upload_heatmap_duration
                        .observe(duration.as_secs_f64());
                    SECONDARY_MODE.upload_heatmap.inc();
                    Some(uploaded)
                }
                Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_upload,
                Err(UploadHeatmapError::Upload(e)) => {
                    tracing::warn!(
                        "Failed to upload heatmap for tenant {}: {e:#}",
                        tenant.get_tenant_shard_id(),
                    );
                    let duration = Instant::now().duration_since(started_at);
                    SECONDARY_MODE
                        .upload_heatmap_duration
                        .observe(duration.as_secs_f64());
                    SECONDARY_MODE.upload_heatmap_errors.inc();
                    last_upload
                }
                Err(UploadHeatmapError::Cancelled) => {
                    tracing::info!("Cancelled heatmap upload, shutting down");
                    last_upload
                }
            };

            let now = Instant::now();

            // If the job had a target execution time, we may check our final execution
            // time against that for observability purposes.
            if let (Some(target_time), Some(period)) = (target_time, period) {
                // Elapsed time includes any scheduling lag as well as the execution of the job
                let elapsed = now.duration_since(target_time);

                warn_when_period_overrun(elapsed, period, BackgroundLoopKind::HeatmapUpload);
            }

            // Next upload is one jittered period from now; None if heatmaps are
            // disabled for the tenant or the addition overflows.
            let next_upload = tenant
                .get_heatmap_period()
                .and_then(|period| now.checked_add(period_jitter(period, 5)));

            WriteComplete {
                tenant_shard_id: *tenant.get_tenant_shard_id(),
                completed_at: now,
                uploaded,
                next_upload,
            }
        }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
    }

    // Build an on-demand upload job for the tenant named in the command.
    // Fails if the tenant shard cannot be looked up or is not active.
    fn on_command(
        &mut self,
        command: UploadCommand,
    ) -> Result {
        let tenant_shard_id = command.get_tenant_shard_id();

        tracing::info!(
            tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
            "Starting heatmap write on command");
        let tenant = self
            .tenant_manager
            .get_attached_tenant_shard(*tenant_shard_id)?;
        if !tenant.is_active() {
            return Err(GetTenantError::NotActive(*tenant_shard_id).into());
        }

        Ok(UploadPending {
            // Ignore our state for last digest: this forces an upload even if nothing has changed
            last_upload: None,
            tenant,
            target_time: None,
            period: None,
        })
    }

    // Record the outcome of a finished upload in the per-tenant state, if that
    // state still exists.
    #[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))]
    fn on_completion(&mut self, completion: WriteComplete) {
        tracing::debug!("Heatmap upload completed");
        let WriteComplete {
            tenant_shard_id,
            completed_at,
            uploaded,
            next_upload,
        } = completion;
        use std::collections::hash_map::Entry;
        match self.tenants.entry(tenant_shard_id) {
            Entry::Vacant(_) => {
                // Tenant state was dropped, nothing to update.
            }
            Entry::Occupied(mut entry) => {
                entry.get_mut().last_upload = Some(completed_at);
                entry.get_mut().last_upload_state = uploaded;
                entry.get_mut().next_upload = next_upload
            }
        }
    }
}

enum UploadHeatmapOutcome {
    /// We successfully wrote to remote storage, with this digest.
    Uploaded(LastUploadState),
    /// We did not upload because the heatmap digest was unchanged since the last upload
    NoChange,
    /// We skipped the upload for some reason, such as tenant/timeline not ready
    Skipped,
}

#[derive(thiserror::Error, Debug)]
enum UploadHeatmapError {
    #[error("Cancelled")]
    Cancelled,

    #[error(transparent)]
    Upload(#[from] anyhow::Error),
}

/// Digests describing the heatmap we most recently uploaded successfully.
///
/// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag,
/// which is also an md5sum.
#[derive(Clone)]
struct LastUploadState {
    // Digest of json-encoded HeatMapTenant
    uploaded_digest: md5::Digest,

    // Digest without atimes set.
    layers_only_digest: md5::Digest,
}

/// The inner upload operation. This will skip if `last_digest` is Some and matches the digest
/// of the object we would have uploaded.
async fn upload_tenant_heatmap( remote_storage: GenericRemoteStorage, tenant: &Arc, last_upload: Option, ) -> Result { debug_assert_current_span_has_tenant_id(); let generation = tenant.get_generation(); debug_assert!(!generation.is_none()); if generation.is_none() { // We do not expect this: None generations should only appear in historic layer metadata, not in running Tenants tracing::warn!("Skipping heatmap upload for tenant with generation==None"); return Ok(UploadHeatmapOutcome::Skipped); } let mut heatmap = HeatMapTenant { timelines: Vec::new(), generation, upload_period_ms: tenant.get_heatmap_period().map(|p| p.as_millis()), }; let timelines = tenant.timelines.lock().unwrap().clone(); // Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise // when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind // in remote storage. let Ok(_guard) = tenant.gate.enter() else { tracing::info!("Skipping heatmap upload for tenant which is shutting down"); return Err(UploadHeatmapError::Cancelled); }; for (timeline_id, timeline) in timelines { let heatmap_timeline = timeline.generate_heatmap().await; match heatmap_timeline { None => { tracing::debug!( "Skipping heatmap upload because timeline {timeline_id} is not ready" ); return Ok(UploadHeatmapOutcome::Skipped); } Some(heatmap_timeline) => { heatmap.timelines.push(heatmap_timeline); } } } // Serialize the heatmap let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?; // Drop out early if nothing changed since our last upload let digest = md5::compute(&bytes); if Some(&digest) == last_upload.as_ref().map(|d| &d.uploaded_digest) { return Ok(UploadHeatmapOutcome::NoChange); } // Calculate a digest that omits atimes, so that we can distinguish actual changes in // layers from changes only in atimes. 
let heatmap_size_bytes = heatmap.get_stats().bytes; let layers_only_bytes = serde_json::to_vec(&heatmap.strip_atimes()).map_err(|e| anyhow::anyhow!(e))?; let layers_only_digest = md5::compute(&layers_only_bytes); if heatmap_size_bytes < tenant.get_checkpoint_distance() { // For small tenants, skip upload if only atimes changed. This avoids doing frequent // uploads from long-idle tenants whose atimes are just incremented by periodic // size calculations. if Some(&layers_only_digest) == last_upload.as_ref().map(|d| &d.layers_only_digest) { return Ok(UploadHeatmapOutcome::NoChange); } } let bytes = bytes::Bytes::from(bytes); let size = bytes.len(); let path = remote_heatmap_path(tenant.get_tenant_shard_id()); let cancel = &tenant.cancel; tracing::debug!("Uploading {size} byte heatmap to {path}"); if let Err(e) = backoff::retry( || async { let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone()))); remote_storage .upload_storage_object(bytes, size, &path, cancel) .await }, TimeoutOrCancel::caused_by_cancel, 3, u32::MAX, "Uploading heatmap", cancel, ) .await .ok_or_else(|| anyhow::anyhow!("Shutting down")) .and_then(|x| x) { if cancel.is_cancelled() { return Err(UploadHeatmapError::Cancelled); } else { return Err(e.into()); } } // After a successful upload persist the fresh heatmap to disk. // When restarting, the tenant will read the heatmap from disk // and additively generate a new heatmap (see [`Timeline::generate_heatmap`]). // If the heatmap is stale, the additive generation can lead to keeping previously // evicted timelines on the secondarie's disk. 
let tenant_shard_id = tenant.get_tenant_shard_id(); let heatmap_path = tenant.conf.tenant_heatmap_path(tenant_shard_id); let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); if let Err(err) = VirtualFile::crashsafe_overwrite(heatmap_path, temp_path, bytes).await { tracing::warn!("Non fatal IO error writing to disk after heatmap upload: {err}"); } tracing::info!("Successfully uploaded {size} byte heatmap to {path}"); Ok(UploadHeatmapOutcome::Uploaded(LastUploadState { uploaded_digest: digest, layers_only_digest, })) } ================================================ FILE: pageserver/src/tenant/secondary/scheduler.rs ================================================ use std::collections::HashMap; use std::marker::PhantomData; use std::pin::Pin; use std::time::{Duration, Instant}; use futures::Future; use pageserver_api::shard::TenantShardId; use rand::Rng; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use utils::completion::Barrier; use utils::yielding_loop::yielding_loop; use super::{CommandRequest, CommandResponse, SecondaryTenantError}; /// Scheduling interval is the time between calls to JobGenerator::schedule. /// When we schedule jobs, the job generator may provide a hint of its preferred /// interval, which we will respect within these intervals. const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10); const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1); /// Jitter a Duration by an integer percentage. Returned values are uniform /// in the range 100-pct..100+pct (i.e. a 5% jitter is 5% either way: a ~10% range) pub(super) fn period_jitter(d: Duration, pct: u32) -> Duration { if d == Duration::ZERO { d } else { rand::rng().random_range((d * (100 - pct)) / 100..(d * (100 + pct)) / 100) } } /// When a periodic task first starts, it should wait for some time in the range 0..period, so /// that starting many such tasks at the same time spreads them across the time range. 
pub(super) fn period_warmup(period: Duration) -> Duration { if period == Duration::ZERO { period } else { rand::rng().random_range(Duration::ZERO..period) } } /// Scheduling helper for background work across many tenants. /// /// Systems that need to run background work across many tenants may use this type /// to schedule jobs within a concurrency limit, along with their own [`JobGenerator`] /// implementation to provide the work to execute. This is a simple scheduler that just /// polls the generator for outstanding work, replacing its queue of pending work with /// what the generator yields on each call: the job generator can change its mind about /// the order of jobs between calls. The job generator is notified when jobs complete, /// and additionally may expose a command hook to generate jobs on-demand (e.g. to implement /// admin APIs). /// /// For an example see [`crate::tenant::secondary::heatmap_uploader`] /// /// G: A JobGenerator that this scheduler will poll to find pending jobs /// PJ: 'Pending Job': type for job descriptors that are ready to run /// RJ: 'Running Job' type' for jobs that have been spawned /// C : 'Completion' type that spawned jobs will send when they finish /// CMD: 'Command' type that the job generator will accept to create jobs on-demand pub(super) struct TenantBackgroundJobs where G: JobGenerator, C: Completion, PJ: PendingJob, RJ: RunningJob, { generator: G, /// Ready to run. Will progress to `running` once concurrent limit is satisfied, or /// be removed on next scheduling pass. pending: std::collections::VecDeque, /// Tasks currently running in Self::tasks for these tenants. Check this map /// before pushing more work into pending for the same tenant. running: HashMap, tasks: JoinSet, concurrency: usize, /// How often we would like schedule_interval to be called. 
pub(super) scheduling_interval: Duration, _phantom: PhantomData<(PJ, RJ, C, CMD)>, } pub(crate) trait JobGenerator where C: Completion, PJ: PendingJob, RJ: RunningJob, { /// Called at each scheduling interval. Return a list of jobs to run, most urgent first. /// /// This function may be expensive (e.g. walk all tenants), but should not do any I/O. /// Implementations should take care to yield the executor periodically if running /// very long loops. /// /// Yielding a job here does _not_ guarantee that it will run: if the queue of pending /// jobs is not drained by the next scheduling interval, pending jobs will be cleared /// and re-generated. async fn schedule(&mut self) -> SchedulingResult; /// Called when a pending job is ready to be run. /// /// The job generation provides a future, and a RJ (Running Job) descriptor that tracks it. fn spawn(&mut self, pending_job: PJ) -> (RJ, Pin + Send>>); /// Called when a job previously spawned with spawn() transmits its completion fn on_completion(&mut self, completion: C); /// Called when a command is received. A job will be spawned immediately if the return /// value is Some, ignoring concurrency limits and the pending queue. fn on_command(&mut self, cmd: CMD) -> Result; } /// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling pub(super) struct SchedulingResult { pub(super) jobs: Vec, /// The job generator would like to be called again this soon pub(super) want_interval: Option, } /// See [`TenantBackgroundJobs`]. pub(super) trait PendingJob { fn get_tenant_shard_id(&self) -> &TenantShardId; } /// See [`TenantBackgroundJobs`]. pub(super) trait Completion: Send + 'static { fn get_tenant_shard_id(&self) -> &TenantShardId; } /// See [`TenantBackgroundJobs`]. 
pub(super) trait RunningJob {
    fn get_barrier(&self) -> Barrier;
}

impl<G, PJ, RJ, C, CMD> TenantBackgroundJobs<G, PJ, RJ, C, CMD>
where
    C: Completion,
    PJ: PendingJob,
    RJ: RunningJob,
    G: JobGenerator<PJ, RJ, C, CMD>,
{
    /// Create an idle scheduler around `generator` that runs at most `concurrency`
    /// scheduled jobs at once. The scheduling interval starts at its maximum.
    pub(super) fn new(generator: G, concurrency: usize) -> Self {
        Self {
            generator,
            pending: std::collections::VecDeque::new(),
            running: HashMap::new(),
            tasks: JoinSet::new(),
            concurrency,
            scheduling_interval: MAX_SCHEDULING_INTERVAL,
            _phantom: PhantomData,
        }
    }

    /// Main loop: waits for `background_jobs_can_start`, then alternates between a
    /// scheduling pass and a wait phase that handles completions and commands, until
    /// `cancel` fires or `command_queue` is closed.
    pub(super) async fn run(
        &mut self,
        mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<CMD>>,
        background_jobs_can_start: Barrier,
        cancel: CancellationToken,
    ) {
        tracing::info!("Waiting for background_jobs_can start...");
        background_jobs_can_start.wait().await;
        tracing::info!("background_jobs_can is ready, proceeding.");

        while !cancel.is_cancelled() {
            // Look for new work: this is relatively expensive because we have to go acquire the lock on
            // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
            // require an upload.
            self.schedule_iteration(&cancel).await;

            if cancel.is_cancelled() {
                return;
            }

            // Schedule some work, if concurrency limit permits it
            self.spawn_pending();

            // This message is printed every scheduling iteration as proof of liveness when looking at logs
            tracing::info!(
                "Status: {} tasks running, {} pending",
                self.running.len(),
                self.pending.len()
            );

            // Between scheduling iterations, we will:
            //  - Drain any complete tasks and spawn pending tasks
            //  - Handle incoming administrative commands
            //  - Check our cancellation token
            let next_scheduling_iteration = Instant::now()
                .checked_add(self.scheduling_interval)
                .unwrap_or_else(|| {
                    tracing::warn!(
                        "Scheduling interval invalid ({}s)",
                        self.scheduling_interval.as_secs_f64()
                    );
                    // unwrap(): this constant is small, cannot fail to add to time unless
                    // we are close to the end of the universe.
                    Instant::now().checked_add(MIN_SCHEDULING_INTERVAL).unwrap()
                });
            loop {
                tokio::select! {
                    _ = cancel.cancelled() => {
                        tracing::info!("joining tasks");
                        // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
                        // It is the caller's responsibility to make sure that the tasks they scheduled
                        // respect an appropriate cancellation token, to shut down promptly.  It is only
                        // safe to wait on joining these tasks because we can see the cancellation token
                        // has been set.
                        while let Some(_r) = self.tasks.join_next().await {}
                        tracing::info!("terminating on cancellation token.");

                        break;
                    },
                    _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
                        tracing::debug!("woke for scheduling interval");
                        break;
                    },
                    cmd = command_queue.recv() => {
                        tracing::debug!("woke for command queue");
                        let cmd = match cmd {
                            Some(c) => c,
                            None => {
                                // SecondaryController was destroyed, and this has raced with
                                // our CancellationToken
                                tracing::info!("terminating on command queue destruction");
                                cancel.cancel();
                                break;
                            }
                        };

                        let CommandRequest { response_tx, payload } = cmd;
                        self.handle_command(payload, response_tx);
                    },
                    _ = async {
                        let completion = self.process_next_completion().await;
                        match completion {
                            Some(c) => {
                                self.generator.on_completion(c);
                                if !cancel.is_cancelled() {
                                    self.spawn_pending();
                                }
                            },
                            None => {
                                // Nothing is running, so just wait: expect that this future
                                // will be dropped when something in the outer select! fires.
                                cancel.cancelled().await;
                            }
                        }
                    } => {}
                }
            }
        }
    }

    /// Ask the generator to spawn `job`, push its future into the JoinSet and record the
    /// running-job descriptor under the job's tenant shard id.
    fn do_spawn(&mut self, job: PJ) {
        let tenant_shard_id = *job.get_tenant_shard_id();
        let (in_progress, fut) = self.generator.spawn(job);

        self.tasks.spawn(fut);

        let replaced = self.running.insert(tenant_shard_id, in_progress);
        debug_assert!(replaced.is_none());
        if replaced.is_some() {
            tracing::warn!(%tenant_shard_id, "Unexpectedly spawned a task when one was already running")
        }
    }

    /// For all pending tenants that are eligible for execution, spawn their task.
    ///
    /// Caller provides the spawn operation, we track the resulting execution.
fn spawn_pending(&mut self) { while !self.pending.is_empty() && self.running.len() < self.concurrency { // unwrap: loop condition includes !is_empty() let pending = self.pending.pop_front().unwrap(); if !self.running.contains_key(pending.get_tenant_shard_id()) { self.do_spawn(pending); } } } /// For administrative commands: skip the pending queue, ignore concurrency limits fn spawn_now(&mut self, job: PJ) -> &RJ { let tenant_shard_id = *job.get_tenant_shard_id(); self.do_spawn(job); self.running .get(&tenant_shard_id) .expect("We just inserted this") } /// Wait until the next task completes, and handle its completion /// /// Cancellation: this method is cancel-safe. async fn process_next_completion(&mut self) -> Option { match self.tasks.join_next().await { Some(r) => { // We use a channel to drive completions, but also // need to drain the JoinSet to avoid completed tasks // accumulating. These calls are 1:1 because every task // we spawn into this joinset submits is result to the channel. let completion = r.expect("Panic in background task"); self.running.remove(completion.get_tenant_shard_id()); Some(completion) } None => { // Nothing is running, so we have nothing to wait for. We may drop out: the // main even loop will call us again after the next time it has run something. None } } } /// Convert the command into a pending job, spawn it, and when the spawned /// job completes, send the result down `response_tx`. 
fn handle_command( &mut self, cmd: CMD, response_tx: tokio::sync::oneshot::Sender, ) { let job = match self.generator.on_command(cmd) { Ok(j) => j, Err(e) => { response_tx.send(CommandResponse { result: Err(e) }).ok(); return; } }; let tenant_shard_id = job.get_tenant_shard_id(); let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) { tracing::info!( tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Command already running, waiting for it" ); barrier } else { let running = self.spawn_now(job); running.get_barrier().clone() }; // This task does no I/O: it only listens for a barrier's completion and then // sends to the command response channel. It is therefore safe to spawn this without // any gates/task_mgr hooks. tokio::task::spawn(async move { barrier.wait().await; response_tx.send(CommandResponse { result: Ok(()) }).ok(); }); } fn get_running(&self, tenant_shard_id: &TenantShardId) -> Option { self.running.get(tenant_shard_id).map(|r| r.get_barrier()) } /// Periodic execution phase: inspect all attached tenants and schedule any work they require. /// /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::TenantShard`] or [`crate::tenant::secondary::SecondaryTenant`] /// /// This function resets the pending list: it is assumed that the caller may change their mind about /// which tenants need work between calls to schedule_iteration. 
async fn schedule_iteration(&mut self, cancel: &CancellationToken) { let SchedulingResult { jobs, want_interval, } = self.generator.schedule().await; // Adjust interval based on feedback from the job generator if let Some(want_interval) = want_interval { // Calculation uses second granularity: this scheduler is not intended for high frequency tasks self.scheduling_interval = Duration::from_secs(std::cmp::min( std::cmp::max(MIN_SCHEDULING_INTERVAL.as_secs(), want_interval.as_secs()), MAX_SCHEDULING_INTERVAL.as_secs(), )); } // The priority order of previously scheduled work may be invalidated by current state: drop // all pending work (it will be re-scheduled if still needed) self.pending.clear(); // While iterating over the potentially-long list of tenants, we will periodically yield // to avoid blocking executor. yielding_loop(1000, cancel, jobs.into_iter(), |job| { // Skip tenants that already have a write in flight if !self.running.contains_key(job.get_tenant_shard_id()) { self.pending.push_back(job); } }) .await .ok(); } } ================================================ FILE: pageserver/src/tenant/secondary.rs ================================================ mod downloader; pub mod heatmap; mod heatmap_uploader; mod scheduler; use std::sync::Arc; use std::time::SystemTime; use metrics::UIntGauge; use pageserver_api::models; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use remote_storage::GenericRemoteStorage; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::instrument; use utils::completion::Barrier; use utils::id::TimelineId; use utils::sync::gate::Gate; use self::downloader::{SecondaryDetail, downloader_task}; use self::heatmap_uploader::heatmap_uploader_task; use super::GetTenantError; use super::config::SecondaryLocationConfig; use super::mgr::TenantManager; use super::span::debug_assert_current_span_has_tenant_id; use super::storage_layer::LayerName; use crate::context::RequestContext; use 
crate::disk_usage_eviction_task::DiskUsageEvictionInfo; use crate::metrics::{SECONDARY_HEATMAP_TOTAL_SIZE, SECONDARY_RESIDENT_PHYSICAL_SIZE}; use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; enum DownloadCommand { Download(TenantShardId), } enum UploadCommand { Upload(TenantShardId), } impl UploadCommand { fn get_tenant_shard_id(&self) -> &TenantShardId { match self { Self::Upload(id) => id, } } } impl DownloadCommand { fn get_tenant_shard_id(&self) -> &TenantShardId { match self { Self::Download(id) => id, } } } struct CommandRequest { payload: T, response_tx: tokio::sync::oneshot::Sender, } struct CommandResponse { result: Result<(), SecondaryTenantError>, } #[derive(thiserror::Error, Debug)] pub(crate) enum SecondaryTenantError { #[error("{0}")] GetTenant(GetTenantError), #[error("shutting down")] ShuttingDown, } impl From for SecondaryTenantError { fn from(gte: GetTenantError) -> Self { Self::GetTenant(gte) } } // Whereas [`Tenant`] represents an attached tenant, this type represents the work // we do for secondary tenant locations: where we are not serving clients or // ingesting WAL, but we are maintaining a warm cache of layer files. // // This type is all about the _download_ path for secondary mode. The upload path // runs separately (see [`heatmap_uploader`]) while a regular attached `Tenant` exists. // // This structure coordinates TenantManager and SecondaryDownloader, // so that the downloader can indicate which tenants it is currently // operating on, and the manager can indicate when a particular // secondary tenant should cancel any work in flight. #[derive(Debug)] pub(crate) struct SecondaryTenant { /// Carrying a tenant shard ID simplifies callers such as the downloader /// which need to organize many of these objects by ID. tenant_shard_id: TenantShardId, /// Cancellation token indicates to SecondaryDownloader that it should stop doing /// any work for this tenant at the next opportunity. 
pub(crate) cancel: CancellationToken, pub(crate) gate: Gate, // Secondary mode does not need the full shard identity or the pageserver_api::models::TenantConfig. However, // storing these enables us to report our full LocationConf, enabling convenient reconciliation // by the control plane (see [`Self::get_location_conf`]) pub(crate) shard_identity: ShardIdentity, tenant_conf: std::sync::Mutex, // Internal state used by the Downloader. detail: std::sync::Mutex, // Public state indicating overall progress of downloads relative to the last heatmap seen pub(crate) progress: std::sync::Mutex, // Sum of layer sizes on local disk pub(super) resident_size_metric: UIntGauge, // Sum of layer sizes in the most recently downloaded heatmap pub(super) heatmap_total_size_metric: UIntGauge, } impl SecondaryTenant { pub(crate) fn new( tenant_shard_id: TenantShardId, shard_identity: ShardIdentity, tenant_conf: pageserver_api::models::TenantConfig, config: &SecondaryLocationConfig, ) -> Arc { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_id = format!("{}", tenant_shard_id.shard_slug()); let resident_size_metric = SECONDARY_RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id]) .unwrap(); let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id]) .unwrap(); Arc::new(Self { tenant_shard_id, // todo: shall we make this a descendent of the // main cancellation token, or is it sufficient that // on shutdown we walk the tenants and fire their // individual cancellations? 
cancel: CancellationToken::new(), gate: Gate::default(), shard_identity, tenant_conf: std::sync::Mutex::new(tenant_conf), detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())), progress: std::sync::Mutex::default(), resident_size_metric, heatmap_total_size_metric, }) } pub(crate) fn tenant_shard_id(&self) -> TenantShardId { self.tenant_shard_id } pub(crate) async fn shutdown(&self) { self.cancel.cancel(); // Wait for any secondary downloader work to complete self.gate.close().await; self.validate_metrics(); // Metrics are subtracted from and/or removed eagerly. // Deletions are done in the background via [`BackgroundPurges::spawn`]. let tenant_id = self.tenant_shard_id.tenant_id.to_string(); let shard_id = format!("{}", self.tenant_shard_id.shard_slug()); let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); self.detail .lock() .unwrap() .drain_timelines(&self.tenant_shard_id, &self.resident_size_metric); } pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) { self.detail.lock().unwrap().config = config.clone(); } pub(crate) fn set_tenant_conf(&self, config: &pageserver_api::models::TenantConfig) { *(self.tenant_conf.lock().unwrap()) = config.clone(); } /// For API access: generate a LocationConfig equivalent to the one that would be used to /// create a Tenant in the same state. Do not use this in hot paths: it's for relatively /// rare external API calls, like a reconciliation at startup. 
pub(crate) fn get_location_conf(&self) -> models::LocationConfig { let conf = self.detail.lock().unwrap().config.clone(); let conf = models::LocationConfigSecondary { warm: conf.warm }; let tenant_conf = self.tenant_conf.lock().unwrap().clone(); models::LocationConfig { mode: models::LocationConfigMode::Secondary, generation: None, secondary_conf: Some(conf), shard_number: self.tenant_shard_id.shard_number.0, shard_count: self.tenant_shard_id.shard_count.literal(), shard_stripe_size: self.shard_identity.stripe_size.0, tenant_conf, } } pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId { &self.tenant_shard_id } pub(crate) fn get_layers_for_eviction(self: &Arc) -> (DiskUsageEvictionInfo, usize) { self.detail.lock().unwrap().get_layers_for_eviction(self) } /// Cancellation safe, but on cancellation the eviction will go through #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))] pub(crate) async fn evict_layer(self: &Arc, timeline_id: TimelineId, name: LayerName) { debug_assert_current_span_has_tenant_id(); let guard = match self.gate.enter() { Ok(g) => g, Err(_) => { tracing::debug!("Dropping layer evictions, secondary tenant shutting down",); return; } }; let now = SystemTime::now(); tracing::info!("Evicting secondary layer"); let this = self.clone(); // spawn it to be cancellation safe tokio::task::spawn_blocking(move || { let _guard = guard; // Update the timeline's state. This does not have to be synchronized with // the download process, because: // - If downloader is racing with us to remove a file (e.g. because it is // removed from heatmap), then our mutual .remove() operations will both // succeed. 
// - If downloader is racing with us to download the object (this would require // multiple eviction iterations to race with multiple download iterations), then // if we remove it from the state, the worst that happens is the downloader // downloads it again before re-inserting, or we delete the file but it remains // in the state map (in which case it will be downloaded if this secondary // tenant transitions to attached and tries to access it) // // The important assumption here is that the secondary timeline state does not // have to 100% match what is on disk, because it's a best-effort warming // of the cache. let mut detail = this.detail.lock().unwrap(); if let Some(removed) = detail.evict_layer(name, &timeline_id, now, &this.resident_size_metric) { // We might race with removal of the same layer during downloads, so finding the layer we // were trying to remove is optional. Only issue the disk I/O to remove it if we found it. removed.remove_blocking(); } }) .await .expect("secondary eviction should not have panicked"); } /// Exhaustive check that incrementally updated metrics match the actual state. #[cfg(feature = "testing")] fn validate_metrics(&self) { let detail = self.detail.lock().unwrap(); let resident_size = detail.total_resident_size(); assert_eq!(resident_size, self.resident_size_metric.get()); } #[cfg(not(feature = "testing"))] fn validate_metrics(&self) { // No-op in non-testing builds } } /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads, /// and heatmap uploads. This is not a hot data path: it's used for: /// - Live migrations, where we want to ensure a migration destination has the freshest possible /// content before trying to cut over. /// - Tests, where we want to immediately upload/download for a particular tenant. /// /// In normal operations, outside of migrations, uploads & downloads are autonomous and not driven by this interface. 
pub struct SecondaryController { upload_req_tx: tokio::sync::mpsc::Sender>, download_req_tx: tokio::sync::mpsc::Sender>, } impl SecondaryController { async fn dispatch( &self, queue: &tokio::sync::mpsc::Sender>, payload: T, ) -> Result<(), SecondaryTenantError> { let (response_tx, response_rx) = tokio::sync::oneshot::channel(); queue .send(CommandRequest { payload, response_tx, }) .await .map_err(|_| SecondaryTenantError::ShuttingDown)?; let response = response_rx .await .map_err(|_| SecondaryTenantError::ShuttingDown)?; response.result } pub(crate) async fn upload_tenant( &self, tenant_shard_id: TenantShardId, ) -> Result<(), SecondaryTenantError> { self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id)) .await } pub(crate) async fn download_tenant( &self, tenant_shard_id: TenantShardId, ) -> Result<(), SecondaryTenantError> { self.dispatch( &self.download_req_tx, DownloadCommand::Download(tenant_shard_id), ) .await } } pub struct GlobalTasks { cancel: CancellationToken, uploader: JoinHandle<()>, downloader: JoinHandle<()>, } impl GlobalTasks { /// Caller is responsible for requesting shutdown via the cancellation token that was /// passed to [`spawn_tasks`]. /// /// # Panics /// /// This method panics if that token is not cancelled. /// This is low-risk because we're calling this during process shutdown, so, a panic /// will be informative but not cause undue downtime. 
pub async fn wait(self) { let Self { cancel, uploader, downloader, } = self; assert!( cancel.is_cancelled(), "must cancel cancellation token, otherwise the tasks will not shut down" ); let (uploader, downloader) = futures::future::join(uploader, downloader).await; uploader.expect( "unreachable: exit_on_panic_or_error would catch the panic and exit the process", ); downloader.expect( "unreachable: exit_on_panic_or_error would catch the panic and exit the process", ); } } pub fn spawn_tasks( tenant_manager: Arc, remote_storage: GenericRemoteStorage, background_jobs_can_start: Barrier, cancel: CancellationToken, ) -> (SecondaryController, GlobalTasks) { let mgr_clone = tenant_manager.clone(); let storage_clone = remote_storage.clone(); let bg_jobs_clone = background_jobs_can_start.clone(); let (download_req_tx, download_req_rx) = tokio::sync::mpsc::channel::>(16); let (upload_req_tx, upload_req_rx) = tokio::sync::mpsc::channel::>(16); let cancel_clone = cancel.clone(); let downloader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "secondary tenant downloads", async move { downloader_task( mgr_clone, storage_clone, download_req_rx, bg_jobs_clone, cancel_clone, RequestContext::new( TaskKind::SecondaryDownloads, crate::context::DownloadBehavior::Download, ), ) .await; anyhow::Ok(()) }, )); let cancel_clone = cancel.clone(); let uploader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "heatmap uploads", async move { heatmap_uploader_task( tenant_manager, remote_storage, upload_req_rx, background_jobs_can_start, cancel_clone, ) .await; anyhow::Ok(()) }, )); ( SecondaryController { upload_req_tx, download_req_tx, }, GlobalTasks { cancel, uploader, downloader, }, ) } ================================================ FILE: pageserver/src/tenant/size.rs ================================================ use std::cmp; use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::sync::Arc; use 
tenant_size_model::svg::SvgBranchKind; use tenant_size_model::{Segment, StorageModel}; use tokio::sync::Semaphore; use tokio::sync::oneshot::error::RecvError; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::TimelineId; use utils::lsn::Lsn; use super::{GcError, LogicalSizeCalculationCause, TenantShard}; use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::{MaybeOffloaded, Timeline}; /// Inputs to the actual tenant sizing model /// /// Implements [`serde::Serialize`] but is not meant to be part of the public API, instead meant to /// be a transferrable format between execution environments and developer. /// /// This tracks more information than the actual StorageModel that calculation /// needs. We will convert this into a StorageModel when it's time to perform /// the calculation. /// #[derive(Debug, serde::Serialize, serde::Deserialize)] pub struct ModelInputs { pub segments: Vec, pub timeline_inputs: Vec, } /// A [`Segment`], with some extra information for display purposes #[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq, Eq)] pub struct SegmentMeta { pub segment: Segment, pub timeline_id: TimelineId, pub kind: LsnKind, } #[derive(thiserror::Error, Debug)] pub(crate) enum CalculateSyntheticSizeError { /// Something went wrong internally to the calculation of logical size at a particular branch point #[error("Failed to calculated logical size on timeline {timeline_id} at {lsn}: {error}")] LogicalSize { timeline_id: TimelineId, lsn: Lsn, error: CalculateLogicalSizeError, }, /// Something went wrong internally when calculating GC parameters at start of size calculation #[error(transparent)] GcInfo(GcError), /// Totally unexpected errors, like panics joining a task #[error(transparent)] Fatal(anyhow::Error), /// Tenant shut down while calculating size #[error("Cancelled")] Cancelled, } impl From for CalculateSyntheticSizeError { fn from(value: GcError) -> Self { match 
value { GcError::TenantCancelled | GcError::TimelineCancelled => { CalculateSyntheticSizeError::Cancelled } other => CalculateSyntheticSizeError::GcInfo(other), } } } impl SegmentMeta { fn size_needed(&self) -> bool { match self.kind { LsnKind::BranchStart => { // If we don't have a later GcCutoff point on this branch, and // no ancestor, calculate size for the branch start point. self.segment.needed && self.segment.parent.is_none() } LsnKind::BranchPoint => true, LsnKind::GcCutOff => true, LsnKind::BranchEnd => false, LsnKind::LeasePoint => true, LsnKind::LeaseStart => false, LsnKind::LeaseEnd => false, } } } #[derive( Debug, Clone, Copy, Eq, Ord, PartialEq, PartialOrd, serde::Serialize, serde::Deserialize, )] pub enum LsnKind { /// A timeline starting here BranchStart, /// A child timeline branches off from here BranchPoint, /// GC cutoff point GcCutOff, /// Last record LSN BranchEnd, /// A LSN lease is granted here. LeasePoint, /// A lease starts from here. LeaseStart, /// Last record LSN for the lease (should have the same LSN as the previous [`LsnKind::LeaseStart`]). LeaseEnd, } impl From for SvgBranchKind { fn from(kind: LsnKind) -> Self { match kind { LsnKind::LeasePoint | LsnKind::LeaseStart | LsnKind::LeaseEnd => SvgBranchKind::Lease, _ => SvgBranchKind::Timeline, } } } /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as /// part of [`ModelInputs`] from the HTTP api, explaining the inputs. #[derive(Debug, serde::Serialize, serde::Deserialize)] pub struct TimelineInputs { pub timeline_id: TimelineId, pub ancestor_id: Option, ancestor_lsn: Lsn, last_record: Lsn, latest_gc_cutoff: Lsn, /// Cutoff point based on GC settings next_pitr_cutoff: Lsn, /// Cutoff point calculated from the user-supplied 'max_retention_period' retention_param_cutoff: Option, /// Lease points on the timeline lease_points: Vec, } /// Gathers the inputs for the tenant sizing model. 
/// /// Tenant size does not consider the latest state, but only the state until next_pitr_cutoff, which /// is updated on-demand, during the start of this calculation and separate from the /// [`TimelineInputs::latest_gc_cutoff`]. /// /// For timelines in general: /// /// ```text /// 0-----|---------|----|------------| · · · · · |·> lsn /// initdb_lsn branchpoints* next_pitr_cutoff latest /// ``` pub(super) async fn gather_inputs( tenant: &TenantShard, limit: &Arc, max_retention_period: Option, logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, cause: LogicalSizeCalculationCause, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { // refresh is needed to update [`timeline::GcCutoffs`] tenant.refresh_gc_info(cancel, ctx).await?; // Collect information about all the timelines let mut timelines = tenant.list_timelines(); if timelines.is_empty() { // perhaps the tenant has just been created, and as such doesn't have any data yet return Ok(ModelInputs { segments: vec![], timeline_inputs: Vec::new(), }); } // Filter out timelines that are not active // // There may be a race when a timeline is dropped, // but it is unlikely to cause any issues. In the worst case, // the calculation will error out. timelines.retain(|t| t.is_active()); // Also filter out archived timelines. timelines.retain(|t| t.is_archived() != Some(true)); // Build a map of branch points. let mut branchpoints: HashMap> = HashMap::new(); for timeline in timelines.iter() { if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { branchpoints .entry(ancestor_id) .or_default() .insert(timeline.get_ancestor_lsn()); } } // These become the final result. let mut timeline_inputs = Vec::with_capacity(timelines.len()); let mut segments: Vec = Vec::new(); // // Build Segments representing each timeline. 
As we do that, also remember // the branchpoints and branch startpoints in 'branchpoint_segments' and // 'branchstart_segments' // // BranchPoint segments of each timeline // (timeline, branchpoint LSN) -> segment_id let mut branchpoint_segments: HashMap<(TimelineId, Lsn), usize> = HashMap::new(); // timeline, Branchpoint seg id, (ancestor, ancestor LSN) type BranchStartSegment = (TimelineId, usize, Option<(TimelineId, Lsn)>); let mut branchstart_segments: Vec = Vec::new(); for timeline in timelines.iter() { let timeline_id = timeline.timeline_id; let last_record_lsn = timeline.get_last_record_lsn(); let ancestor_lsn = timeline.get_ancestor_lsn(); // there's a race between the update (holding tenant.gc_lock) and this read but it // might not be an issue, because it's not for Timeline::gc let gc_info = timeline.gc_info.read().unwrap(); // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a // new gc run, which we have no control over. however differently from `Timeline::gc` // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not // actually removing files. // // We only consider [`timeline::GcCutoffs::time`], and not [`timeline::GcCutoffs::space`], because from // a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather // than our internal space cutoff. This means that if someone drops a database and waits for their // PITR interval, they will see synthetic size decrease, even if we are still storing data inside // the space cutoff. let mut next_pitr_cutoff = gc_info.cutoffs.time.unwrap_or_default(); // TODO: handle None // If the caller provided a shorter retention period, use that instead of the GC cutoff. 
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period)); if next_pitr_cutoff < param_cutoff { next_pitr_cutoff = param_cutoff; } Some(param_cutoff) } else { None }; let branch_is_invisible = timeline.is_invisible() == Some(true); let lease_points = gc_info .leases .keys() .filter(|&&lsn| lsn > ancestor_lsn) .copied() .collect::>(); // next_pitr_cutoff in parent branch are not of interest (right now at least), nor do we // want to query any logical size before initdb_lsn. let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn); // Build "interesting LSNs" on this timeline let mut lsns: Vec<(Lsn, LsnKind)> = gc_info .retain_lsns .iter() .filter(|(lsn, _child_id, is_offloaded)| { lsn > &ancestor_lsn && *is_offloaded == MaybeOffloaded::No }) .copied() // this assumes there are no other retain_lsns than the branchpoints .map(|(lsn, _child_id, _is_offloaded)| (lsn, LsnKind::BranchPoint)) .collect::>(); if !branch_is_invisible { // Do not count lease points for invisible branches. lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); } drop(gc_info); // Add branch points we collected earlier, just in case there were any that were // not present in retain_lsns. We will remove any duplicates below later. if let Some(this_branchpoints) = branchpoints.get(&timeline_id) { lsns.extend( this_branchpoints .iter() .map(|lsn| (*lsn, LsnKind::BranchPoint)), ) } // Add a point for the PITR cutoff let branch_start_needed = next_pitr_cutoff <= branch_start_lsn; if !branch_start_needed && !branch_is_invisible { // Only add the GcCutOff point when the timeline is visible; otherwise, do not compute the size for the LSN // range from the last branch point to the latest data. lsns.push((next_pitr_cutoff, LsnKind::GcCutOff)); } lsns.sort_unstable(); lsns.dedup(); // // Create Segments for the interesting points. 
// // Timeline start point let ancestor = timeline .get_ancestor_timeline_id() .map(|ancestor_id| (ancestor_id, ancestor_lsn)); branchstart_segments.push((timeline_id, segments.len(), ancestor)); segments.push(SegmentMeta { segment: Segment { parent: None, // filled in later lsn: branch_start_lsn.0, size: None, // filled in later needed: branch_start_needed, }, timeline_id: timeline.timeline_id, kind: LsnKind::BranchStart, }); // GC cutoff point, and any branch points, i.e. points where // other timelines branch off from this timeline. let mut parent = segments.len() - 1; for (lsn, kind) in lsns { if kind == LsnKind::BranchPoint { branchpoint_segments.insert((timeline_id, lsn), segments.len()); } segments.push(SegmentMeta { segment: Segment { parent: Some(parent), lsn: lsn.0, size: None, needed: lsn > next_pitr_cutoff, }, timeline_id: timeline.timeline_id, kind, }); parent = segments.len() - 1; if kind == LsnKind::LeasePoint { // Needs `LeaseStart` and `LeaseEnd` as well to model lease as a read-only branch that never writes data // (i.e. it's lsn has not advanced from ancestor_lsn), and therefore the three segments have the same LSN // value. Without the other two segments, the calculation code would not count the leased LSN as a point // to be retained. // Did not use `BranchStart` or `BranchEnd` so we can differentiate branches and leases during debug. // // Alt Design: rewrite the entire calculation code to be independent of timeline id. Both leases and // branch points can be given a synthetic id so we can unite them. let mut lease_parent = parent; // Start of a lease. segments.push(SegmentMeta { segment: Segment { parent: Some(lease_parent), lsn: lsn.0, size: None, // Filled in later, if necessary needed: lsn > next_pitr_cutoff, // only needed if the point is within rentention. }, timeline_id: timeline.timeline_id, kind: LsnKind::LeaseStart, }); lease_parent += 1; // End of the lease. 
segments.push(SegmentMeta { segment: Segment { parent: Some(lease_parent), lsn: lsn.0, size: None, // Filled in later, if necessary needed: true, // everything at the lease LSN must be readable => is needed }, timeline_id: timeline.timeline_id, kind: LsnKind::LeaseEnd, }); } } let branch_end_lsn = if branch_is_invisible { // If the branch is invisible, the branch end is the last requested LSN (likely a branch cutoff point). segments.last().unwrap().segment.lsn } else { // Otherwise, the branch end is the last record LSN. last_record_lsn.0 }; // Current end of the timeline segments.push(SegmentMeta { segment: Segment { parent: Some(parent), lsn: branch_end_lsn, size: None, // Filled in later, if necessary needed: true, }, timeline_id: timeline.timeline_id, kind: LsnKind::BranchEnd, }); timeline_inputs.push(TimelineInputs { timeline_id: timeline.timeline_id, ancestor_id: timeline.get_ancestor_timeline_id(), ancestor_lsn, last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough latest_gc_cutoff: *timeline.get_applied_gc_cutoff_lsn(), next_pitr_cutoff, retention_param_cutoff, lease_points, }); } // We now have all segments from the timelines in 'segments'. The timelines // haven't been linked to each other yet, though. Do that. for (_timeline_id, seg_id, ancestor) in branchstart_segments { // Look up the branch point if let Some(ancestor) = ancestor { let parent_id = *branchpoint_segments.get(&ancestor).unwrap(); segments[seg_id].segment.parent = Some(parent_id); } } // We left the 'size' field empty in all of the Segments so far. // Now find logical sizes for all of the points that might need or benefit from them. 
fill_logical_sizes( &timelines, &mut segments, limit, logical_size_cache, cause, ctx, ) .await?; if tenant.cancel.is_cancelled() { // If we're shutting down, return an error rather than a sparse result that might include some // timelines from before we started shutting down return Err(CalculateSyntheticSizeError::Cancelled); } Ok(ModelInputs { segments, timeline_inputs, }) } /// Augment 'segments' with logical sizes /// /// This will leave segments' sizes as None if the Timeline associated with the segment is deleted concurrently /// (i.e. we cannot read its logical size at a particular LSN). async fn fill_logical_sizes( timelines: &[Arc], segments: &mut [SegmentMeta], limit: &Arc, logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, cause: LogicalSizeCalculationCause, ctx: &RequestContext, ) -> Result<(), CalculateSyntheticSizeError> { let timeline_hash: HashMap> = HashMap::from_iter( timelines .iter() .map(|timeline| (timeline.timeline_id, Arc::clone(timeline))), ); // record the used/inserted cache keys here, to remove extras not to start leaking // after initial run the cache should be quite stable, but live timelines will eventually // require new lsns to be inspected. let mut sizes_needed = HashMap::<(TimelineId, Lsn), Option>::new(); // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to // our advantage with `?` error handling. let mut joinset = tokio::task::JoinSet::new(); // For each point that would benefit from having a logical size available, // spawn a Task to fetch it, unless we have it cached already. 
for seg in segments.iter() { if !seg.size_needed() { continue; } let timeline_id = seg.timeline_id; let lsn = Lsn(seg.segment.lsn); if let Entry::Vacant(e) = sizes_needed.entry((timeline_id, lsn)) { let cached_size = logical_size_cache.get(&(timeline_id, lsn)).cloned(); if cached_size.is_none() { let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap()); let parallel_size_calcs = Arc::clone(limit); let ctx = ctx.attached_child().with_scope_timeline(&timeline); joinset.spawn( calculate_logical_size(parallel_size_calcs, timeline, lsn, cause, ctx) .in_current_span(), ); } e.insert(cached_size); } } // Perform the size lookups let mut have_any_error = None; while let Some(res) = joinset.join_next().await { // each of these come with Result, JoinError> // because of spawn + spawn_blocking match res { Err(join_error) if join_error.is_cancelled() => { unreachable!("we are not cancelling any of the futures, nor should be"); } Err(join_error) => { // cannot really do anything, as this panic is likely a bug error!( "task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}" ); have_any_error = Some(CalculateSyntheticSizeError::Fatal( anyhow::anyhow!(join_error) .context("task that calls spawn_ondemand_logical_size_calculation"), )); } Ok(Err(recv_result_error)) => { // cannot really do anything, as this panic is likely a bug error!("failed to receive logical size query result: {recv_result_error:#}"); have_any_error = Some(CalculateSyntheticSizeError::Fatal( anyhow::anyhow!(recv_result_error) .context("Receiving logical size query result"), )); } Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => { if matches!(error, CalculateLogicalSizeError::Cancelled) { // Skip this: it's okay if one timeline among many is shutting down while we // calculate inputs for the overall tenant. 
continue; } else { warn!( timeline_id=%timeline.timeline_id, "failed to calculate logical size at {lsn}: {error:#}" ); have_any_error = Some(CalculateSyntheticSizeError::LogicalSize { timeline_id: timeline.timeline_id, lsn, error, }); } } Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => { debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated"); logical_size_cache.insert((timeline.timeline_id, lsn), size); sizes_needed.insert((timeline.timeline_id, lsn), Some(size)); } } } // prune any keys not needed anymore; we record every used key and added key. logical_size_cache.retain(|key, _| sizes_needed.contains_key(key)); if let Some(error) = have_any_error { // we cannot complete this round, because we are missing data. // we have however cached all we were able to request calculation on. return Err(error); } // Insert the looked up sizes to the Segments for seg in segments.iter_mut() { if !seg.size_needed() { continue; } let timeline_id = seg.timeline_id; let lsn = Lsn(seg.segment.lsn); if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) { seg.segment.size = Some(*size); } } Ok(()) } impl ModelInputs { pub fn calculate_model(&self) -> tenant_size_model::StorageModel { // Convert SegmentMetas into plain Segments StorageModel { segments: self .segments .iter() .map(|seg| seg.segment.clone()) .collect(), } } // calculate total project size pub fn calculate(&self) -> u64 { let storage = self.calculate_model(); let sizes = storage.calculate(); sizes.total_size } } /// Newtype around the tuple that carries the timeline at lsn logical size calculation. 
struct TimelineAtLsnSizeResult( Arc, utils::lsn::Lsn, Result, ); #[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))] async fn calculate_logical_size( limit: Arc, timeline: Arc, lsn: utils::lsn::Lsn, cause: LogicalSizeCalculationCause, ctx: RequestContext, ) -> Result { let _permit = tokio::sync::Semaphore::acquire_owned(limit) .await .expect("global semaphore should not had been closed"); let size_res = timeline .spawn_ondemand_logical_size_calculation(lsn, cause, ctx) .instrument(info_span!("spawn_ondemand_logical_size_calculation")) .await?; Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) } #[cfg(test)] #[test] fn verify_size_for_multiple_branches() { // this is generated from integration test test_tenant_size_with_multiple_branches, but this way // it has the stable lsn's // // The timeline_inputs don't participate in the size calculation, and are here just to explain // the inputs. let doc = r#" { "segments": [ { "segment": { "parent": 9, "lsn": 26033560, "size": null, "needed": false }, "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", "kind": "BranchStart" }, { "segment": { "parent": 0, "lsn": 35720400, "size": 25206784, "needed": false }, "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", "kind": "GcCutOff" }, { "segment": { "parent": 1, "lsn": 35851472, "size": null, "needed": true }, "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", "kind": "BranchEnd" }, { "segment": { "parent": 7, "lsn": 24566168, "size": null, "needed": false }, "timeline_id": "454626700469f0a9914949b9d018e876", "kind": "BranchStart" }, { "segment": { "parent": 3, "lsn": 25261936, "size": 26050560, "needed": false }, "timeline_id": "454626700469f0a9914949b9d018e876", "kind": "GcCutOff" }, { "segment": { "parent": 4, "lsn": 25393008, "size": null, "needed": true }, "timeline_id": "454626700469f0a9914949b9d018e876", "kind": "BranchEnd" }, { "segment": { "parent": null, "lsn": 23694408, "size": null, "needed": false }, "timeline_id": 
"cb5e3cbe60a4afc00d01880e1a37047f", "kind": "BranchStart" }, { "segment": { "parent": 6, "lsn": 24566168, "size": 25739264, "needed": false }, "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", "kind": "BranchPoint" }, { "segment": { "parent": 7, "lsn": 25902488, "size": 26402816, "needed": false }, "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", "kind": "GcCutOff" }, { "segment": { "parent": 8, "lsn": 26033560, "size": 26468352, "needed": true }, "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", "kind": "BranchPoint" }, { "segment": { "parent": 9, "lsn": 26033560, "size": null, "needed": true }, "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", "kind": "BranchEnd" } ], "timeline_inputs": [ { "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", "ancestor_lsn": "0/18D3D98", "last_record": "0/2230CD0", "latest_gc_cutoff": "0/1698C48", "next_pitr_cutoff": "0/2210CD0", "retention_param_cutoff": null, "lease_points": [] }, { "timeline_id": "454626700469f0a9914949b9d018e876", "ancestor_lsn": "0/176D998", "last_record": "0/1837770", "latest_gc_cutoff": "0/1698C48", "next_pitr_cutoff": "0/1817770", "retention_param_cutoff": null, "lease_points": [] }, { "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", "ancestor_lsn": "0/0", "last_record": "0/18D3D98", "latest_gc_cutoff": "0/1698C48", "next_pitr_cutoff": "0/18B3D98", "retention_param_cutoff": null, "lease_points": [] } ] } "#; let inputs: ModelInputs = serde_json::from_str(doc).unwrap(); assert_eq!(inputs.calculate(), 37_851_408); } #[cfg(test)] #[test] fn verify_size_for_one_branch() { let doc = r#" { "segments": [ { "segment": { "parent": null, "lsn": 0, "size": null, "needed": false }, "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", "kind": "BranchStart" }, { "segment": { "parent": 0, "lsn": 305547335776, "size": 220054675456, "needed": false }, "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", "kind": "GcCutOff" }, { "segment": { "parent": 1, "lsn": 305614444640, "size": null, "needed": true }, "timeline_id": 
"f15ae0cf21cce2ba27e4d80c6709a6cd", "kind": "BranchEnd" } ], "timeline_inputs": [ { "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", "ancestor_lsn": "0/0", "last_record": "47/280A5860", "latest_gc_cutoff": "47/240A5860", "next_pitr_cutoff": "47/240A5860", "retention_param_cutoff": "0/0", "lease_points": [] } ] }"#; let model: ModelInputs = serde_json::from_str(doc).unwrap(); let res = model.calculate_model().calculate(); println!("calculated synthetic size: {}", res.total_size); println!("result: {:?}", serde_json::to_string(&res.segments)); use utils::lsn::Lsn; let latest_gc_cutoff_lsn: Lsn = "47/240A5860".parse().unwrap(); let last_lsn: Lsn = "47/280A5860".parse().unwrap(); println!( "latest_gc_cutoff lsn 47/240A5860 is {}, last_lsn lsn 47/280A5860 is {}", u64::from(latest_gc_cutoff_lsn), u64::from(last_lsn) ); assert_eq!(res.total_size, 220121784320); } ================================================ FILE: pageserver/src/tenant/storage_layer/batch_split_writer.rs ================================================ use std::future::Future; use std::ops::Range; use std::sync::Arc; use bytes::Bytes; use pageserver_api::key::{KEY_SIZE, Key}; use tokio_util::sync::CancellationToken; use utils::id::TimelineId; use utils::lsn::Lsn; use utils::shard::TenantShardId; use wal_decoder::models::value::Value; use super::errors::PutError; use super::layer::S3_UPLOAD_LIMIT; use super::{ DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, }; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::tenant::Timeline; use crate::tenant::storage_layer::Layer; pub(crate) enum BatchWriterResult { Produced(ResidentLayer), Discarded(PersistentLayerKey), } #[cfg(test)] impl BatchWriterResult { fn into_resident_layer(self) -> ResidentLayer { match self { BatchWriterResult::Produced(layer) => layer, BatchWriterResult::Discarded(_) => panic!("unexpected discarded layer"), } } fn into_discarded_layer(self) -> PersistentLayerKey 
{ match self { BatchWriterResult::Produced(_) => panic!("unexpected produced layer"), BatchWriterResult::Discarded(layer) => layer, } } } enum LayerWriterWrapper { Image(ImageLayerWriter), Delta(DeltaLayerWriter), } /// An layer writer that takes unfinished layers and finish them atomically. #[must_use] pub struct BatchLayerWriter { generated_layer_writers: Vec<(LayerWriterWrapper, PersistentLayerKey)>, conf: &'static PageServerConf, } impl BatchLayerWriter { pub fn new(conf: &'static PageServerConf) -> Self { Self { generated_layer_writers: Vec::new(), conf, } } pub fn add_unfinished_image_writer( &mut self, writer: ImageLayerWriter, key_range: Range, lsn: Lsn, ) { self.generated_layer_writers.push(( LayerWriterWrapper::Image(writer), PersistentLayerKey { key_range, lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn), is_delta: false, }, )); } pub fn add_unfinished_delta_writer( &mut self, writer: DeltaLayerWriter, key_range: Range, lsn_range: Range, ) { self.generated_layer_writers.push(( LayerWriterWrapper::Delta(writer), PersistentLayerKey { key_range, lsn_range, is_delta: true, }, )); } pub(crate) async fn finish( self, tline: &Arc, ctx: &RequestContext, ) -> anyhow::Result> { let res = self .finish_with_discard_fn(tline, ctx, |_| async { false }) .await?; let mut output = Vec::new(); for r in res { if let BatchWriterResult::Produced(layer) = r { output.push(layer); } } Ok(output) } pub(crate) async fn finish_with_discard_fn( self, tline: &Arc, ctx: &RequestContext, discard_fn: D, ) -> anyhow::Result> where D: Fn(&PersistentLayerKey) -> F, F: Future, { let Self { generated_layer_writers, .. 
} = self; let clean_up_layers = |generated_layers: Vec| { for produced_layer in generated_layers { if let BatchWriterResult::Produced(resident_layer) = produced_layer { let layer: Layer = resident_layer.into(); layer.delete_on_drop(); } } }; // BEGIN: catch every error and do the recovery in the below section let mut generated_layers: Vec = Vec::new(); for (inner, layer_key) in generated_layer_writers { if discard_fn(&layer_key).await { generated_layers.push(BatchWriterResult::Discarded(layer_key)); } else { let res = match inner { LayerWriterWrapper::Delta(writer) => { writer.finish(layer_key.key_range.end, ctx).await } LayerWriterWrapper::Image(writer) => { writer .finish_with_end_key(layer_key.key_range.end, ctx) .await } }; let layer = match res { Ok((desc, path)) => { match Layer::finish_creating(self.conf, tline, desc, &path) { Ok(layer) => layer, Err(e) => { tokio::fs::remove_file(&path).await.ok(); clean_up_layers(generated_layers); return Err(e); } } } Err(e) => { // Image/DeltaLayerWriter::finish will clean up the temporary layer if anything goes wrong, // so we don't need to remove the layer we just failed to create by ourselves. clean_up_layers(generated_layers); return Err(e); } }; generated_layers.push(BatchWriterResult::Produced(layer)); } } // END: catch every error and do the recovery in the above section Ok(generated_layers) } pub fn pending_layer_num(&self) -> usize { self.generated_layer_writers.len() } } /// An image writer that takes images and produces multiple image layers. 
#[must_use] pub struct SplitImageLayerWriter<'a> { inner: Option, target_layer_size: u64, lsn: Lsn, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, batches: BatchLayerWriter, start_key: Key, gate: &'a utils::sync::gate::Gate, cancel: CancellationToken, } impl<'a> SplitImageLayerWriter<'a> { #[allow(clippy::too_many_arguments)] pub fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_key: Key, lsn: Lsn, target_layer_size: u64, gate: &'a utils::sync::gate::Gate, cancel: CancellationToken, ) -> Self { Self { target_layer_size, inner: None, conf, timeline_id, tenant_shard_id, batches: BatchLayerWriter::new(conf), lsn, start_key, gate, cancel, } } pub async fn put_image( &mut self, key: Key, img: Bytes, ctx: &RequestContext, ) -> Result<(), PutError> { if self.inner.is_none() { self.inner = Some( ImageLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, &(self.start_key..Key::MAX), self.lsn, self.gate, self.cancel.clone(), ctx, ) .await .map_err(PutError::Other)?, ); } let inner = self.inner.as_mut().unwrap(); // The current estimation is an upper bound of the space that the key/image could take // because we did not consider compression in this estimation. The resulting image layer // could be smaller than the target size. 
let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64; if inner.num_keys() >= 1 && inner.estimated_size() + addition_size_estimation >= self.target_layer_size { let next_image_writer = ImageLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, &(key..Key::MAX), self.lsn, self.gate, self.cancel.clone(), ctx, ) .await .map_err(PutError::Other)?; let prev_image_writer = std::mem::replace(inner, next_image_writer); self.batches.add_unfinished_image_writer( prev_image_writer, self.start_key..key, self.lsn, ); self.start_key = key; } inner.put_image(key, img, ctx).await } pub(crate) async fn finish_with_discard_fn( self, tline: &Arc, ctx: &RequestContext, end_key: Key, discard_fn: D, ) -> anyhow::Result> where D: Fn(&PersistentLayerKey) -> F, F: Future, { let Self { mut batches, inner, .. } = self; if let Some(inner) = inner { if inner.num_keys() != 0 { batches.add_unfinished_image_writer(inner, self.start_key..end_key, self.lsn); } } batches.finish_with_discard_fn(tline, ctx, discard_fn).await } #[cfg(test)] pub(crate) async fn finish( self, tline: &Arc, ctx: &RequestContext, end_key: Key, ) -> anyhow::Result> { self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false }) .await } } /// A delta writer that takes key-lsn-values and produces multiple delta layers. /// /// Note that if updates of a single key exceed the target size limit, all of the updates will be batched /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm /// will split them into multiple files based on size. 
#[must_use] pub struct SplitDeltaLayerWriter<'a> { inner: Option<(Key, DeltaLayerWriter)>, target_layer_size: u64, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, lsn_range: Range, last_key_written: Key, batches: BatchLayerWriter, gate: &'a utils::sync::gate::Gate, cancel: CancellationToken, } impl<'a> SplitDeltaLayerWriter<'a> { pub fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, lsn_range: Range, target_layer_size: u64, gate: &'a utils::sync::gate::Gate, cancel: CancellationToken, ) -> Self { Self { target_layer_size, inner: None, conf, timeline_id, tenant_shard_id, lsn_range, last_key_written: Key::MIN, batches: BatchLayerWriter::new(conf), gate, cancel, } } pub async fn put_value( &mut self, key: Key, lsn: Lsn, val: Value, ctx: &RequestContext, ) -> Result<(), PutError> { // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate // number, and therefore the final layer size could be a little bit larger or smaller than the target. // // Also, keep all updates of a single key in a single file. TODO: split them using the legacy compaction // strategy. 
https://github.com/neondatabase/neon/issues/8837 if self.inner.is_none() { self.inner = Some(( key, DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, key, self.lsn_range.clone(), self.gate, self.cancel.clone(), ctx, ) .await .map_err(PutError::Other)?, )); } let (_, inner) = self.inner.as_mut().unwrap(); let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */; if inner.num_keys() >= 1 && inner.estimated_size() + addition_size_estimation >= self.target_layer_size { if key != self.last_key_written { let next_delta_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, key, self.lsn_range.clone(), self.gate, self.cancel.clone(), ctx, ) .await .map_err(PutError::Other)?; let (start_key, prev_delta_writer) = self.inner.replace((key, next_delta_writer)).unwrap(); self.batches.add_unfinished_delta_writer( prev_delta_writer, start_key..key, self.lsn_range.clone(), ); } else if inner.estimated_size() >= S3_UPLOAD_LIMIT { // We have to produce a very large file b/c a key is updated too often. return Err(PutError::Other(anyhow::anyhow!( "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced", key, inner.estimated_size() ))); } } self.last_key_written = key; let (_, inner) = self.inner.as_mut().unwrap(); inner.put_value(key, lsn, val, ctx).await } pub(crate) async fn finish_with_discard_fn( self, tline: &Arc, ctx: &RequestContext, discard_fn: D, ) -> anyhow::Result> where D: Fn(&PersistentLayerKey) -> F, F: Future, { let Self { mut batches, inner, .. 
} = self; if let Some((start_key, writer)) = inner { if writer.num_keys() != 0 { let end_key = self.last_key_written.next(); batches.add_unfinished_delta_writer( writer, start_key..end_key, self.lsn_range.clone(), ); } } batches.finish_with_discard_fn(tline, ctx, discard_fn).await } #[cfg(test)] pub(crate) async fn finish( self, tline: &Arc, ctx: &RequestContext, ) -> anyhow::Result> { self.finish_with_discard_fn(tline, ctx, |_| async { false }) .await } } #[cfg(test)] mod tests { use itertools::Itertools; use rand::{RngCore, SeedableRng}; use super::*; use crate::DEFAULT_PG_VERSION; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::AsLayerDesc; fn get_key(id: u32) -> Key { let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); key.field6 = id; key } fn get_img(id: u32) -> Bytes { format!("{id:064}").into() } fn get_large_img() -> Bytes { let mut rng = rand::rngs::SmallRng::seed_from_u64(42); let mut data = vec![0; 8192]; rng.fill_bytes(&mut data); data.into() } #[tokio::test] async fn write_one_image() { let harness = TenantHarness::create("split_writer_write_one_image") .await .unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await .unwrap(); let mut image_writer = SplitImageLayerWriter::new( tenant.conf, tline.timeline_id, tenant.tenant_shard_id, get_key(0), Lsn(0x18), 4 * 1024 * 1024, &tline.gate, tline.cancel.clone(), ); let mut delta_writer = SplitDeltaLayerWriter::new( tenant.conf, tline.timeline_id, tenant.tenant_shard_id, Lsn(0x18)..Lsn(0x20), 4 * 1024 * 1024, &tline.gate, tline.cancel.clone(), ); image_writer .put_image(get_key(0), get_img(0), &ctx) .await .unwrap(); let layers = image_writer .finish(&tline, &ctx, get_key(10)) .await .unwrap(); assert_eq!(layers.len(), 1); delta_writer .put_value(get_key(0), Lsn(0x18), Value::Image(get_img(0)), &ctx) .await .unwrap(); let layers = 
delta_writer.finish(&tline, &ctx).await.unwrap(); assert_eq!(layers.len(), 1); assert_eq!( layers .into_iter() .next() .unwrap() .into_resident_layer() .layer_desc() .key(), PersistentLayerKey { key_range: get_key(0)..get_key(1), lsn_range: Lsn(0x18)..Lsn(0x20), is_delta: true } ); } #[tokio::test] async fn write_split() { // Test the split writer with retaining all the layers we have produced (discard=false) write_split_helper("split_writer_write_split", false).await; } #[tokio::test] async fn write_split_discard() { // Test the split writer with discarding all the layers we have produced (discard=true) write_split_helper("split_writer_write_split_discard", true).await; } /// Test the image+delta writer by writing a large number of images and deltas. If discard is /// set to true, all layers will be discarded. async fn write_split_helper(harness_name: &'static str, discard: bool) { let harness = TenantHarness::create(harness_name).await.unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await .unwrap(); let mut image_writer = SplitImageLayerWriter::new( tenant.conf, tline.timeline_id, tenant.tenant_shard_id, get_key(0), Lsn(0x18), 4 * 1024 * 1024, &tline.gate, tline.cancel.clone(), ); let mut delta_writer = SplitDeltaLayerWriter::new( tenant.conf, tline.timeline_id, tenant.tenant_shard_id, Lsn(0x18)..Lsn(0x20), 4 * 1024 * 1024, &tline.gate, tline.cancel.clone(), ); const N: usize = 2000; for i in 0..N { let i = i as u32; image_writer .put_image(get_key(i), get_large_img(), &ctx) .await .unwrap(); delta_writer .put_value(get_key(i), Lsn(0x20), Value::Image(get_large_img()), &ctx) .await .unwrap(); } let image_layers = image_writer .finish_with_discard_fn(&tline, &ctx, get_key(N as u32), |_| async { discard }) .await .unwrap(); let delta_layers = delta_writer .finish_with_discard_fn(&tline, &ctx, |_| async { discard }) .await .unwrap(); let image_layers = image_layers 
.into_iter()
            // Reduce every produced layer to its key, whichever variant it ended up as.
            .map(|x| {
                if discard {
                    x.into_discarded_layer()
                } else {
                    x.into_resident_layer().layer_desc().key()
                }
            })
            .collect_vec();
        let delta_layers = delta_layers
            .into_iter()
            .map(|x| {
                if discard {
                    x.into_discarded_layer()
                } else {
                    x.into_resident_layer().layer_desc().key()
                }
            })
            .collect_vec();
        // 4 MiB target size / 8192-byte values = 512 values per layer, so N values
        // are expected to yield N / 512 + 1 layers.
        assert_eq!(image_layers.len(), N / 512 + 1);
        assert_eq!(delta_layers.len(), N / 512 + 1);
        assert_eq!(delta_layers.first().unwrap().key_range.start, get_key(0));
        assert_eq!(
            delta_layers.last().unwrap().key_range.end,
            get_key(N as u32)
        );
        for idx in 0..image_layers.len() {
            // No layer may fall back to the extreme key bounds ...
            assert_ne!(image_layers[idx].key_range.start, Key::MIN);
            assert_ne!(image_layers[idx].key_range.end, Key::MAX);
            assert_ne!(delta_layers[idx].key_range.start, Key::MIN);
            assert_ne!(delta_layers[idx].key_range.end, Key::MAX);
            // ... and consecutive layers must be contiguous in key space.
            if idx > 0 {
                assert_eq!(
                    image_layers[idx - 1].key_range.end,
                    image_layers[idx].key_range.start
                );
                assert_eq!(
                    delta_layers[idx - 1].key_range.end,
                    delta_layers[idx].key_range.start
                );
            }
        }
    }

    /// Uses a 4 KiB target size and writes one small value followed by one
    /// 8192-byte value; two layers are expected from each writer.
    #[tokio::test]
    async fn write_large_img() {
        let harness = TenantHarness::create("split_writer_write_large_img")
            .await
            .unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await
            .unwrap();

        let mut image_writer = SplitImageLayerWriter::new(
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
            get_key(0),
            Lsn(0x18),
            4 * 1024,
            &tline.gate,
            tline.cancel.clone(),
        );

        let mut delta_writer = SplitDeltaLayerWriter::new(
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
            Lsn(0x18)..Lsn(0x20),
            4 * 1024,
            &tline.gate,
            tline.cancel.clone(),
        );

        image_writer
            .put_image(get_key(0), get_img(0), &ctx)
            .await
            .unwrap();
        image_writer
            .put_image(get_key(1), get_large_img(), &ctx)
            .await
            .unwrap();
        let layers = image_writer
            .finish(&tline, &ctx, get_key(10))
            .await
            .unwrap();
        assert_eq!(layers.len(), 2);

        delta_writer
            .put_value(get_key(0), Lsn(0x18), Value::Image(get_img(0)), &ctx)
            .await
            .unwrap();
        delta_writer
.put_value(get_key(1), Lsn(0x1A), Value::Image(get_large_img()), &ctx)
            .await
            .unwrap();
        let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
        assert_eq!(layers.len(), 2);
        let mut layers_iter = layers.into_iter();
        // First layer: only key 0.
        assert_eq!(
            layers_iter
                .next()
                .unwrap()
                .into_resident_layer()
                .layer_desc()
                .key(),
            PersistentLayerKey {
                key_range: get_key(0)..get_key(1),
                lsn_range: Lsn(0x18)..Lsn(0x20),
                is_delta: true
            }
        );
        // Second layer: only key 1, same LSN range.
        assert_eq!(
            layers_iter
                .next()
                .unwrap()
                .into_resident_layer()
                .layer_desc()
                .key(),
            PersistentLayerKey {
                key_range: get_key(1)..get_key(2),
                lsn_range: Lsn(0x18)..Lsn(0x20),
                is_delta: true
            }
        );
    }

    /// Writes N versions of one key (8192 bytes each, LSNs 16 apart) and expects
    /// them all to land in a single delta layer, even though the total exceeds
    /// the 4 MiB target size.
    #[tokio::test]
    async fn write_split_single_key() {
        let harness = TenantHarness::create("split_writer_write_split_single_key")
            .await
            .unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await
            .unwrap();

        const N: usize = 2000;

        let mut delta_writer = SplitDeltaLayerWriter::new(
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
            Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
            4 * 1024 * 1024,
            &tline.gate,
            tline.cancel.clone(),
        );

        for i in 0..N {
            let i = i as u32;
            delta_writer
                .put_value(
                    get_key(0),
                    Lsn(i as u64 * 16 + 0x10),
                    Value::Image(get_large_img()),
                    &ctx,
                )
                .await
                .unwrap();
        }
        let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap();
        assert_eq!(delta_layers.len(), 1);

        let delta_layer = delta_layers
            .into_iter()
            .next()
            .unwrap()
            .into_resident_layer();
        assert_eq!(
            delta_layer.layer_desc().key(),
            PersistentLayerKey {
                key_range: get_key(0)..get_key(1),
                lsn_range: Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
                is_delta: true
            }
        );
    }
}

================================================
FILE: pageserver/src/tenant/storage_layer/delta_layer.rs
================================================
//! A DeltaLayer represents a collection of WAL records or page images in a range of
//! LSNs, and in a range of Keys. It is stored on a file on disk.
//!
//! 
Usually a delta layer only contains differences, in the form of WAL records //! against a base LSN. However, if a relation is extended or a whole new relation //! is created, there would be no base for the new pages. The entries for them //! must be page images or WAL records with the 'will_init' flag set, so that //! they can be replayed without referring to an older page version. //! //! The delta files are stored in the `timelines/<timeline_id>` directory. Currently, //! there are no subdirectories, and each delta file is named like this: //! //! ```text //! <key start>-<key end>__<start LSN>-<end LSN> //! ``` //! //! For example: //! //! ```text //! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051 //! ``` //! //! Every delta file consists of three parts: "summary", "values", and //! "index". The summary is a fixed size header at the beginning of the file, //! and it contains basic information about the layer, and offsets to the other //! parts. The "index" is a B-tree, mapping from Key and LSN to an offset in the //! "values" part. The actual page images and WAL records are stored in the //! "values" part. //! 
use std::collections::{HashMap, VecDeque}; use std::fs::File; use std::ops::Range; use std::os::unix::fs::FileExt; use std::str::FromStr; use std::sync::Arc; use std::sync::atomic::AtomicU64; use anyhow::{Context, Result, bail, ensure}; use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; use tokio::sync::OnceCell; use tokio_epoll_uring::IoBuf; use tokio_util::sync::CancellationToken; use tracing::*; use utils::bin_ser::BeSer; use utils::bin_ser::SerializeError; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use wal_decoder::models::value::Value; use super::errors::PutError; use super::{ AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, }; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; use crate::virtual_file::TempVirtualFile; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::owned_buffers_io::write::{Buffer, BufferedWriterShutdownMode}; use crate::virtual_file::{self, IoBuffer, IoBufferMut, MaybeFatalIo, VirtualFile}; use 
crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};

///
/// Header stored in the beginning of the file
///
/// After this comes the 'values' part, starting on block 1. After that,
/// the 'index' starts at the block indicated by 'index_start_blk'
///
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct Summary {
    /// Magic value to identify this as a neon delta file. Always DELTA_FILE_MAGIC.
    pub magic: u16,
    /// Storage format version; the constructors in this file set it to
    /// STORAGE_FORMAT_VERSION.
    pub format_version: u16,

    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    /// Key range covered by the layer.
    pub key_range: Range,
    /// LSN range covered by the layer.
    pub lsn_range: Range,

    /// Block number where the 'index' part of the file begins.
    pub index_start_blk: u32,
    /// Block within the 'index', where the B-tree root page is stored
    pub index_root_blk: u32,
}

impl From<&DeltaLayer> for Summary {
    /// Builds the expected summary from the identity fields of the layer's
    /// descriptor (tenant, timeline, key range, LSN range).
    fn from(layer: &DeltaLayer) -> Self {
        Self::expected(
            layer.desc.tenant_shard_id.tenant_id,
            layer.desc.timeline_id,
            layer.desc.key_range.clone(),
            layer.desc.lsn_range.clone(),
        )
    }
}

impl Summary {
    /// Serializes the summary header into an aligned buffer of length `PAGE_SZ`.
    pub fn ser_into_page(&self) -> Result {
        let mut buf = IoBufferMut::with_capacity(PAGE_SZ);
        Self::ser_into(self, &mut buf)?;
        // Pad zeroes to the buffer so the length is a multiple of the alignment.
        // (The padding fills the buffer up to its full capacity.)
        buf.extend_with(0, buf.capacity() - buf.len());
        Ok(buf.freeze())
    }

    /// Builds the summary a delta layer with the given identity is expected to
    /// carry. Both index block numbers are set to 0 here; `DeltaLayerInner::load`
    /// overwrites them with the on-disk values before comparing.
    pub(super) fn expected(
        tenant_id: TenantId,
        timeline_id: TimelineId,
        keys: Range,
        lsns: Range,
    ) -> Self {
        Self {
            magic: DELTA_FILE_MAGIC,
            format_version: STORAGE_FORMAT_VERSION,

            tenant_id,
            timeline_id,
            key_range: keys,
            lsn_range: lsns,

            index_start_blk: 0,
            index_root_blk: 0,
        }
    }
}

// Flag indicating that this version initializes the page.
// It occupies the lowest bit of a `BlobRef`.
const WILL_INIT: u64 = 1;

/// Struct representing reference to BLOB in layers.
///
/// Reference contains BLOB offset, and for WAL records it also contains
/// `will_init` flag. The flag helps to determine the range of records
/// that needs to be applied, without reading/deserializing records themselves.
#[derive(Debug, Serialize, Deserialize, Copy, Clone)] pub struct BlobRef(pub u64); impl BlobRef { pub fn will_init(&self) -> bool { (self.0 & WILL_INIT) != 0 } pub fn pos(&self) -> u64 { self.0 >> 1 } pub fn new(pos: u64, will_init: bool) -> BlobRef { let mut blob_ref = pos << 1; if will_init { blob_ref |= WILL_INIT; } BlobRef(blob_ref) } } pub const DELTA_KEY_SIZE: usize = KEY_SIZE + 8; struct DeltaKey([u8; DELTA_KEY_SIZE]); /// This is the key of the B-tree index stored in the delta layer. It consists /// of the serialized representation of a Key and LSN. impl DeltaKey { fn from_slice(buf: &[u8]) -> Self { let mut bytes: [u8; DELTA_KEY_SIZE] = [0u8; DELTA_KEY_SIZE]; bytes.copy_from_slice(buf); DeltaKey(bytes) } fn from_key_lsn(key: &Key, lsn: Lsn) -> Self { let mut bytes: [u8; DELTA_KEY_SIZE] = [0u8; DELTA_KEY_SIZE]; key.write_to_byte_slice(&mut bytes[0..KEY_SIZE]); bytes[KEY_SIZE..].copy_from_slice(&u64::to_be_bytes(lsn.0)); DeltaKey(bytes) } fn key(&self) -> Key { Key::from_slice(&self.0) } fn lsn(&self) -> Lsn { Lsn(u64::from_be_bytes(self.0[KEY_SIZE..].try_into().unwrap())) } fn extract_lsn_from_buf(buf: &[u8]) -> Lsn { let mut lsn_buf = [0u8; 8]; lsn_buf.copy_from_slice(&buf[KEY_SIZE..]); Lsn(u64::from_be_bytes(lsn_buf)) } } /// This is used only from `pagectl`. Within pageserver, all layers are /// [`crate::tenant::storage_layer::Layer`], which can hold a [`DeltaLayerInner`]. pub struct DeltaLayer { path: Utf8PathBuf, pub desc: PersistentLayerDesc, inner: OnceCell>, } impl std::fmt::Debug for DeltaLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use super::RangeDisplayDebug; f.debug_struct("DeltaLayer") .field("key_range", &RangeDisplayDebug(&self.desc.key_range)) .field("lsn_range", &self.desc.lsn_range) .field("file_size", &self.desc.file_size) .field("inner", &self.inner) .finish() } } /// `DeltaLayerInner` is the in-memory data structure associated with an on-disk delta /// file. 
pub struct DeltaLayerInner {
    // values copied from summary
    index_start_blk: u32,
    index_root_blk: u32,

    // Handle to the opened layer file and the id `load` obtained for it from
    // `page_cache::next_file_id()`.
    file: Arc,
    file_id: FileId,

    // Key and LSN ranges, taken from the in-file summary by `load`.
    layer_key_range: Range,
    layer_lsn_range: Range,

    // The read path (`get_values_reconstruct_data`) `expect`s this to be set.
    max_vectored_read_bytes: Option,
}

impl DeltaLayerInner {
    /// Returns a one-line description of the form
    /// `delta <key start>..<key end> <lsn start>..<lsn end>`.
    pub(crate) fn layer_dbg_info(&self) -> String {
        format!(
            "delta {}..{} {}..{}",
            self.key_range().start,
            self.key_range().end,
            self.lsn_range().start,
            self.lsn_range().end
        )
    }
}

impl std::fmt::Debug for DeltaLayerInner {
    /// Only the two index block numbers are printed.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
            .field("index_start_blk", &self.index_start_blk)
            .field("index_root_blk", &self.index_root_blk)
            .finish()
    }
}

/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
impl std::fmt::Display for DeltaLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.layer_desc().short_id())
    }
}

impl AsLayerDesc for DeltaLayer {
    fn layer_desc(&self) -> &PersistentLayerDesc {
        &self.desc
    }
}

impl DeltaLayer {
    /// Dumps the layer descriptor; with `verbose` set, also loads the layer and
    /// delegates to the inner layer's `dump`.
    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        self.desc.dump();

        if !verbose {
            return Ok(());
        }

        let inner = self.load(ctx).await?;

        inner.dump(ctx).await
    }

    /// Builds the path of a temporary file for a layer under construction. The
    /// file lives in the timeline directory and is named from the start key,
    /// the LSN range, a process-wide counter and `TEMP_FILE_SUFFIX`.
    fn temp_path_for(
        conf: &PageServerConf,
        tenant_shard_id: &TenantShardId,
        timeline_id: &TimelineId,
        key_start: Key,
        lsn_range: &Range,
    ) -> Utf8PathBuf {
        // TempVirtualFile requires us to never reuse a filename while an old
        // instance of TempVirtualFile created with that filename is not done dropping yet.
        // So, we use a monotonic counter to disambiguate the filenames.
static NEXT_TEMP_DISAMBIGUATOR: AtomicU64 = AtomicU64::new(1);
        let filename_disambiguator =
            NEXT_TEMP_DISAMBIGUATOR.fetch_add(1, std::sync::atomic::Ordering::Relaxed);

        // The end key is not known yet, hence the literal `XXX` placeholder.
        conf.timeline_path(tenant_shard_id, timeline_id)
            .join(format!(
                "{}-XXX__{:016X}-{:016X}.{:x}.{}",
                key_start,
                u64::from(lsn_range.start),
                u64::from(lsn_range.end),
                filename_disambiguator,
                TEMP_FILE_SUFFIX,
            ))
    }

    ///
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
    async fn load(&self, ctx: &RequestContext) -> Result<&Arc> {
        // Quick exit if already loaded
        self.inner
            .get_or_try_init(|| self.load_inner(ctx))
            .await
            .with_context(|| format!("Failed to load delta layer {}", self.path()))
    }

    /// Loads the inner layer from `self.path()`, passing `None` for both the
    /// expected summary and the vectored-read limit. Afterwards it prints a
    /// warning to stdout if the layer name parsed from the file name differs
    /// from the one derived from the descriptor.
    ///
    /// Panics if the path has no file name or the file name does not parse as a
    /// `LayerName`.
    async fn load_inner(&self, ctx: &RequestContext) -> anyhow::Result> {
        let path = self.path();

        let loaded = DeltaLayerInner::load(&path, None, None, ctx).await?;

        // not production code
        let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
        let expected_layer_name = self.layer_desc().layer_name();

        if actual_layer_name != expected_layer_name {
            println!("warning: filename does not match what is expected from in-file summary");
            println!("actual: {:?}", actual_layer_name.to_string());
            println!("expected: {:?}", expected_layer_name.to_string());
        }

        Ok(Arc::new(loaded))
    }

    /// Create a DeltaLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
    ///
    /// Only the first `PAGE_SZ` bytes (the summary) and the file metadata are
    /// read here; the inner state is left unloaded.
    pub fn new_for_path(path: &Utf8Path, file: File) -> Result {
        let mut summary_buf = vec![0; PAGE_SZ];
        file.read_exact_at(&mut summary_buf, 0)?;
        let summary = Summary::des_prefix(&summary_buf)?;

        let metadata = file
            .metadata()
            .context("get file metadata to determine size")?;

        // This function is never used for constructing layers in a running pageserver,
        // so it does not need an accurate TenantShardId.
let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);

        // The descriptor is built entirely from the in-file summary plus the
        // file length reported by the metadata.
        Ok(DeltaLayer {
            path: path.to_path_buf(),
            desc: PersistentLayerDesc::new_delta(
                tenant_shard_id,
                summary.timeline_id,
                summary.key_range,
                summary.lsn_range,
                metadata.len(),
            ),
            inner: OnceCell::new(),
        })
    }

    /// Path to the layer file in pageserver workdir.
    fn path(&self) -> Utf8PathBuf {
        self.path.clone()
    }
}

/// A builder object for constructing a new delta layer.
///
/// Usage:
///
/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...)
///
/// 2. Write the contents by calling `put_value` for every page
///    version to store in the layer.
///
/// 3. Call `finish`.
///
struct DeltaLayerWriterInner {
    // Path of the temporary file being written (see `DeltaLayer::temp_path_for`).
    pub path: Utf8PathBuf,
    timeline_id: TimelineId,
    tenant_shard_id: TenantShardId,

    key_start: Key,
    lsn_range: Range,

    // Builder for the B-tree index, keyed by the serialized (key, lsn) pair.
    tree: DiskBtreeBuilder,

    // Writer for the 'values' part of the file.
    blob_writer: BlobWriter,

    // Number of key-lsns in the layer.
    num_keys: usize,
}

impl DeltaLayerWriterInner {
    ///
    /// Start building a new delta layer.
    ///
    #[allow(clippy::too_many_arguments)]
    async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        key_start: Key,
        lsn_range: Range,
        gate: &utils::sync::gate::Gate,
        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result {
        // Create the file initially with a temporary filename. We don't know
        // the end key yet, so we cannot form the final filename yet. We will
        // rename it when we're done.
let path =
            DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range);
        // `create_new(true)` makes the open fail if the path already exists.
        // A gate guard is entered and handed to the temp-file wrapper.
        let file = TempVirtualFile::new(
            VirtualFile::open_with_options_v2(
                &path,
                virtual_file::OpenOptions::new()
                    .create_new(true)
                    .write(true),
                ctx,
            )
            .await?,
            gate.enter()?,
        );

        // Start at PAGE_SZ, make room for the header block
        let blob_writer = BlobWriter::new(
            file,
            PAGE_SZ as u64,
            gate,
            cancel,
            ctx,
            info_span!(parent: None, "delta_layer_writer_flush_task", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id, path = %path),
        )?;

        // Initialize the b-tree index builder
        let block_buf = BlockBuf::new();
        let tree_builder = DiskBtreeBuilder::new(block_buf);

        Ok(Self {
            path,
            timeline_id,
            tenant_shard_id,
            key_start,
            lsn_range,
            tree: tree_builder,
            blob_writer,
            num_keys: 0,
        })
    }

    ///
    /// Append a key-value pair to the file.
    ///
    /// The values must be appended in key, lsn order.
    ///
    /// The value is serialized with `Value::ser` and handed to
    /// `put_value_bytes`; a serialization failure is reported as
    /// `PutError::Other`.
    ///
    async fn put_value(
        &mut self,
        key: Key,
        lsn: Lsn,
        val: Value,
        ctx: &RequestContext,
    ) -> Result<(), PutError> {
        let (_, res) = self
            .put_value_bytes(
                key,
                lsn,
                Value::ser(&val)
                    .map_err(anyhow::Error::new)
                    .map_err(PutError::Other)?
.slice_len(),
                val.will_init(),
                ctx,
            )
            .await;
        res
    }

    /// Appends an already serialized value.
    ///
    /// The bytes are written as an uncompressed blob, then an index entry for
    /// `(key, lsn)` is appended that records the blob offset and `will_init`.
    /// The buffer is handed back to the caller together with the result.
    ///
    /// Panics if `lsn` is below the start of the writer's LSN range.
    async fn put_value_bytes(
        &mut self,
        key: Key,
        lsn: Lsn,
        val: FullSlice,
        will_init: bool,
        ctx: &RequestContext,
    ) -> (FullSlice, Result<(), PutError>)
    where
        Buf: IoBuf + Send,
    {
        assert!(
            self.lsn_range.start <= lsn,
            "lsn_start={}, lsn={}",
            self.lsn_range.start,
            lsn
        );
        // We don't want to use compression in delta layer creation
        let compression = ImageCompressionAlgorithm::Disabled;
        let (val, res) = self
            .blob_writer
            .write_blob_maybe_compressed(val, ctx, compression)
            .await;
        let res = res.map_err(PutError::WriteBlob);
        // A failed blob write returns early, before the index is touched.
        let off = match res {
            Ok((off, _)) => off,
            Err(e) => return (val, Err(e)),
        };

        let blob_ref = BlobRef::new(off, will_init);

        let delta_key = DeltaKey::from_key_lsn(&key, lsn);
        let res = self
            .tree
            .append(&delta_key.0, blob_ref.0)
            .map_err(anyhow::Error::new)
            .map_err(PutError::Other);

        // NOTE(review): the counter is bumped even when the index append above
        // returned an error — confirm this is intended.
        self.num_keys += 1;

        (val, res)
    }

    /// Returns the blob writer's size plus the index writer's size.
    fn size(&self) -> u64 {
        self.blob_writer.size() + self.tree.borrow_writer().size()
    }

    ///
    /// Finish writing the delta layer.
///
    async fn finish(
        self,
        key_end: Key,
        ctx: &RequestContext,
    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
        // The index starts on the first page boundary at or after the end of the values.
        let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32;

        // Shut the blob writer down, zero-padding the values part to a multiple
        // of PAGE_SZ, and take the file back for the positional writes below.
        let file = self
            .blob_writer
            .shutdown(
                BufferedWriterShutdownMode::ZeroPadToNextMultiple(PAGE_SZ),
                ctx,
            )
            .await?;

        // Write out the index
        let (index_root_blk, block_buf) = self.tree.finish()?;
        let mut offset = index_start_blk as u64 * PAGE_SZ as u64;

        // TODO(yuchen): https://github.com/neondatabase/neon/issues/10092
        // Should we just replace BlockBuf::blocks with one big buffer
        for buf in block_buf.blocks {
            // Index blocks are written back to back, one PAGE_SZ apart.
            let (_buf, res) = file.write_all_at(buf.slice_len(), offset, ctx).await;
            res?;
            offset += PAGE_SZ as u64;
        }
        // An empty or inverted LSN range is a programming error.
        assert!(self.lsn_range.start < self.lsn_range.end);
        // Fill in the summary on blk 0
        let summary = Summary {
            magic: DELTA_FILE_MAGIC,
            format_version: STORAGE_FORMAT_VERSION,
            tenant_id: self.tenant_shard_id.tenant_id,
            timeline_id: self.timeline_id,
            key_range: self.key_start..key_end,
            lsn_range: self.lsn_range.clone(),
            index_start_blk,
            index_root_blk,
        };

        // Writes summary at the first block (offset 0).
        let buf = summary.ser_into_page()?;
        let (_buf, res) = file.write_all_at(buf.slice_len(), 0, ctx).await;
        res?;

        let metadata = file
            .metadata()
            .await
            .context("get file metadata to determine size")?;

        // 5GB limit for objects without multipart upload (which we don't want to use)
        // Make it a little bit below to account for differing GB units
        // https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html
        ensure!(
            metadata.len() <= S3_UPLOAD_LIMIT,
            "Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
            file.path(),
            metadata.len()
        );

        // Note: Because we opened the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
let desc = PersistentLayerDesc::new_delta(
            self.tenant_shard_id,
            self.timeline_id,
            self.key_start..key_end,
            self.lsn_range.clone(),
            metadata.len(),
        );

        // fsync the file
        file.sync_all()
            .await
            .maybe_fatal_err("delta_layer sync_all")?;

        trace!("created delta layer {}", self.path);

        // The gate guard stored in `destination_file` is dropped. Callers (e.g. flush loop or compaction)
        // keep the gate open also, so that it's safe for them to rename the file to its final destination.
        file.disarm_into_inner();

        // The returned path is still the temporary one; renaming is left to the caller.
        Ok((desc, self.path))
    }
}

/// A builder object for constructing a new delta layer.
///
/// Usage:
///
/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...)
///
/// 2. Write the contents by calling `put_value` for every page
///    version to store in the layer.
///
/// 3. Call `finish`.
///
/// # Note
///
/// As described in the linked discussion (NOTE(review): the link target is
/// missing from this comment — restore it), it's
/// possible for the writer to drop before `finish` is actually called. So this
/// could lead to odd temporary files in the directory, exhausting file system.
/// This structure wraps `DeltaLayerWriterInner` and also contains `Drop`
/// implementation that cleans up the temporary file in failure. It's not
/// possible to do this directly in `DeltaLayerWriterInner` since `finish` moves
/// out some fields, making it impossible to implement `Drop`.
///
#[must_use]
pub struct DeltaLayerWriter {
    // `Some` until `finish` takes it; every accessor unwraps it.
    inner: Option,
}

impl DeltaLayerWriter {
    ///
    /// Start building a new delta layer.
    ///
    #[allow(clippy::too_many_arguments)]
    pub async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        key_start: Key,
        lsn_range: Range,
        gate: &utils::sync::gate::Gate,
        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result {
        Ok(Self {
            inner: Some(
                DeltaLayerWriterInner::new(
                    conf,
                    timeline_id,
                    tenant_shard_id,
                    key_start,
                    lsn_range,
                    gate,
                    cancel,
                    ctx,
                )
                .await?,
            ),
        })
    }

    /// Returns `true` while no key-lsn pair has been counted yet.
    pub fn is_empty(&self) -> bool {
        self.inner.as_ref().unwrap().num_keys == 0
    }

    ///
    /// Append a key-value pair to the file.
///
    /// The values must be appended in key, lsn order.
    ///
    pub async fn put_value(
        &mut self,
        key: Key,
        lsn: Lsn,
        val: Value,
        ctx: &RequestContext,
    ) -> Result<(), PutError> {
        self.inner
            .as_mut()
            .unwrap()
            .put_value(key, lsn, val, ctx)
            .await
    }

    /// Appends an already serialized value; forwards to the inner writer's
    /// `put_value_bytes` and returns the buffer along with the result.
    pub async fn put_value_bytes(
        &mut self,
        key: Key,
        lsn: Lsn,
        val: FullSlice,
        will_init: bool,
        ctx: &RequestContext,
    ) -> (FullSlice, Result<(), PutError>)
    where
        Buf: IoBuf + Send,
    {
        self.inner
            .as_mut()
            .unwrap()
            .put_value_bytes(key, lsn, val, will_init, ctx)
            .await
    }

    /// Returns the inner writer's `size()` (values plus index).
    pub fn size(&self) -> u64 {
        self.inner.as_ref().unwrap().size()
    }

    ///
    /// Finish writing the delta layer.
    ///
    pub(crate) async fn finish(
        mut self,
        key_end: Key,
        ctx: &RequestContext,
    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
        // Take the inner writer out so that it can be consumed by value.
        self.inner.take().unwrap().finish(key_end, ctx).await
    }

    /// Number of key-lsn pairs counted by the inner writer so far.
    pub(crate) fn num_keys(&self) -> usize {
        self.inner.as_ref().unwrap().num_keys
    }

    /// Same sum as `size()` plus one `PAGE_SZ` — presumably accounting for the
    /// summary page; confirm against callers.
    pub(crate) fn estimated_size(&self) -> u64 {
        let inner = self.inner.as_ref().unwrap();
        inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
    }
}

/// Errors returned by [`DeltaLayer::rewrite_summary`].
#[derive(thiserror::Error, Debug)]
pub enum RewriteSummaryError {
    /// The on-disk summary's magic is not `DELTA_FILE_MAGIC`.
    #[error("magic mismatch")]
    MagicMismatch,
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

impl From for RewriteSummaryError {
    /// I/O errors are wrapped into the `Other` variant.
    fn from(e: std::io::Error) -> Self {
        Self::Other(anyhow::anyhow!(e))
    }
}

impl DeltaLayer {
    /// Rewrites the summary block of the delta layer file at `path` in place.
    ///
    /// Block 0 is read and deserialized, `rewrite` is applied to the resulting
    /// summary, and the returned summary is serialized and written back at
    /// offset 0. Returns `MagicMismatch` without writing anything if the
    /// existing summary does not carry `DELTA_FILE_MAGIC`.
    pub async fn rewrite_summary(
        path: &Utf8Path,
        rewrite: F,
        ctx: &RequestContext,
    ) -> Result<(), RewriteSummaryError>
    where
        F: Fn(Summary) -> Summary,
    {
        let file = VirtualFile::open_with_options_v2(
            path,
            virtual_file::OpenOptions::new().read(true).write(true),
            ctx,
        )
        .await
        .with_context(|| format!("Failed to open file '{path}'"))?;
        let file_id = page_cache::next_file_id();
        let block_reader = FileBlockReader::new(&file, file_id);
        let summary_blk = block_reader.read_blk(0, ctx).await?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
        if actual_summary.magic != DELTA_FILE_MAGIC {
            return
Err(RewriteSummaryError::MagicMismatch);
        }

        let new_summary = rewrite(actual_summary);

        // Overwrite block 0 with the rewritten summary.
        let buf = new_summary.ser_into_page().context("serialize")?;
        let (_buf, res) = file.write_all_at(buf.slice_len(), 0, ctx).await;
        res?;
        Ok(())
    }
}

impl DeltaLayerInner {
    /// Key range recorded in the in-file summary.
    pub(crate) fn key_range(&self) -> &Range {
        &self.layer_key_range
    }

    /// LSN range recorded in the in-file summary.
    pub(crate) fn lsn_range(&self) -> &Range {
        &self.layer_lsn_range
    }

    /// Opens the layer file at `path` and reads its summary from block 0.
    ///
    /// When `summary` is given, it is compared against the in-file summary
    /// after copying over the two index block numbers and the timeline id from
    /// the file; any remaining difference is an error.
    /// `max_vectored_read_bytes` is stored as-is for later reads.
    pub(super) async fn load(
        path: &Utf8Path,
        summary: Option,
        max_vectored_read_bytes: Option,
        ctx: &RequestContext,
    ) -> anyhow::Result {
        let file = Arc::new(
            VirtualFile::open_v2(path, ctx)
                .await
                .context("open layer file")?,
        );

        let file_id = page_cache::next_file_id();

        let block_reader = FileBlockReader::new(&file, file_id);

        let summary_blk = block_reader
            .read_blk(0, ctx)
            .await
            .context("read first block")?;

        // TODO: this should be an assertion instead; see ImageLayerInner::load
        let actual_summary =
            Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;

        if let Some(mut expected_summary) = summary {
            // production code path
            expected_summary.index_start_blk = actual_summary.index_start_blk;
            expected_summary.index_root_blk = actual_summary.index_root_blk;
            // mask out the timeline_id, but still require the layers to be from the same tenant
            expected_summary.timeline_id = actual_summary.timeline_id;

            if actual_summary != expected_summary {
                bail!(
                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
                    actual_summary,
                    expected_summary
                );
            }
        }

        Ok(DeltaLayerInner {
            file,
            file_id,
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
            max_vectored_read_bytes,
            layer_key_range: actual_summary.key_range,
            layer_lsn_range: actual_summary.lsn_range,
        })
    }

    // Look up the keys in the provided keyspace and update
    // the reconstruct state with whatever is found.
    //
    // Currently, the index is visited for each range, but this
    // can be further optimised to visit the index only once.
pub(super) async fn get_values_reconstruct_data(
        &self,
        this: ResidentLayer,
        keyspace: KeySpace,
        lsn_range: Range,
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
        let block_reader = FileBlockReader::new(&self.file, self.file_id);
        let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            self.index_start_blk,
            self.index_root_blk,
            block_reader,
        );

        // Panics if the layer was loaded without a vectored-read size limit.
        let planner = VectoredReadPlanner::new(
            self.max_vectored_read_bytes
                .expect("Layer is loaded with max vectored bytes config")
                .0
                .into(),
        );

        // The values part ends where the index starts.
        let data_end_offset = self.index_start_offset();

        // Step 1: walk the index and plan the reads. Planning failures are
        // reported as `GetVectoredError::Other`.
        let reads = Self::plan_reads(
            &keyspace,
            lsn_range.clone(),
            data_end_offset,
            index_reader,
            planner,
            ctx,
        )
        .await
        .map_err(GetVectoredError::Other)?;

        // Step 2: register the planned reads with the reconstruct state.
        self.do_reads_and_update_state(this, reads, reconstruct_state, ctx)
            .await;

        Ok(())
    }

    /// Walks the index once per range of `keyspace`, starting at
    /// `(range.start, lsn_range.start)`, and feeds every entry to `planner`
    /// until the end of the range is detected. Returns the reads produced by
    /// `planner.finish()`.
    ///
    /// `data_end_offset` is passed to `handle_range_end` when the index stream
    /// ends before an entry past the range is seen.
    async fn plan_reads(
        keyspace: &KeySpace,
        lsn_range: Range,
        data_end_offset: u64,
        index_reader: DiskBtreeReader,
        mut planner: VectoredReadPlanner,
        ctx: &RequestContext,
    ) -> anyhow::Result>
    where
        Reader: BlockReader + Clone,
    {
        // Tag index page accesses as delta-layer B-tree nodes.
        let ctx = RequestContextBuilder::from(ctx)
            .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
            .attached_child();

        for range in keyspace.ranges.iter() {
            let mut range_end_handled = false;

            let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start);
            let index_stream = index_reader.clone().into_stream(&start_key.0, &ctx);
            let mut index_stream = std::pin::pin!(index_stream);

            while let Some(index_entry) = index_stream.next().await {
                let (raw_key, value) = index_entry?;
                let key = Key::from_slice(&raw_key[..KEY_SIZE]);
                let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
                let blob_ref = BlobRef(value);

                // Lsns are not monotonically increasing across keys, so we don't assert on them.
assert!(key >= range.start);

                let outside_lsn_range = !lsn_range.contains(&lsn);

                // Classify the entry for the planner: outside the requested LSN
                // range, a `will_init` entry, or a plain entry.
                let flag = {
                    if outside_lsn_range {
                        BlobFlag::Ignore
                    } else if blob_ref.will_init() {
                        BlobFlag::ReplaceAll
                    } else {
                        // Usual path: add blob to the read
                        BlobFlag::None
                    }
                };

                // The range is finished once the key is at or past `range.end`, or
                // once the last key of the range has reached the end of the LSN range.
                if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) {
                    planner.handle_range_end(blob_ref.pos());
                    range_end_handled = true;
                    break;
                } else {
                    planner.handle(key, lsn, blob_ref.pos(), flag);
                }
            }

            // The index stream ended before an entry past the range was seen:
            // close the range at the end of the values part instead.
            if !range_end_handled {
                tracing::debug!("Handling range end fallback at {}", data_end_offset);
                planner.handle_range_end(data_end_offset);
            }
        }

        Ok(planner.finish())
    }

    /// Returns the size of the largest planned read, or `read_size_soft_max`
    /// when `planned_reads` is empty.
    ///
    /// If the largest read exceeds `read_size_soft_max`, a warning is logged
    /// that lists its keys other than rel-dir, `DBDIR_KEY` and aux-file keys;
    /// nothing is logged when only such keys are involved.
    fn get_min_read_buffer_size(
        planned_reads: &[VectoredRead],
        read_size_soft_max: usize,
    ) -> usize {
        let Some(largest_read) = planned_reads.iter().max_by_key(|read| read.size()) else {
            return read_size_soft_max;
        };

        let largest_read_size = largest_read.size();
        if largest_read_size > read_size_soft_max {
            // If the read is oversized, it should only contain one key.
            let offenders = largest_read
                .blobs_at
                .as_slice()
                .iter()
                .filter_map(|(_, blob_meta)| {
                    if blob_meta.key.is_rel_dir_key()
                        || blob_meta.key == DBDIR_KEY
                        || blob_meta.key.is_aux_file_key()
                    {
                        // The size of values for these keys is unbounded and can
                        // grow very large in pathological cases.
                        None
                    } else {
                        Some(format!("{}@{}", blob_meta.key, blob_meta.lsn))
                    }
                })
                .join(", ");

            if !offenders.is_empty() {
                tracing::warn!(
                    "Oversized vectored read ({} > {}) for keys {}",
                    largest_read_size,
                    read_size_soft_max,
                    offenders
                );
            }
        }

        largest_read_size
    }

    /// Registers every blob of every planned read with `reconstruct_state` and
    /// hands one I/O future per read to `reconstruct_state.spawn_io`.
    ///
    /// Panics if the layer was loaded without a vectored-read size limit.
    async fn do_reads_and_update_state(
        &self,
        this: ResidentLayer,
        reads: Vec,
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) {
        let max_vectored_read_bytes = self
            .max_vectored_read_bytes
            .expect("Layer is loaded with max vectored bytes config")
            .0
            .into();
        // One buffer size for all reads: the size of the largest planned read.
        let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);

        // Note that reads are processed in reverse order (from highest key+lsn).
// This is the order that `ReconstructState` requires such that it can // track when a key is done. for read in reads.into_iter().rev() { let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); for (_, blob_meta) in read.blobs_at.as_slice().iter().rev() { let io = reconstruct_state.update_key( &blob_meta.key, blob_meta.lsn, blob_meta.will_init, ); ios.insert((blob_meta.key, blob_meta.lsn), io); } let read_extend_residency = this.clone(); let read_from = self.file.clone(); let read_ctx = ctx.attached_child(); reconstruct_state .spawn_io(async move { let vectored_blob_reader = VectoredBlobReader::new(&read_from); let buf = IoBufferMut::with_capacity(buf_size); let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await; match res { Ok(blobs_buf) => { let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter().rev() { let io = ios.remove(&(meta.meta.key, meta.meta.lsn)).unwrap(); let blob_read = meta.read(&view).await; let blob_read = match blob_read { Ok(buf) => buf, Err(e) => { io.complete(Err(e)); continue; } }; io.complete(Ok(OnDiskValue::WalRecordOrImage( blob_read.into_bytes(), ))); } assert!(ios.is_empty()); } Err(err) => { for (_, sender) in ios { sender.complete(Err(std::io::Error::new( err.kind(), "vec read failed", ))); } } } // keep layer resident until this IO is done; this spawned IO future generally outlives the // call to `self` / the `Arc` / the `ResidentLayer` that guarantees residency drop(read_extend_residency); }) .await; } } pub(crate) async fn index_entries<'a>( &'a self, ctx: &RequestContext, ) -> Result>> { let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, block_reader, ); let mut all_keys: Vec> = Vec::new(); tree_reader .visit( &[0u8; DELTA_KEY_SIZE], VisitDirection::Forwards, |key, value| { let delta_key = DeltaKey::from_slice(key); let val_ref = ValueRef { blob_ref: 
BlobRef(value), layer: self, }; let pos = BlobRef(value).pos(); if let Some(last) = all_keys.last_mut() { // subtract offset of the current and last entries to get the size // of the value associated with this (key, lsn) tuple let first_pos = last.size; last.size = pos - first_pos; } let entry = DeltaEntry { key: delta_key.key(), lsn: delta_key.lsn(), size: pos, val: val_ref, }; all_keys.push(entry); true }, &RequestContextBuilder::from(ctx) .page_content_kind(PageContentKind::DeltaLayerBtreeNode) .attached_child(), ) .await?; if let Some(last) = all_keys.last_mut() { // Last key occupies all space till end of value storage, // which corresponds to beginning of the index last.size = self.index_start_offset() - last.size; } Ok(all_keys) } /// Using the given writer, write out a version which has the earlier Lsns than `until`. /// /// Return the amount of key value records pushed to the writer. pub(super) async fn copy_prefix( &self, writer: &mut DeltaLayerWriter, until: Lsn, ctx: &RequestContext, ) -> anyhow::Result { use futures::stream::TryStreamExt; use crate::tenant::vectored_blob_io::{ BlobMeta, ChunkedVectoredReadBuilder, VectoredReadExtended, }; #[derive(Debug)] enum Item { Actual(Key, Lsn, BlobRef), Sentinel, } impl From for Option<(Key, Lsn, BlobRef)> { fn from(value: Item) -> Self { match value { Item::Actual(key, lsn, blob) => Some((key, lsn, blob)), Item::Sentinel => None, } } } impl Item { fn offset(&self) -> Option { match self { Item::Actual(_, _, blob) => Some(*blob), Item::Sentinel => None, } } fn is_last(&self) -> bool { matches!(self, Item::Sentinel) } } let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, block_reader, ); let stream = self.stream_index_forwards(tree_reader, &[0u8; DELTA_KEY_SIZE], ctx); let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos)); // put in a sentinel value for getting the end offset 
for last item, and not having to // repeat the whole read part let stream = stream.chain(futures::stream::once(futures::future::ready(Ok( Item::Sentinel, )))); let mut stream = std::pin::pin!(stream); let mut prev: Option<(Key, Lsn, BlobRef)> = None; let mut read_builder: Option = None; let max_read_size = self .max_vectored_read_bytes .map(|x| x.0.get()) .unwrap_or(8192); let mut buffer = Some(IoBufferMut::with_capacity(max_read_size)); // FIXME: buffering of DeltaLayerWriter let mut per_blob_copy = Vec::new(); let mut records = 0; while let Some(item) = stream.try_next().await? { tracing::debug!(?item, "popped"); let offset = item .offset() .unwrap_or(BlobRef::new(self.index_start_offset(), false)); let actionable = if let Some((key, lsn, start_offset)) = prev.take() { let end_offset = offset; Some(( BlobMeta { key, lsn, will_init: false, }, start_offset..end_offset, )) } else { None }; let is_last = item.is_last(); prev = Option::from(item); let actionable = actionable.filter(|x| x.0.lsn < until); let builder = if let Some((meta, offsets)) = actionable { // extend or create a new builder if read_builder .as_mut() .map(|x| x.extend(offsets.start.pos(), offsets.end.pos(), meta)) .unwrap_or(VectoredReadExtended::No) == VectoredReadExtended::Yes { None } else { read_builder.replace(ChunkedVectoredReadBuilder::new( offsets.start.pos(), offsets.end.pos(), meta, max_read_size, )) } } else { // nothing to do, except perhaps flush any existing for the last element None }; // flush the possible older builder and also the new one if the item was the last one let builders = builder.into_iter(); let builders = if is_last { builders.chain(read_builder.take()) } else { builders.chain(None) }; for builder in builders { let read = builder.build(); let reader = VectoredBlobReader::new(&self.file); let mut buf = buffer.take().unwrap(); buf.clear(); buf.reserve(read.size()); let res = reader.read_blobs(&read, buf, ctx).await?; let view = BufView::new_slice(&res.buf); for blob in 
res.blobs { let key = blob.meta.key; let lsn = blob.meta.lsn; let data = blob.read(&view).await?; #[cfg(debug_assertions)] Value::des(&data) .with_context(|| { format!( "blob failed to deserialize for {}: {:?}", blob, utils::Hex(&data) ) }) .unwrap(); // is it an image or will_init walrecord? // FIXME: this could be handled by threading the BlobRef to the // VectoredReadBuilder let will_init = wal_decoder::models::value::ValueBytes::will_init(&data) .inspect_err(|_e| { #[cfg(feature = "testing")] tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value"); }) .unwrap_or(false); per_blob_copy.clear(); per_blob_copy.extend_from_slice(&data); let (tmp, res) = writer .put_value_bytes( key, lsn, std::mem::take(&mut per_blob_copy).slice_len(), will_init, ctx, ) .await; per_blob_copy = tmp.into_raw_slice().into_inner(); res?; records += 1; } buffer = Some(res.buf); } } assert!( read_builder.is_none(), "with the sentinel above loop should had handled all" ); Ok(records) } pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { println!( "index_start_blk: {}, root {}", self.index_start_blk, self.index_root_blk ); let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, block_reader, ); tree_reader.dump(ctx).await?; let keys = self.index_entries(ctx).await?; async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result { let buf = val.load_raw(ctx).await?; let val = Value::des(&buf)?; let desc = match val { Value::Image(img) => { format!(" img {} bytes", img.len()) } Value::WalRecord(rec) => { let wal_desc = wal_decoder::models::record::describe_wal_record(&rec)?; format!( " rec {} bytes will_init: {} {}", buf.len(), rec.will_init(), wal_desc ) } }; Ok(desc) } for entry in keys { let DeltaEntry { key, lsn, val, .. 
} = entry; let desc = match dump_blob(&val, ctx).await { Ok(desc) => desc, Err(err) => { format!("ERROR: {err}") } }; println!(" key {key} at {lsn}: {desc}"); // Print more details about CHECKPOINT records. Would be nice to print details // of many other record types too, but these are particularly interesting, as // have a lot of special processing for them in walingest.rs. use pageserver_api::key::CHECKPOINT_KEY; use postgres_ffi::CheckPoint; if key == CHECKPOINT_KEY { let val = val.load(ctx).await?; match val { Value::Image(img) => { let checkpoint = CheckPoint::decode(&img)?; println!(" CHECKPOINT: {checkpoint:?}"); } Value::WalRecord(_rec) => { println!(" unexpected walrecord value for checkpoint key"); } } } } Ok(()) } fn stream_index_forwards<'a, R>( &'a self, reader: DiskBtreeReader, start: &'a [u8; DELTA_KEY_SIZE], ctx: &'a RequestContext, ) -> impl futures::stream::Stream< Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>, > + 'a where R: BlockReader + 'a, { use futures::stream::TryStreamExt; let stream = reader.into_stream(start, ctx); stream.map_ok(|(key, value)| { let key = DeltaKey::from_slice(&key); let (key, lsn) = (key.key(), key.lsn()); let offset = BlobRef(value); (key, lsn, offset) }) } /// The file offset to the first block of index. /// /// The file structure is summary, values, and index. We often need this for the size of last blob. 
fn index_start_offset(&self) -> u64 { let offset = self.index_start_blk as u64 * PAGE_SZ as u64; let bref = BlobRef(offset); tracing::debug!( index_start_blk = self.index_start_blk, offset, pos = bref.pos(), "index_start_offset" ); offset } pub fn iter_with_options<'a>( &'a self, ctx: &'a RequestContext, max_read_size: u64, max_batch_size: usize, ) -> DeltaLayerIterator<'a> { let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); DeltaLayerIterator { delta_layer: self, ctx, index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx), key_values_batch: std::collections::VecDeque::new(), is_end: false, planner: StreamingVectoredReadPlanner::new(max_read_size, max_batch_size), } } /// NB: not super efficient, but not terrible either. Should prob be an iterator. // // We're reusing the index traversal logical in plan_reads; would be nice to // factor that out. pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result> { self.index_entries(ctx) .await .map(|entries| entries.into_iter().map(|entry| entry.key).collect()) } } /// A set of data associated with a delta layer key and its value pub struct DeltaEntry<'a> { pub key: Key, pub lsn: Lsn, /// Size of the stored value pub size: u64, /// Reference to the on-disk value pub val: ValueRef<'a>, } /// Reference to an on-disk value pub struct ValueRef<'a> { blob_ref: BlobRef, layer: &'a DeltaLayerInner, } impl ValueRef<'_> { /// Loads the value from disk pub async fn load(&self, ctx: &RequestContext) -> Result { let buf = self.load_raw(ctx).await?; let val = Value::des(&buf)?; Ok(val) } async fn load_raw(&self, ctx: &RequestContext) -> Result> { let reader = BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(Adapter( self.layer, ))); let buf = reader.read_blob(self.blob_ref.pos(), ctx).await?; Ok(buf) } } pub(crate) struct Adapter(T); impl> Adapter { pub(crate) async fn read_blk( &self, 
blknum: u32, ctx: &RequestContext, ) -> Result { let block_reader = FileBlockReader::new(&self.0.as_ref().file, self.0.as_ref().file_id); block_reader.read_blk(blknum, ctx).await } } impl AsRef for DeltaLayerInner { fn as_ref(&self) -> &DeltaLayerInner { self } } impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for DeltaEntry<'a> { fn key(&self) -> Key { self.key } fn lsn(&self) -> Lsn { self.lsn } fn size(&self) -> u64 { self.size } } pub struct DeltaLayerIterator<'a> { delta_layer: &'a DeltaLayerInner, ctx: &'a RequestContext, planner: StreamingVectoredReadPlanner, index_iter: DiskBtreeIterator<'a>, key_values_batch: VecDeque<(Key, Lsn, Value)>, is_end: bool, } impl DeltaLayerIterator<'_> { pub(crate) fn layer_dbg_info(&self) -> String { self.delta_layer.layer_dbg_info() } /// Retrieve a batch of key-value pairs into the iterator buffer. async fn next_batch(&mut self) -> anyhow::Result<()> { assert!(self.key_values_batch.is_empty()); assert!(!self.is_end); let plan = loop { if let Some(res) = self.index_iter.next().await { let (raw_key, value) = res?; let key = Key::from_slice(&raw_key[..KEY_SIZE]); let lsn = DeltaKey::extract_lsn_from_buf(&raw_key); let blob_ref = BlobRef(value); let offset = blob_ref.pos(); if let Some(batch_plan) = self.planner.handle(key, lsn, offset, blob_ref.will_init()) { break batch_plan; } } else { self.is_end = true; let data_end_offset = self.delta_layer.index_start_offset(); if let Some(item) = self.planner.handle_range_end(data_end_offset) { break item; } else { return Ok(()); // TODO: test empty iterator } } }; let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file); let mut next_batch = std::collections::VecDeque::new(); let buf_size = plan.size(); let buf = IoBufferMut::with_capacity(buf_size); let blobs_buf = vectored_blob_reader .read_blobs(&plan, buf, self.ctx) .await?; let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { let blob_read = 
meta.read(&view).await?; let value = Value::des(&blob_read)?; next_batch.push_back((meta.meta.key, meta.meta.lsn, value)); } self.key_values_batch = next_batch; Ok(()) } pub async fn next(&mut self) -> anyhow::Result> { if self.key_values_batch.is_empty() { if self.is_end { return Ok(None); } self.next_batch().await?; } Ok(Some( self.key_values_batch .pop_front() .expect("should not be empty"), )) } } #[cfg(test)] pub(crate) mod test { use std::collections::BTreeMap; use super::*; use crate::DEFAULT_PG_VERSION; use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; use crate::tenant::disk_btree::tests::TestDisk; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; use crate::tenant::{TenantShard, Timeline}; use bytes::Bytes; use itertools::MinMaxResult; use postgres_ffi::PgMajorVersion; use rand::prelude::{SeedableRng, StdRng}; use rand::seq::IndexedRandom; use rand::{Rng, RngCore}; /// Construct an index for a fictional delta layer and and then /// traverse in order to plan vectored reads for a query. Finally, /// verify that the traversal fed the right index key and value /// pairs into the planner. 
#[tokio::test] async fn test_delta_layer_index_traversal() { let base_key = Key { field1: 0, field2: 1663, field3: 12972, field4: 16396, field5: 0, field6: 246080, }; // Populate the index with some entries let entries: BTreeMap> = BTreeMap::from([ (base_key, vec![Lsn(1), Lsn(5), Lsn(25), Lsn(26), Lsn(28)]), (base_key.add(1), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]), (base_key.add(2), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]), (base_key.add(5), vec![Lsn(10), Lsn(15), Lsn(16), Lsn(20)]), ]); let mut disk = TestDisk::default(); let mut writer = DiskBtreeBuilder::<_, DELTA_KEY_SIZE>::new(&mut disk); let mut disk_offset = 0; for (key, lsns) in &entries { for lsn in lsns { let index_key = DeltaKey::from_key_lsn(key, *lsn); let blob_ref = BlobRef::new(disk_offset, false); writer .append(&index_key.0, blob_ref.0) .expect("In memory disk append should never fail"); disk_offset += 1; } } // Prepare all the arguments for the call into `plan_reads` below let (root_offset, _writer) = writer .finish() .expect("In memory disk finish should never fail"); let reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(0, root_offset, disk); let planner = VectoredReadPlanner::new(100); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let keyspace = KeySpace { ranges: vec![ base_key..base_key.add(3), base_key.add(3)..base_key.add(100), ], }; let lsn_range = Lsn(2)..Lsn(40); // Plan and validate let vectored_reads = DeltaLayerInner::plan_reads( &keyspace, lsn_range.clone(), disk_offset, reader, planner, &ctx, ) .await .expect("Read planning should not fail"); validate(keyspace, lsn_range, vectored_reads, entries); } fn validate( keyspace: KeySpace, lsn_range: Range, vectored_reads: Vec, index_entries: BTreeMap>, ) { #[derive(Debug, PartialEq, Eq)] struct BlobSpec { key: Key, lsn: Lsn, at: u64, } let mut planned_blobs = Vec::new(); for read in vectored_reads { for (at, meta) in read.blobs_at.as_slice() { planned_blobs.push(BlobSpec { key: meta.key, lsn: meta.lsn, at: 
*at, }); } } let mut expected_blobs = Vec::new(); let mut disk_offset = 0; for (key, lsns) in index_entries { for lsn in lsns { let key_included = keyspace.ranges.iter().any(|range| range.contains(&key)); let lsn_included = lsn_range.contains(&lsn); if key_included && lsn_included { expected_blobs.push(BlobSpec { key, lsn, at: disk_offset, }); } disk_offset += 1; } } assert_eq!(planned_blobs, expected_blobs); } mod constants { use utils::lsn::Lsn; /// Offset used by all lsns in this test pub(super) const LSN_OFFSET: Lsn = Lsn(0x08); /// Number of unique keys including in the test data pub(super) const KEY_COUNT: u8 = 60; /// Max number of different lsns for each key pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20; /// Possible value sizes for each key along with a probability weight pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)]; /// Probability that there will be a gap between the current key and the next one (33.3%) pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)]; /// The minimum size of a key range in all the generated reads pub(super) const MIN_RANGE_SIZE: i128 = 10; /// The number of ranges included in each vectored read pub(super) const RANGES_COUNT: u8 = 2; /// The number of vectored reads performed pub(super) const READS_COUNT: u8 = 100; /// Soft max size of a vectored read. 
Will be violated if we have to read keys /// with values larger than the limit pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024; } struct Entry { key: Key, lsn: Lsn, value: Vec, } fn generate_entries(rng: &mut StdRng) -> Vec { let mut current_key = Key::MIN; let mut entries = Vec::new(); for _ in 0..constants::KEY_COUNT { let count = rng.random_range(1..constants::MAX_ENTRIES_PER_KEY); let mut lsns_iter = std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| { Some(Lsn(lsn.0 + 0x08)) }); let mut lsns = Vec::new(); while lsns.len() < count as usize { let take = rng.random_bool(0.5); let lsn = lsns_iter.next().unwrap(); if take { lsns.push(lsn); } } for lsn in lsns { let size = constants::VALUE_SIZES .choose_weighted(rng, |item| item.1) .unwrap() .0; let mut buf = vec![0; size]; rng.fill_bytes(&mut buf); entries.push(Entry { key: current_key, lsn, value: buf, }) } let gap = constants::KEY_GAP_CHANGES .choose_weighted(rng, |item| item.1) .unwrap() .0; if gap { current_key = current_key.add(2); } else { current_key = current_key.add(1); } } entries } struct EntriesMeta { key_range: Range, lsn_range: Range, index: BTreeMap<(Key, Lsn), Vec>, } fn get_entries_meta(entries: &[Entry]) -> EntriesMeta { let key_range = match entries.iter().minmax_by_key(|e| e.key) { MinMaxResult::MinMax(min, max) => min.key..max.key.next(), _ => panic!("More than one entry is always expected"), }; let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) { MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1), _ => panic!("More than one entry is always expected"), }; let mut index = BTreeMap::new(); for entry in entries.iter() { index.insert((entry.key, entry.lsn), entry.value.clone()); } EntriesMeta { key_range, lsn_range, index, } } fn pick_random_keyspace(rng: &mut StdRng, key_range: &Range) -> KeySpace { let start = key_range.start.to_i128(); let end = key_range.end.to_i128(); let mut keyspace = KeySpace::default(); for _ in 
0..constants::RANGES_COUNT { let mut range: Option> = Option::default(); while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) { let range_start = rng.random_range(start..end); let range_end_offset = range_start + constants::MIN_RANGE_SIZE; if range_end_offset >= end { range = Some(Key::from_i128(range_start)..Key::from_i128(end)); } else { let range_end = rng.random_range((range_start + constants::MIN_RANGE_SIZE)..end); range = Some(Key::from_i128(range_start)..Key::from_i128(range_end)); } } keyspace.ranges.push(range.unwrap()); } keyspace } #[tokio::test] async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> { let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read").await?; let (tenant, ctx) = harness.load().await; let timeline_id = TimelineId::generate(); let timeline = tenant .create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx) .await?; tracing::info!("Generating test data ..."); let rng = &mut StdRng::seed_from_u64(0); let entries = generate_entries(rng); let entries_meta = get_entries_meta(&entries); tracing::info!("Done generating {} entries", entries.len()); tracing::info!("Writing test data to delta layer ..."); let mut writer = DeltaLayerWriter::new( harness.conf, timeline_id, harness.tenant_shard_id, entries_meta.key_range.start, entries_meta.lsn_range.clone(), &timeline.gate, timeline.cancel.clone(), &ctx, ) .await?; for entry in entries { let (_, res) = writer .put_value_bytes(entry.key, entry.lsn, entry.value.slice_len(), false, &ctx) .await; res?; } let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?; let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?; let inner = resident.get_as_delta(&ctx).await?; let file_size = inner.file.metadata().await?.len(); tracing::info!( "Done writing test data to delta layer. 
Resulting file size is: {}", file_size ); for i in 0..constants::READS_COUNT { tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT); let block_reader = FileBlockReader::new(&inner.file, inner.file_id); let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( inner.index_start_blk, inner.index_root_blk, block_reader, ); let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES); let keyspace = pick_random_keyspace(rng, &entries_meta.key_range); let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64; let vectored_reads = DeltaLayerInner::plan_reads( &keyspace, entries_meta.lsn_range.clone(), data_end_offset, index_reader, planner, &ctx, ) .await?; let vectored_blob_reader = VectoredBlobReader::new(&inner.file); let buf_size = DeltaLayerInner::get_min_read_buffer_size( &vectored_reads, constants::MAX_VECTORED_READ_BYTES, ); let mut buf = Some(IoBufferMut::with_capacity(buf_size)); for read in vectored_reads { let blobs_buf = vectored_blob_reader .read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx) .await?; let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { let value = meta.read(&view).await?; assert_eq!( &value[..], &entries_meta.index[&(meta.meta.key, meta.meta.lsn)] ); } buf = Some(blobs_buf.buf); } } Ok(()) } #[tokio::test] async fn copy_delta_prefix_smoke() { use bytes::Bytes; use wal_decoder::models::record::NeonWalRecord; let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke") .await .unwrap(); let (tenant, ctx) = h.load().await; let ctx = &ctx; let timeline = tenant .create_test_timeline(TimelineId::generate(), Lsn(0x10), PgMajorVersion::PG14, ctx) .await .unwrap(); let ctx = &ctx.with_scope_timeline(&timeline); let initdb_layer = timeline .layers .read(crate::tenant::timeline::layer_manager::LayerManagerLockHolder::Testing) .await .likely_resident_layers() .next() .cloned() .unwrap(); { let mut writer = timeline.writer().await; let data 
= [ (0x20, 12, Value::Image(Bytes::from_static(b"foobar"))), ( 0x30, 12, Value::WalRecord(NeonWalRecord::Postgres { will_init: false, rec: Bytes::from_static(b"1"), }), ), ( 0x40, 12, Value::WalRecord(NeonWalRecord::Postgres { will_init: true, rec: Bytes::from_static(b"2"), }), ), // build an oversized value so we cannot extend and existing read over // this ( 0x50, 12, Value::WalRecord(NeonWalRecord::Postgres { will_init: true, rec: { let mut buf = vec![0u8; tenant.conf.max_vectored_read_bytes.0.get() + 1024]; buf.iter_mut() .enumerate() .for_each(|(i, slot)| *slot = (i % 256) as u8); Bytes::from(buf) }, }), ), // because the oversized read cannot be extended further, we are sure to exercise the // builder created on the last round with this: ( 0x60, 12, Value::WalRecord(NeonWalRecord::Postgres { will_init: true, rec: Bytes::from_static(b"3"), }), ), ( 0x60, 9, Value::Image(Bytes::from_static(b"something for a different key")), ), ]; let mut last_lsn = None; for (lsn, key, value) in data { let key = Key::from_i128(key); writer.put(key, Lsn(lsn), &value, ctx).await.unwrap(); last_lsn = Some(lsn); } writer.finish_write(Lsn(last_lsn.unwrap())); } timeline.freeze_and_flush().await.unwrap(); let new_layer = timeline .layers .read(LayerManagerLockHolder::Testing) .await .likely_resident_layers() .find(|&x| x != &initdb_layer) .cloned() .unwrap(); // create a copy for the timeline, so we don't overwrite the file let branch = tenant .branch_timeline_test(&timeline, TimelineId::generate(), None, ctx) .await .unwrap(); assert_eq!(branch.get_ancestor_lsn(), Lsn(0x60)); // truncating at 0x61 gives us a full copy, otherwise just go backwards until there's just // a single key for truncate_at in [0x61, 0x51, 0x41, 0x31, 0x21] { let truncate_at = Lsn(truncate_at); let mut writer = DeltaLayerWriter::new( tenant.conf, branch.timeline_id, tenant.tenant_shard_id, Key::MIN, Lsn(0x11)..truncate_at, &branch.gate, branch.cancel.clone(), ctx, ) .await .unwrap(); let new_layer = 
new_layer.download_and_keep_resident(ctx).await.unwrap(); new_layer .copy_delta_prefix(&mut writer, truncate_at, ctx) .await .unwrap(); let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap(); let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap(); copied_layer.get_as_delta(ctx).await.unwrap(); assert_keys_and_values_eq( new_layer.get_as_delta(ctx).await.unwrap(), copied_layer.get_as_delta(ctx).await.unwrap(), truncate_at, ctx, ) .await; } } async fn assert_keys_and_values_eq( source: &DeltaLayerInner, truncated: &DeltaLayerInner, truncated_at: Lsn, ctx: &RequestContext, ) { use futures::future::ready; use futures::stream::TryStreamExt; let start_key = [0u8; DELTA_KEY_SIZE]; let source_reader = FileBlockReader::new(&source.file, source.file_id); let source_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( source.index_start_blk, source.index_root_blk, &source_reader, ); let source_stream = source.stream_index_forwards(source_tree, &start_key, ctx); let source_stream = source_stream.filter(|res| match res { Ok((_, lsn, _)) => ready(lsn < &truncated_at), _ => ready(true), }); let mut source_stream = std::pin::pin!(source_stream); let truncated_reader = FileBlockReader::new(&truncated.file, truncated.file_id); let truncated_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( truncated.index_start_blk, truncated.index_root_blk, &truncated_reader, ); let truncated_stream = truncated.stream_index_forwards(truncated_tree, &start_key, ctx); let mut truncated_stream = std::pin::pin!(truncated_stream); let mut scratch_left = Vec::new(); let mut scratch_right = Vec::new(); loop { let (src, truncated) = (source_stream.try_next(), truncated_stream.try_next()); let (src, truncated) = tokio::try_join!(src, truncated).unwrap(); if src.is_none() { assert!(truncated.is_none()); break; } let (src, truncated) = (src.unwrap(), truncated.unwrap()); // because we've filtered the source with Lsn, we should always have the same keys from both. 
assert_eq!(src.0, truncated.0); assert_eq!(src.1, truncated.1); // if this is needed for something else, just drop this assert. assert!( src.2.pos() >= truncated.2.pos(), "value position should not go backwards {} vs. {}", src.2.pos(), truncated.2.pos() ); scratch_left.clear(); let src_cursor = source_reader.block_cursor(); let left = src_cursor.read_blob_into_buf(src.2.pos(), &mut scratch_left, ctx); scratch_right.clear(); let trunc_cursor = truncated_reader.block_cursor(); let right = trunc_cursor.read_blob_into_buf(truncated.2.pos(), &mut scratch_right, ctx); tokio::try_join!(left, right).unwrap(); assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right)); } } pub(crate) fn sort_delta( (k1, l1, _): &(Key, Lsn, Value), (k2, l2, _): &(Key, Lsn, Value), ) -> std::cmp::Ordering { (k1, l1).cmp(&(k2, l2)) } #[cfg(feature = "testing")] pub(crate) fn sort_delta_value( (k1, l1, v1): &(Key, Lsn, Value), (k2, l2, v2): &(Key, Lsn, Value), ) -> std::cmp::Ordering { let order_1 = if v1.is_image() { 0 } else { 1 }; let order_2 = if v2.is_image() { 0 } else { 1 }; (k1, l1, order_1).cmp(&(k2, l2, order_2)) } pub(crate) async fn produce_delta_layer( tenant: &TenantShard, tline: &Arc, mut deltas: Vec<(Key, Lsn, Value)>, ctx: &RequestContext, ) -> anyhow::Result { deltas.sort_by(sort_delta); let (key_start, _, _) = deltas.first().unwrap(); let (key_max, _, _) = deltas.last().unwrap(); let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); let lsn_end = Lsn(lsn_max.0 + 1); let mut writer = DeltaLayerWriter::new( tenant.conf, tline.timeline_id, tenant.tenant_shard_id, *key_start, (*lsn_min)..lsn_end, &tline.gate, tline.cancel.clone(), ctx, ) .await?; let key_end = key_max.next(); for (key, lsn, value) in deltas { writer.put_value(key, lsn, value, ctx).await?; } let (desc, path) = writer.finish(key_end, ctx).await?; let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?; Ok::<_, 
anyhow::Error>(delta_layer) } async fn assert_delta_iter_equal( delta_iter: &mut DeltaLayerIterator<'_>, expect: &[(Key, Lsn, Value)], ) { let mut expect_iter = expect.iter(); loop { let o1 = delta_iter.next().await.unwrap(); let o2 = expect_iter.next(); assert_eq!(o1.is_some(), o2.is_some()); if o1.is_none() && o2.is_none() { break; } let (k1, l1, v1) = o1.unwrap(); let (k2, l2, v2) = o2.unwrap(); assert_eq!(&k1, k2); assert_eq!(l1, *l2); assert_eq!(&v1, v2); } } #[tokio::test] async fn delta_layer_iterator() { let harness = TenantHarness::create("delta_layer_iterator").await.unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await .unwrap(); fn get_key(id: u32) -> Key { let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); key.field6 = id; key } const N: usize = 1000; let test_deltas = (0..N) .map(|idx| { ( get_key(idx as u32 / 10), Lsn(0x10 * ((idx as u64) % 10 + 1)), Value::Image(Bytes::from(format!("img{idx:05}"))), ) }) .collect_vec(); let resident_layer = produce_delta_layer(&tenant, &tline, test_deltas.clone(), &ctx) .await .unwrap(); let delta_layer = resident_layer.get_as_delta(&ctx).await.unwrap(); for max_read_size in [1, 1024] { for batch_size in [1, 2, 4, 8, 3, 7, 13] { println!("running with batch_size={batch_size} max_read_size={max_read_size}"); // Test if the batch size is correctly determined let mut iter = delta_layer.iter_with_options(&ctx, max_read_size, batch_size); let mut num_items = 0; for _ in 0..3 { iter.next_batch().await.unwrap(); num_items += iter.key_values_batch.len(); if max_read_size == 1 { // every key should be a batch b/c the value is larger than max_read_size assert_eq!(iter.key_values_batch.len(), 1); } else { assert!(iter.key_values_batch.len() <= batch_size); } if num_items >= N { break; } iter.key_values_batch.clear(); } // Test if the result is correct let mut iter = delta_layer.iter_with_options(&ctx, 
max_read_size, batch_size); assert_delta_iter_equal(&mut iter, &test_deltas).await; } } } } ================================================ FILE: pageserver/src/tenant/storage_layer/errors.rs ================================================ use crate::tenant::blob_io::WriteBlobError; #[derive(Debug, thiserror::Error)] pub enum PutError { #[error(transparent)] WriteBlob(WriteBlobError), #[error(transparent)] Other(anyhow::Error), } impl PutError { pub fn is_cancel(&self) -> bool { match self { PutError::WriteBlob(e) => e.is_cancel(), PutError::Other(_) => false, } } pub fn into_anyhow(self) -> anyhow::Error { match self { PutError::WriteBlob(e) => e.into_anyhow(), PutError::Other(e) => e, } } } ================================================ FILE: pageserver/src/tenant/storage_layer/filter_iterator.rs ================================================ use std::ops::Range; use std::sync::Arc; use anyhow::bail; use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, SparseKeySpace}; use utils::lsn::Lsn; use wal_decoder::models::value::Value; use super::PersistentLayerKey; use super::merge_iterator::{MergeIterator, MergeIteratorItem}; /// A filter iterator over merge iterators (and can be easily extended to other types of iterators). /// /// The iterator will skip any keys not included in the keyspace filter. In other words, the keyspace filter contains the keys /// to be retained. 
pub struct FilterIterator<'a> { inner: MergeIterator<'a>, retain_key_filters: Vec>, current_filter_idx: usize, } impl<'a> FilterIterator<'a> { pub fn create( inner: MergeIterator<'a>, dense_keyspace: KeySpace, sparse_keyspace: SparseKeySpace, ) -> anyhow::Result { let mut retain_key_filters = Vec::new(); retain_key_filters.extend(dense_keyspace.ranges); retain_key_filters.extend(sparse_keyspace.0.ranges); retain_key_filters.sort_by(|a, b| a.start.cmp(&b.start)); // Verify key filters are non-overlapping and sorted for window in retain_key_filters.windows(2) { if window[0].end > window[1].start { bail!( "Key filters are overlapping: {:?} and {:?}", window[0], window[1] ); } } Ok(Self { inner, retain_key_filters, current_filter_idx: 0, }) } async fn next_inner(&mut self) -> anyhow::Result> { while let Some(item) = self.inner.next_inner::().await? { while self.current_filter_idx < self.retain_key_filters.len() && item.key_lsn_value().0 >= self.retain_key_filters[self.current_filter_idx].end { // [filter region] [filter region] [filter region] // ^ item // ^ current filter self.current_filter_idx += 1; // [filter region] [filter region] [filter region] // ^ item // ^ current filter } if self.current_filter_idx >= self.retain_key_filters.len() { // We already exhausted all filters, so we should return now // [filter region] [filter region] [filter region] // ^ item // ^ current filter (nothing) return Ok(None); } if self.retain_key_filters[self.current_filter_idx].contains(&item.key_lsn_value().0) { // [filter region] [filter region] [filter region] // ^ item // ^ current filter return Ok(Some(item)); } // If the key is not contained in the key retaining filters, continue to the next item. 
// [filter region] [filter region] [filter region] // ^ item // ^ current filter } Ok(None) } pub async fn next(&mut self) -> anyhow::Result> { self.next_inner().await } pub async fn next_with_trace( &mut self, ) -> anyhow::Result)>> { self.next_inner().await } } #[cfg(test)] mod tests { use itertools::Itertools; use pageserver_api::key::Key; use utils::lsn::Lsn; use super::*; use crate::DEFAULT_PG_VERSION; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::delta_layer::test::produce_delta_layer; async fn assert_filter_iter_equal( filter_iter: &mut FilterIterator<'_>, expect: &[(Key, Lsn, Value)], ) { let mut expect_iter = expect.iter(); loop { let o1 = filter_iter.next().await.unwrap(); let o2 = expect_iter.next(); assert_eq!(o1.is_some(), o2.is_some()); if o1.is_none() && o2.is_none() { break; } let (k1, l1, v1) = o1.unwrap(); let (k2, l2, v2) = o2.unwrap(); assert_eq!(&k1, k2); assert_eq!(l1, *l2); assert_eq!(&v1, v2); } } #[tokio::test] async fn filter_keyspace_iterator() { use bytes::Bytes; let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator") .await .unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await .unwrap(); fn get_key(id: u32) -> Key { let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); key.field6 = id; key } const N: usize = 100; let test_deltas1 = (0..N) .map(|idx| { ( get_key(idx as u32), Lsn(0x20 * ((idx as u64) % 10 + 1)), Value::Image(Bytes::from(format!("img{idx:05}"))), ) }) .collect_vec(); let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) .await .unwrap(); let merge_iter = MergeIterator::create_for_testing( &[resident_layer_1.get_as_delta(&ctx).await.unwrap()], &[], &ctx, ); let mut filter_iter = FilterIterator::create( merge_iter, KeySpace { ranges: vec![ get_key(5)..get_key(10), get_key(20)..get_key(30), 
//! An image layer is stored in a file on disk. The file is stored in the
//! timelines/<timeline_id> directory. Currently, there are no
//! subdirectories, and each image layer file is named like this:
//!
//! ```text
//!    <key start>-<key end>__<LSN>
//! ```
The "index" is a B-tree, //! mapping from Key to an offset in the "values" part. The //! actual page images are stored in the "values" part. use std::collections::{HashMap, VecDeque}; use std::fs::File; use std::ops::Range; use std::os::unix::prelude::FileExt; use std::str::FromStr; use std::sync::Arc; use std::sync::atomic::AtomicU64; use anyhow::{Context, Result, bail, ensure}; use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use hex; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key}; use pageserver_api::keyspace::KeySpace; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use serde::{Deserialize, Serialize}; use tokio::sync::OnceCell; use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; use tracing::*; use utils::bin_ser::BeSer; use utils::bin_ser::SerializeError; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use wal_decoder::models::value::Value; use super::errors::PutError; use super::layer_name::ImageLayerName; use super::{ AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, }; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, FileBlockReader}; use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; use crate::virtual_file::TempVirtualFile; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::owned_buffers_io::write::{Buffer, BufferedWriterShutdownMode}; use crate::virtual_file::{self, IoBuffer, IoBufferMut, 
MaybeFatalIo, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; /// /// Header stored in the beginning of the file /// /// After this comes the 'values' part, starting on block 1. After that, /// the 'index' starts at the block indicated by 'index_start_blk' /// #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] pub struct Summary { /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC. pub magic: u16, pub format_version: u16, pub tenant_id: TenantId, pub timeline_id: TimelineId, pub key_range: Range, pub lsn: Lsn, /// Block number where the 'index' part of the file begins. pub index_start_blk: u32, /// Block within the 'index', where the B-tree root page is stored pub index_root_blk: u32, // the 'values' part starts after the summary header, on block 1. } impl From<&ImageLayer> for Summary { fn from(layer: &ImageLayer) -> Self { Self::expected( layer.desc.tenant_shard_id.tenant_id, layer.desc.timeline_id, layer.desc.key_range.clone(), layer.lsn, ) } } impl Summary { /// Serializes the summary header into an aligned buffer of lenth `PAGE_SZ`. pub fn ser_into_page(&self) -> Result { let mut buf = IoBufferMut::with_capacity(PAGE_SZ); Self::ser_into(self, &mut buf)?; // Pad zeroes to the buffer so the length is a multiple of the alignment. buf.extend_with(0, buf.capacity() - buf.len()); Ok(buf.freeze()) } pub(super) fn expected( tenant_id: TenantId, timeline_id: TimelineId, key_range: Range, lsn: Lsn, ) -> Self { Self { magic: IMAGE_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, tenant_id, timeline_id, key_range, lsn, index_start_blk: 0, index_root_blk: 0, } } } /// This is used only from `pagectl`. Within pageserver, all layers are /// [`crate::tenant::storage_layer::Layer`], which can hold an [`ImageLayerInner`]. 
pub struct ImageLayer { path: Utf8PathBuf, pub desc: PersistentLayerDesc, // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn pub lsn: Lsn, inner: OnceCell, } impl std::fmt::Debug for ImageLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use super::RangeDisplayDebug; f.debug_struct("ImageLayer") .field("key_range", &RangeDisplayDebug(&self.desc.key_range)) .field("file_size", &self.desc.file_size) .field("lsn", &self.lsn) .field("inner", &self.inner) .finish() } } /// ImageLayer is the in-memory data structure associated with an on-disk image /// file. pub struct ImageLayerInner { // values copied from summary index_start_blk: u32, index_root_blk: u32, key_range: Range, lsn: Lsn, file: Arc, file_id: FileId, max_vectored_read_bytes: Option, } impl ImageLayerInner { pub(crate) fn layer_dbg_info(&self) -> String { format!( "image {}..{} {}", self.key_range().start, self.key_range().end, self.lsn() ) } } impl std::fmt::Debug for ImageLayerInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("ImageLayerInner") .field("index_start_blk", &self.index_start_blk) .field("index_root_blk", &self.index_root_blk) .finish() } } impl ImageLayerInner { pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, block_reader, ); tree_reader.dump(ctx).await?; tree_reader .visit( &[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| { println!("key: {} offset {}", hex::encode(key), value); true }, ctx, ) .await?; Ok(()) } } /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. 
impl std::fmt::Display for ImageLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.layer_desc().short_id()) } } impl AsLayerDesc for ImageLayer { fn layer_desc(&self) -> &PersistentLayerDesc { &self.desc } } impl ImageLayer { pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { self.desc.dump(); if !verbose { return Ok(()); } let inner = self.load(ctx).await?; inner.dump(ctx).await?; Ok(()) } fn temp_path_for( conf: &PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, fname: &ImageLayerName, ) -> Utf8PathBuf { // TempVirtualFile requires us to never reuse a filename while an old // instance of TempVirtualFile created with that filename is not done dropping yet. // So, we use a monotonic counter to disambiguate the filenames. static NEXT_TEMP_DISAMBIGUATOR: AtomicU64 = AtomicU64::new(1); let filename_disambiguator = NEXT_TEMP_DISAMBIGUATOR.fetch_add(1, std::sync::atomic::Ordering::Relaxed); conf.timeline_path(&tenant_shard_id, &timeline_id) .join(format!( "{fname}.{filename_disambiguator:x}.{TEMP_FILE_SUFFIX}" )) } /// /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. 
/// async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> { self.inner .get_or_try_init(|| self.load_inner(ctx)) .await .with_context(|| format!("Failed to load image layer {}", self.path())) } async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx).await?; // not production code let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); let expected_layer_name = self.layer_desc().layer_name(); if actual_layer_name != expected_layer_name { println!("warning: filename does not match what is expected from in-file summary"); println!("actual: {:?}", actual_layer_name.to_string()); println!("expected: {:?}", expected_layer_name.to_string()); } Ok(loaded) } /// Create an ImageLayer struct representing an existing file on disk. /// /// This variant is only used for debugging purposes, by the 'pagectl' binary. pub fn new_for_path(path: &Utf8Path, file: File) -> Result { let mut summary_buf = vec![0; PAGE_SZ]; file.read_exact_at(&mut summary_buf, 0)?; let summary = Summary::des_prefix(&summary_buf)?; let metadata = file .metadata() .context("get file metadata to determine size")?; // This function is never used for constructing layers in a running pageserver, // so it does not need an accurate TenantShardId. let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id); Ok(ImageLayer { path: path.to_path_buf(), desc: PersistentLayerDesc::new_img( tenant_shard_id, summary.timeline_id, summary.key_range, summary.lsn, metadata.len(), ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. 
lsn: summary.lsn, inner: OnceCell::new(), }) } fn path(&self) -> Utf8PathBuf { self.path.clone() } } #[derive(thiserror::Error, Debug)] pub enum RewriteSummaryError { #[error("magic mismatch")] MagicMismatch, #[error(transparent)] Other(#[from] anyhow::Error), } impl From for RewriteSummaryError { fn from(e: std::io::Error) -> Self { Self::Other(anyhow::anyhow!(e)) } } impl ImageLayer { pub async fn rewrite_summary( path: &Utf8Path, rewrite: F, ctx: &RequestContext, ) -> Result<(), RewriteSummaryError> where F: Fn(Summary) -> Summary, { let file = VirtualFile::open_with_options_v2( path, virtual_file::OpenOptions::new().read(true).write(true), ctx, ) .await .with_context(|| format!("Failed to open file '{path}'"))?; let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?; if actual_summary.magic != IMAGE_FILE_MAGIC { return Err(RewriteSummaryError::MagicMismatch); } let new_summary = rewrite(actual_summary); let buf = new_summary.ser_into_page().context("serialize")?; let (_buf, res) = file.write_all_at(buf.slice_len(), 0, ctx).await; res?; Ok(()) } } impl ImageLayerInner { pub(crate) fn key_range(&self) -> &Range { &self.key_range } pub(crate) fn lsn(&self) -> Lsn { self.lsn } pub(super) async fn load( path: &Utf8Path, lsn: Lsn, summary: Option, max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> anyhow::Result { let file = Arc::new( VirtualFile::open_v2(path, ctx) .await .context("open layer file")?, ); let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); let summary_blk = block_reader .read_blk(0, ctx) .await .context("read first block")?; // length is the only way how this could fail, so it's not actually likely at all unless // read_blk returns wrong sized block. 
// // TODO: confirm and make this into assertion let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?; if let Some(mut expected_summary) = summary { // production code path expected_summary.index_start_blk = actual_summary.index_start_blk; expected_summary.index_root_blk = actual_summary.index_root_blk; // mask out the timeline_id, but still require the layers to be from the same tenant expected_summary.timeline_id = actual_summary.timeline_id; if actual_summary != expected_summary { bail!( "in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary ); } } Ok(ImageLayerInner { index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, lsn, file, file_id, max_vectored_read_bytes, key_range: actual_summary.key_range, }) } // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. pub(super) async fn get_values_reconstruct_data( &self, this: ResidentLayer, keyspace: KeySpace, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { let reads = self .plan_reads(keyspace, None, ctx) .await .map_err(GetVectoredError::Other)?; self.do_reads_and_update_state(this, reads, reconstruct_state, ctx) .await; reconstruct_state.on_image_layer_visited(&self.key_range); Ok(()) } /// Traverse the layer's index to build read operations on the overlap of the input keyspace /// and the keys in this layer. /// /// If shard_identity is provided, it will be used to filter keys down to those stored on /// this shard. 
async fn plan_reads( &self, keyspace: KeySpace, shard_identity: Option<&ShardIdentity>, ctx: &RequestContext, ) -> anyhow::Result> { let mut planner = VectoredReadPlanner::new( self.max_vectored_read_bytes .expect("Layer is loaded with max vectored bytes config") .0 .into(), ); let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); let ctx = RequestContextBuilder::from(ctx) .page_content_kind(PageContentKind::ImageLayerBtreeNode) .attached_child(); for range in keyspace.ranges.iter() { let mut range_end_handled = false; let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; range.start.write_to_byte_slice(&mut search_key); let index_stream = tree_reader.clone().into_stream(&search_key, &ctx); let mut index_stream = std::pin::pin!(index_stream); while let Some(index_entry) = index_stream.next().await { let (raw_key, offset) = index_entry?; let key = Key::from_slice(&raw_key[..KEY_SIZE]); assert!(key >= range.start); let flag = if let Some(shard_identity) = shard_identity { if shard_identity.is_key_disposable(&key) { BlobFlag::Ignore } else { BlobFlag::None } } else { BlobFlag::None }; if key >= range.end { planner.handle_range_end(offset); range_end_handled = true; break; } else { planner.handle(key, self.lsn, offset, flag); } } if !range_end_handled { let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64; planner.handle_range_end(payload_end); } } Ok(planner.finish()) } /// Given a key range, select the parts of that range that should be retained by the ShardIdentity, /// then execute vectored GET operations, passing the results of all read keys into the writer. 
pub(super) async fn filter( &self, shard_identity: &ShardIdentity, writer: &mut ImageLayerWriter, ctx: &RequestContext, ) -> anyhow::Result { // Fragment the range into the regions owned by this ShardIdentity let plan = self .plan_reads( KeySpace { // If asked for the total key space, plan_reads will give us all the keys in the layer ranges: vec![Key::MIN..Key::MAX], }, Some(shard_identity), ctx, ) .await?; let vectored_blob_reader = VectoredBlobReader::new(&self.file); let mut key_count = 0; for read in plan.into_iter() { let buf_size = read.size(); let buf = IoBufferMut::with_capacity(buf_size); let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?; let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { // Just read the raw header+data and pass it through to the target layer, without // decoding and recompressing it. let raw = meta.raw_with_header(&view); key_count += 1; writer .put_image_raw(meta.meta.key, raw.into_bytes(), ctx) .await .context(format!("Storing key {}", meta.meta.key))?; } } Ok(key_count) } async fn do_reads_and_update_state( &self, this: ResidentLayer, reads: Vec, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) { let max_vectored_read_bytes = self .max_vectored_read_bytes .expect("Layer is loaded with max vectored bytes config") .0 .into(); for read in reads.into_iter() { let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); for (_, blob_meta) in read.blobs_at.as_slice() { let io = reconstruct_state.update_key(&blob_meta.key, blob_meta.lsn, true); ios.insert((blob_meta.key, blob_meta.lsn), io); } let buf_size = read.size(); if buf_size > max_vectored_read_bytes { // If the read is oversized, it should only contain one key. 
let offenders = read .blobs_at .as_slice() .iter() .filter_map(|(_, blob_meta)| { if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY || blob_meta.key.is_aux_file_key() { // The size of values for these keys is unbounded and can // grow very large in pathological cases. None } else { Some(format!("{}@{}", blob_meta.key, blob_meta.lsn)) } }) .join(", "); if !offenders.is_empty() { tracing::warn!( "Oversized vectored read ({} > {}) for keys {}", buf_size, max_vectored_read_bytes, offenders ); } } let read_extend_residency = this.clone(); let read_from = self.file.clone(); let read_ctx = ctx.attached_child(); reconstruct_state .spawn_io(async move { let buf = IoBufferMut::with_capacity(buf_size); let vectored_blob_reader = VectoredBlobReader::new(&read_from); let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await; match res { Ok(blobs_buf) => { let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { let io: OnDiskValueIo = ios.remove(&(meta.meta.key, meta.meta.lsn)).unwrap(); let img_buf = meta.read(&view).await; let img_buf = match img_buf { Ok(img_buf) => img_buf, Err(e) => { io.complete(Err(e)); continue; } }; io.complete(Ok(OnDiskValue::RawImage(img_buf.into_bytes()))); } assert!(ios.is_empty()); } Err(err) => { for (_, io) in ios { io.complete(Err(std::io::Error::new( err.kind(), "vec read failed", ))); } } } // keep layer resident until this IO is done; this spawned IO future generally outlives the // call to `self` / the `Arc` / the `ResidentLayer` that guarantees residency drop(read_extend_residency); }) .await; } } pub(crate) fn iter_with_options<'a>( &'a self, ctx: &'a RequestContext, max_read_size: u64, max_batch_size: usize, ) -> ImageLayerIterator<'a> { let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); ImageLayerIterator { image_layer: self, ctx, index_iter: tree_reader.iter(&[0; 
KEY_SIZE], ctx), key_values_batch: VecDeque::new(), is_end: false, planner: StreamingVectoredReadPlanner::new(max_read_size, max_batch_size), } } /// NB: not super efficient, but not terrible either. Should prob be an iterator. // // We're reusing the index traversal logical in plan_reads; would be nice to // factor that out. pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result> { let plan = self .plan_reads(KeySpace::single(self.key_range.clone()), None, ctx) .await?; Ok(plan .into_iter() .flat_map(|read| read.blobs_at) .map(|(_, blob_meta)| blob_meta.key) .collect()) } } /// A builder object for constructing a new image layer. /// /// Usage: /// /// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...) /// /// 2. Write the contents by calling `put_page_image` for every key-value /// pair in the key range. /// /// 3. Call `finish`. /// struct ImageLayerWriterInner { conf: &'static PageServerConf, path: Utf8PathBuf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, key_range: Range, lsn: Lsn, // Total uncompressed bytes passed into put_image uncompressed_bytes: u64, // Like `uncompressed_bytes`, // but only of images we might consider for compression uncompressed_bytes_eligible: u64, // Like `uncompressed_bytes`, but only of images // where we have chosen their compressed form uncompressed_bytes_chosen: u64, // Number of keys in the layer. num_keys: usize, blob_writer: BlobWriter, tree: DiskBtreeBuilder, #[cfg(feature = "testing")] last_written_key: Key, } impl ImageLayerWriterInner { /// /// Start building a new image layer. /// #[allow(clippy::too_many_arguments)] async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, gate: &utils::sync::gate::Gate, cancel: CancellationToken, ctx: &RequestContext, ) -> anyhow::Result { // Create the file initially with a temporary filename. // We'll atomically rename it to the final name when we're done. 
let path = ImageLayer::temp_path_for( conf, timeline_id, tenant_shard_id, &ImageLayerName { key_range: key_range.clone(), lsn, }, ); trace!("creating image layer {}", path); let file = TempVirtualFile::new( VirtualFile::open_with_options_v2( &path, virtual_file::OpenOptions::new() .create_new(true) .write(true), ctx, ) .await?, gate.enter()?, ); // Start at `PAGE_SZ` to make room for the header block. let blob_writer = BlobWriter::new( file, PAGE_SZ as u64, gate, cancel, ctx, info_span!(parent: None, "image_layer_writer_flush_task", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id, path = %path), )?; // Initialize the b-tree index builder let block_buf = BlockBuf::new(); let tree_builder = DiskBtreeBuilder::new(block_buf); let writer = Self { conf, path, timeline_id, tenant_shard_id, key_range: key_range.clone(), lsn, tree: tree_builder, blob_writer, uncompressed_bytes: 0, uncompressed_bytes_eligible: 0, uncompressed_bytes_chosen: 0, num_keys: 0, #[cfg(feature = "testing")] last_written_key: Key::MIN, }; Ok(writer) } /// /// Write next value to the file. /// /// The page versions must be appended in blknum order. 
/// async fn put_image( &mut self, key: Key, img: Bytes, ctx: &RequestContext, ) -> Result<(), PutError> { if !self.key_range.contains(&key) { return Err(PutError::Other(anyhow::anyhow!( "key {:?} not in range {:?}", key, self.key_range ))); } let compression = self.conf.image_compression; let uncompressed_len = img.len() as u64; self.uncompressed_bytes += uncompressed_len; self.num_keys += 1; let (_img, res) = self .blob_writer .write_blob_maybe_compressed(img.slice_len(), ctx, compression) .await; // TODO: re-use the buffer for `img` further upstack let (off, compression_info) = res.map_err(PutError::WriteBlob)?; if compression_info.compressed_size.is_some() { // The image has been considered for compression at least self.uncompressed_bytes_eligible += uncompressed_len; } if compression_info.written_compressed { // The image has been compressed self.uncompressed_bytes_chosen += uncompressed_len; } let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); self.tree .append(&keybuf, off) .map_err(anyhow::Error::new) .map_err(PutError::Other)?; #[cfg(feature = "testing")] { self.last_written_key = key; } Ok(()) } /// /// Write the next image to the file, as a raw blob header and data. /// /// The page versions must be appended in blknum order. /// async fn put_image_raw( &mut self, key: Key, raw_with_header: Bytes, ctx: &RequestContext, ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); // NB: we don't update the (un)compressed metrics, since we can't determine them without // decompressing the image. This seems okay. self.num_keys += 1; let (_, res) = self .blob_writer .write_blob_raw(raw_with_header.slice_len(), ctx) .await; let offset = res?; let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); self.tree.append(&keybuf, offset)?; #[cfg(feature = "testing")] { self.last_written_key = key; } Ok(()) } /// /// Finish writing the image layer. 
/// async fn finish( self, ctx: &RequestContext, end_key: Option, ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32; // Calculate compression ratio let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes); crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED .inc_by(self.uncompressed_bytes_eligible); crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen); // NB: filter() may pass through raw pages from a different layer, without looking at // whether these are compressed or not. We don't track metrics for these, so avoid // increasing `COMPRESSION_IMAGE_OUTPUT_BYTES` in this case too. if self.uncompressed_bytes > 0 { crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size); }; let file = self .blob_writer .shutdown( BufferedWriterShutdownMode::ZeroPadToNextMultiple(PAGE_SZ), ctx, ) .await?; // Write out the index let mut offset = index_start_blk as u64 * PAGE_SZ as u64; let (index_root_blk, block_buf) = self.tree.finish()?; // TODO(yuchen): https://github.com/neondatabase/neon/issues/10092 // Should we just replace BlockBuf::blocks with one big buffer? for buf in block_buf.blocks { let (_buf, res) = file.write_all_at(buf.slice_len(), offset, ctx).await; res?; offset += PAGE_SZ as u64; } let final_key_range = if let Some(end_key) = end_key { self.key_range.start..end_key } else { self.key_range.clone() }; // Fill in the summary on blk 0 let summary = Summary { magic: IMAGE_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, tenant_id: self.tenant_shard_id.tenant_id, timeline_id: self.timeline_id, key_range: final_key_range.clone(), lsn: self.lsn, index_start_blk, index_root_blk, }; // Writes summary at the first block (offset 0). 
let buf = summary.ser_into_page()?; let (_buf, res) = file.write_all_at(buf.slice_len(), 0, ctx).await; res?; let metadata = file .metadata() .await .context("get metadata to determine file size")?; let desc = PersistentLayerDesc::new_img( self.tenant_shard_id, self.timeline_id, final_key_range, self.lsn, metadata.len(), ); #[cfg(feature = "testing")] if let Some(end_key) = end_key { assert!( self.last_written_key < end_key, "written key violates end_key range" ); } // Note: Because we open the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't // set inner.file here. The first read will have to re-open it. // fsync the file file.sync_all() .await .maybe_fatal_err("image_layer sync_all")?; trace!("created image layer {}", self.path); // The gate guard stored in `destination_file` is dropped. Callers (e.g.. flush loop or compaction) // keep the gate open also, so that it's safe for them to rename the file to its final destination. file.disarm_into_inner(); Ok((desc, self.path)) } } /// A builder object for constructing a new image layer. /// /// Usage: /// /// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...) /// /// 2. Write the contents by calling `put_page_image` for every key-value /// pair in the key range. /// /// 3. Call `finish`. /// /// # Note /// /// As described in , it's /// possible for the writer to drop before `finish` is actually called. So this /// could lead to odd temporary files in the directory, exhausting file system. /// This structure wraps `ImageLayerWriterInner` and also contains `Drop` /// implementation that cleans up the temporary file in failure. It's not /// possible to do this directly in `ImageLayerWriterInner` since `finish` moves /// out some fields, making it impossible to implement `Drop`. /// #[must_use] pub struct ImageLayerWriter { inner: Option, } impl ImageLayerWriter { /// /// Start building a new image layer. 
///
    /// Forwards all arguments unchanged to `ImageLayerWriterInner::new` and
    /// wraps the resulting inner writer. Any error from the inner constructor
    /// is returned as-is.
    ///
    #[allow(clippy::too_many_arguments)]
    pub async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        key_range: &Range,
        lsn: Lsn,
        gate: &utils::sync::gate::Gate,
        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result {
        Ok(Self {
            inner: Some(
                ImageLayerWriterInner::new(
                    conf,
                    timeline_id,
                    tenant_shard_id,
                    key_range,
                    lsn,
                    gate,
                    cancel,
                    ctx,
                )
                .await?,
            ),
        })
    }

    ///
    /// Write next value to the file.
    ///
    /// The page versions must be appended in blknum order.
    ///
    /// Panics if the inner writer has already been taken.
    ///
    pub async fn put_image(
        &mut self,
        key: Key,
        img: Bytes,
        ctx: &RequestContext,
    ) -> Result<(), PutError> {
        self.inner.as_mut().unwrap().put_image(key, img, ctx).await
    }

    ///
    /// Write the next value to the file, as a raw header and data. This allows passing through a
    /// raw, potentially compressed image from a different layer file without recompressing it.
    ///
    /// The page versions must be appended in blknum order.
    ///
    /// Panics if the inner writer has already been taken.
    ///
    pub async fn put_image_raw(
        &mut self,
        key: Key,
        raw_with_header: Bytes,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        self.inner
            .as_mut()
            .unwrap()
            .put_image_raw(key, raw_with_header, ctx)
            .await
    }

    /// Estimated size of the image layer.
    ///
    /// Computed as the bytes reported by the blob writer, plus the size reported
    /// by the index tree's writer, plus one `PAGE_SZ`.
    // NOTE(review): the extra page presumably accounts for the summary block
    // that `finish` writes at offset 0 -- confirm.
    pub(crate) fn estimated_size(&self) -> u64 {
        let inner = self.inner.as_ref().unwrap();
        inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
    }

    /// Returns the inner writer's `num_keys` counter.
    pub(crate) fn num_keys(&self) -> usize {
        self.inner.as_ref().unwrap().num_keys
    }

    ///
    /// Finish writing the image layer.
    ///
    /// Takes the inner writer out of `self` and finishes it without an explicit
    /// end key (`None`), so the key range given to `new` is kept.
    ///
    pub(crate) async fn finish(
        mut self,
        ctx: &RequestContext,
    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
        self.inner.take().unwrap().finish(ctx, None).await
    }

    /// Finish writing the image layer with an end key, used in [`super::batch_split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
pub(super) async fn finish_with_end_key(
        mut self,
        end_key: Key,
        ctx: &RequestContext,
    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
        // Take the inner writer out of `self` and finish it with the explicit end key.
        self.inner.take().unwrap().finish(ctx, Some(end_key)).await
    }
}

/// Iterates over the `(Key, Lsn, Value)` entries of an image layer, in the
/// order yielded by the layer's index iterator, reading the values from disk
/// in batches.
pub struct ImageLayerIterator<'a> {
    /// The layer being read.
    image_layer: &'a ImageLayerInner,
    /// Request context passed to every read.
    ctx: &'a RequestContext,
    /// Turns the stream of `(key, offset)` index entries into batched read plans.
    planner: StreamingVectoredReadPlanner,
    /// Iterator over the layer's on-disk index.
    index_iter: DiskBtreeIterator<'a>,
    /// Items that were read but not yet returned from `next`.
    key_values_batch: VecDeque<(Key, Lsn, Value)>,
    /// Set once `index_iter` has been exhausted.
    is_end: bool,
}

impl ImageLayerIterator<'_> {
    /// Debug description of the underlying layer.
    pub(crate) fn layer_dbg_info(&self) -> String {
        self.image_layer.layer_dbg_info()
    }

    /// Retrieve a batch of key-value pairs into the iterator buffer.
    ///
    /// Must only be called while the buffer is empty and the end has not been
    /// reached (both are asserted). Index entries are fed to the planner until
    /// it emits a read plan. When the index is exhausted, `is_end` is set and
    /// the planner is asked for a final plan bounded by the start of the index
    /// (`index_start_blk * PAGE_SZ`).
    ///
    /// Note that this may return `Ok(())` with the buffer still empty: that
    /// happens when the index is exhausted and the planner has nothing pending.
    async fn next_batch(&mut self) -> anyhow::Result<()> {
        assert!(self.key_values_batch.is_empty());
        assert!(!self.is_end);

        let plan = loop {
            if let Some(res) = self.index_iter.next().await {
                let (raw_key, offset) = res?;
                if let Some(batch_plan) = self.planner.handle(
                    Key::from_slice(&raw_key[..KEY_SIZE]),
                    self.image_layer.lsn,
                    offset,
                    true,
                ) {
                    break batch_plan;
                }
            } else {
                self.is_end = true;
                // The blob payload ends where the index begins.
                let payload_end = self.image_layer.index_start_blk as u64 * PAGE_SZ as u64;
                if let Some(item) = self.planner.handle_range_end(payload_end) {
                    break item;
                } else {
                    return Ok(()); // TODO: a test case on empty iterator
                }
            }
        };
        let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);
        let mut next_batch = std::collections::VecDeque::new();
        // Size the IO buffer to exactly what the plan needs.
        let buf_size = plan.size();
        let buf = IoBufferMut::with_capacity(buf_size);
        let blobs_buf = vectored_blob_reader
            .read_blobs(&plan, buf, self.ctx)
            .await?;
        let view = BufView::new_slice(&blobs_buf.buf);
        for meta in blobs_buf.blobs.iter() {
            let img_buf = meta.read(&view).await?;
            // Every value in an image layer is an image at the layer's LSN.
            next_batch.push_back((
                meta.meta.key,
                self.image_layer.lsn,
                Value::Image(img_buf.into_bytes()),
            ));
        }
        self.key_values_batch = next_batch;
        Ok(())
    }

    /// Returns the next `(Key, Lsn, Value)` item, or `Ok(None)` once the layer
    /// is exhausted. Errors from reading the index or the blobs are propagated.
    pub async fn next(&mut self) -> anyhow::Result> {
        // `next_batch` can return without buffering anything: when the index is
        // exhausted and the planner has no pending read, it only sets `is_end`.
        // Re-check both conditions after every refill rather than assuming the
        // buffer is non-empty; the previous single `if` fell through to the
        // `expect` below and panicked in that case.
        while self.key_values_batch.is_empty() {
            if self.is_end {
                return Ok(None);
            }
            self.next_batch().await?;
        }
        Ok(Some(
            self.key_values_batch
                .pop_front()
                .expect("should not be empty"),
        ))
    }
}

#[cfg(test)]
mod test {
    use std::sync::Arc;
    use std::time::Duration;

    use bytes::Bytes;
    use itertools::Itertools;
    use pageserver_api::key::Key;
    use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
    use utils::generation::Generation;
    use utils::id::{TenantId, TimelineId};
    use utils::lsn::Lsn;
    use wal_decoder::models::value::Value;

    use super::{ImageLayerIterator, ImageLayerWriter};
    use crate::DEFAULT_PG_VERSION;
    use crate::context::RequestContext;
    use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
    use crate::tenant::storage_layer::{Layer, ResidentLayer};
    use crate::tenant::{TenantShard, Timeline};

    #[tokio::test]
    async fn image_layer_rewrite() {
        // Periodic GC and compaction are configured with a zero period for this test.
        let tenant_conf = pageserver_api::models::TenantConfig {
            gc_period: Some(Duration::ZERO),
            compaction_period: Some(Duration::ZERO),
            ..Default::default()
        };
        let tenant_id = TenantId::generate();
        // Hands out a fresh generation number on every call.
        let mut gen_ = Generation::new(0xdead0001);
        let mut get_next_gen = || {
            let ret = gen_;
            gen_ = gen_.next();
            ret
        };
        // The LSN at which we will create an image layer to filter
        let lsn = Lsn(0xdeadbeef0000);
        let timeline_id = TimelineId::generate();

        //
        // Create an unsharded parent with a layer.
//

        let harness = TenantHarness::create_custom(
            "test_image_layer_rewrite--parent",
            tenant_conf.clone(),
            tenant_id,
            ShardIdentity::unsharded(),
            get_next_gen(),
        )
        .await
        .unwrap();
        let (tenant, ctx) = harness.load().await;
        let timeline = tenant
            .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
            .await
            .unwrap();

        // This key range contains several 0x800 page stripes, only one of which belongs to shard zero
        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
        let input_end = Key::from_hex("000000067f00000001000000ae0000002000").unwrap();
        let range = input_start..input_end;

        // Build an image layer to filter
        let resident = {
            let mut writer = ImageLayerWriter::new(
                harness.conf,
                timeline_id,
                harness.tenant_shard_id,
                &range,
                lsn,
                &timeline.gate,
                timeline.cancel.clone(),
                &ctx,
            )
            .await
            .unwrap();

            // Write the same small image for every key in the range.
            let foo_img = Bytes::from_static(&[1, 2, 3, 4]);
            let mut key = range.start;
            while key < range.end {
                writer.put_image(key, foo_img.clone(), &ctx).await.unwrap();

                key = key.next();
            }
            let (desc, path) = writer.finish(&ctx).await.unwrap();
            Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap()
        };
        let original_size = resident.metadata().file_size;

        //
        // Create child shards and do the rewrite, exercising filter().
        // TODO: abstraction in TenantHarness for splits.
        //

        // Filter for various shards: this exercises cases like values at start of key range, end of key
        // range, middle of key range.
        let shard_count = ShardCount::new(4);
        for shard_number in 0..shard_count.count() {
            //
            // mimic the shard split
            //
            let shard_identity = ShardIdentity::new(
                ShardNumber(shard_number),
                shard_count,
                ShardStripeSize(0x800),
            )
            .unwrap();
            let harness = TenantHarness::create_custom(
                // The formatted test name is leaked with `Box::leak` to obtain a
                // reference that outlives this loop iteration.
                Box::leak(Box::new(format!(
                    "test_image_layer_rewrite--child{}",
                    shard_identity.shard_slug()
                ))),
                tenant_conf.clone(),
                tenant_id,
                shard_identity,
                // NB: in reality, the shards would each fork off their own gen number sequence from the parent.
                // But here, all we care about is that the gen number is unique.
                get_next_gen(),
            )
            .await
            .unwrap();
            let (tenant, ctx) = harness.load().await;
            let timeline = tenant
                .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
                .await
                .unwrap();

            //
            // use filter() and make assertions
            //

            let mut filtered_writer = ImageLayerWriter::new(
                harness.conf,
                timeline_id,
                harness.tenant_shard_id,
                &range,
                lsn,
                &timeline.gate,
                timeline.cancel.clone(),
                &ctx,
            )
            .await
            .unwrap();

            let wrote_keys = resident
                .filter(&shard_identity, &mut filtered_writer, &ctx)
                .await
                .unwrap();
            // Only finish the filtered layer when at least one key was written;
            // the shard with no keys in the range ends up with no replacement.
            let replacement = if wrote_keys > 0 {
                let (desc, path) = filtered_writer.finish(&ctx).await.unwrap();
                let resident = Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap();
                Some(resident)
            } else {
                None
            };

            // This exact size and those below will need updating as/when the layer encoding changes, but
            // should be deterministic for a given version of the format, as we used no randomness generating the input.
            assert_eq!(original_size, 122880);

            match shard_number {
                0 => {
                    // We should have written out just one stripe for our shard identity
                    assert_eq!(wrote_keys, 0x800);
                    let replacement = replacement.unwrap();

                    // We should have dropped some of the data
                    assert!(replacement.metadata().file_size < original_size);
                    assert!(replacement.metadata().file_size > 0);

                    // Assert the exact size: only one of the four input stripes
                    // (i.e. ~1/4 of the keys) was kept.
                    assert_eq!(replacement.metadata().file_size, 49152);
                }
                1 => {
                    // Shard 1 has no keys in our input range
                    assert_eq!(wrote_keys, 0x0);
                    assert!(replacement.is_none());
                }
                2 => {
                    // Shard 2 has one stripe in the input range
                    assert_eq!(wrote_keys, 0x800);
                    let replacement = replacement.unwrap();
                    assert!(replacement.metadata().file_size < original_size);
                    assert!(replacement.metadata().file_size > 0);
                    assert_eq!(replacement.metadata().file_size, 49152);
                }
                3 => {
                    // Shard 3 has two stripes in the input range
                    assert_eq!(wrote_keys, 0x1000);
                    let replacement = replacement.unwrap();
                    assert!(replacement.metadata().file_size < original_size);
                    assert!(replacement.metadata().file_size > 0);
                    assert_eq!(replacement.metadata().file_size, 73728);
                }
                _ => unreachable!(),
            }
        }
    }

    /// Test helper: writes `images` into a new image layer at `lsn` and returns
    /// the layer produced by `Layer::finish_creating`.
    ///
    /// The images are sorted first; the layer's key range runs from the smallest
    /// key to `next()` of the largest key. Panics if `images` is empty.
    async fn produce_image_layer(
        tenant: &TenantShard,
        tline: &Arc,
        mut images: Vec<(Key, Bytes)>,
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> anyhow::Result {
        images.sort();
        let (key_start, _) = images.first().unwrap();
        let (key_last, _) = images.last().unwrap();
        // The range end is exclusive, so step one past the last key.
        let key_end = key_last.next();
        let key_range = *key_start..key_end;
        let mut writer = ImageLayerWriter::new(
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
            &key_range,
            lsn,
            &tline.gate,
            tline.cancel.clone(),
            ctx,
        )
        .await?;

        for (key, img) in images {
            writer.put_image(key, img, ctx).await?;
        }
        let (desc, path) = writer.finish(ctx).await?;
        let img_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;

        Ok::<_, anyhow::Error>(img_layer)
    }

    /// Test helper: drains `img_iter` and asserts that it yields exactly the
    /// entries of `expect`, in order, each as a `Value::Image` at `expect_lsn`.
    /// Panics on any mismatch, including a difference in length.
    async fn assert_img_iter_equal(
        img_iter: &mut ImageLayerIterator<'_>,
        expect: &[(Key, Bytes)],
        expect_lsn: Lsn,
    ) {
        let mut expect_iter = expect.iter();
        loop {
            // Advance both sides in lockstep.
            let o1 = img_iter.next().await.unwrap();
            let o2 = expect_iter.next();
            match (o1, o2) {
                (None, None) => break,
                (Some((k1, l1, v1)), Some((k2, i2))) => {
                    let Value::Image(i1) = v1 else {
                        panic!("expect Value::Image")
                    };
                    assert_eq!(&k1, k2);
                    assert_eq!(l1, expect_lsn);
                    assert_eq!(&i1, i2);
                }
                (o1, o2) => panic!("iterators length mismatch: {o1:?}, {o2:?}"),
            }
        }
    }
#[tokio::test]
    async fn image_layer_iterator() {
        let harness = TenantHarness::create("image_layer_iterator").await.unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await
            .unwrap();

        // Builds distinct keys that differ only in `field6`.
        fn get_key(id: u32) -> Key {
            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
            key.field6 = id;
            key
        }
        const N: usize = 1000;
        // N images, each a short string ("img00000", "img00001", ...).
        let test_imgs = (0..N)
            .map(|idx| (get_key(idx as u32), Bytes::from(format!("img{idx:05}"))))
            .collect_vec();
        let resident_layer =
            produce_image_layer(&tenant, &tline, test_imgs.clone(), Lsn(0x10), &ctx)
                .await
                .unwrap();
        let img_layer = resident_layer.get_as_image(&ctx).await.unwrap();
        for max_read_size in [1, 1024] {
            for batch_size in [1, 2, 4, 8, 3, 7, 13] {
                println!("running with batch_size={batch_size} max_read_size={max_read_size}");
                // Test if the batch size is correctly determined
                let mut iter = img_layer.iter_with_options(&ctx, max_read_size, batch_size);
                let mut num_items = 0;
                // Only the first few batches are inspected.
                for _ in 0..3 {
                    iter.next_batch().await.unwrap();
                    num_items += iter.key_values_batch.len();
                    if max_read_size == 1 {
                        // every key should be a batch b/c the value is larger than max_read_size
                        assert_eq!(iter.key_values_batch.len(), 1);
                    } else {
                        assert!(iter.key_values_batch.len() <= batch_size);
                    }
                    if num_items >= N {
                        break;
                    }
                    // `next_batch` asserts that the buffer is empty, so clear it
                    // before asking for the next batch.
                    iter.key_values_batch.clear();
                }
                // Test if the result is correct
                let mut iter = img_layer.iter_with_options(&ctx, max_read_size, batch_size);
                assert_img_iter_equal(&mut iter, &test_imgs, Lsn(0x10)).await;
            }
        }
    }
}

================================================
FILE: pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
================================================
use std::collections::BTreeMap;
use std::sync::{Arc, RwLock};

use itertools::Itertools;
use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};

use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64};
use crate::context::RequestContext;
use crate::virtual_file::IoBufferMut;
use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut;

/// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`].
pub trait File: Send {
    /// Attempt to read the bytes in `self` in range `[start,start+dst.bytes_total())`
    /// and return the number of bytes read (let's call it `nread`).
    /// The bytes read are placed in `dst`, i.e., `&dst[..nread]` will contain the read bytes.
    ///
    /// The only reason why the read may be short (i.e., `nread != dst.bytes_total()`)
    /// is if the file is shorter than `start+dst.bytes_total()`.
    ///
    /// This is unlike [`std::os::unix::fs::FileExt::read_exact_at`] which returns an
    /// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.bytes_total()`.
    ///
    /// No guarantees are made about the remaining bytes in `dst` in case of a short read.
    async fn read_exact_at_eof_ok(
        &self,
        start: u64,
        dst: Slice,
        ctx: &RequestContext,
    ) -> std::io::Result<(Slice, usize)>;
}

/// A logical read from [`File`]. See [`Self::new`].
pub struct LogicalRead {
    /// Byte offset in the file at which the read starts.
    pos: u64,
    /// Progress of the read; also owns the destination buffer.
    state: RwLockRefCell>,
}

/// Lifecycle of a [`LogicalRead`]. Except for `Error` and `Undefined`, each
/// variant carries the destination buffer.
enum LogicalReadState {
    /// Freshly created by [`LogicalRead::new`]; not yet passed to [`execute`].
    NotStarted(B),
    /// [`execute`] has accepted the read and is filling the buffer.
    Ongoing(B),
    /// Terminal: the read completed successfully.
    Ok(B),
    /// Terminal: the read failed; the buffer is discarded.
    Error(Arc),
    /// Transient placeholder left behind by `std::mem::replace` while the
    /// state is being moved from one variant to another.
    Undefined,
}

impl LogicalRead {
    /// Create a new [`LogicalRead`] from [`File`] of the data in the file in range `[ pos, pos + buf.cap() )`.
    pub fn new(pos: u64, buf: B) -> Self {
        Self {
            pos,
            state: RwLockRefCell::new(LogicalReadState::NotStarted(buf)),
        }
    }

    /// Consumes the read and returns its outcome: the filled buffer, the
    /// error, or `None` if the read never reached a terminal state.
    pub fn into_result(self) -> Option>> {
        match self.state.into_inner() {
            LogicalReadState::Ok(buf) => Some(Ok(buf)),
            LogicalReadState::Error(e) => Some(Err(e)),
            LogicalReadState::NotStarted(_) | LogicalReadState::Ongoing(_) => None,
            LogicalReadState::Undefined => unreachable!(),
        }
    }
}

/// The buffer into which a [`LogicalRead`] result is placed.
pub trait Buffer: std::ops::Deref {
    /// Immutable.
    fn cap(&self) -> usize;
    /// Changes only through [`Self::extend_from_slice`].
    fn len(&self) -> usize;
    /// Panics if the total length would exceed the initialized capacity.
    fn extend_from_slice(&mut self, src: &[u8]);
}

/// The minimum alignment and size requirement for disk offsets and memory buffer size for direct IO.
const DIO_CHUNK_SIZE: usize = crate::virtual_file::get_io_buffer_alignment();

/// If multiple chunks need to be read, merge adjacent chunk reads into batches of max size `MAX_CHUNK_BATCH_SIZE`.
/// (The unit is the number of chunks.)
///
/// Derived from a desired batch size of 128 KiB in bytes, which must be a
/// multiple of `DIO_CHUNK_SIZE`.
const MAX_CHUNK_BATCH_SIZE: usize = {
    let desired = 128 * 1024; // 128k
    if desired % DIO_CHUNK_SIZE != 0 {
        panic!("MAX_CHUNK_BATCH_SIZE must be a multiple of DIO_CHUNK_SIZE")
        // compile-time error
    }
    desired / DIO_CHUNK_SIZE
};

/// Execute the given logical `reads` against `file`.
/// The results are placed in the buffers of the [`LogicalRead`]s.
/// Retrieve the results by calling [`LogicalRead::into_result`] on each [`LogicalRead`].
///
/// The [`LogicalRead`]s must be freshly created using [`LogicalRead::new`] when calling this function.
/// Otherwise, this function panics.
pub async fn execute<'a, I, F, B>(file: &F, reads: I, ctx: &RequestContext) where I: IntoIterator>, F: File, B: Buffer + IoBufMut + Send, { // Terminology: // logical read = a request to read an arbitrary range of bytes from `file`; byte-level granularity // chunk = we conceptually divide up the byte range of `file` into DIO_CHUNK_SIZEs ranges // interest = a range within a chunk that a logical read is interested in; one logical read gets turned into many interests // physical read = the read request we're going to issue to the OS; covers a range of chunks; chunk-level granularity // Preserve a copy of the logical reads for debug assertions at the end #[cfg(debug_assertions)] let (reads, assert_logical_reads) = { let (reads, assert) = reads.into_iter().tee(); (reads, Some(Vec::from_iter(assert))) }; #[cfg(not(debug_assertions))] let (reads, assert_logical_reads): (_, Option>>) = (reads, None); // Plan which parts of which chunks need to be appended to which buffer let mut by_chunk: BTreeMap>> = BTreeMap::new(); struct Interest<'a, B: Buffer> { logical_read: &'a LogicalRead, offset_in_chunk: u64, len: u64, } for logical_read in reads { let LogicalRead { pos, state } = logical_read; let mut state = state.borrow_mut(); // transition from NotStarted to Ongoing let cur = std::mem::replace(&mut *state, LogicalReadState::Undefined); let req_len = match cur { LogicalReadState::NotStarted(buf) => { if buf.len() != 0 { panic!( "The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`" ); } // buf.cap() == 0 is ok // transition into Ongoing state let req_len = buf.cap(); *state = LogicalReadState::Ongoing(buf); req_len } x => panic!( "must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}" ), }; // plan which chunks we need to read from let mut remaining = req_len; let mut chunk_no = *pos / (DIO_CHUNK_SIZE.into_u64()); let mut offset_in_chunk = pos.into_usize() % DIO_CHUNK_SIZE; while remaining > 0 { 
let remaining_in_chunk = std::cmp::min(remaining, DIO_CHUNK_SIZE - offset_in_chunk); by_chunk.entry(chunk_no).or_default().push(Interest { logical_read, offset_in_chunk: offset_in_chunk.into_u64(), len: remaining_in_chunk.into_u64(), }); offset_in_chunk = 0; chunk_no += 1; remaining -= remaining_in_chunk; } } // At this point, we could iterate over by_chunk, in chunk order, // read each chunk from disk, and fill the buffers. // However, we can merge adjacent chunks into batches of MAX_CHUNK_BATCH_SIZE // so we issue fewer IOs = fewer roundtrips = lower overall latency. struct PhysicalRead<'a, B: Buffer> { start_chunk_no: u64, nchunks: usize, dsts: Vec>, } struct PhysicalInterest<'a, B: Buffer> { logical_read: &'a LogicalRead, offset_in_physical_read: u64, len: u64, } let mut physical_reads: Vec> = Vec::new(); let mut by_chunk = by_chunk.into_iter().peekable(); loop { let mut last_chunk_no = None; let to_merge: Vec<(u64, Vec>)> = by_chunk .peeking_take_while(|(chunk_no, _)| { if let Some(last_chunk_no) = last_chunk_no { if *chunk_no != last_chunk_no + 1 { return false; } } last_chunk_no = Some(*chunk_no); true }) .take(MAX_CHUNK_BATCH_SIZE) .collect(); // TODO: avoid this .collect() let Some(start_chunk_no) = to_merge.first().map(|(chunk_no, _)| *chunk_no) else { break; }; let nchunks = to_merge.len(); let dsts = to_merge .into_iter() .enumerate() .flat_map(|(i, (_, dsts))| { dsts.into_iter().map( move |Interest { logical_read, offset_in_chunk, len, }| { PhysicalInterest { logical_read, offset_in_physical_read: i .checked_mul(DIO_CHUNK_SIZE) .unwrap() .into_u64() + offset_in_chunk, len, } }, ) }) .collect(); physical_reads.push(PhysicalRead { start_chunk_no, nchunks, dsts, }); } drop(by_chunk); // Execute physical reads and fill the logical read buffers // TODO: pipelined reads; prefetch; let get_io_buffer = |nchunks| IoBufferMut::with_capacity(nchunks * DIO_CHUNK_SIZE); for PhysicalRead { start_chunk_no, nchunks, dsts, } in physical_reads { let all_done = dsts 
.iter() .all(|PhysicalInterest { logical_read, .. }| logical_read.state.borrow().is_terminal()); if all_done { continue; } let read_offset = start_chunk_no .checked_mul(DIO_CHUNK_SIZE.into_u64()) .expect("we produce chunk_nos by dividing by DIO_CHUNK_SIZE earlier"); let io_buf = get_io_buffer(nchunks).slice_full(); let req_len = io_buf.len(); let (io_buf_slice, nread) = match file.read_exact_at_eof_ok(read_offset, io_buf, ctx).await { Ok(t) => t, Err(e) => { let e = Arc::new(e); for PhysicalInterest { logical_read, .. } in dsts { *logical_read.state.borrow_mut() = LogicalReadState::Error(Arc::clone(&e)); // this will make later reads for the given LogicalRead short-circuit, see top of loop body } continue; } }; let io_buf = io_buf_slice.into_inner(); assert!( nread <= io_buf.len(), "the last chunk in the file can be a short read, so, no ==" ); let io_buf = &io_buf[..nread]; for PhysicalInterest { logical_read, offset_in_physical_read, len, } in dsts { let mut logical_read_state_borrow = logical_read.state.borrow_mut(); let logical_read_buf = match &mut *logical_read_state_borrow { LogicalReadState::NotStarted(_) => { unreachable!("we transition it into Ongoing at function entry") } LogicalReadState::Ongoing(buf) => buf, LogicalReadState::Ok(_) | LogicalReadState::Error(_) => { continue; } LogicalReadState::Undefined => unreachable!(), }; let range_in_io_buf = std::ops::Range { start: offset_in_physical_read as usize, end: offset_in_physical_read as usize + len as usize, }; assert!(range_in_io_buf.end >= range_in_io_buf.start); if range_in_io_buf.end > nread { let msg = format!( "physical read returned EOF where this logical read expected more data in the file: offset=0x{read_offset:x} req_len=0x{req_len:x} nread=0x{nread:x} {:?}", &*logical_read_state_borrow ); logical_read_state_borrow.transition_to_terminal(Err(std::io::Error::new( std::io::ErrorKind::UnexpectedEof, msg, ))); continue; } let data = &io_buf[range_in_io_buf]; // Copy data from io buffer into the 
logical read buffer. // (And in debug mode, validate that the buffer impl adheres to the Buffer trait spec.) let pre = if cfg!(debug_assertions) { Some((logical_read_buf.len(), logical_read_buf.cap())) } else { None }; logical_read_buf.extend_from_slice(data); let post = if cfg!(debug_assertions) { Some((logical_read_buf.len(), logical_read_buf.cap())) } else { None }; match (pre, post) { (None, None) => {} (Some(_), None) | (None, Some(_)) => unreachable!(), (Some((pre_len, pre_cap)), Some((post_len, post_cap))) => { assert_eq!(pre_len + len as usize, post_len); assert_eq!(pre_cap, post_cap); } } if logical_read_buf.len() == logical_read_buf.cap() { logical_read_state_borrow.transition_to_terminal(Ok(())); } } } if let Some(assert_logical_reads) = assert_logical_reads { for logical_read in assert_logical_reads { assert!(logical_read.state.borrow().is_terminal()); } } } impl LogicalReadState { fn is_terminal(&self) -> bool { match self { LogicalReadState::NotStarted(_) | LogicalReadState::Ongoing(_) => false, LogicalReadState::Ok(_) | LogicalReadState::Error(_) => true, LogicalReadState::Undefined => unreachable!(), } } fn transition_to_terminal(&mut self, err: std::io::Result<()>) { let cur = std::mem::replace(self, LogicalReadState::Undefined); let buf = match cur { LogicalReadState::Ongoing(buf) => buf, x => panic!("must only call in state Ongoing, got {x:?}"), }; *self = match err { Ok(()) => LogicalReadState::Ok(buf), Err(e) => LogicalReadState::Error(Arc::new(e)), }; } } impl std::fmt::Debug for LogicalReadState { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { #[derive(Debug)] #[allow(unused)] struct BufferDebug { len: usize, cap: usize, } impl<'a> From<&'a dyn Buffer> for BufferDebug { fn from(buf: &'a dyn Buffer) -> Self { Self { len: buf.len(), cap: buf.cap(), } } } match self { LogicalReadState::NotStarted(b) => { write!(f, "NotStarted({:?})", BufferDebug::from(b as &dyn Buffer)) } LogicalReadState::Ongoing(b) => { write!(f, 
"Ongoing({:?})", BufferDebug::from(b as &dyn Buffer)) } LogicalReadState::Ok(b) => write!(f, "Ok({:?})", BufferDebug::from(b as &dyn Buffer)), LogicalReadState::Error(e) => write!(f, "Error({e:?})"), LogicalReadState::Undefined => write!(f, "Undefined"), } } } #[derive(Debug)] struct RwLockRefCell(RwLock); impl RwLockRefCell { fn new(value: T) -> Self { Self(RwLock::new(value)) } fn borrow(&self) -> impl std::ops::Deref + '_ { self.0.try_read().unwrap() } fn borrow_mut(&self) -> impl std::ops::DerefMut + '_ { self.0.try_write().unwrap() } fn into_inner(self) -> T { self.0.into_inner().unwrap() } } impl Buffer for Vec { fn cap(&self) -> usize { self.capacity() } fn len(&self) -> usize { self.len() } fn extend_from_slice(&mut self, src: &[u8]) { if self.len() + src.len() > self.cap() { panic!("Buffer capacity exceeded"); } Vec::extend_from_slice(self, src); } } #[cfg(test)] #[allow(clippy::assertions_on_constants)] mod tests { use std::cell::RefCell; use std::collections::VecDeque; use rand::Rng; use super::*; use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; struct InMemoryFile { content: Vec, } impl InMemoryFile { fn new_random(len: usize) -> Self { Self { content: rand::rng() .sample_iter(rand::distr::StandardUniform) .take(len) .collect(), } } fn test_logical_read(&self, pos: u64, len: usize) -> TestLogicalRead { let expected_result = if pos as usize + len > self.content.len() { Err("InMemoryFile short read".to_string()) } else { Ok(self.content[pos as usize..pos as usize + len].to_vec()) }; TestLogicalRead::new(pos, len, expected_result) } } #[test] fn test_in_memory_file() { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let file = InMemoryFile::new_random(10); let test_read = |pos, len| { let buf = IoBufferMut::with_capacity_zeroed(len); let fut = file.read_exact_at_eof_ok(pos, buf.slice_full(), &ctx); use futures::FutureExt; let (slice, nread) = fut 
.now_or_never() .expect("impl never awaits") .expect("impl never errors"); let mut buf = slice.into_inner(); buf.truncate(nread); buf }; assert_eq!(&test_read(0, 1), &file.content[0..1]); assert_eq!(&test_read(1, 2), &file.content[1..3]); assert_eq!(&test_read(9, 2), &file.content[9..]); assert!(test_read(10, 2).is_empty()); assert!(test_read(11, 2).is_empty()); } impl File for InMemoryFile { async fn read_exact_at_eof_ok( &self, start: u64, mut dst: Slice, _ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)> { let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed(); let nread = { let req_len = dst_slice.len(); let len = std::cmp::min(req_len, self.content.len().saturating_sub(start as usize)); if start as usize >= self.content.len() { 0 } else { dst_slice[..len] .copy_from_slice(&self.content[start as usize..start as usize + len]); len } }; rand::Rng::fill(&mut rand::rng(), &mut dst_slice[nread..]); // to discover bugs Ok((dst, nread)) } } #[derive(Clone)] struct TestLogicalRead { pos: u64, len: usize, expected_result: Result, String>, } impl TestLogicalRead { fn new(pos: u64, len: usize, expected_result: Result, String>) -> Self { Self { pos, len, expected_result, } } fn make_logical_read(&self) -> LogicalRead> { LogicalRead::new(self.pos, Vec::with_capacity(self.len)) } } async fn execute_and_validate_test_logical_reads( file: &F, test_logical_reads: I, ctx: &RequestContext, ) where I: IntoIterator, F: File, { let (tmp, test_logical_reads) = test_logical_reads.into_iter().tee(); let logical_reads = tmp.map(|tr| tr.make_logical_read()).collect::>(); execute(file, logical_reads.iter(), ctx).await; for (logical_read, test_logical_read) in logical_reads.into_iter().zip(test_logical_reads) { let actual = logical_read.into_result().expect("we call execute()"); match (actual, test_logical_read.expected_result) { (Ok(actual), Ok(expected)) if actual == expected => {} (Err(actual), Err(expected)) => { assert_eq!(actual.to_string(), expected); } (actual, 
expected) => panic!("expected {expected:?}\nactual {actual:?}"), } } } #[tokio::test] async fn test_blackbox() { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let cs = DIO_CHUNK_SIZE; let cs_u64 = cs.into_u64(); let file = InMemoryFile::new_random(10 * cs); let test_logical_reads = vec![ file.test_logical_read(0, 1), // adjacent to logical_read0 file.test_logical_read(1, 2), // gap // spans adjacent chunks file.test_logical_read(cs_u64 - 1, 2), // gap // tail of chunk 3, all of chunk 4, and 2 bytes of chunk 5 file.test_logical_read(3 * cs_u64 - 1, cs + 2), // gap file.test_logical_read(5 * cs_u64, 1), ]; let num_test_logical_reads = test_logical_reads.len(); let test_logical_reads_perms = test_logical_reads .into_iter() .permutations(num_test_logical_reads); // test all orderings of LogicalReads, the order shouldn't matter for the results for test_logical_reads in test_logical_reads_perms { execute_and_validate_test_logical_reads(&file, test_logical_reads, &ctx).await; } } #[tokio::test] #[should_panic] async fn test_reusing_logical_reads_panics() { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let file = InMemoryFile::new_random(DIO_CHUNK_SIZE); let a = file.test_logical_read(23, 10); let logical_reads = vec![a.make_logical_read()]; execute(&file, &logical_reads, &ctx).await; // reuse pancis execute(&file, &logical_reads, &ctx).await; } struct RecorderFile<'a> { recorded: RefCell>, file: &'a InMemoryFile, } struct RecordedRead { pos: u64, req_len: usize, res: Vec, } impl<'a> RecorderFile<'a> { fn new(file: &'a InMemoryFile) -> RecorderFile<'a> { Self { recorded: Default::default(), file, } } } impl File for RecorderFile<'_> { async fn read_exact_at_eof_ok( &self, start: u64, dst: Slice, ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)> { let (dst, nread) = self.file.read_exact_at_eof_ok(start, dst, ctx).await?; self.recorded.borrow_mut().push(RecordedRead { pos: start, req_len: dst.bytes_total(), 
res: Vec::from(&dst[..nread]),
            });
            Ok((dst, nread))
        }
    }

    /// Two logical reads that land in the same DIO chunk must be served by a
    /// single, chunk-sized physical read.
    #[tokio::test]
    async fn test_logical_reads_to_same_chunk_are_merged_into_one_chunk_read() {
        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
        let file = InMemoryFile::new_random(2 * DIO_CHUNK_SIZE);
        // Both reads start inside chunk 1.
        let a = file.test_logical_read(DIO_CHUNK_SIZE.into_u64(), 10);
        let b = file.test_logical_read(DIO_CHUNK_SIZE.into_u64() + 30, 20);
        let recorder = RecorderFile::new(&file);
        execute_and_validate_test_logical_reads(&recorder, vec![a, b], &ctx).await;
        let recorded = recorder.recorded.borrow();
        assert_eq!(recorded.len(), 1);
        let RecordedRead { pos, req_len, .. } = &recorded[0];
        assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64());
        assert_eq!(*req_len, DIO_CHUNK_SIZE);
    }

    /// A run of adjacent interesting chunks longer than `MAX_CHUNK_BATCH_SIZE`
    /// must be split into several physical reads.
    #[tokio::test]
    async fn test_max_chunk_batch_size_is_respected() {
        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
        let file = InMemoryFile::new_random(4 * MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE);
        // read one byte at offset 10 of each chunk in
        // 3 .. 3 + MAX_CHUNK_BATCH_SIZE + MAX_CHUNK_BATCH_SIZE / 2
        assert!(3 < MAX_CHUNK_BATCH_SIZE, "test assumption");
        assert!(10 < DIO_CHUNK_SIZE, "test assumption");
        let mut test_logical_reads = Vec::new();
        for i in 3..3 + MAX_CHUNK_BATCH_SIZE + MAX_CHUNK_BATCH_SIZE / 2 {
            test_logical_reads
                .push(file.test_logical_read(i.into_u64() * DIO_CHUNK_SIZE.into_u64() + 10, 1));
        }
        let recorder = RecorderFile::new(&file);
        execute_and_validate_test_logical_reads(&recorder, test_logical_reads, &ctx).await;
        let recorded = recorder.recorded.borrow();
        // One full batch followed by one half batch.
        assert_eq!(recorded.len(), 2);
        {
            let RecordedRead { pos, req_len, .. } = &recorded[0];
            assert_eq!(*pos as usize, 3 * DIO_CHUNK_SIZE);
            assert_eq!(*req_len, MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE);
        }
        {
            let RecordedRead { pos, req_len, .. } = &recorded[1];
            assert_eq!(*pos as usize, (3 + MAX_CHUNK_BATCH_SIZE) * DIO_CHUNK_SIZE);
            assert_eq!(*req_len, MAX_CHUNK_BATCH_SIZE / 2 * DIO_CHUNK_SIZE);
        }
    }

    /// A chunk that no logical read touches ends the current batch, even when
    /// the batch size limit has not been reached.
    #[tokio::test]
    async fn test_batch_breaks_if_chunk_is_not_interesting() {
        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
        assert!(MAX_CHUNK_BATCH_SIZE > 10, "test assumption");
        let file = InMemoryFile::new_random(3 * DIO_CHUNK_SIZE);
        let a = file.test_logical_read(0, 1); // chunk 0
        let b = file.test_logical_read(2 * DIO_CHUNK_SIZE.into_u64(), 1); // chunk 2
        let recorder = RecorderFile::new(&file);
        execute_and_validate_test_logical_reads(&recorder, vec![a, b], &ctx).await;

        let recorded = recorder.recorded.borrow();

        assert_eq!(recorded.len(), 2);
        {
            let RecordedRead { pos, req_len, .. } = &recorded[0];
            assert_eq!(*pos, 0);
            assert_eq!(*req_len, DIO_CHUNK_SIZE);
        }
        {
            let RecordedRead { pos, req_len, .. } = &recorded[1];
            assert_eq!(*pos, 2 * DIO_CHUNK_SIZE.into_u64());
            assert_eq!(*req_len, DIO_CHUNK_SIZE);
        }
    }

    /// One scripted physical read of a [`MockFile`]: the position and buffer
    /// length the read must be issued with, and the canned response to it.
    struct ExpectedRead {
        expect_pos: u64,
        expect_len: usize,
        respond: Result, String>,
    }

    /// A `File` whose reads are scripted up front (see `mock_file!`).
    ///
    /// Reads are matched against the script in FIFO order; dropping the mock
    /// with unconsumed entries fails the test.
    struct MockFile {
        expected: RefCell>,
    }

    impl Drop for MockFile {
        fn drop(&mut self) {
            // Don't assert while already unwinding: a second panic raised from
            // `drop` aborts the process and hides the failure that started the
            // unwind (e.g. an `assert_eq!` inside `read_exact_at_eof_ok`).
            if std::thread::panicking() {
                return;
            }
            assert!(
                self.expected.borrow().is_empty(),
                "expected reads not satisfied"
            );
        }
    }

    /// Builds a [`MockFile`] from a list of `pos, len => response` entries.
    macro_rules! mock_file {
        ($($pos:expr , $len:expr => $respond:expr),* $(,)?) => {{
            MockFile {
                expected: RefCell::new(VecDeque::from(vec![$(ExpectedRead {
                    expect_pos: $pos,
                    expect_len: $len,
                    respond: $respond,
                }),*])),
            }
        }};
    }

    impl File for MockFile {
        /// Pops the next scripted read, checks that `start` and the buffer
        /// size match it, and replays the scripted response.
        ///
        /// Panics with "unexpected read" if the script is exhausted.
        async fn read_exact_at_eof_ok(
            &self,
            start: u64,
            mut dst: Slice,
            _ctx: &RequestContext,
        ) -> std::io::Result<(Slice, usize)> {
            let ExpectedRead {
                expect_pos,
                expect_len,
                respond,
            } = self
                .expected
                .borrow_mut()
                .pop_front()
                .expect("unexpected read");
            assert_eq!(start, expect_pos);
            assert_eq!(dst.bytes_total(), expect_len);
            match respond {
                Ok(mocked_bytes) => {
                    // A response shorter than the buffer is reported as a short read.
                    let len = std::cmp::min(dst.bytes_total(), mocked_bytes.len());
                    let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed();
                    dst_slice[..len].copy_from_slice(&mocked_bytes[..len]);
                    rand::Rng::fill(&mut rand::rng(), &mut dst_slice[len..]); // to discover bugs
                    Ok((dst, len))
                }
                Err(e) => Err(std::io::Error::other(e)),
            }
        }
    }

    #[tokio::test]
    async fn test_mock_file() {
        // Self-test to ensure the relevant features of mock file work as expected.

        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

        let mock_file = mock_file!
{
            0 , 512 => Ok(vec![0; 512]),
            512 , 512 => Ok(vec![1; 512]),
            1024 , 512 => Ok(vec![2; 10]),
            2048, 1024 => Err("foo".to_owned()),
        };

        // Full read, served verbatim.
        let buf = IoBufferMut::with_capacity(512);
        let (buf, nread) = mock_file
            .read_exact_at_eof_ok(0, buf.slice_full(), &ctx)
            .await
            .unwrap();
        assert_eq!(nread, 512);
        assert_eq!(&buf.into_inner()[..nread], &[0; 512]);

        let buf = IoBufferMut::with_capacity(512);
        let (buf, nread) = mock_file
            .read_exact_at_eof_ok(512, buf.slice_full(), &ctx)
            .await
            .unwrap();
        assert_eq!(nread, 512);
        assert_eq!(&buf.into_inner()[..nread], &[1; 512]);

        // Short read: the scripted response holds only 10 bytes.
        let buf = IoBufferMut::with_capacity(512);
        let (buf, nread) = mock_file
            .read_exact_at_eof_ok(1024, buf.slice_full(), &ctx)
            .await
            .unwrap();
        assert_eq!(nread, 10);
        assert_eq!(&buf.into_inner()[..nread], &[2; 10]);

        // Scripted error is surfaced as an `std::io::Error` carrying the message.
        let buf = IoBufferMut::with_capacity(1024);
        let err = mock_file
            .read_exact_at_eof_ok(2048, buf.slice_full(), &ctx)
            .await
            .err()
            .unwrap();
        assert_eq!(err.to_string(), "foo");
    }

    /// A failing chunk read must fail exactly the logical reads that need that
    /// chunk; the check is repeated for every ordering of the logical reads.
    #[tokio::test]
    async fn test_error_on_one_chunk_read_fails_only_dependent_logical_reads() {
        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

        let test_logical_reads = vec![
            // read spanning two batches
            TestLogicalRead::new(
                DIO_CHUNK_SIZE.into_u64() / 2,
                MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE,
                Err("foo".to_owned()),
            ),
            // second read in failing chunk
            TestLogicalRead::new(
                (MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE).into_u64() + DIO_CHUNK_SIZE.into_u64() - 10,
                5,
                Err("foo".to_owned()),
            ),
            // read unaffected
            TestLogicalRead::new(
                (MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE).into_u64()
                    + 2 * DIO_CHUNK_SIZE.into_u64()
                    + 10,
                5,
                Ok(vec![1; 5]),
            ),
        ];
        let (tmp, test_logical_reads) = test_logical_reads.into_iter().tee();
        let test_logical_read_perms = tmp.permutations(test_logical_reads.len());

        for test_logical_reads in test_logical_read_perms {
            // Script: first batch succeeds, the following chunk fails, and the
            // chunk after the gap succeeds.
            let file = mock_file!(
                0, MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE => Ok(vec![0; MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE]),
                (MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE).into_u64(), DIO_CHUNK_SIZE => Err("foo".to_owned()),
                (MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE + 2*DIO_CHUNK_SIZE).into_u64(), DIO_CHUNK_SIZE => Ok(vec![1; DIO_CHUNK_SIZE]),
            );
            execute_and_validate_test_logical_reads(&file, test_logical_reads, &ctx).await;
        }
    }

    /// Shared fixture for the short-read tests: a file whose last chunk is
    /// 10 bytes short of a full chunk.
    struct TestShortReadsSetup {
        ctx: RequestContext,
        file: InMemoryFile,
        written: u64,
    }

    /// Creates a random file of `2 * DIO_CHUNK_SIZE - 10` bytes.
    fn setup_short_chunk_read_tests() -> TestShortReadsSetup {
        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
        assert!(DIO_CHUNK_SIZE > 20, "test assumption");
        let written = (2 * DIO_CHUNK_SIZE - 10).into_u64();
        let file = InMemoryFile::new_random(written as usize);
        TestShortReadsSetup { ctx, file, written }
    }

    #[tokio::test]
    async fn test_short_chunk_read_from_written_range() {
        // Test what happens if there are logical reads
        // that start within the last chunk, and
        // the last chunk is not the full chunk length.
        //
        // The read should succeed despite the short chunk length.
        let TestShortReadsSetup { ctx, file, written } = setup_short_chunk_read_tests();

        let a = file.test_logical_read(written - 10, 5);
        let recorder = RecorderFile::new(&file);

        execute_and_validate_test_logical_reads(&recorder, vec![a], &ctx).await;

        let recorded = recorder.recorded.borrow();
        assert_eq!(recorded.len(), 1);
        let RecordedRead { pos, req_len, res } = &recorded[0];
        assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64());
        assert_eq!(*req_len, DIO_CHUNK_SIZE);
        assert_eq!(res, &file.content[DIO_CHUNK_SIZE..(written as usize)]);
    }

    #[tokio::test]
    async fn test_short_chunk_read_and_logical_read_from_unwritten_range() {
        // Test what happens if there are logical reads
        // that start within the last chunk, and
        // the last chunk is not the full chunk length, and
        // the logical reads end in the unwritten range.
        //
        // All should fail with UnexpectedEof and have the same IO pattern.
        // `offset_delta` is the start of the 5-byte logical read relative to
        // the end of the written range.
        async fn the_impl(offset_delta: i64) {
            let TestShortReadsSetup { ctx, file, written } = setup_short_chunk_read_tests();

            let offset = u64::try_from(
                i64::try_from(written)
                    .unwrap()
                    .checked_add(offset_delta)
                    .unwrap(),
            )
            .unwrap();
            let a = file.test_logical_read(offset, 5);
            let recorder = RecorderFile::new(&file);
            let a_vr = a.make_logical_read();
            execute(&recorder, vec![&a_vr], &ctx).await;

            // validate the LogicalRead result
            let a_res = a_vr.into_result().unwrap();
            let a_err = a_res.unwrap_err();
            assert_eq!(a_err.kind(), std::io::ErrorKind::UnexpectedEof);

            // validate the IO pattern
            let recorded = recorder.recorded.borrow();
            assert_eq!(recorded.len(), 1);
            let RecordedRead { pos, req_len, res } = &recorded[0];
            assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64());
            assert_eq!(*req_len, DIO_CHUNK_SIZE);
            assert_eq!(res, &file.content[DIO_CHUNK_SIZE..(written as usize)]);
        }

        the_impl(-1).await; // start == length - 1
        the_impl(0).await; // start == length
        the_impl(1).await; // start == length + 1
    }

    // TODO: mixed: some valid, some UnexpectedEof

    // TODO: same tests but with merges
}

================================================
FILE: pageserver/src/tenant/storage_layer/inmemory_layer.rs
================================================
//! An in-memory layer stores recently received key-value pairs.
//!
//! The "in-memory" part of the name is a bit misleading: the actual page versions are
//! held in an ephemeral file, not in memory. The metadata for each page version, i.e.
//! its position in the file, is kept in memory, though.
//!
use std::cmp::Ordering;
use std::collections::{BTreeMap, HashMap};
use std::fmt::Write;
use std::ops::Range;
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering as AtomicOrdering};
use std::sync::{Arc, OnceLock};
use std::time::Instant;

use anyhow::Result;
use camino::Utf8PathBuf;
use pageserver_api::key::{CompactKey, Key};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
use tokio::sync::RwLock;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use utils::vec_map::VecMap;
use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta};

use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState};
use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64, u64_to_usize};
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
// avoid binding to Write (conflicts with std::io::Write)
// while being able to use std::fmt::Write's methods
use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo};
use crate::tenant::timeline::GetVectoredError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
use crate::{l0_flush, page_cache};

pub(crate) mod vectored_dio_read;

/// Identifies an [`InMemoryLayer`]; wraps the page cache file id of the
/// layer's ephemeral file (see [`InMemoryLayer::create`]).
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
pub(crate) struct InMemoryLayerFileId(page_cache::FileId);

pub struct InMemoryLayer {
    conf: &'static PageServerConf,
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    file_id: InMemoryLayerFileId,

    /// This layer contains all the changes from 'start_lsn'. The
    /// start is inclusive.
    start_lsn: Lsn,

    /// Frozen layers have an exclusive end LSN.
    /// Writes are only allowed when this is `None`.
    pub(crate) end_lsn: OnceLock,

    /// Used for traversal path. Cached representation of the in-memory layer after frozen.
    frozen_local_path_str: OnceLock>,

    /// Set to `Instant::now()` when the layer is created.
    opened_at: Instant,

    /// All versions of all pages in the layer are kept here. Indexed
    /// by block number and LSN. The [`IndexEntry`] is an offset into the
    /// ephemeral file where the page version is stored.
    ///
    /// We use a separate lock for the index to reduce the critical section
    /// during which reads cannot be planned.
    ///
    /// Note that the file backing [`InMemoryLayer::file`] is append-only,
    /// so it is not necessary to hold a lock on the index while reading or writing from the file.
    /// In particular:
    /// 1. It is safe to read and release [`InMemoryLayer::index`] before reading from [`InMemoryLayer::file`].
    /// 2. It is safe to write to [`InMemoryLayer::file`] before locking and updating [`InMemoryLayer::index`].
    index: RwLock>>,

    /// Wrapper for the actual on-disk file. Uses interior mutability for concurrent reads/writes.
    file: EphemeralFile,

    /// Running estimate of the index's memory footprint; bumped by
    /// [`InMemoryLayer::put_batch`] for every value it indexes.
    estimated_in_mem_size: AtomicU64,
}

// Only the LSN bounds are printed; the remaining fields are omitted.
impl std::fmt::Debug for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("InMemoryLayer")
            .field("start_lsn", &self.start_lsn)
            .field("end_lsn", &self.end_lsn)
            .finish()
    }
}

/// Support the same max blob length as blob_io, because ultimately
/// all the InMemoryLayer contents end up being written into a delta layer,
/// using the [`crate::tenant::blob_io`].
const MAX_SUPPORTED_BLOB_LEN: usize = crate::tenant::blob_io::MAX_SUPPORTED_BLOB_LEN;
/// Number of bits needed to store a length up to [`MAX_SUPPORTED_BLOB_LEN`].
///
/// The assertion requires the constant to consist of leading zero bits
/// followed only by one bits, so that the count of trailing ones is exactly
/// the required bit width.
const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
    let trailing_ones = MAX_SUPPORTED_BLOB_LEN.trailing_ones() as usize;
    let leading_zeroes = MAX_SUPPORTED_BLOB_LEN.leading_zeros() as usize;
    assert!(trailing_ones + leading_zeroes == std::mem::size_of::() * 8);
    trailing_ones
};

/// See [`InMemoryLayer::index`].
///
/// For memory efficiency, the data is packed into a u64.
///
/// Layout:
/// - 1 bit: `will_init`
/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len`
/// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos`
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct IndexEntry(u64);

impl IndexEntry {
    /// See [`Self::MAX_SUPPORTED_POS`].
    // Bits left over for `pos` once `will_init` and `len` are accounted for;
    // const evaluation panics if fewer than 32 remain.
    const MAX_SUPPORTED_POS_BITS: usize = {
        let remainder = 64 - 1 - MAX_SUPPORTED_BLOB_LEN_BITS;
        if remainder < 32 {
            panic!("pos can be u32 as per type system, support that");
        }
        remainder
    };
    /// The maximum supported blob offset that can be represented by [`Self`].
    /// See also [`Self::validate_checkpoint_distance`].
    const MAX_SUPPORTED_POS: usize = (1 << Self::MAX_SUPPORTED_POS_BITS) - 1;

    // Layout
    // Bit ranges inside the packed u64, lowest bit first: will_init, len, pos.
    const WILL_INIT_RANGE: Range = 0..1;
    const LEN_RANGE: Range =
        Self::WILL_INIT_RANGE.end..Self::WILL_INIT_RANGE.end + MAX_SUPPORTED_BLOB_LEN_BITS;
    const POS_RANGE: Range =
        Self::LEN_RANGE.end..Self::LEN_RANGE.end + Self::MAX_SUPPORTED_POS_BITS;

    // Layout sanity check: panics in const evaluation unless the three ranges
    // end exactly at bit 64.
    const _ASSERT: () = {
        if Self::POS_RANGE.end != 64 {
            panic!("we don't want undefined bits for our own sanity")
        }
    };

    /// Fails if and only if the offset or length encoded in `arg` is too large to be represented by [`Self`].
    ///
    /// The only reason why that can happen in the system is if the [`InMemoryLayer`] grows too long.
    /// The [`InMemoryLayer`] size is determined by the checkpoint distance, enforced by [`crate::tenant::Timeline::should_roll`].
    ///
    /// Thus, to avoid failure of this function, whenever we start up and/or change checkpoint distance,
    /// call [`Self::validate_checkpoint_distance`] with the new checkpoint distance value.
    ///
    /// TODO: this check should happen ideally at config parsing time (and in the request handler when a change to checkpoint distance is requested)
    /// When cleaning this up, also look into the s3 max file size check that is performed in delta layer writer.
#[inline(always)]
    fn new(arg: IndexEntryNewArgs) -> anyhow::Result {
        let IndexEntryNewArgs {
            base_offset,
            batch_offset,
            len,
            will_init,
        } = arg;

        // Absolute file position = start of the batch + offset within the batch.
        let pos = base_offset
            .checked_add(batch_offset)
            .ok_or_else(|| anyhow::anyhow!("base_offset + batch_offset overflows u64: base_offset={base_offset} batch_offset={batch_offset}"))?;

        if pos.into_usize() > Self::MAX_SUPPORTED_POS {
            anyhow::bail!(
                "base_offset+batch_offset exceeds the maximum supported value: base_offset={base_offset} batch_offset={batch_offset} (+)={pos} max={max}",
                max = Self::MAX_SUPPORTED_POS
            );
        }

        if len > MAX_SUPPORTED_BLOB_LEN {
            anyhow::bail!(
                "len exceeds the maximum supported length: len={len} max={MAX_SUPPORTED_BLOB_LEN}",
            );
        }

        // Both values were range-checked above, so they fit their bit ranges.
        let mut data: u64 = 0;
        use bit_field::BitField;
        data.set_bits(Self::WILL_INIT_RANGE, if will_init { 1 } else { 0 });
        data.set_bits(Self::LEN_RANGE, len.into_u64());
        data.set_bits(Self::POS_RANGE, pos);

        Ok(Self(data))
    }

    /// Decodes the packed u64 back into its `will_init`, `len` and `pos` fields.
    #[inline(always)]
    fn unpack(&self) -> IndexEntryUnpacked {
        use bit_field::BitField;
        IndexEntryUnpacked {
            will_init: self.0.get_bits(Self::WILL_INIT_RANGE) != 0,
            len: self.0.get_bits(Self::LEN_RANGE),
            pos: self.0.get_bits(Self::POS_RANGE),
        }
    }

    /// See [`Self::new`].
    ///
    /// Returns an error message if `checkpoint_distance` exceeds
    /// [`Self::MAX_SUPPORTED_POS`], or if adding [`MAX_SUPPORTED_BLOB_LEN`]
    /// to it overflows.
    pub(crate) const fn validate_checkpoint_distance(
        checkpoint_distance: u64,
    ) -> Result<(), &'static str> {
        if checkpoint_distance > Self::MAX_SUPPORTED_POS as u64 {
            return Err("exceeds the maximum supported value");
        }
        let res = u64_to_usize(checkpoint_distance).checked_add(MAX_SUPPORTED_BLOB_LEN);
        if res.is_none() {
            return Err(
                "checkpoint distance + max supported blob len overflows in-memory addition",
            );
        }

        // NB: it is ok for the result of the addition to be larger than MAX_SUPPORTED_POS

        Ok(())
    }

    // Const-evaluated check that the default checkpoint distance passes
    // `validate_checkpoint_distance`.
    const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = {
        let res = Self::validate_checkpoint_distance(
            pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE,
        );
        if res.is_err() {
            panic!("default checkpoint distance is valid")
        }
    };
}

/// Args to [`IndexEntry::new`].
#[derive(Clone, Copy)]
struct IndexEntryNewArgs {
    // Summed with `batch_offset` to form the stored position.
    base_offset: u64,
    batch_offset: u64,
    // Length of the value in bytes; must not exceed MAX_SUPPORTED_BLOB_LEN.
    len: usize,
    will_init: bool,
}

/// Unpacked representation of the bitfielded [`IndexEntry`].
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct IndexEntryUnpacked {
    will_init: bool,
    len: u64,
    pos: u64,
}

/// State shared by all in-memory (ephemeral) layers.  Updated infrequently during background ticks in Timeline,
/// to minimize contention.
///
/// This global state is used to implement behaviors that require a global view of the system, e.g.
/// rolling layers proactively to limit the total amount of dirty data.
pub(crate) struct GlobalResources {
    // Limit on how high dirty_bytes may grow before we start freezing layers to reduce it.
    // Zero means unlimited.
    pub(crate) max_dirty_bytes: AtomicU64,
    // How many bytes are in all EphemeralFile objects
    dirty_bytes: AtomicU64,
    // How many layers are contributing to dirty_bytes
    dirty_layers: AtomicUsize,
}

// Per-timeline RAII struct for its contribution to [`GlobalResources`]
pub(crate) struct GlobalResourceUnits {
    // How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible
    // for decrementing the global counter by this many bytes when dropped.
    dirty_bytes: u64,
}

impl GlobalResourceUnits {
    // Hint for the layer append path to update us when the layer size differs from the last
    // call to update_size by this much.  If we don't reach this threshold, we'll still get
    // updated when the Timeline "ticks" in the background.
    const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;

    /// Registers one more contributor in the global `dirty_layers` count and
    /// starts with zero published bytes.
    pub(crate) fn new() -> Self {
        GLOBAL_RESOURCES
            .dirty_layers
            .fetch_add(1, AtomicOrdering::Relaxed);
        Self { dirty_bytes: 0 }
    }

    /// Do not call this frequently: all timelines will write to these same global atomics,
    /// so this is a relatively expensive operation.  Wait at least a few seconds between calls.
///
    /// Returns the effective layer size limit that should be applied, if any, to keep
    /// the total number of dirty bytes below the configured maximum.
    ///
    /// The limit is the global dirty byte count divided by the number of
    /// registered layers. The divisor is clamped to at least 1: `Drop`
    /// decrements the layer count before calling this function, so the count
    /// may read as zero here, and dividing by it would panic inside `drop`.
    pub(crate) fn publish_size(&mut self, size: u64) -> Option {
        // Apply only the difference to our previously published size.
        let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
            Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
            Ordering::Greater => {
                let delta = size - self.dirty_bytes;
                let old = GLOBAL_RESOURCES
                    .dirty_bytes
                    .fetch_add(delta, AtomicOrdering::Relaxed);
                old + delta
            }
            Ordering::Less => {
                let delta = self.dirty_bytes - size;
                let old = GLOBAL_RESOURCES
                    .dirty_bytes
                    .fetch_sub(delta, AtomicOrdering::Relaxed);
                old - delta
            }
        };

        // This is a sloppy update: concurrent updates to the counter will race, and the exact
        // value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes.
        // That's okay: as long as the metric contains some recent value, it doesn't have to always
        // be literally the last update.
        TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);

        self.dirty_bytes = size;

        let max_dirty_bytes = GLOBAL_RESOURCES
            .max_dirty_bytes
            .load(AtomicOrdering::Relaxed);
        if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes {
            // Set the layer file limit to the average layer size: this implies that all above-average
            // sized layers will be eligible for freezing.  They will be frozen in the order they
            // next enter publish_size.
            //
            // Clamp the divisor so that a layer count of zero cannot cause a
            // division-by-zero panic.
            let dirty_layers =
                (GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64).max(1);
            Some(new_global_dirty_bytes / dirty_layers)
        } else {
            None
        }
    }

    // Call publish_size if the input size differs from last published size by more than
    // the drift limit
    pub(crate) fn maybe_publish_size(&mut self, size: u64) {
        let publish = match size.cmp(&self.dirty_bytes) {
            Ordering::Equal => false,
            Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
            Ordering::Less => self.dirty_bytes - size > Self::MAX_SIZE_DRIFT,
        };

        if publish {
            self.publish_size(size);
        }
    }
}

impl Drop for GlobalResourceUnits {
    fn drop(&mut self) {
        GLOBAL_RESOURCES
            .dirty_layers
            .fetch_sub(1, AtomicOrdering::Relaxed);

        // Subtract our contribution to the global total dirty bytes
        self.publish_size(0);
    }
}

// All counters start at zero; a zero `max_dirty_bytes` means "unlimited".
pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
    max_dirty_bytes: AtomicU64::new(0),
    dirty_bytes: AtomicU64::new(0),
    dirty_layers: AtomicUsize::new(0),
};

impl InMemoryLayer {
    pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
        self.file_id
    }

    pub(crate) fn get_timeline_id(&self) -> TimelineId {
        self.timeline_id
    }

    /// Reports the layer as `Frozen` once `end_lsn` has been set, `Open` otherwise.
    pub(crate) fn info(&self) -> InMemoryLayerInfo {
        let lsn_start = self.start_lsn;

        if let Some(&lsn_end) = self.end_lsn.get() {
            InMemoryLayerInfo::Frozen { lsn_start, lsn_end }
        } else {
            InMemoryLayerInfo::Open { lsn_start }
        }
    }

    /// Current length of the backing ephemeral file.
    pub(crate) fn len(&self) -> u64 {
        self.file.len()
    }

    /// Panics if the layer has already been frozen.
    pub(crate) fn assert_writable(&self) {
        assert!(self.end_lsn.get().is_none());
    }

    /// The end LSN if the layer is frozen, `Lsn::MAX` otherwise.
    pub(crate) fn end_lsn_or_max(&self) -> Lsn {
        self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
    }

    pub(crate) fn get_lsn_range(&self) -> Range {
        self.start_lsn..self.end_lsn_or_max()
    }

    /// debugging function to print out the contents of the layer
    ///
    /// this is likely completely unused
    pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
        let end_str = self.end_lsn_or_max();

        println!(
            "----- in-memory layer for tli {} LSNs {}-{} ----",
            self.timeline_id, self.start_lsn, end_str,
        );

        Ok(())
    }

    // Look up the keys in the provided keyspace and update
    // the reconstruct state with whatever is found.
    //
    // Planning (under the index read lock) and IO (after the lock is released)
    // are separate phases; the IO runs in a future handed to
    // `reconstruct_state.spawn_io`.
    pub async fn get_values_reconstruct_data(
        self: &Arc,
        keyspace: KeySpace,
        lsn_range: Range,
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
        let ctx = RequestContextBuilder::from(ctx)
            .page_content_kind(PageContentKind::InMemoryLayer)
            .attached_child();

        let index = self.index.read().await;

        // One planned read of a single value version.
        struct ValueRead {
            entry_lsn: Lsn,
            read: vectored_dio_read::LogicalRead>,
        }
        let mut reads: HashMap> = HashMap::new();
        let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();

        for range in keyspace.ranges.iter() {
            for (key, vec_map) in index.range(range.start.to_compact()..range.end.to_compact()) {
                let key = Key::from_compact(*key);
                let slice = vec_map.slice_range(lsn_range.clone());

                // Walk the versions in reverse slice order and stop after the
                // first entry flagged `will_init`.
                for (entry_lsn, index_entry) in slice.iter().rev() {
                    let IndexEntryUnpacked {
                        pos,
                        len,
                        will_init,
                    } = index_entry.unpack();

                    reads.entry(key).or_default().push(ValueRead {
                        entry_lsn: *entry_lsn,
                        read: vectored_dio_read::LogicalRead::new(
                            pos,
                            Vec::with_capacity(len as usize),
                        ),
                    });

                    let io = reconstruct_state.update_key(&key, *entry_lsn, will_init);
                    ios.insert((key, *entry_lsn), io);

                    if will_init {
                        break;
                    }
                }
            }
        }
        drop(index); // release the lock before we spawn the IO

        let read_from = Arc::clone(self);
        let read_ctx = ctx.attached_child();
        reconstruct_state
            .spawn_io(async move {
                let f = vectored_dio_read::execute(
                    &read_from.file,
                    reads
                        .iter()
                        .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
                    &read_ctx,
                );
                send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
                    .await;

                // Hand each read's outcome to the IO slot registered for it.
                for (key, value_reads) in reads {
                    for ValueRead { entry_lsn, read } in value_reads {
                        let io = ios.remove(&(key, entry_lsn)).expect("sender must exist");
                        match read.into_result().expect("we run execute() above") {
                            Err(e) => {
                                io.complete(Err(std::io::Error::new(
                                    e.kind(),
                                    "dio vec read failed",
                                )));
                            }
                            Ok(value_buf) => {
                                io.complete(Ok(OnDiskValue::WalRecordOrImage(value_buf.into())));
                            }
                        }
                    }
                }

                assert!(ios.is_empty());

                // Keep layer existent until this IO is done;
                // This is kinda forced for InMemoryLayer because we need to inner.read() anyway,
                // but it's less obvious for DeltaLayer and ImageLayer. So, keep this explicit
                // drop for consistency among all three layer types.
                drop(read_from);
            })
            .await;

        Ok(())
    }
}

/// Writes `inmem-<start>-<end>` with both LSNs as 16-digit uppercase hex.
fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
    write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
}

/// Like [`inmem_layer_display`], prefixed with the timeline id.
fn inmem_layer_log_display(
    mut f: impl Write,
    timeline: TimelineId,
    start_lsn: Lsn,
    end_lsn: Lsn,
) -> std::fmt::Result {
    write!(f, "timeline {timeline} in-memory ")?;
    inmem_layer_display(f, start_lsn, end_lsn)
}

impl std::fmt::Display for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let end_lsn = self.end_lsn_or_max();
        inmem_layer_display(f, self.start_lsn, end_lsn)
    }
}

impl InMemoryLayer {
    pub fn estimated_in_mem_size(&self) -> u64 {
        self.estimated_in_mem_size.load(AtomicOrdering::Relaxed)
    }

    /// Create a new, empty, in-memory layer
    pub async fn create(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        start_lsn: Lsn,
        gate: &utils::sync::gate::Gate,
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> Result {
        trace!(
            "initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"
        );

        let file =
            EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, cancel, ctx).await?;
        // The layer's id is derived from the ephemeral file's page cache id.
        let key = InMemoryLayerFileId(file.page_cache_file_id());

        Ok(InMemoryLayer {
            file_id: key,
            frozen_local_path_str: OnceLock::new(),
            conf,
            timeline_id,
            tenant_shard_id,
            start_lsn,
            end_lsn: OnceLock::new(),
            opened_at: Instant::now(),
            index: RwLock::new(BTreeMap::new()),
            file,
            estimated_in_mem_size: AtomicU64::new(0),
        })
    }

    /// Write path.
    ///
    /// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from.
/// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable.
    ///
    /// This method shall not be called concurrently. We enforce this property via [`crate::tenant::Timeline::write_lock`].
    ///
    /// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors.
    pub async fn put_batch(
        &self,
        serialized_batch: SerializedValueBatch,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        self.assert_writable();

        // The batch is appended at the current end of the file.
        let base_offset = self.file.len();

        let SerializedValueBatch {
            raw,
            metadata,
            max_lsn: _,
            len: _,
        } = serialized_batch;

        // Write the batch to the file
        self.file.write_raw(&raw, ctx).await?;
        let new_size = self.file.len();

        let expected_new_len = base_offset
            .checked_add(raw.len().into_u64())
            // write_raw would error if we were to overflow u64.
            // also IndexEntry and higher levels in
            // the code don't allow the file to grow that large
            .unwrap();
        assert_eq!(new_size, expected_new_len);

        // Update the index with the new entries
        let mut index = self.index.write().await;
        for meta in metadata {
            let SerializedValueMeta {
                key,
                lsn,
                batch_offset,
                len,
                will_init,
            } = match meta {
                ValueMeta::Serialized(ser) => ser,
                // `Observed` entries are not indexed.
                ValueMeta::Observed(_) => {
                    continue;
                }
            };

            // Add the base_offset to the batch's index entries which are relative to the batch start.
            let index_entry = IndexEntry::new(IndexEntryNewArgs {
                base_offset,
                batch_offset,
                len,
                will_init,
            })?;

            let vec_map = index.entry(key).or_default();
            let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
            if old.is_some() {
                // This should not break anything, but is unexpected: ingestion code aims to filter out
                // multiple writes to the same key at the same LSN.  This happens in cases where our
                // ingestion code generates some write like an empty page, and we see a write from postgres
                // to the same key in the same wal record.  If one such write makes it through, we
                // index the most recent write, implicitly ignoring the earlier write.  We log a warning
                // because this case is unexpected, and we would like tests to fail if this happens.
                warn!("Key {} at {} written twice at same LSN", key, lsn);
            }
            // The estimate grows by a fixed per-entry amount, also when an
            // existing entry was overwritten above.
            self.estimated_in_mem_size.fetch_add(
                (std::mem::size_of::()
                    + std::mem::size_of::()
                    + std::mem::size_of::()) as u64,
                AtomicOrdering::Relaxed,
            );
        }

        Ok(())
    }

    pub(crate) fn get_opened_at(&self) -> Instant {
        self.opened_at
    }

    /// Forwards to the backing ephemeral file's `tick`.
    pub(crate) fn tick(&self) -> Option {
        self.file.tick()
    }

    /// Currently a no-op that always returns `Ok(())`.
    pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range, Lsn)]) -> Result<()> {
        // TODO: Currently, we just leak the storage for any deleted keys
        Ok(())
    }

    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
    ///
    /// A note on locking:
    /// The current API of [`InMemoryLayer`] does not ensure that there's no ongoing
    /// writes while freezing the layer. This is enforced at a higher level via
    /// [`crate::tenant::Timeline::write_lock`]. Freeze might be called via two code paths:
    /// 1. Via the active [`crate::tenant::timeline::TimelineWriter`]. This holds the
    ///    Timeline::write_lock for its lifetime. The rolling is handled in
    ///    [`crate::tenant::timeline::TimelineWriter::put_batch`]. It's a &mut self function
    ///    so can't be called from different threads.
    /// 2. In the background via [`crate::tenant::Timeline::maybe_freeze_ephemeral_layer`].
    ///    This only proceeds if try_lock on Timeline::write_lock succeeds (i.e. there's no active writer),
    ///    hence there can be no concurrent writes
    ///
    /// Panics if `end_lsn` is not greater than `start_lsn`, or if the layer
    /// has already been frozen.
    pub async fn freeze(&self, end_lsn: Lsn) {
        assert!(
            self.start_lsn < end_lsn,
            "{} >= {}",
            self.start_lsn,
            end_lsn
        );
        self.end_lsn.set(end_lsn).expect("end_lsn set only once");

        self.frozen_local_path_str
            .set({
                let mut buf = String::new();
                inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn)
                    .unwrap();
                buf.into()
            })
            .expect("frozen_local_path_str set only once");

        // Debug builds only: every indexed LSN must lie below the exclusive end.
        #[cfg(debug_assertions)]
        {
            let index = self.index.read().await;
            for vec_map in index.values() {
                for (lsn, _) in vec_map.as_slice() {
                    assert!(*lsn < end_lsn);
                }
            }
        }
    }

    /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
    /// layer will only contain the key range the user specifies, and may return `None`
    /// if there are no matching keys.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
    // NOTE(review): in the code below `key_range` only influences `key_count`
    // (and thus the early `Ok(None)` return); the write loop iterates the
    // whole index without filtering. Confirm whether that matches the
    // documented contract above.
    pub async fn write_to_disk(
        &self,
        ctx: &RequestContext,
        key_range: Option>,
        l0_flush_global_state: &l0_flush::Inner,
        gate: &utils::sync::gate::Gate,
        cancel: CancellationToken,
    ) -> Result> {
        let index = self.index.read().await;

        use l0_flush::Inner;
        let _concurrency_permit = match l0_flush_global_state {
            Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
        };

        // Panics if the layer has not been frozen yet.
        let end_lsn = *self.end_lsn.get().unwrap();

        let key_count = if let Some(key_range) = key_range {
            let key_range = key_range.start.to_compact()..key_range.end.to_compact();

            index.iter().filter(|(k, _)| key_range.contains(k)).count()
        } else {
            index.len()
        };
        if key_count == 0 {
            return Ok(None);
        }

        let mut delta_layer_writer = DeltaLayerWriter::new(
            self.conf,
            self.timeline_id,
            self.tenant_shard_id,
            Key::MIN,
            self.start_lsn..end_lsn,
            gate,
            cancel,
            ctx,
        )
        .await?;

        match l0_flush_global_state {
            l0_flush::Inner::Direct { .. } => {
                // Load the ephemeral file into one buffer and slice each value out of it.
                let file_contents = self.file.load_to_io_buf(ctx).await?;
                let file_contents = file_contents.freeze();

                for (key, vec_map) in index.iter() {
                    // Write all page versions
                    for (lsn, entry) in vec_map
                        .as_slice()
                        .iter()
                        .map(|(lsn, entry)| (lsn, entry.unpack()))
                    {
                        let IndexEntryUnpacked {
                            pos,
                            len,
                            will_init,
                        } = entry;
                        let buf = file_contents.slice(pos as usize..(pos + len) as usize);
                        let (_buf, res) = delta_layer_writer
                            .put_value_bytes(
                                Key::from_compact(*key),
                                *lsn,
                                buf.slice_len(),
                                will_init,
                                ctx,
                            )
                            .await;
                        res?;
                    }
                }
            }
        }

        // MAX is used here because we identify L0 layers by full key range
        let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;

        // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()`.
        //
        // If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of
        // the `file_contents: Vec` until the IO is done, but not the permit's lifetime.
        // Thus, we'd have more concurrent `Vec` in existence than the semaphore allows.
        //
        // We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
        // we dirtied when writing to the filesystem have been flushed and marked !dirty.
        drop(_concurrency_permit);

        Ok(Some((desc, path)))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Round-trips [`IndexEntry`] through `new`/`unpack` at the boundaries of
    /// `pos` and `len`, and checks that out-of-range inputs are rejected.
    #[test]
    fn test_index_entry() {
        const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS;
        use {IndexEntryNewArgs as Args, IndexEntryUnpacked as Unpacked};

        // Packs `args` and asserts that unpacking yields `expect`.
        let roundtrip = |args, expect: Unpacked| {
            let res = IndexEntry::new(args).expect("this tests expects no errors");
            let IndexEntryUnpacked {
                will_init,
                len,
                pos,
            } = res.unpack();
            assert_eq!(will_init, expect.will_init);
            assert_eq!(len, expect.len);
            assert_eq!(pos, expect.pos);
        };

        // basic roundtrip
        for pos in [0, MAX_SUPPORTED_POS] {
            for len in [0, MAX_SUPPORTED_BLOB_LEN] {
                for will_init in [true, false] {
                    let expect = Unpacked {
                        will_init,
                        len: len.into_u64(),
                        pos: pos.into_u64(),
                    };
                    roundtrip(
                        Args {
                            will_init,
                            base_offset: pos.into_u64(),
                            batch_offset: 0,
                            len,
                        },
                        expect,
                    );
                    roundtrip(
                        Args {
                            will_init,
                            base_offset: 0,
                            batch_offset: pos.into_u64(),
                            len,
                        },
                        expect,
                    );
                }
            }
        }

        // too-large len
        let too_large = Args {
            will_init: false,
            len: MAX_SUPPORTED_BLOB_LEN + 1,
            base_offset: 0,
            batch_offset: 0,
        };
        assert!(IndexEntry::new(too_large).is_err());

        // too-large pos
        {
            let too_large = Args {
                will_init: false,
                len: 0,
                base_offset: MAX_SUPPORTED_POS.into_u64() + 1,
                batch_offset: 0,
            };
            assert!(IndexEntry::new(too_large).is_err());
            let too_large = Args {
                will_init: false,
                len: 0,
                base_offset: 0,
                batch_offset: MAX_SUPPORTED_POS.into_u64() + 1,
            };
            assert!(IndexEntry::new(too_large).is_err());
        }

        // too large (base_offset + batch_offset)
        {
            let too_large = Args {
                will_init: false,
                len: 0,
                base_offset: MAX_SUPPORTED_POS.into_u64(),
                batch_offset: 1,
            };
            assert!(IndexEntry::new(too_large).is_err());
            let too_large = Args {
                will_init: false,
                len: 0,
                base_offset: MAX_SUPPORTED_POS.into_u64() - 1,
                batch_offset: MAX_SUPPORTED_POS.into_u64() - 1,
            };
            assert!(IndexEntry::new(too_large).is_err());
        }

        // valid special cases
        // - area past the max supported pos that is accessible by len
        for len in [1, MAX_SUPPORTED_BLOB_LEN] {
            roundtrip(
                Args {
will_init: false,
                    len,
                    base_offset: MAX_SUPPORTED_POS.into_u64(),
                    batch_offset: 0,
                },
                Unpacked {
                    will_init: false,
                    len: len as u64,
                    pos: MAX_SUPPORTED_POS.into_u64(),
                },
            );
            roundtrip(
                Args {
                    will_init: false,
                    len,
                    base_offset: 0,
                    batch_offset: MAX_SUPPORTED_POS.into_u64(),
                },
                Unpacked {
                    will_init: false,
                    len: len as u64,
                    pos: MAX_SUPPORTED_POS.into_u64(),
                },
            );
        }
    }
}

================================================
FILE: pageserver/src/tenant/storage_layer/layer/failpoints.rs
================================================
//! failpoints for unit tests, implying `#[cfg(test)]`.
//!
//! These are not accessible over http.

use super::*;

impl Layer {
    /// Enable a failpoint from a unit test.
    pub(super) fn enable_failpoint(&self, failpoint: Failpoint) {
        self.0.failpoints.lock().unwrap().push(failpoint);
    }
}

impl LayerInner {
    /// Query if this failpoint is enabled, as in, arrive at a failpoint.
    ///
    /// Calls to this method need to be `#[cfg(test)]` guarded.
    pub(super) async fn failpoint(&self, kind: FailpointKind) -> Result<(), FailpointHit> {
        // Build the future inside this scope so that the mutex guard is
        // released before the future is awaited.
        let fut = {
            let mut fps = self.failpoints.lock().unwrap();
            // find the *last* failpoint for cases in which we need to use multiple for the same
            // thing (two blocked evictions)
            let fp = fps.iter_mut().rfind(|x| x.kind() == kind);

            let Some(fp) = fp else {
                return Ok(());
            };

            fp.hit()
        };

        fut.await
    }
}

#[derive(Debug, PartialEq, Eq)]
pub(crate) enum FailpointKind {
    /// Failpoint acts as an accurate cancelled by drop here; see the only site of use.
    AfterDeterminingLayerNeedsNoDownload,
    /// Failpoint for stalling eviction starting
    WaitBeforeStartingEvicting,
    /// Failpoint hit in the spawned task
    WaitBeforeDownloading,
}

/// A failpoint instance registered through [`Layer::enable_failpoint`].
///
/// For the two `Wait*` variants, the first field is taken and dropped on a
/// hit to signal arrival, and the second is the barrier the hit waits on
/// (see `Failpoint::hit`).
pub(crate) enum Failpoint {
    AfterDeterminingLayerNeedsNoDownload,
    WaitBeforeStartingEvicting(
        Option,
        utils::completion::Barrier,
    ),
    WaitBeforeDownloading(
        Option,
        utils::completion::Barrier,
    ),
}

impl Failpoint {
    /// Maps the instance to its payload-free [`FailpointKind`].
    fn kind(&self) -> FailpointKind {
        match self {
            Failpoint::AfterDeterminingLayerNeedsNoDownload => {
                FailpointKind::AfterDeterminingLayerNeedsNoDownload
            }
            Failpoint::WaitBeforeStartingEvicting(..) => FailpointKind::WaitBeforeStartingEvicting,
            Failpoint::WaitBeforeDownloading(..) => FailpointKind::WaitBeforeDownloading,
        }
    }

    /// Returns the future to await at the failpoint: an immediate
    /// `Err(FailpointHit)` for `AfterDeterminingLayerNeedsNoDownload`, or a
    /// wait on the barrier for the `Wait*` variants.
    fn hit(&mut self) -> impl std::future::Future> + 'static {
        use futures::future::FutureExt;

        // use boxed futures to avoid Either hurdles
        match self {
            Failpoint::AfterDeterminingLayerNeedsNoDownload => {
                let kind = self.kind();

                async move { Err(FailpointHit(kind)) }.boxed()
            }
            Failpoint::WaitBeforeStartingEvicting(arrival, b)
            | Failpoint::WaitBeforeDownloading(arrival, b) => {
                // first one signals arrival
                drop(arrival.take());

                let b = b.clone();

                async move {
                    tracing::trace!("waiting on a failpoint barrier");
                    b.wait().await;
                    tracing::trace!("done waiting on a failpoint barrier");
                    Ok(())
                }
                .boxed()
            }
        }
    }
}

// `Display` reuses the `Debug` representation.
impl std::fmt::Display for FailpointKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Debug::fmt(self, f)
    }
}

/// Error returned when a failpoint fires; carries the kind that was hit.
#[derive(Debug)]
pub(crate) struct FailpointHit(FailpointKind);

// `Display` reuses the `Debug` representation.
impl std::fmt::Display for FailpointHit {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Debug::fmt(self, f)
    }
}

impl std::error::Error for FailpointHit {}

impl From for DownloadError {
    fn from(value: FailpointHit) -> Self {
        DownloadError::Failpoint(value.0)
    }
}

================================================
FILE: pageserver/src/tenant/storage_layer/layer/tests.rs
================================================
use std::time::UNIX_EPOCH;

use pageserver_api::key::{CONTROLFILE_KEY, Key};
use postgres_ffi::PgMajorVersion;
use tokio::task::JoinSet;
use utils::completion::{self, Completion};
use utils::id::TimelineId;

use super::failpoints::{Failpoint, FailpointKind};
use super::*;
use crate::context::DownloadBehavior;
use crate::tenant::harness::{TenantHarness, test_img};
use crate::tenant::storage_layer::{IoConcurrency, LayerVisibilityHint};
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;

/// Used in tests to advance a future to wanted await point, and not further.
const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600);

/// Used in tests to indicate forever long timeout; has to be longer than the amount of ADVANCE
/// timeout uses to advance futures.
const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_secs() * 24 * 7);

/// Demonstrate the API and resident -> evicted -> resident -> deleted transitions.
#[tokio::test]
async fn smoke_test() {
    let handle = tokio::runtime::Handle::current();

    let h = TenantHarness::create("smoke_test").await.unwrap();
    let span = h.span();
    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
    let (tenant, ctx) = h.load().await;
    let io_concurrency = IoConcurrency::spawn_for_test();

    let image_layers = vec![(
        Lsn(0x40),
        vec![(
            Key::from_hex("620000000033333333444444445500000000").unwrap(),
            test_img("foo"),
        )],
    )];

    // Create a test timeline with one real layer, and one synthetic test layer. The synthetic
    // one is only there so that we can GC the real one without leaving the timeline's metadata
    // empty, which is an illegal state (see [`IndexPart::validate`]).
let timeline = tenant .create_test_timeline_with_layers( TimelineId::generate(), Lsn(0x10), PgMajorVersion::PG14, &ctx, Default::default(), // in-memory layers Default::default(), image_layers, Lsn(0x100), ) .await .unwrap(); let ctx = &ctx.with_scope_timeline(&timeline); // Grab one of the timeline's layers to exercise in the test, and the other layer that is just // there to avoid the timeline being illegally empty let (layer, dummy_layer) = { let mut layers = { let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await; layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 2); layers.sort_by_key(|l| l.layer_desc().get_key_range().start); let synthetic_layer = layers.pop().unwrap(); let real_layer = layers.pop().unwrap(); tracing::info!( "real_layer={:?} ({}), synthetic_layer={:?} ({})", real_layer, real_layer.layer_desc().file_size, synthetic_layer, synthetic_layer.layer_desc().file_size ); (real_layer, synthetic_layer) }; // all layers created at pageserver are like `layer`, initialized with strong // Arc. let controlfile_keyspace = KeySpace { ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()], }; let img_before = { let mut data = ValuesReconstructState::new(io_concurrency.clone()); layer .get_values_reconstruct_data( controlfile_keyspace.clone(), Lsn(0x10)..Lsn(0x11), &mut data, ctx, ) .await .unwrap(); data.keys .remove(&CONTROLFILE_KEY) .expect("must be present") .collect_pending_ios() .await .expect("must not error") .img .take() .expect("tenant harness writes the control file") }; // important part is evicting the layer, which can be done when there are no more ResidentLayer // instances -- there currently are none, only two `Layer` values, one in the layermap and on // in scope. layer.evict_and_wait(FOREVER).await.unwrap(); // double-evict returns an error, which is valid if both eviction_task and disk usage based // eviction would both evict the same layer at the same time. 
let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); assert!(matches!(e, EvictionError::NotFound)); let dl_ctx = RequestContextBuilder::from(ctx) .download_behavior(DownloadBehavior::Download) .attached_child(); // on accesses when the layer is evicted, it will automatically be downloaded. let img_after = { let mut data = ValuesReconstructState::new(io_concurrency.clone()); layer .get_values_reconstruct_data( controlfile_keyspace.clone(), Lsn(0x10)..Lsn(0x11), &mut data, &dl_ctx, ) .instrument(download_span.clone()) .await .unwrap(); data.keys .remove(&CONTROLFILE_KEY) .expect("must be present") .collect_pending_ios() .await .expect("must not error") .img .take() .expect("tenant harness writes the control file") }; assert_eq!(img_before, img_after); // evict_and_wait can timeout, but it doesn't cancel the evicting itself // // ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to // artificially slow it down. let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(&handle).await; match layer .evict_and_wait(std::time::Duration::ZERO) .await .unwrap_err() { EvictionError::Timeout => { // expected, but note that the eviction is "still ongoing" helper.release().await; // exhaust spawn_blocking pool to ensure it is now complete SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle) .await; } other => unreachable!("{other:?}"), } // only way to query if a layer is resident is to acquire a ResidentLayer instance. // Layer::keep_resident never downloads, but it might initialize if the layer file is found // downloaded locally. 
let none = layer.keep_resident().await; assert!( none.is_none(), "Expected none, because eviction removed the local file, found: {none:?}" ); // plain downloading is rarely needed layer .download_and_keep_resident(&dl_ctx) .instrument(download_span) .await .unwrap(); // last important part is deletion on drop: gc and compaction use it for compacted L0 layers // or fully garbage collected layers. deletion means deleting the local file, and scheduling a // deletion of the already unlinked from index_part.json remote file. // // marking a layer to be deleted on drop is irreversible; there is no technical reason against // reversiblity, but currently it is not needed so it is not provided. layer.delete_on_drop(); let path = layer.local_path().to_owned(); // wait_drop produces an unconnected to Layer future which will resolve when the // LayerInner::drop has completed. let mut wait_drop = std::pin::pin!(layer.wait_drop()); // paused time doesn't really work well with timeouts and evict_and_wait, so delay pausing // until here tokio::time::pause(); tokio::time::timeout(ADVANCE, &mut wait_drop) .await .expect_err("should had timed out because two strong references exist"); tokio::fs::metadata(&path) .await .expect("the local layer file still exists"); let rtc = &timeline.remote_client; // Simulate GC removing our test layer. 
{ let mut g = timeline.layers.write(LayerManagerLockHolder::Testing).await; let layers = &[layer]; g.open_mut().unwrap().finish_gc_timeline(layers); // this just updates the remote_physical_size for demonstration purposes rtc.schedule_gc_update(layers).unwrap(); } // when strong references are dropped, the file is deleted and remote deletion is scheduled wait_drop.await; let e = tokio::fs::metadata(&path) .await .expect_err("the local file is deleted"); assert_eq!(e.kind(), std::io::ErrorKind::NotFound); rtc.wait_completion().await.unwrap(); assert_eq!( rtc.get_remote_physical_size(), dummy_layer.metadata().file_size ); assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) } /// This test demonstrates a previous hang when a eviction and deletion were requested at the same /// time. Now both of them complete per Arc drop semantics. #[tokio::test(start_paused = true)] async fn evict_and_wait_on_wanted_deleted() { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); let h = TenantHarness::create("evict_and_wait_on_wanted_deleted") .await .unwrap(); utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); let (tenant, ctx) = h.load().await; let timeline = tenant .create_test_timeline( TimelineId::generate(), Lsn(0x10), PgMajorVersion::PG14, &ctx, ) .await .unwrap(); let layer = { let mut layers = { let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await; layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); layers.swap_remove(0) }; // setup done let resident = layer.keep_resident().await.unwrap(); { let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); // drive the future to await on the status channel tokio::time::timeout(ADVANCE, &mut evict_and_wait) .await .expect_err("should had been a timeout since we are holding the layer resident"); layer.delete_on_drop(); drop(resident); // make sure the eviction task gets to run 
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; let resident = layer.keep_resident().await; assert!( resident.is_none(), "keep_resident should not have re-initialized: {resident:?}" ); evict_and_wait .await .expect("evict_and_wait should had succeeded"); // works as intended } // assert that once we remove the `layer` from the layer map and drop our reference, // the deletion of the layer in remote_storage happens. { let mut layers = timeline.layers.write(LayerManagerLockHolder::Testing).await; layers.open_mut().unwrap().finish_gc_timeline(&[layer]); } SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get()); assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get()); assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) } /// This test ensures we are able to read the layer while the layer eviction has been /// started but not completed. 
#[test] fn read_wins_pending_eviction() { let rt = tokio::runtime::Builder::new_current_thread() .max_blocking_threads(1) .enable_all() .start_paused(true) .build() .unwrap(); rt.block_on(async move { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); let h = TenantHarness::create("read_wins_pending_eviction") .await .unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let timeline = tenant .create_test_timeline( TimelineId::generate(), Lsn(0x10), PgMajorVersion::PG14, &ctx, ) .await .unwrap(); let ctx = ctx.with_scope_timeline(&timeline); let layer = { let mut layers = { let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await; layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); layers.swap_remove(0) }; // setup done let resident = layer.keep_resident().await.unwrap(); let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); // drive the future to await on the status channel tokio::time::timeout(ADVANCE, &mut evict_and_wait) .await .expect_err("should had been a timeout since we are holding the layer resident"); assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); let (completion, barrier) = utils::completion::channel(); let (arrival, arrived_at_barrier) = utils::completion::channel(); layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( Some(arrival), barrier, )); // now the eviction cannot proceed because the threads are consumed while completion exists drop(resident); arrived_at_barrier.wait().await; assert!(!layer.is_likely_resident()); // because no actual eviction happened, we get to just reinitialize the DownloadedLayer layer .0 .get_or_maybe_download(false, &ctx) .instrument(download_span) .await .expect("should had reinitialized without downloading"); assert!(layer.is_likely_resident()); // reinitialization 
notifies of new resident status, which should error out all evict_and_wait let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) .await .expect("no timeout, because get_or_maybe_download re-initialized") .expect_err("eviction should not have succeeded because re-initialized"); // works as intended: evictions lose to "downloads" assert!(matches!(e, EvictionError::Downloaded), "{e:?}"); assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); // this is not wrong: the eviction is technically still "on the way" as it's still queued // because of a failpoint assert_eq!( 0, LAYER_IMPL_METRICS .cancelled_evictions .values() .map(|ctr| ctr.get()) .sum::() ); drop(completion); tokio::time::sleep(ADVANCE).await; SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1) .await; assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); // now we finally can observe the original eviction failing // it would had been possible to observe it earlier, but here it is guaranteed to have // happened. assert_eq!( 1, LAYER_IMPL_METRICS .cancelled_evictions .values() .map(|ctr| ctr.get()) .sum::() ); assert_eq!( 1, LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::AlreadyReinitialized].get() ); assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) }); } /// Use failpoint to delay an eviction starting to get a VersionCheckFailed. #[test] fn multiple_pending_evictions_in_order() { let name = "multiple_pending_evictions_in_order"; let in_order = true; multiple_pending_evictions_scenario(name, in_order); } /// Use failpoint to reorder later eviction before first to get a UnexpectedEvictedState. 
#[test] fn multiple_pending_evictions_out_of_order() { let name = "multiple_pending_evictions_out_of_order"; let in_order = false; multiple_pending_evictions_scenario(name, in_order); } fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { let rt = tokio::runtime::Builder::new_current_thread() .max_blocking_threads(1) .enable_all() .start_paused(true) .build() .unwrap(); rt.block_on(async move { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); let h = TenantHarness::create(name).await.unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let timeline = tenant .create_test_timeline( TimelineId::generate(), Lsn(0x10), PgMajorVersion::PG14, &ctx, ) .await .unwrap(); let ctx = ctx.with_scope_timeline(&timeline); let layer = { let mut layers = { let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await; layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); layers.swap_remove(0) }; // setup done let resident = layer.keep_resident().await.unwrap(); let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); // drive the future to await on the status channel tokio::time::timeout(ADVANCE, &mut evict_and_wait) .await .expect_err("should had been a timeout since we are holding the layer resident"); assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); let (completion1, barrier) = utils::completion::channel(); let mut completion1 = Some(completion1); let (arrival, arrived_at_barrier) = utils::completion::channel(); layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( Some(arrival), barrier, )); // now the eviction cannot proceed because we are simulating arbitrary long delay for the // eviction task start. 
drop(resident); assert!(!layer.is_likely_resident()); arrived_at_barrier.wait().await; // because no actual eviction happened, we get to just reinitialize the DownloadedLayer layer .0 .get_or_maybe_download(false, &ctx) .instrument(download_span) .await .expect("should had reinitialized without downloading"); assert!(layer.is_likely_resident()); // reinitialization notifies of new resident status, which should error out all evict_and_wait let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) .await .expect("no timeout, because get_or_maybe_download re-initialized") .expect_err("eviction should not have succeeded because re-initialized"); // works as intended: evictions lose to "downloads" assert!(matches!(e, EvictionError::Downloaded), "{e:?}"); assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); // this is not wrong: the eviction is technically still "on the way" as it's still queued // because of a failpoint assert_eq!( 0, LAYER_IMPL_METRICS .cancelled_evictions .values() .map(|ctr| ctr.get()) .sum::() ); assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); // configure another failpoint for the second eviction -- evictions are per initialization, // so now that we've reinitialized the inner, we get to run two of them at the same time. 
let (completion2, barrier) = utils::completion::channel(); let (arrival, arrived_at_barrier) = utils::completion::channel(); layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( Some(arrival), barrier, )); let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER)); // advance to the wait on the queue tokio::time::timeout(ADVANCE, &mut second_eviction) .await .expect_err("timeout because failpoint is blocking"); arrived_at_barrier.wait().await; assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get()); let mut release_earlier_eviction = |expected_reason| { assert_eq!( 0, LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(), ); drop(completion1.take().unwrap()); let handle = &handle; async move { tokio::time::sleep(ADVANCE).await; SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0( handle, 1, ) .await; assert_eq!( 1, LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(), ); } }; if in_order { release_earlier_eviction(EvictionCancelled::VersionCheckFailed).await; } // release the later eviction which is for the current version drop(completion2); tokio::time::sleep(ADVANCE).await; SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1) .await; if !in_order { release_earlier_eviction(EvictionCancelled::UnexpectedEvictedState).await; } tokio::time::timeout(ADVANCE, &mut second_eviction) .await .expect("eviction goes through now that spawn_blocking is unclogged") .expect("eviction should succeed, because version matches"); assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); // ensure the cancelled are unchanged assert_eq!( 1, LAYER_IMPL_METRICS .cancelled_evictions .values() .map(|ctr| ctr.get()) .sum::() ); assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) }); } /// The test ensures with a failpoint that a pending eviction is not cancelled by what is currently /// a `Layer::keep_resident` call. 
/// /// This matters because cancelling the eviction would leave us in a state where the file is on /// disk but the layer internal state says it has not been initialized. Futhermore, it allows us to /// have non-repairing `Layer::is_likely_resident`. #[tokio::test(start_paused = true)] async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { let handle = tokio::runtime::Handle::current(); let h = TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction") .await .unwrap(); let (tenant, ctx) = h.load().await; let timeline = tenant .create_test_timeline( TimelineId::generate(), Lsn(0x10), PgMajorVersion::PG14, &ctx, ) .await .unwrap(); let ctx = ctx.with_scope_timeline(&timeline); // This test does downloads let ctx = RequestContextBuilder::from(&ctx) .download_behavior(DownloadBehavior::Download) .attached_child(); let layer = { let mut layers = { let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await; layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); layers.swap_remove(0) }; // this failpoint will simulate the `get_or_maybe_download` becoming cancelled (by returning an // Err) at the right time as in "during" the `LayerInner::needs_download`. 
layer.enable_failpoint(Failpoint::AfterDeterminingLayerNeedsNoDownload); let (completion, barrier) = utils::completion::channel(); let (arrival, arrived_at_barrier) = utils::completion::channel(); layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( Some(arrival), barrier, )); tokio::time::timeout(ADVANCE, layer.evict_and_wait(FOREVER)) .await .expect_err("should had advanced to waiting on channel"); arrived_at_barrier.wait().await; // simulate a cancelled read which is cancelled before it gets to re-initialize let e = layer .0 .get_or_maybe_download(false, &ctx) .await .unwrap_err(); assert!( matches!( e, DownloadError::Failpoint(FailpointKind::AfterDeterminingLayerNeedsNoDownload) ), "{e:?}" ); assert!( layer.0.needs_download().await.unwrap().is_none(), "file is still on disk" ); // release the eviction task drop(completion); tokio::time::sleep(ADVANCE).await; SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; // failpoint is still enabled, but it is not hit let e = layer .0 .get_or_maybe_download(false, &ctx) .await .unwrap_err(); assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}"); // failpoint is not counted as cancellation either assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) } #[tokio::test(start_paused = true)] async fn evict_and_wait_does_not_wait_for_download() { // let handle = tokio::runtime::Handle::current(); let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download") .await .unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let timeline = tenant .create_test_timeline( TimelineId::generate(), Lsn(0x10), PgMajorVersion::PG14, &ctx, ) .await .unwrap(); let ctx = ctx.with_scope_timeline(&timeline); // This test does downloads let ctx = RequestContextBuilder::from(&ctx) .download_behavior(DownloadBehavior::Download) .attached_child(); let layer = { let mut layers = { 
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await; layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); layers.swap_remove(0) }; // kind of forced setup: start an eviction but do not allow it progress until we are // downloading let (eviction_can_continue, barrier) = utils::completion::channel(); let (arrival, eviction_arrived) = utils::completion::channel(); layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( Some(arrival), barrier, )); let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); // use this once-awaited other_evict to synchronize with the eviction let other_evict = layer.evict_and_wait(FOREVER); tokio::time::timeout(ADVANCE, &mut evict_and_wait) .await .expect_err("should had advanced"); eviction_arrived.wait().await; drop(eviction_can_continue); other_evict.await.unwrap(); // now the layer is evicted, and the "evict_and_wait" is waiting on the receiver assert!(!layer.is_likely_resident()); // following new evict_and_wait will fail until we've completed the download let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); assert!(matches!(e, EvictionError::NotFound), "{e:?}"); let (download_can_continue, barrier) = utils::completion::channel(); let (arrival, _download_arrived) = utils::completion::channel(); layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier)); let mut download = std::pin::pin!( layer .0 .get_or_maybe_download(true, &ctx) .instrument(download_span) ); assert!( !layer.is_likely_resident(), "during download layer is evicted" ); tokio::time::timeout(ADVANCE, &mut download) .await .expect_err("should had timed out because of failpoint"); // now we finally get to continue, and because the latest state is downloading, we deduce that // original eviction succeeded evict_and_wait.await.unwrap(); // however a new evict_and_wait will fail let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); assert!(matches!(e, 
EvictionError::NotFound), "{e:?}"); assert!(!layer.is_likely_resident()); drop(download_can_continue); download.await.expect("download should had succeeded"); assert!(layer.is_likely_resident()); // only now can we evict layer.evict_and_wait(FOREVER).await.unwrap(); } /// Asserts that there is no miscalculation when Layer is dropped while it is being kept resident, /// which is the last value. /// /// Also checks that the same does not happen on a non-evicted layer (regression test). #[tokio::test(start_paused = true)] async fn eviction_cancellation_on_drop() { use bytes::Bytes; use wal_decoder::models::value::Value; // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); let h = TenantHarness::create("eviction_cancellation_on_drop") .await .unwrap(); utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); let (tenant, ctx) = h.load().await; let timeline = tenant .create_test_timeline( TimelineId::generate(), Lsn(0x10), PgMajorVersion::PG14, &ctx, ) .await .unwrap(); { // create_test_timeline wrote us one layer, write another let mut writer = timeline.writer().await; writer .put( pageserver_api::key::Key::from_i128(5), Lsn(0x20), &Value::Image(Bytes::from_static(b"this does not matter either")), &ctx, ) .await .unwrap(); writer.finish_write(Lsn(0x20)); } timeline.freeze_and_flush().await.unwrap(); // wait for the upload to complete so our Arc::strong_count assertion holds timeline.remote_client.wait_completion().await.unwrap(); let (evicted_layer, not_evicted) = { let mut layers = { let mut guard = timeline.layers.write(LayerManagerLockHolder::Testing).await; let layers = guard.likely_resident_layers().cloned().collect::>(); // remove the layers from layermap guard.open_mut().unwrap().finish_gc_timeline(&layers); layers }; assert_eq!(layers.len(), 2); (layers.pop().unwrap(), layers.pop().unwrap()) }; let victims = [(evicted_layer, true), (not_evicted, false)]; for (victim, evict) in victims { 
let resident = victim.keep_resident().await.unwrap(); drop(victim); assert_eq!(Arc::strong_count(&resident.owner.0), 1); if evict { let evict_and_wait = resident.owner.evict_and_wait(FOREVER); // drive the future to await on the status channel, and then drop it tokio::time::timeout(ADVANCE, evict_and_wait) .await .expect_err("should had been a timeout since we are holding the layer resident"); } // 1 == we only evict one of the layers assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); drop(resident); // run any spawned tokio::time::sleep(ADVANCE).await; SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; assert_eq!( 1, LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::LayerGone].get() ); } } /// A test case to remind you the cost of these structures. You can bump the size limit /// below if it is really necessary to add more fields to the structures. #[test] #[cfg(target_arch = "x86_64")] fn layer_size() { assert_eq!(size_of::(), 8); assert_eq!(size_of::(), 104); assert_eq!(size_of::(), 296); // it also has the utf8 path } struct SpawnBlockingPoolHelper { awaited_by_spawn_blocking_tasks: Completion, blocking_tasks: JoinSet<()>, } impl SpawnBlockingPoolHelper { /// All `crate::task_mgr::BACKGROUND_RUNTIME` spawn_blocking threads will be consumed until /// release is called. /// /// In the tests this can be used to ensure something cannot be started on the target runtimes /// spawn_blocking pool. /// /// This should be no issue nowdays, because nextest runs each test in it's own process. 
async fn consume_all_spawn_blocking_threads(handle: &tokio::runtime::Handle) -> Self {
    // number of blocking threads assumed for a runtime built with default settings
    const DEFAULT_MAX_BLOCKING_THREADS: usize = 512;
    Self::consume_all_spawn_blocking_threads0(handle, DEFAULT_MAX_BLOCKING_THREADS).await
}

/// Occupy `threads` spawn_blocking threads of the runtime behind `handle`.
///
/// Returns only after every spawned blocking task has started running; the tasks then stay
/// blocked until [`Self::release`] is called. Panics if `threads` is zero.
async fn consume_all_spawn_blocking_threads0(
    handle: &tokio::runtime::Handle,
    threads: usize,
) -> Self {
    assert_ne!(threads, 0);

    // dropping `release_signal` is what eventually unblocks the tasks
    let (release_signal, release_barrier) = completion::channel();
    // every task drops its clone of `running_signal` as the first thing it does
    let (running_signal, all_running) = completion::channel();

    let mut blocking_tasks = JoinSet::new();

    for _ in 0..threads {
        let task_barrier = release_barrier.clone();
        let task_running = running_signal.clone();
        blocking_tasks.spawn_blocking_on(
            move || {
                drop(task_running);
                tokio::runtime::Handle::current().block_on(task_barrier.wait());
            },
            handle,
        );
    }

    // let go of our own handle so that only the tasks' clones are waited on
    drop(running_signal);

    all_running.wait().await;

    drop(release_barrier);

    tracing::trace!("consumed all threads");

    Self {
        awaited_by_spawn_blocking_tasks: release_signal,
        blocking_tasks,
    }
}

/// Unblock every thread held by this helper and wait for the blocking tasks to finish.
async fn release(self) {
    let Self {
        awaited_by_spawn_blocking_tasks: release_signal,
        blocking_tasks: mut workers,
    } = self;

    drop(release_signal);

    while let Some(joined) = workers.join_next().await {
        joined.expect("none of the tasks should had panicked");
    }

    tracing::trace!("released all threads");
}

/// In the tests it is used as an easy way of making sure something scheduled on the target
/// runtimes `spawn_blocking` has completed, because it must've been scheduled and completed
/// before our tasks have a chance to schedule and complete.
async fn consume_and_release_all_of_spawn_blocking_threads(handle: &tokio::runtime::Handle) {
    // NOTE(review): 512 repeats the `default_max_blocking_threads` value used by
    // `consume_all_spawn_blocking_threads`; the two literals must be kept in sync by hand.
    Self::consume_and_release_all_of_spawn_blocking_threads0(handle, 512).await
}

/// Occupies `threads` blocking tasks on `handle` via
/// `consume_all_spawn_blocking_threads0` and releases them again right away.
async fn consume_and_release_all_of_spawn_blocking_threads0(
    handle: &tokio::runtime::Handle,
    threads: usize,
) {
    Self::consume_all_spawn_blocking_threads0(handle, threads)
        .await
        .release()
        .await
}
} // end of `impl SpawnBlockingPoolHelper`

#[test]
fn spawn_blocking_pool_helper_actually_works() {
    // create a custom runtime for which we know and control how many blocking threads it has
    //
    // the runtime is limited to a single blocking thread, and the `..0` variant of the helper
    // is told below to consume exactly that one thread.
    let rt = tokio::runtime::Builder::new_current_thread()
        .max_blocking_threads(1)
        .enable_all()
        .build()
        .unwrap();

    let handle = rt.handle();

    rt.block_on(async move {
        // this will not return until all threads are spun up and actually executing the code
        // waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d.
        let consumed =
            SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads0(handle, 1).await;

        println!("consumed");

        let mut jh = std::pin::pin!(tokio::task::spawn_blocking(move || {
            // this will not get to run before we release
        }));

        println!("spawned");

        // the only blocking thread is occupied, so joining has to time out
        tokio::time::timeout(std::time::Duration::from_secs(1), &mut jh)
            .await
            .expect_err("the task should not have gotten to run yet");

        println!("tried to join");

        consumed.release().await;

        println!("released");

        // with the thread freed up, the queued task gets to run and can be joined
        tokio::time::timeout(std::time::Duration::from_secs(1), jh)
            .await
            .expect("no timeout")
            .expect("no join error");

        println!("joined");
    });
}

/// Drop the low bits from a time, to emulate the precision loss in LayerAccessStats
///
/// Concretely: truncates `hires` to whole seconds since the Unix epoch. Panics if `hires`
/// is before the epoch.
fn lowres_time(hires: SystemTime) -> SystemTime {
    let ts = hires.duration_since(UNIX_EPOCH).unwrap().as_secs();
    UNIX_EPOCH + Duration::from_secs(ts)
}

#[test]
fn access_stats() {
    let access_stats = LayerAccessStats::default();
    // Default is visible
    assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible);

    // Visibility can be toggled back and forth
    access_stats.set_visibility(LayerVisibilityHint::Covered);
    assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered);
    access_stats.set_visibility(LayerVisibilityHint::Visible);
    assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible);

    // Both residence events and accesses are reflected in latest_activity
    let rtime = UNIX_EPOCH + Duration::from_secs(2000000000);
    access_stats.record_residence_event_at(rtime);
    assert_eq!(access_stats.latest_activity(), lowres_time(rtime));

    let atime = UNIX_EPOCH + Duration::from_secs(2100000000);
    access_stats.record_access_at(atime);
    assert_eq!(access_stats.latest_activity(), lowres_time(atime));

    // Setting visibility doesn't clobber access time
    access_stats.set_visibility(LayerVisibilityHint::Covered);
    assert_eq!(access_stats.latest_activity(), lowres_time(atime));
    access_stats.set_visibility(LayerVisibilityHint::Visible);
    assert_eq!(access_stats.latest_activity(), lowres_time(atime));

    // Recording access implicitly makes layer visible, if it wasn't already
    // (`record_access_at` returns true only when it changed the visibility)
    let atime = UNIX_EPOCH + Duration::from_secs(2200000000);
    access_stats.set_visibility(LayerVisibilityHint::Covered);
    assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered);
    assert!(access_stats.record_access_at(atime));
    access_stats.set_visibility(LayerVisibilityHint::Visible);
    assert!(!access_stats.record_access_at(atime));
    access_stats.set_visibility(LayerVisibilityHint::Visible);
}

#[test]
fn access_stats_2038() {
    // The access stats structure uses a timestamp representation that will run out
    // of bits in 2038.  One year before that, this unit test will start failing.
    //
    // NOTE(review): the bound asserted below, `2 << 31`, is 2^32 seconds (reached in
    // February 2106), whereas a 2038 rollover corresponds to 2^31 seconds. Either this
    // comment or the bound is off by one bit -- confirm against the width actually used by
    // `LayerAccessStats`.

    let one_year_from_now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap()
        + Duration::from_secs(3600 * 24 * 365);

    assert!(one_year_from_now.as_secs() < (2 << 31));
}

================================================
FILE: pageserver/src/tenant/storage_layer/layer.rs
================================================
use std::ops::Range;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Weak};
use std::time::{Duration, SystemTime};

use crate::PERF_TRACE_TARGET;
use crate::metrics::{ONDEMAND_DOWNLOAD_BYTES, ONDEMAND_DOWNLOAD_COUNT};
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::HistoricLayerInfo;
use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
use tracing::{Instrument, info_span};
use utils::generation::Generation;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use utils::sync::{gate, heavier_once_cell};

use super::delta_layer::{self};
use super::image_layer::{self};
use super::{
    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
    LayerVisibilityHint, PerfInstrumentFutureExt, PersistentLayerDesc, ValuesReconstructState,
};
use crate::config::PageServerConf;
use crate::context::{RequestContext, RequestContextBuilder};
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::task_mgr::TaskKind;
use crate::tenant::Timeline;
use crate::tenant::remote_timeline_client::LayerFileMetadata;
use crate::tenant::timeline::{CompactionError, GetVectoredError};

#[cfg(test)]
mod tests;

#[cfg(test)]
mod failpoints;

// NOTE(review): presumably an upper bound in bytes for a single uploaded layer file; it is
// not used anywhere in the part of the file visible here -- confirm at the use sites.
pub const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;

/// A Layer contains all data in a "rectangle" consisting of a range of keys and
/// range of LSNs.
///
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
/// layers are used to ingest incoming WAL, and provide fast access to the
/// recent page versions.
/// On-disk layers are stored as files on disk, and are
/// immutable. This type represents the on-disk kind, while the in-memory kind is represented by
/// [`InMemoryLayer`].
///
/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
/// A delta layer contains all modifications within a range of LSNs and keys.
/// An image layer is a snapshot of all the data in a key-range, at a single
/// LSN.
///
/// This type models the on-disk layers, which can be evicted and on-demand downloaded. As a
/// general goal, read accesses should always win eviction and eviction should not wait for
/// download.
///
/// ### State transitions
///
/// The internal state of `Layer` is composed of, most importantly, the on-filesystem state and
/// the [`ResidentOrWantedEvicted`] enum. On-filesystem state can be either present (fully
/// downloaded, right size) or deleted.
///
/// Reads will always win requests to evict until `wait_for_turn_and_evict` has acquired the
/// `heavier_once_cell::InitPermit` and has started to `evict_blocking`. Before the
/// `heavier_once_cell::InitPermit` has been acquired, any read request
/// (`get_or_maybe_download`) can "re-initialize" using the existing downloaded file, thereby
/// cancelling the eviction.
///
/// ```text
/// +-----------------+   get_or_maybe_download    +--------------------------------+
/// | not initialized |--------------------------->| Resident(Arc<DownloadedLayer>) |
/// |     ENOENT      |                         /->|                                |
/// +-----------------+                         |  +--------------------------------+
///                ^                            |                         |       ^
///                |    get_or_maybe_download   |                         |       | get_or_maybe_download, either:
/// evict_blocking | /--------------------------/                         |       | - upgrade weak to strong
///                | |                                                    |       | - re-initialize without download
///                | |                                     evict_and_wait |       |
/// +-----------------+                                                   v       |
/// | not initialized |  on_downloaded_layer_drop  +--------------------------------------+
/// | file is present |<---------------------------| WantedEvicted(Weak<DownloadedLayer>) |
/// +-----------------+                            +--------------------------------------+
/// ```
///
/// ### Unsupported
///
/// - Evicting by the operator deleting files from the filesystem
///
/// [`InMemoryLayer`]: super::inmemory_layer::InMemoryLayer
// NOTE(review): several generic type arguments appear to have been lost in this copy of the
// file (e.g. the bare `Arc` below, which `for_evicted` fills with
// `Arc::new(LayerInner::new(..))`, and `&Arc` for `timeline`). The code tokens are kept
// exactly as found; restore the arguments from the upstream source before compiling.
#[derive(Clone)]
pub(crate) struct Layer(Arc);

/// Formats as the short id of the layer descriptor followed by the generation suffix.
impl std::fmt::Display for Layer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{}{}",
            self.layer_desc().short_id(),
            self.0.generation.get_suffix()
        )
    }
}

/// Debug output is the same as the `Display` output.
impl std::fmt::Debug for Layer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{self}")
    }
}

impl AsLayerDesc for Layer {
    fn layer_desc(&self) -> &PersistentLayerDesc {
        self.0.layer_desc()
    }
}

/// Two `Layer` values are equal only if they point to the same shared inner allocation;
/// contents are not compared.
impl PartialEq for Layer {
    fn eq(&self, other: &Self) -> bool {
        Arc::as_ptr(&self.0) == Arc::as_ptr(&other.0)
    }
}

/// Builds the local filesystem path of a layer file inside the timeline directory.
///
/// With a generation the file name is `<layer name>-v1<generation suffix>`; without one
/// the plain layer name is used.
pub(crate) fn local_layer_path(
    conf: &PageServerConf,
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    layer_file_name: &LayerName,
    generation: &Generation,
) -> Utf8PathBuf {
    let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id);

    if generation.is_none() {
        // Without a generation, we may only use legacy path style
        timeline_path.join(layer_file_name.to_string())
    } else {
        timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix()))
    }
}

/// What [`Layer::last_evicted_at`] reports about the most recent eviction.
pub(crate) enum LastEviction {
    /// No eviction has been recorded.
    Never,
    /// An eviction was recorded at the given instant.
    At(std::time::Instant),
    /// The eviction timestamp is currently locked by someone else, which is reported as an
    /// eviction being in progress.
    Evicting,
}

impl LastEviction {
    /// Returns true if the eviction happened strictly after `timepoint`. An eviction in
    /// progress counts as "after"; `Never` does not.
    pub(crate) fn happened_after(&self, timepoint: std::time::Instant) -> bool {
        match self {
            LastEviction::Never => false,
            LastEviction::At(evicted_at) => evicted_at > &timepoint,
            LastEviction::Evicting => true,
        }
    }
}

impl Layer {
    /// Creates a layer value for a file we know to not be resident.
    pub(crate) fn for_evicted(
        conf: &'static PageServerConf,
        timeline: &Arc,
        file_name: LayerName,
        metadata: LayerFileMetadata,
    ) -> Self {
        let local_path = local_layer_path(
            conf,
            &timeline.tenant_shard_id,
            &timeline.timeline_id,
            &file_name,
            &metadata.generation,
        );

        let desc = PersistentLayerDesc::from_filename(
            timeline.tenant_shard_id,
            timeline.timeline_id,
            file_name,
            metadata.file_size,
        );

        // `None`: no downloaded state, so the layer starts out as evicted
        let owner = Layer(Arc::new(LayerInner::new(
            conf,
            timeline,
            local_path,
            desc,
            None,
            metadata.generation,
            metadata.shard,
        )));

        // debug builds verify the caller's claim that the file is not on disk
        debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());

        owner
    }

    /// Creates a Layer value for a file we know to be resident in timeline directory.
    pub(crate) fn for_resident(
        conf: &'static PageServerConf,
        timeline: &Arc,
        local_path: Utf8PathBuf,
        file_name: LayerName,
        metadata: LayerFileMetadata,
    ) -> ResidentLayer {
        let desc = PersistentLayerDesc::from_filename(
            timeline.tenant_shard_id,
            timeline.timeline_id,
            file_name,
            metadata.file_size,
        );

        let mut resident = None;

        // `new_cyclic` hands out the weak back-reference that `DownloadedLayer::owner` needs
        // before the `LayerInner` itself exists
        let owner = Layer(Arc::new_cyclic(|owner| {
            let inner = Arc::new(DownloadedLayer {
                owner: owner.clone(),
                kind: tokio::sync::OnceCell::default(),
                version: 0,
            });
            resident = Some(inner.clone());

            LayerInner::new(
                conf,
                timeline,
                local_path,
                desc,
                Some(inner),
                metadata.generation,
                metadata.shard,
            )
        }));

        let downloaded = resident.expect("just initialized");

        // debug builds verify the caller's claim that the file is on disk
        debug_assert!(owner.0.needs_download_blocking().unwrap().is_none());

        timeline
            .metrics
            .resident_physical_size_add(metadata.file_size);

        ResidentLayer { downloaded, owner }
    }

    /// Creates a Layer value for freshly written out new layer file by renaming it from a
    /// temporary path.
// NOTE(review): the type arguments of `&Arc` and `anyhow::Result` appear to have been lost in
// this copy of the file (the function returns `Ok(ResidentLayer { .. })`); tokens are kept
// exactly as found -- restore from the upstream source before compiling.
pub(crate) fn finish_creating(
    conf: &'static PageServerConf,
    timeline: &Arc,
    desc: PersistentLayerDesc,
    temp_path: &Utf8Path,
) -> anyhow::Result {
    let mut resident = None;

    // `new_cyclic` hands out the weak back-reference that `DownloadedLayer::owner` needs
    // before the `LayerInner` itself exists
    let owner = Layer(Arc::new_cyclic(|owner| {
        let inner = Arc::new(DownloadedLayer {
            owner: owner.clone(),
            kind: tokio::sync::OnceCell::default(),
            version: 0,
        });
        resident = Some(inner.clone());

        // a freshly created layer is named after the timeline's current generation
        let local_path = local_layer_path(
            conf,
            &timeline.tenant_shard_id,
            &timeline.timeline_id,
            &desc.layer_name(),
            &timeline.generation,
        );

        LayerInner::new(
            conf,
            timeline,
            local_path,
            desc,
            Some(inner),
            timeline.generation,
            timeline.get_shard_index(),
        )
    }));

    let downloaded = resident.expect("just initialized");

    // We never want to overwrite an existing file, so we use `RENAME_NOREPLACE`.
    // TODO: this leaves the temp file in place if the rename fails, risking us running
    // out of space. Should we clean it up here or does the calling context deal with this?
    utils::fs_ext::rename_noreplace(temp_path.as_std_path(), owner.local_path().as_std_path())
        .with_context(|| format!("rename temporary file as correct path for {owner}"))?;

    Ok(ResidentLayer { downloaded, owner })
}

/// Requests the layer to be evicted and waits for this to be done.
///
/// If the file is not resident, an [`EvictionError::NotFound`] is returned.
///
/// If, through bad luck or blocking of the executor, we miss the actual eviction and the
/// layer is re-downloaded, [`EvictionError::Downloaded`] is returned.
///
/// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction
/// will happen regardless of the future returned by this method completing, unless there is a
/// read access before eviction gets to complete.
///
/// Technically cancellation safe, but cancelling might shift the viewpoint of what generation
/// of download-evict cycle on retry.
pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
    self.0.evict_and_wait(timeout).await
}

/// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload
/// then.
///
/// On drop, this will cause a call to [`crate::tenant::remote_timeline_client::RemoteTimelineClient::schedule_deletion_of_unlinked`].
/// This means that the unlinking by [gc] or [compaction] must have happened strictly before
/// the value this is called on gets dropped.
///
/// This is ensured by both of those methods accepting references to Layer.
///
/// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
/// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
pub(crate) fn delete_on_drop(&self) {
    self.0.delete_on_drop();
}

/// Reads the values for `keyspace` within `lsn_range` from this layer into
/// `reconstruct_data`, downloading the layer first if it is not resident.
///
/// Download errors caused by timeline shutdown or a cancelled download are reported as
/// [`GetVectoredError::Cancelled`], all other download errors as
/// [`GetVectoredError::Other`].
// NOTE(review): the type argument of `Range` appears to have been lost in this copy of the
// file (as have those of `Result,` / `Option` further down); tokens are kept exactly as found.
pub(crate) async fn get_values_reconstruct_data(
    &self,
    keyspace: KeySpace,
    lsn_range: Range,
    reconstruct_data: &mut ValuesReconstructState,
    ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
    // step 1: make sure the layer is resident, under its own "GET_LAYER" perf span
    let downloaded = {
        let ctx = RequestContextBuilder::from(ctx)
            .perf_span(|crnt_perf_span| {
                info_span!(
                    target: PERF_TRACE_TARGET,
                    parent: crnt_perf_span,
                    "GET_LAYER",
                )
            })
            .attached_child();

        self.0
            .get_or_maybe_download(true, &ctx)
            .maybe_perf_instrument(&ctx, |crnt_perf_context| crnt_perf_context.clone())
            .await
            .map_err(|err| match err {
                DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
                    GetVectoredError::Cancelled
                }
                other => GetVectoredError::Other(anyhow::anyhow!(other)),
            })?
    };

    // holds a strong reference to the downloaded state for the duration of the read
    let this = ResidentLayer {
        downloaded: downloaded.clone(),
        owner: self.clone(),
    };

    self.record_access(ctx);

    // step 2: the actual read, under its own "VISIT_LAYER" perf span
    let ctx = RequestContextBuilder::from(ctx)
        .perf_span(|crnt_perf_span| {
            info_span!(
                target: PERF_TRACE_TARGET,
                parent: crnt_perf_span,
                "VISIT_LAYER",
            )
        })
        .attached_child();

    downloaded
        .get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, &ctx)
        .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
        .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
        .await
        .map_err(|err| match err {
            // only the catch-all variant gets the layer added as context
            GetVectoredError::Other(err) => GetVectoredError::Other(
                err.context(format!("get_values_reconstruct_data for layer {self}")),
            ),
            err => err,
        })
}

/// Download the layer if evicted.
///
/// Will not error when the layer is already downloaded.
pub(crate) async fn download(&self, ctx: &RequestContext) -> Result<(), DownloadError> {
    self.0.get_or_maybe_download(true, ctx).await?;
    Ok(())
}

/// Forwards to `LayerInner::needs_download`; judging by its use elsewhere in this file,
/// the inner `Option` is `None` when the file is present and needs no download.
pub(crate) async fn needs_download(&self) -> Result, std::io::Error> {
    self.0.needs_download().await
}

/// Assuming the layer is already downloaded, returns a guard which will prohibit eviction
/// while the guard exists.
///
/// Returns None if the layer is currently evicted or becoming evicted.
pub(crate) async fn keep_resident(&self) -> Option {
    let downloaded = self.0.inner.get().and_then(|rowe| rowe.get())?;

    Some(ResidentLayer {
        downloaded,
        owner: self.clone(),
    })
}

/// Weak indicator of whether the layer is resident or not. Good enough for eviction, which
/// can deal with `EvictionError::NotFound`.
///
/// Returns `true` if this layer might be resident, or `false` if it is most likely evicted
/// or will be unless a read happens soon.
pub(crate) fn is_likely_resident(&self) -> bool {
    self.0
        .inner
        .get()
        .map(|rowe| rowe.is_likely_resident())
        .unwrap_or(false)
}

/// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
// NOTE(review): the type arguments of the `Result` returned here and of the `Option` returned
// by `get_timeline_id` appear to have been lost in this copy of the file; tokens are kept
// exactly as found -- restore from the upstream source before compiling.
pub(crate) async fn download_and_keep_resident(
    &self,
    ctx: &RequestContext,
) -> Result {
    let downloaded = self.0.get_or_maybe_download(true, ctx).await?;

    Ok(ResidentLayer {
        downloaded,
        owner: self.clone(),
    })
}

/// Forwards to `LayerInner::info`.
pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
    self.0.info(reset)
}

/// Latest activity as recorded in the layer's access stats.
pub(crate) fn latest_activity(&self) -> SystemTime {
    self.0.access_stats.latest_activity()
}

/// Current visibility hint as recorded in the layer's access stats.
pub(crate) fn visibility(&self) -> LayerVisibilityHint {
    self.0.access_stats.visibility()
}

/// Path of the layer file on the local filesystem.
pub(crate) fn local_path(&self) -> &Utf8Path {
    &self.0.path
}

/// Forwards to `LayerInner::metadata`.
pub(crate) fn metadata(&self) -> LayerFileMetadata {
    self.0.metadata()
}

/// Reports the last recorded eviction without blocking.
///
/// If the timestamp's mutex is currently held elsewhere this returns
/// [`LastEviction::Evicting`]. Panics if the mutex is poisoned.
pub(crate) fn last_evicted_at(&self) -> LastEviction {
    match self.0.last_evicted_at.try_lock() {
        Ok(lock) => match *lock {
            None => LastEviction::Never,
            Some(at) => LastEviction::At(at),
        },
        Err(std::sync::TryLockError::WouldBlock) => LastEviction::Evicting,
        Err(std::sync::TryLockError::Poisoned(p)) => panic!("Lock poisoned: {p}"),
    }
}

/// Id of the owning timeline, or `None` if the timeline has already been dropped.
pub(crate) fn get_timeline_id(&self) -> Option {
    self.0
        .timeline
        .upgrade()
        .map(|timeline| timeline.timeline_id)
}

/// Traditional debug dumping facility
#[allow(unused)]
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> {
    self.0.desc.dump();

    if verbose {
        // for now, unconditionally download everything, even if that might not be wanted.
        let l = self.0.get_or_maybe_download(true, ctx).await?;
        l.dump(&self.0, ctx).await?
    }

    Ok(())
}

/// Waits until this layer has been dropped (and if needed, local file deletion and remote
/// deletion scheduling has completed).
///
/// Does not start local deletion, use [`Self::delete_on_drop`] for that
/// separately.
// NOTE(review): the associated-type argument of `Future` (and the type arguments of
// `Arc`/`Weak` in `ResidentOrWantedEvicted` below) appear to have been lost in this copy of
// the file; tokens are kept exactly as found -- restore from upstream before compiling.
#[cfg(any(feature = "testing", test))]
pub(crate) fn wait_drop(&self) -> impl std::future::Future + 'static {
    let mut rx = self.0.status.as_ref().unwrap().subscribe();

    async move {
        // status updates themselves are ignored; the loop only ends once `changed()`
        // fails, i.e. once the sending side of the status channel is gone
        loop {
            if rx.changed().await.is_err() {
                break;
            }
        }
    }
}

/// Records a read access in the access stats. If that access flipped the layer back to
/// visible, possibly logs about it and adds the layer's size to the timeline's visible size
/// gauge.
fn record_access(&self, ctx: &RequestContext) {
    if self.0.access_stats.record_access(ctx) {
        // Visibility was modified to Visible: maybe log about this
        match ctx.task_kind() {
            TaskKind::CalculateSyntheticSize
            | TaskKind::OndemandLogicalSizeCalculation
            | TaskKind::GarbageCollector
            | TaskKind::MgmtRequest => {
                // This situation is expected in code paths that do binary searches of the LSN space to resolve
                // an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size,
                // and on-demand for certain HTTP API requests. On-demand logical size calculation is also included
                // because it is run as a sub-task of synthetic size.
            }
            _ => {
                // In all other contexts, it is unusual to do I/O involving layers which are not visible at
                // some branch tip, so we log the fact that we are accessing something that the visibility
                // calculation thought should not be visible.
                //
                // This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object
                // which was covered by a concurrent compaction.
                tracing::info!(
                    layer=%self,
                    "became visible as a result of access",
                );
            }
        }

        // Update the timeline's visible bytes count
        if let Some(tl) = self.0.timeline.upgrade() {
            tl.metrics
                .visible_physical_size_gauge
                .add(self.0.desc.file_size)
        }
    }
}

/// Sets the visibility hint and keeps the timeline's visible size gauge in step: the
/// layer's file size is subtracted on Visible -> Covered and added on Covered -> Visible.
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
    let old_visibility = self.0.access_stats.set_visibility(visibility.clone());
    use LayerVisibilityHint::*;
    match (old_visibility, visibility) {
        (Visible, Covered) => {
            // Subtract this layer's contribution to the visible size metric
            if let Some(tl) = self.0.timeline.upgrade() {
                debug_assert!(
                    tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
                );
                tl.metrics
                    .visible_physical_size_gauge
                    .sub(self.0.desc.file_size)
            }
        }
        (Covered, Visible) => {
            // Add this layer's contribution to the visible size metric
            if let Some(tl) = self.0.timeline.upgrade() {
                tl.metrics
                    .visible_physical_size_gauge
                    .add(self.0.desc.file_size)
            }
        }
        (Covered, Covered) | (Visible, Visible) => {
            // no change
        }
    }
}
} // end of `impl Layer`

/// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
///
/// However when we want something evicted, we cannot evict it right away as there might be current
/// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
/// read with [`Layer::get_values_reconstruct_data`].
///
/// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
#[derive(Debug)]
enum ResidentOrWantedEvicted {
    /// Holds a strong reference, which keeps the downloaded state alive.
    Resident(Arc),
    /// Eviction has been requested: only a weak reference is held, together with the
    /// version of the downloaded state the request applies to.
    WantedEvicted(Weak, usize),
}

impl ResidentOrWantedEvicted {
    /// Non-mutating access to the DownloadedLayer, if possible.
    ///
    /// This is not used on the read path (anything that calls
    /// [`LayerInner::get_or_maybe_download`]) because it was decided that reads always win
    /// evictions, and part of that winning is using [`ResidentOrWantedEvicted::get_and_upgrade`].
// NOTE(review): generic type arguments appear to have been lost throughout this copy of the
// file (`Option>`, `Option<(Arc, bool)>`, `Weak,`, `OnceCell,`, `Mutex>,` ...). The code
// tokens below are kept exactly as found; restore the arguments from the upstream source
// before compiling.
fn get(&self) -> Option> {
    match self {
        ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
        ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.upgrade(),
    }
}

/// Best-effort query for residency right now, not as strong guarantee as receiving a strong
/// reference from `ResidentOrWantedEvicted::get`.
fn is_likely_resident(&self) -> bool {
    match self {
        ResidentOrWantedEvicted::Resident(_) => true,
        ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.strong_count() > 0,
    }
}

/// Upgrades any weak to strong if possible.
///
/// Returns a strong reference if possible, along with a boolean telling if an upgrade
/// happened.
fn get_and_upgrade(&mut self) -> Option<(Arc, bool)> {
    match self {
        ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
        ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
            Some(strong) => {
                LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();

                // the access won: go back to holding a strong reference
                *self = ResidentOrWantedEvicted::Resident(strong.clone());

                Some((strong, true))
            }
            None => None,
        },
    }
}

/// When eviction is first requested, drop down to holding a [`Weak`].
///
/// Returns `Some` if this was the first time eviction was requested. Care should be taken to
/// drop the possibly last strong reference outside of the mutex of
/// [`heavier_once_cell::OnceCell`].
fn downgrade(&mut self) -> Option> {
    match self {
        ResidentOrWantedEvicted::Resident(strong) => {
            let weak = Arc::downgrade(strong);
            // swap the new state in so that the old strong reference can be moved out
            let mut temp = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
            std::mem::swap(self, &mut temp);
            match temp {
                ResidentOrWantedEvicted::Resident(strong) => Some(strong),
                ResidentOrWantedEvicted::WantedEvicted(..) => unreachable!("just swapped"),
            }
        }
        ResidentOrWantedEvicted::WantedEvicted(..) => None,
    }
}
} // end of `impl ResidentOrWantedEvicted`

struct LayerInner {
    /// Only needed to check ondemand_download_behavior_treat_error_as_warn and creation of
    /// [`Self::path`].
    conf: &'static PageServerConf,

    /// Full path to the file; unclear if this should exist anymore.
    path: Utf8PathBuf,

    desc: PersistentLayerDesc,

    /// Timeline access is needed for remote timeline client and metrics.
    ///
    /// There should not be an access to timeline for any reason without entering the
    /// [`Timeline::gate`] at the same time.
    timeline: Weak,

    access_stats: LayerAccessStats,

    /// This custom OnceCell is backed by std mutex, but only held for short time periods.
    ///
    /// Filesystem changes (download, evict) are only done while holding a permit which the
    /// `heavier_once_cell` provides.
    ///
    /// A number of fields in `Layer` are meant to only be updated when holding the InitPermit, but
    /// possibly read while not holding it.
    inner: heavier_once_cell::OnceCell,

    /// Do we want to delete locally and remotely this when `LayerInner` is dropped
    wanted_deleted: AtomicBool,

    /// Version is to make sure we will only evict a specific initialization of the downloaded file.
    ///
    /// Incremented for each initialization, stored in `DownloadedLayer::version` or
    /// `ResidentOrWantedEvicted::WantedEvicted`.
    version: AtomicUsize,

    /// Allow subscribing to when the layer actually gets evicted, a non-cancellable download
    /// starts, or completes.
    ///
    /// Updates must only be posted while holding the InitPermit or the heavier_once_cell::Guard.
    /// Holding the InitPermit is the only time we can do state transitions, but we also need to
    /// cancel a pending eviction on upgrading a [`ResidentOrWantedEvicted::WantedEvicted`] back to
    /// [`ResidentOrWantedEvicted::Resident`] on access.
    ///
    /// The sender is wrapped in an Option to facilitate moving it out on [`LayerInner::drop`].
    status: Option>,

    /// Counter for exponential backoff with the download.
    ///
    /// This is atomic only for the purposes of having additional data only accessed while holding
    /// the InitPermit.
    consecutive_failures: AtomicUsize,

    /// The generation of this Layer.
    ///
    /// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`],
    /// for created layers from [`Timeline::generation`].
    generation: Generation,

    /// The shard of this Layer.
    ///
    /// For layers created in this process, this will always be the [`ShardIndex`] of the
    /// current `ShardIdentity`` (TODO: add link once it's introduced).
    ///
    /// For loaded layers, this may be some other value if the tenant has undergone
    /// a shard split since the layer was originally written.
    shard: ShardIndex,

    /// When the Layer was last evicted but has not been downloaded since.
    ///
    /// This is used for skipping evicted layers from the previous heatmap (see
    /// `[Timeline::generate_heatmap]`) and for updating metrics
    /// (see [`LayerImplMetrics::redownload_after`]).
    last_evicted_at: std::sync::Mutex>,

    #[cfg(test)]
    failpoints: std::sync::Mutex>,
}

/// Formats as the short id of the layer descriptor (no generation suffix, unlike `Layer`).
impl std::fmt::Display for LayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.layer_desc().short_id())
    }
}

impl AsLayerDesc for LayerInner {
    fn layer_desc(&self) -> &PersistentLayerDesc {
        &self.desc
    }
}

/// Value published on the `LayerInner::status` watch channel.
#[derive(Debug, Clone, Copy)]
enum Status {
    Resident,
    Evicted,
    Downloading,
}

/// On drop: balances the eviction and per-timeline metrics and, if deletion was requested
/// with `delete_on_drop`, removes the local file and schedules the remote deletion from a
/// blocking task.
impl Drop for LayerInner {
    fn drop(&mut self) {
        // if there was a pending eviction, mark it cancelled here to balance metrics
        if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit()
        {
            // eviction has already been started
            LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);

            // eviction request is intentionally not honored as no one is present to wait for it
            // and we could be delaying shutdown for nothing.
        }

        let timeline = self.timeline.upgrade();

        if let Some(timeline) = timeline.as_ref() {
            // Only need to decrement metrics if the timeline still exists: otherwise
            // it will have already de-registered these metrics via TimelineMetrics::shutdown
            timeline.metrics.dec_layer(&self.desc);

            if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
                debug_assert!(
                    timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
                );
                timeline
                    .metrics
                    .visible_physical_size_gauge
                    .sub(self.desc.file_size);
            }
        }

        // everything below is only for layers that were marked with `delete_on_drop`
        if !*self.wanted_deleted.get_mut() {
            return;
        }

        let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);

        // collect everything the blocking task needs, since `self` cannot be moved into it
        let path = std::mem::take(&mut self.path);
        let file_name = self.layer_desc().layer_name();
        let file_size = self.layer_desc().file_size;
        let meta = self.metadata();
        let status = self.status.take();

        Self::spawn_blocking(move || {
            let _g = span.entered();

            // carry this until we are finished for [`Layer::wait_drop`] support
            let _status = status;

            let Some(timeline) = timeline else {
                // no need to nag that timeline is gone: under normal situation on
                // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
                return;
            };

            let Ok(_guard) = timeline.gate.enter() else {
                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
                return;
            };

            let removed = match std::fs::remove_file(path) {
                Ok(()) => true,
                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                    // until we no longer do detaches by removing all local files before removing the
                    // tenant from the global map, we will always get these errors even if we knew what
                    // is the latest state.
                    //
                    // we currently do not track the latest state, so we'll also end up here on evicted
                    // layers.
                    false
                }
                Err(e) => {
                    tracing::error!("failed to remove wanted deleted layer: {e}");
                    LAYER_IMPL_METRICS.inc_delete_removes_failed();
                    false
                }
            };

            // the resident size only shrinks if a file was actually removed
            if removed {
                timeline.metrics.resident_physical_size_sub(file_size);
            }
            let res = timeline
                .remote_client
                .schedule_deletion_of_unlinked(vec![(file_name, meta)]);

            if let Err(e) = res {
                // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
                // demonstrating this deadlock (without spawn_blocking): stop will drop
                // queued items, which will have ResidentLayer's, and those drops would try
                // to re-entrantly lock the RemoteTimelineClient inner state.
                if !timeline.is_active() {
                    tracing::info!("scheduling deletion on drop failed: {e:#}");
                } else {
                    tracing::warn!("scheduling deletion on drop failed: {e:#}");
                }
                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
            } else {
                LAYER_IMPL_METRICS.inc_completed_deletes();
            }
        });
    }
}

impl LayerInner {
    /// Builds the shared layer state. `downloaded` being `Some` makes the layer start out as
    /// resident (taking over the version of the downloaded state), `None` as evicted.
    ///
    /// Also increments the timeline's layer metrics and visible size gauge; `Drop` and
    /// `Layer::set_visibility` undo these.
    #[allow(clippy::too_many_arguments)]
    fn new(
        conf: &'static PageServerConf,
        timeline: &Arc,
        local_path: Utf8PathBuf,
        desc: PersistentLayerDesc,
        downloaded: Option>,
        generation: Generation,
        shard: ShardIndex,
    ) -> Self {
        let (inner, version, init_status) = if let Some(inner) = downloaded {
            let version = inner.version;
            let resident = ResidentOrWantedEvicted::Resident(inner);
            (
                heavier_once_cell::OnceCell::new(resident),
                version,
                Status::Resident,
            )
        } else {
            (heavier_once_cell::OnceCell::default(), 0, Status::Evicted)
        };

        // This object acts as a RAII guard on these metrics: increment on construction
        timeline.metrics.inc_layer(&desc);

        // New layers are visible by default.
        // This metric is later updated on drop or in set_visibility
        timeline
            .metrics
            .visible_physical_size_gauge
            .add(desc.file_size);

        LayerInner {
            conf,
            path: local_path,
            desc,
            timeline: Arc::downgrade(timeline),
            access_stats: Default::default(),
            wanted_deleted: AtomicBool::new(false),
            inner,
            version: AtomicUsize::new(version),
            // only the sender half is kept; receivers are created with `subscribe()`
            status: Some(tokio::sync::watch::channel(init_status).0),
            consecutive_failures: AtomicUsize::new(0),
            generation,
            shard,
            last_evicted_at: std::sync::Mutex::default(),
            #[cfg(test)]
            failpoints: Default::default(),
        }
    }

    /// Marks the layer to be deleted when dropped. The "started deletes" metric is only
    /// incremented by the call that actually flips the flag.
    fn delete_on_drop(&self) {
        let res =
            self.wanted_deleted
                .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);

        if res.is_ok() {
            LAYER_IMPL_METRICS.inc_started_deletes();
        }
    }

    /// Cancellation safe, however dropping the future and calling this method again might result
    /// in a new attempt to evict OR join the previously started attempt.
    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))]
    pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
        // subscribe before requesting the eviction so that no status update is missed
        let mut rx = self.status.as_ref().unwrap().subscribe();

        {
            let current = rx.borrow_and_update();
            match &*current {
                Status::Resident => {
                    // we might get lucky and evict this; continue
                }
                Status::Evicted | Status::Downloading => {
                    // it is already evicted
                    return Err(EvictionError::NotFound);
                }
            }
        }

        let strong = {
            match self.inner.get() {
                Some(mut either) => either.downgrade(),
                None => {
                    // we already have a scheduled eviction, which just has not gotten to run yet.
                    // it might still race with a read access, but that could also get cancelled,
                    // so let's say this is not evictable.
                    return Err(EvictionError::NotFound);
                }
            }
        };

        if strong.is_some() {
            // drop the DownloadedLayer outside of the holding the guard
            drop(strong);

            // idea here is that only one evicter should ever get to witness a strong reference,
            // which means whenever get_or_maybe_download upgrades a weak, it must mark up a
            // cancelled eviction and signal us, like it currently does.
            //
            // a second concurrent evict_and_wait will not see a strong reference.
            LAYER_IMPL_METRICS.inc_started_evictions();
        }

        // wait for the next status update, but no longer than `timeout`
        let changed = rx.changed();
        let changed = tokio::time::timeout(timeout, changed).await;

        let Ok(changed) = changed else {
            return Err(EvictionError::Timeout);
        };

        let _: () = changed.expect("cannot be closed, because we are holding a strong reference");

        let current = rx.borrow_and_update();

        match &*current {
            // the easiest case
            Status::Evicted => Ok(()),
            // it surely was evicted in between, but then there was a new access now; we can't know
            // if it'll succeed so lets just call it evicted
            Status::Downloading => Ok(()),
            // either the download which was started after eviction completed already, or it was
            // never evicted
            Status::Resident => Err(EvictionError::Downloaded),
        }
    }

    /// Cancellation safe.
async fn get_or_maybe_download( self: &Arc, allow_download: bool, ctx: &RequestContext, ) -> Result, DownloadError> { let mut wait_for_download_recorder = scopeguard::guard(utils::elapsed_accum::ElapsedAccum::default(), |accum| { ctx.ondemand_download_wait_observe(accum.get()); }); let (weak, permit) = { // get_or_init_detached can: // - be fast (mutex lock) OR uncontested semaphore permit acquire // - be slow (wait for semaphore permit or closing) let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); let locked = self .inner .get_or_init_detached_measured(Some(&mut wait_for_download_recorder)) .await .map(|mut guard| guard.get_and_upgrade().ok_or(guard)); scopeguard::ScopeGuard::into_inner(init_cancelled); match locked { // this path could had been a RwLock::read Ok(Ok((strong, upgraded))) if !upgraded => return Ok(strong), Ok(Ok((strong, _))) => { // when upgraded back, the Arc is still available, but // previously a `evict_and_wait` was received. this is the only place when we // send out an update without holding the InitPermit. // // note that we also have dropped the Guard; this is fine, because we just made // a state change and are holding a strong reference to be returned. self.status.as_ref().unwrap().send_replace(Status::Resident); LAYER_IMPL_METRICS .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess); return Ok(strong); } Ok(Err(guard)) => { // path to here: we won the eviction, the file should still be on the disk. 
let (weak, permit) = guard.take_and_deinit(); (Some(weak), permit) } Err(permit) => (None, permit), } }; let _guard = wait_for_download_recorder.guard(); if let Some(weak) = weak { // only drop the weak after dropping the heavier_once_cell guard assert!( matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)), "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug" ); } let timeline = self .timeline .upgrade() .ok_or(DownloadError::TimelineShutdown)?; // count cancellations, which currently remain largely unexpected let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); // check if we really need to be downloaded: this can happen if a read access won the // semaphore before eviction. // // if we are cancelled while doing this `stat` the `self.inner` will be uninitialized. a // pending eviction will try to evict even upon finding an uninitialized `self.inner`. let needs_download = self .needs_download() .await .map_err(DownloadError::PreStatFailed); scopeguard::ScopeGuard::into_inner(init_cancelled); let needs_download = needs_download?; let Some(reason) = needs_download else { // the file is present locally because eviction has not had a chance to run yet #[cfg(test)] self.failpoint(failpoints::FailpointKind::AfterDeterminingLayerNeedsNoDownload) .await?; LAYER_IMPL_METRICS.inc_init_needed_no_download(); return Ok(self.initialize_after_layer_is_on_disk(permit)); }; // we must download; getting cancelled before spawning the download is not an issue as // any still running eviction would not find anything to evict. 
if let NeedsDownload::NotFile(ft) = reason { return Err(DownloadError::NotFile(ft)); } self.check_expected_download(ctx)?; if !allow_download { // this is only used from tests, but it is hard to test without the boolean return Err(DownloadError::DownloadRequired); } let ctx = RequestContextBuilder::from(ctx) .perf_span(|crnt_perf_span| { info_span!( target: PERF_TRACE_TARGET, parent: crnt_perf_span, "DOWNLOAD_LAYER", layer = %self, reason = %reason, ) }) .attached_child(); async move { tracing::info!(%reason, "downloading on-demand"); let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); let res = self .download_init_and_wait(timeline, permit, ctx.attached_child()) .maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone()) .await?; scopeguard::ScopeGuard::into_inner(init_cancelled); Ok(res) } .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) .await } /// Nag or fail per RequestContext policy fn check_expected_download(&self, ctx: &RequestContext) -> Result<(), DownloadError> { use crate::context::DownloadBehavior::*; let b = ctx.download_behavior(); match b { Download => Ok(()), Warn | Error => { tracing::info!( "unexpectedly on-demand downloading for task kind {:?}", ctx.task_kind() ); crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc(); let really_error = matches!(b, Error) && !self.conf.ondemand_download_behavior_treat_error_as_warn; if really_error { // this check is only probablistic, seems like flakyness footgun Err(DownloadError::ContextAndConfigReallyDeniesDownloads) } else { Ok(()) } } } } /// Actual download, at most one is executed at the time. 
async fn download_init_and_wait( self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, ctx: RequestContext, ) -> Result, DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); let (tx, rx) = tokio::sync::oneshot::channel(); let this: Arc = self.clone(); let guard = timeline .gate .enter() .map_err(|_| DownloadError::DownloadCancelled)?; Self::spawn( async move { let _guard = guard; // now that we have commited to downloading, send out an update to: // - unhang any pending eviction // - break out of evict_and_wait this.status .as_ref() .unwrap() .send_replace(Status::Downloading); #[cfg(test)] this.failpoint(failpoints::FailpointKind::WaitBeforeDownloading) .await .unwrap(); let res = this.download_and_init(timeline, permit, &ctx).await; if let Err(res) = tx.send(res) { match res { Ok(_res) => { tracing::debug!("layer initialized, but caller has been cancelled"); LAYER_IMPL_METRICS.inc_init_completed_without_requester(); } Err(e) => { tracing::info!( "layer file download failed, and caller has been cancelled: {e:?}" ); LAYER_IMPL_METRICS.inc_download_failed_without_requester(); } } } } .in_current_span(), ); match rx.await { Ok(Ok(res)) => Ok(res), Ok(Err(remote_storage::DownloadError::Cancelled)) => { Err(DownloadError::DownloadCancelled) } Ok(Err(_)) => Err(DownloadError::DownloadFailed), Err(_gone) => Err(DownloadError::DownloadCancelled), } } async fn download_and_init( self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, ctx: &RequestContext, ) -> Result, remote_storage::DownloadError> { let start = std::time::Instant::now(); let result = timeline .remote_client .download_layer_file( &self.desc.layer_name(), &self.metadata(), &self.path, &timeline.gate, &timeline.cancel, ctx, ) .await; let latency = start.elapsed(); let latency_millis = u64::try_from(latency.as_millis()).unwrap(); match result { Ok(size) => { assert_eq!(size, self.desc.file_size); match self.needs_download().await { Ok(Some(reason)) => { // this is 
really a bug in needs_download or remote timeline client panic!("post-condition failed: needs_download returned {reason:?}"); } Ok(None) => { // as expected } Err(e) => { panic!("post-condition failed: needs_download errored: {e:?}"); } }; tracing::info!(size=%self.desc.file_size, %latency_millis, "on-demand download successful"); timeline .metrics .resident_physical_size_add(self.desc.file_size); self.consecutive_failures.store(0, Ordering::Relaxed); let since_last_eviction = self .last_evicted_at .lock() .unwrap() .take() .map(|ts| ts.elapsed()); if let Some(since_last_eviction) = since_last_eviction { LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); } self.access_stats.record_residence_event(); let task_kind: &'static str = ctx.task_kind().into(); ONDEMAND_DOWNLOAD_BYTES .with_label_values(&[task_kind]) .inc_by(self.desc.file_size); ONDEMAND_DOWNLOAD_COUNT .with_label_values(&[task_kind]) .inc(); Ok(self.initialize_after_layer_is_on_disk(permit)) } Err(e) => { let consecutive_failures = 1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed); if timeline.cancel.is_cancelled() { // If we're shutting down, drop out before logging the error return Err(e); } tracing::error!(consecutive_failures, %latency_millis, "layer file download failed: {e:#}"); let backoff = utils::backoff::exponential_backoff_duration_seconds( consecutive_failures.min(u32::MAX as usize) as u32, 1.5, 60.0, ); let backoff = std::time::Duration::from_secs_f64(backoff); tokio::select! { _ = tokio::time::sleep(backoff) => {}, _ = timeline.cancel.cancelled() => {}, }; Err(e) } } } /// Initializes the `Self::inner` to a "resident" state. /// /// Callers are assumed to ensure that the file is actually on disk with `Self::needs_download` /// before calling this method. /// /// If this method is ever made async, it needs to be cancellation safe so that no state /// changes are made before we can write to the OnceCell in non-cancellable fashion. 
fn initialize_after_layer_is_on_disk( self: &Arc, permit: heavier_once_cell::InitPermit, ) -> Arc { debug_assert_current_span_has_tenant_and_timeline_id(); // disable any scheduled but not yet running eviction deletions for this initialization let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); self.status.as_ref().unwrap().send_replace(Status::Resident); let res = Arc::new(DownloadedLayer { owner: Arc::downgrade(self), kind: tokio::sync::OnceCell::default(), version: next_version, }); let waiters = self.inner.initializer_count(); if waiters > 0 { tracing::info!(waiters, "completing layer init for other tasks"); } let value = ResidentOrWantedEvicted::Resident(res.clone()); self.inner.set(value, permit); res } async fn needs_download(&self) -> Result, std::io::Error> { match tokio::fs::metadata(&self.path).await { Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()), Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)), Err(e) => Err(e), } } fn needs_download_blocking(&self) -> Result, std::io::Error> { match self.path.metadata() { Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()), Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)), Err(e) => Err(e), } } fn is_file_present_and_good_size(&self, m: &std::fs::Metadata) -> Result<(), NeedsDownload> { // in future, this should include sha2-256 validation of the file. 
if !m.is_file() { Err(NeedsDownload::NotFile(m.file_type())) } else if m.len() != self.desc.file_size { Err(NeedsDownload::WrongSize { actual: m.len(), expected: self.desc.file_size, }) } else { Ok(()) } } fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { let layer_name = self.desc.layer_name().to_string(); let resident = self .inner .get() .map(|rowe| rowe.is_likely_resident()) .unwrap_or(false); let access_stats = self.access_stats.as_api_model(reset); if self.desc.is_delta { let lsn_range = &self.desc.lsn_range; HistoricLayerInfo::Delta { layer_file_name: layer_name, layer_file_size: self.desc.file_size, lsn_start: lsn_range.start, lsn_end: lsn_range.end, remote: !resident, access_stats, l0: crate::tenant::layer_map::LayerMap::is_l0( &self.layer_desc().key_range, self.layer_desc().is_delta, ), } } else { let lsn = self.desc.image_layer_lsn(); HistoricLayerInfo::Image { layer_file_name: layer_name, layer_file_size: self.desc.file_size, lsn_start: lsn, remote: !resident, access_stats, } } } /// `DownloadedLayer` is being dropped, so it calls this method. fn on_downloaded_layer_drop(self: Arc, only_version: usize) { // we cannot know without inspecting LayerInner::inner if we should evict or not, even // though here it is very likely let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version); // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might // drop while the `self.inner` is being locked, leading to a deadlock. 
let start_evicting = async move { #[cfg(test)] self.failpoint(failpoints::FailpointKind::WaitBeforeStartingEvicting) .await .expect("failpoint should not have errored"); tracing::debug!("eviction started"); let res = self.wait_for_turn_and_evict(only_version).await; // metrics: ignore the Ok branch, it is not done yet if let Err(e) = res { tracing::debug!(res=?Err::<(), _>(&e), "eviction completed"); LAYER_IMPL_METRICS.inc_eviction_cancelled(e); } }; Self::spawn(start_evicting.instrument(span)); } async fn wait_for_turn_and_evict( self: Arc, only_version: usize, ) -> Result<(), EvictionCancelled> { fn is_good_to_continue(status: &Status) -> Result<(), EvictionCancelled> { use Status::*; match status { Resident => Ok(()), Evicted => Err(EvictionCancelled::UnexpectedEvictedState), Downloading => Err(EvictionCancelled::LostToDownload), } } let timeline = self .timeline .upgrade() .ok_or(EvictionCancelled::TimelineGone)?; let mut rx = self .status .as_ref() .expect("LayerInner cannot be dropped, holding strong ref") .subscribe(); is_good_to_continue(&rx.borrow_and_update())?; let Ok(gate) = timeline.gate.enter() else { return Err(EvictionCancelled::TimelineGone); }; let permit = { // we cannot just `std::fs::remove_file` because there might already be an // get_or_maybe_download which will inspect filesystem and reinitialize. filesystem // operations must be done while holding the heavier_once_cell::InitPermit let mut wait = std::pin::pin!(self.inner.get_or_init_detached()); let waited = loop { // we must race to the Downloading starting, otherwise we would have to wait until the // completion of the download. waiting for download could be long and hinder our // efforts to alert on "hanging" evictions. tokio::select! 
{ res = &mut wait => break res, _ = rx.changed() => { is_good_to_continue(&rx.borrow_and_update())?; // two possibilities for Status::Resident: // - the layer was found locally from disk by a read // - we missed a bunch of updates and now the layer is // again downloaded -- assume we'll fail later on with // version check or AlreadyReinitialized } } }; // re-check now that we have the guard or permit; all updates should have happened // while holding the permit. is_good_to_continue(&rx.borrow_and_update())?; // the term deinitialize is used here, because we clearing out the Weak will eventually // lead to deallocating the reference counted value, and the value we // `Guard::take_and_deinit` is likely to be the last because the Weak is never cloned. let (_weak, permit) = match waited { Ok(guard) => { match &*guard { ResidentOrWantedEvicted::WantedEvicted(_weak, version) if *version == only_version => { tracing::debug!(version, "deinitializing matching WantedEvicted"); let (weak, permit) = guard.take_and_deinit(); (Some(weak), permit) } ResidentOrWantedEvicted::WantedEvicted(_, version) => { // if we were not doing the version check, we would need to try to // upgrade the weak here to see if it really is dropped. version check // is done instead assuming that it is cheaper. tracing::debug!( version, only_version, "version mismatch, not deinitializing" ); return Err(EvictionCancelled::VersionCheckFailed); } ResidentOrWantedEvicted::Resident(_) => { return Err(EvictionCancelled::AlreadyReinitialized); } } } Err(permit) => { tracing::debug!("continuing after cancelled get_or_maybe_download or eviction"); (None, permit) } }; permit }; let span = tracing::Span::current(); let spawned_at = std::time::Instant::now(); // this is on purpose a detached spawn; we don't need to wait for it // // eviction completion reporting is the only thing hinging on this, and it can be just as // well from a spawn_blocking thread. 
// // important to note that now that we've acquired the permit we have made sure the evicted // file is either the exact `WantedEvicted` we wanted to evict, or uninitialized in case // there are multiple evictions. The rest is not cancellable, and we've now commited to // evicting. // // If spawn_blocking has a queue and maximum number of threads are in use, we could stall // reads. We will need to add cancellation for that if necessary. Self::spawn_blocking(move || { let _span = span.entered(); let res = self.evict_blocking(&timeline, &gate, &permit); let waiters = self.inner.initializer_count(); if waiters > 0 { LAYER_IMPL_METRICS.inc_evicted_with_waiters(); } let completed_in = spawned_at.elapsed(); LAYER_IMPL_METRICS.record_time_to_evict(completed_in); match res { Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(), Err(e) => LAYER_IMPL_METRICS.inc_eviction_cancelled(e), } tracing::debug!(?res, elapsed_ms=%completed_in.as_millis(), %waiters, "eviction completed"); }); Ok(()) } /// This is blocking only to do just one spawn_blocking hop compared to multiple via tokio::fs. fn evict_blocking( &self, timeline: &Timeline, _gate: &gate::GateGuard, _permit: &heavier_once_cell::InitPermit, ) -> Result<(), EvictionCancelled> { // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit` match capture_mtime_and_remove(&self.path) { Ok(local_layer_mtime) => { let duration = SystemTime::now().duration_since(local_layer_mtime); match duration { Ok(elapsed) => { let accessed_and_visible = self.access_stats.accessed() && self.access_stats.visibility() == LayerVisibilityHint::Visible; if accessed_and_visible { // Only layers used for reads contribute to our "low residence" metric that is used // to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed // to be rapidly evicted without contributing to this metric. 
timeline .metrics .evictions_with_low_residence_duration .read() .unwrap() .observe(elapsed); } tracing::info!( residence_millis = elapsed.as_millis(), accessed_and_visible, "evicted layer after known residence period" ); } Err(_) => { tracing::info!("evicted layer after unknown residence period"); } } timeline.metrics.evictions.inc(); timeline .metrics .resident_physical_size_sub(self.desc.file_size); } Err(e) if e.kind() == std::io::ErrorKind::NotFound => { tracing::error!( layer_size = %self.desc.file_size, "failed to evict layer from disk, it was already gone" ); return Err(EvictionCancelled::FileNotFound); } Err(e) => { // FIXME: this should probably be an abort tracing::error!("failed to evict file from disk: {e:#}"); return Err(EvictionCancelled::RemoveFailed); } } self.access_stats.record_residence_event(); *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now()); self.status.as_ref().unwrap().send_replace(Status::Evicted); Ok(()) } fn metadata(&self) -> LayerFileMetadata { LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard) } /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME. /// /// Synchronizing with spawned tasks is very complicated otherwise. fn spawn(fut: F) where F: std::future::Future + Send + 'static, { #[cfg(test)] tokio::task::spawn(fut); #[cfg(not(test))] crate::task_mgr::BACKGROUND_RUNTIME.spawn(fut); } /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME. 
fn spawn_blocking<F>(f: F)
where
    F: FnOnce() + Send + 'static,
{
    #[cfg(test)]
    tokio::task::spawn_blocking(f);
    #[cfg(not(test))]
    crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(f);
}
} // closes the enclosing impl block, whose header is above this block

/// Removes the file at `path`, returning the modification time it had just before removal.
///
/// Any failure to stat, to read the mtime or to remove is returned as is; the file is only
/// removed after the mtime has been read successfully.
fn capture_mtime_and_remove(path: &Utf8Path) -> Result<SystemTime, std::io::Error> {
    let m = path.metadata()?;
    let local_layer_mtime = m.modified()?;
    std::fs::remove_file(path)?;
    Ok(local_layer_mtime)
}

/// Reasons for `evict_and_wait` to not report a completed eviction.
#[derive(Debug, thiserror::Error)]
pub(crate) enum EvictionError {
    #[error("layer was already evicted")]
    NotFound,

    /// Evictions must always lose to downloads in races, and this time it happened.
    #[error("layer was downloaded instead")]
    Downloaded,

    #[error("eviction did not happen within timeout")]
    Timeout,
}

/// Error internal to the [`LayerInner::get_or_maybe_download`]
#[derive(Debug, thiserror::Error)]
pub(crate) enum DownloadError {
    #[error("timeline has already shutdown")]
    TimelineShutdown,
    #[error("context denies downloading")]
    ContextAndConfigReallyDeniesDownloads,
    #[error("downloading is really required but not allowed by this method")]
    DownloadRequired,
    #[error("layer path exists, but it is not a file: {0:?}")]
    NotFile(std::fs::FileType),
    /// Why no error here? Because it will be reported by page_service. We should had also done
    /// retries already.
    #[error("downloading evicted layer file failed")]
    DownloadFailed,
    #[error("downloading failed, possibly for shutdown")]
    DownloadCancelled,
    #[error("pre-condition: stat before download failed")]
    PreStatFailed(#[source] std::io::Error),

    #[cfg(test)]
    #[error("failpoint: {0:?}")]
    Failpoint(failpoints::FailpointKind),
}

impl DownloadError {
    /// True only for [`DownloadError::DownloadCancelled`].
    pub(crate) fn is_cancelled(&self) -> bool {
        matches!(self, DownloadError::DownloadCancelled)
    }
}

/// Why the local file cannot be used as is.
#[derive(Debug, PartialEq, Copy, Clone)]
pub(crate) enum NeedsDownload {
    NotFound,
    NotFile(std::fs::FileType),
    WrongSize { actual: u64, expected: u64 },
}

impl std::fmt::Display for NeedsDownload {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            NeedsDownload::NotFound => write!(f, "file was not found"),
            NeedsDownload::NotFile(ft) => write!(f, "path is not a file; {ft:?}"),
            NeedsDownload::WrongSize { actual, expected } => {
                write!(f, "file size mismatch {actual} vs. {expected}")
            }
        }
    }
}

/// Existence of `DownloadedLayer` means that we have the file locally, and can later evict it.
pub(crate) struct DownloadedLayer {
    owner: Weak<LayerInner>,
    // Use tokio OnceCell as we do not need to deinitialize this, it'll just get dropped with the
    // DownloadedLayer
    kind: tokio::sync::OnceCell<anyhow::Result<LayerKind>>,
    version: usize,
}

impl std::fmt::Debug for DownloadedLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DownloadedLayer")
            // owner omitted because it is always "Weak"
            .field("kind", &self.kind)
            .field("version", &self.version)
            .finish()
    }
}

impl Drop for DownloadedLayer {
    fn drop(&mut self) {
        if let Some(owner) = self.owner.upgrade() {
            owner.on_downloaded_layer_drop(self.version);
        } else {
            // Layer::drop will handle cancelling the eviction; because of drop order and
            // `DownloadedLayer` never leaking, we cannot know here if eviction was requested.
        }
    }
}

impl DownloadedLayer {
    /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`].
    /// Failure to load the layer is sticky, i.e., future `get()` calls will return
    /// the initial load failure immediately.
    ///
    /// `owner` parameter is a strong reference at the same `LayerInner` as the
    /// `DownloadedLayer::owner` would be when upgraded. Given how this method ends up called,
    /// we will always have the LayerInner on the callstack, so we can just use it.
    async fn get<'a>(
        &'a self,
        owner: &Arc<LayerInner>,
        ctx: &RequestContext,
    ) -> anyhow::Result<&'a LayerKind> {
        let init = || async {
            assert_eq!(
                Weak::as_ptr(&self.owner),
                Arc::as_ptr(owner),
                "these are the same, just avoiding the upgrade"
            );

            let res = if owner.desc.is_delta {
                let ctx = RequestContextBuilder::from(ctx)
                    .page_content_kind(crate::context::PageContentKind::DeltaLayerSummary)
                    .attached_child();
                let summary = Some(delta_layer::Summary::expected(
                    owner.desc.tenant_shard_id.tenant_id,
                    owner.desc.timeline_id,
                    owner.desc.key_range.clone(),
                    owner.desc.lsn_range.clone(),
                ));
                delta_layer::DeltaLayerInner::load(
                    &owner.path,
                    summary,
                    Some(owner.conf.max_vectored_read_bytes),
                    &ctx,
                )
                .await
                .map(LayerKind::Delta)
            } else {
                let ctx = RequestContextBuilder::from(ctx)
                    .page_content_kind(crate::context::PageContentKind::ImageLayerSummary)
                    .attached_child();
                let lsn = owner.desc.image_layer_lsn();
                let summary = Some(image_layer::Summary::expected(
                    owner.desc.tenant_shard_id.tenant_id,
                    owner.desc.timeline_id,
                    owner.desc.key_range.clone(),
                    lsn,
                ));
                image_layer::ImageLayerInner::load(
                    &owner.path,
                    lsn,
                    summary,
                    Some(owner.conf.max_vectored_read_bytes),
                    &ctx,
                )
                .await
                .map(LayerKind::Image)
            };

            match res {
                Ok(layer) => Ok(layer),
                Err(err) => {
                    LAYER_IMPL_METRICS.inc_permanent_loading_failures();
                    // We log this message once over the lifetime of `Self`
                    // => Ok and good to log backtrace and path here.
                    tracing::error!(
                        "layer load failed, assuming permanent failure: {}: {err:?}",
                        owner.path
                    );
                    Err(err)
                }
            }
        };
        self.kind
            .get_or_init(init)
            .await
            .as_ref()
            // We already logged the full backtrace above, once. Don't repeat that here.
            .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
    }

    /// Loads the layer if needed and forwards the read to the delta or image implementation.
    async fn get_values_reconstruct_data(
        &self,
        this: ResidentLayer,
        keyspace: KeySpace,
        lsn_range: Range<Lsn>,
        reconstruct_data: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
        use LayerKind::*;

        match self
            .get(&this.owner.0, ctx)
            .await
            .map_err(GetVectoredError::Other)?
        {
            Delta(d) => {
                d.get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx)
                    .await
            }
            // the image implementation is not passed an LSN range
            Image(i) => {
                i.get_values_reconstruct_data(this, keyspace, reconstruct_data, ctx)
                    .await
            }
        }
    }

    /// Loads the layer if needed and calls `dump` of the delta or image implementation.
    async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
        use LayerKind::*;
        match self.get(owner, ctx).await? {
            Delta(d) => d.dump(ctx).await?,
            Image(i) => i.dump(ctx).await?,
        }

        Ok(())
    }
}

/// Wrapper around an actual layer implementation.
#[derive(Debug)]
enum LayerKind {
    Delta(delta_layer::DeltaLayerInner),
    Image(image_layer::ImageLayerInner),
}

/// Guard for forcing a layer be resident while it exists.
#[derive(Clone)]
pub struct ResidentLayer {
    owner: Layer,
    downloaded: Arc<DownloadedLayer>,
}

impl std::fmt::Display for ResidentLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.owner)
    }
}

impl std::fmt::Debug for ResidentLayer {
    // deliberately the same output as `Display`
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.owner)
    }
}

impl ResidentLayer {
    /// Release the eviction guard, converting back into a plain [`Layer`].
    ///
    /// You can access the [`Layer`] also by using `as_ref`.
    pub(crate) fn drop_eviction_guard(self) -> Layer {
        self.into()
    }

    /// Loads all keys stored in the layer. Returns key, lsn and value size.
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
// NOTE(review): the return type below reads `anyhow::Result>`; a generic argument appears to
// have been lost before this text reached review and cannot be pinned from this block alone
// -- restore it from version control, TODO confirm
pub(crate) async fn load_keys<'a>(
    &'a self,
    ctx: &RequestContext,
) -> anyhow::Result> {
    use LayerKind::*;

    let owner = &self.owner.0;
    let inner = self.downloaded.get(owner, ctx).await?;

    // this is valid because the DownloadedLayer::kind is a OnceCell, not a
    // Mutex, so we cannot go and deinitialize the value with OnceCell::take
    // while it's being held.
    self.owner.record_access(ctx);

    let res = match inner {
        Delta(d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await,
        Image(i) => image_layer::ImageLayerInner::load_keys(i, ctx).await,
    };
    res.with_context(|| format!("Layer index is corrupted for {self}"))
}

/// Read all the keys in this layer which match the ShardIdentity, and write them all to
/// the provided writer. Return the number of keys written.
///
/// Only image layers can be filtered; a delta layer yields `CompactionError::Other`.
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
// NOTE(review): the return type below reads a bare `Result`; its generic arguments appear to
// have been lost. The error side is presumably `CompactionError` given the `map_err` calls in
// the body; the success type cannot be told from here -- TODO confirm
pub(crate) async fn filter(
    &self,
    shard_identity: &ShardIdentity,
    writer: &mut ImageLayerWriter,
    ctx: &RequestContext,
) -> Result {
    use LayerKind::*;

    match self
        .downloaded
        .get(&self.owner.0, ctx)
        .await
        .map_err(CompactionError::Other)?
    {
        Delta(_) => {
            return Err(CompactionError::Other(anyhow::anyhow!(format!(
                "cannot filter() on a delta layer {self}"
            ))));
        }
        Image(i) => i
            .filter(shard_identity, writer, ctx)
            .await
            .map_err(CompactionError::Other),
    }
}

/// Returns the amount of keys and values written to the writer.
///
/// Only delta layers are supported; an image layer yields an error.
// NOTE(review): the return type below reads `anyhow::Result` without an argument; the count
// type appears to have been lost and cannot be told from here -- TODO confirm
pub(crate) async fn copy_delta_prefix(
    &self,
    writer: &mut super::delta_layer::DeltaLayerWriter,
    until: Lsn,
    ctx: &RequestContext,
) -> anyhow::Result {
    use LayerKind::*;

    let owner = &self.owner.0;

    match self.downloaded.get(owner, ctx).await? {
        Delta(d) => d
            .copy_prefix(writer, until, ctx)
            .await
            .with_context(|| format!("copy_delta_prefix until {until} of {self}")),
        Image(_) => anyhow::bail!(format!("cannot copy_lsn_prefix of image layer {self}")),
    }
}

/// Path of the layer file on the local filesystem.
pub(crate) fn local_path(&self) -> &Utf8Path {
    &self.owner.0.path
}

/// Metadata of the layer file, as produced by the owning layer.
pub(crate) fn metadata(&self) -> LayerFileMetadata {
    self.owner.metadata()
}

/// Cast the layer to a delta, return an error if it is an image layer.
pub(crate) async fn get_as_delta(
    &self,
    ctx: &RequestContext,
) -> anyhow::Result<&delta_layer::DeltaLayerInner> {
    use LayerKind::*;
    match self.downloaded.get(&self.owner.0, ctx).await? {
        Delta(d) => Ok(d),
        Image(_) => Err(anyhow::anyhow!("image layer")),
    }
}

/// Cast the layer to an image, return an error if it is a delta layer.
pub(crate) async fn get_as_image(
    &self,
    ctx: &RequestContext,
) -> anyhow::Result<&image_layer::ImageLayerInner> {
    use LayerKind::*;
    match self.downloaded.get(&self.owner.0, ctx).await? {
        Image(d) => Ok(d),
        Delta(_) => Err(anyhow::anyhow!("delta layer")),
    }
}
} // closes the enclosing impl block, whose header is above this block

impl AsLayerDesc for ResidentLayer {
    fn layer_desc(&self) -> &PersistentLayerDesc {
        self.owner.layer_desc()
    }
}

// NOTE(review): presumably `AsRef<Layer>`, given that `as_ref` returns `&Layer`; the generic
// argument appears to have been lost -- TODO confirm
impl AsRef for ResidentLayer {
    fn as_ref(&self) -> &Layer {
        &self.owner
    }
}

/// Drop the eviction guard.
impl From for Layer { fn from(value: ResidentLayer) -> Self { value.owner } } use metrics::IntCounter; pub(crate) struct LayerImplMetrics { started_evictions: IntCounter, completed_evictions: IntCounter, cancelled_evictions: enum_map::EnumMap, started_deletes: IntCounter, completed_deletes: IntCounter, failed_deletes: enum_map::EnumMap, rare_counters: enum_map::EnumMap, inits_cancelled: metrics::core::GenericCounter, redownload_after: metrics::Histogram, time_to_evict: metrics::Histogram, } impl Default for LayerImplMetrics { fn default() -> Self { use enum_map::Enum; // reminder: these will be pageserver_layer_* with "_total" suffix let started_evictions = metrics::register_int_counter!( "pageserver_layer_started_evictions", "Evictions started in the Layer implementation" ) .unwrap(); let completed_evictions = metrics::register_int_counter!( "pageserver_layer_completed_evictions", "Evictions completed in the Layer implementation" ) .unwrap(); let cancelled_evictions = metrics::register_int_counter_vec!( "pageserver_layer_cancelled_evictions_count", "Different reasons for evictions to have been cancelled or failed", &["reason"] ) .unwrap(); let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| { let reason = EvictionCancelled::from_usize(i); let s = reason.as_str(); cancelled_evictions.with_label_values(&[s]) })); let started_deletes = metrics::register_int_counter!( "pageserver_layer_started_deletes", "Deletions on drop pending in the Layer implementation" ) .unwrap(); let completed_deletes = metrics::register_int_counter!( "pageserver_layer_completed_deletes", "Deletions on drop completed in the Layer implementation" ) .unwrap(); let failed_deletes = metrics::register_int_counter_vec!( "pageserver_layer_failed_deletes_count", "Different reasons for deletions on drop to have failed", &["reason"] ) .unwrap(); let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| { let reason = DeleteFailed::from_usize(i); let s = 
reason.as_str(); failed_deletes.with_label_values(&[s]) })); let rare_counters = metrics::register_int_counter_vec!( "pageserver_layer_assumed_rare_count", "Times unexpected or assumed rare event happened", &["event"] ) .unwrap(); let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| { let event = RareEvent::from_usize(i); let s = event.as_str(); rare_counters.with_label_values(&[s]) })); let inits_cancelled = metrics::register_int_counter!( "pageserver_layer_inits_cancelled_count", "Times Layer initialization was cancelled", ) .unwrap(); let redownload_after = { let minute = 60.0; let hour = 60.0 * minute; metrics::register_histogram!( "pageserver_layer_redownloaded_after", "Time between evicting and re-downloading.", vec![ 10.0, 30.0, minute, 5.0 * minute, 15.0 * minute, 30.0 * minute, hour, 12.0 * hour, ] ) .unwrap() }; let time_to_evict = metrics::register_histogram!( "pageserver_layer_eviction_held_permit_seconds", "Time eviction held the permit.", vec![0.001, 0.010, 0.100, 0.500, 1.000, 5.000] ) .unwrap(); Self { started_evictions, completed_evictions, cancelled_evictions, started_deletes, completed_deletes, failed_deletes, rare_counters, inits_cancelled, redownload_after, time_to_evict, } } } impl LayerImplMetrics { fn inc_started_evictions(&self) { self.started_evictions.inc(); } fn inc_completed_evictions(&self) { self.completed_evictions.inc(); } fn inc_eviction_cancelled(&self, reason: EvictionCancelled) { self.cancelled_evictions[reason].inc() } fn inc_started_deletes(&self) { self.started_deletes.inc(); } fn inc_completed_deletes(&self) { self.completed_deletes.inc(); } fn inc_deletes_failed(&self, reason: DeleteFailed) { self.failed_deletes[reason].inc(); } /// Counted separatedly from failed layer deletes because we will complete the layer deletion /// attempt regardless of failure to delete local file. 
fn inc_delete_removes_failed(&self) {
        self.bump_rare(RareEvent::RemoveOnDropFailed);
    }

    /// Increments the rare-event counter for `event`; shared by the `inc_*` helpers below.
    fn bump_rare(&self, event: RareEvent) {
        let counter = &self.rare_counters[event];
        counter.inc();
    }

    /// Cancellations are expected to be rare, and so is this. Cancellations can however happen
    /// independently of the single caller that is able to start the download, so a dedicated
    /// counter is used to separate the two.
    fn inc_init_completed_without_requester(&self) {
        self.bump_rare(RareEvent::InitCompletedWithoutRequester);
    }

    /// Expected to be rare: neither cancellations nor failures are expected.
    fn inc_download_failed_without_requester(&self) {
        self.bump_rare(RareEvent::DownloadFailedWithoutRequester);
    }

    /// Upgrading the Weak in ResidentOrWantedEvicted::WantedEvicted succeeded.
    ///
    /// Should this counter always stay at zero, the ResidentOrWantedEvicted type ought to be
    /// replaced with an Option.
    fn inc_raced_wanted_evicted_accesses(&self) {
        self.bump_rare(RareEvent::UpgradedWantedEvicted);
    }

    /// When running with remote storage, these are only expected in the amount counted by
    /// [`Self::inc_init_cancelled`].
    fn inc_init_needed_no_download(&self) {
        self.bump_rare(RareEvent::InitWithoutDownload);
    }

    /// Expected to be rare, since every layer file should be readable and good.
    fn inc_permanent_loading_failures(&self) {
        self.bump_rare(RareEvent::PermanentLoadingFailure);
    }

    fn inc_init_cancelled(&self) {
        let counter = &self.inits_cancelled;
        counter.inc();
    }

    fn record_redownloaded_after(&self, duration: std::time::Duration) {
        let seconds = duration.as_secs_f64();
        self.redownload_after.observe(seconds);
    }

    /// Seeing this would be bad, or a sign of extreme disk pressure. Eviction should probably be
    /// cancelled instead when there are read waiters. Reads cannot be told apart from other
    /// evictions though, so this counter may be noisy.
    fn inc_evicted_with_waiters(&self) {
        self.bump_rare(RareEvent::EvictedWithWaiters);
    }

    /// Recorded at least initially as the permit is now acquired in async context before
    /// spawn_blocking action.
fn record_time_to_evict(&self, duration: std::time::Duration) {
        self.time_to_evict.observe(duration.as_secs_f64())
    }
}

/// Reasons recorded when an eviction was cancelled or failed; each variant maps to one label
/// value through [`EvictionCancelled::as_str`].
#[derive(Debug, Clone, Copy, enum_map::Enum)]
enum EvictionCancelled {
    LayerGone,
    TimelineGone,
    VersionCheckFailed,
    FileNotFound,
    RemoveFailed,
    AlreadyReinitialized,
    /// Not evicted because of a pending reinitialization
    LostToDownload,
    /// After eviction, there was a new layer access which cancelled the eviction.
    UpgradedBackOnAccess,
    UnexpectedEvictedState,
}

impl EvictionCancelled {
    /// Label value used for this reason in the metrics.
    fn as_str(&self) -> &'static str {
        match self {
            EvictionCancelled::LayerGone => "layer_gone",
            EvictionCancelled::TimelineGone => "timeline_gone",
            EvictionCancelled::VersionCheckFailed => "version_check_fail",
            EvictionCancelled::FileNotFound => "file_not_found",
            EvictionCancelled::RemoveFailed => "remove_failed",
            EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
            EvictionCancelled::LostToDownload => "lost_to_download",
            EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
            EvictionCancelled::UnexpectedEvictedState => "unexpected_evicted_state",
        }
    }
}

/// Reasons recorded when a deletion on drop failed.
#[derive(enum_map::Enum)]
enum DeleteFailed {
    TimelineGone,
    DeleteSchedulingFailed,
}

impl DeleteFailed {
    /// Label value used for this reason in the metrics.
    fn as_str(&self) -> &'static str {
        match self {
            DeleteFailed::TimelineGone => "timeline_gone",
            DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
        }
    }
}

/// Events that are unexpected or assumed to be rare; each has its own labelled counter.
#[derive(enum_map::Enum)]
enum RareEvent {
    RemoveOnDropFailed,
    InitCompletedWithoutRequester,
    DownloadFailedWithoutRequester,
    UpgradedWantedEvicted,
    InitWithoutDownload,
    PermanentLoadingFailure,
    EvictedWithWaiters,
}

impl RareEvent {
    /// Label value used for this event in the metrics.
    fn as_str(&self) -> &'static str {
        use RareEvent::*;

        match self {
            RemoveOnDropFailed => "remove_on_drop_failed",
            InitCompletedWithoutRequester => "init_completed_without",
            DownloadFailedWithoutRequester => "download_failed_without",
            UpgradedWantedEvicted => "raced_wanted_evicted",
            InitWithoutDownload => "init_needed_no_download",
            PermanentLoadingFailure => "permanent_loading_failure",
            EvictedWithWaiters => "evicted_with_waiters",
        }
    }
}

/// Process-wide metrics instance, built on first access.
pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
    once_cell::sync::Lazy::new(LayerImplMetrics::default);

================================================
FILE: pageserver/src/tenant/storage_layer/layer_desc.rs
================================================
use core::fmt::Display;
use std::ops::Range;

use pageserver_api::key::Key;
use pageserver_api::shard::TenantShardId;
use serde::{Deserialize, Serialize};
#[cfg(test)]
use utils::id::TenantId;
use utils::id::TimelineId;
use utils::lsn::Lsn;

use super::{DeltaLayerName, ImageLayerName, LayerName};

/// A unique identifier of a persistent layer.
///
/// This is different from `LayerDescriptor`, which is only used in the benchmarks.
/// This struct contains all necessary information to find the image / delta layer. It also provides
/// a unified way to generate layer information like file name.
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)]
pub struct PersistentLayerDesc {
    pub tenant_shard_id: TenantShardId,
    pub timeline_id: TimelineId,
    /// Range of keys that this layer covers
    pub key_range: Range<Key>,
    /// Inclusive start, exclusive end of the LSN range that this layer holds.
    ///
    /// - For an open in-memory layer, the end bound is MAX_LSN
    /// - For a frozen in-memory layer or a delta layer, the end bound is a valid lsn after the
    ///   range start
    /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
    pub lsn_range: Range<Lsn>,
    /// Whether this is a delta layer, and also, is this incremental.
    pub is_delta: bool,
    pub file_size: u64,
}

/// A unique identifier of a persistent layer within the context of one timeline.
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub struct PersistentLayerKey {
    pub key_range: Range<Key>,
    pub lsn_range: Range<Lsn>,
    pub is_delta: bool,
}

impl std::fmt::Display for PersistentLayerKey {
    /// Formats as `<key start>..<key end> <lsn start>..<lsn end> is_delta=<bool>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{}..{} {}..{} is_delta={}",
            self.key_range.start,
            self.key_range.end,
            self.lsn_range.start,
            self.lsn_range.end,
            self.is_delta
        )
    }
}

impl From<ImageLayerName> for PersistentLayerKey {
    /// An image layer name carries a single LSN; it is widened to the one-LSN range used for
    /// image layers.
    fn from(image_layer_name: ImageLayerName) -> Self {
        Self {
            key_range: image_layer_name.key_range,
            lsn_range: PersistentLayerDesc::image_layer_lsn_range(image_layer_name.lsn),
            is_delta: false,
        }
    }
}

impl From<DeltaLayerName> for PersistentLayerKey {
    fn from(delta_layer_name: DeltaLayerName) -> Self {
        Self {
            key_range: delta_layer_name.key_range,
            lsn_range: delta_layer_name.lsn_range,
            is_delta: true,
        }
    }
}

impl From<LayerName> for PersistentLayerKey {
    fn from(layer_name: LayerName) -> Self {
        match layer_name {
            LayerName::Image(i) => i.into(),
            LayerName::Delta(d) => d.into(),
        }
    }
}

impl PersistentLayerDesc {
    /// Returns the timeline-local key (key range, LSN range, delta flag) of this layer.
    pub fn key(&self) -> PersistentLayerKey {
        PersistentLayerKey {
            key_range: self.key_range.clone(),
            lsn_range: self.lsn_range.clone(),
            is_delta: self.is_delta,
        }
    }

    /// Returns a displayable short identifier, currently the layer name.
    pub fn short_id(&self) -> impl Display {
        self.layer_name()
    }

    /// Builds a descriptor with freshly generated tenant and timeline ids and a zero file size.
    #[cfg(test)]
    pub fn new_test(key_range: Range<Key>, lsn_range: Range<Lsn>, is_delta: bool) -> Self {
        Self {
            tenant_shard_id: TenantShardId::unsharded(TenantId::generate()),
            timeline_id: TimelineId::generate(),
            key_range,
            lsn_range,
            is_delta,
            file_size: 0,
        }
    }

    /// Builds the descriptor of an image layer at `lsn`.
    pub fn new_img(
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        key_range: Range<Key>,
        lsn: Lsn,
        file_size: u64,
    ) -> Self {
        Self {
            tenant_shard_id,
            timeline_id,
            key_range,
            lsn_range: Self::image_layer_lsn_range(lsn),
            is_delta: false,
            file_size,
        }
    }

    /// Builds the descriptor of a delta layer covering `lsn_range`.
    pub fn new_delta(
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        key_range: Range<Key>,
        lsn_range: Range<Lsn>,
        file_size: u64,
    ) -> Self {
        Self {
            tenant_shard_id,
            timeline_id,
            key_range,
            lsn_range,
            is_delta: true,
            file_size,
        }
    }

    /// Builds a descriptor from a parsed layer name, dispatching on image vs. delta.
    pub fn from_filename(
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        filename: LayerName,
        file_size: u64,
    ) -> Self {
        match filename {
            LayerName::Image(i) => {
                Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size)
            }
            LayerName::Delta(d) => Self::new_delta(
                tenant_shard_id,
                timeline_id,
                d.key_range,
                d.lsn_range,
                file_size,
            ),
        }
    }

    /// Get the LSN that the image layer covers.
    ///
    /// Panic: if this is a delta layer, or the LSN range does not span exactly one LSN.
    pub fn image_layer_lsn(&self) -> Lsn {
        assert!(!self.is_delta);
        assert!(self.lsn_range.start + 1 == self.lsn_range.end);
        self.lsn_range.start
    }

    /// Get the LSN range corresponding to a single image layer LSN.
    pub fn image_layer_lsn_range(lsn: Lsn) -> Range<Lsn> {
        lsn..(lsn + 1)
    }

    /// Get a delta layer name for this layer.
    ///
    /// Panic: if this is not a delta layer.
    pub fn delta_layer_name(&self) -> DeltaLayerName {
        assert!(self.is_delta);
        DeltaLayerName {
            key_range: self.key_range.clone(),
            lsn_range: self.lsn_range.clone(),
        }
    }

    /// Get a image layer name for this layer.
    ///
    /// Panic: if this is not an image layer, or the lsn range is invalid
    pub fn image_layer_name(&self) -> ImageLayerName {
        assert!(!self.is_delta);
        assert!(self.lsn_range.start + 1 == self.lsn_range.end);
        ImageLayerName {
            key_range: self.key_range.clone(),
            lsn: self.lsn_range.start,
        }
    }

    /// Get the layer name, image or delta depending on `is_delta`.
    pub fn layer_name(&self) -> LayerName {
        if self.is_delta {
            self.delta_layer_name().into()
        } else {
            self.image_layer_name().into()
        }
    }

    // TODO: remove this in the future once we refactor timeline APIs.

    pub fn get_lsn_range(&self) -> Range<Lsn> {
        self.lsn_range.clone()
    }

    pub fn get_key_range(&self) -> Range<Key> {
        self.key_range.clone()
    }

    pub fn get_timeline_id(&self) -> TimelineId {
        self.timeline_id
    }

    /// Does this layer only contain some data for the key-range (incremental),
    /// or does it contain a version of every page? This is important to know
    /// for garbage collecting old layers: an incremental layer depends on
    /// the previous non-incremental layer.
pub fn is_incremental(&self) -> bool {
        self.is_delta
    }

    pub fn is_delta(&self) -> bool {
        self.is_delta
    }

    /// Prints a one-line summary of this layer to stdout.
    pub fn dump(&self) {
        if self.is_delta {
            println!(
                "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
                self.tenant_shard_id,
                self.timeline_id,
                self.key_range.start,
                self.key_range.end,
                self.lsn_range.start,
                self.lsn_range.end,
                self.is_incremental(),
                self.file_size,
            );
        } else {
            println!(
                "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
                self.tenant_shard_id,
                self.timeline_id,
                self.key_range.start,
                self.key_range.end,
                self.image_layer_lsn(),
                self.is_incremental(),
                self.file_size
            );
        }
    }

    pub fn file_size(&self) -> u64 {
        self.file_size
    }
}

================================================
FILE: pageserver/src/tenant/storage_layer/layer_name.rs
================================================
//!
//! Helper functions for dealing with filenames of the image and delta layer files.
//!
use std::cmp::Ordering;
use std::fmt;
use std::ops::Range;
use std::str::FromStr;

use pageserver_api::key::Key;
use utils::lsn::Lsn;

use super::PersistentLayerDesc;

// Note: Timeline::load_layer_map() relies on this sort order
#[derive(PartialEq, Eq, Clone, Hash)]
pub struct DeltaLayerName {
    pub key_range: Range<Key>,
    pub lsn_range: Range<Lsn>,
}

impl std::fmt::Debug for DeltaLayerName {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        use super::RangeDisplayDebug;

        f.debug_struct("DeltaLayerName")
            .field("key_range", &RangeDisplayDebug(&self.key_range))
            .field("lsn_range", &self.lsn_range)
            .finish()
    }
}

impl PartialOrd for DeltaLayerName {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for DeltaLayerName {
    /// Orders by key range start, key range end, LSN range start, then LSN range end.
    fn cmp(&self, other: &Self) -> Ordering {
        self.key_range
            .start
            .cmp(&other.key_range.start)
            .then_with(|| self.key_range.end.cmp(&other.key_range.end))
            .then_with(|| self.lsn_range.start.cmp(&other.lsn_range.start))
            .then_with(|| self.lsn_range.end.cmp(&other.lsn_range.end))
    }
}

/// Represents the region of the LSN-Key space covered by a DeltaLayer
///
/// ```text
///    <key start>-<key end>__<LSN start>-<LSN end>-<generation>
/// ```
impl DeltaLayerName {
    /// Parse the part of a delta layer's file name that represents the LayerName. Returns None
    /// if the filename does not match the expected pattern.
    pub fn parse_str(fname: &str) -> Option<Self> {
        let (key_parts, lsn_generation_parts) = fname.split_once("__")?;
        let (key_start_str, key_end_str) = key_parts.split_once('-')?;
        let (lsn_start_str, lsn_end_generation_parts) = lsn_generation_parts.split_once('-')?;
        // Anything after the end LSN is an optional generation suffix, which is dropped.
        let lsn_end_str = if let Some((lsn_end_str, maybe_generation)) =
            lsn_end_generation_parts.split_once('-')
        {
            if maybe_generation.starts_with("v") {
                // vY-XXXXXXXX
                lsn_end_str
            } else if maybe_generation.len() == 8 {
                // XXXXXXXX
                lsn_end_str
            } else {
                // no idea what this is
                return None;
            }
        } else {
            lsn_end_generation_parts
        };

        let key_start = Key::from_hex(key_start_str).ok()?;
        let key_end = Key::from_hex(key_end_str).ok()?;

        let start_lsn = Lsn::from_hex(lsn_start_str).ok()?;
        let end_lsn = Lsn::from_hex(lsn_end_str).ok()?;

        if start_lsn >= end_lsn {
            return None;
            // or panic?
        }

        if key_start >= key_end {
            return None;
            // or panic?
        }

        Some(DeltaLayerName {
            key_range: key_start..key_end,
            lsn_range: start_lsn..end_lsn,
        })
    }
}

impl fmt::Display for DeltaLayerName {
    /// Writes the name without any generation suffix; LSNs are 16 upper-case hex digits.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "{}-{}__{:016X}-{:016X}",
            self.key_range.start,
            self.key_range.end,
            u64::from(self.lsn_range.start),
            u64::from(self.lsn_range.end),
        )
    }
}

#[derive(PartialEq, Eq, Clone, Hash)]
pub struct ImageLayerName {
    pub key_range: Range<Key>,
    pub lsn: Lsn,
}

impl std::fmt::Debug for ImageLayerName {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        use super::RangeDisplayDebug;

        f.debug_struct("ImageLayerName")
            .field("key_range", &RangeDisplayDebug(&self.key_range))
            .field("lsn", &self.lsn)
            .finish()
    }
}

impl PartialOrd for ImageLayerName {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for ImageLayerName {
    /// Orders by key range start, key range end, then LSN.
    fn cmp(&self, other: &Self) -> Ordering {
        self.key_range
            .start
            .cmp(&other.key_range.start)
            .then_with(|| self.key_range.end.cmp(&other.key_range.end))
            .then_with(|| self.lsn.cmp(&other.lsn))
    }
}

impl ImageLayerName {
    pub fn lsn_as_range(&self) -> Range<Lsn> {
        // Saves from having to copypaste this all over
        PersistentLayerDesc::image_layer_lsn_range(self.lsn)
    }
}

///
/// Represents the part of the Key-LSN space covered by an ImageLayer
///
/// ```text
///    <key start>-<key end>__<LSN>-<generation>
/// ```
impl ImageLayerName {
    /// Parse a string as the LayerName part of an image layer file name. Returns None if the
    /// filename does not match the expected pattern.
pub fn parse_str(fname: &str) -> Option { let (key_parts, lsn_generation_parts) = fname.split_once("__")?; let (key_start_str, key_end_str) = key_parts.split_once('-')?; let lsn_str = if let Some((lsn_str, maybe_generation)) = lsn_generation_parts.split_once('-') { if maybe_generation.starts_with("v") { // vY-XXXXXXXX lsn_str } else if maybe_generation.len() == 8 { // XXXXXXXX lsn_str } else { // likely a delta layer return None; } } else { lsn_generation_parts }; let key_start = Key::from_hex(key_start_str).ok()?; let key_end = Key::from_hex(key_end_str).ok()?; let lsn = Lsn::from_hex(lsn_str).ok()?; Some(ImageLayerName { key_range: key_start..key_end, lsn, }) } } impl fmt::Display for ImageLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "{}-{}__{:016X}", self.key_range.start, self.key_range.end, u64::from(self.lsn), ) } } /// LayerName is the logical identity of a layer within a LayerMap at a moment in time. /// /// The LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations /// over time (e.g. across shard splits or compression). The physical filenames of layers in local /// storage and object names in remote storage consist of the LayerName plus some extra qualifiers /// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path]) /// and [`crate::tenant::storage_layer::layer::local_layer_path`]) #[derive(Debug, PartialEq, Eq, Hash, Clone, Ord, PartialOrd)] pub enum LayerName { Image(ImageLayerName), Delta(DeltaLayerName), } impl LayerName { /// Determines if this layer file is considered to be in future meaning we will discard these /// layers during timeline initialization from the given disk_consistent_lsn. 
pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool { use LayerName::*; match self { Image(file_name) if file_name.lsn > disk_consistent_lsn => true, Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true, _ => false, } } pub(crate) fn kind(&self) -> &'static str { use LayerName::*; match self { Delta(_) => "delta", Image(_) => "image", } } /// Gets the key range encoded in the layer name. pub fn key_range(&self) -> &Range { match &self { LayerName::Image(layer) => &layer.key_range, LayerName::Delta(layer) => &layer.key_range, } } /// Gets the LSN range encoded in the layer name. pub fn lsn_as_range(&self) -> Range { match &self { LayerName::Image(layer) => layer.lsn_as_range(), LayerName::Delta(layer) => layer.lsn_range.clone(), } } pub fn is_delta(&self) -> bool { matches!(self, LayerName::Delta(_)) } } impl fmt::Display for LayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::Image(fname) => write!(f, "{fname}"), Self::Delta(fname) => write!(f, "{fname}"), } } } impl From for LayerName { fn from(fname: ImageLayerName) -> Self { Self::Image(fname) } } impl From for LayerName { fn from(fname: DeltaLayerName) -> Self { Self::Delta(fname) } } impl FromStr for LayerName { type Err = String; /// Conversion from either a physical layer filename, or the string-ization of /// Self. When loading a physical layer filename, we drop any extra information /// not needed to build Self. 
fn from_str(value: &str) -> Result<Self, Self::Err> {
        let delta = DeltaLayerName::parse_str(value);
        let image = ImageLayerName::parse_str(value);
        match (delta, image) {
            (Some(delta), None) => Ok(Self::Delta(delta)),
            (None, Some(image)) => Ok(Self::Image(image)),
            (None, None) => Err(format!(
                "neither delta nor image layer file name: {value:?}"
            )),
            // Both parsers can accept the same text: when the part after the first `-` of the
            // LSN section is exactly 8 characters, the image parser takes it as a generation
            // suffix while the delta parser takes it as an end LSN (assuming `Lsn::from_hex`
            // accepts 8-digit input -- TODO confirm). Names produced by `Display` use 16-digit
            // LSNs and never hit this, but this function also backs `Deserialize`, so report
            // an error instead of panicking on malformed input.
            (Some(_), Some(_)) => Err(format!(
                "ambiguous layer file name, parses as both delta and image: {value:?}"
            )),
        }
    }
}

impl serde::Serialize for LayerName {
    /// Serializes as the `Display` string of the wrapped name.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        match self {
            Self::Image(fname) => serializer.collect_str(fname),
            Self::Delta(fname) => serializer.collect_str(fname),
        }
    }
}

impl<'de> serde::Deserialize<'de> for LayerName {
    /// Deserializes from a string through [`LayerNameVisitor`].
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        deserializer.deserialize_string(LayerNameVisitor)
    }
}

struct LayerNameVisitor;

impl serde::de::Visitor<'_> for LayerNameVisitor {
    type Value = LayerName;

    fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        write!(
            formatter,
            "a string that is a valid image or delta layer file name"
        )
    }

    /// Parses with `FromStr` and converts a parse failure into a serde error.
    fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
    where
        E: serde::de::Error,
    {
        v.parse().map_err(|e| E::custom(e))
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn image_layer_parse() {
        let expected = LayerName::Image(ImageLayerName {
            key_range: Key::from_i128(0)
                ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
            lsn: Lsn::from_hex("00000000014FED58").unwrap(),
        });
        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap();
        assert_eq!(parsed, expected);

        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-00000001").unwrap();
        assert_eq!(parsed, expected);

        // Omitting generation suffix is valid
        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap();
        assert_eq!(parsed, expected);
    }

    #[test]
    fn delta_layer_parse() {
        let expected = LayerName::Delta(DeltaLayerName {
            key_range: Key::from_i128(0)
                ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
            lsn_range: Lsn::from_hex("00000000014FED58").unwrap()
                ..Lsn::from_hex("000000000154C481").unwrap(),
        });
        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap();
        assert_eq!(parsed, expected);

        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-00000001").unwrap();
        assert_eq!(parsed, expected);

        // Omitting generation suffix is valid
        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap();
        assert_eq!(parsed, expected);
    }
}

================================================
FILE: pageserver/src/tenant/storage_layer/merge_iterator.rs
================================================
use std::cmp::Ordering;
use std::collections::{BinaryHeap, binary_heap};
use std::sync::Arc;

use anyhow::bail;
use pageserver_api::key::Key;
use utils::lsn::Lsn;
use wal_decoder::models::value::Value;

use super::delta_layer::{DeltaLayerInner, DeltaLayerIterator};
use super::image_layer::{ImageLayerInner, ImageLayerIterator};
use super::{PersistentLayerDesc, PersistentLayerKey};
use crate::context::RequestContext;

/// A borrowed image or delta layer from which an iterator can be created later.
#[derive(Clone, Copy)]
pub(crate) enum LayerRef<'a> {
    Image(&'a ImageLayerInner),
    Delta(&'a DeltaLayerInner),
}

impl<'a> LayerRef<'a> {
    /// Creates the iterator of the underlying layer, forwarding the read/batch size limits.
    fn iter_with_options(
        self,
        ctx: &'a RequestContext,
        max_read_size: u64,
        max_batch_size: usize,
    ) -> LayerIterRef<'a> {
        match self {
            Self::Image(x) => {
                LayerIterRef::Image(x.iter_with_options(ctx, max_read_size, max_batch_size))
            }
            Self::Delta(x) => {
                LayerIterRef::Delta(x.iter_with_options(ctx, max_read_size, max_batch_size))
            }
        }
    }

    fn layer_dbg_info(&self) -> String {
        match self {
            Self::Image(x) => x.layer_dbg_info(),
            Self::Delta(x) => x.layer_dbg_info(),
        }
    }
}

/// The concrete iterator of either an image or a delta layer.
enum LayerIterRef<'a> {
    Image(ImageLayerIterator<'a>),
    Delta(DeltaLayerIterator<'a>),
}

impl LayerIterRef<'_> {
    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
        match self {
            Self::Delta(x) => x.next().await,
            Self::Image(x) => x.next().await,
        }
    }

    fn layer_dbg_info(&self) -> String {
        match self {
            Self::Image(x) => x.layer_dbg_info(),
            Self::Delta(x) => x.layer_dbg_info(),
        }
    }
}

/// This type plays several roles at once
/// 1. Unified iterator for image and delta layers.
/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge).
/// 3. Lazy creation of the real delta/image iterator.
#[allow(clippy::large_enum_variant, reason = "TODO")]
pub(crate) enum IteratorWrapper<'a> {
    NotLoaded {
        ctx: &'a RequestContext,
        first_key_lower_bound: (Key, Lsn),
        layer: LayerRef<'a>,
        source_desc: Arc<PersistentLayerKey>,
        max_read_size: u64,
        max_batch_size: usize,
    },
    Loaded {
        iter: PeekableLayerIterRef<'a>,
        source_desc: Arc<PersistentLayerKey>,
    },
}

/// A layer iterator that always holds its next item so it can be inspected without consuming it.
pub(crate) struct PeekableLayerIterRef<'a> {
    iter: LayerIterRef<'a>,
    peeked: Option<(Key, Lsn, Value)>, // None == end
}

impl<'a> PeekableLayerIterRef<'a> {
    /// Wraps `iter` and eagerly reads its first item.
    async fn create(mut iter: LayerIterRef<'a>) -> anyhow::Result<Self> {
        let peeked = iter.next().await?;
        Ok(Self { iter, peeked })
    }

    fn peek(&self) -> &Option<(Key, Lsn, Value)> {
        &self.peeked
    }

    /// Returns the held item and reads the following one.
    ///
    /// Fails if the following item sorts before the returned one by `(key, lsn)`.
    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
        let result = self.peeked.take();
        self.peeked = self.iter.next().await?;
        if let (Some((k1, l1, _)), Some((k2, l2, _))) = (&self.peeked, &result) {
            if (k1, l1) < (k2, l2) {
                bail!("iterator is not ordered: {}", self.iter.layer_dbg_info());
            }
        }
        Ok(result)
    }
}

impl std::cmp::PartialEq for IteratorWrapper<'_> {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}

impl std::cmp::Eq for IteratorWrapper<'_> {}

impl std::cmp::PartialOrd for IteratorWrapper<'_> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl std::cmp::Ord for IteratorWrapper<'_> {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        use std::cmp::Ordering;
        let a = self.peek_next_key_lsn_value();
        let b = other.peek_next_key_lsn_value();
        match (a, b) {
            (Some((k1, l1, v1)), Some((k2, l2, v2))) => {
                // Tie-break rank at equal (key, lsn): not loaded, then image, then WAL record.
                fn map_value_to_num(val: &Option<&Value>) -> usize {
                    match val {
                        None => 0,
                        Some(Value::Image(_)) => 1,
                        Some(Value::WalRecord(_)) => 2,
                    }
                }
                let order_1 = map_value_to_num(&v1);
                let order_2 = map_value_to_num(&v2);
                // When key_lsn are the same, the unloaded iter will always appear before the loaded one.
                // And note that we do a reverse at the end of the comparison, so it works with the max heap.
                (k1, l1, order_1).cmp(&(k2, l2, order_2))
            }
            // Exhausted iterators sort after everything else.
            (Some(_), None) => Ordering::Less,
            (None, Some(_)) => Ordering::Greater,
            (None, None) => Ordering::Equal,
        }
        .reverse()
    }
}

impl<'a> IteratorWrapper<'a> {
    /// Creates a not-yet-loaded wrapper for an image layer.
    ///
    /// The lower bound for its first item is the layer's key range start at the layer's LSN.
    pub fn create_from_image_layer(
        image_layer: &'a ImageLayerInner,
        ctx: &'a RequestContext,
        max_read_size: u64,
        max_batch_size: usize,
    ) -> Self {
        Self::NotLoaded {
            layer: LayerRef::Image(image_layer),
            first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()),
            ctx,
            source_desc: PersistentLayerKey {
                key_range: image_layer.key_range().clone(),
                lsn_range: PersistentLayerDesc::image_layer_lsn_range(image_layer.lsn()),
                is_delta: false,
            }
            .into(),
            max_read_size,
            max_batch_size,
        }
    }

    /// Creates a not-yet-loaded wrapper for a delta layer.
    ///
    /// The lower bound for its first item is the start of the layer's key and LSN ranges.
    pub fn create_from_delta_layer(
        delta_layer: &'a DeltaLayerInner,
        ctx: &'a RequestContext,
        max_read_size: u64,
        max_batch_size: usize,
    ) -> Self {
        Self::NotLoaded {
            layer: LayerRef::Delta(delta_layer),
            first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start),
            ctx,
            source_desc: PersistentLayerKey {
                key_range: delta_layer.key_range().clone(),
                lsn_range: delta_layer.lsn_range().clone(),
                is_delta: true,
            }
            .into(),
            max_read_size,
            max_batch_size,
        }
    }

    /// Returns the next `(key, lsn, value)` without consuming it.
    ///
    /// A not-yet-loaded wrapper reports its lower bound with no value; `None` means exhausted.
    fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> {
        match self {
            Self::Loaded { iter, .. } => iter
                .peek()
                .as_ref()
                .map(|(key, lsn, val)| (key, *lsn, Some(val))),
            Self::NotLoaded {
                first_key_lower_bound: (key, lsn),
                ..
            } => Some((key, *lsn, None)),
        }
    }

    // CORRECTNESS: this function must always take `&mut self`, never `&self`.
    //
    // The reason is that `impl Ord for Self` evaluates differently after this function
    // returns. We're called through a `PeekMut::deref_mut`, which causes heap repair when
    // the PeekMut gets returned. So, it's critical that we actually run through `PeekMut::deref_mut`
    // and not just `PeekMut::deref`.
    // If we don't take `&mut self`, the call could go through `PeekMut::deref` only, and the
    // heap would not be repaired even though this element's ordering has changed.
    async fn load(&mut self) -> anyhow::Result<()> {
        assert!(!self.is_loaded());
        let Self::NotLoaded {
            ctx,
            first_key_lower_bound,
            layer,
            source_desc,
            max_read_size,
            max_batch_size,
        } = self
        else {
            unreachable!()
        };
        let iter = layer.iter_with_options(ctx, *max_read_size, *max_batch_size);
        let iter = PeekableLayerIterRef::create(iter).await?;
        // The real first item must not sort before the bound this wrapper was ordered by.
        if let Some((k1, l1, _)) = iter.peek() {
            let (k2, l2) = first_key_lower_bound;
            if (k1, l1) < (k2, l2) {
                bail!(
                    "layer key range did not include the first key in the layer: {}",
                    layer.layer_dbg_info()
                );
            }
        }
        *self = Self::Loaded {
            iter,
            source_desc: source_desc.clone(),
        };
        Ok(())
    }

    fn is_loaded(&self) -> bool {
        matches!(self, Self::Loaded { .. })
    }

    /// Correctness: must load the iterator before using.
    ///
    /// Given this iterator wrapper is private to the merge iterator, users won't be able to mis-use it.
    /// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and
    /// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
        let Self::Loaded { iter, .. } = self else {
            panic!("must load the iterator before using")
        };
        iter.next().await
    }

    /// Get the persistent layer key corresponding to this iterator
    fn trace_source(&self) -> Arc<PersistentLayerKey> {
        match self {
            Self::Loaded { source_desc, .. } => source_desc.clone(),
            Self::NotLoaded { source_desc, .. } => source_desc.clone(),
        }
    }
}

/// A merge iterator over delta/image layer iterators.
///
/// When duplicated records are found, the iterator will not perform any
/// deduplication, and the caller should handle these situations. By saying
/// duplicated records, there are many possibilities:
///
/// * Two same delta at the same LSN.
/// * Two same image at the same LSN.
/// * Delta/image at the same LSN where the image has already applied the delta.
///
/// The iterator will always put the image before the delta.
pub struct MergeIterator<'a> {
    heap: BinaryHeap<IteratorWrapper<'a>>,
}

/// An item type that [`MergeIterator::next_inner`] can produce.
pub(crate) trait MergeIteratorItem {
    /// Builds the item from the yielded record and the iterator it came from.
    fn new(item: (Key, Lsn, Value), iterator: &IteratorWrapper<'_>) -> Self;

    fn key_lsn_value(&self) -> &(Key, Lsn, Value);
}

impl MergeIteratorItem for (Key, Lsn, Value) {
    fn new(item: (Key, Lsn, Value), _: &IteratorWrapper<'_>) -> Self {
        item
    }

    fn key_lsn_value(&self) -> &(Key, Lsn, Value) {
        self
    }
}

impl MergeIteratorItem for ((Key, Lsn, Value), Arc<PersistentLayerKey>) {
    fn new(item: (Key, Lsn, Value), iter: &IteratorWrapper<'_>) -> Self {
        // `trace_source` already returns an owned `Arc`; no further clone is needed.
        (item, iter.trace_source())
    }

    fn key_lsn_value(&self) -> &(Key, Lsn, Value) {
        &self.0
    }
}

impl<'a> MergeIterator<'a> {
    #[cfg(test)]
    pub(crate) fn create_for_testing(
        deltas: &[&'a DeltaLayerInner],
        images: &[&'a ImageLayerInner],
        ctx: &'a RequestContext,
    ) -> Self {
        Self::create_with_options(deltas, images, ctx, 1024 * 8192, 1024)
    }

    /// Create a new merge iterator with custom options.
    ///
    /// Adjust `max_read_size` and `max_batch_size` to trade memory usage for performance. The size should scale
    /// with the number of layers to compact. If there are a lot of layers, consider reducing the values, so that
    /// the buffer does not take too much memory.
    ///
    /// The default options for L0 compactions are:
    /// - max_read_size: 1024 * 8192 (8MB)
    /// - max_batch_size: 1024
    ///
    /// The default options for gc-compaction are:
    /// - max_read_size: 128 * 8192 (1MB)
    /// - max_batch_size: 128
    pub fn create_with_options(
        deltas: &[&'a DeltaLayerInner],
        images: &[&'a ImageLayerInner],
        ctx: &'a RequestContext,
        max_read_size: u64,
        max_batch_size: usize,
    ) -> Self {
        let mut heap = Vec::with_capacity(images.len() + deltas.len());
        for image in images {
            heap.push(IteratorWrapper::create_from_image_layer(
                image,
                ctx,
                max_read_size,
                max_batch_size,
            ));
        }
        for delta in deltas {
            heap.push(IteratorWrapper::create_from_delta_layer(
                delta,
                ctx,
                max_read_size,
                max_batch_size,
            ));
        }
        Self {
            heap: BinaryHeap::from(heap),
        }
    }

    /// Yields the next record in merged order, converted to `R`; `Ok(None)` once the heap is empty.
    pub(crate) async fn next_inner<R: MergeIteratorItem>(&mut self) -> anyhow::Result<Option<R>> {
        while let Some(mut iter) = self.heap.peek_mut() {
            if !iter.is_loaded() {
                // Once we load the iterator, we can know the real first key-value pair in the iterator.
                // We put it back into the heap so that a potentially unloaded layer may have a key between
                // [potential_first_key, loaded_first_key).
                iter.load().await?;
                continue;
            }
            let Some(item) = iter.next().await? else {
                // If the iterator returns None, we pop this iterator. Actually, in the current implementation,
                // we order None > Some, and all the rest of the iterators should return None.
                binary_heap::PeekMut::pop(iter);
                continue;
            };
            return Ok(Some(R::new(item, &iter)));
        }
        Ok(None)
    }

    /// Get the next key-value pair from the iterator.
    pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
        self.next_inner().await
    }

    /// Get the next key-value pair from the iterator, and trace where the key comes from.
pub async fn next_with_trace( &mut self, ) -> anyhow::Result)>> { self.next_inner().await } } #[cfg(test)] mod tests { use itertools::Itertools; use pageserver_api::key::Key; use utils::lsn::Lsn; #[cfg(feature = "testing")] use wal_decoder::models::record::NeonWalRecord; use super::*; use crate::DEFAULT_PG_VERSION; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; #[cfg(feature = "testing")] use crate::tenant::storage_layer::delta_layer::test::sort_delta_value; use crate::tenant::storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}; async fn assert_merge_iter_equal( merge_iter: &mut MergeIterator<'_>, expect: &[(Key, Lsn, Value)], ) { let mut expect_iter = expect.iter(); loop { let o1 = merge_iter.next().await.unwrap(); let o2 = expect_iter.next(); assert_eq!(o1.is_some(), o2.is_some()); if o1.is_none() && o2.is_none() { break; } let (k1, l1, v1) = o1.unwrap(); let (k2, l2, v2) = o2.unwrap(); assert_eq!(&k1, k2); assert_eq!(l1, *l2); assert_eq!(&v1, v2); } } #[tokio::test] async fn merge_in_between() { use bytes::Bytes; let harness = TenantHarness::create("merge_iterator_merge_in_between") .await .unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await .unwrap(); fn get_key(id: u32) -> Key { let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); key.field6 = id; key } let test_deltas1 = vec![ ( get_key(0), Lsn(0x10), Value::Image(Bytes::copy_from_slice(b"test")), ), ( get_key(5), Lsn(0x10), Value::Image(Bytes::copy_from_slice(b"test")), ), ]; let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) .await .unwrap(); let test_deltas2 = vec![ ( get_key(3), Lsn(0x10), Value::Image(Bytes::copy_from_slice(b"test")), ), ( get_key(4), Lsn(0x10), Value::Image(Bytes::copy_from_slice(b"test")), ), ]; let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) .await .unwrap(); 
let mut merge_iter = MergeIterator::create_for_testing(
            // Layer 2 is deliberately passed before layer 1: input order must not matter.
            &[
                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
            ],
            &[],
            &ctx,
        );
        let mut expect = Vec::new();
        expect.extend(test_deltas1);
        expect.extend(test_deltas2);
        expect.sort_by(sort_delta);
        assert_merge_iter_equal(&mut merge_iter, &expect).await;
    }

    /// Merge three delta layers of `N` entries each: layers 1 and 2 cover the same keys at
    /// interleaved LSNs, layer 3 covers a disjoint, higher key range.
    #[tokio::test]
    async fn delta_merge() {
        use bytes::Bytes;

        let harness = TenantHarness::create("merge_iterator_delta_merge")
            .await
            .unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await
            .unwrap();

        fn get_key(id: u32) -> Key {
            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
            key.field6 = id;
            key
        }
        const N: usize = 1000;
        // Ten versions per key, at LSNs 0x20, 0x40, ..., 0x140.
        let test_deltas1 = (0..N)
            .map(|idx| {
                (
                    get_key(idx as u32 / 10),
                    Lsn(0x20 * ((idx as u64) % 10 + 1)),
                    Value::Image(Bytes::from(format!("img{idx:05}"))),
                )
            })
            .collect_vec();
        let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
            .await
            .unwrap();
        // Same keys as layer 1, with every LSN shifted by 0x10 so the two layers interleave.
        let test_deltas2 = (0..N)
            .map(|idx| {
                (
                    get_key(idx as u32 / 10),
                    Lsn(0x20 * ((idx as u64) % 10 + 1) + 0x10),
                    Value::Image(Bytes::from(format!("img{idx:05}"))),
                )
            })
            .collect_vec();
        let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
            .await
            .unwrap();
        // Keys offset by N, so this layer does not overlap the key range of layers 1 and 2.
        let test_deltas3 = (0..N)
            .map(|idx| {
                (
                    get_key(idx as u32 / 10 + N as u32),
                    Lsn(0x10 * ((idx as u64) % 10 + 1)),
                    Value::Image(Bytes::from(format!("img{idx:05}"))),
                )
            })
            .collect_vec();
        let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
            .await
            .unwrap();
        let mut merge_iter = MergeIterator::create_for_testing(
            &[
                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
                resident_layer_3.get_as_delta(&ctx).await.unwrap(),
            ],
            &[],
            &ctx,
        );
        let mut expect = Vec::new();
        expect.extend(test_deltas1);
        expect.extend(test_deltas2);
        expect.extend(test_deltas3);
        expect.sort_by(sort_delta);
        assert_merge_iter_equal(&mut merge_iter, &expect).await;

        // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge
    }

    /// Merge layers that contain duplicated deltas and images at the same key and LSN and
    /// check that every copy is returned, in a stable order.
    #[cfg(feature = "testing")]
    #[tokio::test]
    async fn delta_image_mixed_merge() {
        use bytes::Bytes;

        let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge")
            .await
            .unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await
            .unwrap();

        fn get_key(id: u32) -> Key {
            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
            key.field6 = id;
            key
        }
        // In this test case, we want to test if the iterator still works correctly with multiple copies
        // of a delta+image at the same LSN, for example, the following sequence a@10=+a, a@10=+a, a@10=ab, a@10=ab.
        // Duplicated deltas/images are possible for old tenants before the full L0 compaction file name fix.
        // An incomplete compaction could produce multiple exactly-the-same delta layers. Force image generation
        // could produce overlapping images. Apart from duplicated deltas/images, in the current storage implementation
        // one key-lsn could have a delta in the delta layer and one image in the image layer. The iterator should
        // correctly process these situations and return everything as-is, and the upper layer of the system
        // will handle duplicated LSNs.
        let test_deltas1 = vec![
            (
                get_key(0),
                Lsn(0x10),
                Value::WalRecord(NeonWalRecord::wal_init("")),
            ),
            (
                get_key(0),
                Lsn(0x18),
                Value::WalRecord(NeonWalRecord::wal_append("a")),
            ),
            (
                get_key(5),
                Lsn(0x10),
                Value::WalRecord(NeonWalRecord::wal_init("")),
            ),
            (
                get_key(5),
                Lsn(0x18),
                Value::WalRecord(NeonWalRecord::wal_append("b")),
            ),
        ];
        let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
            .await
            .unwrap();
        // Layer 2 duplicates layer 1 and adds one extra key.
        let mut test_deltas2 = test_deltas1.clone();
        test_deltas2.push((
            get_key(10),
            Lsn(0x20),
            Value::Image(Bytes::copy_from_slice(b"test")),
        ));
        let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
            .await
            .unwrap();
        // Layer 3 stores images at key/LSN pairs that layers 1 and 2 store as WAL records.
        let test_deltas3 = vec![
            (
                get_key(0),
                Lsn(0x10),
                Value::Image(Bytes::copy_from_slice(b"")),
            ),
            (
                get_key(5),
                Lsn(0x18),
                Value::Image(Bytes::copy_from_slice(b"b")),
            ),
            (
                get_key(15),
                Lsn(0x20),
                Value::Image(Bytes::copy_from_slice(b"test")),
            ),
        ];
        let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
            .await
            .unwrap();
        // Layer 4 duplicates layer 3 and adds one extra key.
        let mut test_deltas4 = test_deltas3.clone();
        test_deltas4.push((
            get_key(20),
            Lsn(0x20),
            Value::Image(Bytes::copy_from_slice(b"test")),
        ));
        let resident_layer_4 = produce_delta_layer(&tenant, &tline, test_deltas4.clone(), &ctx)
            .await
            .unwrap();
        let mut expect = Vec::new();
        expect.extend(test_deltas1);
        expect.extend(test_deltas2);
        expect.extend(test_deltas3);
        expect.extend(test_deltas4);
        expect.sort_by(sort_delta_value);

        // Test with different layer order for MergeIterator::create to ensure the order
        // is stable.

        let mut merge_iter = MergeIterator::create_for_testing(
            &[
                resident_layer_4.get_as_delta(&ctx).await.unwrap(),
                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
                resident_layer_3.get_as_delta(&ctx).await.unwrap(),
                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
            ],
            &[],
            &ctx,
        );
        assert_merge_iter_equal(&mut merge_iter, &expect).await;

        let mut merge_iter = MergeIterator::create_for_testing(
            &[
                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
                resident_layer_4.get_as_delta(&ctx).await.unwrap(),
                resident_layer_3.get_as_delta(&ctx).await.unwrap(),
                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
            ],
            &[],
            &ctx,
        );
        assert_merge_iter_equal(&mut merge_iter, &expect).await;

        // Compile-time check that the merge iterator can be sent across threads.
        is_send(merge_iter);
    }

    /// Compiles only if its argument is `Send`; does nothing at runtime.
    #[cfg(feature = "testing")]
    fn is_send(_: impl Send) {}
}

================================================
FILE: pageserver/src/tenant/storage_layer.rs
================================================
//! Common traits and structs for layers

pub mod batch_split_writer;
pub mod delta_layer;
pub mod errors;
pub mod filter_iterator;
pub mod image_layer;
pub mod inmemory_layer;
pub(crate) mod layer;
mod layer_desc;
mod layer_name;
pub mod merge_iterator;

use std::cmp::Ordering;
use std::collections::hash_map::Entry;
use std::collections::{BinaryHeap, HashMap};
use std::ops::Range;
use std::pin::Pin;
use std::sync::Arc;
use std::sync::atomic::AtomicUsize;
use std::time::{Duration, SystemTime, UNIX_EPOCH};

use crate::PERF_TRACE_TARGET;
pub use batch_split_writer::{BatchLayerWriter, SplitDeltaLayerWriter, SplitImageLayerWriter};
use bytes::Bytes;
pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
use futures::StreamExt;
use futures::stream::FuturesUnordered;
pub use image_layer::{ImageLayer, ImageLayerWriter};
pub use inmemory_layer::InMemoryLayer;
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
use
pageserver_api::config::GetVectoredConcurrentIo;
use pageserver_api::key::Key;
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
use tracing::{Instrument, info_span, trace};
use utils::lsn::Lsn;
use utils::sync::gate::GateGuard;
use wal_decoder::models::record::NeonWalRecord;
use wal_decoder::models::value::Value;

use self::inmemory_layer::InMemoryLayerFileId;
use super::PageReconstructError;
use super::layer_map::InMemoryLayerDesc;
use super::timeline::{GetVectoredError, ReadPath};
use crate::context::{
    AccessStatsBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
};

/// Returns true if the half-open ranges `a` and `b` share at least one point.
///
/// The range that starts first overlaps the other iff it ends after the other one starts.
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
where
    T: PartialOrd,
{
    if a.start < b.start {
        a.end > b.start
    } else {
        b.end > a.start
    }
}

/// Struct used to communicate across calls to 'get_value_reconstruct_data'.
///
/// Before first call, you can fill in 'page_img' if you have an older cached
/// version of the page available. That can save work in
/// 'get_value_reconstruct_data', as it can stop searching for page versions
/// when all the WAL records going back to the cached image have been collected.
///
/// When get_value_reconstruct_data returns Complete, 'img' is set to an image
/// of the page, or the oldest WAL record in 'records' is a will_init-type
/// record that initializes the page without requiring a previous image.
///
/// If 'get_value_reconstruct_data' returns Continue, some 'records' may have
/// been collected, but there are more records outside the current layer. Pass
/// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
/// call, to collect more records.
///
#[derive(Debug, Default, Clone)]
pub(crate) struct ValueReconstructState {
    pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
    pub(crate) img: Option<(Lsn, Bytes)>,
}

impl ValueReconstructState {
    /// Returns the number of page deltas applied to the page image.
    ///
    /// Without a base image, the oldest record is the will_init record and is not counted.
    /// An empty state (no image, no records) yields 0 rather than underflowing.
    pub fn num_deltas(&self) -> usize {
        match self.img {
            Some(_) => self.records.len(),
            // omit will_init record; saturate so that an empty state does not underflow
            None => self.records.len().saturating_sub(1),
        }
    }
}

/// Whether more layers need to be visited to reconstruct a value.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub(crate) enum ValueReconstructSituation {
    Complete,
    #[default]
    Continue,
}

/// On disk representation of a value loaded in a buffer
#[derive(Debug)]
pub(crate) enum OnDiskValue {
    /// Unencoded [`Value::Image`]
    RawImage(Bytes),
    /// Encoded [`Value`]. Can deserialize into an image or a WAL record
    WalRecordOrImage(Bytes),
}

/// Reconstruct data accumulated for a single key during a vectored get
#[derive(Debug, Default)]
pub struct VectoredValueReconstructState {
    pub(crate) on_disk_values: Vec<(Lsn, OnDiskValueIoWaiter)>,

    pub(crate) situation: ValueReconstructSituation,
}

/// Receiving half of the channel over which the result of one value IO is delivered.
#[derive(Debug)]
pub(crate) struct OnDiskValueIoWaiter {
    rx: tokio::sync::oneshot::Receiver<OnDiskValueIoResult>,
}

#[derive(Debug)]
#[must_use]
pub(crate) enum OnDiskValueIo {
    /// Traversal identified this IO as required to complete the vectored get.
    Required {
        num_active_ios: Arc<AtomicUsize>,
        tx: tokio::sync::oneshot::Sender<OnDiskValueIoResult>,
    },
    /// Sparse keyspace reads always read all the values for a given key,
    /// even though only the first value is needed.
    ///
    /// This variant represents the unnecessary IOs for those values at lower LSNs
    /// that aren't needed, but are currently still being done.
    ///
    /// The execution of unnecessary IOs was a pre-existing behavior before concurrent IO.
    /// We added this explicit representation here so that we can drop
    /// unnecessary IO results immediately, instead of buffering them in
    /// `oneshot` channels inside [`VectoredValueReconstructState`] until
    /// [`VectoredValueReconstructState::collect_pending_ios`] gets called.
Unnecessary,
}

/// Outcome of a single value IO: the raw on-disk value, or the IO error.
type OnDiskValueIoResult = Result<OnDiskValue, std::io::Error>;

impl OnDiskValueIo {
    /// Report the outcome of this IO.
    ///
    /// For a required IO this decrements the shared in-flight counter and hands the result
    /// to the waiter; a send failure (waiter already dropped) is ignored. For an
    /// unnecessary IO the result is simply dropped.
    pub(crate) fn complete(self, res: OnDiskValueIoResult) {
        match self {
            OnDiskValueIo::Required { num_active_ios, tx } => {
                num_active_ios.fetch_sub(1, std::sync::atomic::Ordering::Release);
                let _ = tx.send(res);
            }
            OnDiskValueIo::Unnecessary => {
                // Nobody cared, see variant doc comment.
            }
        }
    }
}

#[derive(Debug, thiserror::Error)]
pub(crate) enum WaitCompletionError {
    #[error("OnDiskValueIo was dropped without completing, likely the sidecar task panicked")]
    IoDropped,
}

impl OnDiskValueIoWaiter {
    /// Wait for the IO to be completed and return its result, or
    /// [`WaitCompletionError::IoDropped`] if the sending side went away without completing.
    pub(crate) async fn wait_completion(self) -> Result<OnDiskValueIoResult, WaitCompletionError> {
        // NB: for Unnecessary IOs, this method never gets called because we don't add them to `on_disk_values`.
        self.rx.await.map_err(|_| WaitCompletionError::IoDropped)
    }
}

impl VectoredValueReconstructState {
    /// Wait for all pending IOs of this key and assemble them into a [`ValueReconstructState`].
    ///
    /// # Cancel-Safety
    ///
    /// Technically fine to stop polling this future, but, the IOs will still
    /// be executed to completion by the sidecar task and hold on to / consume resources.
    /// Better not do it to make reasoning about the system easier.
    pub(crate) async fn collect_pending_ios(
        self,
    ) -> Result<ValueReconstructState, PageReconstructError> {
        use utils::bin_ser::BeSer;

        let mut res = Ok(ValueReconstructState::default());

        // We should try hard not to bail early, so that by the time we return from this
        // function, all IO for this value is done. It's not required -- we could totally
        // stop polling the IO futures in the sidecar task, they need to support that,
        // but just stopping to poll doesn't reduce the IO load on the disk. It's easier
        // to reason about the system if we just wait for all IO to complete, even if
        // we're no longer interested in the result.
        //
        // Revisit this when IO futures are replaced with a more sophisticated IO system
        // and an IO scheduler, where we know which IOs were submitted and which ones
        // just queued. Cf the comment on IoConcurrency::spawn_io.
        for (lsn, waiter) in self.on_disk_values {
            let value_recv_res = waiter
                .wait_completion()
                // we rely on the caller to poll us to completion, so this is not a bail point
                .await;
            // Force not bailing early by wrapping the code into a closure.
            #[allow(clippy::redundant_closure_call)]
            let _: () = (|| {
                match (&mut res, value_recv_res) {
                    (Err(_), _) => {
                        // We've already failed, no need to process more.
                    }
                    (Ok(_), Err(wait_err)) => {
                        // This shouldn't happen - likely the sidecar task panicked.
                        res = Err(PageReconstructError::Other(wait_err.into()));
                    }
                    (Ok(_), Ok(Err(err))) => {
                        let err: std::io::Error = err;
                        // TODO: returning IO error here will fail a compute query.
                        // Probably not what we want, we're not doing `maybe_fatal_err`
                        // in the IO futures.
                        // But it's been like that for a long time, not changing it
                        // as part of concurrent IO.
                        // => https://github.com/neondatabase/neon/issues/10454
                        res = Err(PageReconstructError::Other(err.into()));
                    }
                    (Ok(ok), Ok(Ok(OnDiskValue::RawImage(img)))) => {
                        assert!(ok.img.is_none());
                        ok.img = Some((lsn, img));
                    }
                    (Ok(ok), Ok(Ok(OnDiskValue::WalRecordOrImage(buf)))) => {
                        match Value::des(&buf) {
                            Ok(Value::WalRecord(rec)) => {
                                ok.records.push((lsn, rec));
                            }
                            Ok(Value::Image(img)) => {
                                assert!(ok.img.is_none());
                                ok.img = Some((lsn, img));
                            }
                            Err(err) => {
                                res = Err(PageReconstructError::Other(err.into()));
                            }
                        }
                    }
                }
            })();
        }

        res
    }

    /// Benchmarking utility to await for the completion of all pending ios
    ///
    /// # Cancel-Safety
    ///
    /// Technically fine to stop polling this future, but, the IOs will still
    /// be executed to completion by the sidecar task and hold on to / consume resources.
    /// Better not do it to make reasoning about the system easier.
    #[cfg(feature = "benchmarking")]
    pub async fn sink_pending_ios(self) -> Result<(), std::io::Error> {
        let mut res = Ok(());

        // We should try hard not to bail early, so that by the time we return from this
        // function, all IO for this value is done. It's not required -- we could totally
        // stop polling the IO futures in the sidecar task, they need to support that,
        // but just stopping to poll doesn't reduce the IO load on the disk. It's easier
        // to reason about the system if we just wait for all IO to complete, even if
        // we're no longer interested in the result.
        //
        // Revisit this when IO futures are replaced with a more sophisticated IO system
        // and an IO scheduler, where we know which IOs were submitted and which ones
        // just queued. Cf the comment on IoConcurrency::spawn_io.
        for (_lsn, waiter) in self.on_disk_values {
            let value_recv_res = waiter
                .wait_completion()
                // we rely on the caller to poll us to completion, so this is not a bail point
                .await;

            match (&mut res, value_recv_res) {
                (Err(_), _) => {
                    // We've already failed, no need to process more.
                }
                (Ok(_), Err(_wait_err)) => {
                    // This shouldn't happen - likely the sidecar task panicked.
                    unreachable!();
                }
                (Ok(_), Ok(Err(err))) => {
                    let err: std::io::Error = err;
                    res = Err(err);
                }
                // Successful reads are discarded: only completion matters here.
                (Ok(_ok), Ok(Ok(OnDiskValue::RawImage(_img)))) => {}
                (Ok(_ok), Ok(Ok(OnDiskValue::WalRecordOrImage(_buf)))) => {}
            }
        }

        res
    }
}

/// Bag of data accumulated during a vectored get.
pub struct ValuesReconstructState {
    /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
    /// should not expect to get anything from this hashmap.
    pub keys: HashMap<Key, VectoredValueReconstructState>,
    /// The keys which are already retrieved
    keys_done: KeySpaceRandomAccum,

    /// The keys covered by the image layers
    keys_with_image_coverage: Option<Range<Key>>,

    // Statistics that are still accessible as a caller of `get_vectored_impl`.
    layers_visited: u32,
    delta_layers_visited: u32,

    pub(crate) enable_debug: bool,
    pub(crate) debug_state: ValueReconstructState,

    pub(crate) io_concurrency: IoConcurrency,
    // Number of required IOs that have been issued but not yet completed.
    num_active_ios: Arc<AtomicUsize>,

    pub(crate) read_path: Option<ReadPath>,
}

/// The level of IO concurrency to be used on the read path
///
/// The desired end state is that we always do parallel IO.
/// This struct and the dispatching in the impl will be removed once
/// we've built enough confidence.
pub enum IoConcurrency {
    /// IO futures are awaited in place by the submitting task.
    Sequential,
    /// IO futures are handed to a dedicated background task through `ios_tx`.
    SidecarTask {
        task_id: usize,
        ios_tx: tokio::sync::mpsc::UnboundedSender<IoFuture>,
    },
}

/// A boxed, sendable IO future that reports its own completion (it yields `()`).
type IoFuture = Pin<Box<dyn std::future::Future<Output = ()> + Send>>;

pub(crate) enum SelectedIoConcurrency {
    Sequential,
    SidecarTask(GateGuard),
}

impl std::fmt::Debug for IoConcurrency {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            IoConcurrency::Sequential => write!(f, "Sequential"),
            IoConcurrency::SidecarTask { .. } => write!(f, "SidecarTask"),
        }
    }
}

impl std::fmt::Debug for SelectedIoConcurrency {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            SelectedIoConcurrency::Sequential => write!(f, "Sequential"),
            SelectedIoConcurrency::SidecarTask(_) => write!(f, "SidecarTask"),
        }
    }
}

impl IoConcurrency {
    /// Force sequential IO. This is a temporary workaround until we have
    /// moved plumbing-through-the-call-stack
    /// of IoConcurrency into `RequestContext`.
    ///
    /// DO NOT USE for new code.
    ///
    /// Tracking issue: .
    pub(crate) fn sequential() -> Self {
        Self::spawn(SelectedIoConcurrency::Sequential)
    }

    /// Translate the configured mode into an [`IoConcurrency`]. The gate guard is only
    /// used (and held) by the sidecar-task mode.
    pub fn spawn_from_conf(conf: GetVectoredConcurrentIo, gate_guard: GateGuard) -> IoConcurrency {
        let selected = match conf {
            GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential,
            GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard),
        };
        Self::spawn(selected)
    }

    /// Create the [`IoConcurrency`] for the selected mode.
    ///
    /// For the sidecar mode this spawns a tokio task that executes every future submitted
    /// through the channel. The task keeps running until all senders are dropped and all
    /// futures it already received have completed; only then is the gate guard released.
    pub(crate) fn spawn(io_concurrency: SelectedIoConcurrency) -> Self {
        match io_concurrency {
            SelectedIoConcurrency::Sequential => IoConcurrency::Sequential,
            SelectedIoConcurrency::SidecarTask(gate_guard) => {
                let (ios_tx, ios_rx) = tokio::sync::mpsc::unbounded_channel();
                // Process-wide counter, used only to tell sidecar tasks apart in logs.
                static TASK_ID: AtomicUsize = AtomicUsize::new(0);
                let task_id = TASK_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
                // TODO: enrich the span with more context (tenant,shard,timeline) + (basebackup|pagestream|...)
                let span =
                    tracing::info_span!(parent: None, "IoConcurrency_sidecar", task_id = task_id);
                trace!(task_id, "spawning sidecar task");
                tokio::spawn(async move {
                    trace!("start");
                    scopeguard::defer!{ trace!("end") };
                    type IosRx = tokio::sync::mpsc::UnboundedReceiver<IoFuture>;
                    enum State {
                        Waiting {
                            // invariant: is_empty(), but we recycle the allocation
                            empty_futures: FuturesUnordered<IoFuture>,
                            ios_rx: IosRx,
                        },
                        Executing {
                            futures: FuturesUnordered<IoFuture>,
                            ios_rx: IosRx,
                        },
                        ShuttingDown {
                            futures: FuturesUnordered<IoFuture>,
                        },
                    }
                    let mut state = State::Waiting {
                        empty_futures: FuturesUnordered::new(),
                        ios_rx,
                    };
                    loop {
                        match state {
                            State::Waiting {
                                empty_futures,
                                mut ios_rx,
                            } => {
                                assert!(empty_futures.is_empty());
                                // Nothing is in flight, so the only thing to wait for is
                                // the next submission (or all senders going away).
                                match ios_rx.recv().await {
                                    Some(fut) => {
                                        trace!("received new io future");
                                        empty_futures.push(fut);
                                        state = State::Executing { futures: empty_futures, ios_rx };
                                    }
                                    None => {
                                        state = State::ShuttingDown { futures: empty_futures }
                                    }
                                }
                            }
                            State::Executing {
                                mut futures,
                                mut ios_rx,
                            } => {
                                tokio::select! {
                                    res = futures.next() => {
                                        trace!("io future completed");
                                        // `futures` is non-empty in this state, so the stream cannot end.
                                        assert!(res.is_some());
                                        if futures.is_empty() {
                                            state = State::Waiting { empty_futures: futures, ios_rx};
                                        } else {
                                            state = State::Executing { futures, ios_rx };
                                        }
                                    }
                                    fut = ios_rx.recv() => {
                                        if let Some(fut) = fut {
                                            trace!("received new io future");
                                            futures.push(fut);
                                            state = State::Executing { futures, ios_rx};
                                        } else {
                                            state = State::ShuttingDown { futures };
                                        }
                                    }
                                }
                            }
                            State::ShuttingDown {
                                mut futures,
                            } => {
                                trace!("shutting down");
                                while let Some(()) = futures.next().await {
                                    trace!("io future completed (shutdown)");
                                    // drain
                                }
                                trace!("shutdown complete");
                                break;
                            }
                        }
                    }
                    drop(gate_guard); // drop it right before we exit
                }.instrument(span));
                IoConcurrency::SidecarTask { task_id, ios_tx }
            }
        }
    }

    /// Submit an IO to be executed in the background. DEADLOCK RISK, read the full doc string.
    ///
    /// The IO is represented as an opaque future.
/// IO completion must be handled inside the future, e.g., through a oneshot channel. /// /// The API seems simple but there are multiple **pitfalls** involving /// DEADLOCK RISK. /// /// First, there are no guarantees about the exexecution of the IO. /// It may be `await`ed in-place before this function returns. /// It may be polled partially by this task and handed off to another task to be finished. /// It may be polled and then dropped before returning ready. /// /// This means that submitted IOs must not be interedependent. /// Interdependence may be through shared limited resources, e.g., /// - VirtualFile file descriptor cache slot acquisition /// - tokio-epoll-uring slot /// /// # Why current usage is safe from deadlocks /// /// Textbook condition for a deadlock is that _all_ of the following be given /// - Mutual exclusion /// - Hold and wait /// - No preemption /// - Circular wait /// /// The current usage is safe because: /// - Mutual exclusion: IO futures definitely use mutexes, no way around that for now /// - Hold and wait: IO futures currently hold two kinds of locks/resources while waiting /// for acquisition of other resources: /// - VirtualFile file descriptor cache slot tokio mutex /// - tokio-epoll-uring slot (uses tokio notify => wait queue, much like mutex) /// - No preemption: there's no taking-away of acquired locks/resources => given /// - Circular wait: this is the part of the condition that isn't met: all IO futures /// first acquire VirtualFile mutex, then tokio-epoll-uring slot. /// There is no IO future that acquires slot before VirtualFile. /// Hence there can be no circular waiting. /// Hence there cannot be a deadlock. /// /// This is a very fragile situation and must be revisited whenver any code called from /// inside the IO futures is changed. 
/// /// We will move away from opaque IO futures towards well-defined IOs at some point in /// the future when we have shipped this first version of concurrent IO to production /// and are ready to retire the Sequential mode which runs the futures in place. /// Right now, while brittle, the opaque IO approach allows us to ship the feature /// with minimal changes to the code and minimal changes to existing behavior in Sequential mode. /// /// Also read the comment in `collect_pending_ios`. pub(crate) async fn spawn_io(&mut self, fut: F) where F: std::future::Future + Send + 'static, { match self { IoConcurrency::Sequential => fut.await, IoConcurrency::SidecarTask { ios_tx, .. } => { let fut = Box::pin(fut); // NB: experiments showed that doing an opportunistic poll of `fut` here was bad for throughput // while insignificant for latency. // It would make sense to revisit the tokio-epoll-uring API in the future such that we can try // a submission here, but never poll the future. That way, io_uring can make proccess while // the future sits in the ios_tx queue. match ios_tx.send(fut) { Ok(()) => {} Err(_) => { unreachable!("the io task must have exited, likely it panicked") } } } } } #[cfg(test)] pub(crate) fn spawn_for_test() -> impl std::ops::DerefMut { use std::ops::{Deref, DerefMut}; use tracing::info; use utils::sync::gate::Gate; // Spawn needs a Gate, give it one. struct Wrapper { inner: IoConcurrency, #[allow(dead_code)] gate: Box, } impl Deref for Wrapper { type Target = IoConcurrency; fn deref(&self) -> &Self::Target { &self.inner } } impl DerefMut for Wrapper { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.inner } } let gate = Box::new(Gate::default()); // The default behavior when running Rust unit tests without any further // flags is to use the new behavior. // The CI uses the following environment variable to unit test both old // and new behavior. 
// NB: the Python regression & perf tests take the `else` branch // below and have their own defaults management. let selected = { // The pageserver_api::config type is unsuitable because it's internally tagged. #[derive(serde::Deserialize)] #[serde(rename_all = "kebab-case")] enum TestOverride { Sequential, SidecarTask, } use once_cell::sync::Lazy; static TEST_OVERRIDE: Lazy = Lazy::new(|| { utils::env::var_serde_json_string( "NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO", ) .unwrap_or(TestOverride::SidecarTask) }); match *TEST_OVERRIDE { TestOverride::Sequential => SelectedIoConcurrency::Sequential, TestOverride::SidecarTask => { SelectedIoConcurrency::SidecarTask(gate.enter().expect("just created it")) } } }; info!(?selected, "get_vectored_concurrent_io test"); Wrapper { inner: Self::spawn(selected), gate, } } } impl Clone for IoConcurrency { fn clone(&self) -> Self { match self { IoConcurrency::Sequential => IoConcurrency::Sequential, IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask { task_id: *task_id, ios_tx: ios_tx.clone(), }, } } } /// Make noise in case the [`ValuesReconstructState`] gets dropped while /// there are still IOs in flight. /// Refer to `collect_pending_ios` for why we prefer not to do that. // /// We log from here instead of from the sidecar task because the [`ValuesReconstructState`] /// gets dropped in a tracing span with more context. /// We repeat the sidecar tasks's `task_id` so we can correlate what we emit here with /// the logs / panic handler logs from the sidecar task, which also logs the `task_id`. impl Drop for ValuesReconstructState { fn drop(&mut self) { let num_active_ios = self .num_active_ios .load(std::sync::atomic::Ordering::Acquire); if num_active_ios == 0 { return; } let sidecar_task_id = match &self.io_concurrency { IoConcurrency::Sequential => None, IoConcurrency::SidecarTask { task_id, .. 
} => Some(*task_id),
        };
        tracing::warn!(
            num_active_ios,
            ?sidecar_task_id,
            backtrace=%std::backtrace::Backtrace::force_capture(),
            "dropping ValuesReconstructState while some IOs have not been completed",
        );
    }
}

impl ValuesReconstructState {
    /// Create an empty state that issues its IOs through `io_concurrency`.
    /// Debug-state capturing is disabled.
    pub fn new(io_concurrency: IoConcurrency) -> Self {
        Self {
            keys: HashMap::new(),
            keys_done: KeySpaceRandomAccum::new(),
            keys_with_image_coverage: None,
            layers_visited: 0,
            delta_layers_visited: 0,
            io_concurrency,
            enable_debug: false,
            debug_state: ValueReconstructState::default(),
            num_active_ios: Arc::new(AtomicUsize::new(0)),
            read_path: None,
        }
    }

    /// Same as [`Self::new`], but with debug-state capturing enabled
    /// (see [`Self::set_debug_state`]).
    pub(crate) fn new_with_debug(io_concurrency: IoConcurrency) -> Self {
        // Struct-update syntax (`..Self::new(..)`) is not available because this type
        // implements `Drop`, so build the default state and flip the flag.
        let mut state = Self::new(io_concurrency);
        state.enable_debug = true;
        state
    }

    /// Absolutely read [`IoConcurrency::spawn_io`] to learn about assumptions & pitfalls.
    pub(crate) async fn spawn_io<F>(&mut self, fut: F)
    where
        F: std::future::Future<Output = ()> + Send + 'static,
    {
        self.io_concurrency.spawn_io(fut).await;
    }

    /// Store a copy of `debug_state`; a no-op unless debugging was enabled at construction.
    pub(crate) fn set_debug_state(&mut self, debug_state: &ValueReconstructState) {
        if self.enable_debug {
            self.debug_state = debug_state.clone();
        }
    }

    /// Count a layer visit; persistent delta layers are additionally counted separately.
    pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
        self.layers_visited += 1;
        if let ReadableLayer::PersistentLayer(layer) = layer {
            if layer.layer_desc().is_delta() {
                self.delta_layers_visited += 1;
            }
        }
    }

    pub(crate) fn get_delta_layers_visited(&self) -> u32 {
        self.delta_layers_visited
    }

    pub(crate) fn get_layers_visited(&self) -> u32 {
        self.layers_visited
    }

    /// On hitting image layer, we can mark all keys in this range as done, because
    /// if the image layer does not contain a key, it is deleted/never added.
    ///
    /// Panics if the previously recorded range has not been taken by
    /// `consume_done_keys` yet.
    pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range<Key>) {
        let prev_val = self.keys_with_image_coverage.replace(key_range.clone());
        assert_eq!(
            prev_val, None,
            "should consume the keyspace before the next iteration"
        );
    }

    /// Update the state collected for a given key.
    /// Returns the IO handle the caller must complete with the value read for `key` at
    /// `lsn`; [`OnDiskValueIo::Unnecessary`] if the (sparse) key is already complete.
    ///
    /// If the key is done after the update (`completes` is true), mark it as such.
    ///
    /// If the key is in the sparse keyspace (i.e., aux files), we do not track them in
    /// `key_done`.
    // TODO: rename this method & update description.
    pub(crate) fn update_key(&mut self, key: &Key, lsn: Lsn, completes: bool) -> OnDiskValueIo {
        let state = self.keys.entry(*key).or_default();

        let is_sparse_key = key.is_sparse();

        let required_io = match state.situation {
            ValueReconstructSituation::Complete => {
                if is_sparse_key {
                    // Sparse keyspace might be visited multiple times because
                    // we don't track unmapped keyspaces.
                    return OnDiskValueIo::Unnecessary;
                } else {
                    unreachable!()
                }
            }
            ValueReconstructSituation::Continue => {
                // Account for the IO before handing out the handle; `complete` decrements again.
                self.num_active_ios
                    .fetch_add(1, std::sync::atomic::Ordering::Release);
                let (tx, rx) = tokio::sync::oneshot::channel();
                state.on_disk_values.push((lsn, OnDiskValueIoWaiter { rx }));
                OnDiskValueIo::Required {
                    tx,
                    num_active_ios: Arc::clone(&self.num_active_ios),
                }
            }
        };

        if completes && state.situation == ValueReconstructSituation::Continue {
            state.situation = ValueReconstructSituation::Complete;
            if !is_sparse_key {
                self.keys_done.add_key(*key);
            }
        }

        required_io
    }

    /// Returns the key space describing the keys that have
    /// been marked as completed since the last call to this function.
    /// Returns individual keys done, and the image layer coverage.
pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option<Range<Key>>) {
        (
            self.keys_done.consume_keyspace(),
            self.keys_with_image_coverage.take(),
        )
    }
}

/// A key that uniquely identifies a layer in a timeline
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub(crate) enum LayerId {
    PersitentLayerId(PersistentLayerKey),
    InMemoryLayerId(InMemoryLayerFileId),
}

/// Uniquely identify a layer visit by the layer
/// and LSN range of the reads. Note that the end of the range is exclusive.
///
/// The layer itself is not enough since we may have different LSN lower
/// bounds for delta layer reads. Scenarios where this can happen are:
///
/// 1. Layer overlaps: imagine an image layer inside an in-memory layer
///    and a query that only partially hits the image layer. Part of the query
///    needs to read the whole in-memory layer and the other part needs to read
///    only up to the image layer. Hence, they'll have different LSN floor values
///    for the read.
///
/// 2. Scattered reads: the read path supports starting at different LSNs. Imagine
///    the start LSN for one range is inside a layer and the start LSN for another range
///    is above the layer (includes all of it). Both ranges need to read the layer all the
///    way to the end but starting at different points. Hence, they'll have different LSN
///    ceil values.
///
/// The implication is that we might visit the same layer multiple times
/// in order to read different LSN ranges from it. In practice, this isn't very concerning
/// because:
/// 1. Layer overlaps are rare and generally not intended
/// 2. Scattered reads will stabilise after the first few layers provided their starting LSNs
///    are grouped tightly enough (likely the case).
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
struct LayerToVisitId {
    layer_id: LayerId,
    lsn_floor: Lsn,
    lsn_ceil: Lsn,
}

// NOTE(review): the payload type of `PersistentLayer` was garbled in this copy (`Arc,`);
// `Arc<PersistentLayerDesc>` is assumed from the re-export at the top of this module.
// Confirm against the users of `ReadableLayerWeak`.
#[derive(Debug, PartialEq, Eq, Hash)]
pub enum ReadableLayerWeak {
    PersistentLayer(Arc<PersistentLayerDesc>),
    InMemoryLayer(InMemoryLayerDesc),
}

/// Layer wrapper for the read path. Note that it is valid
/// to use these layers even after external operations have
/// been performed on them (compaction, freeze, etc.).
#[derive(Debug)]
pub(crate) enum ReadableLayer {
    PersistentLayer(Layer),
    InMemoryLayer(Arc<InMemoryLayer>),
}

/// A partial description of a read to be done.
#[derive(Debug, Clone)]
struct LayerVisit {
    /// An id used to resolve the readable layer within the fringe
    layer_to_visit_id: LayerToVisitId,
    /// Lsn range for the read, used for selecting the next read
    lsn_range: Range<Lsn>,
}

/// Data structure which maintains a fringe of layers for the
/// read path. The fringe is the set of layers which intersects
/// the current keyspace that the search is descending on.
/// Each layer tracks the keyspace that intersects it.
///
/// The fringe must appear sorted by Lsn. Hence, it uses
/// a two layer indexing scheme.
#[derive(Debug)]
pub(crate) struct LayerFringe {
    // Visit order: `next_layer` pops the greatest `LayerVisit` first.
    planned_visits_by_lsn: BinaryHeap<LayerVisit>,
    // The layer and accumulated keyspace for each planned visit.
    visit_reads: HashMap<LayerToVisitId, LayerVisitReads>,
}

#[derive(Debug)]
struct LayerVisitReads {
    layer: ReadableLayer,
    target_keyspace: KeySpaceRandomAccum,
}

impl LayerFringe {
    pub(crate) fn new() -> Self {
        LayerFringe {
            planned_visits_by_lsn: BinaryHeap::new(),
            visit_reads: HashMap::new(),
        }
    }

    /// Remove and return the next planned visit: the layer, the keyspace to read from it
    /// and the LSN range of the read. Returns `None` when the fringe is empty.
    pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
        let read_desc = self.planned_visits_by_lsn.pop()?;

        let removed = self.visit_reads.remove_entry(&read_desc.layer_to_visit_id);

        match removed {
            Some((
                _,
                LayerVisitReads {
                    layer,
                    mut target_keyspace,
                },
            )) => Some((
                layer,
                target_keyspace.consume_keyspace(),
                read_desc.lsn_range,
            )),
            None => unreachable!("fringe internals are always consistent"),
        }
    }

    /// Plan a read of `keyspace` from `layer` over `lsn_range`. If the same layer is
    /// already planned with the same LSN range, the keyspace is merged into that visit.
    pub(crate) fn update(
        &mut self,
        layer: ReadableLayer,
        keyspace: KeySpace,
        lsn_range: Range<Lsn>,
    ) {
        let layer_to_visit_id = LayerToVisitId {
            layer_id: layer.id(),
            lsn_floor: lsn_range.start,
            lsn_ceil: lsn_range.end,
        };

        let entry = self.visit_reads.entry(layer_to_visit_id.clone());
        match entry {
            Entry::Occupied(mut entry) => {
                entry.get_mut().target_keyspace.add_keyspace(keyspace);
} Entry::Vacant(entry) => { self.planned_visits_by_lsn.push(LayerVisit { lsn_range, layer_to_visit_id: layer_to_visit_id.clone(), }); let mut accum = KeySpaceRandomAccum::new(); accum.add_keyspace(keyspace); entry.insert(LayerVisitReads { layer, target_keyspace: accum, }); } } } } impl Default for LayerFringe { fn default() -> Self { Self::new() } } impl Ord for LayerVisit { fn cmp(&self, other: &Self) -> Ordering { let ord = self.lsn_range.end.cmp(&other.lsn_range.end); if ord == std::cmp::Ordering::Equal { self.lsn_range.start.cmp(&other.lsn_range.start).reverse() } else { ord } } } impl PartialOrd for LayerVisit { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl PartialEq for LayerVisit { fn eq(&self, other: &Self) -> bool { self.lsn_range == other.lsn_range } } impl Eq for LayerVisit {} impl ReadableLayer { pub(crate) fn id(&self) -> LayerId { match self { Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()), Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()), } } pub(crate) async fn get_values_reconstruct_data( &self, keyspace: KeySpace, lsn_range: Range, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { match self { ReadableLayer::PersistentLayer(layer) => { let ctx = RequestContextBuilder::from(ctx) .perf_span(|crnt_perf_span| { info_span!( target: PERF_TRACE_TARGET, parent: crnt_perf_span, "PLAN_LAYER", layer = %layer ) }) .attached_child(); layer .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, &ctx) .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await } ReadableLayer::InMemoryLayer(layer) => { let ctx = RequestContextBuilder::from(ctx) .perf_span(|crnt_perf_span| { info_span!( target: PERF_TRACE_TARGET, parent: crnt_perf_span, "PLAN_LAYER", layer = %layer ) }) .attached_child(); layer .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, &ctx) 
.maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await } } } } /// Layers contain a hint indicating whether they are likely to be used for reads. /// /// This is a hint rather than an authoritative value, so that we do not have to update it synchronously /// when changing the visibility of layers (for example when creating a branch that makes some previously /// covered layers visible). It should be used for cache management but not for correctness-critical checks. #[derive(Debug, Clone, PartialEq, Eq)] pub enum LayerVisibilityHint { /// A Visible layer might be read while serving a read, because there is not an image layer between it /// and a readable LSN (the tip of the branch or a child's branch point) Visible, /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates /// a branch or ephemeral endpoint at an LSN below the layer that covers this. Covered, } pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64); #[derive(Clone, Copy, strum_macros::EnumString)] pub(crate) enum LayerAccessStatsReset { NoReset, AllStats, } impl Default for LayerAccessStats { fn default() -> Self { // Default value is to assume resident since creation time, and visible. let (_mask, mut value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, SystemTime::now()); value |= 0x1 << Self::VISIBILITY_SHIFT; Self(std::sync::atomic::AtomicU64::new(value)) } } // Efficient store of two very-low-resolution timestamps and some bits. Used for storing last access time and // last residence change time. impl LayerAccessStats { // How many high bits to drop from a u32 timestamp? // - Only storing up to a u32 timestamp will work fine until 2038 (if this code is still in use // after that, this software has been very successful!) // - Dropping the top bit is implicitly safe because unix timestamps are meant to be // stored in an i32, so they never used it. 
// - Dropping the next two bits is safe because this code is only running on systems in // years >= 2024, and these bits have been 1 since 2021 // // Therefore we may store only 28 bits for a timestamp with one second resolution. We do // this truncation to make space for some flags in the high bits of our u64. const TS_DROP_HIGH_BITS: u32 = u32::count_ones(Self::TS_ONES) + 1; const TS_MASK: u32 = 0x1f_ff_ff_ff; const TS_ONES: u32 = 0x60_00_00_00; const ATIME_SHIFT: u32 = 0; const RTIME_SHIFT: u32 = 32 - Self::TS_DROP_HIGH_BITS; const VISIBILITY_SHIFT: u32 = 64 - 2 * Self::TS_DROP_HIGH_BITS; fn write_bits(&self, mask: u64, value: u64) -> u64 { self.0 .fetch_update( // TODO: decide what orderings are correct std::sync::atomic::Ordering::Relaxed, std::sync::atomic::Ordering::Relaxed, |v| Some((v & !mask) | (value & mask)), ) .expect("Inner function is infallible") } fn to_low_res_timestamp(shift: u32, time: SystemTime) -> (u64, u64) { // Drop the low three bits of the timestamp, for an ~8s accuracy let timestamp = time.duration_since(UNIX_EPOCH).unwrap().as_secs() & (Self::TS_MASK as u64); ((Self::TS_MASK as u64) << shift, timestamp << shift) } fn read_low_res_timestamp(&self, shift: u32) -> Option { let read = self.0.load(std::sync::atomic::Ordering::Relaxed); let ts_bits = (read & ((Self::TS_MASK as u64) << shift)) >> shift; if ts_bits == 0 { None } else { Some(UNIX_EPOCH + Duration::from_secs(ts_bits | (Self::TS_ONES as u64))) } } /// Record a change in layer residency. /// /// Recording the event must happen while holding the layer map lock to /// ensure that latest-activity-threshold-based layer eviction (eviction_task.rs) /// can do an "imitate access" to this layer, before it observes `now-latest_activity() > threshold`. /// /// If we instead recorded the residence event with a timestamp from before grabbing the layer map lock, /// the following race could happen: /// /// - Compact: Write out an L1 layer from several L0 layers. 
This records residence event LayerCreate with the current timestamp. /// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map. /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock. /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event. pub(crate) fn record_residence_event_at(&self, now: SystemTime) { let (mask, value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, now); self.write_bits(mask, value); } pub(crate) fn record_residence_event(&self) { self.record_residence_event_at(SystemTime::now()) } fn record_access_at(&self, now: SystemTime) -> bool { let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now); // A layer which is accessed must be visible. mask |= 0x1 << Self::VISIBILITY_SHIFT; value |= 0x1 << Self::VISIBILITY_SHIFT; let old_bits = self.write_bits(mask, value); !matches!( self.decode_visibility(old_bits), LayerVisibilityHint::Visible ) } /// Returns true if we modified the layer's visibility to set it to Visible implicitly /// as a result of this access pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool { if ctx.access_stats_behavior() == AccessStatsBehavior::Skip { return false; } self.record_access_at(SystemTime::now()) } fn as_api_model( &self, reset: LayerAccessStatsReset, ) -> pageserver_api::models::LayerAccessStats { let ret = pageserver_api::models::LayerAccessStats { access_time: self .read_low_res_timestamp(Self::ATIME_SHIFT) .unwrap_or(UNIX_EPOCH), residence_time: self .read_low_res_timestamp(Self::RTIME_SHIFT) .unwrap_or(UNIX_EPOCH), visible: matches!(self.visibility(), LayerVisibilityHint::Visible), }; match reset { LayerAccessStatsReset::NoReset => {} LayerAccessStatsReset::AllStats => { self.write_bits((Self::TS_MASK as u64) << Self::ATIME_SHIFT, 0x0); self.write_bits((Self::TS_MASK as u64) << Self::RTIME_SHIFT, 0x0); } } ret } 
/// Get the latest access timestamp, falling back to latest residence event. The latest residence event /// will be this Layer's construction time, if its residence hasn't changed since then. pub(crate) fn latest_activity(&self) -> SystemTime { if let Some(t) = self.read_low_res_timestamp(Self::ATIME_SHIFT) { t } else { self.read_low_res_timestamp(Self::RTIME_SHIFT) .expect("Residence time is set on construction") } } /// Whether this layer has been accessed (excluding in [`AccessStatsBehavior::Skip`]). /// /// This indicates whether the layer has been used for some purpose that would motivate /// us to keep it on disk, such as for serving a getpage request. fn accessed(&self) -> bool { // Consider it accessed if the most recent access is more recent than // the most recent change in residence status. match ( self.read_low_res_timestamp(Self::ATIME_SHIFT), self.read_low_res_timestamp(Self::RTIME_SHIFT), ) { (None, _) => false, (Some(_), None) => true, (Some(a), Some(r)) => a >= r, } } /// Helper for extracting the visibility hint from the literal value of our inner u64 fn decode_visibility(&self, bits: u64) -> LayerVisibilityHint { match (bits >> Self::VISIBILITY_SHIFT) & 0x1 { 1 => LayerVisibilityHint::Visible, 0 => LayerVisibilityHint::Covered, _ => unreachable!(), } } /// Returns the old value which has been replaced pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) -> LayerVisibilityHint { let value = match visibility { LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT, LayerVisibilityHint::Covered => 0x0, }; let old_bits = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value); self.decode_visibility(old_bits) } pub(crate) fn visibility(&self) -> LayerVisibilityHint { let read = self.0.load(std::sync::atomic::Ordering::Relaxed); self.decode_visibility(read) } } /// Get a layer descriptor from a layer. pub(crate) trait AsLayerDesc { /// Get the layer descriptor. 
fn layer_desc(&self) -> &PersistentLayerDesc;
}

/// Conversions from layer names into [`PersistentLayerDesc`] that fill in fixed
/// placeholder values: an all-zero tenant shard id, an all-zero timeline id, and `233`
/// as the last constructor argument.
// NOTE(review): `233` is presumably a dummy file size -- confirm against
// `PersistentLayerDesc::new_delta` / `new_img`.
pub mod tests {
    use pageserver_api::shard::TenantShardId;
    use utils::id::TimelineId;

    use super::*;

    impl From for PersistentLayerDesc {
        fn from(value: DeltaLayerName) -> Self {
            PersistentLayerDesc::new_delta(
                TenantShardId::from([0; 18]),
                TimelineId::from_array([0; 16]),
                value.key_range,
                value.lsn_range,
                233,
            )
        }
    }

    impl From for PersistentLayerDesc {
        fn from(value: ImageLayerName) -> Self {
            PersistentLayerDesc::new_img(
                TenantShardId::from([0; 18]),
                TimelineId::from_array([0; 16]),
                value.key_range,
                value.lsn,
                233,
            )
        }
    }

    impl From for PersistentLayerDesc {
        /// Dispatches to the delta or image conversion above.
        fn from(value: LayerName) -> Self {
            match value {
                LayerName::Delta(d) => Self::from(d),
                LayerName::Image(i) => Self::from(i),
            }
        }
    }
}

/// Range wrapping newtype, which uses display to render Debug.
///
/// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range);

impl std::fmt::Debug for RangeDisplayDebug<'_, T> {
    /// Renders the range as `start..end` using the `Display` impl of the bounds.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}..{}", self.0.start, self.0.end)
    }
}

#[cfg(test)]
mod tests2 {
    use pageserver_api::key::DBDIR_KEY;
    use tracing::info;

    use super::*;
    use crate::tenant::storage_layer::IoConcurrency;

    /// TODO: currently this test relies on manual visual inspection of the --no-capture output.
/// Should look like so:
/// ```text
/// RUST_LOG=trace cargo nextest run --features testing --no-capture test_io_concurrency_noise
/// running 1 test
/// 2025-01-21T17:42:01.335679Z INFO get_vectored_concurrent_io test selected=SidecarTask
/// 2025-01-21T17:42:01.335680Z TRACE spawning sidecar task task_id=0
/// 2025-01-21T17:42:01.335937Z TRACE IoConcurrency_sidecar{task_id=0}: start
/// 2025-01-21T17:42:01.335972Z TRACE IoConcurrency_sidecar{task_id=0}: received new io future
/// 2025-01-21T17:42:01.335999Z INFO IoConcurrency_sidecar{task_id=0}: waiting for signal to complete IO
/// 2025-01-21T17:42:01.336229Z WARN dropping ValuesReconstructState while some IOs have not been completed num_active_ios=1 sidecar_task_id=Some(0) backtrace= 0: ::drop
/// at ./src/tenant/storage_layer.rs:553:24
/// 1: core::ptr::drop_in_place
/// at /home/christian/.rustup/toolchains/1.84.0-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/ptr/mod.rs:521:1
/// 2: core::mem::drop
/// at /home/christian/.rustup/toolchains/1.84.0-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/mem/mod.rs:942:24
/// 3: pageserver::tenant::storage_layer::tests2::test_io_concurrency_noise::{{closure}}
/// at ./src/tenant/storage_layer.rs:1159:9
/// ...
/// 49:
/// 2025-01-21T17:42:01.452293Z INFO IoConcurrency_sidecar{task_id=0}: completing IO
/// 2025-01-21T17:42:01.452357Z TRACE IoConcurrency_sidecar{task_id=0}: io future completed
/// 2025-01-21T17:42:01.452473Z TRACE IoConcurrency_sidecar{task_id=0}: end
/// test tenant::storage_layer::tests2::test_io_concurrency_noise ... ok
///
/// ```
#[tokio::test]
async fn test_io_concurrency_noise() {
    crate::tenant::harness::setup_logging();

    let io_concurrency = IoConcurrency::spawn_for_test();
    match *io_concurrency {
        IoConcurrency::Sequential => {
            // This test asserts behavior in sidecar mode, doesn't make sense in sequential mode.
            return;
        }
        IoConcurrency::SidecarTask { .. } => {}
    }
    let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());

    // Three oneshot channels sequence the test:
    // - `io_fut_is_waiting`: the spawned IO future has started and is parked,
    // - `do_complete_io`: the test allows the IO future to complete,
    // - `io_fut_exiting`: the IO future has finished its work.
    let (io_fut_is_waiting_tx, io_fut_is_waiting) = tokio::sync::oneshot::channel();
    let (do_complete_io, should_complete_io) = tokio::sync::oneshot::channel();
    let (io_fut_exiting_tx, io_fut_exiting) = tokio::sync::oneshot::channel();

    let io = reconstruct_state.update_key(&DBDIR_KEY, Lsn(8), true);
    reconstruct_state
        .spawn_io(async move {
            info!("waiting for signal to complete IO");
            io_fut_is_waiting_tx.send(()).unwrap();
            should_complete_io.await.unwrap();
            info!("completing IO");
            io.complete(Ok(OnDiskValue::RawImage(Bytes::new())));
            io_fut_exiting_tx.send(()).unwrap();
        })
        .await;

    io_fut_is_waiting.await.unwrap();

    // this is what makes the noise
    drop(reconstruct_state);

    // Only now let the still-pending IO finish, and wait until it has done so.
    do_complete_io.send(()).unwrap();

    io_fut_exiting.await.unwrap();
}
}

================================================
FILE: pageserver/src/tenant/tasks.rs
================================================
//! This module contains per-tenant background processes, e.g. compaction and GC.
use std::cmp::max;
use std::future::Future;
use std::ops::{ControlFlow, RangeInclusive};
use std::pin::pin;
use std::sync::Arc;
use std::time::{Duration, Instant};

use once_cell::sync::Lazy;
use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD;
use rand::Rng;
use scopeguard::defer;
use tokio::sync::{Semaphore, SemaphorePermit};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::backoff::exponential_backoff_duration;
use utils::completion::Barrier;
use utils::pausable_failpoint;

use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind};
use crate::tenant::throttle::Stats;
use crate::tenant::timeline::CompactionError;
use crate::tenant::timeline::compaction::CompactionOutcome;
use crate::tenant::{TenantShard, TenantState};

/// Semaphore limiting concurrent background tasks (across all tenants).
///
/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work.
///
/// The permit count is `max(1, total_threads * 3 / 4)` using integer division.
// NOTE(review): if `TOKIO_WORKER_THREADS` is 1, this yields 1 permit and the
// `permits < total_threads` assertion below fails on first use -- confirm that a
// single-threaded runtime is not a supported configuration.
static CONCURRENT_BACKGROUND_TASKS: Lazy = Lazy::new(|| {
    let total_threads = TOKIO_WORKER_THREADS.get();

    /*BEGIN_HADRON*/
    // ideally we should run at least one compaction task per tenant in order to (1) maximize
    // compaction throughput (2) avoid head-of-line blocking of large compactions. However doing
    // that may create too many compaction tasks with lots of memory overheads. So we limit the
    // number of compaction tasks based on the available CPU core count.
    // Need to revisit.
    // let tasks_per_thread = std::env::var("BG_TASKS_PER_THREAD")
    //     .ok()
    //     .and_then(|s| s.parse().ok())
    //     .unwrap_or(4);
    // let permits = usize::max(1, total_threads * tasks_per_thread);
    // // assert!(permits < total_threads, "need threads for other work");
    /*END_HADRON*/

    let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
    assert_ne!(permits, 0, "we will not be adding in permits later");
    assert!(permits < total_threads, "need threads for other work");
    Semaphore::new(permits)
});

/// Semaphore limiting concurrent L0 compaction tasks (across all tenants). This is only used if
/// both `compaction_l0_semaphore` and `compaction_l0_first` are enabled.
///
/// This is a separate semaphore from background tasks, because L0 compaction needs to be responsive
/// to avoid high read amp during heavy write workloads. Regular image/GC compaction is less
/// important (e.g. due to page images in delta layers) and can wait for other background tasks.
///
/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. Note
/// that this runs on the same Tokio runtime as `CONCURRENT_BACKGROUND_TASKS`, and shares the same
/// thread pool.
///
/// The permit count is computed exactly as for `CONCURRENT_BACKGROUND_TASKS`.
static CONCURRENT_L0_COMPACTION_TASKS: Lazy = Lazy::new(|| {
    let total_threads = TOKIO_WORKER_THREADS.get();
    let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
    assert_ne!(permits, 0, "we will not be adding in permits later");
    assert!(permits < total_threads, "need threads for other work");
    Semaphore::new(permits)
});

/// Background jobs.
///
/// NB: not all of these acquire a CONCURRENT_BACKGROUND_TASKS semaphore permit, only the ones that
/// do any significant IO or CPU work.
#[derive(
    Debug,
    PartialEq,
    Eq,
    Clone,
    Copy,
    strum_macros::IntoStaticStr,
    strum_macros::Display,
    enum_map::Enum,
)]
#[strum(serialize_all = "snake_case")]
pub(crate) enum BackgroundLoopKind {
    /// L0Compaction runs as a separate pass within the Compaction loop, not a separate loop.
/// It is
/// used to request the `CONCURRENT_L0_COMPACTION_TASKS` semaphore and associated metrics.
L0Compaction,
Compaction,
Gc,
Eviction,
TenantHouseKeeping,
ConsumptionMetricsCollectMetrics,
ConsumptionMetricsSyntheticSizeWorker,
InitialLogicalSizeCalculation,
HeatmapUpload,
SecondaryDownload,
}

/// A held background-task semaphore permit together with its metrics recorder.
/// Both fields exist only to be kept alive for as long as this value lives.
pub struct BackgroundLoopSemaphorePermit<'a> {
    _permit: SemaphorePermit<'static>,
    _recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>,
}

/// Acquires a semaphore permit, to limit concurrent background jobs.
///
/// `L0Compaction` draws from `CONCURRENT_L0_COMPACTION_TASKS`; every other kind draws
/// from `CONCURRENT_BACKGROUND_TASKS`. Waits until a permit is available.
pub(crate) async fn acquire_concurrency_permit(
    loop_kind: BackgroundLoopKind,
    _ctx: &RequestContext,
) -> BackgroundLoopSemaphorePermit<'static> {
    // Created before waiting, and told about the acquisition below.
    let mut recorder = metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind);

    if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation {
        pausable_failpoint!("initial-size-calculation-permit-pause");
    }

    // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
    let semaphore = match loop_kind {
        BackgroundLoopKind::L0Compaction => &CONCURRENT_L0_COMPACTION_TASKS,
        _ => &CONCURRENT_BACKGROUND_TASKS,
    };
    let permit = semaphore.acquire().await.expect("should never close");

    recorder.acquired();

    BackgroundLoopSemaphorePermit {
        _permit: permit,
        _recorder: recorder,
    }
}

/// Start per tenant background loops: compaction, GC, and ingest housekeeping.
///
/// Spawns three tasks on `BACKGROUND_RUNTIME`. Each one first waits for either
/// cancellation (then exits immediately) or the optional `can_start` barrier, bumps the
/// "start" task event counter (and "stop" on exit via `defer!`), and then runs its loop.
pub fn start_background_loops(tenant: &Arc, can_start: Option<&Barrier>) {
    let tenant_shard_id = tenant.tenant_shard_id;

    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::Compaction,
        tenant_shard_id,
        None,
        &format!("compactor for tenant {tenant_shard_id}"),
        {
            let tenant = Arc::clone(tenant);
            let can_start = can_start.cloned();
            async move {
                let cancel = task_mgr::shutdown_token(); // NB: must be in async context
                tokio::select! {
                    _ = cancel.cancelled() => return Ok(()),
                    _ = Barrier::maybe_wait(can_start) => {}
                };
                TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
                defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
                compaction_loop(tenant, cancel)
                    // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py
                    .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                    .await;
                Ok(())
            }
        },
    );

    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::GarbageCollector,
        tenant_shard_id,
        None,
        &format!("garbage collector for tenant {tenant_shard_id}"),
        {
            let tenant = Arc::clone(tenant);
            let can_start = can_start.cloned();
            async move {
                let cancel = task_mgr::shutdown_token(); // NB: must be in async context
                tokio::select! {
                    _ = cancel.cancelled() => return Ok(()),
                    _ = Barrier::maybe_wait(can_start) => {}
                };
                TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
                defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
                gc_loop(tenant, cancel)
                    .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                    .await;
                Ok(())
            }
        },
    );

    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::TenantHousekeeping,
        tenant_shard_id,
        None,
        &format!("housekeeping for tenant {tenant_shard_id}"),
        {
            let tenant = Arc::clone(tenant);
            let can_start = can_start.cloned();
            async move {
                let cancel = task_mgr::shutdown_token(); // NB: must be in async context
                tokio::select! {
                    _ = cancel.cancelled() => return Ok(()),
                    _ = Barrier::maybe_wait(can_start) => {}
                };
                TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
                defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
                tenant_housekeeping_loop(tenant, cancel)
                    .instrument(info_span!("tenant_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                    .await;
                Ok(())
            }
        },
    );
}

/// Compaction task's main loop.
async fn compaction_loop(tenant: Arc, cancel: CancellationToken) {
    const BASE_BACKOFF_SECS: f64 = 1.0;
    const MAX_BACKOFF_SECS: f64 = 300.0;
    const RECHECK_CONFIG_INTERVAL: Duration = Duration::from_secs(10);

    let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
    let mut period = tenant.get_compaction_period();
    let mut error_run = 0; // consecutive errors

    // Stagger the compaction loop across tenants.
    if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
        return;
    }
    if sleep_random(period, &cancel).await.is_err() {
        return;
    }

    loop {
        // Recheck that we're still active.
        if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
            return;
        }

        // Refresh the period. If compaction is disabled, check again in a bit.
        period = tenant.get_compaction_period();
        if period == Duration::ZERO {
            #[cfg(not(feature = "testing"))]
            info!("automatic compaction is disabled");
            tokio::select! {
                _ = tokio::time::sleep(RECHECK_CONFIG_INTERVAL) => {},
                _ = cancel.cancelled() => return,
            }
            continue;
        }

        // Wait for the next compaction run.
        //
        // After an error only the backoff timer is armed. Otherwise we wake up on
        // whichever comes first: the period elapsing or the L0 compaction trigger.
        let backoff = exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS);
        tokio::select! {
            _ = tokio::time::sleep(backoff), if error_run > 0 => {},
            _ = tokio::time::sleep(period), if error_run == 0 => {},
            _ = tenant.l0_compaction_trigger.notified(), if error_run == 0 => {},
            _ = cancel.cancelled() => return,
        }

        // Run compaction.
        let iteration = Iteration {
            started_at: Instant::now(),
            period,
            kind: BackgroundLoopKind::Compaction,
        };
        let IterationResult { output, elapsed } = iteration
            .run(tenant.compaction_iteration(&cancel, &ctx))
            .await;

        match output {
            Ok(outcome) => {
                error_run = 0;
                // If there's more compaction work, L0 or not, schedule an immediate run.
                match outcome {
                    CompactionOutcome::Done => {}
                    CompactionOutcome::Skipped => {}
                    CompactionOutcome::YieldForL0 => tenant.l0_compaction_trigger.notify_one(),
                    CompactionOutcome::Pending => tenant.l0_compaction_trigger.notify_one(),
                }
            }

            Err(err) => {
                error_run += 1;
                // Computed for the log message only; the actual sleep duration is
                // recomputed from `error_run` at the top of the next iteration.
                let backoff =
                    exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS);
                log_compaction_error(
                    &err,
                    Some((error_run, backoff)),
                    cancel.is_cancelled(),
                    false,
                );
                continue;
            }
        }

        // NB: this log entry is recorded by performance tests.
        debug!(
            elapsed_ms = elapsed.as_millis(),
            "compaction iteration complete"
        );
    }
}

/// Logs a compaction error at a level derived from its cause.
///
/// The level is INFO if the error reports itself as a cancellation or if
/// `task_cancelled` is set, and ERROR otherwise. With `retry_info` (error count and
/// upcoming sleep duration) the message announces the retry. Without it,
/// `degrade_to_warning` downgrades an ERROR to a warning; it has no effect when
/// `retry_info` is given.
pub(crate) fn log_compaction_error(
    err: &CompactionError,
    retry_info: Option<(u32, Duration)>,
    task_cancelled: bool,
    degrade_to_warning: bool,
) {
    let is_cancel = err.is_cancel();

    let level = if is_cancel || task_cancelled {
        Level::INFO
    } else {
        Level::ERROR
    };

    if let Some((error_count, sleep_duration)) = retry_info {
        match level {
            Level::ERROR => {
                error!(
                    "Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}"
                )
            }
            Level::INFO => {
                info!(
                    "Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}"
                )
            }
            // `level` is only ever assigned INFO or ERROR above.
            level => unimplemented!("unexpected level {level:?}"),
        }
    } else {
        match level {
            Level::ERROR if degrade_to_warning => warn!("Compaction failed and discarded: {err:#}"),
            Level::ERROR => error!("Compaction failed: {err:?}"),
            Level::INFO => info!("Compaction failed: {err:#}"),
            level => unimplemented!("unexpected level {level:?}"),
        }
    }
}

/// GC task's main loop.
async fn gc_loop(tenant: Arc, cancel: CancellationToken) {
    const MAX_BACKOFF_SECS: f64 = 300.0;
    let mut error_run = 0; // consecutive errors

    // GC might require downloading, to find the cutoff LSN that corresponds to the
    // cutoff specified as time.
    let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
    let mut first = true;

    loop {
        if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
            return;
        }

        let period = tenant.get_gc_period();

        // Before the first iteration only, sleep for a random duration of up to one period.
        if first {
            first = false;
            if sleep_random(period, &cancel).await.is_err() {
                break;
            }
        }

        let gc_horizon = tenant.get_gc_horizon();
        let sleep_duration;
        if period == Duration::ZERO || gc_horizon == 0 {
            #[cfg(not(feature = "testing"))]
            info!("automatic GC is disabled");
            // check again in 10 seconds, in case it's been enabled again.
            sleep_duration = Duration::from_secs(10);
        } else {
            let iteration = Iteration {
                started_at: Instant::now(),
                period,
                kind: BackgroundLoopKind::Gc,
            };
            // Run gc
            let IterationResult { output, elapsed: _ } = iteration
                .run(tenant.gc_iteration(
                    None,
                    gc_horizon,
                    tenant.get_pitr_interval(),
                    &cancel,
                    &ctx,
                ))
                .await;
            match output {
                Ok(_) => {
                    error_run = 0;
                    sleep_duration = period;
                }
                Err(crate::tenant::GcError::TenantCancelled) => {
                    return;
                }
                Err(e) => {
                    error_run += 1;
                    let wait_duration =
                        exponential_backoff_duration(error_run, 1.0, MAX_BACKOFF_SECS);

                    if matches!(e, crate::tenant::GcError::TimelineCancelled) {
                        // Timeline was cancelled during gc. We might either be in an event
                        // that affects the entire tenant (tenant deletion, pageserver shutdown),
                        // or in one that affects the timeline only (timeline deletion).
                        // Therefore, don't exit the loop.
                        info!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}");
                    } else {
                        error!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}");
                    }

                    sleep_duration = wait_duration;
                }
            }
        };

        // Cancellable sleep: `timeout` returns `Ok` only if `cancel` fired before
        // `sleep_duration` elapsed, in which case the loop ends.
        if tokio::time::timeout(sleep_duration, cancel.cancelled())
            .await
            .is_ok()
        {
            break;
        }
    }
}

/// Tenant housekeeping's main loop.
async fn tenant_housekeeping_loop(tenant: Arc, cancel: CancellationToken) {
    // Start of the interval covered by the next throttle stats report.
    let mut last_throttle_flag_reset_at = Instant::now();
    loop {
        if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
            return;
        }

        // Use the same period as compaction; it's not worth a separate setting. But if it's set to
        // zero (to disable compaction), then use a reasonable default. Jitter it by 5%.
        let period = match tenant.get_compaction_period() {
            Duration::ZERO => humantime::parse_duration(DEFAULT_COMPACTION_PERIOD).unwrap(),
            period => period,
        };

        // From here on `period` is the jittered duration that was actually slept.
        let Ok(period) = sleep_jitter(period, period * 5 / 100, &cancel).await else {
            break;
        };

        // Do tenant housekeeping.
        let iteration = Iteration {
            started_at: Instant::now(),
            period,
            kind: BackgroundLoopKind::TenantHouseKeeping,
        };
        iteration.run(tenant.housekeeping()).await;

        // Log any getpage throttling.
        info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
            let now = Instant::now();
            let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
            let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats();
            // Stay quiet when nothing was throttled in this interval.
            if count_throttled == 0 {
                return;
            }
            let allowed_rps = tenant.pagestream_throttle.steady_rps();
            let delta = now - prev;
            info!(
                n_seconds=%format_args!("{:.3}", delta.as_secs_f64()),
                count_accounted = count_accounted_finish, // don't break existing log scraping
                count_throttled,
                sum_throttled_usecs,
                count_accounted_start, // log after pre-existing fields to not break existing log scraping
                allowed_rps=%format_args!("{allowed_rps:.0}"),
                "shard was throttled in the last n_seconds"
            );
        });
    }
}

/// Waits until the tenant becomes active, or returns `ControlFlow::Break()` to shut down.
async fn wait_for_active_tenant(
    tenant: &Arc,
    cancel: &CancellationToken,
) -> ControlFlow<()> {
    // Fast path: already active, no need to subscribe.
    if tenant.current_state() == TenantState::Active {
        return ControlFlow::Continue(());
    }

    let mut update_rx = tenant.subscribe_for_state_updates();
    tokio::select! {
        result = update_rx.wait_for(|s| s == &TenantState::Active) => {
            // An error from `wait_for` is treated as a request to shut down.
            if result.is_err() {
                return ControlFlow::Break(());
            }
            debug!("Tenant state changed to active, continuing the task loop");
            ControlFlow::Continue(())
        },
        _ = cancel.cancelled() => ControlFlow::Break(()),
    }
}

/// Error returned by the sleep helpers below when the cancellation token fires first.
#[derive(thiserror::Error, Debug)]
#[error("cancelled")]
pub(crate) struct Cancelled;

/// Sleeps for a random interval up to the given max value.
///
/// This delay prevents a thundering herd of background tasks and will likely keep them running on
/// different periods for more stable load.
pub(crate) async fn sleep_random(
    max: Duration,
    cancel: &CancellationToken,
) -> Result {
    sleep_random_range(Duration::ZERO..=max, cancel).await
}

/// Sleeps for a random interval in the given range. Returns the duration.
///
/// Returns immediately (without checking `cancel`) if the sampled delay is zero, and
/// `Err(Cancelled)` if `cancel` fires before the delay has elapsed.
pub(crate) async fn sleep_random_range(
    interval: RangeInclusive,
    cancel: &CancellationToken,
) -> Result {
    let delay = rand::rng().random_range(interval);
    if delay == Duration::ZERO {
        return Ok(delay);
    }
    tokio::select! {
        _ = cancel.cancelled() => Err(Cancelled),
        _ = tokio::time::sleep(delay) => Ok(delay),
    }
}

/// Sleeps for an interval with a random jitter.
pub(crate) async fn sleep_jitter(
    duration: Duration,
    jitter: Duration,
    cancel: &CancellationToken,
) -> Result {
    // Saturating arithmetic keeps the range well-formed (`from <= to`) even when
    // `jitter` exceeds `duration` or the sum would overflow.
    let from = duration.saturating_sub(jitter);
    let to = duration.saturating_add(jitter);
    sleep_random_range(from..=to, cancel).await
}

/// Describes one run of a background loop: when it started, the period it is
/// configured with, and which loop it belongs to.
struct Iteration {
    started_at: Instant,
    period: Duration,
    kind: BackgroundLoopKind,
}

/// The future's output together with the time elapsed since `Iteration::started_at`.
struct IterationResult {
    output: O,
    elapsed: Duration,
}

impl Iteration {
    /// Drives `fut` to completion, logging "still running" each time another `period`
    /// passes without it finishing, then reports a period overrun if there was one.
    #[instrument(skip_all)]
    pub(crate) async fn run, O>(self, fut: F) -> IterationResult {
        let mut fut = pin!(fut);

        // Wrap `fut` into a future that logs a message every `period` so that we get a
        // very obvious breadcrumb in the logs _while_ a slow iteration is happening.
        let output = loop {
            match tokio::time::timeout(self.period, &mut fut).await {
                Ok(r) => break r,
                Err(_) => info!("still running"),
            }
        };
        let elapsed = self.started_at.elapsed();
        warn_when_period_overrun(elapsed, self.period, self.kind);

        IterationResult { output, elapsed }
    }
}

// NB: the `task` and `period` are used for metrics labels.
/// Logs at info level and increments the period-overrun counter when `elapsed` is at
/// least `period`. Does nothing for a zero `period`.
pub(crate) fn warn_when_period_overrun(
    elapsed: Duration,
    period: Duration,
    task: BackgroundLoopKind,
) {
    // Duration::ZERO will happen because it's the "disable [bgtask]" value.
    if elapsed >= period && period != Duration::ZERO {
        // humantime does no significant digits clamping whereas Duration's debug is a bit more
        // intelligent. however it makes sense to keep the "configuration format" for period, even
        // though there's no way to output the actual config value.
        info!(
            ?elapsed,
            period = %humantime::format_duration(period),
            ?task,
            "task iteration took longer than the configured period"
        );
        metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
            .with_label_values(&[task.into(), &format!("{}", period.as_secs())])
            .inc();
    }
}

================================================
FILE: pageserver/src/tenant/throttle.rs
================================================
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Instant;

use arc_swap::ArcSwap;
use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter};

/// Throttle for `async` functions.
///
/// Runtime reconfigurable.
///
/// To share a throttle among multiple entities, wrap it in an [`Arc`].
///
/// The initial use case for this is tenant-wide throttling of getpage@lsn requests.
pub struct Throttle {
    inner: ArcSwap,
    /// will be turned into [`Stats::count_accounted_start`]
    count_accounted_start: AtomicU64,
    /// will be turned into [`Stats::count_accounted_finish`]
    count_accounted_finish: AtomicU64,
    /// will be turned into [`Stats::count_throttled`]
    count_throttled: AtomicU64,
    /// will be turned into [`Stats::sum_throttled_usecs`]
    sum_throttled_usecs: AtomicU64,
}

/// The reconfigurable part of a [`Throttle`]; replaced wholesale on reconfiguration.
pub struct Inner {
    enabled: bool,
    rate_limiter: Arc,
}

pub type Config = pageserver_api::models::ThrottleConfig;

/// See [`Throttle::reset_stats`].
pub struct Stats {
    /// Number of requests that started [`Throttle::throttle`] calls.
    pub count_accounted_start: u64,
    /// Number of requests that finished [`Throttle::throttle`] calls.
    pub count_accounted_finish: u64,
    /// Subset of the `accounted` requests that were actually throttled.
    /// Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
    pub count_throttled: u64,
    /// Sum of microseconds that throttled requests spent waiting for throttling.
pub sum_throttled_usecs: u64, } pub enum ThrottleResult { NotThrottled { end: Instant }, Throttled { end: Instant }, } impl Throttle { pub fn new(config: Config) -> Self { Self { inner: ArcSwap::new(Arc::new(Self::new_inner(config))), count_accounted_start: AtomicU64::new(0), count_accounted_finish: AtomicU64::new(0), count_throttled: AtomicU64::new(0), sum_throttled_usecs: AtomicU64::new(0), } } fn new_inner(config: Config) -> Inner { let Config { enabled, initial, refill_interval, refill_amount, max, } = config; // steady rate, we expect `refill_amount` requests per `refill_interval`. // dividing gives us the rps. let rps = f64::from(refill_amount.get()) / refill_interval.as_secs_f64(); let config = LeakyBucketConfig::new(rps, f64::from(max)); // initial tracks how many tokens are available to put in the bucket // we want how many tokens are currently in the bucket let initial_tokens = max - initial; let rate_limiter = RateLimiter::with_initial_tokens(config, f64::from(initial_tokens)); Inner { enabled: enabled.is_enabled(), rate_limiter: Arc::new(rate_limiter), } } pub fn reconfigure(&self, config: Config) { self.inner.store(Arc::new(Self::new_inner(config))); } /// The [`Throttle`] keeps an internal flag that is true if there was ever any actual throttling. /// This method allows retrieving & resetting that flag. /// Useful for periodic reporting. pub fn reset_stats(&self) -> Stats { let count_accounted_start = self.count_accounted_start.swap(0, Ordering::Relaxed); let count_accounted_finish = self.count_accounted_finish.swap(0, Ordering::Relaxed); let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed); let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed); Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs, } } /// See [`Config::steady_rps`]. pub fn steady_rps(&self) -> f64 { self.inner.load().rate_limiter.steady_rps() } /// `start` must be [`Instant::now`] or earlier. 
pub async fn throttle(&self, key_count: usize, start: Instant) -> ThrottleResult { let inner = self.inner.load_full(); // clones the `Inner` Arc if !inner.enabled { return ThrottleResult::NotThrottled { end: start }; } self.count_accounted_start.fetch_add(1, Ordering::Relaxed); let did_throttle = inner.rate_limiter.acquire(key_count).await; self.count_accounted_finish.fetch_add(1, Ordering::Relaxed); if did_throttle { self.count_throttled.fetch_add(1, Ordering::Relaxed); let end = Instant::now(); let wait_time = end - start; self.sum_throttled_usecs .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed); ThrottleResult::Throttled { end } } else { ThrottleResult::NotThrottled { end: start } } } } ================================================ FILE: pageserver/src/tenant/timeline/analysis.rs ================================================ use std::collections::BTreeSet; use std::ops::Range; use utils::lsn::Lsn; use super::Timeline; use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; #[derive(serde::Serialize)] pub(crate) struct RangeAnalysis { start: String, end: String, has_image: bool, num_of_deltas_above_image: usize, total_num_of_deltas: usize, num_of_l0: usize, } impl Timeline { pub(crate) async fn perf_info(&self) -> Vec { // First, collect all split points of the layers. 
let mut split_points = BTreeSet::new(); let mut delta_ranges = Vec::new(); let mut image_ranges = Vec::new(); let num_of_l0; let all_layer_files = { let guard = self .layers .read(LayerManagerLockHolder::GetLayerMapInfo) .await; num_of_l0 = guard.layer_map().unwrap().level0_deltas().len(); guard.all_persistent_layers() }; let lsn = self.get_last_record_lsn(); for key in all_layer_files { split_points.insert(key.key_range.start); split_points.insert(key.key_range.end); if key.is_delta { delta_ranges.push((key.key_range.clone(), key.lsn_range.clone())); } else { image_ranges.push((key.key_range.clone(), key.lsn_range.start)); } } // For each split range, compute the estimated read amplification. let split_points = split_points.into_iter().collect::>(); let mut result = Vec::new(); for i in 0..(split_points.len() - 1) { let start = split_points[i]; let end = split_points[i + 1]; // Find the latest image layer that contains the information. let mut maybe_image_layers = image_ranges .iter() // We insert split points for all image layers, and therefore a `contains` check for the start point should be enough. 
.filter(|(key_range, img_lsn)| key_range.contains(&start) && img_lsn <= &lsn) .cloned() .collect::>(); maybe_image_layers.sort_by(|a, b| a.1.cmp(&b.1)); let image_layer = maybe_image_layers.last().cloned(); let lsn_filter_start = image_layer .as_ref() .map(|(_, lsn)| *lsn) .unwrap_or(Lsn::INVALID); fn overlaps_with(lsn_range_a: &Range, lsn_range_b: &Range) -> bool { !(lsn_range_a.end <= lsn_range_b.start || lsn_range_a.start >= lsn_range_b.end) } let maybe_delta_layers = delta_ranges .iter() .filter(|(key_range, lsn_range)| { key_range.contains(&start) && overlaps_with(&(lsn_filter_start..lsn), lsn_range) }) .cloned() .collect::>(); let pitr_delta_layers = delta_ranges .iter() .filter(|(key_range, _)| key_range.contains(&start)) .cloned() .collect::>(); result.push(RangeAnalysis { start: start.to_string(), end: end.to_string(), has_image: image_layer.is_some(), num_of_deltas_above_image: maybe_delta_layers.len(), total_num_of_deltas: pitr_delta_layers.len(), num_of_l0, }); } result } } ================================================ FILE: pageserver/src/tenant/timeline/compaction.rs ================================================ //! New compaction implementation. The algorithm itself is implemented in the //! compaction crate. This file implements the callbacks and structs that allow //! the algorithm to drive the process. //! //! The old legacy algorithm is implemented directly in `timeline.rs`. 
use std::cmp::min; use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; use std::time::{Duration, Instant}; use super::layer_manager::LayerManagerLockHolder; use super::{ CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder, GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline, }; use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::tenant::timeline::{DeltaEntry, RepartitionError}; use crate::walredo::RedoAttemptType; use anyhow::{Context, anyhow}; use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; use futures::FutureExt; use itertools::Itertools; use once_cell::sync::Lazy; use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; use pageserver_api::key::{KEY_SIZE, Key}; use pageserver_api::keyspace::{KeySpace, ShardedRange}; use pageserver_api::models::{CompactInfoResponse, CompactKeyRange}; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use pageserver_compaction::helpers::{fully_contains, overlaps_with}; use pageserver_compaction::interface::*; use serde::Serialize; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, info_span, trace, warn}; use utils::critical_timeline; use utils::id::TimelineId; use utils::lsn::Lsn; use wal_decoder::models::record::NeonWalRecord; use wal_decoder::models::value::Value; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; use crate::tenant::layer_map::LayerMap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::remote_timeline_client::index::GcCompactionState; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, 
SplitDeltaLayerWriter, SplitImageLayerWriter, }; use crate::tenant::storage_layer::filter_iterator::FilterIterator; use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{ AsLayerDesc, LayerVisibilityHint, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, }; use crate::tenant::tasks::log_compaction_error; use crate::tenant::timeline::{ DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer, drop_layer_manager_rlock, }; use crate::tenant::{DeltaLayer, MaybeOffloaded, PageReconstructError}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; /// Maximum number of deltas before generating an image layer in bottom-most compaction. const COMPACTION_DELTA_THRESHOLD: usize = 5; /// Ratio of shard-local pages below which we trigger shard ancestor layer rewrites. 0.3 means that /// <= 30% of layer pages must belong to the descendant shard to rewrite the layer. /// /// We choose a value < 0.5 to avoid rewriting all visible layers every time we do a power-of-two /// shard split, which gets expensive for large tenants. 
const ANCESTOR_COMPACTION_REWRITE_THRESHOLD: f64 = 0.3; #[derive(Default, Debug, Clone, Copy, Hash, PartialEq, Eq, Serialize)] pub struct GcCompactionJobId(pub usize); impl std::fmt::Display for GcCompactionJobId { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } } pub struct GcCompactionCombinedSettings { pub gc_compaction_enabled: bool, pub gc_compaction_verification: bool, pub gc_compaction_initial_threshold_kb: u64, pub gc_compaction_ratio_percent: u64, } #[derive(Debug, Clone)] pub enum GcCompactionQueueItem { MetaJob { /// Compaction options options: CompactOptions, /// Whether the compaction is triggered automatically (determines whether we need to update L2 LSN) auto: bool, }, SubCompactionJob { i: usize, total: usize, options: CompactOptions, }, Notify(GcCompactionJobId, Option), } /// Statistics for gc-compaction meta jobs, which contains several sub compaction jobs. #[derive(Debug, Clone, Serialize, Default)] pub struct GcCompactionMetaStatistics { /// The total number of sub compaction jobs. pub total_sub_compaction_jobs: usize, /// The total number of sub compaction jobs that failed. pub failed_sub_compaction_jobs: usize, /// The total number of sub compaction jobs that succeeded. pub succeeded_sub_compaction_jobs: usize, /// The layer size before compaction. pub before_compaction_layer_size: u64, /// The layer size after compaction. pub after_compaction_layer_size: u64, /// The start time of the meta job. pub start_time: Option>, /// The end time of the meta job. pub end_time: Option>, /// The duration of the meta job. pub duration_secs: f64, /// The id of the meta job. pub meta_job_id: GcCompactionJobId, /// The LSN below which the layers are compacted, used to compute the statistics. 
pub below_lsn: Lsn, /// The retention ratio of the meta job (after_compaction_layer_size / before_compaction_layer_size) pub retention_ratio: f64, } impl GcCompactionMetaStatistics { fn finalize(&mut self) { let end_time = chrono::Utc::now(); if let Some(start_time) = self.start_time { if end_time > start_time { let delta = end_time - start_time; if let Ok(std_dur) = delta.to_std() { self.duration_secs = std_dur.as_secs_f64(); } } } self.retention_ratio = self.after_compaction_layer_size as f64 / (self.before_compaction_layer_size as f64 + 1.0); self.end_time = Some(end_time); } } impl GcCompactionQueueItem { pub fn into_compact_info_resp( self, id: GcCompactionJobId, running: bool, ) -> Option { match self { GcCompactionQueueItem::MetaJob { options, .. } => Some(CompactInfoResponse { compact_key_range: options.compact_key_range, compact_lsn_range: options.compact_lsn_range, sub_compaction: options.sub_compaction, running, job_id: id.0, }), GcCompactionQueueItem::SubCompactionJob { options, .. } => Some(CompactInfoResponse { compact_key_range: options.compact_key_range, compact_lsn_range: options.compact_lsn_range, sub_compaction: options.sub_compaction, running, job_id: id.0, }), GcCompactionQueueItem::Notify(_, _) => None, } } } #[derive(Default)] struct GcCompactionGuardItems { notify: Option>, permit: Option, } struct GcCompactionQueueInner { running: Option<(GcCompactionJobId, GcCompactionQueueItem)>, queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>, guards: HashMap, last_id: GcCompactionJobId, meta_statistics: Option, } impl GcCompactionQueueInner { fn next_id(&mut self) -> GcCompactionJobId { let id = self.last_id; self.last_id = GcCompactionJobId(id.0 + 1); id } } /// A structure to store gc_compaction jobs. pub struct GcCompactionQueue { /// All items in the queue, and the currently-running job. inner: std::sync::Mutex, /// Ensure only one thread is consuming the queue. 
consumer_lock: tokio::sync::Mutex<()>, } static CONCURRENT_GC_COMPACTION_TASKS: Lazy> = Lazy::new(|| { // Only allow one timeline on one pageserver to run gc compaction at a time. Arc::new(Semaphore::new(1)) }); impl GcCompactionQueue { pub fn new() -> Self { GcCompactionQueue { inner: std::sync::Mutex::new(GcCompactionQueueInner { running: None, queued: VecDeque::new(), guards: HashMap::new(), last_id: GcCompactionJobId(0), meta_statistics: None, }), consumer_lock: tokio::sync::Mutex::new(()), } } pub fn cancel_scheduled(&self) { let mut guard = self.inner.lock().unwrap(); guard.queued.clear(); // TODO: if there is a running job, we should keep the gc guard. However, currently, the cancel // API is only used for testing purposes, so we can drop everything here. guard.guards.clear(); } /// Schedule a manual compaction job. pub fn schedule_manual_compaction( &self, options: CompactOptions, notify: Option>, ) -> GcCompactionJobId { let mut guard = self.inner.lock().unwrap(); let id = guard.next_id(); guard.queued.push_back(( id, GcCompactionQueueItem::MetaJob { options, auto: false, }, )); guard.guards.entry(id).or_default().notify = notify; info!("scheduled compaction job id={}", id); id } /// Schedule an auto compaction job. fn schedule_auto_compaction( &self, options: CompactOptions, permit: OwnedSemaphorePermit, ) -> GcCompactionJobId { let mut guard = self.inner.lock().unwrap(); let id = guard.next_id(); guard.queued.push_back(( id, GcCompactionQueueItem::MetaJob { options, auto: true, }, )); guard.guards.entry(id).or_default().permit = Some(permit); id } /// Trigger an auto compaction. pub async fn trigger_auto_compaction( &self, timeline: &Arc, ) -> Result<(), CompactionError> { let GcCompactionCombinedSettings { gc_compaction_enabled, gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, .. 
} = timeline.get_gc_compaction_settings(); if !gc_compaction_enabled { return Ok(()); } if self.remaining_jobs_num() > 0 { // Only schedule auto compaction when the queue is empty return Ok(()); } if timeline.ancestor_timeline().is_some() { // Do not trigger auto compaction for child timelines. We haven't tested // it enough in staging yet. return Ok(()); } if timeline.get_gc_compaction_watermark() == Lsn::INVALID { // If the gc watermark is not set, we don't need to trigger auto compaction. // This check is the same as in `gc_compaction_split_jobs` but we don't log // here and we can also skip the computation of the trigger condition earlier. return Ok(()); } let Ok(permit) = CONCURRENT_GC_COMPACTION_TASKS.clone().try_acquire_owned() else { // Only allow one compaction run at a time. TODO: As we do `try_acquire_owned`, we cannot ensure // the fairness of the lock across timelines. We should listen for both `acquire` and `l0_compaction_trigger` // to ensure the fairness while avoid starving other tasks. 
return Ok(()); }; let gc_compaction_state = timeline.get_gc_compaction_state(); let l2_lsn = gc_compaction_state .map(|x| x.last_completed_lsn) .unwrap_or(Lsn::INVALID); let layers = { let guard = timeline .layers .read(LayerManagerLockHolder::GetLayerMapInfo) .await; let layer_map = guard.layer_map()?; layer_map.iter_historic_layers().collect_vec() }; let mut l2_size: u64 = 0; let mut l1_size = 0; let gc_cutoff = *timeline.get_applied_gc_cutoff_lsn(); for layer in layers { if layer.lsn_range.start <= l2_lsn { l2_size += layer.file_size(); } else if layer.lsn_range.start <= gc_cutoff { l1_size += layer.file_size(); } } fn trigger_compaction( l1_size: u64, l2_size: u64, gc_compaction_initial_threshold_kb: u64, gc_compaction_ratio_percent: u64, ) -> bool { const AUTO_TRIGGER_LIMIT: u64 = 150 * 1024 * 1024 * 1024; // 150GB if l1_size + l2_size >= AUTO_TRIGGER_LIMIT { // Do not auto-trigger when physical size >= 150GB return false; } // initial trigger if l2_size == 0 && l1_size >= gc_compaction_initial_threshold_kb * 1024 { info!( "trigger auto-compaction because l1_size={} >= gc_compaction_initial_threshold_kb={}", l1_size, gc_compaction_initial_threshold_kb ); return true; } // size ratio trigger if l2_size == 0 { return false; } if l1_size as f64 / l2_size as f64 >= (gc_compaction_ratio_percent as f64 / 100.0) { info!( "trigger auto-compaction because l1_size={} / l2_size={} > gc_compaction_ratio_percent={}", l1_size, l2_size, gc_compaction_ratio_percent ); return true; } false } if trigger_compaction( l1_size, l2_size, gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, ) { self.schedule_auto_compaction( CompactOptions { flags: { let mut flags = EnumSet::new(); flags |= CompactFlags::EnhancedGcBottomMostCompaction; if timeline.get_compaction_l0_first() { flags |= CompactFlags::YieldForL0; } flags }, sub_compaction: true, // Only auto-trigger gc-compaction over the data keyspace due to concerns in // https://github.com/neondatabase/neon/issues/11318. 
compact_key_range: Some(CompactKeyRange { start: Key::MIN, end: Key::metadata_key_range().start, }), compact_lsn_range: None, sub_compaction_max_job_size_mb: None, gc_compaction_do_metadata_compaction: false, }, permit, ); info!( "scheduled auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}", l1_size, l2_size, l2_lsn, gc_cutoff ); } else { debug!( "did not trigger auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}", l1_size, l2_size, l2_lsn, gc_cutoff ); } Ok(()) } async fn collect_layer_below_lsn( &self, timeline: &Arc, lsn: Lsn, ) -> Result { let guard = timeline .layers .read(LayerManagerLockHolder::GetLayerMapInfo) .await; let layer_map = guard.layer_map()?; let layers = layer_map.iter_historic_layers().collect_vec(); let mut size = 0; for layer in layers { if layer.lsn_range.start <= lsn { size += layer.file_size(); } } Ok(size) } /// Notify the caller the job has finished and unblock GC. fn notify_and_unblock(&self, id: GcCompactionJobId) { info!("compaction job id={} finished", id); let mut guard = self.inner.lock().unwrap(); if let Some(items) = guard.guards.remove(&id) { if let Some(tx) = items.notify { let _ = tx.send(()); } } if let Some(ref meta_statistics) = guard.meta_statistics { if meta_statistics.meta_job_id == id { if let Ok(stats) = serde_json::to_string(&meta_statistics) { info!( "gc-compaction meta statistics for job id = {}: {}", id, stats ); } } } } fn clear_running_job(&self) { let mut guard = self.inner.lock().unwrap(); guard.running = None; } async fn handle_sub_compaction( &self, id: GcCompactionJobId, options: CompactOptions, timeline: &Arc, auto: bool, ) -> Result<(), CompactionError> { info!( "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" ); let res = timeline .gc_compaction_split_jobs( GcCompactJob::from_compact_options(options.clone()), options.sub_compaction_max_job_size_mb, ) .await; let jobs = match res { Ok(jobs) => jobs, Err(err) => { 
warn!("cannot split gc-compaction jobs: {}, unblocked gc", err); self.notify_and_unblock(id); return Err(err); } }; if jobs.is_empty() { info!("no jobs to run, skipping scheduled compaction task"); self.notify_and_unblock(id); } else { let jobs_len = jobs.len(); let mut pending_tasks = Vec::new(); // gc-compaction might pick more layers or fewer layers to compact. The L2 LSN does not need to be accurate. // And therefore, we simply assume the maximum LSN of all jobs is the expected L2 LSN. let expected_l2_lsn = jobs .iter() .map(|job| job.compact_lsn_range.end) .max() .unwrap(); for (i, job) in jobs.into_iter().enumerate() { // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` // until we do further refactors to allow directly call `compact_with_gc`. let mut flags: EnumSet = EnumSet::default(); flags |= CompactFlags::EnhancedGcBottomMostCompaction; if job.dry_run { flags |= CompactFlags::DryRun; } if options.flags.contains(CompactFlags::YieldForL0) { flags |= CompactFlags::YieldForL0; } let options = CompactOptions { flags, sub_compaction: false, compact_key_range: Some(job.compact_key_range.into()), compact_lsn_range: Some(job.compact_lsn_range.into()), sub_compaction_max_job_size_mb: None, gc_compaction_do_metadata_compaction: false, }; pending_tasks.push(GcCompactionQueueItem::SubCompactionJob { options, i, total: jobs_len, }); } if !auto { pending_tasks.push(GcCompactionQueueItem::Notify(id, None)); } else { pending_tasks.push(GcCompactionQueueItem::Notify(id, Some(expected_l2_lsn))); } let layer_size = self .collect_layer_below_lsn(timeline, expected_l2_lsn) .await?; { let mut guard = self.inner.lock().unwrap(); let mut tasks = Vec::new(); for task in pending_tasks { let id = guard.next_id(); tasks.push((id, task)); } tasks.reverse(); for item in tasks { guard.queued.push_front(item); } guard.meta_statistics = Some(GcCompactionMetaStatistics { meta_job_id: id, start_time: Some(chrono::Utc::now()), before_compaction_layer_size: 
layer_size, below_lsn: expected_l2_lsn, total_sub_compaction_jobs: jobs_len, ..Default::default() }); } info!( "scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len ); } Ok(()) } /// Take a job from the queue and process it. Returns if there are still pending tasks. pub async fn iteration( &self, cancel: &CancellationToken, ctx: &RequestContext, gc_block: &GcBlock, timeline: &Arc, ) -> Result { let res = self.iteration_inner(cancel, ctx, gc_block, timeline).await; if let Err(err) = &res { log_compaction_error(err, None, cancel.is_cancelled(), true); } match res { Ok(res) => Ok(res), Err(e) if e.is_cancel() => Err(e), Err(_) => { // There are some cases where traditional gc might collect some layer // files causing gc-compaction cannot read the full history of the key. // This needs to be resolved in the long-term by improving the compaction // process. For now, let's simply avoid such errors triggering the // circuit breaker. Ok(CompactionOutcome::Skipped) } } } async fn iteration_inner( &self, cancel: &CancellationToken, ctx: &RequestContext, gc_block: &GcBlock, timeline: &Arc, ) -> Result { let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else { return Err(CompactionError::Other(anyhow::anyhow!( "cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue." ))); }; let has_pending_tasks; let mut yield_for_l0 = false; let Some((id, item)) = ({ let mut guard = self.inner.lock().unwrap(); if let Some((id, item)) = guard.queued.pop_front() { guard.running = Some((id, item.clone())); has_pending_tasks = !guard.queued.is_empty(); Some((id, item)) } else { has_pending_tasks = false; None } }) else { self.trigger_auto_compaction(timeline).await?; // Always yield after triggering auto-compaction. Gc-compaction is a low-priority task and we // have not implemented preemption mechanism yet. 
We always want to yield it to more important // tasks if there is one. return Ok(CompactionOutcome::Done); }; match item { GcCompactionQueueItem::MetaJob { options, auto } => { if !options .flags .contains(CompactFlags::EnhancedGcBottomMostCompaction) { warn!( "ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", options ); } else if options.sub_compaction { info!( "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" ); self.handle_sub_compaction(id, options, timeline, auto) .await?; } else { // Auto compaction always enables sub-compaction so we don't need to handle update_l2_lsn // in this branch. let _gc_guard = match gc_block.start().await { Ok(guard) => guard, Err(e) => { self.notify_and_unblock(id); self.clear_running_job(); return Err(CompactionError::Other(anyhow!( "cannot run gc-compaction because gc is blocked: {}", e ))); } }; let res = timeline.compact_with_options(cancel, options, ctx).await; let compaction_result = match res { Ok(res) => res, Err(err) => { warn!(%err, "failed to run gc-compaction"); self.notify_and_unblock(id); self.clear_running_job(); return Err(err); } }; if compaction_result == CompactionOutcome::YieldForL0 { yield_for_l0 = true; } } } GcCompactionQueueItem::SubCompactionJob { options, i, total } => { // TODO: error handling, clear the queue if any task fails? 
let _gc_guard = match gc_block.start().await { Ok(guard) => guard, Err(e) => { self.clear_running_job(); return Err(CompactionError::Other(anyhow!( "cannot run gc-compaction because gc is blocked: {}", e ))); } }; info!("running gc-compaction subcompaction job {}/{}", i, total); let res = timeline.compact_with_options(cancel, options, ctx).await; let compaction_result = match res { Ok(res) => res, Err(err) => { warn!(%err, "failed to run gc-compaction subcompaction job"); self.clear_running_job(); let mut guard = self.inner.lock().unwrap(); if let Some(ref mut meta_statistics) = guard.meta_statistics { meta_statistics.failed_sub_compaction_jobs += 1; } return Err(err); } }; if compaction_result == CompactionOutcome::YieldForL0 { // We will permenantly give up a task if we yield for L0 compaction: the preempted subcompaction job won't be running // again. This ensures that we don't keep doing duplicated work within gc-compaction. Not directly returning here because // we need to clean things up before returning from the function. yield_for_l0 = true; } { let mut guard = self.inner.lock().unwrap(); if let Some(ref mut meta_statistics) = guard.meta_statistics { meta_statistics.succeeded_sub_compaction_jobs += 1; } } } GcCompactionQueueItem::Notify(id, l2_lsn) => { let below_lsn = { let mut guard = self.inner.lock().unwrap(); if let Some(ref mut meta_statistics) = guard.meta_statistics { meta_statistics.below_lsn } else { Lsn::INVALID } }; let layer_size = if below_lsn != Lsn::INVALID { self.collect_layer_below_lsn(timeline, below_lsn).await? 
} else { 0 }; { let mut guard = self.inner.lock().unwrap(); if let Some(ref mut meta_statistics) = guard.meta_statistics { meta_statistics.after_compaction_layer_size = layer_size; meta_statistics.finalize(); } } self.notify_and_unblock(id); if let Some(l2_lsn) = l2_lsn { let current_l2_lsn = timeline .get_gc_compaction_state() .map(|x| x.last_completed_lsn) .unwrap_or(Lsn::INVALID); if l2_lsn >= current_l2_lsn { info!("l2_lsn updated to {}", l2_lsn); timeline .update_gc_compaction_state(GcCompactionState { last_completed_lsn: l2_lsn, }) .map_err(CompactionError::Other)?; } else { warn!( "l2_lsn updated to {} but it is less than the current l2_lsn {}", l2_lsn, current_l2_lsn ); } } } } self.clear_running_job(); Ok(if yield_for_l0 { tracing::info!("give up gc-compaction: yield for L0 compaction"); CompactionOutcome::YieldForL0 } else if has_pending_tasks { CompactionOutcome::Pending } else { CompactionOutcome::Done }) } #[allow(clippy::type_complexity)] pub fn remaining_jobs( &self, ) -> ( Option<(GcCompactionJobId, GcCompactionQueueItem)>, VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>, ) { let guard = self.inner.lock().unwrap(); (guard.running.clone(), guard.queued.clone()) } pub fn remaining_jobs_num(&self) -> usize { let guard = self.inner.lock().unwrap(); guard.queued.len() + if guard.running.is_some() { 1 } else { 0 } } } /// A job description for the gc-compaction job. This structure describes the rectangle range that the job will /// process. The exact layers that need to be compacted/rewritten will be generated when `compact_with_gc` gets /// called. #[derive(Debug, Clone)] pub(crate) struct GcCompactJob { pub dry_run: bool, /// The key range to be compacted. The compaction algorithm will only regenerate key-value pairs within this range /// [left inclusive, right exclusive), and other pairs will be rewritten into new files if necessary. pub compact_key_range: Range, /// The LSN range to be compacted. 
The compaction algorithm will use this range to determine the layers to be /// selected for the compaction, and it does not guarantee the generated layers will have exactly the same LSN range /// as specified here. The true range being compacted is `min_lsn/max_lsn` in [`GcCompactionJobDescription`]. /// min_lsn will always <= the lower bound specified here, and max_lsn will always >= the upper bound specified here. pub compact_lsn_range: Range, /// See [`CompactOptions::gc_compaction_do_metadata_compaction`]. pub do_metadata_compaction: bool, } impl GcCompactJob { pub fn from_compact_options(options: CompactOptions) -> Self { GcCompactJob { dry_run: options.flags.contains(CompactFlags::DryRun), compact_key_range: options .compact_key_range .map(|x| x.into()) .unwrap_or(Key::MIN..Key::MAX), compact_lsn_range: options .compact_lsn_range .map(|x| x.into()) .unwrap_or(Lsn::INVALID..Lsn::MAX), do_metadata_compaction: options.gc_compaction_do_metadata_compaction, } } } /// A job description for the gc-compaction job. This structure is generated when `compact_with_gc` is called /// and contains the exact layers we want to compact. pub struct GcCompactionJobDescription { /// All layers to read in the compaction job selected_layers: Vec, /// GC cutoff of the job. This is the lowest LSN that will be accessed by the read/GC path and we need to /// keep all deltas <= this LSN or generate an image == this LSN. gc_cutoff: Lsn, /// LSNs to retain for the job. Read path will use this LSN so we need to keep deltas <= this LSN or /// generate an image == this LSN. retain_lsns_below_horizon: Vec, /// Maximum layer LSN processed in this compaction, that is max(end_lsn of layers). Exclusive. All data /// \>= this LSN will be kept and will not be rewritten. max_layer_lsn: Lsn, /// Minimum layer LSN processed in this compaction, that is min(start_lsn of layers). Inclusive. 
/// All access below (strict lower than `<`) this LSN will be routed through the normal read path instead of /// k-merge within gc-compaction. min_layer_lsn: Lsn, /// Only compact layers overlapping with this range. compaction_key_range: Range, /// When partial compaction is enabled, these layers need to be rewritten to ensure no overlap. /// This field is here solely for debugging. The field will not be read once the compaction /// description is generated. rewrite_layers: Vec>, } /// The result of bottom-most compaction for a single key at each LSN. #[derive(Debug)] #[cfg_attr(test, derive(PartialEq))] pub struct KeyLogAtLsn(pub Vec<(Lsn, Value)>); /// The result of bottom-most compaction. #[derive(Debug)] #[cfg_attr(test, derive(PartialEq))] pub(crate) struct KeyHistoryRetention { /// Stores logs to reconstruct the value at the given LSN, that is to say, logs <= LSN or image == LSN. pub(crate) below_horizon: Vec<(Lsn, KeyLogAtLsn)>, /// Stores logs to reconstruct the value at any LSN above the horizon, that is to say, log > LSN. pub(crate) above_horizon: KeyLogAtLsn, } impl KeyHistoryRetention { /// Hack: skip delta layer if we need to produce a layer of a same key-lsn. /// /// This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range. /// For example, consider the case where a single delta with range [0x10,0x50) exists. /// And we have branches at LSN 0x10, 0x20, 0x30. /// Then we delete branch @ 0x20. /// Bottom-most compaction may now delete the delta [0x20,0x30). /// And that wouldnt' change the shape of the layer. /// /// Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes. /// /// `discard_key` will only be called when the writer reaches its target (instead of for every key), so it's fine to grab a lock inside. 
async fn discard_key(key: &PersistentLayerKey, tline: &Arc, dry_run: bool) -> bool { if dry_run { return true; } if LayerMap::is_l0(&key.key_range, key.is_delta) { // gc-compaction should not produce L0 deltas, otherwise it will break the layer order. // We should ignore such layers. return true; } let layer_generation; { let guard = tline.layers.read(LayerManagerLockHolder::Compaction).await; if !guard.contains_key(key) { return false; } layer_generation = guard.get_from_key(key).metadata().generation; } if layer_generation == tline.generation { info!( key=%key, ?layer_generation, "discard layer due to duplicated layer key in the same generation", ); true } else { false } } /// Pipe a history of a single key to the writers. /// /// If `image_writer` is none, the images will be placed into the delta layers. /// The delta writer will contain all images and deltas (below and above the horizon) except the bottom-most images. #[allow(clippy::too_many_arguments)] async fn pipe_to( self, key: Key, delta_writer: &mut SplitDeltaLayerWriter<'_>, mut image_writer: Option<&mut SplitImageLayerWriter<'_>>, stat: &mut CompactionStatistics, ctx: &RequestContext, ) -> anyhow::Result<()> { let mut first_batch = true; for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon { if first_batch { if logs.len() == 1 && logs[0].1.is_image() { let Value::Image(img) = &logs[0].1 else { unreachable!() }; stat.produce_image_key(img); if let Some(image_writer) = image_writer.as_mut() { image_writer.put_image(key, img.clone(), ctx).await?; } else { delta_writer .put_value(key, cutoff_lsn, Value::Image(img.clone()), ctx) .await?; } } else { for (lsn, val) in logs { stat.produce_key(&val); delta_writer.put_value(key, lsn, val, ctx).await?; } } first_batch = false; } else { for (lsn, val) in logs { stat.produce_key(&val); delta_writer.put_value(key, lsn, val, ctx).await?; } } } let KeyLogAtLsn(above_horizon_logs) = self.above_horizon; for (lsn, val) in above_horizon_logs { stat.produce_key(&val); 
/// Verify if every key in the retention is readable by replaying the logs.
///
/// The retained history is walked in order (`below_horizon` first, then `above_horizon`)
/// while tracking the most recent base image and the WAL records accumulated since it.
/// At selected LSNs the accumulated state is handed to `tline.reconstruct_value` with
/// [`RedoAttemptType::GcCompaction`]:
///
/// - at every `below_horizon` LSN that is `>=` the smallest LSN in `full_history`,
/// - above the horizon, right before each image or will-init record replaces the
///   accumulated state (skipped when nothing has been accumulated yet),
/// - finally at the largest LSN in `full_history`.
///
/// `base_img_from_ancestor`, if present, seeds the base image. Returns `Ok(())` without
/// checking anything when `full_history` is empty. Returns an error if a reconstruction
/// fails, or if a mandatory check point has neither a base image nor any record.
async fn verify(
    &self,
    key: Key,
    base_img_from_ancestor: &Option<(Key, Lsn, Bytes)>,
    full_history: &[(Key, Lsn, Value)],
    tline: &Arc,
) -> anyhow::Result<()> {
    // Usually the min_lsn should be the first record but we do a full iteration to be safe.
    let Some(min_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).min() else {
        // This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`.
        return Ok(());
    };
    let Some(max_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).max() else {
        // This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`.
        return Ok(());
    };
    // Running reconstruction state: the latest base image (if any) plus the records seen after it.
    let mut base_img = base_img_from_ancestor
        .as_ref()
        .map(|(_, lsn, img)| (*lsn, img));
    let mut history = Vec::new();

    /// Reconstruct `key` at `lsn` from `base_img` + `history` and fail if that is not possible.
    ///
    /// When there is neither an image nor a record, the check is a no-op if `skip_empty`
    /// is set and an error otherwise.
    async fn collect_and_verify(
        key: Key,
        lsn: Lsn,
        base_img: &Option<(Lsn, &Bytes)>,
        history: &[(Lsn, &NeonWalRecord)],
        tline: &Arc,
        skip_empty: bool,
    ) -> anyhow::Result<()> {
        if base_img.is_none() && history.is_empty() {
            if skip_empty {
                return Ok(());
            }
            anyhow::bail!("verification failed: key {} has no history at {}", key, lsn);
        };

        let mut records = history
            .iter()
            .map(|(lsn, val)| (*lsn, (*val).clone()))
            .collect::>();

        // WAL redo requires records in the reverse LSN order
        records.reverse();
        let data = ValueReconstructState {
            img: base_img.as_ref().map(|(lsn, img)| (*lsn, (*img).clone())),
            records,
        };

        tline
            .reconstruct_value(key, lsn, data, RedoAttemptType::GcCompaction)
            .await
            .with_context(|| format!("verification failed for key {key} at lsn {lsn}"))?;

        Ok(())
    }

    for (retain_lsn, KeyLogAtLsn(logs)) in &self.below_horizon {
        for (lsn, val) in logs {
            match val {
                // An image supersedes everything accumulated so far.
                Value::Image(img) => {
                    base_img = Some((*lsn, img));
                    history.clear();
                }
                // A will-init record also resets the state, but becomes the first record itself.
                Value::WalRecord(rec) if val.will_init() => {
                    base_img = None;
                    history.clear();
                    history.push((*lsn, rec));
                }
                Value::WalRecord(rec) => {
                    history.push((*lsn, rec));
                }
            }
        }
        if *retain_lsn >= min_lsn {
            // Only verify after the key appears in the full history for the first time.

            // We don't modify history: in theory, we could replace the history with a single
            // image as in `generate_key_retention` to make redos at later LSNs faster. But we
            // want to verify everything as if they are read from the real layer map.
            collect_and_verify(key, *retain_lsn, &base_img, &history, tline, false)
                .await
                .context("below horizon retain_lsn")?;
        }
    }

    for (lsn, val) in &self.above_horizon.0 {
        match val {
            Value::Image(img) => {
                // Above the GC horizon, we verify every time we see an image.
                // The check runs on the state accumulated *before* this image replaces it.
                collect_and_verify(key, *lsn, &base_img, &history, tline, true)
                    .await
                    .context("above horizon full image")?;
                base_img = Some((*lsn, img));
                history.clear();
            }
            Value::WalRecord(rec) if val.will_init() => {
                // Above the GC horizon, we verify every time we see an init record.
                // The check runs on the state accumulated *before* this record resets it.
                collect_and_verify(key, *lsn, &base_img, &history, tline, true)
                    .await
                    .context("above horizon init record")?;
                base_img = None;
                history.clear();
                history.push((*lsn, rec));
            }
            Value::WalRecord(rec) => {
                history.push((*lsn, rec));
            }
        }
    }
    // Ensure the latest record is readable.
    collect_and_verify(key, max_lsn, &base_img, &history, tline, false)
        .await
        .context("latest record")?;
    Ok(())
}
compressed_retention_ratio: f64, } impl CompactionStatistics { fn estimated_size_of_value(val: &Value) -> usize { match val { Value::Image(img) => img.len(), Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(), _ => std::mem::size_of::(), } } fn estimated_size_of_key() -> usize { KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer) } fn visit_delta_layer(&mut self, size: u64) { self.delta_layer_visited.num += 1; self.delta_layer_visited.size += size; } fn visit_image_layer(&mut self, size: u64) { self.image_layer_visited.num += 1; self.image_layer_visited.size += size; } fn on_unique_key_visited(&mut self) { self.num_unique_keys_visited += 1; } fn visit_wal_key(&mut self, val: &Value) { self.wal_keys_visited.num += 1; self.wal_keys_visited.size += Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; } fn visit_image_key(&mut self, val: &Value) { self.image_keys_visited.num += 1; self.image_keys_visited.size += Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; } fn produce_key(&mut self, val: &Value) { match val { Value::Image(img) => self.produce_image_key(img), Value::WalRecord(_) => self.produce_wal_key(val), } } fn produce_wal_key(&mut self, val: &Value) { self.wal_produced.num += 1; self.wal_produced.size += Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; } fn produce_image_key(&mut self, val: &Bytes) { self.image_produced.num += 1; self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64; } fn discard_delta_layer(&mut self, original_size: u64) { self.delta_layer_discarded.num += 1; self.delta_layer_discarded.size += original_size; } fn discard_image_layer(&mut self, original_size: u64) { self.image_layer_discarded.num += 1; self.image_layer_discarded.size += original_size; } fn produce_delta_layer(&mut self, size: u64) { self.delta_layer_produced.num += 1; self.delta_layer_produced.size += 
size; } fn produce_image_layer(&mut self, size: u64) { self.image_layer_produced.num += 1; self.image_layer_produced.size += size; } fn finalize(&mut self) { let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size; let produced_key_value_size = self.image_produced.size + self.wal_produced.size; self.uncompressed_retention_ratio = produced_key_value_size as f64 / (original_key_value_size as f64 + 1.0); // avoid div by 0 let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size; let produced_physical_size = self.image_layer_produced.size + self.delta_layer_produced.size + self.image_layer_discarded.size + self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate self.compressed_retention_ratio = produced_physical_size as f64 / (original_physical_size as f64 + 1.0); // avoid div by 0 } } #[derive(Default, Debug, Clone, Copy, PartialEq, Eq)] pub enum CompactionOutcome { #[default] /// No layers need to be compacted after this round. Compaction doesn't need /// to be immediately scheduled. Done, /// Still has pending layers to be compacted after this round. Ideally, the scheduler /// should immediately schedule another compaction. Pending, /// A timeline needs L0 compaction. Yield and schedule an immediate L0 compaction pass (only /// guaranteed when `compaction_l0_first` is enabled). YieldForL0, /// Compaction was skipped, because the timeline is ineligible for compaction. Skipped, } impl Timeline { /// TODO: cancellation /// /// Returns whether the compaction has pending tasks. 
/// Run one pass of legacy (tiered) compaction on this timeline.
///
/// If `options.flags` contains `EnhancedGcBottomMostCompaction`, the call is delegated to
/// `compact_with_gc` and reports [`CompactionOutcome::Done`] on success. Otherwise dry-run
/// mode and explicit key/LSN ranges are rejected with [`CompactionError::Other`], and the
/// phases described in the "High level strategy" comment below run in order.
///
/// Outcomes:
/// - `OnlyL0Compaction` set: the L0 outcome is returned as-is.
/// - `YieldForL0` set and L0 still has work, or image layer creation was left incomplete:
///   [`CompactionOutcome::YieldForL0`].
/// - Shard ancestor compaction reported `Pending` or `YieldForL0`: that outcome.
/// - Otherwise [`CompactionOutcome::Done`].
///
/// Returns a cancellation error if the timeline is stopping. Repartitioning failures are
/// logged (or raised as critical alerts for corruption-like errors) but do not fail the pass.
pub(crate) async fn compact_legacy(
    self: &Arc,
    cancel: &CancellationToken,
    options: CompactOptions,
    ctx: &RequestContext,
) -> Result {
    if options
        .flags
        .contains(CompactFlags::EnhancedGcBottomMostCompaction)
    {
        self.compact_with_gc(cancel, options, ctx).await?;
        return Ok(CompactionOutcome::Done);
    }

    if options.flags.contains(CompactFlags::DryRun) {
        return Err(CompactionError::Other(anyhow!(
            "dry-run mode is not supported for legacy compaction for now"
        )));
    }

    if options.compact_key_range.is_some() || options.compact_lsn_range.is_some() {
        // maybe useful in the future? could implement this at some point
        return Err(CompactionError::Other(anyhow!(
            "compaction range is not supported for legacy compaction for now"
        )));
    }

    // High level strategy for compaction / image creation:
    //
    // 1. First, do a L0 compaction to ensure we move the L0
    // layers into the historic layer map and get flat levels of
    // layers. If we did not compact all L0 layers, we will
    // prioritize compacting the timeline again and not do
    // any of the compactions below.
    //
    // 2. Then, calculate the desired "partitioning" of the
    // currently in-use key space. The goal is to partition the
    // key space into roughly fixed-size chunks, but also take into
    // account any existing image layers, and try to align the
    // chunk boundaries with the existing image layers to avoid
    // too much churn. Also try to align chunk boundaries with
    // relation boundaries.  In principle, we don't know about
    // relation boundaries here, we just deal with key-value
    // pairs, and the code in pgdatadir_mapping.rs knows how to
    // map relations into key-value pairs. But in practice we know
    // that 'field6' is the block number, and the fields 1-5
    // identify a relation. This is just an optimization,
    // though.
    //
    // 3. Once we know the partitioning, for each partition,
    // decide if it's time to create a new image layer. The
    // criteria is: there has been too much "churn" since the last
    // image layer? The "churn" is fuzzy concept, it's a
    // combination of too many delta files, or too much WAL in
    // total in the delta file. Or perhaps: if creating an image
    // file would allow to delete some older files.
    //
    // 4. In the end, if the tenant gets auto-sharded, we will run
    // a shard-ancestor compaction.

    // Is the timeline being deleted?
    if self.is_stopping() {
        trace!("Dropping out of compaction on timeline shutdown");
        return Err(CompactionError::new_cancelled());
    }

    let target_file_size = self.get_checkpoint_distance();

    // Define partitioning schema if needed

    // HADRON
    // Computed once and shared by L0 compaction and image layer creation below.
    let force_image_creation_lsn = self.get_force_image_creation_lsn();

    // 1. L0 Compact
    let l0_outcome = {
        let timer = self.metrics.compact_time_histo.start_timer();
        let l0_outcome = self
            .compact_level0(
                target_file_size,
                options.flags.contains(CompactFlags::ForceL0Compaction),
                force_image_creation_lsn,
                ctx,
            )
            .await?;
        // Only recorded on success: an error above returns before reaching this line.
        timer.stop_and_record();
        l0_outcome
    };

    if options.flags.contains(CompactFlags::OnlyL0Compaction) {
        return Ok(l0_outcome);
    }

    // Yield if we have pending L0 compaction. The scheduler will do another pass.
    if (l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0)
        && options.flags.contains(CompactFlags::YieldForL0)
    {
        info!("image/ancestor compaction yielding for L0 compaction");
        return Ok(CompactionOutcome::YieldForL0);
    }

    let gc_cutoff = *self.applied_gc_cutoff_lsn.read();
    // `None` when the timeline has no persistent layers at all.
    let l0_l1_boundary_lsn = {
        // We do the repartition on the L0-L1 boundary. All data below the boundary
        // are compacted by L0 with low read amplification, thus making the `repartition`
        // function run fast.
        let guard = self
            .layers
            .read(LayerManagerLockHolder::GetLayerMapInfo)
            .await;
        guard
            .all_persistent_layers()
            .iter()
            .map(|x| {
                // Use the end LSN of delta layers OR the start LSN of image layers.
                if x.is_delta {
                    x.lsn_range.end
                } else {
                    x.lsn_range.start
                }
            })
            .max()
    };

    // `partition_mode` is only used as a label on the `create_image_layers` tracing span.
    let (partition_mode, partition_lsn) = {
        let last_repartition_lsn = self.partitioning.read().1;
        let lsn = match l0_l1_boundary_lsn {
            Some(boundary) => gc_cutoff
                .max(boundary)
                .max(last_repartition_lsn)
                .max(self.initdb_lsn)
                .max(self.ancestor_lsn),
            None => self.get_last_record_lsn(),
        };
        if lsn <= self.initdb_lsn || lsn <= self.ancestor_lsn {
            // Do not attempt to create image layers below the initdb or ancestor LSN -- no data below it
            ("l0_l1_boundary", self.get_last_record_lsn())
        } else {
            ("l0_l1_boundary", lsn)
        }
    };

    // 2. Repartition and create image layers if necessary
    match self
        .repartition(
            partition_lsn,
            self.get_compaction_target_size(),
            options.flags,
            ctx,
        )
        .await
    {
        Ok(((dense_partitioning, sparse_partitioning), lsn)) if lsn >= gc_cutoff => {
            // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
            let image_ctx = RequestContextBuilder::from(ctx)
                .access_stats_behavior(AccessStatsBehavior::Skip)
                .attached_child();

            // Image creation works on a single partitioning: dense parts followed by the
            // sparse parts converted to dense.
            let mut partitioning = dense_partitioning;
            partitioning
                .parts
                .extend(sparse_partitioning.into_dense().parts);

            // 3. Create new image layers for partitions that have been modified "enough".
            let mode = if options
                .flags
                .contains(CompactFlags::ForceImageLayerCreation)
            {
                ImageLayerCreationMode::Force
            } else {
                ImageLayerCreationMode::Try
            };
            let (image_layers, outcome) = self
                .create_image_layers(
                    &partitioning,
                    lsn,
                    force_image_creation_lsn,
                    mode,
                    &image_ctx,
                    self.last_image_layer_creation_status
                        .load()
                        .as_ref()
                        .clone(),
                    options.flags.contains(CompactFlags::YieldForL0),
                )
                .instrument(info_span!("create_image_layers", mode = %mode, partition_mode = %partition_mode, lsn = %lsn))
                .await
                .inspect_err(|err| {
                    // A missing key is raised as a critical alert before the error propagates.
                    if let CreateImageLayersError::GetVectoredError(
                        GetVectoredError::MissingKey(_),
                    ) = err
                    {
                        critical_timeline!(
                            self.tenant_shard_id,
                            self.timeline_id,
                            Some(&self.corruption_detected),
                            "missing key during compaction: {err:?}"
                        );
                    }
                })?;

            // Remember the status; it is passed back into `create_image_layers` on the next pass.
            self.last_image_layer_creation_status
                .store(Arc::new(outcome.clone()));

            self.upload_new_image_layers(image_layers)?;
            if let LastImageLayerCreationStatus::Incomplete { .. } = outcome {
                // Yield and do not do any other kind of compaction.
                info!(
                    "skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."
                );
                return Ok(CompactionOutcome::YieldForL0);
            }
        }
        Ok(_) => {
            // This happens very frequently so we don't want to log it.
            debug!("skipping repartitioning due to image compaction LSN being below GC cutoff");
        }

        // Suppress errors when cancelled.
        //
        // Log other errors but continue. Failure to repartition is normal, if the timeline was just created
        // as an empty timeline. Also in unit tests, when we use the timeline as a simple
        // key-value store, ignoring the datadir layout. Log the error but continue.
        //
        // TODO:
        // 1. shouldn't we return early here if we observe cancellation
        // 2. Experiment: can we stop checking self.cancel here?
        Err(_) if self.cancel.is_cancelled() => {}
        // TODO: try how we fare removing this branch
        Err(err) if err.is_cancel() => {}
        Err(RepartitionError::CollectKeyspace(
            e @ CollectKeySpaceError::Decode(_)
            | e @ CollectKeySpaceError::PageRead(
                PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
            ),
        )) => {
            // Alert on critical errors that indicate data corruption.
            critical_timeline!(
                self.tenant_shard_id,
                self.timeline_id,
                Some(&self.corruption_detected),
                "could not compact, repartitioning keyspace failed: {e:?}"
            );
        }
        Err(e) => error!(
            "could not compact, repartitioning keyspace failed: {:?}",
            e.into_anyhow()
        ),
    };

    let partition_count = self.partitioning.read().0.0.parts.len();

    // 4. Shard ancestor compaction
    if self.get_compaction_shard_ancestor() && self.shard_identity.count >= ShardCount::new(2) {
        // Limit the number of layer rewrites to the number of partitions: this means its
        // runtime should be comparable to a full round of image layer creations, rather than
        // being potentially much longer.
        let rewrite_max = partition_count;

        let outcome = self
            .compact_shard_ancestors(
                rewrite_max,
                options.flags.contains(CompactFlags::YieldForL0),
                ctx,
            )
            .await?;
        match outcome {
            CompactionOutcome::Pending | CompactionOutcome::YieldForL0 => return Ok(outcome),
            CompactionOutcome::Done | CompactionOutcome::Skipped => {}
        }
    }

    Ok(CompactionOutcome::Done)
}
pub(crate) fn get_force_image_creation_lsn(self: &Arc) -> Option { let image_creation_period = self.get_image_layer_force_creation_period()?; let current_lsn = self.get_last_record_lsn(); let pitr_lsn = self.gc_info.read().unwrap().cutoffs.time?; let pitr_interval = self.get_pitr_interval(); if pitr_lsn == Lsn::INVALID || pitr_interval.is_zero() { tracing::warn!( "pitr LSN/interval not found, skipping force image creation LSN calculation" ); return None; } let delta_lsn = current_lsn.checked_sub(pitr_lsn).unwrap().0 * image_creation_period.as_secs() / pitr_interval.as_secs(); let force_image_creation_lsn = current_lsn.checked_sub(delta_lsn).unwrap_or(Lsn(0)); tracing::info!( "Tenant shard {} computed force_image_creation_lsn: {}. Current lsn: {}, image_layer_force_creation_period: {:?}, GC cutoff: {}, PITR interval: {:?}", self.tenant_shard_id, force_image_creation_lsn, current_lsn, image_creation_period, pitr_lsn, pitr_interval ); Some(force_image_creation_lsn) } /* END_HADRON */ /// Check for layers that are elegible to be rewritten: /// - Shard splitting: After a shard split, ancestor layers beyond pitr_interval, so that /// we don't indefinitely retain keys in this shard that aren't needed. /// - For future use: layers beyond pitr_interval that are in formats we would /// rather not maintain compatibility with indefinitely. /// /// Note: this phase may read and write many gigabytes of data: use rewrite_max to bound /// how much work it will try to do in each compaction pass. async fn compact_shard_ancestors( self: &Arc, rewrite_max: usize, yield_for_l0: bool, ctx: &RequestContext, ) -> Result { let mut outcome = CompactionOutcome::Done; let mut drop_layers = Vec::new(); let mut layers_to_rewrite: Vec = Vec::new(); // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a // layer is behind this Lsn, it indicates that the layer is being retained beyond the // pitr_interval, for example because a branchpoint references it. 
// // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we // are rewriting layers. let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn(); let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time; let layers = self.layers.read(LayerManagerLockHolder::Compaction).await; let layers_iter = layers.layer_map()?.iter_historic_layers(); let (layers_total, mut layers_checked) = (layers_iter.len(), 0); for layer_desc in layers_iter { layers_checked += 1; let layer = layers.get_from_desc(&layer_desc); if layer.metadata().shard.shard_count == self.shard_identity.count { // This layer does not belong to a historic ancestor, no need to re-image it. continue; } // This layer was created on an ancestor shard: check if it contains any data for this shard. let sharded_range = ShardedRange::new(layer_desc.get_key_range(), &self.shard_identity); let layer_local_page_count = sharded_range.page_count(); let layer_raw_page_count = ShardedRange::raw_size(&layer_desc.get_key_range()); if layer_local_page_count == 0 { // This ancestral layer only covers keys that belong to other shards. // We include the full metadata in the log: if we had some critical bug that caused // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers. debug!(%layer, old_metadata=?layer.metadata(), "dropping layer after shard split, contains no keys for this shard", ); if cfg!(debug_assertions) { // Expensive, exhaustive check of keys in this layer: this guards against ShardedRange's calculations being // wrong. If ShardedRange claims the local page count is zero, then no keys in this layer // should be !is_key_disposable() // TODO: exclude sparse keyspace from this check, otherwise it will infinitely loop. 
let range = layer_desc.get_key_range(); let mut key = range.start; while key < range.end { debug_assert!(self.shard_identity.is_key_disposable(&key)); key = key.next(); } } drop_layers.push(layer); continue; } else if layer_local_page_count != u32::MAX && layer_local_page_count == layer_raw_page_count { debug!(%layer, "layer is entirely shard local ({} keys), no need to filter it", layer_local_page_count ); continue; } // Only rewrite a layer if we can reclaim significant space. if layer_local_page_count != u32::MAX && layer_local_page_count as f64 / layer_raw_page_count as f64 <= ANCESTOR_COMPACTION_REWRITE_THRESHOLD { debug!(%layer, "layer has a large share of local pages \ ({layer_local_page_count}/{layer_raw_page_count} > \ {ANCESTOR_COMPACTION_REWRITE_THRESHOLD}), not rewriting", ); } // Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually // without incurring the I/O cost of a rewrite. if layer_desc.get_lsn_range().end >= *latest_gc_cutoff { debug!(%layer, "Skipping rewrite of layer still in GC window ({} >= {})", layer_desc.get_lsn_range().end, *latest_gc_cutoff); continue; } // We do not yet implement rewrite of delta layers. if layer_desc.is_delta() { debug!(%layer, "Skipping rewrite of delta layer"); continue; } // We don't bother rewriting layers that aren't visible, since these won't be needed by // reads and will likely be garbage collected soon. if layer.visibility() != LayerVisibilityHint::Visible { debug!(%layer, "Skipping rewrite of invisible layer"); continue; } // Only rewrite layers if their generations differ. 
This guarantees: // - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one // - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage if layer.metadata().generation == self.generation { debug!(%layer, "Skipping rewrite, is not from old generation"); continue; } if layers_to_rewrite.len() >= rewrite_max { debug!(%layer, "Will rewrite layer on a future compaction, already rewrote {}", layers_to_rewrite.len() ); outcome = CompactionOutcome::Pending; break; } // Fall through: all our conditions for doing a rewrite passed. layers_to_rewrite.push(layer); } // Drop read lock on layer map before we start doing time-consuming I/O. drop(layers); // Drop out early if there's nothing to do. if layers_to_rewrite.is_empty() && drop_layers.is_empty() { return Ok(CompactionOutcome::Done); } info!( "starting shard ancestor compaction, rewriting {} layers and dropping {} layers, \ checked {layers_checked}/{layers_total} layers \ (latest_gc_cutoff={} pitr_cutoff={:?})", layers_to_rewrite.len(), drop_layers.len(), *latest_gc_cutoff, pitr_cutoff, ); let started = Instant::now(); let mut replace_image_layers = Vec::new(); let total = layers_to_rewrite.len(); for (i, layer) in layers_to_rewrite.into_iter().enumerate() { if self.cancel.is_cancelled() { return Err(CompactionError::new_cancelled()); } info!(layer=%layer, "rewriting layer after shard split: {}/{}", i, total); let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, &layer.layer_desc().key_range, layer.layer_desc().image_layer_lsn(), &self.gate, self.cancel.clone(), ctx, ) .await .map_err(CompactionError::Other)?; // Safety of layer rewrites: // - We are writing to a different local file path than we are reading from, so the old Layer // cannot interfere with the new one. 
// - In the page cache, contents for a particular VirtualFile are stored with a file_id that // is different for two layers with the same name (in `ImageLayerInner::new` we always // acquire a fresh id from [`crate::page_cache::next_file_id`]. So readers do not risk // reading the index from one layer file, and then data blocks from the rewritten layer file. // - Any readers that have a reference to the old layer will keep it alive until they are done // with it. If they are trying to promote from remote storage, that will fail, but this is the same // as for compaction generally: compaction is allowed to delete layers that readers might be trying to use. // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are: // - GC, which at worst witnesses us "undelete" a layer that they just deleted. // - ingestion, which only inserts layers, therefore cannot collide with us. let resident = layer.download_and_keep_resident(ctx).await?; let keys_written = resident .filter(&self.shard_identity, &mut image_layer_writer, ctx) .await?; if keys_written > 0 { let (desc, path) = image_layer_writer .finish(ctx) .await .map_err(CompactionError::Other)?; let new_layer = Layer::finish_creating(self.conf, self, desc, &path) .map_err(CompactionError::Other)?; info!(layer=%new_layer, "rewrote layer, {} -> {} bytes", layer.metadata().file_size, new_layer.metadata().file_size); replace_image_layers.push((layer, new_layer)); } else { // Drop the old layer. Usually for this case we would already have noticed that // the layer has no data for us with the ShardedRange check above, but drop_layers.push(layer); } // Yield for L0 compaction if necessary, but make sure we update the layer map below // with the work we've already done. 
if yield_for_l0 && self .l0_compaction_trigger .notified() .now_or_never() .is_some() { info!("shard ancestor compaction yielding for L0 compaction"); outcome = CompactionOutcome::YieldForL0; break; } } for layer in &drop_layers { info!(%layer, old_metadata=?layer.metadata(), "dropping layer after shard split (no keys for this shard)", ); } // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded // metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch // to remote index) and be removed. This is inefficient but safe. fail::fail_point!("compact-shard-ancestors-localonly"); // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage self.rewrite_layers(replace_image_layers, drop_layers) .await?; fail::fail_point!("compact-shard-ancestors-enqueued"); // We wait for all uploads to complete before finishing this compaction stage. This is not // necessary for correctness, but it simplifies testing, and avoids proceeding with another // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O // load. if outcome != CompactionOutcome::YieldForL0 { info!("shard ancestor compaction waiting for uploads"); tokio::select! { result = self.remote_client.wait_completion() => match result { Ok(()) => {}, Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), Err(WaitCompletionError::UploadQueueShutDownOrStopped) => { return Err(CompactionError::new_cancelled()); } }, // Don't wait if there's L0 compaction to do. We don't need to update the outcome // here, because we've already done the actual work. 
_ = self.l0_compaction_trigger.notified(), if yield_for_l0 => {},
            }
        }

        info!(
            "shard ancestor compaction done in {:.3}s{}",
            started.elapsed().as_secs_f64(),
            match outcome {
                CompactionOutcome::Pending => format!(", with pending work (rewrite_max={rewrite_max})"),
                CompactionOutcome::YieldForL0 => String::from(", yielding for L0 compaction"),
                CompactionOutcome::Skipped | CompactionOutcome::Done => String::new(),
            }
        );

        fail::fail_point!("compact-shard-ancestors-persistent");

        Ok(outcome)
    }

    /// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is
    /// an image layer between them and the most recent readable LSN (branch point or tip of timeline). The
    /// purpose of the visibility hint is to record which layers need to be available to service reads.
    ///
    /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers
    /// that we know won't be needed for reads.
    ///
    /// The set of "readable points" is the LSN of every non-offloaded entry in
    /// `gc_info.retain_lsns` plus the current last-record LSN of this timeline.
    ///
    /// # Errors
    ///
    /// Propagates the error returned by `layer_map()` on the layer manager guard.
    pub(crate) async fn update_layer_visibility(
        &self,
    ) -> Result<(), super::layer_manager::Shutdown> {
        let head_lsn = self.get_last_record_lsn();

        // We will sweep through layers in reverse-LSN order. We only do historic layers. L0 deltas
        // are implicitly left visible, because LayerVisibilityHint's default is Visible, and we never modify it here.
        // Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that
        // they will be subject to L0->L1 compaction in the near future.
        let layer_manager = self
            .layers
            .read(LayerManagerLockHolder::GetLayerMapInfo)
            .await;
        let layer_map = layer_manager.layer_map()?;

        let readable_points = {
            // Clone the retain LSNs so the `gc_info` lock guard is only held for this statement.
            let children = self.gc_info.read().unwrap().retain_lsns.clone();

            // One slot per child plus one for the head of this timeline.
            let mut readable_points = Vec::with_capacity(children.len() + 1);
            for (child_lsn, _child_timeline_id, is_offloaded) in &children {
                // Offloaded children do not contribute a readable point.
                if *is_offloaded == MaybeOffloaded::Yes {
                    continue;
                }
                readable_points.push(*child_lsn);
            }
            readable_points.push(head_lsn);
            readable_points
        };

        let (layer_visibility, covered) = layer_map.get_visibility(readable_points);
        for (layer_desc, visibility) in layer_visibility {
            // FIXME: a more efficient bulk zip() through the layers rather than NlogN getting each one
            let layer = layer_manager.get_from_desc(&layer_desc);
            layer.set_visibility(visibility);
        }

        // TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can
        // avoid assuming that everything at a branch point is visible.
        drop(covered);
        Ok(())
    }

    /// Collect a bunch of Level 0 layer files, and compact and reshuffle them
    /// as Level 1 files. Returns whether the L0 layers are fully compacted.
    ///
    /// Runs `compact_level0_phase1` inside a `compact_level0_phase1` tracing span with a child
    /// request context. If phase 1 produced no new layers and selected no deltas, returns
    /// `CompactionOutcome::Done` without further work; otherwise the phase 1 result is handed
    /// to `finish_compact_batch` and the outcome reported by phase 1 is returned.
    async fn compact_level0(
        self: &Arc,
        target_file_size: u64,
        force_compaction_ignore_threshold: bool,
        force_compaction_lsn: Option,
        ctx: &RequestContext,
    ) -> Result {
        let CompactLevel0Phase1Result {
            new_layers,
            deltas_to_compact,
            outcome,
        } = {
            let phase1_span = info_span!("compact_level0_phase1");
            let ctx = ctx.attached_child();
            // Pre-fill the identifying fields of the stats record; phase 1 fills in the rest.
            let stats = CompactLevel0Phase1StatsBuilder {
                version: Some(2),
                tenant_id: Some(self.tenant_shard_id),
                timeline_id: Some(self.timeline_id),
                ..Default::default()
            };

            self.compact_level0_phase1(
                stats,
                target_file_size,
                force_compaction_ignore_threshold,
                force_compaction_lsn,
                &ctx,
            )
            .instrument(phase1_span)
            .await?
};

        if new_layers.is_empty() && deltas_to_compact.is_empty() {
            // nothing to do
            return Ok(CompactionOutcome::Done);
        }

        self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact)
            .await?;
        Ok(outcome)
    }

    /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
    ///
    /// Outline of what this function does:
    /// 1. Decide whether to run at all, based on the number of L0 deltas, the compaction
    ///    threshold, `force_compaction_ignore_threshold` and `force_compaction_lsn`.
    /// 2. Select a run of L0 deltas that is contiguous in LSN, starting from the one with the
    ///    lowest start LSN, bounded by a total size limit.
    /// 3. Collect and sort all index entries of the selected layers, and compute the "holes"
    ///    in the key space that new layers should not span.
    /// 4. Merge all values of the selected layers and write them into new delta layers, splitting
    ///    on key (and, for a single very large key, on LSN).
    /// 5. Fsync the timeline directory and log the collected `stats`.
    ///
    /// The returned `outcome` is `Done` if the selection loop was not cut short by the size
    /// limit, and `Pending` otherwise.
    async fn compact_level0_phase1(
        self: &Arc,
        mut stats: CompactLevel0Phase1StatsBuilder,
        target_file_size: u64,
        force_compaction_ignore_threshold: bool,
        force_compaction_lsn: Option,
        ctx: &RequestContext,
    ) -> Result {
        let begin = tokio::time::Instant::now();
        let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
        let now = tokio::time::Instant::now();
        stats.read_lock_acquisition_micros =
            DurationRecorder::Recorded(RecordedDuration(now - begin), now);

        let layers = guard.layer_map()?;
        let level0_deltas = layers.level0_deltas();
        stats.level0_deltas_count = Some(level0_deltas.len());

        // Only compact if enough layers have accumulated.
        let threshold = self.get_compaction_threshold();
        if level0_deltas.is_empty() || level0_deltas.len() < threshold {
            if force_compaction_ignore_threshold {
                if !level0_deltas.is_empty() {
                    info!(
                        level0_deltas = level0_deltas.len(),
                        threshold, "too few deltas to compact, but forcing compaction"
                    );
                } else {
                    info!(
                        level0_deltas = level0_deltas.len(),
                        threshold, "too few deltas to compact, cannot force compaction"
                    );
                    return Ok(CompactLevel0Phase1Result::default());
                }
            } else {
                // HADRON
                // Below the threshold, compaction still proceeds if the lowest start LSN among
                // the L0 deltas is below `force_compaction_lsn`.
                let min_lsn = level0_deltas
                    .iter()
                    .map(|a| a.get_lsn_range().start)
                    .reduce(min);
                if force_compaction_lsn.is_some()
                    && min_lsn.is_some()
                    && min_lsn.unwrap() < force_compaction_lsn.unwrap()
                {
                    info!(
                        "forcing L0 compaction of {} L0 deltas. Min lsn: {}, force compaction lsn: {}",
                        level0_deltas.len(),
                        min_lsn.unwrap(),
                        force_compaction_lsn.unwrap()
                    );
                } else {
                    debug!(
                        level0_deltas = level0_deltas.len(),
                        threshold, "too few deltas to compact"
                    );
                    return Ok(CompactLevel0Phase1Result::default());
                }
            }
        }

        // Every path that reaches this point has a non-empty `level0_deltas`: the empty case
        // returns early in both branches above.
        let mut level0_deltas = level0_deltas
            .iter()
            .map(|x| guard.get_from_desc(x))
            .collect::>();

        drop_layer_manager_rlock(guard);

        // This is the last LSN that we have seen for L0 compaction in the timeline. This LSN might be updated
        // by the time we finish the compaction. So we need to get it here.
        let l0_last_record_lsn = self.get_last_record_lsn();

        // Gather the files to compact in this iteration.
        //
        // Start with the oldest Level 0 delta file, and collect any other
        // level 0 files that form a contiguous sequence, such that the end
        // LSN of previous file matches the start LSN of the next file.
        //
        // Note that if the files don't form such a sequence, we might
        // "compact" just a single file. That's a bit pointless, but it allows
        // us to get rid of the level 0 file, and compact the other files on
        // the next iteration. This could probably made smarter, but such
        // "gaps" in the sequence of level 0 files should only happen in case
        // of a crash, partial download from cloud storage, or something like
        // that, so it's not a big deal in practice.
        level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start);
        let mut level0_deltas_iter = level0_deltas.iter();

        let first_level0_delta = level0_deltas_iter.next().unwrap();
        let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
        let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());

        // Accumulate the size of layers in `deltas_to_compact`
        let mut deltas_to_compact_bytes = 0;

        // Under normal circumstances, we will accumulate up to compaction_upper_limit L0s of size
        // checkpoint_distance each. To avoid edge cases using extra system resources, bound our
        // work in this function to only operate on this much delta data at once.
        //
        // In general, compaction_threshold should be <= compaction_upper_limit, but in case that
        // the constraint is not respected, we use the larger of the two.
        let delta_size_limit = std::cmp::max(
            self.get_compaction_upper_limit(),
            self.get_compaction_threshold(),
        ) as u64
            * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);

        let mut fully_compacted = true;

        // NOTE(review): the first layer's file size is not added to `deltas_to_compact_bytes`;
        // only layers pushed inside the loop below are counted. Confirm whether that is intended.
        deltas_to_compact.push(first_level0_delta.download_and_keep_resident(ctx).await?);
        for l in level0_deltas_iter {
            let lsn_range = &l.layer_desc().lsn_range;

            // Stop at the first gap in the LSN sequence.
            if lsn_range.start != prev_lsn_end {
                break;
            }
            deltas_to_compact.push(l.download_and_keep_resident(ctx).await?);
            deltas_to_compact_bytes += l.metadata().file_size;
            prev_lsn_end = lsn_range.end;

            if deltas_to_compact_bytes >= delta_size_limit {
                info!(
                    l0_deltas_selected = deltas_to_compact.len(),
                    l0_deltas_total = level0_deltas.len(),
                    "L0 compaction picker hit max delta layer size limit: {}",
                    delta_size_limit
                );
                fully_compacted = false;

                // Proceed with compaction, but only a subset of L0s
                break;
            }
        }
        // LSN range covered by the selected layers: start of the first to end of the last.
        let lsn_range = Range {
            start: deltas_to_compact
                .first()
                .unwrap()
                .layer_desc()
                .lsn_range
                .start,
            end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end,
        };

        info!(
            "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
            lsn_range.start,
            lsn_range.end,
            deltas_to_compact.len(),
            level0_deltas.len()
        );

        for l in deltas_to_compact.iter() {
            info!("compact includes {l}");
        }

        // We don't need the original list of layers anymore. Drop it so that
        // we don't accidentally use it later in the function.
        drop(level0_deltas);

        stats.compaction_prerequisites_micros = stats.read_lock_acquisition_micros.till_now();

        // TODO: replace with streaming k-merge
        let all_keys = {
            let mut all_keys = Vec::new();
            for l in deltas_to_compact.iter() {
                if self.cancel.is_cancelled() {
                    return Err(CompactionError::new_cancelled());
                }
                let delta = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
                let keys = delta
                    .index_entries(ctx)
                    .await
                    .map_err(CompactionError::Other)?;
                all_keys.extend(keys);
            }
            // The current stdlib sorting implementation is designed in a way where it is
            // particularly fast where the slice is made up of sorted sub-ranges.
            all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
            all_keys
        };

        stats.read_lock_held_key_sort_micros = stats.compaction_prerequisites_micros.till_now();

        // Determine N largest holes where N is number of compacted layers. The vec is sorted by key range start.
        //
        // A hole is a key range for which this compaction doesn't have any WAL records.
        // Our goal in this compaction iteration is to avoid creating L1s that, in terms of their key range,
        // cover the hole, but actually don't contain any WAL records for that key range.
        // The reason is that the mere stack of L1s (`count_deltas`) triggers image layer creation (`create_image_layers`).
        // That image layer creation would be useless for a hole range covered by L1s that don't contain any WAL records.
        //
        // The algorithm chooses holes as follows.
        // - Slide a 2-window over the keys in key order to get the hole range (=distance between two keys).
        // - Filter: min threshold on range length
        // - Rank: by coverage size (=number of image layers required to reconstruct each key in the range for which we have any data)
        //
        // For more details, intuition, and some ASCII art see https://github.com/neondatabase/neon/pull/3597#discussion_r1112704451
        #[derive(PartialEq, Eq)]
        struct Hole {
            key_range: Range,
            coverage_size: usize,
        }
        let holes: Vec = {
            use std::cmp::Ordering;
            // The ordering is reversed on `coverage_size`, so that the max-heap `BinaryHeap`
            // pops the hole with the smallest coverage first.
            impl Ord for Hole {
                fn cmp(&self, other: &Self) -> Ordering {
                    self.coverage_size.cmp(&other.coverage_size).reverse()
                }
            }
            impl PartialOrd for Hole {
                fn partial_cmp(&self, other: &Self) -> Option {
                    Some(self.cmp(other))
                }
            }
            let max_holes = deltas_to_compact.len();
            let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
            let min_hole_coverage_size = 3; // TODO: something more flexible?
            // min-heap (reserve space for one more element added before eviction)
            let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1);
            let mut prev: Option = None;

            for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
                if let Some(prev_key) = prev {
                    // just first fast filter, do not create hole entries for metadata keys. The last hole in the
                    // compaction is the gap between data key and metadata keys.
                    if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
                        && !Key::is_metadata_key(&prev_key)
                    {
                        let key_range = prev_key..next_key;
                        // Measuring hole by just subtraction of i128 representation of key range boundaries
                        // has not so much sense, because largest holes will corresponds field1/field2 changes.
                        // But we are mostly interested to eliminate holes which cause generation of excessive image layers.
                        // That is why it is better to measure size of hole as number of covering image layers.
                        let coverage_size = {
                            // TODO: optimize this with copy-on-write layer map.
                            let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
                            let layers = guard.layer_map()?;
                            layers.image_coverage(&key_range, l0_last_record_lsn).len()
                        };
                        if coverage_size >= min_hole_coverage_size {
                            heap.push(Hole {
                                key_range,
                                coverage_size,
                            });
                            if heap.len() > max_holes {
                                heap.pop(); // remove smallest hole
                            }
                        }
                    }
                }
                // A hole starts right after the current key.
                prev = Some(next_key.next());
            }
            let mut holes = heap.into_vec();
            holes.sort_unstable_by_key(|hole| hole.key_range.start);
            holes
        };
        stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();

        if self.cancel.is_cancelled() {
            return Err(CompactionError::new_cancelled());
        }

        stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();

        // This iterator walks through all key-value pairs from all the layers
        // we're compacting, in key, LSN order.
        // If there's both a Value::Image and Value::WalRecord for the same (key,lsn),
        // then the Value::Image is ordered before Value::WalRecord.
        let mut all_values_iter = {
            let mut deltas = Vec::with_capacity(deltas_to_compact.len());
            for l in deltas_to_compact.iter() {
                let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
                deltas.push(l);
            }
            MergeIterator::create_with_options(
                &deltas,
                &[],
                ctx,
                1024 * 8192, /* 8 MiB buffer per layer iterator */
                1024,
            )
        };

        // This iterator walks through all keys and is needed to calculate size used by each key
        let mut all_keys_iter = all_keys
            .iter()
            .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size))
            .coalesce(|mut prev, cur| {
                // Coalesce keys that belong to the same key pair.
                // This ensures that compaction doesn't put them
                // into different layer files.
                // Still limit this by the target file size,
                // so that we keep the size of the files in
                // check.
                if prev.0 == cur.0 && prev.2 < target_file_size {
                    prev.2 += cur.2;
                    Ok(prev)
                } else {
                    Err((prev, cur))
                }
            });

        // Merge the contents of all the input delta layers into a new set
        // of delta layers, based on the current partitioning.
        //
        // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one.
        // It's possible that there is a single key with so many page versions that storing all of them in a single layer file
        // would be too large. In that case, we also split on the LSN dimension.
        //
        // LSN
        //  ^
        //  |
        //  | +-----------+            +--+--+--+--+
        //  | |           |            |  |  |  |  |
        //  | +-----------+            |  |  |  |  |
        //  | |           |            |  |  |  |  |
        //  | +-----------+     ==>    |  |  |  |  |
        //  | |           |            |  |  |  |  |
        //  | +-----------+            |  |  |  |  |
        //  | |           |            |  |  |  |  |
        //  | +-----------+            +--+--+--+--+
        //  |
        //  +--------------> key
        //
        //
        // If one key (X) has a lot of page versions:
        //
        // LSN
        //  ^
        //  |                                 (X)
        //  | +-----------+            +--+--+--+--+
        //  | |           |            |  |  |  |  |
        //  | +-----------+            |  |  +--+  |
        //  | |           |            |  |  |  |  |
        //  | +-----------+     ==>    |  |  |  |  |
        //  | |           |            |  |  +--+  |
        //  | +-----------+            |  |  |  |  |
        //  | |           |            |  |  |  |  |
        //  | +-----------+            +--+--+--+--+
        //  |
        //  +--------------> key
        // TODO: this actually divides the layers into fixed-size chunks, not
        // based on the partitioning.
        //
        // TODO: we should also opportunistically materialize and
        // garbage collect what we can.
        let mut new_layers = Vec::new();
        let mut prev_key: Option = None;
        let mut writer: Option = None;
        let mut key_values_total_size = 0u64;
        let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
        let mut next_hole = 0; // index of next hole in holes vector

        let mut keys = 0;

        while let Some((key, lsn, value)) = all_values_iter
            .next()
            .await
            .map_err(CompactionError::Other)?
        {
            keys += 1;

            if keys % 32_768 == 0 && self.cancel.is_cancelled() {
                // avoid hitting the cancellation token on every key. in benches, we end up
                // shuffling an order of million keys per layer, this means we'll check it
                // around tens of times per layer.
                return Err(CompactionError::new_cancelled());
            }

            let same_key = prev_key == Some(key);
            // We need to check key boundaries once we reach next key or end of layer with the same key
            if !same_key || lsn == dup_end_lsn {
                let mut next_key_size = 0u64;
                let is_dup_layer = dup_end_lsn.is_valid();
                dup_start_lsn = Lsn::INVALID;
                if !same_key {
                    dup_end_lsn = Lsn::INVALID;
                }
                // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
                for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
                    next_key_size = next_size;
                    if key != next_key {
                        if dup_end_lsn.is_valid() {
                            // We are writing segment with duplicates:
                            // place all remaining values of this key in separate segment
                            dup_start_lsn = dup_end_lsn; // new segments starts where old stops
                            dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
                        }
                        break;
                    }
                    key_values_total_size += next_size;
                    // Check if it is time to split segment: if total keys size is larger than target file size.
                    // We need to avoid generation of empty segments if next_size > target_file_size.
                    if key_values_total_size > target_file_size && lsn != next_lsn {
                        // Split key between multiple layers: such layer can contain only single key
                        dup_start_lsn = if dup_end_lsn.is_valid() {
                            dup_end_lsn // new segment with duplicates starts where old one stops
                        } else {
                            lsn // start with the first LSN for this key
                        };
                        dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
                        break;
                    }
                }
                // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
                if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
                    dup_start_lsn = dup_end_lsn;
                    dup_end_lsn = lsn_range.end;
                }
                if writer.is_some() {
                    let written_size = writer.as_mut().unwrap().size();
                    let contains_hole =
                        next_hole < holes.len() && key >= holes[next_hole].key_range.end;
                    // check if key cause layer overflow or contains hole...
                    if is_dup_layer
                        || dup_end_lsn.is_valid()
                        || written_size + key_values_total_size > target_file_size
                        || contains_hole
                    {
                        // ... if so, flush previous layer and prepare to write new one
                        let (desc, path) = writer
                            .take()
                            .unwrap()
                            .finish(prev_key.unwrap().next(), ctx)
                            .await
                            .map_err(CompactionError::Other)?;
                        let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
                            .map_err(CompactionError::Other)?;

                        new_layers.push(new_delta);
                        writer = None;

                        if contains_hole {
                            // skip hole
                            next_hole += 1;
                        }
                    }
                }
                // Remember size of key value because at next iteration we will access next item
                key_values_total_size = next_key_size;
            }
            fail_point!("delta-layer-writer-fail-before-finish", |_| {
                Err(CompactionError::Other(anyhow::anyhow!(
                    "failpoint delta-layer-writer-fail-before-finish"
                )))
            });

            if !self.shard_identity.is_key_disposable(&key) {
                if writer.is_none() {
                    if self.cancel.is_cancelled() {
                        // to be somewhat responsive to cancellation, check for each new layer
                        return Err(CompactionError::new_cancelled());
                    }
                    // Create writer if not initialized yet
                    writer = Some(
                        DeltaLayerWriter::new(
                            self.conf,
                            self.timeline_id,
                            self.tenant_shard_id,
                            key,
                            if dup_end_lsn.is_valid() {
                                // this is a layer containing slice of values of the same key
                                debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
                                dup_start_lsn..dup_end_lsn
                            } else {
                                debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
                                lsn_range.clone()
                            },
                            &self.gate,
                            self.cancel.clone(),
                            ctx,
                        )
                        .await
                        .map_err(CompactionError::Other)?,
                    );

                    // Restart the per-layer key counter used for the periodic cancellation check.
                    keys = 0;
                }

                writer
                    .as_mut()
                    .unwrap()
                    .put_value(key, lsn, value, ctx)
                    .await?;
            } else {
                let owner = self.shard_identity.get_shard_number(&key);

                // This happens after a shard split, when we're compacting an L0 created by our parent shard
                debug!("dropping key {key} during compaction (it belongs on shard {owner})");
            }

            if !new_layers.is_empty() {
                fail_point!("after-timeline-compacted-first-L1");
            }

            prev_key = Some(key);
        }
        // Finish the last layer, if one is still open after the merge loop.
        if let Some(writer) = writer {
            let (desc, path) = writer
                .finish(prev_key.unwrap().next(), ctx)
                .await
                .map_err(CompactionError::Other)?;
            let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
                .map_err(CompactionError::Other)?;
            new_layers.push(new_delta);
        }

        // Sync layers
        if !new_layers.is_empty() {
            // Print a warning if the created layer is larger than double the target size
            // Add two pages for potential overhead. This should in theory be already
            // accounted for in the target calculation, but for very small targets,
            // we still might easily hit the limit otherwise.
            let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2;
            for layer in new_layers.iter() {
                if layer.layer_desc().file_size > warn_limit {
                    warn!(
                        %layer,
                        "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size
                    );
                }
            }

            // The writer.finish() above already did the fsync of the inodes.
            // We just need to fsync the directory in which these inodes are linked,
            // which we know to be the timeline directory.
            //
            // We use fatal_err() below because the after writer.finish() returns with success,
            // the in-memory state of the filesystem already has the layer file in its final place,
            // and subsequent pageserver code could think it's durable while it really isn't.
            let timeline_dir = VirtualFile::open(
                &self
                    .conf
                    .timeline_path(&self.tenant_shard_id, &self.timeline_id),
                ctx,
            )
            .await
            .fatal_err("VirtualFile::open for timeline dir fsync");
            timeline_dir
                .sync_all()
                .await
                .fatal_err("VirtualFile::sync_all timeline dir");
        }

        stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
        stats.new_deltas_count = Some(new_layers.len());
        stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum());

        // Failure to build or serialize the stats is only logged; it does not fail the compaction.
        match TryInto::::try_into(stats)
            .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
        {
            Ok(stats_json) => {
                info!(
                    stats_json = stats_json.as_str(),
                    "compact_level0_phase1 stats available"
                )
            }
            Err(e) => {
                warn!("compact_level0_phase1 stats failed to serialize: {:#}", e);
            }
        }

        // Without this, rustc complains about deltas_to_compact still
        // being borrowed when we `.into_iter()` below.
        drop(all_values_iter);

        Ok(CompactLevel0Phase1Result {
            new_layers,
            deltas_to_compact: deltas_to_compact
                .into_iter()
                .map(|x| x.drop_eviction_guard())
                .collect::>(),
            outcome: if fully_compacted {
                CompactionOutcome::Done
            } else {
                CompactionOutcome::Pending
            },
        })
    }
}

#[derive(Default)]
struct CompactLevel0Phase1Result {
    new_layers: Vec,
    deltas_to_compact: Vec,
    // Whether we have included all L0 layers, or selected only part of them due to the
    // L0 compaction size limit.
outcome: CompactionOutcome,
}

/// Accumulator for the statistics of one `compact_level0_phase1` run.
///
/// Every field starts out unset (`Default`) and is filled in as the phase progresses.
/// It is converted into [`CompactLevel0Phase1Stats`] via `TryFrom`, which fails if any
/// field was left unset.
#[derive(Default)]
struct CompactLevel0Phase1StatsBuilder {
    version: Option,
    tenant_id: Option,
    timeline_id: Option,
    read_lock_acquisition_micros: DurationRecorder,
    read_lock_held_key_sort_micros: DurationRecorder,
    compaction_prerequisites_micros: DurationRecorder,
    read_lock_held_compute_holes_micros: DurationRecorder,
    read_lock_drop_micros: DurationRecorder,
    write_layer_files_micros: DurationRecorder,
    level0_deltas_count: Option,
    new_deltas_count: Option,
    new_deltas_size: Option,
}

/// Fully-populated, serializable form of [`CompactLevel0Phase1StatsBuilder`].
#[derive(serde::Serialize)]
struct CompactLevel0Phase1Stats {
    version: u64,
    tenant_id: TenantShardId,
    timeline_id: TimelineId,
    read_lock_acquisition_micros: RecordedDuration,
    read_lock_held_key_sort_micros: RecordedDuration,
    compaction_prerequisites_micros: RecordedDuration,
    read_lock_held_compute_holes_micros: RecordedDuration,
    read_lock_drop_micros: RecordedDuration,
    write_layer_files_micros: RecordedDuration,
    level0_deltas_count: usize,
    new_deltas_count: usize,
    new_deltas_size: u64,
}

impl TryFrom for CompactLevel0Phase1Stats {
    type Error = anyhow::Error;

    /// Convert the builder into the final stats record.
    ///
    /// Returns an error naming the first field (in declaration order) that was never set.
    fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result {
        Ok(Self {
            version: value.version.ok_or_else(|| anyhow!("version not set"))?,
            tenant_id: value
                .tenant_id
                .ok_or_else(|| anyhow!("tenant_id not set"))?,
            timeline_id: value
                .timeline_id
                .ok_or_else(|| anyhow!("timeline_id not set"))?,
            read_lock_acquisition_micros: value
                .read_lock_acquisition_micros
                .into_recorded()
                .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?,
            read_lock_held_key_sort_micros: value
                .read_lock_held_key_sort_micros
                .into_recorded()
                .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?,
            // The message names the actual field, consistent with every other entry here.
            compaction_prerequisites_micros: value
                .compaction_prerequisites_micros
                .into_recorded()
                .ok_or_else(|| anyhow!("compaction_prerequisites_micros not set"))?,
            read_lock_held_compute_holes_micros: value
                .read_lock_held_compute_holes_micros
                .into_recorded()
                .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?,
            read_lock_drop_micros: value
                .read_lock_drop_micros
                .into_recorded()
                .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?,
            write_layer_files_micros: value
                .write_layer_files_micros
                .into_recorded()
                .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?,
            level0_deltas_count: value
                .level0_deltas_count
                .ok_or_else(|| anyhow!("level0_deltas_count not set"))?,
            new_deltas_count: value
                .new_deltas_count
                .ok_or_else(|| anyhow!("new_deltas_count not set"))?,
            new_deltas_size: value
                .new_deltas_size
                .ok_or_else(|| anyhow!("new_deltas_size not set"))?,
        })
    }
}

impl Timeline {
    /// Entry point for new tiered compaction algorithm.
    ///
    /// All the real work is in the implementation in the pageserver_compaction
    /// crate. The code here would apply to any algorithm implemented by the
    /// same interface, but tiered is the only one at the moment.
    ///
    /// Returns `Ok(())` without doing anything when there are fewer L0 deltas than the
    /// compaction threshold, and a cancellation error when the timeline is stopping.
    ///
    /// TODO: cancellation
    pub(crate) async fn compact_tiered(
        self: &Arc,
        _cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> Result<(), CompactionError> {
        let fanout = self.get_compaction_threshold() as u64;
        let target_file_size = self.get_checkpoint_distance();

        // Find the top of the historical layers
        let end_lsn = {
            let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
            let layers = guard.layer_map()?;

            let l0_deltas = layers.level0_deltas();

            // As an optimization, if we find that there are too few L0 layers,
            // bail out early. We know that the compaction algorithm would do
            // nothing in that case.
            if l0_deltas.len() < fanout as usize {
                // doesn't need compacting
                return Ok(());
            }
            // The highest end LSN among the L0 deltas.
            l0_deltas.iter().map(|l| l.lsn_range.end).max().unwrap()
        };

        // Is the timeline being deleted?
        if self.is_stopping() {
            trace!("Dropping out of compaction on timeline shutdown");
            return Err(CompactionError::new_cancelled());
        }

        let (dense_ks, _sparse_ks) = self
            .collect_keyspace(end_lsn, ctx)
            .await
            .map_err(CompactionError::from_collect_keyspace)?;
        // TODO(chi): ignore sparse_keyspace for now, compact it in the future.
        let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks));

        pageserver_compaction::compact_tiered::compact_tiered(
            &mut adaptor,
            end_lsn,
            target_file_size,
            fanout,
            ctx,
        )
        .await
        // TODO: compact_tiered needs to return CompactionError
        .map_err(CompactionError::Other)?;

        adaptor.flush_updates().await?;
        Ok(())
    }

    /// Take a list of images and deltas, produce images and deltas according to GC horizon and retain_lsns.
    ///
    /// It takes a key, the values of the key within the compaction process, a GC horizon, and all retain_lsns below the horizon.
    /// For now, it requires the `accumulated_values` contains the full history of the key (i.e., the key with the lowest LSN is
    /// an image or a WAL not requiring a base image). This restriction will be removed once we implement gc-compaction on branch.
    ///
    /// The function returns the deltas and the base image that need to be placed at each of the retain LSN. For example, we have:
    ///
    /// A@0x10, +B@0x20, +C@0x30, +D@0x40, +E@0x50, +F@0x60
    /// horizon = 0x50, retain_lsn = 0x20, 0x40, delta_threshold=3
    ///
    /// The function will produce:
    ///
    /// ```plain
    /// 0x20(retain_lsn) -> img=AB@0x20                  always produce a single image below the lowest retain LSN
    /// 0x40(retain_lsn) -> deltas=[+C@0x30, +D@0x40]    two deltas since the last base image, keeping the deltas
    /// 0x50(horizon)    -> deltas=[ABCDE@0x50]          three deltas since the last base image, generate an image but put it in the delta
    /// above_horizon    -> deltas=[+F@0x60]             full history above the horizon
    /// ```
    ///
    /// Note that `accumulated_values` must be sorted by LSN and should belong to a single key.
#[allow(clippy::too_many_arguments)]
    pub(crate) async fn generate_key_retention(
        self: &Arc,
        key: Key,
        full_history: &[(Key, Lsn, Value)],
        horizon: Lsn,
        retain_lsn_below_horizon: &[Lsn],
        delta_threshold_cnt: usize,
        base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
        verification: bool,
    ) -> anyhow::Result {
        // Pre-checks for the invariants
        // These checks only run in debug builds or when the `testing` feature is enabled.
        let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing");
        if debug_mode {
            for (log_key, _, _) in full_history {
                assert_eq!(log_key, &key, "mismatched key");
            }
            for i in 1..full_history.len() {
                assert!(full_history[i - 1].1 <= full_history[i].1, "unordered LSN");
                // Two entries may share an LSN only if the earlier one is an image.
                if full_history[i - 1].1 == full_history[i].1 {
                    assert!(
                        matches!(full_history[i - 1].2, Value::Image(_)),
                        "unordered delta/image, or duplicated delta"
                    );
                }
            }
            // There was an assertion for no base image that checks if the first
            // record in the history is `will_init` before, but it was removed.
            // This is explained in the test cases for generate_key_retention.
            // Search "incomplete history" for more information.
            for lsn in retain_lsn_below_horizon {
                assert!(lsn < &horizon, "retain lsn must be below horizon")
            }
            for i in 1..retain_lsn_below_horizon.len() {
                assert!(
                    retain_lsn_below_horizon[i - 1] <= retain_lsn_below_horizon[i],
                    "unordered LSN"
                );
            }
        }
        let has_ancestor = base_img_from_ancestor.is_some();
        // Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon,
        // and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket.
        let (mut split_history, lsn_split_points) = {
            let mut split_history = Vec::new();
            split_history.resize_with(retain_lsn_below_horizon.len() + 2, Vec::new);
            // Split points are the retain LSNs followed by the horizon.
            let mut lsn_split_points = Vec::with_capacity(retain_lsn_below_horizon.len() + 1);
            for lsn in retain_lsn_below_horizon {
                lsn_split_points.push(*lsn);
            }
            lsn_split_points.push(horizon);
            let mut current_idx = 0;
            for item @ (_, lsn, _) in full_history {
                // Advance to the first bucket whose split point is >= this LSN; entries above
                // every split point land in the last bucket.
                while current_idx < lsn_split_points.len() && *lsn > lsn_split_points[current_idx] {
                    current_idx += 1;
                }
                split_history[current_idx].push(item);
            }
            (split_history, lsn_split_points)
        };
        // Step 2: filter out duplicated records due to the k-merge of image/delta layers
        for split_for_lsn in &mut split_history {
            let mut prev_lsn = None;
            let mut new_split_for_lsn = Vec::with_capacity(split_for_lsn.len());
            for record @ (_, lsn, _) in std::mem::take(split_for_lsn) {
                if let Some(prev_lsn) = &prev_lsn {
                    if *prev_lsn == lsn {
                        // The case that we have an LSN with both data from the delta layer and the image layer. As
                        // `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply
                        // drop this delta and keep the image.
                        //
                        // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will
                        // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply
                        // dropped.
                        //
                        // TODO: in case we have both delta + images for a given LSN and it does not exceed the delta
                        // threshold, we could have kept delta instead to save space. This is an optimization for the future.
                        continue;
                    }
                }
                prev_lsn = Some(lsn);
                new_split_for_lsn.push(record);
            }
            *split_for_lsn = new_split_for_lsn;
        }
        // Step 3: generate images when necessary
        let mut retention = Vec::with_capacity(split_history.len());
        let mut records_since_last_image = 0;
        let batch_cnt = split_history.len();
        assert!(
            batch_cnt >= 2,
            "should have at least below + above horizon batches"
        );
        // Records accumulated since the last init record; seeded with the ancestor image, if any.
        let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
        if let Some((key, lsn, ref img)) = base_img_from_ancestor {
            replay_history.push((key, lsn, Value::Image(img.clone())));
        }

        /// Generate debug information for the replay history
        fn generate_history_trace(replay_history: &[(Key, Lsn, Value)]) -> String {
            use std::fmt::Write;
            let mut output = String::new();
            if let Some((key, _, _)) = replay_history.first() {
                write!(output, "key={key} ").unwrap();
                let mut cnt = 0;
                // One token per entry: `i@` image, `di@` record with will_init, `d@` other record.
                for (_, lsn, val) in replay_history {
                    if val.is_image() {
                        write!(output, "i@{lsn} ").unwrap();
                    } else if val.will_init() {
                        write!(output, "di@{lsn} ").unwrap();
                    } else {
                        write!(output, "d@{lsn} ").unwrap();
                    }
                    cnt += 1;
                    // Cap the trace at 128 entries.
                    if cnt >= 128 {
                        write!(output, "... and more").unwrap();
                        break;
                    }
                }
            } else {
                write!(output, "").unwrap();
            }
            output
        }

        // Builds the multi-line context attached to the errors returned below.
        fn generate_debug_trace(
            replay_history: Option<&[(Key, Lsn, Value)]>,
            full_history: &[(Key, Lsn, Value)],
            lsns: &[Lsn],
            horizon: Lsn,
        ) -> String {
            use std::fmt::Write;
            let mut output = String::new();
            if let Some(replay_history) = replay_history {
                writeln!(
                    output,
                    "replay_history: {}",
                    generate_history_trace(replay_history)
                )
                .unwrap();
            } else {
                writeln!(output, "replay_history: ",).unwrap();
            }
            writeln!(
                output,
                "full_history: {}",
                generate_history_trace(full_history)
            )
            .unwrap();
            writeln!(
                output,
                "when processing: [{}] horizon={}",
                lsns.iter().map(|l| format!("{l}")).join(","),
                horizon
            )
            .unwrap();
            output
        }

        let mut key_exists = false;
        for (i, split_for_lsn) in split_history.into_iter().enumerate() {
            // TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly.
            records_since_last_image += split_for_lsn.len();
            // Whether to produce an image into the final layer files
            let produce_image = if i == 0 && !has_ancestor {
                // We always generate images for the first batch (below horizon / lowest retain_lsn)
                true
            } else if i == batch_cnt - 1 {
                // Do not generate images for the last batch (above horizon)
                false
            } else if records_since_last_image == 0 {
                false
            } else if records_since_last_image >= delta_threshold_cnt {
                // Generate images when there are too many records
                true
            } else {
                false
            };
            replay_history.extend(split_for_lsn.iter().map(|x| (*x).clone()));
            // Only retain the items after the last image record
            for idx in (0..replay_history.len()).rev() {
                if replay_history[idx].2.will_init() {
                    replay_history = replay_history[idx..].to_vec();
                    break;
                }
            }
            if replay_history.is_empty() && !key_exists {
                // The key does not exist at earlier LSN, we can skip this iteration.
                retention.push(Vec::new());
                continue;
            } else {
                key_exists = true;
            }
            let Some((_, _, val)) = replay_history.first() else {
                unreachable!("replay history should not be empty once it exists")
            };
            // The replay history must start with a record that can initialize the page.
            if !val.will_init() {
                return Err(anyhow::anyhow!("invalid history, no base image")).with_context(|| {
                    generate_debug_trace(
                        Some(&replay_history),
                        full_history,
                        retain_lsn_below_horizon,
                        horizon,
                    )
                });
            }
            // Whether to reconstruct the image. In debug mode, we will generate an image
            // at every retain_lsn to ensure data is not corrupted, but we won't put the
            // image into the final layer.
            let img_and_lsn = if produce_image {
                records_since_last_image = 0;
                // Keep a copy for error reporting only in debug mode; `replay_history` itself is
                // consumed by `mem::take` just below.
                let replay_history_for_debug = if debug_mode {
                    Some(replay_history.clone())
                } else {
                    None
                };
                let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
                let history = std::mem::take(&mut replay_history);
                let mut img = None;
                let mut records = Vec::with_capacity(history.len());
                if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
                    // History starts with an image: everything after it must be a WAL record.
                    img = Some((*lsn, val.clone()));
                    for (_, lsn, val) in history.into_iter().skip(1) {
                        let Value::WalRecord(rec) = val else {
                            return Err(anyhow::anyhow!(
                                "invalid record, first record is image, expect walrecords"
                            ))
                            .with_context(|| {
                                generate_debug_trace(
                                    replay_history_for_debug_ref,
                                    full_history,
                                    retain_lsn_below_horizon,
                                    horizon,
                                )
                            });
                        };
                        records.push((lsn, rec));
                    }
                } else {
                    // History starts with a WAL record: every entry must be a WAL record.
                    for (_, lsn, val) in history.into_iter() {
                        let Value::WalRecord(rec) = val else {
                            return Err(anyhow::anyhow!("invalid record, first record is walrecord, expect rest are walrecord"))
                                .with_context(|| generate_debug_trace(
                                    replay_history_for_debug_ref,
                                    full_history,
                                    retain_lsn_below_horizon,
                                    horizon,
                                ));
                        };
                        records.push((lsn, rec));
                    }
                }
                // WAL redo requires records in the reverse LSN order
                records.reverse();
                let state = ValueReconstructState { img, records };
                // last batch does not generate image so i is always in range, unless we force generate
                // an image during testing
                let request_lsn = if i >= lsn_split_points.len() {
                    Lsn::MAX
                } else {
                    lsn_split_points[i]
                };
                let img = self
                    .reconstruct_value(key, request_lsn, state, RedoAttemptType::GcCompaction)
                    .await?;
                Some((request_lsn, img))
            } else {
                None
            };
            if produce_image {
                let (request_lsn, img) = img_and_lsn.unwrap();
                // The new image becomes the base of the replay history for later batches.
                replay_history.push((key, request_lsn, Value::Image(img.clone())));
                retention.push(vec![(request_lsn, Value::Image(img))]);
            } else {
                // No image for this batch: keep its records as they are.
                let deltas = split_for_lsn
                    .iter()
                    .map(|(_, lsn, value)| (*lsn, value.clone()))
                    .collect_vec();
                retention.push(deltas);
            }
        }
        let mut result = Vec::with_capacity(retention.len());
        assert_eq!(retention.len(), lsn_split_points.len() + 1);
        for (idx, logs) in retention.into_iter().enumerate() {
            if idx == lsn_split_points.len() {
                // The final batch is the above-horizon log; everything before it was collected
                // into `result` keyed by its split point.
                let retention = KeyHistoryRetention {
                    below_horizon: result,
                    above_horizon: KeyLogAtLsn(logs),
                };
                if verification {
                    retention
                        .verify(key, &base_img_from_ancestor, full_history, self)
                        .await?;
                }
                return Ok(retention);
            } else {
                result.push((lsn_split_points[idx], KeyLogAtLsn(logs)));
            }
        }
        unreachable!("key retention is empty")
    }

    /// Check how much space is left on the disk
    ///
    /// Runs `Statvfs::get` on the tenants directory and returns the first element of
    /// `get_avail_total_bytes()`. Fails if the statvfs call fails.
    async fn check_available_space(self: &Arc) -> anyhow::Result {
        let tenants_dir = self.conf.tenants_path();

        let stat = Statvfs::get(&tenants_dir, None)
            .context("statvfs failed, presumably directory got unlinked")?;

        let (avail_bytes, _) = stat.get_avail_total_bytes();

        Ok(avail_bytes)
    }

    /// Check if the compaction can proceed safely without running out of space. We assume the size
    /// upper bound of the produced files of a compaction job is the same as all layers involved in
    /// the compaction. Therefore, we need `2 * layers_to_be_compacted_size` at least to do a
    /// compaction.
async fn check_compaction_space( self: &Arc, layer_selection: &[Layer], ) -> Result<(), CompactionError> { let available_space = self .check_available_space() .await .map_err(CompactionError::Other)?; let mut remote_layer_size = 0; let mut all_layer_size = 0; for layer in layer_selection { let needs_download = layer .needs_download() .await .context("failed to check if layer needs download") .map_err(CompactionError::Other)?; if needs_download.is_some() { remote_layer_size += layer.layer_desc().file_size; } all_layer_size += layer.layer_desc().file_size; } let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */ if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space { return Err(CompactionError::Other(anyhow!( "not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}", available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size ))); } Ok(()) } /// Check to bail out of gc compaction early if it would use too much memory. async fn check_memory_usage( self: &Arc, layer_selection: &[Layer], ) -> Result<(), CompactionError> { let mut estimated_memory_usage_mb = 0.0; let mut num_image_layers = 0; let mut num_delta_layers = 0; let target_layer_size_bytes = 256 * 1024 * 1024; for layer in layer_selection { let layer_desc = layer.layer_desc(); if layer_desc.is_delta() { // Delta layers at most have 1MB buffer; 3x to make it safe (there're deltas as large as 16KB). // Scale it by target_layer_size_bytes so that tests can pass (some tests, e.g., `test_pageserver_gc_compaction_preempt // use 3MB layer size and we need to account for that). 
estimated_memory_usage_mb += 3.0 * (layer_desc.file_size / target_layer_size_bytes) as f64; num_delta_layers += 1; } else { // Image layers at most have 1MB buffer but it might be compressed; assume 5x compression ratio. estimated_memory_usage_mb += 5.0 * (layer_desc.file_size / target_layer_size_bytes) as f64; num_image_layers += 1; } } if estimated_memory_usage_mb > 1024.0 { return Err(CompactionError::Other(anyhow!( "estimated memory usage is too high: {}MB, giving up compaction; num_image_layers={}, num_delta_layers={}", estimated_memory_usage_mb, num_image_layers, num_delta_layers ))); } Ok(()) } /// Get a watermark for gc-compaction, that is the lowest LSN that we can use as the `gc_horizon` for /// the compaction algorithm. It is min(space_cutoff, time_cutoff, latest_gc_cutoff, standby_horizon). /// Leases and retain_lsns are considered in the gc-compaction job itself so we don't need to account for them /// here. pub(crate) fn get_gc_compaction_watermark(self: &Arc) -> Lsn { let gc_cutoff_lsn = { let gc_info = self.gc_info.read().unwrap(); gc_info.min_cutoff() }; // TODO: standby horizon should use leases so we don't really need to consider it here. // let watermark = watermark.min(self.standby_horizon.load()); // TODO: ensure the child branches will not use anything below the watermark, or consider // them when computing the watermark. gc_cutoff_lsn.min(*self.get_applied_gc_cutoff_lsn()) } /// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job. /// The function returns a list of compaction jobs that can be executed separately. If the upper bound of the compact LSN /// range is not specified, we will use the latest gc_cutoff as the upper bound, so that all jobs in the jobset acts /// like a full compaction of the specified keyspace. 
pub(crate) async fn gc_compaction_split_jobs( self: &Arc, job: GcCompactJob, sub_compaction_max_job_size_mb: Option, ) -> Result, CompactionError> { let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX { job.compact_lsn_range.end } else { self.get_gc_compaction_watermark() }; if compact_below_lsn == Lsn::INVALID { tracing::warn!( "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction" ); return Ok(vec![]); } // Split compaction job to about 4GB each const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024; let sub_compaction_max_job_size_mb = sub_compaction_max_job_size_mb.unwrap_or(GC_COMPACT_MAX_SIZE_MB); let mut compact_jobs = Vec::::new(); // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning // by estimating the amount of files read for a compaction job. We should also partition on LSN. let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone(); // Truncate the key range to be within user specified compaction range. 
fn truncate_to( source_start: &Key, source_end: &Key, target_start: &Key, target_end: &Key, ) -> Option<(Key, Key)> { let start = source_start.max(target_start); let end = source_end.min(target_end); if start < end { Some((*start, *end)) } else { None } } let mut split_key_ranges = Vec::new(); let ranges = dense_ks .parts .iter() .map(|partition| partition.ranges.iter()) .chain(sparse_ks.parts.iter().map(|x| x.0.ranges.iter())) .flatten() .cloned() .collect_vec(); for range in ranges.iter() { let Some((start, end)) = truncate_to( &range.start, &range.end, &job.compact_key_range.start, &job.compact_key_range.end, ) else { continue; }; split_key_ranges.push((start, end)); } split_key_ranges.sort(); let all_layers = { let guard = self.layers.read(LayerManagerLockHolder::Compaction).await; let layer_map = guard.layer_map()?; layer_map.iter_historic_layers().collect_vec() }; let mut current_start = None; let ranges_num = split_key_ranges.len(); for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() { if current_start.is_none() { current_start = Some(start); } let start = current_start.unwrap(); if start >= end { // We have already processed this partition. continue; } let overlapping_layers = { let mut desc = Vec::new(); for layer in all_layers.iter() { if overlaps_with(&layer.get_key_range(), &(start..end)) && layer.get_lsn_range().start <= compact_below_lsn { desc.push(layer.clone()); } } desc }; let total_size = overlapping_layers.iter().map(|x| x.file_size).sum::(); if total_size > sub_compaction_max_job_size_mb * 1024 * 1024 || ranges_num == idx + 1 { // Try to extend the compaction range so that we include at least one full layer file. let extended_end = overlapping_layers .iter() .map(|layer| layer.key_range.end) .min(); // It is possible that the search range does not contain any layer files when we reach the end of the loop. // In this case, we simply use the specified key range end. 
let end = if let Some(extended_end) = extended_end { extended_end.max(end) } else { end }; let end = if ranges_num == idx + 1 { // extend the compaction range to the end of the key range if it's the last partition end.max(job.compact_key_range.end) } else { end }; if total_size == 0 && !compact_jobs.is_empty() { info!( "splitting compaction job: {}..{}, estimated_size={}, extending the previous job", start, end, total_size ); compact_jobs.last_mut().unwrap().compact_key_range.end = end; current_start = Some(end); } else { info!( "splitting compaction job: {}..{}, estimated_size={}", start, end, total_size ); compact_jobs.push(GcCompactJob { dry_run: job.dry_run, compact_key_range: start..end, compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn, do_metadata_compaction: false, }); current_start = Some(end); } } } Ok(compact_jobs) } /// An experimental compaction building block that combines compaction with garbage collection. /// /// The current implementation picks all delta + image layers that are below or intersecting with /// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon, /// and create delta layers with all deltas >= gc horizon. /// /// If `options.compact_range` is provided, it will only compact the keys within the range, aka partial compaction. /// Partial compaction will read and process all layers overlapping with the key range, even if it might /// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained /// within the key range will be rewritten to ensure they do not overlap with the delta layers. Providing /// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not /// part of the range. 
/// /// If `options.compact_lsn_range.end` is provided, the compaction will only compact layers below or intersect with /// the LSN. Otherwise, it will use the gc cutoff by default. pub(crate) async fn compact_with_gc( self: &Arc, cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, ) -> Result { let sub_compaction = options.sub_compaction; let job = GcCompactJob::from_compact_options(options.clone()); let yield_for_l0 = options.flags.contains(CompactFlags::YieldForL0); if sub_compaction { info!( "running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" ); let jobs = self .gc_compaction_split_jobs(job, options.sub_compaction_max_job_size_mb) .await?; let jobs_len = jobs.len(); for (idx, job) in jobs.into_iter().enumerate() { let sub_compaction_progress = format!("{}/{}", idx + 1, jobs_len); self.compact_with_gc_inner(cancel, job, ctx, yield_for_l0) .instrument(info_span!( "sub_compaction", sub_compaction_progress = sub_compaction_progress )) .await?; } if jobs_len == 0 { info!("no jobs to run, skipping gc bottom-most compaction"); } return Ok(CompactionOutcome::Done); } self.compact_with_gc_inner(cancel, job, ctx, yield_for_l0) .await } async fn compact_with_gc_inner( self: &Arc, cancel: &CancellationToken, mut job: GcCompactJob, ctx: &RequestContext, yield_for_l0: bool, ) -> Result { // Block other compaction/GC tasks from running for now. GC-compaction could run along // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. // Note that we already acquired the compaction lock when the outer `compact` function gets called. // If the job is not configured to compact the metadata key range, shrink the key range // to exclude the metadata key range. The check is done by checking if the end of the key range // is larger than the start of the metadata key range. 
Note that metadata keys cover the entire // second half of the keyspace, so it's enough to only check the end of the key range. if !job.do_metadata_compaction && job.compact_key_range.end > Key::metadata_key_range().start { tracing::info!( "compaction for metadata key range is not supported yet, overriding compact_key_range from {} to {}", job.compact_key_range.end, Key::metadata_key_range().start ); // Shrink the key range to exclude the metadata key range. job.compact_key_range.end = Key::metadata_key_range().start; // Skip the job if the key range completely lies within the metadata key range. if job.compact_key_range.start >= job.compact_key_range.end { tracing::info!("compact_key_range is empty, skipping compaction"); return Ok(CompactionOutcome::Done); } } let timer = Instant::now(); let begin_timer = timer; let gc_lock = async { tokio::select! { guard = self.gc_lock.lock() => Ok(guard), _ = cancel.cancelled() => Err(CompactionError::new_cancelled()), } }; let time_acquire_lock = timer.elapsed(); let timer = Instant::now(); let gc_lock = crate::timed( gc_lock, "acquires gc lock", std::time::Duration::from_secs(5), ) .await?; let dry_run = job.dry_run; let compact_key_range = job.compact_key_range; let compact_lsn_range = job.compact_lsn_range; let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing"); info!( "running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end ); scopeguard::defer! { info!("done enhanced gc bottom-most compaction"); }; let mut stat = CompactionStatistics::default(); // Step 0: pick all delta layers + image layers below/intersect with the GC horizon. // The layer selection has the following properties: // 1. If a layer is in the selection, all layers below it are in the selection. // 2. 
Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. let job_desc = { let guard = self .layers .read(LayerManagerLockHolder::GarbageCollection) .await; let layers = guard.layer_map()?; let gc_info = self.gc_info.read().unwrap(); let mut retain_lsns_below_horizon = Vec::new(); let gc_cutoff = { // Currently, gc-compaction only kicks in after the legacy gc has updated the gc_cutoff. // Therefore, it can only clean up data that cannot be cleaned up with legacy gc, instead of // cleaning everything that theoritically it could. In the future, it should use `self.gc_info` // to get the truth data. let real_gc_cutoff = self.get_gc_compaction_watermark(); // The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for // each of the retain_lsn. Therefore, if the user-provided `compact_lsn_range.end` is larger than the real gc cutoff, we will use // the real cutoff. let mut gc_cutoff = if compact_lsn_range.end == Lsn::MAX { if real_gc_cutoff == Lsn::INVALID { // If the gc_cutoff is not generated yet, we should not compact anything. tracing::warn!( "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction" ); return Ok(CompactionOutcome::Skipped); } real_gc_cutoff } else { compact_lsn_range.end }; if gc_cutoff > real_gc_cutoff { warn!( "provided compact_lsn_range.end={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff ); gc_cutoff = real_gc_cutoff; } gc_cutoff }; for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns { if lsn < &gc_cutoff { retain_lsns_below_horizon.push(*lsn); } } for lsn in gc_info.leases.keys() { if lsn < &gc_cutoff { retain_lsns_below_horizon.push(*lsn); } } let mut selected_layers: Vec = Vec::new(); drop(gc_info); // Firstly, pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers. 
let Some(max_layer_lsn) = layers .iter_historic_layers() .filter(|desc| desc.get_lsn_range().start <= gc_cutoff) .map(|desc| desc.get_lsn_range().end) .max() else { info!( "no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff ); return Ok(CompactionOutcome::Done); }; // Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below // the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if // it is a branch. let Some(min_layer_lsn) = layers .iter_historic_layers() .filter(|desc| { if compact_lsn_range.start == Lsn::INVALID { true // select all layers below if start == Lsn(0) } else { desc.get_lsn_range().end > compact_lsn_range.start // strictly larger than compact_above_lsn } }) .map(|desc| desc.get_lsn_range().start) .min() else { info!( "no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", compact_lsn_range.end ); return Ok(CompactionOutcome::Done); }; // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key // layers to compact. 
let mut rewrite_layers = Vec::new(); for desc in layers.iter_historic_layers() { if desc.get_lsn_range().end <= max_layer_lsn && desc.get_lsn_range().start >= min_layer_lsn && overlaps_with(&desc.get_key_range(), &compact_key_range) { // If the layer overlaps with the compaction key range, we need to read it to obtain all keys within the range, // even if it might contain extra keys selected_layers.push(guard.get_from_desc(&desc)); // If the layer is not fully contained within the key range, we need to rewrite it if it's a delta layer (it's fine // to overlap image layers) if desc.is_delta() && !fully_contains(&compact_key_range, &desc.get_key_range()) { rewrite_layers.push(desc); } } } if selected_layers.is_empty() { info!( "no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compact_key_range.start, compact_key_range.end ); return Ok(CompactionOutcome::Done); } retain_lsns_below_horizon.sort(); GcCompactionJobDescription { selected_layers, gc_cutoff, retain_lsns_below_horizon, min_layer_lsn, max_layer_lsn, compaction_key_range: compact_key_range, rewrite_layers, } }; let (has_data_below, lowest_retain_lsn) = if compact_lsn_range.start != Lsn::INVALID { // If we only compact above some LSN, we should get the history from the current branch below the specified LSN. // We use job_desc.min_layer_lsn as if it's the lowest branch point. (true, job_desc.min_layer_lsn) } else if self.ancestor_timeline.is_some() { // In theory, we can also use min_layer_lsn here, but using ancestor LSN makes sure the delta layers cover the // LSN ranges all the way to the ancestor timeline. 
(true, self.ancestor_lsn) } else { let res = job_desc .retain_lsns_below_horizon .first() .copied() .unwrap_or(job_desc.gc_cutoff); if debug_mode { assert_eq!( res, job_desc .retain_lsns_below_horizon .iter() .min() .copied() .unwrap_or(job_desc.gc_cutoff) ); } (false, res) }; let verification = self.get_gc_compaction_settings().gc_compaction_verification; info!( "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} min_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}, has_data_below={}", job_desc.selected_layers.len(), job_desc.rewrite_layers.len(), job_desc.max_layer_lsn, job_desc.min_layer_lsn, job_desc.gc_cutoff, lowest_retain_lsn, job_desc.compaction_key_range.start, job_desc.compaction_key_range.end, has_data_below, ); let time_analyze = timer.elapsed(); let timer = Instant::now(); for layer in &job_desc.selected_layers { debug!("read layer: {}", layer.layer_desc().key()); } for layer in &job_desc.rewrite_layers { debug!("rewrite layer: {}", layer.key()); } self.check_compaction_space(&job_desc.selected_layers) .await?; self.check_memory_usage(&job_desc.selected_layers).await?; if job_desc.selected_layers.len() > 100 && job_desc.rewrite_layers.len() as f64 >= job_desc.selected_layers.len() as f64 * 0.7 { return Err(CompactionError::Other(anyhow!( "too many layers to rewrite: {} / {}, giving up compaction", job_desc.rewrite_layers.len(), job_desc.selected_layers.len() ))); } // Generate statistics for the compaction for layer in &job_desc.selected_layers { let desc = layer.layer_desc(); if desc.is_delta() { stat.visit_delta_layer(desc.file_size()); } else { stat.visit_image_layer(desc.file_size()); } } // Step 1: construct a k-merge iterator over all layers. // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. 
let layer_names = job_desc .selected_layers .iter() .map(|layer| layer.layer_desc().layer_name()) .collect_vec(); if let Some(err) = check_valid_layermap(&layer_names) { return Err(CompactionError::Other(anyhow!( "gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", err ))); } // The maximum LSN we are processing in this compaction loop let end_lsn = job_desc .selected_layers .iter() .map(|l| l.layer_desc().lsn_range.end) .max() .unwrap(); let mut delta_layers = Vec::new(); let mut image_layers = Vec::new(); let mut downloaded_layers = Vec::new(); let mut total_downloaded_size = 0; let mut total_layer_size = 0; for layer in &job_desc.selected_layers { if layer .needs_download() .await .context("failed to check if layer needs download") .map_err(CompactionError::Other)? .is_some() { total_downloaded_size += layer.layer_desc().file_size; } total_layer_size += layer.layer_desc().file_size; if cancel.is_cancelled() { return Err(CompactionError::new_cancelled()); } let should_yield = yield_for_l0 && self .l0_compaction_trigger .notified() .now_or_never() .is_some(); if should_yield { tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers"); return Ok(CompactionOutcome::YieldForL0); } let resident_layer = layer .download_and_keep_resident(ctx) .await .context("failed to download and keep resident layer") .map_err(CompactionError::Other)?; downloaded_layers.push(resident_layer); } info!( "finish downloading layers, downloaded={}, total={}, ratio={:.2}", total_downloaded_size, total_layer_size, total_downloaded_size as f64 / total_layer_size as f64 ); for resident_layer in &downloaded_layers { if resident_layer.layer_desc().is_delta() { let layer = resident_layer .get_as_delta(ctx) .await .context("failed to get delta layer") .map_err(CompactionError::Other)?; delta_layers.push(layer); } else { let layer = resident_layer .get_as_image(ctx) .await .context("failed to get image layer") 
.map_err(CompactionError::Other)?; image_layers.push(layer); } } let (dense_ks, sparse_ks) = self .collect_gc_compaction_keyspace() .await .context("failed to collect gc compaction keyspace") .map_err(CompactionError::Other)?; let mut merge_iter = FilterIterator::create( MergeIterator::create_with_options( &delta_layers, &image_layers, ctx, 128 * 8192, /* 1MB buffer for each of the inner iterators */ 128, ), dense_ks, sparse_ks, ) .context("failed to create filter iterator") .map_err(CompactionError::Other)?; let time_download_layer = timer.elapsed(); let mut timer = Instant::now(); // Step 2: Produce images+deltas. let mut accumulated_values = Vec::new(); let mut accumulated_values_estimated_size = 0; let mut last_key: Option = None; // Only create image layers when there is no ancestor branches. TODO: create covering image layer // when some condition meet. let mut image_layer_writer = if !has_data_below { Some(SplitImageLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, job_desc.compaction_key_range.start, lowest_retain_lsn, self.get_compaction_target_size(), &self.gate, self.cancel.clone(), )) } else { None }; let mut delta_layer_writer = SplitDeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, lowest_retain_lsn..end_lsn, self.get_compaction_target_size(), &self.gate, self.cancel.clone(), ); #[derive(Default)] struct RewritingLayers { before: Option, after: Option, } let mut delta_layer_rewriters = HashMap::, RewritingLayers>::new(); /// When compacting not at a bottom range (=`[0,X)`) of the root branch, we "have data below" (`has_data_below=true`). /// The two cases are compaction in ancestor branches and when `compact_lsn_range.start` is set. /// In those cases, we need to pull up data from below the LSN range we're compaction. /// /// This function unifies the cases so that later code doesn't have to think about it. 
/// /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image /// is needed for reconstruction. This should be fixed in the future. /// /// Furthermore, we should do vectored get instead of a single get, or better, use k-merge for ancestor /// images. async fn get_ancestor_image( this_tline: &Arc, key: Key, ctx: &RequestContext, has_data_below: bool, history_lsn_point: Lsn, ) -> anyhow::Result> { if !has_data_below { return Ok(None); }; // This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing // as much existing code as possible. let img = this_tline.get(key, history_lsn_point, ctx).await?; Ok(Some((key, history_lsn_point, img))) } // Actually, we can decide not to write to the image layer at all at this point because // the key and LSN range are determined. However, to keep things simple here, we still // create this writer, and discard the writer in the end. let mut time_to_first_kv_pair = None; while let Some(((key, lsn, val), desc)) = merge_iter .next_with_trace() .await .context("failed to get next key-value pair") .map_err(CompactionError::Other)? { if time_to_first_kv_pair.is_none() { time_to_first_kv_pair = Some(timer.elapsed()); timer = Instant::now(); } if cancel.is_cancelled() { return Err(CompactionError::new_cancelled()); } let should_yield = yield_for_l0 && self .l0_compaction_trigger .notified() .now_or_never() .is_some(); if should_yield { tracing::info!("preempt gc-compaction in the main loop: too many L0 layers"); return Ok(CompactionOutcome::YieldForL0); } if self.shard_identity.is_key_disposable(&key) { // If this shard does not need to store this key, simply skip it. // // This is not handled in the filter iterator because shard is determined by hash. // Therefore, it does not give us any performance benefit to do things like skip // a whole layer file as handling key spaces (ranges). 
if cfg!(debug_assertions) { let shard = self.shard_identity.shard_index(); let owner = self.shard_identity.get_shard_number(&key); panic!("key {key} does not belong on shard {shard}, owned by {owner}"); } continue; } if !job_desc.compaction_key_range.contains(&key) { if !desc.is_delta { continue; } let rewriter = delta_layer_rewriters.entry(desc.clone()).or_default(); let rewriter = if key < job_desc.compaction_key_range.start { if rewriter.before.is_none() { rewriter.before = Some( DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, desc.key_range.start, desc.lsn_range.clone(), &self.gate, self.cancel.clone(), ctx, ) .await .context("failed to create delta layer writer") .map_err(CompactionError::Other)?, ); } rewriter.before.as_mut().unwrap() } else if key >= job_desc.compaction_key_range.end { if rewriter.after.is_none() { rewriter.after = Some( DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, job_desc.compaction_key_range.end, desc.lsn_range.clone(), &self.gate, self.cancel.clone(), ctx, ) .await .context("failed to create delta layer writer") .map_err(CompactionError::Other)?, ); } rewriter.after.as_mut().unwrap() } else { unreachable!() }; rewriter .put_value(key, lsn, val, ctx) .await .context("failed to put value") .map_err(CompactionError::Other)?; continue; } match val { Value::Image(_) => stat.visit_image_key(&val), Value::WalRecord(_) => stat.visit_wal_key(&val), } if last_key.is_none() || last_key.as_ref() == Some(&key) { if last_key.is_none() { last_key = Some(key); } accumulated_values_estimated_size += val.estimated_size(); accumulated_values.push((key, lsn, val)); // Accumulated values should never exceed 512MB. 
if accumulated_values_estimated_size >= 1024 * 1024 * 512 { return Err(CompactionError::Other(anyhow!( "too many values for a single key: {} for key {}, {} items", accumulated_values_estimated_size, key, accumulated_values.len() ))); } } else { let last_key: &mut Key = last_key.as_mut().unwrap(); stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction let retention = self .generate_key_retention( *last_key, &accumulated_values, job_desc.gc_cutoff, &job_desc.retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn) .await .context("failed to get ancestor image") .map_err(CompactionError::Other)?, verification, ) .await .context("failed to generate key retention") .map_err(CompactionError::Other)?; retention .pipe_to( *last_key, &mut delta_layer_writer, image_layer_writer.as_mut(), &mut stat, ctx, ) .await .context("failed to pipe to delta layer writer") .map_err(CompactionError::Other)?; accumulated_values.clear(); *last_key = key; accumulated_values_estimated_size = val.estimated_size(); accumulated_values.push((key, lsn, val)); } } // TODO: move the below part to the loop body let Some(last_key) = last_key else { return Err(CompactionError::Other(anyhow!( "no keys produced during compaction" ))); }; stat.on_unique_key_visited(); let retention = self .generate_key_retention( last_key, &accumulated_values, job_desc.gc_cutoff, &job_desc.retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn) .await .context("failed to get ancestor image") .map_err(CompactionError::Other)?, verification, ) .await .context("failed to generate key retention") .map_err(CompactionError::Other)?; retention .pipe_to( last_key, &mut delta_layer_writer, image_layer_writer.as_mut(), &mut stat, ctx, ) .await .context("failed to pipe to delta layer writer") .map_err(CompactionError::Other)?; // end: move the above part to the 
loop body let time_main_loop = timer.elapsed(); let timer = Instant::now(); let mut rewrote_delta_layers = Vec::new(); for (key, writers) in delta_layer_rewriters { if let Some(delta_writer_before) = writers.before { let (desc, path) = delta_writer_before .finish(job_desc.compaction_key_range.start, ctx) .await .context("failed to finish delta layer writer") .map_err(CompactionError::Other)?; let layer = Layer::finish_creating(self.conf, self, desc, &path) .context("failed to finish creating delta layer") .map_err(CompactionError::Other)?; rewrote_delta_layers.push(layer); } if let Some(delta_writer_after) = writers.after { let (desc, path) = delta_writer_after .finish(key.key_range.end, ctx) .await .context("failed to finish delta layer writer") .map_err(CompactionError::Other)?; let layer = Layer::finish_creating(self.conf, self, desc, &path) .context("failed to finish creating delta layer") .map_err(CompactionError::Other)?; rewrote_delta_layers.push(layer); } } let discard = |key: &PersistentLayerKey| { let key = key.clone(); async move { KeyHistoryRetention::discard_key(&key, self, dry_run).await } }; let produced_image_layers = if let Some(writer) = image_layer_writer { if !dry_run { let end_key = job_desc.compaction_key_range.end; writer .finish_with_discard_fn(self, ctx, end_key, discard) .await .context("failed to finish image layer writer") .map_err(CompactionError::Other)? } else { drop(writer); Vec::new() } } else { Vec::new() }; let produced_delta_layers = if !dry_run { delta_layer_writer .finish_with_discard_fn(self, ctx, discard) .await .context("failed to finish delta layer writer") .map_err(CompactionError::Other)? } else { drop(delta_layer_writer); Vec::new() }; // TODO: make image/delta/rewrote_delta layers generation atomic. At this point, we already generated resident layers, and if // compaction is cancelled at this point, we might have some layers that are not cleaned up. 
let mut compact_to = Vec::new(); let mut keep_layers = HashSet::new(); let produced_delta_layers_len = produced_delta_layers.len(); let produced_image_layers_len = produced_image_layers.len(); let layer_selection_by_key = job_desc .selected_layers .iter() .map(|l| (l.layer_desc().key(), l.layer_desc().clone())) .collect::>(); for action in produced_delta_layers { match action { BatchWriterResult::Produced(layer) => { if cfg!(debug_assertions) { info!("produced delta layer: {}", layer.layer_desc().key()); } stat.produce_delta_layer(layer.layer_desc().file_size()); compact_to.push(layer); } BatchWriterResult::Discarded(l) => { if cfg!(debug_assertions) { info!("discarded delta layer: {}", l); } if let Some(layer_desc) = layer_selection_by_key.get(&l) { stat.discard_delta_layer(layer_desc.file_size()); } else { tracing::warn!( "discarded delta layer not in layer_selection: {}, produced a layer outside of the compaction key range?", l ); stat.discard_delta_layer(0); } keep_layers.insert(l); } } } for layer in &rewrote_delta_layers { debug!( "produced rewritten delta layer: {}", layer.layer_desc().key() ); // For now, we include rewritten delta layer size in the "produce_delta_layer". We could // make it a separate statistics in the future. 
stat.produce_delta_layer(layer.layer_desc().file_size()); } compact_to.extend(rewrote_delta_layers); for action in produced_image_layers { match action { BatchWriterResult::Produced(layer) => { debug!("produced image layer: {}", layer.layer_desc().key()); stat.produce_image_layer(layer.layer_desc().file_size()); compact_to.push(layer); } BatchWriterResult::Discarded(l) => { debug!("discarded image layer: {}", l); if let Some(layer_desc) = layer_selection_by_key.get(&l) { stat.discard_image_layer(layer_desc.file_size()); } else { tracing::warn!( "discarded image layer not in layer_selection: {}, produced a layer outside of the compaction key range?", l ); stat.discard_image_layer(0); } keep_layers.insert(l); } } } let mut layer_selection = job_desc.selected_layers; // Partial compaction might select more data than it processes, e.g., if // the compaction_key_range only partially overlaps: // // [---compaction_key_range---] // [---A----][----B----][----C----][----D----] // // For delta layers, we will rewrite the layers so that it is cut exactly at // the compaction key range, so we can always discard them. However, for image // layers, as we do not rewrite them for now, we need to handle them differently. // Assume image layers A, B, C, D are all in the `layer_selection`. // // The created image layers contain whatever is needed from B, C, and from // `----]` of A, and from `[---` of D. // // In contrast, `[---A` and `D----]` have not been processed, so, we must // keep that data. // // The solution for now is to keep A and D completely if they are image layers. // (layer_selection is what we'll remove from the layer map, so, retain what // is _not_ fully covered by compaction_key_range). 
for layer in &layer_selection { if !layer.layer_desc().is_delta() { if !overlaps_with( &layer.layer_desc().key_range, &job_desc.compaction_key_range, ) { return Err(CompactionError::Other(anyhow!( "violated constraint: image layer outside of compaction key range" ))); } if !fully_contains( &job_desc.compaction_key_range, &layer.layer_desc().key_range, ) { keep_layers.insert(layer.layer_desc().key()); } } } layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); let time_final_phase = timer.elapsed(); stat.time_final_phase_secs = time_final_phase.as_secs_f64(); stat.time_to_first_kv_pair_secs = time_to_first_kv_pair .unwrap_or(Duration::ZERO) .as_secs_f64(); stat.time_main_loop_secs = time_main_loop.as_secs_f64(); stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64(); stat.time_download_layer_secs = time_download_layer.as_secs_f64(); stat.time_analyze_secs = time_analyze.as_secs_f64(); stat.time_total_secs = begin_timer.elapsed().as_secs_f64(); stat.finalize(); info!( "gc-compaction statistics: {}", serde_json::to_string(&stat) .context("failed to serialize gc-compaction statistics") .map_err(CompactionError::Other)? ); if dry_run { return Ok(CompactionOutcome::Done); } info!( "produced {} delta layers and {} image layers, {} layers are kept", produced_delta_layers_len, produced_image_layers_len, keep_layers.len() ); // Step 3: Place back to the layer map. // First, do a sanity check to ensure the newly-created layer map does not contain overlaps. 
let all_layers = { let guard = self .layers .read(LayerManagerLockHolder::GarbageCollection) .await; let layer_map = guard.layer_map()?; layer_map.iter_historic_layers().collect_vec() }; let mut final_layers = all_layers .iter() .map(|layer| layer.layer_name()) .collect::>(); for layer in &layer_selection { final_layers.remove(&layer.layer_desc().layer_name()); } for layer in &compact_to { final_layers.insert(layer.layer_desc().layer_name()); } let final_layers = final_layers.into_iter().collect_vec(); // TODO: move this check before we call `finish` on image layer writers. However, this will require us to get the layer name before we finish // the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are // in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails. if let Some(err) = check_valid_layermap(&final_layers) { return Err(CompactionError::Other(anyhow!( "gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", err ))); } // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only // operate on L1 layers. { // Gc-compaction will rewrite the history of a key. This could happen in two ways: // // 1. We create an image layer to replace all the deltas below the compact LSN. In this case, assume // we have 2 delta layers A and B, both below the compact LSN. We create an image layer I to replace // A and B at the compact LSN. If the read path finishes reading A, yields, and now we update the layer // map, the read path then cannot find any keys below A, reporting a missing key error, while the key // now gets stored in I at the compact LSN. 
// // --------------- --------------- // delta1@LSN20 image1@LSN20 // --------------- (read path collects delta@LSN20, => --------------- (read path cannot find anything // delta1@LSN10 yields) below LSN 20) // --------------- // // 2. We create a delta layer to replace all the deltas below the compact LSN, and in the delta layers, // we combines the history of a key into a single image. For example, we have deltas at LSN 1, 2, 3, 4, // Assume one delta layer contains LSN 1, 2, 3 and the other contains LSN 4. // // We let gc-compaction combine delta 2, 3, 4 into an image at LSN 4, which produces a delta layer that // contains the delta at LSN 1, the image at LSN 4. If the read path finishes reading the original delta // layer containing 4, yields, and we update the layer map to put the delta layer. // // --------------- --------------- // delta1@LSN4 image1@LSN4 // --------------- (read path collects delta@LSN4, => --------------- (read path collects LSN4 and LSN1, // delta1@LSN1-3 yields) delta1@LSN1 which is an invalid history) // --------------- --------------- // // Therefore, the gc-compaction layer update operation should wait for all ongoing reads, block all pending reads, // and only allow reads to continue after the update is finished. let update_guard = self.gc_compaction_layer_update_lock.write().await; // Acquiring the update guard ensures current read operations end and new read operations are blocked. // TODO: can we use `latest_gc_cutoff` Rcu to achieve the same effect? let mut guard = self .layers .write(LayerManagerLockHolder::GarbageCollection) .await; guard .open_mut()? .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics); drop(update_guard); // Allow new reads to start ONLY after we finished updating the layer map. }; // Schedule an index-only upload to update the `latest_gc_cutoff` in the index_part.json. 
// Otherwise, after restart, the index_part only contains the old `latest_gc_cutoff` and // find_gc_cutoffs will try accessing things below the cutoff. TODO: ideally, this should // be batched into `schedule_compaction_update`. let disk_consistent_lsn = self.disk_consistent_lsn.load(); self.schedule_uploads(disk_consistent_lsn, None) .context("failed to schedule uploads") .map_err(CompactionError::Other)?; // If a layer gets rewritten throughout gc-compaction, we need to keep that layer only in `compact_to` instead // of `compact_from`. let compact_from = { let mut compact_from = Vec::new(); let mut compact_to_set = HashMap::new(); for layer in &compact_to { compact_to_set.insert(layer.layer_desc().key(), layer); } for layer in &layer_selection { if let Some(to) = compact_to_set.get(&layer.layer_desc().key()) { tracing::info!( "skipping delete {} because found same layer key at different generation {}", layer, to ); } else { compact_from.push(layer.clone()); } } compact_from }; self.remote_client .schedule_compaction_update(&compact_from, &compact_to)?; drop(gc_lock); Ok(CompactionOutcome::Done) } } struct TimelineAdaptor { timeline: Arc, keyspace: (Lsn, KeySpace), new_deltas: Vec, new_images: Vec, layers_to_delete: Vec>, } impl TimelineAdaptor { pub fn new(timeline: &Arc, keyspace: (Lsn, KeySpace)) -> Self { Self { timeline: timeline.clone(), keyspace, new_images: Vec::new(), new_deltas: Vec::new(), layers_to_delete: Vec::new(), } } pub async fn flush_updates(&mut self) -> Result<(), CompactionError> { let layers_to_delete = { let guard = self .timeline .layers .read(LayerManagerLockHolder::Compaction) .await; self.layers_to_delete .iter() .map(|x| guard.get_from_desc(x)) .collect::>() }; self.timeline .finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete) .await?; self.timeline .upload_new_image_layers(std::mem::take(&mut self.new_images))?; self.new_deltas.clear(); self.layers_to_delete.clear(); Ok(()) } } #[derive(Clone)] struct 
ResidentDeltaLayer(ResidentLayer); #[derive(Clone)] struct ResidentImageLayer(ResidentLayer); impl CompactionJobExecutor for TimelineAdaptor { type Key = pageserver_api::key::Key; type Layer = OwnArc; type DeltaLayer = ResidentDeltaLayer; type ImageLayer = ResidentImageLayer; type RequestContext = crate::context::RequestContext; fn get_shard_identity(&self) -> &ShardIdentity { self.timeline.get_shard_identity() } async fn get_layers( &mut self, key_range: &Range, lsn_range: &Range, _ctx: &RequestContext, ) -> anyhow::Result>> { self.flush_updates().await?; let guard = self .timeline .layers .read(LayerManagerLockHolder::Compaction) .await; let layer_map = guard.layer_map()?; let result = layer_map .iter_historic_layers() .filter(|l| { overlaps_with(&l.lsn_range, lsn_range) && overlaps_with(&l.key_range, key_range) }) .map(OwnArc) .collect(); Ok(result) } async fn get_keyspace( &mut self, key_range: &Range, lsn: Lsn, _ctx: &RequestContext, ) -> anyhow::Result>> { if lsn == self.keyspace.0 { Ok(pageserver_compaction::helpers::intersect_keyspace( &self.keyspace.1.ranges, key_range, )) } else { // The current compaction implementation only ever requests the key space // at the compaction end LSN. anyhow::bail!("keyspace not available for requested lsn"); } } async fn downcast_delta_layer( &self, layer: &OwnArc, ctx: &RequestContext, ) -> anyhow::Result> { // this is a lot more complex than a simple downcast... if layer.is_delta() { let l = { let guard = self .timeline .layers .read(LayerManagerLockHolder::Compaction) .await; guard.get_from_desc(layer) }; let result = l.download_and_keep_resident(ctx).await?; Ok(Some(ResidentDeltaLayer(result))) } else { Ok(None) } } async fn create_image( &mut self, lsn: Lsn, key_range: &Range, ctx: &RequestContext, ) -> anyhow::Result<()> { Ok(self.create_image_impl(lsn, key_range, ctx).await?) 
} async fn create_delta( &mut self, lsn_range: &Range, key_range: &Range, input_layers: &[ResidentDeltaLayer], ctx: &RequestContext, ) -> anyhow::Result<()> { debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); let mut all_entries = Vec::new(); for dl in input_layers.iter() { all_entries.extend(dl.load_keys(ctx).await?); } // The current stdlib sorting implementation is designed in a way where it is // particularly fast where the slice is made up of sorted sub-ranges. all_entries.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); let mut writer = DeltaLayerWriter::new( self.timeline.conf, self.timeline.timeline_id, self.timeline.tenant_shard_id, key_range.start, lsn_range.clone(), &self.timeline.gate, self.timeline.cancel.clone(), ctx, ) .await?; let mut dup_values = 0; // This iterator walks through all key-value pairs from all the layers // we're compacting, in key, LSN order. let mut prev: Option<(Key, Lsn)> = None; for &DeltaEntry { key, lsn, ref val, .. } in all_entries.iter() { if prev == Some((key, lsn)) { // This is a duplicate. Skip it. // // It can happen if compaction is interrupted after writing some // layers but not all, and we are compacting the range again. // The calculations in the algorithm assume that there are no // duplicates, so the math on targeted file size is likely off, // and we will create smaller files than expected. 
dup_values += 1; continue; } let value = val.load(ctx).await?; writer.put_value(key, lsn, value, ctx).await?; prev = Some((key, lsn)); } if dup_values > 0 { warn!("delta layer created with {} duplicate values", dup_values); } fail_point!("delta-layer-writer-fail-before-finish", |_| { Err(anyhow::anyhow!( "failpoint delta-layer-writer-fail-before-finish" )) }); let (desc, path) = writer.finish(prev.unwrap().0.next(), ctx).await?; let new_delta_layer = Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; self.new_deltas.push(new_delta_layer); Ok(()) } async fn delete_layer( &mut self, layer: &OwnArc, _ctx: &RequestContext, ) -> anyhow::Result<()> { self.layers_to_delete.push(layer.clone().0); Ok(()) } } impl TimelineAdaptor { async fn create_image_impl( &mut self, lsn: Lsn, key_range: &Range, ctx: &RequestContext, ) -> Result<(), CreateImageLayersError> { let timer = self.timeline.metrics.create_images_time_histo.start_timer(); let image_layer_writer = ImageLayerWriter::new( self.timeline.conf, self.timeline.timeline_id, self.timeline.tenant_shard_id, key_range, lsn, &self.timeline.gate, self.timeline.cancel.clone(), ctx, ) .await .map_err(CreateImageLayersError::Other)?; fail_point!("image-layer-writer-fail-before-finish", |_| { Err(CreateImageLayersError::Other(anyhow::anyhow!( "failpoint image-layer-writer-fail-before-finish" ))) }); let keyspace = KeySpace { ranges: self .get_keyspace(key_range, lsn, ctx) .await .map_err(CreateImageLayersError::Other)?, }; // TODO set proper (stateful) start. 
// NOTE(review): the generic arguments in this section (`Range`, `OwnArc`, `Arc`,
// `CompactionLayer`, ...) appear to have been lost when this copy of the file was
// extracted; the tokens below are kept exactly as found. Confirm against the repository.
impl TimelineAdaptor {
    /// Create an image layer for `key_range` at `lsn` and, if one was generated,
    /// queue it in `new_images`.
    ///
    /// All failures are reported as `CreateImageLayersError`.
    async fn create_image_impl(
        &mut self,
        lsn: Lsn,
        key_range: &Range,
        ctx: &RequestContext,
    ) -> Result<(), CreateImageLayersError> {
        // Records into `create_images_time_histo` when `stop_and_record` is
        // called at the end; the early-return error paths below never reach it.
        let timer = self.timeline.metrics.create_images_time_histo.start_timer();

        let image_layer_writer = ImageLayerWriter::new(
            self.timeline.conf,
            self.timeline.timeline_id,
            self.timeline.tenant_shard_id,
            key_range,
            lsn,
            &self.timeline.gate,
            self.timeline.cancel.clone(),
            ctx,
        )
        .await
        .map_err(CreateImageLayersError::Other)?;

        fail_point!("image-layer-writer-fail-before-finish", |_| {
            Err(CreateImageLayersError::Other(anyhow::anyhow!(
                "failpoint image-layer-writer-fail-before-finish"
            )))
        });

        // Restrict the stored keyspace to the requested key range.
        let keyspace = KeySpace {
            ranges: self
                .get_keyspace(key_range, lsn, ctx)
                .await
                .map_err(CreateImageLayersError::Other)?,
        };
        // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
        let outcome = self
            .timeline
            .create_image_layer_for_rel_blocks(
                &keyspace,
                image_layer_writer,
                lsn,
                ctx,
                key_range.clone(),
                IoConcurrency::sequential(),
                None,
            )
            .await?;

        // Any other outcome means there is no layer to finish or register.
        if let ImageLayerCreationOutcome::Generated {
            unfinished_image_layer,
        } = outcome
        {
            let (desc, path) = unfinished_image_layer
                .finish(ctx)
                .await
                .map_err(CreateImageLayersError::Other)?;
            let image_layer =
                Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)
                    .map_err(CreateImageLayersError::Other)?;
            self.new_images.push(image_layer);
        }

        timer.stop_and_record();

        Ok(())
    }
}

// Marker impl: lets the pageserver's request context be used as the
// compaction job's request context.
impl CompactionRequestContext for crate::context::RequestContext {}

/// Newtype around an `Arc`; the traits below are implemented on this wrapper.
#[derive(Debug, Clone)]
pub struct OwnArc(pub Arc);

// Dereferences straight through to the target of the wrapped `Arc`.
impl Deref for OwnArc {
    type Target = as Deref>::Target;
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl AsRef for OwnArc {
    fn as_ref(&self) -> &T {
        self.0.as_ref()
    }
}

// Layer view that reads `key_range`, `lsn_range` and `file_size` directly
// from the wrapped value (through `Deref`).
impl CompactionLayer for OwnArc {
    fn key_range(&self) -> &Range {
        &self.key_range
    }
    fn lsn_range(&self) -> &Range {
        &self.lsn_range
    }
    fn file_size(&self) -> u64 {
        self.file_size
    }
    fn short_id(&self) -> std::string::String {
        self.as_ref().short_id().to_string()
    }
    fn is_delta(&self) -> bool {
        self.as_ref().is_delta()
    }
}

// Layer view that goes through `layer_desc()` of the wrapped value and always
// reports itself as a delta layer.
impl CompactionLayer for OwnArc {
    fn key_range(&self) -> &Range {
        &self.layer_desc().key_range
    }
    fn lsn_range(&self) -> &Range {
        &self.layer_desc().lsn_range
    }
    fn file_size(&self) -> u64 {
        self.layer_desc().file_size
    }
    fn short_id(&self) -> std::string::String {
        self.layer_desc().short_id().to_string()
    }
    fn is_delta(&self) -> bool {
        true
    }
}

// All accessors delegate to the descriptor of the wrapped resident layer.
impl CompactionLayer for ResidentDeltaLayer {
    fn key_range(&self) -> &Range {
        &self.0.layer_desc().key_range
    }
    fn lsn_range(&self) -> &Range {
        &self.0.layer_desc().lsn_range
    }
    fn file_size(&self) -> u64 {
        self.0.layer_desc().file_size
    }
    fn short_id(&self) -> std::string::String {
        self.0.layer_desc().short_id().to_string()
    }
    fn is_delta(&self) -> bool {
        true
    }
}
// NOTE(review): generic arguments on the trait names and on the `load_keys`
// return type appear to have been lost in this copy of the file; the tokens are
// kept exactly as found. Confirm against the repository.
impl CompactionDeltaLayer for ResidentDeltaLayer {
    type DeltaEntry<'a> = DeltaEntry<'a>;

    /// Load the index entries of the wrapped delta layer.
    async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result>> {
        self.0.get_as_delta(ctx).await?.index_entries(ctx).await
    }
}

// All accessors delegate to the descriptor of the wrapped resident layer;
// an image layer is never a delta.
impl CompactionLayer for ResidentImageLayer {
    fn key_range(&self) -> &Range {
        &self.0.layer_desc().key_range
    }
    fn lsn_range(&self) -> &Range {
        &self.0.layer_desc().lsn_range
    }
    fn file_size(&self) -> u64 {
        self.0.layer_desc().file_size
    }
    fn short_id(&self) -> std::string::String {
        self.0.layer_desc().short_id().to_string()
    }
    fn is_delta(&self) -> bool {
        false
    }
}

// Marker impl: no methods are defined here.
impl CompactionImageLayer for ResidentImageLayer {}
async fn set_deleted_in_remote_index( remote_client: &Arc, ) -> Result<(), DeleteTimelineError> { let res = remote_client.persist_index_part_with_deleted_flag().await; match res { // If we (now, or already) marked it successfully as deleted, we can proceed Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), // Bail out otherwise // // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents // two tasks from performing the deletion at the same time. The first task // that starts deletion should run it to completion. Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); } } Ok(()) } /// Grab the compaction and gc locks, and actually perform the deletion. /// /// The locks prevent GC or compaction from running at the same time. The background tasks do not /// register themselves with the timeline it's operating on, so it might still be running even /// though we called `shutdown_tasks`. /// /// Note that there are still other race conditions between /// GC, compaction and timeline deletion. See /// /// /// No timeout here, GC & Compaction should be responsive to the /// `TimelineState::Stopping` change. // pub(super): documentation link pub(super) async fn delete_local_timeline_directory( conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline: &Timeline, ) { // Always ensure the lock order is compaction -> gc. let compaction_lock = timeline.compaction_lock.lock(); let _compaction_lock = crate::timed( compaction_lock, "acquires compaction lock", std::time::Duration::from_secs(5), ) .await; let gc_lock = timeline.gc_lock.lock(); let _gc_lock = crate::timed( gc_lock, "acquires gc lock", std::time::Duration::from_secs(5), ) .await; // NB: storage_sync upload tasks that reference these layers have been cancelled // by the caller. 
let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id); // NB: This need not be atomic because the deleted flag in the IndexPart // will be observed during tenant/timeline load. The deletion will be resumed there. // // ErrorKind::NotFound can happen e.g. if we race with tenant detach, because, // no locks are shared. tokio::fs::remove_dir_all(local_timeline_directory) .await .or_else(fs_ext::ignore_not_found) .fatal_err("removing timeline directory"); // Make sure previous deletions are ordered before mark removal. // Otherwise there is no guarantee that they reach the disk before mark deletion. // So its possible for mark to reach disk first and for other deletions // to be reordered later and thus missed if a crash occurs. // Note that we dont need to sync after mark file is removed // because we can tolerate the case when mark file reappears on startup. let timeline_path = conf.timelines_path(&tenant_shard_id); crashsafe::fsync_async(timeline_path) .await .fatal_err("fsync after removing timeline directory"); info!("finished deleting layer files, releasing locks"); } /// It is important that this gets called when DeletionGuard is being held. /// For more context see comments in [`make_timeline_delete_guard`] async fn remove_maybe_offloaded_timeline_from_tenant( tenant: &TenantShard, timeline: &TimelineOrOffloaded, _: &DeletionGuard, // using it as a witness ) -> anyhow::Result<()> { // Remove the timeline from the map. 
// This observes the locking order between timelines and timelines_offloaded let mut timelines = tenant.timelines.lock().unwrap(); let mut timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); let mut timelines_importing = tenant.timelines_importing.lock().unwrap(); let offloaded_children_exist = timelines_offloaded .iter() .any(|(_, entry)| entry.ancestor_timeline_id == Some(timeline.timeline_id())); let children_exist = timelines .iter() .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id())); // XXX this can happen because of race conditions with branch creation. // We already deleted the remote layer files, so it's probably best to panic. if children_exist || offloaded_children_exist { panic!("Timeline grew children while we removed layer files"); } match timeline { TimelineOrOffloaded::Timeline(timeline) => { timelines.remove(&timeline.timeline_id).expect( "timeline that we were deleting was concurrently removed from 'timelines' map", ); tenant .scheduled_compaction_tasks .lock() .unwrap() .remove(&timeline.timeline_id); } TimelineOrOffloaded::Offloaded(timeline) => { let offloaded_timeline = timelines_offloaded .remove(&timeline.timeline_id) .expect("timeline that we were deleting was concurrently removed from 'timelines_offloaded' map"); offloaded_timeline.delete_from_ancestor_with_timelines(&timelines); } TimelineOrOffloaded::Importing(importing) => { timelines_importing.remove(&importing.timeline.timeline_id); } } drop(timelines_importing); drop(timelines_offloaded); drop(timelines); Ok(()) } /// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures, /// and deletes its data from both disk and s3. /// The sequence of steps: /// 1. Set deleted_at in remote index part. /// 2. Create local mark file. /// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata) /// 4. Delete remote layers /// 5. 
Delete index part /// 6. Delete meta, timeline directory /// 7. Delete mark file /// /// It is resumable from any step in case a crash/restart occurs. /// There are two entrypoints to the process: /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler. /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present /// and we possibly neeed to continue deletion of remote files. /// /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load. #[derive(Default)] pub enum DeleteTimelineFlow { #[default] NotStarted, InProgress, Finished, } impl DeleteTimelineFlow { // These steps are run in the context of management api request handler. // Long running steps are continued to run in the background. // NB: If this fails half-way through, and is retried, the retry will go through // all the same steps again. Make sure the code here is idempotent, and don't // error out if some of the shutdown tasks have already been completed! #[instrument(skip_all)] pub async fn run( tenant: &Arc, timeline_id: TimelineId, ) -> Result<(), DeleteTimelineError> { super::debug_assert_current_span_has_tenant_and_timeline_id(); let (timeline, mut guard) = make_timeline_delete_guard(tenant, timeline_id, TimelineDeleteGuardKind::Delete)?; guard.mark_in_progress()?; // Now that the Timeline is in Stopping state, request all the related tasks to shut down. 
// TODO(vlad): shut down imported timeline here match &timeline { TimelineOrOffloaded::Timeline(timeline) => { timeline.shutdown(super::ShutdownMode::Hard).await; } TimelineOrOffloaded::Importing(importing) => { importing.shutdown().await; } TimelineOrOffloaded::Offloaded(_offloaded) => { // Nothing to shut down in this case } } tenant.gc_block.before_delete(&timeline.timeline_id()); fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { Err(anyhow::anyhow!( "failpoint: timeline-delete-before-index-deleted-at" ))? }); let remote_client = match timeline.maybe_remote_client() { Some(remote_client) => remote_client, None => { let remote_client = tenant .build_timeline_client(timeline.timeline_id(), tenant.remote_storage.clone()); let result = match remote_client .download_index_file(&tenant.cancel) .instrument(info_span!("download_index_file")) .await { Ok(r) => r, Err(DownloadError::NotFound) => { // Deletion is already complete. // As we came here, we will need to remove the timeline from the tenant though. tracing::info!("Timeline already deleted in remote storage"); if let TimelineOrOffloaded::Offloaded(_) = &timeline { // We only supoprt this for offloaded timelines, as we don't know which state non-offloaded timelines are in. tracing::info!( "Timeline with gone index part is offloaded timeline. Removing from tenant." 
); remove_maybe_offloaded_timeline_from_tenant(tenant, &timeline, &guard) .await?; } return Ok(()); } Err(e) => { return Err(DeleteTimelineError::Other(anyhow::anyhow!( "error: {:?}", e ))); } }; let index_part = match result { MaybeDeletedIndexPart::Deleted(p) => { tracing::info!("Timeline already set as deleted in remote index"); p } MaybeDeletedIndexPart::IndexPart(p) => p, }; let remote_client = Arc::new(remote_client); remote_client .init_upload_queue(&index_part) .map_err(DeleteTimelineError::Other)?; remote_client.shutdown().await; remote_client } }; set_deleted_in_remote_index(&remote_client).await?; fail::fail_point!("timeline-delete-before-schedule", |_| { Err(anyhow::anyhow!( "failpoint: timeline-delete-before-schedule" ))? }); Self::schedule_background( guard, tenant.conf, Arc::clone(tenant), timeline, remote_client, ); Ok(()) } fn mark_in_progress(&mut self) -> anyhow::Result<()> { match self { Self::Finished => anyhow::bail!("Bug. Is in finished state"), Self::InProgress { .. } => { /* We're in a retry */ } Self::NotStarted => { /* Fresh start */ } } *self = Self::InProgress; Ok(()) } /// Shortcut to create Timeline in stopping state and spawn deletion task. #[instrument(skip_all, fields(%timeline_id))] pub(crate) async fn resume_deletion( tenant: Arc, timeline_id: TimelineId, local_metadata: &TimelineMetadata, remote_client: RemoteTimelineClient, ctx: &RequestContext, ) -> anyhow::Result<()> { // Note: here we even skip populating layer map. Timeline is essentially uninitialized. // RemoteTimelineClient is the only functioning part. let (timeline, _timeline_ctx) = tenant .create_timeline_struct( timeline_id, local_metadata, None, // Ancestor is not needed for deletion. None, // Previous heatmap is not needed for deletion tenant.get_timeline_resources_for(remote_client), // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. 
CreateTimelineCause::Delete, crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here None, // doesn't matter what we put here None, // doesn't matter what we put here None, // doesn't matter what we put here ctx, ) .context("create_timeline_struct")?; let mut guard = DeletionGuard( Arc::clone(&timeline.delete_progress) .try_lock_owned() .expect("cannot happen because we're the only owner"), ); // We meed to do this because when console retries delete request we shouldnt answer with 404 // because 404 means successful deletion. { let mut locked = tenant.timelines.lock().unwrap(); locked.insert(timeline_id, Arc::clone(&timeline)); } guard.mark_in_progress()?; let remote_client = timeline.remote_client.clone(); let timeline = TimelineOrOffloaded::Timeline(timeline); Self::schedule_background(guard, tenant.conf, tenant, timeline, remote_client); Ok(()) } fn schedule_background( guard: DeletionGuard, conf: &'static PageServerConf, tenant: Arc, timeline: TimelineOrOffloaded, remote_client: Arc, ) { let tenant_shard_id = timeline.tenant_shard_id(); let timeline_id = timeline.timeline_id(); // Take a tenant gate guard, because timeline deletion needs access to the tenant to update its manifest. let Ok(tenant_guard) = tenant.gate.enter() else { // It is safe to simply skip here, because we only schedule background work once the timeline is durably marked for deletion. info!("Tenant is shutting down, timeline deletion will be resumed when it next starts"); return; }; task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, tenant_shard_id, Some(timeline_id), "timeline_delete", async move { let _guard = tenant_guard; if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await { // Only log as an error if it's not a cancellation. 
if matches!(err, DeleteTimelineError::Cancelled) { info!("Shutdown during timeline deletion"); }else { error!("Error: {err:#}"); } if let TimelineOrOffloaded::Timeline(timeline) = timeline { timeline.set_broken(format!("{err:#}")) } }; Ok(()) } .instrument(tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id)), ); } async fn background( mut guard: DeletionGuard, conf: &PageServerConf, tenant: &TenantShard, timeline: &TimelineOrOffloaded, remote_client: Arc, ) -> Result<(), DeleteTimelineError> { fail::fail_point!("timeline-delete-before-rm", |_| { Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))? }); match timeline { TimelineOrOffloaded::Timeline(timeline) => { delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await; } TimelineOrOffloaded::Importing(importing) => { delete_local_timeline_directory(conf, tenant.tenant_shard_id, &importing.timeline) .await; } TimelineOrOffloaded::Offloaded(_offloaded) => { // Offloaded timelines have no local state // TODO: once we persist offloaded information, delete the timeline from there, too } } fail::fail_point!("timeline-delete-after-rm", |_| { Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? }); remote_client.delete_all().await?; pausable_failpoint!("in_progress_delete"); remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?; // This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash // between the deletion of the index-part.json and reaching of this code. // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted. // However, we handle this case in tenant loading code so the next time we attach, the issue is // resolved. 
/// Looks up `timeline_id` in the tenant and tries to take its deletion lock.
///
/// The timeline is searched in `tenant.timelines`, then `tenant.timelines_offloaded`, then
/// `tenant.timelines_importing`; the first hit decides the returned [`TimelineOrOffloaded`]
/// variant.
///
/// # Errors
///
/// - [`DeleteTimelineError::NotFound`] if none of the three maps contains the timeline.
/// - [`DeleteTimelineError::HasChildren`] if another timeline names it as ancestor. Offloaded
///   children are only taken into account for [`TimelineDeleteGuardKind::Delete`].
/// - [`DeleteTimelineError::AlreadyInProgress`] if the deletion lock is already held.
///
/// On success with [`TimelineDeleteGuardKind::Delete`], a live `Timeline` is also moved to
/// `TimelineState::Stopping` before returning.
pub(super) fn make_timeline_delete_guard(
    tenant: &TenantShard,
    timeline_id: TimelineId,
    guard_kind: TimelineDeleteGuardKind,
) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
    // Note the interaction between these map locks and the deletion guard.
    // Here we attempt to lock the deletion guard while we're holding a lock on timelines.
    // This is important because, when you take into account `remove_timeline_from_tenant`,
    // we remove the timeline from memory while we still hold the deletion guard.
    // So once timeline deletion is finished, the timeline won't be present in the timelines map
    // at all, which makes the following sequence impossible:
    // T1: get preempted right before the try_lock on `Timeline::delete_progress`
    // T2: do a full deletion, acquire and drop `Timeline::delete_progress`
    // T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
    // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
    let timelines = tenant.timelines.lock().unwrap();
    let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
    let timelines_importing = tenant.timelines_importing.lock().unwrap();

    // First match wins: live, then offloaded, then importing.
    let timeline = match timelines.get(&timeline_id) {
        Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)),
        None => match timelines_offloaded.get(&timeline_id) {
            Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)),
            None => match timelines_importing.get(&timeline_id) {
                Some(t) => TimelineOrOffloaded::Importing(Arc::clone(t)),
                None => return Err(DeleteTimelineError::NotFound),
            },
        },
    };

    // Ensure that there are no child timelines, because we are about to remove files,
    // which would break child branches.
    let mut children = Vec::new();
    // Offloaded children only block the operation when the guard is taken for deletion.
    if guard_kind == TimelineDeleteGuardKind::Delete {
        children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| {
            (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id)
        }));
    }
    children.extend(timelines.iter().filter_map(|(id, entry)| {
        (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id)
    }));

    if !children.is_empty() {
        return Err(DeleteTimelineError::HasChildren(children));
    }

    // Note that using try_lock here is important to avoid a deadlock.
    // Here we take the lock on timelines and then the deletion guard.
    // At the end of the operation we're holding the guard and need to lock the timelines map
    // to remove the timeline from it.
    // Whenever two locks are taken in different orders, a deadlock becomes possible.
    let delete_progress = Arc::clone(timeline.delete_progress());
    let delete_lock_guard = match delete_progress.try_lock_owned() {
        Ok(guard) => DeletionGuard(guard),
        Err(_) => {
            // `try_lock_owned` consumed our clone of the Arc, so clone it again for the error.
            return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
                timeline.delete_progress(),
            )));
        }
    };

    // Only a real deletion of a live timeline changes its state here.
    if guard_kind == TimelineDeleteGuardKind::Delete {
        if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
            timeline.set_state(TimelineState::Stopping);
        }
    }

    Ok((timeline, delete_lock_guard))
}
thiserror::Error)] pub(crate) enum Error { #[error("no ancestors")] NoAncestor, #[error("too many ancestors")] TooManyAncestors, #[error("ancestor is not empty")] AncestorNotEmpty, #[error("shutting down, please retry later")] ShuttingDown, #[error("archived: {}", .0)] Archived(TimelineId), #[error(transparent)] NotFound(crate::tenant::GetTimelineError), #[error("failed to reparent all candidate timelines, please retry")] FailedToReparentAll, #[error("ancestor is already being detached by: {}", .0)] OtherTimelineDetachOngoing(TimelineId), #[error("preparing to timeline ancestor detach failed")] Prepare(#[source] anyhow::Error), #[error("detaching and reparenting failed")] DetachReparent(#[source] anyhow::Error), #[error("completing ancestor detach failed")] Complete(#[source] anyhow::Error), #[error("failpoint: {}", .0)] Failpoint(&'static str), } impl Error { /// Try to catch cancellation from within the `anyhow::Error`, or wrap the anyhow as the given /// variant or fancier `or_else`. 
fn launder(e: anyhow::Error, or_else: F) -> Error where F: Fn(anyhow::Error) -> Error, { use remote_storage::TimeoutOrCancel; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::upload_queue::NotInitialized; if e.is::() || TimeoutOrCancel::caused_by_cancel(&e) || e.downcast_ref::() .is_some_and(|e| e.is_cancelled()) || e.is::() { Error::ShuttingDown } else { or_else(e) } } } impl From for ApiError { fn from(value: Error) -> Self { match value { Error::NoAncestor => ApiError::Conflict(value.to_string()), Error::TooManyAncestors | Error::AncestorNotEmpty => { ApiError::BadRequest(anyhow::anyhow!("{value}")) } Error::ShuttingDown => ApiError::ShuttingDown, Error::Archived(_) => ApiError::BadRequest(anyhow::anyhow!("{value}")), Error::OtherTimelineDetachOngoing(_) | Error::FailedToReparentAll => { ApiError::ResourceUnavailable(value.to_string().into()) } Error::NotFound(e) => ApiError::from(e), // these variants should have no cancellation errors because of Error::launder Error::Prepare(_) | Error::DetachReparent(_) | Error::Complete(_) | Error::Failpoint(_) => ApiError::InternalServerError(value.into()), } } } impl From for Error { fn from(_: crate::tenant::upload_queue::NotInitialized) -> Self { // treat all as shutting down signals, even though that is not entirely correct // (uninitialized state) Error::ShuttingDown } } impl From for Error { fn from(_: super::layer_manager::Shutdown) -> Self { Error::ShuttingDown } } pub(crate) enum Progress { Prepared(Attempt, PreparedTimelineDetach), Done(AncestorDetached), } pub(crate) struct PreparedTimelineDetach { layers: Vec, } // TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments. 
#[derive(Debug)] pub(crate) struct Options { pub(crate) rewrite_concurrency: std::num::NonZeroUsize, pub(crate) copy_concurrency: std::num::NonZeroUsize, } impl Default for Options { fn default() -> Self { Self { rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(), copy_concurrency: std::num::NonZeroUsize::new(100).unwrap(), } } } /// Represents an across tenant reset exclusive single attempt to detach ancestor. #[derive(Debug)] pub(crate) struct Attempt { pub(crate) timeline_id: TimelineId, pub(crate) ancestor_timeline_id: TimelineId, pub(crate) ancestor_lsn: Lsn, _guard: completion::Completion, gate_entered: Option, } impl Attempt { pub(crate) fn before_reset_tenant(&mut self) { let taken = self.gate_entered.take(); assert!(taken.is_some()); } pub(crate) fn new_barrier(&self) -> completion::Barrier { self._guard.barrier() } } pub(crate) async fn generate_tombstone_image_layer( detached: &Arc, ancestor: &Arc, ancestor_lsn: Lsn, historic_layers_to_copy: &Vec, ctx: &RequestContext, ) -> Result, Error> { tracing::info!( "removing non-inherited keys by writing an image layer with tombstones at the detach LSN" ); let io_concurrency = IoConcurrency::spawn_from_conf( detached.conf.get_vectored_concurrent_io, detached.gate.enter().map_err(|_| Error::ShuttingDown)?, ); let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should // not contain too many keys, otherwise this takes a lot of memory. Currently we limit it to 10k keys in the compute. 
let key_range = Key::sparse_non_inherited_keyspace(); // avoid generating a "future layer" which will then be removed let image_lsn = ancestor_lsn; { for layer in historic_layers_to_copy { let desc = layer.layer_desc(); if !desc.is_delta && desc.lsn_range.start == image_lsn && overlaps_with(&key_range, &desc.key_range) { tracing::info!( layer=%layer, "will copy tombstone from ancestor instead of creating a new one" ); return Ok(None); } } let layers = detached .layers .read(LayerManagerLockHolder::DetachAncestor) .await; for layer in layers.all_persistent_layers() { if !layer.is_delta && layer.lsn_range.start == image_lsn && overlaps_with(&key_range, &layer.key_range) { tracing::warn!( layer=%layer, "image layer at the detach LSN already exists, skipping removing aux files" ); return Ok(None); } } } let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key_range.clone()), image_lsn); let data = ancestor .get_vectored_impl(query, &mut reconstruct_state, ctx) .await .context("failed to retrieve aux keys") .map_err(|e| Error::launder(e, Error::Prepare))?; if !data.is_empty() { // TODO: is it possible that we can have an image at `image_lsn`? Unlikely because image layers are only generated // upon compaction but theoretically possible. 
let mut image_layer_writer = ImageLayerWriter::new( detached.conf, detached.timeline_id, detached.tenant_shard_id, &key_range, image_lsn, &detached.gate, detached.cancel.clone(), ctx, ) .await .context("failed to create image layer writer") .map_err(Error::Prepare)?; for key in data.keys() { image_layer_writer .put_image(*key, Bytes::new(), ctx) .await .context("failed to write key") .map_err(|e| Error::launder(e, Error::Prepare))?; } let (desc, path) = image_layer_writer .finish(ctx) .await .context("failed to finish image layer writer for removing the metadata keys") .map_err(|e| Error::launder(e, Error::Prepare))?; let generated = Layer::finish_creating(detached.conf, detached, desc, &path) .map_err(|e| Error::launder(e, Error::Prepare))?; detached .remote_client .upload_layer_file(&generated, &detached.cancel) .await .map_err(|e| Error::launder(e, Error::Prepare))?; tracing::info!(layer=%generated, "wrote image layer"); Ok(Some(generated)) } else { tracing::info!("no aux keys found in ancestor"); Ok(None) } } /// See [`Timeline::prepare_to_detach_from_ancestor`] pub(super) async fn prepare( detached: &Arc, tenant: &TenantShard, behavior: DetachBehavior, options: Options, ctx: &RequestContext, ) -> Result { use Error::*; let Some((mut ancestor, mut ancestor_lsn)) = detached .ancestor_timeline .as_ref() .map(|tl| (tl.clone(), detached.ancestor_lsn)) else { let ancestor_id; let ancestor_lsn; let still_in_progress = { let accessor = detached.remote_client.initialized_upload_queue()?; // we are safe to inspect the latest uploaded, because we can only witness this after // restart is complete and ancestor is no more. let latest = accessor.latest_uploaded_index_part(); let Some((id, lsn)) = latest.lineage.detached_previous_ancestor() else { return Err(NoAncestor); }; ancestor_id = id; ancestor_lsn = lsn; latest .gc_blocking .as_ref() .is_some_and(|b| b.blocked_by(DetachAncestor)) }; if still_in_progress { // gc is still blocked, we can still reparent and complete. 
// we are safe to reparent remaining, because they were locked in in the beginning. let attempt = continue_with_blocked_gc(detached, tenant, ancestor_id, ancestor_lsn).await?; // because the ancestor of detached is already set to none, we have published all // of the layers, so we are still "prepared." return Ok(Progress::Prepared( attempt, PreparedTimelineDetach { layers: Vec::new() }, )); } let reparented_timelines = reparented_direct_children(detached, tenant)?; return Ok(Progress::Done(AncestorDetached { reparented_timelines, })); }; if detached.is_archived() != Some(false) { return Err(Archived(detached.timeline_id)); } if !ancestor_lsn.is_valid() { // rare case, probably wouldn't even load tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing"); return Err(NoAncestor); } check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn, behavior)?; if let DetachBehavior::MultiLevelAndNoReparent = behavior { // If the ancestor has an ancestor, we might be able to fast-path detach it if the current ancestor does not have any data written/used by the detaching timeline. while let Some(ancestor_of_ancestor) = ancestor.ancestor_timeline.clone() { if ancestor_lsn != ancestor.ancestor_lsn { // non-technical requirement; we could flatten still if ancestor LSN does not match but that needs // us to copy and cut more layers. return Err(AncestorNotEmpty); } // Use the ancestor of the ancestor as the new ancestor (only when the ancestor LSNs are the same) ancestor_lsn = ancestor.ancestor_lsn; // Get the LSN first before resetting the `ancestor` variable ancestor = ancestor_of_ancestor; // TODO: do we still need to check if we don't want to reparent? 
check_no_archived_children_of_ancestor( tenant, detached, &ancestor, ancestor_lsn, behavior, )?; } } else if ancestor.ancestor_timeline.is_some() { // non-technical requirement; we could flatten N ancestors just as easily but we chose // not to, at least initially return Err(TooManyAncestors); } tracing::info!( "attempt to detach the timeline from the ancestor: {}@{}, behavior={:?}", ancestor.timeline_id, ancestor_lsn, behavior ); let attempt = start_new_attempt(detached, tenant, ancestor.timeline_id, ancestor_lsn).await?; utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking-pausable"); fail::fail_point!( "timeline-detach-ancestor::before_starting_after_locking", |_| Err(Error::Failpoint( "timeline-detach-ancestor::before_starting_after_locking" )) ); if ancestor_lsn >= ancestor.get_disk_consistent_lsn() { let span = tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id); async { let started_at = std::time::Instant::now(); let freeze_and_flush = ancestor.freeze_and_flush0(); let mut freeze_and_flush = std::pin::pin!(freeze_and_flush); let res = tokio::time::timeout(std::time::Duration::from_secs(1), &mut freeze_and_flush) .await; let res = match res { Ok(res) => res, Err(_elapsed) => { tracing::info!("freezing and flushing ancestor is still ongoing"); freeze_and_flush.await } }; res.map_err(|e| { use FlushLayerError::*; match e { Cancelled | NotRunning(_) => { // FIXME(#6424): technically statically unreachable right now, given how we never // drop the sender Error::ShuttingDown } CreateImageLayersError(_) | Other(_) => Error::Prepare(e.into()), } })?; // we do not need to wait for uploads to complete but we do need `struct Layer`, // copying delta prefix is unsupported currently for `InMemoryLayer`. 
tracing::info!( elapsed_ms = started_at.elapsed().as_millis(), "froze and flushed the ancestor" ); Ok::<_, Error>(()) } .instrument(span) .await?; } let end_lsn = ancestor_lsn + 1; let (filtered_layers, straddling_branchpoint, rest_of_historic) = { // we do not need to start from our layers, because they can only be layers that come // *after* ancestor_lsn let layers = tokio::select! { guard = ancestor.layers.read(LayerManagerLockHolder::DetachAncestor) => guard, _ = detached.cancel.cancelled() => { return Err(ShuttingDown); } _ = ancestor.cancel.cancelled() => { return Err(ShuttingDown); } }; // between retries, these can change if compaction or gc ran in between. this will mean // we have to redo work. partition_work(ancestor_lsn, &layers)? }; // TODO: layers are already sorted by something: use that to determine how much of remote // copies are already done -- gc is blocked, but a compaction could had happened on ancestor, // which is something to keep in mind if copy skipping is implemented. tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers"); // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after let mut new_layers: Vec = Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len() + 1); if let Some(tombstone_layer) = generate_tombstone_image_layer(detached, &ancestor, ancestor_lsn, &rest_of_historic, ctx) .await? 
{ new_layers.push(tombstone_layer.into()); } { tracing::info!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers"); let mut tasks = tokio::task::JoinSet::new(); let mut wrote_any = false; let limiter = Arc::new(Semaphore::new(options.rewrite_concurrency.get())); for layer in straddling_branchpoint { let limiter = limiter.clone(); let timeline = detached.clone(); let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download); let span = tracing::info_span!("upload_rewritten_layer", %layer); tasks.spawn( async move { let _permit = limiter.acquire().await; let copied = upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx) .await?; if let Some(copied) = copied.as_ref() { tracing::info!(%copied, "rewrote and uploaded"); } Ok(copied) } .instrument(span), ); } while let Some(res) = tasks.join_next().await { match res { Ok(Ok(Some(copied))) => { wrote_any = true; new_layers.push(copied); } Ok(Ok(None)) => {} Ok(Err(e)) => return Err(e), Err(je) => return Err(Error::Prepare(je.into())), } } // FIXME: the fsync should be mandatory, after both rewrites and copies if wrote_any { fsync_timeline_dir(detached, ctx).await; } } let mut tasks = tokio::task::JoinSet::new(); let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get())); let cancel_eval = CancellationToken::new(); for adopted in rest_of_historic { let limiter = limiter.clone(); let timeline = detached.clone(); let cancel_eval = cancel_eval.clone(); tasks.spawn( async move { let _permit = tokio::select! { permit = limiter.acquire() => { permit } // Wait for the cancellation here instead of letting the entire task be cancelled. // Cancellations are racy in that they might leave layers on disk. _ = cancel_eval.cancelled() => { Err(Error::ShuttingDown)? 
} }; let (owned, did_hardlink) = remote_copy( &adopted, &timeline, timeline.generation, timeline.shard_identity, &timeline.cancel, ) .await?; tracing::info!(layer=%owned, did_hard_link=%did_hardlink, "remote copied"); Ok((owned, did_hardlink)) } .in_current_span(), ); } fn delete_layers(timeline: &Timeline, layers: Vec) -> Result<(), Error> { // We are deleting layers, so we must hold the gate let _gate = timeline.gate.enter().map_err(|e| match e { GateError::GateClosed => Error::ShuttingDown, })?; { layers.into_iter().for_each(|l: Layer| { l.delete_on_drop(); std::mem::drop(l); }); } Ok(()) } let mut should_fsync = false; let mut first_err = None; while let Some(res) = tasks.join_next().await { match res { Ok(Ok((owned, did_hardlink))) => { if did_hardlink { should_fsync = true; } new_layers.push(owned); } // Don't stop the evaluation on errors, so that we get the full set of hardlinked layers to delete. Ok(Err(failed)) => { cancel_eval.cancel(); first_err.get_or_insert(failed); } Err(je) => { cancel_eval.cancel(); first_err.get_or_insert(Error::Prepare(je.into())); } } } if let Some(failed) = first_err { delete_layers(detached, new_layers)?; return Err(failed); } // fsync directory again if we hardlinked something if should_fsync { fsync_timeline_dir(detached, ctx).await; } let prepared = PreparedTimelineDetach { layers: new_layers }; Ok(Progress::Prepared(attempt, prepared)) } async fn start_new_attempt( detached: &Timeline, tenant: &TenantShard, ancestor_timeline_id: TimelineId, ancestor_lsn: Lsn, ) -> Result { let attempt = obtain_exclusive_attempt(detached, tenant, ancestor_timeline_id, ancestor_lsn)?; // insert the block in the index_part.json, if not already there. 
let _dont_care = tenant .gc_block .insert( detached, crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor, ) .await .map_err(|e| Error::launder(e, Error::Prepare))?; Ok(attempt) } async fn continue_with_blocked_gc( detached: &Timeline, tenant: &TenantShard, ancestor_timeline_id: TimelineId, ancestor_lsn: Lsn, ) -> Result { // FIXME: it would be nice to confirm that there is an in-memory version, since we've just // verified there is a persistent one? obtain_exclusive_attempt(detached, tenant, ancestor_timeline_id, ancestor_lsn) } fn obtain_exclusive_attempt( detached: &Timeline, tenant: &TenantShard, ancestor_timeline_id: TimelineId, ancestor_lsn: Lsn, ) -> Result { use Error::{OtherTimelineDetachOngoing, ShuttingDown}; // ensure we are the only active attempt for this tenant let (guard, barrier) = completion::channel(); { let mut guard = tenant.ongoing_timeline_detach.lock().unwrap(); if let Some((tl, other)) = guard.as_ref() { if !other.is_ready() { return Err(OtherTimelineDetachOngoing(*tl)); } // FIXME: no test enters here } *guard = Some((detached.timeline_id, barrier)); } // ensure the gate is still open let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; Ok(Attempt { timeline_id: detached.timeline_id, ancestor_timeline_id, ancestor_lsn, _guard: guard, gate_entered: Some(_gate_entered), }) } fn reparented_direct_children( detached: &Arc, tenant: &TenantShard, ) -> Result, Error> { let mut all_direct_children = tenant .timelines .lock() .unwrap() .values() .filter_map(|tl| { let is_direct_child = matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)); if is_direct_child { Some(tl.clone()) } else { if let Some(timeline) = tl.ancestor_timeline.as_ref() { assert_ne!(timeline.timeline_id, detached.timeline_id, "we cannot have two timelines with the same timeline_id live"); } None } }) // Collect to avoid lock taking order problem with Tenant::timelines and // Timeline::remote_client 
.collect::>(); let mut any_shutdown = false; all_direct_children.retain(|tl| match tl.remote_client.initialized_upload_queue() { Ok(accessor) => accessor .latest_uploaded_index_part() .lineage .is_reparented(), Err(_shutdownalike) => { // not 100% a shutdown, but let's bail early not to give inconsistent results in // sharded enviroment. any_shutdown = true; true } }); if any_shutdown { // it could be one or many being deleted; have client retry return Err(Error::ShuttingDown); } Ok(all_direct_children .into_iter() .map(|tl| tl.timeline_id) .collect()) } fn partition_work( ancestor_lsn: Lsn, source: &LayerManager, ) -> Result<(usize, Vec, Vec), Error> { let mut straddling_branchpoint = vec![]; let mut rest_of_historic = vec![]; let mut later_by_lsn = 0; for desc in source.layer_map()?.iter_historic_layers() { // off by one chances here: // - start is inclusive // - end is exclusive if desc.lsn_range.start > ancestor_lsn { later_by_lsn += 1; continue; } let target = if desc.lsn_range.start <= ancestor_lsn && desc.lsn_range.end > ancestor_lsn && desc.is_delta { // TODO: image layer at Lsn optimization &mut straddling_branchpoint } else { &mut rest_of_historic }; target.push(source.get_from_desc(&desc)); } Ok((later_by_lsn, straddling_branchpoint, rest_of_historic)) } async fn upload_rewritten_layer( end_lsn: Lsn, layer: &Layer, target: &Arc, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result, Error> { let copied = copy_lsn_prefix(end_lsn, layer, target, ctx).await?; let Some(copied) = copied else { return Ok(None); }; target .remote_client .upload_layer_file(&copied, cancel) .await .map_err(|e| Error::launder(e, Error::Prepare))?; Ok(Some(copied.into())) } async fn copy_lsn_prefix( end_lsn: Lsn, layer: &Layer, target_timeline: &Arc, ctx: &RequestContext, ) -> Result, Error> { if target_timeline.cancel.is_cancelled() { return Err(Error::ShuttingDown); } tracing::debug!(%layer, %end_lsn, "copying lsn prefix"); let mut writer = DeltaLayerWriter::new( 
target_timeline.conf, target_timeline.timeline_id, target_timeline.tenant_shard_id, layer.layer_desc().key_range.start, layer.layer_desc().lsn_range.start..end_lsn, &target_timeline.gate, target_timeline.cancel.clone(), ctx, ) .await .with_context(|| format!("prepare to copy lsn prefix of ancestors {layer}")) .map_err(Error::Prepare)?; let resident = layer.download_and_keep_resident(ctx).await.map_err(|e| { if e.is_cancelled() { Error::ShuttingDown } else { Error::Prepare(e.into()) } })?; let records = resident .copy_delta_prefix(&mut writer, end_lsn, ctx) .await .with_context(|| format!("copy lsn prefix of ancestors {layer}")) .map_err(Error::Prepare)?; drop(resident); tracing::debug!(%layer, records, "copied records"); if records == 0 { drop(writer); // TODO: we might want to store an empty marker in remote storage for this // layer so that we will not needlessly walk `layer` on repeated attempts. Ok(None) } else { // reuse the key instead of adding more holes between layers by using the real // highest key in the layer. let reused_highest_key = layer.layer_desc().key_range.end; let (desc, path) = writer .finish(reused_highest_key, ctx) .await .map_err(Error::Prepare)?; let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path) .map_err(Error::Prepare)?; tracing::debug!(%layer, %copied, "new layer produced"); Ok(Some(copied)) } } /// Creates a new Layer instance for the adopted layer, and ensures it is found in the remote /// storage on successful return. without the adopted layer being added to `index_part.json`. 
/// Returns (Layer, did hardlink) async fn remote_copy( adopted: &Layer, adoptee: &Arc, generation: Generation, shard_identity: ShardIdentity, cancel: &CancellationToken, ) -> Result<(Layer, bool), Error> { let mut metadata = adopted.metadata(); debug_assert!(metadata.generation <= generation); metadata.generation = generation; metadata.shard = shard_identity.shard_index(); let conf = adoptee.conf; let file_name = adopted.layer_desc().layer_name(); // We don't want to shut the timeline down during this operation because we do `delete_on_drop` below let _gate = adoptee.gate.enter().map_err(|e| match e { GateError::GateClosed => Error::ShuttingDown, })?; // depending if Layer::keep_resident, do a hardlink let did_hardlink; let owned = if let Some(adopted_resident) = adopted.keep_resident().await { let adopted_path = adopted_resident.local_path(); let adoptee_path = local_layer_path( conf, &adoptee.tenant_shard_id, &adoptee.timeline_id, &file_name, &metadata.generation, ); match std::fs::hard_link(adopted_path, &adoptee_path) { Ok(()) => {} Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { // In theory we should not get into this situation as we are doing cleanups of the layer file after errors. // However, we don't do cleanups for errors past `prepare`, so there is the slight chance to get to this branch. 
// Double check that the file is orphan (probably from an earlier attempt), then delete it let key = file_name.clone().into(); if adoptee .layers .read(LayerManagerLockHolder::DetachAncestor) .await .contains_key(&key) { // We are supposed to filter out such cases before coming to this function return Err(Error::Prepare(anyhow::anyhow!( "layer file {file_name} already present and inside layer map" ))); } tracing::info!("Deleting orphan layer file to make way for hard linking"); // Delete orphan layer file and try again, to ensure this layer has a well understood source std::fs::remove_file(&adoptee_path) .map_err(|e| Error::launder(e.into(), Error::Prepare))?; std::fs::hard_link(adopted_path, &adoptee_path) .map_err(|e| Error::launder(e.into(), Error::Prepare))?; } Err(e) => { return Err(Error::launder(e.into(), Error::Prepare)); } }; did_hardlink = true; Layer::for_resident(conf, adoptee, adoptee_path, file_name, metadata).drop_eviction_guard() } else { did_hardlink = false; Layer::for_evicted(conf, adoptee, file_name, metadata) }; let layer = match adoptee .remote_client .copy_timeline_layer(adopted, &owned, cancel) .await { Ok(()) => owned, Err(e) => { { // Clean up the layer so that on a retry we don't get errors that the file already exists owned.delete_on_drop(); std::mem::drop(owned); } return Err(Error::launder(e, Error::Prepare)); } }; Ok((layer, did_hardlink)) } pub(crate) enum DetachingAndReparenting { /// All of the following timeline ids were reparented and the timeline ancestor detach must be /// marked as completed. Reparented(HashSet), /// Some of the reparentings failed. The timeline ancestor detach must **not** be marked as /// completed. /// /// Nested `must_reset_tenant` is set to true when any restart requiring changes were made. SomeReparentingFailed { must_reset_tenant: bool }, /// Detaching and reparentings were completed in a previous attempt. Timeline ancestor detach /// must be marked as completed. 
AlreadyDone(HashSet), } impl DetachingAndReparenting { pub(crate) fn reset_tenant_required(&self) -> bool { use DetachingAndReparenting::*; match self { Reparented(_) => true, SomeReparentingFailed { must_reset_tenant } => *must_reset_tenant, AlreadyDone(_) => false, } } pub(crate) fn completed(self) -> Option> { use DetachingAndReparenting::*; match self { Reparented(x) | AlreadyDone(x) => Some(x), SomeReparentingFailed { .. } => None, } } } /// See [`Timeline::detach_from_ancestor_and_reparent`]. pub(super) async fn detach_and_reparent( detached: &Arc, tenant: &TenantShard, prepared: PreparedTimelineDetach, ancestor_timeline_id: TimelineId, ancestor_lsn: Lsn, behavior: DetachBehavior, _ctx: &RequestContext, ) -> Result { let PreparedTimelineDetach { layers } = prepared; #[derive(Debug)] enum Ancestor { NotDetached(Arc, Lsn), Detached(Arc, Lsn), } let (recorded_branchpoint, still_ongoing) = { let access = detached.remote_client.initialized_upload_queue()?; let latest = access.latest_uploaded_index_part(); ( latest.lineage.detached_previous_ancestor(), latest .gc_blocking .as_ref() .is_some_and(|b| b.blocked_by(DetachAncestor)), ) }; assert!( still_ongoing, "cannot (detach? reparent)? 
complete if the operation is not still ongoing" ); let ancestor_to_detach = match detached.ancestor_timeline.as_ref() { Some(mut ancestor) => { while ancestor.timeline_id != ancestor_timeline_id { match ancestor.ancestor_timeline.as_ref() { Some(found) => { if ancestor_lsn != ancestor.ancestor_lsn { return Err(Error::DetachReparent(anyhow::anyhow!( "cannot find the ancestor timeline to detach from: wrong ancestor lsn" ))); } ancestor = found; } None => { return Err(Error::DetachReparent(anyhow::anyhow!( "cannot find the ancestor timeline to detach from" ))); } } } Some(ancestor) } None => None, }; let ancestor = match (ancestor_to_detach, recorded_branchpoint) { (Some(ancestor), None) => { assert!( !layers.is_empty(), "there should always be at least one layer to inherit" ); Ancestor::NotDetached(ancestor.clone(), detached.ancestor_lsn) } (Some(_), Some(_)) => { panic!( "it should be impossible to get to here without having gone through the tenant reset; if the tenant was reset, then the ancestor_timeline would be None" ); } (None, Some((ancestor_id, ancestor_lsn))) => { // it has been either: // - detached but still exists => we can try reparenting // - detached and deleted // // either way, we must complete assert!( layers.is_empty(), "no layers should had been copied as detach is done" ); let existing = tenant.timelines.lock().unwrap().get(&ancestor_id).cloned(); if let Some(ancestor) = existing { Ancestor::Detached(ancestor, ancestor_lsn) } else { let direct_children = reparented_direct_children(detached, tenant)?; return Ok(DetachingAndReparenting::AlreadyDone(direct_children)); } } (None, None) => { // TODO: make sure there are no `?` before tenant_reset from after a questionmark from // here. panic!( "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor" ); } }; // publish the prepared layers before we reparent any of the timelines, so that on restart // reparented timelines find layers. 
also do the actual detaching. // // if we crash after this operation, a retry will allow reparenting the remaining timelines as // gc is blocked. let (ancestor, ancestor_lsn, was_detached) = match ancestor { Ancestor::NotDetached(ancestor, ancestor_lsn) => { // this has to complete before any reparentings because otherwise they would not have // layers on the new parent. detached .remote_client .schedule_adding_existing_layers_to_index_detach_and_wait( &layers, (ancestor.timeline_id, ancestor_lsn), ) .await .context("publish layers and detach ancestor") .map_err(|e| Error::launder(e, Error::DetachReparent))?; tracing::info!( ancestor=%ancestor.timeline_id, %ancestor_lsn, inherited_layers=%layers.len(), "detached from ancestor" ); (ancestor, ancestor_lsn, true) } Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn, false), }; if let DetachBehavior::MultiLevelAndNoReparent = behavior { // Do not reparent if the user requests to behave so. return Ok(DetachingAndReparenting::Reparented(HashSet::new())); } let mut tasks = tokio::task::JoinSet::new(); // Returns a single permit semaphore which will be used to make one reparenting succeed, // others will fail as if those timelines had been stopped for whatever reason. #[cfg(feature = "testing")] let failpoint_sem = || -> Option> { fail::fail_point!("timeline-detach-ancestor::allow_one_reparented", |_| Some( Arc::new(Semaphore::new(1)) )); None }(); // because we are now keeping the slot in progress, it is unlikely that there will be any // timeline deletions during this time. if we raced one, then we'll just ignore it. 
{ let g = tenant.timelines.lock().unwrap(); reparentable_timelines(g.values(), detached, &ancestor, ancestor_lsn) .cloned() .for_each(|timeline| { // important in this scope: we are holding the Tenant::timelines lock let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id); let new_parent = detached.timeline_id; #[cfg(feature = "testing")] let failpoint_sem = failpoint_sem.clone(); tasks.spawn( async move { let res = async { #[cfg(feature = "testing")] if let Some(failpoint_sem) = failpoint_sem { let _permit = failpoint_sem.acquire().await.map_err(|_| { anyhow::anyhow!( "failpoint: timeline-detach-ancestor::allow_one_reparented", ) })?; failpoint_sem.close(); } timeline .remote_client .schedule_reparenting_and_wait(&new_parent) .await } .await; match res { Ok(()) => { tracing::info!("reparented"); Some(timeline) } Err(e) => { // with the use of tenant slot, raced timeline deletion is the most // likely reason. tracing::warn!("reparenting failed: {e:#}"); None } } } .instrument(span), ); }); } let reparenting_candidates = tasks.len(); let mut reparented = HashSet::with_capacity(tasks.len()); while let Some(res) = tasks.join_next().await { match res { Ok(Some(timeline)) => { assert!( reparented.insert(timeline.timeline_id), "duplicate reparenting? 
timeline_id={}", timeline.timeline_id ); } Err(je) if je.is_cancelled() => unreachable!("not used"), // just ignore failures now, we can retry Ok(None) => {} Err(je) if je.is_panic() => {} Err(je) => tracing::error!("unexpected join error: {je:?}"), } } let reparented_all = reparenting_candidates == reparented.len(); if reparented_all { Ok(DetachingAndReparenting::Reparented(reparented)) } else { tracing::info!( reparented = reparented.len(), candidates = reparenting_candidates, "failed to reparent all candidates; they can be retried after the tenant_reset", ); let must_reset_tenant = !reparented.is_empty() || was_detached; Ok(DetachingAndReparenting::SomeReparentingFailed { must_reset_tenant }) } } pub(super) async fn complete( detached: &Arc, tenant: &TenantShard, mut attempt: Attempt, _ctx: &RequestContext, ) -> Result<(), Error> { assert_eq!(detached.timeline_id, attempt.timeline_id); if attempt.gate_entered.is_none() { let entered = detached.gate.enter().map_err(|_| Error::ShuttingDown)?; attempt.gate_entered = Some(entered); } else { // Some(gate_entered) means the tenant was not restarted, as is not required } assert!(detached.ancestor_timeline.is_none()); // this should be an 503 at least...? fail::fail_point!( "timeline-detach-ancestor::complete_before_uploading", |_| Err(Error::Failpoint( "timeline-detach-ancestor::complete_before_uploading" )) ); tenant .gc_block .remove( detached, crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor, ) .await .map_err(|e| Error::launder(e, Error::Complete))?; Ok(()) } /// Query against a locked `Tenant::timelines`. /// /// A timeline is reparentable if: /// /// - It is not the timeline being detached. /// - It has the same ancestor as the timeline being detached. Note that the ancestor might not be the direct ancestor. 
fn reparentable_timelines<'a, I>( timelines: I, detached: &'a Arc, ancestor: &'a Arc, ancestor_lsn: Lsn, ) -> impl Iterator> + 'a where I: Iterator> + 'a, { timelines.filter_map(move |tl| { if Arc::ptr_eq(tl, detached) { return None; } let tl_ancestor = tl.ancestor_timeline.as_ref()?; let is_same = Arc::ptr_eq(ancestor, tl_ancestor); let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; let is_deleting = tl .delete_progress .try_lock() .map(|flow| !flow.is_not_started()) .unwrap_or(true); if is_same && is_earlier && !is_deleting { Some(tl) } else { None } }) } fn check_no_archived_children_of_ancestor( tenant: &TenantShard, detached: &Arc, ancestor: &Arc, ancestor_lsn: Lsn, detach_behavior: DetachBehavior, ) -> Result<(), Error> { match detach_behavior { DetachBehavior::NoAncestorAndReparent => { let timelines = tenant.timelines.lock().unwrap(); let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); for timeline in reparentable_timelines(timelines.values(), detached, ancestor, ancestor_lsn) { if timeline.is_archived() == Some(true) { return Err(Error::Archived(timeline.timeline_id)); } } for timeline_offloaded in timelines_offloaded.values() { if timeline_offloaded.ancestor_timeline_id != Some(ancestor.timeline_id) { continue; } // This forbids the detach ancestor feature if flattened timelines are present, // even if the ancestor_lsn is from after the branchpoint of the detached timeline. // But as per current design, we don't record the ancestor_lsn of flattened timelines. // This is a bit unfortunate, but as of writing this we don't support flattening // anyway. Maybe we can evolve the data model in the future. if let Some(retain_lsn) = timeline_offloaded.ancestor_retain_lsn { let is_earlier = retain_lsn <= ancestor_lsn; if !is_earlier { continue; } } return Err(Error::Archived(timeline_offloaded.timeline_id)); } } DetachBehavior::MultiLevelAndNoReparent => { // We don't need to check anything if the user requested to not reparent. 
} } Ok(()) } async fn fsync_timeline_dir(timeline: &Timeline, ctx: &RequestContext) { let path = &timeline .conf .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id); let timeline_dir = VirtualFile::open(&path, ctx) .await .fatal_err("VirtualFile::open for timeline dir fsync"); timeline_dir .sync_all() .await .fatal_err("VirtualFile::sync_all timeline dir"); } ================================================ FILE: pageserver/src/tenant/timeline/eviction_task.rs ================================================ //! The per-timeline layer eviction task, which evicts data which has not been accessed for more //! than a given threshold. //! //! Data includes all kinds of caches, namely: //! - (in-memory layers) //! - on-demand downloaded layer files on disk //! - (cached layer file pages) //! - derived data from layer file contents, namely: //! - initial logical size //! - partitioning //! - (other currently missing unknowns) //! //! Items with parentheses are not (yet) touched by this task. //! //! 
See write-up on restart on-demand download spike: use std::collections::HashMap; use std::ops::ControlFlow; use std::sync::Arc; use std::time::{Duration, SystemTime}; use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, info, info_span, instrument, warn}; use utils::completion; use utils::sync::gate::GateGuard; use super::Timeline; use crate::context::{DownloadBehavior, RequestContext}; use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; use crate::tenant::size::CalculateSyntheticSizeError; use crate::tenant::storage_layer::LayerVisibilityHint; use crate::tenant::tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit, sleep_random}; use crate::tenant::timeline::EvictionError; use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; use crate::tenant::{LogicalSizeCalculationCause, TenantShard}; #[derive(Default)] pub struct EvictionTaskTimelineState { last_layer_access_imitation: Option, } #[derive(Default)] pub struct EvictionTaskTenantState { last_layer_access_imitation: Option, } impl Timeline { pub(super) fn launch_eviction_task( self: &Arc, parent: Arc, background_tasks_can_start: Option<&completion::Barrier>, ) { let self_clone = Arc::clone(self); let background_tasks_can_start = background_tasks_can_start.cloned(); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Eviction, self.tenant_shard_id, Some(self.timeline_id), &format!( "layer eviction for {}/{}", self.tenant_shard_id, self.timeline_id ), async move { tokio::select! 
{ _ = self_clone.cancel.cancelled() => { return Ok(()); } _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {} }; self_clone.eviction_task(parent).await; Ok(()) }, ); } #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] async fn eviction_task(self: Arc, tenant: Arc) { // acquire the gate guard only once within a useful span let Ok(guard) = self.gate.enter() else { return; }; { let policy = self.get_eviction_policy(); let period = match policy { EvictionPolicy::LayerAccessThreshold(lat) => lat.period, EvictionPolicy::OnlyImitiate(lat) => lat.period, EvictionPolicy::NoEviction => Duration::from_secs(10), }; if sleep_random(period, &self.cancel).await.is_err() { return; } } let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn) .with_scope_timeline(&self); loop { let policy = self.get_eviction_policy(); let cf = self .eviction_iteration(&tenant, &policy, &self.cancel, &guard, &ctx) .await; match cf { ControlFlow::Break(()) => break, ControlFlow::Continue(sleep_until) => { if tokio::time::timeout_at(sleep_until, self.cancel.cancelled()) .await .is_ok() { break; } } } } } #[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))] async fn eviction_iteration( self: &Arc, tenant: &TenantShard, policy: &EvictionPolicy, cancel: &CancellationToken, gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<(), Instant> { debug!("eviction iteration: {policy:?}"); let start = Instant::now(); let (period, threshold) = match policy { EvictionPolicy::NoEviction => { // check again in 10 seconds; XXX config watch mechanism return ControlFlow::Continue(Instant::now() + Duration::from_secs(10)); } EvictionPolicy::LayerAccessThreshold(p) => { match self .eviction_iteration_threshold(tenant, p, cancel, gate, ctx) .await { ControlFlow::Break(()) => return ControlFlow::Break(()), ControlFlow::Continue(()) => (), } (p.period, 
p.threshold) } EvictionPolicy::OnlyImitiate(p) => { if self .imitiate_only(tenant, p, cancel, gate, ctx) .await .is_break() { return ControlFlow::Break(()); } (p.period, p.threshold) } }; let elapsed = start.elapsed(); crate::tenant::tasks::warn_when_period_overrun( elapsed, period, BackgroundLoopKind::Eviction, ); // FIXME: if we were to mix policies on a pageserver, we would have no way to sense this. I // don't think that is a relevant fear however, and regardless the imitation should be the // most costly part. crate::metrics::EVICTION_ITERATION_DURATION .get_metric_with_label_values(&[ &format!("{}", period.as_secs()), &format!("{}", threshold.as_secs()), ]) .unwrap() .observe(elapsed.as_secs_f64()); ControlFlow::Continue(start + period) } async fn eviction_iteration_threshold( self: &Arc, tenant: &TenantShard, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<()> { let now = SystemTime::now(); let permit = self.acquire_imitation_permit(cancel, ctx).await?; self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx) .await?; #[derive(Debug, Default)] struct EvictionStats { candidates: usize, evicted: usize, errors: usize, not_evictable: usize, timeouts: usize, #[allow(dead_code)] skipped_for_shutdown: usize, } let mut stats = EvictionStats::default(); // Gather layers for eviction. // NB: all the checks can be invalidated as soon as we release the layer map lock. // We don't want to hold the layer map lock during eviction. // So, we just need to deal with this. 
let mut js = tokio::task::JoinSet::new(); { let guard = self.layers.read(LayerManagerLockHolder::Eviction).await; guard .likely_resident_layers() .filter(|layer| { let last_activity_ts = layer.latest_activity(); let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, Err(_e) => { // We reach here if `now` < `last_activity_ts`, which can legitimately // happen if there is an access between us getting `now`, and us getting // the access stats from the layer. // // The other reason why it can happen is system clock skew because // SystemTime::now() is not monotonic, so, even if there is no access // to the layer after we get `now` at the beginning of this function, // it could be that `now` < `last_activity_ts`. // // To distinguish the cases, we would need to record `Instant`s in the // access stats (i.e., monotonic timestamps), but then, the timestamps // values in the access stats would need to be `Instant`'s, and hence // they would be meaningless outside of the pageserver process. // At the time of writing, the trade-off is that access stats are more // valuable than detecting clock skew. return false; } }; match layer.visibility() { LayerVisibilityHint::Visible => { // Usual case: a visible layer might be read any time, and we will keep it // resident until it hits our configured TTL threshold. no_activity_for > p.threshold } LayerVisibilityHint::Covered => { // Covered layers: this is probably a layer that was recently covered by // an image layer during compaction. 
We don't evict it immediately, but // it doesn't stay resident for the full `threshold`: we just keep it // for a shorter time in case // - it is used for Timestamp->LSN lookups // - a new branch is created in recent history which will read this layer no_activity_for > p.period } } }) .cloned() .for_each(|layer| { js.spawn(async move { layer .evict_and_wait(std::time::Duration::from_secs(5)) .await }); stats.candidates += 1; }); }; let join_all = async move { while let Some(next) = js.join_next().await { match next { Ok(Ok(())) => stats.evicted += 1, Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => { stats.not_evictable += 1; } Ok(Err(EvictionError::Timeout)) => { stats.timeouts += 1; } Err(je) if je.is_cancelled() => unreachable!("not used"), Err(je) if je.is_panic() => { /* already logged */ stats.errors += 1; } Err(je) => tracing::error!("unknown JoinError: {je:?}"), } } stats }; tokio::select! { stats = join_all => { if stats.candidates == stats.not_evictable { debug!(stats=?stats, "eviction iteration complete"); } else if stats.errors > 0 || stats.not_evictable > 0 || stats.timeouts > 0 { // reminder: timeouts are not eviction cancellations warn!(stats=?stats, "eviction iteration complete"); } else { info!(stats=?stats, "eviction iteration complete"); } } _ = cancel.cancelled() => { // just drop the joinset to "abort" } } ControlFlow::Continue(()) } /// Like `eviction_iteration_threshold`, but without any eviction. Eviction will be done by /// disk usage based eviction task. 
async fn imitiate_only( self: &Arc, tenant: &TenantShard, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<()> { let permit = self.acquire_imitation_permit(cancel, ctx).await?; self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx) .await } async fn acquire_imitation_permit( &self, cancel: &CancellationToken, ctx: &RequestContext, ) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> { let acquire_permit = crate::tenant::tasks::acquire_concurrency_permit(BackgroundLoopKind::Eviction, ctx); tokio::select! { permit = acquire_permit => ControlFlow::Continue(permit), _ = cancel.cancelled() => ControlFlow::Break(()), _ = self.cancel.cancelled() => ControlFlow::Break(()), } } /// If we evict layers but keep cached values derived from those layers, then /// we face a storm of on-demand downloads after pageserver restart. /// The reason is that the restart empties the caches, and so, the values /// need to be re-computed by accessing layers, which we evicted while the /// caches were filled. /// /// Solutions here would be one of the following: /// 1. Have a persistent cache. /// 2. Count every access to a cached value to the access stats of all layers /// that were accessed to compute the value in the first place. /// 3. Invalidate the caches at a period of < p.threshold/2, so that the values /// get re-computed from layers, thereby counting towards layer access stats. /// 4. Make the eviction task imitate the layer accesses that typically hit caches. 
/// /// We follow approach (4) here because in Neon prod deployment: /// - page cache is quite small => high churn => low hit rate /// => eviction gets correct access stats /// - value-level caches such as logical size & repatition have a high hit rate, /// especially for inactive tenants /// => eviction sees zero accesses for these /// => they cause the on-demand download storm on pageserver restart /// /// We should probably move to persistent caches in the future, or avoid /// having inactive tenants attached to pageserver in the first place. #[instrument(skip_all)] async fn imitate_layer_accesses( &self, tenant: &TenantShard, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, permit: BackgroundLoopSemaphorePermit<'static>, ctx: &RequestContext, ) -> ControlFlow<()> { if !self.tenant_shard_id.is_shard_zero() { // Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size // for consumption metrics (consumption metrics are only sent from shard 0). We may therefore // skip imitating logical size accesses for eviction purposes. return ControlFlow::Continue(()); } let mut state = self.eviction_task_timeline_state.lock().await; // Only do the imitate_layer accesses approximately as often as the threshold. A little // more frequently, to avoid this period racing with the threshold/period-th eviction iteration. let inter_imitate_period = p.threshold.checked_sub(p.period).unwrap_or(p.threshold); match state.last_layer_access_imitation { Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ } _ => { self.imitate_timeline_cached_layer_accesses(gate, ctx).await; state.last_layer_access_imitation = Some(tokio::time::Instant::now()) } } drop(state); if cancel.is_cancelled() { return ControlFlow::Break(()); } // This task is timeline-scoped, but the synthetic size calculation is tenant-scoped. // Make one of the tenant's timelines draw the short straw and run the calculation. 
// The others wait until the calculation is done so that they take into account the // imitated accesses that the winner made. let (mut state, _permit) = { if let Ok(locked) = tenant.eviction_task_tenant_state.try_lock() { (locked, permit) } else { // we might need to wait for a long time here in case of pathological synthetic // size calculation performance drop(permit); let locked = tokio::select! { locked = tenant.eviction_task_tenant_state.lock() => locked, _ = self.cancel.cancelled() => { return ControlFlow::Break(()) }, _ = cancel.cancelled() => { return ControlFlow::Break(()) } }; // then reacquire -- this will be bad if there is a lot of traffic, but because we // released the permit, the overall latency will be much better. let permit = self.acquire_imitation_permit(cancel, ctx).await?; (locked, permit) } }; match state.last_layer_access_imitation { Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ } _ => { self.imitate_synthetic_size_calculation_worker(tenant, cancel, ctx) .await; state.last_layer_access_imitation = Some(tokio::time::Instant::now()); } } drop(state); if cancel.is_cancelled() { return ControlFlow::Break(()); } ControlFlow::Continue(()) } /// Recompute the values which would cause on-demand downloads during restart. #[instrument(skip_all)] async fn imitate_timeline_cached_layer_accesses( &self, guard: &GateGuard, ctx: &RequestContext, ) { let lsn = self.get_last_record_lsn(); // imitiate on-restart initial logical size let size = self .calculate_logical_size( lsn, LogicalSizeCalculationCause::EvictionTaskImitation, guard, ctx, ) .instrument(info_span!("calculate_logical_size")) .await; match &size { Ok(_size) => { // good, don't log it to avoid confusion } Err(_) => { // we have known issues for which we already log this on consumption metrics, // gc, and compaction. leave logging out for now. 
// // https://github.com/neondatabase/neon/issues/2539 } } // imitiate repartiting on first compactation if let Err(e) = self .collect_keyspace(lsn, ctx) .instrument(info_span!("collect_keyspace")) .await { // if this failed, we probably failed logical size because these use the same keys if size.is_err() { // ignore, see above comment } else { match e { CollectKeySpaceError::Cancelled => { // Shutting down, ignore } err => { warn!( "failed to collect keyspace but succeeded in calculating logical size: {err:#}" ); } } } } } // Imitate the synthetic size calculation done by the consumption_metrics module. #[instrument(skip_all)] async fn imitate_synthetic_size_calculation_worker( &self, tenant: &TenantShard, cancel: &CancellationToken, ctx: &RequestContext, ) { if self.conf.metric_collection_endpoint.is_none() { // We don't start the consumption metrics task if this is not set in the config. // So, no need to imitate the accesses in that case. return; } // The consumption metrics are collected on a per-tenant basis, by a single // global background loop. // It limits the number of synthetic size calculations using the global // `concurrent_tenant_size_logical_size_queries` semaphore to not overload // the pageserver. (size calculation is somewhat expensive in terms of CPU and IOs). // // If we used that same semaphore here, then we'd compete for the // same permits, which may impact timeliness of consumption metrics. // That is a no-go, as consumption metrics are much more important // than what we do here. // // So, we have a separate semaphore, initialized to the same // number of permits as the `concurrent_tenant_size_logical_size_queries`. // In the worst, we would have twice the amount of concurrenct size calculations. // But in practice, the `p.threshold` >> `consumption metric interval`, and // we spread out the eviction task using `random_init_delay`. // So, the chance of the worst case is quite low in practice. 
// It runs as a per-tenant task, but the eviction_task.rs is per-timeline. // So, we must coordinate with other with other eviction tasks of this tenant. let limit = self .conf .eviction_task_immitated_concurrent_logical_size_queries .inner(); let mut throwaway_cache = HashMap::new(); let gather = crate::tenant::size::gather_inputs( tenant, limit, None, &mut throwaway_cache, LogicalSizeCalculationCause::EvictionTaskImitation, cancel, ctx, ) .instrument(info_span!("gather_inputs")); tokio::select! { _ = cancel.cancelled() => {} gather_result = gather => { match gather_result { Ok(_) => {}, // It can happen sometimes that we hit this instead of the cancellation token firing above Err(CalculateSyntheticSizeError::Cancelled) => {} Err(e) => { // We don't care about the result, but, if it failed, we should log it, // since consumption metric might be hitting the cached value and // thus not encountering this error. warn!("failed to imitate synthetic size calculation accesses: {e:#}") } } } } } } ================================================ FILE: pageserver/src/tenant/timeline/handle.rs ================================================ //! A cache for [`crate::tenant::mgr`]+`Tenant::get_timeline`+`Timeline::gate.enter()`. //! //! # Motivation //! //! On a single page service connection, we're typically serving a single TenantTimelineId. //! //! Without sharding, there is a single Timeline object to which we dispatch //! all requests. For example, a getpage request gets dispatched to the //! Timeline::get method of the Timeline object that represents the //! (tenant,timeline) of that connection. //! //! With sharding, for each request that comes in on the connection, //! we first have to perform shard routing based on the requested key (=~ page number). //! The result of shard routing is a Timeline object. //! We then dispatch the request to that Timeline object. //! //! Regardless of whether the tenant is sharded or not, we want to ensure that //! 
we hold the Timeline gate open while we're invoking the method on the //! Timeline object. //! //! We want to avoid the overhead of doing, for each incoming request, //! - tenant manager lookup (global rwlock + btreemap lookup for shard routing) //! - cloning the `Arc<Timeline>` out of the tenant manager so we can //! release the mgr rwlock before doing any request processing work //! - re-entering the Timeline gate for each Timeline method invocation. //! //! Regardless of how we accomplish the above, it should not //! prevent the Timeline from shutting down promptly. //! //! //! # Design //! //! ## Data Structures //! //! There are two concepts expressed as associated types in the `Types` trait: //! - `TenantManager`: the thing that performs the expensive work. It produces //! a `Timeline` object, which is the other associated type. //! - `Timeline`: the item that we cache for fast (TenantTimelineId,ShardSelector) lookup. //! //! There are four user-facing data structures exposed by this module: //! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime. //! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime. //! - `Handle`: a smart pointer that derefs to the Types::Timeline. //! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows //! trying to upgrade back to a `Handle`. If successful, a re-upgraded Handle will always //! point to the same cached `Types::Timeline`. Upgrades never invoke the `TenantManager`. //! //! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`. //! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`. //! //! The `HandleInner` is allocated as an `Arc<Mutex<HandleInner>>` and //! referenced weakly and strongly from various places which we are now illustrating. //! For brevity, we will omit the `Arc<Mutex<>>` part in the following and instead //! use `strong ref` and `weak ref` when referring to the `Arc<Mutex<HandleInner>>` //!
or `Weak<Mutex<HandleInner>>`, respectively. //! //! - The `Handle` is a strong ref. //! - The `WeakHandle` is a weak ref. //! - The `PerTimelineState` contains a `HashMap<CacheId, strong ref>`. //! - The `Cache` is a `HashMap<ShardTimelineId, weak ref>`. //! //! Lifetimes: //! - `WeakHandle` and `Handle`: single pagestream request. //! - `Cache`: single page service connection. //! - `PerTimelineState`: lifetime of the Timeline object (i.e., till `Timeline::shutdown`). //! //! ## Request Handling Flow (= filling and using the `Cache`) //! //! To dispatch a request, the page service connection calls `Cache::get`. //! //! A cache miss means we call Types::TenantManager::resolve for shard routing, //! cloning the `Arc<Timeline>` out of it, and entering the gate. The result of //! resolve() is the object we want to cache, and return `Handle`s to for subsequent `Cache::get` calls. //! //! We wrap the object returned from resolve() in an `Arc` and store that inside the //! `Arc<Mutex<HandleInner>>`. A weak ref to the HandleInner is stored in the `Cache` //! and a strong ref in the `PerTimelineState`. //! Another strong ref is returned wrapped in a `Handle`. //! //! For subsequent requests, `Cache::get` will perform a "fast path" shard routing //! and find the weak ref in the cache. //! We upgrade the weak ref to a strong ref and return it wrapped in a `Handle`. //! //! The pagestream processing is pipelined and involves a batching step. //! While a request is batching, the `Handle` is downgraded to a `WeakHandle`. //! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle` //! and the request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`. //! It then drops the `Handle`, and thus the `Arc<Mutex<HandleInner>>` inside it. //! //! # Performance //! //! Remember from the introductory section: //! //! > We want to avoid the overhead of doing, for each incoming request, //! > - tenant manager lookup (global rwlock + btreemap lookup for shard routing) //! > - cloning the `Arc<Timeline>` out of the tenant manager so we can //!
> release the mgr rwlock before doing any request processing work //! > - re-entering the Timeline gate for each Timeline method invocation. //! //! All of these boil down to some state that is either globally shared among all shards //! or state shared among all tasks that serve a particular timeline. //! It is either protected by RwLock or manipulated via atomics. //! Even atomics are costly when shared across multiple cores. //! So, we want to avoid any permanent need for coordination between page_service tasks. //! //! The solution is to add indirection: we wrap the Types::Timeline object that is //! returned by Types::TenantManager into an Arc that is private to the `HandleInner` //! and hence to the single Cache / page_service connection. //! (Review the "Data Structures" section if that is unclear to you.) //! //! //! When upgrading a `WeakHandle`, we upgrade its weak to a strong ref (of the `Mutex<HandleInner>`), //! lock the mutex, take out a clone of the `Arc<Types::Timeline>`, and release the mutex. //! The Mutex is not contended because it is private to the connection. //! And again, the `Arc<Types::Timeline>` clone is cheap because that wrapper //! Arc's refcounts are private to the connection. //! //! Downgrading drops these two Arcs, which again, manipulates refcounts that are private to the connection. //! //! //! # Shutdown //! //! The attentive reader may have noticed the following reference cycle around the `Arc<Timeline>`: //! //! ```text //! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> Timeline //! ``` //! //! Further, there is this cycle: //! //! ```text //! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> GateGuard --keepalive--> Timeline //! ``` //! //! The former cycle is a memory leak if not broken. //! The latter cycle further prevents the Timeline from shutting down //! because we certainly won't drop the Timeline while the GateGuard is alive. //!
Preventing shutdown is the whole point of this handle/cache system, //! but when the Timeline needs to shut down, we need to break the cycle. //! //! The cycle is broken by either //! - Timeline shutdown (=> `PerTimelineState::shutdown`) //! - Connection shutdown (=> dropping the `Cache`). //! //! Both transition the `HandleInner` from [`HandleInner::Open`] to //! [`HandleInner::ShutDown`], which drops the only long-lived //! `Arc<Types::Timeline>`. Once the last short-lived `Arc<Types::Timeline>` //! is dropped, the `Types::Timeline` gets dropped and thereby //! the `GateGuard` and the `Arc<Timeline>` that it stores, //! thereby breaking both cycles. //! //! `PerTimelineState::shutdown` drops all the `HandleInner`s it contains, //! thereby breaking the cycle. //! It also initiates draining of already existing `Handle`s by //! poisoning things so that no new `HandleInner`s can be added //! to the `PerTimelineState`, which will make subsequent `Cache::get` fail. //! //! Concurrently existing / already upgraded `Handle`s will extend the //! lifetime of the `Arc<Mutex<HandleInner>>` and hence cycles. //! However, since `Handle`s are short-lived and new `Handle`s are not //! handed out from `Cache::get` or `WeakHandle::upgrade` after //! `PerTimelineState::shutdown`, that extension of the cycle is bounded. //! //! Concurrently existing `WeakHandle`s will fail to `upgrade()`: //! while they will succeed in upgrading `Weak<Mutex<HandleInner>>`, //! they will find the inner in the `HandleInner::ShutDown` state where the //! `Arc<GateGuard>` and Timeline have already been dropped. //! //! Dropping the `Cache` undoes the registration of this `Cache`'s //! `HandleInner`s from all the `PerTimelineState`s, i.e., it //! removes the strong ref to each of its `HandleInner`s //! from all the `PerTimelineState`s. //! //! # Locking Rules //! //! To prevent deadlocks we: //! //! 1. Only ever hold one of the locks at a time. //! 2. Don't add more than one Drop impl that locks on the //! cycles above. //! //! As per (2), that impl is in `Drop for Cache`. //! //!
# Fast Path for Shard Routing //! //! The `Cache` has a fast path for shard routing to avoid calling into //! the tenant manager for every request. //! //! The `Cache` maintains a hash map of `ShardTimelineId` to `WeakHandle`s. //! //! The current implementation uses the first entry in the hash map //! to determine the `ShardParameters` and derive the correct //! `ShardIndex` for the requested key. //! //! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`. //! //! If the lookup is successful and the `WeakHandle` can be upgraded, //! it's a hit. //! //! ## Cache invalidation //! //! The insight is that cache invalidation is sufficient and most efficient if done lazily. //! The only reasons why an entry in the cache can become stale are: //! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is //! being detached, timeline or shard deleted, or pageserver is shutting down. //! 2. We're doing a shard split and new traffic should be routed to the child shards. //! //! Regarding (1), we will eventually fail to upgrade the `WeakHandle` once the //! timeline has shut down, and when that happens, we remove the entry from the cache. //! //! Regarding (2), the insight is that it is totally fine to keep dispatching requests //! to the parent shard during a shard split. Eventually, the shard split task will //! shut down the parent => case (1).
/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`].
/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct CacheId(u64);

impl CacheId {
    /// Hand out the next process-wide unique id.
    ///
    /// Ids come from a static counter that starts at 1, so `0` is never a
    /// valid id.
    ///
    /// # Panics
    ///
    /// Panics if the counter has wrapped around, i.e. if `fetch_add`
    /// returns 0.
    fn next() -> Self {
        use std::sync::atomic::{AtomicU64, Ordering};

        static NEXT_ID: AtomicU64 = AtomicU64::new(1);

        // `fetch_add` wraps on overflow and returns the previous value;
        // only uniqueness matters here, hence `Relaxed`.
        let id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
        if id == 0 {
            panic!("CacheId::next() returned 0, overflow");
        }
        Self(id)
    }
}
pub struct PerTimelineState { // None = shutting down #[allow(clippy::type_complexity)] handles: Mutex>>>>>, } impl Default for PerTimelineState { fn default() -> Self { Self { handles: Mutex::new(Some(Default::default())), } } } /// Abstract view of [`crate::tenant::mgr`], for testability. pub(crate) trait TenantManager { /// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`]. async fn resolve( &self, timeline_id: TimelineId, shard_selector: ShardSelector, ) -> Result; } /// Abstract view of an [`Arc`], for testability. pub(crate) trait Timeline { fn shard_timeline_id(&self) -> ShardTimelineId; fn get_shard_identity(&self) -> &ShardIdentity; fn per_timeline_state(&self) -> &PerTimelineState; } /// Internal type used in [`Cache::get`]. enum RoutingResult { FastPath(Handle), SlowPath(ShardTimelineId), NeedConsultTenantManager, } impl Cache { /* BEGIN_HADRON */ /// A wrapper of do_get to resolve the tenant shard for a get page request. #[instrument(level = "trace", skip_all)] pub(crate) async fn get( &mut self, timeline_id: TimelineId, shard_selector: ShardSelector, tenant_manager: &T::TenantManager, ) -> Result, GetActiveTimelineError> { const GET_MAX_RETRIES: usize = 10; const RETRY_BACKOFF: Duration = Duration::from_millis(100); let mut attempt = 0; loop { attempt += 1; match self .do_get(timeline_id, shard_selector, tenant_manager) .await { Ok(handle) => return Ok(handle), Err( e @ GetActiveTimelineError::Tenant(GetActiveTenantError::WaitForActiveTimeout { .. }), ) => { // Retry on tenant manager error to handle tenant split more gracefully if attempt < GET_MAX_RETRIES { tokio::time::sleep(RETRY_BACKOFF).await; continue; } else { tracing::info!( "Failed to resolve tenant shard after {} attempts: {:?}", GET_MAX_RETRIES, e ); return Err(e); } } Err(err) => return Err(err), } } } /* END_HADRON */ /// See module-level comment for details. /// /// Does NOT check for the shutdown state of [`Types::Timeline`]. 
/// Instead, the methods of [`Types::Timeline`] that are invoked through /// the [`Handle`] are responsible for checking these conditions /// and if so, return an error that causes the page service to /// close the connection. #[instrument(level = "trace", skip_all)] async fn do_get( &mut self, timeline_id: TimelineId, shard_selector: ShardSelector, tenant_manager: &T::TenantManager, ) -> Result, GetActiveTimelineError> { // terminates because when every iteration we remove an element from the map let miss: ShardSelector = loop { let routing_state = self.shard_routing(timeline_id, shard_selector); match routing_state { RoutingResult::FastPath(handle) => return Ok(handle), RoutingResult::SlowPath(key) => match self.map.get(&key) { Some(cached) => match cached.upgrade() { Ok(upgraded) => return Ok(upgraded), Err(HandleUpgradeError::ShutDown) => { // TODO: dedup with shard_routing() trace!("handle cache stale"); self.map.remove(&key).unwrap(); continue; } }, None => break ShardSelector::Known(key.shard_index), }, RoutingResult::NeedConsultTenantManager => break shard_selector, } }; self.get_miss(timeline_id, miss, tenant_manager).await } #[inline(always)] fn shard_routing( &mut self, timeline_id: TimelineId, shard_selector: ShardSelector, ) -> RoutingResult { loop { // terminates because when every iteration we remove an element from the map let Some((first_key, first_handle)) = self.map.iter().next() else { return RoutingResult::NeedConsultTenantManager; }; let Ok(first_handle) = first_handle.upgrade() else { // TODO: dedup with get() trace!("handle cache stale"); let first_key_owned = *first_key; self.map.remove(&first_key_owned).unwrap(); continue; }; let first_handle_shard_identity = first_handle.get_shard_identity(); let make_shard_index = |shard_num: ShardNumber| ShardIndex { shard_number: shard_num, shard_count: first_handle_shard_identity.count, }; let need_idx = match shard_selector { ShardSelector::Page(key) => { 
make_shard_index(first_handle_shard_identity.get_shard_number(&key)) } ShardSelector::Zero => make_shard_index(ShardNumber(0)), ShardSelector::Known(shard_idx) => shard_idx, }; let need_shard_timeline_id = ShardTimelineId { shard_index: need_idx, timeline_id, }; let first_handle_shard_timeline_id = ShardTimelineId { shard_index: first_handle_shard_identity.shard_index(), timeline_id: first_handle.shard_timeline_id().timeline_id, }; if need_shard_timeline_id == first_handle_shard_timeline_id { return RoutingResult::FastPath(first_handle); } else { return RoutingResult::SlowPath(need_shard_timeline_id); } } } #[instrument(level = "trace", skip_all)] #[inline(always)] async fn get_miss( &mut self, timeline_id: TimelineId, shard_selector: ShardSelector, tenant_manager: &T::TenantManager, ) -> Result, GetActiveTimelineError> { let timeline = tenant_manager.resolve(timeline_id, shard_selector).await?; let key = timeline.shard_timeline_id(); match &shard_selector { ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)), ShardSelector::Page(_) => (), // gotta trust tenant_manager ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index), } trace!("creating new HandleInner"); let timeline = Arc::new(timeline); let handle_inner_arc = Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline)))); let handle_weak = WeakHandle { inner: Arc::downgrade(&handle_inner_arc), }; let handle = handle_weak .upgrade() .ok() .expect("we just created it and it's not linked anywhere yet"); let mut lock_guard = timeline .per_timeline_state() .handles .lock() .expect("mutex poisoned"); let Some(per_timeline_state) = &mut *lock_guard else { return Err(GetActiveTimelineError::Timeline( GetTimelineError::ShuttingDown, )); }; let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc)); assert!(replaced.is_none(), "some earlier code left a stale handle"); match self.map.entry(key) { hash_map::Entry::Occupied(_o) => { // This cannot not happen 
because // 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and // 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle // while we were waiting for the tenant manager. unreachable!() } hash_map::Entry::Vacant(v) => { v.insert(handle_weak); } } Ok(handle) } } pub(crate) enum HandleUpgradeError { ShutDown, } impl WeakHandle { pub(crate) fn upgrade(&self) -> Result, HandleUpgradeError> { let Some(inner) = Weak::upgrade(&self.inner) else { return Err(HandleUpgradeError::ShutDown); }; let lock_guard = inner.lock().expect("poisoned"); match &*lock_guard { HandleInner::Open(open) => { let open = Arc::clone(open); drop(lock_guard); Ok(Handle { open, inner }) } HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown), } } pub(crate) fn is_same_handle_as(&self, other: &WeakHandle) -> bool { Weak::ptr_eq(&self.inner, &other.inner) } } impl std::ops::Deref for Handle { type Target = T::Timeline; fn deref(&self) -> &Self::Target { &self.open } } impl Handle { pub(crate) fn downgrade(&self) -> WeakHandle { WeakHandle { inner: Arc::downgrade(&self.inner), } } } impl PerTimelineState { /// After this method returns, [`Cache::get`] will never again return a [`Handle`] /// to the [`Types::Timeline`] that embeds this per-timeline state. /// Even if [`TenantManager::resolve`] would still resolve to it. /// /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`Types::Timeline`] alive. /// That's ok because they're short-lived. See module-level comment for details. #[instrument(level = "trace", skip_all)] pub(super) fn shutdown(&self) { let handles = self .handles .lock() .expect("mutex poisoned") // NB: this .take() sets locked to None. // That's what makes future `Cache::get` misses fail. // Cache hits are taken care of below. .take(); let Some(handles) = handles else { trace!("already shut down"); return; }; for handle_inner_arc in handles.values() { // Make hits fail. 
let mut lock_guard = handle_inner_arc.lock().expect("poisoned"); lock_guard.shutdown(); } drop(handles); } } // When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle. impl Drop for Cache { fn drop(&mut self) { for ( _, WeakHandle { inner: handle_inner_weak, }, ) in self.map.drain() { let Some(handle_inner_arc) = handle_inner_weak.upgrade() else { continue; }; let Some(handle_timeline) = handle_inner_arc // locking rules: drop lock before acquiring other lock below .lock() .expect("poisoned") .shutdown() else { // Concurrent PerTimelineState::shutdown. continue; }; // Clean up per_timeline_state so the HandleInner allocation can be dropped. let per_timeline_state = handle_timeline.per_timeline_state(); let mut handles_lock_guard = per_timeline_state.handles.lock().expect("mutex poisoned"); let Some(handles) = &mut *handles_lock_guard else { continue; }; let Some(removed_handle_inner_arc) = handles.remove(&self.id) else { // Concurrent PerTimelineState::shutdown. continue; }; drop(handles_lock_guard); // locking rules! assert!(Arc::ptr_eq(&removed_handle_inner_arc, &handle_inner_arc)); } } } impl HandleInner { fn shutdown(&mut self) -> Option> { match std::mem::replace(self, HandleInner::ShutDown) { HandleInner::Open(timeline) => Some(timeline), HandleInner::ShutDown => { // Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown // may do it concurrently, but locking rules disallow holding per-timeline-state lock and // the handle lock at the same time. 
None } } } } #[cfg(test)] mod tests { use std::sync::Weak; use pageserver_api::key::{DBDIR_KEY, Key, rel_block_to_key}; use pageserver_api::models::ShardParameters; use pageserver_api::reltag::RelTag; use pageserver_api::shard::DEFAULT_STRIPE_SIZE; use utils::id::TenantId; use utils::shard::{ShardCount, TenantShardId}; use utils::sync::gate::GateGuard; use super::*; const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX); #[derive(Debug)] struct TestTypes; impl Types for TestTypes { type TenantManager = StubManager; type Timeline = Entered; } struct StubManager { shards: Vec>, } struct StubTimeline { gate: utils::sync::gate::Gate, id: TimelineId, shard: ShardIdentity, per_timeline_state: PerTimelineState, myself: Weak, } struct Entered { timeline: Arc, #[allow(dead_code)] // it's stored here to keep the gate open gate_guard: Arc, } impl StubTimeline { fn getpage(&self) { // do nothing } } impl Timeline for Entered { fn shard_timeline_id(&self) -> ShardTimelineId { ShardTimelineId { shard_index: self.shard.shard_index(), timeline_id: self.id, } } fn get_shard_identity(&self) -> &ShardIdentity { &self.shard } fn per_timeline_state(&self) -> &PerTimelineState { &self.per_timeline_state } } impl TenantManager for StubManager { async fn resolve( &self, timeline_id: TimelineId, shard_selector: ShardSelector, ) -> Result { fn enter_gate( timeline: &StubTimeline, ) -> Result, GetActiveTimelineError> { Ok(Arc::new(timeline.gate.enter().map_err(|_| { GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) })?)) } for timeline in &self.shards { if timeline.id == timeline_id { match &shard_selector { ShardSelector::Zero if timeline.shard.is_shard_zero() => { return Ok(Entered { timeline: Arc::clone(timeline), gate_guard: enter_gate(timeline)?, }); } ShardSelector::Zero => continue, ShardSelector::Page(key) if timeline.shard.is_key_local(key) => { return Ok(Entered { timeline: Arc::clone(timeline), gate_guard: enter_gate(timeline)?, }); } 
ShardSelector::Page(_) => continue, ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => { return Ok(Entered { timeline: Arc::clone(timeline), gate_guard: enter_gate(timeline)?, }); } ShardSelector::Known(_) => continue, } } } Err(GetActiveTimelineError::Timeline( GetTimelineError::NotFound { tenant_id: TenantShardId::unsharded(TenantId::from([0; 16])), timeline_id, }, )) } } impl std::ops::Deref for Entered { type Target = StubTimeline; fn deref(&self) -> &Self::Target { &self.timeline } } #[tokio::test(start_paused = true)] async fn test_timeline_shutdown() { crate::tenant::harness::setup_logging(); let timeline_id = TimelineId::generate(); let shard0 = Arc::new_cyclic(|myself| StubTimeline { gate: Default::default(), id: timeline_id, shard: ShardIdentity::unsharded(), per_timeline_state: PerTimelineState::default(), myself: myself.clone(), }); let mgr = StubManager { shards: vec![shard0.clone()], }; let key = DBDIR_KEY; let mut cache = Cache::::default(); // // fill the cache // let handle: Handle<_> = cache .get(timeline_id, ShardSelector::Page(key), &mgr) .await .expect("we have the timeline"); assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); assert_eq!(cache.map.len(), 1); drop(handle); // // demonstrate that Handle holds up gate closure // but shutdown prevents new handles from being handed out // tokio::select! 
{ _ = shard0.gate.close() => { panic!("cache and per-timeline handler state keep cache open"); } _ = tokio::time::sleep(FOREVER) => { // NB: first poll of close() makes it enter closing state } } let handle = cache .get(timeline_id, ShardSelector::Page(key), &mgr) .await .expect("we have the timeline"); assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); // SHUTDOWN shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown assert_eq!( cache.map.len(), 1, "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after" ); // this handle is perfectly usable handle.getpage(); cache .get(timeline_id, ShardSelector::Page(key), &mgr) .await .err() .expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle"); assert_eq!( cache.map.len(), 0, "first access after shutdown cleans up the Weak's from the cache" ); tokio::select! { _ = shard0.gate.close() => { panic!("handle is keeping gate open"); } _ = tokio::time::sleep(FOREVER) => { } } drop(handle); // closing gate succeeds after dropping handle tokio::select! 
{ _ = shard0.gate.close() => { } _ = tokio::time::sleep(FOREVER) => { panic!("handle is dropped, no other gate holders exist") } } // map gets cleaned on next lookup cache .get(timeline_id, ShardSelector::Page(key), &mgr) .await .err() .expect("documented behavior: can't get new handle after shutdown"); assert_eq!(cache.map.len(), 0); // ensure all refs to shard0 are gone and we're not leaking anything drop(shard0); drop(mgr); } #[tokio::test] async fn test_multiple_timelines_and_deletion() { crate::tenant::harness::setup_logging(); let timeline_a = TimelineId::generate(); let timeline_b = TimelineId::generate(); assert_ne!(timeline_a, timeline_b); let timeline_a = Arc::new_cyclic(|myself| StubTimeline { gate: Default::default(), id: timeline_a, shard: ShardIdentity::unsharded(), per_timeline_state: PerTimelineState::default(), myself: myself.clone(), }); let timeline_b = Arc::new_cyclic(|myself| StubTimeline { gate: Default::default(), id: timeline_b, shard: ShardIdentity::unsharded(), per_timeline_state: PerTimelineState::default(), myself: myself.clone(), }); let mut mgr = StubManager { shards: vec![timeline_a.clone(), timeline_b.clone()], }; let key = DBDIR_KEY; let mut cache = Cache::::default(); cache .get(timeline_a.id, ShardSelector::Page(key), &mgr) .await .expect("we have it"); cache .get(timeline_b.id, ShardSelector::Page(key), &mgr) .await .expect("we have it"); assert_eq!(cache.map.len(), 2); // delete timeline A timeline_a.per_timeline_state.shutdown(); mgr.shards.retain(|t| t.id != timeline_a.id); assert!( mgr.resolve(timeline_a.id, ShardSelector::Page(key)) .await .is_err(), "broken StubManager implementation" ); assert_eq!( cache.map.len(), 2, "cache still has a Weak handle to Timeline A" ); cache .get(timeline_a.id, ShardSelector::Page(key), &mgr) .await .err() .expect("documented behavior: can't get new handle after shutdown"); assert_eq!(cache.map.len(), 1, "next access cleans up the cache"); cache .get(timeline_b.id, ShardSelector::Page(key), 
                &mgr)
            .await
            .expect("we still have it");
    }

    /// Builds the key of a relation block for a fixed, hard-coded `RelTag`,
    /// using block number `shard * stripe_size`.
    ///
    /// NOTE(review): presumably this places the key in the stripe that maps to
    /// `shard` under `params` — confirm against the striping implementation.
    fn make_relation_key_for_shard(shard: ShardNumber, params: ShardParameters) -> Key {
        rel_block_to_key(
            RelTag {
                spcnode: 1663,
                dbnode: 208101,
                relnode: 2620,
                forknum: 0,
            },
            shard.0 as u32 * params.stripe_size.0,
        )
    }

    /// Walks the cache through a shard split: the cached parent keeps being
    /// returned until its per-timeline state is shut down, after which lookups
    /// resolve to the children; a still-held parent handle keeps the parent
    /// gate open until it is dropped.
    #[tokio::test(start_paused = true)]
    async fn test_shard_split() {
        crate::tenant::harness::setup_logging();
        let timeline_id = TimelineId::generate();
        let parent = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let child_params = ShardParameters {
            count: ShardCount(2),
            stripe_size: DEFAULT_STRIPE_SIZE,
        };
        let child0 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::from_params(ShardNumber(0), child_params),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let child1 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::from_params(ShardNumber(1), child_params),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        // Indexed by shard number so the assertions below can look up the expected child.
        let child_shards_by_shard_number = [child0.clone(), child1.clone()];

        // NOTE(review): the type argument between `::` and `::` appears to have been
        // lost when this text was extracted — confirm against the repository.
        let mut cache = Cache::::default();

        // fill the cache with the parent
        for i in 0..2 {
            let handle = cache
                .get(
                    timeline_id,
                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)),
                    &StubManager {
                        shards: vec![parent.clone()],
                    },
                )
                .await
                .expect("we have it");
            assert!(
                Weak::ptr_eq(&handle.myself, &parent.myself),
                "mgr returns parent first"
            );
            drop(handle);
        }

        //
        // SHARD SPLIT: tenant manager changes, but the cache isn't informed
        //

        // while we haven't shut down the parent, the cache will return the cached parent, even
        // if the tenant manager returns the child
        for i in 0..2 {
            let handle = cache
                .get(
                    timeline_id,
                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)),
                    &StubManager {
                        shards: vec![], // doesn't matter what's in here, the cache is fully loaded
                    },
                )
                .await
                .expect("we have it");
            assert!(
                Weak::ptr_eq(&handle.myself, &parent.myself),
                "mgr returns parent"
            );
            drop(handle);
        }

        // Keep one parent handle alive across the invalidation below.
        let parent_handle = cache
            .get(
                timeline_id,
                ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), child_params)),
                &StubManager {
                    shards: vec![parent.clone()],
                },
            )
            .await
            .expect("we have it");
        assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself));

        // invalidate the cache
        parent.per_timeline_state.shutdown();

        // the cache will now return the child, even though the parent handle still exists
        for i in 0..2 {
            let handle = cache
                .get(
                    timeline_id,
                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)),
                    &StubManager {
                        shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
                    },
                )
                .await
                .expect("we have it");
            assert!(
                Weak::ptr_eq(
                    &handle.myself,
                    &child_shards_by_shard_number[i as usize].myself
                ),
                "mgr returns child"
            );
            drop(handle);
        }

        // all the while the parent handle kept the parent gate open
        tokio::select! {
            _ = parent_handle.gate.close() => {
                panic!("parent handle is keeping gate open");
            }
            _ = tokio::time::sleep(FOREVER) => { }
        }
        // Once the last parent handle is gone, closing the gate must complete.
        drop(parent_handle);
        tokio::select! {
            _ = parent.gate.close() => { }
            _ = tokio::time::sleep(FOREVER) => {
                panic!("parent handle is dropped, no other gate holders exist")
            }
        }
    }

    /// Opens, uses and drops ten short-lived caches against the same timeline
    /// and then checks that the timeline gate can be closed without an explicit
    /// per-timeline-state shutdown.
    #[tokio::test(start_paused = true)]
    async fn test_connection_handler_exit() {
        crate::tenant::harness::setup_logging();
        let timeline_id = TimelineId::generate();
        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let mgr = StubManager {
            shards: vec![shard0.clone()],
        };
        let key = DBDIR_KEY;

        // Simulate 10 connections that are opened, used, and closed
        for _ in 0..10 {
            let mut cache = Cache::::default();
            let handle = {
                let handle = cache
                    .get(timeline_id, ShardSelector::Page(key), &mgr)
                    .await
                    .expect("we have the timeline");
                assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
                handle
            };
            handle.getpage();
        }

        // No handles exist, thus gates are closed and don't require shutdown.
        // Thus the gate should close immediately, even without shutdown.
        tokio::select! {
            _ = shard0.gate.close() => { }
            _ = tokio::time::sleep(FOREVER) => {
                panic!("handle is dropped, no other gate holders exist")
            }
        }
    }

    /// Checks the weak-handle contract exercised here: a weak handle can be
    /// upgraded before shutdown, cannot be upgraded once shutdown has started,
    /// and does not by itself keep the gate open or the timeline referenced.
    #[tokio::test(start_paused = true)]
    async fn test_weak_handles() {
        crate::tenant::harness::setup_logging();
        let timeline_id = TimelineId::generate();
        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let mgr = StubManager {
            shards: vec![shard0.clone()],
        };

        // Baseline strong count, compared against at the very end of the test.
        let refcount_start = Arc::strong_count(&shard0);

        let key = DBDIR_KEY;

        let mut cache = Cache::::default();

        let handle = cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have the timeline");
        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));

        let weak_handle = handle.downgrade();

        drop(handle);

        let upgraded_handle = weak_handle.upgrade().ok().expect("we can upgrade it");

        // Start shutdown
        shard0.per_timeline_state.shutdown();

        // Upgrades during shutdown don't work, even if upgraded_handle exists.
        weak_handle
            .upgrade()
            .err()
            .expect("can't upgrade weak handle as soon as shutdown started");

        // But upgraded_handle is still alive, so the gate won't close.
        tokio::select! {
            _ = shard0.gate.close() => {
                panic!("handle is keeping gate open");
            }
            _ = tokio::time::sleep(FOREVER) => { }
        }

        // Drop the last handle.
        drop(upgraded_handle);

        // The gate should close now, despite there still being a weak_handle.
        tokio::select! {
            _ = shard0.gate.close() => { }
            _ = tokio::time::sleep(FOREVER) => {
                panic!("only strong handle is dropped and we shut down per-timeline-state")
            }
        }

        // The weak handle still can't be upgraded.
        weak_handle
            .upgrade()
            .err()
            .expect("still shouldn't be able to upgrade the weak handle");

        // There should be no strong references to the timeline object except the one on "stack".
        assert_eq!(Arc::strong_count(&shard0), refcount_start);
    }

    /// Checks that dropping the cache removes the per-timeline-state reference
    /// to the handle allocation, so that the allocation is freed as soon as the
    /// last locally held handle is dropped.
    #[tokio::test(start_paused = true)]
    async fn test_reference_cycle_broken_when_cache_is_dropped() {
        crate::tenant::harness::setup_logging();
        let timeline_id = TimelineId::generate();
        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let mgr = StubManager {
            shards: vec![shard0.clone()],
        };
        let key = DBDIR_KEY;

        let mut cache = Cache::::default();

        // helper to check if a handle is referenced by per_timeline_state
        // NOTE(review): the parameter type below lost its type arguments during
        // extraction — confirm the full `Weak<...>` type against the repository.
        let per_timeline_state_refs_handle = |handle_weak: &Weak>>| {
            let per_timeline_state = shard0.per_timeline_state.handles.lock().unwrap();
            let per_timeline_state = per_timeline_state.as_ref().unwrap();
            per_timeline_state
                .values()
                .any(|v| Weak::ptr_eq(&Arc::downgrade(v), handle_weak))
        };

        // Fill the cache.
        let handle = cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have the timeline");
        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
        let handle_inner_weak = Arc::downgrade(&handle.inner);
        assert!(
            per_timeline_state_refs_handle(&handle_inner_weak),
            "we still hold `handle` _and_ haven't dropped `cache` yet"
        );

        // Drop the cache.
        drop(cache);

        assert!(
            !(per_timeline_state_refs_handle(&handle_inner_weak)),
            "nothing should reference the handle allocation anymore"
        );
        assert!(
            Weak::upgrade(&handle_inner_weak).is_some(),
            "the local `handle` still keeps the allocation alive"
        );
        // but obviously the cache is gone so no new allocations can be handed out.

        // Drop handle.
        drop(handle);
        assert!(
            Weak::upgrade(&handle_inner_weak).is_none(),
            "the local `handle` is dropped, so the allocation should be dropped by now"
        );
    }

    /// Checks that shutting down the per-timeline state releases the handle
    /// allocation even though the cache that created it is still alive.
    #[tokio::test(start_paused = true)]
    async fn test_reference_cycle_broken_when_per_timeline_state_shutdown() {
        crate::tenant::harness::setup_logging();
        let timeline_id = TimelineId::generate();
        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let mgr = StubManager {
            shards: vec![shard0.clone()],
        };
        let key = DBDIR_KEY;

        let mut cache = Cache::::default();
        let handle = cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have the timeline");
        // grab a weak reference to the inner so we can later try to Weak::upgrade it and assert that fails
        let handle_inner_weak = Arc::downgrade(&handle.inner);

        // drop the handle, obviously the lifetime of `inner` is at least as long as each strong reference to it
        drop(handle);
        assert!(Weak::upgrade(&handle_inner_weak).is_some(), "can still");

        // Shutdown the per_timeline_state.
        shard0.per_timeline_state.shutdown();
        assert!(Weak::upgrade(&handle_inner_weak).is_none(), "can no longer");

        // cache only contains Weak's, so, it can outlive the per_timeline_state without
        // Drop explicitly solely to make this point.
        drop(cache);
    }
}

================================================
FILE: pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
================================================
//! Timeline utility module to hydrate everything from the current heatmap.
//!
//! Provides utilities to spawn and abort a background task where the downloads happen.
//! See /v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers.
use std::sync::{Arc, Mutex}; use futures::StreamExt; use http_utils::error::ApiError; use tokio_util::sync::CancellationToken; use utils::sync::gate::Gate; use crate::context::RequestContext; use super::Timeline; // This status is not strictly necessary now, but gives us a nice place // to store progress information if we ever wish to expose it. pub(super) enum HeatmapLayersDownloadStatus { InProgress, Complete, } pub(super) struct HeatmapLayersDownloader { handle: tokio::task::JoinHandle<()>, status: Arc>, cancel: CancellationToken, downloads_guard: Arc, } impl HeatmapLayersDownloader { fn new( timeline: Arc, concurrency: usize, recurse: bool, ctx: RequestContext, ) -> Result { let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?; let cancel = timeline.cancel.child_token(); let downloads_guard = Arc::new(Gate::default()); let status = Arc::new(Mutex::new(HeatmapLayersDownloadStatus::InProgress)); let handle = tokio::task::spawn({ let status = status.clone(); let downloads_guard = downloads_guard.clone(); let cancel = cancel.clone(); async move { let _guard = tl_guard; scopeguard::defer! { *status.lock().unwrap() = HeatmapLayersDownloadStatus::Complete; } let Some(heatmap) = timeline.generate_heatmap().await else { tracing::info!("Heatmap layers download failed to generate heatmap"); return; }; tracing::info!( resident_size=%timeline.resident_physical_size(), heatmap_layers=%heatmap.all_layers().count(), "Starting heatmap layers download" ); let stream = futures::stream::iter(heatmap.all_layers().cloned().filter_map( |layer| { let ctx = ctx.attached_child(); let tl = timeline.clone(); let dl_guard = match downloads_guard.enter() { Ok(g) => g, Err(_) => { // [`Self::shutdown`] was called. Don't spawn any more downloads. 
return None; } }; Some(async move { let _dl_guard = dl_guard; let res = tl.download_layer(&layer.name, &ctx).await; if let Err(err) = res { if !err.is_cancelled() { tracing::warn!(layer=%layer.name,"Failed to download heatmap layer: {err}") } } }) } )).buffered(concurrency); tokio::select! { _ = stream.collect::<()>() => { tracing::info!( resident_size=%timeline.resident_physical_size(), "Heatmap layers download completed" ); }, _ = cancel.cancelled() => { tracing::info!("Heatmap layers download cancelled"); return; } } if recurse { if let Some(ancestor) = timeline.ancestor_timeline() { let ctx = ctx.attached_child(); let res = ancestor.start_heatmap_layers_download(concurrency, recurse, &ctx); if let Err(err) = res { tracing::info!( "Failed to start heatmap layers download for ancestor: {err}" ); } } } } }); Ok(Self { status, handle, cancel, downloads_guard, }) } fn is_complete(&self) -> bool { matches!( *self.status.lock().unwrap(), HeatmapLayersDownloadStatus::Complete ) } /// Drive any in-progress downloads to completion and stop spawning any new ones. /// /// This has two callers and they behave differently /// 1. [`Timeline::shutdown`]: the drain will be immediate since downloads themselves /// are sensitive to timeline cancellation. /// /// 2. Endpoint handler in [`crate::http::routes`]: the drain will wait for any in-progress /// downloads to complete. async fn stop_and_drain(self) { // Counterintuitive: close the guard before cancelling. // Something needs to poll the already created download futures to completion. // If we cancel first, then the underlying task exits and we lost // the poller. 
self.downloads_guard.close().await; self.cancel.cancel(); if let Err(err) = self.handle.await { tracing::warn!("Failed to join heatmap layer downloader task: {err}"); } } } impl Timeline { pub(crate) fn start_heatmap_layers_download( self: &Arc, concurrency: usize, recurse: bool, ctx: &RequestContext, ) -> Result<(), ApiError> { let mut locked = self.heatmap_layers_downloader.lock().unwrap(); if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) { let dl = HeatmapLayersDownloader::new( self.clone(), concurrency, recurse, ctx.attached_child(), )?; *locked = Some(dl); Ok(()) } else { Err(ApiError::Conflict("Already running".to_string())) } } pub(crate) async fn stop_and_drain_heatmap_layers_download(&self) { // This can race with the start of a new downloader and lead to a situation // where one donloader is shutting down and another one is in-flight. // The only impact is that we'd end up using more remote storage semaphore // units than expected. let downloader = self.heatmap_layers_downloader.lock().unwrap().take(); if let Some(dl) = downloader { dl.stop_and_drain().await; } } } ================================================ FILE: pageserver/src/tenant/timeline/import_pgdata/flow.rs ================================================ //! Import a PGDATA directory into an empty root timeline. //! //! This module is adapted hackathon code by Heikki and Stas. //! Other code in the parent module was written by Christian as part of a customer PoC. //! //! The hackathon code was producing image layer files as a free-standing program. //! //! It has been modified to //! - run inside a running Pageserver, within the proper lifecycles of Timeline -> Tenant(Shard) //! - => sharding-awareness: produce image layers with only the data relevant for this shard //! - => S3 as the source for the PGDATA instead of local filesystem //! //! TODOs before productionization: //! - ChunkProcessingJob should cut up an ImportJob to hit exactly target image layer size. //! //! 
An incomplete set of TODOs from the Hackathon: //! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest) use std::collections::HashSet; use std::hash::{Hash, Hasher}; use std::num::NonZeroUsize; use std::ops::Range; use std::sync::Arc; use anyhow::ensure; use bytes::Bytes; use futures::stream::FuturesOrdered; use itertools::Itertools; use pageserver_api::config::TimelineImportConfig; use pageserver_api::key::{ CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, Key, TWOPHASEDIR_KEY, rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, slru_segment_size_to_key, }; use pageserver_api::keyspace::{ShardedRange, singleton_range}; use pageserver_api::models::{ShardImportProgress, ShardImportProgressV1, ShardImportStatus}; use pageserver_api::reltag::{RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; use postgres_ffi::BLCKSZ; use postgres_ffi::relfile_utils::parse_relfilename; use remote_storage::RemotePath; use tokio::sync::Semaphore; use tokio_stream::StreamExt; use tracing::{debug, instrument}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; use utils::pausable_failpoint; use super::Timeline; use super::importbucket_client::{ControlFile, RemoteStorageWrapper}; use crate::assert_u64_eq_usize::UsizeIsU64; use crate::context::{DownloadBehavior, RequestContext}; use crate::controller_upcall_client::{StorageControllerUpcallApi, StorageControllerUpcallClient}; use crate::pgdatadir_mapping::{ DbDirectory, RelDirectory, SlruSegmentDirectory, TwoPhaseDirectory, }; use crate::task_mgr::TaskKind; use crate::tenant::storage_layer::{AsLayerDesc, ImageLayerWriter, Layer}; use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; pub async fn run( timeline: Arc, control_file: ControlFile, storage: RemoteStorageWrapper, import_progress: Option, ctx: &RequestContext, ) -> anyhow::Result<()> { // Match how we run the import based on the progress version. 
// If there's no import progress, it means that this is a new import // and we can use whichever version we want. match import_progress { Some(ShardImportProgress::V1(progress)) => { run_v1(timeline, control_file, storage, Some(progress), ctx).await } None => run_v1(timeline, control_file, storage, None, ctx).await, } } async fn run_v1( timeline: Arc, control_file: ControlFile, storage: RemoteStorageWrapper, import_progress: Option, ctx: &RequestContext, ) -> anyhow::Result<()> { let planner = Planner { control_file, storage: storage.clone(), shard: timeline.shard_identity, tasks: Vec::default(), }; // Use the job size limit encoded in the progress if we are resuming an import. // This ensures that imports have stable plans even if the pageserver config changes. let import_config = { match &import_progress { Some(progress) => { let base = &timeline.conf.timeline_import_config; TimelineImportConfig { import_job_soft_size_limit: NonZeroUsize::new(progress.job_soft_size_limit) .unwrap(), import_job_concurrency: base.import_job_concurrency, import_job_checkpoint_threshold: base.import_job_checkpoint_threshold, import_job_max_byte_range_size: base.import_job_max_byte_range_size, } } None => timeline.conf.timeline_import_config.clone(), } }; let plan = planner.plan(&import_config).await?; // Hash the plan and compare with the hash of the plan we got back from the storage controller. // If the two match, it means that the planning stage had the same output. // // This is not intended to be a cryptographically secure hash. 
const SEED: u64 = 42; let mut hasher = twox_hash::XxHash64::with_seed(SEED); plan.hash(&mut hasher); let plan_hash = hasher.finish(); if let Some(progress) = &import_progress { // Handle collisions on jobs of unequal length if progress.jobs != plan.jobs.len() { anyhow::bail!("Import plan job length does not match storcon metadata") } if plan_hash != progress.import_plan_hash { anyhow::bail!("Import plan does not match storcon metadata"); } } pausable_failpoint!("import-timeline-pre-execute-pausable"); let jobs_count = import_progress.as_ref().map(|p| p.jobs); let start_from_job_idx = import_progress.map(|progress| progress.completed); tracing::info!( start_from_job_idx=?start_from_job_idx, jobs=?jobs_count, "Executing import plan" ); plan.execute(timeline, start_from_job_idx, plan_hash, &import_config, ctx) .await } struct Planner { control_file: ControlFile, storage: RemoteStorageWrapper, shard: ShardIdentity, tasks: Vec, } #[derive(Hash)] struct Plan { jobs: Vec, // Included here such that it ends up in the hash for the plan shard: ShardIdentity, } impl Planner { /// Creates an import plan /// /// This function is and must remain pure: given the same input, it will generate the same import plan. 
async fn plan(mut self, import_config: &TimelineImportConfig) -> anyhow::Result { let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align(); anyhow::ensure!(pgdata_lsn.is_valid()); let datadir = PgDataDir::new(&self.storage).await?; // Import dbdir (00:00:00 keyspace) // This is just constructed here, but will be written to the image layer in the first call to import_db() let dbdir_buf = Bytes::from(DbDirectory::ser(&DbDirectory { dbdirs: datadir .dbs .iter() .map(|db| ((db.spcnode, db.dboid), true)) .collect(), })?); self.tasks .push(ImportSingleKeyTask::new(DBDIR_KEY, dbdir_buf).into()); // Import databases (00:spcnode:dbnode keyspace for each db) for db in datadir.dbs { self.import_db(&db).await?; } // Import SLRUs if self.shard.is_shard_zero() { // pg_xact (01:00 keyspace) self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) .await?; // pg_multixact/members (01:01 keyspace) self.import_slru( SlruKind::MultiXactMembers, &self.storage.pgdata().join("pg_multixact/members"), ) .await?; // pg_multixact/offsets (01:02 keyspace) self.import_slru( SlruKind::MultiXactOffsets, &self.storage.pgdata().join("pg_multixact/offsets"), ) .await?; } // Import pg_twophase. // TODO: as empty let twophasedir_buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { xids: HashSet::new(), })?; self.tasks .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( TWOPHASEDIR_KEY, Bytes::from(twophasedir_buf), ))); // Controlfile, checkpoint self.tasks .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( CONTROLFILE_KEY, self.control_file.control_file_buf().clone(), ))); let checkpoint_buf = self .control_file .control_file_data() .checkPointCopy .encode()?; self.tasks .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( CHECKPOINT_KEY, checkpoint_buf, ))); // Sort the tasks by the key ranges they handle. // The plan being generated here needs to be stable across invocations // of this method. 
self.tasks.sort_by_key(|task| match task { AnyImportTask::SingleKey(key) => (key.key, key.key.next()), AnyImportTask::RelBlocks(rel_blocks) => { (rel_blocks.key_range.start, rel_blocks.key_range.end) } AnyImportTask::SlruBlocks(slru_blocks) => { (slru_blocks.key_range.start, slru_blocks.key_range.end) } }); // Assigns parts of key space to later parallel jobs // Note: The image layers produced here may have gaps, meaning, // there is not an image for each key in the layer's key range. // The read path stops traversal at the first image layer, regardless // of whether a base image has been found for a key or not. // (Concept of sparse image layers doesn't exist.) // This behavior is exactly right for the base image layers we're producing here. // But, since no other place in the code currently produces image layers with gaps, // it seems noteworthy. let mut last_end_key = Key::MIN; let mut current_chunk = Vec::new(); let mut current_chunk_size: usize = 0; let mut jobs = Vec::new(); for task in std::mem::take(&mut self.tasks).into_iter() { let task_size = task.total_size(&self.shard); let projected_chunk_size = current_chunk_size.saturating_add(task_size); if projected_chunk_size > import_config.import_job_soft_size_limit.into() { let key_range = last_end_key..task.key_range().start; jobs.push(ChunkProcessingJob::new( key_range.clone(), std::mem::take(&mut current_chunk), pgdata_lsn, )); last_end_key = key_range.end; current_chunk_size = 0; } current_chunk_size = current_chunk_size.saturating_add(task_size); current_chunk.push(task); } jobs.push(ChunkProcessingJob::new( last_end_key..Key::MAX, current_chunk, pgdata_lsn, )); Ok(Plan { jobs, shard: self.shard, }) } #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))] async fn import_db(&mut self, db: &PgDataDirDb) -> anyhow::Result<()> { debug!("start"); scopeguard::defer! 
{ debug!("return"); } // Import relmap (00:spcnode:dbnode:00:*:00) let relmap_key = relmap_file_key(db.spcnode, db.dboid); debug!("Constructing relmap entry, key {relmap_key}"); let relmap_path = db.path.join("pg_filenode.map"); let relmap_buf = self.storage.get(&relmap_path).await?; self.tasks .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( relmap_key, relmap_buf, ))); // Import reldir (00:spcnode:dbnode:00:*:01) let reldir_key = rel_dir_to_key(db.spcnode, db.dboid); debug!("Constructing reldirs entry, key {reldir_key}"); let reldir_buf = RelDirectory::ser(&RelDirectory { rels: db .files .iter() .map(|f| (f.rel_tag.relnode, f.rel_tag.forknum)) .collect(), })?; self.tasks .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( reldir_key, Bytes::from(reldir_buf), ))); // Import data (00:spcnode:dbnode:reloid:fork:blk) and set sizes for each last // segment in a given relation (00:spcnode:dbnode:reloid:fork:ff) for file in &db.files { debug!(%file.path, %file.filesize, "importing file"); let len = file.filesize; ensure!(len % 8192 == 0); let start_blk: u32 = file.segno * (1024 * 1024 * 1024 / 8192); let start_key = rel_block_to_key(file.rel_tag, start_blk); let end_key = rel_block_to_key(file.rel_tag, start_blk + (len / 8192) as u32); self.tasks .push(AnyImportTask::RelBlocks(ImportRelBlocksTask::new( self.shard, start_key..end_key, &file.path, self.storage.clone(), ))); // Set relsize for the last segment (00:spcnode:dbnode:reloid:fork:ff) if let Some(nblocks) = file.nblocks { let size_key = rel_size_to_key(file.rel_tag); //debug!("Setting relation size (path={path}, rel_tag={rel_tag}, segno={segno}) to {nblocks}, key {size_key}"); let buf = nblocks.to_le_bytes(); self.tasks .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( size_key, Bytes::from(buf.to_vec()), ))); } } Ok(()) } async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> { assert!(self.shard.is_shard_zero()); let segments = 
self.storage.listfilesindir(path).await?; let segments: Vec<(String, u32, usize)> = segments .into_iter() .filter_map(|(path, size)| { let filename = path.object_name()?; let segno = u32::from_str_radix(filename, 16).ok()?; Some((filename.to_string(), segno, size)) }) .collect(); // Write SlruDir let slrudir_key = slru_dir_to_key(kind); let segnos: HashSet = segments .iter() .map(|(_path, segno, _size)| *segno) .collect(); let slrudir = SlruSegmentDirectory { segments: segnos }; let slrudir_buf = SlruSegmentDirectory::ser(&slrudir)?; self.tasks .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( slrudir_key, Bytes::from(slrudir_buf), ))); for (segpath, segno, size) in segments { // SlruSegBlocks for each segment let p = path.join(&segpath); let file_size = size; ensure!(file_size % 8192 == 0); let nblocks = u32::try_from(file_size / 8192)?; let start_key = slru_block_to_key(kind, segno, 0); let end_key = slru_block_to_key(kind, segno, nblocks); debug!(%p, segno=%segno, %size, %start_key, %end_key, "scheduling SLRU segment"); self.tasks .push(AnyImportTask::SlruBlocks(ImportSlruBlocksTask::new( start_key..end_key, &p, self.storage.clone(), ))); // Followed by SlruSegSize let segsize_key = slru_segment_size_to_key(kind, segno); let segsize_buf = nblocks.to_le_bytes(); self.tasks .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( segsize_key, Bytes::copy_from_slice(&segsize_buf), ))); } Ok(()) } } impl Plan { async fn execute( self, timeline: Arc, start_after_job_idx: Option, import_plan_hash: u64, import_config: &TimelineImportConfig, ctx: &RequestContext, ) -> anyhow::Result<()> { let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &timeline.cancel); let mut work = FuturesOrdered::new(); let semaphore = Arc::new(Semaphore::new(import_config.import_job_concurrency.into())); let jobs_in_plan = self.jobs.len(); let mut jobs = self .jobs .into_iter() .enumerate() .map(|(idx, job)| (idx + 1, job)) .filter(|(idx, _job)| { // Filter out any 
jobs that have been done already if let Some(start_after) = start_after_job_idx { *idx > start_after } else { true } }) .peekable(); let mut last_completed_job_idx = start_after_job_idx.unwrap_or(0); let checkpoint_every: usize = import_config.import_job_checkpoint_threshold.into(); let max_byte_range_size: usize = import_config.import_job_max_byte_range_size.into(); // Run import jobs concurrently up to the limit specified by the pageserver configuration. // Note that we process completed futures in the oreder of insertion. This will be the // building block for resuming imports across pageserver restarts or tenant migrations. while last_completed_job_idx < jobs_in_plan { tokio::select! { permit = semaphore.clone().acquire_owned(), if jobs.peek().is_some() => { let permit = permit.expect("never closed"); let (job_idx, job) = jobs.next().expect("we peeked"); let job_timeline = timeline.clone(); let ctx = ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); work.push_back(tokio::task::spawn(async move { let _permit = permit; let res = job.run(job_timeline, max_byte_range_size, &ctx).await; (job_idx, res) })); }, maybe_complete_job_idx = work.next() => { pausable_failpoint!("import-task-complete-pausable"); match maybe_complete_job_idx { Some(Ok((job_idx, res))) => { assert!(last_completed_job_idx.checked_add(1).unwrap() == job_idx); res?; last_completed_job_idx = job_idx; if last_completed_job_idx % checkpoint_every == 0 { tracing::info!(last_completed_job_idx, jobs=%jobs_in_plan, "Checkpointing import status"); let progress = ShardImportProgressV1 { jobs: jobs_in_plan, completed: last_completed_job_idx, import_plan_hash, job_soft_size_limit: import_config.import_job_soft_size_limit.into(), }; timeline.remote_client.schedule_index_upload_for_file_changes()?; timeline.remote_client.wait_completion().await?; storcon_client.put_timeline_import_status( timeline.tenant_shard_id, timeline.timeline_id, timeline.generation, 
ShardImportStatus::InProgress(Some(ShardImportProgress::V1(progress))) ) .await .map_err(|_err| { anyhow::anyhow!("Shut down while putting timeline import status") })?; } }, Some(Err(_)) => { anyhow::bail!( "import job panicked or cancelled" ); } None => {} } } } } Ok(()) } } // // dbdir iteration tools // struct PgDataDir { pub dbs: Vec, // spcnode, dboid, path } struct PgDataDirDb { pub spcnode: u32, pub dboid: u32, pub path: RemotePath, pub files: Vec, } struct PgDataDirDbFile { pub path: RemotePath, pub rel_tag: RelTag, pub segno: u32, pub filesize: usize, // Cummulative size of the given fork, set only for the last segment of that fork pub nblocks: Option, } impl PgDataDir { async fn new(storage: &RemoteStorageWrapper) -> anyhow::Result { let datadir_path = storage.pgdata(); // Import ordinary databases, DEFAULTTABLESPACE_OID is smaller than GLOBALTABLESPACE_OID, so import them first // Traverse database in increasing oid order let basedir = &datadir_path.join("base"); let db_oids: Vec<_> = storage .listdir(basedir) .await? .into_iter() .filter_map(|path| path.object_name().and_then(|name| name.parse::().ok())) .sorted() .collect(); debug!(?db_oids, "found databases"); let mut databases = Vec::new(); for dboid in db_oids { databases.push( PgDataDirDb::new( storage, &basedir.join(dboid.to_string()), postgres_ffi_types::constants::DEFAULTTABLESPACE_OID, dboid, &datadir_path, ) .await?, ); } // special case for global catalogs databases.push( PgDataDirDb::new( storage, &datadir_path.join("global"), postgres_ffi_types::constants::GLOBALTABLESPACE_OID, 0, &datadir_path, ) .await?, ); databases.sort_by_key(|db| (db.spcnode, db.dboid)); Ok(Self { dbs: databases }) } } impl PgDataDirDb { #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%dboid, %db_path))] async fn new( storage: &RemoteStorageWrapper, db_path: &RemotePath, spcnode: u32, dboid: u32, datadir_path: &RemotePath, ) -> anyhow::Result { let mut files: Vec = storage .listfilesindir(db_path) .await? 
.into_iter() .filter_map(|(path, size)| { debug!(%path, %size, "found file in dbdir"); path.object_name().and_then(|name| { // returns (relnode, forknum, segno) parse_relfilename(name).ok().map(|x| (size, x)) }) }) .sorted_by_key(|(_, relfilename)| *relfilename) .map(|(filesize, (relnode, forknum, segno))| { let rel_tag = RelTag { spcnode, dbnode: dboid, relnode, forknum, }; let path = datadir_path.join(rel_tag.to_segfile_name(segno)); anyhow::ensure!(filesize % BLCKSZ as usize == 0); let nblocks = filesize / BLCKSZ as usize; Ok(PgDataDirDbFile { path, filesize, rel_tag, segno, nblocks: Some(nblocks), // first non-cummulative sizes }) }) .collect::>()?; // Set cummulative sizes. Do all of that math here, so that later we could easier // parallelize over segments and know with which segments we need to write relsize // entry. let mut cumulative_nblocks: usize = 0; let mut prev_rel_tag: Option = None; for i in 0..files.len() { if prev_rel_tag == Some(files[i].rel_tag) { cumulative_nblocks += files[i].nblocks.unwrap(); } else { cumulative_nblocks = files[i].nblocks.unwrap(); } files[i].nblocks = if i == files.len() - 1 || files[i + 1].rel_tag != files[i].rel_tag { Some(cumulative_nblocks) } else { None }; prev_rel_tag = Some(files[i].rel_tag); } Ok(PgDataDirDb { files, path: db_path.clone(), spcnode, dboid, }) } } trait ImportTask { fn key_range(&self) -> Range; fn total_size(&self, shard_identity: &ShardIdentity) -> usize { let range = ShardedRange::new(self.key_range(), shard_identity); let page_count = range.page_count(); if page_count == u32::MAX { tracing::warn!( "Import task has non contiguous key range: {}..{}", self.key_range().start, self.key_range().end ); // Tasks should operate on contiguous ranges. It is unexpected for // ranges to violate this assumption. Calling code handles this by mapping // any task on a non contiguous range to its own image layer. 
usize::MAX } else { page_count as usize * 8192 } } async fn doit( self, layer_writer: &mut ImageLayerWriter, max_byte_range_size: usize, ctx: &RequestContext, ) -> anyhow::Result; } struct ImportSingleKeyTask { key: Key, buf: Bytes, } impl Hash for ImportSingleKeyTask { fn hash(&self, state: &mut H) { let ImportSingleKeyTask { key, buf } = self; key.hash(state); // The key value might not have a stable binary representation. // For instance, the db directory uses an unstable hash-map. // To work around this we are a bit lax here and only hash the // size of the buffer which must be consistent. buf.len().hash(state); } } impl ImportSingleKeyTask { fn new(key: Key, buf: Bytes) -> Self { ImportSingleKeyTask { key, buf } } } impl ImportTask for ImportSingleKeyTask { fn key_range(&self) -> Range { singleton_range(self.key) } async fn doit( self, layer_writer: &mut ImageLayerWriter, _max_byte_range_size: usize, ctx: &RequestContext, ) -> anyhow::Result { layer_writer.put_image(self.key, self.buf, ctx).await?; Ok(1) } } struct ImportRelBlocksTask { shard_identity: ShardIdentity, key_range: Range, path: RemotePath, storage: RemoteStorageWrapper, } impl Hash for ImportRelBlocksTask { fn hash(&self, state: &mut H) { let ImportRelBlocksTask { shard_identity: _, key_range, path, storage: _, } = self; key_range.hash(state); path.hash(state); } } impl ImportRelBlocksTask { fn new( shard_identity: ShardIdentity, key_range: Range, path: &RemotePath, storage: RemoteStorageWrapper, ) -> Self { ImportRelBlocksTask { shard_identity, key_range, path: path.clone(), storage, } } } impl ImportTask for ImportRelBlocksTask { fn key_range(&self) -> Range { self.key_range.clone() } #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%self.path))] async fn doit( self, layer_writer: &mut ImageLayerWriter, max_byte_range_size: usize, ctx: &RequestContext, ) -> anyhow::Result { debug!("Importing relation file"); let (rel_tag, start_blk) = self.key_range.start.to_rel_block()?; let 
(rel_tag_end, end_blk) = self.key_range.end.to_rel_block()?; assert_eq!(rel_tag, rel_tag_end); let ranges = (start_blk..end_blk) .enumerate() .filter_map(|(i, blknum)| { let key = rel_block_to_key(rel_tag, blknum); if self.shard_identity.is_key_disposable(&key) { return None; } let file_offset = i.checked_mul(8192).unwrap(); Some(( vec![key], file_offset, file_offset.checked_add(8192).unwrap(), )) }) .coalesce(|(mut acc, acc_start, acc_end), (mut key, start, end)| { assert_eq!(key.len(), 1); assert!(!acc.is_empty()); assert!(acc_end > acc_start); if acc_end == start && end - acc_start <= max_byte_range_size { acc.push(key.pop().unwrap()); Ok((acc, acc_start, end)) } else { Err(((acc, acc_start, acc_end), (key, start, end))) } }); let mut nimages = 0; for (keys, range_start, range_end) in ranges { let range_buf = self .storage .get_range(&self.path, range_start.into_u64(), range_end.into_u64()) .await?; let mut buf = Bytes::from(range_buf); for key in keys { // The writer buffers writes internally let image = buf.split_to(8192); layer_writer.put_image(key, image, ctx).await?; nimages += 1; } } Ok(nimages) } } struct ImportSlruBlocksTask { key_range: Range, path: RemotePath, storage: RemoteStorageWrapper, } impl Hash for ImportSlruBlocksTask { fn hash(&self, state: &mut H) { let ImportSlruBlocksTask { key_range, path, storage: _, } = self; key_range.hash(state); path.hash(state); } } impl ImportSlruBlocksTask { fn new(key_range: Range, path: &RemotePath, storage: RemoteStorageWrapper) -> Self { ImportSlruBlocksTask { key_range, path: path.clone(), storage, } } } impl ImportTask for ImportSlruBlocksTask { fn key_range(&self) -> Range { self.key_range.clone() } async fn doit( self, layer_writer: &mut ImageLayerWriter, _max_byte_range_size: usize, ctx: &RequestContext, ) -> anyhow::Result { debug!("Importing SLRU segment file {}", self.path); let buf = self.storage.get(&self.path).await?; // TODO(vlad): Does timestamp to LSN work for imported timelines? 
// Probably not since we don't append the `xact_time` to it as in // [`WalIngest::ingest_xact_record`]. let (kind, segno, start_blk) = self.key_range.start.to_slru_block()?; let (_kind, _segno, end_blk) = self.key_range.end.to_slru_block()?; let mut blknum = start_blk; let mut nimages = 0; let mut file_offset = 0; while blknum < end_blk { let key = slru_block_to_key(kind, segno, blknum); // A segment file shorter than the key range implies would make the slice below panic; // report it as an import error instead so the failure goes through the normal error path. anyhow::ensure!( file_offset + 8192 <= buf.len(), "SLRU segment file {} is too short: block {} needs bytes [{}, {}), file has {}", self.path, blknum, file_offset, file_offset + 8192, buf.len() ); let buf = &buf[file_offset..(file_offset + 8192)]; file_offset += 8192; layer_writer .put_image(key, Bytes::copy_from_slice(buf), ctx) .await?; nimages += 1; blknum += 1; } Ok(nimages) } } #[derive(Hash)] enum AnyImportTask { SingleKey(ImportSingleKeyTask), RelBlocks(ImportRelBlocksTask), SlruBlocks(ImportSlruBlocksTask), } impl ImportTask for AnyImportTask { fn key_range(&self) -> Range { match self { Self::SingleKey(t) => t.key_range(), Self::RelBlocks(t) => t.key_range(), Self::SlruBlocks(t) => t.key_range(), } } /// returns the number of images put into the `layer_writer` async fn doit( self, layer_writer: &mut ImageLayerWriter, max_byte_range_size: usize, ctx: &RequestContext, ) -> anyhow::Result { match self { Self::SingleKey(t) => t.doit(layer_writer, max_byte_range_size, ctx).await, Self::RelBlocks(t) => t.doit(layer_writer, max_byte_range_size, ctx).await, Self::SlruBlocks(t) => t.doit(layer_writer, max_byte_range_size, ctx).await, } } } impl From for AnyImportTask { fn from(t: ImportSingleKeyTask) -> Self { Self::SingleKey(t) } } impl From for AnyImportTask { fn from(t: ImportRelBlocksTask) -> Self { Self::RelBlocks(t) } } impl From for AnyImportTask { fn from(t: ImportSlruBlocksTask) -> Self { Self::SlruBlocks(t) } } #[derive(Hash)] struct ChunkProcessingJob { range: Range, tasks: Vec, pgdata_lsn: Lsn, } impl ChunkProcessingJob { fn new(range: Range, tasks: Vec, pgdata_lsn: Lsn) -> Self { assert!(pgdata_lsn.is_valid()); Self { range, tasks, pgdata_lsn, } } async fn run( self, timeline: Arc, max_byte_range_size: usize, ctx: &RequestContext, ) -> anyhow::Result<()> { let
mut writer = ImageLayerWriter::new( timeline.conf, timeline.timeline_id, timeline.tenant_shard_id, &self.range, self.pgdata_lsn, &timeline.gate, timeline.cancel.clone(), ctx, ) .await?; let mut nimages = 0; for task in self.tasks { nimages += task.doit(&mut writer, max_byte_range_size, ctx).await?; } let resident_layer = if nimages > 0 { let (desc, path) = writer.finish(ctx).await?; { let guard = timeline .layers .read(LayerManagerLockHolder::ImportPgData) .await; let existing_layer = guard.try_get_from_key(&desc.key()); if let Some(layer) = existing_layer { if layer.metadata().generation == timeline.generation { return Err(anyhow::anyhow!( "Import attempted to rewrite layer file in the same generation: {}", layer.local_path() )); } } } Layer::finish_creating(timeline.conf, &timeline, desc, &path)? } else { // dropping the writer cleans up return Ok(()); }; // The same import job might run multiple times since not each job is checkpointed. // Hence, we must support the cases where the layer already exists. We cannot be // certain that the existing layer is identical to the new one, so in that case // we replace the old layer with the one we just generated. let mut guard = timeline .layers .write(LayerManagerLockHolder::ImportPgData) .await; let existing_layer = guard .try_get_from_key(&resident_layer.layer_desc().key()) .cloned(); match existing_layer { Some(existing) => { // Unlink the remote layer from the index without scheduling its deletion. // When `existing_layer` drops [`LayerInner::drop`] will schedule its deletion from // remote storage, but that assumes that the layer was unlinked from the index first. timeline .remote_client .schedule_unlinking_of_layers_from_index_part(std::iter::once( existing.layer_desc().layer_name(), ))?; guard.open_mut()?.rewrite_layers( &[(existing.clone(), resident_layer.clone())], &[], &timeline.metrics, ); } None => { guard .open_mut()? 
.track_new_image_layers(&[resident_layer.clone()], &timeline.metrics); } } crate::tenant::timeline::drop_layer_manager_wlock(guard); timeline .remote_client .schedule_layer_file_upload(resident_layer)?; Ok(()) } } ================================================ FILE: pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs ================================================ use std::ops::Bound; use std::sync::Arc; use anyhow::Context; use bytes::Bytes; use postgres_ffi::{ControlFileData, PgMajorVersion}; use remote_storage::{ Download, DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, Listing, ListingObject, RemotePath, RemoteStorageConfig, }; use serde::de::DeserializeOwned; use tokio_util::sync::CancellationToken; use tracing::{debug, info, instrument}; use utils::lsn::Lsn; use super::index_part_format; use crate::assert_u64_eq_usize::U64IsUsize; use crate::config::PageServerConf; pub async fn new( conf: &'static PageServerConf, location: &index_part_format::Location, cancel: CancellationToken, ) -> Result { // Downloads should be reasonably sized. We do ranged reads for relblock raw data // and full reads for SLRU segments which are bounded by Postgres. let timeout = RemoteStorageConfig::DEFAULT_TIMEOUT; let location_storage = match location { #[cfg(feature = "testing")] index_part_format::Location::LocalFs { path } => { GenericRemoteStorage::LocalFs(remote_storage::LocalFs::new(path.clone(), timeout)?) } index_part_format::Location::AwsS3 { region, bucket, key, } => { // TODO: think about security implications of letting the client specify the bucket & prefix. // It's the most flexible right now, but, possibly we want to move bucket name into PS conf // and force the timeline_id into the prefix? 
GenericRemoteStorage::AwsS3(Arc::new( remote_storage::S3Bucket::new( &remote_storage::S3Config { bucket_name: bucket.clone(), prefix_in_bucket: Some(key.clone()), bucket_region: region.clone(), endpoint: conf .import_pgdata_aws_endpoint_url .clone() .map(|url| url.to_string()), // by specifying None here, remote_storage/aws-sdk-rust will infer from env // This matches the default import job concurrency. This is managed // separately from the usual S3 client, but the concern here is bandwidth // usage. concurrency_limit: 128.try_into().unwrap(), max_keys_per_list_response: Some(1000), upload_storage_class: None, // irrelevant }, timeout, ) .await .context("setup s3 bucket")?, )) } }; let storage_wrapper = RemoteStorageWrapper::new(location_storage, cancel); Ok(storage_wrapper) } /// Wrap [`remote_storage`] APIs to make it look a bit more like a filesystem API /// such as [`tokio::fs`], which was used in the original implementation of the import code. #[derive(Clone)] pub struct RemoteStorageWrapper { storage: GenericRemoteStorage, cancel: CancellationToken, } impl RemoteStorageWrapper { pub fn new(storage: GenericRemoteStorage, cancel: CancellationToken) -> Self { Self { storage, cancel } } #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] pub async fn listfilesindir( &self, path: &RemotePath, ) -> Result, DownloadError> { assert!( path.object_name().is_some(), "must specify dirname, without trailing slash" ); let path = path.add_trailing_slash(); let res = crate::tenant::remote_timeline_client::download::download_retry_forever( || async { let Listing { keys, prefixes: _ } = self .storage .list( Some(&path), remote_storage::ListingMode::WithDelimiter, None, &self.cancel, ) .await?; let res = keys .into_iter() .map(|ListingObject { key, size, .. 
}| (key, size.into_usize())) .collect(); Ok(res) }, &format!("listfilesindir {path:?}"), &self.cancel, ) .await; debug!(?res, "returning"); res } #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] pub async fn listdir(&self, path: &RemotePath) -> Result, DownloadError> { assert!( path.object_name().is_some(), "must specify dirname, without trailing slash" ); let path = path.add_trailing_slash(); let res = crate::tenant::remote_timeline_client::download::download_retry_forever( || async { let Listing { keys, prefixes } = self .storage .list( Some(&path), remote_storage::ListingMode::WithDelimiter, None, &self.cancel, ) .await?; let res = keys .into_iter() .map(|ListingObject { key, .. }| key) .chain(prefixes.into_iter()) .collect(); Ok(res) }, &format!("listdir {path:?}"), &self.cancel, ) .await; debug!(?res, "returning"); res } #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] pub async fn get(&self, path: &RemotePath) -> Result { let res = crate::tenant::remote_timeline_client::download::download_retry_forever( || async { let Download { download_stream, .. } = self .storage .download(path, &DownloadOpts::default(), &self.cancel) .await?; let mut reader = tokio_util::io::StreamReader::new(download_stream); // XXX optimize this, can we get the capacity hint from somewhere? 
let mut buf = Vec::new(); tokio::io::copy_buf(&mut reader, &mut buf).await?; Ok(Bytes::from(buf)) }, &format!("download {path:?}"), &self.cancel, ) .await; debug!(len = res.as_ref().ok().map(|buf| buf.len()), "done"); res } /// Download the object at `path` and parse it as JSON. /// Returns `Ok(None)` if the download reports [`DownloadError::NotFound`]; a parse failure is /// returned as [`DownloadError::Other`]. #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] pub async fn get_json( &self, path: &RemotePath, ) -> Result, DownloadError> { let buf = match self.get(path).await { Ok(buf) => buf, Err(DownloadError::NotFound) => return Ok(None), Err(err) => return Err(err), }; let res = serde_json::from_slice(&buf) .context("deserialize") // TODO: own error type .map_err(DownloadError::Other)?; Ok(Some(res)) } #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] pub async fn get_range( &self, path: &RemotePath, start_inclusive: u64, end_exclusive: u64, ) -> Result, DownloadError> { let len = end_exclusive .checked_sub(start_inclusive) .unwrap() .into_usize(); let res = crate::tenant::remote_timeline_client::download::download_retry_forever( || async { let Download { download_stream, ..
} = self .storage .download( path, &DownloadOpts { kind: DownloadKind::Large, etag: None, byte_start: Bound::Included(start_inclusive), byte_end: Bound::Excluded(end_exclusive), version_id: None, }, &self.cancel) .await?; let mut reader = tokio_util::io::StreamReader::new(download_stream); let mut buf = Vec::with_capacity(len); tokio::io::copy_buf(&mut reader, &mut buf).await?; Ok(buf) }, &format!("download range len=0x{len:x} [0x{start_inclusive:x},0x{end_exclusive:x}) from {path:?}"), &self.cancel, ) .await; debug!(len = res.as_ref().ok().map(|buf| buf.len()), "done"); res } pub fn pgdata(&self) -> RemotePath { RemotePath::from_string("pgdata").unwrap() } pub async fn get_control_file(&self) -> Result { let control_file_path = self.pgdata().join("global/pg_control"); info!("get control file from {control_file_path}"); let control_file_buf = self.get(&control_file_path).await?; ControlFile::new(control_file_buf) } } pub struct ControlFile { control_file_data: ControlFileData, control_file_buf: Bytes, } impl ControlFile { pub(crate) fn new(control_file_buf: Bytes) -> Result { // XXX ControlFileData is version-specific, we're always using v14 here. v17 had changes. 
let control_file_data = ControlFileData::decode(&control_file_buf)?; let control_file = ControlFile { control_file_data, control_file_buf, }; control_file.try_pg_version()?; // so that we can offer infallible pg_version() Ok(control_file) } pub(crate) fn base_lsn(&self) -> Lsn { Lsn(self.control_file_data.checkPoint).align() } /// Postgres major version derived from the control file's catalog version. /// Infallible because `ControlFile::new` already rejected unrecognized catalog versions. pub(crate) fn pg_version(&self) -> PgMajorVersion { self.try_pg_version() .expect("ControlFile::new() checks that try_pg_version doesn't error") } pub(crate) fn control_file_data(&self) -> &ControlFileData { &self.control_file_data } pub(crate) fn control_file_buf(&self) -> &Bytes { &self.control_file_buf } fn try_pg_version(&self) -> anyhow::Result { Ok(match self.control_file_data.catalog_version_no { // these are from catversion.h 202107181 => PgMajorVersion::PG14, 202209061 => PgMajorVersion::PG15, 202307071 => PgMajorVersion::PG16, 202406281 => PgMajorVersion::PG17, catversion => { anyhow::bail!("unrecognized catalog version {catversion}") } }) } } ================================================ FILE: pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs ================================================ use serde::{Deserialize, Serialize}; #[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)] pub struct PgdataStatus { pub done: bool, // TODO: remaining fields } ================================================ FILE: pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs ================================================ #[cfg(feature = "testing")] use camino::Utf8PathBuf; use serde::{Deserialize, Serialize}; #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] pub enum Root { V1(V1), } #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] pub enum V1 { InProgress(InProgress), Done(Done), } #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] #[serde(transparent)] pub struct IdempotencyKey(String); impl IdempotencyKey { pub fn new(s: String) -> Self { Self(s) } } #[derive(Serialize,
Deserialize, Debug, Clone, PartialEq, Eq)] pub struct InProgress { pub idempotency_key: IdempotencyKey, pub location: Location, pub started_at: chrono::NaiveDateTime, } #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] pub struct Done { pub idempotency_key: IdempotencyKey, pub started_at: chrono::NaiveDateTime, pub finished_at: chrono::NaiveDateTime, } #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] pub enum Location { #[cfg(feature = "testing")] LocalFs { path: Utf8PathBuf }, AwsS3 { region: String, bucket: String, key: String, }, } impl Root { pub fn is_done(&self) -> bool { match self { Root::V1(v1) => match v1 { V1::Done(_) => true, V1::InProgress(_) => false, }, } } pub fn idempotency_key(&self) -> &IdempotencyKey { match self { Root::V1(v1) => match v1 { V1::InProgress(in_progress) => &in_progress.idempotency_key, V1::Done(done) => &done.idempotency_key, }, } } pub fn started_at(&self) -> &chrono::NaiveDateTime { match self { Root::V1(v1) => match v1 { V1::InProgress(in_progress) => &in_progress.started_at, V1::Done(done) => &done.started_at, }, } } } ================================================ FILE: pageserver/src/tenant/timeline/import_pgdata.rs ================================================ use std::sync::Arc; use anyhow::{Context, bail}; use importbucket_client::{ControlFile, RemoteStorageWrapper}; use pageserver_api::models::ShardImportStatus; use remote_storage::RemotePath; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::info; use utils::lsn::Lsn; use utils::pausable_failpoint; use utils::sync::gate::Gate; use super::{Timeline, TimelineDeleteProgress}; use crate::context::RequestContext; use crate::controller_upcall_client::{StorageControllerUpcallApi, StorageControllerUpcallClient}; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; mod flow; mod importbucket_client; mod importbucket_format; pub(crate) mod 
index_part_format; pub struct ImportingTimeline { pub import_task_handle: JoinHandle<()>, pub import_task_gate: Gate, pub timeline: Arc, pub delete_progress: TimelineDeleteProgress, } impl std::fmt::Debug for ImportingTimeline { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "ImportingTimeline<{}>", self.timeline.timeline_id) } } impl ImportingTimeline { pub async fn shutdown(&self) { self.import_task_handle.abort(); self.import_task_gate.close().await; self.timeline.remote_client.shutdown().await; } } pub async fn doit( timeline: &Arc, index_part: index_part_format::Root, ctx: &RequestContext, cancel: CancellationToken, ) -> anyhow::Result<()> { let index_part_format::Root::V1(v1) = index_part; let index_part_format::InProgress { location, idempotency_key: _, started_at: _, } = match v1 { index_part_format::V1::Done(_) => return Ok(()), index_part_format::V1::InProgress(in_progress) => in_progress, }; let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel); let shard_status = storcon_client .get_timeline_import_status( timeline.tenant_shard_id, timeline.timeline_id, timeline.generation, ) .await .map_err(|_err| anyhow::anyhow!("Shut down while getting timeline import status"))?; info!(?shard_status, "peeking shard status"); match shard_status { ShardImportStatus::InProgress(maybe_progress) => { let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?; let control_file_res = if maybe_progress.is_none() { // Only prepare the import once when there's no progress. 
prepare_import(timeline, storage.clone(), &cancel).await } else { storage.get_control_file().await }; let control_file = match control_file_res { Ok(cf) => cf, Err(err) => { return Err( terminate_flow_with_error(timeline, err, &storcon_client, &cancel).await, ); } }; let res = flow::run( timeline.clone(), control_file, storage.clone(), maybe_progress, ctx, ) .await; if let Err(err) = res { return Err( terminate_flow_with_error(timeline, err, &storcon_client, &cancel).await, ); } tracing::info!("Import plan executed. Flushing remote changes and notifying storcon"); timeline .remote_client .schedule_index_upload_for_file_changes()?; timeline.remote_client.wait_completion().await?; pausable_failpoint!("import-timeline-pre-success-notify-pausable"); // Communicate that shard is done. // Ensure at-least-once delivery of the upcall to storage controller // before we mark the task as done and never come here again. // // Note that we do not mark the import complete in the index part now. // This happens in [`Tenant::finalize_importing_timeline`] in response // to the storage controller calling // `/v1/tenant/:tenant_id/timeline/:timeline_id/activate_post_import`. storcon_client .put_timeline_import_status( timeline.tenant_shard_id, timeline.timeline_id, timeline.generation, ShardImportStatus::Done, ) .await .map_err(|_err| { anyhow::anyhow!("Shut down while putting timeline import status") })?; } ShardImportStatus::Error(err) => { info!( "shard status indicates that the shard is done (error), skipping import {}", err ); } ShardImportStatus::Done => { info!("shard status indicates that the shard is done (success), skipping import"); } } Ok(()) } async fn prepare_import( timeline: &Arc, storage: RemoteStorageWrapper, cancel: &CancellationToken, ) -> anyhow::Result { // Wipe the slate clean before starting the import as a precaution. // This method is only called when there's no recorded checkpoint for the import // in the storage controller. 
// // Note that this is split-brain safe (two imports for same timeline shards running in // different generations) because we go through the usual deletion path, including deletion queue. info!("wipe the slate clean"); { // TODO: do we need to hold GC lock for this? let mut guard = timeline .layers .write(LayerManagerLockHolder::ImportPgData) .await; assert!( guard.layer_map()?.open_layer.is_none(), "while importing, there should be no in-memory layer" // this just seems like a good place to assert it ); let all_layers_keys = guard.all_persistent_layers(); let all_layers: Vec<_> = all_layers_keys .iter() .map(|key| guard.get_from_key(key)) .collect(); let open = guard.open_mut().context("open_mut")?; timeline.remote_client.schedule_gc_update(&all_layers)?; open.finish_gc_timeline(&all_layers); } // // Wait for pgdata to finish uploading // info!("wait for pgdata to reach status 'done'"); let status_prefix = RemotePath::from_string("status").unwrap(); let pgdata_status_key = status_prefix.join("pgdata"); loop { let res = async { let pgdata_status: Option = storage .get_json(&pgdata_status_key) .await .context("get pgdata status")?; info!(?pgdata_status, "peeking pgdata status"); if pgdata_status.map(|st| st.done).unwrap_or(false) { Ok(()) } else { Err(anyhow::anyhow!("pgdata not done yet")) } } .await; match res { Ok(_) => break, Err(_err) => { info!("indefinitely waiting for pgdata to finish"); if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) .await .is_ok() { bail!("cancelled while waiting for pgdata"); } } } } let control_file = storage.get_control_file().await?; let base_lsn = control_file.base_lsn(); info!("update TimelineMetadata based on LSNs from control file"); { let pg_version = control_file.pg_version(); async move { // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the // checkpoint record, and prev_record_lsn should point to its beginning. 
// We should read the real end of the record from the WAL, but here we // just fake it. let disk_consistent_lsn = Lsn(base_lsn.0 + 8); let prev_record_lsn = base_lsn; let metadata = TimelineMetadata::new( disk_consistent_lsn, Some(prev_record_lsn), None, // no ancestor Lsn(0), // no ancestor lsn base_lsn, // latest_gc_cutoff_lsn base_lsn, // initdb_lsn pg_version, ); let _start_lsn = disk_consistent_lsn + 1; timeline .remote_client .schedule_index_upload_for_full_metadata_update(&metadata)?; timeline.remote_client.wait_completion().await?; anyhow::Ok(()) } } .await?; Ok(control_file) } async fn terminate_flow_with_error( timeline: &Arc, error: anyhow::Error, storcon_client: &StorageControllerUpcallClient, cancel: &CancellationToken, ) -> anyhow::Error { // The import task is aborted on tenant shutdown, so in principle, it should // never be cancelled. To be on the safe side, check the cancellation tokens // before marking the import as failed. if !(cancel.is_cancelled() || timeline.cancel.is_cancelled()) { let notify_res = storcon_client .put_timeline_import_status( timeline.tenant_shard_id, timeline.timeline_id, timeline.generation, ShardImportStatus::Error(format!("{error:#}")), ) .await; if let Err(_notify_error) = notify_res { // The [`StorageControllerUpcallClient::put_timeline_import_status`] retries // forever internally, so errors returned by it can only be due to cancellation.
info!("failed to notify storcon about permanent import error"); } // Will be logged by [`Tenant::create_timeline_import_pgdata_task`] error } else { anyhow::anyhow!("Import task cancelled") } } ================================================ FILE: pageserver/src/tenant/timeline/init.rs ================================================ use std::collections::{HashMap, hash_map}; use std::str::FromStr; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use utils::lsn::Lsn; use crate::is_temporary; use crate::tenant::ephemeral_file::is_ephemeral_file; use crate::tenant::remote_timeline_client::index::{IndexPart, LayerFileMetadata}; use crate::tenant::remote_timeline_client::{self}; use crate::tenant::storage_layer::LayerName; /// Identified files in the timeline directory. pub(super) enum Discovered { /// The only one we care about Layer(LayerName, LocalLayerFileMetadata), /// Old ephemeral files from previous launches, should be removed Ephemeral(String), /// Old temporary timeline files, unsure what these really are, should be removed Temporary(String), /// Temporary on-demand download files, should be removed TemporaryDownload(String), /// Backup file from previously future layers IgnoredBackup(Utf8PathBuf), /// Unrecognized, warn about these Unknown(String), } /// Scans the timeline directory for interesting files. pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result> { let mut ret = Vec::new(); for direntry in path.read_dir_utf8()?
{ let direntry = direntry?; let file_name = direntry.file_name().to_string(); let discovered = match LayerName::from_str(&file_name) { Ok(file_name) => { let file_size = direntry.metadata()?.len(); Discovered::Layer( file_name, LocalLayerFileMetadata::new(direntry.path().to_owned(), file_size), ) } Err(_) => { if file_name.ends_with(".old") { // ignore these Discovered::IgnoredBackup(direntry.path().to_owned()) } else if remote_timeline_client::is_temp_download_file(direntry.path()) { Discovered::TemporaryDownload(file_name) } else if is_ephemeral_file(&file_name) { Discovered::Ephemeral(file_name) } else if is_temporary(direntry.path()) { Discovered::Temporary(file_name) } else { Discovered::Unknown(file_name) } } }; ret.push(discovered); } Ok(ret) } /// Whereas `LayerFileMetadata` describes the metadata we would store in remote storage, /// this structure extends it with metadata describing the layer's presence in local storage. #[derive(Clone, Debug)] pub(super) struct LocalLayerFileMetadata { pub(super) file_size: u64, pub(super) local_path: Utf8PathBuf, } impl LocalLayerFileMetadata { pub fn new(local_path: Utf8PathBuf, file_size: u64) -> Self { Self { local_path, file_size, } } } /// For a layer that is present in remote metadata, this type describes how to handle /// it during startup: it is either Resident (and we have some metadata about a local file), /// or it is Evicted (and we only have remote metadata). #[derive(Clone, Debug)] pub(super) enum Decision { /// The layer is not present locally. Evicted(LayerFileMetadata), /// The layer is present locally, and metadata matches: we may hook up this layer to the /// existing file in local storage. Resident { local: LocalLayerFileMetadata, remote: LayerFileMetadata, }, } /// A layer needs to be left out of the layer map. #[derive(Debug)] pub(super) enum DismissedLayer { /// The related layer is in the future compared to disk_consistent_lsn, it must not be loaded.
Future { /// `None` if the layer is only known through [`IndexPart`]. local: Option, }, /// The layer only exists locally. /// /// In order to make crash safe updates to layer map, we must dismiss layers which are only /// found locally or not yet included in the remote `index_part.json`. LocalOnly(LocalLayerFileMetadata), /// The layer exists in remote storage but the local layer's metadata (e.g. file size) /// does not match it BadMetadata(LocalLayerFileMetadata), } /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions. pub(super) fn reconcile( local_layers: Vec<(LayerName, LocalLayerFileMetadata)>, index_part: &IndexPart, disk_consistent_lsn: Lsn, ) -> Vec<(LayerName, Result)> { let mut result = Vec::new(); let mut remote_layers = HashMap::new(); // Construct Decisions for layers that are found locally, if they're in remote metadata. Otherwise // construct DismissedLayers to get rid of them. for (layer_name, local_metadata) in local_layers { let Some(remote_metadata) = index_part.layer_metadata.get(&layer_name) else { result.push((layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))); continue; }; if remote_metadata.file_size != local_metadata.file_size { result.push((layer_name, Err(DismissedLayer::BadMetadata(local_metadata)))); continue; } remote_layers.insert( layer_name, Decision::Resident { local: local_metadata, remote: remote_metadata.clone(), }, ); } // Construct Decision for layers that were not found locally index_part .layer_metadata .iter() .for_each(|(name, metadata)| { if let hash_map::Entry::Vacant(entry) = remote_layers.entry(name.clone()) { entry.insert(Decision::Evicted(metadata.clone())); } }); // For layers that were found in authoritative remote metadata, apply a final check that they are within // the disk_consistent_lsn. 
result.extend(remote_layers.into_iter().map(|(name, decision)| { if name.is_in_future(disk_consistent_lsn) { match decision { Decision::Evicted(_remote) => (name, Err(DismissedLayer::Future { local: None })), Decision::Resident { local, remote: _remote, } => (name, Err(DismissedLayer::Future { local: Some(local) })), } } else { (name, Ok(decision)) } })); result } pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> { let file_name = path.file_name().expect("must be file path"); tracing::debug!(kind, ?file_name, "cleaning up"); std::fs::remove_file(path).with_context(|| format!("failed to remove {kind} at {path}")) } pub(super) fn cleanup_local_file_for_remote(local: &LocalLayerFileMetadata) -> anyhow::Result<()> { let local_size = local.file_size; let path = &local.local_path; let file_name = path.file_name().expect("must be file path"); tracing::warn!( "removing local file {file_name:?} because it has unexpected length {local_size};" ); std::fs::remove_file(path).with_context(|| format!("failed to remove layer at {path}")) } pub(super) fn cleanup_future_layer( path: &Utf8Path, name: &LayerName, disk_consistent_lsn: Lsn, ) -> anyhow::Result<()> { // future image layers are allowed to be produced always for not yet flushed to disk // lsns stored in InMemoryLayer. 
let kind = name.kind(); tracing::info!("found future {kind} layer {name} disk_consistent_lsn is {disk_consistent_lsn}"); std::fs::remove_file(path)?; Ok(()) } pub(super) fn cleanup_local_only_file( name: &LayerName, local: &LocalLayerFileMetadata, ) -> anyhow::Result<()> { let kind = name.kind(); tracing::info!( "found local-only {kind} layer {name} size {}", local.file_size ); std::fs::remove_file(&local.local_path)?; Ok(()) } ================================================ FILE: pageserver/src/tenant/timeline/layer_manager.rs ================================================ use std::collections::HashMap; use std::mem::ManuallyDrop; use std::ops::{Deref, DerefMut}; use std::sync::Arc; use std::time::Duration; use anyhow::{Context, bail, ensure}; use itertools::Itertools; use pageserver_api::keyspace::KeySpace; use pageserver_api::shard::TenantShardId; use tokio_util::sync::CancellationToken; use tracing::trace; use utils::id::TimelineId; use utils::lsn::{AtomicLsn, Lsn}; use super::{LayerFringe, ReadableLayer, TimelineWriterState}; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::metrics::TimelineMetrics; use crate::tenant::layer_map::{BatchedUpdates, LayerMap, SearchResult}; use crate::tenant::storage_layer::{ AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc, PersistentLayerKey, ReadableLayerWeak, ResidentLayer, }; /// Warn if the lock was held for longer than this threshold. /// It's very generous and we should bring this value down over time. 
const LAYER_MANAGER_LOCK_WARN_THRESHOLD: Duration = Duration::from_secs(5); const LAYER_MANAGER_LOCK_READ_WARN_THRESHOLD: Duration = Duration::from_secs(30); /// Describes the operation that is holding the layer manager lock #[derive(Debug, Clone, Copy, strum_macros::Display)] #[strum(serialize_all = "kebab_case")] pub(crate) enum LayerManagerLockHolder { GetLayerMapInfo, GenerateHeatmap, GetPage, Init, LoadLayerMap, GetLayerForWrite, TryFreezeLayer, FlushFrozenLayer, FlushLoop, Compaction, GarbageCollection, Shutdown, ImportPgData, DetachAncestor, Eviction, ComputeImageConsistentLsn, #[cfg(test)] Testing, } /// Wrapper for the layer manager that tracks the amount of time during which /// it was held under read or write lock #[derive(Default)] pub(crate) struct LockedLayerManager { locked: tokio::sync::RwLock, } pub(crate) struct LayerManagerReadGuard<'a> { guard: ManuallyDrop>, acquired_at: std::time::Instant, holder: LayerManagerLockHolder, } pub(crate) struct LayerManagerWriteGuard<'a> { guard: ManuallyDrop>, acquired_at: std::time::Instant, holder: LayerManagerLockHolder, } impl Drop for LayerManagerReadGuard<'_> { fn drop(&mut self) { // Drop the lock first, before potentially warning if it was held for too long. // SAFETY: ManuallyDrop in Drop implementation unsafe { ManuallyDrop::drop(&mut self.guard) }; let held_for = self.acquired_at.elapsed(); if held_for >= LAYER_MANAGER_LOCK_READ_WARN_THRESHOLD { tracing::warn!( holder=%self.holder, "Layer manager read lock held for {}s", held_for.as_secs_f64(), ); } } } impl Drop for LayerManagerWriteGuard<'_> { fn drop(&mut self) { // Drop the lock first, before potentially warning if it was held for too long. 
// SAFETY: ManuallyDrop in Drop implementation unsafe { ManuallyDrop::drop(&mut self.guard) }; let held_for = self.acquired_at.elapsed(); if held_for >= LAYER_MANAGER_LOCK_WARN_THRESHOLD { tracing::warn!( holder=%self.holder, "Layer manager write lock held for {}s", held_for.as_secs_f64(), ); } } } impl Deref for LayerManagerReadGuard<'_> { type Target = LayerManager; fn deref(&self) -> &Self::Target { self.guard.deref() } } impl Deref for LayerManagerWriteGuard<'_> { type Target = LayerManager; fn deref(&self) -> &Self::Target { self.guard.deref() } } impl DerefMut for LayerManagerWriteGuard<'_> { fn deref_mut(&mut self) -> &mut Self::Target { self.guard.deref_mut() } } impl LockedLayerManager { pub(crate) async fn read(&self, holder: LayerManagerLockHolder) -> LayerManagerReadGuard { let guard = ManuallyDrop::new(self.locked.read().await); LayerManagerReadGuard { guard, acquired_at: std::time::Instant::now(), holder, } } pub(crate) fn try_read( &self, holder: LayerManagerLockHolder, ) -> Result { let guard = ManuallyDrop::new(self.locked.try_read()?); Ok(LayerManagerReadGuard { guard, acquired_at: std::time::Instant::now(), holder, }) } pub(crate) async fn write(&self, holder: LayerManagerLockHolder) -> LayerManagerWriteGuard { let guard = ManuallyDrop::new(self.locked.write().await); LayerManagerWriteGuard { guard, acquired_at: std::time::Instant::now(), holder, } } pub(crate) fn try_write( &self, holder: LayerManagerLockHolder, ) -> Result { let guard = ManuallyDrop::new(self.locked.try_write()?); Ok(LayerManagerWriteGuard { guard, acquired_at: std::time::Instant::now(), holder, }) } } /// Provides semantic APIs to manipulate the layer map. pub(crate) enum LayerManager { /// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate /// the layers. Open(OpenLayerManager), /// Shutdown layer manager where there are no more in-memory layers and persistent layers are /// read-only. 
Closed { layers: HashMap, }, } impl Default for LayerManager { fn default() -> Self { LayerManager::Open(OpenLayerManager::default()) } } impl LayerManager { fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer { match weak { ReadableLayerWeak::PersistentLayer(desc) => { ReadableLayer::PersistentLayer(self.get_from_desc(&desc)) } ReadableLayerWeak::InMemoryLayer(desc) => { let inmem = self .layer_map() .expect("no concurrent shutdown") .in_memory_layer(&desc); ReadableLayer::InMemoryLayer(inmem) } } } pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer { // The assumption for the `expect()` is that all code maintains the following invariant: // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. self.try_get_from_key(key) .with_context(|| format!("get layer from key: {key}")) .expect("not found") .clone() } pub(crate) fn try_get_from_key(&self, key: &PersistentLayerKey) -> Option<&Layer> { self.layers().get(key) } pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer { self.get_from_key(&desc.key()) } /// Get an immutable reference to the layer map. /// /// We expect users only to be able to get an immutable layer map. If users want to make modifications, /// they should use the below semantic APIs. This design makes us step closer to immutable storage state. pub(crate) fn layer_map(&self) -> Result<&LayerMap, Shutdown> { use LayerManager::*; match self { Open(OpenLayerManager { layer_map, .. }) => Ok(layer_map), Closed { .. } => Err(Shutdown), } } pub(crate) fn open_mut(&mut self) -> Result<&mut OpenLayerManager, Shutdown> { use LayerManager::*; match self { Open(open) => Ok(open), Closed { .. } => Err(Shutdown), } } /// LayerManager shutdown. The in-memory layers do cleanup on drop, so we must drop them in /// order to allow shutdown to complete. /// /// If there was a want to flush in-memory layers, it must have happened earlier. 
pub(crate) fn shutdown(&mut self, writer_state: &mut Option) { use LayerManager::*; match self { Open(OpenLayerManager { layer_map, layer_fmgr: LayerFileManager(hashmap), }) => { // NB: no need to decrement layer metrics; metrics are removed on timeline shutdown. let open = layer_map.open_layer.take(); let frozen = layer_map.frozen_layers.len(); let taken_writer_state = writer_state.take(); tracing::info!(open = open.is_some(), frozen, "dropped inmemory layers"); let layers = std::mem::take(hashmap); *self = Closed { layers }; assert_eq!(open.is_some(), taken_writer_state.is_some()); } Closed { .. } => { tracing::debug!("ignoring multiple shutdowns on layer manager") } } } /// Sum up the historic layer sizes pub(crate) fn layer_size_sum(&self) -> u64 { self.layers() .values() .map(|l| l.layer_desc().file_size) .sum() } pub(crate) fn likely_resident_layers(&self) -> impl Iterator + '_ { self.layers().values().filter(|l| l.is_likely_resident()) } pub(crate) fn visible_layers(&self) -> impl Iterator + '_ { self.layers() .values() .filter(|l| l.visibility() == LayerVisibilityHint::Visible) } pub(crate) fn contains(&self, layer: &Layer) -> bool { self.contains_key(&layer.layer_desc().key()) } pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool { self.layers().contains_key(key) } pub(crate) fn all_persistent_layers(&self) -> Vec { self.layers().keys().cloned().collect_vec() } /// Update the [`LayerFringe`] of a read request /// /// Take a key space at a given LSN and query the layer map below each range /// of the key space to find the next layers to visit. 
pub(crate) fn update_search_fringe(
        &self,
        keyspace: &KeySpace,
        cont_lsn: Lsn,
        fringe: &mut LayerFringe,
    ) -> Result<(), Shutdown> {
        // Fails with `Shutdown` once the manager has been closed.
        let map = self.layer_map()?;

        for range in keyspace.ranges.iter() {
            let results = map.range_search(range.clone(), cont_lsn);
            results
                .found
                .into_iter()
                .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
                    (
                        self.upgrade(layer),
                        keyspace_accum.to_keyspace(),
                        // Each found layer is to be read from its floor up to `cont_lsn`.
                        lsn_floor..cont_lsn,
                    )
                })
                .for_each(|(layer, keyspace, lsn_range)| fringe.update(layer, keyspace, lsn_range));
        }

        Ok(())
    }

    /// Persistent layers by key, regardless of whether the manager is open or closed.
    fn layers(&self) -> &HashMap<PersistentLayerKey, Layer> {
        use LayerManager::*;
        match self {
            Open(OpenLayerManager { layer_fmgr, .. }) => &layer_fmgr.0,
            Closed { layers } => layers,
        }
    }
}

/// State of a layer manager that has not been shut down: the layer map plus the
/// mapping from layer keys to layers.
#[derive(Default)]
pub(crate) struct OpenLayerManager {
    layer_map: LayerMap,
    layer_fmgr: LayerFileManager<Layer>,
}

impl std::fmt::Debug for OpenLayerManager {
    // Only the number of tracked persistent layers is printed.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("OpenLayerManager")
            .field("layer_count", &self.layer_fmgr.0.len())
            .finish()
    }
}

/// Error returned by operations that need an open layer manager.
#[derive(Debug, thiserror::Error)]
#[error("layer manager has been shutdown")]
pub(crate) struct Shutdown;

impl OpenLayerManager {
    /// Called from `load_layer_map`. Initialize the layer manager with:
    /// 1. all on-disk layers
    /// 2. next open layer (with disk_consistent_lsn LSN)
    pub(crate) fn initialize_local_layers(&mut self, layers: Vec<Layer>, next_open_layer_at: Lsn) {
        let mut updates = self.layer_map.batch_update();
        for layer in layers {
            Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
        }
        updates.flush();
        self.layer_map.next_open_layer_at = Some(next_open_layer_at);
    }

    /// Initialize when creating a new timeline, called in `init_empty_layer_map`.
    pub(crate) fn initialize_empty(&mut self, next_open_layer_at: Lsn) {
        self.layer_map.next_open_layer_at = Some(next_open_layer_at);
    }

    /// Open a new writable layer to append data if there is no open layer, otherwise return the
    /// current open layer, called within `get_layer_for_write`.
#[allow(clippy::too_many_arguments)] pub(crate) async fn get_layer_for_write( &mut self, lsn: Lsn, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, gate: &utils::sync::gate::Gate, cancel: &CancellationToken, ctx: &RequestContext, ) -> anyhow::Result> { ensure!(lsn.is_aligned()); // Do we have a layer open for writing already? let layer = if let Some(open_layer) = &self.layer_map.open_layer { if open_layer.get_lsn_range().start > lsn { bail!( "unexpected open layer in the future: open layers starts at {}, write lsn {}", open_layer.get_lsn_range().start, lsn ); } Arc::clone(open_layer) } else { // No writeable layer yet. Create one. let start_lsn = self .layer_map .next_open_layer_at .context("No next open layer found")?; trace!( "creating in-memory layer at {}/{} for record at {}", timeline_id, start_lsn, lsn ); let new_layer = InMemoryLayer::create( conf, timeline_id, tenant_shard_id, start_lsn, gate, cancel, ctx, ) .await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); self.layer_map.next_open_layer_at = None; layer }; Ok(layer) } /// Tries to freeze an open layer and also manages clearing the TimelineWriterState. /// /// Returns true if anything was frozen. pub(super) async fn try_freeze_in_memory_layer( &mut self, lsn: Lsn, last_freeze_at: &AtomicLsn, write_lock: &mut tokio::sync::MutexGuard<'_, Option>, metrics: &TimelineMetrics, ) -> bool { let Lsn(last_record_lsn) = lsn; let end_lsn = Lsn(last_record_lsn + 1); let froze = if let Some(open_layer) = &self.layer_map.open_layer { let open_layer_rc = Arc::clone(open_layer); open_layer.freeze(end_lsn).await; // Increment the frozen layer metrics. This is decremented in `finish_flush_l0_layer()`. // TODO: It would be nicer to do this via `InMemoryLayer::drop()`, but it requires a // reference to the timeline metrics. Other methods use a metrics borrow as well. 
metrics.inc_frozen_layer(open_layer); // The layer is no longer open, update the layer map to reflect this. // We will replace it with on-disk historics below. self.layer_map.frozen_layers.push_back(open_layer_rc); self.layer_map.open_layer = None; self.layer_map.next_open_layer_at = Some(end_lsn); true } else { false }; // Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this // accounts for regions in the LSN range where we might have ingested no data due to sharding. last_freeze_at.store(end_lsn); // the writer state must no longer have a reference to the frozen layer let taken = write_lock.take(); assert_eq!( froze, taken.is_some(), "should only had frozen a layer when TimelineWriterState existed" ); froze } /// Add image layers to the layer map, called from [`super::Timeline::create_image_layers`]. pub(crate) fn track_new_image_layers( &mut self, image_layers: &[ResidentLayer], metrics: &TimelineMetrics, ) { let mut updates = self.layer_map.batch_update(); for layer in image_layers { Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr); // record these here instead of Layer::finish_creating because otherwise partial // failure with create_image_layers would balloon up the physical size gauge. downside // is that all layers need to be created before metrics are updated. metrics.record_new_file_metrics(layer.layer_desc().file_size); } updates.flush(); } /// Flush a frozen layer and add the written delta layer to the layer map. pub(crate) fn finish_flush_l0_layer( &mut self, delta_layer: Option<&ResidentLayer>, frozen_layer_for_check: &Arc, metrics: &TimelineMetrics, ) { let inmem = self .layer_map .frozen_layers .pop_front() .expect("there must be a inmem layer to flush"); metrics.dec_frozen_layer(&inmem); // Only one task may call this function at a time (for this // timeline). If two tasks tried to flush the same frozen // layer to disk at the same time, that would not work. 
assert_eq!(Arc::as_ptr(&inmem), Arc::as_ptr(frozen_layer_for_check)); if let Some(l) = delta_layer { let mut updates = self.layer_map.batch_update(); Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr); metrics.record_new_file_metrics(l.layer_desc().file_size); updates.flush(); } } /// Called when compaction is completed. pub(crate) fn finish_compact_l0( &mut self, compact_from: &[Layer], compact_to: &[ResidentLayer], metrics: &TimelineMetrics, ) { let mut updates = self.layer_map.batch_update(); for l in compact_to { Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr); metrics.record_new_file_metrics(l.layer_desc().file_size); } for l in compact_from { Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr); } updates.flush(); } /// Called when a GC-compaction is completed. pub(crate) fn finish_gc_compaction( &mut self, compact_from: &[Layer], compact_to: &[ResidentLayer], metrics: &TimelineMetrics, ) { // gc-compaction could contain layer rewrites. We need to delete the old layers and insert the new ones. // Match the old layers with the new layers let mut add_layers = HashMap::new(); let mut rewrite_layers = HashMap::new(); let mut drop_layers = HashMap::new(); for layer in compact_from { drop_layers.insert(layer.layer_desc().key(), layer.clone()); } for layer in compact_to { if let Some(old_layer) = drop_layers.remove(&layer.layer_desc().key()) { rewrite_layers.insert(layer.layer_desc().key(), (old_layer.clone(), layer.clone())); } else { add_layers.insert(layer.layer_desc().key(), layer.clone()); } } let add_layers = add_layers.values().cloned().collect::>(); let drop_layers = drop_layers.values().cloned().collect::>(); let rewrite_layers = rewrite_layers.values().cloned().collect::>(); self.rewrite_layers_inner(&rewrite_layers, &drop_layers, &add_layers, metrics); } /// Called post-compaction when some previous generation image layers were trimmed. 
pub fn rewrite_layers( &mut self, rewrite_layers: &[(Layer, ResidentLayer)], drop_layers: &[Layer], metrics: &TimelineMetrics, ) { self.rewrite_layers_inner(rewrite_layers, drop_layers, &[], metrics); } fn rewrite_layers_inner( &mut self, rewrite_layers: &[(Layer, ResidentLayer)], drop_layers: &[Layer], add_layers: &[ResidentLayer], metrics: &TimelineMetrics, ) { let mut updates = self.layer_map.batch_update(); for (old_layer, new_layer) in rewrite_layers { debug_assert_eq!( old_layer.layer_desc().key_range, new_layer.layer_desc().key_range ); debug_assert_eq!( old_layer.layer_desc().lsn_range, new_layer.layer_desc().lsn_range ); // Transfer visibility hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to // be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents // always marking rewritten layers as visible. new_layer.as_ref().set_visibility(old_layer.visibility()); // Safety: we may never rewrite the same file in-place. Callers are responsible // for ensuring that they only rewrite layers after something changes the path, // such as an increment in the generation number. assert_ne!(old_layer.local_path(), new_layer.local_path()); Self::delete_historic_layer(old_layer, &mut updates, &mut self.layer_fmgr); Self::insert_historic_layer( new_layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr, ); metrics.record_new_file_metrics(new_layer.layer_desc().file_size); } for l in drop_layers { Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr); } for l in add_layers { Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr); metrics.record_new_file_metrics(l.layer_desc().file_size); } updates.flush(); } /// Called when garbage collect has selected the layers to be removed. 
pub(crate) fn finish_gc_timeline(&mut self, gc_layers: &[Layer]) {
        let mut updates = self.layer_map.batch_update();
        for doomed_layer in gc_layers {
            Self::delete_historic_layer(doomed_layer, &mut updates, &mut self.layer_fmgr);
        }
        updates.flush()
    }

    /// Test-only: inserts `layer` into the layer map and the layer mapping.
    #[cfg(test)]
    pub(crate) fn force_insert_layer(&mut self, layer: ResidentLayer) {
        let mut updates = self.layer_map.batch_update();
        Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
        updates.flush()
    }

    /// Helper function to insert a layer into the layer map and file manager.
    fn insert_historic_layer(
        layer: Layer,
        updates: &mut BatchedUpdates<'_>,
        mapping: &mut LayerFileManager<Layer>,
    ) {
        updates.insert_historic(layer.layer_desc().clone());
        mapping.insert(layer);
    }

    /// Removes the layer from local FS (if present) and from memory.
    /// Remote storage is not affected by this operation.
    fn delete_historic_layer(
        // we cannot remove layers otherwise, since gc and compaction will race
        layer: &Layer,
        updates: &mut BatchedUpdates<'_>,
        mapping: &mut LayerFileManager<Layer>,
    ) {
        let desc = layer.layer_desc();

        // TODO Removing from the bottom of the layer map is expensive.
        //      Maybe instead discard all layer map historic versions that
        //      won't be needed for page reconstruction for this timeline,
        //      and mark what we can't delete yet as deleted from the layer
        //      map index without actually rebuilding the index.
        updates.remove_historic(desc);
        mapping.remove(layer);
        layer.delete_on_drop();
    }

    /// Test-only: installs `layer` as the open layer or appends it to the frozen
    /// layers, depending on what `layer.info()` reports.
    #[cfg(test)]
    pub(crate) fn force_insert_in_memory_layer(&mut self, layer: Arc<InMemoryLayer>) {
        use pageserver_api::models::InMemoryLayerInfo;
        match layer.info() {
            InMemoryLayerInfo::Open { .. } => {
                assert!(self.layer_map.open_layer.is_none());
                self.layer_map.open_layer = Some(layer);
            }
            InMemoryLayerInfo::Frozen { lsn_start, .. } => {
                // Frozen layers must be appended in LSN order.
                if let Some(last) = self.layer_map.frozen_layers.back() {
                    assert!(last.get_lsn_range().end <= lsn_start);
                }
                self.layer_map.frozen_layers.push_back(layer);
            }
        }
    }
}

/// Maps a persistent layer key to the layer object tracked for it.
pub(crate) struct LayerFileManager<T = Layer>(HashMap<PersistentLayerKey, T>);

impl<T> Default for LayerFileManager<T> {
    fn default() -> Self {
        Self(HashMap::default())
    }
}

impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
    /// Stores `layer` under its descriptor key. In builds with debug assertions,
    /// panics if an entry for that key already existed.
    pub(crate) fn insert(&mut self, layer: T) {
        let present = self.0.insert(layer.layer_desc().key(), layer.clone());
        if present.is_some() && cfg!(debug_assertions) {
            panic!("overwriting a layer: {:?}", layer.layer_desc())
        }
    }

    /// Removes the entry for `layer`'s descriptor key. In builds with debug
    /// assertions, panics if there was no such entry.
    pub(crate) fn remove(&mut self, layer: &T) {
        let present = self.0.remove(&layer.layer_desc().key());
        if present.is_none() && cfg!(debug_assertions) {
            panic!(
                "removing layer that is not present in layer mapping: {:?}",
                layer.layer_desc()
            )
        }
    }
}

================================================
FILE: pageserver/src/tenant/timeline/logical_size.rs
================================================
use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};

use anyhow::Context;
use once_cell::sync::OnceCell;
use tokio_util::sync::CancellationToken;
use utils::lsn::Lsn;

/// Internal structure to hold all data needed for logical size calculation.
///
/// Calculation consists of two stages:
///
/// 1. Initial size calculation. That might take a long time, because it requires
///    reading all layers containing relation sizes at `initial_part_end`.
///
/// 2. Collecting an incremental part and adding that to the initial size.
///    Increments are appended on walreceiver writing new timeline data,
///    which result in increase or decrease of the logical size.
pub(super) struct LogicalSize {
    /// Size, potentially slow to compute. Calculating this might require reading multiple
    /// layers, and even ancestor's layers.
    ///
    /// NOTE: size at a given LSN is constant, but after a restart we will calculate
    /// the initial size at a different LSN.
    pub initial_logical_size: OnceCell<(
        u64,
        crate::metrics::initial_logical_size::FinishedCalculationGuard,
    )>,

    /// Cancellation for the best-effort logical size calculation.
    ///
    /// The token is kept in a once-cell so that we can error out if a higher priority
    /// request comes in *before* we have started the normal logical size calculation.
    pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore:
        OnceCell<CancellationToken>,

    /// Once the initial logical size is initialized, this is notified.
    pub(crate) initialized: tokio::sync::Semaphore,

    /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
    pub initial_part_end: Option<Lsn>,

    /// All other size changes after startup, combined together.
    ///
    /// Size shouldn't ever be negative, but this is signed for two reasons:
    ///
    /// 1. If we initialized the "baseline" size lazily, while we already
    ///    process incoming WAL, the incoming WAL records could decrement the
    ///    variable and temporarily make it negative. (This is just future-proofing;
    ///    the initialization is currently not done lazily.)
    ///
    /// 2. If there is a bug and we e.g. forget to increment it in some cases
    ///    when size grows, but remember to decrement it when it shrinks again, the
    ///    variable could go negative. In that case, it seems better to at least
    ///    try to keep tracking it, rather than clamp or overflow it. Note that
    ///    get_current_logical_size() will clamp the returned value to zero if it's
    ///    negative, and log an error. Could set it permanently to zero or some
    ///    special value to indicate "broken" instead, but this will do for now.
    ///
    /// Note that we also expose a copy of this value as a prometheus metric,
    /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
    /// to modify this, it will also keep the prometheus metric in sync.
    pub size_added_after_initial: AtomicI64,

    /// For [`crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE`].
    pub(super) did_return_approximate_to_walreceiver: AtomicBool,
}

/// Normalized current size, that the data in pageserver occupies.
#[derive(Debug, Clone, Copy)]
pub(crate) enum CurrentLogicalSize {
    /// The size is not yet calculated to the end, this is an intermediate result,
    /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
    /// yet total logical size cannot be below 0.
    Approximate(Approximate),
    // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
    // available for observation without any calculations.
    Exact(Exact),
}

/// Tag describing which variant of [`CurrentLogicalSize`] a value is.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub(crate) enum Accuracy {
    Approximate,
    Exact,
}

/// Payload of [`CurrentLogicalSize::Approximate`].
#[derive(Debug, Clone, Copy)]
pub(crate) struct Approximate(u64);

/// Payload of [`CurrentLogicalSize::Exact`].
#[derive(Debug, Clone, Copy)]
pub(crate) struct Exact(u64);

impl From<&Approximate> for u64 {
    fn from(value: &Approximate) -> Self {
        value.0
    }
}

impl From<&Exact> for u64 {
    fn from(val: &Exact) -> Self {
        val.0
    }
}

impl Approximate {
    /// For use in situations where we don't have a sane logical size value but need
    /// to return something, e.g. in HTTP API on shard >0 of a sharded tenant.
pub(crate) fn zero() -> Self { Self(0) } } impl CurrentLogicalSize { pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 { match self { Self::Approximate(size) => size.into(), Self::Exact(size) => size.into(), } } pub(crate) fn accuracy(&self) -> Accuracy { match self { Self::Approximate(_) => Accuracy::Approximate, Self::Exact(_) => Accuracy::Exact, } } pub(crate) fn is_exact(&self) -> bool { matches!(self, Self::Exact(_)) } } impl LogicalSize { pub(super) fn empty_initial() -> Self { Self { initial_logical_size: OnceCell::with_value((0, { crate::metrics::initial_logical_size::START_CALCULATION .first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial) .calculation_result_saved() })), cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(), initial_part_end: None, size_added_after_initial: AtomicI64::new(0), did_return_approximate_to_walreceiver: AtomicBool::new(false), initialized: tokio::sync::Semaphore::new(0), } } pub(super) fn deferred_initial(compute_to: Lsn) -> Self { Self { initial_logical_size: OnceCell::new(), cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(), initial_part_end: Some(compute_to), size_added_after_initial: AtomicI64::new(0), did_return_approximate_to_walreceiver: AtomicBool::new(false), initialized: tokio::sync::Semaphore::new(0), } } pub(super) fn current_size(&self) -> CurrentLogicalSize { let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire); // ^^^ keep this type explicit so that the casts in this function break if // we change the type. 
match self.initial_logical_size.get() { Some((initial_size, _)) => { CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment) .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}")) .unwrap())) } None => { let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0); CurrentLogicalSize::Approximate(Approximate(non_negative_size_increment)) } } } pub(super) fn increment_size(&self, delta: i64) { self.size_added_after_initial .fetch_add(delta, AtomicOrdering::SeqCst); } /// Make the value computed by initial logical size computation /// available for re-use. This doesn't contain the incremental part. pub(super) fn initialized_size(&self, lsn: Lsn) -> Option { match self.initial_part_end { Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, _)| *s), _ => None, } } } ================================================ FILE: pageserver/src/tenant/timeline/offload.rs ================================================ use std::sync::Arc; use pageserver_api::models::{TenantState, TimelineState}; use super::Timeline; use super::delete::{DeletionGuard, delete_local_timeline_directory}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; use crate::tenant::timeline::delete::{TimelineDeleteGuardKind, make_timeline_delete_guard}; use crate::tenant::{ DeleteTimelineError, OffloadedTimeline, TenantManifestError, TenantShard, TimelineOrOffloaded, }; #[derive(thiserror::Error, Debug)] pub(crate) enum OffloadError { #[error("Cancelled")] Cancelled, #[error("Timeline is not archived")] NotArchived, #[error("Offload or deletion already in progress")] AlreadyInProgress, #[error("Unexpected offload error: {0}")] Other(anyhow::Error), } impl From for OffloadError { fn from(e: TenantManifestError) -> Self { match e { TenantManifestError::Cancelled => Self::Cancelled, 
TenantManifestError::RemoteStorage(e) => Self::Other(e), } } }
// ^ tail of a definition that begins above this chunk; left untouched.

/// Offloads an archived timeline: shuts it down, deletes its local directory,
/// moves it from `tenant.timelines` into `tenant.timelines_offloaded`, and
/// finally uploads the tenant manifest.
///
/// Outcomes visible in this function:
/// - `Err(OffloadError::NotArchived)` if the delete guard reports children, or
///   if the remote client refuses `shutdown_if_archived` with `NotArchived`.
/// - `Err(OffloadError::AlreadyInProgress)` if an offload or deletion already
///   holds the guard.
/// - `Err(OffloadError::Cancelled)` if the tenant is `Stopping` or `Broken` at
///   the moment the offloaded entry would be inserted. Note that by then the
///   local directory has already been deleted and the timeline removed from
///   `tenant.timelines`.
/// - `Ok(())` without doing any work if the guard resolves to an entry that is
///   already offloaded.
///
/// NOTE(review): `&Arc` below appears to have lost its type argument when this
/// listing was extracted — confirm against the repository.
pub(crate) async fn offload_timeline(
    tenant: &TenantShard,
    timeline: &Arc,
) -> Result<(), OffloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();
    tracing::info!("offloading archived timeline");

    let delete_guard_res = make_timeline_delete_guard(
        tenant,
        timeline.timeline_id,
        TimelineDeleteGuardKind::Offload,
    );
    // From here on `timeline` is shadowed by whatever the guard lookup returned.
    let (timeline, guard) = match delete_guard_res {
        Ok(timeline_and_guard) => timeline_and_guard,
        Err(DeleteTimelineError::HasChildren(children)) => {
            // Both sub-cases return `NotArchived`; they differ only in log severity.
            let is_archived = timeline.is_archived();
            if is_archived == Some(true) {
                tracing::error!("timeline is archived but has non-archived children: {children:?}");
                return Err(OffloadError::NotArchived);
            }
            tracing::info!(
                ?is_archived,
                "timeline is not archived and has unarchived children"
            );
            return Err(OffloadError::NotArchived);
        }
        Err(DeleteTimelineError::AlreadyInProgress(_)) => {
            tracing::info!("timeline offload or deletion already in progress");
            return Err(OffloadError::AlreadyInProgress);
        }
        Err(e) => return Err(OffloadError::Other(anyhow::anyhow!(e))),
    };

    let TimelineOrOffloaded::Timeline(timeline) = timeline else {
        tracing::error!("timeline already offloaded, but given timeline object");
        return Ok(());
    };

    match timeline.remote_client.shutdown_if_archived().await {
        Ok(()) => {}
        Err(ShutdownIfArchivedError::NotInitialized(_)) => {
            // Either the timeline is being deleted, the operation is being retried, or we are shutting down.
            // Don't return cancelled here to keep it idempotent.
        }
        Err(ShutdownIfArchivedError::NotArchived) => return Err(OffloadError::NotArchived),
    }
    timeline.set_state(TimelineState::Stopping);

    // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
    timeline.shutdown(super::ShutdownMode::Reload).await;

    // TODO extend guard mechanism above with method
    // to make deletions possible while offloading is in progress

    let conf = &tenant.conf;
    delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await;

    // The guard is passed only as proof that it is held; see `remove_timeline_from_tenant`.
    let remaining_refcount = remove_timeline_from_tenant(tenant, &timeline, &guard);

    {
        let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
        if matches!(
            tenant.current_state(),
            TenantState::Stopping { .. } | TenantState::Broken { .. }
        ) {
            // Cancel the operation if the tenant is shutting down. Do this while the
            // timelines_offloaded lock is held to prevent a race with Tenant::shutdown
            // for defusing the lock
            return Err(OffloadError::Cancelled);
        }
        offloaded_timelines.insert(
            timeline.timeline_id,
            Arc::new(
                OffloadedTimeline::from_timeline(&timeline)
                    .expect("we checked above that timeline was ready"),
            ),
        );
    }

    // Last step: mark timeline as offloaded in S3
    // TODO: maybe move this step above, right above deletion of the local timeline directory,
    // then there is no potential race condition where we partially offload a timeline, and
    // at the next restart attach it again.
    // For that to happen, we'd need to make the manifest reflect our *intended* state,
    // not our actual state of offloaded timelines.
    tenant.maybe_upload_tenant_manifest().await?;

    tracing::info!("Timeline offload complete (remaining arc refcount: {remaining_refcount})");

    Ok(())
}

/// It is important that this gets called when DeletionGuard is being held.
/// For more context see comments in [`make_timeline_delete_guard`]
///
/// Removes `timeline` from `tenant.timelines` and drops its entry from
/// `tenant.scheduled_compaction_tasks`.
///
/// Returns the strong count of the timeline `Arc`
///
/// # Panics
///
/// Panics if any timeline in the map names this one as its ancestor, or if the
/// timeline is no longer present in the map.
fn remove_timeline_from_tenant(
    tenant: &TenantShard,
    timeline: &Timeline,
    _: &DeletionGuard, // using it as a witness
) -> usize {
    // Remove the timeline from the map.
    let mut timelines = tenant.timelines.lock().unwrap();
    let children_exist = timelines
        .iter()
        .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id));
    // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
    // We already deleted the layer files, so it's probably best to panic.
    // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
    if children_exist {
        panic!("Timeline grew children while we removed layer files");
    }

    // `timeline` is shadowed by the value removed from the map; its strong count is what we return.
    let timeline = timelines
        .remove(&timeline.timeline_id)
        .expect("timeline that we were deleting was concurrently removed from 'timelines' map");

    // Clear the compaction queue for this timeline
    tenant
        .scheduled_compaction_tasks
        .lock()
        .unwrap()
        .remove(&timeline.timeline_id);

    Arc::strong_count(&timeline)
}

================================================
FILE: pageserver/src/tenant/timeline/span.rs
================================================

================================================
FILE: pageserver/src/tenant/timeline/uninit.rs
================================================
use std::collections::hash_map::Entry;
use std::fs;
use std::future::Future;
use std::sync::Arc;

use anyhow::Context;
use camino::Utf8PathBuf;
use tracing::{error, info, info_span};
use utils::fs_ext;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use utils::sync::gate::GateGuard;

use super::Timeline;
use crate::context::RequestContext;
use crate::import_datadir;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::{
    CreateTimelineError, CreateTimelineIdempotency, TenantShard, TimelineOrOffloaded,
};

/// A timeline with some of its files on disk, being initialized.
/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
/// its local files are removed.
/// If we crash while this class exists, then the timeline's local
/// state is cleaned up during [`TenantShard::clean_up_timelines`], because the timeline's content isn't in remote storage.
///
/// The caller is responsible for proper timeline data filling before the final init.
///
/// NOTE(review): `Arc` in the field and constructor types below appears to have
/// lost its type argument when this listing was extracted — confirm against the
/// repository.
#[must_use]
pub struct UninitializedTimeline<'t> {
    /// Tenant whose `timelines` map the finished timeline is inserted into.
    pub(crate) owning_tenant: &'t TenantShard,
    /// Id of the timeline being created.
    timeline_id: TimelineId,
    /// The timeline under construction together with its creation guard.
    /// Becomes `None` once `finish_creation`, `finish_creation_myself` or `drop` has taken it.
    raw_timeline: Option<(Arc, TimelineCreateGuard)>,
    /// Whether we spawned the inner Timeline's tasks such that we must later shut it down
    /// if aborting the timeline creation
    needs_shutdown: bool,
}

impl<'t> UninitializedTimeline<'t> {
    /// Wraps a freshly created raw timeline (or `None`) for `owning_tenant`.
    /// `needs_shutdown` starts out `false`; it is set by `write`.
    pub(crate) fn new(
        owning_tenant: &'t TenantShard,
        timeline_id: TimelineId,
        raw_timeline: Option<(Arc, TimelineCreateGuard)>,
    ) -> Self {
        Self {
            owning_tenant,
            timeline_id,
            raw_timeline,
            needs_shutdown: false,
        }
    }

    /// When writing data to this timeline during creation, use this wrapper: it will take care of
    /// setup of Timeline tasks required for I/O (flush loop) and making sure they are torn down
    /// later.
pub(crate) async fn write(&mut self, f: F) -> anyhow::Result<()> where F: FnOnce(Arc) -> Fut, Fut: Future>, { debug_assert_current_span_has_tenant_and_timeline_id(); // Remember that we did I/O (spawned the flush loop), so that we can check we shut it down on drop self.needs_shutdown = true; let timeline = self.raw_timeline()?; // Spawn flush loop so that the Timeline is ready to accept writes timeline.maybe_spawn_flush_loop(); // Invoke the provided function, which will write some data into the new timeline if let Err(e) = f(timeline.clone()).await { self.abort().await; return Err(e.into()); } // Flush the underlying timeline's ephemeral layers to disk if let Err(e) = timeline .freeze_and_flush() .await .context("Failed to flush after timeline creation writes") { self.abort().await; return Err(e); } Ok(()) } pub(crate) async fn abort(&self) { if let Some((raw_timeline, _)) = self.raw_timeline.as_ref() { raw_timeline.shutdown(super::ShutdownMode::Hard).await; } } /// Finish timeline creation: insert it into the Tenant's timelines map /// /// This function launches the flush loop if not already done. /// /// The caller is responsible for activating the timeline (function `.activate()`). 
pub(crate) async fn finish_creation(mut self) -> anyhow::Result> { let timeline_id = self.timeline_id; let tenant_shard_id = self.owning_tenant.tenant_shard_id; if self.raw_timeline.is_none() { self.abort().await; return Err(anyhow::anyhow!( "No timeline for initialization found for {tenant_shard_id}/{timeline_id}" )); } // Check that the caller initialized disk_consistent_lsn let new_disk_consistent_lsn = self .raw_timeline .as_ref() .expect("checked above") .0 .get_disk_consistent_lsn(); if !new_disk_consistent_lsn.is_valid() { self.abort().await; return Err(anyhow::anyhow!( "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn" )); } let mut timelines = self.owning_tenant.timelines.lock().unwrap(); match timelines.entry(timeline_id) { Entry::Occupied(_) => { // Unexpected, bug in the caller. Tenant is responsible for preventing concurrent creation of the same timeline. // // We do not call Self::abort here. Because we don't cleanly shut down our Timeline, [`Self::drop`] should // skip trying to delete the timeline directory too. anyhow::bail!( "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map" ) } Entry::Vacant(v) => { // after taking here should be no fallible operations, because the drop guard will not // cleanup after and would block for example the tenant deletion let (new_timeline, _create_guard) = self.raw_timeline.take().expect("already checked"); v.insert(Arc::clone(&new_timeline)); new_timeline.maybe_spawn_flush_loop(); Ok(new_timeline) } } } pub(crate) fn finish_creation_myself(&mut self) -> (Arc, TimelineCreateGuard) { self.raw_timeline.take().expect("already checked") } /// Prepares timeline data by loading it from the basebackup archive. 
///
/// Runs the import inside [`Self::write`], then inserts the timeline into the tenant
/// via [`Self::finish_creation`], activates it and returns it.
///
/// NOTE(review): `Arc` and the return type appear to have lost their type arguments
/// when this listing was extracted — confirm against the repository.
pub(crate) async fn import_basebackup_from_tar(
    mut self,
    tenant: Arc,
    copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
    base_lsn: Lsn,
    broker_client: storage_broker::BrokerClientChannel,
    ctx: &RequestContext,
) -> anyhow::Result> {
    self.write(|raw_timeline| async move {
        import_datadir::import_basebackup_from_tar(&raw_timeline, copyin_read, base_lsn, ctx)
            .await
            .context("Failed to import basebackup")
            .map_err(CreateTimelineError::Other)?;

        // Failpoint hook: when triggered, the closure returns this error instead of `Ok(())`.
        fail::fail_point!("before-checkpoint-new-timeline", |_| {
            Err(CreateTimelineError::Other(anyhow::anyhow!(
                "failpoint before-checkpoint-new-timeline"
            )))
        });

        Ok(())
    })
    .await?;

    // All the data has been imported. Insert the Timeline into the tenant's timelines map
    let tl = self.finish_creation().await?;
    tl.activate(tenant, broker_client, None, ctx);
    Ok(tl)
}

/// Returns the raw timeline handle, or an error naming the tenant shard and timeline
/// if it has already been taken.
pub(crate) fn raw_timeline(&self) -> anyhow::Result<&Arc> {
    Ok(&self
        .raw_timeline
        .as_ref()
        .with_context(|| {
            format!(
                "No raw timeline {}/{} found",
                self.owning_tenant.tenant_shard_id, self.timeline_id
            )
        })?
        .0)
}
} // closes `impl UninitializedTimeline`, opened above this block

impl Drop for UninitializedTimeline<'_> {
    /// If the raw timeline was never taken, either removes its directory or — when
    /// `write` was used and `timeline.gate.close_complete()` is false — only warns
    /// and leaves the files in place.
    fn drop(&mut self) {
        if let Some((timeline, create_guard)) = self.raw_timeline.take() {
            let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered();
            if self.needs_shutdown && !timeline.gate.close_complete() {
                // This should not happen: caller should call [`Self::abort`] on failures
                tracing::warn!(
                    "Timeline not shut down after initialization failure, cannot clean up files"
                );
            } else {
                // This is unusual, but can happen harmlessly if the pageserver is stopped while
                // creating a timeline.
                info!("Timeline got dropped without initializing, cleaning its files");
                cleanup_timeline_directory(create_guard);
            }
        }
    }
}

/// Removes the timeline directory named by `create_guard.timeline_path`, logs the
/// outcome, and only then drops the guard.
///
/// Failures are logged, not returned. The removal is wrapped in
/// `fs_ext::ignore_absent_files`, which presumably treats a missing directory as
/// success — verify against that helper.
pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) {
    let timeline_path = &create_guard.timeline_path;
    match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
        Ok(()) => {
            info!("Timeline dir {timeline_path:?} removed successfully")
        }
        Err(e) => {
            error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
        }
    }
    // Having cleaned up, we can release this TimelineId in `[TenantShard::timelines_creating]` to allow other
    // timeline creation attempts under this TimelineId to proceed
    drop(create_guard);
}

/// A guard for timeline creations in process: as long as this object exists, the timeline ID
/// is kept in `[TenantShard::timelines_creating]` to exclude concurrent attempts to create the same timeline.
///
/// NOTE(review): `owning_tenant: Arc` appears to have lost its type argument when
/// this listing was extracted — confirm against the repository.
#[must_use]
pub(crate) struct TimelineCreateGuard {
    /// Guard obtained from the tenant's gate in `new`; held for the guard's lifetime.
    pub(crate) _tenant_gate_guard: GateGuard,
    /// Tenant whose `timelines_creating` set holds `timeline_id`.
    pub(crate) owning_tenant: Arc,
    pub(crate) timeline_id: TimelineId,
    /// Local directory of the timeline; removed by `cleanup_timeline_directory`.
    pub(crate) timeline_path: Utf8PathBuf,
    /// Handed back in `TimelineExclusionError::AlreadyExists` when the id is already taken.
    pub(crate) idempotency: CreateTimelineIdempotency,
}

/// Errors when acquiring exclusive access to a timeline ID for creation
#[derive(thiserror::Error, Debug)]
pub(crate) enum TimelineExclusionError {
    /// The id is already present in the tenant's `timelines` map or, unless
    /// offloaded timelines are allowed, in `timelines_offloaded`.
    #[error("Already exists")]
    AlreadyExists {
        existing: TimelineOrOffloaded,
        arg: CreateTimelineIdempotency,
    },
    /// Another creation of the same id is registered in `timelines_creating`.
    #[error("Already creating")]
    AlreadyCreating,
    /// The tenant's gate could not be entered.
    #[error("Shutting down")]
    ShuttingDown,

    // e.g. I/O errors, or some failure deep in postgres initdb
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

impl TimelineCreateGuard {
    /// Reserves `timeline_id` for creation on `owning_tenant`.
    ///
    /// Enters the tenant gate, then checks — under the `timelines`,
    /// `timelines_offloaded` and `timelines_creating` locks — that the id is neither
    /// existing nor being created, and records it in `timelines_creating`.
    /// With `allow_offloaded` set, an offloaded timeline with the same id is not an error.
    ///
    /// NOTE(review): `&Arc` and the bare `Result` return type appear to have lost
    /// their type arguments when this listing was extracted — confirm against the
    /// repository.
    pub(crate) fn new(
        owning_tenant: &Arc,
        timeline_id: TimelineId,
        timeline_path: Utf8PathBuf,
        idempotency: CreateTimelineIdempotency,
        allow_offloaded: bool,
    ) -> Result {
        let _tenant_gate_guard = owning_tenant
            .gate
            .enter()
            .map_err(|_| TimelineExclusionError::ShuttingDown)?;

        // Lock order: this is the only place we take these locks together
        // (timelines, then timelines_offloaded, then timelines_creating). During drop() we only
        // lock creating_timelines
        let timelines = owning_tenant.timelines.lock().unwrap();
        let timelines_offloaded = owning_tenant.timelines_offloaded.lock().unwrap();
        let mut creating_timelines: std::sync::MutexGuard<
            '_,
            std::collections::HashSet,
        > = owning_tenant.timelines_creating.lock().unwrap();

        if let Some(existing) = timelines.get(&timeline_id) {
            return Err(TimelineExclusionError::AlreadyExists {
                existing: TimelineOrOffloaded::Timeline(existing.clone()),
                arg: idempotency,
            });
        }
        if !allow_offloaded {
            if let Some(existing) = timelines_offloaded.get(&timeline_id) {
                return Err(TimelineExclusionError::AlreadyExists {
                    existing: TimelineOrOffloaded::Offloaded(existing.clone()),
                    arg: idempotency,
                });
            }
        }
        if creating_timelines.contains(&timeline_id) {
            return Err(TimelineExclusionError::AlreadyCreating);
        }
        creating_timelines.insert(timeline_id);
        // Release the locks explicitly, in reverse acquisition order.
        drop(creating_timelines);
        drop(timelines_offloaded);
        drop(timelines);
        Ok(Self {
            _tenant_gate_guard,
            owning_tenant: Arc::clone(owning_tenant),
            timeline_id,
            timeline_path,
            idempotency,
        })
    }
}

impl Drop for TimelineCreateGuard {
    /// Removes this guard's timeline id from the tenant's `timelines_creating` set.
    fn drop(&mut self) {
        self.owning_tenant
            .timelines_creating
            .lock()
            .unwrap()
            .remove(&self.timeline_id);
    }
}

================================================
FILE: pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
================================================
//! WAL receiver logic that ensures the pageserver gets connected to safekeeper,
//! that contains the latest WAL to stream and this connection does not go stale.
//!
//! To achieve that, a storage broker is used: safekeepers propagate their timelines' state in it,
//! the manager subscribes for changes and accumulates those to query the one with the biggest Lsn for connection.
//! Current connection state is tracked too, to ensure it's not getting stale.
//!
//! After every connection or storage broker update fetched, the state gets updated correspondingly and rechecked for the new connection leader,
//! then a (re)connection happens, if necessary.
//! Only WAL streaming task expects to be finished, other loops (storage broker, connection management) never exit unless cancelled explicitly via the dedicated channel.

use std::collections::HashMap;
use std::num::NonZeroU64;
use std::ops::ControlFlow;
use std::sync::Arc;
use std::time::Duration;

use anyhow::Context;
use chrono::{NaiveDateTime, Utc};
use pageserver_api::models::TimelineState;
use postgres_connection::PgConnectionConfig;
use storage_broker::proto::{
    FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse,
    SubscribeByFilterRequest, TenantTimelineId as ProtoTenantTimelineId, TypeSubscription,
    TypedMessage,
};
use storage_broker::{BrokerClientChannel, Code, Streaming};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::backoff::{
    DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff,
};
use utils::id::{NodeId, TenantTimelineId};
use utils::lsn::Lsn;
use utils::postgres_client::{ConnectionConfigArgs, wal_stream_connection_config};

use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError};
use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf};
use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::{
    WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
};
use crate::task_mgr::TaskKind;
use crate::tenant::{Timeline, debug_assert_current_span_has_tenant_and_timeline_id};

/// Marker error returned by the connection manager functions in this file when the
/// cancellation token fires or the timeline stops being active.
pub(crate) struct Cancelled;

/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
/// Based on the updates, decides whether to start, keep or stop a WAL receiver task.
/// If storage broker subscription is cancelled, exits.
///
/// # Cancel-Safety
///
/// Not cancellation-safe. Use `cancel` token to request cancellation.
///
/// # Returns
///
/// - `Err(Cancelled)` when `cancel` fires, when the timeline fails to become active,
///   or when it later enters `Broken`/`Stopping` (or its state sender is dropped).
/// - `Ok(())` when the broker subscription errors or ends, or when the timeline is
///   observed back in `Loading`.
///
/// NOTE(review): `RwLock>` and `Option` below appear to have lost their type
/// arguments when this listing was extracted — confirm against the repository.
pub(super) async fn connection_manager_loop_step(
    broker_client: &mut BrokerClientChannel,
    connection_manager_state: &mut ConnectionManagerState,
    ctx: &RequestContext,
    cancel: &CancellationToken,
    manager_status: &std::sync::RwLock>,
) -> Result<(), Cancelled> {
    // Wait for the timeline to become active, unless cancelled first.
    match tokio::select! {
        _ = cancel.cancelled() => { return Err(Cancelled); },
        st = connection_manager_state.timeline.wait_to_become_active(ctx) => { st }
    } {
        Ok(()) => {}
        Err(new_state) => {
            debug!(
                ?new_state,
                "state changed, stopping wal connection manager loop"
            );
            return Err(Cancelled);
        }
    }

    // The gauge is decremented again by the deferred block when this function exits.
    WALRECEIVER_ACTIVE_MANAGERS.inc();
    scopeguard::defer! {
        WALRECEIVER_ACTIVE_MANAGERS.dec();
    }

    let id = TenantTimelineId {
        tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id,
        timeline_id: connection_manager_state.timeline.timeline_id,
    };

    let mut timeline_state_updates = connection_manager_state
        .timeline
        .subscribe_for_state_updates();

    let mut wait_lsn_status = connection_manager_state
        .timeline
        .subscribe_for_wait_lsn_updates();

    // TODO: create a separate config option for discovery request interval
    let discovery_request_interval = connection_manager_state.conf.lagging_wal_timeout;
    let mut last_discovery_ts: Option = None;

    // Subscribe to the broker updates. Stream shares underlying TCP connection
    // with other streams on this client (other connection managers). When
    // object goes out of scope, stream finishes in drop() automatically.
    let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
    // Timer behind the "subscription looks wedged" check further down; it is reset on
    // every broker message.
    let mut broker_reset_interval = tokio::time::interval(tokio::time::Duration::from_secs(30));
    debug!("Subscribed for broker timeline updates");

    loop {
        let time_until_next_retry = connection_manager_state.time_until_next_retry();
        let any_activity = connection_manager_state.wal_connection.is_some()
            || !connection_manager_state.wal_stream_candidates.is_empty();

        // These things are happening concurrently:
        //
        //  - cancellation request
        //  - keep receiving WAL on the current connection
        //      - if the shared state says we need to change connection, disconnect and return
        //      - this runs in a separate task and we receive updates via a watch channel
        //  - change connection if the rules decide so, or if the current connection dies
        //  - receive updates from broker
        //      - this might change the current desired connection
        //  - timeline state changes to something that does not allow walreceiver to run concurrently
        //  - if there's no connection and no candidates, try to send a discovery request

        // NB: make sure each of the select expressions are cancellation-safe
        // (no need for arms to be cancellation-safe).
        tokio::select! {
            _ = cancel.cancelled() => { return Err(Cancelled); }
            // Event from the current WAL connection task; yields `None` (pattern does not
            // match) when there is no connection.
            Some(wal_connection_update) = async {
                match connection_manager_state.wal_connection.as_mut() {
                    Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
                    None => None,
                }
            } => {
                let wal_connection = connection_manager_state.wal_connection.as_mut()
                    .expect("Should have a connection, as checked by the corresponding select! guard");
                match wal_connection_update {
                    TaskEvent::Update(TaskStateUpdate::Started) => {},
                    TaskEvent::Update(TaskStateUpdate::Progress(new_status)) => {
                        if new_status.has_processed_wal {
                            // We have advanced last_record_lsn by processing the WAL received
                            // from this safekeeper. This is good enough to clean unsuccessful
                            // retries history and allow reconnecting to this safekeeper without
                            // sleeping for a long time.
                            connection_manager_state.wal_connection_retries.remove(&wal_connection.sk_id);
                        }
                        wal_connection.status = new_status;
                    }
                    TaskEvent::End(walreceiver_task_result) => {
                        match walreceiver_task_result {
                            Ok(()) => debug!("WAL receiving task finished"),
                            Err(e) => error!("wal receiver task finished with an error: {e:?}"),
                        }
                        // The task has already ended, so no shutdown is requested.
                        connection_manager_state.drop_old_connection(false).await;
                    },
                }
            },

            // Got a new update from the broker
            broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => {
                match broker_update {
                    Ok(Some(broker_update)) => {
                        broker_reset_interval.reset();
                        connection_manager_state.register_timeline_update(broker_update);
                    },
                    Err(status) => {
                        match status.code() {
                            Code::Unknown if status.message().contains("stream closed because of a broken pipe") ||
                            status.message().contains("connection reset") ||
                            status.message().contains("error reading a body from connection") => {
                                // tonic's error handling doesn't provide a clear code for disconnections: we get
                                // "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
                                // => https://github.com/neondatabase/neon/issues/9562
                                info!("broker disconnected: {status}");
                            },
                            _ => {
                                warn!("broker subscription failed: {status}");
                            }
                        }
                        return Ok(());
                    }
                    Ok(None) => {
                        error!("broker subscription stream ended"); // can't happen
                        return Ok(());
                    }
                }
            },

            // If we've not received any updates from the broker for a while, are waiting for WAL
            // and have no safekeeper connection or connection candidates, then it might be that
            // the broker subscription is wedged. Drop the current subscription and re-subscribe
            // with the goal of unblocking it.
            _ = broker_reset_interval.tick() => {
                let awaiting_lsn = wait_lsn_status.borrow().is_some();
                let no_candidates = connection_manager_state.wal_stream_candidates.is_empty();
                let no_connection = connection_manager_state.wal_connection.is_none();

                if awaiting_lsn && no_candidates && no_connection {
                    tracing::info!("No broker updates received for a while, but waiting for WAL. Re-setting stream ...");
                    broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
                }
            },

            // Timeline state changes: `Break` means stop with `Cancelled`, `Continue`
            // means return `Ok(())` from this step.
            new_event = async {
                // Reminder: this match arm needs to be cancellation-safe.
                loop {
                    if connection_manager_state.timeline.current_state() == TimelineState::Loading {
                        warn!("wal connection manager should only be launched after timeline has become active");
                    }
                    match timeline_state_updates.changed().await {
                        Ok(()) => {
                            let new_state = connection_manager_state.timeline.current_state();
                            match new_state {
                                // we're already active as walreceiver, no need to reactivate
                                TimelineState::Active => continue,
                                TimelineState::Broken { .. } | TimelineState::Stopping => {
                                    debug!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
                                    return ControlFlow::Break(());
                                }
                                TimelineState::Loading => {
                                    warn!("timeline transitioned back to Loading state, that should not happen");
                                    return ControlFlow::Continue(());
                                }
                            }
                        }
                        Err(_sender_dropped_error) => return ControlFlow::Break(()),
                    }
                }
            } => match new_event {
                ControlFlow::Continue(()) => {
                    return Ok(());
                }
                ControlFlow::Break(()) => {
                    debug!("Timeline is no longer active, stopping wal connection manager loop");
                    return Err(Cancelled);
                }
            },

            // Wake up when the earliest pending retry becomes due; the arm body only logs,
            // the candidate selection below the `select!` does the work.
            Some(()) = async {
                match time_until_next_retry {
                    Some(sleep_time) => {
                        tokio::time::sleep(sleep_time).await;
                        Some(())
                    },
                    None => {
                        debug!("No candidates to retry, waiting indefinitely for the broker events");
                        None
                    }
                }
            } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"),

            // Discovery request. Every path through this future yields `None`, so the
            // `Some(())` pattern never matches and the (empty) arm body never runs.
            Some(()) = async {
                // Reminder: this match arm needs to be cancellation-safe.
                // Calculating time needed to wait until sending the next discovery request.
                // Current implementation is conservative and sends discovery requests only when there are no candidates.

                if any_activity {
                    // No need to send discovery requests if there is an active connection or candidates.
                    return None;
                }

                // Waiting for an active wait_lsn request.
                while wait_lsn_status.borrow().is_none() {
                    if wait_lsn_status.changed().await.is_err() {
                        // wait_lsn_status channel was closed, exiting
                        warn!("wait_lsn_status channel was closed in connection_manager_loop_step");
                        return None;
                    }
                }

                // All preconditions met, preparing to send a discovery request.
                let now = std::time::Instant::now();
                let next_discovery_ts = last_discovery_ts
                    .map(|ts| ts + discovery_request_interval)
                    .unwrap_or_else(|| now);

                if next_discovery_ts > now {
                    // Prevent sending discovery requests too frequently.
                    tokio::time::sleep(next_discovery_ts - now).await;
                }

                let tenant_timeline_id = Some(ProtoTenantTimelineId {
                    tenant_id: id.tenant_id.as_ref().to_owned(),
                    timeline_id: id.timeline_id.as_ref().to_owned(),
                });
                let request = SafekeeperDiscoveryRequest { tenant_timeline_id };
                let msg = TypedMessage {
                    r#type: MessageType::SafekeeperDiscoveryRequest as i32,
                    safekeeper_timeline_info: None,
                    safekeeper_discovery_request: Some(request),
                    safekeeper_discovery_response: None,
                };

                last_discovery_ts = Some(std::time::Instant::now());
                info!("No active connection and no candidates, sending discovery request to the broker");

                // Cancellation safety: we want to send a message to the broker, but publish_one()
                // function can get cancelled by the other select! arm. This is absolutely fine, because
                // we just want to receive broker updates and discovery is not important if we already
                // receive updates.
                //
                // It is possible that `last_discovery_ts` will be updated, but the message will not be sent.
                // This is totally fine because of the reason above.

                // This is a fire-and-forget request, we don't care about the response
                let _ = broker_client.publish_one(msg).await;
                debug!("Discovery request sent to the broker");
                None
            } => {}
        }

        // After any event, re-evaluate which safekeeper to stream from.
        if let Some(new_candidate) = connection_manager_state.next_connection_candidate() {
            info!("Switching to new connection candidate: {new_candidate:?}");
            connection_manager_state
                .change_connection(new_candidate, ctx)
                .await
        }
        // Publish a fresh status snapshot for readers of `manager_status`.
        *manager_status.write().unwrap() = Some(connection_manager_state.manager_status());
    }
}

/// Endlessly try to subscribe for broker updates for a given timeline.
///
/// Each attempt is preceded by `exponential_backoff` on the attempt counter. The
/// subscription asks for `SafekeeperTimelineInfo` and `SafekeeperDiscoveryResponse`
/// messages filtered to `id`. Returns `Err(Cancelled)` if `cancel` fires while the
/// subscribe request is in flight.
///
/// NOTE(review): the return type `Result, Cancelled>` appears to have lost a type
/// argument when this listing was extracted — confirm against the repository.
async fn subscribe_for_timeline_updates(
    broker_client: &mut BrokerClientChannel,
    id: TenantTimelineId,
    cancel: &CancellationToken,
) -> Result, Cancelled> {
    let mut attempt = 0;
    loop {
        exponential_backoff(
            attempt,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
            cancel,
        )
        .await;
        attempt += 1;

        // subscribe to the specific timeline
        let request = SubscribeByFilterRequest {
            types: vec![
                TypeSubscription {
                    r#type: MessageType::SafekeeperTimelineInfo as i32,
                },
                TypeSubscription {
                    r#type: MessageType::SafekeeperDiscoveryResponse as i32,
                },
            ],
            tenant_timeline_id: Some(FilterTenantTimelineId {
                enabled: true,
                tenant_timeline_id: Some(ProtoTenantTimelineId {
                    tenant_id: id.tenant_id.as_ref().to_owned(),
                    timeline_id: id.timeline_id.as_ref().to_owned(),
                }),
            }),
        };

        match {
            tokio::select! {
                r = broker_client.subscribe_by_filter(request) => { r }
                _ = cancel.cancelled() => { return Err(Cancelled); }
            }
        } {
            Ok(resp) => {
                return Ok(resp.into_inner());
            }
            Err(e) => {
                // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
                // entire WAL is streamed. Keep this noticeable with logging, but do not warn/error.
                info!(
                    "Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"
                );
                continue;
            }
        }
    }
}

// Backoff for reconnecting to the same safekeeper: the delay starts at the minimum,
// is multiplied after every dropped connection and is capped at the maximum
// (see `drop_old_connection`).
const WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS: f64 = 0.1;
const WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS: f64 = 15.0;
const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5;

/// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible.
///
/// NOTE(review): the `Arc`, `Option` and `HashMap` field types below appear to have
/// lost their type arguments when this listing was extracted — confirm against the
/// repository.
pub(super) struct ConnectionManagerState {
    id: TenantTimelineId,
    /// Use pageserver data about the timeline to filter out some of the safekeepers.
    timeline: Arc,
    /// Child token of [`super::WalReceiver::cancel`], inherited to all tasks we spawn.
    cancel: CancellationToken,
    conf: WalReceiverConf,
    /// Current connection to safekeeper for WAL streaming.
    wal_connection: Option,
    /// Info about retries and unsuccessful attempts to connect to safekeepers.
    wal_connection_retries: HashMap,
    /// Data about all timelines, available for connection, fetched from storage broker, grouped by their corresponding safekeeper node id.
    wal_stream_candidates: HashMap,
}

/// An information about connection manager's current connection and connection candidates.
///
/// NOTE(review): the `Option` and `HashMap` field types appear to have lost their
/// type arguments when this listing was extracted — confirm against the repository.
#[derive(Debug, Clone)]
pub struct ConnectionManagerStatus {
    existing_connection: Option,
    wal_stream_candidates: HashMap,
}

impl ConnectionManagerStatus {
    /// Generates a string, describing current connection status in a form, suitable for logging.
///
/// The result is a connection description (`": disconnected"` when there is no
/// connection) followed by a bracketed, comma-separated list of safekeeper
/// candidates, each rendered as `(id|update_time|commit_lsn)`.
pub fn to_human_readable_string(&self) -> String {
    // First half: what the current connection, if any, is doing.
    let connection_part = match &self.existing_connection {
        None => String::from(": disconnected"),
        Some(connection) if connection.has_processed_wal => {
            let header = format!(
                " (update {}): streaming WAL from node {}, ",
                connection.latest_wal_update.format("%Y-%m-%d %H:%M:%S"),
                connection.node,
            );
            let lsns = match (connection.streaming_lsn, connection.commit_lsn) {
                (None, None) => String::from("no streaming data"),
                (None, Some(commit_lsn)) => format!("commit Lsn: {commit_lsn}"),
                (Some(streaming_lsn), None) => format!("streaming Lsn: {streaming_lsn}"),
                (Some(streaming_lsn), Some(commit_lsn)) => {
                    format!("commit|streaming Lsn: {commit_lsn}|{streaming_lsn}")
                }
            };
            header + &lsns
        }
        Some(connection) if connection.is_connected => format!(
            " (update {}): connecting to node {}",
            connection
                .latest_connection_update
                .format("%Y-%m-%d %H:%M:%S"),
            connection.node,
        ),
        Some(connection) => format!(
            " (update {}): initializing node {} connection",
            connection
                .latest_connection_update
                .format("%Y-%m-%d %H:%M:%S"),
            connection.node,
        ),
    };

    // Second half: one entry per known candidate, in map iteration order.
    let candidate_entries: Vec<String> = self
        .wal_stream_candidates
        .iter()
        .map(|(node_id, candidate_info)| {
            format!(
                "({}|{}|{})",
                node_id,
                candidate_info.latest_update.format("%H:%M:%S"),
                Lsn(candidate_info.timeline.commit_lsn)
            )
        })
        .collect();

    let mut description = connection_part;
    description.push_str(", safekeeper candidates (id|update_time|commit_lsn): [");
    description.push_str(&candidate_entries.join(", "));
    description.push(']');
    description
}
} // closes `impl ConnectionManagerStatus`, opened above this block

/// State kept for the one WAL streaming connection that is currently open.
///
/// NOTE(review): the `Option` and `TaskHandle` field types appear to have lost
/// their type arguments when this listing was extracted — confirm against the
/// repository.
#[derive(Debug)]
struct WalConnection {
    /// When this connection attempt was started.
    started_at: NaiveDateTime,
    /// Id of the safekeeper node this connection streams WAL from.
    sk_id: NodeId,
    /// Availability zone reported for that safekeeper.
    availability_zone: Option,
    /// Latest status reported by the connection task.
    status: WalConnectionStatus,
    /// Handle of the task that streams the WAL.
    connection_task: TaskHandle,
    /// Set once another safekeeper is seen to hold newer WAL than this one.
    discovered_new_wal: Option,
}

/// Record of newer committed WAL observed on a different safekeeper.
#[derive(Debug, Clone, Copy)]
struct NewCommittedWAL {
    /// LSN of that newer committed WAL.
    lsn: Lsn,
    /// Moment at which the newer WAL was noticed.
    discovered_at: NaiveDateTime,
}

/// Per-safekeeper reconnect bookkeeping: the earliest time of the next attempt and
/// the backoff delay currently in effect, in seconds.
///
/// NOTE(review): `next_retry_at: Option` appears to have lost its type argument when
/// this listing was extracted — confirm against the repository.
#[derive(Debug, Clone, Copy)]
struct RetryInfo {
    next_retry_at: Option,
    retry_duration_seconds: f64,
}

/// Broker-provided description of a timeline on one safekeeper.
#[derive(Debug, Clone)]
struct BrokerSkTimeline {
    timeline: SafekeeperDiscoveryResponse,
    /// When this entry was last refreshed from the broker; used to spot stale data.
    latest_update: NaiveDateTime,
}

impl ConnectionManagerState {
    /// Creates the manager state for `timeline` with no connection, no candidates
    /// and no retry history.
    pub(super) fn new(
        timeline: Arc,
        conf: WalReceiverConf,
        cancel: CancellationToken,
    ) -> Self {
        Self {
            // Evaluated before `timeline` is moved into the struct below.
            id: TenantTimelineId {
                tenant_id: timeline.tenant_shard_id.tenant_id,
                timeline_id: timeline.timeline_id,
            },
            timeline,
            cancel,
            conf,
            wal_connection: None,
            wal_stream_candidates: HashMap::new(),
            wal_connection_retries: HashMap::new(),
        }
    }

    /// Spawns `task` through `TaskHandle::spawn`, passing this manager's
    /// cancellation token.
    ///
    /// NOTE(review): the generic parameter list and several type arguments appear
    /// to have been stripped from this signature — confirm against the repository.
    fn spawn(
        &self,
        task: impl FnOnce(
            tokio::sync::watch::Sender>,
            CancellationToken,
        ) -> Fut
        + Send
        + 'static,
    ) -> TaskHandle
    where
        Fut: std::future::Future> + Send,
    {
        // TODO: get rid of TaskHandle
        let parent_token = &self.cancel;
        super::TaskHandle::spawn(parent_token, task)
    }

    /// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) { WALRECEIVER_SWITCHES .with_label_values(&[new_sk.reason.name()]) .inc(); self.drop_old_connection(true).await; let node_id = new_sk.safekeeper_id; let connect_timeout = self.conf.wal_connect_timeout; let ingest_batch_size = self.conf.ingest_batch_size; let protocol = self.conf.protocol; let validate_wal_contiguity = self.conf.validate_wal_contiguity; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( TaskKind::WalReceiverConnectionHandler, DownloadBehavior::Download, ); let span = info_span!("connection", %node_id); let connection_handle = self.spawn(move |events_sender, cancellation| { async move { debug_assert_current_span_has_tenant_and_timeline_id(); let res = super::walreceiver_connection::handle_walreceiver_connection( timeline, protocol, new_sk.wal_source_connconf, events_sender, cancellation.clone(), connect_timeout, ctx, node_id, ingest_batch_size, validate_wal_contiguity, ) .await; match res { Ok(()) => Ok(()), Err(e) => { match e { WalReceiverError::SuccessfulCompletion(msg) => { info!("walreceiver connection handling ended with success: {msg}"); Ok(()) } WalReceiverError::ExpectedSafekeeperError(e) => { info!("walreceiver connection handling ended: {e}"); Ok(()) } WalReceiverError::ClosedGate => { info!( "walreceiver connection handling ended because of closed gate" ); Ok(()) } WalReceiverError::Cancelled => Ok(()), WalReceiverError::Other(e) => { // give out an error to have task_mgr give it a really verbose logging if cancellation.is_cancelled() { // Ideally we would learn about this via some path other than Other, but // that requires refactoring all the intermediate layers of ingest code // that only emit anyhow::Error Ok(()) } else { Err(e).context("walreceiver connection handling failure") } } } } } } .instrument(span) }); let now = Utc::now().naive_utc(); self.wal_connection = Some(WalConnection { started_at: now, sk_id: 
new_sk.safekeeper_id, availability_zone: new_sk.availability_zone, status: WalConnectionStatus { is_connected: false, has_processed_wal: false, latest_connection_update: now, latest_wal_update: now, streaming_lsn: None, commit_lsn: None, node: node_id, }, connection_task: connection_handle, discovered_new_wal: None, }); } /// Drops the current connection (if any) and updates retry timeout for the next /// connection attempt to the same safekeeper. /// /// # Cancel-Safety /// /// Not cancellation-safe. async fn drop_old_connection(&mut self, needs_shutdown: bool) { let wal_connection = match self.wal_connection.take() { Some(wal_connection) => wal_connection, None => return, }; if needs_shutdown { wal_connection .connection_task .shutdown() // This here is why this function isn't cancellation-safe. // If we got cancelled here, then self.wal_connection is already None and we lose track of the task. // Even if our caller diligently calls Self::shutdown(), it will find a self.wal_connection=None // and thus be ineffective. .await; } let retry = self .wal_connection_retries .entry(wal_connection.sk_id) .or_insert(RetryInfo { next_retry_at: None, retry_duration_seconds: WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS, }); let now = Utc::now().naive_utc(); // Schedule the next retry attempt. We want to have exponential backoff for connection attempts, // and we add backoff to the time when we started the connection attempt. If the connection // was active for a long time, then next_retry_at will be in the past. retry.next_retry_at = wal_connection .started_at .checked_add_signed(chrono::Duration::milliseconds( (retry.retry_duration_seconds * 1000.0) as i64, )); if let Some(next) = &retry.next_retry_at { if next > &now { info!( "Next connection retry to {:?} is at {}", wal_connection.sk_id, next ); } } let next_retry_duration = retry.retry_duration_seconds * WALCONNECTION_RETRY_BACKOFF_MULTIPLIER; // Clamp the next retry duration to the maximum allowed. 
let next_retry_duration = next_retry_duration.min(WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS); // Clamp the next retry duration to the minimum allowed. let next_retry_duration = next_retry_duration.max(WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS); retry.retry_duration_seconds = next_retry_duration; } /// Returns time needed to wait to have a new candidate for WAL streaming. fn time_until_next_retry(&self) -> Option { let now = Utc::now().naive_utc(); let next_retry_at = self .wal_connection_retries .values() .filter_map(|retry| retry.next_retry_at) .filter(|next_retry_at| next_retry_at > &now) .min()?; (next_retry_at - now).to_std().ok() } /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key. fn register_timeline_update(&mut self, typed_msg: TypedMessage) { let mut is_discovery = false; let timeline_update = match typed_msg.r#type() { MessageType::SafekeeperTimelineInfo => { let info = match typed_msg.safekeeper_timeline_info { Some(info) => info, None => { warn!("bad proto message from broker: no safekeeper_timeline_info"); return; } }; SafekeeperDiscoveryResponse { safekeeper_id: info.safekeeper_id, tenant_timeline_id: info.tenant_timeline_id, commit_lsn: info.commit_lsn, safekeeper_connstr: info.safekeeper_connstr, availability_zone: info.availability_zone, standby_horizon: info.standby_horizon, } } MessageType::SafekeeperDiscoveryResponse => { is_discovery = true; match typed_msg.safekeeper_discovery_response { Some(response) => response, None => { warn!("bad proto message from broker: no safekeeper_discovery_response"); return; } } } _ => { // unexpected message return; } }; WALRECEIVER_BROKER_UPDATES.inc(); trace!( "safekeeper info update: standby_horizon(cutoff)={}", timeline_update.standby_horizon ); if timeline_update.standby_horizon != 0 { // ignore reports from safekeepers not connected to replicas self.timeline .standby_horizon .store(Lsn(timeline_update.standby_horizon)); self.timeline .metrics 
.standby_horizon_gauge .set(timeline_update.standby_horizon as i64); } let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); let old_entry = self.wal_stream_candidates.insert( new_safekeeper_id, BrokerSkTimeline { timeline: timeline_update, latest_update: Utc::now().naive_utc(), }, ); if old_entry.is_none() { info!( ?is_discovery, %new_safekeeper_id, "New SK node was added", ); WALRECEIVER_CANDIDATES_ADDED.inc(); } } /// Cleans up stale broker records and checks the rest for the new connection candidate. /// Returns a new candidate, if the current state is absent or somewhat lagging, `None` otherwise. /// The current rules for approving new candidates: /// * pick a candidate different from the connected safekeeper with biggest `commit_lsn` and lowest failed connection attemps /// * if there's no such entry, no new candidate found, abort /// * otherwise check if the candidate is much better than the current one /// /// To understand exact rules for determining if the candidate is better than the current one, refer to this function's implementation. /// General rules are following: /// * if connected safekeeper is not present, pick the candidate /// * if we haven't received any updates for some time, pick the candidate /// * if the candidate commit_lsn is much higher than the current one, pick the candidate /// * if the candidate commit_lsn is same, but candidate is located in the same AZ as the pageserver, pick the candidate /// * if connected safekeeper stopped sending us new WAL which is available on other safekeeper, pick the candidate /// /// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently. /// Both thresholds are configured per tenant. 
fn next_connection_candidate(&mut self) -> Option { self.cleanup_old_candidates(); match &self.wal_connection { Some(existing_wal_connection) => { let connected_sk_node = existing_wal_connection.sk_id; let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) = self.select_connection_candidate(Some(connected_sk_node))?; let new_availability_zone = new_safekeeper_broker_data.availability_zone.clone(); let now = Utc::now().naive_utc(); if let Ok(latest_interaciton) = (now - existing_wal_connection.status.latest_connection_update).to_std() { // Drop connection if we haven't received keepalive message for a while. if latest_interaciton > self.conf.wal_connect_timeout { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connconf: new_wal_source_connconf, availability_zone: new_availability_zone, reason: ReconnectReason::NoKeepAlives { last_keep_alive: Some( existing_wal_connection.status.latest_connection_update, ), check_time: now, threshold: self.conf.wal_connect_timeout, }, }); } } if !existing_wal_connection.status.is_connected { // We haven't connected yet and we shouldn't switch until connection timeout (condition above). return None; } if let Some(current_commit_lsn) = existing_wal_connection.status.commit_lsn { let new_commit_lsn = Lsn(new_safekeeper_broker_data.commit_lsn); // Check if the new candidate has much more WAL than the current one. match new_commit_lsn.0.checked_sub(current_commit_lsn.0) { Some(new_sk_lsn_advantage) => { if new_sk_lsn_advantage >= self.conf.max_lsn_wal_lag.get() { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connconf: new_wal_source_connconf, availability_zone: new_availability_zone, reason: ReconnectReason::LaggingWal { current_commit_lsn, new_commit_lsn, threshold: self.conf.max_lsn_wal_lag, }, }); } // If we have a candidate with the same commit_lsn as the current one, which is in the same AZ as pageserver, // and the current one is not, switch to the new one. 
if self.conf.availability_zone.is_some() && existing_wal_connection.availability_zone != self.conf.availability_zone && self.conf.availability_zone == new_availability_zone { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, availability_zone: new_availability_zone, wal_source_connconf: new_wal_source_connconf, reason: ReconnectReason::SwitchAvailabilityZone, }); } } None => debug!( "Best SK candidate has its commit_lsn behind connected SK's commit_lsn" ), } } let current_lsn = match existing_wal_connection.status.streaming_lsn { Some(lsn) => lsn, None => self.timeline.get_last_record_lsn(), }; let current_commit_lsn = existing_wal_connection .status .commit_lsn .unwrap_or(current_lsn); let candidate_commit_lsn = Lsn(new_safekeeper_broker_data.commit_lsn); // Keep discovered_new_wal only if connected safekeeper has not caught up yet. let mut discovered_new_wal = existing_wal_connection .discovered_new_wal .filter(|new_wal| new_wal.lsn > current_commit_lsn); if discovered_new_wal.is_none() { // Check if the new candidate has more WAL than the current one. // If the new candidate has more WAL than the current one, we consider switching to the new candidate. discovered_new_wal = if candidate_commit_lsn > current_commit_lsn { trace!( "New candidate has commit_lsn {}, higher than current_commit_lsn {}", candidate_commit_lsn, current_commit_lsn ); Some(NewCommittedWAL { lsn: candidate_commit_lsn, discovered_at: Utc::now().naive_utc(), }) } else { None }; } let waiting_for_new_lsn_since = if current_lsn < current_commit_lsn { // Connected safekeeper has more WAL, but we haven't received updates for some time. trace!( "Connected safekeeper has more WAL, but we haven't received updates for {:?}. 
current_lsn: {}, current_commit_lsn: {}", (now - existing_wal_connection.status.latest_wal_update).to_std(), current_lsn, current_commit_lsn ); Some(existing_wal_connection.status.latest_wal_update) } else { discovered_new_wal.as_ref().map(|new_wal| { // We know that new WAL is available on other safekeeper, but connected safekeeper don't have it. new_wal .discovered_at .max(existing_wal_connection.status.latest_wal_update) }) }; // If we haven't received any WAL updates for a while and candidate has more WAL, switch to it. if let Some(waiting_for_new_lsn_since) = waiting_for_new_lsn_since { if let Ok(waiting_for_new_wal) = (now - waiting_for_new_lsn_since).to_std() { if candidate_commit_lsn > current_commit_lsn && waiting_for_new_wal > self.conf.lagging_wal_timeout { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connconf: new_wal_source_connconf, availability_zone: new_availability_zone, reason: ReconnectReason::NoWalTimeout { current_lsn, current_commit_lsn, candidate_commit_lsn, last_wal_interaction: Some( existing_wal_connection.status.latest_wal_update, ), check_time: now, threshold: self.conf.lagging_wal_timeout, }, }); } } } self.wal_connection.as_mut().unwrap().discovered_new_wal = discovered_new_wal; } None => { let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) = self.select_connection_candidate(None)?; return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, availability_zone: new_safekeeper_broker_data.availability_zone.clone(), wal_source_connconf: new_wal_source_connconf, reason: ReconnectReason::NoExistingConnection, }); } } None } /// Selects the best possible candidate, based on the data collected from the broker updates about the safekeepers. /// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another. 
/// /// The candidate that is chosen: /// * has no pending retry cooldown /// * has greatest commit_lsn among the ones that are left fn select_connection_candidate( &self, node_to_omit: Option, ) -> Option<(NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> { self.applicable_connection_candidates() .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit) .max_by_key(|(_, info, _)| info.commit_lsn) } /// Returns a list of safekeepers that have valid info and ready for connection. /// Some safekeepers are filtered by the retry cooldown. fn applicable_connection_candidates( &self, ) -> impl Iterator { let now = Utc::now().naive_utc(); self.wal_stream_candidates .iter() .filter(|(_, info)| Lsn(info.timeline.commit_lsn) != Lsn::INVALID) .filter(move |(sk_id, _)| { let next_retry_at = self .wal_connection_retries .get(sk_id) .and_then(|retry_info| { retry_info.next_retry_at }); next_retry_at.is_none() || next_retry_at.unwrap() <= now }).filter_map(|(sk_id, broker_info)| { let info = &broker_info.timeline; if info.safekeeper_connstr.is_empty() { return None; // no connection string, ignore sk } let shard_identity = self.timeline.get_shard_identity(); let (shard_number, shard_count, shard_stripe_size) = ( Some(shard_identity.number.0), Some(shard_identity.count.0), Some(shard_identity.stripe_size.0), ); let connection_conf_args = ConnectionConfigArgs { protocol: self.conf.protocol, ttid: self.id, shard_number, shard_count, shard_stripe_size, listen_pg_addr_str: info.safekeeper_connstr.as_ref(), auth_token: self.conf.auth_token.as_ref().map(|t| t.as_str()), availability_zone: self.conf.availability_zone.as_deref() }; match wal_stream_connection_config(connection_conf_args) { Ok(connstr) => Some((*sk_id, info, connstr)), Err(e) => { error!("Failed to create wal receiver connection string from broker data of safekeeper node {}: {e:#}", sk_id); None } } }) } /// Remove candidates which haven't sent broker updates for a while. 
fn cleanup_old_candidates(&mut self) { let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len()); let lagging_wal_timeout = self.conf.lagging_wal_timeout; self.wal_stream_candidates.retain(|node_id, broker_info| { if let Ok(time_since_latest_broker_update) = (Utc::now().naive_utc() - broker_info.latest_update).to_std() { let should_retain = time_since_latest_broker_update < lagging_wal_timeout; if !should_retain { node_ids_to_remove.push(*node_id); } should_retain } else { true } }); if !node_ids_to_remove.is_empty() { for node_id in node_ids_to_remove { info!( "Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections" ); self.wal_connection_retries.remove(&node_id); WALRECEIVER_CANDIDATES_REMOVED.inc(); } } } /// # Cancel-Safety /// /// Not cancellation-safe. pub(super) async fn shutdown(mut self) { if let Some(wal_connection) = self.wal_connection.take() { wal_connection.connection_task.shutdown().await; } } fn manager_status(&self) -> ConnectionManagerStatus { ConnectionManagerStatus { existing_connection: self.wal_connection.as_ref().map(|conn| conn.status), wal_stream_candidates: self.wal_stream_candidates.clone(), } } } #[derive(Debug)] struct NewWalConnectionCandidate { safekeeper_id: NodeId, wal_source_connconf: PgConnectionConfig, availability_zone: Option, reason: ReconnectReason, } /// Stores the reason why WAL connection was switched, for furter debugging purposes. 
#[derive(Debug, PartialEq, Eq)]
enum ReconnectReason {
    // There was no connection to replace.
    NoExistingConnection,
    // The candidate's commit_lsn is ahead of the connected safekeeper's by at least `threshold`.
    LaggingWal {
        current_commit_lsn: Lsn,
        new_commit_lsn: Lsn,
        threshold: NonZeroU64,
    },
    // The candidate is in the pageserver's availability zone while the connected safekeeper is not.
    SwitchAvailabilityZone,
    // The candidate has more WAL and no WAL progress was seen for longer than `threshold`.
    NoWalTimeout {
        current_lsn: Lsn,
        current_commit_lsn: Lsn,
        candidate_commit_lsn: Lsn,
        last_wal_interaction: Option<NaiveDateTime>,
        check_time: NaiveDateTime,
        threshold: Duration,
    },
    // No connection update was seen for longer than `threshold`.
    NoKeepAlives {
        last_keep_alive: Option<NaiveDateTime>,
        check_time: NaiveDateTime,
        threshold: Duration,
    },
}

impl ReconnectReason {
    /// Returns the variant name; used as the label value of the connection switch metric.
    fn name(&self) -> &'static str {
        match self {
            ReconnectReason::NoExistingConnection => "NoExistingConnection",
            ReconnectReason::LaggingWal { .. } => "LaggingWal",
            ReconnectReason::SwitchAvailabilityZone => "SwitchAvailabilityZone",
            ReconnectReason::NoWalTimeout { .. } => "NoWalTimeout",
            ReconnectReason::NoKeepAlives { .. } => "NoKeepAlives",
        }
    }
}

#[cfg(test)]
mod tests {
    use url::Host;
    use utils::postgres_client::PostgresClientProtocol;

    use super::*;
    use crate::tenant::harness::{TIMELINE_ID, TenantHarness};

    /// Builds a broker record with the given commit_lsn, connection string and update time;
    /// all remaining fields are zero / empty.
    fn dummy_broker_sk_timeline(
        commit_lsn: u64,
        safekeeper_connstr: &str,
        latest_update: NaiveDateTime,
    ) -> BrokerSkTimeline {
        BrokerSkTimeline {
            timeline: SafekeeperDiscoveryResponse {
                safekeeper_id: 0,
                tenant_timeline_id: None,
                commit_lsn,
                safekeeper_connstr: safekeeper_connstr.to_owned(),
                availability_zone: None,
                standby_horizon: 0,
            },
            latest_update,
        }
    }

    #[tokio::test]
    async fn no_connection_no_candidate() -> anyhow::Result<()> {
        let harness = TenantHarness::create("no_connection_no_candidate").await?;
        let mut state = dummy_state(&harness).await;
        let now = Utc::now().naive_utc();

        let lagging_wal_timeout = chrono::Duration::from_std(state.conf.lagging_wal_timeout)?;
        let delay_over_threshold = now - lagging_wal_timeout - lagging_wal_timeout;

        state.wal_connection = None;
        state.wal_stream_candidates = HashMap::from([
            (NodeId(0), dummy_broker_sk_timeline(1, "", now)),
            (NodeId(1), dummy_broker_sk_timeline(0, "no_commit_lsn", now)),
            (NodeId(2), dummy_broker_sk_timeline(0, "no_commit_lsn", now)),
            (
                NodeId(3),
                dummy_broker_sk_timeline(
                    1 + state.conf.max_lsn_wal_lag.get(),
                    "delay_over_threshold",
                    delay_over_threshold,
                ),
            ),
        ]);

        let no_candidate = state.next_connection_candidate();
        assert!(
            no_candidate.is_none(),
            "Expected no candidate selected out of non full data options, but got {no_candidate:?}"
        );

        Ok(())
    }

    #[tokio::test]
    async fn connection_no_candidate() -> anyhow::Result<()> {
        let harness = TenantHarness::create("connection_no_candidate").await?;
        let mut state = dummy_state(&harness).await;
        let now = Utc::now().naive_utc();

        let connected_sk_id = NodeId(0);
        let current_lsn = 100_000;

        let connection_status = WalConnectionStatus {
            is_connected: true,
            has_processed_wal: true,
            latest_connection_update: now,
            latest_wal_update: now,
            commit_lsn: Some(Lsn(current_lsn)),
            streaming_lsn: Some(Lsn(current_lsn)),
            // Keep the status consistent with the safekeeper the connection points to.
            node: connected_sk_id,
        };

        state.conf.max_lsn_wal_lag = NonZeroU64::new(100).unwrap();
        state.wal_connection = Some(WalConnection {
            started_at: now,
            sk_id: connected_sk_id,
            availability_zone: None,
            status: connection_status,
            connection_task: state.spawn(move |sender, _| async move {
                sender
                    .send(TaskStateUpdate::Progress(connection_status))
                    .ok();
                Ok(())
            }),
            discovered_new_wal: None,
        });
        state.wal_stream_candidates = HashMap::from([
            (
                connected_sk_id,
                dummy_broker_sk_timeline(
                    current_lsn + state.conf.max_lsn_wal_lag.get() * 2,
                    DUMMY_SAFEKEEPER_HOST,
                    now,
                ),
            ),
            (
                NodeId(1),
                dummy_broker_sk_timeline(current_lsn, "not_advanced_lsn", now),
            ),
            (
                NodeId(2),
                dummy_broker_sk_timeline(
                    current_lsn + state.conf.max_lsn_wal_lag.get() / 2,
                    "not_enough_advanced_lsn",
                    now,
                ),
            ),
        ]);

        let no_candidate = state.next_connection_candidate();
        assert!(
            no_candidate.is_none(),
            "Expected no candidate selected out of valid options since candidate Lsn data is ignored and others' was not advanced enough, but got {no_candidate:?}"
        );

        Ok(())
    }

    #[tokio::test]
    async fn no_connection_candidate() -> anyhow::Result<()> {
        let harness = TenantHarness::create("no_connection_candidate").await?;
        let mut state = dummy_state(&harness).await;
        let now = Utc::now().naive_utc();

        state.wal_connection = None;
        state.wal_stream_candidates = HashMap::from([(
            NodeId(0),
            dummy_broker_sk_timeline(
                1 + state.conf.max_lsn_wal_lag.get(),
                DUMMY_SAFEKEEPER_HOST,
                now,
            ),
        )]);

        let only_candidate = state
            .next_connection_candidate()
            .expect("Expected one candidate selected out of the only data option, but got none");
        assert_eq!(only_candidate.safekeeper_id, NodeId(0));
        assert_eq!(
            only_candidate.reason,
            ReconnectReason::NoExistingConnection,
            "Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold"
        );
        assert_eq!(
            only_candidate.wal_source_connconf.host(),
            &Host::Domain(DUMMY_SAFEKEEPER_HOST.to_owned())
        );

        let selected_lsn = 100_000;
        state.wal_stream_candidates = HashMap::from([
            (
                NodeId(0),
                dummy_broker_sk_timeline(selected_lsn - 100, "smaller_commit_lsn", now),
            ),
            (
                NodeId(1),
                dummy_broker_sk_timeline(selected_lsn, DUMMY_SAFEKEEPER_HOST, now),
            ),
            (
                NodeId(2),
                dummy_broker_sk_timeline(selected_lsn + 100, "", now),
            ),
        ]);
        let biggest_wal_candidate = state.next_connection_candidate().expect(
            "Expected one candidate selected out of multiple valid data options, but got none",
        );

        assert_eq!(biggest_wal_candidate.safekeeper_id, NodeId(1));
        assert_eq!(
            biggest_wal_candidate.reason,
            ReconnectReason::NoExistingConnection,
            "Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold"
        );
        assert_eq!(
            biggest_wal_candidate.wal_source_connconf.host(),
            &Host::Domain(DUMMY_SAFEKEEPER_HOST.to_owned())
        );

        Ok(())
    }

    #[tokio::test]
    async fn candidate_with_many_connection_failures() -> anyhow::Result<()> {
        let harness = TenantHarness::create("candidate_with_many_connection_failures").await?;
        let mut state = dummy_state(&harness).await;
        let now = Utc::now().naive_utc();

        let current_lsn = Lsn(100_000).align();
        let bigger_lsn = Lsn(current_lsn.0 + 100).align();

        state.wal_connection = None;
        state.wal_stream_candidates = HashMap::from([
            (
                NodeId(0),
                dummy_broker_sk_timeline(bigger_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
            ),
            (
                NodeId(1),
                dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
            ),
        ]);
        state.wal_connection_retries = HashMap::from([(
            NodeId(0),
            RetryInfo {
                next_retry_at: now.checked_add_signed(chrono::Duration::hours(1)),
                retry_duration_seconds: WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS,
            },
        )]);

        let candidate_with_less_errors = state
            .next_connection_candidate()
            .expect("Expected one candidate selected, but got none");
        assert_eq!(
            candidate_with_less_errors.safekeeper_id,
            NodeId(1),
            "Should select the node with no pending retry cooldown"
        );

        Ok(())
    }

    #[tokio::test]
    async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
        let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate").await?;
        let mut state = dummy_state(&harness).await;
        let current_lsn = Lsn(100_000).align();
        let now = Utc::now().naive_utc();

        let connected_sk_id = NodeId(0);
        let new_lsn = Lsn(current_lsn.0 + state.conf.max_lsn_wal_lag.get() + 1);

        let connection_status = WalConnectionStatus {
            is_connected: true,
            has_processed_wal: true,
            latest_connection_update: now,
            latest_wal_update: now,
            commit_lsn: Some(current_lsn),
            streaming_lsn: Some(current_lsn),
            node: connected_sk_id,
        };

        state.wal_connection = Some(WalConnection {
            started_at: now,
            sk_id: connected_sk_id,
            availability_zone: None,
            status: connection_status,
            connection_task: state.spawn(move |sender, _| async move {
                sender
                    .send(TaskStateUpdate::Progress(connection_status))
                    .ok();
                Ok(())
            }),
            discovered_new_wal: None,
        });
        state.wal_stream_candidates = HashMap::from([
            (
                connected_sk_id,
                dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
            ),
            (
                NodeId(1),
                dummy_broker_sk_timeline(new_lsn.0, "advanced_by_lsn_safekeeper", now),
            ),
        ]);

        let over_threshcurrent_candidate = state.next_connection_candidate().expect(
            "Expected one candidate selected out of multiple valid data options, but got none",
        );

        assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(1));
        assert_eq!(
            over_threshcurrent_candidate.reason,
            ReconnectReason::LaggingWal {
                current_commit_lsn: current_lsn,
                new_commit_lsn: new_lsn,
                threshold: state.conf.max_lsn_wal_lag
            },
            "Should select bigger WAL safekeeper if it starts to lag enough"
        );
        assert_eq!(
            over_threshcurrent_candidate.wal_source_connconf.host(),
            &Host::Domain("advanced_by_lsn_safekeeper".to_owned())
        );

        Ok(())
    }

    #[tokio::test]
    async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> {
        let harness =
            TenantHarness::create("timeout_connection_threshold_current_candidate").await?;
        let mut state = dummy_state(&harness).await;
        let current_lsn = Lsn(100_000).align();
        let now = Utc::now().naive_utc();

        let wal_connect_timeout = chrono::Duration::from_std(state.conf.wal_connect_timeout)?;
        let time_over_threshold =
            Utc::now().naive_utc() - wal_connect_timeout - wal_connect_timeout;

        let connection_status = WalConnectionStatus {
            is_connected: true,
            has_processed_wal: true,
            latest_connection_update: time_over_threshold,
            latest_wal_update: time_over_threshold,
            commit_lsn: Some(current_lsn),
            streaming_lsn: Some(current_lsn),
            node: NodeId(1),
        };

        state.wal_connection = Some(WalConnection {
            started_at: now,
            sk_id: NodeId(1),
            availability_zone: None,
            status: connection_status,
            connection_task: state.spawn(move |sender, _| async move {
                sender
                    .send(TaskStateUpdate::Progress(connection_status))
                    .ok();
                Ok(())
            }),
            discovered_new_wal: None,
        });
        state.wal_stream_candidates = HashMap::from([(
            NodeId(0),
            dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
        )]);

        let over_threshcurrent_candidate = state.next_connection_candidate().expect(
            "Expected one candidate selected out of multiple valid data options, but got none",
        );

        assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(0));
        match over_threshcurrent_candidate.reason {
            ReconnectReason::NoKeepAlives {
                last_keep_alive,
                threshold,
                ..
            } => {
                assert_eq!(last_keep_alive, Some(time_over_threshold));
                // The keepalive check is governed by the connect timeout, not the WAL lag timeout.
                assert_eq!(threshold, state.conf.wal_connect_timeout);
            }
            unexpected => panic!("Unexpected reason: {unexpected:?}"),
        }
        assert_eq!(
            over_threshcurrent_candidate.wal_source_connconf.host(),
            &Host::Domain(DUMMY_SAFEKEEPER_HOST.to_owned())
        );

        Ok(())
    }

    #[tokio::test]
    async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
        let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate").await?;
        let mut state = dummy_state(&harness).await;
        let current_lsn = Lsn(100_000).align();
        let new_lsn = Lsn(100_100).align();
        let now = Utc::now().naive_utc();

        let lagging_wal_timeout = chrono::Duration::from_std(state.conf.lagging_wal_timeout)?;
        let time_over_threshold =
            Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout;

        let connection_status = WalConnectionStatus {
            is_connected: true,
            has_processed_wal: true,
            latest_connection_update: now,
            latest_wal_update: time_over_threshold,
            commit_lsn: Some(current_lsn),
            streaming_lsn: Some(current_lsn),
            node: NodeId(1),
        };

        state.wal_connection = Some(WalConnection {
            started_at: now,
            sk_id: NodeId(1),
            availability_zone: None,
            status: connection_status,
            connection_task: state.spawn(move |_, _| async move { Ok(()) }),
            discovered_new_wal: Some(NewCommittedWAL {
                discovered_at: time_over_threshold,
                lsn: new_lsn,
            }),
        });
        state.wal_stream_candidates = HashMap::from([(
            NodeId(0),
            dummy_broker_sk_timeline(new_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
        )]);

        let over_threshcurrent_candidate = state.next_connection_candidate().expect(
            "Expected one candidate selected out of multiple valid data options, but got none",
        );

        assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(0));
        match over_threshcurrent_candidate.reason {
            // The fields are bound under distinct names so they do not shadow the expected
            // values above (shadowing would turn the checks into self-comparisons).
            ReconnectReason::NoWalTimeout {
                current_lsn: reported_current_lsn,
                current_commit_lsn: reported_current_commit_lsn,
                candidate_commit_lsn,
                last_wal_interaction,
                threshold,
                ..
            } => {
                assert_eq!(reported_current_lsn, current_lsn);
                assert_eq!(reported_current_commit_lsn, current_lsn);
                assert_eq!(candidate_commit_lsn, new_lsn);
                assert_eq!(last_wal_interaction, Some(time_over_threshold));
                assert_eq!(threshold, state.conf.lagging_wal_timeout);
            }
            unexpected => panic!("Unexpected reason: {unexpected:?}"),
        }
        assert_eq!(
            over_threshcurrent_candidate.wal_source_connconf.host(),
            &Host::Domain(DUMMY_SAFEKEEPER_HOST.to_owned())
        );

        Ok(())
    }

    const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr";

    /// Creates a connection manager state over an empty test timeline, with no connection,
    /// no candidates, and one-second timeouts.
    async fn dummy_state(harness: &TenantHarness) -> ConnectionManagerState {
        let (tenant, ctx) = harness.load().await;
        let timeline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x8), crate::DEFAULT_PG_VERSION, &ctx)
            .await
            .expect("Failed to create an empty timeline for dummy wal connection manager");

        let protocol = PostgresClientProtocol::Interpreted {
            format: utils::postgres_client::InterpretedFormat::Protobuf,
            compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }),
        };

        ConnectionManagerState {
            id: TenantTimelineId {
                tenant_id: harness.tenant_shard_id.tenant_id,
                timeline_id: TIMELINE_ID,
            },
            timeline,
            cancel: CancellationToken::new(),
            conf: WalReceiverConf {
                protocol,
                wal_connect_timeout: Duration::from_secs(1),
                lagging_wal_timeout: Duration::from_secs(1),
                max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
                auth_token: None,
                availability_zone: None,
                ingest_batch_size: 1,
                validate_wal_contiguity: false,
            },
            wal_connection: None,
            wal_stream_candidates: HashMap::new(),
            wal_connection_retries: HashMap::new(),
        }
    }

    #[tokio::test]
    async fn switch_to_same_availability_zone() -> anyhow::Result<()> {
        // Pageserver and one of safekeepers will be in the same availability zone
        // and pageserver should prefer to connect to it.
        let test_az = Some("test_az".to_owned());

        let harness = TenantHarness::create("switch_to_same_availability_zone").await?;
        let mut state = dummy_state(&harness).await;
        state.conf.availability_zone.clone_from(&test_az);
        let current_lsn = Lsn(100_000).align();
        let now = Utc::now().naive_utc();

        let connected_sk_id = NodeId(0);

        let connection_status = WalConnectionStatus {
            is_connected: true,
            has_processed_wal: true,
            latest_connection_update: now,
            latest_wal_update: now,
            commit_lsn: Some(current_lsn),
            streaming_lsn: Some(current_lsn),
            node: connected_sk_id,
        };

        state.wal_connection = Some(WalConnection {
            started_at: now,
            sk_id: connected_sk_id,
            availability_zone: None,
            status: connection_status,
            connection_task: state.spawn(move |sender, _| async move {
                sender
                    .send(TaskStateUpdate::Progress(connection_status))
                    .ok();
                Ok(())
            }),
            discovered_new_wal: None,
        });

        // We have another safekeeper with the same commit_lsn, and it has the same availability zone as
        // the current pageserver.
        let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now);
        same_az_sk.timeline.availability_zone.clone_from(&test_az);

        state.wal_stream_candidates = HashMap::from([
            (
                connected_sk_id,
                dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
            ),
            (NodeId(1), same_az_sk),
        ]);

        // We expect that pageserver will switch to the safekeeper in the same availability zone,
        // even if it has the same commit_lsn.
        let next_candidate = state.next_connection_candidate().expect(
            "Expected one candidate selected out of multiple valid data options, but got none",
        );

        assert_eq!(next_candidate.safekeeper_id, NodeId(1));
        assert_eq!(
            next_candidate.reason,
            ReconnectReason::SwitchAvailabilityZone,
            "Should switch to the safekeeper in the same availability zone, if it has the same commit_lsn"
        );
        assert_eq!(
            next_candidate.wal_source_connconf.host(),
            &Host::Domain("same_az".to_owned())
        );

        Ok(())
    }
}

================================================
FILE: pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
================================================
//! Actual Postgres connection handler to stream WAL to the server.

use std::error::Error;
use std::pin::pin;
use std::str::FromStr;
use std::sync::Arc;
use std::time::{Duration, SystemTime};

use anyhow::{Context, anyhow};
use bytes::BytesMut;
use chrono::{NaiveDateTime, Utc};
use fail::fail_point;
use futures::StreamExt;
use postgres_backend::is_expected_io_error;
use postgres_connection::PgConnectionConfig;
use postgres_ffi::WAL_SEGMENT_SIZE;
use postgres_ffi::v14::xlog_utils::normalize_lsn;
use postgres_ffi::waldecoder::WalDecodeError;
use postgres_protocol::message::backend::ReplicationMessage;
use postgres_types::PgLsn;
use tokio::sync::watch;
use tokio::{select, time};
use tokio_postgres::error::SqlState;
use tokio_postgres::replication::ReplicationStream;
use tokio_postgres::{Client, SimpleQueryMessage, SimpleQueryRow};
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, debug, error, info, trace, warn};
use utils::critical_timeline;
use utils::id::NodeId;
use utils::lsn::Lsn;
use utils::pageserver_feedback::PageserverFeedback;
use utils::postgres_client::PostgresClientProtocol;
use utils::sync::gate::GateError;
use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecords};
use wal_decoder::wire_format::FromWireFormat;

use super::TaskStateUpdate;
use crate::context::RequestContext;
use
crate::metrics::{LIVE_CONNECTIONS, WAL_INGEST, WALRECEIVER_STARTED_CONNECTIONS}; use crate::pgdatadir_mapping::DatadirModification; use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; use crate::tenant::{ Timeline, WalReceiverInfo, debug_assert_current_span_has_tenant_and_timeline_id, }; use crate::walingest::WalIngest; /// Status of the connection. #[derive(Debug, Clone, Copy)] pub(super) struct WalConnectionStatus { /// If we were able to initiate a postgres connection, this means that safekeeper process is at least running. pub is_connected: bool, /// Defines a healthy connection as one on which pageserver received WAL from safekeeper /// and is able to process it in walingest without errors. pub has_processed_wal: bool, /// Connection establishment time or the timestamp of a latest connection message received. pub latest_connection_update: NaiveDateTime, /// Time of the latest WAL message received. pub latest_wal_update: NaiveDateTime, /// Latest WAL update contained WAL up to this LSN. Next WAL message with start from that LSN. pub streaming_lsn: Option, /// Latest commit_lsn received from the safekeeper. Can be zero if no message has been received yet. pub commit_lsn: Option, /// The node it is connected to pub node: NodeId, } pub(super) enum WalReceiverError { /// An error of a type that does not indicate an issue, e.g. a connection closing ExpectedSafekeeperError(tokio_postgres::Error), /// An "error" message that carries a SUCCESSFUL_COMPLETION status code. 
Carries /// the message part of the original postgres error SuccessfulCompletion(String), /// Generic error Other(anyhow::Error), ClosedGate, Cancelled, } impl From for WalReceiverError { fn from(err: tokio_postgres::Error) -> Self { if let Some(dberror) = err.as_db_error().filter(|db_error| { db_error.code() == &SqlState::SUCCESSFUL_COMPLETION && db_error.message().contains("ending streaming") }) { // Strip the outer DbError, which carries a misleading "error" severity Self::SuccessfulCompletion(dberror.message().to_string()) } else if err.is_closed() || err .source() .and_then(|source| source.downcast_ref::()) .map(is_expected_io_error) .unwrap_or(false) { Self::ExpectedSafekeeperError(err) } else { Self::Other(anyhow::Error::new(err)) } } } impl From for WalReceiverError { fn from(err: anyhow::Error) -> Self { Self::Other(err) } } impl From for WalReceiverError { fn from(err: WalDecodeError) -> Self { Self::Other(anyhow::Error::new(err)) } } /// Open a connection to the given safekeeper and receive WAL, sending back progress /// messages as we go. #[allow(clippy::too_many_arguments)] pub(super) async fn handle_walreceiver_connection( timeline: Arc, protocol: PostgresClientProtocol, wal_source_connconf: PgConnectionConfig, events_sender: watch::Sender>, cancellation: CancellationToken, connect_timeout: Duration, ctx: RequestContext, safekeeper_node: NodeId, ingest_batch_size: u64, validate_wal_contiguity: bool, ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); // prevent timeline shutdown from finishing until we have exited let _guard = timeline.gate.enter().map_err(|e| match e { GateError::GateClosed => WalReceiverError::ClosedGate, })?; // This function spawns a side-car task (WalReceiverConnectionPoller). // Get its gate guard now as well. 
let poller_guard = timeline.gate.enter().map_err(|e| match e { GateError::GateClosed => WalReceiverError::ClosedGate, })?; WALRECEIVER_STARTED_CONNECTIONS.inc(); // Connect to the database in replication mode. info!("connecting to {wal_source_connconf:?}"); let (replication_client, connection) = { let mut config = wal_source_connconf.to_tokio_postgres_config(); config.application_name(format!("pageserver-{}", timeline.conf.id.0).as_str()); config.replication_mode(tokio_postgres::config::ReplicationMode::Physical); match time::timeout(connect_timeout, config.connect(tokio_postgres::NoTls)).await { Ok(client_and_conn) => client_and_conn?, Err(_elapsed) => { // Connecting to a safekeeper node can time out, possibly for a long time, due to // many reasons that the pageserver cannot control. // Do not produce an error, but make it visible that timeouts happen by logging the event. info!( "Timed out while waiting {connect_timeout:?} for walreceiver connection to open" ); return Ok(()); } } }; debug!("connected!"); let mut connection_status = WalConnectionStatus { is_connected: true, has_processed_wal: false, latest_connection_update: Utc::now().naive_utc(), latest_wal_update: Utc::now().naive_utc(), streaming_lsn: None, commit_lsn: None, node: safekeeper_node, }; if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!( "Wal connection event listener dropped right after connection init, aborting the connection: {e}" ); return Ok(()); } // The connection object performs the actual communication with the database, // so spawn it off to run on its own. It shouldn't outlive this function, but, // due to lack of async drop, we can't enforce that. However, we ensure that // 1. it is sensitive to `cancellation` and // 2. holds the Timeline gate open so that after timeline shutdown, // we know this task is gone.
let _connection_ctx = ctx.detached_child( TaskKind::WalReceiverConnectionPoller, ctx.download_behavior(), ); let connection_cancellation = cancellation.clone(); WALRECEIVER_RUNTIME.spawn( async move { debug_assert_current_span_has_tenant_and_timeline_id(); select! { connection_result = connection => match connection_result { Ok(()) => debug!("Walreceiver db connection closed"), Err(connection_error) => { match WalReceiverError::from(connection_error) { WalReceiverError::ExpectedSafekeeperError(_) => { // silence, because most likely we've already exited the outer call // with a similar error. }, WalReceiverError::SuccessfulCompletion(_) => {} WalReceiverError::Cancelled => { debug!("Connection cancelled") } WalReceiverError::ClosedGate => { // doesn't happen at runtime } WalReceiverError::Other(err) => { warn!("Connection aborted: {err:#}") } } } }, _ = connection_cancellation.cancelled() => debug!("Connection cancelled"), } drop(poller_guard); } // Enrich the log lines emitted by this closure with meaningful context. // TODO: technically, this task outlives the surrounding function, so, the // spans won't be properly nested. .instrument(tracing::info_span!("poller")), ); let _guard = LIVE_CONNECTIONS .with_label_values(&["wal_receiver"]) .guard(); let identify = identify_system(&replication_client).await?; info!("{identify:?}"); let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); let mut caught_up = false; connection_status.latest_connection_update = Utc::now().naive_utc(); connection_status.latest_wal_update = Utc::now().naive_utc(); connection_status.commit_lsn = Some(end_of_wal); if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!( "Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}" ); return Ok(()); } // // Start streaming the WAL, from where we left off previously. 
// // If we had previously received WAL up to some point in the middle of a WAL record, we // better start from the end of last full WAL record, not in the middle of one. let mut last_rec_lsn = timeline.get_last_record_lsn(); let mut startpoint = last_rec_lsn; if startpoint == Lsn(0) { return Err(WalReceiverError::Other(anyhow!("No previous WAL position"))); } // There might be some padding after the last full record, skip it. startpoint += startpoint.calc_padding(8u32); // If the starting point is at a WAL page boundary, skip past the page header. We don't need the page headers // for anything, and in some corner cases, the compute node might have never generated the WAL for page headers //. That happens if you create a branch at page boundary: the start point of the branch is at the page boundary, // but when the compute node first starts on the branch, we normalize the first REDO position to just after the page // header (see generate_pg_control()), so the WAL for the page header is never streamed from the compute node // to the safekeepers. startpoint = normalize_lsn(startpoint, WAL_SEGMENT_SIZE); info!( "last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}..." 
); let query = format!("START_REPLICATION PHYSICAL {startpoint}"); let copy_stream = replication_client.copy_both_simple(&query).await?; let mut physical_stream = pin!(ReplicationStream::new(copy_stream)); let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx) .await .map_err(|e| match e.kind { crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled, _ => WalReceiverError::Other(e.into()), })?; let (format, compression) = match protocol { PostgresClientProtocol::Interpreted { format, compression, } => (format, compression), PostgresClientProtocol::Vanilla => { return Err(WalReceiverError::Other(anyhow!( "Vanilla WAL receiver protocol is no longer supported for ingest" ))); } }; let mut expected_wal_start = startpoint; while let Some(replication_message) = { select! { biased; _ = cancellation.cancelled() => { debug!("walreceiver interrupted"); None } replication_message = physical_stream.next() => replication_message, } } { let replication_message = replication_message?; let now = Utc::now().naive_utc(); let last_rec_lsn_before_msg = last_rec_lsn; // Update the connection status before processing the message. If the message processing // fails (e.g. in walingest), we still want to know the latest LSNs from the safekeeper.
match &replication_message { ReplicationMessage::PrimaryKeepAlive(keepalive) => { connection_status.latest_connection_update = now; connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end())); } ReplicationMessage::RawInterpretedWalRecords(raw) => { connection_status.latest_connection_update = now; if !raw.data().is_empty() { connection_status.latest_wal_update = now; } connection_status.commit_lsn = Some(Lsn::from(raw.commit_lsn())); connection_status.streaming_lsn = Some(Lsn::from(raw.streaming_lsn())); } &_ => {} }; if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped, aborting the connection: {e}"); return Ok(()); } let status_update = match replication_message { ReplicationMessage::RawInterpretedWalRecords(raw) => { WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64); let mut uncommitted_records = 0; // This is the end LSN of the raw WAL from which the records // were interpreted. let streaming_lsn = Lsn::from(raw.streaming_lsn()); let batch = InterpretedWalRecords::from_wire(raw.data(), format, compression) .await .with_context(|| { anyhow::anyhow!( "Failed to deserialize interpreted records ending at LSN {streaming_lsn}" ) })?; // Guard against WAL gaps. If the start LSN of the PG WAL section // from which the interpreted records were extracted, doesn't match // the end of the previous batch (or the starting point for the first batch), // then kill this WAL receiver connection and start a new one. 
if validate_wal_contiguity { if let Some(raw_wal_start_lsn) = batch.raw_wal_start_lsn { match raw_wal_start_lsn.cmp(&expected_wal_start) { std::cmp::Ordering::Greater => { let msg = format!( "Gap in streamed WAL: [{expected_wal_start}, {raw_wal_start_lsn}" ); critical_timeline!( timeline.tenant_shard_id, timeline.timeline_id, Some(&timeline.corruption_detected), "{msg}" ); return Err(WalReceiverError::Other(anyhow!(msg))); } std::cmp::Ordering::Less => { // Other shards are reading WAL behind us. // This is valid, but check that we received records // that we haven't seen before. if let Some(first_rec) = batch.records.first() { if first_rec.next_record_lsn < last_rec_lsn { let msg = format!( "Received record with next_record_lsn multiple times ({} < {})", first_rec.next_record_lsn, expected_wal_start ); critical_timeline!( timeline.tenant_shard_id, timeline.timeline_id, Some(&timeline.corruption_detected), "{msg}" ); return Err(WalReceiverError::Other(anyhow!(msg))); } } } std::cmp::Ordering::Equal => {} } } } let InterpretedWalRecords { records, next_record_lsn, raw_wal_start_lsn: _, } = batch; tracing::debug!( "Received WAL up to {} with next_record_lsn={}", streaming_lsn, next_record_lsn ); // We start the modification at 0 because each interpreted record // advances it to its end LSN. 0 is just an initialization placeholder. 
let mut modification = timeline.begin_modification(Lsn(0)); async fn commit( modification: &mut DatadirModification<'_>, ctx: &RequestContext, uncommitted: &mut u64, ) -> anyhow::Result<()> { let stats = modification.stats(); modification.commit(ctx).await?; WAL_INGEST.records_committed.inc_by(*uncommitted); WAL_INGEST.inc_values_committed(&stats); *uncommitted = 0; Ok(()) } if !records.is_empty() { timeline .metrics .wal_records_received .inc_by(records.len() as u64); } for interpreted in records { if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) && uncommitted_records > 0 { commit(&mut modification, &ctx, &mut uncommitted_records).await?; } let local_next_record_lsn = interpreted.next_record_lsn; if interpreted.is_observed() { WAL_INGEST.records_observed.inc(); } walingest .ingest_record(interpreted, &mut modification, &ctx) .await .with_context(|| { format!("could not ingest record at {local_next_record_lsn}") }) .inspect_err(|err| { // TODO: we can't differentiate cancellation errors with // anyhow::Error, so just ignore it if we're cancelled. if !cancellation.is_cancelled() && !timeline.is_stopping() { critical_timeline!( timeline.tenant_shard_id, timeline.timeline_id, Some(&timeline.corruption_detected), "{err:?}" ); } })?; uncommitted_records += 1; // FIXME: this cannot be made pausable_failpoint without fixing the // failpoint library; in tests, the added amount of debugging will cause us // to timeout the tests. fail_point!("walreceiver-after-ingest"); // Commit every ingest_batch_size records. Even if we filtered out // all records, we still need to call commit to advance the LSN. if uncommitted_records >= ingest_batch_size || modification.approx_pending_bytes() > DatadirModification::MAX_PENDING_BYTES { commit(&mut modification, &ctx, &mut uncommitted_records).await?; } } // Records might have been filtered out on the safekeeper side, but we still // need to advance last record LSN on all shards. 
If we've not ingested the latest // record, then set the LSN of the modification past it. This way all shards // advance their last record LSN at the same time. let needs_last_record_lsn_advance = if next_record_lsn > modification.get_lsn() { modification.set_lsn(next_record_lsn).unwrap(); true } else { false }; if uncommitted_records > 0 || needs_last_record_lsn_advance { // Commit any uncommitted records commit(&mut modification, &ctx, &mut uncommitted_records).await?; } if !caught_up && streaming_lsn >= end_of_wal { info!("caught up at LSN {streaming_lsn}"); caught_up = true; } tracing::debug!( "Ingested WAL up to {streaming_lsn}. Last record LSN is {}", timeline.get_last_record_lsn() ); last_rec_lsn = next_record_lsn; expected_wal_start = streaming_lsn; Some(streaming_lsn) } ReplicationMessage::PrimaryKeepAlive(keepalive) => { let wal_end = keepalive.wal_end(); let timestamp = keepalive.timestamp(); let reply_requested = keepalive.reply() != 0; trace!( "received PrimaryKeepAlive(wal_end: {wal_end}, timestamp: {timestamp:?} reply: {reply_requested})" ); if reply_requested { Some(last_rec_lsn) } else { None } } _ => None, }; if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg { // We have successfully processed at least one WAL record. connection_status.has_processed_wal = true; if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped, aborting the connection: {e}"); return Ok(()); } } if let Some(last_lsn) = status_update { let timeline_remote_consistent_lsn = timeline .get_remote_consistent_lsn_visible() .unwrap_or(Lsn(0)); // The last LSN we processed. It is not guaranteed to survive pageserver crash. 
let last_received_lsn = last_lsn; // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. let remote_consistent_lsn = timeline_remote_consistent_lsn; let ts = SystemTime::now(); // Update the status about what we just received. This is shown in the mgmt API. // `duration_since(UNIX_EPOCH)` only fails if `ts` is earlier than the epoch. let last_received_wal = WalReceiverInfo { wal_source_connconf: wal_source_connconf.clone(), last_received_msg_lsn: last_lsn, last_received_msg_ts: ts .duration_since(SystemTime::UNIX_EPOCH) .expect("Received message time should be after UNIX EPOCH!") .as_micros(), }; *timeline.last_received_wal.lock().unwrap() = Some(last_received_wal); // Send the replication feedback message. // Regular standby_status_update fields are put into this message. let current_timeline_size = if timeline.tenant_shard_id.is_shard_zero() { timeline .get_current_logical_size( crate::tenant::timeline::GetLogicalSizePriority::User, &ctx, ) // FIXME: https://github.com/neondatabase/neon/issues/5963 .size_dont_care_about_accuracy() } else { // Non-zero shards send zero for logical size. The safekeeper will ignore // this number. This is because in a sharded tenant, only shard zero maintains // accurate logical size.
0 }; let status_update = PageserverFeedback { current_timeline_size, last_received_lsn, disk_consistent_lsn, remote_consistent_lsn, replytime: ts, shard_number: timeline.tenant_shard_id.shard_number.0 as u32, corruption_detected: timeline .corruption_detected .load(std::sync::atomic::Ordering::Relaxed), }; debug!("neon_status_update {status_update:?}"); let mut data = BytesMut::new(); status_update.serialize(&mut data); physical_stream .as_mut() .zenith_status_update(data.len() as u64, &data) .await?; } } Ok(()) } /// Data returned from the postgres `IDENTIFY_SYSTEM` command /// /// See the [postgres docs] for more details. /// /// [postgres docs]: https://www.postgresql.org/docs/current/protocol-replication.html #[derive(Debug)] // As of nightly 2021-09-11, fields that are only read by the type's `Debug` impl still count as // unused. Relevant issue: https://github.com/rust-lang/rust/issues/88900 #[allow(dead_code)] struct IdentifySystem { systemid: u64, timeline: u32, xlogpos: PgLsn, dbname: Option, } /// There was a problem parsing the response to /// a postgres IDENTIFY_SYSTEM command. #[derive(Debug, thiserror::Error)] #[error("IDENTIFY_SYSTEM parse error")] struct IdentifyError; /// Run the postgres `IDENTIFY_SYSTEM` command async fn identify_system(client: &Client) -> anyhow::Result { let query_str = "IDENTIFY_SYSTEM"; let response = client.simple_query(query_str).await?; // get(N) from row, then parse it as some destination type. fn get_parse(row: &SimpleQueryRow, idx: usize) -> Result where T: FromStr, { let val = row.get(idx).ok_or(IdentifyError)?; val.parse::().or(Err(IdentifyError)) } // extract the row contents into an IdentifySystem struct. // written as a closure so I can use ? for Option here. 
if let Some(SimpleQueryMessage::Row(first_row)) = response.first() { Ok(IdentifySystem { systemid: get_parse(first_row, 0)?, timeline: get_parse(first_row, 1)?, xlogpos: get_parse(first_row, 2)?, dbname: get_parse(first_row, 3).ok(), }) } else { Err(IdentifyError.into()) } } ================================================ FILE: pageserver/src/tenant/timeline/walreceiver.rs ================================================ //! WAL receiver manages an open connection to safekeeper, to get the WAL it streams into. //! To do so, a current implementation needs to do the following: //! //! * acknowledge the timelines that it needs to stream WAL into. //! Pageserver is able to dynamically (un)load tenants on attach and detach, //! hence WAL receiver needs to react on such events. //! //! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming. //! For that, it watches specific keys in storage_broker and pulls the relevant data periodically. //! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other. //! Without this data, no WAL streaming is possible currently. //! //! Only one active WAL streaming connection is allowed at a time. //! The connection is supposed to be updated periodically, based on safekeeper timeline data. //! //! * handle the actual connection and WAL streaming //! //! Handling happens dynamically, by portions of WAL being processed and registered in the server. //! Along with the registration, certain metadata is written to show WAL streaming progress and rely on that when considering safekeepers for connection. //! //! The current module contains high-level primitives used in the submodules; general synchronization, timeline acknowledgement and shutdown logic. 
mod connection_manager; mod walreceiver_connection; use std::future::Future; use std::num::NonZeroU64; use std::sync::Arc; use std::time::Duration; use storage_broker::BrokerClientChannel; use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; use utils::postgres_client::PostgresClientProtocol; use self::connection_manager::ConnectionManagerStatus; use super::Timeline; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::timeline::walreceiver::connection_manager::{ ConnectionManagerState, connection_manager_loop_step, }; #[derive(Clone)] pub struct WalReceiverConf { pub protocol: PostgresClientProtocol, /// The timeout on the connection to safekeeper for WAL streaming. pub wal_connect_timeout: Duration, /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. pub lagging_wal_timeout: Duration, /// The Lsn lag to use to determine when the current connection is lagging too far behind and reconnect to the other one. pub max_lsn_wal_lag: NonZeroU64, pub auth_token: Option>, pub availability_zone: Option, pub ingest_batch_size: u64, pub validate_wal_contiguity: bool, } pub struct WalReceiver { manager_status: Arc>>, /// All tasks spawned by [`WalReceiver::start`] and its children are sensitive to this token. /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`.
cancel: CancellationToken, } impl WalReceiver { pub fn start( timeline: Arc, conf: WalReceiverConf, mut broker_client: BrokerClientChannel, ctx: &RequestContext, ) -> Self { let tenant_shard_id = timeline.tenant_shard_id; let timeline_id = timeline.timeline_id; let walreceiver_ctx = ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); let loop_status = Arc::new(std::sync::RwLock::new(None)); let manager_status = Arc::clone(&loop_status); let cancel = timeline.cancel.child_token(); let _task = WALRECEIVER_RUNTIME.spawn({ let cancel = cancel.clone(); async move { debug_assert_current_span_has_tenant_and_timeline_id(); // acquire timeline gate so we know the task doesn't outlive the Timeline let Ok(_guard) = timeline.gate.enter() else { debug!("WAL receiver manager could not enter the gate timeline gate, it's closed already"); return; }; debug!("WAL receiver manager started, connecting to broker"); let mut connection_manager_state = ConnectionManagerState::new( timeline, conf, cancel.clone(), ); while !cancel.is_cancelled() { let loop_step_result = connection_manager_loop_step( &mut broker_client, &mut connection_manager_state, &walreceiver_ctx, &cancel, &loop_status, ).await; match loop_step_result { Ok(()) => continue, Err(_cancelled) => { trace!("Connection manager loop ended, shutting down"); break; } } } connection_manager_state.shutdown().await; *loop_status.write().unwrap() = None; info!("task exits"); } .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id)) }); Self { manager_status, cancel, } } #[instrument(skip_all, level = tracing::Level::DEBUG)] pub async fn cancel(self) { debug_assert_current_span_has_tenant_and_timeline_id(); debug!("cancelling walreceiver tasks"); self.cancel.cancel(); } pub(crate) fn status(&self) -> Option { self.manager_status.read().unwrap().clone() } } /// A handle of an asynchronous task. 
/// The task has a channel that it can use to communicate its lifecycle events in a certain form, see [`TaskEvent`] /// and a cancellation token that it can listen to for earlier interrupts. /// /// Note that the communication happens via the `watch` channel, which does not accumulate the events, replacing the old one with the newer one on submission. /// That may lead to certain events not being observed by the listener. #[derive(Debug)] struct TaskHandle { join_handle: Option>>, events_receiver: watch::Receiver>, cancellation: CancellationToken, } enum TaskEvent { Update(TaskStateUpdate), End(anyhow::Result<()>), } #[derive(Debug, Clone)] enum TaskStateUpdate { Started, Progress(E), } impl TaskHandle { /// Initializes the task, starting it immediately after the creation. /// /// The second argument to `task` is a child token of `cancel_parent` ([`CancellationToken::child_token`]). /// It being a child token enables us to provide a [`Self::shutdown`] method. fn spawn( cancel_parent: &CancellationToken, task: impl FnOnce(watch::Sender>, CancellationToken) -> Fut + Send + 'static, ) -> Self where Fut: Future> + Send, E: Send + Sync + 'static, { let cancellation = cancel_parent.child_token(); let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started); let cancellation_clone = cancellation.clone(); let join_handle = WALRECEIVER_RUNTIME.spawn(async move { events_sender.send(TaskStateUpdate::Started).ok(); task(events_sender, cancellation_clone).await // events_sender is dropped at some point during the .await above. // But the task is still running on WALRECEIVER_RUNTIME. // That is the window when `!jh.is_finished()` // is true inside `fn next_task_event()` below. }); TaskHandle { join_handle: Some(join_handle), events_receiver, cancellation, } } /// # Cancel-Safety /// /// Cancellation-safe.
async fn next_task_event(&mut self) -> TaskEvent { match self.events_receiver.changed().await { Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()), Err(_task_channel_part_dropped) => { TaskEvent::End(match self.join_handle.as_mut() { Some(jh) => { if !jh.is_finished() { // See: https://github.com/neondatabase/neon/issues/2885 trace!("sender is dropped while join handle is still alive"); } let res = match jh.await { Ok(res) => res, Err(je) if je.is_cancelled() => unreachable!("not used"), Err(je) if je.is_panic() => { // already logged Ok(()) } Err(je) => Err(anyhow::Error::new(je).context("join walreceiver task")), }; // For cancellation-safety, drop join_handle only after successful .await. self.join_handle = None; res } None => { // Another option is to have an enum, join handle or result and give away the reference to it Err(anyhow::anyhow!("Task was joined more than once")) } }) } } } /// Aborts current task, waiting for it to finish. async fn shutdown(self) { if let Some(jh) = self.join_handle { self.cancellation.cancel(); match jh.await { Ok(Ok(())) => debug!("Shutdown success"), Ok(Err(e)) => error!("Shutdown task error: {e:?}"), Err(je) if je.is_cancelled() => unreachable!("not used"), Err(je) if je.is_panic() => { // already logged } Err(je) => { error!("Shutdown task join error: {je}") } } } } } ================================================ FILE: pageserver/src/tenant/timeline.rs ================================================ pub(crate) mod analysis; pub(crate) mod compaction; pub mod delete; pub(crate) mod detach_ancestor; mod eviction_task; pub(crate) mod handle; mod heatmap_layers_downloader; pub(crate) mod import_pgdata; mod init; pub mod layer_manager; pub(crate) mod logical_size; pub mod offload; pub mod span; pub mod uninit; mod walreceiver; use hashlink::LruCache; use std::array; use std::cmp::{max, min}; use std::collections::btree_map::Entry; use std::collections::{BTreeMap, HashMap, HashSet}; use std::ops::{ControlFlow, 
Deref, Range}; use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; use anyhow::{Context, Result, anyhow, bail, ensure}; use arc_swap::{ArcSwap, ArcSwapOption}; use bytes::Bytes; use camino::Utf8Path; use chrono::{DateTime, Utc}; use compaction::{CompactionOutcome, GcCompactionCombinedSettings}; use enumset::EnumSet; use fail::fail_point; use futures::stream::FuturesUnordered; use futures::{FutureExt, StreamExt}; use handle::ShardTimelineId; use layer_manager::{ LayerManagerLockHolder, LayerManagerReadGuard, LayerManagerWriteGuard, LockedLayerManager, Shutdown, }; use once_cell::sync::Lazy; use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use pageserver_api::key::{ KEY_SIZE, Key, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, SPARSE_RANGE, }; use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}; use pageserver_api::models::{ CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, DetachBehavior, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, TimelineState, }; use pageserver_api::reltag::{BlockNumber, RelTag}; use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId}; use postgres_connection::PgConnectionConfig; use postgres_ffi::v14::xlog_utils; use postgres_ffi::{PgMajorVersion, WAL_SEGMENT_SIZE, to_pg_timestamp}; use rand::Rng; use remote_storage::DownloadError; use serde_with::serde_as; use storage_broker::BrokerClientChannel; use tokio::runtime::Handle; use tokio::sync::mpsc::Sender; use tokio::sync::{Notify, oneshot, watch}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::generation::Generation; use utils::guard_arc_swap::GuardArcSwap; use utils::id::TimelineId; use 
utils::logging::{MonitorSlowFutureCallback, log_slow, monitor_slow_future}; use utils::lsn::{AtomicLsn, Lsn, RecordLsn}; use utils::postgres_client::PostgresClientProtocol; use utils::rate_limit::RateLimit; use utils::seqwait::SeqWait; use utils::simple_rcu::{Rcu, RcuReadGuard}; use utils::sync::gate::{Gate, GateGuard}; use utils::{completion, critical_timeline, fs_ext, pausable_failpoint}; #[cfg(test)] use wal_decoder::models::value::Value; use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use self::delete::DeleteTimelineFlow; pub(super) use self::eviction_task::EvictionTaskTenantState; use self::eviction_task::EvictionTaskTimelineState; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::remote_timeline_client::RemoteTimelineClient; use super::remote_timeline_client::index::{GcCompactionState, IndexPart}; use super::secondary::heatmap::HeatMapLayer; use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; use super::tasks::log_compaction_error; use super::upload_queue::NotInitialized; use super::{ AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded, debug_assert_current_span_has_tenant_and_timeline_id, }; use crate::PERF_TRACE_TARGET; use crate::aux_file::AuxFileSizeEstimator; use crate::basebackup_cache::BasebackupCache; use crate::config::PageServerConf; use crate::context::{ DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, }; use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32}; use crate::feature_resolver::TenantFeatureResolver; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::l0_flush::{self, L0FlushGlobalState}; use crate::metrics::{ DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_AMORTIZED_GLOBAL, LAYERS_PER_READ_BATCH_GLOBAL, LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics, }; use crate::page_service::TenantManagerTypes; use crate::pgdatadir_mapping::{ 
CalculateLogicalSizeError, CollectKeySpaceError, DirectoryKind, LsnForTimestamp, MAX_AUX_FILE_V2_DELTAS, MetricsUpdate, }; use crate::task_mgr::TaskKind; use crate::tenant::gc_result::GcResult; use crate::tenant::layer_map::LayerMap; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::delta_layer::DeltaEntry; use crate::tenant::storage_layer::inmemory_layer::IndexEntry; use crate::tenant::storage_layer::{ AsLayerDesc, BatchLayerWriter, DeltaLayerWriter, EvictionError, ImageLayerName, ImageLayerWriter, InMemoryLayer, IoConcurrency, Layer, LayerAccessStatsReset, LayerName, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, ValueReconstructSituation, ValueReconstructState, ValuesReconstructState, }; use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use crate::walingest::WalLagCooldown; use crate::walredo::RedoAttemptType; use crate::{ZERO_PAGE, task_mgr, walredo}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(crate) enum FlushLoopState { NotStarted, Running { #[cfg(test)] expect_initdb_optimization: bool, #[cfg(test)] initdb_optimization_count: usize, }, Exited, } #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub enum ImageLayerCreationMode { /// Try to create image layers based on `time_for_new_image_layer`. Used in compaction code path. Try, /// Force creating the image layers if possible. For now, no image layers will be created /// for metadata keys. Used in compaction code path with force flag enabled. Force, /// Initial ingestion of the data, and no data should be dropped in this function. This /// means that no metadata keys should be included in the partitions. Used in flush frozen layer /// code path. Initial, } #[derive(Clone, Debug, Default)] pub enum LastImageLayerCreationStatus { Incomplete { /// The last key of the partition (exclusive) that was processed in the last /// image layer creation attempt. 
We will continue from this key in the next /// attempt. last_key: Key, }, Complete, #[default] Initial, } impl std::fmt::Display for ImageLayerCreationMode { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{self:?}") } } /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. /// Can be removed after all refactors are done. fn drop_layer_manager_rlock(rlock: LayerManagerReadGuard<'_>) { drop(rlock) } /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. /// Can be removed after all refactors are done. fn drop_layer_manager_wlock(rlock: LayerManagerWriteGuard<'_>) { drop(rlock) } /// The outward-facing resources required to build a Timeline pub struct TimelineResources { pub remote_client: RemoteTimelineClient, pub pagestream_throttle: Arc, pub pagestream_throttle_metrics: Arc, pub l0_compaction_trigger: Arc, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, pub basebackup_cache: Arc, pub feature_resolver: Arc, } pub struct Timeline { pub(crate) conf: &'static PageServerConf, tenant_conf: Arc>, myself: Weak, pub(crate) tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, /// The generation of the tenant that instantiated us: this is used for safety when writing remote objects. /// Never changes for the lifetime of this [`Timeline`] object. /// /// This duplicates the generation stored in LocationConf, but that structure is mutable: /// this copy enforces the invariant that the generation doesn't change during a Tenant's lifetime. pub(crate) generation: Generation, /// The detailed sharding information from our parent Tenant. This enables us to map keys /// to shards, and is constant through the lifetime of this Timeline. shard_identity: ShardIdentity, pub pg_version: PgMajorVersion, /// The tuple has two elements. /// 1.
`LayerFileManager` keeps track of the various physical representations of the layer files (inmem, local, remote). /// 2. `LayerMap`, the acceleration data structure for `get_reconstruct_data`. /// /// `LayerMap` maps out the `(PAGE,LSN) / (KEY,LSN)` space, which is composed of `(KeyRange, LsnRange)` rectangles. /// We describe these rectangles through the `PersistentLayerDesc` struct. /// /// When we want to reconstruct a page, we first find the `PersistentLayerDesc`'s that we need for page reconstruction, /// using `LayerMap`. Then, we use `LayerFileManager` to get the `PersistentLayer`'s that correspond to the /// `PersistentLayerDesc`'s. /// /// Hence, it's important to keep things coherent. The `LayerFileManager` must always have an entry for all /// `PersistentLayerDesc`'s in the `LayerMap`. If it doesn't, `LayerFileManager::get_from_desc` will panic at /// runtime, e.g., during page reconstruction. /// /// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`, /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`. pub(crate) layers: LockedLayerManager, last_freeze_at: AtomicLsn, // Atomic would be more appropriate here. last_freeze_ts: RwLock, pub(crate) standby_horizon: AtomicLsn, // WAL redo manager. `None` only for broken tenants. walredo_mgr: Option>, /// Remote storage client. /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details. pub(crate) remote_client: Arc, // What page versions do we hold in the repository? If we get a // request > last_record_lsn, we need to wait until we receive all // the WAL up to the request. The SeqWait provides functions for // that. TODO: If we get a request for an old LSN, such that the // versions have already been garbage collected away, we should // throw an error, but we don't track that currently. // // last_record_lsn.load().last points to the end of last processed WAL record. 
// // We also remember the starting point of the previous record in // 'last_record_lsn.load().prev'. It's used to set the xl_prev pointer of the // first WAL record when the node is started up. But here, we just // keep track of it. last_record_lsn: SeqWait, // All WAL records have been processed and stored durably on files on // local disk, up to this LSN. On crash and restart, we need to re-process // the WAL starting from this point. // // Some later WAL records might have been processed and also flushed to disk // already, so don't be surprised to see some, but there's no guarantee on // them yet. disk_consistent_lsn: AtomicLsn, // Parent timeline that this timeline was branched from, and the LSN // of the branch point. ancestor_timeline: Option>, ancestor_lsn: Lsn, // The LSN of gc-compaction that was last applied to this timeline. gc_compaction_state: ArcSwapOption, pub(crate) metrics: Arc, // `Timeline` doesn't write these metrics itself, but it manages the lifetime. Code // in `crate::page_service` writes these metrics. pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline, directory_metrics_inited: [AtomicBool; DirectoryKind::KINDS_NUM], directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM], /// Ensures layers aren't frozen by checkpointer between /// [`Timeline::get_layer_for_write`] and layer reads. /// Locked automatically by [`TimelineWriter`] and checkpointer. /// Must always be acquired before the layer map/individual layer lock /// to avoid deadlock. /// /// The state is cleared upon freezing. write_lock: tokio::sync::Mutex>, /// Used to avoid multiple `flush_loop` tasks running pub(super) flush_loop_state: Mutex, /// layer_flush_start_tx can be used to wake up the layer-flushing task. /// - The u64 value is a counter, incremented every time a new flush cycle is requested. /// The flush cycle counter is sent back on the layer_flush_done channel when /// the flush finishes. You can use that to wait for the flush to finish. 
/// - The LSN is updated to max() of its current value and the latest disk_consistent_lsn /// read by whoever sends an update layer_flush_start_tx: tokio::sync::watch::Sender<(u64, Lsn)>, /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>, // The LSN at which we have executed GC: whereas [`Self::gc_info`] records the LSN at which // we _intend_ to GC (i.e. the PITR cutoff), this LSN records where we actually last did it. // Because PITR interval is mutable, it's possible for this LSN to be earlier or later than // the planned GC cutoff. pub applied_gc_cutoff_lsn: Rcu, pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>, // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines. pub(crate) gc_info: std::sync::RwLock, pub(crate) last_image_layer_creation_status: ArcSwap, // It may change across major versions so for simplicity // keep it after running initdb for a timeline. // It is needed in checks when we want to error on some operations // when they are requested for pre-initdb lsn. // It can be unified with latest_gc_cutoff_lsn under some "first_valid_lsn", // though let's keep them both for better error visibility. pub initdb_lsn: Lsn, /// The repartitioning result. Allows a single writer and multiple readers. pub(crate) partitioning: GuardArcSwap<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, last_image_layer_creation_check_at: AtomicLsn, last_image_layer_creation_check_instant: std::sync::Mutex>, /// Current logical size of the "datadir", at the last LSN. 
current_logical_size: LogicalSize, /// Information about the last processed message by the WAL receiver, /// or None if WAL receiver has not received anything for this timeline /// yet. pub last_received_wal: Mutex>, pub walreceiver: Mutex>, /// Relation size cache pub(crate) rel_size_latest_cache: RwLock>, pub(crate) rel_size_snapshot_cache: Mutex>, download_all_remote_layers_task_info: RwLock>, state: watch::Sender, /// Prevent two tasks from deleting the timeline at the same time. If held, the /// timeline is being deleted. If 'true', the timeline has already been deleted. pub delete_progress: TimelineDeleteProgress, eviction_task_timeline_state: tokio::sync::Mutex, /// Load or creation time information about the disk_consistent_lsn and when the loading /// happened. Used for consumption metrics. pub(crate) loaded_at: (Lsn, SystemTime), /// Gate to prevent shutdown completing while I/O is still happening to this timeline's data pub(crate) gate: Gate, /// Cancellation token scoped to this timeline: anything doing long-running work relating /// to the timeline should drop out when this token fires. pub(crate) cancel: CancellationToken, /// Make sure we only have one running compaction at a time in tests. /// /// Must only be taken in two places: /// - [`Timeline::compact`] (this file) /// - [`delete::delete_local_timeline_directory`] /// /// Timeline deletion will acquire both compaction and gc locks in whatever order. compaction_lock: tokio::sync::Mutex<()>, /// If true, the last compaction failed. compaction_failed: AtomicBool, /// Begin Hadron: If true, the pageserver has likely detected data corruption in the timeline. /// We need to feed this information back to the Safekeeper and postgres for them to take the /// appropriate action. corruption_detected: AtomicBool, /// Notifies the tenant compaction loop that there is pending L0 compaction work. l0_compaction_trigger: Arc, /// Make sure we only have one running gc at a time. 
/// /// Must only be taken in two places: /// - [`Timeline::gc`] (this file) /// - [`delete::delete_local_timeline_directory`] /// /// Timeline deletion will acquire both compaction and gc locks in whatever order. gc_lock: tokio::sync::Mutex<()>, /// Cloned from [`super::TenantShard::pagestream_throttle`] on construction. pub(crate) pagestream_throttle: Arc, /// Size estimator for aux file v2 pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, /// Some test cases directly place keys into the timeline without actually modifying the directory /// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that /// these keys won't get garbage-collected during compaction/GC. This field only modifies the dense /// keyspace return value of `collect_keyspace`. For sparse keyspaces, use AUX keys for testing, and /// in the future, add `extra_test_sparse_keyspace` if necessary. #[cfg(test)] pub(crate) extra_test_dense_keyspace: ArcSwap, pub(crate) l0_flush_global_state: L0FlushGlobalState, pub(crate) handles: handle::PerTimelineState, pub(crate) attach_wal_lag_cooldown: Arc>, /// Cf. [`crate::tenant::CreateTimelineIdempotency`]. pub(crate) create_idempotency: crate::tenant::CreateTimelineIdempotency, /// If Some, collects GetPage metadata for an ongoing PageTrace. pub(crate) page_trace: ArcSwapOption>, pub(super) previous_heatmap: ArcSwapOption, /// May host a background Tokio task which downloads all the layers from the current /// heatmap on demand. heatmap_layers_downloader: Mutex>, pub(crate) rel_size_v2_status: ArcSwap<(Option, Option)>, wait_lsn_log_slow: tokio::sync::Semaphore, /// A channel to send async requests to prepare a basebackup for the basebackup cache. basebackup_cache: Arc, #[expect(dead_code)] feature_resolver: Arc, /// Basebackup will collect the count and store it here. Used for reldirv2 rollout. 
pub(crate) db_rel_count: ArcSwapOption<(usize, usize)>, } pub(crate) enum PreviousHeatmap { Active { heatmap: HeatMapTimeline, read_at: std::time::Instant, // End LSN covered by the heatmap if known end_lsn: Option, }, Obsolete, } pub type TimelineDeleteProgress = Arc>; pub struct WalReceiverInfo { pub wal_source_connconf: PgConnectionConfig, pub last_received_msg_lsn: Lsn, pub last_received_msg_ts: u128, } /// Information about how much history needs to be retained, needed by /// Garbage Collection. #[derive(Default)] pub(crate) struct GcInfo { /// Specific LSNs that are needed. /// /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. pub(crate) retain_lsns: Vec<(Lsn, TimelineId, MaybeOffloaded)>, /// The cutoff coordinates, which are combined by selecting the minimum. pub(crate) cutoffs: GcCutoffs, /// Leases granted to particular LSNs. pub(crate) leases: BTreeMap, /// Whether our branch point is within our ancestor's PITR interval (for cost estimation) pub(crate) within_ancestor_pitr: bool, } impl GcInfo { pub(crate) fn min_cutoff(&self) -> Lsn { self.cutoffs.select_min() } pub(super) fn insert_child( &mut self, child_id: TimelineId, child_lsn: Lsn, is_offloaded: MaybeOffloaded, ) { self.retain_lsns.push((child_lsn, child_id, is_offloaded)); self.retain_lsns.sort_by_key(|i| i.0); } pub(super) fn remove_child_maybe_offloaded( &mut self, child_id: TimelineId, maybe_offloaded: MaybeOffloaded, ) -> bool { // Remove at most one element. Needed for correctness if there is two live `Timeline` objects referencing // the same timeline. Shouldn't but maybe can occur when Arc's live longer than intended. 
let mut removed = false; self.retain_lsns.retain(|i| { if removed { return true; } let remove = i.1 == child_id && i.2 == maybe_offloaded; removed |= remove; !remove }); removed } pub(super) fn remove_child_not_offloaded(&mut self, child_id: TimelineId) -> bool { self.remove_child_maybe_offloaded(child_id, MaybeOffloaded::No) } pub(super) fn remove_child_offloaded(&mut self, child_id: TimelineId) -> bool { self.remove_child_maybe_offloaded(child_id, MaybeOffloaded::Yes) } pub(crate) fn lsn_covered_by_lease(&self, lsn: Lsn) -> bool { self.leases.contains_key(&lsn) } } /// The `GcInfo` component describing which Lsns need to be retained. Functionally, this /// is a single number (the oldest LSN which we must retain), but it internally distinguishes /// between time-based and space-based retention for observability and consumption metrics purposes. #[derive(Clone, Debug, Default)] pub(crate) struct GcCutoffs { /// Calculated from the [`pageserver_api::models::TenantConfig::gc_horizon`], this LSN indicates how much /// history we must keep to retain a specified number of bytes of WAL. pub(crate) space: Lsn, /// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates /// how much history we must keep to enable reading back at least the PITR interval duration. /// /// None indicates that the PITR cutoff has not been computed. A PITR interval of 0 will yield /// Some(last_record_lsn). pub(crate) time: Option, } impl GcCutoffs { fn select_min(&self) -> Lsn { // NB: if we haven't computed the PITR cutoff yet, we can't GC anything. self.space.min(self.time.unwrap_or_default()) } } pub(crate) struct TimelineVisitOutcome { completed_keyspace: KeySpace, image_covered_keyspace: KeySpace, } /// An error happened in a get() operation. 
// NOTE(review): several generic arguments in this section look truncated in this copy of the
// file (e.g. `impl From for ...`, `Box),`, `Option,`). Confirm against version control.
#[derive(thiserror::Error, Debug)]
pub(crate) enum PageReconstructError {
    #[error(transparent)]
    Other(anyhow::Error),
    #[error("Ancestor LSN wait error: {0}")]
    AncestorLsnTimeout(WaitLsnError),
    #[error("timeline shutting down")]
    Cancelled,
    /// An error happened replaying WAL records
    #[error(transparent)]
    WalRedo(anyhow::Error),
    #[error("{0}")]
    MissingKey(Box),
}

impl PageReconstructError {
    /// Returns `true` for `Cancelled`, and for `AncestorLsnTimeout` when the wrapped
    /// wait error itself reports cancellation; `false` for every other variant.
    pub(crate) fn is_cancel(&self) -> bool {
        match self {
            PageReconstructError::Other(_) => false,
            PageReconstructError::AncestorLsnTimeout(e) => e.is_cancel(),
            PageReconstructError::Cancelled => true,
            PageReconstructError::WalRedo(_) => false,
            PageReconstructError::MissingKey(_) => false,
        }
    }

    /// Converts into an `anyhow::Error`: variants that already carry an `anyhow::Error`
    /// are unwrapped, the others are wrapped as-is.
    #[allow(dead_code)] // we use the is_cancel + into_anyhow pattern in quite a few places, this one will follow soon enough
    pub(crate) fn into_anyhow(self) -> anyhow::Error {
        match self {
            PageReconstructError::Other(e) => e,
            PageReconstructError::AncestorLsnTimeout(e) => e.into_anyhow(),
            PageReconstructError::Cancelled => anyhow::Error::new(self),
            PageReconstructError::WalRedo(e) => e,
            PageReconstructError::MissingKey(_) => anyhow::Error::new(self),
        }
    }
}

impl From for PageReconstructError {
    fn from(value: anyhow::Error) -> Self {
        // with walingest.rs many PageReconstructError are wrapped in as anyhow::Error
        match value.downcast::() {
            Ok(pre) => pre,
            Err(other) => PageReconstructError::Other(other),
        }
    }
}

impl From for PageReconstructError {
    fn from(value: utils::bin_ser::DeserializeError) -> Self {
        PageReconstructError::Other(anyhow::Error::new(value).context("deserialization failure"))
    }
}

impl From for PageReconstructError {
    fn from(_: layer_manager::Shutdown) -> Self {
        PageReconstructError::Cancelled
    }
}

impl GetVectoredError {
    /// Test-only helper: `true` if this is the `MissingKey` variant.
    #[cfg(test)]
    pub(crate) fn is_missing_key_error(&self) -> bool {
        matches!(self, Self::MissingKey(_))
    }
}

impl From for GetVectoredError {
    fn from(_: layer_manager::Shutdown) -> Self {
        GetVectoredError::Cancelled
    }
}

/// A layer identifier when used in the [`ReadPath`] structure. This enum is for observability purposes
/// only and not used by the "real read path".
pub enum ReadPathLayerId {
    PersistentLayer(PersistentLayerKey),
    InMemoryLayer(Range),
}

impl std::fmt::Display for ReadPathLayerId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ReadPathLayerId::PersistentLayer(key) => write!(f, "{key}"),
            ReadPathLayerId::InMemoryLayer(range) => {
                write!(f, "in-mem {}..{}", range.start, range.end)
            }
        }
    }
}

/// Record of the layers visited while serving one read, in visit order.
pub struct ReadPath {
    keyspace: KeySpace,
    lsn: Lsn,
    path: Vec<(ReadPathLayerId, KeySpace, Range)>,
}

impl ReadPath {
    /// Creates an empty read path for the given keyspace and LSN.
    pub fn new(keyspace: KeySpace, lsn: Lsn) -> Self {
        Self {
            keyspace,
            lsn,
            path: Vec::new(),
        }
    }

    /// Appends one visited layer, together with clones of the keyspace and LSN range
    /// that were read from it.
    pub fn record_layer_visit(
        &mut self,
        layer_to_read: &ReadableLayer,
        keyspace_to_read: &KeySpace,
        lsn_range: &Range,
    ) {
        let id = match layer_to_read {
            ReadableLayer::PersistentLayer(layer) => {
                ReadPathLayerId::PersistentLayer(layer.layer_desc().key())
            }
            ReadableLayer::InMemoryLayer(layer) => {
                ReadPathLayerId::InMemoryLayer(layer.get_lsn_range())
            }
        };
        self.path
            .push((id, keyspace_to_read.clone(), lsn_range.clone()));
    }
}

/// Prints a header line followed by one numbered line per recorded layer visit.
impl std::fmt::Display for ReadPath {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "Read path for {} at lsn {}:", self.keyspace, self.lsn)?;
        for (idx, (layer_id, keyspace, lsn_range)) in self.path.iter().enumerate() {
            writeln!(
                f,
                "{}: {} {}..{} {}",
                idx, layer_id, lsn_range.start, lsn_range.end, keyspace
            )?;
        }
        Ok(())
    }
}

#[derive(thiserror::Error)]
pub struct MissingKeyError {
    keyspace: KeySpace,
    shard: ShardNumber,
    query: Option,
    // This is largest request LSN from the get page request batch
    original_hwm_lsn: Lsn,
    ancestor_lsn: Option,
    /// Debug information about the read path if there's an error
    read_path: Option,
    backtrace: Option,
}

impl MissingKeyError {
    /// Attaches the query that produced this error, replacing any previous one.
    fn enrich(&mut self, query: VersionedKeySpaceQuery) {
        self.query = Some(query);
    }
}

/// `Debug` delegates to the `Display` implementation.
impl std::fmt::Debug for MissingKeyError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{self}")
    }
}

/// Prints the key, shard and original HWM LSN, then each optional detail that is present.
impl std::fmt::Display for MissingKeyError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "could not find data for key {} (shard {:?}), original HWM LSN {}",
            self.keyspace, self.shard, self.original_hwm_lsn
        )?;
        if let Some(ref ancestor_lsn) = self.ancestor_lsn {
            write!(f, ", ancestor {ancestor_lsn}")?;
        }
        if let Some(ref query) = self.query {
            write!(f, ", query {query}")?;
        }
        if let Some(ref read_path) = self.read_path {
            write!(f, "\n{read_path}")?;
        }
        if let Some(ref backtrace) = self.backtrace {
            write!(f, "\n{backtrace}")?;
        }
        Ok(())
    }
}

#[derive(thiserror::Error, Debug)]
pub(crate) enum CreateImageLayersError {
    #[error("timeline shutting down")]
    Cancelled,
    #[error("read failed")]
    GetVectoredError(#[source] GetVectoredError),
    #[error("reconstruction failed")]
    PageReconstructError(#[source] PageReconstructError),
    #[error(transparent)]
    Other(anyhow::Error),
}

impl From for CreateImageLayersError {
    fn from(_: layer_manager::Shutdown) -> Self {
        CreateImageLayersError::Cancelled
    }
}

#[derive(thiserror::Error, Debug, Clone)]
pub(crate) enum FlushLayerError {
    /// Timeline cancellation token was cancelled
    #[error("timeline shutting down")]
    Cancelled,
    /// We tried to flush a layer while the Timeline is in an unexpected state
    #[error("cannot flush frozen layers when flush_loop is not running, state is {0:?}")]
    NotRunning(FlushLoopState),
    // Arc<> the following non-clonable error types: we must be Clone-able because the flush error is propagated from the flush
    // loop via a watch channel, where we can only borrow it.
    #[error("create image layers (shared)")]
    CreateImageLayersError(Arc),
    #[error("other (shared)")]
    Other(#[from] Arc),
}

impl FlushLayerError {
    // When crossing from generic anyhow errors to this error type, we explicitly check
    // for timeline cancellation to avoid logging inoffensive shutdown errors as warn/err.
    fn from_anyhow(timeline: &Timeline, err: anyhow::Error) -> Self {
        let cancelled = timeline.cancel.is_cancelled()
            // The upload queue might have been shut down before the official cancellation of the timeline.
            || err
                .downcast_ref::()
                .map(NotInitialized::is_stopping)
                .unwrap_or_default();
        if cancelled {
            Self::Cancelled
        } else {
            Self::Other(Arc::new(err))
        }
    }
}

impl From for FlushLayerError {
    fn from(_: layer_manager::Shutdown) -> Self {
        FlushLayerError::Cancelled
    }
}

#[derive(thiserror::Error, Debug)]
pub enum GetVectoredError {
    #[error("timeline shutting down")]
    Cancelled,
    #[error("requested too many keys: {0} > {1}")]
    Oversized(u64, u64),
    #[error("requested at invalid LSN: {0}")]
    InvalidLsn(Lsn),
    #[error("requested key not found: {0}")]
    MissingKey(Box),
    #[error("ancestry walk")]
    GetReadyAncestorError(#[source] GetReadyAncestorError),
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

impl From for GetVectoredError {
    fn from(value: GetReadyAncestorError) -> Self {
        use GetReadyAncestorError::*;
        match value {
            // Cancellation is flattened; the other variants are wrapped unchanged.
            Cancelled => GetVectoredError::Cancelled,
            AncestorLsnTimeout(_) | BadState { .. } => {
                GetVectoredError::GetReadyAncestorError(value)
            }
        }
    }
}

#[derive(thiserror::Error, Debug)]
pub enum GetReadyAncestorError {
    #[error("ancestor LSN wait error")]
    AncestorLsnTimeout(#[from] WaitLsnError),
    #[error("bad state on timeline {timeline_id}: {state:?}")]
    BadState {
        timeline_id: TimelineId,
        state: TimelineState,
    },
    #[error("cancelled")]
    Cancelled,
}

#[derive(Clone, Copy)]
pub enum LogicalSizeCalculationCause {
    Initial,
    ConsumptionMetricsSyntheticSize,
    EvictionTaskImitation,
    TenantSizeHandler,
}

pub enum GetLogicalSizePriority {
    User,
    Background,
}

#[derive(Debug, enumset::EnumSetType)]
pub(crate) enum CompactFlags {
    ForceRepartition,
    ForceImageLayerCreation,
    ForceL0Compaction,
    OnlyL0Compaction,
    EnhancedGcBottomMostCompaction,
    DryRun,
    /// Makes image compaction yield if there's pending L0 compaction. This should always be used in
    /// the background compaction task, since we want to aggressively compact down L0 to bound
    /// read amplification.
    ///
    /// It only makes sense to use this when `compaction_l0_first` is enabled (such that we yield to
    /// an L0 compaction pass), and without `OnlyL0Compaction` (L0 compaction shouldn't yield for L0
    /// compaction).
    YieldForL0,
}

#[serde_with::serde_as]
#[derive(Debug, Clone, serde::Deserialize)]
pub(crate) struct CompactRequest {
    pub compact_key_range: Option,
    pub compact_lsn_range: Option,
    /// Whether the compaction job should be scheduled.
    #[serde(default)]
    pub scheduled: bool,
    /// Whether the compaction job should be split across key ranges.
    #[serde(default)]
    pub sub_compaction: bool,
    /// Max job size for each subcompaction job.
    pub sub_compaction_max_job_size_mb: Option,
}

#[derive(Debug, Clone, serde::Deserialize)]
pub(crate) struct MarkInvisibleRequest {
    #[serde(default)]
    pub is_visible: Option,
}

#[derive(Debug, Clone, Default)]
pub(crate) struct CompactOptions {
    pub flags: EnumSet,
    /// If set, the compaction will only compact the key range specified by this option.
    /// This option is only used by GC compaction. For the full explanation, see [`compaction::GcCompactJob`].
    pub compact_key_range: Option,
    /// If set, the compaction will only compact the LSN within this value.
    /// This option is only used by GC compaction. For the full explanation, see [`compaction::GcCompactJob`].
    pub compact_lsn_range: Option,
    /// Enable sub-compaction (split compaction job across key ranges).
    /// This option is only used by GC compaction.
    pub sub_compaction: bool,
    /// Set job size for the GC compaction.
    /// This option is only used by GC compaction.
    pub sub_compaction_max_job_size_mb: Option,
    /// Only for GC compaction.
    /// If set, the compaction will compact the metadata layers. Should be only set to true in unit tests
    /// because metadata compaction is not fully supported yet.
    pub gc_compaction_do_metadata_compaction: bool,
}

impl CompactOptions {
    /// Test-only defaults: everything default except metadata compaction, which is enabled.
    #[cfg(test)]
    pub fn default_for_gc_compaction_unit_tests() -> Self {
        Self {
            gc_compaction_do_metadata_compaction: true,
            ..Default::default()
        }
    }
}

/// Compact `Debug` output: only the timeline id is printed.
impl std::fmt::Debug for Timeline {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "Timeline<{}>", self.timeline_id)
    }
}

#[derive(thiserror::Error, Debug, Clone)]
pub enum WaitLsnError {
    // Called on a timeline which is shutting down
    #[error("Shutdown")]
    Shutdown,
    // Called on a timeline not in active state or shutting down
    #[error("Bad timeline state: {0:?}")]
    BadState(TimelineState),
    // Timeout expired while waiting for LSN to catch up with goal.
    #[error("{0}")]
    Timeout(String),
}

impl WaitLsnError {
    /// Returns `true` for `Shutdown` and for `BadState(TimelineState::Stopping)`;
    /// `false` for every other case.
    pub(crate) fn is_cancel(&self) -> bool {
        match self {
            WaitLsnError::Shutdown => true,
            WaitLsnError::BadState(timeline_state) => match timeline_state {
                TimelineState::Loading => false,
                TimelineState::Active => false,
                TimelineState::Stopping => true,
                TimelineState::Broken { .. } => false,
            },
            WaitLsnError::Timeout(_) => false,
        }
    }

    /// Wraps the error in an `anyhow::Error`. Every variant is handled the same way;
    /// the match is spelled out per variant.
    pub(crate) fn into_anyhow(self) -> anyhow::Error {
        match self {
            WaitLsnError::Shutdown => anyhow::Error::new(self),
            WaitLsnError::BadState(_) => anyhow::Error::new(self),
            WaitLsnError::Timeout(_) => anyhow::Error::new(self),
        }
    }
}

impl From for tonic::Status {
    fn from(err: WaitLsnError) -> Self {
        use tonic::Code;
        // Cancellation-type errors map to `Unavailable`, everything else to `Internal`.
        let code = if err.is_cancel() {
            Code::Unavailable
        } else {
            Code::Internal
        };
        tonic::Status::new(code, err.to_string())
    }
}

// The impls below achieve cancellation mapping for errors.
// Perhaps there's a way of achieving this with less cruft.
impl From for CompactionError { fn from(e: CreateImageLayersError) -> Self { match e { CreateImageLayersError::Cancelled => CompactionError::new_cancelled(), CreateImageLayersError::Other(e) => { CompactionError::Other(e.context("create image layers")) } _ => CompactionError::Other(e.into()), } } } impl From for FlushLayerError { fn from(e: CreateImageLayersError) -> Self { match e { CreateImageLayersError::Cancelled => FlushLayerError::Cancelled, any => FlushLayerError::CreateImageLayersError(Arc::new(any)), } } } impl From for CreateImageLayersError { fn from(e: PageReconstructError) -> Self { match e { PageReconstructError::Cancelled => CreateImageLayersError::Cancelled, _ => CreateImageLayersError::PageReconstructError(e), } } } impl From for CreateImageLayersError { fn from(e: super::storage_layer::errors::PutError) -> Self { if e.is_cancel() { CreateImageLayersError::Cancelled } else { CreateImageLayersError::Other(e.into_anyhow()) } } } impl From for CreateImageLayersError { fn from(e: GetVectoredError) -> Self { match e { GetVectoredError::Cancelled => CreateImageLayersError::Cancelled, _ => CreateImageLayersError::GetVectoredError(e), } } } impl From for PageReconstructError { fn from(e: GetVectoredError) -> Self { match e { GetVectoredError::Cancelled => PageReconstructError::Cancelled, GetVectoredError::InvalidLsn(_) => PageReconstructError::Other(anyhow!("Invalid LSN")), err @ GetVectoredError::Oversized(_, _) => PageReconstructError::Other(err.into()), GetVectoredError::MissingKey(err) => PageReconstructError::MissingKey(err), GetVectoredError::GetReadyAncestorError(err) => PageReconstructError::from(err), GetVectoredError::Other(err) => PageReconstructError::Other(err), } } } impl From for PageReconstructError { fn from(e: GetReadyAncestorError) -> Self { use GetReadyAncestorError::*; match e { AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err), bad_state @ BadState { .. 
} => PageReconstructError::Other(anyhow::anyhow!(bad_state)), Cancelled => PageReconstructError::Cancelled, } } } pub(crate) enum WaitLsnTimeout { Custom(Duration), // Use the [`PageServerConf::wait_lsn_timeout`] default Default, } pub(crate) enum WaitLsnWaiter<'a> { Timeline(&'a Timeline), Tenant, PageService, HttpEndpoint, BaseBackupCache, } /// Argument to [`Timeline::shutdown`]. #[derive(Debug, Clone, Copy)] pub(crate) enum ShutdownMode { /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk. This method can /// take multiple seconds for a busy timeline. /// /// While we are flushing, we continue to accept read I/O for LSNs ingested before /// the call to [`Timeline::shutdown`]. FreezeAndFlush, /// Only flush the layers to the remote storage without freezing any open layers. Flush the deletion /// queue. This is the mode used by ancestor detach and any other operations that reloads a tenant /// but not increasing the generation number. Note that this mode cannot be used at tenant shutdown, /// as flushing the deletion queue at that time will cause shutdown-in-progress errors. Reload, /// Shut down immediately, without waiting for any open layers to flush. Hard, } #[allow(clippy::large_enum_variant, reason = "TODO")] enum ImageLayerCreationOutcome { /// We generated an image layer Generated { unfinished_image_layer: ImageLayerWriter, }, /// The key range is empty Empty, /// (Only used in metadata image layer creation), after reading the metadata keys, we decide to skip /// the image layer creation. 
Skip, } enum RepartitionError { Other(anyhow::Error), CollectKeyspace(CollectKeySpaceError), } impl RepartitionError { fn is_cancel(&self) -> bool { match self { RepartitionError::Other(_) => false, RepartitionError::CollectKeyspace(e) => e.is_cancel(), } } fn into_anyhow(self) -> anyhow::Error { match self { RepartitionError::Other(e) => e, RepartitionError::CollectKeyspace(e) => e.into_anyhow(), } } } /// Public interface functions impl Timeline { /// Get the LSN where this branch was created pub(crate) fn get_ancestor_lsn(&self) -> Lsn { self.ancestor_lsn } /// Get the ancestor's timeline id pub(crate) fn get_ancestor_timeline_id(&self) -> Option { self.ancestor_timeline .as_ref() .map(|ancestor| ancestor.timeline_id) } /// Get the ancestor timeline pub(crate) fn ancestor_timeline(&self) -> Option<&Arc> { self.ancestor_timeline.as_ref() } /// Get the bytes written since the PITR cutoff on this branch, and /// whether this branch's ancestor_lsn is within its parent's PITR. pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) { // TODO: for backwards compatibility, we return the full history back to 0 when the PITR // cutoff has not yet been initialized. This should return None instead, but this is exposed // in external HTTP APIs and callers may not handle a null value. let gc_info = self.gc_info.read().unwrap(); let history = self .get_last_record_lsn() .checked_sub(gc_info.cutoffs.time.unwrap_or_default()) .unwrap_or_default() .0; (history, gc_info.within_ancestor_pitr) } /// Read timeline's GC cutoff: this is the LSN at which GC has started to happen pub(crate) fn get_applied_gc_cutoff_lsn(&self) -> RcuReadGuard { self.applied_gc_cutoff_lsn.read() } /// Read timeline's planned GC cutoff: this is the logical end of history that users are allowed /// to read (based on configured PITR), even if physically we have more history. Returns None /// if the PITR cutoff has not yet been initialized. 
pub(crate) fn get_gc_cutoff_lsn(&self) -> Option { self.gc_info.read().unwrap().cutoffs.time } /// Look up given page version. /// /// If a remote layer file is needed, it is downloaded as part of this /// call. /// /// This method enforces [`Self::pagestream_throttle`] internally. /// /// NOTE: It is considered an error to 'get' a key that doesn't exist. The /// abstraction above this needs to store suitable metadata to track what /// data exists with what keys, in separate metadata entries. If a /// non-existent key is requested, we may incorrectly return a value from /// an ancestor branch, for example, or waste a lot of cycles chasing the /// non-existing key. /// /// # Cancel-Safety /// /// This method is cancellation-safe. #[inline(always)] pub(crate) async fn get( &self, key: Key, lsn: Lsn, ctx: &RequestContext, ) -> Result { if !lsn.is_valid() { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } // This check is debug-only because of the cost of hashing, and because it's a double-check: we // already checked the key against the shard_identity when looking up the Timeline from // page_service. 
debug_assert!(!self.shard_identity.is_key_disposable(&key)); let mut reconstruct_state = ValuesReconstructState::new(IoConcurrency::sequential()); let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn); let vectored_res = self .get_vectored_impl(query, &mut reconstruct_state, ctx) .await; let key_value = vectored_res?.pop_first(); match key_value { Some((got_key, value)) => { if got_key != key { error!( "Expected {}, but singular vectored get returned {}", key, got_key ); Err(PageReconstructError::Other(anyhow!( "Singular vectored get returned wrong key" ))) } else { value } } None => Err(PageReconstructError::MissingKey(Box::new( MissingKeyError { keyspace: KeySpace::single(key..key.next()), shard: self.shard_identity.get_shard_number(&key), original_hwm_lsn: lsn, ancestor_lsn: None, backtrace: None, read_path: None, query: None, }, ))), } } #[inline(always)] pub(crate) async fn debug_get( &self, key: Key, lsn: Lsn, ctx: &RequestContext, reconstruct_state: &mut ValuesReconstructState, ) -> Result { if !lsn.is_valid() { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } // This check is debug-only because of the cost of hashing, and because it's a double-check: we // already checked the key against the shard_identity when looking up the Timeline from // page_service. 
debug_assert!(!self.shard_identity.is_key_disposable(&key)); let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn); let vectored_res = self .debug_get_vectored_impl(query, reconstruct_state, ctx) .await; let key_value = vectored_res?.pop_first(); match key_value { Some((got_key, value)) => { if got_key != key { error!( "Expected {}, but singular vectored get returned {}", key, got_key ); Err(PageReconstructError::Other(anyhow!( "Singular vectored get returned wrong key" ))) } else { value } } None => Err(PageReconstructError::MissingKey(Box::new( MissingKeyError { keyspace: KeySpace::single(key..key.next()), shard: self.shard_identity.get_shard_number(&key), original_hwm_lsn: lsn, ancestor_lsn: None, backtrace: None, read_path: None, query: None, }, ))), } } pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100; /// Look up multiple page versions at a given LSN /// /// This naive implementation will be replaced with a more efficient one /// which actually vectorizes the read path. /// /// NB: the read path must be cancellation-safe. The Tonic gRPC service will drop the future /// if the client goes away (e.g. due to timeout or cancellation). 
pub(crate) async fn get_vectored(
    &self,
    query: VersionedKeySpaceQuery,
    io_concurrency: super::storage_layer::IoConcurrency,
    ctx: &RequestContext,
) -> Result>, GetVectoredError> {
    let requested_keyspace = query.total_keyspace();

    // Refuse requests that exceed the configured per-call key limit.
    let requested_keys = requested_keyspace.total_raw_size();
    let key_limit = self.conf.max_get_vectored_keys.get();
    if requested_keys > key_limit {
        return Err(GetVectoredError::Oversized(
            requested_keys as u64,
            key_limit as u64,
        ));
    }

    // Every requested key must pass the shard-identity check; a violation panics.
    for range in &requested_keyspace.ranges {
        let mut cursor = range.start;
        while cursor != range.end {
            assert!(!self.shard_identity.is_key_disposable(&cursor));
            cursor = cursor.next();
        }
    }

    trace!(
        "get vectored query {} from task kind {:?}",
        query,
        ctx.task_kind(),
    );

    // Latency is only recorded when a metric exists for this task kind.
    let latency_timer = crate::metrics::GET_VECTORED_LATENCY
        .for_task_kind(ctx.task_kind())
        .map(|histogram| (histogram, Instant::now()));

    // The reconstruct state is deliberately a temporary so it is dropped at the end of this
    // statement, before the latency is observed.
    let outcome = self
        .get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx)
        .await;

    if let Some((histogram, started_at)) = latency_timer {
        histogram.observe(started_at.elapsed().as_secs_f64());
    }

    outcome
}

/// Scan the keyspace and return all existing key-values in the keyspace. This currently uses vectored
/// get underlying. Normal vectored get would throw an error when a key in the keyspace is not found
/// during the search, but for the scan interface, it returns all existing key-value pairs, and does
/// not expect each single key in the key space will be found. The semantics is closer to the RocksDB
/// scan iterator interface. We could optimize this interface later to avoid some checks in the vectored
/// get path to maintain and split the probing and to-be-probe keyspace. We also need to ensure that
/// the scan operation will not cause OOM in the future.
pub(crate) async fn scan( &self, keyspace: KeySpace, lsn: Lsn, ctx: &RequestContext, io_concurrency: super::storage_layer::IoConcurrency, ) -> Result>, GetVectoredError> { if !lsn.is_valid() { return Err(GetVectoredError::InvalidLsn(lsn)); } trace!( "key-value scan request for {:?}@{} from task kind {:?}", keyspace, lsn, ctx.task_kind() ); // We should generalize this into Keyspace::contains in the future. for range in &keyspace.ranges { if range.start.field1 < METADATA_KEY_BEGIN_PREFIX || range.end.field1 > METADATA_KEY_END_PREFIX { return Err(GetVectoredError::Other(anyhow::anyhow!( "only metadata keyspace can be scanned" ))); } } let start = crate::metrics::SCAN_LATENCY .for_task_kind(ctx.task_kind()) .map(ScanLatencyOngoingRecording::start_recording); let query = VersionedKeySpaceQuery::uniform(keyspace, lsn); let vectored_res = self .get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx) .await; if let Some(recording) = start { recording.observe(); } vectored_res } pub(super) async fn get_vectored_impl( &self, query: VersionedKeySpaceQuery, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { if query.is_empty() { return Ok(BTreeMap::default()); } let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() { Some(ReadPath::new( query.total_keyspace(), query.high_watermark_lsn()?, )) } else { None }; reconstruct_state.read_path = read_path; let redo_attempt_type = if ctx.task_kind() == TaskKind::Compaction { RedoAttemptType::LegacyCompaction } else { RedoAttemptType::ReadPage }; let traversal_res: Result<(), _> = { let ctx = RequestContextBuilder::from(ctx) .perf_span(|crnt_perf_span| { info_span!( target: PERF_TRACE_TARGET, parent: crnt_perf_span, "PLAN_IO", ) }) .attached_child(); self.get_vectored_reconstruct_data(query.clone(), reconstruct_state, &ctx) .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await }; if let Err(err) = traversal_res 
{ // Wait for all the spawned IOs to complete. // See comments on `spawn_io` inside `storage_layer` for more details. let mut collect_futs = std::mem::take(&mut reconstruct_state.keys) .into_values() .map(|state| state.collect_pending_ios()) .collect::>(); while collect_futs.next().await.is_some() {} // Enrich the missing key error with the original query. if let GetVectoredError::MissingKey(mut missing_err) = err { missing_err.enrich(query.clone()); return Err(GetVectoredError::MissingKey(missing_err)); } return Err(err); }; let layers_visited = reconstruct_state.get_layers_visited(); let ctx = RequestContextBuilder::from(ctx) .perf_span(|crnt_perf_span| { info_span!( target: PERF_TRACE_TARGET, parent: crnt_perf_span, "RECONSTRUCT", ) }) .attached_child(); let futs = FuturesUnordered::new(); for (key, state) in std::mem::take(&mut reconstruct_state.keys) { let req_lsn_for_key = query.map_key_to_lsn(&key); futs.push({ let walredo_self = self.myself.upgrade().expect("&self method holds the arc"); let ctx = RequestContextBuilder::from(&ctx) .perf_span(|crnt_perf_span| { info_span!( target: PERF_TRACE_TARGET, parent: crnt_perf_span, "RECONSTRUCT_KEY", key = %key, ) }) .attached_child(); async move { assert_eq!(state.situation, ValueReconstructSituation::Complete); let res = state .collect_pending_ios() .maybe_perf_instrument(&ctx, |crnt_perf_span| { info_span!( target: PERF_TRACE_TARGET, parent: crnt_perf_span, "WAIT_FOR_IO_COMPLETIONS", ) }) .await; let converted = match res { Ok(ok) => ok, Err(err) => { return (key, Err(err)); } }; DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64); // The walredo module expects the records to be descending in terms of Lsn. // And we submit the IOs in that order, so, there shuold be no need to sort here. 
debug_assert!( converted .records .is_sorted_by_key(|(lsn, _)| std::cmp::Reverse(*lsn)), "{converted:?}" ); let walredo_deltas = converted.num_deltas(); let walredo_res = walredo_self .reconstruct_value(key, req_lsn_for_key, converted, redo_attempt_type) .maybe_perf_instrument(&ctx, |crnt_perf_span| { info_span!( target: PERF_TRACE_TARGET, parent: crnt_perf_span, "WALREDO", deltas = %walredo_deltas, ) }) .await; (key, walredo_res) } }); } let results = futs .collect::>>() .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await; // For aux file keys (v1 or v2) the vectored read path does not return an error // when they're missing. Instead they are omitted from the resulting btree // (this is a requirement, not a bug). Skip updating the metric in these cases // to avoid infinite results. if !results.is_empty() { if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD { let total_keyspace = query.total_keyspace(); let max_request_lsn = query.high_watermark_lsn().expect("Validated previously"); static LOG_PACER: Lazy> = Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); LOG_PACER.lock().unwrap().call(|| { let num_keys = total_keyspace.total_raw_size(); let num_pages = results.len(); tracing::info!( shard_id = %self.tenant_shard_id.shard_slug(), lsn = %max_request_lsn, "Vectored read for {total_keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.", ); }); } // Records the number of layers visited in a few different ways: // // * LAYERS_PER_READ: all layers count towards every read in the batch, because each // layer directly affects its observed latency. // // * LAYERS_PER_READ_BATCH: all layers count towards each batch, to get the per-batch // layer visits and access cost. // // * LAYERS_PER_READ_AMORTIZED: the average layer count per read, to get the amortized // read amplification after batching. 
let layers_visited = layers_visited as f64; let avg_layers_visited = layers_visited / results.len() as f64; LAYERS_PER_READ_BATCH_GLOBAL.observe(layers_visited); for _ in &results { self.metrics.layers_per_read.observe(layers_visited); LAYERS_PER_READ_GLOBAL.observe(layers_visited); LAYERS_PER_READ_AMORTIZED_GLOBAL.observe(avg_layers_visited); } } Ok(results) } // A copy of the get_vectored_impl method except that we store the image and wal records into `reconstruct_state`. // This is only used in the http getpage call for debugging purpose. pub(super) async fn debug_get_vectored_impl( &self, query: VersionedKeySpaceQuery, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { if query.is_empty() { return Ok(BTreeMap::default()); } let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() { Some(ReadPath::new( query.total_keyspace(), query.high_watermark_lsn()?, )) } else { None }; reconstruct_state.read_path = read_path; let traversal_res: Result<(), _> = self .get_vectored_reconstruct_data(query.clone(), reconstruct_state, ctx) .await; if let Err(err) = traversal_res { // Wait for all the spawned IOs to complete. // See comments on `spawn_io` inside `storage_layer` for more details. 
let mut collect_futs = std::mem::take(&mut reconstruct_state.keys) .into_values() .map(|state| state.collect_pending_ios()) .collect::>(); while collect_futs.next().await.is_some() {} return Err(err); }; let reconstruct_state = Arc::new(Mutex::new(reconstruct_state)); let futs = FuturesUnordered::new(); for (key, state) in std::mem::take(&mut reconstruct_state.lock().unwrap().keys) { let req_lsn_for_key = query.map_key_to_lsn(&key); futs.push({ let walredo_self = self.myself.upgrade().expect("&self method holds the arc"); let rc_clone = Arc::clone(&reconstruct_state); async move { assert_eq!(state.situation, ValueReconstructSituation::Complete); let converted = match state.collect_pending_ios().await { Ok(ok) => ok, Err(err) => { return (key, Err(err)); } }; DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64); // The walredo module expects the records to be descending in terms of Lsn. // And we submit the IOs in that order, so, there shuold be no need to sort here. debug_assert!( converted .records .is_sorted_by_key(|(lsn, _)| std::cmp::Reverse(*lsn)), "{converted:?}" ); { let mut guard = rc_clone.lock().unwrap(); guard.set_debug_state(&converted); } ( key, walredo_self .reconstruct_value( key, req_lsn_for_key, converted, RedoAttemptType::ReadPage, ) .await, ) } }); } let results = futs .collect::>>() .await; Ok(results) } /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. pub(crate) fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last } pub(crate) fn get_prev_record_lsn(&self) -> Lsn { self.last_record_lsn.load().prev } /// Atomically get both last and prev. pub(crate) fn get_last_record_rlsn(&self) -> RecordLsn { self.last_record_lsn.load() } /// Subscribe to callers of wait_lsn(). The value of the channel is None if there are no /// wait_lsn() calls in progress, and Some(Lsn) if there is an active waiter for wait_lsn(). 
pub(crate) fn subscribe_for_wait_lsn_updates(&self) -> watch::Receiver> { self.last_record_lsn.status_receiver() } pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn.load() } /// remote_consistent_lsn from the perspective of the tenant's current generation, /// not validated with control plane yet. /// See [`Self::get_remote_consistent_lsn_visible`]. pub(crate) fn get_remote_consistent_lsn_projected(&self) -> Option { self.remote_client.remote_consistent_lsn_projected() } /// remote_consistent_lsn which the tenant is guaranteed not to go backward from, /// i.e. a value of remote_consistent_lsn_projected which has undergone /// generation validation in the deletion queue. pub(crate) fn get_remote_consistent_lsn_visible(&self) -> Option { self.remote_client.remote_consistent_lsn_visible() } /// The sum of the file size of all historic layers in the layer map. /// This method makes no distinction between local and remote layers. /// Hence, the result **does not represent local filesystem usage**. pub(crate) async fn layer_size_sum(&self) -> u64 { let guard = self .layers .read(LayerManagerLockHolder::GetLayerMapInfo) .await; guard.layer_size_sum() } pub(crate) fn resident_physical_size(&self) -> u64 { self.metrics.resident_physical_size_get() } pub(crate) fn get_directory_metrics(&self) -> [u64; DirectoryKind::KINDS_NUM] { array::from_fn(|idx| self.directory_metrics[idx].load(AtomicOrdering::Relaxed)) } /// /// Wait until WAL has been received and processed up to this LSN. /// /// You should call this before any of the other get_* or list_* functions. Calling /// those functions with an LSN that has been processed yet is an error. 
/// pub(crate) async fn wait_lsn( &self, lsn: Lsn, who_is_waiting: WaitLsnWaiter<'_>, timeout: WaitLsnTimeout, ctx: &RequestContext, /* Prepare for use by cancellation */ ) -> Result<(), WaitLsnError> { let state = self.current_state(); if self.cancel.is_cancelled() || matches!(state, TimelineState::Stopping) { return Err(WaitLsnError::Shutdown); } else if !matches!(state, TimelineState::Active) { return Err(WaitLsnError::BadState(state)); } if cfg!(debug_assertions) { match ctx.task_kind() { TaskKind::WalReceiverManager | TaskKind::WalReceiverConnectionHandler | TaskKind::WalReceiverConnectionPoller => { let is_myself = match who_is_waiting { WaitLsnWaiter::Timeline(waiter) => { Weak::ptr_eq(&waiter.myself, &self.myself) } WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService | WaitLsnWaiter::HttpEndpoint | WaitLsnWaiter::BaseBackupCache => unreachable!( "tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind() ), }; if is_myself { if let Err(current) = self.last_record_lsn.would_wait_for(lsn) { // walingest is the only one that can advance last_record_lsn; it should make sure to never reach here panic!( "this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock" ); } } else { // if another timeline's is waiting for us, there's no deadlock risk because // our walreceiver task can make progress independent of theirs } } _ => {} } } let timeout = match timeout { WaitLsnTimeout::Custom(t) => t, WaitLsnTimeout::Default => self.conf.wait_lsn_timeout, }; let timer = crate::metrics::WAIT_LSN_TIME.start_timer(); let start_finish_counterpair_guard = self.metrics.wait_lsn_start_finish_counterpair.guard(); let wait_for_timeout = self.last_record_lsn.wait_for_timeout(lsn, timeout); let wait_for_timeout = std::pin::pin!(wait_for_timeout); // Use threshold of 1 because even 1 second of wait for ingest is very much abnormal. 
let log_slow_threshold = Duration::from_secs(1); // Use period of 10 to avoid flooding logs during an outage that affects all timelines. let log_slow_period = Duration::from_secs(10); let mut logging_permit = None; let wait_for_timeout = monitor_slow_future( log_slow_threshold, log_slow_period, wait_for_timeout, |MonitorSlowFutureCallback { ready, is_slow, elapsed_total, elapsed_since_last_callback, }| { self.metrics .wait_lsn_in_progress_micros .inc_by(u64::try_from(elapsed_since_last_callback.as_micros()).unwrap()); if !is_slow { return; } // It's slow, see if we should log it. // (We limit the logging to one per invocation per timeline to avoid excessive // logging during an extended broker / networking outage that affects all timelines.) if logging_permit.is_none() { logging_permit = self.wait_lsn_log_slow.try_acquire().ok(); } if logging_permit.is_none() { return; } // We log it. if ready { info!( "slow wait_lsn completed after {:.3}s", elapsed_total.as_secs_f64() ); } else { info!( "slow wait_lsn still running for {:.3}s", elapsed_total.as_secs_f64() ); } }, ); let res = wait_for_timeout.await; // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo drop(logging_permit); drop(start_finish_counterpair_guard); drop(timer); match res { Ok(()) => Ok(()), Err(e) => { use utils::seqwait::SeqWaitError::*; match e { Shutdown => Err(WaitLsnError::Shutdown), Timeout => { let walreceiver_status = self.walreceiver_status(); Err(WaitLsnError::Timeout(format!( "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn(), walreceiver_status, ))) } } } } } pub(crate) fn walreceiver_status(&self) -> String { match &*self.walreceiver.lock().unwrap() { None => "stopping or stopped".to_string(), Some(walreceiver) => match walreceiver.status() { Some(status) => 
status.to_human_readable_string(), None => "Not active".to_string(), }, } } /// Check that it is valid to request operations with that lsn. pub(crate) fn check_lsn_is_in_scope( &self, lsn: Lsn, latest_gc_cutoff_lsn: &RcuReadGuard, ) -> anyhow::Result<()> { ensure!( lsn >= **latest_gc_cutoff_lsn, "LSN {} is earlier than latest GC cutoff {} (we might've already garbage collected needed data)", lsn, **latest_gc_cutoff_lsn, ); Ok(()) } /// Initializes an LSN lease. The function will return an error if the requested LSN is less than the `latest_gc_cutoff_lsn`. pub(crate) fn init_lsn_lease( &self, lsn: Lsn, length: Duration, ctx: &RequestContext, ) -> anyhow::Result { self.make_lsn_lease(lsn, length, true, ctx) } /// Renews a lease at a particular LSN. The requested LSN is not validated against the `latest_gc_cutoff_lsn` when we are in the grace period. pub(crate) fn renew_lsn_lease( &self, lsn: Lsn, length: Duration, ctx: &RequestContext, ) -> anyhow::Result { self.make_lsn_lease(lsn, length, false, ctx) } /// Obtains a temporary lease blocking garbage collection for the given LSN. /// /// If we are in `AttachedSingle` mode and is not blocked by the lsn lease deadline, this function will error /// if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is no existing request present. /// /// If there is an existing lease in the map, the lease will be renewed only if the request extends the lease. /// The returned lease is therefore the maximum between the existing lease and the requesting lease. fn make_lsn_lease( &self, lsn: Lsn, length: Duration, init: bool, _ctx: &RequestContext, ) -> anyhow::Result { let lease = { // Normalize the requested LSN to be aligned, and move to the first record // if it points to the beginning of the page (header). 
let lsn = xlog_utils::normalize_lsn(lsn, WAL_SEGMENT_SIZE); let mut gc_info = self.gc_info.write().unwrap(); let planned_cutoff = gc_info.min_cutoff(); let valid_until = SystemTime::now() + length; let entry = gc_info.leases.entry(lsn); match entry { Entry::Occupied(mut occupied) => { let existing_lease = occupied.get_mut(); if valid_until > existing_lease.valid_until { existing_lease.valid_until = valid_until; let dt: DateTime = valid_until.into(); info!("lease extended to {}", dt); } else { let dt: DateTime = existing_lease.valid_until.into(); info!("existing lease covers greater length, valid until {}", dt); } existing_lease.clone() } Entry::Vacant(vacant) => { // Never allow a lease to be requested for an LSN below the applied GC cutoff. The data could have been deleted. let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn(); if lsn < *latest_gc_cutoff_lsn { bail!( "tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn ); } // We allow create lease for those below the planned gc cutoff if we are still within the grace period // of GC blocking. let validate = { let conf = self.tenant_conf.load(); !conf.is_gc_blocked_by_lsn_lease_deadline() }; // Do not allow initial lease creation to be below the planned gc cutoff. The client (compute_ctl) determines // whether it is a initial lease creation or a renewal. if (init || validate) && lsn < planned_cutoff { bail!( "tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", lsn, planned_cutoff ); } let dt: DateTime = valid_until.into(); info!("lease created, valid until {}", dt); vacant.insert(LsnLease { valid_until }).clone() } } }; Ok(lease) } /// Freeze the current open in-memory layer. It will be written to disk on next iteration. /// Returns the flush request ID which can be awaited with wait_flush_completion(). 
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] pub(crate) async fn freeze(&self) -> Result { self.freeze0().await } /// Freeze and flush the open in-memory layer, waiting for it to be written to disk. #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> { self.freeze_and_flush0().await } /// Freeze the current open in-memory layer. It will be written to disk on next iteration. /// Returns the flush request ID which can be awaited with wait_flush_completion(). pub(crate) async fn freeze0(&self) -> Result { let mut g = self.write_lock.lock().await; let to_lsn = self.get_last_record_lsn(); self.freeze_inmem_layer_at(to_lsn, &mut g).await } // This exists to provide a non-span creating version of `freeze_and_flush` we can call without // polluting the span hierarchy. pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> { let token = self.freeze0().await?; self.wait_flush_completion(token).await } // Check if an open ephemeral layer should be closed: this provides // background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping // an ephemeral layer open forever when idle. It also freezes layers if the global limit on // ephemeral layer bytes has been breached. pub(super) async fn maybe_freeze_ephemeral_layer(&self) { debug_assert_current_span_has_tenant_and_timeline_id(); let Ok(mut write_guard) = self.write_lock.try_lock() else { // If the write lock is held, there is an active wal receiver: rolling open layers // is their responsibility while they hold this lock. return; }; // FIXME: why not early exit? because before #7927 the state would had been cleared every // time, and this was missed. 
// if write_guard.is_none() { return; } let Ok(layers_guard) = self.layers.try_read(LayerManagerLockHolder::TryFreezeLayer) else { // Don't block if the layer lock is busy return; }; let Ok(lm) = layers_guard.layer_map() else { return; }; let Some(open_layer) = &lm.open_layer else { // If there is no open layer, we have no layer freezing to do. However, we might need to generate // some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions // that didn't result in writes to this shard. // Must not hold the layers lock while waiting for a flush. drop(layers_guard); let last_record_lsn = self.get_last_record_lsn(); let disk_consistent_lsn = self.get_disk_consistent_lsn(); if last_record_lsn > disk_consistent_lsn { // We have no open layer, but disk_consistent_lsn is behind the last record: this indicates // we are a sharded tenant and have skipped some WAL let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() { // Only do this if have been layer-less longer than get_checkpoint_timeout, so that a shard // without any data ingested (yet) doesn't write a remote index as soon as it // sees its LSN advance: we only do this if we've been layer-less // for some time. tracing::debug!( "Advancing disk_consistent_lsn past WAL ingest gap {} -> {}", disk_consistent_lsn, last_record_lsn ); // The flush loop will update remote consistent LSN as well as disk consistent LSN. // We know there is no open layer, so we can request freezing without actually // freezing anything. This is true even if we have dropped the layers_guard, we // still hold the write_guard. 
let _ = async { let token = self .freeze_inmem_layer_at(last_record_lsn, &mut write_guard) .await?; self.wait_flush_completion(token).await } .await; } } return; }; let current_size = open_layer.len(); let current_lsn = self.get_last_record_lsn(); let checkpoint_distance_override = open_layer.tick(); if let Some(size_override) = checkpoint_distance_override { if current_size > size_override { // This is not harmful, but it only happens in relatively rare cases where // time-based checkpoints are not happening fast enough to keep the amount of // ephemeral data within configured limits. It's a sign of stress on the system. tracing::info!( "Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure" ); } } let checkpoint_distance = checkpoint_distance_override.unwrap_or(self.get_checkpoint_distance()); if self.should_roll( current_size, current_size, checkpoint_distance, self.get_last_record_lsn(), self.last_freeze_at.load(), open_layer.get_opened_at(), ) { match open_layer.info() { InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { // We may reach this point if the layer was already frozen by not yet flushed: flushing // happens asynchronously in the background. tracing::debug!( "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})" ); } InMemoryLayerInfo::Open { .. } => { // Upgrade to a write lock and freeze the layer drop(layers_guard); let res = self .freeze_inmem_layer_at(current_lsn, &mut write_guard) .await; if let Err(e) = res { tracing::info!( "failed to flush frozen layer after background freeze: {e:#}" ); } } } } } /// Checks if the internal state of the timeline is consistent with it being able to be offloaded. /// /// This is neccessary but not sufficient for offloading of the timeline as it might have /// child timelines that are not offloaded yet. 
pub(crate) fn can_offload(&self) -> (bool, &'static str) { if self.remote_client.is_archived() != Some(true) { return (false, "the timeline is not archived"); } if !self.remote_client.no_pending_work() { // if the remote client is still processing some work, we can't offload return (false, "the upload queue is not drained yet"); } (true, "ok") } /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending /// compaction tasks. pub(crate) async fn compact( self: &Arc, cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, ) -> Result { let res = self .compact_with_options( cancel, CompactOptions { flags, compact_key_range: None, compact_lsn_range: None, sub_compaction: false, sub_compaction_max_job_size_mb: None, gc_compaction_do_metadata_compaction: false, }, ctx, ) .await; if let Err(err) = &res { log_compaction_error(err, None, cancel.is_cancelled(), false); } res } /// Outermost timeline compaction operation; downloads needed layers. /// /// NB: the cancellation token is usually from a background task, but can also come from a /// request task. pub(crate) async fn compact_with_options( self: &Arc, cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, ) -> Result { // Acquire the compaction lock and task semaphore. // // L0-only compaction uses a separate semaphore (if enabled) to make sure it isn't starved // out by other background tasks (including image compaction). We request this via // `BackgroundLoopKind::L0Compaction`. // // Yield for pending L0 compaction while waiting for the semaphore. let is_l0_only = options.flags.contains(CompactFlags::OnlyL0Compaction); let semaphore_kind = match is_l0_only && self.get_compaction_l0_semaphore() { true => BackgroundLoopKind::L0Compaction, false => BackgroundLoopKind::Compaction, }; let yield_for_l0 = options.flags.contains(CompactFlags::YieldForL0); if yield_for_l0 { // If this is an L0 pass, it doesn't make sense to yield for L0. 
debug_assert!(!is_l0_only, "YieldForL0 during L0 pass"); // If `compaction_l0_first` is disabled, there's no point yielding. debug_assert!(self.get_compaction_l0_first(), "YieldForL0 without L0 pass"); } let acquire = async move { let guard = self.compaction_lock.lock().await; let permit = super::tasks::acquire_concurrency_permit(semaphore_kind, ctx).await; (guard, permit) }; let (_guard, _permit) = tokio::select! { (guard, permit) = acquire => (guard, permit), _ = self.l0_compaction_trigger.notified(), if yield_for_l0 => { return Ok(CompactionOutcome::YieldForL0); } _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Skipped), _ = cancel.cancelled() => return Ok(CompactionOutcome::Skipped), }; let last_record_lsn = self.get_last_record_lsn(); // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { warn!( "Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}" ); return Ok(CompactionOutcome::Skipped); } let result = match self.get_compaction_algorithm_settings().kind { CompactionAlgorithm::Tiered => { self.compact_tiered(cancel, ctx).await?; Ok(CompactionOutcome::Done) } CompactionAlgorithm::Legacy => self.compact_legacy(cancel, options, ctx).await, }; // Signal compaction failure to avoid L0 flush stalls when it's broken. match &result { Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), Err(e) if e.is_cancel() => {} Err(_) => self.compaction_failed.store(true, AtomicOrdering::Relaxed), }; result } /// Mutate the timeline with a [`TimelineWriter`]. 
pub(crate) async fn writer(&self) -> TimelineWriter<'_> { TimelineWriter { tl: self, write_guard: self.write_lock.lock().await, } } pub(crate) fn activate( self: &Arc, parent: Arc, broker_client: BrokerClientChannel, background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, ) { if self.tenant_shard_id.is_shard_zero() { // Logical size is only maintained accurately on shard zero. self.spawn_initial_logical_size_computation_task(ctx); } self.launch_wal_receiver(ctx, broker_client); self.set_state(TimelineState::Active); self.launch_eviction_task(parent, background_jobs_can_start); } /// After this function returns, there are no timeline-scoped tasks are left running. /// /// The preferred pattern for is: /// - in any spawned tasks, keep Timeline::guard open + Timeline::cancel / child token /// - if early shutdown (not just cancellation) of a sub-tree of tasks is required, /// go the extra mile and keep track of JoinHandles /// - Keep track of JoinHandles using a passed-down `Arc>>` or similar, /// instead of spawning directly on a runtime. It is a more composable / testable pattern. /// /// For legacy reasons, we still have multiple tasks spawned using /// `task_mgr::spawn(X, Some(tenant_id), Some(timeline_id))`. /// We refer to these as "timeline-scoped task_mgr tasks". /// Some of these tasks are already sensitive to Timeline::cancel while others are /// not sensitive to Timeline::cancel and instead respect [`task_mgr::shutdown_token`] /// or [`task_mgr::shutdown_watcher`]. /// We want to gradually convert the code base away from these. 
///
/// Here is an inventory of timeline-scoped task_mgr tasks that are still sensitive to
/// `task_mgr::shutdown_{token,watcher}` (there are also tenant-scoped and global-scoped
/// ones that aren't mentioned here):
/// - [`TaskKind::TimelineDeletionWorker`]
///    - NB: also used for tenant deletion
/// - [`TaskKind::RemoteUploadTask`]
/// - [`TaskKind::InitialLogicalSizeCalculation`]
/// - [`TaskKind::DownloadAllRemoteLayers`] (can we get rid of it?)
///
/// Inventory of timeline-scoped task_mgr tasks that use spawn but aren't sensitive:
/// - [`TaskKind::Eviction`]
/// - [`TaskKind::LayerFlushTask`]
/// - [`TaskKind::OndemandLogicalSizeCalculation`]
/// - [`TaskKind::GarbageCollector`] (immediate_gc is timeline-scoped)
pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
    debug_assert_current_span_has_tenant_and_timeline_id();

    // Regardless of whether we're going to try_freeze_and_flush
    // cancel walreceiver to stop ingesting more data asap.
    //
    // Note that we're accepting a race condition here where we may
    // do the final flush below, before walreceiver observes the
    // cancellation and exits.
    // This means we may open a new InMemoryLayer after the final flush below.
    // Flush loop is also still running for a short while, so, in theory, it
    // could also make its way into the upload queue.
    //
    // If we wait for the shutdown of the walreceiver before moving on to the
    // flush, then that would be avoided. But we don't do it because the
    // walreceiver entertains reads internally, which means that it possibly
    // depends on the download of layers. Layer download is only sensitive to
    // the cancellation of the entire timeline, so cancelling the walreceiver
    // will have no effect on the individual get requests.
    // This would cause problems when there are a lot of ongoing downloads or
    // S3 is unavailable, i.e. detach, deletion, etc. would hang,
    // and we can't deallocate resources of the timeline, etc.
    //
    // `take()` empties the slot, so `walreceiver_status()` reports "stopping or stopped"
    // from here on.
    let walreceiver = self.walreceiver.lock().unwrap().take();
    tracing::debug!(
        is_some = walreceiver.is_some(),
        "Waiting for WalReceiverManager..."
    );
    if let Some(walreceiver) = walreceiver {
        walreceiver.cancel().await;
    }
    // ... and inform any waiters for newer LSNs that there won't be any.
    self.last_record_lsn.shutdown();

    if let ShutdownMode::FreezeAndFlush = mode {
        // Decide whether a final flush is worthwhile: only when there is an open or frozen
        // in-memory layer and the timeline is not archived.
        let do_flush = if let Some((open, frozen)) = self
            .layers
            .read(LayerManagerLockHolder::Shutdown)
            .await
            .layer_map()
            .map(|lm| (lm.open_layer.is_some(), lm.frozen_layers.len()))
            .ok()
            .filter(|(open, frozen)| *open || *frozen > 0)
        {
            if self.remote_client.is_archived() == Some(true) {
                // No point flushing on shutdown for an archived timeline: it is not important
                // to have it nice and fresh after our restart, and trying to flush here might
                // race with trying to offload it (which also stops the flush loop)
                false
            } else {
                tracing::info!(?open, frozen, "flushing and freezing on shutdown");
                true
            }
        } else {
            // this is double-shutdown, it'll be a no-op
            true
        };

        // we shut down walreceiver above, so, we won't add anything more
        // to the InMemoryLayer; freeze it and wait for all frozen layers
        // to reach the disk & upload queue, then shut the upload queue and
        // wait for it to drain.
        if do_flush {
            match self.freeze_and_flush().await {
                Ok(_) => {
                    // drain the upload queue
                    // if we did not wait for completion here, it might be our shutdown process
                    // didn't wait for remote uploads to complete at all, as new tasks can forever
                    // be spawned.
                    //
                    // what is problematic is the shutting down of RemoteTimelineClient, because
                    // obviously it does not make sense to stop while we wait for it, but what
                    // about corner cases like s3 suddenly hanging up?
                    self.remote_client.shutdown().await;
                }
                Err(FlushLayerError::Cancelled) => {
                    // this is likely the second shutdown, ignore silently.
                    // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080
                    debug_assert!(self.cancel.is_cancelled());
                }
                Err(e) => {
                    // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
                    // we have some extra WAL replay to do next time the timeline starts.
                    warn!("failed to freeze and flush: {e:#}");
                }
            }

            // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but
            // we also do a final check here to ensure that the queue is empty.
            if !self.remote_client.no_pending_work() {
                warn!(
                    "still have pending work in remote upload queue, but continuing shutting down anyways"
                );
            }
        }
    }

    if let ShutdownMode::Reload = mode {
        // drain the upload queue
        self.remote_client.shutdown().await;
        if !self.remote_client.no_pending_work() {
            warn!(
                "still have pending work in remote upload queue, but continuing shutting down anyways"
            );
        }
    }

    // Signal any subscribers to our cancellation token to drop out
    tracing::debug!("Cancelling CancellationToken");
    self.cancel.cancel();

    // If we have a background task downloading heatmap layers stop it.
    // The background downloads are sensitive to timeline cancellation (done above),
    // so the drain will be immediate.
    self.stop_and_drain_heatmap_layers_download().await;

    // Prevent new page service requests from starting.
    self.handles.shutdown();

    // Transition the remote_client into a state where it's only useful for timeline deletion.
    // (The deletion use case is why we can't just hook up remote_client to Self::cancel.)
    self.remote_client.stop();

    // As documented in remote_client.stop()'s doc comment, it's our responsibility
    // to shut down the upload queue tasks.
    // TODO: fix that, task management should be encapsulated inside remote_client.
    task_mgr::shutdown_tasks(
        Some(TaskKind::RemoteUploadTask),
        Some(self.tenant_shard_id),
        Some(self.timeline_id),
    )
    .await;

    // TODO: work toward making this a no-op. See this function's doc comment for more context.
    tracing::debug!("Waiting for tasks...");
    task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await;

    {
        // Allow any remaining in-memory layers to do cleanup -- until that, they hold the gate
        // open.
        let mut write_guard = self.write_lock.lock().await;
        self.layers
            .write(LayerManagerLockHolder::Shutdown)
            .await
            .shutdown(&mut write_guard);
    }

    // Finally wait until any gate-holders are complete.
    //
    // TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks
    // and use a TBD variant of shutdown_tasks that asserts that there were no tasks left.
    self.gate.close().await;

    self.metrics.shutdown();
}

/// Requests a transition of the timeline state to `new_state`.
///
/// The following requests are ignored (and logged) instead of being applied:
/// - `new_state` equals the current state,
/// - any transition into `Loading`,
/// - any transition out of `Broken`,
/// - `Stopping` -> `Active`.
///
/// All other transitions are published through `self.state`.
pub(crate) fn set_state(&self, new_state: TimelineState) {
    match (self.current_state(), new_state) {
        (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
            info!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
        }
        (st, TimelineState::Loading) => {
            error!("ignoring transition from {st:?} into Loading state");
        }
        (TimelineState::Broken { .. }, new_state) => {
            error!("Ignoring state update {new_state:?} for broken timeline");
        }
        (TimelineState::Stopping, TimelineState::Active) => {
            error!("Not activating a Stopping timeline");
        }
        (_, new_state) => {
            self.state.send_replace(new_state);
        }
    }
}

pub(crate) fn set_broken(&self, reason: String) {
    let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture());
    let broken_state = TimelineState::Broken {
        reason,
        backtrace: backtrace_str,
    };
    self.set_state(broken_state);

    // Although the Broken state is not equivalent to shutdown() (shutdown will be called
    // later when this tenant is detach or the process shuts down), firing the cancellation token
    // here avoids the need for other tasks to watch for the Broken state explicitly.
self.cancel.cancel(); } pub(crate) fn current_state(&self) -> TimelineState { self.state.borrow().clone() } pub(crate) fn is_broken(&self) -> bool { matches!(&*self.state.borrow(), TimelineState::Broken { .. }) } pub(crate) fn is_active(&self) -> bool { self.current_state() == TimelineState::Active } pub(crate) fn is_archived(&self) -> Option { self.remote_client.is_archived() } pub(crate) fn is_invisible(&self) -> Option { self.remote_client.is_invisible() } pub(crate) fn is_stopping(&self) -> bool { self.current_state() == TimelineState::Stopping } pub(crate) fn subscribe_for_state_updates(&self) -> watch::Receiver { self.state.subscribe() } pub(crate) async fn wait_to_become_active( &self, _ctx: &RequestContext, // Prepare for use by cancellation ) -> Result<(), TimelineState> { let mut receiver = self.state.subscribe(); loop { let current_state = receiver.borrow().clone(); match current_state { TimelineState::Loading => { receiver .changed() .await .expect("holding a reference to self"); } TimelineState::Active => { return Ok(()); } TimelineState::Broken { .. 
} | TimelineState::Stopping => { // There's no chance the timeline can transition back into ::Active return Err(current_state); } } } } pub(crate) async fn layer_map_info( &self, reset: LayerAccessStatsReset, ) -> Result { let guard = self .layers .read(LayerManagerLockHolder::GetLayerMapInfo) .await; let layer_map = guard.layer_map()?; let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1); if let Some(open_layer) = &layer_map.open_layer { in_memory_layers.push(open_layer.info()); } for frozen_layer in &layer_map.frozen_layers { in_memory_layers.push(frozen_layer.info()); } let historic_layers = layer_map .iter_historic_layers() .map(|desc| guard.get_from_desc(&desc).info(reset)) .collect(); Ok(LayerMapInfo { in_memory_layers, historic_layers, }) } #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] pub(crate) async fn download_layer( &self, layer_file_name: &LayerName, ctx: &RequestContext, ) -> Result, super::storage_layer::layer::DownloadError> { let Some(layer) = self .find_layer(layer_file_name) .await .map_err(|e| match e { layer_manager::Shutdown => { super::storage_layer::layer::DownloadError::TimelineShutdown } })? else { return Ok(None); }; layer.download(ctx).await?; Ok(Some(true)) } /// Evict just one layer. /// /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`. pub(crate) async fn evict_layer( &self, layer_file_name: &LayerName, ) -> anyhow::Result> { let _gate = self .gate .enter() .map_err(|_| anyhow::anyhow!("Shutting down"))?; let Some(local_layer) = self.find_layer(layer_file_name).await? 
else { return Ok(None); }; // curl has this by default let timeout = std::time::Duration::from_secs(120); match local_layer.evict_and_wait(timeout).await { Ok(()) => Ok(Some(true)), Err(EvictionError::NotFound) => Ok(Some(false)), Err(EvictionError::Downloaded) => Ok(Some(false)), Err(EvictionError::Timeout) => Ok(Some(false)), } } fn should_roll( &self, layer_size: u64, projected_layer_size: u64, checkpoint_distance: u64, projected_lsn: Lsn, last_freeze_at: Lsn, opened_at: Instant, ) -> bool { let distance = projected_lsn.widening_sub(last_freeze_at); // Rolling the open layer can be triggered by: // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that // the safekeepers need to store. For sharded tenants, we multiply by shard count to // account for how writes are distributed across shards: we expect each node to consume // 1/count of the LSN on average. // 2. The size of the currently open layer. // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught // up and suspend activity. 
if distance >= checkpoint_distance as i128 * self.shard_identity.count.count() as i128 { info!( "Will roll layer at {} with layer size {} due to LSN distance ({})", projected_lsn, layer_size, distance ); true } else if projected_layer_size >= checkpoint_distance { // NB: this check is relied upon by: let _ = IndexEntry::validate_checkpoint_distance; info!( "Will roll layer at {} with layer size {} due to layer size ({})", projected_lsn, layer_size, projected_layer_size ); true } else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() { info!( "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})", projected_lsn, layer_size, opened_at.elapsed() ); true } else { false } } pub(crate) fn is_basebackup_cache_enabled(&self) -> bool { let tenant_conf = self.tenant_conf.load(); tenant_conf .tenant_conf .basebackup_cache_enabled .unwrap_or(self.conf.default_tenant_conf.basebackup_cache_enabled) } /// Try to get a basebackup from the on-disk cache. pub(crate) async fn get_cached_basebackup(&self, lsn: Lsn) -> Option { self.basebackup_cache .get(self.tenant_shard_id.tenant_id, self.timeline_id, lsn) .await } /// Convenience method to attempt fetching a basebackup for the timeline if enabled and safe for /// the given request parameters. /// /// TODO: consider moving this onto GrpcPageServiceHandler once the libpq handler is gone. pub async fn get_cached_basebackup_if_enabled( &self, lsn: Option, prev_lsn: Option, full: bool, replica: bool, gzip: bool, ) -> Option { if !self.is_basebackup_cache_enabled() || !self.basebackup_cache.is_enabled() { return None; } // We have to know which LSN to fetch the basebackup for. let lsn = lsn?; // We only cache gzipped, non-full basebackups for primary computes with automatic prev_lsn. if prev_lsn.is_some() || full || replica || !gzip { return None; } self.get_cached_basebackup(lsn).await } /// Prepare basebackup for the given LSN and store it in the basebackup cache. 
/// The method is asynchronous and returns immediately. /// The actual basebackup preparation is performed in the background /// by the basebackup cache on a best-effort basis. pub(crate) fn prepare_basebackup(&self, lsn: Lsn) { if !self.is_basebackup_cache_enabled() { return; } if !self.tenant_shard_id.is_shard_zero() { // In theory we should never get here, but just in case check it. // Preparing basebackup doesn't make sense for shards other than shard zero. return; } if !self.is_active() { // May happen during initial timeline creation. // Such timeline is not in the global timeline map yet, // so basebackup cache will not be able to find it. // TODO(diko): We can prepare such timelines in finish_creation(). return; } self.basebackup_cache .send_prepare(self.tenant_shard_id, self.timeline_id, lsn); } } /// Number of times we will compute partition within a checkpoint distance. const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { pub(crate) fn get_lsn_lease_length(&self) -> Duration { let tenant_conf = self.tenant_conf.load(); tenant_conf .tenant_conf .lsn_lease_length .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) } pub(crate) fn get_lsn_lease_length_for_ts(&self) -> Duration { let tenant_conf = self.tenant_conf.load(); tenant_conf .tenant_conf .lsn_lease_length_for_ts .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts) } pub(crate) fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool { let tenant_conf = self.tenant_conf.load(); tenant_conf.is_gc_blocked_by_lsn_lease_deadline() } pub(crate) fn get_lazy_slru_download(&self) -> bool { let tenant_conf = self.tenant_conf.load(); tenant_conf .tenant_conf .lazy_slru_download .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download) } /// Checks if a get page request should get perf tracing /// /// The configuration priority is: tenant config override, default tenant config, /// pageserver config. 
///
    /// A ratio whose numerator or denominator is zero never samples.
    pub(crate) fn is_get_page_request_sampled(&self) -> bool {
        let tenant_conf = self.tenant_conf.load();
        let ratio = tenant_conf
            .tenant_conf
            .sampling_ratio
            .flatten()
            .or(self.conf.default_tenant_conf.sampling_ratio)
            .or(self.conf.tracing.as_ref().map(|t| t.sampling_ratio));

        match ratio {
            // `random_range` panics on an empty range, which is what `0..denominator`
            // would be for a zero denominator. Treat such a ratio like a zero numerator
            // instead of bringing down the request path.
            Some(r) if r.numerator == 0 || r.denominator == 0 => false,
            // Sample with probability numerator/denominator.
            Some(r) => rand::rng().random_range(0..r.denominator) < r.numerator,
            None => false,
        }
    }

    /// Effective `checkpoint_distance`: the tenant override, else the default tenant config.
    fn get_checkpoint_distance(&self) -> u64 {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .checkpoint_distance
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
    }

    /// Effective `checkpoint_timeout`: the tenant override, else the default tenant config.
    fn get_checkpoint_timeout(&self) -> Duration {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .checkpoint_timeout
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
    }

    /// Effective `pitr_interval`: the tenant override, else the default tenant config.
    pub(crate) fn get_pitr_interval(&self) -> Duration {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .pitr_interval
            .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
    }

    /// Effective `compaction_period`: the tenant override, else the default tenant config.
    fn get_compaction_period(&self) -> Duration {
        // Read the single field through the loaded guard, like the sibling getters,
        // rather than cloning the whole tenant config.
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .compaction_period
            .unwrap_or(self.conf.default_tenant_conf.compaction_period)
    }

    /// Effective `compaction_target_size`: the tenant override, else the default tenant
    /// config.
    fn get_compaction_target_size(&self) -> u64 {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .compaction_target_size
            .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
    }

    /// Effective `compaction_threshold`: the tenant override, else the default tenant
    /// config.
    fn get_compaction_threshold(&self) -> usize {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .compaction_threshold
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

    /// Returns `true` if the rel_size_v2 config is enabled. NOTE: the write path and read path
    /// should look at `get_rel_size_v2_status()` to get the actual status of the timeline. It is
    /// possible that the index part persists the state while the config doesn't get persisted.
pub(crate) fn get_rel_size_v2_enabled(&self) -> bool {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .rel_size_v2_enabled
            .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
    }

    /// Returns the in-memory rel_size_v2 migration status and its migration marker.
    ///
    /// A missing status is reported as `RelSizeMigration::Legacy`.
    pub(crate) fn get_rel_size_v2_status(&self) -> (RelSizeMigration, Option) {
        let (status, migrated_at) = self.rel_size_v2_status.load().as_ref().clone();
        (status.unwrap_or(RelSizeMigration::Legacy), migrated_at)
    }

    /// Effective `compaction_upper_limit`: the tenant override, else the default tenant
    /// config.
    fn get_compaction_upper_limit(&self) -> usize {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .compaction_upper_limit
            .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit)
    }

    /// Effective `compaction_l0_first`: the tenant override, else the default tenant config.
    pub fn get_compaction_l0_first(&self) -> bool {
        // NOTE(review): this clones the whole tenant config to read one field; the sibling
        // getters read through the loaded guard instead. Consider aligning.
        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .compaction_l0_first
            .unwrap_or(self.conf.default_tenant_conf.compaction_l0_first)
    }

    /// Effective `compaction_l0_semaphore`: the tenant override, else the default tenant
    /// config.
    pub fn get_compaction_l0_semaphore(&self) -> bool {
        // NOTE(review): same whole-config clone as in `get_compaction_l0_first`.
        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .compaction_l0_semaphore
            .unwrap_or(self.conf.default_tenant_conf.compaction_l0_semaphore)
    }

    /// Returns the L0 flush delay threshold, or `None` when flush delays are disabled
    /// (compaction period is zero, or the configured threshold is 0).
    ///
    /// The configured value comes from the tenant override, then the default tenant
    /// config, then `DEFAULT_L0_FLUSH_DELAY_FACTOR * compaction_threshold`. The result is
    /// never below the compaction threshold.
    fn get_l0_flush_delay_threshold(&self) -> Option {
        // By default, delay L0 flushes at 3x the compaction threshold. The compaction threshold
        // defaults to 10, and L0 compaction is generally able to keep L0 counts below 30.
        const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 3;

        // If compaction is disabled, don't delay.
        if self.get_compaction_period() == Duration::ZERO {
            return None;
        }

        let compaction_threshold = self.get_compaction_threshold();
        let tenant_conf = self.tenant_conf.load();
        let l0_flush_delay_threshold = tenant_conf
            .tenant_conf
            .l0_flush_delay_threshold
            .or(self.conf.default_tenant_conf.l0_flush_delay_threshold)
            .unwrap_or(DEFAULT_L0_FLUSH_DELAY_FACTOR * compaction_threshold);

        // 0 disables backpressure.
        if l0_flush_delay_threshold == 0 {
            return None;
        }

        // Clamp the flush delay threshold to the compaction threshold; it doesn't make sense to
        // backpressure flushes below this.
        // TODO: the tenant config should have validation to prevent this instead.
        debug_assert!(l0_flush_delay_threshold >= compaction_threshold);
        Some(max(l0_flush_delay_threshold, compaction_threshold))
    }

    /// Returns the L0 flush stall threshold, or `None` when stalls are disabled.
    ///
    /// Stalls are disabled when the compaction period is zero, when `compaction_failed` is
    /// set, in the testing special case described below, or when the resulting threshold
    /// is 0 (which is the case by default, since the default factor is 0). The result is
    /// never below the compaction threshold.
    fn get_l0_flush_stall_threshold(&self) -> Option {
        // Disable L0 stalls by default. Stalling can cause unavailability if L0 compaction isn't
        // responsive, and it can e.g. block on other compaction via the compaction semaphore or
        // sibling timelines. We need more confidence before enabling this.
        const DEFAULT_L0_FLUSH_STALL_FACTOR: usize = 0; // TODO: default to e.g. 5

        // If compaction is disabled, don't stall.
        if self.get_compaction_period() == Duration::ZERO {
            return None;
        }

        // If compaction is failing, don't stall and try to keep the tenant alive. This may not be a
        // good idea: read amp can grow unbounded, leading to terrible performance, and we may take
        // on unbounded compaction debt that can take a long time to fix once compaction comes back
        // online. At least we'll delay flushes, slowing down the growth and buying some time.
        if self.compaction_failed.load(AtomicOrdering::Relaxed) {
            return None;
        }

        let compaction_threshold = self.get_compaction_threshold();
        let tenant_conf = self.tenant_conf.load();
        let l0_flush_stall_threshold = tenant_conf
            .tenant_conf
            .l0_flush_stall_threshold
            .or(self.conf.default_tenant_conf.l0_flush_stall_threshold);

        // Tests sometimes set compaction_threshold=1 to generate lots of layer files, and don't
        // handle the 20-second compaction delay. Some (e.g. `test_backward_compatibility`) can't
        // easily adjust the L0 backpressure settings, so just disable stalls in this case.
        if cfg!(feature = "testing")
            && compaction_threshold == 1
            && l0_flush_stall_threshold.is_none()
        {
            return None;
        }

        let l0_flush_stall_threshold = l0_flush_stall_threshold
            .unwrap_or(DEFAULT_L0_FLUSH_STALL_FACTOR * compaction_threshold);

        // 0 disables backpressure.
        if l0_flush_stall_threshold == 0 {
            return None;
        }

        // Clamp the flush stall threshold to the compaction threshold; it doesn't make sense to
        // backpressure flushes below this.
        // TODO: the tenant config should have validation to prevent this instead.
        debug_assert!(l0_flush_stall_threshold >= compaction_threshold);
        Some(max(l0_flush_stall_threshold, compaction_threshold))
    }

    /// Effective `image_creation_threshold`: the tenant override, else the default tenant
    /// config.
    fn get_image_creation_threshold(&self) -> usize {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .image_creation_threshold
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

    // HADRON
    /// Effective `image_layer_force_creation_period`: the tenant override, else the default
    /// tenant config value (which may itself be unset).
    fn get_image_layer_force_creation_period(&self) -> Option {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .image_layer_force_creation_period
            .or(self
                .conf
                .default_tenant_conf
                .image_layer_force_creation_period)
    }

    /// Effective `compaction_algorithm` settings (cloned): the tenant override, else the
    /// default tenant config.
    fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings {
        let tenant_conf = &self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .compaction_algorithm
            .as_ref()
            .unwrap_or(&self.conf.default_tenant_conf.compaction_algorithm)
            .clone()
    }

    /// Effective `compaction_shard_ancestor`: the tenant override, else the default tenant
    /// config.
    pub fn get_compaction_shard_ancestor(&self) -> bool {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .compaction_shard_ancestor
            .unwrap_or(self.conf.default_tenant_conf.compaction_shard_ancestor)
    }

    /// Effective `eviction_policy`: the tenant override, else the default tenant config.
    fn get_eviction_policy(&self) -> EvictionPolicy {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .eviction_policy
            .unwrap_or(self.conf.default_tenant_conf.eviction_policy)
    }

    /// Effective `evictions_low_residence_duration_metric_threshold` for the given tenant
    /// config and defaults.
    ///
    /// This is an associated function (no `self`) so that it can be used both while
    /// constructing a timeline and when a new tenant config is applied.
    fn get_evictions_low_residence_duration_metric_threshold(
        tenant_conf: &pageserver_api::models::TenantConfig,
        default_tenant_conf: &pageserver_api::config::TenantConfigToml,
    ) -> Duration {
        tenant_conf
            .evictions_low_residence_duration_metric_threshold
            .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
    }

    /// Effective `image_layer_creation_check_threshold`: the tenant override, else the
    /// default tenant config.
    fn get_image_layer_creation_check_threshold(&self) -> u8 {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .image_layer_creation_check_threshold
            .unwrap_or(
                self.conf
                    .default_tenant_conf
                    .image_layer_creation_check_threshold,
            )
    }

    /// Resolve the four gc-compaction settings (each: tenant override, else default tenant
    /// config) from a single load of the tenant config.
    fn get_gc_compaction_settings(&self) -> GcCompactionCombinedSettings {
        let tenant_conf = &self.tenant_conf.load();
        let gc_compaction_enabled = tenant_conf
            .tenant_conf
            .gc_compaction_enabled
            .unwrap_or(self.conf.default_tenant_conf.gc_compaction_enabled);
        let gc_compaction_verification = tenant_conf
            .tenant_conf
            .gc_compaction_verification
            .unwrap_or(self.conf.default_tenant_conf.gc_compaction_verification);
        let gc_compaction_initial_threshold_kb = tenant_conf
            .tenant_conf
            .gc_compaction_initial_threshold_kb
            .unwrap_or(
                self.conf
                    .default_tenant_conf
                    .gc_compaction_initial_threshold_kb,
            );
        let gc_compaction_ratio_percent = tenant_conf
            .tenant_conf
            .gc_compaction_ratio_percent
            .unwrap_or(self.conf.default_tenant_conf.gc_compaction_ratio_percent);
        GcCompactionCombinedSettings {
            gc_compaction_enabled,
            gc_compaction_verification,
            gc_compaction_initial_threshold_kb,
            gc_compaction_ratio_percent,
        }
    }

    /// Effective `image_creation_preempt_threshold`: the tenant override, else the default
    /// tenant config.
    fn get_image_creation_preempt_threshold(&self) -> usize {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .image_creation_preempt_threshold
            .unwrap_or(
                self.conf
                    .default_tenant_conf
                    .image_creation_preempt_threshold,
            )
    }

    /// Apply the parts of a new tenant config that are not picked up automatically:
    /// the remote client's location config, the rel-size snapshot cache capacity, and the
    /// threshold of the low-residence-duration eviction metric.
    pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) {
        // NB: Most tenant conf options are read by background loops, so,
        // changes will automatically be picked up.

        // The threshold is embedded in the metric. So, we need to update it.
        {
            let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
                &new_conf.tenant_conf,
                &self.conf.default_tenant_conf,
            );

            let tenant_id_str = self.tenant_shard_id.tenant_id.to_string();
            let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug());

            let timeline_id_str = self.timeline_id.to_string();

            self.remote_client.update_config(&new_conf.location);

            let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
            // NOTE(review): when the new config has no capacity override, the cache keeps
            // its current capacity rather than returning to the default -- confirm that
            // this is intended.
            if let Some(new_capacity) = new_conf.tenant_conf.relsize_snapshot_cache_capacity {
                if new_capacity != rel_size_cache.capacity() {
                    rel_size_cache.set_capacity(new_capacity);
                }
            }

            self.metrics
                .evictions_with_low_residence_duration
                .write()
                .unwrap()
                .change_threshold(
                    &tenant_id_str,
                    &shard_id_str,
                    &timeline_id_str,
                    new_threshold,
                );
        }
    }

    /// Open a Timeline handle.
    ///
    /// Loads the metadata for the timeline into memory, but not the layer map.
    ///
    /// If an `ancestor` is given, this timeline is registered as a (not offloaded) child in
    /// the ancestor's GC info before the object is constructed.
    #[allow(clippy::too_many_arguments)]
    pub(super) fn new(
        conf: &'static PageServerConf,
        tenant_conf: Arc>,
        metadata: &TimelineMetadata,
        previous_heatmap: Option,
        ancestor: Option>,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        generation: Generation,
        shard_identity: ShardIdentity,
        walredo_mgr: Option>,
        resources: TimelineResources,
        pg_version: PgMajorVersion,
        state: TimelineState,
        attach_wal_lag_cooldown: Arc>,
        create_idempotency: crate::tenant::CreateTimelineIdempotency,
        gc_compaction_state: Option,
        rel_size_v2_status: Option,
        rel_size_migrated_at: Option,
        cancel: CancellationToken,
    ) -> Arc {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
        // Only the sender halves are kept; receivers are created via `subscribe()`.
        let (state, _) = watch::channel(state);

        let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn));
        let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));

        let evictions_low_residence_duration_metric_threshold = {
            let loaded_tenant_conf = tenant_conf.load();
            Self::get_evictions_low_residence_duration_metric_threshold(
                &loaded_tenant_conf.tenant_conf,
                &conf.default_tenant_conf,
            )
        };

        if let Some(ancestor) = &ancestor {
            let mut ancestor_gc_info = ancestor.gc_info.write().unwrap();
            // If we construct an explicit timeline object, it's obviously not offloaded
            let is_offloaded = MaybeOffloaded::No;
            ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn(), is_offloaded);
        }

        let relsize_snapshot_cache_capacity = {
            let loaded_tenant_conf = tenant_conf.load();
            loaded_tenant_conf
                .tenant_conf
                .relsize_snapshot_cache_capacity
                .unwrap_or(conf.default_tenant_conf.relsize_snapshot_cache_capacity)
        };

        // `new_cyclic` hands us a weak reference to the Arc being built, stored as `myself`.
        Arc::new_cyclic(|myself| {
            let metrics = Arc::new(TimelineMetrics::new(
                &tenant_shard_id,
                &timeline_id,
                crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
                    "mtime",
                    evictions_low_residence_duration_metric_threshold,
                ),
            ));
            let aux_file_metrics = metrics.aux_file_size_gauge.clone();

            let mut result = Timeline {
                conf,
                tenant_conf,
                myself: myself.clone(),
                timeline_id,
                tenant_shard_id,
                generation,
                shard_identity,
                pg_version,
                layers: Default::default(),
                gc_compaction_layer_update_lock: tokio::sync::RwLock::new(()),

                walredo_mgr,
                walreceiver: Mutex::new(None),

                remote_client: Arc::new(resources.remote_client),

                // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
                last_record_lsn: SeqWait::new(RecordLsn {
                    last: disk_consistent_lsn,
                    prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)),
                }),
                disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0),

                gc_compaction_state: ArcSwapOption::from_pointee(gc_compaction_state),

                last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0),
                last_freeze_ts: RwLock::new(Instant::now()),

                loaded_at: (disk_consistent_lsn, SystemTime::now()),

                ancestor_timeline: ancestor,
                ancestor_lsn: metadata.ancestor_lsn(),

                metrics,

                query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new(
                    &tenant_shard_id,
                    &timeline_id,
                    resources.pagestream_throttle_metrics,
                ),

                directory_metrics: array::from_fn(|_| AtomicU64::new(0)),
                directory_metrics_inited: array::from_fn(|_| AtomicBool::new(false)),

                flush_loop_state: Mutex::new(FlushLoopState::NotStarted),

                layer_flush_start_tx,
                layer_flush_done_tx,

                write_lock: tokio::sync::Mutex::new(None),

                gc_info: std::sync::RwLock::new(GcInfo::default()),

                last_image_layer_creation_status: ArcSwap::new(Arc::new(
                    LastImageLayerCreationStatus::default(),
                )),

                applied_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
                initdb_lsn: metadata.initdb_lsn(),

                current_logical_size: if disk_consistent_lsn.is_valid() {
                    // we're creating timeline data with some layer files existing locally,
                    // need to recalculate timeline's logical size based on data in the layers.
                    LogicalSize::deferred_initial(disk_consistent_lsn)
                } else {
                    // we're creating timeline data without any layers existing locally,
                    // initial logical size is 0.
                    LogicalSize::empty_initial()
                },

                partitioning: GuardArcSwap::new((
                    (KeyPartitioning::new(), KeyPartitioning::new().into_sparse()),
                    Lsn(0),
                )),
                // Placeholder; computed from the checkpoint distance right after construction.
                repartition_threshold: 0,
                last_image_layer_creation_check_at: AtomicLsn::new(0),
                last_image_layer_creation_check_instant: Mutex::new(None),

                last_received_wal: Mutex::new(None),
                rel_size_latest_cache: RwLock::new(HashMap::new()),
                rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)),

                download_all_remote_layers_task_info: RwLock::new(None),

                state,

                eviction_task_timeline_state: tokio::sync::Mutex::new(
                    EvictionTaskTimelineState::default(),
                ),
                delete_progress: TimelineDeleteProgress::default(),

                cancel,
                gate: Gate::default(),

                compaction_lock: tokio::sync::Mutex::default(),
                compaction_failed: AtomicBool::default(),
                corruption_detected: AtomicBool::default(),
                l0_compaction_trigger: resources.l0_compaction_trigger,
                gc_lock: tokio::sync::Mutex::default(),

                standby_horizon: AtomicLsn::new(0),

                pagestream_throttle: resources.pagestream_throttle,

                aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),

                #[cfg(test)]
                extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),

                l0_flush_global_state: resources.l0_flush_global_state,

                handles: Default::default(),

                attach_wal_lag_cooldown,

                create_idempotency,

                page_trace: Default::default(),

                previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap),

                heatmap_layers_downloader: Mutex::new(None),

                rel_size_v2_status: ArcSwap::from_pointee((
                    rel_size_v2_status,
                    rel_size_migrated_at,
                )),

                wait_lsn_log_slow: tokio::sync::Semaphore::new(1),

                basebackup_cache: resources.basebackup_cache,

                feature_resolver: resources.feature_resolver.clone(),

                db_rel_count: ArcSwapOption::from_pointee(None),
            };

            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;

            result
                .metrics
                .last_record_lsn_gauge
                .set(disk_consistent_lsn.0 as i64);
            result
        })
    }

    /// Spawn the layer flush task unless it is already running or has exited.
    ///
    /// Does nothing (besides logging) if the timeline gate is already closed. The spawned
    /// task holds a gate guard for its whole lifetime and marks the flush loop as `Exited`
    /// when `flush_loop` returns.
    pub(super) fn maybe_spawn_flush_loop(self: &Arc) {
        let Ok(guard) = self.gate.enter() else {
            info!("cannot start flush loop when the timeline gate has already been closed");
            return;
        };
        let mut flush_loop_state = self.flush_loop_state.lock().unwrap();
        match *flush_loop_state {
            FlushLoopState::NotStarted => (),
            FlushLoopState::Running { .. } => {
                info!(
                    "skipping attempt to start flush_loop twice {}/{}",
                    self.tenant_shard_id, self.timeline_id
                );
                return;
            }
            FlushLoopState::Exited => {
                info!(
                    "ignoring attempt to restart exited flush_loop {}/{}",
                    self.tenant_shard_id, self.timeline_id
                );
                return;
            }
        }

        let layer_flush_start_rx = self.layer_flush_start_tx.subscribe();
        let self_clone = Arc::clone(self);

        debug!("spawning flush loop");
        // The state is set to Running before the task is spawned, while the lock is held.
        *flush_loop_state = FlushLoopState::Running {
            #[cfg(test)]
            expect_initdb_optimization: false,
            #[cfg(test)]
            initdb_optimization_count: 0,
        };
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::LayerFlushTask,
            self.tenant_shard_id,
            Some(self.timeline_id),
            "layer flush task",
            async move {
                let _guard = guard;
                let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error).with_scope_timeline(&self_clone);
                self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
                let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
                assert!(matches!(*flush_loop_state, FlushLoopState::Running{..}));
                *flush_loop_state  = FlushLoopState::Exited;
                Ok(())
            }
            .instrument(info_span!(parent: None, "layer flush task", tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))
        );
    }

    /// Store the new gc-compaction state in memory, then schedule an index upload for it
    /// on the remote client. The scheduling result is returned to the caller.
    pub(crate) fn update_gc_compaction_state(
        &self,
        gc_compaction_state: GcCompactionState,
    ) -> anyhow::Result<()> {
        self.gc_compaction_state
            .store(Some(Arc::new(gc_compaction_state.clone())));
        self.remote_client
            .schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state)
    }

    /// Store the new rel_size_v2 status in memory, then schedule an index upload for it
    /// on the remote client. The scheduling result is returned to the caller.
    pub(crate) fn update_rel_size_v2_status(
        &self,
        rel_size_v2_status: RelSizeMigration,
        rel_size_migrated_at: Option,
    ) -> anyhow::Result<()> {
        self.rel_size_v2_status.store(Arc::new((
            Some(rel_size_v2_status.clone()),
            rel_size_migrated_at,
        )));
        self.remote_client
            .schedule_index_upload_for_rel_size_v2_status_update(
                rel_size_v2_status,
                rel_size_migrated_at,
            )
    }

    /// Returns a clone of the in-memory gc-compaction state, if any.
    pub(crate) fn get_gc_compaction_state(&self) -> Option {
        self.gc_compaction_state
            .load()
            .as_ref()
            .map(|x| x.as_ref().clone())
    }

    /// Creates and starts the wal receiver.
    ///
    /// This function is expected to be called at most once per Timeline's lifecycle
    /// when the timeline is activated.
    ///
    /// Panics if a WAL receiver is already stored in `self.walreceiver`.
    fn launch_wal_receiver(
        self: &Arc,
        ctx: &RequestContext,
        broker_client: BrokerClientChannel,
    ) {
        info!(
            "launching WAL receiver for timeline {} of tenant {}",
            self.timeline_id, self.tenant_shard_id
        );

        // Each timeout/lag setting: tenant override, else the default tenant config.
        let tenant_conf = self.tenant_conf.load();
        let wal_connect_timeout = tenant_conf
            .tenant_conf
            .walreceiver_connect_timeout
            .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
        let lagging_wal_timeout = tenant_conf
            .tenant_conf
            .lagging_wal_timeout
            .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
        let max_lsn_wal_lag = tenant_conf
            .tenant_conf
            .max_lsn_wal_lag
            .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);

        let mut guard = self.walreceiver.lock().unwrap();
        assert!(
            guard.is_none(),
            "multiple launches / re-launches of WAL receiver are not supported"
        );

        let protocol = PostgresClientProtocol::Interpreted {
            format: utils::postgres_client::InterpretedFormat::Protobuf,
            compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }),
        };

        *guard = Some(WalReceiver::start(
            Arc::clone(self),
            WalReceiverConf {
                protocol,
                wal_connect_timeout,
                lagging_wal_timeout,
                max_lsn_wal_lag,
                auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
                availability_zone: self.conf.availability_zone.clone(),
                ingest_batch_size: self.conf.ingest_batch_size,
                validate_wal_contiguity: self.conf.validate_wal_contiguity,
            },
            broker_client,
            ctx,
        ));
    }

    /// Initialize with an empty layer map. Used when creating a new timeline.
pub(super) fn init_empty_layer_map(&self, start_lsn: Lsn) {
        // `try_write` is used instead of awaiting the lock: the caller is expected to be
        // the only one with access to the object, so failing to get the lock is a bug.
        let mut layers = self.layers.try_write(LayerManagerLockHolder::Init).expect(
            "in the context where we call this function, no other task has access to the object",
        );
        layers
            .open_mut()
            .expect("in this context the LayerManager must still be open")
            .initialize_empty(Lsn(start_lsn.0));
    }

    /// Scan the timeline directory, cleanup, populate the layer map, and schedule uploads for local-only
    /// files.
    ///
    /// The directory scan and reconciliation against `index_part` run inside
    /// `spawn_blocking`. Fails if the directory contains unrecognized files, if any cleanup
    /// step fails, or if scheduling the follow-up remote operations fails.
    pub(super) async fn load_layer_map(
        &self,
        disk_consistent_lsn: Lsn,
        index_part: IndexPart,
    ) -> anyhow::Result<()> {
        use LayerName::*;
        use init::Decision::*;
        use init::{Discovered, DismissedLayer};

        // The layer map write lock is held for the whole load.
        let mut guard = self
            .layers
            .write(LayerManagerLockHolder::LoadLayerMap)
            .await;

        let timer = self.metrics.load_layer_map_histo.start_timer();

        // Scan timeline directory and create ImageLayerName and DeltaFilename
        // structs representing all files on disk
        let timeline_path = self
            .conf
            .timeline_path(&self.tenant_shard_id, &self.timeline_id);
        let conf = self.conf;
        let span = tracing::Span::current();

        // Copy to move into the task we're about to spawn
        let this = self.myself.upgrade().expect("&self method holds the arc");

        let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({
            move || {
                let _g = span.entered();
                let discovered = init::scan_timeline_dir(&timeline_path)?;
                let mut discovered_layers = Vec::with_capacity(discovered.len());
                let mut unrecognized_files = Vec::new();

                // Reused as a scratch path: a file name is pushed, cleaned up, and popped again.
                let mut path = timeline_path;

                for discovered in discovered {
                    let (name, kind) = match discovered {
                        Discovered::Layer(layer_file_name, local_metadata) => {
                            discovered_layers.push((layer_file_name, local_metadata));
                            continue;
                        }
                        Discovered::IgnoredBackup(path) => {
                            std::fs::remove_file(path)
                                .or_else(fs_ext::ignore_not_found)
                                .fatal_err("Removing .old file");
                            continue;
                        }
                        Discovered::Unknown(file_name) => {
                            // we will later error if there are any
                            unrecognized_files.push(file_name);
                            continue;
                        }
                        Discovered::Ephemeral(name) => (name, "old ephemeral file"),
                        Discovered::Temporary(name) => (name, "temporary timeline file"),
                        Discovered::TemporaryDownload(name) => (name, "temporary download"),
                    };
                    path.push(Utf8Path::new(&name));
                    init::cleanup(&path, kind)?;
                    path.pop();
                }

                if !unrecognized_files.is_empty() {
                    // assume that if there are any there are many many.
                    let n = unrecognized_files.len();
                    let first = &unrecognized_files[..n.min(10)];
                    anyhow::bail!(
                        "unrecognized files in timeline dir (total {n}), first 10: {first:?}"
                    );
                }

                let decided = init::reconcile(discovered_layers, &index_part, disk_consistent_lsn);

                let mut loaded_layers = Vec::new();
                let mut needs_cleanup = Vec::new();
                let mut total_physical_size = 0;

                for (name, decision) in decided {
                    let decision = match decision {
                        Ok(decision) => decision,
                        Err(DismissedLayer::Future { local }) => {
                            if let Some(local) = local {
                                init::cleanup_future_layer(
                                    &local.local_path,
                                    &name,
                                    disk_consistent_lsn,
                                )?;
                            }
                            // Future layers are collected so their remote deletion can be
                            // scheduled after the blocking task returns.
                            needs_cleanup.push(name);
                            continue;
                        }
                        Err(DismissedLayer::LocalOnly(local)) => {
                            init::cleanup_local_only_file(&name, &local)?;
                            // this file never existed remotely, we will have to do rework
                            continue;
                        }
                        Err(DismissedLayer::BadMetadata(local)) => {
                            init::cleanup_local_file_for_remote(&local)?;
                            // this file never existed remotely, we will have to do rework
                            // NOTE(review): the comment above is identical to the one in the
                            // `LocalOnly` arm; the helper name suggests a remote copy does
                            // exist in this case -- confirm and reword.
                            continue;
                        }
                    };

                    match &name {
                        Delta(d) => assert!(d.lsn_range.end <= disk_consistent_lsn + 1),
                        Image(i) => assert!(i.lsn <= disk_consistent_lsn),
                    }

                    tracing::debug!(layer=%name, ?decision, "applied");

                    let layer = match decision {
                        Resident { local, remote } => {
                            // Only resident layers count towards the physical size.
                            total_physical_size += local.file_size;
                            Layer::for_resident(conf, &this, local.local_path, name, remote)
                                .drop_eviction_guard()
                        }
                        Evicted(remote) => Layer::for_evicted(conf, &this, name, remote),
                    };

                    loaded_layers.push(layer);
                }
                Ok((loaded_layers, needs_cleanup, total_physical_size))
            }
        })
        .await
        // First convert a join error of the blocking task, then flatten the closure's own result.
        .map_err(anyhow::Error::new)
        .and_then(|x| x)?;

        let num_layers = loaded_layers.len();

        guard
            .open_mut()
            .expect("layermanager must be open during init")
            .initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);

        self.remote_client
            .schedule_layer_file_deletion(&needs_cleanup)?;
        self.remote_client
            .schedule_index_upload_for_file_changes()?;
        // This barrier orders above DELETEs before any later operations.
        // This is critical because code executing after the barrier might
        // create again objects with the same key that we just scheduled for deletion.
        // For example, if we just scheduled deletion of an image layer "from the future",
        // later compaction might run again and re-create the same image layer.
        // "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn.
        // "same" here means same key range and LSN.
        //
        // Without a barrier between above DELETEs and the re-creation's PUTs,
        // the upload queue may execute the PUT first, then the DELETE.
        // In our example, we will end up with an IndexPart referencing a non-existent object.
        //
        // 1. a future image layer is created and uploaded
        // 2. ps restart
        // 3. the future layer from (1) is deleted during load layer map
        // 4. image layer is re-created and uploaded
        // 5. deletion queue would like to delete (1) but actually deletes (4)
        // 6. delete by name works as expected, but it now deletes the wrong (later) version
        //
        // See https://github.com/neondatabase/neon/issues/5878
        //
        // NB: generation numbers naturally protect against this because they disambiguate
        //     (1) and (4)
        // TODO: this is basically a no-op now, should we remove it?
        self.remote_client.schedule_barrier()?;
        // TenantShard::create_timeline will wait for these uploads to happen before returning, or
        // on retry.

        info!(
            "loaded layer map with {} layers at {}, total physical size: {}",
            num_layers, disk_consistent_lsn, total_physical_size
        );

        timer.stop_and_record();
        Ok(())
    }

    /// Retrieve current logical size of the timeline.
///
/// The size could be lagging behind the actual number, in case
/// the initial size calculation has not been run (gets triggered on the first size access).
///
/// The variant of the returned value (`Exact` / `Approximate`) tells the caller whether
/// the size is exact. On shards other than shard zero this always returns an approximate
/// size of zero.
///
/// If the size is still approximate and `priority` is `User`, the initial size calculation
/// task is asked to skip the background concurrency limiter.
pub(crate) fn get_current_logical_size(
    self: &Arc<Self>,
    priority: GetLogicalSizePriority,
    ctx: &RequestContext,
) -> logical_size::CurrentLogicalSize {
    if !self.tenant_shard_id.is_shard_zero() {
        // Logical size is only accurately maintained on shard zero: when called elsewhere, for example
        // when HTTP API is serving a GET for timeline zero, return zero
        return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero());
    }

    let current_size = self.current_logical_size.current_size();
    debug!("Current size: {current_size:?}");

    match (current_size.accuracy(), priority) {
        (logical_size::Accuracy::Exact, _) => (), // nothing to do
        (logical_size::Accuracy::Approximate, GetLogicalSizePriority::Background) => {
            // background task will eventually deliver an exact value, we're in no rush
        }
        (logical_size::Accuracy::Approximate, GetLogicalSizePriority::User) => {
            // background task is not ready, but user is asking for it now;
            // => make the background task skip the line
            // (The alternative would be to calculate the size here, but,
            //  it can actually take a long time if the user has a lot of rels.
            //  And we'll inevitable need it again; So, let the background task do the work.)
            match self
                .current_logical_size
                .cancel_wait_for_background_loop_concurrency_limit_semaphore
                .get()
            {
                Some(cancel) => cancel.cancel(),
                None => {
                    match self.current_state() {
                        TimelineState::Broken { .. } | TimelineState::Stopping => {
                            // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
                            // Don't make noise.
                        }
                        TimelineState::Loading => {
                            // Import does not return an activated timeline.
                            info!(
                                "discarding priority boost for logical size calculation because timeline is not yet active"
                            );
                        }
                        TimelineState::Active => {
                            // activation should be setting the once cell
                            warn!(
                                "unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"
                            );
                            debug_assert!(false);
                        }
                    }
                }
            }
        }
    }

    if let CurrentLogicalSize::Approximate(_) = &current_size {
        if ctx.task_kind() == TaskKind::WalReceiverConnectionHandler {
            // Count each timeline at most once: only the call that flips the flag from
            // false to true bumps the metric.
            let first = self
                .current_logical_size
                .did_return_approximate_to_walreceiver
                .compare_exchange(
                    false,
                    true,
                    AtomicOrdering::Relaxed,
                    AtomicOrdering::Relaxed,
                )
                .is_ok();
            if first {
                crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE.inc();
            }
        }
    }

    current_size
}

/// Spawn the background task that computes the initial logical size.
///
/// If the timeline has no `initial_part_end` there is nothing to compute: the size must
/// already be exact (asserted) and waiters on `initialized` are released immediately.
///
/// # Panics
///
/// Panics if called more than once for the same `Timeline` object, because the
/// cancellation token used for priority boosting can only be set once.
fn spawn_initial_logical_size_computation_task(self: &Arc<Self>, ctx: &RequestContext) {
    let Some(initial_part_end) = self.current_logical_size.initial_part_end else {
        // nothing to do for freshly created timelines;
        assert_eq!(
            self.current_logical_size.current_size().accuracy(),
            logical_size::Accuracy::Exact,
        );
        self.current_logical_size.initialized.add_permits(1);
        return;
    };

    // Cancelling this token makes the task stop waiting for a background-loop permit;
    // see `get_current_logical_size`.
    let cancel_wait_for_background_loop_concurrency_limit_semaphore = CancellationToken::new();
    let token = cancel_wait_for_background_loop_concurrency_limit_semaphore.clone();
    self.current_logical_size
        .cancel_wait_for_background_loop_concurrency_limit_semaphore.set(token)
        .expect("initial logical size calculation task must be spawned exactly once per Timeline object");

    let self_clone = Arc::clone(self);
    let background_ctx = ctx.detached_child(
        TaskKind::InitialLogicalSizeCalculation,
        DownloadBehavior::Download,
    );
    task_mgr::spawn(
        task_mgr::BACKGROUND_RUNTIME.handle(),
        task_mgr::TaskKind::InitialLogicalSizeCalculation,
        self.tenant_shard_id,
        Some(self.timeline_id),
        "initial size calculation",
        // NB: don't log errors here, task_mgr will do that.
        async move {
            self_clone
                .initial_logical_size_calculation_task(
                    initial_part_end,
                    cancel_wait_for_background_loop_concurrency_limit_semaphore,
                    background_ctx,
                )
                .await;
            Ok(())
        }
        .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id)),
    );
}

/// Compute the initial logical size, retrying on read/decode errors, and publish it.
///
/// Each attempt waits for a background concurrency permit unless
/// `skip_concurrency_limiter` is cancelled. Failed attempts are retried after roughly one
/// hour (with +/- 10 minutes of jitter). Waiters on `initialized` are released when this
/// function returns, whatever the outcome.
///
/// # Cancellation
///
/// This method is sensitive to `Timeline::cancel`.
///
/// It is _not_ sensitive to task_mgr::shutdown_token().
///
/// # Cancel-Safety
///
/// It does Timeline IO, hence this should be polled to completion because
/// we could be leaving in-flight IOs behind, which is safe, but annoying
/// to reason about.
async fn initial_logical_size_calculation_task(
    self: Arc<Self>,
    initial_part_end: Lsn,
    skip_concurrency_limiter: CancellationToken,
    background_ctx: RequestContext,
) {
    scopeguard::defer! {
        // Irrespective of the outcome of this operation, we should unblock anyone waiting for it.
        self.current_logical_size.initialized.add_permits(1);
    }

    let try_once = |attempt: usize| {
        // Borrow rather than move, so that the closure can be called again on retry.
        let background_ctx = &background_ctx;
        let self_ref = &self;
        let skip_concurrency_limiter = &skip_concurrency_limiter;
        async move {
            let wait_for_permit = super::tasks::acquire_concurrency_permit(
                BackgroundLoopKind::InitialLogicalSizeCalculation,
                background_ctx,
            );

            use crate::metrics::initial_logical_size::StartCircumstances;
            let (_maybe_permit, circumstances) = tokio::select! {
                permit = wait_for_permit => {
                    (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit)
                }
                _ = self_ref.cancel.cancelled() => {
                    return Err(CalculateLogicalSizeError::Cancelled);
                }
                () = skip_concurrency_limiter.cancelled() => {
                    // Some action that is part of a end user interaction requested logical size
                    // => break out of the rate limit
                    // TODO: ideally we'd not run on BackgroundRuntime but the requester's runtime;
                    // but then again what happens if they cancel; also, we should just be using
                    // one runtime across the entire process, so, let's leave this for now.
                    (None, StartCircumstances::SkippedConcurrencyLimiter)
                }
            };

            let metrics_guard = if attempt == 1 {
                crate::metrics::initial_logical_size::START_CALCULATION.first(circumstances)
            } else {
                crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
            };

            let io_concurrency = IoConcurrency::spawn_from_conf(
                self_ref.conf.get_vectored_concurrent_io,
                self_ref
                    .gate
                    .enter()
                    .map_err(|_| CalculateLogicalSizeError::Cancelled)?,
            );

            let calculated_size = self_ref
                .logical_size_calculation_task(
                    initial_part_end,
                    LogicalSizeCalculationCause::Initial,
                    background_ctx,
                )
                .await?;

            self_ref
                .trigger_aux_file_size_computation(
                    initial_part_end,
                    background_ctx,
                    io_concurrency,
                )
                .await?;

            // TODO: add aux file size to logical size

            Ok((calculated_size, metrics_guard))
        }
    };

    // `Continue` carries the successful result; `Break` means we were cancelled.
    let retrying = async {
        let mut attempt = 0;
        loop {
            attempt += 1;

            match try_once(attempt).await {
                Ok(res) => return ControlFlow::Continue(res),
                Err(CalculateLogicalSizeError::Cancelled) => return ControlFlow::Break(()),
                Err(
                    e @ (CalculateLogicalSizeError::Decode(_)
                    | CalculateLogicalSizeError::PageRead(_)),
                ) => {
                    warn!(attempt, "initial size calculation failed: {e:?}");
                    // exponential back-off doesn't make sense at these long intervals;
                    // use fixed retry interval with generous jitter instead
                    let sleep_duration = Duration::from_secs(
                        u64::try_from(
                            // 1hour base
                            (60_i64 * 60_i64)
                                // 10min jitter
                                + rand::rng().random_range(-10 * 60..10 * 60),
                        )
                        .expect("10min < 1hour"),
                    );
                    tokio::select! {
                        _ = tokio::time::sleep(sleep_duration) => {}
                        _ = self.cancel.cancelled() => return ControlFlow::Break(()),
                    }
                }
            }
        }
    };

    let (calculated_size, metrics_guard) = match retrying.await {
        ControlFlow::Continue(calculated_size) => calculated_size,
        ControlFlow::Break(()) => return,
    };

    // we cannot query current_logical_size.current_size() to know the current
    // *negative* value, only truncated to u64.
    let added = self
        .current_logical_size
        .size_added_after_initial
        .load(AtomicOrdering::Relaxed);

    let sum = calculated_size.saturating_add_signed(added);

    // set the gauge value before it can be set in `update_current_logical_size`.
    self.metrics.current_logical_size_gauge.set(sum);

    self.current_logical_size
        .initial_logical_size
        .set((calculated_size, metrics_guard.calculation_result_saved()))
        .ok()
        .expect("only this task sets it");
}

/// Spawn a background task that calculates the logical size at `lsn`.
///
/// The result (or error) is delivered through the returned receiver. If the receiver
/// has been dropped by the time the task finishes, the result is discarded.
pub(crate) fn spawn_ondemand_logical_size_calculation(
    self: &Arc<Self>,
    lsn: Lsn,
    cause: LogicalSizeCalculationCause,
    ctx: RequestContext,
) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
    let (sender, receiver) = oneshot::channel();
    let self_clone = Arc::clone(self);
    // XXX if our caller loses interest, i.e., ctx is cancelled,
    // we should stop the size calculation work and return an error.
    // That would require restructuring this function's API to
    // return the result directly, instead of a Receiver for the result.
    let ctx = ctx.detached_child(
        TaskKind::OndemandLogicalSizeCalculation,
        DownloadBehavior::Download,
    );
    task_mgr::spawn(
        task_mgr::BACKGROUND_RUNTIME.handle(),
        task_mgr::TaskKind::OndemandLogicalSizeCalculation,
        self.tenant_shard_id,
        Some(self.timeline_id),
        "ondemand logical size calculation",
        async move {
            let res = self_clone
                .logical_size_calculation_task(lsn, cause, &ctx)
                .await;
            let _ = sender.send(res).ok();
            Ok(()) // Receiver is responsible for handling errors
        }
        .in_current_span(),
    );
    receiver
}

/// Enter the timeline gate and calculate the logical size at `lsn`.
///
/// Returns `CalculateLogicalSizeError::Cancelled` if the gate can no longer be entered.
#[instrument(skip_all)]
async fn logical_size_calculation_task(
    self: &Arc<Self>,
    lsn: Lsn,
    cause: LogicalSizeCalculationCause,
    ctx: &RequestContext,
) -> Result<u64, CalculateLogicalSizeError> {
    crate::span::debug_assert_current_span_has_tenant_and_timeline_id();
    // We should never be calculating logical sizes on shard !=0, because these shards do not have
    // accurate relation sizes, and they do not emit consumption metrics.
    debug_assert!(self.tenant_shard_id.is_shard_zero());

    let guard = self
        .gate
        .enter()
        .map_err(|_| CalculateLogicalSizeError::Cancelled)?;

    self.calculate_logical_size(lsn, cause, &guard, ctx).await
}

/// Calculate the logical size of the database at `up_to_lsn`.
///
/// If the initial size calculation already produced a value for this LSN, that value is
/// returned without redoing the work. `cause` only selects which histogram the
/// calculation time is recorded in.
///
/// NOTE: counted incrementally, includes ancestors. This can be a slow operation,
/// especially if we need to download remote layers.
async fn calculate_logical_size(
    &self,
    up_to_lsn: Lsn,
    cause: LogicalSizeCalculationCause,
    _guard: &GateGuard,
    ctx: &RequestContext,
) -> Result<u64, CalculateLogicalSizeError> {
    info!(
        "Calculating logical size for timeline {} at {}",
        self.timeline_id, up_to_lsn
    );

    if let Err(()) = pausable_failpoint!("timeline-calculate-logical-size-pause", &self.cancel)
    {
        return Err(CalculateLogicalSizeError::Cancelled);
    }

    // See if we've already done the work for initial size calculation.
    // This is a short-cut for timelines that are mostly unused.
    if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) {
        return Ok(size);
    }
    let storage_time_metrics = match cause {
        LogicalSizeCalculationCause::Initial
        | LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize
        | LogicalSizeCalculationCause::TenantSizeHandler => &self.metrics.logical_size_histo,
        LogicalSizeCalculationCause::EvictionTaskImitation => {
            &self.metrics.imitate_logical_size_histo
        }
    };
    let timer = storage_time_metrics.start_timer();
    let logical_size = self
        .get_current_logical_size_non_incremental(up_to_lsn, ctx)
        .await?;
    debug!("calculated logical size: {logical_size}");
    timer.stop_and_record();
    Ok(logical_size)
}

/// Update current logical size, adding `delta` to the old value.
fn update_current_logical_size(&self, delta: i64) {
    let logical_size = &self.current_logical_size;
    logical_size.increment_size(delta);

    // Also set the value in the prometheus gauge. Note that
    // there is a race condition here: if this is called by two
    // threads concurrently, the prometheus gauge might be set to
    // one value while current_logical_size is set to the
    // other.
    match logical_size.current_size() {
        CurrentLogicalSize::Exact(ref new_current_size) => self
            .metrics
            .current_logical_size_gauge
            .set(new_current_size.into()),
        CurrentLogicalSize::Approximate(_) => {
            // don't update the gauge yet, this allows us not to update the gauge back and
            // forth between the initial size calculation task.
        }
    }
}

/// Apply `count` to the directory-entry counter of `kind` and refresh the gauge.
///
/// `Set` stores the value and marks the counter as initialized. `Add` and `Sub` are
/// ignored until the counter has been initialized by a `Set`; `Sub` saturates at zero.
/// The gauge is created only once a threshold is crossed; below the thresholds it is
/// updated only if it already exists.
pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: MetricsUpdate) {
    // TODO: this directory metrics is not correct -- we could have multiple reldirs in the system
    // for each of the database, but we only store one value, and therefore each pgdirmodification
    // would overwrite the previous value if they modify different databases.

    match count {
        MetricsUpdate::Set(count) => {
            self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed);
            self.directory_metrics_inited[kind.offset()].store(true, AtomicOrdering::Relaxed);
        }
        MetricsUpdate::Add(count) => {
            // TODO: the check of the init flag and the update are not one atomic step; but we
            // only have one writer to the metrics, so it's fine.
            if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) {
                // The metrics has been initialized with `MetricsUpdate::Set` before, so we can add/sub
                // the value reliably.
                self.directory_metrics[kind.offset()].fetch_add(count, AtomicOrdering::Relaxed);
            }
            // Otherwise, ignore this update
        }
        MetricsUpdate::Sub(count) => {
            // TODO: the check of the init flag and the update are not one atomic step; but we
            // only have one writer to the metrics, so it's fine.
            if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) {
                // The metrics has been initialized with `MetricsUpdate::Set` before.
                // The operation could underflow, so saturate at zero. `fetch_update` makes
                // the read-modify-write a single atomic step; the closure always returns
                // `Some`, so the update cannot fail and the result can be ignored.
                let _ = self.directory_metrics[kind.offset()].fetch_update(
                    AtomicOrdering::Relaxed,
                    AtomicOrdering::Relaxed,
                    |prev_val| Some(prev_val.saturating_sub(count)),
                );
            }
            // Otherwise, ignore this update
        }
    };

    // TODO: remove this, there's no place in the code that updates this aux metrics.
    let aux_metric =
        self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed);

    let sum_of_entries = self
        .directory_metrics
        .iter()
        .map(|v| v.load(AtomicOrdering::Relaxed))
        .sum();
    // Set a high general threshold and a lower threshold for the auxiliary files,
    // as we can have large numbers of relations in the db directory.
    const SUM_THRESHOLD: u64 = 5000;
    const AUX_THRESHOLD: u64 = 1000;
    if sum_of_entries >= SUM_THRESHOLD || aux_metric >= AUX_THRESHOLD {
        self.metrics
            .directory_entries_count_gauge
            .set(sum_of_entries);
    } else if let Some(metric) = Lazy::get(&self.metrics.directory_entries_count_gauge) {
        // Below the thresholds: do not force creation of the lazy gauge, but keep it
        // up to date if it was created earlier.
        metric.set(sum_of_entries);
    }
}

/// Look up a historic layer by name.
///
/// Returns `Ok(None)` if no historic layer has that name, and an error if the layer
/// manager has been shut down.
async fn find_layer(
    &self,
    layer_name: &LayerName,
) -> Result<Option<Layer>, layer_manager::Shutdown> {
    let guard = self
        .layers
        .read(LayerManagerLockHolder::GetLayerMapInfo)
        .await;
    let layer = guard
        .layer_map()?
        .iter_historic_layers()
        .find(|l| &l.layer_name() == layer_name)
        .map(|found| guard.get_from_desc(&found));
    Ok(layer)
}

/// Decide whether the currently stored previous heatmap should be kept rather than
/// replaced by a new one ending at `new_heatmap_end_lsn`.
///
/// Returns `true` only for an `Active` previous heatmap that either has no end LSN or
/// whose end LSN is greater than `new_heatmap_end_lsn`.
pub(super) fn should_keep_previous_heatmap(&self, new_heatmap_end_lsn: Lsn) -> bool {
    let crnt = self.previous_heatmap.load();
    match crnt.as_deref() {
        Some(PreviousHeatmap::Active { end_lsn, .. }) => match end_lsn {
            Some(crnt_end_lsn) => *crnt_end_lsn > new_heatmap_end_lsn,
            None => true,
        },
        Some(PreviousHeatmap::Obsolete) => false,
        None => false,
    }
}

/// The timeline heatmap is a hint to secondary locations from the primary location,
/// indicating which layers are currently on-disk on the primary.
///
/// None is returned if the Timeline is in a state where uploading a heatmap
/// doesn't make sense, such as shutting down or initializing.  The caller
/// should treat this as a cue to simply skip doing any heatmap uploading
/// for this timeline.
pub(crate) async fn generate_heatmap(&self) -> Option<HeatMapTimeline> {
    if !self.is_active() {
        return None;
    }

    let guard = self
        .layers
        .read(LayerManagerLockHolder::GenerateHeatmap)
        .await;

    // Firstly, if there's any heatmap left over from when this location
    // was a secondary, take that into account. Keep layers that are:
    // * present in the layer map
    // * visible
    // * non-resident
    // * not evicted since we read the heatmap
    //
    // Without this, a new cold, attached location would clobber the previous
    // heatmap.
    let previous_heatmap = self.previous_heatmap.load();
    let visible_non_resident = match previous_heatmap.as_deref() {
        Some(PreviousHeatmap::Active {
            heatmap, read_at, ..
        }) => Some(heatmap.all_layers().filter_map(|hl| {
            let desc: PersistentLayerDesc = hl.name.clone().into();
            let layer = guard.try_get_from_key(&desc.key())?;

            if layer.visibility() == LayerVisibilityHint::Covered {
                return None;
            }

            if layer.is_likely_resident() {
                return None;
            }

            if layer.last_evicted_at().happened_after(*read_at) {
                return None;
            }

            Some((desc, hl.metadata.clone(), hl.access_time, hl.cold))
        })),
        Some(PreviousHeatmap::Obsolete) => None,
        None => None,
    };

    // Secondly, all currently visible, resident layers are included.
    let resident = guard.likely_resident_layers().filter_map(|layer| {
        match layer.visibility() {
            LayerVisibilityHint::Visible => {
                // Layer is visible to one or more read LSNs: eligible for inclusion in heatmap
                let last_activity_ts = layer.latest_activity();
                Some((
                    layer.layer_desc().clone(),
                    layer.metadata(),
                    last_activity_ts,
                    false, // these layers are not cold
                ))
            }
            LayerVisibilityHint::Covered => {
                // Layer is resident but unlikely to be read: not eligible for inclusion in heatmap.
                None
            }
        }
    });

    let mut layers = match visible_non_resident {
        Some(non_resident) => {
            let mut non_resident = non_resident.peekable();
            if non_resident.peek().is_none() {
                // Nothing from the previous heatmap qualifies any more: stop consulting it.
                tracing::info!(timeline_id=%self.timeline_id, "Previous heatmap now obsolete");
                self.previous_heatmap
                    .store(Some(PreviousHeatmap::Obsolete.into()));
            }

            non_resident.chain(resident).collect::<Vec<_>>()
        }
        None => resident.collect::<Vec<_>>(),
    };

    // Sort layers in order of which to download first.  For a large set of layers to download, we
    // want to prioritize those layers which are most likely to still be in the resident many minutes
    // or hours later:
    // - Cold layers go last for convenience when a human inspects the heatmap.
    // - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might
    //   only exist for a few minutes before being compacted into L1s.
    // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
    //   the layer is likely to be covered by an image layer during compaction.
    //
    // The key is wrapped in `Reverse`, so the *largest* tuple sorts first. Each component
    // must therefore be `true`/large for layers that should come first: hence `!cold`
    // (a bare `cold` would put cold layers first) and `!is_l0`.
    layers.sort_by_key(|(desc, _meta, _atime, cold)| {
        std::cmp::Reverse((
            !*cold,
            !LayerMap::is_l0(&desc.key_range, desc.is_delta),
            desc.lsn_range.end,
        ))
    });

    let layers = layers
        .into_iter()
        .map(|(desc, meta, atime, cold)| {
            HeatMapLayer::new(desc.layer_name(), meta, atime, cold)
        })
        .collect();

    Some(HeatMapTimeline::new(self.timeline_id, layers))
}

/// Build the heatmap used when a timeline is unarchived.
///
/// It contains every visible layer whose LSN range starts below `end_lsn`; each entry is
/// marked cold and gets the current time as its access time. The result is returned as
/// an `Active` previous heatmap with `end_lsn` recorded.
pub(super) async fn generate_unarchival_heatmap(&self, end_lsn: Lsn) -> PreviousHeatmap {
    let guard = self
        .layers
        .read(LayerManagerLockHolder::GenerateHeatmap)
        .await;

    let now = SystemTime::now();
    let mut heatmap_layers = Vec::default();
    for vl in guard.visible_layers() {
        if vl.layer_desc().get_lsn_range().start >= end_lsn {
            continue;
        }

        let hl = HeatMapLayer {
            name: vl.layer_desc().layer_name(),
            metadata: vl.metadata(),
            access_time: now,
            cold: true,
        };
        heatmap_layers.push(hl);
    }

    tracing::info!(
        "Generating unarchival heatmap with {} layers",
        heatmap_layers.len()
    );

    let heatmap = HeatMapTimeline::new(self.timeline_id, heatmap_layers);
    PreviousHeatmap::Active {
        heatmap,
        read_at: Instant::now(),
        end_lsn: Some(end_lsn),
    }
}

/// Returns true if the given lsn is or was an ancestor branchpoint.
pub(crate) fn is_ancestor_lsn(&self, lsn: Lsn) -> bool {
    // upon timeline detach, we set the ancestor_lsn to Lsn::INVALID and the store the original
    // branchpoint in the value in IndexPart::lineage
    self.ancestor_lsn == lsn
        || (self.ancestor_lsn == Lsn::INVALID
            && self.remote_client.is_previous_ancestor_lsn(lsn))
}
}

#[derive(Clone)]
/// Type representing a query in the ([`Lsn`], [`Key`]) space.
/// In other words, a set of segments in a 2D space.
/// /// This representation has the advatange of avoiding hash map /// allocations for uniform queries. pub(crate) enum VersionedKeySpaceQuery { /// Variant for queries at a single [`Lsn`] Uniform { keyspace: KeySpace, lsn: Lsn }, /// Variant for queries at multiple [`Lsn`]s Scattered { keyspaces_at_lsn: Vec<(Lsn, KeySpace)>, }, } impl VersionedKeySpaceQuery { pub(crate) fn uniform(keyspace: KeySpace, lsn: Lsn) -> Self { Self::Uniform { keyspace, lsn } } pub(crate) fn scattered(keyspaces_at_lsn: Vec<(Lsn, KeySpace)>) -> Self { Self::Scattered { keyspaces_at_lsn } } /// Returns the most recent (largest) LSN included in the query. /// If any of the LSNs included in the query are invalid, returns /// an error instead. fn high_watermark_lsn(&self) -> Result { match self { Self::Uniform { lsn, .. } => { if !lsn.is_valid() { return Err(GetVectoredError::InvalidLsn(*lsn)); } Ok(*lsn) } Self::Scattered { keyspaces_at_lsn } => { let mut max_lsn = None; for (lsn, _keyspace) in keyspaces_at_lsn.iter() { if !lsn.is_valid() { return Err(GetVectoredError::InvalidLsn(*lsn)); } max_lsn = std::cmp::max(max_lsn, Some(lsn)); } if let Some(computed) = max_lsn { Ok(*computed) } else { Err(GetVectoredError::Other(anyhow!("empty input"))) } } } } /// Returns the total keyspace being queried: the result of projecting /// everything in the key dimensions onto the key axis. fn total_keyspace(&self) -> KeySpace { match self { Self::Uniform { keyspace, .. } => keyspace.clone(), Self::Scattered { keyspaces_at_lsn } => keyspaces_at_lsn .iter() .map(|(_lsn, keyspace)| keyspace) .fold(KeySpace::default(), |mut acc, v| { acc.merge(v); acc }), } } /// Returns LSN for a specific key. /// /// Invariant: requested key must be part of [`Self::total_keyspace`] pub(super) fn map_key_to_lsn(&self, key: &Key) -> Lsn { match self { Self::Uniform { lsn, .. 
} => *lsn, Self::Scattered { keyspaces_at_lsn } => { keyspaces_at_lsn .iter() .find(|(_lsn, keyspace)| keyspace.contains(key)) .expect("Returned key was requested") .0 } } } /// Remove any parts of the query (segments) which overlap with the provided /// key space (also segments). fn remove_overlapping_with(&mut self, to_remove: &KeySpace) -> KeySpace { match self { Self::Uniform { keyspace, .. } => keyspace.remove_overlapping_with(to_remove), Self::Scattered { keyspaces_at_lsn } => { let mut removed_accum = KeySpaceRandomAccum::new(); keyspaces_at_lsn.iter_mut().for_each(|(_lsn, keyspace)| { let removed = keyspace.remove_overlapping_with(to_remove); removed_accum.add_keyspace(removed); }); removed_accum.to_keyspace() } } } fn is_empty(&self) -> bool { match self { Self::Uniform { keyspace, .. } => keyspace.is_empty(), Self::Scattered { keyspaces_at_lsn } => keyspaces_at_lsn .iter() .all(|(_lsn, keyspace)| keyspace.is_empty()), } } /// "Lower" the query on the LSN dimension fn lower(&mut self, to: Lsn) { match self { Self::Uniform { lsn, .. } => { // If the originally requested LSN is smaller than the starting // LSN of the ancestor we are descending into, we need to respect that. // Hence the min. *lsn = std::cmp::min(*lsn, to); } Self::Scattered { keyspaces_at_lsn } => { keyspaces_at_lsn.iter_mut().for_each(|(lsn, _keyspace)| { *lsn = std::cmp::min(*lsn, to); }); } } } } impl std::fmt::Display for VersionedKeySpaceQuery { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "[")?; match self { VersionedKeySpaceQuery::Uniform { keyspace, lsn } => { write!(f, "{keyspace} @ {lsn}")?; } VersionedKeySpaceQuery::Scattered { keyspaces_at_lsn } => { for (lsn, keyspace) in keyspaces_at_lsn.iter() { write!(f, "{keyspace} @ {lsn},")?; } } } write!(f, "]") } } impl Timeline { #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// /// The algorithm is as follows: /// 1. 
/// While some keys are still not done and there's a timeline to visit:
/// 2. Visit the timeline (see [`Timeline::get_vectored_reconstruct_data_timeline`]:
/// 2.1: Build the fringe for the current keyspace
/// 2.2 Visit the newest layer from the fringe to collect all values for the range it
///      intersects
/// 2.3. Pop the timeline from the fringe
/// 2.4. If the fringe is empty, go back to 1
///
/// The loop below ends in one of three ways: the query becomes empty (success), the
/// current timeline has no ancestor, or some key was covered by an image layer that
/// did not contain it. In the latter two cases the remaining keys, minus `SPARSE_RANGE`,
/// are reported as [`GetVectoredError::MissingKey`]. Cancellation of `self.cancel` is
/// reported as [`GetVectoredError::Cancelled`].
///
/// Panics if `query.high_watermark_lsn()` cannot be unwrapped.
async fn get_vectored_reconstruct_data(
    &self,
    mut query: VersionedKeySpaceQuery,
    reconstruct_state: &mut ValuesReconstructState,
    ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
    // Captured up front because `query` is lowered on every ancestor hop; only used
    // for the error report at the bottom.
    let original_hwm_lsn = query.high_watermark_lsn().unwrap();

    // `timeline_owned` keeps the ancestor `Arc` alive while `timeline` borrows from it.
    let mut timeline_owned: Arc;
    let mut timeline = self;

    // The loop yields `None` when everything was retrieved, or `Some(keyspace)` with
    // the keys that could not be found.
    let missing_keyspace = loop {
        if self.cancel.is_cancelled() {
            return Err(GetVectoredError::Cancelled);
        }

        let TimelineVisitOutcome {
            completed_keyspace: completed,
            image_covered_keyspace,
        } = {
            let ctx = RequestContextBuilder::from(ctx)
                .perf_span(|crnt_perf_span| {
                    info_span!(
                        target: PERF_TRACE_TARGET,
                        parent: crnt_perf_span,
                        "PLAN_IO_TIMELINE",
                        timeline = %timeline.timeline_id,
                        high_watermark_lsn = %query.high_watermark_lsn().unwrap(),
                    )
                })
                .attached_child();

            Self::get_vectored_reconstruct_data_timeline(
                timeline,
                &query,
                reconstruct_state,
                &self.cancel,
                &ctx,
            )
            .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
            .await?
        };

        query.remove_overlapping_with(&completed);

        // Do not descend into the ancestor timeline for aux files.
        // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
        // stalling compaction.
        query.remove_overlapping_with(&KeySpace {
            ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()],
        });

        // Keyspace is fully retrieved
        if query.is_empty() {
            break None;
        }

        let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else {
            // Not fully retrieved but no ancestor timeline.
            break Some(query.total_keyspace());
        };

        // Now we see if there are keys covered by the image layer but do not exist in the
        // image layer, which means that the key does not exist.

        // The block below will stop the vectored search if any of the keys encountered an image layer
        // which did not contain a snapshot for said key. Since we have already removed all completed
        // keys from `query`, we expect there to be no overlap between it and the image covered key
        // space. If that's not the case, we had at least one key encounter a gap in the image layer
        // and stop the search as a result of that.
        let mut removed = query.remove_overlapping_with(&image_covered_keyspace);
        // Do not fire missing key error and end early for sparse keys. Note that we have already removed
        // non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of
        // figuring out what is the inherited key range and do a fine-grained pruning.
        removed.remove_overlapping_with(&KeySpace {
            ranges: vec![SPARSE_RANGE],
        });
        if !removed.is_empty() {
            break Some(removed);
        }

        // Each key range in the original query is at some point in the LSN space.
        // When descending into the ancestor, lower all ranges in the LSN space
        // such that new changes on the parent timeline are not visible.
        query.lower(timeline.ancestor_lsn);

        let ctx = RequestContextBuilder::from(ctx)
            .perf_span(|crnt_perf_span| {
                info_span!(
                    target: PERF_TRACE_TARGET,
                    parent: crnt_perf_span,
                    "GET_ANCESTOR",
                    timeline = %timeline.timeline_id,
                    ancestor = %ancestor_timeline.timeline_id,
                    ancestor_lsn = %timeline.ancestor_lsn
                )
            })
            .attached_child();

        // Wait for the ancestor to be usable, then continue the search on it.
        timeline_owned = timeline
            .get_ready_ancestor_timeline(ancestor_timeline, &ctx)
            .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
            .await?;
        timeline = &*timeline_owned;
    };

    // Remove sparse keys from the keyspace so that it doesn't fire errors.
    let missing_keyspace = if let Some(missing_keyspace) = missing_keyspace {
        let mut missing_keyspace = missing_keyspace;
        missing_keyspace.remove_overlapping_with(&KeySpace {
            ranges: vec![SPARSE_RANGE],
        });
        if missing_keyspace.is_empty() {
            None
        } else {
            Some(missing_keyspace)
        }
    } else {
        None
    };

    if let Some(missing_keyspace) = missing_keyspace {
        // `timeline` is the last timeline visited, so `ancestor_lsn` below is that
        // timeline's branch point, not necessarily `self`'s.
        return Err(GetVectoredError::MissingKey(Box::new(MissingKeyError {
            keyspace: missing_keyspace, /* better if we can store the full keyspace */
            shard: self.shard_identity.number,
            original_hwm_lsn,
            ancestor_lsn: Some(timeline.ancestor_lsn),
            backtrace: None,
            // Hand the recorded read path over to the error, leaving the default behind.
            read_path: std::mem::take(&mut reconstruct_state.read_path),
            query: None,
        })));
    }

    Ok(())
}

/// Build the initial [`LayerFringe`] for `query` on this timeline.
///
/// Takes the layer manager read lock and calls `update_search_fringe` once per
/// (keyspace, LSN) pair in the query. The lock guard is dropped on return.
async fn get_vectored_init_fringe(
    &self,
    query: &VersionedKeySpaceQuery,
) -> Result {
    let mut fringe = LayerFringe::new();
    let guard = self.layers.read(LayerManagerLockHolder::GetPage).await;

    match query {
        VersionedKeySpaceQuery::Uniform { keyspace, lsn } => {
            // LSNs requested by the compute or determined by the pageserver
            // are inclusive. Queries to the layer map use exclusive LSNs.
            // Hence, bump the value before the query - same in the other
            // match arm.
            let cont_lsn = Lsn(lsn.0 + 1);
            guard.update_search_fringe(keyspace, cont_lsn, &mut fringe)?;
        }
        VersionedKeySpaceQuery::Scattered { keyspaces_at_lsn } => {
            for (lsn, keyspace) in keyspaces_at_lsn.iter() {
                let cont_lsn_for_keyspace = Lsn(lsn.0 + 1);
                guard.update_search_fringe(keyspace, cont_lsn_for_keyspace, &mut fringe)?;
            }
        }
    }

    Ok(fringe)
}

/// Collect the reconstruct data for a keyspace from the specified timeline.
///
/// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect
/// the current keyspace. The current keyspace of the search at any given timeline
/// is the original keyspace minus all the keys that have been completed minus
/// any keys for which we couldn't find an intersecting layer.
/// It's not tracked explicitly,
/// but if you merge all the keyspaces in the fringe, you get the "current keyspace".
///
/// This is basically a depth-first search visitor implementation where a vertex
/// is the (layer, lsn range, key space) tuple. The fringe acts as the stack.
///
/// At each iteration pop the top of the fringe (the layer with the highest Lsn)
/// and get all the required reconstruct data from the layer in one go.
///
/// Returns the completed keyspace and the keyspaces with image coverage. The caller
/// decides how to deal with these two keyspaces.
async fn get_vectored_reconstruct_data_timeline(
    timeline: &Timeline,
    query: &VersionedKeySpaceQuery,
    reconstruct_state: &mut ValuesReconstructState,
    cancel: &CancellationToken,
    ctx: &RequestContext,
) -> Result {
    // Prevent GC from progressing while visiting the current timeline.
    // If we are GC-ing because a new image layer was added while traversing
    // the timeline, then it will remove layers that are required for fulfilling
    // the current get request (read-path cannot "look back" and notice the new
    // image layer).
    let _gc_cutoff_holder = timeline.get_applied_gc_cutoff_lsn();

    // See `compaction::compact_with_gc` for why we need this.
    let _guard = timeline.gc_compaction_layer_update_lock.read().await;

    // Initialize the fringe
    let mut fringe = timeline.get_vectored_init_fringe(query).await?;

    let mut completed_keyspace = KeySpace::default();
    let mut image_covered_keyspace = KeySpaceRandomAccum::new();

    while let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
        if cancel.is_cancelled() {
            return Err(GetVectoredError::Cancelled);
        }

        // Record the visit for diagnostics if a read path is being collected.
        if let Some(ref mut read_path) = reconstruct_state.read_path {
            read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range);
        }

        // Visit the layer and plan IOs for it
        // (`lsn_range` is moved into the call, so remember its start first).
        let next_cont_lsn = lsn_range.start;
        layer_to_read
            .get_values_reconstruct_data(
                keyspace_to_read.clone(),
                lsn_range,
                reconstruct_state,
                ctx,
            )
            .await?;

        // Keys of this layer's keyspace that still need older layers; the search for
        // them continues below the start LSN of the layer just visited.
        let mut unmapped_keyspace = keyspace_to_read;
        let cont_lsn = next_cont_lsn;

        reconstruct_state.on_layer_visited(&layer_to_read);

        let (keys_done_last_step, keys_with_image_coverage) =
            reconstruct_state.consume_done_keys();
        unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
        completed_keyspace.merge(&keys_done_last_step);
        if let Some(keys_with_image_coverage) = keys_with_image_coverage {
            // Keys under image coverage are not searched any further on this timeline.
            unmapped_keyspace
                .remove_overlapping_with(&KeySpace::single(keys_with_image_coverage.clone()));
            image_covered_keyspace.add_range(keys_with_image_coverage);
        }

        // Query the layer map for the next layers to read.
        //
        // Do not descend any further if the last layer we visited
        // completed all keys in the keyspace it inspected. This is not
        // required for correctness, but avoids visiting extra layers
        // which turns out to be a perf bottleneck in some cases.
        if !unmapped_keyspace.is_empty() {
            let guard = timeline.layers.read(LayerManagerLockHolder::GetPage).await;
            guard.update_search_fringe(&unmapped_keyspace, cont_lsn, &mut fringe)?;

            // It's safe to drop the layer map lock after planning the next round of reads.
            // The fringe keeps readable handles for the layers which are safe to read even
            // if layers were compacted or flushed.
            //
            // The more interesting consideration is: "Why is the read algorithm still correct
            // if the layer map changes while it is operating?". Doing a vectored read on a
            // timeline boils down to pushing an imaginary lsn boundary downwards for each range
            // covered by the read. The layer map tells us how to move the lsn downwards for a
            // range at *a particular point in time*. It is fine for the answer to be different
            // at two different time points.
            drop(guard);
        }
    }

    Ok(TimelineVisitOutcome {
        completed_keyspace,
        image_covered_keyspace: image_covered_keyspace.consume_keyspace(),
    })
}

/// Wait until `ancestor` is active and has reached `self.ancestor_lsn`, then return
/// a clone of its `Arc`.
///
/// Error mapping, as implemented below:
/// - ancestor in `TimelineState::Stopping`, or `WaitLsnError::Shutdown`
///   => [`GetReadyAncestorError::Cancelled`]
/// - any other non-active state, or `WaitLsnError::BadState`
///   => [`GetReadyAncestorError::BadState`]
/// - `WaitLsnError::Timeout` => [`GetReadyAncestorError::AncestorLsnTimeout`]
async fn get_ready_ancestor_timeline(
    &self,
    ancestor: &Arc,
    ctx: &RequestContext,
) -> Result, GetReadyAncestorError> {
    // It's possible that the ancestor timeline isn't active yet, or
    // is active but hasn't yet caught up to the branch point. Wait
    // for it.
    //
    // This cannot happen while the pageserver is running normally,
    // because you cannot create a branch from a point that isn't
    // present in the pageserver yet. However, we don't wait for the
    // branch point to be uploaded to cloud storage before creating
    // a branch. I.e., the branch LSN need not be remote consistent
    // for the branching operation to succeed.
    //
    // Hence, if we try to load a tenant in such a state where
    // 1. the existence of the branch was persisted (in IndexPart and/or locally)
    // 2. but the ancestor state is behind branch_lsn because it was not yet persisted
    // then we will need to wait for the ancestor timeline to
    // re-stream WAL up to branch_lsn before we access it.
    //
    // How can a tenant get in such a state?
    // - ungraceful pageserver process exit
    // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219
    //
    // NB: this could be avoided by requiring
    //   branch_lsn >= remote_consistent_lsn
    // during branch creation.
    match ancestor.wait_to_become_active(ctx).await {
        Ok(()) => {}
        Err(TimelineState::Stopping) => {
            // If an ancestor is stopping, it means the tenant is stopping: handle this the same as if this timeline was stopping.
            return Err(GetReadyAncestorError::Cancelled);
        }
        Err(state) => {
            return Err(GetReadyAncestorError::BadState {
                timeline_id: ancestor.timeline_id,
                state,
            });
        }
    }
    // The ancestor is active; now wait for it to reach our branch point.
    ancestor
        .wait_lsn(
            self.ancestor_lsn,
            WaitLsnWaiter::Timeline(self),
            WaitLsnTimeout::Default,
            ctx,
        )
        .await
        .map_err(|e| match e {
            e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
            WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled,
            WaitLsnError::BadState(state) => GetReadyAncestorError::BadState {
                timeline_id: ancestor.timeline_id,
                state,
            },
        })?;

    Ok(ancestor.clone())
}

/// Returns a reference to this timeline's shard identity.
pub(crate) fn get_shard_identity(&self) -> &ShardIdentity {
    &self.shard_identity
}

/// Builds the [`ShardTimelineId`] of this timeline from its shard number,
/// shard count and timeline id.
#[inline(always)]
pub(crate) fn shard_timeline_id(&self) -> ShardTimelineId {
    ShardTimelineId {
        shard_index: ShardIndex {
            shard_number: self.shard_identity.number,
            shard_count: self.shard_identity.count,
        },
        timeline_id: self.timeline_id,
    }
}

/// Returns a non-frozen open in-memory layer for ingestion.
///
/// Takes a witness of timeline writer state lock being held, because it makes no sense to call
/// this function without holding the mutex.
///
/// Fails if `lsn` is not strictly greater than the current last record LSN, or if
/// the layer manager can no longer be opened for modification (`open_mut()` fails).
async fn get_layer_for_write(
    &self,
    lsn: Lsn,
    _guard: &tokio::sync::MutexGuard<'_, Option>,
    ctx: &RequestContext,
) -> anyhow::Result> {
    let mut guard = self
        .layers
        .write(LayerManagerLockHolder::GetLayerForWrite)
        .await;

    // Checked while holding the layer manager write lock.
    let last_record_lsn = self.get_last_record_lsn();
    ensure!(
        lsn > last_record_lsn,
        "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
        lsn,
        last_record_lsn,
    );

    let layer = guard
        .open_mut()?
        .get_layer_for_write(
            lsn,
            self.conf,
            self.timeline_id,
            self.tenant_shard_id,
            &self.gate,
            &self.cancel,
            ctx,
        )
        .await?;
    Ok(layer)
}

/// Publish `new_lsn` as the last record LSN: updates the `last_record_lsn` gauge and
/// advances `self.last_record_lsn`.
///
/// Panics if `new_lsn` is not aligned.
pub(crate) fn finish_write(&self, new_lsn: Lsn) {
    assert!(new_lsn.is_aligned());

    self.metrics.last_record_lsn_gauge.set(new_lsn.0 as i64);
    self.last_record_lsn.advance(new_lsn);
}

/// Freeze any existing open in-memory layer and unconditionally notify the flush loop.
///
/// Unconditional flush loop notification is given because in sharded cases we will want to
/// leave an Lsn gap. Unsharded tenants do not have Lsn gaps.
///
/// Returns the flush request counter assigned to this request (never 0), which can be
/// passed to [`Self::wait_flush_completion`]. Returns `FlushLayerError::NotRunning` if
/// the flush loop is not in the `Running` state.
async fn freeze_inmem_layer_at(
    &self,
    at: Lsn,
    write_lock: &mut tokio::sync::MutexGuard<'_, Option>,
) -> Result {
    let frozen = {
        let mut guard = self
            .layers
            .write(LayerManagerLockHolder::TryFreezeLayer)
            .await;
        guard
            .open_mut()?
            .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock, &self.metrics)
            .await
    };

    // Only bump the freeze timestamp if a layer was actually frozen.
    if frozen {
        let now = Instant::now();
        *(self.last_freeze_ts.write().unwrap()) = now;
    }

    // Increment the flush cycle counter and wake up the flush task.
    // Remember the new value, so that when we listen for the flush
    // to finish, we know when the flush that we initiated has
    // finished, instead of some other flush that was started earlier.
    let mut my_flush_request = 0;

    // Copy the state out so the mutex is released immediately.
    let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
    if !matches!(flush_loop_state, FlushLoopState::Running { .. }) {
        return Err(FlushLayerError::NotRunning(flush_loop_state));
    }

    self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
        my_flush_request = *counter + 1;
        *counter = my_flush_request;
        // The requested flush LSN never moves backwards.
        *lsn = std::cmp::max(at, *lsn);
    });

    assert_ne!(my_flush_request, 0);

    Ok(my_flush_request)
}

/// Layer flusher task's main loop.
///
/// Waits for flush requests on `layer_flush_start_rx`, flushes frozen layers until the
/// requested LSN is reached (or no frozen layer is left), and publishes the outcome on
/// `layer_flush_done_tx`. Applies backpressure based on the number of L0 layers.
async fn flush_loop(
    self: &Arc,
    mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>,
    ctx: &RequestContext,
) {
    // Always notify waiters about the flush loop exiting since the loop might stop
    // when the timeline hasn't been cancelled.
    let scopeguard_rx = layer_flush_start_rx.clone();
    scopeguard::defer! {
        let (flush_counter, _) = *scopeguard_rx.borrow();
        let _ = self
            .layer_flush_done_tx
            .send_replace((flush_counter, Err(FlushLayerError::Cancelled)));
    }

    // Subscribe to L0 delta layer updates, for compaction backpressure.
    let mut watch_l0 = match self
        .layers
        .read(LayerManagerLockHolder::FlushLoop)
        .await
        .layer_map()
    {
        Ok(lm) => lm.watch_level0_deltas(),
        Err(Shutdown) => return,
    };

    info!("started flush loop");
    loop {
        tokio::select! {
            _ = self.cancel.cancelled() => {
                info!("shutting down layer flush task due to Timeline::cancel");
                break;
            },
            _ = layer_flush_start_rx.changed() => {}
        }
        trace!("waking up");
        let (flush_counter, frozen_to_lsn) = *layer_flush_start_rx.borrow();

        // The highest LSN to which we flushed in the loop over frozen layers
        let mut flushed_to_lsn = Lsn(0);

        // Inner loop: flush frozen layers one at a time. It evaluates to the result
        // that is reported to waiters for this `flush_counter`.
        let result = loop {
            if self.cancel.is_cancelled() {
                info!("dropping out of flush loop for timeline shutdown");
                return;
            }

            // Break to notify potential waiters as soon as we've flushed the requested LSN. If
            // more requests have arrived in the meanwhile, we'll resume flushing afterwards.
            if flushed_to_lsn >= frozen_to_lsn {
                break Ok(());
            }

            // Fetch the next layer to flush, if any.
            let (layer, l0_count, frozen_count, frozen_size, open_layer_size) = {
                let layers = self.layers.read(LayerManagerLockHolder::FlushLoop).await;
                let Ok(lm) = layers.layer_map() else {
                    info!("dropping out of flush loop for timeline shutdown");
                    return;
                };
                let l0_count = lm.level0_deltas().len();
                let frozen_count = lm.frozen_layers.len();
                let frozen_size: u64 = lm
                    .frozen_layers
                    .iter()
                    .map(|l| l.estimated_in_mem_size())
                    .sum();
                let open_layer_size: u64 = lm
                    .open_layer
                    .as_ref()
                    .map(|l| l.estimated_in_mem_size())
                    .unwrap_or(0);
                let layer = lm.frozen_layers.front().cloned();
                (layer, l0_count, frozen_count, frozen_size, open_layer_size)
                // drop 'layers' lock
            };
            let Some(layer) = layer else {
                break Ok(());
            };

            // Stall flushes to backpressure if compaction can't keep up. This is propagated up
            // to WAL ingestion by having ephemeral layer rolls wait for flushes.
            if let Some(stall_threshold) = self.get_l0_flush_stall_threshold() {
                if l0_count >= stall_threshold {
                    warn!(
                        "stalling layer flushes for compaction backpressure at {l0_count} \
                         L0 layers ({frozen_count} frozen layers with {frozen_size} bytes, {open_layer_size} bytes in open layer)"
                    );
                    let stall_timer = self
                        .metrics
                        .flush_delay_histo
                        .start_timer()
                        .record_on_drop();
                    tokio::select! {
                        result = watch_l0.wait_for(|l0| *l0 < stall_threshold) => {
                            if let Ok(l0) = result.as_deref() {
                                let delay = stall_timer.elapsed().as_secs_f64();
                                info!("resuming layer flushes at {l0} L0 layers after {delay:.3}s");
                            }
                        },
                        _ = self.cancel.cancelled() => {},
                    }
                    continue; // check again
                }
            }

            // Flush the layer.
            let flush_timer = self.metrics.flush_time_histo.start_timer();
            match self.flush_frozen_layer(layer, ctx).await {
                Ok(layer_lsn) => flushed_to_lsn = max(flushed_to_lsn, layer_lsn),
                Err(FlushLayerError::Cancelled) => {
                    info!("dropping out of flush loop for timeline shutdown");
                    return;
                }
                err @ Err(
                    FlushLayerError::NotRunning(_)
                    | FlushLayerError::Other(_)
                    | FlushLayerError::CreateImageLayersError(_),
                ) => {
                    error!("could not flush frozen layer: {err:?}");
                    // Report the error to waiters, discarding the (absent) Ok payload.
                    break err.map(|_| ());
                }
            }
            let flush_duration = flush_timer.stop_and_record();

            // Notify the tenant compaction loop if L0 compaction is needed.
            let l0_count = *watch_l0.borrow();
            if l0_count >= self.get_compaction_threshold() {
                self.l0_compaction_trigger.notify_one();
            }

            // Delay the next flush to backpressure if compaction can't keep up. We delay by the
            // flush duration such that the flush takes 2x as long. This is propagated up to WAL
            // ingestion by having ephemeral layer rolls wait for flushes.
            if let Some(delay_threshold) = self.get_l0_flush_delay_threshold() {
                if l0_count >= delay_threshold {
                    let delay = flush_duration.as_secs_f64();
                    info!(
                        "delaying layer flush by {delay:.3}s for compaction backpressure at \
                         {l0_count} L0 layers ({frozen_count} frozen layers with {frozen_size} bytes, {open_layer_size} bytes in open layer)"
                    );
                    let _delay_timer = self
                        .metrics
                        .flush_delay_histo
                        .start_timer()
                        .record_on_drop();
                    // The delay ends early if the L0 count drops below the threshold or
                    // the timeline is cancelled.
                    tokio::select! {
                        _ = tokio::time::sleep(flush_duration) => {},
                        _ = watch_l0.wait_for(|l0| *l0 < delay_threshold) => {},
                        _ = self.cancel.cancelled() => {},
                    }
                }
            }
        };

        // Unsharded tenants should never advance their LSN beyond the end of the
        // highest layer they write: such gaps between layer data and the frozen LSN
        // are only legal on sharded tenants.
        debug_assert!(
            self.shard_identity.count.count() > 1
                || flushed_to_lsn >= frozen_to_lsn
                || !flushed_to_lsn.is_valid()
        );

        if flushed_to_lsn < frozen_to_lsn
            && self.shard_identity.count.count() > 1
            && result.is_ok()
        {
            // If our layer flushes didn't carry disk_consistent_lsn up to the `frozen_to_lsn`
            // advertised to us via layer_flush_start_rx, then advance it here.
            //
            // This path is only taken for tenants with multiple shards: single sharded tenants should
            // never encounter a gap in the wal.
            let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
            tracing::debug!(
                "Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}"
            );
            if self.set_disk_consistent_lsn(frozen_to_lsn) {
                // No new layer files here; only the metadata needs uploading. A failure
                // is logged and otherwise ignored.
                if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) {
                    tracing::warn!(
                        "Failed to schedule metadata upload after updating disk_consistent_lsn: {e}"
                    );
                }
            }
        }

        // Notify any listeners that we're done
        let _ = self
            .layer_flush_done_tx
            .send_replace((flush_counter, result));
    }
}

/// Waits for any flush request created by [`Self::freeze_inmem_layer_at`] to complete.
///
/// Returns the error published by the flush loop if the flush failed. Note that this
/// also returns `Ok(())` if the timeline is cancelled while waiting, without the
/// flush having completed.
async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> {
    let mut rx = self.layer_flush_done_tx.subscribe();
    loop {
        // Scope the borrow so it is released before awaiting below.
        {
            let (last_result_counter, last_result) = &*rx.borrow();
            if *last_result_counter >= request {
                if let Err(err) = last_result {
                    // We already logged the original error in
                    // flush_loop. The value lives inside the watch
                    // channel, so hand the caller a clone of it.
                    return Err(err.clone());
                } else {
                    return Ok(());
                }
            }
        }
        trace!("waiting for flush to complete");
        tokio::select! {
            rx_e = rx.changed() => {
                // `changed()` failing is reported as the flush loop not running.
                rx_e.map_err(|_| FlushLayerError::NotRunning(*self.flush_loop_state.lock().unwrap()))?;
            },
            // Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring
            // the notification from [`flush_loop`] that it completed.
            _ = self.cancel.cancelled() => {
                tracing::info!("Cancelled layer flush due on timeline shutdown");
                return Ok(())
            }
        };
        trace!("done")
    }
}

/// Flush one frozen in-memory layer to disk, as a new delta layer.
///
/// As a special case, a layer covering exactly `initdb_lsn..initdb_lsn + 1` is written
/// out as image layers instead (see the comment at the top of the body).
///
/// Return value is the last lsn (inclusive) of the layer that was frozen.
#[instrument(skip_all, fields(layer=%frozen_layer))]
async fn flush_frozen_layer(
    self: &Arc,
    frozen_layer: Arc,
    ctx: &RequestContext,
) -> Result {
    debug_assert_current_span_has_tenant_and_timeline_id();

    // As a special case, when we have just imported an image into the repository,
    // instead of writing out a L0 delta layer, we directly write out image layer
    // files instead. This is possible as long as *all* the data imported into the
    // repository have the same LSN.
    let lsn_range = frozen_layer.get_lsn_range();

    // Whether to directly create image layers for this flush, or flush them as delta layers
    let create_image_layer =
        lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1);

    // Test-only bookkeeping: count initdb optimizations and check the test's expectation.
    #[cfg(test)]
    {
        match &mut *self.flush_loop_state.lock().unwrap() {
            FlushLoopState::NotStarted | FlushLoopState::Exited => {
                panic!("flush loop not running")
            }
            FlushLoopState::Running {
                expect_initdb_optimization,
                initdb_optimization_count,
                ..
            } => {
                if create_image_layer {
                    *initdb_optimization_count += 1;
                } else {
                    assert!(!*expect_initdb_optimization, "expected initdb optimization");
                }
            }
        }
    }

    let (layers_to_upload, delta_layer_to_add) = if create_image_layer {
        // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
        // require downloading anything during initial import.
        let ((rel_partition, metadata_partition), _lsn) = self
            .repartition(
                self.initdb_lsn,
                self.get_compaction_target_size(),
                EnumSet::empty(),
                ctx,
            )
            .await
            .map_err(|e| FlushLayerError::from_anyhow(self, e.into_anyhow()))?;

        if self.cancel.is_cancelled() {
            return Err(FlushLayerError::Cancelled);
        }

        // Ensure that we have a single call to `create_image_layers` with a combined dense keyspace.
        // So that the key ranges don't overlap.
        let mut partitions = KeyPartitioning::default();
        partitions.parts.extend(rel_partition.parts);
        if !metadata_partition.parts.is_empty() {
            assert_eq!(
                metadata_partition.parts.len(),
                1,
                "currently sparse keyspace should only contain a single metadata keyspace"
            );
            // Safety: create_image_layers treat sparse keyspaces differently that it does not scan
            // every single key within the keyspace, and therefore, it's safe to force converting it
            // into a dense keyspace before calling this function.
            partitions
                .parts
                .extend(metadata_partition.into_dense().parts);
        }

        let mut layers_to_upload = Vec::new();
        let (generated_image_layers, is_complete) = self
            .create_image_layers(
                &partitions,
                self.initdb_lsn,
                None,
                ImageLayerCreationMode::Initial,
                ctx,
                LastImageLayerCreationStatus::Initial,
                false, // don't yield for L0, we're flushing L0
            )
            .instrument(info_span!("create_image_layers", mode = %ImageLayerCreationMode::Initial, partition_mode = "initial", lsn = %self.initdb_lsn))
            .await?;
        debug_assert!(
            matches!(is_complete, LastImageLayerCreationStatus::Complete),
            "init image generation mode must fully cover the keyspace"
        );
        layers_to_upload.extend(generated_image_layers);

        // No delta layer is produced on this path.
        (layers_to_upload, None)
    } else {
        // Normal case, write out a L0 delta layer file.
        // `create_delta_layer` will not modify the layer map.
        // We will remove frozen layer and add delta layer in one atomic operation later.
        let Some(layer) = self
            .create_delta_layer(&frozen_layer, None, ctx)
            .await
            .map_err(|e| FlushLayerError::from_anyhow(self, e))?
        else {
            panic!("delta layer cannot be empty if no filter is applied");
        };
        (
            // FIXME: even though we have a single image and single delta layer assumption
            // we push them to vec
            vec![layer.clone()],
            Some(layer),
        )
    };

    pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable");

    if self.cancel.is_cancelled() {
        return Err(FlushLayerError::Cancelled);
    }

    fail_point!("flush-layer-before-update-remote-consistent-lsn", |_| {
        Err(FlushLayerError::Other(anyhow!("failpoint").into()))
    });

    // `lsn_range.end` is exclusive; the last LSN contained in the layer is one below it.
    let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);

    // The new layer file(s) are on disk now. Under the layer manager write lock, hand
    // the new delta layer (if any) and the frozen in-memory layer to
    // `finish_flush_l0_layer`.
    // NOTE(review): presumably this removes the frozen layer and adds the delta layer
    // in one step, as described in the delta branch above; confirm in the layer manager.
    {
        let mut guard = self
            .layers
            .write(LayerManagerLockHolder::FlushFrozenLayer)
            .await;

        guard.open_mut()?.finish_flush_l0_layer(
            delta_layer_to_add.as_ref(),
            &frozen_layer,
            &self.metrics,
        );

        if self.set_disk_consistent_lsn(disk_consistent_lsn) {
            // Schedule remote uploads that will reflect our new disk_consistent_lsn
            self.schedule_uploads(disk_consistent_lsn, layers_to_upload)
                .map_err(|e| FlushLayerError::from_anyhow(self, e))?;
        }
        // release lock on 'layers'
    };

    // FIXME: between create_delta_layer and the scheduling of the upload in `schedule_uploads`,
    // a compaction can delete the file and then it won't be available for uploads any more.
    // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
    // race situation.
    // See https://github.com/neondatabase/neon/issues/4526

    pausable_failpoint!("flush-frozen-pausable");

    // This failpoint is used by another test case `test_pageserver_recovery`.
    fail_point!("flush-frozen-exit");

    // Same value as `disk_consistent_lsn` computed above.
    Ok(Lsn(lsn_range.end.0 - 1))
}

/// Return true if the value changed
///
/// This function must only be used from the layer flush task.
fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { let old_value = self.disk_consistent_lsn.fetch_max(new_value); assert!( new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}" ); self.metrics .disk_consistent_lsn_gauge .set(new_value.0 as i64); new_value != old_value } /// Update metadata file fn schedule_uploads( &self, disk_consistent_lsn: Lsn, layers_to_upload: impl IntoIterator, ) -> anyhow::Result<()> { // We can only save a valid 'prev_record_lsn' value on disk if we // flushed *all* in-memory changes to disk. We only track // 'prev_record_lsn' in memory for the latest processed record, so we // don't remember what the correct value that corresponds to some old // LSN is. But if we flush everything, then the value corresponding // current 'last_record_lsn' is correct and we can store it on disk. let RecordLsn { last: last_record_lsn, prev: prev_record_lsn, } = self.last_record_lsn.load(); let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn { Some(prev_record_lsn) } else { None }; let update = crate::tenant::metadata::MetadataUpdate::new( disk_consistent_lsn, ondisk_prev_record_lsn, *self.applied_gc_cutoff_lsn.read(), ); fail_point!("checkpoint-before-saving-metadata", |x| bail!( "{}", x.unwrap() )); for layer in layers_to_upload { self.remote_client.schedule_layer_file_upload(layer)?; } self.remote_client .schedule_index_upload_for_metadata_update(&update)?; Ok(()) } pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> { self.remote_client .preserve_initdb_archive( &self.tenant_shard_id.tenant_id, &self.timeline_id, &self.cancel, ) .await } // Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked // in layer map immediately. The caller is responsible to put it into the layer map. 
async fn create_delta_layer( self: &Arc, frozen_layer: &Arc, key_range: Option>, ctx: &RequestContext, ) -> anyhow::Result> { let self_clone = Arc::clone(self); let frozen_layer = Arc::clone(frozen_layer); let ctx = ctx.attached_child(); let work = async move { let Some((desc, path)) = frozen_layer .write_to_disk( &ctx, key_range, self_clone.l0_flush_global_state.inner(), &self_clone.gate, self_clone.cancel.clone(), ) .await? else { return Ok(None); }; let new_delta = Layer::finish_creating(self_clone.conf, &self_clone, desc, &path)?; // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. // // We use fatal_err() below because the after write_to_disk returns with success, // the in-memory state of the filesystem already has the layer file in its final place, // and subsequent pageserver code could think it's durable while it really isn't. let timeline_dir = VirtualFile::open( &self_clone .conf .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id), &ctx, ) .await .fatal_err("VirtualFile::open for timeline dir fsync"); timeline_dir .sync_all() .await .fatal_err("VirtualFile::sync_all timeline dir"); anyhow::Ok(Some(new_delta)) }; // Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking. // Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`. 
use crate::virtual_file::io_engine::IoEngine; match crate::virtual_file::io_engine::get() { IoEngine::NotSet => panic!("io engine not set"), IoEngine::StdFs => { let span = tracing::info_span!("blocking"); tokio::task::spawn_blocking({ move || Handle::current().block_on(work.instrument(span)) }) .await .context("spawn_blocking") .and_then(|x| x) } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => work.await, } } async fn repartition( &self, lsn: Lsn, partition_size: u64, flags: EnumSet, ctx: &RequestContext, ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), RepartitionError> { let Ok(mut guard) = self.partitioning.try_write_guard() else { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. return Err(RepartitionError::Other(anyhow!( "repartition() called concurrently" ))); }; let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read(); if lsn < *partition_lsn { return Err(RepartitionError::Other(anyhow!( "repartition() called with LSN going backwards, this should not happen" ))); } let distance = lsn.0 - partition_lsn.0; if *partition_lsn != Lsn(0) && distance <= self.repartition_threshold && !flags.contains(CompactFlags::ForceRepartition) { debug!( distance, threshold = self.repartition_threshold, "no repartitioning needed" ); return Ok(( (dense_partition.clone(), sparse_partition.clone()), *partition_lsn, )); } let (dense_ks, sparse_ks) = self .collect_keyspace(lsn, ctx) .await .map_err(RepartitionError::CollectKeyspace)?; let dense_partitioning = dense_ks.partition( &self.shard_identity, partition_size, postgres_ffi::BLCKSZ as u64, ); let sparse_partitioning = SparseKeyPartitioning { parts: vec![sparse_ks], }; // no partitioning for metadata keys for now let result = ((dense_partitioning, 
sparse_partitioning), lsn); guard.write(result.clone()); Ok(result) } // Is it time to create a new image layer for the given partition? True if we want to generate. async fn time_for_new_image_layer( &self, partition: &KeySpace, lsn: Lsn, force_image_creation_lsn: Option, ) -> bool { let threshold = self.get_image_creation_threshold(); let guard = self.layers.read(LayerManagerLockHolder::Compaction).await; let Ok(layers) = guard.layer_map() else { return false; }; let mut min_image_lsn: Lsn = Lsn::MAX; let mut max_deltas = 0; for part_range in &partition.ranges { let image_coverage = layers.image_coverage(part_range, lsn); for (img_range, last_img) in image_coverage { let img_lsn = if let Some(last_img) = last_img { last_img.get_lsn_range().end } else { Lsn(0) }; // Let's consider an example: // // delta layer with LSN range 71-81 // delta layer with LSN range 81-91 // delta layer with LSN range 91-101 // image layer at LSN 100 // // If 'lsn' is still 100, i.e. no new WAL has been processed since the last image layer, // there's no need to create a new one. We check this case explicitly, to avoid passing // a bogus range to count_deltas below, with start > end. It's even possible that there // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed // after we read last_record_lsn, which is passed here in the 'lsn' argument. 
if img_lsn < lsn { let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold)); max_deltas = max_deltas.max(num_deltas); if num_deltas >= threshold { debug!( "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", img_range.start, img_range.end, num_deltas, img_lsn, lsn ); return true; } } min_image_lsn = min(min_image_lsn, img_lsn); } } // HADRON // for child timelines, we consider all pages up to ancestor_LSN are redone successfully by the parent timeline min_image_lsn = min_image_lsn.max(self.get_ancestor_lsn()); if min_image_lsn < force_image_creation_lsn.unwrap_or(Lsn(0)) && max_deltas > 0 { info!( "forcing image creation for partitioned range {}-{}. Min image LSN: {}, force image creation LSN: {}, num deltas: {}", partition.ranges[0].start, partition.ranges[0].end, min_image_lsn, force_image_creation_lsn.unwrap(), max_deltas ); return true; } debug!( max_deltas, "none of the partitioned ranges had >= {threshold} deltas" ); false } /// Create image layers for Postgres data. Assumes the caller passes a partition that is not too large, /// so that at most one image layer will be produced from this function. #[allow(clippy::too_many_arguments)] async fn create_image_layer_for_rel_blocks( self: &Arc, partition: &KeySpace, mut image_layer_writer: ImageLayerWriter, lsn: Lsn, ctx: &RequestContext, img_range: Range, io_concurrency: IoConcurrency, progress: Option<(usize, usize)>, ) -> Result { let mut wrote_keys = false; let mut key_request_accum = KeySpaceAccum::new(); for range in &partition.ranges { let mut key = range.start; while key < range.end { // Decide whether to retain this key: usually we do, but sharded tenants may // need to drop keys that don't belong to them. 
If we retain the key, add it // to `key_request_accum` for later issuing a vectored get if self.shard_identity.is_key_disposable(&key) { debug!( "Dropping key {} during compaction (it belongs on shard {:?})", key, self.shard_identity.get_shard_number(&key) ); } else { key_request_accum.add_key(key); } let last_key_in_range = key.next() == range.end; key = key.next(); // Maybe flush `key_rest_accum` if key_request_accum.raw_size() >= self.conf.max_get_vectored_keys.get() as u64 || (last_key_in_range && key_request_accum.raw_size() > 0) { let query = VersionedKeySpaceQuery::uniform(key_request_accum.consume_keyspace(), lsn); let results = self .get_vectored(query, io_concurrency.clone(), ctx) .await?; if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); } for (img_key, img) in results { let img = match img { Ok(img) => img, Err(err) => { // If we fail to reconstruct a VM or FSM page, we can zero the // page without losing any actual user data. That seems better // than failing repeatedly and getting stuck. // // We had a bug at one point, where we truncated the FSM and VM // in the pageserver, but the Postgres didn't know about that // and continued to generate incremental WAL records for pages // that didn't exist in the pageserver. Trying to replay those // WAL records failed to find the previous image of the page. // This special case allows us to recover from that situation. // See https://github.com/neondatabase/neon/issues/2601. // // Unfortunately we cannot do this for the main fork, or for // any metadata keys, keys, as that would lead to actual data // loss. if img_key.is_rel_fsm_block_key() || img_key.is_rel_vm_block_key() { warn!( "could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}" ); ZERO_PAGE.clone() } else { return Err(CreateImageLayersError::from(err)); } } }; // Write all the keys we just read into our new image layer. 
image_layer_writer.put_image(img_key, img, ctx).await?;
                        wrote_keys = true;
                    }
                }
            }
        }

        let progress_report = progress
            .map(|(idx, total)| format!("({idx}/{total}) "))
            .unwrap_or_default();
        if wrote_keys {
            // Normal path: we have written some data into the new image layer for this
            // partition, so hand the still-unfinished writer back to the caller.
            info!(
                "{} produced image layer for rel {}",
                progress_report,
                ImageLayerName {
                    key_range: img_range.clone(),
                    lsn
                },
            );
            Ok(ImageLayerCreationOutcome::Generated {
                unfinished_image_layer: image_layer_writer,
            })
        } else {
            tracing::debug!(
                "{} no data in range {}-{}",
                progress_report,
                img_range.start,
                img_range.end
            );
            Ok(ImageLayerCreationOutcome::Empty)
        }
    }

    /// Create an image layer for metadata keys. This function produces one image layer for all metadata
    /// keys for now. Because metadata keys cannot exceed the basebackup size limit, the resulting image
    /// layer should not be too large to fit in a single image layer.
    ///
    /// Creating image layers for metadata keys is different from relational keys. Firstly, instead of
    /// iterating over each key and getting an image for each of them, we do a `vectored_get` scan over the
    /// sparse keyspace to get all images in one run. Secondly, we use a different image layer generation
    /// metric for metadata keys than for relational keys: the number of delta files visited during the scan.
    ///
    /// Outcomes:
    /// - `Skip`: `mode` is `Try` and fewer than `MAX_AUX_FILE_V2_DELTAS` delta layers were visited.
    /// - `Generated`: at least one non-empty value was written to `image_layer_writer`, which is returned
    ///   unfinished.
    /// - `Empty`: every retrieved value was empty (a tombstone), so nothing was written.
    ///
    /// Errors: `CreateImageLayersError::Cancelled` if the timeline's cancellation token is set once the
    /// scan has completed; failures of the scan, of an individual value, or of `put_image` are propagated
    /// with `?`.
    #[allow(clippy::too_many_arguments)]
    async fn create_image_layer_for_metadata_keys(
        self: &Arc,
        partition: &KeySpace,
        mut image_layer_writer: ImageLayerWriter,
        lsn: Lsn,
        ctx: &RequestContext,
        img_range: Range,
        mode: ImageLayerCreationMode,
        io_concurrency: IoConcurrency,
    ) -> Result {
        // Metadata keys image layer creation.
        let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
        let begin = Instant::now();
        // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should
        // not contain too many keys, otherwise this takes a lot of memory.
        let data = self
            .get_vectored_impl(
                VersionedKeySpaceQuery::uniform(partition.clone(), lsn),
                &mut reconstruct_state,
                ctx,
            )
            .await?;
        // Unwrap the per-key results (failing on the first error) while collecting statistics for
        // the log line below. The size accumulator counts bytes inside the loop and is converted
        // to KiB only when the tuple is built.
        let (data, total_kb_retrieved, total_keys_retrieved) = {
            let mut new_data = BTreeMap::new();
            let mut total_kb_retrieved = 0;
            let mut total_keys_retrieved = 0;
            for (k, v) in data {
                let v = v?;
                total_kb_retrieved += KEY_SIZE + v.len();
                total_keys_retrieved += 1;
                new_data.insert(k, v);
            }
            (new_data, total_kb_retrieved / 1024, total_keys_retrieved)
        };
        let delta_files_accessed = reconstruct_state.get_delta_layers_visited();
        let elapsed = begin.elapsed();

        // The scan itself tells us how many delta layers a read has to visit; that count is the
        // trigger for generating a new metadata image layer.
        let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
        info!(
            "metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s",
            elapsed.as_secs_f64()
        );

        // Only `Try` mode honours the trigger; any other mode always proceeds to write.
        if !trigger_generation && mode == ImageLayerCreationMode::Try {
            return Ok(ImageLayerCreationOutcome::Skip);
        }
        if self.cancel.is_cancelled() {
            return Err(CreateImageLayersError::Cancelled);
        }
        let mut wrote_any_image = false;
        for (k, v) in data {
            if v.is_empty() {
                // The key has been deleted, so it does not need an image:
                // in the metadata keyspace, an empty image == tombstone.
                continue;
            }
            wrote_any_image = true;

            // No need to handle sharding b/c metadata keys are always on the 0-th shard.

            // TODO: split image layers to avoid too large layer files. Too large image files are not handled
            // on the normal data path either.
            image_layer_writer.put_image(k, v, ctx).await?;
        }

        if wrote_any_image {
            // Normal path: we have written some data into the new image layer for this
            // partition, so hand the still-unfinished writer back to the caller.
            info!(
                "created image layer for metadata {}",
                ImageLayerName {
                    key_range: img_range.clone(),
                    lsn
                }
            );
            Ok(ImageLayerCreationOutcome::Generated {
                unfinished_image_layer: image_layer_writer,
            })
        } else {
            tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
            Ok(ImageLayerCreationOutcome::Empty)
        }
    }

    /// Predicate function which indicates whether we should check if new image layers
    /// are required. Since checking if new image layers are required is expensive in
    /// terms of CPU, we only do it in the following cases:
    /// 1. If the timeline has ingested sufficient WAL to justify the cost or ...
    /// 2. If enough time has passed since the last check:
    ///     1. For large tenants, we wish to perform the check more often since they
    ///        suffer from the lack of image layers. Note that we assume sharded tenants
    ///        to be large since non-zero shards do not track the logical size.
    ///     2. For small tenants (that can mostly fit in RAM), we use a much longer interval
    ///
    /// Side effects: when the answer is `true`, `lsn` and the current instant are recorded as the
    /// new "last check" markers, so calling this function is not idempotent.
    ///
    /// Panics if `lsn` is smaller than the LSN recorded by the previous positive check.
    fn should_check_if_image_layers_required(self: &Arc, lsn: Lsn) -> bool {
        let large_timeline_threshold = self.conf.image_layer_generation_large_timeline_threshold;

        let last_checks_at = self.last_image_layer_creation_check_at.load();
        let distance = lsn
            .checked_sub(last_checks_at)
            .expect("Attempt to compact with LSN going backwards");
        // WAL distance required since the last check: threshold multiplied by the checkpoint distance.
        let min_distance =
            self.get_image_layer_creation_check_threshold() as u64 * self.get_checkpoint_distance();

        let distance_based_decision = distance.0 >= min_distance;

        // The mutex guard is held until the end of the function so that the read of the last
        // check instant and its update below happen under the same lock.
        let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap();
        // Immediately-invoked closure so the nested conditions can bail out with `return`.
        let check_required_after = (|| {
            if self.shard_identity.is_unsharded() {
                if let CurrentLogicalSize::Exact(logical_size) =
                    self.current_logical_size.current_size()
                {
                    // Small unsharded timeline with a known exact size: re-check only every 48 hours.
                    if Some(Into::::into(&logical_size)) < large_timeline_threshold {
                        return Duration::from_secs(3600 * 48);
                    }
                }
            }

            // Everything else (sharded, size not exact, or at/above the threshold) uses the
            // checkpoint timeout as the re-check interval.
            self.get_checkpoint_timeout()
        })();

        let time_based_decision = match *last_check_instant {
            Some(last_check) => {
                let elapsed = last_check.elapsed();
                elapsed >= check_required_after
            }
            // Never checked before: always check.
            None => true,
        };

        // Do the expensive delta layer counting only if this timeline has ingested sufficient
        // WAL since the last check or a checkpoint timeout interval has elapsed since the last
        // check.
        let decision = distance_based_decision || time_based_decision;
        tracing::info!(
            "Decided to check image layers: {}. Distance-based decision: {}, time-based decision: {}",
            decision,
            distance_based_decision,
            time_based_decision
        );
        if decision {
            self.last_image_layer_creation_check_at.store(lsn);
            *last_check_instant = Some(Instant::now());
        }

        decision
    }

    /// Returns the image layers generated and a status indicating whether the process is fully completed:
    /// `Complete` means we have walked all partitions, `Incomplete { last_key }` means we preempted the
    /// process to let L0 compaction run; passing that status back in as `last_status` resumes after the
    /// partition containing `last_key`.
    ///
    /// - `partitioning`: the key partitions to consider, one candidate image layer per partition.
    /// - `lsn`: the LSN at which image layers are created.
    /// - `force_image_creation_lsn`: forwarded to `time_for_new_image_layer` in `Try` mode.
    /// - `mode`: `Try` skips partitions that do not need an image layer yet; `Force` additionally skips
    ///   ranges whose image layer already exists at `lsn`.
    /// - `yield_for_l0`: in `Try` mode, allow stopping early once at least one layer is pending and the
    ///   L0 compaction trigger has fired.
    ///
    /// NOTE(review): an earlier version of this comment described a `partition_mode` argument
    /// ("only for logging purpose"); no such parameter exists in the current signature.
    #[allow(clippy::too_many_arguments)]
    async fn create_image_layers(
        self: &Arc,
        partitioning: &KeyPartitioning,
        lsn: Lsn,
        force_image_creation_lsn: Option,
        mode: ImageLayerCreationMode,
        ctx: &RequestContext,
        last_status: LastImageLayerCreationStatus,
        yield_for_l0: bool,
    ) -> Result<(Vec, LastImageLayerCreationStatus), CreateImageLayersError> {
        let timer = self.metrics.create_images_time_histo.start_timer();

        if partitioning.parts.is_empty() {
            warn!("no partitions to create image layers for");
            return Ok((vec![], LastImageLayerCreationStatus::Complete));
        }

        // We need to avoid holes between generated image layers.
        // Otherwise LayerMap::image_layer_exists will return false if the key range of some layer is covered by
        // more than one image layer with a hole between them. In this case such a layer cannot be utilized by GC.
        //
        // How can such a hole between partitions appear?
        // If we have a relation with relid=1 and size 100 and a relation with relid=2 with size 200, then the
        // result of KeySpace::partition may contain partitions <100000000..100000099> and <200000000..200000199>.
        // If there is a delta layer <100000000..300000000> then it will never be garbage collected because
        // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it.
        let mut start = Key::MIN;

        // When resuming an interrupted run we always check; otherwise consult the (stateful) predicate.
        let check_for_image_layers =
            if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status {
                info!(
                    "resuming image layer creation: last_status=incomplete, continue from {}",
                    last_key
                );
                true
            } else {
                self.should_check_if_image_layers_required(lsn)
            };

        let mut batch_image_writer = BatchLayerWriter::new(self.conf);

        let mut all_generated = true;

        let mut partition_processed = 0;
        let mut total_partitions = partitioning.parts.len();
        let mut last_partition_processed = None;
        let mut partition_parts = partitioning.parts.clone();

        if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status {
            // We need to skip the partitions that have already been processed.
            let mut found = false;
            for (i, partition) in partition_parts.iter().enumerate() {
                if last_key <= partition.end().unwrap() {
                    // ```plain
                    // |------|--------|----------|------|
                    //              ^last_key
                    //                    ^start from this partition
                    // ```
                    // Why `i+1` instead of `i`?
                    // It is possible that the user did some writes after the previous image layer creation attempt so that
                    // a relation grows in size, and the last_key is now in the middle of the partition. In this case, we
                    // still want to skip this partition, so that we can make progress and avoid generating image layers over
                    // the same partition. The bounds check below ensures we don't end up with an empty vec.
                    if i + 1 >= total_partitions {
                        // In general, this case should not happen -- if last_key is on the last partition, the previous
                        // iteration of image layer creation should return a complete status.
                        break; // with found=false
                    }
                    partition_parts = partition_parts.split_off(i + 1); // Remove the first i + 1 elements
                    total_partitions = partition_parts.len();
                    // Update the start key to the partition start.
                    start = partition_parts[0].start().unwrap();
                    found = true;
                    break;
                }
            }
            if !found {
                // Last key is within the last partition, or larger than all partitions.
                return Ok((vec![], LastImageLayerCreationStatus::Complete));
            }
        }

        let total = partition_parts.len();
        for (idx, partition) in partition_parts.iter().enumerate() {
            if self.cancel.is_cancelled() {
                return Err(CreateImageLayersError::Cancelled);
            }
            partition_processed += 1;
            // The image layer starts where the previous one ended (not at the partition start),
            // which is what closes the holes described above.
            let img_range = start..partition.ranges.last().unwrap().end;
            let compact_metadata = partition.overlaps(&Key::metadata_key_range());
            if compact_metadata {
                for range in &partition.ranges {
                    assert!(
                        range.start.field1 >= METADATA_KEY_BEGIN_PREFIX
                            && range.end.field1 <= METADATA_KEY_END_PREFIX,
                        "metadata keys must be partitioned separately"
                    );
                }
                if mode == ImageLayerCreationMode::Try && !check_for_image_layers {
                    // Skip compaction if there are not enough updates. Metadata compaction will do a scan and
                    // might mess up with evictions.
                    start = img_range.end;
                    continue;
                }
                // For initial and force modes, we always generate image layers for metadata keys.
            } else if let ImageLayerCreationMode::Try = mode {
                // check_for_image_layers = false -> skip
                // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
                if !check_for_image_layers
                    || !self
                        .time_for_new_image_layer(partition, lsn, force_image_creation_lsn)
                        .await
                {
                    start = img_range.end;
                    continue;
                }
            }
            if let ImageLayerCreationMode::Force = mode {
                // When forced to create image layers, we might try and create them where they already
                // exist.  This mode is only used in tests/debug.
                let layers = self.layers.read(LayerManagerLockHolder::Compaction).await;
                if layers.contains_key(&PersistentLayerKey {
                    key_range: img_range.clone(),
                    lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
                    is_delta: false,
                }) {
                    // TODO: this can be processed with the BatchLayerWriter::finish_with_discard
                    // in the future.
                    tracing::info!(
                        "Skipping image layer at {lsn} {}..{}, already exists",
                        img_range.start,
                        img_range.end
                    );
                    start = img_range.end;
                    continue;
                }
            }

            let image_layer_writer = ImageLayerWriter::new(
                self.conf,
                self.timeline_id,
                self.tenant_shard_id,
                &img_range,
                lsn,
                &self.gate,
                self.cancel.clone(),
                ctx,
            )
            .await
            .map_err(CreateImageLayersError::Other)?;

            fail_point!("image-layer-writer-fail-before-finish", |_| {
                Err(CreateImageLayersError::Other(anyhow::anyhow!(
                    "failpoint image-layer-writer-fail-before-finish"
                )))
            });

            // Begin Hadron
            //
            // Test-only failpoint: marks the timeline as having detected corruption, then fails.
            fail_point!("create-image-layer-fail-simulated-corruption", |_| {
                self.corruption_detected
                    .store(true, std::sync::atomic::Ordering::Relaxed);
                Err(CreateImageLayersError::Other(anyhow::anyhow!(
                    "failpoint create-image-layer-fail-simulated-corruption"
                )))
            });
            // End Hadron

            // Entering the gate fails once the timeline is shutting down; report that as cancellation.
            let io_concurrency = IoConcurrency::spawn_from_conf(
                self.conf.get_vectored_concurrent_io,
                self.gate
                    .enter()
                    .map_err(|_| CreateImageLayersError::Cancelled)?,
            );

            let outcome = if !compact_metadata {
                self.create_image_layer_for_rel_blocks(
                    partition,
                    image_layer_writer,
                    lsn,
                    ctx,
                    img_range.clone(),
                    io_concurrency,
                    Some((idx, total)),
                )
                .await?
            } else {
                self.create_image_layer_for_metadata_keys(
                    partition,
                    image_layer_writer,
                    lsn,
                    ctx,
                    img_range.clone(),
                    mode,
                    io_concurrency,
                )
                .await?
            };
            match outcome {
                ImageLayerCreationOutcome::Empty => {
                    // No data in this partition, so we don't need to create an image layer (for now).
                    // The next image layer should cover this key range, so we don't advance the `start`
                    // key.
                }
                ImageLayerCreationOutcome::Generated {
                    unfinished_image_layer,
                } => {
                    batch_image_writer.add_unfinished_image_writer(
                        unfinished_image_layer,
                        img_range.clone(),
                        lsn,
                    );
                    // The next image layer should be generated right after this one.
                    start = img_range.end;
                }
                ImageLayerCreationOutcome::Skip => {
                    // We don't need to create an image layer for this partition.
                    // The next image layer should NOT cover this range, otherwise
                    // the keyspace becomes empty (reads don't go past image layers).
                    start = img_range.end;
                }
            }

            if let ImageLayerCreationMode::Try = mode {
                // We have at least made some progress
                if yield_for_l0 && batch_image_writer.pending_layer_num() >= 1 {
                    // The `Try` mode is currently only used on the compaction path. We want to avoid
                    // image layer generation taking too long time and blocking L0 compaction. So in this
                    // mode, we also inspect the current number of L0 layers and skip image layer generation
                    // if there are too many of them.
                    let image_preempt_threshold = self.get_image_creation_preempt_threshold()
                        * self.get_compaction_threshold();
                    // TODO: currently we do not respect `get_image_creation_preempt_threshold` and always yield
                    // when there is a single timeline with more than L0 threshold L0 layers. As long as the
                    // `get_image_creation_preempt_threshold` is set to a value greater than 0, we will yield for L0 compaction.
                    if image_preempt_threshold != 0 {
                        // Non-blocking poll: only yields if the trigger has already been notified.
                        let should_yield = self
                            .l0_compaction_trigger
                            .notified()
                            .now_or_never()
                            .is_some();
                        if should_yield {
                            tracing::info!(
                                "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers",
                                partition.start().unwrap(),
                                partition.end().unwrap()
                            );
                            last_partition_processed = Some(partition.clone());
                            all_generated = false;
                            break;
                        }
                    }
                }
            }
        }

        // Finish all pending writers in one batch before touching the layer map.
        let image_layers = batch_image_writer
            .finish(self, ctx)
            .await
            .map_err(CreateImageLayersError::Other)?;

        let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await;

        // FIXME: we could add the images to be uploaded *before* returning from here, but right
        // now they are being scheduled outside of write lock; current way is inconsistent with
        // compaction lock order.
        guard
            .open_mut()?
            .track_new_image_layers(&image_layers, &self.metrics);
        drop_layer_manager_wlock(guard);
        let duration = timer.stop_and_record();

        // Creating image layers may have caused some previously visible layers to be covered
        if !image_layers.is_empty() {
            self.update_layer_visibility().await?;
        }

        let total_layer_size = image_layers
            .iter()
            .map(|l| l.metadata().file_size)
            .sum::();

        if !image_layers.is_empty() {
            info!(
                "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions",
                image_layers.len(),
                total_layer_size,
                duration.as_secs_f64(),
                partition_processed,
                total_partitions
            );
        }

        Ok((
            image_layers,
            if all_generated {
                LastImageLayerCreationStatus::Complete
            } else {
                LastImageLayerCreationStatus::Incomplete {
                    last_key: if let Some(last_partition_processed) = last_partition_processed {
                        last_partition_processed.end().unwrap_or(Key::MIN)
                    } else {
                        // This branch should be unreachable, but in case it happens, we can just return the start key.
                        Key::MIN
                    },
                }
            },
        ))
    }

    /// Wait until the background initial logical size calculation is complete, or
    /// this Timeline is shut down. Calling this function will cause the initial
    /// logical size calculation to skip waiting for the background jobs barrier.
    ///
    /// Returns immediately, without waiting, on shards other than zero, when the remote client
    /// reports the timeline as deleting, when the logical size is already exact, or when the
    /// timeline has already been cancelled.
    pub(crate) async fn await_initial_logical_size(self: Arc) {
        if !self.shard_identity.is_shard_zero() {
            // We don't populate logical size on shard >0: skip waiting for it.
            return;
        }

        if self.remote_client.is_deleting() {
            // The timeline was created in a deletion-resume state, we don't expect logical size to be populated
            return;
        }

        if self.current_logical_size.current_size().is_exact() {
            // root timelines are initialized with exact count, but never start the background
            // calculation
            return;
        }

        if self.cancel.is_cancelled() {
            // We already requested stopping the tenant, so we cannot wait for the logical size
            // calculation to complete given the task might have been already cancelled.
            return;
        }

        if let Some(await_bg_cancel) = self
            .current_logical_size
            .cancel_wait_for_background_loop_concurrency_limit_semaphore
            .get()
        {
            await_bg_cancel.cancel();
        } else {
            // We should not wait if we were not able to explicitly instruct
            // the logical size cancellation to skip the concurrency limit semaphore.
            // TODO: this is an unexpected case.  We should restructure so that it
            // can't happen.
            tracing::warn!(
                "await_initial_logical_size: can't get semaphore cancel token, skipping"
            );
            debug_assert!(false);
        }

        // Whichever comes first: the size becomes initialized, or the timeline is cancelled.
        tokio::select!(
            _ = self.current_logical_size.initialized.acquire() => {},
            _ = self.cancel.cancelled() => {}
        )
    }

    /// Detach this timeline from its ancestor by copying all of ancestors layers as this
    /// Timelines layers up to the ancestor_lsn.
    ///
    /// Requires a timeline that:
    /// - has an ancestor to detach from
    /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not
    ///   a technical requirement
    ///
    /// After the operation has been started, it cannot be canceled. Upon restart it needs to be
    /// polled again until completion.
    ///
    /// During the operation all timelines sharing the data with this timeline will be reparented
    /// from our ancestor to be branches of this timeline.
    ///
    /// This is a thin wrapper that delegates to `detach_ancestor::prepare`.
    pub(crate) async fn prepare_to_detach_from_ancestor(
        self: &Arc,
        tenant: &crate::tenant::TenantShard,
        options: detach_ancestor::Options,
        behavior: DetachBehavior,
        ctx: &RequestContext,
    ) -> Result {
        detach_ancestor::prepare(self, tenant, behavior, options, ctx).await
    }

    /// Second step of detach from ancestor; detaches the `self` from it's current ancestor and
    /// reparents any reparentable children of previous ancestor.
    ///
    /// This method is to be called while holding the TenantManager's tenant slot, so during this
    /// method we cannot be deleted nor can any timeline be deleted. After this method returns
    /// successfully, tenant must be reloaded.
///
    /// Final step will be to [`Self::complete_detaching_timeline_ancestor`] after optionally
    /// resetting the tenant.
    ///
    /// This is a thin wrapper that delegates to `detach_ancestor::detach_and_reparent`.
    pub(crate) async fn detach_from_ancestor_and_reparent(
        self: &Arc,
        tenant: &crate::tenant::TenantShard,
        prepared: detach_ancestor::PreparedTimelineDetach,
        ancestor_timeline_id: TimelineId,
        ancestor_lsn: Lsn,
        behavior: DetachBehavior,
        ctx: &RequestContext,
    ) -> Result {
        detach_ancestor::detach_and_reparent(
            self,
            tenant,
            prepared,
            ancestor_timeline_id,
            ancestor_lsn,
            behavior,
            ctx,
        )
        .await
    }

    /// Final step which unblocks the GC.
    ///
    /// The tenant must've been reset if ancestry was modified previously (in tenant manager).
    ///
    /// This is a thin wrapper that delegates to `detach_ancestor::complete`.
    pub(crate) async fn complete_detaching_timeline_ancestor(
        self: &Arc,
        tenant: &crate::tenant::TenantShard,
        attempt: detach_ancestor::Attempt,
        ctx: &RequestContext,
    ) -> Result<(), detach_ancestor::Error> {
        detach_ancestor::complete(self, tenant, attempt, ctx).await
    }
}

// On drop, a timeline removes its own entry from its ancestor's `gc_info` (if it has an
// ancestor) and logs that it is being dropped.
impl Drop for Timeline {
    fn drop(&mut self) {
        if let Some(ancestor) = &self.ancestor_timeline {
            // This lock should never be poisoned, but in case it is we use `if let Ok(..)` instead
            // of an unwrap(), to avoid panicking in a destructor and thereby aborting the process.
            if let Ok(mut gc_info) = ancestor.gc_info.write() {
                // A `false` return means there was no entry left to remove; report it but carry on.
                if !gc_info.remove_child_not_offloaded(self.timeline_id) {
                    tracing::error!(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id,
                        "Couldn't remove retain_lsn entry from timeline's parent on drop: already removed");
                }
            }
        }
        info!(
            "Timeline {} for tenant {} is being dropped",
            self.timeline_id, self.tenant_shard_id.tenant_id
        );
    }
}

pub(crate) use compaction_error::CompactionError;

/// In a private mod to enforce that [`CompactionError::is_cancel`] is used
/// instead of `match`ing on [`CompactionError::ShuttingDown`].
mod compaction_error {
    use utils::sync::gate::GateError;

    use crate::{
        pgdatadir_mapping::CollectKeySpaceError,
        tenant::{PageReconstructError, blob_io::WriteBlobError, upload_queue::NotInitialized},
        virtual_file::owned_buffers_io::write::FlushTaskError,
    };

    /// Top-level failure to compact. Use [`Self::is_cancel`].
    #[derive(Debug, thiserror::Error)]
    pub(crate) enum CompactionError {
        /// Use [`Self::is_cancel`] instead of checking for this variant.
        #[error("The timeline or pageserver is shutting down")]
        #[allow(private_interfaces)]
        ShuttingDown(ForbidMatching), // private ForbidMatching enforces use of [`Self::is_cancel`].
        /// Any other failure. It may still wrap a cancellation; see [`Self::is_cancel`].
        #[error(transparent)]
        Other(anyhow::Error),
    }

    /// Private marker type: code outside this module cannot name it, and therefore cannot
    /// construct `ShuttingDown` directly or match on its payload.
    #[derive(Debug)]
    struct ForbidMatching;

    impl CompactionError {
        /// Construct the cancellation/shutdown error. Because `ForbidMatching` is private to
        /// this module, this is how outside code creates a `ShuttingDown` value.
        pub fn new_cancelled() -> Self {
            Self::ShuttingDown(ForbidMatching)
        }
        /// Errors that can be ignored, i.e., cancel and shutdown.
        ///
        /// Returns `true` for `ShuttingDown`, and also for `Other` when the root cause of the
        /// wrapped `anyhow::Error` downcasts to one of the known error types and that error
        /// itself reports cancellation/stopping.
        pub fn is_cancel(&self) -> bool {
            let other = match self {
                CompactionError::ShuttingDown(_) => return true,
                CompactionError::Other(other) => other,
            };

            // The write path of compaction in particular often lacks differentiated
            // handling errors stemming from cancellation from other errors.
            // So, if requested, we also check the ::Other variant by downcasting.
            // The list below has been found empirically from flaky tests and production logs.
            // The process is simple: on ::Other(), compaction will print the enclosed
            // anyhow::Error in debug mode, i.e., with backtrace. That backtrace contains the
            // line where the write path / compaction code does undifferentiated error handling
            // from a non-anyhow type to an anyhow type. Add the type to the list of downcasts
            // below, following the same is_cancel() pattern.
            //
            // NOTE(review): the concrete type arguments of the `downcast_ref` calls below are
            // missing from this copy of the source; presumably they correspond to the error
            // types imported at the top of this module -- confirm against the original file.
            let root_cause = other.root_cause();

            let upload_queue = root_cause
                .downcast_ref::()
                .is_some_and(|e| e.is_stopping());
            let timeline = root_cause
                .downcast_ref::()
                .is_some_and(|e| e.is_cancel());
            let buffered_writer_flush_task_canelled = root_cause
                .downcast_ref::()
                .is_some_and(|e| e.is_cancel());
            let write_blob_cancelled = root_cause
                .downcast_ref::()
                .is_some_and(|e| e.is_cancel());
            let gate_closed = root_cause
                .downcast_ref::()
                .is_some_and(|e| e.is_cancel());
            upload_queue
                || timeline
                || buffered_writer_flush_task_canelled
                || write_blob_cancelled
                || gate_closed
        }
        /// Convert into an `anyhow::Error`: `Other` yields the wrapped error unchanged, while
        /// `ShuttingDown` is wrapped as a new `anyhow::Error`.
        pub fn into_anyhow(self) -> anyhow::Error {
            match self {
                CompactionError::ShuttingDown(ForbidMatching) => anyhow::Error::new(self),
                CompactionError::Other(e) => e,
            }
        }
        /// Map a keyspace-collection error: cancellations become the cancelled variant,
        /// everything else is carried in `Other`.
        pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self {
            if err.is_cancel() {
                Self::new_cancelled()
            } else {
                Self::Other(err.into_anyhow())
            }
        }
    }
}

// Upload queue state errors: `Uninitialized` is a real failure, while `ShuttingDown` and
// `Stopped` are treated as cancellation.
impl From for CompactionError {
    fn from(value: super::upload_queue::NotInitialized) -> Self {
        match value {
            super::upload_queue::NotInitialized::Uninitialized => {
                CompactionError::Other(anyhow::anyhow!(value))
            }
            super::upload_queue::NotInitialized::ShuttingDown
            | super::upload_queue::NotInitialized::Stopped => CompactionError::new_cancelled(),
        }
    }
}

// Layer download errors: timeline shutdown and download cancellation map to the cancelled
// variant; every other download failure is carried in `Other`.
impl From for CompactionError {
    fn from(e: super::storage_layer::layer::DownloadError) -> Self {
        match e {
            super::storage_layer::layer::DownloadError::TimelineShutdown
            | super::storage_layer::layer::DownloadError::DownloadCancelled => {
                CompactionError::new_cancelled()
            }
            super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads
            | super::storage_layer::layer::DownloadError::DownloadRequired
            | super::storage_layer::layer::DownloadError::NotFile(_)
            | super::storage_layer::layer::DownloadError::DownloadFailed
            | super::storage_layer::layer::DownloadError::PreStatFailed(_) => {
                CompactionError::Other(anyhow::anyhow!(e))
            }
            #[cfg(test)]
            super::storage_layer::layer::DownloadError::Failpoint(_) => {
                CompactionError::Other(anyhow::anyhow!(e))
            }
        }
    }
}

// A layer manager that has shut down always means cancellation.
impl From for CompactionError {
    fn from(_: layer_manager::Shutdown) -> Self {
        CompactionError::new_cancelled()
    }
}

// Layer write (`put`) errors: defer to the error's own `is_cancel()` classification.
impl From for CompactionError {
    fn from(e: super::storage_layer::errors::PutError) -> Self {
        if e.is_cancel() {
            CompactionError::new_cancelled()
        } else {
            CompactionError::Other(e.into_anyhow())
        }
    }
}

/// A measured duration; the `serde_as` attribute on the field selects a microsecond-based
/// representation when serialized.
#[serde_as]
#[derive(serde::Serialize)]
struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration);

/// Records durations of consecutive phases: each `Recorded` value carries the measured
/// duration together with the instant at which that measurement ended.
#[derive(Default)]
enum DurationRecorder {
    /// No measurement has been taken yet.
    #[default]
    NotStarted,
    /// A finished measurement and the instant it ended at.
    Recorded(RecordedDuration, tokio::time::Instant),
}

impl DurationRecorder {
    /// Produce a new measurement covering the time from the end of `self`'s measurement
    /// until now.
    ///
    /// Panics if `self` is `NotStarted`.
    fn till_now(&self) -> DurationRecorder {
        match self {
            DurationRecorder::NotStarted => {
                panic!("must only call on recorded measurements")
            }
            DurationRecorder::Recorded(_, ended) => {
                let now = tokio::time::Instant::now();
                DurationRecorder::Recorded(RecordedDuration(now - *ended), now)
            }
        }
    }
    /// Extract the recorded duration, or `None` if nothing was recorded.
    fn into_recorded(self) -> Option {
        match self {
            DurationRecorder::NotStarted => None,
            DurationRecorder::Recorded(recorded, _) => Some(recorded),
        }
    }
}

/// Descriptor for a delta layer used in testing infra. The start/end key/lsn range of the
/// delta layer might be different from the min/max key/lsn in the delta layer. Therefore,
/// the layer descriptor requires the user to provide the ranges, which should cover all
/// keys specified in the `data` field.
#[cfg(test)]
#[derive(Clone)]
pub struct DeltaLayerTestDesc {
    /// LSN range the layer claims to cover.
    pub lsn_range: Range,
    /// Key range the layer claims to cover.
    pub key_range: Range,
    /// The (key, LSN, value) entries stored in the layer.
    pub data: Vec<(Key, Lsn, Value)>,
}

/// Descriptor for an in-memory layer used in testing infra.
#[cfg(test)]
#[derive(Clone)]
pub struct InMemoryLayerTestDesc {
    /// LSN range of the layer.
    pub lsn_range: Range,
    /// The (key, LSN, value) entries stored in the layer.
    pub data: Vec<(Key, Lsn, Value)>,
    /// NOTE(review): presumably distinguishes the open (still writable) layer from frozen
    /// ones -- confirm against the code that consumes this descriptor.
    pub is_open: bool,
}

#[cfg(test)]
impl DeltaLayerTestDesc {
    /// Build a descriptor from explicitly provided ranges.
    pub fn new(lsn_range: Range, key_range: Range, data: Vec<(Key, Lsn, Value)>) -> Self {
        Self {
            lsn_range,
            key_range,
            data,
        }
    }

    /// Build a descriptor whose key range is derived from `data`: it starts at the smallest
    /// key and ends (exclusively) at the successor of the largest key.
    ///
    /// Panics if `data` is empty.
    pub fn new_with_inferred_key_range(
        lsn_range: Range,
        data: Vec<(Key, Lsn, Value)>,
    ) -> Self {
        let key_min = data.iter().map(|(key, _, _)| key).min().unwrap();
        let key_max = data.iter().map(|(key, _, _)| key).max().unwrap();
        Self {
            key_range: (*key_min)..(key_max.next()),
            lsn_range,
            data,
        }
    }

    /// The delta layer name corresponding to this descriptor's key and LSN ranges.
    pub(crate) fn layer_name(&self) -> LayerName {
        LayerName::Delta(super::storage_layer::DeltaLayerName {
            key_range: self.key_range.clone(),
            lsn_range: self.lsn_range.clone(),
        })
    }
}

impl Timeline {
    /// Apply the result of a compaction batch to the layer map and schedule the matching
    /// remote update.
    ///
    /// - `new_deltas`: delta layers produced by compaction. Ones already present in the layer
    ///   map are treated as duplicates and not inserted again.
    /// - `new_images`: image layers produced by compaction; tracked if non-empty.
    /// - `layers_to_remove`: compaction inputs; any whose key equals that of a duplicated
    ///   output is kept.
    ///
    /// Errors: the cancelled variant if the timeline is cancelled while waiting for the layer
    /// map write lock; `Other` if one of the (non-duplicate) outputs is an L0 layer; plus
    /// whatever `open_mut` and `schedule_compaction_update` convert into `CompactionError`.
    async fn finish_compact_batch(
        self: &Arc,
        new_deltas: &[ResidentLayer],
        new_images: &[ResidentLayer],
        layers_to_remove: &[Layer],
    ) -> Result<(), CompactionError> {
        // Don't block shutdown on the layer map lock.
        let mut guard = tokio::select! {
            guard = self.layers.write(LayerManagerLockHolder::Compaction) => guard,
            _ = self.cancel.cancelled() => {
                return Err(CompactionError::new_cancelled());
            }
        };

        let mut duplicated_layers = HashSet::new();

        let mut insert_layers = Vec::with_capacity(new_deltas.len());

        for l in new_deltas {
            if guard.contains(l.as_ref()) {
                // expected in tests
                tracing::error!(layer=%l, "duplicated L1 layer");

                // good ways to cause a duplicate: we repeatedly error after taking the writelock
                // `guard`  on self.layers. as of writing this, there are no error returns except
                // for compact_level0_phase1 creating an L0, which does not happen in practice
                // because we have not implemented L0 => L0 compaction.
                duplicated_layers.insert(l.layer_desc().key());
            } else if LayerMap::is_l0(&l.layer_desc().key_range, l.layer_desc().is_delta) {
                return Err(CompactionError::Other(anyhow::anyhow!(
                    "compaction generates a L0 layer file as output, which will cause infinite compaction."
                )));
            } else {
                insert_layers.push(l.clone());
            }
        }

        // only remove those inputs which were not outputs
        let remove_layers: Vec = layers_to_remove
            .iter()
            .filter(|l| !duplicated_layers.contains(&l.layer_desc().key()))
            .cloned()
            .collect();

        if !new_images.is_empty() {
            guard
                .open_mut()?
                .track_new_image_layers(new_images, &self.metrics);
        }

        guard
            .open_mut()?
            .finish_compact_l0(&remove_layers, &insert_layers, &self.metrics);

        // Scheduled while the layer map write lock is still held. Note that all of
        // `new_deltas` is passed here, not only `insert_layers`.
        self.remote_client
            .schedule_compaction_update(&remove_layers, new_deltas)?;

        drop_layer_manager_wlock(guard);

        Ok(())
    }

    /// Replace and/or drop layers in the layer map and schedule the matching remote update.
    ///
    /// - `replace_layers`: pairs of (existing layer, rewritten replacement).
    /// - `drop_layers`: layers to remove without replacement.
    ///
    /// Entries whose existing layer is no longer in the layer map are silently ignored.
    async fn rewrite_layers(
        self: &Arc,
        mut replace_layers: Vec<(Layer, ResidentLayer)>,
        mut drop_layers: Vec,
    ) -> Result<(), CompactionError> {
        let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await;

        // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want
        // to avoid double-removing, and avoid rewriting something that was removed.
        replace_layers.retain(|(l, _)| guard.contains(l));
        drop_layers.retain(|l| guard.contains(l));

        guard
            .open_mut()?
            .rewrite_layers(&replace_layers, &drop_layers, &self.metrics);

        // Only the replacement halves of the pairs need uploading.
        let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect();

        self.remote_client
            .schedule_compaction_update(&drop_layers, &upload_layers)?;

        Ok(())
    }

    /// Schedules the uploads of the given image layers, followed by an index upload for the
    /// file changes. Stops at the first scheduling error.
    fn upload_new_image_layers(
        self: &Arc,
        new_images: impl IntoIterator,
    ) -> Result<(), super::upload_queue::NotInitialized> {
        for layer in new_images {
            self.remote_client.schedule_layer_file_upload(layer)?;
        }
        // should any new image layer been created, not uploading index_part will
        // result in a mismatch between remote_physical_size and layermap calculated
        // size, which will fail some tests, but should not be an issue otherwise.
        self.remote_client
            .schedule_index_upload_for_file_changes()?;
        Ok(())
    }

    /// Determine the time-based GC cutoff LSN, if one can be found.
    ///
    /// On shard zero, `now - pitr` (or `now - DEFAULT_PITR_INTERVAL` when `pitr` is zero) is
    /// translated to an LSN via `find_lsn_for_timestamp`. On other shards, the cutoff is read
    /// from shard zero's remote index instead.
    ///
    /// Returns `Ok(None)` when no cutoff could be determined: the timestamp lookup answered
    /// `Past` or `NoData`, or shard zero's index was not found.
    async fn find_gc_time_cutoff(
        &self,
        now: SystemTime,
        pitr: Duration,
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> Result, PageReconstructError> {
        debug_assert_current_span_has_tenant_and_timeline_id();

        if self.shard_identity.is_shard_zero() {
            // Shard Zero has SLRU data and can calculate the PITR time -> LSN mapping itself
            let time_range = if pitr == Duration::ZERO {
                humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid")
            } else {
                pitr
            };

            // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case)
            let time_cutoff = now.checked_sub(time_range).unwrap_or(now);
            let timestamp = to_pg_timestamp(time_cutoff);

            let time_cutoff = match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? {
                LsnForTimestamp::Present(lsn) => Some(lsn),
                LsnForTimestamp::Future(lsn) => {
                    // The timestamp is in the future. That sounds impossible,
                    // but what it really means is that there hasn't been
                    // any commits since the cutoff timestamp.
                    //
                    // In this case we should use the LSN of the most recent commit,
                    // which is implicitly the last LSN in the log.
                    debug!("future({})", lsn);
                    Some(self.get_last_record_lsn())
                }
                LsnForTimestamp::Past(lsn) => {
                    debug!("past({})", lsn);
                    None
                }
                LsnForTimestamp::NoData(lsn) => {
                    debug!("nodata({})", lsn);
                    None
                }
            };
            Ok(time_cutoff)
        } else {
            // Shards other than shard zero cannot do timestamp->lsn lookups, and must instead learn their GC cutoff
            // from shard zero's index.  The index doesn't explicitly tell us the time cutoff, but we may assume that
            // the point up to which shard zero's last_gc_cutoff has advanced will either be the time cutoff, or a
            // space cutoff that we would also have respected ourselves.
            match self
                .remote_client
                .download_foreign_index(ShardNumber(0), cancel)
                .await
            {
                Ok((index_part, index_generation, _index_mtime)) => {
                    tracing::info!(
                        "GC loaded shard zero metadata (gen {index_generation:?}): latest_gc_cutoff_lsn: {}",
                        index_part.metadata.latest_gc_cutoff_lsn()
                    );
                    Ok(Some(index_part.metadata.latest_gc_cutoff_lsn()))
                }
                Err(DownloadError::NotFound) => {
                    // This is unexpected, because during timeline creations shard zero persists to remote
                    // storage before other shards are called, and during timeline deletion non-zeroth shards are
                    // deleted before the zeroth one.  However, it should be harmless: if we somehow end up in this
                    // state, then shard zero should _eventually_ write an index when it GCs.
                    tracing::warn!("GC couldn't find shard zero's index for timeline");
                    Ok(None)
                }
                Err(e) => {
                    // TODO: this function should return a different error type than page reconstruct error
                    Err(PageReconstructError::Other(anyhow::anyhow!(e)))
                }
            }

            // TODO: after reading shard zero's GC cutoff, we should validate its generation with the storage
            // controller.  Otherwise, it is possible that we see the GC cutoff go backwards while shard zero
            // is going through a migration if we read the old location's index and it has GC'd ahead of the
            // new location.  This is legal in principle, but problematic in practice because it might result
            // in a timeline creation succeeding on shard zero ('s new location) but then failing on other shards
            // because they have GC'd past the branch point.
        }
    }

    /// Find the Lsns above which layer files need to be retained on
    /// garbage collection.
    ///
    /// We calculate two cutoffs, one based on time and one based on WAL size.  `pitr`
    /// controls the time cutoff (or ZERO to disable time-based retention), and `space_cutoff` controls
    /// the space-based retention.
    ///
    /// This function doesn't simply calculate time & space based retention independently: it treats
    /// time-based retention as authoritative if enabled, and falls back to space-based retention if
    /// calculating the LSN for a time point isn't possible.  Therefore the `space` cutoff in the
    /// response might be different to the `space_cutoff` input.  Callers should treat the min() of the
    /// two cutoffs in the response as the GC cutoff point for the timeline.
    #[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
    pub(super) async fn find_gc_cutoffs(
        &self,
        now: SystemTime,
        space_cutoff: Lsn,
        pitr: Duration,
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> Result {
        let _timer = self
            .metrics
            .find_gc_cutoffs_histo
            .start_timer()
            .record_on_drop();

        pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");

        if cfg!(test) && pitr == Duration::ZERO {
            // Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup
            return Ok(GcCutoffs {
                time: Some(self.get_last_record_lsn()),
                space: space_cutoff,
            });
        }

        // Calculate a time-based limit on how much to retain:
        // - if PITR interval is set, then this is our cutoff.
        // - if PITR interval is not set, then we do a lookup
        //   based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases.
        let time_cutoff = self.find_gc_time_cutoff(now, pitr, cancel, ctx).await?;

        Ok(match (pitr, time_cutoff) {
            (Duration::ZERO, Some(time_cutoff)) => {
                // PITR is not set. Retain the size-based limit, or the default time retention,
                // whichever requires less data.
                GcCutoffs {
                    time: Some(self.get_last_record_lsn()),
                    space: std::cmp::max(time_cutoff, space_cutoff),
                }
            }
            (Duration::ZERO, None) => {
                // PITR is not set, and time lookup failed
                GcCutoffs {
                    time: Some(self.get_last_record_lsn()),
                    space: space_cutoff,
                }
            }
            (_, None) => {
                // PITR interval is set & we didn't look up a timestamp successfully.  Conservatively assume PITR
                // cannot advance beyond what was already GC'd, and respect space-based retention
                GcCutoffs {
                    time: Some(*self.get_applied_gc_cutoff_lsn()),
                    space: space_cutoff,
                }
            }
            (_, Some(time_cutoff)) => {
                // PITR interval is set and we looked up timestamp successfully.  Ignore
                // size based retention and make time cutoff authoritative
                GcCutoffs {
                    time: Some(time_cutoff),
                    space: time_cutoff,
                }
            }
        })
    }

    /// Garbage collect layer files on a timeline that are no longer needed.
    ///
    /// Currently, we don't make any attempt at removing unneeded page versions
    /// within a layer file. We can only remove the whole file if it's fully
    /// obsolete.
    pub(super) async fn gc(&self) -> Result {
        // this is most likely the background tasks, but it might be the spawned task from
        // immediate_gc
        let _g = tokio::select! {
            guard = self.gc_lock.lock() => guard,
            _ = self.cancel.cancelled() => return Ok(GcResult::default()),
        };
        let timer = self.metrics.garbage_collect_histo.start_timer();

        fail_point!("before-timeline-gc");

        // Is the timeline being deleted?
if self.is_stopping() { return Err(GcError::TimelineCancelled); } let (space_cutoff, time_cutoff, retain_lsns, max_lsn_with_valid_lease) = { let gc_info = self.gc_info.read().unwrap(); let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn()); let time_cutoff = gc_info.cutoffs.time; let retain_lsns = gc_info .retain_lsns .iter() .map(|(lsn, _child_id, _is_offloaded)| *lsn) .collect(); // Gets the maximum LSN that holds the valid lease. // // Caveat: `refresh_gc_info` is in charged of updating the lease map. // Here, we do not check for stale leases again. let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn); ( space_cutoff, time_cutoff, retain_lsns, max_lsn_with_valid_lease, ) }; let mut new_gc_cutoff = space_cutoff.min(time_cutoff.unwrap_or_default()); let standby_horizon = self.standby_horizon.load(); // Hold GC for the standby, but as a safety guard do it only within some // reasonable lag. if standby_horizon != Lsn::INVALID { if let Some(standby_lag) = new_gc_cutoff.checked_sub(standby_horizon) { const MAX_ALLOWED_STANDBY_LAG: u64 = 10u64 << 30; // 10 GB if standby_lag.0 < MAX_ALLOWED_STANDBY_LAG { new_gc_cutoff = Lsn::min(standby_horizon, new_gc_cutoff); trace!("holding off GC for standby apply LSN {}", standby_horizon); } else { warn!( "standby is lagging for more than {}MB, not holding gc for it", MAX_ALLOWED_STANDBY_LAG / 1024 / 1024 ) } } } // Reset standby horizon to ignore it if it is not updated till next GC. // It is an easy way to unset it when standby disappears without adding // more conf options. 
self.standby_horizon.store(Lsn::INVALID); self.metrics .standby_horizon_gauge .set(Lsn::INVALID.0 as i64); let res = self .gc_timeline( space_cutoff, time_cutoff, retain_lsns, max_lsn_with_valid_lease, new_gc_cutoff, ) .instrument( info_span!("gc_timeline", timeline_id = %self.timeline_id, cutoff = %new_gc_cutoff), ) .await?; // only record successes timer.stop_and_record(); Ok(res) } async fn gc_timeline( &self, space_cutoff: Lsn, time_cutoff: Option, // None if uninitialized retain_lsns: Vec, max_lsn_with_valid_lease: Option, new_gc_cutoff: Lsn, ) -> Result { // FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc let now = SystemTime::now(); let mut result: GcResult = GcResult::default(); // Nothing to GC. Return early. let latest_gc_cutoff = *self.get_applied_gc_cutoff_lsn(); if latest_gc_cutoff >= new_gc_cutoff { info!( "Nothing to GC: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", ); return Ok(result); } let Some(time_cutoff) = time_cutoff else { // The GC cutoff should have been computed by now, but let's be defensive. info!("Nothing to GC: time_cutoff not yet computed"); return Ok(result); }; // We need to ensure that no one tries to read page versions or create // branches at a point before latest_gc_cutoff_lsn. See branch_timeline() // for details. This will block until the old value is no longer in use. // // The GC cutoff should only ever move forwards. 
let waitlist = { let write_guard = self.applied_gc_cutoff_lsn.lock_for_write(); if *write_guard > new_gc_cutoff { return Err(GcError::BadLsn { why: format!( "Cannot move GC cutoff LSN backwards (was {}, new {})", *write_guard, new_gc_cutoff ), }); } write_guard.store_and_unlock(new_gc_cutoff) }; let waitlist_wait_fut = std::pin::pin!(waitlist.wait()); log_slow( "applied_gc_cutoff waitlist wait", Duration::from_secs(30), waitlist_wait_fut, ) .await; info!("GC starting"); debug!("retain_lsns: {:?}", retain_lsns); let max_retain_lsn = retain_lsns.iter().max(); // Scan all layers in the timeline (remote or on-disk). // // Garbage collect the layer if all conditions are satisfied: // 1. it is older than cutoff LSN; // 2. it is older than PITR interval; // 3. it doesn't need to be retained for 'retain_lsns'; // 4. it does not need to be kept for LSNs holding valid leases. // 5. newer on-disk image layers cover the layer's whole key range let layers_to_remove = { let mut layers_to_remove = Vec::new(); let guard = self .layers .read(LayerManagerLockHolder::GarbageCollection) .await; let layers = guard.layer_map()?; 'outer: for l in layers.iter_historic_layers() { result.layers_total += 1; // 1. Is it newer than GC horizon cutoff point? if l.get_lsn_range().end > space_cutoff { debug!( "keeping {} because it's newer than space_cutoff {}", l.layer_name(), space_cutoff, ); result.layers_needed_by_cutoff += 1; continue 'outer; } // 2. It is newer than PiTR cutoff point? if l.get_lsn_range().end > time_cutoff { debug!( "keeping {} because it's newer than time_cutoff {}", l.layer_name(), time_cutoff, ); result.layers_needed_by_pitr += 1; continue 'outer; } // 3. Is it needed by a child branch? // NOTE With that we would keep data that // might be referenced by child branches forever. // We can track this in child timeline GC and delete parent layers when // they are no longer needed. This might be complicated with long inheritance chains. 
if let Some(retain_lsn) = max_retain_lsn { // start_lsn is inclusive if &l.get_lsn_range().start <= retain_lsn { debug!( "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", l.layer_name(), retain_lsn, l.is_incremental(), ); result.layers_needed_by_branches += 1; continue 'outer; } } // 4. Is there a valid lease that requires us to keep this layer? if let Some(lsn) = &max_lsn_with_valid_lease { // keep if layer start <= any of the lease if &l.get_lsn_range().start <= lsn { debug!( "keeping {} because there is a valid lease preventing GC at {}", l.layer_name(), lsn, ); result.layers_needed_by_leases += 1; continue 'outer; } } // 5. Is there a later on-disk layer for this relation? // // The end-LSN is exclusive, while disk_consistent_lsn is // inclusive. For example, if disk_consistent_lsn is 100, it is // OK for a delta layer to have end LSN 101, but if the end LSN // is 102, then it might not have been fully flushed to disk // before crash. // // For example, imagine that the following layers exist: // // 1000 - image (A) // 1000-2000 - delta (B) // 2000 - image (C) // 2000-3000 - delta (D) // 3000 - image (E) // // If GC horizon is at 2500, we can remove layers A and B, but // we cannot remove C, even though it's older than 2500, because // the delta layer 2000-3000 depends on it. if !layers .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff)) { debug!("keeping {} because it is the latest layer", l.layer_name()); result.layers_not_updated += 1; continue 'outer; } // We didn't find any reason to keep this file, so remove it. info!( "garbage collecting {} is_dropped: xx is_incremental: {}", l.layer_name(), l.is_incremental(), ); layers_to_remove.push(l); } layers_to_remove }; if !layers_to_remove.is_empty() { // Persist the new GC cutoff value before we actually remove anything. 
// This unconditionally schedules also an index_part.json update, even though, we will // be doing one a bit later with the unlinked gc'd layers. let disk_consistent_lsn = self.disk_consistent_lsn.load(); self.schedule_uploads(disk_consistent_lsn, None) .map_err(|e| { if self.cancel.is_cancelled() { GcError::TimelineCancelled } else { GcError::Remote(e) } })?; let mut guard = self .layers .write(LayerManagerLockHolder::GarbageCollection) .await; let gc_layers = layers_to_remove .iter() .flat_map(|desc| guard.try_get_from_key(&desc.key()).cloned()) .collect::>(); result.layers_removed = gc_layers.len() as u64; self.remote_client.schedule_gc_update(&gc_layers)?; guard.open_mut()?.finish_gc_timeline(&gc_layers); #[cfg(feature = "testing")] { result.doomed_layers = gc_layers; } } info!( "GC completed removing {} layers, cutoff {}", result.layers_removed, new_gc_cutoff ); result.elapsed = now.elapsed().unwrap_or(Duration::ZERO); Ok(result) } /// Reconstruct a value, using the given base image and WAL records in 'data'. pub(crate) async fn reconstruct_value( &self, key: Key, request_lsn: Lsn, mut data: ValueReconstructState, redo_attempt_type: RedoAttemptType, ) -> Result { // Perform WAL redo if needed data.records.reverse(); let fire_critical_error = match redo_attempt_type { RedoAttemptType::ReadPage => true, RedoAttemptType::LegacyCompaction => true, RedoAttemptType::GcCompaction => false, }; // If we have a page image, and no WAL, we're all set if data.records.is_empty() { if let Some((img_lsn, img)) = &data.img { trace!( "found page image for key {} at {}, no WAL redo required, req LSN {}", key, img_lsn, request_lsn, ); Ok(img.clone()) } else { Err(PageReconstructError::from(anyhow!( "base image for {key} at {request_lsn} not found" ))) } } else { // We need to do WAL redo. 
// // If we don't have a base image, then the oldest WAL record better initialize // the page if data.img.is_none() && !data.records.first().unwrap().1.will_init() { Err(PageReconstructError::from(anyhow!( "Base image for {} at {} not found, but got {} WAL records", key, request_lsn, data.records.len() ))) } else { if data.img.is_some() { trace!( "found {} WAL records and a base image for {} at {}, performing WAL redo", data.records.len(), key, request_lsn ); } else { trace!( "found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn ); }; let res = self .walredo_mgr .as_ref() .context("timeline has no walredo manager") .map_err(PageReconstructError::WalRedo)? .request_redo( key, request_lsn, data.img, data.records, self.pg_version, redo_attempt_type, ) .await; let img = match res { Ok(img) => img, Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled), Err(walredo::Error::Other(err)) => { if fire_critical_error { critical_timeline!( self.tenant_shard_id, self.timeline_id, Some(&self.corruption_detected), "walredo failure during page reconstruction: {err:?}" ); } return Err(PageReconstructError::WalRedo( err.context("reconstruct a page image"), )); } }; Ok(img) } } } pub(crate) async fn spawn_download_all_remote_layers( self: Arc, request: DownloadRemoteLayersTaskSpawnRequest, ctx: &RequestContext, ) -> Result { use pageserver_api::models::DownloadRemoteLayersTaskState; // this is not really needed anymore; it has tests which really check the return value from // http api. it would be better not to maintain this anymore. 
let mut status_guard = self.download_all_remote_layers_task_info.write().unwrap(); if let Some(st) = &*status_guard { match &st.state { DownloadRemoteLayersTaskState::Running => { return Err(st.clone()); } DownloadRemoteLayersTaskState::ShutDown | DownloadRemoteLayersTaskState::Completed => { *status_guard = None; } } } let self_clone = Arc::clone(&self); let task_ctx = ctx.detached_child( TaskKind::DownloadAllRemoteLayers, DownloadBehavior::Download, ); let task_id = task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::DownloadAllRemoteLayers, self.tenant_shard_id, Some(self.timeline_id), "download all remote layers task", async move { self_clone.download_all_remote_layers(request, &task_ctx).await; let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap(); match &mut *status_guard { None => { warn!("tasks status is supposed to be Some(), since we are running"); } Some(st) => { let exp_task_id = format!("{}", task_mgr::current_task_id().unwrap()); if st.task_id != exp_task_id { warn!("task id changed while we were still running, expecting {} but have {}", exp_task_id, st.task_id); } else { st.state = DownloadRemoteLayersTaskState::Completed; } } }; Ok(()) } .instrument(info_span!(parent: None, "download_all_remote_layers", tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id)) ); let initial_info = DownloadRemoteLayersTaskInfo { task_id: format!("{task_id}"), state: DownloadRemoteLayersTaskState::Running, total_layer_count: 0, successful_download_count: 0, failed_download_count: 0, }; *status_guard = Some(initial_info.clone()); Ok(initial_info) } async fn download_all_remote_layers( self: &Arc, request: DownloadRemoteLayersTaskSpawnRequest, ctx: &RequestContext, ) { use pageserver_api::models::DownloadRemoteLayersTaskState; let remaining = { let guard = self .layers .read(LayerManagerLockHolder::GetLayerMapInfo) .await; let Ok(lm) = 
guard.layer_map() else { // technically here we could look into iterating accessible layers, but downloading // all layers of a shutdown timeline makes no sense regardless. tracing::info!("attempted to download all layers of shutdown timeline"); return; }; lm.iter_historic_layers() .map(|desc| guard.get_from_desc(&desc)) .collect::>() }; let total_layer_count = remaining.len(); macro_rules! lock_status { ($st:ident) => { let mut st = self.download_all_remote_layers_task_info.write().unwrap(); let st = st .as_mut() .expect("this function is only called after the task has been spawned"); assert_eq!( st.task_id, format!( "{}", task_mgr::current_task_id().expect("we run inside a task_mgr task") ) ); let $st = st; }; } { lock_status!(st); st.total_layer_count = total_layer_count as u64; } let mut remaining = remaining.into_iter(); let mut have_remaining = true; let mut js = tokio::task::JoinSet::new(); let cancel = task_mgr::shutdown_token(); let limit = request.max_concurrent_downloads; loop { while js.len() < limit.get() && have_remaining && !cancel.is_cancelled() { let Some(next) = remaining.next() else { have_remaining = false; break; }; let span = tracing::info_span!("download", layer = %next); let ctx = ctx.attached_child(); js.spawn( async move { let res = next.download(&ctx).await; (next, res) } .instrument(span), ); } while let Some(res) = js.join_next().await { match res { Ok((_, Ok(_))) => { lock_status!(st); st.successful_download_count += 1; } Ok((layer, Err(e))) => { tracing::error!(%layer, "download failed: {e:#}"); lock_status!(st); st.failed_download_count += 1; } Err(je) if je.is_cancelled() => unreachable!("not used here"), Err(je) if je.is_panic() => { lock_status!(st); st.failed_download_count += 1; } Err(je) => tracing::warn!("unknown joinerror: {je:?}"), } } if js.is_empty() && (!have_remaining || cancel.is_cancelled()) { break; } } { lock_status!(st); st.state = DownloadRemoteLayersTaskState::Completed; } } pub(crate) fn 
get_download_all_remote_layers_task_info( &self, ) -> Option { self.download_all_remote_layers_task_info .read() .unwrap() .clone() } /* BEGIN_HADRON */ pub(crate) async fn compute_image_consistent_lsn(&self) -> anyhow::Result { let guard = self .layers .read(LayerManagerLockHolder::ComputeImageConsistentLsn) .await; let layer_map = guard.layer_map()?; let disk_consistent_lsn = self.get_disk_consistent_lsn(); Ok(layer_map.compute_image_consistent_lsn(disk_consistent_lsn)) } /* END_HADRON */ } impl Timeline { /// Returns non-remote layers for eviction. pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { let guard = self.layers.read(LayerManagerLockHolder::Eviction).await; let mut max_layer_size: Option = None; let resident_layers = guard .likely_resident_layers() .map(|layer| { let file_size = layer.layer_desc().file_size; max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); let last_activity_ts = layer.latest_activity(); EvictionCandidate { layer: layer.to_owned().into(), last_activity_ts, relative_last_activity: finite_f32::FiniteF32::ZERO, visibility: layer.visibility(), } }) .collect(); DiskUsageEvictionInfo { max_layer_size, resident_layers, } } pub(crate) fn get_shard_index(&self) -> ShardIndex { ShardIndex { shard_number: self.tenant_shard_id.shard_number, shard_count: self.tenant_shard_id.shard_count, } } /// Persistently blocks gc for `Manual` reason. /// /// Returns true if no such block existed before, false otherwise. pub(crate) async fn block_gc(&self, tenant: &super::TenantShard) -> anyhow::Result { use crate::tenant::remote_timeline_client::index::GcBlockingReason; assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id); tenant.gc_block.insert(self, GcBlockingReason::Manual).await } /// Persistently unblocks gc for `Manual` reason. 
pub(crate) async fn unblock_gc(&self, tenant: &super::TenantShard) -> anyhow::Result<()> {
        use crate::tenant::remote_timeline_client::index::GcBlockingReason;
        // Same sanity check as block_gc(): the tenant must be the one this timeline belongs to.
        assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id);
        tenant.gc_block.remove(self, GcBlockingReason::Manual).await
    }

    /// Test helper: advance the last record LSN to `new_lsn`.
    #[cfg(test)]
    pub(super) fn force_advance_lsn(self: &Arc<Timeline>, new_lsn: Lsn) {
        self.last_record_lsn.advance(new_lsn);
    }

    /// Test helper: overwrite the disk consistent LSN with `new_value`.
    #[cfg(test)]
    pub(super) fn force_set_disk_consistent_lsn(&self, new_value: Lsn) {
        self.disk_consistent_lsn.store(new_value);
    }

    /// Force create an image layer and place it into the layer map.
    ///
    /// DO NOT use this function directly. Use [`TenantShard::branch_timeline_test_with_layers`]
    /// or [`TenantShard::create_test_timeline_with_layers`] to ensure all these layers are
    /// placed into the layer map in one run AND be validated.
    ///
    /// Panics if `images` is empty, if `lsn` is beyond the last record LSN, or if `lsn`
    /// is below `check_start_lsn`.
    #[cfg(test)]
    pub(super) async fn force_create_image_layer(
        self: &Arc<Timeline>,
        lsn: Lsn,
        mut images: Vec<(Key, Bytes)>,
        check_start_lsn: Option<Lsn>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let last_record_lsn = self.get_last_record_lsn();
        assert!(
            lsn <= last_record_lsn,
            "advance last record lsn before inserting a layer, lsn={lsn}, last_record_lsn={last_record_lsn}"
        );
        if let Some(check_start_lsn) = check_start_lsn {
            assert!(lsn >= check_start_lsn);
        }
        images.sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb));
        // The layer's key range is [smallest key, largest key + 1).
        let min_key = *images.first().map(|(k, _)| k).unwrap();
        let end_key = images.last().map(|(k, _)| k).unwrap().next();
        let mut image_layer_writer = ImageLayerWriter::new(
            self.conf,
            self.timeline_id,
            self.tenant_shard_id,
            &(min_key..end_key),
            lsn,
            &self.gate,
            self.cancel.clone(),
            ctx,
        )
        .await?;
        for (key, img) in images {
            image_layer_writer.put_image(key, img, ctx).await?;
        }
        let (desc, path) = image_layer_writer.finish(ctx).await?;
        let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
        info!("force created image layer {}", image_layer.local_path());
        {
            let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await;
            guard
                .open_mut()
                .unwrap()
                .force_insert_layer(image_layer.clone());
        }

        // Update remote_timeline_client state to reflect existence of this layer
        self.remote_client
            .schedule_layer_file_upload(image_layer)
            .unwrap();

        Ok(())
    }

    /// Force create a delta layer and place it into the layer map.
    ///
    /// DO NOT use this function directly. Use [`TenantShard::branch_timeline_test_with_layers`]
    /// or [`TenantShard::create_test_timeline_with_layers`] to ensure all these layers are
    /// placed into the layer map in one run AND be validated.
    ///
    /// Panics if `deltas.data` is empty or falls outside the declared key/LSN ranges, if the
    /// layer ends beyond the last record LSN, or if it starts below `check_start_lsn`.
    #[cfg(test)]
    pub(super) async fn force_create_delta_layer(
        self: &Arc<Timeline>,
        mut deltas: DeltaLayerTestDesc,
        check_start_lsn: Option<Lsn>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let last_record_lsn = self.get_last_record_lsn();
        deltas
            .data
            .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
        assert!(deltas.data.first().unwrap().0 >= deltas.key_range.start);
        assert!(deltas.data.last().unwrap().0 < deltas.key_range.end);
        for (_, lsn, _) in &deltas.data {
            assert!(deltas.lsn_range.start <= *lsn && *lsn < deltas.lsn_range.end);
        }
        assert!(
            deltas.lsn_range.end <= last_record_lsn,
            "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}",
            deltas.lsn_range.end,
            last_record_lsn
        );
        if let Some(check_start_lsn) = check_start_lsn {
            assert!(deltas.lsn_range.start >= check_start_lsn);
        }
        let mut delta_layer_writer = DeltaLayerWriter::new(
            self.conf,
            self.timeline_id,
            self.tenant_shard_id,
            deltas.key_range.start,
            deltas.lsn_range,
            &self.gate,
            self.cancel.clone(),
            ctx,
        )
        .await?;
        for (key, lsn, val) in deltas.data {
            delta_layer_writer.put_value(key, lsn, val, ctx).await?;
        }
        let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?;
        let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
        info!("force created delta layer {}", delta_layer.local_path());
        {
            let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await;
            guard
                .open_mut()
                .unwrap()
                .force_insert_layer(delta_layer.clone());
        }

        // Update remote_timeline_client state to reflect existence of this layer
        self.remote_client
            .schedule_layer_file_upload(delta_layer)
            .unwrap();

        Ok(())
    }

    /// Force create an in-memory layer and place it into the layer map.
    ///
    /// The layer is frozen at `in_memory.lsn_range.end` unless `in_memory.is_open` is set.
    #[cfg(test)]
    pub(super) async fn force_create_in_memory_layer(
        self: &Arc<Timeline>,
        mut in_memory: InMemoryLayerTestDesc,
        check_start_lsn: Option<Lsn>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        use utils::bin_ser::BeSer;

        // Validate LSNs
        if let Some(check_start_lsn) = check_start_lsn {
            assert!(in_memory.lsn_range.start >= check_start_lsn);
        }

        let last_record_lsn = self.get_last_record_lsn();
        // For an open layer the effective end is the largest LSN written into it (if any);
        // for a frozen one it is the declared range end.
        let layer_end_lsn = if in_memory.is_open {
            in_memory
                .data
                .iter()
                .map(|(_key, lsn, _value)| lsn)
                .max()
                .cloned()
        } else {
            Some(in_memory.lsn_range.end)
        };

        if let Some(end) = layer_end_lsn {
            assert!(
                end <= last_record_lsn,
                "advance last record lsn before inserting a layer, end_lsn={end}, last_record_lsn={last_record_lsn}",
            );
        }

        in_memory.data.iter().for_each(|(_key, lsn, _value)| {
            assert!(*lsn >= in_memory.lsn_range.start);
            assert!(*lsn < in_memory.lsn_range.end);
        });

        // Build the batch
        in_memory
            .data
            .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));

        let data = in_memory
            .data
            .into_iter()
            .map(|(key, lsn, value)| {
                let value_size = value.serialized_size().unwrap() as usize;
                (key.to_compact(), lsn, value_size, value)
            })
            .collect::<Vec<_>>();

        let batch = SerializedValueBatch::from_values(data);

        // Create the in-memory layer and write the batch into it
        let layer = InMemoryLayer::create(
            self.conf,
            self.timeline_id,
            self.tenant_shard_id,
            in_memory.lsn_range.start,
            &self.gate,
            // TODO: if we ever use this function in production code, we need to pass the real cancellation token
            &CancellationToken::new(),
            ctx,
        )
        .await
        .unwrap();

        layer.put_batch(batch, ctx).await.unwrap();
        if !in_memory.is_open {
            layer.freeze(in_memory.lsn_range.end).await;
        }

        info!("force created in-memory layer {:?}", in_memory.lsn_range);

        // Link the layer to the layer map
        {
            let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await;
            let layer_map = guard.open_mut().unwrap();
            layer_map.force_insert_in_memory_layer(Arc::new(layer));
        }

        Ok(())
    }

    /// Return all keys at the LSN in the image layers
    ///
    /// Only image layers whose LSN is exactly `lsn` are read; the result is sorted.
    #[cfg(test)]
    pub(crate) async fn inspect_image_layers(
        self: &Arc<Timeline>,
        lsn: Lsn,
        ctx: &RequestContext,
        io_concurrency: IoConcurrency,
    ) -> anyhow::Result<Vec<(Key, Bytes)>> {
        let mut all_data = Vec::new();
        let guard = self.layers.read(LayerManagerLockHolder::Testing).await;
        for layer in guard.layer_map()?.iter_historic_layers() {
            if !layer.is_delta() && layer.image_layer_lsn() == lsn {
                let layer = guard.get_from_desc(&layer);
                let mut reconstruct_data = ValuesReconstructState::new(io_concurrency.clone());
                layer
                    .get_values_reconstruct_data(
                        KeySpace::single(Key::MIN..Key::MAX),
                        lsn..Lsn(lsn.0 + 1),
                        &mut reconstruct_data,
                        ctx,
                    )
                    .await?;
                for (k, v) in std::mem::take(&mut reconstruct_data.keys) {
                    let v = v.collect_pending_ios().await?;
                    all_data.push((k, v.img.unwrap().1));
                }
            }
        }
        all_data.sort();
        Ok(all_data)
    }

    /// Get all historic layer descriptors in the layer map
    #[cfg(test)]
    pub(crate) async fn inspect_historic_layers(
        self: &Arc<Timeline>,
    ) -> anyhow::Result<Vec<super::storage_layer::PersistentLayerKey>> {
        let mut layers = Vec::new();
        let guard = self.layers.read(LayerManagerLockHolder::Testing).await;
        for layer in guard.layer_map()?.iter_historic_layers() {
            layers.push(layer.key());
        }
        Ok(layers)
    }

    /// Test helper: merge `ks` into the extra dense keyspace used by tests.
    #[cfg(test)]
    pub(crate) fn add_extra_test_dense_keyspace(&self, ks: KeySpace) {
        // Copy, merge, then swap in the new value.
        let mut keyspace = self.extra_test_dense_keyspace.load().as_ref().clone();
        keyspace.merge(&ks);
        self.extra_test_dense_keyspace.store(Arc::new(keyspace));
    }
}

/// Tracking writes ingestion does to a particular in-memory layer.
///
/// Cleared upon freezing a layer.
pub(crate) struct TimelineWriterState {
    open_layer: Arc<InMemoryLayer>,
    current_size: u64,
    // Previous Lsn which passed through
    prev_lsn: Option<Lsn>,
    // Largest Lsn which passed through the current writer
    max_lsn: Option<Lsn>,
    // Cached details of the last freeze. Avoids going through the atomic/lock on every put.
    cached_last_freeze_at: Lsn,
}

impl TimelineWriterState {
    /// Fresh state for `open_layer`: no LSN has passed through it yet.
    fn new(open_layer: Arc<InMemoryLayer>, current_size: u64, last_freeze_at: Lsn) -> Self {
        Self {
            open_layer,
            current_size,
            prev_lsn: None,
            max_lsn: None,
            cached_last_freeze_at: last_freeze_at,
        }
    }
}

/// Various functions to mutate the timeline.
// TODO Currently, Deref is used to allow easy access to read methods from this trait.
// This is probably considered a bad practice in Rust and should be fixed eventually,
// but will cause large code changes.
pub(crate) struct TimelineWriter<'a> {
    tl: &'a Timeline,
    write_guard: tokio::sync::MutexGuard<'a, Option<TimelineWriterState>>,
}

impl Deref for TimelineWriter<'_> {
    type Target = Timeline;

    fn deref(&self) -> &Self::Target {
        self.tl
    }
}

/// What to do with the open in-memory layer before a write.
#[derive(PartialEq)]
enum OpenLayerAction {
    /// Freeze the current open layer and open a new one.
    Roll,
    /// There is no open layer: open one.
    Open,
    /// Keep writing to the current open layer.
    None,
}

impl TimelineWriter<'_> {
    /// Carry out `action` and return the layer that the write at `at` should go to.
    async fn handle_open_layer_action(
        &mut self,
        at: Lsn,
        action: OpenLayerAction,
        ctx: &RequestContext,
    ) -> anyhow::Result<&Arc<InMemoryLayer>> {
        match action {
            OpenLayerAction::Roll => {
                // Freeze at the largest LSN written through this writer so far.
                let freeze_at = self.write_guard.as_ref().unwrap().max_lsn.unwrap();
                self.roll_layer(freeze_at).await?;
                self.open_layer(at, ctx).await?;
            }
            OpenLayerAction::Open => self.open_layer(at, ctx).await?,
            OpenLayerAction::None => {
                assert!(self.write_guard.is_some());
            }
        }

        Ok(&self.write_guard.as_ref().unwrap().open_layer)
    }

    /// Get the layer for writes at `at` and install a fresh writer state for it.
    async fn open_layer(&mut self, at: Lsn, ctx: &RequestContext) -> anyhow::Result<()> {
        let layer = self
            .tl
            .get_layer_for_write(at, &self.write_guard, ctx)
            .await?;
        let initial_size = layer.len();

        let last_freeze_at = self.last_freeze_at.load();
        self.write_guard.replace(TimelineWriterState::new(
            layer,
            initial_size,
            last_freeze_at,
        ));

        Ok(())
    }

    /// Freeze the open layer at `freeze_at`, which clears the writer state.
    ///
    /// If the number of L0 layers has reached the smaller of the configured flush
    /// delay/stall thresholds, this also waits for the flush to complete.
    async fn roll_layer(&mut self, freeze_at: Lsn) -> Result<(), FlushLayerError> {
        let current_size = self.write_guard.as_ref().unwrap().current_size;

        // If layer flushes are backpressured due to compaction not keeping up, wait for the flush
        // to propagate the backpressure up into WAL ingestion.
        let l0_count = self
            .tl
            .layers
            .read(LayerManagerLockHolder::GetLayerMapInfo)
            .await
            .layer_map()?
            .level0_deltas()
            .len();
        let wait_thresholds = [
            self.get_l0_flush_delay_threshold(),
            self.get_l0_flush_stall_threshold(),
        ];
        let wait_threshold = wait_thresholds.into_iter().flatten().min();

        // self.write_guard will be taken by the freezing
        let flush_id = self
            .tl
            .freeze_inmem_layer_at(freeze_at, &mut self.write_guard)
            .await?;

        assert!(self.write_guard.is_none());

        if let Some(wait_threshold) = wait_threshold {
            if l0_count >= wait_threshold {
                debug!(
                    "layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers"
                );
                self.tl.wait_flush_completion(flush_id).await?;
            }
        }

        if current_size >= self.get_checkpoint_distance() * 2 {
            warn!("Flushed oversized open layer with size {}", current_size)
        }

        Ok(())
    }

    /// Decide whether a write of `new_value_size` bytes at `lsn` needs a new or rolled
    /// open layer.
    fn get_open_layer_action(&self, lsn: Lsn, new_value_size: u64) -> OpenLayerAction {
        let state = &*self.write_guard;
        let Some(state) = &state else {
            return OpenLayerAction::Open;
        };

        #[cfg(feature = "testing")]
        if state.cached_last_freeze_at < self.tl.last_freeze_at.load() {
            // this check and assertion are not really needed because
            // LayerManager::try_freeze_in_memory_layer will always clear out the
            // TimelineWriterState if something is frozen. however, we can advance last_freeze_at when there
            // is no TimelineWriterState.
            assert!(
                state.open_layer.end_lsn.get().is_some(),
                "our open_layer must be outdated"
            );

            // this would be a memory leak waiting to happen because the in-memory layer always has
            // an index
            panic!("BUG: TimelineWriterState held on to frozen in-memory layer.");
        }

        if state.prev_lsn == Some(lsn) {
            // Rolling mid LSN is not supported by [downstream code].
            // Hence, only roll at LSN boundaries.
            //
            // [downstream code]: https://github.com/neondatabase/neon/pull/7993#discussion_r1633345422
            return OpenLayerAction::None;
        }

        if state.current_size == 0 {
            // Don't roll empty layers
            return OpenLayerAction::None;
        }

        if self.tl.should_roll(
            state.current_size,
            state.current_size + new_value_size,
            self.get_checkpoint_distance(),
            lsn,
            state.cached_last_freeze_at,
            state.open_layer.get_opened_at(),
        ) {
            OpenLayerAction::Roll
        } else {
            OpenLayerAction::None
        }
    }

    /// Put a batch of keys at the specified Lsns.
    ///
    /// An empty batch is a no-op. Size and LSN tracking are only updated when the whole
    /// write succeeded.
    pub(crate) async fn put_batch(
        &mut self,
        batch: SerializedValueBatch,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        if !batch.has_data() {
            return Ok(());
        }

        // In debug builds, assert that we don't write any keys that don't belong to this shard.
        // We don't assert this in release builds, since key ownership policies may change over
        // time. Stray keys will be removed during compaction.
        if cfg!(debug_assertions) {
            for metadata in &batch.metadata {
                if let ValueMeta::Serialized(metadata) = metadata {
                    let key = Key::from_compact(metadata.key);
                    assert!(
                        self.shard_identity.is_key_local(&key)
                            || self.shard_identity.is_key_global(&key),
                        "key {key} does not belong on shard {}",
                        self.shard_identity.shard_index()
                    );
                }
            }
        }

        // Read these before `batch` is moved into the layer.
        let batch_max_lsn = batch.max_lsn;
        let buf_size: u64 = batch.buffer_size() as u64;

        let action = self.get_open_layer_action(batch_max_lsn, buf_size);
        let layer = self
            .handle_open_layer_action(batch_max_lsn, action, ctx)
            .await?;

        let res = layer.put_batch(batch, ctx).await;

        if res.is_ok() {
            // Update the current size only when the entire write was ok.
            // In case of failures, we may have had partial writes which
            // render the size tracking out of sync. That's ok because
            // the checkpoint distance should be significantly smaller
            // than the S3 single shot upload limit of 5GiB.
            let state = self.write_guard.as_mut().unwrap();

            state.current_size += buf_size;
            state.prev_lsn = Some(batch_max_lsn);
            state.max_lsn = std::cmp::max(state.max_lsn, Some(batch_max_lsn));
        }

        res
    }

    #[cfg(test)]
    /// Test helper, for tests that would like to poke individual values without composing a batch
    pub(crate) async fn put(
        &mut self,
        key: Key,
        lsn: Lsn,
        value: &Value,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        use utils::bin_ser::BeSer;
        if !key.is_valid_key_on_write_path() {
            bail!(
                "the request contains data not supported by pageserver at TimelineWriter::put: {}",
                key
            );
        }
        let val_ser_size = value.serialized_size().unwrap() as usize;
        let batch = SerializedValueBatch::from_values(vec![(
            key.to_compact(),
            lsn,
            val_ser_size,
            value.clone(),
        )]);

        self.put_batch(batch, ctx).await
    }

    /// Write tombstones for the given key ranges into the open layer.
    ///
    /// An empty batch is a no-op. The open-layer decision is made using the LSN of the
    /// first entry only.
    pub(crate) async fn delete_batch(
        &mut self,
        batch: &[(Range<Key>, Lsn)],
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        if let Some((_, lsn)) = batch.first() {
            let action = self.get_open_layer_action(*lsn, 0);
            let layer = self.handle_open_layer_action(*lsn, action, ctx).await?;
            layer.put_tombstones(batch).await?;
        }

        Ok(())
    }

    /// Track the end of the latest digested WAL record.
    /// Remember the (end of) last valid WAL record remembered in the timeline.
    ///
    /// Call this after you have finished writing all the WAL up to 'lsn'.
    ///
    /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for
    /// the 'lsn' or anything older. The previous last record LSN is stored alongside
    /// the latest and can be read.
    pub(crate) fn finish_write(&self, new_lsn: Lsn) {
        self.tl.finish_write(new_lsn);
    }

    /// Forwards `delta` to `Timeline::update_current_logical_size`.
    pub(crate) fn update_current_logical_size(&self, delta: i64) {
        self.tl.update_current_logical_size(delta)
    }
}

// We need TimelineWriter to be send in upcoming conversion of
// Timeline::layers to tokio::sync::RwLock.
#[test] fn is_send() { fn _assert_send() {} _assert_send::>(); } #[cfg(test)] mod tests { use std::sync::Arc; use pageserver_api::key::Key; use postgres_ffi::PgMajorVersion; use std::iter::Iterator; use tracing::Instrument; use utils::id::TimelineId; use utils::lsn::Lsn; use wal_decoder::models::value::Value; use super::HeatMapTimeline; use crate::context::RequestContextBuilder; use crate::tenant::harness::{TenantHarness, test_img}; use crate::tenant::layer_map::LayerMap; use crate::tenant::storage_layer::{Layer, LayerName, LayerVisibilityHint}; use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; use crate::tenant::timeline::{DeltaLayerTestDesc, EvictionError}; use crate::tenant::{PreviousHeatmap, Timeline}; fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) { assert_eq!(lhs.all_layers().count(), rhs.all_layers().count()); let lhs_rhs = lhs.all_layers().zip(rhs.all_layers()); for (l, r) in lhs_rhs { assert_eq!(l.name, r.name); assert_eq!(l.metadata, r.metadata); } } #[tokio::test] async fn test_heatmap_generation() { let harness = TenantHarness::create("heatmap_generation").await.unwrap(); let covered_delta = DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x10)..Lsn(0x20), vec![( Key::from_hex("620000000033333333444444445500000000").unwrap(), Lsn(0x11), Value::Image(test_img("foo")), )], ); let visible_delta = DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x10)..Lsn(0x20), vec![( Key::from_hex("720000000033333333444444445500000000").unwrap(), Lsn(0x11), Value::Image(test_img("foo")), )], ); let l0_delta = DeltaLayerTestDesc::new( Lsn(0x20)..Lsn(0x30), Key::from_hex("000000000000000000000000000000000000").unwrap() ..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(), vec![( Key::from_hex("720000000033333333444444445500000000").unwrap(), Lsn(0x25), Value::Image(test_img("foo")), )], ); let delta_layers = vec![ covered_delta.clone(), visible_delta.clone(), l0_delta.clone(), ]; let image_layer = ( 
Lsn(0x40), vec![( Key::from_hex("620000000033333333444444445500000000").unwrap(), test_img("bar"), )], ); let image_layers = vec![image_layer]; let (tenant, ctx) = harness.load().await; let timeline = tenant .create_test_timeline_with_layers( TimelineId::generate(), Lsn(0x10), PgMajorVersion::PG14, &ctx, Vec::new(), // in-memory layers delta_layers, image_layers, Lsn(0x100), ) .await .unwrap(); let ctx = &ctx.with_scope_timeline(&timeline); // Layer visibility is an input to heatmap generation, so refresh it first timeline.update_layer_visibility().await.unwrap(); let heatmap = timeline .generate_heatmap() .await .expect("Infallible while timeline is not shut down"); assert_eq!(heatmap.timeline_id, timeline.timeline_id); // L0 should come last let heatmap_layers = heatmap.all_layers().collect::>(); assert_eq!(heatmap_layers.last().unwrap().name, l0_delta.layer_name()); let mut last_lsn = Lsn::MAX; for layer in heatmap_layers { // Covered layer should be omitted assert!(layer.name != covered_delta.layer_name()); let layer_lsn = match &layer.name { LayerName::Delta(d) => d.lsn_range.end, LayerName::Image(i) => i.lsn, }; // Apart from L0s, newest Layers should come first if !LayerMap::is_l0(layer.name.key_range(), layer.name.is_delta()) { assert!(layer_lsn <= last_lsn); last_lsn = layer_lsn; } } // Evict all the layers and stash the old heatmap in the timeline. // This simulates a migration to a cold secondary location. let guard = timeline.layers.read(LayerManagerLockHolder::Testing).await; let mut all_layers = Vec::new(); let forever = std::time::Duration::from_secs(120); for layer in guard.likely_resident_layers() { all_layers.push(layer.clone()); layer.evict_and_wait(forever).await.unwrap(); } drop(guard); timeline .previous_heatmap .store(Some(Arc::new(PreviousHeatmap::Active { heatmap: heatmap.clone(), read_at: std::time::Instant::now(), end_lsn: None, }))); // Generate a new heatmap and assert that it contains the same layers as the old one. 
let post_migration_heatmap = timeline.generate_heatmap().await.unwrap(); assert_heatmaps_have_same_layers(&heatmap, &post_migration_heatmap); // Download each layer one by one. Generate the heatmap at each step and check // that it's stable. for layer in all_layers { if layer.visibility() == LayerVisibilityHint::Covered { continue; } eprintln!("Downloading {layer} and re-generating heatmap"); let ctx = &RequestContextBuilder::from(ctx) .download_behavior(crate::context::DownloadBehavior::Download) .attached_child(); let _resident = layer .download_and_keep_resident(ctx) .instrument(tracing::info_span!( parent: None, "download_layer", tenant_id = %timeline.tenant_shard_id.tenant_id, shard_id = %timeline.tenant_shard_id.shard_slug(), timeline_id = %timeline.timeline_id )) .await .unwrap(); let post_download_heatmap = timeline.generate_heatmap().await.unwrap(); assert_heatmaps_have_same_layers(&heatmap, &post_download_heatmap); } // Everything from the post-migration heatmap is now resident. // Check that we drop it from memory. 
assert!(matches!( timeline.previous_heatmap.load().as_deref(), Some(PreviousHeatmap::Obsolete) )); } #[tokio::test] async fn test_previous_heatmap_obsoletion() { let harness = TenantHarness::create("heatmap_previous_heatmap_obsoletion") .await .unwrap(); let l0_delta = DeltaLayerTestDesc::new( Lsn(0x20)..Lsn(0x30), Key::from_hex("000000000000000000000000000000000000").unwrap() ..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(), vec![( Key::from_hex("720000000033333333444444445500000000").unwrap(), Lsn(0x25), Value::Image(test_img("foo")), )], ); let image_layer = ( Lsn(0x40), vec![( Key::from_hex("620000000033333333444444445500000000").unwrap(), test_img("bar"), )], ); let delta_layers = vec![l0_delta]; let image_layers = vec![image_layer]; let (tenant, ctx) = harness.load().await; let timeline = tenant .create_test_timeline_with_layers( TimelineId::generate(), Lsn(0x10), PgMajorVersion::PG14, &ctx, Vec::new(), // in-memory layers delta_layers, image_layers, Lsn(0x100), ) .await .unwrap(); // Layer visibility is an input to heatmap generation, so refresh it first timeline.update_layer_visibility().await.unwrap(); let heatmap = timeline .generate_heatmap() .await .expect("Infallible while timeline is not shut down"); // Both layers should be in the heatmap assert!(heatmap.all_layers().count() > 0); // Now simulate a migration. timeline .previous_heatmap .store(Some(Arc::new(PreviousHeatmap::Active { heatmap: heatmap.clone(), read_at: std::time::Instant::now(), end_lsn: None, }))); // Evict all the layers in the previous heatmap let guard = timeline.layers.read(LayerManagerLockHolder::Testing).await; let forever = std::time::Duration::from_secs(120); for layer in guard.likely_resident_layers() { layer.evict_and_wait(forever).await.unwrap(); } drop(guard); // Generate a new heatmap and check that the previous heatmap // has been marked obsolete. 
let post_eviction_heatmap = timeline .generate_heatmap() .await .expect("Infallible while timeline is not shut down"); assert_eq!(post_eviction_heatmap.all_layers().count(), 0); assert!(matches!( timeline.previous_heatmap.load().as_deref(), Some(PreviousHeatmap::Obsolete) )); } #[tokio::test] async fn two_layer_eviction_attempts_at_the_same_time() { let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time") .await .unwrap(); let (tenant, ctx) = harness.load().await; let timeline = tenant .create_test_timeline( TimelineId::generate(), Lsn(0x10), PgMajorVersion::PG14, &ctx, ) .await .unwrap(); let layer = find_some_layer(&timeline).await; let layer = layer .keep_resident() .await .expect("no download => no downloading errors") .drop_eviction_guard(); let forever = std::time::Duration::from_secs(120); let first = layer.evict_and_wait(forever); let second = layer.evict_and_wait(forever); let (first, second) = tokio::join!(first, second); let res = layer.keep_resident().await; assert!(res.is_none(), "{res:?}"); match (first, second) { (Ok(()), Ok(())) => { // because there are no more timeline locks being taken on eviction path, we can // witness all three outcomes here. } (Ok(()), Err(EvictionError::NotFound)) | (Err(EvictionError::NotFound), Ok(())) => { // if one completes before the other, this is fine just as well. 
} other => unreachable!("unexpected {:?}", other), } } async fn find_some_layer(timeline: &Timeline) -> Layer { let layers = timeline .layers .read(LayerManagerLockHolder::GetLayerMapInfo) .await; let desc = layers .layer_map() .unwrap() .iter_historic_layers() .next() .expect("must find one layer to evict"); layers.get_from_desc(&desc) } } ================================================ FILE: pageserver/src/tenant/upload_queue.rs ================================================ use std::collections::{HashMap, HashSet, VecDeque}; use std::fmt::Debug; use std::sync::Arc; use std::sync::atomic::AtomicU32; use chrono::NaiveDateTime; use once_cell::sync::Lazy; use tracing::info; use utils::generation::Generation; use utils::lsn::{AtomicLsn, Lsn}; use super::remote_timeline_client::is_same_remote_layer_path; use super::storage_layer::{AsLayerDesc as _, LayerName, ResidentLayer}; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::{IndexPart, LayerFileMetadata}; /// Kill switch for upload queue reordering in case it causes problems. /// TODO: remove this once we have confidence in it. static DISABLE_UPLOAD_QUEUE_REORDERING: Lazy = Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_REORDERING").as_deref() == Ok("true")); /// Kill switch for index upload coalescing in case it causes problems. /// TODO: remove this once we have confidence in it. static DISABLE_UPLOAD_QUEUE_INDEX_COALESCING: Lazy = Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_INDEX_COALESCING").as_deref() == Ok("true")); // clippy warns that Uninitialized is much smaller than Initialized, which wastes // memory for Uninitialized variants. Doesn't matter in practice, there are not // that many upload queues in a running pageserver, and most of them are initialized // anyway. 
#[allow(clippy::large_enum_variant)] pub enum UploadQueue { Uninitialized, Initialized(UploadQueueInitialized), Stopped(UploadQueueStopped), } impl UploadQueue { pub fn as_str(&self) -> &'static str { match self { UploadQueue::Uninitialized => "Uninitialized", UploadQueue::Initialized(_) => "Initialized", UploadQueue::Stopped(_) => "Stopped", } } } #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] pub enum OpType { MayReorder, FlushDeletion, } /// This keeps track of queued and in-progress tasks. pub struct UploadQueueInitialized { /// Maximum number of inprogress tasks to schedule. 0 is no limit. pub(crate) inprogress_limit: usize, /// Counter to assign task IDs pub(crate) task_counter: u64, /// The next uploaded index_part.json; assumed to be dirty. /// /// Should not be read, directly except for layer file updates. Instead you should add a /// projected field. pub(crate) dirty: IndexPart, /// The latest remote persisted IndexPart. /// /// Each completed metadata upload will update this. The second item is the task_id which last /// updated the value, used to ensure we never store an older value over a newer one. pub(crate) clean: (IndexPart, Option), /// How many file uploads or deletions been scheduled, since the /// last (scheduling of) metadata index upload? pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64, /// The Lsn is only updated after our generation has been validated with /// the control plane (unlesss a timeline's generation is None, in which case /// we skip validation) pub(crate) visible_remote_consistent_lsn: Arc, /// Tasks that are currently in-progress. In-progress means that a tokio Task /// has been launched for it. An in-progress task can be busy uploading, but it can /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can /// be waiting for retry in `exponential_backoff`. pub inprogress_tasks: HashMap>, /// Queued operations that have not been launched yet. 
They might depend on previous /// tasks to finish. For example, metadata upload cannot be performed before all /// preceding layer file uploads have completed. pub queued_operations: VecDeque, /// Files which have been unlinked but not yet had scheduled a deletion for. Only kept around /// for error logging. /// /// Putting this behind a testing feature to catch problems in tests, but assuming we could have a /// bug causing leaks, then it's better to not leave this enabled for production builds. #[cfg(feature = "testing")] pub(crate) dangling_files: HashMap, /// Ensure we order file operations correctly. pub(crate) recently_deleted: HashSet<(LayerName, Generation)>, /// Deletions that are blocked by the tenant configuration pub(crate) blocked_deletions: Vec, /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`. pub(crate) shutting_down: bool, /// Permitless semaphore on which any number of `RemoteTimelineClient::shutdown` futures can /// wait on until one of them stops the queue. The semaphore is closed when /// `RemoteTimelineClient::launch_queued_tasks` encounters `UploadOp::Shutdown`. pub(crate) shutdown_ready: Arc, } impl UploadQueueInitialized { pub(super) fn no_pending_work(&self) -> bool { self.inprogress_tasks.is_empty() && self.queued_operations.is_empty() } pub(super) fn get_last_remote_consistent_lsn_visible(&self) -> Lsn { self.visible_remote_consistent_lsn.load() } pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option { let lsn = self.clean.0.metadata.disk_consistent_lsn(); self.clean.1.map(|_| lsn) } /// Returns and removes the next ready operation from the queue, if any. This isn't necessarily /// the first operation in the queue, to avoid head-of-line blocking -- an operation can jump /// the queue if it doesn't conflict with operations ahead of it. /// /// Also returns any operations that were coalesced into this one, e.g. multiple index uploads. 
/// /// None may be returned even if the queue isn't empty, if no operations are ready yet. /// /// NB: this is quadratic, but queues are expected to be small, and bounded by inprogress_limit. pub fn next_ready(&mut self) -> Option<(UploadOp, Vec)> { // If inprogress_tasks is already at limit, don't schedule anything more. if self.inprogress_limit > 0 && self.inprogress_tasks.len() >= self.inprogress_limit { return None; } for (i, candidate) in self.queued_operations.iter().enumerate() { // If this candidate is ready, go for it. Otherwise, try the next one. if self.is_ready(i) { // Shutdown operations are left at the head of the queue, to prevent further // operations from starting. Signal that we're ready to shut down. if matches!(candidate, UploadOp::Shutdown) { assert!(self.inprogress_tasks.is_empty(), "shutdown with tasks"); assert_eq!(i, 0, "shutdown not at head of queue"); self.shutdown_ready.close(); return None; } let mut op = self.queued_operations.remove(i).expect("i can't disappear"); // Coalesce any back-to-back index uploads by only uploading the newest one that's // ready. This typically happens with layer/index/layer/index/... sequences, where // the layers bypass the indexes, leaving the indexes queued. // // If other operations are interleaved between index uploads we don't try to // coalesce them, since we may as well update the index concurrently with them. // This keeps the index fresh and avoids starvation. // // NB: we assume that all uploaded indexes have the same remote path. This // is true at the time of writing: the path only depends on the tenant, // timeline and generation, all of which are static for a timeline instance. // Otherwise, we must be careful not to coalesce different paths. let mut coalesced_ops = Vec::new(); if matches!(op, UploadOp::UploadMetadata { .. }) { while let Some(UploadOp::UploadMetadata { .. 
}) = self.queued_operations.get(i) { if *DISABLE_UPLOAD_QUEUE_INDEX_COALESCING { break; } if !self.is_ready(i) { break; } coalesced_ops.push(op); op = self.queued_operations.remove(i).expect("i can't disappear"); } } return Some((op, coalesced_ops)); } // Nothing can bypass a barrier or shutdown. If it wasn't scheduled above, give up. if matches!(candidate, UploadOp::Barrier(_) | UploadOp::Shutdown) { return None; } // If upload queue reordering is disabled, bail out after the first operation. if *DISABLE_UPLOAD_QUEUE_REORDERING { return None; } } None } /// Returns true if the queued operation at the given position is ready to be uploaded, i.e. if /// it doesn't conflict with any in-progress or queued operations ahead of it. Operations are /// allowed to skip the queue when it's safe to do so, to increase parallelism. /// /// The position must be valid for the queue size. fn is_ready(&self, pos: usize) -> bool { let candidate = self.queued_operations.get(pos).expect("invalid position"); self // Look at in-progress operations, in random order. .inprogress_tasks .values() .map(|task| &task.op) // Then queued operations ahead of the candidate, front-to-back. .chain(self.queued_operations.iter().take(pos)) // Keep track of the active index ahead of each operation. This is used to ensure that // an upload doesn't skip the queue too far, such that it modifies a layer that's // referenced by an active index. // // It's okay that in-progress operations are emitted in random order above, since at // most one of them can be an index upload (enforced by can_bypass). .scan(&self.clean.0, |next_active_index, op| { let active_index = *next_active_index; if let UploadOp::UploadMetadata { uploaded } = op { *next_active_index = uploaded; // stash index for next operation after this } Some((op, active_index)) }) // Check if the candidate can bypass all of them. 
.all(|(op, active_index)| candidate.can_bypass(op, active_index)) } /// Returns the number of in-progress deletion operations. #[cfg(test)] pub(crate) fn num_inprogress_deletions(&self) -> usize { self.inprogress_tasks .iter() .filter(|(_, t)| matches!(t.op, UploadOp::Delete(_))) .count() } /// Returns the number of in-progress layer uploads. #[cfg(test)] pub(crate) fn num_inprogress_layer_uploads(&self) -> usize { self.inprogress_tasks .iter() .filter(|(_, t)| matches!(t.op, UploadOp::UploadLayer(_, _, _))) .count() } /// Test helper that schedules all ready operations into inprogress_tasks, and returns /// references to them. /// /// TODO: the corresponding production logic should be moved from RemoteTimelineClient into /// UploadQueue, so we can use the same code path. #[cfg(test)] fn schedule_ready(&mut self) -> Vec> { let mut tasks = Vec::new(); // NB: schedule operations one by one, to handle conflicts with inprogress_tasks. while let Some((op, coalesced_ops)) = self.next_ready() { self.task_counter += 1; let task = Arc::new(UploadTask { task_id: self.task_counter, op, coalesced_ops, retries: 0.into(), }); self.inprogress_tasks.insert(task.task_id, task.clone()); tasks.push(task); } tasks } /// Test helper that marks an operation as completed, removing it from inprogress_tasks. /// /// TODO: the corresponding production logic should be moved from RemoteTimelineClient into /// UploadQueue, so we can use the same code path. #[cfg(test)] fn complete(&mut self, task_id: u64) { let Some(task) = self.inprogress_tasks.remove(&task_id) else { return; }; // Update the clean index on uploads. 
if let UploadOp::UploadMetadata { ref uploaded } = task.op { if task.task_id > self.clean.1.unwrap_or_default() { self.clean = (*uploaded.clone(), Some(task.task_id)); } } } } #[derive(Clone, Copy)] pub(super) enum SetDeletedFlagProgress { NotRunning, InProgress(NaiveDateTime), Successful(NaiveDateTime), } pub struct UploadQueueStoppedDeletable { pub(super) upload_queue_for_deletion: UploadQueueInitialized, pub(super) deleted_at: SetDeletedFlagProgress, } #[allow(clippy::large_enum_variant, reason = "TODO")] pub enum UploadQueueStopped { Deletable(UploadQueueStoppedDeletable), Uninitialized, } #[derive(thiserror::Error, Debug)] pub enum NotInitialized { #[error("queue is in state Uninitialized")] Uninitialized, #[error("queue is in state Stopped")] Stopped, #[error("queue is shutting down")] ShuttingDown, } impl NotInitialized { pub(crate) fn is_stopping(&self) -> bool { use NotInitialized::*; match self { Uninitialized => false, Stopped => true, ShuttingDown => true, } } } impl UploadQueue { pub fn initialize_empty_remote( &mut self, metadata: &TimelineMetadata, inprogress_limit: usize, ) -> anyhow::Result<&mut UploadQueueInitialized> { match self { UploadQueue::Uninitialized => (), UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => { anyhow::bail!("already initialized, state {}", self.as_str()) } } info!("initializing upload queue for empty remote"); let index_part = IndexPart::empty(metadata.clone()); let state = UploadQueueInitialized { inprogress_limit, dirty: index_part.clone(), clean: (index_part, None), latest_files_changes_since_metadata_upload_scheduled: 0, visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are boring default initializations task_counter: 0, inprogress_tasks: HashMap::new(), queued_operations: VecDeque::new(), #[cfg(feature = "testing")] dangling_files: HashMap::new(), recently_deleted: HashSet::new(), blocked_deletions: Vec::new(), shutting_down: false, shutdown_ready: 
Arc::new(tokio::sync::Semaphore::new(0)), }; *self = UploadQueue::Initialized(state); Ok(self.initialized_mut().expect("we just set it")) } pub fn initialize_with_current_remote_index_part( &mut self, index_part: &IndexPart, inprogress_limit: usize, ) -> anyhow::Result<&mut UploadQueueInitialized> { match self { UploadQueue::Uninitialized => (), UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => { anyhow::bail!("already initialized, state {}", self.as_str()) } } info!( "initializing upload queue with remote index_part.disk_consistent_lsn: {}", index_part.metadata.disk_consistent_lsn() ); let state = UploadQueueInitialized { inprogress_limit, dirty: index_part.clone(), clean: (index_part.clone(), None), latest_files_changes_since_metadata_upload_scheduled: 0, visible_remote_consistent_lsn: Arc::new( index_part.metadata.disk_consistent_lsn().into(), ), // what follows are boring default initializations task_counter: 0, inprogress_tasks: HashMap::new(), queued_operations: VecDeque::new(), #[cfg(feature = "testing")] dangling_files: HashMap::new(), recently_deleted: HashSet::new(), blocked_deletions: Vec::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), }; *self = UploadQueue::Initialized(state); Ok(self.initialized_mut().expect("we just set it")) } pub fn initialized_mut(&mut self) -> Result<&mut UploadQueueInitialized, NotInitialized> { use UploadQueue::*; match self { Uninitialized => Err(NotInitialized::Uninitialized), Initialized(x) => { if x.shutting_down { Err(NotInitialized::ShuttingDown) } else { Ok(x) } } Stopped(_) => Err(NotInitialized::Stopped), } } pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStoppedDeletable> { match self { UploadQueue::Initialized(_) | UploadQueue::Uninitialized => { anyhow::bail!("queue is in state {}", self.as_str()) } UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => { anyhow::bail!("queue is in state Stopped(Uninitialized)") } 
UploadQueue::Stopped(UploadQueueStopped::Deletable(deletable)) => Ok(deletable), } } } /// An in-progress upload or delete task. #[derive(Debug)] pub struct UploadTask { /// Unique ID of this task. Used as the key in `inprogress_tasks` above. pub task_id: u64, /// Number of task retries. pub retries: AtomicU32, /// The upload operation. pub op: UploadOp, /// Any upload operations that were coalesced into this operation. This typically happens with /// back-to-back index uploads, see `UploadQueueInitialized::next_ready()`. pub coalesced_ops: Vec, } /// A deletion of some layers within the lifetime of a timeline. This is not used /// for timeline deletion, which skips this queue and goes directly to DeletionQueue. #[derive(Debug, Clone)] pub struct Delete { pub layers: Vec<(LayerName, LayerFileMetadata)>, } #[derive(Clone, Debug)] pub enum UploadOp { /// Upload a layer file. The last field indicates the last operation for thie file. UploadLayer(ResidentLayer, LayerFileMetadata, Option), /// Upload a index_part.json file UploadMetadata { /// The next [`UploadQueueInitialized::clean`] after this upload succeeds. uploaded: Box, }, /// Delete layer files Delete(Delete), /// Barrier. When the barrier operation is reached, the channel is closed. Barrier(tokio::sync::watch::Sender<()>), /// Shutdown; upon encountering this operation no new operations will be spawned, otherwise /// this is the same as a Barrier. Shutdown, } impl std::fmt::Display for UploadOp { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { UploadOp::UploadLayer(layer, metadata, mode) => { write!( f, "UploadLayer({}, size={:?}, gen={:?}, mode={:?})", layer, metadata.file_size, metadata.generation, mode ) } UploadOp::UploadMetadata { uploaded, .. 
} => { write!( f, "UploadMetadata(lsn: {})", uploaded.metadata.disk_consistent_lsn() ) } UploadOp::Delete(delete) => { write!(f, "Delete({} layers)", delete.layers.len()) } UploadOp::Barrier(_) => write!(f, "Barrier"), UploadOp::Shutdown => write!(f, "Shutdown"), } } } impl UploadOp { /// Returns true if self can bypass other, i.e. if the operations don't conflict. index is the /// active index when other would be uploaded -- if we allow self to bypass other, this would /// be the active index when self is uploaded. pub fn can_bypass(&self, other: &UploadOp, index: &IndexPart) -> bool { match (self, other) { // Nothing can bypass a barrier or shutdown, and it can't bypass anything. (UploadOp::Barrier(_), _) | (_, UploadOp::Barrier(_)) => false, (UploadOp::Shutdown, _) | (_, UploadOp::Shutdown) => false, // Uploads and deletes can bypass each other unless they're for the same file. (UploadOp::UploadLayer(a, ameta, _), UploadOp::UploadLayer(b, bmeta, _)) => { let aname = &a.layer_desc().layer_name(); let bname = &b.layer_desc().layer_name(); !is_same_remote_layer_path(aname, ameta, bname, bmeta) } (UploadOp::UploadLayer(u, umeta, _), UploadOp::Delete(d)) | (UploadOp::Delete(d), UploadOp::UploadLayer(u, umeta, _)) => { d.layers.iter().all(|(dname, dmeta)| { !is_same_remote_layer_path(&u.layer_desc().layer_name(), umeta, dname, dmeta) }) } // Deletes are idempotent and can always bypass each other. (UploadOp::Delete(_), UploadOp::Delete(_)) => true, // Uploads and deletes can bypass an index upload as long as neither the uploaded index // nor the active index below it references the file. A layer can't be modified or // deleted while referenced by an index. // // Similarly, index uploads can bypass uploads and deletes as long as neither the // uploaded index nor the active index references the file (the latter would be // incorrect use by the caller). 
(UploadOp::UploadLayer(u, umeta, _), UploadOp::UploadMetadata { uploaded: i }) | (UploadOp::UploadMetadata { uploaded: i }, UploadOp::UploadLayer(u, umeta, _)) => { let uname = u.layer_desc().layer_name(); !i.references(&uname, umeta) && !index.references(&uname, umeta) } (UploadOp::Delete(d), UploadOp::UploadMetadata { uploaded: i }) | (UploadOp::UploadMetadata { uploaded: i }, UploadOp::Delete(d)) => { d.layers.iter().all(|(dname, dmeta)| { !i.references(dname, dmeta) && !index.references(dname, dmeta) }) } // Indexes can never bypass each other. They can coalesce though, and // `UploadQueue::next_ready()` currently does this when possible. (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. }) => false, } } } #[cfg(test)] mod tests { use std::str::FromStr as _; use itertools::Itertools as _; use utils::shard::{ShardCount, ShardIndex, ShardNumber}; use super::*; use crate::DEFAULT_PG_VERSION; use crate::tenant::Timeline; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::Layer; use crate::tenant::storage_layer::layer::local_layer_path; /// Test helper which asserts that two operations are the same, in lieu of UploadOp PartialEq. #[track_caller] fn assert_same_op(a: &UploadOp, b: &UploadOp) { use UploadOp::*; match (a, b) { (UploadLayer(a, ameta, atype), UploadLayer(b, bmeta, btype)) => { assert_eq!(a.layer_desc().layer_name(), b.layer_desc().layer_name()); assert_eq!(ameta, bmeta); assert_eq!(atype, btype); } (Delete(a), Delete(b)) => assert_eq!(a.layers, b.layers), (UploadMetadata { uploaded: a }, UploadMetadata { uploaded: b }) => assert_eq!(a, b), (Barrier(_), Barrier(_)) => {} (Shutdown, Shutdown) => {} (a, b) => panic!("{a:?} != {b:?}"), } } /// Test helper which asserts that two sets of operations are the same. 
#[track_caller]
fn assert_same_ops<'a>(
    a: impl IntoIterator<Item = &'a UploadOp>,
    b: impl IntoIterator<Item = &'a UploadOp>,
) {
    // zip_eq panics if the two sequences differ in length.
    a.into_iter()
        .zip_eq(b)
        .for_each(|(a, b)| assert_same_op(a, b))
}

/// Test helper to construct a test timeline.
///
/// TODO: it really shouldn't be necessary to construct an entire tenant and timeline just to
/// test the upload queue -- decouple ResidentLayer from Timeline.
///
/// TODO: the upload queue uses TimelineMetadata::example() instead, because there's no way to
/// obtain a TimelineMetadata from a Timeline.
fn make_timeline() -> Arc<Timeline> {
    // Grab the current test name from the current thread name.
    // TODO: TenantHarness shouldn't take a &'static str, but just leak the test name for now.
    let test_name = std::thread::current().name().unwrap().to_string();
    let test_name = Box::leak(test_name.into_boxed_str());

    // The callers are synchronous tests, so drive the async setup on a private runtime.
    let runtime = tokio::runtime::Builder::new_current_thread()
        .enable_all()
        .build()
        .expect("failed to create runtime");

    runtime
        .block_on(async {
            let harness = TenantHarness::create(test_name).await?;
            let (tenant, ctx) = harness.load().await;
            tenant
                .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
                .await
        })
        .expect("failed to create timeline")
}

/// Test helper to construct an (empty) resident layer.
fn make_layer(timeline: &Arc<Timeline>, name: &str) -> ResidentLayer {
    make_layer_with_size(timeline, name, 0)
}

/// Test helper to construct a resident layer with the given size.
fn make_layer_with_size(timeline: &Arc<Timeline>, name: &str, size: usize) -> ResidentLayer {
    let metadata = LayerFileMetadata {
        generation: timeline.generation,
        shard: timeline.get_shard_index(),
        file_size: size as u64,
    };
    make_layer_with_metadata(timeline, name, metadata)
}

/// Test helper to construct a layer with the given metadata.
fn make_layer_with_metadata(
    timeline: &Arc<Timeline>,
    name: &str,
    metadata: LayerFileMetadata,
) -> ResidentLayer {
    // Panics on a malformed layer name: this is test-only code and the names are literals.
    let name = LayerName::from_str(name).expect("invalid name");
    let local_path = local_layer_path(
        timeline.conf,
        &timeline.tenant_shard_id,
        &timeline.timeline_id,
        &name,
        &metadata.generation,
    );
    // Write a zero-filled file of exactly `file_size` bytes at the layer's local path before
    // handing the path to `Layer::for_resident`.
    std::fs::write(&local_path, vec![0; metadata.file_size as usize])
        .expect("failed to write file");
    Layer::for_resident(timeline.conf, timeline, local_path, name, metadata)
}

/// Test helper to add a layer to an index and return a new index.
///
/// The input index is cloned and left untouched. An existing entry with the same layer name is
/// overwritten by the map insert.
fn index_with(index: &IndexPart, layer: &ResidentLayer) -> Box<IndexPart> {
    let mut index = index.clone();
    index
        .layer_metadata
        .insert(layer.layer_desc().layer_name(), layer.metadata());
    Box::new(index)
}

/// Test helper to remove a layer from an index and return a new index.
///
/// The input index is cloned and left untouched. The entry is removed by layer name only; the
/// layer's metadata is not consulted.
fn index_without(index: &IndexPart, layer: &ResidentLayer) -> Box<IndexPart> {
    let mut index = index.clone();
    index
        .layer_metadata
        .remove(&layer.layer_desc().layer_name());
    Box::new(index)
}

/// Nothing can bypass a barrier, and it can't bypass inprogress tasks.
#[test]
fn schedule_barrier() -> anyhow::Result<()> {
    // The second argument is the in-progress limit (cf. `schedule_inprogress_limit`).
    // NOTE(review): 0 appears to mean "no limit" -- confirm against `UploadQueue`.
    let mut queue = UploadQueue::Uninitialized;
    let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?;
    let tli = make_timeline();

    let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter
    let layer0 = make_layer(
        &tli,
        "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer1 = make_layer(
        &tli,
        "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer2 = make_layer(
        &tli,
        "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer3 = make_layer(
        &tli,
        "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );

    // Only the sender half is needed to build the barrier op; the receiver is dropped.
    let (barrier, _) = tokio::sync::watch::channel(());

    // Enqueue non-conflicting upload, delete, and index before and after a barrier.
    let ops = [
        UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
        UploadOp::Delete(Delete {
            layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())],
        }),
        UploadOp::UploadMetadata {
            uploaded: index.clone(),
        },
        UploadOp::Barrier(barrier),
        UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
        UploadOp::Delete(Delete {
            layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())],
        }),
        UploadOp::UploadMetadata {
            uploaded: index.clone(),
        },
    ];

    queue.queued_operations.extend(ops.clone());

    // Schedule the initial operations ahead of the barrier.
    let tasks = queue.schedule_ready();

    assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..3]);
    assert!(matches!(
        queue.queued_operations.front(),
        Some(&UploadOp::Barrier(_))
    ));

    // Complete the initial operations. The barrier isn't scheduled while they're pending.
    for task in tasks {
        assert!(queue.schedule_ready().is_empty());
        queue.complete(task.task_id);
    }

    // Schedule the barrier. The later tasks won't schedule until it completes.
    let tasks = queue.schedule_ready();

    assert_eq!(tasks.len(), 1);
    assert!(matches!(tasks[0].op, UploadOp::Barrier(_)));
    assert_eq!(queue.queued_operations.len(), 3);

    // Complete the barrier. The rest of the tasks schedule immediately.
    queue.complete(tasks[0].task_id);

    let tasks = queue.schedule_ready();
    assert_same_ops(tasks.iter().map(|t| &t.op), &ops[4..]);
    assert!(queue.queued_operations.is_empty());

    Ok(())
}

/// Deletes can be scheduled in parallel, even if they're for the same file.
#[test]
fn schedule_delete_parallel() -> anyhow::Result<()> {
    let mut queue = UploadQueue::Uninitialized;
    let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?;
    let tli = make_timeline();

    // Enqueue a bunch of deletes, some with conflicting names.
    let layer0 = make_layer(
        &tli,
        "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer1 = make_layer(
        &tli,
        "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer2 = make_layer(
        &tli,
        "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer3 = make_layer(
        &tli,
        "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );

    // layer1 and layer2 each appear in two different delete operations.
    let ops = [
        UploadOp::Delete(Delete {
            layers: vec![(layer0.layer_desc().layer_name(), layer0.metadata())],
        }),
        UploadOp::Delete(Delete {
            layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())],
        }),
        UploadOp::Delete(Delete {
            layers: vec![
                (layer1.layer_desc().layer_name(), layer1.metadata()),
                (layer2.layer_desc().layer_name(), layer2.metadata()),
            ],
        }),
        UploadOp::Delete(Delete {
            layers: vec![(layer2.layer_desc().layer_name(), layer2.metadata())],
        }),
        UploadOp::Delete(Delete {
            layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())],
        }),
    ];

    queue.queued_operations.extend(ops.clone());

    // Schedule all ready operations. Since deletes don't conflict, they're all scheduled.
    let tasks = queue.schedule_ready();

    assert_same_ops(tasks.iter().map(|t| &t.op), &ops);
    assert!(queue.queued_operations.is_empty());

    Ok(())
}

/// Conflicting uploads are serialized.
#[test]
fn schedule_upload_conflicts() -> anyhow::Result<()> {
    let mut queue = UploadQueue::Uninitialized;
    let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
    let tli = make_timeline();

    // Enqueue three versions of the same layer, with different file sizes.
    let layer0a = make_layer_with_size(
        &tli,
        "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
        1,
    );
    let layer0b = make_layer_with_size(
        &tli,
        "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
        2,
    );
    let layer0c = make_layer_with_size(
        &tli,
        "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
        3,
    );

    let ops = [
        UploadOp::UploadLayer(layer0a.clone(), layer0a.metadata(), None),
        UploadOp::UploadLayer(layer0b.clone(), layer0b.metadata(), None),
        UploadOp::UploadLayer(layer0c.clone(), layer0c.metadata(), None),
    ];

    queue.queued_operations.extend(ops.clone());

    // Only one version should be scheduled and uploaded at a time.
    for op in ops {
        let tasks = queue.schedule_ready();
        assert_eq!(tasks.len(), 1);
        assert_same_op(&tasks[0].op, &op);
        queue.complete(tasks[0].task_id);
    }
    assert!(queue.schedule_ready().is_empty());
    assert!(queue.queued_operations.is_empty());

    Ok(())
}

/// Conflicting uploads and deletes are serialized.
#[test]
fn schedule_upload_delete_conflicts() -> anyhow::Result<()> {
    let mut queue = UploadQueue::Uninitialized;
    let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
    let tli = make_timeline();

    // Enqueue two layer uploads, with a delete of both layers in between them. These should be
    // scheduled one at a time, since deletes can't bypass uploads and vice versa.
    let layer0 = make_layer(
        &tli,
        "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer1 = make_layer(
        &tli,
        "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );

    let ops = [
        UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
        UploadOp::Delete(Delete {
            layers: vec![
                (layer0.layer_desc().layer_name(), layer0.metadata()),
                (layer1.layer_desc().layer_name(), layer1.metadata()),
            ],
        }),
        UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
    ];

    queue.queued_operations.extend(ops.clone());

    // Only one operation should be scheduled at a time, in queue order.
    for op in ops {
        let tasks = queue.schedule_ready();
        assert_eq!(tasks.len(), 1);
        assert_same_op(&tasks[0].op, &op);
        queue.complete(tasks[0].task_id);
    }
    assert!(queue.schedule_ready().is_empty());
    assert!(queue.queued_operations.is_empty());

    Ok(())
}

/// Non-conflicting uploads and deletes can bypass the queue, avoiding the conflicting
/// delete/upload operations at the head of the queue.
#[test]
fn schedule_upload_delete_conflicts_bypass() -> anyhow::Result<()> {
    let mut queue = UploadQueue::Uninitialized;
    let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
    let tli = make_timeline();

    // Enqueue two layer uploads, with a delete of both layers in between them. These should be
    // scheduled one at a time, since deletes can't bypass uploads and vice versa.
    //
    // Also enqueue non-conflicting uploads and deletes at the end. These can bypass the queue
    // and run immediately.
    let layer0 = make_layer(
        &tli,
        "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer1 = make_layer(
        &tli,
        "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer2 = make_layer(
        &tli,
        "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer3 = make_layer(
        &tli,
        "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );

    let ops = [
        UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
        UploadOp::Delete(Delete {
            layers: vec![
                (layer0.layer_desc().layer_name(), layer0.metadata()),
                (layer1.layer_desc().layer_name(), layer1.metadata()),
            ],
        }),
        UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
        UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
        UploadOp::Delete(Delete {
            layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())],
        }),
    ];

    queue.queued_operations.extend(ops.clone());

    // Operations 0, 3, and 4 are scheduled immediately.
    let tasks = queue.schedule_ready();
    assert_same_ops(tasks.iter().map(|t| &t.op), [&ops[0], &ops[3], &ops[4]]);
    // Operations 1 and 2 (the conflicting delete and the upload behind it) stay queued.
    assert_eq!(queue.queued_operations.len(), 2);

    Ok(())
}

/// Non-conflicting uploads are parallelized.
#[test]
fn schedule_upload_parallel() -> anyhow::Result<()> {
    let mut queue = UploadQueue::Uninitialized;
    let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
    let tli = make_timeline();

    // Enqueue three different layer uploads.
    let layer0 = make_layer(
        &tli,
        "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer1 = make_layer(
        &tli,
        "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer2 = make_layer(
        &tli,
        "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );

    let ops = [
        UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
        UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
        UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
    ];

    queue.queued_operations.extend(ops.clone());

    // All uploads should be scheduled concurrently.
    let tasks = queue.schedule_ready();

    assert_same_ops(tasks.iter().map(|t| &t.op), &ops);
    assert!(queue.queued_operations.is_empty());

    Ok(())
}

/// Index uploads are coalesced.
#[test]
fn schedule_index_coalesce() -> anyhow::Result<()> {
    let mut queue = UploadQueue::Uninitialized;
    let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;

    // Enqueue three uploads of the current empty index.
    let index = Box::new(queue.clean.0.clone());

    let ops = [
        UploadOp::UploadMetadata {
            uploaded: index.clone(),
        },
        UploadOp::UploadMetadata {
            uploaded: index.clone(),
        },
        UploadOp::UploadMetadata {
            uploaded: index.clone(),
        },
    ];

    queue.queued_operations.extend(ops.clone());

    // The index uploads are coalesced into a single operation.
    // The scheduled task carries the last index; the earlier two are recorded on the task
    // as `coalesced_ops`.
    let tasks = queue.schedule_ready();
    assert_eq!(tasks.len(), 1);
    assert_same_op(&tasks[0].op, &ops[2]);
    assert_same_ops(&tasks[0].coalesced_ops, &ops[0..2]);

    assert!(queue.queued_operations.is_empty());

    Ok(())
}

/// Chains of upload/index operations lead to parallel layer uploads and serial index uploads.
/// This is the common case with layer flushes.
#[test]
fn schedule_index_upload_chain() -> anyhow::Result<()> {
    let mut queue = UploadQueue::Uninitialized;
    let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
    let tli = make_timeline();

    // Enqueue three layer uploads, each followed by an upload of an index that additionally
    // references the new layer (index0 ⊂ index1 ⊂ index2).
    let index = Box::new(queue.clean.0.clone());
    let layer0 = make_layer(
        &tli,
        "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let index0 = index_with(&index, &layer0);
    let layer1 = make_layer(
        &tli,
        "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let index1 = index_with(&index0, &layer1);
    let layer2 = make_layer(
        &tli,
        "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let index2 = index_with(&index1, &layer2);

    let ops = [
        UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
        UploadOp::UploadMetadata {
            uploaded: index0.clone(),
        },
        UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
        UploadOp::UploadMetadata {
            uploaded: index1.clone(),
        },
        UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
        UploadOp::UploadMetadata {
            uploaded: index2.clone(),
        },
    ];

    queue.queued_operations.extend(ops.clone());

    // The layer uploads should be scheduled immediately. The indexes must wait.
    let upload_tasks = queue.schedule_ready();
    assert_same_ops(
        upload_tasks.iter().map(|t| &t.op),
        [&ops[0], &ops[2], &ops[4]],
    );

    // layer2 completes first. None of the indexes can upload yet.
    queue.complete(upload_tasks[2].task_id);
    assert!(queue.schedule_ready().is_empty());

    // layer0 completes. index0 can upload. It completes.
    queue.complete(upload_tasks[0].task_id);
    let index_tasks = queue.schedule_ready();
    assert_eq!(index_tasks.len(), 1);
    assert_same_op(&index_tasks[0].op, &ops[1]);
    queue.complete(index_tasks[0].task_id);

    // layer 1 completes. This unblocks index 1 and 2, which coalesce into
    // a single upload for index 2.
    queue.complete(upload_tasks[1].task_id);

    let index_tasks = queue.schedule_ready();
    assert_eq!(index_tasks.len(), 1);
    assert_same_op(&index_tasks[0].op, &ops[5]);
    // `ops[3..4]` is the single index1 upload that was folded into the index2 task.
    assert_same_ops(&index_tasks[0].coalesced_ops, &ops[3..4]);

    assert!(queue.queued_operations.is_empty());

    Ok(())
}

/// A delete can't bypass an index upload if an index ahead of it still references it.
#[test]
fn schedule_index_delete_dereferenced() -> anyhow::Result<()> {
    let mut queue = UploadQueue::Uninitialized;
    let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
    let tli = make_timeline();

    // Create a layer to upload.
    let layer = make_layer(
        &tli,
        "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let index_upload = index_with(&queue.clean.0, &layer);

    // Remove the layer reference in a new index, then delete the layer.
    let index_deref = index_without(&index_upload, &layer);

    let ops = [
        // Initial upload, with a barrier to prevent index coalescing.
        UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
        UploadOp::UploadMetadata {
            uploaded: index_upload.clone(),
        },
        UploadOp::Barrier(tokio::sync::watch::channel(()).0),
        // Dereference the layer and delete it.
        UploadOp::UploadMetadata {
            uploaded: index_deref.clone(),
        },
        UploadOp::Delete(Delete {
            layers: vec![(layer.layer_desc().layer_name(), layer.metadata())],
        }),
    ];

    queue.queued_operations.extend(ops.clone());

    // Operations are serialized.
    for op in ops {
        let tasks = queue.schedule_ready();
        assert_eq!(tasks.len(), 1);
        assert_same_op(&tasks[0].op, &op);
        queue.complete(tasks[0].task_id);
    }
    assert!(queue.queued_operations.is_empty());

    Ok(())
}

/// An upload with a reused layer name doesn't clobber the previous layer. Specifically, a
/// dereference/upload/reference cycle can't allow the upload to bypass the reference.
#[test]
fn schedule_index_upload_dereferenced() -> anyhow::Result<()> {
    let mut queue = UploadQueue::Uninitialized;
    let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
    let tli = make_timeline();

    // Create a layer to upload.
    let layer = make_layer(
        &tli,
        "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );

    // Upload the layer. Then dereference the layer, and upload/reference it again.
    let index_upload = index_with(&queue.clean.0, &layer);
    let index_deref = index_without(&index_upload, &layer);
    let index_ref = index_with(&index_deref, &layer);

    let ops = [
        // Initial upload, with a barrier to prevent index coalescing.
        UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
        UploadOp::UploadMetadata {
            uploaded: index_upload.clone(),
        },
        UploadOp::Barrier(tokio::sync::watch::channel(()).0),
        // Dereference the layer.
        UploadOp::UploadMetadata {
            uploaded: index_deref.clone(),
        },
        // Replace and reference the layer.
        UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
        UploadOp::UploadMetadata {
            uploaded: index_ref.clone(),
        },
    ];

    queue.queued_operations.extend(ops.clone());

    // Operations are serialized.
    // Each iteration expects exactly one task -- the next op in queue order -- and completes
    // it before asking for more.
    for op in ops {
        let tasks = queue.schedule_ready();
        assert_eq!(tasks.len(), 1);
        assert_same_op(&tasks[0].op, &op);
        queue.complete(tasks[0].task_id);
    }
    assert!(queue.queued_operations.is_empty());

    Ok(())
}

/// Nothing can bypass a shutdown, and it waits for inprogress tasks. It's never returned from
/// next_ready(), but is left at the head of the queue.
#[test]
fn schedule_shutdown() -> anyhow::Result<()> {
    let mut queue = UploadQueue::Uninitialized;
    let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?;
    let tli = make_timeline();

    let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter
    let layer0 = make_layer(
        &tli,
        "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer1 = make_layer(
        &tli,
        "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer2 = make_layer(
        &tli,
        "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer3 = make_layer(
        &tli,
        "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );

    // Enqueue non-conflicting upload, delete, and index before and after a shutdown.
    let ops = [
        UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
        UploadOp::Delete(Delete {
            layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())],
        }),
        UploadOp::UploadMetadata {
            uploaded: index.clone(),
        },
        UploadOp::Shutdown,
        UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
        UploadOp::Delete(Delete {
            layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())],
        }),
        UploadOp::UploadMetadata {
            uploaded: index.clone(),
        },
    ];

    queue.queued_operations.extend(ops.clone());

    // Schedule the initial operations ahead of the shutdown.
    let tasks = queue.schedule_ready();

    assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..3]);
    assert!(matches!(
        queue.queued_operations.front(),
        Some(&UploadOp::Shutdown)
    ));

    // Complete the initial operations. The shutdown isn't triggered while they're pending.
    for task in tasks {
        assert!(queue.schedule_ready().is_empty());
        queue.complete(task.task_id);
    }

    // The shutdown is triggered the next time we try to pull an operation. It isn't returned,
    // but is left in the queue.
    // The trigger is observed through `shutdown_ready` flipping from open to closed.
    assert!(!queue.shutdown_ready.is_closed());
    assert!(queue.next_ready().is_none());
    assert!(queue.shutdown_ready.is_closed());

    Ok(())
}

/// Scheduling respects inprogress_limit.
#[test]
fn schedule_inprogress_limit() -> anyhow::Result<()> {
    // Create a queue with inprogress_limit=2.
    let mut queue = UploadQueue::Uninitialized;
    let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 2)?;
    let tli = make_timeline();

    // Enqueue a bunch of uploads.
    let layer0 = make_layer(
        &tli,
        "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer1 = make_layer(
        &tli,
        "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer2 = make_layer(
        &tli,
        "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );
    let layer3 = make_layer(
        &tli,
        "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
    );

    let ops = [
        UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
        UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
        UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
        UploadOp::UploadLayer(layer3.clone(), layer3.metadata(), None),
    ];

    queue.queued_operations.extend(ops.clone());

    // Schedule all ready operations. Only 2 are scheduled.
    let tasks = queue.schedule_ready();
    assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..2]);
    assert!(queue.next_ready().is_none());

    // When one completes, another is scheduled.
    queue.complete(tasks[0].task_id);
    let tasks = queue.schedule_ready();
    assert_same_ops(tasks.iter().map(|t| &t.op), &ops[2..3]);

    Ok(())
}

/// Tests that can_bypass takes name, generation and shard index into account for all operations.
#[test] fn can_bypass_path() -> anyhow::Result<()> { let tli = make_timeline(); let name0 = &"000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"; let name1 = &"100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"; // Asserts that layers a and b either can or can't bypass each other, for all combinations // of operations (except Delete and UploadMetadata which are special-cased). #[track_caller] fn assert_can_bypass(a: ResidentLayer, b: ResidentLayer, can_bypass: bool) { let index = IndexPart::empty(TimelineMetadata::example()); for (a, b) in make_ops(a).into_iter().zip(make_ops(b)) { match (&a, &b) { // Deletes can always bypass each other. (UploadOp::Delete(_), UploadOp::Delete(_)) => assert!(a.can_bypass(&b, &index)), // Indexes can never bypass each other. (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. }) => { assert!(!a.can_bypass(&b, &index)) } // For other operations, assert as requested. (a, b) => assert_eq!(a.can_bypass(b, &index), can_bypass), } } } fn make_ops(layer: ResidentLayer) -> Vec { let mut index = IndexPart::empty(TimelineMetadata::example()); index .layer_metadata .insert(layer.layer_desc().layer_name(), layer.metadata()); vec![ UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), UploadOp::Delete(Delete { layers: vec![(layer.layer_desc().layer_name(), layer.metadata())], }), UploadOp::UploadMetadata { uploaded: Box::new(index), }, ] } // Makes a ResidentLayer. let layer = |name: &'static str, shard: Option, generation: u32| -> ResidentLayer { let shard = shard .map(|n| ShardIndex::new(ShardNumber(n), ShardCount(8))) .unwrap_or(ShardIndex::unsharded()); let metadata = LayerFileMetadata { shard, generation: Generation::Valid(generation), file_size: 0, }; make_layer_with_metadata(&tli, name, metadata) }; // Same name and metadata can't bypass. 
This goes both for unsharded and sharded, as well as // 0 or >0 generation. assert_can_bypass(layer(name0, None, 0), layer(name0, None, 0), false); assert_can_bypass(layer(name0, Some(0), 0), layer(name0, Some(0), 0), false); assert_can_bypass(layer(name0, None, 1), layer(name0, None, 1), false); // Different names can bypass. assert_can_bypass(layer(name0, None, 0), layer(name1, None, 0), true); // Different shards can bypass. Shard 0 is different from unsharded. assert_can_bypass(layer(name0, Some(0), 0), layer(name0, Some(1), 0), true); assert_can_bypass(layer(name0, Some(0), 0), layer(name0, None, 0), true); // Different generations can bypass, both sharded and unsharded. assert_can_bypass(layer(name0, None, 0), layer(name0, None, 1), true); assert_can_bypass(layer(name0, Some(1), 0), layer(name0, Some(1), 1), true); Ok(()) } } ================================================ FILE: pageserver/src/tenant/vectored_blob_io.rs ================================================ //! //! Utilities for vectored reading of variable-sized "blobs". //! //! The "blob" api is an abstraction on top of the "block" api, //! with the main difference being that blobs do not have a fixed //! size (each blob is prefixed with 1 or 4 byte length field) //! //! The vectored apis provided in this module allow for planning //! and executing disk IO which covers multiple blobs. //! //! Reads are planned with [`VectoredReadPlanner`] which will coalesce //! adjacent blocks into a single disk IO request and exectuted by //! [`VectoredBlobReader`] which does all the required offset juggling //! and returns a buffer housing all the blobs and a list of offsets. //! //! Note that the vectored blob api does *not* go through the page cache. 
use std::collections::BTreeMap; use std::ops::Deref; use bytes::Bytes; use pageserver_api::key::Key; use tokio::io::AsyncWriteExt; use tokio_epoll_uring::BoundedBuf; use utils::lsn::Lsn; use utils::vec_map::VecMap; use crate::context::RequestContext; use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, Header}; use crate::virtual_file::{self, IoBufferMut, VirtualFile}; /// Metadata bundled with the start and end offset of a blob. #[derive(Copy, Clone, Debug)] pub struct BlobMeta { pub key: Key, pub lsn: Lsn, pub will_init: bool, } /// A view into the vectored blobs read buffer. #[derive(Clone, Debug)] pub(crate) enum BufView<'a> { Slice(&'a [u8]), Bytes(bytes::Bytes), } impl<'a> BufView<'a> { /// Creates a new slice-based view on the blob. pub fn new_slice(slice: &'a [u8]) -> Self { Self::Slice(slice) } /// Creates a new [`bytes::Bytes`]-based view on the blob. pub fn new_bytes(bytes: bytes::Bytes) -> Self { Self::Bytes(bytes) } /// Convert the view into `Bytes`. /// /// If using slice as the underlying storage, the copy will be an O(n) operation. pub fn into_bytes(self) -> Bytes { match self { BufView::Slice(slice) => Bytes::copy_from_slice(slice), BufView::Bytes(bytes) => bytes, } } /// Creates a sub-view of the blob based on the range. 
fn view(&self, range: std::ops::Range) -> Self { match self { BufView::Slice(slice) => BufView::Slice(&slice[range]), BufView::Bytes(bytes) => BufView::Bytes(bytes.slice(range)), } } } impl Deref for BufView<'_> { type Target = [u8]; fn deref(&self) -> &Self::Target { match self { BufView::Slice(slice) => slice, BufView::Bytes(bytes) => bytes, } } } impl AsRef<[u8]> for BufView<'_> { fn as_ref(&self) -> &[u8] { match self { BufView::Slice(slice) => slice, BufView::Bytes(bytes) => bytes.as_ref(), } } } impl<'a> From<&'a [u8]> for BufView<'a> { fn from(value: &'a [u8]) -> Self { Self::new_slice(value) } } impl From for BufView<'_> { fn from(value: Bytes) -> Self { Self::new_bytes(value) } } /// Blob offsets into [`VectoredBlobsBuf::buf`]. The byte ranges is potentially compressed, /// subject to [`VectoredBlob::compression_bits`]. pub struct VectoredBlob { /// Blob metadata. pub meta: BlobMeta, /// Header start offset. header_start: usize, /// Data start offset. data_start: usize, /// End offset. end: usize, /// Compression used on the data, extracted from the header. compression_bits: u8, } impl VectoredBlob { /// Reads a decompressed view of the blob. pub(crate) async fn read<'a>(&self, buf: &BufView<'a>) -> Result, std::io::Error> { let view = buf.view(self.data_start..self.end); match self.compression_bits { BYTE_UNCOMPRESSED => Ok(view), BYTE_ZSTD => { let mut decompressed_vec = Vec::new(); let mut decoder = async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec); decoder.write_all(&view).await?; decoder.flush().await?; // Zero-copy conversion from `Vec` to `Bytes` Ok(BufView::new_bytes(Bytes::from(decompressed_vec))) } bits => { let error = std::io::Error::new( std::io::ErrorKind::InvalidData, format!( "Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", self.meta.key, self.meta.lsn, self.data_start, self.end ), ); Err(error) } } } /// Returns the raw blob including header. 
pub(crate) fn raw_with_header<'a>(&self, buf: &BufView<'a>) -> BufView<'a> {
        // `header_start..end` covers the header followed by the (possibly compressed) data.
        buf.view(self.header_start..self.end)
    }
}

impl std::fmt::Display for VectoredBlob {
    /// Formats as `key@lsn, data_start..end`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{}@{}, {}..{}",
            self.meta.key, self.meta.lsn, self.data_start, self.end
        )
    }
}

/// Return type of [`VectoredBlobReader::read_blobs`]
pub struct VectoredBlobsBuf {
    /// Buffer for all blobs in this read
    pub buf: IoBufferMut,
    /// Offsets into the buffer and metadata for all blobs in this read
    pub blobs: Vec<VectoredBlob>,
}

/// Description of one disk read for multiple blobs.
/// Used as the argument for [`VectoredBlobReader::read_blobs`]
#[derive(Debug)]
pub struct VectoredRead {
    pub start: u64,
    pub end: u64,
    /// Start offset and metadata for each blob in this read
    pub blobs_at: VecMap<u64, BlobMeta>,
}

impl VectoredRead {
    /// Size of the read in bytes (`end - start`).
    pub(crate) fn size(&self) -> usize {
        (self.end - self.start) as usize
    }
}

/// Outcome of [`ChunkedVectoredReadBuilder::extend`]: whether the blob was added to the read.
#[derive(Eq, PartialEq, Debug)]
pub(crate) enum VectoredReadExtended {
    Yes,
    No,
}

/// A vectored read builder that tries to coalesce all reads that fit in a chunk.
pub(crate) struct ChunkedVectoredReadBuilder {
    /// Start block number
    start_blk_no: usize,
    /// End block number (exclusive).
    end_blk_no: usize,
    /// Start offset and metadata for each blob in this read
    blobs_at: VecMap<u64, BlobMeta>,
    /// Upper bound on the coalesced read size in bytes; `None` disables the check.
    max_read_size: Option<usize>,
}

impl ChunkedVectoredReadBuilder {
    /// Size of one chunk ("block") in bytes; reads are widened to multiples of this.
    const CHUNK_SIZE: usize = virtual_file::get_io_buffer_alignment();

    /// Start building a new vectored read.
    ///
    /// Note that by design, this does not check against reading more than `max_read_size` to
    /// support reading larger blobs than the configuration value. The builder will be single use
    /// however after that.
fn new_impl( start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: Option, ) -> Self { let mut blobs_at = VecMap::default(); blobs_at .append(start_offset, meta) .expect("First insertion always succeeds"); let start_blk_no = start_offset as usize / Self::CHUNK_SIZE; let end_blk_no = (end_offset as usize).div_ceil(Self::CHUNK_SIZE); Self { start_blk_no, end_blk_no, blobs_at, max_read_size, } } pub(crate) fn new( start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize, ) -> Self { Self::new_impl(start_offset, end_offset, meta, Some(max_read_size)) } pub(crate) fn new_streaming(start_offset: u64, end_offset: u64, meta: BlobMeta) -> Self { Self::new_impl(start_offset, end_offset, meta, None) } /// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk. /// /// The resulting size also must be below the max read size. pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { tracing::trace!(start, end, "trying to extend"); let start_blk_no = start as usize / Self::CHUNK_SIZE; let end_blk_no = (end as usize).div_ceil(Self::CHUNK_SIZE); let not_limited_by_max_read_size = { if let Some(max_read_size) = self.max_read_size { let coalesced_size = (end_blk_no - self.start_blk_no) * Self::CHUNK_SIZE; coalesced_size <= max_read_size } else { true } }; // True if the second block starts in the same block or the immediate next block where the first block ended. // // Note: This automatically handles the case where two blocks are adjacent to each other, // whether they starts on chunk size boundary or not. let is_adjacent_chunk_read = { // 1. first.end & second.start are in the same block self.end_blk_no == start_blk_no + 1 || // 2. 
first.end ends one block before second.start self.end_blk_no == start_blk_no }; if is_adjacent_chunk_read && not_limited_by_max_read_size { self.end_blk_no = end_blk_no; self.blobs_at .append(start, meta) .expect("LSNs are ordered within vectored reads"); return VectoredReadExtended::Yes; } VectoredReadExtended::No } pub(crate) fn size(&self) -> usize { (self.end_blk_no - self.start_blk_no) * Self::CHUNK_SIZE } pub(crate) fn build(self) -> VectoredRead { let start = (self.start_blk_no * Self::CHUNK_SIZE) as u64; let end = (self.end_blk_no * Self::CHUNK_SIZE) as u64; VectoredRead { start, end, blobs_at: self.blobs_at, } } } #[derive(Copy, Clone, Debug)] pub enum BlobFlag { None, Ignore, ReplaceAll, } /// Planner for vectored blob reads. /// /// Blob offsets are received via [`VectoredReadPlanner::handle`] /// and coalesced into disk reads. /// /// The implementation is very simple: /// * Collect all blob offsets in an ordered structure /// * Iterate over the collected blobs and coalesce them into reads at the end pub struct VectoredReadPlanner { // Track all the blob offsets. Start offsets must be ordered. // Values in the value tuples are: // ( // lsn of the blob, // start offset of the blob in the underlying file, // end offset of the blob in the underlying file, // whether the blob initializes the page image or not // see [`pageserver_api::record::NeonWalRecord::will_init`] // ) blobs: BTreeMap>, // Arguments for previous blob passed into [`VectoredReadPlanner::handle`] prev: Option<(Key, Lsn, u64, BlobFlag)>, max_read_size: usize, } impl VectoredReadPlanner { pub fn new(max_read_size: usize) -> Self { Self { blobs: BTreeMap::new(), prev: None, max_read_size, } } /// Include a new blob in the read plan. /// /// This function is called from a B-Tree index visitor (see `DeltaLayerInner::plan_reads` /// and `ImageLayerInner::plan_reads`). Said visitor wants to collect blob offsets for all /// keys in a given keyspace. 
This function must be called for each key in the desired /// keyspace (monotonically continuous). [`Self::handle_range_end`] must /// be called after every range in the offset. /// /// In the event that keys are skipped, the behaviour is undefined and can lead to an /// incorrect read plan. We can end up asserting, erroring in wal redo or returning /// incorrect data to the user. /// /// The `flag` argument has two interesting values: /// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs. /// This is used for WAL records that `will_init`. /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens /// if the blob is cached. pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) { // Implementation note: internally lag behind by one blob such that // we have a start and end offset when initialising [`VectoredRead`] let (prev_key, prev_lsn, prev_offset, prev_flag) = match self.prev { None => { self.prev = Some((key, lsn, offset, flag)); return; } Some(prev) => prev, }; self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag); self.prev = Some((key, lsn, offset, flag)); } pub fn handle_range_end(&mut self, offset: u64) { if let Some((prev_key, prev_lsn, prev_offset, prev_flag)) = self.prev { self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag); } self.prev = None; } fn add_blob(&mut self, key: Key, lsn: Lsn, start_offset: u64, end_offset: u64, flag: BlobFlag) { match flag { BlobFlag::None => { let blobs_for_key = self.blobs.entry(key).or_default(); blobs_for_key.push((lsn, start_offset, end_offset, false)); } BlobFlag::ReplaceAll => { let blobs_for_key = self.blobs.entry(key).or_default(); blobs_for_key.clear(); blobs_for_key.push((lsn, start_offset, end_offset, true)); } BlobFlag::Ignore => {} } } pub fn finish(self) -> Vec { let mut current_read_builder: Option = None; let mut reads = Vec::new(); for (key, blobs_for_key) in self.blobs { for (lsn, start_offset, 
end_offset, will_init) in blobs_for_key { let extended = match &mut current_read_builder { Some(read_builder) => read_builder.extend( start_offset, end_offset, BlobMeta { key, lsn, will_init, }, ), None => VectoredReadExtended::No, }; if extended == VectoredReadExtended::No { let next_read_builder = ChunkedVectoredReadBuilder::new( start_offset, end_offset, BlobMeta { key, lsn, will_init, }, self.max_read_size, ); let prev_read_builder = current_read_builder.replace(next_read_builder); // `current_read_builder` is None in the first iteration of the outer loop if let Some(read_builder) = prev_read_builder { reads.push(read_builder.build()); } } } } if let Some(read_builder) = current_read_builder { reads.push(read_builder.build()); } reads } } /// Disk reader for vectored blob spans (does not go through the page cache) pub struct VectoredBlobReader<'a> { file: &'a VirtualFile, } impl<'a> VectoredBlobReader<'a> { pub fn new(file: &'a VirtualFile) -> Self { Self { file } } /// Read the requested blobs into the buffer. /// /// We have to deal with the fact that blobs are not fixed size. /// Each blob is prefixed by a size header. /// /// The success return value is a struct which contains the buffer /// filled from disk and a list of offsets at which each blob lies /// in the buffer. pub async fn read_blobs( &self, read: &VectoredRead, buf: IoBufferMut, ctx: &RequestContext, ) -> Result { assert!(read.size() > 0); assert!( read.size() <= buf.capacity(), "{} > {}", read.size(), buf.capacity() ); if cfg!(debug_assertions) { const ALIGN: u64 = virtual_file::get_io_buffer_alignment() as u64; debug_assert_eq!( read.start % ALIGN, 0, "Read start at {} does not satisfy the required io buffer alignment ({} bytes)", read.start, ALIGN ); } let buf = self .file .read_exact_at(buf.slice(0..read.size()), read.start, ctx) .await? 
.into_inner(); let blobs_at = read.blobs_at.as_slice(); let mut blobs = Vec::with_capacity(blobs_at.len()); // Blobs in `read` only provide their starting offset. The end offset // of a blob is implicit: the start of the next blob if one exists // or the end of the read. for (blob_start, meta) in blobs_at.iter().copied() { let header_start = (blob_start - read.start) as usize; let header = Header::decode(&buf[header_start..]).map_err(|anyhow_err| { std::io::Error::new(std::io::ErrorKind::InvalidData, anyhow_err) })?; let data_start = header_start + header.header_len; let end = data_start + header.data_len; let compression_bits = header.compression_bits; blobs.push(VectoredBlob { header_start, data_start, end, meta, compression_bits, }); } Ok(VectoredBlobsBuf { buf, blobs }) } } /// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. /// /// It provides a streaming API for getting read blobs. It returns a batch when /// `handle` gets called and when the current key would just exceed the read_size and /// max_cnt constraints. pub struct StreamingVectoredReadPlanner { read_builder: Option, // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`] prev: Option<(Key, Lsn, u64, bool)>, /// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150, /// we will produce a single batch instead of split them. 
max_read_size: u64, /// Max item count per batch max_cnt: usize, /// Size of the current batch cnt: usize, } impl StreamingVectoredReadPlanner { pub fn new(max_read_size: u64, max_cnt: usize) -> Self { assert!(max_cnt > 0); assert!(max_read_size > 0); Self { read_builder: None, prev: None, max_cnt, max_read_size, cnt: 0, } } pub fn handle( &mut self, key: Key, lsn: Lsn, offset: u64, will_init: bool, ) -> Option { // Implementation note: internally lag behind by one blob such that // we have a start and end offset when initialising [`VectoredRead`] let (prev_key, prev_lsn, prev_offset, prev_will_init) = match self.prev { None => { self.prev = Some((key, lsn, offset, will_init)); return None; } Some(prev) => prev, }; let res = self.add_blob( prev_key, prev_lsn, prev_offset, offset, false, prev_will_init, ); self.prev = Some((key, lsn, offset, will_init)); res } pub fn handle_range_end(&mut self, offset: u64) -> Option { let res = if let Some((prev_key, prev_lsn, prev_offset, prev_will_init)) = self.prev { self.add_blob( prev_key, prev_lsn, prev_offset, offset, true, prev_will_init, ) } else { None }; self.prev = None; res } fn add_blob( &mut self, key: Key, lsn: Lsn, start_offset: u64, end_offset: u64, is_last_blob_in_read: bool, will_init: bool, ) -> Option { match &mut self.read_builder { Some(read_builder) => { let extended = read_builder.extend( start_offset, end_offset, BlobMeta { key, lsn, will_init, }, ); assert_eq!(extended, VectoredReadExtended::Yes); } None => { self.read_builder = { Some(ChunkedVectoredReadBuilder::new_streaming( start_offset, end_offset, BlobMeta { key, lsn, will_init, }, )) }; } } let read_builder = self.read_builder.as_mut().unwrap(); self.cnt += 1; if is_last_blob_in_read || read_builder.size() >= self.max_read_size as usize || self.cnt >= self.max_cnt { let prev_read_builder = self.read_builder.take(); self.cnt = 0; // `current_read_builder` is None in the first iteration if let Some(read_builder) = prev_read_builder { return 
Some(read_builder.build()); } } None } } #[cfg(test)] mod tests { use super::super::blob_io::tests::{random_array, write_maybe_compressed}; use super::*; use crate::context::DownloadBehavior; use crate::page_cache::PAGE_SZ; use crate::task_mgr::TaskKind; fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { const ALIGN: u64 = virtual_file::get_io_buffer_alignment() as u64; assert_eq!(read.start % ALIGN, 0); assert_eq!(read.start / ALIGN, offset_range.first().unwrap().2 / ALIGN); let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect(); let offsets_in_read: Vec<_> = read .blobs_at .as_slice() .iter() .map(|(offset, _)| *offset) .collect(); assert_eq!(expected_offsets_in_read, offsets_in_read); } #[test] fn planner_chunked_coalesce_all_test() { use crate::virtual_file; const CHUNK_SIZE: u64 = virtual_file::get_io_buffer_alignment() as u64; let max_read_size = CHUNK_SIZE as usize * 8; let key = Key::MIN; let lsn = Lsn(0); let blob_descriptions = [ (key, lsn, CHUNK_SIZE / 8, BlobFlag::None), // Read 1 BEGIN (key, lsn, CHUNK_SIZE / 4, BlobFlag::Ignore), // Gap (key, lsn, CHUNK_SIZE / 2, BlobFlag::None), (key, lsn, CHUNK_SIZE - 2, BlobFlag::Ignore), // Gap (key, lsn, CHUNK_SIZE, BlobFlag::None), (key, lsn, CHUNK_SIZE * 2 - 1, BlobFlag::None), (key, lsn, CHUNK_SIZE * 2 + 1, BlobFlag::Ignore), // Gap (key, lsn, CHUNK_SIZE * 3 + 1, BlobFlag::None), (key, lsn, CHUNK_SIZE * 5 + 1, BlobFlag::None), (key, lsn, CHUNK_SIZE * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce. 
/// Blobs are coalesced into a read only while the read stays within `max_read_size`;
/// blobs larger than the limit end up in a read of their own.
#[test]
fn planner_max_read_size_test() {
    let max_read_size = 128 * 1024;
    let key = Key::MIN;
    let lsn = Lsn(0);

    let blob_descriptions = vec![
        (key, lsn, 0, BlobFlag::None),
        (key, lsn, 32 * 1024, BlobFlag::None),
        (key, lsn, 96 * 1024, BlobFlag::None), // Last in read 1
        (key, lsn, 128 * 1024, BlobFlag::None), // Last in read 2
        (key, lsn, 198 * 1024, BlobFlag::None), // Last in read 3
        (key, lsn, 268 * 1024, BlobFlag::None), // Last in read 4
        (key, lsn, 396 * 1024, BlobFlag::None), // Last in read 5
        (key, lsn, 652 * 1024, BlobFlag::None), // Last in read 6
    ];

    // Expected grouping of the blobs above into reads.
    let expected_reads = [
        &blob_descriptions[0..3],
        &blob_descriptions[3..4],
        &blob_descriptions[4..5],
        &blob_descriptions[5..6],
        &blob_descriptions[6..7],
        &blob_descriptions[7..],
    ];

    let mut planner = VectoredReadPlanner::new(max_read_size);
    for &(key, lsn, offset, flag) in &blob_descriptions {
        planner.handle(key, lsn, offset, flag);
    }
    planner.handle_range_end(652 * 1024);

    let reads = planner.finish();

    // TODO: could remove zero reads to produce 5 reads here
    assert_eq!(reads.len(), 6);

    for (read, expected) in reads.iter().zip(expected_reads) {
        validate_read(read, expected);
    }
}
/// The streaming planner closes a batch as soon as its size reaches `max_read_size`; the
/// item-count limit is set high enough not to matter here.
#[test]
fn streaming_planner_max_read_size_test() {
    let max_read_size = 128 * 1024;
    let key = Key::MIN;
    let lsn = Lsn(0);

    let blob_descriptions = vec![
        (key, lsn, 0, BlobFlag::None),
        (key, lsn, 32 * 1024, BlobFlag::None),
        (key, lsn, 96 * 1024, BlobFlag::None),
        (key, lsn, 128 * 1024, BlobFlag::None),
        (key, lsn, 198 * 1024, BlobFlag::None),
        (key, lsn, 268 * 1024, BlobFlag::None),
        (key, lsn, 396 * 1024, BlobFlag::None),
        (key, lsn, 652 * 1024, BlobFlag::None),
    ];

    // Expected grouping of the blobs above into batches.
    let expected_batches = [
        &blob_descriptions[0..3],
        &blob_descriptions[3..5],
        &blob_descriptions[5..6],
        &blob_descriptions[6..7],
        &blob_descriptions[7..],
    ];

    let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1000);
    let mut reads = Vec::new();
    for &(key, lsn, offset, _) in &blob_descriptions {
        if let Some(batch) = planner.handle(key, lsn, offset, false) {
            reads.push(batch);
        }
    }
    if let Some(batch) = planner.handle_range_end(652 * 1024) {
        reads.push(batch);
    }

    assert_eq!(reads.len(), expected_batches.len());

    for (read, expected) in reads.iter().zip(expected_batches) {
        validate_read(read, expected);
    }
}
&[(key, lsn, 128 * 1024, BlobFlag::None)]); } { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); let mut reads = Vec::new(); reads.extend(planner.handle(key, lsn, 0, false)); reads.extend(planner.handle(key, lsn, 128 * 1024, false)); reads.extend(planner.handle_range_end(652 * 1024)); assert_eq!(reads.len(), 1); validate_read( &reads[0], &[ (key, lsn, 0, BlobFlag::None), (key, lsn, 128 * 1024, BlobFlag::None), ], ); } } async fn round_trip_test_compressed( blobs: &[Vec], compression: bool, ) -> anyhow::Result<()> { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let (_temp_dir, pathbuf, offsets) = write_maybe_compressed(blobs, compression, &ctx).await?; let file = VirtualFile::open_v2(&pathbuf, &ctx).await?; let file_len = std::fs::metadata(&pathbuf)?.len(); // Multiply by two (compressed data might need more space), and add a few bytes for the header let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16; let mut buf = IoBufferMut::with_capacity(reserved_bytes); let vectored_blob_reader = VectoredBlobReader::new(&file); let meta = BlobMeta { key: Key::MIN, lsn: Lsn(0), will_init: false, }; for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { let end = offsets.get(idx + 1).unwrap_or(&file_len); if idx + 1 == offsets.len() { continue; } let read_builder = ChunkedVectoredReadBuilder::new(*offset, *end, meta, 16 * 4096); let read = read_builder.build(); let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?; assert_eq!(result.blobs.len(), 1); let read_blob = &result.blobs[0]; let view = BufView::new_slice(&result.buf); let read_buf = read_blob.read(&view).await?; assert_eq!( &blob[..], &read_buf[..], "mismatch for idx={idx} at offset={offset}" ); // Check that raw_with_header returns a valid header. 
let raw = read_blob.raw_with_header(&view); let header = Header::decode(&raw)?; if !compression || header.header_len == 1 { assert_eq!(header.compression_bits, BYTE_UNCOMPRESSED); } assert_eq!(raw.len(), header.total_len()); buf = result.buf; } Ok(()) } #[tokio::test] async fn test_really_big_array() -> anyhow::Result<()> { let blobs = &[ b"test".to_vec(), random_array(10 * PAGE_SZ), b"hello".to_vec(), random_array(66 * PAGE_SZ), vec![0xf3; 24 * PAGE_SZ], b"foobar".to_vec(), ]; round_trip_test_compressed(blobs, false).await?; round_trip_test_compressed(blobs, true).await?; Ok(()) } #[tokio::test] async fn test_arrays_inc() -> anyhow::Result<()> { let blobs = (0..PAGE_SZ / 8) .map(|v| random_array(v * 16)) .collect::>(); round_trip_test_compressed(&blobs, false).await?; round_trip_test_compressed(&blobs, true).await?; Ok(()) } } ================================================ FILE: pageserver/src/tenant.rs ================================================ //! Timeline repository implementation that keeps old data in layer files, and //! the recent changes in ephemeral files. //! //! See tenant/*_layer.rs files. The functions here are responsible for locating //! the correct layer for the get/put call, walking back the timeline branching //! history as needed. //! //! The files are stored in the .neon/tenants//timelines/ //! directory. See docs/pageserver-storage.md for how the files are managed. //! In addition to the layer files, there is a metadata file in the same //! directory that contains information about the timeline, in particular its //! parent timeline, and the last LSN that has been written to disk. //! 
use std::collections::hash_map::Entry; use std::collections::{BTreeMap, HashMap, HashSet}; use std::fmt::{Debug, Display}; use std::fs::File; use std::future::Future; use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::{Arc, Mutex, Weak}; use std::time::{Duration, Instant, SystemTime}; use std::{fmt, fs}; use anyhow::{Context, bail}; use arc_swap::ArcSwap; use camino::{Utf8Path, Utf8PathBuf}; use chrono::NaiveDateTime; use enumset::EnumSet; use futures::StreamExt; use futures::stream::FuturesUnordered; use itertools::Itertools as _; use once_cell::sync::Lazy; pub use pageserver_api::models::TenantState; use pageserver_api::models::{self, RelSizeMigration}; use pageserver_api::models::{ CompactInfoResponse, TimelineArchivalState, TimelineState, TopTenantShardItem, WalRedoManagerStatus, }; use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId}; use postgres_ffi::PgMajorVersion; use remote_storage::{DownloadError, GenericRemoteStorage, TimeoutOrCancel}; use remote_timeline_client::index::GcCompactionState; use remote_timeline_client::manifest::{ LATEST_TENANT_MANIFEST_VERSION, OffloadedTimelineManifest, TenantManifest, }; use remote_timeline_client::{ FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD, UploadQueueNotReadyError, download_tenant_manifest, }; use secondary::heatmap::{HeatMapTenant, HeatMapTimeline}; use storage_broker::BrokerClientChannel; use timeline::compaction::{CompactionOutcome, GcCompactionQueue}; use timeline::import_pgdata::ImportingTimeline; use timeline::layer_manager::LayerManagerLockHolder; use timeline::offload::{OffloadError, offload_timeline}; use timeline::{ CompactFlags, CompactOptions, CompactionError, PreviousHeatmap, ShutdownMode, import_pgdata, }; use tokio::io::BufReader; use tokio::sync::{Notify, Semaphore, watch}; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; use upload_queue::NotInitialized; use utils::circuit_breaker::CircuitBreaker; use 
utils::crashsafe::path_with_suffix_extension; use utils::sync::gate::{Gate, GateGuard}; use utils::timeout::{TimeoutCancellableError, timeout_cancellable}; use utils::try_rcu::ArcSwapExt; use utils::zstd::{create_zst_tarball, extract_zst_tarball}; use utils::{backoff, completion, failpoint_support, fs_ext, pausable_failpoint}; use self::config::{AttachedLocationConfig, AttachmentMode, LocationConf}; use self::metadata::TimelineMetadata; use self::mgr::{GetActiveTenantError, GetTenantError}; use self::remote_timeline_client::upload::{upload_index_part, upload_tenant_manifest}; use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, UninitializedTimeline}; use self::timeline::{ EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError, }; use crate::basebackup_cache::BasebackupCache; use crate::config::PageServerConf; use crate::context; use crate::context::RequestContextBuilder; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::feature_resolver::{FeatureResolver, TenantFeatureResolver}; use crate::l0_flush::L0FlushGlobalState; use crate::metrics::{ BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS, INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_OFFLOADED_TIMELINES, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, TIMELINE_STATE_METRIC, remove_tenant_metrics, }; use crate::task_mgr::TaskKind; use crate::tenant::config::LocationMode; use crate::tenant::gc_result::GcResult; pub use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::{ INITDB_PATH, MaybeDeletedIndexPart, remote_initdb_archive_path, }; use crate::tenant::storage_layer::{DeltaLayer, ImageLayer}; use crate::tenant::timeline::delete::DeleteTimelineFlow; use 
crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; use crate::walingest::WalLagCooldown; use crate::walredo::{PostgresRedoManager, RedoAttemptType}; use crate::{InitializationOrder, TEMP_FILE_SUFFIX, import_datadir, span, task_mgr, walredo}; static INIT_DB_SEMAPHORE: Lazy = Lazy::new(|| Semaphore::new(8)); use utils::crashsafe; use utils::generation::Generation; use utils::id::TimelineId; use utils::lsn::{Lsn, RecordLsn}; pub mod blob_io; pub mod block_io; pub mod vectored_blob_io; pub mod disk_btree; pub(crate) mod ephemeral_file; pub mod layer_map; pub mod metadata; pub mod remote_timeline_client; pub mod storage_layer; pub mod checks; pub mod config; pub mod mgr; pub mod secondary; pub mod tasks; pub mod upload_queue; pub(crate) mod timeline; pub mod size; mod gc_block; mod gc_result; pub(crate) mod throttle; #[cfg(test)] pub mod debug; pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; // re-export for use in walreceiver pub use crate::tenant::timeline::WalReceiverInfo; /// The "tenants" part of `tenants//timelines...` pub const TENANTS_SEGMENT_NAME: &str = "tenants"; /// Parts of the `.neon/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; /// References to shared objects that are passed into each tenant, such /// as the shared remote storage client and process initialization state. #[derive(Clone)] pub struct TenantSharedResources { pub broker_client: storage_broker::BrokerClientChannel, pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, pub l0_flush_global_state: L0FlushGlobalState, pub basebackup_cache: Arc, pub feature_resolver: FeatureResolver, } /// A [`TenantShard`] is really an _attached_ tenant. The configuration /// for an attached tenant is a subset of the [`LocationConf`], represented /// in this struct. 
#[derive(Clone)] pub(super) struct AttachedTenantConf { tenant_conf: pageserver_api::models::TenantConfig, location: AttachedLocationConfig, /// The deadline before which we are blocked from GC so that /// leases have a chance to be renewed. lsn_lease_deadline: Option, } impl AttachedTenantConf { fn new( conf: &'static PageServerConf, tenant_conf: pageserver_api::models::TenantConfig, location: AttachedLocationConfig, ) -> Self { // Sets a deadline before which we cannot proceed to GC due to lsn lease. // // We do this as the leases mapping are not persisted to disk. By delaying GC by lease // length, we guarantee that all the leases we granted before will have a chance to renew // when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle. let lsn_lease_deadline = if location.attach_mode == AttachmentMode::Single { Some( tokio::time::Instant::now() + TenantShard::get_lsn_lease_length_impl(conf, &tenant_conf), ) } else { // We don't use `lsn_lease_deadline` to delay GC in AttachedMulti and AttachedStale // because we don't do GC in these modes. None }; Self { tenant_conf, location, lsn_lease_deadline, } } fn try_from( conf: &'static PageServerConf, location_conf: LocationConf, ) -> anyhow::Result { match &location_conf.mode { LocationMode::Attached(attach_conf) => { Ok(Self::new(conf, location_conf.tenant_conf, *attach_conf)) } LocationMode::Secondary(_) => { anyhow::bail!( "Attempted to construct AttachedTenantConf from a LocationConf in secondary mode" ) } } } fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool { self.lsn_lease_deadline .map(|d| tokio::time::Instant::now() < d) .unwrap_or(false) } } struct TimelinePreload { timeline_id: TimelineId, client: RemoteTimelineClient, index_part: Result, previous_heatmap: Option, } pub(crate) struct TenantPreload { /// The tenant manifest from remote storage, or None if no manifest was found. tenant_manifest: Option, /// Map from timeline ID to a possible timeline preload. 
/// When we spawn a tenant, there is a special mode for tenant creation that
/// avoids trying to read anything from remote storage.
///
/// Selects how eagerly a spawned tenant is activated.
// NOTE(review): the first paragraph above describes a creation mode that has no matching
// variant in this enum; it looks stale. Confirm and drop if so.
pub(crate) enum SpawnMode {
    /// Activate as soon as possible
    Eager,
    /// Lazy activation in the background, with the option to skip the queue if the need comes up
    // NOTE(review): skipping the queue is presumably driven by `TenantShard::activate_now_sem`;
    // verify against the spawn/attach code.
    Lazy,
}
/// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating` timelines_creating: std::sync::Mutex>, /// Possibly offloaded and archived timelines /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating` timelines_offloaded: Mutex>>, /// Tracks the timelines that are currently importing into this tenant shard. /// /// Note that importing timelines are also present in [`Self::timelines_creating`]. /// Keep this in mind when ordering lock acquisition. /// /// Lifetime: /// * An imported timeline is created while scanning the bucket on tenant attach /// if the index part contains an `import_pgdata` entry and said field marks the import /// as in progress. /// * Imported timelines are removed when the storage controller calls the post timeline /// import activation endpoint. timelines_importing: std::sync::Mutex>>, /// The last tenant manifest known to be in remote storage. None if the manifest has not yet /// been either downloaded or uploaded. Always Some after tenant attach. /// /// Initially populated during tenant attach, updated via `maybe_upload_tenant_manifest`. /// /// Do not modify this directly. It is used to check whether a new manifest needs to be /// uploaded. The manifest is constructed in `build_tenant_manifest`, and uploaded via /// `maybe_upload_tenant_manifest`. remote_tenant_manifest: tokio::sync::Mutex>, // This mutex prevents creation of new timelines during GC. // Adding yet another mutex (in addition to `timelines`) is needed because holding // `timelines` mutex during all GC iteration // may block for a long time `get_timeline`, `get_timelines_state`,... and other operations // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn // timeout... 
gc_cs: tokio::sync::Mutex<()>, walredo_mgr: Option>, /// Provides access to timeline data sitting in the remote storage. pub(crate) remote_storage: GenericRemoteStorage, /// Access to global deletion queue for when this tenant wants to schedule a deletion. deletion_queue_client: DeletionQueueClient, /// A channel to send async requests to prepare a basebackup for the basebackup cache. basebackup_cache: Arc, /// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`]. cached_logical_sizes: tokio::sync::Mutex>, cached_synthetic_tenant_size: Arc, eviction_task_tenant_state: tokio::sync::Mutex, /// Track repeated failures to compact, so that we can back off. /// Overhead of mutex is acceptable because compaction is done with a multi-second period. compaction_circuit_breaker: std::sync::Mutex, /// Signals the tenant compaction loop that there is L0 compaction work to be done. pub(crate) l0_compaction_trigger: Arc, /// Scheduled gc-compaction tasks. scheduled_compaction_tasks: std::sync::Mutex>>, /// If the tenant is in Activating state, notify this to encourage it /// to proceed to Active as soon as possible, rather than waiting for lazy /// background warmup. pub(crate) activate_now_sem: tokio::sync::Semaphore, /// Time it took for the tenant to activate. Zero if not active yet. attach_wal_lag_cooldown: Arc>, // Cancellation token fires when we have entered shutdown(). This is a parent of // Timelines' cancellation token. pub(crate) cancel: CancellationToken, // Users of the TenantShard such as the page service must take this Gate to avoid // trying to use a TenantShard which is shutting down. pub(crate) gate: Gate, /// Throttle applied at the top of [`Timeline::get`]. /// All [`TenantShard::timelines`] of a given [`TenantShard`] instance share the same [`throttle::Throttle`] instance. pub(crate) pagestream_throttle: Arc, pub(crate) pagestream_throttle_metrics: Arc, /// An ongoing timeline detach concurrency limiter. 
/// /// As a tenant will likely be restarted as part of timeline detach ancestor it makes no sense /// to have two running at the same time. A different one can be started if an earlier one /// has failed for whatever reason. ongoing_timeline_detach: std::sync::Mutex>, /// `index_part.json` based gc blocking reason tracking. /// /// New gc iterations must start a new iteration by acquiring `GcBlock::start` before /// proceeding. pub(crate) gc_block: gc_block::GcBlock, l0_flush_global_state: L0FlushGlobalState, pub(crate) feature_resolver: Arc, } impl std::fmt::Debug for TenantShard { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{} ({})", self.tenant_shard_id, self.current_state()) } } pub(crate) enum WalRedoManager { Prod(WalredoManagerId, PostgresRedoManager), #[cfg(test)] Test(harness::TestRedoManager), } #[derive(thiserror::Error, Debug)] #[error("pageserver is shutting down")] pub(crate) struct GlobalShutDown; impl WalRedoManager { pub(crate) fn new(mgr: PostgresRedoManager) -> Result, GlobalShutDown> { let id = WalredoManagerId::next(); let arc = Arc::new(Self::Prod(id, mgr)); let mut guard = WALREDO_MANAGERS.lock().unwrap(); match &mut *guard { Some(map) => { map.insert(id, Arc::downgrade(&arc)); Ok(arc) } None => Err(GlobalShutDown), } } } impl Drop for WalRedoManager { fn drop(&mut self) { match self { Self::Prod(id, _) => { let mut guard = WALREDO_MANAGERS.lock().unwrap(); if let Some(map) = &mut *guard { map.remove(id).expect("new() registers, drop() unregisters"); } } #[cfg(test)] Self::Test(_) => { // Not applicable to test redo manager } } } } /// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down /// the walredo processes outside of the regular order. 
/// /// This is necessary to work around a systemd bug where it freezes if there are /// walredo processes left => #[allow(clippy::type_complexity)] pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy< Mutex>>>, > = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new()))); #[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)] pub(crate) struct WalredoManagerId(u64); impl WalredoManagerId { pub fn next() -> Self { static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1); let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed); if id == 0 { panic!( "WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique" ); } Self(id) } } #[cfg(test)] impl From for WalRedoManager { fn from(mgr: harness::TestRedoManager) -> Self { Self::Test(mgr) } } impl WalRedoManager { pub(crate) async fn shutdown(&self) -> bool { match self { Self::Prod(_, mgr) => mgr.shutdown().await, #[cfg(test)] Self::Test(_) => { // Not applicable to test redo manager true } } } pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) { match self { Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout), #[cfg(test)] Self::Test(_) => { // Not applicable to test redo manager } } } /// # Cancel-Safety /// /// This method is cancellation-safe. 
pub async fn request_redo( &self, key: pageserver_api::key::Key, lsn: Lsn, base_img: Option<(Lsn, bytes::Bytes)>, records: Vec<(Lsn, wal_decoder::models::record::NeonWalRecord)>, pg_version: PgMajorVersion, redo_attempt_type: RedoAttemptType, ) -> Result { match self { Self::Prod(_, mgr) => { mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type) .await } #[cfg(test)] Self::Test(mgr) => { mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type) .await } } } pub(crate) fn status(&self) -> Option { match self { WalRedoManager::Prod(_, m) => Some(m.status()), #[cfg(test)] WalRedoManager::Test(_) => None, } } } /// A very lightweight memory representation of an offloaded timeline. /// /// We need to store the list of offloaded timelines so that we can perform operations on them, /// like unoffloading them, or (at a later date), decide to perform flattening. /// This type has a much smaller memory impact than [`Timeline`], and thus we can store many /// more offloaded timelines than we can manage ones that aren't. pub struct OffloadedTimeline { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, pub ancestor_timeline_id: Option, /// Whether to retain the branch lsn at the ancestor or not pub ancestor_retain_lsn: Option, /// When the timeline was archived. /// /// Present for future flattening deliberations. pub archived_at: NaiveDateTime, /// Prevent two tasks from deleting the timeline at the same time. If held, the /// timeline is being deleted. If 'true', the timeline has already been deleted. pub delete_progress: TimelineDeleteProgress, /// Part of the `OffloadedTimeline` object's lifecycle: this needs to be set before we drop it pub deleted_from_ancestor: AtomicBool, _metrics_guard: OffloadedTimelineMetricsGuard, } /// Increases the offloaded timeline count metric when created, and decreases when dropped. 
struct OffloadedTimelineMetricsGuard; impl OffloadedTimelineMetricsGuard { fn new() -> Self { TIMELINE_STATE_METRIC .with_label_values(&["offloaded"]) .inc(); Self } } impl Drop for OffloadedTimelineMetricsGuard { fn drop(&mut self) { TIMELINE_STATE_METRIC .with_label_values(&["offloaded"]) .dec(); } } impl OffloadedTimeline { /// Obtains an offloaded timeline from a given timeline object. /// /// Returns `None` if the `archived_at` flag couldn't be obtained, i.e. /// the timeline is not in a stopped state. /// Panics if the timeline is not archived. fn from_timeline(timeline: &Timeline) -> Result { let (ancestor_retain_lsn, ancestor_timeline_id) = if let Some(ancestor_timeline) = timeline.ancestor_timeline() { let ancestor_lsn = timeline.get_ancestor_lsn(); let ancestor_timeline_id = ancestor_timeline.timeline_id; let mut gc_info = ancestor_timeline.gc_info.write().unwrap(); gc_info.insert_child(timeline.timeline_id, ancestor_lsn, MaybeOffloaded::Yes); (Some(ancestor_lsn), Some(ancestor_timeline_id)) } else { (None, None) }; let archived_at = timeline .remote_client .archived_at_stopped_queue()? .expect("must be called on an archived timeline"); Ok(Self { tenant_shard_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, ancestor_timeline_id, ancestor_retain_lsn, archived_at, delete_progress: timeline.delete_progress.clone(), deleted_from_ancestor: AtomicBool::new(false), _metrics_guard: OffloadedTimelineMetricsGuard::new(), }) } fn from_manifest(tenant_shard_id: TenantShardId, manifest: &OffloadedTimelineManifest) -> Self { // We expect to reach this case in tenant loading, where the `retain_lsn` is populated in the parent's `gc_info` // by the `initialize_gc_info` function. 
let OffloadedTimelineManifest { timeline_id, ancestor_timeline_id, ancestor_retain_lsn, archived_at, } = *manifest; Self { tenant_shard_id, timeline_id, ancestor_timeline_id, ancestor_retain_lsn, archived_at, delete_progress: TimelineDeleteProgress::default(), deleted_from_ancestor: AtomicBool::new(false), _metrics_guard: OffloadedTimelineMetricsGuard::new(), } } fn manifest(&self) -> OffloadedTimelineManifest { let Self { timeline_id, ancestor_timeline_id, ancestor_retain_lsn, archived_at, .. } = self; OffloadedTimelineManifest { timeline_id: *timeline_id, ancestor_timeline_id: *ancestor_timeline_id, ancestor_retain_lsn: *ancestor_retain_lsn, archived_at: *archived_at, } } /// Delete this timeline's retain_lsn from its ancestor, if present in the given tenant fn delete_from_ancestor_with_timelines( &self, timelines: &std::sync::MutexGuard<'_, HashMap>>, ) { if let (Some(_retain_lsn), Some(ancestor_timeline_id)) = (self.ancestor_retain_lsn, self.ancestor_timeline_id) { if let Some((_, ancestor_timeline)) = timelines .iter() .find(|(tid, _tl)| **tid == ancestor_timeline_id) { let removal_happened = ancestor_timeline .gc_info .write() .unwrap() .remove_child_offloaded(self.timeline_id); if !removal_happened { tracing::error!(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id, "Couldn't remove retain_lsn entry from offloaded timeline's parent: already removed"); } } } self.deleted_from_ancestor.store(true, Ordering::Release); } /// Call [`Self::delete_from_ancestor_with_timelines`] instead if possible. /// /// As the entire tenant is being dropped, don't bother deregistering the `retain_lsn` from the ancestor. 
fn defuse_for_tenant_drop(&self) { self.deleted_from_ancestor.store(true, Ordering::Release); } } impl fmt::Debug for OffloadedTimeline { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "OffloadedTimeline<{}>", self.timeline_id) } } impl Drop for OffloadedTimeline { fn drop(&mut self) { if !self.deleted_from_ancestor.load(Ordering::Acquire) { tracing::warn!( "offloaded timeline {} was dropped without having cleaned it up at the ancestor", self.timeline_id ); } } } #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] pub enum MaybeOffloaded { Yes, No, } #[derive(Clone, Debug)] pub enum TimelineOrOffloaded { Timeline(Arc), Offloaded(Arc), Importing(Arc), } impl TimelineOrOffloaded { pub fn arc_ref(&self) -> TimelineOrOffloadedArcRef<'_> { match self { TimelineOrOffloaded::Timeline(timeline) => { TimelineOrOffloadedArcRef::Timeline(timeline) } TimelineOrOffloaded::Offloaded(offloaded) => { TimelineOrOffloadedArcRef::Offloaded(offloaded) } TimelineOrOffloaded::Importing(importing) => { TimelineOrOffloadedArcRef::Importing(importing) } } } pub fn tenant_shard_id(&self) -> TenantShardId { self.arc_ref().tenant_shard_id() } pub fn timeline_id(&self) -> TimelineId { self.arc_ref().timeline_id() } pub fn delete_progress(&self) -> &Arc> { match self { TimelineOrOffloaded::Timeline(timeline) => &timeline.delete_progress, TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress, TimelineOrOffloaded::Importing(importing) => &importing.delete_progress, } } fn maybe_remote_client(&self) -> Option> { match self { TimelineOrOffloaded::Timeline(timeline) => Some(timeline.remote_client.clone()), TimelineOrOffloaded::Offloaded(_offloaded) => None, TimelineOrOffloaded::Importing(importing) => { Some(importing.timeline.remote_client.clone()) } } } } pub enum TimelineOrOffloadedArcRef<'a> { Timeline(&'a Arc), Offloaded(&'a Arc), Importing(&'a Arc), } impl TimelineOrOffloadedArcRef<'_> { pub fn tenant_shard_id(&self) -> TenantShardId { match self { 
TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.tenant_shard_id, TimelineOrOffloadedArcRef::Offloaded(offloaded) => offloaded.tenant_shard_id, TimelineOrOffloadedArcRef::Importing(importing) => importing.timeline.tenant_shard_id, } } pub fn timeline_id(&self) -> TimelineId { match self { TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.timeline_id, TimelineOrOffloadedArcRef::Offloaded(offloaded) => offloaded.timeline_id, TimelineOrOffloadedArcRef::Importing(importing) => importing.timeline.timeline_id, } } } impl<'a> From<&'a Arc> for TimelineOrOffloadedArcRef<'a> { fn from(timeline: &'a Arc) -> Self { Self::Timeline(timeline) } } impl<'a> From<&'a Arc> for TimelineOrOffloadedArcRef<'a> { fn from(timeline: &'a Arc) -> Self { Self::Offloaded(timeline) } } impl<'a> From<&'a Arc> for TimelineOrOffloadedArcRef<'a> { fn from(timeline: &'a Arc) -> Self { Self::Importing(timeline) } } #[derive(Debug, thiserror::Error, PartialEq, Eq)] pub enum GetTimelineError { #[error("Timeline is shutting down")] ShuttingDown, #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")] NotActive { tenant_id: TenantShardId, timeline_id: TimelineId, state: TimelineState, }, #[error("Timeline {tenant_id}/{timeline_id} was not found")] NotFound { tenant_id: TenantShardId, timeline_id: TimelineId, }, } #[derive(Debug, thiserror::Error)] pub enum LoadLocalTimelineError { #[error("FailedToLoad")] Load(#[source] anyhow::Error), #[error("FailedToResumeDeletion")] ResumeDeletion(#[source] anyhow::Error), } #[derive(thiserror::Error)] pub enum DeleteTimelineError { #[error("NotFound")] NotFound, #[error("HasChildren")] HasChildren(Vec), #[error("Timeline deletion is already in progress")] AlreadyInProgress(Arc>), #[error("Cancelled")] Cancelled, #[error(transparent)] Other(#[from] anyhow::Error), } impl Debug for DeleteTimelineError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::NotFound => write!(f, "NotFound"), 
Self::HasChildren(c) => f.debug_tuple("HasChildren").field(c).finish(), Self::AlreadyInProgress(_) => f.debug_tuple("AlreadyInProgress").finish(), Self::Cancelled => f.debug_tuple("Cancelled").finish(), Self::Other(e) => f.debug_tuple("Other").field(e).finish(), } } } #[derive(thiserror::Error)] pub enum TimelineArchivalError { #[error("NotFound")] NotFound, #[error("Timeout")] Timeout, #[error("Cancelled")] Cancelled, #[error("ancestor is archived: {}", .0)] HasArchivedParent(TimelineId), #[error("HasUnarchivedChildren")] HasUnarchivedChildren(Vec), #[error("Timeline archival is already in progress")] AlreadyInProgress, #[error(transparent)] Other(anyhow::Error), } #[derive(thiserror::Error, Debug)] pub(crate) enum TenantManifestError { #[error("Remote storage error: {0}")] RemoteStorage(anyhow::Error), #[error("Cancelled")] Cancelled, } impl From for TimelineArchivalError { fn from(e: TenantManifestError) -> Self { match e { TenantManifestError::RemoteStorage(e) => Self::Other(e), TenantManifestError::Cancelled => Self::Cancelled, } } } impl Debug for TimelineArchivalError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::NotFound => write!(f, "NotFound"), Self::Timeout => write!(f, "Timeout"), Self::Cancelled => write!(f, "Cancelled"), Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(), Self::HasUnarchivedChildren(c) => { f.debug_tuple("HasUnarchivedChildren").field(c).finish() } Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(), Self::Other(e) => f.debug_tuple("Other").field(e).finish(), } } } pub enum SetStoppingError { AlreadyStopping(completion::Barrier), Broken, } impl Debug for SetStoppingError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::AlreadyStopping(_) => f.debug_tuple("AlreadyStopping").finish(), Self::Broken => write!(f, "Broken"), } } } #[derive(thiserror::Error, Debug)] pub(crate) enum 
FinalizeTimelineImportError { #[error("Import task not done yet")] ImportTaskStillRunning, #[error("Shutting down")] ShuttingDown, } /// Arguments to [`TenantShard::create_timeline`]. /// /// Not usable as an idempotency key for timeline creation because if [`CreateTimelineParamsBranch::ancestor_start_lsn`] /// is `None`, the result of the timeline create call is not deterministic. /// /// See [`CreateTimelineIdempotency`] for an idempotency key. #[derive(Debug)] pub(crate) enum CreateTimelineParams { Bootstrap(CreateTimelineParamsBootstrap), Branch(CreateTimelineParamsBranch), ImportPgdata(CreateTimelineParamsImportPgdata), } #[derive(Debug)] pub(crate) struct CreateTimelineParamsBootstrap { pub(crate) new_timeline_id: TimelineId, pub(crate) existing_initdb_timeline_id: Option, pub(crate) pg_version: PgMajorVersion, } /// NB: See comment on [`CreateTimelineIdempotency::Branch`] for why there's no `pg_version` here. #[derive(Debug)] pub(crate) struct CreateTimelineParamsBranch { pub(crate) new_timeline_id: TimelineId, pub(crate) ancestor_timeline_id: TimelineId, pub(crate) ancestor_start_lsn: Option, } #[derive(Debug)] pub(crate) struct CreateTimelineParamsImportPgdata { pub(crate) new_timeline_id: TimelineId, pub(crate) location: import_pgdata::index_part_format::Location, pub(crate) idempotency_key: import_pgdata::index_part_format::IdempotencyKey, } /// What is used to determine idempotency of a [`TenantShard::create_timeline`] call in [`TenantShard::start_creating_timeline`] in [`TenantShard::start_creating_timeline`]. /// /// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`]. /// /// We lower timeline creation requests to [`Self`], and then use [`PartialEq::eq`] to compare [`Timeline::create_idempotency`] with the request. /// If they are equal, we return a reference to the existing timeline, otherwise it's an idempotency conflict. 
///
/// There is special treatment for [`Self::FailWithConflict`] to always return an idempotency conflict.
/// It would be nice to have more advanced derive macros to make that special treatment declarative.
///
/// Notes:
/// - Unlike [`CreateTimelineParams`], ancestor LSN is fixed, so, branching will be at a deterministic LSN.
/// - We make some trade-offs though, e.g., [`CreateTimelineParamsBootstrap::existing_initdb_timeline_id`]
///   is not considered for idempotency. We can improve on this over time if we deem it necessary.
///
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum CreateTimelineIdempotency {
    /// NB: special treatment, see comment in [`Self`].
    FailWithConflict,
    /// A bootstrapped timeline is identified by its Postgres major version.
    Bootstrap {
        pg_version: PgMajorVersion,
    },
    /// NB: branches always have the same `pg_version` as their ancestor.
    /// While [`pageserver_api::models::TimelineCreateRequestMode::Branch::pg_version`]
    /// exists as a field, and is set by cplane, it has always been ignored by pageserver when
    /// determining the child branch pg_version.
    Branch {
        ancestor_timeline_id: TimelineId,
        ancestor_start_lsn: Lsn,
    },
    /// A pgdata import is identified by its idempotency key.
    ImportPgdata(CreatingTimelineIdempotencyImportPgdata),
}

#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct CreatingTimelineIdempotencyImportPgdata {
    idempotency_key: import_pgdata::index_part_format::IdempotencyKey,
}

/// What is returned by [`TenantShard::start_creating_timeline`].
#[must_use]
enum StartCreatingTimelineResult {
    CreateGuard(TimelineCreateGuard),
    Idempotent(Arc),
}

/// Outcome of timeline init-and-sync: either the timeline is ready to be
/// activated, or a pgdata import still has to be spawned for it.
#[allow(clippy::large_enum_variant, reason = "TODO")]
enum TimelineInitAndSyncResult {
    ReadyToActivate,
    NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata),
}

/// Payload of [`TimelineInitAndSyncResult::NeedsSpawnImportPgdata`].
#[must_use]
struct TimelineInitAndSyncNeedsSpawnImportPgdata {
    timeline: Arc,
    import_pgdata: import_pgdata::index_part_format::Root,
    guard: TimelineCreateGuard,
}

/// What is returned by [`TenantShard::create_timeline`].
enum CreateTimelineResult {
    Created(Arc),
    Idempotent(Arc),
    /// IMPORTANT: This [`Arc`] object is not in [`TenantShard::timelines`] when
    /// we return this result, nor will this concrete object ever be added there.
    /// Cf method comment on [`TenantShard::create_timeline_import_pgdata`].
    ImportSpawned(Arc),
}

impl CreateTimelineResult {
    /// Name of the variant as a static string.
    fn discriminant(&self) -> &'static str {
        match self {
            Self::Created(_) => "Created",
            Self::Idempotent(_) => "Idempotent",
            Self::ImportSpawned(_) => "ImportSpawned",
        }
    }

    /// Borrows the timeline carried by any variant.
    fn timeline(&self) -> &Arc {
        match self {
            Self::Created(t) | Self::Idempotent(t) | Self::ImportSpawned(t) => t,
        }
    }

    /// Unit test timelines aren't activated, test has to do it if it needs to.
    #[cfg(test)]
    fn into_timeline_for_test(self) -> Arc {
        match self {
            Self::Created(t) | Self::Idempotent(t) | Self::ImportSpawned(t) => t,
        }
    }
}

#[derive(thiserror::Error, Debug)]
pub enum CreateTimelineError {
    #[error("creation of timeline with the given ID is in progress")]
    AlreadyCreating,
    #[error("timeline already exists with different parameters")]
    Conflict,
    #[error(transparent)]
    AncestorLsn(anyhow::Error),
    #[error("ancestor timeline is not active")]
    AncestorNotActive,
    #[error("ancestor timeline is archived")]
    AncestorArchived,
    #[error("tenant shutting down")]
    ShuttingDown,
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

#[derive(thiserror::Error, Debug)]
pub enum InitdbError {
    #[error("Operation was cancelled")]
    Cancelled,
    #[error(transparent)]
    Other(anyhow::Error),
    #[error(transparent)]
    Inner(postgres_initdb::Error),
}

enum CreateTimelineCause {
    Load,
    Delete,
}

#[allow(clippy::large_enum_variant, reason = "TODO")]
enum LoadTimelineCause {
    Attach,
    Unoffload,
}

#[derive(thiserror::Error, Debug)]
pub(crate) enum GcError {
    // The tenant is shutting down
    #[error("tenant shutting down")]
    TenantCancelled,

    // The timeline is shutting down
    #[error("timeline shutting down")]
    TimelineCancelled,

    // The tenant is in a state ineligible to run GC
    #[error("not active")]
    NotActive,

    // A requested GC cutoff LSN was invalid, for example it tried to move backwards.
    // The message used to be a copy of `NotActive`'s and dropped `why`.
    #[error("bad lsn: {why}")]
    BadLsn { why: String },

    // A remote storage error while scheduling updates after compaction
    #[error(transparent)]
    Remote(anyhow::Error),

    // An error reading while calculating GC cutoffs
    #[error(transparent)]
    GcCutoffs(PageReconstructError),

    // If GC was invoked for a particular timeline, this error means it didn't exist
    #[error("timeline not found")]
    TimelineNotFound,
}

impl From for GcError {
    /// Cancellation maps to `TimelineCancelled`; anything else is a GC cutoff error.
    fn from(value: PageReconstructError) -> Self {
        match value {
            PageReconstructError::Cancelled => Self::TimelineCancelled,
            other => Self::GcCutoffs(other),
        }
    }
}

impl From for GcError {
    /// An uninitialized queue is reported as a remote error; a stopped or
    /// shutting-down one as timeline cancellation.
    fn from(value: NotInitialized) -> Self {
        match value {
            NotInitialized::Uninitialized => GcError::Remote(value.into()),
            NotInitialized::Stopped | NotInitialized::ShuttingDown => GcError::TimelineCancelled,
        }
    }
}

impl From for GcError {
    fn from(_: timeline::layer_manager::Shutdown) -> Self {
        GcError::TimelineCancelled
    }
}

#[derive(thiserror::Error, Debug)]
pub(crate) enum LoadConfigError {
    #[error("TOML deserialization error: '{0}'")]
    DeserializeToml(#[from] toml_edit::de::Error),

    #[error("Config not found at {0}")]
    NotFound(Utf8PathBuf),
}

impl TenantShard {
    /// Yet another helper for timeline initialization.
    ///
    /// - Initializes the Timeline struct and inserts it into the tenant's hash map
    /// - Scans the local timeline directory for layer files and builds the layer map
    /// - Downloads remote index file and adds remote files to the layer map
    /// - Schedules remote upload tasks for any files that are present locally but missing from remote storage.
    ///
    /// If the operation fails, the timeline is left in the tenant's hash map in Broken state. On success,
    /// it is marked as Active.
#[allow(clippy::too_many_arguments)] async fn timeline_init_and_sync( self: &Arc, timeline_id: TimelineId, resources: TimelineResources, index_part: IndexPart, metadata: TimelineMetadata, previous_heatmap: Option, ancestor: Option>, cause: LoadTimelineCause, ctx: &RequestContext, ) -> anyhow::Result { let tenant_id = self.tenant_shard_id; let import_pgdata = index_part.import_pgdata.clone(); let idempotency = match &import_pgdata { Some(import_pgdata) => { CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata { idempotency_key: import_pgdata.idempotency_key().clone(), }) } None => { if metadata.ancestor_timeline().is_none() { CreateTimelineIdempotency::Bootstrap { pg_version: metadata.pg_version(), } } else { CreateTimelineIdempotency::Branch { ancestor_timeline_id: metadata.ancestor_timeline().unwrap(), ancestor_start_lsn: metadata.ancestor_lsn(), } } } }; let (timeline, _timeline_ctx) = self.create_timeline_struct( timeline_id, &metadata, previous_heatmap, ancestor.clone(), resources, CreateTimelineCause::Load, idempotency.clone(), index_part.gc_compaction.clone(), index_part.rel_size_migration.clone(), index_part.rel_size_migrated_at, ctx, )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); if !disk_consistent_lsn.is_valid() { // As opposed to normal timelines which get initialised with a disk consitent LSN // via initdb, imported timelines start from 0. If the import task stops before // it advances disk consitent LSN, allow it to resume. 
let in_progress_import = import_pgdata .as_ref() .map(|import| !import.is_done()) .unwrap_or(false); if !in_progress_import { anyhow::bail!("Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"); } } assert_eq!( disk_consistent_lsn, metadata.disk_consistent_lsn(), "these are used interchangeably" ); timeline.remote_client.init_upload_queue(&index_part)?; timeline .load_layer_map(disk_consistent_lsn, index_part) .await .with_context(|| { format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}") })?; // When unarchiving, we've mostly likely lost the heatmap generated prior // to the archival operation. To allow warming this timeline up, generate // a previous heatmap which contains all visible layers in the layer map. // This previous heatmap will be used whenever a fresh heatmap is generated // for the timeline. if self.conf.generate_unarchival_heatmap && matches!(cause, LoadTimelineCause::Unoffload) { let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn())); while let Some((tline, end_lsn)) = tline_ending_at { let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await; // Another unearchived timeline might have generated a heatmap for this ancestor. // If the current branch point greater than the previous one use the the heatmap // we just generated - it should include more layers. if !tline.should_keep_previous_heatmap(end_lsn) { tline .previous_heatmap .store(Some(Arc::new(unarchival_heatmap))); } else { tracing::info!("Previous heatmap preferred. Dropping unarchival heatmap.") } match tline.ancestor_timeline() { Some(ancestor) => { if ancestor.update_layer_visibility().await.is_err() { // Ancestor timeline is shutting down. 
break; } tline_ending_at = Some((ancestor, tline.get_ancestor_lsn())); } None => { tline_ending_at = None; } } } } match import_pgdata { Some(import_pgdata) if !import_pgdata.is_done() => { let mut guard = self.timelines_creating.lock().unwrap(); if !guard.insert(timeline_id) { // We should never try and load the same timeline twice during startup unreachable!("Timeline {tenant_id}/{timeline_id} is already being created") } let timeline_create_guard = TimelineCreateGuard { _tenant_gate_guard: self.gate.enter()?, owning_tenant: self.clone(), timeline_id, idempotency, // The users of this specific return value don't need the timline_path in there. timeline_path: timeline .conf .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id), }; Ok(TimelineInitAndSyncResult::NeedsSpawnImportPgdata( TimelineInitAndSyncNeedsSpawnImportPgdata { timeline, import_pgdata, guard: timeline_create_guard, }, )) } Some(_) | None => { { let mut timelines_accessor = self.timelines.lock().unwrap(); match timelines_accessor.entry(timeline_id) { // We should never try and load the same timeline twice during startup Entry::Occupied(_) => { unreachable!( "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" ); } Entry::Vacant(v) => { v.insert(Arc::clone(&timeline)); timeline.maybe_spawn_flush_loop(); } } } if disk_consistent_lsn.is_valid() { // Sanity check: a timeline should have some content. // Exception: importing timelines might not yet have any anyhow::ensure!( ancestor.is_some() || timeline .layers .read(LayerManagerLockHolder::LoadLayerMap) .await .layer_map() .expect( "currently loading, layer manager cannot be shutdown already" ) .iter_historic_layers() .next() .is_some(), "Timeline has no ancestor and no layer files" ); } Ok(TimelineInitAndSyncResult::ReadyToActivate) } } } /// Attach a tenant that's available in cloud storage. 
///
/// This returns quickly, after just creating the in-memory object
/// Tenant struct and launching a background task to download
/// the remote index files.  On return, the tenant is most likely still in
/// Attaching state, and it will become Active once the background task
/// finishes. You can use wait_until_active() to wait for the task to
/// complete.
///
/// The background task is spawned with `TaskKind::Attach` and carries a gate guard
/// of the new tenant. In `SpawnMode::Lazy` it first waits for an on-demand
/// activation request, a warmup permit, or cancellation. It then runs `preload`
/// followed by `attach`, and calls `activate` on success. Failures inside the task
/// are not returned to the caller: they are recorded in the tenant state through
/// `make_broken_or_stopping`.
///
/// # Errors
///
/// Returns the error produced while constructing the WAL redo manager.
///
#[allow(clippy::too_many_arguments)]
pub(crate) fn spawn(
    conf: &'static PageServerConf,
    tenant_shard_id: TenantShardId,
    resources: TenantSharedResources,
    attached_conf: AttachedTenantConf,
    shard_identity: ShardIdentity,
    init_order: Option,
    mode: SpawnMode,
    ctx: &RequestContext,
) -> Result, GlobalShutDown> {
    let wal_redo_manager =
        WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?;

    // Destructure up front: some parts go into the TenantShard, others
    // (`remote_storage`, `broker_client`) are also needed by the attach task below.
    let TenantSharedResources {
        broker_client,
        remote_storage,
        deletion_queue_client,
        l0_flush_global_state,
        basebackup_cache,
        feature_resolver,
    } = resources;

    // Copied out before `attached_conf` is moved into the tenant; used for logging
    // and for the tracing span of the attach task.
    let attach_mode = attached_conf.location.attach_mode;
    let generation = attached_conf.location.generation;

    let tenant = Arc::new(TenantShard::new(
        TenantState::Attaching,
        conf,
        attached_conf,
        shard_identity,
        Some(wal_redo_manager),
        tenant_shard_id,
        remote_storage.clone(),
        deletion_queue_client,
        l0_flush_global_state,
        basebackup_cache,
        feature_resolver,
    ));

    // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
    // we shut down while attaching.
    let attach_gate_guard = tenant
        .gate
        .enter()
        .expect("We just created the TenantShard: nothing else can have shut it down yet");

    // Do all the hard work in the background
    let tenant_clone = Arc::clone(&tenant);
    let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn);
    task_mgr::spawn(
        &tokio::runtime::Handle::current(),
        TaskKind::Attach,
        tenant_shard_id,
        None,
        "attach tenant",
        async move {

            info!(
                ?attach_mode,
                "Attaching tenant"
            );

            // Bind the guard inside the task so that it is dropped when the task ends.
            let _gate_guard = attach_gate_guard;

            // Is this tenant being spawned as part of process startup?
            let starting_up = init_order.is_some();
            // Runs on every exit path of this async block, including the early returns below.
            scopeguard::defer! {
                if starting_up {
                    TENANT.startup_complete.inc();
                }
            }

            // Moves the tenant out of the Attaching state after a failure: to Stopping if
            // the tenant was cancelled, to Broken otherwise. Panics on any state other than
            // Attaching or Stopping.
            fn make_broken_or_stopping(t: &TenantShard, err: anyhow::Error) {
                t.state.send_modify(|state| match state {
                    // TODO: the old code alluded to DeleteTenantFlow sometimes setting
                    // TenantState::Stopping before we get here, but this may be outdated.
                    // Let's find out with a testing assertion. If this doesn't fire, and the
                    // logs don't show this happening in production, remove the Stopping cases.
                    TenantState::Stopping{..} if cfg!(any(test, feature = "testing")) => {
                        panic!("unexpected TenantState::Stopping during attach")
                    }
                    // If the tenant is cancelled, assume the error was caused by cancellation.
                    TenantState::Attaching if t.cancel.is_cancelled() => {
                        info!("attach cancelled, setting tenant state to Stopping: {err}");
                        // NB: progress None tells `set_stopping` that attach has cancelled.
                        *state = TenantState::Stopping { progress: None };
                    }
                    // According to the old code, DeleteTenantFlow may already have set this to
                    // Stopping. Retain its progress.
                    // TODO: there is no DeleteTenantFlow. Is this still needed? See above.
                    TenantState::Stopping { progress } if t.cancel.is_cancelled() => {
                        assert!(progress.is_some(), "concurrent attach cancellation");
                        info!("attach cancelled, already Stopping: {err}");
                    }
                    // Mark the tenant as broken.
                    TenantState::Attaching | TenantState::Stopping { .. } => {
                        error!("attach failed, setting tenant state to Broken (was {state}): {err:?}");
                        *state = TenantState::broken_from_reason(err.to_string())
                    }
                    // The attach task owns the tenant state until activated.
                    state => panic!("invalid tenant state {state} during attach: {err:?}"),
                });
            }

            // TODO: should also be rejecting tenant conf changes that violate this check.
            if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) {
                make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e));
                return Ok(());
            }

            // Rebind as mutable so the completions can be taken out of it.
            let mut init_order = init_order;
            // take the completion because initial tenant loading will complete when all of
            // these tasks complete.
            let _completion = init_order
                .as_mut()
                .and_then(|x| x.initial_tenant_load.take());
            let remote_load_completion = init_order
                .as_mut()
                .and_then(|x| x.initial_tenant_load_remote.take());

            enum AttachType<'a> {
                /// We are attaching this tenant lazily in the background.
                Warmup {
                    _permit: tokio::sync::SemaphorePermit<'a>,
                    during_startup: bool
                },
                /// We are attaching this tenant as soon as we can, because for example an
                /// endpoint tried to access it.
                OnDemand,
                /// During normal operations after startup, we are attaching a tenant, and
                /// eager attach was requested.
                Normal,
            }

            let attach_type = if matches!(mode, SpawnMode::Lazy) {
                // Before doing any I/O, wait for at least one of:
                // - A client attempting to access to this tenant (on-demand loading)
                // - A permit becoming available in the warmup semaphore (background warmup)

                tokio::select!(
                    permit = tenant_clone.activate_now_sem.acquire() => {
                        // The on-demand permit is not kept: `let _ =` drops it immediately.
                        let _ = permit.expect("activate_now_sem is never closed");
                        tracing::info!("Activating tenant (on-demand)");
                        AttachType::OnDemand
                    },
                    permit = conf.concurrent_tenant_warmup.inner().acquire() => {
                        // The warmup permit is stored in the returned AttachType value.
                        let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed");
                        tracing::info!("Activating tenant (warmup)");
                        AttachType::Warmup {
                            _permit,
                            during_startup: init_order.is_some()
                        }
                    }
                    _ = tenant_clone.cancel.cancelled() => {
                        // This is safe, but should be pretty rare: it is interesting if a tenant
                        // stayed in Activating for such a long time that shutdown found it in
                        // that state.
                        tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
                        // Set the tenant to Stopping to signal `set_stopping` that we're done.
                        make_broken_or_stopping(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
                        return Ok(());
                    },
                )
            } else {
                // SpawnMode::{Create,Eager} always cause jumping ahead of the
                // concurrent_tenant_warmup queue
                AttachType::Normal
            };

            // Both spawn modes handled here perform a remote preload; the result is
            // wrapped in `Some` because `attach` takes an optional preload.
            let preload = match &mode {
                SpawnMode::Eager | SpawnMode::Lazy => {
                    let _preload_timer = TENANT.preload.start_timer();
                    let res = tenant_clone
                        .preload(&remote_storage, task_mgr::shutdown_token())
                        .await;
                    match res {
                        Ok(p) => Some(p),
                        Err(e) => {
                            make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e));
                            return Ok(());
                        }
                    }
                }

            };

            // Remote preload is complete.
            drop(remote_load_completion);

            // We will time the duration of the attach phase unless this is a creation (attach will do no work)
            // NOTE(review): the timer below is started unconditionally, so the
            // "unless this is a creation" wording looks stale -- confirm.
            let attach_start = std::time::Instant::now();
            let attached = {
                let _attach_timer = Some(TENANT.attach.start_timer());
                tenant_clone.attach(preload, &ctx).await
            };
            let attach_duration = attach_start.elapsed();
            // The result of `set` is deliberately discarded.
            _ = tenant_clone.attach_wal_lag_cooldown.set(WalLagCooldown::new(attach_start, attach_duration));

            match attached {
                Ok(()) => {
                    info!("attach finished, activating");
                    tenant_clone.activate(broker_client, None, &ctx);
                }
                Err(e) => make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e)),
            }

            // If we are doing an opportunistic warmup attachment at startup, initialize
            // logical size at the same time.  This is better than starting a bunch of idle tenants
            // with cold caches and then coming back later to initialize their logical sizes.
            //
            // It also prevents the warmup process competing with the concurrency limit on
            // logical size calculations: if logical size calculation semaphore is saturated,
            // then warmup will wait for that before proceeding to the next tenant.
            if matches!(attach_type, AttachType::Warmup { during_startup: true, .. }) {
                let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect();
                tracing::info!("Waiting for initial logical sizes while warming up...");
                // Drain the set: wait for every timeline's future to finish.
                while futs.next().await.is_some() {}
                tracing::info!("Warm-up complete");
            }

            Ok(())
        }
        .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
    );
    Ok(tenant)
}

/// Collects everything `attach` needs from remote storage.
///
/// Lists the tenant's remote timelines, downloads the tenant manifest (a missing
/// manifest is not an error), and downloads the index part of every listed timeline
/// that the manifest does not mark as offloaded. In the returned `TenantPreload`,
/// timelines whose index part download was attempted map to `Some(..)`, and
/// offloaded timelines that still have a remote prefix map to `None`.
///
/// # Errors
///
/// Fails if listing the timelines fails, if the manifest download fails with
/// anything other than `DownloadError::NotFound`, or if `load_timelines_metadata`
/// fails.
#[instrument(skip_all)]
pub(crate) async fn preload(
    self: &Arc,
    remote_storage: &GenericRemoteStorage,
    cancel: CancellationToken,
) -> anyhow::Result {
    span::debug_assert_current_span_has_tenant_id();
    // Get list of remote timelines
    // download index files for every tenant timeline
    info!("listing remote timelines");
    let (mut remote_timeline_ids, other_keys) = remote_timeline_client::list_remote_timelines(
        remote_storage,
        self.tenant_shard_id,
        cancel.clone(),
    )
    .await?;

    let tenant_manifest = match download_tenant_manifest(
        remote_storage,
        &self.tenant_shard_id,
        self.generation,
        &cancel,
    )
    .await
    {
        Ok((tenant_manifest, _, _)) => Some(tenant_manifest),
        Err(DownloadError::NotFound) => None,
        Err(err) => return Err(err.into()),
    };

    info!(
        "found {} timelines ({} offloaded timelines)",
        remote_timeline_ids.len(),
        tenant_manifest
            .as_ref()
            .map(|m| m.offloaded_timelines.len())
            .unwrap_or(0)
    );

    for k in other_keys {
        warn!("Unexpected non timeline key {k}");
    }

    // Avoid downloading IndexPart of offloaded timelines.
    let mut offloaded_with_prefix = HashSet::new();
    if let Some(tenant_manifest) = &tenant_manifest {
        for offloaded in tenant_manifest.offloaded_timelines.iter() {
            // `remove` reports whether the ID was in the listing, i.e. whether a
            // remote prefix exists for this offloaded timeline.
            if remote_timeline_ids.remove(&offloaded.timeline_id) {
                offloaded_with_prefix.insert(offloaded.timeline_id);
            } else {
                // We'll take care later of timelines in the manifest without a prefix
            }
        }
    }

    // TODO(vlad): Could go to S3 if the secondary is freezing cold and hasn't even
    // pulled the first heatmap. Not entirely necessary since the storage controller
    // will kick the secondary in any case and cause a download.
    let maybe_heatmap_at = self.read_on_disk_heatmap().await;

    let timelines = self
        .load_timelines_metadata(
            remote_timeline_ids,
            remote_storage,
            maybe_heatmap_at,
            cancel,
        )
        .await?;

    Ok(TenantPreload {
        tenant_manifest,
        timelines: timelines
            .into_iter()
            .map(|(id, tl)| (id, Some(tl)))
            .chain(offloaded_with_prefix.into_iter().map(|id| (id, None)))
            .collect(),
    })
}

/// Reads the heatmap file at `tenant_heatmap_path` and deserializes it from JSON.
///
/// Returns `None` when `load_previous_heatmap` is disabled, when the file does not
/// exist, or when reading or deserializing fails (the latter two are logged as
/// errors). On success the heatmap is paired with an `Instant` taken after
/// deserialization.
async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> {
    if !self.conf.load_previous_heatmap {
        return None;
    }

    let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id);
    match tokio::fs::read_to_string(on_disk_heatmap_path).await {
        Ok(heatmap) => match serde_json::from_str::(&heatmap) {
            Ok(heatmap) => Some((heatmap, std::time::Instant::now())),
            Err(err) => {
                error!("Failed to deserialize old heatmap: {err}");
                None
            }
        },
        Err(err) => match err.kind() {
            // A missing heatmap file is expected and is not logged.
            std::io::ErrorKind::NotFound => None,
            _ => {
                error!("Unexpected IO error reading old heatmap: {err}");
                None
            }
        },
    }
}

///
/// Background task that downloads all data for a tenant and brings it to Active state.
///
/// No background tasks are started as part of this routine.
///
/// Works from the result of `preload`, in this order:
/// 1. builds the offloaded-timeline list from the tenant manifest, dropping entries
///    that have no remote prefix;
/// 2. classifies each preloaded timeline by its index part (present, deleted,
///    not found, failed to download);
/// 3. loads the present timelines in ancestor-first order via `load_remote_timeline`;
/// 4. registers offloaded timelines, initializes GC info and layer visibility;
/// 5. resumes pending timeline deletions;
/// 6. stores and possibly uploads the tenant manifest, then purges stale local
///    timeline directories.
///
/// NOTE(review): for timelines that report `NeedsSpawnImportPgdata`, an import task
/// is spawned below with `tokio::task::spawn`.
///
/// # Errors
///
/// Fails if `preload` is `None`, on a `DownloadError::Fatal` for any timeline, and
/// on errors from any of the steps above.
async fn attach(
    self: &Arc,
    preload: Option,
    ctx: &RequestContext,
) -> anyhow::Result<()> {
    span::debug_assert_current_span_has_tenant_id();

    failpoint_support::sleep_millis_async!("before-attaching-tenant");

    let Some(preload) = preload else {
        anyhow::bail!(
            "local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"
        );
    };

    // IDs of all timelines the manifest lists as offloaded, and the matching
    // in-memory objects that will be registered further below.
    let mut offloaded_timeline_ids = HashSet::new();
    let mut offloaded_timelines_list = Vec::new();
    if let Some(tenant_manifest) = &preload.tenant_manifest {
        for timeline_manifest in tenant_manifest.offloaded_timelines.iter() {
            let timeline_id = timeline_manifest.timeline_id;
            let offloaded_timeline =
                OffloadedTimeline::from_manifest(self.tenant_shard_id, timeline_manifest);
            offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline)));
            offloaded_timeline_ids.insert(timeline_id);
        }
    }
    // Complete deletions for offloaded timeline id's from manifest.
    // The manifest will be uploaded later in this function.
    offloaded_timelines_list
        .retain(|(offloaded_id, offloaded)| {
            // Existence of a timeline is finally determined by the existence of an index-part.json in remote storage.
            // If there are dangling references in another location, they need to be cleaned up.
            let delete = !preload.timelines.contains_key(offloaded_id);
            if delete {
                tracing::info!("Removing offloaded timeline {offloaded_id} from manifest as no remote prefix was found");
                offloaded.defuse_for_tenant_drop();
            }
            !delete
    });

    let mut timelines_to_resume_deletions = vec![];

    // timeline id -> (index part, remote client, previous heatmap), for timelines to load.
    let mut remote_index_and_client = HashMap::new();
    // timeline id -> metadata, used to sort timelines ancestor-first.
    let mut timeline_ancestors = HashMap::new();
    // Timelines whose local directory must be kept by `clean_up_timelines`.
    let mut existent_timelines = HashSet::new();
    for (timeline_id, preload) in preload.timelines {
        // `None` entries are offloaded timelines; they are handled through the manifest.
        let Some(preload) = preload else { continue };
        // This is an invariant of the `preload` function's API
        assert!(!offloaded_timeline_ids.contains(&timeline_id));
        let index_part = match preload.index_part {
            Ok(i) => {
                debug!("remote index part exists for timeline {timeline_id}");
                // We found index_part on the remote, this is the standard case.
                existent_timelines.insert(timeline_id);
                i
            }
            Err(DownloadError::NotFound) => {
                // There is no index_part on the remote. We only get here
                // if there is some prefix for the timeline in the remote storage.
                // This can e.g. be the initdb.tar.zst archive, maybe a
                // remnant from a prior incomplete creation or deletion attempt.
                // Delete the local directory as the deciding criterion for a
                // timeline's existence is presence of index_part.
                // (The deletion itself happens in `clean_up_timelines`, because the
                // ID is not added to `existent_timelines`.)
                info!(%timeline_id, "index_part not found on remote");
                continue;
            }
            Err(DownloadError::Fatal(why)) => {
                // If, while loading one remote timeline, we saw an indication that our generation
                // number is likely invalid, then we should not load the whole tenant.
                error!(%timeline_id, "Fatal error loading timeline: {why}");
                anyhow::bail!(why.to_string());
            }
            Err(e) => {
                // Some (possibly ephemeral) error happened during index_part download.
                // Pretend the timeline exists to not delete the timeline directory,
                // as it might be a temporary issue and we don't want to re-download
                // everything after it resolves.
                warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})");

                existent_timelines.insert(timeline_id);
                continue;
            }
        };
        match index_part {
            MaybeDeletedIndexPart::IndexPart(index_part) => {
                timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
                remote_index_and_client.insert(
                    timeline_id,
                    (index_part, preload.client, preload.previous_heatmap),
                );
            }
            MaybeDeletedIndexPart::Deleted(index_part) => {
                info!(
                    "timeline {} is deleted, picking to resume deletion",
                    timeline_id
                );
                timelines_to_resume_deletions.push((timeline_id, index_part, preload.client));
            }
        }
    }

    let mut gc_blocks = HashMap::new();

    // For every timeline, download the metadata file, scan the local directory,
    // and build a layer map that contains an entry for each remote and local
    // layer file.
    let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?;
    for (timeline_id, remote_metadata) in sorted_timelines {
        let (index_part, remote_client, previous_heatmap) = remote_index_and_client
            .remove(&timeline_id)
            .expect("just put it in above");

        if let Some(blocking) = index_part.gc_blocking.as_ref() {
            // could just filter these away, but it helps while testing
            anyhow::ensure!(
                !blocking.reasons.is_empty(),
                "index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons"
            );
            let prev = gc_blocks.insert(timeline_id, blocking.reasons);
            assert!(prev.is_none());
        }

        // TODO again handle early failure
        let effect = self
            .load_remote_timeline(
                timeline_id,
                index_part,
                remote_metadata,
                previous_heatmap,
                self.get_timeline_resources_for(remote_client),
                LoadTimelineCause::Attach,
                ctx,
            )
            .await
            .with_context(|| {
                format!(
                    "failed to load remote timeline {} for tenant {}",
                    timeline_id, self.tenant_shard_id
                )
            })?;

        match effect {
            TimelineInitAndSyncResult::ReadyToActivate => {
                // activation happens later, on Tenant::activate
            }
            TimelineInitAndSyncResult::NeedsSpawnImportPgdata(
                TimelineInitAndSyncNeedsSpawnImportPgdata {
                    timeline,
                    import_pgdata,
                    guard,
                },
            ) => {
                let timeline_id = timeline.timeline_id;
                // A fresh gate dedicated to the import task; its guard is handed to the task.
                let import_task_gate = Gate::default();
                let import_task_guard = import_task_gate.enter().unwrap();
                let import_task_handle =
                    tokio::task::spawn(self.clone().create_timeline_import_pgdata_task(
                        timeline.clone(),
                        import_pgdata,
                        guard,
                        import_task_guard,
                        ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
                    ));

                let prev = self.timelines_importing.lock().unwrap().insert(
                    timeline_id,
                    Arc::new(ImportingTimeline {
                        timeline: timeline.clone(),
                        import_task_handle,
                        import_task_gate,
                        delete_progress: TimelineDeleteProgress::default(),
                    }),
                );

                assert!(prev.is_none());
            }
        }
    }

    // At this point we've initialized all timelines and are tracking them.
    // Now compute the layer visibility for all (not offloaded) timelines.
    let compute_visiblity_for = {
        // Both std mutexes are held only inside this block, so they are released
        // before the `.await` in the loop below.
        let timelines_accessor = self.timelines.lock().unwrap();
        let mut timelines_offloaded_accessor = self.timelines_offloaded.lock().unwrap();

        timelines_offloaded_accessor.extend(offloaded_timelines_list.into_iter());

        // Before activation, populate each Timeline's GcInfo with information about its children
        self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor, None);

        timelines_accessor.values().cloned().collect::>()
    };

    for tl in compute_visiblity_for {
        tl.update_layer_visibility().await.with_context(|| {
            format!(
                "failed initial timeline visibility computation {} for tenant {}",
                tl.timeline_id, self.tenant_shard_id
            )
        })?;
    }

    // Walk through deleted timelines, resume deletion
    for (timeline_id, index_part, remote_timeline_client) in timelines_to_resume_deletions {
        remote_timeline_client
            .init_upload_queue_stopped_to_continue_deletion(&index_part)
            .context("init queue stopped")
            .map_err(LoadLocalTimelineError::ResumeDeletion)?;

        DeleteTimelineFlow::resume_deletion(
            Arc::clone(self),
            timeline_id,
            &index_part.metadata,
            remote_timeline_client,
            ctx,
        )
        .instrument(tracing::info_span!("timeline_delete", %timeline_id))
        .await
        .context("resume_deletion")
        .map_err(LoadLocalTimelineError::ResumeDeletion)?;
    }

    // Stash the preloaded tenant manifest, and upload a new manifest if changed.
    //
    // NB: this must happen after the tenant is fully populated above. In particular the
    // offloaded timelines, which are included in the manifest.
    {
        let mut guard = self.remote_tenant_manifest.lock().await;
        assert!(guard.is_none(), "tenant manifest set before preload"); // first populated here
        *guard = preload.tenant_manifest;
    }
    self.maybe_upload_tenant_manifest().await?;

    // The local filesystem contents are a cache of what's in the remote IndexPart;
    // IndexPart is the source of truth.
    self.clean_up_timelines(&existent_timelines)?;

    self.gc_block.set_scanned(gc_blocks);

    fail::fail_point!("attach-before-activate", |_| {
        anyhow::bail!("attach-before-activate");
    });
    failpoint_support::sleep_millis_async!("attach-before-activate-sleep", &self.cancel);

    info!("Done");

    Ok(())
}

/// Check for any local timeline directories that are temporary, or do not correspond to a
/// timeline that still exists: this can happen if we crashed during a deletion/creation, or
/// if a timeline was deleted while the tenant was attached to a different pageserver.
///
/// Entries whose name does not parse as a `TimelineId` are left in place. A failure
/// to remove an individual entry is logged and does not fail the call; a missing
/// timelines directory is treated as success.
fn clean_up_timelines(&self, existent_timelines: &HashSet) -> anyhow::Result<()> {
    let timelines_dir = self.conf.timelines_path(&self.tenant_shard_id);

    let entries = match timelines_dir.read_dir_utf8() {
        Ok(d) => d,
        Err(e) => {
            if e.kind() == std::io::ErrorKind::NotFound {
                return Ok(());
            } else {
                return Err(e).context("list timelines directory for tenant");
            }
        }
    };

    for entry in entries {
        let entry = entry.context("read timeline dir entry")?;
        let entry_path = entry.path();

        let purge = if crate::is_temporary(entry_path) {
            true
        } else {
            match TimelineId::try_from(entry_path.file_name()) {
                Ok(i) => {
                    // Purge if the timeline ID does not exist in remote storage: remote storage is the authority.
                    !existent_timelines.contains(&i)
                }
                Err(e) => {
                    tracing::warn!(
                        "Unparseable directory in timelines directory: {entry_path}, ignoring ({e})"
                    );
                    // Do not purge junk: if we don't recognize it, be cautious and leave it for a human.
                    false
                }
            }
        };

        if purge {
            tracing::info!("Purging stale timeline dentry {entry_path}");
            // Directories are removed recursively, anything else as a file; a
            // not-found result from the removal is ignored.
            if let Err(e) = match entry.file_type() {
                Ok(t) => if t.is_dir() {
                    std::fs::remove_dir_all(entry_path)
                } else {
                    std::fs::remove_file(entry_path)
                }
                .or_else(fs_ext::ignore_not_found),
                Err(e) => Err(e),
            } {
                tracing::warn!("Failed to purge stale timeline dentry {entry_path}: {e}");
            }
        }
    }

    Ok(())
}

/// Get sum of all remote timelines sizes
///
/// This function relies on the index_part instead of listing the remote storage
///
/// Only timelines returned by `list_timelines` are counted.
pub fn remote_size(&self) -> u64 {
    let mut size = 0;

    for timeline in self.list_timelines() {
        size += timeline.remote_client.get_remote_physical_size();
    }

    size
}

/// Creates the local timeline directory, resolves the ancestor named in
/// `remote_metadata` from the in-memory timelines map, and hands everything to
/// `timeline_init_and_sync`.
///
/// # Errors
///
/// Fails if the directory cannot be created, if the ancestor timeline is not
/// present in `self.timelines`, or if `timeline_init_and_sync` fails.
#[instrument(skip_all, fields(timeline_id=%timeline_id))]
#[allow(clippy::too_many_arguments)]
async fn load_remote_timeline(
    self: &Arc,
    timeline_id: TimelineId,
    index_part: IndexPart,
    remote_metadata: TimelineMetadata,
    previous_heatmap: Option,
    resources: TimelineResources,
    cause: LoadTimelineCause,
    ctx: &RequestContext,
) -> anyhow::Result {
    span::debug_assert_current_span_has_tenant_id();

    info!("downloading index file for timeline {}", timeline_id);
    tokio::fs::create_dir_all(self.conf.timeline_path(&self.tenant_shard_id, &timeline_id))
        .await
        .context("Failed to create new timeline directory")?;

    // The timelines lock is scoped to this expression; it is released before the
    // `.await` below.
    let ancestor = if let Some(ancestor_id) = remote_metadata.ancestor_timeline() {
        let timelines = self.timelines.lock().unwrap();
        Some(Arc::clone(timelines.get(&ancestor_id).ok_or_else(
            || {
                anyhow::anyhow!(
                    "cannot find ancestor timeline {ancestor_id} for timeline {timeline_id}"
                )
            },
        )?))
    } else {
        None
    };

    self.timeline_init_and_sync(
        timeline_id,
        resources,
        index_part,
        remote_metadata,
        previous_heatmap,
        ancestor,
        cause,
        ctx,
    )
    .await
}

/// Downloads the index part of every timeline in `timeline_ids` concurrently, one
/// task per timeline in a `JoinSet`, and collects the results keyed by timeline ID.
///
/// If `heatmap` is given, each timeline's entry is taken out of it and passed along
/// as `PreviousHeatmap::Active`.
///
/// # Errors
///
/// Fails if a download task cannot be joined, or if `cancel` fires while waiting.
/// A failed index part download is not an error here: it is carried inside the
/// returned `TimelinePreload`.
async fn load_timelines_metadata(
    self: &Arc,
    timeline_ids: HashSet,
    remote_storage: &GenericRemoteStorage,
    heatmap: Option<(HeatMapTenant, std::time::Instant)>,
    cancel: CancellationToken,
) -> anyhow::Result> {
    // (per-timeline heatmap index, time the heatmap was read)
    let mut timeline_heatmaps = heatmap.map(|h| (h.0.into_timelines_index(), h.1));

    let mut part_downloads = JoinSet::new();
    for timeline_id in timeline_ids {
        let cancel_clone = cancel.clone();

        let previous_timeline_heatmap = timeline_heatmaps.as_mut().and_then(|hs| {
            hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active {
                heatmap: h,
                read_at: hs.1,
                end_lsn: None,
            })
        });
        part_downloads.spawn(
            self.load_timeline_metadata(
                timeline_id,
                remote_storage.clone(),
                previous_timeline_heatmap,
                cancel_clone,
            )
            .instrument(info_span!("download_index_part", %timeline_id)),
        );
    }

    let mut timeline_preloads: HashMap = HashMap::new();

    // Drain the join set until it is empty, bailing out early on cancellation.
    loop {
        tokio::select!(
            next = part_downloads.join_next() => {
                match next {
                    Some(result) => {
                        let preload = result.context("join preload task")?;
                        timeline_preloads.insert(preload.timeline_id, preload);
                    },
                    None => {
                        break;
                    }
                }
            },
            _ = cancel.cancelled() => {
                anyhow::bail!("Cancelled while waiting for remote index download")
            }
        )
    }

    Ok(timeline_preloads)
}

/// Builds a `RemoteTimelineClient` for `timeline_id` from this tenant's deletion
/// queue client, configuration, shard ID, generation and current location config.
fn build_timeline_client(
    &self,
    timeline_id: TimelineId,
    remote_storage: GenericRemoteStorage,
) -> RemoteTimelineClient {
    RemoteTimelineClient::new(
        remote_storage.clone(),
        self.deletion_queue_client.clone(),
        self.conf,
        self.tenant_shard_id,
        timeline_id,
        self.generation,
        &self.tenant_conf.load().location,
    )
}

/// Returns a future that downloads the index part of `timeline_id` and bundles the
/// download result with the client and `previous_heatmap` into a `TimelinePreload`.
///
/// The remote client is built before the future is created, so the returned
/// future does not use `self`.
fn load_timeline_metadata(
    self: &Arc,
    timeline_id: TimelineId,
    remote_storage: GenericRemoteStorage,
    previous_heatmap: Option,
    cancel: CancellationToken,
) -> impl Future + use<> {
    let client = self.build_timeline_client(timeline_id, remote_storage);
    async move {
        debug_assert_current_span_has_tenant_and_timeline_id();
        debug!("starting index part download");

        // The download result is stored as-is; errors are handled by the consumer.
        let index_part = client.download_index_file(&cancel).await;

        debug!("finished index part download");

        TimelinePreload {
            client,
            timeline_id,
            index_part,
            previous_heatmap,
        }
    }
}

/// Validates that `timeline_id` can be archived: every timeline in `timelines`
/// whose ancestor is `timeline_id` must itself be archived.
///
/// Returns `HasUnarchivedChildren` with the IDs of the offending children otherwise.
fn check_to_be_archived_has_no_unarchived_children(
    timeline_id: TimelineId,
    timelines: &std::sync::MutexGuard<'_, HashMap>>,
) -> Result<(), TimelineArchivalError> {
    let children: Vec = timelines
        .iter()
        .filter_map(|(id, entry)| {
            // Not a child of the timeline being archived.
            if entry.get_ancestor_timeline_id() != Some(timeline_id) {
                return None;
            }
            // Archived children do not block archival. Anything other than
            // `Some(true)` counts as unarchived here.
            if entry.is_archived() == Some(true) {
                return None;
            }
            Some(*id)
        })
        .collect();

    if !children.is_empty() {
        return Err(TimelineArchivalError::HasUnarchivedChildren(children));
    }
    Ok(())
}

/// Validates that the ancestor of an offloaded timeline that is about to be
/// unarchived is not itself archived.
///
/// An ancestor found in `offloaded_timelines` is treated as archived. If the
/// ancestor is in neither map, this logs an error, panics in builds with debug
/// assertions, and returns `NotFound` otherwise.
fn check_ancestor_of_to_be_unarchived_is_not_archived(
    ancestor_timeline_id: TimelineId,
    timelines: &std::sync::MutexGuard<'_, HashMap>>,
    offloaded_timelines: &std::sync::MutexGuard<
        '_,
        HashMap>,
    >,
) -> Result<(), TimelineArchivalError> {
    let has_archived_parent =
        if let Some(ancestor_timeline) = timelines.get(&ancestor_timeline_id) {
            ancestor_timeline.is_archived() == Some(true)
        } else if offloaded_timelines.contains_key(&ancestor_timeline_id) {
            true
        } else {
            error!("ancestor timeline {ancestor_timeline_id} not found");
            if cfg!(debug_assertions) {
                panic!("ancestor timeline {ancestor_timeline_id} not found");
            }
            return Err(TimelineArchivalError::NotFound);
        };
    if has_archived_parent {
        return Err(TimelineArchivalError::HasArchivedParent(
            ancestor_timeline_id,
        ));
    }
    Ok(())
}

/// Validates that a loaded timeline that is about to be unarchived has no archived
/// ancestor. A timeline without an ancestor always passes.
fn check_to_be_unarchived_timeline_has_no_archived_parent(
    timeline: &Arc,
) -> Result<(), TimelineArchivalError> {
    if let Some(ancestor_timeline) = timeline.ancestor_timeline() {
        if ancestor_timeline.is_archived() == Some(true) {
            return Err(TimelineArchivalError::HasArchivedParent(
                ancestor_timeline.timeline_id,
            ));
        }
    }
    Ok(())
}

/// Loads the specified (offloaded) timeline from S3 and attaches it as a loaded timeline
///
/// Counterpart to [`offload_timeline`].
///
/// Must be called on an active tenant: this panics if the tenant is Activating,
/// Attaching or Broken, and returns `Cancelled` if it is Stopping.
///
/// On success the timeline has been loaded from its remote index part, removed from
/// the offloaded set, the tenant manifest has been re-uploaded if needed, and the
/// timeline has been activated unless it is broken or stopping.
async fn unoffload_timeline(
    self: &Arc,
    timeline_id: TimelineId,
    broker_client: storage_broker::BrokerClientChannel,
    ctx: RequestContext,
) -> Result, TimelineArchivalError> {
    info!("unoffloading timeline");

    // We activate the timeline below manually, so this must be called on an active tenant.
    // We expect callers of this function to ensure this.
    // NOTE(review): the panic message says "Timeline" although the state checked
    // here is the tenant's.
    match self.current_state() {
        TenantState::Activating { .. }
        | TenantState::Attaching
        | TenantState::Broken { .. } => {
            panic!("Timeline expected to be active")
        }
        TenantState::Stopping { .. } => return Err(TimelineArchivalError::Cancelled),
        TenantState::Active => {}
    }
    let cancel = self.cancel.clone();

    // Protect against concurrent attempts to use this TimelineId
    // We don't care much about idempotency, as it's ensured a layer above.
    let allow_offloaded = true;
    // Held until the end of the function.
    let _create_guard = self
        .create_timeline_create_guard(
            timeline_id,
            CreateTimelineIdempotency::FailWithConflict,
            allow_offloaded,
        )
        .map_err(|err| match err {
            TimelineExclusionError::AlreadyCreating => TimelineArchivalError::AlreadyInProgress,
            TimelineExclusionError::AlreadyExists { .. } => {
                TimelineArchivalError::Other(anyhow::anyhow!("Timeline already exists"))
            }
            TimelineExclusionError::Other(e) => TimelineArchivalError::Other(e),
            TimelineExclusionError::ShuttingDown => TimelineArchivalError::Cancelled,
        })?;

    // No previous heatmap is supplied when unoffloading.
    let timeline_preload = self
        .load_timeline_metadata(
            timeline_id,
            self.remote_storage.clone(),
            None,
            cancel.clone(),
        )
        .await;

    let index_part = match timeline_preload.index_part {
        Ok(index_part) => {
            debug!("remote index part exists for timeline {timeline_id}");
            index_part
        }
        Err(DownloadError::NotFound) => {
            error!(%timeline_id, "index_part not found on remote");
            return Err(TimelineArchivalError::NotFound);
        }
        Err(DownloadError::Cancelled) => return Err(TimelineArchivalError::Cancelled),
        Err(e) => {
            // Some (possibly ephemeral) error happened during index_part download.
            warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})");
            return Err(TimelineArchivalError::Other(
                anyhow::Error::new(e).context("downloading index_part from remote storage"),
            ));
        }
    };
    // A timeline whose index part is marked deleted is reported as not found.
    let index_part = match index_part {
        MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
        MaybeDeletedIndexPart::Deleted(_index_part) => {
            info!("timeline is deleted according to index_part.json");
            return Err(TimelineArchivalError::NotFound);
        }
    };
    let remote_metadata = index_part.metadata.clone();
    let timeline_resources = self.build_timeline_resources(timeline_id);
    // The returned init result is discarded; the timeline is looked up in the map below.
    self.load_remote_timeline(
        timeline_id,
        index_part,
        remote_metadata,
        None,
        timeline_resources,
        LoadTimelineCause::Unoffload,
        &ctx,
    )
    .await
    .with_context(|| {
        format!(
            "failed to load remote timeline {} for tenant {}",
            timeline_id, self.tenant_shard_id
        )
    })
    .map_err(TimelineArchivalError::Other)?;

    let timeline = {
        // Lock order in this block: `timelines` first, then `timelines_offloaded`.
        let timelines = self.timelines.lock().unwrap();
        let Some(timeline) = timelines.get(&timeline_id) else {
            warn!("timeline not available directly after attach");
            // This is not a panic because no locks are held between `load_remote_timeline`
            // which puts the timeline into timelines, and our look into the timeline map.
            return Err(TimelineArchivalError::Other(anyhow::anyhow!(
                "timeline not available directly after attach"
            )));
        };
        let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap();
        match offloaded_timelines.remove(&timeline_id) {
            Some(offloaded) => {
                offloaded.delete_from_ancestor_with_timelines(&timelines);
            }
            None => warn!("timeline already removed from offloaded timelines"),
        }

        self.initialize_gc_info(&timelines, &offloaded_timelines, Some(timeline_id));

        Arc::clone(timeline)
    };

    // Upload new list of offloaded timelines to S3
    self.maybe_upload_tenant_manifest().await?;

    // Activate the timeline (if it makes sense)
    if !(timeline.is_broken() || timeline.is_stopping()) {
        let background_jobs_can_start = None;
        timeline.activate(
            self.clone(),
            broker_client.clone(),
            background_jobs_can_start,
            &ctx.with_scope_timeline(&timeline),
        );
    }

    info!("timeline unoffloading complete");
    Ok(timeline)
}

/// Sets the archival state of a timeline and waits for the new state to be
/// uploaded.
///
/// Works in three parts: validate the request against the loaded and offloaded
/// timeline maps, unoffload the timeline if it is currently offloaded, then
/// schedule an index upload and wait up to `MAX_WAIT` for the upload queue.
///
/// Archiving an already offloaded timeline is a no-op that returns `Ok(())`.
///
/// # Errors
///
/// `NotFound` if the timeline is neither loaded nor offloaded, the validation
/// errors of the `check_*` helpers, errors from `unoffload_timeline`, `Cancelled`
/// if the timeline is cancelled or its upload queue is shut down or stopped, and
/// `Timeout` if the upload does not complete within `MAX_WAIT`.
pub(crate) async fn apply_timeline_archival_config(
    self: &Arc,
    timeline_id: TimelineId,
    new_state: TimelineArchivalState,
    broker_client: storage_broker::BrokerClientChannel,
    ctx: RequestContext,
) -> Result<(), TimelineArchivalError> {
    info!("setting timeline archival config");
    // First part: figure out what is needed to do, and do validation
    // Evaluates to `Some(timeline)` for a loaded timeline, or `None` when the
    // timeline is offloaded and has to be unoffloaded first.
    let timeline_or_unarchive_offloaded = 'outer: {
        let timelines = self.timelines.lock().unwrap();

        let Some(timeline) = timelines.get(&timeline_id) else {
            let offloaded_timelines = self.timelines_offloaded.lock().unwrap();
            let Some(offloaded) = offloaded_timelines.get(&timeline_id) else {
                return Err(TimelineArchivalError::NotFound);
            };
            if new_state == TimelineArchivalState::Archived {
                // It's offloaded already, so nothing to do
                return Ok(());
            }
            if let Some(ancestor_timeline_id) = offloaded.ancestor_timeline_id {
                Self::check_ancestor_of_to_be_unarchived_is_not_archived(
                    ancestor_timeline_id,
                    &timelines,
                    &offloaded_timelines,
                )?;
            }
            break 'outer None;
        };

        // Do some validation. We release the timelines lock below, so there is potential
        // for race conditions: these checks are more present to prevent misunderstandings of
        // the API's capabilities, instead of serving as the sole way to defend their invariants.
        match new_state {
            TimelineArchivalState::Unarchived => {
                Self::check_to_be_unarchived_timeline_has_no_archived_parent(timeline)?
            }
            TimelineArchivalState::Archived => {
                Self::check_to_be_archived_has_no_unarchived_children(timeline_id, &timelines)?
            }
        }
        Some(Arc::clone(timeline))
    };

    // Second part: unoffload timeline (if needed)
    let timeline = if let Some(timeline) = timeline_or_unarchive_offloaded {
        timeline
    } else {
        // Turn offloaded timeline into a non-offloaded one
        self.unoffload_timeline(timeline_id, broker_client, ctx)
            .await?
    };

    // Third part: upload new timeline archival state and block until it is present in S3
    let upload_needed = match timeline
        .remote_client
        .schedule_index_upload_for_timeline_archival_state(new_state)
    {
        Ok(upload_needed) => upload_needed,
        Err(e) => {
            // A scheduling failure on a cancelled timeline is reported as cancellation.
            if timeline.cancel.is_cancelled() {
                return Err(TimelineArchivalError::Cancelled);
            } else {
                return Err(TimelineArchivalError::Other(e));
            }
        }
    };

    if upload_needed {
        info!("Uploading new state");
        const MAX_WAIT: Duration = Duration::from_secs(10);
        let Ok(v) =
            tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
        else {
            tracing::warn!("reached timeout for waiting on upload queue");
            return Err(TimelineArchivalError::Timeout);
        };
        v.map_err(|e| match e {
            WaitCompletionError::NotInitialized(e) => {
                TimelineArchivalError::Other(anyhow::anyhow!(e))
            }
            WaitCompletionError::UploadQueueShutDownOrStopped => {
                TimelineArchivalError::Cancelled
            }
        })?;
    }
    Ok(())
}

/// Looks up `timeline_id` among the offloaded timelines and returns a clone of its
/// handle, or `GetTimelineError::NotFound` if it is not offloaded.
pub fn get_offloaded_timeline(
    &self,
    timeline_id: TimelineId,
) -> Result, GetTimelineError> {
    self.timelines_offloaded
        .lock()
        .unwrap()
        .get(&timeline_id)
        .map(Arc::clone)
        .ok_or(GetTimelineError::NotFound {
            tenant_id: self.tenant_shard_id,
            timeline_id,
        })
}

/// Returns this tenant shard's ID.
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
    self.tenant_shard_id
}

/// Get Timeline handle for given Neon timeline ID.
/// This function is idempotent. It doesn't change internal state in any way.
///
/// With `active_only` set, a timeline that is present but not active yields
/// `GetTimelineError::NotActive` carrying its current state. Only the `timelines`
/// map is consulted.
pub fn get_timeline(
    &self,
    timeline_id: TimelineId,
    active_only: bool,
) -> Result, GetTimelineError> {
    let timelines_accessor = self.timelines.lock().unwrap();
    let timeline = timelines_accessor
        .get(&timeline_id)
        .ok_or(GetTimelineError::NotFound {
            tenant_id: self.tenant_shard_id,
            timeline_id,
        })?;

    if active_only && !timeline.is_active() {
        Err(GetTimelineError::NotActive {
            tenant_id: self.tenant_shard_id,
            timeline_id,
            state: timeline.current_state(),
        })
    } else {
        Ok(Arc::clone(timeline))
    }
}

/// Lists timelines the tenant contains.
/// It's up to callers to omit certain timelines that are not considered ready for use.
pub fn list_timelines(&self) -> Vec> {
    self.timelines
        .lock()
        .unwrap()
        .values()
        .map(Arc::clone)
        .collect()
}

/// Lists the entries of the tenant's `timelines_importing` map.
/// It's up to callers to omit certain timelines that are not considered ready for use.
pub fn list_importing_timelines(&self) -> Vec> {
    self.timelines_importing
        .lock()
        .unwrap()
        .values()
        .map(Arc::clone)
        .collect()
}

/// Lists timelines the tenant manages, including offloaded ones.
///
/// It's up to callers to omit certain timelines that are not considered ready for use.
///
/// The two maps are locked one after the other, not together, so the two lists
/// are not taken under a single lock.
pub fn list_timelines_and_offloaded(
    &self,
) -> (Vec>, Vec>) {
    let timelines = self
        .timelines
        .lock()
        .unwrap()
        .values()
        .map(Arc::clone)
        .collect();
    let offloaded = self
        .timelines_offloaded
        .lock()
        .unwrap()
        .values()
        .map(Arc::clone)
        .collect();
    (timelines, offloaded)
}

/// Returns the IDs of all timelines in the `timelines` map.
pub fn list_timeline_ids(&self) -> Vec {
    self.timelines.lock().unwrap().keys().cloned().collect()
}

/// This is used by tests & import-from-basebackup.
///
/// The returned [`UninitializedTimeline`] contains no data nor metadata and it is in
/// a state that will fail [`TenantShard::load_remote_timeline`] because `disk_consistent_lsn=Lsn(0)`.
/// /// The caller is responsible for getting the timeline into a state that will be accepted /// by [`TenantShard::load_remote_timeline`] / [`TenantShard::attach`]. /// Then they may call [`UninitializedTimeline::finish_creation`] to add the timeline /// to the [`TenantShard::timelines`]. /// /// Tests should use `TenantShard::create_test_timeline` to set up the minimum required metadata keys. pub(crate) async fn create_empty_timeline( self: &Arc, new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: PgMajorVersion, ctx: &RequestContext, ) -> anyhow::Result<(UninitializedTimeline, RequestContext)> { anyhow::ensure!( self.is_active(), "Cannot create empty timelines on inactive tenant" ); // Protect against concurrent attempts to use this TimelineId let create_guard = match self .start_creating_timeline(new_timeline_id, CreateTimelineIdempotency::FailWithConflict) .await? { StartCreatingTimelineResult::CreateGuard(guard) => guard, StartCreatingTimelineResult::Idempotent(_) => { unreachable!("FailWithConflict implies we get an error instead") } }; let new_metadata = TimelineMetadata::new( // Initialize disk_consistent LSN to 0, The caller must import some data to // make it valid, before calling finish_creation() Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn, pg_version, ); self.prepare_new_timeline( new_timeline_id, &new_metadata, create_guard, initdb_lsn, None, None, None, ctx, ) .await } /// Helper for unit tests to create an empty timeline. /// /// The timeline is has state value `Active` but its background loops are not running. // This makes the various functions which anyhow::ensure! for Active state work in tests. // Our current tests don't need the background loops. 
#[cfg(test)] pub async fn create_test_timeline( self: &Arc, new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: PgMajorVersion, ctx: &RequestContext, ) -> anyhow::Result> { let (uninit_tl, ctx) = self .create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx) .await?; let tline = uninit_tl.raw_timeline().expect("we just created it"); assert_eq!(tline.get_last_record_lsn(), Lsn(0)); // Setup minimum keys required for the timeline to be usable. let mut modification = tline.begin_modification(initdb_lsn); modification .init_empty_test_timeline() .context("init_empty_test_timeline")?; modification .commit(&ctx) .await .context("commit init_empty_test_timeline modification")?; // Flush to disk so that uninit_tl's check for valid disk_consistent_lsn passes. tline.maybe_spawn_flush_loop(); tline.freeze_and_flush().await.context("freeze_and_flush")?; // Make sure the freeze_and_flush reaches remote storage. tline.remote_client.wait_completion().await.unwrap(); let tl = uninit_tl.finish_creation().await?; // The non-test code would call tl.activate() here. tl.set_state(TimelineState::Active); Ok(tl) } /// Helper for unit tests to create a timeline with some pre-loaded states. 
#[cfg(test)]
#[allow(clippy::too_many_arguments)]
pub async fn create_test_timeline_with_layers(
    self: &Arc,
    new_timeline_id: TimelineId,
    initdb_lsn: Lsn,
    pg_version: PgMajorVersion,
    ctx: &RequestContext,
    in_memory_layer_desc: Vec,
    delta_layer_desc: Vec,
    image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
    end_lsn: Lsn,
) -> anyhow::Result> {
    use checks::check_valid_layermap;
    use itertools::Itertools;

    // Start from a regular test timeline, then force its LSN forward to `end_lsn`
    // before injecting the requested layers.
    let tline = self
        .create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
        .await?;
    tline.force_advance_lsn(end_lsn);
    // Layers are created in this order: deltas, then images, then in-memory layers.
    for deltas in delta_layer_desc {
        tline
            .force_create_delta_layer(deltas, Some(initdb_lsn), ctx)
            .await?;
    }
    for (lsn, images) in image_layer_desc {
        tline
            .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
            .await?;
    }
    for in_memory in in_memory_layer_desc {
        tline
            .force_create_in_memory_layer(in_memory, Some(initdb_lsn), ctx)
            .await?;
    }
    // Sanity-check the resulting set of historic layers; bail out if the check
    // reports a problem.
    let layer_names = tline
        .layers
        .read(LayerManagerLockHolder::Testing)
        .await
        .layer_map()
        .unwrap()
        .iter_historic_layers()
        .map(|layer| layer.layer_name())
        .collect_vec();
    if let Some(err) = check_valid_layermap(&layer_names) {
        bail!("invalid layermap: {err}");
    }
    Ok(tline)
}

/// Create a new timeline.
///
/// Returns the new timeline ID and reference to its Timeline object.
///
/// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with
/// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
///
/// Returns `CreateTimelineError::ShuttingDown` if the tenant is in `Stopping` state or
/// its gate can no longer be entered, and `CreateTimelineError::Other` if the tenant is
/// inactive for any other reason.
#[allow(clippy::too_many_arguments)]
pub(crate) async fn create_timeline(
    self: &Arc,
    params: CreateTimelineParams,
    broker_client: storage_broker::BrokerClientChannel,
    ctx: &RequestContext,
) -> Result, CreateTimelineError> {
    if !self.is_active() {
        if matches!(self.current_state(), TenantState::Stopping { .. }) {
            return Err(CreateTimelineError::ShuttingDown);
        } else {
            return Err(CreateTimelineError::Other(anyhow::anyhow!(
                "Cannot create timelines on inactive tenant"
            )));
        }
    }

    // Hold the tenant gate for the rest of this function.
    let _gate = self
        .gate
        .enter()
        .map_err(|_| CreateTimelineError::ShuttingDown)?;

    let result: CreateTimelineResult = match params {
        CreateTimelineParams::Bootstrap(CreateTimelineParamsBootstrap {
            new_timeline_id,
            existing_initdb_timeline_id,
            pg_version,
        }) => {
            self.bootstrap_timeline(
                new_timeline_id,
                pg_version,
                existing_initdb_timeline_id,
                ctx,
            )
            .await?
        }
        CreateTimelineParams::Branch(CreateTimelineParamsBranch {
            new_timeline_id,
            ancestor_timeline_id,
            mut ancestor_start_lsn,
        }) => {
            let ancestor_timeline = self
                .get_timeline(ancestor_timeline_id, false)
                .context("Cannot branch off the timeline that's not present in pageserver")?;

            // instead of waiting around, just deny the request because ancestor is not yet
            // ready for other purposes either.
            if !ancestor_timeline.is_active() {
                return Err(CreateTimelineError::AncestorNotActive);
            }

            if ancestor_timeline.is_archived() == Some(true) {
                info!("tried to branch archived timeline");
                return Err(CreateTimelineError::AncestorArchived);
            }

            if let Some(lsn) = ancestor_start_lsn.as_mut() {
                // The requested branch point is replaced by its aligned value.
                *lsn = lsn.align();

                let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
                if ancestor_ancestor_lsn > *lsn {
                    // can we safely just branch from the ancestor instead?
                    return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
                        "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
                        lsn,
                        ancestor_timeline_id,
                        ancestor_ancestor_lsn,
                    )));
                }

                // Wait for the WAL to arrive and be processed on the parent branch up
                // to the requested branch point. The repository code itself doesn't
                // require it, but if we start to receive WAL on the new timeline,
                // decoding the new WAL might need to look up previous pages, relation
                // sizes etc. and that would get confused if the previous page versions
                // are not in the repository yet.
                ancestor_timeline
                    .wait_lsn(
                        *lsn,
                        timeline::WaitLsnWaiter::Tenant,
                        timeline::WaitLsnTimeout::Default,
                        ctx,
                    )
                    .await
                    .map_err(|e| match e {
                        e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => {
                            CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
                        }
                        WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
                    })?;
            }

            self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx)
                .await?
        }
        CreateTimelineParams::ImportPgdata(params) => {
            self.create_timeline_import_pgdata(params, ctx).await?
        }
    };

    // At this point we have dropped our guard on [`Self::timelines_creating`], and
    // the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must
    // not send a success to the caller until it is. The same applies to idempotent retries.
    //
    // TODO: the timeline is already visible in [`Self::timelines`]; a caller could incorrectly
    // assume that, because they can see the timeline via API, that the creation is done and
    // that it is durable. Ideally, we would keep the timeline hidden (in [`Self::timelines_creating`])
    // until it is durable, e.g., by extending the time we hold the creation guard. This also
    // interacts with UninitializedTimeline and is generally a bit tricky.
    //
    // To re-emphasize: the only correct way to create a timeline is to repeat calling the
    // creation API until it returns success. Only then is durability guaranteed.
    info!(creation_result=%result.discriminant(), "waiting for timeline to be durable");
    result
        .timeline()
        .remote_client
        .wait_completion()
        .await
        .map_err(|e| match e {
            WaitCompletionError::NotInitialized(
                e, // If the queue is already stopped, it's a shutdown error.
            ) if e.is_stopping() => CreateTimelineError::ShuttingDown,
            WaitCompletionError::NotInitialized(_) => {
                // This is a bug: we should never try to wait for uploads before initializing the timeline
                debug_assert!(false);
                CreateTimelineError::Other(anyhow::anyhow!("timeline not initialized"))
            }
            WaitCompletionError::UploadQueueShutDownOrStopped => {
                CreateTimelineError::ShuttingDown
            }
        })?;

    // The creating task is responsible for activating the timeline.
    // We do this after `wait_completion()` so that we don't spin up tasks that start
    // doing stuff before the IndexPart is durable in S3, which is done by the previous section.
    let activated_timeline = match result {
        CreateTimelineResult::Created(timeline) => {
            timeline.activate(
                self.clone(),
                broker_client,
                None,
                &ctx.with_scope_timeline(&timeline),
            );
            timeline
        }
        CreateTimelineResult::Idempotent(timeline) => {
            info!(
                "request was deemed idempotent, activation will be done by the creating task"
            );
            timeline
        }
        CreateTimelineResult::ImportSpawned(timeline) => {
            info!(
                "import task spawned, timeline will become visible and activated once the import is done"
            );
            timeline
        }
    };

    Ok(activated_timeline)
}

/// The returned [`Arc`] is NOT in the [`TenantShard::timelines`] map until the import
/// completes in the background. A DIFFERENT [`Arc`] will be inserted into the
/// [`TenantShard::timelines`] map when the import completes.
/// We only return an [`Arc`] here so the API handler can create a [`pageserver_api::models::TimelineInfo`]
/// for the response.
async fn create_timeline_import_pgdata(
    self: &Arc,
    params: CreateTimelineParamsImportPgdata,
    ctx: &RequestContext,
) -> Result {
    let CreateTimelineParamsImportPgdata {
        new_timeline_id,
        location,
        idempotency_key,
    } = params;

    let started_at = chrono::Utc::now().naive_utc();

    //
    // There's probably a simpler way to upload an index part, but, remote_timeline_client
    // is the canonical way we do it.
    // - create an empty timeline in-memory
    // - use its remote_timeline_client to do the upload
    // - dispose of the uninit timeline
    // - keep the creation guard alive

    let timeline_create_guard = match self
        .start_creating_timeline(
            new_timeline_id,
            CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata {
                idempotency_key: idempotency_key.clone(),
            }),
        )
        .await?
    {
        StartCreatingTimelineResult::CreateGuard(guard) => guard,
        StartCreatingTimelineResult::Idempotent(timeline) => {
            return Ok(CreateTimelineResult::Idempotent(timeline));
        }
    };

    let (mut uninit_timeline, timeline_ctx) = {
        let this = &self;
        let initdb_lsn = Lsn(0);
        async move {
            let new_metadata = TimelineMetadata::new(
                // Initialize disk_consistent LSN to 0, The caller must import some data to
                // make it valid, before calling finish_creation()
                Lsn(0),
                None,
                None,
                Lsn(0),
                initdb_lsn,
                initdb_lsn,
                PgMajorVersion::PG15,
            );
            this.prepare_new_timeline(
                new_timeline_id,
                &new_metadata,
                timeline_create_guard,
                initdb_lsn,
                None,
                None,
                None,
                ctx,
            )
            .await
        }
    }
    .await?;

    let in_progress = import_pgdata::index_part_format::InProgress {
        idempotency_key,
        location,
        started_at,
    };
    let index_part = import_pgdata::index_part_format::Root::V1(
        import_pgdata::index_part_format::V1::InProgress(in_progress),
    );
    uninit_timeline
        .raw_timeline()
        .unwrap()
        .remote_client
        .schedule_index_upload_for_import_pgdata_state_update(Some(index_part.clone()))?;

    // wait_completion happens in caller

    let (timeline, timeline_create_guard) = uninit_timeline.finish_creation_myself();

    // The import task gets its own gate; the guard entered here is moved into the
    // spawned task together with the creation guard.
    let import_task_gate = Gate::default();
    let import_task_guard = import_task_gate.enter().unwrap();

    let import_task_handle = tokio::spawn(self.clone().create_timeline_import_pgdata_task(
        timeline.clone(),
        index_part,
        timeline_create_guard,
        import_task_guard,
        timeline_ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
    ));

    let prev = self.timelines_importing.lock().unwrap().insert(
        timeline.timeline_id,
        Arc::new(ImportingTimeline {
            timeline: timeline.clone(),
            import_task_handle,
            import_task_gate,
            delete_progress: TimelineDeleteProgress::default(),
        }),
    );

    // Idempotency is enforced higher up the stack
    assert!(prev.is_none());

    // NB: the timeline doesn't exist in self.timelines at this point
    Ok(CreateTimelineResult::ImportSpawned(timeline))
}

/// Finalize the import of a timeline on this shard by marking it complete in
/// the index part. If the import task hasn't finished yet, returns an error.
///
/// This method is idempotent. If the import was finalized once, the next call
/// will be a no-op.
pub(crate) async fn finalize_importing_timeline(
    &self,
    timeline_id: TimelineId,
) -> Result<(), FinalizeTimelineImportError> {
    // The map lock is only held inside this block; it is not held across the awaits below.
    let timeline = {
        let locked = self.timelines_importing.lock().unwrap();
        match locked.get(&timeline_id) {
            Some(importing_timeline) => {
                if !importing_timeline.import_task_handle.is_finished() {
                    return Err(FinalizeTimelineImportError::ImportTaskStillRunning);
                }

                importing_timeline.timeline.clone()
            }
            None => {
                // No entry for this ID: treated as success.
                return Ok(());
            }
        }
    };

    // Any error from scheduling or waiting for the index upload is reported as ShuttingDown.
    timeline
        .remote_client
        .schedule_index_upload_for_import_pgdata_finalize()
        .map_err(|_err| FinalizeTimelineImportError::ShuttingDown)?;
    timeline
        .remote_client
        .wait_completion()
        .await
        .map_err(|_err| FinalizeTimelineImportError::ShuttingDown)?;

    // Only drop the bookkeeping entry once the upload has completed.
    self.timelines_importing
        .lock()
        .unwrap()
        .remove(&timeline_id);

    Ok(())
}

/// Body of the spawned import task: runs the import and logs start, exit and failure.
///
/// Errors are logged, not returned. `_import_task_guard` is held for the lifetime of
/// the task and otherwise unused.
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))]
async fn create_timeline_import_pgdata_task(
    self: Arc,
    timeline: Arc,
    index_part: import_pgdata::index_part_format::Root,
    timeline_create_guard: TimelineCreateGuard,
    _import_task_guard: GateGuard,
    ctx: RequestContext,
) {
    debug_assert_current_span_has_tenant_and_timeline_id();
    info!("starting");
    scopeguard::defer! {info!("exiting")};

    let res = self
        .create_timeline_import_pgdata_task_impl(
            timeline,
            index_part,
            timeline_create_guard,
            ctx,
        )
        .await;
    if let Err(err) = &res {
        error!(?err, "task failed");
        // TODO sleep & retry, sensitive to tenant shutdown
        // TODO: allow timeline deletion requests => should cancel the task
    }
}

/// Runs `import_pgdata::doit` for `timeline` with the tenant's cancellation token.
///
/// `_timeline_create_guard` is kept alive until this function returns and is
/// otherwise unused.
async fn create_timeline_import_pgdata_task_impl(
    self: Arc,
    timeline: Arc,
    index_part: import_pgdata::index_part_format::Root,
    _timeline_create_guard: TimelineCreateGuard,
    ctx: RequestContext,
) -> Result<(), anyhow::Error> {
    info!("importing pgdata");
    let ctx = ctx.with_scope_timeline(&timeline);
    import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone())
        .await
        .context("import")?;
    info!("import done - waiting for activation");

    anyhow::Ok(())
}

/// Deletes the given timeline by delegating to [`DeleteTimelineFlow::run`].
pub(crate) async fn delete_timeline(
    self: Arc,
    timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
    DeleteTimelineFlow::run(&self, timeline_id).await?;

    Ok(())
}

/// perform one garbage collection iteration, removing old data files from disk.
/// this function is periodically called by gc task.
/// also it can be explicitly requested through page server api 'do_gc' command.
///
/// `target_timeline_id` specifies the timeline to GC, or None for all.
///
/// The `horizon` and `pitr` parameters determine how much WAL history needs to be retained.
/// Also known as the retention period, or the GC cutoff point. `horizon` specifies
/// the amount of history, as LSN difference from current latest LSN on each timeline.
/// `pitr` specifies the same as a time difference from the current time. The effective
/// GC cutoff point is determined conservatively by either `horizon` and `pitr`, whichever
/// requires more history to be retained.
///
/// Returns a default (empty) `GcResult` without doing any work when the tenant is
/// stopping, when the location may not delete layers, when GC is blocked by the LSN
/// lease deadline, or when `gc_block` refuses to start. Returns `GcError::NotActive`
/// for a tenant that is neither stopping nor active.
//
pub(crate) async fn gc_iteration(
    &self,
    target_timeline_id: Option,
    horizon: u64,
    pitr: Duration,
    cancel: &CancellationToken,
    ctx: &RequestContext,
) -> Result {
    // Don't start doing work during shutdown
    if let TenantState::Stopping { .. } = self.current_state() {
        return Ok(GcResult::default());
    }

    // there is a global allowed_error for this
    if !self.is_active() {
        return Err(GcError::NotActive);
    }

    {
        let conf = self.tenant_conf.load();

        // If we may not delete layers, then simply skip GC. Even though a tenant
        // in AttachedMulti state could do GC and just enqueue the blocked deletions,
        // the only advantage to doing it is to perhaps shrink the LayerMap metadata
        // a bit sooner than we would achieve by waiting for AttachedSingle status.
        if !conf.location.may_delete_layers_hint() {
            info!("Skipping GC in location state {:?}", conf.location);
            return Ok(GcResult::default());
        }

        if conf.is_gc_blocked_by_lsn_lease_deadline() {
            info!("Skipping GC because lsn lease deadline is not reached");
            return Ok(GcResult::default());
        }
    }

    // The guard is held until the internal iteration below has finished.
    let _guard = match self.gc_block.start().await {
        Ok(guard) => guard,
        Err(reasons) => {
            info!("Skipping GC: {reasons}");
            return Ok(GcResult::default());
        }
    };

    self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
        .await
}

/// Performs one compaction iteration. Called periodically from the compaction loop. Returns
/// whether another compaction is needed, if we still have pending work or if we yield for
/// immediate L0 compaction.
///
/// Compaction can also be explicitly requested for a timeline via the HTTP API.
async fn compaction_iteration(
    self: &Arc,
    cancel: &CancellationToken,
    ctx: &RequestContext,
) -> Result {
    // Don't compact inactive tenants.
    if !self.is_active() {
        return Ok(CompactionOutcome::Skipped);
    }

    // Don't compact tenants that can't upload layers. We don't check `may_delete_layers_hint`,
    // since we need to compact L0 even in AttachedMulti to bound read amplification.
    let location = self.tenant_conf.load().location;
    if !location.may_upload_layers_hint() {
        info!("skipping compaction in location state {location:?}");
        return Ok(CompactionOutcome::Skipped);
    }

    // Don't compact if the circuit breaker is tripped.
    if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
        info!("skipping compaction due to previous failures");
        return Ok(CompactionOutcome::Skipped);
    }

    // Collect all timelines to compact, along with offload instructions and L0 counts.
    let mut compact: Vec> = Vec::new();
    let mut offload: HashSet = HashSet::new();
    let mut l0_counts: HashMap = HashMap::new();

    {
        let offload_enabled = self.get_timeline_offloading_enabled();
        let timelines = self.timelines.lock().unwrap();
        for (&timeline_id, timeline) in timelines.iter() {
            // Skip inactive timelines.
            if !timeline.is_active() {
                continue;
            }

            // Schedule the timeline for compaction.
            compact.push(timeline.clone());

            // Schedule the timeline for offloading if eligible.
            // A timeline that any other timeline in the map names as its ancestor is
            // not eligible.
            let can_offload = offload_enabled
                && timeline.can_offload().0
                && !timelines
                    .iter()
                    .any(|(_, tli)| tli.get_ancestor_timeline_id() == Some(timeline_id));
            if can_offload {
                offload.insert(timeline_id);
            }
        }
    } // release timelines lock

    for timeline in &compact {
        // Collect L0 counts. Can't await while holding lock above.
        if let Ok(lm) = timeline
            .layers
            .read(LayerManagerLockHolder::Compaction)
            .await
            .layer_map()
        {
            l0_counts.insert(timeline.timeline_id, lm.level0_deltas().len());
        }
    }

    // Pass 1: L0 compaction across all timelines, in order of L0 count. We prioritize this to
    // bound read amplification.
    //
    // TODO: this may spin on one or more ingest-heavy timelines, starving out image/GC
    // compaction and offloading. We leave that as a potential problem to solve later. Consider
    // splitting L0 and image/GC compaction to separate background jobs.
    if self.get_compaction_l0_first() {
        let compaction_threshold = self.get_compaction_threshold();
        // Timelines without a recorded L0 count are treated as having 0; the result is
        // ordered by descending L0 count.
        let compact_l0 = compact
            .iter()
            .map(|tli| (tli, l0_counts.get(&tli.timeline_id).copied().unwrap_or(0)))
            .filter(|&(_, l0)| l0 >= compaction_threshold)
            .sorted_by_key(|&(_, l0)| l0)
            .rev()
            .map(|(tli, _)| tli.clone())
            .collect_vec();

        let mut has_pending_l0 = false;
        for timeline in compact_l0 {
            let ctx = &ctx.with_scope_timeline(&timeline);
            // NB: don't set CompactFlags::YieldForL0, since this is an L0-only compaction pass.
            let outcome = timeline
                .compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx)
                .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
                .await
                .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?;
            match outcome {
                CompactionOutcome::Done => {}
                CompactionOutcome::Skipped => {}
                CompactionOutcome::Pending => has_pending_l0 = true,
                CompactionOutcome::YieldForL0 => has_pending_l0 = true,
            }
        }
        if has_pending_l0 {
            return Ok(CompactionOutcome::YieldForL0); // do another pass
        }
    }

    // Pass 2: image compaction and timeline offloading. If any timelines have accumulated more
    // L0 layers, they may also be compacted here. Image compaction will yield if there is
    // pending L0 compaction on any tenant timeline.
    //
    // TODO: consider ordering timelines by some priority, e.g. time since last full compaction,
    // amount of L1 delta debt or garbage, offload-eligible timelines first, etc.
    let mut has_pending = false;
    for timeline in compact {
        // Re-check: the timeline may have stopped being active since it was collected.
        if !timeline.is_active() {
            continue;
        }
        let ctx = &ctx.with_scope_timeline(&timeline);

        // Yield for L0 if the separate L0 pass is enabled (otherwise there's no point).
        let mut flags = EnumSet::default();
        if self.get_compaction_l0_first() {
            flags |= CompactFlags::YieldForL0;
        }

        let mut outcome = timeline
            .compact(cancel, flags, ctx)
            .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
            .await
            .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?;

        // If we're done compacting, check the scheduled GC compaction queue for more work.
        if outcome == CompactionOutcome::Done {
            let queue = {
                let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
                guard
                    .entry(timeline.timeline_id)
                    .or_insert_with(|| Arc::new(GcCompactionQueue::new()))
                    .clone()
            };
            // NOTE(review): the flag key is spelled "comapction". Presumably it has to
            // match the key configured in the feature-flag backend, so confirm there
            // before "fixing" the spelling here.
            let gc_compaction_strategy = self
                .feature_resolver
                .evaluate_multivariate("gc-comapction-strategy")
                .ok();
            let span = if let Some(gc_compaction_strategy) = gc_compaction_strategy {
                info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id, strategy = %gc_compaction_strategy)
            } else {
                info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id)
            };
            // NOTE(review): unlike the `compact()` calls above, an error from the queue
            // is propagated without going through `maybe_trip_compaction_breaker` —
            // confirm this is intentional.
            outcome = queue
                .iteration(cancel, ctx, &self.gc_block, &timeline)
                .instrument(span)
                .await?;
        }

        // If we're done compacting, offload the timeline if requested.
        if outcome == CompactionOutcome::Done && offload.contains(&timeline.timeline_id) {
            pausable_failpoint!("before-timeline-auto-offload");
            offload_timeline(self, &timeline)
                .instrument(info_span!("offload_timeline", timeline_id = %timeline.timeline_id))
                .await
                .or_else(|err| match err {
                    // Ignore this, we likely raced with unarchival.
                    OffloadError::NotArchived => Ok(()),
                    OffloadError::AlreadyInProgress => Ok(()),
                    OffloadError::Cancelled => Err(CompactionError::new_cancelled()),
                    // don't break the anyhow chain
                    OffloadError::Other(err) => Err(CompactionError::Other(err)),
                })?;
        }

        match outcome {
            CompactionOutcome::Done => {}
            CompactionOutcome::Skipped => {}
            CompactionOutcome::Pending => has_pending = true,
            // This mostly makes sense when the L0-only pass above is enabled, since there's
            // otherwise no guarantee that we'll start with the timeline that has high L0.
            CompactionOutcome::YieldForL0 => return Ok(CompactionOutcome::YieldForL0),
        }
    }

    // Success! Untrip the breaker if necessary.
    self.compaction_circuit_breaker
        .lock()
        .unwrap()
        .success(&CIRCUIT_BREAKERS_UNBROKEN);

    match has_pending {
        true => Ok(CompactionOutcome::Pending),
        false => Ok(CompactionOutcome::Done),
    }
}

/// Trips the compaction circuit breaker if appropriate.
///
/// Errors for which `is_cancel()` is true never trip the breaker.
pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) {
    if err.is_cancel() {
        return;
    }
    self.compaction_circuit_breaker
        .lock()
        .unwrap()
        .fail(&CIRCUIT_BREAKERS_BROKEN, err);
}

/// Cancel scheduled compaction tasks
///
/// Does nothing if there is no compaction queue for `timeline_id`.
pub(crate) fn cancel_scheduled_compaction(&self, timeline_id: TimelineId) {
    let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
    if let Some(q) = guard.get_mut(&timeline_id) {
        q.cancel_scheduled();
    }
}

/// Describes the compaction jobs in the queue of the given timeline.
///
/// The running job (if any) is listed first, followed by the remaining jobs. Returns
/// an empty vector if there is no queue for `timeline_id`.
pub(crate) fn get_scheduled_compaction_tasks(
    &self,
    timeline_id: TimelineId,
) -> Vec {
    // Only hold the map lock while taking the snapshot of the queue.
    let res = {
        let guard = self.scheduled_compaction_tasks.lock().unwrap();
        guard.get(&timeline_id).map(|q| q.remaining_jobs())
    };
    let Some((running, remaining)) = res else {
        return Vec::new();
    };
    let mut result = Vec::new();
    if let Some((id, running)) = running {
        result.extend(running.into_compact_info_resp(id, true));
    }
    for (id, job) in remaining {
        result.extend(job.into_compact_info_resp(id, false));
    }
    result
}

/// Schedule a compaction task for a timeline.
pub(crate) async fn schedule_compaction( &self, timeline_id: TimelineId, options: CompactOptions, ) -> anyhow::Result> { let (tx, rx) = tokio::sync::oneshot::channel(); let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); let q = guard .entry(timeline_id) .or_insert_with(|| Arc::new(GcCompactionQueue::new())); q.schedule_manual_compaction(options, Some(tx)); Ok(rx) } /// Performs periodic housekeeping, via the tenant housekeeping background task. async fn housekeeping(&self) { // Call through to all timelines to freeze ephemeral layers as needed. This usually happens // during ingest, but we don't want idle timelines to hold open layers for too long. // // We don't do this if the tenant can't upload layers (i.e. it's in stale attachment mode). // We don't run compaction in this case either, and don't want to keep flushing tiny L0 // layers that won't be compacted down. if self.tenant_conf.load().location.may_upload_layers_hint() { let timelines = self .timelines .lock() .unwrap() .values() .filter(|tli| tli.is_active()) .cloned() .collect_vec(); for timeline in timelines { // Include a span with the timeline ID. The parent span already has the tenant ID. let span = info_span!("maybe_freeze_ephemeral_layer", timeline_id = %timeline.timeline_id); timeline .maybe_freeze_ephemeral_layer() .instrument(span) .await; } } // Shut down walredo if idle. const WALREDO_IDLE_TIMEOUT: Duration = Duration::from_secs(180); if let Some(ref walredo_mgr) = self.walredo_mgr { walredo_mgr.maybe_quiesce(WALREDO_IDLE_TIMEOUT); } // Update the feature resolver with the latest tenant-spcific data. 
self.feature_resolver.refresh_properties_and_flags(self); } pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool { let timelines = self.timelines.lock().unwrap(); !timelines .iter() .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(timeline_id)) } pub fn current_state(&self) -> TenantState { self.state.borrow().clone() } pub fn is_active(&self) -> bool { self.current_state() == TenantState::Active } pub fn generation(&self) -> Generation { self.generation } pub(crate) fn wal_redo_manager_status(&self) -> Option { self.walredo_mgr.as_ref().and_then(|mgr| mgr.status()) } /// Changes tenant status to active, unless shutdown was already requested. /// /// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup /// to delay background jobs. Background jobs can be started right away when None is given. fn activate( self: &Arc, broker_client: BrokerClientChannel, background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, ) { span::debug_assert_current_span_has_tenant_id(); let mut activating = false; self.state.send_modify(|current_state| { use pageserver_api::models::ActivatingFrom; match &*current_state { TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => { panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {current_state:?}"); } TenantState::Attaching => { *current_state = TenantState::Activating(ActivatingFrom::Attaching); } } debug!(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), "Activating tenant"); activating = true; // Continue outside the closure. We need to grab timelines.lock() // and we plan to turn it into a tokio::sync::Mutex in a future patch. 
}); if activating { let timelines_accessor = self.timelines.lock().unwrap(); let timelines_offloaded_accessor = self.timelines_offloaded.lock().unwrap(); let timelines_to_activate = timelines_accessor .values() .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping())); // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. tasks::start_background_loops(self, background_jobs_can_start); let mut activated_timelines = 0; for timeline in timelines_to_activate { timeline.activate( self.clone(), broker_client.clone(), background_jobs_can_start, &ctx.with_scope_timeline(timeline), ); activated_timelines += 1; } let tid = self.tenant_shard_id.tenant_id.to_string(); let shard_id = self.tenant_shard_id.shard_slug().to_string(); let offloaded_timeline_count = timelines_offloaded_accessor.len(); TENANT_OFFLOADED_TIMELINES .with_label_values(&[&tid, &shard_id]) .set(offloaded_timeline_count as u64); self.state.send_modify(move |current_state| { assert!( matches!(current_state, TenantState::Activating(_)), "set_stopping and set_broken wait for us to leave Activating state", ); *current_state = TenantState::Active; let elapsed = self.constructed_at.elapsed(); let total_timelines = timelines_accessor.len(); // log a lot of stuff, because some tenants sometimes suffer from user-visible // times to activate. see https://github.com/neondatabase/neon/issues/4025 info!( since_creation_millis = elapsed.as_millis(), tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), activated_timelines, total_timelines, post_state = <&'static str>::from(&*current_state), "activation attempt finished" ); TENANT.activation.observe(elapsed.as_secs_f64()); }); } } /// Shutdown the tenant and join all of the spawned tasks. 
///
/// The method caters for all use-cases:
/// - pageserver shutdown (freeze_and_flush == true)
/// - detach + ignore (freeze_and_flush == false)
///
/// This will attempt to shutdown even if tenant is broken.
///
/// `shutdown_progress` is a [`completion::Barrier`] for the shutdown initiated by this call.
/// If the tenant is already shutting down, we return a clone of the first shutdown call's
/// `Barrier` as an `Err`. This not-first caller can use the returned barrier to join with
/// the ongoing shutdown.
///
/// Order of operations, as implemented below: transition to `Stopping` via
/// `set_stopping`, shut down all regular and importing timelines concurrently,
/// flush the deletion queue (only for `ShutdownMode::Reload`), cancel the tenant's
/// cancellation token, wait for the tenant's `task_mgr` tasks, shut down walredo,
/// close the gate, and finally remove the tenant metrics.
///
/// NOTE(review): `freeze_and_flush` above is not a parameter of this function;
/// presumably it refers to the flushing vs. non-flushing `shutdown_mode` — confirm.
async fn shutdown(
    &self,
    shutdown_progress: completion::Barrier,
    shutdown_mode: timeline::ShutdownMode,
) -> Result<(), completion::Barrier> {
    span::debug_assert_current_span_has_tenant_id();

    // Set tenant (and its timelines) to Stopping state.
    //
    // Since we can only transition into Stopping state after activation is complete,
    // run it in a JoinSet so all tenants have a chance to stop before we get SIGKILLed.
    //
    // Transitioning tenants to Stopping state has a couple of non-obvious side effects:
    // 1. Lock out any new requests to the tenants.
    // 2. Signal cancellation to WAL receivers (we wait on it below).
    // 3. Signal cancellation for other tenant background loops.
    // 4. ???
    //
    // The waiting for the cancellation is not done uniformly.
    // We certainly wait for WAL receivers to shut down.
    // That is necessary so that no new data comes in before the freeze_and_flush.
    // But the tenant background loops are joined-on in our caller.
    // It's messed up.
    // we just ignore the failure to stop

    // If we're still attaching, fire the cancellation token early to drop out: this
    // will prevent us flushing, but ensures timely shutdown if some I/O during attach
    // is very slow.
    let shutdown_mode = if matches!(self.current_state(), TenantState::Attaching) {
        self.cancel.cancel();

        // Having fired our cancellation token, do not try and flush timelines: their cancellation tokens
        // are children of ours, so their flush loops will have shut down already
        timeline::ShutdownMode::Hard
    } else {
        shutdown_mode
    };

    match self.set_stopping(shutdown_progress).await {
        Ok(()) => {}
        Err(SetStoppingError::Broken) => {
            // assume that this is acceptable
        }
        Err(SetStoppingError::AlreadyStopping(other)) => {
            // give caller the option to wait for this shutdown
            info!("Tenant::shutdown: AlreadyStopping");
            return Err(other);
        }
    };

    // One task per timeline so that all timelines shut down concurrently.
    let mut js = tokio::task::JoinSet::new();
    {
        let timelines = self.timelines.lock().unwrap();
        timelines.values().for_each(|timeline| {
            let timeline = Arc::clone(timeline);
            let timeline_id = timeline.timeline_id;
            let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode);
            js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await });
        });
    }
    {
        let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
        timelines_offloaded.values().for_each(|timeline| {
            timeline.defuse_for_tenant_drop();
        });
    }
    {
        // `drain()` empties the importing map; each importing timeline is moved
        // into its own shutdown task.
        let mut timelines_importing = self.timelines_importing.lock().unwrap();
        timelines_importing
            .drain()
            .for_each(|(timeline_id, importing_timeline)| {
                let span = tracing::info_span!("importing_timeline_shutdown", %timeline_id);
                js.spawn(async move { importing_timeline.shutdown().instrument(span).await });
            });
    }
    // test_long_timeline_create_then_tenant_delete is leaning on this message
    tracing::info!("Waiting for timelines...");
    while let Some(res) = js.join_next().await {
        match res {
            Ok(()) => {}
            Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
            Err(je) if je.is_panic() => { /* logged already */ }
            Err(je) => warn!("unexpected JoinError: {je:?}"),
        }
    }

    if let ShutdownMode::Reload = shutdown_mode {
        tracing::info!("Flushing deletion queue");
        if let Err(e) = self.deletion_queue_client.flush().await {
            match e {
                DeletionQueueError::ShuttingDown => {
                    // This is the only error we expect for now. In the future, if more error
                    // variants are added, we should handle them here.
                }
            }
        }
    }

    // We cancel the Tenant's cancellation token _after_ the timelines have all shut down. This permits
    // them to continue to do work during their shutdown methods, e.g. flushing data.
    tracing::debug!("Cancelling CancellationToken");
    self.cancel.cancel();

    // shutdown all tenant and timeline tasks: gc, compaction, page service
    // No new tasks will be started for this tenant because it's in `Stopping` state.
    //
    // this will additionally shutdown and await all timeline tasks.
    tracing::debug!("Waiting for tasks...");
    task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await;

    if let Some(walredo_mgr) = self.walredo_mgr.as_ref() {
        walredo_mgr.shutdown().await;
    }

    // Wait for any in-flight operations to complete
    self.gate.close().await;

    remove_tenant_metrics(&self.tenant_shard_id);

    Ok(())
}

/// Change tenant status to Stopping, to mark that it is being shut down.
///
/// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
///
/// Returns `Err(SetStoppingError::Broken)` if the tenant is `Broken`, and
/// `Err(SetStoppingError::AlreadyStopping(barrier))` if another caller already
/// registered its progress barrier. On success, every non-broken timeline has
/// been set to `TimelineState::Stopping`.
///
/// This function is not cancel-safe!
async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> {
    let mut rx = self.state.subscribe();

    // cannot stop before we're done activating, so wait out until we're done activating
    rx.wait_for(|state| match state {
        TenantState::Activating(_) | TenantState::Attaching => {
            info!("waiting for {state} to turn Active|Broken|Stopping");
            false
        }
        TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
    })
    .await
    .expect("cannot drop self.state while on a &self method");

    // we now know we're done activating, let's see whether this task is the winner to transition into Stopping
    // `err` carries the failure reason out of the closure; the closure's bool
    // return only tells the watch channel whether the state was modified.
    let mut err = None;
    let stopping = self.state.send_if_modified(|current_state| match current_state {
        TenantState::Activating(_) | TenantState::Attaching => {
            unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
        }
        TenantState::Active => {
            // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
            // are created after the transition to Stopping. That's harmless, as the Timelines
            // won't be accessible to anyone afterwards, because the Tenant is in Stopping state.
            *current_state = TenantState::Stopping { progress: Some(progress) };
            // Continue stopping outside the closure. We need to grab timelines.lock()
            // and we plan to turn it into a tokio::sync::Mutex in a future patch.
            true
        }
        TenantState::Stopping { progress: None } => {
            // An attach was cancelled, and the attach transitioned the tenant from Attaching to
            // Stopping(None) to let us know it exited. Register our progress and continue.
            *current_state = TenantState::Stopping { progress: Some(progress) };
            true
        }
        TenantState::Broken { reason, .. } => {
            info!(
                "Cannot set tenant to Stopping state, it is in Broken state due to: {reason}"
            );
            err = Some(SetStoppingError::Broken);
            false
        }
        TenantState::Stopping { progress: Some(progress) } => {
            info!("Tenant is already in Stopping state");
            err = Some(SetStoppingError::AlreadyStopping(progress.clone()));
            false
        }
    });
    match (stopping, err) {
        (true, None) => {} // continue
        (false, Some(err)) => return Err(err),
        (true, Some(_)) => unreachable!(
            "send_if_modified closure must error out if not transitioning to Stopping"
        ),
        (false, None) => unreachable!(
            "send_if_modified closure must return true if transitioning to Stopping"
        ),
    }

    // Propagate Stopping to the timelines; broken timelines keep their state.
    let timelines_accessor = self.timelines.lock().unwrap();
    let not_broken_timelines = timelines_accessor
        .values()
        .filter(|timeline| !timeline.is_broken());
    for timeline in not_broken_timelines {
        timeline.set_state(TimelineState::Stopping);
    }
    Ok(())
}

/// Method for tenant::mgr to transition us into Broken state in case of a late failure in
/// `remove_tenant_from_memory`
///
/// This function waits for the tenant to become active if it isn't already, before transitioning it into Broken state.
///
/// In tests, we also use this to set tenants to Broken state on purpose.
pub(crate) async fn set_broken(&self, reason: String) {
    let mut rx = self.state.subscribe();

    // The load & attach routines own the tenant state until it has reached `Active`.
    // So, wait until it's done.
    rx.wait_for(|state| match state {
        TenantState::Activating(_) | TenantState::Attaching => {
            info!(
                "waiting for {} to turn Active|Broken|Stopping",
                <&'static str>::from(state)
            );
            false
        }
        TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
    })
    .await
    .expect("cannot drop self.state while on a &self method");

    // we now know we're done activating, let's see whether this task is the winner to transition into Broken
    self.set_broken_no_wait(reason)
}

/// Transition the tenant into `Broken` without first waiting for activation to finish.
///
/// Panics (via `unreachable!`) if the tenant is still `Attaching`/`Activating`, or if
/// it is `Active` in a build without the `testing` feature. If the tenant is already
/// `Broken`, only a warning is logged and the existing state is kept.
pub(crate) fn set_broken_no_wait(&self, reason: impl Display) {
    let reason = reason.to_string();
    self.state.send_modify(|current_state| {
        match *current_state {
            TenantState::Activating(_) | TenantState::Attaching => {
                unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
            }
            TenantState::Active => {
                if cfg!(feature = "testing") {
                    warn!("Changing Active tenant to Broken state, reason: {}", reason);
                    *current_state = TenantState::broken_from_reason(reason);
                } else {
                    unreachable!("not allowed to call set_broken on Active tenants in non-testing builds")
                }
            }
            TenantState::Broken { .. } => {
                warn!("Tenant is already in Broken state");
            }
            // This is the only "expected" path, any other path is a bug.
            TenantState::Stopping { .. } => {
                warn!(
                    "Marking Stopping tenant as Broken state, reason: {}",
                    reason
                );
                *current_state = TenantState::broken_from_reason(reason);
            }
        }
    });
}

/// Returns a new receiver subscribed to this tenant's state watch channel.
// NOTE(review): the return type's generic argument appears to have been stripped
// in this copy of the file (presumably `watch::Receiver<TenantState>`) — confirm
// against the canonical source.
pub fn subscribe_for_state_updates(&self) -> watch::Receiver {
    self.state.subscribe()
}

/// The activate_now semaphore is initialized with zero units. As soon as
/// we add a unit, waiters will be able to acquire a unit and proceed.
pub(crate) fn activate_now(&self) { self.activate_now_sem.add_permits(1); } pub(crate) async fn wait_to_become_active( &self, timeout: Duration, ) -> Result<(), GetActiveTenantError> { let mut receiver = self.state.subscribe(); loop { let current_state = receiver.borrow_and_update().clone(); match current_state { TenantState::Attaching | TenantState::Activating(_) => { // in these states, there's a chance that we can reach ::Active self.activate_now(); match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await { Ok(r) => { r.map_err( |_e: tokio::sync::watch::error::RecvError| // Tenant existed but was dropped: report it as non-existent GetActiveTenantError::NotFound(GetTenantError::ShardNotFound(self.tenant_shard_id)) )? } Err(TimeoutCancellableError::Cancelled) => { return Err(GetActiveTenantError::Cancelled); } Err(TimeoutCancellableError::Timeout) => { return Err(GetActiveTenantError::WaitForActiveTimeout { latest_state: Some(self.current_state()), wait_time: timeout, }); } } } TenantState::Active => { return Ok(()); } TenantState::Broken { reason, .. } => { // This is fatal, and reported distinctly from the general case of "will never be active" because // it's logically a 500 to external API users (broken is always a bug). return Err(GetActiveTenantError::Broken(reason)); } TenantState::Stopping { .. } => { // There's no chance the tenant can transition back into ::Active return Err(GetActiveTenantError::WillNotBecomeActive(current_state)); } } } } pub(crate) fn get_attach_mode(&self) -> AttachmentMode { self.tenant_conf.load().location.attach_mode } /// For API access: generate a LocationConfig equivalent to the one that would be used to /// create a Tenant in the same state. Do not use this in hot paths: it's for relatively /// rare external API calls, like a reconciliation at startup. 
pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
    let attached_tenant_conf = self.tenant_conf.load();

    // A loaded tenant is always attached, so only the attached modes can occur here.
    let location_config_mode = match attached_tenant_conf.location.attach_mode {
        AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
        AttachmentMode::Multi => models::LocationConfigMode::AttachedMulti,
        AttachmentMode::Stale => models::LocationConfigMode::AttachedStale,
    };

    models::LocationConfig {
        mode: location_config_mode,
        generation: self.generation.into(),
        secondary_conf: None,
        shard_number: self.shard_identity.number.0,
        shard_count: self.shard_identity.count.literal(),
        shard_stripe_size: self.shard_identity.stripe_size.0,
        tenant_conf: attached_tenant_conf.tenant_conf.clone(),
    }
}

/// Returns the ID of this tenant shard.
pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
    &self.tenant_shard_id
}

/// Returns a copy of this shard's identity.
pub(crate) fn get_shard_identity(&self) -> ShardIdentity {
    self.shard_identity
}

/// Returns the stripe size recorded in this shard's identity.
pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize {
    self.shard_identity.stripe_size
}

/// Returns the generation this tenant shard was constructed with.
pub(crate) fn get_generation(&self) -> Generation {
    self.generation
}

/// This function partially shuts down the tenant (it shuts down the Timelines) and is fallible,
/// and can leave the tenant in a bad state if it fails. The caller is responsible for
/// resetting this tenant to a valid state if we fail.
///
/// For every regular and offloaded timeline: upload a fresh parent index (regular
/// timelines only), shut down the remote client, download the persisted index, refuse
/// to continue if an import is still in progress, and upload that index for each child
/// shard. Finally the tenant manifest is uploaded for each child shard.
// NOTE(review): the element type of `child_shards` appears to have been stripped in
// this copy of the file (`&Vec,`) — restore it from the canonical source.
pub(crate) async fn split_prepare(
    &self,
    child_shards: &Vec,
) -> anyhow::Result<()> {
    // Snapshot both maps so that no std mutex is held across the awaits below.
    let (timelines, offloaded) = {
        let timelines = self.timelines.lock().unwrap();
        let offloaded = self.timelines_offloaded.lock().unwrap();
        (timelines.clone(), offloaded.clone())
    };
    let timelines_iter = timelines
        .values()
        .map(TimelineOrOffloadedArcRef::<'_>::from)
        .chain(
            offloaded
                .values()
                .map(TimelineOrOffloadedArcRef::<'_>::from),
        );
    for timeline in timelines_iter {
        // We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels
        // to ensure that they do not start a split if currently in the process of doing these.

        let timeline_id = timeline.timeline_id();

        if let TimelineOrOffloadedArcRef::Timeline(timeline) = timeline {
            // Upload an index from the parent: this is partly to provide freshness for the
            // child tenants that will copy it, and partly for general ease-of-debugging: there will
            // always be a parent shard index in the same generation as we wrote the child shard index.
            tracing::info!(%timeline_id, "Uploading index");
            timeline
                .remote_client
                .schedule_index_upload_for_file_changes()?;
            timeline.remote_client.wait_completion().await?;
        }

        let remote_client = match timeline {
            TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.remote_client.clone(),
            TimelineOrOffloadedArcRef::Offloaded(offloaded) => {
                let remote_client = self
                    .build_timeline_client(offloaded.timeline_id, self.remote_storage.clone());
                Arc::new(remote_client)
            }
            TimelineOrOffloadedArcRef::Importing(_) => {
                unreachable!("Importing timelines are not included in the iterator")
            }
        };

        // Shut down the timeline's remote client: this means that the indices we write
        // for child shards will not be invalidated by the parent shard deleting layers.
        tracing::info!(%timeline_id, "Shutting down remote storage client");
        remote_client.shutdown().await;

        // Download methods can still be used after shutdown, as they don't flow through the remote client's
        // queue. In principle the RemoteTimelineClient could provide this without downloading it, but this
        // operation is rare, so it's simpler to just download it (and robustly guarantees that the index
        // we use here really is the remotely persistent one).
        tracing::info!(%timeline_id, "Downloading index_part from parent");
        let result = remote_client
            .download_index_file(&self.cancel)
            .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))
            .await?;
        let index_part = match result {
            MaybeDeletedIndexPart::Deleted(_) => {
                anyhow::bail!("Timeline deletion happened concurrently with split")
            }
            MaybeDeletedIndexPart::IndexPart(p) => p,
        };

        // A shard split may not take place while a timeline import is on-going
        // for the tenant. Timeline imports run as part of each tenant shard
        // and rely on the sharding scheme to split the work among pageservers.
        // If we were to split in the middle of this process, we would have to
        // either ensure that it's driven to completion on the old shard set
        // or transfer it to the new shard set. It's technically possible, but complex.
        match index_part.import_pgdata {
            Some(ref import) if !import.is_done() => {
                anyhow::bail!(
                    "Cannot split due to import with idempotency key: {:?}",
                    import.idempotency_key()
                );
            }
            Some(_) | None => {
                // fallthrough
            }
        }

        for child_shard in child_shards {
            tracing::info!(%timeline_id, "Uploading index_part for child {}", child_shard.to_index());
            upload_index_part(
                &self.remote_storage,
                child_shard,
                &timeline_id,
                self.generation,
                &index_part,
                &self.cancel,
            )
            .await?;
        }
    }

    let tenant_manifest = self.build_tenant_manifest();
    for child_shard in child_shards {
        tracing::info!(
            "Uploading tenant manifest for child {}",
            child_shard.to_index()
        );
        upload_tenant_manifest(
            &self.remote_storage,
            child_shard,
            self.generation,
            &tenant_manifest,
            &self.cancel,
        )
        .await?;
    }

    Ok(())
}

/// Aggregate size figures over all timelines currently in `self.timelines`.
///
/// Resident and remote physical sizes are summed; the logical size is the maximum
/// over timelines. `max_logical_size_per_shard` is that maximum divided by the shard
/// count, rounded up. All inputs are read from metric gauges.
pub(crate) fn get_sizes(&self) -> TopTenantShardItem {
    let mut result = TopTenantShardItem {
        id: self.tenant_shard_id,
        resident_size: 0,
        physical_size: 0,
        max_logical_size: 0,
        max_logical_size_per_shard: 0,
    };

    for timeline in self.timelines.lock().unwrap().values() {
        result.resident_size += timeline.metrics.resident_physical_size_gauge.get();

        result.physical_size += timeline
            .remote_client
            .metrics
            .remote_physical_size_gauge
            .get();
        result.max_logical_size = std::cmp::max(
            result.max_logical_size,
            timeline.metrics.current_logical_size_gauge.get(),
        );
    }

    result.max_logical_size_per_shard = result
        .max_logical_size
        .div_ceil(self.tenant_shard_id.shard_count.count() as u64);

    result
}
}

/// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
/// perform a topological sort, so that the parent of each timeline comes
/// before the children.
/// E extracts the ancestor from T
/// This allows for T to be different. It can be TimelineMetadata, can be Timeline itself, etc.
///
/// Fails if any timeline names an ancestor that is not part of the input; each
/// such orphan is logged before the error is returned.
// NOTE(review): the generic parameter list and the generic arguments in this
// signature (and in the type of `later`) appear to have been stripped in this copy
// of the file — restore them from the canonical source.
fn tree_sort_timelines(
    timelines: HashMap,
    extractor: E,
) -> anyhow::Result>
where
    E: Fn(&T) -> Option,
{
    let mut result = Vec::with_capacity(timelines.len());

    // Timelines whose ancestor (if any) has already been emitted.
    let mut now = Vec::with_capacity(timelines.len());
    // (ancestor, children)
    let mut later: HashMap> =
        HashMap::with_capacity(timelines.len());

    for (timeline_id, value) in timelines {
        if let Some(ancestor_id) = extractor(&value) {
            let children = later.entry(ancestor_id).or_default();
            children.push((timeline_id, value));
        } else {
            now.push((timeline_id, value));
        }
    }

    while let Some((timeline_id, metadata)) = now.pop() {
        result.push((timeline_id, metadata));
        // All children of this can be loaded now
        if let Some(mut children) = later.remove(&timeline_id) {
            now.append(&mut children);
        }
    }

    // All timelines should be visited now. Unless there were timelines with missing ancestors.
    if !later.is_empty() {
        for (missing_id, orphan_ids) in later {
            for (orphan_id, _) in orphan_ids {
                error!(
                    "could not load timeline {orphan_id} because its ancestor timeline {missing_id} could not be loaded"
                );
            }
        }
        bail!("could not load tenant because some timelines are missing ancestors");
    }

    Ok(result)
}

// Configuration accessors.
//
// Each scalar getter returns the tenant-specific override if one is set and the
// pageserver-wide default otherwise. The getters read the single field they need
// through the loaded config guard instead of cloning the whole `TenantConfig`
// first; this matches how `get_lsn_lease_length_impl` reads its field through a
// reference.
impl TenantShard {
    /// Returns a copy of the tenant-specific configuration overrides.
    pub fn tenant_specific_overrides(&self) -> pageserver_api::models::TenantConfig {
        self.tenant_conf.load().tenant_conf.clone()
    }

    /// Returns the tenant-specific overrides merged with the pageserver defaults.
    pub fn effective_config(&self) -> pageserver_api::config::TenantConfigToml {
        self.tenant_specific_overrides()
            .merge(self.conf.default_tenant_conf.clone())
    }

    pub fn get_checkpoint_distance(&self) -> u64 {
        self.tenant_conf
            .load()
            .tenant_conf
            .checkpoint_distance
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
    }

    pub fn get_checkpoint_timeout(&self) -> Duration {
        self.tenant_conf
            .load()
            .tenant_conf
            .checkpoint_timeout
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
    }

    pub fn get_compaction_target_size(&self) -> u64 {
        self.tenant_conf
            .load()
            .tenant_conf
            .compaction_target_size
            .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
    }

    pub fn get_compaction_period(&self) -> Duration {
        self.tenant_conf
            .load()
            .tenant_conf
            .compaction_period
            .unwrap_or(self.conf.default_tenant_conf.compaction_period)
    }

    pub fn get_compaction_threshold(&self) -> usize {
        self.tenant_conf
            .load()
            .tenant_conf
            .compaction_threshold
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

    pub fn get_rel_size_v2_enabled(&self) -> bool {
        self.tenant_conf
            .load()
            .tenant_conf
            .rel_size_v2_enabled
            .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
    }

    pub fn get_compaction_upper_limit(&self) -> usize {
        self.tenant_conf
            .load()
            .tenant_conf
            .compaction_upper_limit
            .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit)
    }

    pub fn get_compaction_l0_first(&self) -> bool {
        self.tenant_conf
            .load()
            .tenant_conf
            .compaction_l0_first
            .unwrap_or(self.conf.default_tenant_conf.compaction_l0_first)
    }

    pub fn get_gc_horizon(&self) -> u64 {
        self.tenant_conf
            .load()
            .tenant_conf
            .gc_horizon
            .unwrap_or(self.conf.default_tenant_conf.gc_horizon)
    }

    pub fn get_gc_period(&self) -> Duration {
        self.tenant_conf
            .load()
            .tenant_conf
            .gc_period
            .unwrap_or(self.conf.default_tenant_conf.gc_period)
    }

    pub fn get_image_creation_threshold(&self) -> usize {
        self.tenant_conf
            .load()
            .tenant_conf
            .image_creation_threshold
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

    // HADRON
    // NOTE(review): the generic argument of the `Option` return type appears to have
    // been stripped in this copy of the file — restore it from the canonical source.
    pub fn get_image_creation_timeout(&self) -> Option {
        self.tenant_conf
            .load()
            .tenant_conf
            .image_layer_force_creation_period
            .or(self
                .conf
                .default_tenant_conf
                .image_layer_force_creation_period)
    }

    pub fn get_pitr_interval(&self) -> Duration {
        self.tenant_conf
            .load()
            .tenant_conf
            .pitr_interval
            .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
    }

    // NOTE(review): return type generic argument appears stripped here as well.
    pub fn get_min_resident_size_override(&self) -> Option {
        self.tenant_conf
            .load()
            .tenant_conf
            .min_resident_size_override
            .or(self.conf.default_tenant_conf.min_resident_size_override)
    }

    /// Returns the effective heatmap period, or `None` if it is zero.
    // NOTE(review): return type generic argument appears stripped here as well.
    pub fn get_heatmap_period(&self) -> Option {
        let heatmap_period = self
            .tenant_conf
            .load()
            .tenant_conf
            .heatmap_period
            .unwrap_or(self.conf.default_tenant_conf.heatmap_period);
        if heatmap_period.is_zero() {
            None
        } else {
            Some(heatmap_period)
        }
    }

    pub fn get_lsn_lease_length(&self) -> Duration {
        Self::get_lsn_lease_length_impl(self.conf, &self.tenant_conf.load().tenant_conf)
    }

    /// Same as [`Self::get_lsn_lease_length`], for callers that hold a config but no `TenantShard`.
    pub fn get_lsn_lease_length_impl(
        conf: &'static PageServerConf,
        tenant_conf: &pageserver_api::models::TenantConfig,
    ) -> Duration {
        tenant_conf
            .lsn_lease_length
            .unwrap_or(conf.default_tenant_conf.lsn_lease_length)
    }

    /// Whether timeline offloading is enabled for this tenant.
    ///
    /// The pageserver-level `timeline_offloading` switch, when set, wins over any
    /// tenant-level setting.
    pub fn get_timeline_offloading_enabled(&self) -> bool {
        if self.conf.timeline_offloading {
            return true;
        }
        self.tenant_conf
            .load()
            .tenant_conf
            .timeline_offloading
            .unwrap_or(self.conf.default_tenant_conf.timeline_offloading)
    }

    /// Generate an up-to-date TenantManifest based on the state of this Tenant.
    fn build_tenant_manifest(&self) -> TenantManifest {
        // Collect the offloaded timelines, and sort them for deterministic output.
        let offloaded_timelines = self
            .timelines_offloaded
            .lock()
            .unwrap()
            .values()
            .map(|tli| tli.manifest())
            .sorted_by_key(|m| m.timeline_id)
            .collect_vec();

        TenantManifest {
            version: LATEST_TENANT_MANIFEST_VERSION,
            stripe_size: Some(self.get_shard_stripe_size()),
            offloaded_timelines,
        }
    }

    /// Apply `update` to the tenant-specific config and notify the tenant and all
    /// of its timelines about the new config.
    ///
    /// `update` may be invoked more than once (it runs inside a read-copy-update
    /// retry loop). Returns the tenant-specific config as loaded after the update.
    // NOTE(review): the generic arguments of both `anyhow::Result` types appear to
    // have been stripped in this copy of the file — restore them from the canonical
    // source.
    pub fn update_tenant_config<
        F: Fn(
            pageserver_api::models::TenantConfig,
        ) -> anyhow::Result,
    >(
        &self,
        update: F,
    ) -> anyhow::Result {
        // Use read-copy-update in order to avoid overwriting the location config
        // state if this races with [`TenantShard::set_new_location_config`]. Note that
        // this race is not possible if both request types come from the storage
        // controller (as they should!) because an exclusive op lock is required
        // on the storage controller side.

        self.tenant_conf
            .try_rcu(|attached_conf| -> Result<_, anyhow::Error> {
                Ok(Arc::new(AttachedTenantConf {
                    tenant_conf: update(attached_conf.tenant_conf.clone())?,
                    location: attached_conf.location,
                    lsn_lease_deadline: attached_conf.lsn_lease_deadline,
                }))
            })?;

        let updated = self.tenant_conf.load();

        self.tenant_conf_updated(&updated.tenant_conf);
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
        let timelines = self.list_timelines();
        for timeline in timelines {
            timeline.tenant_conf_updated(&updated);
        }

        Ok(updated.tenant_conf.clone())
    }

    /// Replace the whole attached config (tenant config and location) and notify
    /// the tenant and all of its timelines.
    pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
        let new_tenant_conf = new_conf.tenant_conf.clone();

        self.tenant_conf.store(Arc::new(new_conf.clone()));

        self.tenant_conf_updated(&new_tenant_conf);
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
        let timelines = self.list_timelines();
        for timeline in timelines {
            timeline.tenant_conf_updated(&new_conf);
        }
    }

    /// Returns the pagestream throttle config: the tenant override if set,
    /// otherwise the pageserver default.
    fn get_pagestream_throttle_config(
        psconf: &'static PageServerConf,
        overrides: &pageserver_api::models::TenantConfig,
    ) -> throttle::Config {
        overrides
            .timeline_get_throttle
            .clone()
            // Only clone the default when there is no override.
            .unwrap_or_else(|| psconf.default_tenant_conf.timeline_get_throttle.clone())
    }

    /// Reconfigure the pagestream throttle from `new_conf`.
    pub(crate) fn tenant_conf_updated(&self, new_conf: &pageserver_api::models::TenantConfig) {
        let conf = Self::get_pagestream_throttle_config(self.conf, new_conf);
        self.pagestream_throttle.reconfigure(conf)
    }

    /// Helper function to create a new Timeline struct.
    ///
    /// The returned Timeline is in Loading state. The caller is responsible for
    /// initializing any on-disk state, and for inserting the Timeline to the 'timelines'
    /// map.
    ///
    /// `validate_ancestor == false` is used when a timeline is created for deletion
    /// and we might not have the ancestor present anymore which is fine for to be
    /// deleted timelines.
// Builds the in-memory `Timeline` together with a detached, timeline-scoped
// `RequestContext`. The initial timeline state depends on `cause`: `Load` yields
// `TimelineState::Loading` (after checking that the ancestor named in the metadata
// matches the `ancestor` argument), `Delete` yields `TimelineState::Stopping`.
//
// NOTE(review): several generic arguments in this signature appear to have been
// stripped in this copy of the file (the bare `Option`, `Option>` and `Arc`) —
// restore them from the canonical source.
#[allow(clippy::too_many_arguments)]
fn create_timeline_struct(
    &self,
    new_timeline_id: TimelineId,
    new_metadata: &TimelineMetadata,
    previous_heatmap: Option,
    ancestor: Option>,
    resources: TimelineResources,
    cause: CreateTimelineCause,
    create_idempotency: CreateTimelineIdempotency,
    gc_compaction_state: Option,
    rel_size_v2_status: Option,
    rel_size_migrated_at: Option,
    ctx: &RequestContext,
) -> anyhow::Result<(Arc, RequestContext)> {
    let state = match cause {
        CreateTimelineCause::Load => {
            // The ancestor recorded in the metadata must be exactly the one we were
            // handed (both absent, or both present with the same ID).
            let ancestor_id = new_metadata.ancestor_timeline();
            anyhow::ensure!(
                ancestor_id == ancestor.as_ref().map(|t| t.timeline_id),
                "Timeline's {new_timeline_id} ancestor {ancestor_id:?} was not found"
            );
            TimelineState::Loading
        }
        // No ancestor check on this path.
        CreateTimelineCause::Delete => TimelineState::Stopping,
    };

    let pg_version = new_metadata.pg_version();

    let timeline = Timeline::new(
        self.conf,
        Arc::clone(&self.tenant_conf),
        new_metadata,
        previous_heatmap,
        ancestor,
        new_timeline_id,
        self.tenant_shard_id,
        self.generation,
        self.shard_identity,
        self.walredo_mgr.clone(),
        resources,
        pg_version,
        state,
        self.attach_wal_lag_cooldown.clone(),
        create_idempotency,
        gc_compaction_state,
        rel_size_v2_status,
        rel_size_migrated_at,
        // The timeline's token is a child of the tenant's: cancelling the tenant
        // cancels the timeline too.
        self.cancel.child_token(),
    );

    let timeline_ctx = RequestContextBuilder::from(ctx)
        .scope(context::Scope::new_timeline(&timeline))
        .detached_child();

    Ok((timeline, timeline_ctx))
}

/// [`TenantShard::shutdown`] must be called before dropping the returned [`TenantShard`] object
/// to ensure proper cleanup of background tasks and metrics.
//
// Allow too_many_arguments because a constructor's argument list naturally grows with the
// number of attributes in the struct: breaking these out into a builder wouldn't be helpful.
// Besides building the struct, this spawns a tokio task that mirrors the tenant
// state into metrics for as long as the state channel's sender is alive.
//
// NOTE(review): the generic arguments of `walredo_mgr` (`Option>`) and
// `basebackup_cache` (`Arc`) appear to have been stripped in this copy of the file —
// restore them from the canonical source.
#[allow(clippy::too_many_arguments)]
fn new(
    state: TenantState,
    conf: &'static PageServerConf,
    attached_conf: AttachedTenantConf,
    shard_identity: ShardIdentity,
    walredo_mgr: Option>,
    tenant_shard_id: TenantShardId,
    remote_storage: GenericRemoteStorage,
    deletion_queue_client: DeletionQueueClient,
    l0_flush_global_state: L0FlushGlobalState,
    basebackup_cache: Arc,
    feature_resolver: FeatureResolver,
) -> TenantShard {
    assert!(!attached_conf.location.generation.is_none());

    let (state, mut rx) = watch::channel(state);

    tokio::spawn(async move {
        // reflect tenant state in metrics:
        // - global per tenant state: TENANT_STATE_METRIC
        // - "set" of broken tenants: BROKEN_TENANTS_SET
        //
        // set of broken tenants should not have zero counts so that it remains accessible for
        // alerting.

        let tid = tenant_shard_id.to_string();
        let shard_id = tenant_shard_id.shard_slug().to_string();
        let set_key = &[tid.as_str(), shard_id.as_str()][..];

        // Returns the metric label for `state` and whether the state is `Broken`.
        fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
            ([state.into()], matches!(state, TenantState::Broken { .. }))
        }

        let mut tuple = inspect_state(&rx.borrow_and_update());

        let is_broken = tuple.1;
        let mut counted_broken = if is_broken {
            // add the id to the set right away, there should not be any updates on the channel
            // after before tenant is removed, if ever
            BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
            true
        } else {
            false
        };

        loop {
            // The gauge for the current state is incremented while the tenant is in
            // that state and decremented again when the state changes or the
            // channel closes.
            let labels = &tuple.0;
            let current = TENANT_STATE_METRIC.with_label_values(labels);
            current.inc();

            if rx.changed().await.is_err() {
                // tenant has been dropped
                current.dec();
                drop(BROKEN_TENANTS_SET.remove_label_values(set_key));
                break;
            }

            current.dec();
            tuple = inspect_state(&rx.borrow_and_update());

            let is_broken = tuple.1;
            if is_broken && !counted_broken {
                counted_broken = true;
                // insert the tenant_id (back) into the set while avoiding needless counter
                // access
                BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
            }
        }
    });

    TenantShard {
        tenant_shard_id,
        shard_identity,
        generation: attached_conf.location.generation,
        conf,
        // using now here is good enough approximation to catch tenants with really long
        // activation times.
        constructed_at: Instant::now(),
        timelines: Mutex::new(HashMap::new()),
        timelines_creating: Mutex::new(HashSet::new()),
        timelines_offloaded: Mutex::new(HashMap::new()),
        timelines_importing: Mutex::new(HashMap::new()),
        remote_tenant_manifest: Default::default(),
        gc_cs: tokio::sync::Mutex::new(()),
        walredo_mgr,
        remote_storage,
        deletion_queue_client,
        state,
        cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
        cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
        eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
        compaction_circuit_breaker: std::sync::Mutex::new(CircuitBreaker::new(
            format!("compaction-{tenant_shard_id}"),
            5,
            // Compaction can be a very expensive operation, and might leak disk space. It also ought
            // to be infallible, as long as remote storage is available. So if it repeatedly fails,
            // use an extremely long backoff.
            Some(Duration::from_secs(3600 * 24)),
        )),
        l0_compaction_trigger: Arc::new(Notify::new()),
        scheduled_compaction_tasks: Mutex::new(Default::default()),
        // Starts with zero permits; `activate_now` adds them.
        activate_now_sem: tokio::sync::Semaphore::new(0),
        attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
        cancel: CancellationToken::default(),
        gate: Gate::default(),
        pagestream_throttle: Arc::new(throttle::Throttle::new(
            TenantShard::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf),
        )),
        pagestream_throttle_metrics: Arc::new(
            crate::metrics::tenant_throttling::Pagestream::new(&tenant_shard_id),
        ),
        tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
        ongoing_timeline_detach: std::sync::Mutex::default(),
        gc_block: Default::default(),
        l0_flush_global_state,
        basebackup_cache,
        feature_resolver: Arc::new(TenantFeatureResolver::new(
            feature_resolver,
            tenant_shard_id.tenant_id,
        )),
    }
}

/// Locate and load config
///
/// A missing config file is reported as `LoadConfigError::NotFound`; any other I/O
/// error while reading it is handed to `on_fatal_io_error`.
// NOTE(review): the generic arguments of the `Result` return type and of the
// `from_str::` turbofish below appear to have been stripped in this copy of the
// file — restore them from the canonical source.
pub(super) fn load_tenant_config(
    conf: &'static PageServerConf,
    tenant_shard_id: &TenantShardId,
) -> Result {
    let config_path = conf.tenant_location_config_path(tenant_shard_id);

    info!("loading tenant configuration from {config_path}");

    // load and parse file
    let config = fs::read_to_string(&config_path).map_err(|e| {
        match e.kind() {
            std::io::ErrorKind::NotFound => {
                // The config should almost always exist for a tenant directory:
                //  - When attaching a tenant, the config is the first thing we write
                //  - When detaching a tenant, we atomically move the directory to a tmp location
                //    before deleting contents.
                //
                // The very rare edge case that can result in a missing config is if we crash during attach
                // between creating directory and writing config. Callers should handle that as if the
                // directory didn't exist.

                LoadConfigError::NotFound(config_path)
            }
            _ => {
                // No IO errors except NotFound are acceptable here: other kinds of error indicate local storage or permissions issues
                // that we cannot cleanly recover
                crate::virtual_file::on_fatal_io_error(&e, "Reading tenant config file")
            }
        }
    })?;

    Ok(toml_edit::de::from_str::(&config)?)
}

/// Stores a tenant location config to disk.
///
/// NB: make sure to call `ShardIdentity::assert_equal` before persisting a new config, to avoid
/// changes to shard parameters that may result in data corruption.
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
pub(super) async fn persist_tenant_config(
    conf: &'static PageServerConf,
    tenant_shard_id: &TenantShardId,
    location_conf: &LocationConf,
) -> std::io::Result<()> {
    let config_path = conf.tenant_location_config_path(tenant_shard_id);

    Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await
}

/// Same as [`Self::persist_tenant_config`], but writes to an explicit `config_path`.
///
/// The content is written with `VirtualFile::crashsafe_overwrite`, using a temp file
/// next to `config_path`.
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
pub(super) async fn persist_tenant_config_at(
    tenant_shard_id: &TenantShardId,
    config_path: &Utf8Path,
    location_conf: &LocationConf,
) -> std::io::Result<()> {
    debug!("persisting tenantconf to {config_path}");

    // NOTE(review): this header literal presumably spans several lines in the
    // canonical file (each `#` starting its own TOML comment line, followed by a
    // newline before the serialized config); line breaks appear to have been
    // collapsed in this copy — verify before relying on the exact bytes.
    let mut conf_content = r#"# This file contains a specific per-tenant's config. # It is read in case of pageserver restart. "#
    .to_string();

    // Test hook: fail before anything is written to disk.
    fail::fail_point!("tenant-config-before-write", |_| {
        Err(std::io::Error::other("tenant-config-before-write"))
    });

    // Convert the config to a toml file.
    conf_content +=
        &toml_edit::ser::to_string_pretty(&location_conf).expect("Config serialization failed");

    let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX);

    let conf_content = conf_content.into_bytes();
    VirtualFile::crashsafe_overwrite(config_path.to_owned(), temp_path, conf_content).await
}

//
// How garbage collection works:
//
//                    +--bar------------->
//                   /
//             +----+-----foo---------------->
//            /
// ----main--+-------------------------->
//                \
//                 +-----baz-------->
//
//
// 1. Grab 'gc_cs' mutex to prevent new timelines from being created while Timeline's
//    `gc_infos` are being refreshed
// 2. Scan collected timelines, and on each timeline, make note of the
//    all the points where other timelines have been branched off.
//    We will refrain from removing page versions at those LSNs.
// 3. For each timeline, scan all layer files on the timeline.
//    Remove all files for which a newer file exists and which
//    don't cover any branch point LSNs.
//
// TODO:
// - if a relation has a non-incremental persistent layer on a child branch, then we
//   don't need to keep that in the parent anymore. But currently
//   we do.
//
// Returns the accumulated `GcResult` over all timelines that were processed, with
// `elapsed` set to the wall-clock time of the whole iteration. If `cancel` fires
// between timelines, the progress made so far is returned.
//
// NOTE(review): the generic arguments of `target_timeline_id: Option` and of the
// `Result` return type appear to have been stripped in this copy of the file —
// restore them from the canonical source.
async fn gc_iteration_internal(
    &self,
    target_timeline_id: Option,
    horizon: u64,
    pitr: Duration,
    cancel: &CancellationToken,
    ctx: &RequestContext,
) -> Result {
    let mut totals: GcResult = Default::default();
    let now = Instant::now();

    let gc_timelines = self
        .refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
        .await?;

    failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");

    // If there is nothing to GC, we don't want any messages in the INFO log.
    if !gc_timelines.is_empty() {
        info!("{} timelines need GC", gc_timelines.len());
    } else {
        debug!("{} timelines need GC", gc_timelines.len());
    }

    // Perform GC for each timeline.
    //
    // Note that we don't hold the `TenantShard::gc_cs` lock here because we don't want to delay the
    // branch creation task, which requires the GC lock. A GC iteration can run concurrently
    // with branch creation.
    //
    // See comments in [`TenantShard::branch_timeline`] for more information about why branch
    // creation task can run concurrently with timeline's GC iteration.
    for timeline in gc_timelines {
        if cancel.is_cancelled() {
            // We were requested to shut down. Stop and return with the progress we
            // made.
            break;
        }
        let result = match timeline.gc().await {
            Err(GcError::TimelineCancelled) => {
                if target_timeline_id.is_some() {
                    // If we were targeting this specific timeline, surface cancellation to caller
                    return Err(GcError::TimelineCancelled);
                } else {
                    // A timeline may be shutting down independently of the tenant's lifecycle: we should
                    // skip past this and proceed to try GC on other timelines.
                    continue;
                }
            }
            r => r?,
        };
        totals += result;
    }

    totals.elapsed = now.elapsed();
    Ok(totals)
}

/// Refreshes the Timeline::gc_info for all timelines, returning the
/// vector of timelines which have [`Timeline::get_last_record_lsn`] past
/// [`TenantShard::get_gc_horizon`].
///
/// This is usually executed as part of periodic gc, but can now be triggered more often.
// NOTE(review): part of the return type's generic arguments appears to have been
// stripped in this copy of the file (`Result>, GcError>`) — restore it from the
// canonical source.
pub(crate) async fn refresh_gc_info(
    &self,
    cancel: &CancellationToken,
    ctx: &RequestContext,
) -> Result>, GcError> {
    // since this method can now be called at different rates than the configured gc loop, it
    // might be that these configuration values get applied faster than what it was previously,
    // since these were only read from the gc task.
    let horizon = self.get_gc_horizon();
    let pitr = self.get_pitr_interval();

    // refresh all timelines
    let target_timeline_id = None;

    self.refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
        .await
}

/// Populate all Timelines' `GcInfo` with information about their children.
We do not set the /// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`] /// /// Subsequently, parent-child relationships are updated incrementally inside [`Timeline::new`] and [`Timeline::drop`]. fn initialize_gc_info( &self, timelines: &std::sync::MutexGuard>>, timelines_offloaded: &std::sync::MutexGuard>>, restrict_to_timeline: Option, ) { if restrict_to_timeline.is_none() { // This function must be called before activation: after activation timeline create/delete operations // might happen, and this function is not safe to run concurrently with those. assert!(!self.is_active()); } // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. let mut all_branchpoints: BTreeMap> = BTreeMap::new(); timelines.iter().for_each(|(timeline_id, timeline_entry)| { if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); ancestor_children.push(( timeline_entry.get_ancestor_lsn(), *timeline_id, MaybeOffloaded::No, )); } }); timelines_offloaded .iter() .for_each(|(timeline_id, timeline_entry)| { let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id else { return; }; let Some(retain_lsn) = timeline_entry.ancestor_retain_lsn else { return; }; let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); ancestor_children.push((retain_lsn, *timeline_id, MaybeOffloaded::Yes)); }); // The number of bytes we always keep, irrespective of PITR: this is a constant across timelines let horizon = self.get_gc_horizon(); // Populate each timeline's GcInfo with information about its child branches let timelines_to_write = if let Some(timeline_id) = restrict_to_timeline { itertools::Either::Left(timelines.get(&timeline_id).into_iter()) } else { itertools::Either::Right(timelines.values()) }; for timeline in timelines_to_write { let 
mut branchpoints: Vec<(Lsn, TimelineId, MaybeOffloaded)> = all_branchpoints .remove(&timeline.timeline_id) .unwrap_or_default(); branchpoints.sort_by_key(|b| b.0); let mut target = timeline.gc_info.write().unwrap(); target.retain_lsns = branchpoints; let space_cutoff = timeline .get_last_record_lsn() .checked_sub(horizon) .unwrap_or(Lsn(0)); target.cutoffs = GcCutoffs { space: space_cutoff, time: None, }; } } async fn refresh_gc_info_internal( &self, target_timeline_id: Option, horizon: u64, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result>, GcError> { // before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for // currently visible timelines. let timelines = self .timelines .lock() .unwrap() .values() .filter(|tl| match target_timeline_id.as_ref() { Some(target) => &tl.timeline_id == target, None => true, }) .cloned() .collect::>(); if target_timeline_id.is_some() && timelines.is_empty() { // We were to act on a particular timeline and it wasn't found return Err(GcError::TimelineNotFound); } let mut gc_cutoffs: HashMap = HashMap::with_capacity(timelines.len()); // Ensures all timelines use the same start time when computing the time cutoff. let now_ts_for_pitr_calc = SystemTime::now(); for timeline in timelines.iter() { let ctx = &ctx.with_scope_timeline(timeline); let cutoff = timeline .get_last_record_lsn() .checked_sub(horizon) .unwrap_or(Lsn(0)); let cutoffs = timeline .find_gc_cutoffs(now_ts_for_pitr_calc, cutoff, pitr, cancel, ctx) .await?; let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs); assert!(old.is_none()); } if !self.is_active() || self.cancel.is_cancelled() { return Err(GcError::TenantCancelled); } // grab mutex to prevent new timelines from being created here; avoid doing long operations // because that will stall branch creation. let gc_cs = self.gc_cs.lock().await; // Ok, we now know all the branch points. // Update the GC information for each timeline. 
let mut gc_timelines = Vec::with_capacity(timelines.len()); for timeline in timelines { // We filtered the timeline list above if let Some(target_timeline_id) = target_timeline_id { assert_eq!(target_timeline_id, timeline.timeline_id); } { let mut target = timeline.gc_info.write().unwrap(); // Cull any expired leases let now = SystemTime::now(); target.leases.retain(|_, lease| !lease.is_expired(&now)); timeline .metrics .valid_lsn_lease_count_gauge .set(target.leases.len() as u64); // Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) { target.within_ancestor_pitr = Some(timeline.get_ancestor_lsn()) >= ancestor_gc_cutoffs.time; } } // Update metrics that depend on GC state timeline .metrics .archival_size .set(if target.within_ancestor_pitr { timeline.metrics.current_logical_size_gauge.get() } else { 0 }); if let Some(time_cutoff) = target.cutoffs.time { timeline.metrics.pitr_history_size.set( timeline .get_last_record_lsn() .checked_sub(time_cutoff) .unwrap_or_default() .0, ); } // Apply the cutoffs we found to the Timeline's GcInfo. Why might we _not_ have cutoffs for a timeline? // - this timeline was created while we were finding cutoffs // - lsn for timestamp search fails for this timeline repeatedly if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) { let original_cutoffs = target.cutoffs.clone(); // GC cutoffs should never go back target.cutoffs = GcCutoffs { space: cutoffs.space.max(original_cutoffs.space), time: cutoffs.time.max(original_cutoffs.time), } } } gc_timelines.push(timeline); } drop(gc_cs); Ok(gc_timelines) } /// A substitute for `branch_timeline` for use in unit tests. /// The returned timeline will have state value `Active` to make various `anyhow::ensure!()` /// calls pass, but, we do not actually call `.activate()` under the hood. 
So, none of the /// timeline background tasks are launched, except the flush loop. #[cfg(test)] async fn branch_timeline_test( self: &Arc, src_timeline: &Arc, dst_id: TimelineId, ancestor_lsn: Option, ctx: &RequestContext, ) -> Result, CreateTimelineError> { let tl = self .branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, ctx) .await? .into_timeline_for_test(); tl.set_state(TimelineState::Active); Ok(tl) } /// Helper for unit tests to branch a timeline with some pre-loaded states. #[cfg(test)] #[allow(clippy::too_many_arguments)] pub async fn branch_timeline_test_with_layers( self: &Arc, src_timeline: &Arc, dst_id: TimelineId, ancestor_lsn: Option, ctx: &RequestContext, delta_layer_desc: Vec, image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, ) -> anyhow::Result> { use checks::check_valid_layermap; use itertools::Itertools; let tline = self .branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx) .await?; let ancestor_lsn = if let Some(ancestor_lsn) = ancestor_lsn { ancestor_lsn } else { tline.get_last_record_lsn() }; assert!(end_lsn >= ancestor_lsn); tline.force_advance_lsn(end_lsn); for deltas in delta_layer_desc { tline .force_create_delta_layer(deltas, Some(ancestor_lsn), ctx) .await?; } for (lsn, images) in image_layer_desc { tline .force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx) .await?; } let layer_names = tline .layers .read(LayerManagerLockHolder::Testing) .await .layer_map() .unwrap() .iter_historic_layers() .map(|layer| layer.layer_name()) .collect_vec(); if let Some(err) = check_valid_layermap(&layer_names) { bail!("invalid layermap: {err}"); } Ok(tline) } /// Branch an existing timeline. 
async fn branch_timeline(
    self: &Arc,
    src_timeline: &Arc,
    dst_id: TimelineId,
    start_lsn: Option,
    ctx: &RequestContext,
) -> Result {
    // Thin wrapper: all of the work happens in `branch_timeline_impl`.
    self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
        .await
}

/// Creates timeline `dst_id` as a branch of `src_timeline` at `start_lsn` (defaulting to
/// the source's last record LSN).
///
/// Holds the `gc_cs` lock for the whole operation so that the validated branch point
/// cannot be invalidated by a concurrent GC. Returns `CreateTimelineResult::Idempotent`
/// if an equivalent timeline already exists, `CreateTimelineError::AncestorLsn` if the
/// branch point is below a GC cutoff (and not covered by an LSN lease), and otherwise
/// `CreateTimelineResult::Created` after scheduling the initial metadata upload.
async fn branch_timeline_impl(
    self: &Arc,
    src_timeline: &Arc,
    dst_id: TimelineId,
    start_lsn: Option,
    ctx: &RequestContext,
) -> Result {
    let src_id = src_timeline.timeline_id;

    // We will validate our ancestor LSN in this function.  Acquire the GC lock so that
    // this check cannot race with GC, and the ancestor LSN is guaranteed to remain
    // valid while we are creating the branch.
    let _gc_cs = self.gc_cs.lock().await;

    // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
    let start_lsn = start_lsn.unwrap_or_else(|| {
        let lsn = src_timeline.get_last_record_lsn();
        info!("branching timeline {dst_id} from timeline {src_id} at last record LSN: {lsn}");
        lsn
    });

    // we have finally determined the ancestor_start_lsn, so we can claim exclusivity now
    let timeline_create_guard = match self
        .start_creating_timeline(
            dst_id,
            CreateTimelineIdempotency::Branch {
                ancestor_timeline_id: src_timeline.timeline_id,
                ancestor_start_lsn: start_lsn,
            },
        )
        .await?
    {
        StartCreatingTimelineResult::CreateGuard(guard) => guard,
        StartCreatingTimelineResult::Idempotent(timeline) => {
            return Ok(CreateTimelineResult::Idempotent(timeline));
        }
    };

    // Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR
    // horizon on the source timeline
    //
    // We check it against both the planned GC cutoff stored in 'gc_info',
    // and the 'latest_gc_cutoff' of the last GC that was performed.  The
    // planned GC cutoff in 'gc_info' is normally larger than
    // 'applied_gc_cutoff_lsn', but beware of corner cases like if you just
    // changed the GC settings for the tenant to make the PITR window
    // larger, but some of the data was already removed by an earlier GC
    // iteration.

    // check against last actual 'latest_gc_cutoff' first
    let applied_gc_cutoff_lsn = src_timeline.get_applied_gc_cutoff_lsn();
    {
        let gc_info = src_timeline.gc_info.read().unwrap();
        let planned_cutoff = gc_info.min_cutoff();
        if gc_info.lsn_covered_by_lease(start_lsn) {
            // A lease on `start_lsn` bypasses both cutoff checks.
            tracing::info!(
                "skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease",
                *applied_gc_cutoff_lsn
            );
        } else {
            src_timeline
                .check_lsn_is_in_scope(start_lsn, &applied_gc_cutoff_lsn)
                .context(format!(
                    "invalid branch start lsn: less than latest GC cutoff {}",
                    *applied_gc_cutoff_lsn,
                ))
                .map_err(CreateTimelineError::AncestorLsn)?;

            // and then the planned GC cutoff
            if start_lsn < planned_cutoff {
                return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
                    "invalid branch start lsn: less than planned GC cutoff {planned_cutoff}"
                )));
            }
        }
    }

    //
    // The branch point is valid, and we are still holding the 'gc_cs' lock
    // so that GC cannot advance the GC cutoff until we are finished.
    // Proceed with the branch creation.
    //

    // Determine prev-LSN for the new timeline. We can only determine it if
    // the timeline was branched at the current end of the source timeline.
    let RecordLsn {
        last: src_last,
        prev: src_prev,
    } = src_timeline.get_last_record_rlsn();
    let dst_prev = if src_last == start_lsn {
        Some(src_prev)
    } else {
        None
    };

    // Create the metadata file, noting the ancestor of the new timeline.
    // There is initially no data in it, but all the read-calls know to look
    // into the ancestor.
    let metadata = TimelineMetadata::new(
        start_lsn,
        dst_prev,
        Some(src_id),
        start_lsn,
        *src_timeline.applied_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
        src_timeline.initdb_lsn,
        src_timeline.pg_version,
    );

    let (rel_size_v2_status, rel_size_migrated_at) = src_timeline.get_rel_size_v2_status();

    // The new timeline's layer map is initialized at `start_lsn + 1`; the timeline
    // context returned alongside is not needed here.
    let (uninitialized_timeline, _timeline_ctx) = self
        .prepare_new_timeline(
            dst_id,
            &metadata,
            timeline_create_guard,
            start_lsn + 1,
            Some(Arc::clone(src_timeline)),
            Some(rel_size_v2_status),
            rel_size_migrated_at,
            ctx,
        )
        .await?;

    let new_timeline = uninitialized_timeline.finish_creation().await?;

    // Root timeline gets its layers during creation and uploads them along with the metadata.
    // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created.
    // We still need to upload its metadata eagerly: if other nodes `attach` the tenant and miss this timeline, their GC
    // could get incorrect information and remove more layers, than needed.
    // See also https://github.com/neondatabase/neon/issues/3865
    new_timeline
        .remote_client
        .schedule_index_upload_for_full_metadata_update(&metadata)
        .context("branch initial metadata upload")?;

    // Callers are responsible to wait for uploads to complete and for activating the timeline.

    Ok(CreateTimelineResult::Created(new_timeline))
}

/// For unit tests, make this visible so that other modules can directly create timelines
///
/// Wraps `bootstrap_timeline`, converting its error into `anyhow::Error` and its
/// result via `into_timeline_for_test`.
#[cfg(test)]
#[tracing::instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
pub(crate) async fn bootstrap_timeline_test(
    self: &Arc,
    timeline_id: TimelineId,
    pg_version: PgMajorVersion,
    load_existing_initdb: Option,
    ctx: &RequestContext,
) -> anyhow::Result> {
    self.bootstrap_timeline(timeline_id, pg_version, load_existing_initdb, ctx)
        .await
        .map_err(anyhow::Error::new)
        .map(|r| r.into_timeline_for_test())
}

/// Get exclusive access to the timeline ID for creation.
///
/// Timeline-creating code paths must use this function before making changes
/// to in-memory or persistent state.
///
/// The `state` parameter is a description of the timeline creation operation
/// we intend to perform.
/// If the timeline was already created in the meantime, we check whether this
/// request conflicts or is idempotent, based on `state`.
///
/// NOTE(review): the parameter described as `state` above is named `idempotency`
/// in the signature.
///
/// Outcomes:
/// - `CreateGuard`: the caller now owns creation of `new_timeline_id`.
/// - `Idempotent(existing)`: a timeline with an identical `CreateTimelineIdempotency` exists.
/// - `Err(Conflict)`: a timeline exists but is offloaded, has a different idempotency
///   value, or either side is `FailWithConflict`.
/// - `Err(AlreadyCreating)` / `Err(ShuttingDown)` / `Err(Other)`: mapped 1:1 from
///   `TimelineExclusionError`.
async fn start_creating_timeline(
    self: &Arc,
    new_timeline_id: TimelineId,
    idempotency: CreateTimelineIdempotency,
) -> Result {
    let allow_offloaded = false;
    match self.create_timeline_create_guard(new_timeline_id, idempotency, allow_offloaded) {
        Ok(create_guard) => {
            pausable_failpoint!("timeline-creation-after-uninit");
            Ok(StartCreatingTimelineResult::CreateGuard(create_guard))
        }
        Err(TimelineExclusionError::ShuttingDown) => Err(CreateTimelineError::ShuttingDown),
        Err(TimelineExclusionError::AlreadyCreating) => {
            // Creation is in progress, we cannot create it again, and we cannot
            // check if this request matches the existing one, so caller must try
            // again later.
            Err(CreateTimelineError::AlreadyCreating)
        }
        Err(TimelineExclusionError::Other(e)) => Err(CreateTimelineError::Other(e)),
        Err(TimelineExclusionError::AlreadyExists {
            existing: TimelineOrOffloaded::Offloaded(_existing),
            ..
        }) => {
            info!("timeline already exists but is offloaded");
            Err(CreateTimelineError::Conflict)
        }
        Err(TimelineExclusionError::AlreadyExists {
            existing: TimelineOrOffloaded::Importing(_existing),
            ..
        }) => {
            // If there's a timeline already importing, then we would hit
            // the [`TimelineExclusionError::AlreadyCreating`] branch above.
            unreachable!("Importing timelines hold the creation guard")
        }
        Err(TimelineExclusionError::AlreadyExists {
            existing: TimelineOrOffloaded::Timeline(existing),
            arg,
        }) => {
            {
                // Inner scope: shadow `existing` with its idempotency descriptor for the
                // comparison only; the outer `existing` is returned below.
                let existing = &existing.create_idempotency;
                let _span = info_span!("idempotency_check", ?existing, ?arg).entered();
                debug!("timeline already exists");

                match (existing, &arg) {
                    // FailWithConflict => no idempotency check
                    (CreateTimelineIdempotency::FailWithConflict, _)
                    | (_, CreateTimelineIdempotency::FailWithConflict) => {
                        warn!("timeline already exists, failing request");
                        return Err(CreateTimelineError::Conflict);
                    }
                    // Idempotent <=> CreateTimelineIdempotency is identical
                    (x, y) if x == y => {
                        info!(
                            "timeline already exists and idempotency matches, succeeding request"
                        );
                        // fallthrough
                    }
                    (_, _) => {
                        warn!("idempotency conflict, failing request");
                        return Err(CreateTimelineError::Conflict);
                    }
                }
            }

            Ok(StartCreatingTimelineResult::Idempotent(existing))
        }
    }
}

/// Compresses `pgdata_path` into a temporary zstd tarball under `timelines_path` and
/// uploads it to remote storage as the initdb archive for `timeline_id`.
///
/// The temporary archive is removed when this function returns (a removal failure is
/// only logged). A warning is logged if the archive exceeds 2 MiB. The upload goes
/// through `backoff::retry`; if that yields `None` the result is reported as
/// `TimeoutOrCancel::Cancel`.
async fn upload_initdb(
    &self,
    timelines_path: &Utf8PathBuf,
    pgdata_path: &Utf8PathBuf,
    timeline_id: &TimelineId,
) -> anyhow::Result<()> {
    let temp_path = timelines_path.join(format!(
        "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
    ));

    scopeguard::defer! {
        if let Err(e) = fs::remove_file(&temp_path) {
            error!("Failed to remove temporary initdb archive '{temp_path}': {e}");
        }
    }

    let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?;
    const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
    if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT {
        warn!(
            "compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."
        );
    }

    pausable_failpoint!("before-initdb-upload");

    backoff::retry(
        || async {
            // Each attempt needs its own handle to the archive file.
            self::remote_timeline_client::upload_initdb_dir(
                &self.remote_storage,
                &self.tenant_shard_id.tenant_id,
                timeline_id,
                pgdata_zstd.try_clone().await?,
                tar_zst_size,
                &self.cancel,
            )
            .await
        },
        |_| false,
        3,
        u32::MAX,
        "persist_initdb_tar_zst",
        &self.cancel,
    )
    .await
    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
    .and_then(|x| x)
}

/// - run initdb to init temporary instance and get bootstrap data
/// - after initialization completes, tar up the temp dir and upload it to S3.
///
/// When `load_existing_initdb` is `Some(id)`, initdb is not run: the archive for `id`
/// is downloaded and extracted instead (and first copied to this timeline's remote
/// path if `id` differs from `timeline_id`). Otherwise initdb is run locally and the
/// result is uploaded only on shard zero.
///
/// Returns `CreateTimelineResult::Idempotent` if an equivalent timeline already exists.
async fn bootstrap_timeline(
    self: &Arc,
    timeline_id: TimelineId,
    pg_version: PgMajorVersion,
    load_existing_initdb: Option,
    ctx: &RequestContext,
) -> Result {
    let timeline_create_guard = match self
        .start_creating_timeline(
            timeline_id,
            CreateTimelineIdempotency::Bootstrap { pg_version },
        )
        .await?
    {
        StartCreatingTimelineResult::CreateGuard(guard) => guard,
        StartCreatingTimelineResult::Idempotent(timeline) => {
            return Ok(CreateTimelineResult::Idempotent(timeline));
        }
    };

    // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
    // temporary directory for basebackup files for the given timeline.

    let timelines_path = self.conf.timelines_path(&self.tenant_shard_id);
    let pgdata_path = path_with_suffix_extension(
        timelines_path.join(format!("basebackup-{timeline_id}")),
        TEMP_FILE_SUFFIX,
    );

    // Remove whatever was left from the previous runs: safe because TimelineCreateGuard guarantees
    // we won't race with other creations or existent timelines with the same path.
    if pgdata_path.exists() {
        fs::remove_dir_all(&pgdata_path).with_context(|| {
            format!("Failed to remove already existing initdb directory: {pgdata_path}")
        })?;
        tracing::info!("removed previous attempt's temporary initdb directory '{pgdata_path}'");
    }

    // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
    let pgdata_path_deferred = pgdata_path.clone();
    scopeguard::defer! {
        if let Err(e) = fs::remove_dir_all(&pgdata_path_deferred).or_else(fs_ext::ignore_not_found) {
            // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call
            error!("Failed to remove temporary initdb directory '{pgdata_path_deferred}': {e}");
        } else {
            tracing::info!("removed temporary initdb directory '{pgdata_path_deferred}'");
        }
    }
    if let Some(existing_initdb_timeline_id) = load_existing_initdb {
        if existing_initdb_timeline_id != timeline_id {
            let source_path = &remote_initdb_archive_path(
                &self.tenant_shard_id.tenant_id,
                &existing_initdb_timeline_id,
            );
            let dest_path =
                &remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id);

            // if this fails, it will get retried by retried control plane requests
            self.remote_storage
                .copy_object(source_path, dest_path, &self.cancel)
                .await
                .context("copy initdb tar")?;
        }
        let (initdb_tar_zst_path, initdb_tar_zst) =
            self::remote_timeline_client::download_initdb_tar_zst(
                self.conf,
                &self.remote_storage,
                &self.tenant_shard_id,
                &existing_initdb_timeline_id,
                &self.cancel,
            )
            .await
            .context("download initdb tar")?;

        // The downloaded archive is only needed for extraction; remove it on scope exit.
        scopeguard::defer! {
            if let Err(e) = fs::remove_file(&initdb_tar_zst_path) {
                error!("Failed to remove temporary initdb archive '{initdb_tar_zst_path}': {e}");
            }
        }

        let buf_read =
            BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
        extract_zst_tarball(&pgdata_path, buf_read)
            .await
            .context("extract initdb tar")?;
    } else {
        // Init temporary repo to get bootstrap data, this creates a directory in the `pgdata_path` path
        run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel)
            .await
            .context("run initdb")?;

        // Upload the created data dir to S3
        if self.tenant_shard_id().is_shard_zero() {
            self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id)
                .await?;
        }
    }
    let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();

    // Import the contents of the data directory at the initial checkpoint
    // LSN, and any WAL after that.
    // Initdb lsn will be equal to last_record_lsn which will be set after import.
    // Because we know it upfront avoid having an option or dummy zero value by passing it to the metadata.
    let new_metadata = TimelineMetadata::new(
        Lsn(0),
        None,
        None,
        Lsn(0),
        pgdata_lsn,
        pgdata_lsn,
        pg_version,
    );
    let (mut raw_timeline, timeline_ctx) = self
        .prepare_new_timeline(
            timeline_id,
            &new_metadata,
            timeline_create_guard,
            pgdata_lsn,
            None,
            None,
            None,
            ctx,
        )
        .await?;

    let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id;
    raw_timeline
        .write(|unfinished_timeline| async move {
            import_datadir::import_timeline_from_postgres_datadir(
                &unfinished_timeline,
                &pgdata_path,
                pgdata_lsn,
                &timeline_ctx,
            )
            .await
            .with_context(|| {
                format!(
                    "Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}"
                )
            })?;

            fail::fail_point!("before-checkpoint-new-timeline", |_| {
                Err(CreateTimelineError::Other(anyhow::anyhow!(
                    "failpoint before-checkpoint-new-timeline"
                )))
            });

            Ok(())
        })
        .await?;

    // All done!
    let timeline = raw_timeline.finish_creation().await?;

    // Callers are responsible to wait for uploads to complete and for activating the timeline.

    Ok(CreateTimelineResult::Created(timeline))
}

/// Constructs a `RemoteTimelineClient` for `timeline_id` from this tenant's remote
/// storage handle, deletion queue client, config, shard ID, generation and current
/// location config.
fn build_timeline_remote_client(&self, timeline_id: TimelineId) -> RemoteTimelineClient {
    RemoteTimelineClient::new(
        self.remote_storage.clone(),
        self.deletion_queue_client.clone(),
        self.conf,
        self.tenant_shard_id,
        timeline_id,
        self.generation,
        &self.tenant_conf.load().location,
    )
}

/// Builds required resources for a new timeline.
fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
    let remote_client = self.build_timeline_remote_client(timeline_id);
    self.get_timeline_resources_for(remote_client)
}

/// Builds timeline resources for the given remote client.
///
/// Every other field is a clone of the corresponding tenant-level field, so the
/// timeline shares those handles with the tenant.
fn get_timeline_resources_for(&self, remote_client: RemoteTimelineClient) -> TimelineResources {
    TimelineResources {
        remote_client,
        pagestream_throttle: self.pagestream_throttle.clone(),
        pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
        l0_compaction_trigger: self.l0_compaction_trigger.clone(),
        l0_flush_global_state: self.l0_flush_global_state.clone(),
        basebackup_cache: self.basebackup_cache.clone(),
        feature_resolver: self.feature_resolver.clone(),
    }
}

/// Creates intermediate timeline structure and its files.
///
/// An empty layer map is initialized, and new data and WAL can be imported starting
/// at 'disk_consistent_lsn'. After any initial data has been imported, call
/// `finish_creation` to insert the Timeline into the timelines map.
// Order of operations: initialize the remote client's upload queue for an empty remote,
// build the in-memory timeline struct, initialize its empty layer map at `start_lsn`,
// then create the on-disk timeline directory. If directory creation fails, the
// directory is cleaned up via the create guard and the error is returned.
#[allow(clippy::too_many_arguments)]
async fn prepare_new_timeline<'a>(
    &'a self,
    new_timeline_id: TimelineId,
    new_metadata: &TimelineMetadata,
    create_guard: TimelineCreateGuard,
    start_lsn: Lsn,
    ancestor: Option>,
    rel_size_v2_status: Option,
    rel_size_migrated_at: Option,
    ctx: &RequestContext,
) -> anyhow::Result<(UninitializedTimeline<'a>, RequestContext)> {
    let tenant_shard_id = self.tenant_shard_id;

    let resources = self.build_timeline_resources(new_timeline_id);
    resources.remote_client.init_upload_queue_for_empty_remote(
        new_metadata,
        rel_size_v2_status.clone(),
        rel_size_migrated_at,
    )?;

    let (timeline_struct, timeline_ctx) = self
        .create_timeline_struct(
            new_timeline_id,
            new_metadata,
            None,
            ancestor,
            resources,
            CreateTimelineCause::Load,
            create_guard.idempotency.clone(),
            None,
            rel_size_v2_status,
            rel_size_migrated_at,
            ctx,
        )
        .context("Failed to create timeline data structure")?;

    timeline_struct.init_empty_layer_map(start_lsn);

    if let Err(e) = self
        .create_timeline_files(&create_guard.timeline_path)
        .await
    {
        error!(
            "Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}"
        );
        // Consumes the guard; the partially created directory must not be left behind.
        cleanup_timeline_directory(create_guard);
        return Err(e);
    }

    debug!(
        "Successfully created initial files for timeline {tenant_shard_id}/{new_timeline_id}"
    );

    // The guard travels with the uninitialized timeline so exclusivity is kept until
    // creation is finished or abandoned.
    Ok((
        UninitializedTimeline::new(
            self,
            new_timeline_id,
            Some((timeline_struct, create_guard)),
        ),
        timeline_ctx,
    ))
}

/// Creates the timeline directory at `timeline_path` via `crashsafe::create_dir`.
///
/// The `after-timeline-dir-creation` failpoint can inject a failure right after the
/// directory has been created.
async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
    crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;

    fail::fail_point!("after-timeline-dir-creation", |_| {
        anyhow::bail!("failpoint after-timeline-dir-creation");
    });

    Ok(())
}

/// Get a guard that provides exclusive access to the timeline directory, preventing
/// concurrent attempts to create the same timeline.
///
/// The `allow_offloaded` parameter controls whether to tolerate the existence of
/// offloaded timelines or not.
fn create_timeline_create_guard( self: &Arc, timeline_id: TimelineId, idempotency: CreateTimelineIdempotency, allow_offloaded: bool, ) -> Result { let tenant_shard_id = self.tenant_shard_id; let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id); let create_guard = TimelineCreateGuard::new( self, timeline_id, timeline_path.clone(), idempotency, allow_offloaded, )?; // At this stage, we have got exclusive access to in-memory state for this timeline ID // for creation. // A timeline directory should never exist on disk already: // - a previous failed creation would have cleaned up after itself // - a pageserver restart would clean up timeline directories that don't have valid remote state // // Therefore it is an unexpected internal error to encounter a timeline directory already existing here, // this error may indicate a bug in cleanup on failed creations. if timeline_path.exists() { return Err(TimelineExclusionError::Other(anyhow::anyhow!( "Timeline directory already exists! This is a bug." ))); } Ok(create_guard) } /// Gathers inputs from all of the timelines to produce a sizing model input. /// /// Future is cancellation safe. Only one calculation can be running at once per tenant. #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub async fn gather_size_inputs( &self, // `max_retention_period` overrides the cutoff that is used to calculate the size // (only if it is shorter than the real cutoff). max_retention_period: Option, cause: LogicalSizeCalculationCause, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { let logical_sizes_at_once = self .conf .concurrent_tenant_size_logical_size_queries .inner(); // TODO: Having a single mutex block concurrent reads is not great for performance. 
// // But the only case where we need to run multiple of these at once is when we // request a size for a tenant manually via API, while another background calculation // is in progress (which is not a common case). // // See more for on the issue #2748 condenced out of the initial PR review. let mut shared_cache = tokio::select! { locked = self.cached_logical_sizes.lock() => locked, _ = cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled), _ = self.cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled), }; size::gather_inputs( self, logical_sizes_at_once, max_retention_period, &mut shared_cache, cause, cancel, ctx, ) .await } /// Calculate synthetic tenant size and cache the result. /// This is periodically called by background worker. /// result is cached in tenant struct #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub async fn calculate_synthetic_size( &self, cause: LogicalSizeCalculationCause, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { let inputs = self.gather_size_inputs(None, cause, cancel, ctx).await?; let size = inputs.calculate(); self.set_cached_synthetic_size(size); Ok(size) } /// Cache given synthetic size and update the metric value pub fn set_cached_synthetic_size(&self, size: u64) { self.cached_synthetic_tenant_size .store(size, Ordering::Relaxed); // Only shard zero should be calculating synthetic sizes debug_assert!(self.shard_identity.is_shard_zero()); TENANT_SYNTHETIC_SIZE_METRIC .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()]) .unwrap() .set(size); } pub fn cached_synthetic_size(&self) -> u64 { self.cached_synthetic_tenant_size.load(Ordering::Relaxed) } /// Flush any in-progress layers, schedule uploads, and wait for uploads to complete. /// /// This function can take a long time: callers should wrap it in a timeout if calling /// from an external API handler. 
///
/// Cancel-safety: cancelling this function may leave I/O running, but such I/O is
/// still bounded by tenant/timeline shutdown.
#[tracing::instrument(skip_all)]
pub(crate) async fn flush_remote(&self) -> anyhow::Result<()> {
    // Snapshot the timeline map so the lock is not held across the awaits below.
    let timelines = self.timelines.lock().unwrap().clone();

    // Per-timeline work: flush in-memory data, then wait for the remote client's
    // queued operations to complete. `_gate` is held only to keep the gate entered
    // for as long as the task runs.
    async fn flush_timeline(_gate: GateGuard, timeline: Arc) -> anyhow::Result<()> {
        tracing::info!(timeline_id=%timeline.timeline_id, "Flushing...");
        timeline.freeze_and_flush().await?;
        tracing::info!(timeline_id=%timeline.timeline_id, "Waiting for uploads...");
        timeline.remote_client.wait_completion().await?;

        Ok(())
    }

    // We do not use a JoinSet for these tasks, because we don't want them to be
    // aborted when this function's future is cancelled: they should stay alive
    // holding their GateGuard until they complete, to ensure their I/Os complete
    // before Timeline shutdown completes.
    let mut results = FuturesUnordered::new();

    for (_timeline_id, timeline) in timelines {
        // Run each timeline's flush in a task holding the timeline's gate: this
        // means that if this function's future is cancelled, the Timeline shutdown
        // will still wait for any I/O in here to complete.
        let Ok(gate) = timeline.gate.enter() else {
            // The gate could not be entered: skip this timeline.
            continue;
        };
        let jh = tokio::task::spawn(async move { flush_timeline(gate, timeline).await });
        results.push(jh);
    }

    // NOTE(review): only the outer join result is inspected here. The inner
    // `anyhow::Result` returned by `flush_timeline` is dropped, so a failed flush or
    // upload wait is neither logged nor propagated and this function still returns
    // `Ok(())` — confirm that this best-effort behaviour is intended.
    while let Some(r) = results.next().await {
        if let Err(e) = r {
            if !e.is_cancelled() && !e.is_panic() {
                tracing::error!("unexpected join error: {e:?}");
            }
        }
    }

    // The flushes we did above were just writes, but the TenantShard might have had
    // pending deletions as well from recent compaction/gc: we want to flush those
    // as well.  This requires flushing the global delete queue.  This is cheap
    // because it's typically a no-op.
    match self.deletion_queue_client.flush_execute().await {
        Ok(_) => {}
        // A deletion queue that is shutting down is not treated as an error.
        Err(DeletionQueueError::ShuttingDown) => {}
    }

    Ok(())
}

/// Returns a clone of the currently loaded tenant configuration.
pub(crate) fn get_tenant_conf(&self) -> pageserver_api::models::TenantConfig {
    self.tenant_conf.load().tenant_conf.clone()
}

/// How much local storage would this tenant like to have?  It can cope with
/// less than this (via eviction and on-demand downloads), but this function enables
/// the TenantShard to advertise how much storage it would prefer to have to provide fast I/O
/// by keeping important things on local disk.
///
/// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
/// than they report here, due to layer eviction.  Tenants with many active branches may
/// actually use more than they report here.
///
/// Returns 0 when the tenant has no timelines.
pub(crate) fn local_storage_wanted(&self) -> u64 {
    let timelines = self.timelines.lock().unwrap();

    // Heuristic: we use the max() of the timelines' visible sizes, rather than the sum.  This
    // reflects the observation that on tenants with multiple large branches, typically only one
    // of them is used actively enough to occupy space on disk.
    timelines
        .values()
        .map(|t| t.metrics.visible_physical_size_gauge.get())
        .max()
        .unwrap_or(0)
}

/// HADRON
/// Return the visible size of all timelines in this tenant.
///
/// Unlike `local_storage_wanted`, this sums the per-timeline gauge values.
pub(crate) fn get_visible_size(&self) -> u64 {
    let timelines = self.timelines.lock().unwrap();
    timelines
        .values()
        .map(|t| t.metrics.visible_physical_size_gauge.get())
        .sum()
}

/// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant
/// manifest in `Self::remote_tenant_manifest`.
///
/// TODO: instead of requiring callers to remember to call `maybe_upload_tenant_manifest` after
/// changing any `TenantShard` state that's included in the manifest, consider making the manifest
/// the authoritative source of data with an API that automatically uploads on changes. Revisit
/// this when the manifest is more widely used and we have a better idea of the data model.
pub(crate) async fn maybe_upload_tenant_manifest(&self) -> Result<(), TenantManifestError> {
    // Multiple tasks may call this function concurrently after mutating the TenantShard runtime
    // state, affecting the manifest generated by `build_tenant_manifest`. We use an async mutex
    // to serialize these callers. `eq_ignoring_version` acts as a slightly inefficient but
    // simple coalescing mechanism.
    let mut guard = tokio::select! {
        guard = self.remote_tenant_manifest.lock() => guard,
        _ = self.cancel.cancelled() => return Err(TenantManifestError::Cancelled),
    };

    // Build a new manifest.
    let manifest = self.build_tenant_manifest();

    // Check if the manifest has changed. We ignore the version number here, to avoid
    // uploading every manifest on version number bumps.
    if let Some(old) = guard.as_ref() {
        if manifest.eq_ignoring_version(old) {
            return Ok(());
        }
    }

    // Update metrics
    // (this happens before the upload, so the gauge reflects the new manifest even if
    // the upload below fails).
    let tid = self.tenant_shard_id.to_string();
    let shard_id = self.tenant_shard_id.shard_slug().to_string();
    let set_key = &[tid.as_str(), shard_id.as_str()][..];
    TENANT_OFFLOADED_TIMELINES
        .with_label_values(set_key)
        .set(manifest.offloaded_timelines.len() as u64);

    // Upload the manifest. Remote storage does no retries internally, so retry here.
    match backoff::retry(
        || async {
            upload_tenant_manifest(
                &self.remote_storage,
                &self.tenant_shard_id,
                self.generation,
                &manifest,
                &self.cancel,
            )
            .await
        },
        // Errors are treated as permanent (no further retries) once the tenant is cancelled.
        |_| self.cancel.is_cancelled(),
        FAILED_UPLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        "uploading tenant manifest",
        &self.cancel,
    )
    .await
    {
        None => Err(TenantManifestError::Cancelled),
        // An upload error observed after cancellation is reported as `Cancelled`.
        Some(Err(_)) if self.cancel.is_cancelled() => Err(TenantManifestError::Cancelled),
        Some(Err(e)) => Err(TenantManifestError::RemoteStorage(e)),
        Some(Ok(_)) => {
            // Store the successfully uploaded manifest, so that future callers can avoid
            // re-uploading the same thing.
            *guard = Some(manifest);

            Ok(())
        }
    }
}
}

/// Create the cluster temporarily in 'initdbpath' directory inside the repository
/// to get bootstrap data for timeline initialization.
///
/// The run is serialized through `INIT_DB_SEMAPHORE`, and the time spent waiting for
/// the semaphore and running initdb is recorded in separate timers.
async fn run_initdb(
    conf: &'static PageServerConf,
    initdb_target_dir: &Utf8Path,
    pg_version: PgMajorVersion,
    cancel: &CancellationToken,
) -> Result<(), InitdbError> {
    let initdb_bin_path = conf
        .pg_bin_dir(pg_version)
        .map_err(InitdbError::Other)?
        .join("initdb");
    let initdb_lib_dir = conf.pg_lib_dir(pg_version).map_err(InitdbError::Other)?;
    info!(
        "running {} in {}, libdir: {}",
        initdb_bin_path, initdb_target_dir, initdb_lib_dir,
    );

    // The inner `_timer` is dropped at the end of this block, so it measures only the
    // semaphore acquisition; `_permit` lives until the end of the function.
    let _permit = {
        let _timer = INITDB_SEMAPHORE_ACQUISITION_TIME.start_timer();
        INIT_DB_SEMAPHORE.acquire().await
    };

    // Track the number of concurrently running initdb invocations; the decrement is
    // deferred so it runs on every exit path.
    CONCURRENT_INITDBS.inc();
    scopeguard::defer! {
        CONCURRENT_INITDBS.dec();
    }

    let _timer = INITDB_RUN_TIME.start_timer();
    let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
        superuser: &conf.superuser,
        locale: &conf.locale,
        initdb_bin: &initdb_bin_path,
        pg_version,
        library_search_path: &initdb_lib_dir,
        pgdata: initdb_target_dir,
    })
    .await
    .map_err(InitdbError::Inner);

    // This isn't true cancellation support, see above. Still return an error to
    // exercise the cancellation code path.
    // Note that this takes precedence over `res`, whether initdb succeeded or failed.
    if cancel.is_cancelled() {
        return Err(InitdbError::Cancelled);
    }

    res
}

/// Dump contents of a layer file to stdout.
///
/// The layer kind is chosen from the file's two-byte big-endian magic value; an
/// unknown magic value is an error.
pub async fn dump_layerfile_from_path(
    path: &Utf8Path,
    verbose: bool,
    ctx: &RequestContext,
) -> anyhow::Result<()> {
    use std::os::unix::fs::FileExt;

    // All layer files start with a two-byte "magic" value, to identify the kind of
    // file.
    let file = File::open(path)?;
    let mut header_buf = [0u8; 2];
    file.read_exact_at(&mut header_buf, 0)?;

    match u16::from_be_bytes(header_buf) {
        crate::IMAGE_FILE_MAGIC => {
            ImageLayer::new_for_path(path, file)?
                .dump(verbose, ctx)
                .await?
        }
        crate::DELTA_FILE_MAGIC => {
            DeltaLayer::new_for_path(path, file)?
                .dump(verbose, ctx)
                .await?
}
        magic => bail!("unrecognized magic identifier: {:?}", magic),
    }

    Ok(())
}

// Test-only helpers for building a `TenantShard` backed by a local-filesystem remote
// storage and a mock deletion queue.
#[cfg(test)]
pub(crate) mod harness {
    use bytes::{Bytes, BytesMut};
    use hex_literal::hex;
    use once_cell::sync::OnceCell;
    use pageserver_api::key::Key;
    use pageserver_api::models::ShardParameters;
    use pageserver_api::shard::ShardIndex;
    use utils::id::TenantId;
    use utils::logging;
    use wal_decoder::models::record::NeonWalRecord;

    use super::*;
    use crate::deletion_queue::mock::MockDeletionQueue;
    use crate::l0_flush::L0FlushConfig;
    use crate::walredo::apply_neon;

    // Fixed timeline IDs shared by the unit tests.
    pub const TIMELINE_ID: TimelineId =
        TimelineId::from_array(hex!("11223344556677881122334455667788"));
    pub const NEW_TIMELINE_ID: TimelineId =
        TimelineId::from_array(hex!("AA223344556677881122334455667788"));

    /// Convenience function to create a page image with given string as the only content
    ///
    /// The buffer is resized to exactly 64 bytes: shorter strings are zero-padded and
    /// longer ones are truncated.
    pub fn test_img(s: &str) -> Bytes {
        let mut buf = BytesMut::new();
        buf.extend_from_slice(s.as_bytes());
        buf.resize(64, 0);

        buf.freeze()
    }

    /// Everything needed to construct (and re-construct) a test tenant.
    pub struct TenantHarness {
        pub conf: &'static PageServerConf,
        pub tenant_conf: pageserver_api::models::TenantConfig,
        pub tenant_shard_id: TenantShardId,
        pub shard_identity: ShardIdentity,
        pub generation: Generation,
        pub shard: ShardIndex,
        pub remote_storage: GenericRemoteStorage,
        pub remote_fs_dir: Utf8PathBuf,
        pub deletion_queue: MockDeletionQueue,
    }

    // Ensures logging is initialized at most once per test process.
    static LOG_HANDLE: OnceCell<()> = OnceCell::new();

    /// Initializes test logging to stdout; subsequent calls are no-ops.
    pub(crate) fn setup_logging() {
        LOG_HANDLE.get_or_init(|| {
            logging::init(
                logging::LogFormat::Test,
                // enable it in case the tests exercise code paths that use
                // debug_assert_current_span_has_tenant_and_timeline_id
                logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
                logging::Output::Stdout,
            )
            .expect("Failed to init test logging");
        });
    }

    impl TenantHarness {
        /// Creates a harness with a fresh repo directory for `test_name`, a dummy
        /// pageserver config and a local-filesystem remote storage under the workdir.
        pub async fn create_custom(
            test_name: &'static str,
            tenant_conf: pageserver_api::models::TenantConfig,
            tenant_id: TenantId,
            shard_identity: ShardIdentity,
            generation: Generation,
        ) -> anyhow::Result {
            setup_logging();

            // Start from an empty directory; a failure to remove a previous one
            // (e.g. because it does not exist) is deliberately ignored.
            let repo_dir = PageServerConf::test_repo_dir(test_name);
            let _ = fs::remove_dir_all(&repo_dir);
            fs::create_dir_all(&repo_dir)?;

            let conf = PageServerConf::dummy_conf(repo_dir);
            // Make a static copy of the config. This can never be free'd, but that's
            // OK in a test.
            let conf: &'static PageServerConf = Box::leak(Box::new(conf));

            let shard = shard_identity.shard_index();
            let tenant_shard_id = TenantShardId {
                tenant_id,
                shard_number: shard.shard_number,
                shard_count: shard.shard_count,
            };
            fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?;
            fs::create_dir_all(conf.timelines_path(&tenant_shard_id))?;

            use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
            let remote_fs_dir = conf.workdir.join("localfs");
            std::fs::create_dir_all(&remote_fs_dir).unwrap();
            let config = RemoteStorageConfig {
                storage: RemoteStorageKind::LocalFs {
                    local_path: remote_fs_dir.clone(),
                },
                timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
                small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
            };
            let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap();
            let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));

            Ok(Self {
                conf,
                tenant_conf,
                tenant_shard_id,
                shard_identity,
                generation,
                shard,
                remote_storage,
                remote_fs_dir,
                deletion_queue,
            })
        }

        /// Creates a harness for an unsharded tenant with a freshly generated tenant ID.
        pub async fn create(test_name: &'static str) -> anyhow::Result {
            // Disable automatic GC and compaction to make the unit tests more deterministic.
            // The tests perform them manually if needed.
            let tenant_conf = pageserver_api::models::TenantConfig {
                gc_period: Some(Duration::ZERO),
                compaction_period: Some(Duration::ZERO),
                ..Default::default()
            };
            let tenant_id = TenantId::generate();
            let shard = ShardIdentity::unsharded();
            Self::create_custom(
                test_name,
                tenant_conf,
                tenant_id,
                shard,
                Generation::new(0xdeadbeef),
            )
            .await
        }

        /// Tracing span carrying this harness' tenant and shard IDs.
        pub fn span(&self) -> tracing::Span {
            info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
        }

        /// Loads the tenant with the mock redo manager and returns it together with a
        /// unit-test request context. Panics if loading fails.
        pub(crate) async fn load(&self) -> (Arc, RequestContext) {
            let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
                .with_scope_unit_test();
            (
                self.do_try_load(&ctx)
                    .await
                    .expect("failed to load test tenant"),
                ctx,
            )
        }

        /// Builds a `TenantShard` using the supplied WAL redo manager, runs preload and
        /// attach, then forces the tenant and all of its timelines to `Active`.
        #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
        pub(crate) async fn do_try_load_with_redo(
            &self,
            walredo_mgr: Arc,
            ctx: &RequestContext,
        ) -> anyhow::Result> {
            let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None);

            let tenant = Arc::new(TenantShard::new(
                TenantState::Attaching,
                self.conf,
                AttachedTenantConf::try_from(
                    self.conf,
                    LocationConf::attached_single(
                        self.tenant_conf.clone(),
                        self.generation,
                        ShardParameters::default(),
                    ),
                )
                .unwrap(),
                self.shard_identity,
                Some(walredo_mgr),
                self.tenant_shard_id,
                self.remote_storage.clone(),
                self.deletion_queue.new_client(),
                // TODO: ideally we should run all unit tests with both configs
                L0FlushGlobalState::new(L0FlushConfig::default()),
                basebackup_cache,
                FeatureResolver::new_disabled(),
            ));

            let preload = tenant
                .preload(&self.remote_storage, CancellationToken::new())
                .await?;
            tenant.attach(Some(preload), ctx).await?;

            // Mark everything active directly instead of going through activation.
            tenant.state.send_replace(TenantState::Active);
            for timeline in tenant.timelines.lock().unwrap().values() {
                timeline.set_state(TimelineState::Active);
            }
            Ok(tenant)
        }

        /// Like `do_try_load_with_redo`, using `TestRedoManager` for WAL redo.
        pub(crate) async fn do_try_load(
            &self,
            ctx: &RequestContext,
        ) -> anyhow::Result> {
            let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
            self.do_try_load_with_redo(walredo_mgr, ctx).await
        }

        /// Local path of the given timeline within this harness' tenant.
        pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf {
            self.conf.timeline_path(&self.tenant_shard_id, timeline_id)
        }
    }

    // Mock WAL redo manager that doesn't do much
    pub(crate) struct TestRedoManager;

    impl TestRedoManager {
        /// Applies records in-process when every record can be applied in Neon;
        /// otherwise returns a `test_img` describing the redo that would have run.
        ///
        /// # Cancel-Safety
        ///
        /// This method is cancellation-safe.
        pub async fn request_redo(
            &self,
            key: Key,
            lsn: Lsn,
            base_img: Option<(Lsn, Bytes)>,
            records: Vec<(Lsn, NeonWalRecord)>,
            _pg_version: PgMajorVersion,
            _redo_attempt_type: RedoAttemptType,
        ) -> Result {
            let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
            if records_neon {
                // For Neon wal records, we can decode without spawning postgres, so do so.
                // Start from the base image if there is one, otherwise from an empty
                // page when the first record initializes the page.
                let mut page = match (base_img, records.first()) {
                    (Some((_lsn, img)), _) => {
                        let mut page = BytesMut::new();
                        page.extend_from_slice(&img);
                        page
                    }
                    (_, Some((_lsn, rec))) if rec.will_init() => BytesMut::new(),
                    _ => {
                        panic!("Neon WAL redo requires base image or will init record");
                    }
                };

                for (record_lsn, record) in records {
                    apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?;
                }
                Ok(page.freeze())
            } else {
                // We never spawn a postgres walredo process in unit tests: just log what we might have done.
                let s = format!(
                    "redo for {} to get to {}, with {} and {} records",
                    key,
                    lsn,
                    if base_img.is_some() {
                        "base image"
                    } else {
                        "no base image"
                    },
                    records.len()
                );
                println!("{s}");

                Ok(test_img(&s))
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use std::collections::{BTreeMap, BTreeSet};

    use bytes::{Bytes, BytesMut};
    use hex_literal::hex;
    use itertools::Itertools;
    #[cfg(feature = "testing")]
    use models::CompactLsnRange;
    use pageserver_api::key::{
        AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX, repl_origin_key,
    };
    use pageserver_api::keyspace::KeySpace;
    #[cfg(feature = "testing")]
    use pageserver_api::keyspace::KeySpaceRandomAccum;
    use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings, LsnLease};
    use pageserver_compaction::helpers::overlaps_with;
    use rand::Rng;
    #[cfg(feature = "testing")]
    use rand::SeedableRng;
    #[cfg(feature = "testing")]
    use rand::rngs::StdRng;
    #[cfg(feature = "testing")]
    use std::ops::Range;
    use storage_layer::{IoConcurrency, PersistentLayerKey};
    use tests::storage_layer::ValuesReconstructState;
    use tests::timeline::{GetVectoredError, ShutdownMode};
    #[cfg(feature = "testing")]
    use timeline::GcInfo;
    #[cfg(feature = "testing")]
    use timeline::InMemoryLayerTestDesc;
    #[cfg(feature = "testing")]
    use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
    use timeline::{CompactOptions, DeltaLayerTestDesc, VersionedKeySpaceQuery};
    use utils::id::TenantId;
    use utils::shard::{ShardCount, ShardNumber};
    #[cfg(feature = "testing")]
    use wal_decoder::models::record::NeonWalRecord;
    use wal_decoder::models::value::Value;

    use super::*;
    use crate::DEFAULT_PG_VERSION;
    use crate::keyspace::KeySpaceAccum;
    use crate::tenant::harness::*;
    use crate::tenant::timeline::CompactFlags;

    // Key used by most of the simple read/write tests below.
    static TEST_KEY: Lazy =
        Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));

    // Shape description consumed by `randomize_timeline`.
    #[cfg(feature = "testing")]
    struct TestTimelineSpecification {
        start_lsn: Lsn,
        last_record_lsn: Lsn,

        in_memory_layers_shape: Vec<(Range, Range)>,
        delta_layers_shape:
Vec<(Range, Range)>,
        image_layers_shape: Vec<(Range, Lsn)>,

        // Percent chance (compared against a 1..=100 roll) that a (key, lsn) slot in an
        // in-memory or delta layer is left without a record.
        gap_chance: u8,
        // Percent chance (compared against a 1..=100 roll) that a generated record is a
        // will-init record rather than an append record.
        will_init_chance: u8,
    }

    // In-memory model of the data written by `randomize_timeline`, used to compute the
    // expected page contents independently of the timeline.
    #[cfg(feature = "testing")]
    struct Storage {
        storage: HashMap<(Key, Lsn), Value>,
        start_lsn: Lsn,
    }

    #[cfg(feature = "testing")]
    impl Storage {
        /// Reconstructs the expected value of `key` at `lsn`.
        ///
        /// Walks LSNs downwards from `lsn` (not going below `start_lsn`), collecting
        /// values until an image or a will-init record is found, then concatenates the
        /// collected values oldest-first.
        ///
        /// Panics if no image or will-init record is found in that range.
        fn get(&self, key: Key, lsn: Lsn) -> Bytes {
            use bytes::BufMut;

            let mut crnt_lsn = lsn;
            let mut got_base = false;

            let mut acc = Vec::new();

            while crnt_lsn >= self.start_lsn {
                if let Some(value) = self.storage.get(&(key, crnt_lsn)) {
                    acc.push(value.clone());

                    match value {
                        Value::WalRecord(NeonWalRecord::Test { will_init, .. }) => {
                            if *will_init {
                                got_base = true;
                                break;
                            }
                        }
                        Value::Image(_) => {
                            got_base = true;
                            break;
                        }
                        _ => unreachable!(),
                    }
                }

                // Stop at LSN 0 instead of panicking on the subtraction, so that a
                // missing base is reported by the descriptive assertion below.
                match crnt_lsn.checked_sub(1u64) {
                    Some(prev_lsn) => crnt_lsn = prev_lsn,
                    None => break,
                }
            }

            assert!(
                got_base,
                "Input data was incorrect. No base image for {key}@{lsn}"
            );

            tracing::debug!("Wal redo depth for {key}@{lsn} is {}", acc.len());

            // `acc` was collected newest-first; replay it oldest-first.
            let mut blob = BytesMut::new();
            for value in acc.into_iter().rev() {
                match value {
                    Value::WalRecord(NeonWalRecord::Test { append, .. }) => {
                        blob.extend_from_slice(append.as_bytes());
                    }
                    Value::Image(img) => {
                        blob.put(img);
                    }
                    _ => unreachable!(),
                }
            }

            blob.into()
        }
    }

    /// Creates a test timeline whose layers are filled with pseudo-random data laid out
    /// according to `spec`.
    ///
    /// Returns the timeline, a `Storage` model holding every generated value (for
    /// computing expected read results), and a list of LSNs around layer boundaries
    /// that are interesting to read at.
    ///
    /// For every (key, lsn) slot of the in-memory and delta layer shapes, two rolls are
    /// drawn from `random` (gap, then will-init). A slot that rolls a gap gets no record.
    #[cfg(feature = "testing")]
    #[allow(clippy::too_many_arguments)]
    async fn randomize_timeline(
        tenant: &Arc,
        new_timeline_id: TimelineId,
        pg_version: PgMajorVersion,
        spec: TestTimelineSpecification,
        random: &mut rand::rngs::StdRng,
        ctx: &RequestContext,
    ) -> anyhow::Result<(Arc, Storage, Vec)> {
        let mut storage: HashMap<(Key, Lsn), Value> = HashMap::default();
        let mut interesting_lsns = vec![spec.last_record_lsn];

        // Stash some interesting LSN for future use: the anchor itself, plus the LSNs
        // 5 and 100 below it (only when not before `start_lsn`) and above it.
        let start_lsn = spec.start_lsn;
        let mut stash_interesting_lsns = |anchor: Lsn| {
            interesting_lsns.push(anchor);
            for offset in [5u64, 100] {
                match anchor.checked_sub(offset) {
                    Some(below) if below >= start_lsn => {
                        interesting_lsns.push(below);
                    }
                    _ => {}
                }

                interesting_lsns.push(Lsn(anchor.0 + offset));
            }
        };

        // In-memory layers are generated first, then delta layers, so a delta record
        // overwrites an in-memory record generated for the same (key, lsn).
        for (key_range, lsn_range) in spec
            .in_memory_layers_shape
            .iter()
            .chain(spec.delta_layers_shape.iter())
        {
            let mut lsn = lsn_range.start;
            while lsn < lsn_range.end {
                let mut key = key_range.start;
                while key < key_range.end {
                    let gap = random.random_range(1..=100) <= spec.gap_chance;
                    let will_init = random.random_range(1..=100) <= spec.will_init_chance;

                    // A gap leaves this slot empty. The key must still advance:
                    // skipping the increment would only re-roll the same slot (never
                    // producing a gap) and would spin forever with `gap_chance >= 100`.
                    if !gap {
                        let record = if will_init {
                            Value::WalRecord(NeonWalRecord::wal_init(format!(
                                "[wil_init {key}@{lsn}]"
                            )))
                        } else {
                            Value::WalRecord(NeonWalRecord::wal_append(format!(
                                "[delta {key}@{lsn}]"
                            )))
                        };

                        storage.insert((key, lsn), record);
                    }

                    key = key.next();
                }
                lsn = Lsn(lsn.0 + 1);
            }

            stash_interesting_lsns(lsn_range.start);
        }

        // Image layers hold one image per key at a single LSN, with no gaps.
        for (key_range, lsn) in spec.image_layers_shape.iter() {
            let mut key = key_range.start;
            while key < key_range.end {
                let record = Value::Image(Bytes::from(format!("[image {key}@{lsn}]")));
                storage.insert((key, *lsn), record);

                key = key.next();
            }

            stash_interesting_lsns(*lsn);
        }

        // Build the layer descriptions from whatever ended up in `storage`; slots
        // without a record are simply absent from the layer data.
        let in_memory_test_layers = {
            let mut acc = Vec::new();

            for (key_range, lsn_range) in spec.in_memory_layers_shape.iter() {
                let mut data = Vec::new();

                let mut lsn = lsn_range.start;
                while lsn < lsn_range.end {
                    let mut key = key_range.start;
                    while key < key_range.end {
                        if let Some(record) = storage.get(&(key, lsn)) {
                            data.push((key, lsn, record.clone()));
                        }

                        key = key.next();
                    }
                    lsn = Lsn(lsn.0 + 1);
                }

                acc.push(InMemoryLayerTestDesc {
                    data,
                    lsn_range: lsn_range.clone(),
                    is_open: false,
                })
            }

            acc
        };

        let delta_test_layers = {
            let mut acc = Vec::new();

            for (key_range, lsn_range) in spec.delta_layers_shape.iter() {
                let mut data = Vec::new();

                let mut lsn = lsn_range.start;
                while lsn < lsn_range.end {
                    let mut key = key_range.start;
                    while key < key_range.end {
                        if let Some(record) = storage.get(&(key, lsn)) {
                            data.push((key, lsn, record.clone()));
                        }

                        key = key.next();
                    }
                    lsn = Lsn(lsn.0 + 1);
                }

                acc.push(DeltaLayerTestDesc {
                    data,
                    lsn_range: lsn_range.clone(),
                    key_range: key_range.clone(),
                })
            }

            acc
        };

        let image_test_layers = {
            let mut acc = Vec::new();

            for (key_range, lsn) in spec.image_layers_shape.iter() {
                let mut data = Vec::new();

                let mut key = key_range.start;
                while key < key_range.end {
                    if let Some(record) = storage.get(&(key, *lsn)) {
                        let blob = match record {
                            Value::Image(blob) => blob.clone(),
                            _ => unreachable!(),
                        };

                        data.push((key, blob));
                    }

                    key = key.next();
                }

                acc.push((*lsn, data));
            }

            acc
        };

        let tline = tenant
            .create_test_timeline_with_layers(
                new_timeline_id,
                spec.start_lsn,
                pg_version,
                ctx,
                in_memory_test_layers,
                delta_test_layers,
                image_test_layers,
                spec.last_record_lsn,
            )
            .await?;

        Ok((
            tline,
            Storage {
                storage,
                start_lsn: spec.start_lsn,
            },
            interesting_lsns,
        ))
    }

    // Writes two images for the same key at different LSNs and checks that reads at,
    // between and after those LSNs return the expected version.
    #[tokio::test]
    async fn test_basic() -> anyhow::Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;

        let mut writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
                Lsn(0x10),
                &Value::Image(test_img("foo at 0x10")),
                &ctx,
            )
            .await?;
        writer.finish_write(Lsn(0x10));
        drop(writer);

        let mut writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
                Lsn(0x20),
                &Value::Image(test_img("foo at 0x20")),
                &ctx,
            )
            .await?;
        writer.finish_write(Lsn(0x20));
        drop(writer);

        assert_eq!(
            tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
            test_img("foo at 0x10")
        );
        assert_eq!(
            tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?,
            test_img("foo at 0x10")
        );
        assert_eq!(
            tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?,
            test_img("foo at 0x20")
        );

        Ok(())
    }

    // Creating a timeline whose ID is already in use must fail.
    #[tokio::test]
    async fn no_duplicate_timelines() -> anyhow::Result<()> {
        let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")
            .await?
.load()
            .await;
        let _ = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;

        // A second creation with the same ID must be rejected with this exact message.
        match tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await
        {
            Ok(_) => panic!("duplicate timeline creation should fail"),
            Err(e) => assert_eq!(
                e.to_string(),
                "timeline already exists with different parameters".to_string()
            ),
        }

        Ok(())
    }

    /// Convenience function to create a page image with given string as the only content
    ///
    /// Unlike `test_img`, the buffer is not padded to a fixed size.
    pub fn test_value(s: &str) -> Value {
        let mut buf = BytesMut::new();
        buf.extend_from_slice(s.as_bytes());
        Value::Image(buf.freeze())
    }

    ///
    /// Test branch creation
    ///
    #[tokio::test]
    async fn test_branch() -> anyhow::Result<()> {
        use std::str::from_utf8;

        let (tenant, ctx) = TenantHarness::create("test_branch").await?.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let mut writer = tline.writer().await;

        #[allow(non_snake_case)]
        let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap();
        #[allow(non_snake_case)]
        let TEST_KEY_B: Key = Key::from_hex("110000000033333333444444445500000002").unwrap();

        // Insert a value on the timeline
        writer
            .put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"), &ctx)
            .await?;
        writer
            .put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"), &ctx)
            .await?;
        writer.finish_write(Lsn(0x20));

        writer
            .put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"), &ctx)
            .await?;
        writer.finish_write(Lsn(0x30));
        writer
            .put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"), &ctx)
            .await?;
        writer.finish_write(Lsn(0x40));

        //assert_current_logical_size(&tline, Lsn(0x40));

        // Branch the history, modify relation differently on the new timeline
        tenant
            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x30)), &ctx)
            .await?;
        let newtline = tenant
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");
        let mut new_writer = newtline.writer().await;
        new_writer
            .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx)
            .await?;
        new_writer.finish_write(Lsn(0x40));

        // Check page contents on both branches
        assert_eq!(
            from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?,
            "foo at 0x40"
        );
        assert_eq!(
            from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?,
            "bar at 0x40"
        );
        // Key B was never modified on the branch, so the read returns the value
        // written on the parent at 0x20.
        assert_eq!(
            from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40), &ctx).await?)?,
            "foobar at 0x20"
        );

        //assert_current_logical_size(&tline, Lsn(0x40));

        Ok(())
    }

    // Writes four images of `TEST_KEY` at `start_lsn`, +0x10, +0x20 and +0x30, calling
    // `freeze_and_flush` after the second and after the fourth write.
    async fn make_some_layers(
        tline: &Timeline,
        start_lsn: Lsn,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let mut lsn = start_lsn;
        {
            let mut writer = tline.writer().await;
            // Create a relation on the timeline
            writer
                .put(
                    *TEST_KEY,
                    lsn,
                    &Value::Image(test_img(&format!("foo at {lsn}"))),
                    ctx,
                )
                .await?;
            writer.finish_write(lsn);
            lsn += 0x10;
            writer
                .put(
                    *TEST_KEY,
                    lsn,
                    &Value::Image(test_img(&format!("foo at {lsn}"))),
                    ctx,
                )
                .await?;
            writer.finish_write(lsn);
            lsn += 0x10;
        }
        tline.freeze_and_flush().await?;
        {
            let mut writer = tline.writer().await;
            writer
                .put(
                    *TEST_KEY,
                    lsn,
                    &Value::Image(test_img(&format!("foo at {lsn}"))),
                    ctx,
                )
                .await?;
            writer.finish_write(lsn);
            lsn += 0x10;
            writer
                .put(
                    *TEST_KEY,
                    lsn,
                    &Value::Image(test_img(&format!("foo at {lsn}"))),
                    ctx,
                )
                .await?;
            writer.finish_write(lsn);
        }
        tline.freeze_and_flush().await.map_err(|e| e.into())
    }

    // Branching below the GC cutoff must fail with `CreateTimelineError::AncestorLsn`.
    #[tokio::test]
    async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
        let (tenant, ctx) =
            TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
                .await?
                .load()
                .await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
        // FIXME: this doesn't actually remove any layer currently, given how the flushing
        // and compaction works. But it does set the 'cutoff' point so that the cross check
        // below should fail.
        tenant
            .gc_iteration(
                Some(TIMELINE_ID),
                0x10,
                Duration::ZERO,
                &CancellationToken::new(),
                &ctx,
            )
            .await?;

        // try to branch at lsn 25, should fail because we already garbage collected the data
        match tenant
            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx)
            .await
        {
            Ok(_) => panic!("branching should have failed"),
            Err(err) => {
                let CreateTimelineError::AncestorLsn(err) = err else {
                    panic!("wrong error type")
                };
                assert!(err.to_string().contains("invalid branch start lsn"));
                assert!(
                    err.source()
                        .unwrap()
                        .to_string()
                        .contains("we might've already garbage collected needed data")
                )
            }
        }

        Ok(())
    }

    // Branching at an LSN before the timeline's initial LSN must fail with
    // `CreateTimelineError::AncestorLsn`.
    #[tokio::test]
    async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> {
        let (tenant, ctx) =
            TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")
                .await?
                .load()
                .await;

        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION, &ctx)
            .await?;
        // try to branch at lsn 0x25, should fail because initdb lsn is 0x50
        match tenant
            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx)
            .await
        {
            Ok(_) => panic!("branching should have failed"),
            Err(err) => {
                let CreateTimelineError::AncestorLsn(err) = err else {
                    panic!("wrong error type");
                };
                assert!(&err.to_string().contains("invalid branch start lsn"));
                assert!(
                    &err.source()
                        .unwrap()
                        .to_string()
                        .contains("is earlier than latest GC cutoff")
                );
            }
        }

        Ok(())
    }

    /*
    // FIXME: This currently fails to error out. Calling GC doesn't currently
    // remove the old value, we'd need to work a little harder
    #[tokio::test]
    async fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> {
        let repo =
            RepoHarness::create("test_prohibit_get_for_garbage_collected_data")?
            .load();

        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?;
        let applied_gc_cutoff_lsn = tline.get_applied_gc_cutoff_lsn();
        assert!(*applied_gc_cutoff_lsn > Lsn(0x25));
        match tline.get(*TEST_KEY, Lsn(0x25)) {
            Ok(_) => panic!("request for page should have failed"),
            Err(err) => assert!(err.to_string().contains("not found at")),
        }
        Ok(())
    }
     */

    // GC bookkeeping on a Broken parent must still record its child's branch point,
    // and reads on the child succeed only when they do not need the parent.
    #[tokio::test]
    async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> {
        let (tenant, ctx) =
            TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")
                .await?
                .load()
                .await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        tenant
            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
            .await?;
        let newtline = tenant
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");

        make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;

        tline.set_broken("test".to_owned());

        tenant
            .gc_iteration(
                Some(TIMELINE_ID),
                0x10,
                Duration::ZERO,
                &CancellationToken::new(),
                &ctx,
            )
            .await?;

        // The branchpoints should contain all timelines, even ones marked
        // as Broken.
        {
            let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
            assert_eq!(branchpoints.len(), 1);
            assert_eq!(
                branchpoints[0],
                (Lsn(0x40), NEW_TIMELINE_ID, MaybeOffloaded::No)
            );
        }

        // You can read the key from the child branch even though the parent is
        // Broken, as long as you don't need to access data from the parent.
        assert_eq!(
            newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?,
            test_img(&format!("foo at {}", Lsn(0x70)))
        );

        // This needs to traverse to the parent, and fails.
        let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
        assert!(
            err.to_string().starts_with(&format!(
                "bad state on timeline {}: Broken",
                tline.timeline_id
            )),
            "{err}"
        );

        Ok(())
    }

    // After GC on the parent, data below the branch point must still be readable
    // through the child.
    #[tokio::test]
    async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
        let (tenant, ctx) =
            TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")
                .await?
                .load()
                .await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        tenant
            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
            .await?;
        let newtline = tenant
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");
        // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
        tenant
            .gc_iteration(
                Some(TIMELINE_ID),
                0x10,
                Duration::ZERO,
                &CancellationToken::new(),
                &ctx,
            )
            .await?;
        assert!(newtline.get(*TEST_KEY, Lsn(0x25), &ctx).await.is_ok());

        Ok(())
    }

    // Same as above, but the child has its own layers too; a read between the branch
    // point and the child's first write must return the parent's value at 0x40.
    #[tokio::test]
    async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_parent_keeps_data_forever_after_branching")
            .await?
            .load()
            .await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        tenant
            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
            .await?;
        let newtline = tenant
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");

        make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;

        // run gc on parent
        tenant
            .gc_iteration(
                Some(TIMELINE_ID),
                0x10,
                Duration::ZERO,
                &CancellationToken::new(),
                &ctx,
            )
            .await?;

        // Check that the data is still accessible on the branch.
        assert_eq!(
            newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?,
            test_img(&format!("foo at {}", Lsn(0x40)))
        );

        Ok(())
    }

    // A timeline created and flushed before shutdown must be found again after the
    // tenant is loaded a second time from the same harness.
    #[tokio::test]
    async fn timeline_load() -> anyhow::Result<()> {
        const TEST_NAME: &str = "timeline_load";
        let harness = TenantHarness::create(TEST_NAME).await?;
        {
            let (tenant, ctx) = harness.load().await;
            let tline = tenant
                .create_test_timeline(TIMELINE_ID, Lsn(0x7000), DEFAULT_PG_VERSION, &ctx)
                .await?;
            make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
            // so that all uploads finish & we can call harness.load() below again
            tenant
                .shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
                .instrument(harness.span())
                .await
                .ok()
                .unwrap();
        }

        let (tenant, _ctx) = harness.load().await;
        tenant
            .get_timeline(TIMELINE_ID, true)
            .expect("cannot load timeline");

        Ok(())
    }

    #[tokio::test]
    async fn timeline_load_with_ancestor() -> anyhow::Result<()> {
        const TEST_NAME: &str = "timeline_load_with_ancestor";
        let harness = TenantHarness::create(TEST_NAME).await?;
        // create two timelines
        {
            let (tenant, ctx) = harness.load().await;
            let tline = tenant
                .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
                .await?;

            make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

            let child_tline = tenant
                .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
                .await?;
            child_tline.set_state(TimelineState::Active);

            let newtline = tenant
                .get_timeline(NEW_TIMELINE_ID, true)
                .expect("Should have a local timeline");

            make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;

            // so that all uploads finish & we can call harness.load() below again
            tenant
                .shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
                .instrument(harness.span())
                .await
                .ok()
                .unwrap();
        }

        // check that both of them are initially unloaded
        let (tenant, _ctx) = harness.load().await;

        // check that both, child and ancestor are loaded
        let _child_tline = tenant
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("cannot get child timeline loaded");

        let _ancestor_tline = tenant
.get_timeline(TIMELINE_ID, true) .expect("cannot get ancestor timeline loaded"); Ok(()) } #[tokio::test] async fn delta_layer_dumping() -> anyhow::Result<()> { use storage_layer::AsLayerDesc; let (tenant, ctx) = TenantHarness::create("test_layer_dumping") .await? .load() .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?; let layer_map = tline.layers.read(LayerManagerLockHolder::Testing).await; let level0_deltas = layer_map .layer_map()? .level0_deltas() .iter() .map(|desc| layer_map.get_from_desc(desc)) .collect::>(); assert!(!level0_deltas.is_empty()); for delta in level0_deltas { // Ensure we are dumping a delta layer here assert!(delta.layer_desc().is_delta); delta.dump(true, &ctx).await.unwrap(); } Ok(()) } #[tokio::test] async fn test_images() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_images").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; let mut writer = tline.writer().await; writer .put( *TEST_KEY, Lsn(0x10), &Value::Image(test_img("foo at 0x10")), &ctx, ) .await?; writer.finish_write(Lsn(0x10)); drop(writer); tline.freeze_and_flush().await?; tline .compact(&CancellationToken::new(), EnumSet::default(), &ctx) .await?; let mut writer = tline.writer().await; writer .put( *TEST_KEY, Lsn(0x20), &Value::Image(test_img("foo at 0x20")), &ctx, ) .await?; writer.finish_write(Lsn(0x20)); drop(writer); tline.freeze_and_flush().await?; tline .compact(&CancellationToken::new(), EnumSet::default(), &ctx) .await?; let mut writer = tline.writer().await; writer .put( *TEST_KEY, Lsn(0x30), &Value::Image(test_img("foo at 0x30")), &ctx, ) .await?; writer.finish_write(Lsn(0x30)); drop(writer); tline.freeze_and_flush().await?; tline .compact(&CancellationToken::new(), EnumSet::default(), &ctx) .await?; let mut writer = tline.writer().await; writer .put( *TEST_KEY, 
Lsn(0x40), &Value::Image(test_img("foo at 0x40")), &ctx, ) .await?; writer.finish_write(Lsn(0x40)); drop(writer); tline.freeze_and_flush().await?; tline .compact(&CancellationToken::new(), EnumSet::default(), &ctx) .await?; assert_eq!( tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?, test_img("foo at 0x10") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?, test_img("foo at 0x10") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?, test_img("foo at 0x20") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x30), &ctx).await?, test_img("foo at 0x30") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x40), &ctx).await?, test_img("foo at 0x40") ); Ok(()) } async fn bulk_insert_compact_gc( tenant: &TenantShard, timeline: &Arc, ctx: &RequestContext, lsn: Lsn, repeat: usize, key_count: usize, ) -> anyhow::Result>> { let compact = true; bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await } async fn bulk_insert_maybe_compact_gc( tenant: &TenantShard, timeline: &Arc, ctx: &RequestContext, mut lsn: Lsn, repeat: usize, key_count: usize, compact: bool, ) -> anyhow::Result>> { let mut inserted: HashMap> = Default::default(); let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let mut blknum = 0; // Enforce that key range is monotonously increasing let mut keyspace = KeySpaceAccum::new(); let cancel = CancellationToken::new(); for _ in 0..repeat { for _ in 0..key_count { test_key.field6 = blknum; let mut writer = timeline.writer().await; writer .put( test_key, lsn, &Value::Image(test_img(&format!("{blknum} at {lsn}"))), ctx, ) .await?; inserted.entry(test_key).or_default().insert(lsn); writer.finish_write(lsn); drop(writer); keyspace.add_key(test_key); lsn = Lsn(lsn.0 + 0x10); blknum += 1; } timeline.freeze_and_flush().await?; if compact { // this requires timeline to be &Arc timeline.compact(&cancel, EnumSet::default(), ctx).await?; } // this doesn't really need to use the timeline_id target, but it is closer to 
what it // originally was. let res = tenant .gc_iteration(Some(timeline.timeline_id), 0, Duration::ZERO, &cancel, ctx) .await?; assert_eq!(res.layers_removed, 0, "this never removes anything"); } Ok(inserted) } // // Insert 1000 key-value pairs with increasing keys, flush, compact, GC. // Repeat 50 times. // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { let harness = TenantHarness::create("test_bulk_insert").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; let lsn = Lsn(0x10); bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; Ok(()) } // Test the vectored get real implementation against a simple sequential implementation. // // The test generates a keyspace by repeatedly flushing the in-memory layer and compacting. // Projected to 2D the key space looks like below. Lsn grows upwards on the Y axis and keys // grow to the right on the X axis. // [Delta] // [Delta] // [Delta] // [Delta] // ------------ Image --------------- // // After layer generation we pick the ranges to query as follows: // 1. The beginning of each delta layer // 2. At the seam between two adjacent delta layers // // There's one major downside to this test: delta layers only contains images, // so the search can stop at the first delta layer and doesn't traverse any deeper. 
#[tokio::test] async fn test_get_vectored() -> anyhow::Result<()> { let harness = TenantHarness::create("test_get_vectored").await?; let (tenant, ctx) = harness.load().await; let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; let lsn = Lsn(0x10); let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; let guard = tline.layers.read(LayerManagerLockHolder::Testing).await; let lm = guard.layer_map()?; lm.dump(true, &ctx).await?; let mut reads = Vec::new(); let mut prev = None; lm.iter_historic_layers().for_each(|desc| { if !desc.is_delta() { prev = Some(desc.clone()); return; } let start = desc.key_range.start; let end = desc .key_range .start .add(tenant.conf.max_get_vectored_keys.get() as u32); reads.push(KeySpace { ranges: vec![start..end], }); if let Some(prev) = &prev { if !prev.is_delta() { return; } let first_range = Key { field6: prev.key_range.end.field6 - 4, ..prev.key_range.end }..prev.key_range.end; let second_range = desc.key_range.start..Key { field6: desc.key_range.start.field6 + 4, ..desc.key_range.start }; reads.push(KeySpace { ranges: vec![first_range, second_range], }); }; prev = Some(desc.clone()); }); drop(guard); // Pick a big LSN such that we query over all the changes. 
let reads_lsn = Lsn(u64::MAX - 1); for read in reads { info!("Doing vectored read on {:?}", read); let query = VersionedKeySpaceQuery::uniform(read.clone(), reads_lsn); let vectored_res = tline .get_vectored_impl( query, &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await; let mut expected_lsns: HashMap = Default::default(); let mut expect_missing = false; let mut key = read.start().unwrap(); while key != read.end().unwrap() { if let Some(lsns) = inserted.get(&key) { let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn); match expected_lsn { Some(lsn) => { expected_lsns.insert(key, *lsn); } None => { expect_missing = true; break; } } } else { expect_missing = true; break; } key = key.next(); } if expect_missing { assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_)))); } else { for (key, image) in vectored_res? { let expected_lsn = expected_lsns.get(&key).expect("determined above"); let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn)); assert_eq!(image?, expected_image); } } } Ok(()) } #[tokio::test] async fn test_get_vectored_aux_files() -> anyhow::Result<()> { let harness = TenantHarness::create("test_get_vectored_aux_files").await?; let (tenant, ctx) = harness.load().await; let io_concurrency = IoConcurrency::spawn_for_test(); let (tline, ctx) = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; let tline = tline.raw_timeline().unwrap(); let mut modification = tline.begin_modification(Lsn(0x1000)); modification.put_file("foo/bar1", b"content1", &ctx).await?; modification.set_lsn(Lsn(0x1008))?; modification.put_file("foo/bar2", b"content2", &ctx).await?; modification.commit(&ctx).await?; let child_timeline_id = TimelineId::generate(); tenant .branch_timeline_test( tline, child_timeline_id, Some(tline.get_last_record_lsn()), &ctx, ) .await?; let child_timeline = tenant .get_timeline(child_timeline_id, true) .expect("Should have the branched timeline"); let 
aux_keyspace = KeySpace { ranges: vec![NON_INHERITED_RANGE], }; let read_lsn = child_timeline.get_last_record_lsn(); let query = VersionedKeySpaceQuery::uniform(aux_keyspace.clone(), read_lsn); let vectored_res = child_timeline .get_vectored_impl( query, &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await; let images = vectored_res?; assert!(images.is_empty()); Ok(()) } // Test that vectored get handles layer gaps correctly // by advancing into the next ancestor timeline if required. // // The test generates timelines that look like the diagram below. // We leave a gap in one of the L1 layers at `gap_at_key` (`/` in the diagram). // The reconstruct data for that key lies in the ancestor timeline (`X` in the diagram). // // ``` //-------------------------------+ // ... | // [ L1 ] | // [ / L1 ] | Child Timeline // ... | // ------------------------------+ // [ X L1 ] | Parent Timeline // ------------------------------+ // ``` #[tokio::test] async fn test_get_vectored_key_gap() -> anyhow::Result<()> { let tenant_conf = pageserver_api::models::TenantConfig { // Make compaction deterministic gc_period: Some(Duration::ZERO), compaction_period: Some(Duration::ZERO), // Encourage creation of L1 layers checkpoint_distance: Some(16 * 1024), compaction_target_size: Some(8 * 1024), ..Default::default() }; let harness = TenantHarness::create_custom( "test_get_vectored_key_gap", tenant_conf, TenantId::generate(), ShardIdentity::unsharded(), Generation::new(0xdeadbeef), ) .await?; let (tenant, ctx) = harness.load().await; let io_concurrency = IoConcurrency::spawn_for_test(); let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let gap_at_key = current_key.add(100); let mut current_lsn = Lsn(0x10); const KEY_COUNT: usize = 10_000; let timeline_id = TimelineId::generate(); let current_timeline = tenant .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx) .await?; current_lsn += 0x100; let mut writer = 
current_timeline.writer().await; writer .put( gap_at_key, current_lsn, &Value::Image(test_img(&format!("{gap_at_key} at {current_lsn}"))), &ctx, ) .await?; writer.finish_write(current_lsn); drop(writer); let mut latest_lsns = HashMap::new(); latest_lsns.insert(gap_at_key, current_lsn); current_timeline.freeze_and_flush().await?; let child_timeline_id = TimelineId::generate(); tenant .branch_timeline_test( ¤t_timeline, child_timeline_id, Some(current_lsn), &ctx, ) .await?; let child_timeline = tenant .get_timeline(child_timeline_id, true) .expect("Should have the branched timeline"); for i in 0..KEY_COUNT { if current_key == gap_at_key { current_key = current_key.next(); continue; } current_lsn += 0x10; let mut writer = child_timeline.writer().await; writer .put( current_key, current_lsn, &Value::Image(test_img(&format!("{current_key} at {current_lsn}"))), &ctx, ) .await?; writer.finish_write(current_lsn); drop(writer); latest_lsns.insert(current_key, current_lsn); current_key = current_key.next(); // Flush every now and then to encourage layer file creation. 
if i % 500 == 0 { child_timeline.freeze_and_flush().await?; } } child_timeline.freeze_and_flush().await?; let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceRepartition); child_timeline .compact(&CancellationToken::new(), flags, &ctx) .await?; let key_near_end = { let mut tmp = current_key; tmp.field6 -= 10; tmp }; let key_near_gap = { let mut tmp = gap_at_key; tmp.field6 -= 10; tmp }; let read = KeySpace { ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key], }; let query = VersionedKeySpaceQuery::uniform(read.clone(), current_lsn); let results = child_timeline .get_vectored_impl( query, &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await?; for (key, img_res) in results { let expected = test_img(&format!("{} at {}", key, latest_lsns[&key])); assert_eq!(img_res?, expected); } Ok(()) } // Test that vectored get descends into ancestor timelines correctly and // does not return an image that's newer than requested. // // The diagram below ilustrates an interesting case. We have a parent timeline // (top of the Lsn range) and a child timeline. The request key cannot be reconstructed // from the child timeline, so the parent timeline must be visited. When advacing into // the child timeline, the read path needs to remember what the requested Lsn was in // order to avoid returning an image that's too new. The test below constructs such // a timeline setup and does a few queries around the Lsn of each page image. 
// ``` // LSN // ^ // | // | // 500 | --------------------------------------> branch point // 400 | X // 300 | X // 200 | --------------------------------------> requested lsn // 100 | X // |---------------------------------------> Key // | // ------> requested key // // Legend: // * X - page images // ``` #[tokio::test] async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> { let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?; let (tenant, ctx) = harness.load().await; let io_concurrency = IoConcurrency::spawn_for_test(); let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let end_key = start_key.add(1000); let child_gap_at_key = start_key.add(500); let mut parent_gap_lsns: BTreeMap = BTreeMap::new(); let mut current_lsn = Lsn(0x10); let timeline_id = TimelineId::generate(); let parent_timeline = tenant .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx) .await?; current_lsn += 0x100; for _ in 0..3 { let mut key = start_key; while key < end_key { current_lsn += 0x10; let image_value = format!("{child_gap_at_key} at {current_lsn}"); let mut writer = parent_timeline.writer().await; writer .put( key, current_lsn, &Value::Image(test_img(&image_value)), &ctx, ) .await?; writer.finish_write(current_lsn); if key == child_gap_at_key { parent_gap_lsns.insert(current_lsn, image_value); } key = key.next(); } parent_timeline.freeze_and_flush().await?; } let child_timeline_id = TimelineId::generate(); let child_timeline = tenant .branch_timeline_test(&parent_timeline, child_timeline_id, Some(current_lsn), &ctx) .await?; let mut key = start_key; while key < end_key { if key == child_gap_at_key { key = key.next(); continue; } current_lsn += 0x10; let mut writer = child_timeline.writer().await; writer .put( key, current_lsn, &Value::Image(test_img(&format!("{key} at {current_lsn}"))), &ctx, ) .await?; writer.finish_write(current_lsn); key = key.next(); } 
child_timeline.freeze_and_flush().await?; let lsn_offsets: [i64; 5] = [-10, -1, 0, 1, 10]; let mut query_lsns = Vec::new(); for image_lsn in parent_gap_lsns.keys().rev() { for offset in lsn_offsets { query_lsns.push(Lsn(image_lsn .0 .checked_add_signed(offset) .expect("Shouldn't overflow"))); } } for query_lsn in query_lsns { let query = VersionedKeySpaceQuery::uniform( KeySpace { ranges: vec![child_gap_at_key..child_gap_at_key.next()], }, query_lsn, ); let results = child_timeline .get_vectored_impl( query, &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await; let expected_item = parent_gap_lsns .iter() .rev() .find(|(lsn, _)| **lsn <= query_lsn); info!( "Doing vectored read at LSN {}. Expecting image to be: {:?}", query_lsn, expected_item ); match expected_item { Some((_, img_value)) => { let key_results = results.expect("No vectored get error expected"); let key_result = &key_results[&child_gap_at_key]; let returned_img = key_result .as_ref() .expect("No page reconstruct error expected"); info!( "Vectored read at LSN {} returned image {}", query_lsn, std::str::from_utf8(returned_img)? 
); assert_eq!(*returned_img, test_img(img_value)); } None => { assert!(matches!(results, Err(GetVectoredError::MissingKey(_)))); } } } Ok(()) } #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { let names_algorithms = [ ("test_random_updates_legacy", CompactionAlgorithm::Legacy), ("test_random_updates_tiered", CompactionAlgorithm::Tiered), ]; for (name, algorithm) in names_algorithms { test_random_updates_algorithm(name, algorithm).await?; } Ok(()) } async fn test_random_updates_algorithm( name: &'static str, compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { let mut harness = TenantHarness::create(name).await?; harness.tenant_conf.compaction_algorithm = Some(CompactionAlgorithmSettings { kind: compaction_algorithm, }); let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; const NUM_KEYS: usize = 1000; let cancel = CancellationToken::new(); let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let mut test_key_end = test_key; test_key_end.field6 = NUM_KEYS as u32; tline.add_extra_test_dense_keyspace(KeySpace::single(test_key..test_key_end)); let mut keyspace = KeySpaceAccum::new(); // Track when each page was last modified. Used to assert that // a read sees the latest page version. 
let mut updated = [Lsn(0); NUM_KEYS]; let mut lsn = Lsn(0x10); #[allow(clippy::needless_range_loop)] for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; let mut writer = tline.writer().await; writer .put( test_key, lsn, &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; writer.finish_write(lsn); updated[blknum] = lsn; drop(writer); keyspace.add_key(test_key); } for _ in 0..50 { for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); let blknum = rand::rng().random_range(0..NUM_KEYS); test_key.field6 = blknum as u32; let mut writer = tline.writer().await; writer .put( test_key, lsn, &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; writer.finish_write(lsn); drop(writer); updated[blknum] = lsn; } // Read all the blocks for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, lsn, &ctx).await?, test_img(&format!("{blknum} at {last_lsn}")) ); } // Perform a cycle of flush, and GC tline.freeze_and_flush().await?; tenant .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; } Ok(()) } #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_traverse_branches") .await? .load() .await; let mut tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; const NUM_KEYS: usize = 1000; let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let mut keyspace = KeySpaceAccum::new(); let cancel = CancellationToken::new(); // Track when each page was last modified. Used to assert that // a read sees the latest page version. 
let mut updated = [Lsn(0); NUM_KEYS]; let mut lsn = Lsn(0x10); #[allow(clippy::needless_range_loop)] for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; let mut writer = tline.writer().await; writer .put( test_key, lsn, &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; writer.finish_write(lsn); updated[blknum] = lsn; drop(writer); keyspace.add_key(test_key); } for _ in 0..50 { let new_tline_id = TimelineId::generate(); tenant .branch_timeline_test(&tline, new_tline_id, Some(lsn), &ctx) .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); let blknum = rand::rng().random_range(0..NUM_KEYS); test_key.field6 = blknum as u32; let mut writer = tline.writer().await; writer .put( test_key, lsn, &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; println!("updating {blknum} at {lsn}"); writer.finish_write(lsn); drop(writer); updated[blknum] = lsn; } // Read all the blocks for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, lsn, &ctx).await?, test_img(&format!("{blknum} at {last_lsn}")) ); } // Perform a cycle of flush, compact, and GC tline.freeze_and_flush().await?; tline.compact(&cancel, EnumSet::default(), &ctx).await?; tenant .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; } Ok(()) } #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors") .await? .load() .await; let mut tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); // Track page mutation lsns across different timelines. 
let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES]; let mut lsn = Lsn(0x10); #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { let new_tline_id = TimelineId::generate(); tenant .branch_timeline_test(&tline, new_tline_id, Some(lsn), &ctx) .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); let blknum = rand::rng().random_range(0..NUM_KEYS); test_key.field6 = blknum as u32; let mut writer = tline.writer().await; writer .put( test_key, lsn, &Value::Image(test_img(&format!("{idx} {blknum} at {lsn}"))), &ctx, ) .await?; println!("updating [{idx}][{blknum}] at {lsn}"); writer.finish_write(lsn); drop(writer); updated[idx][blknum] = lsn; } } // Read pages from leaf timeline across all ancestors. for (idx, lsns) in updated.iter().enumerate() { for (blknum, lsn) in lsns.iter().enumerate() { // Skip empty mutations. if lsn.0 == 0 { continue; } println!("checking [{idx}][{blknum}] at {lsn}"); test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, *lsn, &ctx).await?, test_img(&format!("{idx} {blknum} at {lsn}")) ); } } Ok(()) } #[tokio::test] async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable") .await? .load() .await; let initdb_lsn = Lsn(0x20); let (utline, ctx) = tenant .create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx) .await?; let tline = utline.raw_timeline().unwrap(); // Spawn flush loop now so that we can set the `expect_initdb_optimization` tline.maybe_spawn_flush_loop(); // Make sure the timeline has the minimum set of required keys for operation. // The only operation you can always do on an empty timeline is to `put` new data. // Except if you `put` at `initdb_lsn`. // In that case, there's an optimization to directly create image layers instead of delta layers. 
// It uses `repartition()`, which assumes some keys to be present. // Let's make sure the test timeline can handle that case. { let mut state = tline.flush_loop_state.lock().unwrap(); assert_eq!( timeline::FlushLoopState::Running { expect_initdb_optimization: false, initdb_optimization_count: 0, }, *state ); *state = timeline::FlushLoopState::Running { expect_initdb_optimization: true, initdb_optimization_count: 0, }; } // Make writes at the initdb_lsn. When we flush it below, it should be handled by the optimization. // As explained above, the optimization requires some keys to be present. // As per `create_empty_timeline` documentation, use init_empty to set them. // This is what `create_test_timeline` does, by the way. let mut modification = tline.begin_modification(initdb_lsn); modification .init_empty_test_timeline() .context("init_empty_test_timeline")?; modification .commit(&ctx) .await .context("commit init_empty_test_timeline modification")?; // Do the flush. The flush code will check the expectations that we set above. 
tline.freeze_and_flush().await?; // assert freeze_and_flush exercised the initdb optimization { let state = tline.flush_loop_state.lock().unwrap(); let timeline::FlushLoopState::Running { expect_initdb_optimization, initdb_optimization_count, } = *state else { panic!("unexpected state: {:?}", *state); }; assert!(expect_initdb_optimization); assert!(initdb_optimization_count > 0); } Ok(()) } #[tokio::test] async fn test_create_guard_crash() -> anyhow::Result<()> { let name = "test_create_guard_crash"; let harness = TenantHarness::create(name).await?; { let (tenant, ctx) = harness.load().await; let (tline, _ctx) = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; // Leave the timeline ID in [`TenantShard::timelines_creating`] to exclude attempting to create it again let raw_tline = tline.raw_timeline().unwrap(); raw_tline .shutdown(super::timeline::ShutdownMode::Hard) .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID)) .await; std::mem::forget(tline); } let (tenant, _) = harness.load().await; match tenant.get_timeline(TIMELINE_ID, false) { Ok(_) => panic!("timeline should've been removed during load"), Err(e) => { assert_eq!( e, GetTimelineError::NotFound { tenant_id: tenant.tenant_shard_id, timeline_id: TIMELINE_ID, } ) } } assert!( !harness .conf .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) .exists() ); Ok(()) } #[tokio::test] async fn test_read_at_max_lsn() -> anyhow::Result<()> { let names_algorithms = [ ("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy), ("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered), ]; for (name, algorithm) in names_algorithms { test_read_at_max_lsn_algorithm(name, algorithm).await?; } Ok(()) } async fn test_read_at_max_lsn_algorithm( name: &'static str, compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { let mut harness = TenantHarness::create(name).await?; 
harness.tenant_conf.compaction_algorithm = Some(CompactionAlgorithmSettings { kind: compaction_algorithm, }); let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; let lsn = Lsn(0x10); let compact = false; bulk_insert_maybe_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000, compact).await?; let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let read_lsn = Lsn(u64::MAX - 1); let result = tline.get(test_key, read_lsn, &ctx).await; assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err()); Ok(()) } #[tokio::test] async fn test_metadata_scan() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_scan").await?; let (tenant, ctx) = harness.load().await; let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; const NUM_KEYS: usize = 1000; const STEP: usize = 10000; // random update + scan base_key + idx * STEP let cancel = CancellationToken::new(); let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); base_key.field1 = AUX_KEY_PREFIX; let mut test_key = base_key; // Track when each page was last modified. Used to assert that // a read sees the latest page version. 
let mut updated = [Lsn(0); NUM_KEYS]; let mut lsn = Lsn(0x10); #[allow(clippy::needless_range_loop)] for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = (blknum * STEP) as u32; let mut writer = tline.writer().await; writer .put( test_key, lsn, &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; writer.finish_write(lsn); updated[blknum] = lsn; drop(writer); } let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); for iter in 0..=10 { // Read all the blocks for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = (blknum * STEP) as u32; assert_eq!( tline.get(test_key, lsn, &ctx).await?, test_img(&format!("{blknum} at {last_lsn}")) ); } let mut cnt = 0; let query = VersionedKeySpaceQuery::uniform(keyspace.clone(), lsn); for (key, value) in tline .get_vectored_impl( query, &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await? { let blknum = key.field6 as usize; let value = value?; assert!(blknum % STEP == 0); let blknum = blknum / STEP; assert_eq!( value, test_img(&format!("{} at {}", blknum, updated[blknum])) ); cnt += 1; } assert_eq!(cnt, NUM_KEYS); for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); let blknum = rand::rng().random_range(0..NUM_KEYS); test_key.field6 = (blknum * STEP) as u32; let mut writer = tline.writer().await; writer .put( test_key, lsn, &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; writer.finish_write(lsn); drop(writer); updated[blknum] = lsn; } // Perform two cycles of flush, compact, and GC for round in 0..2 { tline.freeze_and_flush().await?; tline .compact( &cancel, if iter % 5 == 0 && round == 0 { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); flags } else { EnumSet::empty() }, &ctx, ) .await?; tenant .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; } } Ok(()) } #[tokio::test] async fn 
test_metadata_compaction_trigger() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_compaction_trigger").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; let cancel = CancellationToken::new(); let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); base_key.field1 = AUX_KEY_PREFIX; let test_key = base_key; let mut lsn = Lsn(0x10); for _ in 0..20 { lsn = Lsn(lsn.0 + 0x10); let mut writer = tline.writer().await; writer .put( test_key, lsn, &Value::Image(test_img(&format!("{} at {}", 0, lsn))), &ctx, ) .await?; writer.finish_write(lsn); drop(writer); tline.freeze_and_flush().await?; // force create a delta layer } let before_num_l0_delta_files = tline .layers .read(LayerManagerLockHolder::Testing) .await .layer_map()? .level0_deltas() .len(); tline.compact(&cancel, EnumSet::default(), &ctx).await?; let after_num_l0_delta_files = tline .layers .read(LayerManagerLockHolder::Testing) .await .layer_map()? 
.level0_deltas() .len(); assert!( after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}" ); assert_eq!( tline.get(test_key, lsn, &ctx).await?, test_img(&format!("{} at {}", 0, lsn)) ); Ok(()) } #[tokio::test] async fn test_aux_file_e2e() { let harness = TenantHarness::create("test_aux_file_e2e").await.unwrap(); let (tenant, ctx) = harness.load().await; let io_concurrency = IoConcurrency::spawn_for_test(); let mut lsn = Lsn(0x08); let tline: Arc = tenant .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) .await .unwrap(); { lsn += 8; let mut modification = tline.begin_modification(lsn); modification .put_file("pg_logical/mappings/test1", b"first", &ctx) .await .unwrap(); modification.commit(&ctx).await.unwrap(); } // we can read everything from the storage let files = tline .list_aux_files(lsn, &ctx, io_concurrency.clone()) .await .unwrap(); assert_eq!( files.get("pg_logical/mappings/test1"), Some(&bytes::Bytes::from_static(b"first")) ); { lsn += 8; let mut modification = tline.begin_modification(lsn); modification .put_file("pg_logical/mappings/test2", b"second", &ctx) .await .unwrap(); modification.commit(&ctx).await.unwrap(); } let files = tline .list_aux_files(lsn, &ctx, io_concurrency.clone()) .await .unwrap(); assert_eq!( files.get("pg_logical/mappings/test2"), Some(&bytes::Bytes::from_static(b"second")) ); let child = tenant .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) .await .unwrap(); let files = child .list_aux_files(lsn, &ctx, io_concurrency.clone()) .await .unwrap(); assert_eq!(files.get("pg_logical/mappings/test1"), None); assert_eq!(files.get("pg_logical/mappings/test2"), None); } #[tokio::test] async fn test_repl_origin_tombstones() { let harness = TenantHarness::create("test_repl_origin_tombstones") .await .unwrap(); let (tenant, ctx) = harness.load().await; let io_concurrency = IoConcurrency::spawn_for_test(); 
let mut lsn = Lsn(0x08); let tline: Arc = tenant .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) .await .unwrap(); let repl_lsn = Lsn(0x10); { lsn += 8; let mut modification = tline.begin_modification(lsn); modification.put_for_unit_test(repl_origin_key(2), Value::Image(Bytes::new())); modification.set_replorigin(1, repl_lsn).await.unwrap(); modification.commit(&ctx).await.unwrap(); } // we can read everything from the storage let repl_origins = tline .get_replorigins(lsn, &ctx, io_concurrency.clone()) .await .unwrap(); assert_eq!(repl_origins.len(), 1); assert_eq!(repl_origins[&1], lsn); { lsn += 8; let mut modification = tline.begin_modification(lsn); modification.put_for_unit_test( repl_origin_key(3), Value::Image(Bytes::copy_from_slice(b"cannot_decode_this")), ); modification.commit(&ctx).await.unwrap(); } let result = tline .get_replorigins(lsn, &ctx, io_concurrency.clone()) .await; assert!(result.is_err()); } #[tokio::test] async fn test_metadata_image_creation() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_image_creation").await?; let (tenant, ctx) = harness.load().await; let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; const NUM_KEYS: usize = 1000; const STEP: usize = 10000; // random update + scan base_key + idx * STEP let cancel = CancellationToken::new(); let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix... 
let mut test_key = base_key; let mut lsn = Lsn(0x10); async fn scan_with_statistics( tline: &Timeline, keyspace: &KeySpace, lsn: Lsn, ctx: &RequestContext, io_concurrency: IoConcurrency, ) -> anyhow::Result<(BTreeMap>, usize)> { let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let query = VersionedKeySpaceQuery::uniform(keyspace.clone(), lsn); let res = tline .get_vectored_impl(query, &mut reconstruct_state, ctx) .await?; Ok((res, reconstruct_state.get_delta_layers_visited() as usize)) } for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = (blknum * STEP) as u32; let mut writer = tline.writer().await; writer .put( test_key, lsn, &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; writer.finish_write(lsn); drop(writer); } let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); for iter in 1..=10 { for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); let blknum = rand::rng().random_range(0..NUM_KEYS); test_key.field6 = (blknum * STEP) as u32; let mut writer = tline.writer().await; writer .put( test_key, lsn, &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; writer.finish_write(lsn); drop(writer); } tline.freeze_and_flush().await?; // Force layers to L1 tline .compact( &cancel, { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceL0Compaction); flags }, &ctx, ) .await?; if iter % 5 == 0 { let scan_lsn = Lsn(lsn.0 + 1); info!("scanning at {}", scan_lsn); let (_, before_delta_file_accessed) = scan_with_statistics(&tline, &keyspace, scan_lsn, &ctx, io_concurrency.clone()) .await?; tline .compact( &cancel, { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); flags.insert(CompactFlags::ForceL0Compaction); flags }, &ctx, ) .await?; let (_, after_delta_file_accessed) = scan_with_statistics(&tline, &keyspace, scan_lsn, &ctx, io_concurrency.clone()) .await?; assert!( 
after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}" ); // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances. assert!( after_delta_file_accessed <= 2, "after_delta_file_accessed={after_delta_file_accessed}" ); } } Ok(()) } #[tokio::test] async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> { let harness = TenantHarness::create("test_vectored_missing_data_key_reads").await?; let (tenant, ctx) = harness.load().await; let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); let tline = tenant .create_test_timeline_with_layers( TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx, Vec::new(), // in-memory layers Vec::new(), // delta layers vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN ) .await?; tline.add_extra_test_dense_keyspace(KeySpace::single(base_key..(base_key_nonexist.next()))); let child = tenant .branch_timeline_test_with_layers( &tline, NEW_TIMELINE_ID, Some(Lsn(0x20)), &ctx, Vec::new(), // delta layers vec![(Lsn(0x30), vec![(base_key_child, test_img("data key 2"))])], // image layers Lsn(0x30), ) .await .unwrap(); let lsn = Lsn(0x30); // test vectored get on parent timeline assert_eq!( get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, Some(test_img("data key 1")) ); assert!( get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) .await .unwrap_err() .is_missing_key_error() ); assert!( get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, 
&ctx) .await .unwrap_err() .is_missing_key_error() ); // test vectored get on child timeline assert_eq!( get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?, Some(test_img("data key 1")) ); assert_eq!( get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?, Some(test_img("data key 2")) ); assert!( get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx) .await .unwrap_err() .is_missing_key_error() ); Ok(()) } #[tokio::test] async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?; let (tenant, ctx) = harness.load().await; let io_concurrency = IoConcurrency::spawn_for_test(); let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap(); let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap(); let base_key_overwrite = Key::from_hex("620000000033333333444444445500000003").unwrap(); let base_inherited_key = Key::from_hex("610000000033333333444444445500000000").unwrap(); let base_inherited_key_child = Key::from_hex("610000000033333333444444445500000001").unwrap(); let base_inherited_key_nonexist = Key::from_hex("610000000033333333444444445500000002").unwrap(); let base_inherited_key_overwrite = Key::from_hex("610000000033333333444444445500000003").unwrap(); assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix... 
assert_eq!(base_inherited_key.field1, RELATION_SIZE_PREFIX); let tline = tenant .create_test_timeline_with_layers( TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx, Vec::new(), // in-memory layers Vec::new(), // delta layers vec![( Lsn(0x20), vec![ (base_inherited_key, test_img("metadata inherited key 1")), ( base_inherited_key_overwrite, test_img("metadata key overwrite 1a"), ), (base_key, test_img("metadata key 1")), (base_key_overwrite, test_img("metadata key overwrite 1b")), ], )], // image layers Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN ) .await?; let child = tenant .branch_timeline_test_with_layers( &tline, NEW_TIMELINE_ID, Some(Lsn(0x20)), &ctx, Vec::new(), // delta layers vec![( Lsn(0x30), vec![ ( base_inherited_key_child, test_img("metadata inherited key 2"), ), ( base_inherited_key_overwrite, test_img("metadata key overwrite 2a"), ), (base_key_child, test_img("metadata key 2")), (base_key_overwrite, test_img("metadata key overwrite 2b")), ], )], // image layers Lsn(0x30), ) .await .unwrap(); let lsn = Lsn(0x30); // test vectored get on parent timeline assert_eq!( get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, Some(test_img("metadata key 1")) ); assert_eq!( get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx).await?, None ); assert_eq!( get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?, None ); assert_eq!( get_vectored_impl_wrapper(&tline, base_key_overwrite, lsn, &ctx).await?, Some(test_img("metadata key overwrite 1b")) ); assert_eq!( get_vectored_impl_wrapper(&tline, base_inherited_key, lsn, &ctx).await?, Some(test_img("metadata inherited key 1")) ); assert_eq!( get_vectored_impl_wrapper(&tline, base_inherited_key_child, lsn, &ctx).await?, None ); assert_eq!( get_vectored_impl_wrapper(&tline, base_inherited_key_nonexist, lsn, &ctx).await?, None ); assert_eq!( get_vectored_impl_wrapper(&tline, base_inherited_key_overwrite, 
lsn, &ctx).await?, Some(test_img("metadata key overwrite 1a")) ); // test vectored get on child timeline assert_eq!( get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?, None ); assert_eq!( get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?, Some(test_img("metadata key 2")) ); assert_eq!( get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?, None ); assert_eq!( get_vectored_impl_wrapper(&child, base_inherited_key, lsn, &ctx).await?, Some(test_img("metadata inherited key 1")) ); assert_eq!( get_vectored_impl_wrapper(&child, base_inherited_key_child, lsn, &ctx).await?, Some(test_img("metadata inherited key 2")) ); assert_eq!( get_vectored_impl_wrapper(&child, base_inherited_key_nonexist, lsn, &ctx).await?, None ); assert_eq!( get_vectored_impl_wrapper(&child, base_key_overwrite, lsn, &ctx).await?, Some(test_img("metadata key overwrite 2b")) ); assert_eq!( get_vectored_impl_wrapper(&child, base_inherited_key_overwrite, lsn, &ctx).await?, Some(test_img("metadata key overwrite 2a")) ); // test vectored scan on parent timeline let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); let query = VersionedKeySpaceQuery::uniform(KeySpace::single(Key::metadata_key_range()), lsn); let res = tline .get_vectored_impl(query, &mut reconstruct_state, &ctx) .await?; assert_eq!( res.into_iter() .map(|(k, v)| (k, v.unwrap())) .collect::>(), vec![ (base_inherited_key, test_img("metadata inherited key 1")), ( base_inherited_key_overwrite, test_img("metadata key overwrite 1a") ), (base_key, test_img("metadata key 1")), (base_key_overwrite, test_img("metadata key overwrite 1b")), ] ); // test vectored scan on child timeline let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); let query = VersionedKeySpaceQuery::uniform(KeySpace::single(Key::metadata_key_range()), lsn); let res = child .get_vectored_impl(query, &mut reconstruct_state, &ctx) .await?; assert_eq!( res.into_iter() .map(|(k, v)| 
(k, v.unwrap())) .collect::>(), vec![ (base_inherited_key, test_img("metadata inherited key 1")), ( base_inherited_key_child, test_img("metadata inherited key 2") ), ( base_inherited_key_overwrite, test_img("metadata key overwrite 2a") ), (base_key_child, test_img("metadata key 2")), (base_key_overwrite, test_img("metadata key overwrite 2b")), ] ); Ok(()) } async fn get_vectored_impl_wrapper( tline: &Arc, key: Key, lsn: Lsn, ctx: &RequestContext, ) -> Result, GetVectoredError> { let io_concurrency = IoConcurrency::spawn_from_conf( tline.conf.get_vectored_concurrent_io, tline.gate.enter().unwrap(), ); let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn); let mut res = tline .get_vectored_impl(query, &mut reconstruct_state, ctx) .await?; Ok(res.pop_last().map(|(k, v)| { assert_eq!(k, key); v.unwrap() })) } #[tokio::test] async fn test_metadata_tombstone_reads() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_tombstone_reads").await?; let (tenant, ctx) = harness.load().await; let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap(); // We emulate the situation that the compaction algorithm creates an image layer that removes the tombstones // Lsn 0x30 key0, key3, no key1+key2 // Lsn 0x20 key1+key2 tomestones // Lsn 0x10 key1 in image, key2 in delta let tline = tenant .create_test_timeline_with_layers( TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx, Vec::new(), // in-memory layers // delta layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x10)..Lsn(0x20), vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], ), DeltaLayerTestDesc::new_with_inferred_key_range( 
Lsn(0x20)..Lsn(0x30), vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], ), DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x20)..Lsn(0x30), vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], ), ], // image layers vec![ (Lsn(0x10), vec![(key1, test_img("metadata key 1"))]), ( Lsn(0x30), vec![ (key0, test_img("metadata key 0")), (key3, test_img("metadata key 3")), ], ), ], Lsn(0x30), ) .await?; let lsn = Lsn(0x30); let old_lsn = Lsn(0x20); assert_eq!( get_vectored_impl_wrapper(&tline, key0, lsn, &ctx).await?, Some(test_img("metadata key 0")) ); assert_eq!( get_vectored_impl_wrapper(&tline, key1, lsn, &ctx).await?, None, ); assert_eq!( get_vectored_impl_wrapper(&tline, key2, lsn, &ctx).await?, None, ); assert_eq!( get_vectored_impl_wrapper(&tline, key1, old_lsn, &ctx).await?, Some(Bytes::new()), ); assert_eq!( get_vectored_impl_wrapper(&tline, key2, old_lsn, &ctx).await?, Some(Bytes::new()), ); assert_eq!( get_vectored_impl_wrapper(&tline, key3, lsn, &ctx).await?, Some(test_img("metadata key 3")) ); Ok(()) } #[tokio::test] async fn test_metadata_tombstone_image_creation() { let harness = TenantHarness::create("test_metadata_tombstone_image_creation") .await .unwrap(); let (tenant, ctx) = harness.load().await; let io_concurrency = IoConcurrency::spawn_for_test(); let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap(); let tline = tenant .create_test_timeline_with_layers( TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx, Vec::new(), // in-memory layers // delta layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x10)..Lsn(0x20), vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], ), DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x20)..Lsn(0x30), vec![(key1, Lsn(0x20), 
Value::Image(Bytes::new()))], ), DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x20)..Lsn(0x30), vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], ), DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x30)..Lsn(0x40), vec![ (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))), (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))), ], ), ], // image layers vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], Lsn(0x40), ) .await .unwrap(); let cancel = CancellationToken::new(); // Image layer creation happens on the disk_consistent_lsn so we need to force set it now. tline.force_set_disk_consistent_lsn(Lsn(0x40)); tline .compact( &cancel, { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); flags }, &ctx, ) .await .unwrap(); // Image layers are created at repartition LSN let images = tline .inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone()) .await .unwrap() .into_iter() .filter(|(k, _)| k.is_metadata_key()) .collect::>(); assert_eq!(images.len(), 2); // the image layer should only contain two existing keys, tombstones should be removed. 
}

/// When every metadata key is tombstoned, forced image layer creation must leave
/// no metadata entries in the image layers at `Lsn(0x30)`.
#[tokio::test]
async fn test_metadata_tombstone_empty_image_creation() {
    let harness = TenantHarness::create("test_metadata_tombstone_empty_image_creation")
        .await
        .unwrap();
    let (tenant, ctx) = harness.load().await;
    let io_concurrency = IoConcurrency::spawn_for_test();

    let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
    let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();

    let tline = tenant
        .create_test_timeline_with_layers(
            TIMELINE_ID,
            Lsn(0x10),
            DEFAULT_PG_VERSION,
            &ctx,
            Vec::new(), // in-memory layers
            // delta layers
            vec![
                DeltaLayerTestDesc::new_with_inferred_key_range(
                    Lsn(0x10)..Lsn(0x20),
                    vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
                ),
                DeltaLayerTestDesc::new_with_inferred_key_range(
                    Lsn(0x20)..Lsn(0x30),
                    vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
                ),
                DeltaLayerTestDesc::new_with_inferred_key_range(
                    Lsn(0x20)..Lsn(0x30),
                    vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
                ),
            ],
            // image layers
            vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
            Lsn(0x30),
        )
        .await
        .unwrap();

    let cancel = CancellationToken::new();

    tline
        .compact(
            &cancel,
            {
                let mut flags = EnumSet::new();
                flags.insert(CompactFlags::ForceImageLayerCreation);
                flags.insert(CompactFlags::ForceRepartition);
                flags
            },
            &ctx,
        )
        .await
        .unwrap();

    // Image layers are created at last_record_lsn
    let images = tline
        .inspect_image_layers(Lsn(0x30), &ctx, io_concurrency.clone())
        .await
        .unwrap()
        .into_iter()
        .filter(|(k, _)| k.is_metadata_key())
        .collect::<Vec<_>>();
    assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created
}

/// Runs gc-compaction over one image layer and three image-only delta layers and
/// checks that reads are unchanged, that the image layer at the GC horizon holds
/// the expected values, and that the resulting layer set is as expected.
#[tokio::test]
async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> {
    let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?;
    let (tenant, ctx) = harness.load().await;
    let io_concurrency = IoConcurrency::spawn_for_test();

    fn get_key(id: u32) -> Key {
        // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
        let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
        key.field6 = id;
        key
    }

    // We create
    // - one bottom-most image layer,
    // - a delta layer D1 crossing the GC horizon with data below and above the horizon,
    // - a delta layer D2 crossing the GC horizon with data only below the horizon,
    // - a delta layer D3 above the horizon.
    //
    //                             | D3 |
    //  | D1 |
    // -|    |-- gc horizon -----------------
    //  |    |                | D2 |
    // --------- img layer ------------------
    //
    // What we should expect from this compaction is:
    //                | D3 |
    //  | Part of D1 |
    // --------- img layer with D1+D2 at GC horizon------------------

    // img layer at 0x10
    let img_layer = (0..10)
        .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
        .collect_vec();

    let delta1 = vec![
        (
            get_key(1),
            Lsn(0x20),
            Value::Image(Bytes::from("value 1@0x20")),
        ),
        (
            get_key(2),
            Lsn(0x30),
            Value::Image(Bytes::from("value 2@0x30")),
        ),
        (
            get_key(3),
            Lsn(0x40),
            Value::Image(Bytes::from("value 3@0x40")),
        ),
    ];
    let delta2 = vec![
        (
            get_key(5),
            Lsn(0x20),
            Value::Image(Bytes::from("value 5@0x20")),
        ),
        (
            get_key(6),
            Lsn(0x20),
            Value::Image(Bytes::from("value 6@0x20")),
        ),
    ];
    let delta3 = vec![
        (
            get_key(8),
            Lsn(0x48),
            Value::Image(Bytes::from("value 8@0x48")),
        ),
        (
            get_key(9),
            Lsn(0x48),
            Value::Image(Bytes::from("value 9@0x48")),
        ),
    ];

    let tline = tenant
        .create_test_timeline_with_layers(
            TIMELINE_ID,
            Lsn(0x10),
            DEFAULT_PG_VERSION,
            &ctx,
            Vec::new(), // in-memory layers
            vec![
                DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
                DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
                DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
            ], // delta layers
            vec![(Lsn(0x10), img_layer)], // image layers
            Lsn(0x50),
        )
        .await?;
    {
        tline
            .applied_gc_cutoff_lsn
            .lock_for_write()
            .store_and_unlock(Lsn(0x30))
            .wait()
            .await;
        // Update GC info
        let mut guard = tline.gc_info.write().unwrap();
        guard.cutoffs.time = Some(Lsn(0x30));
        guard.cutoffs.space = Lsn(0x30);
    }

    // Latest value of every key as seen at Lsn(0x50).
    let expected_result = [
        Bytes::from_static(b"value 0@0x10"),
        Bytes::from_static(b"value 1@0x20"),
        Bytes::from_static(b"value 2@0x30"),
        Bytes::from_static(b"value 3@0x40"),
        Bytes::from_static(b"value 4@0x10"),
        Bytes::from_static(b"value 5@0x20"),
        Bytes::from_static(b"value 6@0x20"),
        Bytes::from_static(b"value 7@0x10"),
        Bytes::from_static(b"value 8@0x48"),
        Bytes::from_static(b"value 9@0x48"),
    ];

    for (idx, expected) in expected_result.iter().enumerate() {
        assert_eq!(
            tline
                .get(get_key(idx as u32), Lsn(0x50), &ctx)
                .await
                .unwrap(),
            expected
        );
    }

    let cancel = CancellationToken::new();
    tline
        .compact_with_gc(
            &cancel,
            CompactOptions::default_for_gc_compaction_unit_tests(),
            &ctx,
        )
        .await
        .unwrap();

    // Compaction must not change what a read at Lsn(0x50) returns.
    for (idx, expected) in expected_result.iter().enumerate() {
        assert_eq!(
            tline
                .get(get_key(idx as u32), Lsn(0x50), &ctx)
                .await
                .unwrap(),
            expected
        );
    }

    // Check if the image layer at the GC horizon contains exactly what we want
    let image_at_gc_horizon = tline
        .inspect_image_layers(Lsn(0x30), &ctx, io_concurrency.clone())
        .await
        .unwrap()
        .into_iter()
        .filter(|(k, _)| k.is_metadata_key())
        .collect::<Vec<_>>();

    assert_eq!(image_at_gc_horizon.len(), 10);
    // Values as of the GC horizon (0x30): writes above it (3@0x40, 8@0x48, 9@0x48) are not included.
    let expected_result = [
        Bytes::from_static(b"value 0@0x10"),
        Bytes::from_static(b"value 1@0x20"),
        Bytes::from_static(b"value 2@0x30"),
        Bytes::from_static(b"value 3@0x10"),
        Bytes::from_static(b"value 4@0x10"),
        Bytes::from_static(b"value 5@0x20"),
        Bytes::from_static(b"value 6@0x20"),
        Bytes::from_static(b"value 7@0x10"),
        Bytes::from_static(b"value 8@0x10"),
        Bytes::from_static(b"value 9@0x10"),
    ];
    for (idx, expected) in expected_result.iter().enumerate() {
        assert_eq!(
            image_at_gc_horizon[idx],
            (get_key(idx as u32), expected.clone())
        );
    }

    // Check if old layers are removed / new layers have the expected LSN
    let all_layers = inspect_and_sort(&tline, None).await;
    assert_eq!(
        all_layers,
        vec![
            // Image layer at GC horizon
            PersistentLayerKey {
                key_range: Key::MIN..Key::MAX,
                lsn_range: Lsn(0x30)..Lsn(0x31),
                is_delta: false
            },
            // The delta layer above the horizon (the remaining part of D1)
            PersistentLayerKey {
                key_range: get_key(3)..get_key(4),
                lsn_range: Lsn(0x30)..Lsn(0x48),
                is_delta: true
            },
            // The delta3 layer that should not be picked for the compaction
            PersistentLayerKey {
                key_range: get_key(8)..get_key(10),
                lsn_range: Lsn(0x48)..Lsn(0x50),
                is_delta: true
            }
        ]
    );

    // increase GC horizon and compact again
    {
        tline
            .applied_gc_cutoff_lsn
            .lock_for_write()
            .store_and_unlock(Lsn(0x40))
            .wait()
            .await;
        // Update GC info
        let mut guard = tline.gc_info.write().unwrap();
        guard.cutoffs.time = Some(Lsn(0x40));
        guard.cutoffs.space = Lsn(0x40);
    }
    tline
        .compact_with_gc(
            &cancel,
            CompactOptions::default_for_gc_compaction_unit_tests(),
            &ctx,
        )
        .await
        .unwrap();

    Ok(())
}

#[cfg(feature = "testing")]
#[tokio::test]
async fn test_neon_test_record() -> anyhow::Result<()> {
    let harness = TenantHarness::create("test_neon_test_record").await?;
    let (tenant, ctx) = harness.load().await;

    fn get_key(id: u32) -> Key {
        // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); key.field6 = id; key } let delta1 = vec![ ( get_key(1), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append(",0x20")), ), ( get_key(1), Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append(",0x30")), ), (get_key(2), Lsn(0x10), Value::Image("0x10".into())), ( get_key(2), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append(",0x20")), ), ( get_key(2), Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append(",0x30")), ), (get_key(3), Lsn(0x10), Value::Image("0x10".into())), ( get_key(3), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_clear("c")), ), (get_key(4), Lsn(0x10), Value::Image("0x10".into())), ( get_key(4), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_init("i")), ), ( get_key(4), Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append_conditional("j", "i")), ), ( get_key(5), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_init("1")), ), ( get_key(5), Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append_conditional("j", "2")), ), ]; let image1 = vec![(get_key(1), "0x10".into())]; let tline = tenant .create_test_timeline_with_layers( TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx, Vec::new(), // in-memory layers vec![DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x10)..Lsn(0x40), delta1, )], // delta layers vec![(Lsn(0x10), image1)], // image layers Lsn(0x50), ) .await?; assert_eq!( tline.get(get_key(1), Lsn(0x50), &ctx).await?, Bytes::from_static(b"0x10,0x20,0x30") ); assert_eq!( tline.get(get_key(2), Lsn(0x50), &ctx).await?, Bytes::from_static(b"0x10,0x20,0x30") ); // Need to remove the limit of "Neon WAL redo requires base image". assert_eq!( tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::from_static(b"c") ); assert_eq!( tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::from_static(b"ij") ); // Manual testing required: currently, read errors will panic the process in debug mode. So we // cannot enable this assertion in the unit test. 
// assert!(tline.get(get_key(5), Lsn(0x50), &ctx).await.is_err()); Ok(()) } #[tokio::test] async fn test_lsn_lease() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_lsn_lease") .await .unwrap() .load() .await; // set a non-zero lease length to test the feature tenant .update_tenant_config(|mut conf| { conf.lsn_lease_length = Some(LsnLease::DEFAULT_LENGTH); Ok(conf) }) .unwrap(); let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let end_lsn = Lsn(0x100); let image_layers = (0x20..=0x90) .step_by(0x10) .map(|n| (Lsn(n), vec![(key, test_img(&format!("data key at {n:x}")))])) .collect(); let timeline = tenant .create_test_timeline_with_layers( TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx, Vec::new(), // in-memory layers Vec::new(), image_layers, end_lsn, ) .await?; let leased_lsns = [0x30, 0x50, 0x70]; let mut leases = Vec::new(); leased_lsns.iter().for_each(|n| { leases.push( timeline .init_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx) .expect("lease request should succeed"), ); }); let updated_lease_0 = timeline .renew_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx) .expect("lease renewal should succeed"); assert_eq!( updated_lease_0.valid_until, leases[0].valid_until, " Renewing with shorter lease should not change the lease." ); let updated_lease_1 = timeline .renew_lsn_lease( Lsn(leased_lsns[1]), timeline.get_lsn_lease_length() * 2, &ctx, ) .expect("lease renewal should succeed"); assert!( updated_lease_1.valid_until > leases[1].valid_until, "Renewing with a long lease should renew lease with later expiration time." ); // Force set disk consistent lsn so we can get the cutoff at `end_lsn`. 
info!( "applied_gc_cutoff_lsn: {}", *timeline.get_applied_gc_cutoff_lsn() ); timeline.force_set_disk_consistent_lsn(end_lsn); let res = tenant .gc_iteration( Some(TIMELINE_ID), 0, Duration::ZERO, &CancellationToken::new(), &ctx, ) .await .unwrap(); // Keeping everything < Lsn(0x80) b/c leases: // 0/10: initdb layer // (0/20..=0/70).step_by(0x10): image layers added when creating the timeline. assert_eq!(res.layers_needed_by_leases, 7); // Keeping 0/90 b/c it is the latest layer. assert_eq!(res.layers_not_updated, 1); // Removed 0/80. assert_eq!(res.layers_removed, 1); // Make lease on an already GC-ed LSN. // 0/80 does not have a valid lease + is below the applied GC cutoff assert!(Lsn(0x80) < *timeline.get_applied_gc_cutoff_lsn()); timeline .init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx) .expect_err("lease request on GC-ed LSN should fail"); // Should still be able to renew a currently valid lease // Assumption: the original lease is still valid for 0/50. // (use `Timeline::init_lsn_lease` for testing so it always does validation) timeline .init_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx) .expect("lease renewal with validation should succeed"); Ok(()) } /* A flush that fails must leave the disk/remote consistent LSNs unchanged. */ #[tokio::test] async fn test_failed_flush_should_not_update_disk_consistent_lsn() -> anyhow::Result<()> { // // Setup // /* harness name mirrors the test function name */ let harness = TenantHarness::create_custom( "test_failed_flush_should_not_update_disk_consistent_lsn", pageserver_api::models::TenantConfig::default(), TenantId::generate(), ShardIdentity::new(ShardNumber(0), ShardCount(4), ShardStripeSize(128)).unwrap(), Generation::new(1), ) .await?; let (tenant, ctx) = harness.load().await; let timeline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; assert_eq!(timeline.get_shard_identity().count, ShardCount(4)); let mut writer = timeline.writer().await; writer .put( *TEST_KEY, Lsn(0x20), &Value::Image(test_img("foo at 0x20")), &ctx, ) .await?; writer.finish_write(Lsn(0x20)); 
drop(writer); timeline.freeze_and_flush().await.unwrap(); timeline.remote_client.wait_completion().await.unwrap(); let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); let remote_consistent_lsn = timeline.get_remote_consistent_lsn_projected(); assert_eq!(Some(disk_consistent_lsn), remote_consistent_lsn); // // Test // let mut writer = timeline.writer().await; writer .put( *TEST_KEY, Lsn(0x30), &Value::Image(test_img("foo at 0x30")), &ctx, ) .await?; writer.finish_write(Lsn(0x30)); drop(writer); fail::cfg( "flush-layer-before-update-remote-consistent-lsn", "return()", ) .unwrap(); let flush_res = timeline.freeze_and_flush().await; // if flush failed, the disk/remote consistent LSN should not be updated assert!(flush_res.is_err()); assert_eq!(disk_consistent_lsn, timeline.get_disk_consistent_lsn()); assert_eq!( remote_consistent_lsn, timeline.get_remote_consistent_lsn_projected() ); Ok(()) } #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_deltas_1() -> anyhow::Result<()> { test_simple_bottom_most_compaction_deltas_helper( "test_simple_bottom_most_compaction_deltas_1", false, ) .await } #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_deltas_2() -> anyhow::Result<()> { test_simple_bottom_most_compaction_deltas_helper( "test_simple_bottom_most_compaction_deltas_2", true, ) .await } #[cfg(feature = "testing")] async fn test_simple_bottom_most_compaction_deltas_helper( test_name: &'static str, use_delta_bottom_layer: bool, ) -> anyhow::Result<()> { let harness = TenantHarness::create(test_name).await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. 
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); key.field6 = id; key } // We create // - one bottom-most image layer, // - a delta layer D1 crossing the GC horizon with data below and above the horizon, // - a delta layer D2 crossing the GC horizon with data only below the horizon, // - a delta layer D3 above the horizon. // // | D3 | // | D1 | // -| |-- gc horizon ----------------- // | | | D2 | // --------- img layer ------------------ // // What we should expect from this compaction is: // | D3 | // | Part of D1 | // --------- img layer with D1+D2 at GC horizon------------------ // img layer at 0x10 let img_layer = (0..10) .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) .collect_vec(); // or, if `use_delta_bottom_layer` is true, a bottom delta layer whose init records are written at 0x08 (values are still labelled @0x10) let delta4 = (0..10) .map(|id| { ( get_key(id), Lsn(0x08), Value::WalRecord(NeonWalRecord::wal_init(format!("value {id}@0x10"))), ) }) .collect_vec(); let delta1 = vec![ ( get_key(1), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append("@0x20")), ), ( get_key(2), Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append("@0x30")), ), ( get_key(3), Lsn(0x28), Value::WalRecord(NeonWalRecord::wal_append("@0x28")), ), ( get_key(3), Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append("@0x30")), ), ( get_key(3), Lsn(0x40), Value::WalRecord(NeonWalRecord::wal_append("@0x40")), ), ]; let delta2 = vec![ ( get_key(5), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append("@0x20")), ), ( get_key(6), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append("@0x20")), ), ]; let delta3 = vec![ ( get_key(8), Lsn(0x48), Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ( get_key(9), Lsn(0x48), Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ]; let tline = if use_delta_bottom_layer { tenant .create_test_timeline_with_layers( TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx, Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( 
Lsn(0x08)..Lsn(0x10), delta4, ), DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x20)..Lsn(0x48), delta1, ), DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x20)..Lsn(0x48), delta2, ), DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x48)..Lsn(0x50), delta3, ), ], // delta layers vec![], // image layers Lsn(0x50), ) .await? } else { tenant .create_test_timeline_with_layers( TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx, Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x10)..Lsn(0x48), delta1, ), DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x10)..Lsn(0x48), delta2, ), DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x48)..Lsn(0x50), delta3, ), ], // delta layers vec![(Lsn(0x10), img_layer)], // image layers Lsn(0x50), ) .await? }; { tline .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() .await; // Update GC info let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![], cutoffs: GcCutoffs { time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), within_ancestor_pitr: false, }; } let expected_result = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20"), Bytes::from_static(b"value 2@0x10@0x30"), Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10@0x20"), Bytes::from_static(b"value 6@0x10@0x20"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10@0x48"), Bytes::from_static(b"value 9@0x10@0x48"), ]; let expected_result_at_gc_horizon = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20"), Bytes::from_static(b"value 2@0x10@0x30"), Bytes::from_static(b"value 3@0x10@0x28@0x30"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10@0x20"), Bytes::from_static(b"value 6@0x10@0x20"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10"), 
Bytes::from_static(b"value 9@0x10"), ]; for idx in 0..10 { assert_eq!( tline .get(get_key(idx as u32), Lsn(0x50), &ctx) .await .unwrap(), &expected_result[idx] ); assert_eq!( tline .get(get_key(idx as u32), Lsn(0x30), &ctx) .await .unwrap(), &expected_result_at_gc_horizon[idx] ); } let cancel = CancellationToken::new(); tline .compact_with_gc( &cancel, CompactOptions::default_for_gc_compaction_unit_tests(), &ctx, ) .await .unwrap(); for idx in 0..10 { assert_eq!( tline .get(get_key(idx as u32), Lsn(0x50), &ctx) .await .unwrap(), &expected_result[idx] ); assert_eq!( tline .get(get_key(idx as u32), Lsn(0x30), &ctx) .await .unwrap(), &expected_result_at_gc_horizon[idx] ); } // increase GC horizon and compact again { tline .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x40)) .wait() .await; // Update GC info let mut guard = tline.gc_info.write().unwrap(); guard.cutoffs.time = Some(Lsn(0x40)); guard.cutoffs.space = Lsn(0x40); } tline .compact_with_gc( &cancel, CompactOptions::default_for_gc_compaction_unit_tests(), &ctx, ) .await .unwrap(); Ok(()) } #[cfg(feature = "testing")] #[tokio::test] async fn test_generate_key_retention() -> anyhow::Result<()> { let harness = TenantHarness::create("test_generate_key_retention").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; tline.force_advance_lsn(Lsn(0x70)); let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let history = vec![ ( key, Lsn(0x10), Value::WalRecord(NeonWalRecord::wal_init("0x10")), ), ( key, Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append(";0x20")), ), ( key, Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append(";0x30")), ), ( key, Lsn(0x40), Value::WalRecord(NeonWalRecord::wal_append(";0x40")), ), ( key, Lsn(0x50), Value::WalRecord(NeonWalRecord::wal_append(";0x50")), ), ( key, Lsn(0x60), Value::WalRecord(NeonWalRecord::wal_append(";0x60")), ), ( key, Lsn(0x70), 
Value::WalRecord(NeonWalRecord::wal_append(";0x70")), ), ( key, Lsn(0x80), Value::Image(Bytes::copy_from_slice( b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", )), ), ( key, Lsn(0x90), Value::WalRecord(NeonWalRecord::wal_append(";0x90")), ), ]; let res = tline .generate_key_retention( key, &history, Lsn(0x60), &[Lsn(0x20), Lsn(0x40), Lsn(0x50)], 3, None, true, ) .await .unwrap(); let expected_res = KeyHistoryRetention { below_horizon: vec![ ( Lsn(0x20), KeyLogAtLsn(vec![( Lsn(0x20), Value::Image(Bytes::from_static(b"0x10;0x20")), )]), ), ( Lsn(0x40), KeyLogAtLsn(vec![ ( Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append(";0x30")), ), ( Lsn(0x40), Value::WalRecord(NeonWalRecord::wal_append(";0x40")), ), ]), ), ( Lsn(0x50), KeyLogAtLsn(vec![( Lsn(0x50), Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40;0x50")), )]), ), ( Lsn(0x60), KeyLogAtLsn(vec![( Lsn(0x60), Value::WalRecord(NeonWalRecord::wal_append(";0x60")), )]), ), ], above_horizon: KeyLogAtLsn(vec![ ( Lsn(0x70), Value::WalRecord(NeonWalRecord::wal_append(";0x70")), ), ( Lsn(0x80), Value::Image(Bytes::copy_from_slice( b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", )), ), ( Lsn(0x90), Value::WalRecord(NeonWalRecord::wal_append(";0x90")), ), ]), }; assert_eq!(res, expected_res); // We expect GC-compaction to run with the original GC. This would create a situation that // the original GC algorithm removes some delta layers b/c there are full image coverage, // therefore causing some keys to have an incomplete history below the lowest retain LSN. // For example, we have // ```plain // init delta @ 0x10, image @ 0x20, delta @ 0x30 (gc_horizon), image @ 0x40. // ``` // Now the GC horizon moves up, and we have // ```plain // init delta @ 0x10, image @ 0x20, delta @ 0x30, image @ 0x40 (gc_horizon) // ``` // The original GC algorithm kicks in, and removes delta @ 0x10, image @ 0x20. 
// We will end up with // ```plain // delta @ 0x30, image @ 0x40 (gc_horizon) // ``` // Now we run the GC-compaction, and this key does not have a full history. // We should be able to handle this partial history and drop everything before the // gc_horizon image. let history = vec![ ( key, Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append(";0x20")), ), ( key, Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append(";0x30")), ), ( key, Lsn(0x40), Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), ), ( key, Lsn(0x50), Value::WalRecord(NeonWalRecord::wal_append(";0x50")), ), ( key, Lsn(0x60), Value::WalRecord(NeonWalRecord::wal_append(";0x60")), ), ( key, Lsn(0x70), Value::WalRecord(NeonWalRecord::wal_append(";0x70")), ), ( key, Lsn(0x80), Value::Image(Bytes::copy_from_slice( b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", )), ), ( key, Lsn(0x90), Value::WalRecord(NeonWalRecord::wal_append(";0x90")), ), ]; let res = tline .generate_key_retention( key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None, true, ) .await .unwrap(); let expected_res = KeyHistoryRetention { below_horizon: vec![ ( Lsn(0x40), KeyLogAtLsn(vec![( Lsn(0x40), Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), )]), ), ( Lsn(0x50), KeyLogAtLsn(vec![( Lsn(0x50), Value::WalRecord(NeonWalRecord::wal_append(";0x50")), )]), ), ( Lsn(0x60), KeyLogAtLsn(vec![( Lsn(0x60), Value::WalRecord(NeonWalRecord::wal_append(";0x60")), )]), ), ], above_horizon: KeyLogAtLsn(vec![ ( Lsn(0x70), Value::WalRecord(NeonWalRecord::wal_append(";0x70")), ), ( Lsn(0x80), Value::Image(Bytes::copy_from_slice( b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", )), ), ( Lsn(0x90), Value::WalRecord(NeonWalRecord::wal_append(";0x90")), ), ]), }; assert_eq!(res, expected_res); // In case of branch compaction, the branch itself does not have the full history, and we need to provide // the ancestor image in the test case. 
let history = vec![ ( key, Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append(";0x20")), ), ( key, Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append(";0x30")), ), ( key, Lsn(0x40), Value::WalRecord(NeonWalRecord::wal_append(";0x40")), ), ( key, Lsn(0x70), Value::WalRecord(NeonWalRecord::wal_append(";0x70")), ), ]; let res = tline .generate_key_retention( key, &history, Lsn(0x60), &[], 3, Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))), true, ) .await .unwrap(); let expected_res = KeyHistoryRetention { below_horizon: vec![( Lsn(0x60), KeyLogAtLsn(vec![( Lsn(0x60), Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page )]), )], above_horizon: KeyLogAtLsn(vec![( Lsn(0x70), Value::WalRecord(NeonWalRecord::wal_append(";0x70")), )]), }; assert_eq!(res, expected_res); let history = vec![ ( key, Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append(";0x20")), ), ( key, Lsn(0x40), Value::WalRecord(NeonWalRecord::wal_append(";0x40")), ), ( key, Lsn(0x60), Value::WalRecord(NeonWalRecord::wal_append(";0x60")), ), ( key, Lsn(0x70), Value::WalRecord(NeonWalRecord::wal_append(";0x70")), ), ]; let res = tline .generate_key_retention( key, &history, Lsn(0x60), &[Lsn(0x30)], 3, Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))), true, ) .await .unwrap(); let expected_res = KeyHistoryRetention { below_horizon: vec![ ( Lsn(0x30), KeyLogAtLsn(vec![( Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append(";0x20")), )]), ), ( Lsn(0x60), KeyLogAtLsn(vec![( Lsn(0x60), Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")), )]), ), ], above_horizon: KeyLogAtLsn(vec![( Lsn(0x70), Value::WalRecord(NeonWalRecord::wal_append(";0x70")), )]), }; assert_eq!(res, expected_res); Ok(()) } #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_with_retain_lsns() -> anyhow::Result<()> { let harness = TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns").await?; let 
(tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); key.field6 = id; key } let img_layer = (0..10) .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) .collect_vec(); let delta1 = vec![ ( get_key(1), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append("@0x20")), ), ( get_key(2), Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append("@0x30")), ), ( get_key(3), Lsn(0x28), Value::WalRecord(NeonWalRecord::wal_append("@0x28")), ), ( get_key(3), Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append("@0x30")), ), ( get_key(3), Lsn(0x40), Value::WalRecord(NeonWalRecord::wal_append("@0x40")), ), ]; let delta2 = vec![ ( get_key(5), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append("@0x20")), ), ( get_key(6), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append("@0x20")), ), ]; let delta3 = vec![ ( get_key(8), Lsn(0x48), Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ( get_key(9), Lsn(0x48), Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ]; let tline = tenant .create_test_timeline_with_layers( TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx, Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), ], // delta layers vec![(Lsn(0x10), img_layer)], // image layers Lsn(0x50), ) .await?; { tline .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() .await; // Update GC info let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![ (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: 
Default::default(), within_ancestor_pitr: false, }; } let expected_result = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20"), Bytes::from_static(b"value 2@0x10@0x30"), Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10@0x20"), Bytes::from_static(b"value 6@0x10@0x20"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10@0x48"), Bytes::from_static(b"value 9@0x10@0x48"), ]; let expected_result_at_gc_horizon = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20"), Bytes::from_static(b"value 2@0x10@0x30"), Bytes::from_static(b"value 3@0x10@0x28@0x30"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10@0x20"), Bytes::from_static(b"value 6@0x10@0x20"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10"), Bytes::from_static(b"value 9@0x10"), ]; let expected_result_at_lsn_20 = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20"), Bytes::from_static(b"value 2@0x10"), Bytes::from_static(b"value 3@0x10"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10@0x20"), Bytes::from_static(b"value 6@0x10@0x20"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10"), Bytes::from_static(b"value 9@0x10"), ]; let expected_result_at_lsn_10 = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10"), Bytes::from_static(b"value 2@0x10"), Bytes::from_static(b"value 3@0x10"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10"), Bytes::from_static(b"value 6@0x10"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10"), Bytes::from_static(b"value 9@0x10"), ]; let verify_result = || async { let gc_horizon = { let gc_info = tline.gc_info.read().unwrap(); gc_info.cutoffs.time.unwrap_or_default() }; for idx in 0..10 { assert_eq!( tline .get(get_key(idx as u32), Lsn(0x50), &ctx) .await 
.unwrap(), &expected_result[idx] ); assert_eq!( tline .get(get_key(idx as u32), gc_horizon, &ctx) .await .unwrap(), &expected_result_at_gc_horizon[idx] ); assert_eq!( tline .get(get_key(idx as u32), Lsn(0x20), &ctx) .await .unwrap(), &expected_result_at_lsn_20[idx] ); assert_eq!( tline .get(get_key(idx as u32), Lsn(0x10), &ctx) .await .unwrap(), &expected_result_at_lsn_10[idx] ); } }; verify_result().await; let cancel = CancellationToken::new(); let mut dryrun_flags = EnumSet::new(); dryrun_flags.insert(CompactFlags::DryRun); tline .compact_with_gc( &cancel, CompactOptions { flags: dryrun_flags, ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) .await .unwrap(); // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests. verify_result().await; tline .compact_with_gc( &cancel, CompactOptions::default_for_gc_compaction_unit_tests(), &ctx, ) .await .unwrap(); verify_result().await; // compact again tline .compact_with_gc( &cancel, CompactOptions::default_for_gc_compaction_unit_tests(), &ctx, ) .await .unwrap(); verify_result().await; // increase GC horizon and compact again { tline .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x38)) .wait() .await; // Update GC info let mut guard = tline.gc_info.write().unwrap(); guard.cutoffs.time = Some(Lsn(0x38)); guard.cutoffs.space = Lsn(0x38); } tline .compact_with_gc( &cancel, CompactOptions::default_for_gc_compaction_unit_tests(), &ctx, ) .await .unwrap(); verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result // not increasing the GC horizon and compact again tline .compact_with_gc( &cancel, CompactOptions::default_for_gc_compaction_unit_tests(), &ctx, ) .await .unwrap(); verify_result().await; Ok(()) } #[cfg(feature = "testing")] #[tokio::test] async fn 
test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()> { let harness = TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns_single_key") .await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); key.field6 = id; key } let img_layer = (0..10) .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) .collect_vec(); let delta1 = vec![ ( get_key(1), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append("@0x20")), ), ( get_key(1), Lsn(0x28), Value::WalRecord(NeonWalRecord::wal_append("@0x28")), ), ]; let delta2 = vec![ ( get_key(1), Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append("@0x30")), ), ( get_key(1), Lsn(0x38), Value::WalRecord(NeonWalRecord::wal_append("@0x38")), ), ]; let delta3 = vec![ ( get_key(8), Lsn(0x48), Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ( get_key(9), Lsn(0x48), Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ]; let tline = tenant .create_test_timeline_with_layers( TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx, Vec::new(), // in-memory layers vec![ // delta1 and delta 2 only contain a single key but multiple updates DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x50), delta3), ], // delta layers vec![(Lsn(0x10), img_layer)], // image layers Lsn(0x50), ) .await?; { tline .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() .await; // Update GC info let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![ (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { time: Some(Lsn(0x30)), 
space: Lsn(0x30), }, leases: Default::default(), within_ancestor_pitr: false, }; } let expected_result = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"), Bytes::from_static(b"value 2@0x10"), Bytes::from_static(b"value 3@0x10"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10"), Bytes::from_static(b"value 6@0x10"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10@0x48"), Bytes::from_static(b"value 9@0x10@0x48"), ]; let expected_result_at_gc_horizon = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"), Bytes::from_static(b"value 2@0x10"), Bytes::from_static(b"value 3@0x10"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10"), Bytes::from_static(b"value 6@0x10"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10"), Bytes::from_static(b"value 9@0x10"), ]; let expected_result_at_lsn_20 = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20"), Bytes::from_static(b"value 2@0x10"), Bytes::from_static(b"value 3@0x10"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10"), Bytes::from_static(b"value 6@0x10"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10"), Bytes::from_static(b"value 9@0x10"), ]; let expected_result_at_lsn_10 = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10"), Bytes::from_static(b"value 2@0x10"), Bytes::from_static(b"value 3@0x10"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10"), Bytes::from_static(b"value 6@0x10"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10"), Bytes::from_static(b"value 9@0x10"), ]; let verify_result = || async { let gc_horizon = { let gc_info = tline.gc_info.read().unwrap(); gc_info.cutoffs.time.unwrap_or_default() }; for idx in 0..10 { assert_eq!( tline .get(get_key(idx as u32), Lsn(0x50), &ctx) .await .unwrap(), 
&expected_result[idx] ); assert_eq!( tline .get(get_key(idx as u32), gc_horizon, &ctx) .await .unwrap(), &expected_result_at_gc_horizon[idx] ); assert_eq!( tline .get(get_key(idx as u32), Lsn(0x20), &ctx) .await .unwrap(), &expected_result_at_lsn_20[idx] ); assert_eq!( tline .get(get_key(idx as u32), Lsn(0x10), &ctx) .await .unwrap(), &expected_result_at_lsn_10[idx] ); } }; verify_result().await; let cancel = CancellationToken::new(); let mut dryrun_flags = EnumSet::new(); dryrun_flags.insert(CompactFlags::DryRun); tline .compact_with_gc( &cancel, CompactOptions { flags: dryrun_flags, ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) .await .unwrap(); // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests. verify_result().await; tline .compact_with_gc( &cancel, CompactOptions::default_for_gc_compaction_unit_tests(), &ctx, ) .await .unwrap(); verify_result().await; // compact again tline .compact_with_gc( &cancel, CompactOptions::default_for_gc_compaction_unit_tests(), &ctx, ) .await .unwrap(); verify_result().await; Ok(()) } #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> { use models::CompactLsnRange; let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); key.field6 = id; key } let img_layer = (0..10) .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) .collect_vec(); let delta1 = vec![ ( get_key(1), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append("@0x20")), ), ( get_key(2), Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append("@0x30")), ), ( get_key(3), Lsn(0x28), 
Value::WalRecord(NeonWalRecord::wal_append("@0x28")), ), ( get_key(3), Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append("@0x30")), ), ( get_key(3), Lsn(0x40), Value::WalRecord(NeonWalRecord::wal_append("@0x40")), ), ]; let delta2 = vec![ ( get_key(5), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append("@0x20")), ), ( get_key(6), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append("@0x20")), ), ]; let delta3 = vec![ ( get_key(8), Lsn(0x48), Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ( get_key(9), Lsn(0x48), Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ]; let parent_tline = tenant .create_test_timeline_with_layers( TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx, vec![], // in-memory layers vec![], // delta layers vec![(Lsn(0x18), img_layer)], // image layers Lsn(0x18), ) .await?; parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10))); let branch_tline = tenant .branch_timeline_test_with_layers( &parent_tline, NEW_TIMELINE_ID, Some(Lsn(0x18)), &ctx, vec![ DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), ], // delta layers vec![], // image layers Lsn(0x50), ) .await?; branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10))); { parent_tline .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x10)) .wait() .await; // Update GC info let mut guard = parent_tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { time: Some(Lsn(0x10)), space: Lsn(0x10), }, leases: Default::default(), within_ancestor_pitr: false, }; } { branch_tline .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x50)) .wait() .await; // Update GC info let mut guard = branch_tline.gc_info.write().unwrap(); 
*guard = GcInfo { retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { time: Some(Lsn(0x50)), space: Lsn(0x50), }, leases: Default::default(), within_ancestor_pitr: false, }; } let expected_result_at_gc_horizon = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20"), Bytes::from_static(b"value 2@0x10@0x30"), Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10@0x20"), Bytes::from_static(b"value 6@0x10@0x20"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10@0x48"), Bytes::from_static(b"value 9@0x10@0x48"), ]; let expected_result_at_lsn_40 = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20"), Bytes::from_static(b"value 2@0x10@0x30"), Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10@0x20"), Bytes::from_static(b"value 6@0x10@0x20"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10"), Bytes::from_static(b"value 9@0x10"), ]; let verify_result = || async { for idx in 0..10 { assert_eq!( branch_tline .get(get_key(idx as u32), Lsn(0x50), &ctx) .await .unwrap(), &expected_result_at_gc_horizon[idx] ); assert_eq!( branch_tline .get(get_key(idx as u32), Lsn(0x40), &ctx) .await .unwrap(), &expected_result_at_lsn_40[idx] ); } }; verify_result().await; let cancel = CancellationToken::new(); branch_tline .compact_with_gc( &cancel, CompactOptions::default_for_gc_compaction_unit_tests(), &ctx, ) .await .unwrap(); verify_result().await; // Piggyback a compaction with above_lsn. Ensure it works correctly when the specified LSN intersects with the layer files. // Now we already have a single large delta layer, so the compaction min_layer_lsn should be the same as ancestor LSN (0x18). 
branch_tline
            .compact_with_gc(
                &cancel,
                CompactOptions {
                    compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x40))),
                    ..CompactOptions::default_for_gc_compaction_unit_tests()
                },
                &ctx,
            )
            .await
            .unwrap();

        verify_result().await;

        Ok(())
    }

    // Regression test for https://github.com/neondatabase/neon/issues/9012
    // Create an image arrangement where we have to read at different LSN ranges
    // from a delta layer. This is achieved by overlapping an image layer on top of
    // a delta layer. Like so:
    //
    //     A      B
    // +----------------+ -> delta_layer
    // |                |                           ^ lsn
    // |       =========|-> nested_image_layer      |
    // |       C        |                           |
    // +----------------+                           |
    // ======== -> baseline_image_layer             +-------> key
    //
    //
    // When querying the key range [A, B) we need to read at different LSN ranges
    // for [A, C) and [C, B). This test checks that the described edge case is handled correctly.
    #[cfg(feature = "testing")]
    #[tokio::test]
    async fn test_vectored_read_with_nested_image_layer() -> anyhow::Result<()> {
        let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?;
        let (tenant, ctx) = harness.load().await;

        // Keys (by index) whose delta record is a will-init record; for those keys the
        // expected value is reset to the empty string instead of being appended to.
        let will_init_keys = [2, 6];
        fn get_key(id: u32) -> Key {
            let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
            key.field6 = id;
            key
        }

        // Model of what every key should read back as at the end of the test.
        let mut expected_key_values = HashMap::new();

        // Keys 0..5 get an image at the baseline LSN.
        let baseline_image_layer_lsn = Lsn(0x10);
        let mut baseline_img_layer = Vec::new();
        for i in 0..5 {
            let key = get_key(i);
            let value = format!("value {i}@{baseline_image_layer_lsn}");

            let removed = expected_key_values.insert(key, value.clone());
            assert!(removed.is_none());

            baseline_img_layer.push((key, Bytes::from(value)));
        }

        // Keys 5..10 get an image at an LSN that falls inside the delta layer's LSN range.
        let nested_image_layer_lsn = Lsn(0x50);
        let mut nested_img_layer = Vec::new();
        for i in 5..10 {
            let key = get_key(i);
            let value = format!("value {i}@{nested_image_layer_lsn}");

            let removed = expected_key_values.insert(key, value.clone());
            assert!(removed.is_none());

            nested_img_layer.push((key, Bytes::from(value)));
        }

        let mut delta_layer_spec = Vec::default();
        let delta_layer_start_lsn = Lsn(0x20);
        let mut delta_layer_end_lsn = delta_layer_start_lsn;

        for i in 0..10 {
            let key = get_key(i);
            let key_in_nested = nested_img_layer
                .iter()
                .any(|(key_with_img, _)| *key_with_img == key);
            // Records for keys covered by the nested image go above that image; all other
            // records sit at the start of the delta layer.
            let lsn = {
                if key_in_nested {
                    Lsn(nested_image_layer_lsn.0 + 0x10)
                } else {
                    delta_layer_start_lsn
                }
            };

            let will_init = will_init_keys.contains(&i);
            if will_init {
                delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init(""))));

                expected_key_values.insert(key, "".to_string());
            } else {
                let delta = format!("@{lsn}");
                delta_layer_spec.push((
                    key,
                    lsn,
                    Value::WalRecord(NeonWalRecord::wal_append(&delta)),
                ));

                expected_key_values
                    .get_mut(&key)
                    .expect("An image exists for each key")
                    .push_str(delta.as_str());
            }
            // Track the highest record LSN seen so far. Comparing against the running
            // maximum (rather than the start LSN) keeps this correct even if the key with
            // the highest LSN is not the last one visited.
            delta_layer_end_lsn = std::cmp::max(delta_layer_end_lsn, lsn);
        }

        // The end of the layer's LSN range is exclusive, so step past the highest record.
        delta_layer_end_lsn = Lsn(delta_layer_end_lsn.0 + 1);

        // Sanity check: the nested image layer really is strictly inside the delta layer.
        assert!(
            nested_image_layer_lsn > delta_layer_start_lsn
                && nested_image_layer_lsn < delta_layer_end_lsn
        );

        let tline = tenant
            .create_test_timeline_with_layers(
                TIMELINE_ID,
                baseline_image_layer_lsn,
                DEFAULT_PG_VERSION,
                &ctx,
                vec![], // in-memory layers
                vec![DeltaLayerTestDesc::new_with_inferred_key_range(
                    delta_layer_start_lsn..delta_layer_end_lsn,
                    delta_layer_spec,
                )], // delta layers
                vec![
                    (baseline_image_layer_lsn, baseline_img_layer),
                    (nested_image_layer_lsn, nested_img_layer),
                ], // image layers
                delta_layer_end_lsn,
            )
            .await?;

        // A single query spanning both halves of the key range.
        let query = VersionedKeySpaceQuery::uniform(
            KeySpace::single(get_key(0)..get_key(10)),
            delta_layer_end_lsn,
        );

        let results = tline
            .get_vectored(query, IoConcurrency::sequential(), &ctx)
            .await
            .expect("No vectored errors");
        for (key, res) in results {
            let value = res.expect("No key errors");
            let expected_value = expected_key_values.remove(&key).expect("No unknown keys");
            assert_eq!(value, Bytes::from(expected_value));
        }

        Ok(())
    }

    #[cfg(feature = "testing")]
    #[tokio::test]
    async fn test_vectored_read_with_image_layer_inside_inmem() -> anyhow::Result<()> {
        let harness =
TenantHarness::create("test_vectored_read_with_image_layer_inside_inmem").await?;
        let (tenant, ctx) = harness.load().await;

        // Layout exercised by this test: a baseline image layer, a second image layer whose
        // LSN falls inside the LSN range of a frozen in-memory layer, and an open in-memory
        // layer on top. A single vectored read over all keys must reconstruct every value.

        // Keys (by index) whose record in the frozen layer is a will-init record; for those
        // keys the expected value is reset to the empty string.
        let will_init_keys = [2, 6];
        fn get_key(id: u32) -> Key {
            let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
            key.field6 = id;
            key
        }

        // Model of what every key should read back as at the end of the test.
        let mut expected_key_values = HashMap::new();

        // Keys 0..5 get an image at the baseline LSN.
        let baseline_image_layer_lsn = Lsn(0x10);
        let mut baseline_img_layer = Vec::new();
        for i in 0..5 {
            let key = get_key(i);
            let value = format!("value {i}@{baseline_image_layer_lsn}");

            let removed = expected_key_values.insert(key, value.clone());
            assert!(removed.is_none());

            baseline_img_layer.push((key, Bytes::from(value)));
        }

        // Keys 5..10 get an image at an LSN inside the frozen layer's LSN range.
        let nested_image_layer_lsn = Lsn(0x50);
        let mut nested_img_layer = Vec::new();
        for i in 5..10 {
            let key = get_key(i);
            let value = format!("value {i}@{nested_image_layer_lsn}");

            let removed = expected_key_values.insert(key, value.clone());
            assert!(removed.is_none());

            nested_img_layer.push((key, Bytes::from(value)));
        }

        // Frozen (closed) in-memory layer with one record per key.
        let frozen_layer = {
            let lsn_range = Lsn(0x40)..Lsn(0x60);
            let mut data = Vec::new();
            for i in 0..10 {
                let key = get_key(i);
                let key_in_nested = nested_img_layer
                    .iter()
                    .any(|(key_with_img, _)| *key_with_img == key);
                // Records for keys covered by the nested image go above that image; all
                // other records sit at the start of the frozen layer.
                let lsn = {
                    if key_in_nested {
                        Lsn(nested_image_layer_lsn.0 + 5)
                    } else {
                        lsn_range.start
                    }
                };

                let will_init = will_init_keys.contains(&i);
                if will_init {
                    data.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init(""))));

                    expected_key_values.insert(key, "".to_string());
                } else {
                    let delta = format!("@{lsn}");
                    data.push((
                        key,
                        lsn,
                        Value::WalRecord(NeonWalRecord::wal_append(&delta)),
                    ));

                    expected_key_values
                        .get_mut(&key)
                        .expect("An image exists for each key")
                        .push_str(delta.as_str());
                }
            }

            InMemoryLayerTestDesc {
                lsn_range,
                is_open: false,
                data,
            }
        };

        // Open in-memory layer: one append per key at consecutive LSNs starting at 0x70.
        // The highest LSN written becomes the timeline's last record LSN.
        let (open_layer, last_record_lsn) = {
            let start_lsn = Lsn(0x70);
            let mut data = Vec::new();
            let mut end_lsn = Lsn(0);
            for i in 0..10 {
                let key = get_key(i);
                let lsn = Lsn(start_lsn.0 + i as u64);
                let delta = format!("@{lsn}");
                data.push((
                    key,
                    lsn,
                    Value::WalRecord(NeonWalRecord::wal_append(&delta)),
                ));

                expected_key_values
                    .get_mut(&key)
                    .expect("An image exists for each key")
                    .push_str(delta.as_str());

                end_lsn = std::cmp::max(end_lsn, lsn);
            }

            (
                InMemoryLayerTestDesc {
                    lsn_range: start_lsn..Lsn::MAX,
                    is_open: true,
                    data,
                },
                end_lsn,
            )
        };

        // Sanity check: the nested image layer really is strictly inside the frozen layer.
        assert!(
            nested_image_layer_lsn > frozen_layer.lsn_range.start
                && nested_image_layer_lsn < frozen_layer.lsn_range.end
        );

        let tline = tenant
            .create_test_timeline_with_layers(
                TIMELINE_ID,
                baseline_image_layer_lsn,
                DEFAULT_PG_VERSION,
                &ctx,
                vec![open_layer, frozen_layer], // in-memory layers
                Vec::new(),                     // delta layers
                vec![
                    (baseline_image_layer_lsn, baseline_img_layer),
                    (nested_image_layer_lsn, nested_img_layer),
                ], // image layers
                last_record_lsn,
            )
            .await?;

        // A single query spanning all ten keys, read at the last record LSN.
        let query = VersionedKeySpaceQuery::uniform(
            KeySpace::single(get_key(0)..get_key(10)),
            last_record_lsn,
        );

        let results = tline
            .get_vectored(query, IoConcurrency::sequential(), &ctx)
            .await
            .expect("No vectored errors");
        for (key, res) in results {
            let value = res.expect("No key errors");
            let expected_value = expected_key_values.remove(&key).expect("No unknown keys");
            assert_eq!(value, Bytes::from(expected_value.clone()));

            tracing::info!("key={key} value={expected_value}");
        }

        Ok(())
    }

    // A randomized read path test. Generates a layer map according to a deterministic
    // specification. Fills the (key, LSN) space in random manner and then performs
    // random scattered queries validating the results against in-memory storage.
    //
    // See this internal Notion page for a diagram of the layer map:
    // https://www.notion.so/neondatabase/Read-Path-Unit-Testing-Fuzzing-1d1f189e0047806c8e5cd37781b0a350?pvs=4
    //
    // A fuzzing mode is also supported. In this mode, the test will use a random
    // seed instead of a hardcoded one.
// Use it in conjunction with `cargo stress`
    // to run multiple instances in parallel:
    //
    // $ RUST_BACKTRACE=1 RUST_LOG=INFO \
    //   cargo stress --package=pageserver --features=testing,fuzz-read-path --release -- test_read_path
    #[cfg(feature = "testing")]
    #[tokio::test]
    async fn test_read_path() -> anyhow::Result<()> {
        use rand::seq::IndexedRandom;

        let seed = if cfg!(feature = "fuzz-read-path") {
            let seed: u64 = rand::rng().random();
            seed
        } else {
            // Use a hard-coded seed when not in fuzzing mode.
            // Note that with the current approach results are not reproducible
            // across platforms and Rust releases.
            const SEED: u64 = 0;
            SEED
        };

        // Every random decision below is drawn from this one generator, so the order of
        // draws matters for reproducing a run from its seed.
        let mut random = StdRng::seed_from_u64(seed);

        // In fuzzing mode the fill parameters are themselves randomized and more queries
        // are issued; otherwise fixed values are used.
        let (queries, will_init_chance, gap_chance) = if cfg!(feature = "fuzz-read-path") {
            const QUERIES: u64 = 5000;
            let will_init_chance: u8 = random.random_range(0..=10);
            let gap_chance: u8 = random.random_range(0..=50);

            (QUERIES, will_init_chance, gap_chance)
        } else {
            const QUERIES: u64 = 1000;
            const WILL_INIT_CHANCE: u8 = 1;
            const GAP_CHANCE: u8 = 5;

            (QUERIES, WILL_INIT_CHANCE, GAP_CHANCE)
        };

        let harness = TenantHarness::create("test_read_path").await?;
        let (tenant, ctx) = harness.load().await;

        // Log the seed and parameters so that a failing run can be identified.
        tracing::info!("Using random seed: {seed}");
        tracing::info!(%will_init_chance, %gap_chance, "Fill params");

        // Define the layer map shape. Note that this part is not randomized.

        const KEY_DIMENSION_SIZE: u32 = 99;
        let start_key = Key::from_hex("110000000033333333444444445500000000").unwrap();
        let end_key = start_key.add(KEY_DIMENSION_SIZE);
        let total_key_range = start_key..end_key;
        let total_key_range_size = end_key.to_i128() - start_key.to_i128();
        let total_start_lsn = Lsn(104);
        let last_record_lsn = Lsn(504);

        // The delta layers below split the key range into three equal parts.
        assert!(total_key_range_size % 3 == 0);

        // Two in-memory layers, each spanning the full key range.
        let in_memory_layers_shape = vec![
            (total_key_range.clone(), Lsn(304)..Lsn(400)),
            (total_key_range.clone(), Lsn(400)..last_record_lsn),
        ];

        // Three delta layers side by side, one per third of the key range, sharing the
        // same LSN range.
        let delta_layers_shape = vec![
            (
                start_key..(start_key.add((total_key_range_size / 3) as u32)),
                Lsn(200)..Lsn(304),
            ),
            (
                (start_key.add((total_key_range_size / 3) as u32))
                    ..(start_key.add((total_key_range_size * 2 / 3) as u32)),
                Lsn(200)..Lsn(304),
            ),
            (
                (start_key.add((total_key_range_size * 2 / 3) as u32))
                    ..(start_key.add(total_key_range_size as u32)),
                Lsn(200)..Lsn(304),
            ),
        ];

        // Two narrow image layers centred on the one-third and two-thirds key boundaries,
        // plus one image layer covering the whole key range at the start LSN.
        let image_layers_shape = vec![
            (
                start_key.add((total_key_range_size * 2 / 3 - 10) as u32)
                    ..start_key.add((total_key_range_size * 2 / 3 + 10) as u32),
                Lsn(456),
            ),
            (
                start_key.add((total_key_range_size / 3 - 10) as u32)
                    ..start_key.add((total_key_range_size / 3 + 10) as u32),
                Lsn(256),
            ),
            (total_key_range.clone(), total_start_lsn),
        ];

        let specification = TestTimelineSpecification {
            start_lsn: total_start_lsn,
            last_record_lsn,
            in_memory_layers_shape,
            delta_layers_shape,
            image_layers_shape,
            gap_chance,
            will_init_chance,
        };

        // Create and randomly fill in the layers according to the specification
        let (tline, storage, interesting_lsns) = randomize_timeline(
            &tenant,
            TIMELINE_ID,
            DEFAULT_PG_VERSION,
            specification,
            &mut random,
            &ctx,
        )
        .await?;

        // Now generate queries based on the interesting lsns that we've collected.
        //
        // While there's still room in the query, pick an interesting LSN and a random
        // key. Then roll the dice to see if the next key should also be included in
        // the query. When the roll fails, break the "batch" and pick another point in the
        // (key, LSN) space.

        const PICK_NEXT_CHANCE: u8 = 50;
        for _ in 0..queries {
            let query = {
                // NOTE(review): the two type annotations below appear to be missing their
                // generic arguments (possibly lost when this text was extracted) - confirm
                // against the original file.
                let mut keyspaces_at_lsn: HashMap = HashMap::default();
                let mut used_keys: HashSet = HashSet::default();

                // Outer loop: start a new batch at a random (key, LSN) point until the
                // query holds the maximum number of keys.
                while used_keys.len() < tenant.conf.max_get_vectored_keys.get() {
                    let selected_lsn = interesting_lsns.choose(&mut random).expect("not empty");
                    let mut selected_key = start_key.add(random.random_range(0..KEY_DIMENSION_SIZE));

                    // Inner loop: extend the batch with consecutive keys at the same LSN.
                    while used_keys.len() < tenant.conf.max_get_vectored_keys.get() {
                        // Stop the batch at a key that is already part of the query or
                        // that lies past the end of the key range.
                        if used_keys.contains(&selected_key)
                            || selected_key >= start_key.add(KEY_DIMENSION_SIZE)
                        {
                            break;
                        }

                        keyspaces_at_lsn
                            .entry(*selected_lsn)
                            .or_default()
                            .add_key(selected_key);
                        used_keys.insert(selected_key);

                        // Continue the batch when the draw from 0..=100 does not exceed
                        // PICK_NEXT_CHANCE.
                        let pick_next = random.random_range(0..=100) <= PICK_NEXT_CHANCE;
                        if pick_next {
                            selected_key = selected_key.next();
                        } else {
                            break;
                        }
                    }
                }

                VersionedKeySpaceQuery::scattered(
                    keyspaces_at_lsn
                        .into_iter()
                        .map(|(lsn, acc)| (lsn, acc.to_keyspace()))
                        .collect(),
                )
            };

            // Run the query and validate the results

            let results = tline
                .get_vectored(query.clone(), IoConcurrency::Sequential, &ctx)
                .await;

            // Include the seed in every failure message so the run can be reproduced.
            let blobs = match results {
                Ok(ok) => ok,
                Err(err) => {
                    panic!("seed={seed} Error returned for query {query}: {err}");
                }
            };

            for (key, key_res) in blobs.into_iter() {
                match key_res {
                    Ok(blob) => {
                        let requested_at_lsn = query.map_key_to_lsn(&key);
                        let expected = storage.get(key, requested_at_lsn);

                        // Log the seed and query before the assertion below fires.
                        if blob != expected {
                            tracing::error!(
                                "seed={seed} Mismatch for {key}@{requested_at_lsn} from query: {query}"
                            );
                        }

                        assert_eq!(blob, expected);
                    }
                    Err(err) => {
                        let requested_at_lsn = query.map_key_to_lsn(&key);

                        panic!(
                            "seed={seed} Error returned for {key}@{requested_at_lsn} from query {query}: {err}"
                        );
                    }
                }
            }
        }

        Ok(())
    }

    // Comparator for layer keys: orders by (is_delta, key range start, key range end,
    // LSN range start, LSN range end). Since `false < true`, image layers sort before
    // delta layers.
    fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
        (
            k1.is_delta,
            k1.key_range.start,
            k1.key_range.end,
            k1.lsn_range.start,
            k1.lsn_range.end,
        )
            .cmp(&(
                k2.is_delta,
                k2.key_range.start,
                k2.key_range.end,
                k2.lsn_range.start,
                k2.lsn_range.end,
            ))
    }

    async fn
inspect_and_sort( tline: &Arc, filter: Option>, ) -> Vec { let mut all_layers = tline.inspect_historic_layers().await.unwrap(); if let Some(filter) = filter { all_layers.retain(|layer| overlaps_with(&layer.key_range, &filter)); } all_layers.sort_by(sort_layer_key); all_layers } #[cfg(feature = "testing")] fn check_layer_map_key_eq( mut left: Vec, mut right: Vec, ) { left.sort_by(sort_layer_key); right.sort_by(sort_layer_key); if left != right { eprintln!("---LEFT---"); for left in left.iter() { eprintln!("{left}"); } eprintln!("---RIGHT---"); for right in right.iter() { eprintln!("{right}"); } assert_eq!(left, right); } } #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_partial_bottom_most_compaction() -> anyhow::Result<()> { let harness = TenantHarness::create("test_simple_partial_bottom_most_compaction").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); key.field6 = id; key } // img layer at 0x10 let img_layer = (0..10) .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) .collect_vec(); let delta1 = vec![ ( get_key(1), Lsn(0x20), Value::Image(Bytes::from("value 1@0x20")), ), ( get_key(2), Lsn(0x30), Value::Image(Bytes::from("value 2@0x30")), ), ( get_key(3), Lsn(0x40), Value::Image(Bytes::from("value 3@0x40")), ), ]; let delta2 = vec![ ( get_key(5), Lsn(0x20), Value::Image(Bytes::from("value 5@0x20")), ), ( get_key(6), Lsn(0x20), Value::Image(Bytes::from("value 6@0x20")), ), ]; let delta3 = vec![ ( get_key(8), Lsn(0x48), Value::Image(Bytes::from("value 8@0x48")), ), ( get_key(9), Lsn(0x48), Value::Image(Bytes::from("value 9@0x48")), ), ]; let tline = tenant .create_test_timeline_with_layers( TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx, vec![], // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), 
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), ], // delta layers vec![(Lsn(0x10), img_layer)], // image layers Lsn(0x50), ) .await?; { tline .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() .await; // Update GC info let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), within_ancestor_pitr: false, }; } let cancel = CancellationToken::new(); // Do a partial compaction on key range 0..2 tline .compact_with_gc( &cancel, CompactOptions { flags: EnumSet::new(), compact_key_range: Some((get_key(0)..get_key(2)).into()), ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; check_layer_map_key_eq( all_layers, vec![ // newly-generated image layer for the partial compaction range 0-2 PersistentLayerKey { key_range: get_key(0)..get_key(2), lsn_range: Lsn(0x20)..Lsn(0x21), is_delta: false, }, PersistentLayerKey { key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x10)..Lsn(0x11), is_delta: false, }, // delta1 is split and the second part is rewritten PersistentLayerKey { key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x48), is_delta: true, }, PersistentLayerKey { key_range: get_key(5)..get_key(7), lsn_range: Lsn(0x20)..Lsn(0x48), is_delta: true, }, PersistentLayerKey { key_range: get_key(8)..get_key(10), lsn_range: Lsn(0x48)..Lsn(0x50), is_delta: true, }, ], ); // Do a partial compaction on key range 2..4 tline .compact_with_gc( &cancel, CompactOptions { flags: EnumSet::new(), compact_key_range: Some((get_key(2)..get_key(4)).into()), ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) .await .unwrap(); let all_layers = 
inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; check_layer_map_key_eq( all_layers, vec![ PersistentLayerKey { key_range: get_key(0)..get_key(2), lsn_range: Lsn(0x20)..Lsn(0x21), is_delta: false, }, PersistentLayerKey { key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x10)..Lsn(0x11), is_delta: false, }, // image layer generated for the compaction range 2-4 PersistentLayerKey { key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x21), is_delta: false, }, // we have key2/key3 above the retain_lsn, so we still need this delta layer PersistentLayerKey { key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x48), is_delta: true, }, PersistentLayerKey { key_range: get_key(5)..get_key(7), lsn_range: Lsn(0x20)..Lsn(0x48), is_delta: true, }, PersistentLayerKey { key_range: get_key(8)..get_key(10), lsn_range: Lsn(0x48)..Lsn(0x50), is_delta: true, }, ], ); // Do a partial compaction on key range 4..9 tline .compact_with_gc( &cancel, CompactOptions { flags: EnumSet::new(), compact_key_range: Some((get_key(4)..get_key(9)).into()), ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; check_layer_map_key_eq( all_layers, vec![ PersistentLayerKey { key_range: get_key(0)..get_key(2), lsn_range: Lsn(0x20)..Lsn(0x21), is_delta: false, }, PersistentLayerKey { key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x10)..Lsn(0x11), is_delta: false, }, PersistentLayerKey { key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x21), is_delta: false, }, PersistentLayerKey { key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x48), is_delta: true, }, // image layer generated for this compaction range PersistentLayerKey { key_range: get_key(4)..get_key(9), lsn_range: Lsn(0x20)..Lsn(0x21), is_delta: false, }, PersistentLayerKey { key_range: get_key(8)..get_key(10), lsn_range: Lsn(0x48)..Lsn(0x50), is_delta: true, }, ], ); // Do a 
partial compaction on key range 9..10 tline .compact_with_gc( &cancel, CompactOptions { flags: EnumSet::new(), compact_key_range: Some((get_key(9)..get_key(10)).into()), ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; check_layer_map_key_eq( all_layers, vec![ PersistentLayerKey { key_range: get_key(0)..get_key(2), lsn_range: Lsn(0x20)..Lsn(0x21), is_delta: false, }, PersistentLayerKey { key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x10)..Lsn(0x11), is_delta: false, }, PersistentLayerKey { key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x21), is_delta: false, }, PersistentLayerKey { key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x48), is_delta: true, }, PersistentLayerKey { key_range: get_key(4)..get_key(9), lsn_range: Lsn(0x20)..Lsn(0x21), is_delta: false, }, // image layer generated for the compaction range PersistentLayerKey { key_range: get_key(9)..get_key(10), lsn_range: Lsn(0x20)..Lsn(0x21), is_delta: false, }, PersistentLayerKey { key_range: get_key(8)..get_key(10), lsn_range: Lsn(0x48)..Lsn(0x50), is_delta: true, }, ], ); // Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones. tline .compact_with_gc( &cancel, CompactOptions { flags: EnumSet::new(), compact_key_range: Some((get_key(0)..get_key(10)).into()), ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; check_layer_map_key_eq( all_layers, vec![ // aha, we removed all unnecessary image/delta layers and got a very clean layer map! 
PersistentLayerKey { key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x20)..Lsn(0x21), is_delta: false, }, PersistentLayerKey { key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x48), is_delta: true, }, PersistentLayerKey { key_range: get_key(8)..get_key(10), lsn_range: Lsn(0x48)..Lsn(0x50), is_delta: true, }, ], ); Ok(()) } #[cfg(feature = "testing")] #[tokio::test] async fn test_timeline_offload_retain_lsn() -> anyhow::Result<()> { let harness = TenantHarness::create("test_timeline_offload_retain_lsn") .await .unwrap(); let (tenant, ctx) = harness.load().await; let tline_parent = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await .unwrap(); let tline_child = tenant .branch_timeline_test(&tline_parent, NEW_TIMELINE_ID, Some(Lsn(0x20)), &ctx) .await .unwrap(); { let gc_info_parent = tline_parent.gc_info.read().unwrap(); assert_eq!( gc_info_parent.retain_lsns, vec![(Lsn(0x20), tline_child.timeline_id, MaybeOffloaded::No)] ); } // We have to directly call the remote_client instead of using the archive function to avoid constructing broker client... 
tline_child .remote_client .schedule_index_upload_for_timeline_archival_state(TimelineArchivalState::Archived) .unwrap(); tline_child.remote_client.wait_completion().await.unwrap(); offload_timeline(&tenant, &tline_child) .instrument(tracing::info_span!(parent: None, "offload_test", tenant_id=%"test", shard_id=%"test", timeline_id=%"test")) .await.unwrap(); let child_timeline_id = tline_child.timeline_id; Arc::try_unwrap(tline_child).unwrap(); { let gc_info_parent = tline_parent.gc_info.read().unwrap(); assert_eq!( gc_info_parent.retain_lsns, vec![(Lsn(0x20), child_timeline_id, MaybeOffloaded::Yes)] ); } tenant .get_offloaded_timeline(child_timeline_id) .unwrap() .defuse_for_tenant_drop(); Ok(()) } #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_above_lsn() -> anyhow::Result<()> { let harness = TenantHarness::create("test_simple_bottom_most_compaction_above_lsn").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. 
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); key.field6 = id; key } let img_layer = (0..10) .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) .collect_vec(); let delta1 = vec![( get_key(1), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append("@0x20")), )]; let delta4 = vec![( get_key(1), Lsn(0x28), Value::WalRecord(NeonWalRecord::wal_append("@0x28")), )]; let delta2 = vec![ ( get_key(1), Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append("@0x30")), ), ( get_key(1), Lsn(0x38), Value::WalRecord(NeonWalRecord::wal_append("@0x38")), ), ]; let delta3 = vec![ ( get_key(8), Lsn(0x48), Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ( get_key(9), Lsn(0x48), Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ]; let tline = tenant .create_test_timeline_with_layers( TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx, vec![], // in-memory layers vec![ // delta1/2/4 only contain a single key but multiple updates DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x28)..Lsn(0x30), delta4), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta3), ], // delta layers vec![(Lsn(0x10), img_layer)], // image layers Lsn(0x50), ) .await?; { tline .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() .await; // Update GC info let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![ (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), within_ancestor_pitr: false, }; } let expected_result = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"), Bytes::from_static(b"value 2@0x10"), Bytes::from_static(b"value 
3@0x10"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10"), Bytes::from_static(b"value 6@0x10"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10@0x48"), Bytes::from_static(b"value 9@0x10@0x48"), ]; let expected_result_at_gc_horizon = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"), Bytes::from_static(b"value 2@0x10"), Bytes::from_static(b"value 3@0x10"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10"), Bytes::from_static(b"value 6@0x10"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10"), Bytes::from_static(b"value 9@0x10"), ]; let expected_result_at_lsn_20 = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20"), Bytes::from_static(b"value 2@0x10"), Bytes::from_static(b"value 3@0x10"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10"), Bytes::from_static(b"value 6@0x10"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10"), Bytes::from_static(b"value 9@0x10"), ]; let expected_result_at_lsn_10 = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10"), Bytes::from_static(b"value 2@0x10"), Bytes::from_static(b"value 3@0x10"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10"), Bytes::from_static(b"value 6@0x10"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10"), Bytes::from_static(b"value 9@0x10"), ]; let verify_result = || async { let gc_horizon = { let gc_info = tline.gc_info.read().unwrap(); gc_info.cutoffs.time.unwrap_or_default() }; for idx in 0..10 { assert_eq!( tline .get(get_key(idx as u32), Lsn(0x50), &ctx) .await .unwrap(), &expected_result[idx] ); assert_eq!( tline .get(get_key(idx as u32), gc_horizon, &ctx) .await .unwrap(), &expected_result_at_gc_horizon[idx] ); assert_eq!( tline .get(get_key(idx as u32), Lsn(0x20), &ctx) .await .unwrap(), &expected_result_at_lsn_20[idx] ); assert_eq!( 
tline .get(get_key(idx as u32), Lsn(0x10), &ctx) .await .unwrap(), &expected_result_at_lsn_10[idx] ); } }; verify_result().await; let cancel = CancellationToken::new(); tline .compact_with_gc( &cancel, CompactOptions { compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x28))), ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) .await .unwrap(); verify_result().await; let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; check_layer_map_key_eq( all_layers, vec![ // The original image layer, not compacted PersistentLayerKey { key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x10)..Lsn(0x11), is_delta: false, }, // Delta layer below the specified above_lsn not compacted PersistentLayerKey { key_range: get_key(1)..get_key(2), lsn_range: Lsn(0x20)..Lsn(0x28), is_delta: true, }, // Delta layer compacted above the LSN PersistentLayerKey { key_range: get_key(1)..get_key(10), lsn_range: Lsn(0x28)..Lsn(0x50), is_delta: true, }, ], ); // compact again tline .compact_with_gc( &cancel, CompactOptions::default_for_gc_compaction_unit_tests(), &ctx, ) .await .unwrap(); verify_result().await; let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; check_layer_map_key_eq( all_layers, vec![ // The compacted image layer (full key range) PersistentLayerKey { key_range: Key::MIN..Key::MAX, lsn_range: Lsn(0x10)..Lsn(0x11), is_delta: false, }, // All other data in the delta layer PersistentLayerKey { key_range: get_key(1)..get_key(10), lsn_range: Lsn(0x10)..Lsn(0x50), is_delta: true, }, ], ); Ok(()) } #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_rectangle() -> anyhow::Result<()> { let harness = TenantHarness::create("test_simple_bottom_most_compaction_rectangle").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. 
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); key.field6 = id; key } let img_layer = (0..10) .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) .collect_vec(); let delta1 = vec![( get_key(1), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append("@0x20")), )]; let delta4 = vec![( get_key(1), Lsn(0x28), Value::WalRecord(NeonWalRecord::wal_append("@0x28")), )]; let delta2 = vec![ ( get_key(1), Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append("@0x30")), ), ( get_key(1), Lsn(0x38), Value::WalRecord(NeonWalRecord::wal_append("@0x38")), ), ]; let delta3 = vec![ ( get_key(8), Lsn(0x48), Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ( get_key(9), Lsn(0x48), Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ]; let tline = tenant .create_test_timeline_with_layers( TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx, vec![], // in-memory layers vec![ // delta1/2/4 only contain a single key but multiple updates DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x28)..Lsn(0x30), delta4), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta3), ], // delta layers vec![(Lsn(0x10), img_layer)], // image layers Lsn(0x50), ) .await?; { tline .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() .await; // Update GC info let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![ (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), within_ancestor_pitr: false, }; } let expected_result = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"), Bytes::from_static(b"value 2@0x10"), Bytes::from_static(b"value 
3@0x10"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10"), Bytes::from_static(b"value 6@0x10"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10@0x48"), Bytes::from_static(b"value 9@0x10@0x48"), ]; let expected_result_at_gc_horizon = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"), Bytes::from_static(b"value 2@0x10"), Bytes::from_static(b"value 3@0x10"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10"), Bytes::from_static(b"value 6@0x10"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10"), Bytes::from_static(b"value 9@0x10"), ]; let expected_result_at_lsn_20 = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20"), Bytes::from_static(b"value 2@0x10"), Bytes::from_static(b"value 3@0x10"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10"), Bytes::from_static(b"value 6@0x10"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10"), Bytes::from_static(b"value 9@0x10"), ]; let expected_result_at_lsn_10 = [ Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10"), Bytes::from_static(b"value 2@0x10"), Bytes::from_static(b"value 3@0x10"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10"), Bytes::from_static(b"value 6@0x10"), Bytes::from_static(b"value 7@0x10"), Bytes::from_static(b"value 8@0x10"), Bytes::from_static(b"value 9@0x10"), ]; let verify_result = || async { let gc_horizon = { let gc_info = tline.gc_info.read().unwrap(); gc_info.cutoffs.time.unwrap_or_default() }; for idx in 0..10 { assert_eq!( tline .get(get_key(idx as u32), Lsn(0x50), &ctx) .await .unwrap(), &expected_result[idx] ); assert_eq!( tline .get(get_key(idx as u32), gc_horizon, &ctx) .await .unwrap(), &expected_result_at_gc_horizon[idx] ); assert_eq!( tline .get(get_key(idx as u32), Lsn(0x20), &ctx) .await .unwrap(), &expected_result_at_lsn_20[idx] ); assert_eq!( 
tline .get(get_key(idx as u32), Lsn(0x10), &ctx) .await .unwrap(), &expected_result_at_lsn_10[idx] ); } }; verify_result().await; let cancel = CancellationToken::new(); tline .compact_with_gc( &cancel, CompactOptions { compact_key_range: Some((get_key(0)..get_key(2)).into()), compact_lsn_range: Some((Lsn(0x20)..Lsn(0x28)).into()), ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) .await .unwrap(); verify_result().await; let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; check_layer_map_key_eq( all_layers, vec![ // The original image layer, not compacted PersistentLayerKey { key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x10)..Lsn(0x11), is_delta: false, }, // According the selection logic, we select all layers with start key <= 0x28, so we would merge the layer 0x20-0x28 and // the layer 0x28-0x30 into one. PersistentLayerKey { key_range: get_key(1)..get_key(2), lsn_range: Lsn(0x20)..Lsn(0x30), is_delta: true, }, // Above the upper bound and untouched PersistentLayerKey { key_range: get_key(1)..get_key(2), lsn_range: Lsn(0x30)..Lsn(0x50), is_delta: true, }, // This layer is untouched PersistentLayerKey { key_range: get_key(8)..get_key(10), lsn_range: Lsn(0x30)..Lsn(0x50), is_delta: true, }, ], ); tline .compact_with_gc( &cancel, CompactOptions { compact_key_range: Some((get_key(3)..get_key(8)).into()), compact_lsn_range: Some((Lsn(0x28)..Lsn(0x40)).into()), ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) .await .unwrap(); verify_result().await; let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; check_layer_map_key_eq( all_layers, vec![ // The original image layer, not compacted PersistentLayerKey { key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x10)..Lsn(0x11), is_delta: false, }, // Not in the compaction key range, uncompacted PersistentLayerKey { key_range: get_key(1)..get_key(2), lsn_range: Lsn(0x20)..Lsn(0x30), is_delta: true, }, // Not in the compaction 
key range, uncompacted but need rewrite because the delta layer overlaps with the range PersistentLayerKey { key_range: get_key(1)..get_key(2), lsn_range: Lsn(0x30)..Lsn(0x50), is_delta: true, }, // Note that when we specify the LSN upper bound to be 0x40, the compaction algorithm will not try to cut the layer // horizontally in half. Instead, it will include all LSNs that overlap with 0x40. So the real max_lsn of the compaction // becomes 0x50. PersistentLayerKey { key_range: get_key(8)..get_key(10), lsn_range: Lsn(0x30)..Lsn(0x50), is_delta: true, }, ], ); // compact again tline .compact_with_gc( &cancel, CompactOptions { compact_key_range: Some((get_key(0)..get_key(5)).into()), compact_lsn_range: Some((Lsn(0x20)..Lsn(0x50)).into()), ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) .await .unwrap(); verify_result().await; let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; check_layer_map_key_eq( all_layers, vec![ // The original image layer, not compacted PersistentLayerKey { key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x10)..Lsn(0x11), is_delta: false, }, // The range gets compacted PersistentLayerKey { key_range: get_key(1)..get_key(2), lsn_range: Lsn(0x20)..Lsn(0x50), is_delta: true, }, // Not touched during this iteration of compaction PersistentLayerKey { key_range: get_key(8)..get_key(10), lsn_range: Lsn(0x30)..Lsn(0x50), is_delta: true, }, ], ); // final full compaction tline .compact_with_gc( &cancel, CompactOptions::default_for_gc_compaction_unit_tests(), &ctx, ) .await .unwrap(); verify_result().await; let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; check_layer_map_key_eq( all_layers, vec![ // The compacted image layer (full key range) PersistentLayerKey { key_range: Key::MIN..Key::MAX, lsn_range: Lsn(0x10)..Lsn(0x11), is_delta: false, }, // All other data in the delta layer PersistentLayerKey { key_range: get_key(1)..get_key(10), lsn_range: Lsn(0x10)..Lsn(0x50), 
is_delta: true, }, ], ); Ok(()) }

/// Checks that gc-compaction returns an error when a WAL record cannot be redone.
///
/// The timeline is built from one image layer at LSN 0x10 (keys 0..10) and one delta layer
/// (0x20..0x30) holding three append records for key 1. The record at 0x28 is a conditional
/// append that does not apply. With both GC cutoffs set to 0x30, an unrestricted
/// `compact_with_gc` run is expected to return `Err`.
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_bottom_most_compation_redo_failure() -> anyhow::Result<()> {
    let harness = TenantHarness::create("test_bottom_most_compation_redo_failure").await?;
    let (tenant, ctx) = harness.load().await;

    fn get_key(id: u32) -> Key {
        // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
        let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
        key.field6 = id;
        key
    }

    // Base image: one value per key, all tagged with LSN 0x10.
    let img_layer = (0..10)
        .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
        .collect_vec();

    // Three WAL records for key 1; only the last one is constructed to be un-replayable.
    let delta1 = vec![
        (
            get_key(1),
            Lsn(0x20),
            Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
        ),
        (
            get_key(1),
            Lsn(0x24),
            Value::WalRecord(NeonWalRecord::wal_append("@0x24")),
        ),
        (
            get_key(1),
            Lsn(0x28),
            // This record will fail to redo
            Value::WalRecord(NeonWalRecord::wal_append_conditional("@0x28", "???")),
        ),
    ];

    let tline = tenant
        .create_test_timeline_with_layers(
            TIMELINE_ID,
            Lsn(0x10),
            DEFAULT_PG_VERSION,
            &ctx,
            vec![], // in-memory layers
            vec![DeltaLayerTestDesc::new_with_inferred_key_range(
                Lsn(0x20)..Lsn(0x30),
                delta1,
            )], // delta layers
            vec![(Lsn(0x10), img_layer)], // image layers
            Lsn(0x50),
        )
        .await?;
    {
        // Move the applied GC cutoff to 0x30 so that it matches the GC info installed below.
        tline
            .applied_gc_cutoff_lsn
            .lock_for_write()
            .store_and_unlock(Lsn(0x30))
            .wait()
            .await;
        // Update GC info
        let mut guard = tline.gc_info.write().unwrap();
        *guard = GcInfo {
            retain_lsns: vec![],
            cutoffs: GcCutoffs {
                time: Some(Lsn(0x30)),
                space: Lsn(0x30),
            },
            leases: Default::default(),
            within_ancestor_pitr: false,
        };
    }

    let cancel = CancellationToken::new();

    // Compaction will fail, but should not fire any critical error.
    // Gc-compaction currently cannot figure out what keys are not in the keyspace during the compaction
    // process. It will always try to redo the logs it reads and if it doesn't work, fail the entire
    // compaction job. Tracked in .
    let res = tline
        .compact_with_gc(
            &cancel,
            CompactOptions {
                compact_key_range: None,
                compact_lsn_range: None,
                ..CompactOptions::default_for_gc_compaction_unit_tests()
            },
            &ctx,
        )
        .await;
    assert!(res.is_err());

    Ok(())
}

/// Checks which synthetic-size segments `gather_inputs` reports for the main timeline, first
/// while it is visible and then after it has been marked invisible.
///
/// The main timeline ends at LSN 0x100 and has three child branches at 0x20, 0x30 and 0x40.
/// While visible, the expected segment list ends with a `GcCutOff` and a `BranchEnd` at 0x100.
/// After the invisible state is uploaded, the list is expected to end with a `BranchEnd` at
/// 0x40 (the last branch point) and to contain no `GcCutOff` segment.
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> {
    use pageserver_api::models::TimelineVisibilityState;

    use crate::tenant::size::gather_inputs;

    let tenant_conf = pageserver_api::models::TenantConfig {
        // Ensure that we don't compute gc_cutoffs (which needs reading the layer files)
        pitr_interval: Some(Duration::ZERO),
        ..Default::default()
    };
    let harness = TenantHarness::create_custom(
        "test_synthetic_size_calculation_with_invisible_branches",
        tenant_conf,
        TenantId::generate(),
        ShardIdentity::unsharded(),
        Generation::new(0xdeadbeef),
    )
    .await?;
    let (tenant, ctx) = harness.load().await;
    // Main timeline without any layers; only its LSN range (0x10..0x100) matters for this test.
    let main_tline = tenant
        .create_test_timeline_with_layers(
            TIMELINE_ID,
            Lsn(0x10),
            DEFAULT_PG_VERSION,
            &ctx,
            vec![],
            vec![],
            vec![],
            Lsn(0x100),
        )
        .await?;

    // Three child branches off the main timeline, at 0x20, 0x30 and 0x40.
    let snapshot1 = TimelineId::from_array(hex!("11223344556677881122334455667790"));
    tenant
        .branch_timeline_test_with_layers(
            &main_tline,
            snapshot1,
            Some(Lsn(0x20)),
            &ctx,
            vec![],
            vec![],
            Lsn(0x50),
        )
        .await?;
    let snapshot2 = TimelineId::from_array(hex!("11223344556677881122334455667791"));
    tenant
        .branch_timeline_test_with_layers(
            &main_tline,
            snapshot2,
            Some(Lsn(0x30)),
            &ctx,
            vec![],
            vec![],
            Lsn(0x50),
        )
        .await?;
    let snapshot3 = TimelineId::from_array(hex!("11223344556677881122334455667792"));
    tenant
        .branch_timeline_test_with_layers(
            &main_tline,
            snapshot3,
            Some(Lsn(0x40)),
            &ctx,
            vec![],
            vec![],
            Lsn(0x50),
        )
        .await?;
    let limit = Arc::new(Semaphore::new(1));
    let max_retention_period = None;
    let mut logical_size_cache = HashMap::new();
    let cause = LogicalSizeCalculationCause::EvictionTaskImitation;
    let cancel = CancellationToken::new();

    // First pass: main timeline is still visible.
    let inputs = gather_inputs(
        &tenant,
        &limit,
        max_retention_period,
        &mut logical_size_cache,
        cause,
        &cancel,
        &ctx,
    )
    .instrument(info_span!(
        "gather_inputs",
        tenant_id = "unknown",
        shard_id = "unknown",
    ))
    .await?;
    use crate::tenant::size::{LsnKind, ModelInputs, SegmentMeta};
    use LsnKind::*;
    use tenant_size_model::Segment;
    let ModelInputs { mut segments, .. } = inputs;
    // Only look at the main timeline and blank out the fields this test does not assert on.
    segments.retain(|s| s.timeline_id == TIMELINE_ID);
    for segment in segments.iter_mut() {
        segment.segment.parent = None; // We don't care about the parent for the test
        segment.segment.size = None; // We don't care about the size for the test
    }
    assert_eq!(
        segments,
        [
            SegmentMeta {
                segment: Segment {
                    parent: None,
                    lsn: 0x10,
                    size: None,
                    needed: false,
                },
                timeline_id: TIMELINE_ID,
                kind: BranchStart,
            },
            SegmentMeta {
                segment: Segment {
                    parent: None,
                    lsn: 0x20,
                    size: None,
                    needed: false,
                },
                timeline_id: TIMELINE_ID,
                kind: BranchPoint,
            },
            SegmentMeta {
                segment: Segment {
                    parent: None,
                    lsn: 0x30,
                    size: None,
                    needed: false,
                },
                timeline_id: TIMELINE_ID,
                kind: BranchPoint,
            },
            SegmentMeta {
                segment: Segment {
                    parent: None,
                    lsn: 0x40,
                    size: None,
                    needed: false,
                },
                timeline_id: TIMELINE_ID,
                kind: BranchPoint,
            },
            SegmentMeta {
                segment: Segment {
                    parent: None,
                    lsn: 0x100,
                    size: None,
                    needed: false,
                },
                timeline_id: TIMELINE_ID,
                kind: GcCutOff,
            }, // we need to retain everything above the last branch point
            SegmentMeta {
                segment: Segment {
                    parent: None,
                    lsn: 0x100,
                    size: None,
                    needed: true,
                },
                timeline_id: TIMELINE_ID,
                kind: BranchEnd,
            },
        ]
    );

    // Second pass: mark the main timeline invisible, wait for the upload, and gather again.
    main_tline
        .remote_client
        .schedule_index_upload_for_timeline_invisible_state(
            TimelineVisibilityState::Invisible,
        )?;
    main_tline.remote_client.wait_completion().await?;
    let inputs = gather_inputs(
        &tenant,
        &limit,
        max_retention_period,
        &mut logical_size_cache,
        cause,
        &cancel,
        &ctx,
    )
    .instrument(info_span!(
        "gather_inputs",
        tenant_id = "unknown",
        shard_id = "unknown",
    ))
    .await?;
    let ModelInputs { mut segments, .. } = inputs;
    segments.retain(|s| s.timeline_id == TIMELINE_ID);
    for segment in segments.iter_mut() {
        segment.segment.parent = None; // We don't care about the parent for the test
        segment.segment.size = None; // We don't care about the size for the test
    }
    assert_eq!(
        segments,
        [
            SegmentMeta {
                segment: Segment {
                    parent: None,
                    lsn: 0x10,
                    size: None,
                    needed: false,
                },
                timeline_id: TIMELINE_ID,
                kind: BranchStart,
            },
            SegmentMeta {
                segment: Segment {
                    parent: None,
                    lsn: 0x20,
                    size: None,
                    needed: false,
                },
                timeline_id: TIMELINE_ID,
                kind: BranchPoint,
            },
            SegmentMeta {
                segment: Segment {
                    parent: None,
                    lsn: 0x30,
                    size: None,
                    needed: false,
                },
                timeline_id: TIMELINE_ID,
                kind: BranchPoint,
            },
            SegmentMeta {
                segment: Segment {
                    parent: None,
                    lsn: 0x40,
                    size: None,
                    needed: false,
                },
                timeline_id: TIMELINE_ID,
                kind: BranchPoint,
            },
            SegmentMeta {
                segment: Segment {
                    parent: None,
                    lsn: 0x40, // Branch end LSN == last branch point LSN
                    size: None,
                    needed: true,
                },
                timeline_id: TIMELINE_ID,
                kind: BranchEnd,
            },
        ]
    );

    Ok(())
}

/// Checks the LSN returned by `get_force_image_creation_lsn` for a fixed configuration.
///
/// Setup: PITR interval of 7 hours, forced image creation period of 1 hour, time-based GC
/// cutoff at LSN 100 and a last written LSN of 5000.
#[tokio::test]
async fn test_get_force_image_creation_lsn() -> anyhow::Result<()> {
    let tenant_conf = pageserver_api::models::TenantConfig {
        pitr_interval: Some(Duration::from_secs(7 * 3600)),
        image_layer_force_creation_period: Some(Duration::from_secs(3600)),
        ..Default::default()
    };

    let tenant_id = TenantId::generate();

    let harness = TenantHarness::create_custom(
        "test_get_force_image_creation_lsn",
        tenant_conf,
        tenant_id,
        ShardIdentity::unsharded(),
        Generation::new(1),
    )
    .await?;
    let (tenant, ctx) = harness.load().await;
    let timeline = tenant
        .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
        .await?;
    timeline.gc_info.write().unwrap().cutoffs.time = Some(Lsn(100));
    {
        // Advance the timeline's last written LSN to 5000.
        let writer = timeline.writer().await;
        writer.finish_write(Lsn(5000));
    }

    let image_creation_lsn = timeline.get_force_image_creation_lsn().unwrap();
    // NOTE(review): 4300 == 5000 - (5000 - 100) / 7, i.e. the expected value is consistent with
    // scaling the LSN distance to the cutoff by period / pitr_interval (1h / 7h) — confirm
    // against the implementation of `get_force_image_creation_lsn`.
    assert_eq!(image_creation_lsn, Lsn(4300));

    Ok(())
}
}
================================================
FILE: pageserver/src/utilization.rs
================================================
//! A utilization metric which is used to decide on which pageserver to put next tenant.
//!
//! The metric is exposed via `GET /v1/utilization`. Refer and maintain its openapi spec as the
//! truth.

use std::path::Path;

use anyhow::Context;
use pageserver_api::models::PageserverUtilization;
use utils::serde_percent::Percent;

use crate::config::PageServerConf;
use crate::metrics::NODE_UTILIZATION_SCORE;
use crate::tenant::mgr::TenantManager;

/// Builds a fresh [`PageserverUtilization`] snapshot for this pageserver.
///
/// Disk usage and free space are derived from `statvfs` on `tenants_path`; the wanted bytes and
/// the shard count come from `tenant_manager.calculate_utilization()`. The score computed for
/// the snapshot is also published through the `NODE_UTILIZATION_SCORE` metric.
///
/// Returns an error if `statvfs` or `calculate_utilization` fails.
// NOTE(review): the return type reads `anyhow::Result` with no type argument in this copy,
// although the body returns the `PageserverUtilization` value — the argument looks lost; confirm.
pub(crate) fn regenerate(
    conf: &PageServerConf,
    tenants_path: &Path,
    tenant_manager: &TenantManager,
) -> anyhow::Result {
    let statvfs = nix::sys::statvfs::statvfs(tenants_path)
        .map_err(std::io::Error::from)
        .context("statvfs tenants directory")?;

    // Prefer the fragment size as the unit of the block counts; fall back to the block size
    // when the fragment size is reported as zero.
    // https://unix.stackexchange.com/a/703650
    let blocksz = if statvfs.fragment_size() > 0 {
        statvfs.fragment_size()
    } else {
        statvfs.block_size()
    };

    #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
    let free = statvfs.blocks_available() as u64 * blocksz;

    #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
    let used = statvfs
        .blocks()
        // use blocks_free instead of available here to match df in case someone compares
        .saturating_sub(statvfs.blocks_free()) as u64
        * blocksz;

    let captured_at = std::time::SystemTime::now();

    // Calculate aggregate utilization from tenants on this pageserver
    let (disk_wanted_bytes, shard_count) = tenant_manager.calculate_utilization()?;

    // Fetch the fraction of disk space which may be used
    let disk_usable_pct = if conf.disk_usage_based_eviction.enabled {
        conf.disk_usage_based_eviction.max_usage_pct
    } else {
        Percent::new(100).unwrap()
    };

    // Express a static value for how many shards we may schedule on one node
    const MAX_SHARDS: u32 = 2500;

    let mut doc = PageserverUtilization {
        disk_usage_bytes: used,
        free_space_bytes: free,
        disk_wanted_bytes,
        disk_usable_pct,
        shard_count,
        max_shard_count: MAX_SHARDS,
        utilization_score: None,
        captured_at: utils::serde_system_time::SystemTime(captured_at),
    };

    // Initialize `PageserverUtilization::utilization_score`
    let score = doc.cached_score();

    NODE_UTILIZATION_SCORE.set(score);

    Ok(doc)
}
================================================
FILE: pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
================================================
//! Like [`::tokio_epoll_uring::thread_local_system()`], but with pageserver-specific
//! handling in case the instance can't be launched.
//!
//! This is primarily necessary due to ENOMEM aka OutOfMemory errors during io_uring creation
//! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series.
//! See for more details.

use std::sync::Arc;
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};

use tokio_epoll_uring::{System, SystemHandle};
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, error, info, info_span, warn};
use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};

use crate::metrics::tokio_epoll_uring::{self as metrics, THREAD_LOCAL_METRICS_STORAGE};
use crate::virtual_file::on_fatal_io_error;

/// Clonable, reference-counted handle to one thread's launch state.
#[derive(Clone)]
struct ThreadLocalState(Arc);

/// The state shared by all clones of a [`ThreadLocalState`].
struct ThreadLocalStateInner {
    // Set once a launch attempt has succeeded; read through `Handle`'s `Deref` impl.
    cell: tokio::sync::OnceCell>,
    // Incremented for every launch attempt; the pre-increment value is the backoff attempt number.
    launch_attempts: AtomicU32,
    /// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`]
    thread_local_state_id: u64,
}

impl Drop for ThreadLocalStateInner {
    /// Removes this state's entry from the metrics storage.
    fn drop(&mut self) {
        THREAD_LOCAL_METRICS_STORAGE.remove_system(self.thread_local_state_id);
    }
}

impl ThreadLocalState {
    /// Creates an uninitialized state with zero launch attempts and a fresh id taken from
    /// [`THREAD_LOCAL_STATE_ID`].
    pub fn new() -> Self {
        Self(Arc::new(ThreadLocalStateInner {
            cell: tokio::sync::OnceCell::default(),
            launch_attempts: AtomicU32::new(0),
            thread_local_state_id: THREAD_LOCAL_STATE_ID.fetch_add(1, Ordering::Relaxed),
        }))
    }

    /// Returns the state's id formatted as a string (used as a tracing span field).
    pub fn make_id_string(&self) -> String {
        format!("{}", self.0.thread_local_state_id)
    }
}

// Source of the ids handed out by `ThreadLocalState::new`.
static THREAD_LOCAL_STATE_ID: AtomicU64 = AtomicU64::new(0);

thread_local! {
    static THREAD_LOCAL: ThreadLocalState = ThreadLocalState::new();
}

/// Panics if we cannot [`System::launch`].
///
/// Returns a [`Handle`] to the calling thread's system, launching it first if necessary.
/// A launch that fails with `OutOfMemory` is retried with exponential backoff; any other launch
/// error is logged and passed to `on_fatal_io_error`.
pub async fn thread_local_system() -> Handle {
    // Never cancelled; it only exists because `exponential_backoff` takes a token.
    let fake_cancel = CancellationToken::new();
    loop {
        // Re-read the thread-local on every iteration: see the comment on the launch call below.
        let thread_local_state = THREAD_LOCAL.with(|arc| arc.clone());
        let inner = &thread_local_state.0;
        let get_or_init_res = inner
            .cell
            .get_or_try_init(|| async {
                let attempt_no = inner
                    .launch_attempts
                    .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
                let span = info_span!("tokio_epoll_uring_ext::thread_local_system", thread_local=%thread_local_state.make_id_string(), %attempt_no);
                async {
                    // Rate-limit retries per thread-local.
                    // NB: doesn't yield to executor at attempt_no=0.
                    utils::backoff::exponential_backoff(
                        attempt_no,
                        DEFAULT_BASE_BACKOFF_SECONDS,
                        DEFAULT_MAX_BACKOFF_SECONDS,
                        &fake_cancel,
                    )
                    .await;
                    let per_system_metrics = metrics::THREAD_LOCAL_METRICS_STORAGE.register_system(inner.thread_local_state_id);
                    let res = System::launch_with_metrics(per_system_metrics)
                        // this might move us to another executor thread => loop outside the get_or_try_init, not inside it
                        .await;
                    match res {
                        Ok(system) => {
                            info!("successfully launched system");
                            metrics::THREAD_LOCAL_LAUNCH_SUCCESSES.inc();
                            Ok(system)
                        }
                        Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => {
                            warn!("not enough locked memory to tokio-epoll-uring, will retry");
                            info_span!("stats").in_scope(|| {
                                emit_launch_failure_process_stats();
                            });
                            metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc();
                            // Undo the metrics registration made for this failed attempt.
                            metrics::THREAD_LOCAL_METRICS_STORAGE.remove_system(inner.thread_local_state_id);
                            // Leaves the cell unset so that the outer loop tries again.
                            Err(())
                        }
                        // abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere.
                        // This is equivalent to a fatal IO error.
                        // (Note: the pattern's `inner` shadows the outer `inner` within this arm.)
                        Err(ref e @ tokio_epoll_uring::LaunchResult::IoUringBuild(ref inner)) => {
                            error!(error=%e, "failed to launch thread-local tokio-epoll-uring, this should not happen, aborting process");
                            info_span!("stats").in_scope(|| {
                                emit_launch_failure_process_stats();
                            });
                            on_fatal_io_error(inner, "launch thread-local tokio-epoll-uring");
                        },
                    }
                }
                .instrument(span)
                .await
            })
            .await;
        if get_or_init_res.is_ok() {
            return Handle(thread_local_state);
        }
    }
}

/// Logs process-level statistics at `info` level to help diagnose a failed launch:
/// the tokio-epoll-uring global counters, the locked-memory limit from `/proc/self/limits`
/// and memory/thread figures from `/proc/self/status`. Errors reading procfs are logged, not
/// propagated.
fn emit_launch_failure_process_stats() {
    // tokio-epoll-uring stats
    // vmlck + rlimit
    // number of threads
    // rss / system memory usage generally

    let tokio_epoll_uring::metrics::GlobalMetrics {
        systems_created,
        systems_destroyed,
    } = tokio_epoll_uring::metrics::global();
    info!(systems_created, systems_destroyed, "tokio-epoll-uring");

    match procfs::process::Process::myself() {
        Ok(myself) => {
            match myself.limits() {
                Ok(limits) => {
                    info!(?limits.max_locked_memory, "/proc/self/limits");
                }
                Err(error) => {
                    info!(%error, "no limit stats due to error");
                }
            }

            match myself.status() {
                Ok(status) => {
                    let procfs::process::Status {
                        vmsize,
                        vmlck,
                        vmpin,
                        vmrss,
                        rssanon,
                        rssfile,
                        rssshmem,
                        vmdata,
                        vmstk,
                        vmexe,
                        vmlib,
                        vmpte,
                        threads,
                        ..
                    } = status;
                    info!(
                        vmsize,
                        vmlck,
                        vmpin,
                        vmrss,
                        rssanon,
                        rssfile,
                        rssshmem,
                        vmdata,
                        vmstk,
                        vmexe,
                        vmlib,
                        vmpte,
                        threads,
                        "/proc/self/status"
                    );
                }
                Err(error) => {
                    info!(%error, "no status status due to error");
                }
            }
        }
        Err(error) => {
            info!(%error, "no process stats due to error");
        }
    };
}

/// Handle returned by [`thread_local_system`]; dereferences to the launched system handle.
#[derive(Clone)]
pub struct Handle(ThreadLocalState);

impl std::ops::Deref for Handle {
    type Target = SystemHandle;

    /// Panics if the underlying cell has not been initialized.
    fn deref(&self) -> &Self::Target {
        self.0
            .0
            .cell
            .get()
            .expect("must be already initialized when using this")
    }
}
================================================
FILE: pageserver/src/virtual_file/io_engine.rs
================================================
//! [`super::VirtualFile`] supports different IO engines.
//!
//! The [`IoEngineKind`] enum identifies them.
//!
//! The choice of IO engine is global.
//!
Initialize using [`init`].
//!
//! Then use [`get`] and [`super::OpenOptions`].
//!
//!

#[cfg(target_os = "linux")]
pub(super) mod tokio_epoll_uring_ext;

use tokio_epoll_uring::IoBuf;
use tracing::Instrument;

pub(crate) use super::api::IoEngineKind;

/// The process-wide IO engine selection, stored as a `u8` in [`IO_ENGINE`].
///
/// Unlike [`IoEngineKind`], this enum has a `NotSet` state for "no engine configured yet".
#[derive(Clone, Copy)]
#[repr(u8)]
pub(crate) enum IoEngine {
    NotSet,
    StdFs,
    #[cfg(target_os = "linux")]
    TokioEpollUring,
}

// The trait's type argument was missing here (`impl From for IoEngine`); it is fixed by the
// signature of `from`, which takes an `IoEngineKind`.
impl From<IoEngineKind> for IoEngine {
    /// Maps each configurable engine kind to its runtime counterpart.
    fn from(value: IoEngineKind) -> Self {
        match value {
            IoEngineKind::StdFs => IoEngine::StdFs,
            #[cfg(target_os = "linux")]
            IoEngineKind::TokioEpollUring => IoEngine::TokioEpollUring,
        }
    }
}

// Same as above: `try_from` takes a `u8` and the error type is declared as `u8`.
impl TryFrom<u8> for IoEngine {
    type Error = u8;

    /// Decodes a discriminant previously produced by `engine as u8`.
    ///
    /// Returns the unrecognized value as the error if it matches no variant.
    fn try_from(value: u8) -> Result<Self, Self::Error> {
        Ok(match value {
            v if v == (IoEngine::NotSet as u8) => IoEngine::NotSet,
            v if v == (IoEngine::StdFs as u8) => IoEngine::StdFs,
            #[cfg(target_os = "linux")]
            v if v == (IoEngine::TokioEpollUring as u8) => IoEngine::TokioEpollUring,
            x => return Err(x),
        })
    }
}

// Holds the current `IoEngine` discriminant; starts out as `NotSet`.
static IO_ENGINE: AtomicU8 = AtomicU8::new(IoEngine::NotSet as u8);

/// Stores `engine_kind` as the global IO engine.
///
/// Outside of test builds this also resets the `virtual_file_io_engine::KIND` metric and sets
/// the label for the chosen engine to 1.
pub(crate) fn set(engine_kind: IoEngineKind) {
    let engine: IoEngine = engine_kind.into();
    IO_ENGINE.store(engine as u8, std::sync::atomic::Ordering::Relaxed);
    #[cfg(not(test))]
    {
        let metric = &crate::metrics::virtual_file_io_engine::KIND;
        metric.reset();
        metric
            .with_label_values(&[&format!("{engine_kind}")])
            .set(1);
    }
}

/// Initializes the global IO engine; only compiled outside of test builds.
#[cfg(not(test))]
pub(super) fn init(engine_kind: IoEngineKind) {
    set(engine_kind);
}

/// Longer-term, this API should only be used by [`super::VirtualFile`].
// Returns the globally configured IO engine.
//
// In test builds, an unset engine is initialized on first use: from the environment variable
// `NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE` if present (panicking if it is invalid or
// not unicode), otherwise tokio-epoll-uring on Linux and std-fs elsewhere.
// Outside of test builds the stored value is returned as is, which may be `NotSet`.
pub(crate) fn get() -> IoEngine {
    // The stored byte is only ever written from an `IoEngine` discriminant, hence the unwrap.
    let cur = IoEngine::try_from(IO_ENGINE.load(Ordering::Relaxed)).unwrap();
    if cfg!(test) {
        let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
        match cur {
            IoEngine::NotSet => {
                let kind = match std::env::var(env_var_name) {
                    // NOTE(review): the turbofish below is empty in this copy; the type argument
                    // looks lost — confirm against the repository.
                    Ok(v) => match v.parse::() {
                        Ok(engine_kind) => engine_kind,
                        Err(e) => {
                            panic!(
                                "invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}"
                            )
                        }
                    },
                    Err(std::env::VarError::NotPresent) => {
                        #[cfg(target_os = "linux")]
                        {
                            IoEngineKind::TokioEpollUring
                        }
                        #[cfg(not(target_os = "linux"))]
                        {
                            IoEngineKind::StdFs
                        }
                    }
                    Err(std::env::VarError::NotUnicode(_)) => {
                        panic!("env var {env_var_name} is not unicode");
                    }
                };
                // Persist the choice, then re-read it; the recursive call hits the `x => x` arm.
                self::set(kind);
                self::get()
            }
            x => x,
        }
    } else {
        cur
    }
}

use std::os::unix::prelude::FileExt;
use std::sync::atomic::{AtomicU8, Ordering};

#[cfg(target_os = "linux")]
use {std::time::Duration, tracing::info};

use super::owned_buffers_io::io_buf_ext::FullSlice;
use super::owned_buffers_io::slice::SliceMutExt;
use super::{FileGuard, Metadata};

/// Converts a tokio-epoll-uring error into a `std::io::Error`.
///
/// An operation error is returned unchanged; a system error is wrapped with
/// `std::io::Error::other`.
#[cfg(target_os = "linux")]
pub(super) fn epoll_uring_error_to_std(
    e: tokio_epoll_uring::Error,
) -> std::io::Error {
    match e {
        tokio_epoll_uring::Error::Op(e) => e,
        tokio_epoll_uring::Error::System(system) => std::io::Error::other(system),
    }
}

// Every operation below follows the same shape:
// - `NotSet` panics,
// - `StdFs` performs the call synchronously on the std file behind the guard,
// - `TokioEpollUring` submits the operation on the thread-local system, wrapped in
//   `retry_ecanceled_once`, and converts the error with `epoll_uring_error_to_std`.
// The file guard (and buffer, where applicable) is always handed back to the caller together
// with the result.
impl IoEngine {
    /// Reads from the file at `offset` into `slice`.
    // NOTE(review): the generic parameter list and some type arguments of this signature are
    // missing in this copy (`Buf` is only mentioned in the `where` clause) — confirm.
    pub(super) async fn read_at(
        &self,
        file_guard: FileGuard,
        offset: u64,
        mut slice: tokio_epoll_uring::Slice,
    ) -> (
        (FileGuard, tokio_epoll_uring::Slice),
        std::io::Result,
    )
    where
        Buf: tokio_epoll_uring::IoBufMut + Send,
    {
        match self {
            IoEngine::NotSet => panic!("not initialized"),
            IoEngine::StdFs => {
                // presumably zero-fills the slice so it can be exposed as an initialized
                // `&mut [u8]` to the std read call — verify in `SliceMutExt`.
                let rust_slice = slice.as_mut_rust_slice_full_zeroed();
                let res = file_guard.with_std_file(|std_file| std_file.read_at(rust_slice, offset));
                ((file_guard, slice), res)
            }
            #[cfg(target_os = "linux")]
            IoEngine::TokioEpollUring => {
                let system = tokio_epoll_uring_ext::thread_local_system().await;
                let (resources, res) =
                    retry_ecanceled_once((file_guard, slice), |(file_guard, slice)| async {
                        system.read(file_guard, offset, slice).await
                    })
                    .await;
                (resources, res.map_err(epoll_uring_error_to_std))
            }
        }
    }

    /// Syncs the file via `sync_all` (std) or `fsync` (tokio-epoll-uring).
    pub(super) async fn sync_all(&self, file_guard: FileGuard) -> (FileGuard, std::io::Result<()>) {
        match self {
            IoEngine::NotSet => panic!("not initialized"),
            IoEngine::StdFs => {
                let res = file_guard.with_std_file(|std_file| std_file.sync_all());
                (file_guard, res)
            }
            #[cfg(target_os = "linux")]
            IoEngine::TokioEpollUring => {
                let system = tokio_epoll_uring_ext::thread_local_system().await;
                let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async {
                    system.fsync(file_guard).await
                })
                .await;
                (resources, res.map_err(epoll_uring_error_to_std))
            }
        }
    }

    /// Syncs the file via `sync_data` (std) or `fdatasync` (tokio-epoll-uring).
    pub(super) async fn sync_data(
        &self,
        file_guard: FileGuard,
    ) -> (FileGuard, std::io::Result<()>) {
        match self {
            IoEngine::NotSet => panic!("not initialized"),
            IoEngine::StdFs => {
                let res = file_guard.with_std_file(|std_file| std_file.sync_data());
                (file_guard, res)
            }
            #[cfg(target_os = "linux")]
            IoEngine::TokioEpollUring => {
                let system = tokio_epoll_uring_ext::thread_local_system().await;
                let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async {
                    system.fdatasync(file_guard).await
                })
                .await;
                (resources, res.map_err(epoll_uring_error_to_std))
            }
        }
    }

    /// Fetches file metadata via `metadata()` (std) or `statx` (tokio-epoll-uring), converted
    /// into the engine-independent [`Metadata`] type.
    pub(super) async fn metadata(
        &self,
        file_guard: FileGuard,
    ) -> (FileGuard, std::io::Result) {
        match self {
            IoEngine::NotSet => panic!("not initialized"),
            IoEngine::StdFs => {
                let res =
                    file_guard.with_std_file(|std_file| std_file.metadata().map(Metadata::from));
                (file_guard, res)
            }
            #[cfg(target_os = "linux")]
            IoEngine::TokioEpollUring => {
                let system = tokio_epoll_uring_ext::thread_local_system().await;
                let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async {
                    system.statx(file_guard).await
                })
                .await;
                (
                    resources,
                    res.map_err(epoll_uring_error_to_std).map(Metadata::from),
                )
            }
        }
    }

    /// Truncates or extends the file to `len` bytes.
    ///
    /// Both engines currently use the synchronous std `set_len` call.
    pub(super) async fn set_len(
        &self,
        file_guard: FileGuard,
        len: u64,
    ) -> (FileGuard, std::io::Result<()>) {
        match self {
            IoEngine::NotSet => panic!("not initialized"),
            IoEngine::StdFs => {
                let res = file_guard.with_std_file(|std_file| std_file.set_len(len));
                (file_guard, res)
            }
            #[cfg(target_os = "linux")]
            IoEngine::TokioEpollUring => {
                // TODO: ftruncate op for tokio-epoll-uring
                // Don't forget to use retry_ecanceled_once
                let res = file_guard.with_std_file(|std_file| std_file.set_len(len));
                (file_guard, res)
            }
        }
    }

    /// Writes `buf` to the file at `offset`.
    // NOTE(review): as with `read_at`, the generic parameters / type arguments of this
    // signature appear to be missing in this copy — confirm.
    pub(super) async fn write_at(
        &self,
        file_guard: FileGuard,
        offset: u64,
        buf: FullSlice,
    ) -> ((FileGuard, FullSlice), std::io::Result) {
        match self {
            IoEngine::NotSet => panic!("not initialized"),
            IoEngine::StdFs => {
                let result = file_guard.with_std_file(|std_file| std_file.write_at(&buf, offset));
                ((file_guard, buf), result)
            }
            #[cfg(target_os = "linux")]
            IoEngine::TokioEpollUring => {
                let system = tokio_epoll_uring_ext::thread_local_system().await;
                // The buffer is unwrapped into its raw slice for submission and re-wrapped below.
                let ((file_guard, slice), res) = retry_ecanceled_once(
                    (file_guard, buf.into_raw_slice()),
                    async |(file_guard, buf)| system.write(file_guard, offset, buf).await,
                )
                .await;
                (
                    (file_guard, FullSlice::must_new(slice)),
                    res.map_err(epoll_uring_error_to_std),
                )
            }
        }
    }

    /// If we switch a user of [`tokio::fs`] to use [`super::io_engine`],
    /// they'd start blocking the executor thread if [`IoEngine::StdFs`] is configured
    /// whereas before the switch to [`super::io_engine`], that wasn't the case.
    /// This method helps avoid such a regression.
    ///
    /// Panics if the `spawn_blocking` fails, see [`tokio::task::JoinError`] for reasons why that can happen.
///
/// With [`IoEngine::StdFs`], `work` is driven to completion via `block_on` on a
/// `spawn_blocking` thread; with [`IoEngine::TokioEpollUring`], it is awaited in place.
/// Panics if no engine is set.
// The generic parameter list and the future's `Output` binding were missing from this
// signature; they follow from `work: Fut`, the `R` return type and `work.await`.
pub(crate) async fn spawn_blocking_and_block_on_if_std<Fut, R>(&self, work: Fut) -> R
where
    Fut: 'static + Send + std::future::Future<Output = R>,
    R: 'static + Send,
{
    match self {
        IoEngine::NotSet => panic!("not initialized"),
        IoEngine::StdFs => {
            // Create the span on the calling task and attach it to `work`, because the future
            // is polled on the blocking thread.
            let span = tracing::info_span!("spawn_blocking_block_on_if_std");
            tokio::task::spawn_blocking({
                move || tokio::runtime::Handle::current().block_on(work.instrument(span))
            })
            .await
            .expect("failed to join blocking code most likely it panicked, panicking as well")
        }
        #[cfg(target_os = "linux")]
        IoEngine::TokioEpollUring => work.await,
    }
}
}

/// We observe in tests that stop pageserver with SIGTERM immediately after it was ingesting data,
/// occasionally buffered writers fail (and get retried by BufferedWriter) with ECANCELED.
/// The problem is believed to be a race condition in how io_uring handles punted async work (io-wq) and signals.
/// Investigation ticket:
///
/// This function retries the operation once if it fails with ECANCELED.
/// ONLY USE FOR IDEMPOTENT [`super::VirtualFile`] operations.
#[cfg(target_os = "linux")] pub(super) async fn retry_ecanceled_once( resources: T, f: F, ) -> (T, Result>) where F: Fn(T) -> Fut, Fut: std::future::Future>)>, T: Send, V: Send, { let (resources, res) = f(resources).await; let Err(e) = res else { return (resources, res); }; let tokio_epoll_uring::Error::Op(err) = e else { return (resources, Err(e)); }; if err.raw_os_error() != Some(nix::libc::ECANCELED) { return (resources, Err(tokio_epoll_uring::Error::Op(err))); } { static RATE_LIMIT: std::sync::Mutex = std::sync::Mutex::new(utils::rate_limit::RateLimit::new(Duration::from_secs(1))); let mut guard = RATE_LIMIT.lock().unwrap(); guard.call2(|rate_limit_stats| { info!( %rate_limit_stats, "ECANCELED observed, assuming it is due to a signal being received by the submitting thread, retrying after a delay; this message is rate-limited" ); }); drop(guard); } tokio::time::sleep(Duration::from_millis(100)).await; // something big enough to beat even heavily overcommitted CI runners let (resources, res) = f(resources).await; (resources, res) } pub(super) fn panic_operation_must_be_idempotent() { panic!( "unsupported; io_engine may retry operations internally and thus needs them to be idempotent (retry_ecanceled_once)" ) } pub enum FeatureTestResult { PlatformPreferred(IoEngineKind), Worse { engine: IoEngineKind, remark: String, }, } impl FeatureTestResult { #[cfg(target_os = "linux")] const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::TokioEpollUring; #[cfg(not(target_os = "linux"))] const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::StdFs; } impl From for IoEngineKind { fn from(val: FeatureTestResult) -> Self { match val { FeatureTestResult::PlatformPreferred(e) => e, FeatureTestResult::Worse { engine, .. } => engine, } } } /// Somewhat costly under the hood, do only once. /// Panics if we can't set up the feature test. 
pub fn feature_test() -> anyhow::Result { std::thread::spawn(|| { #[cfg(not(target_os = "linux"))] { Ok(FeatureTestResult::PlatformPreferred( FeatureTestResult::PLATFORM_PREFERRED, )) } #[cfg(target_os = "linux")] { let rt = tokio::runtime::Builder::new_current_thread() .enable_all() .build() .unwrap(); Ok(match rt.block_on(tokio_epoll_uring::System::launch()) { Ok(_) => FeatureTestResult::PlatformPreferred({ assert!(matches!( IoEngineKind::TokioEpollUring, FeatureTestResult::PLATFORM_PREFERRED )); FeatureTestResult::PLATFORM_PREFERRED }), Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) => { let remark = match e.raw_os_error() { Some(nix::libc::EPERM) => { // fall back "creating tokio-epoll-uring fails with EPERM, assuming it's admin-disabled " .to_string() } Some(nix::libc::EFAULT) => { // fail feature test anyhow::bail!( "creating tokio-epoll-uring fails with EFAULT, might have corrupted memory" ); } Some(_) | None => { // fall back format!("creating tokio-epoll-uring fails with error: {e:#}") } }; FeatureTestResult::Worse { engine: IoEngineKind::StdFs, remark, } } }) } }) .join() .unwrap() } /// For use in benchmark binaries only. /// /// Benchmarks which initialize `virtual_file` need to know what engine to use, but we also /// don't want to silently fall back to slower I/O engines in a benchmark: this could waste /// developer time trying to figure out why it's slow. /// /// In practice, this method will either return IoEngineKind::TokioEpollUring, or panic. 
// Panics on non-Linux platforms, if the feature test fails, or if the feature test reports a
// fallback engine.
pub fn io_engine_for_bench() -> IoEngineKind {
    #[cfg(not(target_os = "linux"))]
    {
        panic!("This benchmark does I/O and can only give a representative result on Linux");
    }
    #[cfg(target_os = "linux")]
    {
        match feature_test().unwrap() {
            FeatureTestResult::PlatformPreferred(engine) => engine,
            FeatureTestResult::Worse {
                engine: _engine,
                remark,
            } => {
                panic!("This benchmark does I/O can requires the preferred I/O engine: {remark}");
            }
        }
    }
}
================================================
FILE: pageserver/src/virtual_file/metadata.rs
================================================
use std::fs;

/// File metadata as returned by the IO engine that performed the lookup.
// NOTE(review): the payload type inside `Box` (and the matching `From` impl arguments below)
// is missing in this copy; the `len` accessor reads an `stx_size` field from it — confirm the
// exact type against the repository.
pub enum Metadata {
    StdFs(fs::Metadata),
    #[cfg(target_os = "linux")]
    TokioEpollUring(Box),
}

#[cfg(target_os = "linux")]
impl From> for Metadata {
    /// Wraps the boxed value in [`Metadata::TokioEpollUring`].
    fn from(value: Box) -> Self {
        Metadata::TokioEpollUring(value)
    }
}

impl From for Metadata {
    /// Wraps std metadata in [`Metadata::StdFs`].
    fn from(value: std::fs::Metadata) -> Self {
        Metadata::StdFs(value)
    }
}

impl Metadata {
    /// Returns the file size in bytes as reported by the underlying metadata.
    pub fn len(&self) -> u64 {
        match self {
            Metadata::StdFs(metadata) => metadata.len(),
            #[cfg(target_os = "linux")]
            Metadata::TokioEpollUring(statx) => statx.stx_size,
        }
    }
}
================================================
FILE: pageserver/src/virtual_file/open_options.rs
================================================
//! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`];

use std::os::fd::OwnedFd;
use std::os::unix::fs::OpenOptionsExt;
use std::path::Path;

use super::io_engine::IoEngine;

/// Builder for opening files through the currently configured IO engine.
///
/// The setters consume and return `self`, so calls can be chained.
#[derive(Debug, Clone)]
pub struct OpenOptions {
    /// We keep a copy of the write() flag we pass to the `inner` `OpenOptions`
    /// to support [`Self::is_write`].
    write: bool,
    /// We don't expose + pass through a raw `custom_flags()` style API.
    /// The only custom flag we support is `O_DIRECT`, which we track here
    /// and map to `custom_flags()` in the [`Self::open`] method.
    direct: bool,
    inner: Inner,
}

/// The engine-specific options object that the setters forward to.
#[derive(Debug, Clone)]
enum Inner {
    StdFs(std::fs::OpenOptions),
    #[cfg(target_os = "linux")]
    TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions),
}

impl Default for OpenOptions {
    /// Creates empty options for the engine returned by `io_engine::get()`.
    ///
    /// Panics if no engine is set.
    fn default() -> Self {
        let inner = match super::io_engine::get() {
            IoEngine::NotSet => panic!("io engine not set"),
            IoEngine::StdFs => Inner::StdFs(std::fs::OpenOptions::new()),
            #[cfg(target_os = "linux")]
            IoEngine::TokioEpollUring => {
                Inner::TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions::new())
            }
        };
        Self {
            write: false,
            direct: false,
            inner,
        }
    }
}

impl OpenOptions {
    /// Same as [`Self::default`].
    pub fn new() -> OpenOptions {
        Self::default()
    }

    /// Returns the value last passed to [`Self::write`] (initially `false`).
    pub(super) fn is_write(&self) -> bool {
        self.write
    }

    /// Returns the value last passed to `direct()` (initially `false`).
    pub(super) fn is_direct(&self) -> bool {
        self.direct
    }

    // The setters below forward the flag to the engine-specific options. The inner setters
    // return a reference to themselves, which is deliberately discarded with `let _ =`.

    /// Forwards the `read` flag to the inner options.
    pub fn read(mut self, read: bool) -> Self {
        match &mut self.inner {
            Inner::StdFs(x) => {
                let _ = x.read(read);
            }
            #[cfg(target_os = "linux")]
            Inner::TokioEpollUring(x) => {
                let _ = x.read(read);
            }
        }
        self
    }

    /// Records the `write` flag locally (for [`Self::is_write`]) and forwards it to the inner
    /// options.
    pub fn write(mut self, write: bool) -> Self {
        self.write = write;
        match &mut self.inner {
            Inner::StdFs(x) => {
                let _ = x.write(write);
            }
            #[cfg(target_os = "linux")]
            Inner::TokioEpollUring(x) => {
                let _ = x.write(write);
            }
        }
        self
    }

    /// Forwards the `create` flag to the inner options.
    pub fn create(mut self, create: bool) -> Self {
        match &mut self.inner {
            Inner::StdFs(x) => {
                let _ = x.create(create);
            }
            #[cfg(target_os = "linux")]
            Inner::TokioEpollUring(x) => {
                let _ = x.create(create);
            }
        }
        self
    }

    /// Forwards the `create_new` flag to the inner options.
    pub fn create_new(mut self, create_new: bool) -> Self {
        match &mut self.inner {
            Inner::StdFs(x) => {
                let _ = x.create_new(create_new);
            }
            #[cfg(target_os = "linux")]
            Inner::TokioEpollUring(x) => {
                let _ = x.create_new(create_new);
            }
        }
        self
    }

    /// Forwards the `truncate` flag to the inner options.
    pub fn truncate(mut self, truncate: bool) -> Self {
        match &mut self.inner {
            Inner::StdFs(x) => {
                let _ = x.truncate(truncate);
            }
            #[cfg(target_os = "linux")]
            Inner::TokioEpollUring(x) => {
                let _ = x.truncate(truncate);
            }
        }
        self
    }

    /// Don't use, `O_APPEND` is not supported.
pub fn append(&mut self, _append: bool) { super::io_engine::panic_operation_must_be_idempotent(); } pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result { #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] let mut custom_flags = 0; if self.direct { #[cfg(target_os = "linux")] { custom_flags |= nix::libc::O_DIRECT; } #[cfg(not(target_os = "linux"))] { // Other platforms may be used for development but don't necessarily have a 1:1 equivalent to Linux's O_DIRECT (macOS!). // Just don't set the flag; to catch alignment bugs typical for O_DIRECT, // we have a runtime validation layer inside `VirtualFile::write_at` and `VirtualFile::read_at`. static WARNING: std::sync::Once = std::sync::Once::new(); WARNING.call_once(|| { let span = tracing::info_span!(parent: None, "open_options"); let _enter = span.enter(); tracing::warn!("your platform is not a supported production platform, ignoring request for O_DIRECT; this could hide alignment bugs; this warning is logged once per process"); }); } } match self.inner.clone() { Inner::StdFs(mut x) => x .custom_flags(custom_flags) .open(path) .map(|file| file.into()), #[cfg(target_os = "linux")] Inner::TokioEpollUring(mut x) => { x.custom_flags(custom_flags); let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await; let (_, res) = super::io_engine::retry_ecanceled_once((), |()| async { let res = system.open(path, &x).await; ((), res) }) .await; res.map_err(super::io_engine::epoll_uring_error_to_std) } } } pub fn mode(mut self, mode: u32) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.mode(mode); } #[cfg(target_os = "linux")] Inner::TokioEpollUring(x) => { let _ = x.mode(mode); } } self } /// Records whether `O_DIRECT` should be requested; the flag is applied in `open`. pub fn direct(mut self, direct: bool) -> Self { self.direct = direct; self } } ================================================ FILE: pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs ================================================ pub trait Alignment:
std::marker::Unpin + 'static { /// Returns the required alignment. fn align(&self) -> usize; } /// Alignment at compile time. #[derive(Debug, Clone, Copy)] pub struct ConstAlign; impl Alignment for ConstAlign { fn align(&self) -> usize { A } } /// Alignment at run time. #[derive(Debug, Clone, Copy)] pub struct RuntimeAlign { align: usize, } impl Alignment for RuntimeAlign { fn align(&self) -> usize { self.align } } ================================================ FILE: pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs ================================================ use std::ops::{Deref, Range, RangeBounds}; use std::sync::Arc; use super::alignment::Alignment; use super::raw::RawAlignedBuffer; use super::{AlignedBufferMut, ConstAlign}; /// A shared, immutable aligned buffer type. #[derive(Clone, Debug)] pub struct AlignedBuffer { /// Shared raw buffer. raw: Arc>, /// Range that specifies the current slice. range: Range, } impl AlignedBuffer { /// Creates an immutable `AlignedBuffer` from the raw buffer and the byte range of it to expose. pub(super) fn from_raw(raw: RawAlignedBuffer, range: Range) -> Self { AlignedBuffer { raw: Arc::new(raw), range, } } /// Returns the number of bytes in the buffer, also referred to as its 'length'. #[inline] pub fn len(&self) -> usize { self.range.len() } /// Returns the alignment of the buffer. #[inline] pub fn align(&self) -> usize { self.raw.align() } #[inline] fn as_ptr(&self) -> *const u8 { // SAFETY: `self.range.start` is an offset into `self.raw` and is expected to be within `[0, self.raw.len()]`; // `slice` asserts its bounds before offsetting the range. unsafe { self.raw.as_ptr().add(self.range.start) } } /// Extracts a slice containing the entire buffer. /// /// Equivalent to `&s[..]`. #[inline] fn as_slice(&self) -> &[u8] { &self.raw.as_slice()[self.range.start..self.range.end] } /// Returns a slice of self for the index range `[begin..end)`.
pub fn slice(&self, range: impl RangeBounds) -> Self { use core::ops::Bound; let len = self.len(); let begin = match range.start_bound() { Bound::Included(&n) => n, Bound::Excluded(&n) => n.checked_add(1).expect("out of range"), Bound::Unbounded => 0, }; let end = match range.end_bound() { Bound::Included(&n) => n.checked_add(1).expect("out of range"), Bound::Excluded(&n) => n, Bound::Unbounded => len, }; assert!( begin <= end, "range start must not be greater than end: {begin:?} <= {end:?}", ); assert!(end <= len, "range end out of bounds: {end:?} <= {len:?}",); let begin = self.range.start + begin; let end = self.range.start + end; AlignedBuffer { raw: Arc::clone(&self.raw), range: begin..end, } } /// Returns the mutable aligned buffer, if the immutable aligned buffer /// has exactly one strong reference. Otherwise returns `None`. pub fn into_mut(self) -> Option> { let raw = Arc::into_inner(self.raw)?; Some(AlignedBufferMut::from_raw(raw)) } } impl Deref for AlignedBuffer { type Target = [u8]; fn deref(&self) -> &Self::Target { self.as_slice() } } impl AsRef<[u8]> for AlignedBuffer { fn as_ref(&self) -> &[u8] { self.as_slice() } } impl PartialEq<[u8]> for AlignedBuffer { fn eq(&self, other: &[u8]) -> bool { self.as_slice().eq(other) } } impl From<&[u8; N]> for AlignedBuffer> { fn from(value: &[u8; N]) -> Self { let mut buf = AlignedBufferMut::with_capacity(N); buf.extend_from_slice(value); buf.freeze() } } /// SAFETY: the underlying buffer references a stable memory region. 
unsafe impl tokio_epoll_uring::IoBuf for AlignedBuffer { fn stable_ptr(&self) -> *const u8 { self.as_ptr() } fn bytes_init(&self) -> usize { self.len() } fn bytes_total(&self) -> usize { self.len() } } ================================================ FILE: pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs ================================================ use std::mem::MaybeUninit; use std::ops::{Deref, DerefMut}; use super::alignment::{Alignment, ConstAlign}; use super::buffer::AlignedBuffer; use super::raw::RawAlignedBuffer; /// A mutable aligned buffer type. #[derive(Debug)] pub struct AlignedBufferMut { raw: RawAlignedBuffer, } impl AlignedBufferMut> { /// Constructs a new, empty `IoBufferMut` with at least the specified capacity and alignment. /// /// The buffer will be able to hold at most `capacity` elements and will never resize. /// /// /// # Panics /// /// Panics if the new capacity exceeds `isize::MAX` _bytes_, or if the following alignment requirement is not met: /// * `align` must not be zero, /// /// * `align` must be a power of two, /// /// * `capacity`, when rounded up to the nearest multiple of `align`, /// must not overflow isize (i.e., the rounded value must be /// less than or equal to `isize::MAX`). pub fn with_capacity(capacity: usize) -> Self { AlignedBufferMut { raw: RawAlignedBuffer::with_capacity(capacity), } } /// Constructs a new `IoBufferMut` with at least the specified capacity and alignment, filled with zeros. pub fn with_capacity_zeroed(capacity: usize) -> Self { use bytes::BufMut; let mut buf = Self::with_capacity(capacity); buf.put_bytes(0, capacity); // SAFETY: `put_bytes` filled the entire buffer. unsafe { buf.set_len(capacity) }; buf } } impl AlignedBufferMut { /// Constructs a mutable aligned buffer from raw. pub(super) fn from_raw(raw: RawAlignedBuffer) -> Self { AlignedBufferMut { raw } } /// Returns the total number of bytes the buffer can hold. 
#[inline] pub fn capacity(&self) -> usize { self.raw.capacity() } /// Returns the alignment of the buffer. #[inline] pub fn align(&self) -> usize { self.raw.align() } /// Returns the number of bytes in the buffer, also referred to as its 'length'. #[inline] pub fn len(&self) -> usize { self.raw.len() } /// Force the length of the buffer to `new_len`. #[inline] unsafe fn set_len(&mut self, new_len: usize) { // SAFETY: the caller is unsafe unsafe { self.raw.set_len(new_len) } } #[inline] fn as_ptr(&self) -> *const u8 { self.raw.as_ptr() } #[inline] fn as_mut_ptr(&mut self) -> *mut u8 { self.raw.as_mut_ptr() } /// Extracts a slice containing the entire buffer. /// /// Equivalent to `&s[..]`. #[inline] fn as_slice(&self) -> &[u8] { self.raw.as_slice() } /// Extracts a mutable slice of the entire buffer. /// /// Equivalent to `&mut s[..]`. fn as_mut_slice(&mut self) -> &mut [u8] { self.raw.as_mut_slice() } /// Drops the all the contents of the buffer, setting its length to `0`. #[inline] pub fn clear(&mut self) { self.raw.clear() } /// Reserves capacity for at least `additional` more bytes to be inserted /// in the given `IoBufferMut`. The collection may reserve more space to /// speculatively avoid frequent reallocations. After calling `reserve`, /// capacity will be greater than or equal to `self.len() + additional`. /// Does nothing if capacity is already sufficient. /// /// # Panics /// /// Panics if the new capacity exceeds `isize::MAX` _bytes_. pub fn reserve(&mut self, additional: usize) { self.raw.reserve(additional); } /// Shortens the buffer, keeping the first len bytes. pub fn truncate(&mut self, len: usize) { self.raw.truncate(len); } /// Consumes and leaks the `IoBufferMut`, returning a mutable reference to the contents, &'a mut [u8]. pub fn leak<'a>(self) -> &'a mut [u8] { self.raw.leak() } pub fn freeze(self) -> AlignedBuffer { let len = self.len(); AlignedBuffer::from_raw(self.raw, 0..len) } /// Clones and appends all elements in a slice to the buffer. 
Reserves additional capacity as needed. #[inline] pub fn extend_from_slice(&mut self, extend: &[u8]) { let cnt = extend.len(); self.reserve(cnt); // SAFETY: we already reserved additional `cnt` bytes, safe to perform memcpy. unsafe { let dst = self.spare_capacity_mut(); // Reserved above debug_assert!(dst.len() >= cnt); core::ptr::copy_nonoverlapping(extend.as_ptr(), dst.as_mut_ptr().cast(), cnt); } // SAFETY: We do have at least `cnt` bytes remaining before advance. unsafe { bytes::BufMut::advance_mut(self, cnt); } } /// Returns the remaining spare capacity of the buffer as a slice of `MaybeUninit`. #[inline] fn spare_capacity_mut(&mut self) -> &mut [MaybeUninit] { // SAFETY: we guarantee that the `Self::capacity()` bytes from // `Self::as_mut_ptr()` are allocated. unsafe { let ptr = self.as_mut_ptr().add(self.len()); let len = self.capacity() - self.len(); core::slice::from_raw_parts_mut(ptr.cast(), len) } } } impl Deref for AlignedBufferMut { type Target = [u8]; fn deref(&self) -> &Self::Target { self.as_slice() } } impl DerefMut for AlignedBufferMut { fn deref_mut(&mut self) -> &mut Self::Target { self.as_mut_slice() } } impl AsRef<[u8]> for AlignedBufferMut { fn as_ref(&self) -> &[u8] { self.as_slice() } } impl AsMut<[u8]> for AlignedBufferMut { fn as_mut(&mut self) -> &mut [u8] { self.as_mut_slice() } } impl PartialEq<[u8]> for AlignedBufferMut { fn eq(&self, other: &[u8]) -> bool { self.as_slice().eq(other) } } /// SAFETY: When advancing the internal cursor, the caller needs to make sure the bytes advanced past have been initialized. unsafe impl bytes::BufMut for AlignedBufferMut { #[inline] fn remaining_mut(&self) -> usize { // Although a `Vec` can have at most isize::MAX bytes, we never want to grow `IoBufferMut`. // Thus, it can have at most `self.capacity` bytes. self.capacity() - self.len() } // SAFETY: Caller needs to make sure the bytes being advanced past have been initialized.
#[inline] unsafe fn advance_mut(&mut self, cnt: usize) { let len = self.len(); let remaining = self.remaining_mut(); if remaining < cnt { panic_advance(cnt, remaining); } // SAFETY: Addition will not overflow since the sum is at most the capacity. unsafe { self.set_len(len + cnt); } } #[inline] fn chunk_mut(&mut self) -> &mut bytes::buf::UninitSlice { let cap = self.capacity(); let len = self.len(); // SAFETY: Since `self.ptr` is valid for `cap` bytes, `self.ptr.add(len)` must be // valid for `cap - len` bytes. The subtraction will not underflow since // `len <= cap`. unsafe { bytes::buf::UninitSlice::from_raw_parts_mut(self.as_mut_ptr().add(len), cap - len) } } } /// Panic with a nice error message. #[cold] fn panic_advance(idx: usize, len: usize) -> ! { panic!("advance out of bounds: the len is {len} but advancing by {idx}"); } /// Safety: [`AlignedBufferMut`] has exclusive ownership of the io buffer, /// and the underlying pointer remains stable while io-uring is owning the buffer. /// The tokio-epoll-uring crate itself will not resize the buffer and will respect /// [`tokio_epoll_uring::IoBuf::bytes_total`]. unsafe impl tokio_epoll_uring::IoBuf for AlignedBufferMut { fn stable_ptr(&self) -> *const u8 { self.as_ptr() } fn bytes_init(&self) -> usize { self.len() } fn bytes_total(&self) -> usize { self.capacity() } } // SAFETY: See above. 
unsafe impl tokio_epoll_uring::IoBufMut for AlignedBufferMut { fn stable_mut_ptr(&mut self) -> *mut u8 { self.as_mut_ptr() } unsafe fn set_init(&mut self, init_len: usize) { if self.len() < init_len { // SAFETY: caller function is unsafe unsafe { self.set_len(init_len); } } } } impl std::io::Write for AlignedBufferMut { fn write(&mut self, buf: &[u8]) -> std::io::Result { self.extend_from_slice(buf); Ok(buf.len()) } fn flush(&mut self) -> std::io::Result<()> { Ok(()) } } #[cfg(test)] mod tests { use super::*; const ALIGN: usize = 4 * 1024; type TestIoBufferMut = AlignedBufferMut>; #[test] fn test_with_capacity() { let v = TestIoBufferMut::with_capacity(ALIGN * 4); assert_eq!(v.len(), 0); assert_eq!(v.capacity(), ALIGN * 4); assert_eq!(v.align(), ALIGN); assert_eq!(v.as_ptr().align_offset(ALIGN), 0); let v = TestIoBufferMut::with_capacity(ALIGN / 2); assert_eq!(v.len(), 0); assert_eq!(v.capacity(), ALIGN / 2); assert_eq!(v.align(), ALIGN); assert_eq!(v.as_ptr().align_offset(ALIGN), 0); } #[test] fn test_with_capacity_zeroed() { let v = TestIoBufferMut::with_capacity_zeroed(ALIGN); assert_eq!(v.len(), ALIGN); assert_eq!(v.capacity(), ALIGN); assert_eq!(v.align(), ALIGN); assert_eq!(v.as_ptr().align_offset(ALIGN), 0); assert_eq!(&v[..], &[0; ALIGN]) } #[test] fn test_reserve() { use bytes::BufMut; let mut v = TestIoBufferMut::with_capacity(ALIGN); let capacity = v.capacity(); v.reserve(capacity); assert_eq!(v.capacity(), capacity); let data = [b'a'; ALIGN]; v.put(&data[..]); v.reserve(capacity); assert!(v.capacity() >= capacity * 2); assert_eq!(&v[..], &data[..]); let capacity = v.capacity(); v.clear(); v.reserve(capacity); assert_eq!(capacity, v.capacity()); } #[test] fn test_bytes_put() { use bytes::BufMut; let mut v = TestIoBufferMut::with_capacity(ALIGN * 4); let x = [b'a'; ALIGN]; for _ in 0..2 { for _ in 0..4 { v.put(&x[..]); } assert_eq!(v.len(), ALIGN * 4); assert_eq!(v.capacity(), ALIGN * 4); assert_eq!(v.align(), ALIGN); 
assert_eq!(v.as_ptr().align_offset(ALIGN), 0); v.clear() } assert_eq!(v.len(), 0); assert_eq!(v.capacity(), ALIGN * 4); assert_eq!(v.align(), ALIGN); assert_eq!(v.as_ptr().align_offset(ALIGN), 0); } #[test] #[should_panic] fn test_bytes_put_panic() { use bytes::BufMut; const ALIGN: usize = 4 * 1024; let mut v = TestIoBufferMut::with_capacity(ALIGN * 4); let x = [b'a'; ALIGN]; for _ in 0..5 { v.put_slice(&x[..]); } } #[test] fn test_io_buf_put_slice() { use tokio_epoll_uring::BoundedBufMut; const ALIGN: usize = 4 * 1024; let mut v = TestIoBufferMut::with_capacity(ALIGN); let x = [b'a'; ALIGN]; for _ in 0..2 { v.put_slice(&x[..]); assert_eq!(v.len(), ALIGN); assert_eq!(v.capacity(), ALIGN); assert_eq!(v.align(), ALIGN); assert_eq!(v.as_ptr().align_offset(ALIGN), 0); v.clear() } assert_eq!(v.len(), 0); assert_eq!(v.capacity(), ALIGN); assert_eq!(v.align(), ALIGN); assert_eq!(v.as_ptr().align_offset(ALIGN), 0); } } ================================================ FILE: pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs ================================================ use core::slice; use std::alloc::{self, Layout}; use std::cmp; use std::mem::ManuallyDrop; use super::alignment::{Alignment, ConstAlign}; #[derive(Debug)] struct AlignedBufferPtr(*mut u8); // SAFETY: We guarantee no one besides `AlignedBufferPtr` itself has the raw pointer. unsafe impl Send for AlignedBufferPtr {} // SAFETY: We guarantee no one besides `AlignedBufferPtr` itself has the raw pointer. unsafe impl Sync for AlignedBufferPtr {} /// An aligned buffer type. #[derive(Debug)] pub struct RawAlignedBuffer { ptr: AlignedBufferPtr, capacity: usize, len: usize, align: A, } impl RawAlignedBuffer> { /// Constructs a new, empty `RawAlignedBuffer` with at least the specified capacity and alignment. /// /// The buffer will be able to hold at most `capacity` elements and will never resize.
/// /// /// # Panics /// /// Panics if the new capacity exceeds `isize::MAX` _bytes_, or if the following alignment requirement is not met: /// * `align` must not be zero, /// /// * `align` must be a power of two, /// /// * `capacity`, when rounded up to the nearest multiple of `align`, /// must not overflow isize (i.e., the rounded value must be /// less than or equal to `isize::MAX`). pub fn with_capacity(capacity: usize) -> Self { let align = ConstAlign::; let layout = Layout::from_size_align(capacity, align.align()).expect("Invalid layout"); // SAFETY: Making an allocation with a sized and aligned layout. The memory is manually freed with the same layout. // NOTE(review): `alloc::alloc` requires a non-zero-size layout and `capacity == 0` is not rejected here; confirm callers never pass 0. let ptr = unsafe { let ptr = alloc::alloc(layout); if ptr.is_null() { alloc::handle_alloc_error(layout); } AlignedBufferPtr(ptr) }; RawAlignedBuffer { ptr, capacity, len: 0, align, } } } impl RawAlignedBuffer { /// Returns the total number of bytes the buffer can hold. #[inline] pub fn capacity(&self) -> usize { self.capacity } /// Returns the alignment of the buffer. #[inline] pub fn align(&self) -> usize { self.align.align() } /// Returns the number of bytes in the buffer, also referred to as its 'length'. #[inline] pub fn len(&self) -> usize { self.len } /// Force the length of the buffer to `new_len`. /// /// # Safety /// /// `new_len` must be at most `capacity()` (only checked by a `debug_assert!`); /// `as_slice` and `as_mut_slice` treat the first `new_len` bytes as initialized. #[inline] pub unsafe fn set_len(&mut self, new_len: usize) { debug_assert!(new_len <= self.capacity()); self.len = new_len; } #[inline] pub fn as_ptr(&self) -> *const u8 { self.ptr.0 } #[inline] pub fn as_mut_ptr(&mut self) -> *mut u8 { self.ptr.0 } /// Extracts a slice containing the entire buffer. /// /// Equivalent to `&s[..]`. #[inline] pub fn as_slice(&self) -> &[u8] { // SAFETY: The pointer is valid and `len` bytes are initialized. unsafe { slice::from_raw_parts(self.as_ptr(), self.len) } } /// Extracts a mutable slice of the entire buffer. /// /// Equivalent to `&mut s[..]`. pub fn as_mut_slice(&mut self) -> &mut [u8] { // SAFETY: The pointer is valid and `len` bytes are initialized.
unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), self.len) } } /// Drops all the contents of the buffer, setting its length to `0`. #[inline] pub fn clear(&mut self) { self.len = 0; } /// Reserves capacity for at least `additional` more bytes to be inserted /// in the given `RawAlignedBuffer`. The collection may reserve more space to /// speculatively avoid frequent reallocations. After calling `reserve`, /// capacity will be greater than or equal to `self.len() + additional`. /// Does nothing if capacity is already sufficient. /// /// # Panics /// /// Panics if the new capacity exceeds `isize::MAX` _bytes_. pub fn reserve(&mut self, additional: usize) { if additional > self.capacity() - self.len() { self.reserve_inner(additional); } } /// Reallocates so that the capacity becomes `max(2 * capacity, len + additional)`. fn reserve_inner(&mut self, additional: usize) { let Some(required_cap) = self.len().checked_add(additional) else { capacity_overflow() }; let old_capacity = self.capacity(); let align = self.align(); // This guarantees exponential growth. The doubling cannot overflow // because `cap <= isize::MAX` and the type of `cap` is `usize`. let cap = cmp::max(old_capacity * 2, required_cap); if !is_valid_alloc(cap) { capacity_overflow() } let new_layout = Layout::from_size_align(cap, self.align()).expect("Invalid layout"); let old_ptr = self.as_mut_ptr(); // SAFETY: old allocation was allocated with std::alloc::alloc with the same layout, // and we call `alloc::handle_alloc_error` on a null pointer. let (ptr, cap) = unsafe { let old_layout = Layout::from_size_align_unchecked(old_capacity, align); let ptr = alloc::realloc(old_ptr, old_layout, new_layout.size()); if ptr.is_null() { alloc::handle_alloc_error(new_layout); } (AlignedBufferPtr(ptr), cap) }; self.ptr = ptr; self.capacity = cap; } /// Shortens the buffer, keeping the first len bytes. /// Does nothing if `len` is greater than the current length. pub fn truncate(&mut self, len: usize) { if len > self.len { return; } self.len = len; } /// Consumes and leaks the `RawAlignedBuffer`, returning a mutable reference to the contents, &'a mut [u8].
pub fn leak<'a>(self) -> &'a mut [u8] { let mut buf = ManuallyDrop::new(self); // SAFETY: leaking the buffer as intended; `ManuallyDrop` keeps `Drop` from freeing the allocation. unsafe { slice::from_raw_parts_mut(buf.as_mut_ptr(), buf.len) } } } fn capacity_overflow() -> ! { panic!("capacity overflow") } // We need to guarantee the following: // * We don't ever allocate `> isize::MAX` byte-size objects. // * We don't overflow `usize::MAX` and actually allocate too little. // // On 64-bit we just need to check for overflow since trying to allocate // `> isize::MAX` bytes will surely fail. On 32-bit and 16-bit we need to add // an extra guard for this in case we're running on a platform which can use // all 4GB in user-space, e.g., PAE or x32. #[inline] fn is_valid_alloc(alloc_size: usize) -> bool { !(usize::BITS < 64 && alloc_size > isize::MAX as usize) } impl Drop for RawAlignedBuffer { fn drop(&mut self) { // SAFETY: memory was allocated with std::alloc::alloc (or reallocated in `reserve_inner`) // with the current `capacity` and alignment, so this layout matches the allocation. unsafe { alloc::dealloc( self.as_mut_ptr(), Layout::from_size_align_unchecked(self.capacity, self.align.align()), ) } } } ================================================ FILE: pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs ================================================ use std::ops::{Deref, DerefMut}; use super::alignment::{Alignment, ConstAlign}; /// Newtype for an aligned slice. pub struct AlignedSlice<'a, const N: usize, A: Alignment> { /// underlying byte slice buf: &'a mut [u8; N], /// alignment marker _align: A, } impl<'a, const N: usize, const A: usize> AlignedSlice<'a, N, ConstAlign> { /// Create a new aligned slice from a mutable byte slice. The input must already satisfy the alignment.
pub unsafe fn new_unchecked(buf: &'a mut [u8; N]) -> Self { let _align = ConstAlign::; assert_eq!(buf.as_ptr().align_offset(_align.align()), 0); AlignedSlice { buf, _align } } } impl Deref for AlignedSlice<'_, N, A> { type Target = [u8; N]; fn deref(&self) -> &Self::Target { self.buf } } impl DerefMut for AlignedSlice<'_, N, A> { fn deref_mut(&mut self) -> &mut Self::Target { self.buf } } impl AsRef<[u8; N]> for AlignedSlice<'_, N, A> { fn as_ref(&self) -> &[u8; N] { self.buf } } ================================================ FILE: pageserver/src/virtual_file/owned_buffers_io/aligned_buffer.rs ================================================ pub mod alignment; pub mod buffer; pub mod buffer_mut; pub mod raw; pub mod slice; pub use alignment::*; pub use buffer_mut::AlignedBufferMut; pub use slice::AlignedSlice; ================================================ FILE: pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs ================================================ use tokio_epoll_uring::{IoBuf, IoBufMut}; use crate::virtual_file::{IoBuffer, IoBufferMut, PageWriteGuardBuf}; /// A marker trait for a mutable aligned buffer type. pub trait IoBufAlignedMut: IoBufMut {} /// A marker trait for an aligned buffer type. pub trait IoBufAligned: IoBuf {} impl IoBufAlignedMut for IoBufferMut {} impl IoBufAligned for IoBuffer {} impl IoBufAlignedMut for PageWriteGuardBuf {} ================================================ FILE: pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs ================================================ //! See [`FullSlice`]. use std::ops::{Deref, Range}; use bytes::{Bytes, BytesMut}; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; use super::write::CheapCloneForRead; use crate::virtual_file::{IoBuffer, IoBufferMut}; /// The true owned equivalent for Rust [`slice`]. Use this for the write path. 
/// /// Unlike [`tokio_epoll_uring::Slice`], which we unfortunately inherited from `tokio-uring`, /// [`FullSlice`] is guaranteed to have all its bytes initialized. This means that /// [`>::len`] is equal to [`Slice::bytes_init`] and [`Slice::bytes_total`]. /// pub struct FullSlice { slice: Slice, } impl FullSlice where B: IoBuf, { pub(crate) fn must_new(slice: Slice) -> Self { assert_eq!(slice.bytes_init(), slice.bytes_total()); FullSlice { slice } } pub(crate) fn into_raw_slice(self) -> Slice { let FullSlice { slice: s } = self; s } } impl Deref for FullSlice where B: IoBuf, { type Target = [u8]; fn deref(&self) -> &[u8] { let rust_slice = &self.slice[..]; assert_eq!(rust_slice.len(), self.slice.bytes_init()); assert_eq!(rust_slice.len(), self.slice.bytes_total()); rust_slice } } impl CheapCloneForRead for FullSlice where B: IoBuf + CheapCloneForRead, { fn cheap_clone(&self) -> Self { let bounds = self.slice.bounds(); let clone = self.slice.get_ref().cheap_clone(); let slice = clone.slice(bounds); Self { slice } } } pub(crate) trait IoBufExt { /// Get a [`FullSlice`] for the entire buffer, i.e., `self[..]` or `self[0..self.len()]`. fn slice_len(self) -> FullSlice where Self: Sized; } macro_rules! impl_io_buf_ext { ($T:ty) => { impl IoBufExt for $T { #[inline(always)] fn slice_len(self) -> FullSlice { let len = self.len(); let s = if len == 0 { // `BoundedBuf::slice(0..len)` or `BoundedBuf::slice(..)` has an incorrect assertion, // causing a panic if len == 0. // The Slice::from_buf_bounds has the correct assertion (<= instead of <). 
// => https://github.com/neondatabase/tokio-epoll-uring/issues/46 let slice = self.slice_full(); let mut bounds: Range<_> = slice.bounds(); bounds.end = bounds.start; Slice::from_buf_bounds(slice.into_inner(), bounds) } else { self.slice(0..len) }; FullSlice::must_new(s) } } }; } impl_io_buf_ext!(Bytes); impl_io_buf_ext!(BytesMut); impl_io_buf_ext!(Vec); impl_io_buf_ext!(IoBufferMut); impl_io_buf_ext!(IoBuffer); ================================================ FILE: pageserver/src/virtual_file/owned_buffers_io/slice.rs ================================================ use tokio_epoll_uring::{BoundedBuf, BoundedBufMut, IoBufMut, Slice}; pub(crate) trait SliceMutExt { /// Get a `&mut[0..self.bytes_total()`] slice, for when you need to do borrow-based IO. /// /// See the test case `test_slice_full_zeroed` for the difference to just doing `&slice[..]` fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8]; } impl SliceMutExt for Slice where B: IoBufMut, { #[inline(always)] fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8] { // zero-initialize the uninitialized parts of the buffer so we can create a Rust slice // // SAFETY: we own `slice`, don't write outside the bounds unsafe { let to_init = self.bytes_total() - self.bytes_init(); self.stable_mut_ptr() .add(self.bytes_init()) .write_bytes(0, to_init); self.set_init(self.bytes_total()); }; let bytes_total = self.bytes_total(); &mut self[0..bytes_total] } } #[cfg(test)] mod tests { use std::io::Read; use bytes::Buf; use tokio_epoll_uring::Slice; use super::*; #[test] fn test_slice_full_zeroed() { let make_fake_file = || bytes::BytesMut::from(&b"12345"[..]).reader(); // before we start the test, let's make sure we have a shared understanding of what slice_full does { let buf = Vec::with_capacity(3); let slice: Slice<_> = buf.slice_full(); assert_eq!(slice.bytes_init(), 0); assert_eq!(slice.bytes_total(), 3); let rust_slice = &slice[..]; assert_eq!( rust_slice.len(), 0, "Slice only derefs to a &[u8] of the 
initialized part" ); } // and also let's establish a shared understanding of .slice() { let buf = Vec::with_capacity(3); let slice: Slice<_> = buf.slice(0..2); assert_eq!(slice.bytes_init(), 0); assert_eq!(slice.bytes_total(), 2); let rust_slice = &slice[..]; assert_eq!( rust_slice.len(), 0, "Slice only derefs to a &[u8] of the initialized part" ); } // the above leads to the easy mistake of using slice[..] for borrow-based IO like so: { let buf = Vec::with_capacity(3); let mut slice: Slice<_> = buf.slice_full(); assert_eq!(slice[..].len(), 0); let mut file = make_fake_file(); file.read_exact(&mut slice[..]).unwrap(); // one might think this reads 3 bytes but it reads 0 assert_eq!(&slice[..] as &[u8], &[][..] as &[u8]); } // With owned buffers IO like with VirtualFilem, you could totally // pass in a `Slice` with bytes_init()=0 but bytes_total()=5 // and it will read 5 bytes into the slice, and return a slice that has bytes_init()=5. { // TODO: demo } // // Ok, now that we have a shared understanding let's demo how to use the extension trait. // // slice_full() { let buf = Vec::with_capacity(3); let mut slice: Slice<_> = buf.slice_full(); let rust_slice = slice.as_mut_rust_slice_full_zeroed(); assert_eq!(rust_slice.len(), 3); assert_eq!(rust_slice, &[0, 0, 0]); let mut file = make_fake_file(); file.read_exact(rust_slice).unwrap(); assert_eq!(rust_slice, b"123"); assert_eq!(&slice[..], b"123"); } // .slice(..) 
{ let buf = Vec::with_capacity(3); let mut slice: Slice<_> = buf.slice(0..2); let rust_slice = slice.as_mut_rust_slice_full_zeroed(); assert_eq!(rust_slice.len(), 2); assert_eq!(rust_slice, &[0, 0]); let mut file = make_fake_file(); file.read_exact(rust_slice).unwrap(); assert_eq!(rust_slice, b"12"); assert_eq!(&slice[..], b"12"); } } } ================================================ FILE: pageserver/src/virtual_file/owned_buffers_io/write/flush.rs ================================================ use std::ops::ControlFlow; use tokio_util::sync::CancellationToken; use tracing::{Instrument, info_span, warn}; use utils::sync::duplex; use super::{Buffer, CheapCloneForRead, OwnedAsyncWriter}; use crate::context::RequestContext; use crate::virtual_file::MaybeFatalIo; use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAligned; use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice; /// A handle to the flush task. pub struct FlushHandle { inner: Option>, } pub struct FlushHandleInner { /// A bi-directional channel that sends requests (a flush of (buffer, offset), or a shutdown), /// and receives recycled buffers. channel: duplex::mpsc::Duplex, FullSlice>, /// Join handle for the background flush task. join_handle: tokio::task::JoinHandle>, } struct FlushRequest { slice: FullSlice, offset: u64, #[cfg(test)] ready_to_flush_rx: Option>, #[cfg(test)] done_flush_tx: Option>, } pub struct ShutdownRequest { pub set_len: Option, } enum Request { Flush(FlushRequest), Shutdown(ShutdownRequest), } impl Request { fn op_str(&self) -> &'static str { match self { Request::Flush(_) => "flush", Request::Shutdown(_) => "shutdown", } } } /// Constructs a request and a control object for a new flush operation. #[cfg(not(test))] fn new_flush_op(slice: FullSlice, offset: u64) -> (FlushRequest, FlushControl) { let request = FlushRequest { slice, offset }; let control = FlushControl::untracked(); (request, control) } /// Constructs a request and a control object for a new flush operation.
#[cfg(test)] fn new_flush_op(slice: FullSlice, offset: u64) -> (FlushRequest, FlushControl) { let (ready_to_flush_tx, ready_to_flush_rx) = tokio::sync::oneshot::channel(); let (done_flush_tx, done_flush_rx) = tokio::sync::oneshot::channel(); let control = FlushControl::not_started(ready_to_flush_tx, done_flush_rx); let request = FlushRequest { slice, offset, ready_to_flush_rx: Some(ready_to_flush_rx), done_flush_tx: Some(done_flush_tx), }; (request, control) } /// A handle to a `FlushRequest` that allows unit tests precise control over flush behavior. #[cfg(test)] pub(crate) struct FlushControl { not_started: FlushNotStarted, } #[cfg(not(test))] pub(crate) struct FlushControl; impl FlushControl { #[cfg(test)] fn not_started( ready_to_flush_tx: tokio::sync::oneshot::Sender<()>, done_flush_rx: tokio::sync::oneshot::Receiver<()>, ) -> Self { FlushControl { not_started: FlushNotStarted { ready_to_flush_tx, done_flush_rx, }, } } #[cfg(not(test))] fn untracked() -> Self { FlushControl } /// In tests, turn flush control into a not started state. #[cfg(test)] pub(crate) fn into_not_started(self) -> FlushNotStarted { self.not_started } /// Release control to the submitted buffer. /// /// In `cfg(test)` environment, the buffer is guaranteed to be flushed to disk after [`FlushControl::release`] finishes execution. /// Outside of `cfg(test)` the body is empty, so this is a no-op. pub async fn release(self) { #[cfg(test)] { self.not_started .ready_to_flush() .wait_until_flush_is_done() .await; } } } impl FlushHandle where Buf: IoBufAligned + Send + Sync + CheapCloneForRead, W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug, { /// Spawns a new background flush task and obtains a handle. /// /// Handle and background task are connected through a duplex channel. /// Dirty buffers are sent to the background task for flushing. /// Clean buffers are sent back to the handle for reuse. /// /// The queue depth is 1, and the passed-in `buf` seeds the queue depth.
/// I.e., the passed-in buf is immediately available to the handle as a recycled buffer.
    pub fn spawn_new(
        file: W,
        buf: B,
        gate_guard: utils::sync::gate::GateGuard,
        cancel: CancellationToken,
        ctx: RequestContext,
        span: tracing::Span,
    ) -> Self
    where
        B: Buffer + Send + 'static,
    {
        let (front, back) = duplex::mpsc::channel(1);
        // Seed the task->handle direction with `buf` so that the first `flush` call
        // finds a buffer to receive.
        back.try_send(buf.flush())
            .expect("we just created it with capacity 1");

        let join_handle = tokio::spawn(
            FlushBackgroundTask::new(back, file, gate_guard, cancel, ctx)
                .run()
                .instrument(span),
        );

        FlushHandle {
            inner: Some(FlushHandleInner {
                channel: front,
                join_handle,
            }),
        }
    }

    /// Submits a buffer to be flushed in the background task.
    ///
    /// Returns the next buffer the background task has handed back for re-use, together
    /// with the [`FlushControl`] of the request that was just submitted.
    /// The returned buffer is passed through as received from the channel; this function
    /// does not reset it.
    ///
    /// If the channel to the flush task is disconnected, the task is joined and its error
    /// is returned; the handle must not be used afterwards.
    pub async fn flush(
        &mut self,
        slice: FullSlice,
        offset: u64,
    ) -> Result<(FullSlice, FlushControl), FlushTaskError> {
        let (request, flush_control) = new_flush_op(slice, offset);

        // Submits the buffer to the background task.
        self.send(Request::Flush(request)).await?;

        // Wait for an available buffer from the background flush task.
        // This is the BACKPRESSURE mechanism: if the flush task can't keep up,
        // then the write path will eventually wait for it here.
        let Some(recycled) = self.inner_mut().channel.recv().await else {
            return self.handle_error().await;
        };

        Ok((recycled, flush_control))
    }

    /// Sends poison pill to flush task and waits for it to exit.
pub async fn shutdown(&mut self, req: ShutdownRequest) -> Result {
        self.send(Request::Shutdown(req)).await?;
        self.wait().await
    }

    /// Sends `request` to the flush task.
    /// If the channel is disconnected, joins the task and returns its error.
    async fn send(&mut self, request: Request) -> Result<(), FlushTaskError> {
        let submit = self.inner_mut().channel.send(request).await;
        if submit.is_err() {
            return self.handle_error().await;
        }
        Ok(())
    }

    /// Joins the flush task after a channel disconnect and returns the error it exited with.
    /// Panics if the task exited without an error.
    async fn handle_error(&mut self) -> Result {
        Err(self
            .wait()
            .await
            .expect_err("flush task only disconnects duplex if it exits with an error"))
    }

    /// Takes the inner handle, closes our sending half of the channel and waits for the
    /// flush task to exit, returning the task's result.
    ///
    /// Panics if the inner handle was already taken, or if joining the task fails.
    async fn wait(&mut self) -> Result {
        let handle = self
            .inner
            .take()
            .expect("must not use after we returned an error");
        // Dropping the sender lets the task's `recv()` loop observe the end of the channel.
        drop(handle.channel.tx);
        handle.join_handle.await.unwrap()
    }

    /// Gets a mutable reference to the inner handle. Panics if [`Self::inner`] is `None`.
    /// This only happens if the handle is used after an error or after the task was joined.
    fn inner_mut(&mut self) -> &mut FlushHandleInner {
        self.inner
            .as_mut()
            .expect("must not use after we returned an error")
    }
}

/// A background task for flushing data to disk.
pub struct FlushBackgroundTask {
    /// A bi-directional channel that receives (buffer, offset) for writes,
    /// and sends back recycled buffers.
    channel: duplex::mpsc::Duplex, Request>,
    /// A writer for persisting data to disk.
    writer: W,
    /// Request context passed to the writer's IO calls.
    ctx: RequestContext,
    /// Checked before every attempt; when cancelled, the task exits with [`FlushTaskError::Cancelled`].
    cancel: CancellationToken,
    /// Prevent timeline from shutting down until the flush background task finishes flushing all remaining buffers to disk.
    _gate_guard: utils::sync::gate::GateGuard,
}

/// The error type of the flush task.
#[derive(Debug, thiserror::Error)]
pub enum FlushTaskError {
    #[error("flush task cancelled")]
    Cancelled,
}

impl FlushTaskError {
    /// Whether this error is a cancellation. Currently true for every variant.
    pub fn is_cancel(&self) -> bool {
        match self {
            FlushTaskError::Cancelled => true,
        }
    }

    /// Converts the error into an [`anyhow::Error`].
    pub fn into_anyhow(self) -> anyhow::Error {
        match self {
            FlushTaskError::Cancelled => anyhow::anyhow!(self),
        }
    }
}

impl FlushBackgroundTask
where
    Buf: IoBufAligned + Send + Sync,
    W: OwnedAsyncWriter + Sync + 'static,
{
    /// Creates a new background flush task.
fn new(
        channel: duplex::mpsc::Duplex, Request>,
        file: W,
        gate_guard: utils::sync::gate::GateGuard,
        cancel: CancellationToken,
        ctx: RequestContext,
    ) -> Self {
        FlushBackgroundTask {
            channel,
            writer: file,
            _gate_guard: gate_guard,
            cancel,
            ctx,
        }
    }

    /// Runs the background flush task.
    ///
    /// Processes requests from the channel until it is closed, then returns the writer.
    /// Each request is retried in place with exponential backoff until it succeeds.
    /// Returns [`FlushTaskError::Cancelled`] if cancellation is observed before an attempt.
    async fn run(mut self) -> Result {
        // Exit condition: channel is closed and there is no remaining buffer to be flushed
        while let Some(request) = self.channel.recv().await {
            let op_kind = request.op_str();

            // Perform the requested operation.
            //
            // Error handling happens according to the current policy of crashing
            // on fatal IO errors and retrying in place otherwise (deeming all other errors retryable).
            // (The upper layers of the Pageserver write path are not equipped to retry write errors
            //  because they often deallocate the buffers that were already written).
            //
            // TODO: use utils::backoff::retry once async closures are actually usable
            //
            // `request_storage` holds the request between attempts: each attempt takes it out,
            // and the IO future puts it back once the IO call has returned the buffer.
            let mut request_storage = Some(request);
            for attempt in 1.. {
                if self.cancel.is_cancelled() {
                    return Err(FlushTaskError::Cancelled);
                }
                let result = async {
                    let request: Request = request_storage
                        .take().expect(
                            "likely previous invocation of this future didn't get polled to completion",
                        );
                    match &request {
                        // Shutdown without set_len requires no IO: put the request back and stop.
                        Request::Shutdown(ShutdownRequest { set_len: None }) => {
                            request_storage = Some(request);
                            return ControlFlow::Break(());
                        },
                        Request::Flush(_) | Request::Shutdown(ShutdownRequest { set_len: Some(_) }) => {
                        },
                    }
                    if attempt > 1 {
                        warn!(op=%request.op_str(), "retrying");
                    }
                    // borrows so we can async move the requests into async block while not moving these borrows here
                    let writer = &self.writer;
                    let request_storage = &mut request_storage;
                    let ctx = &self.ctx;
                    let io_fut = match request {
                        Request::Flush(FlushRequest { slice, offset, #[cfg(test)] ready_to_flush_rx, #[cfg(test)] done_flush_tx }) => futures::future::Either::Left(async move {
                            #[cfg(test)]
                            if let Some(ready_to_flush_rx) = ready_to_flush_rx {
                                {
                                    // In test, wait for control to signal that we are ready to flush.
                                    if ready_to_flush_rx.await.is_err() {
                                        tracing::debug!("control dropped");
                                    }
                                }
                            }
                            let (slice, res) = writer.write_all_at(slice, offset, ctx).await;
                            // Store the request again so that a retry (or the code after the loop) can use it.
                            *request_storage = Some(Request::Flush(FlushRequest {
                                slice,
                                offset,
                                #[cfg(test)]
                                ready_to_flush_rx: None, // the contract is that we notify before first attempt
                                #[cfg(test)]
                                done_flush_tx
                            }));
                            res
                        }),
                        Request::Shutdown(ShutdownRequest { set_len }) => futures::future::Either::Right(async move {
                            let set_len = set_len.expect("we filter out the None case above");
                            let res = writer.set_len(set_len, ctx).await;
                            *request_storage = Some(Request::Shutdown(ShutdownRequest {
                                set_len: Some(set_len),
                            }));
                            res
                        }),
                    };
                    // Don't cancel the io_fut by doing tokio::select with self.cancel.cancelled().
                    // The underlying tokio-epoll-uring slot / kernel operation is still ongoing and occupies resources.
                    // If we retry indefinitely, we'll deplete those resources.
                    // Future: teach tokio-epoll-uring io_uring operation cancellation, but still,
                    // wait for cancelled ops to complete and discard their error.
                    let res = io_fut.await;
                    let res = res.maybe_fatal_err("owned_buffers_io flush");
                    let Err(err) = res else {
                        if attempt > 1 {
                            warn!(op=%op_kind, "retry succeeded");
                        }
                        return ControlFlow::Break(());
                    };
                    warn!(%err, "error flushing buffered writer buffer to disk, retrying after backoff");
                    utils::backoff::exponential_backoff(attempt, 1.0, 10.0, &self.cancel).await;
                    ControlFlow::Continue(())
                }
                .instrument(info_span!("attempt", %attempt, %op_kind))
                .await;
                match result {
                    ControlFlow::Break(()) => break,
                    ControlFlow::Continue(()) => continue,
                }
            }

            let request = request_storage.expect("loop must have run at least once");

            let slice = match request {
                Request::Flush(FlushRequest { slice, #[cfg(test)] mut done_flush_tx, .. }) => {
                    #[cfg(test)]
                    {
                        // In test, tell control we are done flushing buffer.
if done_flush_tx.take().expect("always Some").send(()).is_err() {
                            tracing::debug!("control dropped");
                        }
                    }
                    slice
                }
                Request::Shutdown(_) => {
                    // next iteration will observe recv() returning None
                    continue;
                }
            };

            // Sends the buffer back to the handle for reuse. The handle is in charge of cleaning the buffer.
            let send_res = self.channel.send(slice).await;
            if send_res.is_err() {
                // Although the channel is closed, we still need to finish flushing the remaining buffers.
                continue;
            }
        }

        Ok(self.writer)
    }
}

/// Test-only state: the flush request has been submitted but not yet allowed to proceed.
#[cfg(test)]
pub(crate) struct FlushNotStarted {
    ready_to_flush_tx: tokio::sync::oneshot::Sender<()>,
    done_flush_rx: tokio::sync::oneshot::Receiver<()>,
}

/// Test-only state: the flush task has been told to proceed.
#[cfg(test)]
pub(crate) struct FlushInProgress {
    done_flush_rx: tokio::sync::oneshot::Receiver<()>,
}

/// Test-only state: the flush task has signalled completion.
#[cfg(test)]
pub(crate) struct FlushDone;

#[cfg(test)]
impl FlushNotStarted {
    /// Signals the background task the buffer is ready to flush to disk.
    /// Panics if the signal cannot be sent.
    pub fn ready_to_flush(self) -> FlushInProgress {
        self.ready_to_flush_tx
            .send(())
            .map(|_| FlushInProgress {
                done_flush_rx: self.done_flush_rx,
            })
            .unwrap()
    }
}

#[cfg(test)]
impl FlushInProgress {
    /// Waits until background flush is done.
    /// Panics if the completion signal cannot be received.
    pub async fn wait_until_flush_is_done(self) -> FlushDone {
        self.done_flush_rx.await.unwrap();
        FlushDone
    }
}

================================================
FILE: pageserver/src/virtual_file/owned_buffers_io/write.rs
================================================
mod flush;

use bytes::BufMut;
pub(crate) use flush::FlushControl;
use flush::FlushHandle;
pub(crate) use flush::FlushTaskError;
use flush::ShutdownRequest;
use tokio_epoll_uring::IoBuf;
use tokio_util::sync::CancellationToken;
use tracing::trace;

use super::io_buf_aligned::IoBufAligned;
use super::io_buf_aligned::IoBufAlignedMut;
use super::io_buf_ext::{FullSlice, IoBufExt};
use crate::context::RequestContext;
use crate::virtual_file::UsizeIsU64;
use crate::virtual_file::{IoBuffer, IoBufferMut};

/// Buffers that can be cloned cheaply so that a copy can be kept for reads.
pub(crate) trait CheapCloneForRead {
    /// Returns a cheap clone of the buffer.
fn cheap_clone(&self) -> Self;
}

impl CheapCloneForRead for IoBuffer {
    fn cheap_clone(&self) -> Self {
        // Cheap clone over an `Arc`.
        self.clone()
    }
}

/// A trait for doing owned-buffer write IO.
/// Think [`tokio::io::AsyncWrite`] but with owned buffers.
/// The owned buffers need to be aligned due to Direct IO requirements.
pub trait OwnedAsyncWriter {
    /// Writes all of `buf` at `offset`.
    /// The buffer is handed back to the caller together with the IO result.
    fn write_all_at(
        &self,
        buf: FullSlice,
        offset: u64,
        ctx: &RequestContext,
    ) -> impl std::future::Future, std::io::Result<()>)> + Send;
    /// Sets the length of the underlying file to `len`.
    fn set_len(
        &self,
        len: u64,
        ctx: &RequestContext,
    ) -> impl Future> + Send;
}

/// A wrapper around an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch
/// small writes into larger writes of size [`Buffer::cap`].
///
/// The buffer is flushed if and only if it is full ([`Buffer::pending`] == [`Buffer::cap`]).
/// This guarantees that writes to the filesystem happen
/// - at offsets that are multiples of [`Buffer::cap`]
///   (provided `start_offset` is itself a multiple of [`Buffer::cap`])
/// - in lengths that are multiples of [`Buffer::cap`]
///
/// Above property is useful for Direct IO, where whatever the
/// effectively dominating disk-sector/filesystem-block/memory-page size
/// determines the requirements on
/// - the alignment of the pointer passed to the read/write operation
/// - the value of `count` (i.e., the length of the read/write operation)
///   which must be a multiple of the dominating sector/block/page size.
///
/// See [`BufferedWriter::shutdown`] / [`BufferedWriterShutdownMode`] for different
/// ways of dealing with the special case that the buffer is not full by the time
/// we are done writing.
///
/// The first flush to the underlying `W` happens at offset `start_offset` (arg of [`BufferedWriter::new`]).
/// The next flush is to offset `start_offset + Buffer::cap`. The one after at `start_offset + 2 * Buffer::cap` and so on.
///
/// TODO: decouple buffer capacity from alignment requirement.
/// Right now we assume [`Buffer::cap`] is the alignment requirement,
/// but actually [`Buffer::cap`] should only determine how often we flush
/// while writing, while a separate alignment requirement argument should
/// be passed to determine alignment requirement. This could be used by
/// [`BufferedWriterShutdownMode::PadThenTruncate`] to avoid excessive
/// padding of zeroes. For example, today, with a capacity of 64KiB, we
/// would pad up to 64KiB-1 bytes of zeroes, then truncate off 64KiB-1.
/// This is wasteful, e.g., if the alignment requirement is 4KiB, we only
/// need to pad & truncate up to 4KiB-1 bytes of zeroes
///
// TODO(yuchen): For large write, implementing buffer bypass for aligned parts of the write could be beneficial to throughput,
// since we would avoid copying majority of the data into the internal buffer.
// https://github.com/neondatabase/neon/issues/10101
pub struct BufferedWriter {
    /// Clone of the buffer that was last submitted to the flush loop.
    /// `None` if no flush request has been submitted, Some forever after.
    pub(super) maybe_flushed: Option>,
    /// New writes are accumulated here.
    /// `None` only during submission while we wait for flush loop to accept
    /// the full dirty buffer in exchange for a clean buffer.
    /// If that exchange fails with an [`FlushTaskError`], the write path
    /// bails and leaves this as `None`.
    /// Subsequent writes will panic if attempted.
    /// The read path continues to work without error because [`Self::maybe_flushed`]
    /// and [`Self::bytes_submitted`] are advanced before the flush loop exchange starts,
    /// so, they will never try to read from [`Self::mutable`] anyway, because it's past
    /// the [`Self::maybe_flushed`] point.
    mutable: Option,
    /// A handle to the background flush task for writing data to disk.
    flush_handle: FlushHandle,
    /// The file offset up to which data has been submitted to the background task.
    /// Starts at `start_offset` and advances by the buffer length on every submitted flush.
    bytes_submitted: u64,
}

/// How [`BufferedWriter::shutdown`] should deal with pending (=not-yet-flushed) data.
///
/// Cf the [`BufferedWriter`] comment's paragraph for context on why we need to think about this.
pub enum BufferedWriterShutdownMode {
    /// Drop pending data, don't write back to file.
    DropTail,
    /// Pad the pending data with zeroes (cf [`usize::next_multiple_of`]).
    /// The padded length must not exceed the buffer capacity.
    ZeroPadToNextMultiple(usize),
    /// Fill the IO buffer with zeroes, flush to disk, then `ftruncate` the
    /// file to the exact number of bytes written to the [`BufferedWriter`].
    ///
    /// TODO: see in [`BufferedWriter`] comment about decoupling buffer capacity from alignment requirement.
    PadThenTruncate,
}

impl BufferedWriter
where
    B: IoBufAlignedMut + Buffer + Send + 'static,
    Buf: IoBufAligned + Send + Sync + CheapCloneForRead,
    W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug,
{
    /// Creates a new buffered writer.
    ///
    /// The `buf_new` function provides a way to initialize the owned buffers used by this writer.
    /// It is called twice: once for the buffer that accumulates writes and once for the
    /// buffer handed to the flush task at spawn time.
    pub fn new(
        writer: W,
        start_offset: u64,
        buf_new: impl Fn() -> B,
        gate_guard: utils::sync::gate::GateGuard,
        cancel: CancellationToken,
        ctx: &RequestContext,
        flush_task_span: tracing::Span,
    ) -> Self {
        Self {
            mutable: Some(buf_new()),
            maybe_flushed: None,
            flush_handle: FlushHandle::spawn_new(
                writer,
                buf_new(),
                gate_guard,
                cancel,
                ctx.attached_child(),
                flush_task_span,
            ),
            bytes_submitted: start_offset,
        }
    }

    /// Returns the offset up to which data has been submitted to the background flush task
    /// (this includes the `start_offset` passed to [`Self::new`]).
    pub fn bytes_submitted(&self) -> u64 {
        self.bytes_submitted
    }

    /// Gets a reference to the buffer that accumulates new writes.
    /// Returns `None` if the buffer is currently taken out, e.g. after any of the
    /// write paths returned an error.
    pub fn inspect_mutable(&self) -> Option<&B> {
        self.mutable.as_ref()
    }

    /// Gets a reference to the maybe flushed read-only buffer.
    /// Returns `None` if the writer has not submitted any flush request.
pub fn inspect_maybe_flushed(&self) -> Option<&FullSlice> {
        self.maybe_flushed.as_ref()
    }

    /// Deals with the pending data according to `mode`, stops the flush task and
    /// returns the final length together with the underlying writer.
    ///
    /// Panics if the writer was used after a write path returned an error, or if
    /// `ZeroPadToNextMultiple` would pad beyond the buffer capacity.
    #[cfg_attr(target_os = "macos", allow(dead_code))]
    pub async fn shutdown(
        mut self,
        mode: BufferedWriterShutdownMode,
        ctx: &RequestContext,
    ) -> Result<(u64, W), FlushTaskError> {
        let mut mutable = self.mutable.take().expect("must not use after an error");
        let unpadded_pending = mutable.pending();
        let final_len: u64;
        let shutdown_req;
        match mode {
            BufferedWriterShutdownMode::DropTail => {
                trace!(pending=%mutable.pending(), "dropping pending data");
                drop(mutable);

                // `self.mutable` stays `None`, so no final flush is issued below.
                final_len = self.bytes_submitted;
                shutdown_req = ShutdownRequest { set_len: None };
            }
            BufferedWriterShutdownMode::ZeroPadToNextMultiple(next_multiple) => {
                let len = mutable.pending();
                let cap = mutable.cap();
                assert!(
                    len <= cap,
                    "buffer impl ensures this, but let's check because the extend_with below would panic if we go beyond"
                );
                let padded_len = len.next_multiple_of(next_multiple);
                assert!(
                    padded_len <= cap,
                    "caller specified a multiple that is larger than the buffer capacity"
                );
                let count = padded_len - len;
                mutable.extend_with(0, count);
                trace!(count, "padding with zeros");
                self.mutable = Some(mutable);

                // The padding is part of the final length in this mode.
                final_len = self.bytes_submitted + padded_len.into_u64();
                shutdown_req = ShutdownRequest { set_len: None };
            }
            BufferedWriterShutdownMode::PadThenTruncate => {
                let len = mutable.pending();
                let cap = mutable.cap();
                // TODO: see struct comment TODO on decoupling buffer capacity from alignment requirement.
let alignment_requirement = cap;
                assert!(len <= cap, "buffer impl should ensure this");
                let padding_end_offset = len.next_multiple_of(alignment_requirement);
                assert!(
                    padding_end_offset <= cap,
                    "{padding_end_offset} <= {cap} ({alignment_requirement})"
                );
                let count = padding_end_offset - len;
                mutable.extend_with(0, count);
                trace!(count, "padding with zeros");
                self.mutable = Some(mutable);

                // The padding is not part of the final length in this mode; it is truncated off.
                final_len = self.bytes_submitted + len.into_u64();
                shutdown_req = ShutdownRequest {
                    // Avoid set_len call if we didn't need to pad anything.
                    set_len: if count > 0 { Some(final_len) } else { None },
                };
            }
        };
        let padded_pending = self.mutable.as_ref().map(|b| b.pending());
        trace!(unpadded_pending, padded_pending, "padding done");
        // Submit whatever is left in the buffer (if the mode kept one) before shutting down.
        if self.mutable.is_some() {
            self.flush(ctx).await?;
        }
        let Self {
            mutable: _,
            maybe_flushed: _,
            mut flush_handle,
            bytes_submitted: _,
        } = self;
        let writer = flush_handle.shutdown(shutdown_req).await?;

        Ok((final_len, writer))
    }

    /// Test-only accessor for the buffer that accumulates new writes.
    /// Panics if the buffer is currently taken out.
    #[cfg(test)]
    pub(crate) fn mutable(&self) -> &B {
        self.mutable.as_ref().expect("must not use after an error")
    }

    /// Copies `chunk` into the internal buffer, flushing whenever the buffer becomes full,
    /// and releases the flush control of the last flush (if any) before returning.
    #[cfg_attr(target_os = "macos", allow(dead_code))]
    pub async fn write_buffered_borrowed(
        &mut self,
        chunk: &[u8],
        ctx: &RequestContext,
    ) -> Result {
        let (len, control) = self.write_buffered_borrowed_controlled(chunk, ctx).await?;
        if let Some(control) = control {
            control.release().await;
        }
        Ok(len)
    }

    /// In addition to bytes submitted in this write, also returns a handle that can control the flush behavior.
pub(crate) async fn write_buffered_borrowed_controlled(
        &mut self,
        mut chunk: &[u8],
        ctx: &RequestContext,
    ) -> Result<(usize, Option), FlushTaskError> {
        let chunk_len = chunk.len();
        let mut control: Option = None;
        while !chunk.is_empty() {
            let buf = self.mutable.as_mut().expect("must not use after an error");
            // Copy as much of the chunk as still fits into the buffer.
            let need = buf.cap() - buf.pending();
            let have = chunk.len();
            let n = std::cmp::min(need, have);
            buf.extend_from_slice(&chunk[..n]);
            chunk = &chunk[n..];
            if buf.pending() >= buf.cap() {
                assert_eq!(buf.pending(), buf.cap());
                // Only the control of the last flush is returned to the caller;
                // controls of earlier flushes are released here.
                if let Some(control) = control.take() {
                    control.release().await;
                }
                control = self.flush(ctx).await?;
            }
        }
        Ok((chunk_len, control))
    }

    /// Submits the buffered data to the flush task and swaps in a recycled buffer.
    /// Returns `Ok(None)` without submitting anything if the buffer is empty.
    ///
    /// This function can only error if the flush task got cancelled.
    /// In that case, we leave [`Self::mutable`] intentionally as `None`.
    ///
    /// The read path continues to function correctly; it can read up to the
    /// point where it could read before, i.e., including what was in [`Self::mutable`]
    /// before the call to this function, because that's now stored in [`Self::maybe_flushed`].
    ///
    /// The write path becomes unavailable and will panic if used.
    /// The only correct solution to retry writes is to discard the entire [`BufferedWriter`],
    /// which upper layers of pageserver write path currently do not support.
    /// It is in fact quite hard to reason about what exactly happens in today's code.
    /// Best case we accumulate junk in the EphemeralFile, worst case is data corruption.
    #[must_use = "caller must explcitly check the flush control"]
    async fn flush(
        &mut self,
        _ctx: &RequestContext,
    ) -> Result, FlushTaskError> {
        let buf = self.mutable.take().expect("must not use after an error");
        let buf_len = buf.pending();
        if buf_len == 0 {
            self.mutable = Some(buf);
            return Ok(None);
        }
        // Prepare the buffer for read while flushing.
        let slice = buf.flush();
        // NB: this assignment also drops the reference to the old buffer, allowing us to re-own & make it mutable below.
self.maybe_flushed = Some(slice.cheap_clone());
        let offset = self.bytes_submitted;
        self.bytes_submitted += u64::try_from(buf_len).unwrap();

        // If we return/panic here or later, we'll leave mutable = None, breaking further
        // writers, but the read path should still work.
        let (recycled, flush_control) = self.flush_handle.flush(slice, offset).await?;

        // The only other place that could hold a reference to the recycled buffer
        // is in `Self::maybe_flushed`, but we have already replaced it with the new buffer.
        let recycled = Buffer::reuse_after_flush(recycled.into_raw_slice().into_inner());

        // We got back some recycled buffer, can open up for more writes again.
        self.mutable = Some(recycled);

        Ok(Some(flush_control))
    }
}

/// A [`Buffer`] is used by [`BufferedWriter`] to batch smaller writes into larger ones.
pub trait Buffer {
    /// The buffer type that [`Self::flush`] produces and [`Self::reuse_after_flush`] consumes.
    type IoBuf: IoBuf;

    /// Capacity of the buffer. Must not change over the lifetime of `self`.
    fn cap(&self) -> usize;

    /// Add data to the buffer.
    /// Panics if there is not enough room to accommodate `other`'s content, i.e.,
    /// panics if `other.len() > self.cap() - self.pending()`.
    fn extend_from_slice(&mut self, other: &[u8]);

    /// Add `count` bytes `val` into `self`.
    /// Panics if `count > self.cap() - self.pending()`.
    fn extend_with(&mut self, val: u8, count: usize);

    /// Number of bytes in the buffer.
    fn pending(&self) -> usize;

    /// Turns `self` into a [`FullSlice`] of the pending data
    /// so we can use [`tokio_epoll_uring`] to write it to disk.
    fn flush(self) -> FullSlice;

    /// After the write to disk is done and we have gotten back the slice,
    /// [`BufferedWriter`] uses this method to re-use the io buffer.
fn reuse_after_flush(iobuf: Self::IoBuf) -> Self;
}

impl Buffer for IoBufferMut {
    type IoBuf = IoBuffer;

    fn cap(&self) -> usize {
        self.capacity()
    }

    fn extend_from_slice(&mut self, other: &[u8]) {
        // Enforce the trait contract explicitly instead of relying on the inherent method.
        if self.len() + other.len() > self.cap() {
            panic!("Buffer capacity exceeded");
        }

        IoBufferMut::extend_from_slice(self, other);
    }

    fn extend_with(&mut self, val: u8, count: usize) {
        // Enforce the trait contract explicitly instead of relying on `put_bytes`.
        if self.len() + count > self.cap() {
            panic!("Buffer capacity exceeded");
        }

        IoBufferMut::put_bytes(self, val, count);
    }

    fn pending(&self) -> usize {
        self.len()
    }

    fn flush(self) -> FullSlice {
        self.freeze().slice_len()
    }

    /// Caller should make sure that `iobuf` only has one strong reference before invoking this method.
    /// Panics otherwise. The returned buffer is cleared.
    fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
        let mut recycled = iobuf
            .into_mut()
            .expect("buffer should only have one strong reference");
        recycled.clear();
        recycled
    }
}

#[cfg(test)]
mod tests {
    use std::sync::Mutex;

    use rstest::rstest;

    use super::*;
    use crate::context::{DownloadBehavior, RequestContext};
    use crate::task_mgr::TaskKind;

    /// An operation observed by [`RecorderWriter`].
    #[derive(Debug, PartialEq, Eq)]
    enum Op {
        Write { buf: Vec, offset: u64 },
        SetLen { len: u64 },
    }

    /// An [`OwnedAsyncWriter`] that records the operations it is asked to perform.
    #[derive(Default, Debug)]
    struct RecorderWriter {
        /// record bytes and write offsets.
recording: Mutex>,
    }

    impl OwnedAsyncWriter for RecorderWriter {
        /// Records the written bytes and offset; always succeeds.
        async fn write_all_at(
            &self,
            buf: FullSlice,
            offset: u64,
            _: &RequestContext,
        ) -> (FullSlice, std::io::Result<()>) {
            self.recording.lock().unwrap().push(Op::Write {
                buf: Vec::from(&buf[..]),
                offset,
            });
            (buf, Ok(()))
        }
        /// Records the requested length; always succeeds.
        async fn set_len(&self, len: u64, _ctx: &RequestContext) -> std::io::Result<()> {
            self.recording.lock().unwrap().push(Op::SetLen { len });
            Ok(())
        }
    }

    fn test_ctx() -> RequestContext {
        RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
    }

    /// Writes 13 bytes through a writer with capacity 4 and checks the recorded
    /// operations for each shutdown mode.
    #[rstest]
    #[tokio::test]
    async fn test_write_all_borrowed_always_goes_through_buffer(
        #[values(
            BufferedWriterShutdownMode::DropTail,
            BufferedWriterShutdownMode::ZeroPadToNextMultiple(2),
            BufferedWriterShutdownMode::PadThenTruncate
        )]
        mode: BufferedWriterShutdownMode,
    ) -> anyhow::Result<()> {
        let ctx = test_ctx();
        let ctx = &ctx;
        let recorder = RecorderWriter::default();
        let gate = utils::sync::gate::Gate::default();
        let cancel = CancellationToken::new();
        let cap = 4;
        let mut writer = BufferedWriter::<_, RecorderWriter>::new(
            recorder,
            0,
            || IoBufferMut::with_capacity(cap),
            gate.enter()?,
            cancel,
            ctx,
            tracing::Span::none(),
        );

        writer.write_buffered_borrowed(b"abc", ctx).await?;
        writer.write_buffered_borrowed(b"", ctx).await?;
        writer.write_buffered_borrowed(b"d", ctx).await?;
        writer.write_buffered_borrowed(b"efg", ctx).await?;
        writer.write_buffered_borrowed(b"hijklm", ctx).await?;

        // The three full buffers are expected regardless of the shutdown mode;
        // the trailing "m" is handled differently per mode below.
        let mut expect = {
            [(0, b"abcd"), (4, b"efgh"), (8, b"ijkl")]
                .into_iter()
                .map(|(offset, v)| Op::Write {
                    offset,
                    buf: v[..].to_vec(),
                })
                .collect::>()
        };
        let expect_next_offset = 12;

        match &mode {
            BufferedWriterShutdownMode::DropTail => (),
            // We test the case with padding to next multiple of 2 so that it's different
            // from the alignment requirement of 4 inferred from buffer capacity.
            // See TODOs in the `BufferedWriter` struct comment on decoupling buffer capacity from alignment requirement.
BufferedWriterShutdownMode::ZeroPadToNextMultiple(2) => {
                expect.push(Op::Write {
                    offset: expect_next_offset,
                    // it's legitimate for pad-to-next multiple 2 to be < alignment requirement 4 inferred from buffer capacity
                    buf: b"m\0".to_vec(),
                });
            }
            BufferedWriterShutdownMode::ZeroPadToNextMultiple(_) => unimplemented!(),
            BufferedWriterShutdownMode::PadThenTruncate => {
                // The tail is padded to the full capacity, then the file is truncated
                // back to the 13 bytes that were actually written.
                expect.push(Op::Write {
                    offset: expect_next_offset,
                    buf: b"m\0\0\0".to_vec(),
                });
                expect.push(Op::SetLen { len: 13 });
            }
        }

        let (_, recorder) = writer.shutdown(mode, ctx).await?;
        assert_eq!(&*recorder.recording.lock().unwrap(), &expect);
        Ok(())
    }

    /// With `PadThenTruncate`, no `set_len` is expected when the written data
    /// already ends on a buffer boundary.
    #[tokio::test]
    async fn test_set_len_is_skipped_if_not_needed() -> anyhow::Result<()> {
        let ctx = test_ctx();
        let ctx = &ctx;
        let recorder = RecorderWriter::default();
        let gate = utils::sync::gate::Gate::default();
        let cancel = CancellationToken::new();
        let cap = 4;
        let mut writer = BufferedWriter::<_, RecorderWriter>::new(
            recorder,
            0,
            || IoBufferMut::with_capacity(cap),
            gate.enter()?,
            cancel,
            ctx,
            tracing::Span::none(),
        );

        // write a multiple of `cap`
        writer.write_buffered_borrowed(b"abc", ctx).await?;
        writer.write_buffered_borrowed(b"defgh", ctx).await?;

        let (_, recorder) = writer
            .shutdown(BufferedWriterShutdownMode::PadThenTruncate, ctx)
            .await?;

        let expect = {
            [(0, b"abcd"), (4, b"efgh")]
                .into_iter()
                .map(|(offset, v)| Op::Write {
                    offset,
                    buf: v[..].to_vec(),
                })
                .collect::>()
        };

        assert_eq!(
            &*recorder.recording.lock().unwrap(),
            &expect,
            "set_len should not be called if the buffer is already aligned"
        );

        Ok(())
    }
}

================================================
FILE: pageserver/src/virtual_file/temporary.rs
================================================
use tracing::error;
use utils::sync::gate::GateGuard;

use crate::context::RequestContext;

use super::{
    MaybeFatalIo, VirtualFile,
    owned_buffers_io::{
        io_buf_aligned::IoBufAligned, io_buf_ext::FullSlice, write::OwnedAsyncWriter,
    },
};

/// A wrapper around [`super::VirtualFile`] that deletes the file on drop.
/// For use as a [`OwnedAsyncWriter`] in [`super::owned_buffers_io::write::BufferedWriter`].
#[derive(Debug)]
pub struct TempVirtualFile {
    /// `None` only after the file was handed out or the wrapper was dropped.
    inner: Option,
}

#[derive(Debug)]
struct Inner {
    file: VirtualFile,
    /// Gate guard is held on as long as we need to do operations in the path (delete on drop)
    _gate_guard: GateGuard,
}

impl OwnedAsyncWriter for TempVirtualFile {
    /// Forwards to [`VirtualFile::write_all_at`] on the wrapped file.
    fn write_all_at(
        &self,
        buf: FullSlice,
        offset: u64,
        ctx: &RequestContext,
    ) -> impl std::future::Future, std::io::Result<()>)> + Send {
        VirtualFile::write_all_at(self, buf, offset, ctx)
    }

    /// Forwards to [`VirtualFile::set_len`] on the wrapped file.
    async fn set_len(&self, len: u64, ctx: &RequestContext) -> std::io::Result<()> {
        VirtualFile::set_len(self, len, ctx).await
    }
}

impl Drop for TempVirtualFile {
    fn drop(&mut self) {
        // Nothing to clean up if the file was already taken out.
        let Some(Inner { file, _gate_guard }) = self.inner.take() else {
            return;
        };
        let path = file.path();
        if let Err(e) =
            std::fs::remove_file(path).maybe_fatal_err("failed to remove the virtual file")
        {
            error!(err=%e, path=%path, "failed to remove");
        }
        // Release the gate guard only after the removal attempt.
        drop(_gate_guard);
    }
}

impl std::ops::Deref for TempVirtualFile {
    type Target = VirtualFile;

    /// Panics if the wrapped file was already taken out.
    fn deref(&self) -> &Self::Target {
        &self
            .inner
            .as_ref()
            .expect("only None after into_inner or drop")
            .file
    }
}

impl std::ops::DerefMut for TempVirtualFile {
    /// Panics if the wrapped file was already taken out.
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self
            .inner
            .as_mut()
            .expect("only None after into_inner or drop")
            .file
    }
}

impl TempVirtualFile {
    /// The caller is responsible for ensuring that the path of `virtual_file` is not reused
    /// until after this TempVirtualFile's `Drop` impl has completed.
    /// Failure to do so will result in unlinking of the reused path by the original instance's Drop impl.
    /// The best way to do so is by using a monotonic counter as a disambiguator.
    /// TODO: centralize this disambiguator pattern inside this struct.
///   =>
    pub fn new(virtual_file: VirtualFile, gate_guard: GateGuard) -> Self {
        Self {
            inner: Some(Inner {
                file: virtual_file,
                _gate_guard: gate_guard,
            }),
        }
    }

    /// Dismantle this wrapper and return the underlying [`VirtualFile`].
    /// This disables auto-unlinking functionality that is the essence of this wrapper.
    ///
    /// The gate guard is dropped as well; it is the caller's responsibility to ensure filesystem
    /// operations after calls to this function are still gated by some other gate guard.
    ///
    /// TODO:
    /// - centralize the common usage pattern of callers (sync_all(self), rename(self, dst), sync_all(dst.parent))
    ///   =>
    pub fn disarm_into_inner(mut self) -> VirtualFile {
        self.inner
            .take()
            .expect("only None after into_inner or drop, and we are into_inner, and we consume")
            .file
    }
}

================================================
FILE: pageserver/src/virtual_file.rs
================================================
//! VirtualFile is like a normal File, but it's not bound directly to
//! a file descriptor.
//!
//! Instead, the file is opened when it's read from,
//! and if too many files are open globally in the system, least-recently
//! used ones are closed.
//!
//! To track which files have been recently used, we use the clock algorithm
//! with a 'recently_used' flag on each slot.
//!
//! This is similar to PostgreSQL's virtual file descriptor facility in
//! src/backend/storage/file/fd.c
//!
use std::fs::File; use std::io::{Error, ErrorKind}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; use std::sync::LazyLock; use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer; use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlign}; use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut}; use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; use self::owned_buffers_io::write::OwnedAsyncWriter; use crate::assert_u64_eq_usize::UsizeIsU64; use crate::context::RequestContext; use crate::metrics::{STORAGE_IO_TIME_METRIC, StorageIoOperation}; use crate::page_cache::{PAGE_SZ, PageWriteGuard}; pub(crate) use api::IoMode; pub(crate) use io_engine::IoEngineKind; pub use io_engine::{ FeatureTestResult as IoEngineFeatureTestResult, feature_test as io_engine_feature_test, io_engine_for_bench, }; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; pub use pageserver_api::models::virtual_file as api; pub use temporary::TempVirtualFile; pub(crate) mod io_engine; mod metadata; mod open_options; mod temporary; pub(crate) mod owned_buffers_io { //! Abstractions for IO with owned buffers. //! //! Not actually tied to [`crate::virtual_file`] specifically, but, it's the primary //! reason we need this abstraction. //! //! Over time, this could move into the `tokio-epoll-uring` crate, maybe `uring-common`, //! but for the time being we're proving out the primitives in the neon.git repo //! for faster iteration. 
pub(crate) mod aligned_buffer;
    pub(crate) mod io_buf_aligned;
    pub(crate) mod io_buf_ext;
    pub(crate) mod slice;
    pub(crate) mod write;
}

// Public wrapper around `VirtualFileInner` that also records the I/O mode chosen when
// the file was opened.
// NOTE(review): generic parameter lists appear to be missing from this copy of the file
// (e.g. `open>(` and `-> Result {`); confirm the signatures against upstream.
#[derive(Debug)]
pub struct VirtualFile {
    inner: VirtualFileInner,
    _mode: IoMode,
}

impl VirtualFile {
    /// Open a file in read-only mode. Like File::open.
    ///
    /// Insensitive to `virtual_file_io_mode` setting.
    pub async fn open>(
        path: P,
        ctx: &RequestContext,
    ) -> Result {
        let inner = VirtualFileInner::open(path, ctx).await?;
        Ok(VirtualFile {
            inner,
            _mode: IoMode::Buffered,
        })
    }

    /// Open a file in read-only mode. Like File::open.
    ///
    /// `O_DIRECT` will be enabled based on `virtual_file_io_mode`.
    pub async fn open_v2>(
        path: P,
        ctx: &RequestContext,
    ) -> Result {
        Self::open_with_options_v2(path.as_ref(), OpenOptions::new().read(true), ctx).await
    }

    /// `O_DIRECT` will be enabled based on `virtual_file_io_mode`.
    pub async fn open_with_options_v2>(
        path: P,
        mut open_options: OpenOptions,
        ctx: &RequestContext,
    ) -> Result {
        let mode = get_io_mode();
        // Decide on direct I/O from the configured mode and whether the file is opened
        // for writing: `Buffered` never uses it, `Direct` uses it only when the options
        // do not request write access, `DirectRw` always uses it.
        let direct = match (mode, open_options.is_write()) {
            (IoMode::Buffered, _) => false,
            (IoMode::Direct, false) => true,
            (IoMode::Direct, true) => false,
            (IoMode::DirectRw, _) => true,
        };
        open_options = open_options.direct(direct);
        let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
        Ok(VirtualFile { inner, _mode: mode })
    }

    /// The path this file was opened with.
    pub fn path(&self) -> &Utf8Path {
        self.inner.path.as_path()
    }

    /// Forwards to [`VirtualFileInner::crashsafe_overwrite`].
    pub async fn crashsafe_overwrite + Send, Buf: IoBuf + Send>(
        final_path: Utf8PathBuf,
        tmp_path: Utf8PathBuf,
        content: B,
    ) -> std::io::Result<()> {
        VirtualFileInner::crashsafe_overwrite(final_path, tmp_path, content).await
    }

    /// Sync the file via the inner file, unless syncing is globally disabled.
    pub async fn sync_all(&self) -> Result<(), Error> {
        // With `SyncMode::UnsafeNoSync` the call is a no-op that reports success.
        if SYNC_MODE.load(std::sync::atomic::Ordering::Relaxed) == SyncMode::UnsafeNoSync as u8 {
            return Ok(());
        }
        self.inner.sync_all().await
    }

    /// Like [`Self::sync_all`], but forwards to the inner `sync_data`.
    pub async fn sync_data(&self) -> Result<(), Error> {
        // With `SyncMode::UnsafeNoSync` the call is a no-op that reports success.
        if SYNC_MODE.load(std::sync::atomic::Ordering::Relaxed) == SyncMode::UnsafeNoSync as u8 {
            return Ok(());
        }
self.inner.sync_data().await
    }

    // The following methods forward to the method of the same name on `VirtualFileInner`.

    pub async fn set_len(&self, len: u64, ctx: &RequestContext) -> Result<(), Error> {
        self.inner.set_len(len, ctx).await
    }

    pub async fn metadata(&self) -> Result {
        self.inner.metadata().await
    }

    pub async fn read_exact_at(
        &self,
        slice: Slice,
        offset: u64,
        ctx: &RequestContext,
    ) -> Result, Error>
    where
        Buf: IoBufAlignedMut + Send,
    {
        self.inner.read_exact_at(slice, offset, ctx).await
    }

    pub async fn read_exact_at_page(
        &self,
        page: PageWriteGuard<'static>,
        offset: u64,
        ctx: &RequestContext,
    ) -> Result, Error> {
        self.inner.read_exact_at_page(page, offset, ctx).await
    }

    pub async fn write_all_at(
        &self,
        buf: FullSlice,
        offset: u64,
        ctx: &RequestContext,
    ) -> (FullSlice, Result<(), Error>) {
        self.inner.write_all_at(buf, offset, ctx).await
    }

    /// Read the whole file at `path` and return its contents as a `String`.
    ///
    /// Fails with `ErrorKind::InvalidData` if the contents are not valid UTF-8.
    pub(crate) async fn read_to_string>(
        path: P,
        ctx: &RequestContext,
    ) -> std::io::Result {
        let file = VirtualFile::open(path, ctx).await?; // TODO: open_v2
        let mut buf = Vec::new();
        // Scratch buffer for one read; its ownership is handed to `read_at` and taken
        // back at the end of every iteration.
        let mut tmp = vec![0; 128];
        let mut pos: u64 = 0;
        loop {
            let slice = tmp.slice(..128);
            let (slice, res) = file.inner.read_at(slice, pos, ctx).await;
            match res {
                // A zero-length read means end of file.
                Ok(0) => break,
                Ok(n) => {
                    pos += n as u64;
                    buf.extend_from_slice(&slice[..n]);
                }
                // Interrupted reads are retried at the same position.
                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
                Err(e) => return Err(e),
            }
            tmp = slice.into_inner();
        }
        String::from_utf8(buf).map_err(|_| {
            std::io::Error::new(ErrorKind::InvalidData, "file contents are not valid UTF-8")
        })
    }
}

/// Indicates whether to enable fsync, fdatasync, or O_SYNC/O_DSYNC when writing
/// files. Switching this off is unsafe and only used for testing on machines
/// with slow drives.
#[repr(u8)]
pub enum SyncMode {
    Sync,
    UnsafeNoSync,
}

// Converts the `u8` representation back into a `SyncMode`; an unknown value is handed
// back unchanged as the error.
impl TryFrom for SyncMode {
    type Error = u8;

    fn try_from(value: u8) -> Result {
        Ok(match value {
            v if v == (SyncMode::Sync as u8) => SyncMode::Sync,
            v if v == (SyncMode::UnsafeNoSync as u8) => SyncMode::UnsafeNoSync,
            x => return Err(x),
        })
    }
}

///
/// A virtual file descriptor.
You can use this just like std::fs::File, but internally
/// the underlying file is closed if the system is low on file descriptors,
/// and re-opened when it's accessed again.
///
/// Like with std::fs::File, multiple threads can read/write the file concurrently,
/// holding just a shared reference to the same VirtualFile, using the read_at() / write_at()
/// functions from the FileExt trait. But the functions from the Read/Write/Seek traits
/// require a mutable reference, because they modify the "current position".
///
/// Each VirtualFile has a physical file descriptor in the global OPEN_FILES array, at the
/// slot that 'handle' points to, if the underlying file is currently open. If it's not
/// currently open, the 'handle' can still point to the slot where it was last kept. The
/// 'tag' field is used to detect whether the handle still is valid or not.
///
// NOTE(review): the type parameter of `RwLock` below appears to be missing from this
// copy of the file; confirm against upstream.
#[derive(Debug)]
pub struct VirtualFileInner {
    /// Lazy handle to the global file descriptor cache. The slot that this points to
    /// might contain our File, or it may be empty, or it may contain a File that
    /// belongs to a different VirtualFile.
    handle: RwLock,

    /// File path and options to use to open it.
    ///
    /// Note: this only contains the options needed to re-open it. For example,
    /// if a new file is created, we only pass the create flag when it's initially
    /// opened, in the VirtualFile::create() function, and strip the flag before
    /// storing it here.
    pub path: Utf8PathBuf,
    open_options: OpenOptions,
}

// Identifies one occupancy of one slot: the slot index plus the tag the slot had when
// this file was stored in it.
#[derive(Debug, PartialEq, Clone, Copy)]
struct SlotHandle {
    /// Index into OPEN_FILES.slots
    index: usize,

    /// Value of 'tag' in the slot. If slot's tag doesn't match, then the slot has
    /// been recycled and no longer contains the FD for this virtual file.
    tag: u64,
}

/// OPEN_FILES is the global array that holds the physical file descriptors that
/// are currently open. Each slot in the array is protected by a separate lock,
/// so that different files can be accessed independently.
The lock must be held
/// in write mode to replace the slot with a different file, but a read mode
/// is enough to operate on the file, whether you're reading or writing to it.
///
/// OPEN_FILES starts in uninitialized state, and it's initialized by
/// the virtual_file::init() function. It must be called exactly once at page
/// server startup.
// NOTE(review): type parameters (`OnceCell`, `RwLock`, `Option` below) appear to be
// missing from this copy of the file; confirm against upstream.
static OPEN_FILES: OnceCell = OnceCell::new();

// The slot array together with the position of the clock hand.
struct OpenFiles {
    slots: &'static [Slot],

    /// clock arm for the clock algorithm
    next: AtomicUsize,
}

// One entry of the descriptor cache.
struct Slot {
    inner: RwLock,

    /// has this file been used since last clock sweep?
    recently_used: AtomicBool,
}

// The lock-protected part of a `Slot`.
struct SlotInner {
    /// Counter that's incremented every time a different file is stored here.
    /// To avoid the ABA problem.
    tag: u64,

    /// the underlying file
    file: Option,
}

/// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`].
struct PageWriteGuardBuf {
    page: PageWriteGuard<'static>,
}

// Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot,
// and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved.
// Page cache pages are zero-initialized, so, wrt uninitialized memory we're good.
// (Page cache tracks separately whether the contents are valid, see `PageWriteGuard::mark_valid`.)
unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf {
    fn stable_ptr(&self) -> *const u8 {
        self.page.as_ptr()
    }
    // The whole page is reported as initialized: `bytes_init` and `bytes_total` both
    // return the page length.
    fn bytes_init(&self) -> usize {
        self.page.len()
    }
    fn bytes_total(&self) -> usize {
        self.page.len()
    }
}

// Safety: see above, plus: the ownership of [`PageWriteGuard`] means exclusive access,
// hence it's safe to hand out the `stable_mut_ptr()`.
unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf {
    fn stable_mut_ptr(&mut self) -> *mut u8 {
        self.page.as_mut_ptr()
    }

    unsafe fn set_init(&mut self, pos: usize) {
        // There shouldn't really be any reason to call this API since bytes_init() == bytes_total().
assert!(pos <= self.page.len());
    }
}

impl OpenFiles {
    /// Find a slot to use, evicting an existing file descriptor if needed.
    ///
    /// On return, we hold a lock on the slot, and its 'tag' has been updated and
    /// recently_used has been set. It's all ready for reuse.
    // NOTE(review): the type parameter of the returned `RwLockWriteGuard` appears to be
    // missing from this copy of the file. Also, `% num_slots` below panics if the slot
    // array is empty; presumably `init` is never called with 0 slots - confirm.
    async fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard) {
        //
        // Run the clock algorithm to find a slot to replace.
        //
        let num_slots = self.slots.len();
        let mut retries = 0;
        let mut slot;
        let mut slot_guard;
        let index;
        loop {
            // Advance the clock hand; the counter itself is never reduced, only its
            // value modulo the slot count is used.
            let next = self.next.fetch_add(1, Ordering::AcqRel) % num_slots;
            slot = &self.slots[next];

            // If the recently_used flag on this slot is set, continue the clock
            // sweep. Otherwise try to use this slot. If we cannot acquire the
            // lock, also continue the clock sweep.
            //
            // We only continue in this manner for a while, though. If we loop
            // through the array twice without finding a victim, just pick the
            // next slot and wait until we can reuse it. This way, we avoid
            // spinning in the extreme case that all the slots are busy with an
            // I/O operation.
            if retries < num_slots * 2 {
                // `swap(false)` clears the flag as the hand passes over the slot.
                if !slot.recently_used.swap(false, Ordering::Release) {
                    if let Ok(guard) = slot.inner.try_write() {
                        slot_guard = guard;
                        index = next;
                        break;
                    }
                }
                retries += 1;
            } else {
                slot_guard = slot.inner.write().await;
                index = next;
                break;
            }
        }

        //
        // We now have the victim slot locked. If it was in use previously, close the
        // old file.
        //
        if let Some(old_file) = slot_guard.file.take() {
            // the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
            // distinguish the two.
            STORAGE_IO_TIME_METRIC
                .get(StorageIoOperation::CloseByReplace)
                .observe_closure_duration(|| drop(old_file));
        }

        // Prepare the slot for reuse and return it
        slot_guard.tag += 1;
        slot.recently_used.store(true, Ordering::Relaxed);
        (
            SlotHandle {
                index,
                tag: slot_guard.tag,
            },
            slot_guard,
        )
    }
}

/// Identify error types that should always terminate the process. Other
/// error types may be eligible for retry.
pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
    use nix::errno::Errno::*;
    // Errors that carry no raw OS error code map to `None` and take the `_` arm.
    match e.raw_os_error().map(nix::errno::Errno::from_raw) {
        Some(EIO) => {
            // Terminate on EIO because we no longer trust the device to store
            // data safely, or to uphold persistence guarantees on fsync.
            true
        }
        Some(EROFS) => {
            // Terminate on EROFS because a filesystem is usually remounted
            // readonly when it has experienced some critical issue, so the same
            // logic as EIO applies.
            true
        }
        Some(EACCES) => {
            // Terminate on EACCES because we should always have permissions
            // for our own data dir: if we don't, then we can't do our job and
            // need administrative intervention to fix permissions. Terminating
            // is the best way to make sure we stop cleanly rather than going
            // into infinite retry loops, and will make it clear to the outside
            // world that we need help.
            true
        }
        _ => {
            // Treat all other local file I/O errors as retryable. This includes:
            // - ENOSPC: we stay up and wait for eviction to free some space
            // - EINVAL, EBADF, EBADFD: this is a code bug, not a filesystem/hardware issue
            // - WriteZero, Interrupted: these are used internally by VirtualFile
            false
        }
    }
}

/// Call this when the local filesystem gives us an error with an external
/// cause: this includes EIO, EROFS, and EACCES: all these indicate either
/// bad storage or bad configuration, and we can't fix that from inside
/// a running process.
pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> !
{
    // Log the error together with a backtrace, then abort the process.
    let backtrace = std::backtrace::Backtrace::force_capture();
    // NOTE(review): the format string has an unbalanced `)` after `{context}`.
    tracing::error!("Fatal I/O error: {e}: {context})\n{backtrace}");
    std::process::abort();
}

// Extension methods for I/O results that terminate the process on errors.
// NOTE(review): generic parameters (on the trait, the impl, and `std::io::Result`)
// appear to be missing from this copy of the file; confirm against upstream.
pub(crate) trait MaybeFatalIo {
    fn maybe_fatal_err(self, context: &str) -> std::io::Result;
    fn fatal_err(self, context: &str) -> T;
}

impl MaybeFatalIo for std::io::Result {
    /// Terminate the process if the result is an error of a fatal type, else pass it through
    ///
    /// This is appropriate for writes, where we typically want to die on EIO/EACCES etc, but
    /// not on ENOSPC.
    fn maybe_fatal_err(self, context: &str) -> std::io::Result {
        if let Err(e) = &self {
            if is_fatal_io_error(e) {
                on_fatal_io_error(e, context);
            }
        }
        self
    }

    /// Terminate the process on any I/O error.
    ///
    /// This is appropriate for reads on files that we know exist: they should always work.
    fn fatal_err(self, context: &str) -> T {
        match self {
            Ok(v) => v,
            Err(e) => {
                on_fatal_io_error(&e, context);
            }
        }
    }
}

/// Observe duration for the given storage I/O operation
///
/// Unlike `observe_closure_duration`, this supports async,
/// where "support" means that we measure wall clock time.
macro_rules! observe_duration {
    ($op:expr, $($body:tt)*) => {{
        let instant = Instant::now();
        let result = $($body)*;
        let elapsed = instant.elapsed().as_secs_f64();
        STORAGE_IO_TIME_METRIC
            .get($op)
            .observe(elapsed);
        result
    }}
}

// Binds the guard returned by `lock_file()` to `$ident` (optionally `mut`) and runs
// `$body` under `observe_duration!`. Because of the `?`, the enclosing function must
// return a `Result`, and the time spent in `lock_file()` is not part of the measurement.
macro_rules! with_file {
    ($this:expr, $op:expr, | $ident:ident | $($body:tt)*) => {{
        let $ident = $this.lock_file().await?;
        observe_duration!($op, $($body)*)
    }};
    ($this:expr, $op:expr, | mut $ident:ident | $($body:tt)*) => {{
        let mut $ident = $this.lock_file().await?;
        observe_duration!($op, $($body)*)
    }};
}

impl VirtualFileInner {
    /// Open a file in read-only mode. Like File::open.
    pub async fn open>(
        path: P,
        ctx: &RequestContext,
    ) -> Result {
        Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await
    }

    /// Open a file with given options.
///
    /// Note: If any custom flags were set in 'open_options' through OpenOptionsExt,
    /// they will be applied also when the file is subsequently re-opened, not only
    /// on the first time. Make sure that's sane!
    // NOTE(review): generic parameter lists (`open_with_options>(`, `-> Result {`,
    // `crashsafe_overwrite + Send, ...`) appear to be missing from this copy of the
    // file; confirm against upstream.
    pub async fn open_with_options>(
        path: P,
        open_options: OpenOptions,
        _ctx: &RequestContext,
    ) -> Result {
        let path = path.as_ref();
        // Reserve a cache slot first; the slot stays write-locked until `slot_guard` is
        // dropped when this function returns.
        let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;

        // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
        // where our caller doesn't get to use the returned VirtualFile before its
        // slot gets re-used by someone else.
        let file = observe_duration!(StorageIoOperation::Open, {
            open_options.open(path.as_std_path()).await?
        });

        // Strip all options other than read and write.
        //
        // It would perhaps be nicer to check just for the read and write flags
        // explicitly, but OpenOptions doesn't contain any functions to read flags,
        // only to set them.
        let reopen_options = open_options
            .clone()
            .create(false)
            .create_new(false)
            .truncate(false);

        let vfile = VirtualFileInner {
            handle: RwLock::new(handle),
            path: path.to_owned(),
            open_options: reopen_options,
        };

        // TODO: Under pressure, it's likely the slot will get re-used and
        // the underlying file closed before they get around to using it.
        // => https://github.com/neondatabase/neon/issues/6065
        slot_guard.file.replace(file);

        Ok(vfile)
    }

    /// Async version of [`::utils::crashsafe::overwrite`].
    ///
    /// # NB:
    ///
    /// Doesn't actually use the [`VirtualFile`] file descriptor cache, but,
    /// it did at an earlier time.
    /// And it will use this module's [`io_engine`] in the near future, so, leaving it here.
    pub async fn crashsafe_overwrite + Send, Buf: IoBuf + Send>(
        final_path: Utf8PathBuf,
        tmp_path: Utf8PathBuf,
        content: B,
    ) -> std::io::Result<()> {
        // TODO: use tokio_epoll_uring if configured as `io_engine`.
// See https://github.com/neondatabase/neon/issues/6663 tokio::task::spawn_blocking(move || { let slice_storage; let content_len = content.bytes_init(); let content = if content.bytes_init() > 0 { slice_storage = Some(content.slice(0..content_len)); slice_storage.as_deref().expect("just set it to Some()") } else { &[] }; utils::crashsafe::overwrite(&final_path, &tmp_path, content) .maybe_fatal_err("crashsafe_overwrite") }) .await .expect("blocking task is never aborted") } /// Call File::sync_all() on the underlying File. pub async fn sync_all(&self) -> Result<(), Error> { with_file!(self, StorageIoOperation::Fsync, |file_guard| { let (_file_guard, res) = io_engine::get().sync_all(file_guard).await; res.maybe_fatal_err("sync_all") }) } /// Call File::sync_data() on the underlying File. pub async fn sync_data(&self) -> Result<(), Error> { with_file!(self, StorageIoOperation::Fsync, |file_guard| { let (_file_guard, res) = io_engine::get().sync_data(file_guard).await; res.maybe_fatal_err("sync_data") }) } pub async fn metadata(&self) -> Result { with_file!(self, StorageIoOperation::Metadata, |file_guard| { let (_file_guard, res) = io_engine::get().metadata(file_guard).await; res }) } pub async fn set_len(&self, len: u64, _ctx: &RequestContext) -> Result<(), Error> { with_file!(self, StorageIoOperation::SetLen, |file_guard| { let (_file_guard, res) = io_engine::get().set_len(file_guard, len).await; res.maybe_fatal_err("set_len") }) } /// Helper function internal to `VirtualFile` that looks up the underlying File, /// opens it and evicts some other File if necessary. The passed parameter is /// assumed to be a function available for the physical `File`. /// /// We are doing it via a macro as Rust doesn't support async closures that /// take on parameters with lifetimes. async fn lock_file(&self) -> Result { let open_files = get_open_files(); let mut handle_guard = { // Read the cached slot handle, and see if the slot that it points to still // contains our File. 
// // We only need to hold the handle lock while we read the current handle. If // another thread closes the file and recycles the slot for a different file, // we will notice that the handle we read is no longer valid and retry. let mut handle = *self.handle.read().await; loop { // Check if the slot contains our File { let slot = &open_files.slots[handle.index]; let slot_guard = slot.inner.read().await; if slot_guard.tag == handle.tag && slot_guard.file.is_some() { // Found a cached file descriptor. slot.recently_used.store(true, Ordering::Relaxed); return Ok(FileGuard { slot_guard }); } } // The slot didn't contain our File. We will have to open it ourselves, // but before that, grab a write lock on handle in the VirtualFile, so // that no other thread will try to concurrently open the same file. let handle_guard = self.handle.write().await; // If another thread changed the handle while we were not holding the lock, // then the handle might now be valid again. Loop back to retry. if *handle_guard != handle { handle = *handle_guard; continue; } break handle_guard; } }; // We need to open the file ourselves. The handle in the VirtualFile is // now locked in write-mode. Find a free slot to put it in. let (handle, mut slot_guard) = open_files.find_victim_slot().await; // Re-open the physical file. // NB: we use StorageIoOperation::OpenAferReplace for this to distinguish this // case from StorageIoOperation::Open. This helps with identifying thrashing // of the virtual file descriptor cache. let file = observe_duration!(StorageIoOperation::OpenAfterReplace, { self.open_options.open(self.path.as_std_path()).await? }); // Store the File in the slot and update the handle in the VirtualFile // to point to it. slot_guard.file.replace(file); *handle_guard = handle; Ok(FileGuard { slot_guard: slot_guard.downgrade(), }) } /// Read the file contents in range `offset..(offset + slice.bytes_total())` into `slice[0..slice.bytes_total()]`. 
///
    /// The returned `Slice` is equivalent to the input `slice`, i.e., it's the same view into the same buffer.
    // NOTE(review): generic parameters (`Slice`, `Result, Error>`, `FullSlice`) appear
    // to be missing from this copy of the file; confirm against upstream.
    pub async fn read_exact_at(
        &self,
        slice: Slice,
        offset: u64,
        ctx: &RequestContext,
    ) -> Result, Error>
    where
        Buf: IoBufAlignedMut + Send,
    {
        // In debug builds, remember pointer and size of the input view so that the
        // returned view can be checked against them below.
        let assert_we_return_original_bounds = if cfg!(debug_assertions) {
            Some((slice.stable_ptr() as usize, slice.bytes_total()))
        } else {
            None
        };

        let original_bounds = slice.bounds();
        let (buf, res) =
            read_exact_at_impl(slice, offset, |buf, offset| self.read_at(buf, offset, ctx)).await;
        // On success, re-create the caller's view over the buffer that was handed back.
        let res = res.map(|_| buf.slice(original_bounds));

        if let Some(original_bounds) = assert_we_return_original_bounds {
            if let Ok(slice) = &res {
                let returned_bounds = (slice.stable_ptr() as usize, slice.bytes_total());
                assert_eq!(original_bounds, returned_bounds);
            }
        }

        res
    }

    /// Like [`Self::read_exact_at`] but for [`PageWriteGuard`].
    pub async fn read_exact_at_page(
        &self,
        page: PageWriteGuard<'static>,
        offset: u64,
        ctx: &RequestContext,
    ) -> Result, Error> {
        let buf = PageWriteGuardBuf { page }.slice_full();
        debug_assert_eq!(buf.bytes_total(), PAGE_SZ);
        self.read_exact_at(buf, offset, ctx)
            .await
            .map(|slice| slice.into_inner().page)
    }

    // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
    //
    // Writes all of `buf` starting at `offset`, retrying after short writes and
    // `Interrupted` errors. The buffer is handed back with its original bounds in every
    // return path.
    pub async fn write_all_at(
        &self,
        buf: FullSlice,
        mut offset: u64,
        ctx: &RequestContext,
    ) -> (FullSlice, Result<(), Error>) {
        let buf = buf.into_raw_slice();
        let bounds = buf.bounds();
        // Rebuilds the caller's full view from whatever remainder slice is left.
        let restore =
            |buf: Slice<_>| FullSlice::must_new(Slice::from_buf_bounds(buf.into_inner(), bounds));
        let mut buf = buf;
        while !buf.is_empty() {
            let (tmp, res) = self.write_at(FullSlice::must_new(buf), offset, ctx).await;
            buf = tmp.into_raw_slice();
            match res {
                // A write of zero bytes would never make progress; report it as an error.
                Ok(0) => {
                    return (
                        restore(buf),
                        Err(Error::new(
                            std::io::ErrorKind::WriteZero,
                            "failed to write whole buffer",
                        )),
                    );
                }
                Ok(n) => {
                    // Skip the bytes that were written and continue with the rest.
                    buf = buf.slice(n..);
                    offset += n as u64;
                }
                Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {}
                Err(e) => return (restore(buf), Err(e)),
            }
        }
(restore(buf), Ok(()))
    }

    // Single positional read through the I/O engine. The buffer is handed back in every
    // case; on success the number of bytes read is added to the read-size metric of `ctx`.
    // NOTE(review): generic parameters (`Slice`, `Result`, `FullSlice`) appear to be
    // missing from this copy of the file; confirm against upstream.
    pub(super) async fn read_at(
        &self,
        buf: tokio_epoll_uring::Slice,
        offset: u64,
        ctx: &RequestContext,
    ) -> (tokio_epoll_uring::Slice, Result)
    where
        Buf: tokio_epoll_uring::IoBufMut + Send,
    {
        self.validate_direct_io(
            Slice::stable_ptr(&buf).addr(),
            Slice::bytes_total(&buf),
            offset,
        );

        let file_guard = match self
            .lock_file()
            .await
            .maybe_fatal_err("lock_file inside VirtualFileInner::read_at")
        {
            Ok(file_guard) => file_guard,
            Err(e) => return (buf, Err(e)),
        };

        observe_duration!(StorageIoOperation::Read, {
            let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
            let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at");
            if let Ok(size) = res {
                ctx.io_size_metrics().read.add(size.into_u64());
            }
            (buf, res)
        })
    }

    // Single positional write through the I/O engine; the write-side counterpart of
    // `read_at`.
    // NOTE(review): unlike `read_at`, a failure of `lock_file()` is returned here
    // without going through `maybe_fatal_err` - confirm that this is intended.
    async fn write_at(
        &self,
        buf: FullSlice,
        offset: u64,
        ctx: &RequestContext,
    ) -> (FullSlice, Result) {
        self.validate_direct_io(buf.as_ptr().addr(), buf.len(), offset);

        let file_guard = match self.lock_file().await {
            Ok(file_guard) => file_guard,
            Err(e) => return (buf, Err(e)),
        };
        observe_duration!(StorageIoOperation::Write, {
            let ((_file_guard, buf), result) =
                io_engine::get().write_at(file_guard, offset, buf).await;
            let result = result.maybe_fatal_err("write_at");
            if let Ok(size) = result {
                ctx.io_size_metrics().write.add(size.into_u64());
            }
            (buf, result)
        })
    }

    /// Validate all reads and writes to adhere to the O_DIRECT requirements of our production systems.
    ///
    /// Validating it in userspace sets a consistent bar, independent of what actual OS/filesystem/block device is in use.
    fn validate_direct_io(&self, addr: usize, size: usize, offset: u64) {
        // TODO: eventually enable validation in the builds we use in real environments like staging, preprod, and prod.
        if !(cfg!(feature = "testing") || cfg!(test)) {
            return;
        }
        // Files that were not opened for direct I/O have nothing to validate.
        if !self.open_options.is_direct() {
            return;
        }

        // Validate buffer memory alignment.
// // What practically matters as of Linux 6.1 is bdev_dma_alignment() // which is practically between 512 and 4096. // On our production systems, the value is 512. // The IoBuffer/IoBufferMut hard-code that value. // // Because the alloctor might return _more_ aligned addresses than requested, // there is a chance that testing would not catch violations of a runtime requirement stricter than 512. { let requirement = get_io_buffer_alignment(); let remainder = addr % requirement; assert!( remainder == 0, "Direct I/O buffer must be aligned: buffer_addr=0x{addr:x} % 0x{requirement:x} = 0x{remainder:x}" ); } // Validate offset alignment. // // We hard-code 512 throughout the code base. // So enforce just that and not anything more restrictive. // Even the shallowest testing will expose more restrictive requirements if those ever arise. { let requirement = get_io_buffer_alignment() as u64; let remainder = offset % requirement; assert!( remainder == 0, "Direct I/O offset must be aligned: offset=0x{offset:x} % 0x{requirement:x} = 0x{remainder:x}" ); } // Validate buffer size multiple requirement. // // The requirement in Linux 6.1 is bdev_logical_block_size(). // On our production systems, that is 512. 
{
            let requirement = get_io_buffer_alignment();
            let remainder = size % requirement;
            assert!(
                remainder == 0,
                "Direct I/O buffer size must be a multiple of {requirement}: size=0x{size:x} % 0x{requirement:x} = 0x{remainder:x}"
            );
        }
    }
}

// Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
//
// Calls `read_at` repeatedly until the whole of `buf` has been filled, advancing the
// offset by the number of bytes each call reports. A call that reports 0 bytes ends the
// loop early; `Interrupted` errors are retried; any other error is returned together
// with the underlying buffer.
// NOTE(review): the generic parameter list of this function and the type parameters in
// its `where` clause appear to be missing from this copy of the file; confirm against
// upstream.
pub async fn read_exact_at_impl(
    mut buf: tokio_epoll_uring::Slice,
    mut offset: u64,
    mut read_at: F,
) -> (Buf, std::io::Result<()>)
where
    Buf: IoBufMut + Send,
    F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut,
    Fut: std::future::Future, std::io::Result)>,
{
    while buf.bytes_total() != 0 {
        let res;
        (buf, res) = read_at(buf, offset).await;
        match res {
            Ok(0) => break,
            Ok(n) => {
                // Narrow the view to the part that still has to be filled.
                buf = buf.slice(n..);
                offset += n as u64;
            }
            Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
            Err(e) => return (buf.into_inner(), Err(e)),
        }
    }
    // NB: don't use `buf.is_empty()` here; it is from the
    // `impl Deref for Slice { Target = [u8] }`; the &[u8]
    // returned by it only covers the initialized portion of `buf`.
    // Whereas we're interested in ensuring that we filled the entire
    // buffer that the user passed in.
if buf.bytes_total() != 0 {
        // The loop ended before the remaining view shrank to zero: the read hit EOF.
        (
            buf.into_inner(),
            Err(std::io::Error::new(
                std::io::ErrorKind::UnexpectedEof,
                "failed to fill whole buffer",
            )),
        )
    } else {
        assert_eq!(buf.len(), buf.bytes_total());
        (buf.into_inner(), Ok(()))
    }
}

// NOTE(review): type parameters (`Result>`, `VecDeque`, `Slice>`) appear to be missing
// from this copy of the test module; confirm against upstream.
#[cfg(test)]
mod test_read_exact_at_impl {

    use std::collections::VecDeque;
    use std::sync::Arc;

    use tokio_epoll_uring::{BoundedBuf, BoundedBufMut};

    use super::read_exact_at_impl;

    // One scripted `read_at` call: the arguments it must be called with and the result
    // it produces.
    struct Expectation {
        offset: u64,
        bytes_total: usize,
        result: std::io::Result>,
    }

    // Mock that replays `expectations` in order, one per `read_at` call.
    struct MockReadAt {
        expectations: VecDeque,
    }

    impl MockReadAt {
        async fn read_at(
            &mut self,
            mut buf: tokio_epoll_uring::Slice>,
            offset: u64,
        ) -> (tokio_epoll_uring::Slice>, std::io::Result) {
            let exp = self
                .expectations
                .pop_front()
                .expect("read_at called but we have no expectations left");
            assert_eq!(exp.offset, offset);
            assert_eq!(exp.bytes_total, buf.bytes_total());
            match exp.result {
                Ok(bytes) => {
                    assert!(bytes.len() <= buf.bytes_total());
                    buf.put_slice(&bytes);
                    (buf, Ok(bytes.len()))
                }
                Err(e) => (buf, Err(e)),
            }
        }
    }

    impl Drop for MockReadAt {
        // Every scripted call must have been consumed by the time the mock goes away.
        fn drop(&mut self) {
            assert_eq!(self.expectations.len(), 0);
        }
    }

    // A single call that fills the whole buffer.
    #[tokio::test]
    async fn test_basic() {
        let buf = Vec::with_capacity(5).slice_full();
        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
            expectations: VecDeque::from(vec![Expectation {
                offset: 0,
                bytes_total: 5,
                result: Ok(vec![b'a', b'b', b'c', b'd', b'e']),
            }]),
        }));
        let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
            let mock_read_at = Arc::clone(&mock_read_at);
            async move { mock_read_at.lock().await.read_at(buf, offset).await }
        })
        .await;
        assert!(res.is_ok());
        assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']);
    }

    // A zero-capacity buffer must succeed without `read_at` ever being called (the mock
    // has no expectations and would panic).
    #[tokio::test]
    async fn test_empty_buf_issues_no_syscall() {
        let buf = Vec::new().slice_full();
        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
            expectations: VecDeque::new(),
        }));
        let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
            let mock_read_at = Arc::clone(&mock_read_at);
            async move {
mock_read_at.lock().await.read_at(buf, offset).await
            }
        })
        .await;
        assert!(res.is_ok());
    }

    // A short first read must be followed by a second call at the advanced offset for
    // the remaining bytes.
    #[tokio::test]
    async fn test_two_read_at_calls_needed_until_buf_filled() {
        let buf = Vec::with_capacity(4).slice_full();
        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
            expectations: VecDeque::from(vec![
                Expectation {
                    offset: 0,
                    bytes_total: 4,
                    result: Ok(vec![b'a', b'b']),
                },
                Expectation {
                    offset: 2,
                    bytes_total: 2,
                    result: Ok(vec![b'c', b'd']),
                },
            ]),
        }));
        let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
            let mock_read_at = Arc::clone(&mock_read_at);
            async move { mock_read_at.lock().await.read_at(buf, offset).await }
        })
        .await;
        assert!(res.is_ok());
        assert_eq!(buf, vec![b'a', b'b', b'c', b'd']);
    }

    // A zero-length read before the buffer is full must surface as `UnexpectedEof`.
    #[tokio::test]
    async fn test_eof_before_buffer_full() {
        let buf = Vec::with_capacity(3).slice_full();
        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
            expectations: VecDeque::from(vec![
                Expectation {
                    offset: 0,
                    bytes_total: 3,
                    result: Ok(vec![b'a']),
                },
                Expectation {
                    offset: 1,
                    bytes_total: 2,
                    result: Ok(vec![b'b']),
                },
                Expectation {
                    offset: 2,
                    bytes_total: 1,
                    result: Ok(vec![]),
                },
            ]),
        }));
        let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
            let mock_read_at = Arc::clone(&mock_read_at);
            async move { mock_read_at.lock().await.read_at(buf, offset).await }
        })
        .await;
        let Err(err) = res else {
            panic!("should return an error");
        };
        assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof);
        assert_eq!(format!("{err}"), "failed to fill whole buffer");
        // buffer contents on error are unspecified
    }
}

// Read lock on a cache slot that currently holds an open file; the descriptor stays in
// the slot for as long as the guard is alive.
struct FileGuard {
    slot_guard: RwLockReadGuard<'static, SlotInner>,
}

// NOTE(review): the type parameter of `AsRef` appears to be missing from this copy of
// the file; `as_ref` returns `&OwnedFd`.
impl AsRef for FileGuard {
    fn as_ref(&self) -> &OwnedFd {
        // This unwrap is safe because we only create `FileGuard`s
        // if we know that the file is Some.
        self.slot_guard.file.as_ref().unwrap()
    }
}

impl FileGuard {
    /// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually.
fn with_std_file(&self, with: F) -> R
    where
        F: FnOnce(&File) -> R,
    {
        // SAFETY:
        // - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`.
        // - `&` usage below: `self` is `&`, hence Rust typesystem guarantees there is no `&mut`
        let file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) };
        let res = with(&file);
        // Give the descriptor back without closing it; it is still owned by the slot.
        // NOTE(review): if `with` panics, `file` is dropped during unwinding and closes
        // the descriptor that the slot still owns - consider `ManuallyDrop`; confirm.
        let _ = file.into_raw_fd();
        res
    }
}

impl tokio_epoll_uring::IoFd for FileGuard {
    unsafe fn as_fd(&self) -> RawFd {
        let owned_fd: &OwnedFd = self.as_ref();
        owned_fd.as_raw_fd()
    }
}

// NOTE(review): the success type in `Result, std::io::Error>` of both `read_blk`
// functions appears to be missing from this copy of the file; confirm against upstream.
#[cfg(test)]
impl VirtualFile {
    // Test-only: forwards to `VirtualFileInner::read_blk`.
    pub(crate) async fn read_blk(
        &self,
        blknum: u32,
        ctx: &RequestContext,
    ) -> Result, std::io::Error> {
        self.inner.read_blk(blknum, ctx).await
    }
}

#[cfg(test)]
impl VirtualFileInner {
    // Test-only: read block number `blknum` (`PAGE_SZ` bytes at offset
    // `blknum * PAGE_SZ`) into a freshly allocated buffer.
    pub(crate) async fn read_blk(
        &self,
        blknum: u32,
        ctx: &RequestContext,
    ) -> Result, std::io::Error> {
        use crate::page_cache::PAGE_SZ;
        let slice = IoBufferMut::with_capacity(PAGE_SZ).slice_full();
        assert_eq!(slice.bytes_total(), PAGE_SZ);
        let slice = self
            .read_exact_at(slice, blknum as u64 * (PAGE_SZ as u64), ctx)
            .await?;
        Ok(crate::tenant::block_io::BlockLease::IoBufferMut(
            slice.into_inner(),
        ))
    }
}

impl Drop for VirtualFileInner {
    /// If a VirtualFile is dropped, close the underlying file if it was open.
    fn drop(&mut self) {
        let handle = self.handle.get_mut();

        // Closes the descriptor in `slot`, but only if the slot still belongs to this
        // file, i.e. its tag still matches the one in our handle.
        fn clean_slot(slot: &Slot, mut slot_guard: RwLockWriteGuard<'_, SlotInner>, tag: u64) {
            if slot_guard.tag == tag {
                slot.recently_used.store(false, Ordering::Relaxed);
                // there is also operation "close-by-replace" for closes done on eviction for
                // comparison.
                if let Some(fd) = slot_guard.file.take() {
                    STORAGE_IO_TIME_METRIC
                        .get(StorageIoOperation::Close)
                        .observe_closure_duration(|| drop(fd));
                }
            }
        }

        // We don't have async drop so we cannot directly await the lock here.
// Instead, first do a best-effort attempt at closing the underlying // file descriptor by using `try_write`, and if that fails, spawn // a tokio task to do it asynchronously: we just want it to be // cleaned up eventually. // Most of the time, the `try_lock` should succeed though, // as we have `&mut self` access. In other words, if the slot // is still occupied by our file, there should be no access from // other I/O operations; the only other possible place to lock // the slot is the lock algorithm looking for free slots. let slot = &get_open_files().slots[handle.index]; if let Ok(slot_guard) = slot.inner.try_write() { clean_slot(slot, slot_guard, handle.tag); } else { let tag = handle.tag; tokio::spawn(async move { let slot_guard = slot.inner.write().await; clean_slot(slot, slot_guard, tag); }); }; } } impl OwnedAsyncWriter for VirtualFile { async fn write_all_at( &self, buf: FullSlice, offset: u64, ctx: &RequestContext, ) -> (FullSlice, std::io::Result<()>) { VirtualFile::write_all_at(self, buf, offset, ctx).await } async fn set_len(&self, len: u64, ctx: &RequestContext) -> std::io::Result<()> { VirtualFile::set_len(self, len, ctx).await } } impl OpenFiles { fn new(num_slots: usize) -> OpenFiles { let mut slots = Box::new(Vec::with_capacity(num_slots)); for _ in 0..num_slots { let slot = Slot { recently_used: AtomicBool::new(false), inner: RwLock::new(SlotInner { tag: 0, file: None }), }; slots.push(slot); } OpenFiles { next: AtomicUsize::new(0), slots: Box::leak(slots), } } } /// /// Initialize the virtual file module. This must be called once at page /// server startup. 
///
/// Installs the global slot table with `num_slots` entries, then records the
/// I/O mode, the I/O engine, the sync mode and the slot-count metric, in that
/// order.
///
/// # Panics
///
/// Panics with "virtual_file::init called twice" if the global slot table has
/// already been set.
#[cfg(not(test))]
pub fn init(num_slots: usize, engine: IoEngineKind, mode: IoMode, sync_mode: SyncMode) {
    let open_files = OpenFiles::new(num_slots);
    let Ok(()) = OPEN_FILES.set(open_files) else {
        panic!("virtual_file::init called twice");
    };

    set_io_mode(mode);
    io_engine::init(engine);

    let sync_mode_raw = sync_mode as u8;
    SYNC_MODE.store(sync_mode_raw, Ordering::Relaxed);

    let slot_count = num_slots as u64;
    crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(slot_count);
}

const TEST_MAX_FILE_DESCRIPTORS: usize = 10;

/// Get a handle to the global slots array.
///
/// In unit tests, page server startup doesn't happen and no one calls
/// virtual_file::init(), so the table is created lazily here with a small
/// array of `TEST_MAX_FILE_DESCRIPTORS` slots. This applies to the virtual
/// file tests below, but to all other unit tests too, so the virtual file
/// facility is always usable in unit tests.
///
/// Outside of tests this panics if `init` has not been called yet.
fn get_open_files() -> &'static OpenFiles {
    if !cfg!(test) {
        return OPEN_FILES.get().expect("virtual_file::init not called yet");
    }
    OPEN_FILES.get_or_init(|| OpenFiles::new(TEST_MAX_FILE_DESCRIPTORS))
}

/// Gets the io buffer alignment.
///
/// This is a `const fn` that returns `DEFAULT_IO_BUFFER_ALIGNMENT`, so it can be
/// used in const-generic positions such as `IoPageSlice` below.
pub(crate) const fn get_io_buffer_alignment() -> usize {
    DEFAULT_IO_BUFFER_ALIGNMENT
}

// NOTE(review): the type arguments of the two aliases below (and of `LazyLock`
// further down) appear to be partly missing in this copy -- confirm against the
// upstream file.
pub(crate) type IoBufferMut = AlignedBufferMut>;
pub(crate) type IoBuffer = AlignedBuffer>;
/// A `PAGE_SZ`-sized slice aligned to `get_io_buffer_alignment()`.
pub(crate) type IoPageSlice<'a> =
    AlignedSlice<'a, PAGE_SZ, ConstAlign<{ get_io_buffer_alignment() }>>;

/// Current I/O mode, stored as the `u8` value of an `IoMode`. Lazily initialized
/// to `IoMode::preferred()`.
static IO_MODE: LazyLock = LazyLock::new(|| AtomicU8::new(IoMode::preferred() as u8));

/// Overrides the global I/O mode.
pub fn set_io_mode(mode: IoMode) {
    IO_MODE.store(mode as u8, std::sync::atomic::Ordering::Relaxed);
}

/// Reads the global I/O mode back. Panics if the stored byte does not convert
/// to an `IoMode`.
pub(crate) fn get_io_mode() -> IoMode {
    IoMode::try_from(IO_MODE.load(Ordering::Relaxed)).unwrap()
}

/// Current sync mode, stored as the `u8` value of a `SyncMode`; starts out as
/// `SyncMode::Sync`.
static SYNC_MODE: AtomicU8 = AtomicU8::new(SyncMode::Sync as u8);

#[cfg(test)]
mod tests {
    use std::os::unix::fs::FileExt;
    use std::sync::Arc;

    use owned_buffers_io::io_buf_ext::IoBufExt;
    use rand::Rng;
    use rand::seq::SliceRandom;

    use super::*;
    use crate::context::DownloadBehavior;
    use crate::task_mgr::TaskKind;

    /// Writes two files through `VirtualFile`, opens one of them 100 more times
    /// so that the small test fd cache has to evict, and checks that every handle
    /// (including the evicted `file_a`) still reads back the expected content.
    #[tokio::test]
    async fn test_virtual_files() -> anyhow::Result<()> {
        let ctx =
            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
        let testdir = crate::config::PageServerConf::test_repo_dir("test_virtual_files");
        std::fs::create_dir_all(&testdir)?;

        // Copies `content` into the front of a zeroed 512-byte buffer.
        let zeropad512 = |content: &[u8]| {
            let mut buf = IoBufferMut::with_capacity_zeroed(512);
            buf[..content.len()].copy_from_slice(content);
            buf.freeze().slice_len()
        };

        let path_a = testdir.join("file_a");
        let file_a = VirtualFile::open_with_options_v2(
            path_a.clone(),
            OpenOptions::new()
                .read(true)
                .write(true)
                // set create & truncate flags to ensure when we trigger a reopen later in this test,
                // the reopen_options must have masked out those flags; if they don't, then
                // after the reopen we will fail to read the `content_a` that we write here.
                .create(true)
                .truncate(true),
            &ctx,
        )
        .await?;
        let (_, res) = file_a.write_all_at(zeropad512(b"content_a"), 0, &ctx).await;
        res?;

        let path_b = testdir.join("file_b");
        let file_b = VirtualFile::open_with_options_v2(
            path_b.clone(),
            OpenOptions::new()
                .read(true)
                .write(true)
                .create(true)
                .truncate(true),
            &ctx,
        )
        .await?;
        let (_, res) = file_b.write_all_at(zeropad512(b"content_b"), 0, &ctx).await;
        res?;

        // Reads the first 512 bytes of `vfile` and compares them with the
        // zero-padded `expect`.
        let assert_first_512_eq = async |vfile: &VirtualFile, expect: &[u8]| {
            let buf = vfile
                .read_exact_at(IoBufferMut::with_capacity_zeroed(512).slice_full(), 0, &ctx)
                .await
                .unwrap();
            assert_eq!(&buf[..], &zeropad512(expect)[..]);
        };

        // Open a lot of file descriptors / VirtualFile instances.
        // Enough to cause some evictions in the fd cache.

        let mut file_b_dupes = Vec::new();
        for _ in 0..100 {
            let vfile = VirtualFile::open_with_options_v2(
                path_b.clone(),
                OpenOptions::new().read(true),
                &ctx,
            )
            .await?;
            assert_first_512_eq(&vfile, b"content_b").await;
            file_b_dupes.push(vfile);
        }

        // make sure we opened enough files to definitely cause evictions.
        assert!(file_b_dupes.len() > TEST_MAX_FILE_DESCRIPTORS * 2);

        // The underlying file descriptor for 'file_a' should be closed now. Try to read
        // from it again. The VirtualFile reopens the file internally.
        assert_first_512_eq(&file_a, b"content_a").await;

        // Check that all the other FDs still work too. Use them in random order for
        // good measure.
        file_b_dupes.as_mut_slice().shuffle(&mut rand::rng());
        for vfile in file_b_dupes.iter_mut() {
            assert_first_512_eq(vfile, b"content_b").await;
        }

        Ok(())
    }

    /// Test using VirtualFiles from many threads concurrently. This tests both using
    /// a lot of VirtualFiles concurrently, causing evictions, and also using the same
    /// VirtualFile from multiple threads concurrently.
#[tokio::test]
async fn test_vfile_concurrency() -> Result<(), Error> {
    const SIZE: usize = 8 * 1024;
    const VIRTUAL_FILES: usize = 100;
    const THREADS: usize = 100;
    const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];

    let ctx =
        RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
    let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency");
    std::fs::create_dir_all(&testdir)?;

    // Create a test file.
    let test_file_path = testdir.join("concurrency_test_file");
    {
        let file = File::create(&test_file_path)?;
        file.write_all_at(&SAMPLE, 0)?;
    }

    // Open the file many times.
    let mut files = Vec::new();
    for _ in 0..VIRTUAL_FILES {
        let f = VirtualFile::open_with_options_v2(
            &test_file_path,
            OpenOptions::new().read(true),
            &ctx,
        )
        .await?;
        files.push(f);
    }
    let files = Arc::new(files);

    // Launch many threads, and use the virtual files concurrently in random order.
    let rt = tokio::runtime::Builder::new_multi_thread()
        .worker_threads(THREADS)
        .thread_name("test_vfile_concurrency thread")
        .build()
        .unwrap();
    let mut hdls = Vec::new();
    for _threadno in 0..THREADS {
        let files = files.clone();
        let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error);
        let hdl = rt.spawn(async move {
            // The buffer is handed to each read and taken back from its result.
            let mut buf = IoBufferMut::with_capacity_zeroed(SIZE);
            for _ in 1..1000 {
                let f = &files[rand::rng().random_range(0..files.len())];
                buf = f
                    .read_exact_at(buf.slice_full(), 0, &ctx)
                    .await
                    .unwrap()
                    .into_inner();
                assert!(buf[..] == SAMPLE);
            }
        });
        hdls.push(hdl);
    }
    for hdl in hdls {
        hdl.await?;
    }
    // NOTE(review): the extra runtime is leaked rather than dropped; presumably
    // because it cannot be dropped from inside this async test -- confirm.
    std::mem::forget(rt);

    Ok(())
}

/// `crashsafe_overwrite` creates the file on first use, replaces its content on
/// the second call, and leaves no temp file behind either time.
#[tokio::test]
async fn test_atomic_overwrite_basic() {
    let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic");
    std::fs::create_dir_all(&testdir).unwrap();

    let path = testdir.join("myfile");
    let tmp_path = testdir.join("myfile.tmp");

    VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
        .await
        .unwrap();

    let post = std::fs::read_to_string(&path).unwrap();
    assert_eq!(post, "foo");
    assert!(!tmp_path.exists());

    VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
        .await
        .unwrap();

    let post = std::fs::read_to_string(&path).unwrap();
    assert_eq!(post, "bar");
    assert!(!tmp_path.exists());
}

/// `crashsafe_overwrite` succeeds even if a stale temp file already exists, and
/// the temp file is gone afterwards.
#[tokio::test]
async fn test_atomic_overwrite_preexisting_tmp() {
    let testdir =
        crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp");
    std::fs::create_dir_all(&testdir).unwrap();

    let path = testdir.join("myfile");
    let tmp_path = testdir.join("myfile.tmp");

    std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
    assert!(tmp_path.exists());

    VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
        .await
        .unwrap();

    let post = std::fs::read_to_string(&path).unwrap();
    assert_eq!(post, "foo");
    assert!(!tmp_path.exists());
}
}

================================================
FILE: pageserver/src/walingest.rs
================================================
//!
//! Parse PostgreSQL WAL records and store them in a neon Timeline.
//!
//! The pipeline for ingesting WAL looks like this:
//!
//! WAL receiver -> [`wal_decoder`] -> WalIngest -> Repository
//!
//! The WAL receiver receives a stream of WAL from the WAL safekeepers.
//! Records get decoded and interpreted in the [`wal_decoder`] module
//! and then stored to the Repository by WalIngest.
//!
//! The neon Repository can store page versions in two formats: as
//! page images, or as WAL records. [`wal_decoder::models::InterpretedWalRecord::from_bytes_filtered`]
//! extracts page images out of some WAL records, but mostly it's WAL
//! records. If a WAL record modifies multiple pages, WalIngest
//! will call Repository::put_rel_wal_record or put_rel_page_image functions
//! separately for each modified page.
//!
//! To reconstruct a page using a WAL record, the Repository calls the
//! code in walredo.rs. walredo.rs passes most WAL records to the WAL
//! redo Postgres process, but some records it can handle directly with
//! bespoke Rust code.

use std::backtrace::Backtrace;
use std::collections::HashMap;
use std::sync::atomic::AtomicBool;
use std::sync::{Arc, OnceLock};
use std::time::{Duration, Instant, SystemTime};

use bytes::{Buf, Bytes};
use pageserver_api::key::{Key, rel_block_to_key};
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::walrecord::*;
use postgres_ffi::{
    PgMajorVersion, TransactionId, dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch,
    fsm_logical_to_physical, pg_constants,
};
use postgres_ffi_types::TimestampTz;
use postgres_ffi_types::forknum::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
use tracing::*;
use utils::bin_ser::{DeserializeError, SerializeError};
use utils::lsn::Lsn;
use utils::rate_limit::RateLimit;
use utils::{critical_timeline, failpoint_support};
use wal_decoder::models::record::NeonWalRecord;
use wal_decoder::models::*;

use crate::ZERO_PAGE;
use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::{DatadirModification, Version};
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::{PageReconstructError, Timeline};

enum_pgversion! {CheckPoint, pgv::CheckPoint}

// Each method below dispatches on the wrapped PostgreSQL version and forwards to
// the method of the same name on the version-specific checkpoint value.
// NOTE(review): several signatures in this file (e.g. the bare `Result` return
// types) appear to have lost their type arguments in this copy -- confirm
// against the upstream file.
impl CheckPoint {
    fn encode(&self) -> Result {
        enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.encode() })
    }

    fn update_next_xid(&mut self, xid: u32) -> bool {
        enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.update_next_xid(xid) })
    }

    pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool {
        enum_pgversion_dispatch!(self, CheckPoint, cp, {
            cp.update_next_multixid(multi_xid, multi_offset)
        })
    }
}

/// Temporary limitation of WAL lag warnings after attach
///
/// After tenant attach, we want to limit WAL lag warnings because
/// we don't look at the WAL until the attach is complete, which
/// might take a while.
pub struct WalLagCooldown {
    /// Until when should this limitation apply at all
    active_until: std::time::Instant,
    /// The maximum lag to suppress. Lags above this limit get reported anyways.
    max_lag: Duration,
}

impl WalLagCooldown {
    /// Derives the cooldown window from how long the attach took:
    /// active until `attach_start + 3 * attach_duration + 120s`, suppressing lags
    /// of up to `2 * attach_duration + 60s`.
    pub fn new(attach_start: Instant, attach_duration: Duration) -> Self {
        Self {
            active_until: attach_start + attach_duration * 3 + Duration::from_secs(120),
            max_lag: attach_duration * 2 + Duration::from_secs(60),
        }
    }
}

/// WAL ingestion state for one timeline: the shard identity, the in-memory copy
/// of the checkpoint, and the rate limiters for lag warnings.
pub struct WalIngest {
    attach_wal_lag_cooldown: Arc>,
    shard: ShardIdentity,
    checkpoint: CheckPoint,
    /// Set while ingesting a record when `checkpoint` changed; `ingest_record`
    /// stores the new checkpoint and resets the flag before returning successfully.
    checkpoint_modified: bool,
    warn_ingest_lag: WarnIngestLag,
}

/// One rate limiter per kind of timestamp warning emitted by `warn_on_ingest_lag`.
struct WarnIngestLag {
    lag_msg_ratelimit: RateLimit,
    future_lsn_msg_ratelimit: RateLimit,
    timestamp_invalid_msg_ratelimit: RateLimit,
}

/// A `WalIngestErrorKind` together with the backtrace captured when it was
/// converted into a `WalIngestError`.
pub struct WalIngestError {
    pub backtrace: std::backtrace::Backtrace,
    pub kind: WalIngestErrorKind,
}

#[derive(thiserror::Error, Debug)]
pub enum WalIngestErrorKind {
    #[error(transparent)]
    #[allow(private_interfaces)]
    PageReconstructError(#[from] PageReconstructError),
    #[error(transparent)]
    DeserializationFailure(#[from] DeserializeError),
    #[error(transparent)]
    SerializationFailure(#[from] SerializeError),
    #[error("the request contains data not supported by pageserver: {0} @ {1}")]
    InvalidKey(Key, Lsn),
    #[error("twophase file for xid {0} already exists")]
    FileAlreadyExists(u64),
    #[error("slru segment {0:?}/{1} already exists")]
    SlruAlreadyExists(SlruKind, u32),
    #[error("relation already exists")]
    RelationAlreadyExists(RelTag),
    #[error("invalid reldir key {0}")]
    InvalidRelDirKey(Key),

    #[error(transparent)]
    LogicalError(anyhow::Error),
    #[error(transparent)]
    EncodeAuxFileError(anyhow::Error),
    #[error(transparent)]
    MaybeRelSizeV2Error(anyhow::Error),

    #[error("timeline shutting down")]
    Cancelled,
}

// Converts a value into its `WalIngestErrorKind` and captures a backtrace at the
// conversion site.
impl From for WalIngestError
where
    WalIngestErrorKind: From,
{
    fn from(value: T) -> Self {
        WalIngestError {
            backtrace: Backtrace::capture(),
            kind: WalIngestErrorKind::from(value),
        }
    }
}

impl std::error::Error for WalIngestError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        self.kind.source()
    }
}

// Display shows only the kind; the backtrace is available through Debug.
impl core::fmt::Display for WalIngestError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        self.kind.fmt(f)
    }
}

impl core::fmt::Debug for WalIngestError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        if f.alternate() {
            // `{:#?}`: a map with both the backtrace and the kind.
            f.debug_map()
                .key(&"backtrace")
                .value(&self.backtrace)
                .key(&"kind")
                .value(&self.kind)
                .finish()
        } else {
            // `{:?}`: the kind, followed by the backtrace only if one was captured.
            writeln!(f, "Error: {:?}", self.kind)?;
            if self.backtrace.status() == std::backtrace::BacktraceStatus::Captured {
                writeln!(f, "Stack backtrace: {:?}", self.backtrace)?;
            }
            Ok(())
        }
    }
}

/// Takes the same arguments as `anyhow::ensure!`. On failure the resulting
/// `anyhow::Error` is wrapped in `WalIngestErrorKind::LogicalError` and
/// propagated with `?` from the calling function.
#[macro_export]
macro_rules! ensure_walingest {
    ($($t:tt)*) => {
        _ = || -> Result<(), anyhow::Error> {
            anyhow::ensure!($($t)*);
            Ok(())
        }().map_err(WalIngestErrorKind::LogicalError)?;
    };
}

impl WalIngest {
    /// Creates the ingest state for `timeline`, loading the checkpoint as of
    /// `startpoint` and decoding it for the timeline's PostgreSQL version.
    pub async fn new(
        timeline: &Timeline,
        startpoint: Lsn,
        ctx: &RequestContext,
    ) -> Result {
        // Fetch the latest checkpoint into memory, so that we can compare with it
        // quickly in `ingest_record` and update it when it changes.
        let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
        let pgversion = timeline.pg_version;

        let checkpoint = dispatch_pgversion!(pgversion, {
            let checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?;
            trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
            >::into(checkpoint)
        });

        Ok(WalIngest {
            shard: *timeline.get_shard_identity(),
            checkpoint,
            checkpoint_modified: false,
            attach_wal_lag_cooldown: timeline.attach_wal_lag_cooldown.clone(),
            warn_ingest_lag: WarnIngestLag {
                lag_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
                future_lsn_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
                timestamp_invalid_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
            },
        })
    }

    /// Ingest an interpreted PostgreSQL WAL record by doing writes to the underlying key value
    /// storage of a given timeline.
    ///
    /// This function updates `lsn` field of `DatadirModification`
    ///
    /// This function returns `true` if the record was ingested, and `false` if it was filtered out
    pub async fn ingest_record(
        &mut self,
        interpreted: InterpretedWalRecord,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
    ) -> Result {
        WAL_INGEST.records_received.inc();
        // Remembered so that the return value can tell whether this record added
        // anything to the modification.
        let prev_len = modification.len();

        modification.set_lsn(interpreted.next_record_lsn)?;

        if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) {
            // Records of this type should always be preceded by a commit(), as they
            // rely on reading data pages back from the Timeline.
            assert!(!modification.has_dirty_data());
        }

        assert!(!self.checkpoint_modified);
        if interpreted.xid != pg_constants::INVALID_TRANSACTION_ID
            && self.checkpoint.update_next_xid(interpreted.xid)
        {
            self.checkpoint_modified = true;
        }

        failpoint_support::sleep_millis_async!("wal-ingest-record-sleep");

        // Apply the metadata side effects of the record, one handler per
        // resource-manager record type.
        match interpreted.metadata_record {
            Some(MetadataRecord::Heapam(rec)) => match rec {
                HeapamRecord::ClearVmBits(clear_vm_bits) => {
                    self.ingest_clear_vm_bits(clear_vm_bits, modification, ctx)
                        .await?;
                }
            },
            Some(MetadataRecord::Neonrmgr(rec)) => match rec {
                NeonrmgrRecord::ClearVmBits(clear_vm_bits) => {
                    self.ingest_clear_vm_bits(clear_vm_bits, modification, ctx)
                        .await?;
                }
            },
            Some(MetadataRecord::Smgr(rec)) => match rec {
                SmgrRecord::Create(create) => {
                    self.ingest_xlog_smgr_create(create, modification, ctx)
                        .await?;
                }
                SmgrRecord::Truncate(truncate) => {
                    self.ingest_xlog_smgr_truncate(truncate, modification, ctx)
                        .await?;
                }
            },
            Some(MetadataRecord::Dbase(rec)) => match rec {
                DbaseRecord::Create(create) => {
                    self.ingest_xlog_dbase_create(create, modification, ctx)
                        .await?;
                }
                DbaseRecord::Drop(drop) => {
                    self.ingest_xlog_dbase_drop(drop, modification, ctx).await?;
                }
            },
            Some(MetadataRecord::Clog(rec)) => match rec {
                ClogRecord::ZeroPage(zero_page) => {
                    self.ingest_clog_zero_page(zero_page, modification, ctx)
                        .await?;
                }
                ClogRecord::Truncate(truncate) => {
                    self.ingest_clog_truncate(truncate, modification, ctx)
                        .await?;
                }
            },
            Some(MetadataRecord::Xact(rec)) => {
                self.ingest_xact_record(rec, modification, ctx).await?;
            }
            Some(MetadataRecord::MultiXact(rec)) => match rec {
                MultiXactRecord::ZeroPage(zero_page) => {
                    self.ingest_multixact_zero_page(zero_page, modification, ctx)
                        .await?;
                }
                MultiXactRecord::Create(create) => {
                    self.ingest_multixact_create(modification, &create)?;
                }
                MultiXactRecord::Truncate(truncate) => {
                    self.ingest_multixact_truncate(modification, &truncate, ctx)
                        .await?;
                }
            },
            Some(MetadataRecord::Relmap(rec)) => match rec {
                RelmapRecord::Update(update) => {
                    self.ingest_relmap_update(update, modification, ctx).await?;
                }
            },
            Some(MetadataRecord::Xlog(rec)) => match rec {
                XlogRecord::Raw(raw) => {
                    self.ingest_raw_xlog_record(raw, modification, ctx).await?;
                }
            },
            Some(MetadataRecord::LogicalMessage(rec)) => match rec {
                LogicalMessageRecord::Put(put) => {
                    self.ingest_logical_message_put(put, modification, ctx)
                        .await?;
                }
                #[cfg(feature = "testing")]
                LogicalMessageRecord::Failpoint => {
                    // This is a convenient way to make the WAL ingestion pause at
                    // particular point in the WAL. For more fine-grained control,
                    // we could peek into the message and only pause if it contains
                    // a particular string, for example, but this is enough for now.
                    failpoint_support::sleep_millis_async!(
                        "pageserver-wal-ingest-logical-message-sleep"
                    );
                }
            },
            Some(MetadataRecord::Standby(rec)) => {
                // NOTE(review): unlike the other arms, a failure here panics
                // instead of being propagated -- confirm this is intended.
                self.ingest_standby_record(rec).unwrap();
            }
            Some(MetadataRecord::Replorigin(rec)) => {
                self.ingest_replorigin_record(rec, modification).await?;
            }
            None => {
                // There are two cases through which we end up here:
                // 1. The resource manager for the original PG WAL record
                //    is [`pg_constants::RM_TBLSPC_ID`]. This is not a supported
                //    record type within Neon.
                // 2. The resource manager id was unknown to
                //    [`wal_decoder::decoder::MetadataRecord::from_decoded`].
                // TODO(vlad): Tighten this up more once we build confidence
                // that case (2) does not happen in the field.
            }
        }

        modification
            .ingest_batch(interpreted.batch, &self.shard, ctx)
            .await?;

        // If checkpoint data was updated, store the new version in the repository
        if self.checkpoint_modified {
            let new_checkpoint_bytes = self.checkpoint.encode()?;

            modification.put_checkpoint(new_checkpoint_bytes)?;
            self.checkpoint_modified = false;
        }

        // Note that at this point this record is only cached in the modification
        // until commit() is called to flush the data into the repository and update
        // the latest LSN.

        Ok(modification.len() > prev_len)
    }

    /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL
    ///
    /// Widens the 32-bit `xid` to 64 bits using the epoch of the checkpoint's
    /// `nextXid`. An `xid` greater than the low 32 bits of `nextXid` is taken to
    /// belong to the previous epoch; that is an error if the epoch is 0.
    fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result {
        let next_full_xid =
            enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value });

        // Split the 64-bit value: low half is the xid, high half is the epoch.
        let next_xid = (next_full_xid) as u32;
        let mut epoch = (next_full_xid >> 32) as u32;

        if xid > next_xid {
            // Wraparound occurred, must be from a prev epoch.
            if epoch == 0 {
                Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
                    "apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"
                )))?;
            }
            epoch -= 1;
        }

        Ok(((epoch as u64) << 32) | xid as u64)
    }

    /// Records `ClearVisibilityMapFlags` WAL records against the visibility-map
    /// page(s) covering the new and/or old heap block of `clear_vm_bits`.
    ///
    /// VM pages outside this shard's view of the VM relation are reported via
    /// `critical_timeline!` and skipped rather than failing ingestion.
    async fn ingest_clear_vm_bits(
        &mut self,
        clear_vm_bits: ClearVmBits,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
    ) -> Result<(), WalIngestError> {
        let ClearVmBits {
            new_heap_blkno,
            old_heap_blkno,
            flags,
            vm_rel,
        } = clear_vm_bits;
        // Clear the VM bits if required.
        let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
        let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);

        // VM bits can only be cleared on the shard(s) owning the VM relation, and must be within
        // its view of the VM relation size. Out of caution, report a critical error instead of
        // failing WAL ingestion, as there have historically been cases where PostgreSQL has
        // cleared spurious VM pages. See:
        // https://github.com/neondatabase/neon/pull/10634.
        let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else {
            critical_timeline!(
                modification.tline.tenant_shard_id,
                modification.tline.timeline_id,
                // Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it.
                None::<&AtomicBool>,
                "clear_vm_bits for unknown VM relation {vm_rel}"
            );
            return Ok(());
        };

        if let Some(blknum) = new_vm_blk {
            if blknum >= vm_size {
                critical_timeline!(
                    modification.tline.tenant_shard_id,
                    modification.tline.timeline_id,
                    // Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it.
                    None::<&AtomicBool>,
                    "new_vm_blk {blknum} not in {vm_rel} of size {vm_size}"
                );
                new_vm_blk = None;
            }
        }
        if let Some(blknum) = old_vm_blk {
            if blknum >= vm_size {
                critical_timeline!(
                    modification.tline.tenant_shard_id,
                    modification.tline.timeline_id,
                    // Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it.
                    None::<&AtomicBool>,
                    "old_vm_blk {blknum} not in {vm_rel} of size {vm_size}"
                );
                old_vm_blk = None;
            }
        }

        if new_vm_blk.is_none() && old_vm_blk.is_none() {
            return Ok(());
        } else if new_vm_blk == old_vm_blk {
            // An UPDATE record that needs to clear the bits for both old and the new page, both of
            // which reside on the same VM page.
            self.put_rel_wal_record(
                modification,
                vm_rel,
                new_vm_blk.unwrap(),
                NeonWalRecord::ClearVisibilityMapFlags {
                    new_heap_blkno,
                    old_heap_blkno,
                    flags,
                },
                ctx,
            )
            .await?;
        } else {
            // Clear VM bits for one heap page, or for two pages that reside on different VM pages.
            if let Some(new_vm_blk) = new_vm_blk {
                self.put_rel_wal_record(
                    modification,
                    vm_rel,
                    new_vm_blk,
                    NeonWalRecord::ClearVisibilityMapFlags {
                        new_heap_blkno,
                        old_heap_blkno: None,
                        flags,
                    },
                    ctx,
                )
                .await?;
            }
            if let Some(old_vm_blk) = old_vm_blk {
                self.put_rel_wal_record(
                    modification,
                    vm_rel,
                    old_vm_blk,
                    NeonWalRecord::ClearVisibilityMapFlags {
                        new_heap_blkno: None,
                        old_heap_blkno,
                        flags,
                    },
                    ctx,
                )
                .await?;
            }
        }
        Ok(())
    }

    /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record.
async fn ingest_xlog_dbase_create( &mut self, create: DbaseCreate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, ) -> Result<(), WalIngestError> { let DbaseCreate { db_id, tablespace_id, src_db_id, src_tablespace_id, } = create; let rels = modification .tline .list_rels( src_tablespace_id, src_db_id, Version::Modified(modification), ctx, ) .await?; debug!("ingest_xlog_dbase_create: {} rels", rels.len()); // Copy relfilemap let filemap = modification .tline .get_relmap_file( src_tablespace_id, src_db_id, Version::Modified(modification), ctx, ) .await?; modification .put_relmap_file(tablespace_id, db_id, filemap, ctx) .await?; let mut num_rels_copied = 0; let mut num_blocks_copied = 0; for src_rel in rels { assert_eq!(src_rel.spcnode, src_tablespace_id); assert_eq!(src_rel.dbnode, src_db_id); let nblocks = modification .tline .get_rel_size(src_rel, Version::Modified(modification), ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, dbnode: db_id, relnode: src_rel.relnode, forknum: src_rel.forknum, }; modification.put_rel_creation(dst_rel, nblocks, ctx).await?; // Copy content debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks); for blknum in 0..nblocks { // Sharding: // - src and dst are always on the same shard, because they differ only by dbNode, and // dbNode is not included in the hash inputs for sharding. // - This WAL command is replayed on all shards, but each shard only copies the blocks // that belong to it. 
let src_key = rel_block_to_key(src_rel, blknum); if !self.shard.is_key_local(&src_key) { debug!( "Skipping non-local key {} during XLOG_DBASE_CREATE", src_key ); continue; } debug!( "copying block {} from {} ({}) to {}", blknum, src_rel, src_key, dst_rel ); let content = modification .tline .get_rel_page_at_lsn( src_rel, blknum, Version::Modified(modification), ctx, crate::tenant::storage_layer::IoConcurrency::sequential(), ) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; } num_rels_copied += 1; } info!( "Created database {}/{}, copied {} blocks in {} rels", tablespace_id, db_id, num_blocks_copied, num_rels_copied ); Ok(()) } async fn ingest_xlog_dbase_drop( &mut self, dbase_drop: DbaseDrop, modification: &mut DatadirModification<'_>, ctx: &RequestContext, ) -> Result<(), WalIngestError> { let DbaseDrop { db_id, tablespace_ids, } = dbase_drop; for tablespace_id in tablespace_ids { trace!("Drop db {}, {}", tablespace_id, db_id); modification.drop_dbdir(tablespace_id, db_id, ctx).await?; } Ok(()) } async fn ingest_xlog_smgr_create( &mut self, create: SmgrCreate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, ) -> Result<(), WalIngestError> { let SmgrCreate { rel } = create; self.put_rel_creation(modification, rel, ctx).await?; Ok(()) } /// Subroutine of ingest_record(), to handle an XLOG_SMGR_TRUNCATE record. /// /// This is the same logic as in PostgreSQL's smgr_redo() function. 
async fn ingest_xlog_smgr_truncate( &mut self, truncate: XlSmgrTruncate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, ) -> Result<(), WalIngestError> { let XlSmgrTruncate { blkno, rnode, flags, } = truncate; let spcnode = rnode.spcnode; let dbnode = rnode.dbnode; let relnode = rnode.relnode; if flags & pg_constants::SMGR_TRUNCATE_HEAP != 0 { let rel = RelTag { spcnode, dbnode, relnode, forknum: MAIN_FORKNUM, }; self.put_rel_truncation(modification, rel, blkno, ctx) .await?; } if flags & pg_constants::SMGR_TRUNCATE_FSM != 0 { let rel = RelTag { spcnode, dbnode, relnode, forknum: FSM_FORKNUM, }; // Zero out the last remaining FSM page, if this shard owns it. We are not precise here, // and instead of digging in the FSM bitmap format we just clear the whole page. let fsm_logical_page_no = blkno / pg_constants::SLOTS_PER_FSM_PAGE; let mut fsm_physical_page_no = fsm_logical_to_physical(fsm_logical_page_no); if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 && self .shard .is_key_local(&rel_block_to_key(rel, fsm_physical_page_no)) { modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?; fsm_physical_page_no += 1; } // Truncate this shard's view of the FSM relation size, if it even has one. 
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0); if nblocks > fsm_physical_page_no { self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx) .await?; } } if flags & pg_constants::SMGR_TRUNCATE_VM != 0 { let rel = RelTag { spcnode, dbnode, relnode, forknum: VISIBILITYMAP_FORKNUM, }; // last remaining block, byte, and bit let mut vm_page_no = blkno / (pg_constants::VM_HEAPBLOCKS_PER_PAGE as u32); let trunc_byte = blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_PAGE / pg_constants::VM_HEAPBLOCKS_PER_BYTE; let trunc_offs = blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_BYTE * pg_constants::VM_BITS_PER_HEAPBLOCK; // Unless the new size is exactly at a visibility map page boundary, the // tail bits in the last remaining map page, representing truncated heap // blocks, need to be cleared. This is not only tidy, but also necessary // because we don't get a chance to clear the bits if the heap is extended // again. Only do this on the shard that owns the page. if (trunc_byte != 0 || trunc_offs != 0) && self.shard.is_key_local(&rel_block_to_key(rel, vm_page_no)) { modification.put_rel_wal_record( rel, vm_page_no, NeonWalRecord::TruncateVisibilityMap { trunc_byte, trunc_offs, }, )?; vm_page_no += 1; } // Truncate this shard's view of the VM relation size, if it even has one. 
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0); if nblocks > vm_page_no { self.put_rel_truncation(modification, rel, vm_page_no, ctx) .await?; } } Ok(()) } fn warn_on_ingest_lag( &mut self, conf: &crate::config::PageServerConf, wal_timestamp: TimestampTz, ) { debug_assert_current_span_has_tenant_and_timeline_id(); let now = SystemTime::now(); let rate_limits = &mut self.warn_ingest_lag; let ts = enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, _cp, { pgv::xlog_utils::try_from_pg_timestamp(wal_timestamp) }); match ts { Ok(ts) => { match now.duration_since(ts) { Ok(lag) => { if lag > conf.wait_lsn_timeout { rate_limits.lag_msg_ratelimit.call2(|rate_limit_stats| { if let Some(cooldown) = self.attach_wal_lag_cooldown.get() { if std::time::Instant::now() < cooldown.active_until && lag <= cooldown.max_lag { return; } } else { // Still loading? We shouldn't be here } let lag = humantime::format_duration(lag); warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout"); }) } } Err(e) => { let delta_t = e.duration(); // determined by prod victoriametrics query: 1000 * (timestamp(node_time_seconds{neon_service="pageserver"}) - node_time_seconds) // => https://www.robustperception.io/time-metric-from-the-node-exporter/ const IGNORED_DRIFT: Duration = Duration::from_millis(100); if delta_t > IGNORED_DRIFT { let delta_t = humantime::format_duration(delta_t); rate_limits.future_lsn_msg_ratelimit.call2(|rate_limit_stats| { warn!(%rate_limit_stats, %delta_t, "ingesting record with timestamp from future"); }) } } }; } Err(error) => { rate_limits.timestamp_invalid_msg_ratelimit.call2(|rate_limit_stats| { warn!(%rate_limit_stats, %error, "ingesting record with invalid timestamp, cannot calculate lag and will fail find-lsn-for-timestamp type queries"); }) } } } /// Subroutine of ingest_record(), to handle an XLOG_XACT_* records. 
///
    /// A `Prepare` record only stores the two-phase state file and returns.
    /// Commit and abort records (plain or prepared) write CLOG update records
    /// for the transaction and its subtransactions, drop the relations listed
    /// in the record, record the replication origin if `origin_id` is non-zero
    /// and, for the prepared variants, remove the two-phase state file.
    async fn ingest_xact_record(
        &mut self,
        record: XactRecord,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
    ) -> Result<(), WalIngestError> {
        let (xact_common, is_commit, is_prepared) = match record {
            XactRecord::Prepare(XactPrepare { xl_xid, data }) => {
                // On PG17 and later the 32-bit XID is first converted with
                // adjust_to_full_transaction_id(); older versions just widen it.
                let xid: u64 = if modification.tline.pg_version >= PgMajorVersion::PG17 {
                    self.adjust_to_full_transaction_id(xl_xid)?
                } else {
                    xl_xid as u64
                };
                return modification.put_twophase_file(xid, data, ctx).await;
            }
            XactRecord::Commit(common) => (common, true, false),
            XactRecord::Abort(common) => (common, false, false),
            XactRecord::CommitPrepared(common) => (common, true, true),
            XactRecord::AbortPrepared(common) => (common, false, true),
        };

        let XactCommon {
            parsed,
            origin_id,
            xl_xid,
            lsn,
        } = xact_common;

        // Record update of CLOG pages
        let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE;
        let mut segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
        let mut rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
        // XIDs accumulated so far for the CLOG page identified by (segno, rpageno).
        let mut page_xids: Vec = vec![parsed.xid];

        self.warn_on_ingest_lag(modification.tline.conf, parsed.xact_time);

        for subxact in &parsed.subxacts {
            let subxact_pageno = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
            if subxact_pageno != pageno {
                // This subxact goes to different page. Write the record
                // for all the XIDs on the previous page, and continue
                // accumulating XIDs on this new page.
                modification.put_slru_wal_record(
                    SlruKind::Clog,
                    segno,
                    rpageno,
                    if is_commit {
                        NeonWalRecord::ClogSetCommitted {
                            xids: page_xids,
                            timestamp: parsed.xact_time,
                        }
                    } else {
                        NeonWalRecord::ClogSetAborted { xids: page_xids }
                    },
                )?;
                page_xids = Vec::new();
            }
            pageno = subxact_pageno;
            segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
            rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
            page_xids.push(*subxact);
        }
        // Flush the XIDs accumulated for the last page.
        modification.put_slru_wal_record(
            SlruKind::Clog,
            segno,
            rpageno,
            if is_commit {
                NeonWalRecord::ClogSetCommitted {
                    xids: page_xids,
                    timestamp: parsed.xact_time,
                }
            } else {
                NeonWalRecord::ClogSetAborted { xids: page_xids }
            },
        )?;

        // Group relations to drop by dbNode. This map will contain all relations that _might_
        // exist, we will reduce it to which ones really exist later. This map can be huge if
        // the transaction touches a huge number of relations (there is no bound on this in
        // postgres).
        let mut drop_relations: HashMap<(u32, u32), Vec> = HashMap::new();

        for xnode in &parsed.xnodes {
            // One candidate per fork number in MAIN_FORKNUM..=INIT_FORKNUM.
            for forknum in MAIN_FORKNUM..=INIT_FORKNUM {
                let rel = RelTag {
                    forknum,
                    spcnode: xnode.spcnode,
                    dbnode: xnode.dbnode,
                    relnode: xnode.relnode,
                };
                drop_relations
                    .entry((xnode.spcnode, xnode.dbnode))
                    .or_default()
                    .push(rel);
            }
        }

        // Execute relation drops in a batch: the number may be huge, so deleting individually is prohibitively expensive
        modification.put_rel_drops(drop_relations, ctx).await?;

        if origin_id != 0 {
            modification
                .set_replorigin(origin_id, parsed.origin_lsn)
                .await?;
        }

        if is_prepared {
            // Remove twophase file. see RemoveTwoPhaseFile() in postgres code
            trace!(
                "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
                xl_xid,
                parsed.xid,
                lsn,
            );

            let xid: u64 = if modification.tline.pg_version >= PgMajorVersion::PG17 {
                self.adjust_to_full_transaction_id(parsed.xid)?
            } else {
                parsed.xid as u64
            };
            modification.drop_twophase_file(xid, ctx).await?;
        }

        Ok(())
    }

    /// Handle a CLOG truncation record.
    ///
    /// Updates `oldestXid` / `oldestXidDB` in the tracked checkpoint and, on
    /// shard zero, drops the CLOG segments that may be deleted according to
    /// `slru_may_delete_clogsegment()`. Nothing is dropped if the wraparound
    /// safety check fails.
    async fn ingest_clog_truncate(
        &mut self,
        truncate: ClogTruncate,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
    ) -> Result<(), WalIngestError> {
        let ClogTruncate {
            pageno,
            oldest_xid,
            oldest_xid_db,
        } = truncate;

        info!(
            "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}",
            pageno, oldest_xid, oldest_xid_db
        );

        // In Postgres, oldestXid and oldestXidDB are updated in memory when the CLOG is
        // truncated, but a checkpoint record with the updated values isn't written until
        // later. In Neon, a server can start at any LSN, not just on a checkpoint record,
        // so we keep the oldestXid and oldestXidDB up-to-date.
        enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
            cp.oldestXid = oldest_xid;
            cp.oldestXidDB = oldest_xid_db;
        });
        self.checkpoint_modified = true;

        // TODO Treat AdvanceOldestClogXid() or write a comment why we don't need it

        // CLOG page that holds the checkpoint's next XID (truncated to 32 bits).
        let latest_page_number =
            enum_pgversion_dispatch!(self.checkpoint, CheckPoint, cp, { cp.nextXid.value }) as u32
                / pg_constants::CLOG_XACTS_PER_PAGE;

        // Now delete all segments containing pages between xlrec.pageno
        // and latest_page_number.

        // First, make an important safety check:
        // the current endpoint page must not be eligible for removal.
        // See SimpleLruTruncate() in slru.c
        if dispatch_pgversion!(modification.tline.pg_version, {
            pgv::nonrelfile_utils::clogpage_precedes(latest_page_number, pageno)
        }) {
            info!("could not truncate directory pg_xact apparent wraparound");
            return Ok(());
        }

        // Iterate via SLRU CLOG segments and drop segments that we're ready to truncate
        //
        // We cannot pass 'lsn' to the Timeline.list_nonrels(), or it
        // will block waiting for the last valid LSN to advance up to
        // it. So we use the previous record's LSN in the get calls
        // instead.
if modification.tline.get_shard_identity().is_shard_zero() {
            for segno in modification
                .tline
                .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
                .await?
            {
                // First page number of this segment.
                let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;

                let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
                    pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno)
                });

                if may_delete {
                    modification
                        .drop_slru_segment(SlruKind::Clog, segno, ctx)
                        .await?;
                    trace!("Drop CLOG segment {:>04X}", segno);
                }
            }
        }

        Ok(())
    }

    /// Handle a CLOG zero-page record by storing a `ZERO_PAGE` image for the
    /// given CLOG segment/page through `put_slru_page_image()`.
    async fn ingest_clog_zero_page(
        &mut self,
        zero_page: ClogZeroPage,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
    ) -> Result<(), WalIngestError> {
        let ClogZeroPage { segno, rpageno } = zero_page;

        self.put_slru_page_image(
            modification,
            SlruKind::Clog,
            segno,
            rpageno,
            ZERO_PAGE.clone(),
            ctx,
        )
        .await
    }

    /// Handle a multixact creation record.
    ///
    /// Writes one `MultixactOffsetCreate` record for the offsets page of
    /// `xlrec.mid`, one `MultixactMembersCreate` record per members page the
    /// new members land on, and then advances next-multi-xid, next-offset and
    /// next-xid in the tracked checkpoint.
    ///
    /// Panics if the offset reached after writing all members differs from
    /// `xlrec.moff + xlrec.nmembers` (wrapping).
    fn ingest_multixact_create(
        &mut self,
        modification: &mut DatadirModification,
        xlrec: &XlMultiXactCreate,
    ) -> Result<(), WalIngestError> {
        // Create WAL record for updating the multixact-offsets page
        let pageno = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
        let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
        let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;

        modification.put_slru_wal_record(
            SlruKind::MultiXactOffsets,
            segno,
            rpageno,
            NeonWalRecord::MultixactOffsetCreate {
                mid: xlrec.mid,
                moff: xlrec.moff,
            },
        )?;

        // Create WAL records for the update of each affected multixact-members page
        let mut members = xlrec.members.iter();
        let mut offset = xlrec.moff;
        loop {
            let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;

            // How many members fit on this page?
            let page_remain = pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32
                - offset % pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;

            // Take at most `page_remain` members from the iterator.
            let mut this_page_members: Vec = Vec::new();
            for _ in 0..page_remain {
                if let Some(m) = members.next() {
                    this_page_members.push(m.clone());
                } else {
                    break;
                }
            }
            if this_page_members.is_empty() {
                // all done
                break;
            }
            let n_this_page = this_page_members.len();
            modification.put_slru_wal_record(
                SlruKind::MultiXactMembers,
                pageno / pg_constants::SLRU_PAGES_PER_SEGMENT,
                pageno % pg_constants::SLRU_PAGES_PER_SEGMENT,
                NeonWalRecord::MultixactMembersCreate {
                    moff: offset,
                    members: this_page_members,
                },
            )?;

            // Note: The multixact members can wrap around, even within one WAL record.
            offset = offset.wrapping_add(n_this_page as u32);
        }
        let next_offset = offset;
        assert!(xlrec.moff.wrapping_add(xlrec.nmembers) == next_offset);

        // Update next-multi-xid and next-offset
        //
        // NB: In PostgreSQL, the next-multi-xid stored in the control file is allowed to
        // go to 0, and it's fixed up by skipping to FirstMultiXactId in functions that
        // read it, like GetNewMultiXactId(). This is different from how nextXid is
        // incremented! nextXid skips over < FirstNormalTransactionId when the value
        // is stored, so it's never 0 in a checkpoint.
        //
        // I don't know why it's done that way, it seems less error-prone to skip over 0
        // when the value is stored rather than when it's read. But let's do it the same
        // way here.
        let next_multi_xid = xlrec.mid.wrapping_add(1);

        if self
            .checkpoint
            .update_next_multixid(next_multi_xid, next_offset)
        {
            self.checkpoint_modified = true;
        }

        // Also update the next-xid with the highest member. According to the comments in
        // multixact_redo(), this shouldn't be necessary, but let's do the same here.
        let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| {
            if let Some(max_xid) = acc {
                // Wraparound-aware comparison: the difference is interpreted as
                // a signed 32-bit value.
                if mbr.xid.wrapping_sub(max_xid) as i32 > 0 {
                    Some(mbr.xid)
                } else {
                    acc
                }
            } else {
                Some(mbr.xid)
            }
        });

        if let Some(max_xid) = max_mbr_xid {
            if self.checkpoint.update_next_xid(max_xid) {
                self.checkpoint_modified = true;
            }
        }
        Ok(())
    }

    /// Handle a multixact truncation record.
    ///
    /// Updates `oldestMulti` / `oldestMultiDB` in the tracked checkpoint and,
    /// on shard zero, drops the multixact-members segments from the start
    /// segment up to (but not including) the end segment, wrapping around at
    /// the maximum segment number.
    async fn ingest_multixact_truncate(
        &mut self,
        modification: &mut DatadirModification<'_>,
        xlrec: &XlMultiXactTruncate,
        ctx: &RequestContext,
    ) -> Result<(), WalIngestError> {
        let (maxsegment, startsegment, endsegment) =
            enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
                cp.oldestMulti = xlrec.end_trunc_off;
                cp.oldestMultiDB = xlrec.oldest_multi_db;
                let maxsegment: i32 = pgv::nonrelfile_utils::mx_offset_to_member_segment(
                    pg_constants::MAX_MULTIXACT_OFFSET,
                );
                let startsegment: i32 =
                    pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.start_trunc_memb);
                let endsegment: i32 =
                    pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.end_trunc_memb);
                (maxsegment, startsegment, endsegment)
            });

        self.checkpoint_modified = true;

        // PerformMembersTruncation
        let mut segment: i32 = startsegment;

        // Delete all the segments except the last one. The last segment can still
        // contain, possibly partially, valid data.
if modification.tline.get_shard_identity().is_shard_zero() {
            while segment != endsegment {
                modification
                    .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx)
                    .await?;

                /* move to next segment, handling wraparound correctly */
                if segment == maxsegment {
                    segment = 0;
                } else {
                    segment += 1;
                }
            }
        }

        // Truncate offsets
        // FIXME: this did not handle wraparound correctly

        Ok(())
    }

    /// Handle a multixact zero-page record by storing a `ZERO_PAGE` image for
    /// the given SLRU kind, segment and page through `put_slru_page_image()`.
    async fn ingest_multixact_zero_page(
        &mut self,
        zero_page: MultiXactZeroPage,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
    ) -> Result<(), WalIngestError> {
        let MultiXactZeroPage {
            slru_kind,
            segno,
            rpageno,
        } = zero_page;
        self.put_slru_page_image(
            modification,
            slru_kind,
            segno,
            rpageno,
            ZERO_PAGE.clone(),
            ctx,
        )
        .await
    }

    /// Handle a relation-map update by storing the new relmap file contents
    /// for the tablespace/database named in the record.
    async fn ingest_relmap_update(
        &mut self,
        update: RelmapUpdate,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
    ) -> Result<(), WalIngestError> {
        let RelmapUpdate { update, buf } = update;

        modification
            .put_relmap_file(update.tsid, update.dbid, buf, ctx)
            .await
    }

    /// Handle the XLOG records that affect the tracked checkpoint.
    ///
    /// - `XLOG_PARAMETER_CHANGE` / `XLOG_END_OF_RECOVERY`: on a V17 checkpoint,
    ///   copy `wal_level` from the record.
    /// - `XLOG_NEXTOID`: update `nextOid` if it changed.
    /// - `XLOG_CHECKPOINT_ONLINE` / `XLOG_CHECKPOINT_SHUTDOWN`: update
    ///   `oldestXid`, `oldestActiveXid` and the (repurposed) `redo` field.
    ///
    /// For a shutdown checkpoint, `prepare_basebackup(lsn)` is additionally
    /// called on the timeline.
    async fn ingest_raw_xlog_record(
        &mut self,
        raw_record: RawXlogRecord,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
    ) -> Result<(), WalIngestError> {
        let RawXlogRecord { info, lsn, mut buf } = raw_record;
        let pg_version = modification.tline.pg_version;

        // `wal_level` is only updated when the checkpoint is the V17 variant.
        if info == pg_constants::XLOG_PARAMETER_CHANGE {
            if let CheckPoint::V17(cp) = &mut self.checkpoint {
                let rec = v17::XlParameterChange::decode(&mut buf);
                cp.wal_level = rec.wal_level;
                self.checkpoint_modified = true;
            }
        } else if info == pg_constants::XLOG_END_OF_RECOVERY {
            if let CheckPoint::V17(cp) = &mut self.checkpoint {
                let rec = v17::XlEndOfRecovery::decode(&mut buf);
                cp.wal_level = rec.wal_level;
                self.checkpoint_modified = true;
            }
        }

        enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
            if info == pg_constants::XLOG_NEXTOID {
                let next_oid = buf.get_u32_le();
                if cp.nextOid != next_oid {
                    cp.nextOid = next_oid;
                    self.checkpoint_modified = true;
                }
            } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
                || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
            {
                let mut checkpoint_bytes = [0u8; pgv::xlog_utils::SIZEOF_CHECKPOINT];
                buf.copy_to_slice(&mut checkpoint_bytes);
                let xlog_checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?;
                trace!(
                    "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
                    xlog_checkpoint.oldestXid,
                    cp.oldestXid
                );
                // Only move oldestXid forward (wraparound-aware signed comparison).
                if (cp.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
                    cp.oldestXid = xlog_checkpoint.oldestXid;
                }
                trace!(
                    "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}",
                    xlog_checkpoint.oldestActiveXid,
                    cp.oldestActiveXid
                );

                // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`,
                // because at shutdown, all in-progress transactions will implicitly
                // end. Postgres startup code knows that, and allows hot standby to start
                // immediately from a shutdown checkpoint.
                //
                // In Neon, Postgres hot standby startup always behaves as if starting from
                // an online checkpoint. It needs a valid `oldestActiveXid` value, so
                // instead of overwriting self.checkpoint.oldestActiveXid with
                // InvalidTransactionid from the checkpoint WAL record, update it to a
                // proper value, knowing that there are no in-progress transactions at this
                // point, except for prepared transactions.
                //
                // See also the neon code changes in the InitWalRecovery() function.
                if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
                    && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
                {
                    let oldest_active_xid = if pg_version >= PgMajorVersion::PG17 {
                        // Minimum over the listed two-phase XIDs, compared as
                        // 64-bit values, then truncated to 32 bits.
                        let mut oldest_active_full_xid = cp.nextXid.value;
                        for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
                            if xid < oldest_active_full_xid {
                                oldest_active_full_xid = xid;
                            }
                        }
                        oldest_active_full_xid as u32
                    } else {
                        // Before PG17 the XIDs are narrowed to 32 bits first and
                        // compared with a wraparound-aware signed difference.
                        let mut oldest_active_xid = cp.nextXid.value as u32;
                        for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
                            let narrow_xid = xid as u32;
                            if (narrow_xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
                                oldest_active_xid = narrow_xid;
                            }
                        }
                        oldest_active_xid
                    };
                    cp.oldestActiveXid = oldest_active_xid;
                } else {
                    cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
                }

                // NB: We abuse the Checkpoint.redo field:
                //
                // - In PostgreSQL, the Checkpoint struct doesn't store the information
                //   of whether this is an online checkpoint or a shutdown checkpoint. It's
                //   stored in the XLOG info field of the WAL record, shutdown checkpoints
                //   use record type XLOG_CHECKPOINT_SHUTDOWN and online checkpoints use
                //   XLOG_CHECKPOINT_ONLINE. We don't store the original WAL record headers
                //   in the pageserver, however.
                //
                // - In PostgreSQL, the Checkpoint.redo field stores the *start* of the
                //   checkpoint record, if it's a shutdown checkpoint. But when we are
                //   starting from a shutdown checkpoint, the basebackup LSN is the *end*
                //   of the shutdown checkpoint WAL record. That makes it difficult to
                //   correctly detect whether we're starting from a shutdown record or
                //   not.
                //
                // To address both of those issues, we store 0 in the redo field if it's
                // an online checkpoint record, and the record's *end* LSN if it's a
                // shutdown checkpoint. We don't need the original redo pointer in neon,
                // because we don't perform WAL replay at startup anyway, so we can get
                // away with abusing the redo field like this.
                //
                // XXX: Ideally, we would persist the extra information in a more
                // explicit format, rather than repurpose the fields of the Postgres
                // struct like this. However, we already have persisted data like this,
                // so we need to maintain backwards compatibility.
                //
                // NB: We didn't originally have this convention, so there are still old
                // persisted records that didn't do this. Before, we didn't update the
                // persisted redo field at all. That means that old records have a bogus
                // redo pointer that points to some old value, from the checkpoint record
                // that was originally imported from the data directory. If it was a
                // project created in Neon, that means it points to the first checkpoint
                // after initdb. That's OK for our purposes: all such old checkpoints are
                // treated as old online checkpoints when the basebackup is created.
                cp.redo = if info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN {
                    // Store the *end* LSN of the checkpoint record. Or to be precise,
                    // the start LSN of the *next* record, i.e. if the record ends
                    // exactly at page boundary, the redo LSN points to just after the
                    // page header on the next page.
                    lsn.into()
                } else {
                    Lsn::INVALID.into()
                };

                // Write a new checkpoint key-value pair on every checkpoint record, even
                // if nothing really changed. Not strictly required, but it seems nice to
                // have some trace of the checkpoint records in the layer files at the same
                // LSNs.
                self.checkpoint_modified = true;
            }
        });

        if info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN {
            modification.tline.prepare_basebackup(lsn);
        }

        Ok(())
    }

    /// Handle a logical message "put" by storing `buf` as a file at `path`.
    async fn ingest_logical_message_put(
        &mut self,
        put: PutLogicalMessage,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
    ) -> Result<(), WalIngestError> {
        let PutLogicalMessage { path, buf } = put;
        modification.put_file(path.as_str(), &buf, ctx).await
    }

    /// Handle a standby record: a running-xacts record updates
    /// `oldestActiveXid` in the tracked checkpoint.
    fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<(), WalIngestError> {
        match record {
            StandbyRecord::RunningXacts(running_xacts) => {
                enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
                    cp.oldestActiveXid = running_xacts.oldest_running_xid;
                });

                self.checkpoint_modified = true;
            }
        }

        Ok(())
    }

    /// Handle a replication-origin record: `Set` stores the remote LSN for
    /// the origin, `Drop` removes the origin.
    async fn ingest_replorigin_record(
        &mut self,
        record: ReploriginRecord,
        modification: &mut DatadirModification<'_>,
    ) -> Result<(), WalIngestError> {
        match record {
            ReploriginRecord::Set(set) => {
                modification
                    .set_replorigin(set.node_id, set.remote_lsn)
                    .await?;
            }
ReploriginRecord::Drop(drop) => {
                modification.drop_replorigin(drop.node_id).await?;
            }
        }

        Ok(())
    }

    /// Create relation `rel` with an initial size of 0 blocks.
    async fn put_rel_creation(
        &mut self,
        modification: &mut DatadirModification<'_>,
        rel: RelTag,
        ctx: &RequestContext,
    ) -> Result<(), WalIngestError> {
        modification.put_rel_creation(rel, 0, ctx).await?;
        Ok(())
    }

    /// Test-only helper: store a page image for `blknum`, extending the
    /// relation first if needed.
    #[cfg(test)]
    async fn put_rel_page_image(
        &mut self,
        modification: &mut DatadirModification<'_>,
        rel: RelTag,
        blknum: BlockNumber,
        img: Bytes,
        ctx: &RequestContext,
    ) -> Result<(), WalIngestError> {
        self.handle_rel_extend(modification, rel, blknum, ctx)
            .await?;
        modification.put_rel_page_image(rel, blknum, img)?;
        Ok(())
    }

    /// Store a WAL record for `blknum`, extending the relation first if needed.
    async fn put_rel_wal_record(
        &mut self,
        modification: &mut DatadirModification<'_>,
        rel: RelTag,
        blknum: BlockNumber,
        rec: NeonWalRecord,
        ctx: &RequestContext,
    ) -> Result<(), WalIngestError> {
        self.handle_rel_extend(modification, rel, blknum, ctx)
            .await?;
        modification.put_rel_wal_record(rel, blknum, rec)?;
        Ok(())
    }

    /// Truncate relation `rel` to `nblocks` blocks.
    async fn put_rel_truncation(
        &mut self,
        modification: &mut DatadirModification<'_>,
        rel: RelTag,
        nblocks: BlockNumber,
        ctx: &RequestContext,
    ) -> Result<(), WalIngestError> {
        modification.put_rel_truncation(rel, nblocks, ctx).await?;
        Ok(())
    }

    /// Make sure `rel` is at least `blknum + 1` blocks long.
    ///
    /// The relation is created if required. If it has to grow, the blocks
    /// between the old size and `blknum` that belong to this shard are filled
    /// with zero pages; the number of filled blocks is added to the
    /// `gap_blocks_zeroed_on_rel_extend` metric and logged (rate limited).
    async fn handle_rel_extend(
        &mut self,
        modification: &mut DatadirModification<'_>,
        rel: RelTag,
        blknum: BlockNumber,
        ctx: &RequestContext,
    ) -> Result<(), WalIngestError> {
        let new_nblocks = blknum + 1;
        // Check if the relation exists. We implicitly create relations on first
        // record.
        let old_nblocks = modification.create_relation_if_required(rel, ctx).await?;

        if new_nblocks > old_nblocks {
            //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
            modification.put_rel_extend(rel, new_nblocks, ctx).await?;

            // Template key; only the block number field is changed per gap block.
            let mut key = rel_block_to_key(rel, blknum);

            // fill the gap with zeros
            let mut gap_blocks_filled: u64 = 0;
            for gap_blknum in old_nblocks..blknum {
                key.field6 = gap_blknum;

                // Skip blocks that map to a different shard.
                if self.shard.get_shard_number(&key) != self.shard.number {
                    continue;
                }

                modification.put_rel_page_image_zero(rel, gap_blknum)?;
                gap_blocks_filled += 1;
            }

            WAL_INGEST
                .gap_blocks_zeroed_on_rel_extend
                .inc_by(gap_blocks_filled);

            // Log something when relation extends cause us to fill gaps
            // with zero pages. Logging is rate limited per pg version to
            // avoid skewing.
            if gap_blocks_filled > 0 {
                use std::sync::Mutex;

                use once_cell::sync::Lazy;
                use utils::rate_limit::RateLimit;

                // One lazily-initialized rate limiter per supported major
                // version (PG14..=PG17), each with a 30 second period.
                struct RateLimitPerPgVersion {
                    rate_limiters: [Lazy>; 4],
                }

                impl RateLimitPerPgVersion {
                    const fn new() -> Self {
                        Self {
                            rate_limiters: [const {
                                Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(30))))
                            }; 4],
                        }
                    }

                    // Returns None for versions outside PG14..=PG17.
                    const fn rate_limiter(
                        &self,
                        pg_version: PgMajorVersion,
                    ) -> Option<&Lazy>> {
                        const MIN_PG_VERSION: u32 = PgMajorVersion::PG14.major_version_num();
                        const MAX_PG_VERSION: u32 = PgMajorVersion::PG17.major_version_num();
                        let pg_version = pg_version.major_version_num();

                        if pg_version < MIN_PG_VERSION || pg_version > MAX_PG_VERSION {
                            return None;
                        }

                        Some(&self.rate_limiters[(pg_version - MIN_PG_VERSION) as usize])
                    }
                }

                static LOGGED: RateLimitPerPgVersion = RateLimitPerPgVersion::new();
                if let Some(rate_limiter) = LOGGED.rate_limiter(modification.tline.pg_version) {
                    // `try_lock`: if the limiter is currently locked elsewhere,
                    // the log line is simply skipped.
                    if let Ok(mut locked) = rate_limiter.try_lock() {
                        locked.call(|| {
                            info!(
                                lsn=%modification.get_lsn(),
                                pg_version=%modification.tline.pg_version,
                                rel=%rel,
                                "Filled {} gap blocks on rel extend to {} from {}",
                                gap_blocks_filled,
                                new_nblocks,
                                old_nblocks);
                        });
                    }
                }
            }
        }
        Ok(())
    }

    /// Store an SLRU page image, extending the segment first if needed.
    ///
    /// Does nothing on shards other than shard zero.
    async fn put_slru_page_image(
        &mut self,
        modification: &mut DatadirModification<'_>,
        kind: SlruKind,
        segno: u32,
        blknum: BlockNumber,
        img: Bytes,
        ctx: &RequestContext,
    ) -> Result<(), WalIngestError> {
        if !self.shard.is_shard_zero() {
            return Ok(());
        }

        self.handle_slru_extend(modification, kind, segno, blknum, ctx)
            .await?;
        modification.put_slru_page_image(kind, segno, blknum, img)?;
        Ok(())
    }

    /// Make sure SLRU segment `segno` of `kind` is at least `blknum + 1`
    /// blocks long, creating the segment if it does not exist and filling any
    /// gap below `blknum` with zero pages.
    async fn handle_slru_extend(
        &mut self,
        modification: &mut DatadirModification<'_>,
        kind: SlruKind,
        segno: u32,
        blknum: BlockNumber,
        ctx: &RequestContext,
    ) -> Result<(), WalIngestError> {
        // we don't use a cache for this like we do for relations. SLRUs are explicitly
        // extended with ZEROPAGE records, not with commit records, so it happens
        // a lot less frequently.

        let new_nblocks = blknum + 1;
        // Check if the SLRU segment exists. We implicitly create segments on first
        // record.
        // TODO: would be nice to be more explicit about it
        let old_nblocks = if !modification
            .tline
            .get_slru_segment_exists(kind, segno, Version::Modified(modification), ctx)
            .await?
        {
            // create it with 0 size initially, the logic below will extend it
            modification
                .put_slru_segment_creation(kind, segno, 0, ctx)
                .await?;
            0
        } else {
            modification
                .tline
                .get_slru_segment_size(kind, segno, Version::Modified(modification), ctx)
                .await?
        };

        if new_nblocks > old_nblocks {
            trace!(
                "extending SLRU {:?} seg {} from {} to {} blocks",
                kind, segno, old_nblocks, new_nblocks
            );
            modification.put_slru_extend(kind, segno, new_nblocks)?;

            // fill the gap with zeros
            for gap_blknum in old_nblocks..blknum {
                modification.put_slru_page_image_zero(kind, segno, gap_blknum)?;
            }
        }
        Ok(())
    }
}

/// Returns the size of the relation as of this modification, or None if the relation doesn't exist.
///
/// This is only accurate on shard 0. On other shards, it will return the size up to the highest
/// page number stored in the shard, or None if the shard does not have any pages for it.
async fn get_relsize(
    modification: &DatadirModification<'_>,
    rel: RelTag,
    ctx: &RequestContext,
) -> Result, PageReconstructError> {
    // Both lookups use `Version::Modified`, so they see the state as of this
    // modification.
    if !modification
        .tline
        .get_rel_exists(rel, Version::Modified(modification), ctx)
        .await?
    {
        return Ok(None);
    }
    modification
        .tline
        .get_rel_size(rel, Version::Modified(modification), ctx)
        .await
        .map(Some)
}

#[allow(clippy::bool_assert_comparison)]
#[cfg(test)]
mod tests {
    use anyhow::Result;
    use postgres_ffi::PgMajorVersion;
    use postgres_ffi::RELSEG_SIZE;

    use super::*;
    use crate::DEFAULT_PG_VERSION;
    use crate::tenant::harness::*;
    use crate::tenant::remote_timeline_client::{INITDB_PATH, remote_initdb_archive_path};
    use crate::tenant::storage_layer::IoConcurrency;

    /// Arbitrary relation tag, for testing.
    const TESTREL_A: RelTag = RelTag {
        spcnode: 0,
        dbnode: 111,
        relnode: 1000,
        forknum: 0,
    };

    /// Placeholder: currently checks nothing.
    fn assert_current_logical_size(_timeline: &Timeline, _lsn: Lsn) {
        // TODO
    }

    /// `ZERO_CHECKPOINT` must decode successfully for every supported major version.
    #[tokio::test]
    async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> {
        for i in PgMajorVersion::ALL {
            dispatch_pgversion!(i, {
                pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;
            });
        }

        Ok(())
    }

    /// Seed the timeline at LSN 0x10 with a zeroed checkpoint and a dummy
    /// relmap file, then build a `WalIngest` on top of it.
    async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result {
        let mut m = tline.begin_modification(Lsn(0x10));
        m.put_checkpoint(dispatch_pgversion!(
            tline.pg_version,
            pgv::ZERO_CHECKPOINT.clone()
        ))?;
        m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
        m.commit(ctx).await?;
        let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;

        Ok(walingest)
    }

    /// Relation size and page contents across creation, extension (with and
    /// without gaps) and truncation, read back at several LSNs.
    #[tokio::test]
    async fn test_relsize() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_relsize").await?.load().await;
        let io_concurrency = IoConcurrency::spawn_for_test();
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;

        let mut m = tline.begin_modification(Lsn(0x20));
        walingest.put_rel_creation(&mut m, TESTREL_A, &ctx).await?;
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
            .await?;
        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx)
            .await?;
        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x40));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx)
            .await?;
        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x50));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx)
            .await?;
        m.commit(&ctx).await?;

        assert_current_logical_size(&tline, Lsn(0x50));

        let test_span = tracing::info_span!(parent: None, "test",
                                            tenant_id=%tline.tenant_shard_id.tenant_id,
                                            shard_id=%tline.tenant_shard_id.shard_slug(),
                                            timeline_id=%tline.timeline_id);

        // The relation was created at LSN 0x20, not visible at LSN 0x10 yet.
        assert_eq!(
            tline
                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
                .await?,
            false
        );
        assert!(
            tline
                .get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
                .await
                .is_err()
        );
        assert_eq!(
            tline
                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            1
        );
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
                .await?,
            3
        );

        // Check page contents at each LSN
        assert_eq!(
            tline
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
                    Version::at(Lsn(0x20)),
                    &ctx,
                    io_concurrency.clone()
                )
                .instrument(test_span.clone())
                .await?,
            test_img("foo blk 0 at 2")
        );

        assert_eq!(
            tline
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
                    Version::at(Lsn(0x30)),
                    &ctx,
                    io_concurrency.clone()
                )
                .instrument(test_span.clone())
                .await?,
            test_img("foo blk 0 at 3")
        );

        assert_eq!(
            tline
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
                    Version::at(Lsn(0x40)),
                    &ctx,
                    io_concurrency.clone()
                )
                .instrument(test_span.clone())
                .await?,
            test_img("foo blk 0 at 3")
        );
        assert_eq!(
            tline
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1,
                    Version::at(Lsn(0x40)),
                    &ctx,
                    io_concurrency.clone()
                )
                .instrument(test_span.clone())
                .await?,
            test_img("foo blk 1 at 4")
        );

        assert_eq!(
            tline
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
                    Version::at(Lsn(0x50)),
                    &ctx,
                    io_concurrency.clone()
                )
                .instrument(test_span.clone())
                .await?,
            test_img("foo blk 0 at 3")
        );
        assert_eq!(
            tline
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1,
                    Version::at(Lsn(0x50)),
                    &ctx,
                    io_concurrency.clone()
                )
                .instrument(test_span.clone())
                .await?,
            test_img("foo blk 1 at 4")
        );
        assert_eq!(
            tline
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    2,
                    Version::at(Lsn(0x50)),
                    &ctx,
                    io_concurrency.clone()
                )
                .instrument(test_span.clone())
                .await?,
            test_img("foo blk 2 at 5")
        );

        // Truncate last block
        let mut m = tline.begin_modification(Lsn(0x60));
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 2, &ctx)
            .await?;
        m.commit(&ctx).await?;
        assert_current_logical_size(&tline, Lsn(0x60));

        // Check reported size and contents after truncation
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx)
                .await?,
            2
        );
        assert_eq!(
            tline
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
                    Version::at(Lsn(0x60)),
                    &ctx,
                    io_concurrency.clone()
                )
                .instrument(test_span.clone())
                .await?,
            test_img("foo blk 0 at 3")
        );
        assert_eq!(
            tline
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1,
                    Version::at(Lsn(0x60)),
                    &ctx,
                    io_concurrency.clone()
                )
                .instrument(test_span.clone())
                .await?,
            test_img("foo blk 1 at 4")
        );

        // should still see the truncated block with older LSN
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
                .await?,
            3
        );
        assert_eq!(
            tline
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    2,
                    Version::at(Lsn(0x50)),
                    &ctx,
                    io_concurrency.clone()
                )
                .instrument(test_span.clone())
                .await?,
            test_img("foo blk 2 at 5")
        );

        // Truncate to zero length
        let mut m = tline.begin_modification(Lsn(0x68));
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 0, &ctx)
            .await?;
        m.commit(&ctx).await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Version::at(Lsn(0x68)), &ctx)
                .await?,
            0
        );

        // Extend from 0 to 2 blocks, leaving a gap
        let mut m = tline.begin_modification(Lsn(0x70));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx)
            .await?;
        m.commit(&ctx).await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Version::at(Lsn(0x70)), &ctx)
                .await?,
            2
        );
        // The skipped block 0 must read back as a zero page.
        assert_eq!(
            tline
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
                    Version::at(Lsn(0x70)),
                    &ctx,
                    io_concurrency.clone()
                )
                .instrument(test_span.clone())
                .await?,
            ZERO_PAGE
        );
        assert_eq!(
            tline
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1,
                    Version::at(Lsn(0x70)),
                    &ctx,
                    io_concurrency.clone()
                )
                .instrument(test_span.clone())
                .await?,
            test_img("foo blk 1")
        );

        // Extend a lot more, leaving a big gap that spans across segments
        let mut m = tline.begin_modification(Lsn(0x80));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx)
            .await?;
        m.commit(&ctx).await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
                .await?,
            1501
        );
        for blk in 2..1500 {
            assert_eq!(
                tline
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blk,
                        Version::at(Lsn(0x80)),
                        &ctx,
                        io_concurrency.clone()
                    )
                    .instrument(test_span.clone())
                    .await?,
                ZERO_PAGE
            );
        }
        assert_eq!(
            tline
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1500,
                    Version::at(Lsn(0x80)),
                    &ctx,
                    io_concurrency.clone()
                )
                .instrument(test_span.clone())
                .await?,
            test_img("foo blk 1500")
        );

        Ok(())
    }

    // Test what happens if we dropped a relation
    // and then created it again within the same layer.
    #[tokio::test]
    async fn test_drop_extend() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_drop_extend")
            .await?
            .load()
            .await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;

        let mut m = tline.begin_modification(Lsn(0x20));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
            .await?;
        m.commit(&ctx).await?;

        // Check that rel exists and size is correct
        assert_eq!(
            tline
                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            1
        );

        // Drop rel
        let mut m = tline.begin_modification(Lsn(0x30));
        let mut rel_drops = HashMap::new();
        rel_drops.insert((TESTREL_A.spcnode, TESTREL_A.dbnode), vec![TESTREL_A]);
        m.put_rel_drops(rel_drops, &ctx).await?;
        m.commit(&ctx).await?;

        // Check that rel is not visible anymore
        assert_eq!(
            tline
                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x30)), &ctx)
                .await?,
            false
        );

        // FIXME: should fail
        //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30), false)?.is_none());

        // Re-create it
        let mut m = tline.begin_modification(Lsn(0x40));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 4"), &ctx)
            .await?;
        m.commit(&ctx).await?;

        // Check that rel exists and size is correct
        assert_eq!(
            tline
                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x40)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Version::at(Lsn(0x40)), &ctx)
                .await?,
            1
        );

        Ok(())
    }

    // Test what happens if we truncated a relation
    // so that one of its segments was dropped
    // and then extended it again within the same layer.
    #[tokio::test]
    async fn test_truncate_extend() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_truncate_extend")
            .await?
            .load()
            .await;
        let io_concurrency = IoConcurrency::spawn_for_test();
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;

        // Create a 20 MB relation (the size is arbitrary)
        let relsize = 20 * 1024 * 1024 / 8192;
        let mut m = tline.begin_modification(Lsn(0x20));
        for blkno in 0..relsize {
            let data = format!("foo blk {} at {}", blkno, Lsn(0x20));
            walingest
                .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx)
                .await?;
        }
        m.commit(&ctx).await?;

        let test_span = tracing::info_span!(parent: None, "test",
                                            tenant_id=%tline.tenant_shard_id.tenant_id,
                                            shard_id=%tline.tenant_shard_id.shard_slug(),
                                            timeline_id=%tline.timeline_id);

        // The relation was created at LSN 0x20, not visible at LSN 0x10 yet.
        assert_eq!(
            tline
                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
                .await?,
            false
        );
        assert!(
            tline
                .get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
                .await
                .is_err()
        );

        assert_eq!(
            tline
                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            relsize
        );

        // Check relation content
        for blkno in 0..relsize {
            let lsn = Lsn(0x20);
            let data = format!("foo blk {blkno} at {lsn}");
            assert_eq!(
                tline
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blkno,
                        Version::at(lsn),
                        &ctx,
                        io_concurrency.clone()
                    )
                    .instrument(test_span.clone())
                    .await?,
                test_img(&data)
            );
        }

        // Truncate relation so that second segment was dropped
        // - only leave one page
        let mut m = tline.begin_modification(Lsn(0x60));
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 1, &ctx)
            .await?;
        m.commit(&ctx).await?;

        // Check reported size and contents after truncation
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx)
                .await?,
            1
        );

        for blkno in 0..1 {
            let lsn = Lsn(0x20);
            let data = format!("foo blk {blkno} at {lsn}");
            assert_eq!(
                tline
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blkno,
                        Version::at(Lsn(0x60)),
                        &ctx,
                        io_concurrency.clone()
                    )
                    .instrument(test_span.clone())
                    .await?,
                test_img(&data)
            );
        }

        // should still see all blocks with older LSN
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
                .await?,
            relsize
        );
        for blkno in 0..relsize {
            let lsn = Lsn(0x20);
            let data = format!("foo blk {blkno} at {lsn}");
            assert_eq!(
                tline
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blkno,
                        Version::at(Lsn(0x50)),
                        &ctx,
                        io_concurrency.clone()
                    )
                    .instrument(test_span.clone())
                    .await?,
                test_img(&data)
            );
        }

        // Extend relation again.
        // Add enough blocks to create second segment
        let lsn = Lsn(0x80);
        let mut m = tline.begin_modification(lsn);
        for blkno in 0..relsize {
            let data = format!("foo blk {blkno} at {lsn}");
            walingest
                .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx)
                .await?;
        }
        m.commit(&ctx).await?;

        assert_eq!(
            tline
                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
                .await?,
            relsize
        );
        // Check relation content
        for blkno in 0..relsize {
            let lsn = Lsn(0x80);
            let data = format!("foo blk {blkno} at {lsn}");
            assert_eq!(
                tline
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blkno,
                        Version::at(Lsn(0x80)),
                        &ctx,
                        io_concurrency.clone()
                    )
                    .instrument(test_span.clone())
                    .await?,
                test_img(&data)
            );
        }

        Ok(())
    }

    /// Test get_relsize() and truncation with a file larger than 1 GB, so that it's
    /// split into multiple 1 GB segments in Postgres.
#[tokio::test] async fn test_large_rel() -> Result<()> { let (tenant, ctx) = TenantHarness::create("test_large_rel").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut lsn = 0x10; for blknum in 0..RELSEG_SIZE + 1 { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); let img = test_img(&format!("foo blk {} at {}", blknum, Lsn(lsn))); walingest .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx) .await?; m.commit(&ctx).await?; } assert_current_logical_size(&tline, Lsn(lsn)); assert_eq!( tline .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE + 1 ); // Truncate one block lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx) .await?; m.commit(&ctx).await?; assert_eq!( tline .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE ); assert_current_logical_size(&tline, Lsn(lsn)); // Truncate another block lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx) .await?; m.commit(&ctx).await?; assert_eq!( tline .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE - 1 ); assert_current_logical_size(&tline, Lsn(lsn)); // Truncate to 1500, and then truncate all the way down to 0, one block at a time // This tests the behavior at segment boundaries let mut size: i32 = 3000; while size >= 0 { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx) .await?; m.commit(&ctx).await?; assert_eq!( tline .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx) .await?, size as BlockNumber ); size -= 1; } assert_current_logical_size(&tline, Lsn(lsn)); Ok(()) } /// Replay a wal segment file taken directly from safekeepers. 
///
/// This test is useful for benchmarking since it allows us to profile only
/// the walingest code in a single-threaded executor, and iterate more quickly
/// without waiting for unrelated steps.
///
/// The test makes no assertions about the ingested data; it fails only if one
/// of the `unwrap()`/`expect()` calls below panics. It prints the number of
/// decoded bytes and the elapsed time.
#[tokio::test]
async fn test_ingest_real_wal() {
    use postgres_ffi::WAL_SEGMENT_SIZE;
    use postgres_ffi::waldecoder::WalStreamDecoder;

    use crate::tenant::harness::*;

    // Define test data path and constants.
    //
    // Steps to reconstruct the data, if needed:
    // 1. Run the pgbench python test
    // 2. Take the first wal segment file from safekeeper
    // 3. Compress it using `zstd --long input_file`
    // 4. Copy initdb.tar.zst from local_fs_remote_storage
    // 5. Grep sk logs for "restart decoder" to get startpoint
    // 6. Run just the decoder from this test to get the endpoint.
    //    It's the last LSN the decoder will output.
    let pg_version = PgMajorVersion::PG15; // The test data was generated by pg15
    let path = "test_data/sk_wal_segment_from_pgbench";
    let wal_segment_path = format!("{path}/000000010000000000000001.zst");
    let source_initdb_path = format!("{path}/{INITDB_PATH}");
    let startpoint = Lsn::from_hex("14AEC08").unwrap();
    // Kept for reference only (see step 6 above); the test does not read it.
    let _endpoint = Lsn::from_hex("1FFFF98").unwrap();

    let harness = TenantHarness::create("test_ingest_real_wal").await.unwrap();
    // Span attached to every `ingest_record` call below.
    let span = harness
        .span()
        .in_scope(|| info_span!("timeline_span", timeline_id=%TIMELINE_ID));
    let (tenant, ctx) = harness.load().await;

    // Copy the canned initdb archive to the path in the harness' remote
    // storage directory where the bootstrap below will look for it.
    let remote_initdb_path =
        remote_initdb_archive_path(&tenant.tenant_shard_id().tenant_id, &TIMELINE_ID);
    let initdb_path = harness.remote_fs_dir.join(remote_initdb_path.get_path());

    std::fs::create_dir_all(initdb_path.parent().unwrap())
        .expect("creating test dir should work");
    std::fs::copy(source_initdb_path, initdb_path).expect("copying the initdb.tar.zst works");

    // Bootstrap a real timeline. We can't use create_test_timeline because
    // it doesn't create a real checkpoint, and WalIngest::new tries to parse
    // the garbage data.
    let tline = tenant
        .bootstrap_timeline_test(TIMELINE_ID, pg_version, Some(TIMELINE_ID), &ctx)
        .await
        .unwrap();

    // We fully read and decompress this into memory before decoding
    // to get a more accurate perf profile of the decoder.
    let bytes = {
        use async_compression::tokio::bufread::ZstdDecoder;
        let file = tokio::fs::File::open(wal_segment_path).await.unwrap();
        let reader = tokio::io::BufReader::new(file);
        let decoder = ZstdDecoder::new(reader);
        let mut reader = tokio::io::BufReader::new(decoder);
        let mut buffer = Vec::new();
        tokio::io::copy_buf(&mut reader, &mut buffer).await.unwrap();
        buffer
    };

    // TODO start a profiler too
    // Everything from here to `started_at.elapsed()` is what gets timed.
    let started_at = std::time::Instant::now();

    // Initialize walingest
    // `xlogoff` is the offset of `startpoint` within the WAL segment; the
    // bytes before it are skipped.
    let xlogoff: usize = startpoint.segment_offset(WAL_SEGMENT_SIZE);
    let mut decoder = WalStreamDecoder::new(startpoint, pg_version);
    let mut walingest = WalIngest::new(tline.as_ref(), startpoint, &ctx)
        .await
        .unwrap();
    let mut modification = tline.begin_modification(startpoint);
    println!("decoding {} bytes", bytes.len() - xlogoff);

    // Decode and ingest wal. We process the wal in chunks because
    // that's what happens when we get bytes from safekeepers.
    for chunk in bytes[xlogoff..].chunks(50) {
        decoder.feed_bytes(chunk);
        // Drain every record that became complete with this chunk.
        while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
            // Interpret the record for this timeline's shard only and pull
            // that shard's entry out of the result.
            let interpreted = InterpretedWalRecord::from_bytes_filtered(
                recdata,
                &[*modification.tline.get_shard_identity()],
                lsn,
                modification.tline.pg_version,
            )
            .unwrap()
            .remove(modification.tline.get_shard_identity())
            .unwrap();

            walingest
                .ingest_record(interpreted, &mut modification, &ctx)
                .instrument(span.clone())
                .await
                .unwrap();
        }
        // Commit once per chunk, whether or not the chunk completed a record.
        modification.commit(&ctx).await.unwrap();
    }

    let duration = started_at.elapsed();
    println!("done in {duration:?}");
}
}

================================================
FILE: pageserver/src/walredo/apply_neon.rs
================================================
use anyhow::Context;
use byteorder::{ByteOrder, LittleEndian};
use bytes::BytesMut;
use pageserver_api::key::Key;
use pageserver_api::reltag::SlruKind;
use postgres_ffi::v14::nonrelfile_utils::{
    mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset,
    transaction_id_set_status,
};
use postgres_ffi::{BLCKSZ, pg_constants};
use postgres_ffi_types::forknum::VISIBILITYMAP_FORKNUM;
use tracing::*;
use utils::lsn::Lsn;
use wal_decoder::models::record::NeonWalRecord;

/// Can this record be applied by the neon redo functions in this module,
/// or do we need to pass it to the wal-redo postgres process?
pub(crate) fn can_apply_in_neon(rec: &NeonWalRecord) -> bool {
    // Currently, we don't have bespoke Rust code to replay any
    // Postgres WAL records. But everything else is handled in neon.
#[allow(clippy::match_like_matches_macro)]
    match rec {
        NeonWalRecord::Postgres {
            will_init: _,
            rec: _,
        } => false,
        _ => true,
    }
}

/// Apply one neon-specific WAL record to `page`, in place.
///
/// * `record` - the record to apply; must not be `NeonWalRecord::Postgres`.
/// * `lsn` - LSN of the record; only the `ClearVisibilityMapFlags` arm uses it,
///   to stamp the page LSN.
/// * `key` - key of the page being reconstructed; used to sanity-check that
///   the record really targets this page.
/// * `page` - current page image, modified in place.
///
/// # Errors
///
/// Returns an error if a `Postgres` record is passed, if `key` cannot be
/// decoded as the kind of block the record expects, or (testing builds only)
/// if a `Test` record's `only_if` precondition does not match the page.
///
/// # Panics
///
/// Panics if `key` decodes fine but points at a different relation fork,
/// SLRU kind, segment or block than the record's contents imply.
pub(crate) fn apply_in_neon(
    record: &NeonWalRecord,
    lsn: Lsn,
    key: Key,
    page: &mut BytesMut,
) -> Result<(), anyhow::Error> {
    match record {
        NeonWalRecord::Postgres {
            will_init: _,
            rec: _,
        } => {
            anyhow::bail!("tried to pass postgres wal record to neon WAL redo");
        }
        //
        // Code copied from PostgreSQL `visibilitymap_prepare_truncate` function in `visibilitymap.c`
        //
        NeonWalRecord::TruncateVisibilityMap {
            trunc_byte,
            trunc_offs,
        } => {
            // sanity check that this is modifying the correct relation
            let (rel, _) = key.to_rel_block().context("invalid record")?;
            assert!(
                rel.forknum == VISIBILITYMAP_FORKNUM,
                "TruncateVisibilityMap record on unexpected rel {rel}"
            );
            let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
            // Zero every byte after the last one that is (partially) kept.
            map[*trunc_byte + 1..].fill(0u8);
            /*----
             * Mask out the unwanted bits of the last remaining byte.
             *
             * ((1 << 0) - 1) = 00000000
             * ((1 << 1) - 1) = 00000001
             * ...
             * ((1 << 6) - 1) = 00111111
             * ((1 << 7) - 1) = 01111111
             *----
             */
            map[*trunc_byte] &= (1 << *trunc_offs) - 1;
        }
        NeonWalRecord::ClearVisibilityMapFlags {
            new_heap_blkno,
            old_heap_blkno,
            flags,
        } => {
            // sanity check that this is modifying the correct relation
            let (rel, blknum) = key.to_rel_block().context("invalid record")?;
            assert!(
                rel.forknum == VISIBILITYMAP_FORKNUM,
                "ClearVisibilityMapFlags record on unexpected rel {rel}"
            );
            // Handle 'new_heap_blkno' first and then 'old_heap_blkno'; either
            // may be absent, and both get exactly the same treatment.
            for heap_blkno in [*new_heap_blkno, *old_heap_blkno].into_iter().flatten() {
                // Calculate the VM block and offset that corresponds to the heap block.
                let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
                let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
                let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);

                // Check that we're modifying the correct VM block.
                assert!(map_block == blknum);

                // equivalent to PageGetContents(page)
                let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];

                map[map_byte as usize] &= !(flags << map_offset);
                // The page should never be empty, but we're checking it anyway as a precaution, so that if it is empty for some reason anyway, we don't make matters worse by setting the LSN on it.
                if !postgres_ffi::page_is_new(page) {
                    postgres_ffi::page_set_lsn(page, lsn);
                }
            }
        }
        // Non-relational WAL records are handled here, with custom code that has the
        // same effects as the corresponding Postgres WAL redo function.
        NeonWalRecord::ClogSetCommitted { xids, timestamp } => {
            let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
            assert_eq!(
                slru_kind,
                SlruKind::Clog,
                "ClogSetCommitted record with unexpected key {key}"
            );
            for &xid in xids {
                let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;

                // Check that we're modifying the correct CLOG block.
                assert!(
                    segno == expected_segno,
                    "ClogSetCommitted record for XID {xid} with unexpected key {key}"
                );
                assert!(
                    blknum == expected_blknum,
                    "ClogSetCommitted record for XID {xid} with unexpected key {key}"
                );

                transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_COMMITTED, page);
            }

            // Append the timestamp. If the page already carries an 8-byte
            // trailer, drop it first so the new timestamp replaces it.
            if page.len() == BLCKSZ as usize + 8 {
                page.truncate(BLCKSZ as usize);
            }
            if page.len() == BLCKSZ as usize {
                page.extend_from_slice(&timestamp.to_be_bytes());
            } else {
                warn!(
                    "CLOG blk {} in seg {} has invalid size {}",
                    blknum,
                    segno,
                    page.len()
                );
            }
        }
        NeonWalRecord::ClogSetAborted { xids } => {
            let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
            assert_eq!(
                slru_kind,
                SlruKind::Clog,
                "ClogSetAborted record with unexpected key {key}"
            );
            for &xid in xids {
                let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;

                // Check that we're modifying the correct CLOG block.
                assert!(
                    segno == expected_segno,
                    "ClogSetAborted record for XID {xid} with unexpected key {key}"
                );
                assert!(
                    blknum == expected_blknum,
                    "ClogSetAborted record for XID {xid} with unexpected key {key}"
                );

                transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page);
            }
        }
        NeonWalRecord::MultixactOffsetCreate { mid, moff } => {
            let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
            assert_eq!(
                slru_kind,
                SlruKind::MultiXactOffsets,
                "MultixactOffsetCreate record with unexpected key {key}"
            );
            // Compute the block and offset to modify.
            // See RecordNewMultiXact in PostgreSQL sources.
            let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
            let entryno = mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
            // Each entry is written as a 4-byte little-endian value.
            let offset = (entryno * 4) as usize;

            // Check that we're modifying the correct multixact-offsets block.
            let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
            let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
            assert!(
                segno == expected_segno,
                "MultiXactOffsetsCreate record for multi-xid {mid} with unexpected key {key}"
            );
            assert!(
                blknum == expected_blknum,
                "MultiXactOffsetsCreate record for multi-xid {mid} with unexpected key {key}"
            );

            LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
        }
        NeonWalRecord::MultixactMembersCreate { moff, members } => {
            let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
            assert_eq!(
                slru_kind,
                SlruKind::MultiXactMembers,
                "MultixactMembersCreate record with unexpected key {key}"
            );
            for (i, member) in members.iter().enumerate() {
                let offset = moff + i as u32;

                // Compute the block and offset to modify.
                // See RecordNewMultiXact in PostgreSQL sources.
                let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
                let memberoff = mx_offset_to_member_offset(offset);
                let flagsoff = mx_offset_to_flags_offset(offset);
                let bshift = mx_offset_to_flags_bitshift(offset);

                // Check that we're modifying the correct multixact-members block.
                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
                assert!(
                    segno == expected_segno,
                    "MultiXactMembersCreate record for offset {moff} with unexpected key {key}"
                );
                assert!(
                    blknum == expected_blknum,
                    "MultiXactMembersCreate record for offset {moff} with unexpected key {key}"
                );

                // Replace this member's status bits inside the shared flags
                // word, then store the member's xid.
                let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
                flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
                flagsval |= member.status << bshift;
                LittleEndian::write_u32(&mut page[flagsoff..flagsoff + 4], flagsval);
                LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid);
            }
        }
        NeonWalRecord::AuxFile { .. } => {
            // No-op: this record will never be created in aux v2.
            warn!("AuxFile record should not be created in aux v2");
        }
        #[cfg(feature = "testing")]
        NeonWalRecord::Test {
            append,
            clear,
            will_init,
            only_if,
        } => {
            use bytes::BufMut;
            if *will_init {
                assert!(*clear, "init record must be clear to ensure correctness");
                assert!(
                    page.is_empty(),
                    "init record must be the first entry to ensure correctness"
                );
            }
            if *clear {
                page.clear();
            }
            if let Some(only_if) = only_if {
                if page != only_if.as_bytes() {
                    return Err(anyhow::anyhow!(
                        "the current image does not match the expected image, cannot append"
                    ));
                }
            }
            page.put_slice(append.as_bytes());
        }
    }
    Ok(())
}

================================================
FILE: pageserver/src/walredo/process/no_leak_child.rs
================================================
use std::io;
use std::ops::{Deref, DerefMut};
use std::process::{Child, Command};

use pageserver_api::shard::TenantShardId;
use tracing::{error, info, instrument};

use crate::metrics::{WAL_REDO_PROCESS_COUNTERS, WalRedoKillCause};

/// Wrapper type around `std::process::Child` which guarantees that the child
/// will be killed and waited-for by this process before being dropped.
pub(crate) struct NoLeakChild { pub(crate) tenant_id: TenantShardId, pub(crate) child: Option, } impl Deref for NoLeakChild { type Target = Child; fn deref(&self) -> &Self::Target { self.child.as_ref().expect("must not use from drop") } } impl DerefMut for NoLeakChild { fn deref_mut(&mut self) -> &mut Self::Target { self.child.as_mut().expect("must not use from drop") } } impl NoLeakChild { pub(crate) fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result { let child = command.spawn()?; Ok(NoLeakChild { tenant_id, child: Some(child), }) } pub(crate) fn kill_and_wait(mut self, cause: WalRedoKillCause) { let child = match self.child.take() { Some(child) => child, None => return, }; Self::kill_and_wait_impl(child, cause); } #[instrument(skip_all, fields(pid=child.id(), ?cause))] pub(crate) fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) { scopeguard::defer! { WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc(); } let res = child.kill(); if let Err(e) = res { // This branch is very unlikely because: // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it. // - This is the only place that calls .kill() // - We consume `self`, so, .kill() can't be called twice. // - If the process exited by itself or was killed by someone else, // .kill() will still succeed because we haven't wait()'ed yet. // // So, if we arrive here, we have really no idea what happened, // whether the PID stored in self.child is still valid, etc. // If this function were fallible, we'd return an error, but // since it isn't, all we can do is log an error and proceed // with the wait(). 
error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process"); } match child.wait() { Ok(exit_status) => { info!(exit_status = %exit_status, "wait successful"); } Err(e) => { error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)"); } } } } impl Drop for NoLeakChild { fn drop(&mut self) { let child = match self.child.take() { Some(child) => child, None => return, }; let tenant_shard_id = self.tenant_id; // Offload the kill+wait of the child process into the background. // If someone stops the runtime, we'll leak the child process. // We can ignore that case because we only stop the runtime on pageserver exit. tokio::runtime::Handle::current().spawn(async move { tokio::task::spawn_blocking(move || { // Intentionally don't inherit the tracing context from whoever is dropping us. // This thread here is going to outlive of our dropper. let span = tracing::info_span!( "walredo", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug() ); let _entered = span.enter(); Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop); }) .await }); } } pub(crate) trait NoLeakChildCommandExt { fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result; } impl NoLeakChildCommandExt for Command { fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result { NoLeakChild::spawn(tenant_id, self) } } ================================================ FILE: pageserver/src/walredo/process/protocol.rs ================================================ use bytes::BufMut; use pageserver_api::reltag::RelTag; use serde::Serialize; use utils::bin_ser::BeSer; use utils::lsn::Lsn; /// /// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster. /// /// In Postgres `BufferTag` structure is used for exactly the same purpose. 
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91). /// #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)] pub(crate) struct BufferTag { pub rel: RelTag, pub blknum: u32, } pub(crate) fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec) { let len = 4 + 1 + 4 * 4; buf.put_u8(b'B'); buf.put_u32(len as u32); tag.ser_into(buf) .expect("serialize BufferTag should always succeed"); } pub(crate) fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec) { assert!(base_img.len() == 8192); let len = 4 + 1 + 4 * 4 + base_img.len(); buf.put_u8(b'P'); buf.put_u32(len as u32); tag.ser_into(buf) .expect("serialize BufferTag should always succeed"); buf.put(base_img); } pub(crate) fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec) { let len = 4 + 8 + rec.len(); buf.put_u8(b'A'); buf.put_u32(len as u32); buf.put_u64(endlsn.0); buf.put(rec); } pub(crate) fn build_get_page_msg(tag: BufferTag, buf: &mut Vec) { let len = 4 + 1 + 4 * 4; buf.put_u8(b'G'); buf.put_u32(len as u32); tag.ser_into(buf) .expect("serialize BufferTag should always succeed"); } pub(crate) fn build_ping_msg(buf: &mut Vec) { buf.put_u8(b'H'); buf.put_u32(4); } ================================================ FILE: pageserver/src/walredo/process.rs ================================================ mod no_leak_child; /// The IPC protocol that pageserver and walredo process speak over their shared pipe. 
mod protocol;

use std::collections::VecDeque;
use std::process::{Command, Stdio};
#[cfg(feature = "testing")]
use std::sync::atomic::AtomicUsize;
use std::time::Duration;

use anyhow::Context;
use bytes::Bytes;
use pageserver_api::reltag::RelTag;
use pageserver_api::shard::TenantShardId;
use postgres_ffi::{BLCKSZ, PgMajorVersion};
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tracing::{Instrument, debug, error, instrument};
use utils::lsn::Lsn;
use utils::poison::Poison;
use wal_decoder::models::record::NeonWalRecord;

use self::no_leak_child::NoLeakChild;
use crate::config::PageServerConf;
use crate::metrics::{WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER, WalRedoKillCause};
use crate::page_cache::PAGE_SZ;
use crate::span::debug_assert_current_span_has_tenant_id;

pub struct WalRedoProcess {
    #[allow(dead_code)]
    conf: &'static PageServerConf,
    #[cfg(feature = "testing")]
    tenant_shard_id: TenantShardId,
    // Some() on construction, only becomes None on Drop.
    child: Option<NoLeakChild>,
    // The pipes are guarded by separate mutexes so that one request can be
    // written while another request's response is being read.
    stdout: tokio::sync::Mutex<Poison<ProcessOutput>>,
    stdin: tokio::sync::Mutex<Poison<ProcessInput>>,
    /// Counter to separate same sized walredo inputs failing at the same millisecond.
    #[cfg(feature = "testing")]
    dump_sequence: AtomicUsize,
}

struct ProcessInput {
    stdin: tokio::process::ChildStdin,
    // Number of requests written so far; doubles as the next request number.
    n_requests: usize,
}

struct ProcessOutput {
    stdout: tokio::process::ChildStdout,
    // Responses already read from stdout but not yet claimed by their
    // requester. An entry becomes `None` once claimed.
    pending_responses: VecDeque<Option<Bytes>>,
    // Number of responses already popped off the front of `pending_responses`.
    n_processed_responses: usize,
}

impl WalRedoProcess {
    //
    // Start postgres binary in special WAL redo mode.
    //
    // Spawns `postgres --wal-redo` with a cleared environment and piped
    // stdin/stdout/stderr, and starts a background task that logs the child's
    // stderr. Fails if the bin/lib directories cannot be resolved, if the
    // spawn fails, or if a pipe cannot be converted to its tokio counterpart;
    // in the last case the already-spawned child is killed and waited for.
    //
    #[instrument(skip_all,fields(pg_version=pg_version.major_version_num()))]
    pub(crate) fn launch(
        conf: &'static PageServerConf,
        tenant_shard_id: TenantShardId,
        pg_version: PgMajorVersion,
    ) -> anyhow::Result<Self> {
        crate::span::debug_assert_current_span_has_tenant_id();

        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;

        use no_leak_child::NoLeakChildCommandExt;
        // Start postgres itself
        let child = Command::new(pg_bin_dir_path.join("postgres"))
            // the first arg must be --wal-redo so the child process enters into walredo mode
            .arg("--wal-redo")
            // the child doesn't process this arg, but, having it in the argv helps identify the
            // walredo process for a particular tenant when debugging a pageserver
            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
            .stdin(Stdio::piped())
            .stderr(Stdio::piped())
            .stdout(Stdio::piped())
            .env_clear()
            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
            .env(
                "ASAN_OPTIONS",
                std::env::var("ASAN_OPTIONS").unwrap_or_default(),
            )
            .env(
                "UBSAN_OPTIONS",
                std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
            )
            // NB: The redo process is not trusted after we sent it the first
            //     walredo work. Before that, it is trusted. Specifically, we trust
            //     it to
            //     1. close all file descriptors except stdin, stdout, stderr because
            //        pageserver might not be 100% diligent in setting FD_CLOEXEC on all
            //        the files it opens, and
            //     2. to use seccomp to sandbox itself before processing the first
            //        walredo request.
            .spawn_no_leak_child(tenant_shard_id)
            .context("spawn process")?;
        WAL_REDO_PROCESS_COUNTERS.started.inc();
        // Until the pipes are converted below, an early return must kill the
        // child synchronously with cause `Startup`.
        let mut child = scopeguard::guard(child, |child| {
            error!("killing wal-redo-postgres process due to a problem during launch");
            child.kill_and_wait(WalRedoKillCause::Startup);
        });

        let stdin = child.stdin.take().unwrap();
        let stdout = child.stdout.take().unwrap();
        let stderr = child.stderr.take().unwrap();
        let stderr = tokio::process::ChildStderr::from_std(stderr)
            .context("convert to tokio::ChildStderr")?;
        let stdin =
            tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
        let stdout = tokio::process::ChildStdout::from_std(stdout)
            .context("convert to tokio::ChildStdout")?;

        // all fallible operations post-spawn are complete, so get rid of the guard
        let child = scopeguard::ScopeGuard::into_inner(child);

        // Background task: forward the child's stderr to our log until EOF or
        // a read error. Lines containing "LOG:" are dropped; everything else
        // is logged at error level.
        tokio::spawn(
            async move {
                scopeguard::defer! {
                    debug!("wal-redo-postgres stderr_logger_task finished");
                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
                }
                debug!("wal-redo-postgres stderr_logger_task started");
                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();

                use tokio::io::AsyncBufReadExt;
                let mut stderr_lines = tokio::io::BufReader::new(stderr);
                let mut buf = Vec::new();
                let res = loop {
                    buf.clear();
                    // TODO we don't trust the process to cap its stderr length.
                    // Currently it can do unbounded Vec allocation.
                    match stderr_lines.read_until(b'\n', &mut buf).await {
                        Ok(0) => break Ok(()), // eof
                        Ok(num_bytes) => {
                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
                            if !output.contains("LOG:") {
                                error!(%output, "received output");
                            }
                        }
                        Err(e) => {
                            break Err(e);
                        }
                    }
                };
                match res {
                    Ok(()) => (),
                    Err(e) => {
                        error!(error=?e, "failed to read from walredo stderr");
                    }
                }
            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
        );

        Ok(Self {
            conf,
            #[cfg(feature = "testing")]
            tenant_shard_id,
            child: Some(child),
            stdin: tokio::sync::Mutex::new(Poison::new(
                "stdin",
                ProcessInput {
                    stdin,
                    n_requests: 0,
                },
            )),
            stdout: tokio::sync::Mutex::new(Poison::new(
                "stdout",
                ProcessOutput {
                    stdout,
                    pending_responses: VecDeque::new(),
                    n_processed_responses: 0,
                },
            )),
            #[cfg(feature = "testing")]
            dump_sequence: AtomicUsize::default(),
        })
    }

    /// OS process id of the walredo child process.
    pub(crate) fn id(&self) -> u32 {
        self.child
            .as_ref()
            .expect("must not call this during Drop")
            .id()
    }

    /// Apply given WAL records ('records') over an old page image. Returns
    /// new page image.
    ///
    /// Fails if any record is not a `NeonWalRecord::Postgres` record, if the
    /// round trip does not finish within `wal_redo_timeout`, or if talking to
    /// the child process fails.
    ///
    /// # Cancel-Safety
    ///
    /// Cancellation safe.
    #[instrument(skip_all, fields(pid=%self.id()))]
    pub(crate) async fn apply_wal_records(
        &self,
        rel: RelTag,
        blknum: u32,
        base_img: &Option<Bytes>,
        records: &[(Lsn, NeonWalRecord)],
        wal_redo_timeout: Duration,
    ) -> anyhow::Result<Bytes> {
        debug_assert_current_span_has_tenant_id();

        let tag = protocol::BufferTag { rel, blknum };

        // Serialize all the messages to send the WAL redo process first.
        //
        // This could be problematic if there are millions of records to replay,
        // but in practice the number of records is usually so small that it doesn't
        // matter, and it's better to keep this code simple.
        //
        // Most requests start with a before-image with BLCKSZ bytes, followed
        // by some other WAL records. Start with a buffer that can hold that
        // comfortably.
let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); if let Some(img) = base_img { protocol::build_push_page_msg(tag, img, &mut writebuf); } for (lsn, rec) in records.iter() { if let NeonWalRecord::Postgres { will_init: _, rec: postgres_rec, } = rec { protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); } else { anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); } } protocol::build_get_page_msg(tag, &mut writebuf); WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); let Ok(res) = tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await else { anyhow::bail!("WAL redo timed out"); }; if res.is_err() { // not all of these can be caused by this particular input, however these are so rare // in tests so capture all. self.record_and_log(&writebuf); } res } /// Do a ping request-response roundtrip. /// /// Not used in production, but by Rust benchmarks. pub(crate) async fn ping(&self, timeout: Duration) -> anyhow::Result<()> { let mut writebuf: Vec = Vec::with_capacity(4); protocol::build_ping_msg(&mut writebuf); let Ok(res) = tokio::time::timeout(timeout, self.apply_wal_records0(&writebuf)).await else { anyhow::bail!("WAL redo ping timed out"); }; let response = res?; if response.len() != PAGE_SZ { anyhow::bail!( "WAL redo ping response should respond with page-sized response: {}", response.len() ); } Ok(()) } /// # Cancel-Safety /// /// When not polled to completion (e.g. because in `tokio::select!` another /// branch becomes ready before this future), concurrent and subsequent /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls. /// Dispose of this process instance and create a new one. 
async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
        // Phase 1: write the whole request under the stdin mutex and remember
        // which request number it got.
        let request_no = {
            let mut lock_guard = self.stdin.lock().await;
            let mut poison_guard = lock_guard.check_and_arm()?;
            let input = poison_guard.data_mut();
            input
                .stdin
                .write_all(writebuf)
                .await
                .context("write to walredo stdin")?;
            let request_no = input.n_requests;
            input.n_requests += 1;
            poison_guard.disarm();
            request_no
        };

        // To improve walredo performance we separate sending requests and receiving
        // responses. They are protected by different mutexes (input and output).
        // If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process,
        // there is no guarantee that T1 will be the first one granted the output mutex.
        // To address this we maintain the number of sent requests, the number of
        // processed responses and a ring buffer with pending responses. After sending
        // its request (under the input mutex), a thread remembers its request number.
        // Then it releases the input mutex, locks the output mutex and reads responses
        // into the ring buffer until the one for its request number has arrived.
        // Then it takes the corresponding element from the pending responses ring
        // buffer and removes all empty elements from the front, advancing the
        // processed responses number.
        let mut lock_guard = self.stdout.lock().await;
        let mut poison_guard = lock_guard.check_and_arm()?;
        let output = poison_guard.data_mut();
        // Snapshot taken before reading; the counter is only advanced by the
        // cleanup loop at the end of this function.
        let n_processed_responses = output.n_processed_responses;
        while n_processed_responses + output.pending_responses.len() <= request_no {
            // We expect the WAL redo process to respond with an 8k page image. We read it
            // into this buffer.
            let mut resultbuf = vec![0; BLCKSZ.into()];
            output
                .stdout
                .read_exact(&mut resultbuf)
                .await
                .context("read walredo stdout")?;
            output
                .pending_responses
                .push_back(Some(Bytes::from(resultbuf)));
        }
        // Replace our request's response with None in `pending_responses`.
        // Then make space in the ring buffer by clearing out any sequence of contiguous
        // `None`'s from the front of `pending_responses`.
        // NB: We can't simply pop_front() our response, because the front may hold
        // another request's response: another requester might have grabbed the
        // output mutex before us:
        // T1: grab input mutex
        // T1: send request_no 23
        // T1: release input mutex
        // T2: grab input mutex
        // T2: send request_no 24
        // T2: release input mutex
        // T2: grab output mutex
        // T2: n_processed_responses + output.pending_responses.len() <= request_no
        //            23                                0                   24
        // T2: enters poll loop that reads stdout
        // T2: put response for 23 into pending_responses
        // T2: put response for 24 into pending_responses
        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
        // T2: takes its response_24
        // pending_responses now looks like this: Front Some(response_23) None Back
        // T2: does the while loop below
        // pending_responses now looks like this: Front Some(response_23) None Back
        // T2: releases output mutex
        // T1: grabs output mutex
        // T1: n_processed_responses + output.pending_responses.len() > request_no
        //            23                                2                   23
        // T1: skips poll loop that reads stdout
        // T1: takes its response_23
        // pending_responses now looks like this: Front None None Back
        // T1: does the while loop below
        // pending_responses now looks like this: Front Back
        // n_processed_responses now has value 25
        let res = output.pending_responses[request_no - n_processed_responses]
            .take()
            .expect("we own this request_no, nobody else is supposed to take it");
        while let Some(front) = output.pending_responses.front() {
            if front.is_none() {
                output.pending_responses.pop_front();
                output.n_processed_responses += 1;
            } else {
                break;
            }
        }
        poison_guard.disarm();
        Ok(res)
    }

    #[cfg(feature = "testing")]
    fn record_and_log(&self, writebuf: &[u8]) {
        use std::sync::atomic::Ordering;

        let millis = std::time::SystemTime::now()
            .duration_since(std::time::SystemTime::UNIX_EPOCH)
            .unwrap()
            .as_millis();

        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);

        // these files will be collected to an allure report
        let filename =
format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); use std::io::Write; let res = std::fs::OpenOptions::new() .write(true) .create_new(true) .read(true) .open(path) .and_then(|mut f| f.write_all(writebuf)); // trip up allowed_errors if let Err(e) = res { tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); } else { tracing::error!(filename, "erroring walredo input saved"); } } #[cfg(not(feature = "testing"))] fn record_and_log(&self, _: &[u8]) {} } impl Drop for WalRedoProcess { fn drop(&mut self) { self.child .take() .expect("we only do this once") .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); // no way to wait for stderr_logger_task from Drop because that is async only } } ================================================ FILE: pageserver/src/walredo.rs ================================================ //! //! WAL redo. This service runs PostgreSQL in a special wal_redo mode //! to apply given WAL records over an old page image and return new //! page image. //! //! We rely on Postgres to perform WAL redo for us. We launch a //! postgres process in special "wal redo" mode that's similar to //! single-user mode. We then pass the previous page image, if any, //! and all the WAL records we want to apply, to the postgres //! process. Then we get the page image back. Communication with the //! postgres process happens via stdin/stdout //! //! See pgxn/neon_walredo/walredoproc.c for the other side of //! this communication. //! //! The Postgres process is assumed to be secure against malicious WAL //! records. It achieves it by dropping privileges before replaying //! any WAL records, so that even if an attacker hijacks the Postgres //! process, he cannot escape out of it. /// Process lifecycle and abstracction for the IPC protocol. mod process; /// Code to apply [`NeonWalRecord`]s. 
pub(crate) mod apply_neon; use std::future::Future; use std::sync::Arc; use std::time::{Duration, Instant}; use anyhow::Context; use bytes::{Bytes, BytesMut}; use pageserver_api::key::Key; use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::shard::TenantShardId; use postgres_ffi::PgMajorVersion; use tracing::*; use utils::lsn::Lsn; use utils::sync::gate::GateError; use utils::sync::heavier_once_cell; use wal_decoder::models::record::NeonWalRecord; use crate::config::PageServerConf; use crate::metrics::{ WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, }; /// The real implementation that uses a Postgres process to /// perform WAL replay. /// /// Only one thread can use the process at a time, that is controlled by the /// Mutex. In the future, we might want to launch a pool of processes to allow /// concurrent replay of multiple records. pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, /// We use [`heavier_once_cell`] for /// /// 1. coalescing the lazy spawning of walredo processes ([`ProcessOnceCell::Spawned`]) /// 2. prevent new processes from being spawned on [`Self::shutdown`] (=> [`ProcessOnceCell::ManagerShutDown`]). /// /// # Spawning /// /// Redo requests use the once cell to coalesce onto one call to [`process::WalRedoProcess::launch`]. /// /// Notably, requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the /// their process object; we use [`Arc::clone`] for that. /// /// This is primarily because earlier implementations that didn't use [`heavier_once_cell`] /// had that behavior; it's probably unnecessary. /// The only merit of it is that if one walredo process encounters an error, /// it can take it out of rotation (= using [`heavier_once_cell::Guard::take_and_deinit`]. 
/// and retry redo, thereby starting the new process, while other redo tasks might /// still be using the old redo process. But, those other tasks will most likely /// encounter an error as well, and errors are an unexpected condition anyway. /// So, probably we could get rid of the `Arc` in the future. /// /// # Shutdown /// /// See [`Self::launched_processes`]. redo_process: heavier_once_cell::OnceCell, /// Gate that is entered when launching a walredo process and held open /// until the process has been `kill()`ed and `wait()`ed upon. /// /// Manager shutdown waits for this gate to close after setting the /// [`ProcessOnceCell::ManagerShutDown`] state in [`Self::redo_process`]. /// /// This type of usage is a bit unusual because gates usually keep track of /// concurrent operations, e.g., every [`Self::request_redo`] that is inflight. /// But we use it here to keep track of the _processes_ that we have launched, /// which may outlive any individual redo request because /// - we keep walredo process around until its quiesced to amortize spawn cost and /// - the Arc may be held by multiple concurrent redo requests, so, just because /// you replace the [`Self::redo_process`] cell's content doesn't mean the /// process gets killed immediately. /// /// We could simplify this by getting rid of the [`Arc`]. /// See the comment on [`Self::redo_process`] for more details. launched_processes: utils::sync::gate::Gate, } /// See [`PostgresRedoManager::redo_process`]. enum ProcessOnceCell { Spawned(Arc), ManagerShutDown, } struct Process { process: process::WalRedoProcess, /// This field is last in this struct so the guard gets dropped _after_ [`Self::process`]. /// (Reminder: dropping [`Self::process`] synchronously sends SIGKILL and then `wait()`s for it to exit). 
_launched_processes_guard: utils::sync::gate::GateGuard, } impl std::ops::Deref for Process { type Target = process::WalRedoProcess; fn deref(&self) -> &Self::Target { &self.process } } #[derive(Debug, thiserror::Error)] pub enum Error { #[error("cancelled")] Cancelled, #[error(transparent)] Other(#[from] anyhow::Error), } macro_rules! bail { ($($arg:tt)*) => { return Err($crate::walredo::Error::Other(::anyhow::anyhow!($($arg)*))); } } #[derive(Debug, Clone, Copy)] pub enum RedoAttemptType { /// Used for the read path. Will fire critical errors and retry twice if failure. ReadPage, // Used for legacy compaction (only used in image compaction). Will fire critical errors and retry once if failure. LegacyCompaction, // Used for gc compaction. Will not fire critical errors and not retry. GcCompaction, } impl std::fmt::Display for RedoAttemptType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { RedoAttemptType::ReadPage => write!(f, "read page"), RedoAttemptType::LegacyCompaction => write!(f, "legacy compaction"), RedoAttemptType::GcCompaction => write!(f, "gc compaction"), } } } /// /// Public interface of WAL redo manager /// impl PostgresRedoManager { /// /// Request the WAL redo manager to apply some WAL records /// /// The WAL redo is handled by a separate thread, so this just sends a request /// to the thread and waits for response. /// /// # Cancel-Safety /// /// This method is cancellation-safe. 
pub async fn request_redo( &self, key: Key, lsn: Lsn, base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: PgMajorVersion, redo_attempt_type: RedoAttemptType, ) -> Result { if records.is_empty() { bail!("invalid WAL redo request with no records"); } let max_retry_attempts = match redo_attempt_type { RedoAttemptType::ReadPage => 2, RedoAttemptType::LegacyCompaction => 1, RedoAttemptType::GcCompaction => 0, }; let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID); let mut img = base_img.map(|p| p.1); let mut batch_neon = apply_neon::can_apply_in_neon(&records[0].1); let mut batch_start = 0; for (i, record) in records.iter().enumerate().skip(1) { let rec_neon = apply_neon::can_apply_in_neon(&record.1); if rec_neon != batch_neon { let result = if batch_neon { self.apply_batch_neon(key, lsn, img, &records[batch_start..i]) } else { self.apply_batch_postgres( key, lsn, img, base_img_lsn, &records[batch_start..i], self.conf.wal_redo_timeout, pg_version, max_retry_attempts, redo_attempt_type, ) .await }; img = Some(result?); batch_neon = rec_neon; batch_start = i; } } // last batch if batch_neon { self.apply_batch_neon(key, lsn, img, &records[batch_start..]) } else { self.apply_batch_postgres( key, lsn, img, base_img_lsn, &records[batch_start..], self.conf.wal_redo_timeout, pg_version, max_retry_attempts, redo_attempt_type, ) .await } } /// Do a ping request-response roundtrip. /// /// Not used in production, but by Rust benchmarks. /// /// # Cancel-Safety /// /// This method is cancellation-safe. 
pub async fn ping(&self, pg_version: PgMajorVersion) -> Result<(), Error> { self.do_with_walredo_process(pg_version, |proc| async move { proc.ping(Duration::from_secs(1)) .await .map_err(Error::Other) }) .await } pub fn status(&self) -> WalRedoManagerStatus { WalRedoManagerStatus { last_redo_at: { let at = *self.last_redo_at.lock().unwrap(); at.and_then(|at| { let age = at.elapsed(); // map any chrono errors silently to None here chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) }) }, process: self.redo_process.get().and_then(|p| match &*p { ProcessOnceCell::Spawned(p) => Some(WalRedoManagerProcessStatus { pid: p.id() }), ProcessOnceCell::ManagerShutDown => None, }), } } } impl PostgresRedoManager { /// /// Create a new PostgresRedoManager. /// pub fn new( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, ) -> PostgresRedoManager { // The actual process is launched lazily, on first request. PostgresRedoManager { tenant_shard_id, conf, last_redo_at: std::sync::Mutex::default(), redo_process: heavier_once_cell::OnceCell::default(), launched_processes: utils::sync::gate::Gate::default(), } } /// Shut down the WAL redo manager. /// /// Returns `true` if this call was the one that initiated shutdown. /// `true` may be observed by no caller if the first caller stops polling. /// /// After this future completes /// - no redo process is running /// - no new redo process will be spawned /// - redo requests that need walredo process will fail with [`Error::Cancelled`] /// - [`apply_neon`]-only redo requests may still work, but this may change in the future /// /// # Cancel-Safety /// /// This method is cancellation-safe. 
pub async fn shutdown(&self) -> bool {
    // prevent new processes from being spawned
    let maybe_permit = match self.redo_process.get_or_init_detached().await {
        Ok(guard) => {
            if matches!(&*guard, ProcessOnceCell::ManagerShutDown) {
                None
            } else {
                let (proc, permit) = guard.take_and_deinit();
                drop(proc); // this just drops the Arc, its refcount may not be zero yet
                Some(permit)
            }
        }
        Err(permit) => Some(permit),
    };
    let it_was_us = if let Some(permit) = maybe_permit {
        self.redo_process
            .set(ProcessOnceCell::ManagerShutDown, permit);
        true
    } else {
        false
    };
    // wait for ongoing requests to drain and the refcounts of all Arc<Process> that
    // we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s
    // for the underlying process.
    self.launched_processes.close().await;
    it_was_us
}

/// This type doesn't have its own background task to check for idleness: we
/// rely on our owner calling this function periodically in its own housekeeping
/// loops.
///
/// If the last Postgres redo was started at least `idle_timeout` ago, the
/// spawned process (if any) is taken out of [`Self::redo_process`]. Does
/// nothing if `last_redo_at` is currently locked, if no redo has happened yet,
/// or if the manager has been shut down.
pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
    let Ok(last_redo_at) = self.last_redo_at.try_lock() else {
        return;
    };
    let is_idle = matches!(*last_redo_at, Some(at) if at.elapsed() >= idle_timeout);
    drop(last_redo_at);
    if !is_idle {
        return;
    }
    if let Some(guard) = self.redo_process.get() {
        // Only take a spawned process out of rotation. `ManagerShutDown` must stay
        // in the cell: removing it would let a later request take the spawn path
        // after `shutdown()` has closed `launched_processes`.
        if matches!(&*guard, ProcessOnceCell::Spawned(_)) {
            drop(guard.take_and_deinit());
        }
    }
}

/// Run `closure` with the current walredo process, launching one first if
/// the once cell is empty. If `closure` returns an error, the process it ran
/// against is taken out of the once cell (unless it was already replaced).
///
/// Returns [`Error::Cancelled`] if the manager has been shut down.
///
/// # Cancel-Safety
///
/// This method is cancel-safe iff `closure` is cancel-safe.
async fn do_with_walredo_process<
    F: FnOnce(Arc<Process>) -> Fut,
    Fut: Future<Output = Result<O, Error>>,
    O,
>(
    &self,
    pg_version: PgMajorVersion,
    closure: F,
) -> Result<O, Error> {
    let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
        Ok(guard) => match &*guard {
            ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
            ProcessOnceCell::ManagerShutDown => {
                return Err(Error::Cancelled);
            }
        },
        Err(permit) => {
            let start = Instant::now();
            // acquire guard before spawning process, so that we don't spawn new processes
            // if the gate is already closed.
            let _launched_processes_guard = match self.launched_processes.enter() {
                Ok(guard) => guard,
                Err(GateError::GateClosed) => unreachable!(
                    "shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
                ),
            };
            let proc = Arc::new(Process {
                process: process::WalRedoProcess::launch(
                    self.conf,
                    self.tenant_shard_id,
                    pg_version,
                )
                .context("launch walredo process")?,
                _launched_processes_guard,
            });
            let duration = start.elapsed();
            WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
            info!(
                elapsed_ms = duration.as_millis(),
                pid = proc.id(),
                "launched walredo process"
            );
            self.redo_process
                .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
            proc
        }
    };

    // async closures are unstable, would support &Process
    let result = closure(proc.clone()).await;

    if result.is_err() {
        // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
        // Note that there may be other tasks concurrent with us that also hold `proc`.
        // We have to deal with that here.
        // Also read the doc comment on field `self.redo_process`.
        //
        // NB: there may still be other concurrent threads using `proc`.
        // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
        //
        // NB: the drop impl blocks the dropping thread with a wait() system call for
        // the child process. In some ways the blocking is actually good: if we
        // deferred the waiting into the background / to tokio if we used `tokio::process`,
        // it could happen that if walredo always fails immediately, we spawn processes faster
        // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
        // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
        // This probably needs revisiting at some later point.
        match self.redo_process.get() {
            None => (),
            Some(guard) => {
                match &*guard {
                    ProcessOnceCell::ManagerShutDown => {}
                    ProcessOnceCell::Spawned(guard_proc) => {
                        if Arc::ptr_eq(&proc, guard_proc) {
                            // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
                            guard.take_and_deinit();
                        } else {
                            // Another task already spawned another redo process (further up in this method)
                            // and put it into `redo_process`. Do nothing, our view of the world is behind.
                        }
                    }
                }
            }
        }
        // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
        drop(proc);
    }

    result
}

///
/// Process one request for WAL redo using wal-redo postgres
///
/// The batch is attempted up to `max_retry_attempts + 1` times; the result
/// of the last attempt is returned.
///
/// # Cancel-Safety
///
/// Cancellation safe.
#[allow(clippy::too_many_arguments)]
async fn apply_batch_postgres(
    &self,
    key: Key,
    lsn: Lsn,
    base_img: Option<Bytes>,
    base_img_lsn: Lsn,
    records: &[(Lsn, NeonWalRecord)],
    wal_redo_timeout: Duration,
    pg_version: PgMajorVersion,
    max_retry_attempts: u32,
    redo_attempt_type: RedoAttemptType,
) -> Result<Bytes, Error> {
    *(self.last_redo_at.lock().unwrap()) = Some(Instant::now());

    let (rel, blknum) = key.to_rel_block().context("invalid record")?;
    let mut n_attempts = 0u32;
    loop {
        // Borrow so that the `async move` closure of every attempt can use the same image.
        let base_img = &base_img;
        let closure = |proc: Arc<Process>| async move {
            let started_at = std::time::Instant::now();

            // Relational WAL records are applied using wal-redo-postgres
            let result = proc
                .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
                .await
                .context("apply_wal_records");

            let duration = started_at.elapsed();

            let len = records.len();
            let nbytes = records.iter().fold(0, |accumulator, record| {
                accumulator
                    + match &record.1 {
                        NeonWalRecord::Postgres { rec, .. } => rec.len(),
                        _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
                    }
            });

            WAL_REDO_TIME.observe(duration.as_secs_f64());
            WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
            WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);

            debug!(
                "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
                len,
                nbytes,
                duration.as_micros(),
                lsn
            );

            if let Err(e) = result.as_ref() {
                // Same message at a level chosen by the kind of redo attempt.
                macro_rules! message {
                    ($level:tt) => {
                        $level!(
                            "error applying {} WAL records {}..{} ({} bytes) to key {} during {}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
                            records.len(),
                            records.first().map(|p| p.0).unwrap_or(Lsn(0)),
                            records.last().map(|p| p.0).unwrap_or(Lsn(0)),
                            nbytes,
                            key,
                            redo_attempt_type,
                            base_img_lsn,
                            lsn,
                            n_attempts,
                            e,
                        )
                    }
                }
                match redo_attempt_type {
                    RedoAttemptType::ReadPage => message!(error),
                    RedoAttemptType::LegacyCompaction => message!(error),
                    RedoAttemptType::GcCompaction => message!(warn),
                }
            }

            result.map_err(Error::Other)
        };
        let result = self.do_with_walredo_process(pg_version, closure).await;

        if result.is_ok() && n_attempts != 0 {
            info!(n_attempts, "retried walredo succeeded");
        }
        n_attempts += 1;
        if n_attempts > max_retry_attempts || result.is_ok() {
            return result;
        }
    }
}

///
/// Process a batch of WAL records using bespoke Neon code.
///
/// Requires a base image; returns an error if `base_img` is `None`.
///
fn apply_batch_neon(
    &self,
    key: Key,
    lsn: Lsn,
    base_img: Option<Bytes>,
    records: &[(Lsn, NeonWalRecord)],
) -> Result<Bytes, Error> {
    let start_time = Instant::now();

    let mut page = BytesMut::new();
    if let Some(fpi) = base_img {
        // If full-page image is provided, then use it...
        page.extend_from_slice(&fpi[..]);
    } else {
        // All the current WAL record types that we can handle require a base image.
        bail!("invalid neon WAL redo request with no base image");
    }

    // Apply all the WAL records in the batch
    for (record_lsn, record) in records.iter() {
        self.apply_record_neon(key, &mut page, *record_lsn, record)?;
    }
    // Success!
    let duration = start_time.elapsed();
    // FIXME: using the same metric here creates a bimodal distribution by default, and because
    // there could be multiple batch sizes this would be N+1 modal.
    WAL_REDO_TIME.observe(duration.as_secs_f64());

    debug!(
        "neon applied {} WAL records in {} us to reconstruct page image at LSN {}",
        records.len(),
        duration.as_micros(),
        lsn
    );

    Ok(page.freeze())
}

/// Apply a single record to `page` in place via [`apply_neon::apply_in_neon`].
fn apply_record_neon(
    &self,
    key: Key,
    page: &mut BytesMut,
    record_lsn: Lsn,
    record: &NeonWalRecord,
) -> anyhow::Result<()> {
    apply_neon::apply_in_neon(record, record_lsn, key, page)?;

    Ok(())
}
}

#[cfg(test)]
pub(crate) mod harness {
    use super::PostgresRedoManager;
    use crate::config::PageServerConf;
    use utils::{id::TenantId, shard::TenantShardId};

    pub struct RedoHarness {
        // underscored because unused, except for removal at drop
        _repo_dir: camino_tempfile::Utf8TempDir,
        pub manager: PostgresRedoManager,
        tenant_shard_id: TenantShardId,
    }

    impl RedoHarness {
        pub fn new() -> anyhow::Result<Self> {
            crate::tenant::harness::setup_logging();

            let repo_dir = camino_tempfile::tempdir()?;
            let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
            let conf = Box::leak(Box::new(conf));
            let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());

            let manager = PostgresRedoManager::new(conf, tenant_shard_id);

            Ok(RedoHarness {
                _repo_dir: repo_dir,
                manager,
                tenant_shard_id,
            })
        }
        pub fn span(&self) -> tracing::Span {
            tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
        }
    }
}

#[cfg(test)]
mod tests {
    use std::str::FromStr;

    use bytes::Bytes;
    use pageserver_api::key::Key;
    use postgres_ffi::PgMajorVersion;
    use tracing::Instrument;
    use utils::lsn::Lsn;
    use wal_decoder::models::record::NeonWalRecord;

    use crate::walredo::RedoAttemptType;
    use crate::walredo::harness::RedoHarness;

    #[tokio::test]
    async fn test_ping() {
        let h = RedoHarness::new().unwrap();

        h.manager
            .ping(PgMajorVersion::PG14)
            .instrument(h.span())
            .await
.expect("ping should work"); } #[tokio::test] async fn short_v14_redo() { let expected = std::fs::read("test_data/short_v14_redo.page").unwrap(); let h = RedoHarness::new().unwrap(); let page = h .manager .request_redo( Key { field1: 0, field2: 1663, field3: 13010, field4: 1259, field5: 0, field6: 0, }, Lsn::from_str("0/16E2408").unwrap(), None, short_records(), PgMajorVersion::PG14, RedoAttemptType::ReadPage, ) .instrument(h.span()) .await .unwrap(); assert_eq!(&expected, &*page); } #[tokio::test] async fn short_v14_fails_for_wrong_key_but_returns_zero_page() { let h = RedoHarness::new().unwrap(); let page = h .manager .request_redo( Key { field1: 0, field2: 1663, // key should be 13010 field3: 13130, field4: 1259, field5: 0, field6: 0, }, Lsn::from_str("0/16E2408").unwrap(), None, short_records(), PgMajorVersion::PG14, RedoAttemptType::ReadPage, ) .instrument(h.span()) .await .unwrap(); // TODO: there will be some stderr printout, which is forwarded to tracing that could // perhaps be captured as long as it's in the same thread. 
assert_eq!(page, crate::ZERO_PAGE); } #[tokio::test] async fn test_stderr() { let h = RedoHarness::new().unwrap(); h .manager .request_redo( Key::from_i128(0), Lsn::INVALID, None, short_records(), PgMajorVersion::PG16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */ RedoAttemptType::ReadPage, ) .instrument(h.span()) .await .unwrap_err(); } #[allow(clippy::octal_escapes)] fn short_records() -> Vec<(Lsn, NeonWalRecord)> { vec![ ( Lsn::from_str("0/16A9388").unwrap(), NeonWalRecord::Postgres { will_init: true, rec: Bytes::from_static(b"j\x03\0\0\0\x04\0\0\xe8\x7fj\x01\0\0\0\0\0\n\0\0\xd0\x16\x13Y\0\x10\0\04\x03\xd4\0\x05\x7f\x06\0\0\xd22\0\0\xeb\x04\0\0\0\0\0\0\xff\x03\0\0\0\0\x80\xeca\x01\0\0\x01\0\xd4\0\xa0\x1d\0 \x04 \0\0\0\0/\0\x01\0\xa0\x9dX\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0.\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\00\x9f\x9a\x01P\x9e\xb2\x01\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0!\0\x01\x08 \xff\xff\xff?\0\0\0\0\0\0@\0\0another_table\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x98\x08\0\0\x02@\0\0\0\0\0\0\n\0\0\0\x02\0\0\0\0@\0\0\0\0\0\0\0\0\0\0\0\0\x80\xbf\0\0\0\0\0\0\0\0\0\0pr\x01\0\0\0\0\0\0\0\0\x01d\0\0\0\0\0\0\x04\0\0\x01\0\0\0\0\0\0\0\x0c\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0/\0!\x80\x03+ \xff\xff\xff\x7f\0\0\0\0\0\xdf\x04\0\0pg_type\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0G\0\0\0\0\0\0\0\n\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\x0e\0\0\0\0@\x16D\x0e\0\0\0K\x10\0\0\x01\0pr 
\0\0\0\0\0\0\0\0\x01n\0\0\0\0\0\xd6\x02\0\0\x01\0\0\0[\x01\0\0\0\0\0\0\0\t\x04\0\0\x02\0\0\0\x01\0\0\0\n\0\0\0\n\0\0\0\x7f\0\0\0\0\0\0\0\n\0\0\0\x02\0\0\0\0\0\0C\x01\0\0\x15\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0.\0!\x80\x03+ \xff\xff\xff\x7f\0\0\0\0\0;\n\0\0pg_statistic\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xfd.\0\0\0\0\0\0\n\0\0\0\x02\0\0\0;\n\0\0\0\0\0\0\x13\0\0\0\0\0\xcbC\x13\0\0\0\x18\x0b\0\0\x01\0pr\x1f\0\0\0\0\0\0\0\0\x01n\0\0\0\0\0\xd6\x02\0\0\x01\0\0\0C\x01\0\0\0\0\0\0\0\t\x04\0\0\x01\0\0\0\x01\0\0\0\n\0\0\0\n\0\0\0\x7f\0\0\0\0\0\0\x02\0\x01") } ), ( Lsn::from_str("0/16D4080").unwrap(), NeonWalRecord::Postgres { will_init: false, rec: Bytes::from_static(b"\xbc\0\0\0\0\0\0\0h?m\x01\0\0\0\0p\n\0\09\x08\xa3\xea\0 \x8c\0\x7f\x06\0\0\xd22\0\0\xeb\x04\0\0\0\0\0\0\xff\x02\0@\0\0another_table\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x98\x08\0\0\x02@\0\0\0\0\0\0\n\0\0\0\x02\0\0\0\0@\0\0\0\0\0\0\x05\0\0\0\0@zD\x05\0\0\0\0\0\0\0\0\0pr\x01\0\0\0\0\0\0\0\0\x01d\0\0\0\0\0\0\x04\0\0\x01\0\0\0\x02\0") } ) ] } } ================================================ FILE: pageserver/test_data/indices/mixed_workload/README.md ================================================ # This was captured from one shard of a large tenant in staging. # It has a mixture of deltas and image layers, >1000 layers in total. # This is suitable for general smoke tests that want an index which is not # trivially small, but doesn't contain weird/pathological cases. 
================================================ FILE: pageserver/test_data/indices/mixed_workload/index_part.json ================================================ {"version":7,"layer_metadata":{"000000067F00004005000060F300069883DB-000000067F00004005000060F300069D13FA__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300039A4000-000000067F00004005000060F300039C0000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300039FC000-000000067F00004005000060F30003A0F066__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000082C0F1-000000067F000040050081DB43000086E169__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000478000-000000067F00004005000060F3000047C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000012C000-000000067F00004005000060F300001F0000__0000018624969468":{"file_size":134422528,"generation":7,"shard":"0008"},"000000067F00004005000060F700019E8000-000000067F00004005000060F700019EC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300018E0FE6-000000067F00004005000060F3000193A10B__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016E85370000004000-030000000000000000000000000000000002__0000018613F0A050":{"file_size":14172160,"generation":3,"shard":"0008"},"000000067F00004005000060F300034847BD-000000067F00004005000060F300034BD86C__000000EBC9213D59-000000EFA7EAA9E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C80000-000000067F000040050081DB430000C84000__000000BDAFECFC00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100000CCBA0-000000067F00004005000060F20100000000__0000000D80565628":{"file_size":245
76,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CA4000-000000067F00004005016EA00C0000CE0000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB00013BC000-000000067F00004005000060FB0001400000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001240000-000000067F00004005016EA00C0001244000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004EC52E9-000000067F00004005000060F30004F1638A__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E10000-000000067F000040050081DB430000E14000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000007F0F-000000067F0000400500EB4A480000037E20__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004FE8000-000000067F00004005000060F3000502905D__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000072C000-000000067F000040050081DB430000768000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005E3B48F-000000067F00004005000060F30005EF454F__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A100000B7E04-030000000000000000000000000000000002__000000E7C2F1B249-000000EBC9213D59":{"file_size":30146560,"generation":2,"shard":"0008"},"000000067F0000400501025D90000009029B-000000067F0000400501025D950100000000__0000011B688FEDC8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A10000-000000067F000040050081DB430000A14000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002F5105E-000000067F00004005000060F30002F9A0EB__000000D74E29AAD1-000000DBBFA87AE
1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000187FE22-000000067F000040050081D80C0100000000__00000075E5D2A930":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001E8000-000000067F000040050081DB4300001EC000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000184C000-000000067F00004005000060FB000187FE22__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005A16504-000000067F00004005000060F30005A57691__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005C0000-000000067F00004005000060F100005C821A__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-000000067F00004005000060F00300000000__000001BCB572A4E0":{"file_size":2310144,"generation":17,"shard":"0008"},"000000067F00004005000060F30002214000-000000067F00004005000060F30002264247__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000110000-000000067F0000400500E3A2A10000114000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006864000-000000067F00004005000060F30006868000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000D0000-000000067F0000400500DBCED500000D4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000274C000-000000067F00004005000060F30002790000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00009274AB-030000000000000000000000000000000002__000001935283F9B9-00000196C9018F59":{"file_size":60104704,"generation":11,"shard":"0008"},"000000067F0000400500C782E4000023D359-000000067F0000400500C782E400002A5E4B__000000D31E48D7C9-000000D74E29AAD1":{"file_
size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001780DB7-000000067F00004005000060F700017E1391__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004E4000-000000067F000040050081DB4300004F8000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018C0000-000000067F00004005016EA00C00018C4000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300056DC000-000000067F00004005000060F300056E0000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001F14230-000000067F000040050081D80C0100000000__0000018613F0A050":{"file_size":59138048,"generation":3,"shard":"0008"},"000000067F00004005010F9F120000004000-030000000000000000000000000000000002__0000012E77D3BF00":{"file_size":105775104,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D80000-000000067F00004005000060F30002D84000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000122BBF-000000067F00004005000060F7000013B18E__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002B10000-000000067F00004005000060F30002B88FF2__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006320C60-000000067F00004005000060F30006349DA2__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000079E393-000000067F00004005016EA00C00009BF728__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500F67839000005C000-000000067F0000400500F67839000006AEF4__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D7F71A-030000000000000000000000000000000002__000001BA93C3948
1-000001BCB572A4E1":{"file_size":50880512,"generation":17,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C481-000001BCB572C5D9":{"file_size":24576,"generation":20,"shard":"0008"},"000000067F00004005000060F70001570000-000000067F00004005000060F70001574000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000042C000-000000067F00004005000060F30000478000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C5D9-000001BCB572DFF9":{"file_size":24576,"generation":22,"shard":"0008"},"000000067F00004005000060FB00015FCD31-030000000000000000000000000000000002__000000698F2C3A38":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C841ED-000000067F00004005000060F30005C95225__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001B4A119-000000067F00004005000060F30100000000__0000008196C976A1-0000008625CF2891":{"file_size":200990720,"generation":2,"shard":"0008"},"000000067F00004005000060F300019790A2-000000067F00004005000060F300019C2056__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001838000-000000067F00004005000060FB000183C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001C00FE1-000000067F00004005000060F30001C0A0A3__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E0000-000000067F00004005000060F300056E4000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000BBD532-000000067F00004005000060F80100000000__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":96477184,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F9B026-000
000067F00004005000060F30100000000__00000047E31D98D1-0000004C49155071":{"file_size":173834240,"generation":2,"shard":"0008"},"000000067F000040050081DB430000500000-000000067F000040050081DB430000504000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004971675-000000067F00004005000060F300049B26A8__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003102107-000000067F00004005000060F300031130BC__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300048A4000-000000067F00004005000060F30004900000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004B8000-000000067F00004005016EA00C00004BC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001A71688-000000067F00004005000060FB0001A8A1CD__0000007E3A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E60000-000000067F00004005000060F30000E64000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300023B0FF7-000000067F00004005000060F300024020ED__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003F8000-000000067F00004005016EA00C00003FC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004B2B250-000000067F00004005000060F30004B5431C__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000050000-000000067F00004005000060F700000885C5__000000044854EBD1-00000008B6B51879":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB000097168A-030000000000000000000000000000000002__00000028C365FBE1-0000002D2A8E0B81":{"file_size":120299520,"generatio
n":2,"shard":"0008"},"000000067F00004005000060F3000625C000-000000067F00004005000060F30006270000__0000017171761D90":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BA8000-000000067F00004005000060FB0001BC0B44__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003344134-000000067F00004005000060F3000336D193__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B10FFF-000000067F00004005000060F30006B22072__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E34000-000000067F00004005000060F30006E70000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000008238C-000000067F00004005000060F60100000000__00000139CF156B58":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000A30000-000000067F00004005000060F70100000000__0000009DF02C1241-000000A173C00489":{"file_size":269688832,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001CE16ED-000000067F000040050081D80C0100000000__0000008DDCD70B68":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB4300011B0000-000000067F000040050081DB4300011B4000__000000DBD29DC248":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000010C0D1-000000067F0000400500F3A25C000011E137__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000004000-000000067F00004005000060F70000029ED0__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F60000058F73-000000067F00004005000060F60100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001C3F636-000000067F00004005016EA00C0001CC74D7__000001B6FFE46BC9-000001BA93C39481":{"file_size":268
451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000101089-000000067F0000400500EB4A48000012798C__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007A8000-000000067F000040050081DB4300007AC000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F1000010043F-000000067F00004005000060F20100000000__0000000D55A212C9-000000114A805939":{"file_size":182878208,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001EAC000-000000067F00004005000060FB0001F14230__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000616F6B2-000000067F00004005000060F300061B8705__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C9E3C4-000000067F00004005000060F30005CCF3C5__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AA0000-000000067F00004005000060F70001AB05CB__0000015304A396B9-0000015670D6AFD9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000073C000-000000067F00004005000060F30000775A02__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003AE21D-000000067F000040050081DB43000045029C__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B04000-000000067F00004005000060F70001B18000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E74000-000000067F00004005000060F30000E78000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000182C000-000000067F00004005000060F700018871D6__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DE8B45-000000067F00004005000060FB0000DF968A__0000004
17D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E78000-000000067F00004005000060F30000E7C000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000140C000-030000000000000000000000000000000002__000000603CA8F2F0":{"file_size":89522176,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011CA1CD-000000067F00004005000060FB00011F2D11__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000144FB4E-000000067F00004005016EA00C00014B79E7__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F700015A195C-000000067F00004005000060F80100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FC0000-000000067F00004005000060F70000FC4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000012798C-000000067F0000400500EB4A48000013F89B__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CE4000-000000067F00004005016EA00C0001D18000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30005FC519A-000000067F00004005000060F30005FE621A__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000370000-000000067F00004005016EA00C0000374000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001760000-000000067F00004005016EA00C0001764000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F100003A0000-000000067F00004005000060F100003B8214__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300006B0000-000000067F0000400500
0060F300006B4000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00004E1FF6-030000000000000000000000000000000002__000000174479FC18":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F3000502905D-000000067F00004005000060F300050321C0__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AB05CB-000000067F00004005000060F70001AB8B97__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000151F7C5-000000067F00004005016EA00C000158F667__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70000B9C000-000000067F00004005000060F80100000000__000000AFE87558B0":{"file_size":83533824,"generation":2,"shard":"0008"},"000000067F00004005000060F7000141882A-000000067F00004005000060F80100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000018F5CD-000000067F0000400500EB4A48000019F4DD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000196C000-000000067F00004005000060F70001990000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300029C623C-000000067F00004005000060F30100000000__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":81313792,"generation":2,"shard":"0008"},"000000067F00004005000060F300027C0000-000000067F00004005000060F300027C4000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300000001487-000000067F0000400500FB3D300100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":24428544,"generation":2,"shard":"0008"},"000000067F00004005000060F300056D8000-000000067F00004005000060F300056DC000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003C0000-000
000067F00004005000060F700003C4000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000664E3CA-000000067F00004005000060F30100000000__000001715E483C79-000001751A7D7589":{"file_size":288645120,"generation":2,"shard":"0008"},"000000067F000040050100D04D000004B5AD-000000067F000040050100D04D00000634BB__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500DBCED5000002C000-000000067F0000400500DBCED50000078000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000C20000-000000067F00004005016EA00C0000C24000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70001B30000-000000067F00004005000060F70001B34000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700009C035C-000000067F00004005000060F80100000000__0000009A1ABDE921-0000009DF02C1241":{"file_size":264159232,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B33945-000000067F00004005000060F30100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":155344896,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000079FCFA-000000067F00004005016EA00C00007C7B9C__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000218000-000000067F0000400500EB4A48000021C000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D1D0DC-000000067F00004005000060F30005D76250__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000149B774-000000067F00004005000060FB00014A42B8__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D0B155-000000067F00004005000060F30003D14206__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generatio
n":2,"shard":"0008"},"000000067F00004005000060F300020FC052-000000067F00004005000060F300021050B0__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002268000-000000067F00004005000060F300022B9050__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004FC000-000000067F000040050081DB430000500000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060A93B5-000000067F00004005000060F300060C2210__0000016834A3FC91-0000016B49A934C1":{"file_size":263479296,"generation":2,"shard":"0008"},"000000067F00004005000060F3000674C000-000000067F00004005000060F30006798000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007F913A-030000000000000000000000000000000002__000000A5A3F27398":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F4000-030000000000000000000000000000000002__000000E4D847F4E0":{"file_size":103907328,"generation":2,"shard":"0008"},"000000067F00004005000060F70001348000-000000067F00004005000060F70100000000__0000011B632CC319-0000011F1A40FA69":{"file_size":270753792,"generation":2,"shard":"0008"},"000000067F00004005000060F10000030000-000000067F00004005000060F20100000000__000000021DC73119-000000044854EBD1":{"file_size":267771904,"generation":2,"shard":"0008"},"000000067F000040050107B54701FFFFFFFF-000000067F000040050107B5470300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006674000-000000067F00004005000060F30006690000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050107B54701FFFFFFFF-000000067F000040050107B5470300000000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30000298000-000000067F00004005000060F3000029C000__0000000D80565628":{
"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F185D4-000000067F00004005000060F80100000000__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":249135104,"generation":2,"shard":"0008"},"000000067F00004005000060F300049CB712-000000067F00004005000060F30004A048A8__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700004B1E77-000000067F00004005000060F80100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B00000-000000067F00004005000060F30004B1111A__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D14000-000000067F00004005000060F30006D30000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00002D77AE-030000000000000000000000000000000002__000001880F984A29-0000018C496B6DB1":{"file_size":81018880,"generation":11,"shard":"0008"},"000000067F00004005000060F300002D0000-000000067F00004005000060F30000370FD1__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000028000-000000067F0000400500D69D79000002C000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002170000-000000067F00004005000060F30002174000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F59017-000000067F00004005000060F30000F91FFF__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000006A37A-000000067F00004005000060F60100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F6000002F012-000000067F00004005000060F60100000000__00000081AA3C40F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30005614000-000000067F00004005000060F30005688000__00000
159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300036C8000-000000067F00004005000060F300036F91FE__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001ADF63C-030000000000000000000000000000000002__000001B3E1B95181-000001B6FFE46BC9":{"file_size":64421888,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000057D31-000000067F0000400500EB4A48000008FC41__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F58000-000000067F00004005016EA00C0000F5C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000908000-000000067F000040050081DB43000094A076__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000471200E-000000067F00004005000060F3000474302B__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300000403DA-030000000000000000000000000000000002__00000075E5D2A930":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F60000079C4E-000000067F00004005000060F60100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500F67839000003C000-000000067F0000400500F678390000058000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001C80000-000000067F00004005000060FB0001C84000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300047F5138-000000067F00004005000060F3000480620C__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B5C09E-000000067F00004005000060F30006BAD108__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001410F57-000000
067F00004005000060F70001429534__00000122A7BB7B29-0000012694E36301":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006B4000-000000067F00004005016EA00C00006E0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700009605D8-000000067F00004005000060F80100000000__000000923719A971-00000096262826C9":{"file_size":251338752,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C8CD0C-000000067F00004005000060F80100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700012B8000-000000067F00004005000060F80100000000__00000113456156F1-00000117EDA82C11":{"file_size":265781248,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000049C000-000000067F00004005016EA00C00004A8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000C78000-000000067F00004005000060F70000C7C000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B4B0BB-000000067F00004005000060F30006B5C09E__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001844000-000000067F00004005000060FB0001848000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300067F0000-000000067F00004005000060F300067F4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C80000-000000067F00004005000060F30004C84000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A4C000-000000067F00004005000060F30002A98000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002480000-000000067F00004005000060F30002484000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000306A02D-000000067F000040
05000060F30100000000__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":191299584,"generation":2,"shard":"0008"},"000000067F00004005000060F70001510000-000000067F00004005000060F70001514000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005BDB15B-000000067F00004005000060F30005C841ED__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E98000-000000067F00004005000060FB0001E9C000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300057942F4-000000067F00004005000060F300057DD292__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005698000-000000067F00004005000060F3000569C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002983166-000000067F00004005000060F3000299C28F__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000C24000-000000067F00004005016EA00C0000CA0000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300033D7D7C-000000067F00004005000060F30003458D42__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A1C000-000000067F000040050081DB430000A30379__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D93639-000000067F00004005000060F50100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C195-000000067F00004005016EA00C000029C196__000001BA93C39481-000001BCB572A4E1":{"file_size":32768,"generation":17,"shard":"0008"},"000000067F00004005000060F30000A5F9BB-000000067F00004005000060F60100000000__000000321AA80270":{"file_size":81657856,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D840
00-000000067F00004005000060F30002D93639__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D1C000-000000067F00004005000060F30005D70000__000001684518AF20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010C8000-000000067F000040050081DB4300010E2072__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000058AF5E-000000067F000040050081DB4300005BCFD7__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000034611E-000000067F00004005000060F80100000000__000000321AA80270":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300000C1095-000000067F00004005000060F60100000000__000000021DC73119-000000044854EBD1":{"file_size":220635136,"generation":2,"shard":"0008"},"000000067F00004005000060FB000183C000-000000067F00004005000060FB0001840000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C8729E-000000067F00004005000060F30006C98340__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005138000-000000067F00004005000060F3000513C000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300053E30C3-000000067F00004005000060F300053F40CC__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000002C000-000000067F000040050081DB4300000403DA__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004970000-000000067F00004005000060F30004974000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C08000-000000067F00004005000060F30003C0C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000
103AD12-000000067F00004005000060FB000104B856__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004AC000-000000067F00004005016EA00C00004B8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000DB7D33-000000067F00004005016EA00C0000E47BD2__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001F30000-000000067F00004005000060F30001F34000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050109FFA2000000C000-030000000000000000000000000000000002__000001180B3FF408":{"file_size":70516736,"generation":2,"shard":"0008"},"000000067F00004005000060F700017405D4-000000067F00004005000060F70001758B92__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300030B0000-000000067F00004005000060F300030C0FE5__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010660F501FFFFFFFF-000000067F00004005010660F50300000000__00000122A7BB7B29-0000012694E36301":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002168000-000000067F00004005000060F3000216C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60000046A83-000000067F00004005000060F60100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001368000-000000067F00004005000060FB000136C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000184000-000000067F00004005000060F80100000000__000000174479FC18":{"file_size":93143040,"generation":2,"shard":"0008"},"000000067F00004005000060FB00012A8000-000000067F00004005000060FB0100000000__00000057593D8169-0000005C01565329":{"file_size":273711104,"generation":2,"shard":"0008"},"000000067F0
0004005000060F700007B0000-000000067F00004005000060F700007D05C8__00000075CC373F31-00000079F2A2F311":{"file_size":268468224,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001680B45-000000067F00004005000060FB000169968A__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300050CC000-000000067F00004005000060F300050E8000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-000000067F00004005000060F00300000000__0000018613F0A050":{"file_size":2310144,"generation":3,"shard":"0008"},"000000067F00004005000060F70001B1C000-000000067F00004005000060F70001B30000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F50000-000000067F00004005000060F70000F705D6__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050109CD330100000000-000000067F000040050109FFA2000000C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800001FC000-000000067F0000400500EB4A480000200000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000240B12A-000000067F00004005000060F300024440AE__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000008228D-000000067F00004005000060F60100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C000042C000-000000067F00004005016EA00C0000478000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000FF8000-000000067F00004005000060FB0001000B44__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000169968A-000000067F00004005000060FB00016D21CF__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard"
:"0008"},"000000067F00004005000060F100005F821C-000000067F00004005000060F20100000000__000000636DE92159-000000663565F8C9":{"file_size":149954560,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D7C000-000000067F00004005016EA00C0001E03DD8__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F0000400500F678390000058000-000000067F0000400500F67839000005C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003A7E20-000000067F0000400500EB4A4800003BFD31__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001228000-000000067F00004005016EA00C000122C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000F0C0E9-000000067F000040050081DB430000F4E15B__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000758000-000000067F00004005000060F80100000000__0000006DDB29D589-000000722F474369":{"file_size":264781824,"generation":2,"shard":"0008"},"000000067F00004005000060F300068640AF-000000067F00004005000060F3000686D0DE__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000047C000-000000067F00004005016EA00C0000498000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30006166575-000000067F00004005000060F3000616F6B2__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B18000-000000067F00004005000060F70001B1C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700016EC000-000000067F00004005000060F70001708000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005CCF3C5-000000067F00004005000060F30005D184F6__0000016143292911-00000164DEE06671"
:{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002848000-000000067F00004005000060F3000285901B__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300039C0000-000000067F00004005000060F300039C4000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002464000-000000067F00004005000060F30002480000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00011D0000-000000067F00004005016EA00C00011D4000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003D44283-000000067F00004005000060F30003D952B0__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480100000000-000000067F0000400500EE16BC0000044000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000533205E-000000067F00004005000060F300053E30C3__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000009A255-000000067F00004005000060F60300000000__0000017CC2FD7288":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B00000-000000067F00004005000060F70001B04000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004958000-000000067F00004005000060F3000495C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000518000-000000067F00004005000060F80100000000__0000004C49155071-0000004F31878919":{"file_size":262373376,"generation":2,"shard":"0008"},"000000067F00004005000060F300064D8000-000000067F00004005000060F3000658113F__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FDA1F80000014000-000000067F0000400500FDA1F80000020D42__0000010D77B
487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000284000-000000067F00004005000060FB00002D4B6A__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CDBB9C-000000067F00004005000060F80100000000__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":148865024,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001298000-000000067F00004005016EA00C000129C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001DD8000-000000067F00004005000060FB0001DF0B43__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001220000-000000067F00004005000060F70001224000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002908000-000000067F00004005000060F30002920FA0__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F5C000-000000067F00004005016EA00C0000F90000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001E03DD8-030000000000000000000000000000000002__000001BCB572A4E0":{"file_size":139264,"generation":17,"shard":"0008"},"000000067F00004005000060F30003998000-000000067F00004005000060F3000399C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00014E75C6-030000000000000000000000000000000002__000001A931C135B1-000001AC25760149":{"file_size":51486720,"generation":11,"shard":"0008"},"000000067F00004005010660F500000F44CB-000000067F00004005010660F70100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003FC000-000000067F00004005016EA00C0000400000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003810000-000000067F00004005000060F30003849093__000001048B25A8E9-00000
10779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B00000-000000067F00004005000060F30006B10FFF__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001541688-000000067F00004005000060FB000154A1CD__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001098000-000000067F00004005000060FB000109C000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700011912D4-000000067F00004005000060F80100000000__00000104BD37F348":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A40000-000000067F00004005000060F30002A44000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001448000-000000067F00004005000060F300014B0F7B__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001009688-000000067F00004005000060FB000102A1CE__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001A4000-000000067F0000400500EE16BC00001E0000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B58B45-000000067F00004005000060FB0000B6168A__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000AC000-000000067F0000400500D69D7900000BDAF5__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000193A10B-000000067F00004005000060F30100000000__00000075CC373F31-00000079F2A2F311":{"file_size":198148096,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00005A0000-000000067F00004005016EA00C00005A4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700000E0000-000000067F0000400500
0060F80100000000__0000000D80565628":{"file_size":112009216,"generation":2,"shard":"0008"},"000000067F00004005000060F3000690F2FD-000000067F00004005000060F300069883DB__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300004C6B83-000000067F00004005000060F60100000000__000000174479FC18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E18000-000000067F00004005000060F30001E50FF3__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B4000-000000067F00004005000060F300043B8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100006C0000-000000067F00004005000060F20100000000__000000722F474369-00000075CC373F31":{"file_size":267665408,"generation":2,"shard":"0008"},"000000067F00004005000060F70000A78000-000000067F00004005000060F70000A7C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011C1688-000000067F00004005000060FB00011CA1CD__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004E8000-000000067F00004005016EA00C00004EC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000257A6F-000000067F00004005016EA00C000029F90B__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001590000-000000067F00004005000060FB0001594000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000193189A-030000000000000000000000000000000002__000001B3F17FE4E0":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005000060F300027C4000-000000067F00004005000060F30002828000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B40000-000000067F00004
005016EA00C0000B44000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30006694000-000000067F00004005000060F300066F0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015C8000-000000067F00004005000060FB00015CC000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B84000-000000067F00004005000060F30003B90000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006704000-000000067F00004005000060F30006748000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000793506-030000000000000000000000000000000002__0000002427BD8BD0":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30004F1638A-000000067F00004005000060F30100000000__000001440D3D0C69-0000014784964B91":{"file_size":93708288,"generation":2,"shard":"0008"},"000000067F00004005000060F80100000000-000000067F00004005000060FB0000014000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F70000180000-000000067F00004005000060F70000184000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A2693B-000000067F00004005000060F30004A7F98F__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C71F27-000000067F00004005000060F30002C9AFB8__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300038075AF-000000067F00004005000060F30100000000__000000FF8B261599-000001048B25A8E9":{"file_size":49823744,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000028000-000000067F0000400500DBCED5000002C000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004188000-000000067F00004005000060F30
0041D9101__0000012694E36301-0000012A3F140591":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30006868000-000000067F00004005000060F50100000000__00000178C5D5D3A8":{"file_size":116645888,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A789A0-000000067F00004005000060F30003AB9907__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000368000-000000067F0000400500EB4A48000036FF11__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300047EC0CA-000000067F00004005000060F300047F5138__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AB8B97-000000067F00004005000060F70001AC115C__0000015304A396B9-0000015670D6AFD9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D61283-000000067F00004005000060F70000D8985C__000000C462B3C2A9-000000C824C09619":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300011D1111-000000067F00004005000060F3000122A1D5__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001967D34-000000067F00004005016EA00C000197FBD0__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FA2AD3000004D85C-000000067F0000400500FB3D300100000000__0000010D77B487A0":{"file_size":31309824,"generation":2,"shard":"0008"},"000000067F000040050081DB4300005BCFD7-000000067F000040050081DB4300005D704F__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000004000-000000067F00004005000060F100000260F2__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F0000400500EE16BC00000F8000-000000067F0000400500EE16BC000014158C__000000F901689359-000000FCCD5238B1":{"file_size":268451840,"g
eneration":2,"shard":"0008"},"000000067F00004005000060F30000921E8A-000000067F00004005000060F60100000000__00000028C365FBE1-0000002D2A8E0B81":{"file_size":228564992,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001190000-000000067F00004005000060FB0001198B44__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300067A0000-000000067F00004005000060F300067A4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000200000-000000067F00004005000060F10000204000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF0FBB-000000067F00004005000060F3000407201D__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000001C000-000000067F00004005000060F3000008228D__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0001CD7376-030000000000000000000000000000000002__000001B6FFE46BC9-000001BA93C39481":{"file_size":70238208,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000EBC000-000000067F00004005000060FB0000EC8000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000293210E-000000067F00004005000060F30002983166__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000151F271-000000067F00004005000060F30100000000__000000636DE92159-000000663565F8C9":{"file_size":41271296,"generation":2,"shard":"0008"},"000000067F00004005000060F30004880000-000000067F00004005000060F30004884000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000518222-000000067F00004005000060F20100000000__0000005413AB3641-00000057593D8169":{"file_size":169492480,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003E0000-000000067F00004005016EA00C00003E4000__0
00001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000775A02-000000067F00004005000060F60100000000__0000002427BD8BD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000197FBD0-000000067F00004005016EA00C00019C7A6A__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000067114B-000000067F00004005000060F60100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":232669184,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001408000-000000067F00004005000060FB000140C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800001F8000-000000067F0000400500EB4A4800001FC000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000290000-000000067F0000400500EB4A480000294000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003061089-000000067F00004005000060F3000306A02D__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CE4000-000000067F00004005000060F30001CF0197__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000E20000-000000067F00004005000060F70000E24000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001D0000-000000067F000040050081DB4300001D4000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D184F6-000000067F00004005000060F30100000000__0000016143292911-00000164DEE06671":{"file_size":200163328,"generation":2,"shard":"0008"},"000000067F00004005000060F300066F4000-000000067F00004005000060F30006700000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A38000-000000067F000040050081DB430000A4A074__000000AFD23
C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F38000-000000067F00004005000060F30000F59017__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C0C000-000000067F00004005000060FB0000C18000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D34000-000000067F00004005000060F30006D60000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005010660F501FFFFFFFF-000000067F00004005010660F50300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700013E85D1-000000067F00004005000060F70001410BBC__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000538B44-000000067F00004005000060FB0000551689__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001410000-000000067F00004005000060F70001414000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300032F1113-000000067F00004005000060F3000330A1C8__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004974000-000000067F00004005000060F3000498DC49__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000625EB45-000000067F00004005000060F30006277C61__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700019E8E81-000000067F00004005000060F80100000000__0000014EC58A4A79-0000015304A396B9":{"file_size":246792192,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB5730259-000001BCB5732691":{"file_size":24576,"generation":187,"shard":"0008"},"000000067F00004
0050081DB4300001CC000-000000067F000040050081DB4300001D0000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C00000-000000067F00004005000060F30002C18FAE__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FC4000-000000067F00004005000060F70000FCD85E__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000107C39B-030000000000000000000000000000000002__0000004C49155071-0000004F31878919":{"file_size":133349376,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F90000-000000067F00004005016EA00C0000F94000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000F98000-000000067F00004005016EA00C0000F9C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700019EC000-000000067F00004005000060F80100000000__0000014EDD256548":{"file_size":7421952,"generation":2,"shard":"0008"},"000000067F00004005000060F300069FA3F6-000000067F00004005000060F30006A0B44C__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003AC000-000000067F000040050081DB4300003B27DA__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005A57691-000000067F00004005000060F30005B00697__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300060CB2C8-000000067F00004005000060F300060D4415__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000495C000-000000067F00004005000060F30004970000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000D1C5F-000000067F0000400500D69D7900000F1B5B__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard
":"0008"},"000000067F00004005016EA00C0001358000-030000000000000000000000000000000002__000001A95031E5B8":{"file_size":21110784,"generation":11,"shard":"0008"},"000000067F00004005000060F3000430C000-000000067F00004005000060F30004370000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004904000-000000067F00004005000060F30004958000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000008000-000000067F00004005000060F30000378000__00000186146441F1-0000018624969469":{"file_size":33357824,"generation":6,"shard":"0008"},"000000067F00004005000060F700005C0000-000000067F00004005000060F700005C85CE__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B04000-000000067F00004005016EA00C0000B40000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002920FA0-000000067F00004005000060F3000293210E__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002058000-000000067F00004005000060F30002070F71__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000686D0DE-000000067F00004005000060F3000689E295__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD30000004000-000000067F0000400500FA2AD30000030000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00009BF728-000000067F00004005016EA00C0000A575C7__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30004374000-000000067F00004005000060F300043B0000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300051F0000-000000067F00004005000060F300051F4000__0000014EDD256548":{"file_size":1344
22528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B22072-000000067F00004005000060F30006B4B0BB__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000328FA4E-000000067F00004005000060F50100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000000FEA0-000000067F00004005016EA00C000001FD3E__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A48000019F4DD-030000000000000000000000000000000002__000000F6661C9241-000000F901689359":{"file_size":59498496,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003EC000-000000067F00004005016EA00C00003F8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000073C000-000000067F00004005016EA00C000074F43B__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003542BFF-000000067F00004005000060F50100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001771169-000000067F00004005000060F80100000000__000001398B56A519-0000013C9C0E3339":{"file_size":263454720,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003B27DA-030000000000000000000000000000000002__0000008DDCD70B68":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F3000542AFB0-000000067F00004005000060F30005474062__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000057C94F-000000067F00004005000060F80100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300055861F2-000000067F00004005000060F30100000000__0000015304A396B9-0000015670D6AFD9":{"file_size":127393792,"generation":2,"shard":"0008"},"000000067F00004005000060F30001D79136-000000067F00004005000060F30100000000__0000008DBE2855F9-00000
0923719A971":{"file_size":227958784,"generation":2,"shard":"0008"},"000000067F00004005000060F10000218000-000000067F00004005000060F1000021C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD4000-000000067F00004005016EA00C0001CE0000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F300017EC000-000000067F00004005000060F30001886B2A__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001188000-000000067F00004005000060F300011D1111__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000ECC000-000000067F00004005000060FB0000F050F2__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300018C0000-000000067F00004005000060F300018E0FE6__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006E4000-000000067F00004005016EA00C0000738000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002790000-000000067F00004005000060F30002794000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00001B850B-000000067F0000400500F56D510100000000__0000011B688FEDC8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F100001F8000-000000067F00004005000060F100001FC000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000810000-000000067F00004005000060F80100000000__00000079F2A2F311-0000007E3A9BFD29":{"file_size":263454720,"generation":2,"shard":"0008"},"000000067F00004005000060F100006CBF87-000000067F00004005000060F20100000000__000000A5A3F27398":{"file_size":15851520,"generation":2,"shard":"0008"},"000000067F0000400500F7D2DD0100000000-000000067F0000400500F8E3A50000014000__0000010D77B487A0":{"file_size":13
4422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700010AABC7-000000067F00004005000060F80100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B80000-000000067F00004005000060F30003B84000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000078000-000000067F000040050081DB4300000AA080__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002618000-000000067F00004005000060F30002680F9D__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A48000-000000067F00004005000060F30002A4C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001994000-000000067F00004005000060F700019E8000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B6168A-000000067F00004005000060FB0000B6A1D0__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000147A0EC-000000067F00004005000060FB000148AC30__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000060000-000000067F0000400500EE16BC0000064000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003458D42-000000067F00004005000060F30003481DDB__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E30000-000000067F00004005000060F30006E34000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700017F8000-000000067F00004005000060F700017FC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C50000-000000067F00004005000060F30004C54000__000001444EB7FC10":{"file_siz
e":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001720000-000000067F00004005000060F80100000000__00000139CF156B58":{"file_size":63463424,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A8E15E-000000067F000040050081DB430000A98000__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":265404416,"generation":2,"shard":"0008"},"000000067F00004005000060F30004BAE526-000000067F00004005000060F30004BE7584__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001ADF97B-000000067F00004005016EA00C0001B0FD2A__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F60000014000-000000067F00004005000060F60100000000__0000003D2AB09B68":{"file_size":83329024,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C1C000-000000067F00004005000060FB0000C70000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005240000-000000067F00004005000060F30005244000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000077C000-000000067F000040050081DB430000790000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D60000-000000067F00004005000060F30006D64000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C54000-000000067F00004005000060F30004C60000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000000000000000001-000000067F0000400500000A690000000002__00000186146441F1-0000018624969469":{"file_size":57344,"generation":6,"shard":"0008"},"000000067F00004005000060F30005688000-000000067F00004005000060F3000568C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004370000-000000067F00004005000060F30004374000__0000012E77D3BF00":{"file_size":134422528,
"generation":2,"shard":"0008"},"000000067F00004005000060F300051F4000-000000067F00004005000060F30005210000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DD8000-000000067F00004005000060F30004DDC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E400001AFD31-000000067F0000400500C782E400001B7C41__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000BB103B-000000067F00004005000060F60000014C3A__0000003579F03331-0000003959DA2DE9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500D19D030100000000-000000067F0000400500D69D790000024000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000028B253-030000000000000000000000000000000002__0000008196C976A1-0000008625CF2891":{"file_size":151224320,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DD8000-000000067F00004005000060F30004E40FFC__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010F44EB0100000000-000000067F00004005010F57CB000000C000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003BCC000-000000067F00004005000060F30003C08000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B80000-000000067F00004005000060F30005B89170__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000135FCAD-000000067F00004005016EA00C000144FB4E__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005010660F500000B0000-000000067F00004005010660F500000B4000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000D31030-000000067F00004005000060F30100000000__0000003D03FCCD
B9-000000417D21ACF9":{"file_size":233791488,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C18FAE-000000067F00004005000060F30002C71F27__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000041FB53-000000067F0000400500EB4A480000447A64__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000048000-000000067F0000400500EE16BC000004C000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00009D0000-000000067F00004005000060FB00009D4000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100004365FE-000000067F00004005000060F20100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006BAD108-000000067F00004005000060F30006C0E146__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300006B4000-000000067F00004005000060F300006E0000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000327C000-000000067F00004005000060F3000328FA4E__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B94000-000000067F00004005000060F30003BC8000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003CB8FCF-000000067F00004005000060F30003CCA0B9__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003EA902F-000000067F00004005000060F30003F72201__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C64000-000000067F00004005000060F30004C80000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000194000-000000067F000040050081DB430000
1C8000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB01FFFFFFFF-000000067F00004005000060FB0300000000__0000018613A0DEA9-00000186146441F1":{"file_size":73728,"generation":5,"shard":"0008"},"000000067F00004005000060F300038B5F5B-000000067F00004005000060F300038FF04F__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001C8000-000000067F000040050081DB4300001CC000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000137F10-000000067F0000400500C782E40000177E20__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000139C000-000000067F00004005000060FB00013B8000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000447A64-000000067F0000400500EB4A480100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":40550400,"generation":2,"shard":"0008"},"000000067F00004005000060F70000418000-000000067F00004005000060F700004405CF__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB430000728000-000000067F000040050081DB43000072C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014B0F7B-000000067F00004005000060F30100000000__000000601F43CF09-000000636DE92159":{"file_size":83951616,"generation":2,"shard":"0008"},"000000067F00004005000060F30005F3303F-000000067F00004005000060F30005FA40AD__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300012442A9-000000067F00004005000060F3000129D29A__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010B14AB-000000067F000040050081DB430100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400
5016EA00C00014CF88D-000000067F00004005016EA00C00014D7727__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006A0B44C-000000067F00004005000060F30006A7C566__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000062EE46-000000067F00004005000060F20100000000__000000698F2C3A38":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CE0000-000000067F00004005016EA00C0001CE4000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30000250000-000000067F00004005000060F30000254000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050E8000-000000067F00004005000060F300050EC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000259F4A3-000000067F00004005000060F30100000000__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":44433408,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A640EA-000000067F000040050081DB430000A8E15E__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003050000-000000067F00004005000060F30003061089__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C0000158000-000000067F0000400500F3A25C000016A065__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010A4000-000000067F000040050081DB4300010B14AB__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001E0000-000000067F0000400500EE16BC00001E4000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300055B8000-000000067F00004005000060F300055BC000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"00
08"},"000000067F00004005016EA00C0000CE4000-000000067F00004005016EA00C0000D30000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003640000-000000067F00004005000060F30003644000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000014F7AC-000000067F0000400500EB4A4800001876BD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD338E-000000067F00004005016EA00C0001CE79E0__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F00004005000060FB0001530B44-000000067F00004005000060FB0001541688__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300031D516C-000000067F00004005000060F30100000000__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":137863168,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00019C7A6A-000000067F00004005016EA00C00019F7907__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000E7F7A7-000000067F00004005016EA00C0000F3F647__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300032C0000-000000067F00004005000060F300032F1113__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006E0000-000000067F00004005016EA00C00006E4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F7000019EA78-000000067F00004005000060F80100000000__0000001737D88379-0000001B59EEB909":{"file_size":50946048,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001B4FBC9-000000067F00004005016EA00C0001BBFA66__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001660000-000000067F0000400500006
0FB0001680B45__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002BAA1DD-000000067F00004005000060F30100000000__000000C462B3C2A9-000000C824C09619":{"file_size":203554816,"generation":2,"shard":"0008"},"000000067F00004005000060F300049B26A8-000000067F00004005000060F300049CB712__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CCB5CD-000000067F00004005000060F70000CDBB9C__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EEA075-000000067F000040050081DB430000F0C0E9__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300003E0000-000000067F00004005000060F300003E8FBC__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C9C000-000000067F00004005000060F30006CA0000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C7C000-000000067F00004005000060F70000C8CD0C__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001148000-000000067F00004005000060FB000114C000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001232ACF-000000067F00004005000060F80100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FE8000-000000067F00004005000060F700010105DB__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000355928-000000067F0000400500EB4A480100000000__000000FCD84FE628":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700003FE341-000000067F00004005000060F80100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000
244D189-000000067F00004005000060F30100000000__000000A9EB8C4489-000000ACA44C8E99":{"file_size":212566016,"generation":2,"shard":"0008"},"000000067F00004005000060F700003B85C7-000000067F00004005000060F80100000000__0000003579F03331-0000003959DA2DE9":{"file_size":208945152,"generation":2,"shard":"0008"},"000000067F00004005000060F100005A2B80-000000067F00004005000060F20100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB000070C000-000000067F00004005000060FB0000718000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB01FFFFFFFF-000000067F00004005000060FB0300000000__00000186146441F1-0000018624969469":{"file_size":24576,"generation":6,"shard":"0008"},"000000067F00004005000060FB000180C000-000000067F00004005000060FB0001838000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000044000-000000067F0000400500EE16BC0000048000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10100000000-000000067F00004005000060F10300000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":483328,"generation":2,"shard":"0008"},"000000067F00004005000060F30004EA41A5-000000067F00004005000060F30004EC52E9__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003AB9907-000000067F00004005000060F30003AF28CB__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000974000-000000067F00004005000060FB00009D0000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038720A2-000000067F00004005000060F300038A3082__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000452BA1-000000067F000040050081DB4300004C4C1E__000000923719A971-00000096262826C9":{"file_size":268451840,"generation"
:2,"shard":"0008"},"000000067F00004005000060F300017AA0CE-000000067F00004005000060F30100000000__0000006DDB29D589-000000722F474369":{"file_size":202719232,"generation":2,"shard":"0008"},"000000067F000040050081DB430000504000-000000067F000040050081DB430000560000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B5431C-000000067F00004005000060F30004B654F6__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000C20000-000000067F00004005000060F30000C24000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300028920E4-000000067F00004005000060F30100000000__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":200351744,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004C4C1E-030000000000000000000000000000000002__000000923719A971-00000096262826C9":{"file_size":192356352,"generation":2,"shard":"0008"},"000000067F000040050081DB430000190000-000000067F000040050081DB430000194000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E88000-000000067F000040050081DB430000E8C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000738000-000000067F00004005016EA00C000073C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000578EE6-000000067F000040050081DB43000058AF5E__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001C38000-000000067F00004005000060F30001C3C000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000B7C0EA-030000000000000000000000000000000002__000000B2B5C4E8F9-000000B768469051":{"file_size":133464064,"generation":2,"shard":"0008"},"000000067F00004005000060F3000625B8F0-000000067F00004005000060F30100000000__0000016B49A934C1-0000016E1
FBB7B99":{"file_size":139640832,"generation":2,"shard":"0008"},"000000067F00004005000060FB000109C000-000000067F00004005000060FB0001110000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572DFF9-000001BCB5730259":{"file_size":24576,"generation":41,"shard":"0008"},"000000067F00004005000060FB0000AA8000-000000067F00004005000060FB0000AD0B45__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043F8000-000000067F00004005000060F300043FC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003C7C42-000000067F0000400500EB4A48000041FB53__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005BA213F-000000067F00004005000060F30005BDB15B__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300063FE10E-000000067F00004005000060F30100000000__0000016E1FBB7B99-000001715E483C79":{"file_size":111067136,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F91FFF-000000067F00004005000060F30000F9B026__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003650000-000000067F00004005000060F30003654000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050A412B-000000067F00004005000060F300050B5199__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D78000-000000067F00004005016EA00C0001D7C000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005016EA00C0001244000-000000067F00004005016EA00C0001298000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F100001FC000-00000
0067F00004005000060F10000200000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CA0000-000000067F00004005016EA00C0000CA4000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F3000498DC49-000000067F00004005000060F50100000000__00000139CF156B58":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F60000036EA0-000000067F00004005000060F60100000000__0000009A24DF6768":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000928B45-000000067F00004005000060FB000097168A__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006854000-000000067F00004005000060F30006858000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050109FFA2000000C3F5-030000000000000000000000000000000002__00000117EDA82C11-0000011B632CC319":{"file_size":226066432,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A6D1B3-000000067F00004005000060F30100000000__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":117620736,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D2C000-000000067F00004005000060F30002D80000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A31FB6-000000067F00004005000060F30003A3B020__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000160723E-000000067F00004005016EA00C00016570D9__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FB3D310000018000-000000067F0000400500FB3D31000001C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001708000-000000067F00004005000060F7000170C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000283C3E7-00
0000067F00004005000060F50100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00018F0000-000000067F00004005000060FB0100000000__00000075CC373F31-00000079F2A2F311":{"file_size":268959744,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EC8000-000000067F00004005000060FB0000ECC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F9C000-000000067F00004005016EA00C0000FF0000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002680F9D-000000067F00004005000060F3000274A080__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000679C000-000000067F00004005000060F300067A0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000428313F-000000067F00004005000060F300042CC1BD__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00000FFFFFFFF-030000000000000000000000000000000002__00000186146441F1-0000018624969469":{"file_size":24576,"generation":6,"shard":"0008"},"000000067F00004005000060FB00017D8000-000000067F00004005000060FB00017DC000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700017FC000-000000067F00004005000060F70001828000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002FD317C-000000067F00004005000060F30002FF427D__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001701588-000000067F00004005000060FB00017120CE__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500000A3000FFFFFFFF-000000067F0000400500000A690000000002__000001BA93C39481-000001BCB572A4E1":{"file_size":40960,"generation":17,"shard":"0008"},"000000
067F00004005000060FB0000638B45-030000000000000000000000000000000002__0000001B59EEB909-0000001FFBC01501":{"file_size":252010496,"generation":2,"shard":"0008"},"000000067F000040050081DB430000394000-000000067F000040050081DB4300003A8000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CF0197-000000067F00004005000060F50100000000__0000008DDCD70B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800000DFB51-000000067F0000400500EB4A4800000E7A62__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000014C000-000000067F00004005000060F70000180000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005948000-000000067F00004005000060F300059790CD__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000853115-000000067F00004005000060F60100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":176136192,"generation":2,"shard":"0008"},"000000067F00004005000060F30004884000-000000067F00004005000060F30004888000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000513C000-000000067F00004005000060F30005160000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000017C000-000000067F0000400500F3A25C00001B850B__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006382F14-000000067F00004005000060F3000638C06D__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000017F02-000000067F0000400500E3A2A100000B7E04__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001000B44-000000067F00004005000060FB0001009688__0000004C49155071-0000004F31878919":{"file_size":268451840
,"generation":2,"shard":"0008"},"000000067F0000400500D69D790100000000-000000067F0000400500DBCED50000024000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010A0000-000000067F000040050081DB4300010A4000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000310000-000000067F00004005000060FB0000348B45__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F60000060038-000000067F00004005000060F60100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CE0000-000000067F00004005000060F30001CE4000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300000AA080-000000067F000040050081DB4300000D40FF__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000551689-030000000000000000000000000000000002__0000001737D88379-0000001B59EEB909":{"file_size":227418112,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000D90000-000000067F00004005000060FB0100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":272769024,"generation":2,"shard":"0008"},"000000067F00004005000060F300059CC403-000000067F00004005000060F300059F53C6__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F2C000-000000067F00004005000060F30001F30000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000014000-000000067F00004005000060FB0000084772__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F30004B654F6-000000067F00004005000060F30004BAE526__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002450000-000000067F00004005000060F30002454000__000000AFE87558B0":
{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A0F066-000000067F00004005000060F50100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F60000032EBE-000000067F00004005000060F60100000000__0000008DDCD70B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00001D8000-000000067F00004005000060FB00001DC000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000670000-000000067F00004005016EA00C0000674000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001344000-000000067F00004005016EA00C0001358000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000D30000-000000067F00004005016EA00C0000D34000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000012FE9A-000000067F00004005016EA00C00001F7D38__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70000BF0000-000000067F00004005000060F70100000000__000000B2B5C4E8F9-000000B768469051":{"file_size":273809408,"generation":2,"shard":"0008"},"000000067F00004005000060F300005A0000-000000067F00004005000060F3000067114B__0000001B59EEB909-0000001FFBC01501":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000021C000-000000067F0000400500EB4A480000290000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F3C000-000000067F00004005016EA00C0000F58000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000074F43B-030000000000000000000000000000000002__000001936E73D028":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005010F57CB000000C000-000000067F00004005010F99A50100000000__00000126C3C69FC0":{"file_size":22978560,"generat
ion":2,"shard":"0008"},"000000067F00004005000060F700017E1391-000000067F00004005000060F80100000000__0000013C9C0E3339-0000013FEFA7D709":{"file_size":232677376,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CC74D7-000000067F00004005016EA00C0001CD7376__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F700005C85CE-000000067F00004005000060F700005E8B9D__00000057593D8169-0000005C01565329":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FCD352-000000067F00004005000060F30100000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":124788736,"generation":2,"shard":"0008"},"000000067F0000400500C782E400002A5E4B-000000067F0000400500C782E400002CDD5C__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700018871D6-000000067F00004005000060F80100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D252C8-000000067F00004005000060F30100000000__00000117EDA82C11-0000011B632CC319":{"file_size":205963264,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001408A62-000000067F00004005000060FB00014195A7__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E400001B7C41-000000067F0000400500C782E400001C7B51__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000110000-000000067F00004005000060FB0100000000__000000044854EBD1-00000008B6B51879":{"file_size":272613376,"generation":2,"shard":"0008"},"000000067F00004005000060F300004E8000-000000067F00004005000060F60100000000__0000001737D88379-0000001B59EEB909":{"file_size":260579328,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DF4000-000000067F00004005000060F30006E30000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C84
000-030000000000000000000000000000000002__000000BAC0041E18":{"file_size":59998208,"generation":2,"shard":"0008"},"000000067F00004005000060F30002B88FF2-000000067F00004005000060F30002BAA1DD__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000434000-000000067F00004005000060FB00004A0000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DA8000-000000067F00004005000060F30004DAC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004E0000-000000067F000040050081DB4300004E4000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001E4000-000000067F0000400500EE16BC0000201716__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C440EA-000000067F000040050081DB430000C5E15B__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000BDAF5-000000067F0000400500D69D790100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A9C000-000000067F00004005000060F30002AEED02__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DAC000-000000067F00004005000060F30004DD8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B94000-000000067F00004005000060F70000B98000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002454000-000000067F00004005000060F30002460000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100001059CB-000000067F00004005000060F10000125BF2__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000D362CA-000000067F00004005016EA00C
0000DB7D33__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001C0A0A3-000000067F00004005000060F30100000000__0000008625CF2891-00000089F4693119":{"file_size":203063296,"generation":2,"shard":"0008"},"000000067F00004005000060F300066F0000-000000067F00004005000060F300066F4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001414000-000000067F00004005000060F70001428000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014CC16D-000000067F00004005000060F300014D5280__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000172AC12-030000000000000000000000000000000002__0000006DDB29D589-000000722F474369":{"file_size":186875904,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E4C000-000000067F000040050081DB430000E88000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300063A50CD-000000067F00004005000060F300063FE10E__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005419E9C-000000067F00004005000060F3000542AFB0__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC000014158C-030000000000000000000000000000000002__000000F901689359-000000FCCD5238B1":{"file_size":67854336,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00015FF3A0-000000067F00004005016EA00C000160723E__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00008E760F-000000067F00004005016EA00C00009274AB__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70000B98000-000000067F00004005000060F70000B9C000__000000AFE87558B0":{"file_size":134422528,"generation":2,"
shard":"0008"},"000000067F00004005000060FB00004A4000-000000067F00004005000060FB00004E1FF6__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006670000-000000067F00004005000060F30006674000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000185EE9-000000067F00004005000060F7000018E4B6__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D030000067CA9-030000000000000000000000000000000002__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":29319168,"generation":2,"shard":"0008"},"000000067F0000400500FF2A51000000BFFB-030000000000000000000000000000000002__0000010D77B487A0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A048A8-000000067F00004005000060F30004A1D870__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300004BC000-000000067F00004005000060F300004C6B83__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005290FC9-000000067F00004005000060F3000533205E__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300031130BC-000000067F00004005000060F300031C40D1__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D030000047EE2-000000067F0000400500D19D03000004FDC6__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A44000-000000067F00004005000060F30002A48000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003DAE2DC-000000067F00004005000060F30003DD734C__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F8E3A50000014000-000000067F0000400500F8E3A5000004A25C__0000010D77B487A0":
{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100002F03E9-000000067F00004005000060F20100000000__000000321AA80270":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001138000-000000067F00004005000060F80100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":72695808,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E4000-000000067F00004005000060F50100000000__00000159B010F6C0":{"file_size":13393920,"generation":2,"shard":"0008"},"000000067F00004005000060F70000A7C000-000000067F00004005000060F70000ABD9C4__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000CC6E51-030000000000000000000000000000000002__0000003D2AB09B68":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F60000091EFF-000000067F00004005000060F60100000000__0000014EDD256548":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000008FC41-000000067F0000400500EB4A4800000DFB51__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F363B4-000000067F00004005000060F30001F574A6__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD0000-000000067F00004005016EA00C0001CD4000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F300059B324D-000000067F00004005000060F300059CC403__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002530000-000000067F00004005000060F30002534000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000004B633-000000067F00004005000060F60100000000__000000C483D0D6B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700011E0000-000000067F00004005000060F80100000000__0000010779A7F551-0000010A5E65DF39":{"file_
size":262922240,"generation":2,"shard":"0008"},"000000067F00004005000060F30006690000-000000067F00004005000060F30006694000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000100E18-000000067F00004005000060F700001213F2__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500FF2A510000004000-000000067F0000400500FF2A51000000BFFB__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EB8000-000000067F00004005000060FB0000EBC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000674000-000000067F00004005016EA00C00006B0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000EF85D6-000000067F00004005000060F80100000000__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":262897664,"generation":2,"shard":"0008"},"000000067F00004005000060F700005E8B9D-000000067F00004005000060F700005F9158__00000057593D8169-0000005C01565329":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E40FFC-000000067F00004005000060F30004E7A062__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000037E20-000000067F0000400500EB4A480000057D31__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400501101C0901FFFFFFFF-030000000000000000000000000000000002__0000012E71CF31F9-000001334140FC21":{"file_size":65060864,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B10000-000000067F00004005000060F70100000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":272646144,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E104B-000000067F00004005000060F3000570A19E__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300059790CD-000
000067F00004005000060F300059AA115__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B54000-000000067F00004005000060F70000B90000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300041D9101-000000067F00004005000060F3000424A099__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700000E085E-000000067F00004005000060F70000100E18__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300051B0000-000000067F00004005000060F300051B4000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572A4E1-000001BCB572C329":{"file_size":24576,"generation":17,"shard":"0008"},"000000067F00004005000060F30006D30000-000000067F00004005000060F30006D34000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FDA1F80000020D42-000000067F0000400500FDA1F80100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081D80C0100000000-000000067F000040050081DB430000024000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F600000235B4-000000067F00004005000060F60100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500C782E400000A0000-000000067F0000400500C782E400000A4000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002264247-000000067F00004005000060F50100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000302C2D6-000000067F00004005000060F50100000000__000000DBD29DC248":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000129C000-000000067F00004005016EA00C0001
340000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700016E8000-000000067F00004005000060F700016EC000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300023A0000-000000067F00004005000060F300023B0FF7__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F20100000000-000000067F00004005000060F3000000C000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0000374000-000000067F00004005016EA00C00003E0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000368000-000000067F00004005000060F80100000000__0000003203FB5749-0000003579F03331":{"file_size":263249920,"generation":2,"shard":"0008"},"000000067F000040050081DB4300006310C9-030000000000000000000000000000000002__0000009A1ABDE921-0000009DF02C1241":{"file_size":208953344,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DC8000-000000067F00004005000060FB0000DE8B45__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000530000-000000067F00004005000060FB0000538B44__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000024000-000000067F000040050081DB430000028000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000488C000-000000067F00004005000060F30004898000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300044D3639-000000067F00004005000060F50100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005010450640000000570-000000067F0000400501046F39000000BDD2__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300021050B0-000000
067F00004005000060F3000212E160__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700010DD440-000000067F00004005000060F80100000000__000000F309FCDD19-000000F6661C9241":{"file_size":91758592,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000AD0B45-000000067F00004005000060FB0000AE168A__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000013B18E-000000067F00004005000060F7000014B73D__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001938000-000000067F00004005016EA00C000193FE9D__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500C782E400000A4000-000000067F0000400500C782E4000012A71E__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001A40000-000000067F00004005000060F30001A44000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00008578D4-000000067F00004005016EA00C00008CF772__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001CC0000-000000067F00004005000060F30001CC4000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004D20000-000000067F00004005000060F30004D24000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003E8000-000000067F00004005016EA00C00003EC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300039C4000-000000067F00004005000060F300039F8000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005164000-000000067F00004005000060F300051B0000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004
005000060F300039F8000-000000067F00004005000060F300039FC000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010F46BD-000000067F000040050081DB430100000000__000000D31E48D7C9-000000D74E29AAD1":{"file_size":113999872,"generation":2,"shard":"0008"},"000000067F00004005000060F30002E630CF-000000067F00004005000060F30100000000__000000D31E48D7C9-000000D74E29AAD1":{"file_size":171999232,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000ACF305-000000067F00004005016EA00C0000ADF1AB__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006748000-000000067F00004005000060F3000674C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003810000-000000067F00004005000060F50100000000__00000104BD37F348":{"file_size":11739136,"generation":2,"shard":"0008"},"000000067F00004005000060F1000021C000-000000067F00004005000060F20100000000__0000002427BD8BD0":{"file_size":132448256,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017EC000-000000067F00004005016EA00C00018C0000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F7000025DA3C-000000067F00004005000060F80100000000__0000002427BD8BD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00007F0000-000000067F00004005000060FB0000860B45__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF0000-000000067F00004005000060F30003FF4000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E0AD15-000000067F00004005000060FB0000E1B859__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010ADFA80000004000-000000067F00004005010F2BD40100000000__00000126C3C69FC0":{"file_size":13369344,"generation":2,"shard":"0008"},"000000067F
00004005000060F30004898000-000000067F00004005000060F3000489C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D2B1B0-000000067F00004005000060F30003D44283__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000FF4000-000000067F00004005016EA00C0001188000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005010F99A50100000000-000000067F00004005010F9F120000004000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F34000-000000067F00004005000060F30001F38F48__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700018A0000-000000067F00004005000060F700018D85CA__000001440D3D0C69-0000014784964B91":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300029A526C-000000067F00004005000060F300029C623C__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00017DC000-000000067F00004005000060FB0001808000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000024000-000000067F0000400500DBCED50000028000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000201716-000000067F0000400500EE16C40100000000__0000012A77C1B0B0":{"file_size":32768,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D10000-000000067F00004005000060F30006D14000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430001064000-000000067F000040050081DB4300010A0000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C01FFFFFFFF-000000067F0000400500F3A25C0300000000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F
30001340000-000000067F00004005000060F30001344000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003E98000-000000067F00004005000060F30003EA902F__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C0E146-000000067F00004005000060F30006C8729E__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F600000166C4-000000067F00004005000060F60100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":54165504,"generation":2,"shard":"0008"},"000000067F00004005000060F10000180000-000000067F00004005000060F1000018821A__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000193FE9D-000000067F00004005016EA00C0001967D34__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB43000076C000-000000067F000040050081DB430000778000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050321C0-000000067F00004005000060F30005063187__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000D4000-000000067F0000400500DBCED500000F0000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300004B8000-000000067F00004005000060F300004BC000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000022C000-000000067F00004005000060FB0000280000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DF968A-000000067F00004005000060FB0000E021D0__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000228000-000000067F00004005000060FB000022C000__0000000D80565628":{"file_size":134422528,"generation":2
,"shard":"0008"},"000000067F00004005000060FB00015D8000-000000067F00004005000060FB00015DC000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B89170-000000067F00004005000060F30005BA213F__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B0000-000000067F00004005000060F300043B4000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004F8000-000000067F000040050081DB4300004FC000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006860000-000000067F00004005000060F30006864000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000ADA0D0-000000067F00004005000060F30000B0300C__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FF2A510000000000-000000067F000040050100D04D000004369C__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000BB439-030000000000000000000000000000000002__00000104BD37F348":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001C078FA-000000067F00004005016EA00C0001C0F79A__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB430000B4A075-000000067F000040050081DB430000B7C0EA__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000117C10C-000000067F00004005000060F50100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000E47BD2-000000067F00004005016EA00C0000E67A6E__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30005D23BB5-000000067F00004005000060F50100000000__00000164EA9EC9A8":{"file_size":245
76,"generation":2,"shard":"0008"},"000000067F00004005000060F3000336D193-000000067F00004005000060F3000337DCF3__000000E4C63CFA21-000000E7C2F1B249":{"file_size":259473408,"generation":2,"shard":"0008"},"000000067F00004005000060F300001F0000-000000067F00004005000060F300001F4000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000084772-030000000000000000000000000000000002__000000027AF9D7D0":{"file_size":147456,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0001CE79E0-000000067F00004005016EA00C0001D1F87B__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F0000400500EB4A4800FFFFFFFF-000000067F0000400500EB4A480100000000__000000FF8B261599-000001048B25A8E9":{"file_size":1318912,"generation":2,"shard":"0008"},"000000067F00004005000060F70000488000-000000067F00004005000060F7000048C000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000ADF1AB-000000067F00004005016EA00C0100000000__00000196C9018F59-0000019A2EAFE7A9":{"file_size":282132480,"generation":11,"shard":"0008"},"000000067F00004005000060FB000071C000-000000067F00004005000060FB0000793506__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006850000-000000067F00004005000060F30006854000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000390000-000000067F000040050081DB430000394000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000020C000-000000067F00004005000060F30000250000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001398000-000000067F00004005000060FB000139C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003648000-000000067F00004005000060F3000364C000__000000F91FE84F08":{"file_size":134422528,"generat
ion":2,"shard":"0008"},"000000067F0000400500C782E400001C7B51-000000067F0000400500C782E4000023FA62__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001788000-000000067F00004005016EA00C000178C000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000C3A075-000000067F000040050081DB430000C440EA__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300036FE561-000000067F00004005000060F300038075AF__000000FF8B261599-000001048B25A8E9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D03000004FDC6-000000067F0000400500D19D030000067CA9__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C00000-000000067F00004005000060FB0000C04000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000282C000-000000067F00004005000060F3000283C3E7__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006B0000-000000067F00004005016EA00C00006B4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001789027-000000067F00004005000060F300017AA0CE__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004558000-000000067F00004005000060F300045C1062__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C08000-000000067F00004005000060FB0000C0C000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DCC000-000000067F00004005000060F30006DF0000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B221FE-000000067F00004005000060F30004B2B250__0000013C9C0E3339-0000
013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018C4000-000000067F00004005016EA00C00018E0000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000564000-000000067F000040050081DB430000578000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000274A080-000000067F00004005000060F30100000000__000000B2B5C4E8F9-000000B768469051":{"file_size":199057408,"generation":2,"shard":"0008"},"000000067F00004005000060F300046D0EA8-000000067F00004005000060F3000471200E__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001114000-000000067F00004005000060FB0001120000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FEC000-000000067F00004005000060F30003FF0000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000368000-000000067F00004005000060F10100000000__0000003959DA2DE9-0000003D03FCCDB9":{"file_size":269967360,"generation":2,"shard":"0008"},"000000067F0000400500C782E4000012A71E-030000000000000000000000000000000002__000000D037B2DBD0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C98000-000000067F00004005000060F30006C9C000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300055BC000-000000067F00004005000060F30005610000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000F050F2-030000000000000000000000000000000002__00000047F1F2B800":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30002484000-000000067F00004005000060F300024D8000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FE8000-000000067F00004005000060F30003FEC000__00000126C3C69FC0":{"file_size":134
422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000A8000-000000067F0000400500DBCED500000AC000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700006C3D76-000000067F00004005000060F80100000000__000000663565F8C9-000000698AF6E809":{"file_size":139821056,"generation":2,"shard":"0008"},"000000067F00004005000060F30002534000-000000067F00004005000060F3000253B7A3__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000412D27C-000000067F00004005000060F30004156457__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000910000-000000067F00004005000060F700009385D4__0000008DBE2855F9-000000923719A971":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30002510000-000000067F00004005000060F30002514000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002210000-000000067F00004005000060F30002214000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF4000-000000067F00004005000060F30004070000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001BBFA66-000000067F00004005016EA00C0001C078FA__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000424A099-000000067F00004005000060F3000428313F__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300036F91FE-000000067F00004005000060F30100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":164118528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000718000-000000067F00004005000060FB000071C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005010F44EB000000C000-000000067F00004005010F44EB0100000000__0000012
6C3C69FC0":{"file_size":70696960,"generation":2,"shard":"0008"},"000000067F00004005000060F30005214000-000000067F00004005000060F30005240000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000A7AF6E-030000000000000000000000000000000002__000000321AA80270":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30005063187-000000067F00004005000060F300050A412B__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005E8000-000000067F00004005000060F100005F821C__000000636DE92159-000000663565F8C9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300020830BE-000000067F00004005000060F300020FC052__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300065BB235-000000067F00004005000060F300065F42B4__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD30000034000-000000067F0000400500FA2AD3000004D85C__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017A8000-000000067F00004005016EA00C00017AC000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB00008D8000-000000067F00004005000060FB0000928B45__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000798000-000000067F00004005000060F300007C1007__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D030000040000-000000067F0000400500D19D030000047EE2__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001AB1583-000000067F00004005000060F50100000000__00000081AA3C40F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001AD8000-000000067
F00004005000060F30001B09104__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E1B859-030000000000000000000000000000000002__000000417D21ACF9-00000044B4679349":{"file_size":156844032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E9C000-000000067F00004005000060FB0001EA8000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001374000-000000067F00004005000060FB0001398000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000155C000-000000067F00004005000060FB0001590000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000EA069-000000067F0000400500F3A25C000010C0D1__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000568C000-000000067F00004005000060F30005698000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C74000-000000067F00004005000060FB0000C98000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700004F0000-000000067F00004005000060F80100000000__00000047E31D98D1-0000004C49155071":{"file_size":264921088,"generation":2,"shard":"0008"},"000000067F00004005000060F30005598000-000000067F00004005000060F3000559C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001429534-000000067F00004005000060F80100000000__00000122A7BB7B29-0000012694E36301":{"file_size":231964672,"generation":2,"shard":"0008"},"000000067F00004005000060F70000780000-000000067F00004005000060F80100000000__000000722F474369-00000075CC373F31":{"file_size":263340032,"generation":2,"shard":"0008"},"000000067F00004005000060F300019F31AA-000000067F00004005000060F30100000000__00000079F2A2F311-0000007E3A9BFD29":{"file_size":168484864,"generation":2,"shard":"0008"},"000
000067F000040050081DB430000822079-000000067F000040050081DB43000082C0F1__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007AC000-000000067F000040050081DB4300007F913A__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005847319-000000067F00004005000060F300058C8000__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":261505024,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E21687-000000067F00004005000060FB0100000000__000000923719A971-00000096262826C9":{"file_size":224403456,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C98000-000000067F00004005000060F30003CB8FCF__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000045029C-030000000000000000000000000000000002__0000008DBE2855F9-000000923719A971":{"file_size":89505792,"generation":2,"shard":"0008"},"000000067F00004005000060F3000559C000-000000067F00004005000060F300055B8000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000285901B-000000067F00004005000060F300028920E4__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E64000-000000067F00004005000060F30000E70000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015FB022-000000067F00004005000060F3000160410C__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006FDA081-000000067F00004005000060F30100000000__00000184624E5741-000001860C80A151":{"file_size":202276864,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000107973-000000067F0000400500EE16BC0100000000__000000F309FCDD19-000000F6661C9241":{"file_size":275456000,"generation":2,"shard":"0008"},"000000067F00004005000060F300031C40D1-000000067F00004005000060F300031D516C__
000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00001F7D38-000000067F00004005016EA00C000020FBCF__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FDA1F80100000000-000000067F0000400500FF2A510000004000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001182EC9-000000067F00004005000060F80100000000__000000FF8B261599-000001048B25A8E9":{"file_size":174284800,"generation":2,"shard":"0008"},"000000067F00004005000060F700011528FB-000000067F00004005000060F70001182EC9__000000FF8B261599-000001048B25A8E9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300024DC000-000000067F00004005000060F30002510000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00000B0000-030000000000000000000000000000000002__000000021DC73119-000000044854EBD1":{"file_size":259375104,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001DF0B43-000000067F00004005000060FB0001E21687__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000088000-000000067F00004005000060F10000090000__00000008B6B51879-0000000D55A212C9":{"file_size":264142848,"generation":2,"shard":"0008"},"000000067F00004005000060F30003968000-000000067F00004005000060F3000396C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017AC000-000000067F00004005016EA00C00017E8000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F1000019C73D-000000067F00004005000060F20100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":124698624,"generation":2,"shard":"0008"},"000000067F00004005000060F700001F8000-000000067F00004005000060F700002005D2__0000001B59EEB909-0000001FFBC01501":{"file_size":268460032,"generation":2,"shard":"0008
"},"000000067F00004005000060FB0001110000-000000067F00004005000060FB0001114000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F1000019842A-000000067F00004005000060F20100000000__0000001737D88379-0000001B59EEB909":{"file_size":145137664,"generation":2,"shard":"0008"},"000000067F00004005000060F700003BC000-000000067F00004005000060F700003C0000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000280000-000000067F00004005000060FB0000284000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED5000007C000-000000067F0000400500DBCED500000A8000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB5732691-000001BCB5734CD9":{"file_size":24576,"generation":239,"shard":"0008"},"000000067F00004005010660F70100000000-000000067F000040050107B547000006C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000C24000-000000067F00004005000060F30000CA0000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000569C000-000000067F00004005000060F300056D8000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00000C7A73-030000000000000000000000000000000002__0000018624969469-000001880F984A29":{"file_size":40566784,"generation":11,"shard":"0008"},"000000067F00004005000060F30001344000-000000067F00004005000060F30001358000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F38F48-000000067F00004005000060F50100000000__0000009A24DF6768":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001760000-000000067F00004005000060F30001789027__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F
00004005000060F1000018821A-000000067F00004005000060F1000019842A__0000001737D88379-0000001B59EEB909":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300059AA115-000000067F00004005000060F300059B324D__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001400000-000000067F00004005000060FB0001404000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800000E7A62-000000067F0000400500EB4A480000107973__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000498000-000000067F00004005000060F3000049C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D24000-000000067F00004005000060F70000D38000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000120E409-000000067F000040050081DB430300000000__0000018613F0A050":{"file_size":24576,"generation":3,"shard":"0008"},"000000067F00004005000060FB0001A8A1CD-000000067F00004005000060FB0100000000__0000007E3A9BFD29-0000008196C976A1":{"file_size":199622656,"generation":2,"shard":"0008"},"000000067F00004005000060F30006270000-000000067F00004005000060F50100000000__0000016E41E03CA0":{"file_size":71114752,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000BAAD15-030000000000000000000000000000000002__0000003579F03331-0000003959DA2DE9":{"file_size":182321152,"generation":2,"shard":"0008"},"000000067F00004005000060F700016205B5-000000067F00004005000060F80100000000__0000012E71CF31F9-000001334140FC21":{"file_size":266862592,"generation":2,"shard":"0008"},"000000067F00004005000060F300030C0FE5-000000067F00004005000060F30003102107__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004BC000-000000067F00004005016EA00C00004E8000__000001936E73D028":{"file_size":134422528,"gen
eration":11,"shard":"0008"},"000000067F00004005000060F10000440000-000000067F00004005000060F1000046821B__00000047E31D98D1-0000004C49155071":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB4300009C8000-000000067F000040050081DB4300009CC000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000106C000-000000067F00004005000060F700010AABC7__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000367733F-000000067F00004005000060F50100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000478000-000000067F00004005016EA00C000047C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002E4104A-000000067F00004005000060F30002E4A157__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001370000-000000067F00004005000060FB0001374000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B1111A-000000067F00004005000060F30004B221FE__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C000-000000067F00004005016EA00C00002D0000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30001C3C000-000000067F00004005000060F30001CC0000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000136C000-000000067F00004005000060FB0001370000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000488000-000000067F00004005000060F10100000000__0000004C49155071-0000004F31878919":{"file_size":268754944,"generation":2,"shard":"0008"},"000000067F00004005000060F30000B0300C-000000067F00004005000060F60100000000__0000003203FB5749-0000003579F03331":{"file_size":212885
504,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001C0F79A-000000067F00004005016EA00C0001C3F636__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000399C000-000000067F00004005000060F300039A0000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001574000-000000067F00004005000060F700015A195C__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B00697-000000067F00004005000060F30100000000__0000015DD1D3C809-0000016143292911":{"file_size":282025984,"generation":2,"shard":"0008"},"000000067F00004005000060F300050C8000-000000067F00004005000060F300050CC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700000885C5-000000067F00004005000060F80100000000__000000044854EBD1-00000008B6B51879":{"file_size":253878272,"generation":2,"shard":"0008"},"000000067F00004005000060F30001407F7A-000000067F00004005000060F50100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B90000-000000067F00004005000060F70000B94000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000560000-000000067F000040050081DB430000564000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001720000-000000067F00004005000060F700017405D4__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300043CC000-000000067F00004005000060F300043F8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000129D29A-000000067F00004005000060F30100000000__00000057593D8169-0000005C01565329":{"file_size":110788608,"generation":2,"shard":"0008"},"000000067F00004005000060F300003F9F83-000000067F00004005000060F30000402F4A__000000114A805939-00000013FB921C
81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001940000-000000067F00004005000060F700019685CE__0000014784964B91-0000014B000D1821":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B8000-000000067F00004005000060F300043BC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000370FD1-000000067F00004005000060F60100000000__0000000D55A212C9-000000114A805939":{"file_size":232144896,"generation":2,"shard":"0008"},"000000067F00004005000060F30003849093-000000067F00004005000060F300038720A2__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100003C0432-000000067F00004005000060F20100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":262701056,"generation":2,"shard":"0008"},"000000067F00004005000060F700014F85DF-000000067F00004005000060F70001510BBE__0000012694E36301-0000012A3F140591":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000253B7A3-000000067F00004005000060F50100000000__000000AFE87558B0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001404000-000000067F00004005000060FB0001408000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003F942CF-000000067F00004005000060F30003FCD352__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B38000-000000067F00004005000060FB0000B58B45__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B505C8-000000067F00004005000060F80100000000__000000A9EB8C4489-000000ACA44C8E99":{"file_size":226459648,"generation":2,"shard":"0008"},"000000067F00004005000060F3000612D506-000000067F00004005000060F30006166575__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040
05000060F700000DC000-000000067F00004005000060F700000E0000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D31000000C000-000000067F0000400500FB3D310000018000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C329-000001BCB572C481":{"file_size":24576,"generation":19,"shard":"0008"},"000000067F00004005000060F30002828000-000000067F00004005000060F3000282C000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015B0000-000000067F00004005000060F300015B4000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000078000-000000067F0000400500DBCED5000007C000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000086E169-030000000000000000000000000000000002__000000A583FBFB91-000000A9EB8C4489":{"file_size":77471744,"generation":2,"shard":"0008"},"000000067F0000400501046F39000000BDD2-000000067F00004005010660F500000161F7__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D3101FFFFFFFF-000000067F0000400500FB3D310300000000__00000122A7BB7B29-0000012694E36301":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00000F28ED-030000000000000000000000000000000002__000000F91FE84F08":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E9307A-000000067F00004005000060F30004EA41A5__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00016D21CF-030000000000000000000000000000000002__000000698AF6E809-0000006DDB29D589":{"file_size":226353152,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800001876BD-000000067F0000400500EB4A48000018F5CD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"
shard":"0008"},"000000067F0000400500C782E400002E5B84-030000000000000000000000000000000002__000000DBD29DC248":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D8985C-000000067F00004005000060F70000DA1E38__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C28000-000000067F000040050081DB430000C3A075__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000407201D-000000067F00004005000060F300040E319D__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000002B3CE-000000067F00004005000060F60100000000__00000075E5D2A930":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D60000-000000067F00004005000060F80100000000__000000C483D0D6B8":{"file_size":133947392,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F705D6-000000067F00004005000060F80100000000__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":259842048,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E7A062-000000067F00004005000060F30004E9307A__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006810000-000000067F00004005000060F30006814000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700007D05C8-000000067F00004005000060F80100000000__00000075CC373F31-00000079F2A2F311":{"file_size":251740160,"generation":2,"shard":"0008"},"000000067F00004005000000000000000001-000000067F0000400500000A690000000002__0000018624969469-000001880F984A29":{"file_size":40960,"generation":11,"shard":"0008"},"000000067F00004005000060FB00014D8000-000000067F00004005000060FB0001530B44__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001EA8000-000000067F00004005000060FB0001EAC000__0000009
A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000230A0C7-000000067F00004005000060F30100000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":213680128,"generation":2,"shard":"0008"},"000000067F00004005000060F30000A98000-000000067F00004005000060F30000AC9024__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003F72201-000000067F00004005000060F30003F7B254__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000498000-000000067F00004005016EA00C000049C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004CB8000-000000067F00004005000060F30004CBC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300042CC1BD-000000067F00004005000060F300042D51D6__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D310000028681-000000067F0000400500FB3D320100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000474302B-000000067F00004005000060F300047EC0CA__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003204000-000000067F00004005000060F30003278000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300024020ED-000000067F00004005000060F3000240B12A__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000216C000-000000067F00004005000060F30002170000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000005DD43-000000067F00004005000060F60100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000348B45-000000067F00004005000060FB
000037968A__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000778000-000000067F000040050081DB43000077C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300011B4000-000000067F000040050081DB43000120E409__000000DBD29DC248":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003CCA0B9-000000067F00004005000060F30003D0B155__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00009D4000-000000067F00004005000060FB0000A7AF6E__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700008F0000-000000067F00004005000060F80100000000__00000089F4693119-0000008DBE2855F9":{"file_size":262905856,"generation":2,"shard":"0008"},"000000067F00004005000060F30006CA0000-000000067F00004005000060F30006CA4000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E021D0-000000067F00004005000060FB0000E0AD15__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003654000-000000067F00004005000060F3000367733F__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000DC0000-000000067F00004005000060F70000DE05C8__000000C824C09619-000000CC13D2E549":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F700018D85CA-000000067F00004005000060F80100000000__000001440D3D0C69-0000014784964B91":{"file_size":260775936,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EAC000-000000067F00004005000060FB0000EB8000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E70000-000000067F00004005000060F30000E74000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FE621A-
000000067F00004005000060F30005FFF23F__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D20000-000000067F00004005000060F70000D24000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005244000-000000067F00004005000060F3000525C065__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400501025D9001FFFFFFFF-000000067F0000400501025D900300000000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CD4000-000000067F00004005000060F30001CE0000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000E77906-000000067F00004005016EA00C0000E7F7A7__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300046B41AA-000000067F00004005000060F30100000000__0000012E71CF31F9-000001334140FC21":{"file_size":199688192,"generation":2,"shard":"0008"},"000000067F000040050100D04D00000634BB-030000000000000000000000000000000002__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":173744128,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CA4000-000000067F00004005000060F30000CB16B6__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DDC000-000000067F00004005000060F30004DF086C__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D7F2DE-000000067F00004005000060F30005DA03A8__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300048A0000-000000067F00004005000060F300048A4000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100003954D3-000000067F00004005000060F20100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005
000060F300043BC000-000000067F00004005000060F300043C8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D1C000-000000067F00004005016EA00C0001D78000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F100000D8000-000000067F00004005000060F100000E021B__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300060A0282-000000067F00004005000060F300060A93B5__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000021D8F8-000000067F00004005000060F20100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":88227840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000018000-000000067F00004005000060F3000001C000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB430000E48000-000000067F000040050081DB430000E4C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300003E8FBC-000000067F00004005000060F300003F9F83__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004868000-000000067F00004005000060F3000486C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700013D0000-000000067F00004005000060F700013E85D1__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001203856-030000000000000000000000000000000002__0000005413AB3641-00000057593D8169":{"file_size":157130752,"generation":2,"shard":"0008"},"000000067F00004005000060F3000029C000-000000067F00004005000060F300002C4887__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005160000-000000067F00004005000060F30005164000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"
0008"},"000000067F0000400500FB3D31000001C000-000000067F0000400500FB3D310000028681__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029F90B-000000067F00004005016EA00C00002D77AE__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30003620000-000000067F00004005000060F30100000000__000000F309FCDD19-000000F6661C9241":{"file_size":249372672,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B90000-000000067F00004005000060F30003B94000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300001F4000-000000067F00004005000060F30000208000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001BB8000-000000067F00004005000060F30001C00FE1__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005210000-000000067F00004005000060F30005214000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002070F71-000000067F00004005000060F30002079FDE__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000B40000-000000067F00004005000060F30000BB103B__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000290000-000000067F00004005000060F10000298000__00000028C365FBE1-0000002D2A8E0B81":{"file_size":264134656,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00007C7B9C-000000067F00004005016EA00C0000807A34__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001548000-000000067F00004005000060FB000154C000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100005FC000-000000067F00004005000060F1000062EE46__000000698F2C3A38":{"
file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001A0000-000000067F0000400500EE16BC00001A4000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F94000-000000067F00004005016EA00C0000F98000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000290000-000000067F00004005000060F80100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":265764864,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BC0B44-000000067F00004005000060FB0001BD1689__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000337DCF2-000000067F00004005000060F30003386D10__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300045C1062-000000067F00004005000060F3000460202F__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006814000-000000067F00004005000060F30006850000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000073DFA8-000000067F00004005016EA00C000079FCFA__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000178C000-000000067F00004005016EA00C00017A8000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F1000051D1AE-000000067F00004005000060F20100000000__00000057593D8169-0000005C01565329":{"file_size":103145472,"generation":2,"shard":"0008"},"000000067F00004005000060F300034BD86C-000000067F00004005000060F30100000000__000000EBC9213D59-000000EFA7EAA9E1":{"file_size":95617024,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000008000-000000067F00004005016EA00C000000FEA0__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F1000014
C000-000000067F00004005000060F1000015F545__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300000000EAB-000000067F0000400500FB3D300100000000__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":12976128,"generation":2,"shard":"0008"},"000000067F000040050081DB430000028000-000000067F000040050081DB43000002C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BD1689-000000067F00004005000060FB0100000000__0000008625CF2891-00000089F4693119":{"file_size":223690752,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000000000-000000067F0000400500EB4A480000000001__000000FF8B261599-000001048B25A8E9":{"file_size":32768,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D952B0-000000067F00004005000060F30003DAE2DC__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B30000-000000067F00004005000060F70000B505C8__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000549D0A6-000000067F00004005000060F300055861F2__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000046821B-000000067F00004005000060F20100000000__00000047E31D98D1-0000004C49155071":{"file_size":266969088,"generation":2,"shard":"0008"},"000000067F00004005000060F300043C8000-000000067F00004005000060F300043CC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E720A2-000000067F00004005000060F30100000000__000000923719A971-00000096262826C9":{"file_size":141344768,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003A8000-000000067F000040050081DB4300003AC000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700006AB7A6-000000067F00004005000060F700006C3D76__000000663565F8C9-000000698AF6E809":{"file_size":26
8451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000570A19E-000000067F00004005000060F3000573B206__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003AF28CB-000000067F00004005000060F30003B33945__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015CC000-000000067F00004005000060FB00015D8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000A9CFB-000000067F0000400500D69D7900000D1C5F__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A30000-000000067F00004005000060F30002A34000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000047C000-000000067F00004005000060F30000498000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FFF23F-000000067F00004005000060F300060A0282__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C194-000000067F00004005016EA00C00004EF809__0000018EC67807C9-000001935283F9B9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006D64000-000000067F00004005000060F30006DC8000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001340000-000000067F00004005016EA00C0001344000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000BB0000-000000067F00004005016EA00C0000BB4000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000000000-000000067F0000400500EB4A480000007F0F__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000114000-000000067F0000400500E3A2A1000016321A__0000
00EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000578000-030000000000000000000000000000000002__0000009A24DF6768":{"file_size":107642880,"generation":2,"shard":"0008"},"000000067F00004005000060F30006798000-000000067F00004005000060F3000679C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100000E021B-000000067F00004005000060F1000010043F__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB430000DA8000-030000000000000000000000000000000002__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":233201664,"generation":2,"shard":"0008"},"000000067F00004005000060F100004EC079-000000067F00004005000060F20100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F7000170C000-000000067F00004005000060F70001720000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FCD85E-000000067F00004005000060F80100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00015B74FF-000000067F00004005016EA00C00015FF3A0__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30000AC9024-000000067F00004005000060F30000ADA0D0__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16C40100000000-000000067F0000400500F3A25C000006C000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000F1B5B-000000067F0000400500D69D790100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":233275392,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C0C000-000000067F00004005000060F30003C257AD__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E44000-000000067F00004005000060F30000E60000__00
000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000018E4B6-000000067F00004005000060F7000019EA78__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017E8000-000000067F00004005016EA00C00017EC000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003A4C09C-000000067F00004005000060F30003A6D1B3__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100000260F2-000000067F00004005000060F20100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0000097BDA-000000067F00004005016EA00C00000C7A73__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500C782E400002CDD5C-030000000000000000000000000000000002__000000D31E48D7C9-000000D74E29AAD1":{"file_size":90923008,"generation":2,"shard":"0008"},"000000067F00004005000060F3000685C000-000000067F00004005000060F30006860000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001C84000-000000067F00004005000060FB0001CE16ED__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000CC4BC2-000000067F000040050081DB430000CD6C36__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006349DA2-000000067F00004005000060F30006382F14__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000212E160-000000067F00004005000060F30100000000__0000009DF02C1241-000000A173C00489":{"file_size":224731136,"generation":2,"shard":"0008"},"000000067F00004005000060F30001FF8691-000000067F00004005000060F30100000000__0000009A1ABDE921-0000009DF02C1241":{"file_size":256114688,"generation":2,"shard":"0008"},"000000067F0000400500
0060F300067F4000-000000067F00004005000060F30006810000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700015A8000-000000067F00004005000060F700016205B5__0000012E71CF31F9-000001334140FC21":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000024000-000000067F0000400500D69D790000028000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700007AE010-000000067F00004005000060F80100000000__00000075E5D2A930":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000428000-000000067F00004005016EA00C000042C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001E74000-000000067F00004005000060F30001F28000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038FF04F-000000067F00004005000060F30100000000__0000010779A7F551-0000010A5E65DF39":{"file_size":45359104,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001B0FD2A-000000067F00004005016EA00C0001B4FBC9__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006858000-000000067F00004005000060F3000685C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002F9A0EB-000000067F00004005000060F30002FD317C__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000808000-000000067F000040050081DB430000822079__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015DC000-000000067F00004005000060FB00015F0000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000021C000-000000067F00004005000060F7000025DA3C__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000
400500D69D79000007C000-000000067F0000400500D69D7900000A8000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000001EE3D-000000067F00004005000060F60100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000F4E15B-030000000000000000000000000000000002__000000C462B3C2A9-000000C824C09619":{"file_size":73662464,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F28000-000000067F00004005000060F30001F2C000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001F1DA6-030000000000000000000000000000000002__00000081AA3C40F0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F70001758B92-000000067F00004005000060F70001771169__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000010000-000000067F0000400500E3A2A10000017F02__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A98000-000000067F00004005000060F30002A9C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000573B206-000000067F00004005000060F300057942F4__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000860B45-030000000000000000000000000000000002__00000023FEF9F321-00000028C365FBE1":{"file_size":252788736,"generation":2,"shard":"0008"},"000000067F00004005000060F7000090B929-000000067F00004005000060F80100000000__0000008DDCD70B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F7000014B73D-000000067F00004005000060F80100000000__000000114A805939-00000013FB921C81":{"file_size":146432000,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D3C000-000000067F00004005000060F70000D60000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"
000000067F00004005000060F70001514000-000000067F00004005000060F70001528000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001764000-000000067F00004005016EA00C0001788000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001358000-000000067F00004005000060F3000135C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001594000-000000067F00004005000060FB00015C8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300004AC000-000000067F00004005000060F300004B8000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005610000-000000067F00004005000060F30005614000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002794000-000000067F00004005000060F300027C0000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C60000-000000067F00004005000060F30004C64000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003A0000-000000067F00004005000060F700003B85C7__0000003579F03331-0000003959DA2DE9":{"file_size":268468224,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F1034-030000000000000000000000000000000002__000000E4C63CFA21-000000E7C2F1B249":{"file_size":247480320,"generation":2,"shard":"0008"},"000000067F00004005000060F300051B4000-000000067F00004005000060F300051F0000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000003C77D-000000067F00004005000060F60100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005010660F500000161F7-030000000000000000000000000000000002__0000010FB1BE19B9-00000113456156F1":{"file_size":64757760,"generation":2,"shard":"0008"},"000000067F00004005000060F3000
3F7B254-000000067F00004005000060F30003F942CF__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004900000-000000067F00004005000060F30004904000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006F18000-000000067F00004005000060F30006F1C000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A21037-000000067F00004005000060F30003A31FB6__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000DB0000-000000067F00004005000060F30000E40F86__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001A60B43-000000067F00004005000060FB0001A71688__0000007E3A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DC8000-000000067F00004005000060F30006DCC000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700006E38F6-000000067F00004005000060F80100000000__000000698F2C3A38":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000122B1C9-000000067F00004005000060F300012442A9__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EA8000-000000067F00004005000060FB0000EAC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B5A072-000000067F00004005000060F80100000000__00000159B010F6C0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000144DCA3-000000067F00004005016EA00C000151F7C5__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F600000711FF-000000067F00004005000060F60100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000
4005000060F300050EC000-000000067F00004005000060F30005138000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005260000-000000067F00004005000060F30005290FC9__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700012DE407-000000067F00004005000060F80100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F10000-000000067F00004005000060F70000F185D4__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D38000-000000067F00004005000060F70000D3C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000006671F-000000067F00004005000060F60100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300059F53C6-000000067F00004005000060F30005A16504__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000B08000-000000067F000040050081DB430000B4A075__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000152C000-000000067F00004005000060F70001570000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000128000-000000067F00004005000060F3000012C000__0000018624969468":{"file_size":134422528,"generation":7,"shard":"0008"},"000000067F00004005000060F70000E24000-000000067F00004005000060F70000E387D6__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300002791D8-000000067F000040050081DB43000028B253__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F600000500F7-000000067F00004005000060F60100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400
5000060F70000ABD9C4-000000067F00004005000060F80100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB4300009CC000-000000067F000040050081DB430000A10000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700002005D2-000000067F00004005000060F80100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":261169152,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001AA656E-000000067F000040050081D80C0100000000__00000081AA3C40F0":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E14000-000000067F000040050081DB430000E48000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003DD734C-000000067F00004005000060F30003E40000__0000011B632CC319-0000011F1A40FA69":{"file_size":261046272,"generation":2,"shard":"0008"},"000000067F0000400500D19D0300FFFFFFFF-030000000000000000000000000000000002__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":5373952,"generation":2,"shard":"0008"},"000000067F00004005000060F30001588000-000000067F00004005000060F3000158C000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000AC000-000000067F0000400500DBCED500000D0000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000013F89B-000000067F0000400500EB4A48000014F7AC__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300005D704F-000000067F000040050081DB4300006310C9__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A14000-000000067F000040050081DB430000A18000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F574A6-000000067F00004005000060F30001FF8691__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"}
,"000000067F0000400500FB3D320100000000-000000067F0000400500FDA1F80000014000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001B09104-000000067F00004005000060F30001B4A119__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005011035750100000000-030000000000000000000000000000000002__00000159B010F6C0":{"file_size":78626816,"generation":2,"shard":"0008"},"000000067F00004005000060F1000015F545-000000067F00004005000060F20100000000__000000174479FC18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000638C06D-000000067F00004005000060F300063A50CD__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000299C28F-000000067F00004005000060F300029A526C__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000364C000-000000067F00004005000060F30003650000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CE0000-000000067F00004005016EA00C0000CE4000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000794000-000000067F000040050081DB4300007A8000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A18000-000000067F000040050081DB430000A1C000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000000C000-000000067F00004005000060F30000018000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB4300000D40FF-030000000000000000000000000000000002__00000075CC373F31-00000079F2A2F311":{"file_size":78061568,"generation":2,"shard":"0008"},"000000067F00004005000060F60000099FD8-000000067F00004005000060F60100000000__00000159B010F6C0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004
005000060F3000330A1C8-000000067F00004005000060F3000332B1B6__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006FA900D-000000067F00004005000060F30006FDA081__00000184624E5741-000001860C80A151":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000148AC30-000000067F00004005000060FB000149B774__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C01FFFFFFFF-000000067F0000400500F3A25C0300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30000EF1FC3-000000067F00004005000060F50100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006A7C566-000000067F00004005000060F30100000000__00000178B8B10551-0000017C9F5597E1":{"file_size":173072384,"generation":2,"shard":"0008"},"000000067F00004005000060FB000104B856-000000067F00004005000060FB000107C39B__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000030000-000000067F00004005000060F80100000000__000000021DC73119-000000044854EBD1":{"file_size":261341184,"generation":2,"shard":"0008"},"000000067F00004005000060F30003580FD3-000000067F00004005000060F30100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":228188160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001224000-000000067F00004005000060F70001232ACF__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300022B9050-000000067F00004005000060F3000230A0C7__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006654000-000000067F00004005000060F30006670000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700010D0000-000000067F00004005000060F700010D85CF__000000EFA7EAA9E1-00
0000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000FD8000-030000000000000000000000000000000002__000000C824C09619-000000CC13D2E549":{"file_size":237559808,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015F0000-000000067F00004005000060FB00015F4000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60100000000-000000067F00004005000060F70000004000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F70000DA1E38-000000067F00004005000060F80100000000__000000C462B3C2A9-000000C824C09619":{"file_size":209821696,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D76250-000000067F00004005000060F30005D7F2DE__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000418000-000000067F00004005000060F10100000000__00000044B4679349-00000047E31D98D1":{"file_size":269148160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B61000-000000067F00004005000060F80100000000__0000018613F0A050":{"file_size":65150976,"generation":3,"shard":"0008"},"000000067F00004005000060F300008C8000-000000067F00004005000060F300008E0F49__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300002D8000-030000000000000000000000000000000002__0000008625CF2891-00000089F4693119":{"file_size":231907328,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C04000-000000067F00004005000060FB0000C08000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001808000-000000067F00004005000060FB000180C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A30379-030000000000000000000000000000000002__000000AFE87558B0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F700010D85CF-000000067F00004005
000060F80100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":164970496,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C70000-000000067F00004005000060FB0000C74000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001188000-000000067F00004005016EA00C000118C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000CB85B3-000000067F00004005000060F70000CC8B74__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A1D870-000000067F00004005000060F30004A2693B__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00008CF772-000000067F00004005016EA00C00008E760F__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000D34000-000000067F00004005016EA00C0000D5D1E9__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00014B79E7-000000067F00004005016EA00C00014CF88D__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300040E319D-000000067F00004005000060F300040F41F4__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002FF427D-000000067F00004005000060F30100000000__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":156073984,"generation":2,"shard":"0008"},"000000067F00004005000060F30005E0A466-000000067F00004005000060F30005E3B48F__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700005F9158-000000067F00004005000060F80100000000__00000057593D8169-0000005C01565329":{"file_size":230768640,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018E4000-000000067F00004005016EA00C000193189A__000001B3F17FE4E0":{"file_size":134422528,"gener
ation":11,"shard":"0008"},"000000067F00004005000060F30005F0202C-000000067F00004005000060F30005F3303F__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000148000-000000067F00004005000060F1000014C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060C0000-000000067F00004005000060F300060C4000__0000016E41E03CA0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C9C000-000000067F00004005000060FB0000CC6E51__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050107B54700000A0EB1-000000067F000040050109CD330100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004EC000-000000067F00004005016EA00C00005A0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000A9F465-000000067F00004005016EA00C0000ACF305__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30000208000-000000067F00004005000060F3000020C000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000011E137-000000067F0000400500F67839000003E09B__000001048B25A8E9-0000010779A7F551":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30000402F4A-000000067F00004005000060F60100000000__000000114A805939-00000013FB921C81":{"file_size":166469632,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004A8000-000000067F00004005016EA00C00004AC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70001968000-000000067F00004005000060F7000196C000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006EF8000-000000067F00004005000060F30006EFC000__000001848D082B20":{"file_size":134422528,"generation":2
,"shard":"0008"},"000000067F00004005016EA00C0000BB4000-000000067F00004005016EA00C0000C20000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700009C0000-000000067F00004005000060F80100000000__0000009A24DF6768":{"file_size":37371904,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C84000-000000067F00004005000060F30004CB8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002514000-000000067F00004005000060F30002530000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000DE05C8-000000067F00004005000060F80100000000__000000C824C09619-000000CC13D2E549":{"file_size":259473408,"generation":2,"shard":"0008"},"000000067F00004005000060F301FFFFFFFF-000000067F00004005000060F30300000000__00000186146441F1-0000018624969469":{"file_size":57344,"generation":6,"shard":"0008"},"000000067F00004005000060F30001886B2A-000000067F00004005000060F50100000000__00000075E5D2A930":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700006A8000-000000067F00004005000060F80100000000__000000636DE92159-000000663565F8C9":{"file_size":117022720,"generation":2,"shard":"0008"},"000000067F00004005000060FB000154C000-000000067F00004005000060FB0001558000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300053F40CC-000000067F00004005000060F30100000000__0000014EC58A4A79-0000015304A396B9":{"file_size":223453184,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C95225-000000067F00004005000060F30005C9E3C4__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000558C000-000000067F00004005000060F30005598000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FFA699-000000067F00004005000060F50100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"s
hard":"0008"},"000000067F00004005000060F30006F1C000-000000067F00004005000060F50100000000__000001848D082B20":{"file_size":24117248,"generation":2,"shard":"0008"},"000000067F00004005000060F3000486C000-000000067F00004005000060F30004878000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300019C2056-000000067F00004005000060F300019F31AA__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC000004C000-000000067F0000400500EE16BC0000060000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000046EAB9-000000067F00004005000060F80100000000__000000417D21ACF9-00000044B4679349":{"file_size":48717824,"generation":2,"shard":"0008"},"000000067F000040050081DB430000790000-000000067F000040050081DB430000794000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D79000002C000-000000067F0000400500D69D790000078000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60000026C90-000000067F00004005000060F60100000000__000000698F2C3A38":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30000738000-000000067F00004005000060F3000073C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000204000-000000067F00004005000060F10000218000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000177E20-000000067F0000400500C782E400001AFD31__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000048C000-000000067F00004005000060F700004B1E77__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015F8000-000000067F00004005000060F50100000000__000000698F2C3A38":{"file_size":131276800,"generation":2,"shard":"0008"},"000000067F000040
05000060F30000428000-000000067F00004005000060F3000042C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000038C000-000000067F000040050081DB430000390000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000102A1CE-000000067F00004005000060FB000103AD12__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001848000-000000067F00004005000060FB000184C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00001DC000-000000067F00004005000060FB0000228000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00011D4000-000000067F00004005016EA00C0001228000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000011775B-030000000000000000000000000000000002__0000018820A34650":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005000060F700011B8000-000000067F00004005000060F80100000000__000001048B25A8E9-0000010779A7F551":{"file_size":263897088,"generation":2,"shard":"0008"},"000000067F00004005000060F3000660D31F-000000067F00004005000060F3000664E3CA__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000064000-000000067F0000400500EE16BC00000F28ED__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000525C065-000000067F00004005000060F50100000000__0000014EDD256548":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A7F98F-000000067F00004005000060F30100000000__000001398B56A519-0000013C9C0E3339":{"file_size":47595520,"generation":2,"shard":"0008"},"000000067F000040050100D04D000004369C-000000067F000040050100D04D000004B5AD__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000
04005000060F6000001A6E2-000000067F00004005000060F60100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700004405CF-000000067F00004005000060F80100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":198836224,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D28000-000000067F00004005000060F30002D2C000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F56D510100000000-000000067F0000400500F67839000003C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000E387D6-000000067F00004005000060F80100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000213C000-000000067F00004005000060F30002168000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060D4415-000000067F00004005000060F3000612D506__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D3100000546CB-000000067F0000400500FB3D320100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000D18CA9-030000000000000000000000000000000002__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":210288640,"generation":2,"shard":"0008"},"000000067F00004005000060F60000062E4F-000000067F00004005000060F60100000000__00000104BD37F348":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000016A065-000000067F0000400500F3A25C000017C0CB__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001AD0000-000000067F00004005000060FB0001B28B44__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000254000-000000067F00004005000060F30000298000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500
81DB430000E8C000-000000067F000040050081DB430000EA0000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300040F41F4-000000067F00004005000060F3000412D27C__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00013B8000-000000067F00004005000060FB00013BC000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700000D8000-000000067F00004005000060F700000DC000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000958000-000000067F00004005000060F700009605D8__000000923719A971-00000096262826C9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB00004A0000-000000067F00004005000060FB00004A4000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700001213F2-000000067F00004005000060F80100000000__0000000D55A212C9-000000114A805939":{"file_size":55320576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004156457-000000067F00004005000060F30100000000__00000122A7BB7B29-0000012694E36301":{"file_size":96927744,"generation":2,"shard":"0008"},"000000067F00004005000060F30003278000-000000067F00004005000060F3000327C000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000158F667-000000067F00004005016EA00C00015B74FF__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001D50000-000000067F00004005000060FB0001D88B43__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F60000054AE8-000000067F00004005000060F60100000000__000000DBD29DC248":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300002C4887-000000067F00004005000060F60100000000__0000000D80565628":{"file_size":24576,"generation":2,"shard":"0008"},"000
000067F00004005000060F70001B34000-000000067F00004005000060F70001B5A072__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F600000416A8-000000067F00004005000060F60100000000__000000AFE87558B0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F10000050000-000000067F00004005000060F10000058000__000000044854EBD1-00000008B6B51879":{"file_size":264011776,"generation":2,"shard":"0008"},"000000067F00004005000060F300043FC000-000000067F00004005000060F300044D3639__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004878000-000000067F00004005000060F3000487C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000396C000-000000067F00004005000060F30003998000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00019F7907-000000067F00004005016EA00C0001A477A4__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268443648,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00014D7727-000000067F00004005016EA00C00014E75C6__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00016570D9-030000000000000000000000000000000002__000001AC25760149-000001AFC313C819":{"file_size":86335488,"generation":11,"shard":"0008"},"000000067F00004005000060F70001270000-000000067F00004005000060F80100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":265363456,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003BFD31-000000067F0000400500EB4A4800003C7C42__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300014B31F8-000000067F00004005000060F300014CC16D__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000D5D1E9-030000000000000000000000000000000002__0000019E7001E460":{"file_size":1392
64,"generation":11,"shard":"0008"},"000000067F00004005000060F100003B8214-000000067F00004005000060F100003C0432__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001346854-000000067F00004005016EA00C000135FCAD__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000160410C-000000067F00004005000060F3000165515A__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000118B12B-030000000000000000000000000000000002__00000054161C34B8":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DF0000-000000067F00004005000060F30006DF4000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003C4000-000000067F00004005000060F700003FE341__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000FF0000-000000067F00004005000060F30100000000__0000004C49155071-0000004F31878919":{"file_size":256286720,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015F4000-000000067F00004005000060FB00015FCD31__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005816253-000000067F00004005000060F30005847319__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002460000-000000067F00004005000060F30002464000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000113A337-000000067F00004005000060F700011528FB__000000FF8B261599-000001048B25A8E9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000037968A-030000000000000000000000000000000002__0000000D55A212C9-000000114A805939":{"file_size":226426880,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000128000-000000067F00004005016EA00C000012FE
9A__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A48000036FF11-000000067F0000400500EB4A4800003A7E20__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000658113F-000000067F00004005000060F3000659A203__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D18000-000000067F00004005016EA00C0001D1C000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30001A44000-000000067F00004005000060F30001AB1583__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000138000-000000067F00004005000060F1000013C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300009BC000-000000067F00004005000060F30000A50000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000110E30C-000000067F00004005000060F80100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F50100000000-000000067F00004005000060F60000014000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006F18000-000000067F00004005000060F30006FA900D__00000184624E5741-000001860C80A151":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001D88B43-000000067F00004005000060FB0100000000__0000008DBE2855F9-000000923719A971":{"file_size":249028608,"generation":2,"shard":"0008"},"000000067F00004005000060F3000122A1D5-000000067F00004005000060F30100000000__0000005413AB3641-00000057593D8169":{"file_size":48783360,"generation":2,"shard":"0008"},"000000067F00004005000060F30006277C61-000000067F00004005000060F30006320C60__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003
88000-000000067F000040050081DB43000038C000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000E67A6E-000000067F00004005016EA00C0000E77906__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300009B8000-000000067F00004005000060F300009BC000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400501025D900000068000-000000067F00004005010450640000000570__0000010FB1BE19B9-00000113456156F1":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB00002D4B6A-030000000000000000000000000000000002__0000000D80565628":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E50FF3-000000067F00004005000060F30001E720A2__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00005A4000-000000067F00004005016EA00C0000670000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000C18000-000000067F00004005000060FB0000C1C000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000BA4F5B-000000067F00004005000060F70000BBD532__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AC115C-000000067F00004005000060F80100000000__0000015304A396B9-0000015670D6AFD9":{"file_size":237248512,"generation":2,"shard":"0008"},"000000067F00004005000060F30004D24000-000000067F00004005000060F30004DA8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006CA4000-000000067F00004005000060F30006D10000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001433D0-030000000000000000000000000000000002__000000FCCD5238B1-000000FF8B261599":{"file_size":146407424,"generation":2,"shard":"0008"},"000000
067F00004005000060F3000165515A-000000067F00004005000060F30100000000__000000698AF6E809-0000006DDB29D589":{"file_size":112680960,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000118C000-000000067F00004005016EA00C00011D0000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB43000094A076-030000000000000000000000000000000002__000000A9EB8C4489-000000ACA44C8E99":{"file_size":176054272,"generation":2,"shard":"0008"},"000000067F00004005000060F70001528000-000000067F00004005000060F7000152C000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C82B50-000000067F000040050081DB430000CC4BC2__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001EF15A-000000067F000040050081DB4300002791D8__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000125BF2-000000067F00004005000060F20100000000__000000114A805939-00000013FB921C81":{"file_size":78782464,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E40F86-000000067F00004005000060F30100000000__000000417D21ACF9-00000044B4679349":{"file_size":111108096,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000FF0000-000000067F00004005016EA00C0000FF4000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000CB16B6-000000067F00004005000060F50100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001990000-000000067F00004005000060F70001994000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000A54000-000000067F00004005000060F30000A5F9BB__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300061B8705-000000067F00004005000060F300061D9774__0000016B49A934C1-0000016E1FBB7B99":{"file_size":26845184
0,"generation":2,"shard":"0008"},"000000067F00004005000060F7000084C000-000000067F00004005000060F70000858000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000848000-000000067F00004005000060F7000084C000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001D18000-000000067F00004005000060F30001D79136__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001558000-000000067F00004005000060FB000155C000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300024440AE-000000067F00004005000060F3000244D189__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002CFC020-000000067F00004005000060F30100000000__000000C824C09619-000000CC13D2E549":{"file_size":150708224,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A4A074-000000067F000040050081DB430000A640EA__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C98000-000000067F00004005000060FB0000C9C000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001840000-000000067F00004005000060FB0001844000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000802123-000000067F00004005000060F30000853115__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000029ED0-000000067F00004005000060F80100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C00003E4000-000000067F00004005016EA00C00003E8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004CBC000-000000067F00004005000060F30004D20000__000001444EB7FC10":{"file_size":13
4422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000122C000-000000067F00004005016EA00C0001240000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004DF086C-000000067F00004005000060F50100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300050B5199-000000067F00004005000060F30100000000__0000014784964B91-0000014B000D1821":{"file_size":126124032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001A477A4-000000067F00004005016EA00C0001ADF63C__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70001828000-000000067F00004005000060F7000182C000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100004F0000-000000067F00004005000060F10000518222__0000005413AB3641-00000057593D8169":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30005EFD576-000000067F00004005000060F30100000000__00000164DEE06671-0000016834A3FC91":{"file_size":193077248,"generation":2,"shard":"0008"},"000000067F0000400500F8E3A50100000000-000000067F0000400500FA2AD30000004000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000258E3A9-000000067F00004005000060F3000259F4A3__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C90000-000000067F00004005000060F70000CB85B3__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB000114C000-000000067F00004005000060FB000118B12B__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003644000-000000067F00004005000060F30003648000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001A50000-000000067F00004005000060FB0001A60B43__0000007E3
A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C257AD-000000067F00004005000060F50100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002DE8000-000000067F00004005000060F30002E4104A__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000C8000-000000067F0000400500F3A25C00000EA069__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002174000-000000067F00004005000060F30002210000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014D5280-000000067F00004005000060F300014E6333__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000332B1B6-000000067F00004005000060F30003344134__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300065F42B4-000000067F00004005000060F3000660D31F__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010E264A-000000067F000040050081DB4300010F46BD__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300069D13FA-000000067F00004005000060F300069FA3F6__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300061D9774-000000067F00004005000060F30006222843__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005C821A-000000067F00004005000060F20100000000__000000601F43CF09-000000636DE92159":{"file_size":265183232,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000200000-000000067F0000400500EB4A480000204000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"00
08"},"000000067F00004005000060F70001690000-000000067F00004005000060F70100000000__000001334140FC21-00000137115BE4D9":{"file_size":273965056,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000A575C7-000000067F00004005016EA00C0000A9F465__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001E6C000-000000067F00004005000060FB0001E98000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00014195A7-000000067F00004005000060FB000147A0EC__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000AE168A-030000000000000000000000000000000002__0000003203FB5749-0000003579F03331":{"file_size":223379456,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CA0000-000000067F00004005000060F30000CA4000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300006E4000-000000067F00004005000060F30000738000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300006E0000-000000067F00004005000060F300006E4000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001124000-000000067F00004005000060FB0001148000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000A8000-000000067F0000400500D69D7900000AC000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000130000-000000067F0000400500C782E40000137F10__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000020FBCF-000000067F00004005016EA00C0000257A6F__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001B28B44-000000067F00004005000060FB0100000000__0000008196C976A1-0000008625CF2891":{"fi
le_size":249454592,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001120000-000000067F00004005000060FB0001124000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005474062-000000067F00004005000060F3000549D0A6__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E4000023FA62-030000000000000000000000000000000002__000000D01F399709-000000D31E48D7C9":{"file_size":245366784,"generation":2,"shard":"0008"},"000000067F000040050081DB430000160484-030000000000000000000000000000000002__00000079F2A2F311-0000007E3A9BFD29":{"file_size":226582528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038A4FB4-000000067F00004005000060F300038B5F5B__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300017E8000-000000067F00004005000060F300017EC000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300100000000-000000067F0000400500FB3D31000000C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700010105DB-000000067F00004005000060F80100000000__000000E4C63CFA21-000000E7C2F1B249":{"file_size":254935040,"generation":2,"shard":"0008"},"000000067F00004005000060F70000858570-000000067F00004005000060F80100000000__0000008196C976A1-0000008625CF2891":{"file_size":252985344,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001D4000-000000067F000040050081DB4300001E8000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00005E0000-000000067F00004005000060FB0000638B45__0000001B59EEB909-0000001FFBC01501":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050107B547000006C000-000000067F000040050107B54700000A0EB1__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000430000-000000067F0000400
5000060FB0000434000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014E6333-000000067F00004005000060F3000151F271__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300100000000-000000067F0000400500FB3D300300000000__00000117EDA82C11-0000011B632CC319":{"file_size":65536,"generation":2,"shard":"0008"},"000000067F00004005000060F30004BE7584-000000067F00004005000060F30100000000__0000013C9C0E3339-0000013FEFA7D709":{"file_size":58204160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001068000-000000067F00004005000060F80100000000__000000E7C2F1B249-000000EBC9213D59":{"file_size":168730624,"generation":2,"shard":"0008"},"000000067F00004005000060F1000013C000-000000067F00004005000060F10000148000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000659A203-000000067F00004005000060F300065BB235__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000EC0000-000000067F00004005000060F70000EF85D6__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005010660F500000B4000-000000067F00004005010660F500000F44CB__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300067A4000-000000067F00004005000060F300067F0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F0000-000000067F0000400500DBCED500000F4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000768000-000000067F000040050081DB43000076C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018E0000-000000067F00004005016EA00C00018E4000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000A5
0000-000000067F00004005000060F30000A54000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E68000-000000067F00004005000060FB0001E6C000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001960000-000000067F00004005000060F300019790A2__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B6A1D0-000000067F00004005000060FB0000BAAD15__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002E4A157-000000067F00004005000060F30002E630CF__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E70000-000000067F00004005000060F30006E74000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700004464DD-000000067F00004005000060F7000046EAB9__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000204000-000000067F0000400500EB4A480000218000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300042D51D6-000000067F00004005000060F3000430E1E9__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000F30000-000000067F00004005000060FB0100000000__00000047E31D98D1-0000004C49155071":{"file_size":272302080,"generation":2,"shard":"0008"},"000000067F000040050081DB4300006F8000-030000000000000000000000000000000002__0000009DF02C1241-000000A173C00489":{"file_size":235110400,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001EC000-000000067F000040050081DB4300001F1DA6__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038A3082-000000067F00004005000060F30100000000__000001048B25A8E9-0000010779A7F551":{"file_size":76644352,"gene
ration":2,"shard":"0008"},"000000067F00004005016EA00C0000400000-000000067F00004005016EA00C0000404000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003481DDB-000000067F00004005000060F30100000000__000000E7C2F1B249-000000EBC9213D59":{"file_size":107814912,"generation":2,"shard":"0008"},"000000067F00004005000060F3000489C000-000000067F00004005000060F300048A0000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000CD6C36-000000067F000040050081DB430000D18CA9__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004888000-000000067F00004005000060F3000488C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300008E0F49-000000067F00004005000060F30000921E8A__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000074000-000000067F0000400500C782E400000A0000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011F2D11-000000067F00004005000060FB0001203856__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300046330B1-000000067F00004005000060F300046B41AA__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003548000-000000067F00004005000060F30003580FD3__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001198B44-000000067F00004005000060FB00011C1688__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000049C000-000000067F00004005000060F300004A8000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B44000-000000067F00004005016EA00C0000BB0000__00
00019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700014F0000-000000067F00004005000060F700014F85DF__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C5E15B-000000067F000040050081DB430000C801D1__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A10000-000000067F00004005000060F30003A21037__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006EFC000-000000067F00004005000060F30006F18000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D1F87B-000000067F00004005016EA00C0001D7F71A__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F00004005000060F30002A34000-000000067F00004005000060F30002A40000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F0AA88-000000067F00004005000060F80100000000__000000DBD29DC248":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006700000-000000067F00004005000060F30006704000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CC4000-000000067F00004005000060F30001CD0000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000858000-000000067F00004005000060F80100000000__00000081AA3C40F0":{"file_size":48439296,"generation":2,"shard":"0008"},"000000067F000040050081DB4300000D6407-000000067F000040050081DB430000160484__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300057DD292-000000067F00004005000060F30005816253__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006222843-000000067F0000400
5000060F3000625B8F0__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000410000-000000067F00004005000060FB0000430B46__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100006A8000-000000067F00004005000060F100006B0000__0000006DDB29D589-000000722F474369":{"file_size":264110080,"generation":2,"shard":"0008"},"000000067F00004005000060F3000460202F-000000067F00004005000060F300046330B1__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E74000-000000067F00004005000060F30006EF8000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A3B020-000000067F00004005000060F30003A4C09C__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002535462-000000067F00004005000060F3000258E3A9__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000294000-000000067F0000400500EB4A480000355928__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016E85370000000000-030000000000000000000000000000000002__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":152190976,"generation":2,"shard":"0008"},"000000067F00004005000060F3000158C000-000000067F00004005000060F300015B0000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003386D10-000000067F00004005000060F300033D7D7C__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E7C000-000000067F00004005000060F30000EF1FC3__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD30000030000-000000067F0000400500FA2AD30000034000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"00
08"},"000000067F00004005000060F30005588000-000000067F00004005000060F3000558C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300039A0000-000000067F00004005000060F300039A4000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000008A13D-000000067F00004005000060F60100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00017120CE-000000067F00004005000060FB000172AC12__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003200000-000000067F00004005000060F30003204000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300007C1007-000000067F00004005000060F30000802123__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000006C000-000000067F0000400500F3A25C00000BB439__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015B4000-000000067F00004005000060F300015F8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060C220F-000000067F00004005000060F300060CB2C8__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F8E3A5000004A25C-000000067F0000400500F8E3A50100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C9AFB8-000000067F00004005000060F30002CFC020__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010F2BD40100000000-000000067F00004005010F44EB000000C000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002AEED02-000000067F00004005000060F50100000000__000000C483D0D6B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000
4005000060F30002EB8000-000000067F00004005000060F30002F5105E__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A1000016321A-030000000000000000000000000000000002__000000EFDE07FFD8":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F3000135C000-000000067F00004005000060F30001407F7A__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F67839000006AEF4-000000067F0000400500F7D2DD0100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30005DA03A8-000000067F00004005000060F30005DC93F1__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010E2072-000000067F000040050081DB430100000000__000000D01F399709-000000D31E48D7C9":{"file_size":15392768,"generation":2,"shard":"0008"},"000000067F00004005000060F300004A8000-000000067F00004005000060F300004AC000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00016E0A44-000000067F00004005000060FB0001701588__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300024D8000-000000067F00004005000060F300024DC000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003BC8000-000000067F00004005000060F30003BCC000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F00100000000-000000067F00004005000060F10000004000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB430100000000-000000067F0000400500C782E40000074000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D14206-000000067F00004005000060F30003D252C8__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000
04005000060F700006479E7-000000067F00004005000060F80100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B9C988-000000067F00004005000060F70000BA4F5B__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000078000-000000067F0000400500D69D79000007C000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CC8B74-000000067F00004005000060F80100000000__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":95657984,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000708000-000000067F00004005000060FB000070C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EA0000-000000067F000040050081DB430000EEA075__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000001FD3E-000000067F00004005016EA00C0000097BDA__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000689E295-000000067F00004005000060F3000690F2FD__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CE0000-000000067F00004005000060F30000D31030__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EA0000-030000000000000000000000000000000002__000000C483D0D6B8":{"file_size":20307968,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000807A34-000000067F00004005016EA00C00008578D4__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB430001060000-000000067F000040050081DB430001064000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000480F32C-000000067F00004005000060F3000486837F__000001334140FC21-00000137115BE4D9":{"file_size"
:268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700009385D4-000000067F00004005000060F80100000000__0000008DBE2855F9-000000923719A971":{"file_size":252207104,"generation":2,"shard":"0008"},"000000067F00004005000060F30000090000-000000067F00004005000060F300000C1095__000000021DC73119-000000044854EBD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000480620C-000000067F00004005000060F3000480F32C__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FA40AD-000000067F00004005000060F30005FC519A__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00014A42B8-030000000000000000000000000000000002__000000601F43CF09-000000636DE92159":{"file_size":137322496,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CD0000-000000067F00004005000060F30001CD4000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000404000-000000067F00004005016EA00C0000428000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002079FDE-000000067F00004005000060F300020830BE__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000487C000-000000067F00004005000060F30004880000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005010A188401FFFFFFFF-000000067F00004005010A18840300000000__00000137115BE4D9-000001398B56A519":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000218000-000000067F00004005000060F7000021C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005EF454F-000000067F00004005000060F30005EFD576__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005DC93F1-000000067F0
0004005000060F30005E0A466__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"}},"disk_consistent_lsn":"1BC/B5734CD8","metadata_bytes":{"disk_consistent_lsn":"1BC/B5734CD8","prev_record_lsn":"1BC/B5734CB0","ancestor_timeline":null,"ancestor_lsn":"0/0","latest_gc_cutoff_lsn":"1BC/B5732690","initdb_lsn":"0/14EE150","pg_version":16},"lineage":{}} ================================================ FILE: pgxn/.dir-locals.el ================================================ ;; see also src/tools/editors/emacs.samples for more complete settings ((c-mode . ((c-basic-offset . 4) (c-file-style . "bsd") (fill-column . 78) (indent-tabs-mode . t) (tab-width . 4))) (nxml-mode . ((fill-column . 78) (indent-tabs-mode . nil))) (perl-mode . ((perl-indent-level . 4) (perl-continued-statement-offset . 2) (perl-continued-brace-offset . -2) (perl-brace-offset . 0) (perl-brace-imaginary-offset . 0) (perl-label-offset . -2) (indent-tabs-mode . t) (tab-width . 4))) (sgml-mode . ((fill-column . 78) (indent-tabs-mode . nil)))) ================================================ FILE: pgxn/.editorconfig ================================================ root = true [*.{c,h,l,y,pl,pm}] indent_style = tab indent_size = tab tab_width = 4 [*.{sgml,xml}] indent_style = space indent_size = 1 [*.xsl] indent_style = space indent_size = 2 ================================================ FILE: pgxn/Makefile ================================================ # This makefile assumes that 'pg_config' is in the path, or is passed in the # PG_CONFIG variable. # # This is used in two different ways: # # 1. The main makefile calls this, when you invoke the `make neon-pg-ext-%` # target. It passes PG_CONFIG pointing to pg_install/%/bin/pg_config. # This is a VPATH build; the current directory is build/pgxn-%, and # the path to the Makefile is passed with the -f argument. # # 2. compute-node.Dockerfile invokes this to build the compute extensions # for the specific Postgres version. 
It relies on pg_config already # being in $(PATH). srcdir = $(dir $(firstword $(MAKEFILE_LIST))) PG_CONFIG = pg_config subdirs = neon neon_rmgr neon_walredo neon_utils neon_test_utils .PHONY: install install-compute install-storage $(subdirs) install: $(subdirs) install-compute: neon neon_utils neon_test_utils neon_rmgr install-storage: neon_rmgr neon_walredo $(subdirs): %: mkdir -p $* $(MAKE) PG_CONFIG=$(PG_CONFIG) -C $* -f $(abspath $(srcdir)/$@/Makefile) install ================================================ FILE: pgxn/neon/Makefile ================================================ # pgxs/neon/Makefile MODULE_big = neon OBJS = \ $(WIN32RES) \ communicator.o \ communicator_process.o \ extension_server.o \ file_cache.o \ hll.o \ libpagestore.o \ logical_replication_monitor.o \ neon.o \ neon_lwlsncache.o \ neon_pgversioncompat.o \ neon_perf_counters.o \ neon_utils.o \ neon_walreader.o \ pagestore_smgr.o \ relsize_cache.o \ unstable_extensions.o \ walproposer.o \ walproposer_pg.o \ neon_ddl_handler.o \ walsender_hooks.o \ $(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl UNAME_S := $(shell uname -s) ifeq ($(UNAME_S), Darwin) SHLIB_LINK += -framework Security -framework CoreFoundation -framework SystemConfiguration # Link against object files for the current macOS version, to avoid spurious linker warnings. 
MACOSX_DEPLOYMENT_TARGET := $(shell xcrun --sdk macosx --show-sdk-version) export MACOSX_DEPLOYMENT_TARGET endif EXTENSION = neon DATA = \ neon--1.0.sql \ neon--1.0--1.1.sql \ neon--1.1--1.2.sql \ neon--1.2--1.3.sql \ neon--1.3--1.4.sql \ neon--1.4--1.5.sql \ neon--1.5--1.6.sql \ neon--1.6--1.5.sql \ neon--1.5--1.4.sql \ neon--1.4--1.3.sql \ neon--1.3--1.2.sql \ neon--1.2--1.1.sql \ neon--1.1--1.0.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ libwalproposer.a WALPROP_OBJS = \ $(WIN32RES) \ walproposer.o \ neon_utils.o \ walproposer_compat.o # libcommunicator.a is built by cargo from the Rust sources under communicator/ # subdirectory. `cargo build` also generates communicator_bindings.h. communicator_process.o: communicator/communicator_bindings.h file_cache.o: communicator/communicator_bindings.h $(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h &: (cd $(srcdir)/communicator && cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE)) # Force `cargo build` every time. Some of the Rust sources might have # changed. .PHONY: $(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h .PHONY: walproposer-lib walproposer-lib: CPPFLAGS += -DWALPROPOSER_LIB walproposer-lib: libwalproposer.a; .PHONY: libwalproposer.a libwalproposer.a: $(WALPROP_OBJS) $(RM) $@ $(AR) $(AROPT) $@ $^ # needs vars: # FIND_TYPEDEF pointing to find_typedef # INDENT pointing to pg_bsd_indent # PGINDENT_SCRIPT pointing to pgindent (be careful with PGINDENT var name: # pgindent will pick it up as pg_bsd_indent path). .PHONY: pgindent pgindent: +@ echo top_srcdir=$(top_srcdir) top_builddir=$(top_builddir) srcdir=$(srcdir) $(FIND_TYPEDEF) . 
> neon.typedefs INDENT=$(INDENT) $(PGINDENT_SCRIPT) --typedefs neon.typedefs $(srcdir)/*.c $(srcdir)/*.h PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) ================================================ FILE: pgxn/neon/README.md ================================================ neon extension consists of several parts: ### shared preload library `neon.so` - implements storage manager API and network communications with remote page server. - walproposer: implements broadcast protocol between postgres and WAL safekeepers. - control plane connector: Captures updates to roles/databases using ProcessUtility_hook and sends them to the control plane. - remote extension server: Requests compute_ctl to download extension files. - file_cache: Local file cache is used to temporarily store relation pages in the local file system for better performance. - relsize_cache: Relation size cache for better neon performance. ### SQL functions in `neon--*.sql` Utility functions to expose neon-specific information to users and to metrics collection. This extension is created in all databases in the cluster by default. ================================================ FILE: pgxn/neon/bitmap.h ================================================ #ifndef NEON_BITMAP_H #define NEON_BITMAP_H /* * Utilities for manipulating bits8* as bitmaps.
*/ #define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7))) #define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7)) #define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7)) #endif /* NEON_BITMAP_H */ ================================================ FILE: pgxn/neon/communicator/.gitignore ================================================ # generated file (with cbindgen, see build.rs) communicator_bindings.h ================================================ FILE: pgxn/neon/communicator/Cargo.toml ================================================ [package] name = "communicator" version = "0.1.0" license.workspace = true edition.workspace = true [lib] crate-type = ["staticlib"] [features] # 'testing' feature is currently unused in the communicator, but we accept it for convenience of # calling build scripts, so that you can pass the same feature to all packages. testing = [] # 'rest_broker' feature is currently unused in the communicator, but we accept it for convenience of # calling build scripts, so that you can pass the same feature to all packages. rest_broker = [] [dependencies] axum.workspace = true http.workspace = true tokio = { workspace = true, features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] } tracing.workspace = true tracing-subscriber.workspace = true measured.workspace = true utils.workspace = true workspace_hack = { version = "0.1", path = "../../../workspace_hack" } [build-dependencies] cbindgen.workspace = true ================================================ FILE: pgxn/neon/communicator/README.md ================================================ # Communicator This package provides the so-called "compute-pageserver communicator", or just "communicator" in short. The communicator is a separate background worker process that runs in the PostgreSQL server. It's part of the neon extension. 
Currently, it only provides an HTTP endpoint for metrics, but in the future it will evolve to handle all communications with the pageservers. ## Source code view pgxn/neon/communicator_process.c Contains code needed to start up the communicator process, and the glue that interacts with PostgreSQL code and the Rust code in the communicator process. pgxn/neon/communicator/src/worker_process/ Worker process main loop and glue code At compilation time, pgxn/neon/communicator/ produces a static library, libcommunicator.a. It is linked to the neon.so extension library. ================================================ FILE: pgxn/neon/communicator/build.rs ================================================ use std::env; fn main() -> Result<(), Box> { let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap(); match cbindgen::generate(crate_dir) { Ok(bindings) => { bindings.write_to_file("communicator_bindings.h"); } Err(cbindgen::Error::ParseSyntaxError { .. }) => { // This means there was a syntax error in the Rust sources. Don't panic, because // we want the build to continue and the Rust compiler to hit the error. The // Rust compiler produces a better error message than cbindgen. eprintln!("Generating C bindings failed because of a Rust syntax error"); } Err(err) => panic!("Unable to generate C bindings: {err:?}"), }; Ok(()) } ================================================ FILE: pgxn/neon/communicator/cbindgen.toml ================================================ language = "C" [enum] prefix_with_name = true ================================================ FILE: pgxn/neon/communicator/src/lib.rs ================================================ mod worker_process; /// Name of the Unix Domain Socket that serves the metrics, and other APIs in the /// future. This is within the Postgres data directory. 
const NEON_COMMUNICATOR_SOCKET_NAME: &str = "neon-communicator.socket"; ================================================ FILE: pgxn/neon/communicator/src/worker_process/callbacks.rs ================================================ //! C callbacks to PostgreSQL facilities that the neon extension needs to provide. These //! are implemented in `pgxn/neon/communicator_process.c`. The function signatures better //! match! //! //! These are called from the communicator threads! Careful what you do, most Postgres //! functions are not safe to call in that context. #[cfg(not(test))] unsafe extern "C" { pub fn callback_set_my_latch_unsafe(); pub fn callback_get_lfc_metrics_unsafe() -> LfcMetrics; } // Compile unit tests with dummy versions of the functions. Unit tests cannot call back // into the C code. (As of this writing, no unit tests even exist in the communicator // package, but the code coverage build still builds these and tries to link with the // external C code.) #[cfg(test)] unsafe fn callback_set_my_latch_unsafe() { panic!("not usable in unit tests"); } #[cfg(test)] unsafe fn callback_get_lfc_metrics_unsafe() -> LfcMetrics { panic!("not usable in unit tests"); } // safe wrappers pub(super) fn callback_set_my_latch() { unsafe { callback_set_my_latch_unsafe() }; } pub(super) fn callback_get_lfc_metrics() -> LfcMetrics { unsafe { callback_get_lfc_metrics_unsafe() } } /// Return type of the callback_get_lfc_metrics() function. #[repr(C)] pub struct LfcMetrics { pub lfc_cache_size_limit: i64, pub lfc_hits: i64, pub lfc_misses: i64, pub lfc_used: i64, pub lfc_writes: i64, // working set size looking back 1..60 minutes. // // Index 0 is the size of the working set accessed within last 1 minute, // index 59 is the size of the working set accessed within last 60 minutes.
pub lfc_approximate_working_set_size_windows: [i64; 60], } ================================================ FILE: pgxn/neon/communicator/src/worker_process/control_socket.rs ================================================ //! Communicator control socket. //! //! Currently, the control socket is used to provide information about the communicator //! process, file cache etc. as prometheus metrics. In the future, it can be used to //! expose more things. //! //! The exporter speaks HTTP, listens on a Unix Domain Socket under the Postgres //! data directory. For debugging, you can access it with curl: //! //! ```sh //! curl --unix-socket neon-communicator.socket http://localhost/metrics //! ``` //! use axum::Router; use axum::body::Body; use axum::extract::State; use axum::response::Response; use http::StatusCode; use http::header::CONTENT_TYPE; use measured::MetricGroup; use measured::text::BufferedTextEncoder; use std::io::ErrorKind; use tokio::net::UnixListener; use crate::NEON_COMMUNICATOR_SOCKET_NAME; use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct; impl CommunicatorWorkerProcessStruct { /// Launch the listener pub(crate) async fn launch_control_socket_listener( &'static self, ) -> Result<(), std::io::Error> { use axum::routing::get; let app = Router::new() .route("/metrics", get(get_metrics)) .route("/autoscaling_metrics", get(get_autoscaling_metrics)) .route("/debug/panic", get(handle_debug_panic)) .with_state(self); // If the server is restarted, there might be an old socket still // lying around. Remove it first. match std::fs::remove_file(NEON_COMMUNICATOR_SOCKET_NAME) { Ok(()) => { tracing::warn!("removed stale control socket"); } Err(e) if e.kind() == ErrorKind::NotFound => {} Err(e) => { tracing::error!("could not remove stale control socket: {e:#}"); // Try to proceed anyway. It will likely fail below though. 
} }; // Create the unix domain socket and start listening on it let listener = UnixListener::bind(NEON_COMMUNICATOR_SOCKET_NAME)?; tokio::spawn(async { tracing::info!("control socket listener spawned"); axum::serve(listener, app) .await .expect("axum::serve never returns") }); Ok(()) } }

/// Expose all Prometheus metrics.
///
/// Handler for the `/metrics` route; encodes the whole worker-process metric
/// group as a text response.
async fn get_metrics(State(state): State<&CommunicatorWorkerProcessStruct>) -> Response {
    tracing::trace!("/metrics requested");
    metrics_to_response(&state).await
}

/// Expose Prometheus metrics, for use by the autoscaling agent.
///
/// This is a subset of all the metrics: only the `lfc_metrics` group of the
/// worker-process state is encoded.
async fn get_autoscaling_metrics(
    State(state): State<&CommunicatorWorkerProcessStruct>,
) -> Response {
    // Trace the route that was actually requested. This handler serves
    // `/autoscaling_metrics`; logging "/metrics" here made the two endpoints
    // indistinguishable in the trace output.
    tracing::trace!("/autoscaling_metrics requested");
    metrics_to_response(&state.lfc_metrics).await
}

/// Handler for the `/debug/panic` route.
///
/// Panics unconditionally; it never produces a response. It exists so that the
/// behavior of a panicking HTTP handler task can be exercised.
async fn handle_debug_panic(State(_state): State<&CommunicatorWorkerProcessStruct>) -> Response {
    panic!("test HTTP handler task panic");
}

/// Helper function to convert prometheus metrics to a text response
///
/// Collects `metrics` into a `BufferedTextEncoder` and returns the encoded
/// buffer as the body of a `200 OK` response.
async fn metrics_to_response(metrics: &(dyn MetricGroup + Sync)) -> Response {
    let mut enc = BufferedTextEncoder::new();
    // The error type of collecting into the buffered encoder is uninhabited,
    // so the empty match proves to the compiler that this cannot fail.
    metrics
        .collect_group_into(&mut enc)
        .unwrap_or_else(|never| match never {});

    Response::builder()
        .status(StatusCode::OK)
        .header(CONTENT_TYPE, "application/text")
        .body(Body::from(enc.finish()))
        .unwrap()
}

================================================ FILE: pgxn/neon/communicator/src/worker_process/lfc_metrics.rs ================================================ use measured::{ FixedCardinalityLabel, Gauge, GaugeVec, LabelGroup, MetricGroup, label::{LabelName, LabelValue, StaticLabelSet}, metric::{MetricEncoding, gauge::GaugeState, group::Encoding}, }; use super::callbacks::callback_get_lfc_metrics; pub(crate) struct LfcMetricsCollector; #[derive(MetricGroup)] #[metric(new())] struct LfcMetricsGroup { /// LFC cache size limit in bytes lfc_cache_size_limit: Gauge, /// LFC cache hits lfc_hits: Gauge, /// LFC cache misses lfc_misses: Gauge, /// LFC chunks used
(chunk = 1MB) lfc_used: Gauge, /// LFC cache writes lfc_writes: Gauge, /// Approximate working set size in pages of 8192 bytes #[metric(init = GaugeVec::dense())] lfc_approximate_working_set_size_windows: GaugeVec>, } impl MetricGroup for LfcMetricsCollector where GaugeState: MetricEncoding, { fn collect_group_into(&self, enc: &mut T) -> Result<(), ::Err> { let g = LfcMetricsGroup::new(); let lfc_metrics = callback_get_lfc_metrics(); g.lfc_cache_size_limit.set(lfc_metrics.lfc_cache_size_limit); g.lfc_hits.set(lfc_metrics.lfc_hits); g.lfc_misses.set(lfc_metrics.lfc_misses); g.lfc_used.set(lfc_metrics.lfc_used); g.lfc_writes.set(lfc_metrics.lfc_writes); for i in 0..60 { let val = lfc_metrics.lfc_approximate_working_set_size_windows[i]; g.lfc_approximate_working_set_size_windows .set(MinuteAsSeconds(i), val); } g.collect_group_into(enc) } } /// This stores the values in range 0..60, /// encodes them as seconds (60, 120, 180, ..., 3600) #[derive(Clone, Copy)] struct MinuteAsSeconds(usize); impl FixedCardinalityLabel for MinuteAsSeconds { fn cardinality() -> usize { 60 } fn encode(&self) -> usize { self.0 } fn decode(value: usize) -> Self { Self(value) } } impl LabelValue for MinuteAsSeconds { fn visit(&self, v: V) -> V::Output { v.write_int((self.0 + 1) as i64 * 60) } } impl LabelGroup for MinuteAsSeconds { fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) { v.write_value(LabelName::from_str("duration_seconds"), self); } } ================================================ FILE: pgxn/neon/communicator/src/worker_process/logging.rs ================================================ //! Glue code to hook up Rust logging with the `tracing` crate to the PostgreSQL log //! //! In the Rust threads, the log messages are written to a mpsc Channel, and the Postgres //! process latch is raised. That wakes up the loop in the main thread, see //! `communicator_new_bgworker_main()`. It reads the message from the channel and //! ereport()s it. 
This ensures that only one thread, the main thread, calls the //! PostgreSQL logging routines at any time. use std::ffi::c_char; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::mpsc::sync_channel; use std::sync::mpsc::{Receiver, SyncSender}; use std::sync::mpsc::{TryRecvError, TrySendError}; use tracing::info; use tracing::{Event, Level, Metadata, Subscriber}; use tracing_subscriber::filter::LevelFilter; use tracing_subscriber::fmt::format::Writer; use tracing_subscriber::fmt::{FmtContext, FormatEvent, FormatFields, FormattedFields, MakeWriter}; use tracing_subscriber::registry::LookupSpan; use crate::worker_process::callbacks::callback_set_my_latch; /// This handle is passed to the C code, and used by [`communicator_worker_poll_logging`] pub struct LoggingReceiver { receiver: Receiver, } /// This is passed to `tracing` struct LoggingSender { sender: SyncSender, } static DROPPED_EVENT_COUNT: AtomicU64 = AtomicU64::new(0); /// Called once, at worker process startup. The returned [`LoggingReceiver`] is passed back /// in the subsequent calls to [`communicator_worker_poll_logging`]. It is opaque to the C code. #[unsafe(no_mangle)] pub extern "C" fn communicator_worker_configure_logging() -> Box { let (sender, receiver) = sync_channel(1000); let receiver = LoggingReceiver { receiver }; let sender = LoggingSender { sender }; use tracing_subscriber::prelude::*; let r = tracing_subscriber::registry(); let r = r.with( tracing_subscriber::fmt::layer() .with_ansi(false) .event_format(SimpleFormatter) .with_writer(sender) // TODO: derive this from log_min_messages? Currently the code in // communicator_process.c forces log_min_messages='INFO'. .with_filter(LevelFilter::from_level(Level::INFO)), ); r.init(); info!("communicator process logging started"); Box::new(receiver) } /// Read one message from the logging queue. This is essentially a wrapper to Receiver, /// with a C-friendly signature.
/// /// The message is copied into *errbuf, which is a caller-supplied buffer of size /// `errbuf_len`. If the message doesn't fit in the buffer, it is truncated. It is always /// NULL-terminated. /// /// The error level is returned *elevel_p. It's one of the PostgreSQL error levels, see /// elog.h /// /// If there was a message, *dropped_event_count_p is also updated with a counter of how /// many log messages in total has been dropped. By comparing that with the value from /// previous call, you can tell how many were dropped since last call. /// /// Returns: /// /// 0 if there were no messages /// 1 if there was a message. The message and its level are returned in /// *errbuf and *elevel_p. *dropped_event_count_p is also updated. /// -1 on error, i.e the other end of the queue was disconnected #[unsafe(no_mangle)] pub extern "C" fn communicator_worker_poll_logging( state: &mut LoggingReceiver, errbuf: *mut c_char, errbuf_len: u32, elevel_p: &mut i32, dropped_event_count_p: &mut u64, ) -> i32 { let msg = match state.receiver.try_recv() { Err(TryRecvError::Empty) => return 0, Err(TryRecvError::Disconnected) => return -1, Ok(msg) => msg, }; let src: &[u8] = &msg.message; let dst: *mut u8 = errbuf.cast(); let len = std::cmp::min(src.len(), errbuf_len as usize - 1); unsafe { std::ptr::copy_nonoverlapping(src.as_ptr(), dst, len); *(dst.add(len)) = b'\0'; // NULL terminator } // Map the tracing Level to PostgreSQL elevel. // // XXX: These levels are copied from PostgreSQL's elog.h. Introduce another enum to // hide these? 
*elevel_p = match msg.level { Level::TRACE => 10, // DEBUG5 Level::DEBUG => 14, // DEBUG1 Level::INFO => 17, // INFO Level::WARN => 19, // WARNING Level::ERROR => 21, // ERROR }; *dropped_event_count_p = DROPPED_EVENT_COUNT.load(Ordering::Relaxed); 1 } //---- The following functions can be called from any thread ---- #[derive(Clone)] struct FormattedEventWithMeta { message: Vec, level: tracing::Level, } impl Default for FormattedEventWithMeta { fn default() -> Self { FormattedEventWithMeta { message: Vec::new(), level: tracing::Level::DEBUG, } } } struct EventBuilder<'a> { event: FormattedEventWithMeta, sender: &'a LoggingSender, } impl std::io::Write for EventBuilder<'_> { fn write(&mut self, buf: &[u8]) -> std::io::Result { self.event.message.write(buf) } fn flush(&mut self) -> std::io::Result<()> { self.sender.send_event(self.event.clone()); Ok(()) } } impl Drop for EventBuilder<'_> { fn drop(&mut self) { let sender = self.sender; let event = std::mem::take(&mut self.event); sender.send_event(event); } } impl<'a> MakeWriter<'a> for LoggingSender { type Writer = EventBuilder<'a>; fn make_writer(&'a self) -> Self::Writer { panic!("not expected to be called when make_writer_for is implemented"); } fn make_writer_for(&'a self, meta: &Metadata<'_>) -> Self::Writer { EventBuilder { event: FormattedEventWithMeta { message: Vec::new(), level: *meta.level(), }, sender: self, } } } impl LoggingSender { fn send_event(&self, e: FormattedEventWithMeta) { match self.sender.try_send(e) { Ok(()) => { // notify the main thread callback_set_my_latch(); } Err(TrySendError::Disconnected(_)) => {} Err(TrySendError::Full(_)) => { // The queue is full, cannot send any more. To avoid blocking the tokio // thread, simply drop the message. Better to lose some logs than get // stuck if there's a problem with the logging. // // Record the fact that was a message was dropped by incrementing the // counter. 
DROPPED_EVENT_COUNT.fetch_add(1, Ordering::Relaxed); } } } } /// Simple formatter implementation for tracing_subscriber, which prints the log spans and /// message part like the default formatter, but no timestamp or error level. The error /// level is captured separately by `FormattedEventWithMeta', and when the error is /// printed by the main thread, with PostgreSQL ereport(), it gets a timestamp at that /// point. (The timestamp printed will therefore lag behind the timestamp on the event /// here, if the main thread doesn't process the log message promptly) struct SimpleFormatter; impl FormatEvent for SimpleFormatter where S: Subscriber + for<'a> LookupSpan<'a>, N: for<'a> FormatFields<'a> + 'static, { fn format_event( &self, ctx: &FmtContext<'_, S, N>, mut writer: Writer<'_>, event: &Event<'_>, ) -> std::fmt::Result { // Format all the spans in the event's span context. if let Some(scope) = ctx.event_scope() { for span in scope.from_root() { write!(writer, "{}", span.name())?; // `FormattedFields` is a formatted representation of the span's fields, // which is stored in its extensions by the `fmt` layer's `new_span` // method. The fields will have been formatted by the same field formatter // that's provided to the event formatter in the `FmtContext`. let ext = span.extensions(); let fields = &ext .get::>() .expect("will never be `None`"); // Skip formatting the fields if the span had no fields. 
if !fields.is_empty() { write!(writer, "{{{fields}}}")?; } write!(writer, ": ")?; } } // Write fields on the event ctx.field_format().format_fields(writer.by_ref(), event)?; Ok(()) } } ================================================ FILE: pgxn/neon/communicator/src/worker_process/main_loop.rs ================================================ use std::str::FromStr as _; use crate::worker_process::lfc_metrics::LfcMetricsCollector; use measured::MetricGroup; use measured::metric::MetricEncoding; use measured::metric::gauge::GaugeState; use measured::metric::group::Encoding; use utils::id::{TenantId, TimelineId}; pub struct CommunicatorWorkerProcessStruct { runtime: tokio::runtime::Runtime, /*** Metrics ***/ pub(crate) lfc_metrics: LfcMetricsCollector, } /// Launch the communicator process's Rust subsystems pub(super) fn init( tenant_id: Option<&str>, timeline_id: Option<&str>, ) -> Result<&'static CommunicatorWorkerProcessStruct, String> { // The caller validated these already let _tenant_id = tenant_id .map(TenantId::from_str) .transpose() .map_err(|e| format!("invalid tenant ID: {e}"))?; let _timeline_id = timeline_id .map(TimelineId::from_str) .transpose() .map_err(|e| format!("invalid timeline ID: {e}"))?; let runtime = tokio::runtime::Builder::new_multi_thread() .enable_all() .thread_name("communicator thread") .build() .unwrap(); let worker_struct = CommunicatorWorkerProcessStruct { // Note: it's important to not drop the runtime, or all the tasks are dropped // too. Including it in the returned struct is one way to keep it around. 
runtime, // metrics lfc_metrics: LfcMetricsCollector, }; let worker_struct = Box::leak(Box::new(worker_struct)); // Start the listener on the control socket worker_struct .runtime .block_on(worker_struct.launch_control_socket_listener()) .map_err(|e| e.to_string())?; Ok(worker_struct) } impl MetricGroup for CommunicatorWorkerProcessStruct where T: Encoding, GaugeState: MetricEncoding, { fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { self.lfc_metrics.collect_group_into(enc) } } ================================================ FILE: pgxn/neon/communicator/src/worker_process/mod.rs ================================================ //! This code runs in the communicator worker process. This provides //! the glue code to: //! //! - launch the main loop, //! - receive IO requests from backends and process them, //! - write results back to backends. mod callbacks; mod control_socket; mod lfc_metrics; mod logging; mod main_loop; mod worker_interface; ================================================ FILE: pgxn/neon/communicator/src/worker_process/worker_interface.rs ================================================ //! Functions called from the C code in the worker process use std::ffi::{CStr, CString, c_char}; use crate::worker_process::main_loop; use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct; /// Launch the communicator's tokio tasks, which do most of the work. /// /// The caller has initialized the process as a regular PostgreSQL background worker /// process. /// /// Inputs: /// `tenant_id` and `timeline_id` can be NULL, if we're been launched in "non-Neon" mode, /// where we use local storage instead of connecting to remote neon storage. That's /// currently only used in some unit tests. /// /// Result: /// Returns pointer to CommunicatorWorkerProcessStruct, which is a handle to running /// Rust tasks. The C code can use it to interact with the Rust parts. 
On failure, returns /// None/NULL, and an error message is returned in *error_p /// /// This is called only once in the process, so the returned struct, and error message in /// case of failure, are simply leaked. #[unsafe(no_mangle)] pub extern "C" fn communicator_worker_launch( tenant_id: *const c_char, timeline_id: *const c_char, error_p: *mut *const c_char, ) -> Option<&'static CommunicatorWorkerProcessStruct> { // Convert the arguments into more convenient Rust types let tenant_id = if tenant_id.is_null() { None } else { let cstr = unsafe { CStr::from_ptr(tenant_id) }; Some(cstr.to_str().expect("assume UTF-8")) }; let timeline_id = if timeline_id.is_null() { None } else { let cstr = unsafe { CStr::from_ptr(timeline_id) }; Some(cstr.to_str().expect("assume UTF-8")) }; // The `init` function does all the work. let result = main_loop::init(tenant_id, timeline_id); // On failure, return the error message to the C caller in *error_p. match result { Ok(worker_struct) => Some(worker_struct), Err(errmsg) => { let errmsg = CString::new(errmsg).expect("no nuls within error message"); let errmsg = Box::leak(errmsg.into_boxed_c_str()); let p: *const c_char = errmsg.as_ptr(); unsafe { *error_p = p }; None } } } ================================================ FILE: pgxn/neon/communicator.c ================================================ /*------------------------------------------------------------------------- * * communicator.c * Functions for communicating with remote pageservers. * * This is the so-called "legacy" communicator. It consists of functions that * are called from the smgr implementation, in pagestore_smgr.c. There are * plans to replace this with a different implementation, see RFC. * * The communicator is a collection of functions that are called in each * backend, when the backend needs to read a page or other information. It * does not spawn background threads or anything like that. 
To process * responses to prefetch requests in a timely fashion, however, it registers * a ProcessInterrupts hook that gets called periodically from any * CHECK_FOR_INTERRUPTS() point in the backend. * * By the time the functions in this file are called, the caller has already * established that a request to the pageserver is necessary. The functions * are only called for permanent relations (i.e. not temp or unlogged tables). * Before making a call to the communicator, the caller has already checked * the relation size or local file cache. * * However, when processing responses to getpage requests, the communicator * writes pages directly to the LFC. * * The communicator functions take request LSNs as arguments; the caller is * responsible for determining the correct LSNs to use. There's one exception * to that, in prefetch_do_request(); it sometimes calls back to * neon_get_request_lsns(). That's because sometimes a suitable response is * found in the prefetch buffer and the request LSNs are not needed, and the * caller doesn't know whether they're needed or not.
* * The main interface consists of the following "synchronous" calls: * * communicator_exists - Returns true if a relation file exists * communicator_nblocks - Returns a relation's size * communicator_dbsize - Returns a database's total size * communicator_read_at_lsnv - Read contents of one relation block * communicator_read_slru_segment - Read contents of one SLRU segment * * In addition, there are functions related to prefetching: * communicator_prefetch_register_bufferv - Start prefetching a page * communicator_prefetch_lookupv - Check if a page is already in prefetch queue * * Misc other functions: * - communicator_init - Initialize the module at startup * - communicator_prefetch_pump_state - Called periodically to advance the state * * * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * *------------------------------------------------------------------------- */ #include "postgres.h" #include "access/xlog.h" #include "access/xlogdefs.h" #include "access/xlog_internal.h" #include "access/xlogutils.h" #include "common/hashfn.h" #include "executor/instrument.h" #include "libpq/pqformat.h" #include "miscadmin.h" #include "port/pg_iovec.h" #include "postmaster/interrupt.h" #include "replication/walsender.h" #include "storage/ipc.h" #include "utils/timeout.h" #include "bitmap.h" #include "communicator.h" #include "file_cache.h" #include "neon.h" #include "neon_perf_counters.h" #include "pagestore_client.h" #if PG_VERSION_NUM >= 150000 #include "access/xlogrecovery.h" #endif #define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \ neon_shard_log(shard_no, elvl, "Broken connection state: " message, \ ##__VA_ARGS__) page_server_api *page_server; /* * Various settings related to prompt (fast) handling of PageStream responses * at any CHECK_FOR_INTERRUPTS point.
*/ int readahead_getpage_pull_timeout_ms = 50; static int PS_TIMEOUT_ID = 0; static bool timeout_set = false; static bool timeout_signaled = false; /* * We have a CHECK_FOR_INTERRUPTS in page_server->receive(), and we don't want * that to handle any getpage responses if we're already working on the * backlog of those, as we'd hit issues with determining which prefetch slot * we just got a response for. * * To protect against that, we have this variable that's set whenever we start * receiving data for prefetch slots, so that we don't get confused. * * Note that in certain error cases during readpage we may leak r_r_g=true, * which results in a failure to pick up further responses until we first * actively try to receive new getpage responses. */ static bool readpage_reentrant_guard = false; static void pagestore_timeout_handler(void); #define START_PREFETCH_RECEIVE_WORK() \ do { \ readpage_reentrant_guard = true; \ } while (false) #define END_PREFETCH_RECEIVE_WORK() \ do { \ readpage_reentrant_guard = false; \ if (unlikely(timeout_signaled && !InterruptPending)) \ InterruptPending = true; \ } while (false) /* * Prefetch implementation: * * Prefetch is performed locally by each backend. * * There can be up to readahead_buffer_size active IO requests registered at * any time. Requests using smgr_prefetch are sent to the pageserver, but we * don't wait on the response. Requests using smgr_read are either read from * the buffer, or (if that's not possible) we wait on the response to arrive - * this also will allow us to receive other prefetched pages. * Each request is immediately written to the output buffer of the pageserver * connection, but may not be flushed if smgr_prefetch is used: pageserver * flushes sent requests on manual flush, or every neon.flush_output_after * unflushed requests; which is not necessarily always and all the time. * * Once we have received a response, this value will be stored in the response * buffer, indexed in a hash table. 
This allows us to retain our buffered * prefetch responses even when we have cache misses. * * Reading of prefetch responses is delayed until they are actually needed * (smgr_read). In case of prefetch miss or any other SMGR request other than * smgr_read, all prefetch responses in the pipeline will need to be read from * the connection; the responses are stored for later use. * * NOTE: The current implementation of the prefetch system implements a ring * buffer of up to readahead_buffer_size requests. If there are more _read and * _prefetch requests between the initial _prefetch and the _read of a buffer, * the prefetch request will have been dropped from this prefetch buffer, and * your prefetch was wasted. */ /* * State machine: * * not in hash : in hash * : * UNUSED ------> REQUESTED --> RECEIVED * ^ : | | * | : v | * | : TAG_REMAINS | * | : | | * +----------------+------------+ * : */ typedef enum PrefetchStatus { PRFS_UNUSED = 0, /* unused slot */ PRFS_REQUESTED, /* request was written to the sendbuffer to * PS, but not necessarily flushed.
/*
 * One slot of the prefetch ring buffer.
 *
 * A slot describes a single getpage request for the page identified by
 * 'buftag'. Slots that are not PRFS_UNUSED are also indexed in prf_hash;
 * the hash key is computed over the BufferTag bytes at the start of the
 * struct.
 */
typedef struct PrefetchRequest
{
	BufferTag	buftag;			/* must be first entry in the struct; this is
								 * what prf_hash hashes and compares */
	shardno_t	shard_no;		/* shard whose connection the response is
								 * received from */
	uint8		status;			/* see PrefetchStatus for valid values */
	uint8		flags;			/* see PrefetchRequestFlags */
	neon_request_lsns request_lsns; /* LSNs of the request; checked against
									 * the response in
									 * check_getpage_response() */
	NeonRequestId reqid;		/* request id; checked against the response's
								 * reqid */
	NeonResponse *response;		/* may be null; set when the slot becomes
								 * PRFS_RECEIVED */
	uint64		my_ring_index;	/* this slot's own index in the ring buffer */
} PrefetchRequest;
typedef struct PrefetchState
{
	MemoryContext bufctx;		/* context for prf_buffer[].response
								 * allocations */
	MemoryContext errctx;		/* context that is made current while calling
								 * page_server->receive() / try_receive() */
	MemoryContext hashctx;		/* context passed to prfh_create() for
								 * prf_hash */

	/* buffer indexes */
	uint64		ring_unused;	/* first unused slot */
	uint64		ring_flush;		/* next request to flush */
	uint64		ring_receive;	/* next slot that is to receive a response */
	uint64		ring_last;		/* min slot with a response value */

	/* metrics / statistics  */
	int			n_responses_buffered;	/* count of responses stored in slots;
										 * bumped whenever a slot becomes
										 * PRFS_RECEIVED and mirrored in
										 * MyNeonCounters->getpage_prefetches_buffered */
	int			n_requests_inflight;	/* count of PS requests considered in
										 * flight; decremented when a response
										 * is received */
	int			n_unused;		/* count of buffers < unused, > last, that are
								 * also unused */

	/* the buffers */
	prfh_hash  *prf_hash;		/* lookup of non-UNUSED slots by buftag */
	int			max_shard_no;	/* prefetch_flush_requests() scans shard
								 * numbers below this value and resets it to 0
								 * once everything is flushed */
	/* Mark shards involved in prefetch */
	uint8		shard_bitmap[(MAX_SHARDS + 7)/8];
	PrefetchRequest prf_buffer[];	/* prefetch buffers; indexed by ring index
									 * modulo readahead_buffer_size */
} PrefetchState;
/*
 * Install the communicator's interrupt-processing hook.
 *
 * The callback that was installed before is remembered in prev_interrupt_cb
 * first, so it must be saved before ProcessInterruptsCallback is overwritten
 * with communicator_processinterrupts.
 */
void
pg_init_communicator(void)
{
	prev_interrupt_cb = ProcessInterruptsCallback;
	ProcessInterruptsCallback = communicator_processinterrupts;
}
*/ while (search_ring_index > MyPState->ring_last) { PrefetchRequest *source_slot; PrefetchRequest *target_slot; bool found; /* update search index to an unprocessed entry */ search_ring_index--; source_slot = GetPrfSlot(search_ring_index); if (source_slot->status == PRFS_UNUSED) continue; /* slot is used -- start moving slot */ target_slot = GetPrfSlot(empty_ring_index); Assert(source_slot->status == PRFS_RECEIVED); Assert(target_slot->status == PRFS_UNUSED); target_slot->buftag = source_slot->buftag; target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; target_slot->flags = source_slot->flags; target_slot->response = source_slot->response; target_slot->reqid = source_slot->reqid; target_slot->request_lsns = source_slot->request_lsns; target_slot->my_ring_index = empty_ring_index; prfh_delete(MyPState->prf_hash, source_slot); prfh_insert(MyPState->prf_hash, target_slot, &found); Assert(!found); /* Adjust the location of our known-empty slot */ empty_ring_index--; /* empty the moved slot */ source_slot->status = PRFS_UNUSED; source_slot->buftag = (BufferTag) { 0 }; source_slot->response = NULL; source_slot->my_ring_index = 0; source_slot->request_lsns = (neon_request_lsns) { InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr }; /* update bookkeeping */ n_moved++; } /* * Only when we've moved slots we can expect trailing unused slots, so * only then we clean up trailing unused slots. 
*/ if (n_moved > 0) { prefetch_cleanup_trailing_unused(); return true; } return false; } /* * Check that prefetch response matches the slot */ static void check_getpage_response(PrefetchRequest* slot, NeonResponse* resp) { if (resp->tag != T_NeonGetPageResponse && resp->tag != T_NeonErrorResponse) { neon_shard_log(slot->shard_no, PANIC, "Unexpected prefetch response %d, ring_receive=" UINT64_FORMAT ", ring_flush=" UINT64_FORMAT ", ring_unused=" UINT64_FORMAT "", resp->tag, MyPState->ring_receive, MyPState->ring_flush, MyPState->ring_unused); } if (neon_protocol_version >= 3) { NRelFileInfo rinfo = BufTagGetNRelFileInfo(slot->buftag); if (resp->tag == T_NeonGetPageResponse) { NeonGetPageResponse * getpage_resp = (NeonGetPageResponse *)resp; if (resp->reqid != slot->reqid || resp->lsn != slot->request_lsns.request_lsn || resp->not_modified_since != slot->request_lsns.not_modified_since || !RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) || getpage_resp->req.forknum != slot->buftag.forkNum || getpage_resp->req.blkno != slot->buftag.blockNum) { NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, "Receive unexpected getpage response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno, slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), slot->buftag.forkNum, slot->buftag.blockNum); } } else if (resp->reqid != slot->reqid || resp->lsn != slot->request_lsns.request_lsn || resp->not_modified_since != slot->request_lsns.not_modified_since) { elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=" 
UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)); } } } /* * If there might be responses still in the TCP buffer, then we should try to * use those, to reduce any TCP backpressure on the OS/PS side. * * This procedure handles that. * * Note that this works because we don't pipeline non-getPage requests. * * NOTE: This procedure is not allowed to throw errors that should be handled * by SMGR-related code, as this can be called from every CHECK_FOR_INTERRUPTS * point inside and outside PostgreSQL. * * This still does throw errors when it receives malformed responses from PS. */ void communicator_prefetch_pump_state(void) { START_PREFETCH_RECEIVE_WORK(); while (MyPState->ring_receive != MyPState->ring_flush) { NeonResponse *response; PrefetchRequest *slot; MemoryContext old; slot = GetPrfSlot(MyPState->ring_receive); old = MemoryContextSwitchTo(MyPState->errctx); response = page_server->try_receive(slot->shard_no); MemoryContextSwitchTo(old); if (response == NULL) break; check_getpage_response(slot, response); /* The slot should still be valid */ if (slot->status != PRFS_REQUESTED || slot->response != NULL || slot->my_ring_index != MyPState->ring_receive) { neon_shard_log(slot->shard_no, PANIC, "Incorrect prefetch slot state after receive: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "", slot->status, slot->response, slot->my_ring_index, MyPState->ring_receive); } /* update prefetch state */ MyPState->n_responses_buffered += 1; MyPState->n_requests_inflight -= 1; MyPState->ring_receive += 1; MyNeonCounters->getpage_prefetches_buffered = MyPState->n_responses_buffered; /* update slot state */ slot->status = PRFS_RECEIVED; slot->response = response; if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && 
lfc_store_prefetch_result) { /* * Store prefetched result in LFC (please read comments to lfc_prefetch * explaining why it can be done without holding shared buffer lock */ if (lfc_prefetch(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) { slot->flags |= PRFSF_LFC; } } } END_PREFETCH_RECEIVE_WORK(); communicator_reconfigure_timeout_if_needed(); } void readahead_buffer_resize(int newsize, void *extra) { uint64 end, nfree = newsize; PrefetchState *newPState; Size newprfs_size = offsetof(PrefetchState, prf_buffer) + (sizeof(PrefetchRequest) * newsize); /* don't try to re-initialize if we haven't initialized yet */ if (MyPState == NULL) return; /* * Make sure that we don't lose track of active prefetch requests by * ensuring we have received all but the last n requests (n = newsize). */ if (MyPState->n_requests_inflight > newsize) { prefetch_wait_for(MyPState->ring_unused - newsize - 1); Assert(MyPState->n_requests_inflight <= newsize); } /* construct the new PrefetchState, and copy over the memory contexts */ newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size); newPState->bufctx = MyPState->bufctx; newPState->errctx = MyPState->errctx; newPState->hashctx = MyPState->hashctx; newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL); newPState->n_unused = newsize; newPState->n_requests_inflight = 0; newPState->n_responses_buffered = 0; newPState->ring_last = newsize; newPState->ring_unused = newsize; newPState->ring_receive = newsize; newPState->max_shard_no = MyPState->max_shard_no; memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap)); /* * Copy over the prefetches. * * We populate the prefetch array from the end; to retain the most recent * prefetches, but this has the benefit of only needing to do one * iteration on the dataset, and trivial compaction. 
*/ for (end = MyPState->ring_unused - 1; end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0; end -= 1) { PrefetchRequest *slot = GetPrfSlot(end); PrefetchRequest *newslot; bool found; if (slot->status == PRFS_UNUSED) continue; nfree -= 1; newslot = &newPState->prf_buffer[nfree]; *newslot = *slot; newslot->my_ring_index = nfree; prfh_insert(newPState->prf_hash, newslot, &found); Assert(!found); switch (newslot->status) { case PRFS_UNUSED: pg_unreachable(); case PRFS_REQUESTED: newPState->n_requests_inflight += 1; newPState->ring_receive -= 1; newPState->ring_last -= 1; break; case PRFS_RECEIVED: newPState->n_responses_buffered += 1; newPState->ring_last -= 1; break; case PRFS_TAG_REMAINS: newPState->ring_last -= 1; break; } newPState->n_unused -= 1; } newPState->ring_flush = newPState->ring_receive; MyNeonCounters->getpage_prefetches_buffered = MyPState->n_responses_buffered; MyNeonCounters->pageserver_open_requests = MyPState->n_requests_inflight; for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) { PrefetchRequest *slot = GetPrfSlot(end); Assert(slot->status != PRFS_REQUESTED); if (slot->status == PRFS_RECEIVED) { pfree(slot->response); } } prfh_destroy(MyPState->prf_hash); pfree(MyPState); MyPState = newPState; } /* * Callback to be called on backend exit to ensure correct state of compute-PS communication * in case of backend cancel */ static void prefetch_on_exit(int code, Datum arg) { if (code != 0) /* do disconnect only on abnormal backend termination */ { shardno_t shard_no = DatumGetInt32(arg); prefetch_on_ps_disconnect(); page_server->disconnect(shard_no); } } /* * Make sure that there are no responses still in the buffer. * * This function may indirectly update MyPState->pfs_hash; which invalidates * any active pointers into the hash table. 
*/ static void consume_prefetch_responses(void) { if (MyPState->ring_receive < MyPState->ring_unused) prefetch_wait_for(MyPState->ring_unused - 1); /* * We know for sure we're not working on any prefetch pages after * this. */ END_PREFETCH_RECEIVE_WORK(); } static void prefetch_cleanup_trailing_unused(void) { uint64 ring_index; PrefetchRequest *slot; while (MyPState->ring_last < MyPState->ring_receive) { ring_index = MyPState->ring_last; slot = GetPrfSlot(ring_index); if (slot->status == PRFS_UNUSED) MyPState->ring_last += 1; else break; } } static bool prefetch_flush_requests(void) { for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++) { if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no)) { if (!page_server->flush(shard_no)) return false; BITMAP_CLR(MyPState->shard_bitmap, shard_no); } } MyPState->max_shard_no = 0; return true; } /* * Wait for slot of ring_index to have received its response. * The caller is responsible for making sure the request buffer is flushed. * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. * NOTE: callers should make sure they can handle query cancellations in this * function's call path. 
*/ static bool prefetch_wait_for(uint64 ring_index) { PrefetchRequest *entry; bool result = true; if (MyPState->ring_flush <= ring_index && MyPState->ring_unused > MyPState->ring_flush) { if (!prefetch_flush_requests()) return false; MyPState->ring_flush = MyPState->ring_unused; } Assert(MyPState->ring_unused > ring_index); START_PREFETCH_RECEIVE_WORK(); while (MyPState->ring_receive <= ring_index) { entry = GetPrfSlot(MyPState->ring_receive); Assert(entry->status == PRFS_REQUESTED); if (!prefetch_read(entry)) { result = false; break; } CHECK_FOR_INTERRUPTS(); } if (result) { /* Check that slot is actually received (srver can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS */ PrefetchRequest *slot = GetPrfSlot(ring_index); result = slot->status == PRFS_RECEIVED; } END_PREFETCH_RECEIVE_WORK(); return result; ; } /* * Read the response of a prefetch request into its slot. * * The caller is responsible for making sure that the request for this buffer * was flushed to the PageServer. * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. * * NOTE: this does IO, and can get canceled out-of-line. 
*/ static bool prefetch_read(PrefetchRequest *slot) { NeonResponse *response; MemoryContext old; BufferTag buftag; shardno_t shard_no; uint64 my_ring_index; Assert(slot->status == PRFS_REQUESTED); Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_receive); Assert(readpage_reentrant_guard || AmPrewarmWorker); if (slot->status != PRFS_REQUESTED || slot->response != NULL || slot->my_ring_index != MyPState->ring_receive) { neon_shard_log(slot->shard_no, PANIC, "Incorrect prefetch read: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "", slot->status, slot->response, slot->my_ring_index, MyPState->ring_receive); } /* * Copy the request info so that if an error happens and the prefetch * queue is flushed during the receive call, we can print the original * values in the error message */ buftag = slot->buftag; shard_no = slot->shard_no; my_ring_index = slot->my_ring_index; old = MemoryContextSwitchTo(MyPState->errctx); response = (NeonResponse *) page_server->receive(shard_no); MemoryContextSwitchTo(old); if (response) { check_getpage_response(slot, response); /* The slot should still be valid */ if (slot->status != PRFS_REQUESTED || slot->response != NULL || slot->my_ring_index != MyPState->ring_receive) { neon_shard_log(shard_no, PANIC, "Incorrect prefetch slot state after receive: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "", slot->status, slot->response, slot->my_ring_index, MyPState->ring_receive); } /* update prefetch state */ MyPState->n_responses_buffered += 1; MyPState->n_requests_inflight -= 1; MyPState->ring_receive += 1; MyNeonCounters->getpage_prefetches_buffered = MyPState->n_responses_buffered; /* update slot state */ slot->status = PRFS_RECEIVED; slot->response = response; if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) { /* * Store prefetched result in LFC (please read comments to lfc_prefetch * explaining why it can be done 
without holding shared buffer lock */ if (lfc_prefetch(BufTagGetNRelFileInfo(buftag), buftag.forkNum, buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) { slot->flags |= PRFSF_LFC; } } return true; } else { /* * Note: The slot might no longer be valid, if the connection was lost * and the prefetch queue was flushed during the receive call */ neon_shard_log(shard_no, LOG, "No response from reading prefetch entry " UINT64_FORMAT ": %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", my_ring_index, RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)), buftag.forkNum, buftag.blockNum); return false; } } /* * Wait completion of previosly registered prefetch request. * Prefetch result should be placed in LFC by prefetch_wait_for. */ bool communicator_prefetch_receive(BufferTag tag) { PrfHashEntry *entry; PrefetchRequest hashkey; Assert(readpage_reentrant_guard || AmPrewarmWorker); /* do not pump prefetch state in prewarm worker */ hashkey.buftag = tag; entry = prfh_lookup(MyPState->prf_hash, &hashkey); if (entry != NULL && prefetch_wait_for(entry->slot->my_ring_index)) { prefetch_set_unused(entry->slot->my_ring_index); return true; } return false; } /* * Disconnect hook - drop prefetches when the connection drops * * If we don't remove the failed prefetches, we'd be serving incorrect * data to the smgr. */ void prefetch_on_ps_disconnect(void) { MyPState->ring_flush = MyPState->ring_unused; /* Nothing should cancel disconnect: we should not leave connection in opaque state */ HOLD_INTERRUPTS(); while (MyPState->ring_receive < MyPState->ring_unused) { PrefetchRequest *slot; uint64 ring_index = MyPState->ring_receive; slot = GetPrfSlot(ring_index); Assert(slot->status == PRFS_REQUESTED); Assert(slot->my_ring_index == ring_index); /* * Drop connection to all shards which have prefetch requests. 
* It is not a problem to call disconnect multiple times on the same connection * because disconnect implementation in libpagestore.c will check if connection * is alive and do nothing of connection was already dropped. */ page_server->disconnect(slot->shard_no); /* clean up the request */ slot->status = PRFS_TAG_REMAINS; MyPState->n_requests_inflight -= 1; MyPState->ring_receive += 1; prefetch_set_unused(ring_index); pgBufferUsage.prefetch.expired += 1; MyNeonCounters->getpage_prefetch_discards_total += 1; } /* * We can have gone into retry due to network error, so update stats with * the latest available */ MyNeonCounters->pageserver_open_requests = MyPState->n_requests_inflight; MyNeonCounters->getpage_prefetches_buffered = MyPState->n_responses_buffered; RESUME_INTERRUPTS(); } /* * prefetch_set_unused() - clear a received prefetch slot * * The slot at ring_index must be a current member of the ring buffer, * and may not be in the PRFS_REQUESTED state. * * NOTE: this function will update MyPState->pfs_hash; which invalidates any * active pointers into the hash table. */ static inline void prefetch_set_unused(uint64 ring_index) { PrefetchRequest *slot; if (ring_index < MyPState->ring_last) return; /* Should already be unused */ slot = GetPrfSlot(ring_index); if (slot->status == PRFS_UNUSED) return; Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS); if (slot->status == PRFS_RECEIVED) { pfree(slot->response); slot->response = NULL; MyPState->n_responses_buffered -= 1; MyPState->n_unused += 1; MyNeonCounters->getpage_prefetches_buffered = MyPState->n_responses_buffered; } else { Assert(slot->response == NULL); } prfh_delete(MyPState->prf_hash, slot); /* clear all fields */ MemSet(slot, 0, sizeof(PrefetchRequest)); slot->status = PRFS_UNUSED; /* run cleanup if we're holding back ring_last */ if (MyPState->ring_last == ring_index) prefetch_cleanup_trailing_unused(); /* * ... 
and try to store the buffered responses more compactly if > 12.5% * of the buffer is gaps */ else if (ReceiveBufferNeedsCompaction()) compact_prefetch_buffers(); } /* * Send one prefetch request to the pageserver. To wait for the response, call * prefetch_wait_for(). */ static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns) { bool found; uint64 mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index; NeonGetPageRequest request = { .hdr.tag = T_NeonGetPageRequest, /* lsn and not_modified_since are filled in below */ .rinfo = BufTagGetNRelFileInfo(slot->buftag), .forknum = slot->buftag.forkNum, .blkno = slot->buftag.blockNum, }; Assert(mySlotNo == MyPState->ring_unused); if (force_request_lsns) slot->request_lsns = *force_request_lsns; else neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, &slot->request_lsns, 1); request.hdr.lsn = slot->request_lsns.request_lsn; request.hdr.not_modified_since = slot->request_lsns.not_modified_since; Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); while (!page_server->send(slot->shard_no, (NeonRequest *) &request)) { Assert(mySlotNo == MyPState->ring_unused); /* loop */ } slot->reqid = request.hdr.reqid; /* update prefetch state */ MyPState->n_requests_inflight += 1; MyPState->n_unused -= 1; MyPState->ring_unused += 1; BITMAP_SET(MyPState->shard_bitmap, slot->shard_no); MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no); /* update slot state */ slot->status = PRFS_REQUESTED; prfh_insert(MyPState->prf_hash, slot, &found); Assert(!found); } /* * Lookup of already received prefetch requests. Only already received responses matching required LSNs are accepted. * Present pages are marked in "mask" bitmap and total number of such pages is returned. 
*/
int
communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, neon_request_lsns *lsns,
                              BlockNumber nblocks, void **buffers, bits8 *mask)
{
    /*
     * For each of the nblocks blocks starting at blocknum: if a received,
     * LSN-compatible, non-error response is buffered, copy the page into
     * buffers[i], release the prefetch slot and set bit i in mask.  Bits are
     * only ever set here, never cleared.  The number of pages served is
     * returned and also added to pgBufferUsage.prefetch.hits.
     */
    int         hits = 0;
    PrefetchRequest hashkey;

    /*
     * Use an intermediate PrefetchRequest struct as the hash key to ensure
     * correct alignment and that the padding bytes are cleared.
     */
    memset(&hashkey.buftag, 0, sizeof(BufferTag));
    CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo);
    hashkey.buftag.forkNum = forknum;
    for (int i = 0; i < nblocks; i++)
    {
        PrfHashEntry *entry;

        hashkey.buftag.blockNum = blocknum + i;
        entry = prfh_lookup(MyPState->prf_hash, &hashkey);

        if (entry != NULL)
        {
            PrefetchRequest *slot = entry->slot;
            uint64      ring_index = slot->my_ring_index;

            Assert(slot == GetPrfSlot(ring_index));

            Assert(slot->status != PRFS_UNUSED);
            Assert(MyPState->ring_last <= ring_index &&
                   ring_index < MyPState->ring_unused);
            Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag));

            /* Requests still in flight (or tag-only slots) are not waited for here. */
            if (slot->status != PRFS_RECEIVED)
                continue;

            /*
             * If the caller specified a request LSN to use, only accept
             * prefetch responses that satisfy that request.
             */
            if (!neon_prefetch_response_usable(&lsns[i], slot))
                continue;

            /*
             * Ignore errors
             */
            if (slot->response->tag == T_NeonErrorResponse)
            {
                continue;
            }
            Assert(slot->response->tag == T_NeonGetPageResponse); /* checked by check_getpage_response when response was assigned to the slot */
            memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ);

            /*
             * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received
             * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here
             * under buffer lock.
             */
            if (!lfc_store_prefetch_result)
                lfc_write(rinfo, forknum, blocknum + i, buffers[i]);

            /* This updates the hash table; 'entry' must not be used past this point. */
            prefetch_set_unused(ring_index);
            BITMAP_SET(mask, i);

            hits += 1;
            inc_getpage_wait(0);
        }
    }
    pgBufferUsage.prefetch.hits += hits;
    return hits;
}

/*
 * prefetch_register_bufferv() - register and prefetch buffers
 *
 * Register that we may want the contents of BufferTag in the near future.
 * This is used when issuing a speculative prefetch request, but also when
 * performing a synchronous request and need the buffer right now.
 *
 * If force_request_lsns is not NULL, those values are sent to the
 * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure
 * to calculate the LSNs to send.
 *
 * Bits set in *mask (if present) indicate pages already read; i.e. pages we
 * can skip in this process.
 *
 * When performing a prefetch rather than a synchronous request,
 * is_prefetch==true. Currently, it only affects how the request is accounted
 * in the perf counters.
 *
 * NOTE: this function may indirectly update MyPState->prf_hash; which
 * invalidates any active pointers into the hash table.
 */
void
communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
                                       BlockNumber nblocks, const bits8 *mask)
{
    /* Public entry point: always registers speculative prefetches (is_prefetch = true). */
    uint64      ring_index PG_USED_FOR_ASSERTS_ONLY;

    ring_index = prefetch_register_bufferv(tag, frlsns, nblocks, mask, true);

    Assert(ring_index < MyPState->ring_unused &&
           MyPState->ring_last <= ring_index);
}

/*
 * Internal version. Returns the ring index of the last block (result of this
 * function is used only when nblocks==1).
 *
 * For every block not excluded by 'mask', either an existing prefetch entry
 * is reused or a new request is placed in the slot at ring_unused.  Whenever
 * a wait or flush fails, the whole loop is restarted from the 'Retry' label.
 */
static uint64
prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
                          BlockNumber nblocks, const bits8 *mask,
                          bool is_prefetch)
{
    uint64      last_ring_index;
    PrefetchRequest hashkey;
#ifdef USE_ASSERT_CHECKING
    bool        any_hits = false;
#endif
    /* We will never read further ahead than our buffer can store. */
    nblocks = Max(1, Min(nblocks, readahead_buffer_size));

    /*
     * Use an intermediate PrefetchRequest struct as the hash key to ensure
     * correct alignment and that the padding bytes are cleared.
     */
    memset(&hashkey.buftag, 0, sizeof(BufferTag));
    hashkey.buftag = tag;

Retry:
    /*
     * We can have gone into retry due to network error, so update stats with
     * the latest available
     */
    MyNeonCounters->pageserver_open_requests =
        MyPState->ring_unused - MyPState->ring_receive;
    MyNeonCounters->getpage_prefetches_buffered =
        MyPState->n_responses_buffered;

    /* Sentinel: stays UINT64_MAX if every block is skipped through 'mask'. */
    last_ring_index = UINT64_MAX;
    for (int i = 0; i < nblocks; i++)
    {
        PrefetchRequest *slot = NULL;
        PrfHashEntry *entry = NULL;
        neon_request_lsns *lsns;

        /* Caller already has this page. */
        if (PointerIsValid(mask) && BITMAP_ISSET(mask, i))
            continue;

        /*
         * NOTE(review): lsns is NULL when frlsns is NULL, and the
         * !is_prefetch path below hands it to neon_prefetch_response_usable(),
         * which dereferences it -- presumably synchronous callers always pass
         * frlsns; confirm.
         */
        if (frlsns)
            lsns = &frlsns[i];
        else
            lsns = NULL;

#ifdef USE_ASSERT_CHECKING
        any_hits = true;
#endif

        slot = NULL;
        entry = NULL;

        hashkey.buftag.blockNum = tag.blockNum + i;
        entry = prfh_lookup(MyPState->prf_hash, &hashkey);

        if (entry != NULL)
        {
            slot = entry->slot;
            last_ring_index = slot->my_ring_index;
            Assert(slot == GetPrfSlot(last_ring_index));

            Assert(slot->status != PRFS_UNUSED);
            Assert(MyPState->ring_last <= last_ring_index &&
                   last_ring_index < MyPState->ring_unused);
            Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag));

            /*
             * If the caller specified a request LSN to use, only accept
             * prefetch responses that satisfy that request.
             */
            if (!is_prefetch)
            {
                if (!neon_prefetch_response_usable(lsns, slot))
                {
                    /* Wait for the old request to finish and discard it */
                    if (!prefetch_wait_for(last_ring_index))
                        goto Retry;
                    prefetch_set_unused(last_ring_index);
                    /* both pointers are stale after the calls above */
                    entry = NULL;
                    slot = NULL;
                    pgBufferUsage.prefetch.expired += 1;
                    MyNeonCounters->getpage_prefetch_discards_total += 1;
                }
            }

            if (entry != NULL)
            {
                /*
                 * We received a prefetch for a page that was recently read
                 * and removed from the buffers. Remove that request from the
                 * buffers.
                 */
                if (slot->status == PRFS_TAG_REMAINS)
                {
                    prefetch_set_unused(last_ring_index);
                    entry = NULL;
                    slot = NULL;
                }
                else
                {
                    /* The buffered request is good enough, return that index */
                    if (is_prefetch)
                        pgBufferUsage.prefetch.duplicates++;
                    continue;
                }
            }
        }
        else if (!is_prefetch)
        {
            /* A synchronous read found nothing in the prefetch queue. */
            pgBufferUsage.prefetch.misses += 1;
            MyNeonCounters->getpage_prefetch_misses_total++;
        }
        /*
         * We can only leave the block above by finding that there's
         * no entry that can satisfy this request, either because there
         * was no entry, or because the entry was invalid or didn't satisfy
         * the LSNs provided.
         *
         * The code should've made sure to clear up the data.
         */
        Assert(entry == NULL);
        Assert(slot == NULL);

        /* There should be no buffer overflow */
        Assert(MyPState->ring_last + readahead_buffer_size >= MyPState->ring_unused);

        /*
         * If the prefetch queue is full, we need to make room by clearing the
         * oldest slot. If the oldest slot holds a buffer that was already
         * received, we can just throw it away; we fetched the page
         * unnecessarily in that case. If the oldest slot holds a request that
         * we haven't received a response for yet, we have to wait for the
         * response to that before we can continue. We might not have even
         * flushed the request to the pageserver yet, it might be just sitting
         * in the output buffer. In that case, we flush it and wait for the
         * response. (We could decide not to send it, but it's hard to abort
         * when the request is already in the output buffer, and 'not sending'
         * a prefetch request kind of goes against the principles of
         * prefetching)
         */
        if (MyPState->ring_last + readahead_buffer_size == MyPState->ring_unused)
        {
            uint64      cleanup_index = MyPState->ring_last;

            slot = GetPrfSlot(cleanup_index);

            Assert(slot->status != PRFS_UNUSED);

            /*
             * If there is good reason to run compaction on the prefetch buffers,
             * try to do that.
             */
            if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers())
            {
                Assert(slot->status == PRFS_UNUSED);
            }
            else
            {
                /*
                 * We have the slot for ring_last, so that must still be in
                 * progress
                 */
                switch (slot->status)
                {
                    case PRFS_REQUESTED:
                        /* oldest request not answered yet: wait for it, then discard */
                        Assert(MyPState->ring_receive == cleanup_index);
                        if (!prefetch_wait_for(cleanup_index))
                            goto Retry;
                        prefetch_set_unused(cleanup_index);
                        pgBufferUsage.prefetch.expired += 1;
                        MyNeonCounters->getpage_prefetch_discards_total += 1;
                        break;
                    case PRFS_RECEIVED:
                    case PRFS_TAG_REMAINS:
                        /* nothing to wait for: just drop the slot */
                        prefetch_set_unused(cleanup_index);
                        pgBufferUsage.prefetch.expired += 1;
                        MyNeonCounters->getpage_prefetch_discards_total += 1;
                        break;
                    default:
                        pg_unreachable();
                }
            }
        }

        /*
         * The next buffer pointed to by `ring_unused` is now definitely empty, so
         * we can insert the new request to it.
         */
        last_ring_index = MyPState->ring_unused;

        Assert(MyPState->ring_last <= last_ring_index &&
               last_ring_index <= MyPState->ring_unused);

        slot = GetPrfSlotNoCheck(last_ring_index);

        Assert(slot->status == PRFS_UNUSED);

        /*
         * We must update the slot data before insertion, because the hash
         * function reads the buffer tag from the slot.
         */
        slot->buftag = hashkey.buftag;
        /*
         * NOTE(review): the shard is computed from 'tag' (the first block of
         * the range), not from hashkey.buftag (block tag.blockNum + i).  If
         * get_shard_number() depends on the block number, later blocks of a
         * range could be routed to a different shard than intended --
         * confirm against get_shard_number() and the callers.
         */
        slot->shard_no = get_shard_number(&tag);
        slot->my_ring_index = last_ring_index;
        slot->flags = 0;

        if (is_prefetch)
            MyNeonCounters->getpage_prefetch_requests_total++;
        else
            MyNeonCounters->getpage_sync_requests_total++;

        prefetch_do_request(slot, lsns);
    }

    MyNeonCounters->pageserver_open_requests =
        MyPState->ring_unused - MyPState->ring_receive;

    Assert(any_hits);
    Assert(last_ring_index != UINT64_MAX);

    Assert(GetPrfSlot(last_ring_index)->status == PRFS_REQUESTED ||
           GetPrfSlot(last_ring_index)->status == PRFS_RECEIVED);
    Assert(MyPState->ring_last <= last_ring_index &&
           last_ring_index < MyPState->ring_unused);

    /* Flush once enough unflushed requests have piled up (0 disables this). */
    if (flush_every_n_requests > 0 &&
        MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
    {
        if (!prefetch_flush_requests())
        {
            /*
             * Prefetch set is reset in case of error, so we should try to
             * register our request once again
             */
            goto Retry;
        }
        MyPState->ring_flush = MyPState->ring_unused;
    }

    return last_ring_index;
}

/*
 * Compare the header fields (reqid, lsn, not_modified_since) of two messages.
 * Returns true only if all three match.
 */
static bool
equal_requests(NeonRequest* a, NeonRequest* b)
{
    return a->reqid == b->reqid && a->lsn == b->lsn && a->not_modified_since == b->not_modified_since;
}

/*
 * Note: this function can get canceled and use a long jump to the next catch
 * context. Take care.
*/ static NeonResponse * page_server_request(void const *req) { NeonResponse *resp = NULL; BufferTag tag = {0}; shardno_t shard_no; switch (messageTag(req)) { case T_NeonExistsRequest: CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo); break; case T_NeonNblocksRequest: CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo); break; case T_NeonDbSizeRequest: NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode; break; case T_NeonGetPageRequest: CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo); tag.blockNum = ((NeonGetPageRequest *) req)->blkno; break; default: neon_log(PANIC, "Unexpected request tag: %d", messageTag(req)); } shard_no = get_shard_number(&tag); /* * Current sharding model assumes that all metadata is present only at shard 0. * We still need to call get_shard_no() to check if shard map is up-to-date. */ if (((NeonRequest *) req)->tag != T_NeonGetPageRequest) { shard_no = 0; } consume_prefetch_responses(); PG_TRY(); { before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); do { while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no)) { /* do nothing */ } MyNeonCounters->pageserver_open_requests++; resp = page_server->receive(shard_no); MyNeonCounters->pageserver_open_requests--; } while (resp == NULL); cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); } PG_CATCH(); { cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); /* Nothing should cancel disconnect: we should not leave connection in opaque state */ HOLD_INTERRUPTS(); page_server->disconnect(shard_no); MyNeonCounters->pageserver_open_requests = 0; RESUME_INTERRUPTS(); PG_RE_THROW(); } PG_END_TRY(); return resp; } StringInfoData nm_pack_request(NeonRequest *msg) { StringInfoData s; initStringInfo(&s); pq_sendbyte(&s, msg->tag); if (neon_protocol_version >= 3) { pq_sendint64(&s, msg->reqid); } pq_sendint64(&s, msg->lsn); pq_sendint64(&s, 
msg->not_modified_since); switch (messageTag(msg)) { /* pagestore_client -> pagestore */ case T_NeonExistsRequest: { NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); pq_sendbyte(&s, msg_req->forknum); break; } case T_NeonNblocksRequest: { NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); pq_sendbyte(&s, msg_req->forknum); break; } case T_NeonDbSizeRequest: { NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; pq_sendint32(&s, msg_req->dbNode); break; } case T_NeonGetPageRequest: { NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); pq_sendbyte(&s, msg_req->forknum); pq_sendint32(&s, msg_req->blkno); break; } case T_NeonGetSlruSegmentRequest: { NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; pq_sendbyte(&s, msg_req->kind); pq_sendint32(&s, msg_req->segno); break; } /* pagestore -> pagestore_client. We never need to create these. 
*/ case T_NeonExistsResponse: case T_NeonNblocksResponse: case T_NeonGetPageResponse: case T_NeonErrorResponse: case T_NeonDbSizeResponse: case T_NeonGetSlruSegmentResponse: default: neon_log(PANIC, "unexpected neon message tag 0x%02x", msg->tag); break; } return s; } NeonResponse * nm_unpack_response(StringInfo s) { NeonMessageTag tag = pq_getmsgbyte(s); NeonResponse resp_hdr = {0}; /* make valgrind happy */ NeonResponse *resp = NULL; resp_hdr.tag = tag; if (neon_protocol_version >= 3) { resp_hdr.reqid = pq_getmsgint64(s); resp_hdr.lsn = pq_getmsgint64(s); resp_hdr.not_modified_since = pq_getmsgint64(s); } switch (tag) { /* pagestore -> pagestore_client */ case T_NeonExistsResponse: { NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse)); if (neon_protocol_version >= 3) { NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); msg_resp->req.forknum = pq_getmsgbyte(s); } msg_resp->req.hdr = resp_hdr; msg_resp->exists = pq_getmsgbyte(s); pq_getmsgend(s); resp = (NeonResponse *) msg_resp; break; } case T_NeonNblocksResponse: { NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse)); if (neon_protocol_version >= 3) { NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); msg_resp->req.forknum = pq_getmsgbyte(s); } msg_resp->req.hdr = resp_hdr; msg_resp->n_blocks = pq_getmsgint(s, 4); pq_getmsgend(s); resp = (NeonResponse *) msg_resp; break; } case T_NeonGetPageResponse: { NeonGetPageResponse *msg_resp; msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE); if (neon_protocol_version >= 3) { NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); msg_resp->req.forknum 
= pq_getmsgbyte(s); msg_resp->req.blkno = pq_getmsgint(s, 4); } msg_resp->req.hdr = resp_hdr; /* XXX: should be varlena */ memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); pq_getmsgend(s); Assert(msg_resp->req.hdr.tag == T_NeonGetPageResponse); resp = (NeonResponse *) msg_resp; break; } case T_NeonDbSizeResponse: { NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse)); if (neon_protocol_version >= 3) { msg_resp->req.dbNode = pq_getmsgint(s, 4); } msg_resp->req.hdr = resp_hdr; msg_resp->db_size = pq_getmsgint64(s); pq_getmsgend(s); resp = (NeonResponse *) msg_resp; break; } case T_NeonErrorResponse: { NeonErrorResponse *msg_resp; size_t msglen; const char *msgtext; msgtext = pq_getmsgrawstring(s); msglen = strlen(msgtext); msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1); msg_resp->req = resp_hdr; memcpy(msg_resp->message, msgtext, msglen + 1); pq_getmsgend(s); resp = (NeonResponse *) msg_resp; break; } case T_NeonGetSlruSegmentResponse: { NeonGetSlruSegmentResponse *msg_resp; int n_blocks; msg_resp = palloc0(sizeof(NeonGetSlruSegmentResponse)); if (neon_protocol_version >= 3) { msg_resp->req.kind = pq_getmsgbyte(s); msg_resp->req.segno = pq_getmsgint(s, 4); } msg_resp->req.hdr = resp_hdr; n_blocks = pq_getmsgint(s, 4); msg_resp->n_blocks = n_blocks; memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ); pq_getmsgend(s); resp = (NeonResponse *) msg_resp; break; } /* * pagestore_client -> pagestore * * We create these ourselves, and don't need to decode them. 
*/ case T_NeonExistsRequest: case T_NeonNblocksRequest: case T_NeonGetPageRequest: case T_NeonDbSizeRequest: case T_NeonGetSlruSegmentRequest: default: neon_log(PANIC, "unexpected neon message tag 0x%02x", tag); break; } return resp; } /* dump to json for debugging / error reporting purposes */ char * nm_to_string(NeonMessage *msg) { StringInfoData s; initStringInfo(&s); switch (messageTag(msg)) { /* pagestore_client -> pagestore */ case T_NeonExistsRequest: { NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\""); appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } case T_NeonNblocksRequest: { NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\""); appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } case T_NeonGetPageRequest: { NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\""); appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", 
LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } case T_NeonDbSizeRequest: { NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } case T_NeonGetSlruSegmentRequest: { NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\""); appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } /* pagestore -> pagestore_client */ case T_NeonExistsResponse: { NeonExistsResponse *msg_resp = (NeonExistsResponse *) msg; appendStringInfoString(&s, "{\"type\": \"NeonExistsResponse\""); appendStringInfo(&s, ", \"exists\": %d}", msg_resp->exists); appendStringInfoChar(&s, '}'); break; } case T_NeonNblocksResponse: { NeonNblocksResponse *msg_resp = (NeonNblocksResponse *) msg; appendStringInfoString(&s, "{\"type\": \"NeonNblocksResponse\""); appendStringInfo(&s, ", \"n_blocks\": %u}", msg_resp->n_blocks); appendStringInfoChar(&s, '}'); break; } case T_NeonGetPageResponse: { NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg; appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\""); appendStringInfo(&s, ", \"rinfo\": %u/%u/%u", RelFileInfoFmt(msg_resp->req.rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_resp->req.forknum); appendStringInfo(&s, ", \"blkno\": %u", msg_resp->req.blkno); 
appendStringInfoChar(&s, '}'); break; } case T_NeonErrorResponse: { NeonErrorResponse *msg_resp = (NeonErrorResponse *) msg; /* FIXME: escape double-quotes in the message */ appendStringInfoString(&s, "{\"type\": \"NeonErrorResponse\""); appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); appendStringInfoChar(&s, '}'); break; } case T_NeonDbSizeResponse: { NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg; appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\""); appendStringInfo(&s, ", \"db_size\": " INT64_FORMAT "}", msg_resp->db_size); appendStringInfoChar(&s, '}'); break; } case T_NeonGetSlruSegmentResponse: { NeonGetSlruSegmentResponse *msg_resp = (NeonGetSlruSegmentResponse *) msg; appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentResponse\""); appendStringInfo(&s, ", \"n_blocks\": %u}", msg_resp->n_blocks); appendStringInfoChar(&s, '}'); break; } default: appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); } return s.data; } /* * communicator_init() -- Initialize per-backend private state */ void communicator_init(void) { Size prfs_size; if (MyPState != NULL) return; /* * Sanity check that theperf counters array is sized correctly. We got * this wrong once, and the formula for max number of backends and aux * processes might well change in the future, so better safe than sorry. * This is a very cheap check so we do it even without assertions. On * v14, this gets called before initializing MyProc, so we cannot perform * the check here. That's OK, we don't expect the logic to change in old * releases. 
*/ #if PG_VERSION_NUM>=150000 if (MyNeonCounters >= &neon_per_backend_counters_shared[NUM_NEON_PERF_COUNTER_SLOTS]) elog(ERROR, "MyNeonCounters points past end of array"); #endif prfs_size = offsetof(PrefetchState, prf_buffer) + sizeof(PrefetchRequest) * readahead_buffer_size; MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size); MyPState->n_unused = readahead_buffer_size; MyPState->bufctx = SlabContextCreate(TopMemoryContext, "NeonSMGR/prefetch", SLAB_DEFAULT_BLOCK_SIZE * 17, PS_GETPAGERESPONSE_SIZE); MyPState->errctx = AllocSetContextCreate(TopMemoryContext, "NeonSMGR/errors", ALLOCSET_DEFAULT_SIZES); MyPState->hashctx = AllocSetContextCreate(TopMemoryContext, "NeonSMGR/prefetch", ALLOCSET_DEFAULT_SIZES); MyPState->prf_hash = prfh_create(MyPState->hashctx, readahead_buffer_size, NULL); } /* * neon_prefetch_response_usable -- Can a new request be satisfied by old one? * * This is used to check if the response to a prefetch request can be used to * satisfy a page read now. */ static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns, PrefetchRequest *slot) { /* sanity check the LSN's on the old and the new request */ Assert(request_lsns->request_lsn >= request_lsns->not_modified_since); Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since); Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn); Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn); Assert(slot->status != PRFS_UNUSED); /* * The new request's LSN should never be older than the old one. This * could be an Assert, except that for testing purposes, we do provide an * interface in neon_test_utils to fetch pages at arbitary LSNs, which * violates this. 
* * Similarly, the not_modified_since value calculated for a page should * never move backwards. This assumption is a bit fragile; if we updated * the last-written cache when we read in a page, for example, then it * might. But as the code stands, it should not. * * (If two backends issue a request at the same time, they might race and * calculate LSNs "out of order" with each other, but the prefetch queue * is backend-private at the moment.) */ if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn || request_lsns->not_modified_since < slot->request_lsns.not_modified_since) { ereport(LOG, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "request with unexpected LSN after prefetch"), errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", LSN_FORMAT_ARGS(request_lsns->effective_request_lsn), LSN_FORMAT_ARGS(request_lsns->not_modified_since), LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); return false; } /*--- * Each request to the pageserver has three LSN values associated with it: * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'. * `not_modified_since` and `request_lsn` are sent to the pageserver, but * in the primary node, we always use UINT64_MAX as the `request_lsn`, so * we remember `effective_request_lsn` separately. In a primary, * `effective_request_lsn` is the same as `not_modified_since`. * See comments in neon_get_request_lsns why we can not use last flush WAL position here. * * To determine whether a response to a GetPage request issued earlier is * still valid to satisfy a new page read, we look at the * (not_modified_since, effective_request_lsn] range of the request. It is * effectively a claim that the page has not been modified between those * LSNs. If the range of the old request in the queue overlaps with the * new request, we know that the page hasn't been modified in the union of * the ranges. 
We can use the response to old request to satisfy the new * request in that case. For example: * * 100 500 * Old request: +--------+ * * 400 800 * New request: +--------+ * * The old request claims that the page was not modified between LSNs 100 * and 500, and the second claims that it was not modified between 400 and * 800. Together they mean that the page was not modified between 100 and * 800. Therefore the response to the old request is also valid for the * new request. * * This logic also holds at the boundary case that the old request's LSN * matches the new request's not_modified_since LSN exactly: * * 100 500 * Old request: +--------+ * * 500 900 * New request: +--------+ * * The response to the old request is the page as it was at LSN 500, and * the page hasn't been changed in the range (500, 900], therefore the * response is valid also for the new request. */ /* this follows from the checks above */ Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since); return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn; } /* * Does the physical file exist? 
*/
/*
 * communicator_exists -- ask the page server whether a relation fork exists.
 *
 * Sends a NeonExistsRequest built from rinfo/forkNum and the request_lsn /
 * not_modified_since values in *request_lsns, and returns the 'exists' field
 * of the response.
 *
 * With protocol version >= 3 the response is checked against the request
 * (equal_requests, relation and fork); a mismatch is reported through
 * NEON_PANIC_CONNECTION_STATE.  An error response from the page server is
 * raised as ERROR, so the function does not return in that case.
 */
bool
communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *request_lsns)
{
    bool        exists;
    NeonResponse *resp;

    {
        NeonExistsRequest request = {
            .hdr.tag = T_NeonExistsRequest,
            .hdr.lsn = request_lsns->request_lsn,
            .hdr.not_modified_since = request_lsns->not_modified_since,
            .rinfo = rinfo,
            .forknum = forkNum
        };

        resp = page_server_request(&request);

        switch (resp->tag)
        {
            case T_NeonExistsResponse:
            {
                NeonExistsResponse* exists_resp = (NeonExistsResponse *) resp;
                if (neon_protocol_version >= 3)
                {
                    /* the response must describe the request we just sent */
                    if (!equal_requests(resp, &request.hdr) ||
                        !RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) ||
                        exists_resp->req.forknum != request.forknum)
                    {
                        NEON_PANIC_CONNECTION_STATE(0, PANIC,
                            "Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
                            resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum,
                            request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum);
                    }
                }
                exists = exists_resp->exists;
                break;
            }
            case T_NeonErrorResponse:
                if (neon_protocol_version >= 3)
                {
                    /* a mismatched error response is only warned about */
                    if (!equal_requests(resp, &request.hdr))
                    {
                        elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
                             resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
                             request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
                    }
                }
                ereport(ERROR,
                        (errcode(ERRCODE_IO_ERROR),
                         errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
                                resp->reqid,
                                RelFileInfoFmt(rinfo),
                                forkNum,
                                LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)),
                         errdetail("page server returned error: %s",
                                   ((NeonErrorResponse *) resp)->message)));
                break;

            default:
                NEON_PANIC_CONNECTION_STATE(0, PANIC,
                                            "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
                                            T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
        }
        pfree(resp);
    }
    return exists;
}

/*
 * Read N pages at a specific LSN.
 *
 * Pages whose bit is set in *mask were read at a previous point in time;
 * they are skipped and their buffers are neither touched nor overwritten.
 *
 * NOTE(review): an earlier version of this comment said new bits should be
 * set in *mask for the pages successfully read.  In this function 'mask' is
 * const and is only read, so that sentence looks stale -- confirm against
 * the callers.
 *
 * The offsets in request_lsns, buffers, and mask are linked: element i of
 * each describes block base_blockno + i.
 *
 * A page-server error response for any block is raised as ERROR.
 */
void
communicator_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns,
                          void **buffers, BlockNumber nblocks, const bits8 *mask)
{
    NeonResponse *resp;
    uint64      ring_index;
    PrfHashEntry *entry;
    PrefetchRequest *slot;
    PrefetchRequest hashkey;

    Assert(PointerIsValid(request_lsns));
    Assert(nblocks >= 1);

    /*
     * Use an intermediate PrefetchRequest struct as the hash key to ensure
     * correct alignment and that the padding bytes are cleared.
     */
    memset(&hashkey.buftag, 0, sizeof(BufferTag));
    CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo);
    hashkey.buftag.forkNum = forkNum;
    hashkey.buftag.blockNum = base_blockno;

    /*
     * The redo process does not lock pages that it needs to replay but are
     * not in the shared buffers, so a concurrent process may request the page
     * after redo has decided it won't redo that page and updated the LwLSN
     * for that page. If we're in hot standby we need to take care that we
     * don't return until after REDO has finished replaying up to that LwLSN,
     * as the page should have been locked up to that point.
     *
     * See also the description on neon_redo_read_buffer_filter below.
     *
     * NOTE: It is possible that the WAL redo process will still do IO due to
     * concurrent failed read IOs. Those IOs should never have a request_lsn
     * that is as large as the WAL record we're currently replaying, if it
     * weren't for the behaviour of the LwLsn cache that uses the highest
     * value of the LwLsn cache when the entry is not found.
     */
    /* register requests for the whole range up front; result is not needed */
    (void) prefetch_register_bufferv(hashkey.buftag, request_lsns, nblocks, mask, false);

    for (int i = 0; i < nblocks; i++)
    {
        void       *buffer = buffers[i];
        BlockNumber blockno = base_blockno + i;
        neon_request_lsns *reqlsns = &request_lsns[i];
        TimestampTz start_ts, end_ts;

        /* already satisfied earlier: leave this buffer alone */
        if (PointerIsValid(mask) && BITMAP_ISSET(mask, i))
            continue;

        start_ts = GetCurrentTimestamp();

        if (RecoveryInProgress() && MyBackendType != B_STARTUP)
            XLogWaitForReplayOf(reqlsns->request_lsn);

        /*
         * Try to find prefetched page in the list of received pages.
         */
Retry:
        hashkey.buftag.blockNum = blockno;
        entry = prfh_lookup(MyPState->prf_hash, &hashkey);

        if (entry != NULL)
        {
            slot = entry->slot;
            if (neon_prefetch_response_usable(reqlsns, slot))
            {
                ring_index = slot->my_ring_index;
            }
            else
            {
                /*
                 * Cannot use this prefetch, discard it
                 *
                 * We can't drop cache for not-yet-received requested items. It is
                 * unlikely this happens, but it can happen if prefetch distance
                 * is large enough and a backend didn't consume all prefetch
                 * requests.
                 */
                if (slot->status == PRFS_REQUESTED)
                {
                    /* if the wait fails, look the entry up again from scratch */
                    if (!prefetch_wait_for(slot->my_ring_index))
                        goto Retry;
                }
                /* drop caches */
                prefetch_set_unused(slot->my_ring_index);
                pgBufferUsage.prefetch.expired += 1;
                MyNeonCounters->getpage_prefetch_discards_total++;
                /* make it look like a prefetch cache miss */
                entry = NULL;
            }
        }

        do
        {
            if (entry == NULL)
            {
                /* no usable prefetch: issue a request for just this block */
                ring_index = prefetch_register_bufferv(hashkey.buftag, reqlsns, 1, NULL, false);
                Assert(ring_index != UINT64_MAX);
                slot = GetPrfSlot(ring_index);
            }
            else
            {
                /*
                 * Empty our reference to the prefetch buffer's hash entry. When
                 * we wait for prefetches, the entry reference is invalidated by
                 * potential updates to the hash, and when we reconnect to the
                 * pageserver the prefetch we're waiting for may be dropped, in
                 * which case we need to retry and take the branch above.
                 */
                entry = NULL;
            }

            Assert(slot->my_ring_index == ring_index);
            Assert(MyPState->ring_last <= ring_index &&
                   MyPState->ring_unused > ring_index);
            Assert(slot->status != PRFS_UNUSED);
            Assert(GetPrfSlot(ring_index) == slot);

        } while (!prefetch_wait_for(ring_index));

        Assert(slot->status == PRFS_RECEIVED);
        Assert(memcmp(&hashkey.buftag, &slot->buftag, sizeof(BufferTag)) == 0);
        Assert(hashkey.buftag.blockNum == base_blockno + i);

        /* We already checked that response match request when storing it in slot */
        resp = slot->response;

        switch (resp->tag)
        {
            case T_NeonGetPageResponse:
            {
                NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp;
                memcpy(buffer, getpage_resp->page, BLCKSZ);

                /*
                 * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received
                 * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here
                 * under buffer lock.
                 */
                if (!lfc_store_prefetch_result)
                    lfc_write(rinfo, forkNum, blockno, buffer);
                break;
            }
            case T_NeonErrorResponse:
                ereport(ERROR,
                        (errcode(ERRCODE_IO_ERROR),
                         errmsg(NEON_TAG "[shard %d, reqid " UINT64_HEX_FORMAT "] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
                                slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo),
                                forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)),
                         errdetail("page server returned error: %s",
                                   ((NeonErrorResponse *) resp)->message)));
                break;
            default:
                NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
                                            "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
                                            T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
        }

        /* buffer was used, clean up for later reuse */
        prefetch_set_unused(ring_index);
        prefetch_cleanup_trailing_unused();

        /* account the time spent on this block; clamp if the clock went backwards */
        end_ts = GetCurrentTimestamp();
        inc_getpage_wait(end_ts >= start_ts ? (end_ts - start_ts) : 0);
    }
}

/*
 * communicator_nblocks() -- Get the number of blocks stored in a relation.
 *
 * Sends a NeonNblocksRequest for the given relation fork and returns the
 * n_blocks field of the response.  Response validation and error handling
 * follow the same pattern as communicator_exists(): a mismatched response is
 * reported through NEON_PANIC_CONNECTION_STATE (protocol version >= 3), and
 * an error response from the page server is raised as ERROR.
 */
BlockNumber
communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *request_lsns)
{
    NeonResponse *resp;
    BlockNumber n_blocks;

    {
        NeonNblocksRequest request = {
            .hdr.tag = T_NeonNblocksRequest,
            .hdr.lsn = request_lsns->request_lsn,
            .hdr.not_modified_since = request_lsns->not_modified_since,
            .rinfo = rinfo,
            .forknum = forknum,
        };

        resp = page_server_request(&request);

        switch (resp->tag)
        {
            case T_NeonNblocksResponse:
            {
                NeonNblocksResponse * relsize_resp = (NeonNblocksResponse *) resp;
                if (neon_protocol_version >= 3)
                {
                    if (!equal_requests(resp, &request.hdr) ||
                        !RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) ||
                        relsize_resp->req.forknum != forknum)
                    {
                        NEON_PANIC_CONNECTION_STATE(0, PANIC,
                            "Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
                            resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum,
                            request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum);
                    }
                }
                n_blocks = relsize_resp->n_blocks;
                break;
            }
            case T_NeonErrorResponse:
                if (neon_protocol_version >= 3)
                {
                    if (!equal_requests(resp, &request.hdr))
                    {
                        elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
                             resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
                             request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
                    }
                }
                ereport(ERROR,
                        (errcode(ERRCODE_IO_ERROR),
                         errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
                                resp->reqid,
                                RelFileInfoFmt(rinfo),
                                forknum,
                                LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)),
                         errdetail("page server returned error: %s",
                                   ((NeonErrorResponse *) resp)->message)));
                break;

            default:
                NEON_PANIC_CONNECTION_STATE(0, PANIC,
                                            "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
                                            T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
        }
        pfree(resp);
    }
    return n_blocks;
}

/*
 * neon_db_size() -- Get the size of the database in bytes.
*/ int64 communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns) { NeonResponse *resp; int64 db_size; { NeonDbSizeRequest request = { .hdr.tag = T_NeonDbSizeRequest, .hdr.lsn = request_lsns->request_lsn, .hdr.not_modified_since = request_lsns->not_modified_since, .dbNode = dbNode, }; resp = page_server_request(&request); switch (resp->tag) { case T_NeonDbSizeResponse: { NeonDbSizeResponse* dbsize_resp = (NeonDbSizeResponse *) resp; if (neon_protocol_version >= 3) { if (!equal_requests(resp, &request.hdr) || dbsize_resp->req.dbNode != dbNode) { NEON_PANIC_CONNECTION_STATE(0, PANIC, "Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, dbNode=%u}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode); } } db_size = dbsize_resp->db_size; break; } case T_NeonErrorResponse: if (neon_protocol_version >= 3) { if (!equal_requests(resp, &request.hdr)) { elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); } } ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read db size of db %u from page server at lsn %X/%08X", resp->reqid, dbNode, LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; default: NEON_PANIC_CONNECTION_STATE(0, PANIC, "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", 
T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); } pfree(resp); } return db_size; } int communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *request_lsns, void *buffer) { int n_blocks; shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ NeonResponse *resp = NULL; NeonGetSlruSegmentRequest request; request = (NeonGetSlruSegmentRequest) { .hdr.tag = T_NeonGetSlruSegmentRequest, .hdr.lsn = request_lsns->request_lsn, .hdr.not_modified_since = request_lsns->not_modified_since, .kind = kind, .segno = segno }; consume_prefetch_responses(); PG_TRY(); { before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); do { while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no)); resp = page_server->receive(shard_no); } while (resp == NULL); cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); } PG_CATCH(); { cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); /* Nothing should cancel disconnect: we should not leave connection in opaque state */ HOLD_INTERRUPTS(); page_server->disconnect(shard_no); RESUME_INTERRUPTS(); PG_RE_THROW(); } PG_END_TRY(); switch (resp->tag) { case T_NeonGetSlruSegmentResponse: { NeonGetSlruSegmentResponse* slru_resp = (NeonGetSlruSegmentResponse *) resp; if (neon_protocol_version >= 3) { if (!equal_requests(resp, &request.hdr) || slru_resp->req.kind != kind || slru_resp->req.segno != segno) { NEON_PANIC_CONNECTION_STATE(0, PANIC, "Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, (unsigned long long) segno); } } n_blocks = slru_resp->n_blocks; memcpy(buffer, slru_resp->data, 
n_blocks*BLCKSZ); break; } case T_NeonErrorResponse: if (neon_protocol_version >= 3) { if (!equal_requests(resp, &request.hdr)) { elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); } } ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read SLRU %d segment %llu at lsn %X/%08X", resp->reqid, kind, (unsigned long long) segno, LSN_FORMAT_ARGS(request_lsns->request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; default: NEON_PANIC_CONNECTION_STATE(0, PANIC, "Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x", T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag); } pfree(resp); communicator_reconfigure_timeout_if_needed(); return n_blocks; } void communicator_reconfigure_timeout_if_needed(void) { bool needs_set = MyPState->ring_receive != MyPState->ring_unused && !AmPrewarmWorker && /* do not pump prefetch state in prewarm worker */ readahead_getpage_pull_timeout_ms > 0; if (needs_set != timeout_set) { /* The background writer doens't (shouldn't) read any pages */ Assert(!AmBackgroundWriterProcess()); /* The checkpointer doens't (shouldn't) read any pages */ Assert(!AmCheckpointerProcess()); if (unlikely(PS_TIMEOUT_ID == 0)) { PS_TIMEOUT_ID = RegisterTimeout(USER_TIMEOUT, pagestore_timeout_handler); } if (needs_set) { #if PG_MAJORVERSION_NUM <= 14 enable_timeout_after(PS_TIMEOUT_ID, readahead_getpage_pull_timeout_ms); #else enable_timeout_every( PS_TIMEOUT_ID, TimestampTzPlusMilliseconds(GetCurrentTimestamp(), readahead_getpage_pull_timeout_ms), readahead_getpage_pull_timeout_ms ); #endif 
timeout_set = true; } else { Assert(timeout_set); disable_timeout(PS_TIMEOUT_ID, false); timeout_set = false; } } } static void pagestore_timeout_handler(void) { #if PG_MAJORVERSION_NUM <= 14 /* * PG14: Setting a repeating timeout is not possible, so we signal here * that the timeout has already been reset, and by telling the system * that system will re-schedule it later if we need to. */ timeout_set = false; #endif timeout_signaled = true; InterruptPending = true; } /* * Process new data received in our active PageStream sockets. * * This relies on the invariant that all pipelined yet-to-be-received requests * are getPage requests managed by MyPState. This is currently true, any * modification will probably require some stuff to make it work again. */ static bool communicator_processinterrupts(void) { if (timeout_signaled) { if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0) communicator_prefetch_pump_state(); timeout_signaled = false; communicator_reconfigure_timeout_if_needed(); } if (!prev_interrupt_cb) return false; return prev_interrupt_cb(); } ================================================ FILE: pgxn/neon/communicator.h ================================================ /*------------------------------------------------------------------------- * * communicator.h * internal interface for communicating with remote pageservers * * * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * *------------------------------------------------------------------------- */ #ifndef COMMUNICATOR_h #define COMMUNICATOR_h #include "neon_pgversioncompat.h" #include "storage/buf_internals.h" #include "pagestore_client.h" /* initialization at postmaster startup */ extern void pg_init_communicator(void); /* initialization at backend startup */ extern void communicator_init(void); extern bool communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns 
*request_lsns); extern BlockNumber communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *request_lsns); extern int64 communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns); extern void communicator_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns, void **buffers, BlockNumber nblocks, const bits8 *mask); extern int communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, neon_request_lsns *lsns, BlockNumber nblocks, void **buffers, bits8 *mask); extern void communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, BlockNumber nblocks, const bits8 *mask); extern bool communicator_prefetch_receive(BufferTag tag); extern int communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *request_lsns, void *buffer); extern void communicator_reconfigure_timeout_if_needed(void); extern void communicator_prefetch_pump_state(void); #endif ================================================ FILE: pgxn/neon/communicator_process.c ================================================ /*------------------------------------------------------------------------- * * communicator_process.c * Functions for starting up the communicator background worker process. * * Currently, the communicator process only functions as a metrics * exporter. It provides an HTTP endpoint for polling a limited set of * metrics. TODO: In the future, it will do much more, i.e. handle all * the communications with the pageservers. 
*
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

/* NOTE(review): the header name on the next line is missing here (likely lost in extraction) -- restore it from the upstream file */
#include

#include "miscadmin.h"
#include "postmaster/bgworker.h"
#include "postmaster/interrupt.h"
#include "postmaster/postmaster.h"
#include "replication/walsender.h"
#include "storage/ipc.h"
#include "storage/latch.h"
#include "storage/pmsignal.h"
#include "storage/procsignal.h"
#include "tcop/tcopprot.h"
#include "utils/timestamp.h"

#include "communicator_process.h"
#include "file_cache.h"
#include "neon.h"
#include "neon_perf_counters.h"

/* the rust bindings, generated by cbindgen */
#include "communicator/communicator_bindings.h"

static void pump_logging(struct LoggingReceiver *logging);
PGDLLEXPORT void communicator_new_bgworker_main(Datum main_arg);

/**** Initialization functions. These run in postmaster ****/

/*
 * Register the "Storage communicator process" background worker.
 *
 * The worker is started at postmaster start, has shared memory access, runs
 * communicator_new_bgworker_main() from the "neon" library, and has a
 * restart time of 5.
 */
void
pg_init_communicator_process(void)
{
    BackgroundWorker bgw;

    /* Initialize the background worker process */
    memset(&bgw, 0, sizeof(bgw));
    bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
    bgw.bgw_start_time = BgWorkerStart_PostmasterStart;
    snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
    snprintf(bgw.bgw_function_name, BGW_MAXLEN, "communicator_new_bgworker_main");
    snprintf(bgw.bgw_name, BGW_MAXLEN, "Storage communicator process");
    snprintf(bgw.bgw_type, BGW_MAXLEN, "Storage communicator process");
    bgw.bgw_restart_time = 5;
    bgw.bgw_notify_pid = 0;
    bgw.bgw_main_arg = (Datum) 0;

    RegisterBackgroundWorker(&bgw);
}

/**** Worker process functions. These run in the communicator worker process ****/

/*
 * Entry point for the communicator bgworker process
 *
 * Sets up signal handling and logging, launches the Rust side via
 * communicator_worker_launch(), and then loops forever forwarding log
 * messages and processing interrupts.  main_arg is not used.  The function
 * does not return normally: a failed launch is reported with PANIC.
 */
void
communicator_new_bgworker_main(Datum main_arg)
{
    struct LoggingReceiver *logging;
    const char *errmsg = NULL;
    const struct CommunicatorWorkerProcessStruct *proc_handle;

    /*
     * Pretend that this process is a WAL sender. That affects the shutdown
     * sequence: WAL senders are shut down last, after the final checkpoint
     * has been written. That's what we want for the communicator process too.
     */
    am_walsender = true;
    MarkPostmasterChildWalSender();

    /* Establish signal handlers. */
    pqsignal(SIGUSR1, procsignal_sigusr1_handler);
    /*
     * Postmaster sends us SIGUSR2 when all regular backends and bgworkers
     * have exited, and it's time for us to exit too
     */
    pqsignal(SIGUSR2, die);
    pqsignal(SIGHUP, SignalHandlerForConfigReload);
    pqsignal(SIGTERM, die);

    BackgroundWorkerUnblockSignals();

    /*
     * By default, INFO messages are not printed to the log. We want
     * `tracing::info!` messages emitted from the communicator to be printed,
     * however, so increase the log level.
     *
     * XXX: This overrides any user-set value from the config file. That's not
     * great, but on the other hand, there should be little reason for user to
     * control the verbosity of the communicator. It's not too verbose by
     * default.
     */
    SetConfigOption("log_min_messages", "INFO", PGC_SUSET, PGC_S_OVERRIDE);

    logging = communicator_worker_configure_logging();

    /* empty tenant/timeline settings are passed to the Rust side as NULL */
    proc_handle = communicator_worker_launch(
        neon_tenant[0] == '\0' ? NULL : neon_tenant,
        neon_timeline[0] == '\0' ? NULL : neon_timeline,
        &errmsg
        );
    if (proc_handle == NULL)
    {
        /*
         * Something went wrong. Before exiting, forward any log messages that
         * might've been generated during the failed launch.
         */
        pump_logging(logging);

        elog(PANIC, "%s", errmsg);
    }

    /*
     * The Rust tokio runtime has been launched, and it's running in the
     * background now. This loop in the main thread handles any interactions
     * we need with the rest of PostgreSQL.
     *
     * NB: This process is now multi-threaded! The Rust threads do not call
     * into any Postgres functions, but it's not entirely clear which Postgres
     * functions are safe to call from this main thread either. Be very
     * careful about adding anything non-trivial here.
     *
     * Also note that we try to react quickly to any log messages arriving
     * from the Rust thread. Be careful to not do anything too expensive here
     * that might cause delays.
     */
    elog(LOG, "communicator threads started");
    for (;;)
    {
        TimestampTz before;
        long        duration;

        /* reset before draining, so a SetLatch during the drain is not lost */
        ResetLatch(MyLatch);

        /*
         * Forward any log messages from the Rust threads into the normal
         * Postgres logging facility.
         */
        pump_logging(logging);

        /*
         * Check interrupts like system shutdown or config reload
         *
         * We mustn't block for too long within this loop, or we risk the log
         * queue to fill up and messages to be lost. Also, even if we can keep
         * up, if there's a long delay between sending a message and printing
         * it to the log, the timestamps on the messages get skewed, which is
         * confusing.
         *
         * We expect processing interrupts to happen fast enough that it's OK,
         * but measure it just in case, and print a warning if it takes longer
         * than 100 ms.
         */
#define LOG_SKEW_WARNING_MS 100
        before = GetCurrentTimestamp();

        CHECK_FOR_INTERRUPTS();
        if (ConfigReloadPending)
        {
            ConfigReloadPending = false;
            ProcessConfigFile(PGC_SIGHUP);
        }

        duration = TimestampDifferenceMilliseconds(before, GetCurrentTimestamp());
        if (duration > LOG_SKEW_WARNING_MS)
            elog(WARNING, "handling interrupts took %ld ms, communicator log timestamps might be skewed", duration);

        /*
         * Wait until we are woken up. The rust threads will set the latch
         * when there's a log message to forward.
         */
        (void) WaitLatch(MyLatch,
                         WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
                         0,
                         PG_WAIT_EXTENSION);
    }
}

/*
 * Drain the logging channel and re-emit each message through the Postgres
 * error reporting machinery, prefixed with "[COMMUNICATOR]".
 *
 * Polls communicator_worker_poll_logging() until it returns 0 (nothing
 * left).  A return of 1 carries one message in errbuf with its level in
 * elevel; -1 means the channel was closed and is raised as ERROR.  After
 * the loop, a single WARNING reports how many messages were dropped since
 * the previous report.
 */
static void
pump_logging(struct LoggingReceiver *logging)
{
    char        errbuf[1000];
    int         elevel;
    int32       rc;
    static uint64_t last_dropped_event_count = 0;   /* value at the last report */
    uint64_t    dropped_event_count;
    uint64_t    dropped_now;

    for (;;)
    {
        /*
         * NOTE(review): dropped_event_count is read after the loop without
         * being initialized here; this assumes every poll call writes it --
         * TODO confirm on the Rust side.
         */
        rc = communicator_worker_poll_logging(logging,
                                              errbuf,
                                              sizeof(errbuf),
                                              &elevel,
                                              &dropped_event_count);
        if (rc == 0)
        {
            /* nothing to do */
            break;
        }
        else if (rc == 1)
        {
            /* Because we don't want to exit on error */

            if (message_level_is_interesting(elevel))
            {
                /*
                 * Prevent interrupts while cleaning up.
                 *
                 * (Not sure if this is required, but all the error handlers
                 * in Postgres that are installed as sigsetjmp() targets do
                 * this, so let's follow the example)
                 */
                HOLD_INTERRUPTS();

                errstart(elevel, TEXTDOMAIN);
                errmsg_internal("[COMMUNICATOR] %s", errbuf);
                EmitErrorReport();
                FlushErrorState();

                /* Now we can allow interrupts again */
                RESUME_INTERRUPTS();
            }
        }
        else if (rc == -1)
        {
            elog(ERROR, "logging channel was closed unexpectedly");
        }
    }

    /*
     * If the queue was full at any time since the last time we reported it,
     * report how many messages were lost. We do this outside the loop, so
     * that if the logging system is clogged, we don't exacerbate it by
     * printing lots of warnings about dropped messages.
     */
    dropped_now = dropped_event_count - last_dropped_event_count;
    if (dropped_now != 0)
    {
        elog(WARNING, "%lu communicator log messages were dropped because the log buffer was full",
             (unsigned long) dropped_now);
        last_dropped_event_count = dropped_event_count;
    }
}

/****
 * Callbacks from the rust code, in the communicator process.
 *
 * NOTE: These must be thread-safe!  It's very limited which PostgreSQL
 * functions you can use!!!
 *
 * The signatures of these need to match those in the Rust code.
*/
void
callback_set_my_latch_unsafe(void)
{
    /* wake up whoever is waiting on this process's latch */
    SetLatch(MyLatch);
}

================================================
FILE: pgxn/neon/communicator_process.h
================================================
/*-------------------------------------------------------------------------
 *
 * communicator_process.h
 *    Communicator process
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */
#ifndef COMMUNICATOR_PROCESS_H
#define COMMUNICATOR_PROCESS_H

extern void pg_init_communicator_process(void);

#endif /* COMMUNICATOR_PROCESS_H */

================================================
FILE: pgxn/neon/extension_server.c
================================================
/*-------------------------------------------------------------------------
 *
 * extension_server.c
 *    Request compute_ctl to download extension files.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

/* NOTE(review): the header name on the next line is missing here (likely lost in extraction) -- restore it from the upstream file */
#include

#include "utils/guc.h"

#include "extension_server.h"
#include "neon_utils.h"

/* GUC-backed settings; defaults here match the boot values given at registration */
int         hadron_extension_server_port = 0;
static int  extension_server_request_timeout = 60;
static int  extension_server_connect_timeout = 60;

/* value of download_extension_file_hook before this module installed its own */
static download_extension_file_hook_type prev_download_extension_file_hook = NULL;

/*
 * to download all SQL (and data) files for an extension:
 * curl -X POST http://localhost:8080/extension_server/postgis
 * it covers two possible extension files layouts:
 * 1. extension_name--version--platform.sql
 * 2.
extension_name/extension_name--version.sql * extension_name/extra_files.csv * to download specific library file: * curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true */ static bool neon_download_extension_file_http(const char *filename, bool is_library) { CURLcode res; bool ret = false; CURL *handle = NULL; char *compute_ctl_url; handle = alloc_curl_handle(); curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); if (extension_server_request_timeout > 0) curl_easy_setopt(handle, CURLOPT_TIMEOUT, (long)extension_server_request_timeout /* seconds */ ); if (extension_server_connect_timeout > 0) curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, (long)extension_server_connect_timeout /* seconds */ ); compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s", hadron_extension_server_port, filename, is_library ? "?is_library=true" : ""); elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url); curl_easy_setopt(handle, CURLOPT_URL, compute_ctl_url); /* Perform the request, res will get the return code */ res = curl_easy_perform(handle); curl_easy_cleanup(handle); /* Check for errors */ if (res == CURLE_OK) { ret = true; } else { /* * Don't error here because postgres will try to find the file and will * fail with some proper error message if it's not found. */ elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res)); } return ret; } void pg_init_extension_server() { /* Port to connect to compute_ctl on localhost */ /* to request extension files. 
*/ DefineCustomIntVariable("neon.extension_server_port", "connection string to the compute_ctl", NULL, &hadron_extension_server_port, 0, 0, INT_MAX, PGC_POSTMASTER, 0, /* no flags required */ NULL, NULL, NULL); DefineCustomIntVariable("neon.extension_server_request_timeout", "timeout for fetching extensions in seconds", NULL, &extension_server_request_timeout, 60, 0, INT_MAX, PGC_SUSET, GUC_UNIT_S, NULL, NULL, NULL); DefineCustomIntVariable("neon.extension_server_connect_timeout", "timeout for connecting to the extension server in seconds", NULL, &extension_server_connect_timeout, 60, 0, INT_MAX, PGC_SUSET, GUC_UNIT_S, NULL, NULL, NULL); /* set download_extension_file_hook */ prev_download_extension_file_hook = download_extension_file_hook; download_extension_file_hook = neon_download_extension_file_http; } ================================================ FILE: pgxn/neon/extension_server.h ================================================ /*------------------------------------------------------------------------- * * extension_server.h * Request compute_ctl to download extension files. 
* *------------------------------------------------------------------------- */
#ifndef EXTENSION_SERVER_H
#define EXTENSION_SERVER_H

/* Module initialization entry point for the extension server client. */
void pg_init_extension_server(void);

#endif /* EXTENSION_SERVER_H */

================================================
FILE: pgxn/neon/file_cache.c
================================================
/*-------------------------------------------------------------------------
 *
 * file_cache.c
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

/* NOTE(review): the next three directives have no header name in this copy of the file - confirm against the repository */
#include
#include
#include

#include "neon_pgversioncompat.h"

#include "access/parallel.h"
#include "access/xlog.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "common/hashfn.h"
#include "pgstat.h"
#include "port/pg_iovec.h"
#include "postmaster/bgworker.h"
#include "postmaster/interrupt.h"
#include RELFILEINFO_HDR
#include "storage/buf_internals.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/pg_shmem.h"
#include "storage/procsignal.h"
#include "tcop/tcopprot.h"
#include "utils/builtins.h"
#include "utils/dynahash.h"
#include "utils/guc.h"

#if PG_VERSION_NUM >= 150000
#include "access/xlogrecovery.h"
#endif

#include "hll.h"
#include "bitmap.h"
#include "file_cache.h"
#include "neon.h"
#include "neon_lwlsncache.h"
#include "neon_perf_counters.h"
#include "neon_utils.h"
#include "pagestore_client.h"
#include "communicator.h"

#include "communicator/communicator_bindings.h"

/*
 * Assertion that is not compiled out: when 'cond' is false it reports the
 * failed expression together with the source file and line via elog(PANIC).
 */
#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "LFC: assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)

/*
 * Local file cache is used to temporary store relations pages in local file system.
 * All blocks of all relations are stored inside one file and addressed using shared hash map.
 * Currently an LRU eviction policy based on a doubly-linked (L2) list is used as the replacement algorithm.
 * Because manipulating the L2 list requires a global critical section, we do not use a partitioned hash.
 * We also take the exclusive lock even for read operations, because LRU requires relinking the element in the L2 list.
 * If this lock becomes a bottleneck, we can consider other eviction strategies, for example the clock algorithm.
 *
 * The cache is always reconstructed at node startup, so we do not need to save the mapping anywhere or worry about
 * its consistency.
 *
 * ## Holes
 *
 * The LFC can be resized on the fly, up to a maximum size that's determined
 * at server startup (neon.max_file_cache_size). After server startup, we
 * expand the underlying file when needed, until it reaches the soft limit
 * (neon.file_cache_size_limit). If the soft limit is later reduced, we shrink
 * the LFC by punching holes in the underlying file with a
 * fallocate(FALLOC_FL_PUNCH_HOLE) call. The nominal size of the file doesn't
 * shrink, but the disk space it uses does.
 *
 * Each hole is tracked by a dummy FileCacheEntry, which is kept in the
 * 'holes' linked list. The dummy entries are entered into the chunk hash table
 * with a special key where the blockNumber is used to store the 'offset' of the
 * hole, and all other fields are zero. Holes are never looked up in the hash
 * table; we only enter them there to have a FileCacheEntry that we can keep
 * in the linked list. If the soft limit is raised again, we reuse the holes
 * before extending the nominal size of the file.
 */

/* Local file storage allocation chunk.
 * Should be a power of two. Using chunks larger than a page can
 * 1. Reduce hash-map memory footprint: an 8TB database contains a billion pages
 *    and the size of a hash entry is 40 bytes, so we need 40GB just for the hash map.
 *    1MB chunks can reduce the hash map size to 320MB.
 * 2.
Improve access locality, subsequent pages will be allocated together improving seqscan speed */ #define MAX_BLOCKS_PER_CHUNK_LOG 7 /* 1Mb chunk */ #define MAX_BLOCKS_PER_CHUNK (1 << MAX_BLOCKS_PER_CHUNK_LOG) #define MB ((uint64)1024*1024) #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ >> lfc_chunk_size_log)) #define BLOCK_TO_CHUNK_OFF(blkno) ((blkno) & (lfc_blocks_per_chunk-1)) /* * Blocks are read or written to LFC file outside LFC critical section. * To synchronize access to such block, writer set state of such block to PENDING. * If some other backend (read or writer) see PENDING status, it change it to REQUESTED and start * waiting until status is changed on conditional variable. * When writer completes is operation, it checks if status is REQUESTED and if so, broadcast conditional variable, * waking up all backend waiting for access to this block. */ typedef enum FileCacheBlockState { UNAVAILABLE, /* block is not present in cache */ AVAILABLE, /* block can be used */ PENDING, /* block is loaded */ REQUESTED /* some other backend is waiting for block to be loaded */ } FileCacheBlockState; typedef struct FileCacheEntry { BufferTag key; uint32 hash; uint32 offset; uint32 access_count; dlist_node list_node; /* LRU/holes list node */ uint32 state[FLEXIBLE_ARRAY_MEMBER]; /* two bits per block */ } FileCacheEntry; #define FILE_CACHE_ENRTY_SIZE MAXALIGN(offsetof(FileCacheEntry, state) + (lfc_blocks_per_chunk*2+31)/32*4) #define GET_STATE(entry, i) (((entry)->state[(i) / 16] >> ((i) % 16 * 2)) & 3) #define SET_STATE(entry, i, new_state) (entry)->state[(i) / 16] = ((entry)->state[(i) / 16] & ~(3 << ((i) % 16 * 2))) | ((new_state) << ((i) % 16 * 2)) #define N_COND_VARS 64 #define CV_WAIT_TIMEOUT 10 #define MAX_PREWARM_WORKERS 8 typedef struct PrewarmWorkerState { uint32 prewarmed_pages; uint32 skipped_pages; TimestampTz completed; } PrewarmWorkerState; typedef struct FileCacheControl { uint64 generation; /* generation is needed to handle correct hash * 
reenabling */ uint32 size; /* size of cache file in chunks */ uint32 used; /* number of used chunks */ uint32 used_pages; /* number of used pages */ uint32 pinned; /* number of pinned chunks */ uint32 limit; /* shared copy of lfc_size_limit */ uint64 hits; uint64 misses; uint64 writes; /* number of writes issued */ uint64 time_read; /* time spent reading (us) */ uint64 time_write; /* time spent writing (us) */ uint64 resizes; /* number of LFC resizes */ uint64 evicted_pages; /* number of evicted pages */ dlist_head lru; /* double linked list for LRU replacement * algorithm */ dlist_head holes; /* double linked list of punched holes */ ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */ /* * Estimation of working set size. * * This is not guarded by the lock. No locking is needed because all the * writes to the "registers" are simple 64-bit stores, to update a * timestamp. We assume that: * * - 64-bit stores are atomic. We could enforce that by using * pg_atomic_uint64 instead of TimestampTz as the datatype in hll.h, but * for now we just rely on it implicitly. * * - Even if they're not, and there is a race between two stores, it * doesn't matter much which one wins because they're both updating the * register with the current timestamp. Or you have a race between * resetting the register and updating it, in which case it also doesn't * matter much which one wins. * * - If they're not atomic, you might get an occasional "torn write" if * you're really unlucky, but we tolerate that too. It just means that * the estimate will be a little off, until the register is updated * again. 
*/ HyperLogLogState wss_estimation; /* Prewarmer state */ PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS]; size_t n_prewarm_workers; size_t n_prewarm_entries; size_t total_prewarm_pages; size_t prewarm_batch; bool prewarm_active; bool prewarm_canceled; dsm_handle prewarm_lfc_state_handle; } FileCacheControl; #define FILE_CACHE_STATE_MAGIC 0xfcfcfcfc #define FILE_CACHE_STATE_BITMAP(fcs) ((uint8*)&(fcs)->chunks[(fcs)->n_chunks]) #define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks) (sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * lfc_blocks_per_chunk)+7)/8) #define FILE_CACHE_STATE_SIZE(fcs) (sizeof(FileCacheState) + (fcs->n_chunks)*sizeof(BufferTag) + (((fcs->n_chunks) << fcs->chunk_size_log)+7)/8) static HTAB *lfc_hash; static int lfc_desc = -1; static LWLockId lfc_lock; static int lfc_max_size; static int lfc_size_limit; static int lfc_prewarm_limit; static int lfc_prewarm_batch; static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG; static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK; static char *lfc_path; static uint64 lfc_generation; static FileCacheControl *lfc_ctl; static bool lfc_do_prewarm; bool lfc_store_prefetch_result; bool lfc_prewarm_update_ws_estimation; bool AmPrewarmWorker; #define LFC_ENABLED() (lfc_ctl->limit != 0) PGDLLEXPORT void lfc_prewarm_main(Datum main_arg); /* * Close LFC file if opened. * All backends should close their LFC files once LFC is disabled. */ static void lfc_close_file(void) { if (lfc_desc >= 0) { close(lfc_desc); lfc_desc = -1; } } /* * Local file cache is optional and Neon can work without it. * In case of any any errors with this cache, we should disable it but to not throw error. * Also we should allow re-enable it if source of failure (lack of disk space, permissions,...) is fixed. 
* All cache content should be invalidated to avoid reading of stale or corrupted data */ static void lfc_switch_off(void) { int fd; if (LFC_ENABLED()) { HASH_SEQ_STATUS status; FileCacheEntry *entry; /* Invalidate hash */ hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { hash_search_with_hash_value(lfc_hash, &entry->key, entry->hash, HASH_REMOVE, NULL); } lfc_ctl->generation += 1; lfc_ctl->size = 0; lfc_ctl->pinned = 0; lfc_ctl->used = 0; lfc_ctl->used_pages = 0; lfc_ctl->limit = 0; dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); /* * We need to use unlink to to avoid races in LFC write, because it is not * protected by lock */ unlink(lfc_path); fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); if (fd < 0) elog(WARNING, "LFC: failed to recreate local file cache %s: %m", lfc_path); else close(fd); /* Wakeup waiting backends */ for (int i = 0; i < N_COND_VARS; i++) ConditionVariableBroadcast(&lfc_ctl->cv[i]); } lfc_close_file(); } static void lfc_disable(char const *op) { elog(WARNING, "LFC: failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); lfc_switch_off(); LWLockRelease(lfc_lock); } /* * This check is done without obtaining lfc_lock, so it is unreliable */ static bool lfc_maybe_disabled(void) { return !lfc_ctl || !LFC_ENABLED(); } /* * Open LFC file if not opened yet or generation is changed. * Should be called under LFC lock. 
*/ static bool lfc_ensure_opened(void) { if (lfc_generation != lfc_ctl->generation) { lfc_close_file(); lfc_generation = lfc_ctl->generation; } /* Open cache file if not done yet */ if (lfc_desc < 0) { lfc_desc = BasicOpenFile(lfc_path, O_RDWR); if (lfc_desc < 0) { lfc_disable("open"); return false; } } return true; } void LfcShmemInit(void) { bool found; static HASHCTL info; if (lfc_max_size <= 0) return; lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", sizeof(FileCacheControl), &found); if (!found) { int fd; uint32 n_chunks = SIZE_MB_TO_CHUNKS(lfc_max_size); lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock"); info.keysize = sizeof(BufferTag); info.entrysize = FILE_CACHE_ENRTY_SIZE; /* * n_chunks+1 because we add new element to hash table before eviction * of victim */ lfc_hash = ShmemInitHash("lfc_hash", n_chunks + 1, n_chunks + 1, &info, HASH_ELEM | HASH_BLOBS); memset(lfc_ctl, 0, sizeof(FileCacheControl)); dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); /* Initialize hyper-log-log structure for estimating working set size */ initSHLL(&lfc_ctl->wss_estimation); /* Recreate file cache on restart */ fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); if (fd < 0) { elog(WARNING, "LFC: failed to create local file cache %s: %m", lfc_path); lfc_ctl->limit = 0; } else { close(fd); lfc_ctl->limit = SIZE_MB_TO_CHUNKS(lfc_size_limit); } /* Initialize turnstile of condition variables */ for (int i = 0; i < N_COND_VARS; i++) ConditionVariableInit(&lfc_ctl->cv[i]); } } void LfcShmemRequest(void) { if (lfc_max_size > 0) { RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, FILE_CACHE_ENRTY_SIZE)); RequestNamedLWLockTranche("lfc_lock", 1); } } static bool is_normal_backend(void) { /* * Stats collector detach shared memory, so we should not try to access * shared memory here. Parallel workers first assign default value (0), so * not perform truncation in parallel workers. 
The Postmaster can handle * SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), * but has no PGPROC. */ return lfc_ctl && MyProc && UsedShmemSegAddr && !IsParallelWorker(); } static bool lfc_check_chunk_size(int *newval, void **extra, GucSource source) { if (*newval & (*newval - 1)) { elog(ERROR, "LFC chunk size should be power of two"); return false; } return true; } static void lfc_change_chunk_size(int newval, void* extra) { lfc_chunk_size_log = pg_ceil_log2_32(newval); } static bool lfc_check_limit_hook(int *newval, void **extra, GucSource source) { if (*newval > lfc_max_size) { elog(ERROR, "LFC: neon.file_cache_size_limit can not be larger than neon.max_file_cache_size"); return false; } return true; } static void lfc_change_limit_hook(int newval, void *extra) { uint32 new_size = SIZE_MB_TO_CHUNKS(newval); if (!lfc_ctl || !is_normal_backend()) return; LWLockAcquire(lfc_lock, LW_EXCLUSIVE); /* Open LFC file only if LFC was enabled or we are going to reenable it */ if (newval == 0 && !LFC_ENABLED()) { LWLockRelease(lfc_lock); /* File should be reopened if LFC is reenabled */ lfc_close_file(); return; } if (!lfc_ensure_opened()) { LWLockRelease(lfc_lock); return; } if (lfc_ctl->limit != new_size) { lfc_ctl->resizes += 1; } while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru)) { /* * Shrink cache by throwing away least recently accessed chunks and * returning their space to file system */ FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); FileCacheEntry *hole; uint32 offset = victim->offset; uint32 hash; bool found; BufferTag holetag; CriticalAssert(victim->access_count == 0); #ifdef FALLOC_FL_PUNCH_HOLE if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * lfc_blocks_per_chunk * BLCKSZ, lfc_blocks_per_chunk * BLCKSZ) < 0) neon_log(LOG, "Failed to punch hole in file: %m"); #endif /* We remove the old entry, and re-enter a hole to the hash 
table */ for (int i = 0; i < lfc_blocks_per_chunk; i++) { bool is_page_cached = GET_STATE(victim, i) == AVAILABLE; lfc_ctl->used_pages -= is_page_cached; lfc_ctl->evicted_pages += is_page_cached; } hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); memset(&holetag, 0, sizeof(holetag)); holetag.blockNum = offset; hash = get_hash_value(lfc_hash, &holetag); hole = hash_search_with_hash_value(lfc_hash, &holetag, hash, HASH_ENTER, &found); hole->hash = hash; hole->offset = offset; hole->access_count = 0; CriticalAssert(!found); dlist_push_tail(&lfc_ctl->holes, &hole->list_node); lfc_ctl->used -= 1; } if (new_size == 0) lfc_switch_off(); else lfc_ctl->limit = new_size; neon_log(DEBUG1, "set local file cache limit to %d", new_size); LWLockRelease(lfc_lock); } void lfc_init(void) { /* * In order to create our shared memory area, we have to be loaded via * shared_preload_libraries. */ if (!process_shared_preload_libraries_in_progress) neon_log(ERROR, "Neon module should be loaded via shared_preload_libraries"); DefineCustomBoolVariable("neon.store_prefetch_result_in_lfc", "Immediately store received prefetch result in LFC", NULL, &lfc_store_prefetch_result, false, PGC_SUSET, 0, NULL, NULL, NULL); DefineCustomBoolVariable("neon.prewarm_update_ws_estimation", "Consider prewarmed pages for working set estimation", NULL, &lfc_prewarm_update_ws_estimation, true, PGC_SUSET, 0, NULL, NULL, NULL); DefineCustomIntVariable("neon.max_file_cache_size", "Maximal size of Neon local file cache", NULL, &lfc_max_size, 0, /* disabled by default */ 0, INT_MAX, PGC_POSTMASTER, GUC_UNIT_MB, NULL, NULL, NULL); DefineCustomIntVariable("neon.file_cache_size_limit", "Current limit for size of Neon local file cache", NULL, &lfc_size_limit, 0, /* disabled by default */ 0, INT_MAX, PGC_SIGHUP, GUC_UNIT_MB, lfc_check_limit_hook, lfc_change_limit_hook, NULL); DefineCustomStringVariable("neon.file_cache_path", "Path to local file cache (can be raw device)", NULL, 
&lfc_path, "file.cache", PGC_POSTMASTER, 0, NULL, NULL, NULL); DefineCustomIntVariable("neon.file_cache_chunk_size", "LFC chunk size in blocks (should be power of two)", NULL, &lfc_blocks_per_chunk, MAX_BLOCKS_PER_CHUNK, 1, MAX_BLOCKS_PER_CHUNK, PGC_POSTMASTER, GUC_UNIT_BLOCKS, lfc_check_chunk_size, lfc_change_chunk_size, NULL); DefineCustomIntVariable("neon.file_cache_prewarm_limit", "Maximal number of prewarmed chunks", NULL, &lfc_prewarm_limit, INT_MAX, /* no limit by default */ 0, INT_MAX, PGC_SIGHUP, 0, NULL, NULL, NULL); DefineCustomIntVariable("neon.file_cache_prewarm_batch", "Number of pages retrivied by prewarm from page server", NULL, &lfc_prewarm_batch, 64, 1, INT_MAX, PGC_SIGHUP, 0, NULL, NULL, NULL); } /* * Dump a list of pages that are currently in the LFC * * This is used to get a snapshot that can be used to prewarm the LFC later. */ FileCacheState* lfc_get_state(size_t max_entries) { FileCacheState* fcs = NULL; if (lfc_maybe_disabled() || max_entries == 0) /* fast exit if file cache is disabled */ return NULL; LWLockAcquire(lfc_lock, LW_SHARED); if (LFC_ENABLED()) { dlist_iter iter; size_t i = 0; uint8* bitmap; size_t n_pages = 0; size_t n_entries = Min(max_entries, lfc_ctl->used - lfc_ctl->pinned); size_t state_size = FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_entries); fcs = (FileCacheState*)palloc0(state_size); SET_VARSIZE(fcs, state_size); fcs->magic = FILE_CACHE_STATE_MAGIC; fcs->chunk_size_log = lfc_chunk_size_log; fcs->n_chunks = n_entries; bitmap = FILE_CACHE_STATE_BITMAP(fcs); dlist_reverse_foreach(iter, &lfc_ctl->lru) { FileCacheEntry *entry = dlist_container(FileCacheEntry, list_node, iter.cur); fcs->chunks[i] = entry->key; for (int j = 0; j < lfc_blocks_per_chunk; j++) { if (GET_STATE(entry, j) != UNAVAILABLE) { /* Validate the buffer tag before including it */ BufferTag test_tag = entry->key; test_tag.blockNum += j; if (BufferTagIsValid(&test_tag)) { BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j); n_pages += 1; } else { elog(ERROR, "LFC: 
Skipping invalid buffer tag during cache state capture: blockNum=%u", test_tag.blockNum); } } } if (++i == n_entries) break; } Assert(i == n_entries); fcs->n_pages = n_pages; Assert(pg_popcount((char*)bitmap, ((n_entries << lfc_chunk_size_log) + 7)/8) == n_pages); elog(LOG, "LFC: save state of %d chunks %d pages (validated)", (int)n_entries, (int)n_pages); } LWLockRelease(lfc_lock); return fcs; } /* * Prewarm LFC cache to the specified state. It uses lfc_prefetch function to load prewarmed page without hoilding shared buffer lock * and avoid race conditions with other backends. */ void lfc_prewarm(FileCacheState* fcs, uint32 n_workers) { size_t fcs_chunk_size_log; size_t n_entries; size_t prewarm_batch = Min(lfc_prewarm_batch, readahead_buffer_size); size_t fcs_size; uint32_t max_prefetch_pages; dsm_segment *seg; BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS]; if (!lfc_ensure_opened()) return; if (prewarm_batch == 0 || lfc_prewarm_limit == 0 || n_workers == 0) { elog(LOG, "LFC: prewarm is disabled"); return; } if (n_workers > MAX_PREWARM_WORKERS) { elog(ERROR, "LFC: Too much prewarm workers, maximum is %d", MAX_PREWARM_WORKERS); } if (fcs == NULL || fcs->n_chunks == 0) { elog(LOG, "LFC: nothing to prewarm"); return; } if (fcs->magic != FILE_CACHE_STATE_MAGIC) { elog(ERROR, "LFC: Invalid file cache state magic: %X", fcs->magic); } fcs_size = VARSIZE(fcs); if (FILE_CACHE_STATE_SIZE(fcs) != fcs_size) { elog(ERROR, "LFC: Invalid file cache state size: %u vs. 
%u", (unsigned)FILE_CACHE_STATE_SIZE(fcs), VARSIZE(fcs)); } fcs_chunk_size_log = fcs->chunk_size_log; if (fcs_chunk_size_log > MAX_BLOCKS_PER_CHUNK_LOG) { elog(ERROR, "LFC: Invalid chunk size log: %u", fcs->chunk_size_log); } n_entries = Min(fcs->n_chunks, lfc_prewarm_limit); Assert(n_entries != 0); max_prefetch_pages = n_entries << fcs_chunk_size_log; if (fcs->n_pages > max_prefetch_pages) { elog(ERROR, "LFC: Number of pages in file cache state (%d) is more than the limit (%d)", fcs->n_pages, max_prefetch_pages); } LWLockAcquire(lfc_lock, LW_EXCLUSIVE); /* Do not prewarm more entries than LFC limit */ if (lfc_ctl->limit <= lfc_ctl->size) { elog(LOG, "LFC: skip prewarm because LFC is already filled"); LWLockRelease(lfc_lock); return; } if (lfc_ctl->prewarm_active) { LWLockRelease(lfc_lock); elog(ERROR, "LFC: skip prewarm because another prewarm is still active"); } lfc_ctl->n_prewarm_entries = n_entries; lfc_ctl->n_prewarm_workers = n_workers; lfc_ctl->prewarm_active = true; lfc_ctl->prewarm_canceled = false; lfc_ctl->prewarm_batch = prewarm_batch; memset(lfc_ctl->prewarm_workers, 0, n_workers*sizeof(PrewarmWorkerState)); LWLockRelease(lfc_lock); /* Calculate total number of pages to be prewarmed */ lfc_ctl->total_prewarm_pages = fcs->n_pages; seg = dsm_create(fcs_size, 0); memcpy(dsm_segment_address(seg), fcs, fcs_size); lfc_ctl->prewarm_lfc_state_handle = dsm_segment_handle(seg); /* Spawn background workers */ for (uint32 i = 0; i < n_workers; i++) { BackgroundWorker worker = {0}; worker.bgw_flags = BGWORKER_SHMEM_ACCESS; worker.bgw_start_time = BgWorkerStart_ConsistentState; worker.bgw_restart_time = BGW_NEVER_RESTART; strcpy(worker.bgw_library_name, "neon"); strcpy(worker.bgw_function_name, "lfc_prewarm_main"); snprintf(worker.bgw_name, BGW_MAXLEN, "LFC prewarm worker %d", i+1); strcpy(worker.bgw_type, "LFC prewarm worker"); worker.bgw_main_arg = Int32GetDatum(i); /* must set notify PID to wait for shutdown */ worker.bgw_notify_pid = MyProcPid; if 
(!RegisterDynamicBackgroundWorker(&worker, &bgw_handle[i])) { ereport(LOG, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("LFC: registering dynamic bgworker prewarm failed"), errhint("Consider increasing the configuration parameter \"%s\".", "max_worker_processes"))); n_workers = i; lfc_ctl->prewarm_canceled = true; break; } } for (uint32 i = 0; i < n_workers; i++) { bool interrupted; do { interrupted = false; PG_TRY(); { BgwHandleStatus status = WaitForBackgroundWorkerShutdown(bgw_handle[i]); if (status != BGWH_STOPPED && status != BGWH_POSTMASTER_DIED) { elog(LOG, "LFC: Unexpected status of prewarm worker termination: %d", status); } } PG_CATCH(); { elog(LOG, "LFC: cancel prewarm"); lfc_ctl->prewarm_canceled = true; interrupted = true; } PG_END_TRY(); } while (interrupted); if (!lfc_ctl->prewarm_workers[i].completed) { /* Background worker doesn't set completion time: it means that it was abnormally terminated */ elog(LOG, "LFC: prewarm worker %d failed", i+1); /* Set completion time to prevent get_prewarm_info from considering this worker as active */ lfc_ctl->prewarm_workers[i].completed = GetCurrentTimestamp(); } } dsm_detach(seg); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); lfc_ctl->prewarm_active = false; LWLockRelease(lfc_lock); } void lfc_prewarm_main(Datum main_arg) { size_t snd_idx = 0, rcv_idx = 0; size_t n_sent = 0, n_received = 0; size_t fcs_chunk_size_log; size_t max_prefetch_pages; size_t prewarm_batch; size_t n_workers; dsm_segment *seg; FileCacheState* fcs; uint8* bitmap; BufferTag tag; PrewarmWorkerState* ws; uint32 worker_id = DatumGetInt32(main_arg); AmPrewarmWorker = true; pqsignal(SIGTERM, die); BackgroundWorkerUnblockSignals(); seg = dsm_attach(lfc_ctl->prewarm_lfc_state_handle); if (seg == NULL) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("could not map dynamic shared memory segment"))); fcs = (FileCacheState*) dsm_segment_address(seg); prewarm_batch = lfc_ctl->prewarm_batch; fcs_chunk_size_log = 
fcs->chunk_size_log; n_workers = lfc_ctl->n_prewarm_workers; max_prefetch_pages = lfc_ctl->n_prewarm_entries << fcs_chunk_size_log; ws = &lfc_ctl->prewarm_workers[worker_id]; bitmap = FILE_CACHE_STATE_BITMAP(fcs); /* enable prefetch in LFC */ lfc_store_prefetch_result = true; lfc_do_prewarm = true; /* Flag for lfc_prefetch preventing replacement of existed entries if LFC cache is full */ elog(LOG, "LFC: worker %d start prewarming", worker_id); while (!lfc_ctl->prewarm_canceled) { if (snd_idx < max_prefetch_pages) { if ((snd_idx >> fcs_chunk_size_log) % n_workers != worker_id) { /* If there are multiple workers, split chunks between them */ snd_idx += 1 << fcs_chunk_size_log; } else { if (BITMAP_ISSET(bitmap, snd_idx)) { tag = fcs->chunks[snd_idx >> fcs_chunk_size_log]; tag.blockNum += snd_idx & ((1 << fcs_chunk_size_log) - 1); if (!BufferTagIsValid(&tag)) { elog(ERROR, "LFC: Invalid buffer tag: %u", tag.blockNum); } if (!lfc_cache_contains(BufTagGetNRelFileInfo(tag), tag.forkNum, tag.blockNum)) { (void)communicator_prefetch_register_bufferv(tag, NULL, 1, NULL); n_sent += 1; } else { ws->skipped_pages += 1; BITMAP_CLR(bitmap, snd_idx); } } snd_idx += 1; } } if (n_sent >= n_received + prewarm_batch || snd_idx == max_prefetch_pages) { if (n_received == n_sent && snd_idx == max_prefetch_pages) { break; } if ((rcv_idx >> fcs_chunk_size_log) % n_workers != worker_id) { /* Skip chunks processed by other workers */ rcv_idx += 1 << fcs_chunk_size_log; continue; } /* Locate next block to prefetch */ while (!BITMAP_ISSET(bitmap, rcv_idx)) { rcv_idx += 1; } tag = fcs->chunks[rcv_idx >> fcs_chunk_size_log]; tag.blockNum += rcv_idx & ((1 << fcs_chunk_size_log) - 1); if (communicator_prefetch_receive(tag)) { ws->prewarmed_pages += 1; } else { ws->skipped_pages += 1; } rcv_idx += 1; n_received += 1; } } /* No need to perform prefetch cleanup here because prewarm worker will be terminated and * connection to PS dropped just after return from this function. 
*/ Assert(n_sent == n_received || lfc_ctl->prewarm_canceled); elog(LOG, "LFC: worker %d complete prewarming: loaded %ld pages", worker_id, (long)n_received); lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp(); } void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks) { BufferTag tag; FileCacheEntry *entry; uint32 hash; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return; CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); if (LFC_ENABLED()) { for (BlockNumber blkno = 0; blkno < nblocks; blkno += lfc_blocks_per_chunk) { tag.blockNum = blkno; hash = get_hash_value(lfc_hash, &tag); entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); if (entry != NULL) { for (int i = 0; i < lfc_blocks_per_chunk; i++) { if (GET_STATE(entry, i) == AVAILABLE) { lfc_ctl->used_pages -= 1; SET_STATE(entry, i, UNAVAILABLE); } } } } } LWLockRelease(lfc_lock); } /* * Check if page is present in the cache. * Returns true if page is found in local cache. */ bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) { BufferTag tag; FileCacheEntry *entry; int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno); bool found = false; uint32 hash; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return false; CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; tag.blockNum = blkno - chunk_offs; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_SHARED); if (LFC_ENABLED()) { entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); found = entry != NULL && GET_STATE(entry, chunk_offs) != UNAVAILABLE; } LWLockRelease(lfc_lock); return found; } /* * Check if page is present in the cache. * Returns true if page is found in local cache. 
*/ int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int nblocks, bits8 *bitmap) { BufferTag tag; FileCacheEntry *entry; uint32 chunk_offs; int found = 0; uint32 hash; int i = 0; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return 0; CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); chunk_offs = BLOCK_TO_CHUNK_OFF(blkno); tag.blockNum = blkno - chunk_offs; hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_SHARED); if (!LFC_ENABLED()) { LWLockRelease(lfc_lock); return 0; } while (true) { int this_chunk = Min(nblocks - i, lfc_blocks_per_chunk - chunk_offs); entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); if (entry != NULL) { for (; chunk_offs < lfc_blocks_per_chunk && i < nblocks; chunk_offs++, i++) { if (GET_STATE(entry, chunk_offs) != UNAVAILABLE) { BITMAP_SET(bitmap, i); found++; } } } else { i += this_chunk; } /* * Break out of the iteration before doing expensive stuff for * a next iteration */ if (i >= nblocks) break; /* * Prepare for the next iteration. We don't unlock here, as that'd * probably be more expensive than the gains it'd get us. */ chunk_offs = BLOCK_TO_CHUNK_OFF(blkno + i); tag.blockNum = (blkno + i) - chunk_offs; hash = get_hash_value(lfc_hash, &tag); } LWLockRelease(lfc_lock); #ifdef USE_ASSERT_CHECKING { int count = 0; for (int j = 0; j < nblocks; j++) { if (BITMAP_ISSET(bitmap, j)) count++; } Assert(count == found); } #endif return found; } #if PG_MAJORVERSION_NUM >= 16 static PGIOAlignedBlock voidblock = {0}; #else static PGAlignedBlock voidblock = {0}; #endif #define SCRIBBLEPAGE (&voidblock.data) /* * Try to read pages from local cache. * Returns the number of pages read from the local cache, and sets bits in * 'mask' for the pages which were read. This may scribble over buffers not * marked in 'mask', so be careful with operation ordering. 
* * In case of error local file cache is disabled (lfc->limit is set to zero), * and -1 is returned. * * If the mask argument is supplied, we'll only try to read those pages which * don't have their bits set on entry. At exit, pages which were successfully * read from LFC will have their bits set. */ int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, void **buffers, BlockNumber nblocks, bits8 *mask) { BufferTag tag; FileCacheEntry *entry; ssize_t rc; uint32 hash; uint64 generation; uint32 entry_offset; int blocks_read = 0; int buf_offset = 0; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return -1; CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); /* Update working set size estimate for the blocks */ for (int i = 0; i < nblocks; i++) { tag.blockNum = blkno + i; addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); } /* * For every chunk that has blocks we're interested in, we * 1. get the chunk header * 2. Check if the chunk actually has the blocks we're interested in * 3. Read the blocks we're looking for (in one preadv), assuming they exist * 4. Update the statistics for the read call. * * If there is an error, we do an early return. 
*/ while (nblocks > 0) { struct iovec iov[PG_IOV_MAX]; uint8 chunk_mask[MAX_BLOCKS_PER_CHUNK / 8] = {0}; int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno); int blocks_in_chunk = Min(nblocks, lfc_blocks_per_chunk - chunk_offs); int iteration_hits = 0; int iteration_misses = 0; uint64 io_time_us = 0; int n_blocks_to_read = 0; int iov_last_used = 0; int first_block_in_chunk_read = -1; ConditionVariable* cv; Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) { iov[i].iov_len = BLCKSZ; /* mask not set = we must do work */ if (!BITMAP_ISSET(mask, buf_offset + i)) { iov[i].iov_base = buffers[buf_offset + i]; n_blocks_to_read++; iov_last_used = i + 1; if (first_block_in_chunk_read == -1) { first_block_in_chunk_read = i; } } /* mask set = we must do no work */ else { /* don't scribble on pages we weren't requested to write to */ iov[i].iov_base = SCRIBBLEPAGE; } } /* shortcut IO */ if (n_blocks_to_read == 0) { buf_offset += blocks_in_chunk; nblocks -= blocks_in_chunk; blkno += blocks_in_chunk; continue; } /* * The effective iov size must be >= the number of blocks we're about * to read. */ Assert(iov_last_used - first_block_in_chunk_read >= n_blocks_to_read); tag.blockNum = blkno - chunk_offs; hash = get_hash_value(lfc_hash, &tag); cv = &lfc_ctl->cv[hash % N_COND_VARS]; LWLockAcquire(lfc_lock, LW_EXCLUSIVE); /* We can return the blocks we've read before LFC got disabled; * assuming we read any. 
*/ if (!LFC_ENABLED() || !lfc_ensure_opened()) { LWLockRelease(lfc_lock); return blocks_read; } entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); if (entry == NULL) { /* Pages are not cached */ lfc_ctl->misses += blocks_in_chunk; pgBufferUsage.file_cache.misses += blocks_in_chunk; LWLockRelease(lfc_lock); buf_offset += blocks_in_chunk; nblocks -= blocks_in_chunk; blkno += blocks_in_chunk; continue; } /* Unlink entry from LRU list to pin it for the duration of IO operation */ if (entry->access_count++ == 0) { lfc_ctl->pinned += 1; dlist_delete(&entry->list_node); } generation = lfc_ctl->generation; entry_offset = entry->offset; for (int i = first_block_in_chunk_read; i < iov_last_used; i++) { FileCacheBlockState state = UNAVAILABLE; bool sleeping = false; /* no need to work on something we're not interested in */ if (BITMAP_ISSET(mask, buf_offset + i)) continue; while (lfc_ctl->generation == generation) { state = GET_STATE(entry, chunk_offs + i); if (state == PENDING) { SET_STATE(entry, chunk_offs + i, REQUESTED); } else if (state != REQUESTED) { break; } if (!sleeping) { ConditionVariablePrepareToSleep(cv); sleeping = true; } LWLockRelease(lfc_lock); ConditionVariableTimedSleep(cv, CV_WAIT_TIMEOUT, WAIT_EVENT_NEON_LFC_CV_WAIT); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); } if (sleeping) { ConditionVariableCancelSleep(); } if (state == AVAILABLE) { BITMAP_SET(chunk_mask, i); iteration_hits++; } else iteration_misses++; } LWLockRelease(lfc_lock); Assert(iteration_hits + iteration_misses > 0); if (iteration_hits != 0) { /* chunk offset (# of pages) into the LFC file */ off_t first_read_offset = (off_t) entry_offset * lfc_blocks_per_chunk; int nwrite = iov_last_used - first_block_in_chunk_read; /* offset of first IOV */ first_read_offset += chunk_offs + first_block_in_chunk_read; pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ); /* Read only the blocks we're interested in, limiting */ rc = preadv(lfc_desc, &iov[first_block_in_chunk_read], nwrite, 
first_read_offset * BLCKSZ); pgstat_report_wait_end(); if (rc != (BLCKSZ * nwrite)) { lfc_disable("read"); return -1; } } /* Place entry to the head of LRU list */ LWLockAcquire(lfc_lock, LW_EXCLUSIVE); if (lfc_ctl->generation == generation) { CriticalAssert(LFC_ENABLED()); lfc_ctl->hits += iteration_hits; lfc_ctl->misses += iteration_misses; pgBufferUsage.file_cache.hits += iteration_hits; pgBufferUsage.file_cache.misses += iteration_misses; if (iteration_hits) { lfc_ctl->time_read += io_time_us; inc_page_cache_read_wait(io_time_us); /* * We successfully read the pages we know were valid when we * started reading; now mark those pages as read */ for (int i = first_block_in_chunk_read; i < iov_last_used; i++) { if (BITMAP_ISSET(chunk_mask, i)) BITMAP_SET(mask, buf_offset + i); } } CriticalAssert(entry->access_count > 0); if (--entry->access_count == 0) { lfc_ctl->pinned -= 1; dlist_push_tail(&lfc_ctl->lru, &entry->list_node); } } else { /* generation mismatch, assume error condition */ lfc_close_file(); LWLockRelease(lfc_lock); return -1; } LWLockRelease(lfc_lock); buf_offset += blocks_in_chunk; nblocks -= blocks_in_chunk; blkno += blocks_in_chunk; blocks_read += iteration_hits; } return blocks_read; } /* * Initialize new LFC hash entry, perform eviction if needed. * Returns false if there are no unpinned entries and chunk can not be added. */ static bool lfc_init_new_entry(FileCacheEntry* entry, uint32 hash) { /*----------- * If the chunk wasn't already in the LFC then we have these * options, in order of preference: * * Unless there is no space available, we can: * 1. Use an entry from the `holes` list, and * 2. Create a new entry. * We can always, regardless of space in the LFC: * 3. evict an entry from LRU, and * 4. 
ignore the write operation (the least favorite option) */ if (lfc_ctl->used < lfc_ctl->limit) { if (!dlist_is_empty(&lfc_ctl->holes)) { /* We can reuse a hole that was left behind when the LFC was shrunk previously */ FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); uint32 offset = hole->offset; bool hole_found; hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &hole_found); CriticalAssert(hole_found); lfc_ctl->used += 1; entry->offset = offset; /* reuse the hole */ } else { lfc_ctl->used += 1; entry->offset = lfc_ctl->size++;/* allocate new chunk at end * of file */ } } /* * We've already used up all allocated LFC entries. * * If we can clear an entry from the LRU, do that. * If we can't (e.g. because all other slots are being accessed) * then we will remove this entry from the hash and continue * on to the next chunk, as we may not exceed the limit. * * While prewarming LFC we do not want to replace existed entries, * so we just stop prewarm is LFC cache is full. 
*/ else if (!dlist_is_empty(&lfc_ctl->lru) && !lfc_do_prewarm) { /* Cache overflow: evict least recently used chunk */ FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); for (int i = 0; i < lfc_blocks_per_chunk; i++) { bool is_page_cached = GET_STATE(victim, i) == AVAILABLE; lfc_ctl->used_pages -= is_page_cached; lfc_ctl->evicted_pages += is_page_cached; } CriticalAssert(victim->access_count == 0); entry->offset = victim->offset; /* grab victim's chunk */ hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); neon_log(DEBUG2, "Swap file cache page"); } else { /* Can't add this chunk - we don't have the space for it */ hash_search_with_hash_value(lfc_hash, &entry->key, hash, HASH_REMOVE, NULL); lfc_ctl->prewarm_canceled = true; /* cancel prewarm if LFC limit is reached */ return false; } entry->access_count = 1; entry->hash = hash; lfc_ctl->pinned += 1; for (int i = 0; i < lfc_blocks_per_chunk; i++) SET_STATE(entry, i, UNAVAILABLE); return true; } /* * Store received prefetch result in LFC cache. * Unlike lfc_read/lfc_write this call is is not protected by shared buffer lock. * So we should be ready that other backends will try to concurrently read or write this block. * We do not store prefetched block if it already exists in LFC or it's not_modified_since LSN is smaller * than current last written LSN (LwLSN). * * We can enforce correctness of storing page in LFC by the following steps: * 1. Check under LFC lock that page in not present in LFC. * 2. Check under LFC lock that LwLSN is not changed since prefetch request time (not_modified_since). * 3. Change page state to "Pending" under LFC lock to prevent all other backends to read or write this * pages until this write is completed. * 4. Assume that some other backend creates new image of the page without reading it * (because reads will be blocked because of 2). This version of the page is stored in shared buffer. 
*    Any attempt to throw away this page from shared buffer will be blocked, because Postgres first
 *    needs to save dirty page and write will be blocked because of 2.
 *    So any backend trying to access this page, will take it from shared buffer without accessing
 *    SMGR and LFC.
 * 5. After write completion we once again obtain LFC lock and wake-up all waiting backends.
 *    If there is some backend waiting to write new image of the page (4) then now it will be able to
 *    do it, overwriting old (prefetched) page image. As far as this write will be completed before
 *    shared buffer can be reassigned, no other backend can see old page image.
 *
 * Parameters:
 *	rinfo, forknum, blkno	identify the prefetched block
 *	buffer					BLCKSZ bytes of page contents
 *	lsn						not_modified_since LSN of the prefetch request
 *
 * Returns false when the page was not stored because the LFC is disabled,
 * LwLSN has advanced past lsn, the block is already tracked in the LFC, or
 * there is no room for a new chunk. Returns true once the write has been
 * attempted.
 *
 * NOTE(review): true is also returned when pwrite() fails (the LFC is then
 * disabled) or when the generation changed during the write. Confirm that
 * callers do not treat true as "page is now in the LFC".
 */
bool
lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
			 const void* buffer, XLogRecPtr lsn)
{
	BufferTag	tag;
	FileCacheEntry *entry;
	ssize_t		rc;
	bool		found;
	uint32		hash;
	uint64		generation;
	uint32		entry_offset;
	instr_time	io_start, io_end;
	ConditionVariable* cv;
	FileCacheBlockState state;
	XLogRecPtr	lwlsn;

	int			chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);

	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
		return false;

	CopyNRelFileInfoToBufTag(tag, rinfo);
	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
	tag.forkNum = forknum;

	/* Update working set size estimate for the blocks */
	if (lfc_prewarm_update_ws_estimation)
	{
		tag.blockNum = blkno;
		addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
	}

	/* The hash key is the tag of the first block of the chunk */
	tag.blockNum = blkno - chunk_offs;
	hash = get_hash_value(lfc_hash, &tag);
	cv = &lfc_ctl->cv[hash % N_COND_VARS];

	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);

	if (!LFC_ENABLED() || !lfc_ensure_opened())
	{
		LWLockRelease(lfc_lock);
		return false;
	}

	lwlsn = neon_get_lwlsn(rinfo, forknum, blkno);

	if (lwlsn > lsn)
	{
		elog(DEBUG1, "Skip LFC write for %u because LwLSN=%X/%X is greater than not_nodified_since LSN %X/%X",
			 blkno, LSN_FORMAT_ARGS(lwlsn), LSN_FORMAT_ARGS(lsn));
		LWLockRelease(lfc_lock);
		return false;
	}

	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);

	if (found)
	{
		state = GET_STATE(entry, chunk_offs);
		if (state != UNAVAILABLE)
		{
			/* Do not rewrite an existing LFC entry */
			LWLockRelease(lfc_lock);
			return false;
		}
		/*
		 * Unlink entry from LRU list to pin it for the duration of IO
		 * operation
		 */
		if (entry->access_count++ == 0)
		{
			lfc_ctl->pinned += 1;
			dlist_delete(&entry->list_node);
		}
	}
	else
	{
		if (!lfc_init_new_entry(entry, hash))
		{
			/*
			 * We can't store this page due to lack of space in LFC,
			 * so give up on it
			 */
			LWLockRelease(lfc_lock);
			return false;
		}
	}

	generation = lfc_ctl->generation;
	entry_offset = entry->offset;

	/* Block concurrent readers/writers of this page until the write is done */
	SET_STATE(entry, chunk_offs, PENDING);

	LWLockRelease(lfc_lock);

	pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
	INSTR_TIME_SET_CURRENT(io_start);
	rc = pwrite(lfc_desc, buffer, BLCKSZ,
				((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
	INSTR_TIME_SET_CURRENT(io_end);
	pgstat_report_wait_end();

	if (rc != BLCKSZ)
	{
		lfc_disable("write");
	}
	else
	{
		LWLockAcquire(lfc_lock, LW_EXCLUSIVE);

		if (lfc_ctl->generation == generation)
		{
			uint64		time_spent_us;

			CriticalAssert(LFC_ENABLED());
			CriticalAssert(entry->access_count > 0);

			lfc_ctl->writes += 1;

			/*
			 * INSTR_TIME_SUBTRACT(x, y) computes x -= y, so the elapsed time
			 * is end minus start. The reverse order yields a negative
			 * duration, which turns into a huge value once stored in uint64.
			 */
			INSTR_TIME_SUBTRACT(io_end, io_start);
			time_spent_us = INSTR_TIME_GET_MICROSEC(io_end);
			lfc_ctl->time_write += time_spent_us;
			inc_page_cache_write_wait(time_spent_us);

			/* Unpin; unpinned entries go to the tail of the LRU list */
			if (--entry->access_count == 0)
			{
				lfc_ctl->pinned -= 1;
				dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
			}

			state = GET_STATE(entry, chunk_offs);
			if (state == REQUESTED)
			{
				/* Somebody went to sleep waiting for this page: wake them */
				ConditionVariableBroadcast(cv);
			}
			if (state != AVAILABLE)
			{
				lfc_ctl->used_pages += 1;
				SET_STATE(entry, chunk_offs, AVAILABLE);
			}
		}
		else
		{
			/* generation changed while we were writing */
			lfc_close_file();
		}
		LWLockRelease(lfc_lock);
	}
	return true;
}

/*
 * Put page in local file cache.
 * If cache is full then evict some other page.
*/ void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *const *buffers, BlockNumber nblocks) { BufferTag tag; FileCacheEntry *entry; ssize_t rc; bool found; uint32 hash; uint64 generation; uint32 entry_offset; int buf_offset = 0; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return; CopyNRelFileInfoToBufTag(tag, rinfo); CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); tag.forkNum = forkNum; /* Update working set size estimate for the blocks */ for (int i = 0; i < nblocks; i++) { tag.blockNum = blkno + i; addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); } LWLockAcquire(lfc_lock, LW_EXCLUSIVE); if (!LFC_ENABLED() || !lfc_ensure_opened()) { LWLockRelease(lfc_lock); return; } generation = lfc_ctl->generation; /* * For every chunk that has blocks we're interested in, we * 1. get the chunk header * 2. Check if the chunk actually has the blocks we're interested in * 3. Read the blocks we're looking for (in one preadv), assuming they exist * 4. Update the statistics for the read call. * * If there is an error, we do an early return. 
*/ while (nblocks > 0) { struct iovec iov[PG_IOV_MAX]; int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno); int blocks_in_chunk = Min(nblocks, lfc_blocks_per_chunk - chunk_offs); instr_time io_start, io_end; ConditionVariable* cv; Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) { iov[i].iov_base = unconstify(void *, buffers[buf_offset + i]); iov[i].iov_len = BLCKSZ; } tag.blockNum = blkno - chunk_offs; hash = get_hash_value(lfc_hash, &tag); cv = &lfc_ctl->cv[hash % N_COND_VARS]; entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); if (found) { /* * Unlink entry from LRU list to pin it for the duration of IO * operation */ if (entry->access_count++ == 0) { lfc_ctl->pinned += 1; dlist_delete(&entry->list_node); } } else { if (!lfc_init_new_entry(entry, hash)) { /* * We can't process this chunk due to lack of space in LFC, * so skip to the next one */ blkno += blocks_in_chunk; buf_offset += blocks_in_chunk; nblocks -= blocks_in_chunk; continue; } } entry_offset = entry->offset; for (int i = 0; i < blocks_in_chunk; i++) { FileCacheBlockState state = UNAVAILABLE; bool sleeping = false; while (lfc_ctl->generation == generation) { state = GET_STATE(entry, chunk_offs + i); if (state == PENDING) { SET_STATE(entry, chunk_offs + i, REQUESTED); } else if (state == UNAVAILABLE) { SET_STATE(entry, chunk_offs + i, PENDING); break; } else if (state == AVAILABLE) { break; } if (!sleeping) { ConditionVariablePrepareToSleep(cv); sleeping = true; } LWLockRelease(lfc_lock); ConditionVariableTimedSleep(cv, CV_WAIT_TIMEOUT, WAIT_EVENT_NEON_LFC_CV_WAIT); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); } if (sleeping) { ConditionVariableCancelSleep(); } } LWLockRelease(lfc_lock); pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); INSTR_TIME_SET_CURRENT(io_start); rc = pwritev(lfc_desc, iov, blocks_in_chunk, ((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ); INSTR_TIME_SET_CURRENT(io_end); pgstat_report_wait_end(); if (rc != BLCKSZ * 
blocks_in_chunk) { lfc_disable("write"); return; } else { LWLockAcquire(lfc_lock, LW_EXCLUSIVE); if (lfc_ctl->generation == generation) { uint64 time_spent_us; CriticalAssert(LFC_ENABLED()); /* Place entry to the head of LRU list */ CriticalAssert(entry->access_count > 0); lfc_ctl->writes += blocks_in_chunk; INSTR_TIME_SUBTRACT(io_start, io_end); time_spent_us = INSTR_TIME_GET_MICROSEC(io_start); lfc_ctl->time_write += time_spent_us; inc_page_cache_write_wait(time_spent_us); if (--entry->access_count == 0) { lfc_ctl->pinned -= 1; dlist_push_tail(&lfc_ctl->lru, &entry->list_node); } for (int i = 0; i < blocks_in_chunk; i++) { FileCacheBlockState state = GET_STATE(entry, chunk_offs + i); if (state == REQUESTED) { ConditionVariableBroadcast(cv); } if (state != AVAILABLE) { lfc_ctl->used_pages += 1; SET_STATE(entry, chunk_offs + i, AVAILABLE); } } } else { /* stop iteration if LFC was disabled */ lfc_close_file(); break; } } blkno += blocks_in_chunk; buf_offset += blocks_in_chunk; nblocks -= blocks_in_chunk; } LWLockRelease(lfc_lock); } /* * Return metrics about the LFC. * * The return format is a palloc'd array of LfcStatsEntrys. The size * of the returned array is returned in *num_entries. */ LfcStatsEntry * lfc_get_stats(size_t *num_entries) { LfcStatsEntry *entries; size_t n = 0; #define MAX_ENTRIES 10 entries = palloc(sizeof(LfcStatsEntry) * MAX_ENTRIES); entries[n++] = (LfcStatsEntry) {"file_cache_chunk_size_pages", lfc_ctl == NULL, lfc_ctl ? lfc_blocks_per_chunk : 0 }; entries[n++] = (LfcStatsEntry) {"file_cache_misses", lfc_ctl == NULL, lfc_ctl ? lfc_ctl->misses : 0}; entries[n++] = (LfcStatsEntry) {"file_cache_hits", lfc_ctl == NULL, lfc_ctl ? lfc_ctl->hits : 0 }; entries[n++] = (LfcStatsEntry) {"file_cache_used", lfc_ctl == NULL, lfc_ctl ? lfc_ctl->used : 0 }; entries[n++] = (LfcStatsEntry) {"file_cache_writes", lfc_ctl == NULL, lfc_ctl ? lfc_ctl->writes : 0 }; entries[n++] = (LfcStatsEntry) {"file_cache_size", lfc_ctl == NULL, lfc_ctl ? 
lfc_ctl->size : 0 }; entries[n++] = (LfcStatsEntry) {"file_cache_used_pages", lfc_ctl == NULL, lfc_ctl ? lfc_ctl->used_pages : 0 }; entries[n++] = (LfcStatsEntry) {"file_cache_evicted_pages", lfc_ctl == NULL, lfc_ctl ? lfc_ctl->evicted_pages : 0 }; entries[n++] = (LfcStatsEntry) {"file_cache_limit", lfc_ctl == NULL, lfc_ctl ? lfc_ctl->limit : 0 }; entries[n++] = (LfcStatsEntry) {"file_cache_chunks_pinned", lfc_ctl == NULL, lfc_ctl ? lfc_ctl->pinned : 0 }; Assert(n <= MAX_ENTRIES); #undef MAX_ENTRIES *num_entries = n; return entries; } /* * Function returning data from the local file cache * relation node/tablespace/database/blocknum and access_counter */ LocalCachePagesRec * lfc_local_cache_pages(size_t *num_entries) { HASH_SEQ_STATUS status; FileCacheEntry *entry; size_t n_pages; size_t n; LocalCachePagesRec *result; if (!lfc_ctl) { *num_entries = 0; return NULL; } LWLockAcquire(lfc_lock, LW_SHARED); if (!LFC_ENABLED()) { LWLockRelease(lfc_lock); *num_entries = 0; return NULL; } /* Count the pages first */ n_pages = 0; hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { /* Skip hole tags */ if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0) { for (int i = 0; i < lfc_blocks_per_chunk; i++) n_pages += GET_STATE(entry, i) == AVAILABLE; } } if (n_pages == 0) { LWLockRelease(lfc_lock); *num_entries = 0; return NULL; } result = (LocalCachePagesRec *) MemoryContextAllocHuge(CurrentMemoryContext, sizeof(LocalCachePagesRec) * n_pages); /* * Scan through all the cache entries, saving the relevant fields * in the result structure. 
*/ n = 0; hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { for (int i = 0; i < lfc_blocks_per_chunk; i++) { if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0) { if (GET_STATE(entry, i) == AVAILABLE) { result[n].pageoffs = entry->offset * lfc_blocks_per_chunk + i; result[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)); result[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key)); result[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key)); result[n].forknum = entry->key.forkNum; result[n].blocknum = entry->key.blockNum + i; result[n].accesscount = entry->access_count; n += 1; } } } } Assert(n_pages == n); LWLockRelease(lfc_lock); *num_entries = n_pages; return result; } /* * Internal implementation of the approximate_working_set_size_seconds() * function. */ int32 lfc_approximate_working_set_size_seconds(time_t duration, bool reset) { int32 dc; if (lfc_size_limit == 0) return -1; dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration); if (reset) memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs); return dc; } /* * Get metrics, for the built-in metrics exporter that's part of the communicator * process. * * NB: This is called from a Rust tokio task inside the communicator process. * Acquiring lwlocks, elog(), allocating memory or anything else non-trivial * is strictly prohibited here! */ struct LfcMetrics callback_get_lfc_metrics_unsafe(void) { struct LfcMetrics result = { .lfc_cache_size_limit = (int64) lfc_size_limit * 1024 * 1024, .lfc_hits = lfc_ctl ? lfc_ctl->hits : 0, .lfc_misses = lfc_ctl ? lfc_ctl->misses : 0, .lfc_used = lfc_ctl ? lfc_ctl->used : 0, .lfc_writes = lfc_ctl ? 
lfc_ctl->writes : 0, }; if (lfc_ctl) { for (int minutes = 1; minutes <= 60; minutes++) { result.lfc_approximate_working_set_size_windows[minutes - 1] = lfc_approximate_working_set_size_seconds(minutes * 60, false); } } return result; } PG_FUNCTION_INFO_V1(get_local_cache_state); Datum get_local_cache_state(PG_FUNCTION_ARGS) { size_t max_entries = PG_ARGISNULL(0) ? lfc_prewarm_limit : PG_GETARG_INT32(0); FileCacheState* fcs = lfc_get_state(max_entries); if (fcs != NULL) PG_RETURN_BYTEA_P((bytea*)fcs); else PG_RETURN_NULL(); } PG_FUNCTION_INFO_V1(prewarm_local_cache); Datum prewarm_local_cache(PG_FUNCTION_ARGS) { bytea* state = PG_GETARG_BYTEA_PP(0); uint32 n_workers = PG_GETARG_INT32(1); FileCacheState* fcs = (FileCacheState*)state; lfc_prewarm(fcs, n_workers); PG_RETURN_NULL(); } PG_FUNCTION_INFO_V1(get_prewarm_info); Datum get_prewarm_info(PG_FUNCTION_ARGS) { Datum values[4]; bool nulls[4]; TupleDesc tupdesc; uint32 prewarmed_pages = 0; uint32 skipped_pages = 0; uint32 active_workers = 0; uint32 total_pages; size_t n_workers; if (lfc_size_limit == 0) PG_RETURN_NULL(); LWLockAcquire(lfc_lock, LW_SHARED); if (!lfc_ctl || lfc_ctl->n_prewarm_workers == 0) { LWLockRelease(lfc_lock); PG_RETURN_NULL(); } n_workers = lfc_ctl->n_prewarm_workers; total_pages = lfc_ctl->total_prewarm_pages; for (size_t i = 0; i < n_workers; i++) { PrewarmWorkerState* ws = &lfc_ctl->prewarm_workers[i]; prewarmed_pages += ws->prewarmed_pages; skipped_pages += ws->skipped_pages; active_workers += ws->completed != 0; } LWLockRelease(lfc_lock); tupdesc = CreateTemplateTupleDesc(4); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "total_pages", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "prewarmed_pages", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "skipped_pages", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 4, "active_workers", INT4OID, -1, 0); tupdesc = BlessTupleDesc(tupdesc); MemSet(nulls, 0, sizeof(nulls)); values[0] = Int32GetDatum(total_pages); 
values[1] = Int32GetDatum(prewarmed_pages); values[2] = Int32GetDatum(skipped_pages); values[3] = Int32GetDatum(active_workers); PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); } ================================================ FILE: pgxn/neon/file_cache.h ================================================ /*------------------------------------------------------------------------- * * file_cache.h * Local File Cache definitions * * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * *------------------------------------------------------------------------- */ #ifndef FILE_CACHE_h #define FILE_CACHE_h #include "neon_pgversioncompat.h" typedef struct FileCacheState { int32 vl_len_; /* varlena header (do not touch directly!) */ uint32 magic; uint32 n_chunks; uint32 n_pages; uint16 chunk_size_log; BufferTag chunks[FLEXIBLE_ARRAY_MEMBER]; /* followed by bitmap */ } FileCacheState; /* GUCs */ extern bool lfc_store_prefetch_result; /* functions for local file cache */ extern void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks); extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *const *buffers, BlockNumber nblocks); /* returns number of blocks read, with one bit set in *read for each */ extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, void **buffers, BlockNumber nblocks, bits8 *mask); extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int nblocks, bits8 *bitmap); extern void lfc_init(void); extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, const void* buffer, XLogRecPtr lsn); extern FileCacheState* lfc_get_state(size_t max_entries); extern void lfc_prewarm(FileCacheState* fcs, uint32 
n_workers); typedef struct LfcStatsEntry { const char *metric_name; bool isnull; uint64 value; } LfcStatsEntry; extern LfcStatsEntry *lfc_get_stats(size_t *num_entries); typedef struct { uint32 pageoffs; Oid relfilenode; Oid reltablespace; Oid reldatabase; ForkNumber forknum; BlockNumber blocknum; uint16 accesscount; } LocalCachePagesRec; extern LocalCachePagesRec *lfc_local_cache_pages(size_t *num_entries); extern int32 lfc_approximate_working_set_size_seconds(time_t duration, bool reset); static inline bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, void *buffer) { bits8 rv = 0; return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1; } static inline void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *buffer) { return lfc_writev(rinfo, forkNum, blkno, &buffer, 1); } #endif /* FILE_CACHE_H */ ================================================ FILE: pgxn/neon/hll.c ================================================ /*------------------------------------------------------------------------- * * hll.c * Sliding HyperLogLog cardinality estimator * * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group * * Implements https://hal.science/hal-00465313/document * * Based on Hideaki Ohno's C++ implementation. This is probably not ideally * suited to estimating the cardinality of very large sets; in particular, we * have not attempted to further optimize the implementation as described in * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic * Engineering of a State of The Art Cardinality Estimation Algorithm". * * A sparse representation of HyperLogLog state is used, with fixed space * overhead. * * The copyright terms of Ohno's original version (the MIT license) follow. 
* * IDENTIFICATION * src/backend/lib/hyperloglog.c * *------------------------------------------------------------------------- */ /* * Copyright (c) 2013 Hideaki Ohno * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the 'Software'), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include #include "postgres.h" #include "funcapi.h" #include "port/pg_bitutils.h" #include "utils/timestamp.h" #include "hll.h" #define POW_2_32 (4294967296.0) #define NEG_POW_2_32 (-4294967296.0) #define ALPHA_MM ((0.7213 / (1.0 + 1.079 / HLL_N_REGISTERS)) * HLL_N_REGISTERS * HLL_N_REGISTERS) /* * Worker for addHyperLogLog(). * * Calculates the position of the first set bit in first b bits of x argument * starting from the first, reading from most significant to least significant * bits. * * Example (when considering fist 10 bits of x): * * rho(x = 0b1000000000) returns 1 * rho(x = 0b0010000000) returns 3 * rho(x = 0b0000000000) returns b + 1 * * "The binary address determined by the first b bits of x" * * Return value "j" used to index bit pattern to watch. 
*/ static inline uint8 rho(uint32 x, uint8 b) { uint8 j = 1; if (x == 0) return b + 1; j = 32 - pg_leftmost_one_pos32(x); if (j > b) return b + 1; return j; } /* * Initialize HyperLogLog track state */ void initSHLL(HyperLogLogState *cState) { memset(cState->regs, 0, sizeof(cState->regs)); } /* * Adds element to the estimator, from caller-supplied hash. * * It is critical that the hash value passed be an actual hash value, typically * generated using hash_any(). The algorithm relies on a specific bit-pattern * observable in conjunction with stochastic averaging. There must be a * uniform distribution of bits in hash values for each distinct original value * observed. */ void addSHLL(HyperLogLogState *cState, uint32 hash) { uint8 count; uint32 index; TimestampTz now = GetCurrentTimestamp(); /* Use the first "k" (registerWidth) bits as a zero based index */ index = hash >> HLL_C_BITS; /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS) - 1; Assert(count <= HLL_C_BITS); cState->regs[index][count] = now; } static uint8 getMaximum(const TimestampTz* reg, TimestampTz since) { uint8 max = 0; for (size_t i = 0; i < HLL_C_BITS + 1; i++) { if (reg[i] >= since) { max = i + 1; } } return max; } /* * Estimates cardinality, based on elements added so far */ double estimateSHLL(HyperLogLogState *cState, time_t duration) { double result; double sum = 0.0; size_t i; uint8 R[HLL_N_REGISTERS]; /* 0 indicates uninitialized timestamp, so if we need to cover the whole range than starts with 1 */ TimestampTz since = duration == (time_t)-1 ? 
1 : GetCurrentTimestamp() - duration * USECS_PER_SEC; for (i = 0; i < HLL_N_REGISTERS; i++) { R[i] = getMaximum(cState->regs[i], since); sum += 1.0 / pow(2.0, R[i]); } /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */ result = ALPHA_MM / sum; if (result <= (5.0 / 2.0) * HLL_N_REGISTERS) { /* Small range correction */ int zero_count = 0; for (i = 0; i < HLL_N_REGISTERS; i++) { zero_count += R[i] == 0; } if (zero_count != 0) result = HLL_N_REGISTERS * log((double) HLL_N_REGISTERS / zero_count); } else if (result > (1.0 / 30.0) * POW_2_32) { /* Large range correction */ result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32)); } return result; } ================================================ FILE: pgxn/neon/hll.h ================================================ /*------------------------------------------------------------------------- * * hll.h * Sliding HyperLogLog cardinality estimator * * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group * * Implements https://hal.science/hal-00465313/document * * Based on Hideaki Ohno's C++ implementation. This is probably not ideally * suited to estimating the cardinality of very large sets; in particular, we * have not attempted to further optimize the implementation as described in * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic * Engineering of a State of The Art Cardinality Estimation Algorithm". * * A sparse representation of HyperLogLog state is used, with fixed space * overhead. * * The copyright terms of Ohno's original version (the MIT license) follow. 
*
 * IDENTIFICATION
 *	  src/backend/lib/hyperloglog.c
 *
 *-------------------------------------------------------------------------
 */

/*
 * Copyright (c) 2013 Hideaki Ohno
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the 'Software'), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef HLL_H
#define HLL_H

/* Number of leading hash bits used to select a register */
#define HLL_BIT_WIDTH   10
/* Number of remaining hash bits, whose rank is recorded in the register */
#define HLL_C_BITS      (32 - HLL_BIT_WIDTH)
/* Number of registers: 2^HLL_BIT_WIDTH */
#define HLL_N_REGISTERS (1 << HLL_BIT_WIDTH)

/*
 * HyperLogLog is an approximate technique for computing the number of distinct
 * entries in a set.  Importantly, it does this by using a fixed amount of
 * memory.  See the 2007 paper "HyperLogLog: the analysis of a near-optimal
 * cardinality estimation algorithm" for more.
 *
 * Instead of a single counter for every bits register, we have a timestamp
 * for every valid number of bits we can encounter. Every time we encounter
 * a certain number of bits, we update the timestamp in those registers to
 * the current timestamp.
*
 * We can query the sketch's stored cardinality for the range of some timestamp
 * up to now: For each register, we return the highest bits bucket that has a
 * modified timestamp >= the query timestamp. This value is the number of bits
 * for this register in the normal HLL calculation.
 *
 * The memory usage is 2^B * (C + 1) * sizeof(TimestampTz), or 184kiB.
 * Usage could be halved if we decide to reduce the required time dimension
 * precision; as 32 bits in second precision should be enough for statistics.
 * However, that is not yet implemented.
 */
typedef struct HyperLogLogState
{
	/* regs[register][rank - 1] = time that rank was last seen; 0 = never */
	TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1];
} HyperLogLogState;

/* Zero all register timestamps */
extern void initSHLL(HyperLogLogState *cState);
/* Record an element, given its 32-bit hash */
extern void addSHLL(HyperLogLogState *cState, uint32 hash);
/* Estimate distinct elements over the last N seconds; (time_t) -1 = all */
extern double estimateSHLL(HyperLogLogState *cState, time_t dutration);

#endif


================================================
FILE: pgxn/neon/libpagestore.c
================================================
/*-------------------------------------------------------------------------
 *
 * libpagestore.c
 *	  Handles network communications with the remote pagestore.
*
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include
#include
#include

#include "libpq-int.h"

#include "access/xlog.h"
#include "common/hashfn.h"
#include "fmgr.h"
#include "libpq-fe.h"
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "portability/instr_time.h"
#include "postmaster/interrupt.h"
#include "storage/buf_internals.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lwlock.h"
#include "storage/pg_shmem.h"
#include "utils/guc.h"
#include "utils/memutils.h"

#include "neon.h"
#include "neon_perf_counters.h"
#include "neon_utils.h"
#include "pagestore_client.h"
#include "walproposer.h"

#ifdef __linux__
#include
#include
#endif

#define PageStoreTrace DEBUG5

/*
 * Bounds, in microseconds, of the per-shard reconnect delay used by
 * pageserver_connect(): the delay starts at the minimum and is doubled on
 * every attempt until it reaches the maximum.
 */
#define MIN_RECONNECT_INTERVAL_USEC 1000
#define MAX_RECONNECT_INTERVAL_USEC 1000000

/*
 * Possible values of neon_compute_mode. pageserver_connect() forwards the
 * selected mode to the pageserver as a "-c neon.compute_mode=..." connection
 * option.
 */
enum NeonComputeMode {
    CP_MODE_PRIMARY = 0,
    CP_MODE_REPLICA,
    CP_MODE_STATIC
};

/*
 * Name <-> value table for NeonComputeMode, in the config_enum_entry layout.
 * The list is terminated by the entry whose name is NULL.
 */
static const struct config_enum_entry neon_compute_modes[] = {
    {"primary", CP_MODE_PRIMARY, false},
    {"replica", CP_MODE_REPLICA, false},
    {"static", CP_MODE_STATIC, false},
    {NULL, 0, false}
};

/* GUCs */
char       *neon_timeline;
char       *neon_tenant;
char       *neon_project_id;
char       *neon_branch_id;
char       *neon_endpoint_id;
int32       max_cluster_size;
char       *pageserver_connstring;
char       *neon_auth_token;

int         readahead_buffer_size = 128;
int         flush_every_n_requests = 8;

/* Selects the "pagestream_v2" / "pagestream_v3" command in pageserver_connect() */
int         neon_protocol_version = 3;

static int  neon_compute_mode = 0;
/*
 * pageserver_send() reports failed connection attempts at LOG until this many
 * attempts have been made for a shard, and at ERROR after that.
 */
static int  max_reconnect_attempts = 60;
/* Copied into every ShardMap built by ParseShardMap() */
static int  stripe_size;
/* Number of external file descriptors reserved so far by load_shard_map() */
static int  max_sockets;

/* In milliseconds; see call_PQgetCopyData() */
static int  pageserver_response_log_timeout = 10000;
/* 2.5 minutes.
A bit higher than highest default TCP retransmission timeout */
static int  pageserver_response_disconnect_timeout = 150000;

/*
 * pageserver_send() asks compute_ctl for a configuration refresh once a
 * shard's count of consecutive failed connection attempts exceeds this value.
 */
static int  conf_refresh_reconnect_attempt_threshold = 16;
// Hadron: timeout for refresh errors (1 minute)
static uint64 kRefreshErrorTimeoutUSec = 1 * USECS_PER_MINUTE;

/*
 * Parsed form of the pageserver connection string list: one NUL-terminated
 * connection string per shard, the number of shards, and the stripe size that
 * was in effect when the map was parsed (see ParseShardMap()).
 */
typedef struct
{
    char        connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE];
    size_t      num_shards;
    size_t      stripe_size;
} ShardMap;

/*
 * PagestoreShmemState is kept in shared memory. It contains the connection
 * strings for each shard.
 *
 * The "neon.pageserver_connstring" GUC is marked with the PGC_SIGHUP option,
 * allowing it to be changed using pg_reload_conf(). The control plane can
 * update the connection string if the pageserver crashes, is relocated, or
 * new shards are added. A parsed copy of the current value of the GUC is kept
 * in shared memory, updated by the postmaster, because regular backends don't
 * reload the config during query execution, but we might need to re-establish
 * the pageserver connection with the new connection string even in the middle
 * of a query.
 *
 * The shared memory copy is protected by a lockless algorithm using two
 * atomic counters. The counters allow a backend to quickly check if the value
 * has changed since last access, and to detect and retry copying the value if
 * the postmaster changes the value concurrently. (Postmaster doesn't have a
 * PGPROC entry and therefore cannot use LWLocks.)
 *
 * stripe_size is now also part of ShardMap, although it is defined by a
 * separate GUC. Postgres doesn't provide any mechanism to enforce
 * dependencies between GUCs;
 * that is, we have to rely on the order of GUC definitions in the config file.
* "neon.stripe_size" should be defined prior to "neon.pageserver_connstring"
 */
typedef struct
{
    /* Incremented by the writer before it starts overwriting shard_map */
    pg_atomic_uint64 begin_update_counter;
    /* Incremented by the writer after it has finished overwriting shard_map */
    pg_atomic_uint64 end_update_counter;
    ShardMap    shard_map;
} PagestoreShmemState;

static PagestoreShmemState *pagestore_shared;

/*
 * Value of end_update_counter at the time this backend last acted on a shard
 * map change; see load_shard_map().
 */
static uint64 pagestore_local_counter = 0;

/* States of the per-shard connection state machine in pageserver_connect() */
typedef enum PSConnectionState {
    PS_Disconnected,            /* no connection yet */
    PS_Connecting_Startup,      /* connection starting up */
    PS_Connecting_PageStream,   /* negotiating pagestream */
    PS_Connected,               /* connected, pagestream established */
} PSConnectionState;

/* This backend's per-shard connections */
typedef struct
{
    TimestampTz last_connect_time;      /* read-only debug value */
    TimestampTz last_reconnect_time;    /* when the last connection attempt was started */
    uint32      delay_us;               /* current reconnect backoff delay, in microseconds */
    int         n_reconnect_attempts;   /* consecutive failed attempts; maintained by pageserver_send() */

    /*---
     * Pageserver connection state, i.e.
     *  disconnected: conn == NULL, wes == NULL;
     *  conn_startup: connection initiated, waiting for connection establishing
     *  conn_ps:      PageStream query sent, waiting for confirmation
     *  connected:    PageStream established
     */
    PSConnectionState state;
    PGconn     *conn;

    /* request / response counters for debugging */
    uint64      nrequests_sent;
    uint64      nresponses_received;

    /* State for the receive timeout mechanism in call_PQgetCopyData() */
    instr_time  receive_start_time;     /* when we started waiting */
    instr_time  receive_last_log_time;  /* when we last printed a log message for the wait */
    bool        receive_logged;         /* has the wait been logged */

    /*---
     * WaitEventSet containing:
     *  - WL_SOCKET_READABLE on 'conn'
     *  - WL_LATCH_SET on MyLatch, and
     *  - WL_EXIT_ON_PM_DEATH.
     */
    WaitEventSet *wes_read;
} PageServer;

static uint32 local_request_counter;

/*
 * Build a request id from this backend's PID (shifted into the bits above the
 * low 32) OR'ed with a pre-incremented per-backend 32-bit counter.
 */
#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter)

/* Connection state for every possible shard, indexed by shard number */
static PageServer page_servers[MAX_SHARDS];

static bool pageserver_flush(shardno_t shard_no);
static void pageserver_disconnect(shardno_t shard_no);
static void pageserver_disconnect_shard(shardno_t shard_no);
// HADRON
shardno_t get_num_shards(void);

/*
 * Returns true when both the pagestore_shared pointer and UsedShmemSegAddr
 * are set, i.e. the shared copy of the shard map can be accessed.
 */
static bool
PagestoreShmemIsValid(void)
{
    return pagestore_shared && UsedShmemSegAddr;
}

/*
 * Parse a comma-separated list of connection strings into a ShardMap.
 *
 * If 'result' is NULL, just checks that the input is valid. If the input is
 * not valid, returns false. The contents of *result are undefined in
 * that case, and must not be relied on.
 *
 * The input is rejected (with a LOG message) when it contains more than
 * MAX_SHARDS entries or when an entry does not fit, including its
 * terminating NUL, into MAX_PAGESERVER_CONNSTRING_SIZE bytes.
 *
 * Only an empty *final* entry (empty input, or a trailing comma) is skipped.
 * An empty entry that is followed by a comma is stored as an empty
 * connection string and counted as a shard.
 *
 * On success, result->stripe_size is set from the current value of the
 * file-level 'stripe_size' variable.
 */
static bool
ParseShardMap(const char *connstr, ShardMap *result)
{
    const char *p;
    int         nshards = 0;

    if (result)
        memset(result, 0, sizeof(ShardMap));

    p = connstr;
    nshards = 0;
    for (;;)
    {
        const char *sep;
        size_t      connstr_len;

        /* The current entry runs up to the next comma, or to the end */
        sep = strchr(p, ',');
        connstr_len = sep != NULL ? sep - p : strlen(p);

        if (connstr_len == 0 && sep == NULL)
            break;              /* ignore trailing comma */

        if (nshards >= MAX_SHARDS)
        {
            neon_log(LOG, "Too many shards");
            return false;
        }
        /* '>=' leaves room for the terminating NUL written below */
        if (connstr_len >= MAX_PAGESERVER_CONNSTRING_SIZE)
        {
            neon_log(LOG, "Connection string too long");
            return false;
        }
        if (result)
        {
            memcpy(result->connstring[nshards], p, connstr_len);
            result->connstring[nshards][connstr_len] = '\0';
        }
        nshards++;

        if (sep == NULL)
            break;
        p = sep + 1;
    }
    if (result)
    {
        result->num_shards = nshards;
        result->stripe_size = stripe_size;
    }

    return true;
}

/*
 * GUC check hook: accept the new value only if it parses as a shard map.
 * 'extra' and 'source' are not used.
 */
static bool
CheckPageserverConnstring(char **newval, void **extra, GucSource source)
{
    char       *p = *newval;

    return ParseShardMap(p, NULL);
}

/*
 * GUC assign hook: publish the newly parsed shard map in shared memory.
 *
 * Does nothing when the shared state is not available yet or when running
 * in a process for which IsUnderPostmaster is set. The shared copy (and the
 * two update counters) are only touched when the parsed map differs
 * byte-for-byte from the current shared copy.
 */
static void
AssignPageserverConnstring(const char *newval, void *extra)
{
    ShardMap    shard_map;

    /*
     * Only postmaster updates the copy in shared memory.
     */
    if (!PagestoreShmemIsValid() || IsUnderPostmaster)
        return;

    if (!ParseShardMap(newval, &shard_map))
    {
        /*
         * shouldn't happen, because we already checked the value in
         * CheckPageserverConnstring
         */
        elog(ERROR, "could not parse shard map");
    }

    if (memcmp(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap)) != 0)
    {
        /*
         * Writer side of the lockless protocol: bump the "begin" counter,
         * overwrite the map, then bump the "end" counter. The write barriers
         * keep these three steps in order. Readers (load_shard_map()) retry
         * while the two counters differ or change underneath them.
         */
        pg_atomic_add_fetch_u64(&pagestore_shared->begin_update_counter, 1);
        pg_write_barrier();
        memcpy(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap));
        pg_write_barrier();
        pg_atomic_add_fetch_u64(&pagestore_shared->end_update_counter, 1);
    }
    else
    {
        /* no change */
    }
}

/* BEGIN_HADRON */
/**
 * Return the total number of shards seen in the shard map.
 *
 * Unlike load_shard_map(), this reads num_shards directly from shared memory
 * without consulting the update counters, and it has no side effects on this
 * backend's connections.
 */
shardno_t get_num_shards(void)
{
    const ShardMap *shard_map;

    Assert(pagestore_shared);
    shard_map = &pagestore_shared->shard_map;
    Assert(shard_map != NULL);
    return shard_map->num_shards;
}
/* END_HADRON */

/*
 * Get the current number of shards, and/or the connection string for a
 * particular shard from the shard map in shared memory.
 *
 * If num_shards_p is not NULL, it is set to the current number of shards.
 *
 * If stripe_size_p is not NULL, it is set to the stripe size stored in the
 * shard map.
 *
 * If connstr_p is not NULL, the connection string for 'shard_no' is copied to
 * it. It must point to a buffer at least MAX_PAGESERVER_CONNSTRING_SIZE bytes
 * long. In that case an ERROR is raised if 'shard_no' is not smaller than the
 * current number of shards.
 *
 * As a side-effect, if the shard map in shared memory had changed since the
 * last call, terminates all existing connections to all pageservers.
 */
static void
load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p, size_t* stripe_size_p)
{
    uint64      begin_update_counter;
    uint64      end_update_counter;
    ShardMap   *shard_map = &pagestore_shared->shard_map;
    shardno_t   num_shards;
    /* NB: this local shadows the file-level 'stripe_size' GUC variable */
    size_t      stripe_size;

    /*
     * Postmaster can update the shared memory values concurrently, in which
     * case we would copy a garbled mix of the old and new values. We will
     * detect it because the counter's won't match, and retry. But it's
     * important that we don't do anything within the retry-loop that would
     * depend on the string having valid contents.
     */
    do
    {
        begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
        end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);

        num_shards = shard_map->num_shards;
        stripe_size = shard_map->stripe_size;
        /* the bounds check keeps the copy inside the array even if shard_no is bogus */
        if (connstr_p && shard_no < MAX_SHARDS)
            strlcpy(connstr_p, shard_map->connstring[shard_no], MAX_PAGESERVER_CONNSTRING_SIZE);
        pg_memory_barrier();
    }
    /*
     * Accept the copy only if no update was in progress when we sampled the
     * counters (begin == end) and neither counter moved while we copied.
     */
    while (begin_update_counter != end_update_counter
           || begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter)
           || end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter));

    if (connstr_p && shard_no >= num_shards)
        neon_log(ERROR, "Shard %d is greater or equal than number of shards %d",
                 shard_no, num_shards);

    /*
     * If any of the connection strings changed, reset all connections.
     */
    if (pagestore_local_counter != end_update_counter)
    {
        for (shardno_t i = 0; i < MAX_SHARDS; i++)
        {
            if (page_servers[i].conn)
                pageserver_disconnect(i);
        }
        pagestore_local_counter = end_update_counter;

        /* Reserve file descriptors for sockets */
        while (max_sockets < num_shards)
        {
            max_sockets += 1;
            ReserveExternalFD();
        }
    }

    if (num_shards_p)
        *num_shards_p = num_shards;
    if (stripe_size_p)
        *stripe_size_p = stripe_size;
}

#define MB (1024*1024)

/*
 * Map a buffer tag to the shard that stores it.
 *
 * The relation number and the stripe index (blockNum / stripe_size) are
 * hashed together and reduced modulo the current number of shards. Calling
 * this may drop existing pageserver connections as a side effect of
 * load_shard_map().
 *
 * NOTE(review): stripe_size and n_shards are used as divisors without a
 * check; this assumes the shard map has been populated with nonzero values
 * before the first call - confirm.
 */
shardno_t
get_shard_number(BufferTag *tag)
{
    shardno_t   n_shards;
    size_t      stripe_size;
    uint32      hash;

    load_shard_map(0, NULL, &n_shards, &stripe_size);

    /* Only the name of the relation-number field differs between versions */
#if PG_MAJORVERSION_NUM < 16
    hash = murmurhash32(tag->rnode.relNode);
    hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
#else
    hash = murmurhash32(tag->relNumber);
    hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
#endif

    return hash % n_shards;
}

/*
 * Release everything attached to a shard's connection slot and mark it
 * PS_Disconnected: frees the wait event set if there is one, and closes the
 * libpq connection if there is one (counting it in
 * pageserver_disconnects_total). Safe to call on an already-clean slot.
 */
static inline void
CLEANUP_AND_DISCONNECT(PageServer *shard)
{
    if (shard->wes_read)
    {
        FreeWaitEventSet(shard->wes_read);
        shard->wes_read = NULL;
    }
    if (shard->conn)
    {
        MyNeonCounters->pageserver_disconnects_total++;
        PQfinish(shard->conn);
        shard->conn = NULL;
    }

    shard->state = PS_Disconnected;
}

/*
 *
Connect to a pageserver, or continue to try to connect if we're yet to * complete the connection (e.g. due to receiving an earlier cancellation * during connection start). * Returns true if successfully connected; false if the connection failed. * * Throws errors in unrecoverable situations, or when this backend's query * is canceled. */ static bool pageserver_connect(shardno_t shard_no, int elevel) { PageServer *shard = &page_servers[shard_no]; char connstr[MAX_PAGESERVER_CONNSTRING_SIZE]; /* * Get the connection string for this shard. If the shard map has been * updated since we last looked, this will also disconnect any existing * pageserver connections as a side effect. * Note that connstr is used both during connection start, and when we * log the successful connection. */ load_shard_map(shard_no, connstr, NULL, NULL); switch (shard->state) { case PS_Disconnected: { const char *keywords[5]; const char *values[5]; char pid_str[16] = { 0 }; char endpoint_str[36] = { 0 }; int n_pgsql_params; TimestampTz now; int64 us_since_last_attempt; /* Make sure we start with a clean slate */ CLEANUP_AND_DISCONNECT(shard); neon_shard_log(shard_no, DEBUG5, "Connection state: Disconnected"); now = GetCurrentTimestamp(); us_since_last_attempt = (int64) (now - shard->last_reconnect_time); /* * Make sure we don't do exponential backoff with a constant multiplier * of 0 us, as that doesn't really do much for timeouts... * * cf. https://github.com/neondatabase/neon/issues/7897 */ if (shard->delay_us == 0) shard->delay_us = MIN_RECONNECT_INTERVAL_USEC; /* * If we did other tasks between reconnect attempts, then we won't * need to wait as long as a full delay. * * This is a loop to protect against interrupted sleeps. 
*/ while (us_since_last_attempt < shard->delay_us) { pg_usleep(shard->delay_us - us_since_last_attempt); /* At least we should handle cancellations here */ CHECK_FOR_INTERRUPTS(); now = GetCurrentTimestamp(); us_since_last_attempt = (int64) (now - shard->last_reconnect_time); } /* update the delay metric */ shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC); shard->last_reconnect_time = now; /* * Connect using the connection string we got from the * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment * variable was set, use that as the password. * * The connection options are parsed in the order they're given, so when * we set the password before the connection string, the connection string * can override the password from the env variable. Seems useful, although * we don't currently use that capability anywhere. */ n_pgsql_params = 0; /* * Pageserver logs include this in the connection's tracing span. * This allows for reasier log correlation between compute and pageserver. 
*/ keywords[n_pgsql_params] = "application_name"; { int ret = snprintf(pid_str, sizeof(pid_str), "%d", MyProcPid); if (ret < 0 || ret >= (int)(sizeof(pid_str))) elog(FATAL, "stack-allocated buffer too small to hold pid"); } /* lifetime: PQconnectStartParams strdups internally */ values[n_pgsql_params] = (const char*) pid_str; n_pgsql_params++; keywords[n_pgsql_params] = "dbname"; values[n_pgsql_params] = connstr; n_pgsql_params++; if (neon_auth_token) { keywords[n_pgsql_params] = "password"; values[n_pgsql_params] = neon_auth_token; n_pgsql_params++; } { bool param_set = false; switch (neon_compute_mode) { case CP_MODE_PRIMARY: strncpy(endpoint_str, "-c neon.compute_mode=primary", sizeof(endpoint_str)); param_set = true; break; case CP_MODE_REPLICA: strncpy(endpoint_str, "-c neon.compute_mode=replica", sizeof(endpoint_str)); param_set = true; break; case CP_MODE_STATIC: strncpy(endpoint_str, "-c neon.compute_mode=static", sizeof(endpoint_str)); param_set = true; break; } if (param_set) { keywords[n_pgsql_params] = "options"; values[n_pgsql_params] = endpoint_str; n_pgsql_params++; } } keywords[n_pgsql_params] = NULL; values[n_pgsql_params] = NULL; shard->conn = PQconnectStartParams(keywords, values, 1); if (PQstatus(shard->conn) == CONNECTION_BAD) { char *msg = pchomp(PQerrorMessage(shard->conn)); CLEANUP_AND_DISCONNECT(shard); ereport(elevel, (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), errdetail_internal("%s", msg))); pfree(msg); return false; } shard->state = PS_Connecting_Startup; } /* FALLTHROUGH */ case PS_Connecting_Startup: { char *pagestream_query; int ps_send_query_ret; bool connected = false; int poll_result = PGRES_POLLING_WRITING; neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup"); do { switch (poll_result) { default: /* unknown/unused states are handled as a failed connection */ case PGRES_POLLING_FAILED: { char *pqerr = 
PQerrorMessage(shard->conn); char *msg = NULL; neon_shard_log(shard_no, DEBUG5, "POLLING_FAILED"); if (pqerr) msg = pchomp(pqerr); CLEANUP_AND_DISCONNECT(shard); if (msg) { neon_shard_log(shard_no, elevel, "could not connect to pageserver: %s", msg); pfree(msg); } else neon_shard_log(shard_no, elevel, "could not connect to pageserver"); return false; } case PGRES_POLLING_READING: /* Sleep until there's something to do */ while (true) { int rc = WaitLatchOrSocket(MyLatch, WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_READABLE, PQsocket(shard->conn), 0, WAIT_EVENT_NEON_PS_STARTING); elog(DEBUG5, "PGRES_POLLING_READING=>%d", rc); if (rc & WL_LATCH_SET) { ResetLatch(MyLatch); /* query cancellation, backend shutdown */ CHECK_FOR_INTERRUPTS(); } if (rc & WL_SOCKET_READABLE) break; } /* PQconnectPoll() handles the socket polling state updates */ break; case PGRES_POLLING_WRITING: /* Sleep until there's something to do */ while (true) { int rc = WaitLatchOrSocket(MyLatch, WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_WRITEABLE, PQsocket(shard->conn), 0, WAIT_EVENT_NEON_PS_STARTING); elog(DEBUG5, "PGRES_POLLING_WRITING=>%d", rc); if (rc & WL_LATCH_SET) { ResetLatch(MyLatch); /* query cancellation, backend shutdown */ CHECK_FOR_INTERRUPTS(); } if (rc & WL_SOCKET_WRITEABLE) break; } /* PQconnectPoll() handles the socket polling state updates */ break; case PGRES_POLLING_OK: neon_shard_log(shard_no, DEBUG5, "POLLING_OK"); connected = true; break; } poll_result = PQconnectPoll(shard->conn); elog(DEBUG5, "PQconnectPoll=>%d", poll_result); } while (!connected); /* No more polling needed; connection succeeded */ shard->last_connect_time = GetCurrentTimestamp(); #if PG_MAJORVERSION_NUM >= 17 shard->wes_read = CreateWaitEventSet(NULL, 3); #else shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3); #endif AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL); 
AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL); switch (neon_protocol_version) { case 3: pagestream_query = psprintf("pagestream_v3 %s %s", neon_tenant, neon_timeline); break; case 2: pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); break; default: elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version); } if (PQstatus(shard->conn) == CONNECTION_BAD) { char *msg = pchomp(PQerrorMessage(shard->conn)); CLEANUP_AND_DISCONNECT(shard); ereport(elevel, (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), errdetail_internal("%s", msg))); pfree(msg); return false; } ps_send_query_ret = PQsendQuery(shard->conn, pagestream_query); pfree(pagestream_query); if (ps_send_query_ret != 1) { CLEANUP_AND_DISCONNECT(shard); neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver"); return false; } shard->state = PS_Connecting_PageStream; } /* FALLTHROUGH */ case PS_Connecting_PageStream: { neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_PageStream"); if (PQstatus(shard->conn) == CONNECTION_BAD) { char *msg = pchomp(PQerrorMessage(shard->conn)); CLEANUP_AND_DISCONNECT(shard); ereport(elevel, (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), errdetail_internal("%s", msg))); pfree(msg); return false; } while (PQisBusy(shard->conn)) { WaitEvent event; /* Sleep until there's something to do */ (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, WAIT_EVENT_NEON_PS_CONFIGURING); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); /* Data available in socket? 
*/ if (event.events & WL_SOCKET_READABLE) { if (!PQconsumeInput(shard->conn)) { char *msg = pchomp(PQerrorMessage(shard->conn)); CLEANUP_AND_DISCONNECT(shard); neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s", msg); pfree(msg); return false; } } } shard->state = PS_Connected; shard->nrequests_sent = 0; shard->nresponses_received = 0; INSTR_TIME_SET_ZERO(shard->receive_start_time); INSTR_TIME_SET_ZERO(shard->receive_last_log_time); shard->receive_logged = false; } /* FALLTHROUGH */ case PS_Connected: /* * We successfully connected. Future connections to this PageServer * will do fast retries again, with exponential backoff. */ shard->delay_us = MIN_RECONNECT_INTERVAL_USEC; neon_shard_log(shard_no, DEBUG5, "Connection state: Connected"); neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version); return true; default: neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state); } pg_unreachable(); } static void get_socket_stats(int socketfd, int *sndbuf, int *recvbuf) { *sndbuf = -1; *recvbuf = -1; #ifdef __linux__ /* * get kernel's send and recv queue size via ioctl * https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27 */ if (socketfd != -1) { int ioctl_err; ioctl_err = ioctl(socketfd, SIOCOUTQ, sndbuf); if (ioctl_err!= 0) { *sndbuf = -errno; } ioctl_err = ioctl(socketfd, FIONREAD, recvbuf); if (ioctl_err != 0) { *recvbuf = -errno; } } #endif } /* * Tries to get the local port of a socket. Sets 'port' to -1 on error. */ static void get_local_port(int socketfd, int *port) { struct sockaddr_in addr; socklen_t addr_len = sizeof(addr); memset(&addr, 0, addr_len); if (getsockname(socketfd, (struct sockaddr*) &addr, &addr_len) == 0) { *port = ntohs(addr.sin_port); } else { *port = -1; } } /* * A wrapper around PQgetCopyData that checks for interrupts while sleeping. 
*/ static int call_PQgetCopyData(shardno_t shard_no, char **buffer) { int ret; PageServer *shard = &page_servers[shard_no]; PGconn *pageserver_conn = shard->conn; instr_time now, since_start, since_last_log; retry: ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ ); if (ret == 0) { WaitEvent occurred_event; int noccurred; double log_timeout, disconnect_timeout; long timeout; /* * Calculate time elapsed since the start, and since the last progress * log message. On first call, remember the start time. */ INSTR_TIME_SET_CURRENT(now); if (INSTR_TIME_IS_ZERO(shard->receive_start_time)) { shard->receive_start_time = now; INSTR_TIME_SET_ZERO(since_start); shard->receive_last_log_time = now; INSTR_TIME_SET_ZERO(since_last_log); shard->receive_logged = false; } else { since_start = now; INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); since_last_log = now; INSTR_TIME_SUBTRACT(since_last_log, shard->receive_last_log_time); } /* Sleep until the log or disconnect timeout is reached. */ log_timeout = Max(0, (double) pageserver_response_log_timeout - INSTR_TIME_GET_MILLISEC(since_last_log)); disconnect_timeout = Max(0, (double) pageserver_response_disconnect_timeout - INSTR_TIME_GET_MILLISEC(since_start)); timeout = (long) ceil(Min(log_timeout, disconnect_timeout)); noccurred = WaitEventSetWait(shard->wes_read, timeout, &occurred_event, 1, WAIT_EVENT_NEON_PS_READ); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); /* Data available in socket? 
*/ if (noccurred > 0 && (occurred_event.events & WL_SOCKET_READABLE) != 0) { if (!PQconsumeInput(pageserver_conn)) { char *msg = pchomp(PQerrorMessage(pageserver_conn)); neon_shard_log(shard_no, LOG, "could not get response from pageserver: %s", msg); pfree(msg); return -1; } goto retry; } /* Timeout was reached, or we were interrupted for some other reason */ INSTR_TIME_SET_CURRENT(now); since_last_log = now; INSTR_TIME_SUBTRACT(since_last_log, shard->receive_last_log_time); since_start = now; INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); /* * As a debugging aid, if we don't get a response to a pageserver request * for a long time, print a log message. * * The default neon.pageserver_response_log_timeout value, 10 s, is * very generous. Normally we expect a response in a few * milliseconds. We have metrics to track latencies in normal ranges, * but in the cases that take exceptionally long, it's useful to log * the exact timestamps. */ if (INSTR_TIME_GET_MILLISEC(since_last_log) >= pageserver_response_log_timeout) { int port; int sndbuf; int recvbuf; uint64* max_wait; get_local_port(PQsocket(pageserver_conn), &port); get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf); neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket port=%d sndbuf=%d recvbuf=%d) (conn start=%d end=%d)", INSTR_TIME_GET_DOUBLE(since_start), shard->nrequests_sent, shard->nresponses_received, port, sndbuf, recvbuf, pageserver_conn->inStart, pageserver_conn->inEnd); shard->receive_last_log_time = now; MyNeonCounters->compute_getpage_stuck_requests_total += !shard->receive_logged; shard->receive_logged = true; max_wait = &MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms; *max_wait = Max(*max_wait, INSTR_TIME_GET_MILLISEC(since_start)); } /* * If an even longer time has passed without receiving a response from * the pageserver, disconnect. 
That triggers a reconnection attempt * in the caller. * * If this happens, the pageserver is likely dead and isn't coming * back, or there's some kind of a network glitch and the connection * is permanently gone. Without this, if the pageserver or the network * connection is dead, it could take a very long time (15 minutes or * more) until the TCP keepalive timeout notices that. Even if we * would in fact get a response if we just waited a little longer, * there's a good chance that we'll get the response sooner by * reconnecting. */ if (INSTR_TIME_GET_MILLISEC(since_start) >= pageserver_response_disconnect_timeout) { int port; get_local_port(PQsocket(pageserver_conn), &port); neon_shard_log(shard_no, LOG, "no response from pageserver for %0.3f s, disconnecting (socket port=%d)", INSTR_TIME_GET_DOUBLE(since_start), port); MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0; pageserver_disconnect(shard_no); return -1; } goto retry; } /* * If we logged earlier that the response is taking a long time, log * another message when the response is finally received. */ if (shard->receive_logged) { INSTR_TIME_SET_CURRENT(now); since_start = now; INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); neon_shard_log(shard_no, LOG, "received response from pageserver after %0.3f s", INSTR_TIME_GET_DOUBLE(since_start)); } INSTR_TIME_SET_ZERO(shard->receive_start_time); INSTR_TIME_SET_ZERO(shard->receive_last_log_time); shard->receive_logged = false; MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0; return ret; } /* * Reset prefetch and drop connection to the shard. * It also drops connection to all other shards involved in prefetch, through * prefetch_on_ps_disconnect(). */ static void pageserver_disconnect(shardno_t shard_no) { /* * If the connection to any pageserver is lost, we throw away the * whole prefetch queue, even for other pageservers. It should not * cause big problems, because connection loss is supposed to be a * rare event. 
*/ prefetch_on_ps_disconnect(); pageserver_disconnect_shard(shard_no); } /* * Disconnect from specified shard */ static void pageserver_disconnect_shard(shardno_t shard_no) { PageServer *shard = &page_servers[shard_no]; /* * If anything goes wrong while we were sending a request, it's not clear * what state the connection is in. For example, if we sent the request * but didn't receive a response yet, we might receive the response some * time later after we have already sent a new unrelated request. Close * the connection to avoid getting confused. * Similarly, even when we're in PS_DISCONNECTED, we may have junk to * clean up: It is possible that we encountered an error allocating any * of the wait event sets or the psql connection, or failed when we tried * to attach wait events to the WaitEventSets. */ CLEANUP_AND_DISCONNECT(shard); shard->state = PS_Disconnected; } // BEGIN HADRON /* * Nudge compute_ctl to refresh our configuration. Called when we suspect we may be * connecting to the wrong pageservers due to a stale configuration. * * This is a best-effort operation. If we couldn't send the local loopback HTTP request * to compute_ctl or if the request fails for any reason, we just log the error and move * on. */ extern int hadron_extension_server_port; // The timestamp (usec) of the first error that occurred while trying to refresh the configuration. // Will be reset to 0 after a successful refresh. static uint64 first_recorded_refresh_error_usec = 0; // Request compute_ctl to refresh the configuration. This operation may fail, e.g., if the compute_ctl // is already in the configuration state. The function returns true if the caller needs to cancel the // current query to avoid dead/live lock. 
static bool hadron_request_configuration_refresh() { static CURL *handle = NULL; CURLcode res; char *compute_ctl_url; bool cancel_query = false; if (!lakebase_mode) return false; if (handle == NULL) { handle = alloc_curl_handle(); curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ ); curl_easy_setopt(handle, CURLOPT_POSTFIELDS, ""); } // Set the URL compute_ctl_url = psprintf("http://localhost:%d/refresh_configuration", hadron_extension_server_port); elog(LOG, "Sending refresh configuration request to compute_ctl: %s", compute_ctl_url); curl_easy_setopt(handle, CURLOPT_URL, compute_ctl_url); res = curl_easy_perform(handle); if (res != CURLE_OK ) { elog(WARNING, "refresh_configuration request failed: %s\n", curl_easy_strerror(res)); } else { long http_code = 0; curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &http_code); if ( res != CURLE_OK ) { elog(WARNING, "compute_ctl refresh_configuration request getinfo failed: %s\n", curl_easy_strerror(res)); } else { elog(LOG, "compute_ctl refresh_configuration got HTTP response: %ld\n", http_code); if( http_code == 200 ) { first_recorded_refresh_error_usec = 0; } else { if (first_recorded_refresh_error_usec == 0) { first_recorded_refresh_error_usec = GetCurrentTimestamp(); } else if(GetCurrentTimestamp() - first_recorded_refresh_error_usec > kRefreshErrorTimeoutUSec) { { first_recorded_refresh_error_usec = 0; cancel_query = true; } } } } } // In regular Postgres usage, it is not necessary to manually free memory allocated by palloc (psprintf) because // it will be cleaned up after the "memory context" is reset (e.g. after the query or the transaction is finished). // However, the number of times this function gets called during a single query/transaction can be unbounded due to // the various retry loops around calls to pageservers. Therefore, we need to manually free this memory here. 
if (compute_ctl_url != NULL) { pfree(compute_ctl_url); } return cancel_query; } // END HADRON static bool pageserver_send(shardno_t shard_no, NeonRequest *request) { StringInfoData req_buff; PageServer *shard = &page_servers[shard_no]; PGconn *pageserver_conn; MyNeonCounters->pageserver_requests_sent_total++; /* If the connection was lost for some reason, reconnect */ if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD) { neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection"); pageserver_disconnect(shard_no); pageserver_conn = NULL; } request->reqid = GENERATE_REQUEST_ID(); req_buff = nm_pack_request(request); /* * If pageserver is stopped, the connections from compute node are broken. * The compute node doesn't notice that immediately, but it will cause the * next request to fail, usually on the next query. That causes * user-visible errors if pageserver is restarted, or the tenant is moved * from one pageserver to another. See * https://github.com/neondatabase/neon/issues/1138 So try to reestablish * connection in case of failure. */ if (shard->state != PS_Connected) { while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) { shard->n_reconnect_attempts += 1; if (shard->n_reconnect_attempts > conf_refresh_reconnect_attempt_threshold && hadron_request_configuration_refresh() ) { neon_shard_log(shard_no, ERROR, "request failed too many times, cancelling query"); } } shard->n_reconnect_attempts = 0; } else { Assert(shard->conn != NULL); } pageserver_conn = shard->conn; /* * Send request. * * In principle, this could block if the output buffer is full, and we * should use async mode and check for interrupts while waiting. In * practice, our requests are small enough to always fit in the output and * TCP buffer. * * Note that this also will fail when the connection is in the * PGRES_POLLING_WRITING state. 
It's kinda dirty to disconnect at this * point, but on the grand scheme of things it's only a small issue. */ shard->nrequests_sent++; if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) { char *msg = pchomp(PQerrorMessage(pageserver_conn)); pageserver_disconnect(shard_no); neon_shard_log(shard_no, LOG, "pageserver_send disconnected: failed to send page request (try to reconnect): %s", msg); pfree(msg); pfree(req_buff.data); return false; } pfree(req_buff.data); if (message_level_is_interesting(PageStoreTrace)) { char *msg = nm_to_string((NeonMessage *) request); neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg); pfree(msg); } return true; } static NeonResponse * pageserver_receive(shardno_t shard_no) { StringInfoData resp_buff; NeonResponse *resp; PageServer *shard = &page_servers[shard_no]; PGconn *pageserver_conn = shard->conn; /* read response */ int rc; if (shard->state != PS_Connected) { neon_shard_log(shard_no, LOG, "pageserver_receive: returning NULL for non-connected pageserver connection: 0x%02x", shard->state); return NULL; } Assert(pageserver_conn); rc = call_PQgetCopyData(shard_no, &resp_buff.data); if (rc >= 0) { /* call_PQgetCopyData handles rc == 0 */ Assert(rc > 0); PG_TRY(); { resp_buff.len = rc; resp_buff.cursor = 0; resp = nm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); } PG_CATCH(); { neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response"); pageserver_disconnect(shard_no); PG_RE_THROW(); } PG_END_TRY(); if (message_level_is_interesting(PageStoreTrace)) { char *msg = nm_to_string((NeonMessage *) resp); neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg); pfree(msg); } } else if (rc == -1 && shard->state == PS_Disconnected) { /* If the state is 'Disconnected', the disconnection message was already logged */ resp = NULL; } else if (rc == -1) { char *msg = pchomp(PQerrorMessage(pageserver_conn)); neon_shard_log(shard_no, LOG, "pageserver_receive 
disconnect: psql end of copy data: %s", msg); pfree(msg); pageserver_disconnect(shard_no); resp = NULL; /* * Always poke compute_ctl to request a configuration refresh if we have issues receiving data from pageservers after * successfully connecting to it. It could be an indication that we are connecting to the wrong pageservers (e.g. PS * is in secondary mode or otherwise refuses to respond our request). */ hadron_request_configuration_refresh(); } else if (rc == -2) { char *msg = pchomp(PQerrorMessage(pageserver_conn)); pageserver_disconnect(shard_no); hadron_request_configuration_refresh(); neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg); } else { pageserver_disconnect(shard_no); hadron_request_configuration_refresh(); neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc); } shard->nresponses_received++; return (NeonResponse *) resp; } static NeonResponse * pageserver_try_receive(shardno_t shard_no) { StringInfoData resp_buff; NeonResponse *resp; PageServer *shard = &page_servers[shard_no]; PGconn *pageserver_conn = shard->conn; int rc; if (shard->state != PS_Connected) return NULL; Assert(pageserver_conn); rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async */); if (rc == 0) { if (!PQconsumeInput(shard->conn)) { return NULL; } rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async */); } if (rc == 0) return NULL; else if (rc > 0) { PG_TRY(); { resp_buff.len = rc; resp_buff.cursor = 0; resp = nm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); } PG_CATCH(); { neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response"); pageserver_disconnect(shard_no); /* * Malformed responses from PageServer are a reason to raise * errors and cancel transactions. 
*/ PG_RE_THROW(); } PG_END_TRY(); if (message_level_is_interesting(PageStoreTrace)) { char *msg = nm_to_string((NeonMessage *) resp); neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg); pfree(msg); } } else if (rc == -1) { neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn))); pageserver_disconnect(shard_no); resp = NULL; hadron_request_configuration_refresh(); } else if (rc == -2) { char *msg = pchomp(PQerrorMessage(pageserver_conn)); pageserver_disconnect(shard_no); hadron_request_configuration_refresh(); neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: could not read COPY data: %s", msg); resp = NULL; } else { pageserver_disconnect(shard_no); hadron_request_configuration_refresh(); neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc); } /* * Always poke compute_ctl to request a configuration refresh if we have issues receiving data from pageservers after * successfully connecting to it. It could be an indication that we are connecting to the wrong pageservers (e.g. PS * is in secondary mode or otherwise refuses to respond our request). 
*/ if ( rc < 0 && hadron_request_configuration_refresh() ) { neon_shard_log(shard_no, ERROR, "refresh_configuration request failed, cancelling query"); } shard->nresponses_received++; return (NeonResponse *) resp; } static bool pageserver_flush(shardno_t shard_no) { PGconn *pageserver_conn = page_servers[shard_no].conn; if (page_servers[shard_no].state != PS_Connected) { neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected"); } else { MyNeonCounters->pageserver_send_flushes_total++; if (PQflush(pageserver_conn)) { char *msg = pchomp(PQerrorMessage(pageserver_conn)); pageserver_disconnect(shard_no); neon_shard_log(shard_no, LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg); pfree(msg); return false; } } return true; } page_server_api api = { .send = pageserver_send, .flush = pageserver_flush, .receive = pageserver_receive, .try_receive = pageserver_try_receive, .disconnect = pageserver_disconnect_shard }; static bool check_neon_id(char **newval, void **extra, GucSource source) { uint8 id[16]; return **newval == '\0' || HexDecodeString(id, *newval, 16); } void PagestoreShmemInit(void) { bool found; pagestore_shared = ShmemInitStruct("libpagestore shared state", sizeof(PagestoreShmemState), &found); if (!found) { pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0); pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0); memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap)); AssignPageserverConnstring(pageserver_connstring, NULL); } } void PagestoreShmemRequest(void) { RequestAddinShmemSpace(sizeof(PagestoreShmemState)); } /* * Module initialization function */ void pg_init_libpagestore(void) { DefineCustomStringVariable("neon.pageserver_connstring", "connection string to the page server", NULL, &pageserver_connstring, "", PGC_SIGHUP, 0, /* no flags required */ CheckPageserverConnstring, AssignPageserverConnstring, NULL); DefineCustomStringVariable("neon.timeline_id", "Neon timeline_id the server is 
running on", NULL, &neon_timeline, "", PGC_POSTMASTER, 0, /* no flags required */ check_neon_id, NULL, NULL); DefineCustomStringVariable("neon.tenant_id", "Neon tenant_id the server is running on", NULL, &neon_tenant, "", PGC_POSTMASTER, 0, /* no flags required */ check_neon_id, NULL, NULL); DefineCustomStringVariable("neon.project_id", "Neon project_id the server is running on", NULL, &neon_project_id, "", PGC_POSTMASTER, 0, /* no flags required */ NULL, NULL, NULL); DefineCustomStringVariable("neon.branch_id", "Neon branch_id the server is running on", NULL, &neon_branch_id, "", PGC_POSTMASTER, 0, /* no flags required */ NULL, NULL, NULL); DefineCustomStringVariable("neon.endpoint_id", "Neon endpoint_id the server is running on", NULL, &neon_endpoint_id, "", PGC_POSTMASTER, 0, /* no flags required */ NULL, NULL, NULL); DefineCustomIntVariable("neon.stripe_size", "sharding stripe size", NULL, &stripe_size, 2048, 1, INT_MAX, PGC_SIGHUP, GUC_UNIT_BLOCKS, NULL, NULL, NULL); DefineCustomIntVariable("neon.max_cluster_size", "cluster size limit", NULL, &max_cluster_size, -1, -1, INT_MAX, PGC_SIGHUP, GUC_UNIT_MB, NULL, NULL, NULL); DefineCustomIntVariable("neon.flush_output_after", "Flush the output buffer after every N unflushed requests", NULL, &flush_every_n_requests, 8, -1, INT_MAX, PGC_USERSET, 0, /* no flags required */ NULL, NULL, NULL); DefineCustomIntVariable("neon.max_reconnect_attempts", "Maximal attempts to reconnect to pages server (with 1 second timeout)", NULL, &max_reconnect_attempts, 60, 0, INT_MAX, PGC_USERSET, 0, NULL, NULL, NULL); DefineCustomIntVariable("neon.readahead_buffer_size", "number of prefetches to buffer", "This buffer is used to hold and manage prefetched " "data; so it is important that this buffer is at " "least as large as the configured value of all " "tablespaces' effective_io_concurrency and " "maintenance_io_concurrency, and your sessions' " "values for these settings.", &readahead_buffer_size, 128, 16, 1024, PGC_USERSET, 0, /* no 
flags required */ NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL); DefineCustomIntVariable("neon.readahead_getpage_pull_timeout", "readahead response pull timeout", "Time between active tries to pull data from the " "PageStream connection when we have pages which " "were read ahead but not yet received.", &readahead_getpage_pull_timeout_ms, 50, 0, 5 * 60 * 1000, PGC_USERSET, GUC_UNIT_MS, NULL, NULL, NULL); DefineCustomIntVariable("neon.protocol_version", "Version of compute<->page server protocol", NULL, &neon_protocol_version, 3, /* use protocol version 3 */ 2, /* min */ 3, /* max */ PGC_SU_BACKEND, 0, /* no flags required */ NULL, NULL, NULL); DefineCustomIntVariable("hadron.conf_refresh_reconnect_attempt_threshold", "Threshold of the number of consecutive failed pageserver " "connection attempts (per shard) before signaling " "compute_ctl for a configuration refresh.", NULL, &conf_refresh_reconnect_attempt_threshold, 16, 0, INT_MAX, PGC_USERSET, 0, NULL, NULL, NULL); DefineCustomIntVariable("neon.pageserver_response_log_timeout", "pageserver response log timeout", "If the pageserver doesn't respond to a request within this timeout, " "a message is printed to the log.", &pageserver_response_log_timeout, 10000, 100, INT_MAX, PGC_SUSET, GUC_UNIT_MS, NULL, NULL, NULL); DefineCustomIntVariable("neon.pageserver_response_disconnect_timeout", "pageserver response diconnect timeout", "If the pageserver doesn't respond to a request within this timeout, " "disconnect and reconnect.", &pageserver_response_disconnect_timeout, 150000, 100, INT_MAX, PGC_SUSET, GUC_UNIT_MS, NULL, NULL, NULL); DefineCustomEnumVariable( "neon.compute_mode", "The compute endpoint node type", NULL, &neon_compute_mode, CP_MODE_PRIMARY, neon_compute_modes, PGC_POSTMASTER, 0, NULL, NULL, NULL); if (page_server != NULL) neon_log(ERROR, "libpagestore already loaded"); neon_log(PageStoreTrace, "libpagestore already loaded"); page_server = &api; /* * Retrieve the auth token to use when connecting 
to pageserver and * safekeepers */ neon_auth_token = getenv("NEON_AUTH_TOKEN"); if (neon_auth_token) neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable"); if (pageserver_connstring[0]) { neon_log(PageStoreTrace, "set neon_smgr hook"); smgr_hook = smgr_neon; smgr_init_hook = smgr_init_neon; dbsize_hook = neon_dbsize; } memset(page_servers, 0, sizeof(page_servers)); } ================================================ FILE: pgxn/neon/libpqwalproposer.h ================================================ /* * Interface to set of libpq wrappers walproposer and neon_walreader need. * Similar to libpqwalreceiver, but it has blocking connection establishment and * pqexec which don't fit us. Implementation is at walproposer_pg.c. */ #ifndef ___LIBPQWALPROPOSER_H__ #define ___LIBPQWALPROPOSER_H__ /* Re-exported and modified ExecStatusType */ typedef enum { /* We received a single CopyBoth result */ WP_EXEC_SUCCESS_COPYBOTH, /* * Any success result other than a single CopyBoth was received. The * specifics of the result were already logged, but it may be useful to * provide an error message indicating which safekeeper messed up. * * Do not expect PQerrorMessage to be appropriately set. */ WP_EXEC_UNEXPECTED_SUCCESS, /* * No result available at this time. Wait until read-ready, then call * again. Internally, this is returned when PQisBusy indicates that * PQgetResult would block. */ WP_EXEC_NEEDS_INPUT, /* Catch-all failure. Check PQerrorMessage. */ WP_EXEC_FAILED, } WalProposerExecStatusType; /* Possible return values from walprop_async_read */ typedef enum { /* The full read was successful. buf now points to the data */ PG_ASYNC_READ_SUCCESS, /* * The read is ongoing. Wait until the connection is read-ready, then try * again. */ PG_ASYNC_READ_TRY_AGAIN, /* Reading failed. 
Check PQerrorMessage(conn) */ PG_ASYNC_READ_FAIL, } PGAsyncReadResult; /* Possible return values from walprop_async_write */ typedef enum { /* The write fully completed */ PG_ASYNC_WRITE_SUCCESS, /* * The write started, but you'll need to call PQflush some more times to * finish it off. We just tried, so it's best to wait until the connection * is read- or write-ready to try again. * * If it becomes read-ready, call PQconsumeInput and flush again. If it * becomes write-ready, just call PQflush. */ PG_ASYNC_WRITE_TRY_FLUSH, /* Writing failed. Check PQerrorMessage(conn) */ PG_ASYNC_WRITE_FAIL, } PGAsyncWriteResult; /* * This header is included by walproposer.h to define walproposer_api; if we're * building walproposer without pg, ignore libpq part, leaving only interface * types. */ #ifndef WALPROPOSER_LIB #include "libpq-fe.h" /* * Sometimes working directly with underlying PGconn is simpler, export the * whole thing for simplicity. */ typedef struct WalProposerConn { PGconn *pg_conn; bool is_nonblocking; /* whether the connection is non-blocking */ char *recvbuf; /* last received CopyData message from * walprop_async_read */ } WalProposerConn; extern WalProposerConn *libpqwp_connect_start(char *conninfo); extern bool libpqwp_send_query(WalProposerConn *conn, char *query); extern WalProposerExecStatusType libpqwp_get_query_result(WalProposerConn *conn); extern PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount); extern void libpqwp_disconnect(WalProposerConn *conn); #endif /* WALPROPOSER_LIB */ #endif /* ___LIBPQWALPROPOSER_H__ */ ================================================ FILE: pgxn/neon/logical_replication_monitor.c ================================================ #include "postgres.h" #include #include #include #include #include #include "miscadmin.h" #include "postmaster/bgworker.h" #include "postmaster/interrupt.h" #include "replication/slot.h" #include "storage/fd.h" #include "storage/procsignal.h" #include 
"tcop/tcopprot.h" #include "utils/guc.h" #include "utils/wait_event.h" #include "logical_replication_monitor.h" #define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */ static int logical_replication_max_snap_files = 10000; /* * According to Chi (shyzh), the pageserver _should_ be good with 10 MB worth of * snapshot files. Let's use 8 MB since 8 is a power of 2. */ static int logical_replication_max_logicalsnapdir_size = 8000; /* * A primitive description of a logical snapshot file including the LSN of the * file and its size. */ typedef struct SnapDesc { XLogRecPtr lsn; off_t sz; } SnapDesc; PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg); /* * Sorts an array of snapshot descriptors by their LSN. */ static int SnapDescComparator(const void *a, const void *b) { const SnapDesc *desc1 = a; const SnapDesc *desc2 = b; if (desc1->lsn < desc2->lsn) return 1; else if (desc1->lsn == desc2->lsn) return 0; else return -1; } /* * Look at .snap files and calculate minimum allowed restart_lsn of slot so that * next gc would leave not more than logical_replication_max_snap_files; all * slots having lower restart_lsn should be dropped. 
*/ static XLogRecPtr get_snapshots_cutoff_lsn(void) { /* PG 18 has a constant defined for this, PG_LOGICAL_SNAPSHOTS_DIR */ #define SNAPDIR "pg_logical/snapshots" DIR *dirdesc; int dirdesc_fd; struct dirent *de; size_t snapshot_index = 0; SnapDesc *snapshot_descriptors; size_t descriptors_allocated = 1024; XLogRecPtr cutoff = 0; off_t logicalsnapdir_size = 0; const int logical_replication_max_logicalsnapdir_size_bytes = logical_replication_max_logicalsnapdir_size * 1000; if (logical_replication_max_snap_files < 0 && logical_replication_max_logicalsnapdir_size < 0) return 0; snapshot_descriptors = palloc(sizeof(*snapshot_descriptors) * descriptors_allocated); dirdesc = AllocateDir(SNAPDIR); dirdesc_fd = dirfd(dirdesc); if (dirdesc_fd == -1) ereport(ERROR, errmsg("failed to get a file descriptor for " SNAPDIR ": %m")); /* find all .snap files and get their lsns */ while ((de = ReadDir(dirdesc, SNAPDIR)) != NULL) { uint32 hi; uint32 lo; struct stat st; XLogRecPtr lsn; SnapDesc *desc; if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) continue; if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2) { ereport(LOG, (errmsg("could not parse file name as .snap file \"%s\"", de->d_name))); continue; } lsn = ((uint64) hi) << 32 | lo; elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn)); if (fstatat(dirdesc_fd, de->d_name, &st, 0) == -1) ereport(ERROR, errmsg("failed to get the size of " SNAPDIR "/%s: %m", de->d_name)); if (descriptors_allocated == snapshot_index) { descriptors_allocated *= 2; snapshot_descriptors = repalloc(snapshot_descriptors, sizeof(*snapshot_descriptors) * descriptors_allocated); } desc = &snapshot_descriptors[snapshot_index++]; desc->lsn = lsn; desc->sz = st.st_size; } qsort(snapshot_descriptors, snapshot_index, sizeof(*snapshot_descriptors), SnapDescComparator); /* Are there more snapshot files than specified? 
*/ if (logical_replication_max_snap_files <= snapshot_index) { cutoff = snapshot_descriptors[logical_replication_max_snap_files - 1].lsn; elog(LOG, "ls_monitor: number of snapshot files, %zu, is larger than limit of %d", snapshot_index, logical_replication_max_snap_files); } /* Is the size of the logical snapshots directory larger than specified? * * It's possible we could hit both thresholds, so remove any extra files * first, and then truncate based on size of the remaining files. */ if (logicalsnapdir_size > logical_replication_max_logicalsnapdir_size_bytes) { /* Unfortunately, iterating the directory does not guarantee any order * so we can't cache an index in the preceding loop. */ off_t sz; const XLogRecPtr original = cutoff; sz = snapshot_descriptors[0].sz; for (size_t i = 1; i < logical_replication_max_snap_files; ++i) { if (sz > logical_replication_max_logicalsnapdir_size_bytes) { cutoff = snapshot_descriptors[i - 1].lsn; break; } sz += snapshot_descriptors[i].sz; } if (cutoff != original) elog(LOG, "ls_monitor: " SNAPDIR " is larger than %d KB", logical_replication_max_logicalsnapdir_size); } pfree(snapshot_descriptors); FreeDir(dirdesc); return cutoff; #undef SNAPDIR } void InitLogicalReplicationMonitor(void) { BackgroundWorker bgw; DefineCustomIntVariable( "neon.logical_replication_max_snap_files", "Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.", NULL, &logical_replication_max_snap_files, 10000, -1, INT_MAX, PGC_SIGHUP, 0, NULL, NULL, NULL); DefineCustomIntVariable( "neon.logical_replication_max_logicalsnapdir_size", "Maximum allowed size of the pg_logical/snapshots directory (KB). When exceeded, slots are dropped until the limit is met. 
-1 disables the limit.",
							NULL,
							&logical_replication_max_logicalsnapdir_size,
							8000, -1, INT_MAX,
							PGC_SIGHUP,
							GUC_UNIT_KB,
							NULL, NULL, NULL);

	/* Describe the background worker that runs LogicalSlotsMonitorMain. */
	memset(&bgw, 0, sizeof(bgw));
	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain");
	snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor");
	snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor");
	bgw.bgw_restart_time = 5;
	bgw.bgw_notify_pid = 0;
	bgw.bgw_main_arg = (Datum) 0;

	RegisterBackgroundWorker(&bgw);
}

/*
 * Unused logical replication slots pin WAL and prevent deletion of snapshots.
 * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which
 * need too many .snap files. These files are stored as AUX files, which are a
 * pageserver mechanism for storing non-relation data. AUX files are shipped
 * in the basebackup which is requested by compute_ctl before Postgres starts.
 * The larger the time to retrieve the basebackup, the more likely it is the
 * compute will be killed by the control plane due to a timeout.
 *
 * Every LS_MONITOR_CHECK_INTERVAL milliseconds the worker computes a cutoff
 * LSN with get_snapshots_cutoff_lsn() and drops every in-use logical slot
 * whose restart_lsn is below it, terminating the slot's owner first if there
 * is one.
 */
void
LogicalSlotsMonitorMain(Datum main_arg)
{
	/* Establish signal handlers. */
	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
	pqsignal(SIGHUP, SignalHandlerForConfigReload);
	pqsignal(SIGTERM, die);

	BackgroundWorkerUnblockSignals();

	for (;;)
	{
		XLogRecPtr	cutoff_lsn;

		/* In case of a SIGHUP, just reload the configuration. */
		if (ConfigReloadPending)
		{
			ConfigReloadPending = false;
			ProcessConfigFile(PGC_SIGHUP);
		}

		/* Get the cutoff LSN */
		cutoff_lsn = get_snapshots_cutoff_lsn();
		/* A cutoff of 0 means there is nothing to drop in this round. */
		if (cutoff_lsn > 0)
		{
			for (int i = 0; i < max_replication_slots; i++)
			{
				char		slot_name[NAMEDATALEN];
				ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
				XLogRecPtr	restart_lsn;

				LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);

				/* Consider only active logical replication slots */
				if (!s->in_use || !SlotIsLogical(s))
				{
					LWLockRelease(ReplicationSlotControlLock);
					continue;
				}

				/*
				 * Retrieve the restart LSN to determine if we need to drop the
				 * slot
				 */
				SpinLockAcquire(&s->mutex);
				restart_lsn = s->data.restart_lsn;
				SpinLockRelease(&s->mutex);

				/*
				 * Copy the name while the control lock is still held; the
				 * copy is what gets logged and passed to ReplicationSlotDrop
				 * after the lock is released.
				 */
				strlcpy(slot_name, s->data.name.data, sizeof(slot_name));
				LWLockRelease(ReplicationSlotControlLock);

				if (restart_lsn >= cutoff_lsn)
				{
					elog(LOG, "ls_monitor: not dropping replication slot %s because restart LSN %X/%X is greater than cutoff LSN %X/%X",
						 slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
					continue;
				}

				elog(LOG, "ls_monitor: dropping replication slot %s because restart LSN %X/%X lower than cutoff LSN %X/%X",
					 slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));

				/* now try to drop it, killing owner before, if any */
				for (;;)
				{
					pid_t		active_pid;

					SpinLockAcquire(&s->mutex);
					active_pid = s->active_pid;
					SpinLockRelease(&s->mutex);

					if (active_pid == 0)
					{
						/*
						 * Slot is released, try to drop it. Though of course,
						 * it could have been reacquired, so drop can ERROR
						 * out. Similarly, it could have been dropped in the
						 * meanwhile.
						 *
						 * In principle we could remove pg_try/pg_catch, that
						 * would restart the whole bgworker.
*/ ConditionVariableCancelSleep(); PG_TRY(); { ReplicationSlotDrop(slot_name, true); elog(LOG, "ls_monitor: replication slot %s dropped", slot_name); } PG_CATCH(); { /* log ERROR and reset elog stack */ EmitErrorReport(); FlushErrorState(); elog(LOG, "ls_monitor: failed to drop replication slot %s", slot_name); } PG_END_TRY(); break; } else { /* kill the owner and wait for release */ elog(LOG, "ls_monitor: killing replication slot %s owner %d", slot_name, active_pid); (void) kill(active_pid, SIGTERM); /* We shouldn't get stuck, but to be safe add timeout. */ ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP); } } } } (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, LS_MONITOR_CHECK_INTERVAL, PG_WAIT_EXTENSION); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); } } ================================================ FILE: pgxn/neon/logical_replication_monitor.h ================================================ #ifndef __NEON_LOGICAL_REPLICATION_MONITOR_H__ #define __NEON_LOGICAL_REPLICATION_MONITOR_H__ void InitLogicalReplicationMonitor(void); #endif ================================================ FILE: pgxn/neon/neon--1.0--1.1.sql ================================================ \echo Use "ALTER EXTENSION neon UPDATE TO '1.1'" to load this file. \quit CREATE FUNCTION neon_get_lfc_stats() RETURNS SETOF RECORD AS 'MODULE_PATHNAME', 'neon_get_lfc_stats' LANGUAGE C PARALLEL SAFE; -- Create a view for convenient access. CREATE VIEW neon_lfc_stats AS SELECT P.* FROM neon_get_lfc_stats() AS P (lfc_key text, lfc_value bigint); ================================================ FILE: pgxn/neon/neon--1.0.sql ================================================ \echo Use "CREATE EXTENSION neon" to load this file. 
\quit CREATE FUNCTION pg_cluster_size() RETURNS bigint AS 'MODULE_PATHNAME', 'pg_cluster_size' LANGUAGE C STRICT PARALLEL UNSAFE; CREATE FUNCTION backpressure_lsns( OUT received_lsn pg_lsn, OUT disk_consistent_lsn pg_lsn, OUT remote_consistent_lsn pg_lsn ) RETURNS record AS 'MODULE_PATHNAME', 'backpressure_lsns' LANGUAGE C STRICT PARALLEL UNSAFE; CREATE FUNCTION backpressure_throttling_time() RETURNS bigint AS 'MODULE_PATHNAME', 'backpressure_throttling_time' LANGUAGE C STRICT PARALLEL UNSAFE; CREATE FUNCTION local_cache_pages() RETURNS SETOF RECORD AS 'MODULE_PATHNAME', 'local_cache_pages' LANGUAGE C PARALLEL SAFE; -- Create a view for convenient access. CREATE VIEW local_cache AS SELECT P.* FROM local_cache_pages() AS P (pageoffs int8, relfilenode oid, reltablespace oid, reldatabase oid, relforknumber int2, relblocknumber int8, accesscount int4); ================================================ FILE: pgxn/neon/neon--1.1--1.0.sql ================================================ -- the order of operations is important here -- because the view depends on the function DROP VIEW IF EXISTS neon_lfc_stats CASCADE; DROP FUNCTION IF EXISTS neon_get_lfc_stats CASCADE; ================================================ FILE: pgxn/neon/neon--1.1--1.2.sql ================================================ \echo Use "ALTER EXTENSION neon UPDATE TO '1.2'" to load this file. \quit -- Create a convenient view similar to pg_stat_database -- that exposes all lfc stat values in one row. 
CREATE OR REPLACE VIEW NEON_STAT_FILE_CACHE AS
WITH lfc_stats AS (
    -- neon_get_lfc_stats() returns one (stat_name, count) row per statistic.
    SELECT stat_name, count
    FROM neon_get_lfc_stats() AS t(stat_name text, count bigint)
),
lfc_values AS (
    -- Pivot the rows into a single row with one column per statistic.
    SELECT
        MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE NULL END) AS file_cache_misses,
        MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE NULL END) AS file_cache_hits,
        MAX(CASE WHEN stat_name = 'file_cache_used' THEN count ELSE NULL END) AS file_cache_used,
        MAX(CASE WHEN stat_name = 'file_cache_writes' THEN count ELSE NULL END) AS file_cache_writes,
        -- Calculate the file_cache_hit_ratio within the same CTE for simplicity.
        -- The ratio is hits / (hits + misses) as a percentage rounded to two
        -- decimals, or NULL when hits + misses = 0 (avoids division by zero).
        CASE
            WHEN MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) = 0 THEN NULL
            ELSE ROUND((MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END)::DECIMAL / (MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END))) * 100, 2)
        END AS file_cache_hit_ratio
    FROM lfc_stats
)
SELECT file_cache_misses, file_cache_hits, file_cache_used, file_cache_writes, file_cache_hit_ratio from lfc_values;

-- externalize the view to all users in role pg_monitor
GRANT SELECT ON NEON_STAT_FILE_CACHE TO PG_MONITOR;

================================================
FILE: pgxn/neon/neon--1.2--1.1.sql
================================================
-- Downgrade 1.2 -> 1.1: remove the view (and anything depending on it).
DROP VIEW IF EXISTS NEON_STAT_FILE_CACHE CASCADE;

================================================
FILE: pgxn/neon/neon--1.2--1.3.sql
================================================
\echo Use "ALTER EXTENSION neon UPDATE TO '1.3'" to load this file. \quit

-- Upgrade 1.2 -> 1.3: SQL wrapper for the C function
-- approximate_working_set_size, executable by pg_monitor.
CREATE FUNCTION approximate_working_set_size(reset bool)
RETURNS integer
AS 'MODULE_PATHNAME', 'approximate_working_set_size'
LANGUAGE C PARALLEL SAFE;

GRANT EXECUTE ON FUNCTION approximate_working_set_size(bool) TO pg_monitor;

================================================
FILE: pgxn/neon/neon--1.3--1.2.sql
================================================
-- Downgrade 1.3 -> 1.2: drop the function added by the 1.2 -> 1.3 script.
DROP FUNCTION IF EXISTS approximate_working_set_size(bool) CASCADE;

================================================
FILE: pgxn/neon/neon--1.3--1.4.sql
================================================
\echo Use "ALTER EXTENSION neon UPDATE TO '1.4'" to load this file. \quit

-- Upgrade 1.3 -> 1.4: SQL wrapper for the C function
-- approximate_working_set_size_seconds; 'duration' defaults to NULL.
CREATE FUNCTION approximate_working_set_size_seconds(duration integer default null)
RETURNS integer
AS 'MODULE_PATHNAME', 'approximate_working_set_size_seconds'
LANGUAGE C PARALLEL SAFE;

GRANT EXECUTE ON FUNCTION approximate_working_set_size_seconds(integer) TO pg_monitor;

================================================
FILE: pgxn/neon/neon--1.4--1.3.sql
================================================
-- Downgrade 1.4 -> 1.3: drop the function added by the 1.3 -> 1.4 script.
DROP FUNCTION IF EXISTS approximate_working_set_size_seconds(integer) CASCADE;

================================================
FILE: pgxn/neon/neon--1.4--1.5.sql
================================================
\echo Use "ALTER EXTENSION neon UPDATE TO '1.5'" to load this file. \quit

-- Upgrade 1.4 -> 1.5: two set-returning C functions and the views that give
-- their result columns names and types.
CREATE FUNCTION get_backend_perf_counters()
RETURNS SETOF RECORD
AS 'MODULE_PATHNAME', 'neon_get_backend_perf_counters'
LANGUAGE C PARALLEL SAFE;

CREATE FUNCTION get_perf_counters()
RETURNS SETOF RECORD
AS 'MODULE_PATHNAME', 'neon_get_perf_counters'
LANGUAGE C PARALLEL SAFE;

-- Show various metrics, for each backend. Note that the values are not reset
-- when a backend exits. When a new backend starts with the same backend ID, it
-- will continue accumulating the values from where the old backend left off.
-- If you are only interested in the changes from your own session, store the
-- values at the beginning of the session somewhere, and subtract them on
-- subsequent calls.
--
-- For histograms, 'bucket_le' is the upper bound of the histogram bucket.
CREATE VIEW neon_backend_perf_counters AS
  SELECT P.procno, P.pid, P.metric, P.bucket_le, P.value
  FROM get_backend_perf_counters() AS P (
    procno integer,
    pid integer,
    metric text,
    bucket_le float8,
    value float8
  );

-- Summary across all backends. (This could also be implemented with
-- an aggregate query over neon_backend_perf_counters view.)
CREATE VIEW neon_perf_counters AS
  SELECT P.metric, P.bucket_le, P.value
  FROM get_perf_counters() AS P (
    metric text,
    bucket_le float8,
    value float8
  );

================================================
FILE: pgxn/neon/neon--1.5--1.4.sql
================================================
-- Downgrade 1.5 -> 1.4: drop the views first, then the functions they call.
DROP VIEW IF EXISTS neon_perf_counters;
DROP VIEW IF EXISTS neon_backend_perf_counters;
DROP FUNCTION IF EXISTS get_perf_counters();
DROP FUNCTION IF EXISTS get_backend_perf_counters();

================================================
FILE: pgxn/neon/neon--1.5--1.6.sql
================================================
\echo Use "ALTER EXTENSION neon UPDATE TO '1.6'" to load this file.
\quit

-- Upgrade script 1.5 -> 1.6: declares three C-language functions.
CREATE FUNCTION get_prewarm_info(out total_pages integer, out prewarmed_pages integer, out skipped_pages integer, out active_workers integer)
RETURNS record
AS 'MODULE_PATHNAME', 'get_prewarm_info'
LANGUAGE C STRICT
PARALLEL SAFE;

CREATE FUNCTION get_local_cache_state(max_chunks integer default null)
RETURNS bytea
AS 'MODULE_PATHNAME', 'get_local_cache_state'
LANGUAGE C
PARALLEL UNSAFE;

CREATE FUNCTION prewarm_local_cache(state bytea, n_workers integer default 1)
RETURNS void
AS 'MODULE_PATHNAME', 'prewarm_local_cache'
LANGUAGE C STRICT
PARALLEL UNSAFE;

================================================
FILE: pgxn/neon/neon--1.6--1.5.sql
================================================
-- Downgrade 1.6 -> 1.5: drop the functions added by the 1.5 -> 1.6 script.
DROP FUNCTION IF EXISTS get_prewarm_info(out total_pages integer, out prewarmed_pages integer, out skipped_pages integer, out active_workers integer);

DROP FUNCTION IF EXISTS get_local_cache_state(max_chunks integer);

DROP FUNCTION IF EXISTS prewarm_local_cache(state bytea, n_workers integer);

================================================
FILE: pgxn/neon/neon.c
================================================
/*-------------------------------------------------------------------------
 *
 * neon.c
 *    Main entry point into the neon extension
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"
#include "fmgr.h"

#include "miscadmin.h"
#include "pgstat.h"
#include "access/subtrans.h"
#include "access/twophase.h"
#include "access/xlog.h"
#if PG_MAJORVERSION_NUM >= 15
#include "access/xlogrecovery.h"
#endif
#include "executor/instrument.h"
#include "replication/logical.h"
#include "replication/logicallauncher.h"
#include "replication/slot.h"
#include "replication/walsender.h"
#include "storage/proc.h"
#include "storage/ipc.h"
#include "funcapi.h"
#include "access/htup_details.h"
#include "utils/builtins.h"
#include "utils/pg_lsn.h"
#include "utils/guc.h"
#include "utils/guc_tables.h"

#include "communicator.h"
#include "communicator_process.h"
#include "extension_server.h"
#include "file_cache.h"
#include "neon.h"
#include "neon_ddl_handler.h"
#include "neon_lwlsncache.h"
#include "neon_perf_counters.h"
#include "logical_replication_monitor.h"
#include "unstable_extensions.h"
#include "walsender_hooks.h"
#if PG_MAJORVERSION_NUM >= 16
#include "storage/ipc.h"
#endif

PG_MODULE_MAGIC;
void _PG_init(void);

/* Backing variable of the "neon.lakebase_mode" GUC (defined in _PG_init) */
bool lakebase_mode = false;

/* Backing variable of the "neon.running_xacts_overflow_policy" GUC */
static int running_xacts_overflow_policy;
/* emit_log_hook that was installed before ours, if any */
static emit_log_hook_type prev_emit_log_hook;
/* Backing variable of the "neon.monitor_query_exec_time" GUC */
static bool monitor_query_exec_time = false;

/* Previously installed executor hooks, chained to from our own hooks */
static ExecutorStart_hook_type prev_ExecutorStart = NULL;
static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;

static void neon_ExecutorStart(QueryDesc *queryDesc, int eflags);
static void neon_ExecutorEnd(QueryDesc *queryDesc);

static shmem_startup_hook_type prev_shmem_startup_hook;
static void neon_shmem_startup_hook(void);
static void neon_shmem_request_hook(void);

#if PG_MAJORVERSION_NUM >= 15
static shmem_request_hook_type prev_shmem_request_hook = NULL;
#endif

/*
 * On v17 and later the wait events are variables, assigned in
 * neon_shmem_startup_hook() with WaitEventExtensionNew().
 */
#if PG_MAJORVERSION_NUM >= 17
uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE;
uint32 WAIT_EVENT_NEON_LFC_READ;
uint32 WAIT_EVENT_NEON_LFC_TRUNCATE;
uint32 WAIT_EVENT_NEON_LFC_WRITE;
uint32 WAIT_EVENT_NEON_LFC_CV_WAIT;
uint32 WAIT_EVENT_NEON_PS_STARTING;
uint32 WAIT_EVENT_NEON_PS_CONFIGURING;
uint32 WAIT_EVENT_NEON_PS_SEND;
uint32 WAIT_EVENT_NEON_PS_READ;
uint32 WAIT_EVENT_NEON_WAL_DL;
#endif

/* Backing variable of the "databricks.test_hook" GUC */
int databricks_test_hook = 0;

/*
 * What RestoreRunningXactsFromClog() does when it finds more in-progress XIDs
 * than it is willing to collect:
 *
 * OP_IGNORE: report success with the XIDs collected so far
 * OP_SKIP:   report success with an empty XID list
 * OP_WAIT:   report failure
 */
enum RunningXactsOverflowPolicies {
    OP_IGNORE,
    OP_SKIP,
    OP_WAIT
};

/* Accepted values of the "neon.running_xacts_overflow_policy" GUC */
static const struct config_enum_entry running_xacts_overflow_policies[] = {
    {"ignore", OP_IGNORE, false},
    {"skip", OP_SKIP, false},
    {"wait", OP_WAIT, false},
    {NULL, 0, false}
};

/* Accepted values of the "neon.debug_compare_local" GUC */
static const struct config_enum_entry debug_compare_local_modes[] = {
    {"none", DEBUG_COMPARE_LOCAL_NONE, false},
    {"prefetch", DEBUG_COMPARE_LOCAL_PREFETCH, false},
    {"lfc", DEBUG_COMPARE_LOCAL_LFC, false},
    {"all", DEBUG_COMPARE_LOCAL_ALL, false},
    {NULL, 0, false}
};

/*
 * XXX: These are private to procarray.c, but we
 * need them here.
 */
#define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts)
#define TOTAL_MAX_CACHED_SUBXIDS \
    ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)

/*
 * Restore running-xact information by scanning the CLOG at startup.
 *
 * In PostgreSQL, a standby always has to wait for a running-xacts WAL record
 * to arrive before it can start accepting queries. Furthermore, if there are
 * transactions with too many subxids (> 64) open to fit in the in-memory
 * subxids cache, the running-xacts record will be marked as "suboverflowed",
 * and the standby will need to also wait for the currently in-progress
 * transactions to finish.
 *
 * That's not great in PostgreSQL, because a hot standby does not necessarily
 * open up for queries immediately as you might expect. But it's worse in
 * Neon: A standby in Neon doesn't need to start WAL replay from a checkpoint
 * record; it can start at any LSN. Postgres arranges things so that there is
 * a running-xacts record soon after every checkpoint record, but when you
 * start from an arbitrary LSN, that doesn't help. If the primary is idle, or
 * not running at all, it might never write a new running-xacts record,
 * leaving the replica in a limbo where it can never start accepting queries.
 *
 * To mitigate that, we have an additional mechanism to find the running-xacts
 * information: we scan the CLOG, making note of any XIDs not marked as
 * committed or aborted. They are added to the Postgres known-assigned XIDs
 * array by calling ProcArrayApplyRecoveryInfo() in the caller of this
 * function.
 *
 * There is one big limitation with that mechanism: The size of the
 * known-assigned XIDs is limited, so if there are a lot of in-progress XIDs,
 * we have to give up.
Furthermore, we don't know how many of the in-progress
 * XIDs are subtransactions, and if we use up all the space in the
 * known-assigned XIDs array for subtransactions, we might run out of space in
 * the array later during WAL replay, causing the replica to shut down with
 * "ERROR: too many KnownAssignedXids". The safe # of XIDs that we can add to
 * the known-assigned array without risking that error later is very low,
 * merely PGPROC_MAX_CACHED_SUBXIDS == 64, so we take our chances and use up
 * to half of the known-assigned XIDs array for the subtransactions, even
 * though that risks getting the error later.
 *
 * Note: It's OK if the recovered list of XIDs includes some transactions that
 * have crashed in the primary, and hence will never commit. They will be seen
 * as in-progress, until we see a new running-xacts record with an
 * oldestActiveXid that invalidates them. That's how the known-assigned XIDs
 * array always works.
 *
 * If scraping the CLOG doesn't succeed for some reason, like the subxid
 * overflow, Postgres will fall back to waiting for a running-xacts record
 * like usual.
 *
 * Returns true if a complete list of in-progress XIDs was scraped. On
 * success, *xids is a palloc'd array and *nxids the number of entries in it;
 * on failure, *xids is NULL and *nxids is 0.
 *
 * Note that when more XIDs are found than we are willing to collect, the
 * result depends on neon.running_xacts_overflow_policy: with "ignore" we
 * still return true with the (truncated) list collected so far, with "skip"
 * we return true with an empty list, and with "wait" we return false.
 */
static bool
RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *nxids)
{
    TransactionId from;
    TransactionId till;
    int         max_xcnt;
    TransactionId *prepared_xids = NULL;
    int         n_prepared_xids;
    TransactionId *restored_xids = NULL;
    int         n_restored_xids;
    int         next_prepared_idx;

    Assert(*xids == NULL);

    /*
     * If the checkpoint doesn't have a valid oldestActiveXid, bail out. We
     * don't know where to start the scan.
     *
     * This shouldn't happen, because the pageserver always maintains a valid
     * oldestActiveXid nowadays. Except when starting at an old point in time
     * that was ingested before the pageserver was taught to do that.
     */
    if (!TransactionIdIsValid(checkpoint->oldestActiveXid))
    {
        elog(LOG, "cannot restore running-xacts from CLOG because oldestActiveXid is not set");
        goto fail;
    }

    /*
     * We will scan the CLOG starting from the oldest active XID.
     *
     * In some corner cases, the oldestActiveXid from the last checkpoint
     * might already have been truncated from the CLOG. That is,
     * oldestActiveXid might be older than oldestXid. That's possible because
     * oldestActiveXid is only updated at checkpoints. After the last
     * checkpoint, the oldest transaction might have committed, and the CLOG
     * might also have been already truncated. So if oldestActiveXid is older
     * than oldestXid, start at oldestXid instead. (Otherwise we'd try to
     * access CLOG segments that have already been truncated away.)
     */
    from = TransactionIdPrecedes(checkpoint->oldestXid, checkpoint->oldestActiveXid)
        ? checkpoint->oldestActiveXid : checkpoint->oldestXid;
    /* The scan range is [from, till): nextXid itself is not examined */
    till = XidFromFullTransactionId(checkpoint->nextXid);

    /*
     * To avoid "too many KnownAssignedXids" error later during replay, we
     * limit number of collected transactions. This is a tradeoff: if we are
     * willing to consume more of the KnownAssignedXids space for the XIDs
     * now, that allows us to start up, but we might run out of space later.
     *
     * The size of the KnownAssignedXids array is TOTAL_MAX_CACHED_SUBXIDS,
     * which is (PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS). In
     * PostgreSQL, that's always enough because the primary will always write
     * an XLOG_XACT_ASSIGNMENT record if a transaction has more than
     * PGPROC_MAX_CACHED_SUBXIDS subtransactions. Seeing that record allows
     * the standby to mark the XIDs in pg_subtrans and removing them from the
     * KnownAssignedXids array.
     *
     * Here, we don't know which XIDs belong to subtransactions that have
     * already been WAL-logged with an XLOG_XACT_ASSIGNMENT record. If we
     * wanted to be totally safe and avoid the possibility of getting a "too
     * many KnownAssignedXids" error later, we would have to limit ourselves
     * to PGPROC_MAX_CACHED_SUBXIDS, which is not much. And that includes top
     * transaction IDs too, because we cannot distinguish between top
     * transaction IDs and subtransactions here.
     *
     * Somewhat arbitrarily, we use up to half of KnownAssignedXids. That
     * strikes a sensible balance between being useful, and risking a "too
     * many KnownAssignedXids" error later.
     */
    max_xcnt = TOTAL_MAX_CACHED_SUBXIDS / 2;

    /*
     * Collect XIDs of prepared transactions in an array. This includes only
     * their top-level XIDs. We assume that StandbyRecoverPreparedTransactions
     * has already been called, so we can find all the sub-transactions in
     * pg_subtrans.
     */
    PrescanPreparedTransactions(&prepared_xids, &n_prepared_xids);
    /* Sorted so that the scan below can merge them in and bsearch() them */
    qsort(prepared_xids, n_prepared_xids, sizeof(TransactionId), xidLogicalComparator);

    /*
     * Scan the CLOG, collecting in-progress XIDs into 'restored_xids'.
     */
    elog(DEBUG1, "scanning CLOG between %u and %u for in-progress XIDs", from, till);
    restored_xids = (TransactionId *) palloc(max_xcnt * sizeof(TransactionId));
    n_restored_xids = 0;
    next_prepared_idx = 0;

    /* 'xid' is advanced at the 'skip' label at the bottom of the loop body */
    for (TransactionId xid = from; xid != till;)
    {
        XLogRecPtr  xidlsn;
        XidStatus   xidstatus;

        xidstatus = TransactionIdGetStatus(xid, &xidlsn);

        /*
         * "Merge" the prepared transactions into the restored_xids array as
         * we go.  The prepared transactions array is sorted. This is mostly
         * a sanity check to ensure that all the prepared transactions are
         * seen as in-progress. (There is a check after the loop that we didn't
         * miss any.)
         */
        if (next_prepared_idx < n_prepared_xids && xid == prepared_xids[next_prepared_idx])
        {
            /*
             * This is a top-level transaction ID of a prepared transaction.
             * Include it in the array.
             */

            /* sanity check */
            if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS)
            {
                elog(LOG, "prepared transaction %u has unexpected status %X, cannot restore running-xacts from CLOG",
                     xid, xidstatus);
                Assert(false);
                goto fail;
            }

            elog(DEBUG1, "XID %u: was next prepared xact (%d / %d)", xid, next_prepared_idx, n_prepared_xids);
            next_prepared_idx++;
        }
        else if (xidstatus == TRANSACTION_STATUS_COMMITTED)
        {
            elog(DEBUG1, "XID %u: was committed", xid);
            goto skip;
        }
        else if (xidstatus == TRANSACTION_STATUS_ABORTED)
        {
            elog(DEBUG1, "XID %u: was aborted", xid);
            goto skip;
        }
        else if (xidstatus == TRANSACTION_STATUS_IN_PROGRESS)
        {
            /*
             * In-progress transactions are included in the array.
             *
             * Except subtransactions of the prepared transactions. They are
             * already set in pg_subtrans, and hence don't need to be tracked
             * in the known-assigned XIDs array.
             */
            if (n_prepared_xids > 0)
            {
                TransactionId parent = SubTransGetParent(xid);

                if (TransactionIdIsValid(parent))
                {
                    /*
                     * This is a subtransaction belonging to a prepared
                     * transaction.
                     *
                     * Sanity check that it is in the prepared XIDs array. It
                     * should be, because StandbyRecoverPreparedTransactions
                     * populated pg_subtrans, and no other XID should be set
                     * in it yet. (This also relies on the fact that
                     * StandbyRecoverPreparedTransactions sets the parent of
                     * each subxid to point directly to the top-level XID,
                     * rather than restoring the original subtransaction
                     * hierarchy.)
                     *
                     * Only the first 'next_prepared_idx' entries are
                     * searched, i.e. the prepared XIDs that the scan has
                     * already visited.
                     */
                    if (bsearch(&parent, prepared_xids, next_prepared_idx,
                                sizeof(TransactionId), xidLogicalComparator) == NULL)
                    {
                        elog(LOG, "sub-XID %u has unexpected parent %u, cannot restore running-xacts from CLOG",
                             xid, parent);
                        Assert(false);
                        goto fail;
                    }
                    elog(DEBUG1, "XID %u: was a subtransaction of prepared xid %u", xid, parent);
                    goto skip;
                }
            }

            /* include it in the array */
            elog(DEBUG1, "XID %u: is in progress", xid);
        }
        else
        {
            /*
             * SUB_COMMITTED is a transient state used at commit. We don't
             * expect to see that here.
             */
            elog(LOG, "XID %u has unexpected status %X in pg_xact, cannot restore running-xacts from CLOG",
                 xid, xidstatus);
            Assert(false);
            goto fail;
        }

        if (n_restored_xids >= max_xcnt)
        {
            /*
             * Overflowed. We won't be able to install the RunningTransactions
             * snapshot.
             */
            elog(LOG, "too many running xacts to restore from the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u",
                 checkpoint->oldestXid, checkpoint->oldestActiveXid,
                 XidFromFullTransactionId(checkpoint->nextXid));

            /*
             * Every policy value leaves the loop here, so the store into
             * restored_xids[] below never runs with a full array.
             */
            switch (running_xacts_overflow_policy)
            {
                case OP_WAIT:
                    goto fail;
                case OP_IGNORE:
                    goto success;
                case OP_SKIP:
                    n_restored_xids = 0;
                    goto success;
            }
        }

        restored_xids[n_restored_xids++] = xid;

    skip:
        TransactionIdAdvance(xid);
    }

    /* sanity check */
    if (next_prepared_idx != n_prepared_xids)
    {
        elog(LOG, "prepared transaction ID %u was not visited in the CLOG scan, cannot restore running-xacts from CLOG",
             prepared_xids[next_prepared_idx]);
        Assert(false);
        goto fail;
    }
 success:
    elog(LOG, "restored %d running xacts by scanning the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u",
         n_restored_xids, checkpoint->oldestXid, checkpoint->oldestActiveXid, XidFromFullTransactionId(checkpoint->nextXid));
    /* Ownership of restored_xids passes to the caller */
    *nxids = n_restored_xids;
    *xids = restored_xids;
    if (prepared_xids)
        pfree(prepared_xids);
    return true;

 fail:
    *nxids = 0;
    *xids = NULL;
    if (restored_xids)
        pfree(restored_xids);
    if (prepared_xids)
        pfree(prepared_xids);
    return false;
}


/*
 * pgbouncer is able to track GUCs reported by Postgres.
 * But most parameters cannot be tracked this way. The only parameters that can be tracked are ones
 * that Postgres reports to the client. Unfortunately `search_path` is not reported by Postgres:
 * https://www.postgresql.org/message-id/flat/CAGECzQQ6xFcgrg%2Be0p9mCumtK362TiA6vTiiZKoYbS8OXggwuQ%40mail.gmail.com#be4bfd7a9cf1f0633bdb2d1790a0a1be
 * This code sets GUC_REPORT flag for `search_path` making it possible to include it in
 * pgbouncer's `track_extra_parameters` list.
* * This code is inspired by how the Citus extension does this, see * https://github.com/citusdata/citus/blob/2a263fe69a707d16ef24378f7650742386b0968f/src/backend/distributed/shared_library_init.c#L2694 */ static void ReportSearchPath(void) { #if PG_VERSION_NUM >= 160000 int nGucs = 0; struct config_generic **gucs = get_guc_variables(&nGucs); #else struct config_generic **gucs = get_guc_variables(); int nGucs = GetNumConfigOptions(); #endif for (int i = 0; i < nGucs; i++) { struct config_generic *guc = (struct config_generic *) gucs[i]; if (strcmp(guc->name, "search_path") == 0) { guc->flags |= GUC_REPORT; } } } #if PG_VERSION_NUM < 150000 /* * PG14 uses separate backend for stats collector having no access to shared memory. * As far as AUX mechanism requires access to shared memory, persisting pgstat.stat file * is not supported in PG14. And so there is no definition of neon_pgstat_file_size_limit * variable, so we have to declare it here. */ static int neon_pgstat_file_size_limit; #endif static void DatabricksSqlErrorHookImpl(ErrorData *edata) { if (prev_emit_log_hook != NULL) { prev_emit_log_hook(edata); } if (edata->sqlerrcode == ERRCODE_DATA_CORRUPTED) { pg_atomic_fetch_add_u32(&databricks_metrics_shared->data_corruption_count, 1); } else if (edata->sqlerrcode == ERRCODE_INDEX_CORRUPTED) { pg_atomic_fetch_add_u32(&databricks_metrics_shared->index_corruption_count, 1); } else if (edata->sqlerrcode == ERRCODE_INTERNAL_ERROR) { pg_atomic_fetch_add_u32(&databricks_metrics_shared->internal_error_count, 1); } } void _PG_init(void) { /* * Also load 'neon_rmgr'. This makes it unnecessary to list both 'neon' * and 'neon_rmgr' in shared_preload_libraries. */ #if PG_VERSION_NUM >= 160000 load_file("$libdir/neon_rmgr", false); #endif if (lakebase_mode) { prev_emit_log_hook = emit_log_hook; emit_log_hook = DatabricksSqlErrorHookImpl; } /* * Initializing a pre-loaded Postgres extension happens in three stages: * * 1. _PG_init() is called early at postmaster startup. 
In this stage, no * shared memory has been allocated yet. Core Postgres GUCs have been * initialized from the config files, but notably, MaxBackends has not * calculated yet. In this stage, we must register any extension GUCs * and can do other early initialization that doesn't depend on shared * memory. In this stage we must also register "shmem request" and * "shmem starutup" hooks, to be called in stages 2 and 3. * * 2. After MaxBackends have been calculated, the "shmem request" hooks * are called. The hooks can reserve shared memory by calling * RequestAddinShmemSpace and RequestNamedLWLockTranche(). The "shmem * request hooks" are a new mechanism in Postgres v15. In v14 and * below, you had to make those Requests in stage 1 already, which * means they could not depend on MaxBackends. (See hack in * NeonPerfCountersShmemRequest()) * * 3. After some more runtime-computed GUCs that affect the amount of * shared memory needed have been calculated, the "shmem startup" hooks * are called. In this stage, we allocate any shared memory, LWLocks * and other shared resources. * * Here, in the 'neon' extension, we register just one shmem request hook * and one startup hook, which call into functions in all the subsystems * that are part of the extension. On v14, the ShmemRequest functions are * called in stage 1, and on v15 onwards they are called in stage 2. 
*/ /* Stage 1: Define GUCs, and other early intialization */ pg_init_libpagestore(); relsize_hash_init(); lfc_init(); pg_init_walproposer(); init_lwlsncache(); pg_init_communicator_process(); pg_init_communicator(); Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitUnstableExtensionsSupport(); InitLogicalReplicationMonitor(); InitDDLHandler(); pg_init_extension_server(); restore_running_xacts_callback = RestoreRunningXactsFromClog; DefineCustomBoolVariable( "neon.disable_logical_replication_subscribers", "Disable incoming logical replication", NULL, &disable_logical_replication_subscribers, false, PGC_SIGHUP, 0, NULL, NULL, NULL); DefineCustomBoolVariable( "neon.disable_wal_prevlink_checks", "Disable validation of prev link in WAL records", NULL, &disable_wal_prev_lsn_checks, false, PGC_SIGHUP, 0, NULL, NULL, NULL); DefineCustomBoolVariable( "neon.monitor_query_exec_time", "Collect infortmation about query execution time", NULL, &monitor_query_exec_time, false, PGC_USERSET, 0, NULL, NULL, NULL); DefineCustomBoolVariable( "neon.allow_replica_misconfig", "Allow replica startup when some critical GUCs have smaller value than on primary node", NULL, &allowReplicaMisconfig, true, PGC_POSTMASTER, 0, NULL, NULL, NULL); DefineCustomEnumVariable( "neon.running_xacts_overflow_policy", "Action performed on snapshot overflow when restoring runnings xacts from CLOG", NULL, &running_xacts_overflow_policy, OP_IGNORE, running_xacts_overflow_policies, PGC_POSTMASTER, 0, NULL, NULL, NULL); DefineCustomIntVariable("neon.pgstat_file_size_limit", "Maximal size of pgstat.stat file saved in Neon storage", "Zero value disables persisting pgstat.stat file", &neon_pgstat_file_size_limit, 0, 0, 1000000, /* disabled by default */ PGC_SIGHUP, GUC_UNIT_KB, NULL, NULL, NULL); DefineCustomEnumVariable( "neon.debug_compare_local", "Debug mode for comparing content of pages in prefetch ring/LFC/PS and local disk", NULL, &debug_compare_local, DEBUG_COMPARE_LOCAL_NONE, 
debug_compare_local_modes, PGC_POSTMASTER, 0, NULL, NULL, NULL); DefineCustomStringVariable( "neon.privileged_role_name", "Name of the 'weak' superuser role, which we give to the users", NULL, &privileged_role_name, "neon_superuser", PGC_POSTMASTER, 0, NULL, NULL, NULL); DefineCustomBoolVariable( "neon.lakebase_mode", "Is neon running in Lakebase?", NULL, &lakebase_mode, false, PGC_POSTMASTER, 0, NULL, NULL, NULL); // A test hook used in sql regress to trigger specific behaviors // to test features easily. DefineCustomIntVariable( "databricks.test_hook", "The test hook used in sql regress tests only", NULL, &databricks_test_hook, 0, 0, INT32_MAX, PGC_SUSET, 0, NULL, NULL, NULL); /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the * extension was loaded will be removed. */ EmitWarningsOnPlaceholders("neon"); ReportSearchPath(); /* * Register initialization hooks for stage 2. (On v14, there's no "shmem * request" hooks, so call the ShmemRequest functions immediately.) 
*/ #if PG_VERSION_NUM >= 150000 prev_shmem_request_hook = shmem_request_hook; shmem_request_hook = neon_shmem_request_hook; #else neon_shmem_request_hook(); #endif /* Register hooks for stage 3 */ prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = neon_shmem_startup_hook; /* Other misc initialization */ prev_ExecutorStart = ExecutorStart_hook; ExecutorStart_hook = neon_ExecutorStart; prev_ExecutorEnd = ExecutorEnd_hook; ExecutorEnd_hook = neon_ExecutorEnd; } /* Various functions exposed at SQL level */ PG_FUNCTION_INFO_V1(pg_cluster_size); PG_FUNCTION_INFO_V1(backpressure_lsns); PG_FUNCTION_INFO_V1(backpressure_throttling_time); PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds); PG_FUNCTION_INFO_V1(approximate_working_set_size); PG_FUNCTION_INFO_V1(neon_get_lfc_stats); PG_FUNCTION_INFO_V1(local_cache_pages); Datum pg_cluster_size(PG_FUNCTION_ARGS) { int64 size; size = GetNeonCurrentClusterSize(); if (size == 0) PG_RETURN_NULL(); PG_RETURN_INT64(size); } Datum backpressure_lsns(PG_FUNCTION_ARGS) { XLogRecPtr writePtr; XLogRecPtr flushPtr; XLogRecPtr applyPtr; Datum values[3]; bool nulls[3]; TupleDesc tupdesc; replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); tupdesc = CreateTemplateTupleDesc(3); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "received_lsn", PG_LSNOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "disk_consistent_lsn", PG_LSNOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "remote_consistent_lsn", PG_LSNOID, -1, 0); tupdesc = BlessTupleDesc(tupdesc); MemSet(nulls, 0, sizeof(nulls)); values[0] = LSNGetDatum(writePtr); values[1] = LSNGetDatum(flushPtr); values[2] = LSNGetDatum(applyPtr); PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); } Datum backpressure_throttling_time(PG_FUNCTION_ARGS) { PG_RETURN_UINT64(BackpressureThrottlingTime()); } Datum approximate_working_set_size_seconds(PG_FUNCTION_ARGS) { time_t duration; int32 dc; duration = PG_ARGISNULL(0) ? 
(time_t) -1 : PG_GETARG_INT32(0); dc = lfc_approximate_working_set_size_seconds(duration, false); if (dc < 0) PG_RETURN_NULL(); else PG_RETURN_INT32(dc); } Datum approximate_working_set_size(PG_FUNCTION_ARGS) { bool reset = PG_GETARG_BOOL(0); int32 dc; dc = lfc_approximate_working_set_size_seconds(-1, reset); if (dc < 0) PG_RETURN_NULL(); else PG_RETURN_INT32(dc); } Datum neon_get_lfc_stats(PG_FUNCTION_ARGS) { #define NUM_NEON_GET_STATS_COLS 2 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; LfcStatsEntry *entries; size_t num_entries; InitMaterializedSRF(fcinfo, 0); /* lfc_get_stats() does all the heavy lifting */ entries = lfc_get_stats(&num_entries); /* Convert the LfcStatsEntrys to a result set */ for (size_t i = 0; i < num_entries; i++) { LfcStatsEntry *entry = &entries[i]; Datum values[NUM_NEON_GET_STATS_COLS]; bool nulls[NUM_NEON_GET_STATS_COLS]; values[0] = CStringGetTextDatum(entry->metric_name); nulls[0] = false; values[1] = Int64GetDatum(entry->isnull ? 0 : entry->value); nulls[1] = entry->isnull; tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); } PG_RETURN_VOID(); #undef NUM_NEON_GET_STATS_COLS } Datum local_cache_pages(PG_FUNCTION_ARGS) { #define NUM_LOCALCACHE_PAGES_COLS 7 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; LocalCachePagesRec *entries; size_t num_entries; InitMaterializedSRF(fcinfo, 0); /* lfc_local_cache_pages() does all the heavy lifting */ entries = lfc_local_cache_pages(&num_entries); /* Convert the LocalCachePagesRec structs to a result set */ for (size_t i = 0; i < num_entries; i++) { LocalCachePagesRec *entry = &entries[i]; Datum values[NUM_LOCALCACHE_PAGES_COLS]; bool nulls[NUM_LOCALCACHE_PAGES_COLS] = { false, false, false, false, false, false, false }; values[0] = Int64GetDatum((int64) entry->pageoffs); values[1] = ObjectIdGetDatum(entry->relfilenode); values[2] = ObjectIdGetDatum(entry->reltablespace); values[3] = ObjectIdGetDatum(entry->reldatabase); values[4] = 
ObjectIdGetDatum(entry->forknum); values[5] = Int64GetDatum((int64) entry->blocknum); values[6] = Int32GetDatum(entry->accesscount); /* Build and return the tuple. */ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); } PG_RETURN_VOID(); #undef NUM_LOCALCACHE_PAGES_COLS } /* * Initialization stage 2: make requests for the amount of shared memory we * will need. * * For a high-level explanation of the initialization process, see _PG_init(). */ static void neon_shmem_request_hook(void) { #if PG_VERSION_NUM >= 150000 if (prev_shmem_request_hook) prev_shmem_request_hook(); #endif LfcShmemRequest(); NeonPerfCountersShmemRequest(); PagestoreShmemRequest(); RelsizeCacheShmemRequest(); WalproposerShmemRequest(); LwLsnCacheShmemRequest(); } /* * Initialization stage 3: Initialize shared memory. * * For a high-level explanation of the initialization process, see _PG_init(). */ static void neon_shmem_startup_hook(void) { if (prev_shmem_startup_hook) prev_shmem_startup_hook(); LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); LfcShmemInit(); NeonPerfCountersShmemInit(); if (lakebase_mode) { DatabricksMetricsShmemInit(); } PagestoreShmemInit(); RelsizeCacheShmemInit(); WalproposerShmemInit(); LwLsnCacheShmemInit(); #if PG_MAJORVERSION_NUM >= 17 WAIT_EVENT_NEON_LFC_MAINTENANCE = WaitEventExtensionNew("Neon/FileCache_Maintenance"); WAIT_EVENT_NEON_LFC_READ = WaitEventExtensionNew("Neon/FileCache_Read"); WAIT_EVENT_NEON_LFC_TRUNCATE = WaitEventExtensionNew("Neon/FileCache_Truncate"); WAIT_EVENT_NEON_LFC_WRITE = WaitEventExtensionNew("Neon/FileCache_Write"); WAIT_EVENT_NEON_LFC_CV_WAIT = WaitEventExtensionNew("Neon/FileCache_CvWait"); WAIT_EVENT_NEON_PS_STARTING = WaitEventExtensionNew("Neon/PS_Starting"); WAIT_EVENT_NEON_PS_CONFIGURING = WaitEventExtensionNew("Neon/PS_Configuring"); WAIT_EVENT_NEON_PS_SEND = WaitEventExtensionNew("Neon/PS_SendIO"); WAIT_EVENT_NEON_PS_READ = WaitEventExtensionNew("Neon/PS_ReadIO"); WAIT_EVENT_NEON_WAL_DL = 
WaitEventExtensionNew("Neon/WAL_Download"); #endif LWLockRelease(AddinShmemInitLock); } /* * ExecutorStart hook: start up tracking if needed */ static void neon_ExecutorStart(QueryDesc *queryDesc, int eflags) { if (prev_ExecutorStart) prev_ExecutorStart(queryDesc, eflags); else standard_ExecutorStart(queryDesc, eflags); if (monitor_query_exec_time) { /* * Set up to track total elapsed time in ExecutorRun. Make sure the * space is allocated in the per-query context so it will go away at * ExecutorEnd. */ if (queryDesc->totaltime == NULL) { MemoryContext oldcxt; oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_TIMER, false); MemoryContextSwitchTo(oldcxt); } } } /* * ExecutorEnd hook: store results if needed */ static void neon_ExecutorEnd(QueryDesc *queryDesc) { if (monitor_query_exec_time && queryDesc->totaltime) { /* * Make sure stats accumulation is done. (Note: it's okay if several * levels of hook all do this.) */ InstrEndLoop(queryDesc->totaltime); inc_query_time(queryDesc->totaltime->total*1000000); /* convert to usec */ } if (prev_ExecutorEnd) prev_ExecutorEnd(queryDesc); else standard_ExecutorEnd(queryDesc); } ================================================ FILE: pgxn/neon/neon.control ================================================ # neon extension comment = 'cloud storage for PostgreSQL' default_version = '1.6' module_pathname = '$libdir/neon' relocatable = true trusted = true ================================================ FILE: pgxn/neon/neon.h ================================================ /*------------------------------------------------------------------------- * * neon.h * Functions used in the initialization of this extension. 
*
 *-------------------------------------------------------------------------
 */

#ifndef NEON_H
#define NEON_H

#include "access/xlogdefs.h"
#include "utils/wait_event.h"

/* GUCs */
extern char *neon_auth_token;
extern char *neon_timeline;
extern char *neon_tenant;

extern char *wal_acceptors_list;
extern int  wal_acceptor_reconnect_timeout;
extern int  wal_acceptor_connection_timeout;
extern int  readahead_getpage_pull_timeout_ms;
extern bool disable_wal_prev_lsn_checks;
extern bool lakebase_mode;

extern bool AmPrewarmWorker;

/*
 * Wait events. On v17 and later these are variables; on older versions each
 * name is a macro for one of the built-in wait event constants.
 */
#if PG_MAJORVERSION_NUM >= 17
extern uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE;
extern uint32 WAIT_EVENT_NEON_LFC_READ;
extern uint32 WAIT_EVENT_NEON_LFC_TRUNCATE;
extern uint32 WAIT_EVENT_NEON_LFC_WRITE;
extern uint32 WAIT_EVENT_NEON_LFC_CV_WAIT;
extern uint32 WAIT_EVENT_NEON_PS_STARTING;
extern uint32 WAIT_EVENT_NEON_PS_CONFIGURING;
extern uint32 WAIT_EVENT_NEON_PS_SEND;
extern uint32 WAIT_EVENT_NEON_PS_READ;
extern uint32 WAIT_EVENT_NEON_WAL_DL;
#else
#define WAIT_EVENT_NEON_LFC_MAINTENANCE PG_WAIT_EXTENSION
#define WAIT_EVENT_NEON_LFC_READ WAIT_EVENT_BUFFILE_READ
#define WAIT_EVENT_NEON_LFC_TRUNCATE WAIT_EVENT_BUFFILE_TRUNCATE
#define WAIT_EVENT_NEON_LFC_WRITE WAIT_EVENT_BUFFILE_WRITE
#define WAIT_EVENT_NEON_LFC_CV_WAIT WAIT_EVENT_BUFFILE_READ
#define WAIT_EVENT_NEON_PS_STARTING PG_WAIT_EXTENSION
#define WAIT_EVENT_NEON_PS_CONFIGURING PG_WAIT_EXTENSION
#define WAIT_EVENT_NEON_PS_SEND PG_WAIT_EXTENSION
#define WAIT_EVENT_NEON_PS_READ PG_WAIT_EXTENSION
#define WAIT_EVENT_NEON_WAL_DL WAIT_EVENT_WAL_READ
#endif

/* Prefix that neon_log() and neon_shard_log() put in front of every message */
#define NEON_TAG "[NEON_SMGR] "
/*
 * ereport() wrapper: 'tag' is the elevel, the message is prefixed with
 * NEON_TAG, and statement, context and error position are suppressed.
 */
#define neon_log(tag, fmt, ...) ereport(tag, \
    (errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
    errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
/* Like neon_log(), but also puts "[shard N]" in front of the message */
#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag, \
    (errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
    errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))


/* Early (stage 1) initialization entry points, called from _PG_init() */
extern void pg_init_libpagestore(void);
extern void pg_init_walproposer(void);

extern uint64 BackpressureThrottlingTime(void);
extern void SetNeonCurrentClusterSize(uint64 size);
extern uint64 GetNeonCurrentClusterSize(void);
extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);

extern PGDLLEXPORT void WalProposerSync(int argc, char *argv[]);
extern PGDLLEXPORT void WalProposerMain(Datum main_arg);
extern PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);

/* Called from neon.c's shmem request hook (directly from _PG_init() on v14) */
extern void LfcShmemRequest(void);
extern void PagestoreShmemRequest(void);
extern void RelsizeCacheShmemRequest(void);
extern void WalproposerShmemRequest(void);
extern void LwLsnCacheShmemRequest(void);
extern void NeonPerfCountersShmemRequest(void);

/* Called from neon.c's shmem startup hook */
extern void LfcShmemInit(void);
extern void PagestoreShmemInit(void);
extern void RelsizeCacheShmemInit(void);
extern void WalproposerShmemInit(void);
extern void LwLsnCacheShmemInit(void);
extern void NeonPerfCountersShmemInit(void);


#endif                          /* NEON_H */

================================================
FILE: pgxn/neon/neon_ddl_handler.c
================================================
/*-------------------------------------------------------------------------
 *
 * neon_ddl_handler.c
 *    Captures updates to roles/databases using ProcessUtility_hook and
 *    sends them to the control plane. The changes are sent
 *    via HTTP to the URL specified by the GUC neon.console_url when the
 *    transaction commits. Forwarding may be disabled temporarily by
 *    setting neon.forward_ddl to false.
 *
 *    Currently, the transaction may abort AFTER
 *    changes have already been forwarded, and that case is not handled.
 *    Subtransactions are handled using a stack of hash tables, which
 *    accumulate changes.
On subtransaction commit, the top of the stack
 *	  is merged with the table below it.
 *
 *    Support event triggers for {privileged_role_name}
 *
 * IDENTIFICATION
 *	 pgxn/neon/neon_ddl_handler.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

/*
 * NOTE(review): the header names of the next two includes are missing in
 * this copy of the file (an angle-bracket include appears to have been
 * lost); restore them from the upstream source before building.
 */
#include
#include

#include "access/xact.h"
#include "catalog/pg_authid.h"
#include "catalog/pg_proc.h"
#include "commands/defrem.h"
#include "commands/event_trigger.h"
#include "commands/user.h"
#include "fmgr.h"
#include "libpq/crypt.h"
#include "miscadmin.h"
#include "nodes/makefuncs.h"
#include "parser/parse_func.h"
#include "tcop/pquery.h"
#include "tcop/utility.h"
#include "utils/acl.h"
#include "utils/guc.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "utils/jsonb.h"
/* NOTE(review): header names missing here as well -- see the note above. */
#include
#include

#include "neon_ddl_handler.h"
#include "neon_utils.h"
#include "neon.h"

/* Hooks that were installed before ours; chained to from our own hooks */
static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;
static fmgr_hook_type next_fmgr_hook = NULL;
static needs_fmgr_hook_type next_needs_fmgr_hook = NULL;
/* Backing variable of the neon.event_triggers GUC */
static bool neon_event_triggers = true;

/* Bearer token read from the NEON_CONTROL_PLANE_TOKEN environment variable */
static const char *jwt_token = NULL;

/* GUCs */
static char *ConsoleURL = NULL;		/* neon.console_url */
static bool ForwardDDL = true;		/* neon.forward_ddl */
static bool RegressTestMode = false;	/* neon.regress_test_mode */

/*
 * CURL docs say that this buffer must exist until we call curl_easy_cleanup
 * (which we never do), so we make this a static
 */
static char CurlErrorBuf[CURL_ERROR_SIZE];

/* Kind of change recorded for a database or role */
typedef enum
{
	Op_Set,						/* An upsert: Either a creation or an alter */
	Op_Delete,
} OpType;

/*
 * One pending database change.  Entries live in a hash table keyed by name.
 */
typedef struct
{
	char		name[NAMEDATALEN];	/* hash key: current database name */
	Oid			owner;			/* new owner, or InvalidOid when none is
								 * recorded */
	char		old_name[NAMEDATALEN];	/* previous name after a rename; empty
										 * string when there was no rename */
	OpType		type;
} DbEntry;

/*
 * One pending role change.  Entries live in a hash table keyed by name.
 */
typedef struct
{
	char		name[NAMEDATALEN];	/* hash key: current role name */
	char		old_name[NAMEDATALEN];	/* previous name after a rename; empty
										 * string when there was no rename */
	const char *password;		/* password given in the statement, or NULL */
	OpType		type;
} RoleEntry;

/*
 * We keep one of these for each subtransaction in a stack. When a subtransaction
 * commits, we merge the top of the stack into the table below it. It is allocated in the
 * subtransaction's context.
*/ typedef struct DdlHashTable { struct DdlHashTable *prev_table; size_t subtrans_level; HTAB *db_table; HTAB *role_table; } DdlHashTable; static DdlHashTable RootTable; static DdlHashTable *CurrentDdlTable = &RootTable; static int SubtransLevel; /* current nesting level of subtransactions */ static void PushKeyValue(JsonbParseState **state, char *key, char *value) { JsonbValue k, v; k.type = jbvString; k.val.string.len = strlen(key); k.val.string.val = key; v.type = jbvString; v.val.string.len = strlen(value); v.val.string.val = value; pushJsonbValue(state, WJB_KEY, &k); pushJsonbValue(state, WJB_VALUE, &v); } static char * ConstructDeltaMessage() { JsonbParseState *state = NULL; pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); if (RootTable.db_table) { JsonbValue dbs; HASH_SEQ_STATUS status; DbEntry *entry; dbs.type = jbvString; dbs.val.string.val = "dbs"; dbs.val.string.len = strlen(dbs.val.string.val); pushJsonbValue(&state, WJB_KEY, &dbs); pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL); hash_seq_init(&status, RootTable.db_table); while ((entry = hash_seq_search(&status)) != NULL) { pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); PushKeyValue(&state, "op", entry->type == Op_Set ? 
"set" : "del"); PushKeyValue(&state, "name", entry->name); if (entry->owner != InvalidOid) { PushKeyValue(&state, "owner", GetUserNameFromId(entry->owner, false)); } if (entry->old_name[0] != '\0') { PushKeyValue(&state, "old_name", entry->old_name); } pushJsonbValue(&state, WJB_END_OBJECT, NULL); } pushJsonbValue(&state, WJB_END_ARRAY, NULL); } if (RootTable.role_table) { JsonbValue roles; HASH_SEQ_STATUS status; RoleEntry *entry; roles.type = jbvString; roles.val.string.val = "roles"; roles.val.string.len = strlen(roles.val.string.val); pushJsonbValue(&state, WJB_KEY, &roles); pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL); hash_seq_init(&status, RootTable.role_table); while ((entry = hash_seq_search(&status)) != NULL) { pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); PushKeyValue(&state, "op", entry->type == Op_Set ? "set" : "del"); PushKeyValue(&state, "name", entry->name); if (entry->password) { #if PG_MAJORVERSION_NUM == 14 char *logdetail; #else const char *logdetail; #endif char *encrypted_password; PushKeyValue(&state, "password", (char *) entry->password); encrypted_password = get_role_password(entry->name, &logdetail); if (encrypted_password) { PushKeyValue(&state, "encrypted_password", encrypted_password); } else { elog(ERROR, "Failed to get encrypted password: %s", logdetail); } } if (entry->old_name[0] != '\0') { PushKeyValue(&state, "old_name", entry->old_name); } pushJsonbValue(&state, WJB_END_OBJECT, NULL); } pushJsonbValue(&state, WJB_END_ARRAY, NULL); } { JsonbValue *result = pushJsonbValue(&state, WJB_END_OBJECT, NULL); Jsonb *jsonb = JsonbValueToJsonb(result); return JsonbToCString(NULL, &jsonb->root, 0 /* estimated_len */ ); } } #define ERROR_SIZE 1024 typedef struct { char str[ERROR_SIZE]; size_t size; } ErrorString; static size_t ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata) { /* Docs say size is always 1 */ ErrorString *str = userdata; size_t to_write = nmemb; /* +1 for null terminator */ if (str->size + nmemb + 1 
>= ERROR_SIZE) to_write = ERROR_SIZE - str->size - 1; /* Ignore everyrthing past the first ERROR_SIZE bytes */ if (to_write == 0) return nmemb; memcpy(str->str + str->size, ptr, to_write); str->size += to_write; str->str[str->size] = '\0'; return nmemb; } static void SendDeltasToControlPlane() { static CURL *handle = NULL; if (!RootTable.db_table && !RootTable.role_table) return; if (!ConsoleURL) { elog(LOG, "ConsoleURL not set, skipping forwarding"); return; } if (!ForwardDDL) return; if (handle == NULL) { struct curl_slist *headers = NULL; headers = curl_slist_append(headers, "Content-Type: application/json"); if (headers == NULL) { elog(ERROR, "Failed to set Content-Type header"); } if (jwt_token) { char auth_header[8192]; snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token); headers = curl_slist_append(headers, auth_header); if (headers == NULL) { elog(ERROR, "Failed to set Authorization header"); } } handle = alloc_curl_handle(); curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "PATCH"); curl_easy_setopt(handle, CURLOPT_HTTPHEADER, headers); curl_easy_setopt(handle, CURLOPT_URL, ConsoleURL); curl_easy_setopt(handle, CURLOPT_ERRORBUFFER, CurlErrorBuf); curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ ); curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback); } { char *message = ConstructDeltaMessage(); ErrorString str; const int num_retries = 5; CURLcode curl_status; long response_code; str.size = 0; curl_easy_setopt(handle, CURLOPT_POSTFIELDS, message); curl_easy_setopt(handle, CURLOPT_WRITEDATA, &str); for (int i = 0; i < num_retries; i++) { if ((curl_status = curl_easy_perform(handle)) == 0) break; elog(LOG, "Curl request failed on attempt %d: %s", i, CurlErrorBuf); pg_usleep(1000 * 1000); } if (curl_status != CURLE_OK) elog(ERROR, "Failed to perform curl request: %s", CurlErrorBuf); if (curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION) { if (response_code != 200) { 
if (str.size != 0) { elog(ERROR, "Received HTTP code %ld from control plane: %s", response_code, str.str); } else { elog(ERROR, "Received HTTP code %ld from control plane", response_code); } } } } } static void InitCurrentDdlTableIfNeeded() { /* Lazy construction of DllHashTable chain */ if (SubtransLevel > CurrentDdlTable->subtrans_level) { DdlHashTable *new_table = MemoryContextAlloc(CurTransactionContext, sizeof(DdlHashTable)); new_table->prev_table = CurrentDdlTable; new_table->subtrans_level = SubtransLevel; new_table->role_table = NULL; new_table->db_table = NULL; CurrentDdlTable = new_table; } } static void InitDbTableIfNeeded() { InitCurrentDdlTableIfNeeded(); if (!CurrentDdlTable->db_table) { HASHCTL db_ctl = {}; db_ctl.keysize = NAMEDATALEN; db_ctl.entrysize = sizeof(DbEntry); db_ctl.hcxt = CurTransactionContext; CurrentDdlTable->db_table = hash_create( "Dbs Created", 4, &db_ctl, HASH_ELEM | HASH_STRINGS | HASH_CONTEXT); } } static void InitRoleTableIfNeeded() { InitCurrentDdlTableIfNeeded(); if (!CurrentDdlTable->role_table) { HASHCTL role_ctl = {}; role_ctl.keysize = NAMEDATALEN; role_ctl.entrysize = sizeof(RoleEntry); role_ctl.hcxt = CurTransactionContext; CurrentDdlTable->role_table = hash_create( "Roles Created", 4, &role_ctl, HASH_ELEM | HASH_STRINGS | HASH_CONTEXT); } } static void PushTable() { SubtransLevel += 1; } static void MergeTable() { DdlHashTable *old_table; Assert(SubtransLevel >= CurrentDdlTable->subtrans_level); if (--SubtransLevel >= CurrentDdlTable->subtrans_level) { return; } old_table = CurrentDdlTable; CurrentDdlTable = old_table->prev_table; if (old_table->db_table) { DbEntry *entry; HASH_SEQ_STATUS status; InitDbTableIfNeeded(); hash_seq_init(&status, old_table->db_table); while ((entry = hash_seq_search(&status)) != NULL) { DbEntry *to_write = hash_search( CurrentDdlTable->db_table, entry->name, HASH_ENTER, NULL); to_write->type = entry->type; if (entry->owner != InvalidOid) to_write->owner = entry->owner; 
strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); if (entry->old_name[0] != '\0') { bool found_old = false; DbEntry *old = hash_search( CurrentDdlTable->db_table, entry->old_name, HASH_FIND, &found_old); if (found_old) { if (old->old_name[0] != '\0') strlcpy(to_write->old_name, old->old_name, NAMEDATALEN); else strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); hash_search( CurrentDdlTable->db_table, entry->old_name, HASH_REMOVE, NULL); } } } hash_destroy(old_table->db_table); } if (old_table->role_table) { RoleEntry *entry; HASH_SEQ_STATUS status; InitRoleTableIfNeeded(); hash_seq_init(&status, old_table->role_table); while ((entry = hash_seq_search(&status)) != NULL) { RoleEntry * old; bool found_old = false; RoleEntry *to_write = hash_search( CurrentDdlTable->role_table, entry->name, HASH_ENTER, NULL); to_write->type = entry->type; to_write->password = entry->password; strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); if (entry->old_name[0] == '\0') continue; old = hash_search( CurrentDdlTable->role_table, entry->old_name, HASH_FIND, &found_old); if (!found_old) continue; strlcpy(to_write->old_name, old->old_name, NAMEDATALEN); hash_search(CurrentDdlTable->role_table, entry->old_name, HASH_REMOVE, NULL); } hash_destroy(old_table->role_table); } } static void PopTable() { Assert(SubtransLevel >= CurrentDdlTable->subtrans_level); if (--SubtransLevel < CurrentDdlTable->subtrans_level) { /* * Current table gets freed because it is allocated in aborted * subtransaction's memory context. 
*/ CurrentDdlTable = CurrentDdlTable->prev_table; } } static void NeonSubXactCallback( SubXactEvent event, SubTransactionId mySubid, SubTransactionId parentSubid, void *arg) { switch (event) { case SUBXACT_EVENT_START_SUB: return PushTable(); case SUBXACT_EVENT_COMMIT_SUB: return MergeTable(); case SUBXACT_EVENT_ABORT_SUB: return PopTable(); default: return; } } static void NeonXactCallback(XactEvent event, void *arg) { if (event == XACT_EVENT_PRE_COMMIT || event == XACT_EVENT_PARALLEL_PRE_COMMIT) { SendDeltasToControlPlane(); } RootTable.role_table = NULL; RootTable.db_table = NULL; Assert(CurrentDdlTable == &RootTable); } static bool IsPrivilegedRole(const char *role_name) { Assert(role_name); return strcmp(role_name, privileged_role_name) == 0; } static void HandleCreateDb(CreatedbStmt *stmt) { DefElem *downer = NULL; ListCell *option; bool found = false; DbEntry *entry; InitDbTableIfNeeded(); foreach(option, stmt->options) { DefElem *defel = lfirst(option); if (strcmp(defel->defname, "owner") == 0) downer = defel; } entry = hash_search(CurrentDdlTable->db_table, stmt->dbname, HASH_ENTER, &found); if (!found) memset(entry->old_name, 0, sizeof(entry->old_name)); entry->type = Op_Set; if (downer && downer->arg) { const char *owner_name = defGetString(downer); if (IsPrivilegedRole(owner_name)) elog(ERROR, "could not create a database with owner %s", privileged_role_name); entry->owner = get_role_oid(owner_name, false); } else { entry->owner = GetUserId(); } } static void HandleAlterOwner(AlterOwnerStmt *stmt) { const char *name; bool found = false; DbEntry *entry; const char *new_owner; if (stmt->objectType != OBJECT_DATABASE) return; InitDbTableIfNeeded(); name = strVal(stmt->object); entry = hash_search(CurrentDdlTable->db_table, name, HASH_ENTER, &found); if (!found) memset(entry->old_name, 0, sizeof(entry->old_name)); new_owner = get_rolespec_name(stmt->newowner); if (IsPrivilegedRole(new_owner)) elog(ERROR, "could not alter owner to %s", privileged_role_name); 
entry->owner = get_role_oid(new_owner, false); entry->type = Op_Set; } static void HandleDbRename(RenameStmt *stmt) { bool found = false; DbEntry *entry; DbEntry *entry_for_new_name; Assert(stmt->renameType == OBJECT_DATABASE); InitDbTableIfNeeded(); entry = hash_search(CurrentDdlTable->db_table, stmt->subname, HASH_FIND, &found); entry_for_new_name = hash_search(CurrentDdlTable->db_table, stmt->newname, HASH_ENTER, NULL); entry_for_new_name->type = Op_Set; if (found) { if (entry->old_name[0] != '\0') strlcpy(entry_for_new_name->old_name, entry->old_name, NAMEDATALEN); else strlcpy(entry_for_new_name->old_name, entry->name, NAMEDATALEN); entry_for_new_name->owner = entry->owner; hash_search(CurrentDdlTable->db_table, stmt->subname, HASH_REMOVE, NULL); } else { strlcpy(entry_for_new_name->old_name, stmt->subname, NAMEDATALEN); entry_for_new_name->owner = InvalidOid; } } static void HandleDropDb(DropdbStmt *stmt) { bool found = false; DbEntry *entry; InitDbTableIfNeeded(); entry = hash_search(CurrentDdlTable->db_table, stmt->dbname, HASH_ENTER, &found); entry->type = Op_Delete; entry->owner = InvalidOid; if (!found) memset(entry->old_name, 0, sizeof(entry->old_name)); } static void HandleCreateRole(CreateRoleStmt *stmt) { bool found = false; RoleEntry *entry; DefElem *dpass; ListCell *option; InitRoleTableIfNeeded(); dpass = NULL; foreach(option, stmt->options) { DefElem *defel = lfirst(option); if (strcmp(defel->defname, "password") == 0) dpass = defel; } entry = hash_search(CurrentDdlTable->role_table, stmt->role, HASH_ENTER, &found); if (!found) memset(entry->old_name, 0, sizeof(entry->old_name)); if (dpass && dpass->arg) entry->password = MemoryContextStrdup(CurTransactionContext, strVal(dpass->arg)); else entry->password = NULL; entry->type = Op_Set; } static void HandleAlterRole(AlterRoleStmt *stmt) { char *role_name; DefElem *dpass; ListCell *option; bool found = false; RoleEntry *entry; InitRoleTableIfNeeded(); role_name = get_rolespec_name(stmt->role); if 
(IsPrivilegedRole(role_name) && !superuser()) elog(ERROR, "could not ALTER %s", privileged_role_name); dpass = NULL; foreach(option, stmt->options) { DefElem *defel = lfirst(option); if (strcmp(defel->defname, "password") == 0) dpass = defel; } /* We only care about updates to the password */ if (!dpass) { pfree(role_name); return; } entry = hash_search(CurrentDdlTable->role_table, role_name, HASH_ENTER, &found); if (!found) memset(entry->old_name, 0, sizeof(entry->old_name)); if (dpass->arg) entry->password = MemoryContextStrdup(CurTransactionContext, strVal(dpass->arg)); else entry->password = NULL; entry->type = Op_Set; pfree(role_name); } static void HandleRoleRename(RenameStmt *stmt) { bool found = false; RoleEntry *entry; RoleEntry *entry_for_new_name; Assert(stmt->renameType == OBJECT_ROLE); InitRoleTableIfNeeded(); entry = hash_search(CurrentDdlTable->role_table, stmt->subname, HASH_FIND, &found); entry_for_new_name = hash_search(CurrentDdlTable->role_table, stmt->newname, HASH_ENTER, NULL); entry_for_new_name->type = Op_Set; if (found) { if (entry->old_name[0] != '\0') strlcpy(entry_for_new_name->old_name, entry->old_name, NAMEDATALEN); else strlcpy(entry_for_new_name->old_name, entry->name, NAMEDATALEN); entry_for_new_name->password = entry->password; hash_search( CurrentDdlTable->role_table, entry->name, HASH_REMOVE, NULL); } else { strlcpy(entry_for_new_name->old_name, stmt->subname, NAMEDATALEN); entry_for_new_name->password = NULL; } } static void HandleDropRole(DropRoleStmt *stmt) { ListCell *item; InitRoleTableIfNeeded(); foreach(item, stmt->roles) { RoleSpec *spec = lfirst(item); bool found = false; RoleEntry *entry = hash_search( CurrentDdlTable->role_table, spec->rolename, HASH_ENTER, &found); entry->type = Op_Delete; entry->password = NULL; if (!found) memset(entry->old_name, 0, sizeof(entry->old_name)); } } static void HandleRename(RenameStmt *stmt) { if (stmt->renameType == OBJECT_DATABASE) return HandleDbRename(stmt); else if 
(stmt->renameType == OBJECT_ROLE) return HandleRoleRename(stmt); } /* * Support for Event Triggers. * * In vanilla only superuser can create Event Triggers. * * We allow it for {privileged_role_name} by temporary switching to superuser. But as * far as event trigger can fire in superuser context we should protect * superuser from execution of arbitrary user's code. * * The idea was taken from Supabase PR series starting at * https://github.com/supabase/supautils/pull/98 */ static bool neon_needs_fmgr_hook(Oid functionId) { return (next_needs_fmgr_hook && (*next_needs_fmgr_hook) (functionId)) || get_func_rettype(functionId) == EVENT_TRIGGEROID; } static void LookupFuncOwnerSecDef(Oid functionId, Oid *funcOwner, bool *is_secdef) { Form_pg_proc procForm; HeapTuple proc_tup = SearchSysCache1(PROCOID, ObjectIdGetDatum(functionId)); if (!HeapTupleIsValid(proc_tup)) ereport(ERROR, (errmsg("cache lookup failed for function %u", functionId))); procForm = (Form_pg_proc) GETSTRUCT(proc_tup); *funcOwner = procForm->proowner; *is_secdef = procForm->prosecdef; ReleaseSysCache(proc_tup); } PG_FUNCTION_INFO_V1(noop); Datum noop(__attribute__ ((unused)) PG_FUNCTION_ARGS) { PG_RETURN_VOID();} static void force_noop(FmgrInfo *finfo) { finfo->fn_addr = (PGFunction) noop; finfo->fn_oid = InvalidOid; /* not a known function OID anymore */ finfo->fn_nargs = 0; /* no arguments for noop */ finfo->fn_strict = false; finfo->fn_retset = false; finfo->fn_stats = 0; /* no stats collection */ finfo->fn_extra = NULL; /* clear out old context data */ finfo->fn_mcxt = CurrentMemoryContext; finfo->fn_expr = NULL; /* no parse tree */ } /* * Skip executing Event Triggers execution for superusers, because Event * Triggers are SECURITY DEFINER and user provided code could then attempt * privilege escalation. * * Also skip executing Event Triggers when GUC neon.event_triggers has been * set to false. 
This might be necessary to be able to connect again after a
 * LOGIN Event Trigger has been installed that would prevent connections as
 * {privileged_role_name}.
 *
 * When a trigger must be skipped, the FmgrInfo is rewritten with
 * force_noop() so that the call turns into a no-op.  The next hook in the
 * chain, if any, is always invoked afterwards.
 */
static void
neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private)
{
	/*
	 * It can be other needs_fmgr_hook which cause our hook to be invoked for
	 * non-trigger function, so recheck that it is a trigger function.
	 */
	if (flinfo->fn_oid != InvalidOid &&
		get_func_rettype(flinfo->fn_oid) != EVENT_TRIGGEROID)
	{
		if (next_fmgr_hook)
			(*next_fmgr_hook) (event, flinfo, private);

		return;
	}

	/*
	 * The {privileged_role_name} role can use the GUC neon.event_triggers to disable
	 * firing Event Trigger.
	 *
	 *   SET neon.event_triggers TO false;
	 *
	 * This only applies to the {privileged_role_name} role though, and only allows
	 * skipping Event Triggers owned by {privileged_role_name}, which we check by
	 * proxy of the Event Trigger function being owned by {privileged_role_name}.
	 *
	 * A role that is created in role {privileged_role_name} should be allowed to also
	 * benefit from the neon_event_triggers GUC, and will be considered the
	 * same as the {privileged_role_name} role.
	 */
	if (event == FHET_START
		&& !neon_event_triggers
		&& is_privileged_role())
	{
		Oid weak_superuser_oid = get_role_oid(privileged_role_name, false);

		/* Find the Function Attributes (owner Oid, security definer) */
		const char *fun_owner_name = NULL;
		Oid fun_owner = InvalidOid;
		bool fun_is_secdef = false;

		LookupFuncOwnerSecDef(flinfo->fn_oid, &fun_owner, &fun_is_secdef);
		fun_owner_name = GetUserNameFromId(fun_owner, false);

		if (IsPrivilegedRole(fun_owner_name)
			|| has_privs_of_role(fun_owner, weak_superuser_oid))
		{
			elog(WARNING,
				 "Skipping Event Trigger: neon.event_triggers is false");

			/*
			 * we can't skip execution directly inside the fmgr_hook so instead we
			 * change the event trigger function to a noop function.
			 */
			force_noop(flinfo);
		}
	}

	/*
	 * Fire Event Trigger if both function owner and current user are
	 * superuser. Allow executing Event Trigger function that belongs to a
	 * superuser when connected as a non-superuser, even when the function is
	 * SECURITY DEFINER.
	 */
	else if (event == FHET_START
		/* still enable it to pass pg_regress tests */
		&& !RegressTestMode)
	{
		/*
		 * Get the current user oid as of before SECURITY DEFINER change of
		 * CurrentUserId, and that would be SessionUserId.
		 */
		Oid current_role_oid = GetSessionUserId();
		bool role_is_super = superuser_arg(current_role_oid);

		/* Find the Function Attributes (owner Oid, security definer) */
		Oid function_owner = InvalidOid;
		bool function_is_secdef = false;
		bool function_is_owned_by_super = false;

		LookupFuncOwnerSecDef(flinfo->fn_oid, &function_owner, &function_is_secdef);

		function_is_owned_by_super = superuser_arg(function_owner);

		/*
		 * Refuse to run functions that belongs to a non-superuser when the
		 * current user is a superuser.
		 *
		 * We could run a SECURITY DEFINER user-function here and be safe with
		 * privilege escalation risks, but superuser roles are only used for
		 * infrastructure maintenance operations, where we prefer to skip
		 * running user-defined code.
		 */
		if (role_is_super && !function_is_owned_by_super)
		{
			char *func_name = get_func_name(flinfo->fn_oid);

			ereport(WARNING,
					(errmsg("Skipping Event Trigger"),
					 errdetail("Event Trigger function \"%s\" "
							   "is owned by non-superuser role \"%s\", "
							   "and current_user \"%s\" is superuser",
							   func_name,
							   GetUserNameFromId(function_owner, false),
							   GetUserNameFromId(current_role_oid, false))));

			/*
			 * we can't skip execution directly inside the fmgr_hook so
			 * instead we change the event trigger function to a noop
			 * function.
			 */
			force_noop(flinfo);
		}
	}

	if (next_fmgr_hook)
		(*next_fmgr_hook) (event, flinfo, private);
}

/* State saved by switch_to_superuser() for switch_to_original_role() */
static Oid prev_role_oid = 0;
static int prev_role_sec_context = 0;
static bool switched_to_superuser = false;

/*
 * Switch to superuser if not yet superuser.
 * Returns false if already switched to superuser.
*/ static bool switch_to_superuser(void) { Oid superuser_oid; if (switched_to_superuser) return false; switched_to_superuser = true; superuser_oid = get_role_oid("cloud_admin", true /*missing_ok*/); if (superuser_oid == InvalidOid) superuser_oid = BOOTSTRAP_SUPERUSERID; GetUserIdAndSecContext(&prev_role_oid, &prev_role_sec_context); SetUserIdAndSecContext(superuser_oid, prev_role_sec_context | SECURITY_LOCAL_USERID_CHANGE | SECURITY_RESTRICTED_OPERATION); return true; } static void switch_to_original_role(void) { SetUserIdAndSecContext(prev_role_oid, prev_role_sec_context); switched_to_superuser = false; } /* * ALTER ROLE ... SUPERUSER; * * Used internally to give superuser to a non-privileged role to allow * ownership of superuser-only objects such as Event Trigger. * * ALTER ROLE foo SUPERUSER; * ALTER EVENT TRIGGER ... OWNED BY foo; * ALTER ROLE foo NOSUPERUSER; * * Now the EVENT TRIGGER is owned by foo, who can DROP it without having to be * superuser again. */ static void alter_role_super(const char* rolename, bool make_super) { AlterRoleStmt *alter_stmt = makeNode(AlterRoleStmt); DefElem *defel_superuser = #if PG_MAJORVERSION_NUM <= 14 makeDefElem("superuser", (Node *) makeInteger(make_super), -1); #else makeDefElem("superuser", (Node *) makeBoolean(make_super), -1); #endif RoleSpec *rolespec = makeNode(RoleSpec); rolespec->roletype = ROLESPEC_CSTRING; rolespec->rolename = pstrdup(rolename); rolespec->location = -1; alter_stmt->role = rolespec; alter_stmt->options = list_make1(defel_superuser); #if PG_MAJORVERSION_NUM < 15 AlterRole(alter_stmt); #else /* ParseState *pstate, AlterRoleStmt *stmt */ AlterRole(NULL, alter_stmt); #endif CommandCounterIncrement(); } /* * Changes the OWNER of an Event Trigger. * * Event Triggers can only be owned by superusers, so this ALTER ROLE with * SUPERUSER and then removes the property. 
*/
static void
alter_event_trigger_owner(const char *obj_name, Oid role_oid)
{
	char* role_name = GetUserNameFromId(role_oid, false);

	/* Temporarily make the role a superuser so that it may own the trigger */
	alter_role_super(role_name, true);

	AlterEventTriggerOwner(obj_name, role_oid);
	CommandCounterIncrement();

	alter_role_super(role_name, false);
}


/*
 * Neon processing of the CREATE EVENT TRIGGER requires special attention and
 * is worth having its own ProcessUtility_hook for that.
 *
 * For a privileged role inside a transaction, the command is executed as
 * superuser and the resulting event trigger is handed back to the original
 * role when that role is not a superuser itself.  The original role is
 * restored in all cases, including errors.
 */
static void
ProcessCreateEventTrigger(
	PlannedStmt *pstmt,
	const char *queryString,
	bool readOnlyTree,
	ProcessUtilityContext context,
	ParamListInfo params,
	QueryEnvironment *queryEnv,
	DestReceiver *dest,
	QueryCompletion *qc)
{
	Node	   *parseTree = pstmt->utilityStmt;
	bool		sudo = false;	/* did this call switch to superuser? */

	/* We double-check that after local variable declaration block */
	CreateEventTrigStmt *stmt = (CreateEventTrigStmt *) parseTree;

	/*
	 * We are going to change the current user privileges (sudo) and might
	 * need after execution cleanup. For that we want to capture the UserId
	 * before changing it for our sudo implementation.
	 */
	const Oid current_user_id = GetUserId();
	bool current_user_is_super = superuser_arg(current_user_id);

	if (nodeTag(parseTree) != T_CreateEventTrigStmt)
	{
		ereport(ERROR,
				errcode(ERRCODE_INTERNAL_ERROR),
				errmsg("ProcessCreateEventTrigger called for the wrong command"));
	}

	/*
	 * Allow {privileged_role_name} to create Event Trigger, while keeping the
	 * ownership of the object.
	 *
	 * For that we give superuser membership to the role for the execution of
	 * the command.
	 */
	if (IsTransactionState() && is_privileged_role())
	{
		/* Find the Event Trigger function Oid */
		Oid func_oid = LookupFuncName(stmt->funcname, 0, NULL, false);

		/* Find the Function Owner Oid */
		Oid func_owner = InvalidOid;
		bool is_secdef = false;
		bool function_is_owned_by_super = false;

		LookupFuncOwnerSecDef(func_oid, &func_owner, &is_secdef);

		function_is_owned_by_super = superuser_arg(func_owner);

		/* Superuser status of the current user and the function owner must match */
		if(!current_user_is_super && function_is_owned_by_super)
		{
			ereport(ERROR,
					(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
					 errmsg("Permission denied to execute "
							"a function owned by a superuser role"),
					 errdetail("current user \"%s\" is not a superuser "
							   "and Event Trigger function \"%s\" "
							   "is owned by a superuser",
							   GetUserNameFromId(current_user_id, false),
							   NameListToString(stmt->funcname))));
		}

		if(current_user_is_super && !function_is_owned_by_super)
		{
			ereport(ERROR,
					(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
					 errmsg("Permission denied to execute "
							"a function owned by a non-superuser role"),
					 errdetail("current user \"%s\" is a superuser "
							   "and function \"%s\" is "
							   "owned by a non-superuser",
							   GetUserNameFromId(current_user_id, false),
							   NameListToString(stmt->funcname))));
		}

		sudo = switch_to_superuser();
	}

	PG_TRY();
	{
		if (PreviousProcessUtilityHook)
		{
			PreviousProcessUtilityHook(
				pstmt,
				queryString,
				readOnlyTree,
				context,
				params,
				queryEnv,
				dest,
				qc);
		}
		else
		{
			standard_ProcessUtility(
				pstmt,
				queryString,
				readOnlyTree,
				context,
				params,
				queryEnv,
				dest,
				qc);
		}

		/*
		 * Now that the Event Trigger has been installed via our sudo
		 * mechanism, if the original role was not a superuser then change
		 * the event trigger ownership back to the original role.
		 *
		 * That way [ ALTER | DROP ] EVENT TRIGGER commands just work.
		 */
		if (IsTransactionState() && is_privileged_role())
		{
			if (!current_user_is_super)
			{
				/*
				 * Change event trigger owner to the current role (making
				 * it a privileged role during the ALTER OWNER command).
				 */
				alter_event_trigger_owner(stmt->trigname, current_user_id);
			}
		}
	}
	PG_FINALLY();
	{
		/* Only the call that performed the switch may undo it */
		if (sudo)
			switch_to_original_role();
	}
	PG_END_TRY();
}


/*
 * Neon hooks for DDLs (handling privileges, limiting features, etc).
 *
 * Records role and database changes for forwarding, rejects
 * CREATE TABLESPACE outside of regression test mode, and then runs the
 * command through the previous hook or standard_ProcessUtility.
 */
static void
NeonProcessUtility(
	PlannedStmt *pstmt,
	const char *queryString,
	bool readOnlyTree,
	ProcessUtilityContext context,
	ParamListInfo params,
	QueryEnvironment *queryEnv,
	DestReceiver *dest,
	QueryCompletion *qc)
{
	Node	   *parseTree = pstmt->utilityStmt;

	/*
	 * The process utility hook for CREATE EVENT TRIGGER is its own
	 * implementation and warrant being addressed separately from here.
	 */
	if (nodeTag(parseTree) == T_CreateEventTrigStmt)
	{
		ProcessCreateEventTrigger(
			pstmt,
			queryString,
			readOnlyTree,
			context,
			params,
			queryEnv,
			dest,
			qc);
		return;
	}

	/*
	 * Other commands that need Neon specific implementations are handled here:
	 */
	switch (nodeTag(parseTree))
	{
		case T_CreatedbStmt:
			HandleCreateDb(castNode(CreatedbStmt, parseTree));
			break;
		case T_AlterOwnerStmt:
			HandleAlterOwner(castNode(AlterOwnerStmt, parseTree));
			break;
		case T_RenameStmt:
			HandleRename(castNode(RenameStmt, parseTree));
			break;
		case T_DropdbStmt:
			HandleDropDb(castNode(DropdbStmt, parseTree));
			break;
		case T_CreateRoleStmt:
			HandleCreateRole(castNode(CreateRoleStmt, parseTree));
			break;
		case T_AlterRoleStmt:
			HandleAlterRole(castNode(AlterRoleStmt, parseTree));
			break;
		case T_DropRoleStmt:
			HandleDropRole(castNode(DropRoleStmt, parseTree));
			break;
		case T_CreateTableSpaceStmt:
			if (!RegressTestMode)
			{
				ereport(ERROR,
						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
						 errmsg("CREATE TABLESPACE is not supported on Neon")));
			}
			break;
		default:
			break;
	}

	if (PreviousProcessUtilityHook)
	{
		PreviousProcessUtilityHook(
			pstmt,
			queryString,
			readOnlyTree,
			context,
			params,
			queryEnv,
			dest,
			qc);
	}
	else
	{
		standard_ProcessUtility(
			pstmt,
			queryString,
			readOnlyTree,
			context,
			params,
			queryEnv,
			dest,
			qc);
	}
}

/*
 * Only {privileged_role_name} is granted privilege to edit neon.event_triggers GUC.
*/ static void neon_event_triggers_assign_hook(bool newval, void *extra) { if (IsTransactionState() && !is_privileged_role()) { ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("permission denied to set neon.event_triggers"), errdetail("Only \"%s\" is allowed to set the GUC", privileged_role_name))); } } void InitDDLHandler() { PreviousProcessUtilityHook = ProcessUtility_hook; ProcessUtility_hook = NeonProcessUtility; next_needs_fmgr_hook = needs_fmgr_hook; needs_fmgr_hook = neon_needs_fmgr_hook; next_fmgr_hook = fmgr_hook; fmgr_hook = neon_fmgr_hook; RegisterXactCallback(NeonXactCallback, NULL); RegisterSubXactCallback(NeonSubXactCallback, NULL); /* * The GUC neon.event_triggers should provide the same effect as the * Postgres GUC event_triggers, but the neon one is PGC_USERSET. * * This allows using the GUC in the connection string and work out of a * LOGIN Event Trigger that would break database access, all without * having to edit and reload the Postgres configuration file. 
*/ DefineCustomBoolVariable( "neon.event_triggers", "Enable firing of event triggers", NULL, &neon_event_triggers, true, PGC_USERSET, 0, NULL, neon_event_triggers_assign_hook, NULL); DefineCustomStringVariable( "neon.console_url", "URL of the Neon Console, which will be forwarded changes to dbs and roles", NULL, &ConsoleURL, NULL, PGC_POSTMASTER, 0, NULL, NULL, NULL); DefineCustomBoolVariable( "neon.forward_ddl", "Controls whether to forward DDL to the control plane", NULL, &ForwardDDL, true, PGC_SUSET, 0, NULL, NULL, NULL); DefineCustomBoolVariable( "neon.regress_test_mode", "Controls whether we are running in the regression test mode", NULL, &RegressTestMode, false, PGC_SUSET, 0, NULL, NULL, NULL); jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN"); if (!jwt_token) { elog(LOG, "Missing NEON_CONTROL_PLANE_TOKEN environment variable, forwarding will not be authenticated"); } } ================================================ FILE: pgxn/neon/neon_ddl_handler.h ================================================ #ifndef CONTROL_DDL_HANDLER_H #define CONTROL_DDL_HANDLER_H void InitDDLHandler(void); #endif ================================================ FILE: pgxn/neon/neon_lwlsncache.c ================================================ #include "postgres.h" #include "neon.h" #include "neon_lwlsncache.h" #include "miscadmin.h" #include "access/xlog.h" #include "access/xlog_internal.h" #include "storage/ipc.h" #include "storage/shmem.h" #include "storage/buf_internals.h" #include "utils/guc.h" #include "utils/hsearch.h" typedef struct LastWrittenLsnCacheEntry { BufferTag key; XLogRecPtr lsn; /* double linked list for LRU replacement algorithm */ dlist_node lru_node; } LastWrittenLsnCacheEntry; typedef struct LwLsnCacheCtl { int lastWrittenLsnCacheSize; /* * Maximal last written LSN for pages not present in lastWrittenLsnCache */ XLogRecPtr maxLastWrittenLsn; /* * Double linked list to implement LRU replacement policy for last written LSN cache. 
* Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'. */ dlist_head lastWrittenLsnLRU; } LwLsnCacheCtl; /* * Cache of last written LSN for each relation page. * Also to provide request LSN for smgrnblocks, smgrexists there is pseudokey=InvalidBlockId which stores LSN of last * relation metadata update. * Size of the cache is limited by GUC variable lastWrittenLsnCacheSize ("lsn_cache_size"), * pages are replaced using LRU algorithm, based on L2-list. * Access to this cache is protected by 'LastWrittenLsnLock'. */ static HTAB *lastWrittenLsnCache; LwLsnCacheCtl* LwLsnCache; static int lwlsn_cache_size = (128 * 1024); static void lwlc_register_gucs(void) { DefineCustomIntVariable("neon.last_written_lsn_cache_size", "Size of last written LSN cache used by Neon", NULL, &lwlsn_cache_size, (128*1024), 1024, INT_MAX, PGC_POSTMASTER, 0, /* plain units */ NULL, NULL, NULL); } static XLogRecPtr SetLastWrittenLSNForBlockRangeInternal(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks); /* All the necessary hooks are defined here */ /* These hold the set_lwlsn_* hooks which were installed before ours, if any */ static set_lwlsn_block_range_hook_type prev_set_lwlsn_block_range_hook = NULL; static set_lwlsn_block_v_hook_type prev_set_lwlsn_block_v_hook = NULL; static set_lwlsn_block_hook_type prev_set_lwlsn_block_hook = NULL; static set_max_lwlsn_hook_type prev_set_max_lwlsn_hook = NULL; static set_lwlsn_relation_hook_type prev_set_lwlsn_relation_hook = NULL; static set_lwlsn_db_hook_type prev_set_lwlsn_db_hook = NULL; static void neon_set_max_lwlsn(XLogRecPtr lsn); void init_lwlsncache(void) { if (!process_shared_preload_libraries_in_progress) ereport(ERROR, errcode(ERRCODE_INTERNAL_ERROR), errmsg("Loading of shared preload libraries is not in progress. 
Exiting")); lwlc_register_gucs(); prev_set_lwlsn_block_range_hook = set_lwlsn_block_range_hook; set_lwlsn_block_range_hook = neon_set_lwlsn_block_range; prev_set_lwlsn_block_v_hook = set_lwlsn_block_v_hook; set_lwlsn_block_v_hook = neon_set_lwlsn_block_v; prev_set_lwlsn_block_hook = set_lwlsn_block_hook; set_lwlsn_block_hook = neon_set_lwlsn_block; prev_set_max_lwlsn_hook = set_max_lwlsn_hook; set_max_lwlsn_hook = neon_set_max_lwlsn; prev_set_lwlsn_relation_hook = set_lwlsn_relation_hook; set_lwlsn_relation_hook = neon_set_lwlsn_relation; prev_set_lwlsn_db_hook = set_lwlsn_db_hook; set_lwlsn_db_hook = neon_set_lwlsn_db; } void LwLsnCacheShmemRequest(void) { Size requested_size = sizeof(LwLsnCacheCtl); requested_size += hash_estimate_size(lwlsn_cache_size, sizeof(LastWrittenLsnCacheEntry)); RequestAddinShmemSpace(requested_size); } void LwLsnCacheShmemInit(void) { static HASHCTL info; bool found; if (lwlsn_cache_size > 0) { info.keysize = sizeof(BufferTag); info.entrysize = sizeof(LastWrittenLsnCacheEntry); lastWrittenLsnCache = ShmemInitHash("last_written_lsn_cache", lwlsn_cache_size, lwlsn_cache_size, &info, HASH_ELEM | HASH_BLOBS); LwLsnCache = ShmemInitStruct("neon/LwLsnCacheCtl", sizeof(LwLsnCacheCtl), &found); // Now set the size in the struct LwLsnCache->lastWrittenLsnCacheSize = lwlsn_cache_size; if (found) { return; } } dlist_init(&LwLsnCache->lastWrittenLsnLRU); LwLsnCache->maxLastWrittenLsn = GetRedoRecPtr(); } /* * neon_get_lwlsn -- Returns maximal LSN of written page. * It returns an upper bound for the last written LSN of a given page, * either from a cached last written LSN or a global maximum last written LSN. * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn. * If cache is large enough, iterating through all hash items may be rather expensive. * But neon_get_lwlsn(InvalidOid) is used only by neon_dbsize which is not performance critical. 
*/
/*
 * Locking: the lookup runs under LastWrittenLsnLock in shared mode.  On a
 * cache miss the lock is released and re-taken in exclusive mode so that an
 * entry can be inserted.  The lock is not held across that switch, so the
 * global maximum is read again once the exclusive lock is held instead of
 * reusing the value seen under the shared lock.
 */
XLogRecPtr
neon_get_lwlsn(NRelFileInfo rlocator, ForkNumber forknum, BlockNumber blkno)
{
	XLogRecPtr lsn;
	LastWrittenLsnCacheEntry* entry;

	Assert(LwLsnCache->lastWrittenLsnCacheSize != 0);

	LWLockAcquire(LastWrittenLsnLock, LW_SHARED);

	/* Maximal last written LSN among all non-cached pages */
	lsn = LwLsnCache->maxLastWrittenLsn;

	if (NInfoGetRelNumber(rlocator) != InvalidOid)
	{
		BufferTag key;
		Oid spcOid = NInfoGetSpcOid(rlocator);
		Oid dbOid = NInfoGetDbOid(rlocator);
		Oid relNumber = NInfoGetRelNumber(rlocator);

		BufTagInit(key, relNumber, forknum, blkno, spcOid, dbOid);

		entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL);
		if (entry != NULL)
			lsn = entry->lsn;
		else
		{
			LWLockRelease(LastWrittenLsnLock);
			LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE);

			/*
			 * Other backends may have inserted or evicted entries while no
			 * lock was held, which can raise maxLastWrittenLsn.  Re-read it
			 * so that the value we insert and return is not older than the
			 * current global maximum (neon_get_lwlsn_v does the same).
			 */
			lsn = LwLsnCache->maxLastWrittenLsn;

			/*
			 * In case of statements like CREATE TABLE AS SELECT... or INSERT
			 * FROM SELECT... we are fetching data from a source table and
			 * storing it in a destination table. This causes problems with
			 * prefetch when the last-written LSN is not known for the pages
			 * of the source table (which for example happens after compute
			 * restart). In this case we get the global value of the
			 * last-written LSN, which changes frequently as we are writing
			 * pages of the destination table. As a result the request LSN
			 * for the prefetch and the request LSN when this page is
			 * actually needed are different, and the prefetched response
			 * cannot be used. So it effectively disarms prefetch.
			 * To prevent that, we re-insert the page with the latest LSN, so that it's
			 * less likely the LSN for this page will get evicted from the LwLsnCache
			 * before the page is read.
			 */
			lsn = SetLastWrittenLSNForBlockRangeInternal(lsn, rlocator, forknum, blkno, 1);
		}
	}
	else
	{
		HASH_SEQ_STATUS seq;

		/* Find maximum of all cached LSNs */
		hash_seq_init(&seq, lastWrittenLsnCache);
		while ((entry = (LastWrittenLsnCacheEntry *) hash_seq_search(&seq)) != NULL)
		{
			if (entry->lsn > lsn)
				lsn = entry->lsn;
		}
	}
	LWLockRelease(LastWrittenLsnLock);

	return lsn;
}

/*
 * neon_set_max_lwlsn -- overwrite the global maximal last written LSN.
 *
 * The value is stored unconditionally under the exclusive lock; it is not
 * combined with the previous maximum.
 */
static void
neon_set_max_lwlsn(XLogRecPtr lsn)
{
	LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE);
	LwLsnCache->maxLastWrittenLsn = lsn;
	LWLockRelease(LastWrittenLsnLock);
}

/*
 * neon_get_lwlsn_v -- Returns maximal LSNs of a range of written pages.
 * For each of the nblocks pages starting at blkno it stores in lsns[] an upper
 * bound for the last written LSN of that page, either from a cached last
 * written LSN or from the global maximum last written LSN.
 * If the relation number is InvalidOid then every element is set to the maximum
 * among all cached LSNs and maxLastWrittenLsn.
 * If cache is large enough, iterating through all hash items may be rather expensive.
*/
/*
 * Parameters:
 *   relfilenode, forknum - relation fork the pages belong to
 *   blkno, nblocks       - first block and number of blocks (nblocks > 0)
 *   lsns                 - output array with room for nblocks entries
 *
 * Locking: lookups run under LastWrittenLsnLock in shared mode.  If any page
 * is missing from the cache the lock is released and re-taken in exclusive
 * mode, and all missing pages are inserted in one batch.
 */
void
neon_get_lwlsn_v(NRelFileInfo relfilenode, ForkNumber forknum,
				 BlockNumber blkno, int nblocks, XLogRecPtr *lsns)
{
	LastWrittenLsnCacheEntry* entry;
	XLogRecPtr lsn;

	Assert(LwLsnCache->lastWrittenLsnCacheSize != 0);
	Assert(nblocks > 0);
	Assert(PointerIsValid(lsns));

	LWLockAcquire(LastWrittenLsnLock, LW_SHARED);

	if (NInfoGetRelNumber(relfilenode) != InvalidOid)
	{
		BufferTag key;
		bool missed_keys = false;
		Oid spcOid = NInfoGetSpcOid(relfilenode);
		Oid dbOid = NInfoGetDbOid(relfilenode);
		Oid relNumber = NInfoGetRelNumber(relfilenode);

		BufTagInit(key, relNumber, forknum, blkno, spcOid, dbOid);

		for (int i = 0; i < nblocks; i++)
		{
			key.blockNum = blkno + i;

			entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL);
			if (entry != NULL)
			{
				lsns[i] = entry->lsn;
			}
			else
			{
				/* Mark this block's LSN as missing - we'll update the LwLSN for missing blocks in bulk later */
				lsns[i] = InvalidXLogRecPtr;
				missed_keys = true;
			}
		}

		/*
		 * If we had any missing LwLSN entries, we add the missing ones now.
		 * By doing the insertions in one batch, we decrease lock contention.
		 */
		if (missed_keys)
		{
			LWLockRelease(LastWrittenLsnLock);
			LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE);

			/* Maximal last written LSN among all non-cached pages */
			lsn = LwLsnCache->maxLastWrittenLsn;

			for (int i = 0; i < nblocks; i++)
			{
				if (lsns[i] == InvalidXLogRecPtr)
				{
					/*
					 * Use the LSN reported back by the insertion rather than
					 * the global maximum: another backend may have inserted
					 * this block with a newer LSN while the lock was
					 * released, in which case the helper returns that
					 * (larger) cached LSN.  Ignoring it could hand the
					 * caller an LSN older than the page's last write.
					 */
					lsns[i] = SetLastWrittenLSNForBlockRangeInternal(lsn, relfilenode, forknum, blkno + i, 1);
				}
			}
		}
	}
	else
	{
		HASH_SEQ_STATUS seq;

		lsn = LwLsnCache->maxLastWrittenLsn;

		/* Find maximum of all cached LSNs */
		hash_seq_init(&seq, lastWrittenLsnCache);
		while ((entry = (LastWrittenLsnCacheEntry *) hash_seq_search(&seq)) != NULL)
		{
			if (entry->lsn > lsn)
				lsn = entry->lsn;
		}

		for (int i = 0; i < nblocks; i++)
			lsns[i] = lsn;
	}
	LWLockRelease(LastWrittenLsnLock);
}

/*
 * Guts for SetLastWrittenLSNForBlockRange.
 * Caller must ensure LastWrittenLsnLock is held in exclusive mode.
*/
/*
 * For a relation-less request (relation number InvalidOid) only the global
 * maximum is advanced.  Otherwise every block of the range gets a cache
 * entry that is moved to the most-recently-used end of the LRU list; when
 * the table grows past its configured size the least recently used entry is
 * evicted and its LSN is folded into the global maximum.
 *
 * Returns the LSN in effect after the update, which can be larger than the
 * requested one if a newer value was already recorded.
 */
static XLogRecPtr
SetLastWrittenLSNForBlockRangeInternal(XLogRecPtr lsn,
									   NRelFileInfo rlocator,
									   ForkNumber forknum,
									   BlockNumber from,
									   BlockNumber n_blocks)
{
	BufferTag	tag;
	BlockNumber offset;
	Oid			spcOid;
	Oid			dbOid;
	Oid			relNumber;

	if (NInfoGetRelNumber(rlocator) == InvalidOid)
	{
		/* No relation given: only the global maximum is affected */
		if (LwLsnCache->maxLastWrittenLsn < lsn)
			LwLsnCache->maxLastWrittenLsn = lsn;
		else
			lsn = LwLsnCache->maxLastWrittenLsn;

		return lsn;
	}

	spcOid = NInfoGetSpcOid(rlocator);
	dbOid = NInfoGetDbOid(rlocator);
	relNumber = NInfoGetRelNumber(rlocator);

	BufTagInit(tag, relNumber, forknum, from, spcOid, dbOid);

	for (offset = 0; offset < n_blocks; offset++)
	{
		LastWrittenLsnCacheEntry *slot;
		bool		existed;

		tag.blockNum = from + offset;
		slot = hash_search(lastWrittenLsnCache, &tag, HASH_ENTER, &existed);

		if (!existed)
		{
			slot->lsn = lsn;

			if (hash_get_num_entries(lastWrittenLsnCache) > LwLsnCache->lastWrittenLsnCacheSize)
			{
				/* Replace least recently used entry */
				dlist_node *oldest = dlist_pop_head_node(&LwLsnCache->lastWrittenLsnLRU);
				LastWrittenLsnCacheEntry *victim =
					dlist_container(LastWrittenLsnCacheEntry, lru_node, oldest);

				/* Adjust max LSN for not cached relations/chunks if needed */
				if (LwLsnCache->maxLastWrittenLsn < victim->lsn)
					LwLsnCache->maxLastWrittenLsn = victim->lsn;

				hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL);
			}
		}
		else
		{
			/* Keep the newer of the cached and the requested LSN */
			if (slot->lsn < lsn)
				slot->lsn = lsn;
			else
				lsn = slot->lsn;

			/* Unlink from LRU list */
			dlist_delete(&slot->lru_node);
		}

		/* Link to the end of LRU list */
		dlist_push_tail(&LwLsnCache->lastWrittenLsnLRU, &slot->lru_node);
	}

	return lsn;
}

/*
 * SetLastWrittenLSNForBlockRange -- Set maximal LSN of written page range.
 * We maintain cache of last written LSNs with limited size and LRU replacement
 * policy. Keeping last written LSN for each page allows to use old LSN when
 * requesting pages of unchanged or appended relations. Also it is critical for
 * efficient work of prefetch in case massive update operations (like vacuum or remove).
 *
 * rlocator.relNumber can be InvalidOid, in this case maxLastWrittenLsn is updated.
* SetLastWrittenLsn with dummy rlocator is used by createdb and dbase_redo functions.
 */
XLogRecPtr
neon_set_lwlsn_block_range(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks)
{
	XLogRecPtr	effective;

	/* Nothing to record: hand the input LSN back untouched */
	if (lsn == InvalidXLogRecPtr)
		return lsn;
	if (n_blocks == 0)
		return lsn;
	if (LwLsnCache->lastWrittenLsnCacheSize == 0)
		return lsn;

	Assert(lsn >= WalSegMinSize);

	/* The internal helper requires the lock in exclusive mode */
	LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE);
	effective = SetLastWrittenLSNForBlockRangeInternal(lsn, rlocator, forknum, from, n_blocks);
	LWLockRelease(LastWrittenLsnLock);

	return effective;
}

/*
 * neon_set_lwlsn_block_v -- Set maximal LSN of pages to their respective
 * LSNs.
 *
 * We maintain cache of last written LSNs with limited size and LRU replacement
 * policy. Keeping last written LSN for each page allows to use old LSN when
 * requesting pages of unchanged or appended relations. Also it is critical for
 * efficient work of prefetch in case massive update operations (like vacuum or remove).
 *
 * Note: This is different from SetLastWrittenLSNForBlockRange[Internal], in that this
 * specifies per-block LSNs, rather than only a single LSN.
*/ XLogRecPtr neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode, ForkNumber forknum, BlockNumber blockno, int nblocks) { LastWrittenLsnCacheEntry* entry; BufferTag key; bool found; XLogRecPtr max = InvalidXLogRecPtr; Oid spcOid = NInfoGetSpcOid(relfilenode); Oid dbOid = NInfoGetDbOid(relfilenode); Oid relNumber = NInfoGetRelNumber(relfilenode); if (lsns == NULL || nblocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0 || NInfoGetRelNumber(relfilenode) == InvalidOid) return InvalidXLogRecPtr; BufTagInit(key, relNumber, forknum, blockno, spcOid, dbOid); LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); for (int i = 0; i < nblocks; i++) { XLogRecPtr lsn = lsns[i]; if (lsn == InvalidXLogRecPtr) continue; Assert(lsn >= WalSegMinSize); key.blockNum = blockno + i; entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); if (found) { if (lsn > entry->lsn) entry->lsn = lsn; else lsn = entry->lsn; /* Unlink from LRU list */ dlist_delete(&entry->lru_node); } else { entry->lsn = lsn; if (hash_get_num_entries(lastWrittenLsnCache) > LwLsnCache->lastWrittenLsnCacheSize) { /* Replace least recently used entry */ LastWrittenLsnCacheEntry* victim = dlist_container(LastWrittenLsnCacheEntry, lru_node, dlist_pop_head_node(&LwLsnCache->lastWrittenLsnLRU)); /* Adjust max LSN for not cached relations/chunks if needed */ if (victim->lsn > LwLsnCache->maxLastWrittenLsn) LwLsnCache->maxLastWrittenLsn = victim->lsn; hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL); } } /* Link to the end of LRU list */ dlist_push_tail(&LwLsnCache->lastWrittenLsnLRU, &entry->lru_node); max = Max(max, lsn); } LWLockRelease(LastWrittenLsnLock); return max; } /* * SetLastWrittenLSNForBlock -- Set maximal LSN for block */ XLogRecPtr neon_set_lwlsn_block(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber blkno) { return neon_set_lwlsn_block_range(lsn, rlocator, forknum, blkno, 1); } /* * neon_set_lwlsn_relation -- Set maximal LSN for relation metadata 
*/ XLogRecPtr neon_set_lwlsn_relation(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum) { return neon_set_lwlsn_block(lsn, rlocator, forknum, REL_METADATA_PSEUDO_BLOCKNO); } /* * neon_set_lwlsn_db -- Set maximal LSN for the whole database */ XLogRecPtr neon_set_lwlsn_db(XLogRecPtr lsn) { NRelFileInfo dummyNode = {InvalidOid, InvalidOid, InvalidOid}; return neon_set_lwlsn_block(lsn, dummyNode, MAIN_FORKNUM, 0); } ================================================ FILE: pgxn/neon/neon_lwlsncache.h ================================================ #ifndef NEON_LWLSNCACHE_H #define NEON_LWLSNCACHE_H #include "neon_pgversioncompat.h" void init_lwlsncache(void); /* Hooks */ XLogRecPtr neon_get_lwlsn(NRelFileInfo rlocator, ForkNumber forknum, BlockNumber blkno); void neon_get_lwlsn_v(NRelFileInfo relfilenode, ForkNumber forknum, BlockNumber blkno, int nblocks, XLogRecPtr *lsns); XLogRecPtr neon_set_lwlsn_block_range(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks); XLogRecPtr neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode, ForkNumber forknum, BlockNumber blockno, int nblocks); XLogRecPtr neon_set_lwlsn_block(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber blkno); XLogRecPtr neon_set_lwlsn_relation(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum); XLogRecPtr neon_set_lwlsn_db(XLogRecPtr lsn); #endif /* NEON_LWLSNCACHE_H */ ================================================ FILE: pgxn/neon/neon_perf_counters.c ================================================ /*------------------------------------------------------------------------- * * neon_perf_counters.c * Collect statistics about Neon I/O * * Each backend has its own set of counters in shared memory. 
* *------------------------------------------------------------------------- */ #include "postgres.h" #include #include "funcapi.h" #include "miscadmin.h" #include "storage/proc.h" #include "storage/shmem.h" #include "utils/builtins.h" #include "neon.h" #include "neon_perf_counters.h" #include "walproposer.h" /* BEGIN_HADRON */ databricks_metrics *databricks_metrics_shared; Size DatabricksMetricsShmemSize(void) { return sizeof(databricks_metrics); } void DatabricksMetricsShmemInit(void) { bool found; databricks_metrics_shared = ShmemInitStruct("Databricks counters", DatabricksMetricsShmemSize(), &found); Assert(found == IsUnderPostmaster); if (!found) { pg_atomic_init_u32(&databricks_metrics_shared->index_corruption_count, 0); pg_atomic_init_u32(&databricks_metrics_shared->data_corruption_count, 0); pg_atomic_init_u32(&databricks_metrics_shared->internal_error_count, 0); pg_atomic_init_u32(&databricks_metrics_shared->ps_corruption_detected, 0); } } /* END_HADRON */ neon_per_backend_counters *neon_per_backend_counters_shared; void NeonPerfCountersShmemRequest(void) { Size size; #if PG_MAJORVERSION_NUM < 15 /* Hack: in PG14 MaxBackends is not initialized at the time of calling NeonPerfCountersShmemRequest function. 
* Do it ourselves and then undo to prevent assertion failure */ Assert(MaxBackends == 0); /* not initialized yet */ InitializeMaxBackends(); size = mul_size(NUM_NEON_PERF_COUNTER_SLOTS, sizeof(neon_per_backend_counters)); MaxBackends = 0; #else size = mul_size(NUM_NEON_PERF_COUNTER_SLOTS, sizeof(neon_per_backend_counters)); #endif if (lakebase_mode) { size = add_size(size, DatabricksMetricsShmemSize()); } RequestAddinShmemSpace(size); } void NeonPerfCountersShmemInit(void) { bool found; neon_per_backend_counters_shared = ShmemInitStruct("Neon perf counters", mul_size(NUM_NEON_PERF_COUNTER_SLOTS, sizeof(neon_per_backend_counters)), &found); Assert(found == IsUnderPostmaster); if (!found) { /* shared memory is initialized to zeros, so nothing to do here */ } } static inline void inc_iohist(IOHistogram hist, uint64 latency_us) { int lo = 0; int hi = NUM_IO_WAIT_BUCKETS - 1; /* Find the right bucket with binary search */ while (lo < hi) { int mid = (lo + hi) / 2; if (latency_us < io_wait_bucket_thresholds[mid]) hi = mid; else lo = mid + 1; } hist->wait_us_bucket[lo]++; hist->wait_us_sum += latency_us; hist->wait_us_count++; } static inline void inc_qthist(QTHistogram hist, uint64 elapsed_us) { int lo = 0; int hi = NUM_QT_BUCKETS - 1; /* Find the right bucket with binary search */ while (lo < hi) { int mid = (lo + hi) / 2; if (elapsed_us < qt_bucket_thresholds[mid]) hi = mid; else lo = mid + 1; } hist->elapsed_us_bucket[lo]++; hist->elapsed_us_sum += elapsed_us; hist->elapsed_us_count++; } /* * Count a GetPage wait operation. */ void inc_getpage_wait(uint64 latency) { inc_iohist(&MyNeonCounters->getpage_hist, latency); } /* * Count an LFC read wait operation. */ void inc_page_cache_read_wait(uint64 latency) { inc_iohist(&MyNeonCounters->file_cache_read_hist, latency); } /* * Count an LFC write wait operation. 
*/ void inc_page_cache_write_wait(uint64 latency) { inc_iohist(&MyNeonCounters->file_cache_write_hist, latency); } void inc_query_time(uint64 elapsed) { inc_qthist(&MyNeonCounters->query_time_hist, elapsed); } /* * Support functions for the views, neon_backend_perf_counters and * neon_perf_counters. */ typedef struct { const char *name; bool is_bucket; double bucket_le; double value; } metric_t; static int io_histogram_to_metrics(IOHistogram histogram, metric_t *metrics, const char *count, const char *sum, const char *bucket) { int i = 0; uint64 bucket_accum = 0; metrics[i].name = count; metrics[i].is_bucket = false; metrics[i].value = (double) histogram->wait_us_count; i++; metrics[i].name = sum; metrics[i].is_bucket = false; metrics[i].value = (double) histogram->wait_us_sum / 1000000.0; i++; for (int bucketno = 0; bucketno < NUM_IO_WAIT_BUCKETS; bucketno++) { uint64 threshold = io_wait_bucket_thresholds[bucketno]; bucket_accum += histogram->wait_us_bucket[bucketno]; metrics[i].name = bucket; metrics[i].is_bucket = true; metrics[i].bucket_le = (threshold == UINT64_MAX) ? INFINITY : ((double) threshold) / 1000000.0; metrics[i].value = (double) bucket_accum; i++; } return i; } static int qt_histogram_to_metrics(QTHistogram histogram, metric_t *metrics, const char *count, const char *sum, const char *bucket) { int i = 0; uint64 bucket_accum = 0; metrics[i].name = count; metrics[i].is_bucket = false; metrics[i].value = (double) histogram->elapsed_us_count; i++; metrics[i].name = sum; metrics[i].is_bucket = false; metrics[i].value = (double) histogram->elapsed_us_sum / 1000000.0; i++; for (int bucketno = 0; bucketno < NUM_QT_BUCKETS; bucketno++) { uint64 threshold = qt_bucket_thresholds[bucketno]; bucket_accum += histogram->elapsed_us_bucket[bucketno]; metrics[i].name = bucket; metrics[i].is_bucket = true; metrics[i].bucket_le = (threshold == UINT64_MAX) ? 
INFINITY : ((double) threshold) / 1000000.0; metrics[i].value = (double) bucket_accum; i++; } return i; } static metric_t * neon_perf_counters_to_metrics(neon_per_backend_counters *counters) { #define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + (2 + NUM_QT_BUCKETS) + 12) metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); int i = 0; #define APPEND_METRIC(_name) do { \ metrics[i].name = #_name; \ metrics[i].is_bucket = false; \ metrics[i].value = (double) counters->_name; \ i++; \ } while (false) i += io_histogram_to_metrics(&counters->getpage_hist, &metrics[i], "getpage_wait_seconds_count", "getpage_wait_seconds_sum", "getpage_wait_seconds_bucket"); APPEND_METRIC(getpage_prefetch_requests_total); APPEND_METRIC(getpage_sync_requests_total); APPEND_METRIC(compute_getpage_stuck_requests_total); APPEND_METRIC(compute_getpage_max_inflight_stuck_time_ms); APPEND_METRIC(getpage_prefetch_misses_total); APPEND_METRIC(getpage_prefetch_discards_total); APPEND_METRIC(pageserver_requests_sent_total); APPEND_METRIC(pageserver_disconnects_total); APPEND_METRIC(pageserver_send_flushes_total); APPEND_METRIC(pageserver_open_requests); APPEND_METRIC(getpage_prefetches_buffered); APPEND_METRIC(file_cache_hits_total); i += io_histogram_to_metrics(&counters->file_cache_read_hist, &metrics[i], "file_cache_read_wait_seconds_count", "file_cache_read_wait_seconds_sum", "file_cache_read_wait_seconds_bucket"); i += io_histogram_to_metrics(&counters->file_cache_write_hist, &metrics[i], "file_cache_write_wait_seconds_count", "file_cache_write_wait_seconds_sum", "file_cache_write_wait_seconds_bucket"); i += qt_histogram_to_metrics(&counters->query_time_hist, &metrics[i], "query_time_seconds_count", "query_time_seconds_sum", "query_time_seconds_bucket"); Assert(i == NUM_METRICS); #undef APPEND_METRIC #undef NUM_METRICS /* NULL entry marks end of array */ metrics[i].name = NULL; metrics[i].value = 0; return metrics; } /* * Write metric to three output Datums */ static void 
metric_to_datums(metric_t *m, Datum *values, bool *nulls) { values[0] = CStringGetTextDatum(m->name); nulls[0] = false; if (m->is_bucket) { values[1] = Float8GetDatum(m->bucket_le); nulls[1] = false; } else { values[1] = (Datum) 0; nulls[1] = true; } values[2] = Float8GetDatum(m->value); nulls[2] = false; } PG_FUNCTION_INFO_V1(neon_get_backend_perf_counters); Datum neon_get_backend_perf_counters(PG_FUNCTION_ARGS) { ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; Datum values[5]; bool nulls[5]; /* We put all the tuples into a tuplestore in one go. */ InitMaterializedSRF(fcinfo, 0); for (int procno = 0; procno < NUM_NEON_PERF_COUNTER_SLOTS; procno++) { PGPROC *proc = GetPGProcByNumber(procno); int pid = proc->pid; neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno]; metric_t *metrics = neon_perf_counters_to_metrics(counters); values[0] = Int32GetDatum(procno); nulls[0] = false; values[1] = Int32GetDatum(pid); nulls[1] = false; for (int i = 0; metrics[i].name != NULL; i++) { metric_to_datums(&metrics[i], &values[2], &nulls[2]); tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); } pfree(metrics); } return (Datum) 0; } static inline void io_histogram_merge_into(IOHistogram into, IOHistogram from) { into->wait_us_count += from->wait_us_count; into->wait_us_sum += from->wait_us_sum; for (int bucketno = 0; bucketno < NUM_IO_WAIT_BUCKETS; bucketno++) into->wait_us_bucket[bucketno] += from->wait_us_bucket[bucketno]; } static inline void qt_histogram_merge_into(QTHistogram into, QTHistogram from) { into->elapsed_us_count += from->elapsed_us_count; into->elapsed_us_sum += from->elapsed_us_sum; for (int bucketno = 0; bucketno < NUM_QT_BUCKETS; bucketno++) into->elapsed_us_bucket[bucketno] += from->elapsed_us_bucket[bucketno]; } PG_FUNCTION_INFO_V1(neon_get_perf_counters); Datum neon_get_perf_counters(PG_FUNCTION_ARGS) { ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; Datum values[3]; bool nulls[3]; 
neon_per_backend_counters totals = {0}; metric_t *metrics; /* BEGIN_HADRON */ WalproposerShmemState *wp_shmem; uint32 num_safekeepers; uint32 num_active_safekeepers; /* END_HADRON */ /* We put all the tuples into a tuplestore in one go. */ InitMaterializedSRF(fcinfo, 0); /* Aggregate the counters across all backends */ for (int procno = 0; procno < NUM_NEON_PERF_COUNTER_SLOTS; procno++) { neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno]; io_histogram_merge_into(&totals.getpage_hist, &counters->getpage_hist); totals.getpage_prefetch_requests_total += counters->getpage_prefetch_requests_total; totals.getpage_sync_requests_total += counters->getpage_sync_requests_total; totals.getpage_prefetch_misses_total += counters->getpage_prefetch_misses_total; totals.getpage_prefetch_discards_total += counters->getpage_prefetch_discards_total; totals.pageserver_requests_sent_total += counters->pageserver_requests_sent_total; totals.pageserver_disconnects_total += counters->pageserver_disconnects_total; totals.pageserver_send_flushes_total += counters->pageserver_send_flushes_total; totals.pageserver_open_requests += counters->pageserver_open_requests; totals.getpage_prefetches_buffered += counters->getpage_prefetches_buffered; totals.file_cache_hits_total += counters->file_cache_hits_total; totals.compute_getpage_stuck_requests_total += counters->compute_getpage_stuck_requests_total; totals.compute_getpage_max_inflight_stuck_time_ms = Max( totals.compute_getpage_max_inflight_stuck_time_ms, counters->compute_getpage_max_inflight_stuck_time_ms); io_histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist); io_histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist); qt_histogram_merge_into(&totals.query_time_hist, &counters->query_time_hist); } metrics = neon_perf_counters_to_metrics(&totals); for (int i = 0; metrics[i].name != NULL; i++) { metric_to_datums(&metrics[i], &values[0], &nulls[0]); 
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); } if (lakebase_mode) { if (databricks_test_hook == TestHookCorruption) { ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("test corruption"))); } // Not ideal but piggyback our databricks counters into the neon perf counters view // so that we don't need to introduce neon--1.x+1.sql to add a new view. { // Keeping this code in its own block to work around the C90 "don't mix declarations and code" rule when we define // the `databricks_metrics` array in the next block. Yes, we are seriously dealing with C90 rules in 2025. // Read safekeeper status from wal proposer shared memory first. // Note that we are taking a mutex when reading from walproposer shared memory so that the total safekeeper count is // consistent with the active wal acceptors count. Assuming that we don't query this view too often the mutex should // not be a huge deal. wp_shmem = GetWalpropShmemState(); SpinLockAcquire(&wp_shmem->mutex); num_safekeepers = wp_shmem->num_safekeepers; num_active_safekeepers = 0; for (int i = 0; i < num_safekeepers; i++) { if (wp_shmem->safekeeper_status[i] == 1) { num_active_safekeepers++; } } SpinLockRelease(&wp_shmem->mutex); } { metric_t databricks_metrics[] = { {"sql_index_corruption_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->index_corruption_count)}, {"sql_data_corruption_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->data_corruption_count)}, {"sql_internal_error_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->internal_error_count)}, {"ps_corruption_detected", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->ps_corruption_detected)}, {"num_active_safekeepers", false, 0.0, (double) num_active_safekeepers}, {"num_configured_safekeepers", false, 0.0, (double) num_safekeepers}, {NULL, false, 0, 0}, }; for (int i = 0; databricks_metrics[i].name != NULL; i++) { 
metric_to_datums(&databricks_metrics[i], &values[0], &nulls[0]); tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); } } /* END_HADRON */ } pfree(metrics); return (Datum) 0; } ================================================ FILE: pgxn/neon/neon_perf_counters.h ================================================ /*------------------------------------------------------------------------- * * neon_perf_counters.h * Performance counters for neon storage requests *------------------------------------------------------------------------- */ #ifndef NEON_PERF_COUNTERS_H #define NEON_PERF_COUNTERS_H #if PG_VERSION_NUM >= 170000 #include "storage/procnumber.h" #else #include "storage/backendid.h" #endif #include "storage/proc.h" static const uint64 io_wait_bucket_thresholds[] = { 2, 3, 6, 10, /* 0 us - 10 us */ 20, 30, 60, 100, /* 10 us - 100 us */ 200, 300, 600, 1000, /* 100 us - 1 ms */ 2000, 3000, 6000, 10000, /* 1 ms - 10 ms */ 20000, 30000, 60000, 100000, /* 10 ms - 100 ms */ 200000, 300000, 600000, 1000000, /* 100 ms - 1 s */ 2000000, 3000000, 6000000, 10000000, /* 1 s - 10 s */ UINT64_MAX, }; #define NUM_IO_WAIT_BUCKETS (lengthof(io_wait_bucket_thresholds)) typedef struct IOHistogramData { uint64 wait_us_count; uint64 wait_us_sum; uint64 wait_us_bucket[NUM_IO_WAIT_BUCKETS]; } IOHistogramData; typedef IOHistogramData *IOHistogram; static const uint64 qt_bucket_thresholds[] = { 2, 3, 6, 10, /* 0 us - 10 us */ 20, 30, 60, 100, /* 10 us - 100 us */ 200, 300, 600, 1000, /* 100 us - 1 ms */ 2000, 3000, 6000, 10000, /* 1 ms - 10 ms */ 20000, 30000, 60000, 100000, /* 10 ms - 100 ms */ 200000, 300000, 600000, 1000000, /* 100 ms - 1 s */ 2000000, 3000000, 6000000, 10000000, /* 1 s - 10 s */ 20000000, 30000000, 60000000, 100000000, /* 10 s - 100 s */ UINT64_MAX, }; #define NUM_QT_BUCKETS (lengthof(qt_bucket_thresholds)) typedef struct QTHistogramData { uint64 elapsed_us_count; uint64 elapsed_us_sum; uint64 elapsed_us_bucket[NUM_QT_BUCKETS]; } QTHistogramData; 
/*
 * Performance counters kept for one backend (or auxiliary process).
 *
 * The struct holds plain uint64 counters and embedded histograms; an array
 * of these lives in shared memory, see neon_per_backend_counters_shared.
 */
typedef struct
{
	/*
	 * Histogram for how long an smgrread() request needs to wait for response
	 * from pageserver. When prefetching is effective, these wait times can be
	 * lower than the network latency to the pageserver, even zero, if the
	 * page is already readily prefetched whenever we need to read a page.
	 *
	 * Note: we accumulate these in microseconds, because that's convenient in
	 * the backend, but the 'neon_backend_perf_counters' view will convert
	 * them to seconds, to make them more idiomatic as prometheus metrics.
	 */
	IOHistogramData getpage_hist;

	/*
	 * Total number of speculative prefetch GetPage requests and synchronous
	 * GetPage requests sent.
	 */
	uint64		getpage_prefetch_requests_total;
	uint64		getpage_sync_requests_total;

	/*
	 * Total number of GetPage requests left without an answer for more than
	 * pageserver_response_log_timeout but less than
	 * pageserver_response_disconnect_timeout.
	 */
	uint64		compute_getpage_stuck_requests_total;

	/*
	 * Longest waiting time for active stuck requests. If a stuck request gets
	 * a response or disconnects, this metric is updated.
	 */
	uint64		compute_getpage_max_inflight_stuck_time_ms;

	/*
	 * Total number of readahead misses; consisting of either prefetches that
	 * don't satisfy the LSN bounds, or cases where no readahead was issued
	 * for the read.
	 */
	uint64		getpage_prefetch_misses_total;

	/*
	 * Number of prefetched responses that were discarded because the
	 * prefetched page was not needed or because it was concurrently fetched /
	 * modified by another backend.
	 */
	uint64		getpage_prefetch_discards_total;

	/*
	 * Total number of requests sent to pageserver. (prefetch_requests_total
	 * and sync_requests_total count only GetPage requests, this counts all
	 * request types.)
	 */
	uint64		pageserver_requests_sent_total;

	/*
	 * Number of times the connection to the pageserver was lost and the
	 * backend had to reconnect. Note that this doesn't count the first
	 * connection in each backend, only reconnects.
	 */
	uint64		pageserver_disconnects_total;

	/*
	 * Number of network flushes to the pageserver. Synchronous requests are
	 * flushed immediately, but when prefetching requests are sent in batches,
	 * this can be smaller than pageserver_requests_sent_total.
	 */
	uint64		pageserver_send_flushes_total;

	/*
	 * Number of open requests to PageServer.
	 */
	uint64		pageserver_open_requests;

	/*
	 * Number of unused prefetches currently cached in this backend.
	 */
	uint64		getpage_prefetches_buffered;

	/*
	 * Number of requests satisfied from the LFC.
	 *
	 * This is redundant with the server-wide file_cache_hits, but this gives
	 * per-backend granularity, and it's handy to have this in the same place
	 * as counters for requests that went to the pageserver. Maybe move all
	 * the LFC stats to this struct in the future?
	 */
	uint64		file_cache_hits_total;

	/* LFC I/O time buckets */
	IOHistogramData file_cache_read_hist;
	IOHistogramData file_cache_write_hist;

	/*
	 * Histogram of query execution time.
	 */
	QTHistogramData query_time_hist;
} neon_per_backend_counters;
*/ #define NUM_NEON_PERF_COUNTER_SLOTS (MaxBackends + NUM_AUXILIARY_PROCS) #define MyNeonCounters (&neon_per_backend_counters_shared[MyProcNumber]) extern void inc_getpage_wait(uint64 latency); extern void inc_page_cache_read_wait(uint64 latency); extern void inc_page_cache_write_wait(uint64 latency); extern void inc_query_time(uint64 elapsed); extern Size NeonPerfCountersShmemSize(void); extern void NeonPerfCountersShmemInit(void); /* BEGIN_HADRON */ typedef struct { pg_atomic_uint32 index_corruption_count; pg_atomic_uint32 data_corruption_count; pg_atomic_uint32 internal_error_count; pg_atomic_uint32 ps_corruption_detected; } databricks_metrics; extern databricks_metrics *databricks_metrics_shared; extern Size DatabricksMetricsShmemSize(void); extern void DatabricksMetricsShmemInit(void); extern int databricks_test_hook; static const int TestHookCorruption = 1; /* END_HADRON */ #endif /* NEON_PERF_COUNTERS_H */ ================================================ FILE: pgxn/neon/neon_pgversioncompat.c ================================================ /* * Support functions for the compatibility macros in neon_pgversioncompat.h */ #include "postgres.h" #include "funcapi.h" #include "miscadmin.h" #include "access/xlog.h" #include "utils/tuplestore.h" #include "neon_pgversioncompat.h" #if PG_MAJORVERSION_NUM < 15 void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags) { ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; Tuplestorestate *tupstore; MemoryContext old_context, per_query_ctx; TupleDesc stored_tupdesc; /* check to see if caller supports returning a tuplestore */ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("set-valued function called in context that cannot accept a set"))); /* * Store the tuplestore and the tuple descriptor in ReturnSetInfo. This * must be done in the per-query memory context. 
*/ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; old_context = MemoryContextSwitchTo(per_query_ctx); if (get_call_result_type(fcinfo, NULL, &stored_tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); tupstore = tuplestore_begin_heap(false, false, work_mem); rsinfo->returnMode = SFRM_Materialize; rsinfo->setResult = tupstore; rsinfo->setDesc = stored_tupdesc; MemoryContextSwitchTo(old_context); } TimeLineID GetWALInsertionTimeLine(void) { return ThisTimeLineID + 1; } #endif ================================================ FILE: pgxn/neon/neon_pgversioncompat.h ================================================ /* * Compatibility macros to cover up differences between supported PostgreSQL versions, * to help with compiling the same sources for all of them. */ #ifndef NEON_PGVERSIONCOMPAT_H #define NEON_PGVERSIONCOMPAT_H #include "fmgr.h" #include "storage/buf_internals.h" #if PG_MAJORVERSION_NUM < 16 typedef PGAlignedBlock PGIOAlignedBlock; #endif #if PG_MAJORVERSION_NUM < 17 #define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId) #else #define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != INVALID_PROC_NUMBER) #endif #define RelFileInfoEquals(a, b) ( \ NInfoGetSpcOid(a) == NInfoGetSpcOid(b) && \ NInfoGetDbOid(a) == NInfoGetDbOid(b) && \ NInfoGetRelNumber(a) == NInfoGetRelNumber(b) \ ) /* These macros were turned into static inline functions in v16 */ #if PG_MAJORVERSION_NUM < 16 static inline bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2) { return BUFFERTAGS_EQUAL(*tag1, *tag2); } static inline void InitBufferTag(BufferTag *tag, const RelFileNode *rnode, ForkNumber forkNum, BlockNumber blockNum) { INIT_BUFFERTAG(*tag, *rnode, forkNum, blockNum); } #endif /* RelFileNode -> RelFileLocator rework */ #if PG_MAJORVERSION_NUM < 16 #define USE_RELFILENODE #define RELFILEINFO_HDR "storage/relfilenode.h" #define NRelFileInfo RelFileNode #define NRelFileInfoBackend RelFileNodeBackend #define 
/*
 * Fill in a BufferTag from its individual components (pre-v16 layout, where
 * the relation identity lives in the embedded RelFileNode 'rnode').
 *
 * The fields are assigned one by one, exactly like the v16+ variant of this
 * macro does.  Going through a block-local temporary named 'rnode' would
 * capture any argument expression that mentions a caller variable of the same
 * name (e.g. BufTagInit(tag, rnode.relNode, ...)), silently reading the
 * uninitialized temporary instead.
 */
#define BufTagInit(tag, rel_number, fork_number, block_number, spc_oid, db_oid) \
	do { \
		(tag).forkNum = (fork_number); \
		(tag).blockNum = (block_number); \
		(tag).rnode.spcNode = (spc_oid); \
		(tag).rnode.dbNode = (db_oid); \
		(tag).rnode.relNode = (rel_number); \
	} while (false)
/*
 * printf-style format strings for 64-bit integers in hexadecimal.
 *
 * These are not present in PG17 and older but are available in PG18+, so
 * only define them when the server headers have not done so already;
 * otherwise we would redefine a core macro with a different expansion.
 */
#ifndef INT64_HEX_FORMAT
#define INT64_HEX_FORMAT "%" INT64_MODIFIER "x"
#endif
#ifndef UINT64_HEX_FORMAT
#define UINT64_HEX_FORMAT "%" INT64_MODIFIER "x"
#endif
/*
 * Decode a hex string into a byte string, 2 hex chars per byte.
 *
 * Reads up to 2 * nbytes characters from 'input' and writes up to nbytes
 * bytes to 'result'.
 *
 * Returns false if invalid characters are encountered; otherwise true.  On
 * failure the bytes decoded so far have already been written to 'result'.
 *
 * Each character is validated before the next one is read.  A
 * NUL-terminated input that is shorter than 2 * nbytes therefore fails at
 * its terminator (NUL is not a hex digit) without touching the byte that
 * follows it.
 */
bool
HexDecodeString(uint8 *result, char *input, int nbytes)
{
	int			i;

	for (i = 0; i < nbytes; ++i)
	{
		int			hi;
		int			lo;

		hi = HexDecodeChar(input[i * 2]);
		if (hi < 0)
			return false;

		/* only now is it safe to look at the second character of the pair */
		lo = HexDecodeChar(input[i * 2 + 1]);
		if (lo < 0)
			return false;

		result[i] = hi * 16 + lo;
	}

	return true;
}
/*
 * Disables core dump for the current process.
 *
 * Sets both the soft and the hard RLIMIT_CORE limit to zero.  A failure is
 * not fatal: a warning line is written to stderr and the function returns
 * normally.
 *
 * When built as WALPROPOSER_LIB (simulation mode) this is a no-op.
 */
void
disable_core_dump(void)
{
#ifdef WALPROPOSER_LIB
	/* skip in simulation mode */
	return;
#else
	struct rlimit rlim;

	rlim.rlim_cur = 0;
	rlim.rlim_max = 0;
	if (setrlimit(RLIMIT_CORE, &rlim))
	{
		/* fprintf may clobber errno, so save it first */
		int			save_errno = errno;

		/* terminate the line so it doesn't run into the next log message */
		fprintf(stderr, "WARNING: disable cores setrlimit failed: %s\n",
				strerror(save_errno));
	}
#endif
}
*/ bool BufferTagIsValid(const BufferTag *tag) { #if PG_MAJORVERSION_NUM >= 16 return (tag->spcOid != InvalidOid) && (tag->relNumber != InvalidRelFileNumber) && (tag->forkNum != InvalidForkNumber) && (tag->blockNum != InvalidBlockNumber); #else return (tag->rnode.spcNode != InvalidOid) && (tag->rnode.relNode != InvalidOid) && (tag->forkNum != InvalidForkNumber) && (tag->blockNum != InvalidBlockNumber); #endif } ================================================ FILE: pgxn/neon/neon_utils.h ================================================ #ifndef __NEON_UTILS_H__ #define __NEON_UTILS_H__ #include "lib/stringinfo.h" #include "storage/buf_internals.h" #ifndef WALPROPOSER_LIB #include #endif bool HexDecodeString(uint8 *result, char *input, int nbytes); uint16 pq_getmsgint16(StringInfo msg); uint32 pq_getmsgint32(StringInfo msg); uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); void pq_sendint32_le(StringInfo buf, uint32 i); void pq_sendint64_le(StringInfo buf, uint64 i); void disable_core_dump(void); /* Buffer tag validation function */ bool BufferTagIsValid(const BufferTag *tag); #ifndef WALPROPOSER_LIB CURL * alloc_curl_handle(void); #endif #endif /* __NEON_UTILS_H__ */ ================================================ FILE: pgxn/neon/neon_walreader.c ================================================ /* * Like WALRead, but when WAL segment doesn't exist locally instead of throwing * ERROR asynchronously tries to fetch it from the most advanced safekeeper. * * We can't use libpqwalreceiver as it blocks during connection establishment * (and waiting for PQExec result), so use libpqwalproposer instead. * * TODO: keepalives are currently never sent, so the other side can close the * connection prematurely. * * TODO: close conn if reading takes too long to prevent stuck connections. 
/*
 * State of connection to donor safekeeper.
 *
 * The states are listed in the order a connection normally moves through
 * them: connect (alternating between waiting for the socket to become
 * readable and writable, as directed by PQconnectPoll), wait for the
 * START_REPLICATION result, then stream.  NeonWALReaderResetRemote() puts the
 * reader back into RS_NONE.
 */
typedef enum
{
	/* no remote connection */
	RS_NONE,
	/* doing PQconnectPoll, need readable socket */
	RS_CONNECTING_READ,
	/* doing PQconnectPoll, need writable socket */
	RS_CONNECTING_WRITE,
	/* Waiting for START_REPLICATION result */
	RS_WAIT_EXEC_RESULT,
	/* replication stream established */
	RS_ESTABLISHED,
} NeonWALReaderRemoteState;
*/ XLogRecPtr req_lsn; Size req_len; Size req_progress; char donor_conninfo[MAXCONNINFO]; char donor_name[64]; /* saved donor safekeeper name for logging */ XLogRecPtr donor_lsn; /* state of connection to safekeeper */ NeonWALReaderRemoteState rem_state; WalProposerConn *wp_conn; /* * position in wp_conn recvbuf from which we'll copy WAL next time, or * NULL if there is no unprocessed message */ char *wal_ptr; Size wal_rem_len; /* how many unprocessed bytes left in recvbuf */ /* * LSN of wal_ptr position according to walsender to cross check against * read request */ XLogRecPtr rem_lsn; /* prepended to lines logged by neon_walreader, if provided */ char log_prefix[64]; }; /* palloc and initialize NeonWALReader */ NeonWALReader * NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix, TimeLineID tlid) { NeonWALReader *reader; /* * Note: we allocate in TopMemoryContext, reusing the reader for all process * reads. */ reader = (NeonWALReader *) MemoryContextAllocZero(TopMemoryContext, sizeof(NeonWALReader)); reader->available_lsn = available_lsn; reader->local_active_tlid = tlid; reader->seg.ws_file = -1; reader->seg.ws_segno = 0; reader->seg.ws_tli = 0; reader->segcxt.ws_segsize = wal_segment_size; reader->rem_state = RS_NONE; if (log_prefix) strlcpy(reader->log_prefix, log_prefix, sizeof(reader->log_prefix)); return reader; } void NeonWALReaderFree(NeonWALReader *state) { if (state->seg.ws_file != -1) neon_wal_segment_close(state); if (state->wp_conn) libpqwp_disconnect(state->wp_conn); pfree(state); } /* * Like vanilla WALRead, but if requested position is before available_lsn or * WAL segment doesn't exist on disk, it tries to fetch needed segment from the * advanced safekeeper. * * Read 'count' bytes into 'buf', starting at location 'startptr', from WAL * fetched from timeline 'tli'. * * Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error * occurs, in which case 'err' has the description. 
Error always closes remote * connection, if there was any, so socket subscription should be removed. * * NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with * NeonWALReaderSocket and call NeonWALRead again with exactly the same * arguments when NeonWALReaderEvents happen on the socket. Note that per libpq * docs during connection establishment (before first successful read) socket * underneath might change. * * Also, eventually walreader should switch from remote to local read; caller * should remove subscription to socket then by checking NeonWALReaderEvents * after successful read (otherwise next read might reopen the connection with * different socket). * * Reading not monotonically is not supported and will result in error. * * Caller should be sure that WAL up to requested LSN exists, otherwise * NEON_WALREAD_WOULDBLOCK might be always returned. */ NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli) { /* * If requested data is before known available basebackup lsn or there is * already active remote state, do remote read. */ if (startptr < state->available_lsn || state->rem_state != RS_NONE) { return NeonWALReadRemote(state, buf, startptr, count, tli); } if (NeonWALReadLocal(state, buf, startptr, count, tli)) { return NEON_WALREAD_SUCCESS; } else if (state->wre_errno == ENOENT) { nwr_log(LOG, "local read at %X/%X len %zu failed as segment file doesn't exist, attempting remote", LSN_FORMAT_ARGS(startptr), count); return NeonWALReadRemote(state, buf, startptr, count, tli); } else { return NEON_WALREAD_ERROR; } } /* Do the read from remote safekeeper. 
/*
 * Do the read from remote safekeeper.
 *
 * This is a resumable state machine driven by state->rem_state.  Each call
 * advances as far as it can without blocking:
 *
 *   RS_NONE               pick a donor and start connecting
 *   RS_CONNECTING_*       drive PQconnectPoll; once connected, send
 *                         START_REPLICATION from 'startptr'
 *   RS_WAIT_EXEC_RESULT   wait for the START_REPLICATION result
 *   RS_ESTABLISHED        copy received WAL into 'buf'
 *
 * Progress of a partially served request is kept in state->req_lsn, req_len
 * and req_progress, so the caller must repeat the call with the same
 * 'startptr' and 'count' after NEON_WALREAD_WOULDBLOCK; changed arguments are
 * an error.
 *
 * Returns NEON_WALREAD_SUCCESS once 'count' bytes have been copied,
 * NEON_WALREAD_WOULDBLOCK when the socket must be waited on, or
 * NEON_WALREAD_ERROR with state->err_msg filled in.  On the error paths that
 * set err_msg after a connection was started, the remote state is reset via
 * NeonWALReaderResetRemote().
 */
static NeonWALReadResult
NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
{
	if (state->rem_state == RS_NONE)
	{
		if (!NeonWALReaderUpdateDonor(state))
		{
			snprintf(state->err_msg, sizeof(state->err_msg),
					 "failed to establish remote connection to fetch WAL: no donor available");
			return NEON_WALREAD_ERROR;
		}
		/* no connection yet; start one */
		nwr_log(LOG, "establishing connection to %s, lsn=%X/%X to fetch WAL", state->donor_name, LSN_FORMAT_ARGS(state->donor_lsn));
		state->wp_conn = libpqwp_connect_start(state->donor_conninfo);
		if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD)
		{
			snprintf(state->err_msg, sizeof(state->err_msg),
					 "failed to connect to %s to fetch WAL: immediately failed with %s",
					 state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
			NeonWALReaderResetRemote(state);
			return NEON_WALREAD_ERROR;
		}
		/* we'll poll immediately */
		state->rem_state = RS_CONNECTING_WRITE;
		return NEON_WALREAD_WOULDBLOCK;
	}

	if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE)
	{
		switch (PQconnectPoll(state->wp_conn->pg_conn))
		{
			case PGRES_POLLING_FAILED:
				snprintf(state->err_msg, sizeof(state->err_msg),
						 "failed to connect to %s to fetch WAL: poll error: %s",
						 state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
				NeonWALReaderResetRemote(state);
				return NEON_WALREAD_ERROR;
			case PGRES_POLLING_READING:
				state->rem_state = RS_CONNECTING_READ;
				return NEON_WALREAD_WOULDBLOCK;
			case PGRES_POLLING_WRITING:
				state->rem_state = RS_CONNECTING_WRITE;
				return NEON_WALREAD_WOULDBLOCK;
			case PGRES_POLLING_OK:
				{
					/* connection successfully established */
					char		start_repl_query[128];
					term_t		term = pg_atomic_read_u64(&GetWalpropShmemState()->mineLastElectedTerm);

					/*
					 * Set elected walproposer's term to pull only data from
					 * its history. Note: for logical walsender it means we
					 * might stream WAL not yet committed by safekeepers. It
					 * would be cleaner to fix this.
					 *
					 * mineLastElectedTerm shouldn't be 0 at this point
					 * because we checked above that donor exists and it
					 * appears only after successful election.
					 */
					Assert(term > 0);
					snprintf(start_repl_query, sizeof(start_repl_query),
							 "START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')",
							 LSN_FORMAT_ARGS(startptr), term);
					nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s",
							state->donor_name, start_repl_query);
					if (!libpqwp_send_query(state->wp_conn, start_repl_query))
					{
						snprintf(state->err_msg, sizeof(state->err_msg),
								 "failed to send %s query to %s: %s",
								 start_repl_query, state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
						NeonWALReaderResetRemote(state);
						return NEON_WALREAD_ERROR;
					}
					/* fall out of the switch straight into the result check */
					state->rem_state = RS_WAIT_EXEC_RESULT;
					break;
				}

			default:			/* there is unused PGRES_POLLING_ACTIVE */
				Assert(false);
				return NEON_WALREAD_ERROR;	/* keep the compiler quiet */
		}
	}

	if (state->rem_state == RS_WAIT_EXEC_RESULT)
	{
		switch (libpqwp_get_query_result(state->wp_conn))
		{
			case WP_EXEC_SUCCESS_COPYBOTH:
				state->rem_state = RS_ESTABLISHED;
				break;
			case WP_EXEC_NEEDS_INPUT:
				return NEON_WALREAD_WOULDBLOCK;
			case WP_EXEC_FAILED:
				snprintf(state->err_msg, sizeof(state->err_msg),
						 "get START_REPLICATION result from %s failed: %s",
						 state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
				NeonWALReaderResetRemote(state);
				return NEON_WALREAD_ERROR;
			default:			/* can't happen */
				snprintf(state->err_msg, sizeof(state->err_msg),
						 "get START_REPLICATION result from %s: unexpected result",
						 state->donor_name);
				NeonWALReaderResetRemote(state);
				return NEON_WALREAD_ERROR;
		}
	}

	Assert(state->rem_state == RS_ESTABLISHED);

	/*
	 * If we had the request before, verify args are the same and advance the
	 * result ptr according to the progress; otherwise register the request.
	 */
	if (state->req_lsn != InvalidXLogRecPtr)
	{
		if (state->req_lsn != startptr || state->req_len != count)
		{
			snprintf(state->err_msg, sizeof(state->err_msg),
					 "args changed during request, was %X/%X %zu, now %X/%X %zu",
					 LSN_FORMAT_ARGS(state->req_lsn), state->req_len, LSN_FORMAT_ARGS(startptr), count);
			NeonWALReaderResetRemote(state);
			return NEON_WALREAD_ERROR;
		}
		nwr_log(DEBUG5, "continuing remote read at req_lsn=%X/%X len=%zu, req_progress=%zu",
				LSN_FORMAT_ARGS(startptr),
				count,
				state->req_progress);
		/* skip the part of the caller's buffer filled by earlier calls */
		buf += state->req_progress;
	}
	else
	{
		state->req_lsn = startptr;
		state->req_len = count;
		state->req_progress = 0;
		nwr_log(DEBUG5, "starting remote read req_lsn=%X/%X len=%zu",
				LSN_FORMAT_ARGS(startptr),
				count);
	}

	while (true)
	{
		Size		to_copy;

		/*
		 * If we have no ready data, receive new message.
		 */
		if (state->wal_rem_len == 0 &&

		/*
		 * check for the sake of 0 length reads; walproposer does these for
		 * heartbeats, though generally they shouldn't hit remote source.
		 */
			state->req_len - state->req_progress > 0)
		{
			NeonWALReadResult read_msg_res = NeonWALReaderReadMsg(state);

			if (read_msg_res != NEON_WALREAD_SUCCESS)
				return read_msg_res;
		}

		/* the stream position must match exactly where this request stands */
		if (state->req_lsn + state->req_progress != state->rem_lsn)
		{
			snprintf(state->err_msg, sizeof(state->err_msg),
					 "expected remote WAL at %X/%X but got %X/%X. Non monotonic read requests could have caused this. req_lsn=%X/%X len=%zu",
					 LSN_FORMAT_ARGS(state->req_lsn + state->req_progress),
					 LSN_FORMAT_ARGS(state->rem_lsn),
					 LSN_FORMAT_ARGS(state->req_lsn),
					 state->req_len);
			NeonWALReaderResetRemote(state);
			return NEON_WALREAD_ERROR;
		}

		/* We can copy min of (available, requested) bytes. */
		to_copy =
			Min(state->req_len - state->req_progress, state->wal_rem_len);
		memcpy(buf, state->wal_ptr, to_copy);
		state->wal_ptr += to_copy;
		state->wal_rem_len -= to_copy;
		state->rem_lsn += to_copy;
		if (state->wal_rem_len == 0)
			state->wal_ptr = NULL;	/* freed by libpqwalproposer */
		buf += to_copy;
		state->req_progress += to_copy;
		if (state->req_progress == state->req_len)
		{
			XLogSegNo	next_segno;
			XLogSegNo	req_segno;

			XLByteToSeg(state->req_lsn, req_segno, state->segcxt.ws_segsize);
			XLByteToSeg(state->rem_lsn, next_segno, state->segcxt.ws_segsize);

			/*
			 * Request completed. If there is a chance of serving next one
			 * locally, close the connection.
			 */
			if (state->req_lsn < state->available_lsn &&
				state->rem_lsn >= state->available_lsn)
			{
				nwr_log(LOG, "closing remote connection as available_lsn %X/%X crossed and next read at %X/%X is likely to be served locally",
						LSN_FORMAT_ARGS(state->available_lsn), LSN_FORMAT_ARGS(state->rem_lsn));
				NeonWALReaderResetRemote(state);
			}
			else if (state->rem_lsn >= state->available_lsn && next_segno > req_segno &&
					 is_wal_segment_exists(next_segno, state->segcxt.ws_segsize, tli))
			{
				nwr_log(LOG, "closing remote connection as WAL file at next lsn %X/%X exists",
						LSN_FORMAT_ARGS(state->rem_lsn));
				NeonWALReaderResetRemote(state);
			}
			/* mark that no request is in progress anymore */
			state->req_lsn = InvalidXLogRecPtr;
			state->req_len = 0;
			state->req_progress = 0;

			/* Update the current segment info. */
			state->seg.ws_tli = tli;

			return NEON_WALREAD_SUCCESS;
		}
	}
}
*/ static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state) { while (true) /* loop until we get 'w' */ { char *copydata_ptr; int copydata_size; StringInfoData s; char msg_type; int hdrlen; Assert(state->rem_state == RS_ESTABLISHED); Assert(state->wal_ptr == NULL && state->wal_rem_len == 0); switch (libpqwp_async_read(state->wp_conn, ©data_ptr, ©data_size)) { case PG_ASYNC_READ_SUCCESS: break; case PG_ASYNC_READ_TRY_AGAIN: return NEON_WALREAD_WOULDBLOCK; case PG_ASYNC_READ_FAIL: snprintf(state->err_msg, sizeof(state->err_msg), "req_lsn=%X/%X, req_len=%zu, req_progress=%zu, get copydata failed: %s", LSN_FORMAT_ARGS(state->req_lsn), state->req_len, state->req_progress, PQerrorMessage(state->wp_conn->pg_conn)); goto err; } /* put data on StringInfo to parse */ s.data = copydata_ptr; s.len = copydata_size; s.cursor = 0; s.maxlen = -1; if (copydata_size == 0) { snprintf(state->err_msg, sizeof(state->err_msg), "zero length copydata received"); goto err; } msg_type = pq_getmsgbyte(&s); switch (msg_type) { case 'w': { XLogRecPtr start_lsn; hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64); if (s.len - s.cursor < hdrlen) { snprintf(state->err_msg, sizeof(state->err_msg), "invalid WAL message received from primary"); goto err; } start_lsn = pq_getmsgint64(&s); pq_getmsgint64(&s); /* XLogRecPtr end_lsn; */ pq_getmsgint64(&s); /* TimestampTz send_time */ state->rem_lsn = start_lsn; state->wal_rem_len = (Size) (s.len - s.cursor); state->wal_ptr = (char *) pq_getmsgbytes(&s, s.len - s.cursor); nwr_log(DEBUG5, "received WAL msg at %X/%X len %zu", LSN_FORMAT_ARGS(state->rem_lsn), state->wal_rem_len); return NEON_WALREAD_SUCCESS; } case 'k': { XLogRecPtr end_lsn; bool reply_requested; hdrlen = sizeof(int64) + sizeof(int64) + sizeof(char); if (s.len - s.cursor < hdrlen) { snprintf(state->err_msg, sizeof(state->err_msg), "invalid keepalive message received from primary"); goto err; } end_lsn = pq_getmsgint64(&s); pq_getmsgint64(&s); /* TimestampTz timestamp; */ 
reply_requested = pq_getmsgbyte(&s); nwr_log(DEBUG5, "received keepalive end_lsn=%X/%X reply_requested=%d", LSN_FORMAT_ARGS(end_lsn), reply_requested); if (end_lsn < state->req_lsn + state->req_len) { snprintf(state->err_msg, sizeof(state->err_msg), "closing remote connection: requested WAL up to %X/%X, but current donor %s has only up to %X/%X", LSN_FORMAT_ARGS(state->req_lsn + state->req_len), state->donor_name, LSN_FORMAT_ARGS(end_lsn)); goto err; } continue; } default: nwr_log(WARNING, "invalid replication message type %d", msg_type); continue; } } err: NeonWALReaderResetRemote(state); return NEON_WALREAD_ERROR; } /* reset remote connection and request in progress */ void NeonWALReaderResetRemote(NeonWALReader *state) { state->req_lsn = InvalidXLogRecPtr; state->req_len = 0; state->req_progress = 0; state->rem_state = RS_NONE; if (state->wp_conn) { libpqwp_disconnect(state->wp_conn); state->wp_conn = NULL; } state->donor_name[0] = '\0'; state->wal_ptr = NULL; state->wal_rem_len = 0; state->rem_lsn = InvalidXLogRecPtr; } /* * Return socket of connection to remote source. Must be called only when * connection exists (NeonWALReaderEvents returns non zero). */ pgsocket NeonWALReaderSocket(NeonWALReader *state) { if (!state->wp_conn) nwr_log(FATAL, "NeonWALReaderSocket is called without active remote connection"); return PQsocket(state->wp_conn->pg_conn); } /* * Whether remote connection is established. Once this is done, until successful * local read or error socket is stable and user can update socket events * instead of readding it each time. */ bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state) { return state->rem_state == RS_ESTABLISHED; } /* * Whether remote connection is established. Once this is done, until successful * local read or error socket is stable and user can update socket events * instead of readding it each time. 
*/ TimeLineID NeonWALReaderLocalActiveTimeLineID(NeonWALReader *state) { return state->local_active_tlid; } /* * Returns events user should wait on connection socket or 0 if remote * connection is not active. */ extern uint32 NeonWALReaderEvents(NeonWALReader *state) { switch (state->rem_state) { case RS_NONE: return 0; case RS_CONNECTING_READ: return WL_SOCKET_READABLE; case RS_CONNECTING_WRITE: return WL_SOCKET_WRITEABLE; case RS_WAIT_EXEC_RESULT: case RS_ESTABLISHED: return WL_SOCKET_READABLE; default: Assert(false); return 0; /* make compiler happy */ } } static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli) { char *p; XLogRecPtr recptr; Size nbytes; p = buf; recptr = startptr; nbytes = count; /* Try to read directly from WAL buffers first. */ #if PG_MAJORVERSION_NUM >= 17 { Size rbytes; rbytes = WALReadFromBuffers(p, recptr, nbytes, tli); recptr += rbytes; nbytes -= rbytes; p += rbytes; } #endif while (nbytes > 0) { uint32 startoff; int segbytes; int readbytes; XLogSegNo lastRemovedSegNo; startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); /* * If the data we want is not in a segment we have open, close what we * have (if anything) and open the next one, using the caller's * provided openSegment callback. 
*/ if (state->seg.ws_file < 0 || !XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) || tli != state->seg.ws_tli) { XLogSegNo nextSegNo; neon_wal_segment_close(state); XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize); if (!neon_wal_segment_open(state, nextSegNo, &tli)) { char fname[MAXFNAMELEN]; state->wre_errno = errno; XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize); snprintf(state->err_msg, sizeof(state->err_msg), "failed to open WAL segment %s while reading at %X/%X: %s", fname, LSN_FORMAT_ARGS(recptr), strerror(state->wre_errno)); return false; } /* This shouldn't happen -- indicates a bug in segment_open */ Assert(state->seg.ws_file >= 0); /* Update the current segment info. */ state->seg.ws_tli = tli; state->seg.ws_segno = nextSegNo; } /* How many bytes are within this segment? */ if (nbytes > (state->segcxt.ws_segsize - startoff)) segbytes = state->segcxt.ws_segsize - startoff; else segbytes = nbytes; #ifndef FRONTEND pgstat_report_wait_start(WAIT_EVENT_WAL_READ); #endif /* Reset errno first; eases reporting non-errno-affecting errors */ errno = 0; readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff); #ifndef FRONTEND pgstat_report_wait_end(); #endif if (readbytes <= 0) { char fname[MAXFNAMELEN]; XLogFileName(fname, state->seg.ws_tli, state->seg.ws_segno, state->segcxt.ws_segsize); if (readbytes < 0) { state->wre_errno = errno; snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: %s", fname, startoff, strerror(state->wre_errno)); } else { snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: unexpected EOF", fname, startoff); } return false; } /* * Recheck that the segment hasn't been removed while we were reading * it. 
*/ lastRemovedSegNo = XLogGetLastRemovedSegno(); if (state->seg.ws_segno <= lastRemovedSegNo) { char fname[MAXFNAMELEN]; state->wre_errno = ENOENT; XLogFileName(fname, tli, state->seg.ws_segno, state->segcxt.ws_segsize); snprintf(state->err_msg, sizeof(state->err_msg), "WAL segment %s has been removed during the read, lastRemovedSegNo " UINT64_FORMAT, fname, lastRemovedSegNo); return false; } /* Update state for read */ recptr += readbytes; nbytes -= readbytes; p += readbytes; } return true; } XLogRecPtr NeonWALReaderGetRemLsn(NeonWALReader *state) { return state->rem_lsn; } const WALOpenSegment * NeonWALReaderGetSegment(NeonWALReader *state) { return &state->seg; } /* * Copy of vanilla wal_segment_open, but returns false in case of error instead * of ERROR, with errno set. * * XLogReaderRoutine->segment_open callback for local pg_wal files */ bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p) { TimeLineID tli = *tli_p; char path[MAXPGPATH]; XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize); nwr_log(DEBUG5, "opening %s", path); state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); if (state->seg.ws_file >= 0) return true; return false; } static bool is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli) { struct stat stat_buffer; char path[MAXPGPATH]; XLogFilePath(path, tli, segno, segsize); return stat(path, &stat_buffer) == 0; } /* copy of vanilla wal_segment_close with NeonWALReader */ void neon_wal_segment_close(NeonWALReader *state) { if (state->seg.ws_file >= 0) { close(state->seg.ws_file); /* need to check errno? 
*/
		state->seg.ws_file = -1;
	}
}

/* Returns the reader's error message buffer (state->err_msg). */
char *
NeonWALReaderErrMsg(NeonWALReader *state)
{
	return state->err_msg;
}

/*
 * Copy the donor name, connection info and LSN from the walproposer shared
 * memory state into the reader, holding the shared state's spinlock while
 * copying.
 *
 * Returns true if there is a donor (the copied name is non-empty), and false
 * otherwise
 */
bool
NeonWALReaderUpdateDonor(NeonWALReader *state)
{
	WalproposerShmemState *wps = GetWalpropShmemState();

	SpinLockAcquire(&wps->mutex);
	memcpy(state->donor_name, wps->donor_name, sizeof(state->donor_name));
	memcpy(state->donor_conninfo, wps->donor_conninfo, sizeof(state->donor_conninfo));
	state->donor_lsn = wps->donor_lsn;
	SpinLockRelease(&wps->mutex);
	return state->donor_name[0] != '\0';
}

================================================
FILE: pgxn/neon/neon_walreader.h
================================================
#ifndef __NEON_WALREADER_H__
#define __NEON_WALREADER_H__

#include "access/xlogdefs.h"

/* forward declare so we don't have to expose the struct to the public */
struct NeonWALReader;
typedef struct NeonWALReader NeonWALReader;

/* avoid including walproposer.h as it includes us */
struct WalProposer;
typedef struct WalProposer WalProposer;

/* NeonWALRead return value */
typedef enum
{
	NEON_WALREAD_SUCCESS,
	NEON_WALREAD_WOULDBLOCK,
	NEON_WALREAD_ERROR,
} NeonWALReadResult;

extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix, TimeLineID tlid);
extern void NeonWALReaderFree(NeonWALReader *state);
extern void NeonWALReaderResetRemote(NeonWALReader *state);
extern TimeLineID NeonWALReaderLocalActiveTimeLineID(NeonWALReader *state);
extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
extern pgsocket NeonWALReaderSocket(NeonWALReader *state);
extern uint32 NeonWALReaderEvents(NeonWALReader *state);
extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state);
extern char *NeonWALReaderErrMsg(NeonWALReader *state);
extern XLogRecPtr NeonWALReaderGetRemLsn(NeonWALReader *state);
extern const WALOpenSegment *NeonWALReaderGetSegment(NeonWALReader *state);
extern bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p);
extern void neon_wal_segment_close(NeonWALReader *state);
extern bool NeonWALReaderUpdateDonor(NeonWALReader *state);

#endif							/* __NEON_WALREADER_H__ */

================================================
FILE: pgxn/neon/pagestore_client.h
================================================
/*-------------------------------------------------------------------------
 *
 * pagestore_client.h
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */
#ifndef PAGESTORE_CLIENT_h
#define PAGESTORE_CLIENT_h

#include "neon_pgversioncompat.h"

#include "access/slru.h"
#include "access/xlogdefs.h"
#include RELFILEINFO_HDR
#include "lib/stringinfo.h"
#include "storage/block.h"
#include "storage/buf_internals.h"

#define MAX_SHARDS 128
#define MAX_PAGESERVER_CONNSTRING_SIZE 256

/*
 * Message tags. Request tags start at 0 and response tags at 100; the
 * test-only tags are pinned at 99 and 199.
 */
typedef enum
{
	/* pagestore_client -> pagestore */
	T_NeonExistsRequest = 0,
	T_NeonNblocksRequest,
	T_NeonGetPageRequest,
	T_NeonDbSizeRequest,
	T_NeonGetSlruSegmentRequest,
	/* future tags above this line */
	T_NeonTestRequest = 99,		/* only in cfg(feature = "testing") */

	/* pagestore -> pagestore_client */
	T_NeonExistsResponse = 100,
	T_NeonNblocksResponse,
	T_NeonGetPageResponse,
	T_NeonErrorResponse,
	T_NeonDbSizeResponse,
	T_NeonGetSlruSegmentResponse,
	/* future tags above this line */
	T_NeonTestResponse = 199,	/* only in cfg(feature = "testing") */
} NeonMessageTag;

typedef uint64 NeonRequestId;

/* base struct for c-style inheritance */
typedef struct
{
	NeonMessageTag tag;
	NeonRequestId reqid;
	XLogRecPtr	lsn;
	XLogRecPtr	not_modified_since;
} NeonMessage;

/* Read the tag of any message whose first member is a NeonMessage. */
#define messageTag(m) (((const NeonMessage *)(m))->tag)

/* SLRUs downloadable from page server */
typedef enum {
	SLRU_CLOG,
	SLRU_MULTIXACT_MEMBERS,
	SLRU_MULTIXACT_OFFSETS
} SlruKind;

/*--
 * supertype of all the Neon*Request structs below.
 *
 * All requests contain two LSNs:
 *
 * lsn:                request page (or relation size, etc) at this LSN
 * not_modified_since: Hint that the page hasn't been modified between
 *                     this LSN and the request LSN (`lsn`).
 *
 * To request the latest version of a page, you can use MAX_LSN as the request
 * LSN.
 *
 * If you don't know any better, you can always set 'not_modified_since' equal
 * to 'lsn', but providing a lower value can speed up processing the request
 * in the pageserver, as it doesn't need to wait for the WAL to arrive, and it
 * can skip traversing through recent layers which we know to not contain any
 * versions for the requested page.
 *
 * These structs describe the V2 of these requests. (The old now-defunct V1
 * protocol contained just one LSN and a boolean 'latest' flag.)
 *
 * V3 version of the protocol adds a request ID to all requests. This request
 * ID is also included in the response, as well as other fields from the
 * request, which allows verifying that we received the response for our
 * request. We copy fields from request to response to make checking more
 * reliable: the request ID is formed from the process ID and a local counter,
 * so in principle there can be duplicated request IDs if a process PID is
 * reused.
 */
typedef NeonMessage NeonRequest;

typedef struct
{
	NeonRequest hdr;
	NRelFileInfo rinfo;
	ForkNumber	forknum;
} NeonExistsRequest;

typedef struct
{
	NeonRequest hdr;
	NRelFileInfo rinfo;
	ForkNumber	forknum;
} NeonNblocksRequest;

typedef struct
{
	NeonRequest hdr;
	Oid			dbNode;
} NeonDbSizeRequest;

typedef struct
{
	NeonRequest hdr;
	NRelFileInfo rinfo;
	ForkNumber	forknum;
	BlockNumber blkno;
} NeonGetPageRequest;

typedef struct
{
	NeonRequest hdr;
	SlruKind	kind;
	int			segno;
} NeonGetSlruSegmentRequest;

/* supertype of all the Neon*Response structs below */
typedef NeonMessage NeonResponse;

/* Each response embeds a copy of its request as the first member. */
typedef struct
{
	NeonExistsRequest req;
	bool		exists;
} NeonExistsResponse;

typedef struct
{
	NeonNblocksRequest req;
	uint32		n_blocks;
} NeonNblocksResponse;

typedef struct
{
	NeonGetPageRequest req;
	char		page[FLEXIBLE_ARRAY_MEMBER];
} NeonGetPageResponse;

/* MAXALIGNed size of a NeonGetPageResponse carrying one BLCKSZ page */
#define PS_GETPAGERESPONSE_SIZE (MAXALIGN(offsetof(NeonGetPageResponse, page) + BLCKSZ))

typedef struct
{
	NeonDbSizeRequest req;
	int64		db_size;
} NeonDbSizeResponse;

typedef struct
{
	NeonResponse req;
	char		message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error
												 * message */
} NeonErrorResponse;

typedef struct
{
	NeonGetSlruSegmentRequest req;
	int			n_blocks;
	char		data[BLCKSZ * SLRU_PAGES_PER_SEGMENT];
} NeonGetSlruSegmentResponse;


extern StringInfoData nm_pack_request(NeonRequest *msg);
extern NeonResponse *nm_unpack_response(StringInfo s);
extern char *nm_to_string(NeonMessage *msg);

/*
 * If debug_compare_local>DEBUG_COMPARE_LOCAL_NONE, we pass through all the SMGR API
 * calls to md.c, and *also* do the calls to the Page Server. On every
 * read, compare the versions we read from local disk and Page Server,
 * and Assert that they are identical.
 */
typedef enum
{
	DEBUG_COMPARE_LOCAL_NONE,     /* normal mode - pages are stored locally only for unlogged relations */
	DEBUG_COMPARE_LOCAL_PREFETCH, /* if page is found in prefetch ring, then compare it with local and return */
	DEBUG_COMPARE_LOCAL_LFC,      /* if page is found in LFC or prefetch ring, then compare it with local and return */
	DEBUG_COMPARE_LOCAL_ALL       /* always fetch page from PS and compare it with local */
} DebugCompareLocalMode;

extern int debug_compare_local;

/*
 * API
 */

typedef uint16 shardno_t;

typedef struct
{
	/*
	 * Send this request to the PageServer associated with this shard.
	 * This function assigns request_id to the request which can be extracted by caller from request struct.
	 */
	bool		(*send) (shardno_t  shard_no, NeonRequest * request);
	/*
	 * Blocking read for the next response of this shard.
	 *
	 * When a CANCEL signal is handled, the connection state will be
	 * unmodified.
	 */
	NeonResponse *(*receive) (shardno_t shard_no);
	/*
	 * Try get the next response from the TCP buffers, if any.
	 * Returns NULL when the data is not yet available.
	 *
	 * This will raise errors only for malformed responses (we can't put them
	 * back into connection). All other error conditions are soft errors and
	 * return NULL as "no response available".
	 */
	NeonResponse *(*try_receive) (shardno_t shard_no);
	/*
	 * Make sure all requests are sent to PageServer.
	 */
	bool		(*flush) (shardno_t shard_no);
	/*
	 * Disconnect from this pageserver shard.
*/
	void		(*disconnect) (shardno_t shard_no);
} page_server_api;

extern void prefetch_on_ps_disconnect(void);

extern page_server_api *page_server;

extern char *pageserver_connstring;
extern int	flush_every_n_requests;
extern int	readahead_buffer_size;
extern char *neon_timeline;
extern char *neon_tenant;
extern int32 max_cluster_size;
extern int  neon_protocol_version;

extern shardno_t get_shard_number(BufferTag* tag);

extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);
extern void smgr_init_neon(void);
extern void readahead_buffer_resize(int newsize, void *extra);

/*
 * LSN values associated with each request to the pageserver
 */
typedef struct
{
	/*
	 * 'request_lsn' is the main value that determines which page version to
	 * fetch.
	 */
	XLogRecPtr request_lsn;

	/*
	 * A hint to the pageserver that the requested page hasn't been modified
	 * between this LSN and 'request_lsn'.  That allows the pageserver to
	 * return the page faster, without waiting for 'request_lsn' to arrive in
	 * the pageserver, as long as 'not_modified_since' has arrived.
	 */
	XLogRecPtr not_modified_since;

	/*
	 * 'effective_request_lsn' is not included in the request that's sent to
	 * the pageserver, but is used to keep track of the latest LSN of when the
	 * request was made. In a standby server, this is always the same as the
	 * 'request_lsn', but in the primary we use UINT64_MAX as the
	 * 'request_lsn' to request the latest page version, so we need this
	 * separate field to remember what the latest LSN was when the request was
	 * made. It's needed to manage prefetch requests, to verify if the response
	 * to a prefetched request is still valid.
	 */
	XLogRecPtr effective_request_lsn;
} neon_request_lsns;

extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
							 neon_request_lsns request_lsns, void *buffer);
extern int64 neon_dbsize(Oid dbNode);
extern void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum,
								  BlockNumber blkno, neon_request_lsns *output,
								  BlockNumber nblocks);

/* utils for neon relsize cache */
extern void relsize_hash_init(void);
extern bool get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size);
extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum);

#endif							/* PAGESTORE_CLIENT_h */

================================================
FILE: pgxn/neon/pagestore_smgr.c
================================================
/*-------------------------------------------------------------------------
 *
 * pagestore_smgr.c
 *
 *
 *
 * Temporary and unlogged rels
 * ---------------------------
 *
 * Temporary and unlogged tables are stored locally, by md.c. The functions
 * here just pass the calls through to corresponding md.c functions.
 *
 * Index build operations that use the buffer cache are also handled locally,
 * just like unlogged tables. Such operations must be marked by calling
 * smgr_start_unlogged_build() and friends.
 *
 * In order to know what relations are permanent and which ones are not, we
 * have added a 'smgr_relpersistence' field to SmgrRelationData, and it is set
 * by smgropen() callers, when they have the relcache entry at hand.  However,
 * sometimes we need to open an SmgrRelation for a relation without the
 * relcache. That is needed when we evict a buffer; we might not have the
 * SmgrRelation for that relation open yet. To deal with that, the
 * 'relpersistence' can be left to zero, meaning we don't know if it's
 * permanent or not. Most operations are not allowed with relpersistence==0,
 * but smgrwrite() does work, which is what we need for buffer eviction, and
 * so does smgrunlink(), so that a backend doesn't need to have the relcache
 * entry at transaction commit, where relations that were dropped in the
 * transaction are unlinked.
 *
 * If smgrwrite() is called and smgr_relpersistence == 0, we check if the
 * relation file exists locally or not. If it does exist, we assume it's an
 * unlogged relation and write the page there. Otherwise it must be a
 * permanent relation, WAL-logged and stored on the page server, and we ignore
 * the write like we do for permanent relations.
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/parallel.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "access/xlogdefs.h"
#include "access/xloginsert.h"
#include "access/xlog_internal.h"
#include "access/xlogutils.h"
#include "catalog/pg_class.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "postmaster/interrupt.h"
#include "port/pg_iovec.h"
#include "replication/walsender.h"
#include "storage/bufmgr.h"
#include "storage/buf_internals.h"
#include "storage/fsm_internals.h"
#include "storage/md.h"
#include "storage/smgr.h"

#include "bitmap.h"
#include "communicator.h"
#include "file_cache.h"
#include "neon.h"
#include "neon_lwlsncache.h"
#include "neon_perf_counters.h"
#include "pagestore_client.h"

#if PG_VERSION_NUM >= 150000
#include "access/xlogrecovery.h"
#endif

#include "access/nbtree.h"
#include "storage/bufpage.h"
#include "access/xlog_internal.h"

static char *hexdump_page(char *page);

/*
 * True when the relation has a non-zero database OID and a relation number
 * of at least FirstNormalObjectId.
 */
#define IS_LOCAL_REL(reln) (\
	NInfoGetDbOid(InfoFromSMgrRel(reln)) != 0 && \
		NInfoGetRelNumber(InfoFromSMgrRel(reln)) >= FirstNormalObjectId \
)

/* log level used for the smgr trace messages in this file */
const int	SmgrTrace = DEBUG5;

/* unlogged relation build states */
typedef enum
{
	UNLOGGED_BUILD_NOT_IN_PROGRESS = 0,
	UNLOGGED_BUILD_PHASE_1,
	UNLOGGED_BUILD_PHASE_2,
	UNLOGGED_BUILD_NOT_PERMANENT
} UnloggedBuildPhase;

int debug_compare_local;

static NRelFileInfo unlogged_build_rel_info;
static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;

static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;

static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum);

/*
 * Wrapper around log_newpage() that makes a temporary copy of the block and
 * WAL-logs that. This makes it safe to use while holding only a shared lock
 * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint
 * directly because it skips the logging if the LSN is new enough.
 *
 * Returns whatever log_newpage() returns for the copied page.
 */
static XLogRecPtr
log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno,
				 Page page, bool page_std)
{
	PGIOAlignedBlock copied_buffer;

	/* snapshot the page so that log_newpage() works on a private copy */
	memcpy(copied_buffer.data, page, BLCKSZ);
	return log_newpage(rinfo, forkNum, blkno, copied_buffer.data, page_std);
}

#if PG_MAJORVERSION_NUM >= 17
/*
 * Wrapper around log_newpages() that makes a temporary copy of the block and
 * WAL-logs that. This makes it safe to use while holding only a shared lock
 * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint
 * directly because it skips the logging if the LSN is new enough.
*/ static XLogRecPtr log_newpages_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, BlockNumber nblocks, Page *pages, bool page_std) { PGIOAlignedBlock copied_buffer[XLR_MAX_BLOCK_ID]; BlockNumber blknos[XLR_MAX_BLOCK_ID]; Page pageptrs[XLR_MAX_BLOCK_ID]; int nregistered = 0; for (int i = 0; i < nblocks; i++) { Page page = copied_buffer[nregistered].data; memcpy(page, pages[i], BLCKSZ); pageptrs[nregistered] = page; blknos[nregistered] = blkno + i; ++nregistered; if (nregistered >= XLR_MAX_BLOCK_ID) { log_newpages(rinfo, forkNum, nregistered, blknos, pageptrs, page_std); nregistered = 0; } } if (nregistered != 0) { log_newpages(rinfo, forkNum, nregistered, blknos, pageptrs, page_std); } return ProcLastRecPtr; } #endif /* PG_MAJORVERSION_NUM >= 17 */ /* * Is 'buffer' identical to a freshly initialized empty heap page? */ static bool PageIsEmptyHeapPage(char *buffer) { PGIOAlignedBlock empty_page; PageInit((Page) empty_page.data, BLCKSZ, 0); return memcmp(buffer, empty_page.data, BLCKSZ) == 0; } #if PG_MAJORVERSION_NUM >= 17 static void neon_wallog_pagev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks, const char **buffers, bool force) { #define BLOCK_BATCH_SIZE 16 bool log_pages; BlockNumber batch_blockno = blocknum; XLogRecPtr lsns[BLOCK_BATCH_SIZE]; int batch_size = 0; /* * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM * changes are not WAL-logged when the changes are made, so this is our * last chance to log them, otherwise they're lost. That's OK for * correctness, the non-logged updates are not critical. But we want to * have a reasonably up-to-date VM and FSM in the page server. 
*/ log_pages = false; if (force) { Assert(XLogInsertAllowed()); log_pages = true; } else if (XLogInsertAllowed() && (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM)) { log_pages = true; } if (log_pages) { XLogRecPtr recptr; recptr = log_newpages_copy(&InfoFromSMgrRel(reln), forknum, blocknum, nblocks, (Page *) buffers, false); for (int i = 0; i < nblocks; i++) PageSetLSN(unconstify(char *, buffers[i]), recptr); ereport(SmgrTrace, (errmsg(NEON_TAG "Page %u through %u of relation %u/%u/%u.%u " "were force logged, lsn=%X/%X", blocknum, blocknum + nblocks, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, LSN_FORMAT_ARGS(recptr)))); } for (int i = 0; i < nblocks; i++) { Page page = (Page) buffers[i]; BlockNumber blkno = blocknum + i; XLogRecPtr lsn = PageGetLSN(page); if (lsn == InvalidXLogRecPtr) { /* * When PostgreSQL extends a relation, it calls smgrextend() with an * all-zeros pages, and we can just ignore that in Neon. We do need to * remember the new size, though, so that smgrnblocks() returns the * right answer after the rel has been extended. We rely on the * relsize cache for that. * * A completely empty heap page doesn't need to be WAL-logged, either. * The heapam can leave such a page behind, if e.g. an insert errors * out after initializing the page, but before it has inserted the * tuple and WAL-logged the change. When we read the page from the * page server, it will come back as all-zeros. That's OK, the heapam * will initialize an all-zeros page on first use. * * In other scenarios, evicting a dirty page with no LSN is a bad * sign: it implies that the page was not WAL-logged, and its contents * will be lost when it's evicted. 
*/ if (PageIsNew(page)) { ereport(SmgrTrace, (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is all-zeros", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); } else if (PageIsEmptyHeapPage(page)) { ereport(SmgrTrace, (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); } else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM) { /* * Its a bad sign if there is a page with zero LSN in the buffer * cache in a standby, too. However, PANICing seems like a cure * worse than the disease, as the damage has likely already been * done in the primary. So in a standby, make this an assertion, * and in a release build just LOG the error and soldier on. We * update the last-written LSN of the page with a conservative * value in that case, which is the last replayed LSN. */ ereport(RecoveryInProgress() ? LOG : PANIC, (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); Assert(false); lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */ } } else { ereport(SmgrTrace, (errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, LSN_FORMAT_ARGS(lsn)))); } /* * Remember the LSN on this page. When we read the page again, we must * read the same or newer version of it. */ lsns[batch_size++] = lsn; if (batch_size >= BLOCK_BATCH_SIZE) { neon_set_lwlsn_block_v(lsns, InfoFromSMgrRel(reln), forknum, batch_blockno, batch_size); batch_blockno += batch_size; batch_size = 0; } } if (batch_size != 0) { neon_set_lwlsn_block_v(lsns, InfoFromSMgrRel(reln), forknum, batch_blockno, batch_size); } } #endif /* * A page is being evicted from the shared buffer cache. Update the * last-written LSN of the page, and WAL-log it if needed. 
*/
#if PG_MAJORVERSION_NUM < 16
static void
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
#else
static void
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force)
#endif
{
	XLogRecPtr	lsn = PageGetLSN((Page) buffer);
	bool		log_page;

	/*
	 * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
	 * changes are not WAL-logged when the changes are made, so this is our
	 * last chance to log them, otherwise they're lost. That's OK for
	 * correctness, the non-logged updates are not critical. But we want to
	 * have a reasonably up-to-date VM and FSM in the page server.
	 */
	log_page = false;
	if (force)
	{
		Assert(XLogInsertAllowed());
		log_page = true;
	}
	else if (XLogInsertAllowed() &&
			 !ShutdownRequestPending &&
			 (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM))
	{
		log_page = true;
	}

	if (log_page)
	{
		XLogRecPtr	recptr;

		/* log a copy of the page, flush it, and use its LSN from here on */
		recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum,
								  (Page) buffer, false);
		XLogFlush(recptr);
		lsn = recptr;
		ereport(SmgrTrace,
				(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
						blocknum,
						RelFileInfoFmt(InfoFromSMgrRel(reln)),
						forknum, LSN_FORMAT_ARGS(lsn))));
	}

	if (lsn == InvalidXLogRecPtr)
	{
		/*
		 * When PostgreSQL extends a relation, it calls smgrextend() with an
		 * all-zeros pages, and we can just ignore that in Neon. We do need to
		 * remember the new size, though, so that smgrnblocks() returns the
		 * right answer after the rel has been extended. We rely on the
		 * relsize cache for that.
		 *
		 * A completely empty heap page doesn't need to be WAL-logged, either.
		 * The heapam can leave such a page behind, if e.g. an insert errors
		 * out after initializing the page, but before it has inserted the
		 * tuple and WAL-logged the change. When we read the page from the
		 * page server, it will come back as all-zeros. That's OK, the heapam
		 * will initialize an all-zeros page on first use.
		 *
		 * In other scenarios, evicting a dirty page with no LSN is a bad
		 * sign: it implies that the page was not WAL-logged, and its contents
		 * will be lost when it's evicted.
		 */
		if (PageIsNew((Page) buffer))
		{
			ereport(SmgrTrace,
					(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is all-zeros",
							blocknum,
							RelFileInfoFmt(InfoFromSMgrRel(reln)),
							forknum)));
		}
		else if (PageIsEmptyHeapPage((Page) buffer))
		{
			ereport(SmgrTrace,
					(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
							blocknum,
							RelFileInfoFmt(InfoFromSMgrRel(reln)),
							forknum)));
		}
		else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM)
		{
			/*
			 * It's a bad sign if there is a page with zero LSN in the buffer
			 * cache in a standby, too. However, PANICing seems like a cure
			 * worse than the disease, as the damage has likely already been
			 * done in the primary. So in a standby, make this an assertion,
			 * and in a release build just LOG the error and soldier on. We
			 * update the last-written LSN of the page with a conservative
			 * value in that case, which is the last replayed LSN.
			 */
			ereport(RecoveryInProgress() ? LOG : PANIC,
					(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
							blocknum,
							RelFileInfoFmt(InfoFromSMgrRel(reln)),
							forknum)));
			Assert(false);

			lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */
		}
	}
	else
	{
		ereport(SmgrTrace,
				(errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X",
						blocknum,
						RelFileInfoFmt(InfoFromSMgrRel(reln)),
						forknum, LSN_FORMAT_ARGS(lsn))));
	}

	/*
	 * Remember the LSN on this page. When we read the page again, we must
	 * read the same or newer version of it.
	 */
	neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forknum, blocknum);
}

/*
 *	neon_init() -- Initialize private state
 *
 * Installs neon_redo_read_buffer_filter as the redo_read_buffer_filter hook,
 * remembering the previous hook value, and calls mdinit() when
 * debug_compare_local is non-zero.
 */
static void
neon_init(void)
{
	/*
	 * Sanity check that the perf counters array is sized correctly. We got
	 * this wrong once, and the formula for max number of backends and aux
	 * processes might well change in the future, so better safe than sorry.
	 * This is a very cheap check so we do it even without assertions.  On
	 * v14, this gets called before initializing MyProc, so we cannot perform
	 * the check here. That's OK, we don't expect the logic to change in old
	 * releases.
	 */
#if PG_VERSION_NUM>=150000
	if (MyNeonCounters >= &neon_per_backend_counters_shared[NUM_NEON_PERF_COUNTER_SLOTS])
		elog(ERROR, "MyNeonCounters points past end of array");
#endif

	/* chain our filter in front of whatever was installed before */
	old_redo_read_buffer_filter = redo_read_buffer_filter;
	redo_read_buffer_filter = neon_redo_read_buffer_filter;

	if (debug_compare_local)
	{
		mdinit();
	}
}

/*
 * GetXLogInsertRecPtr uses XLogBytePosToRecPtr to convert logical insert (reserved) position
 * to physical position in WAL. It always adds SizeOfXLogShortPHD:
 *		seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
 * so even if there are no records on the page, offset will be SizeOfXLogShortPHD.
 * It may cause problems with XLogFlush. So return pointer backward to the origin of the page.
 */
static XLogRecPtr
nm_adjust_lsn(XLogRecPtr lsn)
{
	/*
	 * If lsn points to the beginning of the first record on a page or
	 * segment, then "return" it back to the page origin
	 */
	if ((lsn & (XLOG_BLCKSZ - 1)) == SizeOfXLogShortPHD)
	{
		/* just past a short page header: step back to the page start */
		lsn -= SizeOfXLogShortPHD;
	}
	else if ((lsn & (wal_segment_size - 1)) == SizeOfXLogLongPHD)
	{
		/* just past a long (segment-start) header: step back to segment start */
		lsn -= SizeOfXLogLongPHD;
	}
	return lsn;
}

/*
 * Return LSN for requesting pages and number of blocks from page server
 *
 * XXX: exposed so that prefetch_do_request() can call back here.
*/
void
neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
					  neon_request_lsns *output, BlockNumber nblocks)
{
	XLogRecPtr	last_written_lsns[PG_IOV_MAX];

	/* last_written_lsns[] has PG_IOV_MAX entries, which caps nblocks. */
	Assert(nblocks <= PG_IOV_MAX);

	/* Fetch one last-written LSN per requested block. */
	neon_get_lwlsn_v(rinfo, forknum, blkno, (int) nblocks, last_written_lsns);

	/* Pull back LSNs that point just past a page header; see nm_adjust_lsn(). */
	for (int i = 0; i < nblocks; i++)
	{
		last_written_lsns[i] = nm_adjust_lsn(last_written_lsns[i]);
		Assert(last_written_lsns[i] != InvalidXLogRecPtr);
	}

	if (RecoveryInProgress())
	{
		/*---
		 * In broad strokes, a replica always requests the page at the current
		 * replay LSN. But looking closer, what exactly is the replay LSN? Is
		 * it the last replayed record, or the record being replayed? And does
		 * the startup process performing the replay need to do something
		 * differently than backends running queries? Let's take a closer look
		 * at the different scenarios:
		 *
		 * 1. Startup process reads a page, last_written_lsn is old.
		 *
		 * Read the old version of the page. We will apply the WAL record on
		 * it to bring it up-to-date.
		 *
		 * We could read the new version, with the changes from this WAL
		 * record already applied, to offload the work of replaying the record
		 * to the pageserver. The pageserver might not have received the WAL
		 * record yet, though, so a read of the old page version and applying
		 * the record ourselves is likely faster. Also, the redo function
		 * might be surprised if the changes have already been applied. That's
		 * normal during crash recovery, but not in hot standby.
		 *
		 * 2. Startup process reads a page, last_written_lsn == record we're
		 *    replaying.
		 *
		 * Can this happen? There are a few theoretical cases when it might:
		 *
		 * A) The redo function reads the same page twice. We had already read
		 *    and applied the changes once, and now we're reading it for the
		 *    second time.  That would be a rather silly thing for a redo
		 *    function to do, and I'm not aware of any that would do it.
		 *
		 * B) The redo function modifies multiple pages, and it already
		 *    applied the changes to one of the pages, released the lock on
		 *    it, and is now reading a second page.  Furthermore, the first
		 *    page was already evicted from the buffer cache, and also from
		 *    the last-written LSN cache, so that the per-relation or global
		 *    last-written LSN was already updated. All the WAL redo functions
		 *    hold the locks on pages that they modify, until all the changes
		 *    have been made (?), which would make that impossible.
		 *    However, we skip the locking, if the page isn't currently in the
		 *    page cache (see neon_redo_read_buffer_filter below).
		 *
		 * Even if one of the above cases were possible in theory, they
		 * would also require the pages being modified by the redo function to
		 * be immediately evicted from the page cache.
		 *
		 * So this probably does not happen in practice. But if it does, we
		 * request the new version, including the changes from the record
		 * being replayed. That seems like the correct behavior in any case.
		 *
		 * 3. Backend process reads a page with old last-written LSN
		 *
		 * Nothing special here. Read the old version.
		 *
		 * 4. Backend process reads a page with last_written_lsn == record being replayed
		 *
		 * This can happen, if the redo function has started to run, and saw
		 * that the page isn't present in the page cache (see
		 * neon_redo_read_buffer_filter below).  Normally, in a normal
		 * Postgres server, the redo function would hold a lock on the page,
		 * so we would get blocked waiting for the redo function to release
		 * the lock. To emulate that, wait for the WAL replay of the record to
		 * finish.
		 */
		/* Request the page at the end of the last fully replayed LSN. */
		XLogRecPtr	replay_lsn = GetXLogReplayRecPtr(NULL);

		for (int i = 0; i < nblocks; i++)
		{
			neon_request_lsns *result = &output[i];
			XLogRecPtr	last_written_lsn = last_written_lsns[i];

			if (last_written_lsn > replay_lsn)
			{
				/* GetCurrentReplayRecPtr was introduced in v15 */
#if PG_VERSION_NUM >= 150000
				Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL));
#endif

				/*
				 * Cases 2 and 4. If this is a backend (case 4), the
				 * neon_read_at_lsn() call later will wait for the WAL record to be
				 * fully replayed.
				 */
				result->request_lsn = last_written_lsn;
			}
			else
			{
				/* cases 1 and 3 */
				result->request_lsn = replay_lsn;
			}

			result->not_modified_since = last_written_lsn;
			/* On a replica the effective LSN is simply the request LSN. */
			result->effective_request_lsn = result->request_lsn;
			Assert(last_written_lsn <= result->request_lsn);

			neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X",
					 LSN_FORMAT_ARGS(result->request_lsn), LSN_FORMAT_ARGS(result->not_modified_since));
		}
	}
	else
	{
		XLogRecPtr	flushlsn;

		/* GetFlushRecPtr() takes an argument from v15 on. */
#if PG_VERSION_NUM >= 150000
		flushlsn = GetFlushRecPtr(NULL);
#else
		flushlsn = GetFlushRecPtr();
#endif

		for (int i = 0; i < nblocks; i++)
		{
			neon_request_lsns *result = &output[i];
			XLogRecPtr	last_written_lsn = last_written_lsns[i];

			/*
			 * Use the latest LSN that was evicted from the buffer cache as the
			 * 'not_modified_since' hint. Any pages modified by later WAL records
			 * must still be in the buffer cache, so our request cannot concern
			 * those.
			 */
			neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X",
					 LSN_FORMAT_ARGS(last_written_lsn));

			/*
			 * Is it possible that the last-written LSN is ahead of last flush
			 * LSN? Generally not, we shouldn't evict a page from the buffer cache
			 * before all its modifications have been safely flushed. That's the
			 * "WAL before data" rule. However, such case does exist at index
			 * building, _bt_blwritepage logs the full page without flushing WAL
			 * before smgrextend (files are fsynced before build ends).
			 */
			if (last_written_lsn > flushlsn)
			{
				neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
						 LSN_FORMAT_ARGS(last_written_lsn),
						 LSN_FORMAT_ARGS(flushlsn));
				XLogFlush(last_written_lsn);
			}

			/*
			 * Request the very latest version of the page. In principle we
			 * want to read the page at the current insert LSN, and we could
			 * use that value in the request. However, there's a corner case
			 * with pageserver's garbage collection. If the GC horizon is
			 * set to a very small value, it's possible that by the time
			 * that the pageserver processes our request, the GC horizon has
			 * already moved past the LSN we calculate here. Standby servers
			 * always have that problem as they can always lag behind the
			 * primary, but for the primary we can avoid it by always
			 * requesting the latest page, by setting request LSN to
			 * UINT64_MAX.
			 *
			 * effective_request_lsn is used to check that received response is still valid.
			 * In case of primary node it is last written LSN. Originally we used flush_lsn here,
			 * but it is not correct. Consider the following scenario:
			 * 1. Backend A wants to prefetch block X
			 * 2. Backend A checks that block X is not present in the shared buffer cache
			 * 3. Backend A calls prefetch_do_request, which calls neon_get_request_lsns
			 * 4. neon_get_request_lsns obtains LwLSN=11 for the block
			 * 5. Backend B downloads block X, updates and wallogs it with LSN=13
			 * 6. Block X is once again evicted from shared buffers, its LwLSN is set to LSN=13
			 * 7. Backend A is still executing in neon_get_request_lsns(). It calls 'flushlsn = GetFlushRecPtr();'.
			 *    Let's say that it is LSN=14
			 * 8. Backend A uses LSN=14 as effective_lsn in the prefetch slot. The request stored in the slot is
			 *    [not_modified_since=11, effective_request_lsn=14]
			 * 9. Backend A sends the prefetch request, pageserver processes it, and sends response.
			 *    The last LSN that the pageserver had processed was LSN=12, so the page image in the response is valid at LSN=12.
			 * 10.
Backend A calls smgrread() for page X with LwLSN=13 * 11. Backend A finds in prefetch ring the response for the prefetch request with [not_modified_since=11, effective_lsn=Lsn14], * so it satisfies neon_prefetch_response_usable condition. * * Things go wrong in step 7-8, when [not_modified_since=11, effective_request_lsn=14] is determined for the request. * That is incorrect, because the page has in fact been modified at LSN=13. The invariant is that for any request, * there should not be any modifications to a page between its not_modified_since and (effective_)request_lsn values. * * The problem can be fixed by callingGetFlushRecPtr() before checking if the page is in the buffer cache. * But you can't do that within smgrprefetch(), would need to modify the caller. */ result->request_lsn = UINT64_MAX; result->not_modified_since = last_written_lsn; result->effective_request_lsn = last_written_lsn; } } } /* * neon_exists() -- Does the physical file exist? */ static bool neon_exists(SMgrRelation reln, ForkNumber forkNum) { BlockNumber n_blocks; neon_request_lsns request_lsns; switch (reln->smgr_relpersistence) { case 0: /* * We don't know if it's an unlogged rel stored locally, or * permanent rel stored in the page server. First check if it * exists locally. If it does, great. Otherwise check if it exists * in the page server. */ if (mdexists(reln, forkNum)) return true; break; case RELPERSISTENCE_PERMANENT: break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: return mdexists(reln, forkNum); default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks)) { return true; } /* * \d+ on a view calls smgrexists with 0/0/0 relfilenode. The page server * will error out if you check that, because the whole dbdir for * tablespace 0, db 0 doesn't exists. We possibly should change the page * server to accept that and return 'false', to be consistent with * mdexists(). 
But we probably also should fix pg_table_size() to not call * smgrexists() with bogus relfilenode. * * For now, handle that special case here. */ #if PG_MAJORVERSION_NUM >= 16 if (reln->smgr_rlocator.locator.spcOid == 0 && reln->smgr_rlocator.locator.dbOid == 0 && reln->smgr_rlocator.locator.relNumber == 0) #else if (reln->smgr_rnode.node.spcNode == 0 && reln->smgr_rnode.node.dbNode == 0 && reln->smgr_rnode.node.relNode == 0) #endif { return false; } neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns); } /* * neon_create() -- Create a new relation on neond storage * * If isRedo is true, it's okay for the relation to exist already. */ static void neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) { switch (reln->smgr_relpersistence) { case 0: neon_log(ERROR, "cannot call smgrcreate() on rel with unknown persistence"); case RELPERSISTENCE_PERMANENT: break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: if (debug_compare_local) { mdcreate(reln, forkNum, forkNum == INIT_FORKNUM || isRedo); if (forkNum == MAIN_FORKNUM) mdcreate(reln, INIT_FORKNUM, true); } else { mdcreate(reln, forkNum, isRedo); } return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } neon_log(SmgrTrace, "Create relation %u/%u/%u.%u", RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum); /* * Newly created relation is empty, remember that in the relsize cache. * * Note that in REDO, this is called to make sure the relation fork * exists, but it does not truncate the relation. So, we can only update * the relsize if it didn't exist before. * * Also, in redo, we must make sure to update the cached size of the * relation, as that is the primary source of truth for REDO's file length * considerations, and as file extension isn't (perfectly) logged, we need * to take care of that before we hit file size checks. 
* * FIXME: This is currently not just an optimization, but required for * correctness. Postgres can call smgrnblocks() on the newly-created * relation. Currently, we don't call SetLastWrittenLSN() when a new * relation created, so if we didn't remember the size in the relsize * cache, we might call smgrnblocks() on the newly-created relation before * the creation WAL record hass been received by the page server. */ if (isRedo) { update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0); get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &reln->smgr_cached_nblocks[forkNum]); } else set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0); if (debug_compare_local) { if (IS_LOCAL_REL(reln)) mdcreate(reln, forkNum, isRedo); } } /* * neon_unlink() -- Unlink a relation. * * Note that we're passed a RelFileNodeBackend --- by the time this is called, * there won't be an SMgrRelation hashtable entry anymore. * * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber * to delete all forks. * * * If isRedo is true, it's unsurprising for the relation to be already gone. * Also, we should remove the file immediately instead of queuing a request * for later, since during redo there's no possibility of creating a * conflicting relation. * * Note: any failure should be reported as WARNING not ERROR, because * we are usually not in a transaction anymore when this is called. */ static void neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo) { /* * Might or might not exist locally, depending on whether it's an unlogged * or permanent relation (or if debug_compare_local is set). Try to * unlink, it won't do any harm if the file doesn't exist. */ mdunlink(rinfo, forkNum, isRedo); if (!NRelFileInfoBackendIsTemp(rinfo)) { forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum); } } /* * neon_extend() -- Add a block to the specified relation. * * The semantics are nearly the same as mdwrite(): write at the * specified position. 
However, this is to be used for the case of * extending a relation (i.e., blocknum is at or beyond the current * EOF). Note that we assume writing a block beyond current EOF * causes intervening file space to become filled with zeroes. */ static void #if PG_MAJORVERSION_NUM < 16 neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer, bool skipFsync) #else neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, const void *buffer, bool skipFsync) #endif { XLogRecPtr lsn; BlockNumber n_blocks = 0; switch (reln->smgr_relpersistence) { case 0: neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence"); break; case RELPERSISTENCE_PERMANENT: if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) { mdextend(reln, forkNum, blkno, buffer, skipFsync); return; } break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdextend(reln, forkNum, blkno, buffer, skipFsync); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } /* * Check that the cluster size limit has not been exceeded. * * Temporary and unlogged relations are not included in the cluster size * measured by the page server, so ignore those. Autovacuum processes are * also exempt. */ if (max_cluster_size > 0 && reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && !AmAutoVacuumWorkerProcess()) { uint64 current_size = GetNeonCurrentClusterSize(); if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) ereport(ERROR, (errcode(ERRCODE_DISK_FULL), errmsg("could not extend file because project size limit (%d MB) has been exceeded", max_cluster_size), errhint("This limit is defined externally by the project size limit, and internally by neon.max_cluster_size GUC"))); } /* * Usually Postgres doesn't extend relation on more than one page (leaving * holes). 
But this rule is violated in PG-15 where * CreateAndCopyRelationData call smgrextend for destination relation n * using size of source relation */ n_blocks = neon_nblocks(reln, forkNum); while (n_blocks < blkno) neon_wallog_page(reln, forkNum, n_blocks++, buffer, true); neon_wallog_page(reln, forkNum, blkno, buffer, false); set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1); lsn = PageGetLSN((Page) buffer); neon_log(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, blkno, (uint32) (lsn >> 32), (uint32) lsn); lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer); if (debug_compare_local) { if (IS_LOCAL_REL(reln)) mdextend(reln, forkNum, blkno, buffer, skipFsync); } /* * smgr_extend is often called with an all-zeroes page, so * lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer * later, after it has been initialized with the real page contents, and * it is eventually evicted from the buffer cache. But we need a valid LSN * to the relation metadata update now. 
*/ if (lsn == InvalidXLogRecPtr) { lsn = GetXLogInsertRecPtr(); neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno); } neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum); } #if PG_MAJORVERSION_NUM >= 16 static void neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, int nblocks, bool skipFsync) { const PGIOAlignedBlock buffer = {0}; int remblocks = nblocks; XLogRecPtr lsn = 0; switch (reln->smgr_relpersistence) { case 0: neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence"); break; case RELPERSISTENCE_PERMANENT: if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) { mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); return; } break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } if (max_cluster_size > 0 && reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && !AmAutoVacuumWorkerProcess()) { uint64 current_size = GetNeonCurrentClusterSize(); if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) ereport(ERROR, (errcode(ERRCODE_DISK_FULL), errmsg("could not extend file because project size limit (%d MB) has been exceeded", max_cluster_size), errhint("This limit is defined by neon.max_cluster_size GUC"))); } /* * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any * more --- we mustn't create a block whose number actually is * InvalidBlockNumber or larger. */ if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg(NEON_TAG "cannot extend file \"%s\" beyond %u blocks", relpath(reln->smgr_rlocator, forkNum), InvalidBlockNumber))); if (debug_compare_local) { if (IS_LOCAL_REL(reln)) mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); } /* Don't log any pages if we're not allowed to do so. 
*/
	if (!XLogInsertAllowed())
		return;

	/* ensure we have enough xlog buffers to log max-sized records */
	XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0);

	/*
	 * Iterate over all the pages. They are collected into batches of
	 * XLR_MAX_BLOCK_ID pages, and a single WAL-record is written for each
	 * batch.
	 */
	while (remblocks > 0)
	{
		int			count = Min(remblocks, XLR_MAX_BLOCK_ID);

		XLogBeginInsert();

		/* Register the same zeroed page image for every block of the batch. */
		for (int i = 0; i < count; i++)
			XLogRegisterBlock(i, &InfoFromSMgrRel(reln), forkNum, blocknum + i,
							  (char *) buffer.data, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);

		lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI);

		/* Record the batch's LSN for each block and write it to the LFC. */
		for (int i = 0; i < count; i++)
		{
			lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
			neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum,
									  blocknum + i);
		}

		blocknum += count;
		remblocks -= count;
	}

	/*
	 * lsn is only assigned inside the loop above, so this expects at least
	 * one batch to have been logged.
	 */
	Assert(lsn != 0);

	/* blocknum has been advanced past the last new block; it is the new size. */
	neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
	set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
}
#endif

/*
 *  neon_open() -- Initialize newly-opened relation.
 *
 * Delegates to mdopen() and emits an SmgrTrace log line.
 */
static void
neon_open(SMgrRelation reln)
{
	/*
	 * We don't have anything special to do here. Call mdopen() to let md.c
	 * initialize itself. That's only needed for temporary or unlogged
	 * relations, but it's dirt cheap so do it always to make sure the md
	 * fields are initialized, for debugging purposes if nothing else.
	 */
	mdopen(reln);

	/* no work */
	neon_log(SmgrTrace, "open noop");
}

/*
 *	neon_close() -- Close the specified relation, if it isn't closed already.
 */
static void
neon_close(SMgrRelation reln, ForkNumber forknum)
{
	/*
	 * Let md.c close it, if it had it open. Doesn't hurt to do this even for
	 * permanent relations that have no local storage.
*/ mdclose(reln, forknum); } #if PG_MAJORVERSION_NUM >= 17 /* * neon_prefetch() -- Initiate asynchronous read of the specified block of a relation */ static bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks) { BufferTag tag; switch (reln->smgr_relpersistence) { case 0: /* probably shouldn't happen, but ignore it */ case RELPERSISTENCE_PERMANENT: break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: return mdprefetch(reln, forknum, blocknum, nblocks); default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } tag.spcOid = reln->smgr_rlocator.locator.spcOid; tag.dbOid = reln->smgr_rlocator.locator.dbOid; tag.relNumber = reln->smgr_rlocator.locator.relNumber; tag.forkNum = forknum; while (nblocks > 0) { int iterblocks = Min(nblocks, PG_IOV_MAX); bits8 lfc_present[PG_IOV_MAX / 8] = {0}; if (lfc_cache_containsv(InfoFromSMgrRel(reln), forknum, blocknum, iterblocks, lfc_present) == iterblocks) { nblocks -= iterblocks; blocknum += iterblocks; continue; } tag.blockNum = blocknum; communicator_prefetch_register_bufferv(tag, NULL, iterblocks, lfc_present); nblocks -= iterblocks; blocknum += iterblocks; } communicator_prefetch_pump_state(); return false; } #else /* PG_MAJORVERSION_NUM >= 17 */ /* * neon_prefetch() -- Initiate asynchronous read of the specified block of a relation */ static bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { BufferTag tag; switch (reln->smgr_relpersistence) { case 0: /* probably shouldn't happen, but ignore it */ case RELPERSISTENCE_PERMANENT: break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: return mdprefetch(reln, forknum, blocknum); default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum)) return false; tag.forkNum = forknum; tag.blockNum = blocknum; CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln)); 
communicator_prefetch_register_bufferv(tag, NULL, 1, NULL); communicator_prefetch_pump_state(); return false; } #endif /* PG_MAJORVERSION_NUM < 17 */ /* * neon_writeback() -- Tell the kernel to write pages back to storage. * * This accepts a range of blocks because flushing several pages at once is * considerably more efficient than doing so individually. */ static void neon_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks) { switch (reln->smgr_relpersistence) { case 0: /* mdwriteback() does nothing if the file doesn't exist */ mdwriteback(reln, forknum, blocknum, nblocks); break; case RELPERSISTENCE_PERMANENT: break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdwriteback(reln, forknum, blocknum, nblocks); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } /* * TODO: WAL sync up to lwLsn for the indicated blocks * Without that sync, writeback doesn't actually guarantee the data is * persistently written, which does seem to be one of the assumed * properties of this smgr API call. */ neon_log(SmgrTrace, "writeback noop"); communicator_prefetch_pump_state(); if (debug_compare_local) { if (IS_LOCAL_REL(reln)) mdwriteback(reln, forknum, blocknum, nblocks); } } /* * While function is defined in the neon extension it's used within neon_test_utils directly. * To avoid breaking tests in the runtime please keep function signature in sync. 
*/ void neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, void *buffer) { communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); } static void compare_with_local(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void* buffer, XLogRecPtr request_lsn) { if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { char pageserver_masked[BLCKSZ]; PGIOAlignedBlock mdbuf; PGIOAlignedBlock mdbuf_masked; #if PG_MAJORVERSION_NUM >= 17 { void* mdbuffers[1] = { mdbuf.data }; mdreadv(reln, forkNum, blkno, mdbuffers, 1); } #else mdread(reln, forkNum, blkno, mdbuf.data); #endif memcpy(pageserver_masked, buffer, BLCKSZ); memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ); if (PageIsNew((Page) mdbuf.data)) { if (!PageIsNew((Page) pageserver_masked)) { neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, hexdump_page(buffer)); } } else if (PageIsNew((Page) buffer)) { neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, hexdump_page(mdbuf.data)); } else if (PageGetSpecialSize(mdbuf.data) == 0) { /* assume heap */ RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked.data, blkno); RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) { neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, hexdump_page(mdbuf_masked.data), hexdump_page(pageserver_masked)); } } else if (PageGetSpecialSize(mdbuf.data) == 
MAXALIGN(sizeof(BTPageOpaqueData))) { if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf.data))->btpo_cycleid < MAX_BT_CYCLE_ID) { /* assume btree */ RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked.data, blkno); RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) { neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, hexdump_page(mdbuf_masked.data), hexdump_page(pageserver_masked)); } } } } } #if PG_MAJORVERSION_NUM < 17 /* * neon_read() -- Read the specified block from a relation. */ #if PG_MAJORVERSION_NUM < 16 static void neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer) #else static void neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) #endif { neon_request_lsns request_lsns; bits8 present; void *bufferp; switch (reln->smgr_relpersistence) { case 0: neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); break; case RELPERSISTENCE_PERMANENT: if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) { mdread(reln, forkNum, blkno, buffer); return; } break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdread(reln, forkNum, blkno, buffer); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } /* Try to read PS results if they are available */ communicator_prefetch_pump_state(); neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); present = 0; bufferp = buffer; if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present)) { /* Prefetch hit */ if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH) { compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); } 
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH) { return; } } /* Try to read from local file cache */ if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) { MyNeonCounters->file_cache_hits_total++; if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC) { compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); } if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC) { return; } } neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. */ communicator_prefetch_pump_state(); if (debug_compare_local) { compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); } } #endif /* PG_MAJORVERSION_NUM <= 16 */ #if PG_MAJORVERSION_NUM >= 17 static void compare_with_localv(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void** buffers, BlockNumber nblocks, neon_request_lsns* request_lsns, bits8* read_pages) { if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { for (BlockNumber i = 0; i < nblocks; i++) { if (BITMAP_ISSET(read_pages, i)) { compare_with_local(reln, forkNum, blkno + i, buffers[i], request_lsns[i].request_lsn); } } } } static void neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks) { bits8 read_pages[PG_IOV_MAX / 8]; neon_request_lsns request_lsns[PG_IOV_MAX]; int lfc_result; int prefetch_result; switch (reln->smgr_relpersistence) { case 0: neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); break; case RELPERSISTENCE_PERMANENT: if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) { mdreadv(reln, forknum, blocknum, buffers, nblocks); return; } break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdreadv(reln, forknum, blocknum, buffers, nblocks); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } if 
(nblocks > PG_IOV_MAX)
		neon_log(ERROR, "Read request too large: %d is larger than max %d",
				 nblocks, PG_IOV_MAX);

	/* Try to read PS results if they are available */
	communicator_prefetch_pump_state();

	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
						  request_lsns, nblocks);

	/*
	 * read_pages is a per-block bitmap; start from all-clear.
	 * NOTE(review): presumably each lookup/read step below sets the bits of
	 * the blocks it filled -- confirm against the callee implementations.
	 */
	memset(read_pages, 0, sizeof(read_pages));

	/* Step 1: responses to earlier prefetch requests. */
	prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
												   blocknum, request_lsns, nblocks,
												   buffers, read_pages);

	if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
	{
		compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
	}
	/* Done if prefetch covered every block (unless a deeper compare level is set). */
	if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH && prefetch_result == nblocks)
	{
		return;
	}
	/* At higher compare levels, forget the prefetch hits so the next step re-reads them. */
	if (debug_compare_local > DEBUG_COMPARE_LOCAL_PREFETCH)
	{
		memset(read_pages, 0, sizeof(read_pages));
	}

	/* Try to read from local file cache */
	lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
								  nblocks, read_pages);

	/* Only a positive result is added to the hit counter. */
	if (lfc_result > 0)
		MyNeonCounters->file_cache_hits_total += lfc_result;

	if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
	{
		compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
	}
	if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC && prefetch_result + lfc_result == nblocks)
	{
		/* Read all blocks from LFC, so we're done */
		return;
	}
	/* Same as above: at the highest compare level, re-read everything below. */
	if (debug_compare_local > DEBUG_COMPARE_LOCAL_LFC)
	{
		memset(read_pages, 0, sizeof(read_pages));
	}

	/* Step 3: fetch whatever is still needed through the communicator. */
	communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
							  buffers, nblocks, read_pages);

	/*
	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/ communicator_prefetch_pump_state(); if (debug_compare_local) { memset(read_pages, 0xFF, sizeof(read_pages)); compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); } } #endif static char * hexdump_page(char *page) { StringInfoData result; initStringInfo(&result); for (int i = 0; i < BLCKSZ; i++) { if (i % 8 == 0) appendStringInfo(&result, " "); if (i % 40 == 0) appendStringInfo(&result, "\n"); appendStringInfo(&result, "%02x", (unsigned char) (page[i])); } return result.data; } #if PG_MAJORVERSION_NUM < 17 /* * neon_write() -- Write the supplied block at the appropriate location. * * This is to be used only for updating already-existing blocks of a * relation (ie, those before the current EOF). To extend a relation, * use mdextend(). */ static void #if PG_MAJORVERSION_NUM < 16 neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) #else neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync) #endif { XLogRecPtr lsn; switch (reln->smgr_relpersistence) { case 0: /* This is a bit tricky. Check if the relation exists locally */ if (mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum)) { /* It exists locally. Guess it's unlogged then. */ #if PG_MAJORVERSION_NUM >= 17 mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); #else mdwrite(reln, forknum, blocknum, buffer, skipFsync); #endif /* * We could set relpersistence now that we have determined * that it's local. But we don't dare to do it, because that * would immediately allow reads as well, which shouldn't * happen. We could cache it with a different 'relpersistence' * value, but this isn't performance critical. 
*/
				return;
			}
			break;

		case RELPERSISTENCE_PERMANENT:
			/* A permanent rel matching unlogged_build_rel_info is written locally */
			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
			{
				/*
				 * NOTE(review): this function sits inside an enclosing
				 * "#if PG_MAJORVERSION_NUM < 17", so the >= 17 arms of these
				 * inner conditionals look unreachable -- confirm.
				 */
#if PG_MAJORVERSION_NUM >= 17
				mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
#else
				mdwrite(reln, forknum, blocknum, buffer, skipFsync);
#endif
				return;
			}
			break;

		case RELPERSISTENCE_TEMP:
		case RELPERSISTENCE_UNLOGGED:
#if PG_MAJORVERSION_NUM >= 17
			mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
#else
			mdwrite(reln, forknum, blocknum, buffer, skipFsync);
#endif
			return;

		default:
			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
	}

	/* WAL-log the page, then read back the page's LSN for the trace message. */
	neon_wallog_page(reln, forknum, blocknum, buffer, false);

	lsn = PageGetLSN((Page) buffer);
	neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
		 forknum, blocknum,
		 (uint32) (lsn >> 32), (uint32) lsn);

	/* Also write the page to the local file cache. */
	lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);

	communicator_prefetch_pump_state();

	/* In compare mode, mirror the write to the local copy of the relation. */
	if (debug_compare_local)
	{
		if (IS_LOCAL_REL(reln))
		{
#if PG_MAJORVERSION_NUM >= 17
			mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
#else
			mdwrite(reln, forknum, blocknum, buffer, skipFsync);
#endif
		}
	}
}
#endif



#if PG_MAJORVERSION_NUM >= 17
/*
 * neon_writev() -- Vectored counterpart of neon_write() for PG >= 17:
 * writes nblocks consecutive blocks starting at blkno.
 */
static void
neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
			 const void **buffers, BlockNumber nblocks, bool skipFsync)
{
	switch (reln->smgr_relpersistence)
	{
		case 0:
			/* This is a bit tricky. Check if the relation exists locally */
			if (mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum))
			{
				/* It exists locally. Guess it's unlogged then. */
				mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);

				/*
				 * We could set relpersistence now that we have determined
				 * that it's local. But we don't dare to do it, because that
				 * would immediately allow reads as well, which shouldn't
				 * happen. We could cache it with a different 'relpersistence'
				 * value, but this isn't performance critical.
*/ return; } break; case RELPERSISTENCE_PERMANENT: if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) { mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); return; } break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false); lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); communicator_prefetch_pump_state(); if (debug_compare_local) { if (IS_LOCAL_REL(reln)) mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); } } #endif /* * neon_nblocks() -- Get the number of blocks stored in a relation. */ static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum) { BlockNumber n_blocks; neon_request_lsns request_lsns; switch (reln->smgr_relpersistence) { case 0: neon_log(ERROR, "cannot call smgrnblocks() on rel with unknown persistence"); break; case RELPERSISTENCE_PERMANENT: if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) { return mdnblocks(reln, forknum); } break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: return mdnblocks(reln, forknum); default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks)) { neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks", RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, n_blocks); return n_blocks; } neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns); update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, 
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), n_blocks); return n_blocks; } /* * neon_db_size() -- Get the size of the database in bytes. */ int64 neon_dbsize(Oid dbNode) { int64 db_size; neon_request_lsns request_lsns; NRelFileInfo dummy_node = {0}; neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); db_size = communicator_dbsize(dbNode, &request_lsns); neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); return db_size; } /* * neon_truncate() -- Truncate relation to specified number of blocks. */ static void neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks) { XLogRecPtr lsn; switch (reln->smgr_relpersistence) { case 0: neon_log(ERROR, "cannot call smgrtruncate() on rel with unknown persistence"); break; case RELPERSISTENCE_PERMANENT: if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) { mdtruncate(reln, forknum, old_blocks, nblocks); return; } break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdtruncate(reln, forknum, old_blocks, nblocks); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks); /* * Truncating a relation drops all its buffers from the buffer cache * without calling smgrwrite() on them. But we must account for that in * our tracking of last-written-LSN all the same: any future smgrnblocks() * request must return the new size after the truncation. We don't know * what the LSN of the truncation record was, so be conservative and use * the most recently inserted WAL record's LSN. */ lsn = GetXLogInsertRecPtr(); lsn = nm_adjust_lsn(lsn); /* * Flush it, too. We don't actually care about it here, but let's uphold * the invariant that last-written LSN <= flush LSN. */ XLogFlush(lsn); /* * Truncate may affect several chunks of relations. 
So we should either * update last written LSN for all of them, or update LSN for "dummy" * metadata block. Second approach seems more efficient. If the relation * is extended again later, the extension will update the last-written LSN * for the extended pages, so there's no harm in leaving behind obsolete * entries for the truncated chunks. */ neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum); if (debug_compare_local) { if (IS_LOCAL_REL(reln)) mdtruncate(reln, forknum, old_blocks, nblocks); } } /* * neon_immedsync() -- Immediately sync a relation to stable storage. * * Note that only writes already issued are synced; this routine knows * nothing of dirty buffers that may exist inside the buffer manager. We * sync active and inactive segments; smgrDoPendingSyncs() relies on this. * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of * some segment, then mdtruncate() renders that segment inactive. If we * crash before the next checkpoint syncs the newly-inactive segment, that * segment may survive recovery, reintroducing unwanted data into the table. 
*/ static void neon_immedsync(SMgrRelation reln, ForkNumber forknum) { switch (reln->smgr_relpersistence) { case 0: neon_log(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence"); break; case RELPERSISTENCE_PERMANENT: break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdimmedsync(reln, forknum); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop"); communicator_prefetch_pump_state(); if (debug_compare_local) { if (IS_LOCAL_REL(reln)) mdimmedsync(reln, forknum); } } #if PG_MAJORVERSION_NUM >= 17 static void neon_registersync(SMgrRelation reln, ForkNumber forknum) { switch (reln->smgr_relpersistence) { case 0: neon_log(ERROR, "cannot call smgrregistersync() on rel with unknown persistence"); break; case RELPERSISTENCE_PERMANENT: break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdregistersync(reln, forknum); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } neon_log(SmgrTrace, "[NEON_SMGR] registersync noop"); if (debug_compare_local) { if (IS_LOCAL_REL(reln)) mdimmedsync(reln, forknum); } } #endif /* * neon_start_unlogged_build() -- Starting build operation on a rel. * * Some indexes are built in two phases, by first populating the table with * regular inserts, using the shared buffer cache but skipping WAL-logging, * and WAL-logging the whole relation after it's done. Neon relies on the * WAL to reconstruct pages, so we cannot use the page server in the * first phase when the changes are not logged. */ static void neon_start_unlogged_build(SMgrRelation reln) { /* * Currently, there can be only one unlogged relation build operation in * progress at a time. That's enough for the current usage. 
*/ if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) neon_log(ERROR, "unlogged relation build is already in progress"); ereport(SmgrTrace, (errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u", RelFileInfoFmt(InfoFromSMgrRel(reln))))); switch (reln->smgr_relpersistence) { case 0: neon_log(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence"); break; case RELPERSISTENCE_PERMANENT: break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: unlogged_build_rel_info = InfoFromSMgrRel(reln); unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT; if (debug_compare_local) { if (!IsParallelWorker()) mdcreate(reln, INIT_FORKNUM, true); } return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } #if PG_MAJORVERSION_NUM >= 17 /* * We have to disable this check for pg14-16 because sorted build of GIST index requires * to perform unlogged build several times */ if (smgrnblocks(reln, MAIN_FORKNUM) != 0) neon_log(ERROR, "cannot perform unlogged index build, index is not empty "); #endif unlogged_build_rel_info = InfoFromSMgrRel(reln); unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; /* * Create the local file. In a parallel build, the leader is expected to * call this first and do it. * * FIXME: should we pass isRedo true to create the tablespace dir if it * doesn't exist? Is it needed? */ if (!IsParallelWorker()) { mdcreate(reln, debug_compare_local ? INIT_FORKNUM : MAIN_FORKNUM, false); } } /* * neon_finish_unlogged_build_phase_1() * * Call this after you have finished populating a relation in unlogged mode, * before you start WAL-logging it. 
*/ static void neon_finish_unlogged_build_phase_1(SMgrRelation reln) { Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))); ereport(SmgrTrace, (errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u", RelFileInfoFmt((unlogged_build_rel_info))))); if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT) return; Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); /* * In a parallel build, (only) the leader process performs the 2nd * phase. */ if (IsParallelWorker()) { NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; } else unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; } /* * neon_end_unlogged_build() -- Finish an unlogged rel build. * * Call this after you have finished WAL-logging a relation that was * first populated without WAL-logging. * * This removes the local copy of the rel, since it's now been fully * WAL-logged and is present in the page server. */ static void neon_end_unlogged_build(SMgrRelation reln) { NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln); Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))); ereport(SmgrTrace, (errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u", RelFileInfoFmt(unlogged_build_rel_info)))); if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT) { XLogRecPtr recptr; BlockNumber nblocks; Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2); /* * Update the last-written LSN cache. * * The relation is still on local disk so we can get the size by * calling mdnblocks() directly. For the LSN, GetXLogInsertRecPtr() is * very conservative. If we could assume that this function is called * from the same backend that WAL-logged the contents, we could use * XactLastRecEnd here. But better safe than sorry. 
*/ nblocks = mdnblocks(reln, MAIN_FORKNUM); recptr = GetXLogInsertRecPtr(); neon_set_lwlsn_block_range(recptr, InfoFromNInfoB(rinfob), MAIN_FORKNUM, 0, nblocks); neon_set_lwlsn_relation(recptr, InfoFromNInfoB(rinfob), MAIN_FORKNUM); /* Remove local copy */ for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) { neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u", RelFileInfoFmt(InfoFromNInfoB(rinfob)), forknum); forget_cached_relsize(InfoFromNInfoB(rinfob), forknum); lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks); mdclose(reln, forknum); if (!debug_compare_local) { /* use isRedo == true, so that we drop it immediately */ mdunlink(rinfob, forknum, true); } } if (debug_compare_local) mdunlink(rinfob, INIT_FORKNUM, true); } NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; } #define STRPREFIX(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0) static int neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) { XLogRecPtr request_lsn, not_modified_since; SlruKind kind; int n_blocks; neon_request_lsns request_lsns; /* * Compute a request LSN to use, similar to neon_get_request_lsns() but the * logic is a bit simpler. */ if (RecoveryInProgress()) { request_lsn = GetXLogReplayRecPtr(NULL); if (request_lsn == InvalidXLogRecPtr) { /* * This happens in neon startup, we start up without replaying any * records. */ request_lsn = GetRedoStartLsn(); } request_lsn = nm_adjust_lsn(request_lsn); } else request_lsn = UINT64_MAX; /* * GetRedoStartLsn() returns LSN of the basebackup. We know that the SLRU * segment has not changed since the basebackup, because in order to * modify it, we would have had to download it already. And once * downloaded, we never evict SLRU segments from local disk. 
*/ not_modified_since = nm_adjust_lsn(GetRedoStartLsn()); if (STRPREFIX(path, "pg_xact")) kind = SLRU_CLOG; else if (STRPREFIX(path, "pg_multixact/members")) kind = SLRU_MULTIXACT_MEMBERS; else if (STRPREFIX(path, "pg_multixact/offsets")) kind = SLRU_MULTIXACT_OFFSETS; else return -1; request_lsns.request_lsn = request_lsn; request_lsns.not_modified_since = not_modified_since; request_lsns.effective_request_lsn = request_lsn; n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer); return n_blocks; } static void AtEOXact_neon(XactEvent event, void *arg) { switch (event) { case XACT_EVENT_ABORT: case XACT_EVENT_PARALLEL_ABORT: /* * Forget about any build we might have had in progress. The local * file will be unlinked by smgrDoPendingDeletes() */ NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; break; case XACT_EVENT_COMMIT: case XACT_EVENT_PARALLEL_COMMIT: case XACT_EVENT_PREPARE: case XACT_EVENT_PRE_COMMIT: case XACT_EVENT_PARALLEL_PRE_COMMIT: case XACT_EVENT_PRE_PREPARE: if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) { NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), (errmsg(NEON_TAG "unlogged index build was not properly finished")))); } break; } communicator_reconfigure_timeout_if_needed(); } static const struct f_smgr neon_smgr = { .smgr_init = neon_init, .smgr_shutdown = NULL, .smgr_open = neon_open, .smgr_close = neon_close, .smgr_create = neon_create, .smgr_exists = neon_exists, .smgr_unlink = neon_unlink, .smgr_extend = neon_extend, #if PG_MAJORVERSION_NUM >= 16 .smgr_zeroextend = neon_zeroextend, #endif #if PG_MAJORVERSION_NUM >= 17 .smgr_prefetch = neon_prefetch, .smgr_readv = neon_readv, .smgr_writev = neon_writev, #else .smgr_prefetch = neon_prefetch, .smgr_read = neon_read, .smgr_write = neon_write, #endif .smgr_writeback = neon_writeback, .smgr_nblocks = 
neon_nblocks, .smgr_truncate = neon_truncate, .smgr_immedsync = neon_immedsync, #if PG_MAJORVERSION_NUM >= 17 .smgr_registersync = neon_registersync, #endif .smgr_start_unlogged_build = neon_start_unlogged_build, .smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1, .smgr_end_unlogged_build = neon_end_unlogged_build, .smgr_read_slru_segment = neon_read_slru_segment, }; const f_smgr * smgr_neon(ProcNumber backend, NRelFileInfo rinfo) { /* Don't use page server for temp relations */ if (backend != INVALID_PROC_NUMBER) return smgr_standard(backend, rinfo); else return &neon_smgr; } void smgr_init_neon(void) { RegisterXactCallback(AtEOXact_neon, NULL); smgr_init_standard(); neon_init(); communicator_init(); } static void neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, XLogRecPtr end_recptr) { BlockNumber relsize; /* This is only used in WAL replay */ Assert(RecoveryInProgress()); /* Extend the relation if we know its size */ if (get_cached_relsize(rinfo, forknum, &relsize)) { if (relsize < blkno + 1) { update_cached_relsize(rinfo, forknum, blkno + 1); neon_set_lwlsn_relation(end_recptr, rinfo, forknum); } } else { /* * Size was not cached. We populate the cache now, with the size of * the relation measured after this WAL record is applied. * * This length is later reused when we open the smgr to read the * block, which is fine and expected. */ neon_request_lsns request_lsns; neon_get_request_lsns(rinfo, forknum, REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); relsize = communicator_nblocks(rinfo, forknum, &request_lsns); relsize = Max(relsize, blkno + 1); set_cached_relsize(rinfo, forknum, relsize); neon_set_lwlsn_relation(end_recptr, rinfo, forknum); neon_log(SmgrTrace, "Set length to %d", relsize); } } #define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4) /* * TODO: May be it is better to make correspondent function from freespace.c public? 
*/ static BlockNumber get_fsm_physical_block(BlockNumber heapblk) { BlockNumber pages; int leafno; int l; /* * Calculate the logical page number of the first leaf page below the * given page. */ leafno = heapblk / SlotsPerFSMPage; /* Count upper level nodes required to address the leaf page */ pages = 0; for (l = 0; l < FSM_TREE_DEPTH; l++) { pages += leafno + 1; leafno /= SlotsPerFSMPage; } /* Turn the page count into 0-based block number */ return pages - 1; } /* * Return whether we can skip the redo for this block. * * The conditions for skipping the IO are: * * - The block is not in the shared buffers, and * - The block is not in the local file cache * * ... because any subsequent read of the page requires us to read * the new version of the page from the PageServer. We do not * check the local file cache; we instead evict the page from LFC: it * is cheaper than going through the FS calls to read the page, and * limits the number of lock operations used in the REDO process. * * We have one exception to the rules for skipping IO: We always apply * changes to shared catalogs' pages. Although this is mostly out of caution, * catalog updates usually result in backends rebuilding their catalog snapshot, * which means it's quite likely the modified page is going to be used soon. * * It is important to note that skipping WAL redo for a page also means * the page isn't locked by the redo process, as there is no Buffer * being returned, nor is there a buffer descriptor to lock. * This means that any IO that wants to read this block needs to wait * for the WAL REDO process to finish processing the WAL record before * it allows the system to start reading the block, as releasing the * block early could lead to phantom reads. * * For example, REDO for a WAL record that modifies 3 blocks could skip * the first block, wait for a lock on the second, and then modify the * third block. 
 * Without skipping, all blocks would be locked and phantom
 * reads would not occur, but with skipping, a concurrent process could
 * read block 1 with post-REDO contents and read block 3 with pre-REDO
 * contents, where with REDO locking it would wait on block 1 and see
 * block 3 with post-REDO contents only.
 */
static bool
neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
{
	XLogRecPtr	end_recptr = record->EndRecPtr;
	NRelFileInfo rinfo;
	ForkNumber	forknum;
	BlockNumber blkno;
	BufferTag	tag;
	uint32		hash;
	LWLock	   *partitionLock;
	int			buf_id;
	bool		no_redo_needed;

	/* Honor a previously installed filter first. */
	if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
		return true;

#if PG_VERSION_NUM < 150000
	if (!XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno))
		neon_log(PANIC, "failed to locate backup block with ID %d", block_id);
#else
	XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno);
#endif

	CopyNRelFileInfoToBufTag(tag, rinfo);
	tag.forkNum = forknum;
	tag.blockNum = blkno;

	hash = BufTableHashCode(&tag);
	partitionLock = BufMappingPartitionLock(hash);

	/*
	 * Lock the partition of shared_buffers so that it can't be updated
	 * concurrently.
	 */
	LWLockAcquire(partitionLock, LW_SHARED);

	/*
	 * Out of an abundance of caution, we always run redo on shared catalogs,
	 * regardless of whether the block is stored in shared buffers. See also
	 * this function's top comment.
	 */
	if (!OidIsValid(NInfoGetDbOid(rinfo)))
	{
		no_redo_needed = false;
	}
	else
	{
		/* Try to find the relevant buffer */
		buf_id = BufTableLookup(&tag, hash);

		no_redo_needed = buf_id < 0;
	}

	/*
	 * We don't have the buffer in memory: update lwLsn past this record.
	 * Redo is still performed if the page is present in the local file
	 * cache.
	 */
	if (no_redo_needed)
	{
		neon_set_lwlsn_block(end_recptr, rinfo, forknum, blkno);
		/*
		 * Redo changes if page exists in LFC.
		 * We should perform this check after assigning LwLSN to prevent
		 * prefetching of some older version of the page by some other backend.
		 */
		no_redo_needed = !lfc_cache_contains(rinfo, forknum, blkno);
	}

	LWLockRelease(partitionLock);

	/* Keep the cached relation size in step with the replayed block. */
	neon_extend_rel_size(rinfo, forknum, blkno, end_recptr);
	if (forknum == MAIN_FORKNUM)
	{
		/* Also cover the FSM page that corresponds to this heap block. */
		neon_extend_rel_size(rinfo, FSM_FORKNUM, get_fsm_physical_block(blkno), end_recptr);
	}
	return no_redo_needed;
}


================================================
FILE: pgxn/neon/relsize_cache.c
================================================
/*-------------------------------------------------------------------------
 *
 * relsize_cache.c
 *      Relation size cache for better zenith performance.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "neon.h"
#include "neon_pgversioncompat.h"

#include "miscadmin.h"
#include "pagestore_client.h"
#include RELFILEINFO_HDR
#include "storage/smgr.h"
#include "storage/lwlock.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "catalog/pg_tablespace_d.h"
#include "utils/dynahash.h"
#include "utils/guc.h"

/* Hash key: one fork of one relation. */
typedef struct
{
	NRelFileInfo rinfo;
	ForkNumber	forknum;
} RelTag;

/* Hash entry: cached size of the fork identified by 'tag'. */
typedef struct
{
	RelTag		tag;
	BlockNumber size;
	dlist_node	lru_node;		/* LRU list node */
} RelSizeEntry;

/* Control structure allocated with ShmemInitStruct(). */
typedef struct
{
	size_t		size;			/* number of entries currently tracked */
	uint64		hits;			/* lookups that found an entry */
	uint64		misses;			/* lookups that found nothing */
	uint64		writes;			/* set/update calls */
	dlist_head	lru;			/* double linked list for LRU replacement
								 * algorithm */
} RelSizeHashControl;

/*
 * Size of a cache entry is 36 bytes. So this default will take about 2.3 MB,
 * which seems reasonable.
*/ #define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024) static HTAB *relsize_hash; static LWLockId relsize_lock; static int relsize_hash_size = DEFAULT_RELSIZE_HASH_SIZE; static RelSizeHashControl* relsize_ctl; void RelsizeCacheShmemInit(void) { static HASHCTL info; bool found; relsize_ctl = (RelSizeHashControl *) ShmemInitStruct("relsize_hash", sizeof(RelSizeHashControl), &found); if (!found) { relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize"); info.keysize = sizeof(RelTag); info.entrysize = sizeof(RelSizeEntry); relsize_hash = ShmemInitHash("neon_relsize", relsize_hash_size, relsize_hash_size, &info, HASH_ELEM | HASH_BLOBS); relsize_ctl->size = 0; relsize_ctl->hits = 0; relsize_ctl->misses = 0; relsize_ctl->writes = 0; dlist_init(&relsize_ctl->lru); } } bool get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size) { bool found = false; if (relsize_hash_size > 0) { RelTag tag; RelSizeEntry *entry; tag.rinfo = rinfo; tag.forknum = forknum; /* We need exclusive lock here because of LRU list manipulation */ LWLockAcquire(relsize_lock, LW_EXCLUSIVE); entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL); if (entry != NULL) { *size = entry->size; relsize_ctl->hits += 1; found = true; /* Move entry to the LRU list tail */ dlist_delete(&entry->lru_node); dlist_push_tail(&relsize_ctl->lru, &entry->lru_node); } else { relsize_ctl->misses += 1; } LWLockRelease(relsize_lock); } return found; } void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size) { if (relsize_hash_size > 0) { RelTag tag; RelSizeEntry *entry; bool found = false; tag.rinfo = rinfo; tag.forknum = forknum; LWLockAcquire(relsize_lock, LW_EXCLUSIVE); /* * This should actually never happen! Below we check if hash is full and delete least recently user item in this case. * But for further safety we also perform check here. 
*/ while ((entry = hash_search(relsize_hash, &tag, HASH_ENTER_NULL, &found)) == NULL) { RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru)); hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL); Assert(relsize_ctl->size > 0); relsize_ctl->size -= 1; } entry->size = size; if (!found) { if (++relsize_ctl->size == relsize_hash_size) { /* * Remove least recently used elment from the hash. * Hash size after is becomes `relsize_hash_size-1`. * But it is not considered to be a problem, because size of this hash is expecrted large enough and +-1 doesn't matter. */ RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru)); hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL); relsize_ctl->size -= 1; } } else { dlist_delete(&entry->lru_node); } dlist_push_tail(&relsize_ctl->lru, &entry->lru_node); relsize_ctl->writes += 1; LWLockRelease(relsize_lock); } } void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size) { if (relsize_hash_size > 0) { RelTag tag; RelSizeEntry *entry; bool found; tag.rinfo = rinfo; tag.forknum = forknum; LWLockAcquire(relsize_lock, LW_EXCLUSIVE); entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found); if (!found || entry->size < size) entry->size = size; if (!found) { if (++relsize_ctl->size == relsize_hash_size) { RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru)); hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL); relsize_ctl->size -= 1; } } else { dlist_delete(&entry->lru_node); } relsize_ctl->writes += 1; dlist_push_tail(&relsize_ctl->lru, &entry->lru_node); LWLockRelease(relsize_lock); } } void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum) { if (relsize_hash_size > 0) { RelTag tag; RelSizeEntry *entry; tag.rinfo = rinfo; tag.forknum = forknum; LWLockAcquire(relsize_lock, LW_EXCLUSIVE); entry = hash_search(relsize_hash, &tag, 
HASH_REMOVE, NULL); if (entry) { dlist_delete(&entry->lru_node); relsize_ctl->size -= 1; } LWLockRelease(relsize_lock); } } void relsize_hash_init(void) { DefineCustomIntVariable("neon.relsize_hash_size", "Sets the maximum number of cached relation sizes for neon", NULL, &relsize_hash_size, DEFAULT_RELSIZE_HASH_SIZE, 0, INT_MAX, PGC_POSTMASTER, 0, NULL, NULL, NULL); } /* * shmem_request hook: request additional shared resources. We'll allocate or * attach to the shared resources in neon_smgr_shmem_startup(). */ void RelsizeCacheShmemRequest(void) { RequestAddinShmemSpace(sizeof(RelSizeHashControl) + hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); RequestNamedLWLockTranche("neon_relsize", 1); } ================================================ FILE: pgxn/neon/unstable_extensions.c ================================================ #include #include #include "postgres.h" #include "nodes/plannodes.h" #include "nodes/parsenodes.h" #include "tcop/utility.h" #include "utils/errcodes.h" #include "utils/guc.h" #include "neon_pgversioncompat.h" #include "unstable_extensions.h" static bool allow_unstable_extensions = false; static char *unstable_extensions = NULL; static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; static bool list_contains(char const* comma_separated_list, char const* val) { char const* occ = comma_separated_list; size_t val_len = strlen(val); if (val_len == 0) return false; while ((occ = strstr(occ, val)) != NULL) { if ((occ == comma_separated_list || occ[-1] == ',') && (occ[val_len] == '\0' || occ[val_len] == ',')) { return true; } occ += val_len; } return false; } static void CheckUnstableExtension( PlannedStmt *pstmt, const char *queryString, bool readOnlyTree, ProcessUtilityContext context, ParamListInfo params, QueryEnvironment *queryEnv, DestReceiver *dest, QueryCompletion *qc) { Node *parseTree = pstmt->utilityStmt; if (allow_unstable_extensions || unstable_extensions == NULL) goto process; switch (nodeTag(parseTree)) { 
case T_CreateExtensionStmt: { CreateExtensionStmt *stmt = castNode(CreateExtensionStmt, parseTree); if (list_contains(unstable_extensions, stmt->extname)) { ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("%s extension is in beta and may be unstable or introduce backward-incompatible changes.\nWe recommend testing it in a separate, dedicated Neon project.", stmt->extname), errhint("to proceed with installation, run SET neon.allow_unstable_extensions='true'"))); } break; } default: goto process; } process: if (PreviousProcessUtilityHook) { PreviousProcessUtilityHook( pstmt, queryString, readOnlyTree, context, params, queryEnv, dest, qc); } else { standard_ProcessUtility( pstmt, queryString, readOnlyTree, context, params, queryEnv, dest, qc); } } void InitUnstableExtensionsSupport(void) { DefineCustomBoolVariable( "neon.allow_unstable_extensions", "Allow unstable extensions to be installed and used", NULL, &allow_unstable_extensions, false, PGC_USERSET, 0, NULL, NULL, NULL); DefineCustomStringVariable( "neon.unstable_extensions", "List of unstable extensions", NULL, &unstable_extensions, NULL, PGC_SUSET, 0, NULL, NULL, NULL); PreviousProcessUtilityHook = ProcessUtility_hook; ProcessUtility_hook = CheckUnstableExtension; } ================================================ FILE: pgxn/neon/unstable_extensions.h ================================================ #ifndef __NEON_UNSTABLE_EXTENSIONS_H__ #define __NEON_UNSTABLE_EXTENSIONS_H__ void InitUnstableExtensionsSupport(void); #endif ================================================ FILE: pgxn/neon/walproposer.c ================================================ /*------------------------------------------------------------------------- * * walproposer.c * * Proposer/leader part of the total order broadcast protocol between postgres * and WAL safekeepers. * * We have two ways of launching WalProposer: * * 1. As a background worker which will pretend to be physical WalSender. 
 *      WalProposer will receive notifications about newly available WAL and
 *      will immediately broadcast it to alive safekeepers.
 *
 *   2. As a standalone utility by running `postgres --sync-safekeepers`. That
 *      is needed to establish the LSN from which it is safe to start postgres.
 *      More specifically, it addresses the following problems:
 *
 *      a) Chicken-or-the-egg problem: compute postgres needs a data directory
 *         with non-rel files that are downloaded from the pageserver by
 *         calling basebackup@LSN. This LSN is not arbitrary: it must include
 *         all previously committed transactions and is defined through
 *         consensus voting, which happens... in walproposer, a part of the
 *         compute node.
 *
 *      b) Just establishing such an LSN is not enough; we must also actually
 *         commit it and make sure there is a safekeeper who knows this LSN is
 *         committed, so that WAL before it can be streamed to the pageserver
 *         -- otherwise basebackup will hang waiting for WAL. Advancing
 *         commit_lsn without playing the consensus game is impossible, so a
 *         speculative 'let's just poll safekeepers, learn start LSN of future
 *         epoch and run basebackup' won't work.
 *
 * Both ways are implemented in the walproposer_pg.c file. This file contains
 * the generic part of walproposer which can be used in both cases, but can
 * also be used as an independent library.
* *------------------------------------------------------------------------- */ #include #include "postgres.h" #include "libpq/pqformat.h" #include "neon.h" #include "walproposer.h" #include "neon_utils.h" /* Prototypes for private functions */ static void WalProposerLoop(WalProposer *wp); static void ShutdownConnection(Safekeeper *sk); static void ResetConnection(Safekeeper *sk); static long TimeToReconnect(WalProposer *wp, TimestampTz now); static void ReconnectSafekeepers(WalProposer *wp); static void AdvancePollState(Safekeeper *sk, uint32 events); static void HandleConnectionEvent(Safekeeper *sk); static void SendStartWALPush(Safekeeper *sk); static void RecvStartWALPushResult(Safekeeper *sk); static void SendProposerGreeting(Safekeeper *sk); static void RecvAcceptorGreeting(Safekeeper *sk); static void SendVoteRequest(Safekeeper *sk); static void RecvVoteResponse(Safekeeper *sk); static bool VotesCollected(WalProposer *wp); static void HandleElectedProposer(WalProposer *wp); static term_t GetHighestTerm(TermHistory *th); static term_t GetLastLogTerm(Safekeeper *sk); static void ProcessPropStartPos(WalProposer *wp); static void SendProposerElected(Safekeeper *sk); static void StartStreaming(Safekeeper *sk); static void SendMessageToNode(Safekeeper *sk); static void BroadcastAppendRequest(WalProposer *wp); static void HandleActiveState(Safekeeper *sk, uint32 events); static bool SendAppendRequests(Safekeeper *sk); static bool RecvAppendResponses(Safekeeper *sk); static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp); static void PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version); static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk); static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); static bool BlockingWrite(Safekeeper *sk, void 
*msg, size_t msg_size, SafekeeperState success_state); static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); static bool AsyncFlush(Safekeeper *sk); static int CompareLsn(const void *a, const void *b); static char *FormatSafekeeperState(Safekeeper *sk); static void AssertEventsOkForState(uint32 events, Safekeeper *sk); static char *FormatEvents(WalProposer *wp, uint32 events); static void UpdateDonorShmem(WalProposer *wp); static char *MembershipConfigurationToString(MembershipConfiguration *mconf); static void MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst); static void MembershipConfigurationFree(MembershipConfiguration *mconf); WalProposer * WalProposerCreate(WalProposerConfig *config, walproposer_api api) { char *host; char *sep; char *port; WalProposer *wp; wp = palloc0(sizeof(WalProposer)); wp->config = config; wp->api = api; wp->localTimeLineID = config->pgTimeline; wp->state = WPS_COLLECTING_TERMS; wp->mconf.generation = INVALID_GENERATION; wp->mconf.members.len = 0; wp->mconf.new_members.len = 0; wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list); /* * If safekeepers list starts with g# parse generation number followed by * : */ if (strncmp(wp->config->safekeepers_list, "g#", 2) == 0) { char *endptr; errno = 0; wp->safekeepers_generation = strtoul(wp->config->safekeepers_list + 2, &endptr, 10); if (errno != 0) { wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m"); } if (*endptr != ':') { wp_log(FATAL, "failed to parse neon.safekeepers: no colon after generation"); } /* Skip past : to the first hostname. 
*/ host = endptr + 1; } else { wp->safekeepers_generation = INVALID_GENERATION; host = wp->config->safekeepers_list; } wp_log(LOG, "safekeepers_generation=%u", wp->safekeepers_generation); for (; host != NULL && *host != '\0'; host = sep) { port = strchr(host, ':'); if (port == NULL) { wp_log(FATAL, "port is not specified"); } *port++ = '\0'; sep = strchr(port, ','); if (sep != NULL) *sep++ = '\0'; if (wp->n_safekeepers + 1 >= MAX_SAFEKEEPERS) { wp_log(FATAL, "too many safekeepers"); } wp->safekeeper[wp->n_safekeepers].host = host; wp->safekeeper[wp->n_safekeepers].port = port; wp->safekeeper[wp->n_safekeepers].state = SS_OFFLINE; wp->safekeeper[wp->n_safekeepers].active_state = SS_ACTIVE_SEND; wp->safekeeper[wp->n_safekeepers].wp = wp; /* BEGIN_HADRON */ wp->safekeeper[wp->n_safekeepers].index = wp->n_safekeepers; /* END_HADRON */ { Safekeeper *sk = &wp->safekeeper[wp->n_safekeepers]; int written = 0; written = snprintf((char *) &sk->conninfo, MAXCONNINFO, "%s host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", wp->config->safekeeper_conninfo_options, sk->host, sk->port, wp->config->neon_timeline, wp->config->neon_tenant); if (written > MAXCONNINFO || written < 0) wp_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); } initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf); wp->safekeeper[wp->n_safekeepers].startStreamingAt = InvalidXLogRecPtr; wp->safekeeper[wp->n_safekeepers].streamingAt = InvalidXLogRecPtr; wp->n_safekeepers += 1; } if (wp->n_safekeepers < 1) { wp_log(FATAL, "safekeepers addresses are not specified"); } wp->quorum = wp->n_safekeepers / 2 + 1; if (wp->config->proto_version != 2 && wp->config->proto_version != 3) wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version); if (wp->safekeepers_generation > INVALID_GENERATION && wp->config->proto_version < 3) wp_log(FATAL, "enabling generations requires protocol version 3"); wp_log(LOG, "using safekeeper 
protocol version %d", wp->config->proto_version); /* BEGIN_HADRON */ wp->api.reset_safekeeper_statuses_for_metrics(wp, wp->n_safekeepers); /* END_HADRON */ /* Fill the greeting package */ wp->greetRequest.pam.tag = 'g'; if (!wp->config->neon_tenant) wp_log(FATAL, "neon.tenant_id is not provided"); wp->greetRequest.tenant_id = wp->config->neon_tenant; if (!wp->config->neon_timeline) wp_log(FATAL, "neon.timeline_id is not provided"); wp->greetRequest.timeline_id = wp->config->neon_timeline; wp->greetRequest.pg_version = PG_VERSION_NUM; wp->greetRequest.system_id = wp->config->systemId; wp->greetRequest.wal_seg_size = wp->config->wal_segment_size; wp->api.init_event_set(wp); return wp; } void WalProposerFree(WalProposer *wp) { MembershipConfigurationFree(&wp->mconf); for (int i = 0; i < wp->n_safekeepers; i++) { Safekeeper *sk = &wp->safekeeper[i]; Assert(sk->outbuf.data != NULL); pfree(sk->outbuf.data); MembershipConfigurationFree(&sk->greetResponse.mconf); if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); sk->voteResponse.termHistory.entries = NULL; } if (wp->propTermHistory.entries != NULL) pfree(wp->propTermHistory.entries); wp->propTermHistory.entries = NULL; pfree(wp); } static bool WalProposerGenerationsEnabled(WalProposer *wp) { return wp->safekeepers_generation != INVALID_GENERATION; } /* * Create new AppendRequest message and start sending it. This function is * called from walsender every time the new WAL is available. */ void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPtr endpos) { Assert(startpos == wp->availableLsn && endpos >= wp->availableLsn); wp->availableLsn = endpos; BroadcastAppendRequest(wp); } /* * Advance the WAL proposer state machine, waiting each time for events to occur. * Will exit only when latch is set, i.e. new WAL should be pushed from walsender * to walproposer. 
*/ void WalProposerPoll(WalProposer *wp) { while (true) { Safekeeper *sk = NULL; int rc = 0; uint32 events = 0; TimestampTz now = wp->api.get_current_timestamp(wp); long timeout = TimeToReconnect(wp, now); rc = wp->api.wait_event_set(wp, timeout, &sk, &events); /* Exit loop if latch is set (we got new WAL) */ if (rc == 1 && (events & WL_LATCH_SET)) break; /* * If the event contains something that one of our safekeeper states * was waiting for, we'll advance its state. */ if (rc == 1 && (events & WL_SOCKET_MASK)) { Assert(sk != NULL); AdvancePollState(sk, events); } /* * If the timeout expired, attempt to reconnect to any safekeepers * that we dropped */ ReconnectSafekeepers(wp); if (rc == 0) /* timeout expired */ { /* * Ensure flushrecptr is set to a recent value. This fixes a case * where we've not been notified of new WAL records when we were * planning on consuming them. */ if (!wp->config->syncSafekeepers) { XLogRecPtr flushed = wp->api.get_flush_rec_ptr(wp); if (flushed > wp->availableLsn) break; } } now = wp->api.get_current_timestamp(wp); /* timeout expired: poll state */ if (rc == 0 || TimeToReconnect(wp, now) <= 0) { /* * If no WAL was generated during timeout (and we have already * collected the quorum), then send empty keepalive message */ if (wp->availableLsn != InvalidXLogRecPtr) { BroadcastAppendRequest(wp); } /* * Abandon connection attempts which take too long. 
*/ now = wp->api.get_current_timestamp(wp); for (int i = 0; i < wp->n_safekeepers; i++) { sk = &wp->safekeeper[i]; if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now, wp->config->safekeeper_connection_timeout)) { wp_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that", sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout); ShutdownConnection(sk); } } } } } void WalProposerStart(WalProposer *wp) { /* Initiate connections to all safekeeper nodes */ for (int i = 0; i < wp->n_safekeepers; i++) { ResetConnection(&wp->safekeeper[i]); } WalProposerLoop(wp); } static void WalProposerLoop(WalProposer *wp) { while (true) WalProposerPoll(wp); } /* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */ static void ShutdownConnection(Safekeeper *sk) { sk->state = SS_OFFLINE; sk->streamingAt = InvalidXLogRecPtr; /* BEGIN_HADRON */ sk->wp->api.update_safekeeper_status_for_metrics(sk->wp, sk->index, 0); /* END_HADRON */ MembershipConfigurationFree(&sk->greetResponse.mconf); if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); sk->voteResponse.termHistory.entries = NULL; sk->wp->api.conn_finish(sk); sk->wp->api.rm_safekeeper_event_set(sk); } /* * This function is called to establish new connection or to reestablish * connection in case of connection failure. * * On success, sets the state to SS_CONNECTING_WRITE. */ static void ResetConnection(Safekeeper *sk) { WalProposer *wp = sk->wp; if (sk->state != SS_OFFLINE) { ShutdownConnection(sk); } /* * Try to establish new connection, it will update sk->conn. */ wp->api.conn_connect_start(sk); /* * PQconnectStart won't actually start connecting until we run * PQconnectPoll. Before we do that though, we need to check that it * didn't immediately fail. 
*/ if (wp->api.conn_status(sk) == WP_CONNECTION_BAD) { /*--- * According to libpq docs: * "If the result is CONNECTION_BAD, the connection attempt has already failed, * typically because of invalid connection parameters." * We should report this failure. Do not print the exact `conninfo` as it may * contain e.g. password. The error message should already provide enough information. * * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS */ wp_log(WARNING, "immediate failure to connect with node '%s:%s':\n\terror: %s", sk->host, sk->port, wp->api.conn_error_message(sk)); /* * Even though the connection failed, we still need to clean up the * object */ wp->api.conn_finish(sk); return; } /* * The documentation for PQconnectStart states that we should call * PQconnectPoll in a loop until it returns PGRES_POLLING_OK or * PGRES_POLLING_FAILED. The other two possible returns indicate whether * we should wait for reading or writing on the socket. For the first * iteration of the loop, we're expected to wait until the socket becomes * writable. * * The wording of the documentation is a little ambiguous; thankfully * there's an example in the postgres source itself showing this behavior. * (see libpqrcv_connect, defined in * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) */ wp_log(LOG, "connecting with node %s:%s", sk->host, sk->port); sk->state = SS_CONNECTING_WRITE; sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); wp->api.add_safekeeper_event_set(sk, WL_SOCKET_WRITEABLE); return; } /* * How much milliseconds left till we should attempt reconnection to * safekeepers? Returns 0 if it is already high time, -1 if we never reconnect * (do we actually need this?). 
*/
static long
TimeToReconnect(WalProposer *wp, TimestampTz now)
{
	TimestampTz passed;
	TimestampTz till_reconnect;

	/* non-positive timeout disables reconnection */
	if (wp->config->safekeeper_reconnect_timeout <= 0)
		return -1;

	passed = now - wp->last_reconnect_attempt;

	/*
	 * The timeout is scaled by 1000 to match `passed` and the result is
	 * scaled back down on return -- presumably ms <-> us conversion, given
	 * that the return value is documented as milliseconds; confirm against
	 * the TimestampTz unit. The division truncates, so a remainder below
	 * 1000 also yields 0.
	 */
	till_reconnect = wp->config->safekeeper_reconnect_timeout * 1000 - passed;
	if (till_reconnect <= 0)
		return 0;
	return (long) (till_reconnect / 1000);
}

/*
 * If the timeout has expired, attempt to reconnect to all offline
 * safekeepers. Records the attempt time in wp->last_reconnect_attempt, which
 * restarts the reconnect timer. Does nothing when TimeToReconnect returns
 * anything other than 0 (not yet due, or reconnection disabled).
 */
static void
ReconnectSafekeepers(WalProposer *wp)
{
	TimestampTz now = wp->api.get_current_timestamp(wp);

	if (TimeToReconnect(wp, now) == 0)
	{
		wp->last_reconnect_attempt = now;
		for (int i = 0; i < wp->n_safekeepers; i++)
		{
			if (wp->safekeeper[i].state == SS_OFFLINE)
				ResetConnection(&wp->safekeeper[i]);
		}
	}
}

/*
 * Performs the logic for advancing the state machine of the specified safekeeper,
 * given that a certain set of events has occurred.
 *
 * Dispatches on sk->state; each handler is responsible for moving the
 * safekeeper to its next state and updating the event set.
 */
static void
AdvancePollState(Safekeeper *sk, uint32 events)
{
#ifdef WALPROPOSER_LIB			/* wp_log needs wp in lib build */
	WalProposer *wp = sk->wp;
#endif

	/*
	 * Sanity check. We assume further down that the operations don't block
	 * because the socket is ready.
	 */
	AssertEventsOkForState(events, sk);

	/* Execute the code corresponding to the current state */
	switch (sk->state)
	{
			/*
			 * safekeepers are only taken out of SS_OFFLINE by calls to
			 * ResetConnection
			 */
		case SS_OFFLINE:
			wp_log(FATAL, "unexpected safekeeper %s:%s state advancement: is offline",
				   sk->host, sk->port);
			break;				/* actually unreachable, but prevents
								 * -Wimplicit-fallthrough */

			/*
			 * Both connecting states run the same logic. The only difference
			 * is the events they're expecting
			 */
		case SS_CONNECTING_READ:
		case SS_CONNECTING_WRITE:
			HandleConnectionEvent(sk);
			break;

			/*
			 * Waiting for a successful CopyBoth response.
			 */
		case SS_WAIT_EXEC_RESULT:
			RecvStartWALPushResult(sk);
			break;

			/*
			 * Finish handshake comms: receive information about the
			 * safekeeper.
			 */
		case SS_HANDSHAKE_RECV:
			RecvAcceptorGreeting(sk);
			break;

			/*
			 * Voting is an idle state - we don't expect any events to
			 * trigger. Refer to the execution of SS_HANDSHAKE_RECV to see how
			 * nodes are transferred from SS_WAIT_VOTING to sending actual
			 * vote requests. Any event here is treated as EOF and the
			 * connection is reset.
			 */
		case SS_WAIT_VOTING:
			wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
				   sk->port, FormatSafekeeperState(sk));
			ResetConnection(sk);
			return;

			/* Read the safekeeper response for our candidate */
		case SS_WAIT_VERDICT:
			RecvVoteResponse(sk);
			break;

			/* Flush proposer announcement message */
		case SS_SEND_ELECTED_FLUSH:

			/*
			 * AsyncFlush ensures we only move on to SS_ACTIVE once the flush
			 * completes. If we still have more to do, we'll wait until the
			 * next poll comes along.
			 */
			if (!AsyncFlush(sk))
				return;

			/* flush is done, event set and state will be updated later */
			StartStreaming(sk);
			break;

			/*
			 * Idle state for waiting votes from quorum. As with
			 * SS_WAIT_VOTING, an event here is treated as EOF.
			 */
		case SS_WAIT_ELECTED:
			wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
				   sk->port, FormatSafekeeperState(sk));
			ResetConnection(sk);
			return;

			/*
			 * Active state is used for streaming WAL and receiving feedback.
			 */
		case SS_ACTIVE:
			HandleActiveState(sk, events);
			break;
	}
}

/*
 * Drive one step of asynchronous connection establishment for `sk` (states
 * SS_CONNECTING_READ / SS_CONNECTING_WRITE).
 *
 * Polls the connection, re-registers the socket in the event set with the
 * events the next step needs, and once the connection is established sends
 * START_WAL_PUSH. On failure the connection is shut down and left for
 * ReconnectSafekeepers to retry.
 */
static void
HandleConnectionEvent(Safekeeper *sk)
{
	WalProposer *wp = sk->wp;
	WalProposerConnectPollStatusType result = wp->api.conn_connect_poll(sk);

	/* The new set of events we'll wait on, after updating */
	uint32		new_events = WL_NO_EVENTS;

	switch (result)
	{
		case WP_CONN_POLLING_OK:
			wp_log(LOG, "connected with node %s:%s", sk->host,
				   sk->port);
			sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);

			/*
			 * We have to pick some event to update event set. We'll
			 * eventually need the socket to be readable, so we go with that.
			 */
			new_events = WL_SOCKET_READABLE;
			break;

			/*
			 * If we need to poll to finish connecting, continue doing that
			 */
		case WP_CONN_POLLING_READING:
			sk->state = SS_CONNECTING_READ;
			new_events = WL_SOCKET_READABLE;
			break;
		case WP_CONN_POLLING_WRITING:
			sk->state = SS_CONNECTING_WRITE;
			new_events = WL_SOCKET_WRITEABLE;
			break;

		case WP_CONN_POLLING_FAILED:
			wp_log(WARNING, "failed to connect to node '%s:%s': %s",
				   sk->host, sk->port, wp->api.conn_error_message(sk));

			/*
			 * If connecting failed, we don't want to restart the connection
			 * because that might run us into a loop. Instead, shut it down --
			 * it'll naturally restart at a slower interval on calls to
			 * ReconnectSafekeepers.
			 */
			ShutdownConnection(sk);
			return;
	}

	/*
	 * Because PQconnectPoll can change the socket, we have to un-register the
	 * old event and re-register an event on the new socket.
	 */
	wp->api.rm_safekeeper_event_set(sk);
	wp->api.add_safekeeper_event_set(sk, new_events);

	/* If we successfully connected, send START_WAL_PUSH query */
	if (result == WP_CONN_POLLING_OK)
		SendStartWALPush(sk);
}

/*
 * Send "START_WAL_PUSH" message as an empty query to the safekeeper. Performs
 * a blocking send, then immediately moves to SS_WAIT_EXEC_RESULT. If something
 * goes wrong, change state to SS_OFFLINE and shutdown the connection.
 */
static void
SendStartWALPush(Safekeeper *sk)
{
	WalProposer *wp = sk->wp;

	/* Forbid implicit timeline creation if generations are enabled. */
	char	   *allow_timeline_creation = WalProposerGenerationsEnabled(wp) ? "false" : "true";
/* size of the on-stack buffer the command text is formatted into */
#define CMD_LEN 512
	char		cmd[CMD_LEN];

	snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d', allow_timeline_creation '%s')", wp->config->proto_version, allow_timeline_creation);
	if (!wp->api.conn_send_query(sk, cmd))
	{
		wp_log(WARNING, "failed to send '%s' query to safekeeper %s:%s: %s",
			   cmd, sk->host, sk->port, wp->api.conn_error_message(sk));
		ShutdownConnection(sk);
		return;
	}
	sk->state = SS_WAIT_EXEC_RESULT;
	wp->api.update_event_set(sk, WL_SOCKET_READABLE);
}

/*
 * Handle read-readiness in SS_WAIT_EXEC_RESULT: fetch the result of the
 * START_WAL_PUSH query. On CopyBoth success proceed to the greeting; if more
 * input is needed keep waiting; on any failure shut the connection down.
 */
static void
RecvStartWALPushResult(Safekeeper *sk)
{
	WalProposer *wp = sk->wp;

	switch (wp->api.conn_get_query_result(sk))
	{
			/*
			 * Successful result, move on to starting the handshake
			 */
		case WP_EXEC_SUCCESS_COPYBOTH:

			SendProposerGreeting(sk);
			break;

			/*
			 * Needs repeated calls to finish. Wait until the socket is
			 * readable
			 */
		case WP_EXEC_NEEDS_INPUT:

			/*
			 * SS_WAIT_EXEC_RESULT is always reached through an event, so we
			 * don't need to update the event set
			 */
			break;

		case WP_EXEC_FAILED:
			wp_log(WARNING, "failed to send query to safekeeper %s:%s: %s",
				   sk->host, sk->port, wp->api.conn_error_message(sk));
			ShutdownConnection(sk);
			return;

			/*
			 * Unexpected result -- fundamentally an error, but we want to
			 * produce a custom message, rather than a generic "something went
			 * wrong"
			 */
		case WP_EXEC_UNEXPECTED_SUCCESS:
			wp_log(WARNING, "received bad response from safekeeper %s:%s query execution",
				   sk->host, sk->port);
			ShutdownConnection(sk);
			return;
	}
}

/*
 * Start handshake: first of all send information about the
 * walproposer. After sending, we wait on SS_HANDSHAKE_RECV for
 * a response to finish the handshake.
*/
static void
SendProposerGreeting(Safekeeper *sk)
{
	WalProposer *wp = sk->wp;
	char	   *mconf_toml = MembershipConfigurationToString(&wp->greetRequest.mconf);

	wp_log(LOG, "sending ProposerGreeting to safekeeper %s:%s with mconf = %s", sk->host, sk->port, mconf_toml);
	/* the string was only needed for the log line above */
	pfree(mconf_toml);

	PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->greetRequest,
					   &sk->outbuf, wp->config->proto_version);

	/*
	 * On failure, logging & resetting the connection is handled. We just need
	 * to handle the control flow.
	 */
	BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV);
}

/*
 * Assuming `sk` sent its node id, find such member(s) in wp->mconf and set ptr in
 * members_safekeepers & new_members_safekeepers to sk.
 *
 * Does nothing if `sk` has not reached SS_WAIT_VOTING yet or reported node
 * id 0. A slot that is already mapped to a different connection is
 * overwritten after logging a warning.
 */
static void
UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk)
{
	/* members_safekeepers etc are fixed size, sanity check mconf size */
	if (wp->mconf.members.len > MAX_SAFEKEEPERS)
		wp_log(FATAL, "too many members %d in mconf", wp->mconf.members.len);
	if (wp->mconf.new_members.len > MAX_SAFEKEEPERS)
		wp_log(FATAL, "too many new_members %d in mconf", wp->mconf.new_members.len);

	/* node id is not known until greeting is received */
	if (sk->state < SS_WAIT_VOTING)
		return;

	/* 0 is assumed to be invalid node id, should never happen */
	if (sk->greetResponse.nodeId == 0)
	{
		wp_log(WARNING, "safekeeper %s:%s sent zero node id", sk->host, sk->port);
		return;
	}

	for (uint32 i = 0; i < wp->mconf.members.len; i++)
	{
		SafekeeperId *sk_id = &wp->mconf.members.m[i];

		if (sk_id->node_id == sk->greetResponse.nodeId)
		{
			/*
			 * If mconf or list of safekeepers to connect to changed (the
			 * latter always currently goes through restart though),
			 * ResetMemberSafekeeperPtrs is expected to be called before
			 * UpdateMemberSafekeeperPtr. So, other value suggests that we are
			 * connected to the same sk under different host name, complain
			 * about that.
			 */
			if (wp->members_safekeepers[i] != NULL && wp->members_safekeepers[i] != sk)
			{
				wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in members[%u] is already mapped to connection slot %lu",
					   sk_id->node_id, sk_id->host, sk_id->port, i, wp->members_safekeepers[i] - wp->safekeeper);
			}
			/* pointer difference below is the index into wp->safekeeper[] */
			wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in members[%u] mapped to connection slot %lu",
				   sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper);
			wp->members_safekeepers[i] = sk;
		}
	}
	/* repeat for new_members */
	for (uint32 i = 0; i < wp->mconf.new_members.len; i++)
	{
		SafekeeperId *sk_id = &wp->mconf.new_members.m[i];

		if (sk_id->node_id == sk->greetResponse.nodeId)
		{
			if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk)
			{
				wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] is already mapped to connection slot %lu",
					   sk_id->node_id, sk_id->host, sk_id->port, i, wp->new_members_safekeepers[i] - wp->safekeeper);
			}
			wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] mapped to connection slot %lu",
				   sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper);
			wp->new_members_safekeepers[i] = sk;
		}
	}
}

/*
 * Reset wp->members_safekeepers & new_members_safekeepers and refill them.
 * Called after wp changes mconf.
 */
static void
ResetMemberSafekeeperPtrs(WalProposer *wp)
{
	memset(&wp->members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS);
	memset(&wp->new_members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS);
	for (int i = 0; i < wp->n_safekeepers; i++)
	{
		/* only safekeepers that already greeted us have a known node id */
		if (wp->safekeeper[i].state >= SS_WAIT_VOTING)
			UpdateMemberSafekeeperPtr(wp, &wp->safekeeper[i]);
	}
}

/* Size of a majority of `mset`; the set must not be empty. */
static uint32
MsetQuorum(MemberSet *mset)
{
	Assert(mset->len > 0);
	return mset->len / 2 + 1;
}

/* Do n members form a quorum in mset? */
static bool
MsetHasQuorum(MemberSet *mset, uint32 n)
{
	return n >= MsetQuorum(mset);
}

/*
 * TermsCollected helper for a single member set `mset`.
 *
 * `msk` is the member -> safekeeper mapping for mset, i.e.
members_safekeepers
 * or new_members_safekeepers.
 *
 * Appends a description of the greeted members to `s` (for logging) and
 * returns true if members currently in SS_WAIT_VOTING form a quorum of mset.
 */
static bool
TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
{
	uint32		n_greeted = 0;

	for (uint32 i = 0; i < mset->len; i++)
	{
		Safekeeper *sk = msk[i];

		if (sk != NULL && sk->state == SS_WAIT_VOTING)
		{
			if (n_greeted > 0)
				appendStringInfoString(s, ", ");
			appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
			n_greeted++;
		}
	}
	appendStringInfo(s, ", %u/%u total", n_greeted, mset->len);
	return MsetHasQuorum(mset, n_greeted);
}

/*
 * Have we received greeting from enough (quorum) safekeepers to start voting?
 *
 * Side effect: when the answer is yes, wp->propTerm is incremented, i.e. the
 * term we are going to campaign with is chosen here.
 */
static bool
TermsCollected(WalProposer *wp)
{
	StringInfoData s;			/* str for logging */
	bool		collected = false;

	/* legacy: generations disabled */
	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
	{
		collected = wp->n_connected >= wp->quorum;
		if (collected)
		{
			wp->propTerm++;
			wp_log(LOG, "walproposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT ", starting voting", wp->quorum, wp->propTerm);
		}
		return collected;
	}

	/*
	 * With generations enabled, we start campaign only when 1) some mconf is
	 * actually received 2) we have greetings from majority of members as well
	 * as from majority of new_members if it exists.
	 */
	if (wp->mconf.generation == INVALID_GENERATION)
		return false;

	initStringInfo(&s);
	appendStringInfoString(&s, "mset greeters: ");
	if (!TermsCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
		goto res;
	if (wp->mconf.new_members.len > 0)
	{
		appendStringInfoString(&s, ", new_mset greeters: ");
		if (!TermsCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
			goto res;
	}
	wp->propTerm++;
	wp_log(LOG, "walproposer connected to quorum of safekeepers: %s, propTerm=" INT64_FORMAT ", starting voting", s.data, wp->propTerm);
	collected = true;

res:
	/* common exit: the log string is always released */
	pfree(s.data);
	return collected;
}

/*
 * Handle read-readiness in SS_HANDSHAKE_RECV: read the AcceptorGreeting,
 * adopt the safekeeper's membership configuration if its generation is
 * higher than ours, move the safekeeper to SS_WAIT_VOTING and, once enough
 * greetings are collected, start the campaign by sending vote requests.
 */
static void
RecvAcceptorGreeting(Safekeeper *sk)
{
	WalProposer *wp = sk->wp;
	char	   *mconf_toml;

	/*
	 * If our reading doesn't immediately succeed, any necessary error
	 * handling or state setting is taken care of. We can leave any other work
	 * until later.
	 */
	sk->greetResponse.apm.tag = 'g';
	if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
		return;

	mconf_toml = MembershipConfigurationToString(&sk->greetResponse.mconf);
	wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, node_id = %lu, mconf = %s, term=" UINT64_FORMAT,
		   sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term);
	pfree(mconf_toml);

	/*
	 * Adopt mconf of safekeepers if it is higher.
	 */
	if (sk->greetResponse.mconf.generation > wp->mconf.generation)
	{
		/* sanity check before adopting, should never happen */
		if (sk->greetResponse.mconf.members.len == 0)
		{
			wp_log(FATAL, "mconf %u has zero members", sk->greetResponse.mconf.generation);
		}

		/*
		 * If we at least started campaign, restart wp to get elected in the
		 * new mconf. Note: in principle once wp is already elected
		 * re-election is not required, but being conservative here is not
		 * bad.
		 *
		 * TODO: put mconf to shmem to immediately pick it up on start,
		 * otherwise if some safekeeper(s) misses latest mconf and gets
		 * connected the first, it may cause redundant restarts here.
		 *
		 * More generally, it would be nice to restart walproposer (wiping
		 * election state) without restarting the process. In particular, that
		 * would allow sync-safekeepers not to die here if it intersected with
		 * sk migration (as well as remove 1s delay).
		 *
		 * Note that assign_neon_safekeepers also currently restarts the
		 * process, so during normal migration walproposer may restart twice.
		 */
		if (wp->state >= WPS_CAMPAIGN)
		{
			wp_log(FATAL, "restarting to adopt mconf generation %d", sk->greetResponse.mconf.generation);
		}
		MembershipConfigurationFree(&wp->mconf);
		MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf);
		ResetMemberSafekeeperPtrs(wp);
		/* full conf was just logged above */
		wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation);
	}

	/* Protocol is all good, move to voting. */
	sk->state = SS_WAIT_VOTING;

	/* In greeting safekeeper sent its id; update mappings accordingly. */
	UpdateMemberSafekeeperPtr(wp, sk);

	/*
	 * Note: it would be better to track the counter on per safekeeper basis,
	 * but at worst walproposer would restart with 'term rejected', so leave
	 * as is for now.
	 */
	++wp->n_connected;
	if (wp->state == WPS_COLLECTING_TERMS)
	{
		/* We're still collecting terms from the majority. */
		wp->propTerm = Max(sk->greetResponse.term, wp->propTerm);

		/* Quorum is acquired, prepare the vote request. */
		if (TermsCollected(wp))
		{
			wp->state = WPS_CAMPAIGN;
			wp->voteRequest.pam.tag = 'v';
			wp->voteRequest.generation = wp->mconf.generation;
			wp->voteRequest.term = wp->propTerm;
		}
	}
	else if (sk->greetResponse.term > wp->propTerm)
	{
		/* Another compute with higher term is running. */
		wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
			   sk->host, sk->port,
			   sk->greetResponse.term, wp->propTerm);
	}

	/*
	 * If we have quorum, start (or just send vote request to newly connected
	 * node) election, otherwise wait until we have more greetings.
	 */
	if (wp->state == WPS_COLLECTING_TERMS)
	{
		/*
		 * SS_WAIT_VOTING is an idle state; read-ready indicates the
		 * connection closed.
		 */
		wp->api.update_event_set(sk, WL_SOCKET_READABLE);
	}
	else
	{
		/*
		 * Now send voting request to the cohort and wait responses
		 */
		for (int j = 0; j < wp->n_safekeepers; j++)
		{
			if (wp->safekeeper[j].state == SS_WAIT_VOTING)
				SendVoteRequest(&wp->safekeeper[j]);
		}
	}
}

/*
 * Serialize wp->voteRequest into sk's output buffer and send it with a
 * blocking write; on success the safekeeper moves to SS_WAIT_VERDICT.
 */
static void
SendVoteRequest(Safekeeper *sk)
{
	WalProposer *wp = sk->wp;

	PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->voteRequest,
					   &sk->outbuf, wp->config->proto_version);

	/* We have quorum for voting, send our vote request */
	wp_log(LOG, "requesting vote from sk {id = %lu, ep = %s:%s} for generation %u term " UINT64_FORMAT,
		   sk->greetResponse.nodeId, sk->host, sk->port,
		   wp->voteRequest.generation, wp->voteRequest.term);
	/* On failure, logging & resetting is handled */
	BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT);
	/* If successful, wait for read-ready with SS_WAIT_VERDICT */
}

/*
 * Handle read-readiness in SS_WAIT_VERDICT: read the VoteResponse, bail out
 * (FATAL) on a rejection we cannot tolerate, move the safekeeper to
 * SS_WAIT_ELECTED and either check whether this vote completes the election
 * or, if already elected, announce that to this safekeeper only.
 */
static void
RecvVoteResponse(Safekeeper *sk)
{
	WalProposer *wp = sk->wp;

	Assert(wp->state >= WPS_CAMPAIGN);

	sk->voteResponse.apm.tag = 'v';
	if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse))
		return;

	wp_log(LOG,
		   "got VoteResponse from sk {id = %lu, ep = %s:%s}, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
		   sk->greetResponse.nodeId, sk->host, sk->port,
		   sk->voteResponse.generation, sk->voteResponse.term,
		   sk->voteResponse.voteGiven,
		   GetHighestTerm(&sk->voteResponse.termHistory),
		   LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
		   LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn));

	/*
	 * In case of acceptor rejecting our vote, bail out, but only if either it
	 * already lives in strictly higher term (concurrent compute spotted) or
	 * we are not elected yet and thus need the vote.
	 */
	if ((!sk->voteResponse.voteGiven) &&
		(sk->voteResponse.term > wp->propTerm || wp->state == WPS_CAMPAIGN))
	{
		wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
			   sk->host, sk->port,
			   sk->voteResponse.term, wp->propTerm);
	}
	Assert(sk->voteResponse.term == wp->propTerm);

	/* ready for elected message */
	sk->state = SS_WAIT_ELECTED;

	/* Are we already elected? */
	if (wp->state == WPS_CAMPAIGN)
	{
		/* no; check if this vote makes us elected */
		if (VotesCollected(wp))
		{
			wp->state = WPS_ELECTED;
			HandleElectedProposer(wp);
		}
		else
		{
			/* can't do much yet, no quorum */
			return;
		}
	}
	else
	{
		Assert(wp->state == WPS_ELECTED);
		/* send elected only to this sk */
		SendProposerElected(sk);
	}
}

/*
 * VotesCollected helper for a single member set `mset`.
 *
 * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
 * or new_members_safekeepers.
 *
 * Besides counting votes it updates wp->donor, donorLastLogTerm,
 * propTermStartLsn and truncateLsn from every voter it sees, and appends the
 * voters to `s` for logging. Returns true if the voters form a quorum.
 */
static bool
VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
{
	uint32		n_votes = 0;

	for (uint32 i = 0; i < mset->len; i++)
	{
		Safekeeper *sk = msk[i];

		if (sk != NULL && sk->state == SS_WAIT_ELECTED)
		{
			Assert(sk->voteResponse.voteGiven);

			/*
			 * Find the highest vote. NULL check is for the legacy case where
			 * safekeeper might be not initialized with LSN at all and return
			 * 0 LSN in the vote response; we still want to set donor to
			 * something in this case.
			 */
			if (GetLastLogTerm(sk) > wp->donorLastLogTerm ||
				(GetLastLogTerm(sk) == wp->donorLastLogTerm &&
				 sk->voteResponse.flushLsn > wp->propTermStartLsn) ||
				wp->donor == NULL)
			{
				wp->donorLastLogTerm = GetLastLogTerm(sk);
				wp->propTermStartLsn = sk->voteResponse.flushLsn;
				wp->donor = sk;
			}
			wp->truncateLsn = Max(sk->voteResponse.truncateLsn, wp->truncateLsn);

			if (n_votes > 0)
				appendStringInfoString(s, ", ");
			appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
			n_votes++;
		}
	}
	appendStringInfo(s, ", %u/%u total", n_votes, mset->len);
	return MsetHasQuorum(mset, n_votes);
}

/*
 * Checks if enough votes has been collected to get elected and if that's the
 * case finds the highest vote, setting donor, donorLastLogTerm,
 * propTermStartLsn fields. Also sets truncateLsn.
 *
 * Note that these fields are reset and recomputed on every call, whether or
 * not a quorum is reached; wp->donor itself is not reset.
 */
static bool
VotesCollected(WalProposer *wp)
{
	StringInfoData s;			/* str for logging */
	bool		collected = false;

	/* assumed to be called only when not elected yet */
	Assert(wp->state == WPS_CAMPAIGN);

	wp->propTermStartLsn = InvalidXLogRecPtr;
	wp->donorLastLogTerm = 0;
	wp->truncateLsn = InvalidXLogRecPtr;

	/* legacy: generations disabled */
	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
	{
		int			n_ready = 0;

		for (int i = 0; i < wp->n_safekeepers; i++)
		{
			if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
			{
				n_ready++;

				/* same highest-vote selection as in VotesCollectedMset */
				if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
					(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
					 wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn) ||
					wp->donor == NULL)
				{
					wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
					wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
					wp->donor = &wp->safekeeper[i];
				}
				wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
			}
		}
		collected = n_ready >= wp->quorum;
		if (collected)
		{
			wp_log(LOG, "walproposer elected with %d/%d votes", n_ready, wp->n_safekeepers);
		}
		return collected;
	}

	/*
	 * if generations are enabled we're expected to get to voting only when
	 * mconf is established.
	 */
	Assert(wp->mconf.generation != INVALID_GENERATION);

	/*
	 * We must get votes from both msets if both are present.
	 */
	initStringInfo(&s);
	appendStringInfoString(&s, "mset voters: ");
	if (!VotesCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
		goto res;
	if (wp->mconf.new_members.len > 0)
	{
		appendStringInfoString(&s, ", new_mset voters: ");
		if (!VotesCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
			goto res;
	}
	wp_log(LOG, "walproposer elected, %s", s.data);
	collected = true;

res:
	/* common exit: the log string is always released */
	pfree(s.data);
	return collected;
}

/*
 * Called once a majority of acceptors have voted for us and current proposer
 * has been elected.
 *
 * Sends ProposerElected message to all acceptors in SS_WAIT_ELECTED state and starts
 * replication from walsender.
 */
static void
HandleElectedProposer(WalProposer *wp)
{
	ProcessPropStartPos(wp);
	Assert(wp->propTermStartLsn != InvalidXLogRecPtr);

	/*
	 * Synchronously download WAL from the most advanced safekeeper. We do
	 * that only for logical replication (and switching logical walsenders to
	 * neon_walreader is a todo.)
	 */
	if (!wp->api.recovery_download(wp, wp->donor))
	{
		/* NOTE(review): "replicaiton" is a typo in the emitted message */
		wp_log(FATAL, "failed to download WAL for logical replicaiton");
	}

	if (wp->truncateLsn == wp->propTermStartLsn && wp->config->syncSafekeepers)
	{
		/* Sync is not needed: just exit */
		wp->api.finish_sync_safekeepers(wp, wp->propTermStartLsn);
		/* unreachable */
	}

	for (int i = 0; i < wp->n_safekeepers; i++)
	{
		if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
			SendProposerElected(&wp->safekeeper[i]);
	}

	/*
	 * The proposer has been elected, and there will be no quorum waiting
	 * after this point. There will be no safekeeper with state
	 * SS_WAIT_ELECTED also, because that state is used only for quorum
	 * waiting.
	 */

	if (wp->config->syncSafekeepers)
	{
		/*
		 * Send empty message to enforce receiving feedback even from nodes
		 * who are fully recovered; this is required to learn they switched
		 * epoch which finishes sync-safekeepers who doesn't generate any real
		 * new records. Will go away once we switch to async acks.
		 */
		BroadcastAppendRequest(wp);

		/* keep polling until all safekeepers are synced */
		return;
	}

	wp->api.start_streaming(wp, wp->propTermStartLsn);
	/* Should not return here */
}

/* latest term in TermHistory, or 0 if there are no entries */
static term_t
GetHighestTerm(TermHistory *th)
{
	return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0;
}

/* safekeeper's epoch is the term of the highest entry in the log */
static term_t
GetLastLogTerm(Safekeeper *sk)
{
	return GetHighestTerm(&sk->voteResponse.termHistory);
}

/*
 * If LSN points to the page header, skip it: a segment start is advanced by
 * the long page header size, any other XLOG_BLCKSZ boundary by the short
 * one. Other LSNs are returned unchanged.
 */
static XLogRecPtr
SkipXLogPageHeader(WalProposer *wp, XLogRecPtr lsn)
{
	if (XLogSegmentOffset(lsn, wp->config->wal_segment_size) == 0)
	{
		lsn += SizeOfXLogLongPHD;
	}
	else if (lsn % XLOG_BLCKSZ == 0)
	{
		lsn += SizeOfXLogShortPHD;
	}
	return lsn;
}

/*
 * Called after quorum gave votes and proposer starting position (highest vote
 * term + flush LSN) -- is determined (VotesCollected true), this function
 * adopts it: pushes LSN to shmem, sets wp term history, verifies that the
 * basebackup matches.
 */
static void
ProcessPropStartPos(WalProposer *wp)
{
	TermHistory *dth;
	WalproposerShmemState *walprop_shared;

	/* must have collected votes */
	Assert(wp->state == WPS_ELECTED);

	/*
	 * If propTermStartLsn is 0, it means flushLsn is 0 everywhere, we are
	 * bootstrapping and nothing was committed yet. Start streaming from the
	 * basebackup LSN then.
	 *
	 * In case of sync-safekeepers just exit: proceeding is not only pointless
	 * but harmful, because we'd give safekeepers term history starting with
	 * 0/0.
These hacks will go away once we disable implicit timeline * creation on safekeepers and create it with non zero LSN from the start. */ if (wp->propTermStartLsn == InvalidXLogRecPtr) { if (!wp->config->syncSafekeepers) { wp->propTermStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(wp); wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propTermStartLsn)); } else { wp_log(LOG, "elected with zero propTermStartLsn in sync-safekeepers, exiting"); wp->api.finish_sync_safekeepers(wp, wp->propTermStartLsn); } } pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propTermStartLsn); Assert(wp->truncateLsn != InvalidXLogRecPtr || wp->config->syncSafekeepers); /* * We will be generating WAL since propTermStartLsn, so we should set * availableLsn to mark this LSN as the latest available position. */ wp->availableLsn = wp->propTermStartLsn; /* * Proposer's term history is the donor's + its own entry. */ dth = &wp->donor->voteResponse.termHistory; wp->propTermHistory.n_entries = dth->n_entries + 1; wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries); if (dth->n_entries > 0) memcpy(wp->propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm; wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propTermStartLsn; wp_log(LOG, "walproposer elected in term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", wp->propTerm, LSN_FORMAT_ARGS(wp->propTermStartLsn), wp->donor->host, wp->donor->port, LSN_FORMAT_ARGS(wp->truncateLsn)); /* * Ensure the basebackup we are running (at RedoStartLsn) matches LSN * since which we are going to write according to the consensus. If not, * we must bail out, as clog and other non rel data is inconsistent. 
*/ walprop_shared = wp->api.get_shmem_state(wp); if (!wp->config->syncSafekeepers && !walprop_shared->replica_promote) { /* * Basebackup LSN always points to the beginning of the record (not * the page), as StartupXLOG most probably wants it this way. * Safekeepers don't skip header as they need continious stream of * data, so correct LSN for comparison. */ if (SkipXLogPageHeader(wp, wp->propTermStartLsn) != wp->api.get_redo_start_lsn(wp)) { /* * However, allow to proceed if last_log_term on the node which * gave the highest vote (i.e. point where we are going to start * writing) actually had been won by me; plain restart of * walproposer not intervened by concurrent compute which wrote * WAL is ok. * * This avoids compute crash after manual term_bump. */ if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm)))) { /* * Panic to restart PG as we need to retake basebackup. * However, don't dump core as this is kinda expected * scenario. */ disable_core_dump(); wp_log(PANIC, "collected propTermStartLsn %X/%X, but basebackup LSN %X/%X", LSN_FORMAT_ARGS(wp->propTermStartLsn), LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp))); } } } pg_atomic_write_u64(&walprop_shared->mineLastElectedTerm, wp->propTerm); } /* * Determine for sk the starting streaming point and send it message * 1) Announcing we are elected proposer (which immediately advances epoch if * safekeeper is synced, being important for sync-safekeepers) * 2) Communicating starting streaming point -- safekeeper must truncate its WAL * beyond it -- and history of term switching. * * Sets sk->startStreamingAt. 
*/
static void
SendProposerElected(Safekeeper *sk)
{
	WalProposer *wp = sk->wp;
	ProposerElected msg;
	TermHistory *th;
	term_t		lastCommonTerm;
	int			idx;

	/* Now that we are ready to send it's a good moment to create WAL reader */
	wp->api.wal_reader_allocate(sk);

	/*
	 * Determine start LSN by comparing safekeeper's log term switch history
	 * and proposer's, searching for the divergence point.
	 *
	 * Note: there is a vanishingly small chance of no common point even if
	 * there is some WAL on safekeeper, if immediately after bootstrap compute
	 * wrote some WAL on single sk and died; we stream since the beginning
	 * then.
	 */
	th = &sk->voteResponse.termHistory;

	/* We must start somewhere. */
	Assert(wp->propTermHistory.n_entries >= 1);

	/*
	 * Walk both histories in lockstep; idx stops at the first entry whose
	 * term differs, or at the end of the shorter history.
	 */
	for (idx = 0; idx < Min(wp->propTermHistory.n_entries, th->n_entries); idx++)
	{
		if (wp->propTermHistory.entries[idx].term != th->entries[idx].term)
			break;
		/* term must begin everywhere at the same point */
		Assert(wp->propTermHistory.entries[idx].lsn == th->entries[idx].lsn);
	}
	idx--;						/* step back to the last common term */
	if (idx < 0)
	{
		/* safekeeper is empty or no common point, start from the beginning */
		sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
		wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, termHistory.n_entries=%u", sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), wp->propTermHistory.n_entries);
	}
	else
	{
		/*
		 * End of (common) term is the start of the next except it is the last
		 * one; there it is flush_lsn in case of safekeeper or, in case of
		 * proposer, LSN it is currently writing, but then we just pick
		 * safekeeper pos as it obviously can't be higher.
		 */
		if (wp->propTermHistory.entries[idx].term == wp->propTerm)
		{
			sk->startStreamingAt = sk->voteResponse.flushLsn;
		}
		else
		{
			/*
			 * The common term is not the proposer's last one, so entry
			 * idx + 1 exists in the proposer's history; on the safekeeper
			 * side it may not, hence the flushLsn fallback.
			 */
			XLogRecPtr	propEndLsn = wp->propTermHistory.entries[idx + 1].lsn;
			XLogRecPtr	skEndLsn = (idx + 1 < th->n_entries ? th->entries[idx + 1].lsn : sk->voteResponse.flushLsn);

			sk->startStreamingAt = Min(propEndLsn, skEndLsn);
		}
	}

	Assert(sk->startStreamingAt <= wp->availableLsn);

	/* Fill the ProposerElected ('e') message. */
	msg.apm.tag = 'e';
	msg.generation = wp->mconf.generation;
	msg.term = wp->propTerm;
	msg.startStreamingAt = sk->startStreamingAt;
	msg.termHistory = &wp->propTermHistory;

	/* lastCommonTerm is used for logging only; 0 means no common term */
	lastCommonTerm = idx >= 0 ? wp->propTermHistory.entries[idx].term : 0;
	wp_log(LOG, "sending elected msg to node " UINT64_FORMAT " generation=%u term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s", sk->greetResponse.nodeId, msg.generation, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port);

	PAMessageSerialize(wp, (ProposerAcceptorMessage *) &msg, &sk->outbuf, wp->config->proto_version);

	/*
	 * If AsyncWrite returns false we don't start streaming here.
	 * NOTE(review): presumably streaming is then started elsewhere once the
	 * SS_SEND_ELECTED_FLUSH state completes -- confirm against AsyncWrite
	 * and its callers.
	 */
	if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH))
		return;

	StartStreaming(sk);
}

/*
 * Start streaming to safekeeper sk, always updates state to SS_ACTIVE and sets
 * correct event set.
 */
static void
StartStreaming(Safekeeper *sk)
{
	/*
	 * This is the only entrypoint to state SS_ACTIVE. It's executed exactly
	 * once for a connection.
	 */
	sk->state = SS_ACTIVE;
	sk->active_state = SS_ACTIVE_SEND;
	/* streaming resumes from the point negotiated in SendProposerElected */
	sk->streamingAt = sk->startStreamingAt;

	/* BEGIN_HADRON */
	sk->wp->api.update_safekeeper_status_for_metrics(sk->wp, sk->index, 1);
	/* END_HADRON */

	/*
	 * Donors can only be in SS_ACTIVE state, so we potentially update the
	 * donor when we switch one to SS_ACTIVE.
	 */
	UpdateDonorShmem(sk->wp);

	/* event set will be updated inside SendMessageToNode */
	SendMessageToNode(sk);
}

/*
 * Try to send message to the particular node. Always updates event set. Will
 * send at least one message, if socket is ready.
 *
 * Can be used only for safekeepers in SS_ACTIVE state. State can be changed
 * in case of errors.
*/ static void SendMessageToNode(Safekeeper *sk) { Assert(sk->state == SS_ACTIVE); /* * Note: we always send everything to the safekeeper until WOULDBLOCK or * nothing left to send */ HandleActiveState(sk, WL_SOCKET_WRITEABLE); } /* * Broadcast new message to all caught-up safekeepers */ static void BroadcastAppendRequest(WalProposer *wp) { for (int i = 0; i < wp->n_safekeepers; i++) if (wp->safekeeper[i].state == SS_ACTIVE) SendMessageToNode(&wp->safekeeper[i]); } static void PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) { Assert(endLsn >= beginLsn); req->apm.tag = 'a'; req->generation = wp->mconf.generation; req->term = wp->propTerm; req->beginLsn = beginLsn; req->endLsn = endLsn; req->commitLsn = wp->commitLsn; req->truncateLsn = wp->truncateLsn; } /* * Process all events happened in SS_ACTIVE state, update event set after that. */ static void HandleActiveState(Safekeeper *sk, uint32 events) { WalProposer *wp = sk->wp; /* * Note: we don't known which socket awoke us (sk or nwr). However, as * SendAppendRequests always tries to send at least one msg in * SS_ACTIVE_SEND be careful not to go there if are only after sk * response, otherwise it'd create busy loop of pings. */ if (events & WL_SOCKET_WRITEABLE || sk->active_state == SS_ACTIVE_READ_WAL) if (!SendAppendRequests(sk)) return; if (events & WL_SOCKET_READABLE) if (!RecvAppendResponses(sk)) return; #if PG_VERSION_NUM >= 150000 /* expected never to happen, c.f. 
walprop_pg_active_state_update_event_set */ if (events & WL_SOCKET_CLOSED) { wp_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket", sk->host, sk->port); ShutdownConnection(sk); return; } #endif /* configures event set for yield whatever is the substate */ wp->api.active_state_update_event_set(sk); } /* * Send WAL messages starting from sk->streamingAt until the end or non-writable * socket or neon_walreader blocks, whichever comes first; active_state is * updated accordingly. Caller should take care of updating event set. Even if * no unsent WAL is available, at least one empty message will be sent as a * heartbeat, if socket is ready. * * Resets state and kills the connections if any error on them is encountered. * Returns false in this case, true otherwise. */ static bool SendAppendRequests(Safekeeper *sk) { WalProposer *wp = sk->wp; XLogRecPtr endLsn; PGAsyncWriteResult writeResult; bool sentAnything = false; AppendRequestHeader *req; if (sk->active_state == SS_ACTIVE_FLUSH) { if (!AsyncFlush(sk)) /* * AsyncFlush failed, that could happen if the socket is closed or * we have nothing to write and should wait for writeable socket. 
*/ return sk->state == SS_ACTIVE; /* Event set will be updated in the end of HandleActiveState */ sk->active_state = SS_ACTIVE_SEND; } while (sk->streamingAt != wp->availableLsn || !sentAnything) { if (sk->active_state == SS_ACTIVE_SEND) { sentAnything = true; endLsn = sk->streamingAt; endLsn += MAX_SEND_SIZE; /* if we went beyond available WAL, back off */ if (endLsn > wp->availableLsn) { endLsn = wp->availableLsn; } req = &sk->appendRequest; PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn); wp_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", req->endLsn - req->beginLsn, LSN_FORMAT_ARGS(req->beginLsn), LSN_FORMAT_ARGS(req->endLsn), LSN_FORMAT_ARGS(req->commitLsn), LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port); resetStringInfo(&sk->outbuf); /* write AppendRequest header */ PAMessageSerialize(wp, (ProposerAcceptorMessage *) req, &sk->outbuf, wp->config->proto_version); /* prepare for reading WAL into the outbuf */ enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); sk->active_state = SS_ACTIVE_READ_WAL; } if (sk->active_state == SS_ACTIVE_READ_WAL) { char *errmsg; int req_len; req = &sk->appendRequest; req_len = req->endLsn - req->beginLsn; /* * We send zero sized AppenRequests as heartbeats; don't wal_read * for these. 
*/ if (req_len > 0) { switch (wp->api.wal_read(sk, &sk->outbuf.data[sk->outbuf.len], req->beginLsn, req_len, &errmsg)) { case NEON_WALREAD_SUCCESS: break; case NEON_WALREAD_WOULDBLOCK: return true; case NEON_WALREAD_ERROR: wp_log(WARNING, "WAL reading for node %s:%s failed: %s", sk->host, sk->port, errmsg); ShutdownConnection(sk); return false; default: Assert(false); } } sk->outbuf.len += req_len; writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len); /* Mark current message as sent, whatever the result is */ sk->streamingAt = req->endLsn; switch (writeResult) { case PG_ASYNC_WRITE_SUCCESS: /* Continue writing the next message */ sk->active_state = SS_ACTIVE_SEND; break; case PG_ASYNC_WRITE_TRY_FLUSH: /* * We still need to call PQflush some more to finish the * job. Caller function will handle this by setting right * event set. */ sk->active_state = SS_ACTIVE_FLUSH; return true; case PG_ASYNC_WRITE_FAIL: wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", sk->host, sk->port, FormatSafekeeperState(sk), wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; default: Assert(false); return false; } } } return true; } /* * Receive and process all available feedback. * * Resets state and kills the connection if any error on it is encountered. * Returns false in this case, true otherwise. * * NB: This function can call SendMessageToNode and produce new messages. */ static bool RecvAppendResponses(Safekeeper *sk) { WalProposer *wp = sk->wp; bool readAnything = false; while (true) { /* * If our reading doesn't immediately succeed, any necessary error * handling or state setting is taken care of. We can leave any other * work until later. 
*/ sk->appendResponse.apm.tag = 'a'; if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) break; wp_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", sk->appendResponse.term, LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), sk->host, sk->port); readAnything = true; /* should never happen: sk is expected to send ERROR instead */ if (sk->appendResponse.generation != wp->mconf.generation) { wp_log(FATAL, "safekeeper {id = %lu, ep = %s:%s} sent response with generation %u, expected %u", sk->greetResponse.nodeId, sk->host, sk->port, sk->appendResponse.generation, wp->mconf.generation); } if (sk->appendResponse.term > wp->propTerm) { /* * * Term has changed to higher one, probably another compute is * running. If this is the case we could PANIC as well because * likely it inserted some data and our basebackup is unsuitable * anymore. However, we also bump term manually (term_bump * endpoint) on safekeepers for migration purposes, in this case * we do want compute to stay alive. So restart walproposer with * FATAL instead of panicking; if basebackup is spoiled next * election will notice this. */ wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us", sk->host, sk->port, sk->appendResponse.term, wp->propTerm); } HandleSafekeeperResponse(wp, sk); } if (!readAnything) return sk->state == SS_ACTIVE; return sk->state == SS_ACTIVE; } #define psfeedback_log(fmt, key, ...) 
\ wp_log(DEBUG2, "ParsePageserverFeedbackMessage: %s " fmt, key, __VA_ARGS__) /* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */ static void ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *ps_feedback) { uint8 nkeys; int i; /* initialize the struct before parsing */ memset(ps_feedback, 0, sizeof(PageserverFeedback)); ps_feedback->present = true; /* get number of custom keys */ nkeys = pq_getmsgbyte(reply_message); for (i = 0; i < nkeys; i++) { const char *key = pq_getmsgrawstring(reply_message); unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32)); if (strcmp(key, "current_timeline_size") == 0) { Assert(value_len == sizeof(int64)); ps_feedback->currentClusterSize = pq_getmsgint64(reply_message); psfeedback_log(UINT64_FORMAT, key, ps_feedback->currentClusterSize); } else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0)) { Assert(value_len == sizeof(int64)); ps_feedback->last_received_lsn = pq_getmsgint64(reply_message); psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->last_received_lsn)); } else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0)) { Assert(value_len == sizeof(int64)); ps_feedback->disk_consistent_lsn = pq_getmsgint64(reply_message); psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->disk_consistent_lsn)); } else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0)) { Assert(value_len == sizeof(int64)); ps_feedback->remote_consistent_lsn = pq_getmsgint64(reply_message); psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->remote_consistent_lsn)); } else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0)) { Assert(value_len == sizeof(int64)); ps_feedback->replytime = pq_getmsgint64(reply_message); psfeedback_log("%s", key, timestamptz_to_str(ps_feedback->replytime)); } else if (strcmp(key, "shard_number") == 0) { 
Assert(value_len == sizeof(uint32)); ps_feedback->shard_number = pq_getmsgint(reply_message, sizeof(uint32)); psfeedback_log("%u", key, ps_feedback->shard_number); } else if (strcmp(key, "corruption_detected") == 0) { Assert(value_len == 1); ps_feedback->corruption_detected = pq_getmsgbyte(reply_message) != 0; psfeedback_log("%s", key, ps_feedback->corruption_detected ? "true" : "false"); } else { /* * Skip unknown keys to support backward compatibile protocol * changes */ wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, value_len); pq_getmsgbytes(reply_message, value_len); }; } } /* * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the * last WAL record that can be safely discarded. */ static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp) { XLogRecPtr lsn = wp->n_safekeepers > 0 ? wp->safekeeper[0].appendResponse.flushLsn : InvalidXLogRecPtr; for (int i = 1; i < wp->n_safekeepers; i++) { lsn = Min(lsn, wp->safekeeper[i].appendResponse.flushLsn); } return lsn; } /* * GetAcknowledgedByQuorumWALPosition for a single member set `mset`. * * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers * or new_members_safekeepers. */ static XLogRecPtr GetCommittedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk) { XLogRecPtr responses[MAX_SAFEKEEPERS]; /* * Ascending sort acknowledged LSNs. */ Assert(mset->len <= MAX_SAFEKEEPERS); for (uint32 i = 0; i < mset->len; i++) { Safekeeper *sk = msk[i]; /* * Like in Raft, we aren't allowed to commit entries from previous * terms, so ignore reported LSN until it gets to propTermStartLsn. * * Note: we ignore sk state, which is ok: before first ack flushLsn is * 0, and later we just preserve value across reconnections. It would * be ok to check for SS_ACTIVE as well. 
*/ if (sk != NULL && sk->appendResponse.flushLsn >= wp->propTermStartLsn) { responses[i] = sk->appendResponse.flushLsn; } else { responses[i] = 0; } } qsort(responses, mset->len, sizeof(XLogRecPtr), CompareLsn); /* * And get value committed by the quorum. A way to view this: to get the * highest value committed on the quorum, in the ordered array we skip n - * n_quorum elements to get to the first (lowest) value present on all sks * of the highest quorum. */ return responses[mset->len - MsetQuorum(mset)]; } /* * Calculate WAL position acknowledged by quorum, i.e. which may be regarded * committed. * * Zero may be returned when there is no quorum of nodes recovered to term start * lsn which sent feedback yet. */ static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp) { XLogRecPtr committed; /* legacy: generations disabled */ if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION) { XLogRecPtr responses[MAX_SAFEKEEPERS]; /* * Sort acknowledged LSNs */ for (int i = 0; i < wp->n_safekeepers; i++) { /* * Like in Raft, we aren't allowed to commit entries from previous * terms, so ignore reported LSN until it gets to * propTermStartLsn. * * Note: we ignore sk state, which is ok: before first ack * flushLsn is 0, and later we just preserve value across * reconnections. It would be ok to check for SS_ACTIVE as well. */ responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? 
wp->safekeeper[i].appendResponse.flushLsn : 0; } qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn); /* * Get the smallest LSN committed by quorum */ return responses[wp->n_safekeepers - wp->quorum]; } committed = GetCommittedMset(wp, &wp->mconf.members, wp->members_safekeepers); if (wp->mconf.new_members.len > 0) { XLogRecPtr new_mset_committed = GetCommittedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers); committed = Min(committed, new_mset_committed); } return committed; } /* * Return safekeeper with active connection from which WAL can be downloaded, or * none if it doesn't exist. donor_lsn is set to end position of the donor to * the best of our knowledge. */ static void UpdateDonorShmem(WalProposer *wp) { Safekeeper *donor = NULL; int i; XLogRecPtr donor_lsn = InvalidXLogRecPtr; if (wp->state < WPS_ELECTED) { wp_log(WARNING, "UpdateDonorShmem called before elections are won"); return; } /* * First, consider node which had determined our term start LSN as we know * about its position immediately after election before any feedbacks are * sent. */ if (wp->donor->state >= SS_WAIT_ELECTED) { donor = wp->donor; donor_lsn = wp->propTermStartLsn; } /* * But also check feedbacks from all nodes with live connections and take * the highest one. Note: if node sends feedbacks it already processed * elected message so its term is fine. */ for (i = 0; i < wp->n_safekeepers; i++) { Safekeeper *sk = &wp->safekeeper[i]; if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > donor_lsn) { donor = sk; donor_lsn = sk->appendResponse.flushLsn; } } if (donor == NULL) { wp_log(WARNING, "UpdateDonorShmem didn't find a suitable donor, skipping"); return; } wp->api.update_donor(wp, donor, donor_lsn); } /* * Process AppendResponse message from safekeeper. 
*/ static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk) { XLogRecPtr candidateTruncateLsn; XLogRecPtr newCommitLsn; newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp); if (newCommitLsn > wp->commitLsn) { wp->commitLsn = newCommitLsn; /* Send new value to all safekeepers. */ BroadcastAppendRequest(wp); } /* * Unlock syncrep waiters, update ps_feedback, CheckGracefulShutdown(). * The last one will terminate the process if the shutdown is requested * and WAL is committed by the quorum. BroadcastAppendRequest() should be * called to notify safekeepers about the new commitLsn. */ wp->api.process_safekeeper_feedback(wp, fromsk); /* * Try to advance truncateLsn -- the last record flushed to all * safekeepers. * * Advanced truncateLsn should be not higher than commitLsn. This prevents * surprising violation of truncateLsn <= commitLsn invariant which might * occur because commitLsn generally can't be advanced based on feedback * from safekeeper who is still in the previous epoch (similar to 'leader * can't commit entries from previous term' in Raft); 2) */ candidateTruncateLsn = CalculateMinFlushLsn(wp); candidateTruncateLsn = Min(candidateTruncateLsn, wp->commitLsn); if (candidateTruncateLsn > wp->truncateLsn) { wp->truncateLsn = candidateTruncateLsn; } /* * Generally sync is done when majority reached propTermStartLsn so we * committed it and made the majority aware of it, ensuring they are ready * to give all WAL to pageserver. It would mean whichever majority is * alive, there will be at least one safekeeper who is able to stream WAL * to pageserver to make basebackup possible. However, since at the moment * we don't have any good mechanism of defining the healthy and most * advanced safekeeper who should push the wal into pageserver and * basically the random one gets connected, to prevent hanging basebackup * (due to pageserver connecting to not-synced-safekeeper) we currently * wait for all seemingly alive safekeepers to get synced. 
*/ if (wp->config->syncSafekeepers) { for (int i = 0; i < wp->n_safekeepers; i++) { Safekeeper *sk = &wp->safekeeper[i]; bool synced = sk->appendResponse.commitLsn >= wp->propTermStartLsn; /* alive safekeeper which is not synced yet; wait for it */ if (sk->state != SS_OFFLINE && !synced) return; } if (newCommitLsn >= wp->propTermStartLsn) { /* A quorum of safekeepers has been synced! */ /* * Send empty message to broadcast latest truncateLsn to all * safekeepers. This helps to finish next sync-safekeepers * eailier, by skipping recovery step. * * We don't need to wait for response because it doesn't affect * correctness, and TCP should be able to deliver the message to * safekeepers in case of network working properly. */ BroadcastAppendRequest(wp); wp->api.finish_sync_safekeepers(wp, wp->propTermStartLsn); /* unreachable */ } } } /* Serialize MembershipConfiguration into buf. */ static void MembershipConfigurationSerialize(MembershipConfiguration *mconf, StringInfo buf) { uint32 i; pq_sendint32(buf, mconf->generation); pq_sendint32(buf, mconf->members.len); for (i = 0; i < mconf->members.len; i++) { pq_sendint64(buf, mconf->members.m[i].node_id); pq_send_ascii_string(buf, mconf->members.m[i].host); pq_sendint16(buf, mconf->members.m[i].port); } /* * There is no special mark for absent new_members; zero members in * invalid, so zero len means absent. 
*/ pq_sendint32(buf, mconf->new_members.len); for (i = 0; i < mconf->new_members.len; i++) { pq_sendint64(buf, mconf->new_members.m[i].node_id); pq_send_ascii_string(buf, mconf->new_members.m[i].host); pq_sendint16(buf, mconf->new_members.m[i].port); } } /* Serialize proposer -> acceptor message into buf using specified version */ static void PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version) { /* both version are supported currently until we fully migrate to 3 */ Assert(proto_version == 3 || proto_version == 2); resetStringInfo(buf); if (proto_version == 3) { /* * v2 sends structs for some messages as is, so commonly send tag only * for v3 */ pq_sendint8(buf, msg->tag); switch (msg->tag) { case 'g': { ProposerGreeting *m = (ProposerGreeting *) msg; pq_send_ascii_string(buf, m->tenant_id); pq_send_ascii_string(buf, m->timeline_id); MembershipConfigurationSerialize(&m->mconf, buf); pq_sendint32(buf, m->pg_version); pq_sendint64(buf, m->system_id); pq_sendint32(buf, m->wal_seg_size); break; } case 'v': { VoteRequest *m = (VoteRequest *) msg; pq_sendint32(buf, m->generation); pq_sendint64(buf, m->term); break; } case 'e': { ProposerElected *m = (ProposerElected *) msg; pq_sendint32(buf, m->generation); pq_sendint64(buf, m->term); pq_sendint64(buf, m->startStreamingAt); pq_sendint32(buf, m->termHistory->n_entries); for (uint32 i = 0; i < m->termHistory->n_entries; i++) { pq_sendint64(buf, m->termHistory->entries[i].term); pq_sendint64(buf, m->termHistory->entries[i].lsn); } break; } case 'a': { /* * Note: this serializes only AppendRequestHeader, caller * is expected to append WAL data later. 
*/ AppendRequestHeader *m = (AppendRequestHeader *) msg; pq_sendint32(buf, m->generation); pq_sendint64(buf, m->term); pq_sendint64(buf, m->beginLsn); pq_sendint64(buf, m->endLsn); pq_sendint64(buf, m->commitLsn); pq_sendint64(buf, m->truncateLsn); break; } default: wp_log(FATAL, "unexpected message type %c to serialize", msg->tag); } return; } if (proto_version == 2) { switch (msg->tag) { case 'g': { /* v2 sent struct as is */ ProposerGreeting *m = (ProposerGreeting *) msg; ProposerGreetingV2 greetRequestV2; /* Fill also v2 struct. */ greetRequestV2.tag = 'g'; greetRequestV2.protocolVersion = proto_version; greetRequestV2.pgVersion = m->pg_version; /* * v3 removed this field because it's easier to pass as * libq or START_WAL_PUSH options */ memset(&greetRequestV2.proposerId, 0, sizeof(greetRequestV2.proposerId)); greetRequestV2.systemId = wp->config->systemId; if (*m->timeline_id != '\0' && !HexDecodeString(greetRequestV2.timeline_id, m->timeline_id, 16)) wp_log(FATAL, "could not parse neon.timeline_id, %s", m->timeline_id); if (*m->tenant_id != '\0' && !HexDecodeString(greetRequestV2.tenant_id, m->tenant_id, 16)) wp_log(FATAL, "could not parse neon.tenant_id, %s", m->tenant_id); greetRequestV2.timeline = wp->config->pgTimeline; greetRequestV2.walSegSize = wp->config->wal_segment_size; pq_sendbytes(buf, (char *) &greetRequestV2, sizeof(greetRequestV2)); break; } case 'v': { /* v2 sent struct as is */ VoteRequest *m = (VoteRequest *) msg; VoteRequestV2 voteRequestV2; voteRequestV2.tag = m->pam.tag; voteRequestV2.term = m->term; /* removed field */ memset(&voteRequestV2.proposerId, 0, sizeof(voteRequestV2.proposerId)); pq_sendbytes(buf, (char *) &voteRequestV2, sizeof(voteRequestV2)); break; } case 'e': { ProposerElected *m = (ProposerElected *) msg; pq_sendint64_le(buf, m->apm.tag); pq_sendint64_le(buf, m->term); pq_sendint64_le(buf, m->startStreamingAt); pq_sendint32_le(buf, m->termHistory->n_entries); for (int i = 0; i < m->termHistory->n_entries; i++) { 
pq_sendint64_le(buf, m->termHistory->entries[i].term); pq_sendint64_le(buf, m->termHistory->entries[i].lsn); } /* * Removed timeline_start_lsn. Still send it as a valid * value until safekeepers taking it from term history are * deployed. */ pq_sendint64_le(buf, m->termHistory->entries[0].lsn); break; } case 'a': /* * Note: this serializes only AppendRequestHeader, caller is * expected to append WAL data later. */ { /* v2 sent struct as is */ AppendRequestHeader *m = (AppendRequestHeader *) msg; AppendRequestHeaderV2 appendRequestHeaderV2; appendRequestHeaderV2.tag = m->apm.tag; appendRequestHeaderV2.term = m->term; appendRequestHeaderV2.epochStartLsn = 0; /* removed field */ appendRequestHeaderV2.beginLsn = m->beginLsn; appendRequestHeaderV2.endLsn = m->endLsn; appendRequestHeaderV2.commitLsn = m->commitLsn; appendRequestHeaderV2.truncateLsn = m->truncateLsn; /* removed field */ memset(&appendRequestHeaderV2.proposerId, 0, sizeof(appendRequestHeaderV2.proposerId)); pq_sendbytes(buf, (char *) &appendRequestHeaderV2, sizeof(appendRequestHeaderV2)); break; } default: wp_log(FATAL, "unexpected message type %c to serialize", msg->tag); } return; } wp_log(FATAL, "unexpected proto_version %d", proto_version); } /* * Try to read CopyData message from i'th safekeeper, resetting connection on * failure. */ static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size) { WalProposer *wp = sk->wp; switch (wp->api.conn_async_read(sk, buf, buf_size)) { case PG_ASYNC_READ_SUCCESS: return true; case PG_ASYNC_READ_TRY_AGAIN: /* WL_SOCKET_READABLE is always set during copyboth */ return false; case PG_ASYNC_READ_FAIL: wp_log(WARNING, "failed to read from node %s:%s in %s state: %s", sk->host, sk->port, FormatSafekeeperState(sk), wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; } Assert(false); return false; } /* Deserialize membership configuration from buf to mconf. 
*/
static void
MembershipConfigurationDeserialize(MembershipConfiguration *mconf, StringInfo buf)
{
    uint32      idx;

    /* Wire order: generation, then the member set, then the new-member set. */
    mconf->generation = pq_getmsgint32(buf);

    /* Current members: a 32-bit count followed by that many records. */
    mconf->members.len = pq_getmsgint32(buf);
    mconf->members.m = palloc0(sizeof(SafekeeperId) * mconf->members.len);
    for (idx = 0; idx < mconf->members.len; idx++)
    {
        SafekeeperId *member = &mconf->members.m[idx];
        const char *wire_host;

        member->node_id = pq_getmsgint64(buf);
        wire_host = pq_getmsgrawstring(buf);
        /* host is a fixed-size array, so copy with truncation */
        strlcpy(member->host, wire_host, sizeof(member->host));
        member->port = pq_getmsgint16(buf);
    }

    /* New members: same layout as the current member set. */
    mconf->new_members.len = pq_getmsgint32(buf);
    mconf->new_members.m = palloc0(sizeof(SafekeeperId) * mconf->new_members.len);
    for (idx = 0; idx < mconf->new_members.len; idx++)
    {
        SafekeeperId *member = &mconf->new_members.m[idx];
        const char *wire_host;

        member->node_id = pq_getmsgint64(buf);
        wire_host = pq_getmsgrawstring(buf);
        strlcpy(member->host, wire_host, sizeof(member->host));
        member->port = pq_getmsgint16(buf);
    }
}

/*
 * Read next message with known type into provided struct, by reading a CopyData
 * block from the safekeeper's postgres connection, returning whether the read
 * was successful.
 *
 * If the read needs more polling, we return 'false' and keep the state
 * unmodified, waiting until it becomes read-ready to try again. If it fully
 * failed, a warning is emitted and the connection is reset.
 *
 * Note: it pallocs if needed, i.e. for AcceptorGreeting and VoteResponse fields.
*/
static bool
AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
{
    WalProposer *wp = sk->wp;

    char       *buf;
    int         buf_size;
    uint8       tag;
    StringInfoData s;

    /* AsyncRead returns false both for "try again" and for a failed read. */
    if (!(AsyncRead(sk, &buf, &buf_size)))
        return false;

    /* Remember when we last heard from this safekeeper. */
    sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);

    /*
     * Parse it: wrap the received bytes in a StringInfo so the pq_getmsg*
     * readers can walk it from offset 0.
     */
    s.data = buf;
    s.len = buf_size;
    s.maxlen = buf_size;
    s.cursor = 0;

    if (wp->config->proto_version == 3)
    {
        /* v3: one-byte tag; fields are read with the plain pq_getmsg* readers */
        tag = pq_getmsgbyte(&s);
        /* The caller states which message it expects through anymsg->tag. */
        if (tag != anymsg->tag)
        {
            wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
                   sk->port, FormatSafekeeperState(sk));
            ResetConnection(sk);
            return false;
        }
        switch (tag)
        {
            case 'g':
                {
                    /* AcceptorGreeting: node id, membership conf, term */
                    AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;

                    msg->nodeId = pq_getmsgint64(&s);
                    /* allocates the member arrays */
                    MembershipConfigurationDeserialize(&msg->mconf, &s);
                    msg->term = pq_getmsgint64(&s);
                    pq_getmsgend(&s);
                    return true;
                }
            case 'v':
                {
                    /* VoteResponse, including the term history array */
                    VoteResponse *msg = (VoteResponse *) anymsg;

                    msg->generation = pq_getmsgint32(&s);
                    msg->term = pq_getmsgint64(&s);
                    msg->voteGiven = pq_getmsgbyte(&s);
                    msg->flushLsn = pq_getmsgint64(&s);
                    msg->truncateLsn = pq_getmsgint64(&s);
                    msg->termHistory.n_entries = pq_getmsgint32(&s);
                    /* entries array is sized by the count just read */
                    msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
                    for (uint32 i = 0; i < msg->termHistory.n_entries; i++)
                    {
                        msg->termHistory.entries[i].term = pq_getmsgint64(&s);
                        msg->termHistory.entries[i].lsn = pq_getmsgint64(&s);
                    }
                    pq_getmsgend(&s);
                    return true;
                }
            case 'a':
                {
                    /* AppendResponse: fixed part, then optional feedback */
                    AppendResponse *msg = (AppendResponse *) anymsg;

                    msg->generation = pq_getmsgint32(&s);
                    msg->term = pq_getmsgint64(&s);
                    msg->flushLsn = pq_getmsgint64(&s);
                    msg->commitLsn = pq_getmsgint64(&s);
                    msg->hs.ts = pq_getmsgint64(&s);
                    msg->hs.xmin.value = pq_getmsgint64(&s);
                    msg->hs.catalog_xmin.value = pq_getmsgint64(&s);
                    /* Any bytes left over are the pageserver feedback. */
                    if (s.len > s.cursor)
                        ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
                    else
                        msg->ps_feedback.present = false;
                    pq_getmsgend(&s);
                    return true;
                }
            default:
                {
                    wp_log(FATAL, "unexpected message tag %c to read", (char) tag);
                    return false;
                }
        }
    }
    else if (wp->config->proto_version == 2)
    {
        /*
         * v2: fields are read with the _le readers and the tag occupies a
         * 64-bit slot (narrowed to uint8 on assignment). No generation field
         * is read in this version.
         */
        tag = pq_getmsgint64_le(&s);
        if (tag != anymsg->tag)
        {
            wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
                   sk->port, FormatSafekeeperState(sk));
            ResetConnection(sk);
            return false;
        }
        switch (tag)
        {
            case 'g':
                {
                    /* v2 greeting: term first, then node id; no mconf */
                    AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;

                    msg->term = pq_getmsgint64_le(&s);
                    msg->nodeId = pq_getmsgint64_le(&s);
                    pq_getmsgend(&s);
                    return true;
                }
            case 'v':
                {
                    VoteResponse *msg = (VoteResponse *) anymsg;

                    msg->term = pq_getmsgint64_le(&s);
                    /* voteGiven is read from a 64-bit slot in v2 */
                    msg->voteGiven = pq_getmsgint64_le(&s);
                    msg->flushLsn = pq_getmsgint64_le(&s);
                    msg->truncateLsn = pq_getmsgint64_le(&s);
                    msg->termHistory.n_entries = pq_getmsgint32_le(&s);
                    msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
                    for (int i = 0; i < msg->termHistory.n_entries; i++)
                    {
                        msg->termHistory.entries[i].term = pq_getmsgint64_le(&s);
                        msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s);
                    }
                    /* timelineStartLsn: read to advance the cursor, value discarded */
                    pq_getmsgint64_le(&s);
                    pq_getmsgend(&s);
                    return true;
                }
            case 'a':
                {
                    AppendResponse *msg = (AppendResponse *) anymsg;

                    msg->term = pq_getmsgint64_le(&s);
                    msg->flushLsn = pq_getmsgint64_le(&s);
                    msg->commitLsn = pq_getmsgint64_le(&s);
                    msg->hs.ts = pq_getmsgint64_le(&s);
                    msg->hs.xmin.value = pq_getmsgint64_le(&s);
                    msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
                    /* Any bytes left over are the pageserver feedback. */
                    if (s.len > s.cursor)
                        ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
                    else
                        msg->ps_feedback.present = false;
                    pq_getmsgend(&s);
                    return true;
                }
            default:
                {
                    wp_log(FATAL, "unexpected message tag %c to read", (char) tag);
                    return false;
                }
        }
    }
    /* Only protocol versions 2 and 3 are handled above. */
    wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version);
    return false;               /* keep the compiler quiet */
}

/*
 * Blocking equivalent to AsyncWrite.
 *
 * We use this everywhere messages are small enough that they should fit in a
 * single packet.
*/ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state) { WalProposer *wp = sk->wp; uint32 sk_events; uint32 nwr_events; if (!wp->api.conn_blocking_write(sk, msg, msg_size)) { wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", sk->host, sk->port, FormatSafekeeperState(sk), wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; } sk->state = success_state; /* * If the new state will be waiting for events to happen, update the event * set to wait for those */ SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); /* * nwr_events is relevant only during SS_ACTIVE which doesn't use * BlockingWrite */ Assert(!nwr_events); if (sk_events) wp->api.update_event_set(sk, sk_events); return true; } /* * Starts a write into the 'i'th safekeeper's postgres connection, moving to * flush_state (adjusting eventset) if write still needs flushing. * * Returns false if sending is unfinished (requires flushing or conn failed). * Upon failure, a warning is emitted and the connection is reset. */ static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state) { WalProposer *wp = sk->wp; switch (wp->api.conn_async_write(sk, msg, msg_size)) { case PG_ASYNC_WRITE_SUCCESS: return true; case PG_ASYNC_WRITE_TRY_FLUSH: /* * We still need to call PQflush some more to finish the job; go * to the appropriate state. Update the event set at the bottom of * this function */ sk->state = flush_state; wp->api.update_event_set(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); return false; case PG_ASYNC_WRITE_FAIL: wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", sk->host, sk->port, FormatSafekeeperState(sk), wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; default: Assert(false); return false; } } /* * Flushes a previous call to AsyncWrite. This only needs to be called when the * socket becomes read or write ready *after* calling AsyncWrite. 
* * If flushing successfully completes returns true, otherwise false. Event set * is updated only if connection fails, otherwise caller should manually unset * WL_SOCKET_WRITEABLE. */ static bool AsyncFlush(Safekeeper *sk) { WalProposer *wp = sk->wp; /*--- * PQflush returns: * 0 if successful [we're good to move on] * 1 if unable to send everything yet [call PQflush again] * -1 if it failed [emit an error] */ switch (wp->api.conn_flush(sk)) { case 0: /* flush is done */ return true; case 1: /* Nothing to do; try again when the socket's ready */ return false; case -1: wp_log(WARNING, "failed to flush write to node %s:%s in %s state: %s", sk->host, sk->port, FormatSafekeeperState(sk), wp->api.conn_error_message(sk)); ResetConnection(sk); return false; default: Assert(false); return false; } } static int CompareLsn(const void *a, const void *b) { XLogRecPtr lsn1 = *((const XLogRecPtr *) a); XLogRecPtr lsn2 = *((const XLogRecPtr *) b); if (lsn1 < lsn2) return -1; else if (lsn1 == lsn2) return 0; else return 1; } /* Returns a human-readable string corresonding to the SafekeeperState * * The string should not be freed. 
* * The strings are intended to be used as a prefix to "state", e.g.: * * wp_log(LOG, "currently in %s state", FormatSafekeeperState(sk)); * * If this sort of phrasing doesn't fit the message, instead use something like: * * wp_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk)); */ static char * FormatSafekeeperState(Safekeeper *sk) { char *return_val = NULL; switch (sk->state) { case SS_OFFLINE: return_val = "offline"; break; case SS_CONNECTING_READ: case SS_CONNECTING_WRITE: return_val = "connecting"; break; case SS_WAIT_EXEC_RESULT: return_val = "receiving query result"; break; case SS_HANDSHAKE_RECV: return_val = "handshake (receiving)"; break; case SS_WAIT_VOTING: return_val = "voting"; break; case SS_WAIT_VERDICT: return_val = "wait-for-verdict"; break; case SS_SEND_ELECTED_FLUSH: return_val = "send-announcement-flush"; break; case SS_WAIT_ELECTED: return_val = "idle"; break; case SS_ACTIVE: switch (sk->active_state) { case SS_ACTIVE_SEND: return_val = "active send"; break; case SS_ACTIVE_READ_WAL: return_val = "active read WAL"; break; case SS_ACTIVE_FLUSH: return_val = "active flush"; break; } break; } Assert(return_val != NULL); return return_val; } /* Asserts that the provided events are expected for given safekeeper's state */ static void AssertEventsOkForState(uint32 events, Safekeeper *sk) { uint32 sk_events; uint32 nwr_events; uint32 expected; bool events_ok_for_state; /* long name so the `Assert` is more * clear later */ WalProposer *wp = sk->wp; SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); /* * Without one more level of notify target indirection we have no way to * distinguish which socket woke up us, so just union expected events. */ expected = sk_events | nwr_events; events_ok_for_state = ((events & expected) != 0); if (!events_ok_for_state) { /* * To give a descriptive message in the case of failure, we use elog * and then an assertion that's guaranteed to fail. 
*/ wp_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk)); Assert(events_ok_for_state); } } /* Returns the set of events for both safekeeper (sk_events) and neon_walreader * (nwr_events) sockets a safekeeper in this state should be waiting on. * * This will return WL_NO_EVENTS (= 0) for some events. */ void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events) { WalProposer *wp = sk->wp; *nwr_events = 0; /* nwr_events is empty for most states */ /* If the state doesn't have a modifier, we can check the base state */ switch (sk->state) { /* Connecting states say what they want in the name */ case SS_CONNECTING_READ: *sk_events = WL_SOCKET_READABLE; return; case SS_CONNECTING_WRITE: *sk_events = WL_SOCKET_WRITEABLE; return; /* Reading states need the socket to be read-ready to continue */ case SS_WAIT_EXEC_RESULT: case SS_HANDSHAKE_RECV: case SS_WAIT_VERDICT: *sk_events = WL_SOCKET_READABLE; return; /* * Idle states use read-readiness as a sign that the connection * has been disconnected. */ case SS_WAIT_VOTING: case SS_WAIT_ELECTED: *sk_events = WL_SOCKET_READABLE; return; case SS_SEND_ELECTED_FLUSH: *sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; return; case SS_ACTIVE: switch (sk->active_state) { /* * Everything is sent; we just wait for sk responses and * latch. * * Note: this assumes we send all available WAL to * safekeeper in one wakeup (unless it blocks). Otherwise * we would want WL_SOCKET_WRITEABLE here to finish the * work. */ case SS_ACTIVE_SEND: *sk_events = WL_SOCKET_READABLE; /* c.f. walprop_pg_active_state_update_event_set */ #if PG_VERSION_NUM >= 150000 if (wp->api.wal_reader_events(sk)) *nwr_events = WL_SOCKET_CLOSED; #endif /* on PG 14 nwr_events remains 0 */ return; /* * Waiting for neon_walreader socket, but we still read * responses from sk socket. 
*/ case SS_ACTIVE_READ_WAL: *sk_events = WL_SOCKET_READABLE; *nwr_events = wp->api.wal_reader_events(sk); return; /* * Need to flush the sk socket, so ignore neon_walreader * one and set write interest on sk. */ case SS_ACTIVE_FLUSH: *sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; #if PG_VERSION_NUM >= 150000 /* c.f. walprop_pg_active_state_update_event_set */ if (wp->api.wal_reader_events(sk)) *nwr_events = WL_SOCKET_CLOSED; #endif /* on PG 14 nwr_events remains 0 */ return; } return; /* The offline state expects no events. */ case SS_OFFLINE: *sk_events = 0; return; default: Assert(false); } } /* Returns a human-readable string corresponding to the event set * * If the events do not correspond to something set as the `events` field of a `WaitEvent`, the * returned string may be meaingless. * * The string should not be freed. It should also not be expected to remain the same between * function calls. */ static char * FormatEvents(WalProposer *wp, uint32 events) { static char return_str[8]; /* Helper variable to check if there's extra bits */ uint32 all_flags = WL_LATCH_SET | WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE | WL_TIMEOUT | WL_POSTMASTER_DEATH | WL_EXIT_ON_PM_DEATH | WL_SOCKET_CONNECTED; /* * The formatting here isn't supposed to be *particularly* useful -- it's * just to give an sense of what events have been triggered without * needing to remember your powers of two. */ return_str[0] = (events & WL_LATCH_SET) ? 'L' : '_'; return_str[1] = (events & WL_SOCKET_READABLE) ? 'R' : '_'; return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_'; return_str[3] = (events & WL_TIMEOUT) ? 'T' : '_'; return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_'; return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_'; return_str[5] = (events & WL_SOCKET_CONNECTED) ? 
'C' : '_'; if (events & (~all_flags)) { wp_log(WARNING, "event formatting found unexpected component %d", events & (~all_flags)); return_str[6] = '*'; return_str[7] = '\0'; } else return_str[6] = '\0'; return (char *) &return_str; } /* Dump mconf as toml for observability / debugging. Result is palloc'ed. */ static char * MembershipConfigurationToString(MembershipConfiguration *mconf) { StringInfoData s; uint32 i; initStringInfo(&s); appendStringInfo(&s, "{gen = %u", mconf->generation); appendStringInfoString(&s, ", members = ["); for (i = 0; i < mconf->members.len; i++) { if (i > 0) appendStringInfoString(&s, ", "); appendStringInfo(&s, "{node_id = %lu", mconf->members.m[i].node_id); appendStringInfo(&s, ", host = %s", mconf->members.m[i].host); appendStringInfo(&s, ", port = %u }", mconf->members.m[i].port); } appendStringInfo(&s, "], new_members = ["); for (i = 0; i < mconf->new_members.len; i++) { if (i > 0) appendStringInfoString(&s, ", "); appendStringInfo(&s, "{node_id = %lu", mconf->new_members.m[i].node_id); appendStringInfo(&s, ", host = %s", mconf->new_members.m[i].host); appendStringInfo(&s, ", port = %u }", mconf->new_members.m[i].port); } appendStringInfoString(&s, "]}"); return s.data; } static void MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst) { dst->generation = src->generation; dst->members.len = src->members.len; dst->members.m = palloc0(sizeof(SafekeeperId) * dst->members.len); memcpy(dst->members.m, src->members.m, sizeof(SafekeeperId) * dst->members.len); dst->new_members.len = src->new_members.len; dst->new_members.m = palloc0(sizeof(SafekeeperId) * dst->new_members.len); memcpy(dst->new_members.m, src->new_members.m, sizeof(SafekeeperId) * dst->new_members.len); } static void MembershipConfigurationFree(MembershipConfiguration *mconf) { if (mconf->members.m) pfree(mconf->members.m); mconf->members.m = NULL; if (mconf->new_members.m) pfree(mconf->new_members.m); mconf->new_members.m = NULL; } 
================================================
FILE: pgxn/neon/walproposer.h
================================================
#ifndef __NEON_WALPROPOSER_H__
#define __NEON_WALPROPOSER_H__

#include "access/transam.h"
#include "access/xlogdefs.h"
#include "access/xlog_internal.h"
#include "nodes/replnodes.h"
#include "replication/walreceiver.h"
#include "utils/uuid.h"

#include "libpqwalproposer.h"
#include "neon_walreader.h"
#include "pagestore_client.h"

#define MAX_SAFEKEEPERS 32
#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16)    /* max size of a single WAL
                                             * message */
/*
 * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred,
 * because all WL_* events are given flags equal to some (1 << i), starting from i = 0
 */
#define WL_NO_EVENTS 0

struct WalProposerConn;         /* Defined in libpqwalproposer.h */
typedef struct WalProposerConn WalProposerConn;

/*
 * WAL safekeeper state, which is used to wait for some event.
 *
 * States are listed here in the order that they're executed.
 *
 * Most states, upon failure, will move back to SS_OFFLINE by calls to
 * ResetConnection or ShutdownConnection.
 */
typedef enum
{
    /*
     * Does not have an active connection and will stay that way until further
     * notice.
     *
     * Moves to SS_CONNECTING_WRITE by calls to ResetConnection.
     */
    SS_OFFLINE,

    /*
     * Connecting states. "_READ" waits for the socket to be available for
     * reading, "_WRITE" waits for writing. There's no difference in the code
     * they execute when polled, but we have this distinction in order to
     * recreate the event set in HackyRemoveWalProposerEvent.
     *
     * After the connection is made, "START_WAL_PUSH" query is sent.
     */
    SS_CONNECTING_WRITE,
    SS_CONNECTING_READ,

    /*
     * Waiting for the result of the "START_WAL_PUSH" command.
     *
     * After we get a successful result, sends handshake to safekeeper.
     */
    SS_WAIT_EXEC_RESULT,

    /*
     * Executing the receiving half of the handshake. After receiving, moves
     * to SS_WAIT_VOTING.
     */
    SS_HANDSHAKE_RECV,

    /*
     * Waiting to participate in voting, but a quorum hasn't yet been reached.
     * This is an idle state - we do not expect AdvancePollState to be called.
     *
     * Moved externally by execution of SS_HANDSHAKE_RECV, when we received a
     * quorum of handshakes.
     */
    SS_WAIT_VOTING,

    /*
     * Already sent voting information, waiting to receive confirmation from
     * the node. After receiving, moves to SS_WAIT_ELECTED, if the quorum
     * isn't reached yet.
     */
    SS_WAIT_VERDICT,

    /* Need to flush ProposerElected message. */
    SS_SEND_ELECTED_FLUSH,

    /*
     * Waiting for quorum to send WAL. Idle state. If the socket becomes
     * read-ready, the connection has been closed.
     *
     * Moves to SS_ACTIVE only by call to StartStreaming.
     */
    SS_WAIT_ELECTED,

    /*
     * Active phase, when we acquired quorum and have WAL to send or feedback
     * to read.
     */
    SS_ACTIVE,
} SafekeeperState;

/*
 * Sending WAL substates of SS_ACTIVE.
 */
typedef enum
{
    /*
     * We are ready to send more WAL, waiting for latch set to learn about
     * more WAL becoming available (or just a timeout to send heartbeat).
     */
    SS_ACTIVE_SEND,

    /*
     * Polling neon_walreader to receive chunk of WAL (probably remotely) to
     * send to this safekeeper.
     *
     * Note: socket management is done completely inside walproposer_pg for
     * simplicity, and thus simulation doesn't test it. Which is fine as
     * simulation is mainly aimed at consensus checks, not waiteventset
     * management.
     *
     * Also, while in this state we don't touch safekeeper socket, so in
     * theory it might close connection as inactive. This can be addressed if
     * needed; however, while fetching WAL we should regularly send it, so the
     * problem is unlikely. Vice versa is also true (SS_ACTIVE doesn't handle
     * walreader socket), but similarly shouldn't be a problem.
     */
    SS_ACTIVE_READ_WAL,

    /*
     * Waiting for write readiness to flush the socket.
     */
    SS_ACTIVE_FLUSH,
} SafekeeperActiveState;

/* Consensus logical timestamp.
*/ typedef uint64 term_t; /* neon storage node id */ typedef uint64 NNodeId; /* * Number uniquely identifying safekeeper membership configuration. * This and following structs pair ones in membership.rs. */ typedef uint32 Generation; #define INVALID_GENERATION 0 typedef struct SafekeeperId { NNodeId node_id; char host[MAXCONNINFO]; uint16 port; } SafekeeperId; /* Set of safekeepers. */ typedef struct MemberSet { uint32 len; /* number of members */ SafekeeperId *m; /* ids themselves */ } MemberSet; /* * Timeline safekeeper membership configuration as sent in the * protocol. */ typedef struct MembershipConfiguration { Generation generation; MemberSet members; /* Has 0 n_members in non joint conf. */ MemberSet new_members; } MembershipConfiguration; /* * Proposer <-> Acceptor messaging. */ typedef struct ProposerAcceptorMessage { uint8 tag; } ProposerAcceptorMessage; /* Initial Proposer -> Acceptor message */ typedef struct ProposerGreeting { ProposerAcceptorMessage pam; /* message tag */ /* * tenant/timeline ids as C strings with standard hex notation for ease of * printing. In principle they are not strictly needed as ttid is also * passed as libpq options. */ char *tenant_id; char *timeline_id; /* Full conf is carried to allow safekeeper switch */ MembershipConfiguration mconf; /* * pg_version and wal_seg_size are used for timeline creation until we * fully migrate to doing externally. systemId is only used as a sanity * cross check. */ uint32 pg_version; /* in PG_VERSION_NUM format */ uint64 system_id; /* Postgres system identifier. 
*/ uint32 wal_seg_size; } ProposerGreeting; /* protocol v2 variant, kept while wp supports it */ typedef struct ProposerGreetingV2 { uint64 tag; /* message tag */ uint32 protocolVersion; /* proposer-safekeeper protocol version */ uint32 pgVersion; pg_uuid_t proposerId; uint64 systemId; /* Postgres system identifier */ uint8 timeline_id[16]; /* Neon timeline id */ uint8 tenant_id[16]; TimeLineID timeline; uint32 walSegSize; } ProposerGreetingV2; typedef struct AcceptorProposerMessage { uint8 tag; } AcceptorProposerMessage; /* * Acceptor -> Proposer initial response: the highest term acceptor voted for, * its node id and configuration. */ typedef struct AcceptorGreeting { AcceptorProposerMessage apm; NNodeId nodeId; MembershipConfiguration mconf; term_t term; } AcceptorGreeting; /* * Proposer -> Acceptor vote request. */ typedef struct VoteRequest { ProposerAcceptorMessage pam; /* message tag */ Generation generation; /* membership conf generation */ term_t term; } VoteRequest; /* protocol v2 variant, kept while wp supports it */ typedef struct VoteRequestV2 { uint64 tag; term_t term; pg_uuid_t proposerId; /* for monitoring/debugging */ } VoteRequestV2; /* Element of term switching chain. */ typedef struct TermSwitchEntry { term_t term; XLogRecPtr lsn; } TermSwitchEntry; typedef struct TermHistory { uint32 n_entries; TermSwitchEntry *entries; } TermHistory; /* Vote itself, sent from safekeeper to proposer */ typedef struct VoteResponse { AcceptorProposerMessage apm; /* * Membership conf generation. It's not strictly required because on * mismatch safekeeper is expected to ERROR the connection, but let's * sanity check it. */ Generation generation; term_t term; uint8 voteGiven; /* * Safekeeper flush_lsn (end of WAL) + history of term switches allow * proposer to choose the most advanced one. 
*/ XLogRecPtr flushLsn; XLogRecPtr truncateLsn; /* minimal LSN which may be needed for* * recovery of some safekeeper */ TermHistory termHistory; } VoteResponse; /* * Proposer -> Acceptor message announcing proposer is elected and communicating * epoch history to it. */ typedef struct ProposerElected { AcceptorProposerMessage apm; Generation generation; /* membership conf generation */ term_t term; /* proposer will send since this point */ XLogRecPtr startStreamingAt; /* history of term switches up to this proposer */ TermHistory *termHistory; } ProposerElected; /* * Header of request with WAL message sent from proposer to safekeeper. */ typedef struct AppendRequestHeader { AcceptorProposerMessage apm; Generation generation; /* membership conf generation */ term_t term; /* term of the proposer */ XLogRecPtr beginLsn; /* start position of message in WAL */ XLogRecPtr endLsn; /* end position of message in WAL */ XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ /* * minimal LSN which may be needed for recovery of some safekeeper (end * lsn + 1 of last chunk streamed to everyone) */ XLogRecPtr truncateLsn; /* in the AppendRequest message, WAL data follows */ } AppendRequestHeader; /* protocol v2 variant, kept while wp supports it */ typedef struct AppendRequestHeaderV2 { uint64 tag; term_t term; /* term of the proposer */ /* * LSN since which current proposer appends WAL (begin_lsn of its first * record); determines epoch switch point. 
*/ XLogRecPtr epochStartLsn; XLogRecPtr beginLsn; /* start position of message in WAL */ XLogRecPtr endLsn; /* end position of message in WAL */ XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ /* * minimal LSN which may be needed for recovery of some safekeeper (end * lsn + 1 of last chunk streamed to everyone) */ XLogRecPtr truncateLsn; pg_uuid_t proposerId; /* for monitoring/debugging */ /* in the AppendRequest message, WAL data follows */ } AppendRequestHeaderV2; /* * Hot standby feedback received from replica */ typedef struct HotStandbyFeedback { TimestampTz ts; FullTransactionId xmin; FullTransactionId catalog_xmin; } HotStandbyFeedback; typedef struct PageserverFeedback { /* true if AppendResponse contains this feedback */ bool present; /* current size of the timeline on pageserver */ uint64 currentClusterSize; /* standby_status_update fields that safekeeper received from pageserver */ XLogRecPtr last_received_lsn; XLogRecPtr disk_consistent_lsn; XLogRecPtr remote_consistent_lsn; TimestampTz replytime; uint32 shard_number; /* true if the pageserver has detected data corruption in the timeline */ bool corruption_detected; } PageserverFeedback; /* BEGIN_HADRON */ /** * WAL proposer is the only backend that will update `sent_bytes` and `last_recorded_time_us`. * Once the `sent_bytes` reaches the limit, it puts backpressure on PG backends. * * A PG backend checks `should_limit` to see if it should hit backpressure. * - If yes, it also checks the `last_recorded_time_us` to see * if it's time to push more WALs. This is because the WAL proposer * only resets `should_limit` to 0 after it is notified about new WALs * which might take a while. */ typedef struct WalRateLimiter { /* The effective wal write rate. Could be changed dynamically based on whether PG has backpressure or not.*/ pg_atomic_uint32 effective_max_wal_bytes_per_second; /* If the value is 1, PG backends will hit backpressure until the time has past batch_end_time_us. 
*/ pg_atomic_uint32 should_limit; /* The number of bytes sent in the current second. */ uint64 sent_bytes; /* The timestamp when the write starts in the current batch. A batch is a time interval (e.g., )that we track and throttle writes. Most times a batch is 1s, but it could become larger if the PG overwrites the WALs and we will adjust the batch accordingly to compensate (e.g., if PG writes 10MB at once and max WAL write rate is 1MB/s, then the current batch will become 10s). */ pg_atomic_uint64 batch_start_time_us; /* The timestamp (in the future) that the current batch should end and accept more writes (after should_limit is set to 1). */ pg_atomic_uint64 batch_end_time_us; } WalRateLimiter; /* END_HADRON */ typedef struct WalproposerShmemState { pg_atomic_uint64 propEpochStartLsn; char donor_name[64]; char donor_conninfo[MAXCONNINFO]; XLogRecPtr donor_lsn; slock_t mutex; pg_atomic_uint64 mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; pg_atomic_uint64 currentClusterSize; /* last feedback from each shard */ PageserverFeedback shard_ps_feedback[MAX_SHARDS]; int num_shards; bool replica_promote; /* aggregated feedback with min LSNs across shards */ PageserverFeedback min_ps_feedback; /* BEGIN_HADRON */ /* The WAL rate limiter */ WalRateLimiter wal_rate_limiter; /* Number of safekeepers in the config */ uint32 num_safekeepers; /* Per-safekeeper status flags: 0=inactive, 1=active */ uint8 safekeeper_status[MAX_SAFEKEEPERS]; /* END_HADRON */ } WalproposerShmemState; /* * Report safekeeper state to proposer */ typedef struct AppendResponse { AcceptorProposerMessage apm; /* * Membership conf generation. It's not strictly required because on * mismatch safekeeper is expected to ERROR the connection, but let's * sanity check it. */ Generation generation; /* * Current term of the safekeeper; if it is higher than proposer's, the * compute is out of date. 
*/ term_t term; /* TODO: add comment */ XLogRecPtr flushLsn; /* Safekeeper reports back his awareness about which WAL is committed, as */ /* this is a criterion for walproposer --sync mode exit */ XLogRecPtr commitLsn; HotStandbyFeedback hs; /* Feedback received from pageserver includes standby_status_update fields */ /* and custom neon feedback. */ /* This part of the message is extensible. */ PageserverFeedback ps_feedback; } AppendResponse; /* PageserverFeedback is extensible part of the message that is parsed separately */ /* Other fields are fixed part */ #define APPENDRESPONSE_FIXEDPART_SIZE 56 struct WalProposer; typedef struct WalProposer WalProposer; /* * Descriptor of safekeeper */ typedef struct Safekeeper { WalProposer *wp; char const *host; char const *port; /* BEGIN_HADRON */ /* index of this safekeeper in the WalProposer array */ uint32 index; /* END_HADRON */ /* * connection string for connecting/reconnecting. * * May contain private information like password and should not be logged. */ char conninfo[MAXCONNINFO]; /* * Temporary buffer for the message being sent to the safekeeper. */ StringInfoData outbuf; /* * Streaming will start here; must be record boundary. */ XLogRecPtr startStreamingAt; XLogRecPtr streamingAt; /* current streaming position */ AppendRequestHeader appendRequest; /* request for sending to safekeeper */ SafekeeperState state; /* safekeeper state machine state */ SafekeeperActiveState active_state; TimestampTz latestMsgReceivedAt; /* when latest msg is received */ AcceptorGreeting greetResponse; /* acceptor greeting */ VoteResponse voteResponse; /* the vote */ AppendResponse appendResponse; /* feedback for master */ /* postgres-specific fields */ #ifndef WALPROPOSER_LIB /* * postgres protocol connection to the WAL acceptor * * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we * reach SS_ACTIVE; not before. */ WalProposerConn *conn; /* * WAL reader, allocated for each safekeeper. 
*/ NeonWALReader *xlogreader; /* * Position in wait event set. Equal to -1 if no event */ int eventPos; /* * Neon WAL reader position in wait event set, or -1 if no socket. Note * that event must be removed not only on error/failure, but also on * successful *local* read, as next read might again be remote, but with * different socket. */ int nwrEventPos; /* * Per libpq docs, during connection establishment socket might change, * remember here if it is stable to avoid readding to the event set if * possible. Must be reset whenever nwr event is deleted. */ bool nwrConnEstablished; #endif /* WalProposer library specifics */ #ifdef WALPROPOSER_LIB /* * Buffer for incoming messages. Usually Rust vector is stored here. * Caller is responsible for freeing the buffer. */ StringInfoData inbuf; #endif } Safekeeper; /* Re-exported PostgresPollingStatusType */ typedef enum { WP_CONN_POLLING_FAILED = 0, WP_CONN_POLLING_READING, WP_CONN_POLLING_WRITING, WP_CONN_POLLING_OK, /* * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused. * We've removed it here to avoid clutter. */ } WalProposerConnectPollStatusType; /* Re-exported ConnStatusType */ typedef enum { WP_CONNECTION_OK, WP_CONNECTION_BAD, /* * The original ConnStatusType has many more tags, but requests that they * not be relied upon (except for displaying to the user). We don't need * that extra functionality, so we collect them into a single tag here. */ WP_CONNECTION_IN_PROGRESS, } WalProposerConnStatusType; /* * Collection of hooks for walproposer, to call postgres functions, * read WAL and send it over the network. */ typedef struct walproposer_api { /* * Get WalproposerShmemState. This is used to store information about last * elected term. */ WalproposerShmemState *(*get_shmem_state) (WalProposer *wp); /* * Start receiving notifications about new WAL. This is an infinite loop * which calls WalProposerBroadcast() and WalProposerPoll() to send the * WAL. 
*/ void (*start_streaming) (WalProposer *wp, XLogRecPtr startpos); /* Get pointer to the latest available WAL. */ XLogRecPtr (*get_flush_rec_ptr) (WalProposer *wp); /* Update current donor info in WalProposer Shmem */ void (*update_donor) (WalProposer *wp, Safekeeper *donor, XLogRecPtr donor_lsn); /* Get current time. */ TimestampTz (*get_current_timestamp) (WalProposer *wp); /* Current error message, aka PQerrorMessage. */ char *(*conn_error_message) (Safekeeper *sk); /* Connection status, aka PQstatus. */ WalProposerConnStatusType (*conn_status) (Safekeeper *sk); /* Start the connection, aka PQconnectStart. */ void (*conn_connect_start) (Safekeeper *sk); /* Poll an asynchronous connection, aka PQconnectPoll. */ WalProposerConnectPollStatusType (*conn_connect_poll) (Safekeeper *sk); /* Send a blocking SQL query, aka PQsendQuery. */ bool (*conn_send_query) (Safekeeper *sk, char *query); /* Read the query result, aka PQgetResult. */ WalProposerExecStatusType (*conn_get_query_result) (Safekeeper *sk); /* Flush buffer to the network, aka PQflush. */ int (*conn_flush) (Safekeeper *sk); /* Reset sk state: close pq connection, deallocate xlogreader. */ void (*conn_finish) (Safekeeper *sk); /* * Try to read CopyData message from the safekeeper, aka PQgetCopyData. * * On success, the data is placed in *buf. It is valid until the next call * to this function. * * Returns PG_ASYNC_READ_FAIL on closed connection. */ PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount); /* Try to write CopyData message, aka PQputCopyData. */ PGAsyncWriteResult (*conn_async_write) (Safekeeper *sk, void const *buf, size_t size); /* Blocking CopyData write, aka PQputCopyData + PQflush. */ bool (*conn_blocking_write) (Safekeeper *sk, void const *buf, size_t size); /* * Download WAL before basebackup for logical walsenders from sk, if * needed */ bool (*recovery_download) (WalProposer *wp, Safekeeper *sk); /* Allocate WAL reader. 
*/ void (*wal_reader_allocate) (Safekeeper *sk); /* Read WAL from disk to buf. */ NeonWALReadResult (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg); /* Returns events to be awaited on WAL reader, if any. */ uint32 (*wal_reader_events) (Safekeeper *sk); /* Initialize event set. */ void (*init_event_set) (WalProposer *wp); /* Update events for an existing safekeeper connection. */ void (*update_event_set) (Safekeeper *sk, uint32 events); /* Configure wait event set for yield in SS_ACTIVE. */ void (*active_state_update_event_set) (Safekeeper *sk); /* Add a new safekeeper connection to the event set. */ void (*add_safekeeper_event_set) (Safekeeper *sk, uint32 events); /* Remove safekeeper connection from event set */ void (*rm_safekeeper_event_set) (Safekeeper *sk); /* * Wait until some event happens: - timeout is reached - socket event for * safekeeper connection - new WAL is available * * Returns 0 if timeout is reached, 1 if some event happened. Updates * events mask to indicate events and sets sk to the safekeeper which has * an event. * * On timeout, events is set to WL_NO_EVENTS. On socket event, events is * set to WL_SOCKET_READABLE and/or WL_SOCKET_WRITEABLE. When socket is * closed, events is set to WL_SOCKET_READABLE. * * WL_SOCKET_WRITEABLE is usually set only when we need to flush the * buffer. It can be returned only if caller asked for this event in the * last *_event_set call. */ int (*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events); /* Read random bytes. */ bool (*strong_random) (WalProposer *wp, void *buf, size_t len); /* * Get a basebackup LSN. Used to cross-validate with the latest available * LSN on the safekeepers. */ XLogRecPtr (*get_redo_start_lsn) (WalProposer *wp); /* * Finish sync safekeepers with the given LSN. This function should not * return and should exit the program. 
*/ void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn) __attribute__((noreturn)) ; /* * Called after every AppendResponse from the safekeeper. Used to * propagate backpressure feedback and to confirm WAL persistence (has * been commited on the quorum of safekeepers). */ void (*process_safekeeper_feedback) (WalProposer *wp, Safekeeper *sk); /* * Write a log message to the internal log processor. This is used only * when walproposer is compiled as a library. Otherwise, all logging is * handled by elog(). */ void (*log_internal) (WalProposer *wp, int level, const char *line); /* * BEGIN_HADRON * APIs manipulating shared memory state used for Safekeeper quorum health metrics. */ /* * Reset the safekeeper statuses in shared memory for metric purposes. */ void (*reset_safekeeper_statuses_for_metrics) (WalProposer *wp, uint32 num_safekeepers); /* * Update the safekeeper status in shared memory for metric purposes. */ void (*update_safekeeper_status_for_metrics) (WalProposer *wp, uint32 sk_index, uint8 status); /* END_HADRON */ } walproposer_api; /* * Configuration of the WAL proposer. */ typedef struct WalProposerConfig { /* hex-encoded TenantId cstr */ char *neon_tenant; /* hex-encoded TimelineId cstr */ char *neon_timeline; /* * Comma-separated list of safekeepers, in the following format: * host1:port1,host2:port2,host3:port3 * * This cstr should be editable. */ char *safekeepers_list; /* libpq connection info options. */ char *safekeeper_conninfo_options; /* * WalProposer reconnects to offline safekeepers once in this interval. * Time is in milliseconds. */ int safekeeper_reconnect_timeout; /* * WalProposer terminates the connection if it doesn't receive any message * from the safekeeper in this interval. Time is in milliseconds. */ int safekeeper_connection_timeout; /* * WAL segment size. Will be passed to safekeepers in greet request. Also * used to detect page headers. 
*/ int wal_segment_size; /* * If safekeeper was started in sync mode, walproposer will not subscribe * for new WAL and will exit when quorum of safekeepers will be synced to * the latest available LSN. */ bool syncSafekeepers; /* Will be passed to safekeepers in greet request. */ uint64 systemId; /* Will be passed to safekeepers in greet request. */ TimeLineID pgTimeline; int proto_version; #ifdef WALPROPOSER_LIB void *callback_data; #endif } WalProposerConfig; typedef enum { /* collecting greetings to determine term to campaign for */ WPS_COLLECTING_TERMS, /* campaing started, waiting for votes */ WPS_CAMPAIGN, /* successfully elected */ WPS_ELECTED, } WalProposerState; /* * WAL proposer state. */ typedef struct WalProposer { WalProposerConfig *config; WalProposerState state; /* Current walproposer membership configuration */ MembershipConfiguration mconf; /* * Parallels mconf.members with pointers to the member's slot in * safekeepers array of connections, or NULL if such member is not * connected. Helps to avoid looking slot per id through all * .safekeepers[] when doing quorum checks. */ Safekeeper *members_safekeepers[MAX_SAFEKEEPERS]; /* As above, but for new_members. */ Safekeeper *new_members_safekeepers[MAX_SAFEKEEPERS]; /* (n_safekeepers / 2) + 1. Used for static pre-generations quorum checks. */ int quorum; /* * Generation of the membership conf of which safekeepers[] are presumably * members. To make cplane life a bit easier and have more control in * tests with which sks walproposer gets connected neon.safekeepers GUC * doesn't provide full mconf, only the list of endpoints to connect to. 
* We still would like to know generation associated with it because 1) we * need some handle to enforce using generations in walproposer, and * non-zero value of this serves the purpose; 2) currently we don't do * that, but in theory walproposer can update list of safekeepers to * connect to upon receiving mconf from safekeepers, and generation number * must be checked to see which list is newer. */ Generation safekeepers_generation; /* Number of occupied slots in safekeepers[] */ int n_safekeepers; /* Safekeepers walproposer is connecting to. */ Safekeeper safekeeper[MAX_SAFEKEEPERS]; /* Current local TimeLineId in use */ TimeLineID localTimeLineID; /* WAL has been generated up to this point */ XLogRecPtr availableLsn; /* cached GetAcknowledgedByQuorumWALPosition result */ XLogRecPtr commitLsn; ProposerGreeting greetRequest; ProposerGreetingV2 greetRequestV2; /* Vote request for safekeeper */ VoteRequest voteRequest; /* * Minimal LSN which may be needed for recovery of some safekeeper, * record-aligned (first record which might not yet received by someone). */ XLogRecPtr truncateLsn; /* * Term of the proposer. We want our term to be highest and unique, so we * collect terms from safekeepers quorum, choose max and +1. After that * our term is fixed and must not change. If we observe that some * safekeeper has higher term, it means that we have another running * compute, so we must stop immediately. */ term_t propTerm; /* term history of the proposer */ TermHistory propTermHistory; /* epoch start lsn of the proposer */ XLogRecPtr propTermStartLsn; /* Most advanced acceptor epoch */ term_t donorLastLogTerm; /* Most advanced acceptor */ Safekeeper *donor; /* timeline globally starts at this LSN */ XLogRecPtr timelineStartLsn; /* number of successful connections over the lifetime of walproposer */ int n_connected; /* * Timestamp of the last reconnection attempt. 
Related to * config->safekeeper_reconnect_timeout */ TimestampTz last_reconnect_attempt; walproposer_api api; } WalProposer; extern WalProposer *WalProposerCreate(WalProposerConfig *config, walproposer_api api); extern void WalProposerStart(WalProposer *wp); extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPtr endpos); extern void WalProposerPoll(WalProposer *wp); extern void WalProposerFree(WalProposer *wp); extern WalproposerShmemState *GetWalpropShmemState(void); /* * WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to * recreate set from scratch, hence the export. */ extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events); extern TimeLineID walprop_pg_get_timeline_id(void); #define WPEVENT 1337 /* special log level for walproposer internal * events */ #define WP_LOG_PREFIX "[WP] " /* * wp_log is used in pure wp code (walproposer.c), allowing API callback to * catch logging. */ #ifdef WALPROPOSER_LIB extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...) pg_attribute_printf(3, 4); #define wp_log(elevel, fmt, ...) WalProposerLibLog(wp, elevel, fmt, ## __VA_ARGS__) #else #define wp_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__) #endif /* * And wpg_log is used all other (postgres specific) walproposer code, just * adding prefix. */ #define wpg_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__) #endif /* __NEON_WALPROPOSER_H__ */ ================================================ FILE: pgxn/neon/walproposer_compat.c ================================================ /* * Contains copied/adapted functions from libpq and some internal postgres functions. * This is needed to avoid linking to full postgres server installation. This file * is compiled as a part of libwalproposer static library. 
*/ #include "postgres.h" #include #include "libpq/pqformat.h" #include "miscadmin.h" #include "utils/datetime.h" #include "walproposer.h" void ExceptionalCondition(const char *conditionName, const char *fileName, int lineNumber) { fprintf(stderr, "ExceptionalCondition: %s:%d: %s\n", fileName, lineNumber, conditionName); fprintf(stderr, "aborting...\n"); exit(1); } void pq_copymsgbytes(StringInfo msg, char *buf, int datalen) { if (datalen < 0 || datalen > (msg->len - msg->cursor)) ExceptionalCondition("insufficient data left in message", __FILE__, __LINE__); memcpy(buf, &msg->data[msg->cursor], datalen); msg->cursor += datalen; } /* -------------------------------- * pq_getmsgint - get a binary integer from a message buffer * * Values are treated as unsigned. * -------------------------------- */ unsigned int pq_getmsgint(StringInfo msg, int b) { unsigned int result; unsigned char n8; uint16 n16; uint32 n32; switch (b) { case 1: pq_copymsgbytes(msg, (char *) &n8, 1); result = n8; break; case 2: pq_copymsgbytes(msg, (char *) &n16, 2); result = pg_ntoh16(n16); break; case 4: pq_copymsgbytes(msg, (char *) &n32, 4); result = pg_ntoh32(n32); break; default: fprintf(stderr, "unsupported integer size %d\n", b); ExceptionalCondition("unsupported integer size", __FILE__, __LINE__); result = 0; /* keep compiler quiet */ break; } return result; } /* -------------------------------- * pq_getmsgint64 - get a binary 8-byte int from a message buffer * * It is tempting to merge this with pq_getmsgint, but we'd have to make the * result int64 for all data widths --- that could be a big performance * hit on machines where int64 isn't efficient. 
* -------------------------------- */ int64 pq_getmsgint64(StringInfo msg) { uint64 n64; pq_copymsgbytes(msg, (char *) &n64, sizeof(n64)); return pg_ntoh64(n64); } /* -------------------------------- * pq_getmsgbyte - get a raw byte from a message buffer * -------------------------------- */ int pq_getmsgbyte(StringInfo msg) { if (msg->cursor >= msg->len) ExceptionalCondition("no data left in message", __FILE__, __LINE__); return (unsigned char) msg->data[msg->cursor++]; } /* -------------------------------- * pq_getmsgbytes - get raw data from a message buffer * * Returns a pointer directly into the message buffer; note this * may not have any particular alignment. * -------------------------------- */ const char * pq_getmsgbytes(StringInfo msg, int datalen) { const char *result; if (datalen < 0 || datalen > (msg->len - msg->cursor)) ExceptionalCondition("insufficient data left in message", __FILE__, __LINE__); result = &msg->data[msg->cursor]; msg->cursor += datalen; return result; } /* -------------------------------- * pq_getmsgrawstring - get a null-terminated text string - NO conversion * * Returns a pointer directly into the message buffer. * -------------------------------- */ const char * pq_getmsgrawstring(StringInfo msg) { char *str; int slen; str = &msg->data[msg->cursor]; /* * It's safe to use strlen() here because a StringInfo is guaranteed to * have a trailing null byte. But check we found a null inside the * message. 
*/ slen = strlen(str); if (msg->cursor + slen >= msg->len) ExceptionalCondition("invalid string in message", __FILE__, __LINE__); msg->cursor += slen + 1; return str; } /* -------------------------------- * pq_getmsgend - verify message fully consumed * -------------------------------- */ void pq_getmsgend(StringInfo msg) { if (msg->cursor != msg->len) ExceptionalCondition("invalid msg format", __FILE__, __LINE__); } /* -------------------------------- * pq_sendbytes - append raw data to a StringInfo buffer * -------------------------------- */ void pq_sendbytes(StringInfo buf, const void *data, int datalen) { /* use variant that maintains a trailing null-byte, out of caution */ appendBinaryStringInfo(buf, data, datalen); } /* -------------------------------- * pq_send_ascii_string - append a null-terminated text string (without conversion) * * This function intentionally bypasses encoding conversion, instead just * silently replacing any non-7-bit-ASCII characters with question marks. * It is used only when we are having trouble sending an error message to * the client with normal localization and encoding conversion. The caller * should already have taken measures to ensure the string is just ASCII; * the extra work here is just to make certain we don't send a badly encoded * string to the client (which might or might not be robust about that). * * NB: passed text string must be null-terminated, and so is the data * sent to the frontend. * -------------------------------- */ void pq_send_ascii_string(StringInfo buf, const char *str) { while (*str) { char ch = *str++; if (IS_HIGHBIT_SET(ch)) ch = '?'; appendStringInfoCharMacro(buf, ch); } appendStringInfoChar(buf, '\0'); } /* * Produce a C-string representation of a TimestampTz. * * This is mostly for use in emitting messages. 
*/ const char * timestamptz_to_str(TimestampTz t) { static char buf[MAXDATELEN + 1]; snprintf(buf, sizeof(buf), "TimestampTz(%ld)", t); return buf; } bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec) { TimestampTz diff = stop_time - start_time; return (diff >= msec * INT64CONST(1000)); } void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...) { char buf[1024]; va_list args; fmt = _(fmt); va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); wp->api.log_internal(wp, elevel, buf); } ================================================ FILE: pgxn/neon/walproposer_pg.c ================================================ /* * Implementation of postgres based walproposer disk and IO routines, i.e. the * real ones. The reason this is separate from walproposer.c is ability to * replace them with mocks, allowing to do simulation testing. * * Also contains initialization of postgres based walproposer. */ #include "postgres.h" #include #include #include #include "access/xact.h" #include "access/xlog.h" #include "access/xlogdefs.h" #include "access/xlogutils.h" #include "access/xloginsert.h" #if PG_VERSION_NUM >= 150000 #include "access/xlogrecovery.h" #endif #include "storage/fd.h" #include "storage/latch.h" #include "miscadmin.h" #include "pgstat.h" #include "access/xlog.h" #include "libpq/pqformat.h" #include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/walsender_private.h" #include "postmaster/bgworker.h" #include "postmaster/interrupt.h" #include "postmaster/postmaster.h" #include "storage/pmsignal.h" #include "storage/proc.h" #include "storage/ipc.h" #include "storage/lwlock.h" #include "storage/pg_shmem.h" #include "storage/shmem.h" #include "storage/spin.h" #include "tcop/tcopprot.h" #include "utils/builtins.h" #include "utils/guc.h" #include "utils/memutils.h" #include "utils/ps_status.h" #include "utils/timestamp.h" #include "libpq-fe.h" #include "libpqwalproposer.h" 
#include "neon.h"
#include "neon_perf_counters.h"
#include "neon_walreader.h"
#include "walproposer.h"

#define XLOG_HDR_SIZE (1 + 8 * 3)	/* 'w' + startPos + walEnd + timestamp */
#define XLOG_HDR_START_POS 1	/* offset of start position in walsender
								 * message header */

/* One megabyte, typed as XLogRecPtr so that LSN arithmetic is done in that type. */
#define MB ((XLogRecPtr)1024 * 1024)

#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"

/* GUCs */
char	   *wal_acceptors_list = "";
int			wal_acceptor_reconnect_timeout = 1000;
int			wal_acceptor_connection_timeout = 10000;
int			safekeeper_proto_version = 3;
char	   *safekeeper_conninfo_options = "";
/* BEGIN_HADRON */
/* Maximum WAL write rate in MB per second; -1 disables the limit. */
int			databricks_max_wal_mb_per_second = -1;
// During throttling, we limit the effective WAL write rate to 10KB per second.
// PG can still push some WAL to SK, but at a very low rate.
int			databricks_throttled_max_wal_bytes_per_second = 10 * 1024;
// The max sleep time of a batch. This is to make sure the rate limiter does not
// overshoot too much and block PG for a very long time.
// This is set to 5 minutes for now. PG can send as much as 10MB of WAL to SK in one batch,
// so this effectively caps the write rate to ~30KB/s in the worst case.
static uint64 kRateLimitMaxBatchUSecs = 300 * USECS_PER_SEC;
/* END_HADRON */

/* Set to true in the walproposer bgw.
*/ static bool am_walproposer; static WalproposerShmemState *walprop_shared; static WalProposerConfig walprop_config; static XLogRecPtr sentPtr = InvalidXLogRecPtr; static const walproposer_api walprop_pg; static volatile sig_atomic_t got_SIGUSR2 = false; static bool reported_sigusr2 = false; static XLogRecPtr standby_flush_lsn = InvalidXLogRecPtr; static XLogRecPtr standby_apply_lsn = InvalidXLogRecPtr; static HotStandbyFeedback agg_hs_feedback; static void nwp_register_gucs(void); static void assign_neon_safekeepers(const char *newval, void *extra); static uint64 backpressure_lag_impl(void); static uint64 hadron_backpressure_lag_impl(void); static uint64 startup_backpressure_wrap(void); static bool backpressure_throttling_impl(void); static void walprop_register_bgworker(void); static void walprop_pg_init_standalone_sync_safekeepers(void); static void walprop_pg_init_walsender(void); static void walprop_pg_init_bgworker(void); static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp); static void walprop_pg_load_libpqwalreceiver(void); static process_interrupts_callback_t PrevProcessInterruptsCallback = NULL; static void WalproposerShmemInit_SyncSafekeeper(void); static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd); static void WalSndLoop(WalProposer *wp); static void XLogBroadcastWalProposer(WalProposer *wp); static void add_nwr_event_set(Safekeeper *sk, uint32 events); static void update_nwr_event_set(Safekeeper *sk, uint32 events); static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk); static void CheckGracefulShutdown(WalProposer *wp); /* BEGIN_HADRON */ shardno_t get_num_shards(void); static int positive_mb_to_bytes(int mb) { if (mb <= 0) { return mb; } else { return mb * 1024 * 1024; } } /* END_HADRON */ static void init_walprop_config(bool syncSafekeepers) { walprop_config.neon_tenant = neon_tenant; walprop_config.neon_timeline = neon_timeline; /* WalProposerCreate scribbles directly on it, so pstrdup 
*/ walprop_config.safekeepers_list = pstrdup(wal_acceptors_list); walprop_config.safekeeper_conninfo_options = pstrdup(safekeeper_conninfo_options); walprop_config.safekeeper_reconnect_timeout = wal_acceptor_reconnect_timeout; walprop_config.safekeeper_connection_timeout = wal_acceptor_connection_timeout; walprop_config.wal_segment_size = wal_segment_size; walprop_config.syncSafekeepers = syncSafekeepers; if (!syncSafekeepers) walprop_config.systemId = GetSystemIdentifier(); else walprop_config.systemId = 0; walprop_config.pgTimeline = walprop_pg_get_timeline_id(); walprop_config.proto_version = safekeeper_proto_version; } /* * Entry point for `postgres --sync-safekeepers`. */ PGDLLEXPORT void WalProposerSync(int argc, char *argv[]) { WalProposer *wp; init_walprop_config(true); WalproposerShmemInit_SyncSafekeeper(); walprop_pg_init_standalone_sync_safekeepers(); walprop_pg_load_libpqwalreceiver(); wp = WalProposerCreate(&walprop_config, walprop_pg); WalProposerStart(wp); } /* * WAL proposer bgworker entry point. */ PGDLLEXPORT void WalProposerMain(Datum main_arg) { WalProposer *wp; if (*wal_acceptors_list == '\0') { wpg_log(WARNING, "Safekeepers list is empty"); return; } init_walprop_config(false); walprop_pg_init_bgworker(); am_walproposer = true; walprop_pg_load_libpqwalreceiver(); wp = WalProposerCreate(&walprop_config, walprop_pg); wp->localTimeLineID = GetWALInsertionTimeLine(); wp->last_reconnect_attempt = walprop_pg_get_current_timestamp(wp); walprop_pg_init_walsender(); WalProposerStart(wp); } /* * Initialize GUCs, bgworker, shmem and backpressure. 
*/ void pg_init_walproposer(void) { if (!process_shared_preload_libraries_in_progress) return; nwp_register_gucs(); delay_backend_us = &startup_backpressure_wrap; PrevProcessInterruptsCallback = ProcessInterruptsCallback; ProcessInterruptsCallback = backpressure_throttling_impl; walprop_register_bgworker(); } static void nwp_register_gucs(void) { DefineCustomStringVariable( "neon.safekeepers", "List of Neon WAL acceptors (host:port)", NULL, /* long_desc */ &wal_acceptors_list, /* valueAddr */ "", /* bootValue */ PGC_SIGHUP, GUC_LIST_INPUT, /* extensions can't use* * GUC_LIST_QUOTE */ NULL, assign_neon_safekeepers, NULL); DefineCustomStringVariable( "neon.safekeeper_conninfo_options", "libpq keyword parameters and values to apply to safekeeper connections", NULL, &safekeeper_conninfo_options, "", PGC_POSTMASTER, 0, NULL, NULL, NULL); DefineCustomIntVariable( "neon.safekeeper_reconnect_timeout", "Walproposer reconnects to offline safekeepers once in this interval.", NULL, &wal_acceptor_reconnect_timeout, 1000, 0, INT_MAX, /* default, min, max */ PGC_SIGHUP, /* context */ GUC_UNIT_MS, /* flags */ NULL, NULL, NULL); DefineCustomIntVariable( "neon.safekeeper_connect_timeout", "Connection or connection attempt to safekeeper is terminated if no message is received (or connection attempt doesn't finish) within this period.", NULL, &wal_acceptor_connection_timeout, 10000, 0, INT_MAX, PGC_SIGHUP, GUC_UNIT_MS, NULL, NULL, NULL); DefineCustomIntVariable( "neon.safekeeper_proto_version", "Version of compute <-> safekeeper protocol.", "Used while migrating from 2 to 3.", &safekeeper_proto_version, 3, 0, INT_MAX, PGC_POSTMASTER, 0, NULL, NULL, NULL); /* BEGIN_HADRON */ DefineCustomIntVariable( "databricks.max_wal_mb_per_second", "The maximum WAL MB per second allowed. If breached, sending WAL hit the backpressure. 
Setting to -1 disables the limit.", NULL, &databricks_max_wal_mb_per_second, -1, -1, INT_MAX, PGC_SUSET, GUC_UNIT_MB, NULL, NULL, NULL); DefineCustomIntVariable( "databricks.throttled_max_wal_bytes_per_second", "The maximum WAL bytes per second when PG is being throttled.", NULL, &databricks_throttled_max_wal_bytes_per_second, 10 * 1024, 0, INT_MAX, PGC_SUSET, GUC_UNIT_BYTE, NULL, NULL, NULL); /* END_HADRON */ } static int split_safekeepers_list(char *safekeepers_list, char *safekeepers[]) { int n_safekeepers = 0; char *curr_sk = safekeepers_list; for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma) { if (++n_safekeepers >= MAX_SAFEKEEPERS) { wpg_log(FATAL, "too many safekeepers"); } coma = strchr(coma, ','); safekeepers[n_safekeepers - 1] = curr_sk; if (coma != NULL) { *coma++ = '\0'; } } return n_safekeepers; } static char *split_off_safekeepers_generation(char *safekeepers_list, uint32 *generation) { char *endptr; if (strncmp(safekeepers_list, "g#", 2) != 0) { return safekeepers_list; } else { errno = 0; *generation = strtoul(safekeepers_list + 2, &endptr, 10); if (errno != 0) { wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m"); } if (*endptr != ':') { wp_log(FATAL, "failed to parse neon.safekeepers: no colon after generation"); } return endptr + 1; } } /* * Accept two coma-separated strings with list of safekeeper host:port addresses. * Split them into arrays and return false if two sets do not match, ignoring the order. 
*/
static bool
safekeepers_cmp(char *old, char *new)
{
	char	   *safekeepers_old[MAX_SAFEKEEPERS];
	char	   *safekeepers_new[MAX_SAFEKEEPERS];
	int			len_old = 0;
	int			len_new = 0;
	uint32		gen_old = INVALID_GENERATION;
	uint32		gen_new = INVALID_GENERATION;

	/* A differing generation prefix makes the lists different by itself. */
	old = split_off_safekeepers_generation(old, &gen_old);
	new = split_off_safekeepers_generation(new, &gen_new);

	if (gen_old != gen_new)
	{
		return false;
	}

	/* Both strings are modified in place by the split. */
	len_old = split_safekeepers_list(old, safekeepers_old);
	len_new = split_safekeepers_list(new, safekeepers_new);

	if (len_old != len_new)
	{
		return false;
	}

	/* Sort both arrays so that the element-wise comparison ignores the order. */
	qsort(&safekeepers_old, len_old, sizeof(char *), pg_qsort_strcmp);
	qsort(&safekeepers_new, len_new, sizeof(char *), pg_qsort_strcmp);

	for (int i = 0; i < len_new; i++)
	{
		if (strcmp(safekeepers_old[i], safekeepers_new[i]) != 0)
		{
			return false;
		}
	}

	return true;
}

/*
 * GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if
 * the list changed.
 */
static void
assign_neon_safekeepers(const char *newval, void *extra)
{
	char	   *newval_copy;
	char	   *oldval;

	/*
	 * Flag replica promotion in shared memory when a non-empty list is
	 * assigned while shared memory is attached and recovery is in progress.
	 */
	if (newval && *newval != '\0' && UsedShmemSegAddr && walprop_shared && RecoveryInProgress())
		walprop_shared->replica_promote = true;

	/* The restart logic below applies only to the walproposer process. */
	if (!am_walproposer)
		return;

	if (!newval)
	{
		/* should never happen */
		wpg_log(FATAL, "neon.safekeepers is empty");
	}

	/* Copy values because we will modify them in split_safekeepers_list() */
	newval_copy = pstrdup(newval);
	oldval = pstrdup(wal_acceptors_list);

	/*
	 * TODO: restarting through FATAL is stupid and introduces 1s delay before
	 * next bgw start. We should refactor walproposer to allow graceful exit
	 * and thus remove this delay. XXX: If you change anything here, sync with
	 * test_safekeepers_reconfigure_reorder.
	 */
	if (!safekeepers_cmp(oldval, newval_copy))
	{
		wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s",
				wal_acceptors_list, newval);
	}
	pfree(newval_copy);
	pfree(oldval);
}

/* BEGIN_HADRON */
/*
 * Backpressure decision that combines the replication lag with the WAL rate
 * limiter kept in walproposer shared memory.
 *
 * Returns 0 when the caller must not be throttled and a non-zero value
 * otherwise. While the rate limit GUC is disabled (-1) the result is simply
 * the lag computed by backpressure_lag_impl().
 */
static uint64
hadron_backpressure_lag_impl(void)
{
	struct WalproposerShmemState* state = NULL;
	uint64 lag = 0;

	if(max_cluster_size < 0){
		// If max cluster size is not set, we don't apply backpressure because we're reconfiguring PG.
		return 0;
	}

	lag = backpressure_lag_impl();
	state = GetWalpropShmemState();
	if ( state != NULL && databricks_max_wal_mb_per_second != -1 )
	{
		// Without lag the configured rate applies; with lag the (much lower) throttled rate does.
		int old_limit = pg_atomic_read_u32(&state->wal_rate_limiter.effective_max_wal_bytes_per_second);
		int new_limit = (lag == 0)? positive_mb_to_bytes(databricks_max_wal_mb_per_second) : databricks_throttled_max_wal_bytes_per_second;
		if( old_limit != new_limit )
		{
			uint64 batch_start_time = pg_atomic_read_u64(&state->wal_rate_limiter.batch_start_time_us);
			uint64 batch_end_time = pg_atomic_read_u64(&state->wal_rate_limiter.batch_end_time_us);
			// The rate limit has changed: publish it and reset the rate limiter's batch end time
			// so that it lies no later than one second after the batch start.
			pg_atomic_write_u32(&state->wal_rate_limiter.effective_max_wal_bytes_per_second, new_limit);
			pg_atomic_write_u64(&state->wal_rate_limiter.batch_end_time_us, Min(batch_start_time + USECS_PER_SEC, batch_end_time));
		}
		if( new_limit == -1 )
		{
			return 0;
		}

		if (pg_atomic_read_u32(&state->wal_rate_limiter.should_limit) == true)
		{
			TimestampTz now = GetCurrentTimestamp();
			struct WalRateLimiter *limiter = &state->wal_rate_limiter;
			uint64 batch_end_time = pg_atomic_read_u64(&limiter->batch_end_time_us);
			if ( now >= batch_end_time )
			{
				/*
				 * The backend has passed the batch end time and it's time to push more WAL.
				 * If the backends are pushing WAL too fast, the wal proposer will rate limit them again.
				 */
				uint32 expected = true;
				// Clear the flag only if it is still set; the outcome of the exchange is not checked.
				pg_atomic_compare_exchange_u32(&state->wal_rate_limiter.should_limit, &expected, false);
				return 0;
			}
			// Still inside the batch: report a non-zero value even if the lag itself is 0.
			return Max(lag, 1);
		}
		// The rate limiter decided not to throttle, so return 0.
		return 0;
	}

	return lag;
}
/* END_HADRON */

/*
 * Check if we need to suspend inserts because of lagging replication.
 *
 * Returns 0 when no limit is exceeded, or when all three max_replication_*_lag
 * settings are disabled (not greater than zero). Otherwise returns the number
 * of bytes by which the local flush LSN exceeds the first violated limit; the
 * limits are checked in the order write, flush, apply and are given in MB.
 * A reported position equal to InvalidXLogRecPtr is skipped.
 */
static uint64
backpressure_lag_impl(void)
{
	if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0)
	{
		XLogRecPtr	writePtr;
		XLogRecPtr	flushPtr;
		XLogRecPtr	applyPtr;
#if PG_VERSION_NUM >= 150000
		XLogRecPtr	myFlushLsn = GetFlushRecPtr(NULL);
#else
		XLogRecPtr	myFlushLsn = GetFlushRecPtr();
#endif
		replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
		elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X",
			 LSN_FORMAT_ARGS(myFlushLsn),
			 LSN_FORMAT_ARGS(writePtr),
			 LSN_FORMAT_ARGS(flushPtr),
			 LSN_FORMAT_ARGS(applyPtr));

		if (lakebase_mode)
		{
			// In case PG does not have the shard map initialized, we assume PG always has 1 shard at minimum.
			shardno_t num_shards = Max(1, get_num_shards());
			// In this mode every limit is multiplied by the number of shards.
			int tenant_max_replication_apply_lag = num_shards * max_replication_apply_lag;
			int tenant_max_replication_flush_lag = num_shards * max_replication_flush_lag;
			int tenant_max_replication_write_lag = num_shards * max_replication_write_lag;

			if ((writePtr != InvalidXLogRecPtr && tenant_max_replication_write_lag > 0
				&& myFlushLsn > writePtr + tenant_max_replication_write_lag * MB))
			{
				return (myFlushLsn - writePtr - tenant_max_replication_write_lag * MB);
			}

			if ((flushPtr != InvalidXLogRecPtr && tenant_max_replication_flush_lag > 0
				&& myFlushLsn > flushPtr + tenant_max_replication_flush_lag * MB))
			{
				return (myFlushLsn - flushPtr - tenant_max_replication_flush_lag * MB);
			}

			if ((applyPtr != InvalidXLogRecPtr && tenant_max_replication_apply_lag > 0
				&& myFlushLsn > applyPtr + tenant_max_replication_apply_lag * MB))
			{
				return (myFlushLsn - applyPtr - tenant_max_replication_apply_lag * MB);
			}
		}
		else
		{
			/* Same checks as above, with the limits taken as configured. */
			if ((writePtr != InvalidXLogRecPtr && max_replication_write_lag > 0
				&& myFlushLsn > writePtr + max_replication_write_lag * MB))
			{
				return (myFlushLsn - writePtr - max_replication_write_lag * MB);
			}

			if ((flushPtr != InvalidXLogRecPtr && max_replication_flush_lag > 0
				&& myFlushLsn > flushPtr + max_replication_flush_lag * MB))
			{
				return (myFlushLsn - flushPtr - max_replication_flush_lag * MB);
			}

			if ((applyPtr != InvalidXLogRecPtr && max_replication_apply_lag > 0
				&& myFlushLsn > applyPtr + max_replication_apply_lag * MB))
			{
				return (myFlushLsn - applyPtr - max_replication_apply_lag * MB);
			}
		}
	}
	return 0;
}

/*
 * We don't apply backpressure when we're the postmaster, or the startup
 * process, because in postmaster we can't apply backpressure, and in
 * the startup process we can't afford to slow down.
*/ static uint64 startup_backpressure_wrap(void) { if (AmStartupProcess() || !IsUnderPostmaster) return 0; delay_backend_us = &hadron_backpressure_lag_impl; return hadron_backpressure_lag_impl(); } /* * WalproposerShmemSize --- report amount of shared memory space needed */ static Size WalproposerShmemSize(void) { return sizeof(WalproposerShmemState); } void WalproposerShmemInit(void) { bool found; walprop_shared = ShmemInitStruct("Walproposer shared state", sizeof(WalproposerShmemState), &found); if (!found) { memset(walprop_shared, 0, WalproposerShmemSize()); SpinLockInit(&walprop_shared->mutex); pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0); pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0); pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0); /* BEGIN_HADRON */ pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.effective_max_wal_bytes_per_second, -1); pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0); pg_atomic_init_u64(&walprop_shared->wal_rate_limiter.batch_start_time_us, 0); pg_atomic_init_u64(&walprop_shared->wal_rate_limiter.batch_end_time_us, 0); /* END_HADRON */ } } static void WalproposerShmemInit_SyncSafekeeper(void) { walprop_shared = palloc(WalproposerShmemSize()); memset(walprop_shared, 0, WalproposerShmemSize()); SpinLockInit(&walprop_shared->mutex); pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0); pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0); pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); /* BEGIN_HADRON */ pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.effective_max_wal_bytes_per_second, -1); pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0); pg_atomic_init_u64(&walprop_shared->wal_rate_limiter.batch_start_time_us, 0); pg_atomic_init_u64(&walprop_shared->wal_rate_limiter.batch_end_time_us, 0); /* END_HADRON */ } #define BACK_PRESSURE_DELAY 10000L // 0.01 sec 
static bool backpressure_throttling_impl(void) { uint64 lag; TimestampTz start, stop; bool retry = false; char *new_status = NULL; const char *old_status; int len; if (PointerIsValid(PrevProcessInterruptsCallback)) retry = PrevProcessInterruptsCallback(); /* * Don't throttle read only transactions or wal sender. Do throttle CREATE * INDEX CONCURRENTLY, however. It performs some stages outside a * transaction, even though it writes a lot of WAL. Check PROC_IN_SAFE_IC * flag to cover that case. */ if (am_walsender || (!(MyProc->statusFlags & PROC_IN_SAFE_IC) && !TransactionIdIsValid(GetCurrentTransactionIdIfAny()))) return retry; /* Calculate replicas lag */ lag = hadron_backpressure_lag_impl(); if (lag == 0) return retry; old_status = get_ps_display(&len); new_status = (char *) palloc(len + 64 + 1); memcpy(new_status, old_status, len); snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag); set_ps_display(new_status); new_status[len] = '\0'; /* truncate off " backpressure ..." to later * reset the ps */ elog(DEBUG2, "backpressure throttling: lag %lu", lag); start = GetCurrentTimestamp(); pg_usleep(BACK_PRESSURE_DELAY); stop = GetCurrentTimestamp(); pg_atomic_add_fetch_u64(&walprop_shared->backpressureThrottlingTime, stop - start); /* Reset ps display */ set_ps_display(new_status); pfree(new_status); return true; } uint64 BackpressureThrottlingTime(void) { return pg_atomic_read_u64(&walprop_shared->backpressureThrottlingTime); } /* * Register a background worker proposing WAL to wal acceptors. * We start walproposer bgworker even for replicas in order to support possible replica promotion. * When pg_promote() function is called, then walproposer bgworker registered with BgWorkerStart_RecoveryFinished * is automatically launched when promotion is completed. 
*/ static void walprop_register_bgworker(void) { BackgroundWorker bgw; memset(&bgw, 0, sizeof(bgw)); bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); bgw.bgw_restart_time = 1; bgw.bgw_notify_pid = 0; bgw.bgw_main_arg = (Datum) 0; RegisterBackgroundWorker(&bgw); } /* shmem handling */ /* * shmem_request hook: request additional shared resources. We'll allocate or * attach to the shared resources in WalproposerShmemInit(). */ void WalproposerShmemRequest(void) { RequestAddinShmemSpace(WalproposerShmemSize()); } WalproposerShmemState * GetWalpropShmemState(void) { Assert(walprop_shared != NULL); return walprop_shared; } static WalproposerShmemState * walprop_pg_get_shmem_state(WalProposer *wp) { Assert(walprop_shared != NULL); return walprop_shared; } /* * Record new ps_feedback in the array with shards and update min_feedback. */ static PageserverFeedback record_pageserver_feedback(PageserverFeedback *ps_feedback, shardno_t num_shards) { PageserverFeedback min_feedback; Assert(ps_feedback->present); Assert(ps_feedback->shard_number < MAX_SHARDS); Assert(ps_feedback->shard_number < num_shards); // Begin Hadron: Record any corruption signal from the pageserver first. if (ps_feedback->corruption_detected) { pg_atomic_write_u32(&databricks_metrics_shared->ps_corruption_detected, 1); } SpinLockAcquire(&walprop_shared->mutex); // Hadron: Update the num_shards from the source-of-truth (shard map) lazily when we receive // a new pageserver feedback. 
walprop_shared->num_shards = Max(walprop_shared->num_shards, num_shards); /* Update the feedback */ memcpy(&walprop_shared->shard_ps_feedback[ps_feedback->shard_number], ps_feedback, sizeof(PageserverFeedback)); /* Calculate min LSNs */ memcpy(&min_feedback, ps_feedback, sizeof(PageserverFeedback)); for (int i = 0; i < walprop_shared->num_shards; i++) { PageserverFeedback *feedback = &walprop_shared->shard_ps_feedback[i]; if (feedback->present) { if (min_feedback.last_received_lsn == InvalidXLogRecPtr || feedback->last_received_lsn < min_feedback.last_received_lsn) min_feedback.last_received_lsn = feedback->last_received_lsn; if (min_feedback.disk_consistent_lsn == InvalidXLogRecPtr || feedback->disk_consistent_lsn < min_feedback.disk_consistent_lsn) min_feedback.disk_consistent_lsn = feedback->disk_consistent_lsn; if (min_feedback.remote_consistent_lsn == InvalidXLogRecPtr || feedback->remote_consistent_lsn < min_feedback.remote_consistent_lsn) min_feedback.remote_consistent_lsn = feedback->remote_consistent_lsn; } } /* Copy min_feedback back to shmem */ memcpy(&walprop_shared->min_ps_feedback, &min_feedback, sizeof(PageserverFeedback)); SpinLockRelease(&walprop_shared->mutex); return min_feedback; } void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) { SpinLockAcquire(&walprop_shared->mutex); *writeLsn = walprop_shared->min_ps_feedback.last_received_lsn; *flushLsn = walprop_shared->min_ps_feedback.disk_consistent_lsn; *applyLsn = walprop_shared->min_ps_feedback.remote_consistent_lsn; SpinLockRelease(&walprop_shared->mutex); } /* * Start walproposer streaming replication */ static void walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos) { StartReplicationCmd cmd; wpg_log(LOG, "WAL proposer starts streaming at %X/%X", LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; cmd.timeline = wp->config->pgTimeline; cmd.startpoint = startpos; StartProposerReplication(wp, &cmd); } static void 
walprop_pg_init_walsender(void) { am_walsender = true; InitWalSender(); InitProcessPhase2(); /* Create replication slot for WAL proposer if not exists */ if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) { #if PG_MAJORVERSION_NUM >= 17 ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false, false, false); #else ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); #endif ReplicationSlotReserveWal(); /* Write this slot to disk */ ReplicationSlotMarkDirty(); ReplicationSlotSave(); ReplicationSlotRelease(); } } static void walprop_pg_init_standalone_sync_safekeepers(void) { struct stat stat_buf; #if PG_VERSION_NUM < 150000 ThisTimeLineID = 1; #endif /* * Initialize postmaster_alive_fds as WaitEventSet checks them. * * Copied from InitPostmasterDeathWatchHandle() */ if (pipe(postmaster_alive_fds) < 0) ereport(FATAL, (errcode_for_file_access(), errmsg_internal("could not create pipe to monitor postmaster death: %m"))); if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) ereport(FATAL, (errcode_for_socket_access(), errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); ChangeToDataDir(); /* Create pg_wal directory, if it doesn't exist */ if (stat(XLOGDIR, &stat_buf) != 0) { ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR))); if (MakePGDirectory(XLOGDIR) < 0) { ereport(ERROR, (errcode_for_file_access(), errmsg("could not create directory \"%s\": %m", XLOGDIR))); exit(1); } } BackgroundWorkerUnblockSignals(); } /* * We pretend to be a walsender process, and the lifecycle of a walsender is * slightly different than other procesess. At shutdown, walsender processes * stay alive until the very end, after the checkpointer has written the * shutdown checkpoint. When the checkpointer exits, the postmaster sends all * remaining walsender processes SIGUSR2. On receiving SIGUSR2, we try to send * the remaining WAL, and then exit. 
This ensures that the checkpoint record * reaches durable storage (in safekeepers), before the server shuts down * completely. */ static void walprop_sigusr2(SIGNAL_ARGS) { int save_errno = errno; got_SIGUSR2 = true; SetLatch(MyLatch); errno = save_errno; } static void walprop_pg_init_bgworker(void) { #if PG_VERSION_NUM >= 150000 TimeLineID tli; #endif /* Establish signal handlers. */ pqsignal(SIGUSR1, procsignal_sigusr1_handler); pqsignal(SIGHUP, SignalHandlerForConfigReload); pqsignal(SIGTERM, die); pqsignal(SIGUSR2, walprop_sigusr2); BackgroundWorkerUnblockSignals(); application_name = (char *) "walproposer"; /* for * synchronous_standby_names */ #if PG_VERSION_NUM >= 150000 /* FIXME pass proper tli to WalProposerInit ? */ GetXLogReplayRecPtr(&tli); #else GetXLogReplayRecPtr(&ThisTimeLineID); #endif } static XLogRecPtr walprop_pg_get_flush_rec_ptr(WalProposer *wp) { #if PG_MAJORVERSION_NUM < 15 return GetFlushRecPtr(); #else return GetFlushRecPtr(NULL); #endif } static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp) { return GetCurrentTimestamp(); } TimeLineID walprop_pg_get_timeline_id(void) { #if PG_VERSION_NUM >= 150000 /* FIXME don't use hardcoded timeline id */ return 1; #else return ThisTimeLineID; #endif } static void walprop_pg_load_libpqwalreceiver(void) { load_file("libpqwalreceiver", false); if (WalReceiverFunctions == NULL) wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly"); } static void walprop_pg_update_donor(WalProposer *wp, Safekeeper *donor, XLogRecPtr donor_lsn) { WalproposerShmemState *wps = wp->api.get_shmem_state(wp); char donor_name[64]; pg_snprintf(donor_name, sizeof(donor_name), "%s:%s", donor->host, donor->port); SpinLockAcquire(&wps->mutex); memcpy(wps->donor_name, donor_name, sizeof(donor_name)); memcpy(wps->donor_conninfo, donor->conninfo, sizeof(donor->conninfo)); wps->donor_lsn = donor_lsn; SpinLockRelease(&wps->mutex); } /* Helper function */ static bool ensure_nonblocking_status(WalProposerConn *conn, 
bool is_nonblocking) { /* If we're already correctly blocking or nonblocking, all good */ if (is_nonblocking == conn->is_nonblocking) return true; /* Otherwise, set it appropriately */ if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1) return false; conn->is_nonblocking = is_nonblocking; return true; } /* Exported function definitions */ static char * walprop_error_message(Safekeeper *sk) { return PQerrorMessage(sk->conn->pg_conn); } static WalProposerConnStatusType walprop_status(Safekeeper *sk) { switch (PQstatus(sk->conn->pg_conn)) { case CONNECTION_OK: return WP_CONNECTION_OK; case CONNECTION_BAD: return WP_CONNECTION_BAD; default: return WP_CONNECTION_IN_PROGRESS; } } WalProposerConn * libpqwp_connect_start(char *conninfo) { PGconn *pg_conn; WalProposerConn *conn; const char *keywords[3]; const char *values[3]; int n; char *password = neon_auth_token; /* * Connect using the given connection string. If the NEON_AUTH_TOKEN * environment variable was set, use that as the password. * * The connection options are parsed in the order they're given, so when * we set the password before the connection string, the connection string * can override the password from the env variable. Seems useful, although * we don't currently use that capability anywhere. */ n = 0; if (password) { keywords[n] = "password"; values[n] = password; n++; } keywords[n] = "dbname"; values[n] = conninfo; n++; keywords[n] = NULL; values[n] = NULL; n++; pg_conn = PQconnectStartParams(keywords, values, 1); /* * "If the result is null, then libpq has been unable to allocate a new * PGconn structure" */ if (!pg_conn) wpg_log(FATAL, "failed to allocate new PGconn object"); /* * And in theory this allocation can fail as well, but it's incredibly * unlikely if we just successfully allocated a PGconn. * * palloc will exit on failure though, so there's not much we could do if * it *did* fail. 
*/ conn = (WalProposerConn*)MemoryContextAllocZero(TopMemoryContext, sizeof(WalProposerConn)); conn->pg_conn = pg_conn; conn->is_nonblocking = false; /* connections always start in blocking * mode */ conn->recvbuf = NULL; return conn; } static void walprop_connect_start(Safekeeper *sk) { Assert(sk->conn == NULL); sk->conn = libpqwp_connect_start(sk->conninfo); } static WalProposerConnectPollStatusType walprop_connect_poll(Safekeeper *sk) { WalProposerConnectPollStatusType return_val; switch (PQconnectPoll(sk->conn->pg_conn)) { case PGRES_POLLING_FAILED: return_val = WP_CONN_POLLING_FAILED; break; case PGRES_POLLING_READING: return_val = WP_CONN_POLLING_READING; break; case PGRES_POLLING_WRITING: return_val = WP_CONN_POLLING_WRITING; break; case PGRES_POLLING_OK: return_val = WP_CONN_POLLING_OK; break; /* * There's a comment at its source about this constant being * unused. We'll expect it's never returned. */ case PGRES_POLLING_ACTIVE: wpg_log(FATAL, "unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); /* * This return is never actually reached, but it's here to make * the compiler happy */ return WP_CONN_POLLING_FAILED; default: Assert(false); return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ } return return_val; } extern bool libpqwp_send_query(WalProposerConn *conn, char *query) { /* * We need to be in blocking mode for sending the query to run without * requiring a call to PQflush */ if (!ensure_nonblocking_status(conn, false)) return false; /* PQsendQuery returns 1 on success, 0 on failure */ if (!PQsendQuery(conn->pg_conn, query)) return false; return true; } static bool walprop_send_query(Safekeeper *sk, char *query) { return libpqwp_send_query(sk->conn, query); } WalProposerExecStatusType libpqwp_get_query_result(WalProposerConn *conn) { PGresult *result; WalProposerExecStatusType return_val; /* Marker variable if we need to log an unexpected success result */ char *unexpected_success = NULL; /* Consume any input that we might be 
missing */ if (!PQconsumeInput(conn->pg_conn)) return WP_EXEC_FAILED; if (PQisBusy(conn->pg_conn)) return WP_EXEC_NEEDS_INPUT; result = PQgetResult(conn->pg_conn); /* * PQgetResult returns NULL only if getting the result was successful & * there's no more of the result to get. */ if (!result) { wpg_log(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); return WP_EXEC_UNEXPECTED_SUCCESS; } /* Helper macro to reduce boilerplate */ #define UNEXPECTED_SUCCESS(msg) \ return_val = WP_EXEC_UNEXPECTED_SUCCESS; \ unexpected_success = msg; \ break; switch (PQresultStatus(result)) { /* "true" success case */ case PGRES_COPY_BOTH: return_val = WP_EXEC_SUCCESS_COPYBOTH; break; /* Unexpected success case */ case PGRES_EMPTY_QUERY: UNEXPECTED_SUCCESS("empty query return"); case PGRES_COMMAND_OK: UNEXPECTED_SUCCESS("data-less command end"); case PGRES_TUPLES_OK: UNEXPECTED_SUCCESS("tuples return"); case PGRES_COPY_OUT: UNEXPECTED_SUCCESS("'Copy Out' response"); case PGRES_COPY_IN: UNEXPECTED_SUCCESS("'Copy In' response"); case PGRES_SINGLE_TUPLE: UNEXPECTED_SUCCESS("single tuple return"); case PGRES_PIPELINE_SYNC: UNEXPECTED_SUCCESS("pipeline sync point"); /* Failure cases */ case PGRES_BAD_RESPONSE: case PGRES_NONFATAL_ERROR: case PGRES_FATAL_ERROR: case PGRES_PIPELINE_ABORTED: return_val = WP_EXEC_FAILED; break; default: Assert(false); return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ } if (unexpected_success) wpg_log(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); return return_val; } static WalProposerExecStatusType walprop_get_query_result(Safekeeper *sk) { return libpqwp_get_query_result(sk->conn); } static pgsocket walprop_socket(Safekeeper *sk) { return PQsocket(sk->conn->pg_conn); } static int walprop_flush(Safekeeper *sk) { return (PQflush(sk->conn->pg_conn)); } /* Like libpqrcv_receive. *buf is valid until the next call. 
*/ PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount) { int rawlen; if (conn->recvbuf != NULL) { PQfreemem(conn->recvbuf); conn->recvbuf = NULL; } /* Try to receive a CopyData message */ rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true); if (rawlen == 0) { /* Try consuming some data. */ if (!PQconsumeInput(conn->pg_conn)) { *amount = 0; *buf = NULL; return PG_ASYNC_READ_FAIL; } /* Now that we've consumed some input, try again */ rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true); } /* * The docs for PQgetCopyData list the return values as: 0 if the copy is * still in progress, but no "complete row" is available -1 if the copy is * done -2 if an error occurred (> 0) if it was successful; that value is * the amount transferred. * * The protocol we use between walproposer and safekeeper means that we * *usually* wouldn't expect to see that the copy is done, but this can * sometimes be triggered by the server returning an ErrorResponse (which * also happens to have the effect that the copy is done). */ switch (rawlen) { case 0: *amount = 0; *buf = NULL; return PG_ASYNC_READ_TRY_AGAIN; case -1: { /* * If we get -1, it's probably because of a server error; the * safekeeper won't normally send a CopyDone message. 
* * We can check PQgetResult to make sure that the server * failed; it'll always result in PGRES_FATAL_ERROR */ ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); if (status != PGRES_FATAL_ERROR) wpg_log(FATAL, "unexpected result status %d after failed PQgetCopyData", status); /* * If there was actually an error, it'll be properly reported * by calls to PQerrorMessage -- we don't have to do anything * else */ *amount = 0; *buf = NULL; return PG_ASYNC_READ_FAIL; } case -2: *amount = 0; *buf = NULL; return PG_ASYNC_READ_FAIL; default: /* Positive values indicate the size of the returned result */ *amount = rawlen; *buf = conn->recvbuf; return PG_ASYNC_READ_SUCCESS; } } /* * Receive a message from the safekeeper. * * On success, the data is placed in *buf. It is valid until the next call * to this function. */ static PGAsyncReadResult walprop_async_read(Safekeeper *sk, char **buf, int *amount) { return libpqwp_async_read(sk->conn, buf, amount); } static PGAsyncWriteResult walprop_async_write(Safekeeper *sk, void const *buf, size_t size) { int result; /* If we aren't in non-blocking mode, switch to it. */ if (!ensure_nonblocking_status(sk->conn, true)) return PG_ASYNC_WRITE_FAIL; /* * The docs for PQputcopyData list the return values as: 1 if the data was * queued, 0 if it was not queued because of full buffers, or -1 if an * error occurred */ result = PQputCopyData(sk->conn->pg_conn, buf, size); /* * We won't get a result of zero because walproposer always empties the * connection's buffers before sending more */ Assert(result != 0); switch (result) { case 1: /* good -- continue */ break; case -1: return PG_ASYNC_WRITE_FAIL; default: wpg_log(FATAL, "invalid return %d from PQputCopyData", result); } /* * After queueing the data, we still need to flush to get it to send. This * might take multiple tries, but we don't want to wait around until it's * done. 
* * PQflush has the following returns (directly quoting the docs): 0 if * sucessful, 1 if it was unable to send all the data in the send queue * yet -1 if it failed for some reason */ switch (result = PQflush(sk->conn->pg_conn)) { case 0: return PG_ASYNC_WRITE_SUCCESS; case 1: return PG_ASYNC_WRITE_TRY_FLUSH; case -1: return PG_ASYNC_WRITE_FAIL; default: wpg_log(FATAL, "invalid return %d from PQflush", result); } } /* * This function is very similar to walprop_async_write. For more * information, refer to the comments there. */ static bool walprop_blocking_write(Safekeeper *sk, void const *buf, size_t size) { int result; /* If we are in non-blocking mode, switch out of it. */ if (!ensure_nonblocking_status(sk->conn, false)) return false; if ((result = PQputCopyData(sk->conn->pg_conn, buf, size)) == -1) return false; Assert(result == 1); /* Because the connection is non-blocking, flushing returns 0 or -1 */ if ((result = PQflush(sk->conn->pg_conn)) == -1) return false; Assert(result == 0); return true; } void libpqwp_disconnect(WalProposerConn *conn) { if (conn->recvbuf != NULL) PQfreemem(conn->recvbuf); PQfinish(conn->pg_conn); pfree(conn); } static void walprop_finish(Safekeeper *sk) { if (sk->conn) { libpqwp_disconnect(sk->conn); sk->conn = NULL; } /* free xlogreader */ if (sk->xlogreader) { NeonWALReaderFree(sk->xlogreader); sk->xlogreader = NULL; } rm_safekeeper_event_set(sk, false); } /* * Subscribe for new WAL and stream it in the loop to safekeepers. * * At the moment, this never returns, but an ereport(ERROR) will take us back * to the main loop. */ static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) { XLogRecPtr FlushPtr; __attribute__((unused)) TimeLineID currTLI; #if PG_VERSION_NUM < 150000 if (ThisTimeLineID == 0) ThisTimeLineID = 1; #endif /* * We assume here that we're logging enough information in the WAL for * log-shipping, since this is checked in PostmasterMain(). 
* * NOTE: wal_level can only change at shutdown, so in most cases it is * difficult for there to be WAL data that we can still see that was * written at wal_level='minimal'. */ if (cmd->slotname) { ReplicationSlotAcquire(cmd->slotname, true); if (SlotIsLogical(MyReplicationSlot)) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("cannot use a logical replication slot for physical replication"))); /* * We don't need to verify the slot's restart_lsn here; instead we * rely on the caller requesting the starting point to use. If the * WAL segment doesn't exist, we'll fail later. */ } /* * Select the timeline. If it was given explicitly by the client, use * that. Otherwise use the timeline of the last replayed record, which is * kept in ThisTimeLineID. * * Neon doesn't currently use PG Timelines, but it may in the future, so * we keep this code around to lighten the load for when we need it. */ #if PG_VERSION_NUM >= 150000 FlushPtr = GetFlushRecPtr(&currTLI); #else FlushPtr = GetFlushRecPtr(); currTLI = ThisTimeLineID; #endif /* * XXX: Move straight to STOPPING state, skipping the STREAMING state. * * This is a bit weird. Normal walsenders stay in STREAMING state, until * the checkpointer signals them that it is about to start writing the * shutdown checkpoint. The walsenders acknowledge that they have received * that signal by switching to STOPPING state. That tells the walsenders * that they must not write any new WAL. * * However, we cannot easily intercept that signal from the checkpointer. * It's sent by WalSndInitStopping(), using * SendProcSignal(PROCSIGNAL_WALSND_INIT_STOPPING). It's received by * HandleWalSndInitStopping, which sets a process-local got_STOPPING flag. * However, that's all private to walsender.c. * * We don't need to do anything special upon receiving the signal, the * walproposer doesn't write any WAL anyway, so we skip the STREAMING * state and go directly to STOPPING mode. 
That way, the checkpointer * won't wait for us. */ WalSndSetState(WALSNDSTATE_STOPPING); /* * Don't allow a request to stream from a future point in WAL that hasn't * been flushed to disk in this server yet. */ if (FlushPtr < cmd->startpoint) { ereport(ERROR, (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", LSN_FORMAT_ARGS(cmd->startpoint), LSN_FORMAT_ARGS(FlushPtr)))); } /* Start streaming from the requested point */ sentPtr = cmd->startpoint; /* Initialize shared memory status, too */ SpinLockAcquire(&MyWalSnd->mutex); MyWalSnd->sentPtr = sentPtr; SpinLockRelease(&MyWalSnd->mutex); SyncRepInitConfig(); /* Infinite send loop, never returns */ WalSndLoop(wp); WalSndSetState(WALSNDSTATE_STARTUP); if (cmd->slotname) ReplicationSlotRelease(); } /* * Main loop that waits for LSN updates and calls the walproposer. * Synchronous replication sets latch in WalSndWakeup at walsender.c */ static void WalSndLoop(WalProposer *wp) { /* Clear any already-pending wakeups */ ResetLatch(MyLatch); for (;;) { CHECK_FOR_INTERRUPTS(); XLogBroadcastWalProposer(wp); WalProposerPoll(wp); } } /* * Notify walproposer about the new WAL position. */ static void XLogBroadcastWalProposer(WalProposer *wp) { XLogRecPtr startptr; XLogRecPtr endptr; struct WalproposerShmemState *state = NULL; TimestampTz now = 0; int effective_max_wal_bytes_per_second = 0; /* Start from the last sent position */ startptr = sentPtr; /* * Streaming the current timeline on a primary. * * Attempt to send all data that's already been written out and fsync'd to * disk. We cannot go further than what's been written out given the * current implementation of WALRead(). And in any case it's unsafe to * send WAL that is not securely down to disk on the primary: if the * primary subsequently crashes and restarts, standbys must not have * applied any WAL that got lost on the primary. 
*/ #if PG_VERSION_NUM >= 150000 endptr = GetFlushRecPtr(NULL); #else endptr = GetFlushRecPtr(); #endif /* * Record the current system time as an approximation of the time at which * this WAL location was written for the purposes of lag tracking. * * In theory we could make XLogFlush() record a time in shmem whenever WAL * is flushed and we could get that time as well as the LSN when we call * GetFlushRecPtr() above (and likewise for the cascading standby * equivalent), but rather than putting any new code into the hot WAL path * it seems good enough to capture the time here. We should reach this * after XLogFlush() runs WalSndWakeupProcessRequests(), and although that * may take some time, we read the WAL flush pointer and take the time * very close to together here so that we'll get a later position if it is * still moving. * * Because LagTrackerWrite ignores samples when the LSN hasn't advanced, * this gives us a cheap approximation for the WAL flush time for this * LSN. * * Note that the LSN is not necessarily the LSN for the data contained in * the present message; it's the end of the WAL, which might be further * ahead. All the lag tracking machinery cares about is finding out when * that arbitrary LSN is eventually reported as written, flushed and * applied, so that it can measure the elapsed time. */ now = GetCurrentTimestamp(); LagTrackerWrite(endptr, now); /* Do we have any work to do? 
*/ Assert(startptr <= endptr); if (endptr <= startptr) return; /* BEGIN_HADRON */ state = GetWalpropShmemState(); effective_max_wal_bytes_per_second = pg_atomic_read_u32(&state->wal_rate_limiter.effective_max_wal_bytes_per_second); if (effective_max_wal_bytes_per_second != -1 && state != NULL) { struct WalRateLimiter *limiter = &state->wal_rate_limiter; uint64 batch_end_time = pg_atomic_read_u64(&limiter->batch_end_time_us); if ( now >= batch_end_time ) { // Reset the rate limiter to start a new batch limiter->sent_bytes = 0; pg_atomic_write_u32(&limiter->should_limit, false); pg_atomic_write_u64(&limiter->batch_start_time_us, now); /* tentatively assign the batch end time as 1s from now. This could result in one of the following cases: 1. If sent_bytes does not reach effective_max_wal_bytes_per_second in 1s, then we will reset the current batch and clear sent_bytes. No throttling happens. 2. Otherwise, we will recompute the end time (below) based on how many bytes are actually written, and throttle PG until the batch end time. 
*/ pg_atomic_write_u64(&limiter->batch_end_time_us, now + USECS_PER_SEC); } limiter->sent_bytes += (endptr - startptr); if (limiter->sent_bytes > effective_max_wal_bytes_per_second) { uint64_t batch_start_time = pg_atomic_read_u64(&limiter->batch_start_time_us); uint64 throttle_usecs = USECS_PER_SEC * limiter->sent_bytes / Max(effective_max_wal_bytes_per_second, 1); if (throttle_usecs > kRateLimitMaxBatchUSecs){ elog(LOG, "throttle_usecs %lu is too large, limiting to %lu", throttle_usecs, kRateLimitMaxBatchUSecs); throttle_usecs = kRateLimitMaxBatchUSecs; } pg_atomic_write_u32(&limiter->should_limit, true); pg_atomic_write_u64(&limiter->batch_end_time_us, batch_start_time + throttle_usecs); } } /* END_HADRON */ WalProposerBroadcast(wp, startptr, endptr); sentPtr = endptr; /* Update shared memory status */ { WalSnd *walsnd = MyWalSnd; SpinLockAcquire(&walsnd->mutex); walsnd->sentPtr = sentPtr; SpinLockRelease(&walsnd->mutex); } /* Report progress of XLOG streaming in PS display */ if (update_process_title) { char activitymsg[50]; snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", LSN_FORMAT_ARGS(sentPtr)); set_ps_display(activitymsg); } } /* Used to download WAL before basebackup for walproposer/logical walsenders. No longer used, replaced by neon_walreader; but callback still exists because simulation tests use it. 
*/ static bool WalProposerRecovery(WalProposer *wp, Safekeeper *sk) { return true; } static void walprop_pg_wal_reader_allocate(Safekeeper *sk) { char log_prefix[64]; snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port); Assert(!sk->xlogreader); sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propTermStartLsn, log_prefix, sk->wp->localTimeLineID); if (sk->xlogreader == NULL) wpg_log(FATAL, "failed to allocate xlog reader"); } static NeonWALReadResult walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg) { NeonWALReadResult res; res = NeonWALRead(sk->xlogreader, buf, startptr, count, sk->wp->localTimeLineID); if (res == NEON_WALREAD_SUCCESS) { /* * If we have the socket subscribed, but walreader doesn't need any * events, it must mean that remote connection just closed hoping to * do next read locally. Remove the socket then. It is important to do * as otherwise next read might open another connection and we won't * be able to distinguish whether we have correct socket added in wait * event set. 
*/ if (NeonWALReaderEvents(sk->xlogreader) == 0) rm_safekeeper_event_set(sk, false); } else if (res == NEON_WALREAD_ERROR) { *errmsg = NeonWALReaderErrMsg(sk->xlogreader); } return res; } static uint32 walprop_pg_wal_reader_events(Safekeeper *sk) { return NeonWALReaderEvents(sk->xlogreader); } static WaitEventSet *waitEvents; static void walprop_pg_free_event_set(WalProposer *wp) { if (waitEvents) { FreeWaitEventSet(waitEvents); waitEvents = NULL; } for (int i = 0; i < wp->n_safekeepers; i++) { wp->safekeeper[i].eventPos = -1; wp->safekeeper[i].nwrEventPos = -1; wp->safekeeper[i].nwrConnEstablished = false; } } static void walprop_pg_init_event_set(WalProposer *wp) { if (waitEvents) wpg_log(FATAL, "double-initialization of event set"); /* for each sk, we have socket plus potentially socket for neon walreader */ #if PG_MAJORVERSION_NUM >= 17 waitEvents = CreateWaitEventSet(NULL, 2 + 2 * wp->n_safekeepers); #else waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers); #endif AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL); for (int i = 0; i < wp->n_safekeepers; i++) { wp->safekeeper[i].eventPos = -1; wp->safekeeper[i].nwrEventPos = -1; wp->safekeeper[i].nwrConnEstablished = false; } } /* add safekeeper socket to wait event set */ static void walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events) { Assert(sk->eventPos == -1); sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk); } /* add neon wal reader socket to wait event set */ static void add_nwr_event_set(Safekeeper *sk, uint32 events) { Assert(sk->nwrEventPos == -1); sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk); sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader); wpg_log(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events); } static void 
walprop_pg_update_event_set(Safekeeper *sk, uint32 events) { /* eventPos = -1 when we don't have an event */ Assert(sk->eventPos != -1); ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL); } /* * Update neon_walreader event. * Can be called when nwr socket doesn't exist, does nothing in this case. */ static void update_nwr_event_set(Safekeeper *sk, uint32 events) { /* eventPos = -1 when we don't have an event */ if (sk->nwrEventPos != -1) ModifyWaitEvent(waitEvents, sk->nwrEventPos, events, NULL); } static void walprop_pg_active_state_update_event_set(Safekeeper *sk) { uint32 sk_events; uint32 nwr_events; Assert(sk->state == SS_ACTIVE); SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); /* * If we need to wait for neon_walreader, ensure we have up to date socket * in the wait event set. */ if (sk->active_state == SS_ACTIVE_READ_WAL) { /* * If conn is established and socket is thus stable, update the event * directly; otherwise re-add it. */ if (sk->nwrConnEstablished) { Assert(sk->nwrEventPos != -1); update_nwr_event_set(sk, nwr_events); } else { rm_safekeeper_event_set(sk, false); add_nwr_event_set(sk, nwr_events); } } else { /* * Hack: we should always set 0 here, but for random reasons * WaitEventSet (WaitEventAdjustEpoll) asserts that there is at least * some event. Since there is also no way to remove socket except * reconstructing the whole set, SafekeeperStateDesiredEvents instead * gives WL_SOCKET_CLOSED if socket exists. We never expect it to * trigger. * * On PG 14 which doesn't have WL_SOCKET_CLOSED resort to event * removal. */ #if PG_VERSION_NUM >= 150000 Assert(nwr_events == WL_SOCKET_CLOSED || nwr_events == 0); update_nwr_event_set(sk, WL_SOCKET_CLOSED); #else /* pg 14 */ rm_safekeeper_event_set(sk, false); #endif } walprop_pg_update_event_set(sk, sk_events); } static void walprop_pg_rm_safekeeper_event_set(Safekeeper *to_remove) { rm_safekeeper_event_set(to_remove, true); } /* * A hacky way to remove single event from the event set. 
Can be called if event
 * doesn't exist, does nothing in this case.
 *
 * Note: Internally, this completely reconstructs the event set. It should be
 * avoided if possible.
 *
 * If is_sk is true, socket of connection to safekeeper is removed; otherwise
 * socket of neon_walreader.
 */
static void
rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk)
{
	WalProposer *wp = to_remove->wp;

	wpg_log(DEBUG5, "sk %s:%s: removing event, is_sk %d",
			to_remove->host, to_remove->port, is_sk);

	/*
	 * Shortpath for exiting if have nothing to do. We never call this
	 * function with safekeeper socket not existing, but do that with neon
	 * walreader socket.
	 */
	if ((is_sk && to_remove->eventPos == -1) ||
		(!is_sk && to_remove->nwrEventPos == -1))
	{
		return;
	}

	/* Remove the existing event set, assign sk->eventPos = -1 */
	walprop_pg_free_event_set(wp);

	/* Re-initialize it without adding any safekeeper events */
	wp->api.init_event_set(wp);

	/*
	 * loop through the existing safekeepers. If they aren't the one we're
	 * removing, and if they have a socket we can use, re-add the applicable
	 * events.
	 */
	for (int i = 0; i < wp->n_safekeepers; i++)
	{
		Safekeeper *sk = &wp->safekeeper[i];

		/*
		 * If this safekeeper isn't offline, add events for it, except for the
		 * event requested to remove.
		 */
		if (sk->state != SS_OFFLINE)
		{
			uint32		sk_events;
			uint32		nwr_events;

			SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);

			/* skip the connection socket only for (to_remove, is_sk) */
			if (sk != to_remove || !is_sk)
			{
				/* will set sk->eventPos */
				wp->api.add_safekeeper_event_set(sk, sk_events);
			}
			/* skip the walreader socket only for (to_remove, !is_sk) */
			if ((sk != to_remove || is_sk) && nwr_events)
			{
				add_nwr_event_set(sk, nwr_events);
			}
		}
	}
}

/*
 * Wait for a single event on the wait event set.
 *
 * timeout is passed through to WaitEventSetWait(). On return, *events holds
 * the fired event mask (0 if nothing was recorded) and *sk points at the
 * safekeeper whose socket fired, or is NULL for non-socket events.
 *
 * Returns 1 when a latch wakeup (real, or synthesized from the flush-pointer
 * / condition-variable checks) or a socket event is reported; otherwise
 * returns the WaitEventSetWait() result unchanged.
 */
static int
walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events)
{
	WaitEvent	event = {0};
	int			rc = 0;
	bool		late_cv_trigger = false;

	*sk = NULL;
	*events = 0;

#if PG_MAJORVERSION_NUM >= 16
	if (WalSndCtl != NULL)
		ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv);

	/*
	 * Now that we prepared the condvar, check flush ptr again -- it might
	 * have changed before we subscribed to cv so we missed the wakeup.
	 *
	 * Do that only when we're interested in new WAL: without sync-safekeepers
	 * and if election already passed.
	 */
	if (!wp->config->syncSafekeepers && wp->availableLsn != InvalidXLogRecPtr && GetFlushRecPtr(NULL) > wp->availableLsn)
	{
		ConditionVariableCancelSleep();
		ResetLatch(MyLatch);
		CheckGracefulShutdown(wp);
		/* report it to the caller as if the latch had been set */
		*events = WL_LATCH_SET;
		return 1;
	}
#endif

	/*
	 * Wait for a wait event to happen, or timeout:
	 * - Safekeeper socket can become available for READ or WRITE
	 * - Our latch got set, because
	 *   * PG15-: We got woken up by a process triggering the WalSender
	 *   * PG16+: WalSndCtl->wal_flush_cv was triggered
	 */
	rc = WaitEventSetWait(waitEvents, timeout,
						  &event, 1, WAIT_EVENT_WAL_SENDER_MAIN);
#if PG_MAJORVERSION_NUM >= 16
	if (WalSndCtl != NULL)
		late_cv_trigger = ConditionVariableCancelSleep();
#endif

	/*
	 * Process config if requested. This restarts walproposer if safekeepers
	 * list changed. Don't do that for sync-safekeepers because quite probably
	 * it (re-reading config) won't work without some effort, and
	 * sync-safekeepers should be quick to finish anyway.
	 */
	if (!wp->config->syncSafekeepers && ConfigReloadPending)
	{
		ConfigReloadPending = false;
		ProcessConfigFile(PGC_SIGHUP);
	}

	/*
	 * If wait is terminated by latch set (walsenders' latch is set on each
	 * wal flush). (no need for pm death check due to WL_EXIT_ON_PM_DEATH)
	 */
	if ((rc == 1 && (event.events & WL_LATCH_SET)) || late_cv_trigger)
	{
		/* Reset our latch */
		ResetLatch(MyLatch);
		*events = WL_LATCH_SET;
		return 1;
	}

	/*
	 * If the event contains something about the socket, it means we got an
	 * event from a safekeeper socket.
	 */
	if (rc == 1 && (event.events & WL_SOCKET_MASK))
	{
		/* user_data is the Safekeeper registered with the socket */
		*sk = (Safekeeper *) event.user_data;
		*events = event.events;
		return 1;
	}

	/* XXX: Can we have non-timeout event here? */
	*events = event.events;
	return rc;
}

/*
 * Finish sync-safekeepers: print the given LSN to stdout in %X/%X form and
 * exit the process with status 0. Does not return.
 */
static void
__attribute__((noreturn))
walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn)
{
	fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(lsn));
	exit(0);
}

/*
 * Like vanilla walsender, on sigusr2 send all remaining WAL and exit.
 *
 * Note that unlike sync-safekeepers waiting here is not reliable: we
 * don't check that majority of safekeepers received and persisted
 * commit_lsn -- only that walproposer reached it (which immediately
 * broadcasts new value). Doing that without incurring redundant control
 * file syncing would need wp -> sk protocol change. OTOH unlike
 * sync-safekeepers which must bump commit_lsn or basebackup will fail,
 * this catchup is important only for tests where safekeepers/network
 * don't crash on their own.
*/
static void
CheckGracefulShutdown(WalProposer *wp)
{
	/* Nothing to do unless the SIGUSR2 flag has been raised. */
	if (!got_SIGUSR2)
		return;

	/* Announce the range still to be sent, but only the first time through. */
	if (!reported_sigusr2)
	{
		XLogRecPtr	target = walprop_pg_get_flush_rec_ptr(wp);

		wpg_log(LOG, "walproposer will send and wait for remaining WAL between %X/%X and %X/%X",
				LSN_FORMAT_ARGS(wp->commitLsn), LSN_FORMAT_ARGS(target));
		reported_sigusr2 = true;
	}

	/* Keep running while commitLsn is still behind the current flush pointer. */
	if (wp->commitLsn < walprop_pg_get_flush_rec_ptr(wp))
		return;

	wpg_log(LOG, "walproposer sent all WAL up to %X/%X, exiting",
			LSN_FORMAT_ARGS(wp->commitLsn));
	proc_exit(0);
}

/*
 * Fold the hot standby feedback of every SS_ACTIVE safekeeper into *hs.
 *
 * For xmin and catalog_xmin independently, the result is the value that
 * precedes all other normal values reported; a field stays invalid when no
 * active safekeeper reports a normal value for it. hs->ts is taken from the
 * feedback that most recently replaced either field (0 if none did).
 */
static void
CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
{
	int			idx;

	hs->ts = 0;
	hs->xmin = InvalidFullTransactionId;
	hs->catalog_xmin = InvalidFullTransactionId;

	for (idx = 0; idx < wp->n_safekeepers; idx++)
	{
		Safekeeper *cur = &wp->safekeeper[idx];
		HotStandbyFeedback *fb;

		/* only active safekeepers contribute */
		if (cur->state != SS_ACTIVE)
			continue;

		fb = &cur->appendResponse.hs;

		if (FullTransactionIdIsNormal(fb->xmin))
		{
			if (!FullTransactionIdIsValid(hs->xmin) ||
				FullTransactionIdPrecedes(fb->xmin, hs->xmin))
			{
				hs->xmin = fb->xmin;
				hs->ts = fb->ts;
			}
		}

		if (FullTransactionIdIsNormal(fb->catalog_xmin))
		{
			if (!FullTransactionIdIsValid(hs->catalog_xmin) ||
				FullTransactionIdPrecedes(fb->catalog_xmin, hs->catalog_xmin))
			{
				hs->catalog_xmin = fb->catalog_xmin;
				hs->ts = fb->ts;
			}
		}
	}
}

/*
 * Based on commitLsn and safekeeper responses including pageserver feedback,
 * 1) Propagate cluster size received from ps to ensure the limit.
 * 2) Propagate pageserver LSN positions to ensure backpressure limits.
 * 3) Advance walproposer slot to commitLsn (releasing WAL & waking up waiters).
 * 4) Propagate hot standby feedback.
 *
 * None of that is functional in sync-safekeepers.
*/ static void walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) { HotStandbyFeedback hsFeedback; bool needToAdvanceSlot = false; if (wp->config->syncSafekeepers) return; /* handle fresh ps_feedback */ if (sk->appendResponse.ps_feedback.present) { shardno_t num_shards = get_num_shards(); // During shard split, we receive ps_feedback from child shards before // the split commits and our shard map GUC has been updated. We must // filter out such feedback here because record_pageserver_feedback() // doesn't do it. // // NB: what we would actually want to happen is that we only receive // ps_feedback from the parent shards when the split is committed, then // apply the split to our set of tracked feedback and from here on only // receive ps_feedback from child shards. This filter condition doesn't // do that: if we split from N parent to 2N child shards, the first N // child shards' feedback messages will pass this condition, even before // the split is committed. That's a bit sloppy, but OK for now. if (sk->appendResponse.ps_feedback.shard_number < num_shards) { PageserverFeedback min_feedback = record_pageserver_feedback(&sk->appendResponse.ps_feedback, num_shards); /* Only one main shard sends non-zero currentClusterSize */ if (sk->appendResponse.ps_feedback.currentClusterSize > 0) SetNeonCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize); if (min_feedback.disk_consistent_lsn != standby_apply_lsn) { standby_apply_lsn = min_feedback.disk_consistent_lsn; needToAdvanceSlot = true; } } else { // HADRON elog(DEBUG2, "Ignoring pageserver feedback for unknown shard %d (current shard number %d)", sk->appendResponse.ps_feedback.shard_number, num_shards); } } if (wp->commitLsn > standby_flush_lsn) { standby_flush_lsn = wp->commitLsn; needToAdvanceSlot = true; } if (needToAdvanceSlot) { /* * Advance the replication slot to commitLsn. WAL before it is * hardened and will be fetched from one of safekeepers by * neon_walreader if needed. 
* * Also wakes up syncrep waiters. */ ProcessStandbyReply( /* write_lsn - This is what durably stored in safekeepers quorum. */ standby_flush_lsn, /* flush_lsn - This is what durably stored in safekeepers quorum. */ standby_flush_lsn, /* * apply_lsn - This is what processed and durably saved at* * pageserver. */ standby_apply_lsn, walprop_pg_get_current_timestamp(wp), false); } CombineHotStanbyFeedbacks(&hsFeedback, wp); if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0) { FullTransactionId xmin = hsFeedback.xmin; FullTransactionId catalog_xmin = hsFeedback.catalog_xmin; FullTransactionId next_xid = ReadNextFullTransactionId(); /* * Page server is updating nextXid in checkpoint each 1024 * transactions, so feedback xmin can be actually larger then nextXid * and function TransactionIdInRecentPast return false in this case, * preventing update of slot's xmin. */ if (FullTransactionIdPrecedes(next_xid, xmin)) xmin = next_xid; if (FullTransactionIdPrecedes(next_xid, catalog_xmin)) catalog_xmin = next_xid; agg_hs_feedback = hsFeedback; elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin)); ProcessStandbyHSFeedback(hsFeedback.ts, XidFromFullTransactionId(xmin), EpochFromFullTransactionId(xmin), XidFromFullTransactionId(catalog_xmin), EpochFromFullTransactionId(catalog_xmin)); } CheckGracefulShutdown(wp); } static XLogRecPtr walprop_pg_get_redo_start_lsn(WalProposer *wp) { return GetRedoStartLsn(); } static bool walprop_pg_strong_random(WalProposer *wp, void *buf, size_t len) { return pg_strong_random(buf, len); } static void walprop_pg_log_internal(WalProposer *wp, int level, const char *line) { elog(FATAL, "unexpected log_internal message at level %d: %s", level, line); } void SetNeonCurrentClusterSize(uint64 size) { pg_atomic_write_u64(&walprop_shared->currentClusterSize, size); } uint64 GetNeonCurrentClusterSize(void) { return 
pg_atomic_read_u64(&walprop_shared->currentClusterSize); } uint64 GetNeonCurrentClusterSize(void); /* BEGIN_HADRON */ static void walprop_pg_reset_safekeeper_statuses_for_metrics(WalProposer *wp, uint32 num_safekeepers) { WalproposerShmemState* shmem = wp->api.get_shmem_state(wp); SpinLockAcquire(&shmem->mutex); shmem->num_safekeepers = num_safekeepers; memset(shmem->safekeeper_status, 0, sizeof(shmem->safekeeper_status)); SpinLockRelease(&shmem->mutex); } static void walprop_pg_update_safekeeper_status_for_metrics(WalProposer *wp, uint32 sk_index, uint8 status) { WalproposerShmemState* shmem = wp->api.get_shmem_state(wp); Assert(sk_index < MAX_SAFEKEEPERS); SpinLockAcquire(&shmem->mutex); shmem->safekeeper_status[sk_index] = status; SpinLockRelease(&shmem->mutex); } /* END_HADRON */ static const walproposer_api walprop_pg = { .get_shmem_state = walprop_pg_get_shmem_state, .start_streaming = walprop_pg_start_streaming, .get_flush_rec_ptr = walprop_pg_get_flush_rec_ptr, .update_donor = walprop_pg_update_donor, .get_current_timestamp = walprop_pg_get_current_timestamp, .conn_error_message = walprop_error_message, .conn_status = walprop_status, .conn_connect_start = walprop_connect_start, .conn_connect_poll = walprop_connect_poll, .conn_send_query = walprop_send_query, .conn_get_query_result = walprop_get_query_result, .conn_flush = walprop_flush, .conn_finish = walprop_finish, .conn_async_read = walprop_async_read, .conn_async_write = walprop_async_write, .conn_blocking_write = walprop_blocking_write, .recovery_download = WalProposerRecovery, .wal_reader_allocate = walprop_pg_wal_reader_allocate, .wal_read = walprop_pg_wal_read, .wal_reader_events = walprop_pg_wal_reader_events, .init_event_set = walprop_pg_init_event_set, .update_event_set = walprop_pg_update_event_set, .active_state_update_event_set = walprop_pg_active_state_update_event_set, .add_safekeeper_event_set = walprop_pg_add_safekeeper_event_set, .rm_safekeeper_event_set = 
walprop_pg_rm_safekeeper_event_set, .wait_event_set = walprop_pg_wait_event_set, .strong_random = walprop_pg_strong_random, .get_redo_start_lsn = walprop_pg_get_redo_start_lsn, .finish_sync_safekeepers = walprop_pg_finish_sync_safekeepers, .process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback, .log_internal = walprop_pg_log_internal, .reset_safekeeper_statuses_for_metrics = walprop_pg_reset_safekeeper_statuses_for_metrics, .update_safekeeper_status_for_metrics = walprop_pg_update_safekeeper_status_for_metrics, }; ================================================ FILE: pgxn/neon/walsender_hooks.c ================================================ /*------------------------------------------------------------------------- * * walsender_hooks.c * * Implements XLogReaderRoutine in terms of NeonWALReader. Allows for * fetching WAL from safekeepers, which normal xlogreader can't do. * *------------------------------------------------------------------------- */ #include "walsender_hooks.h" #include "postgres.h" #include "fmgr.h" #include "access/xlogdefs.h" #include "replication/walsender.h" #include "access/xlog.h" #include "access/xlog_internal.h" #include "access/xlogreader.h" #include "miscadmin.h" #include "utils/wait_event.h" #include "utils/guc.h" #include "postmaster/interrupt.h" #include "neon.h" #include "neon_walreader.h" #include "walproposer.h" static NeonWALReader *wal_reader = NULL; struct WalSnd; extern struct WalSnd *MyWalSnd; extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); extern bool GetDonorShmem(XLogRecPtr *donor_lsn); extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); bool disable_wal_prev_lsn_checks = false; static XLogRecPtr NeonWALReadWaitForWAL(XLogRecPtr loc) { while (!NeonWALReaderUpdateDonor(wal_reader)) { pg_usleep(1000); CHECK_FOR_INTERRUPTS(); } // Walsender sends keepalives and stuff, so better use its normal wait if (MyWalSnd != NULL) return WalSndWaitForWal(loc); for (;;) { XLogRecPtr flush_ptr; if 
(!RecoveryInProgress()) #if PG_VERSION_NUM >= 150000 flush_ptr = GetFlushRecPtr(NULL); #else flush_ptr = GetFlushRecPtr(); #endif else flush_ptr = GetXLogReplayRecPtr(NULL); if (loc <= flush_ptr) return flush_ptr; CHECK_FOR_INTERRUPTS(); pg_usleep(1000); } } static int NeonWALPageRead( XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf) { XLogRecPtr rem_lsn; /* Wait for flush pointer to advance past our request */ XLogRecPtr flushptr = NeonWALReadWaitForWAL(targetPagePtr + reqLen); int count; if (flushptr < targetPagePtr + reqLen) return -1; xlogreader->skip_lsn_checks = disable_wal_prev_lsn_checks; /* Read at most XLOG_BLCKSZ bytes */ if (targetPagePtr + XLOG_BLCKSZ <= flushptr) count = XLOG_BLCKSZ; else count = flushptr - targetPagePtr; /* * Sometimes walsender requests non-monotonic sequences of WAL. If that's * the case, we have to reset streaming from remote at the correct * position. For example, walsender may try to verify the segment header * when trying to read in the middle of it. */ rem_lsn = NeonWALReaderGetRemLsn(wal_reader); if (rem_lsn != InvalidXLogRecPtr && targetPagePtr != rem_lsn) { NeonWALReaderResetRemote(wal_reader); } for (;;) { NeonWALReadResult res = NeonWALRead( wal_reader, readBuf, targetPagePtr, count, NeonWALReaderLocalActiveTimeLineID(wal_reader)); if (res == NEON_WALREAD_SUCCESS) { /* * Setting ws_tli is required by the XLogReaderRoutine, it is used * for segment name generation in error reports. * * ReadPageInternal updates ws_segno after calling cb on its own * and XLogReaderRoutine description doesn't require it, but * WALRead sets, let's follow it. */ xlogreader->seg.ws_tli = NeonWALReaderGetSegment(wal_reader)->ws_tli; xlogreader->seg.ws_segno = NeonWALReaderGetSegment(wal_reader)->ws_segno; /* * ws_file doesn't exist in case of remote read, and isn't used by * xlogreader except by WALRead on which we don't rely anyway. 
*/ return count; } if (res == NEON_WALREAD_ERROR) { elog(ERROR, "[walsender] Failed to read WAL (req_lsn=%X/%X, len=%d): %s", LSN_FORMAT_ARGS(targetPagePtr), reqLen, NeonWALReaderErrMsg(wal_reader)); return -1; } /* * Res is WOULDBLOCK, so we wait on the socket, recreating event set * if necessary */ { pgsocket sock = NeonWALReaderSocket(wal_reader); uint32_t reader_events = NeonWALReaderEvents(wal_reader); long timeout_ms = 1000; ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); if (ConfigReloadPending) { ConfigReloadPending = false; ProcessConfigFile(PGC_SIGHUP); } WaitLatchOrSocket( MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | reader_events, sock, timeout_ms, WAIT_EVENT_NEON_WAL_DL); } } } static void NeonWALReadSegmentOpen(XLogReaderState *xlogreader, XLogSegNo nextSegNo, TimeLineID *tli_p) { neon_wal_segment_open(wal_reader, nextSegNo, tli_p); xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file; } static void NeonWALReadSegmentClose(XLogReaderState *xlogreader) { neon_wal_segment_close(wal_reader); xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file; } void NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr) { /* * If safekeepers are not configured, assume we don't need neon_walreader, * i.e. running neon fork locally. 
*/ if (wal_acceptors_list[0] == '\0') return; if (!wal_reader) { XLogRecPtr basebackupLsn = GetRedoStartLsn(); /* should never happen */ if (basebackupLsn == 0) { elog(ERROR, "unable to start walsender when basebackupLsn is 0"); } wal_reader = NeonWALReaderAllocate(wal_segment_size, basebackupLsn, "[walsender] ", 1); } xlr->page_read = NeonWALPageRead; xlr->segment_open = NeonWALReadSegmentOpen; xlr->segment_close = NeonWALReadSegmentClose; } ================================================ FILE: pgxn/neon/walsender_hooks.h ================================================ #ifndef __WALSENDER_HOOKS_H__ #define __WALSENDER_HOOKS_H__ struct XLogReaderRoutine; void NeonOnDemandXLogReaderRoutines(struct XLogReaderRoutine *xlr); #endif ================================================ FILE: pgxn/neon_rmgr/Makefile ================================================ # pgxs/neon/Makefile MODULE_big = neon_rmgr OBJS = \ $(WIN32RES) \ neon_rmgr.o \ neon_rmgr_decode.o \ neon_rmgr_desc.o EXTENSION = neon_rmgr DATA = PGFILEDESC = "Neon WAL Resource Manager - custom WAL records used to make Neon work (since PG 16)" PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) ================================================ FILE: pgxn/neon_rmgr/neon_rmgr.c ================================================ #include "postgres.h" #include "fmgr.h" #if PG_MAJORVERSION_NUM >= 16 #include "access/bufmask.h" #include "access/heapam_xlog.h" #include "access/htup_details.h" #include "access/neon_xlog.h" #include "access/rmgr.h" #include "access/visibilitymap.h" #include "access/xlog_internal.h" #include "access/xlogutils.h" #include "miscadmin.h" #include "storage/buf.h" #include "storage/bufmgr.h" #include "storage/bufpage.h" #include "storage/freespace.h" #include "neon_rmgr.h" PG_MODULE_MAGIC; void _PG_init(void); static void neon_rm_redo(XLogReaderState *record); static void neon_rm_startup(void); static void neon_rm_cleanup(void); static void neon_rm_mask(char *pagedata, 
BlockNumber blkno); static void redo_neon_heap_insert(XLogReaderState *record); static void redo_neon_heap_delete(XLogReaderState *record); static void redo_neon_heap_update(XLogReaderState *record, bool hot_update); static void redo_neon_heap_lock(XLogReaderState *record); static void redo_neon_heap_multi_insert(XLogReaderState *record); const static RmgrData NeonRmgr = { .rm_name = "neon", .rm_redo = neon_rm_redo, .rm_desc = neon_rm_desc, .rm_identify = neon_rm_identify, .rm_startup = neon_rm_startup, .rm_cleanup = neon_rm_cleanup, .rm_mask = neon_rm_mask, .rm_decode = neon_rm_decode, }; void _PG_init(void) { if (!process_shared_preload_libraries_in_progress) return; RegisterCustomRmgr(RM_NEON_ID, &NeonRmgr); } static void neon_rm_redo(XLogReaderState *record) { uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; switch (info & XLOG_NEON_OPMASK) { case XLOG_NEON_HEAP_INSERT: redo_neon_heap_insert(record); break; case XLOG_NEON_HEAP_DELETE: redo_neon_heap_delete(record); break; case XLOG_NEON_HEAP_UPDATE: redo_neon_heap_update(record, false); break; case XLOG_NEON_HEAP_HOT_UPDATE: redo_neon_heap_update(record, true); break; case XLOG_NEON_HEAP_LOCK: redo_neon_heap_lock(record); break; case XLOG_NEON_HEAP_MULTI_INSERT: redo_neon_heap_multi_insert(record); break; default: elog(PANIC, "neon_rm_redo: unknown op code %u", info); } } static void neon_rm_startup(void) { /* nothing to do here */ } static void neon_rm_cleanup(void) { /* nothing to do here */ } static void neon_rm_mask(char *pagedata, BlockNumber blkno) { Page page = (Page) pagedata; OffsetNumber off; mask_page_lsn_and_checksum(page); mask_page_hint_bits(page); mask_unused_space(page); for (off = 1; off <= PageGetMaxOffsetNumber(page); off++) { ItemId iid = PageGetItemId(page, off); char *page_item; page_item = (char *) (page + ItemIdGetOffset(iid)); if (ItemIdIsNormal(iid)) { HeapTupleHeader page_htup = (HeapTupleHeader) page_item; /* * If xmin of a tuple is not yet frozen, we should ignore * differences 
in hint bits, since they can be set without * emitting WAL. */ if (!HeapTupleHeaderXminFrozen(page_htup)) page_htup->t_infomask &= ~HEAP_XACT_MASK; else { /* Still we need to mask xmax hint bits. */ page_htup->t_infomask &= ~HEAP_XMAX_INVALID; page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED; } /* * During replay, we set Command Id to FirstCommandId. Hence, mask * it. See heap_xlog_insert() for details. */ page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER; /* * For a speculative tuple, heap_insert() does not set ctid in the * caller-passed heap tuple itself, leaving the ctid field to * contain a speculative token value - a per-backend monotonically * increasing identifier. Besides, it does not WAL-log ctid under * any circumstances. * * During redo, heap_xlog_insert() sets t_ctid to current block * number and self offset number. It doesn't care about any * speculative insertions on the primary. Hence, we set t_ctid to * current block number and self offset number to ignore any * inconsistency. */ if (HeapTupleHeaderIsSpeculative(page_htup)) ItemPointerSet(&page_htup->t_ctid, blkno, off); /* * NB: Not ignoring ctid changes due to the tuple having moved * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's * important information that needs to be in-sync between primary * and standby, and thus is WAL logged. */ } /* * Ignore any padding bytes after the tuple, when the length of the * item is not MAXALIGNed. */ if (ItemIdHasStorage(iid)) { int len = ItemIdGetLength(iid); int padlen = MAXALIGN(len) - len; if (padlen > 0) memset(page_item + len, MASK_MARKER, padlen); } } } /* * COPIED FROM heapam.c * Given an "infobits" field from an XLog record, set the correct bits in the * given infomask and infomask2 for the tuple touched by the record. * * (This is the reverse of compute_infobits). 
*/
static void
fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
{
	/* start from a clean slate for every bit this function may set */
	*infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
				   HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK | HEAP_COMBOCID);
	*infomask2 &= ~HEAP_KEYS_UPDATED;

	if (infobits & XLHL_XMAX_IS_MULTI)
		*infomask |= HEAP_XMAX_IS_MULTI;
	if (infobits & XLHL_XMAX_LOCK_ONLY)
		*infomask |= HEAP_XMAX_LOCK_ONLY;
	if (infobits & XLHL_XMAX_EXCL_LOCK)
		*infomask |= HEAP_XMAX_EXCL_LOCK;
	if (infobits & XLHL_COMBOCID)
		*infomask |= HEAP_COMBOCID;
	/* note HEAP_XMAX_SHR_LOCK isn't considered here */
	if (infobits & XLHL_XMAX_KEYSHR_LOCK)
		*infomask |= HEAP_XMAX_KEYSHR_LOCK;

	if (infobits & XLHL_KEYS_UPDATED)
		*infomask2 |= HEAP_KEYS_UPDATED;
}

/*
 * Replay a Neon heap insert record.
 *
 * Clears the visibility map bits when the record says so, then (if the heap
 * page needs redo) rebuilds the tuple from the record's block data and adds
 * it at xlrec->offnum. Finally the free space map is updated when the page
 * is left with less than 20% free space.
 */
static void
redo_neon_heap_insert(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_neon_heap_insert *xlrec = (xl_neon_heap_insert *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	/* scratch space large enough to assemble any heap tuple */
	union
	{
		HeapTupleHeaderData hdr;
		char		data[MaxHeapTupleSize];
	}			tbuf;
	HeapTupleHeader htup;
	xl_neon_heap_header xlhdr;
	uint32		newlen;
	Size		freespace = 0;
	RelFileLocator target_locator;
	BlockNumber blkno;
	ItemPointerData target_tid;
	XLogRedoAction action;

	/* the inserted tuple's TID is (block of block ref 0, xlrec->offnum) */
	XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno);
	ItemPointerSetBlockNumber(&target_tid, blkno);
	ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);

	/*
	 * The visibility map may need to be fixed even if the heap page is
	 * already up-to-date.
	 */
	if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
	{
		Relation	reln = CreateFakeRelcacheEntry(target_locator);
		Buffer		vmbuffer = InvalidBuffer;

		visibilitymap_pin(reln, blkno, &vmbuffer);
		visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
		ReleaseBuffer(vmbuffer);
		FreeFakeRelcacheEntry(reln);
	}

	/*
	 * If we inserted the first and only tuple on the page, re-initialize the
	 * page from scratch.
	 */
	if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
	{
		buffer = XLogInitBufferForRedo(record, 0);
		page = BufferGetPage(buffer);
		PageInit(page, BufferGetPageSize(buffer), 0);
		action = BLK_NEEDS_REDO;
	}
	else
		action = XLogReadBufferForRedo(record, 0, &buffer);
	if (action == BLK_NEEDS_REDO)
	{
		Size		datalen;
		char	   *data;

		page = BufferGetPage(buffer);

		/* the new offset may extend the line pointer array by at most one */
		if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum)
			elog(PANIC, "neon_rm_redo: invalid max offset number");

		/* block data layout: xl_neon_heap_header followed by tuple contents */
		data = XLogRecGetBlockData(record, 0, &datalen);

		newlen = datalen - SizeOfNeonHeapHeader;
		Assert(datalen > SizeOfNeonHeapHeader && newlen <= MaxHeapTupleSize);
		memcpy((char *) &xlhdr, data, SizeOfNeonHeapHeader);
		data += SizeOfNeonHeapHeader;

		htup = &tbuf.hdr;
		MemSet((char *) htup, 0, SizeofHeapTupleHeader);
		/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
		memcpy((char *) htup + SizeofHeapTupleHeader,
			   data,
			   newlen);
		newlen += SizeofHeapTupleHeader;
		/* fill in the header fields carried by the record */
		htup->t_infomask2 = xlhdr.t_infomask2;
		htup->t_infomask = xlhdr.t_infomask;
		htup->t_hoff = xlhdr.t_hoff;
		HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
		htup->t_choice.t_heap.t_field3.t_cid = xlhdr.t_cid;
		htup->t_ctid = target_tid;

		if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum,
						true, true) == InvalidOffsetNumber)
			elog(PANIC, "neon_rm_redo: failed to add tuple");

		freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */

		PageSetLSN(page, lsn);

		if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
			PageClearAllVisible(page);

		/* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
		if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
			PageSetAllVisible(page);

		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);

	/*
	 * If the page is running low on free space, update the FSM as well.
	 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
	 * better than that without knowing the fill-factor for the table.
	 *
	 * XXX: Don't do this if the page was restored from full page image. We
	 * don't bother to update the FSM in that case, it doesn't need to be
	 * totally accurate anyway.
	 */
	if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
		XLogRecordPageWithFreeSpace(target_locator, blkno, freespace);
}

/*
 * Replay a Neon heap delete record.
 *
 * Clears the visibility map bits when the record says so, then (if the heap
 * page needs redo) rewrites the header of the tuple at xlrec->offnum. A
 * missing or non-normal line pointer is a PANIC.
 */
static void
redo_neon_heap_delete(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_neon_heap_delete *xlrec = (xl_neon_heap_delete *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	ItemId		lp = NULL;
	HeapTupleHeader htup;
	BlockNumber blkno;
	RelFileLocator target_locator;
	ItemPointerData target_tid;

	XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno);
	ItemPointerSetBlockNumber(&target_tid, blkno);
	ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);

	/*
	 * The visibility map may need to be fixed even if the heap page is
	 * already up-to-date.
	 */
	if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
	{
		Relation	reln = CreateFakeRelcacheEntry(target_locator);
		Buffer		vmbuffer = InvalidBuffer;

		visibilitymap_pin(reln, blkno, &vmbuffer);
		visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
		ReleaseBuffer(vmbuffer);
		FreeFakeRelcacheEntry(reln);
	}

	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		page = BufferGetPage(buffer);

		/* lp stays NULL when offnum is beyond the page's line pointers */
		if (PageGetMaxOffsetNumber(page) >= xlrec->offnum)
			lp = PageGetItemId(page, xlrec->offnum);

		if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp))
			elog(PANIC, "neon_rm_redo: invalid lp");

		htup = (HeapTupleHeader) PageGetItem(page, lp);

		/* reset xmax-related state, then re-apply what the record carries */
		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
		HeapTupleHeaderClearHotUpdated(htup);
		fix_infomask_from_infobits(xlrec->infobits_set,
								   &htup->t_infomask, &htup->t_infomask2);
		/* XLH_DELETE_IS_SUPER records invalidate xmin instead of setting xmax */
		if (!(xlrec->flags & XLH_DELETE_IS_SUPER))
			HeapTupleHeaderSetXmax(htup, xlrec->xmax);
		else
			HeapTupleHeaderSetXmin(htup, InvalidTransactionId);
		htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid;

		/* Mark the page as a candidate for pruning */
		PageSetPrunable(page, XLogRecGetXid(record));

		if (xlrec->flags &
XLH_DELETE_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); /* Make sure t_ctid is set correctly */ if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE) HeapTupleHeaderSetMovedPartitions(htup); else htup->t_ctid = target_tid; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); } static void redo_neon_heap_update(XLogReaderState *record, bool hot_update) { XLogRecPtr lsn = record->EndRecPtr; xl_neon_heap_update *xlrec = (xl_neon_heap_update *) XLogRecGetData(record); RelFileLocator rlocator; BlockNumber oldblk; BlockNumber newblk; ItemPointerData newtid; Buffer obuffer, nbuffer; Page page; OffsetNumber offnum; ItemId lp = NULL; HeapTupleData oldtup; HeapTupleHeader htup; uint16 prefixlen = 0, suffixlen = 0; char *newp; union { HeapTupleHeaderData hdr; char data[MaxHeapTupleSize]; } tbuf; xl_neon_heap_header xlhdr; uint32 newlen; Size freespace = 0; XLogRedoAction oldaction; XLogRedoAction newaction; /* initialize to keep the compiler quiet */ oldtup.t_data = NULL; oldtup.t_len = 0; XLogRecGetBlockTag(record, 0, &rlocator, NULL, &newblk); if (XLogRecGetBlockTagExtended(record, 1, NULL, NULL, &oldblk, NULL)) { /* HOT updates are never done across pages */ Assert(!hot_update); } else oldblk = newblk; ItemPointerSet(&newtid, newblk, xlrec->new_offnum); /* * The visibility map may need to be fixed even if the heap page is * already up-to-date. */ if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) { Relation reln = CreateFakeRelcacheEntry(rlocator); Buffer vmbuffer = InvalidBuffer; visibilitymap_pin(reln, oldblk, &vmbuffer); visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS); ReleaseBuffer(vmbuffer); FreeFakeRelcacheEntry(reln); } /* * In normal operation, it is important to lock the two pages in * page-number order, to avoid possible deadlocks against other update * operations going the other way. 
However, during WAL replay there can * be no other update happening, so we don't need to worry about that. But * we *do* need to worry that we don't expose an inconsistent state to Hot * Standby queries --- so the original page can't be unlocked before we've * added the new tuple to the new page. */ /* Deal with old tuple version */ oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1, &obuffer); if (oldaction == BLK_NEEDS_REDO) { page = BufferGetPage(obuffer); offnum = xlrec->old_offnum; if (PageGetMaxOffsetNumber(page) >= offnum) lp = PageGetItemId(page, offnum); if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) elog(PANIC, "neon_rm_redo: invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); oldtup.t_data = htup; oldtup.t_len = ItemIdGetLength(lp); htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; if (hot_update) HeapTupleHeaderSetHotUpdated(htup); else HeapTupleHeaderClearHotUpdated(htup); fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, &htup->t_infomask2); HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; /* Set forward chain link in t_ctid */ htup->t_ctid = newtid; /* Mark the page as a candidate for pruning */ PageSetPrunable(page, XLogRecGetXid(record)); if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); PageSetLSN(page, lsn); MarkBufferDirty(obuffer); } /* * Read the page the new tuple goes into, if different from old. */ if (oldblk == newblk) { nbuffer = obuffer; newaction = oldaction; } else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) { nbuffer = XLogInitBufferForRedo(record, 0); page = (Page) BufferGetPage(nbuffer); PageInit(page, BufferGetPageSize(nbuffer), 0); newaction = BLK_NEEDS_REDO; } else newaction = XLogReadBufferForRedo(record, 0, &nbuffer); /* * The visibility map may need to be fixed even if the heap page is * already up-to-date. 
*/ if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) { Relation reln = CreateFakeRelcacheEntry(rlocator); Buffer vmbuffer = InvalidBuffer; visibilitymap_pin(reln, newblk, &vmbuffer); visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS); ReleaseBuffer(vmbuffer); FreeFakeRelcacheEntry(reln); } /* Deal with new tuple */ if (newaction == BLK_NEEDS_REDO) { char *recdata; char *recdata_end; Size datalen; Size tuplen; recdata = XLogRecGetBlockData(record, 0, &datalen); recdata_end = recdata + datalen; page = BufferGetPage(nbuffer); offnum = xlrec->new_offnum; if (PageGetMaxOffsetNumber(page) + 1 < offnum) elog(PANIC, "neon_rm_redo: invalid max offset number"); if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD) { Assert(newblk == oldblk); memcpy(&prefixlen, recdata, sizeof(uint16)); recdata += sizeof(uint16); } if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD) { Assert(newblk == oldblk); memcpy(&suffixlen, recdata, sizeof(uint16)); recdata += sizeof(uint16); } memcpy((char *) &xlhdr, recdata, SizeOfNeonHeapHeader); recdata += SizeOfNeonHeapHeader; tuplen = recdata_end - recdata; Assert(tuplen <= MaxHeapTupleSize); htup = &tbuf.hdr; MemSet((char *) htup, 0, SizeofHeapTupleHeader); /* * Reconstruct the new tuple using the prefix and/or suffix from the * old tuple, and the data stored in the WAL record. 
*/ newp = (char *) htup + SizeofHeapTupleHeader; if (prefixlen > 0) { int len; /* copy bitmap [+ padding] [+ oid] from WAL record */ len = xlhdr.t_hoff - SizeofHeapTupleHeader; memcpy(newp, recdata, len); recdata += len; newp += len; /* copy prefix from old tuple */ memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen); newp += prefixlen; /* copy new tuple data from WAL record */ len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader); memcpy(newp, recdata, len); recdata += len; newp += len; } else { /* * copy bitmap [+ padding] [+ oid] + data from record, all in one * go */ memcpy(newp, recdata, tuplen); recdata += tuplen; newp += tuplen; } Assert(recdata == recdata_end); /* copy suffix from old tuple */ if (suffixlen > 0) memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen); newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen; htup->t_infomask2 = xlhdr.t_infomask2; htup->t_infomask = xlhdr.t_infomask; htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); htup->t_choice.t_heap.t_field3.t_cid = xlhdr.t_cid; HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = newtid; offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "neon_rm_redo: failed to add tuple"); if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ PageSetLSN(page, lsn); MarkBufferDirty(nbuffer); } if (BufferIsValid(nbuffer) && nbuffer != obuffer) UnlockReleaseBuffer(nbuffer); if (BufferIsValid(obuffer)) UnlockReleaseBuffer(obuffer); /* * If the new page is running low on free space, update the FSM as well. * Arbitrarily, our definition of "low" is less than 20%. We can't do much * better than that without knowing the fill-factor for the table. 
* * However, don't update the FSM on HOT updates, because after crash * recovery, either the old or the new tuple will certainly be dead and * prunable. After pruning, the page will have roughly as much free space * as it did before the update, assuming the new tuple is about the same * size as the old one. * * XXX: Don't do this if the page was restored from full page image. We * don't bother to update the FSM in that case, it doesn't need to be * totally accurate anyway. */ if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5) XLogRecordPageWithFreeSpace(rlocator, newblk, freespace); } static void redo_neon_heap_lock(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_neon_heap_lock *xlrec = (xl_neon_heap_lock *) XLogRecGetData(record); Buffer buffer; Page page; OffsetNumber offnum; ItemId lp = NULL; HeapTupleHeader htup; /* * The visibility map may need to be fixed even if the heap page is * already up-to-date. */ if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED) { RelFileLocator rlocator; Buffer vmbuffer = InvalidBuffer; BlockNumber block; Relation reln; XLogRecGetBlockTag(record, 0, &rlocator, NULL, &block); reln = CreateFakeRelcacheEntry(rlocator); visibilitymap_pin(reln, block, &vmbuffer); visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN); ReleaseBuffer(vmbuffer); FreeFakeRelcacheEntry(reln); } if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { page = (Page) BufferGetPage(buffer); offnum = xlrec->offnum; if (PageGetMaxOffsetNumber(page) >= offnum) lp = PageGetItemId(page, offnum); if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) elog(PANIC, "neon_rm_redo: invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, &htup->t_infomask2); /* * Clear relevant update flags, but only if the modified infomask says * there's 
no update. */ if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask)) { HeapTupleHeaderClearHotUpdated(htup); /* Make sure there is no forward chain link in t_ctid */ ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum); } HeapTupleHeaderSetXmax(htup, xlrec->xmax); htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); } static void redo_neon_heap_multi_insert(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_neon_heap_multi_insert *xlrec; RelFileLocator rlocator; BlockNumber blkno; Buffer buffer; Page page; union { HeapTupleHeaderData hdr; char data[MaxHeapTupleSize]; } tbuf; HeapTupleHeader htup; uint32 newlen; Size freespace = 0; int i; bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0; XLogRedoAction action; /* * Insertion doesn't overwrite MVCC data, so no conflict processing is * required. */ xlrec = (xl_neon_heap_multi_insert *) XLogRecGetData(record); XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno); /* check that the mutually exclusive flags are not both set */ Assert(!((xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) && (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET))); /* * The visibility map may need to be fixed even if the heap page is * already up-to-date. 
*/ if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) { Relation reln = CreateFakeRelcacheEntry(rlocator); Buffer vmbuffer = InvalidBuffer; visibilitymap_pin(reln, blkno, &vmbuffer); visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); ReleaseBuffer(vmbuffer); FreeFakeRelcacheEntry(reln); } if (isinit) { buffer = XLogInitBufferForRedo(record, 0); page = BufferGetPage(buffer); PageInit(page, BufferGetPageSize(buffer), 0); action = BLK_NEEDS_REDO; } else action = XLogReadBufferForRedo(record, 0, &buffer); if (action == BLK_NEEDS_REDO) { char *tupdata; char *endptr; Size len; /* Tuples are stored as block data */ tupdata = XLogRecGetBlockData(record, 0, &len); endptr = tupdata + len; page = (Page) BufferGetPage(buffer); for (i = 0; i < xlrec->ntuples; i++) { OffsetNumber offnum; xl_neon_multi_insert_tuple *xlhdr; /* * If we're reinitializing the page, the tuples are stored in * order from FirstOffsetNumber. Otherwise there's an array of * offsets in the WAL record, and the tuples come after that. 
*/ if (isinit) offnum = FirstOffsetNumber + i; else offnum = xlrec->offsets[i]; if (PageGetMaxOffsetNumber(page) + 1 < offnum) elog(PANIC, "neon_rm_redo: invalid max offset number"); xlhdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(tupdata); tupdata = ((char *) xlhdr) + SizeOfNeonMultiInsertTuple; newlen = xlhdr->datalen; Assert(newlen <= MaxHeapTupleSize); htup = &tbuf.hdr; MemSet((char *) htup, 0, SizeofHeapTupleHeader); /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ memcpy((char *) htup + SizeofHeapTupleHeader, (char *) tupdata, newlen); tupdata += newlen; newlen += SizeofHeapTupleHeader; htup->t_infomask2 = xlhdr->t_infomask2; htup->t_infomask = xlhdr->t_infomask; htup->t_hoff = xlhdr->t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; ItemPointerSetBlockNumber(&htup->t_ctid, blkno); ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "neon_rm_redo: failed to add tuple"); } if (tupdata != endptr) elog(PANIC, "neon_rm_redo: total tuple length mismatch"); freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ PageSetLSN(page, lsn); if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */ if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) PageSetAllVisible(page); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* * If the page is running low on free space, update the FSM as well. * Arbitrarily, our definition of "low" is less than 20%. We can't do much * better than that without knowing the fill-factor for the table. * * XXX: Don't do this if the page was restored from full page image. We * don't bother to update the FSM in that case, it doesn't need to be * totally accurate anyway. 
*/ if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); } #else /* safeguard for older PostgreSQL versions */ PG_MODULE_MAGIC; #endif ================================================ FILE: pgxn/neon_rmgr/neon_rmgr.control ================================================ # neon_rmgr extension comment = 'Neon WAL Resource Manager - custom WAL records used to make Neon work (since PG 16)' default_version = '1.0' module_pathname = '$libdir/neon_rmgr' ================================================ FILE: pgxn/neon_rmgr/neon_rmgr.h ================================================ #ifndef NEON_RMGR_H #define NEON_RMGR_H #if PG_MAJORVERSION_NUM >= 16 #include "access/xlog_internal.h" #include "replication/decode.h" #include "replication/logical.h" extern void neon_rm_desc(StringInfo buf, XLogReaderState *record); extern void neon_rm_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); extern const char *neon_rm_identify(uint8 info); #endif #endif //NEON_RMGR_H ================================================ FILE: pgxn/neon_rmgr/neon_rmgr_decode.c ================================================ #include "postgres.h" #if PG_MAJORVERSION_NUM >= 16 #include "access/heapam_xlog.h" #include "access/neon_xlog.h" #include "replication/decode.h" #include "replication/logical.h" #include "replication/snapbuild.h" #include "neon_rmgr.h" #endif /* PG >= 16 */ #if PG_MAJORVERSION_NUM == 16 /* individual record(group)'s handlers */ static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); static void DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); static void DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); /* common function to decode tuples */ static void DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple); void neon_rm_decode(LogicalDecodingContext 
*ctx, XLogRecordBuffer *buf) { uint8 info = XLogRecGetInfo(buf->record) & XLOG_NEON_OPMASK; TransactionId xid = XLogRecGetXid(buf->record); SnapBuild *builder = ctx->snapshot_builder; ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); /* * If we don't have snapshot or we are just fast-forwarding, there is no * point in decoding data changes. */ if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || ctx->fast_forward) return; switch (info) { case XLOG_NEON_HEAP_INSERT: if (SnapBuildProcessChange(builder, xid, buf->origptr)) DecodeNeonInsert(ctx, buf); break; case XLOG_NEON_HEAP_DELETE: if (SnapBuildProcessChange(builder, xid, buf->origptr)) DecodeNeonDelete(ctx, buf); break; case XLOG_NEON_HEAP_UPDATE: case XLOG_NEON_HEAP_HOT_UPDATE: if (SnapBuildProcessChange(builder, xid, buf->origptr)) DecodeNeonUpdate(ctx, buf); break; case XLOG_NEON_HEAP_LOCK: break; case XLOG_NEON_HEAP_MULTI_INSERT: if (SnapBuildProcessChange(builder, xid, buf->origptr)) DecodeNeonMultiInsert(ctx, buf); break; default: elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info); break; } } static inline bool FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id) { if (ctx->callbacks.filter_by_origin_cb == NULL) return false; return filter_by_origin_cb_wrapper(ctx, origin_id); } /* * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs. * * Deletes can contain the new tuple. */ static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { Size datalen; char *tupledata; Size tuplelen; XLogReaderState *r = buf->record; xl_neon_heap_insert *xlrec; ReorderBufferChange *change; RelFileLocator target_locator; xlrec = (xl_neon_heap_insert *) XLogRecGetData(r); /* * Ignore insert records without new tuples (this does happen when * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL). 
*/ if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) return; /* only interested in our database */ XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); if (target_locator.dbOid != ctx->slot->data.database) return; /* output plugin doesn't look for this origin, no need to queue */ if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) return; change = ReorderBufferGetChange(ctx->reorder); if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE)) change->action = REORDER_BUFFER_CHANGE_INSERT; else change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT; change->origin_id = XLogRecGetOrigin(r); memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); tupledata = XLogRecGetBlockData(r, 0, &datalen); tuplelen = datalen - SizeOfNeonHeapHeader; change->data.tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple); change->data.tp.clear_toast_afterwards = true; ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change, xlrec->flags & XLH_INSERT_ON_TOAST_RELATION); } /* * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs. * * Deletes can possibly contain the old primary key. 
*/
/*
 * DecodeNeonDelete
 *		Queue one reorder-buffer change for an XLOG_NEON_HEAP_DELETE record.
 *
 * ctx: decoding context whose reorder buffer receives the change.
 * buf: wrapper around the WAL record being decoded.
 *
 * Nothing is queued when the record belongs to another database or when the
 * output plugin filters out the record's replication origin.
 */
static void
DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
{
	XLogReaderState *r = buf->record;
	xl_neon_heap_delete *xlrec;
	ReorderBufferChange *change;
	RelFileLocator target_locator;

	xlrec = (xl_neon_heap_delete *) XLogRecGetData(r);

	/* only interested in our database */
	XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL);
	if (target_locator.dbOid != ctx->slot->data.database)
		return;

	/* output plugin doesn't look for this origin, no need to queue */
	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
		return;

	change = ReorderBufferGetChange(ctx->reorder);

	/* a "super" delete is reported as the abort of a speculative insert */
	if (xlrec->flags & XLH_DELETE_IS_SUPER)
		change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT;
	else
		change->action = REORDER_BUFFER_CHANGE_DELETE;

	change->origin_id = XLogRecGetOrigin(r);

	memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator));

	/* old primary key stored */
	if (xlrec->flags & XLH_DELETE_CONTAINS_OLD)
	{
		/*
		 * The old key is read starting SizeOfNeonHeapDelete bytes into the
		 * main data, so the number of bytes left for it must be measured
		 * from that same offset.  Subtracting the tuple-header size here
		 * instead would hand DecodeXLogTuple() a length that does not match
		 * the pointer it is given.
		 */
		Size		datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapDelete;
		/* DecodeXLogTuple() strips the xl_neon_heap_header prefix itself */
		Size		tuplelen = datalen - SizeOfNeonHeapHeader;

		Assert(XLogRecGetDataLen(r) > (SizeOfNeonHeapDelete + SizeOfNeonHeapHeader));

		change->data.tp.oldtuple =
			ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);

		DecodeXLogTuple((char *) xlrec + SizeOfNeonHeapDelete,
						datalen, change->data.tp.oldtuple);
	}

	change->data.tp.clear_toast_afterwards = true;

	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
							 change, false);
}

/*
 * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout
 * in the record, from wal into proper tuplebufs.
 *
 * Updates can possibly contain a new tuple and the old primary key.
*/
/*
 * DecodeNeonUpdate
 *		Turn an XLOG_NEON_HEAP_UPDATE / XLOG_NEON_HEAP_HOT_UPDATE record into a
 *		single REORDER_BUFFER_CHANGE_UPDATE entry.
 *
 * ctx: decoding context whose reorder buffer receives the change.
 * buf: wrapper around the WAL record being decoded.
 *
 * The new tuple, when logged, is taken from block 0's data; the old key,
 * when logged, is taken from the main data after the fixed-size update
 * struct.
 */
static void
DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
{
	XLogReaderState *rec = buf->record;
	xl_neon_heap_update *upd = (xl_neon_heap_update *) XLogRecGetData(rec);
	RelFileLocator locator;
	ReorderBufferChange *chg;

	/* records for other databases are of no interest to this slot */
	XLogRecGetBlockTag(rec, 0, &locator, NULL, NULL);
	if (locator.dbOid != ctx->slot->data.database)
		return;

	/* skip work the output plugin would discard anyway */
	if (FilterByOrigin(ctx, XLogRecGetOrigin(rec)))
		return;

	chg = ReorderBufferGetChange(ctx->reorder);
	chg->action = REORDER_BUFFER_CHANGE_UPDATE;
	chg->origin_id = XLogRecGetOrigin(rec);
	chg->data.tp.rlocator = locator;

	if ((upd->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE) != 0)
	{
		Size		payload_len;
		char	   *payload = XLogRecGetBlockData(rec, 0, &payload_len);

		chg->data.tp.newtuple =
			ReorderBufferGetTupleBuf(ctx->reorder,
									 payload_len - SizeOfNeonHeapHeader);
		DecodeXLogTuple(payload, payload_len, chg->data.tp.newtuple);
	}

	if ((upd->flags & XLH_UPDATE_CONTAINS_OLD) != 0)
	{
		/* the bytes after the update struct are not aligned */
		char	   *payload = XLogRecGetData(rec) + SizeOfNeonHeapUpdate;
		Size		payload_len = XLogRecGetDataLen(rec) - SizeOfNeonHeapUpdate;

		chg->data.tp.oldtuple =
			ReorderBufferGetTupleBuf(ctx->reorder,
									 payload_len - SizeOfNeonHeapHeader);
		DecodeXLogTuple(payload, payload_len, chg->data.tp.oldtuple);
	}

	chg->data.tp.clear_toast_afterwards = true;

	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(rec), buf->origptr,
							 chg, false);
}

/*
 * Decode XLOG_HEAP2_MULTI_INSERT_insert record into multiple tuplebufs.
 *
 * Currently MULTI_INSERT will always contain the full tuples.
*/
/*
 * DecodeNeonMultiInsert
 *		Queue one REORDER_BUFFER_CHANGE_INSERT per tuple carried by an
 *		XLOG_NEON_HEAP_MULTI_INSERT record.
 *
 * ctx: decoding context whose reorder buffer receives the changes.
 * buf: wrapper around the WAL record being decoded.
 *
 * The tuples are read sequentially out of block 0's data; each one is a
 * short-aligned xl_neon_multi_insert_tuple header followed by 'datalen'
 * bytes of tuple body.
 */
static void
DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
{
	XLogReaderState *r = buf->record;
	xl_neon_heap_multi_insert *xlrec;
	int			i;
	char	   *data;			/* read cursor into the block data */
	char	   *tupledata;		/* start of the block data */
	Size		tuplelen;		/* total length of the block data */
	RelFileLocator rlocator;

	xlrec = (xl_neon_heap_multi_insert *) XLogRecGetData(r);

	/*
	 * Ignore insert records without new tuples.  This happens when a
	 * multi_insert is done on a catalog or on a non-persistent relation.
	 */
	if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE))
		return;

	/* only interested in our database */
	XLogRecGetBlockTag(r, 0, &rlocator, NULL, NULL);
	if (rlocator.dbOid != ctx->slot->data.database)
		return;

	/* output plugin doesn't look for this origin, no need to queue */
	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
		return;

	/*
	 * We know that this multi_insert isn't for a catalog, so the block should
	 * always have data even if a full-page write of it is taken.
	 */
	tupledata = XLogRecGetBlockData(r, 0, &tuplelen);
	Assert(tupledata != NULL);

	data = tupledata;
	for (i = 0; i < xlrec->ntuples; i++)
	{
		ReorderBufferChange *change;
		xl_neon_multi_insert_tuple *xlhdr;
		int			datalen;
		ReorderBufferTupleBuf *tuple;
		HeapTupleHeader header;

		change = ReorderBufferGetChange(ctx->reorder);
		change->action = REORDER_BUFFER_CHANGE_INSERT;
		change->origin_id = XLogRecGetOrigin(r);

		memcpy(&change->data.tp.rlocator, &rlocator, sizeof(RelFileLocator));

		/* step to the next per-tuple header, then past it to the body */
		xlhdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(data);
		data = ((char *) xlhdr) + SizeOfNeonMultiInsertTuple;
		datalen = xlhdr->datalen;

		change->data.tp.newtuple =
			ReorderBufferGetTupleBuf(ctx->reorder, datalen);

		tuple = change->data.tp.newtuple;
		header = tuple->tuple.t_data;

		/* not a disk based tuple */
		ItemPointerSetInvalid(&tuple->tuple.t_self);

		/*
		 * We can only figure this out after reassembling the transactions.
		 */
		tuple->tuple.t_tableOid = InvalidOid;

		tuple->tuple.t_len = datalen + SizeofHeapTupleHeader;

		/* start from a zeroed header, then append the logged tuple body */
		memset(header, 0, SizeofHeapTupleHeader);

		memcpy((char *) tuple->tuple.t_data + SizeofHeapTupleHeader,
			   (char *) data,
			   datalen);
		header->t_infomask = xlhdr->t_infomask;
		header->t_infomask2 = xlhdr->t_infomask2;
		header->t_hoff = xlhdr->t_hoff;

		/*
		 * Reset toast reassembly state only after the last row in the last
		 * xl_multi_insert_tuple record emitted by one heap_multi_insert()
		 * call.
		 */
		if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI &&
			(i + 1) == xlrec->ntuples)
			change->data.tp.clear_toast_afterwards = true;
		else
			change->data.tp.clear_toast_afterwards = false;

		ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r),
								 buf->origptr, change, false);

		/* move to the next xl_neon_multi_insert_tuple entry */
		data += datalen;
	}
	/* the cursor must have consumed the block data exactly */
	Assert(data == tupledata + tuplelen);
}

/*
 * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete
 * (but not by heap_multi_insert) into a tuplebuf.
 *
 * The size 'len' and the pointer 'data' in the record need to be
 * computed outside as they are record specific.
*/ static void DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple) { xl_neon_heap_header xlhdr; int datalen = len - SizeOfNeonHeapHeader; HeapTupleHeader header; Assert(datalen >= 0); tuple->tuple.t_len = datalen + SizeofHeapTupleHeader; header = tuple->tuple.t_data; /* not a disk based tuple */ ItemPointerSetInvalid(&tuple->tuple.t_self); /* we can only figure this out after reassembling the transactions */ tuple->tuple.t_tableOid = InvalidOid; /* data is not stored aligned, copy to aligned storage */ memcpy((char *) &xlhdr, data, SizeOfNeonHeapHeader); memset(header, 0, SizeofHeapTupleHeader); memcpy(((char *) tuple->tuple.t_data) + SizeofHeapTupleHeader, data + SizeOfNeonHeapHeader, datalen); header->t_infomask = xlhdr.t_infomask; header->t_infomask2 = xlhdr.t_infomask2; header->t_hoff = xlhdr.t_hoff; } #endif #if PG_MAJORVERSION_NUM == 17 /* individual record(group)'s handlers */ static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); static void DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); static void DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); /* common function to decode tuples */ static void DecodeXLogTuple(char *data, Size len, HeapTuple tuple); void neon_rm_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { uint8 info = XLogRecGetInfo(buf->record) & XLOG_NEON_OPMASK; TransactionId xid = XLogRecGetXid(buf->record); SnapBuild *builder = ctx->snapshot_builder; ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); /* * If we don't have snapshot or we are just fast-forwarding, there is no * point in decoding data changes. 
*/ if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || ctx->fast_forward) return; switch (info) { case XLOG_NEON_HEAP_INSERT: if (SnapBuildProcessChange(builder, xid, buf->origptr)) DecodeNeonInsert(ctx, buf); break; case XLOG_NEON_HEAP_DELETE: if (SnapBuildProcessChange(builder, xid, buf->origptr)) DecodeNeonDelete(ctx, buf); break; case XLOG_NEON_HEAP_UPDATE: case XLOG_NEON_HEAP_HOT_UPDATE: if (SnapBuildProcessChange(builder, xid, buf->origptr)) DecodeNeonUpdate(ctx, buf); break; case XLOG_NEON_HEAP_LOCK: break; case XLOG_NEON_HEAP_MULTI_INSERT: if (SnapBuildProcessChange(builder, xid, buf->origptr)) DecodeNeonMultiInsert(ctx, buf); break; default: elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info); break; } } static inline bool FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id) { if (ctx->callbacks.filter_by_origin_cb == NULL) return false; return filter_by_origin_cb_wrapper(ctx, origin_id); } /* * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs. * * Deletes can contain the new tuple. */ static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { Size datalen; char *tupledata; Size tuplelen; XLogReaderState *r = buf->record; xl_neon_heap_insert *xlrec; ReorderBufferChange *change; RelFileLocator target_locator; xlrec = (xl_neon_heap_insert *) XLogRecGetData(r); /* * Ignore insert records without new tuples (this does happen when * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL). 
*/ if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) return; /* only interested in our database */ XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); if (target_locator.dbOid != ctx->slot->data.database) return; /* output plugin doesn't look for this origin, no need to queue */ if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) return; change = ReorderBufferGetChange(ctx->reorder); if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE)) change->action = REORDER_BUFFER_CHANGE_INSERT; else change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT; change->origin_id = XLogRecGetOrigin(r); memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); tupledata = XLogRecGetBlockData(r, 0, &datalen); tuplelen = datalen - SizeOfHeapHeader; change->data.tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple); change->data.tp.clear_toast_afterwards = true; ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change, xlrec->flags & XLH_INSERT_ON_TOAST_RELATION); } /* * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs. * * Deletes can possibly contain the old primary key. 
*/
/*
 * DecodeNeonDelete
 *		Queue one reorder-buffer change for an XLOG_NEON_HEAP_DELETE record.
 *
 * ctx: decoding context whose reorder buffer receives the change.
 * buf: wrapper around the WAL record being decoded.
 *
 * Nothing is queued when the record belongs to another database or when the
 * output plugin filters out the record's replication origin.
 */
static void
DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
{
	XLogReaderState *r = buf->record;
	xl_neon_heap_delete *xlrec;
	ReorderBufferChange *change;
	RelFileLocator target_locator;

	xlrec = (xl_neon_heap_delete *) XLogRecGetData(r);

	/* only interested in our database */
	XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL);
	if (target_locator.dbOid != ctx->slot->data.database)
		return;

	/* output plugin doesn't look for this origin, no need to queue */
	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
		return;

	change = ReorderBufferGetChange(ctx->reorder);

	/* a "super" delete is reported as the abort of a speculative insert */
	if (xlrec->flags & XLH_DELETE_IS_SUPER)
		change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT;
	else
		change->action = REORDER_BUFFER_CHANGE_DELETE;

	change->origin_id = XLogRecGetOrigin(r);

	memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator));

	/* old primary key stored */
	if (xlrec->flags & XLH_DELETE_CONTAINS_OLD)
	{
		/*
		 * The old key is read starting SizeOfNeonHeapDelete bytes into the
		 * main data, so the number of bytes left for it must be measured
		 * from that same offset.  Subtracting the tuple-header size here
		 * instead would hand DecodeXLogTuple() a length that does not match
		 * the pointer it is given.
		 */
		Size		datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapDelete;
		/* DecodeXLogTuple() strips the xl_neon_heap_header prefix itself */
		Size		tuplelen = datalen - SizeOfNeonHeapHeader;

		Assert(XLogRecGetDataLen(r) > (SizeOfNeonHeapDelete + SizeOfNeonHeapHeader));

		change->data.tp.oldtuple =
			ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);

		DecodeXLogTuple((char *) xlrec + SizeOfNeonHeapDelete,
						datalen, change->data.tp.oldtuple);
	}

	change->data.tp.clear_toast_afterwards = true;

	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
							 change, false);
}

/*
 * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout
 * in the record, from wal into proper tuplebufs.
 *
 * Updates can possibly contain a new tuple and the old primary key.
*/
/*
 * DecodeNeonUpdate
 *		Turn an XLOG_NEON_HEAP_UPDATE / XLOG_NEON_HEAP_HOT_UPDATE record into a
 *		single REORDER_BUFFER_CHANGE_UPDATE entry.
 *
 * ctx: decoding context whose reorder buffer receives the change.
 * buf: wrapper around the WAL record being decoded.
 *
 * The new tuple, when logged, is taken from block 0's data; the old key,
 * when logged, is taken from the main data after the fixed-size update
 * struct.
 */
static void
DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
{
	XLogReaderState *rec = buf->record;
	xl_neon_heap_update *upd = (xl_neon_heap_update *) XLogRecGetData(rec);
	RelFileLocator locator;
	ReorderBufferChange *chg;

	/* records for other databases are of no interest to this slot */
	XLogRecGetBlockTag(rec, 0, &locator, NULL, NULL);
	if (locator.dbOid != ctx->slot->data.database)
		return;

	/* skip work the output plugin would discard anyway */
	if (FilterByOrigin(ctx, XLogRecGetOrigin(rec)))
		return;

	chg = ReorderBufferGetChange(ctx->reorder);
	chg->action = REORDER_BUFFER_CHANGE_UPDATE;
	chg->origin_id = XLogRecGetOrigin(rec);
	chg->data.tp.rlocator = locator;

	if ((upd->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE) != 0)
	{
		Size		payload_len;
		char	   *payload = XLogRecGetBlockData(rec, 0, &payload_len);

		chg->data.tp.newtuple =
			ReorderBufferGetTupleBuf(ctx->reorder,
									 payload_len - SizeOfNeonHeapHeader);
		DecodeXLogTuple(payload, payload_len, chg->data.tp.newtuple);
	}

	if ((upd->flags & XLH_UPDATE_CONTAINS_OLD) != 0)
	{
		/* the bytes after the update struct are not aligned */
		char	   *payload = XLogRecGetData(rec) + SizeOfNeonHeapUpdate;
		Size		payload_len = XLogRecGetDataLen(rec) - SizeOfNeonHeapUpdate;

		chg->data.tp.oldtuple =
			ReorderBufferGetTupleBuf(ctx->reorder,
									 payload_len - SizeOfNeonHeapHeader);
		DecodeXLogTuple(payload, payload_len, chg->data.tp.oldtuple);
	}

	chg->data.tp.clear_toast_afterwards = true;

	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(rec), buf->origptr,
							 chg, false);
}

/*
 * Decode XLOG_HEAP2_MULTI_INSERT_insert record into multiple tuplebufs.
 *
 * Currently MULTI_INSERT will always contain the full tuples.
*/
/*
 * DecodeNeonMultiInsert
 *		Queue one REORDER_BUFFER_CHANGE_INSERT per tuple carried by an
 *		XLOG_NEON_HEAP_MULTI_INSERT record.
 *
 * ctx: decoding context whose reorder buffer receives the changes.
 * buf: wrapper around the WAL record being decoded.
 *
 * The tuples are read sequentially out of block 0's data; each one is a
 * short-aligned xl_neon_multi_insert_tuple header followed by 'datalen'
 * bytes of tuple body.
 */
static void
DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
{
	XLogReaderState *r = buf->record;
	xl_neon_heap_multi_insert *xlrec;
	int			i;
	char	   *data;			/* read cursor into the block data */
	char	   *tupledata;		/* start of the block data */
	Size		tuplelen;		/* total length of the block data */
	RelFileLocator rlocator;

	xlrec = (xl_neon_heap_multi_insert *) XLogRecGetData(r);

	/*
	 * Ignore insert records without new tuples.  This happens when a
	 * multi_insert is done on a catalog or on a non-persistent relation.
	 */
	if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE))
		return;

	/* only interested in our database */
	XLogRecGetBlockTag(r, 0, &rlocator, NULL, NULL);
	if (rlocator.dbOid != ctx->slot->data.database)
		return;

	/* output plugin doesn't look for this origin, no need to queue */
	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
		return;

	/*
	 * We know that this multi_insert isn't for a catalog, so the block should
	 * always have data even if a full-page write of it is taken.
	 */
	tupledata = XLogRecGetBlockData(r, 0, &tuplelen);
	Assert(tupledata != NULL);

	data = tupledata;
	for (i = 0; i < xlrec->ntuples; i++)
	{
		ReorderBufferChange *change;
		xl_neon_multi_insert_tuple *xlhdr;
		int			datalen;
		HeapTuple	tuple;
		HeapTupleHeader header;

		change = ReorderBufferGetChange(ctx->reorder);
		change->action = REORDER_BUFFER_CHANGE_INSERT;
		change->origin_id = XLogRecGetOrigin(r);

		memcpy(&change->data.tp.rlocator, &rlocator, sizeof(RelFileLocator));

		/* step to the next per-tuple header, then past it to the body */
		xlhdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(data);
		data = ((char *) xlhdr) + SizeOfNeonMultiInsertTuple;
		datalen = xlhdr->datalen;

		change->data.tp.newtuple =
			ReorderBufferGetTupleBuf(ctx->reorder, datalen);

		tuple = change->data.tp.newtuple;
		header = tuple->t_data;

		/* not a disk based tuple */
		ItemPointerSetInvalid(&tuple->t_self);

		/*
		 * We can only figure this out after reassembling the transactions.
		 */
		tuple->t_tableOid = InvalidOid;

		tuple->t_len = datalen + SizeofHeapTupleHeader;

		/* start from a zeroed header, then append the logged tuple body */
		memset(header, 0, SizeofHeapTupleHeader);

		memcpy((char *) tuple->t_data + SizeofHeapTupleHeader,
			   (char *) data,
			   datalen);
		header->t_infomask = xlhdr->t_infomask;
		header->t_infomask2 = xlhdr->t_infomask2;
		header->t_hoff = xlhdr->t_hoff;

		/*
		 * Reset toast reassembly state only after the last row in the last
		 * xl_multi_insert_tuple record emitted by one heap_multi_insert()
		 * call.
		 */
		if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI &&
			(i + 1) == xlrec->ntuples)
			change->data.tp.clear_toast_afterwards = true;
		else
			change->data.tp.clear_toast_afterwards = false;

		ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r),
								 buf->origptr, change, false);

		/* move to the next xl_neon_multi_insert_tuple entry */
		data += datalen;
	}
	/* the cursor must have consumed the block data exactly */
	Assert(data == tupledata + tuplelen);
}

/*
 * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete
 * (but not by heap_multi_insert) into a tuplebuf.
 *
 * The size 'len' and the pointer 'data' in the record need to be
 * computed outside as they are record specific.
*/ static void DecodeXLogTuple(char *data, Size len, HeapTuple tuple) { xl_neon_heap_header xlhdr; int datalen = len - SizeOfNeonHeapHeader; HeapTupleHeader header; Assert(datalen >= 0); tuple->t_len = datalen + SizeofHeapTupleHeader; header = tuple->t_data; /* not a disk based tuple */ ItemPointerSetInvalid(&tuple->t_self); /* we can only figure this out after reassembling the transactions */ tuple->t_tableOid = InvalidOid; /* data is not stored aligned, copy to aligned storage */ memcpy((char *) &xlhdr, data, SizeOfNeonHeapHeader); memset(header, 0, SizeofHeapTupleHeader); memcpy(((char *) tuple->t_data) + SizeofHeapTupleHeader, data + SizeOfNeonHeapHeader, datalen); header->t_infomask = xlhdr.t_infomask; header->t_infomask2 = xlhdr.t_infomask2; header->t_hoff = xlhdr.t_hoff; } #endif ================================================ FILE: pgxn/neon_rmgr/neon_rmgr_desc.c ================================================ #include "postgres.h" #if PG_MAJORVERSION_NUM >= 16 #include "access/heapam_xlog.h" #include "access/neon_xlog.h" #include "access/rmgr.h" #include "access/rmgrdesc_utils.h" #include "access/xlog_internal.h" #include "miscadmin.h" #include "storage/buf.h" #include "storage/bufpage.h" #include "neon_rmgr.h" /* * NOTE: "keyname" argument cannot have trailing spaces or punctuation * characters */ static void infobits_desc(StringInfo buf, uint8 infobits, const char *keyname) { appendStringInfo(buf, "%s: [", keyname); Assert(buf->data[buf->len - 1] != ' '); if (infobits & XLHL_XMAX_IS_MULTI) appendStringInfoString(buf, "IS_MULTI, "); if (infobits & XLHL_XMAX_LOCK_ONLY) appendStringInfoString(buf, "LOCK_ONLY, "); if (infobits & XLHL_XMAX_EXCL_LOCK) appendStringInfoString(buf, "EXCL_LOCK, "); if (infobits & XLHL_XMAX_KEYSHR_LOCK) appendStringInfoString(buf, "KEYSHR_LOCK, "); if (infobits & XLHL_KEYS_UPDATED) appendStringInfoString(buf, "KEYS_UPDATED, "); if (buf->data[buf->len - 1] == ' ') { /* Truncate-away final unneeded ", " */ Assert(buf->data[buf->len 
- 2] == ','); buf->len -= 2; buf->data[buf->len] = '\0'; } appendStringInfoString(buf, "]"); } void neon_rm_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; info &= XLOG_NEON_OPMASK; if (info == XLOG_NEON_HEAP_INSERT) { xl_neon_heap_insert *xlrec = (xl_neon_heap_insert *) rec; appendStringInfo(buf, "off: %u, flags: 0x%02X", xlrec->offnum, xlrec->flags); } else if (info == XLOG_NEON_HEAP_DELETE) { xl_neon_heap_delete *xlrec = (xl_neon_heap_delete *) rec; appendStringInfo(buf, "xmax: %u, off: %u, ", xlrec->xmax, xlrec->offnum); infobits_desc(buf, xlrec->infobits_set, "infobits"); appendStringInfo(buf, ", flags: 0x%02X", xlrec->flags); } else if (info == XLOG_NEON_HEAP_UPDATE) { xl_neon_heap_update *xlrec = (xl_neon_heap_update *) rec; appendStringInfo(buf, "old_xmax: %u, old_off: %u, ", xlrec->old_xmax, xlrec->old_offnum); infobits_desc(buf, xlrec->old_infobits_set, "old_infobits"); appendStringInfo(buf, ", flags: 0x%02X, new_xmax: %u, new_off: %u", xlrec->flags, xlrec->new_xmax, xlrec->new_offnum); } else if (info == XLOG_NEON_HEAP_HOT_UPDATE) { xl_neon_heap_update *xlrec = (xl_neon_heap_update *) rec; appendStringInfo(buf, "old_xmax: %u, old_off: %u, ", xlrec->old_xmax, xlrec->old_offnum); infobits_desc(buf, xlrec->old_infobits_set, "old_infobits"); appendStringInfo(buf, ", flags: 0x%02X, new_xmax: %u, new_off: %u", xlrec->flags, xlrec->new_xmax, xlrec->new_offnum); } else if (info == XLOG_NEON_HEAP_LOCK) { xl_neon_heap_lock *xlrec = (xl_neon_heap_lock *) rec; appendStringInfo(buf, "xmax: %u, off: %u, ", xlrec->xmax, xlrec->offnum); infobits_desc(buf, xlrec->infobits_set, "infobits"); appendStringInfo(buf, ", flags: 0x%02X", xlrec->flags); } else if (info == XLOG_NEON_HEAP_MULTI_INSERT) { xl_neon_heap_multi_insert *xlrec = (xl_neon_heap_multi_insert *) rec; bool isinit = (XLogRecGetInfo(record) & XLOG_NEON_INIT_PAGE) != 0; appendStringInfo(buf, "ntuples: %d, flags: 0x%02X", 
xlrec->ntuples, xlrec->flags); if (XLogRecHasBlockData(record, 0) && !isinit) { appendStringInfoString(buf, ", offsets:"); array_desc(buf, xlrec->offsets, sizeof(OffsetNumber), xlrec->ntuples, &offset_elem_desc, NULL); } } } const char * neon_rm_identify(uint8 info) { const char *id = NULL; switch (info & ~XLR_INFO_MASK) { case XLOG_NEON_HEAP_INSERT: id = "INSERT"; break; case XLOG_NEON_HEAP_INSERT | XLOG_NEON_INIT_PAGE: id = "INSERT+INIT"; break; case XLOG_NEON_HEAP_DELETE: id = "DELETE"; break; case XLOG_NEON_HEAP_UPDATE: id = "UPDATE"; break; case XLOG_NEON_HEAP_UPDATE | XLOG_NEON_INIT_PAGE: id = "UPDATE+INIT"; break; case XLOG_NEON_HEAP_HOT_UPDATE: id = "HOT_UPDATE"; break; case XLOG_NEON_HEAP_HOT_UPDATE | XLOG_HEAP_INIT_PAGE: id = "HOT_UPDATE+INIT"; break; case XLOG_NEON_HEAP_LOCK: id = "LOCK"; break; case XLOG_NEON_HEAP_MULTI_INSERT: id = "MULTI_INSERT"; break; case XLOG_NEON_HEAP_MULTI_INSERT | XLOG_NEON_INIT_PAGE: id = "MULTI_INSERT+INIT"; break; } return id; } #endif ================================================ FILE: pgxn/neon_test_utils/Makefile ================================================ # pgxs/neon_test_utils/Makefile MODULE_big = neon_test_utils OBJS = \ $(WIN32RES) \ neontest.o EXTENSION = neon_test_utils DATA = neon_test_utils--1.3.sql PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) ================================================ FILE: pgxn/neon_test_utils/neon_test_utils--1.3.sql ================================================ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION neon_test_utils" to load this file. 
\quit CREATE FUNCTION test_consume_xids(nxids int) RETURNS VOID AS 'MODULE_PATHNAME', 'test_consume_xids' LANGUAGE C STRICT PARALLEL UNSAFE; CREATE FUNCTION test_consume_oids(oid int) RETURNS VOID AS 'MODULE_PATHNAME', 'test_consume_oids' LANGUAGE C STRICT PARALLEL UNSAFE; CREATE FUNCTION test_consume_cpu(seconds int) RETURNS VOID AS 'MODULE_PATHNAME', 'test_consume_cpu' LANGUAGE C STRICT PARALLEL UNSAFE; CREATE FUNCTION test_consume_memory(megabytes int) RETURNS VOID AS 'MODULE_PATHNAME', 'test_consume_memory' LANGUAGE C STRICT PARALLEL UNSAFE; CREATE FUNCTION test_release_memory(megabytes int DEFAULT NULL) RETURNS VOID AS 'MODULE_PATHNAME', 'test_release_memory' LANGUAGE C PARALLEL UNSAFE; CREATE FUNCTION clear_buffer_cache() RETURNS VOID AS 'MODULE_PATHNAME', 'clear_buffer_cache' LANGUAGE C STRICT PARALLEL UNSAFE; CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn) RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' LANGUAGE C PARALLEL UNSAFE; CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn) RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' LANGUAGE C PARALLEL UNSAFE; CREATE FUNCTION neon_xlogflush(lsn pg_lsn DEFAULT NULL) RETURNS VOID AS 'MODULE_PATHNAME', 'neon_xlogflush' LANGUAGE C PARALLEL UNSAFE; CREATE FUNCTION trigger_panic() RETURNS VOID AS 'MODULE_PATHNAME', 'trigger_panic' LANGUAGE C PARALLEL UNSAFE; CREATE FUNCTION trigger_segfault() RETURNS VOID AS 'MODULE_PATHNAME', 'trigger_segfault' LANGUAGE C PARALLEL UNSAFE; -- Alias for `trigger_segfault`, just because `SELECT 💣()` looks fun CREATE OR REPLACE FUNCTION 💣() RETURNS void LANGUAGE plpgsql AS $$ BEGIN PERFORM trigger_segfault(); END; $$; ================================================ FILE: pgxn/neon_test_utils/neon_test_utils.control ================================================ # neon_test_utils extension 
comment = 'helpers for neon testing and debugging' default_version = '1.3' module_pathname = '$libdir/neon_test_utils' relocatable = true trusted = true ================================================ FILE: pgxn/neon_test_utils/neontest.c ================================================ /*------------------------------------------------------------------------- * * neontest.c * Helpers for neon testing and debugging * * IDENTIFICATION * contrib/neon_test_utils/neontest.c * *------------------------------------------------------------------------- */ #include "postgres.h" #include "../neon/neon_pgversioncompat.h" #include "access/relation.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xlog_internal.h" #include "catalog/namespace.h" #include "fmgr.h" #include "funcapi.h" #include "miscadmin.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "storage/fd.h" #include "utils/builtins.h" #include "utils/pg_lsn.h" #include "utils/rel.h" #include "utils/varlena.h" #include "utils/wait_event.h" #include "../neon/pagestore_client.h" PG_MODULE_MAGIC; extern void _PG_init(void); PG_FUNCTION_INFO_V1(test_consume_xids); PG_FUNCTION_INFO_V1(test_consume_oids); PG_FUNCTION_INFO_V1(test_consume_cpu); PG_FUNCTION_INFO_V1(test_consume_memory); PG_FUNCTION_INFO_V1(test_release_memory); PG_FUNCTION_INFO_V1(clear_buffer_cache); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); PG_FUNCTION_INFO_V1(neon_xlogflush); PG_FUNCTION_INFO_V1(trigger_panic); PG_FUNCTION_INFO_V1(trigger_segfault); /* * Linkage to functions in neon module. * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c */ typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, void *buffer); static neon_read_at_lsn_type neon_read_at_lsn_ptr; /* * Module initialize function: fetch function pointers for cross-module calls. 
*/ void _PG_init(void) { /* Asserts verify that typedefs above match original declarations */ AssertVariableIsOfType(&neon_read_at_lsn, neon_read_at_lsn_type); neon_read_at_lsn_ptr = (neon_read_at_lsn_type) load_external_function("$libdir/neon", "neon_read_at_lsn", true, NULL); } #define neon_read_at_lsn neon_read_at_lsn_ptr /* * test_consume_oids(int4), for rapidly consuming OIDs, to test wraparound. * Unlike test_consume_xids which is passed number of xids to be consumed, * this function is given the target Oid. */ Datum test_consume_oids(PG_FUNCTION_ARGS) { int32 oid = PG_GETARG_INT32(0); while (oid != GetNewObjectId()); PG_RETURN_VOID(); } /* * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. */ Datum test_consume_xids(PG_FUNCTION_ARGS) { int32 nxids = PG_GETARG_INT32(0); TransactionId topxid; FullTransactionId fullxid; TransactionId xid; TransactionId targetxid; /* make sure we have a top-XID first */ topxid = GetTopTransactionId(); xid = ReadNextTransactionId(); targetxid = xid + nxids; while (targetxid < FirstNormalTransactionId) targetxid++; while (TransactionIdPrecedes(xid, targetxid)) { fullxid = GetNewTransactionId(true); xid = XidFromFullTransactionId(fullxid); elog(DEBUG1, "topxid: %u xid: %u", topxid, xid); } PG_RETURN_VOID(); } /* * test_consume_cpu(seconds int). Keeps one CPU busy for the given number of seconds. 
*/ Datum test_consume_cpu(PG_FUNCTION_ARGS) { int32 seconds = PG_GETARG_INT32(0); TimestampTz start; uint64 total_iterations = 0; start = GetCurrentTimestamp(); for (;;) { TimestampTz elapsed; elapsed = GetCurrentTimestamp() - start; if (elapsed > (TimestampTz) seconds * USECS_PER_SEC) break; /* keep spinning */ for (int i = 0; i < 1000000; i++) total_iterations++; elog(DEBUG2, "test_consume_cpu(): %lu iterations in total", total_iterations); CHECK_FOR_INTERRUPTS(); } PG_RETURN_VOID(); } static MemoryContext consume_cxt = NULL; static slist_head consumed_memory_chunks; static int64 num_memory_chunks; /* * test_consume_memory(megabytes int). * * Consume given amount of memory. The allocation is made in TopMemoryContext, * so it outlives the function, until you call test_release_memory to * explicitly release it, or close the session. */ Datum test_consume_memory(PG_FUNCTION_ARGS) { int32 megabytes = PG_GETARG_INT32(0); /* * Consume the memory in a new memory context, so that it's convenient to * release and to display it separately in a possible memory context dump. */ if (consume_cxt == NULL) consume_cxt = AllocSetContextCreate(TopMemoryContext, "test_consume_memory", ALLOCSET_DEFAULT_SIZES); for (int32 i = 0; i < megabytes; i++) { char *p; p = MemoryContextAllocZero(consume_cxt, 1024 * 1024); /* touch the memory, so that it's really allocated by the kernel */ for (int j = 0; j < 1024 * 1024; j += 1024) p[j] = j % 0xFF; slist_push_head(&consumed_memory_chunks, (slist_node *) p); num_memory_chunks++; } PG_RETURN_VOID(); } /* * test_release_memory(megabytes int). 
NULL releases all */ Datum test_release_memory(PG_FUNCTION_ARGS) { if (PG_ARGISNULL(0)) { if (consume_cxt) { MemoryContextDelete(consume_cxt); consume_cxt = NULL; num_memory_chunks = 0; } } else { int32 chunks_to_release = PG_GETARG_INT32(0); if (chunks_to_release > num_memory_chunks) { elog(WARNING, "only %lu MB is consumed, releasing it all", num_memory_chunks); chunks_to_release = num_memory_chunks; } for (int32 i = 0; i < chunks_to_release; i++) { slist_node *chunk = slist_pop_head_node(&consumed_memory_chunks); pfree(chunk); num_memory_chunks--; } } PG_RETURN_VOID(); } /* * Flush the buffer cache, evicting all pages that are not currently pinned. */ Datum clear_buffer_cache(PG_FUNCTION_ARGS) { bool save_neon_test_evict; /* * Temporarily set the neon_test_evict GUC, so that when we pin and * unpin a buffer, the buffer is evicted. We use that hack to evict all * buffers, as there is no explicit "evict this buffer" function in the * buffer manager. */ save_neon_test_evict = neon_test_evict; neon_test_evict = true; PG_TRY(); { /* Scan through all the buffers */ for (int i = 0; i < NBuffers; i++) { BufferDesc *bufHdr; uint32 buf_state; Buffer bufferid; bool isvalid; NRelFileInfo rinfo; ForkNumber forknum; BlockNumber blocknum; /* Peek into the buffer header to see what page it holds. */ bufHdr = GetBufferDescriptor(i); buf_state = LockBufHdr(bufHdr); if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) isvalid = true; else isvalid = false; bufferid = BufferDescriptorGetBuffer(bufHdr); rinfo = BufTagGetNRelFileInfo(bufHdr->tag); forknum = bufHdr->tag.forkNum; blocknum = bufHdr->tag.blockNum; UnlockBufHdr(bufHdr, buf_state); /* * Pin the buffer, and release it again. Because we have * neon_test_evict==true, this will evict the page from the * buffer cache if no one else is holding a pin on it. 
 */
			if (isvalid)
			{
				/* ReadRecentBuffer() returns true only if it pinned the buffer */
				if (ReadRecentBuffer(rinfo, forknum, blocknum, bufferid))
					ReleaseBuffer(bufferid);
			}
		}
	}
	PG_FINALLY();
	{
		/* restore the GUC */
		neon_test_evict = save_neon_test_evict;
	}
	PG_END_TRY();

	PG_RETURN_VOID();
}

/*
 * Reads the page from page server without buffer cache
 * usage mimics get_raw_page() in pageinspect, but offers reading versions at specific LSN
 * NULL read lsn will result in reading the latest version.
 *
 * Arguments: relname, forkname, blocknum, request_lsn, not_modified_since.
 * Returns NULL if any of the first three arguments is NULL.  A NULL
 * request_lsn defaults to the current WAL insert position, and a NULL
 * not_modified_since defaults to the request LSN.
 *
 * Note: reading latest version will result in waiting for latest changes to reach the page server,
 * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page
 */
Datum
get_raw_page_at_lsn(PG_FUNCTION_ARGS)
{
	bytea	   *raw_page;
	ForkNumber	forknum;
	RangeVar   *relrv;
	Relation	rel;
	char	   *raw_page_data;
	text	   *relname;
	text	   *forkname;
	uint32		blkno;
	neon_request_lsns request_lsns;

	if (PG_NARGS() != 5)
		elog(ERROR, "unexpected number of arguments in SQL function signature");

	if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2))
		PG_RETURN_NULL();

	relname = PG_GETARG_TEXT_PP(0);
	forkname = PG_GETARG_TEXT_PP(1);
	/*
	 * NOTE(review): the SQL signature declares blocknum as int8, but only
	 * the low 32 bits are read here and no range check is made -- confirm
	 * that silently narrowing out-of-range values is acceptable.
	 */
	blkno = PG_GETARG_UINT32(2);

	request_lsns.request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3);
	request_lsns.not_modified_since = PG_ARGISNULL(4) ? request_lsns.request_lsn : PG_GETARG_LSN(4);

	/*
	 * For the time being, use the same LSN for request and
	 * effective request LSN. If any test needed to use UINT64_MAX
	 * as the request LSN, we'd need to add effective_request_lsn
	 * as a new argument.
	 */
	request_lsns.effective_request_lsn = request_lsns.request_lsn;

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 errmsg("must be superuser to use raw page functions")));

	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
	rel = relation_openrv(relrv, AccessShareLock);

	/* Check that this relation has storage */
	if (rel->rd_rel->relkind == RELKIND_VIEW)
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 errmsg("cannot get raw page from view \"%s\"",
						RelationGetRelationName(rel))));
	if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 errmsg("cannot get raw page from composite type \"%s\"",
						RelationGetRelationName(rel))));
	if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 errmsg("cannot get raw page from foreign table \"%s\"",
						RelationGetRelationName(rel))));
	if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 errmsg("cannot get raw page from partitioned table \"%s\"",
						RelationGetRelationName(rel))));
	if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 errmsg("cannot get raw page from partitioned index \"%s\"",
						RelationGetRelationName(rel))));

	/*
	 * Reject attempts to read non-local temporary relations; we would be
	 * likely to get wrong data since we have no visibility into the owning
	 * session's local buffers.
	 */
	if (RELATION_IS_OTHER_TEMP(rel))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot access temporary tables of other sessions")));

	forknum = forkname_to_number(text_to_cstring(forkname));

	/* Initialize buffer to copy to */
	raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
	SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
	raw_page_data = VARDATA(raw_page);

	/* Fetch the page image straight into the bytea payload */
	neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsns, raw_page_data);

	relation_close(rel, AccessShareLock);

	PG_RETURN_BYTEA_P(raw_page);
}

/*
 * Another option to read a relation page from page server without cache
 * this version doesn't validate input and allows reading blocks of dropped relations
 *
 * Arguments: tbspc, db, relfilenode, forknum, blocknum, request_lsn,
 * not_modified_since.  Returns NULL if any of the first five is NULL; the
 * two LSN arguments default the same way as in get_raw_page_at_lsn().
 *
 * Note: reading latest version will result in waiting for latest changes to reach the page server,
 * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page
 */
Datum
get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
{
	char	   *raw_page_data;

	if (PG_NARGS() != 7)
		elog(ERROR, "unexpected number of arguments in SQL function signature");

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 errmsg("must be superuser to use raw page functions")));

	if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) ||
		PG_ARGISNULL(3) || PG_ARGISNULL(4))
		PG_RETURN_NULL();

	{
		/* Field names of the relation identifier changed in PostgreSQL 16 */
		NRelFileInfo rinfo = {
#if PG_MAJORVERSION_NUM < 16
			.spcNode = PG_GETARG_OID(0),
			.dbNode = PG_GETARG_OID(1),
			.relNode = PG_GETARG_OID(2)
#else
			.spcOid = PG_GETARG_OID(0),
			.dbOid = PG_GETARG_OID(1),
			.relNumber = PG_GETARG_OID(2)
#endif
		};

		/*
		 * NOTE(review): forknum and blocknum are int8 in the SQL signature
		 * but are read as 32-bit values without validation -- confirm.
		 */
		ForkNumber	forknum = PG_GETARG_UINT32(3);
		uint32		blkno = PG_GETARG_UINT32(4);
		neon_request_lsns request_lsns;

		/* Initialize buffer to copy to */
		bytea	   *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);

		request_lsns.request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5);
		request_lsns.not_modified_since = PG_ARGISNULL(6) ? request_lsns.request_lsn : PG_GETARG_LSN(6);

		/*
		 * For the time being, use the same LSN for request
		 * and effective request LSN. If any test needed to
		 * use UINT64_MAX as the request LSN, we'd need to add
		 * effective_request_lsn as a new argument.
		 */
		request_lsns.effective_request_lsn = request_lsns.request_lsn;

		SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
		raw_page_data = VARDATA(raw_page);

		neon_read_at_lsn(rinfo, forknum, blkno, request_lsns, raw_page_data);
		PG_RETURN_BYTEA_P(raw_page);
	}
}

/*
 * Directly calls XLogFlush(lsn) to flush WAL buffers.
 *
 * If 'lsn' is not specified (is NULL), flush all generated WAL.
 *
 * Raises an error if called while recovery is in progress.
 */
Datum
neon_xlogflush(PG_FUNCTION_ARGS)
{
	XLogRecPtr	lsn;

	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("cannot flush WAL during recovery.")));

	if (!PG_ARGISNULL(0))
		lsn = PG_GETARG_LSN(0);
	else
	{
		lsn = GetXLogInsertRecPtr();

		/*---
		 * The LSN returned by GetXLogInsertRecPtr() is the position where the
		 * next inserted record would begin. If the last record ended just at
		 * the page boundary, the next record will begin after the page header
		 * on the next page, but the next page's page header has not been
		 * written yet. If we tried to flush it, XLogFlush() would throw an
		 * error:
		 *
		 * ERROR : xlog flush request %X/%X is not satisfied --- flushed only to %X/%X
		 *
		 * To avoid that, if the insert position points to just after the page
		 * header, back off to page boundary.
		 */
		/* Short header case: offset within the segment is beyond the first page */
		if (lsn % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
			XLogSegmentOffset(lsn, wal_segment_size) > XLOG_BLCKSZ)
			lsn -= SizeOfXLogShortPHD;
		/* Long header case: offset within the segment is inside the first page */
		else if (lsn % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
				 XLogSegmentOffset(lsn, wal_segment_size) < XLOG_BLCKSZ)
			lsn -= SizeOfXLogLongPHD;
	}

	XLogFlush(lsn);
	PG_RETURN_VOID();
}

/*
 * Function to trigger panic.
 */
Datum
trigger_panic(PG_FUNCTION_ARGS)
{
	elog(PANIC, "neon_test_utils: panic");
	PG_RETURN_VOID();
}

/*
 * Function to trigger a segfault.
*/ Datum trigger_segfault(PG_FUNCTION_ARGS) { int *ptr = NULL; *ptr = 42; PG_RETURN_VOID(); } ================================================ FILE: pgxn/neon_utils/Makefile ================================================ # pgxs/neon_utils/Makefile MODULE_big = neon_utils OBJS = \ $(WIN32RES) \ neon_utils.o EXTENSION = neon_utils DATA = neon_utils--1.0.sql PGFILEDESC = "neon_utils - small useful functions" PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) ================================================ FILE: pgxn/neon_utils/neon_utils--1.0.sql ================================================ CREATE FUNCTION num_cpus() RETURNS int AS 'MODULE_PATHNAME', 'num_cpus' LANGUAGE C STRICT PARALLEL UNSAFE VOLATILE; ================================================ FILE: pgxn/neon_utils/neon_utils.c ================================================ /*------------------------------------------------------------------------- * * neon_utils.c * neon_utils - small useful functions * * IDENTIFICATION * contrib/neon_utils/neon_utils.c * *------------------------------------------------------------------------- */ #ifdef _WIN32 #include #else #include #endif #include "postgres.h" #include "fmgr.h" PG_MODULE_MAGIC; PG_FUNCTION_INFO_V1(num_cpus); Datum num_cpus(PG_FUNCTION_ARGS) { #ifdef _WIN32 SYSTEM_INFO sysinfo; GetSystemInfo(&sysinfo); uint32 num_cpus = (uint32) sysinfo.dwNumberOfProcessors; #else uint32 num_cpus = (uint32) sysconf(_SC_NPROCESSORS_ONLN); #endif PG_RETURN_UINT32(num_cpus); } ================================================ FILE: pgxn/neon_utils/neon_utils.control ================================================ # neon_utils extension comment = 'neon_utils - small useful functions' default_version = '1.0' module_pathname = '$libdir/neon_utils' relocatable = true trusted = true ================================================ FILE: pgxn/neon_walredo/Makefile ================================================ # pgxs/neon_walredo/Makefile 
MODULE_big = neon_walredo OBJS = \ $(WIN32RES) \ inmem_smgr.o \ walredoproc.o \ # This really should be guarded by $(with_libseccomp), but I couldn't # make that work with pgxs. So we always compile it, but its contents # are wrapped in #ifdef HAVE_LIBSECCOMP instead. OBJS += seccomp.o PGFILEDESC = "neon_walredo - helper process that runs in Neon pageserver" PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) ifeq ($(with_libseccomp),yes) SHLIB_LINK += -lseccomp endif ================================================ FILE: pgxn/neon_walredo/inmem_smgr.c ================================================ /*------------------------------------------------------------------------- * * inmem_smgr.c * * This is an implementation of the SMGR interface, used in the WAL redo * process. It has no persistent storage, the pages that are written out * are kept in a small number of in-memory buffers. * * Normally, replaying a WAL record only needs to access a handful of * buffers, which fit in the normal buffer cache, so this is just for * "overflow" storage when the buffer cache is not large enough. 
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "../neon/neon_pgversioncompat.h"

#include "access/xlog.h"
#include "storage/block.h"
#include "storage/buf_internals.h"
#include RELFILEINFO_HDR
#include "storage/smgr.h"

#if PG_VERSION_NUM >= 150000
#include "access/xlogutils.h"
#endif

#include "inmem_smgr.h"

/* Size of the in-memory smgr: XLR_MAX_BLOCK_ID is 32, so assume that 64 will be enough */
#define MAX_PAGES 64

/* If more than WARN_PAGES are used, print a warning in the log */
#define WARN_PAGES 32

/* Parallel arrays: page_tag[i] identifies the page stored in page_body[i] */
static BufferTag page_tag[MAX_PAGES];
static char page_body[MAX_PAGES][BLCKSZ];
/* Number of slots in use; valid entries are [0, used_pages) */
static int	used_pages;

/*
 * locate_page() -- Find the slot holding the given block.
 *
 * Returns the index into page_tag/page_body, or -1 if the block is not
 * stored here.
 */
static int
locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno)
{
	NRelFileInfo rinfo = InfoFromSMgrRel(reln);

	/* We only hold a small number of pages, so linear search */
	for (int i = 0; i < used_pages; i++)
	{
		if (RelFileInfoEquals(rinfo, BufTagGetNRelFileInfo(page_tag[i]))
			&& forknum == page_tag[i].forkNum
			&& blkno == page_tag[i].blockNum)
		{
			return i;
		}
	}
	return -1;
}


/* neon wal-redo storage manager functionality */
static void inmem_init(void);
static void inmem_open(SMgrRelation reln);
static void inmem_close(SMgrRelation reln, ForkNumber forknum);
static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
static bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
static void inmem_unlink(NRelFileInfoBackend rinfo, ForkNumber forknum, bool isRedo);
/* The prefetch callback gained an nblocks argument in PostgreSQL 17 */
#if PG_MAJORVERSION_NUM >= 17
static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
						   BlockNumber blocknum, int nblocks);
#else
static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
						   BlockNumber blocknum);
#endif
/* Buffer arguments are "char *" before PostgreSQL 16 and "void *" from 16 on */
#if PG_MAJORVERSION_NUM < 16
static void inmem_extend(SMgrRelation reln, ForkNumber forknum,
						 BlockNumber blocknum, char *buffer, bool skipFsync);
static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
					   char *buffer);
static void inmem_write(SMgrRelation reln, ForkNumber forknum,
						BlockNumber blocknum, char *buffer, bool skipFsync);
#else
static void inmem_extend(SMgrRelation reln, ForkNumber forknum,
						 BlockNumber blocknum, const void *buffer, bool skipFsync);
static void inmem_zeroextend(SMgrRelation reln, ForkNumber forknum,
							 BlockNumber blocknum, int nblocks, bool skipFsync);
static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
					   void *buffer);
static void inmem_write(SMgrRelation reln, ForkNumber forknum,
						BlockNumber blocknum, const void *buffer, bool skipFsync);
#endif
static void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
							BlockNumber blocknum, BlockNumber nblocks);
static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
static void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
						   BlockNumber old_blocks, BlockNumber nblocks);
static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
#if PG_MAJORVERSION_NUM >= 17
static void inmem_registersync(SMgrRelation reln, ForkNumber forknum);
#endif

/*
 *	inmem_init() -- Initialize private state
 *
 * Marks every slot as free; page contents are not cleared.
 */
static void
inmem_init(void)
{
	used_pages = 0;
}

/*
 *	inmem_exists() -- Does the physical file exist?
 *
 * Here that means: is at least one page of this relation fork stored in
 * the in-memory array?
 */
static bool
inmem_exists(SMgrRelation reln, ForkNumber forknum)
{
	NRelFileInfo rinfo = InfoFromSMgrRel(reln);

	for (int i = 0; i < used_pages; i++)
	{
		if (RelFileInfoEquals(rinfo, BufTagGetNRelFileInfo(page_tag[i]))
			&& forknum == page_tag[i].forkNum)
		{
			return true;
		}
	}
	return false;
}

/*
 *	inmem_create() -- Create a new relation on neon storage
 *
 * If isRedo is true, it's okay for the relation to exist already.
 *
 * This implementation is a no-op.
 */
static void
inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)
{
}

/*
 *	inmem_unlink() -- Unlink a relation.
 *
 * This implementation is a no-op; stored pages are left in place.
 */
static void
inmem_unlink(NRelFileInfoBackend rinfo, ForkNumber forknum, bool isRedo)
{
}

/*
 *	inmem_extend() -- Add a block to the specified relation.
 *
 *		The semantics are nearly the same as mdwrite(): write at the
 *		specified position.  However, this is to be used for the case of
 *		extending a relation (i.e., blocknum is at or beyond the current
 *		EOF).  Note that we assume writing a block beyond current EOF
 *		causes intervening file space to become filled with zeroes.
 */
static void
inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
#if PG_MAJORVERSION_NUM < 16
			 char *buffer, bool skipFsync)
#else
			 const void *buffer, bool skipFsync)
#endif
{
	/* same as smgrwrite() for us */
	inmem_write(reln, forknum, blkno, buffer, skipFsync);
}

#if PG_MAJORVERSION_NUM >= 16
/*
 *	inmem_zeroextend() -- Extend the relation with zeroed blocks.
 */
static void
inmem_zeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
				 int nblocks, bool skipFsync)
{
	/* Do nothing: inmem_read will return zero page in any case */
}
#endif

/*
 *  inmem_open() -- Initialize newly-opened relation.
 *
 * No per-relation state is kept, so there is nothing to do.
 */
static void
inmem_open(SMgrRelation reln)
{
}

/*
 *	inmem_close() -- Close the specified relation, if it isn't closed already.
 *
 * No per-relation state is kept, so there is nothing to do.
 */
static void
inmem_close(SMgrRelation reln, ForkNumber forknum)
{
}

#if PG_MAJORVERSION_NUM >= 17
/*
 *	inmem_prefetch() -- Initiate asynchronous read of the specified blocks of a relation
 *
 * Does nothing and reports success.
 */
static bool
inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
			   int nblocks)
{
	return true;
}
#else
/*
 *  inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation
 *
 * Does nothing and reports success.
 */
static bool
inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
	return true;
}
#endif

/*
 * inmem_writeback() -- Tell the kernel to write pages back to storage.
 *
 * There is no backing storage, so this is a no-op.
 */
static void
inmem_writeback(SMgrRelation reln, ForkNumber forknum,
				BlockNumber blocknum, BlockNumber nblocks)
{
}

/*
 *	inmem_read() -- Read the specified block from a relation.
*/ #if PG_MAJORVERSION_NUM < 16 static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, char *buffer) #else static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, void *buffer) #endif { int pg; pg = locate_page(reln, forknum, blkno); if (pg < 0) memset(buffer, 0, BLCKSZ); else memcpy(buffer, page_body[pg], BLCKSZ); } #if PG_MAJORVERSION_NUM >= 17 static void inmem_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, void **buffers, BlockNumber nblocks) { for (int i = 0; i < nblocks; i++) { inmem_read(reln, forknum, blkno, buffers[i]); } } #endif /* * inmem_write() -- Write the supplied block at the appropriate location. * * This is to be used only for updating already-existing blocks of a * relation (ie, those before the current EOF). To extend a relation, * use mdextend(). */ static void inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, #if PG_MAJORVERSION_NUM < 16 char *buffer, bool skipFsync) #else const void *buffer, bool skipFsync) #endif { int pg; pg = locate_page(reln, forknum, blocknum); if (pg < 0) { /* * We assume the buffer cache is large enough to hold all the buffers * needed for most operations. Overflowing to this "in-mem smgr" in * rare cases is OK. But if we find that we're using more than * WARN_PAGES, print a warning so that we get alerted and get to * investigate why we're accessing so many buffers. 
*/ if (used_pages >= WARN_PAGES) ereport(WARNING, (errmsg("inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, blocknum, used_pages), errbacktrace())); if (used_pages == MAX_PAGES) elog(ERROR, "Inmem storage overflow"); pg = used_pages; used_pages++; InitBufferTag(&page_tag[pg], &InfoFromSMgrRel(reln), forknum, blocknum); } else { elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, blocknum, used_pages); } memcpy(page_body[pg], buffer, BLCKSZ); } #if PG_MAJORVERSION_NUM >= 17 static void inmem_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, const void **buffers, BlockNumber nblocks, bool skipFsync) { for (int i = 0; i < nblocks; i++) { inmem_write(reln, forknum, blkno, buffers[i], skipFsync); } } #endif /* * inmem_nblocks() -- Get the number of blocks stored in a relation. */ static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum) { /* * It's not clear why a WAL redo function would call smgrnblocks(). During * recovery, at least before reaching consistency, the size of a relation * could be arbitrarily small, if it was truncated after the record being * replayed, or arbitrarily large if it was extended afterwards. But one * place where it's called is in XLogReadBufferExtended(): it extends the * relation, if it's smaller than the requested page. That's a waste of * time in the WAL redo process. Pretend that all relations are maximally * sized to avoid it. */ return MaxBlockNumber; } /* * inmem_truncate() -- Truncate relation to specified number of blocks. */ static void inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks) { } /* * inmem_immedsync() -- Immediately sync a relation to stable storage. 
*/ static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum) { } #if PG_MAJORVERSION_NUM >= 17 static void inmem_registersync(SMgrRelation reln, ForkNumber forknum) { } #endif static const struct f_smgr inmem_smgr = { .smgr_init = inmem_init, .smgr_shutdown = NULL, .smgr_open = inmem_open, .smgr_close = inmem_close, .smgr_create = inmem_create, .smgr_exists = inmem_exists, .smgr_unlink = inmem_unlink, .smgr_extend = inmem_extend, #if PG_MAJORVERSION_NUM >= 16 .smgr_zeroextend = inmem_zeroextend, #endif #if PG_MAJORVERSION_NUM >= 17 .smgr_prefetch = inmem_prefetch, .smgr_readv = inmem_readv, .smgr_writev = inmem_writev, #else .smgr_prefetch = inmem_prefetch, .smgr_read = inmem_read, .smgr_write = inmem_write, #endif .smgr_writeback = inmem_writeback, .smgr_nblocks = inmem_nblocks, .smgr_truncate = inmem_truncate, .smgr_immedsync = inmem_immedsync, #if PG_MAJORVERSION_NUM >= 17 .smgr_registersync = inmem_registersync, #endif .smgr_start_unlogged_build = NULL, .smgr_finish_unlogged_build_phase_1 = NULL, .smgr_end_unlogged_build = NULL, .smgr_read_slru_segment = NULL, }; const f_smgr * smgr_inmem(ProcNumber backend, NRelFileInfo rinfo) { Assert(InRecovery); // // What does this code do? 
// if (backend != INVALID_PROC_NUMBER) // return smgr_standard(backend, rinfo); // else return &inmem_smgr; } void smgr_init_inmem() { inmem_init(); } ================================================ FILE: pgxn/neon_walredo/inmem_smgr.h ================================================ /*------------------------------------------------------------------------- * * inmem_smgr.h * * * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * *------------------------------------------------------------------------- */ #ifndef INMEM_SMGR_H #define INMEM_SMGR_H extern const f_smgr *smgr_inmem(ProcNumber backend, NRelFileInfo rinfo); extern void smgr_init_inmem(void); #endif /* INMEM_SMGR_H */ ================================================ FILE: pgxn/neon_walredo/neon_seccomp.h ================================================ #ifndef NEON_SECCOMP_H #define NEON_SECCOMP_H #include typedef struct { int psr_syscall; /* syscall number */ uint32 psr_action; /* libseccomp action, e.g. SCMP_ACT_ALLOW */ } PgSeccompRule; #define PG_SCMP(syscall, action) \ (PgSeccompRule) { \ .psr_syscall = SCMP_SYS(syscall), \ .psr_action = (action), \ } #define PG_SCMP_ALLOW(syscall) \ PG_SCMP(syscall, SCMP_ACT_ALLOW) extern void seccomp_load_rules(PgSeccompRule *syscalls, int count); #endif /* NEON_SECCOMP_H */ ================================================ FILE: pgxn/neon_walredo/seccomp.c ================================================ /*------------------------------------------------------------------------- * * seccomp.c * Secure Computing BPF API wrapper. * * Pageserver delegates complex WAL decoding duties to postgres, * which means that the latter might fall victim to carefully designed * malicious WAL records and start doing harmful things to the system. * To prevent this, it has been decided to limit possible interactions * with the outside world using the Secure Computing BPF mode. 
* * This code is intended to support both x86_64 and aarch64. The latter * doesn't implement some syscalls like open and select. We allow both * select (absent on aarch64) and pselect6 (present on both architectures) * We call select(2) through libc, and the libc wrapper calls select or pselect6 * depending on the architecture. You can check which syscalls are present on * different architectures with the `scmp_sys_resolver` tool from the * seccomp package. * * We use this mode to disable all syscalls not in the allowlist. This * approach has its pros & cons: * * - We have to carefully handpick and maintain the set of syscalls * required for the WAL redo process. Core dumps help with that. * The method of trial and error seems to work reasonably well, * but it would be nice to find a proper way to "prove" that * the set in question is both necessary and sufficient. * * - Once we enter the seccomp bpf mode, it's impossible to lift those * restrictions (otherwise, what kind of "protection" would that be?). * Thus, we have to either enable extra syscalls for the clean shutdown, * or exit the process immediately via _exit() instead of proc_exit(). * * - Should we simply use SCMP_ACT_KILL_PROCESS, or implement a custom * facility to deal with the forbidden syscalls? If we'd like to embed * a startup security test, we should go with the latter; In that * case, which one of the following options is preferable? * * * Catch the denied syscalls with a signal handler using SCMP_ACT_TRAP. * Provide a common signal handler with a static switch to override * its behavior for the test case. This would undermine the whole * purpose of such protection, so we'd have to go further and remap * the memory backing the switch as readonly, then ban mprotect(). * Ugly and fragile, to say the least. * * * Yet again, catch the denied syscalls using SCMP_ACT_TRAP. * Provide 2 different signal handlers: one for a test case, * another for the main processing loop. 
Install the first one, * enable seccomp, perform the test, switch to the second one, * finally ban sigaction(), presto! * * * Spoof the result of a syscall using SECCOMP_RET_ERRNO for the * test, then ban it altogether with another filter. The downside * of this solution is that we don't actually check that * SCMP_ACT_KILL_PROCESS/SCMP_ACT_TRAP works. * * Either approach seems to require two eBPF filter programs, * which is unfortunate: the man page tells this is uncommon. * Maybe I (@funbringer) am missing something, though; I encourage * any reader to get familiar with it and scrutinize my conclusions. * * TODOs and ideas in no particular order: * * - Do something about mmap() in musl's malloc(). * Definitely not a priority if we don't care about musl. * * - See if we can untangle PG's shutdown sequence (involving unlink()): * * * Simplify (or rather get rid of) shmem setup in PG's WAL redo mode. * * Investigate chroot() or mount namespaces for better FS isolation. * * (Per Heikki) Simply call _exit(), no big deal. * * Come up with a better idea? * * - Make use of seccomp's argument inspection (for what?). * Unfortunately, it views all syscall arguments as scalars, * so it won't work for e.g. string comparison in unlink(). * * - Benchmark with bpf jit on/off, try seccomp_syscall_priority(). * * - Test against various linux distros & glibc versions. * I suspect that certain libc functions might involve slightly * different syscalls, e.g. select/pselect6/pselect6_time64/whatever. * *------------------------------------------------------------------------- */ #include "postgres.h" /* * I couldn't find a good way to do a conditional OBJS += seccomp.o in * the Makefile, so this file is compiled even when seccomp is disabled, * it's just empty in that case. 
*/ #ifdef HAVE_LIBSECCOMP #include #include #include "miscadmin.h" #include "neon_seccomp.h" static void die(int code, const char *str); static bool seccomp_test_sighandler_done = false; static void seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt); static void seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt); static int do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action); void seccomp_load_rules(PgSeccompRule *rules, int count) { struct sigaction action = { .sa_flags = SA_SIGINFO }; PgSeccompRule rule; long fd; /* * Install a test signal handler. * XXX: pqsignal() is too restrictive for our purposes, * since we'd like to examine the contents of siginfo_t. */ action.sa_sigaction = seccomp_test_sighandler; if (sigaction(SIGSYS, &action, NULL) != 0) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not install test SIGSYS handler"))); /* * First, check that open of a well-known file works. * XXX: We use raw syscall() to call the very openat() which is * present both on x86_64 and on aarch64. 
*/ fd = syscall(SCMP_SYS(openat), AT_FDCWD, "/dev/null", O_RDONLY, 0); if (seccomp_test_sighandler_done) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: signal handler test flag was set unexpectedly"))); if (fd < 0) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not open /dev/null for seccomp testing: %m"))); close((int) fd); /* Set a trap on openat() to test seccomp bpf */ rule = PG_SCMP(openat, SCMP_ACT_TRAP); if (do_seccomp_load_rules(&rule, 1, SCMP_ACT_ALLOW) != 0) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not load test trap"))); /* Finally, check that openat() now raises SIGSYS */ (void) syscall(SCMP_SYS(openat), AT_FDCWD, "/dev/null", O_RDONLY, 0); if (!seccomp_test_sighandler_done) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: SIGSYS handler doesn't seem to work"))); /* Now that everything seems to work, install a proper handler */ action.sa_sigaction = seccomp_deny_sighandler; if (sigaction(SIGSYS, &action, NULL) != 0) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not install SIGSYS handler"))); /* If this succeeds, any syscall not in the list will crash the process */ if (do_seccomp_load_rules(rules, count, SCMP_ACT_TRAP) != 0) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not enter seccomp mode"))); } /* * Enter seccomp mode with a BPF filter that will only allow * certain syscalls to proceed. 
*/ static int do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action) { scmp_filter_ctx ctx; int rc = -1; /* Create a context with a default action for syscalls not in the list */ if ((ctx = seccomp_init(def_action)) == NULL) goto cleanup; for (int i = 0; i < count; i++) { PgSeccompRule *rule = &rules[i]; if ((rc = seccomp_rule_add(ctx, rule->psr_action, rule->psr_syscall, 0)) != 0) goto cleanup; } /* Try building & loading the program into the kernel */ if ((rc = seccomp_load(ctx)) != 0) goto cleanup; cleanup: /* * We don't need the context anymore regardless of the result, * since either we failed or the eBPF program has already been * loaded into the linux kernel. */ seccomp_release(ctx); return rc; } static void die(int code, const char *str) { /* work around gcc ignoring that it shouldn't warn on (void) result being unused */ ssize_t _unused pg_attribute_unused(); /* Best effort write to stderr */ _unused = write(fileno(stderr), str, strlen(str)); /* XXX: we don't want to run any atexit callbacks */ _exit(code); } static void seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused()) { #define DIE_PREFIX "seccomp test signal handler: " /* Check that this signal handler is used only for a single test case */ if (seccomp_test_sighandler_done) die(1, DIE_PREFIX "test handler should only be used for 1 test\n"); seccomp_test_sighandler_done = true; if (signum != SIGSYS) die(1, DIE_PREFIX "bad signal number\n"); /* TODO: maybe somehow extract the hardcoded syscall number */ if (info->si_syscall != SCMP_SYS(openat)) die(1, DIE_PREFIX "bad syscall number\n"); #undef DIE_PREFIX } static void seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused()) { /* * Unfortunately, we can't use seccomp_syscall_resolve_num_arch() * to resolve the syscall's name, since it calls strdup() * under the hood (wtf!). 
*/ char buffer[128]; (void)snprintf(buffer, lengthof(buffer), "---------------------------------------\n" "seccomp: bad syscall %d\n" "---------------------------------------\n", info->si_syscall); /* * Instead of silently crashing the process with * a fake SIGSYS caused by SCMP_ACT_KILL_PROCESS, * we'd like to receive a real SIGSYS to print the * message and *then* immediately exit. */ die(1, buffer); } #endif /* HAVE_LIBSECCOMP */ ================================================ FILE: pgxn/neon_walredo/walredoproc.c ================================================ /*------------------------------------------------------------------------- * * walredoproc.c * Entry point for WAL redo helper * * * This file contains an alternative main() function for the 'postgres' * binary. In the special mode, we go into a special mode that's similar * to the single user mode. We don't launch postmaster or any auxiliary * processes. Instead, we wait for command from 'stdin', and respond to * 'stdout'. * * The protocol through stdin/stdout is loosely based on the libpq protocol. * The process accepts messages through stdin, and each message has the format: * * char msgtype; * int32 length; // length of message including 'length' but excluding * // 'msgtype', in network byte order * * * There are three message types: * * BeginRedoForBlock ('B'): Prepare for WAL replay for given block * PushPage ('P'): Copy a page image (in the payload) to buffer cache * ApplyRecord ('A'): Apply a WAL record (in the payload) * GetPage ('G'): Return a page image from buffer cache. * Ping ('H'): Return the input message. * * Currently, you only get a response to GetPage requests; the response is * simply a 8k page, without any headers. Errors are logged to stderr. * * FIXME: * - this currently requires a valid PGDATA, and creates a lock file there * like a normal postmaster. There's no fundamental reason for that, though. 
* - should have EndRedoForBlock, and flush page cache, to allow using this * mechanism for more than one block without restarting the process. * * * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * *------------------------------------------------------------------------- */ #include "postgres.h" #include "../neon/neon_pgversioncompat.h" #include #include #include #include #include #ifdef HAVE_SYS_SELECT_H #include #endif #ifdef HAVE_SYS_RESOURCE_H #include #include #endif #if defined(HAVE_LIBSECCOMP) && defined(__GLIBC__) #define MALLOC_NO_MMAP #include #endif #if PG_MAJORVERSION_NUM < 16 #ifndef HAVE_GETRUSAGE #include "rusagestub.h" #endif #endif #include "access/clog.h" #include "access/commit_ts.h" #include "access/heapam.h" #include "access/multixact.h" #include "access/nbtree.h" #include "access/subtrans.h" #include "access/syncscan.h" #include "access/twophase.h" #include "access/xlog.h" #include "access/xlog_internal.h" #if PG_VERSION_NUM >= 150000 #include "access/xlogrecovery.h" #endif #include "access/xlogutils.h" #include "catalog/pg_class.h" #include "commands/async.h" #include "libpq/pqformat.h" #include "miscadmin.h" #include "pgstat.h" #include "postmaster/autovacuum.h" #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/origin.h" #include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "storage/dsm.h" #if PG_MAJORVERSION_NUM >= 17 #include "storage/dsm_registry.h" #endif #include "storage/ipc.h" #include "storage/pg_shmem.h" #include "storage/pmsignal.h" #include "storage/predicate.h" #include "storage/proc.h" #include "storage/procarray.h" #include "storage/procsignal.h" #include "storage/sinvaladt.h" #include 
"storage/smgr.h" #include "storage/spin.h" #include "tcop/tcopprot.h" #include "utils/memutils.h" #include "utils/ps_status.h" #include "utils/snapmgr.h" #include "inmem_smgr.h" #ifdef HAVE_LIBSECCOMP #include "neon_seccomp.h" #endif PG_MODULE_MAGIC; static int ReadRedoCommand(StringInfo inBuf); static void BeginRedoForBlock(StringInfo input_message); static void PushPage(StringInfo input_message); static void ApplyRecord(StringInfo input_message); static void apply_error_callback(void *arg); static bool redo_block_filter(XLogReaderState *record, uint8 block_id); static void GetPage(StringInfo input_message); static void Ping(StringInfo input_message); static ssize_t buffered_read(void *buf, size_t count); static void CreateFakeSharedMemoryAndSemaphores(void); static BufferTag target_redo_tag; static XLogReaderState *reader_state; #define TRACE DEBUG1 #ifdef HAVE_LIBSECCOMP /* * https://man7.org/linux/man-pages/man2/close_range.2.html * * The `close_range` syscall is available as of Linux 5.9. * * The `close_range` libc wrapper is only available in glibc >= 2.34. * Debian Bullseye ships a libc package based on glibc 2.31. * => write the wrapper ourselves, using the syscall number from the kernel headers. * * If the Linux uAPI headers don't define the system call number, * fail the build deliberately rather than ifdef'ing it to ENOSYS. * We prefer a compile time over a runtime error for walredo. 
*/ #include #include #include static int close_range_syscall(unsigned int start_fd, unsigned int count, unsigned int flags) { return syscall(__NR_close_range, start_fd, count, flags); } static PgSeccompRule allowed_syscalls[] = { /* Hard requirements */ PG_SCMP_ALLOW(exit_group), PG_SCMP_ALLOW(pselect6), PG_SCMP_ALLOW(read), PG_SCMP_ALLOW(select), PG_SCMP_ALLOW(write), /* Memory allocation */ PG_SCMP_ALLOW(brk), #ifndef MALLOC_NO_MMAP /* TODO: musl doesn't have mallopt */ PG_SCMP_ALLOW(mmap), PG_SCMP_ALLOW(munmap), #endif /* * getpid() is called on assertion failure, in ExceptionalCondition. * It's not really needed, but seems pointless to hide it either. The * system call unlikely to expose a kernel vulnerability, and the PID * is stored in MyProcPid anyway. */ PG_SCMP_ALLOW(getpid), PG_SCMP_ALLOW(futex), /* needed for errbacktrace */ /* Enable those for a proper shutdown. */ #if 0 PG_SCMP_ALLOW(munmap), PG_SCMP_ALLOW(shmctl), PG_SCMP_ALLOW(shmdt), PG_SCMP_ALLOW(unlink), /* shm_unlink */ #endif }; static void enter_seccomp_mode(void) { /* * The pageserver process relies on us to close all the file descriptors * it potentially leaked to us, _before_ we start processing potentially dangerous * wal records. See the comment in the Rust code that launches this process. */ if (close_range_syscall(3, ~0U, 0) != 0) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not close files >= fd 3"))); #ifdef MALLOC_NO_MMAP /* Ask glibc not to use mmap() */ mallopt(M_MMAP_MAX, 0); #endif seccomp_load_rules(allowed_syscalls, lengthof(allowed_syscalls)); } #endif /* HAVE_LIBSECCOMP */ PGDLLEXPORT void WalRedoMain(int argc, char *argv[]); /* * Entry point for the WAL redo process. * * Performs similar initialization as PostgresMain does for normal * backend processes. Some initialization was done in CallExtMain * already. 
*/
/*
 * Overview of what follows: configure logging and buffer counts, install
 * the in-memory smgr hooks, set up process-local "shared" memory, create
 * the auxiliary PGPROC, start up the resource managers, optionally enter
 * seccomp mode (unless "--disable-seccomp" is given in argv), and then
 * loop reading commands from stdin until EOF.
 */
PGDLLEXPORT void
WalRedoMain(int argc, char *argv[])
{
	int			firstchar;
	StringInfoData input_message;
#ifdef HAVE_LIBSECCOMP
	bool		enable_seccomp;
#endif

	am_wal_redo_postgres = true;
	/*
	 * Pageserver treats any output to stderr as an ERROR, so we must
	 * restrict the log level as early as possible during WAL redo:
	 * log_min_messages is set to WARNING and client_min_messages to ERROR
	 * below (note that loglevel ERROR also logs LOG,
	 * which is super strange but that's not something we can solve
	 * for here). ¯\_(-_-)_/¯
	 *
	 * NOTE(review): an earlier version of this comment said "only log FATAL
	 * and above", which does not match the WARNING setting below -- confirm
	 * which one is intended.
	 */
	SetConfigOption("log_min_messages", "WARNING", PGC_SUSET, PGC_S_OVERRIDE);
	SetConfigOption("client_min_messages", "ERROR", PGC_SUSET,
					PGC_S_OVERRIDE);

	/*
	 * WAL redo does not need a large number of buffers. And speed of
	 * DropRelationAllLocalBuffers() is proportional to the number of
	 * buffers. So let's keep it small (default value is 1024)
	 */
	num_temp_buffers = 4;
	NBuffers = 4;

	/*
	 * install the simple in-memory smgr
	 */
	smgr_hook = smgr_inmem;
	smgr_init_hook = smgr_init_inmem;

#if PG_VERSION_NUM >= 160000
	/* make rmgr registry believe we can register the resource manager */
	process_shared_preload_libraries_in_progress = true;
	load_file("$libdir/neon_rmgr", false);
	process_shared_preload_libraries_in_progress = false;
#endif

	/* Initialize MaxBackends (if under postmaster, was done already) */
	MaxConnections = 1;
	max_worker_processes = 0;
	max_parallel_workers = 0;
	max_wal_senders = 0;

	InitializeMaxBackends();

#if PG_VERSION_NUM >= 150000
	process_shmem_requests();
	InitializeShmemGUCs();
	/*
	 * This will try to access data directory which we do not set.
	 * Seems to be pretty safe to disable.
	 */
	/* InitializeWalConsistencyChecking(); */
#endif

	/*
	 * We have our own version of CreateSharedMemoryAndSemaphores() that
	 * sets up local memory instead of shared one.
	 */
	CreateFakeSharedMemoryAndSemaphores();

	/*
	 * Remember stand-alone backend startup time,roughly at the same point
	 * during startup that postmaster does so.
	 */
	PgStartTime = GetCurrentTimestamp();

	/*
	 * Create a per-backend PGPROC struct in shared memory. We must do
	 * this before we can use LWLocks.
	 */
	InitAuxiliaryProcess();

	SetProcessingMode(NormalProcessing);

	/* Redo routines won't work if we're not "in recovery" */
	InRecovery = true;

	/*
	 * Create the memory context we will use in the main loop.
	 *
	 * MessageContext is reset once per iteration of the main loop, ie, upon
	 * completion of processing of each command message from the client.
	 */
	MessageContext = AllocSetContextCreate(TopMemoryContext,
										   "MessageContext",
										   ALLOCSET_DEFAULT_SIZES);

	/* we need a ResourceOwner to hold buffer pins */
	Assert(CurrentResourceOwner == NULL);
	CurrentResourceOwner = ResourceOwnerCreate(NULL, "wal redo");

	/* Initialize resource managers */
	for (int rmid = 0; rmid <= RM_MAX_ID; rmid++)
	{
		if (RmgrTable[rmid].rm_startup != NULL)
			RmgrTable[rmid].rm_startup();
	}
	reader_state = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(), NULL);

#ifdef HAVE_LIBSECCOMP
	/* We prefer opt-out to opt-in for greater security */
	enable_seccomp = true;
	for (int i = 1; i < argc; i++)
		if (strcmp(argv[i], "--disable-seccomp") == 0)
			enable_seccomp = false;

	/*
	 * We deliberately delay the transition to the seccomp mode
	 * until it's time to enter the main processing loop;
	 * else we'd have to add a lot more syscalls to the allowlist.
	 */
	if (enable_seccomp)
		enter_seccomp_mode();
#endif /* HAVE_LIBSECCOMP */

	/*
	 * Main processing loop
	 */
	MemoryContextSwitchTo(MessageContext);
	initStringInfo(&input_message);
#if PG_MAJORVERSION_NUM >= 16
	MyBackendType = B_BACKEND;
#endif

	for (;;)
	{
		/* Release memory left over from prior query cycle. */
		resetStringInfo(&input_message);

		set_ps_display("idle");

		/*
		 * Read a command (loop blocks here).  The return value is the
		 * message type byte, or EOF.
		 */
		firstchar = ReadRedoCommand(&input_message);
		switch (firstchar)
		{
			case 'B':			/* BeginRedoForBlock */
				BeginRedoForBlock(&input_message);
				break;

			case 'P':			/* PushPage */
				PushPage(&input_message);
				break;

			case 'A':			/* ApplyRecord */
				ApplyRecord(&input_message);
				break;

			case 'G':			/* GetPage */
				GetPage(&input_message);
				break;

			case 'H':			/* Ping */
				Ping(&input_message);
				break;

				/*
				 * EOF means we're done. Perform normal shutdown.
				 */
			case EOF:
				ereport(LOG,
						(errmsg("received EOF on stdin, shutting down")));

#ifdef HAVE_LIBSECCOMP
				/*
				 * Skip the shutdown sequence, leaving some garbage behind.
				 * Hopefully, postgres will clean it up in the next run.
				 * This way we don't have to enable extra syscalls, which is nice.
				 * See enter_seccomp_mode() above.
				 */
				if (enable_seccomp)
					_exit(0);
#endif /* HAVE_LIBSECCOMP */
				/*
				 * NOTE: if you are tempted to add more code here, DON'T!
				 * Whatever you had in mind to do should be set up as an
				 * on_proc_exit or on_shmem_exit callback, instead. Otherwise
				 * it will fail to be called during other backend-shutdown
				 * scenarios.
				 */
				proc_exit(0);

			default:
				/* any other message type byte is a protocol violation */
				ereport(FATAL,
						(errcode(ERRCODE_PROTOCOL_VIOLATION),
						 errmsg("invalid frontend message type %d",
								firstchar)));
		}
	}							/* end of input-reading loop */
}


/*
 * Initialize dummy shmem.
 *
 * This code follows CreateSharedMemoryAndSemaphores() but manually sets up
 * the shmem header and skips few initialization steps that are not needed for
 * WAL redo.
 *
 * I've also tried removing most of initialization functions that request some
 * memory (like ApplyLauncherShmemInit and friends) but in reality it haven't had
 * any sizeable effect on RSS, so probably such clean up not worth the risk of having
 * half-initialized postgres.
*/
static void
CreateFakeSharedMemoryAndSemaphores(void)
{
	PGShmemHeader *hdr;
	Size		size;
	int			numSemas;
	char		cwd[MAXPGPATH];

#if PG_VERSION_NUM >= 150000
	size = CalculateShmemSize(&numSemas);
#else
	/*
	 * Postgres v14 doesn't have a separate CalculateShmemSize(). Use result of the
	 * corresponding calculation in CreateSharedMemoryAndSemaphores()
	 */
	size = 1409024;
	numSemas = 10;
#endif

	/* Dummy implementation of PGSharedMemoryCreate() */
	{
		/* plain process-local heap memory stands in for the shared segment */
		hdr = (PGShmemHeader *) malloc(size);
		if (!hdr)
			ereport(FATAL,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("[neon-wal-redo] can not allocate (pseudo-) shared memory")));

		hdr->creatorPID = getpid();
		hdr->magic = PGShmemMagic;
		hdr->dsm_control = 0;
		hdr->device = 42; /* not relevant for non-shared memory */
		hdr->inode = 43; /* not relevant for non-shared memory */
		hdr->totalsize = size;
		hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));

		UsedShmemSegAddr = hdr;
		UsedShmemSegID = (unsigned long) 42; /* not relevant for non-shared memory */
	}

	InitShmemAccess(hdr);

	/*
	 * Reserve semaphores uses dir name as a source of entropy. Set it to cwd(). Rest
	 * of the code does not need DataDir access so nullify DataDir after
	 * PGReserveSemaphores() to error out if something will try to access it.
	 */
	if (!getcwd(cwd, MAXPGPATH))
		ereport(FATAL,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("[neon-wal-redo] can not read current directory name")));
	/* cwd is a local array: DataDir must not keep pointing at it on return */
	DataDir = cwd;
	PGReserveSemaphores(numSemas);
	DataDir = NULL;

	/*
	 * The rest of function follows CreateSharedMemoryAndSemaphores() closely,
	 * skipped parts are marked with comments.
	 */
	InitShmemAllocation();

	/*
	 * Now initialize LWLocks, which do shared memory allocation and are
	 * needed for InitShmemIndex.
	 */
	CreateLWLocks();

	/*
	 * Set up shmem.c index hashtable
	 */
	InitShmemIndex();

	/*
	 * Set up xlog, clog, and buffers
	 */
#if PG_MAJORVERSION_NUM >= 17
	DSMRegistryShmemInit();
	VarsupShmemInit();
#endif
	XLOGShmemInit();
	CLOGShmemInit();
	CommitTsShmemInit();
	SUBTRANSShmemInit();
	MultiXactShmemInit();
	InitBufferPool();

	/*
	 * Set up lock manager
	 */
	InitLocks();

	/*
	 * Set up predicate lock manager
	 */
	InitPredicateLocks();

	/*
	 * Set up process table
	 */
	if (!IsUnderPostmaster)
		InitProcGlobal();
	CreateSharedProcArray();
	CreateSharedBackendStatus();
	TwoPhaseShmemInit();
	BackgroundWorkerShmemInit();

	/*
	 * Set up shared-inval messaging
	 */
	CreateSharedInvalidationState();

	/*
	 * Set up interprocess signaling mechanisms
	 */
	PMSignalShmemInit();
	ProcSignalShmemInit();
	CheckpointerShmemInit();
	AutoVacuumShmemInit();
	ReplicationSlotsShmemInit();
	ReplicationOriginShmemInit();
	WalSndShmemInit();
	WalRcvShmemInit();
	PgArchShmemInit();
	ApplyLauncherShmemInit();

	/*
	 * Set up other modules that need some shared memory space
	 */
#if PG_MAJORVERSION_NUM < 17
	/* "snapshot too old" was removed in PG17, and with it the SnapMgr */
	SnapMgrInit();
#endif
	BTreeShmemInit();
	SyncScanShmemInit();
	/* Skip due to the 'pg_notify' directory check */
	/* AsyncShmemInit(); */

#ifdef EXEC_BACKEND

	/*
	 * Alloc the win32 shared backend array
	 */
	if (!IsUnderPostmaster)
		ShmemBackendArrayAllocation();
#endif

	/*
	 * Now give loadable modules a chance to set up their shmem allocations
	 */
	if (shmem_startup_hook)
		shmem_startup_hook();
}


/*
 * Version compatibility wrapper for ReadBufferWithoutRelcache.
 *
 * On PG15 and later the call takes an extra "permanent" argument, which is
 * always passed as true here; the buffer access strategy is always NULL.
 */
static inline Buffer
NeonRedoReadBuffer(NRelFileInfo rinfo,
		   ForkNumber forkNum, BlockNumber blockNum,
		   ReadBufferMode mode)
{
#if PG_VERSION_NUM >= 150000
	return ReadBufferWithoutRelcache(rinfo, forkNum, blockNum, mode,
									 NULL, /* no strategy */
									 true); /* WAL redo is only performed on permanent rels */
#else
	return ReadBufferWithoutRelcache(rinfo, forkNum, blockNum, mode,
									 NULL); /* no strategy */
#endif
}


/*
 * Some debug function that may be
handy for now.
 *
 * Formats 'len' bytes starting at 'data' as two-digit hex values separated
 * by spaces, with a newline after every 32 bytes, and returns the string
 * built in a StringInfo (allocated by initStringInfo()).
 */
pg_attribute_unused()
static char *
pprint_buffer(char *data, int len)
{
	StringInfoData s;

	initStringInfo(&s);
	appendStringInfo(&s, "\n");
	for (int i = 0; i < len; i++) {

		/* mask with 0xff so a negative char does not sign-extend */
		appendStringInfo(&s, "%02x ", (*(((char *) data) + i) & 0xff) );
		if (i % 32 == 31) {
			appendStringInfo(&s, "\n");
		}
	}
	appendStringInfo(&s, "\n");

	return s.data;
}

/* ----------------------------------------------------------------
 *		routines to obtain user input
 * ----------------------------------------------------------------
 */

/*
 * Read next command from the client.
 *
 * A message consists of a 1-byte message type, a 4-byte length in network
 * byte order (which counts itself but not the type byte), and the payload.
 * The payload is stored in inBuf and NUL-terminated; the message type byte
 * is returned.
 *
 * EOF is returned if end-of-file is seen before any header byte was read;
 * time to shut down.  A read error, a truncated message or a length
 * smaller than 4 raises ERROR.
 * ----------------
 */
static int
ReadRedoCommand(StringInfo inBuf)
{
	ssize_t		ret;
	char		hdr[1 + sizeof(int32)];
	int			qtype;
	int32		len;

	/* Read message type and message length */
	ret = buffered_read(hdr, sizeof(hdr));
	if (ret != sizeof(hdr))
	{
		if (ret == 0)
			return EOF;
		else if (ret < 0)
			ereport(ERROR,
					(errcode(ERRCODE_CONNECTION_FAILURE),
					 errmsg("could not read message header: %m")));
		else
			ereport(ERROR,
					(errcode(ERRCODE_PROTOCOL_VIOLATION),
					 errmsg("unexpected EOF")));
	}

	qtype = hdr[0];
	/* memcpy: the length field is not aligned within hdr */
	memcpy(&len, &hdr[1], sizeof(int32));
	len = pg_ntoh32(len);

	if (len < 4)
		ereport(ERROR,
				(errcode(ERRCODE_PROTOCOL_VIOLATION),
				 errmsg("invalid message length")));

	len -= 4;					/* discount length itself */

	/* Read the message payload */
	enlargeStringInfo(inBuf, len);
	ret = buffered_read(inBuf->data, len);
	if (ret != len)
	{
		if (ret < 0)
			ereport(ERROR,
					(errcode(ERRCODE_CONNECTION_FAILURE),
					 errmsg("could not read message: %m")));
		else
			ereport(ERROR,
					(errcode(ERRCODE_PROTOCOL_VIOLATION),
					 errmsg("unexpected EOF")));
	}
	inBuf->len = len;
	inBuf->data[len] = '\0';

	return qtype;
}

/*
 * Prepare for WAL replay on given block
 *
 * Remembers the target block in target_redo_tag, invalidates
 * wal_redo_buffer, and makes sure the cached relation size covers the
 * target block.
 */
static void
BeginRedoForBlock(StringInfo input_message)
{
	NRelFileInfo rinfo;
	ForkNumber forknum;
	BlockNumber blknum;
	SMgrRelation reln;

	/*
	 * message format (in the order the fields are read below):
	 *
	 * ForkNumber (1 byte)
	 * spcNode
	 * dbNode
	 * relNode
	 * BlockNumber
	 */
	forknum = pq_getmsgbyte(input_message);
#if PG_MAJORVERSION_NUM < 16
	rinfo.spcNode = pq_getmsgint(input_message, 4);
	rinfo.dbNode = pq_getmsgint(input_message, 4);
	rinfo.relNode = pq_getmsgint(input_message, 4);
#else
	rinfo.spcOid = pq_getmsgint(input_message, 4);
	rinfo.dbOid = pq_getmsgint(input_message, 4);
	rinfo.relNumber = pq_getmsgint(input_message, 4);
#endif
	blknum = pq_getmsgint(input_message, 4);
	wal_redo_buffer = InvalidBuffer;

	InitBufferTag(&target_redo_tag, &rinfo, forknum, blknum);

	elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u",
		 RelFileInfoFmt(rinfo),
		 target_redo_tag.forkNum,
		 target_redo_tag.blockNum);

	reln = smgropen(rinfo, INVALID_PROC_NUMBER, RELPERSISTENCE_PERMANENT);
	/* grow (never shrink) the cached size so that it includes blknum */
	if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber ||
		reln->smgr_cached_nblocks[forknum] < blknum + 1)
	{
		reln->smgr_cached_nblocks[forknum] = blknum + 1;
	}
}

/*
 * Receive a page given by the client, and put it into buffer cache.
 *
 * The buffer holding the page is also remembered in wal_redo_buffer.
 */
static void
PushPage(StringInfo input_message)
{
	NRelFileInfo rinfo;
	ForkNumber forknum;
	BlockNumber blknum;
	const char *content;
	Buffer		buf;
	Page		page;

	/*
	 * message format (in the order the fields are read below):
	 *
	 * ForkNumber (1 byte)
	 * spcNode
	 * dbNode
	 * relNode
	 * BlockNumber
	 * 8k page content
	 */
	forknum = pq_getmsgbyte(input_message);
#if PG_MAJORVERSION_NUM < 16
	rinfo.spcNode = pq_getmsgint(input_message, 4);
	rinfo.dbNode = pq_getmsgint(input_message, 4);
	rinfo.relNode = pq_getmsgint(input_message, 4);
#else
	rinfo.spcOid = pq_getmsgint(input_message, 4);
	rinfo.dbOid = pq_getmsgint(input_message, 4);
	rinfo.relNumber = pq_getmsgint(input_message, 4);
#endif
	blknum = pq_getmsgint(input_message, 4);
	content = pq_getmsgbytes(input_message, BLCKSZ);

	/* RBM_ZERO_AND_LOCK: the page is overwritten entirely just below */
	buf = NeonRedoReadBuffer(rinfo, forknum, blknum, RBM_ZERO_AND_LOCK);
	wal_redo_buffer = buf;
	page = BufferGetPage(buf);
	memcpy(page, content, BLCKSZ);
	MarkBufferDirty(buf); /* pro forma */
	UnlockReleaseBuffer(buf);
}

/*
 * Receive a WAL record, and apply it.
*
 * All the pages should be loaded into the buffer cache by PushPage calls already.
 *
 * The record is decoded directly from the message payload (no WAL files
 * are read) and handed to the rm_redo routine of its resource manager,
 * with redo_block_filter installed as redo_read_buffer_filter for the
 * duration of the call.
 */
static void
ApplyRecord(StringInfo input_message)
{
	char	   *errormsg;
	XLogRecPtr	lsn;
	XLogRecord *record;
	int			nleft;
	ErrorContextCallback errcallback;
#if PG_VERSION_NUM >= 150000
	DecodedXLogRecord *decoded;
#define STATIC_DECODEBUF_SIZE (64 * 1024)
	static char *static_decodebuf = NULL;
	size_t		required_space;
#endif

	/*
	 * message format:
	 *
	 * LSN (the *end* of the record)
	 * record
	 */
	lsn = pq_getmsgint64(input_message);

	smgrinit();					/* reset inmem smgr state */

	/* note: the input must be aligned here */
	record = (XLogRecord *) pq_getmsgbytes(input_message, sizeof(XLogRecord));

	/* the rest of the message must be exactly the remainder of the record */
	nleft = input_message->len - input_message->cursor;
	if (record->xl_tot_len != sizeof(XLogRecord) + nleft)
		elog(ERROR, "mismatch between record (%d) and message size (%d)",
			 record->xl_tot_len, (int) sizeof(XLogRecord) + nleft);

	/* Setup error traceback support for ereport() */
	errcallback.callback = apply_error_callback;
	errcallback.arg = (void *) reader_state;
	errcallback.previous = error_context_stack;
	error_context_stack = &errcallback;

	XLogBeginRead(reader_state, lsn);

#if PG_VERSION_NUM >= 150000
	/*
	 * For reasonably small records, reuse a fixed size buffer to reduce
	 * palloc overhead.
	 */
	required_space = DecodeXLogRecordRequiredSpace(record->xl_tot_len);
	if (required_space <= STATIC_DECODEBUF_SIZE)
	{
		/* allocated once in TopMemoryContext and kept for later calls */
		if (static_decodebuf == NULL)
			static_decodebuf = MemoryContextAlloc(TopMemoryContext, STATIC_DECODEBUF_SIZE);
		decoded = (DecodedXLogRecord *) static_decodebuf;
	}
	else
		decoded = palloc(required_space);

	if (!DecodeXLogRecord(reader_state, decoded, record, lsn, &errormsg))
		elog(ERROR, "failed to decode WAL record: %s", errormsg);
	else
	{
		/* Record the location of the next record. */
		decoded->next_lsn = reader_state->NextRecPtr;

		/*
		 * Update the pointers to the beginning and one-past-the-end of this
		 * record, again for the benefit of historical code that expected the
		 * decoder to track this rather than accessing these fields of the record
		 * itself.
		 */
		reader_state->record = decoded;
		reader_state->ReadRecPtr = decoded->lsn;
		reader_state->EndRecPtr = decoded->next_lsn;
	}
#else
	/*
	 * In lieu of calling XLogReadRecord, store the record 'decoded_record'
	 * buffer directly.
	 */
	reader_state->ReadRecPtr = lsn;
	reader_state->decoded_record = record;
	if (!DecodeXLogRecord(reader_state, record, &errormsg))
		elog(ERROR, "failed to decode WAL record: %s", errormsg);
#endif

	/* Ignore any other blocks than the ones the caller is interested in */
	redo_read_buffer_filter = redo_block_filter;

	RmgrTable[record->xl_rmid].rm_redo(reader_state);

	/*
	 * If no base image of the page was provided by PushPage, initialize
	 * wal_redo_buffer here. The first WAL record must initialize the page
	 * in that case.
	 */
	if (BufferIsInvalid(wal_redo_buffer))
	{
		wal_redo_buffer = NeonRedoReadBuffer(BufTagGetNRelFileInfo(target_redo_tag),
											 target_redo_tag.forkNum,
											 target_redo_tag.blockNum,
											 RBM_NORMAL);
		Assert(!BufferIsInvalid(wal_redo_buffer));

		ReleaseBuffer(wal_redo_buffer);
	}

	redo_read_buffer_filter = NULL;

	/* Pop the error context stack */
	error_context_stack = errcallback.previous;

	elog(TRACE, "applied WAL record with LSN %X/%X",
		 (uint32) (lsn >> 32), (uint32) lsn);

#if PG_VERSION_NUM >= 150000
	/* only free the decode buffer if it was palloc'd for this record */
	if ((char *) decoded != static_decodebuf)
		pfree(decoded);
#endif
}

/*
 * Error context callback for errors occurring during ApplyRecord
 *
 * 'arg' is the XLogReaderState installed by ApplyRecord.  If a record has
 * been decoded, its description is included in the context line.
 */
static void
apply_error_callback(void *arg)
{
	XLogReaderState *record = (XLogReaderState *) arg;
	StringInfoData buf;

	initStringInfo(&buf);
#if PG_VERSION_NUM >= 150000
	if (record->record)
#else
	if (record->decoded_record)
#endif
		xlog_outdesc(&buf, record);

	/* translator: %s is a WAL record description */
	errcontext("WAL redo at %X/%X for %s",
			   LSN_FORMAT_ARGS(record->ReadRecPtr),
			   buf.data);

	pfree(buf.data);
}



static bool
redo_block_filter(XLogReaderState *record, uint8 block_id)
{
	BufferTag	target_tag;
	NRelFileInfo rinfo;

#if PG_VERSION_NUM >= 150000
	XLogRecGetBlockTag(record, block_id,
					   &rinfo, &target_tag.forkNum, &target_tag.blockNum);
#else
	if (!XLogRecGetBlockTag(record, block_id,
							&rinfo, &target_tag.forkNum, &target_tag.blockNum))
	{
		/* Caller specified a bogus block_id */
		elog(PANIC, "failed to locate backup block with ID %d", block_id);
	}
#endif
	CopyNRelFileInfoToBufTag(target_tag, rinfo);

	/*
	 * Can a WAL redo function ever access a relation other than the one that
	 * it modifies? I don't see why it would.
	 * Custom RMGRs may be affected by this.
*/ if (!RelFileInfoEquals(rinfo, BufTagGetNRelFileInfo(target_redo_tag))) elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u", RelFileInfoFmt(rinfo), target_tag.forkNum, target_tag.blockNum); /* * If this block isn't one we are currently restoring, then return 'true' * so that this gets ignored */ return !BufferTagsEqual(&target_tag, &target_redo_tag); } /* * Get a page image back from buffer cache. * * After applying some records. */ static void GetPage(StringInfo input_message) { NRelFileInfo rinfo; ForkNumber forknum; BlockNumber blknum; Buffer buf; Page page; int tot_written; /* * message format: * * spcNode * dbNode * relNode * ForkNumber * BlockNumber */ forknum = pq_getmsgbyte(input_message); #if PG_MAJORVERSION_NUM < 16 rinfo.spcNode = pq_getmsgint(input_message, 4); rinfo.dbNode = pq_getmsgint(input_message, 4); rinfo.relNode = pq_getmsgint(input_message, 4); #else rinfo.spcOid = pq_getmsgint(input_message, 4); rinfo.dbOid = pq_getmsgint(input_message, 4); rinfo.relNumber = pq_getmsgint(input_message, 4); #endif blknum = pq_getmsgint(input_message, 4); /* FIXME: check that we got a BeginRedoForBlock message or this earlier */ buf = NeonRedoReadBuffer(rinfo, forknum, blknum, RBM_NORMAL); Assert(buf == wal_redo_buffer); page = BufferGetPage(buf); /* single thread, so don't bother locking the page */ /* Response: Page content */ tot_written = 0; do { ssize_t rc; rc = write(STDOUT_FILENO, &page[tot_written], BLCKSZ - tot_written); if (rc < 0) { /* If interrupted by signal, just retry */ if (errno == EINTR) continue; ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to stdout: %m"))); } tot_written += rc; } while (tot_written < BLCKSZ); ReleaseBuffer(buf); DropRelationAllLocalBuffers(rinfo); wal_redo_buffer = InvalidBuffer; elog(TRACE, "Page sent back for block %u", blknum); } static void Ping(StringInfo input_message) { int tot_written; /* Response: the input message */ tot_written = 0; do { ssize_t rc; /* We don't need 
alignment, but it's bad practice to use char[BLCKSZ] */ #if PG_VERSION_NUM >= 160000 static const PGIOAlignedBlock response; #else static const PGAlignedBlock response; #endif rc = write(STDOUT_FILENO, &response.data[tot_written], BLCKSZ - tot_written); if (rc < 0) { /* If interrupted by signal, just retry */ if (errno == EINTR) continue; ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to stdout: %m"))); } tot_written += rc; } while (tot_written < BLCKSZ); elog(TRACE, "Page sent back for ping"); } /* Buffer used by buffered_read() */ static char stdin_buf[16 * 1024]; static size_t stdin_len = 0; /* # of bytes in buffer */ static size_t stdin_ptr = 0; /* # of bytes already consumed */ /* * Like read() on stdin, but buffered. * * We cannot use libc's buffered fread(), because it uses syscalls that we * have disabled with seccomp(). Depending on the platform, it can call * 'fstat' or 'newfstatat'. 'fstat' is probably harmless, but 'newfstatat' * seems problematic because it allows interrogating files by path name. * * The return value is the number of bytes read. On error, -1 is returned, and * errno is set appropriately. Unlike read(), this fills the buffer completely * unless an error happens or EOF is reached. */ static ssize_t buffered_read(void *buf, size_t count) { char *dst = buf; while (count > 0) { size_t nthis; if (stdin_ptr == stdin_len) { ssize_t ret; ret = read(STDIN_FILENO, stdin_buf, sizeof(stdin_buf)); if (ret < 0) { /* don't do anything here that could set 'errno' */ return ret; } if (ret == 0) { /* EOF */ break; } stdin_len = (size_t) ret; stdin_ptr = 0; } nthis = Min(stdin_len - stdin_ptr, count); memcpy(dst, &stdin_buf[stdin_ptr], nthis); stdin_ptr += nthis; count -= nthis; dst += nthis; } return (dst - (char *) buf); } ================================================ FILE: postgres.mk ================================================ # Sub-makefile for compiling PostgreSQL as part of Neon. 
This is # included from the main Makefile, and is not meant to be called # directly. # # CI workflows and Dockerfiles can take advantage of the following # properties for caching: # # - Compiling the targets in this file only builds the PostgreSQL sources # under the vendor/ subdirectory, nothing else from the repository. # - All outputs go to POSTGRES_INSTALL_DIR (by default 'pg_install', # see parent Makefile) # - intermediate build artifacts go to BUILD_DIR # # # Variables passed from the parent Makefile that control what gets # installed and where: # - POSTGRES_VERSIONS # - POSTGRES_INSTALL_DIR # - BUILD_DIR # # Variables passed from the parent Makefile that affect the build # process and the resulting binaries: # - PG_CONFIGURE_OPTS # - PG_CFLAGS # - PG_LDFLAGS # - EXTRA_PATH_OVERRIDES ### ### Main targets ### ### These are called from the main Makefile, and can also be called ### directly from command line # Compile and install a specific PostgreSQL version postgres-install-%: postgres-configure-% \ postgres-headers-install-% # to prevent `make install` conflicts with neon's `postgres-headers` # Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)//include # # This is implicitly part of the 'postgres-install-%' target, but this can be handy # if you want to install just the headers without building PostgreSQL, e.g. for building # extensions. 
postgres-headers-install-%: postgres-configure-% +@echo "Installing PostgreSQL $* headers" $(MAKE) -C $(BUILD_DIR)/$*/src/include MAKELEVEL=0 install # Run Postgres regression tests postgres-check-%: postgres-install-% $(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 check ### ### Shorthands for the main targets, for convenience ### # Same as the above main targets, but for all supported PostgreSQL versions # For example, 'make postgres-install' is equivalent to # 'make postgres-install-v14 postgres-install-v15 postgres-install-v16 postgres-install-v17' all_version_targets=postgres-install postgres-headers-install postgres-check .PHONY: $(all_version_targets) $(all_version_targets): postgres-%: $(foreach pg_version,$(POSTGRES_VERSIONS),postgres-%-$(pg_version)) .PHONY: postgres postgres: postgres-install .PHONY: postgres-headers postgres-headers: postgres-headers-install # 'postgres-v17' is an alias for 'postgres-install-v17' etc. $(foreach pg_version,$(POSTGRES_VERSIONS),postgres-$(pg_version)): postgres-%: postgres-install-% ### ### Intermediate targets ### ### These are not intended to be called directly, but are dependencies for the ### main targets. # Run 'configure' $(BUILD_DIR)/%/config.status: mkdir -p $(BUILD_DIR) test -e $(BUILD_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(BUILD_DIR)/CACHEDIR.TAG +@echo "Configuring Postgres $* build" @test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \ echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \ echo "'git submodule update --init --recursive --depth 2 --progress .' 
in project root.\n"; \ exit 1; } mkdir -p $(BUILD_DIR)/$* VERSION=$*; \ EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \ (cd $(BUILD_DIR)/$$VERSION && \ env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \ CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \ $(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \ --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log) # nicer alias to run 'configure'. # # This tries to accomplish this rule: # # postgres-configure-%: $(BUILD_DIR)/%/config.status # # XXX: I'm not sure why the above rule doesn't work directly. But this accomplishses # the same thing $(foreach pg_version,$(POSTGRES_VERSIONS),postgres-configure-$(pg_version)): postgres-configure-%: FORCE $(BUILD_DIR)/%/config.status # Compile and install PostgreSQL (and a few contrib modules used in tests) postgres-install-%: postgres-configure-% \ postgres-headers-install-% # to prevent `make install` conflicts with neon's `postgres-headers-install` +@echo "Compiling PostgreSQL $*" $(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 install $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_prewarm install $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_buffercache install $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_visibility install $(MAKE) -C $(BUILD_DIR)/$*/contrib/pageinspect install $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_trgm install $(MAKE) -C $(BUILD_DIR)/$*/contrib/amcheck install $(MAKE) -C $(BUILD_DIR)/$*/contrib/test_decoding install .PHONY: FORCE FORCE: ================================================ FILE: pre-commit.py ================================================ #!/usr/bin/env python3 from __future__ import annotations import argparse import enum import os import subprocess import sys @enum.unique class Color(enum.Enum): RED = "\033[0;31m" GREEN = "\033[0;33m" CYAN = "\033[0;36m" NC = "\033[0m" # No Color def colorify( s: str, color: Color, no_color: bool = False, ): if no_color: 
return s return f"{color.value}{s}{NC}" def cargo_fmt(fix_inplace: bool = False, no_color: bool = False) -> str: cmd = "cargo fmt" if not fix_inplace: cmd += " --check" if no_color: cmd += " -- --color=never" return cmd def ruff_check(fix_inplace: bool) -> str: cmd = "poetry run ruff check" if fix_inplace: cmd += " --fix" return cmd def ruff_format(fix_inplace: bool) -> str: cmd = "poetry run ruff format" if not fix_inplace: cmd += " --diff --check" return cmd def mypy() -> str: return "poetry run mypy" def get_commit_files() -> list[str]: files = subprocess.check_output("git diff --cached --name-only --diff-filter=ACM".split()) return files.decode().splitlines() def check( name: str, suffix: str, cmd: str, changed_files: list[str], no_color: bool = False, append_files_to_cmd: bool = True, ): print(f"Checking: {name} ", end="") applicable_files = list(filter(lambda fname: fname.strip().endswith(suffix), changed_files)) if not applicable_files: print(colorify("[NOT APPLICABLE]", Color.CYAN, no_color)) return if append_files_to_cmd: cmd = f"{cmd} {' '.join(applicable_files)}" res = subprocess.run(cmd.split(), capture_output=True) if res.returncode != 0: print(colorify("[FAILED]", Color.RED, no_color)) if name == "mypy": print("Please inspect the output below and fix type mismatches.") else: print("Please inspect the output below and run make fmt to fix automatically.") if suffix == ".py": print( "If the output is empty, ensure that you've installed Python tooling by\n" "running './scripts/pysync' in the current directory (no root needed)" ) print() print(res.stdout.decode()) sys.exit(1) print(colorify("[OK]", Color.GREEN, no_color)) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--fix-inplace", action="store_true", help="apply fixes inplace") parser.add_argument( "--no-color", action="store_true", help="disable colored output", default=not sys.stdout.isatty() or os.getenv("TERM") == "dumb", ) args = parser.parse_args() files = 
get_commit_files() check( name="cargo fmt", suffix=".rs", cmd=cargo_fmt(fix_inplace=args.fix_inplace, no_color=args.no_color), changed_files=files, no_color=args.no_color, append_files_to_cmd=False, ) check( name="ruff check", suffix=".py", cmd=ruff_check(fix_inplace=args.fix_inplace), changed_files=files, no_color=args.no_color, ) check( name="ruff format", suffix=".py", cmd=ruff_format(fix_inplace=args.fix_inplace), changed_files=files, no_color=args.no_color, ) check( name="mypy", suffix=".py", cmd=mypy(), changed_files=files, no_color=args.no_color, ) ================================================ FILE: proxy/Cargo.toml ================================================ [package] name = "proxy" version = "0.1.0" edition = "2024" license.workspace = true [features] default = [] testing = ["dep:tokio-postgres"] rest_broker = ["dep:subzero-core", "dep:ouroboros"] [dependencies] ahash.workspace = true anyhow.workspace = true arc-swap.workspace = true async-compression.workspace = true async-trait.workspace = true atomic-take.workspace = true aws-config.workspace = true aws-credential-types.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true base64.workspace = true bstr.workspace = true bytes = { workspace = true, features = ["serde"] } camino.workspace = true chrono.workspace = true clap = { workspace = true, features = ["derive", "env"] } clashmap.workspace = true compute_api.workspace = true consumption_metrics.workspace = true env_logger.workspace = true framed-websockets.workspace = true futures.workspace = true hashbrown.workspace = true hex.workspace = true hmac.workspace = true hostname.workspace = true http.workspace = true http-utils.workspace = true humantime.workspace = true humantime-serde.workspace = true hyper0.workspace = true hyper = { workspace = true, features = ["server", "http1", "http2"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } http-body-util = { version = "0.1" } gettid = "0.1.3" 
indexmap = { workspace = true, features = ["serde"] } ipnet.workspace = true itertools.workspace = true itoa.workspace = true json = { path = "../libs/proxy/json" } lasso = { workspace = true, features = ["multi-threaded"] } measured = { workspace = true, features = ["lasso"] } metrics.workspace = true moka.workspace = true once_cell.workspace = true opentelemetry = { workspace = true, features = ["trace"] } papaya = "0.2.0" parking_lot.workspace = true parquet.workspace = true parquet_derive.workspace = true pin-project-lite.workspace = true postgres_backend.workspace = true postgres-client = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" } postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" } pq_proto.workspace = true rand.workspace = true rand_core.workspace = true regex.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } reqwest = { workspace = true, features = ["rustls-tls-native-roots"] } reqwest-middleware = { workspace = true, features = ["json"] } reqwest-retry.workspace = true reqwest-tracing.workspace = true rustc-hash.workspace = true rustls.workspace = true rustls-native-certs.workspace = true rustls-pemfile.workspace = true scopeguard.workspace = true serde.workspace = true serde_json.workspace = true sha2 = { workspace = true, features = ["asm", "oid"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true strum_macros.workspace = true subtle.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } tokio-postgres = { workspace = true, optional = true } tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true tracing-log.workspace = true tracing-opentelemetry.workspace = true try-lock.workspace = 
true typed-json.workspace = true url.workspace = true urlencoding.workspace = true utils.workspace = true uuid.workspace = true x509-cert.workspace = true redis.workspace = true zerocopy.workspace = true zeroize.workspace = true # uncomment this to use the real subzero-core crate # subzero-core = { git = "https://github.com/neondatabase/subzero", rev = "396264617e78e8be428682f87469bb25429af88a", features = ["postgresql"], optional = true } # this is a stub for the subzero-core crate subzero-core = { path = "../libs/proxy/subzero_core", features = ["postgresql"], optional = true} ouroboros = { version = "0.18", optional = true } # jwt stuff jose-jwa = "0.1.2" jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] } signature = "2" ecdsa = "0.16" p256 = { version = "0.13", features = ["jwk"] } ed25519-dalek = { version = "2", default-features = false, features = ["rand_core"] } rsa = "0.9" workspace_hack.workspace = true [dev-dependencies] assert-json-diff.workspace = true camino-tempfile.workspace = true fallible-iterator.workspace = true flate2.workspace = true tokio-tungstenite.workspace = true pbkdf2 = { workspace = true, features = ["simple", "std"] } rcgen.workspace = true rstest.workspace = true walkdir.workspace = true rand_distr = "0.5" tokio-postgres.workspace = true tracing-test = "0.2" ================================================ FILE: proxy/README.md ================================================ # Proxy Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme and cluster routing method. Following routing backends are currently implemented: * console new SCRAM-based console API; uses SNI info to select the destination project (endpoint soon) * postgres uses postgres to select auth secrets of existing roles. 
Useful for local testing * web (or link) sends login link for all usernames Also proxy can expose following services to the external world: * postgres protocol over TCP -- usual postgres endpoint compatible with usual postgres drivers * postgres protocol over WebSockets -- same protocol tunneled over websockets for environments where TCP connection is not available. We have our own implementation of a client that uses node-postgres and tunnels traffic through websockets: https://github.com/neondatabase/serverless * SQL over HTTP -- service that accepts POST requests with SQL text over HTTP and responds with JSON-serialised results. ## SQL over HTTP Contrary to the usual postgres proto over TCP and WebSockets using plain one-shot HTTP request achieves smaller amortized latencies in edge setups due to fewer round trips and an enhanced open connection reuse by the v8 engine. Also such endpoint could be used directly without any driver. To play with it locally one may start proxy over a local postgres installation (see end of this page on how to generate certs with openssl): ``` LOGFMT=text ./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444 ``` If both postgres and proxy are running you may send a SQL query: ```console curl -k -X POST 'https://proxy.local.neon.build:4444/sql' \ -H 'Neon-Connection-String: postgres://stas:pass@proxy.local.neon.build:4444/postgres' \ -H 'Content-Type: application/json' \ --data '{ "query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num", "params":[ "{{1,2},{\"3\",4}}", {"key":"val", "ikey":4242}] }' | jq ``` ```json { "command": "SELECT", "fields": [ { "dataTypeID": 1007, "name": "arr" }, { "dataTypeID": 3802, "name": "obj" }, { "dataTypeID": 23, "name": "num" } ], "rowCount": 1, "rows": [ { "arr": [[1,2],[3,4]], "num": 42, "obj": { "ikey": 4242, "key": "val" } } ] } ``` With the current approach we made the following design decisions: 1. 
SQL injection protection: We employed the extended query protocol, modifying the rust-postgres driver to send queries in one roundtrip using a text protocol rather than binary, bypassing potential issues like those identified in sfackler/rust-postgres#1030. 2. Postgres type compatibility: As not all postgres types have binary representations (e.g., acl's in pg_class), we adjusted rust-postgres to respond with text protocol, simplifying serialization and fixing queries with text-only types in response. 3. Data type conversion: Considering JSON supports fewer data types than Postgres, we perform conversions where possible, passing all other types as strings. Key conversions include: - postgres int2, int4, float4, float8 -> json number (NaN and Inf remain text) - postgres bool, null, text -> json bool, null, string - postgres array -> json array - postgres json and jsonb -> json object 4. Alignment with node-postgres: To facilitate integration with js libraries, we've matched the response structure of node-postgres, returning command tags and column oids. Command tag capturing was added to the rust-postgres functionality as part of this change. ### Output options User can pass several optional headers that will affect resulting json. 1. `Neon-Raw-Text-Output: true`. Return postgres values as text, without parsing them. So numbers, objects, booleans, nulls and arrays will be returned as text. That can be useful in cases when client code wants to implement its own parsing or reuse parsing libraries from e.g. node-postgres. 2. `Neon-Array-Mode: true`. Return postgres rows as arrays instead of objects. That is a more compact representation and also helps in some edge cases where it is hard to use rows represented as objects (e.g. when several fields have the same name). ## Test proxy locally Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`.
Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use `*.local.neon.build`, which resolves to `127.0.0.1`. We will need to have a postgres instance. Assuming that we have set up docker we can set it up as follows: ```sh docker run \ --detach \ --name proxy-postgres \ --env POSTGRES_PASSWORD=proxy-postgres \ --publish 5432:5432 \ postgres:17-bookworm ``` Next step is setting up auth table and schema as well as creating role (without the JWT table): ```sh docker exec -it proxy-postgres psql -U postgres -c "CREATE SCHEMA IF NOT EXISTS neon_control_plane" docker exec -it proxy-postgres psql -U postgres -c "CREATE TABLE neon_control_plane.endpoints (endpoint_id VARCHAR(255) PRIMARY KEY, allowed_ips VARCHAR(255))" docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPERUSER LOGIN PASSWORD 'password';" ``` If you want to test query cancellation, redis is also required: ```sh docker run --detach --name proxy-redis --publish 6379:6379 redis:7.0 ``` Let's create a self-signed certificate by running: ```sh openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.local.neon.build" ``` Then we need to build proxy with 'testing' feature and run, e.g.: ```sh RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- \ --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' \ --redis-auth-type="plain" --redis-plain="redis://127.0.0.1:6379" \ -c server.crt -k server.key ``` Now from client you can start a new session: ```sh PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.local.neon.build:4432/postgres?sslmode=verify-full" ``` ## auth broker setup: Create a postgres instance: ```sh docker run \ --detach \ --name proxy-postgres \ --env POSTGRES_HOST_AUTH_METHOD=trust \ --env POSTGRES_USER=authenticated \ --env POSTGRES_DB=database \ --publish 5432:5432 \ postgres:17-bookworm ``` Create a configuration file called
`local_proxy.json` in the root of the repo (used also by the auth broker to validate JWTs) ```sh { "jwks": [ { "id": "1", "role_names": ["authenticator", "authenticated", "anon"], "jwks_url": "https://climbing-minnow-11.clerk.accounts.dev/.well-known/jwks.json", "provider_name": "foo", "jwt_audience": null } ] } ``` Start the local proxy: ```sh cargo run --bin local_proxy --features testing -- \ --disable-pg-session-jwt \ --http 0.0.0.0:7432 ``` Start the auth/rest broker: Note: to enable the rest broker you need to replace the stub subzero-core crate with the real one. ```sh cargo add -p proxy subzero-core --git https://github.com/neondatabase/subzero --rev 396264617e78e8be428682f87469bb25429af88a ``` ```sh LOGFMT=text OTEL_SDK_DISABLED=true cargo run --bin proxy --features testing,rest_broker -- \ -c server.crt -k server.key \ --is-auth-broker true \ --is-rest-broker true \ --wss 0.0.0.0:8080 \ --http 0.0.0.0:7002 \ --auth-backend local ``` Create a JWT in your auth provider (e.g. Clerk) and set it in the `NEON_JWT` environment variable. ```sh export NEON_JWT="..." 
``` Run a query against the auth broker: ```sh curl -k "https://foo.local.neon.build:8080/sql" \ -H "Authorization: Bearer $NEON_JWT" \ -H "neon-connection-string: postgresql://authenticator@foo.local.neon.build/database" \ -d '{"query":"select 1","params":[]}' ``` Make a rest request against the auth broker (rest broker): ```sh curl -k "https://foo.local.neon.build:8080/database/rest/v1/items?select=id,name&id=eq.1" \ -H "Authorization: Bearer $NEON_JWT" ``` ================================================ FILE: proxy/src/auth/backend/classic.rs ================================================ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info, warn}; use super::{ComputeCredentials, ComputeUserInfo}; use crate::auth::backend::ComputeCredentialKeys; use crate::auth::{self, AuthFlow}; use crate::config::AuthenticationConfig; use crate::context::RequestContext; use crate::control_plane::AuthSecret; use crate::stream::{PqStream, Stream}; use crate::{compute, sasl}; pub(super) async fn authenticate( ctx: &RequestContext, creds: ComputeUserInfo, client: &mut PqStream>, config: &'static AuthenticationConfig, secret: AuthSecret, ) -> auth::Result { let scram_keys = match secret { AuthSecret::Scram(secret) => { debug!("auth endpoint chooses SCRAM"); let auth_outcome = tokio::time::timeout( config.scram_protocol_timeout, AuthFlow::new(client, auth::Scram(&secret, ctx)).authenticate(), ) .await .inspect_err(|_| warn!("error processing scram messages error = authentication timed out, execution time exceeded {} seconds", config.scram_protocol_timeout.as_secs())) .map_err(auth::AuthError::user_timeout)? .inspect_err(|error| warn!(?error, "error processing scram messages"))?; let client_key = match auth_outcome { sasl::Outcome::Success(key) => key, sasl::Outcome::Failure(reason) => { // TODO: warnings? // TODO: should we get rid of this because double logging? 
info!("auth backend failed with an error: {reason}"); return Err(auth::AuthError::password_failed(&*creds.user)); } }; compute::ScramKeys { client_key: client_key.as_bytes(), server_key: secret.server_key.as_bytes(), } } }; Ok(ComputeCredentials { info: creds, keys: ComputeCredentialKeys::AuthKeys(postgres_client::config::AuthKeys::ScramSha256( scram_keys, )), }) } ================================================ FILE: proxy/src/auth/backend/console_redirect.rs ================================================ use std::fmt; use async_trait::async_trait; use postgres_client::config::SslMode; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span}; use crate::auth::backend::ComputeUserInfo; use crate::cache::Cached; use crate::cache::node_info::CachedNodeInfo; use crate::compute::AuthInfo; use crate::config::AuthenticationConfig; use crate::context::RequestContext; use crate::control_plane::client::cplane_proxy_v1; use crate::control_plane::{self, NodeInfo}; use crate::error::{ReportableError, UserFacingError}; use crate::pqproto::BeMessage; use crate::proxy::NeonOptions; use crate::proxy::wake_compute::WakeComputeBackend; use crate::stream::PqStream; use crate::types::RoleName; use crate::{auth, compute, waiters}; #[derive(Debug, Error)] pub(crate) enum ConsoleRedirectError { #[error(transparent)] WaiterRegister(#[from] waiters::RegisterError), #[error(transparent)] WaiterWait(#[from] waiters::WaitError), #[error(transparent)] Io(#[from] std::io::Error), } #[derive(Debug)] pub struct ConsoleRedirectBackend { console_uri: reqwest::Url, api: cplane_proxy_v1::NeonControlPlaneClient, } impl fmt::Debug for cplane_proxy_v1::NeonControlPlaneClient { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "NeonControlPlaneClient") } } impl UserFacingError for ConsoleRedirectError { fn to_string_client(&self) -> String { "Internal error".to_string() } } impl ReportableError for ConsoleRedirectError { fn get_error_kind(&self) 
-> crate::error::ErrorKind { match self { Self::WaiterRegister(_) => crate::error::ErrorKind::Service, Self::WaiterWait(_) => crate::error::ErrorKind::Service, Self::Io(_) => crate::error::ErrorKind::ClientDisconnect, } } } fn hello_message( redirect_uri: &reqwest::Url, session_id: &str, duration: std::time::Duration, ) -> String { let formatted_duration = humantime::format_duration(duration).to_string(); format!( concat![ "Welcome to Neon!\n", "Authenticate by visiting (will expire in {duration}):\n", " {redirect_uri}{session_id}\n\n", ], duration = formatted_duration, redirect_uri = redirect_uri, session_id = session_id, ) } pub(crate) fn new_psql_session_id() -> String { hex::encode(rand::random::<[u8; 8]>()) } impl ConsoleRedirectBackend { pub fn new(console_uri: reqwest::Url, api: cplane_proxy_v1::NeonControlPlaneClient) -> Self { Self { console_uri, api } } pub(crate) fn get_api(&self) -> &cplane_proxy_v1::NeonControlPlaneClient { &self.api } pub(crate) async fn authenticate( &self, ctx: &RequestContext, auth_config: &'static AuthenticationConfig, client: &mut PqStream, ) -> auth::Result<(ConsoleRedirectNodeInfo, AuthInfo, ComputeUserInfo)> { authenticate(ctx, auth_config, &self.console_uri, client) .await .map(|(node_info, auth_info, user_info)| { (ConsoleRedirectNodeInfo(node_info), auth_info, user_info) }) } } pub struct ConsoleRedirectNodeInfo(pub(super) NodeInfo); #[async_trait] impl WakeComputeBackend for ConsoleRedirectNodeInfo { async fn wake_compute( &self, _ctx: &RequestContext, ) -> Result { Ok(Cached::new_uncached(self.0.clone())) } } async fn authenticate( ctx: &RequestContext, auth_config: &'static AuthenticationConfig, link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result<(NodeInfo, AuthInfo, ComputeUserInfo)> { ctx.set_auth_method(crate::context::AuthMethod::ConsoleRedirect); // registering waiter can fail if we get unlucky with rng. // just try again. 
let (psql_session_id, waiter) = loop { let psql_session_id = new_psql_session_id(); if let Ok(waiter) = control_plane::mgmt::get_waiter(&psql_session_id) { break (psql_session_id, waiter); } }; let span = info_span!("console_redirect", psql_session_id = &psql_session_id); let greeting = hello_message( link_uri, &psql_session_id, auth_config.console_redirect_confirmation_timeout, ); // Give user a URL to spawn a new database. info!(parent: &span, "sending the auth URL to the user"); client.write_message(BeMessage::AuthenticationOk); client.write_message(BeMessage::ParameterStatus { name: b"client_encoding", value: b"UTF8", }); client.write_message(BeMessage::NoticeResponse(&greeting)); client.flush().await?; // Wait for console response via control plane (see `mgmt`). info!(parent: &span, "waiting for console's reply..."); let db_info = tokio::time::timeout(auth_config.console_redirect_confirmation_timeout, waiter) .await .map_err(|_elapsed| { auth::AuthError::confirmation_timeout( auth_config.console_redirect_confirmation_timeout.into(), ) })? .map_err(ConsoleRedirectError::from)?; if auth_config.ip_allowlist_check_enabled && let Some(allowed_ips) = &db_info.allowed_ips && !auth::check_peer_addr_is_in_list(&ctx.peer_addr(), allowed_ips) { return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); } // Check if the access over the public internet is allowed, otherwise block. Note that // the console redirect is not behind the VPC service endpoint, so we don't need to check // the VPC endpoint ID. if let Some(public_access_allowed) = db_info.public_access_allowed && !public_access_allowed { return Err(auth::AuthError::NetworkNotAllowed); } // Backwards compatibility. pg_sni_proxy uses "--" in domain names // while direct connections do not. Once we migrate to pg_sni_proxy // everywhere, we can remove this. 
let ssl_mode = if db_info.host.contains("--") { // we need TLS connection with SNI info to properly route it SslMode::Require } else { SslMode::Disable }; let conn_info = compute::ConnectInfo { host: db_info.host.into(), port: db_info.port, ssl_mode, host_addr: None, }; let auth_info = AuthInfo::for_console_redirect(&db_info.dbname, &db_info.user, db_info.password.as_deref()); let user: RoleName = db_info.user.into(); let user_info = ComputeUserInfo { endpoint: db_info.aux.endpoint_id.as_str().into(), user: user.clone(), options: NeonOptions::default(), }; ctx.set_dbname(db_info.dbname.into()); ctx.set_user(user); ctx.set_project(db_info.aux.clone()); info!("woken up a compute node"); Ok(( NodeInfo { conn_info, aux: db_info.aux, }, auth_info, user_info, )) } ================================================ FILE: proxy/src/auth/backend/hacks.rs ================================================ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info}; use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; use crate::auth::{self, AuthFlow}; use crate::config::AuthenticationConfig; use crate::context::RequestContext; use crate::control_plane::AuthSecret; use crate::intern::{EndpointIdInt, RoleNameInt}; use crate::sasl; use crate::stream::{self, Stream}; /// Compared to [SCRAM](crate::scram), cleartext password auth saves /// one round trip and *expensive* computations (>= 4096 HMAC iterations). /// These properties are benefical for serverless JS workers, so we /// use this mechanism for websocket connections. 
///
/// Builds a cleartext [`AuthFlow`] for `info.endpoint` / `info.user` from `secret` and
/// `config.scram_thread_pool`, then runs it against `client`.
///
/// Returns `ComputeCredentials { info, keys }` when the flow reports
/// [`sasl::Outcome::Success`]. On [`sasl::Outcome::Failure`] the reason is logged at `info`
/// level and `auth::AuthError::password_failed` for `info.user` is returned. Errors raised
/// by the flow itself are propagated with `?`.
// NOTE(review): `PqStream>` and the bare `auth::Result` in the signature look like the
// remains of generic arguments that were dropped when this text was extracted — confirm
// against the real file before compiling.
pub(crate) async fn authenticate_cleartext(
    ctx: &RequestContext,
    info: ComputeUserInfo,
    client: &mut stream::PqStream>,
    secret: AuthSecret,
    config: &'static AuthenticationConfig,
) -> auth::Result {
    debug!("cleartext auth flow override is enabled, proceeding");
    // Record on the request context which auth method was used.
    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);

    // The flow takes the endpoint id and role name in their `crate::intern` forms.
    let ep = EndpointIdInt::from(&info.endpoint);
    let role = RoleNameInt::from(&info.user);

    let auth_flow = AuthFlow::new(
        client,
        auth::CleartextPassword {
            secret,
            endpoint: ep,
            role,
            pool: config.scram_thread_pool.clone(),
        },
    );
    let auth_outcome = {
        // pause the timer while we communicate with the client
        // (the `_paused` guard lives until the end of this inner block)
        let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);

        // cleartext auth is only allowed to the ws/http protocol.
        // If we're here, we already received the password in the first message.
        // Scram protocol will be executed on the proxy side.
        auth_flow.authenticate().await?
    };

    let keys = match auth_outcome {
        sasl::Outcome::Success(key) => key,
        sasl::Outcome::Failure(reason) => {
            // The detailed reason is only logged; the returned error is a password failure
            // for this user.
            info!("auth backend failed with an error: {reason}");
            return Err(auth::AuthError::password_failed(&*info.user));
        }
    };

    Ok(ComputeCredentials { info, keys })
}

/// Workaround for clients which don't provide an endpoint (project) name.
/// Similar to [`authenticate_cleartext`], but there's a specific password format,
/// and passwords are not yet validated (we don't know how to validate them!)
///
/// Reads the client's password payload through [`auth::PasswordHack`] and uses the endpoint
/// carried in that payload to complete `info` into a [`ComputeUserInfo`].
///
/// Returns the completed user info together with `payload.password`; nothing is verified
/// here. Errors from reading the password are propagated with `?`.
// NOTE(review): `PqStream>` and `Vec` in the signature look like the remains of generic
// arguments that were dropped when this text was extracted — confirm against the real file
// before compiling.
pub(crate) async fn password_hack_no_authentication(
    ctx: &RequestContext,
    info: ComputeUserInfoNoEndpoint,
    client: &mut stream::PqStream>,
) -> auth::Result<(ComputeUserInfo, Vec)> {
    debug!("project not specified, resorting to the password hack auth flow");
    // This flow is recorded on the request context as cleartext auth.
    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);

    // pause the timer while we communicate with the client
    // (the `_paused` guard lives until the function returns)
    let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);

    let payload = AuthFlow::new(client, auth::PasswordHack)
        .get_password()
        .await?;

    // The payload supplies the endpoint that the connection parameters did not.
    debug!(project = &*payload.endpoint, "received missing parameter");

    // Report tentative success; compute node will check the password anyway.
    Ok((
        ComputeUserInfo {
            user: info.user,
            options: info.options,
            endpoint: payload.endpoint,
        },
        payload.password,
    ))
}

================================================
FILE: proxy/src/auth/backend/jwt.rs
================================================
use std::borrow::Cow;
use std::future::Future;
use std::sync::Arc;
use std::time::{Duration, SystemTime};

use arc_swap::ArcSwapOption;
use base64::Engine as _;
use base64::prelude::BASE64_URL_SAFE_NO_PAD;
use clashmap::ClashMap;
use jose_jwk::crypto::KeyInfo;
use reqwest::{Client, redirect};
use reqwest_retry::RetryTransientMiddleware;
use reqwest_retry::policies::ExponentialBackoff;
use serde::de::Visitor;
use serde::{Deserialize, Deserializer};
use serde_json::value::RawValue;
use signature::Verifier;
use thiserror::Error;
use tokio::time::Instant;

use crate::auth::backend::ComputeCredentialKeys;
use crate::context::RequestContext;
use crate::control_plane::errors::GetEndpointJwksError;
use crate::http::read_body_with_limit;
use crate::intern::RoleNameInt;
use crate::types::{EndpointId, RoleName};

// TODO(conrad): make these configurable.
const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); const MIN_RENEW: Duration = Duration::from_secs(30); const AUTO_RENEW: Duration = Duration::from_secs(300); const MAX_RENEW: Duration = Duration::from_secs(3600); const MAX_JWK_BODY_SIZE: usize = 64 * 1024; const JWKS_USER_AGENT: &str = "neon-proxy"; const JWKS_CONNECT_TIMEOUT: Duration = Duration::from_secs(2); const JWKS_FETCH_TIMEOUT: Duration = Duration::from_secs(5); const JWKS_FETCH_RETRIES: u32 = 3; /// How to get the JWT auth rules pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { fn fetch_auth_rules( &self, ctx: &RequestContext, endpoint: EndpointId, ) -> impl Future, FetchAuthRulesError>> + Send; } #[derive(Error, Debug)] pub(crate) enum FetchAuthRulesError { #[error(transparent)] GetEndpointJwks(#[from] GetEndpointJwksError), #[error("JWKs settings for this role were not configured")] RoleJwksNotConfigured, } #[derive(Clone)] pub(crate) struct AuthRule { pub(crate) id: String, pub(crate) jwks_url: url::Url, pub(crate) audience: Option, pub(crate) role_names: Vec, } pub struct JwkCache { client: reqwest_middleware::ClientWithMiddleware, map: ClashMap<(EndpointId, RoleName), Arc>, } pub(crate) struct JwkCacheEntry { /// Should refetch at least every hour to verify when old keys have been removed. /// Should refetch when new key IDs are seen only every 5 minutes or so last_retrieved: Instant, /// cplane will return multiple JWKs urls that we need to scrape. 
key_sets: ahash::HashMap, } impl JwkCacheEntry { fn find_jwk_and_audience( &self, key_id: &str, role_name: &RoleName, ) -> Option<(&jose_jwk::Jwk, Option<&str>)> { self.key_sets .values() // make sure our requested role has access to the key set .filter(|key_set| key_set.role_names.iter().any(|role| **role == **role_name)) // try and find the requested key-id in the key set .find_map(|key_set| { key_set .find_key(key_id) .map(|jwk| (jwk, key_set.audience.as_deref())) }) } } struct KeySet { jwks: jose_jwk::JwkSet, audience: Option, role_names: Vec, } impl KeySet { fn find_key(&self, key_id: &str) -> Option<&jose_jwk::Jwk> { self.jwks .keys .iter() .find(|jwk| jwk.prm.kid.as_deref() == Some(key_id)) } } pub(crate) struct JwkCacheEntryLock { cached: ArcSwapOption, lookup: tokio::sync::Semaphore, } impl Default for JwkCacheEntryLock { fn default() -> Self { JwkCacheEntryLock { cached: ArcSwapOption::empty(), lookup: tokio::sync::Semaphore::new(1), } } } #[derive(Deserialize)] struct JwkSet<'a> { /// we parse into raw-value because not all keys in a JWKS are ones /// we can parse directly, so we parse them lazily. #[serde(borrow)] keys: Vec<&'a RawValue>, } /// Given a jwks_url, fetch the JWKS and parse out all the signing JWKs. /// Returns `None` and log a warning if there are any errors. async fn fetch_jwks( client: &reqwest_middleware::ClientWithMiddleware, jwks_url: url::Url, ) -> Option { let req = client.get(jwks_url.clone()); // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only. let resp = req.send().await.and_then(|r| { r.error_for_status() .map_err(reqwest_middleware::Error::Reqwest) }); let resp = match resp { Ok(r) => r, // TODO: should we re-insert JWKs if we want to keep this JWKs URL? // I expect these failures would be quite sparse. 
Err(e) => { tracing::warn!(url=?jwks_url, error=?e, "could not fetch JWKs"); return None; } }; let resp: http::Response = resp.into(); let bytes = match read_body_with_limit(resp.into_body(), MAX_JWK_BODY_SIZE).await { Ok(bytes) => bytes, Err(e) => { tracing::warn!(url=?jwks_url, error=?e, "could not decode JWKs"); return None; } }; let jwks = match serde_json::from_slice::(&bytes) { Ok(jwks) => jwks, Err(e) => { tracing::warn!(url=?jwks_url, error=?e, "could not decode JWKs"); return None; } }; // `jose_jwk::Jwk` is quite large (288 bytes). Let's not pre-allocate for what we don't need. // // Even though we limit our responses to 64KiB, we could still receive a payload like // `{"keys":[` + repeat(`0`).take(30000).join(`,`) + `]}`. Parsing this as `RawValue` uses 468KiB. // Pre-allocating the corresponding `Vec::::with_capacity(30000)` uses 8.2MiB. let mut keys = vec![]; let mut failed = 0; for key in jwks.keys { let key = match serde_json::from_str::(key.get()) { Ok(key) => key, Err(e) => { tracing::debug!(url=?jwks_url, failed=?e, "could not decode JWK"); failed += 1; continue; } }; // if `use` (called `cls` in rust) is specified to be something other than signing, // we can skip storing it. 
if key
    .prm
    .cls
    .as_ref()
    .is_some_and(|c| *c != jose_jwk::Class::Signing)
{
    continue;
}

keys.push(key);
} // end of the `for key in jwks.keys` loop opened on a preceding line

keys.shrink_to_fit();

if failed > 0 {
    tracing::warn!(url=?jwks_url, failed, "could not decode JWKs");
}

if keys.is_empty() {
    tracing::warn!(url=?jwks_url, "no valid JWKs found inside the response body");
    return None;
}

Some(jose_jwk::JwkSet { keys })
} // end of `fetch_jwks`

// NOTE(review): several signatures and turbofish calls below (`self: &Arc`, `Option>`,
// `Result, JwtError>`, `from_slice::>`, `fetch: &F`) look like the remains of generic
// arguments that were dropped when this text was extracted. They are reproduced as found —
// confirm against the real file before compiling.
impl JwkCacheEntryLock {
    /// Waits until this entry's renewal permit is available and returns it.
    async fn acquire_permit(self: &Arc) -> JwkRenewalPermit<'_> {
        JwkRenewalPermit::acquire_permit(self).await
    }

    /// Returns this entry's renewal permit if it is free right now, `None` otherwise.
    fn try_acquire_permit(self: &Arc) -> Option> {
        JwkRenewalPermit::try_acquire_permit(self)
    }

    /// Rebuilds the cached key sets for `endpoint` and returns the new cache entry.
    ///
    /// The caller passes the renewal permit in `_permit`; it is held for the duration of the
    /// call. If the current entry is younger than `MIN_RENEW` it is returned unchanged.
    /// Otherwise the auth rules are fetched and the JWKS of each rule is downloaded in turn.
    /// Rules for which `fetch_jwks` yields nothing are left out of the new entry. The new
    /// entry is stamped with the time taken at the start of this call.
    ///
    /// # Errors
    ///
    /// Propagates the error of `auth_rules.fetch_auth_rules`.
    async fn renew_jwks(
        &self,
        _permit: JwkRenewalPermit<'_>,
        ctx: &RequestContext,
        client: &reqwest_middleware::ClientWithMiddleware,
        endpoint: EndpointId,
        auth_rules: &F,
    ) -> Result, JwtError> {
        // double check that no one beat us to updating the cache.
        let now = Instant::now();
        let guard = self.cached.load_full();
        if let Some(cached) = guard {
            let last_update = now.duration_since(cached.last_retrieved);
            // Only treat the cache as "just refreshed" while it is younger than `MIN_RENEW`.
            // `check_jwt` retries a missing key id once the entry is older than `MIN_RENEW`;
            // with the previous hard-coded 300s here, an entry aged between the two
            // thresholds was handed back unchanged and that retry loop could not make
            // progress until the entry turned 300s old.
            if last_update < MIN_RENEW {
                return Ok(cached);
            }
        }

        let rules = auth_rules.fetch_auth_rules(ctx, endpoint).await?;
        let mut key_sets =
            ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new());

        // TODO(conrad): run concurrently
        // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284)
        for rule in rules {
            // A rule whose JWKS could not be fetched is skipped rather than failing the renewal.
            if let Some(jwks) = fetch_jwks(client, rule.jwks_url).await {
                key_sets.insert(
                    rule.id,
                    KeySet {
                        jwks,
                        audience: rule.audience,
                        role_names: rule.role_names,
                    },
                );
            }
        }

        let entry = Arc::new(JwkCacheEntry {
            last_retrieved: now,
            key_sets,
        });
        self.cached.swap(Some(Arc::clone(&entry)));

        Ok(entry)
    }

    /// Returns the cached key sets, renewing them first when they are missing or too old.
    ///
    /// * No cached entry, or entry older than `MAX_RENEW`: wait for the renewal permit and
    ///   renew before returning.
    /// * Entry older than `AUTO_RENEW`: return the cached entry, and if the renewal permit is
    ///   free, spawn a background task that renews it.
    /// * Otherwise: return the cached entry as is.
    async fn get_or_update_jwk_cache(
        self: &Arc,
        ctx: &RequestContext,
        client: &reqwest_middleware::ClientWithMiddleware,
        endpoint: EndpointId,
        fetch: &F,
    ) -> Result, JwtError> {
        let now = Instant::now();
        let guard = self.cached.load_full();

        // if we have no cached JWKs, try and get some
        let Some(cached) = guard else {
            let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
            let permit = self.acquire_permit().await;
            return self.renew_jwks(permit, ctx, client, endpoint, fetch).await;
        };

        let last_update = now.duration_since(cached.last_retrieved);

        // check if the cached JWKs need updating.
        if last_update > MAX_RENEW {
            let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
            let permit = self.acquire_permit().await;

            // it's been too long since we checked the keys. wait for them to update.
            return self.renew_jwks(permit, ctx, client, endpoint, fetch).await;
        }

        // every 5 minutes we should spawn a job to eagerly update the token.
        if last_update > AUTO_RENEW {
            if let Some(permit) = self.try_acquire_permit() {
                tracing::debug!("JWKs should be renewed. Renewal permit acquired");
                // The spawned task needs owned values: detach the permit from `self`'s
                // borrow and clone everything else it uses.
                let permit = permit.into_owned();
                let entry = self.clone();
                let client = client.clone();
                let fetch = fetch.clone();
                let ctx = ctx.clone();
                tokio::spawn(async move {
                    if let Err(e) = entry
                        .renew_jwks(permit, &ctx, &client, endpoint, &fetch)
                        .await
                    {
                        tracing::warn!(error=?e, "could not fetch JWKs in background job");
                    }
                });
            } else {
                tracing::debug!("JWKs should be renewed. Renewal permit already taken, skipping");
            }
        }

        Ok(cached)
    }

    /// Validates `jwt` for `role_name` against this entry's key sets.
    ///
    /// Steps, in order: split the compact form; base64url-decode and parse header and
    /// payload; record the issuer on `ctx` if present; require a key id; look the key up
    /// (renewing the cache when the key is missing and the entry is older than `MIN_RENEW`);
    /// check that the key supports the header's algorithm; verify the signature over the
    /// `header.payload` text; then check audience (when the matching key set has one),
    /// `exp` and `nbf`, the latter two with `CLOCK_SKEW_LEEWAY`.
    ///
    /// On success returns `ComputeCredentialKeys::JwtPayload` holding the decoded payload
    /// bytes.
    async fn check_jwt(
        self: &Arc,
        ctx: &RequestContext,
        jwt: &str,
        client: &reqwest_middleware::ClientWithMiddleware,
        endpoint: EndpointId,
        role_name: &RoleName,
        fetch: &F,
    ) -> Result {
        // JWT compact form is defined to be
        // B64(Header) || . || B64(Payload) || . || B64(Signature)
        // where Signature = alg(B64(Header) || . || B64(Payload));
        let (header_payload, signature) = jwt
            .rsplit_once('.')
            .ok_or(JwtEncodingError::InvalidCompactForm)?;
        let (header, payload) = header_payload
            .split_once('.')
            .ok_or(JwtEncodingError::InvalidCompactForm)?;

        let header = BASE64_URL_SAFE_NO_PAD.decode(header)?;
        let header = serde_json::from_slice::>(&header)?;

        // `payloadb` keeps the decoded payload bytes; they are what gets returned on success.
        let payloadb = BASE64_URL_SAFE_NO_PAD.decode(payload)?;
        let payload = serde_json::from_slice::>(&payloadb)?;

        if let Some(iss) = &payload.issuer {
            ctx.set_jwt_issuer(iss.as_ref().to_owned());
        }

        let sig = BASE64_URL_SAFE_NO_PAD.decode(signature)?;

        let kid = header.key_id.ok_or(JwtError::MissingKeyId)?;

        let mut guard = self
            .get_or_update_jwk_cache(ctx, client, endpoint.clone(), fetch)
            .await?;

        // get the key from the JWKs if possible. If not, wait for the keys to update.
        let (jwk, expected_audience) = loop {
            match guard.find_jwk_and_audience(&kid, role_name) {
                Some(jwk) => break jwk,
                None if guard.last_retrieved.elapsed() > MIN_RENEW => {
                    let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);

                    let permit = self.acquire_permit().await;
                    guard = self
                        .renew_jwks(permit, ctx, client, endpoint.clone(), fetch)
                        .await?;
                }
                // The entry was refreshed within `MIN_RENEW` and still lacks the key.
                _ => return Err(JwtError::JwkNotFound),
            }
        };

        if !jwk.is_supported(&header.algorithm) {
            return Err(JwtError::SignatureAlgorithmNotSupported);
        }

        match &jwk.key {
            jose_jwk::Key::Ec(key) => {
                verify_ec_signature(header_payload.as_bytes(), &sig, key)?;
            }
            jose_jwk::Key::Rsa(key) => {
                verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?;
            }
            key => return Err(JwtError::UnsupportedKeyType(key.into())),
        }

        tracing::debug!(?payload, "JWT signature valid with claims");

        // An expected audience must appear among the token's audiences.
        if let Some(aud) = expected_audience
            && payload.audience.0.iter().all(|s| s != aud)
        {
            return Err(JwtError::InvalidClaims(
                JwtClaimsError::InvalidJwtTokenAudience,
            ));
        }

        let now = SystemTime::now();

        // `exp + CLOCK_SKEW_LEEWAY` panics if the sum is not representable as a `SystemTime`
        // (`exp` comes from the token), so add with `checked_add`; an unrepresentable
        // deadline is treated as not yet reached.
        if let Some(exp) = payload.expiration
            && exp
                .checked_add(CLOCK_SKEW_LEEWAY)
                .is_some_and(|deadline| now >= deadline)
        {
            return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired(
                exp.duration_since(SystemTime::UNIX_EPOCH)
                    .unwrap_or_default()
                    .as_secs(),
            )));
        }

        if let Some(nbf) = payload.not_before
            && nbf >= now + CLOCK_SKEW_LEEWAY
        {
            return Err(JwtError::InvalidClaims(
                JwtClaimsError::JwtTokenNotYetReadyToUse(
                    nbf.duration_since(SystemTime::UNIX_EPOCH)
                        .unwrap_or_default()
                        .as_secs(),
                ),
            ));
        }

        Ok(ComputeCredentialKeys::JwtPayload(payloadb))
    }
}

impl JwkCache {
    /// Validates `jwt` for the `(endpoint, role_name)` pair.
    ///
    /// Looks up the per-pair cache entry in `self.map`, inserting a default one if none
    /// exists, and delegates to that entry's `check_jwt` with this cache's HTTP client.
    pub(crate) async fn check_jwt(
        &self,
        ctx: &RequestContext,
        endpoint: EndpointId,
        role_name: &RoleName,
        fetch: &F,
        jwt: &str,
    ) -> Result {
        // try with just a read lock first
        let key = (endpoint.clone(), role_name.clone());
        let entry = self.map.get(&key).as_deref().map(Arc::clone);
        let entry = entry.unwrap_or_else(|| {
            // acquire a write lock after to insert.
            let entry = self.map.entry(key).or_default();
            Arc::clone(&*entry)
        });

        entry
            .check_jwt(ctx, jwt, &self.client, endpoint, role_name, fetch)
            .await
    }
}

impl Default for JwkCache {
    fn default() -> Self {
        let client = Client::builder()
            .user_agent(JWKS_USER_AGENT)
            .redirect(redirect::Policy::none())
            .tls_built_in_native_certs(true)
            .connect_timeout(JWKS_CONNECT_TIMEOUT)
            .timeout(JWKS_FETCH_TIMEOUT)
            .build()
            .expect("client config should be valid");

        // Retry up to 3 times with increasing intervals between attempts.
let retry_policy = ExponentialBackoff::builder().build_with_max_retries(JWKS_FETCH_RETRIES); let client = reqwest_middleware::ClientBuilder::new(client) .with(RetryTransientMiddleware::new_with_policy(retry_policy)) .build(); JwkCache { client, map: ClashMap::default(), } } } fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> Result<(), JwtError> { use ecdsa::Signature; use signature::Verifier; match key.crv { jose_jwk::EcCurves::P256 => { let pk = p256::PublicKey::try_from(key).map_err(JwtError::InvalidP256Key)?; let key = p256::ecdsa::VerifyingKey::from(&pk); let sig = Signature::from_slice(sig)?; key.verify(data, &sig)?; } key => return Err(JwtError::UnsupportedEcKeyType(key)), } Ok(()) } fn verify_rsa_signature( data: &[u8], sig: &[u8], key: &jose_jwk::Rsa, alg: &jose_jwa::Algorithm, ) -> Result<(), JwtError> { use jose_jwa::{Algorithm, Signing}; use rsa::RsaPublicKey; use rsa::pkcs1v15::{Signature, VerifyingKey}; let key = RsaPublicKey::try_from(key).map_err(JwtError::InvalidRsaKey)?; match alg { Algorithm::Signing(Signing::Rs256) => { let key = VerifyingKey::::new(key); let sig = Signature::try_from(sig)?; key.verify(data, &sig)?; } _ => return Err(JwtError::InvalidRsaSigningAlgorithm), } Ok(()) } /// #[derive(serde::Deserialize, serde::Serialize)] struct JwtHeader<'a> { /// must be a supported alg #[serde(rename = "alg")] algorithm: jose_jwa::Algorithm, /// key id, must be provided for our usecase #[serde(rename = "kid", borrow)] key_id: Option>, } /// #[derive(serde::Deserialize, Debug)] #[allow(dead_code)] struct JwtPayload<'a> { /// Audience - Recipient for which the JWT is intended #[serde(rename = "aud", default)] audience: OneOrMany, /// Expiration - Time after which the JWT expires #[serde(rename = "exp", deserialize_with = "numeric_date_opt", default)] expiration: Option, /// Not before - Time before which the JWT is not valid #[serde(rename = "nbf", deserialize_with = "numeric_date_opt", default)] not_before: Option, // the 
following entries are only extracted for the sake of debug logging. /// Issuer of the JWT #[serde(rename = "iss", borrow)] issuer: Option>, /// Subject of the JWT (the user) #[serde(rename = "sub", borrow)] subject: Option>, /// Unique token identifier #[serde(rename = "jti", borrow)] jwt_id: Option>, /// Unique session identifier #[serde(rename = "sid", borrow)] session_id: Option>, } /// `OneOrMany` supports parsing either a single item or an array of items. /// /// Needed for /// /// > The "aud" (audience) claim identifies the recipients that the JWT is /// > intended for. Each principal intended to process the JWT MUST /// > identify itself with a value in the audience claim. If the principal /// > processing the claim does not identify itself with a value in the /// > "aud" claim when this claim is present, then the JWT MUST be /// > rejected. In the general case, the "aud" value is **an array of case- /// > sensitive strings**, each containing a StringOrURI value. In the /// > special case when the JWT has one audience, the "aud" value MAY be a /// > **single case-sensitive string** containing a StringOrURI value. The /// > interpretation of audience values is generally application specific. /// > Use of this claim is OPTIONAL. #[derive(Default, Debug)] struct OneOrMany(Vec); impl<'de> Deserialize<'de> for OneOrMany { fn deserialize(deserializer: D) -> Result where D: Deserializer<'de>, { struct OneOrManyVisitor; impl<'de> Visitor<'de> for OneOrManyVisitor { type Value = OneOrMany; fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { formatter.write_str("a single string or an array of strings") } fn visit_str(self, v: &str) -> Result where E: serde::de::Error, { Ok(OneOrMany(vec![v.to_owned()])) } fn visit_seq(self, mut seq: A) -> Result where A: serde::de::SeqAccess<'de>, { let mut v = vec![]; while let Some(s) = seq.next_element()? 
{ v.push(s); } Ok(OneOrMany(v)) } } deserializer.deserialize_any(OneOrManyVisitor) } } fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result, D::Error> { >::deserialize(d)? .map(|t| { SystemTime::UNIX_EPOCH .checked_add(Duration::from_secs(t)) .ok_or_else(|| { serde::de::Error::custom(format_args!("timestamp out of bounds: {t}")) }) }) .transpose() } struct JwkRenewalPermit<'a> { inner: Option>, } enum JwkRenewalPermitInner<'a> { Owned(Arc), Borrowed(&'a Arc), } impl JwkRenewalPermit<'_> { fn into_owned(mut self) -> JwkRenewalPermit<'static> { JwkRenewalPermit { inner: self.inner.take().map(JwkRenewalPermitInner::into_owned), } } async fn acquire_permit(from: &Arc) -> JwkRenewalPermit<'_> { match from.lookup.acquire().await { Ok(permit) => { permit.forget(); JwkRenewalPermit { inner: Some(JwkRenewalPermitInner::Borrowed(from)), } } Err(_) => panic!("semaphore should not be closed"), } } fn try_acquire_permit(from: &Arc) -> Option> { match from.lookup.try_acquire() { Ok(permit) => { permit.forget(); Some(JwkRenewalPermit { inner: Some(JwkRenewalPermitInner::Borrowed(from)), }) } Err(tokio::sync::TryAcquireError::NoPermits) => None, Err(tokio::sync::TryAcquireError::Closed) => panic!("semaphore should not be closed"), } } } impl JwkRenewalPermitInner<'_> { fn into_owned(self) -> JwkRenewalPermitInner<'static> { match self { JwkRenewalPermitInner::Owned(p) => JwkRenewalPermitInner::Owned(p), JwkRenewalPermitInner::Borrowed(p) => JwkRenewalPermitInner::Owned(Arc::clone(p)), } } } impl Drop for JwkRenewalPermit<'_> { fn drop(&mut self) { let entry = match &self.inner { None => return, Some(JwkRenewalPermitInner::Owned(p)) => p, Some(JwkRenewalPermitInner::Borrowed(p)) => *p, }; entry.lookup.add_permits(1); } } #[derive(Error, Debug)] #[non_exhaustive] pub(crate) enum JwtError { #[error("jwk not found")] JwkNotFound, #[error("missing key id")] MissingKeyId, #[error("Provided authentication token is not a valid JWT encoding")] JwtEncoding(#[from] 
JwtEncodingError), #[error(transparent)] InvalidClaims(#[from] JwtClaimsError), #[error("invalid P256 key")] InvalidP256Key(jose_jwk::crypto::Error), #[error("invalid RSA key")] InvalidRsaKey(jose_jwk::crypto::Error), #[error("invalid RSA signing algorithm")] InvalidRsaSigningAlgorithm, #[error("unsupported EC key type {0:?}")] UnsupportedEcKeyType(jose_jwk::EcCurves), #[error("unsupported key type {0:?}")] UnsupportedKeyType(KeyType), #[error("signature algorithm not supported")] SignatureAlgorithmNotSupported, #[error("signature error: {0}")] Signature(#[from] signature::Error), #[error("failed to fetch auth rules: {0}")] FetchAuthRules(#[from] FetchAuthRulesError), } impl From for JwtError { fn from(err: base64::DecodeError) -> Self { JwtEncodingError::Base64Decode(err).into() } } impl From for JwtError { fn from(err: serde_json::Error) -> Self { JwtEncodingError::SerdeJson(err).into() } } #[derive(Error, Debug)] #[non_exhaustive] pub enum JwtEncodingError { #[error(transparent)] Base64Decode(#[from] base64::DecodeError), #[error(transparent)] SerdeJson(#[from] serde_json::Error), #[error("invalid compact form")] InvalidCompactForm, } #[derive(Error, Debug, PartialEq)] #[non_exhaustive] pub enum JwtClaimsError { #[error("invalid JWT token audience")] InvalidJwtTokenAudience, #[error("JWT token has expired (exp={0})")] JwtTokenHasExpired(u64), #[error("JWT token is not yet ready to use (nbf={0})")] JwtTokenNotYetReadyToUse(u64), } #[allow(dead_code, reason = "Debug use only")] #[derive(Debug)] pub(crate) enum KeyType { Ec(jose_jwk::EcCurves), Rsa, Oct, Okp(jose_jwk::OkpCurves), Unknown, } impl From<&jose_jwk::Key> for KeyType { fn from(key: &jose_jwk::Key) -> Self { match key { jose_jwk::Key::Ec(ec) => Self::Ec(ec.crv), jose_jwk::Key::Rsa(_rsa) => Self::Rsa, jose_jwk::Key::Oct(_oct) => Self::Oct, jose_jwk::Key::Okp(okp) => Self::Okp(okp.crv), _ => Self::Unknown, } } } #[cfg(test)] mod tests { use std::future::IntoFuture; use std::net::SocketAddr; use 
std::time::SystemTime; use bytes::Bytes; use http::Response; use http_body_util::Full; use hyper::service::service_fn; use hyper_util::rt::TokioIo; use rand_core::OsRng; use rsa::pkcs8::DecodePrivateKey; use serde::Serialize; use serde_json::json; use signature::Signer; use tokio::net::TcpListener; use super::*; use crate::types::RoleName; fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) { let sk = p256::SecretKey::random(&mut OsRng); let pk = sk.public_key().into(); let jwk = jose_jwk::Jwk { key: jose_jwk::Key::Ec(pk), prm: jose_jwk::Parameters { kid: Some(kid), alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)), ..Default::default() }, }; (sk, jwk) } fn new_rsa_jwk(key: &str, kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) { let sk = rsa::RsaPrivateKey::from_pkcs8_pem(key).unwrap(); let pk = sk.to_public_key().into(); let jwk = jose_jwk::Jwk { key: jose_jwk::Key::Rsa(pk), prm: jose_jwk::Parameters { kid: Some(kid), alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)), ..Default::default() }, }; (sk, jwk) } fn now() -> u64 { SystemTime::now() .duration_since(SystemTime::UNIX_EPOCH) .unwrap() .as_secs() } fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String { let now = now(); let body = typed_json::json! 
{{ "exp": now + 3600, "nbf": now, "aud": ["audience1", "neon", "audience2"], "sub": "user1", "sid": "session1", "jti": "token1", "iss": "neon-testing", }}; build_custom_jwt_payload(kid, body, sig) } fn build_custom_jwt_payload( kid: String, body: impl Serialize, sig: jose_jwa::Signing, ) -> String { let header = JwtHeader { algorithm: jose_jwa::Algorithm::Signing(sig), key_id: Some(Cow::Owned(kid)), }; let header = BASE64_URL_SAFE_NO_PAD.encode(serde_json::to_string(&header).unwrap()); let body = BASE64_URL_SAFE_NO_PAD.encode(serde_json::to_string(&body).unwrap()); format!("{header}.{body}") } fn new_ec_jwt(kid: String, key: &p256::SecretKey) -> String { use p256::ecdsa::{Signature, SigningKey}; let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256); let sig: Signature = SigningKey::from(key).sign(payload.as_bytes()); let sig = BASE64_URL_SAFE_NO_PAD.encode(sig.to_bytes()); format!("{payload}.{sig}") } fn new_custom_ec_jwt(kid: String, key: &p256::SecretKey, body: impl Serialize) -> String { use p256::ecdsa::{Signature, SigningKey}; let payload = build_custom_jwt_payload(kid, body, jose_jwa::Signing::Es256); let sig: Signature = SigningKey::from(key).sign(payload.as_bytes()); let sig = BASE64_URL_SAFE_NO_PAD.encode(sig.to_bytes()); format!("{payload}.{sig}") } fn new_rsa_jwt(kid: String, key: rsa::RsaPrivateKey) -> String { use rsa::pkcs1v15::SigningKey; use rsa::signature::SignatureEncoding; let payload = build_jwt_payload(kid, jose_jwa::Signing::Rs256); let sig = SigningKey::::new(key).sign(payload.as_bytes()); let sig = BASE64_URL_SAFE_NO_PAD.encode(sig.to_bytes()); format!("{payload}.{sig}") } // RSA key gen is slow.... 
const RS1: &str = "-----BEGIN PRIVATE KEY----- MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQDNuWBIWTlo+54Y aifpGInIrpv6LlsbI/2/2CC81Arlx4RsABORklgA9XSGwaCbHTshHsfd1S916JwA SpjyPQYWfqo6iAV8a4MhjIeJIkRr74prDCSzOGZvIc6VaGeCIb9clf3HSrPHm3hA cfLMB8/p5MgoxERPDOIn3XYoS9SEEuP7l0LkmEZMerg6W6lDjQRDny0Lb50Jky9X mDqnYXBhs99ranbwL5vjy0ba6OIeCWFJme5u+rv5C/P0BOYrJfGxIcEoKa8Ukw5s PlM+qrz9ope1eOuXMNNdyFDReNBUyaM1AwBAayU5rz57crer7K/UIofaJ42T4cMM nx/SWfBNAgMBAAECggEACqdpBxYn1PoC6/zDaFzu9celKEWyTiuE/qRwvZa1ocS9 ZOJ0IPvVNud/S2NHsADJiSOQ8joSJScQvSsf1Ju4bv3MTw+wSQtAVUJz2nQ92uEi 5/xPAkEPfP3hNvebNLAOuvrBk8qYmOPCTIQaMNrOt6wzeXkAmJ9wLuRXNCsJLHW+ KLpf2WdgTYxqK06ZiJERFgJ2r1MsC2IgTydzjOAdEIrtMarerTLqqCpwFrk/l0cz 1O2OAb17ZxmhuzMhjNMin81c8F2fZAGMeOjn92Jl5kUsYw/pG+0S8QKlbveR/fdP We2tJsgXw2zD0q7OJpp8NXS2yddrZGyysYsof983wQKBgQD2McqNJqo+eWL5zony UbL19loYw0M15EjhzIuzW1Jk0rPj65yQyzpJ6pqicRuWr34MvzCx+ZHM2b3jSiNu GES2fnC7xLIKyeRxfqsXF71xz+6UStEGRQX27r1YWEtyQVuBhvlqB+AGWP3PYAC+ HecZecnZ+vcihJ2K3+l5O3paVQKBgQDV6vKH5h2SY9vgO8obx0P7XSS+djHhmPuU f8C/Fq6AuRbIA1g04pzuLU2WS9T26eIjgM173uVNg2TuqJveWzz+CAAp6nCR6l24 DBg49lMGCWrMo4FqPG46QkUqvK8uSj42GkX/e5Rut1Gyu0209emeM6h2d2K15SvY 9563tYSmGQKBgQDwcH5WTi20KA7e07TroJi8GKWzS3gneNUpGQBS4VxdtV4UuXXF /4TkzafJ/9cm2iurvUmMd6XKP9lw0mY5zp/E70WgTCBp4vUlVsU3H2tYbO+filYL 3ntNx6nKTykX4/a/UJfj0t8as+zli+gNxNx/h+734V9dKdFG4Rl+2fTLpQKBgQCE qJkTEe+Q0wCOBEYICADupwqcWqwAXWDW7IrZdfVtulqYWwqecVIkmk+dPxWosc4d ekjz4nyNH0i+gC15LVebqdaAJ/T7aD4KXuW+nXNLMRfcJCGjgipRUruWD0EMEdqW rqBuGXMpXeH6VxGPgVkJVLvKC6tZZe9VM+pnvteuMQKBgQC8GaL+Lz+al4biyZBf JE8ekWrIotq/gfUBLP7x70+PB9bNtXtlgmTvjgYg4jiu3KR/ZIYYQ8vfVgkb6tDI rWGZw86Pzuoi1ppg/pYhKk9qrmCIT4HPEXbHl7ATahu2BOCIU3hybjTh2lB6LbX9 8LMFlz1QPqSZYN/A/kOcLBfa3A== -----END PRIVATE KEY----- "; const RS2: &str = "-----BEGIN PRIVATE KEY----- MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDipm6FIKSRab3J HwmK18t7hp+pohllxIDUSPi7S5mIhN/JG2Plq2Lp746E/fuT8dcBF2R4sJlG2L0J zmxOvBU/i/sQF9s1i4CEfg05k2//gKENIEsF3pMMmrH+mcZi0TTD6rezHpdVxPHk 
qWxSyOCtIJV29X+wxPwAB59kQFHzy2ooPB1isZcpE8tO0KthAM+oZ3KuCwE0++cO IWLeq9aPwyKhtip/xjTMxd1kzdKh592mGSyzr9D0QSWOYFGvgJXANDdiPdhSSOLt ECWPNPlm2FQvGGvYYBafUqz7VumKHE6x8J6lKdYa2J0ZdDzCIo2IHzlxe+RZNgwy uAD2jhVxAgMBAAECggEAbsZHWBu3MzcKQiVARbLoygvnN0J5xUqAaMDtiKUPejDv K1yOu67DXnDuKEP2VL2rhuYG/hHaKE1AP227c9PrUq6424m9YvM2sgrlrdFIuQkG LeMtp8W7+zoUasp/ssZrUqICfLIj5xCl5UuFHQT/Ar7dLlIYwa3VOLKBDb9+Dnfe QH5/So4uMXG6vw34JN9jf+eAc8Yt0PeIz62ycvRwdpTJQ0MxZN9ZKpCAQp+VTuXT zlzNvDMilabEdqUvAyGyz8lBLNl0wdaVrqPqAEWM5U45QXsdFZknWammP7/tijeX 0z+Bi0J0uSEU5X502zm7GArj/NNIiWMcjmDjwUUhwQKBgQD9C2GoqxOxuVPYqwYR +Jz7f2qMjlSP8adA5Lzuh8UKXDp8JCEQC8ryweLzaOKS9C5MAw+W4W2wd4nJoQI1 P1dgGvBlfvEeRHMgqWtq7FuTsjSe7e0uSEkC4ngDb4sc0QOpv15cMuEz+4+aFLPL x29EcHWAaBX+rkid3zpQHFU4eQKBgQDlTCEqRuXwwa3V+Sq+mNWzD9QIGtD87TH/ FPO/Ij/cK2+GISgFDqhetiGTH4qrvPL0psPT+iH5zGFYcoFmTtwLdWQJdxhxz0bg iX/AceyX5e1Bm+ThT36sU83NrxKPkrdk6jNmr2iUF1OTzTwUKOYdHOPZqdMPfF4M 4XAaWVT2uQKBgQD4nKcNdU+7LE9Rr+4d1/o8Klp/0BMK/ayK2HE7lc8kt6qKb2DA iCWUTqPw7Fq3cQrPia5WWhNP7pJEtFkcAaiR9sW7onW5fBz0uR+dhK0QtmR2xWJj N4fsOp8ZGQ0/eae0rh1CTobucLkM9EwV6VLLlgYL67e4anlUCo8bSEr+WQKBgQCB uf6RgqcY/RqyklPCnYlZ0zyskS9nyXKd1GbK3j+u+swP4LZZlh9f5j88k33LCA2U qLzmMwAB6cWxWqcnELqhqPq9+ClWSmTZKDGk2U936NfAZMirSGRsbsVi9wfTPriP WYlXMSpDjqb0WgsBhNob4npubQxCGKTFOM5Jufy90QKBgB0Lte1jX144uaXx6dtB rjXNuWNir0Jy31wHnQuCA+XnfUgPcrKmRLm8taMbXgZwxkNvgFkpUWU8aPEK08Ne X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL 5JiconnI5aLek0QVPoFaVXFa -----END PRIVATE KEY----- "; #[derive(Clone)] struct Fetch(Vec); impl FetchAuthRules for Fetch { async fn fetch_auth_rules( &self, _ctx: &RequestContext, _endpoint: EndpointId, ) -> Result, FetchAuthRulesError> { Ok(self.0.clone()) } } async fn jwks_server( router: impl for<'a> Fn(&'a str) -> Option> + Send + Sync + 'static, ) -> SocketAddr { let router = Arc::new(router); let service = service_fn(move |req| { let router = Arc::clone(&router); async move { match router(req.uri().path()) { Some(body) => Response::builder() .status(200) 
.body(Full::new(Bytes::from(body))), None => Response::builder() .status(404) .body(Full::new(Bytes::new())), } } }); let listener = TcpListener::bind("0.0.0.0:0").await.unwrap(); let server = hyper::server::conn::http1::Builder::new(); let addr = listener.local_addr().unwrap(); tokio::spawn(async move { loop { let (s, _) = listener.accept().await.unwrap(); let serve = server.serve_connection(TokioIo::new(s), service.clone()); tokio::spawn(serve.into_future()); } }); addr } #[tokio::test] async fn check_jwt_happy_path() { let (rs1, jwk1) = new_rsa_jwk(RS1, "rs1".into()); let (rs2, jwk2) = new_rsa_jwk(RS2, "rs2".into()); let (ec1, jwk3) = new_ec_jwk("ec1".into()); let (ec2, jwk4) = new_ec_jwk("ec2".into()); let foo_jwks = jose_jwk::JwkSet { keys: vec![jwk1, jwk3], }; let bar_jwks = jose_jwk::JwkSet { keys: vec![jwk2, jwk4], }; let jwks_addr = jwks_server(move |path| match path { "/foo" => Some(serde_json::to_vec(&foo_jwks).unwrap()), "/bar" => Some(serde_json::to_vec(&bar_jwks).unwrap()), _ => None, }) .await; let role_name1 = RoleName::from("anonymous"); let role_name2 = RoleName::from("authenticated"); let roles = vec![ RoleNameInt::from(&role_name1), RoleNameInt::from(&role_name2), ]; let rules = vec![ AuthRule { id: "foo".to_owned(), jwks_url: format!("http://{jwks_addr}/foo").parse().unwrap(), audience: None, role_names: roles.clone(), }, AuthRule { id: "bar".to_owned(), jwks_url: format!("http://{jwks_addr}/bar").parse().unwrap(), audience: None, role_names: roles.clone(), }, ]; let fetch = Fetch(rules); let jwk_cache = JwkCache::default(); let endpoint = EndpointId::from("ep"); let jwt1 = new_rsa_jwt("rs1".into(), rs1); let jwt2 = new_rsa_jwt("rs2".into(), rs2); let jwt3 = new_ec_jwt("ec1".into(), &ec1); let jwt4 = new_ec_jwt("ec2".into(), &ec2); let tokens = [jwt1, jwt2, jwt3, jwt4]; let role_names = [role_name1, role_name2]; for role in &role_names { for token in &tokens { jwk_cache .check_jwt( &RequestContext::test(), endpoint.clone(), role, &fetch, token, 
) .await .unwrap(); } } } /// AWS Cognito escapes the `/` in the URL. #[tokio::test] async fn check_jwt_regression_cognito_issuer() { let (key, jwk) = new_ec_jwk("key".into()); let now = now(); let token = new_custom_ec_jwt( "key".into(), &key, typed_json::json! {{ "sub": "dd9a73fd-e785-4a13-aae1-e691ce43e89d", // cognito uses `\/`. I cannot replicated that easily here as serde_json will refuse // to write that escape character. instead I will make a bogus URL using `\` instead. "iss": "https:\\\\cognito-idp.us-west-2.amazonaws.com\\us-west-2_abcdefgh", "client_id": "abcdefghijklmnopqrstuvwxyz", "origin_jti": "6759d132-3fe7-446e-9e90-2fe7e8017893", "event_id": "ec9c36ab-b01d-46a0-94e4-87fde6767065", "token_use": "access", "scope": "aws.cognito.signin.user.admin", "auth_time":now, "exp":now + 60, "iat":now, "jti": "b241614b-0b93-4bdc-96db-0a3c7061d9c0", "username": "dd9a73fd-e785-4a13-aae1-e691ce43e89d", }}, ); let jwks = jose_jwk::JwkSet { keys: vec![jwk] }; let jwks_addr = jwks_server(move |_path| Some(serde_json::to_vec(&jwks).unwrap())).await; let role_name = RoleName::from("anonymous"); let rules = vec![AuthRule { id: "aws-cognito".to_owned(), jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), audience: None, role_names: vec![RoleNameInt::from(&role_name)], }]; let fetch = Fetch(rules); let jwk_cache = JwkCache::default(); let endpoint = EndpointId::from("ep"); jwk_cache .check_jwt( &RequestContext::test(), endpoint.clone(), &role_name, &fetch, &token, ) .await .unwrap(); } #[tokio::test] async fn check_jwt_invalid_signature() { let (_, jwk) = new_ec_jwk("1".into()); let (key, _) = new_ec_jwk("1".into()); // has a matching kid, but signed by the wrong key let bad_jwt = new_ec_jwt("1".into(), &key); let jwks = jose_jwk::JwkSet { keys: vec![jwk] }; let jwks_addr = jwks_server(move |path| match path { "/" => Some(serde_json::to_vec(&jwks).unwrap()), _ => None, }) .await; let role = RoleName::from("authenticated"); let rules = vec![AuthRule { id: 
String::new(), jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), audience: None, role_names: vec![RoleNameInt::from(&role)], }]; let fetch = Fetch(rules); let jwk_cache = JwkCache::default(); let ep = EndpointId::from("ep"); let ctx = RequestContext::test(); let err = jwk_cache .check_jwt(&ctx, ep, &role, &fetch, &bad_jwt) .await .unwrap_err(); assert!( matches!(err, JwtError::Signature(_)), "expected \"signature error\", got {err:?}" ); } #[tokio::test] async fn check_jwt_unknown_role() { let (key, jwk) = new_rsa_jwk(RS1, "1".into()); let jwt = new_rsa_jwt("1".into(), key); let jwks = jose_jwk::JwkSet { keys: vec![jwk] }; let jwks_addr = jwks_server(move |path| match path { "/" => Some(serde_json::to_vec(&jwks).unwrap()), _ => None, }) .await; let role = RoleName::from("authenticated"); let rules = vec![AuthRule { id: String::new(), jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), audience: None, role_names: vec![RoleNameInt::from(&role)], }]; let fetch = Fetch(rules); let jwk_cache = JwkCache::default(); let ep = EndpointId::from("ep"); // this role_name is not accepted let bad_role_name = RoleName::from("cloud_admin"); let ctx = RequestContext::test(); let err = jwk_cache .check_jwt(&ctx, ep, &bad_role_name, &fetch, &jwt) .await .unwrap_err(); assert!( matches!(err, JwtError::JwkNotFound), "expected \"jwk not found\", got {err:?}" ); } #[tokio::test] async fn check_jwt_invalid_claims() { let (key, jwk) = new_ec_jwk("1".into()); let jwks = jose_jwk::JwkSet { keys: vec![jwk] }; let jwks_addr = jwks_server(move |path| match path { "/" => Some(serde_json::to_vec(&jwks).unwrap()), _ => None, }) .await; let now = SystemTime::now() .duration_since(SystemTime::UNIX_EPOCH) .unwrap() .as_secs(); struct Test { body: serde_json::Value, error: JwtClaimsError, } let table = vec![ Test { body: json! {{ "nbf": now + 60, "aud": "neon", }}, error: JwtClaimsError::JwtTokenNotYetReadyToUse(now + 60), }, Test { body: json! 
{{ "exp": now - 60, "aud": ["neon"], }}, error: JwtClaimsError::JwtTokenHasExpired(now - 60), }, Test { body: json! {{ }}, error: JwtClaimsError::InvalidJwtTokenAudience, }, Test { body: json! {{ "aud": [], }}, error: JwtClaimsError::InvalidJwtTokenAudience, }, Test { body: json! {{ "aud": "foo", }}, error: JwtClaimsError::InvalidJwtTokenAudience, }, Test { body: json! {{ "aud": ["foo"], }}, error: JwtClaimsError::InvalidJwtTokenAudience, }, Test { body: json! {{ "aud": ["foo", "bar"], }}, error: JwtClaimsError::InvalidJwtTokenAudience, }, ]; let role = RoleName::from("authenticated"); let rules = vec![AuthRule { id: String::new(), jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), audience: Some("neon".to_string()), role_names: vec![RoleNameInt::from(&role)], }]; let fetch = Fetch(rules); let jwk_cache = JwkCache::default(); let ep = EndpointId::from("ep"); let ctx = RequestContext::test(); for test in table { let jwt = new_custom_ec_jwt("1".into(), &key, test.body); match jwk_cache .check_jwt(&ctx, ep.clone(), &role, &fetch, &jwt) .await { Err(JwtError::InvalidClaims(error)) if error == test.error => {} Err(err) => { panic!("expected {:?}, got {err:?}", test.error) } Ok(_payload) => { panic!("expected {:?}, got ok", test.error) } } } } #[tokio::test] async fn check_jwk_keycloak_regression() { let (rs, valid_jwk) = new_rsa_jwk(RS1, "rs1".into()); let valid_jwk = serde_json::to_value(valid_jwk).unwrap(); // This is valid, but we cannot parse it as we have no support for encryption JWKs, only signature based ones. // This is taken directly from keycloak. let invalid_jwk = serde_json::json! 
{ { "kid": "U-Jc9xRli84eNqRpYQoIPF-GNuRWV3ZvAIhziRW2sbQ", "kty": "RSA", "alg": "RSA-OAEP", "use": "enc", "n": "yypYWsEKmM_wWdcPnSGLSm5ytw1WG7P7EVkKSulcDRlrM6HWj3PR68YS8LySYM2D9Z-79oAdZGKhIfzutqL8rK1vS14zDuPpAM-RWY3JuQfm1O_-1DZM8-07PmVRegP5KPxsKblLf_My8ByH6sUOIa1p2rbe2q_b0dSTXYu1t0dW-cGL5VShc400YymvTwpc-5uYNsaVxZajnB7JP1OunOiuCJ48AuVp3PqsLzgoXqlXEB1ZZdch3xT3bxaTtNruGvG4xmLZY68O_T3yrwTCNH2h_jFdGPyXdyZToCMSMK2qSbytlfwfN55pT9Vv42Lz1YmoB7XRjI9aExKPc5AxFw", "e": "AQAB", "x5c": [ "MIICmzCCAYMCBgGS41E6azANBgkqhkiG9w0BAQsFADARMQ8wDQYDVQQDDAZtYXN0ZXIwHhcNMjQxMDMxMTYwMTQ0WhcNMzQxMDMxMTYwMzI0WjARMQ8wDQYDVQQDDAZtYXN0ZXIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDLKlhawQqYz/BZ1w+dIYtKbnK3DVYbs/sRWQpK6VwNGWszodaPc9HrxhLwvJJgzYP1n7v2gB1kYqEh/O62ovysrW9LXjMO4+kAz5FZjcm5B+bU7/7UNkzz7Ts+ZVF6A/ko/GwpuUt/8zLwHIfqxQ4hrWnatt7ar9vR1JNdi7W3R1b5wYvlVKFzjTRjKa9PClz7m5g2xpXFlqOcHsk/U66c6K4InjwC5Wnc+qwvOCheqVcQHVll1yHfFPdvFpO02u4a8bjGYtljrw79PfKvBMI0faH+MV0Y/Jd3JlOgIxIwrapJvK2V/B83nmlP1W/jYvPViagHtdGMj1oTEo9zkDEXAgMBAAEwDQYJKoZIhvcNAQELBQADggEBAECYX59+Q9v6c9sb6Q0/C6IgLWG2nVCgVE1YWwIzz+68WrhlmNCRuPjY94roB+tc2tdHbj+Nh3LMzJk7L1KCQoW1+LPK6A6E8W9ad0YPcuw8csV2pUA3+H56exQMH0fUAPQAU7tXWvnQ7otcpV1XA8afn/NTMTsnxi9mSkor8MLMYQ3aeRyh1+LAchHBthWiltqsSUqXrbJF59u5p0ghquuKcWR3TXsA7klGYBgGU5KAJifr9XT87rN0bOkGvbeWAgKvnQnjZwxdnLqTfp/pRY/PiJJHhgIBYPIA7STGnMPjmJ995i34zhnbnd8WHXJA3LxrIMqLW/l8eIdvtM1w8KI=" ], "x5t": "QhfzMMnuAfkReTgZ1HtrfyOeeZs", "x5t#S256": "cmHDUdKgLiRCEN28D5FBy9IJLFmR7QWfm77SLhGTCTU" } }; let jwks = serde_json::json! 
{{ "keys": [invalid_jwk, valid_jwk ] }}; let jwks_addr = jwks_server(move |path| match path { "/" => Some(serde_json::to_vec(&jwks).unwrap()), _ => None, }) .await; let role_name = RoleName::from("anonymous"); let role = RoleNameInt::from(&role_name); let rules = vec![AuthRule { id: "foo".to_owned(), jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), audience: None, role_names: vec![role], }]; let fetch = Fetch(rules); let jwk_cache = JwkCache::default(); let endpoint = EndpointId::from("ep"); let token = new_rsa_jwt("rs1".into(), rs); jwk_cache .check_jwt( &RequestContext::test(), endpoint.clone(), &role_name, &fetch, &token, ) .await .unwrap(); } } ================================================ FILE: proxy/src/auth/backend/local.rs ================================================ use std::net::SocketAddr; use arc_swap::ArcSwapOption; use postgres_client::config::SslMode; use tokio::sync::Semaphore; use super::jwt::{AuthRule, FetchAuthRules}; use crate::auth::backend::jwt::FetchAuthRulesError; use crate::compute::ConnectInfo; use crate::compute_ctl::ComputeCtlApi; use crate::context::RequestContext; use crate::control_plane::NodeInfo; use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; use crate::http; use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}; use crate::types::EndpointId; use crate::url::ApiUrl; pub struct LocalBackend { pub(crate) initialize: Semaphore, pub(crate) compute_ctl: ComputeCtlApi, pub(crate) node_info: NodeInfo, } impl LocalBackend { pub fn new(postgres_addr: SocketAddr, compute_ctl: ApiUrl) -> Self { LocalBackend { initialize: Semaphore::new(1), compute_ctl: ComputeCtlApi { api: http::Endpoint::new(compute_ctl, http::new_client()), }, node_info: NodeInfo { conn_info: ConnectInfo { host_addr: Some(postgres_addr.ip()), host: postgres_addr.ip().to_string().into(), port: postgres_addr.port(), ssl_mode: SslMode::Disable, }, // TODO(conrad): make this better reflect compute 
info rather than endpoint info. aux: MetricsAuxInfo { endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"), project_id: ProjectIdTag::get_interner().get_or_intern("local"), branch_id: BranchIdTag::get_interner().get_or_intern("local"), compute_id: "local".into(), cold_start_info: ColdStartInfo::WarmCached, }, }, } } } #[derive(Clone, Copy)] pub(crate) struct StaticAuthRules; pub static JWKS_ROLE_MAP: ArcSwapOption = ArcSwapOption::const_empty(); impl FetchAuthRules for StaticAuthRules { async fn fetch_auth_rules( &self, _ctx: &RequestContext, _endpoint: EndpointId, ) -> Result, FetchAuthRulesError> { let mappings = JWKS_ROLE_MAP.load(); let role_mappings = mappings .as_deref() .ok_or(FetchAuthRulesError::RoleJwksNotConfigured)?; let mut rules = vec![]; for setting in &role_mappings.jwks { rules.push(AuthRule { id: setting.id.clone(), jwks_url: setting.jwks_url.clone(), audience: setting.jwt_audience.clone(), role_names: setting.role_names.clone(), }); } Ok(rules) } } ================================================ FILE: proxy/src/auth/backend/mod.rs ================================================ mod classic; mod console_redirect; mod hacks; pub mod jwt; pub mod local; use std::sync::Arc; pub use console_redirect::ConsoleRedirectBackend; pub(crate) use console_redirect::ConsoleRedirectError; use local::LocalBackend; use postgres_client::config::AuthKeys; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info}; use crate::auth::{self, ComputeUserInfoMaybeEndpoint, validate_password_and_exchange}; use crate::cache::Cached; use crate::cache::node_info::CachedNodeInfo; use crate::config::AuthenticationConfig; use crate::context::RequestContext; use crate::control_plane::client::ControlPlaneClient; use crate::control_plane::errors::GetAuthInfoError; use crate::control_plane::messages::EndpointRateLimitConfig; use crate::control_plane::{ self, AccessBlockerFlags, AuthSecret, ControlPlaneApi, 
EndpointAccessControl, RoleAccessControl, }; use crate::intern::{EndpointIdInt, RoleNameInt}; use crate::pqproto::BeMessage; use crate::proxy::NeonOptions; use crate::proxy::wake_compute::WakeComputeBackend; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::Stream; use crate::types::{EndpointCacheKey, EndpointId, RoleName}; use crate::{scram, stream}; /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality pub enum MaybeOwned<'a, T> { Owned(T), Borrowed(&'a T), } impl std::ops::Deref for MaybeOwned<'_, T> { type Target = T; fn deref(&self) -> &Self::Target { match self { MaybeOwned::Owned(t) => t, MaybeOwned::Borrowed(t) => t, } } } /// This type serves two purposes: /// /// * When `T` is `()`, it's just a regular auth backend selector /// which we use in [`crate::config::ProxyConfig`]. /// /// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. pub enum Backend<'a, T> { /// Cloud API (V2). 
ControlPlane(MaybeOwned<'a, ControlPlaneClient>, T), /// Local proxy uses configured auth credentials and does not wake compute Local(MaybeOwned<'a, LocalBackend>), } impl std::fmt::Display for Backend<'_, ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::ControlPlane(api, ()) => match &**api { ControlPlaneClient::ProxyV1(endpoint) => fmt .debug_tuple("ControlPlane::ProxyV1") .field(&endpoint.url()) .finish(), #[cfg(any(test, feature = "testing"))] ControlPlaneClient::PostgresMock(endpoint) => { let url = endpoint.url(); match url::Url::parse(url) { Ok(mut url) => { let _ = url.set_password(Some("_redacted_")); let url = url.as_str(); fmt.debug_tuple("ControlPlane::PostgresMock") .field(&url) .finish() } Err(_) => fmt .debug_tuple("ControlPlane::PostgresMock") .field(&url) .finish(), } } #[cfg(test)] ControlPlaneClient::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(), }, Self::Local(_) => fmt.debug_tuple("Local").finish(), } } } impl Backend<'_, T> { /// Very similar to [`std::option::Option::as_ref`]. /// This helps us pass structured config to async tasks. pub(crate) fn as_ref(&self) -> Backend<'_, &T> { match self { Self::ControlPlane(c, x) => Backend::ControlPlane(MaybeOwned::Borrowed(c), x), Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)), } } pub(crate) fn get_api(&self) -> &ControlPlaneClient { match self { Self::ControlPlane(api, _) => api, Self::Local(_) => panic!("Local backend has no API"), } } pub(crate) fn is_local_proxy(&self) -> bool { matches!(self, Self::Local(_)) } } impl<'a, T> Backend<'a, T> { /// Very similar to [`std::option::Option::map`]. /// Maps [`Backend`] to [`Backend`] by applying /// a function to a contained value. 
pub(crate) fn map(self, f: impl FnOnce(T) -> R) -> Backend<'a, R> { match self { Self::ControlPlane(c, x) => Backend::ControlPlane(c, f(x)), Self::Local(l) => Backend::Local(l), } } } impl<'a, T, E> Backend<'a, Result> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. pub(crate) fn transpose(self) -> Result, E> { match self { Self::ControlPlane(c, x) => x.map(|x| Backend::ControlPlane(c, x)), Self::Local(l) => Ok(Backend::Local(l)), } } } pub(crate) struct ComputeCredentials { pub(crate) info: ComputeUserInfo, pub(crate) keys: ComputeCredentialKeys, } #[derive(Debug, Clone)] pub(crate) struct ComputeUserInfoNoEndpoint { pub(crate) user: RoleName, pub(crate) options: NeonOptions, } #[derive(Debug, Clone, Default, Serialize, Deserialize)] pub(crate) struct ComputeUserInfo { pub(crate) endpoint: EndpointId, pub(crate) user: RoleName, pub(crate) options: NeonOptions, } impl ComputeUserInfo { pub(crate) fn endpoint_cache_key(&self) -> EndpointCacheKey { self.options.get_cache_key(&self.endpoint) } } #[cfg_attr(test, derive(Debug))] pub(crate) enum ComputeCredentialKeys { AuthKeys(AuthKeys), JwtPayload(Vec), } impl TryFrom for ComputeUserInfo { // user name type Error = ComputeUserInfoNoEndpoint; fn try_from(user_info: ComputeUserInfoMaybeEndpoint) -> Result { match user_info.endpoint_id { None => Err(ComputeUserInfoNoEndpoint { user: user_info.user, options: user_info.options, }), Some(endpoint) => Ok(ComputeUserInfo { endpoint, user: user_info.user, options: user_info.options, }), } } } /// True to its name, this function encapsulates our current auth trade-offs. /// Here, we choose the appropriate auth flow based on circumstances. /// /// All authentication flows will emit an AuthenticationOk message if successful. 
async fn auth_quirks( ctx: &RequestContext, api: &impl control_plane::ControlPlaneApi, user_info: ComputeUserInfoMaybeEndpoint, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, endpoint_rate_limiter: Arc, ) -> auth::Result { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the endpoint (project) name. // We now expect to see a very specific payload in the place of password. let (info, unauthenticated_password) = match user_info.try_into() { Err(info) => { let (info, password) = hacks::password_hack_no_authentication(ctx, info, client).await?; ctx.set_endpoint_id(info.endpoint.clone()); (info, Some(password)) } Ok(info) => (info, None), }; debug!("fetching authentication info and allowlists"); let access_controls = api .get_endpoint_access_control(ctx, &info.endpoint, &info.user) .await?; access_controls.check( ctx, config.ip_allowlist_check_enabled, config.is_vpc_acccess_proxy, )?; access_controls.connection_attempt_rate_limit(ctx, &info.endpoint, &endpoint_rate_limiter)?; let role_access = api .get_role_access_control(ctx, &info.endpoint, &info.user) .await?; let secret = if let Some(secret) = role_access.secret { secret } else { // If we don't have an authentication secret, we mock one to // prevent malicious probing (possible due to missing protocol steps). // This mocked secret will never lead to successful authentication. 
info!("authentication info not found, mocking it"); AuthSecret::Scram(scram::ServerSecret::mock(rand::random())) }; match authenticate_with_secret( ctx, secret, info, client, unauthenticated_password, allow_cleartext, config, ) .await { Ok(keys) => Ok(keys), Err(e) => Err(e), } } async fn authenticate_with_secret( ctx: &RequestContext, secret: AuthSecret, info: ComputeUserInfo, client: &mut stream::PqStream>, unauthenticated_password: Option>, allow_cleartext: bool, config: &'static AuthenticationConfig, ) -> auth::Result { if let Some(password) = unauthenticated_password { let ep = EndpointIdInt::from(&info.endpoint); let role = RoleNameInt::from(&info.user); let auth_outcome = validate_password_and_exchange(&config.scram_thread_pool, ep, role, &password, secret) .await?; let keys = match auth_outcome { crate::sasl::Outcome::Success(key) => key, crate::sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); return Err(auth::AuthError::password_failed(&*info.user)); } }; // we have authenticated the password client.write_message(BeMessage::AuthenticationOk); return Ok(ComputeCredentials { info, keys }); } // -- the remaining flows are self-authenticating -- // Perform cleartext auth if we're allowed to do that. // Currently, we use it for websocket connections (latency). if allow_cleartext { ctx.set_auth_method(crate::context::AuthMethod::Cleartext); return hacks::authenticate_cleartext(ctx, info, client, secret, config).await; } // Finally, proceed with the main auth flow (SCRAM-based). classic::authenticate(ctx, info, client, config, secret).await } impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { /// Get username from the credentials. pub(crate) fn get_user(&self) -> &str { match self { Self::ControlPlane(_, user_info) => &user_info.user, Self::Local(_) => "local", } } /// Authenticate the client via the requested backend, possibly using credentials. 
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] pub(crate) async fn authenticate( self, ctx: &RequestContext, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, endpoint_rate_limiter: Arc, ) -> auth::Result> { let res = match self { Self::ControlPlane(api, user_info) => { debug!( user = &*user_info.user, project = user_info.endpoint(), "performing authentication using the console" ); let auth_res = auth_quirks( ctx, &*api, user_info.clone(), client, allow_cleartext, config, endpoint_rate_limiter, ) .await; match auth_res { Ok(credentials) => Ok(Backend::ControlPlane(api, credentials)), Err(e) => { // The password could have been changed, so we invalidate the cache. // We should only invalidate the cache if the TTL might have expired. if e.is_password_failed() && let ControlPlaneClient::ProxyV1(api) = &*api && let Some(ep) = &user_info.endpoint_id { api.caches .project_info .maybe_invalidate_role_secret(ep, &user_info.user); } Err(e) } } } Self::Local(_) => { return Err(auth::AuthError::bad_auth_method("invalid for local proxy")); } }; // TODO: replace with some metric info!("user successfully authenticated"); res } } impl Backend<'_, ComputeUserInfo> { pub(crate) async fn get_role_secret( &self, ctx: &RequestContext, ) -> Result { match self { Self::ControlPlane(api, user_info) => { api.get_role_access_control(ctx, &user_info.endpoint, &user_info.user) .await } Self::Local(_) => Ok(RoleAccessControl { secret: None }), } } pub(crate) async fn get_endpoint_access_control( &self, ctx: &RequestContext, ) -> Result { match self { Self::ControlPlane(api, user_info) => { api.get_endpoint_access_control(ctx, &user_info.endpoint, &user_info.user) .await } Self::Local(_) => Ok(EndpointAccessControl { allowed_ips: Arc::new(vec![]), allowed_vpce: Arc::new(vec![]), flags: AccessBlockerFlags::default(), rate_limits: EndpointRateLimitConfig::default(), }), } } } #[async_trait::async_trait] impl 
WakeComputeBackend for Backend<'_, ComputeUserInfo> { async fn wake_compute( &self, ctx: &RequestContext, ) -> Result { match self { Self::ControlPlane(api, info) => api.wake_compute(ctx, info).await, Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), } } } #[cfg(test)] mod tests { #![allow(clippy::unimplemented, clippy::unwrap_used)] use std::sync::Arc; use bytes::BytesMut; use control_plane::AuthSecret; use fallible_iterator::FallibleIterator; use once_cell::sync::Lazy; use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256}; use postgres_protocol::message::backend::Message as PgMessage; use postgres_protocol::message::frontend; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; use super::auth_quirks; use super::jwt::JwkCache; use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; use crate::cache::node_info::CachedNodeInfo; use crate::config::AuthenticationConfig; use crate::context::RequestContext; use crate::control_plane::messages::EndpointRateLimitConfig; use crate::control_plane::{ self, AccessBlockerFlags, EndpointAccessControl, RoleAccessControl, }; use crate::proxy::NeonOptions; use crate::rate_limiter::EndpointRateLimiter; use crate::scram::ServerSecret; use crate::scram::threadpool::ThreadPool; use crate::stream::{PqStream, Stream}; struct Auth { ips: Vec, vpc_endpoint_ids: Vec, access_blocker_flags: AccessBlockerFlags, secret: AuthSecret, } impl control_plane::ControlPlaneApi for Auth { async fn get_role_access_control( &self, _ctx: &RequestContext, _endpoint: &crate::types::EndpointId, _role: &crate::types::RoleName, ) -> Result { Ok(RoleAccessControl { secret: Some(self.secret.clone()), }) } async fn get_endpoint_access_control( &self, _ctx: &RequestContext, _endpoint: &crate::types::EndpointId, _role: &crate::types::RoleName, ) -> Result { Ok(EndpointAccessControl { allowed_ips: Arc::new(self.ips.clone()), allowed_vpce: Arc::new(self.vpc_endpoint_ids.clone()), flags: self.access_blocker_flags, 
rate_limits: EndpointRateLimitConfig::default(), }) } async fn get_endpoint_jwks( &self, _ctx: &RequestContext, _endpoint: &crate::types::EndpointId, ) -> Result, control_plane::errors::GetEndpointJwksError> { unimplemented!() } async fn wake_compute( &self, _ctx: &RequestContext, _user_info: &super::ComputeUserInfo, ) -> Result { unimplemented!() } } static CONFIG: Lazy = Lazy::new(|| AuthenticationConfig { jwks_cache: JwkCache::default(), scram_thread_pool: ThreadPool::new(1), scram_protocol_timeout: std::time::Duration::from_secs(5), ip_allowlist_check_enabled: true, is_vpc_acccess_proxy: false, is_auth_broker: false, accept_jwts: false, console_redirect_confirmation_timeout: std::time::Duration::from_secs(5), }); async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { loop { r.read_buf(&mut *b).await.unwrap(); if let Some(m) = PgMessage::parse(&mut *b).unwrap() { break m; } } } #[tokio::test] async fn auth_quirks_scram() { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new_skip_handshake(Stream::from_raw(server)); let ctx = RequestContext::test(); let api = Auth { ips: vec![], vpc_endpoint_ids: vec![], access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; let user_info = ComputeUserInfoMaybeEndpoint { user: "conrad".into(), endpoint_id: Some("endpoint".into()), options: NeonOptions::default(), }; let handle = tokio::spawn(async move { let mut scram = ScramSha256::new(b"my-secret-password", ChannelBinding::unsupported()); let mut read = BytesMut::new(); // server should offer scram match read_message(&mut client, &mut read).await { PgMessage::AuthenticationSasl(a) => { let options: Vec<&str> = a.mechanisms().collect().unwrap(); assert_eq!(options, ["SCRAM-SHA-256"]); } _ => panic!("wrong message"), } // client sends client-first-message let mut write = BytesMut::new(); 
frontend::sasl_initial_response("SCRAM-SHA-256", scram.message(), &mut write).unwrap(); client.write_all(&write).await.unwrap(); // server response with server-first-message match read_message(&mut client, &mut read).await { PgMessage::AuthenticationSaslContinue(a) => { scram.update(a.data()).await.unwrap(); } _ => panic!("wrong message"), } // client response with client-final-message write.clear(); frontend::sasl_response(scram.message(), &mut write).unwrap(); client.write_all(&write).await.unwrap(); // server response with server-final-message match read_message(&mut client, &mut read).await { PgMessage::AuthenticationSaslFinal(a) => { scram.finish(a.data()).unwrap(); } _ => panic!("wrong message"), } }); let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( EndpointRateLimiter::DEFAULT, 64, )); let _creds = auth_quirks( &ctx, &api, user_info, &mut stream, false, &CONFIG, endpoint_rate_limiter, ) .await .unwrap(); // flush the final server message stream.flush().await.unwrap(); handle.await.unwrap(); } #[tokio::test] async fn auth_quirks_cleartext() { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new_skip_handshake(Stream::from_raw(server)); let ctx = RequestContext::test(); let api = Auth { ips: vec![], vpc_endpoint_ids: vec![], access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; let user_info = ComputeUserInfoMaybeEndpoint { user: "conrad".into(), endpoint_id: Some("endpoint".into()), options: NeonOptions::default(), }; let handle = tokio::spawn(async move { let mut read = BytesMut::new(); let mut write = BytesMut::new(); // server should offer cleartext match read_message(&mut client, &mut read).await { PgMessage::AuthenticationCleartextPassword => {} _ => panic!("wrong message"), } // client responds with password write.clear(); frontend::password_message(b"my-secret-password", &mut write).unwrap(); 
client.write_all(&write).await.unwrap(); }); let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( EndpointRateLimiter::DEFAULT, 64, )); let _creds = auth_quirks( &ctx, &api, user_info, &mut stream, true, &CONFIG, endpoint_rate_limiter, ) .await .unwrap(); handle.await.unwrap(); } #[tokio::test] async fn auth_quirks_password_hack() { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new_skip_handshake(Stream::from_raw(server)); let ctx = RequestContext::test(); let api = Auth { ips: vec![], vpc_endpoint_ids: vec![], access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; let user_info = ComputeUserInfoMaybeEndpoint { user: "conrad".into(), endpoint_id: None, options: NeonOptions::default(), }; let handle = tokio::spawn(async move { let mut read = BytesMut::new(); // server should offer cleartext match read_message(&mut client, &mut read).await { PgMessage::AuthenticationCleartextPassword => {} _ => panic!("wrong message"), } // client responds with password let mut write = BytesMut::new(); frontend::password_message(b"endpoint=my-endpoint;my-secret-password", &mut write) .unwrap(); client.write_all(&write).await.unwrap(); }); let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( EndpointRateLimiter::DEFAULT, 64, )); let creds = auth_quirks( &ctx, &api, user_info, &mut stream, true, &CONFIG, endpoint_rate_limiter, ) .await .unwrap(); assert_eq!(creds.info.endpoint, "my-endpoint"); handle.await.unwrap(); } } ================================================ FILE: proxy/src/auth/credentials.rs ================================================ //! User credentials used in authentication. 
use std::collections::HashSet; use std::net::IpAddr; use std::str::FromStr; use itertools::Itertools; use thiserror::Error; use tracing::{debug, warn}; use crate::auth::password_hack::parse_endpoint_param; use crate::context::RequestContext; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, SniGroup, SniKind}; use crate::pqproto::StartupMessageParams; use crate::proxy::NeonOptions; use crate::serverless::{AUTH_BROKER_SNI, SERVERLESS_DRIVER_SNI}; use crate::types::{EndpointId, RoleName}; #[derive(Debug, Error, PartialEq, Eq, Clone)] pub(crate) enum ComputeUserInfoParseError { #[error("Parameter '{0}' is missing in startup packet.")] MissingKey(&'static str), #[error( "Inconsistent project name inferred from \ SNI ('{}') and project option ('{}').", .domain, .option, )] InconsistentProjectNames { domain: EndpointId, option: EndpointId, }, #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")] MalformedProjectName(EndpointId), } impl UserFacingError for ComputeUserInfoParseError {} impl ReportableError for ComputeUserInfoParseError { fn get_error_kind(&self) -> crate::error::ErrorKind { crate::error::ErrorKind::User } } /// Various client credentials which we use for authentication. /// Note that we don't store any kind of client key or password here. 
#[derive(Debug, Clone, PartialEq, Eq)] pub(crate) struct ComputeUserInfoMaybeEndpoint { pub(crate) user: RoleName, pub(crate) endpoint_id: Option, pub(crate) options: NeonOptions, } impl ComputeUserInfoMaybeEndpoint { #[inline] pub(crate) fn endpoint(&self) -> Option<&str> { self.endpoint_id.as_deref() } } pub(crate) fn endpoint_sni(sni: &str, common_names: &HashSet) -> Option { let (subdomain, common_name) = sni.split_once('.')?; if !common_names.contains(common_name) { return None; } if subdomain == SERVERLESS_DRIVER_SNI || subdomain == AUTH_BROKER_SNI { return None; } Some(EndpointId::from(subdomain)) } impl ComputeUserInfoMaybeEndpoint { pub(crate) fn parse( ctx: &RequestContext, params: &StartupMessageParams, sni: Option<&str>, common_names: Option<&HashSet>, ) -> Result { // Some parameters are stored in the startup message. let get_param = |key| { params .get(key) .ok_or(ComputeUserInfoParseError::MissingKey(key)) }; let user: RoleName = get_param("user")?.into(); // Project name might be passed via PG's command-line options. let endpoint_option = params .options_raw() .and_then(|options| { // We support both `project` (deprecated) and `endpoint` options for backward compatibility. // However, if both are present, we don't exactly know which one to use. // Therefore we require that only one of them is present. options .filter_map(parse_endpoint_param) .at_most_one() .ok()? }) .map(|name| name.into()); let endpoint_from_domain = sni.and_then(|sni_str| common_names.and_then(|cn| endpoint_sni(sni_str, cn))); let endpoint = match (endpoint_option, endpoint_from_domain) { // Invariant: if we have both project name variants, they should match. (Some(option), Some(domain)) if option != domain => { Some(Err(ComputeUserInfoParseError::InconsistentProjectNames { domain, option, })) } // Invariant: project name may not contain certain characters. 
(a, b) => a.or(b).map(|name| { if project_name_valid(name.as_ref()) { Ok(name) } else { Err(ComputeUserInfoParseError::MalformedProjectName(name)) } }), } .transpose()?; if let Some(ep) = &endpoint { ctx.set_endpoint_id(ep.clone()); } let metrics = Metrics::get(); debug!(%user, "credentials"); let protocol = ctx.protocol(); let kind = if sni.is_some() { debug!("Connection with sni"); SniKind::Sni } else if endpoint.is_some() { debug!("Connection without sni"); SniKind::NoSni } else { debug!("Connection with password hack"); SniKind::PasswordHack }; metrics .proxy .accepted_connections_by_sni .inc(SniGroup { protocol, kind }); let options = NeonOptions::parse_params(params); Ok(Self { user, endpoint_id: endpoint, options, }) } } pub(crate) fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool { ip_list.is_empty() || ip_list.iter().any(|pattern| check_ip(peer_addr, pattern)) } #[derive(Debug, Clone, Eq, PartialEq)] pub(crate) enum IpPattern { Subnet(ipnet::IpNet), Range(IpAddr, IpAddr), Single(IpAddr), None, } impl<'de> serde::de::Deserialize<'de> for IpPattern { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { struct StrVisitor; impl serde::de::Visitor<'_> for StrVisitor { type Value = IpPattern; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( formatter, "comma separated list with ip address, ip address range, or ip address subnet mask" ) } fn visit_str(self, v: &str) -> Result where E: serde::de::Error, { Ok(parse_ip_pattern(v).unwrap_or_else(|e| { warn!("Cannot parse ip pattern {v}: {e}"); IpPattern::None })) } } deserializer.deserialize_str(StrVisitor) } } impl FromStr for IpPattern { type Err = anyhow::Error; fn from_str(s: &str) -> Result { parse_ip_pattern(s) } } fn parse_ip_pattern(pattern: &str) -> anyhow::Result { if pattern.contains('/') { let subnet: ipnet::IpNet = pattern.parse()?; return Ok(IpPattern::Subnet(subnet)); } if let Some((start, end)) = 
pattern.split_once('-') { let start: IpAddr = start.parse()?; let end: IpAddr = end.parse()?; return Ok(IpPattern::Range(start, end)); } let addr: IpAddr = pattern.parse()?; Ok(IpPattern::Single(addr)) } fn check_ip(ip: &IpAddr, pattern: &IpPattern) -> bool { match pattern { IpPattern::Subnet(subnet) => subnet.contains(ip), IpPattern::Range(start, end) => start <= ip && ip <= end, IpPattern::Single(addr) => addr == ip, IpPattern::None => false, } } fn project_name_valid(name: &str) -> bool { name.chars().all(|c| c.is_alphanumeric() || c == '-') } #[cfg(test)] mod tests { use ComputeUserInfoParseError::*; use serde_json::json; use super::*; #[test] fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. let options = StartupMessageParams::new([("user", "john_doe")]); let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); Ok(()) } #[test] fn parse_excessive() -> anyhow::Result<()> { let options = StartupMessageParams::new([ ("user", "john_doe"), ("database", "world"), // should be ignored ("foo", "bar"), // should be ignored ]); let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); Ok(()) } #[test] fn parse_project_from_sni() -> anyhow::Result<()> { let options = StartupMessageParams::new([("user", "john_doe")]); let sni = Some("foo.localhost"); let common_names = Some(["localhost".into()].into()); let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("foo")); assert_eq!(user_info.options.get_cache_key("foo"), "foo"); Ok(()) } #[test] fn parse_project_from_options() -> 
anyhow::Result<()> { let options = StartupMessageParams::new([ ("user", "john_doe"), ("options", "-ckey=1 project=bar -c geqo=off"), ]); let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); Ok(()) } #[test] fn parse_endpoint_from_options() -> anyhow::Result<()> { let options = StartupMessageParams::new([ ("user", "john_doe"), ("options", "-ckey=1 endpoint=bar -c geqo=off"), ]); let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); Ok(()) } #[test] fn parse_three_endpoints_from_options() -> anyhow::Result<()> { let options = StartupMessageParams::new([ ("user", "john_doe"), ( "options", "-ckey=1 endpoint=one endpoint=two endpoint=three -c geqo=off", ), ]); let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); Ok(()) } #[test] fn parse_when_endpoint_and_project_are_in_options() -> anyhow::Result<()> { let options = StartupMessageParams::new([ ("user", "john_doe"), ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"), ]); let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); Ok(()) } #[test] fn parse_projects_identical() -> anyhow::Result<()> { let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=baz")]); let sni = Some("baz.localhost"); let common_names = Some(["localhost".into()].into()); let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; 
assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("baz")); Ok(()) } #[test] fn parse_multi_common_names() -> anyhow::Result<()> { let options = StartupMessageParams::new([("user", "john_doe")]); let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.a.com"); let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.b.com"); let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); Ok(()) } #[test] fn parse_projects_different() { let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=first")]); let sni = Some("second.localhost"); let common_names = Some(["localhost".into()].into()); let ctx = RequestContext::test(); let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) .expect_err("should fail"); match err { InconsistentProjectNames { domain, option } => { assert_eq!(option, "first"); assert_eq!(domain, "second"); } _ => panic!("bad error: {err:?}"), } } #[test] fn parse_unknown_sni() { let options = StartupMessageParams::new([("user", "john_doe")]); let sni = Some("project.localhost"); let common_names = Some(["example.com".into()].into()); let ctx = RequestContext::test(); let info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) .unwrap(); assert!(info.endpoint_id.is_none()); } #[test] fn parse_unknown_sni_with_options() { let options = StartupMessageParams::new([ ("user", "john_doe"), ("options", "endpoint=foo-bar-baz-1234"), ]); let sni = Some("project.localhost"); let common_names = Some(["example.com".into()].into()); let ctx = RequestContext::test(); 
let info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) .unwrap(); assert_eq!(info.endpoint_id.as_deref(), Some("foo-bar-baz-1234")); } #[test] fn parse_neon_options() -> anyhow::Result<()> { let options = StartupMessageParams::new([ ("user", "john_doe"), ("options", "neon_lsn:0/2 neon_endpoint_type:read_write"), ]); let sni = Some("project.localhost"); let common_names = Some(["localhost".into()].into()); let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("project")); assert_eq!( user_info.options.get_cache_key("project"), "project endpoint_type:read_write lsn:0/2" ); Ok(()) } #[test] fn test_check_peer_addr_is_in_list() { fn check(v: serde_json::Value) -> bool { let peer_addr = IpAddr::from([127, 0, 0, 1]); let ip_list: Vec = serde_json::from_value(v).unwrap(); check_peer_addr_is_in_list(&peer_addr, &ip_list) } assert!(check(json!([]))); assert!(check(json!(["127.0.0.1"]))); assert!(!check(json!(["8.8.8.8"]))); // If there is an incorrect address, it will be skipped. assert!(check(json!(["88.8.8", "127.0.0.1"]))); } #[test] fn test_parse_ip_v4() -> anyhow::Result<()> { let peer_addr = IpAddr::from([127, 0, 0, 1]); // Ok assert_eq!(parse_ip_pattern("127.0.0.1")?, IpPattern::Single(peer_addr)); assert_eq!( parse_ip_pattern("127.0.0.1/31")?, IpPattern::Subnet(ipnet::IpNet::new(peer_addr, 31)?) 
); assert_eq!( parse_ip_pattern("0.0.0.0-200.0.1.2")?, IpPattern::Range(IpAddr::from([0, 0, 0, 0]), IpAddr::from([200, 0, 1, 2])) ); // Error assert!(parse_ip_pattern("300.0.1.2").is_err()); assert!(parse_ip_pattern("30.1.2").is_err()); assert!(parse_ip_pattern("127.0.0.1/33").is_err()); assert!(parse_ip_pattern("127.0.0.1-127.0.3").is_err()); assert!(parse_ip_pattern("1234.0.0.1-127.0.3.0").is_err()); Ok(()) } #[test] fn test_check_ipv4() -> anyhow::Result<()> { let peer_addr = IpAddr::from([127, 0, 0, 1]); let peer_addr_next = IpAddr::from([127, 0, 0, 2]); let peer_addr_prev = IpAddr::from([127, 0, 0, 0]); // Success assert!(check_ip(&peer_addr, &IpPattern::Single(peer_addr))); assert!(check_ip( &peer_addr, &IpPattern::Subnet(ipnet::IpNet::new(peer_addr_prev, 31)?) )); assert!(check_ip( &peer_addr, &IpPattern::Subnet(ipnet::IpNet::new(peer_addr_next, 30)?) )); assert!(check_ip( &peer_addr, &IpPattern::Range(IpAddr::from([0, 0, 0, 0]), IpAddr::from([200, 0, 1, 2])) )); assert!(check_ip( &peer_addr, &IpPattern::Range(peer_addr, peer_addr) )); // Not success assert!(!check_ip(&peer_addr, &IpPattern::Single(peer_addr_prev))); assert!(!check_ip( &peer_addr, &IpPattern::Subnet(ipnet::IpNet::new(peer_addr_next, 31)?) )); assert!(!check_ip( &peer_addr, &IpPattern::Range(IpAddr::from([0, 0, 0, 0]), peer_addr_prev) )); assert!(!check_ip( &peer_addr, &IpPattern::Range(peer_addr_next, IpAddr::from([128, 0, 0, 0])) )); // There is no check that for range start <= end. But it's fine as long as for all this cases the result is false. 
assert!(!check_ip( &peer_addr, &IpPattern::Range(peer_addr, peer_addr_prev) )); Ok(()) } #[test] fn test_connection_blocker() { fn check(v: serde_json::Value) -> bool { let peer_addr = IpAddr::from([127, 0, 0, 1]); let ip_list: Vec = serde_json::from_value(v).unwrap(); check_peer_addr_is_in_list(&peer_addr, &ip_list) } assert!(check(json!([]))); assert!(check(json!(["127.0.0.1"]))); assert!(!check(json!(["255.255.255.255"]))); } } ================================================ FILE: proxy/src/auth/flow.rs ================================================ //! Main authentication flow. use std::sync::Arc; use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use super::backend::ComputeCredentialKeys; use super::{AuthError, PasswordHackPayload}; use crate::context::RequestContext; use crate::control_plane::AuthSecret; use crate::intern::{EndpointIdInt, RoleNameInt}; use crate::pqproto::{BeAuthenticationSaslMessage, BeMessage}; use crate::sasl; use crate::scram::threadpool::ThreadPool; use crate::scram::{self}; use crate::stream::{PqStream, Stream}; use crate::tls::TlsServerEndPoint; /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. pub(crate) struct Scram<'a>( pub(crate) &'a scram::ServerSecret, pub(crate) &'a RequestContext, ); impl Scram<'_> { #[inline(always)] fn first_message(&self, channel_binding: bool) -> BeMessage<'_> { if channel_binding { BeMessage::AuthenticationSasl(BeAuthenticationSaslMessage::Methods(scram::METHODS)) } else { BeMessage::AuthenticationSasl(BeAuthenticationSaslMessage::Methods( scram::METHODS_WITHOUT_PLUS, )) } } } /// Use an ad hoc auth flow (for clients which don't support SNI) proposed in /// . 
pub(crate) struct PasswordHack; /// Use clear-text password auth called `password` in docs /// pub(crate) struct CleartextPassword { pub(crate) pool: Arc, pub(crate) endpoint: EndpointIdInt, pub(crate) role: RoleNameInt, pub(crate) secret: AuthSecret, } /// This wrapper for [`PqStream`] performs client authentication. #[must_use] pub(crate) struct AuthFlow<'a, S, State> { /// The underlying stream which implements libpq's protocol. stream: &'a mut PqStream>, /// State might contain ancillary data. state: State, tls_server_end_point: TlsServerEndPoint, } /// Initial state of the stream wrapper. impl<'a, S: AsyncRead + AsyncWrite + Unpin, M> AuthFlow<'a, S, M> { /// Create a new wrapper for client authentication. pub(crate) fn new(stream: &'a mut PqStream>, method: M) -> Self { let tls_server_end_point = stream.get_ref().tls_server_end_point(); Self { stream, state: method, tls_server_end_point, } } } impl AuthFlow<'_, S, PasswordHack> { /// Perform user authentication. Raise an error in case authentication failed. pub(crate) async fn get_password(self) -> super::Result { self.stream .write_message(BeMessage::AuthenticationCleartextPassword); self.stream.flush().await?; let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) .ok_or(AuthError::MalformedPassword("missing terminator"))?; let payload = PasswordHackPayload::parse(password) // If we ended up here and the payload is malformed, it means that // the user neither enabled SNI nor resorted to any other method // for passing the project name we rely on. We should show them // the most helpful error message and point to the documentation. .ok_or(AuthError::MissingEndpointName)?; Ok(payload) } } impl AuthFlow<'_, S, CleartextPassword> { /// Perform user authentication. Raise an error in case authentication failed. 
pub(crate) async fn authenticate(self) -> super::Result> { self.stream .write_message(BeMessage::AuthenticationCleartextPassword); self.stream.flush().await?; let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) .ok_or(AuthError::MalformedPassword("missing terminator"))?; let outcome = validate_password_and_exchange( &self.state.pool, self.state.endpoint, self.state.role, password, self.state.secret, ) .await?; if let sasl::Outcome::Success(_) = &outcome { self.stream.write_message(BeMessage::AuthenticationOk); } Ok(outcome) } } /// Stream wrapper for handling [SCRAM](crate::scram) auth. impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. pub(crate) async fn authenticate(self) -> super::Result> { let Scram(secret, ctx) = self.state; let channel_binding = self.tls_server_end_point; // send sasl message. { // pause the timer while we communicate with the client let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let sasl = self.state.first_message(channel_binding.supported()); self.stream.write_message(sasl); self.stream.flush().await?; } // complete sasl handshake. sasl::authenticate(ctx, self.stream, |method| { // Currently, the only supported SASL method is SCRAM. 
match method { SCRAM_SHA_256 => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256), SCRAM_SHA_256_PLUS => { ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus); } method => return Err(sasl::Error::BadAuthMethod(method.into())), } // TODO: make this a metric instead info!("client chooses {}", method); Ok(scram::Exchange::new(secret, rand::random, channel_binding)) }) .await .map_err(AuthError::Sasl) } } pub(crate) async fn validate_password_and_exchange( pool: &ThreadPool, endpoint: EndpointIdInt, role: RoleNameInt, password: &[u8], secret: AuthSecret, ) -> super::Result> { match secret { // perform scram authentication as both client and server to validate the keys AuthSecret::Scram(scram_secret) => { let outcome = crate::scram::exchange(pool, endpoint, role, &scram_secret, password).await?; let client_key = match outcome { sasl::Outcome::Success(client_key) => client_key, sasl::Outcome::Failure(reason) => return Ok(sasl::Outcome::Failure(reason)), }; let keys = crate::compute::ScramKeys { client_key: client_key.as_bytes(), server_key: scram_secret.server_key.as_bytes(), }; Ok(sasl::Outcome::Success(ComputeCredentialKeys::AuthKeys( postgres_client::config::AuthKeys::ScramSha256(keys), ))) } } } ================================================ FILE: proxy/src/auth/mod.rs ================================================ //! Client authentication mechanisms. 
pub mod backend; pub use backend::Backend; mod credentials; pub(crate) use credentials::{ ComputeUserInfoMaybeEndpoint, ComputeUserInfoParseError, IpPattern, check_peer_addr_is_in_list, endpoint_sni, }; mod password_hack; use password_hack::PasswordHackPayload; pub(crate) use password_hack::parse_endpoint_param; mod flow; use std::io; use std::net::IpAddr; pub(crate) use flow::*; use thiserror::Error; use tokio::time::error::Elapsed; use crate::auth::backend::jwt::JwtError; use crate::control_plane; use crate::error::{ReportableError, UserFacingError}; /// Convenience wrapper for the authentication error. pub(crate) type Result = std::result::Result; /// Common authentication error. #[derive(Debug, Error)] pub(crate) enum AuthError { #[error(transparent)] ConsoleRedirect(#[from] backend::ConsoleRedirectError), #[error(transparent)] GetAuthInfo(#[from] control_plane::errors::GetAuthInfoError), /// SASL protocol errors (includes [SCRAM](crate::scram)). #[error(transparent)] Sasl(#[from] crate::sasl::Error), #[error("Unsupported authentication method: {0}")] BadAuthMethod(Box), #[error("Malformed password message: {0}")] MalformedPassword(&'static str), #[error( "Endpoint ID is not specified. \ Either please upgrade the postgres client library (libpq) for SNI support \ or pass the endpoint ID (first part of the domain name) as a parameter: '?options=endpoint%3D'. \ See more at https://neon.tech/sni" )] MissingEndpointName, #[error( "VPC endpoint ID is not specified. \ This endpoint requires a VPC endpoint ID to connect." )] MissingVPCEndpointId, #[error("password authentication failed for user '{0}'")] PasswordFailed(Box), /// Errors produced by e.g. [`crate::stream::PqStream`]. #[error(transparent)] Io(#[from] io::Error), #[error( "This IP address {0} is not allowed to connect to this endpoint. \ Please add it to the allowed list in the Neon console. \ Make sure to check for IPv4 or IPv6 addresses." 
)] IpAddressNotAllowed(IpAddr), #[error("This connection is trying to access this endpoint from a blocked network.")] NetworkNotAllowed, #[error( "This VPC endpoint id {0} is not allowed to connect to this endpoint. \ Please add it to the allowed list in the Neon console." )] VpcEndpointIdNotAllowed(String), #[error("Too many connections to this endpoint. Please try again later.")] TooManyConnections, #[error("Authentication timed out")] UserTimeout(Elapsed), #[error("Disconnected due to inactivity after {0}.")] ConfirmationTimeout(humantime::Duration), #[error(transparent)] Jwt(#[from] JwtError), } impl AuthError { pub(crate) fn bad_auth_method(name: impl Into>) -> Self { AuthError::BadAuthMethod(name.into()) } pub(crate) fn password_failed(user: impl Into>) -> Self { AuthError::PasswordFailed(user.into()) } pub(crate) fn ip_address_not_allowed(ip: IpAddr) -> Self { AuthError::IpAddressNotAllowed(ip) } pub(crate) fn vpc_endpoint_id_not_allowed(id: String) -> Self { AuthError::VpcEndpointIdNotAllowed(id) } pub(crate) fn too_many_connections() -> Self { AuthError::TooManyConnections } pub(crate) fn is_password_failed(&self) -> bool { matches!(self, AuthError::PasswordFailed(_)) } pub(crate) fn user_timeout(elapsed: Elapsed) -> Self { AuthError::UserTimeout(elapsed) } pub(crate) fn confirmation_timeout(timeout: humantime::Duration) -> Self { AuthError::ConfirmationTimeout(timeout) } } impl UserFacingError for AuthError { fn to_string_client(&self) -> String { match self { Self::ConsoleRedirect(e) => e.to_string_client(), Self::GetAuthInfo(e) => e.to_string_client(), Self::Sasl(e) => e.to_string_client(), Self::PasswordFailed(_) => self.to_string(), Self::BadAuthMethod(_) => self.to_string(), Self::MalformedPassword(_) => self.to_string(), Self::MissingEndpointName => self.to_string(), Self::MissingVPCEndpointId => self.to_string(), Self::Io(_) => "Internal error".to_string(), Self::IpAddressNotAllowed(_) => self.to_string(), Self::NetworkNotAllowed => 
self.to_string(), Self::VpcEndpointIdNotAllowed(_) => self.to_string(), Self::TooManyConnections => self.to_string(), Self::UserTimeout(_) => self.to_string(), Self::ConfirmationTimeout(_) => self.to_string(), Self::Jwt(_) => self.to_string(), } } } impl ReportableError for AuthError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { Self::ConsoleRedirect(e) => e.get_error_kind(), Self::GetAuthInfo(e) => e.get_error_kind(), Self::Sasl(e) => e.get_error_kind(), Self::PasswordFailed(_) => crate::error::ErrorKind::User, Self::BadAuthMethod(_) => crate::error::ErrorKind::User, Self::MalformedPassword(_) => crate::error::ErrorKind::User, Self::MissingEndpointName => crate::error::ErrorKind::User, Self::MissingVPCEndpointId => crate::error::ErrorKind::User, Self::Io(_) => crate::error::ErrorKind::ClientDisconnect, Self::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, Self::NetworkNotAllowed => crate::error::ErrorKind::User, Self::VpcEndpointIdNotAllowed(_) => crate::error::ErrorKind::User, Self::TooManyConnections => crate::error::ErrorKind::RateLimit, Self::UserTimeout(_) => crate::error::ErrorKind::User, Self::ConfirmationTimeout(_) => crate::error::ErrorKind::User, Self::Jwt(_) => crate::error::ErrorKind::User, } } } ================================================ FILE: proxy/src/auth/password_hack.rs ================================================ //! Payload for ad hoc authentication method for clients that don't support SNI. //! See the `impl` for [`super::backend::Backend`]. //! Read more: . //! UPDATE (Mon Aug 8 13:20:34 UTC 2022): the payload format has been simplified. use bstr::ByteSlice; use crate::types::EndpointId; pub(crate) struct PasswordHackPayload { pub(crate) endpoint: EndpointId, pub(crate) password: Vec, } impl PasswordHackPayload { pub(crate) fn parse(bytes: &[u8]) -> Option { // The format is `project=;` or `project=$`. 
let separators = [";", "$"]; for sep in separators { if let Some((endpoint, password)) = bytes.split_once_str(sep) { let endpoint = endpoint.to_str().ok()?; return Some(Self { endpoint: parse_endpoint_param(endpoint)?.into(), password: password.to_owned(), }); } } None } } pub(crate) fn parse_endpoint_param(bytes: &str) -> Option<&str> { bytes .strip_prefix("project=") .or_else(|| bytes.strip_prefix("endpoint=")) } #[cfg(test)] mod tests { use super::*; #[test] fn parse_endpoint_param_fn() { let input = ""; assert!(parse_endpoint_param(input).is_none()); let input = "project="; assert_eq!(parse_endpoint_param(input), Some("")); let input = "project=foobar"; assert_eq!(parse_endpoint_param(input), Some("foobar")); let input = "endpoint="; assert_eq!(parse_endpoint_param(input), Some("")); let input = "endpoint=foobar"; assert_eq!(parse_endpoint_param(input), Some("foobar")); let input = "other_option=foobar"; assert!(parse_endpoint_param(input).is_none()); } #[test] fn parse_password_hack_payload_project() { let bytes = b""; assert!(PasswordHackPayload::parse(bytes).is_none()); let bytes = b"project="; assert!(PasswordHackPayload::parse(bytes).is_none()); let bytes = b"project=;"; let payload: PasswordHackPayload = PasswordHackPayload::parse(bytes).expect("parsing failed"); assert_eq!(payload.endpoint, ""); assert_eq!(payload.password, b""); let bytes = b"project=foobar;pass;word"; let payload = PasswordHackPayload::parse(bytes).expect("parsing failed"); assert_eq!(payload.endpoint, "foobar"); assert_eq!(payload.password, b"pass;word"); } #[test] fn parse_password_hack_payload_endpoint() { let bytes = b""; assert!(PasswordHackPayload::parse(bytes).is_none()); let bytes = b"endpoint="; assert!(PasswordHackPayload::parse(bytes).is_none()); let bytes = b"endpoint=;"; let payload = PasswordHackPayload::parse(bytes).expect("parsing failed"); assert_eq!(payload.endpoint, ""); assert_eq!(payload.password, b""); let bytes = b"endpoint=foobar;pass;word"; let payload = 
PasswordHackPayload::parse(bytes).expect("parsing failed"); assert_eq!(payload.endpoint, "foobar"); assert_eq!(payload.password, b"pass;word"); } #[test] fn parse_password_hack_payload_dollar() { let bytes = b""; assert!(PasswordHackPayload::parse(bytes).is_none()); let bytes = b"endpoint="; assert!(PasswordHackPayload::parse(bytes).is_none()); let bytes = b"endpoint=$"; let payload = PasswordHackPayload::parse(bytes).expect("parsing failed"); assert_eq!(payload.endpoint, ""); assert_eq!(payload.password, b""); let bytes = b"endpoint=foobar$pass$word"; let payload = PasswordHackPayload::parse(bytes).expect("parsing failed"); assert_eq!(payload.endpoint, "foobar"); assert_eq!(payload.password, b"pass$word"); } } ================================================ FILE: proxy/src/batch.rs ================================================ //! Batch processing system based on intrusive linked lists. //! //! Enqueuing a batch job requires no allocations, with //! direct support for cancelling jobs early. use std::collections::BTreeMap; use std::pin::pin; use std::sync::Mutex; use scopeguard::ScopeGuard; use tokio::sync::oneshot; use tokio::sync::oneshot::error::TryRecvError; use crate::ext::LockExt; type ProcResult

= Result<

::Res,

::Err>; pub trait QueueProcessing: Send + 'static { type Req: Send + 'static; type Res: Send; type Err: Send + Clone; /// Get the desired batch size. fn batch_size(&self, queue_size: usize) -> usize; /// This applies a full batch of events. /// Must respond with a full batch of replies. /// /// If this apply can error, it's expected that errors be forwarded to each Self::Res. /// /// Batching does not need to happen atomically. fn apply( &mut self, req: Vec, ) -> impl Future, Self::Err>> + Send; } #[derive(thiserror::Error)] pub enum BatchQueueError { #[error(transparent)] Result(E), #[error(transparent)] Cancelled(C), } pub struct BatchQueue { processor: tokio::sync::Mutex

, inner: Mutex>, }

// NOTE(review): several generic parameter lists in this section (`Mutex>`, `Sender>`,
// `impl BatchQueue`, `Result>`, `Vec,` ...) appear to have been stripped by text
// extraction. The code is kept exactly as found; confirm against the upstream file.

/// A queued request paired with the channel on which its result is delivered.
struct BatchJob {
    /// Request payload that is handed to the processor as part of a batch.
    req: P::Req,
    /// One-shot sender used by the batch leader to publish this job's result.
    res: tokio::sync::oneshot::Sender>,
}

impl BatchQueue

{
    /// Create an empty queue that drives the processor `p`.
    ///
    /// The processor is wrapped in an async (tokio) mutex; the job queue and its id
    /// counter live behind a separate mutex that is locked without awaiting.
    pub fn new(p: P) -> Self {
        Self {
            processor: tokio::sync::Mutex::new(p),
            inner: Mutex::new(BatchQueueInner {
                version: 0,
                queue: BTreeMap::new(),
            }),
        }
    }

    /// Perform a single request-response process, this may be batched internally.
    ///
    /// The request is first registered in the queue. The caller then races three events:
    /// acquiring the processor lock (it becomes the "leader" and processes a batch on
    /// behalf of queued jobs), receiving its own response from another leader, or the
    /// `cancelled` future completing. On cancellation the job is removed from the queue
    /// (if it is still there) and `BatchQueueError::Cancelled` is returned.
    ///
    /// Panics (via `expect`) if the response channel is closed without a value.
    ///
    /// This function is not cancel safe.
    pub async fn call(
        &self,
        req: P::Req,
        cancelled: impl Future,
    ) -> Result> {
        let (id, mut rx) = self.inner.lock_propagate_poison().register_job(req);

        let mut cancelled = pin!(cancelled);
        let resp: Option> = loop {
            // try become the leader, or try wait for success.
            let mut processor = tokio::select! {
                // try become leader.
                p = self.processor.lock() => p,
                // wait for success.
                resp = &mut rx => break resp.ok(),
                // wait for cancellation.
                cancel = cancelled.as_mut() => {
                    let mut inner = self.inner.lock_propagate_poison();
                    // `remove` returns `Some` only if no leader has taken the job yet.
                    if inner.queue.remove(&id).is_some() {
                        tracing::warn!("batched task cancelled before completion");
                    }
                    return Err(BatchQueueError::Cancelled(cancel));
                },
            };

            tracing::debug!(id, "batch: became leader");
            // `reqs` and `resps` are index-aligned: entry i of both comes from the same job.
            let (reqs, resps) = self.inner.lock_propagate_poison().get_batch(&processor);

            // snitch in case the task gets cancelled: the guard's closure runs if this
            // future is dropped (and the thread is not panicking) before it is defused below.
            let cancel_safety = scopeguard::guard((), |()| {
                if !std::thread::panicking() {
                    tracing::error!(
                        id,
                        "batch: leader cancelled, despite not being cancellation safe"
                    );
                }
            });

            // apply a batch.
            // if this is cancelled, jobs will not be completed and will panic.
            let values = processor.apply(reqs).await;

            // good: we didn't get cancelled. Defuse the guard without running its closure.
            ScopeGuard::into_inner(cancel_safety);

            match values {
                Ok(values) => {
                    if values.len() != resps.len() {
                        tracing::error!(
                            "batch: invalid response size, expected={}, got={}",
                            resps.len(),
                            values.len()
                        );
                    }

                    // send response values.
                    // `zip` stops at the shorter side, so on a size mismatch the surplus
                    // senders are dropped without a value being sent.
                    for (tx, value) in std::iter::zip(resps, values) {
                        if tx.send(Ok(value)).is_err() {
                            // receiver hung up but that's fine.
                        }
                    }
                }

                Err(err) => {
                    // every job of the failed batch receives a clone of the same error.
                    for tx in resps {
                        if tx.send(Err(err.clone())).is_err() {
                            // receiver hung up but that's fine.
                        }
                    }
                }
            }

            // check whether our own job was part of the batch we just processed.
            match rx.try_recv() {
                Ok(resp) => break Some(resp),
                Err(TryRecvError::Closed) => break None,
                // edge case - there was a race condition where
                // we became the leader but were not in the batch.
                //
                // Example:
                // thread 1: register job id=1
                // thread 2: register job id=2
                // thread 2: processor.lock().await
                // thread 1: processor.lock().await
                // thread 2: becomes leader, batch_size=1, jobs=[1].
                Err(TryRecvError::Empty) => {}
            }
            // `processor` (the lock guard) goes out of scope here, so the next loop
            // iteration competes for leadership again.
        };

        tracing::debug!(id, "batch: job completed");

        resp.expect("no response found. batch processer should not panic")
            .map_err(BatchQueueError::Result)
    }
}

/// Queue state: pending jobs keyed by a monotonically assigned id.
struct BatchQueueInner {
    /// Id that will be given to the next registered job.
    version: u64,
    /// Pending jobs ordered by id.
    queue: BTreeMap>,
}

impl BatchQueueInner

{
    /// Enqueue `req` under a fresh id and return that id together with the receiver
    /// on which the job's result will arrive.
    fn register_job(&mut self, req: P::Req) -> (u64, oneshot::Receiver>) {
        let (tx, rx) = oneshot::channel();

        let id = self.version;

        // Overflow concern:
        // This is a u64, and we might enqueue 2^16 tasks per second.
        // This gives us 2^48 seconds (9 million years).
        // Even if this does overflow, it will not break, but some
        // jobs with the higher version might never get prioritised.
        self.version += 1;

        self.queue.insert(id, BatchJob { req, res: tx });

        tracing::debug!(id, "batch: registered job in the queue");

        (id, rx)
    }

    /// Remove up to `p.batch_size(queue_len)` jobs from the front of the queue
    /// (smallest ids first) and return their requests and response senders as two
    /// index-aligned vectors.
    fn get_batch(&mut self, p: &P) -> (Vec, Vec>>) {
        let batch_size = p.batch_size(self.queue.len());
        let mut reqs = Vec::with_capacity(batch_size);
        let mut resps = Vec::with_capacity(batch_size);
        // ids are collected only for the debug log below.
        let mut ids = Vec::with_capacity(batch_size);

        while reqs.len() < batch_size {
            let Some((id, job)) = self.queue.pop_first() else {
                break;
            };
            reqs.push(job.req);
            resps.push(job.res);
            ids.push(id);
        }

        tracing::debug!(ids=?ids, "batch: acquired jobs");

        (reqs, resps)
    }
}

================================================
FILE: proxy/src/bin/local_proxy.rs
================================================
// Use jemalloc as the global allocator for this binary.
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

/// Binary entry point: delegates to `proxy::binary::local_proxy::run`.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    proxy::binary::local_proxy::run().await
}

================================================
FILE: proxy/src/bin/pg_sni_router.rs
================================================
//! A stand-alone program that routes connections, e.g. from
//! `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`.
//!
//! This allows connecting to pods/services running in the same Kubernetes cluster from
//! the outside. Similar to an ingress controller for HTTPS.
/// Binary entry point: delegates to `proxy::binary::pg_sni_router::run`.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    proxy::binary::pg_sni_router::run().await
}

================================================
FILE: proxy/src/bin/proxy.rs
================================================
// Use jemalloc as the global allocator for this binary.
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

// NUL-terminated option string exported under the unmangled symbol name `malloc_conf`.
// NOTE(review): presumably picked up by jemalloc at startup to enable heap profiling
// (`prof`, `prof_active`) with a 2^21-byte average sampling interval
// (`lg_prof_sample:21`) — confirm against the jemalloc option reference.
#[allow(non_upper_case_globals)]
#[unsafe(export_name = "malloc_conf")]
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";

/// Binary entry point: delegates to `proxy::binary::proxy::run`.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    proxy::binary::proxy::run().await
}

================================================
FILE: proxy/src/binary/local_proxy.rs
================================================
use std::env;
use std::net::SocketAddr;
use std::pin::pin;
use std::sync::Arc;
use std::time::Duration;

use anyhow::bail;
use arc_swap::ArcSwapOption;
use camino::Utf8PathBuf;
use clap::Parser;
use futures::future::Either;
use tokio::net::TcpListener;
use tokio::sync::Notify;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info};
use utils::sentry_init::init_sentry;
use utils::{pid_file, project_build_tag, project_git_version};

use crate::auth::backend::jwt::JwkCache;
use crate::auth::backend::local::LocalBackend;
use crate::auth::{self};
use crate::cancellation::CancellationHandler;
#[cfg(feature = "rest_broker")]
use crate::config::RestConfig;
use crate::config::{
    self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig,
    refresh_config_loop,
};
use crate::control_plane::locks::ApiLocks;
use crate::http::health_server::AppMetrics;
use crate::metrics::{Metrics, ServiceInfo};
use crate::rate_limiter::{EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo};
use crate::scram::threadpool::ThreadPool;
use crate::serverless::cancel_set::CancelSet;
use crate::serverless::{self, GlobalConnPoolOptions};
use crate::tls::client_config::compute_client_config_with_root_certs;
use crate::url::ApiUrl;
project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);

// NOTE: the `///` comments on the clap structs below double as `--help` text, so
// maintainer-only remarks in this section are written as plain `//` comments.

/// Neon proxy/router
#[derive(Parser)]
#[command(version = GIT_VERSION, about)]
struct LocalProxyCliArgs {
    /// listen for incoming metrics connections on ip:port
    #[clap(long, default_value = "127.0.0.1:7001")]
    metrics: String,
    /// listen for incoming http connections on ip:port
    #[clap(long)]
    http: String,
    /// timeout for the TLS handshake
    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
    handshake_timeout: tokio::time::Duration,
    /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
    #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)]
    connect_compute_lock: String,
    #[clap(flatten)]
    sql_over_http: SqlOverHttpArgs,
    /// User rate limiter max number of requests per second.
    ///
    /// Provided in the form `@`.
    /// Can be given multiple times for different bucket sizes.
    // NOTE(review): the element type of this `Vec` and the placeholders in the help
    // text above look like they were lost in text extraction (the default comes from
    // `RateBucketInfo::DEFAULT_ENDPOINT_SET`) — confirm against the upstream file.
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
    user_rps_limit: Vec,
    /// Whether to retry the connection to the compute node
    #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
    connect_to_compute_retry: String,
    /// Address of the postgres server
    #[clap(long, default_value = "127.0.0.1:5432")]
    postgres: SocketAddr,
    /// Address of the internal compute-ctl api service
    #[clap(long, default_value = "http://127.0.0.1:3081/")]
    compute_ctl: ApiUrl,
    /// Path of the local proxy config file
    #[clap(long, default_value = "./local_proxy.json")]
    config_path: Utf8PathBuf,
    /// Path of the local proxy PID file
    #[clap(long, default_value = "./local_proxy.pid")]
    pid_path: Utf8PathBuf,
    /// Disable pg_session_jwt extension installation
    /// This is useful for testing the local proxy with vanilla postgres.
    #[clap(long, default_value = "false")]
    #[cfg(feature = "testing")]
    disable_pg_session_jwt: bool,
}

// SQL-over-HTTP tuning knobs, flattened into `LocalProxyCliArgs`.
#[derive(clap::Args, Clone, Copy, Debug)]
struct SqlOverHttpArgs {
    /// How many connections to pool for each endpoint. Excess connections are discarded
    #[clap(long, default_value_t = 200)]
    sql_over_http_pool_max_total_conns: usize,

    /// How long pooled connections should remain idle for before closing
    #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
    sql_over_http_idle_timeout: tokio::time::Duration,

    #[clap(long, default_value_t = 100)]
    sql_over_http_client_conn_threshold: u64,

    #[clap(long, default_value_t = 16)]
    sql_over_http_cancel_set_shards: usize,

    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
    sql_over_http_max_request_size_bytes: usize,

    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
    sql_over_http_max_response_size_bytes: usize,
}

/// Run `local_proxy`: initialise logging/metrics, parse the CLI, claim the PID file,
/// bind the metrics and HTTP listeners, start the maintenance tasks (signal handling
/// and the health/metrics server) and the detached config-refresh loop, then serve
/// HTTP requests until either a maintenance task or the serving task finishes.
///
/// # Errors
/// Returns an error if initialisation, config building or binding a listener fails,
/// if a maintenance task fails, if no maintenance task is left running, or if the
/// serving task returns an error.
pub async fn run() -> anyhow::Result<()> {
    let _logging_guard = crate::logging::init_local_proxy()?;
    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);

    // TODO: refactor these to use labels
    debug!("Version: {GIT_VERSION}");
    debug!("Build_tag: {BUILD_TAG}");
    let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
        revision: GIT_VERSION,
        build_tag: BUILD_TAG,
    });

    // jemalloc metrics are optional: a failure is logged and startup continues.
    let jemalloc = match crate::jemalloc::MetricRecorder::new() {
        Ok(t) => Some(t),
        Err(e) => {
            tracing::error!(error = ?e, "could not start jemalloc metrics loop");
            None
        }
    };

    let args = LocalProxyCliArgs::parse();
    let config = build_config(&args)?;
    let auth_backend = build_auth_backend(&args);

    // before we bind to any ports, write the process ID to a file
    // so that compute-ctl can find our process later
    // in order to trigger the appropriate SIGHUP on config change.
    //
    // This also claims a "lock" that makes sure only one instance
    // of local_proxy runs at a time.
    // The claim is retried once per second, indefinitely, until it succeeds.
    let _process_guard = loop {
        match pid_file::claim_for_current_process(&args.pid_path) {
            Ok(guard) => break guard,
            Err(e) => {
                // compute-ctl might have tried to read the pid-file to let us
                // know about some config change. We should try again.
                error!(path=?args.pid_path, "could not claim PID file guard: {e:?}");
                tokio::time::sleep(Duration::from_secs(1)).await;
            }
        }
    };

    let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?;
    let http_listener = TcpListener::bind(args.http).await?;
    let shutdown = CancellationToken::new();

    // todo: should scale with CU
    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
        LeakyBucketConfig {
            rps: 10.0,
            max: 100.0,
        },
        16,
    ));

    let mut maintenance_tasks = JoinSet::new();

    let refresh_config_notify = Arc::new(Notify::new());
    // the callback handed to the signal handler wakes the config-refresh loop.
    maintenance_tasks.spawn(crate::signals::handle(shutdown.clone(), {
        let refresh_config_notify = Arc::clone(&refresh_config_notify);
        move || {
            refresh_config_notify.notify_one();
        }
    }));

    // trigger the first config load **after** setting up the signal hook
    // to avoid the race condition where:
    // 1. No config file registered when local_proxy starts up
    // 2. The config file is written but the signal hook is not yet received
    // 3. local_proxy completes startup but has no config loaded, despite there being a registered config.
    refresh_config_notify.notify_one();
    // note: spawned detached; it is not part of `maintenance_tasks`.
    tokio::spawn(refresh_config_loop(
        config,
        args.config_path,
        refresh_config_notify,
    ));

    maintenance_tasks.spawn(crate::http::health_server::task_main(
        metrics_listener,
        AppMetrics {
            jemalloc,
            neon_metrics,
            proxy: crate::metrics::Metrics::get(),
        },
    ));

    let task = serverless::task_main(
        config,
        auth_backend,
        http_listener,
        shutdown.clone(),
        Arc::new(CancellationHandler::new(&config.connect_to_compute)),
        endpoint_rate_limiter,
    );

    Metrics::get()
        .service
        .info
        .set_label(ServiceInfo::running());

    match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await {
        // exit immediately on maintenance task completion
        Either::Left((Some(res), _)) => match crate::error::flatten_err(res)? {},
        // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
        Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
        // exit immediately on client task error
        Either::Right((res, _)) => res?,
    }

    Ok(())
}

/// ProxyConfig is created at proxy startup, and lives forever.
fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let config::ConcurrencyLockOptions { shards, limiter, epoch, timeout, } = args.connect_compute_lock.parse()?; info!( ?limiter, shards, ?epoch, "Using NodeLocks (connect_compute)" ); let connect_compute_locks = ApiLocks::new( "connect_compute_lock", limiter, shards, timeout, epoch, &Metrics::get().proxy.connect_compute_lock, ); let http_config = HttpConfig { accept_websockets: false, pool_options: GlobalConnPoolOptions { gc_epoch: Duration::from_secs(60), pool_shards: 2, idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, opt_in: false, max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns, max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, }, cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, }; let compute_config = ComputeConfig { retry: RetryConfig::parse(RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)?, tls: Arc::new(compute_client_config_with_root_certs()?), timeout: Duration::from_secs(2), }; let greetings = env::var_os("NEON_MOTD").map_or(String::new(), |s| match s.into_string() { Ok(s) => s, Err(_) => { debug!("NEON_MOTD environment variable is not valid UTF-8"); String::new() } }); Ok(Box::leak(Box::new(ProxyConfig { tls_config: ArcSwapOption::from(None), metric_collection: None, http_config, authentication_config: AuthenticationConfig { jwks_cache: JwkCache::default(), scram_thread_pool: ThreadPool::new(0), scram_protocol_timeout: Duration::from_secs(10), ip_allowlist_check_enabled: true, is_vpc_acccess_proxy: false, is_auth_broker: false, accept_jwts: true, console_redirect_confirmation_timeout: Duration::ZERO, }, #[cfg(feature = "rest_broker")] 
rest_config: RestConfig { is_rest_broker: false, db_schema_cache: None, max_schema_size: 0, hostname_prefix: String::new(), }, proxy_protocol_v2: config::ProxyProtocolV2::Rejected, handshake_timeout: Duration::from_secs(10), wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, connect_compute_locks, connect_to_compute: compute_config, greetings, #[cfg(feature = "testing")] disable_pg_session_jwt: args.disable_pg_session_jwt, }))) } /// auth::Backend is created at proxy startup, and lives forever. fn build_auth_backend(args: &LocalProxyCliArgs) -> &'static auth::Backend<'static, ()> { let auth_backend = crate::auth::Backend::Local(crate::auth::backend::MaybeOwned::Owned( LocalBackend::new(args.postgres, args.compute_ctl.clone()), )); Box::leak(Box::new(auth_backend)) } ================================================ FILE: proxy/src/binary/mod.rs ================================================ //! All binaries have the body of their main() defined here, so that the code //! is also covered by code style configs in lib.rs and the unused-code check is //! more effective when practically all modules are private to the lib. pub mod local_proxy; pub mod pg_sni_router; pub mod proxy; ================================================ FILE: proxy/src/binary/pg_sni_router.rs ================================================ //! A stand-alone program that routes connections, e.g. from //! `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. //! //! This allows connecting to pods/services running in the same Kubernetes cluster from //! the outside. Similar to an ingress controller for HTTPS. 
use std::io; use std::net::SocketAddr; use std::path::Path; use std::sync::Arc; use anyhow::{Context, anyhow, bail, ensure}; use clap::Arg; use futures::future::Either; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use rustls::crypto::ring; use rustls::pki_types::{DnsName, PrivateKeyDer}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; use tokio::net::TcpListener; use tokio_rustls::TlsConnector; use tokio_rustls::server::TlsStream; use tokio_util::sync::CancellationToken; use tracing::{Instrument, error, info}; use utils::project_git_version; use utils::sentry_init::init_sentry; use crate::context::RequestContext; use crate::metrics::{Metrics, ServiceInfo}; use crate::pglb::TlsRequired; use crate::pqproto::FeStartupPacket; use crate::protocol2::ConnectionInfo; use crate::proxy::{ErrorSource, copy_bidirectional_client_compute}; use crate::stream::{PqStream, Stream}; use crate::util::run_until_cancelled; project_git_version!(GIT_VERSION); fn cli() -> clap::Command { clap::Command::new("Neon proxy/router") .version(GIT_VERSION) .arg( Arg::new("listen") .short('l') .long("listen") .help("listen for incoming client connections on ip:port") .default_value("127.0.0.1:4432"), ) .arg( Arg::new("listen-tls") .long("listen-tls") .help("listen for incoming client connections on ip:port, requiring TLS to compute") .default_value("127.0.0.1:4433"), ) .arg( Arg::new("tls-key") .short('k') .long("tls-key") .help("path to TLS key for client postgres connections") .required(true), ) .arg( Arg::new("tls-cert") .short('c') .long("tls-cert") .help("path to TLS cert for client postgres connections") .required(true), ) .arg( Arg::new("dest") .short('d') .long("destination") .help("append this domain zone to the SNI hostname to get the destination address") .required(true), ) } pub async fn run() -> anyhow::Result<()> { let _logging_guard = crate::logging::init()?; let _panic_hook_guard = 
utils::logging::replace_panic_hook_with_tracing_panic_hook(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); let args = cli().get_matches(); let destination: String = args .get_one::("dest") .expect("string argument defined") .parse()?; // Configure TLS let tls_config = match ( args.get_one::("tls-key"), args.get_one::("tls-cert"), ) { (Some(key_path), Some(cert_path)) => parse_tls(key_path.as_ref(), cert_path.as_ref())?, _ => bail!("tls-key and tls-cert must be specified"), }; let compute_tls_config = Arc::new(crate::tls::client_config::compute_client_config_with_root_certs()?); // Start listening for incoming client connections let proxy_address: SocketAddr = args .get_one::("listen") .expect("listen argument defined") .parse()?; let proxy_address_compute_tls: SocketAddr = args .get_one::("listen-tls") .expect("listen-tls argument defined") .parse()?; info!("Starting sni router on {proxy_address}"); info!("Starting sni router on {proxy_address_compute_tls}"); let proxy_listener = TcpListener::bind(proxy_address).await?; let proxy_listener_compute_tls = TcpListener::bind(proxy_address_compute_tls).await?; let cancellation_token = CancellationToken::new(); let dest = Arc::new(destination); let main = tokio::spawn(task_main( dest.clone(), tls_config.clone(), None, proxy_listener, cancellation_token.clone(), )) .map(crate::error::flatten_err); let main_tls = tokio::spawn(task_main( dest, tls_config, Some(compute_tls_config), proxy_listener_compute_tls, cancellation_token.clone(), )) .map(crate::error::flatten_err); Metrics::get() .service .info .set_label(ServiceInfo::running()); let signals_task = tokio::spawn(crate::signals::handle(cancellation_token, || {})); // the signal task cant ever succeed. // the main task can error, or can succeed on cancellation. 
// we want to immediately exit on either of these cases let main = futures::future::try_join(main, main_tls); let signal = match futures::future::select(signals_task, main).await { Either::Left((res, _)) => crate::error::flatten_err(res)?, Either::Right((res, _)) => { res?; return Ok(()); } }; // maintenance tasks return `Infallible` success values, this is an impossible value // so this match statically ensures that there are no possibilities for that value match signal {} } pub(super) fn parse_tls( key_path: &Path, cert_path: &Path, ) -> anyhow::Result> { let key = { let key_bytes = std::fs::read(key_path).context("TLS key file")?; let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); PrivateKeyDer::Pkcs8( keys.pop() .expect("keys should not be empty") .context(format!( "Failed to read TLS keys at '{}'", key_path.display() ))?, ) }; let cert_chain_bytes = std::fs::read(cert_path).context(format!( "Failed to read TLS cert file at '{}.'", cert_path.display() ))?; let cert_chain: Vec<_> = { rustls_pemfile::certs(&mut &cert_chain_bytes[..]) .try_collect() .with_context(|| { format!( "Failed to read TLS certificate chain from bytes from file at '{}'.", cert_path.display() ) })? }; let tls_config = rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) .context("ring should support TLS1.2 and TLS1.3")? .with_no_client_auth() .with_single_cert(cert_chain, key)? .into(); Ok(tls_config) } pub(super) async fn task_main( dest_suffix: Arc, tls_config: Arc, compute_tls_config: Option>, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, ) -> anyhow::Result<()> { // When set for the server socket, the keepalive setting // will be inherited by all accepted client sockets. 
socket2::SockRef::from(&listener).set_keepalive(true)?; let connections = tokio_util::task::task_tracker::TaskTracker::new(); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await { let (socket, peer_addr) = accept_result?; let session_id = uuid::Uuid::new_v4(); let tls_config = Arc::clone(&tls_config); let dest_suffix = Arc::clone(&dest_suffix); let compute_tls_config = compute_tls_config.clone(); connections.spawn( async move { socket .set_nodelay(true) .context("failed to set socket option")?; let ctx = RequestContext::new( session_id, ConnectionInfo { addr: peer_addr, extra: None, }, crate::metrics::Protocol::SniRouter, ); handle_client(ctx, dest_suffix, tls_config, compute_tls_config, socket).await } .unwrap_or_else(|e| { if let Some(FirstMessage(io_error)) = e.downcast_ref() { // this is noisy. if we get EOF on the very first message that's likely // just NLB doing a healthcheck. if io_error.kind() == io::ErrorKind::UnexpectedEof { return; } } // Acknowledge that the task has finished with an error. error!("per-client task finished with an error: {e:#}"); }) .instrument(tracing::info_span!("handle_client", ?session_id)), ); } connections.close(); drop(listener); connections.wait().await; info!("all client connections have finished"); Ok(()) } #[derive(Debug, thiserror::Error)] #[error(transparent)] struct FirstMessage(io::Error); async fn ssl_handshake( ctx: &RequestContext, raw_stream: S, tls_config: Arc, ) -> anyhow::Result> { let (mut stream, msg) = PqStream::parse_startup(Stream::from_raw(raw_stream)) .await .map_err(FirstMessage)?; match msg { FeStartupPacket::SslRequest { direct: None } => { let raw = stream.accept_tls().await?; Ok(raw .upgrade(tls_config, !ctx.has_private_peer_addr()) .await?) } unexpected => { info!( ?unexpected, "unexpected startup packet, rejecting connection" ); Err(stream.throw_error(TlsRequired, None).await)? 
} } } async fn handle_client( ctx: RequestContext, dest_suffix: Arc, tls_config: Arc, compute_tls_config: Option>, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { let mut tls_stream = ssl_handshake(&ctx, stream, tls_config).await?; // Cut off first part of the SNI domain // We receive required destination details in the format of // `{k8s_service_name}--{k8s_namespace}--{port}.non-sni-domain` let sni = tls_stream .get_ref() .1 .server_name() .ok_or(anyhow!("SNI missing"))?; let dest: Vec<&str> = sni .split_once('.') .context("invalid SNI")? .0 .splitn(3, "--") .collect(); let port = dest[2].parse::().context("invalid port")?; let destination = format!("{}.{}.{}:{}", dest[0], dest[1], dest_suffix, port); info!("destination: {}", destination); let mut client = tokio::net::TcpStream::connect(&destination).await?; let client = if let Some(compute_tls_config) = compute_tls_config { info!("upgrading TLS"); // send SslRequest client .write_all(b"\x00\x00\x00\x08\x04\xd2\x16\x2f") .await?; // wait for S/N respons let mut resp = b'N'; client.read_exact(std::slice::from_mut(&mut resp)).await?; // error if not S ensure!(resp == b'S', "compute refused TLS"); // upgrade to TLS. let domain = DnsName::try_from(destination)?; let domain = rustls::pki_types::ServerName::DnsName(domain); let client = TlsConnector::from(compute_tls_config) .connect(domain, client) .await?; Connection::Tls(client) } else { Connection::Raw(client) }; // doesn't yet matter as pg-sni-router doesn't report analytics logs ctx.set_success(); ctx.log_connect(); // Starting from here we only proxy the client's traffic. 
info!("performing the proxy pass..."); let res = match client { Connection::Raw(mut c) => copy_bidirectional_client_compute(&mut tls_stream, &mut c).await, Connection::Tls(mut c) => copy_bidirectional_client_compute(&mut tls_stream, &mut c).await, }; match res { Ok(_) => Ok(()), Err(ErrorSource::Client(err)) => Err(err).context("client"), Err(ErrorSource::Compute(err)) => Err(err).context("compute"), } } #[allow(clippy::large_enum_variant)] enum Connection { Raw(tokio::net::TcpStream), Tls(tokio_rustls::client::TlsStream), } ================================================ FILE: proxy/src/binary/proxy.rs ================================================ use std::env; use std::net::SocketAddr; use std::path::PathBuf; use std::pin::pin; use std::sync::Arc; use std::time::Duration; #[cfg(any(test, feature = "testing"))] use anyhow::Context; use anyhow::{bail, ensure}; use arc_swap::ArcSwapOption; #[cfg(any(test, feature = "testing"))] use camino::Utf8PathBuf; use futures::future::Either; use itertools::{Itertools, Position}; use rand::Rng; use remote_storage::RemoteStorageConfig; use tokio::net::TcpListener; #[cfg(any(test, feature = "testing"))] use tokio::sync::Notify; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, warn}; use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version}; use crate::auth::backend::jwt::JwkCache; #[cfg(any(test, feature = "testing"))] use crate::auth::backend::local::LocalBackend; use crate::auth::backend::{ConsoleRedirectBackend, MaybeOwned}; use crate::batch::BatchQueue; use crate::cancellation::{CancellationHandler, CancellationProcessor}; #[cfg(feature = "rest_broker")] use crate::config::RestConfig; #[cfg(any(test, feature = "testing"))] use crate::config::refresh_config_loop; use crate::config::{ self, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, remote_storage_from_toml, }; use 
crate::context::parquet::ParquetUploadArgs; use crate::http::health_server::AppMetrics; use crate::metrics::{Metrics, ServiceInfo}; use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::redis::kv_ops::RedisKVClient; use crate::redis::{elasticache, notifications}; use crate::scram::threadpool::ThreadPool; use crate::serverless::GlobalConnPoolOptions; use crate::serverless::cancel_set::CancelSet; #[cfg(feature = "rest_broker")] use crate::serverless::rest::DbSchemaCache; use crate::tls::client_config::compute_client_config_with_root_certs; #[cfg(any(test, feature = "testing"))] use crate::url::ApiUrl; use crate::{auth, control_plane, http, serverless, usage_metrics}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); use clap::{Parser, ValueEnum}; #[derive(Clone, Debug, ValueEnum)] #[clap(rename_all = "kebab-case")] enum AuthBackendType { #[clap(alias("cplane-v1"))] ControlPlane, #[clap(alias("link"))] ConsoleRedirect, #[cfg(any(test, feature = "testing"))] Postgres, #[cfg(any(test, feature = "testing"))] Local, } /// Neon proxy/router #[derive(Parser)] #[command(version = GIT_VERSION, about)] struct ProxyCliArgs { /// Name of the region this proxy is deployed in #[clap(long, default_value_t = String::new())] region: String, /// listen for incoming client connections on ip:port #[clap(short, long, default_value = "127.0.0.1:4432")] proxy: SocketAddr, #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)] auth_backend: AuthBackendType, /// Path of the local proxy config file (used for local-file auth backend) #[clap(long, default_value = "./local_proxy.json")] #[cfg(any(test, feature = "testing"))] config_path: Utf8PathBuf, /// listen for management callback connection on ip:port #[clap(short, long, default_value = "127.0.0.1:7000")] mgmt: SocketAddr, /// listen for incoming http connections 
(metrics, etc) on ip:port #[clap(long, default_value = "127.0.0.1:7001")] http: SocketAddr, /// listen for incoming wss connections on ip:port #[clap(long)] wss: Option, /// redirect unauthenticated users to the given uri in case of console redirect auth #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] uri: String, /// cloud API endpoint for authenticating users #[clap( short, long, default_value = "http://localhost:3000/authenticate_proxy_request/" )] auth_endpoint: String, /// JWT used to connect to control plane. #[clap( long, value_name = "JWT", default_value = "", env = "NEON_PROXY_TO_CONTROLPLANE_TOKEN" )] control_plane_token: Arc, /// if this is not local proxy, this toggles whether we accept jwt or passwords for http #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] is_auth_broker: bool, /// path to TLS key for client postgres connections /// /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir #[clap(short = 'k', long, alias = "ssl-key")] tls_key: Option, /// path to TLS cert for client postgres connections /// /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir #[clap(short = 'c', long, alias = "ssl-cert")] tls_cert: Option, /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`. #[clap(long, alias = "allow-ssl-keylogfile")] allow_tls_keylogfile: bool, /// path to directory with TLS certificates for client postgres connections #[clap(long)] certs_dir: Option, /// timeout for the TLS handshake #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] handshake_timeout: tokio::time::Duration, /// cache for `wake_compute` api method (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] wake_compute_cache: String, /// lock for `wake_compute` api method. 
example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] wake_compute_lock: String, /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] connect_compute_lock: String, #[clap(flatten)] sql_over_http: SqlOverHttpArgs, /// timeout for scram authentication protocol #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] scram_protocol_timeout: tokio::time::Duration, /// size of the threadpool for password hashing #[clap(long, default_value_t = 4)] scram_thread_pool_size: u8, /// Endpoint rate limiter max number of requests per second. /// /// Provided in the form `@`. /// Can be given multiple times for different bucket sizes. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] endpoint_rps_limit: Vec, /// Wake compute rate limiter max number of requests per second. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] wake_compute_limit: Vec, /// Cancellation channel size (max queue size for redis kv client) #[clap(long, default_value_t = 1024)] cancellation_ch_size: usize, /// Cancellation ops batch size for redis #[clap(long, default_value_t = 8)] cancellation_batch_size: usize, /// redis url for plain authentication #[clap(long, alias("redis-notifications"))] redis_plain: Option, /// what from the available authentications type to use for redis. Supported are "irsa" and "plain". 
#[clap(long, default_value = "irsa")] redis_auth_type: String, /// redis host for irsa authentication #[clap(long)] redis_host: Option, /// redis port for irsa authentication #[clap(long)] redis_port: Option, /// redis cluster name for irsa authentication #[clap(long)] redis_cluster_name: Option, /// redis user_id for irsa authentication #[clap(long)] redis_user_id: Option, /// aws region for irsa authentication #[clap(long, default_value_t = String::new())] aws_region: String, /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, /// cache for all valid endpoints // TODO: remove after a couple of releases. #[clap(long, default_value_t = String::new())] #[deprecated] endpoint_cache_config: String, #[clap(flatten)] parquet_upload: ParquetUploadArgs, /// http endpoint to receive periodic metric updates #[clap(long)] metric_collection_endpoint: Option, /// how often metrics should be sent to a collection endpoint #[clap(long)] metric_collection_interval: Option, /// interval for backup metric collection #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] metric_backup_collection_interval: std::time::Duration, /// remote storage configuration for backup metric collection /// Encoded as toml (same format as pageservers), eg /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` #[clap(long, value_parser = remote_storage_from_toml)] metric_backup_collection_remote_storage: Option, /// chunk size for backup metric collection /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. 
#[clap(long, default_value = "4194304")] metric_backup_collection_chunk_size: usize, /// Whether to retry the connection to the compute node #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] connect_to_compute_retry: String, /// Whether to retry the wake_compute request #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] wake_compute_retry: String, /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] is_private_access_proxy: bool, /// Configure whether all incoming requests have a Proxy Protocol V2 packet. #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Rejected)] proxy_protocol_v2: ProxyProtocolV2, /// Time the proxy waits for the webauth session to be confirmed by the control plane. // TODO: rename to `console_redirect_confirmation_timeout`. 
#[clap(long, default_value = "2m", value_parser = humantime::parse_duration)] webauth_confirmation_timeout: std::time::Duration, #[clap(flatten)] pg_sni_router: PgSniRouterArgs, /// if this is not local proxy, this toggles whether we accept Postgres REST requests #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] #[cfg(feature = "rest_broker")] is_rest_broker: bool, /// cache for `db_schema_cache` introspection (use `size=0` to disable) #[clap(long, default_value = "size=1000,ttl=1h")] #[cfg(feature = "rest_broker")] db_schema_cache: String, /// Maximum size allowed for schema in bytes #[clap(long, default_value_t = 5 * 1024 * 1024)] // 5MB #[cfg(feature = "rest_broker")] max_schema_size: usize, /// Hostname prefix to strip from request hostname to get database hostname #[clap(long, default_value = "apirest.")] #[cfg(feature = "rest_broker")] hostname_prefix: String, } #[derive(clap::Args, Clone, Copy, Debug)] struct SqlOverHttpArgs { /// timeout for http connection requests #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] sql_over_http_timeout: tokio::time::Duration, /// Whether the SQL over http pool is opt-in #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] sql_over_http_pool_opt_in: bool, /// How many connections to pool for each endpoint. Excess connections are discarded #[clap(long, default_value_t = 20)] sql_over_http_pool_max_conns_per_endpoint: usize, /// How many connections to pool for each endpoint. Excess connections are discarded #[clap(long, default_value_t = 20000)] sql_over_http_pool_max_total_conns: usize, /// How long pooled connections should remain idle for before closing #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] sql_over_http_idle_timeout: tokio::time::Duration, /// Duration each shard will wait on average before a GC sweep. 
/// A longer time will cause sweeps to take longer but will interfere less frequently.
    #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
    sql_over_http_pool_gc_epoch: tokio::time::Duration,

    /// How many shards should the global pool have. Must be a power of two.
    /// More shards will introduce less contention for pool operations, but can
    /// increase memory used by the pool
    #[clap(long, default_value_t = 128)]
    sql_over_http_pool_shards: usize,

    #[clap(long, default_value_t = 10000)]
    sql_over_http_client_conn_threshold: u64,

    #[clap(long, default_value_t = 64)]
    sql_over_http_cancel_set_shards: usize,

    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
    sql_over_http_max_request_size_bytes: usize,

    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
    sql_over_http_max_response_size_bytes: usize,
}

#[derive(clap::Args, Clone, Debug)]
struct PgSniRouterArgs {
    /// listen for incoming client connections on ip:port
    #[clap(id = "sni-router-listen", long, default_value = "127.0.0.1:4432")]
    listen: SocketAddr,
    /// listen for incoming client connections on ip:port, requiring TLS to compute
    #[clap(id = "sni-router-listen-tls", long, default_value = "127.0.0.1:4433")]
    listen_tls: SocketAddr,
    /// path to TLS key for client postgres connections
    #[clap(id = "sni-router-tls-key", long)]
    tls_key: Option<std::path::PathBuf>,
    /// path to TLS cert for client postgres connections
    #[clap(id = "sni-router-tls-cert", long)]
    tls_cert: Option<std::path::PathBuf>,
    /// append this domain zone to the SNI hostname to get the destination address
    #[clap(id = "sni-router-destination", long)]
    dest: Option<String>,
}

/// Entry point of the proxy binary.
///
/// Initialises logging and metrics, parses the CLI arguments, binds every
/// listener up front, then spawns two groups of tasks:
///
/// * client-facing tasks, which end on error or on cancellation;
/// * maintenance tasks, which are not expected to return at all.
///
/// Returns `Ok(())` once all client tasks have shut down gracefully, and an
/// error as soon as any task fails.
pub async fn run() -> anyhow::Result<()> {
    let _logging_guard = crate::logging::init()?;
    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);

    // TODO: refactor these to use labels
    info!("Version: {GIT_VERSION}");
    info!("Build_tag: {BUILD_TAG}");
    let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
        revision: GIT_VERSION,
        build_tag: BUILD_TAG,
    });

    // jemalloc metrics are best-effort: a failure is logged and the proxy carries on.
    let jemalloc = match crate::jemalloc::MetricRecorder::new() {
        Ok(t) => Some(t),
        Err(e) => {
            error!(error = ?e, "could not start jemalloc metrics loop");
            None
        }
    };

    let args = ProxyCliArgs::parse();
    let config = build_config(&args)?;
    let auth_backend = build_auth_backend(&args)?;

    match auth_backend {
        Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"),
        Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"),
    }
    info!("Using region: {}", args.aws_region);
    let redis_client = configure_redis(&args).await?;

    // Check that we can bind to address before further initialization
    info!("Starting http on {}", args.http);
    let http_listener = TcpListener::bind(args.http).await?.into_std()?;

    info!("Starting mgmt on {}", args.mgmt);
    let mgmt_listener = TcpListener::bind(args.mgmt).await?;

    // The auth broker does not open the plain proxy listener.
    let proxy_listener = if args.is_auth_broker {
        None
    } else {
        info!("Starting proxy on {}", args.proxy);
        Some(TcpListener::bind(args.proxy).await?)
    };

    // pg-sni-router is enabled by setting a destination; it then also needs a TLS key and cert.
    let sni_router_listeners = {
        let args = &args.pg_sni_router;
        if args.dest.is_some() {
            ensure!(
                args.tls_key.is_some(),
                "sni-router-tls-key must be provided"
            );
            ensure!(
                args.tls_cert.is_some(),
                "sni-router-tls-cert must be provided"
            );

            info!(
                "Starting pg-sni-router on {} and {}",
                args.listen, args.listen_tls
            );

            Some((
                TcpListener::bind(args.listen).await?,
                TcpListener::bind(args.listen_tls).await?,
            ))
        } else {
            None
        }
    };

    // TODO: rename the argument to something like serverless.
    // It now covers more than just websockets, it also covers SQL over HTTP.
    let serverless_listener = if let Some(serverless_address) = args.wss {
        info!("Starting wss on {serverless_address}");
        Some(TcpListener::bind(serverless_address).await?)
    } else if args.is_auth_broker {
        bail!("wss arg must be present for auth-broker")
    } else {
        None
    };

    let cancellation_token = CancellationToken::new();

    let cancellation_handler = Arc::new(CancellationHandler::new(&config.connect_to_compute));

    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
        RateBucketInfo::to_leaky_bucket(&args.endpoint_rps_limit)
            .unwrap_or(EndpointRateLimiter::DEFAULT),
        64,
    ));

    #[cfg(any(test, feature = "testing"))]
    let refresh_config_notify = Arc::new(Notify::new());

    // client facing tasks. these will exit on error or on cancellation
    // cancellation returns Ok(())
    let mut client_tasks = JoinSet::new();
    match auth_backend {
        Either::Left(auth_backend) => {
            if let Some(proxy_listener) = proxy_listener {
                client_tasks.spawn(crate::pglb::task_main(
                    config,
                    auth_backend,
                    proxy_listener,
                    cancellation_token.clone(),
                    cancellation_handler.clone(),
                    endpoint_rate_limiter.clone(),
                ));
            }

            if let Some(serverless_listener) = serverless_listener {
                client_tasks.spawn(serverless::task_main(
                    config,
                    auth_backend,
                    serverless_listener,
                    cancellation_token.clone(),
                    cancellation_handler.clone(),
                    endpoint_rate_limiter.clone(),
                ));
            }

            // if auth backend is local, we need to load the config file
            #[cfg(any(test, feature = "testing"))]
            if let auth::Backend::Local(_) = &auth_backend {
                refresh_config_notify.notify_one();
                tokio::spawn(refresh_config_loop(
                    config,
                    args.config_path,
                    refresh_config_notify.clone(),
                ));
            }
        }
        Either::Right(auth_backend) => {
            if let Some(proxy_listener) = proxy_listener {
                client_tasks.spawn(crate::console_redirect_proxy::task_main(
                    config,
                    auth_backend,
                    proxy_listener,
                    cancellation_token.clone(),
                    cancellation_handler.clone(),
                ));
            }
        }
    }

    // spawn pg-sni-router mode.
    if let Some((listen, listen_tls)) = sni_router_listeners {
        let args = args.pg_sni_router;
        let dest = args.dest.expect("already asserted it is set");
        let key_path = args.tls_key.expect("already asserted it is set");
        let cert_path = args.tls_cert.expect("already asserted it is set");

        let tls_config = super::pg_sni_router::parse_tls(&key_path, &cert_path)?;

        let dest = Arc::new(dest);

        // Plain listener: no compute TLS config is passed.
        client_tasks.spawn(super::pg_sni_router::task_main(
            dest.clone(),
            tls_config.clone(),
            None,
            listen,
            cancellation_token.clone(),
        ));

        // TLS listener: the compute TLS config is passed along.
        client_tasks.spawn(super::pg_sni_router::task_main(
            dest,
            tls_config,
            Some(config.connect_to_compute.tls.clone()),
            listen_tls,
            cancellation_token.clone(),
        ));
    }

    client_tasks.spawn(crate::context::parquet::worker(
        cancellation_token.clone(),
        args.parquet_upload,
        args.region,
    ));

    // maintenance tasks. these never return unless there's an error
    let mut maintenance_tasks = JoinSet::new();

    maintenance_tasks.spawn(crate::signals::handle(cancellation_token.clone(), {
        move || {
            #[cfg(any(test, feature = "testing"))]
            refresh_config_notify.notify_one();
        }
    }));
    maintenance_tasks.spawn(http::health_server::task_main(
        http_listener,
        AppMetrics {
            jemalloc,
            neon_metrics,
            proxy: crate::metrics::Metrics::get(),
        },
    ));
    maintenance_tasks.spawn(control_plane::mgmt::task_main(mgmt_listener));

    // add a task to flush the db_schema cache every 10 minutes
    #[cfg(feature = "rest_broker")]
    if let Some(db_schema_cache) = &config.rest_config.db_schema_cache {
        maintenance_tasks.spawn(db_schema_cache.maintain());
    }

    if let Some(metrics_config) = &config.metric_collection {
        // TODO: Add gc regardless of the metric collection being enabled.
        maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
    }

    if let Some(client) = redis_client {
        // Try to connect to Redis 3 times with 1 + (0..0.1) second interval.
        // This prevents immediate exit and pod restart,
        // which can cause hammering of the redis in case of connection issues.
        // cancellation key management
        let mut redis_kv_client = RedisKVClient::new(client.clone());
        for attempt in (0..3).with_position() {
            match redis_kv_client.try_connect().await {
                Ok(()) => {
                    info!("Connected to Redis KV client");
                    cancellation_handler.init_tx(BatchQueue::new(CancellationProcessor {
                        client: redis_kv_client,
                        batch_size: args.cancellation_batch_size,
                    }));

                    break;
                }
                Err(e) => {
                    error!("Failed to connect to Redis KV client: {e}");
                    if matches!(attempt, Position::Last(_)) {
                        bail!(
                            "Failed to connect to Redis KV client after {} attempts",
                            // `attempt` wraps the zero-based loop index, so add one
                            // to report how many attempts were actually made.
                            attempt.into_inner() + 1
                        );
                    }
                    let jitter = rand::rng().random_range(0..100);
                    tokio::time::sleep(Duration::from_millis(1000 + jitter)).await;
                }
            }
        }

        #[allow(irrefutable_let_patterns)]
        if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend
            && let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api
        {
            // project info cache and invalidation of that cache.
            let cache = api.caches.project_info.clone();
            maintenance_tasks.spawn(notifications::task_main(client, cache.clone()));
            maintenance_tasks.spawn(async move { cache.gc_worker().await });
        }
    }

    Metrics::get()
        .service
        .info
        .set_label(ServiceInfo::running());

    let maintenance = loop {
        // get one complete task
        match futures::future::select(
            pin!(maintenance_tasks.join_next()),
            pin!(client_tasks.join_next()),
        )
        .await
        {
            // exit immediately on maintenance task completion
            Either::Left((Some(res), _)) => break crate::error::flatten_err(res)?,
            // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
            Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
            // exit immediately on client task error
            Either::Right((Some(res), _)) => crate::error::flatten_err(res)?,
            // exit if all our client tasks have shutdown gracefully
            Either::Right((None, _)) => return Ok(()),
        }
    };

    // maintenance tasks return Infallible success values, this is an impossible value
    // so this match statically ensures that there are no possibilities for that value
    match maintenance {}
}

/// ProxyConfig is created at proxy startup, and lives forever.
fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    let thread_pool = ThreadPool::new(args.scram_thread_pool_size);
    Metrics::get()
        .proxy
        .scram_pool
        .0
        .set(thread_pool.metrics.clone())
        .ok();

    // TLS is configured only when both the key and the certificate are given.
    let tls_config = match (&args.tls_key, &args.tls_cert) {
        (Some(key_path), Some(cert_path)) => Some(config::configure_tls(
            key_path,
            cert_path,
            args.certs_dir.as_deref(),
            args.allow_tls_keylogfile,
        )?),
        (None, None) => None,
        _ => bail!("either both or neither tls-key and tls-cert must be specified"),
    };
    let tls_config = ArcSwapOption::from(tls_config.map(Arc::new));

    let backup_metric_collection_config = config::MetricBackupCollectionConfig {
        remote_storage_config: args.metric_backup_collection_remote_storage.clone(),
        chunk_size: args.metric_backup_collection_chunk_size,
    };

    let metric_collection = match (
        &args.metric_collection_endpoint,
        &args.metric_collection_interval,
    ) {
        (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig {
            endpoint: endpoint.parse()?,
            interval: humantime::parse_duration(interval)?,
            backup_metric_collection_config,
        }),
        (None, None) => None,
        _ => bail!(
            "either both or neither metric-collection-endpoint \
             and metric-collection-interval must be specified"
        ),
    };

    let config::ConcurrencyLockOptions {
        shards,
        limiter,
        epoch,
        timeout,
    } = args.connect_compute_lock.parse()?;
    info!(
        ?limiter,
        shards,
        ?epoch,
        "Using NodeLocks (connect_compute)"
    );
    let connect_compute_locks = control_plane::locks::ApiLocks::new(
        "connect_compute_lock",
        limiter,
shards,
        timeout,
        epoch,
        &Metrics::get().proxy.connect_compute_lock,
    );

    // HTTP / SQL-over-HTTP settings, copied field by field from the CLI arguments.
    let http_config = HttpConfig {
        // The auth broker does not accept websocket connections.
        accept_websockets: !args.is_auth_broker,
        pool_options: GlobalConnPoolOptions {
            max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
            gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
            pool_shards: args.sql_over_http.sql_over_http_pool_shards,
            idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
            opt_in: args.sql_over_http.sql_over_http_pool_opt_in,
            max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns,
        },
        cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
        client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
        max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes,
        max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes,
    };
    let authentication_config = AuthenticationConfig {
        jwks_cache: JwkCache::default(),
        scram_thread_pool: thread_pool,
        scram_protocol_timeout: args.scram_protocol_timeout,
        // A private access proxy skips the IP allowlist check.
        ip_allowlist_check_enabled: !args.is_private_access_proxy,
        is_vpc_acccess_proxy: args.is_private_access_proxy,
        is_auth_broker: args.is_auth_broker,
        // JWTs are accepted by the auth broker and, when the `rest_broker`
        // feature is compiled in, also by the REST broker.
        #[cfg(not(feature = "rest_broker"))]
        accept_jwts: args.is_auth_broker,
        #[cfg(feature = "rest_broker")]
        accept_jwts: args.is_auth_broker || args.is_rest_broker,
        console_redirect_confirmation_timeout: args.webauth_confirmation_timeout,
    };

    let compute_config = ComputeConfig {
        retry: config::RetryConfig::parse(&args.connect_to_compute_retry)?,
        tls: Arc::new(compute_client_config_with_root_certs()?),
        // The timeout is fixed at two seconds here; it is not a CLI argument.
        timeout: Duration::from_secs(2),
    };

    #[cfg(feature = "rest_broker")]
    let rest_config = {
        let db_schema_cache_config: CacheOptions = args.db_schema_cache.parse()?;
        info!("Using DbSchemaCache with options={db_schema_cache_config:?}");

        // The schema cache is only created when this proxy acts as a REST broker.
        let db_schema_cache = if args.is_rest_broker {
            Some(DbSchemaCache::new(db_schema_cache_config))
        } else {
            None
        };

        RestConfig {
            is_rest_broker: args.is_rest_broker,
            db_schema_cache,
            max_schema_size: args.max_schema_size,
            hostname_prefix: args.hostname_prefix.clone(),
        }
    };

    // The greeting comes from the NEON_MOTD environment variable; it is empty
    // when the variable is unset or is not valid UTF-8.
    let mut greetings = env::var_os("NEON_MOTD").map_or(String::new(), |s| match s.into_string() {
        Ok(s) => s,
        Err(_) => {
            debug!("NEON_MOTD environment variable is not valid UTF-8");
            String::new()
        }
    });

    // Only the console-redirect backend overrides the greeting.
    match &args.auth_backend {
        AuthBackendType::ControlPlane => {}
        #[cfg(any(test, feature = "testing"))]
        AuthBackendType::Postgres => {}
        #[cfg(any(test, feature = "testing"))]
        AuthBackendType::Local => {}
        AuthBackendType::ConsoleRedirect => {
            greetings = "Connected to database".to_string();
        }
    }

    let config = ProxyConfig {
        tls_config,
        metric_collection,
        http_config,
        authentication_config,
        proxy_protocol_v2: args.proxy_protocol_v2,
        handshake_timeout: args.handshake_timeout,
        wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
        connect_compute_locks,
        connect_to_compute: compute_config,
        greetings,
        #[cfg(feature = "testing")]
        disable_pg_session_jwt: false,
        #[cfg(feature = "rest_broker")]
        rest_config,
    };

    // Leak the config on purpose so it can be handed out as `&'static`.
    let config = Box::leak(Box::new(config));

    tokio::spawn(config.connect_compute_locks.garbage_collect_worker());

    Ok(config)
}

/// auth::Backend is created at proxy startup, and lives forever.
fn build_auth_backend( args: &ProxyCliArgs, ) -> anyhow::Result, &'static ConsoleRedirectBackend>> { match &args.auth_backend { AuthBackendType::ControlPlane => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, ))); let config::ConcurrencyLockOptions { shards, limiter, epoch, timeout, } = args.wake_compute_lock.parse()?; info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( "wake_compute_lock", limiter, shards, timeout, epoch, &Metrics::get().wake_compute_lock, ))); tokio::spawn(locks.garbage_collect_worker()); let url: crate::url::ApiUrl = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client()); let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); RateBucketInfo::validate(&mut wake_compute_rps_limit)?; let wake_compute_endpoint_rate_limiter = Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( endpoint, args.control_plane_token.clone(), caches, locks, wake_compute_endpoint_rate_limiter, ); let api = control_plane::client::ControlPlaneClient::ProxyV1(api); let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); let config = Box::leak(Box::new(auth_backend)); Ok(Either::Left(config)) } #[cfg(any(test, feature = "testing"))] AuthBackendType::Postgres => { let mut url: ApiUrl = args.auth_endpoint.parse()?; if url.password().is_none() { let password = env::var("PGPASSWORD") .with_context(|| "auth-endpoint does not 
contain a password and environment variable `PGPASSWORD` is not set")?; url.set_password(Some(&password)) .expect("Failed to set password"); } let api = control_plane::client::mock::MockControlPlane::new( url, !args.is_private_access_proxy, ); let api = control_plane::client::ControlPlaneClient::PostgresMock(api); let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); let config = Box::leak(Box::new(auth_backend)); Ok(Either::Left(config)) } AuthBackendType::ConsoleRedirect => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, ))); let config::ConcurrencyLockOptions { shards, limiter, epoch, timeout, } = args.wake_compute_lock.parse()?; info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( "wake_compute_lock", limiter, shards, timeout, epoch, &Metrics::get().wake_compute_lock, ))); let url = args.uri.clone().parse()?; let ep_url: crate::url::ApiUrl = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(ep_url, http::new_client()); let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); RateBucketInfo::validate(&mut wake_compute_rps_limit)?; let wake_compute_endpoint_rate_limiter = Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); // Since we use only get_allowed_ips_and_secret() wake_compute_endpoint_rate_limiter // and locks are not used in ConsoleRedirectBackend, // but they are required by the NeonControlPlaneClient let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( endpoint, 
args.control_plane_token.clone(), caches, locks, wake_compute_endpoint_rate_limiter, ); let backend = ConsoleRedirectBackend::new(url, api); let config = Box::leak(Box::new(backend)); Ok(Either::Right(config)) } #[cfg(any(test, feature = "testing"))] AuthBackendType::Local => { let postgres: SocketAddr = "127.0.0.1:7432".parse()?; let compute_ctl: ApiUrl = "http://127.0.0.1:3081/".parse()?; let auth_backend = crate::auth::Backend::Local( crate::auth::backend::MaybeOwned::Owned(LocalBackend::new(postgres, compute_ctl)), ); let config = Box::leak(Box::new(auth_backend)); Ok(Either::Left(config)) } } } async fn configure_redis( args: &ProxyCliArgs, ) -> anyhow::Result> { // TODO: untangle the config args let redis_client = match &*args.redis_auth_type { "plain" => match &args.redis_plain { None => { bail!("plain auth requires redis_plain to be set"); } Some(url) => { Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone())) } }, "irsa" => match (&args.redis_host, args.redis_port) { (Some(host), Some(port)) => Some( ConnectionWithCredentialsProvider::new_with_credentials_provider( host.clone(), port, elasticache::CredentialsProvider::new( args.aws_region.clone(), args.redis_cluster_name.clone(), args.redis_user_id.clone(), ) .await, ), ), (None, None) => { // todo: upgrade to error? 
warn!(
                    "irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"
                );
                None
            }
            _ => {
                bail!("redis-host and redis-port must be specified together");
            }
        },
        auth_type => {
            bail!("unknown auth type {auth_type:?} given")
        }
    };

    Ok(redis_client)
}

#[cfg(test)]
mod tests {
    use std::time::Duration;

    use clap::Parser;

    use crate::rate_limiter::RateBucketInfo;

    #[test]
    fn parse_endpoint_rps_limit() {
        // Repeating the flag accumulates the values into a list.
        let config = super::ProxyCliArgs::parse_from([
            "proxy",
            "--endpoint-rps-limit",
            "100@1s",
            "--endpoint-rps-limit",
            "20@30s",
        ]);

        assert_eq!(
            config.endpoint_rps_limit,
            vec![
                RateBucketInfo::new(100, Duration::from_secs(1)),
                RateBucketInfo::new(20, Duration::from_secs(30)),
            ]
        );
    }
}


================================================
FILE: proxy/src/cache/common.rs
================================================
use std::ops::{Deref, DerefMut};
use std::time::{Duration, Instant};

use moka::Expiry;
use moka::notification::RemovalCause;

use crate::control_plane::messages::ControlPlaneErrorMessage;
use crate::metrics::{
    CacheEviction, CacheKind, CacheOutcome, CacheOutcomeGroup, CacheRemovalCause, Metrics,
};

/// Default TTL used when caching errors from control plane.
pub const DEFAULT_ERROR_TTL: Duration = Duration::from_secs(30);

/// A generic trait which exposes types of cache's key and value,
/// as well as the notion of cache entry invalidation.
/// This is useful for [`Cached`].
pub(crate) trait Cache {
    /// Entry's key.
    type Key;

    /// Entry's value.
    type Value;

    /// Invalidate an entry using a lookup info.
    /// We don't have an empty default impl because it's error-prone.
    fn invalidate(&self, _: &Self::Key);
}

/// A shared reference to a cache is itself a cache; calls are forwarded.
impl<C: Cache> Cache for &C {
    type Key = C::Key;
    type Value = C::Value;

    fn invalidate(&self, info: &Self::Key) {
        C::invalidate(self, info);
    }
}

/// Wrapper for convenient entry invalidation.
pub(crate) struct Cached<C: Cache, V = <C as Cache>::Value> {
    /// Cache + lookup info.
    pub(crate) token: Option<(C, C::Key)>,

    /// The value itself.
    pub(crate) value: V,
}

impl<C: Cache, V> Cached<C, V> {
    /// Place any entry into this wrapper; invalidation will be a no-op.
    pub(crate) fn new_uncached(value: V) -> Self {
        Self { token: None, value }
    }

    /// Drop this entry from a cache if it's still there.
    pub(crate) fn invalidate(self) -> V {
        if let Some((cache, info)) = &self.token {
            cache.invalidate(info);
        }
        self.value
    }

    /// Tell if this entry is actually cached.
    pub(crate) fn cached(&self) -> bool {
        self.token.is_some()
    }
}

impl<C: Cache, V> Deref for Cached<C, V> {
    type Target = V;

    fn deref(&self) -> &Self::Target {
        &self.value
    }
}

impl<C: Cache, V> DerefMut for Cached<C, V> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.value
    }
}

/// Result of a control plane request: a value, or the boxed error message.
pub type ControlPlaneResult<T> = Result<T, Box<ControlPlaneErrorMessage>>;

/// Expiry policy that gives cached control plane errors their own lifetime.
#[derive(Clone, Copy)]
pub struct CplaneExpiry {
    /// TTL applied to cached errors that carry no retry information.
    pub error: Duration,
}

impl Default for CplaneExpiry {
    fn default() -> Self {
        Self {
            error: DEFAULT_ERROR_TTL,
        }
    }
}

impl CplaneExpiry {
    /// Returns an early expiry for `Err` entries and `None` for `Ok` entries,
    /// which leaves their lifetime to the cache's other settings.
    pub fn expire_early<V>(
        &self,
        value: &ControlPlaneResult<V>,
        updated: Instant,
    ) -> Option<Duration> {
        match value {
            Ok(_) => None,
            Err(err) => Some(self.expire_err_early(err, updated)),
        }
    }

    /// Lifetime of a cached error: the time left until its `retry_at`, if it
    /// has one, otherwise `self.error`.
    pub fn expire_err_early(&self, err: &ControlPlaneErrorMessage, updated: Instant) -> Duration {
        err.status
            .as_ref()
            .and_then(|s| s.details.retry_info.as_ref())
            // A `retry_at` that is already in the past yields a zero lifetime;
            // the saturating form makes that explicit instead of relying on
            // the behaviour of `Instant - Instant`.
            .map_or(self.error, |r| {
                r.retry_at.into_std().saturating_duration_since(updated)
            })
    }
}

impl<K, V> Expiry<K, ControlPlaneResult<V>> for CplaneExpiry {
    fn expire_after_create(
        &self,
        _key: &K,
        value: &ControlPlaneResult<V>,
        created_at: Instant,
    ) -> Option<Duration> {
        self.expire_early(value, created_at)
    }

    fn expire_after_update(
        &self,
        _key: &K,
        value: &ControlPlaneResult<V>,
        updated_at: Instant,
        _duration_until_expiry: Option<Duration>,
    ) -> Option<Duration> {
        self.expire_early(value, updated_at)
    }
}

/// Record a cache eviction in the metrics, translating moka's removal cause.
pub fn eviction_listener(kind: CacheKind, cause: RemovalCause) {
    let cause = match cause {
        RemovalCause::Expired => CacheRemovalCause::Expired,
        RemovalCause::Explicit => CacheRemovalCause::Explicit,
        RemovalCause::Replaced => CacheRemovalCause::Replaced,
        RemovalCause::Size => CacheRemovalCause::Size,
    };
    Metrics::get()
        .cache
        .evicted_total
        .inc(CacheEviction { cache: kind, cause });
}

/// Count a lookup as a hit or a miss and pass the lookup result through unchanged.
#[inline]
pub fn count_cache_outcome<T>(kind: CacheKind, cache_result: Option<T>) -> Option<T> {
    let outcome = if cache_result.is_some() {
        CacheOutcome::Hit
    } else {
        CacheOutcome::Miss
    };
    Metrics::get().cache.request_total.inc(CacheOutcomeGroup {
        cache: kind,
        outcome,
    });
    cache_result
}

/// Count one insertion into the given cache.
#[inline]
pub fn count_cache_insert(kind: CacheKind) {
    Metrics::get().cache.inserted_total.inc(kind);
}


================================================
FILE: proxy/src/cache/mod.rs
================================================
pub(crate) mod common;
pub(crate) mod node_info;
pub(crate) mod project_info;

pub(crate) use common::{Cached, ControlPlaneResult, CplaneExpiry};


================================================
FILE: proxy/src/cache/node_info.rs
================================================
use crate::cache::common::{Cache, count_cache_insert, count_cache_outcome, eviction_listener};
use crate::cache::{Cached, ControlPlaneResult, CplaneExpiry};
use crate::config::CacheOptions;
use crate::control_plane::NodeInfo;
use crate::metrics::{CacheKind, Metrics};
use crate::types::EndpointCacheKey;

/// Cache of `wake_compute` results (node info or a control plane error) per endpoint.
pub(crate) struct NodeInfoCache(moka::sync::Cache<EndpointCacheKey, ControlPlaneResult<NodeInfo>>);
pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>;

impl Cache for NodeInfoCache {
    type Key = EndpointCacheKey;
    type Value = ControlPlaneResult<NodeInfo>;

    fn invalidate(&self, info: &EndpointCacheKey) {
        self.0.invalidate(info);
    }
}

impl NodeInfoCache {
    /// Build the cache from `config` and publish its capacity metric when a size is set.
    pub fn new(config: CacheOptions) -> Self {
        let builder = moka::sync::Cache::builder()
            .name("node_info")
            .expire_after(CplaneExpiry::default());
        let builder = config.moka(builder);

        if let Some(size) = config.size {
            Metrics::get()
                .cache
                .capacity
                .set(CacheKind::NodeInfo, size as i64);
        }

        let builder = builder
            .eviction_listener(|_k, _v, cause| eviction_listener(CacheKind::NodeInfo, cause));

        Self(builder.build())
    }

    /// Insert an entry, counting the insertion in the metrics.
    pub fn insert(&self, key: EndpointCacheKey, value: ControlPlaneResult<NodeInfo>) {
        count_cache_insert(CacheKind::NodeInfo);
        self.0.insert(key, value);
    }

    /// Look up an entry, counting the hit or miss in the metrics.
    pub fn get(&self, key: &EndpointCacheKey) -> Option<ControlPlaneResult<NodeInfo>> {
        count_cache_outcome(CacheKind::NodeInfo, self.0.get(key))
    }

    /// Like [`Self::get`], but wraps a successful value in [`Cached`] so the
    /// caller can invalidate the entry later.
    pub fn get_entry(
        &'static self,
        key: &EndpointCacheKey,
    ) -> Option<ControlPlaneResult<CachedNodeInfo>> {
        self.get(key).map(|res| {
            res.map(|value| Cached {
                token: Some((self, key.clone())),
                value,
            })
        })
    }
}


================================================
FILE: proxy/src/cache/project_info.rs
================================================
use std::collections::HashSet;
use std::convert::Infallible;

use clashmap::ClashMap;
use moka::sync::Cache;
use tracing::{debug, info};

use crate::cache::common::{
    ControlPlaneResult, CplaneExpiry, count_cache_insert, count_cache_outcome, eviction_listener,
};
use crate::config::ProjectInfoCacheOptions;
use crate::control_plane::messages::{ControlPlaneErrorMessage, Reason};
use crate::control_plane::{EndpointAccessControl, RoleAccessControl};
use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt};
use crate::metrics::{CacheKind, Metrics};
use crate::types::{EndpointId, RoleName};

/// Cache for project info.
/// This is used to cache auth data for endpoints.
/// Invalidation is done by console notifications or by TTL (if console notifications are disabled).
///
/// We also store endpoint-to-project mapping in the cache, to be able to access per-endpoint data.
/// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available?
/// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache.
pub struct ProjectInfoCache {
    role_controls: Cache<(EndpointIdInt, RoleNameInt), ControlPlaneResult<RoleAccessControl>>,
    ep_controls: Cache<EndpointIdInt, ControlPlaneResult<EndpointAccessControl>>,

    project2ep: ClashMap<ProjectIdInt, HashSet<EndpointIdInt>>,
    // FIXME(stefan): we need a way to GC the account2ep map.
account2ep: ClashMap>, config: ProjectInfoCacheOptions, } impl ProjectInfoCache { pub fn invalidate_endpoint_access(&self, endpoint_id: EndpointIdInt) { info!("invalidating endpoint access for `{endpoint_id}`"); self.ep_controls.invalidate(&endpoint_id); } pub fn invalidate_endpoint_access_for_project(&self, project_id: ProjectIdInt) { info!("invalidating endpoint access for project `{project_id}`"); let endpoints = self .project2ep .get(&project_id) .map(|kv| kv.value().clone()) .unwrap_or_default(); for endpoint_id in endpoints { self.ep_controls.invalidate(&endpoint_id); } } pub fn invalidate_endpoint_access_for_org(&self, account_id: AccountIdInt) { info!("invalidating endpoint access for org `{account_id}`"); let endpoints = self .account2ep .get(&account_id) .map(|kv| kv.value().clone()) .unwrap_or_default(); for endpoint_id in endpoints { self.ep_controls.invalidate(&endpoint_id); } } pub fn invalidate_role_secret_for_project( &self, project_id: ProjectIdInt, role_name: RoleNameInt, ) { info!( "invalidating role secret for project_id `{}` and role_name `{}`", project_id, role_name, ); let endpoints = self .project2ep .get(&project_id) .map(|kv| kv.value().clone()) .unwrap_or_default(); for endpoint_id in endpoints { self.role_controls.invalidate(&(endpoint_id, role_name)); } } } impl ProjectInfoCache { pub(crate) fn new(config: ProjectInfoCacheOptions) -> Self { Metrics::get().cache.capacity.set( CacheKind::ProjectInfoRoles, (config.size * config.max_roles) as i64, ); Metrics::get() .cache .capacity .set(CacheKind::ProjectInfoEndpoints, config.size as i64); // we cache errors for 30 seconds, unless retry_at is set. 
let expiry = CplaneExpiry::default(); Self { role_controls: Cache::builder() .name("project_info_roles") .eviction_listener(|_k, _v, cause| { eviction_listener(CacheKind::ProjectInfoRoles, cause); }) .max_capacity(config.size * config.max_roles) .time_to_live(config.ttl) .expire_after(expiry) .build(), ep_controls: Cache::builder() .name("project_info_endpoints") .eviction_listener(|_k, _v, cause| { eviction_listener(CacheKind::ProjectInfoEndpoints, cause); }) .max_capacity(config.size) .time_to_live(config.ttl) .expire_after(expiry) .build(), project2ep: ClashMap::new(), account2ep: ClashMap::new(), config, } } pub(crate) fn get_role_secret( &self, endpoint_id: &EndpointId, role_name: &RoleName, ) -> Option> { let endpoint_id = EndpointIdInt::get(endpoint_id)?; let role_name = RoleNameInt::get(role_name)?; count_cache_outcome( CacheKind::ProjectInfoRoles, self.role_controls.get(&(endpoint_id, role_name)), ) } pub(crate) fn get_endpoint_access( &self, endpoint_id: &EndpointId, ) -> Option> { let endpoint_id = EndpointIdInt::get(endpoint_id)?; count_cache_outcome( CacheKind::ProjectInfoEndpoints, self.ep_controls.get(&endpoint_id), ) } pub(crate) fn insert_endpoint_access( &self, account_id: Option, project_id: Option, endpoint_id: EndpointIdInt, role_name: RoleNameInt, controls: EndpointAccessControl, role_controls: RoleAccessControl, ) { if let Some(account_id) = account_id { self.insert_account2endpoint(account_id, endpoint_id); } if let Some(project_id) = project_id { self.insert_project2endpoint(project_id, endpoint_id); } debug!( key = &*endpoint_id, "created a cache entry for endpoint access" ); count_cache_insert(CacheKind::ProjectInfoEndpoints); count_cache_insert(CacheKind::ProjectInfoRoles); self.ep_controls.insert(endpoint_id, Ok(controls)); self.role_controls .insert((endpoint_id, role_name), Ok(role_controls)); } pub(crate) fn insert_endpoint_access_err( &self, endpoint_id: EndpointIdInt, role_name: RoleNameInt, msg: Box, ) { debug!( key = 
&*endpoint_id,
            "created a cache entry for an endpoint access error"
        );

        // RoleProtected is the only role-specific error that control plane can give us.
        // If a given role name does not exist, it still returns a successful response,
        // just with an empty secret.
        if msg.get_reason() != Reason::RoleProtected {
            // We can cache all the other errors in ep_controls because they don't
            // depend on what role name we pass to control plane.
            self.ep_controls
                .entry(endpoint_id)
                .and_compute_with(|entry| match entry {
                    // leave the entry alone if it's already Ok
                    Some(entry) if entry.value().is_ok() => moka::ops::compute::Op::Nop,
                    // replace the entry
                    _ => {
                        count_cache_insert(CacheKind::ProjectInfoEndpoints);
                        moka::ops::compute::Op::Put(Err(msg.clone()))
                    }
                });
        }

        // The per-role entry is always overwritten, even if it previously held a success.
        count_cache_insert(CacheKind::ProjectInfoRoles);
        self.role_controls
            .insert((endpoint_id, role_name), Err(msg));
    }

    /// Records that `endpoint_id` belongs to `project_id`.
    ///
    /// Goes through the map's entry API so that creating the set and adding to
    /// it happen while the entry guard is held. A separate `get_mut` followed by
    /// `insert` lets two concurrent callers both miss and then overwrite each
    /// other's freshly created set, silently dropping an endpoint from the
    /// index used for project-wide invalidation.
    fn insert_project2endpoint(&self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt) {
        self.project2ep
            .entry(project_id)
            .or_default()
            .insert(endpoint_id);
    }

    /// Records that `endpoint_id` belongs to `account_id`.
    ///
    /// Uses the entry API for the same reason as `insert_project2endpoint`:
    /// a check-then-insert sequence can lose an endpoint under concurrency.
    fn insert_account2endpoint(&self, account_id: AccountIdInt, endpoint_id: EndpointIdInt) {
        self.account2ep
            .entry(account_id)
            .or_default()
            .insert(endpoint_id);
    }

    /// Currently a no-op; both parameters are unused.
    pub fn maybe_invalidate_role_secret(&self, _endpoint_id: &EndpointId, _role_name: &RoleName) {
        // TODO: Expire the value early if the key is idle.
        // Currently not an issue as we would just use the TTL to decide, which is what already happens.
}

    /// Periodically runs the pending maintenance tasks of both caches.
    ///
    /// Ticks every `config.gc_interval`; the loop has no exit.
    pub async fn gc_worker(&self) -> anyhow::Result {
        let mut interval = tokio::time::interval(self.config.gc_interval);
        loop {
            interval.tick().await;
            self.ep_controls.run_pending_tasks();
            self.role_controls.run_pending_tasks();
        }
    }
}

#[cfg(test)]
mod tests {
    use std::sync::Arc;
    use std::time::Duration;

    use super::*;
    use crate::control_plane::messages::{Details, EndpointRateLimitConfig, ErrorInfo, Status};
    use crate::control_plane::{AccessBlockerFlags, AuthSecret};
    use crate::scram::ServerSecret;

    /// Checks the capacity bound (`size * max_roles`) and the TTL of the role cache.
    #[tokio::test]
    async fn test_project_info_cache_settings() {
        // size 1 * max_roles 2 => the role cache may hold at most 2 entries.
        let cache = ProjectInfoCache::new(ProjectInfoCacheOptions {
            size: 1,
            max_roles: 2,
            ttl: Duration::from_secs(1),
            gc_interval: Duration::from_secs(600),
        });
        let project_id: Option = Some(ProjectIdInt::from(&"project".into()));
        let endpoint_id: EndpointId = "endpoint".into();
        let account_id = None;

        let user1: RoleName = "user1".into();
        let user2: RoleName = "user2".into();
        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
        let secret2 = None;
        let allowed_ips = Arc::new(vec![
            "127.0.0.1".parse().unwrap(),
            "127.0.0.2".parse().unwrap(),
        ]);

        cache.insert_endpoint_access(
            account_id,
            project_id,
            (&endpoint_id).into(),
            (&user1).into(),
            EndpointAccessControl {
                allowed_ips: allowed_ips.clone(),
                allowed_vpce: Arc::new(vec![]),
                flags: AccessBlockerFlags::default(),
                rate_limits: EndpointRateLimitConfig::default(),
            },
            RoleAccessControl {
                secret: secret1.clone(),
            },
        );

        cache.insert_endpoint_access(
            account_id,
            project_id,
            (&endpoint_id).into(),
            (&user2).into(),
            EndpointAccessControl {
                allowed_ips: allowed_ips.clone(),
                allowed_vpce: Arc::new(vec![]),
                flags: AccessBlockerFlags::default(),
                rate_limits: EndpointRateLimitConfig::default(),
            },
            RoleAccessControl {
                secret: secret2.clone(),
            },
        );

        let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
        assert_eq!(cached.unwrap().secret, secret1);

        let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap();
        assert_eq!(cached.unwrap().secret, secret2);

        // Shouldn't add more than 2 roles.
        let user3: RoleName = "user3".into();
        let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32])));

        cache.role_controls.run_pending_tasks();
        cache.insert_endpoint_access(
            account_id,
            project_id,
            (&endpoint_id).into(),
            (&user3).into(),
            EndpointAccessControl {
                allowed_ips: allowed_ips.clone(),
                allowed_vpce: Arc::new(vec![]),
                flags: AccessBlockerFlags::default(),
                rate_limits: EndpointRateLimitConfig::default(),
            },
            RoleAccessControl {
                secret: secret3.clone(),
            },
        );

        cache.role_controls.run_pending_tasks();
        assert_eq!(cache.role_controls.entry_count(), 2);

        // Sleep past the 1-second TTL; all entries should then be gone.
        tokio::time::sleep(Duration::from_secs(2)).await;

        cache.role_controls.run_pending_tasks();
        assert_eq!(cache.role_controls.entry_count(), 0);
    }

    /// Checks which cache (per-role, per-endpoint) each kind of error lands in,
    /// and how successes and errors overwrite one another.
    #[tokio::test]
    async fn test_caching_project_info_errors() {
        let cache = ProjectInfoCache::new(ProjectInfoCacheOptions {
            size: 10,
            max_roles: 10,
            ttl: Duration::from_secs(1),
            gc_interval: Duration::from_secs(600),
        });
        let project_id = Some(ProjectIdInt::from(&"project".into()));
        let endpoint_id: EndpointId = "endpoint".into();
        let account_id = None;

        let user1: RoleName = "user1".into();
        let user2: RoleName = "user2".into();
        let secret = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));

        // Role-specific error: its reason is `Reason::RoleProtected`.
        let role_msg = Box::new(ControlPlaneErrorMessage {
            error: "role is protected and cannot be used for password-based authentication"
                .to_owned()
                .into_boxed_str(),
            http_status_code: http::StatusCode::NOT_FOUND,
            status: Some(Status {
                code: "PERMISSION_DENIED".to_owned().into_boxed_str(),
                message: "role is protected and cannot be used for password-based authentication"
                    .to_owned()
                    .into_boxed_str(),
                details: Details {
                    error_info: Some(ErrorInfo {
                        reason: Reason::RoleProtected,
                    }),
                    retry_info: None,
                    user_facing_message: None,
                },
            }),
        });

        // Error without any status details, i.e. not role-specific.
        let generic_msg = Box::new(ControlPlaneErrorMessage {
            error: "oh noes".to_owned().into_boxed_str(),
            http_status_code: http::StatusCode::NOT_FOUND,
            status: None,
        });

        let get_role_secret =
            |endpoint_id, role_name| cache.get_role_secret(endpoint_id, role_name).unwrap();
        let get_endpoint_access = |endpoint_id| cache.get_endpoint_access(endpoint_id).unwrap();

        // stores role-specific errors only for get_role_secret
        cache.insert_endpoint_access_err((&endpoint_id).into(), (&user1).into(), role_msg.clone());
        assert_eq!(
            get_role_secret(&endpoint_id, &user1).unwrap_err().error,
            role_msg.error
        );
        assert!(cache.get_endpoint_access(&endpoint_id).is_none());

        // stores non-role specific errors for both get_role_secret and get_endpoint_access
        cache.insert_endpoint_access_err(
            (&endpoint_id).into(),
            (&user1).into(),
            generic_msg.clone(),
        );
        assert_eq!(
            get_role_secret(&endpoint_id, &user1).unwrap_err().error,
            generic_msg.error
        );
        assert_eq!(
            get_endpoint_access(&endpoint_id).unwrap_err().error,
            generic_msg.error
        );

        // error isn't returned for other roles in the same endpoint
        assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());

        // success for a role does not overwrite errors for other roles
        cache.insert_endpoint_access(
            account_id,
            project_id,
            (&endpoint_id).into(),
            (&user2).into(),
            EndpointAccessControl {
                allowed_ips: Arc::new(vec![]),
                allowed_vpce: Arc::new(vec![]),
                flags: AccessBlockerFlags::default(),
                rate_limits: EndpointRateLimitConfig::default(),
            },
            RoleAccessControl {
                secret: secret.clone(),
            },
        );
        assert!(get_role_secret(&endpoint_id, &user1).is_err());
        assert!(get_role_secret(&endpoint_id, &user2).is_ok());
        // ...but does clear the access control error
        assert!(get_endpoint_access(&endpoint_id).is_ok());

        // storing an error does not overwrite successful access control response
        cache.insert_endpoint_access_err(
            (&endpoint_id).into(),
            (&user2).into(),
            generic_msg.clone(),
        );
        assert!(get_role_secret(&endpoint_id, &user2).is_err());
        assert!(get_endpoint_access(&endpoint_id).is_ok());
    }
}

================================================
FILE: proxy/src/cancellation.rs
================================================
use std::convert::Infallible;
use std::net::{IpAddr,
SocketAddr};
use std::pin::pin;
use std::sync::{Arc, OnceLock};
use std::time::Duration;

use futures::FutureExt;
use ipnet::{IpNet, Ipv4Net, Ipv6Net};
use postgres_client::RawCancelToken;
use postgres_client::tls::MakeTlsConnect;
use redis::{Cmd, FromRedisValue, SetExpiry, SetOptions, Value};
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tokio::net::TcpStream;
use tokio::time::timeout;
use tracing::{debug, error, info};

use crate::auth::AuthError;
use crate::auth::backend::ComputeUserInfo;
use crate::batch::{BatchQueue, BatchQueueError, QueueProcessing};
use crate::config::ComputeConfig;
use crate::context::RequestContext;
use crate::control_plane::ControlPlaneApi;
use crate::error::ReportableError;
use crate::ext::LockExt;
use crate::metrics::{CancelChannelSizeGuard, CancellationRequest, Metrics, RedisMsgKind};
use crate::pqproto::CancelKeyData;
use crate::rate_limiter::LeakyBucketRateLimiter;
use crate::redis::keys::KeyPrefix;
use crate::redis::kv_ops::{RedisKVClient, RedisKVClientError};
use crate::util::run_until;

// Key type for the cancellation rate limiter: the caller's IP truncated to a subnet.
type IpSubnetKey = IpNet;

/// Initial period and TTL is shorter to clear keys of short-lived connections faster.
const CANCEL_KEY_INITIAL_PERIOD: Duration = Duration::from_secs(60);
const CANCEL_KEY_REFRESH_PERIOD: Duration = Duration::from_secs(10 * 60);
/// `CANCEL_KEY_TTL_SLACK` is added to the periods to determine the actual TTL.
const CANCEL_KEY_TTL_SLACK: Duration = Duration::from_secs(30);

// Message types for sending through mpsc channel
pub enum CancelKeyOp {
    // Registered as a `SET` with an expiry of `expire` seconds.
    Store {
        key: CancelKeyData,
        value: Box,
        expire: Duration,
    },
    // Registered as an `EXPIRE` of `expire` seconds.
    Refresh {
        key: CancelKeyData,
        expire: Duration,
    },
    // Registered as a `GET`.
    Get {
        key: CancelKeyData,
    },
    // Registered as an `HGET` of field "data" (the older storage layout).
    GetOld {
        key: CancelKeyData,
    },
}

impl CancelKeyOp {
    /// Maps the operation to the Redis message kind used for metrics.
    const fn redis_msg_kind(&self) -> RedisMsgKind {
        match self {
            CancelKeyOp::Store { .. } => RedisMsgKind::Set,
            CancelKeyOp::Refresh { .. } => RedisMsgKind::Expire,
            CancelKeyOp::Get { .. } => RedisMsgKind::Get,
            CancelKeyOp::GetOld { .. } => RedisMsgKind::HGet,
        }
    }

    /// Creates the `cancel_channel_size` metric guard labelled with this operation's kind.
    fn cancel_channel_metric_guard(&self) -> CancelChannelSizeGuard<'static> {
        Metrics::get()
            .proxy
            .cancel_channel_size
            .guard(self.redis_msg_kind())
    }
}

#[derive(thiserror::Error, Debug, Clone)]
pub enum PipelineError {
    #[error("could not send cmd to redis: {0}")]
    RedisKVClient(Arc),
    #[error("incorrect number of responses from redis")]
    IncorrectNumberOfResponses,
}

/// A `redis::Pipeline` that also counts how many replies it expects.
pub struct Pipeline {
    inner: redis::Pipeline,
    // Number of commands added via `add_command`, one expected reply each.
    replies: usize,
}

impl Pipeline {
    /// Creates an empty pipeline with room for `n` commands.
    fn with_capacity(n: usize) -> Self {
        Self {
            inner: redis::Pipeline::with_capacity(n),
            replies: 0,
        }
    }

    /// Sends the pipeline and returns one value per registered command.
    ///
    /// Fails with `IncorrectNumberOfResponses` when the reply is not an array of
    /// exactly `replies` values, and with `RedisKVClient` when the query fails.
    async fn execute(self, client: &mut RedisKVClient) -> Result, PipelineError> {
        let responses = self.replies;
        let batch_size = self.inner.len();

        // NOTE(review): `CancellationProcessor::apply` performs the same check and
        // 5-second sleep right before calling this, so a client whose credentials
        // stay unrefreshed can wait twice. Confirm this is intended.
        if !client.credentials_refreshed() {
            tracing::debug!(
                "Redis credentials are not refreshed. Sleeping for 5 seconds before retrying..."
            );
            tokio::time::sleep(Duration::from_secs(5)).await;
        }

        match client.query(&self.inner).await {
            // for each reply, we expect that many values.
            Ok(Value::Array(values)) if values.len() == responses => {
                debug!(
                    batch_size,
                    responses, "successfully completed cancellation jobs",
                );
                Ok(values.into_iter().collect())
            }
            Ok(value) => {
                error!(batch_size, ?value, "unexpected redis return value");
                Err(PipelineError::IncorrectNumberOfResponses)
            }
            Err(err) => Err(PipelineError::RedisKVClient(Arc::new(err))),
        }
    }

    /// Appends `cmd` and bumps the expected reply count.
    fn add_command(&mut self, cmd: Cmd) {
        self.inner.add_command(cmd);
        self.replies += 1;
    }
}

impl CancelKeyOp {
    /// Adds the Redis command for this operation to `pipe`.
    ///
    /// Every variant builds its key via `KeyPrefix::Cancel(key)`. Expiries are
    /// passed in whole seconds (`Duration::as_secs`).
    fn register(&self, pipe: &mut Pipeline) {
        match self {
            CancelKeyOp::Store { key, value, expire } => {
                let key = KeyPrefix::Cancel(*key).build_redis_key();
                pipe.add_command(Cmd::set_options(
                    &key,
                    &**value,
                    SetOptions::default().with_expiration(SetExpiry::EX(expire.as_secs())),
                ));
            }
            CancelKeyOp::Refresh { key, expire } => {
                let key = KeyPrefix::Cancel(*key).build_redis_key();
                pipe.add_command(Cmd::expire(&key, expire.as_secs() as i64));
            }
            CancelKeyOp::GetOld { key } => {
                let key = KeyPrefix::Cancel(*key).build_redis_key();
                pipe.add_command(Cmd::hget(key, "data"));
            }
            CancelKeyOp::Get { key } => {
                let key = KeyPrefix::Cancel(*key).build_redis_key();
                pipe.add_command(Cmd::get(key));
            }
        }
    }
}

/// Batch processor that turns queued `CancelKeyOp`s into one Redis pipeline.
pub struct CancellationProcessor {
    pub client: RedisKVClient,
    // Returned unchanged from `batch_size`, regardless of queue length.
    pub batch_size: usize,
}

impl QueueProcessing for CancellationProcessor {
    type Req = (CancelChannelSizeGuard<'static>, CancelKeyOp);
    type Res = redis::Value;
    type Err = PipelineError;

    fn batch_size(&self, _queue_size: usize) -> usize {
        self.batch_size
    }

    /// Registers every operation of `batch` in a single pipeline and executes it.
    async fn apply(&mut self, batch: Vec) -> Result, Self::Err> {
        if !self.client.credentials_refreshed() {
            // this will cause a timeout for cancellation operations
            tracing::debug!(
                "Redis credentials are not refreshed. Sleeping for 5 seconds before retrying..."
            );
            tokio::time::sleep(Duration::from_secs(5)).await;
        }

        let mut pipeline = Pipeline::with_capacity(batch.len());

        let batch_size = batch.len();
        debug!(batch_size, "running cancellation jobs");
        // Only the op is registered; the metric guard stays inside `batch`.
        for (_, op) in &batch {
            op.register(&mut pipeline);
        }

        pipeline.execute(&mut self.client).await
    }
}

/// Enables serving `CancelRequest`s.
///
/// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances.
pub struct CancellationHandler { compute_config: &'static ComputeConfig, // rate limiter of cancellation requests limiter: Arc>>, tx: OnceLock>, // send messages to the redis KV client task } #[derive(Debug, Error)] pub(crate) enum CancelError { #[error("{0}")] IO(#[from] std::io::Error), #[error("{0}")] Postgres(#[from] postgres_client::Error), #[error("rate limit exceeded")] RateLimit, #[error("Authentication error")] AuthError(#[from] AuthError), #[error("key not found")] NotFound, #[error("proxy service error")] InternalError, } impl ReportableError for CancelError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { CancelError::IO(_) => crate::error::ErrorKind::Compute, CancelError::Postgres(e) if e.as_db_error().is_some() => { crate::error::ErrorKind::Postgres } CancelError::Postgres(_) => crate::error::ErrorKind::Compute, CancelError::RateLimit => crate::error::ErrorKind::RateLimit, CancelError::NotFound | CancelError::AuthError(_) => crate::error::ErrorKind::User, CancelError::InternalError => crate::error::ErrorKind::Service, } } } impl CancellationHandler { pub fn new(compute_config: &'static ComputeConfig) -> Self { Self { compute_config, tx: OnceLock::new(), limiter: Arc::new(std::sync::Mutex::new( LeakyBucketRateLimiter::::new_with_shards( LeakyBucketRateLimiter::::DEFAULT, 64, ), )), } } pub fn init_tx(&self, queue: BatchQueue) { self.tx .set(queue) .map_err(|_| {}) .expect("cancellation queue should be registered once"); } pub(crate) fn get_key(self: Arc) -> Session { // we intentionally generate a random "backend pid" and "secret key" here. // we use the corresponding u64 as an identifier for the // actual endpoint+pid+secret for postgres/pgbouncer. // // if we forwarded the backend_pid from postgres to the client, there would be a lot // of overlap between our computes as most pids are small (~100). 
let key: CancelKeyData = rand::random(); debug!("registered new query cancellation key {key}"); Session { key, cancellation_handler: self, } } /// This is not cancel safe async fn get_cancel_key( &self, key: CancelKeyData, ) -> Result, CancelError> { const TIMEOUT: Duration = Duration::from_secs(5); let Some(tx) = self.tx.get() else { tracing::warn!("cancellation handler is not available"); return Err(CancelError::InternalError); }; let guard = Metrics::get() .proxy .cancel_channel_size .guard(RedisMsgKind::Get); let op = CancelKeyOp::Get { key }; let result = timeout( TIMEOUT, tx.call((guard, op), std::future::pending::()), ) .await .map_err(|_| { tracing::warn!("timed out waiting to receive GetCancelData response"); CancelError::RateLimit })?; // We may still have cancel keys set with HSET "data". // Check error type and retry with HGET. // TODO: remove code after HSET is not used anymore. let result = if let Err(err) = result.as_ref() && let BatchQueueError::Result(err) = err && let PipelineError::RedisKVClient(err) = err && let RedisKVClientError::Redis(err) = &**err && let Some(errcode) = err.code() && errcode == "WRONGTYPE" { let guard = Metrics::get() .proxy .cancel_channel_size .guard(RedisMsgKind::HGet); let op = CancelKeyOp::GetOld { key }; timeout( TIMEOUT, tx.call((guard, op), std::future::pending::()), ) .await .map_err(|_| { tracing::warn!("timed out waiting to receive GetCancelData response"); CancelError::RateLimit })? 
} else { result }; let result = result.map_err(|e| { tracing::warn!("failed to receive GetCancelData response: {e}"); CancelError::InternalError })?; let cancel_state_str = String::from_owned_redis_value(result).map_err(|e| { tracing::warn!("failed to receive GetCancelData response: {e}"); CancelError::InternalError })?; let cancel_closure: CancelClosure = serde_json::from_str(&cancel_state_str).map_err(|e| { tracing::warn!("failed to deserialize cancel state: {e}"); CancelError::InternalError })?; Ok(Some(cancel_closure)) } /// Try to cancel a running query for the corresponding connection. /// If the cancellation key is not found, it will be published to Redis. /// check_allowed - if true, check if the IP is allowed to cancel the query. /// Will fetch IP allowlist internally. /// /// return Result primarily for tests /// /// This is not cancel safe pub(crate) async fn cancel_session( &self, key: CancelKeyData, ctx: RequestContext, check_ip_allowed: bool, check_vpc_allowed: bool, auth_backend: &T, ) -> Result<(), CancelError> { let subnet_key = match ctx.peer_addr() { IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), }; let allowed = { let rate_limit_config = None; let limiter = self.limiter.lock_propagate_poison(); limiter.check(subnet_key, rate_limit_config, 1) }; if !allowed { // log only the subnet part of the IP address to know which subnet is rate limited tracing::warn!("Rate limit exceeded. 
Skipping cancellation message, {subnet_key}"); Metrics::get() .proxy .cancellation_requests_total .inc(CancellationRequest { kind: crate::metrics::CancellationOutcome::RateLimitExceeded, }); return Err(CancelError::RateLimit); } let cancel_state = self.get_cancel_key(key).await.map_err(|e| { tracing::warn!("failed to receive RedisOp response: {e}"); CancelError::InternalError })?; let Some(cancel_closure) = cancel_state else { tracing::warn!("query cancellation key not found: {key}"); Metrics::get() .proxy .cancellation_requests_total .inc(CancellationRequest { kind: crate::metrics::CancellationOutcome::NotFound, }); return Err(CancelError::NotFound); }; let info = &cancel_closure.user_info; let access_controls = auth_backend .get_endpoint_access_control(&ctx, &info.endpoint, &info.user) .await .map_err(|e| CancelError::AuthError(e.into()))?; access_controls.check(&ctx, check_ip_allowed, check_vpc_allowed)?; Metrics::get() .proxy .cancellation_requests_total .inc(CancellationRequest { kind: crate::metrics::CancellationOutcome::Found, }); info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query(self.compute_config).await } } /// This should've been a [`std::future::Future`], but /// it's impossible to name a type of an unboxed future /// (we'd need something like `#![feature(type_alias_impl_trait)]`). #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CancelClosure { pub socket_addr: SocketAddr, pub cancel_token: RawCancelToken, pub hostname: String, // for pg_sni router pub user_info: ComputeUserInfo, } impl CancelClosure { /// Cancels the query running on user's compute node. 
pub(crate) async fn try_cancel_query(
        &self,
        compute_config: &ComputeConfig,
    ) -> Result<(), CancelError> {
        // Open a fresh TCP connection to the stored compute address.
        let socket = TcpStream::connect(self.socket_addr).await?;

        let tls = <_ as MakeTlsConnect>::make_tls_connect(
            compute_config,
            &self.hostname,
        )
        // The TLS setup error is stringified and reported as an IO error.
        .map_err(|e| CancelError::IO(std::io::Error::other(e.to_string())))?;

        self.cancel_token.cancel_query_raw(socket, tls).await?;
        debug!("query was cancelled");
        Ok(())
    }
}

/// Helper for registering query cancellation tokens.
pub(crate) struct Session {
    /// The user-facing key identifying this session.
    key: CancelKeyData,
    cancellation_handler: Arc,
}

impl Session {
    /// Returns the cancellation key of this session.
    pub(crate) fn key(&self) -> &CancelKeyData {
        &self.key
    }

    /// Ensure the cancel key is continuously refreshed,
    /// but stop when the channel is dropped.
    ///
    /// The key is first stored with the short initial TTL and afterwards
    /// refreshed with the longer refresh TTL. Once `cancel` resolves, the loop
    /// ends and the query is cancelled on compute via `cancel_closure`.
    ///
    /// This is not cancel safe
    pub(crate) async fn maintain_cancel_key(
        &self,
        session_id: uuid::Uuid,
        cancel: tokio::sync::oneshot::Receiver,
        cancel_closure: &CancelClosure,
        compute_config: &ComputeConfig,
    ) {
        let Some(tx) = self.cancellation_handler.tx.get() else {
            tracing::warn!("cancellation handler is not available");
            // don't exit, as we only want to exit if cancelled externally.
            std::future::pending().await
        };

        // Serialised once up front; cloned for each (re-)store.
        let closure_json = serde_json::to_string(&cancel_closure)
            .expect("serialising to json string should not fail")
            .into_boxed_str();

        let mut cancel = pin!(cancel);

        // `Init`: the key still has to be stored. `Refresh`: only its TTL is extended.
        enum State {
            Init,
            Refresh,
        }

        let mut state = State::Init;
        loop {
            // Pick the operation and how long to wait before the next iteration.
            let (op, mut wait_interval) = match state {
                State::Init => {
                    tracing::debug!(
                        src=%self.key,
                        dest=?cancel_closure.cancel_token,
                        "registering cancellation key"
                    );
                    (
                        CancelKeyOp::Store {
                            key: self.key,
                            value: closure_json.clone(),
                            expire: CANCEL_KEY_INITIAL_PERIOD + CANCEL_KEY_TTL_SLACK,
                        },
                        CANCEL_KEY_INITIAL_PERIOD,
                    )
                }

                State::Refresh => {
                    tracing::debug!(
                        src=%self.key,
                        dest=?cancel_closure.cancel_token,
                        "refreshing cancellation key"
                    );
                    (
                        CancelKeyOp::Refresh {
                            key: self.key,
                            expire: CANCEL_KEY_REFRESH_PERIOD + CANCEL_KEY_TTL_SLACK,
                        },
                        CANCEL_KEY_REFRESH_PERIOD,
                    )
                }
            };

            match tx
                .call((op.cancel_channel_metric_guard(), op), cancel.as_mut())
                .await
            {
                // SET returns OK
                Ok(Value::Okay) => {
                    tracing::debug!(
                        src=%self.key,
                        dest=?cancel_closure.cancel_token,
                        "registered cancellation key"
                    );
                    state = State::Refresh;
                }

                // EXPIRE returns 1
                Ok(Value::Int(1)) => {
                    tracing::debug!(
                        src=%self.key,
                        dest=?cancel_closure.cancel_token,
                        "refreshed cancellation key"
                    );
                }

                Ok(_) => {
                    // Any other response likely means the key expired.
                    tracing::warn!(src=%self.key, "refreshing cancellation key failed");
                    // Re-enter the SET loop quickly to repush full data.
                    state = State::Init;
                    wait_interval = Duration::ZERO;
                }

                // retry the same operation after a short delay; the state is unchanged.
                Err(BatchQueueError::Result(error)) => {
                    tracing::warn!(?error, "error refreshing cancellation key");
                    // Small delay to prevent busy loop with high cpu and logging.
                    wait_interval = Duration::from_millis(10);
                }

                Err(BatchQueueError::Cancelled(Err(_cancelled))) => break,
            }

            // wait before continuing. break immediately if cancelled.
            if run_until(tokio::time::sleep(wait_interval), cancel.as_mut())
                .await
                .is_err()
            {
                break;
            }
        }

        // Reached only via the `break`s above; forward the cancellation to compute.
        if let Err(err) = cancel_closure
            .try_cancel_query(compute_config)
            .boxed()
            .await
        {
            tracing::warn!(
                ?session_id,
                ?err,
                "could not cancel the query in the database"
            );
        }
    }
}

================================================
FILE: proxy/src/compute/mod.rs
================================================
mod tls;

use std::fmt::Debug;
use std::io;
use std::net::{IpAddr, SocketAddr};

use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;
use postgres_client::config::{AuthKeys, ChannelBinding, SslMode};
use postgres_client::connect_raw::StartupStream;
use postgres_client::error::SqlState;
use postgres_client::maybe_tls_stream::MaybeTlsStream;
use postgres_client::tls::MakeTlsConnect;
use thiserror::Error;
use tokio::net::{TcpStream, lookup_host};
use tracing::{debug, error, info, warn};

use crate::auth::backend::ComputeCredentialKeys;
use crate::auth::parse_endpoint_param;
use crate::compute::tls::TlsError;
use crate::config::ComputeConfig;
use crate::context::RequestContext;
use crate::control_plane::client::ApiLockError;
use crate::control_plane::errors::WakeComputeError;
use crate::control_plane::messages::MetricsAuxInfo;
use crate::error::{ErrorKind, ReportableError, UserFacingError};
use crate::metrics::{Metrics, NumDbConnectionsGuard};
use crate::pqproto::StartupMessageParams;
use crate::proxy::connect_compute::TlsNegotiation;
use crate::proxy::neon_option;
use crate::types::Host;

// Common prefix of the connection error messages defined below.
pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";

#[derive(Debug, Error)]
pub(crate) enum PostgresError {
    /// This error doesn't seem to reveal any secrets; for instance,
    /// `postgres_client::error::Kind` doesn't contain ip addresses and such.
#[error("{COULD_NOT_CONNECT}: {0}")]
    Postgres(#[from] postgres_client::Error),
}

impl UserFacingError for PostgresError {
    fn to_string_client(&self) -> String {
        match self {
            // This helps us drop irrelevant library-specific prefixes.
            // TODO: propagate severity level and other parameters.
            PostgresError::Postgres(err) => match err.as_db_error() {
                Some(err) => {
                    let msg = err.message();

                    // Append a hint to the two "unsupported startup parameter" messages.
                    if msg.starts_with("unsupported startup parameter: ")
                        || msg.starts_with("unsupported startup parameter in options: ")
                    {
                        format!(
                            "{msg}. Please use unpooled connection or remove this parameter from the startup package. More details: https://neon.tech/docs/connect/connection-errors#unsupported-startup-parameter"
                        )
                    } else {
                        msg.to_owned()
                    }
                }
                None => err.to_string(),
            },
        }
    }
}

impl ReportableError for PostgresError {
    fn get_error_kind(&self) -> ErrorKind {
        match self {
            PostgresError::Postgres(err) => match err.as_db_error() {
                // A database error with this SQLSTATE is attributed to the user.
                Some(err) if err.code() == &SqlState::INVALID_CATALOG_NAME => ErrorKind::User,
                Some(_) => ErrorKind::Postgres,
                None => ErrorKind::Compute,
            },
        }
    }
}

/// Errors raised while establishing a connection to compute.
#[derive(Debug, Error)]
pub(crate) enum ConnectionError {
    #[error("{COULD_NOT_CONNECT}: {0}")]
    TlsError(#[from] TlsError),

    #[error("{COULD_NOT_CONNECT}: {0}")]
    WakeComputeError(#[from] WakeComputeError),

    #[error("error acquiring resource permit: {0}")]
    TooManyConnectionAttempts(#[from] ApiLockError),

    #[cfg(test)]
    #[error("retryable: {retryable}, wakeable: {wakeable}, kind: {kind:?}")]
    TestError {
        retryable: bool,
        wakeable: bool,
        kind: crate::error::ErrorKind,
    },
}

impl UserFacingError for ConnectionError {
    fn to_string_client(&self) -> String {
        match self {
            ConnectionError::WakeComputeError(err) => err.to_string_client(),
            ConnectionError::TooManyConnectionAttempts(_) => {
                "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
            }
            // The underlying TLS/IO error is not included in the client-facing text.
            ConnectionError::TlsError(_) => COULD_NOT_CONNECT.to_owned(),
            #[cfg(test)]
            ConnectionError::TestError { .. } => self.to_string(),
        }
    }
}

impl ReportableError for ConnectionError {
    fn get_error_kind(&self) -> ErrorKind {
        match self {
            ConnectionError::TlsError(_) => ErrorKind::Compute,
            ConnectionError::WakeComputeError(e) => e.get_error_kind(),
            ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(),
            #[cfg(test)]
            ConnectionError::TestError { kind, .. } => *kind,
        }
    }
}

/// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`.
pub(crate) type ScramKeys = postgres_client::config::ScramKeys<32>;

#[derive(Clone)]
pub enum Auth {
    /// Only used during console-redirect.
    Password(Vec),
    /// Used by sql-over-http, ws, tcp.
    Scram(Box),
}

/// A config for authenticating to the compute node.
pub(crate) struct AuthInfo {
    /// None for local-proxy, as we use trust-based localhost auth.
    /// Some for sql-over-http, ws, tcp, and in most cases for console-redirect.
    /// Might be None for console-redirect, but that's only a consequence of testing environments ATM.
    auth: Option,
    server_params: StartupMessageParams,
    channel_binding: ChannelBinding,

    /// Console redirect sets user and database, we shouldn't re-use those from the params.
    skip_db_user: bool,
}

/// Contains only the data needed to establish a secure connection to compute.
#[derive(Clone)]
pub struct ConnectInfo {
    pub host_addr: Option,
    pub host: Host,
    pub port: u16,
    pub ssl_mode: SslMode,
}

/// Creation and initialization routines.
impl AuthInfo {
    /// Builds auth info for the console-redirect flow.
    ///
    /// `db` and `user` are stored as the `database` / `user` startup params and
    /// `skip_db_user` is set, so `set_startup_params` will not override them.
    pub(crate) fn for_console_redirect(db: &str, user: &str, pw: Option<&str>) -> Self {
        let mut server_params = StartupMessageParams::default();
        server_params.insert("database", db);
        server_params.insert("user", user);
        Self {
            auth: pw.map(|pw| Auth::Password(pw.as_bytes().to_owned())),
            server_params,
            skip_db_user: true,
            // pg-sni-router is a mitm so this would fail.
            channel_binding: ChannelBinding::Disable,
        }
    }

    /// Builds auth info from credential keys.
    ///
    /// SCRAM keys become `Auth::Scram`; a JWT payload results in no auth.
    pub(crate) fn with_auth_keys(keys: ComputeCredentialKeys) -> Self {
        Self {
            auth: match keys {
                ComputeCredentialKeys::AuthKeys(AuthKeys::ScramSha256(auth_keys)) => {
                    Some(Auth::Scram(Box::new(auth_keys)))
                }
                ComputeCredentialKeys::JwtPayload(_) => None,
            },
            server_params: StartupMessageParams::default(),
            skip_db_user: false,
            channel_binding: ChannelBinding::Prefer,
        }
    }
}

impl ConnectInfo {
    /// Converts the connection info into a `postgres_client::Config`
    /// (host, port, SSL mode and, when known, the resolved host address).
    pub fn to_postgres_client_config(&self) -> postgres_client::Config {
        let mut config = postgres_client::Config::new(self.host.to_string(), self.port);
        config.ssl_mode(self.ssl_mode);
        if let Some(host_addr) = self.host_addr {
            config.set_host_addr(host_addr);
        }
        config
    }
}

impl AuthInfo {
    /// Applies credentials, channel binding and all stored startup params to `config`.
    fn enrich(&self, mut config: postgres_client::Config) -> postgres_client::Config {
        // The arms only differ in side effects on `config`; the match value is discarded.
        match &self.auth {
            Some(Auth::Scram(keys)) => config.auth_keys(AuthKeys::ScramSha256(**keys)),
            Some(Auth::Password(pw)) => config.password(pw),
            None => &mut config,
        };
        config.channel_binding(self.channel_binding);
        for (k, v) in self.server_params.iter() {
            config.set_param(k, v);
        }
        config
    }

    /// Apply startup message params to the connection config.
    ///
    /// With `arbitrary_params == false`, `client_encoding` is set to `UTF8` and
    /// only the explicitly listed parameters are forwarded.
    pub(crate) fn set_startup_params(
        &mut self,
        params: &StartupMessageParams,
        arbitrary_params: bool,
    ) {
        if !arbitrary_params {
            self.server_params.insert("client_encoding", "UTF8");
        }
        for (k, v) in params.iter() {
            match k {
                // Only set `user` if it's not present in the config.
                // Console redirect auth flow takes username from the console's response.
                "user" | "database" if self.skip_db_user => {}
                // `options` is forwarded only if something remains after filtering.
                "options" => {
                    if let Some(options) = filtered_options(v) {
                        self.server_params.insert(k, &options);
                    }
                }
                "user" | "database" | "application_name" | "replication" => {
                    self.server_params.insert(k, v);
                }

                // if we allow arbitrary params, then we forward them through.
                // this is a flag for a period of backwards compatibility
                k if arbitrary_params => {
                    self.server_params.insert(k, v);
                }
                _ => {}
            }
        }
    }

    /// Authenticates against an already connected compute stream.
    ///
    /// Time spent here is recorded as waiting on compute via the latency timer pause.
    pub async fn authenticate(
        &self,
        ctx: &RequestContext,
        compute: &mut ComputeConnection,
    ) -> Result<(), PostgresError> {
        // client config with stubbed connect info.
        // TODO(conrad): should we rewrite this to bypass tokio-postgres2 entirely,
        // utilising pqproto.rs.
        let mut tmp_config = postgres_client::Config::new(String::new(), 0);
        // We have already established SSL if necessary.
        tmp_config.ssl_mode(SslMode::Disable);
        let tmp_config = self.enrich(tmp_config);

        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
        tmp_config.authenticate(&mut compute.stream).await?;
        drop(pause);

        Ok(())
    }
}

impl ConnectInfo {
    /// Establish a raw TCP+TLS connection to the compute node.
    async fn connect_raw(
        &self,
        config: &ComputeConfig,
        tls: TlsNegotiation,
    ) -> Result<(SocketAddr, MaybeTlsStream), TlsError> {
        let timeout = config.timeout;

        // wrap TcpStream::connect with timeout
        let connect_with_timeout = |addrs| {
            tokio::time::timeout(timeout, TcpStream::connect(addrs)).map(move |res| match res {
                Ok(tcpstream_connect_res) => tcpstream_connect_res,
                Err(_) => Err(io::Error::new(
                    io::ErrorKind::TimedOut,
                    format!("exceeded connection timeout {timeout:?}"),
                )),
            })
        };

        let connect_once = |addrs| {
            debug!("trying to connect to compute node at {addrs:?}");
            connect_with_timeout(addrs).and_then(|stream| async {
                let socket_addr = stream.peer_addr()?;
                let socket = socket2::SockRef::from(&stream);
                // Disable Nagle's algorithm to not introduce latency between
                // client and compute.
                socket.set_nodelay(true)?;
                // This prevents load balancer from severing the connection.
                socket.set_keepalive(true)?;
                Ok((socket_addr, stream))
            })
        };

        // We can't reuse connection establishing logic from `postgres_client` here,
        // because it has no means for extracting the underlying socket which we
        // require for our business.
        let port = self.port;
        let host = &*self.host;

        // Use the known address if there is one; otherwise resolve the host name.
        let addrs = match self.host_addr {
            Some(addr) => vec![SocketAddr::new(addr, port)],
            None => lookup_host((host, port)).await?.collect(),
        };

        match connect_once(&*addrs).await {
            Ok((sockaddr, stream)) => Ok((
                sockaddr,
                tls::connect_tls(stream, self.ssl_mode, config, host, tls).await?,
            )),
            Err(err) => {
                warn!("couldn't connect to compute node at {host}:{port}: {err}");
                Err(TlsError::Connection(err))
            }
        }
    }
}

pub type RustlsStream = >::Stream;
pub type MaybeRustlsStream = MaybeTlsStream;

pub struct ComputeConnection {
    /// Socket connected to a compute node.
    pub stream: StartupStream,
    /// Labels for proxy's metrics.
    pub aux: MetricsAuxInfo,
    pub hostname: Host,
    pub ssl_mode: SslMode,
    pub socket_addr: SocketAddr,
    pub guage: NumDbConnectionsGuard<'static>,
}

impl ConnectInfo {
    /// Connect to a corresponding compute node.
    ///
    /// Records `compute_id` on the current tracing span, logs the connection
    /// details, and returns the stream with a `db_connections` metric guard.
    pub async fn connect(
        &self,
        ctx: &RequestContext,
        aux: &MetricsAuxInfo,
        config: &ComputeConfig,
        tls: TlsNegotiation,
    ) -> Result {
        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
        let (socket_addr, stream) = self.connect_raw(config, tls).await?;
        drop(pause);

        tracing::Span::current().record("compute_id", tracing::field::display(&aux.compute_id));

        // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?)
        info!(
            cold_start_info = ctx.cold_start_info().as_str(),
            "connected to compute node at {} ({socket_addr}) sslmode={:?}, latency={}, query_id={}",
            self.host,
            self.ssl_mode,
            ctx.get_proxy_latency(),
            ctx.get_testodrome_id().unwrap_or_default(),
        );

        let stream = StartupStream::new(stream);
        let connection = ComputeConnection {
            stream,
            socket_addr,
            hostname: self.host.clone(),
            ssl_mode: self.ssl_mode,
            aux: aux.clone(),
            guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()),
        };

        Ok(connection)
    }
}

/// Retrieve `options` from a startup message, dropping all proxy-specific flags.
fn filtered_options(options: &str) -> Option {
    // Drop every option that `parse_endpoint_param` or `neon_option` recognises;
    // whatever remains is forwarded.
    let forwarded = StartupMessageParams::parse_options_raw(options)
        .filter(|opt| !(parse_endpoint_param(opt).is_some() || neon_option(opt).is_some()));

    // Re-join the surviving options with single spaces.
    #[allow(unstable_name_collisions)]
    let joined: String = forwarded
        .intersperse(" ") // TODO: use impl from std once it's stabilized
        .collect();

    // Don't even bother with empty options.
    if joined.is_empty() { None } else { Some(joined) }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_filtered_options() {
        // Empty options is unlikely to be useful anyway.
        let params = "";
        assert_eq!(filtered_options(params), None);

        // It's likely that clients will only use options to specify endpoint/project.
        let params = "project=foo";
        assert_eq!(filtered_options(params), None);

        // Same, because unescaped whitespaces are no-op.
        let params = " project=foo ";
        assert_eq!(filtered_options(params).as_deref(), None);

        let params = r"\ project=foo \ ";
        assert_eq!(filtered_options(params).as_deref(), Some(r"\ \ "));

        let params = "project = foo";
        assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));

        let params = "project = foo neon_endpoint_type:read_write neon_lsn:0/2 neon_proxy_params_compat:true";
        assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));
    }
}

================================================
FILE: proxy/src/compute/tls.rs
================================================
use futures::FutureExt;
use postgres_client::config::SslMode;
use postgres_client::maybe_tls_stream::MaybeTlsStream;
use postgres_client::tls::{MakeTlsConnect, TlsConnect};
use rustls::pki_types::InvalidDnsNameError;
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite};

use crate::pqproto::request_tls;
use crate::proxy::connect_compute::TlsNegotiation;
use crate::proxy::retry::CouldRetry;

/// Errors produced while establishing a (possibly TLS) connection to compute.
#[derive(Debug, Error)]
pub enum TlsError {
    #[error(transparent)]
    Dns(#[from] InvalidDnsNameError),
    #[error(transparent)]
    Connection(#[from] std::io::Error),
    #[error("TLS required but not provided")]
    Required,
}

impl CouldRetry for TlsError {
    /// DNS-name errors are never retried, connection errors defer to the
    /// wrapped IO error, and a refused TLS request is retried.
    fn could_retry(&self) -> bool {
        match self {
            TlsError::Dns(_) => false,
            TlsError::Connection(err) => err.could_retry(),
            // perhaps compute didn't realise it supports TLS?
            TlsError::Required => true,
        }
    }
}

/// Negotiate TLS on `stream` according to `mode` and `negotiation`.
///
/// Returns the raw stream when `mode` is `Disable`, or when the Postgres-style
/// TLS request is declined and TLS is only preferred. Fails with
/// [`TlsError::Required`] when the request is declined but `mode` is `Require`.
/// Otherwise performs the TLS handshake for `host` and returns the TLS stream.
//
// NOTE(review): the generic parameter list and parts of the return type and
// bounds appear to be missing from this listing; confirm against the repository.
pub async fn connect_tls(
    mut stream: S,
    mode: SslMode,
    tls: &T,
    host: &str,
    negotiation: TlsNegotiation,
) -> Result, TlsError>
where
    S: AsyncRead + AsyncWrite + Unpin + Send,
    T: MakeTlsConnect<
            S,
            Error = InvalidDnsNameError,
            TlsConnect: TlsConnect,
        >,
{
    // Nothing to negotiate when TLS is disabled.
    match mode {
        SslMode::Disable => return Ok(MaybeTlsStream::Raw(stream)),
        SslMode::Prefer | SslMode::Require => {}
    }

    // Arm order matters: the guarded `Postgres` arms are tried top to bottom,
    // and `request_tls` is only awaited once.
    match negotiation {
        // No TLS request needed
        TlsNegotiation::Direct => {}
        // TLS request successful
        TlsNegotiation::Postgres if request_tls(&mut stream).await? => {}
        // TLS request failed but is required
        TlsNegotiation::Postgres if SslMode::Require == mode => return Err(TlsError::Required),
        // TLS request failed but is not required
        TlsNegotiation::Postgres => return Ok(MaybeTlsStream::Raw(stream)),
    }

    Ok(MaybeTlsStream::Tls(
        tls.make_tls_connect(host)?.connect(stream).boxed().await?,
    ))
}

================================================
FILE: proxy/src/compute_ctl/mod.rs
================================================
use compute_api::responses::GenericAPIError;
use hyper::{Method, StatusCode};
use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize};
use thiserror::Error;

use crate::http;
use crate::types::{DbName, RoleName};
use crate::url::ApiUrl;

/// Thin client over an HTTP endpoint; requests are issued through `api`.
pub struct ComputeCtlApi {
    pub(crate) api: http::Endpoint,
}

/// JSON body sent by `ComputeCtlApi::install_extension`.
#[derive(Serialize, Debug)]
pub struct ExtensionInstallRequest {
    pub extension: &'static str,
    pub database: DbName,
    pub version: &'static str,
}

/// JSON body sent by `ComputeCtlApi::grant_role`.
#[derive(Serialize, Debug)]
pub struct SetRoleGrantsRequest {
    pub database: DbName,
    pub schema: &'static str,
    pub privileges: Vec,
    pub role: RoleName,
}

/// Response to an extension install request; no fields are read from it.
#[derive(Clone, Debug, Deserialize)]
pub struct ExtensionInstallResponse {}

/// Response to a role grant request; no fields are read from it.
#[derive(Clone, Debug, Deserialize)]
pub struct SetRoleGrantsResponse {}
/// Privileges that can be granted; serialized in upper case (`USAGE`).
#[derive(Debug, Serialize, Deserialize, Clone, Copy)]
#[serde(rename_all = "UPPERCASE")]
pub enum Privilege {
    Usage,
}

/// Failure modes of a request issued through [`ComputeCtlApi`].
#[derive(Error, Debug)]
pub enum ComputeCtlError {
    // Sending the request failed.
    #[error("connection error: {0}")]
    Connection(#[source] reqwest_middleware::Error),
    // The server answered with a 4xx/5xx status; `body` is the decoded error
    // body when it could be parsed.
    #[error("request error [{status}]: {body:?}")]
    Request {
        status: StatusCode,
        body: Option,
    },
    // The response body could not be decoded.
    #[error("response parsing error: {0}")]
    Response(#[source] reqwest::Error),
}

impl ComputeCtlApi {
    /// POST `req` to the `extensions` path segment of the API URL.
    pub async fn install_extension(
        &self,
        req: &ExtensionInstallRequest,
    ) -> Result {
        self.generic_request(req, Method::POST, |url| {
            url.path_segments_mut().push("extensions");
        })
        .await
    }

    /// POST `req` to the `grants` path segment of the API URL.
    pub async fn grant_role(
        &self,
        req: &SetRoleGrantsRequest,
    ) -> Result {
        self.generic_request(req, Method::POST, |url| {
            url.path_segments_mut().push("grants");
        })
        .await
    }

    /// Send `req` as JSON using `method`, letting `url` adjust the request URL,
    /// and decode the JSON response.
    ///
    /// A 4xx/5xx status becomes `ComputeCtlError::Request`; its body is decoded
    /// on a best-effort basis (`None` when decoding fails).
    async fn generic_request(
        &self,
        req: &Req,
        method: Method,
        url: impl for<'a> FnOnce(&'a mut ApiUrl),
    ) -> Result
    where
        Req: Serialize,
        Resp: DeserializeOwned,
    {
        let resp = self
            .api
            .request_with_url(method, url)
            .json(req)
            .send()
            .await
            .map_err(ComputeCtlError::Connection)?;

        let status = resp.status();
        if status.is_client_error() || status.is_server_error() {
            let body = resp.json().await.ok();
            return Err(ComputeCtlError::Request { status, body });
        }

        resp.json().await.map_err(ComputeCtlError::Response)
    }
}

================================================
FILE: proxy/src/config.rs
================================================
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;

use anyhow::{Context, Ok, bail, ensure};
use arc_swap::ArcSwapOption;
use camino::{Utf8Path, Utf8PathBuf};
use clap::ValueEnum;
use compute_api::spec::LocalProxySpec;
use remote_storage::RemoteStorageConfig;
use thiserror::Error;
use tokio::sync::Notify;
use tracing::{debug, error, info, warn};

use crate::auth::backend::jwt::JwkCache;
use crate::auth::backend::local::JWKS_ROLE_MAP;
use crate::control_plane::locks::ApiLocks;
use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings};
use crate::ext::TaskExt;
use crate::intern::RoleNameInt;
use crate::rate_limiter::{RateLimitAlgorithm, RateLimiterConfig};
use crate::scram;
use crate::serverless::GlobalConnPoolOptions;
use crate::serverless::cancel_set::CancelSet;
#[cfg(feature = "rest_broker")]
use crate::serverless::rest::DbSchemaCache;
pub use crate::tls::server_config::{TlsConfig, configure_tls};
use crate::types::{Host, RoleName};

/// Top-level proxy configuration.
pub struct ProxyConfig {
    // Swappable TLS config; replaced by `refresh_config_inner` when the local
    // proxy spec carries TLS paths.
    pub tls_config: ArcSwapOption,
    pub metric_collection: Option,
    pub http_config: HttpConfig,
    pub authentication_config: AuthenticationConfig,
    #[cfg(feature = "rest_broker")]
    pub rest_config: RestConfig,
    pub proxy_protocol_v2: ProxyProtocolV2,
    pub handshake_timeout: Duration,
    pub wake_compute_retry_config: RetryConfig,
    pub connect_compute_locks: ApiLocks,
    pub connect_to_compute: ComputeConfig,
    pub greetings: String, // Greeting message sent to the client after connection establishment and contains session_id.
    #[cfg(feature = "testing")]
    pub disable_pg_session_jwt: bool,
}

/// Settings used when connecting to a compute node.
pub struct ComputeConfig {
    pub retry: RetryConfig,
    pub tls: Arc,
    // Upper bound on establishing the TCP connection.
    pub timeout: Duration,
}

/// How the PROXY protocol v2 header is treated on incoming connections.
#[derive(Copy, Clone, Debug, ValueEnum, PartialEq)]
pub enum ProxyProtocolV2 {
    /// Connection will error if PROXY protocol v2 header is missing
    Required,
    /// Connection will error if PROXY protocol v2 header is provided
    Rejected,
}

#[derive(Debug)]
pub struct MetricCollectionConfig {
    pub endpoint: reqwest::Url,
    pub interval: Duration,
    pub backup_metric_collection_config: MetricBackupCollectionConfig,
}

pub struct HttpConfig {
    pub accept_websockets: bool,
    pub pool_options: GlobalConnPoolOptions,
    pub cancel_set: CancelSet,
    pub client_conn_threshold: u64,
    pub max_request_size_bytes: usize,
    pub max_response_size_bytes: usize,
}

pub struct AuthenticationConfig {
    pub scram_thread_pool: Arc,
    pub scram_protocol_timeout: tokio::time::Duration,
    pub ip_allowlist_check_enabled: bool,
    pub is_vpc_acccess_proxy: bool,
    pub jwks_cache: JwkCache,
    pub is_auth_broker: bool,
    pub accept_jwts: bool,
    pub console_redirect_confirmation_timeout: tokio::time::Duration,
}

#[cfg(feature = "rest_broker")]
pub struct RestConfig {
    pub is_rest_broker: bool,
    pub db_schema_cache: Option,
    pub max_schema_size: usize,
    pub hostname_prefix: String,
}

#[derive(Debug)]
pub struct MetricBackupCollectionConfig {
    pub remote_storage_config: Option,
    pub chunk_size: usize,
}

/// Parse `s` as TOML and build a remote storage config from it.
pub fn remote_storage_from_toml(s: &str) -> anyhow::Result {
    RemoteStorageConfig::from_toml(&s.parse()?)
}

/// Helper for cmdline cache options parsing.
#[derive(Debug)]
pub struct CacheOptions {
    /// Max number of entries.
    pub size: Option,
    /// Entry's time-to-live.
    pub absolute_ttl: Option,
    /// Entry's time-to-idle.
    pub idle_ttl: Option,
}

impl CacheOptions {
    /// Default options for [`crate::cache::node_info::NodeInfoCache`].
    pub const CACHE_DEFAULT_OPTIONS: &'static str = "size=4000,idle_ttl=4m";

    /// Parse cache options passed via cmdline.
    /// Example: [`Self::CACHE_DEFAULT_OPTIONS`].
fn parse(options: &str) -> anyhow::Result {
        let (mut size, mut absolute_ttl, mut idle_ttl) = (None, None, None);

        for option in options.split(',') {
            // Every comma-separated item must have the form `key=value`.
            let halves = option.split_once('=');
            let (name, raw) = halves.with_context(|| format!("bad key-value pair: {option}"))?;

            if name == "size" {
                size = Some(raw.parse()?);
            } else if name == "absolute_ttl" || name == "ttl" {
                absolute_ttl = Some(humantime::parse_duration(raw)?);
            } else if name == "idle_ttl" || name == "tti" {
                idle_ttl = Some(humantime::parse_duration(raw)?);
            } else {
                let unknown = name;
                bail!("unknown key: {unknown}");
            }
        }

        // All three settings are optional, so nothing is required here.
        let parsed = Self {
            size,
            absolute_ttl,
            idle_ttl,
        };
        Ok(parsed)
    }

    /// Apply the options that are set to a moka cache builder and return it;
    /// unset options leave the builder untouched.
    pub fn moka(
        &self,
        mut builder: moka::sync::CacheBuilder,
    ) -> moka::sync::CacheBuilder {
        builder = match self.size {
            Some(capacity) => builder.max_capacity(capacity),
            None => builder,
        };
        builder = match self.absolute_ttl {
            Some(live_for) => builder.time_to_live(live_for),
            None => builder,
        };
        builder = match self.idle_ttl {
            Some(idle_for) => builder.time_to_idle(idle_for),
            None => builder,
        };
        builder
    }
}

impl FromStr for CacheOptions {
    type Err = anyhow::Error;

    /// Parse via [`CacheOptions::parse`], attaching the offending input to any error.
    fn from_str(options: &str) -> Result {
        Self::parse(options)
            .with_context(|| format!("failed to parse cache options '{options}'"))
    }
}

/// Helper for cmdline cache options parsing.
#[derive(Debug)]
pub struct ProjectInfoCacheOptions {
    /// Max number of entries.
    pub size: u64,
    /// Entry's time-to-live.
    pub ttl: Duration,
    /// Max number of roles per endpoint.
    pub max_roles: u64,
    /// Gc interval.
    pub gc_interval: Duration,
}

impl ProjectInfoCacheOptions {
    /// Default options for [`crate::cache::project_info::ProjectInfoCache`].
    pub const CACHE_DEFAULT_OPTIONS: &'static str =
        "size=10000,ttl=4m,max_roles=10,gc_interval=60m";

    /// Parse cache options passed via cmdline.
    /// Example: [`Self::CACHE_DEFAULT_OPTIONS`].
fn parse(options: &str) -> anyhow::Result { let mut size = None; let mut ttl = None; let mut max_roles = None; let mut gc_interval = None; for option in options.split(',') { let (key, value) = option .split_once('=') .with_context(|| format!("bad key-value pair: {option}"))?; match key { "size" => size = Some(value.parse()?), "ttl" => ttl = Some(humantime::parse_duration(value)?), "max_roles" => max_roles = Some(value.parse()?), "gc_interval" => gc_interval = Some(humantime::parse_duration(value)?), unknown => bail!("unknown key: {unknown}"), } } // TTL doesn't matter if cache is always empty. if let Some(0) = size { ttl.get_or_insert(Duration::default()); } Ok(Self { size: size.context("missing `size`")?, ttl: ttl.context("missing `ttl`")?, max_roles: max_roles.context("missing `max_roles`")?, gc_interval: gc_interval.context("missing `gc_interval`")?, }) } } impl FromStr for ProjectInfoCacheOptions { type Err = anyhow::Error; fn from_str(options: &str) -> Result { let error = || format!("failed to parse cache options '{options}'"); Self::parse(options).with_context(error) } } /// This is a config for connect to compute and wake compute. #[derive(Clone, Copy, Debug)] pub struct RetryConfig { /// Number of times we should retry. pub max_retries: u32, /// Retry duration is base_delay * backoff_factor ^ n, where n starts at 0 pub base_delay: tokio::time::Duration, /// Exponential base for retry wait duration pub backoff_factor: f64, } impl RetryConfig { // Default options for RetryConfig. /// Total delay for 5 retries with 200ms base delay and 2 backoff factor is about 6s. pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str = "num_retries=5,base_retry_wait_duration=200ms,retry_wait_exponent_base=2"; /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. /// Cplane has timeout of 60s on each request. 8m7s in total. 
pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str = "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6"; /// Parse retry options passed via cmdline. /// Example: [`Self::CONNECT_TO_COMPUTE_DEFAULT_VALUES`]. pub fn parse(options: &str) -> anyhow::Result { let mut num_retries = None; let mut base_retry_wait_duration = None; let mut retry_wait_exponent_base = None; for option in options.split(',') { let (key, value) = option .split_once('=') .with_context(|| format!("bad key-value pair: {option}"))?; match key { "num_retries" => num_retries = Some(value.parse()?), "base_retry_wait_duration" => { base_retry_wait_duration = Some(humantime::parse_duration(value)?); } "retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?), unknown => bail!("unknown key: {unknown}"), } } Ok(Self { max_retries: num_retries.context("missing `num_retries`")?, base_delay: base_retry_wait_duration.context("missing `base_retry_wait_duration`")?, backoff_factor: retry_wait_exponent_base .context("missing `retry_wait_exponent_base`")?, }) } } /// Helper for cmdline cache options parsing. #[derive(serde::Deserialize)] pub struct ConcurrencyLockOptions { /// The number of shards the lock map should have pub shards: usize, /// The number of allowed concurrent requests for each endpoitn #[serde(flatten)] pub limiter: RateLimiterConfig, /// Garbage collection epoch #[serde(deserialize_with = "humantime_serde::deserialize")] pub epoch: Duration, /// Lock timeout #[serde(deserialize_with = "humantime_serde::deserialize")] pub timeout: Duration, } impl ConcurrencyLockOptions { /// Default options for [`crate::control_plane::client::ApiLocks`]. pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0"; /// Default options for [`crate::control_plane::client::ApiLocks`]. 
pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str = "shards=64,permits=100,epoch=10m,timeout=10ms"; // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s"; /// Parse lock options passed via cmdline. /// Example: [`Self::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK`]. fn parse(options: &str) -> anyhow::Result { let options = options.trim(); if options.starts_with('{') && options.ends_with('}') { return Ok(serde_json::from_str(options)?); } let mut shards = None; let mut permits = None; let mut epoch = None; let mut timeout = None; for option in options.split(',') { let (key, value) = option .split_once('=') .with_context(|| format!("bad key-value pair: {option}"))?; match key { "shards" => shards = Some(value.parse()?), "permits" => permits = Some(value.parse()?), "epoch" => epoch = Some(humantime::parse_duration(value)?), "timeout" => timeout = Some(humantime::parse_duration(value)?), unknown => bail!("unknown key: {unknown}"), } } // these dont matter if lock is disabled if let Some(0) = permits { timeout = Some(Duration::default()); epoch = Some(Duration::default()); shards = Some(2); } let permits = permits.context("missing `permits`")?; let out = Self { shards: shards.context("missing `shards`")?, limiter: RateLimiterConfig { algorithm: RateLimitAlgorithm::Fixed, initial_limit: permits, }, epoch: epoch.context("missing `epoch`")?, timeout: timeout.context("missing `timeout`")?, }; ensure!(out.shards > 1, "shard count must be > 1"); ensure!( out.shards.is_power_of_two(), "shard count must be a power of two" ); Ok(out) } } impl FromStr for ConcurrencyLockOptions { type Err = anyhow::Error; fn from_str(options: &str) -> Result { let error = || format!("failed to parse cache lock options '{options}'"); Self::parse(options).with_context(error) } } #[derive(Error, Debug)] pub(crate) enum RefreshConfigError { #[error(transparent)] Read(#[from] std::io::Error), #[error(transparent)] Parse(#[from] serde_json::Error), 
#[error(transparent)]
    Validate(anyhow::Error),
    #[error(transparent)]
    Tls(anyhow::Error),
}

/// Reload the config from `path` every time `rx` is notified. Never returns.
///
/// Failures are logged rather than propagated. A missing file is logged at
/// debug level only on the very first attempt; TLS failures get their own
/// message.
pub(crate) async fn refresh_config_loop(config: &ProxyConfig, path: Utf8PathBuf, rx: Arc) {
    let mut init = true;
    loop {
        rx.notified().await;

        match refresh_config_inner(config, &path).await {
            std::result::Result::Ok(()) => {}
            // don't log for file not found errors if this is the first time we are checking
            // for computes that don't use local_proxy, this is not an error.
            Err(RefreshConfigError::Read(e))
                if init && e.kind() == std::io::ErrorKind::NotFound =>
            {
                debug!(error=?e, ?path, "could not read config file");
            }
            Err(RefreshConfigError::Tls(e)) => {
                error!(error=?e, ?path, "could not read TLS certificates");
            }
            Err(e) => {
                error!(error=?e, ?path, "could not read config file");
            }
        }

        init = false;
    }
}

/// Read `path`, parse it as a `LocalProxySpec` and apply it.
///
/// All JWKS settings are validated first; a single invalid entry aborts the
/// refresh before anything is stored. The validated set is then published to
/// `JWKS_ROLE_MAP`. If the spec carries TLS paths, the certificates are loaded
/// on a blocking task and stored into `config.tls_config`.
///
/// Note that the JWKS set is stored before TLS is loaded, so a TLS failure
/// leaves the new JWKS settings in place.
pub(crate) async fn refresh_config_inner(
    config: &ProxyConfig,
    path: &Utf8Path,
) -> Result<(), RefreshConfigError> {
    let bytes = tokio::fs::read(&path).await?;
    let data: LocalProxySpec = serde_json::from_slice(&bytes)?;

    let mut jwks_set = vec![];

    // Validate and normalise a single JWKS entry from the spec.
    fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result {
        let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?;

        ensure!(
            jwks_url.has_authority()
                && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"),
            "Invalid JWKS url. Must be HTTP",
        );

        ensure!(
            jwks_url.host().is_some_and(|h| h != url::Host::Domain("")),
            "Invalid JWKS url. No domain listed",
        );

        // clear username, password and ports
        jwks_url
            .set_username("")
            .expect("url can be a base and has a valid host and is not a file. should not error");
        jwks_url
            .set_password(None)
            .expect("url can be a base and has a valid host and is not a file. should not error");
        // local testing is hard if we need to have a specific restricted port
        if cfg!(not(feature = "testing")) {
            jwks_url.set_port(None).expect(
                "url can be a base and has a valid host and is not a file. should not error",
            );
        }

        // clear query params
        jwks_url.set_fragment(None);
        // NOTE(review): `query_pairs_mut().clear()` presumably leaves an empty
        // query (`?`) rather than removing it - confirm whether
        // `set_query(None)` is what is wanted here.
        jwks_url.query_pairs_mut().clear().finish();

        if jwks_url.scheme() != "https" {
            // local testing is hard if we need to set up https support.
            if cfg!(not(feature = "testing")) {
                jwks_url
                    .set_scheme("https")
                    .expect("should not error to set the scheme to https if it was http");
            } else {
                warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS");
            }
        }

        Ok(JwksSettings {
            id: jwks.id,
            jwks_url,
            _provider_name: jwks.provider_name,
            jwt_audience: jwks.jwt_audience,
            role_names: jwks
                .role_names
                .into_iter()
                .map(RoleName::from)
                .map(|s| RoleNameInt::from(&s))
                .collect(),
        })
    }

    for jwks in data.jwks.into_iter().flatten() {
        jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?);
    }

    info!("successfully loaded new config");
    JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set })));

    if let Some(tls_config) = data.tls {
        // Certificate loading runs on the blocking pool.
        let tls_config = tokio::task::spawn_blocking(move || {
            crate::tls::server_config::configure_tls(
                tls_config.key_path.as_ref(),
                tls_config.cert_path.as_ref(),
                None,
                false,
            )
        })
        .await
        .propagate_task_panic()
        .map_err(RefreshConfigError::Tls)?;
        config.tls_config.store(Some(Arc::new(tls_config)));
    }

    // `Ok` is shadowed by `anyhow::Ok` in this file, hence the full path.
    std::result::Result::Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::rate_limiter::Aimd;

    #[test]
    fn test_parse_cache_options() -> anyhow::Result<()> {
        let CacheOptions {
            size,
            absolute_ttl,
            idle_ttl: _,
        } = "size=4096,ttl=5min".parse()?;
        assert_eq!(size, Some(4096));
        assert_eq!(absolute_ttl, Some(Duration::from_secs(5 * 60)));

        let CacheOptions {
            size,
            absolute_ttl,
            idle_ttl: _,
        } = "ttl=4m,size=2".parse()?;
        assert_eq!(size, Some(2));
        assert_eq!(absolute_ttl, Some(Duration::from_secs(4 * 60)));

        let CacheOptions {
            size,
            absolute_ttl,
            idle_ttl: _,
        } = "size=0,ttl=1s".parse()?;
        assert_eq!(size, Some(0));
        assert_eq!(absolute_ttl, Some(Duration::from_secs(1)));

        let CacheOptions {
            size,
            absolute_ttl,
            idle_ttl: _,
        } = "size=0".parse()?;
        assert_eq!(size,
Some(0));
        assert_eq!(absolute_ttl, None);

        Ok(())
    }

    #[test]
    fn test_parse_lock_options() -> anyhow::Result<()> {
        let ConcurrencyLockOptions {
            epoch,
            limiter,
            shards,
            timeout,
        } = "shards=32,permits=4,epoch=10m,timeout=1s".parse()?;
        assert_eq!(epoch, Duration::from_secs(10 * 60));
        assert_eq!(timeout, Duration::from_secs(1));
        assert_eq!(shards, 32);
        assert_eq!(limiter.initial_limit, 4);
        assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed);

        // Key order does not matter.
        let ConcurrencyLockOptions {
            epoch,
            limiter,
            shards,
            timeout,
        } = "epoch=60s,shards=16,timeout=100ms,permits=8".parse()?;
        assert_eq!(epoch, Duration::from_secs(60));
        assert_eq!(timeout, Duration::from_millis(100));
        assert_eq!(shards, 16);
        assert_eq!(limiter.initial_limit, 8);
        assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed);

        // `permits=0` alone is accepted; the other settings get fixed values.
        let ConcurrencyLockOptions {
            epoch,
            limiter,
            shards,
            timeout,
        } = "permits=0".parse()?;
        assert_eq!(epoch, Duration::ZERO);
        assert_eq!(timeout, Duration::ZERO);
        assert_eq!(shards, 2);
        assert_eq!(limiter.initial_limit, 0);
        assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed);

        Ok(())
    }

    #[test]
    fn test_parse_json_lock_options() -> anyhow::Result<()> {
        let ConcurrencyLockOptions {
            epoch,
            limiter,
            shards,
            timeout,
        } = r#"{"shards":32,"initial_limit":44,"aimd":{"min":5,"max":500,"inc":10,"dec":0.9,"utilisation":0.8},"epoch":"10m","timeout":"1s"}"#
            .parse()?;
        assert_eq!(epoch, Duration::from_secs(10 * 60));
        assert_eq!(timeout, Duration::from_secs(1));
        assert_eq!(shards, 32);
        assert_eq!(limiter.initial_limit, 44);
        assert_eq!(
            limiter.algorithm,
            RateLimitAlgorithm::Aimd {
                conf: Aimd {
                    min: 5,
                    max: 500,
                    dec: 0.9,
                    inc: 10,
                    utilisation: 0.8
                }
            },
        );

        Ok(())
    }
}

================================================
FILE: proxy/src/console_redirect_proxy.rs
================================================
use std::sync::Arc;

use futures::{FutureExt, TryFutureExt};
use postgres_client::RawCancelToken;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, debug, error, info};

use crate::auth::backend::ConsoleRedirectBackend;
use crate::cancellation::{CancelClosure, CancellationHandler};
use crate::config::{ProxyConfig, ProxyProtocolV2};
use crate::context::RequestContext;
use crate::error::ReportableError;
use crate::metrics::{Metrics, NumClientConnectionsGuard};
use crate::pglb::ClientRequestError;
use crate::pglb::handshake::{HandshakeData, handshake};
use crate::pglb::passthrough::ProxyPassthrough;
use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol};
use crate::proxy::{
    ErrorSource, connect_compute, forward_compute_params_to_client, send_client_greeting,
};
use crate::util::run_until_cancelled;

/// Accept loop of the console-redirect proxy.
///
/// Accepts TCP connections on `listener` until `cancellation_token` fires,
/// spawning one task per connection. After the loop ends, the listener is
/// dropped and all connection and cancellation tasks are awaited before
/// returning. An accept error ends the loop and is returned.
pub async fn task_main(
    config: &'static ProxyConfig,
    backend: &'static ConsoleRedirectBackend,
    listener: tokio::net::TcpListener,
    cancellation_token: CancellationToken,
    cancellation_handler: Arc,
) -> anyhow::Result<()> {
    scopeguard::defer! {
        info!("proxy has shut down");
    }

    // When set for the server socket, the keepalive setting
    // will be inherited by all accepted client sockets.
    socket2::SockRef::from(&listener).set_keepalive(true)?;

    // Two trackers: one for client sessions, one for cancel-request tasks.
    let connections = tokio_util::task::task_tracker::TaskTracker::new();
    let cancellations = tokio_util::task::task_tracker::TaskTracker::new();

    while let Some(accept_result) =
        run_until_cancelled(listener.accept(), &cancellation_token).await
    {
        let (socket, peer_addr) = accept_result?;

        let conn_gauge = Metrics::get()
            .proxy
            .client_connections
            .guard(crate::metrics::Protocol::Tcp);

        let session_id = uuid::Uuid::new_v4();
        let cancellation_handler = Arc::clone(&cancellation_handler);
        let cancellations = cancellations.clone();

        debug!(protocol = "tcp", %session_id, "accepted new TCP connection");

        connections.spawn(async move {
            let (socket, conn_info) = match config.proxy_protocol_v2 {
                ProxyProtocolV2::Required => {
                    match read_proxy_protocol(socket).await {
                        Err(e) => {
                            error!("per-client task finished with an error: {e:#}");
                            return;
                        }
                        // our load balancers will not send any more data. let's just exit immediately
                        Ok((_socket, ConnectHeader::Local)) => {
                            debug!("healthcheck received");
                            return;
                        }
                        Ok((socket, ConnectHeader::Proxy(info))) => (socket, info),
                    }
                }
                // ignore the header - it cannot be confused for a postgres or http connection so will
                // error later.
                ProxyProtocolV2::Rejected => (
                    socket,
                    ConnectionInfo {
                        addr: peer_addr,
                        extra: None,
                    },
                ),
            };

            match socket.set_nodelay(true) {
                Ok(()) => {}
                Err(e) => {
                    error!(
                        "per-client task finished with an error: failed to set socket option: {e:#}"
                    );
                    return;
                }
            }

            let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Tcp);

            let res = handle_client(
                config,
                backend,
                &ctx,
                cancellation_handler,
                socket,
                conn_gauge,
                cancellations,
            )
            .instrument(ctx.span())
            .boxed()
            .await;

            match res {
                Err(e) => {
                    ctx.set_error_kind(e.get_error_kind());
                    error!(parent: &ctx.span(), "per-client task finished with an error: {e:#}");
                }
                // `None` is what `handle_client` returns for a cancel request:
                // there is nothing left to proxy.
                Ok(None) => {
                    ctx.set_success();
                }
                Ok(Some(p)) => {
                    ctx.set_success();
                    let _disconnect = ctx.log_connect();
                    match p.proxy_pass().await {
                        Ok(()) => {}
                        Err(ErrorSource::Client(e)) => {
                            error!(
                                ?session_id,
                                "per-client task finished with an IO error from the client: {e:#}"
                            );
                        }
                        Err(ErrorSource::Compute(e)) => {
                            error!(
                                ?session_id,
                                "per-client task finished with an IO error from the compute: {e:#}"
                            );
                        }
                    }
                }
            }
        });
    }

    connections.close();
    cancellations.close();
    drop(listener);

    // Drain connections
    connections.wait().await;
    cancellations.wait().await;

    Ok(())
}

/// Drive a single client connection up to the point where traffic can be
/// passed through.
///
/// Performs the handshake (bounded by `config.handshake_timeout`),
/// authenticates through `backend`, connects to and authenticates against the
/// compute node, and returns the passthrough state. A cancel request is handed
/// to a task spawned on `cancellations` and yields `Ok(None)`.
#[allow(clippy::too_many_arguments)]
pub(crate) async fn handle_client(
    config: &'static ProxyConfig,
    backend: &'static ConsoleRedirectBackend,
    ctx: &RequestContext,
    cancellation_handler: Arc,
    stream: S,
    conn_gauge: NumClientConnectionsGuard<'static>,
    cancellations: tokio_util::task::task_tracker::TaskTracker,
) -> Result>, ClientRequestError> {
    debug!(
        protocol = %ctx.protocol(),
        "handling interactive connection from client"
    );

    let metrics = &Metrics::get().proxy;
    let proto = ctx.protocol();
    let request_gauge = metrics.connection_requests.guard(proto);

    let tls = config.tls_config.load();
    let tls = tls.as_deref();

    let record_handshake_error = !ctx.has_private_peer_addr();
    // Time spent waiting on the client is excluded from the latency timer.
    let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
    let do_handshake = handshake(ctx, stream, tls,
record_handshake_error);

    // The first `?` is for the timeout, the second for the handshake itself.
    let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake)
        .await??
    {
        HandshakeData::Startup(stream, params) => (stream, params),
        HandshakeData::Cancel(cancel_key_data) => {
            // spawn a task to cancel the session, but don't wait for it
            cancellations.spawn({
                let cancellation_handler_clone = Arc::clone(&cancellation_handler);
                let ctx = ctx.clone();
                // A root span (no parent) that is linked to the current one.
                let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?ctx.session_id());
                cancel_span.follows_from(tracing::Span::current());
                async move {
                    // Failures are logged at debug level and otherwise ignored.
                    cancellation_handler_clone
                        .cancel_session(
                            cancel_key_data,
                            ctx,
                            config.authentication_config.ip_allowlist_check_enabled,
                            config.authentication_config.is_vpc_acccess_proxy,
                            backend.get_api(),
                        )
                        .await
                        .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok();
                }.instrument(cancel_span)
            });

            return Ok(None);
        }
    };
    drop(pause);

    ctx.set_db_options(params.clone());

    // On failure the error is first reported to the client, then returned.
    let (node_info, mut auth_info, user_info) = match backend
        .authenticate(ctx, &config.authentication_config, &mut stream)
        .await
    {
        Ok(auth_result) => auth_result,
        Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?,
    };
    // `true`: forward arbitrary startup params (see `set_startup_params`).
    auth_info.set_startup_params(&params, true);

    let mut node = connect_compute::connect_to_compute(
        ctx,
        config,
        &node_info,
        connect_compute::TlsNegotiation::Postgres,
    )
    .or_else(|e| async { Err(stream.throw_error(e, Some(ctx)).await) })
    .await?;

    auth_info
        .authenticate(ctx, &mut node)
        .or_else(|e| async { Err(stream.throw_error(e, Some(ctx)).await) })
        .await?;
    send_client_greeting(ctx, &config.greetings, &mut stream);

    let session = cancellation_handler.get_key();

    let (process_id, secret_key) =
        forward_compute_params_to_client(ctx, *session.key(), &mut stream, &mut node.stream)
            .await?;
    let stream = stream.flush_and_into_inner().await?;
    let hostname = node.hostname.to_string();

    let session_id = ctx.session_id();
    // The sending half is stored in the returned passthrough
    // (`_cancel_on_shutdown`); the receiving half goes to the background task.
    let (cancel_on_shutdown, cancel) = tokio::sync::oneshot::channel();
    tokio::spawn(async move {
        session
            .maintain_cancel_key(
                session_id,
                cancel,
                &CancelClosure {
                    socket_addr: node.socket_addr,
                    cancel_token: RawCancelToken {
                        ssl_mode: node.ssl_mode,
                        process_id,
                        secret_key,
                    },
                    hostname,
                    user_info,
                },
                &config.connect_to_compute,
            )
            .await;
    });

    Ok(Some(ProxyPassthrough {
        client: stream,
        compute: node.stream.into_framed().into_inner(),

        aux: node.aux,
        private_link_id: None,

        _cancel_on_shutdown: cancel_on_shutdown,

        // The guards are moved into the passthrough along with the streams.
        _req: request_gauge,
        _conn: conn_gauge,
        _db_conn: node.guage,
    }))
}

================================================
FILE: proxy/src/context/mod.rs
================================================
//! Connection request monitoring contexts

use std::net::IpAddr;

use chrono::Utc;
use once_cell::sync::OnceCell;
use smol_str::SmolStr;
use tokio::sync::mpsc;
use tracing::field::display;
use tracing::{Span, error, info_span};
use try_lock::TryLock;
use uuid::Uuid;

use self::parquet::RequestData;
use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
use crate::error::ErrorKind;
use crate::intern::{BranchIdInt, ProjectIdInt};
use crate::metrics::{LatencyAccumulated, LatencyTimer, Metrics, Protocol, Waiting};
use crate::pqproto::StartupMessageParams;
use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra};
use crate::types::{DbName, EndpointId, RoleName};

pub mod parquet;

pub(crate) static LOG_CHAN: OnceCell> = OnceCell::new();
pub(crate) static LOG_CHAN_DISCONNECT: OnceCell> = OnceCell::new();

/// Context data for a single request to connect to a database.
///
/// This data should **not** be used for connection logic, only for observability and limiting purposes.
/// All connection logic should instead use strongly typed state machines, not a bunch of Options.
pub struct RequestContext(
    /// To allow easier use of the ctx object, we have interior mutability.
    /// I would typically use a RefCell but that would break the `Send` requirements
    /// so we need something with thread-safety.
`TryLock` is a cheap alternative /// that offers similar semantics to a `RefCell` but with synchronisation. TryLock, ); struct RequestContextInner { pub(crate) conn_info: ConnectionInfo, pub(crate) session_id: Uuid, pub(crate) protocol: Protocol, first_packet: chrono::DateTime, pub(crate) span: Span, // filled in as they are discovered project: Option, branch: Option, endpoint_id: Option, dbname: Option, user: Option, application: Option, user_agent: Option, error_kind: Option, pub(crate) auth_method: Option, jwt_issuer: Option, success: bool, pub(crate) cold_start_info: ColdStartInfo, pg_options: Option, testodrome_query_id: Option, // extra // This sender is here to keep the request monitoring channel open while requests are taking place. sender: Option>, // This sender is only used to log the length of session in case of success. disconnect_sender: Option>, pub(crate) latency_timer: LatencyTimer, disconnect_timestamp: Option>, } #[derive(Clone, Debug)] pub(crate) enum AuthMethod { // aka link ConsoleRedirect, ScramSha256, ScramSha256Plus, Cleartext, Jwt, } impl Clone for RequestContext { fn clone(&self) -> Self { let inner = self.0.try_lock().expect("should not deadlock"); let new = RequestContextInner { conn_info: inner.conn_info.clone(), session_id: inner.session_id, protocol: inner.protocol, first_packet: inner.first_packet, span: info_span!("background_task"), project: inner.project, branch: inner.branch, endpoint_id: inner.endpoint_id.clone(), dbname: inner.dbname.clone(), user: inner.user.clone(), application: inner.application.clone(), user_agent: inner.user_agent.clone(), error_kind: inner.error_kind, auth_method: inner.auth_method.clone(), jwt_issuer: inner.jwt_issuer.clone(), success: inner.success, cold_start_info: inner.cold_start_info, pg_options: inner.pg_options.clone(), testodrome_query_id: inner.testodrome_query_id.clone(), sender: None, disconnect_sender: None, latency_timer: LatencyTimer::noop(inner.protocol), disconnect_timestamp: 
inner.disconnect_timestamp, }; Self(TryLock::new(new)) } } impl RequestContext { pub fn new(session_id: Uuid, conn_info: ConnectionInfo, protocol: Protocol) -> Self { // TODO: be careful with long lived spans let span = info_span!( "connect_request", %protocol, ?session_id, %conn_info, ep = tracing::field::Empty, role = tracing::field::Empty, ); let inner = RequestContextInner { conn_info, session_id, protocol, first_packet: Utc::now(), span, project: None, branch: None, endpoint_id: None, dbname: None, user: None, application: None, user_agent: None, error_kind: None, auth_method: None, jwt_issuer: None, success: false, cold_start_info: ColdStartInfo::Unknown, pg_options: None, testodrome_query_id: None, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), disconnect_timestamp: None, }; Self(TryLock::new(inner)) } #[cfg(test)] pub(crate) fn test() -> Self { use std::net::SocketAddr; let ip = IpAddr::from([127, 0, 0, 1]); let addr = SocketAddr::new(ip, 5432); let conn_info = ConnectionInfo { addr, extra: None }; RequestContext::new(Uuid::now_v7(), conn_info, Protocol::Tcp) } pub(crate) fn console_application_name(&self) -> String { let this = self.0.try_lock().expect("should not deadlock"); format!( "{}/{}", this.application.as_deref().unwrap_or_default(), this.protocol ) } pub(crate) fn set_cold_start_info(&self, info: ColdStartInfo) { self.0 .try_lock() .expect("should not deadlock") .set_cold_start_info(info); } pub(crate) fn set_db_options(&self, options: StartupMessageParams) { let mut this = self.0.try_lock().expect("should not deadlock"); this.set_application(options.get("application_name").map(SmolStr::from)); if let Some(user) = options.get("user") { this.set_user(user.into()); } if let Some(dbname) = options.get("database") { this.set_dbname(dbname.into()); } // Try to get testodrome_query_id directly from parameters if let 
Some(options_str) = options.get("options") { // If not found directly, try to extract it from the options string for option in options_str.split_whitespace() { if let Some(value) = option.strip_prefix("neon_query_id:") { this.set_testodrome_id(value.into()); break; } } } this.pg_options = Some(options); } pub(crate) fn set_project(&self, x: MetricsAuxInfo) { let mut this = self.0.try_lock().expect("should not deadlock"); if this.endpoint_id.is_none() { this.set_endpoint_id(x.endpoint_id.as_str().into()); } this.branch = Some(x.branch_id); this.project = Some(x.project_id); this.set_cold_start_info(x.cold_start_info); } pub(crate) fn set_project_id(&self, project_id: ProjectIdInt) { let mut this = self.0.try_lock().expect("should not deadlock"); this.project = Some(project_id); } pub(crate) fn set_endpoint_id(&self, endpoint_id: EndpointId) { self.0 .try_lock() .expect("should not deadlock") .set_endpoint_id(endpoint_id); } pub(crate) fn set_dbname(&self, dbname: DbName) { self.0 .try_lock() .expect("should not deadlock") .set_dbname(dbname); } pub(crate) fn set_user(&self, user: RoleName) { self.0 .try_lock() .expect("should not deadlock") .set_user(user); } pub(crate) fn set_user_agent(&self, user_agent: Option) { self.0 .try_lock() .expect("should not deadlock") .set_user_agent(user_agent); } pub(crate) fn set_testodrome_id(&self, query_id: SmolStr) { self.0 .try_lock() .expect("should not deadlock") .set_testodrome_id(query_id); } pub(crate) fn set_auth_method(&self, auth_method: AuthMethod) { let mut this = self.0.try_lock().expect("should not deadlock"); this.auth_method = Some(auth_method); } pub(crate) fn set_jwt_issuer(&self, jwt_issuer: String) { let mut this = self.0.try_lock().expect("should not deadlock"); this.jwt_issuer = Some(jwt_issuer); } pub fn has_private_peer_addr(&self) -> bool { self.0 .try_lock() .expect("should not deadlock") .has_private_peer_addr() } pub(crate) fn set_error_kind(&self, kind: ErrorKind) { let mut this = 
self.0.try_lock().expect("should not deadlock"); // Do not record errors from the private address to metrics. if !this.has_private_peer_addr() { Metrics::get().proxy.errors_total.inc(kind); } if let Some(ep) = &this.endpoint_id { let metric = &Metrics::get().proxy.endpoints_affected_by_errors; let label = metric.with_labels(kind); metric.get_metric(label).measure(ep); } this.error_kind = Some(kind); } pub fn set_success(&self) { let mut this = self.0.try_lock().expect("should not deadlock"); this.success = true; } pub fn log_connect(self) -> DisconnectLogger { let mut this = self.0.into_inner(); this.log_connect(); // close current span. this.span = Span::none(); DisconnectLogger(this) } pub(crate) fn protocol(&self) -> Protocol { self.0.try_lock().expect("should not deadlock").protocol } pub(crate) fn span(&self) -> Span { self.0.try_lock().expect("should not deadlock").span.clone() } pub(crate) fn session_id(&self) -> Uuid { self.0.try_lock().expect("should not deadlock").session_id } pub(crate) fn peer_addr(&self) -> IpAddr { self.0 .try_lock() .expect("should not deadlock") .conn_info .addr .ip() } pub(crate) fn extra(&self) -> Option { self.0 .try_lock() .expect("should not deadlock") .conn_info .extra .clone() } pub(crate) fn cold_start_info(&self) -> ColdStartInfo { self.0 .try_lock() .expect("should not deadlock") .cold_start_info } pub(crate) fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> { LatencyTimerPause { ctx: self, start: tokio::time::Instant::now(), waiting_for, } } pub(crate) fn latency_timer_pause_at( &self, at: tokio::time::Instant, waiting_for: Waiting, ) -> LatencyTimerPause<'_> { LatencyTimerPause { ctx: self, start: at, waiting_for, } } pub(crate) fn get_proxy_latency(&self) -> LatencyAccumulated { self.0 .try_lock() .expect("should not deadlock") .latency_timer .accumulated() } pub(crate) fn get_testodrome_id(&self) -> Option { self.0 .try_lock() .expect("should not deadlock") .testodrome_query_id .clone() } 
pub(crate) fn success(&self) { self.0 .try_lock() .expect("should not deadlock") .latency_timer .success(); } } pub(crate) struct LatencyTimerPause<'a> { ctx: &'a RequestContext, start: tokio::time::Instant, waiting_for: Waiting, } impl Drop for LatencyTimerPause<'_> { fn drop(&mut self) { self.ctx .0 .try_lock() .expect("should not deadlock") .latency_timer .unpause(self.start, self.waiting_for); } } impl RequestContextInner { fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); } fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { if self.endpoint_id.is_none() { self.span.record("ep", display(&endpoint_id)); let metric = &Metrics::get().proxy.connecting_endpoints; let label = metric.with_labels(self.protocol); metric.get_metric(label).measure(&endpoint_id); self.endpoint_id = Some(endpoint_id); } } fn set_application(&mut self, app: Option) { if let Some(app) = app { self.application = Some(app); } } fn set_user_agent(&mut self, user_agent: Option) { self.user_agent = user_agent; } fn set_dbname(&mut self, dbname: DbName) { self.dbname = Some(dbname); } fn set_user(&mut self, user: RoleName) { self.span.record("role", display(&user)); self.user = Some(user); } fn set_testodrome_id(&mut self, query_id: SmolStr) { self.testodrome_query_id = Some(query_id); } fn has_private_peer_addr(&self) -> bool { match self.conn_info.addr.ip() { IpAddr::V4(ip) => ip.is_private(), IpAddr::V6(_) => false, } } fn log_connect(&mut self) { if let Some(tx) = self.sender.take() { // If type changes, this error handling needs to be updated. let tx: mpsc::UnboundedSender = tx; if let Err(e) = tx.send(RequestData::from(&*self)) { error!("log_connect channel send failed: {e}"); } } } fn log_disconnect(&mut self) { // If we are here, it's guaranteed that the user successfully connected to the endpoint. // Here we log the length of the session. 
self.disconnect_timestamp = Some(Utc::now()); if let Some(tx) = self.disconnect_sender.take() { // If type changes, this error handling needs to be updated. let tx: mpsc::UnboundedSender = tx; if let Err(e) = tx.send(RequestData::from(&*self)) { error!("log_disconnect channel send failed: {e}"); } } } } impl Drop for RequestContextInner { fn drop(&mut self) { if self.sender.is_some() { self.log_connect(); } } } pub struct DisconnectLogger(RequestContextInner); impl Drop for DisconnectLogger { fn drop(&mut self) { self.0.log_disconnect(); } } ================================================ FILE: proxy/src/context/parquet.rs ================================================ use std::sync::Arc; use std::time::SystemTime; use anyhow::Context; use bytes::buf::Writer; use bytes::{BufMut, BytesMut}; use chrono::{Datelike, Timelike}; use futures::{Stream, StreamExt}; use parquet::basic::Compression; use parquet::file::metadata::RowGroupMetaDataPtr; use parquet::file::properties::{DEFAULT_PAGE_SIZE, WriterProperties, WriterPropertiesPtr}; use parquet::file::writer::SerializedFileWriter; use parquet::record::RecordWriter; use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; use serde::ser::SerializeMap; use tokio::sync::mpsc; use tokio::time; use tokio_util::sync::CancellationToken; use tracing::{Span, debug, info}; use utils::backoff; use super::{LOG_CHAN, RequestContextInner}; use crate::config::remote_storage_from_toml; use crate::context::LOG_CHAN_DISCONNECT; use crate::ext::TaskExt; use crate::pqproto::StartupMessageParams; #[derive(clap::Args, Clone, Debug)] pub struct ParquetUploadArgs { /// Storage location to upload the parquet files to. 
/// Encoded as toml (same format as pageservers), eg /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` #[clap(long, value_parser = remote_storage_from_toml)] parquet_upload_remote_storage: Option, #[clap(long, value_parser = remote_storage_from_toml)] parquet_upload_disconnect_events_remote_storage: Option, /// How many rows to include in a row group #[clap(long, default_value_t = 8192)] parquet_upload_row_group_size: usize, /// How large each column page should be in bytes #[clap(long, default_value_t = DEFAULT_PAGE_SIZE)] parquet_upload_page_size: usize, /// How large the total parquet file should be in bytes #[clap(long, default_value_t = 100_000_000)] parquet_upload_size: i64, /// How long to wait before forcing a file upload #[clap(long, default_value = "20m", value_parser = humantime::parse_duration)] parquet_upload_maximum_duration: tokio::time::Duration, /// What level of compression to use #[clap(long, default_value_t = Compression::UNCOMPRESSED)] parquet_upload_compression: Compression, } // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a upload fails, we log it at info-level, and retry. // But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN // level instead, as repeated failures can mean a more serious problem. If it // fails more than FAILED_UPLOAD_RETRIES times, we give up pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; // the parquet crate leaves a lot to be desired... // what follows is an attempt to write parquet files with minimal allocs. // complication: parquet is a columnar format, while we want to write in as rows. 
// design: // * we batch up to 1024 rows, then flush them into a 'row group' // * after each rowgroup write, we check the length of the file and upload to s3 if large enough #[derive(parquet_derive::ParquetRecordWriter)] pub(crate) struct RequestData { region: String, protocol: &'static str, /// Must be UTC. The derive macro doesn't like the timezones timestamp: chrono::NaiveDateTime, session_id: uuid::Uuid, peer_addr: String, username: Option, application_name: Option, user_agent: Option, endpoint_id: Option, database: Option, project: Option, branch: Option, pg_options: Option, auth_method: Option<&'static str>, jwt_issuer: Option, error: Option<&'static str>, /// Success is counted if we form a HTTP response with sql rows inside /// Or if we make it to proxy_pass success: bool, /// Indicates if the cplane started the new compute node for this request. cold_start_info: &'static str, /// Tracks time from session start (HTTP request/libpq TCP handshake) /// Through to success/failure duration_us: u64, /// If the session was successful after the disconnect, will be created one more event with filled `disconnect_timestamp`. 
disconnect_timestamp: Option, } struct Options<'a> { options: &'a StartupMessageParams, } impl serde::Serialize for Options<'_> { fn serialize(&self, s: S) -> Result where S: serde::Serializer, { let mut state = s.serialize_map(None)?; for (k, v) in self.options.iter() { state.serialize_entry(k, v)?; } state.end() } } impl From<&RequestContextInner> for RequestData { fn from(value: &RequestContextInner) -> Self { Self { session_id: value.session_id, peer_addr: value.conn_info.addr.ip().to_string(), timestamp: value.first_packet.naive_utc(), username: value.user.as_deref().map(String::from), application_name: value.application.as_deref().map(String::from), user_agent: value.user_agent.as_deref().map(String::from), endpoint_id: value.endpoint_id.as_deref().map(String::from), database: value.dbname.as_deref().map(String::from), project: value.project.as_deref().map(String::from), branch: value.branch.as_deref().map(String::from), pg_options: value .pg_options .as_ref() .and_then(|options| serde_json::to_string(&Options { options }).ok()), auth_method: value.auth_method.as_ref().map(|x| match x { super::AuthMethod::ConsoleRedirect => "console_redirect", super::AuthMethod::ScramSha256 => "scram_sha_256", super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", super::AuthMethod::Cleartext => "cleartext", super::AuthMethod::Jwt => "jwt", }), jwt_issuer: value.jwt_issuer.clone(), protocol: value.protocol.as_str(), region: String::new(), error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, cold_start_info: value.cold_start_info.as_str(), duration_us: SystemTime::from(value.first_packet) .elapsed() .unwrap_or_default() .as_micros() as u64, // 584 millenia... 
good enough disconnect_timestamp: value.disconnect_timestamp.map(|x| x.naive_utc()), } } } /// Parquet request context worker /// /// It listened on a channel for all completed requests, extracts the data and writes it into a parquet file, /// then uploads a completed batch to S3 pub async fn worker( cancellation_token: CancellationToken, config: ParquetUploadArgs, region: String, ) -> anyhow::Result<()> { let Some(remote_storage_config) = config.parquet_upload_remote_storage else { tracing::warn!("parquet request upload: no s3 bucket configured"); return Ok(()); }; let (tx, mut rx) = mpsc::unbounded_channel(); LOG_CHAN .set(tx.downgrade()) .expect("only one worker should set the channel"); // setup row stream that will close on cancellation let cancellation_token2 = cancellation_token.clone(); tokio::spawn(async move { cancellation_token2.cancelled().await; // dropping this sender will cause the channel to close only once // all the remaining inflight requests have been completed. drop(tx); }); let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx)); let rx = rx.map(RequestData::from); let storage = GenericRemoteStorage::from_config(&remote_storage_config) .await .context("remote storage init")?; let properties = WriterProperties::builder() .set_data_page_size_limit(config.parquet_upload_page_size) .set_compression(config.parquet_upload_compression); let parquet_config = ParquetConfig { propeties: Arc::new(properties.build()), rows_per_group: config.parquet_upload_row_group_size, file_size: config.parquet_upload_size, max_duration: config.parquet_upload_maximum_duration, #[cfg(any(test, feature = "testing"))] test_remote_failures: 0, }; // TODO(anna): consider moving this to a separate function. 
if let Some(disconnect_events_storage_config) = config.parquet_upload_disconnect_events_remote_storage { let (tx_disconnect, mut rx_disconnect) = mpsc::unbounded_channel(); LOG_CHAN_DISCONNECT .set(tx_disconnect.downgrade()) .expect("only one worker should set the channel"); // setup row stream that will close on cancellation tokio::spawn(async move { cancellation_token.cancelled().await; // dropping this sender will cause the channel to close only once // all the remaining inflight requests have been completed. drop(tx_disconnect); }); let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx)); let rx_disconnect = rx_disconnect.map(RequestData::from); let storage_disconnect = GenericRemoteStorage::from_config(&disconnect_events_storage_config) .await .context("remote storage for disconnect events init")?; let parquet_config_disconnect = parquet_config.clone(); tokio::try_join!( worker_inner(storage, rx, parquet_config, ®ion), worker_inner( storage_disconnect, rx_disconnect, parquet_config_disconnect, ®ion ) ) .map(|_| ()) } else { worker_inner(storage, rx, parquet_config, ®ion).await } } #[derive(Clone, Debug)] struct ParquetConfig { propeties: WriterPropertiesPtr, rows_per_group: usize, file_size: i64, max_duration: tokio::time::Duration, #[cfg(any(test, feature = "testing"))] test_remote_failures: u64, } async fn worker_inner( storage: GenericRemoteStorage, rx: impl Stream, config: ParquetConfig, region: &str, ) -> anyhow::Result<()> { #[cfg(any(test, feature = "testing"))] let storage = if config.test_remote_failures > 0 { GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures, 100) } else { storage }; let mut rx = std::pin::pin!(rx); let mut rows = Vec::with_capacity(config.rows_per_group); let schema = rows.as_slice().schema()?; let buffer = BytesMut::new(); let w = buffer.writer(); let mut w = SerializedFileWriter::new(w, schema.clone(), config.propeties.clone())?; let mut last_upload = time::Instant::now(); 
let mut len = 0; while let Some(mut row) = rx.next().await { region.clone_into(&mut row.region); rows.push(row); let force = last_upload.elapsed() > config.max_duration; if rows.len() == config.rows_per_group || force { let rg_meta; (rows, w, rg_meta) = flush_rows(rows, w).await?; len += rg_meta.compressed_size(); } if len > config.file_size || force { last_upload = time::Instant::now(); let file = upload_parquet(w, len, &storage).await?; w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?; len = 0; } } if !rows.is_empty() { let rg_meta; (_, w, rg_meta) = flush_rows(rows, w).await?; len += rg_meta.compressed_size(); } if !w.flushed_row_groups().is_empty() { let _rtchk: Writer = upload_parquet(w, len, &storage).await?; } Ok(()) } async fn flush_rows( rows: Vec, mut w: SerializedFileWriter, ) -> anyhow::Result<( Vec, SerializedFileWriter, RowGroupMetaDataPtr, )> where W: std::io::Write + Send + 'static, { let span = Span::current(); let (mut rows, w, rg_meta) = tokio::task::spawn_blocking(move || { let _enter = span.enter(); let mut rg = w.next_row_group()?; rows.as_slice().write_to_row_group(&mut rg)?; let rg_meta = rg.close()?; let size = rg_meta.compressed_size(); let compression = rg_meta.compressed_size() as f64 / rg_meta.total_byte_size() as f64; debug!(size, compression, "flushed row group to parquet file"); Ok::<_, parquet::errors::ParquetError>((rows, w, rg_meta)) }) .await .propagate_task_panic()?; rows.clear(); Ok((rows, w, rg_meta)) } async fn upload_parquet( mut w: SerializedFileWriter>, len: i64, storage: &GenericRemoteStorage, ) -> anyhow::Result> { let len_uncompressed = w .flushed_row_groups() .iter() .map(|rg| rg.total_byte_size()) .sum::(); // I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry. 
// finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253 let (mut buffer, metadata) = tokio::task::spawn_blocking(move || -> parquet::errors::Result<_> { let metadata = w.finish()?; let buffer = std::mem::take(w.inner_mut().get_mut()); Ok((buffer, metadata)) }) .await .propagate_task_panic()?; let data = buffer.split().freeze(); let compression = len as f64 / len_uncompressed as f64; let size = data.len(); let now = chrono::Utc::now(); let id = uuid::Uuid::new_v7(uuid::Timestamp::from_unix( uuid::NoContext, // we won't be running this in 1970. this cast is ok now.timestamp() as u64, now.timestamp_subsec_nanos(), )); info!( %id, rows = metadata.num_rows, size, compression, "uploading request parquet file" ); let year = now.year(); let month = now.month(); let day = now.day(); let hour = now.hour(); // segment files by time for S3 performance let path = RemotePath::from_string(&format!( "{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet" ))?; let cancel = CancellationToken::new(); let maybe_err = backoff::retry( || async { let stream = futures::stream::once(futures::future::ready(Ok(data.clone()))); storage .upload(stream, data.len(), &path, None, &cancel) .await }, TimeoutOrCancel::caused_by_cancel, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_UPLOAD_MAX_RETRIES, "request_data_upload", // we don't want cancellation to interrupt here, so we make a dummy cancel token &cancel, ) .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) .with_context(|| format!("request_data_upload: path={path}")) .err(); if let Some(err) = maybe_err { tracing::error!(%id, %path, error = ?err, "failed to upload request data"); } Ok(buffer.writer()) } #[cfg(test)] mod tests { use std::net::Ipv4Addr; use std::num::NonZeroUsize; use std::sync::Arc; use camino::Utf8Path; use clap::Parser; use futures::{Stream, StreamExt}; use itertools::Itertools; use parquet::basic::{Compression, ZstdLevel}; use 
parquet::file::properties::{DEFAULT_PAGE_SIZE, WriterProperties}; use parquet::file::reader::FileReader; use parquet::file::serialized_reader::SerializedFileReader; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use remote_storage::{ DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, }; use tokio::sync::mpsc; use tokio::time; use walkdir::WalkDir; use super::{ParquetConfig, ParquetUploadArgs, RequestData, worker_inner}; #[derive(Parser)] struct ProxyCliArgs { #[clap(flatten)] parquet_upload: ParquetUploadArgs, } #[test] fn default_parser() { let ProxyCliArgs { parquet_upload } = ProxyCliArgs::parse_from(["proxy"]); assert_eq!(parquet_upload.parquet_upload_remote_storage, None); assert_eq!(parquet_upload.parquet_upload_row_group_size, 8192); assert_eq!(parquet_upload.parquet_upload_page_size, DEFAULT_PAGE_SIZE); assert_eq!(parquet_upload.parquet_upload_size, 100_000_000); assert_eq!( parquet_upload.parquet_upload_maximum_duration, time::Duration::from_secs(20 * 60) ); assert_eq!( parquet_upload.parquet_upload_compression, Compression::UNCOMPRESSED ); } #[test] fn full_parser() { let ProxyCliArgs { parquet_upload } = ProxyCliArgs::parse_from([ "proxy", "--parquet-upload-remote-storage", "{bucket_name='default',prefix_in_bucket='proxy/',bucket_region='us-east-1',endpoint='http://minio:9000'}", "--parquet-upload-row-group-size", "100", "--parquet-upload-page-size", "10000", "--parquet-upload-size", "10000000", "--parquet-upload-maximum-duration", "10m", "--parquet-upload-compression", "zstd(5)", ]); assert_eq!( parquet_upload.parquet_upload_remote_storage, Some(RemoteStorageConfig { storage: RemoteStorageKind::AwsS3(S3Config { bucket_name: "default".into(), bucket_region: "us-east-1".into(), prefix_in_bucket: Some("proxy/".into()), endpoint: Some("http://minio:9000".into()), concurrency_limit: NonZeroUsize::new( DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT ) 
.unwrap(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }) ); assert_eq!(parquet_upload.parquet_upload_row_group_size, 100); assert_eq!(parquet_upload.parquet_upload_page_size, 10000); assert_eq!(parquet_upload.parquet_upload_size, 10_000_000); assert_eq!( parquet_upload.parquet_upload_maximum_duration, time::Duration::from_secs(10 * 60) ); assert_eq!( parquet_upload.parquet_upload_compression, Compression::ZSTD(ZstdLevel::try_new(5).unwrap()) ); } fn generate_request_data(rng: &mut impl Rng) -> RequestData { RequestData { session_id: uuid::Builder::from_random_bytes(rng.random()).into_uuid(), peer_addr: Ipv4Addr::from(rng.random::<[u8; 4]>()).to_string(), timestamp: chrono::DateTime::from_timestamp_millis( rng.random_range(1703862754..1803862754), ) .unwrap() .naive_utc(), application_name: Some("test".to_owned()), user_agent: Some("test-user-agent".to_owned()), username: Some(hex::encode(rng.random::<[u8; 4]>())), endpoint_id: Some(hex::encode(rng.random::<[u8; 16]>())), database: Some(hex::encode(rng.random::<[u8; 16]>())), project: Some(hex::encode(rng.random::<[u8; 16]>())), branch: Some(hex::encode(rng.random::<[u8; 16]>())), pg_options: None, auth_method: None, jwt_issuer: None, protocol: ["tcp", "ws", "http"][rng.random_range(0..3)], region: String::new(), error: None, success: rng.random(), cold_start_info: "no", duration_us: rng.random_range(0..30_000_000), disconnect_timestamp: None, } } fn random_stream(len: usize) -> impl Stream + Unpin { let mut rng = StdRng::from_seed([0x39; 32]); futures::stream::iter( std::iter::repeat_with(move || generate_request_data(&mut rng)).take(len), ) } async fn run_test( tmpdir: &Utf8Path, config: ParquetConfig, rx: impl Stream, ) -> Vec<(u64, usize, i64)> { let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::LocalFs { local_path: 
tmpdir.to_path_buf(), }, timeout: std::time::Duration::from_secs(120), small_timeout: std::time::Duration::from_secs(30), }; let storage = GenericRemoteStorage::from_config(&remote_storage_config) .await .unwrap(); worker_inner(storage, rx, config, "us-east-1") .await .unwrap(); let mut files = WalkDir::new(tmpdir.as_std_path()) .into_iter() .filter_map(|entry| entry.ok()) .filter(|entry| entry.file_type().is_file()) .map(|entry| entry.path().to_path_buf()) .collect_vec(); files.sort(); files .into_iter() .map(|path| std::fs::File::open(tmpdir.as_std_path().join(path)).unwrap()) .map(|file| { ( file.metadata().unwrap(), SerializedFileReader::new(file).unwrap().metadata().clone(), ) }) .map(|(file_meta, parquet_meta)| { ( file_meta.len(), parquet_meta.num_row_groups(), parquet_meta.file_metadata().num_rows(), ) }) .collect() } #[tokio::test] async fn verify_parquet_no_compression() { let tmpdir = camino_tempfile::tempdir().unwrap(); let config = ParquetConfig { propeties: Arc::new(WriterProperties::new()), rows_per_group: 2_000, file_size: 1_000_000, max_duration: time::Duration::from_secs(20 * 60), test_remote_failures: 0, }; let rx = random_stream(50_000); let file_stats = run_test(tmpdir.path(), config, rx).await; assert_eq!( file_stats, [ (1313878, 3, 6000), (1313891, 3, 6000), (1314058, 3, 6000), (1313914, 3, 6000), (1313760, 3, 6000), (1314084, 3, 6000), (1313965, 3, 6000), (1313911, 3, 6000), (438290, 1, 2000) ] ); tmpdir.close().unwrap(); } #[tokio::test] async fn verify_parquet_strong_compression() { let tmpdir = camino_tempfile::tempdir().unwrap(); let config = ParquetConfig { propeties: Arc::new( WriterProperties::builder() .set_compression(parquet::basic::Compression::ZSTD( ZstdLevel::try_new(10).unwrap(), )) .build(), ), rows_per_group: 2_000, file_size: 1_000_000, max_duration: time::Duration::from_secs(20 * 60), test_remote_failures: 0, }; let rx = random_stream(50_000); let file_stats = run_test(tmpdir.path(), config, rx).await; // with strong 
compression, the files are smaller assert_eq!( file_stats, [ (1206039, 5, 10000), (1205798, 5, 10000), (1205776, 5, 10000), (1206051, 5, 10000), (1205746, 5, 10000) ] ); tmpdir.close().unwrap(); } #[tokio::test] async fn verify_parquet_unreliable_upload() { let tmpdir = camino_tempfile::tempdir().unwrap(); let config = ParquetConfig { propeties: Arc::new(WriterProperties::new()), rows_per_group: 2_000, file_size: 1_000_000, max_duration: time::Duration::from_secs(20 * 60), test_remote_failures: 2, }; let rx = random_stream(50_000); let file_stats = run_test(tmpdir.path(), config, rx).await; assert_eq!( file_stats, [ (1313878, 3, 6000), (1313891, 3, 6000), (1314058, 3, 6000), (1313914, 3, 6000), (1313760, 3, 6000), (1314084, 3, 6000), (1313965, 3, 6000), (1313911, 3, 6000), (438290, 1, 2000) ] ); tmpdir.close().unwrap(); } #[tokio::test(start_paused = true)] async fn verify_parquet_regular_upload() { let tmpdir = camino_tempfile::tempdir().unwrap(); let config = ParquetConfig { propeties: Arc::new(WriterProperties::new()), rows_per_group: 2_000, file_size: 1_000_000, max_duration: time::Duration::from_secs(60), test_remote_failures: 2, }; let (tx, mut rx) = mpsc::unbounded_channel(); tokio::spawn(async move { for _ in 0..3 { let mut s = random_stream(3000); while let Some(r) = s.next().await { tx.send(r).unwrap(); } time::sleep(time::Duration::from_secs(70)).await; } }); let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx)); let file_stats = run_test(tmpdir.path(), config, rx).await; // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, [(658552, 2, 3001), (658265, 2, 3000), (658061, 2, 2999)] ); tmpdir.close().unwrap(); } } ================================================ FILE: proxy/src/control_plane/client/cplane_proxy_v1.rs ================================================ //! Production console backend. 
use std::net::IpAddr; use std::str::FromStr; use std::sync::Arc; use ::http::HeaderName; use ::http::header::AUTHORIZATION; use bytes::Bytes; use futures::TryFutureExt; use hyper::StatusCode; use postgres_client::config::SslMode; use tokio::time::Instant; use tracing::{Instrument, debug, info, info_span, warn}; use super::super::messages::{ControlPlaneErrorMessage, GetEndpointAccessControl, WakeCompute}; use crate::auth::backend::ComputeUserInfo; use crate::auth::backend::jwt::AuthRule; use crate::cache::Cached; use crate::cache::node_info::CachedNodeInfo; use crate::context::RequestContext; use crate::control_plane::caches::ApiCaches; use crate::control_plane::errors::{ ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, }; use crate::control_plane::locks::ApiLocks; use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse}; use crate::control_plane::{ AccessBlockerFlags, AuthInfo, AuthSecret, EndpointAccessControl, NodeInfo, RoleAccessControl, }; use crate::metrics::Metrics; use crate::proxy::retry::CouldRetry; use crate::rate_limiter::WakeComputeRateLimiter; use crate::types::{EndpointCacheKey, EndpointId, RoleName}; use crate::{compute, http, scram}; pub(crate) const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); #[derive(Clone)] pub struct NeonControlPlaneClient { endpoint: http::Endpoint, pub caches: &'static ApiCaches, pub(crate) locks: &'static ApiLocks, pub(crate) wake_compute_endpoint_rate_limiter: Arc, // put in a shared ref so we don't copy secrets all over in memory jwt: Arc, } impl NeonControlPlaneClient { /// Construct an API object containing the auth parameters. 
pub fn new( endpoint: http::Endpoint, jwt: Arc, caches: &'static ApiCaches, locks: &'static ApiLocks, wake_compute_endpoint_rate_limiter: Arc, ) -> Self { Self { endpoint, caches, locks, wake_compute_endpoint_rate_limiter, jwt, } } pub(crate) fn url(&self) -> &str { self.endpoint.url().as_str() } async fn get_and_cache_auth_info( &self, ctx: &RequestContext, endpoint: &EndpointId, role: &RoleName, cache_key: &EndpointId, extract: impl FnOnce(&EndpointAccessControl, &RoleAccessControl) -> T, ) -> Result { match self.do_get_auth_req(ctx, endpoint, role).await { Ok(auth_info) => { let control = EndpointAccessControl { allowed_ips: Arc::new(auth_info.allowed_ips), allowed_vpce: Arc::new(auth_info.allowed_vpc_endpoint_ids), flags: auth_info.access_blocker_flags, rate_limits: auth_info.rate_limits, }; let role_control = RoleAccessControl { secret: auth_info.secret, }; let res = extract(&control, &role_control); self.caches.project_info.insert_endpoint_access( auth_info.account_id, auth_info.project_id, cache_key.into(), role.into(), control, role_control, ); if let Some(project_id) = auth_info.project_id { ctx.set_project_id(project_id); } Ok(res) } Err(err) => match err { GetAuthInfoError::ApiError(ControlPlaneError::Message(ref msg)) => { let retry_info = msg.status.as_ref().and_then(|s| s.details.retry_info); // If we can retry this error, do not cache it, // unless we were given a retry delay. 
if msg.could_retry() && retry_info.is_none() { return Err(err); } self.caches.project_info.insert_endpoint_access_err( cache_key.into(), role.into(), msg.clone(), ); Err(err) } err => Err(err), }, } } async fn do_get_auth_req( &self, ctx: &RequestContext, endpoint: &EndpointId, role: &RoleName, ) -> Result { async { let response = { let request = self .endpoint .get_path("get_endpoint_access_control") .header(X_REQUEST_ID, ctx.session_id().to_string()) .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) .query(&[("session_id", ctx.session_id())]) .query(&[ ("application_name", ctx.console_application_name().as_str()), ("endpointish", endpoint.as_str()), ("role", role.as_str()), ]) .build()?; debug!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); let _pause = ctx.latency_timer_pause_at(start, crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; info!(duration = ?start.elapsed(), "received http response"); response }; let body = match parse_body::( response.status(), response.bytes().await?, ) { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. // TODO(anna): retry Err(e) => { return if e.get_reason().is_not_found() { // TODO: refactor this because it's weird // this is a failure to authenticate but we return Ok. 
Ok(AuthInfo::default()) } else { Err(e.into()) }; } }; let secret = if body.role_secret.is_empty() { None } else { let secret = scram::ServerSecret::parse(&body.role_secret) .map(AuthSecret::Scram) .ok_or(GetAuthInfoError::BadSecret)?; Some(secret) }; let allowed_ips = body.allowed_ips.unwrap_or_default(); Metrics::get() .proxy .allowed_ips_number .observe(allowed_ips.len() as f64); let allowed_vpc_endpoint_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default(); Metrics::get() .proxy .allowed_vpc_endpoint_ids .observe(allowed_vpc_endpoint_ids.len() as f64); let block_public_connections = body.block_public_connections.unwrap_or_default(); let block_vpc_connections = body.block_vpc_connections.unwrap_or_default(); Ok(AuthInfo { secret, allowed_ips, allowed_vpc_endpoint_ids, project_id: body.project_id, account_id: body.account_id, access_blocker_flags: AccessBlockerFlags { public_access_blocked: block_public_connections, vpc_access_blocked: block_vpc_connections, }, rate_limits: body.rate_limits, }) } .inspect_err(|e| tracing::debug!(error = ?e)) .instrument(info_span!("do_get_auth_info")) .await } async fn do_get_endpoint_jwks( &self, ctx: &RequestContext, endpoint: &EndpointId, ) -> Result, GetEndpointJwksError> { let request_id = ctx.session_id().to_string(); async { let request = self .endpoint .get_with_url(|url| { url.path_segments_mut() .push("endpoints") .push(endpoint.as_str()) .push("jwks"); }) .header(X_REQUEST_ID, &request_id) .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) .query(&[("session_id", ctx.session_id())]) .build() .map_err(GetEndpointJwksError::RequestBuild)?; debug!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self .endpoint .execute(request) .await .map_err(GetEndpointJwksError::RequestExecute)?; drop(pause); info!(duration = ?start.elapsed(), "received http response"); let body = parse_body::( 
response.status(),
                response.bytes().await.map_err(ControlPlaneError::from)?,
            )?;

            let rules = body
                .jwks
                .into_iter()
                .map(|jwks| AuthRule {
                    id: jwks.id,
                    jwks_url: jwks.jwks_url,
                    audience: jwks.jwt_audience,
                    role_names: jwks.role_names,
                })
                .collect();

            Ok(rules)
        }
        .inspect_err(|e| tracing::debug!(error = ?e))
        .instrument(info_span!("do_get_endpoint_jwks"))
        .await
    }

    /// Call the control plane's `wake_compute` method and turn the reply into a [`NodeInfo`].
    ///
    /// The request carries the session id, the console application name, the
    /// endpoint and, when non-empty, the user's options as deep-object query
    /// parameters. The request latency timer is paused while waiting for the reply.
    ///
    /// Connection details are derived from the reply as follows:
    /// * `address` is split into host and port by `parse_host_port`;
    /// * `host_addr` is set only when that host is a literal IP address;
    /// * when `server_name` is present it replaces the host and TLS is required,
    ///   otherwise TLS is disabled.
    ///
    /// # Errors
    ///
    /// * [`WakeComputeError::BadComputeAddress`] if `address` is not `host:port`.
    /// * [`WakeComputeError::ControlPlane`] for request build, transport and API errors.
    async fn do_wake_compute(
        &self,
        ctx: &RequestContext,
        user_info: &ComputeUserInfo,
    ) -> Result<NodeInfo, WakeComputeError> {
        let request_id = ctx.session_id().to_string();
        let application_name = ctx.console_application_name();
        async {
            let mut request_builder = self
                .endpoint
                .get_path("wake_compute")
                // Use the shared header constants, like the other requests in this
                // client, instead of re-spelling the header names as string literals.
                .header(X_REQUEST_ID, &request_id)
                .header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
                .query(&[("session_id", ctx.session_id())])
                .query(&[
                    ("application_name", application_name.as_str()),
                    ("endpointish", user_info.endpoint.as_str()),
                ]);

            // Only append the options when there is something to send.
            let options = user_info.options.to_deep_object();
            if !options.is_empty() {
                request_builder = request_builder.query(&options);
            }

            let request = request_builder.build()?;

            debug!(url = request.url().as_str(), "sending http request");
            let start = Instant::now();
            // Time spent waiting for the control plane is accounted separately.
            let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
            let response = self.endpoint.execute(request).await?;
            drop(pause);
            info!(duration = ?start.elapsed(), "received http response");
            let body = parse_body::<WakeCompute>(response.status(), response.bytes().await?)?;

            let Some((host, port)) = parse_host_port(&body.address) else {
                return Err(WakeComputeError::BadComputeAddress(body.address));
            };

            // `None` when the control plane returned a DNS name rather than an IP.
            let host_addr = IpAddr::from_str(host).ok();

            // A server name means the compute must be reached over TLS.
            let ssl_mode = match &body.server_name {
                Some(_) => SslMode::Require,
                None => SslMode::Disable,
            };
            let host = match body.server_name {
                Some(host) => host.into(),
                None => host.into(),
            };

            let node = NodeInfo {
                conn_info: compute::ConnectInfo {
                    host_addr,
                    host,
                    port,
                    ssl_mode,
                },
                aux: body.aux,
            };

            Ok(node)
        }
        .inspect_err(|e| tracing::debug!(error = ?e))
        .instrument(info_span!("do_wake_compute"))
        .await
    }
}
// Cache-fronted implementation of the control plane API: every method first
// consults the shared caches and only then issues an HTTP request.
impl super::ControlPlaneApi for NeonControlPlaneClient {
    /// Return the access control of `role`, preferring the project-info cache.
    ///
    /// The cache is looked up by the normalized endpoint id. A cached error is
    /// returned as-is; a cache miss falls through to `get_and_cache_auth_info`.
    #[tracing::instrument(skip_all)]
    async fn get_role_access_control(
        &self,
        ctx: &RequestContext,
        endpoint: &EndpointId,
        role: &RoleName,
    ) -> Result {
        let key = endpoint.normalize();

        if let Some(role_control) = self.caches.project_info.get_role_secret(&key, role) {
            return match role_control {
                Err(msg) => {
                    info!(key = &*key, "found cached get_role_access_control error");

                    Err(GetAuthInfoError::ApiError(ControlPlaneError::Message(msg)))
                }
                Ok(role_control) => {
                    debug!(key = &*key, "found cached role access control");
                    Ok(role_control)
                }
            };
        }

        // Cache miss: fetch from the control plane and keep only the role part.
        self.get_and_cache_auth_info(ctx, endpoint, role, &key, |_, role_control| {
            role_control.clone()
        })
        .await
    }

    /// Return the endpoint-level access control, preferring the project-info cache.
    ///
    /// Mirrors `get_role_access_control`, but extracts the endpoint part of the
    /// control plane reply on a cache miss.
    #[tracing::instrument(skip_all)]
    async fn get_endpoint_access_control(
        &self,
        ctx: &RequestContext,
        endpoint: &EndpointId,
        role: &RoleName,
    ) -> Result {
        let key = endpoint.normalize();

        if let Some(control) = self.caches.project_info.get_endpoint_access(&key) {
            return match control {
                Err(msg) => {
                    info!(
                        key = &*key,
                        "found cached get_endpoint_access_control error"
                    );

                    Err(GetAuthInfoError::ApiError(ControlPlaneError::Message(msg)))
                }
                Ok(control) => {
                    debug!(key = &*key, "found cached endpoint access control");
                    Ok(control)
                }
            };
        }

        self.get_and_cache_auth_info(ctx, endpoint, role, &key, |control, _| control.clone())
            .await
    }

    /// Fetch the JWKS auth rules of `endpoint`; this method does no caching itself.
    #[tracing::instrument(skip_all)]
    async fn get_endpoint_jwks(
        &self,
        ctx: &RequestContext,
        endpoint: &EndpointId,
    ) -> Result, GetEndpointJwksError> {
        self.do_get_endpoint_jwks(ctx, endpoint).await
    }

    /// Wake the compute of `user_info`'s endpoint, or reuse a cached result.
    ///
    /// Order of operations: cache lookup, per-endpoint permit, second cache
    /// lookup, endpoint rate limit, and only then the actual control plane call.
    /// Both successful results and selected errors are written back to the cache.
    #[tracing::instrument(skip_all)]
    async fn wake_compute(
        &self,
        ctx: &RequestContext,
        user_info: &ComputeUserInfo,
    ) -> Result {
        let key = user_info.endpoint_cache_key();

        // Returns early from `wake_compute` when the node-info cache has an entry
        // (either a node or a cached error) for `key`. A macro is used so that the
        // `return` leaves the enclosing function at both call sites below.
        macro_rules! check_cache {
            () => {
                if let Some(info) = self.caches.node_info.get_entry(&key) {
                    return match info {
                        Err(msg) => {
                            info!(key = &*key, "found cached wake_compute error");

                            Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
                                msg,
                            )))
                        }
                        Ok(info) => {
                            debug!(key = &*key, "found cached compute node info");
                            ctx.set_project(info.aux.clone());
                            Ok(info)
                        }
                    };
                }
            };
        }

        // Every time we do a wakeup http request, the compute node will stay up
        // for some time (highly depends on the console's scale-to-zero policy);
        // The connection info remains the same during that period of time,
        // which means that we might cache it to reduce the load and latency.
        check_cache!();

        let permit = self.locks.get_permit(&key).await?;

        // after getting back a permit - it's possible the cache was filled
        // double check
        if permit.should_check_cache() {
            // TODO: if there is something in the cache, mark the permit as success.
            check_cache!();
        }

        // check rate limit
        if !self
            .wake_compute_endpoint_rate_limiter
            .check(user_info.endpoint.normalize_intern(), 1)
        {
            return Err(WakeComputeError::TooManyConnections);
        }

        // The permit is released with an outcome derived from the request result.
        let node = permit.release_result(self.do_wake_compute(ctx, user_info).await);
        match node {
            Ok(node) => {
                ctx.set_project(node.aux.clone());
                debug!(key = &*key, "created a cache entry for woken compute node");

                // The caller gets the node as reported by the control plane; only
                // the copy kept in the cache is relabelled.
                let mut stored_node = node.clone();
                // store the cached node as 'warm_cached'
                stored_node.aux.cold_start_info = ColdStartInfo::WarmCached;
                self.caches.node_info.insert(key.clone(), Ok(stored_node));

                Ok(Cached {
                    token: Some((&self.caches.node_info, key)),
                    value: node,
                })
            }
            Err(err) => match err {
                WakeComputeError::ControlPlane(ControlPlaneError::Message(ref msg)) => {
                    let retry_info = msg.status.as_ref().and_then(|s| s.details.retry_info);

                    // If we can retry this error, do not cache it,
                    // unless we were given a retry delay.
if msg.could_retry() && retry_info.is_none() {
                        return Err(err);
                    }

                    debug!(
                        key = &*key,
                        "created a cache entry for the wake compute error"
                    );

                    // Remember the error so that later callers hit the cache
                    // instead of repeating the request.
                    self.caches.node_info.insert(key, Err(msg.clone()));

                    Err(err)
                }
                // Any other error kind is passed through without being cached.
                err => Err(err),
            },
        }
    }
}

/// Parse http response body, taking status code into account.
///
/// * Success status: the body is deserialized from JSON into the requested type;
///   a deserialization failure is wrapped in a `std::io::Error` before being
///   converted into the returned error.
/// * Any other status: the body is parsed as a control plane error message. If
///   even that fails, a placeholder message is used. In both cases the message's
///   `http_status_code` is set from `status`, and the function returns
///   `ControlPlaneError::Message`.
fn parse_body serde::Deserialize<'a>>(
    status: StatusCode,
    body: Bytes,
) -> Result {
    if status.is_success() {
        // We shouldn't log raw body because it may contain secrets.
        info!("request succeeded, processing the body");
        return Ok(serde_json::from_slice(&body).map_err(std::io::Error::other)?);
    }

    // Log plaintext to be able to detect, whether there are some cases not covered by the error struct.
    info!("response_error plaintext: {:?}", body);

    // Don't throw an error here because it's not as important
    // as the fact that the request itself has failed.
    let mut body = serde_json::from_slice(&body).unwrap_or_else(|e| {
        warn!("failed to parse error body: {e}");
        Box::new(ControlPlaneErrorMessage {
            error: "reason unclear (malformed error message)".into(),
            http_status_code: status,
            status: None,
        })
    });
    // The status code is not part of the JSON payload, so set it explicitly.
    body.http_status_code = status;

    warn!("console responded with an error ({status}): {body:?}");
    Err(ControlPlaneError::Message(body))
}

/// Split `input` into a host and a port at the last `:`.
///
/// Leading and trailing `[` / `]` characters are trimmed from the host, so a
/// bracketed IPv6 literal yields the bare address. Returns `None` when there is
/// no `:` in `input` or when the part after it is not a valid `u16`.
fn parse_host_port(input: &str) -> Option<(&str, u16)> {
    // Split at the *last* colon so that the colons of an IPv6 literal stay in the host.
    let (host, port) = input.rsplit_once(':')?;
    let ipv6_brackets: &[_] = &['[', ']'];
    Some((host.trim_matches(ipv6_brackets), port.parse().ok()?))
}

#[cfg(test)]
mod tests {
    use super::*;

    // IPv4 literal with a port.
    #[test]
    fn test_parse_host_port_v4() {
        let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse");
        assert_eq!(host, "127.0.0.1");
        assert_eq!(port, 5432);
    }

    // Bracketed IPv6 literal: the brackets must not be part of the host.
    #[test]
    fn test_parse_host_port_v6() {
        let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse");
        assert_eq!(host, "2001:db8::1");
        assert_eq!(port, 5432);
    }

    // DNS name with a port.
    #[test]
    fn test_parse_host_port_url() {
        let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432")
            .expect("failed to parse");
assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local"); assert_eq!(port, 5432); } } ================================================ FILE: proxy/src/control_plane/client/mock.rs ================================================ //! Mock console backend which relies on a user-provided postgres instance. use std::io; use std::net::{IpAddr, Ipv4Addr}; use std::str::FromStr; use std::sync::Arc; use futures::TryFutureExt; use postgres_client::config::SslMode; use thiserror::Error; use tokio_postgres::Client; use tracing::{Instrument, error, info, info_span, warn}; use crate::auth::IpPattern; use crate::auth::backend::ComputeUserInfo; use crate::auth::backend::jwt::AuthRule; use crate::cache::Cached; use crate::cache::node_info::CachedNodeInfo; use crate::compute::ConnectInfo; use crate::context::RequestContext; use crate::control_plane::errors::{ ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, }; use crate::control_plane::messages::{EndpointRateLimitConfig, MetricsAuxInfo}; use crate::control_plane::{ AccessBlockerFlags, AuthInfo, AuthSecret, EndpointAccessControl, NodeInfo, RoleAccessControl, }; use crate::intern::RoleNameInt; use crate::scram; use crate::types::{BranchId, EndpointId, ProjectId, RoleName}; use crate::url::ApiUrl; #[derive(Debug, Error)] enum MockApiError { #[error("Failed to read password: {0}")] PasswordNotSet(tokio_postgres::Error), } impl From for ControlPlaneError { fn from(e: MockApiError) -> Self { io::Error::other(e).into() } } impl From for ControlPlaneError { fn from(e: tokio_postgres::Error) -> Self { io::Error::other(e).into() } } #[derive(Clone)] pub struct MockControlPlane { endpoint: ApiUrl, ip_allowlist_check_enabled: bool, } impl MockControlPlane { pub fn new(endpoint: ApiUrl, ip_allowlist_check_enabled: bool) -> Self { Self { endpoint, ip_allowlist_check_enabled, } } pub(crate) fn url(&self) -> &str { self.endpoint.as_str() } async fn do_get_auth_info( &self, endpoint: &EndpointId, role: 
&RoleName, ) -> Result { let (secret, allowed_ips) = async { // Perhaps we could persist this connection, but then we'd have to // write more code for reopening it if it got closed, which doesn't // seem worth it. let (client, connection) = tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; tokio::spawn(connection); let secret = if let Some(entry) = get_execute_postgres_query( &client, "select rolpassword from pg_catalog.pg_authid where rolname = $1", &[&role.as_str()], "rolpassword", ) .await? { info!("got a secret: {entry}"); // safe since it's not a prod scenario scram::ServerSecret::parse(&entry).map(AuthSecret::Scram) } else { warn!("user '{role}' does not exist"); None }; let allowed_ips = if self.ip_allowlist_check_enabled { match get_execute_postgres_query( &client, "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1", &[&endpoint.as_str()], "allowed_ips", ) .await? { Some(s) => { info!("got allowed_ips: {s}"); s.split(',') .map(|s| { IpPattern::from_str(s).expect("mocked ip pattern should be correct") }) .collect() } None => vec![], } } else { vec![] }; Ok((secret, allowed_ips)) } .inspect_err(|e: &GetAuthInfoError| tracing::error!("{e}")) .instrument(info_span!("postgres", url = self.endpoint.as_str())) .await?; Ok(AuthInfo { secret, allowed_ips, allowed_vpc_endpoint_ids: vec![], project_id: None, account_id: None, access_blocker_flags: AccessBlockerFlags::default(), rate_limits: EndpointRateLimitConfig::default(), }) } async fn do_get_endpoint_jwks( &self, endpoint: &EndpointId, ) -> Result, GetEndpointJwksError> { let (client, connection) = tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; let connection = tokio::spawn(connection); let res = client.query( "select id, jwks_url, audience, role_names from neon_control_plane.endpoint_jwks where endpoint_id = $1", &[&endpoint.as_str()], ) .await?; let mut rows = vec![]; for row in res { rows.push(AuthRule { id: row.get("id"), 
jwks_url: url::Url::parse(row.get("jwks_url"))?, audience: row.get("audience"), role_names: row .get::<_, Vec>("role_names") .into_iter() .map(RoleName::from) .map(|s| RoleNameInt::from(&s)) .collect(), }); } drop(client); connection.await??; Ok(rows) } async fn do_wake_compute(&self) -> Result { let port = self.endpoint.port().unwrap_or(5432); let conn_info = match self.endpoint.host_str() { None => ConnectInfo { host_addr: Some(IpAddr::V4(Ipv4Addr::LOCALHOST)), host: "localhost".into(), port, ssl_mode: SslMode::Disable, }, Some(host) => ConnectInfo { host_addr: IpAddr::from_str(host).ok(), host: host.into(), port, ssl_mode: SslMode::Disable, }, }; let node = NodeInfo { conn_info, aux: MetricsAuxInfo { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), branch_id: (&BranchId::from("branch")).into(), compute_id: "compute".into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, }; Ok(node) } } async fn get_execute_postgres_query( client: &Client, query: &str, params: &[&(dyn tokio_postgres::types::ToSql + Sync)], idx: &str, ) -> Result, GetAuthInfoError> { let rows = client.query(query, params).await?; // We can get at most one row, because `rolname` is unique. let Some(row) = rows.first() else { // This means that the user doesn't exist, so there can be no secret. // However, this is still a *valid* outcome which is very similar // to getting `404 Not found` from the Neon console. 
return Ok(None); }; let entry = row.try_get(idx).map_err(MockApiError::PasswordNotSet)?; Ok(Some(entry)) } impl super::ControlPlaneApi for MockControlPlane { async fn get_endpoint_access_control( &self, _ctx: &RequestContext, endpoint: &EndpointId, role: &RoleName, ) -> Result { let info = self.do_get_auth_info(endpoint, role).await?; Ok(EndpointAccessControl { allowed_ips: Arc::new(info.allowed_ips), allowed_vpce: Arc::new(info.allowed_vpc_endpoint_ids), flags: info.access_blocker_flags, rate_limits: info.rate_limits, }) } async fn get_role_access_control( &self, _ctx: &RequestContext, endpoint: &EndpointId, role: &RoleName, ) -> Result { let info = self.do_get_auth_info(endpoint, role).await?; Ok(RoleAccessControl { secret: info.secret, }) } async fn get_endpoint_jwks( &self, _ctx: &RequestContext, endpoint: &EndpointId, ) -> Result, GetEndpointJwksError> { self.do_get_endpoint_jwks(endpoint).await } #[tracing::instrument(skip_all)] async fn wake_compute( &self, _ctx: &RequestContext, _user_info: &ComputeUserInfo, ) -> Result { self.do_wake_compute().map_ok(Cached::new_uncached).await } } ================================================ FILE: proxy/src/control_plane/client/mod.rs ================================================ pub mod cplane_proxy_v1; #[cfg(any(test, feature = "testing"))] pub mod mock; use std::hash::Hash; use std::sync::Arc; use std::time::Duration; use clashmap::ClashMap; use tokio::time::Instant; use tracing::{debug, info}; use super::{EndpointAccessControl, RoleAccessControl}; use crate::auth::backend::ComputeUserInfo; use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; use crate::cache::node_info::{CachedNodeInfo, NodeInfoCache}; use crate::cache::project_info::ProjectInfoCache; use crate::config::{CacheOptions, ProjectInfoCacheOptions}; use crate::context::RequestContext; use crate::control_plane::{ControlPlaneApi, errors}; use crate::error::ReportableError; use crate::metrics::ApiLockMetrics; use 
crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}; use crate::types::EndpointId; #[non_exhaustive] #[derive(Clone)] pub enum ControlPlaneClient { /// Proxy V1 control plane API ProxyV1(cplane_proxy_v1::NeonControlPlaneClient), /// Local mock control plane. #[cfg(any(test, feature = "testing"))] PostgresMock(mock::MockControlPlane), /// Internal testing #[cfg(test)] #[allow(private_interfaces)] Test(Box), } impl ControlPlaneApi for ControlPlaneClient { async fn get_role_access_control( &self, ctx: &RequestContext, endpoint: &EndpointId, role: &crate::types::RoleName, ) -> Result { match self { Self::ProxyV1(api) => api.get_role_access_control(ctx, endpoint, role).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_role_access_control(ctx, endpoint, role).await, #[cfg(test)] Self::Test(_api) => { unreachable!("this function should never be called in the test backend") } } } async fn get_endpoint_access_control( &self, ctx: &RequestContext, endpoint: &EndpointId, role: &crate::types::RoleName, ) -> Result { match self { Self::ProxyV1(api) => api.get_endpoint_access_control(ctx, endpoint, role).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_endpoint_access_control(ctx, endpoint, role).await, #[cfg(test)] Self::Test(api) => api.get_access_control(), } } async fn get_endpoint_jwks( &self, ctx: &RequestContext, endpoint: &EndpointId, ) -> Result, errors::GetEndpointJwksError> { match self { Self::ProxyV1(api) => api.get_endpoint_jwks(ctx, endpoint).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await, #[cfg(test)] Self::Test(_api) => Ok(vec![]), } } async fn wake_compute( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result { match self { Self::ProxyV1(api) => api.wake_compute(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await, 
#[cfg(test)]
            Self::Test(api) => api.wake_compute(),
        }
    }
}

/// Test-only backend plugged into `ControlPlaneClient::Test`.
#[cfg(test)]
pub(crate) trait TestControlPlaneClient: Send + Sync + 'static {
    fn wake_compute(&self) -> Result;

    fn get_access_control(&self) -> Result;

    /// Clone the implementation behind the trait object; used by the `Clone`
    /// impl for the boxed trait object below.
    fn dyn_clone(&self) -> Box;
}

#[cfg(test)]
impl Clone for Box {
    fn clone(&self) -> Self {
        TestControlPlaneClient::dyn_clone(&**self)
    }
}

/// Various caches for [`control_plane`](super).
pub struct ApiCaches {
    /// Cache for the `wake_compute` API method.
    pub(crate) node_info: NodeInfoCache,
    /// Cache which stores project_id -> endpoint_ids mapping.
    pub project_info: Arc,
}

impl ApiCaches {
    /// Build both caches from their respective configurations.
    pub fn new(
        wake_compute_cache_config: CacheOptions,
        project_info_cache_config: ProjectInfoCacheOptions,
    ) -> Self {
        Self {
            node_info: NodeInfoCache::new(wake_compute_cache_config),
            project_info: Arc::new(ProjectInfoCache::new(project_info_cache_config)),
        }
    }
}

/// Per-key limiters for [`control_plane`](super) requests.
///
/// `get_permit` hands out a [`WakeComputePermit`] from the limiter registered
/// for a key, creating the limiter on first use; `garbage_collect_worker`
/// periodically removes limiters that nobody holds anymore.
pub struct ApiLocks {
    // Reported in the log line emitted by `garbage_collect_worker`.
    name: &'static str,
    // One limiter per key, created lazily by `get_permit`.
    node_locks: ClashMap>,
    // Configuration for newly created limiters; `initial_limit == 0` turns the
    // locking off entirely.
    config: RateLimiterConfig,
    // Upper bound on how long `get_permit` waits for a permit.
    timeout: Duration,
    // Time over which `garbage_collect_worker` spreads one pass over all shards.
    epoch: std::time::Duration,
    metrics: &'static ApiLockMetrics,
}

#[derive(Debug, thiserror::Error)]
pub(crate) enum ApiLockError {
    #[error("timeout acquiring resource permit")]
    TimeoutError(#[from] tokio::time::error::Elapsed),
}

impl ReportableError for ApiLockError {
    /// A permit timeout is reported as a rate-limit error.
    fn get_error_kind(&self) -> crate::error::ErrorKind {
        match self {
            ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit,
        }
    }
}

impl ApiLocks {
    /// Create an empty lock table with `shards` shards.
    pub fn new(
        name: &'static str,
        config: RateLimiterConfig,
        shards: usize,
        timeout: Duration,
        epoch: std::time::Duration,
        metrics: &'static ApiLockMetrics,
    ) -> Self {
        Self {
            name,
            node_locks: ClashMap::with_shard_amount(shards),
            config,
            timeout,
            epoch,
            metrics,
        }
    }

    /// Acquire a permit for `key`, waiting at most `self.timeout`.
    ///
    /// When `initial_limit` is `0` a disabled permit is returned immediately and
    /// no limiter is registered. A timeout is returned as an error.
    pub(crate) async fn get_permit(&self, key: &K) -> Result {
        if self.config.initial_limit == 0 {
            return Ok(WakeComputePermit {
                permit: Token::disabled(),
            });
        }
        let now = Instant::now();
        let semaphore = {
            // get fast path
            if let Some(semaphore) = self.node_locks.get(key) {
semaphore.clone()
            } else {
                // Slow path: register a limiter for this key (and count it) unless
                // another task has inserted one in the meantime.
                self.node_locks
                    .entry(key.clone())
                    .or_insert_with(|| {
                        self.metrics.semaphores_registered.inc();
                        DynamicLimiter::new(self.config)
                    })
                    .clone()
            }
        };
        let permit = semaphore.acquire_timeout(self.timeout).await;

        // The wait is recorded whether or not the permit was obtained.
        self.metrics
            .semaphore_acquire_seconds
            .observe(now.elapsed().as_secs_f64());

        if permit.is_ok() {
            debug!(elapsed = ?now.elapsed(), "acquired permit");
        } else {
            debug!(elapsed = ?now.elapsed(), "timed out acquiring permit");
        }
        Ok(WakeComputePermit { permit: permit? })
    }

    /// Periodically drop limiters that are not checked out by anyone.
    ///
    /// Returns immediately when locking is disabled (`initial_limit == 0`);
    /// otherwise it loops forever, visiting one shard per tick so that a full
    /// pass over all shards takes roughly `self.epoch`.
    pub async fn garbage_collect_worker(&self) {
        if self.config.initial_limit == 0 {
            return;
        }
        // NOTE(review): `tokio::time::interval` panics on a zero period; this
        // assumes `epoch` divided by the shard count is non-zero. Confirm at the
        // configuration site.
        let mut interval =
            tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32);
        loop {
            for (i, shard) in self.node_locks.shards().iter().enumerate() {
                interval.tick().await;
                // temporary lock a single shard and then clear any semaphores that aren't currently checked out
                // race conditions: if strong_count == 1, there's no way that it can increase while the shard is locked
                // therefore releasing it is safe from race conditions
                info!(
                    name = self.name,
                    shard = i,
                    "performing epoch reclamation on api lock"
                );
                let mut lock = shard.write();
                let timer = self.metrics.reclamation_lag_seconds.start_timer();
                // `.count()` drives the extracting iterator to completion, removing
                // every matching entry and telling us how many were removed.
                let count = lock
                    .extract_if(|(_, semaphore)| Arc::strong_count(semaphore) == 1)
                    .count();
                drop(lock);
                self.metrics.semaphores_unregistered.inc_by(count as u64);
                timer.observe();
            }
        }
    }
}

/// Permit returned by `ApiLocks::get_permit`.
pub(crate) struct WakeComputePermit {
    permit: Token,
}

impl WakeComputePermit {
    /// `true` unless the permit is the disabled placeholder, i.e. whenever the
    /// caller may have waited and the cache may have been filled in the meantime.
    pub(crate) fn should_check_cache(&self) -> bool {
        !self.permit.is_disabled()
    }
    /// Give the permit back, reporting `outcome` to the limiter.
    pub(crate) fn release(self, outcome: Outcome) {
        self.permit.release(outcome);
    }
    /// Release the permit with an outcome derived from `res` (`Ok` counts as
    /// `Outcome::Success`, `Err` as `Outcome::Overload`) and return `res` unchanged.
    pub(crate) fn release_result(self, res: Result) -> Result {
        match res {
            Ok(_) => self.release(Outcome::Success),
            Err(_) => self.release(Outcome::Overload),
        }
        res
    }
}

// JWT auth rules are served by the control plane's JWKS endpoint.
impl FetchAuthRules for ControlPlaneClient {
    async fn fetch_auth_rules(
        &self,
        ctx: &RequestContext,
        endpoint: EndpointId,
    ) -> Result, FetchAuthRulesError> {
        self.get_endpoint_jwks(ctx,
&endpoint)
            .await
            .map_err(FetchAuthRulesError::GetEndpointJwks)
    }
}

================================================
FILE: proxy/src/control_plane/errors.rs
================================================
use std::io;

use thiserror::Error;

use crate::control_plane::client::ApiLockError;
use crate::control_plane::messages::{self, ControlPlaneErrorMessage, Reason};
use crate::error::{ErrorKind, ReportableError, UserFacingError};
use crate::proxy::retry::CouldRetry;

/// A go-to error message which doesn't leak any detail.
pub(crate) const REQUEST_FAILED: &str = "Control plane request failed";

/// Common console API error.
#[derive(Debug, Error)]
pub(crate) enum ControlPlaneError {
    /// Error returned by the console itself.
    #[error("{REQUEST_FAILED} with {0}")]
    Message(Box),

    /// Various IO errors like broken pipe or malformed payload.
    #[error("{REQUEST_FAILED}: {0}")]
    Transport(#[from] std::io::Error),
}

impl ControlPlaneError {
    /// Returns the [`Reason`] reported by the control plane for this failure.
    ///
    /// Transport errors carry no control plane message, so they always map to
    /// [`Reason::Unknown`].
    pub(crate) fn get_reason(&self) -> messages::Reason {
        match self {
            ControlPlaneError::Message(e) => e.get_reason(),
            ControlPlaneError::Transport(_) => messages::Reason::Unknown,
        }
    }
}

// What the client gets to see: the control plane's user-facing message, or the
// generic `REQUEST_FAILED` text for transport errors.
impl UserFacingError for ControlPlaneError {
    fn to_string_client(&self) -> String {
        match self {
            // To minimize risks, only select errors are forwarded to users.
ControlPlaneError::Message(c) => c.get_user_facing_message(),
            ControlPlaneError::Transport(_) => REQUEST_FAILED.to_owned(),
        }
    }
}

impl ReportableError for ControlPlaneError {
    /// Classify the error for reporting.
    ///
    /// Control plane messages are classified by their `Reason`; transport errors
    /// and unrecognized reasons are attributed to the control plane itself.
    fn get_error_kind(&self) -> ErrorKind {
        match self {
            ControlPlaneError::Message(e) => match e.get_reason() {
                // The request refers to something that is missing, disabled or
                // protected.
                Reason::RoleProtected
                | Reason::ResourceNotFound
                | Reason::ProjectNotFound
                | Reason::EndpointNotFound
                | Reason::EndpointDisabled
                | Reason::BranchNotFound
                | Reason::WrongLsnOrTimestamp => ErrorKind::User,

                Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit,

                // A quota or limit has been exhausted.
                Reason::NonDefaultBranchComputeTimeExceeded
                | Reason::ActiveTimeQuotaExceeded
                | Reason::ComputeTimeQuotaExceeded
                | Reason::WrittenDataQuotaExceeded
                | Reason::DataTransferQuotaExceeded
                | Reason::LogicalSizeQuotaExceeded
                | Reason::ActiveEndpointsLimitExceeded => ErrorKind::Quota,

                // Everything else is attributed to the control plane.
                Reason::ConcurrencyLimitReached
                | Reason::LockAlreadyTaken
                | Reason::RunningOperations
                | Reason::EndpointIdle
                | Reason::ProjectUnderMaintenance
                | Reason::Unknown => ErrorKind::ControlPlane,
            },
            ControlPlaneError::Transport(_) => ErrorKind::ControlPlane,
        }
    }
}

impl CouldRetry for ControlPlaneError {
    /// Retryability is decided by the wrapped error in both cases.
    fn could_retry(&self) -> bool {
        match self {
            // retry some transport errors
            Self::Transport(io) => io.could_retry(),
            Self::Message(e) => e.could_retry(),
        }
    }
}

// HTTP client errors are wrapped in an `io::Error` and therefore end up as
// `ControlPlaneError::Transport`.
impl From for ControlPlaneError {
    fn from(e: reqwest::Error) -> Self {
        io::Error::other(e).into()
    }
}

impl From for ControlPlaneError {
    fn from(e: reqwest_middleware::Error) -> Self {
        io::Error::other(e).into()
    }
}

/// Failure to obtain auth information for an endpoint/role.
#[derive(Debug, Error)]
pub(crate) enum GetAuthInfoError {
    // We shouldn't include the actual secret here.
    #[error("Console responded with a malformed auth secret")]
    BadSecret,

    /// Any error produced by the control plane request itself.
    #[error(transparent)]
    ApiError(ControlPlaneError),
}

// This allows more useful interactions than `#[from]`.
// NOTE(review): generic parameter lists in this section (`impl>`, `From`, `Box`) look
// truncated in this copy of the file; confirm against the original before building.
impl> From for GetAuthInfoError {
    /// Converts `e` with `Into` and wraps the result in [`GetAuthInfoError::ApiError`].
    fn from(e: E) -> Self {
        Self::ApiError(e.into())
    }
}

impl UserFacingError for GetAuthInfoError {
    /// Client-safe text: a generic failure for `BadSecret`, otherwise whatever the
    /// wrapped API error exposes through its own `to_string_client`.
    fn to_string_client(&self) -> String {
        match self {
            // We absolutely should not leak any secrets!
            Self::BadSecret => REQUEST_FAILED.to_owned(),
            // However, API might return a meaningful error.
            Self::ApiError(e) => e.to_string_client(),
        }
    }
}

impl ReportableError for GetAuthInfoError {
    /// Both variants report [`ErrorKind::ControlPlane`]; the kind of the wrapped
    /// API error is not consulted here.
    fn get_error_kind(&self) -> ErrorKind {
        match self {
            Self::BadSecret => ErrorKind::ControlPlane,
            Self::ApiError(_) => ErrorKind::ControlPlane,
        }
    }
}

#[derive(Debug, Error)]
pub(crate) enum WakeComputeError {
    /// Carries the offending address; it is never shown to the client
    /// (see `to_string_client` below).
    #[error("Console responded with a malformed compute address: {0}")]
    BadComputeAddress(Box),

    /// Wraps a [`ControlPlaneError`]; displayed transparently.
    #[error(transparent)]
    ControlPlane(ControlPlaneError),

    /// Reported as `ErrorKind::RateLimit` and never retried.
    #[error("Too many connections attempts")]
    TooManyConnections,

    /// Produced from an [`ApiLockError`] via `#[from]`.
    #[error("error acquiring resource permit: {0}")]
    TooManyConnectionAttempts(#[from] ApiLockError),
}

// This allows more useful interactions than `#[from]`.
impl> From for WakeComputeError {
    /// Converts `e` with `Into` and wraps the result in [`WakeComputeError::ControlPlane`].
    fn from(e: E) -> Self {
        Self::ControlPlane(e.into())
    }
}

impl UserFacingError for WakeComputeError {
    /// Client-safe text for each variant; the compute address is never included.
    fn to_string_client(&self) -> String {
        match self {
            // We shouldn't show user the address even if it's broken.
            // Besides, user is unlikely to care about this detail.
            Self::BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
            // However, control plane might return a meaningful error.
            Self::ControlPlane(e) => e.to_string_client(),

            // The `#[error]` text of this variant is shown as is.
            Self::TooManyConnections => self.to_string(),

            // The lock error itself is not shown; a fixed explanation is used instead.
            Self::TooManyConnectionAttempts(_) => {
                "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
            }
        }
    }
}

impl ReportableError for WakeComputeError {
    /// Delegates to the wrapped error where there is one; otherwise returns a fixed kind.
    fn get_error_kind(&self) -> crate::error::ErrorKind {
        match self {
            Self::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
            Self::ControlPlane(e) => e.get_error_kind(),
            Self::TooManyConnections => crate::error::ErrorKind::RateLimit,
            Self::TooManyConnectionAttempts(e) => e.get_error_kind(),
        }
    }
}

impl CouldRetry for WakeComputeError {
    /// Only a wrapped control-plane error can be retryable; every other variant is final.
    fn could_retry(&self) -> bool {
        match self {
            Self::BadComputeAddress(_) => false,
            Self::ControlPlane(e) => e.could_retry(),
            Self::TooManyConnections => false,
            Self::TooManyConnectionAttempts(_) => false,
        }
    }
}

#[derive(Debug, Error)]
pub enum GetEndpointJwksError {
    #[error("failed to build control plane request: {0}")]
    RequestBuild(#[source] reqwest::Error),

    #[error("failed to send control plane request: {0}")]
    RequestExecute(#[source] reqwest_middleware::Error),

    #[error(transparent)]
    ControlPlane(#[from] ControlPlaneError),

    // The variants below exist only in test builds or with the `testing` feature.
    #[cfg(any(test, feature = "testing"))]
    #[error(transparent)]
    TokioPostgres(#[from] tokio_postgres::Error),

    #[cfg(any(test, feature = "testing"))]
    #[error(transparent)]
    ParseUrl(#[from] url::ParseError),

    #[cfg(any(test, feature = "testing"))]
    #[error(transparent)]
    TaskJoin(#[from] tokio::task::JoinError),
}

================================================
FILE: proxy/src/control_plane/messages.rs
================================================
use std::fmt::{self, Display};
use std::time::Duration;

use measured::FixedCardinalityLabel;
use serde::{Deserialize, Serialize};
use smol_str::SmolStr;
use tokio::time::Instant;

use crate::auth::IpPattern;
use crate::intern::{AccountIdInt, BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt};
use crate::proxy::retry::CouldRetry;

/// Generic error response with human-readable description.
/// Note that we can't always present it to user as is.
// NOTE(review): generic arguments (`Box`, `Option`, `Result`) look truncated in this
// copy of the file; confirm against the original before building.
#[derive(Debug, Deserialize, Clone)]
pub(crate) struct ControlPlaneErrorMessage {
    /// Raw error text; `Display` falls back to it when no user-facing message is present.
    pub(crate) error: Box,
    /// Skipped by serde, so deserialization fills it with `Default::default()`.
    /// NOTE(review): presumably assigned afterwards from the HTTP response — confirm.
    #[serde(skip)]
    pub(crate) http_status_code: http::StatusCode,
    /// Optional structured status holding the reason, retry info and user-facing message.
    pub(crate) status: Option,
}

impl ControlPlaneErrorMessage {
    /// Returns the reason from `status.details.error_info`, or [`Reason::Unknown`]
    /// when the status or the error info is absent.
    pub(crate) fn get_reason(&self) -> Reason {
        self.status
            .as_ref()
            .and_then(|s| s.details.error_info.as_ref())
            .map_or(Reason::Unknown, |e| e.reason)
    }

    /// Returns the user-facing message from the status when present; otherwise a
    /// generic message chosen from `http_status_code`.
    pub(crate) fn get_user_facing_message(&self) -> String {
        use super::errors::REQUEST_FAILED;
        self.status
            .as_ref()
            .and_then(|s| s.details.user_facing_message.as_ref())
            .map_or_else(|| {
                // Ask @neondatabase/control-plane for review before adding more.
                match self.http_status_code {
                    http::StatusCode::NOT_FOUND => {
                        // Status 404: failed to get a project-related resource.
                        format!("{REQUEST_FAILED}: endpoint cannot be found")
                    }
                    http::StatusCode::NOT_ACCEPTABLE => {
                        // Status 406: endpoint is disabled (we don't allow connections).
                        format!("{REQUEST_FAILED}: endpoint is disabled")
                    }
                    http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
                        // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
                        format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.")
                    }
                    _ => REQUEST_FAILED.to_owned(),
                }
            }, |m| m.message.clone().into())
    }
}

impl Display for ControlPlaneErrorMessage {
    /// Writes the user-facing message when present, otherwise the raw `error` text.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let msg: &str = self
            .status
            .as_ref()
            .and_then(|s| s.details.user_facing_message.as_ref())
            .map_or_else(|| self.error.as_ref(), |m| m.message.as_ref());
        write!(f, "{msg}")
    }
}

impl CouldRetry for ControlPlaneErrorMessage {
    /// Retry decision: no status means no retry, explicit retry info means retry,
    /// otherwise the decision is delegated to [`Reason::can_retry`].
    fn could_retry(&self) -> bool {
        // If the error message does not have a status,
        // the error is unknown and probably should not retry automatically
        let Some(status) = &self.status else {
            return false;
        };

        // retry if the retry info is set.
        if status.details.retry_info.is_some() {
            return true;
        }

        // if no retry info set, attempt to use the error code to guess the retry state.
        let reason = status
            .details
            .error_info
            .map_or(Reason::Unknown, |e| e.reason);

        reason.can_retry()
    }
}

#[derive(Debug, Deserialize, Clone)]
#[allow(dead_code)]
pub(crate) struct Status {
    pub(crate) code: Box,
    pub(crate) message: Box,
    pub(crate) details: Details,
}

/// Optional parts of a [`Status`]; each field may be absent.
#[derive(Debug, Deserialize, Clone)]
pub(crate) struct Details {
    pub(crate) error_info: Option,
    pub(crate) retry_info: Option,
    pub(crate) user_facing_message: Option,
}

#[derive(Copy, Clone, Debug, Deserialize)]
pub(crate) struct ErrorInfo {
    pub(crate) reason: Reason,
    // Schema could also have `metadata` field, but it's not structured. Skip it for now.
}

#[derive(Clone, Copy, Debug, Deserialize, Default, PartialEq, Eq)]
pub(crate) enum Reason {
    /// RoleProtected indicates that the role is protected and the attempted operation is not permitted on protected roles.
    #[serde(rename = "ROLE_PROTECTED")]
    RoleProtected,
    /// ResourceNotFound indicates that a resource (project, endpoint, branch, etc.) wasn't found,
    /// usually due to the provided ID not being correct or because the subject doesn't have enough permissions to
    /// access the requested resource.
    /// Prefer a more specific reason if possible, e.g., ProjectNotFound, EndpointNotFound, etc.
    #[serde(rename = "RESOURCE_NOT_FOUND")]
    ResourceNotFound,
    /// ProjectNotFound indicates that the project wasn't found, usually due to the provided ID not being correct,
    /// or that the subject doesn't have enough permissions to access the requested project.
    #[serde(rename = "PROJECT_NOT_FOUND")]
    ProjectNotFound,
    /// EndpointNotFound indicates that the endpoint wasn't found, usually due to the provided ID not being correct,
    /// or that the subject doesn't have enough permissions to access the requested endpoint.
    #[serde(rename = "ENDPOINT_NOT_FOUND")]
    EndpointNotFound,
    /// EndpointDisabled indicates that the endpoint has been disabled and does not accept connections.
    #[serde(rename = "ENDPOINT_DISABLED")]
    EndpointDisabled,
    /// BranchNotFound indicates that the branch wasn't found, usually due to the provided ID not being correct,
    /// or that the subject doesn't have enough permissions to access the requested branch.
    #[serde(rename = "BRANCH_NOT_FOUND")]
    BranchNotFound,
    /// WrongLsnOrTimestamp indicates that the specified LSN or timestamp are wrong.
    #[serde(rename = "WRONG_LSN_OR_TIMESTAMP")]
    WrongLsnOrTimestamp,
    /// RateLimitExceeded indicates that the rate limit for the operation has been exceeded.
    #[serde(rename = "RATE_LIMIT_EXCEEDED")]
    RateLimitExceeded,
    /// NonDefaultBranchComputeTimeExceeded indicates that the compute time quota of non-default branches has been
    /// exceeded.
    #[serde(rename = "NON_PRIMARY_BRANCH_COMPUTE_TIME_EXCEEDED")]
    NonDefaultBranchComputeTimeExceeded,
    /// ActiveTimeQuotaExceeded indicates that the active time quota was exceeded.
    #[serde(rename = "ACTIVE_TIME_QUOTA_EXCEEDED")]
    ActiveTimeQuotaExceeded,
    /// ComputeTimeQuotaExceeded indicates that the compute time quota was exceeded.
    #[serde(rename = "COMPUTE_TIME_QUOTA_EXCEEDED")]
    ComputeTimeQuotaExceeded,
    /// WrittenDataQuotaExceeded indicates that the written data quota was exceeded.
    #[serde(rename = "WRITTEN_DATA_QUOTA_EXCEEDED")]
    WrittenDataQuotaExceeded,
    /// DataTransferQuotaExceeded indicates that the data transfer quota was exceeded.
    #[serde(rename = "DATA_TRANSFER_QUOTA_EXCEEDED")]
    DataTransferQuotaExceeded,
    /// LogicalSizeQuotaExceeded indicates that the logical size quota was exceeded.
    #[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")]
    LogicalSizeQuotaExceeded,
    /// ActiveEndpointsLimitExceeded indicates that the limit of concurrently active endpoints was exceeded.
    #[serde(rename = "ACTIVE_ENDPOINTS_LIMIT_EXCEEDED")]
    ActiveEndpointsLimitExceeded,
    /// RunningOperations indicates that the project already has some running operations
    /// and scheduling of new ones is prohibited.
    #[serde(rename = "RUNNING_OPERATIONS")]
    RunningOperations,
    /// ConcurrencyLimitReached indicates that the concurrency limit for an action was reached.
    #[serde(rename = "CONCURRENCY_LIMIT_REACHED")]
    ConcurrencyLimitReached,
    /// LockAlreadyTaken indicates that we attempted to take a lock that was already taken.
    #[serde(rename = "LOCK_ALREADY_TAKEN")]
    LockAlreadyTaken,
    /// EndpointIdle indicates that the endpoint cannot become active, because it's idle.
    #[serde(rename = "ENDPOINT_IDLE")]
    EndpointIdle,
    /// ProjectUnderMaintenance indicates that the project is currently ongoing maintenance,
    /// and thus cannot accept connections.
    #[serde(rename = "PROJECT_UNDER_MAINTENANCE")]
    ProjectUnderMaintenance,
    /// Catch-all for any reason string not listed above (`#[serde(other)]`);
    /// also the `Default` value.
    #[default]
    #[serde(other)]
    Unknown,
}

impl Reason {
    /// True for the four `*NotFound` reasons.
    pub(crate) fn is_not_found(self) -> bool {
        matches!(
            self,
            Reason::ResourceNotFound
                | Reason::ProjectNotFound
                | Reason::EndpointNotFound
                | Reason::BranchNotFound
        )
    }

    /// Whether a request that failed with this reason is worth retrying.
    /// Only the "control plane is busy" group returns `true`.
    pub(crate) fn can_retry(self) -> bool {
        match self {
            // do not retry role protected errors
            // not a transient error
            Reason::RoleProtected => false,
            // on retry, it will still not be found or valid
            Reason::ResourceNotFound
            | Reason::ProjectNotFound
            | Reason::EndpointNotFound
            | Reason::EndpointDisabled
            | Reason::BranchNotFound
            | Reason::WrongLsnOrTimestamp => false,
            // we were asked to go away
            Reason::RateLimitExceeded
            | Reason::NonDefaultBranchComputeTimeExceeded
            | Reason::ActiveTimeQuotaExceeded
            | Reason::ComputeTimeQuotaExceeded
            | Reason::WrittenDataQuotaExceeded
            | Reason::DataTransferQuotaExceeded
            | Reason::LogicalSizeQuotaExceeded
            | Reason::ActiveEndpointsLimitExceeded => false,
            // transient error. control plane is currently busy
            // but might be ready soon
            Reason::RunningOperations
            | Reason::ConcurrencyLimitReached
            | Reason::LockAlreadyTaken
            | Reason::EndpointIdle
            | Reason::ProjectUnderMaintenance => true,
            // unknown error. better not retry it.
            Reason::Unknown => false,
        }
    }
}

#[derive(Copy, Clone, Debug, Deserialize)]
#[allow(dead_code)]
pub(crate) struct RetryInfo {
    /// Read from the `retry_delay_ms` field and converted to an absolute [`Instant`]
    /// at deserialization time by `milliseconds_from_now`.
    #[serde(rename = "retry_delay_ms", deserialize_with = "milliseconds_from_now")]
    pub(crate) retry_at: Instant,
}

/// Deserializes a `u64` number of milliseconds and returns the [`Instant`] that far
/// after the moment of deserialization.
fn milliseconds_from_now<'de, D: serde::Deserializer<'de>>(d: D) -> Result {
    let millis = u64::deserialize(d)?;
    Ok(Instant::now() + Duration::from_millis(millis))
}

#[derive(Debug, Deserialize, Clone)]
pub(crate) struct UserFacingMessage {
    pub(crate) message: Box,
}

/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
/// Returned by the `/get_endpoint_access_control` API method.
#[derive(Deserialize)]
pub(crate) struct GetEndpointAccessControl {
    pub(crate) role_secret: Box,

    pub(crate) project_id: Option,
    pub(crate) account_id: Option,

    pub(crate) allowed_ips: Option>,
    pub(crate) allowed_vpc_endpoint_ids: Option>,
    pub(crate) block_public_connections: Option,
    pub(crate) block_vpc_connections: Option,

    /// Falls back to `EndpointRateLimitConfig::default()` when the field is missing.
    #[serde(default)]
    pub(crate) rate_limits: EndpointRateLimitConfig,
}

#[derive(Copy, Clone, Deserialize, Default, Debug)]
pub struct EndpointRateLimitConfig {
    pub connection_attempts: ConnectionAttemptsLimit,
}

/// Optional per-protocol limits, one slot each for `tcp`, `ws` and `http`.
#[derive(Copy, Clone, Deserialize, Default, Debug)]
pub struct ConnectionAttemptsLimit {
    pub tcp: Option,
    pub ws: Option,
    pub http: Option,
}

#[derive(Copy, Clone, Deserialize, Debug)]
pub struct LeakyBucketSetting {
    pub rps: f64,
    pub burst: f64,
}

/// Response which holds compute node's `host:port` pair.
/// Returned by the `/proxy_wake_compute` API method.
// NOTE(review): generic arguments (`Box`, `Option`, `Result`) look truncated in this
// copy of the file; confirm against the original before building.
#[derive(Debug, Deserialize)]
pub(crate) struct WakeCompute {
    pub(crate) address: Box,
    pub(crate) server_name: Option,
    pub(crate) aux: MetricsAuxInfo,
}

/// Async response which concludes the console redirect auth flow.
/// Also known as `kickResponse` in the console.
#[derive(Debug, Deserialize)]
pub(crate) struct KickSession<'a> {
    /// Session ID is assigned by the proxy.
    pub(crate) session_id: &'a str,

    /// Compute node connection params.
    #[serde(deserialize_with = "KickSession::parse_db_info")]
    pub(crate) result: DatabaseInfo,
}

impl KickSession<'_> {
    /// Custom deserializer for `result`: expects the payload wrapped in a
    /// `Success` variant and returns the inner [`DatabaseInfo`].
    fn parse_db_info<'de, D>(des: D) -> Result
    where
        D: serde::Deserializer<'de>,
    {
        #[derive(Deserialize)]
        enum Wrapper {
            // Currently, console only reports `Success`.
            // `Failure(String)` used to be here... RIP.
            Success(DatabaseInfo),
        }

        // Any other variant name is rejected by the derived `Deserialize`.
        Wrapper::deserialize(des).map(|x| match x {
            Wrapper::Success(info) => info,
        })
    }
}

/// Compute node connection params.
#[derive(Deserialize)]
pub(crate) struct DatabaseInfo {
    pub(crate) host: Box,
    pub(crate) port: u16,
    pub(crate) dbname: Box,
    pub(crate) user: Box,
    /// Console always provides a password, but it might
    /// be inconvenient for debug with local PG instance.
    pub(crate) password: Option>,
    pub(crate) aux: MetricsAuxInfo,
    #[serde(default)]
    pub(crate) allowed_ips: Option>,
    #[serde(default)]
    pub(crate) allowed_vpc_endpoint_ids: Option>,
    #[serde(default)]
    pub(crate) public_access_allowed: Option,
}

// Manually implement debug to omit sensitive info.
impl fmt::Debug for DatabaseInfo {
    /// Prints only the listed fields; `password` is deliberately left out and
    /// `finish_non_exhaustive` marks the output as partial.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("DatabaseInfo")
            .field("host", &self.host)
            .field("port", &self.port)
            .field("dbname", &self.dbname)
            .field("user", &self.user)
            .field("allowed_ips", &self.allowed_ips)
            .field("allowed_vpc_endpoint_ids", &self.allowed_vpc_endpoint_ids)
            .finish_non_exhaustive()
    }
}

/// Various labels for prometheus metrics.
/// Also known as `ProxyMetricsAuxInfo` in the console.
// NOTE(review): generic arguments (`Vec`, `Option`, `Result`, `Lazy>`, turbofish `::<…>`)
// look truncated in this copy of the file; confirm against the original before building.
#[derive(Debug, Deserialize, Clone)]
pub(crate) struct MetricsAuxInfo {
    pub(crate) endpoint_id: EndpointIdInt,
    pub(crate) project_id: ProjectIdInt,
    pub(crate) branch_id: BranchIdInt,
    // note: we don't use interned strings for compute IDs.
    // they churn too quickly and we have no way to clean up interned strings.
    pub(crate) compute_id: SmolStr,
    /// Falls back to `ColdStartInfo::default()` (`Unknown`) when the field is missing.
    #[serde(default)]
    pub(crate) cold_start_info: ColdStartInfo,
}

#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)]
#[serde(rename_all = "snake_case")]
pub enum ColdStartInfo {
    #[default]
    Unknown,
    /// Compute was already running
    Warm,
    #[serde(rename = "pool_hit")]
    #[label(rename = "pool_hit")]
    /// Compute was not running but there was an available VM
    VmPoolHit,
    #[serde(rename = "pool_miss")]
    #[label(rename = "pool_miss")]
    /// Compute was not running and there were no VMs available
    VmPoolMiss,

    // not provided by control plane
    /// Connection available from HTTP pool
    HttpPoolHit,
    /// Cached connection info
    WarmCached,
}

impl ColdStartInfo {
    /// Static string name of the variant.
    pub(crate) fn as_str(self) -> &'static str {
        match self {
            ColdStartInfo::Unknown => "unknown",
            ColdStartInfo::Warm => "warm",
            ColdStartInfo::VmPoolHit => "pool_hit",
            ColdStartInfo::VmPoolMiss => "pool_miss",
            ColdStartInfo::HttpPoolHit => "http_pool_hit",
            ColdStartInfo::WarmCached => "warm_cached",
        }
    }
}

#[derive(Debug, Deserialize, Clone)]
pub struct EndpointJwksResponse {
    pub jwks: Vec,
}

#[derive(Debug, Deserialize, Clone)]
pub struct JwksSettings {
    pub id: String,
    pub jwks_url: url::Url,
    // Deserialized from `provider_name`; the leading underscore marks it as unused.
    #[serde(rename = "provider_name")]
    pub _provider_name: String,
    pub jwt_audience: Option,
    pub role_names: Vec,
}

#[cfg(test)]
mod tests {
    use serde_json::json;

    use super::*;

    // Minimal `aux` object shared by the parsing tests below.
    fn dummy_aux() -> serde_json::Value {
        json!({
            "endpoint_id": "endpoint",
            "project_id": "project",
            "branch_id": "branch",
            "compute_id": "compute",
            "cold_start_info": "unknown",
        })
    }

    #[test]
    fn parse_kick_session() -> anyhow::Result<()> {
        // This is what the console's kickResponse looks like.
        let json = json!({
            "session_id": "deadbeef",
            "result": {
                "Success": {
                    "host": "localhost",
                    "port": 5432,
                    "dbname": "postgres",
                    "user": "john_doe",
                    "password": "password",
                    "aux": dummy_aux(),
                }
            }
        });
        serde_json::from_str::>(&json.to_string())?;

        Ok(())
    }

    #[test]
    fn parse_db_info() -> anyhow::Result<()> {
        // with password
        serde_json::from_value::(json!({
            "host": "localhost",
            "port": 5432,
            "dbname": "postgres",
            "user": "john_doe",
            "password": "password",
            "aux": dummy_aux(),
        }))?;

        // without password
        serde_json::from_value::(json!({
            "host": "localhost",
            "port": 5432,
            "dbname": "postgres",
            "user": "john_doe",
            "aux": dummy_aux(),
        }))?;

        // new field (forward compatibility)
        serde_json::from_value::(json!({
            "host": "localhost",
            "port": 5432,
            "dbname": "postgres",
            "user": "john_doe",
            "project": "hello_world",
            "N.E.W": "forward compatibility check",
            "aux": dummy_aux(),
        }))?;

        // with allowed_ips
        let dbinfo = serde_json::from_value::(json!({
            "host": "localhost",
            "port": 5432,
            "dbname": "postgres",
            "user": "john_doe",
            "password": "password",
            "aux": dummy_aux(),
            "allowed_ips": ["127.0.0.1"],
        }))?;

        assert_eq!(
            dbinfo.allowed_ips,
            Some(vec![IpPattern::Single("127.0.0.1".parse()?)])
        );

        Ok(())
    }

    #[test]
    fn parse_wake_compute() -> anyhow::Result<()> {
        let json = json!({
            "address": "0.0.0.0",
            "aux": dummy_aux(),
        });
        serde_json::from_str::(&json.to_string())?;
        Ok(())
    }

    #[test]
    fn parse_get_role_secret() -> anyhow::Result<()> {
        // Empty `allowed_ips` and `allowed_vpc_endpoint_ids` field.
        let json = json!({
            "role_secret": "secret",
        });
        serde_json::from_str::(&json.to_string())?;
        let json = json!({
            "role_secret": "secret",
            "allowed_ips": ["8.8.8.8"],
        });
        serde_json::from_str::(&json.to_string())?;
        let json = json!({
            "role_secret": "secret",
            "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"],
        });
        serde_json::from_str::(&json.to_string())?;
        let json = json!({
            "role_secret": "secret",
            "allowed_ips": ["8.8.8.8"],
            "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"],
        });
        serde_json::from_str::(&json.to_string())?;
        let json = json!({
            "role_secret": "secret",
            "allowed_ips": ["8.8.8.8"],
            "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"],
            "project_id": "project",
        });
        serde_json::from_str::(&json.to_string())?;

        Ok(())
    }
}

================================================
FILE: proxy/src/control_plane/mgmt.rs
================================================
use std::convert::Infallible;

use anyhow::Context;
use once_cell::sync::Lazy;
use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
use tokio::net::{TcpListener, TcpStream};
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, error, info, info_span};

use crate::control_plane::messages::{DatabaseInfo, KickSession};
use crate::waiters::{self, Waiter, Waiters};

// Process-wide registry of waiters, keyed by psql session id (see `get_waiter`/`notify`).
static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default);

/// Give caller an opportunity to wait for the cloud's reply.
pub(crate) fn get_waiter(
    psql_session_id: impl Into,
) -> Result, waiters::RegisterError> {
    CPLANE_WAITERS.register(psql_session_id.into())
}

/// Hands `msg` to the waiter registered under `psql_session_id`.
pub(crate) fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::NotifyError> {
    CPLANE_WAITERS.notify(psql_session_id, msg)
}

/// Management API listener task.
/// It spawns management response handlers needed for the console redirect auth flow.
pub async fn task_main(listener: TcpListener) -> anyhow::Result {
    // Logged whenever this function exits, by error or by being dropped.
    scopeguard::defer! {
        info!("mgmt has shut down");
    }

    loop {
        let (socket, peer_addr) = listener.accept().await?;
        info!("accepted connection from {peer_addr}");

        socket
            .set_nodelay(true)
            .context("failed to set client socket option")?;

        let span = info_span!("mgmt", peer = %peer_addr);

        tokio::task::spawn(
            async move {
                info!("serving a new management API connection");

                // these might be long running connections, have a separate logging for cancelling
                // on shutdown and other ways of stopping.
                let cancelled = scopeguard::guard(tracing::Span::current(), |span| {
                    let _e = span.entered();
                    info!("management API task cancelled");
                });

                if let Err(e) = handle_connection(socket).await {
                    error!("serving failed with an error: {e}");
                } else {
                    info!("serving completed");
                }

                // we can no longer get dropped
                scopeguard::ScopeGuard::into_inner(cancelled);
            }
            .instrument(span),
        );
    }
}

/// Runs a Postgres-protocol backend with `AuthType::Trust` on `socket`, handled by
/// [`MgmtHandler`], using a fresh cancellation token.
async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> {
    let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?;
    pgbackend
        .run(&mut MgmtHandler, &CancellationToken::new())
        .await
}

/// A message received by `mgmt` when a compute node is ready.
pub(crate) type ComputeReady = DatabaseInfo;

// TODO: replace with an http-based protocol.
struct MgmtHandler;

impl postgres_backend::Handler for MgmtHandler {
    /// Delegates to `try_process_query`, logging any error before returning it.
    async fn process_query(
        &mut self,
        pgb: &mut PostgresBackendTCP,
        query: &str,
    ) -> Result<(), QueryError> {
        try_process_query(pgb, query).map_err(|e| {
            error!("failed to process response: {e:?}");
            e
        })
    }
}

/// Parses `query` as a JSON [`KickSession`], forwards its result to the matching
/// waiter and queues either an `ok` row or an error response on `pgb`.
fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> {
    let resp: KickSession<'_> =
        serde_json::from_str(query).context("Failed to parse query as json")?;

    let span = info_span!("event", session_id = resp.session_id);
    let _enter = span.enter();
    info!("got response: {:?}", resp.result);

    match notify(resp.session_id, resp.result) {
        Ok(()) => {
            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
                .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        }
        Err(e) => {
            // A failed delivery is reported to the peer but is not an error of this call.
            error!("failed to deliver response to per-client task");
            pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string(), None))?;
        }
    }

    Ok(())
}

================================================
FILE: proxy/src/control_plane/mod.rs
================================================
//! Various stuff for dealing with the Neon Console.
//! Later we might move some API wrappers here.

/// Payloads used in the console's APIs.
pub mod messages;

/// Wrappers for console APIs and their mocks.
pub mod client;

pub(crate) mod errors;

use std::sync::Arc;

use messages::EndpointRateLimitConfig;

use crate::auth::backend::ComputeUserInfo;
use crate::auth::backend::jwt::AuthRule;
use crate::auth::{AuthError, IpPattern, check_peer_addr_is_in_list};
use crate::cache::node_info::CachedNodeInfo;
use crate::context::RequestContext;
use crate::control_plane::messages::MetricsAuxInfo;
use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt};
use crate::protocol2::ConnectionInfoExtra;
use crate::rate_limiter::{EndpointRateLimiter, LeakyBucketConfig};
use crate::types::{EndpointId, RoleName};
use crate::{compute, scram};

/// Various cache-related types.
pub mod caches {
    pub use super::client::ApiCaches;
}

/// Various lock-related types.
pub mod locks {
    pub use super::client::ApiLocks;
}

/// Console's management API.
pub mod mgmt;

/// Auth secret which is managed by the cloud.
#[derive(Clone, Eq, PartialEq, Debug)]
pub(crate) enum AuthSecret {
    /// [SCRAM](crate::scram) authentication info.
    Scram(scram::ServerSecret),
}

#[derive(Default)]
pub(crate) struct AuthInfo {
    pub(crate) secret: Option,
    /// List of IP addresses allowed for the authorization.
    pub(crate) allowed_ips: Vec,
    /// List of VPC endpoints allowed for the authorization.
    pub(crate) allowed_vpc_endpoint_ids: Vec,
    /// Project ID. This is used for cache invalidation.
    pub(crate) project_id: Option,
    /// Account ID. This is used for cache invalidation.
    pub(crate) account_id: Option,
    /// Are public connections or VPC connections blocked?
    pub(crate) access_blocker_flags: AccessBlockerFlags,
    /// The rate limits for this endpoint.
    pub(crate) rate_limits: EndpointRateLimitConfig,
}

/// Info for establishing a connection to a compute node.
#[derive(Clone)]
pub(crate) struct NodeInfo {
    pub(crate) conn_info: compute::ConnectInfo,

    /// Labels for proxy's metrics.
    pub(crate) aux: MetricsAuxInfo,
}

#[derive(Copy, Clone, Default, Debug)]
pub(crate) struct AccessBlockerFlags {
    pub public_access_blocked: bool,
    pub vpc_access_blocked: bool,
}

#[derive(Clone, Debug)]
pub struct RoleAccessControl {
    pub secret: Option,
}

#[derive(Clone, Debug)]
pub struct EndpointAccessControl {
    pub allowed_ips: Arc>,
    pub allowed_vpce: Arc>,
    pub flags: AccessBlockerFlags,

    pub rate_limits: EndpointRateLimitConfig,
}

impl EndpointAccessControl {
    /// Validates the connection described by `ctx` against this endpoint's rules.
    ///
    /// * `check_ip_allowed` — when set, the peer address must be in `allowed_ips`.
    /// * `check_vpc_allowed` — when set, the connection is treated as a VPC one:
    ///   VPC access must not be blocked and the incoming endpoint ID must be allowed.
    ///   When unset, only the public-access block flag is checked.
    ///
    /// Returns the first violated rule as an [`AuthError`].
    pub fn check(
        &self,
        ctx: &RequestContext,
        check_ip_allowed: bool,
        check_vpc_allowed: bool,
    ) -> Result<(), AuthError> {
        if check_ip_allowed && !check_peer_addr_is_in_list(&ctx.peer_addr(), &self.allowed_ips) {
            return Err(AuthError::IpAddressNotAllowed(ctx.peer_addr()));
        }

        // check if a VPC endpoint ID is coming in and if yes, if it's allowed
        if check_vpc_allowed {
            if self.flags.vpc_access_blocked {
                return Err(AuthError::NetworkNotAllowed);
            }

            // AWS and Azure carry the ID under different names; both are compared as strings.
            let incoming_vpc_endpoint_id = match ctx.extra() {
                None => return Err(AuthError::MissingVPCEndpointId),
                Some(ConnectionInfoExtra::Aws { vpce_id }) => vpce_id.to_string(),
                Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(),
            };

            let vpce = &self.allowed_vpce;
            // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that.
            if !vpce.is_empty() && !vpce.contains(&incoming_vpc_endpoint_id) {
                return Err(AuthError::vpc_endpoint_id_not_allowed(
                    incoming_vpc_endpoint_id,
                ));
            }
        } else if self.flags.public_access_blocked {
            return Err(AuthError::NetworkNotAllowed);
        }

        Ok(())
    }

    /// Applies the per-protocol connection-attempt limit for `endpoint`.
    ///
    /// The SNI router protocol is never limited. A configured limit with a
    /// non-positive `rps` or `burst` is treated as if no limit were configured.
    pub fn connection_attempt_rate_limit(
        &self,
        ctx: &RequestContext,
        endpoint: &EndpointId,
        rate_limiter: &EndpointRateLimiter,
    ) -> Result<(), AuthError> {
        let endpoint = EndpointIdInt::from(endpoint);

        let limits = &self.rate_limits.connection_attempts;
        let config = match ctx.protocol() {
            crate::metrics::Protocol::Http => limits.http,
            crate::metrics::Protocol::Ws => limits.ws,
            crate::metrics::Protocol::Tcp => limits.tcp,
            crate::metrics::Protocol::SniRouter => return Ok(()),
        };
        let config = config.and_then(|config| {
            if config.rps <= 0.0 || config.burst <= 0.0 {
                return None;
            }

            Some(LeakyBucketConfig::new(config.rps, config.burst))
        });

        // `None` is passed through to the limiter unchanged.
        // NOTE(review): presumably the limiter then applies its own default — confirm.
        if !rate_limiter.check(endpoint, config, 1) {
            return Err(AuthError::too_many_connections());
        }

        Ok(())
    }
}

/// This will allocate per each call, but the http requests alone
/// already require a few allocations, so it should be fine.
pub(crate) trait ControlPlaneApi {
    async fn get_role_access_control(
        &self,
        ctx: &RequestContext,
        endpoint: &EndpointId,
        role: &RoleName,
    ) -> Result;

    async fn get_endpoint_access_control(
        &self,
        ctx: &RequestContext,
        endpoint: &EndpointId,
        role: &RoleName,
    ) -> Result;

    async fn get_endpoint_jwks(
        &self,
        ctx: &RequestContext,
        endpoint: &EndpointId,
    ) -> Result, errors::GetEndpointJwksError>;

    /// Wake up the compute node and return the corresponding connection info.
    async fn wake_compute(
        &self,
        ctx: &RequestContext,
        user_info: &ComputeUserInfo,
    ) -> Result;
}

================================================
FILE: proxy/src/error.rs
================================================
use std::fmt;

use anyhow::Context;
use measured::FixedCardinalityLabel;
use tokio::task::JoinError;

/// Marks errors that may be safely shown to a client.
/// This trait can be seen as a specialized version of [`ToString`].
///
/// NOTE: This trait should not be implemented for [`anyhow::Error`], since it
/// is way too convenient and tends to proliferate all across the codebase,
/// ultimately leading to accidental leaks of sensitive data.
pub(crate) trait UserFacingError: ReportableError {
    /// Format the error for client, stripping all sensitive info.
    ///
    /// Although this might be a no-op for many types, it's highly
    /// recommended to override the default impl in case error type
    /// contains anything sensitive: various IDs, IP addresses etc.
    #[inline(always)]
    fn to_string_client(&self) -> String {
        self.to_string()
    }
}

#[derive(Copy, Clone, Debug, Eq, PartialEq, FixedCardinalityLabel)]
#[label(singleton = "type")]
pub enum ErrorKind {
    /// Wrong password, unknown endpoint, protocol violation, etc...
    User,

    /// Network error between user and proxy. Not necessarily user error
    #[label(rename = "clientdisconnect")]
    ClientDisconnect,

    /// Proxy self-imposed user rate limits
    #[label(rename = "ratelimit")]
    RateLimit,

    /// Proxy self-imposed service-wise rate limits
    #[label(rename = "serviceratelimit")]
    ServiceRateLimit,

    /// Proxy quota limit violation
    #[label(rename = "quota")]
    Quota,

    /// internal errors
    Service,

    /// Error communicating with control plane
    #[label(rename = "controlplane")]
    ControlPlane,

    /// Postgres error
    Postgres,

    /// Error communicating with compute
    Compute,
}

impl ErrorKind {
    /// Static label string for the variant; the strings match the
    /// `#[label(rename = ...)]` values above where those are given.
    pub(crate) fn to_metric_label(self) -> &'static str {
        match self {
            ErrorKind::User => "user",
            ErrorKind::ClientDisconnect => "clientdisconnect",
            ErrorKind::RateLimit => "ratelimit",
            ErrorKind::ServiceRateLimit => "serviceratelimit",
            ErrorKind::Quota => "quota",
            ErrorKind::Service => "service",
            ErrorKind::ControlPlane => "controlplane",
            ErrorKind::Postgres => "postgres",
            ErrorKind::Compute => "compute",
        }
    }
}

/// An error that can be classified into an [`ErrorKind`].
pub(crate) trait ReportableError: fmt::Display + Send + 'static {
    fn get_error_kind(&self) -> ErrorKind;
}

/// Flattens a nested result (a task-join result wrapping an inner result)
/// into a single `anyhow::Result`, adding "join error" context to the outer error.
// NOTE(review): generic arguments in the signature below look truncated in this copy
// of the file; confirm against the original before building.
pub fn flatten_err(r: Result, JoinError>) -> anyhow::Result {
    r.context("join error").and_then(|x| x)
}

================================================
FILE: proxy/src/ext.rs
================================================
use std::panic::resume_unwind;
use std::sync::{Mutex, MutexGuard};

use tokio::task::JoinError;

/// Extension trait adding a poison-propagating lock method.
pub(crate) trait LockExt {
    fn lock_propagate_poison(&self) -> MutexGuard<'_, T>;
}

impl LockExt for Mutex {
    /// Lock the mutex and panic if the mutex was poisoned.
    #[track_caller]
    fn lock_propagate_poison(&self) -> MutexGuard<'_, T> {
        match self.lock() {
            Ok(guard) => guard,
            // poison occurs when another thread panicked while holding the lock guard.
            // since panicking is often unrecoverable, propagating the poison panic is reasonable.
            Err(poison) => panic!("{poison}"),
        }
    }
}

/// Extension trait for unwrapping a task result while propagating panics.
pub(crate) trait TaskExt {
    fn propagate_task_panic(self) -> T;
}

impl TaskExt for Result {
    /// Unwrap the result and panic if the inner task panicked.
    /// Also panics if the task was cancelled
    #[track_caller]
    fn propagate_task_panic(self) -> T {
        match self {
            Ok(t) => t,
            // Using resume_unwind prevents the panic hook being called twice.
            // Since we use this for structured concurrency, there is only
            // 1 logical panic, so this is more correct.
            Err(e) if e.is_panic() => resume_unwind(e.into_panic()),
            Err(e) => panic!("unexpected task error: {e}"),
        }
    }
}

================================================
FILE: proxy/src/http/health_server.rs
================================================
use std::convert::Infallible;
use std::net::TcpListener;
use std::sync::{Arc, Mutex};

use anyhow::{anyhow, bail};
use http_utils::endpoint::{self, profile_cpu_handler, profile_heap_handler, request_span};
use http_utils::error::ApiError;
use http_utils::json::json_response;
use http_utils::{RouterBuilder, RouterService};
use hyper0::header::CONTENT_TYPE;
use hyper0::{Body, Request, Response, StatusCode};
use measured::MetricGroup;
use measured::text::BufferedTextEncoder;
use metrics::NeonMetrics;
use tracing::{info, info_span};

use crate::ext::{LockExt, TaskExt};
use crate::jemalloc;

/// Handler for `/v1/status`: replies `200 OK` with an empty JSON string.
async fn status_handler(_: Request) -> Result, ApiError> {
    json_response(StatusCode::OK, "")
}

/// Builds the router serving `/metrics`, `/v1/status`, `/profile/cpu` and `/profile/heap`.
fn make_router(metrics: AppMetrics) -> RouterBuilder {
    // Encoder and metrics sit behind one mutex; each `/metrics` request clones the `Arc`.
    let state = Arc::new(Mutex::new(PrometheusHandler {
        encoder: BufferedTextEncoder::new(),
        metrics,
    }));

    endpoint::make_router()
        .get("/metrics", move |r| {
            let state = state.clone();
            request_span(r, move |b| prometheus_metrics_handler(b, state))
        })
        .get("/v1/status", status_handler)
        .get("/profile/cpu", move |r| {
            request_span(r, profile_cpu_handler)
        })
        .get("/profile/heap", move |r| {
            request_span(r, profile_heap_handler)
        })
}

/// Serves the router on `http_listener`. This function never returns `Ok`:
/// if the server future completes without error, it bails with an error.
pub async fn task_main(
    http_listener: TcpListener,
    metrics: AppMetrics,
) -> anyhow::Result {
    scopeguard::defer! {
        info!("http has shut down");
    }

    let service = || RouterService::new(make_router(metrics).build()?);

    hyper0::Server::from_tcp(http_listener)?
        .serve(service().map_err(|e| anyhow!(e))?)
        .await?;

    bail!("hyper server without shutdown handling cannot shutdown successfully");
}

// Shared state for the `/metrics` handler.
struct PrometheusHandler {
    encoder: BufferedTextEncoder,
    metrics: AppMetrics,
}

#[derive(MetricGroup)]
pub struct AppMetrics {
    #[metric(namespace = "jemalloc")]
    pub jemalloc: Option,
    #[metric(flatten)]
    pub neon_metrics: NeonMetrics,
    #[metric(flatten)]
    pub proxy: &'static crate::metrics::Metrics,
}

/// Handler for `/metrics`: collects and encodes all metrics on a blocking thread
/// and returns them as a `text/plain; version=0.0.4` body.
async fn prometheus_metrics_handler(
    _req: Request,
    state: Arc>,
) -> Result, ApiError> {
    let started_at = std::time::Instant::now();

    let span = info_span!("blocking");
    let body = tokio::task::spawn_blocking(move || {
        let _span = span.entered();

        let mut state = state.lock_propagate_poison();
        let PrometheusHandler { encoder, metrics } = &mut *state;

        // The error type is uninhabited, so the empty match can never run.
        metrics
            .collect_group_into(&mut *encoder)
            .unwrap_or_else(|infallible| match infallible {});

        let body = encoder.finish();

        tracing::info!(
            bytes = body.len(),
            elapsed_ms = started_at.elapsed().as_millis(),
            "responded /metrics"
        );

        body
    })
    .await
    .propagate_task_panic();

    let response = Response::builder()
        .status(200)
        .header(CONTENT_TYPE, "text/plain; version=0.0.4")
        .body(Body::from(body))
        .expect("response headers should be valid");

    Ok(response)
}

================================================
FILE: proxy/src/http/mod.rs
================================================
//! HTTP client and server impls.
//! Other modules should use stuff from this module instead of
//! directly relying on deps like `reqwest` (think loose coupling).
pub mod health_server;

use std::time::{Duration, Instant};

use bytes::Bytes;
use futures::FutureExt;
use http::Method;
use http_body_util::BodyExt;
use hyper::body::Body;
pub(crate) use reqwest::{Request, Response};
use reqwest_middleware::RequestBuilder;
pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error};
pub(crate) use reqwest_retry::RetryTransientMiddleware;
pub(crate) use reqwest_retry::policies::ExponentialBackoff;
use thiserror::Error;

use crate::metrics::{ConsoleRequest, Metrics};
use crate::url::ApiUrl;

/// Preferred constructor for new http clients: the returned client is wrapped
/// with the tracing middleware, so observability (OpenTelemetry) is handled.
/// Intentionally a function rather than a public static.
pub fn new_client() -> ClientWithMiddleware {
    let inner = reqwest::ClientBuilder::new()
        .build()
        .expect("Failed to create http client");

    let builder = reqwest_middleware::ClientBuilder::new(inner)
        .with(reqwest_tracing::TracingMiddleware::default());
    builder.build()
}

/// Like [`new_client`], but every request is bounded by `request_timeout` and
/// transient failures are retried with exponential backoff for at most
/// `total_retry_duration`.
pub(crate) fn new_client_with_timeout(
    request_timeout: Duration,
    total_retry_duration: Duration,
) -> ClientWithMiddleware {
    let inner = reqwest::ClientBuilder::new()
        .timeout(request_timeout)
        .build()
        .expect("Failed to create http client with timeout");

    let backoff =
        ExponentialBackoff::builder().build_with_total_retry_duration(total_retry_duration);

    // As per docs, "This middleware always errors when given requests with streaming bodies".
    // That's all right because we only use this client to send `serde_json::RawValue`, which
    // is not a stream.
    //
    // ex-maintainer note:
    // this limitation can be fixed if streaming is necessary.
    // retries will still not be performed, but it won't error immediately
    let retry = RetryTransientMiddleware::new_with_policy(backoff);

    reqwest_middleware::ClientBuilder::new(inner)
        .with(reqwest_tracing::TracingMiddleware::default())
        .with(retry)
        .build()
}

/// Thin convenience wrapper for an API provided by an http endpoint.
#[derive(Debug, Clone)]
pub struct Endpoint {
    /// API's base URL.
    endpoint: ApiUrl,
    /// Connection manager with built-in pooling.
    client: ClientWithMiddleware,
}

impl Endpoint {
    /// Construct a new HTTP endpoint wrapper.
    /// Http client is not constructed under the hood so that it can be shared.
    pub fn new(endpoint: ApiUrl, client: impl Into<ClientWithMiddleware>) -> Self {
        Self {
            endpoint,
            client: client.into(),
        }
    }

    /// Base URL that every request built by this wrapper starts from.
    #[inline(always)]
    pub(crate) fn url(&self) -> &ApiUrl {
        &self.endpoint
    }

    /// Return a [builder](RequestBuilder) for a `GET` request,
    /// appending a single `path` segment to the base endpoint URL.
    pub(crate) fn get_path(&self, path: &str) -> RequestBuilder {
        self.get_with_url(|u| {
            u.path_segments_mut().push(path);
        })
    }

    /// Return a [builder](RequestBuilder) for a `GET` request,
    /// accepting a closure to modify the url path segments for more complex paths queries.
    pub(crate) fn get_with_url(&self, f: impl for<'a> FnOnce(&'a mut ApiUrl)) -> RequestBuilder {
        self.request_with_url(Method::GET, f)
    }

    /// Return a [builder](RequestBuilder) for a request,
    /// accepting a closure to modify the url path segments for more complex paths queries.
    pub(crate) fn request_with_url(
        &self,
        method: Method,
        f: impl for<'a> FnOnce(&'a mut ApiUrl),
    ) -> RequestBuilder {
        // Work on a copy so the stored base URL is never modified by `f`.
        let mut url = self.endpoint.clone();
        f(&mut url);
        self.client.request(method, url.into_inner())
    }

    /// Execute a [request](reqwest::Request).
pub(crate) fn execute( &self, request: Request, ) -> impl Future> { let metric = Metrics::get() .proxy .console_request_latency .with_labels(ConsoleRequest { request: request.url().path(), }); let req = self.client.execute(request).boxed(); async move { let start = Instant::now(); scopeguard::defer!({ Metrics::get() .proxy .console_request_latency .get_metric(metric) .observe_duration_since(start); }); req.await } } } #[derive(Error, Debug)] pub(crate) enum ReadBodyError { #[error("Content length exceeds limit of {limit} bytes")] BodyTooLarge { limit: usize }, #[error(transparent)] Read(#[from] E), } pub(crate) async fn read_body_with_limit( mut b: impl Body + Unpin, limit: usize, ) -> Result, ReadBodyError> { // We could use `b.limited().collect().await.to_bytes()` here // but this ends up being slightly more efficient as far as I can tell. // check the lower bound of the size hint. // in reqwest, this value is influenced by the Content-Length header. let lower_bound = match usize::try_from(b.size_hint().lower()) { Ok(bound) if bound <= limit => bound, _ => return Err(ReadBodyError::BodyTooLarge { limit }), }; let mut bytes = Vec::with_capacity(lower_bound); while let Some(frame) = b.frame().await.transpose()? { if let Ok(data) = frame.into_data() { if bytes.len() + data.len() > limit { return Err(ReadBodyError::BodyTooLarge { limit }); } bytes.extend_from_slice(&data); } } Ok(bytes) } #[cfg(test)] mod tests { use reqwest::Client; use super::*; #[test] fn optional_query_params() -> anyhow::Result<()> { let url = "http://example.com".parse()?; let endpoint = Endpoint::new(url, Client::new()); // Validate that this pattern makes sense. 
let req = endpoint .get_path("frobnicate") .query(&[ ("foo", Some("10")), // should be just `foo=10` ("bar", None), // shouldn't be passed at all ]) .build()?; assert_eq!(req.url().as_str(), "http://example.com/frobnicate?foo=10"); Ok(()) } #[test] fn uuid_params() -> anyhow::Result<()> { let url = "http://example.com".parse()?; let endpoint = Endpoint::new(url, Client::new()); let req = endpoint .get_path("frobnicate") .query(&[("session_id", uuid::Uuid::nil())]) .build()?; assert_eq!( req.url().as_str(), "http://example.com/frobnicate?session_id=00000000-0000-0000-0000-000000000000" ); Ok(()) } } ================================================ FILE: proxy/src/intern.rs ================================================ use std::hash::BuildHasherDefault; use std::marker::PhantomData; use std::num::NonZeroUsize; use std::ops::Index; use std::sync::OnceLock; use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; use rustc_hash::FxHasher; use crate::types::{AccountId, BranchId, EndpointId, ProjectId, RoleName}; pub trait InternId: Sized + 'static { fn get_interner() -> &'static StringInterner; } pub struct StringInterner { inner: ThreadedRodeo>, _id: PhantomData, } #[derive(PartialEq, Debug, Clone, Copy, Eq, Hash)] pub struct InternedString { inner: Spur, _id: PhantomData, } impl std::fmt::Display for InternedString { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.as_str().fmt(f) } } impl InternedString { pub(crate) fn as_str(&self) -> &'static str { Id::get_interner().inner.resolve(&self.inner) } pub(crate) fn get(s: &str) -> Option { Id::get_interner().get(s) } } impl AsRef for InternedString { fn as_ref(&self) -> &str { self.as_str() } } impl std::ops::Deref for InternedString { type Target = str; fn deref(&self) -> &str { self.as_str() } } impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString { fn deserialize>(d: D) -> Result { struct Visitor(PhantomData); impl serde::de::Visitor<'_> for Visitor { type Value = 
InternedString; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { formatter.write_str("a string") } fn visit_str(self, v: &str) -> Result where E: serde::de::Error, { Ok(Id::get_interner().get_or_intern(v)) } } d.deserialize_str(Visitor::(PhantomData)) } } impl serde::Serialize for InternedString { fn serialize(&self, s: S) -> Result { self.as_str().serialize(s) } } impl StringInterner { pub(crate) fn new() -> Self { StringInterner { inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher( Capacity::new(2500, NonZeroUsize::new(1 << 16).expect("value is nonzero")), // unbounded MemoryLimits::for_memory_usage(usize::MAX), BuildHasherDefault::::default(), ), _id: PhantomData, } } #[cfg(test)] fn len(&self) -> usize { self.inner.len() } #[cfg(test)] fn current_memory_usage(&self) -> usize { self.inner.current_memory_usage() } pub(crate) fn get_or_intern(&self, s: &str) -> InternedString { InternedString { inner: self.inner.get_or_intern(s), _id: PhantomData, } } pub(crate) fn get(&self, s: &str) -> Option> { Some(InternedString { inner: self.inner.get(s)?, _id: PhantomData, }) } } impl Index> for StringInterner { type Output = str; fn index(&self, index: InternedString) -> &Self::Output { self.inner.resolve(&index.inner) } } impl Default for StringInterner { fn default() -> Self { Self::new() } } #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct RoleNameTag; impl InternId for RoleNameTag { fn get_interner() -> &'static StringInterner { static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } pub type RoleNameInt = InternedString; impl From<&RoleName> for RoleNameInt { fn from(value: &RoleName) -> Self { RoleNameTag::get_interner().get_or_intern(value) } } #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct EndpointIdTag; impl InternId for EndpointIdTag { fn get_interner() -> &'static StringInterner { static ROLE_NAMES: OnceLock> = OnceLock::new(); 
ROLE_NAMES.get_or_init(Default::default) } } pub type EndpointIdInt = InternedString; impl From<&EndpointId> for EndpointIdInt { fn from(value: &EndpointId) -> Self { EndpointIdTag::get_interner().get_or_intern(value) } } impl From for EndpointIdInt { fn from(value: EndpointId) -> Self { EndpointIdTag::get_interner().get_or_intern(&value) } } #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct BranchIdTag; impl InternId for BranchIdTag { fn get_interner() -> &'static StringInterner { static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } pub type BranchIdInt = InternedString; impl From<&BranchId> for BranchIdInt { fn from(value: &BranchId) -> Self { BranchIdTag::get_interner().get_or_intern(value) } } impl From for BranchIdInt { fn from(value: BranchId) -> Self { BranchIdTag::get_interner().get_or_intern(&value) } } #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct ProjectIdTag; impl InternId for ProjectIdTag { fn get_interner() -> &'static StringInterner { static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } pub type ProjectIdInt = InternedString; impl From<&ProjectId> for ProjectIdInt { fn from(value: &ProjectId) -> Self { ProjectIdTag::get_interner().get_or_intern(value) } } impl From for ProjectIdInt { fn from(value: ProjectId) -> Self { ProjectIdTag::get_interner().get_or_intern(&value) } } #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct AccountIdTag; impl InternId for AccountIdTag { fn get_interner() -> &'static StringInterner { static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } pub type AccountIdInt = InternedString; impl From<&AccountId> for AccountIdInt { fn from(value: &AccountId) -> Self { AccountIdTag::get_interner().get_or_intern(value) } } impl From for AccountIdInt { fn from(value: AccountId) -> Self { AccountIdTag::get_interner().get_or_intern(&value) } } #[cfg(test)] mod tests { use 
std::sync::OnceLock; use super::InternId; use crate::intern::StringInterner; struct MyId; impl InternId for MyId { fn get_interner() -> &'static StringInterner { pub(crate) static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } #[test] fn push_many_strings() { use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use rand_distr::Zipf; let endpoint_dist = Zipf::new(500000.0, 0.8).unwrap(); let endpoints = StdRng::seed_from_u64(272488357).sample_iter(endpoint_dist); let interner = MyId::get_interner(); const N: usize = 100_000; let mut verify = Vec::with_capacity(N); for endpoint in endpoints.take(N) { let endpoint = format!("ep-string-interning-{endpoint}"); let key = interner.get_or_intern(&endpoint); verify.push((endpoint, key)); } for (s, key) in verify { assert_eq!(interner[key], s); } // 2031616/59861 = 34 bytes per string assert_eq!(interner.len(), 59_861); // will have other overhead for the internal hashmaps that are not accounted for. assert_eq!(interner.current_memory_usage(), 2_031_616); } } ================================================ FILE: proxy/src/jemalloc.rs ================================================ use std::marker::PhantomData; use measured::label::NoLabels; use measured::metric::gauge::GaugeState; use measured::metric::group::Encoding; use measured::metric::name::MetricNameEncoder; use measured::metric::{MetricEncoding, MetricFamilyEncoding, MetricType}; use measured::text::TextEncoder; use measured::{LabelGroup, MetricGroup}; use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; pub struct MetricRecorder { epoch: epoch_mib, inner: Metrics, } #[derive(MetricGroup)] struct Metrics { active_bytes: JemallocGaugeFamily, allocated_bytes: JemallocGaugeFamily, mapped_bytes: JemallocGaugeFamily, metadata_bytes: JemallocGaugeFamily, resident_bytes: JemallocGaugeFamily, retained_bytes: JemallocGaugeFamily, } impl MetricGroup for MetricRecorder where Metrics: MetricGroup, { fn 
collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> { if self.epoch.advance().is_ok() { self.inner.collect_group_into(enc)?; } Ok(()) } } impl MetricRecorder { pub fn new() -> Result { tracing::debug!( config = config::malloc_conf::read()?, version = version::read()?, "starting jemalloc recorder" ); Ok(Self { epoch: epoch::mib()?, inner: Metrics { active_bytes: JemallocGaugeFamily(stats::active::mib()?), allocated_bytes: JemallocGaugeFamily(stats::allocated::mib()?), mapped_bytes: JemallocGaugeFamily(stats::mapped::mib()?), metadata_bytes: JemallocGaugeFamily(stats::metadata::mib()?), resident_bytes: JemallocGaugeFamily(stats::resident::mib()?), retained_bytes: JemallocGaugeFamily(stats::retained::mib()?), }, }) } } struct JemallocGauge(PhantomData); impl Default for JemallocGauge { fn default() -> Self { JemallocGauge(PhantomData) } } impl MetricType for JemallocGauge { type Metadata = T; } struct JemallocGaugeFamily(T); impl MetricFamilyEncoding for JemallocGaugeFamily where JemallocGauge: MetricEncoding, { fn collect_family_into(&self, name: impl MetricNameEncoder, enc: &mut T) -> Result<(), T::Err> { JemallocGauge::write_type(&name, enc)?; JemallocGauge(PhantomData).collect_into(&self.0, NoLabels, name, enc) } } macro_rules! 
jemalloc_gauge { ($stat:ident, $mib:ident) => { impl MetricEncoding> for JemallocGauge { fn write_type( name: impl MetricNameEncoder, enc: &mut TextEncoder, ) -> Result<(), std::io::Error> { GaugeState::write_type(name, enc) } fn collect_into( &self, mib: &stats::$mib, labels: impl LabelGroup, name: impl MetricNameEncoder, enc: &mut TextEncoder, ) -> Result<(), std::io::Error> { if let Ok(v) = mib.read() { GaugeState::new(v as i64).collect_into(&(), labels, name, enc)?; } Ok(()) } } }; } jemalloc_gauge!(active, active_mib); jemalloc_gauge!(allocated, allocated_mib); jemalloc_gauge!(mapped, mapped_mib); jemalloc_gauge!(metadata, metadata_mib); jemalloc_gauge!(resident, resident_mib); jemalloc_gauge!(retained, retained_mib); ================================================ FILE: proxy/src/lib.rs ================================================ // rustc lints/lint groups // https://doc.rust-lang.org/rustc/lints/groups.html #![deny(deprecated, future_incompatible, let_underscore, nonstandard_style)] #![warn(clippy::all, clippy::pedantic, clippy::cargo)] // List of denied lints from the clippy::restriction group. // https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction #![warn( clippy::undocumented_unsafe_blocks, // TODO: Enable once all individual checks are enabled. //clippy::as_conversions, clippy::dbg_macro, clippy::empty_enum_variants_with_brackets, clippy::exit, clippy::float_cmp_const, clippy::lossy_float_literal, clippy::macro_use_imports, clippy::manual_ok_or, // TODO: consider clippy::map_err_ignore // TODO: consider clippy::mem_forget clippy::rc_mutex, clippy::rest_pat_in_fully_bound_structs, clippy::string_add, clippy::string_to_string, clippy::todo, clippy::unimplemented, clippy::unwrap_used, )] // List of permanently allowed lints. #![allow( // It's ok to cast bool to u8, etc. clippy::cast_lossless, // Seems unavoidable. clippy::multiple_crate_versions, // While #[must_use] is a great feature this check is too noisy. 
clippy::must_use_candidate, // Inline consts, structs, fns, imports, etc. are ok if they're used by // the following statement(s). clippy::items_after_statements, )] // List of temporarily allowed lints. // TODO: fix code and reduce list or move to permanent list above. #![expect( clippy::cargo_common_metadata, clippy::cast_possible_truncation, clippy::cast_possible_wrap, clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::doc_markdown, clippy::inline_always, clippy::match_same_arms, clippy::match_wild_err_arm, clippy::missing_errors_doc, clippy::missing_panics_doc, clippy::module_name_repetitions, clippy::needless_pass_by_value, clippy::redundant_closure_for_method_calls, clippy::similar_names, clippy::single_match_else, clippy::struct_excessive_bools, clippy::struct_field_names, clippy::too_many_lines, clippy::unused_self )] #![allow( clippy::unsafe_derive_deserialize, reason = "false positive: https://github.com/rust-lang/rust-clippy/issues/15120" )] #![cfg_attr( any(test, feature = "testing"), allow( clippy::needless_raw_string_hashes, clippy::unreadable_literal, clippy::unused_async, ) )] // List of temporarily allowed lints to unblock beta/nightly. 
#![allow(unknown_lints)] pub mod binary; mod auth; mod batch; mod cache; mod cancellation; mod compute; mod compute_ctl; mod config; mod console_redirect_proxy; mod context; mod control_plane; mod error; mod ext; mod http; mod intern; mod jemalloc; mod logging; mod metrics; mod parse; mod pglb; mod pqproto; mod protocol2; mod proxy; mod rate_limiter; mod redis; mod sasl; mod scram; mod serverless; mod signals; mod stream; mod tls; mod types; mod url; mod usage_metrics; mod util; mod waiters; ================================================ FILE: proxy/src/logging.rs ================================================ use std::cell::RefCell; use std::collections::HashMap; use std::sync::Arc; use std::{env, io}; use chrono::{DateTime, Utc}; use opentelemetry::trace::TraceContextExt; use tracing::subscriber::Interest; use tracing::{Event, Metadata, Span, Subscriber, callsite, span}; use tracing_opentelemetry::OpenTelemetrySpanExt; use tracing_subscriber::filter::{EnvFilter, LevelFilter}; use tracing_subscriber::fmt::format::{Format, Full}; use tracing_subscriber::fmt::time::SystemTime; use tracing_subscriber::fmt::{FormatEvent, FormatFields}; use tracing_subscriber::layer::{Context, Layer}; use tracing_subscriber::prelude::*; use tracing_subscriber::registry::LookupSpan; use crate::metrics::Metrics; /// Initialize logging and OpenTelemetry tracing and exporter. /// /// Logging can be configured using `RUST_LOG` environment variable. /// /// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up /// configuration from environment variables. For example, to change the /// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. 
/// See pub fn init() -> anyhow::Result { let logfmt = LogFormat::from_env()?; let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) .from_env_lossy() .add_directive( "aws_config=info" .parse() .expect("this should be a valid filter directive"), ) .add_directive( "azure_core::policies::transport=off" .parse() .expect("this should be a valid filter directive"), ); let provider = tracing_utils::init_tracing("proxy", tracing_utils::ExportConfig::default()); let otlp_layer = provider.as_ref().map(tracing_utils::layer); let json_log_layer = if logfmt == LogFormat::Json { Some(JsonLoggingLayer::new( RealClock, StderrWriter { stderr: std::io::stderr(), }, &["conn_id", "ep", "query_id", "request_id", "session_id"], )) } else { None }; let text_log_layer = if logfmt == LogFormat::Text { Some( tracing_subscriber::fmt::layer() .with_ansi(false) .with_writer(std::io::stderr) .with_target(false), ) } else { None }; tracing_subscriber::registry() .with(env_filter) .with(otlp_layer) .with(json_log_layer) .with(text_log_layer) .try_init()?; Ok(LoggingGuard(provider)) } /// Initialize logging for local_proxy with log prefix and no opentelemetry. /// /// Logging can be configured using `RUST_LOG` environment variable. 
pub fn init_local_proxy() -> anyhow::Result { let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) .from_env_lossy(); let fmt_layer = tracing_subscriber::fmt::layer() .with_ansi(false) .with_writer(std::io::stderr) .event_format(LocalProxyFormatter(Format::default().with_target(false))); tracing_subscriber::registry() .with(env_filter) .with(fmt_layer) .try_init()?; Ok(LoggingGuard(None)) } pub struct LocalProxyFormatter(Format); impl FormatEvent for LocalProxyFormatter where S: Subscriber + for<'a> LookupSpan<'a>, N: for<'a> FormatFields<'a> + 'static, { fn format_event( &self, ctx: &tracing_subscriber::fmt::FmtContext<'_, S, N>, mut writer: tracing_subscriber::fmt::format::Writer<'_>, event: &tracing::Event<'_>, ) -> std::fmt::Result { writer.write_str("[local_proxy] ")?; self.0.format_event(ctx, writer, event) } } pub struct LoggingGuard(Option); impl Drop for LoggingGuard { fn drop(&mut self) { if let Some(p) = &self.0 { // Shutdown trace pipeline gracefully, so that it has a chance to send any // pending traces before we exit. tracing::info!("shutting down the tracing machinery"); drop(p.shutdown()); } } } #[derive(Copy, Clone, PartialEq, Eq, Default, Debug)] enum LogFormat { Text, #[default] Json, } impl LogFormat { fn from_env() -> anyhow::Result { let logfmt = env::var("LOGFMT"); Ok(match logfmt.as_deref() { Err(_) => LogFormat::default(), Ok("text") => LogFormat::Text, Ok("json") => LogFormat::Json, Ok(logfmt) => anyhow::bail!("unknown log format: {logfmt}"), }) } } trait MakeWriter { fn make_writer(&self) -> impl io::Write; } struct StderrWriter { stderr: io::Stderr, } impl MakeWriter for StderrWriter { #[inline] fn make_writer(&self) -> impl io::Write { self.stderr.lock() } } // TODO: move into separate module or even separate crate. 
trait Clock { fn now(&self) -> DateTime; } struct RealClock; impl Clock for RealClock { #[inline] fn now(&self) -> DateTime { Utc::now() } } /// Name of the field used by tracing crate to store the event message. const MESSAGE_FIELD: &str = "message"; /// Tracing used to enforce that spans/events have no more than 32 fields. /// It seems this is no longer the case, but it's still documented in some places. /// Generally, we shouldn't expect more than 32 fields anyway, so we can try and /// rely on it for some (minor) performance gains. const MAX_TRACING_FIELDS: usize = 32; thread_local! { /// Thread-local instance with per-thread buffer for log writing. static EVENT_FORMATTER: RefCell = const { RefCell::new(EventFormatter::new()) }; /// Cached OS thread ID. static THREAD_ID: u64 = gettid::gettid(); } /// Map for values fixed at callsite registration. // We use papaya here because registration rarely happens post-startup. // papaya is good for read-heavy workloads. // // We use rustc_hash here because callsite::Identifier will always be an integer with low-bit entropy, // since it's always a pointer to static mutable data. rustc_hash was designed for low-bit entropy. type CallsiteMap = papaya::HashMap>; /// Implements tracing layer to handle events specific to logging. struct JsonLoggingLayer { clock: C, writer: W, /// tracks which fields of each **event** are duplicates skipped_field_indices: CallsiteMap, /// tracks callsite names to an ID. callsite_name_ids: papaya::HashMap<&'static str, u32, ahash::RandomState>, span_info: CallsiteMap, /// Fields we want to keep track of in a separate json object. 
extract_fields: &'static [&'static str],
}

impl<C: Clock, W: MakeWriter> JsonLoggingLayer<C, W> {
    /// Creates a layer with empty callsite caches that stamps events using
    /// `clock`, writes them through `writer`, and collects the span fields
    /// named in `extract_fields` separately.
    fn new(clock: C, writer: W, extract_fields: &'static [&'static str]) -> Self {
        JsonLoggingLayer {
            clock,
            skipped_field_indices: CallsiteMap::default(),
            span_info: CallsiteMap::default(),
            callsite_name_ids: papaya::HashMap::default(),
            writer,
            extract_fields,
        }
    }

    /// Returns the span info cached for this callsite, computing and
    /// inserting it on first use.
    #[inline]
    fn span_info(&self, metadata: &'static Metadata<'static>) -> CallsiteSpanInfo {
        self.span_info
            .pin()
            .get_or_insert_with(metadata.callsite(), || {
                CallsiteSpanInfo::new(&self.callsite_name_ids, metadata, self.extract_fields)
            })
            .clone()
    }
}

impl<S, C: Clock + 'static, W: MakeWriter + 'static> Layer<S> for JsonLoggingLayer<C, W>
where
    S: Subscriber + for<'a> LookupSpan<'a>,
{
    /// Formats the event into the thread-local buffer and writes it out in a
    /// single `write_all` call; write failures are only counted in a metric.
    fn on_event(&self, event: &Event<'_>, ctx: Context<'_, S>) {
        use std::io::Write;

        // TODO: consider special tracing subscriber to grab timestamp very
        //       early, before OTel machinery, and add as event extension.
        let now = self.clock.now();

        EVENT_FORMATTER.with(|f| {
            let mut borrow = f.try_borrow_mut();
            let formatter = match borrow.as_deref_mut() {
                Ok(formatter) => formatter,
                // If the thread local formatter is borrowed,
                // then we likely hit an edge case where we panicked during formatting.
                // We allow the logging to proceed with an uncached formatter.
                Err(_) => &mut EventFormatter::new(),
            };

            formatter.format(
                now,
                event,
                &ctx,
                &self.skipped_field_indices,
                self.extract_fields,
            );

            let mut writer = self.writer.make_writer();
            if writer.write_all(formatter.buffer()).is_err() {
                Metrics::get().proxy.logging_errors_count.inc();
            }
        });
    }

    /// Registers a SpanFields instance as span extension.
    fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) {
        let span = ctx.span(id).expect("span must exist");

        let mut fields = SpanFields::new(self.span_info(span.metadata()));
        attrs.record(&mut fields);

        // This is a new span: the extensions should not be locked
        // unless some layer spawned a thread to process this span.
        // I don't think any layers do that.
span.extensions_mut().insert(fields);
    }

    /// Records values set on an existing span into its `SpanFields` extension.
    fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) {
        let span = ctx.span(id).expect("span must exist");

        // assumption: `on_record` is rarely called.
        // assumption: a span being updated by one thread,
        //             and formatted by another thread is even rarer.
        let mut ext = span.extensions_mut();
        if let Some(fields) = ext.get_mut::<SpanFields>() {
            values.record(fields);
        }
    }

    /// Called (lazily) roughly once per event/span instance. We quickly check
    /// for duplicate field names and record duplicates as skippable. Last field wins.
    fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest {
        debug_assert!(
            metadata.fields().len() <= MAX_TRACING_FIELDS,
            "callsite {metadata:?} has too many fields."
        );

        if !metadata.is_event() {
            // register the span info.
            self.span_info(metadata);
            // Must not be never because we wouldn't get trace and span data.
            return Interest::always();
        }

        let mut field_indices = SkippedFieldIndices::default();
        let mut seen_fields = HashMap::new();
        for field in metadata.fields() {
            // `insert` returns the index previously stored under this name;
            // that earlier occurrence is the one to skip.
            if let Some(old_index) = seen_fields.insert(field.name(), field.index()) {
                field_indices.set(old_index);
            }
        }

        // Only callsites that actually have duplicates get a map entry.
        if !field_indices.is_empty() {
            self.skipped_field_indices
                .pin()
                .insert(metadata.callsite(), field_indices);
        }

        Interest::always()
    }
}

/// Any span info that is fixed to a particular callsite. Not variable between span instances.
#[derive(Clone)]
struct CallsiteSpanInfo {
    /// index of each field to extract. usize::MAX if not found.
    extract: Arc<[usize]>,

    /// tracks the fixed "callsite ID" for each span.
    /// note: this is not stable between runs.
normalized_name: Arc<str>,
}

impl CallsiteSpanInfo {
    /// Computes the callsite-fixed info for a span: where each of
    /// `extract_fields` sits among the span's fields, and a display name that
    /// is disambiguated when several callsites share the same span name.
    fn new(
        callsite_name_ids: &papaya::HashMap<&'static str, u32, ahash::RandomState>,
        metadata: &'static Metadata<'static>,
        extract_fields: &[&'static str],
    ) -> Self {
        let names: Vec<&'static str> = metadata.fields().iter().map(|f| f.name()).collect();

        // get all the indices of span fields we want to focus
        let extract = extract_fields
            .iter()
            // use rposition, since we want last match wins.
            .map(|f1| names.iter().rposition(|f2| f1 == f2).unwrap_or(usize::MAX))
            .collect();

        // normalized_name is unique for each callsite, but it is not
        // unified across separate proxy instances.
        // todo: can we do better here?
        let cid = *callsite_name_ids
            .pin()
            .update_or_insert(metadata.name(), |&cid| cid + 1, 0);

        // we hope that most span names are unique, in which case this will always be 0
        let normalized_name = if cid == 0 {
            metadata.name().into()
        } else {
            // if the span name is not unique, add the numeric ID to span name to distinguish it.
            // sadly this is non-deterministic across restarts, but we should fix it by
            // disambiguating re-used span names instead.
            format!("{}#{cid}", metadata.name()).into()
        };

        Self {
            extract,
            normalized_name,
        }
    }
}

/// A value that has already been encoded to JSON bytes.
#[derive(Clone)]
struct RawValue(Box<[u8]>);

impl RawValue {
    /// Encodes `v` once and keeps the resulting bytes.
    fn new(v: impl json::ValueEncoder) -> Self {
        Self(json::value_to_vec!(|val| v.encode(val)).into_boxed_slice())
    }
}

impl json::ValueEncoder for &RawValue {
    fn encode(self, v: json::ValueSer<'_>) {
        // The bytes are already JSON, so they are written through unchanged.
        v.write_raw_json(&self.0);
    }
}

/// Stores span field values recorded during the span's lifetime.
struct SpanFields {
    values: [Option<RawValue>; MAX_TRACING_FIELDS],

    /// cached span info so we can avoid extra hashmap lookups in the hot path.
span_info: CallsiteSpanInfo,
}

impl SpanFields {
    /// Creates an empty set of recorded values for a span of the given callsite.
    fn new(span_info: CallsiteSpanInfo) -> Self {
        Self {
            span_info,
            values: [const { None }; MAX_TRACING_FIELDS],
        }
    }

    /// Encodes `value` and stores it in the slot belonging to `field`.
    ///
    /// A field whose index does not fit into `values` is dropped instead of
    /// indexing out of bounds: the `MAX_TRACING_FIELDS` limit is only checked
    /// with a `debug_assert!` at callsite registration, so in release builds a
    /// span with more fields must not make logging panic.
    #[inline]
    fn store(&mut self, field: &tracing::field::Field, value: impl json::ValueEncoder) {
        if let Some(slot) = self.values.get_mut(field.index()) {
            *slot = Some(RawValue::new(value));
        }
    }
}

impl tracing::field::Visit for SpanFields {
    #[inline]
    fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
        self.store(field, value);
    }

    #[inline]
    fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
        self.store(field, value);
    }

    #[inline]
    fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
        self.store(field, value);
    }

    #[inline]
    fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
        // Values that fit are stored as 64-bit integers; anything larger is
        // stored in its decimal text form.
        if let Ok(value) = i64::try_from(value) {
            self.store(field, value);
        } else {
            self.store(field, format_args!("{value}"));
        }
    }

    #[inline]
    fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
        // Same approach as `record_i128`, for unsigned values.
        if let Ok(value) = u64::try_from(value) {
            self.store(field, value);
        } else {
            self.store(field, format_args!("{value}"));
        }
    }

    #[inline]
    fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
        self.store(field, value);
    }

    #[inline]
    fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
        self.store(field, value);
    }

    #[inline]
    fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
        self.store(field, value);
    }

    #[inline]
    fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
        self.store(field, format_args!("{value:?}"));
    }

    #[inline]
    fn record_error(
        &mut self,
        field: &tracing::field::Field,
        value: &(dyn std::error::Error + 'static),
    ) {
        self.store(field, format_args!("{value}"));
    }
}

/// List of field indices skipped
during logging. Can list duplicate fields or /// metafields not meant to be logged. #[derive(Copy, Clone, Default)] struct SkippedFieldIndices { // 32-bits is large enough for `MAX_TRACING_FIELDS` bits: u32, } impl SkippedFieldIndices { #[inline] fn is_empty(self) -> bool { self.bits == 0 } #[inline] fn set(&mut self, index: usize) { debug_assert!(index <= 32, "index out of bounds of 32-bit set"); self.bits |= 1 << index; } #[inline] fn contains(self, index: usize) -> bool { self.bits & (1 << index) != 0 } } /// Formats a tracing event and writes JSON to its internal buffer including a newline. // TODO: buffer capacity management, truncate if too large struct EventFormatter { logline_buffer: Vec, } impl EventFormatter { #[inline] const fn new() -> Self { EventFormatter { logline_buffer: Vec::new(), } } #[inline] fn buffer(&self) -> &[u8] { &self.logline_buffer } fn format( &mut self, now: DateTime, event: &Event<'_>, ctx: &Context<'_, S>, skipped_field_indices: &CallsiteMap, extract_fields: &'static [&'static str], ) where S: Subscriber + for<'a> LookupSpan<'a>, { let timestamp = now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true); use tracing_log::NormalizeEvent; let normalized_meta = event.normalized_metadata(); let meta = normalized_meta.as_ref().unwrap_or_else(|| event.metadata()); let skipped_field_indices = skipped_field_indices .pin() .get(&meta.callsite()) .copied() .unwrap_or_default(); self.logline_buffer.clear(); let serializer = json::ValueSer::new(&mut self.logline_buffer); json::value_as_object!(|serializer| { // Timestamp comes first, so raw lines can be sorted by timestamp. serializer.entry("timestamp", &*timestamp); // Level next. serializer.entry("level", meta.level().as_str()); // Message next. let mut message_extractor = MessageFieldExtractor::new(serializer.key("message"), skipped_field_indices); event.record(&mut message_extractor); message_extractor.finish(); // Direct message fields. 
{ let mut message_skipper = MessageFieldSkipper::new( serializer.key("fields").object(), skipped_field_indices, ); event.record(&mut message_skipper); // rollback if no fields are present. if message_skipper.present { message_skipper.serializer.finish(); } } let mut extracted = ExtractedSpanFields::new(extract_fields); let spans = serializer.key("spans"); json::value_as_object!(|spans| { let parent_spans = ctx .event_span(event) .map_or(vec![], |parent| parent.scope().collect()); for span in parent_spans.iter().rev() { let ext = span.extensions(); // all spans should have this extension. let Some(fields) = ext.get() else { continue }; extracted.layer_span(fields); let SpanFields { values, span_info } = fields; let span_fields = spans.key(&*span_info.normalized_name); json::value_as_object!(|span_fields| { for (field, value) in std::iter::zip(span.metadata().fields(), values) { if let Some(value) = value { span_fields.entry(field.name(), value); } } }); } }); // TODO: thread-local cache? let pid = std::process::id(); // Skip adding pid 1 to reduce noise for services running in containers. if pid != 1 { serializer.entry("process_id", pid); } THREAD_ID.with(|tid| serializer.entry("thread_id", tid)); // TODO: tls cache? name could change if let Some(thread_name) = std::thread::current().name() && !thread_name.is_empty() && thread_name != "tokio-runtime-worker" { serializer.entry("thread_name", thread_name); } if let Some(task_id) = tokio::task::try_id() { serializer.entry("task_id", format_args!("{task_id}")); } serializer.entry("target", meta.target()); // Skip adding module if it's the same as target. 
if let Some(module) = meta.module_path() && module != meta.target() { serializer.entry("module", module); } if let Some(file) = meta.file() { if let Some(line) = meta.line() { serializer.entry("src", format_args!("{file}:{line}")); } else { serializer.entry("src", file); } } { let otel_context = Span::current().context(); let otel_spanref = otel_context.span(); let span_context = otel_spanref.span_context(); if span_context.is_valid() { serializer.entry("trace_id", format_args!("{}", span_context.trace_id())); } } if extracted.has_values() { // TODO: add fields from event, too? let extract = serializer.key("extract"); json::value_as_object!(|extract| { for (key, value) in std::iter::zip(extracted.names, extracted.values) { if let Some(value) = value { extract.entry(*key, &value); } } }); } }); self.logline_buffer.push(b'\n'); } } /// Extracts the message field that's mixed with other fields. struct MessageFieldExtractor<'buf> { serializer: Option>, skipped_field_indices: SkippedFieldIndices, } impl<'buf> MessageFieldExtractor<'buf> { #[inline] fn new(serializer: json::ValueSer<'buf>, skipped_field_indices: SkippedFieldIndices) -> Self { Self { serializer: Some(serializer), skipped_field_indices, } } #[inline] fn finish(self) { if let Some(ser) = self.serializer { ser.value(""); } } #[inline] fn record_field(&mut self, field: &tracing::field::Field, v: impl json::ValueEncoder) { if field.name() == MESSAGE_FIELD && !self.skipped_field_indices.contains(field.index()) && let Some(ser) = self.serializer.take() { ser.value(v); } } } impl tracing::field::Visit for MessageFieldExtractor<'_> { #[inline] fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { self.record_field(field, value); } #[inline] fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { self.record_field(field, value); } #[inline] fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { self.record_field(field, value); } #[inline] fn record_i128(&mut self, field: 
&tracing::field::Field, value: i128) { self.record_field(field, value); } #[inline] fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { self.record_field(field, value); } #[inline] fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { self.record_field(field, value); } #[inline] fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { self.record_field(field, format_args!("{value:x?}")); } #[inline] fn record_str(&mut self, field: &tracing::field::Field, value: &str) { self.record_field(field, value); } #[inline] fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { self.record_field(field, format_args!("{value:?}")); } #[inline] fn record_error( &mut self, field: &tracing::field::Field, value: &(dyn std::error::Error + 'static), ) { self.record_field(field, format_args!("{value}")); } } /// A tracing field visitor that skips the message field. struct MessageFieldSkipper<'buf> { serializer: json::ObjectSer<'buf>, skipped_field_indices: SkippedFieldIndices, present: bool, } impl<'buf> MessageFieldSkipper<'buf> { #[inline] fn new(serializer: json::ObjectSer<'buf>, skipped_field_indices: SkippedFieldIndices) -> Self { Self { serializer, skipped_field_indices, present: false, } } #[inline] fn record_field(&mut self, field: &tracing::field::Field, v: impl json::ValueEncoder) { if field.name() != MESSAGE_FIELD && !field.name().starts_with("log.") && !self.skipped_field_indices.contains(field.index()) { self.serializer.entry(field.name(), v); self.present |= true; } } } impl tracing::field::Visit for MessageFieldSkipper<'_> { #[inline] fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { self.record_field(field, value); } #[inline] fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { self.record_field(field, value); } #[inline] fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { self.record_field(field, value); } #[inline] fn 
record_i128(&mut self, field: &tracing::field::Field, value: i128) { self.record_field(field, value); } #[inline] fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { self.record_field(field, value); } #[inline] fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { self.record_field(field, value); } #[inline] fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { self.record_field(field, format_args!("{value:x?}")); } #[inline] fn record_str(&mut self, field: &tracing::field::Field, value: &str) { self.record_field(field, value); } #[inline] fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { self.record_field(field, format_args!("{value:?}")); } #[inline] fn record_error( &mut self, field: &tracing::field::Field, value: &(dyn std::error::Error + 'static), ) { self.record_field(field, format_args!("{value}")); } } struct ExtractedSpanFields { names: &'static [&'static str], values: Vec>, } impl ExtractedSpanFields { fn new(names: &'static [&'static str]) -> Self { ExtractedSpanFields { names, values: vec![None; names.len()], } } fn layer_span(&mut self, fields: &SpanFields) { let SpanFields { values, span_info } = fields; // extract the fields for (i, &j) in span_info.extract.iter().enumerate() { let Some(Some(value)) = values.get(j) else { continue; }; // TODO: replace clone with reference, if possible. 
self.values[i] = Some(value.clone()); } } #[inline] fn has_values(&self) -> bool { self.values.iter().any(|v| v.is_some()) } } #[cfg(test)] mod tests { use std::sync::{Arc, Mutex, MutexGuard}; use assert_json_diff::assert_json_eq; use tracing::info_span; use super::*; struct TestClock { current_time: Mutex>, } impl Clock for Arc { fn now(&self) -> DateTime { *self.current_time.lock().expect("poisoned") } } struct VecWriter<'a> { buffer: MutexGuard<'a, Vec>, } impl MakeWriter for Arc>> { fn make_writer(&self) -> impl io::Write { VecWriter { buffer: self.lock().expect("poisoned"), } } } impl io::Write for VecWriter<'_> { fn write(&mut self, buf: &[u8]) -> io::Result { self.buffer.write(buf) } fn flush(&mut self) -> io::Result<()> { Ok(()) } } #[test] fn test_field_collection() { let clock = Arc::new(TestClock { current_time: Mutex::new(Utc::now()), }); let buffer = Arc::new(Mutex::new(Vec::new())); let log_layer = JsonLoggingLayer { clock: clock.clone(), skipped_field_indices: papaya::HashMap::default(), span_info: papaya::HashMap::default(), callsite_name_ids: papaya::HashMap::default(), writer: buffer.clone(), extract_fields: &["x"], }; let registry = tracing_subscriber::Registry::default().with(log_layer); tracing::subscriber::with_default(registry, || { info_span!("some_span", x = 24).in_scope(|| { info_span!("some_other_span", y = 30).in_scope(|| { info_span!("some_span", x = 40, x = 41, x = 42).in_scope(|| { tracing::error!( a = 1, a = 2, a = 3, message = "explicit message field", "implicit message field" ); }); }); }); }); let buffer = Arc::try_unwrap(buffer) .expect("no other reference") .into_inner() .expect("poisoned"); let actual: serde_json::Value = serde_json::from_slice(&buffer).expect("valid JSON"); let expected: serde_json::Value = serde_json::json!( { "timestamp": clock.now().to_rfc3339_opts(chrono::SecondsFormat::Micros, true), "level": "ERROR", "message": "explicit message field", "fields": { "a": 3, }, "spans": { "some_span":{ "x": 24, }, 
"some_other_span": { "y": 30, }, "some_span#1": { "x": 42, }, }, "extract": { "x": 42, }, "src": actual.as_object().unwrap().get("src").unwrap().as_str().unwrap(), "target": "proxy::logging::tests", "process_id": actual.as_object().unwrap().get("process_id").unwrap().as_number().unwrap(), "thread_id": actual.as_object().unwrap().get("thread_id").unwrap().as_number().unwrap(), "thread_name": "logging::tests::test_field_collection", } ); assert_json_eq!(actual, expected); } } ================================================ FILE: proxy/src/metrics.rs ================================================ use std::sync::{Arc, OnceLock}; use lasso::ThreadedRodeo; use measured::label::{ FixedCardinalitySet, LabelGroupSet, LabelGroupVisitor, LabelName, LabelSet, LabelValue, StaticLabelSet, }; use measured::metric::group::Encoding; use measured::metric::histogram::Thresholds; use measured::metric::name::MetricName; use measured::{ Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, LabelGroup, MetricGroup, }; use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLogVec, InfoMetric}; use tokio::time::{self, Instant}; use crate::control_plane::messages::ColdStartInfo; use crate::error::ErrorKind; #[derive(MetricGroup)] #[metric(new())] pub struct Metrics { #[metric(namespace = "proxy")] #[metric(init = ProxyMetrics::new())] pub proxy: ProxyMetrics, #[metric(namespace = "wake_compute_lock")] pub wake_compute_lock: ApiLockMetrics, #[metric(namespace = "service")] pub service: ServiceMetrics, #[metric(namespace = "cache")] pub cache: CacheMetrics, } impl Metrics { #[track_caller] pub fn get() -> &'static Self { static SELF: OnceLock = OnceLock::new(); SELF.get_or_init(|| { let mut metrics = Metrics::new(); metrics.proxy.errors_total.init_all_dense(); metrics.proxy.redis_errors_total.init_all_dense(); metrics.proxy.redis_events_count.init_all_dense(); metrics.proxy.retries_metric.init_all_dense(); 
metrics.proxy.connection_failures_total.init_all_dense(); metrics }) } } #[derive(MetricGroup)] #[metric(new())] pub struct ProxyMetrics { #[metric(flatten)] pub db_connections: CounterPairVec, #[metric(flatten)] pub client_connections: CounterPairVec, #[metric(flatten)] pub connection_requests: CounterPairVec, #[metric(flatten)] pub http_endpoint_pools: HttpEndpointPools, #[metric(flatten)] pub cancel_channel_size: CounterPairVec, /// Time it took for proxy to establish a connection to the compute endpoint. // largest bucket = 2^16 * 0.5ms = 32s #[metric(metadata = Thresholds::exponential_buckets(0.0005, 2.0))] pub compute_connection_latency_seconds: HistogramVec, /// Time it took for proxy to receive a response from control plane. #[metric( // largest bucket = 2^16 * 0.2ms = 13s metadata = Thresholds::exponential_buckets(0.0002, 2.0), )] pub console_request_latency: HistogramVec, /// Size of the HTTP request body lengths. // smallest bucket = 16 bytes // largest bucket = 4^12 * 16 bytes = 256MB #[metric(metadata = Thresholds::exponential_buckets(16.0, 4.0))] pub http_conn_content_length_bytes: HistogramVec, 12>, /// Time it takes to reclaim unused connection pools. #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] pub http_pool_reclaimation_lag_seconds: Histogram<16>, /// Number of opened connections to a database. pub http_pool_opened_connections: Gauge, /// Number of allowed ips #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_ips_number: Histogram<10>, /// Number of allowed VPC endpoints IDs #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_vpc_endpoint_ids: Histogram<10>, /// Number of connections, by the method we used to determine the endpoint. pub accepted_connections_by_sni: CounterVec, /// Number of connection failures (per kind). 
pub connection_failures_total: CounterVec>, /// Number of wake-up failures (per kind). pub connection_failures_breakdown: CounterVec, /// Number of bytes sent/received between all clients and backends. pub io_bytes: CounterVec>, /// Number of IO errors while logging. pub logging_errors_count: Counter, /// Number of errors by a given classification. pub errors_total: CounterVec>, /// Number of cancellation requests (per found/not_found). pub cancellation_requests_total: CounterVec, /// Number of errors by a given classification pub redis_errors_total: CounterVec, /// Number of TLS handshake failures pub tls_handshake_failures: Counter, /// Number of SHA 256 rounds executed. pub sha_rounds: Counter, /// HLL approximate cardinality of endpoints that are connecting pub connecting_endpoints: HyperLogLogVec, 32>, /// Number of endpoints affected by errors of a given classification pub endpoints_affected_by_errors: HyperLogLogVec, 32>, /// Number of retries (per outcome, per retry_type). #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))] pub retries_metric: HistogramVec, /// Number of events consumed from redis (per event type). pub redis_events_count: CounterVec>, #[metric(namespace = "connect_compute_lock")] pub connect_compute_lock: ApiLockMetrics, #[metric(namespace = "scram_pool")] pub scram_pool: OnceLockWrapper>, } /// A Wrapper over [`OnceLock`] to implement [`MetricGroup`]. 
pub struct OnceLockWrapper(pub OnceLock); impl Default for OnceLockWrapper { fn default() -> Self { Self(OnceLock::new()) } } impl> MetricGroup for OnceLockWrapper { fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> { if let Some(inner) = self.0.get() { inner.collect_group_into(enc)?; } Ok(()) } } #[derive(MetricGroup)] #[metric(new())] pub struct ApiLockMetrics { /// Number of semaphores registered in this api lock pub semaphores_registered: Counter, /// Number of semaphores unregistered in this api lock pub semaphores_unregistered: Counter, /// Time it takes to reclaim unused semaphores in the api lock #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] pub reclamation_lag_seconds: Histogram<16>, /// Time it takes to acquire a semaphore lock #[metric(metadata = Thresholds::exponential_buckets(1e-4, 2.0))] pub semaphore_acquire_seconds: Histogram<16>, } impl Default for ApiLockMetrics { fn default() -> Self { Self::new() } } #[derive(FixedCardinalityLabel, Copy, Clone)] #[label(singleton = "direction")] pub enum HttpDirection { Request, Response, } #[derive(FixedCardinalityLabel, Copy, Clone)] #[label(singleton = "direction")] pub enum Direction { Tx, Rx, } #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] #[label(singleton = "protocol")] pub enum Protocol { Http, Ws, Tcp, SniRouter, } impl Protocol { pub fn as_str(self) -> &'static str { match self { Protocol::Http => "http", Protocol::Ws => "ws", Protocol::Tcp => "tcp", Protocol::SniRouter => "sni_router", } } } impl std::fmt::Display for Protocol { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.write_str(self.as_str()) } } #[derive(FixedCardinalityLabel, Copy, Clone)] pub enum Bool { True, False, } #[derive(LabelGroup)] #[label(set = ConsoleRequestSet)] pub struct ConsoleRequest<'a> { #[label(dynamic_with = ThreadedRodeo, default)] pub request: &'a str, } #[derive(MetricGroup, Default)] pub struct HttpEndpointPools { /// Number of endpoints we have 
registered pools for pub http_pool_endpoints_registered_total: Counter, /// Number of endpoints we have unregistered pools for pub http_pool_endpoints_unregistered_total: Counter, } pub struct HttpEndpointPoolsGuard<'a> { dec: &'a Counter, } impl Drop for HttpEndpointPoolsGuard<'_> { fn drop(&mut self) { self.dec.inc(); } } impl HttpEndpointPools { pub fn guard(&self) -> HttpEndpointPoolsGuard<'_> { self.http_pool_endpoints_registered_total.inc(); HttpEndpointPoolsGuard { dec: &self.http_pool_endpoints_unregistered_total, } } } pub struct NumDbConnectionsGauge; impl CounterPairAssoc for NumDbConnectionsGauge { const INC_NAME: &'static MetricName = MetricName::from_str("opened_db_connections_total"); const DEC_NAME: &'static MetricName = MetricName::from_str("closed_db_connections_total"); const INC_HELP: &'static str = "Number of opened connections to a database."; const DEC_HELP: &'static str = "Number of closed connections to a database."; type LabelGroupSet = StaticLabelSet; } pub type NumDbConnectionsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumDbConnectionsGauge>; pub struct NumClientConnectionsGauge; impl CounterPairAssoc for NumClientConnectionsGauge { const INC_NAME: &'static MetricName = MetricName::from_str("opened_client_connections_total"); const DEC_NAME: &'static MetricName = MetricName::from_str("closed_client_connections_total"); const INC_HELP: &'static str = "Number of opened connections from a client."; const DEC_HELP: &'static str = "Number of closed connections from a client."; type LabelGroupSet = StaticLabelSet; } pub type NumClientConnectionsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumClientConnectionsGauge>; pub struct NumConnectionRequestsGauge; impl CounterPairAssoc for NumConnectionRequestsGauge { const INC_NAME: &'static MetricName = MetricName::from_str("accepted_connections_total"); const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total"); const INC_HELP: &'static str = "Number of client 
connections accepted."; const DEC_HELP: &'static str = "Number of client connections closed."; type LabelGroupSet = StaticLabelSet; } pub type NumConnectionRequestsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>; pub struct CancelChannelSizeGauge; impl CounterPairAssoc for CancelChannelSizeGauge { const INC_NAME: &'static MetricName = MetricName::from_str("opened_msgs_cancel_channel_total"); const DEC_NAME: &'static MetricName = MetricName::from_str("closed_msgs_cancel_channel_total"); const INC_HELP: &'static str = "Number of processing messages in the cancellation channel."; const DEC_HELP: &'static str = "Number of closed messages in the cancellation channel."; type LabelGroupSet = StaticLabelSet; } pub type CancelChannelSizeGuard<'a> = metrics::MeasuredCounterPairGuard<'a, CancelChannelSizeGauge>; #[derive(LabelGroup)] #[label(set = ComputeConnectionLatencySet)] pub struct ComputeConnectionLatencyGroup { protocol: Protocol, cold_start_info: ColdStartInfo, outcome: ConnectOutcome, excluded: LatencyExclusions, } #[derive(FixedCardinalityLabel, Copy, Clone)] pub enum LatencyExclusions { Client, ClientAndCplane, ClientCplaneCompute, ClientCplaneComputeRetry, } #[derive(LabelGroup)] #[label(set = SniSet)] pub struct SniGroup { pub protocol: Protocol, pub kind: SniKind, } #[derive(FixedCardinalityLabel, Copy, Clone)] pub enum SniKind { /// Domain name based routing. SNI for libpq/websockets. Host for HTTP Sni, /// Metadata based routing. `options` for libpq/websockets. Header for HTTP NoSni, /// Metadata based routing, using the password field. 
PasswordHack, } #[derive(FixedCardinalityLabel, Copy, Clone)] #[label(singleton = "kind")] pub enum ConnectionFailureKind { ComputeCached, ComputeUncached, } #[derive(LabelGroup)] #[label(set = ConnectionFailuresBreakdownSet)] pub struct ConnectionFailuresBreakdownGroup { pub kind: ErrorKind, pub retry: Bool, } #[derive(LabelGroup, Copy, Clone)] #[label(set = RedisErrorsSet)] pub struct RedisErrors<'a> { #[label(dynamic_with = ThreadedRodeo, default)] pub channel: &'a str, } #[derive(FixedCardinalityLabel, Copy, Clone)] pub enum CancellationOutcome { NotFound, Found, RateLimitExceeded, } #[derive(LabelGroup)] #[label(set = CancellationRequestSet)] pub struct CancellationRequest { pub kind: CancellationOutcome, } #[derive(Clone, Copy)] pub enum Waiting { Cplane, Client, Compute, RetryTimeout, } #[derive(FixedCardinalityLabel, Copy, Clone)] #[label(singleton = "kind")] #[allow(clippy::enum_variant_names)] pub enum RedisMsgKind { Set, Get, Expire, HGet, } #[derive(Default, Clone)] pub struct LatencyAccumulated { pub cplane: time::Duration, pub client: time::Duration, pub compute: time::Duration, pub retry: time::Duration, } impl std::fmt::Display for LatencyAccumulated { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, "client: {}, cplane: {}, compute: {}, retry: {}", self.client.as_micros(), self.cplane.as_micros(), self.compute.as_micros(), self.retry.as_micros() ) } } pub struct LatencyTimer { // time since the stopwatch was started start: time::Instant, // time since the stopwatch was stopped stop: Option, // accumulated time on the stopwatch accumulated: LatencyAccumulated, // label data protocol: Protocol, cold_start_info: ColdStartInfo, outcome: ConnectOutcome, skip_reporting: bool, } impl LatencyTimer { pub fn new(protocol: Protocol) -> Self { Self { start: time::Instant::now(), stop: None, accumulated: LatencyAccumulated::default(), protocol, cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified 
outcome: ConnectOutcome::Failed, skip_reporting: false, } } pub(crate) fn noop(protocol: Protocol) -> Self { Self { start: time::Instant::now(), stop: None, accumulated: LatencyAccumulated::default(), protocol, cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified outcome: ConnectOutcome::Failed, skip_reporting: true, } } pub fn unpause(&mut self, start: Instant, waiting_for: Waiting) { let dur = start.elapsed(); match waiting_for { Waiting::Cplane => self.accumulated.cplane += dur, Waiting::Client => self.accumulated.client += dur, Waiting::Compute => self.accumulated.compute += dur, Waiting::RetryTimeout => self.accumulated.retry += dur, } } pub fn cold_start_info(&mut self, cold_start_info: ColdStartInfo) { self.cold_start_info = cold_start_info; } pub fn success(&mut self) { // stop the stopwatch and record the time that we have accumulated self.stop = Some(time::Instant::now()); // success self.outcome = ConnectOutcome::Success; } pub fn accumulated(&self) -> LatencyAccumulated { self.accumulated.clone() } } #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] pub enum ConnectOutcome { Success, Failed, } impl Drop for LatencyTimer { fn drop(&mut self) { if self.skip_reporting { return; } let duration = self .stop .unwrap_or_else(time::Instant::now) .duration_since(self.start); let metric = &Metrics::get().proxy.compute_connection_latency_seconds; // Excluding client communication from the accumulated time. metric.observe( ComputeConnectionLatencyGroup { protocol: self.protocol, cold_start_info: self.cold_start_info, outcome: self.outcome, excluded: LatencyExclusions::Client, }, duration .saturating_sub(self.accumulated.client) .as_secs_f64(), ); // Exclude client and cplane communication from the accumulated time. 
let accumulated_total = self.accumulated.client + self.accumulated.cplane; metric.observe( ComputeConnectionLatencyGroup { protocol: self.protocol, cold_start_info: self.cold_start_info, outcome: self.outcome, excluded: LatencyExclusions::ClientAndCplane, }, duration.saturating_sub(accumulated_total).as_secs_f64(), ); // Exclude client, cplane, compute communication from the accumulated time. let accumulated_total = self.accumulated.client + self.accumulated.cplane + self.accumulated.compute; metric.observe( ComputeConnectionLatencyGroup { protocol: self.protocol, cold_start_info: self.cold_start_info, outcome: self.outcome, excluded: LatencyExclusions::ClientCplaneCompute, }, duration.saturating_sub(accumulated_total).as_secs_f64(), ); // Exclude client, cplane, compute, retry communication from the accumulated time. let accumulated_total = self.accumulated.client + self.accumulated.cplane + self.accumulated.compute + self.accumulated.retry; metric.observe( ComputeConnectionLatencyGroup { protocol: self.protocol, cold_start_info: self.cold_start_info, outcome: self.outcome, excluded: LatencyExclusions::ClientCplaneComputeRetry, }, duration.saturating_sub(accumulated_total).as_secs_f64(), ); } } impl From for Bool { fn from(value: bool) -> Self { if value { Bool::True } else { Bool::False } } } #[derive(LabelGroup)] #[label(set = RetriesMetricSet)] pub struct RetriesMetricGroup { pub outcome: ConnectOutcome, pub retry_type: RetryType, } #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] pub enum RetryType { WakeCompute, ConnectToCompute, } #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] #[label(singleton = "event")] pub enum RedisEventsCount { EndpointCreated, BranchCreated, ProjectCreated, CancelSession, InvalidateRole, InvalidateEndpoint, InvalidateProject, InvalidateProjects, InvalidateOrg, } pub struct ThreadPoolWorkers(usize); #[derive(Copy, Clone)] pub struct ThreadPoolWorkerId(pub usize); impl LabelValue for ThreadPoolWorkerId { fn visit(&self, v: V) 
-> V::Output { v.write_int(self.0 as i64) } } impl LabelGroup for ThreadPoolWorkerId { fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) { v.write_value(LabelName::from_str("worker"), self); } } impl LabelGroupSet for ThreadPoolWorkers { type Group<'a> = ThreadPoolWorkerId; fn cardinality(&self) -> Option { Some(self.0) } fn encode_dense(&self, value: Self::Unique) -> Option { Some(value) } fn decode_dense(&self, value: usize) -> Self::Group<'_> { ThreadPoolWorkerId(value) } type Unique = usize; fn encode(&self, value: Self::Group<'_>) -> Option { Some(value.0) } fn decode(&self, value: &Self::Unique) -> Self::Group<'_> { ThreadPoolWorkerId(*value) } } impl LabelSet for ThreadPoolWorkers { type Value<'a> = ThreadPoolWorkerId; fn dynamic_cardinality(&self) -> Option { Some(self.0) } fn encode(&self, value: Self::Value<'_>) -> Option { (value.0 < self.0).then_some(value.0) } fn decode(&self, value: usize) -> Self::Value<'_> { ThreadPoolWorkerId(value) } } impl FixedCardinalitySet for ThreadPoolWorkers { fn cardinality(&self) -> usize { self.0 } } #[derive(MetricGroup)] #[metric(new(workers: usize))] pub struct ThreadPoolMetrics { #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] pub worker_task_turns_total: CounterVec, #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] pub worker_task_skips_total: CounterVec, } #[derive(MetricGroup, Default)] pub struct ServiceMetrics { pub info: InfoMetric, } #[derive(Default)] pub struct ServiceInfo { pub state: ServiceState, } impl ServiceInfo { pub const fn running() -> Self { ServiceInfo { state: ServiceState::Running, } } pub const fn terminating() -> Self { ServiceInfo { state: ServiceState::Terminating, } } } impl LabelGroup for ServiceInfo { fn visit_values(&self, v: &mut impl LabelGroupVisitor) { const STATE: &LabelName = LabelName::from_str("state"); v.write_value(STATE, &self.state); } } #[derive(FixedCardinalityLabel, Clone, Copy, Debug, Default)] 
#[label(singleton = "state")] pub enum ServiceState { #[default] Init, Running, Terminating, } #[derive(MetricGroup)] #[metric(new())] pub struct CacheMetrics { /// The capacity of the cache pub capacity: GaugeVec>, /// The total number of entries inserted into the cache pub inserted_total: CounterVec>, /// The total number of entries removed from the cache pub evicted_total: CounterVec, /// The total number of cache requests pub request_total: CounterVec, } impl Default for CacheMetrics { fn default() -> Self { Self::new() } } #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] #[label(singleton = "cache")] pub enum CacheKind { NodeInfo, ProjectInfoEndpoints, ProjectInfoRoles, Schema, Pbkdf2, } #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] pub enum CacheRemovalCause { Expired, Explicit, Replaced, Size, } #[derive(LabelGroup)] #[label(set = CacheEvictionSet)] pub struct CacheEviction { pub cache: CacheKind, pub cause: CacheRemovalCause, } #[derive(FixedCardinalityLabel, Copy, Clone)] pub enum CacheOutcome { Hit, Miss, } #[derive(LabelGroup)] #[label(set = CacheOutcomeSet)] pub struct CacheOutcomeGroup { pub cache: CacheKind, pub outcome: CacheOutcome, } ================================================ FILE: proxy/src/parse.rs ================================================ //! Small parsing helpers. 
use std::ffi::CStr; pub(crate) fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { let cstr = CStr::from_bytes_until_nul(bytes).ok()?; let (_, other) = bytes.split_at(cstr.to_bytes_with_nul().len()); Some((cstr, other)) } #[cfg(test)] mod tests { use super::*; #[test] fn test_split_cstr() { assert!(split_cstr(b"").is_none()); assert!(split_cstr(b"foo").is_none()); let (cstr, rest) = split_cstr(b"\0").expect("uh-oh"); assert_eq!(cstr.to_bytes(), b""); assert_eq!(rest, b""); let (cstr, rest) = split_cstr(b"foo\0bar").expect("uh-oh"); assert_eq!(cstr.to_bytes(), b"foo"); assert_eq!(rest, b"bar"); } } ================================================ FILE: proxy/src/pglb/copy_bidirectional.rs ================================================ use std::future::poll_fn; use std::io; use std::pin::Pin; use std::task::{Context, Poll, ready}; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tracing::info; #[derive(Debug)] enum TransferState { Running(CopyBuffer), ShuttingDown(u64), Done(u64), } #[derive(Debug)] pub(crate) enum ErrorDirection { Read(io::Error), Write(io::Error), } impl ErrorSource { fn from_client(err: ErrorDirection) -> ErrorSource { match err { ErrorDirection::Read(client) => Self::Client(client), ErrorDirection::Write(compute) => Self::Compute(compute), } } fn from_compute(err: ErrorDirection) -> ErrorSource { match err { ErrorDirection::Write(client) => Self::Client(client), ErrorDirection::Read(compute) => Self::Compute(compute), } } } #[derive(Debug)] pub enum ErrorSource { Client(io::Error), Compute(io::Error), } fn transfer_one_direction( cx: &mut Context<'_>, state: &mut TransferState, r: &mut A, w: &mut B, ) -> Poll> where A: AsyncRead + AsyncWrite + Unpin + ?Sized, B: AsyncRead + AsyncWrite + Unpin + ?Sized, { let mut r = Pin::new(r); let mut w = Pin::new(w); loop { match state { TransferState::Running(buf) => { let count = ready!(buf.poll_copy(cx, r.as_mut(), w.as_mut()))?; *state = TransferState::ShuttingDown(count); } 
TransferState::ShuttingDown(count) => { ready!(w.as_mut().poll_shutdown(cx)).map_err(ErrorDirection::Write)?; *state = TransferState::Done(*count); } TransferState::Done(count) => return Poll::Ready(Ok(*count)), } } } #[tracing::instrument(skip_all)] pub async fn copy_bidirectional_client_compute( client: &mut Client, compute: &mut Compute, ) -> Result<(u64, u64), ErrorSource> where Client: AsyncRead + AsyncWrite + Unpin + ?Sized, Compute: AsyncRead + AsyncWrite + Unpin + ?Sized, { let mut client_to_compute = TransferState::Running(CopyBuffer::new()); let mut compute_to_client = TransferState::Running(CopyBuffer::new()); poll_fn(|cx| { let mut client_to_compute_result = transfer_one_direction(cx, &mut client_to_compute, client, compute) .map_err(ErrorSource::from_client)?; let mut compute_to_client_result = transfer_one_direction(cx, &mut compute_to_client, compute, client) .map_err(ErrorSource::from_compute)?; // TODO: 1 info log, with a enum label for close direction. // Early termination checks from compute to client. if let TransferState::Done(_) = compute_to_client && let TransferState::Running(buf) = &client_to_compute { info!("Compute is done, terminate client"); // Initiate shutdown client_to_compute = TransferState::ShuttingDown(buf.amt); client_to_compute_result = transfer_one_direction(cx, &mut client_to_compute, client, compute) .map_err(ErrorSource::from_client)?; } // Early termination checks from client to compute. if let TransferState::Done(_) = client_to_compute && let TransferState::Running(buf) = &compute_to_client { info!("Client is done, terminate compute"); // Initiate shutdown compute_to_client = TransferState::ShuttingDown(buf.amt); compute_to_client_result = transfer_one_direction(cx, &mut compute_to_client, compute, client) .map_err(ErrorSource::from_compute)?; } // It is not a problem if ready! returns early ... 
(comment remains the same) let client_to_compute = ready!(client_to_compute_result); let compute_to_client = ready!(compute_to_client_result); Poll::Ready(Ok((client_to_compute, compute_to_client))) }) .await } #[derive(Debug)] pub(super) struct CopyBuffer { read_done: bool, need_flush: bool, pos: usize, cap: usize, amt: u64, buf: Box<[u8]>, } const DEFAULT_BUF_SIZE: usize = 1024; impl CopyBuffer { pub(super) fn new() -> Self { Self { read_done: false, need_flush: false, pos: 0, cap: 0, amt: 0, buf: vec![0; DEFAULT_BUF_SIZE].into_boxed_slice(), } } fn poll_fill_buf( &mut self, cx: &mut Context<'_>, reader: Pin<&mut R>, ) -> Poll> where R: AsyncRead + ?Sized, { let me = &mut *self; let mut buf = ReadBuf::new(&mut me.buf); buf.set_filled(me.cap); let res = reader.poll_read(cx, &mut buf); if let Poll::Ready(Ok(())) = res { let filled_len = buf.filled().len(); me.read_done = me.cap == filled_len; me.cap = filled_len; } res } fn poll_write_buf( &mut self, cx: &mut Context<'_>, mut reader: Pin<&mut R>, mut writer: Pin<&mut W>, ) -> Poll> where R: AsyncRead + ?Sized, W: AsyncWrite + ?Sized, { let me = &mut *self; match writer.as_mut().poll_write(cx, &me.buf[me.pos..me.cap]) { Poll::Pending => { // Top up the buffer towards full if we can read a bit more // data - this should improve the chances of a large write if !me.read_done && me.cap < me.buf.len() { ready!(me.poll_fill_buf(cx, reader.as_mut())).map_err(ErrorDirection::Read)?; } Poll::Pending } res @ Poll::Ready(_) => res.map_err(ErrorDirection::Write), } } pub(super) fn poll_copy( &mut self, cx: &mut Context<'_>, mut reader: Pin<&mut R>, mut writer: Pin<&mut W>, ) -> Poll> where R: AsyncRead + ?Sized, W: AsyncWrite + ?Sized, { loop { // If there is some space left in our buffer, then we try to read some // data to continue, thus maximizing the chances of a large write. 
if self.cap < self.buf.len() && !self.read_done { match self.poll_fill_buf(cx, reader.as_mut()) { Poll::Ready(Ok(())) => (), Poll::Ready(Err(err)) => return Poll::Ready(Err(ErrorDirection::Read(err))), Poll::Pending => { // Ignore pending reads when our buffer is not empty, because // we can try to write data immediately. if self.pos == self.cap { // Try flushing when the reader has no progress to avoid deadlock // when the reader depends on buffered writer. if self.need_flush { ready!(writer.as_mut().poll_flush(cx)) .map_err(ErrorDirection::Write)?; self.need_flush = false; } return Poll::Pending; } } } } // If our buffer has some data, let's write it out! while self.pos < self.cap { let i = ready!(self.poll_write_buf(cx, reader.as_mut(), writer.as_mut()))?; if i == 0 { return Poll::Ready(Err(ErrorDirection::Write(io::Error::new( io::ErrorKind::WriteZero, "write zero byte into writer", )))); } self.pos += i; self.amt += i as u64; self.need_flush = true; } // If pos larger than cap, this loop will never stop. // In particular, user's wrong poll_write implementation returning // incorrect written length may lead to thread blocking. debug_assert!( self.pos <= self.cap, "writer returned length larger than input slice" ); // All data has been written, the buffer can be considered empty again self.pos = 0; self.cap = 0; // If we've written all the data and we've seen EOF, flush out the // data and finish the transfer. 
if self.read_done { ready!(writer.as_mut().poll_flush(cx)).map_err(ErrorDirection::Write)?; return Poll::Ready(Ok(self.amt)); } } } } #[cfg(test)] mod tests { use tokio::io::AsyncWriteExt; use super::*; #[tokio::test] async fn test_client_to_compute() { let (mut client_client, mut client_proxy) = tokio::io::duplex(8); // Create a mock duplex stream let (mut compute_proxy, mut compute_client) = tokio::io::duplex(32); // Create a mock duplex stream // Simulate 'a' finishing while there's still data for 'b' client_client.write_all(b"hello").await.unwrap(); client_client.shutdown().await.unwrap(); compute_client.write_all(b"Neon").await.unwrap(); compute_client.shutdown().await.unwrap(); let result = copy_bidirectional_client_compute(&mut client_proxy, &mut compute_proxy) .await .unwrap(); // Assert correct transferred amounts let (client_to_compute_count, compute_to_client_count) = result; assert_eq!(client_to_compute_count, 5); // 'hello' was transferred assert_eq!(compute_to_client_count, 4); // response only partially transferred or not at all } #[tokio::test] async fn test_compute_to_client() { let (mut client_client, mut client_proxy) = tokio::io::duplex(32); // Create a mock duplex stream let (mut compute_proxy, mut compute_client) = tokio::io::duplex(8); // Create a mock duplex stream // Simulate 'a' finishing while there's still data for 'b' compute_client.write_all(b"hello").await.unwrap(); compute_client.shutdown().await.unwrap(); client_client .write_all(b"Neon Serverless Postgres") .await .unwrap(); let result = copy_bidirectional_client_compute(&mut client_proxy, &mut compute_proxy) .await .unwrap(); // Assert correct transferred amounts let (client_to_compute_count, compute_to_client_count) = result; assert_eq!(compute_to_client_count, 5); // 'hello' was transferred assert!(client_to_compute_count <= 8); // response only partially transferred or not at all } } ================================================ FILE: proxy/src/pglb/handshake.rs 
================================================
use futures::{FutureExt, TryFutureExt};
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{debug, info, warn};

use crate::auth::endpoint_sni;
use crate::config::TlsConfig;
use crate::context::RequestContext;
use crate::error::ReportableError;
use crate::metrics::Metrics;
use crate::pglb::TlsRequired;
use crate::pqproto::{
    BeMessage, CancelKeyData, FeStartupPacket, ProtocolVersion, StartupMessageParams,
};
use crate::stream::{PqStream, Stream, StreamUpgradeError};
use crate::tls::PG_ALPN_PROTOCOL;

/// Reasons the startup handshake can fail.
#[derive(Error, Debug)]
pub(crate) enum HandshakeError {
    #[error("data is sent before server replied with EncryptionResponse")]
    EarlyData,

    #[error("protocol violation")]
    ProtocolViolation,

    #[error("{0}")]
    StreamUpgradeError(#[from] StreamUpgradeError),

    #[error("{0}")]
    Io(#[from] std::io::Error),

    #[error("{0}")]
    ReportedError(#[from] crate::stream::ReportedError),
}

impl ReportableError for HandshakeError {
    /// Maps each handshake failure to the error kind it is reported under.
    fn get_error_kind(&self) -> crate::error::ErrorKind {
        match self {
            HandshakeError::EarlyData => crate::error::ErrorKind::User,
            HandshakeError::ProtocolViolation => crate::error::ErrorKind::User,
            HandshakeError::StreamUpgradeError(upgrade) => match upgrade {
                StreamUpgradeError::AlreadyTls => crate::error::ErrorKind::Service,
                StreamUpgradeError::Io(_) => crate::error::ErrorKind::ClientDisconnect,
            },
            HandshakeError::Io(_) => crate::error::ErrorKind::ClientDisconnect,
            HandshakeError::ReportedError(e) => e.get_error_kind(),
        }
    }
}

/// Outcome of a successful handshake: either a regular session (the stream
/// plus its startup parameters) or a cancellation request.
// NOTE(review): the generic parameter list and the `PqStream` type argument
// look truncated (`PqStream>`) — presumably lost in extraction; confirm
// against the upstream file.
pub(crate) enum HandshakeData {
    Startup(PqStream>, StartupMessageParams),
    Cancel(CancelKeyData),
}

/// Establish a (most probably, secure) connection with the client.
/// For better testing experience, `stream` can be any object satisfying the traits.
/// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
/// we also take an extra care of propagating only the select handshake errors to client.
///
/// Loops over startup packets: SSL and GSS encryption requests are each
/// honoured at most once and only on a raw stream; a startup message with a
/// supported version or a cancel request ends the loop.
// NOTE(review): the `<S: …>` parameter list and the `Result<…>` success type
// look truncated below — confirm against the upstream file.
#[tracing::instrument(skip_all)]
pub(crate) async fn handshake(
    ctx: &RequestContext,
    stream: S,
    mut tls: Option<&TlsConfig>,
    record_handshake_error: bool,
) -> Result, HandshakeError> {
    // Client may try upgrading to each protocol only once
    let (mut tried_ssl, mut tried_gss) = (false, false);

    // Both bounds are 3.0, so only that exact version is accepted as-is.
    const PG_PROTOCOL_EARLIEST: ProtocolVersion = ProtocolVersion::new(3, 0);
    const PG_PROTOCOL_LATEST: ProtocolVersion = ProtocolVersion::new(3, 0);

    let (mut stream, mut msg) = PqStream::parse_startup(Stream::from_raw(stream)).await?;
    loop {
        match msg {
            FeStartupPacket::SslRequest { direct } => match stream.get_ref() {
                Stream::Raw { .. } if !tried_ssl => {
                    tried_ssl = true;

                    if let Some(tls) = tls.take() {
                        // Upgrade raw stream into a secure TLS-backed stream.
                        // NOTE: We've consumed `tls`; this fact will be used later.

                        // `direct` carries bytes already read from the socket;
                        // they are replayed into the TLS session below.
                        let mut read_buf;
                        let raw = if let Some(direct) = &direct {
                            read_buf = &direct[..];
                            stream.accept_direct_tls()
                        } else {
                            read_buf = &[];
                            stream.accept_tls().await?
                        };

                        let Stream::Raw { raw } = raw else {
                            return Err(HandshakeError::StreamUpgradeError(
                                StreamUpgradeError::AlreadyTls,
                            ));
                        };

                        let mut res = Ok(());
                        let accept = tokio_rustls::TlsAcceptor::from(tls.pg_config.clone())
                            .accept_with(raw, |session| {
                                // push the early data to the tls session
                                while !read_buf.is_empty() {
                                    match session.read_tls(&mut read_buf) {
                                        Ok(_) => {}
                                        Err(e) => {
                                            res = Err(e);
                                            break;
                                        }
                                    }
                                }
                            })
                            .map_ok(Box::new)
                            .boxed();

                        res?;

                        // Anything the session did not consume counts as early data.
                        if !read_buf.is_empty() {
                            return Err(HandshakeError::EarlyData);
                        }

                        let tls_stream = accept.await.inspect_err(|_| {
                            if record_handshake_error {
                                Metrics::get().proxy.tls_handshake_failures.inc();
                            }
                        })?;

                        let conn_info = tls_stream.get_ref().1;

                        // try parse endpoint
                        let ep = conn_info
                            .server_name()
                            .and_then(|sni| endpoint_sni(sni, &tls.common_names));
                        if let Some(ep) = ep {
                            ctx.set_endpoint_id(ep);
                        }

                        // check the ALPN, if exists, as required.
                        match conn_info.alpn_protocol() {
                            None | Some(PG_ALPN_PROTOCOL) => {}
                            Some(other) => {
                                let alpn = String::from_utf8_lossy(other);
                                warn!(%alpn, "unexpected ALPN");
                                return Err(HandshakeError::ProtocolViolation);
                            }
                        }

                        let (_, tls_server_end_point) =
                            tls.cert_resolver.resolve(conn_info.server_name());

                        let tls = Stream::Tls {
                            tls: tls_stream,
                            tls_server_end_point,
                        };
                        // Read the next startup packet over the encrypted stream.
                        (stream, msg) = PqStream::parse_startup(tls).await?;
                    } else {
                        if direct.is_some() {
                            // client sent us a ClientHello already, we can't do anything with it.
                            return Err(HandshakeError::ProtocolViolation);
                        }
                        msg = stream.reject_encryption().await?;
                    }
                }
                _ => return Err(HandshakeError::ProtocolViolation),
            },
            FeStartupPacket::GssEncRequest => match stream.get_ref() {
                Stream::Raw { .. } if !tried_gss => {
                    tried_gss = true;

                    // Currently, we don't support GSSAPI
                    msg = stream.reject_encryption().await?;
                }
                _ => return Err(HandshakeError::ProtocolViolation),
            },
            FeStartupPacket::StartupMessage { params, version }
                if PG_PROTOCOL_EARLIEST <= version && version <= PG_PROTOCOL_LATEST =>
            {
                // Check that the config has been consumed during upgrade
                // OR we didn't provide it at all (for dev purposes).
                if tls.is_some() {
                    Err(stream.throw_error(TlsRequired, None).await)?;
                }

                // This log highlights the start of the connection.
                // This contains useful information for debugging, not logged elsewhere, like role name and endpoint id.
                info!(
                    ?version,
                    ?params,
                    session_type = "normal",
                    "successful handshake"
                );
                break Ok(HandshakeData::Startup(stream, params));
            }
            // downgrade protocol version
            FeStartupPacket::StartupMessage { params, version }
                if version.major() == 3 && version > PG_PROTOCOL_LATEST =>
            {
                debug!(?version, "unsupported minor version");

                // no protocol extensions are supported.
                // Parameters prefixed with `_pq_.` are reported back as
                // unsupported; all others are kept.
                let mut unsupported = vec![];
                let mut supported = StartupMessageParams::default();

                for (k, v) in params.iter() {
                    if k.starts_with("_pq_.") {
                        unsupported.push(k);
                    } else {
                        supported.insert(k, v);
                    }
                }

                stream.write_message(BeMessage::NegotiateProtocolVersion {
                    version: PG_PROTOCOL_LATEST,
                    options: &unsupported,
                });
                stream.flush().await?;

                info!(
                    ?version,
                    ?params,
                    session_type = "normal",
                    "successful handshake; unsupported minor version requested"
                );
                break Ok(HandshakeData::Startup(stream, supported));
            }
            FeStartupPacket::StartupMessage { version, params } => {
                warn!(
                    ?version,
                    ?params,
                    session_type = "normal",
                    "unsuccessful handshake; unsupported version"
                );
                return Err(HandshakeError::ProtocolViolation);
            }
            FeStartupPacket::CancelRequest(cancel_key_data) => {
                info!(session_type = "cancellation", "successful handshake");
                break Ok(HandshakeData::Cancel(cancel_key_data));
            }
        }
    }
}

================================================
FILE: proxy/src/pglb/inprocess.rs
================================================
#![allow(dead_code, reason = "TODO: work in progress")]

use std::pin::{Pin, pin};
use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::task::{Context, Poll};
use std::{fmt, io};

use tokio::io::{AsyncRead, AsyncWrite, DuplexStream, ReadBuf};
use tokio::sync::mpsc;

// Capacity of each channel that carries newly opened streams to the peer.
const STREAM_CHANNEL_SIZE: usize = 16;
// Buffer size passed to `tokio::io::duplex` for every opened stream.
const MAX_STREAM_BUFFER_SIZE: usize = 4096;

/// One end of an in-process connection; streams opened on one end are
/// delivered to the other end through a channel.
// NOTE(review): the field type arguments look truncated (`mpsc::Sender`,
// `mpsc::Receiver`, `Arc` without parameters) — presumably lost in
// extraction; confirm against the upstream file.
#[derive(Debug)]
pub struct Connection {
    stream_sender: mpsc::Sender,
    stream_receiver: mpsc::Receiver,
    stream_id_counter: Arc,
}

impl Connection {
    /// Creates the two connected ends. Each end sends on its own channel and
    /// receives on the other's; both share one stream id counter starting at 1.
    pub fn new() -> (Connection, Connection) {
        let (sender_a, receiver_a) = mpsc::channel(STREAM_CHANNEL_SIZE);
        let (sender_b, receiver_b) = mpsc::channel(STREAM_CHANNEL_SIZE);

        let stream_id_counter = Arc::new(AtomicUsize::new(1));

        let conn_a = Connection {
            stream_sender: sender_a,
            stream_receiver: receiver_b,
            stream_id_counter: Arc::clone(&stream_id_counter),
        };
        let conn_b = Connection {
            stream_sender: sender_b,
            stream_receiver: receiver_a,
            stream_id_counter,
        };

        (conn_a, conn_b)
    }

    /// Takes the next id from the shared counter.
    #[inline]
    fn next_stream_id(&self) -> StreamId {
        StreamId(self.stream_id_counter.fetch_add(1, Ordering::Relaxed))
    }

    /// Opens a new duplex stream: the remote half is sent to the peer and the
    /// local half is returned. Fails with an `io::Error` if the send fails.
    // NOTE(review): return type argument truncated (`io::Result`).
    #[tracing::instrument(skip_all, fields(stream_id = tracing::field::Empty, err))]
    pub async fn open_stream(&self) -> io::Result {
        let (local, remote) = tokio::io::duplex(MAX_STREAM_BUFFER_SIZE);
        let stream_id = self.next_stream_id();
        tracing::Span::current().record("stream_id", stream_id.0);

        // Both halves carry the same id.
        let local = Stream {
            inner: local,
            id: stream_id,
        };
        let remote = Stream {
            inner: remote,
            id: stream_id,
        };

        self.stream_sender
            .send(remote)
            .await
            .map_err(io::Error::other)?;

        Ok(local)
    }

    /// Waits for the next stream opened by the peer; yields `None` once the
    /// receiver reports that no more streams will arrive.
    // NOTE(review): return type argument truncated (`io::Result>`).
    #[tracing::instrument(skip_all, fields(stream_id = tracing::field::Empty, err))]
    pub async fn accept_stream(&mut self) -> io::Result> {
        Ok(self.stream_receiver.recv().await.inspect(|stream| {
            tracing::Span::current().record("stream_id", stream.id.0);
        }))
    }
}

/// Identifier shared by the two halves of a stream.
#[derive(Copy, Clone, Debug)]
pub struct StreamId(usize);

impl fmt::Display for StreamId {
    /// Formats the id as its inner integer.
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.0)
    }
}

// TODO: Proper closing. Currently Streams can outlive their Connections.
// Carry WeakSender and check strong_count?
#[derive(Debug)] pub struct Stream { inner: DuplexStream, id: StreamId, } impl Stream { #[inline] pub fn id(&self) -> StreamId { self.id } } impl AsyncRead for Stream { #[tracing::instrument(level = "debug", skip_all, fields(stream_id = %self.id))] #[inline] fn poll_read( mut self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &mut ReadBuf<'_>, ) -> Poll> { pin!(&mut self.inner).poll_read(cx, buf) } } impl AsyncWrite for Stream { #[tracing::instrument(level = "debug", skip_all, fields(stream_id = %self.id))] #[inline] fn poll_write( mut self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &[u8], ) -> Poll> { pin!(&mut self.inner).poll_write(cx, buf) } #[tracing::instrument(level = "debug", skip_all, fields(stream_id = %self.id))] #[inline] fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { pin!(&mut self.inner).poll_flush(cx) } #[tracing::instrument(level = "debug", skip_all, fields(stream_id = %self.id))] #[inline] fn poll_shutdown( mut self: Pin<&mut Self>, cx: &mut Context<'_>, ) -> Poll> { pin!(&mut self.inner).poll_shutdown(cx) } #[tracing::instrument(level = "debug", skip_all, fields(stream_id = %self.id))] #[inline] fn poll_write_vectored( mut self: Pin<&mut Self>, cx: &mut Context<'_>, bufs: &[io::IoSlice<'_>], ) -> Poll> { pin!(&mut self.inner).poll_write_vectored(cx, bufs) } #[inline] fn is_write_vectored(&self) -> bool { self.inner.is_write_vectored() } } #[cfg(test)] mod tests { use tokio::io::{AsyncReadExt, AsyncWriteExt}; use super::*; #[tokio::test] async fn test_simple_roundtrip() { let (client, mut server) = Connection::new(); let server_task = tokio::spawn(async move { while let Some(mut stream) = server.accept_stream().await.unwrap() { tokio::spawn(async move { let mut buf = [0; 64]; loop { match stream.read(&mut buf).await.unwrap() { 0 => break, n => stream.write(&buf[..n]).await.unwrap(), }; } }); } }); let mut stream = client.open_stream().await.unwrap(); stream.write_all(b"hello!").await.unwrap(); let mut buf = [0; 64]; let n = 
stream.read(&mut buf).await.unwrap(); assert_eq!(n, 6); assert_eq!(&buf[..n], b"hello!"); drop(stream); drop(client); server_task.await.unwrap(); } } ================================================ FILE: proxy/src/pglb/mod.rs ================================================ pub mod copy_bidirectional; pub mod handshake; pub mod inprocess; pub mod passthrough; use std::sync::Arc; use futures::FutureExt; use smol_str::ToSmolStr; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, warn}; use crate::auth; use crate::cancellation::{self, CancellationHandler}; use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; use crate::context::RequestContext; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumClientConnectionsGuard}; pub use crate::pglb::copy_bidirectional::ErrorSource; use crate::pglb::handshake::{HandshakeData, HandshakeError, handshake}; use crate::pglb::passthrough::ProxyPassthrough; use crate::protocol2::{ConnectHeader, ConnectionInfo, ConnectionInfoExtra, read_proxy_protocol}; use crate::proxy::handle_client; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::Stream; use crate::util::run_until_cancelled; pub const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; #[derive(Error, Debug)] #[error("{ERR_INSECURE_CONNECTION}")] pub struct TlsRequired; impl ReportableError for TlsRequired { fn get_error_kind(&self) -> crate::error::ErrorKind { crate::error::ErrorKind::User } } impl UserFacingError for TlsRequired {} pub async fn task_main( config: &'static ProxyConfig, auth_backend: &'static auth::Backend<'static, ()>, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ info!("proxy has shut down"); } // When set for the server socket, the keepalive setting // will be inherited by all accepted client sockets. socket2::SockRef::from(&listener).set_keepalive(true)?; let connections = tokio_util::task::task_tracker::TaskTracker::new(); let cancellations = tokio_util::task::task_tracker::TaskTracker::new(); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await { let (socket, peer_addr) = accept_result?; let conn_gauge = Metrics::get() .proxy .client_connections .guard(crate::metrics::Protocol::Tcp); let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); let cancellations = cancellations.clone(); debug!(protocol = "tcp", %session_id, "accepted new TCP connection"); let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); connections.spawn(async move { let (socket, conn_info) = match config.proxy_protocol_v2 { ProxyProtocolV2::Required => { match read_proxy_protocol(socket).await { Err(e) => { warn!("per-client task finished with an error: {e:#}"); return; } // our load balancers will not send any more data. let's just exit immediately Ok((_socket, ConnectHeader::Local)) => { debug!("healthcheck received"); return; } Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), } } // ignore the header - it cannot be confused for a postgres or http connection so will // error later. 
ProxyProtocolV2::Rejected => ( socket, ConnectionInfo { addr: peer_addr, extra: None, }, ), }; match socket.set_nodelay(true) { Ok(()) => {} Err(e) => { error!( "per-client task finished with an error: failed to set socket option: {e:#}" ); return; } } let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Tcp); let res = handle_connection( config, auth_backend, &ctx, cancellation_handler, socket, ClientMode::Tcp, endpoint_rate_limiter2, conn_gauge, cancellations, ) .instrument(ctx.span()) .boxed() .await; match res { Err(e) => { ctx.set_error_kind(e.get_error_kind()); warn!(parent: &ctx.span(), "per-client task finished with an error: {e:#}"); } Ok(None) => { ctx.set_success(); } Ok(Some(p)) => { ctx.set_success(); let _disconnect = ctx.log_connect(); match p.proxy_pass().await { Ok(()) => {} Err(ErrorSource::Client(e)) => { warn!( ?session_id, "per-client task finished with an IO error from the client: {e:#}" ); } Err(ErrorSource::Compute(e)) => { error!( ?session_id, "per-client task finished with an IO error from the compute: {e:#}" ); } } } } }); } connections.close(); cancellations.close(); drop(listener); // Drain connections connections.wait().await; cancellations.wait().await; Ok(()) } pub(crate) enum ClientMode { Tcp, Websockets { hostname: Option }, } /// Abstracts the logic of handling TCP vs WS clients impl ClientMode { pub fn allow_cleartext(&self) -> bool { match self { ClientMode::Tcp => false, ClientMode::Websockets { .. } => true, } } pub fn hostname<'a, S>(&'a self, s: &'a Stream) -> Option<&'a str> { match self { ClientMode::Tcp => s.sni_hostname(), ClientMode::Websockets { hostname } => hostname.as_deref(), } } pub fn handshake_tls<'a>(&self, tls: Option<&'a TlsConfig>) -> Option<&'a TlsConfig> { match self { ClientMode::Tcp => tls, // TLS is None here if using websockets, because the connection is already encrypted. ClientMode::Websockets { .. 
} => None, } } } #[derive(Debug, Error)] // almost all errors should be reported to the user, but there's a few cases where we cannot // 1. Cancellation: we are not allowed to tell the client any cancellation statuses for security reasons // 2. Handshake: handshake reports errors if it can, otherwise if the handshake fails due to protocol violation, // we cannot be sure the client even understands our error message // 3. PrepareClient: The client disconnected, so we can't tell them anyway... pub(crate) enum ClientRequestError { #[error("{0}")] Cancellation(#[from] cancellation::CancelError), #[error("{0}")] Handshake(#[from] HandshakeError), #[error("{0}")] HandshakeTimeout(#[from] tokio::time::error::Elapsed), #[error("{0}")] PrepareClient(#[from] std::io::Error), #[error("{0}")] ReportedError(#[from] crate::stream::ReportedError), } impl ReportableError for ClientRequestError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { ClientRequestError::Cancellation(e) => e.get_error_kind(), ClientRequestError::Handshake(e) => e.get_error_kind(), ClientRequestError::HandshakeTimeout(_) => crate::error::ErrorKind::RateLimit, ClientRequestError::ReportedError(e) => e.get_error_kind(), ClientRequestError::PrepareClient(_) => crate::error::ErrorKind::ClientDisconnect, } } } #[allow(clippy::too_many_arguments)] pub(crate) async fn handle_connection( config: &'static ProxyConfig, auth_backend: &'static auth::Backend<'static, ()>, ctx: &RequestContext, cancellation_handler: Arc, client: S, mode: ClientMode, endpoint_rate_limiter: Arc, conn_gauge: NumClientConnectionsGuard<'static>, cancellations: tokio_util::task::task_tracker::TaskTracker, ) -> Result>, ClientRequestError> { debug!( protocol = %ctx.protocol(), "handling interactive connection from client" ); let metrics = &Metrics::get().proxy; let proto = ctx.protocol(); let request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.load(); let tls = tls.as_deref(); let 
record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let do_handshake = handshake(ctx, client, mode.handshake_tls(tls), record_handshake_error); let (mut client, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake) .await?? { HandshakeData::Startup(client, params) => (client, params), HandshakeData::Cancel(cancel_key_data) => { // spawn a task to cancel the session, but don't wait for it cancellations.spawn({ let cancellation_handler_clone = Arc::clone(&cancellation_handler); let ctx = ctx.clone(); let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?ctx.session_id()); cancel_span.follows_from(tracing::Span::current()); async move { cancellation_handler_clone .cancel_session( cancel_key_data, ctx, config.authentication_config.ip_allowlist_check_enabled, config.authentication_config.is_vpc_acccess_proxy, auth_backend.get_api(), ) .await .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok(); }.instrument(cancel_span) }); return Ok(None); } }; drop(pause); ctx.set_db_options(params.clone()); let common_names = tls.map(|tls| &tls.common_names); let (node, cancel_on_shutdown) = handle_client( config, auth_backend, ctx, cancellation_handler, &mut client, &mode, endpoint_rate_limiter, common_names, ¶ms, ) .await?; let client = client.flush_and_into_inner().await?; let private_link_id = match ctx.extra() { Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()), None => None, }; Ok(Some(ProxyPassthrough { client, compute: node.stream.into_framed().into_inner(), aux: node.aux, private_link_id, _cancel_on_shutdown: cancel_on_shutdown, _req: request_gauge, _conn: conn_gauge, _db_conn: node.guage, })) } ================================================ FILE: proxy/src/pglb/passthrough.rs ================================================ 
use std::convert::Infallible; use smol_str::SmolStr; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::debug; use utils::measured_stream::MeasuredStream; use super::copy_bidirectional::ErrorSource; use crate::compute::MaybeRustlsStream; use crate::control_plane::messages::MetricsAuxInfo; use crate::metrics::{ Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard, NumDbConnectionsGuard, }; use crate::stream::Stream; use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}; /// Forward bytes in both directions (client <-> compute). #[tracing::instrument(skip_all)] pub(crate) async fn proxy_pass( client: impl AsyncRead + AsyncWrite + Unpin, compute: impl AsyncRead + AsyncWrite + Unpin, aux: MetricsAuxInfo, private_link_id: Option, ) -> Result<(), ErrorSource> { // we will report ingress at a later date let usage_tx = USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, private_link_id, }); let metrics = &Metrics::get().proxy.io_bytes; let m_sent = metrics.with_labels(Direction::Tx); let mut client = MeasuredStream::new( client, |_| {}, |cnt| { // Number of bytes we sent to the client (outbound). metrics.get_metric(m_sent).inc_by(cnt as u64); usage_tx.record_egress(cnt as u64); }, ); let m_recv = metrics.with_labels(Direction::Rx); let mut compute = MeasuredStream::new( compute, |_| {}, |cnt| { // Number of bytes the client sent to the compute node (inbound). metrics.get_metric(m_recv).inc_by(cnt as u64); usage_tx.record_ingress(cnt as u64); }, ); // Starting from here we only proxy the client's traffic. 
debug!("performing the proxy pass..."); let _ = crate::pglb::copy_bidirectional::copy_bidirectional_client_compute( &mut client, &mut compute, ) .await?; Ok(()) } pub(crate) struct ProxyPassthrough { pub(crate) client: Stream, pub(crate) compute: MaybeRustlsStream, pub(crate) aux: MetricsAuxInfo, pub(crate) private_link_id: Option, pub(crate) _cancel_on_shutdown: tokio::sync::oneshot::Sender, pub(crate) _req: NumConnectionRequestsGuard<'static>, pub(crate) _conn: NumClientConnectionsGuard<'static>, pub(crate) _db_conn: NumDbConnectionsGuard<'static>, } impl ProxyPassthrough { pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> { proxy_pass(self.client, self.compute, self.aux, self.private_link_id).await } } ================================================ FILE: proxy/src/pqproto.rs ================================================ //! Postgres protocol codec //! //! use std::fmt; use std::io::{self, Cursor}; use bytes::{Buf, BufMut}; use itertools::Itertools; use rand::distr::{Distribution, StandardUniform}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; use zerocopy::{FromBytes, Immutable, IntoBytes, big_endian}; pub type ErrorCode = [u8; 5]; pub const FE_PASSWORD_MESSAGE: u8 = b'p'; pub const SQLSTATE_INTERNAL_ERROR: [u8; 5] = *b"XX000"; /// The protocol version number. /// /// The most significant 16 bits are the major version number (3 for the protocol described here). /// The least significant 16 bits are the minor version number (0 for the protocol described here). 
///
/// Stored as two big-endian `u16`s so the struct can be read from / written to
/// the wire directly via zerocopy.
#[derive(Clone, Copy, PartialEq, PartialOrd, FromBytes, IntoBytes, Immutable)]
#[repr(C)]
pub struct ProtocolVersion {
    major: big_endian::U16,
    minor: big_endian::U16,
}

impl ProtocolVersion {
    /// Build a version from native-endian components.
    pub const fn new(major: u16, minor: u16) -> Self {
        let major = big_endian::U16::new(major);
        let minor = big_endian::U16::new(minor);
        Self { major, minor }
    }

    /// Major version number in native byte order.
    pub const fn major(self) -> u16 {
        self.major.get()
    }

    /// Minor version number in native byte order.
    pub const fn minor(self) -> u16 {
        self.minor.get()
    }
}

impl fmt::Debug for ProtocolVersion {
    /// Formats as a two-element list: `[major, minor]`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_list()
            .entries([self.major(), self.minor()])
            .finish()
    }
}

/// Largest startup packet body (excluding the 8 byte header) that is accepted.
const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
/// Major "version" shared by the special request codes below; it never denotes
/// a real protocol version.
const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234;
/// Request code of a `CancelRequest` packet.
const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678);
/// Request code of an `SSLRequest` packet.
const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679);
/// Request code of a `GSSENCRequest` packet.
const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680);

/// The startup message header, which is 8 bytes.
/// The first 4 bytes is a big-endian message length, and the next 4 bytes is a version number.
///
/// The length value is inclusive of the header. For example,
/// an empty message will always have length 8.
#[derive(Clone, Copy, FromBytes, IntoBytes, Immutable)]
#[repr(C)]
struct StartupHeader {
    len: big_endian::U32,
    version: ProtocolVersion,
}

/// read the type from the stream using zerocopy.
///
/// Expands to a `read_exact(..).await?`, so it must be used inside an `async`
/// function returning an `io::Result`.
///
/// not cancel safe.
macro_rules! read {
    ($s:expr => $t:ty) => {{
        // cannot be implemented as a function due to lack of const-generic-expr
        let mut raw = [0; size_of::<$t>()];
        $s.read_exact(&mut raw).await?;
        let value: $t = zerocopy::transmute!(raw);
        value
    }};
}

/// Returns true if TLS is supported.
///
/// This is not cancel safe.
pub async fn request_tls(stream: &mut S) -> io::Result where S: AsyncRead + AsyncWrite + Unpin, { let payload = StartupHeader { len: 8.into(), version: NEGOTIATE_SSL_CODE, }; stream.write_all(payload.as_bytes()).await?; stream.flush().await?; // we expect back either `S` or `N` as a single byte. let mut res = *b"0"; stream.read_exact(&mut res).await?; debug_assert!( res == *b"S" || res == *b"N", "unexpected SSL negotiation response: {}", char::from(res[0]), ); // S for SSL. Ok(res == *b"S") } pub async fn read_startup(stream: &mut S) -> io::Result where S: AsyncRead + Unpin, { let header = read!(stream => StartupHeader); // // First byte indicates standard SSL handshake message // (It can't be a Postgres startup length because in network byte order // that would be a startup packet hundreds of megabytes long) if header.as_bytes()[0] == 0x16 { return Ok(FeStartupPacket::SslRequest { // The bytes we read for the header are actually part of a TLS ClientHello. // In theory, if the ClientHello was < 8 bytes we would fail with EOF before we get here. // In practice though, I see no world where a ClientHello is less than 8 bytes // since it includes ephemeral keys etc. 
direct: Some(zerocopy::transmute!(header)), }); } let Some(len) = (header.len.get() as usize).checked_sub(8) else { return Err(io::Error::other(format!( "invalid startup message length {}, must be at least 8.", header.len, ))); }; // TODO: add a histogram for startup packet lengths if len > MAX_STARTUP_PACKET_LENGTH { tracing::warn!("large startup message detected: {len} bytes"); return Err(io::Error::other(format!( "invalid startup message length {len}" ))); } match header.version { // CANCEL_REQUEST_CODE => { if len != 8 { return Err(io::Error::other( "CancelRequest message is malformed, backend PID / secret key missing", )); } Ok(FeStartupPacket::CancelRequest( read!(stream => CancelKeyData), )) } // NEGOTIATE_SSL_CODE => { // Requested upgrade to SSL (aka TLS) Ok(FeStartupPacket::SslRequest { direct: None }) } NEGOTIATE_GSS_CODE => { // Requested upgrade to GSSAPI Ok(FeStartupPacket::GssEncRequest) } version if version.major() == RESERVED_INVALID_MAJOR_VERSION => Err(io::Error::other( format!("Unrecognized request code {version:?}"), )), // StartupMessage version => { // The protocol version number is followed by one or more pairs of parameter name and value strings. // A zero byte is required as a terminator after the last name/value pair. // Parameters can appear in any order. user is required, others are optional. let mut buf = vec![0; len]; stream.read_exact(&mut buf).await?; if buf.pop() != Some(b'\0') { return Err(io::Error::other( "StartupMessage params: missing null terminator", )); } // TODO: Don't do this. // There's no guarantee that these messages are utf8, // but they usually happen to be simple ascii. let params = String::from_utf8(buf) .map_err(|_| io::Error::other("StartupMessage params: invalid utf-8"))?; Ok(FeStartupPacket::StartupMessage { version, params: StartupMessageParams { params }, }) } } } /// Read a raw postgres packet, which will respect the max length requested. /// /// This returns the message tag, as well as the message body. 
The message /// body is written into `buf`, and it is otherwise completely overwritten. /// /// This is not cancel safe. pub async fn read_message<'a, S>( stream: &mut S, buf: &'a mut Vec, max: u32, ) -> io::Result<(u8, &'a mut [u8])> where S: AsyncRead + Unpin, { /// This first reads the header, which for regular messages in the 3.0 protocol is 5 bytes. /// The first byte is a message tag, and the next 4 bytes is a big-endian length. /// /// Awkwardly, the length value is inclusive of itself, but not of the tag. For example, /// an empty message will always have length 4. #[derive(Clone, Copy, FromBytes)] #[repr(C)] struct Header { tag: u8, len: big_endian::U32, } let header = read!(stream => Header); // as described above, the length must be at least 4. let Some(len) = header.len.get().checked_sub(4) else { return Err(io::Error::other(format!( "invalid startup message length {}, must be at least 4.", header.len, ))); }; // TODO: add a histogram for message lengths // check if the message exceeds our desired max. if len > max { tracing::warn!("large postgres message detected: {len} bytes"); return Err(io::Error::other(format!("invalid message length {len}"))); } // read in our entire message. buf.resize(len as usize, 0); stream.read_exact(buf).await?; Ok((header.tag, buf)) } pub struct WriteBuf(Cursor>); impl Buf for WriteBuf { #[inline] fn remaining(&self) -> usize { self.0.remaining() } #[inline] fn chunk(&self) -> &[u8] { self.0.chunk() } #[inline] fn advance(&mut self, cnt: usize) { self.0.advance(cnt); } } impl WriteBuf { pub const fn new() -> Self { Self(Cursor::new(Vec::new())) } /// Use a heuristic to determine if we should shrink the write buffer. #[inline] fn should_shrink(&self) -> bool { let n = self.0.position() as usize; let len = self.0.get_ref().len(); // the unused space at the front of our buffer is 2x the size of our filled portion. n + n > len } /// Shrink the write buffer so that subsequent writes have more spare capacity. 
#[cold] fn shrink(&mut self) { let n = self.0.position() as usize; let buf = self.0.get_mut(); // buf repr: // [----unused------|-----filled-----|-----uninit-----] // ^ n ^ buf.len() ^ buf.capacity() let filled = n..buf.len(); let filled_len = filled.len(); buf.copy_within(filled, 0); buf.truncate(filled_len); self.0.set_position(0); } /// clear the write buffer. pub fn reset(&mut self) { let buf = self.0.get_mut(); buf.clear(); self.0.set_position(0); } /// Shrinks the buffer if efficient to do so, and returns the remaining size. pub fn occupied_len(&mut self) -> usize { if self.should_shrink() { self.shrink(); } self.0.get_mut().len() } /// Write a raw message to the internal buffer. /// /// The size_hint value is only a hint for reserving space. It's ok if it's incorrect, since /// we calculate the length after the fact. pub fn write_raw(&mut self, size_hint: usize, tag: u8, f: impl FnOnce(&mut Vec)) { if self.should_shrink() { self.shrink(); } let buf = self.0.get_mut(); buf.reserve(5 + size_hint); buf.push(tag); let start = buf.len(); buf.extend_from_slice(&[0, 0, 0, 0]); f(buf); let end = buf.len(); let len = (end - start) as u32; buf[start..start + 4].copy_from_slice(&len.to_be_bytes()); } /// Write an encryption response message. pub fn encryption(&mut self, m: u8) { self.0.get_mut().push(m); } pub fn write_error(&mut self, msg: &str, error_code: ErrorCode) { self.shrink(); // // // "SERROR\0CXXXXX\0M\0\0".len() == 17 self.write_raw(17 + msg.len(), b'E', |buf| { // Severity: ERROR buf.put_slice(b"SERROR\0"); // Code: error_code buf.put_u8(b'C'); buf.put_slice(&error_code); buf.put_u8(0); // Message: msg buf.put_u8(b'M'); buf.put_slice(msg.as_bytes()); buf.put_u8(0); // End. 
buf.put_u8(0); }); } } #[derive(Debug)] pub enum FeStartupPacket { CancelRequest(CancelKeyData), SslRequest { direct: Option<[u8; 8]>, }, GssEncRequest, StartupMessage { version: ProtocolVersion, params: StartupMessageParams, }, } #[derive(Debug, Clone, Default)] pub struct StartupMessageParams { pub params: String, } impl StartupMessageParams { /// Get parameter's value by its name. pub fn get(&self, name: &str) -> Option<&str> { self.iter().find_map(|(k, v)| (k == name).then_some(v)) } /// Split command-line options according to PostgreSQL's logic, /// taking into account all escape sequences but leaving them as-is. /// [`None`] means that there's no `options` in [`Self`]. pub fn options_raw(&self) -> Option> { self.get("options").map(Self::parse_options_raw) } /// Split command-line options according to PostgreSQL's logic, /// taking into account all escape sequences but leaving them as-is. pub fn parse_options_raw(input: &str) -> impl Iterator { // See `postgres: pg_split_opts`. let mut last_was_escape = false; input .split(move |c: char| { // We split by non-escaped whitespace symbols. let should_split = c.is_ascii_whitespace() && !last_was_escape; last_was_escape = c == '\\' && !last_was_escape; should_split }) .filter(|s| !s.is_empty()) } /// Iterate through key-value pairs in an arbitrary order. pub fn iter(&self) -> impl Iterator { self.params.split_terminator('\0').tuples() } // This function is mostly useful in tests. #[cfg(test)] pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self { let mut b = Self { params: String::new(), }; for (k, v) in pairs { b.insert(k, v); } b } /// Set parameter's value by its name. 
/// name and value must not contain a \0 byte pub fn insert(&mut self, name: &str, value: &str) { self.params.reserve(name.len() + value.len() + 2); self.params.push_str(name); self.params.push('\0'); self.params.push_str(value); self.params.push('\0'); } } /// Cancel keys usually are represented as PID+SecretKey, but to proxy they're just /// opaque bytes. #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, FromBytes, IntoBytes, Immutable)] pub struct CancelKeyData(pub big_endian::U64); pub fn id_to_cancel_key(id: u64) -> CancelKeyData { CancelKeyData(big_endian::U64::new(id)) } impl fmt::Display for CancelKeyData { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let id = self.0; f.debug_tuple("CancelKeyData") .field(&format_args!("{id:x}")) .finish() } } impl Distribution for StandardUniform { fn sample(&self, rng: &mut R) -> CancelKeyData { id_to_cancel_key(rng.random()) } } pub enum BeMessage<'a> { AuthenticationOk, AuthenticationSasl(BeAuthenticationSaslMessage<'a>), AuthenticationCleartextPassword, BackendKeyData(CancelKeyData), ParameterStatus { name: &'a [u8], value: &'a [u8], }, ReadyForQuery, NoticeResponse(&'a str), NegotiateProtocolVersion { version: ProtocolVersion, options: &'a [&'a str], }, } #[derive(Debug)] pub enum BeAuthenticationSaslMessage<'a> { Methods(&'a [&'a str]), Continue(&'a [u8]), Final(&'a [u8]), } impl BeMessage<'_> { /// Write the message into an internal buffer pub fn write_message(self, buf: &mut WriteBuf) { match self { // BeMessage::AuthenticationOk => { buf.write_raw(1, b'R', |buf| buf.put_i32(0)); } // BeMessage::AuthenticationCleartextPassword => { buf.write_raw(1, b'R', |buf| buf.put_i32(3)); } // BeMessage::AuthenticationSasl(BeAuthenticationSaslMessage::Methods(methods)) => { let len: usize = methods.iter().map(|m| m.len() + 1).sum(); buf.write_raw(len + 2, b'R', |buf| { buf.put_i32(10); // Specifies that SASL auth method is used. 
for method in methods { buf.put_slice(method.as_bytes()); buf.put_u8(0); } buf.put_u8(0); // zero terminator for the list }); } // BeMessage::AuthenticationSasl(BeAuthenticationSaslMessage::Continue(extra)) => { buf.write_raw(extra.len() + 1, b'R', |buf| { buf.put_i32(11); // Continue SASL auth. buf.put_slice(extra); }); } // BeMessage::AuthenticationSasl(BeAuthenticationSaslMessage::Final(extra)) => { buf.write_raw(extra.len() + 1, b'R', |buf| { buf.put_i32(12); // Send final SASL message. buf.put_slice(extra); }); } // BeMessage::BackendKeyData(key_data) => { buf.write_raw(8, b'K', |buf| buf.put_slice(key_data.as_bytes())); } // // BeMessage::NoticeResponse(msg) => { // 'N' signalizes NoticeResponse messages buf.write_raw(18 + msg.len(), b'N', |buf| { // Severity: NOTICE buf.put_slice(b"SNOTICE\0"); // Code: XX000 (ignored for notice, but still required) buf.put_slice(b"CXX000\0"); // Message: msg buf.put_u8(b'M'); buf.put_slice(msg.as_bytes()); buf.put_u8(0); // End notice. buf.put_u8(0); }); } // BeMessage::ParameterStatus { name, value } => { buf.write_raw(name.len() + value.len() + 2, b'S', |buf| { buf.put_slice(name.as_bytes()); buf.put_u8(0); buf.put_slice(value.as_bytes()); buf.put_u8(0); }); } // BeMessage::ReadyForQuery => { buf.write_raw(1, b'Z', |buf| buf.put_u8(b'I')); } // BeMessage::NegotiateProtocolVersion { version, options } => { let len: usize = options.iter().map(|o| o.len() + 1).sum(); buf.write_raw(8 + len, b'v', |buf| { buf.put_slice(version.as_bytes()); buf.put_u32(options.len() as u32); for option in options { buf.put_slice(option.as_bytes()); buf.put_u8(0); } }); } } } } #[cfg(test)] mod tests { use std::io::Cursor; use tokio::io::{AsyncWriteExt, duplex}; use zerocopy::IntoBytes; use super::ProtocolVersion; use crate::pqproto::{FeStartupPacket, read_message, read_startup}; #[tokio::test] async fn reject_large_startup() { // we're going to define a v3.0 startup message with far too many parameters. 
let mut payload = vec![];
        // 10001 + 8 bytes.
        payload.extend_from_slice(&10009_u32.to_be_bytes());
        payload.extend_from_slice(ProtocolVersion::new(3, 0).as_bytes());
        payload.resize(10009, b'a');

        // the duplex buffer is far smaller than the payload, so the writer
        // fails once the reader bails out and drops its end.
        let (mut server, mut client) = duplex(128);
        #[rustfmt::skip]
        let (server, client) = tokio::join!(
            async move { read_startup(&mut server).await.unwrap_err() },
            async move { client.write_all(&payload).await.unwrap_err() },
        );

        assert_eq!(server.to_string(), "invalid startup message length 10001");
        assert_eq!(client.to_string(), "broken pipe");
    }

    #[tokio::test]
    async fn reject_large_password() {
        // we're going to define a password message that is far too long.
        let mut payload = vec![];
        payload.push(b'p');
        payload.extend_from_slice(&517_u32.to_be_bytes());
        payload.resize(518, b'a');

        let (mut server, mut client) = duplex(128);
        #[rustfmt::skip]
        let (server, client) = tokio::join!(
            async move { read_message(&mut server, &mut vec![], 512).await.unwrap_err() },
            async move { client.write_all(&payload).await.unwrap_err() },
        );

        assert_eq!(server.to_string(), "invalid message length 513");
        assert_eq!(client.to_string(), "broken pipe");
    }

    #[tokio::test]
    async fn read_startup_message() {
        let mut payload = vec![];
        payload.extend_from_slice(&17_u32.to_be_bytes());
        payload.extend_from_slice(ProtocolVersion::new(3, 0).as_bytes());
        payload.extend_from_slice(b"abc\0def\0\0");

        let startup = read_startup(&mut Cursor::new(&payload)).await.unwrap();
        let FeStartupPacket::StartupMessage { version, params } = startup else {
            panic!("unexpected startup message: {startup:?}");
        };

        assert_eq!(version.major(), 3);
        assert_eq!(version.minor(), 0);
        // the list terminator is stripped, the pair terminators are kept.
        assert_eq!(params.params, "abc\0def\0");
    }

    #[tokio::test]
    async fn read_ssl_message() {
        let mut payload = vec![];
        payload.extend_from_slice(&8_u32.to_be_bytes());
        payload.extend_from_slice(ProtocolVersion::new(1234, 5679).as_bytes());

        let startup = read_startup(&mut Cursor::new(&payload)).await.unwrap();
        let FeStartupPacket::SslRequest { direct: None } = startup else {
            panic!("unexpected startup message: {startup:?}");
        };
    }

    #[tokio::test]
    async fn read_tls_message() {
        // sample TLS 1.3 client hello (SNI `example.ulfheim.net`).
        // NOTE(review): the source link for this capture was lost in extraction.
        let client_hello = [
            0x16, 0x03, 0x01, 0x00, 0xf8, 0x01, 0x00, 0x00, 0xf4, 0x03, 0x03, 0x00, 0x01, 0x02,
            0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
            0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e,
            0x1f, 0x20, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb,
            0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9,
            0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0x00, 0x08, 0x13, 0x02, 0x13, 0x03, 0x13, 0x01,
            0x00, 0xff, 0x01, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, 0x18, 0x00, 0x16, 0x00, 0x00,
            0x13, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, 0x75, 0x6c, 0x66, 0x68, 0x65,
            0x69, 0x6d, 0x2e, 0x6e, 0x65, 0x74, 0x00, 0x0b, 0x00, 0x04, 0x03, 0x00, 0x01, 0x02,
            0x00, 0x0a, 0x00, 0x16, 0x00, 0x14, 0x00, 0x1d, 0x00, 0x17, 0x00, 0x1e, 0x00, 0x19,
            0x00, 0x18, 0x01, 0x00, 0x01, 0x01, 0x01, 0x02, 0x01, 0x03, 0x01, 0x04, 0x00, 0x23,
            0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x1e,
            0x00, 0x1c, 0x04, 0x03, 0x05, 0x03, 0x06, 0x03, 0x08, 0x07, 0x08, 0x08, 0x08, 0x09,
            0x08, 0x0a, 0x08, 0x0b, 0x08, 0x04, 0x08, 0x05, 0x08, 0x06, 0x04, 0x01, 0x05, 0x01,
            0x06, 0x01, 0x00, 0x2b, 0x00, 0x03, 0x02, 0x03, 0x04, 0x00, 0x2d, 0x00, 0x02, 0x01,
            0x01, 0x00, 0x33, 0x00, 0x26, 0x00, 0x24, 0x00, 0x1d, 0x00, 0x20, 0x35, 0x80, 0x72,
            0xd6, 0x36, 0x58, 0x80, 0xd1, 0xae, 0xea, 0x32, 0x9a, 0xdf, 0x91, 0x21, 0x38, 0x38,
            0x51, 0xed, 0x21, 0xa2, 0x8e, 0x3b, 0x75, 0xe9, 0x65, 0xd0, 0xd2, 0xcd, 0x16, 0x62,
            0x54,
        ];

        let mut cursor = Cursor::new(&client_hello);

        let startup = read_startup(&mut cursor).await.unwrap();
        let FeStartupPacket::SslRequest {
            direct: Some(prefix),
        } = startup
        else {
            panic!("unexpected startup message: {startup:?}");
        };

        // check that no data is lost.
        assert_eq!(prefix, [0x16, 0x03, 0x01, 0x00, 0xf8, 0x01, 0x00, 0x00]);
        assert_eq!(cursor.position(), 8);
    }

    #[tokio::test]
    async fn read_message_success() {
        // two back-to-back Query messages sharing one reusable body buffer.
        let query = b"Q\0\0\0\x0cSELECT 1Q\0\0\0\x0cSELECT 2";
        let mut cursor = Cursor::new(&query);

        let mut buf = vec![];
        let (tag, message) = read_message(&mut cursor, &mut buf, 100).await.unwrap();
        assert_eq!(tag, b'Q');
        assert_eq!(message, b"SELECT 1");

        let (tag, message) = read_message(&mut cursor, &mut buf, 100).await.unwrap();
        assert_eq!(tag, b'Q');
        assert_eq!(message, b"SELECT 2");
    }
}

================================================
FILE: proxy/src/protocol2.rs
================================================
//! Proxy Protocol V2 implementation
//! Compatible with <https://www.haproxy.org/download/3.1/doc/proxy-protocol.txt>

use core::fmt;
use std::io;
use std::net::{Ipv4Addr, Ipv6Addr, SocketAddr};

use bytes::Buf;
use smol_str::SmolStr;
use strum_macros::FromRepr;
use tokio::io::{AsyncRead, AsyncReadExt};
use zerocopy::{FromBytes, Immutable, KnownLayout, Unaligned, network_endian};

/// Proxy Protocol Version 2 Header
const SIGNATURE: [u8; 12] = [
    0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A,
];

// version (high nibble, always 2) and command (low nibble)
const LOCAL_V2: u8 = 0x20;
const PROXY_V2: u8 = 0x21;

// address family (high nibble) and transport protocol (low nibble)
const TCP_OVER_IPV4: u8 = 0x11;
const UDP_OVER_IPV4: u8 = 0x12;
const TCP_OVER_IPV6: u8 = 0x21;
const UDP_OVER_IPV6: u8 = 0x22;

/// Source address of the proxied connection plus optional cloud-specific data.
#[derive(PartialEq, Eq, Clone, Debug)]
pub struct ConnectionInfo {
    pub addr: SocketAddr,
    pub extra: Option<ConnectionInfoExtra>,
}

/// Outcome of parsing a proxy protocol v2 header.
#[derive(PartialEq, Eq, Clone, Debug)]
pub enum ConnectHeader {
    /// `LOCAL` command: no address information is carried.
    Local,
    /// `PROXY` command with the original connection information.
    Proxy(ConnectionInfo),
}

impl fmt::Display for ConnectionInfo {
    /// Prints the source IP (never the port), prefixed by the private link
    /// identifier when one is present.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match &self.extra {
            None => self.addr.ip().fmt(f),
            Some(ConnectionInfoExtra::Aws { vpce_id }) => {
                write!(f, "vpce_id[{vpce_id:?}]:addr[{}]", self.addr.ip())
            }
            Some(ConnectionInfoExtra::Azure { link_id }) => {
                write!(f, "link_id[{link_id}]:addr[{}]", self.addr.ip())
            }
        }
    }
}

/// Cloud-provider specific connection metadata taken from custom TLVs.
#[derive(PartialEq, Eq, Clone, Debug)]
pub enum ConnectionInfoExtra {
    Aws { vpce_id: SmolStr },
Azure { link_id: u32 },
}

/// Read and parse a proxy protocol v2 header from the start of `read`.
///
/// Returns the reader, positioned just after the header and its payload,
/// together with the decoded [`ConnectHeader`].
///
/// # Errors
/// Fails if the stream ends early, the 12 byte signature does not match, or
/// the payload is rejected by [`process_proxy_payload`].
pub(crate) async fn read_proxy_protocol<T: AsyncRead + Unpin>(
    mut read: T,
) -> std::io::Result<(T, ConnectHeader)> {
    let mut header = [0; size_of::<ProxyProtocolV2Header>()];
    read.read_exact(&mut header).await?;
    let header: ProxyProtocolV2Header = zerocopy::transmute!(header);
    if header.signature != SIGNATURE {
        return Err(std::io::Error::other("invalid proxy protocol header"));
    }

    // `len` is a u16, so the payload is at most 64 KiB.
    let mut payload = vec![0; usize::from(header.len.get())];
    read.read_exact(&mut payload).await?;

    let res = process_proxy_payload(header, &payload)?;
    Ok((read, res))
}

/// Decode the address block and TLVs that follow the fixed header.
///
/// Malformed vendor TLVs (AWS / Azure) are logged and skipped: the payload
/// comes from the network, so it must never be able to panic the proxy.
///
/// # Errors
/// Fails on an unknown command, an unsupported address family, or a payload
/// too short for the announced address family.
fn process_proxy_payload(
    header: ProxyProtocolV2Header,
    mut payload: &[u8],
) -> std::io::Result<ConnectHeader> {
    match header.version_and_command {
        // the connection was established on purpose by the proxy
        // without being relayed. The connection endpoints are the sender and the
        // receiver. Such connections exist when the proxy sends health-checks to the
        // server. The receiver must accept this connection as valid and must use the
        // real connection endpoints and discard the protocol block including the
        // family which is ignored.
        LOCAL_V2 => return Ok(ConnectHeader::Local),
        // the connection was established on behalf of another node,
        // and reflects the original connection endpoints. The receiver must then use
        // the information provided in the protocol block to get the original address.
        PROXY_V2 => {}
        // other values are unassigned and must not be emitted by senders. Receivers
        // must drop connections presenting unexpected values here.
        _ => {
            return Err(io::Error::other(format!(
                "invalid proxy protocol command 0x{:02X}. expected local (0x20) or proxy (0x21)",
                header.version_and_command
            )));
        }
    }

    let size_err =
        "invalid proxy protocol length. payload not large enough to fit requested IP addresses";
    let addr = match header.protocol_and_family {
        TCP_OVER_IPV4 | UDP_OVER_IPV4 => {
            let addr = payload
                .try_get::<ProxyProtocolV2HeaderV4>()
                .ok_or_else(|| io::Error::other(size_err))?;

            SocketAddr::from((addr.src_addr.get(), addr.src_port.get()))
        }
        TCP_OVER_IPV6 | UDP_OVER_IPV6 => {
            let addr = payload
                .try_get::<ProxyProtocolV2HeaderV6>()
                .ok_or_else(|| io::Error::other(size_err))?;

            SocketAddr::from((addr.src_addr.get(), addr.src_port.get()))
        }
        // unspecified or unix stream: no socket address can be derived, so reject.
        _ => {
            return Err(io::Error::other(
                "invalid proxy protocol address family/transport protocol.",
            ));
        }
    };

    let mut extra = None;

    // everything after the address block is a sequence of TLVs; parsing stops
    // silently at the first truncated one.
    while let Some(mut tlv) = read_tlv(&mut payload) {
        match Pp2Kind::from_repr(tlv.kind) {
            Some(Pp2Kind::Aws) => {
                if tlv.value.is_empty() {
                    tracing::warn!("invalid aws tlv: no subtype");
                    // `get_u8` below panics on an empty slice.
                    continue;
                }

                let subtype = tlv.value.get_u8();
                match Pp2AwsType::from_repr(subtype) {
                    Some(Pp2AwsType::VpceId) => match std::str::from_utf8(tlv.value) {
                        Ok(s) => {
                            extra = Some(ConnectionInfoExtra::Aws { vpce_id: s.into() });
                        }
                        Err(e) => {
                            tracing::warn!("invalid aws vpce id: {e}");
                        }
                    },
                    None => {
                        tracing::warn!("unknown aws tlv: subtype={subtype}");
                    }
                }
            }
            Some(Pp2Kind::Azure) => {
                if tlv.value.is_empty() {
                    tracing::warn!("invalid azure tlv: no subtype");
                    // `get_u8` below panics on an empty slice.
                    continue;
                }

                let subtype = tlv.value.get_u8();
                match Pp2AzureType::from_repr(subtype) {
                    Some(Pp2AzureType::PrivateEndpointLinkId) => {
                        if tlv.value.len() != 4 {
                            tracing::warn!("invalid azure link_id: {:?}", tlv.value);
                        }
                        // An over-long value is still decoded from its first 4
                        // bytes (as before); a short one cannot be decoded at
                        // all and would make `get_u32_le` panic.
                        if tlv.value.len() < 4 {
                            continue;
                        }

                        extra = Some(ConnectionInfoExtra::Azure {
                            link_id: tlv.value.get_u32_le(),
                        });
                    }
                    None => {
                        tracing::warn!("unknown azure tlv: subtype={subtype}");
                    }
                }
            }
            Some(kind) => {
                tracing::debug!("unused tlv[{kind:?}]: {:?}", tlv.value);
            }
            None => {
                tracing::debug!("unknown tlv: {tlv:?}");
            }
        }
    }

    Ok(ConnectHeader::Proxy(ConnectionInfo { addr, extra }))
}

/// TLV type byte.
#[derive(FromRepr, Debug, Copy, Clone)]
#[repr(u8)]
enum Pp2Kind {
    // The following are defined by https://www.haproxy.org/download/3.1/doc/proxy-protocol.txt
    // we don't use these but it would be interesting to know what's available
    Alpn = 0x01,
    Authority = 0x02,
    Crc32C = 0x03,
    Noop = 0x04,
    UniqueId = 0x05,
    Ssl = 0x20,
    NetNs = 0x30,

    /// AWS custom TLV (carries the VPC endpoint id).
    Aws = 0xEA,

    /// Azure custom TLV (carries the private endpoint link id).
    Azure = 0xEE,
}

/// Subtype byte at the start of an AWS TLV value.
#[derive(FromRepr, Debug, Copy, Clone)]
#[repr(u8)]
enum Pp2AwsType {
    VpceId = 0x01,
}

/// Subtype byte at the start of an Azure TLV value.
#[derive(FromRepr, Debug, Copy, Clone)]
#[repr(u8)]
enum Pp2AzureType {
    PrivateEndpointLinkId = 0x01,
}

/// One type-length-value record borrowed from the payload.
#[derive(Debug)]
struct Tlv<'a> {
    kind: u8,
    value: &'a [u8],
}

/// Pop the next TLV off the front of `b`.
///
/// Returns `None` when fewer than 3 header bytes remain or the announced
/// length exceeds what is left.
fn read_tlv<'a>(b: &mut &'a [u8]) -> Option<Tlv<'a>> {
    let tlv_header = b.try_get::<TlvHeader>()?;
    let len = usize::from(tlv_header.len.get());
    Some(Tlv {
        kind: tlv_header.kind,
        value: b.split_off(..len)?,
    })
}

/// Fallible, zerocopy-based counterpart of the panicking `Buf::get_*` readers.
trait BufExt: Sized {
    /// Read a `T` from the front and advance past it, or return `None`
    /// (leaving `self` untouched) if there are not enough bytes.
    fn try_get<T: FromBytes>(&mut self) -> Option<T>;
}

impl BufExt for &[u8] {
    fn try_get<T: FromBytes>(&mut self) -> Option<T> {
        let (res, rest) = T::read_from_prefix(self).ok()?;
        *self = rest;
        Some(res)
    }
}

/// The fixed 16 byte header that starts every proxy protocol v2 message.
#[derive(FromBytes, KnownLayout, Immutable, Unaligned, Copy, Clone)]
#[repr(C, packed)]
struct ProxyProtocolV2Header {
    signature: [u8; 12],
    version_and_command: u8,
    protocol_and_family: u8,
    /// number of payload bytes following this header
    len: network_endian::U16,
}

/// Address block for TCP/UDP over IPv4 (12 bytes).
#[derive(FromBytes, KnownLayout, Immutable, Unaligned, Copy, Clone)]
#[repr(C, packed)]
struct ProxyProtocolV2HeaderV4 {
    src_addr: NetworkEndianIpv4,
    dst_addr: NetworkEndianIpv4,
    src_port: network_endian::U16,
    dst_port: network_endian::U16,
}

/// Address block for TCP/UDP over IPv6 (36 bytes).
#[derive(FromBytes, KnownLayout, Immutable, Unaligned, Copy, Clone)]
#[repr(C, packed)]
struct ProxyProtocolV2HeaderV6 {
    src_addr: NetworkEndianIpv6,
    dst_addr: NetworkEndianIpv6,
    src_port: network_endian::U16,
    dst_port: network_endian::U16,
}

/// Type byte and big-endian length that precede every TLV value.
#[derive(FromBytes, KnownLayout, Immutable, Unaligned, Copy, Clone)]
#[repr(C, packed)]
struct TlvHeader {
    kind: u8,
    len: network_endian::U16,
}

/// An IPv4 address in network byte order.
#[derive(FromBytes, KnownLayout, Immutable, Unaligned, Copy, Clone)]
#[repr(transparent)]
struct NetworkEndianIpv4(network_endian::U32);
impl NetworkEndianIpv4 {
    #[inline]
    fn get(self) -> Ipv4Addr {
        Ipv4Addr::from_bits(self.0.get())
    }
}

#[derive(FromBytes, KnownLayout, Immutable, Unaligned, Copy, Clone)]
#[repr(transparent)] struct NetworkEndianIpv6(network_endian::U128); impl NetworkEndianIpv6 { #[inline] fn get(self) -> Ipv6Addr { Ipv6Addr::from_bits(self.0.get()) } } #[cfg(test)] mod tests { use tokio::io::AsyncReadExt; use crate::protocol2::{ ConnectHeader, LOCAL_V2, PROXY_V2, TCP_OVER_IPV4, UDP_OVER_IPV6, read_proxy_protocol, }; #[tokio::test] async fn test_ipv4() { let header = super::SIGNATURE // Proxy command, IPV4 | TCP .chain([(2 << 4) | 1, (1 << 4) | 1].as_slice()) // 12 + 3 bytes .chain([0, 15].as_slice()) // src ip .chain([127, 0, 0, 1].as_slice()) // dst ip .chain([192, 168, 0, 1].as_slice()) // src port .chain([255, 255].as_slice()) // dst port .chain([1, 1].as_slice()) // TLV .chain([1, 2, 3].as_slice()); let extra_data = [0x55; 256]; let (mut read, info) = read_proxy_protocol(header.chain(extra_data.as_slice())) .await .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); let ConnectHeader::Proxy(info) = info else { panic!() }; assert_eq!(info.addr, ([127, 0, 0, 1], 65535).into()); } #[tokio::test] async fn test_ipv6() { let header = super::SIGNATURE // Proxy command, IPV6 | UDP .chain([PROXY_V2, UDP_OVER_IPV6].as_slice()) // 36 + 3 bytes .chain([0, 39].as_slice()) // src ip .chain([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0].as_slice()) // dst ip .chain([0, 15, 1, 14, 2, 13, 3, 12, 4, 11, 5, 10, 6, 9, 7, 8].as_slice()) // src port .chain([1, 1].as_slice()) // dst port .chain([255, 255].as_slice()) // TLV .chain([1, 2, 3].as_slice()); let extra_data = [0x55; 256]; let (mut read, info) = read_proxy_protocol(header.chain(extra_data.as_slice())) .await .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); let ConnectHeader::Proxy(info) = info else { panic!() }; assert_eq!( info.addr, ([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into() ); } #[tokio::test] #[should_panic = "invalid proxy protocol header"] async fn 
test_invalid() { let data = [0x55; 256]; read_proxy_protocol(data.as_slice()).await.unwrap(); } #[tokio::test] #[should_panic = "early eof"] async fn test_short() { let data = [0x55; 10]; read_proxy_protocol(data.as_slice()).await.unwrap(); } #[tokio::test] async fn test_large_tlv() { let tlv = vec![0x55; 32768]; let tlv_len = (tlv.len() as u16).to_be_bytes(); let len = (12 + 3 + tlv.len() as u16).to_be_bytes(); let header = super::SIGNATURE // Proxy command, Inet << 4 | Stream .chain([PROXY_V2, TCP_OVER_IPV4].as_slice()) // 12 + 3 bytes .chain(len.as_slice()) // src ip .chain([55, 56, 57, 58].as_slice()) // dst ip .chain([192, 168, 0, 1].as_slice()) // src port .chain([255, 255].as_slice()) // dst port .chain([1, 1].as_slice()) // TLV .chain([255].as_slice()) .chain(tlv_len.as_slice()) .chain(tlv.as_slice()); let extra_data = [0xaa; 256]; let (mut read, info) = read_proxy_protocol(header.chain(extra_data.as_slice())) .await .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); let ConnectHeader::Proxy(info) = info else { panic!() }; assert_eq!(info.addr, ([55, 56, 57, 58], 65535).into()); } #[tokio::test] async fn test_local() { let len = 0u16.to_be_bytes(); let header = super::SIGNATURE .chain([LOCAL_V2, 0x00].as_slice()) .chain(len.as_slice()); let extra_data = [0xaa; 256]; let (mut read, info) = read_proxy_protocol(header.chain(extra_data.as_slice())) .await .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); let ConnectHeader::Local = info else { panic!() }; } } ================================================ FILE: proxy/src/proxy/connect_auth.rs ================================================ use thiserror::Error; use crate::auth::Backend; use crate::auth::backend::ComputeUserInfo; use crate::cache::common::Cache; use crate::compute::{AuthInfo, ComputeConnection, ConnectionError, PostgresError}; use crate::config::ProxyConfig; use 
crate::context::RequestContext; use crate::control_plane::client::ControlPlaneClient; use crate::error::{ReportableError, UserFacingError}; use crate::proxy::connect_compute::{TlsNegotiation, connect_to_compute}; use crate::proxy::retry::ShouldRetryWakeCompute; #[derive(Debug, Error)] pub enum AuthError { #[error(transparent)] Auth(#[from] PostgresError), #[error(transparent)] Connect(#[from] ConnectionError), } impl UserFacingError for AuthError { fn to_string_client(&self) -> String { match self { AuthError::Auth(postgres_error) => postgres_error.to_string_client(), AuthError::Connect(connection_error) => connection_error.to_string_client(), } } } impl ReportableError for AuthError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { AuthError::Auth(postgres_error) => postgres_error.get_error_kind(), AuthError::Connect(connection_error) => connection_error.get_error_kind(), } } } /// Try to connect to the compute node, retrying if necessary. #[tracing::instrument(skip_all)] pub(crate) async fn connect_to_compute_and_auth( ctx: &RequestContext, config: &ProxyConfig, user_info: &Backend<'_, ComputeUserInfo>, auth_info: AuthInfo, tls: TlsNegotiation, ) -> Result { let mut attempt = 0; // NOTE: This is messy, but should hopefully be detangled with PGLB. // We wanted to separate the concerns of **connect** to compute (a PGLB operation), // from **authenticate** to compute (a NeonKeeper operation). // // This unfortunately removed retry handling for one error case where // the compute was cached, and we connected, but the compute cache was actually stale // and is associated with the wrong endpoint. We detect this when the **authentication** fails. // As such, we retry once here if the `authenticate` function fails and the error is valid to retry. 
loop { attempt += 1; let mut node = connect_to_compute(ctx, config, user_info, tls).await?; let res = auth_info.authenticate(ctx, &mut node).await; match res { Ok(()) => return Ok(node), Err(e) => { if attempt < 2 && let Backend::ControlPlane(cplane, user_info) = user_info && let ControlPlaneClient::ProxyV1(cplane_proxy_v1) = &**cplane && e.should_retry_wake_compute() { tracing::warn!(error = ?e, "retrying wake compute"); let key = user_info.endpoint_cache_key(); cplane_proxy_v1.caches.node_info.invalidate(&key); continue; } return Err(e)?; } } } } ================================================ FILE: proxy/src/proxy/connect_compute.rs ================================================ use tokio::time; use tracing::{debug, info, warn}; use crate::cache::node_info::CachedNodeInfo; use crate::compute::{self, COULD_NOT_CONNECT, ComputeConnection}; use crate::config::{ComputeConfig, ProxyConfig, RetryConfig}; use crate::context::RequestContext; use crate::control_plane::NodeInfo; use crate::control_plane::locks::ApiLocks; use crate::metrics::{ ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType, }; use crate::proxy::retry::{ShouldRetryWakeCompute, retry_after, should_retry}; use crate::proxy::wake_compute::{WakeComputeBackend, wake_compute}; use crate::types::Host; /// If we couldn't connect, a cached connection info might be to blame /// (e.g. the compute node's address might've changed at the wrong time). /// Invalidate the cache entry (if any) to prevent subsequent errors. 
#[tracing::instrument(skip_all)]
pub(crate) fn invalidate_cache(node_info: CachedNodeInfo) -> NodeInfo {
    let is_cached = node_info.cached();
    if is_cached {
        warn!("invalidating stalled compute node info cache entry");
    }

    // The failure counter is bumped in both cases; the label records whether
    // the node info had come from the cache.
    let label = if is_cached {
        ConnectionFailureKind::ComputeCached
    } else {
        ConnectionFailureKind::ComputeUncached
    };
    Metrics::get().proxy.connection_failures_total.inc(label);

    // Consumes the cached wrapper and hands back the plain `NodeInfo`.
    node_info.invalidate()
}

/// A single connection attempt to a compute node.
///
/// `connect_to_compute_inner` layers wake-up and retry handling on top of
/// `connect_once`.
pub(crate) trait ConnectMechanism {
    /// The connection type produced by a successful attempt.
    type Connection;

    /// Make exactly one attempt to connect to the node described by `node_info`.
    async fn connect_once(
        &self,
        ctx: &RequestContext,
        node_info: &CachedNodeInfo,
        config: &ComputeConfig,
    ) -> Result;
}

/// The [`ConnectMechanism`] used by [`connect_to_compute`].
struct TcpMechanism<'a> {
    /// connect_to_compute concurrency lock
    locks: &'a ApiLocks,

    // How TLS is negotiated for each attempt; passed through to `connect`.
    tls: TlsNegotiation,
}

/// How TLS is negotiated on the connection to compute.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum TlsNegotiation {
    /// TLS is assumed
    Direct,
    /// We must ask for TLS using the postgres SSLRequest message
    Postgres,
}

impl ConnectMechanism for TcpMechanism<'_> {
    type Connection = ComputeConnection;

    // Acquire a permit keyed by the node's host before connecting, and route
    // the connect outcome through `release_result` on that permit.
    #[tracing::instrument(skip_all, fields(
        pid = tracing::field::Empty,
        compute_id = tracing::field::Empty
    ))]
    async fn connect_once(
        &self,
        ctx: &RequestContext,
        node_info: &CachedNodeInfo,
        config: &ComputeConfig,
    ) -> Result {
        let permit = self.locks.get_permit(&node_info.conn_info.host).await?;

        permit.release_result(
            node_info
                .conn_info
                .connect(ctx, &node_info.aux, config, self.tls)
                .await,
        )
    }
}

/// Try to connect to the compute node, retrying if necessary.
///
/// Thin wrapper around `connect_to_compute_inner` that supplies a
/// `TcpMechanism` built from `config.connect_compute_locks` and `tls`, plus
/// the wake-compute retry settings and compute settings taken from `config`.
#[tracing::instrument(skip_all)]
pub(crate) async fn connect_to_compute(
    ctx: &RequestContext,
    config: &ProxyConfig,
    user_info: &B,
    tls: TlsNegotiation,
) -> Result {
    connect_to_compute_inner(
        ctx,
        &TcpMechanism {
            locks: &config.connect_compute_locks,
            tls,
        },
        user_info,
        config.wake_compute_retry_config,
        &config.connect_to_compute,
    )
    .await
}

/// Try to connect to the compute node, retrying if necessary.
pub(crate) async fn connect_to_compute_inner(
    ctx: &RequestContext,
    mechanism: &M,
    user_info: &B,
    wake_compute_retry_config: RetryConfig,
    compute: &ComputeConfig,
) -> Result {
    // Overall flow:
    //   1. wake the compute and make one connection attempt;
    //   2. on failure, either keep the same node info or invalidate it and wake again;
    //   3. loop on `connect_once` with a delay from `retry_after` until it
    //      succeeds or `should_retry` says stop.
    // Every exit path records the retry count in `retries_metric`.
    let mut num_retries = 0;
    let node_info =
        wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?;

    // try once
    let err = match mechanism.connect_once(ctx, &node_info, compute).await {
        Ok(res) => {
            ctx.success();
            Metrics::get().proxy.retries_metric.observe(
                RetriesMetricGroup {
                    outcome: ConnectOutcome::Success,
                    retry_type: RetryType::ConnectToCompute,
                },
                num_retries.into(),
            );
            return Ok(res);
        }
        Err(e) => e,
    };

    debug!(error = ?err, COULD_NOT_CONNECT);

    let node_info = if !node_info.cached() || !err.should_retry_wake_compute() {
        // If we just received this from cplane and not from the cache, we shouldn't retry.
        // Do not need to retrieve a new node_info, just return the old one.
        if !should_retry(&err, num_retries, compute.retry) {
            Metrics::get().proxy.retries_metric.observe(
                RetriesMetricGroup {
                    outcome: ConnectOutcome::Failed,
                    retry_type: RetryType::ConnectToCompute,
                },
                num_retries.into(),
            );
            return Err(err);
        }
        node_info
    } else {
        // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
        debug!("compute node's state has likely changed; requesting a wake-up");
        invalidate_cache(node_info);
        // TODO: increment num_retries?
        wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?
    };

    // now that we have a new node, try connect to it repeatedly.
    // this can error for a few reasons, for instance:
    // * DNS connection settings haven't quite propagated yet
    debug!("wake_compute success. attempting to connect");
    // The counter is reset here: whatever the wake-up calls accumulated is
    // discarded and the connect loop starts counting from 1.
    num_retries = 1;
    loop {
        match mechanism.connect_once(ctx, &node_info, compute).await {
            Ok(res) => {
                ctx.success();
                Metrics::get().proxy.retries_metric.observe(
                    RetriesMetricGroup {
                        outcome: ConnectOutcome::Success,
                        retry_type: RetryType::ConnectToCompute,
                    },
                    num_retries.into(),
                );
                // TODO: is this necessary? We have a metric.
                info!(?num_retries, "connected to compute node after");
                return Ok(res);
            }
            Err(e) => {
                if !should_retry(&e, num_retries, compute.retry) {
                    // Don't log an error here, caller will print the error
                    Metrics::get().proxy.retries_metric.observe(
                        RetriesMetricGroup {
                            outcome: ConnectOutcome::Failed,
                            retry_type: RetryType::ConnectToCompute,
                        },
                        num_retries.into(),
                    );
                    return Err(e);
                }

                warn!(error = ?e, num_retries, retriable = true, COULD_NOT_CONNECT);
            }
        }

        // The delay is computed from the current retry count before it is incremented.
        let wait_duration = retry_after(num_retries, compute.retry);
        num_retries += 1;

        // The latency-timer pause guard is held only for the duration of the sleep.
        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout);
        time::sleep(wait_duration).await;
        drop(pause);
    }
}

================================================
FILE: proxy/src/proxy/mod.rs
================================================
#[cfg(test)]
mod tests;

pub(crate) mod connect_auth;
pub(crate) mod connect_compute;
pub(crate) mod retry;
pub(crate) mod wake_compute;

use std::collections::HashSet;
use std::convert::Infallible;
use std::sync::Arc;

use futures::TryStreamExt;
use itertools::Itertools;
use once_cell::sync::OnceCell;
use postgres_client::RawCancelToken;
use postgres_client::connect_raw::StartupStream;
use postgres_protocol::message::backend::Message;
use regex::Regex;
use serde::{Deserialize, Serialize};
use smol_str::{SmolStr, format_smolstr};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio::net::TcpStream;
use tokio::sync::oneshot;
use tracing::Instrument;

use crate::cancellation::{CancelClosure, CancellationHandler};
use crate::compute::{ComputeConnection, PostgresError, RustlsStream};
use crate::config::ProxyConfig;
use crate::context::RequestContext;
pub use crate::pglb::copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute};
use crate::pglb::{ClientMode, ClientRequestError};
use crate::pqproto::{BeMessage, CancelKeyData, StartupMessageParams};
use crate::rate_limiter::EndpointRateLimiter;
use crate::stream::{PqStream, Stream};
use crate::types::EndpointCacheKey;
use
crate::{auth, compute};

/// Authenticate a client and hand back an authenticated compute connection.
///
/// Steps, in order:
/// 1. parse the client's credentials from the startup `params`;
/// 2. authenticate the client against `auth_backend`;
/// 3. connect and authenticate to compute via
///    `connect_auth::connect_to_compute_and_auth` (postgres-style TLS negotiation);
/// 4. send the greeting and relay compute's startup messages to the client;
/// 5. spawn a task that runs `maintain_cancel_key` for this session.
///
/// Failures in steps 1-3 are reported to the client through `throw_error`
/// before being returned. On success, returns the compute connection together
/// with the sender half of a oneshot channel whose receiver is owned by the
/// spawned cancellation task.
#[allow(clippy::too_many_arguments)]
pub(crate) async fn handle_client(
    config: &'static ProxyConfig,
    auth_backend: &'static auth::Backend<'static, ()>,
    ctx: &RequestContext,
    cancellation_handler: Arc,
    client: &mut PqStream>,
    mode: &ClientMode,
    endpoint_rate_limiter: Arc,
    common_names: Option<&HashSet>,
    params: &StartupMessageParams,
) -> Result<(ComputeConnection, oneshot::Sender), ClientRequestError> {
    let hostname = mode.hostname(client.get_ref());
    // Extract credentials which we're going to use for auth.
    let result = auth_backend
        .as_ref()
        .map(|()| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, params, hostname, common_names))
        .transpose();

    let user_info = match result {
        Ok(user_info) => user_info,
        Err(e) => Err(client.throw_error(e, Some(ctx)).await)?,
    };

    // Captured before `user_info` is consumed by `authenticate`, for the error span below.
    let user = user_info.get_user().to_owned();
    let user_info = match user_info
        .authenticate(
            ctx,
            client,
            mode.allow_cleartext(),
            &config.authentication_config,
            endpoint_rate_limiter,
        )
        .await
    {
        Ok(auth_result) => auth_result,
        Err(e) => {
            // Attach user/database/application to the span in which the error is thrown.
            let db = params.get("database");
            let app = params.get("application_name");
            let params_span = tracing::info_span!("", ?user, ?db, ?app);

            return Err(client
                .throw_error(e, Some(ctx))
                .instrument(params_span)
                .await)?;
        }
    };

    let (cplane, creds) = match user_info {
        auth::Backend::ControlPlane(cplane, creds) => (cplane, creds),
        auth::Backend::Local(_) => unreachable!("local proxy does not run tcp proxy service"),
    };
    // Presence of the `proxy_params_compat` option switches on compat handling
    // of the startup parameters.
    let params_compat = creds.info.options.get(NeonOptions::PARAMS_COMPAT).is_some();
    let mut auth_info = compute::AuthInfo::with_auth_keys(creds.keys);
    auth_info.set_startup_params(params, params_compat);

    let backend = auth::Backend::ControlPlane(cplane, creds.info);

    // TODO: callback to pglb
    let res = connect_auth::connect_to_compute_and_auth(
        ctx,
        config,
        &backend,
        auth_info,
        connect_compute::TlsNegotiation::Postgres,
    )
    .await;

    let mut node = match res {
        Ok(node) => node,
        Err(e) => Err(client.throw_error(e, Some(ctx)).await)?,
    };

    send_client_greeting(ctx, &config.greetings, client);

    let auth::Backend::ControlPlane(_, user_info) = backend else {
        unreachable!("ensured above");
    };

    // The client is given the proxy's own cancel key (`session.key()`); the
    // compute's real pid/secret are kept for the cancel closure below.
    let session = cancellation_handler.get_key();

    let (process_id, secret_key) =
        forward_compute_params_to_client(ctx, *session.key(), client, &mut node.stream).await?;

    let hostname = node.hostname.to_string();

    let session_id = ctx.session_id();
    let (cancel_on_shutdown, cancel) = oneshot::channel();
    tokio::spawn(async move {
        session
            .maintain_cancel_key(
                session_id,
                cancel,
                &CancelClosure {
                    socket_addr: node.socket_addr,
                    cancel_token: RawCancelToken {
                        ssl_mode: node.ssl_mode,
                        process_id,
                        secret_key,
                    },
                    hostname,
                    user_info,
                },
                &config.connect_to_compute,
            )
            .await;
    });

    Ok((node, cancel_on_shutdown))
}

/// Greet the client with any useful information.
///
/// Queues (does not flush) a notice containing `greetings` and the session id
/// when `greetings` is non-empty, and, when the context carries a testodrome
/// id, a set of `neon.*` parameter-status messages with that id and the
/// recorded latencies in microseconds.
pub(crate) fn send_client_greeting(
    ctx: &RequestContext,
    greetings: &String,
    client: &mut PqStream,
) {
    // Expose session_id to clients if we have a greeting message.
    if !greetings.is_empty() {
        let session_msg = format!("{}, session_id: {}", greetings, ctx.session_id());
        client.write_message(BeMessage::NoticeResponse(session_msg.as_str()));
    }

    // Forward recorded latencies for probing requests
    if let Some(testodrome_id) = ctx.get_testodrome_id() {
        client.write_message(BeMessage::ParameterStatus {
            name: "neon.testodrome_id".as_bytes(),
            value: testodrome_id.as_bytes(),
        });

        let latency_measured = ctx.get_proxy_latency();

        client.write_message(BeMessage::ParameterStatus {
            name: "neon.cplane_latency".as_bytes(),
            value: latency_measured.cplane.as_micros().to_string().as_bytes(),
        });

        client.write_message(BeMessage::ParameterStatus {
            name: "neon.client_latency".as_bytes(),
            value: latency_measured.client.as_micros().to_string().as_bytes(),
        });

        client.write_message(BeMessage::ParameterStatus {
            name: "neon.compute_latency".as_bytes(),
            value: latency_measured.compute.as_micros().to_string().as_bytes(),
        });

        client.write_message(BeMessage::ParameterStatus {
            name: "neon.retry_latency".as_bytes(),
            value: latency_measured.retry.as_micros().to_string().as_bytes(),
        });
    }
}

/// Relay the compute's startup messages to the client until `ReadyForQuery`.
///
/// `BackendKeyData` from compute is replaced by `cancel_key_data` on the
/// client side; the compute's own `(process_id, secret_key)` are returned to
/// the caller (both stay `0` if compute never sent `BackendKeyData`).
/// Parameter statuses and notices are forwarded. An error response, an
/// unexpected message, a read error or end of stream ends the loop, and the
/// resulting error is thrown to the client and returned.
pub(crate) async fn forward_compute_params_to_client(
    ctx: &RequestContext,
    cancel_key_data: CancelKeyData,
    client: &mut PqStream,
    compute: &mut StartupStream,
) -> Result<(i32, i32), ClientRequestError> {
    let mut process_id = 0;
    let mut secret_key = 0;

    // The loop only `break`s with an error; success returns from inside it.
    let err = loop {
        // if the client buffer is too large, let's write out some bytes now to save some space
        client.write_if_full().await?;

        let msg = match compute.try_next().await {
            Ok(msg) => msg,
            Err(e) => break postgres_client::Error::io(e),
        };

        match msg {
            // Send our cancellation key data instead.
            Some(Message::BackendKeyData(body)) => {
                client.write_message(BeMessage::BackendKeyData(cancel_key_data));
                process_id = body.process_id();
                secret_key = body.secret_key();
            }
            // Forward all postgres connection params to the client.
            Some(Message::ParameterStatus(body)) => {
                // Parameters whose name or value cannot be read are silently skipped.
                if let Ok(name) = body.name()
                    && let Ok(value) = body.value()
                {
                    client.write_message(BeMessage::ParameterStatus {
                        name: name.as_bytes(),
                        value: value.as_bytes(),
                    });
                }
            }
            // Forward all notices to the client.
            Some(Message::NoticeResponse(notice)) => {
                client.write_raw(notice.as_bytes().len(), b'N', |buf| {
                    buf.extend_from_slice(notice.as_bytes());
                });
            }
            Some(Message::ReadyForQuery(_)) => {
                client.write_message(BeMessage::ReadyForQuery);
                return Ok((process_id, secret_key));
            }
            Some(Message::ErrorResponse(body)) => break postgres_client::Error::db(body),
            Some(_) => break postgres_client::Error::unexpected_message(),
            None => break postgres_client::Error::closed(),
        }
    };

    Err(client
        .throw_error(PostgresError::Postgres(err), Some(ctx))
        .await)?
}

/// Key/value options extracted from `neon_<key>:<value>` entries (see
/// [`neon_option`]), stored sorted.
#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
pub(crate) struct NeonOptions(Vec<(SmolStr, SmolStr)>);

impl NeonOptions {
    // proxy options:

    /// `PARAMS_COMPAT` allows opting in to forwarding all startup parameters from client to compute.
    pub const PARAMS_COMPAT: &'static str = "proxy_params_compat";

    // cplane options:

    /// `LSN` allows provisioning an ephemeral compute with time-travel to the provided LSN.
    const LSN: &'static str = "lsn";

    /// `TIMESTAMP` allows provisioning an ephemeral compute with time-travel to the provided timestamp.
    const TIMESTAMP: &'static str = "timestamp";

    /// `ENDPOINT_TYPE` allows configuring an ephemeral compute to be read_only or read_write.
    const ENDPOINT_TYPE: &'static str = "endpoint_type";

    /// Build the options from the startup parameters' raw `options`; empty if
    /// `options_raw()` yields nothing.
    pub(crate) fn parse_params(params: &StartupMessageParams) -> Self {
        params
            .options_raw()
            .map(Self::parse_from_iter)
            .unwrap_or_default()
    }

    /// Build the options from a raw `options` string.
    pub(crate) fn parse_options_raw(options: &str) -> Self {
        Self::parse_from_iter(StartupMessageParams::parse_options_raw(options))
    }

    /// Return a clone of the value of the first entry whose key equals `key`.
    pub(crate) fn get(&self, key: &str) -> Option {
        self.0
            .iter()
            .find_map(|(k, v)| (k == key).then_some(v))
            .cloned()
    }

    /// True if any stored option other than `PARAMS_COMPAT` is present.
    pub(crate) fn is_ephemeral(&self) -> bool {
        self.0.iter().any(|(k, _)| match &**k {
            // This is not a cplane option, we know it does not create ephemeral computes.
            Self::PARAMS_COMPAT => false,
            Self::LSN => true,
            Self::TIMESTAMP => true,
            Self::ENDPOINT_TYPE => true,
            // err on the side of caution. any cplane options we don't know about
            // might lead to ephemeral computes.
            _ => true,
        })
    }

    // Keeps only entries accepted by `neon_option` and sorts them, so the
    // stored order does not depend on the order the client sent them in.
    fn parse_from_iter<'a>(options: impl Iterator) -> Self {
        let mut options = options
            .filter_map(neon_option)
            .map(|(k, v)| (k.into(), v.into()))
            .collect_vec();
        options.sort();
        Self(options)
    }

    /// Build a cache key: `prefix` followed by ` {k}:{v}` for every option.
    pub(crate) fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey {
        // prefix + format!(" {k}:{v}")
        // kinda jank because SmolStr is immutable
        std::iter::once(prefix)
            .chain(self.0.iter().flat_map(|(k, v)| [" ", &**k, ":", &**v]))
            .collect::()
            .into()
    }

    /// DeepObject format
    /// `paramName[prop1]=value1&paramName[prop2]=value2&...`
    ///
    /// Each option `k` is emitted under the name `options[k]`.
    pub(crate) fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> {
        self.0
            .iter()
            .map(|(k, v)| (format_smolstr!("options[{}]", k), v.clone()))
            .collect()
    }
}

/// Split a `neon_<key>:<value>` string into `(key, value)`.
///
/// The key is one or more word characters after the `neon_` prefix; the value
/// is the non-empty text after the first colon that follows it. Returns `None`
/// when the input does not have this shape. The regex is compiled once and
/// reused.
pub(crate) fn neon_option(bytes: &str) -> Option<(&str, &str)> {
    static RE: OnceCell = OnceCell::new();
    let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").expect("regex should be correct"));

    let cap = re.captures(bytes)?;
    let (_, [k, v]) = cap.extract();
    Some((k, v))
}

================================================
FILE: proxy/src/proxy/retry.rs
================================================
use std::error::Error;
use std::io;

use tokio::time;

use crate::compute::{self, PostgresError};
use crate::config::RetryConfig;

/// Errors that can say whether repeating the failed operation may help.
pub(crate) trait CouldRetry {
    /// Returns true if the error could be retried
    fn could_retry(&self) -> bool;
}

/// Errors that can say whether the cached node info should be discarded.
pub(crate) trait ShouldRetryWakeCompute {
    /// Returns true if we need to invalidate the cache for this node.
    /// If false, we can continue retrying with the current node cache.
fn should_retry_wake_compute(&self) -> bool;
}

/// True while `num_retries` is below `config.max_retries` and the error
/// itself reports that it could be retried.
pub(crate) fn should_retry(err: &impl CouldRetry, num_retries: u32, config: RetryConfig) -> bool {
    num_retries < config.max_retries && err.could_retry()
}

// Only connection-refused, address-not-available and timed-out I/O errors are retried.
impl CouldRetry for io::Error {
    fn could_retry(&self) -> bool {
        use std::io::ErrorKind;
        matches!(
            self.kind(),
            ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
        )
    }
}

// Database errors trigger a wake-compute retry unless they are on the
// explicit exclusion list below.
impl ShouldRetryWakeCompute for postgres_client::error::DbError {
    fn should_retry_wake_compute(&self) -> bool {
        use postgres_client::error::SqlState;
        // Here are errors that happen after the user successfully authenticated to the database.
        // TODO: there are pgbouncer errors that should be retried, but they are not listed here.
        let non_retriable_pg_errors = matches!(
            self.code(),
            &SqlState::TOO_MANY_CONNECTIONS
                | &SqlState::OUT_OF_MEMORY
                | &SqlState::SYNTAX_ERROR
                | &SqlState::T_R_SERIALIZATION_FAILURE
                | &SqlState::INVALID_CATALOG_NAME
                | &SqlState::INVALID_SCHEMA_NAME
                | &SqlState::INVALID_PARAMETER_VALUE,
        );
        if non_retriable_pg_errors {
            return false;
        }
        // PGBouncer errors that should not trigger a wake_compute retry.
        if self.code() == &SqlState::PROTOCOL_VIOLATION {
            // Source for the error message:
            // https://github.com/pgbouncer/pgbouncer/blob/f15997fe3effe3a94ba8bcc1ea562e6117d1a131/src/client.c#L1070
            // Any other protocol violation still triggers a retry.
            return !self
                .message()
                .contains("no more connections allowed (max_client_conn)");
        }
        true
    }
}

// Delegates to the `DbError` rule when the error's source is a `DbError`;
// every other error triggers a retry.
impl ShouldRetryWakeCompute for postgres_client::Error {
    fn should_retry_wake_compute(&self) -> bool {
        if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
            postgres_client::error::DbError::should_retry_wake_compute(db_err)
        } else {
            // likely an IO error. Possible the compute has shutdown and the
            // cache is stale.
            true
        }
    }
}

// Retryability is delegated to the wrapped error, except that running out of
// connection attempts is never retried.
impl CouldRetry for compute::ConnectionError {
    fn could_retry(&self) -> bool {
        match self {
            compute::ConnectionError::TlsError(err) => err.could_retry(),
            compute::ConnectionError::WakeComputeError(err) => err.could_retry(),
            compute::ConnectionError::TooManyConnectionAttempts(_) => false,
            #[cfg(test)]
            compute::ConnectionError::TestError { retryable, .. } => *retryable,
        }
    }
}

// Every connection error triggers a wake-compute retry except
// `TooManyConnectionAttempts` (and, in tests, errors that opt out).
impl ShouldRetryWakeCompute for compute::ConnectionError {
    fn should_retry_wake_compute(&self) -> bool {
        match self {
            // the cache entry was not checked for validity
            compute::ConnectionError::TooManyConnectionAttempts(_) => false,
            #[cfg(test)]
            compute::ConnectionError::TestError { wakeable, .. } => *wakeable,
            _ => true,
        }
    }
}

// Delegates to the wrapped `postgres_client` error.
impl ShouldRetryWakeCompute for PostgresError {
    fn should_retry_wake_compute(&self) -> bool {
        match self {
            PostgresError::Postgres(error) => error.should_retry_wake_compute(),
        }
    }
}

/// Delay before the next attempt: `base_delay * backoff_factor^(num_retries - 1)`.
///
/// With `num_retries == 1` the delay is exactly `base_delay`.
pub(crate) fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration {
    config
        .base_delay
        .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1))
}

#[cfg(test)]
mod tests {
    use postgres_client::error::{DbError, SqlState};

    use super::ShouldRetryWakeCompute;

    // Pins the `DbError` retry rule: excluded SQLSTATEs, the excluded
    // pgbouncer message, and a sample of states that must still retry.
    #[test]
    fn should_retry_wake_compute_for_db_error() {
        // These SQLStates should NOT trigger a wake_compute retry.
        let non_retry_states = [
            SqlState::TOO_MANY_CONNECTIONS,
            SqlState::OUT_OF_MEMORY,
            SqlState::SYNTAX_ERROR,
            SqlState::T_R_SERIALIZATION_FAILURE,
            SqlState::INVALID_CATALOG_NAME,
            SqlState::INVALID_SCHEMA_NAME,
            SqlState::INVALID_PARAMETER_VALUE,
        ];
        for state in non_retry_states {
            let err = DbError::new_test_error(state.clone(), "oops".to_string());
            assert!(
                !err.should_retry_wake_compute(),
                "State {state:?} unexpectedly retried"
            );
        }

        // Errors coming from pgbouncer should not trigger a wake_compute retry
        let non_retry_pgbouncer_errors = ["no more connections allowed (max_client_conn)"];
        for error in non_retry_pgbouncer_errors {
            let err = DbError::new_test_error(SqlState::PROTOCOL_VIOLATION, error.to_string());
            assert!(
                !err.should_retry_wake_compute(),
                "PGBouncer error {error:?} unexpectedly retried"
            );
        }

        // These SQLStates should trigger a wake_compute retry.
        let retry_states = [
            SqlState::CONNECTION_FAILURE,
            SqlState::CONNECTION_EXCEPTION,
            SqlState::CONNECTION_DOES_NOT_EXIST,
            SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
        ];
        for state in retry_states {
            let err = DbError::new_test_error(state.clone(), "oops".to_string());
            assert!(
                err.should_retry_wake_compute(),
                "State {state:?} unexpectedly skipped retry"
            );
        }
    }
}

================================================
FILE: proxy/src/proxy/tests/mitm.rs
================================================
//! Man-in-the-middle tests
//!
//! Channel binding should prevent a proxy server
//! *that has access to create valid certificates*
//! from controlling the TLS connection.
use std::fmt::Debug;

use bytes::{Bytes, BytesMut};
use futures::{SinkExt, StreamExt};
use postgres_client::tls::TlsConnect;
use postgres_protocol::message::frontend;
use tokio::io::{AsyncReadExt, AsyncWriteExt, DuplexStream};
use tokio_util::codec::{Decoder, Encoder};

use super::*;
use crate::config::TlsConfig;
use crate::context::RequestContext;
use crate::pglb::handshake::{HandshakeData, handshake};

/// What the man-in-the-middle tampers with.
enum Intercept {
    /// Relay every message unchanged.
    None,
    /// Rewrite the server's list of SASL mechanisms.
    Methods,
    /// Rewrite the client's initial SASL response.
    SASLResponse,
}

/// Spawn a man-in-the-middle task between a test client and a test server.
///
/// Returns, in order: the stream the real client should use, the stream the
/// real server should use, the TLS client config matching the MITM's
/// client-facing certificate, and the TLS server config for the real server.
/// The MITM terminates TLS on both sides with its own freshly generated
/// certificates and relays framed postgres messages, tampering according to
/// `intercept`.
async fn proxy_mitm(
    intercept: Intercept,
) -> (DuplexStream, DuplexStream, ClientConfig<'static>, TlsConfig) {
    let (end_server1, client1) = tokio::io::duplex(1024);
    let (server2, end_client2) = tokio::io::duplex(1024);

    let (client_config1, server_config1) =
        generate_tls_config("generic-project-name.localhost", "localhost").unwrap();
    let (client_config2, server_config2) =
        generate_tls_config("generic-project-name.localhost", "localhost").unwrap();

    tokio::spawn(async move {
        // begin handshake with end_server
        let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await;
        let (end_client, startup) = match handshake(
            &RequestContext::test(),
            client1,
            Some(&server_config1),
            false,
        )
        .await
        .unwrap()
        {
            HandshakeData::Startup(stream, params) => (stream, params),
            HandshakeData::Cancel(_) => panic!("cancellation not supported"),
        };

        let mut end_server = tokio_util::codec::Framed::new(end_server, PgFrame);
        let end_client = end_client.flush_and_into_inner().await.unwrap();
        let mut end_client = tokio_util::codec::Framed::new(end_client, PgFrame);

        // give the end_server the startup parameters
        let mut buf = BytesMut::new();
        frontend::startup_message(
            &postgres_protocol::message::frontend::StartupMessageParams {
                params: startup.params.as_bytes().into(),
            },
            &mut buf,
        )
        .unwrap();
        end_server.send(buf.freeze()).await.unwrap();

        // proxy messages between end_client and end_server
        loop {
            tokio::select! {
                message = end_server.next() => {
                    match message {
                        Some(Ok(message)) => {
                            // intercept SASL and return only SCRAM-SHA-256 ;)
                            // Matches an 'R' message whose auth code is 10 and replaces it
                            // with one (length 0x17) listing only "SCRAM-SHA-256".
                            if matches!(intercept, Intercept::Methods) && message.starts_with(b"R") && message[5..].starts_with(&[0,0,0,10]) {
                                end_client.send(Bytes::from_static(b"R\0\0\0\x17\0\0\0\x0aSCRAM-SHA-256\0\0")).await.unwrap();
                                continue;
                            }
                            end_client.send(message).await.unwrap();
                        }
                        _ => break,
                    }
                }
                message = end_client.next() => {
                    match message {
                        Some(Ok(message)) => {
                            // intercept SASL response and return SCRAM-SHA-256 with no channel binding ;)
                            if matches!(intercept, Intercept::SASLResponse) && message.starts_with(b"p") && message[5..].starts_with(b"SCRAM-SHA-256-PLUS\0") {
                                // Skip tag (1) + length (4) + "SCRAM-SHA-256-PLUS\0" (19) + response length (4).
                                let sasl_message = &message[1+4+19+4..];
                                // Swap the channel-binding header for "n,," (no channel binding).
                                let mut new_message = b"n,,".to_vec();
                                new_message.extend_from_slice(sasl_message.strip_prefix(b"p=tls-server-end-point,,").unwrap());

                                let mut buf = BytesMut::new();
                                frontend::sasl_initial_response("SCRAM-SHA-256", &new_message, &mut buf).unwrap();

                                end_server.send(buf.freeze()).await.unwrap();
                                continue;
                            }
                            end_server.send(message).await.unwrap();
                        }
                        _ => break,
                    }
                }
                else => { break }
            }
        }
    });

    (end_server1, end_client2, client_config1, server_config2)
}

/// taken from tokio-postgres
///
/// Sends an SSLRequest, expects the single-byte `S` reply, then performs the
/// TLS handshake with `tls`. Panics on any failure (test helper).
pub(crate) async fn connect_tls(mut stream: S, tls: T) -> T::Stream
where
    S: AsyncRead + AsyncWrite + Unpin,
    T: TlsConnect,
    T::Error: Debug,
{
    let mut buf = BytesMut::new();
    frontend::ssl_request(&mut buf);
    stream.write_all(&buf).await.unwrap();

    let mut buf = [0];
    stream.read_exact(&mut buf).await.unwrap();

    assert!(buf[0] == b'S', "ssl not supported by server");

    tls.connect(stream).await.unwrap()
}

/// Codec that splits a byte stream into whole tagged postgres messages.
struct PgFrame;
impl Decoder for PgFrame {
    type Item = Bytes;
    type Error = std::io::Error;

    fn decode(&mut self, src: &mut BytesMut) -> Result, Self::Error> {
        // Need the 1-byte tag plus the 4-byte length before anything can be decided.
        if src.len() < 5 {
            src.reserve(5 - src.len());
            return Ok(None);
        }
        // Bytes 1..5 hold a big-endian length; the `+ 1` adds the tag byte,
        // which the length field does not count.
        let len = u32::from_be_bytes(src[1..5].try_into().unwrap()) as usize + 1;
        if src.len() < len {
            src.reserve(len - src.len());
            return Ok(None);
        }
        Ok(Some(src.split_to(len).freeze()))
    }
}
// Frames are written out verbatim; they already carry tag and length.
impl Encoder for PgFrame {
    type Error = std::io::Error;

    fn encode(&mut self, item: Bytes, dst: &mut BytesMut) -> Result<(), Self::Error> {
        dst.extend_from_slice(&item);
        Ok(())
    }
}

/// If the client doesn't support channel bindings, it can be exploited.
#[tokio::test]
async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
    let (server, client, client_config, server_config) = proxy_mitm(Intercept::None).await;
    let proxy = tokio::spawn(dummy_proxy(
        client,
        Some(server_config),
        Scram::new("password").await?,
    ));

    let _client_err = postgres_client::Config::new("test".to_owned(), 5432)
        .channel_binding(postgres_client::config::ChannelBinding::Disable)
        .user("user")
        .dbname("db")
        .password("password")
        .ssl_mode(SslMode::Require)
        .tls_and_authenticate(server, client_config.make_tls_connect()?)
        .await?;

    proxy.await?
}

/// If the client chooses SCRAM-PLUS, it will fail
#[tokio::test]
async fn scram_auth_prefer_channel_binding() -> anyhow::Result<()> {
    connect_failure(
        Intercept::None,
        postgres_client::config::ChannelBinding::Prefer,
    )
    .await
}

/// If the MITM pretends like SCRAM-PLUS isn't available, but the client supports it, it will fail
#[tokio::test]
async fn scram_auth_prefer_channel_binding_intercept() -> anyhow::Result<()> {
    connect_failure(
        Intercept::Methods,
        postgres_client::config::ChannelBinding::Prefer,
    )
    .await
}

/// If the MITM pretends like the client doesn't support channel bindings, it will fail
#[tokio::test]
async fn scram_auth_prefer_channel_binding_intercept_response() -> anyhow::Result<()> {
    connect_failure(
        Intercept::SASLResponse,
        postgres_client::config::ChannelBinding::Prefer,
    )
    .await
}

/// If the client chooses SCRAM-PLUS, it will fail
#[tokio::test]
async fn scram_auth_require_channel_binding() -> anyhow::Result<()> {
    connect_failure(
        Intercept::None,
        postgres_client::config::ChannelBinding::Require,
    )
    .await
}

/// If the client requires SCRAM-PLUS, and it is spoofed to remove SCRAM-PLUS, it will fail
#[tokio::test]
async fn scram_auth_require_channel_binding_intercept() -> anyhow::Result<()> {
    connect_failure(
        Intercept::Methods,
        postgres_client::config::ChannelBinding::Require,
    )
    .await
}

/// If the client requires SCRAM-PLUS, and the MITM rewrites its SASL response
/// to plain SCRAM-SHA-256 without channel binding, it will fail
#[tokio::test]
async fn scram_auth_require_channel_binding_intercept_response() -> anyhow::Result<()> {
    connect_failure(
        Intercept::SASLResponse,
        postgres_client::config::ChannelBinding::Require,
    )
    .await
}

/// Run a SCRAM login through the MITM and require that it fails on both ends:
/// the client must get an error and the dummy proxy task must return one too.
async fn connect_failure(
    intercept: Intercept,
    channel_binding: postgres_client::config::ChannelBinding,
) -> anyhow::Result<()> {
    let (server, client, client_config, server_config) = proxy_mitm(intercept).await;
    let proxy = tokio::spawn(dummy_proxy(
        client,
        Some(server_config),
        Scram::new("password").await?,
    ));

    let _client_err = postgres_client::Config::new("test".to_owned(), 5432)
        .channel_binding(channel_binding)
        .user("user")
        .dbname("db")
        .password("password")
        .ssl_mode(SslMode::Require)
        .tls_and_authenticate(server, client_config.make_tls_connect()?)
        .await
        .err()
        .context("client shouldn't be able to connect")?;

    let _server_err = proxy
        .await?
        .err()
        .context("server shouldn't accept client")?;

    Ok(())
}

================================================
FILE: proxy/src/proxy/tests/mod.rs
================================================
//! A group of high-level tests for connection establishing logic and auth.
#![allow(clippy::unimplemented)] mod mitm; use std::sync::Arc; use std::time::Duration; use anyhow::{Context, bail}; use async_trait::async_trait; use http::StatusCode; use postgres_client::config::SslMode; use postgres_client::tls::{MakeTlsConnect, NoTls}; use rstest::rstest; use rustls::crypto::ring; use rustls::pki_types; use tokio::io::{AsyncRead, AsyncWrite, DuplexStream}; use tokio::time::Instant; use tracing_test::traced_test; use super::retry::CouldRetry; use crate::auth::backend::{ComputeUserInfo, MaybeOwned}; use crate::cache::node_info::{CachedNodeInfo, NodeInfoCache}; use crate::config::{CacheOptions, ComputeConfig, RetryConfig, TlsConfig}; use crate::context::RequestContext; use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient}; use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status}; use crate::control_plane::{self, NodeInfo}; use crate::error::ErrorKind; use crate::pglb::ERR_INSECURE_CONNECTION; use crate::pglb::handshake::{HandshakeData, handshake}; use crate::pqproto::BeMessage; use crate::proxy::NeonOptions; use crate::proxy::connect_compute::{ConnectMechanism, connect_to_compute_inner}; use crate::proxy::retry::retry_after; use crate::stream::{PqStream, Stream}; use crate::tls::client_config::compute_client_config_with_certs; use crate::tls::server_config::CertResolver; use crate::types::{BranchId, EndpointId, ProjectId}; use crate::{auth, compute, sasl, scram}; /// Generate a set of TLS certificates: CA + server. fn generate_certs( hostname: &str, common_name: &str, ) -> anyhow::Result<( pki_types::CertificateDer<'static>, pki_types::CertificateDer<'static>, pki_types::PrivateKeyDer<'static>, )> { let ca_key = rcgen::KeyPair::generate()?; let ca = { let mut params = rcgen::CertificateParams::default(); params.is_ca = rcgen::IsCa::Ca(rcgen::BasicConstraints::Unconstrained); params.self_signed(&ca_key)? 
}; let cert_key = rcgen::KeyPair::generate()?; let cert = { let mut params = rcgen::CertificateParams::new(vec![hostname.into()])?; params.distinguished_name = rcgen::DistinguishedName::new(); params .distinguished_name .push(rcgen::DnType::CommonName, common_name); params.signed_by(&cert_key, &ca, &ca_key)? }; Ok(( ca.der().clone(), cert.der().clone(), pki_types::PrivateKeyDer::Pkcs8(cert_key.serialize_der().into()), )) } struct ClientConfig<'a> { config: Arc, hostname: &'a str, } type TlsConnect = >::TlsConnect; impl ClientConfig<'_> { fn make_tls_connect(self) -> anyhow::Result> { Ok(crate::tls::postgres_rustls::make_tls_connect( &self.config, self.hostname, )?) } } /// Generate TLS certificates and build rustls configs for client and server. fn generate_tls_config<'a>( hostname: &'a str, common_name: &'a str, ) -> anyhow::Result<(ClientConfig<'a>, TlsConfig)> { let (ca, cert, key) = generate_certs(hostname, common_name)?; let tls_config = { let config = rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() .context("ring should support the default protocol versions")? 
.with_no_client_auth() .with_single_cert(vec![cert.clone()], key.clone_key())?; let cert_resolver = CertResolver::new(key, vec![cert])?; let common_names = cert_resolver.get_common_names(); let config = Arc::new(config); TlsConfig { http_config: config.clone(), pg_config: config, common_names, cert_resolver: Arc::new(cert_resolver), } }; let client_config = { let config = Arc::new(compute_client_config_with_certs([ca])); ClientConfig { config, hostname } }; Ok((client_config, tls_config)) } #[async_trait] trait TestAuth: Sized { async fn authenticate( self, stream: &mut PqStream>, ) -> anyhow::Result<()> { stream.write_message(BeMessage::AuthenticationOk); Ok(()) } } struct NoAuth; impl TestAuth for NoAuth {} struct Scram(scram::ServerSecret); impl Scram { async fn new(password: &str) -> anyhow::Result { let secret = scram::ServerSecret::build(password) .await .context("failed to generate scram secret")?; Ok(Scram(secret)) } fn mock() -> Self { Scram(scram::ServerSecret::mock(rand::random())) } } #[async_trait] impl TestAuth for Scram { async fn authenticate( self, stream: &mut PqStream>, ) -> anyhow::Result<()> { let outcome = auth::AuthFlow::new(stream, auth::Scram(&self.0, &RequestContext::test())) .authenticate() .await?; use sasl::Outcome::*; match outcome { Success(_) => Ok(()), Failure(reason) => bail!("autentication failed with an error: {reason}"), } } } /// A dummy proxy impl which performs a handshake and reports auth success. async fn dummy_proxy( client: impl AsyncRead + AsyncWrite + Unpin + Send, tls: Option, auth: impl TestAuth + Send, ) -> anyhow::Result<()> { let mut stream = match handshake(&RequestContext::test(), client, tls.as_ref(), false).await? 
{ HandshakeData::Startup(stream, _) => stream, HandshakeData::Cancel(_) => bail!("cancellation not supported"), }; auth.authenticate(&mut stream).await?; stream.write_message(BeMessage::ParameterStatus { name: b"client_encoding", value: b"UTF8", }); stream.write_message(BeMessage::ReadyForQuery); stream.flush().await?; Ok(()) } #[tokio::test] async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); let (_, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); let client_err = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Disable) .tls_and_authenticate(server, NoTls) .await .err() // -> Option .context("client shouldn't be able to connect")?; assert!(client_err.to_string().contains(ERR_INSECURE_CONNECTION)); let server_err = proxy .await? .err() // -> Option .context("server shouldn't accept client")?; assert!(client_err.to_string().contains(&server_err.to_string())); Ok(()) } #[tokio::test] async fn handshake_tls() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); let (client_config, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); let _conn = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Require) .tls_and_authenticate(server, client_config.make_tls_connect()?) .await?; proxy.await? 
} #[tokio::test] async fn handshake_raw() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth)); let _conn = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") .set_param("options", "project=generic-project-name") .ssl_mode(SslMode::Prefer) .tls_and_authenticate(server, NoTls) .await?; proxy.await? } #[tokio::test] async fn keepalive_is_inherited() -> anyhow::Result<()> { use tokio::net::{TcpListener, TcpStream}; let listener = TcpListener::bind("127.0.0.1:0").await?; let port = listener.local_addr()?.port(); socket2::SockRef::from(&listener).set_keepalive(true)?; let t = tokio::spawn(async move { let (client, _) = listener.accept().await?; let keepalive = socket2::SockRef::from(&client).keepalive()?; anyhow::Ok(keepalive) }); TcpStream::connect(("127.0.0.1", port)).await?; assert!(t.await??, "keepalive should be inherited"); Ok(()) } #[rstest] #[case("password_foo")] #[case("pwd-bar")] #[case("")] #[tokio::test] async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); let (client_config, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), Scram::new(password).await?, )); let _conn = postgres_client::Config::new("test".to_owned(), 5432) .channel_binding(postgres_client::config::ChannelBinding::Require) .user("user") .dbname("db") .password(password) .ssl_mode(SslMode::Require) .tls_and_authenticate(server, client_config.make_tls_connect()?) .await?; proxy.await? 
} #[tokio::test] async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); let (client_config, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), Scram::new("password").await?, )); let _conn = postgres_client::Config::new("test".to_owned(), 5432) .channel_binding(postgres_client::config::ChannelBinding::Disable) .user("user") .dbname("db") .password("password") .ssl_mode(SslMode::Require) .tls_and_authenticate(server, client_config.make_tls_connect()?) .await?; proxy.await? } #[tokio::test] async fn scram_auth_mock() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); let (client_config, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock())); use rand::Rng; use rand::distr::Alphanumeric; let password: String = rand::rng() .sample_iter(&Alphanumeric) .take(rand::random::() as usize) .map(char::from) .collect(); let _client_err = postgres_client::Config::new("test".to_owned(), 5432) .user("user") .dbname("db") .password(&password) // no password will match the mocked secret .ssl_mode(SslMode::Require) .tls_and_authenticate(server, client_config.make_tls_connect()?) .await .err() // -> Option .context("client shouldn't be able to connect")?; let _server_err = proxy .await? 
.err() // -> Option .context("server shouldn't accept client")?; Ok(()) } #[test] fn connect_compute_total_wait() { let mut total_wait = tokio::time::Duration::ZERO; let config = RetryConfig { base_delay: Duration::from_secs(1), max_retries: 5, backoff_factor: 2.0, }; for num_retries in 1..config.max_retries { total_wait += retry_after(num_retries, config); } assert!(f64::abs(total_wait.as_secs_f64() - 15.0) < 0.1); } #[derive(Clone, Copy, Debug)] enum ConnectAction { Wake, WakeCold, WakeFail, WakeRetry, Connect, // connect_once -> Err, could_retry = true, should_retry_wake_compute = true Retry, // connect_once -> Err, could_retry = true, should_retry_wake_compute = false RetryNoWake, // connect_once -> Err, could_retry = false, should_retry_wake_compute = true Fail, // connect_once -> Err, could_retry = false, should_retry_wake_compute = false FailNoWake, } #[derive(Clone)] struct TestConnectMechanism { counter: Arc>, sequence: Vec, cache: &'static NodeInfoCache, } impl TestConnectMechanism { fn verify(&self) { let counter = self.counter.lock().unwrap(); assert_eq!( *counter, self.sequence.len(), "sequence does not proceed to the end" ); } } impl TestConnectMechanism { fn new(sequence: Vec) -> Self { Self { counter: Arc::new(std::sync::Mutex::new(0)), sequence, cache: Box::leak(Box::new(NodeInfoCache::new(CacheOptions { size: Some(1), absolute_ttl: Some(Duration::from_secs(100)), idle_ttl: None, }))), } } } #[derive(Debug)] struct TestConnection; impl ConnectMechanism for TestConnectMechanism { type Connection = TestConnection; async fn connect_once( &self, _ctx: &RequestContext, _node_info: &CachedNodeInfo, _config: &ComputeConfig, ) -> Result { let mut counter = self.counter.lock().unwrap(); let action = self.sequence[*counter]; *counter += 1; match action { ConnectAction::Connect => Ok(TestConnection), ConnectAction::Retry => Err(compute::ConnectionError::TestError { retryable: true, wakeable: true, kind: ErrorKind::Compute, }), ConnectAction::RetryNoWake => 
Err(compute::ConnectionError::TestError { retryable: true, wakeable: false, kind: ErrorKind::Compute, }), ConnectAction::Fail => Err(compute::ConnectionError::TestError { retryable: false, wakeable: true, kind: ErrorKind::Compute, }), ConnectAction::FailNoWake => Err(compute::ConnectionError::TestError { retryable: false, wakeable: false, kind: ErrorKind::Compute, }), x => panic!("expecting action {x:?}, connect is called instead"), } } } impl TestControlPlaneClient for TestConnectMechanism { fn wake_compute(&self) -> Result { let mut counter = self.counter.lock().unwrap(); let action = self.sequence[*counter]; *counter += 1; match action { ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeCold => Ok(CachedNodeInfo::new_uncached( helper_create_uncached_node_info(), )), ConnectAction::WakeFail => { let err = control_plane::errors::ControlPlaneError::Message(Box::new( ControlPlaneErrorMessage { http_status_code: StatusCode::BAD_REQUEST, error: "TEST".into(), status: None, }, )); assert!(!err.could_retry()); Err(control_plane::errors::WakeComputeError::ControlPlane(err)) } ConnectAction::WakeRetry => { let err = control_plane::errors::ControlPlaneError::Message(Box::new( ControlPlaneErrorMessage { http_status_code: StatusCode::BAD_REQUEST, error: "TEST".into(), status: Some(Status { code: "error".into(), message: "error".into(), details: Details { error_info: None, retry_info: Some(control_plane::messages::RetryInfo { retry_at: Instant::now() + Duration::from_millis(1), }), user_facing_message: None, }, }), }, )); assert!(err.could_retry()); Err(control_plane::errors::WakeComputeError::ControlPlane(err)) } x => panic!("expecting action {x:?}, wake_compute is called instead"), } } fn get_access_control( &self, ) -> Result { unimplemented!("not used in tests") } fn dyn_clone(&self) -> Box { Box::new(self.clone()) } } fn helper_create_uncached_node_info() -> NodeInfo { NodeInfo { conn_info: compute::ConnectInfo { host: 
"test".into(), port: 5432, ssl_mode: SslMode::Disable, host_addr: None, }, aux: MetricsAuxInfo { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), branch_id: (&BranchId::from("branch")).into(), compute_id: "compute".into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, } } fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { let node = helper_create_uncached_node_info(); cache.insert("key".into(), Ok(node.clone())); CachedNodeInfo { token: Some((cache, "key".into())), value: node, } } fn helper_create_connect_info( mechanism: &TestConnectMechanism, ) -> auth::Backend<'static, ComputeUserInfo> { auth::Backend::ControlPlane( MaybeOwned::Owned(ControlPlaneClient::Test(Box::new(mechanism.clone()))), ComputeUserInfo { endpoint: "endpoint".into(), user: "user".into(), options: NeonOptions::parse_options_raw(""), }, ) } fn config() -> ComputeConfig { let retry = RetryConfig { base_delay: Duration::from_secs(1), max_retries: 5, backoff_factor: 2.0, }; ComputeConfig { retry, tls: Arc::new(compute_client_config_with_certs(std::iter::empty())), timeout: Duration::from_secs(2), } } #[tokio::test] async fn connect_to_compute_success() { let _ = env_logger::try_init(); use ConnectAction::*; let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = config(); connect_to_compute_inner(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap(); mechanism.verify(); } #[tokio::test] async fn connect_to_compute_retry() { let _ = env_logger::try_init(); use ConnectAction::*; let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = config(); connect_to_compute_inner(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap(); 
mechanism.verify(); } /// Test that we don't retry if the error is not retryable. #[tokio::test] async fn connect_to_compute_non_retry_1() { let _ = env_logger::try_init(); use ConnectAction::*; let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); let user_info = helper_create_connect_info(&mechanism); let config = config(); connect_to_compute_inner(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap_err(); mechanism.verify(); } /// Even for non-retryable errors, we should retry at least once. #[tokio::test] async fn connect_to_compute_non_retry_2() { let _ = env_logger::try_init(); use ConnectAction::*; let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = config(); connect_to_compute_inner(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap(); mechanism.verify(); } /// Retry for at most `NUM_RETRIES_CONNECT` times. #[tokio::test] async fn connect_to_compute_non_retry_3() { let _ = env_logger::try_init(); tokio::time::pause(); use ConnectAction::*; let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]); let user_info = helper_create_connect_info(&mechanism); let wake_compute_retry_config = RetryConfig { base_delay: Duration::from_secs(1), max_retries: 1, backoff_factor: 2.0, }; let config = config(); connect_to_compute_inner( &ctx, &mechanism, &user_info, wake_compute_retry_config, &config, ) .await .unwrap_err(); mechanism.verify(); } /// Should retry wake compute. 
#[tokio::test] async fn wake_retry() { let _ = env_logger::try_init(); use ConnectAction::*; let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = config(); connect_to_compute_inner(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap(); mechanism.verify(); } /// Wake failed with a non-retryable error. #[tokio::test] async fn wake_non_retry() { let _ = env_logger::try_init(); use ConnectAction::*; let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); let user_info = helper_create_connect_info(&mechanism); let config = config(); connect_to_compute_inner(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap_err(); mechanism.verify(); } #[tokio::test] #[traced_test] async fn fail_but_wake_invalidates_cache() { let ctx = RequestContext::test(); let mech = TestConnectMechanism::new(vec![ ConnectAction::Wake, ConnectAction::Fail, ConnectAction::Wake, ConnectAction::Connect, ]); let user = helper_create_connect_info(&mech); let cfg = config(); connect_to_compute_inner(&ctx, &mech, &user, cfg.retry, &cfg) .await .unwrap(); assert!(logs_contain( "invalidating stalled compute node info cache entry" )); } #[tokio::test] #[traced_test] async fn fail_no_wake_skips_cache_invalidation() { let ctx = RequestContext::test(); let mech = TestConnectMechanism::new(vec![ ConnectAction::Wake, ConnectAction::RetryNoWake, ConnectAction::Connect, ]); let user = helper_create_connect_info(&mech); let cfg = config(); connect_to_compute_inner(&ctx, &mech, &user, cfg.retry, &cfg) .await .unwrap(); assert!(!logs_contain( "invalidating stalled compute node info cache entry" )); } #[tokio::test] #[traced_test] async fn retry_but_wake_invalidates_cache() { let _ = env_logger::try_init(); use ConnectAction::*; let ctx = RequestContext::test(); // Wake → Retry (retryable + wakeable) → Wake → Connect 
let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let cfg = config(); connect_to_compute_inner(&ctx, &mechanism, &user_info, cfg.retry, &cfg) .await .unwrap(); mechanism.verify(); // Because Retry has wakeable=true, we should see invalidate_cache assert!(logs_contain( "invalidating stalled compute node info cache entry" )); } #[tokio::test] #[traced_test] async fn retry_no_wake_skips_invalidation() { let _ = env_logger::try_init(); use ConnectAction::*; let ctx = RequestContext::test(); // Wake → RetryNoWake (retryable + NOT wakeable) let mechanism = TestConnectMechanism::new(vec![Wake, RetryNoWake, Fail]); let user_info = helper_create_connect_info(&mechanism); let cfg = config(); connect_to_compute_inner(&ctx, &mechanism, &user_info, cfg.retry, &cfg) .await .unwrap_err(); mechanism.verify(); // Because RetryNoWake has wakeable=false, we must NOT see invalidate_cache assert!(!logs_contain( "invalidating stalled compute node info cache entry" )); } #[tokio::test] #[traced_test] async fn retry_no_wake_error_fast() { let _ = env_logger::try_init(); use ConnectAction::*; let ctx = RequestContext::test(); // Wake → FailNoWake (not retryable + NOT wakeable) let mechanism = TestConnectMechanism::new(vec![Wake, FailNoWake]); let user_info = helper_create_connect_info(&mechanism); let cfg = config(); connect_to_compute_inner(&ctx, &mechanism, &user_info, cfg.retry, &cfg) .await .unwrap_err(); mechanism.verify(); // Because FailNoWake has wakeable=false, we must NOT see invalidate_cache assert!(!logs_contain( "invalidating stalled compute node info cache entry" )); } #[tokio::test] #[traced_test] async fn retry_cold_wake_skips_invalidation() { let _ = env_logger::try_init(); use ConnectAction::*; let ctx = RequestContext::test(); // WakeCold → FailNoWake (not retryable + NOT wakeable) let mechanism = TestConnectMechanism::new(vec![WakeCold, Retry, Connect]); let user_info = 
helper_create_connect_info(&mechanism); let cfg = config(); connect_to_compute_inner(&ctx, &mechanism, &user_info, cfg.retry, &cfg) .await .unwrap(); mechanism.verify(); } ================================================ FILE: proxy/src/proxy/wake_compute.rs ================================================ use async_trait::async_trait; use tracing::{error, info}; use crate::cache::node_info::CachedNodeInfo; use crate::config::RetryConfig; use crate::context::RequestContext; use crate::control_plane::errors::{ControlPlaneError, WakeComputeError}; use crate::error::ReportableError; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, }; use crate::proxy::retry::{retry_after, should_retry}; // Use macro to retain original callsite. macro_rules! log_wake_compute_error { (error = ?$error:expr, $num_retries:expr, retriable = $retriable:literal) => { match $error { WakeComputeError::ControlPlane(ControlPlaneError::Message(_)) => { info!(error = ?$error, num_retries = $num_retries, retriable = $retriable, "couldn't wake compute node") } _ => error!(error = ?$error, num_retries = $num_retries, retriable = $retriable, "couldn't wake compute node"), } }; } #[async_trait] pub(crate) trait WakeComputeBackend { async fn wake_compute(&self, ctx: &RequestContext) -> Result; } pub(crate) async fn wake_compute( num_retries: &mut u32, ctx: &RequestContext, api: &B, config: RetryConfig, ) -> Result { loop { match api.wake_compute(ctx).await { Err(e) if !should_retry(&e, *num_retries, config) => { log_wake_compute_error!(error = ?e, num_retries, retriable = false); report_error(&e, false); Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Failed, retry_type: RetryType::WakeCompute, }, (*num_retries).into(), ); return Err(e); } Err(e) => { log_wake_compute_error!(error = ?e, num_retries, retriable = true); report_error(&e, true); } Ok(n) => { Metrics::get().proxy.retries_metric.observe( 
RetriesMetricGroup { outcome: ConnectOutcome::Success, retry_type: RetryType::WakeCompute, }, (*num_retries).into(), ); // TODO: is this necessary? We have a metric. // TODO: this log line is misleading as "wake_compute" might return cached (and stale) info. info!(?num_retries, "compute node woken up after"); return Ok(n); } } let wait_duration = retry_after(*num_retries, config); *num_retries += 1; let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout); tokio::time::sleep(wait_duration).await; drop(pause); } } fn report_error(e: &WakeComputeError, retry: bool) { let kind = e.get_error_kind(); Metrics::get() .proxy .connection_failures_breakdown .inc(ConnectionFailuresBreakdownGroup { kind, retry: retry.into(), }); } ================================================ FILE: proxy/src/rate_limiter/leaky_bucket.rs ================================================ use std::hash::Hash; use std::sync::atomic::{AtomicUsize, Ordering}; use ahash::RandomState; use clashmap::ClashMap; use rand::Rng; use tokio::time::Instant; use tracing::info; use utils::leaky_bucket::LeakyBucketState; use crate::intern::EndpointIdInt; // Simple per-endpoint rate limiter. pub type EndpointRateLimiter = LeakyBucketRateLimiter; pub struct LeakyBucketRateLimiter { map: ClashMap, default_config: utils::leaky_bucket::LeakyBucketConfig, access_count: AtomicUsize, } impl LeakyBucketRateLimiter { pub const DEFAULT: LeakyBucketConfig = LeakyBucketConfig { rps: 600.0, max: 1500.0, }; pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self { Self { map: ClashMap::with_hasher_and_shard_amount(RandomState::new(), shards), default_config: config.into(), access_count: AtomicUsize::new(0), } } /// Check that number of connections to the endpoint is below `max_rps` rps. 
pub(crate) fn check(&self, key: K, config: Option, n: u32) -> bool { let now = Instant::now(); let config = config.map_or(self.default_config, Into::into); if self .access_count .fetch_add(1, Ordering::AcqRel) .is_multiple_of(2048) { self.do_gc(now); } let mut entry = self .map .entry(key) .or_insert_with(|| LeakyBucketState { empty_at: now }); entry.add_tokens(&config, now, n as f64).is_ok() } fn do_gc(&self, now: Instant) { info!( "cleaning up bucket rate limiter, current size = {}", self.map.len() ); let n = self.map.shards().len(); let shard = rand::rng().random_range(0..n); self.map.shards()[shard] .write() .retain(|(_, value)| !value.bucket_is_empty(now)); } } pub struct LeakyBucketConfig { pub rps: f64, pub max: f64, } impl LeakyBucketConfig { pub fn new(rps: f64, max: f64) -> Self { assert!(rps > 0.0, "rps must be positive"); assert!(max > 0.0, "max must be positive"); Self { rps, max } } } impl From for utils::leaky_bucket::LeakyBucketConfig { fn from(config: LeakyBucketConfig) -> Self { utils::leaky_bucket::LeakyBucketConfig::new(config.rps, config.max) } } #[cfg(test)] #[allow(clippy::float_cmp)] mod tests { use std::time::Duration; use tokio::time::Instant; use utils::leaky_bucket::LeakyBucketState; use super::LeakyBucketConfig; #[tokio::test(start_paused = true)] async fn check() { let config: utils::leaky_bucket::LeakyBucketConfig = LeakyBucketConfig::new(500.0, 2000.0).into(); assert_eq!(config.cost, Duration::from_millis(2)); assert_eq!(config.bucket_width, Duration::from_secs(4)); let mut bucket = LeakyBucketState { empty_at: Instant::now(), }; // should work for 2000 requests this second for _ in 0..2000 { bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); } bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); assert_eq!(bucket.empty_at - Instant::now(), config.bucket_width); // in 1ms we should drain 0.5 tokens. 
// make sure we don't lose any tokens tokio::time::advance(Duration::from_millis(1)).await; bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); tokio::time::advance(Duration::from_millis(1)).await; bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); // in 10ms we should drain 5 tokens tokio::time::advance(Duration::from_millis(10)).await; for _ in 0..5 { bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); } bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); // in 10s we should drain 5000 tokens // but cap is only 2000 tokio::time::advance(Duration::from_secs(10)).await; for _ in 0..2000 { bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); } bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); // should sustain 500rps for _ in 0..2000 { tokio::time::advance(Duration::from_millis(10)).await; for _ in 0..5 { bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); } } } } ================================================ FILE: proxy/src/rate_limiter/limit_algorithm/aimd.rs ================================================ use super::{LimitAlgorithm, Outcome, Sample}; /// Loss-based congestion avoidance. /// /// Additive-increase, multiplicative decrease. /// /// Adds available currency when: /// 1. no load-based errors are observed, and /// 2. the utilisation of the current limit is high. /// /// Reduces available concurrency by a factor when load-based errors are detected. #[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)] pub(crate) struct Aimd { /// Minimum limit for AIMD algorithm. pub(crate) min: usize, /// Maximum limit for AIMD algorithm. pub(crate) max: usize, /// Decrease AIMD decrease by value in case of error. pub(crate) dec: f32, /// Increase AIMD increase by value in case of success. pub(crate) inc: usize, /// A threshold below which the limit won't be increased. 
pub(crate) utilisation: f32, } impl LimitAlgorithm for Aimd { fn update(&self, old_limit: usize, sample: Sample) -> usize { match sample.outcome { Outcome::Success => { let utilisation = sample.in_flight as f32 / old_limit as f32; if utilisation > self.utilisation { let limit = old_limit + self.inc; let new_limit = limit.clamp(self.min, self.max); if new_limit > old_limit { tracing::info!(old_limit, new_limit, "limit increased"); } else { tracing::debug!(old_limit, new_limit, "limit clamped at max"); } new_limit } else { old_limit } } Outcome::Overload => { let new_limit = old_limit as f32 * self.dec; // Floor instead of round, so the limit reduces even with small numbers. // E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1 let new_limit = new_limit.floor() as usize; let new_limit = new_limit.clamp(self.min, self.max); if new_limit < old_limit { tracing::info!(old_limit, new_limit, "limit decreased"); } else { tracing::debug!(old_limit, new_limit, "limit clamped at min"); } new_limit } } } } #[cfg(test)] mod tests { use std::time::Duration; use super::*; use crate::rate_limiter::limit_algorithm::{ DynamicLimiter, RateLimitAlgorithm, RateLimiterConfig, }; #[tokio::test(start_paused = true)] async fn increase_decrease() { let config = RateLimiterConfig { initial_limit: 1, algorithm: RateLimitAlgorithm::Aimd { conf: Aimd { min: 1, max: 2, inc: 10, dec: 0.5, utilisation: 0.8, }, }, }; let limiter = DynamicLimiter::new(config); let token = limiter .acquire_timeout(Duration::from_millis(1)) .await .unwrap(); token.release(Outcome::Success); assert_eq!(limiter.state().limit(), 2); let token = limiter .acquire_timeout(Duration::from_millis(1)) .await .unwrap(); token.release(Outcome::Success); assert_eq!(limiter.state().limit(), 2); let token = limiter .acquire_timeout(Duration::from_millis(1)) .await .unwrap(); token.release(Outcome::Overload); assert_eq!(limiter.state().limit(), 1); let token = limiter .acquire_timeout(Duration::from_millis(1)) .await .unwrap(); 
token.release(Outcome::Overload); assert_eq!(limiter.state().limit(), 1); } #[tokio::test(start_paused = true)] async fn should_decrease_limit_on_overload() { let config = RateLimiterConfig { initial_limit: 10, algorithm: RateLimitAlgorithm::Aimd { conf: Aimd { min: 1, max: 1500, inc: 10, dec: 0.5, utilisation: 0.8, }, }, }; let limiter = DynamicLimiter::new(config); let token = limiter .acquire_timeout(Duration::from_millis(100)) .await .unwrap(); token.release(Outcome::Overload); assert_eq!(limiter.state().limit(), 5, "overload: decrease"); } #[tokio::test(start_paused = true)] async fn acquire_timeout_times_out() { let config = RateLimiterConfig { initial_limit: 1, algorithm: RateLimitAlgorithm::Aimd { conf: Aimd { min: 1, max: 2, inc: 10, dec: 0.5, utilisation: 0.8, }, }, }; let limiter = DynamicLimiter::new(config); let token = limiter .acquire_timeout(Duration::from_millis(1)) .await .unwrap(); let now = tokio::time::Instant::now(); limiter .acquire_timeout(Duration::from_secs(1)) .await .err() .unwrap(); assert!(now.elapsed() >= Duration::from_secs(1)); token.release(Outcome::Success); assert_eq!(limiter.state().limit(), 2); } #[tokio::test(start_paused = true)] async fn should_increase_limit_on_success_when_using_gt_util_threshold() { let config = RateLimiterConfig { initial_limit: 4, algorithm: RateLimitAlgorithm::Aimd { conf: Aimd { min: 1, max: 1500, inc: 1, dec: 0.5, utilisation: 0.5, }, }, }; let limiter = DynamicLimiter::new(config); let token = limiter .acquire_timeout(Duration::from_millis(1)) .await .unwrap(); let _token = limiter .acquire_timeout(Duration::from_millis(1)) .await .unwrap(); let _token = limiter .acquire_timeout(Duration::from_millis(1)) .await .unwrap(); token.release(Outcome::Success); assert_eq!(limiter.state().limit(), 5, "success: increase"); } #[tokio::test(start_paused = true)] async fn should_not_change_limit_on_success_when_using_lt_util_threshold() { let config = RateLimiterConfig { initial_limit: 4, algorithm: 
RateLimitAlgorithm::Aimd { conf: Aimd { min: 1, max: 1500, inc: 10, dec: 0.5, utilisation: 0.5, }, }, }; let limiter = DynamicLimiter::new(config); let token = limiter .acquire_timeout(Duration::from_millis(1)) .await .unwrap(); token.release(Outcome::Success); assert_eq!( limiter.state().limit(), 4, "success: ignore when < half limit" ); } #[tokio::test(start_paused = true)] async fn should_not_change_limit_when_no_outcome() { let config = RateLimiterConfig { initial_limit: 10, algorithm: RateLimitAlgorithm::Aimd { conf: Aimd { min: 1, max: 1500, inc: 10, dec: 0.5, utilisation: 0.5, }, }, }; let limiter = DynamicLimiter::new(config); let token = limiter .acquire_timeout(Duration::from_millis(1)) .await .unwrap(); drop(token); assert_eq!(limiter.state().limit(), 10, "ignore"); } } ================================================ FILE: proxy/src/rate_limiter/limit_algorithm.rs ================================================ //! Algorithms for controlling concurrency limits. use std::pin::pin; use std::sync::Arc; use std::time::Duration; use parking_lot::Mutex; use tokio::sync::Notify; use tokio::time::Instant; use tokio::time::error::Elapsed; use self::aimd::Aimd; pub(crate) mod aimd; /// Whether a job succeeded or failed as a result of congestion/overload. /// /// Errors not considered to be caused by overload should be ignored. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub(crate) enum Outcome { /// The job succeeded, or failed in a way unrelated to overload. Success, /// The job failed because of overload, e.g. it timed out or an explicit backpressure signal /// was observed. Overload, } /// An algorithm for controlling a concurrency limit. pub(crate) trait LimitAlgorithm: Send + Sync + 'static { /// Update the concurrency limit in response to a new job completion. fn update(&self, old_limit: usize, sample: Sample) -> usize; } /// The result of a job (or jobs), including the [`Outcome`] (loss) and latency (delay). 
#[derive(Debug, Clone, PartialEq, Eq, Copy)] pub(crate) struct Sample { pub(crate) latency: Duration, /// Jobs in flight when the sample was taken. pub(crate) in_flight: usize, pub(crate) outcome: Outcome, } #[derive(Clone, Copy, Debug, Default, serde::Deserialize, PartialEq)] #[serde(rename_all = "snake_case")] pub(crate) enum RateLimitAlgorithm { #[default] Fixed, Aimd { #[serde(flatten)] conf: Aimd, }, } pub(crate) struct Fixed; impl LimitAlgorithm for Fixed { fn update(&self, old_limit: usize, _sample: Sample) -> usize { old_limit } } #[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)] pub struct RateLimiterConfig { #[serde(flatten)] pub(crate) algorithm: RateLimitAlgorithm, pub(crate) initial_limit: usize, } impl RateLimiterConfig { pub(crate) fn create_rate_limit_algorithm(self) -> Box { match self.algorithm { RateLimitAlgorithm::Fixed => Box::new(Fixed), RateLimitAlgorithm::Aimd { conf } => Box::new(conf), } } } pub(crate) struct LimiterInner { alg: Box, available: usize, limit: usize, in_flight: usize, } impl LimiterInner { fn update_limit(&mut self, latency: Duration, outcome: Option) { if let Some(outcome) = outcome { let sample = Sample { latency, in_flight: self.in_flight, outcome, }; self.limit = self.alg.update(self.limit, sample); } } fn take(&mut self, ready: &Notify) -> Option<()> { if self.available >= 1 { self.available -= 1; self.in_flight += 1; // tell the next in the queue that there is a permit ready if self.available >= 1 { ready.notify_one(); } Some(()) } else { None } } } /// Limits the number of concurrent jobs. /// /// Concurrency is limited through the use of [`Token`]s. Acquire a token to run a job, and release the /// token once the job is finished. /// /// The limit will be automatically adjusted based on observed latency (delay) and/or failures /// caused by overload (loss). 
pub(crate) struct DynamicLimiter { config: RateLimiterConfig, inner: Mutex, // to notify when a token is available ready: Notify, } /// A concurrency token, required to run a job. /// /// Release the token back to the [`DynamicLimiter`] after the job is complete. pub(crate) struct Token { start: Instant, limiter: Option>, } /// A snapshot of the state of the [`DynamicLimiter`]. /// /// Not guaranteed to be consistent under high concurrency. #[derive(Debug, Clone, Copy)] #[cfg(test)] struct LimiterState { limit: usize, } impl DynamicLimiter { /// Create a limiter with a given limit control algorithm. pub(crate) fn new(config: RateLimiterConfig) -> Arc { let ready = Notify::new(); ready.notify_one(); Arc::new(Self { inner: Mutex::new(LimiterInner { alg: config.create_rate_limit_algorithm(), available: config.initial_limit, limit: config.initial_limit, in_flight: 0, }), ready, config, }) } /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available. pub(crate) async fn acquire_timeout( self: &Arc, duration: Duration, ) -> Result { tokio::time::timeout(duration, self.acquire()).await? } /// Try to acquire a concurrency [Token]. async fn acquire(self: &Arc) -> Result { if self.config.initial_limit == 0 { // If the rate limiter is disabled, we can always acquire a token. Ok(Token::disabled()) } else { let mut notified = pin!(self.ready.notified()); let mut ready = notified.as_mut().enable(); loop { if ready { let mut inner = self.inner.lock(); if inner.take(&self.ready).is_some() { break Ok(Token::new(self.clone())); } notified.set(self.ready.notified()); } notified.as_mut().await; ready = true; } } } /// Return the concurrency [Token], along with the outcome of the job. /// /// The [Outcome] of the job, and the time taken to perform it, may be used /// to update the concurrency limit. /// /// Set the outcome to `None` to ignore the job. 
fn release_inner(&self, start: Instant, outcome: Option) { if outcome.is_none() { tracing::warn!("outcome is {:?}", outcome); } else { tracing::debug!("outcome is {:?}", outcome); } if self.config.initial_limit == 0 { return; } let mut inner = self.inner.lock(); inner.update_limit(start.elapsed(), outcome); inner.in_flight -= 1; if inner.in_flight < inner.limit { inner.available = inner.limit - inner.in_flight; // At least 1 permit is now available self.ready.notify_one(); } } /// The current state of the limiter. #[cfg(test)] fn state(&self) -> LimiterState { let inner = self.inner.lock(); LimiterState { limit: inner.limit } } } impl Token { fn new(limiter: Arc) -> Self { Self { start: Instant::now(), limiter: Some(limiter), } } pub(crate) fn disabled() -> Self { Self { start: Instant::now(), limiter: None, } } pub(crate) fn is_disabled(&self) -> bool { self.limiter.is_none() } pub(crate) fn release(mut self, outcome: Outcome) { self.release_mut(Some(outcome)); } pub(crate) fn release_mut(&mut self, outcome: Option) { if let Some(limiter) = self.limiter.take() { limiter.release_inner(self.start, outcome); } } } impl Drop for Token { fn drop(&mut self) { self.release_mut(None); } } #[cfg(test)] impl LimiterState { /// The current concurrency limit. fn limit(self) -> usize { self.limit } } ================================================ FILE: proxy/src/rate_limiter/limiter.rs ================================================ use std::borrow::Cow; use std::collections::hash_map::RandomState; use std::hash::{BuildHasher, Hash}; use std::sync::Mutex; use std::sync::atomic::{AtomicUsize, Ordering}; use anyhow::bail; use clashmap::ClashMap; use itertools::Itertools; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use tokio::time::{Duration, Instant}; use tracing::info; use super::LeakyBucketConfig; use crate::ext::LockExt; use crate::intern::EndpointIdInt; // Simple per-endpoint rate limiter. 
// // Check that number of connections to the endpoint is below `max_rps` rps. // Purposefully ignore user name and database name as clients can reconnect // with different names, so we'll end up sending some http requests to // the control plane. pub type WakeComputeRateLimiter = BucketRateLimiter; pub struct BucketRateLimiter { map: ClashMap, Hasher>, info: Cow<'static, [RateBucketInfo]>, access_count: AtomicUsize, rand: Mutex, } #[derive(Clone, Copy)] struct RateBucket { start: Instant, count: u32, } impl RateBucket { fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant, n: u32) -> bool { if now - self.start < info.interval { self.count + n <= info.max_rpi } else { // bucket expired, reset self.count = 0; self.start = now; true } } fn inc(&mut self, n: u32) { self.count += n; } } #[derive(Clone, Copy, PartialEq)] pub struct RateBucketInfo { pub(crate) interval: Duration, // requests per interval pub(crate) max_rpi: u32, } impl std::fmt::Display for RateBucketInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let rps = self.rps().floor() as u64; write!(f, "{rps}@{}", humantime::format_duration(self.interval)) } } impl std::fmt::Debug for RateBucketInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{self}") } } impl std::str::FromStr for RateBucketInfo { type Err = anyhow::Error; fn from_str(s: &str) -> Result { let Some((max_rps, interval)) = s.split_once('@') else { bail!("invalid rate info") }; let max_rps = max_rps.parse()?; let interval = humantime::parse_duration(interval)?; Ok(Self::new(max_rps, interval)) } } impl RateBucketInfo { pub const DEFAULT_SET: [Self; 3] = [ Self::new(300, Duration::from_secs(1)), Self::new(200, Duration::from_secs(60)), Self::new(100, Duration::from_secs(600)), ]; pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [ Self::new(500, Duration::from_secs(1)), Self::new(300, Duration::from_secs(60)), Self::new(200, Duration::from_secs(600)), ]; pub fn rps(&self) 
-> f64 { (self.max_rpi as f64) / self.interval.as_secs_f64() } pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { info.sort_unstable_by_key(|info| info.interval); let invalid = info .iter() .tuple_windows() .find(|(a, b)| a.max_rpi > b.max_rpi); if let Some((a, b)) = invalid { bail!( "invalid bucket RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})", b.max_rpi, a.max_rpi, ); } Ok(()) } pub const fn new(max_rps: u32, interval: Duration) -> Self { Self { interval, max_rpi: ((max_rps as u64) * (interval.as_millis() as u64) / 1000) as u32, } } pub fn to_leaky_bucket(this: &[Self]) -> Option { // bit of a hack - find the min rps and max rps supported and turn it into // leaky bucket config instead let mut iter = this.iter().map(|info| info.rps()); let first = iter.next()?; let (min, max) = (first, first); let (min, max) = iter.fold((min, max), |(min, max), rps| { (f64::min(min, rps), f64::max(max, rps)) }); Some(LeakyBucketConfig { rps: min, max }) } } impl BucketRateLimiter { pub fn new(info: impl Into>) -> Self { Self::new_with_rand_and_hasher(info, StdRng::from_os_rng(), RandomState::new()) } } impl BucketRateLimiter { fn new_with_rand_and_hasher( info: impl Into>, rand: R, hasher: S, ) -> Self { let info = info.into(); info!(buckets = ?info, "endpoint rate limiter"); Self { info, map: ClashMap::with_hasher_and_shard_amount(hasher, 64), access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request rand: Mutex::new(rand), } } /// Check that number of connections to the endpoint is below `max_rps` rps. pub(crate) fn check(&self, key: K, n: u32) -> bool { // do a partial GC every 2k requests. This cleans up ~ 1/64th of the map. 
// worst case memory usage is about: // = 2 * 2048 * 64 * (48B + 72B) // = 30MB if self .access_count .fetch_add(1, Ordering::AcqRel) .is_multiple_of(2048) { self.do_gc(); } let now = Instant::now(); let mut entry = self.map.entry(key).or_insert_with(|| { vec![ RateBucket { start: now, count: 0, }; self.info.len() ] }); let should_allow_request = entry .iter_mut() .zip(&*self.info) .all(|(bucket, info)| bucket.should_allow_request(info, now, n)); if should_allow_request { // only increment the bucket counts if the request will actually be accepted entry.iter_mut().for_each(|b| b.inc(n)); } should_allow_request } /// Clean the map. Simple strategy: remove all entries in a random shard. /// At worst, we'll double the effective max_rps during the cleanup. /// But that way deletion does not aquire mutex on each entry access. pub(crate) fn do_gc(&self) { info!( "cleaning up bucket rate limiter, current size = {}", self.map.len() ); let n = self.map.shards().len(); // this lock is ok as the periodic cycle of do_gc makes this very unlikely to collide // (impossible, infact, unless we have 2048 threads) let shard = self.rand.lock_propagate_poison().random_range(0..n); self.map.shards()[shard].write().clear(); } } #[cfg(test)] mod tests { use std::hash::BuildHasherDefault; use std::time::Duration; use rand::SeedableRng; use rustc_hash::FxHasher; use tokio::time; use super::{BucketRateLimiter, WakeComputeRateLimiter}; use crate::intern::EndpointIdInt; use crate::rate_limiter::RateBucketInfo; use crate::types::EndpointId; #[test] fn rate_bucket_rpi() { let rate_bucket = RateBucketInfo::new(50, Duration::from_secs(5)); assert_eq!(rate_bucket.max_rpi, 50 * 5); let rate_bucket = RateBucketInfo::new(50, Duration::from_millis(500)); assert_eq!(rate_bucket.max_rpi, 50 / 2); } #[test] fn rate_bucket_parse() { let rate_bucket: RateBucketInfo = "100@10s".parse().unwrap(); assert_eq!(rate_bucket.interval, Duration::from_secs(10)); assert_eq!(rate_bucket.max_rpi, 100 * 10); 
assert_eq!(rate_bucket.to_string(), "100@10s"); let rate_bucket: RateBucketInfo = "100@1m".parse().unwrap(); assert_eq!(rate_bucket.interval, Duration::from_secs(60)); assert_eq!(rate_bucket.max_rpi, 100 * 60); assert_eq!(rate_bucket.to_string(), "100@1m"); } #[test] fn default_rate_buckets() { let mut defaults = RateBucketInfo::DEFAULT_SET; RateBucketInfo::validate(&mut defaults[..]).unwrap(); } #[test] #[should_panic = "invalid bucket RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"] fn rate_buckets_validate() { let mut rates: Vec = ["300@1s", "10@10s"] .into_iter() .map(|s| s.parse().unwrap()) .collect(); RateBucketInfo::validate(&mut rates).unwrap(); } #[tokio::test] async fn test_rate_limits() { let mut rates: Vec = ["100@1s", "20@30s"] .into_iter() .map(|s| s.parse().unwrap()) .collect(); RateBucketInfo::validate(&mut rates).unwrap(); let limiter = WakeComputeRateLimiter::new(rates); let endpoint = EndpointId::from("ep-my-endpoint-1234"); let endpoint = EndpointIdInt::from(endpoint); time::pause(); for _ in 0..100 { assert!(limiter.check(endpoint, 1)); } // more connections fail assert!(!limiter.check(endpoint, 1)); // fail even after 500ms as it's in the same bucket time::advance(time::Duration::from_millis(500)).await; assert!(!limiter.check(endpoint, 1)); // after a full 1s, 100 requests are allowed again time::advance(time::Duration::from_millis(500)).await; for _ in 1..6 { for _ in 0..50 { assert!(limiter.check(endpoint, 2)); } time::advance(time::Duration::from_millis(1000)).await; } // more connections after 600 will exceed the 20rps@30s limit assert!(!limiter.check(endpoint, 1)); // will still fail before the 30 second limit time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await; assert!(!limiter.check(endpoint, 1)); // after the full 30 seconds, 100 requests are allowed again time::advance(time::Duration::from_millis(1)).await; for _ in 0..100 { assert!(limiter.check(endpoint, 1)); } } #[tokio::test] async 
fn test_rate_limits_gc() { // fixed seeded random/hasher to ensure that the test is not flaky let rand = rand::rngs::StdRng::from_seed([1; 32]); let hasher = BuildHasherDefault::::default(); let limiter = BucketRateLimiter::new_with_rand_and_hasher(&RateBucketInfo::DEFAULT_SET, rand, hasher); for i in 0..1_000_000 { limiter.check(i, 1); } assert!(limiter.map.len() < 150_000); } } ================================================ FILE: proxy/src/rate_limiter/mod.rs ================================================ mod leaky_bucket; mod limit_algorithm; mod limiter; pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter}; #[cfg(test)] pub(crate) use limit_algorithm::aimd::Aimd; pub(crate) use limit_algorithm::{ DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, }; pub use limiter::{RateBucketInfo, WakeComputeRateLimiter}; ================================================ FILE: proxy/src/redis/connection_with_credentials_provider.rs ================================================ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; use futures::FutureExt; use redis::aio::{ConnectionLike, MultiplexedConnection}; use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisError, RedisResult}; use tokio::task::AbortHandle; use tracing::{error, info, warn}; use super::elasticache::CredentialsProvider; use crate::redis::elasticache::CredentialsProviderError; enum Credentials { Static(ConnectionInfo), Dynamic(Arc, redis::ConnectionAddr), } impl Clone for Credentials { fn clone(&self) -> Self { match self { Credentials::Static(info) => Credentials::Static(info.clone()), Credentials::Dynamic(provider, addr) => { Credentials::Dynamic(Arc::clone(provider), addr.clone()) } } } } #[derive(thiserror::Error, Debug)] pub enum ConnectionProviderError { #[error(transparent)] Redis(#[from] RedisError), #[error(transparent)] CredentialsProvider(#[from] CredentialsProviderError), } /// A 
wrapper around `redis::MultiplexedConnection` that automatically refreshes the token. /// Provides PubSub connection without credentials refresh. pub struct ConnectionWithCredentialsProvider { credentials: Credentials, // TODO: with more load on the connection, we should consider using a connection pool con: Option, refresh_token_task: Option, mutex: tokio::sync::Mutex<()>, credentials_refreshed: Arc, } impl Clone for ConnectionWithCredentialsProvider { fn clone(&self) -> Self { Self { credentials: self.credentials.clone(), con: None, refresh_token_task: None, mutex: tokio::sync::Mutex::new(()), credentials_refreshed: Arc::new(AtomicBool::new(false)), } } } impl ConnectionWithCredentialsProvider { pub fn new_with_credentials_provider( host: String, port: u16, credentials_provider: Arc, ) -> Self { Self { credentials: Credentials::Dynamic( credentials_provider, redis::ConnectionAddr::TcpTls { host, port, insecure: false, tls_params: None, }, ), con: None, refresh_token_task: None, mutex: tokio::sync::Mutex::new(()), credentials_refreshed: Arc::new(AtomicBool::new(false)), } } pub fn new_with_static_credentials(params: T) -> Self { Self { credentials: Credentials::Static( params .into_connection_info() .expect("static configured redis credentials should be a valid format"), ), con: None, refresh_token_task: None, mutex: tokio::sync::Mutex::new(()), credentials_refreshed: Arc::new(AtomicBool::new(true)), } } async fn ping(con: &mut MultiplexedConnection) -> Result<(), ConnectionProviderError> { redis::cmd("PING") .query_async(con) .await .map_err(Into::into) } pub(crate) fn credentials_refreshed(&self) -> bool { self.credentials_refreshed.load(Ordering::Relaxed) } pub(crate) async fn connect(&mut self) -> Result<(), ConnectionProviderError> { let _guard = self.mutex.lock().await; if let Some(con) = self.con.as_mut() { match Self::ping(con).await { Ok(()) => { return Ok(()); } Err(e) => { warn!("Error during PING: {e:?}"); } } } else { info!("Connection is not 
established"); } info!("Establishing a new connection..."); self.con = None; if let Some(f) = self.refresh_token_task.take() { f.abort(); } let mut con = self .get_client() .await? .get_multiplexed_tokio_connection() .await?; if let Credentials::Dynamic(credentials_provider, _) = &self.credentials { let credentials_provider = credentials_provider.clone(); let con2 = con.clone(); let credentials_refreshed = self.credentials_refreshed.clone(); let f = tokio::spawn(Self::keep_connection( con2, credentials_provider, credentials_refreshed, )); self.refresh_token_task = Some(f.abort_handle()); } match Self::ping(&mut con).await { Ok(()) => { info!("Connection succesfully established"); } Err(e) => { warn!("Connection is broken. Error during PING: {e:?}"); } } self.con = Some(con); Ok(()) } async fn get_connection_info(&self) -> Result { match &self.credentials { Credentials::Static(info) => Ok(info.clone()), Credentials::Dynamic(provider, addr) => { let (username, password) = provider.provide_credentials().await?; Ok(ConnectionInfo { addr: addr.clone(), redis: RedisConnectionInfo { db: 0, username: Some(username), password: Some(password.clone()), // TODO: switch to RESP3 after testing new client version. protocol: redis::ProtocolVersion::RESP2, }, }) } } } async fn get_client(&self) -> Result { let client = redis::Client::open(self.get_connection_info().await?)?; self.credentials_refreshed.store(true, Ordering::Relaxed); Ok(client) } // PubSub does not support credentials refresh. // Requires manual reconnection every 12h. pub(crate) async fn get_async_pubsub(&self) -> anyhow::Result { Ok(self.get_client().await?.get_async_pubsub().await?) } // The connection lives for 12h. // It can be prolonged with sending `AUTH` commands with the refreshed token. // https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/auth-iam.html#auth-iam-limits async fn keep_connection( mut con: MultiplexedConnection, credentials_provider: Arc, credentials_refreshed: Arc, ) -> ! 
{ loop { // The connection lives for 12h, for the sanity check we refresh it every hour. tokio::time::sleep(Duration::from_secs(60 * 60)).await; match Self::refresh_token(&mut con, credentials_provider.clone()).await { Ok(()) => { info!("Token refreshed"); credentials_refreshed.store(true, Ordering::Relaxed); } Err(e) => { error!("Error during token refresh: {e:?}"); credentials_refreshed.store(false, Ordering::Relaxed); } } } } async fn refresh_token( con: &mut MultiplexedConnection, credentials_provider: Arc, ) -> anyhow::Result<()> { let (user, password) = credentials_provider.provide_credentials().await?; let _: () = redis::cmd("AUTH") .arg(user) .arg(password) .query_async(con) .await?; Ok(()) } /// Sends an already encoded (packed) command into the TCP socket and /// reads the single response from it. pub(crate) async fn send_packed_command( &mut self, cmd: &redis::Cmd, ) -> RedisResult { // Clone connection to avoid having to lock the ArcSwap in write mode let con = self.con.as_mut().ok_or(redis::RedisError::from(( redis::ErrorKind::IoError, "Connection not established", )))?; con.send_packed_command(cmd).await } /// Sends multiple already encoded (packed) command into the TCP socket /// and reads `count` responses from it. This is used to implement /// pipelining. 
pub(crate) async fn send_packed_commands( &mut self, cmd: &redis::Pipeline, offset: usize, count: usize, ) -> RedisResult> { // Clone shared connection future to avoid having to lock the ArcSwap in write mode let con = self.con.as_mut().ok_or(redis::RedisError::from(( redis::ErrorKind::IoError, "Connection not established", )))?; con.send_packed_commands(cmd, offset, count).await } } impl ConnectionLike for ConnectionWithCredentialsProvider { fn req_packed_command<'a>( &'a mut self, cmd: &'a redis::Cmd, ) -> redis::RedisFuture<'a, redis::Value> { self.send_packed_command(cmd).boxed() } fn req_packed_commands<'a>( &'a mut self, cmd: &'a redis::Pipeline, offset: usize, count: usize, ) -> redis::RedisFuture<'a, Vec> { self.send_packed_commands(cmd, offset, count).boxed() } fn get_db(&self) -> i64 { self.con.as_ref().map_or(0, |c| c.get_db()) } } ================================================ FILE: proxy/src/redis/elasticache.rs ================================================ use std::sync::Arc; use std::time::{Duration, SystemTime}; use aws_config::Region; use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; use aws_config::meta::region::RegionProviderChain; use aws_config::profile::ProfileFileCredentialsProvider; use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use aws_credential_types::provider::error::CredentialsError; use aws_sdk_iam::config::ProvideCredentials; use aws_sigv4::http_request::{ self, SignableBody, SignableRequest, SignatureLocation, SigningError, SigningSettings, }; use aws_sigv4::sign::v4::signing_params::BuildError; use tracing::info; #[derive(Debug)] pub struct AWSIRSAConfig { region: String, service_name: String, cluster_name: String, user_id: String, token_ttl: Duration, action: String, } impl AWSIRSAConfig { pub fn new(region: String, 
cluster_name: Option, user_id: Option) -> Self { AWSIRSAConfig { region, service_name: "elasticache".to_string(), cluster_name: cluster_name.unwrap_or_default(), user_id: user_id.unwrap_or_default(), // "The IAM authentication token is valid for 15 minutes" // https://docs.aws.amazon.com/memorydb/latest/devguide/auth-iam.html#auth-iam-limits token_ttl: Duration::from_secs(15 * 60), action: "connect".to_string(), } } } #[derive(thiserror::Error, Debug)] pub enum CredentialsProviderError { #[error(transparent)] AwsCredentials(#[from] CredentialsError), #[error(transparent)] AwsSigv4Build(#[from] BuildError), #[error(transparent)] AwsSigv4Singing(#[from] SigningError), #[error(transparent)] Http(#[from] http::Error), } /// Credentials provider for AWS elasticache authentication. /// /// Official documentation: /// /// /// Useful resources: /// pub struct CredentialsProvider { config: AWSIRSAConfig, credentials_provider: CredentialsProviderChain, } impl CredentialsProvider { pub async fn new( aws_region: String, redis_cluster_name: Option, redis_user_id: Option, ) -> Arc { let region_provider = RegionProviderChain::default_provider().or_else(Region::new(aws_region.clone())); let provider_conf = ProviderConfig::without_region().with_region(region_provider.region().await); let aws_credentials_provider = { // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" CredentialsProviderChain::first_try( "env", EnvironmentVariableCredentialsProvider::new(), ) // uses "AWS_PROFILE" / `aws sso login --profile ` .or_else( "profile-sso", ProfileFileCredentialsProvider::builder() .configure(&provider_conf) .build(), ) // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" // needed to access remote extensions bucket .or_else( "token", WebIdentityTokenCredentialsProvider::builder() .configure(&provider_conf) .build(), ) // uses imds v2 .or_else("imds", ImdsCredentialsProvider::builder().build()) }; Arc::new(CredentialsProvider { config: 
AWSIRSAConfig::new(aws_region, redis_cluster_name, redis_user_id), credentials_provider: aws_credentials_provider, }) } pub(crate) async fn provide_credentials( &self, ) -> Result<(String, String), CredentialsProviderError> { let aws_credentials = self .credentials_provider .provide_credentials() .await? .into(); info!("AWS credentials successfully obtained"); info!("Connecting to Redis with configuration: {:?}", self.config); let mut settings = SigningSettings::default(); settings.signature_location = SignatureLocation::QueryParams; settings.expires_in = Some(self.config.token_ttl); let signing_params = aws_sigv4::sign::v4::SigningParams::builder() .identity(&aws_credentials) .region(&self.config.region) .name(&self.config.service_name) .time(SystemTime::now()) .settings(settings) .build()? .into(); let auth_params = [ ("Action", &self.config.action), ("User", &self.config.user_id), ]; let auth_params = url::form_urlencoded::Serializer::new(String::new()) .extend_pairs(auth_params) .finish(); let auth_uri = http::Uri::builder() .scheme("http") .authority(self.config.cluster_name.as_bytes()) .path_and_query(format!("/?{auth_params}")) .build()?; info!("{}", auth_uri); // Convert the HTTP request into a signable request let signable_request = SignableRequest::new( "GET", auth_uri.to_string(), std::iter::empty(), SignableBody::Bytes(&[]), )?; // Sign and then apply the signature to the request let (si, _) = http_request::sign(signable_request, &signing_params)?.into_parts(); let mut signable_request = http::Request::builder() .method("GET") .uri(auth_uri) .body(())?; si.apply_to_request_http1x(&mut signable_request); Ok(( self.config.user_id.clone(), signable_request .uri() .to_string() .replacen("http://", "", 1), )) } } ================================================ FILE: proxy/src/redis/keys.rs ================================================ use crate::pqproto::CancelKeyData; pub mod keyspace { pub const CANCEL_PREFIX: &str = "cancel"; } #[derive(Clone, Debug, 
Eq, PartialEq)] pub(crate) enum KeyPrefix { Cancel(CancelKeyData), } impl KeyPrefix { pub(crate) fn build_redis_key(&self) -> String { match self { KeyPrefix::Cancel(key) => { let id = key.0.get(); let keyspace = keyspace::CANCEL_PREFIX; format!("{keyspace}:{id:x}") } } } } #[cfg(test)] mod tests { use super::*; use crate::pqproto::id_to_cancel_key; #[test] fn test_build_redis_key() { let cancel_key: KeyPrefix = KeyPrefix::Cancel(id_to_cancel_key(12345 << 32 | 54321)); let redis_key = cancel_key.build_redis_key(); assert_eq!(redis_key, "cancel:30390000d431"); } } ================================================ FILE: proxy/src/redis/kv_ops.rs ================================================ use std::time::Duration; use futures::FutureExt; use redis::aio::ConnectionLike; use redis::{Cmd, FromRedisValue, Pipeline, RedisError, RedisResult}; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::redis::connection_with_credentials_provider::ConnectionProviderError; #[derive(thiserror::Error, Debug)] pub enum RedisKVClientError { #[error(transparent)] Redis(#[from] RedisError), #[error(transparent)] ConnectionProvider(#[from] ConnectionProviderError), } pub struct RedisKVClient { client: ConnectionWithCredentialsProvider, } #[allow(async_fn_in_trait)] pub trait Queryable { async fn query(&self, conn: &mut impl ConnectionLike) -> RedisResult; } impl Queryable for Pipeline { async fn query(&self, conn: &mut impl ConnectionLike) -> RedisResult { self.query_async(conn).await } } impl Queryable for Cmd { async fn query(&self, conn: &mut impl ConnectionLike) -> RedisResult { self.query_async(conn).await } } impl RedisKVClient { pub fn new(client: ConnectionWithCredentialsProvider) -> Self { Self { client } } pub async fn try_connect(&mut self) -> Result<(), RedisKVClientError> { self.client .connect() .boxed() .await .inspect_err(|e| tracing::error!("failed to connect to redis: {e}")) .map_err(Into::into) } pub(crate) fn 
credentials_refreshed(&self) -> bool { self.client.credentials_refreshed() } pub(crate) async fn query( &mut self, q: &impl Queryable, ) -> Result { let e = match q.query(&mut self.client).await { Ok(t) => return Ok(t), Err(e) => e, }; tracing::debug!("failed to run query: {e}"); match e.retry_method() { redis::RetryMethod::Reconnect => { tracing::info!("Redis client is disconnected. Reconnecting..."); self.try_connect().await?; } redis::RetryMethod::RetryImmediately => {} redis::RetryMethod::WaitAndRetry => { // somewhat arbitrary. tokio::time::sleep(Duration::from_millis(100)).await; } _ => Err(e)?, } Ok(q.query(&mut self.client).await?) } } ================================================ FILE: proxy/src/redis/mod.rs ================================================ pub mod connection_with_credentials_provider; pub mod elasticache; pub mod keys; pub mod kv_ops; pub mod notifications; ================================================ FILE: proxy/src/redis/notifications.rs ================================================ use std::convert::Infallible; use std::sync::Arc; use futures::StreamExt; use redis::aio::PubSub; use serde::Deserialize; use tokio_util::sync::CancellationToken; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::cache::project_info::ProjectInfoCache; use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; use crate::util::deserialize_json_string; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20); const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20); async fn try_connect(client: &ConnectionWithCredentialsProvider) -> anyhow::Result { let mut conn = client.get_async_pubsub().await?; tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`"); conn.subscribe(CPLANE_CHANNEL_NAME).await?; Ok(conn) } 
#[derive(Debug, Deserialize)] struct NotificationHeader<'a> { topic: &'a str, } #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] #[serde(tag = "topic", content = "data")] enum Notification { #[serde( rename = "/account_settings_update", alias = "/allowed_vpc_endpoints_updated_for_org", deserialize_with = "deserialize_json_string" )] AccountSettingsUpdate(InvalidateAccount), #[serde( rename = "/endpoint_settings_update", deserialize_with = "deserialize_json_string" )] EndpointSettingsUpdate(InvalidateEndpoint), #[serde( rename = "/project_settings_update", alias = "/allowed_ips_updated", alias = "/block_public_or_vpc_access_updated", alias = "/allowed_vpc_endpoints_updated_for_projects", deserialize_with = "deserialize_json_string" )] ProjectSettingsUpdate(InvalidateProject), #[serde( rename = "/role_setting_update", alias = "/password_updated", deserialize_with = "deserialize_json_string" )] RoleSettingUpdate(InvalidateRole), #[serde( other, deserialize_with = "deserialize_unknown_topic", skip_serializing )] UnknownTopic, } #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] #[serde(rename_all = "snake_case")] enum InvalidateEndpoint { EndpointId(EndpointIdInt), EndpointIds(Vec), } impl std::ops::Deref for InvalidateEndpoint { type Target = [EndpointIdInt]; fn deref(&self) -> &Self::Target { match self { Self::EndpointId(id) => std::slice::from_ref(id), Self::EndpointIds(ids) => ids, } } } #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] #[serde(rename_all = "snake_case")] enum InvalidateProject { ProjectId(ProjectIdInt), ProjectIds(Vec), } impl std::ops::Deref for InvalidateProject { type Target = [ProjectIdInt]; fn deref(&self) -> &Self::Target { match self { Self::ProjectId(id) => std::slice::from_ref(id), Self::ProjectIds(ids) => ids, } } } #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] #[serde(rename_all = "snake_case")] enum InvalidateAccount { AccountId(AccountIdInt), AccountIds(Vec), } impl std::ops::Deref for InvalidateAccount { type Target = 
[AccountIdInt];

// Expose both the single-id and the multi-id variant as one slice.
fn deref(&self) -> &Self::Target {
    match self {
        Self::AccountId(id) => std::slice::from_ref(id),
        Self::AccountIds(ids) => ids,
    }
}
}

/// Identifies a single role inside a single project; carried by
/// `Notification::RoleSettingUpdate`.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
struct InvalidateRole {
    project_id: ProjectIdInt,
    role_name: RoleNameInt,
}

// https://github.com/serde-rs/serde/issues/1714
/// Consumes whatever value the deserializer holds, discards it via
/// `IgnoredAny`, and yields `()`.
fn deserialize_unknown_topic<'de, D>(deserializer: D) -> Result<(), D::Error>
where
    D: serde::Deserializer<'de>,
{
    deserializer.deserialize_any(serde::de::IgnoredAny)?;
    Ok(())
}

/// Applies invalidation notifications to the shared cache.
struct MessageHandler {
    cache: Arc,
}

impl Clone for MessageHandler {
    // Cloning only clones the `Arc` handle, so all clones share one cache.
    fn clone(&self) -> Self {
        Self {
            cache: self.cache.clone(),
        }
    }
}

impl MessageHandler {
    /// Creates a handler that invalidates entries in `cache`.
    pub(crate) fn new(cache: Arc) -> Self {
        Self { cache }
    }

    /// Parses one redis message and applies the corresponding invalidation.
    ///
    /// The only error this returns is a failure to extract the payload
    /// (`get_payload`). Unknown topics and malformed payloads are logged
    /// (malformed ones also bump `redis_errors_total`) and reported as `Ok(())`.
    #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))]
    async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> {
        let payload: String = msg.get_payload()?;
        tracing::debug!(?payload, "received a message payload");

        // Inside this `match`, `msg` still refers to the raw `redis::Msg`;
        // the shadowing `Notification` binding only exists after the statement.
        let msg: Notification = match serde_json::from_str(&payload) {
            Ok(Notification::UnknownTopic) => {
                match serde_json::from_str::(&payload) {
                    // don't update the metric for redis errors if it's just a topic we don't know about.
                    Ok(header) => tracing::warn!(topic = header.topic, "unknown topic"),
                    Err(e) => {
                        Metrics::get().proxy.redis_errors_total.inc(RedisErrors {
                            channel: msg.get_channel_name(),
                        });
                        tracing::error!("broken message: {e}");
                    }
                }
                return Ok(());
            }
            Ok(msg) => msg,
            Err(e) => {
                Metrics::get().proxy.redis_errors_total.inc(RedisErrors {
                    channel: msg.get_channel_name(),
                });
                // Second parse attempt is only there to attach the topic to the log line.
                match serde_json::from_str::(&payload) {
                    Ok(header) => tracing::error!(topic = header.topic, "broken message: {e}"),
                    Err(_) => tracing::error!("broken message: {e}"),
                }
                return Ok(());
            }
        };

        tracing::debug!(?msg, "received a message");
        match msg {
            Notification::RoleSettingUpdate { .. }
            | Notification::EndpointSettingsUpdate { .. }
            | Notification::ProjectSettingsUpdate { .. }
            | Notification::AccountSettingsUpdate { .. } => {
                // First invalidation happens right away.
                invalidate_cache(self.cache.clone(), msg.clone());

                let m = &Metrics::get().proxy.redis_events_count;
                match msg {
                    Notification::RoleSettingUpdate { .. } => {
                        m.inc(RedisEventsCount::InvalidateRole);
                    }
                    Notification::EndpointSettingsUpdate { .. } => {
                        m.inc(RedisEventsCount::InvalidateEndpoint);
                    }
                    Notification::ProjectSettingsUpdate { .. } => {
                        m.inc(RedisEventsCount::InvalidateProject);
                    }
                    Notification::AccountSettingsUpdate { .. } => {
                        m.inc(RedisEventsCount::InvalidateOrg);
                    }
                    Notification::UnknownTopic => {}
                }

                // TODO: add additional metrics for the other event types.

                // It might happen that the invalid entry is on the way to be cached.
                // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds.
                // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message.
                let cache = self.cache.clone();
                tokio::spawn(async move {
                    tokio::time::sleep(INVALIDATION_LAG).await;
                    invalidate_cache(cache, msg);
                });
            }

            // Already returned above for unknown topics.
            Notification::UnknownTopic => unreachable!(),
        }

        Ok(())
    }
}

/// Dispatches a notification to the matching cache invalidation method,
/// once per id for the multi-id variants.
///
/// Panics (`unreachable!`) on `Notification::UnknownTopic`; the caller in
/// this file filters that variant out beforehand.
fn invalidate_cache(cache: Arc, msg: Notification) {
    match msg {
        Notification::EndpointSettingsUpdate(ids) => ids
            .iter()
            .for_each(|&id| cache.invalidate_endpoint_access(id)),

        Notification::AccountSettingsUpdate(ids) => ids
            .iter()
            .for_each(|&id| cache.invalidate_endpoint_access_for_org(id)),

        Notification::ProjectSettingsUpdate(ids) => ids
            .iter()
            .for_each(|&id| cache.invalidate_endpoint_access_for_project(id)),

        Notification::RoleSettingUpdate(InvalidateRole {
            project_id,
            role_name,
        }) => cache.invalidate_role_secret_for_project(project_id, role_name),

        Notification::UnknownTopic => unreachable!(),
    }
}

/// Connects to redis and feeds every received message to `handler`,
/// reconnecting after connection failures and after handler errors.
///
/// Returns `Ok(())` once `cancellation_token` is observed as cancelled.
// NOTE(review): the token is only polled at the top of the outer loop and
// after each handled message. While waiting in `stream.next()` or in the
// reconnect sleep, a cancellation goes unnoticed until the next message
// (or the end of the sleep) — confirm that this is acceptable.
async fn handle_messages(
    handler: MessageHandler,
    redis: ConnectionWithCredentialsProvider,
    cancellation_token: CancellationToken,
) -> anyhow::Result<()> {
    loop {
        if cancellation_token.is_cancelled() {
            return Ok(());
        }
        let mut conn = match try_connect(&redis).await {
            Ok(conn) => conn,
            Err(e) => {
                tracing::error!(
                    "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}"
                );
                tokio::time::sleep(RECONNECT_TIMEOUT).await;
                continue;
            }
        };
        let mut stream = conn.on_message();
        while let Some(msg) = stream.next().await {
            match handler.handle_message(msg).await {
                Ok(()) => {}
                Err(e) => {
                    // Leave the inner loop; the outer loop reconnects.
                    tracing::error!("failed to handle message: {e}, will try to reconnect");
                    break;
                }
            }
            if cancellation_token.is_cancelled() {
                return Ok(());
            }
        }
    }
}

/// Handle console's invalidation messages.
///
/// On every interval tick this spawns a fresh `handle_messages` task with
/// its own cancellation token, plus a companion task that cancels that
/// token six hours later. The loop has no `break`/`return`, so this
/// function does not return normally.
#[tracing::instrument(name = "redis_notifications", skip_all)]
pub async fn task_main(
    redis: ConnectionWithCredentialsProvider,
    cache: Arc,
) -> anyhow::Result {
    let handler = MessageHandler::new(cache);
    // 6h - 1m.
    // There will be 1 minute overlap between two tasks. But at least we can be sure that no message is lost.
    let mut interval = tokio::time::interval(std::time::Duration::from_secs(6 * 60 * 60 - 60));
    loop {
        let cancellation_token = CancellationToken::new();
        interval.tick().await;

        tokio::spawn(handle_messages(
            handler.clone(),
            redis.clone(),
            cancellation_token.clone(),
        ));
        tokio::spawn(async move {
            tokio::time::sleep(std::time::Duration::from_secs(6 * 60 * 60)).await; // 6h.
            cancellation_token.cancel();
        });
    }
}

#[cfg(test)]
mod tests {
    use serde_json::json;

    use super::*;
    use crate::types::{ProjectId, RoleName};

    // A single `project_id` parses into the one-element project variant;
    // unrelated top-level fields are ignored.
    #[test]
    fn parse_allowed_ips() -> anyhow::Result<()> {
        let project_id: ProjectId = "new_project".into();
        let data = format!("{{\"project_id\": \"{project_id}\"}}");
        let text = json!({
            "type": "message",
            "topic": "/allowed_ips_updated",
            "data": data,
            "extre_fields": "something"
        })
        .to_string();

        let result: Notification = serde_json::from_str(&text)?;
        assert_eq!(
            result,
            Notification::ProjectSettingsUpdate(InvalidateProject::ProjectId((&project_id).into()))
        );

        Ok(())
    }

    // A `project_ids` array parses into the multi-project variant, in order.
    #[test]
    fn parse_multiple_projects() -> anyhow::Result<()> {
        let project_id1: ProjectId = "new_project1".into();
        let project_id2: ProjectId = "new_project2".into();
        let data = format!("{{\"project_ids\": [\"{project_id1}\",\"{project_id2}\"]}}");
        let text = json!({
            "type": "message",
            "topic": "/allowed_vpc_endpoints_updated_for_projects",
            "data": data,
            "extre_fields": "something"
        })
        .to_string();

        let result: Notification = serde_json::from_str(&text)?;
        assert_eq!(
            result,
            Notification::ProjectSettingsUpdate(InvalidateProject::ProjectIds(vec![
                (&project_id1).into(),
                (&project_id2).into()
            ]))
        );

        Ok(())
    }

    // `/password_updated` carries both the project and the role name.
    #[test]
    fn parse_password_updated() -> anyhow::Result<()> {
        let project_id: ProjectId = "new_project".into();
        let role_name: RoleName = "new_role".into();
        let data = format!("{{\"project_id\": \"{project_id}\", \"role_name\": \"{role_name}\"}}");
        let text = json!({
            "type": "message",
            "topic": "/password_updated",
            "data": data,
            "extre_fields": "something"
        })
        .to_string();

        let result: Notification = serde_json::from_str(&text)?;
        assert_eq!(
            result,
            Notification::RoleSettingUpdate(InvalidateRole {
                project_id: (&project_id).into(),
                role_name: (&role_name).into(),
            })
        );

        Ok(())
    }

    // Unrecognized topics must parse to `UnknownTopic` whether or not a
    // `data` field is present.
    #[test]
    fn parse_unknown_topic() -> anyhow::Result<()> {
        let with_data = json!({
            "type": "message",
            "topic": "/doesnotexist",
            "data": {
                "payload": "ignored"
            },
            "extra_fields": "something"
        })
        .to_string();
        let result: Notification = serde_json::from_str(&with_data)?;
        assert_eq!(result, Notification::UnknownTopic);

        let without_data = json!({
            "type": "message",
            "topic": "/doesnotexist",
            "extra_fields": "something"
        })
        .to_string();
        let result: Notification = serde_json::from_str(&without_data)?;
        assert_eq!(result, Notification::UnknownTopic);

        Ok(())
    }
}

================================================ FILE: proxy/src/sasl/channel_binding.rs ================================================
//! Definition and parser for channel binding flag (a part of the `GS2` header).

use base64::Engine as _;
use base64::prelude::BASE64_STANDARD;

/// Channel binding flag (possibly with params).
#[derive(Debug, PartialEq, Eq)]
pub(crate) enum ChannelBinding {
    /// Client doesn't support channel binding.
    NotSupportedClient,
    /// Client thinks server doesn't support channel binding.
    NotSupportedServer,
    /// Client wants to use this type of channel binding.
    Required(T),
}

impl ChannelBinding {
    /// Applies the fallible conversion `f` to the payload of `Required`,
    /// passing the two payload-free variants through unchanged.
    pub(crate) fn and_then(
        self,
        f: impl FnOnce(T) -> Result,
    ) -> Result, E> {
        Ok(match self {
            Self::NotSupportedClient => ChannelBinding::NotSupportedClient,
            Self::NotSupportedServer => ChannelBinding::NotSupportedServer,
            Self::Required(x) => ChannelBinding::Required(f(x)?),
        })
    }
}

impl<'a> ChannelBinding<&'a str> {
    // NB: FromStr doesn't work with lifetimes
    /// Parses the flag: `"n"`, `"y"`, or `"p=<name>"`. Any other input
    /// (no `p=` prefix) yields `None`.
    pub(crate) fn parse(input: &'a str) -> Option {
        Some(match input {
            "n" => Self::NotSupportedClient,
            "y" => Self::NotSupportedServer,
            other => Self::Required(other.strip_prefix("p=")?),
        })
    }
}

impl ChannelBinding {
    /// Encode channel binding data as base64 for subsequent checks.
pub(crate) fn encode<'a, E>(
    &self,
    // Only invoked for the `Required` variant, to fetch the raw binding data.
    get_cbind_data: impl FnOnce(&T) -> Result<&'a [u8], E>,
) -> Result, E> {
    Ok(match self {
        Self::NotSupportedClient => {
            // base64::encode("n,,")
            "biws".into()
        }
        Self::NotSupportedServer => {
            // base64::encode("y,,")
            "eSws".into()
        }
        Self::Required(mode) => {
            // Header `p=<mode>,,` followed by the raw binding bytes,
            // base64-encoded as a whole.
            let mut cbind_input = format!("p={mode},,",).into_bytes();
            cbind_input.extend_from_slice(get_cbind_data(mode)?);
            BASE64_STANDARD.encode(&cbind_input).into()
        }
    })
}
} // closes the `impl ChannelBinding` block opened just above this method

#[cfg(test)]
mod tests {
    use super::*;

    // Each variant must encode to base64 of its GS2 header; for `Required`
    // the binding data is appended before encoding.
    #[test]
    fn channel_binding_encode() -> anyhow::Result<()> {
        use ChannelBinding::*;

        let cases = [
            (NotSupportedClient, BASE64_STANDARD.encode("n,,")),
            (NotSupportedServer, BASE64_STANDARD.encode("y,,")),
            (Required("foo"), BASE64_STANDARD.encode("p=foo,,bar")),
        ];

        for (cb, input) in cases {
            assert_eq!(cb.encode(|_| anyhow::Ok(b"bar"))?, input);
        }

        Ok(())
    }
}

================================================ FILE: proxy/src/sasl/messages.rs ================================================
//! Definitions for SASL messages.

use crate::parse::split_cstr;

/// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage).
#[derive(Debug)]
pub(crate) struct FirstMessage<'a> {
    /// Authentication method, e.g. `"SCRAM-SHA-256"`.
    pub(crate) method: &'a str,
    /// Initial client message.
    pub(crate) message: &'a str,
}

impl<'a> FirstMessage<'a> {
    // NB: FromStr doesn't work with lifetimes
    /// Parses `<method>\0<u32 big-endian length><message>`.
    ///
    /// Returns `None` if the C-string split fails, the method is not valid
    /// UTF-8, fewer than four length bytes follow, the declared length does
    /// not equal the number of remaining bytes, or the message is not valid
    /// UTF-8.
    pub(crate) fn parse(bytes: &'a [u8]) -> Option {
        let (method_cstr, tail) = split_cstr(bytes)?;
        let method = method_cstr.to_str().ok()?;

        let (len_bytes, bytes) = tail.split_first_chunk()?;
        let len = u32::from_be_bytes(*len_bytes) as usize;
        // The length must describe exactly the rest of the buffer.
        if len != bytes.len() {
            return None;
        }

        let message = std::str::from_utf8(bytes).ok()?;
        Some(Self { method, message })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // Round-trip: hand-assemble the wire format and parse it back.
    #[test]
    fn parse_sasl_first_message() {
        let proto = "SCRAM-SHA-256";
        let sasl = "n,,n=,r=KHQ2Gjc7NptyB8aov5/TnUy4";
        let sasl_len = (sasl.len() as u32).to_be_bytes();
        let bytes = [proto.as_bytes(), &[0], sasl_len.as_ref(), sasl.as_bytes()].concat();

        let password = FirstMessage::parse(&bytes).unwrap();
        assert_eq!(password.method, proto);
        assert_eq!(password.message, sasl);
    }
}

================================================ FILE: proxy/src/sasl/mod.rs ================================================
//! Simple Authentication and Security Layer.
//!
//! RFC: .
//!
//! Reference implementation:
//! *
//! *

mod channel_binding;
mod messages;
mod stream;

use std::io;

pub(crate) use channel_binding::ChannelBinding;
pub(crate) use messages::FirstMessage;
pub(crate) use stream::{Outcome, authenticate};
use thiserror::Error;

use crate::error::{ReportableError, UserFacingError};

/// Fine-grained auth errors help in writing tests.
#[derive(Error, Debug)]
pub(crate) enum Error {
    /// The client asked for an authentication method we do not offer.
    #[error("Unsupported authentication method: {0}")]
    BadAuthMethod(Box),

    /// Channel binding negotiation or verification failed.
    #[error("Channel binding failed: {0}")]
    ChannelBindingFailed(&'static str),

    /// The client requested a channel binding type we do not implement.
    #[error("Unsupported channel binding method: {0}")]
    ChannelBindingBadMethod(Box),

    /// A client message could not be parsed or violated the protocol.
    #[error("Bad client message: {0}")]
    BadClientMessage(&'static str),

    /// Binding data was required but is not available on our side.
    #[error("Internal error: missing digest")]
    MissingBinding,

    /// Base64 decoding failed (converted automatically via `#[from]`).
    #[error("could not decode salt: {0}")]
    Base64(#[from] base64::DecodeError),

    /// I/O failure while talking to the client.
    #[error(transparent)]
    Io(#[from] io::Error),
}

impl UserFacingError for Error {
    // Only the two channel-binding variants expose details to the client;
    // every other error collapses into one generic message.
    fn to_string_client(&self) -> String {
        match self {
            Self::ChannelBindingFailed(m) => (*m).to_string(),
            Self::ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"),
            _ => "authentication protocol violation".to_string(),
        }
    }
}

impl ReportableError for Error {
    // Classifies each variant for error reporting.
    fn get_error_kind(&self) -> crate::error::ErrorKind {
        match self {
            Error::BadAuthMethod(_) => crate::error::ErrorKind::User,
            Error::ChannelBindingFailed(_) => crate::error::ErrorKind::User,
            Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User,
            Error::BadClientMessage(_) => crate::error::ErrorKind::User,
            Error::MissingBinding => crate::error::ErrorKind::Service,
            Error::Base64(_) => crate::error::ErrorKind::ControlPlane,
            Error::Io(_) => crate::error::ErrorKind::ClientDisconnect,
        }
    }
}

/// A convenient result type for SASL exchange.
pub(crate) type Result = std::result::Result;

/// A result of one SASL exchange.
#[must_use]
pub(crate) enum Step {
    /// We should continue exchanging messages.
    /// Carries the next state and the reply to send.
    Continue(T, String),
    /// The client has been authenticated successfully.
    /// Carries the produced value and the final reply to send.
    Success(R, String),
    /// Authentication failed (reason attached).
    Failure(&'static str),
}

/// Every SASL mechanism (e.g. [SCRAM](crate::scram)) is expected to implement this trait.
pub(crate) trait Mechanism: Sized {
    /// What's produced as a result of successful authentication.
    type Output;

    /// Produce a server challenge to be sent to the client.
/// This is how this method is called in PostgreSQL (`libpq/sasl.h`).
///
/// Takes `self` by value: the mechanism is consumed by every step and
/// handed back inside `Step::Continue` when more messages are expected.
fn exchange(self, input: &str) -> Result>;
} // closes `trait Mechanism`

================================================ FILE: proxy/src/sasl/stream.rs ================================================
//! Abstraction for the string-oriented SASL protocols.

use std::io;

use tokio::io::{AsyncRead, AsyncWrite};

use super::{Mechanism, Step};
use crate::context::RequestContext;
use crate::pqproto::{BeAuthenticationSaslMessage, BeMessage};
use crate::stream::PqStream;

/// SASL authentication outcome.
/// It's much easier to match on those two variants
/// than to peek into a noisy protocol error type.
#[must_use = "caller must explicitly check for success"]
pub(crate) enum Outcome {
    /// Authentication succeeded and produced some value.
    Success(R),
    /// Authentication failed (reason attached).
    Failure(&'static str),
}

/// Runs a complete SASL exchange over `stream`.
///
/// `mechanism` is called once with the method name taken from the client's
/// first message and must return the mechanism to run. The function then
/// alternates `Mechanism::exchange` with reading the next client message
/// until the mechanism reports success or failure.
///
/// Returns `Err` for protocol or I/O errors and `Ok(Outcome)` for a
/// completed exchange, successful or not.
// NOTE(review): on success the final SASL message and `AuthenticationOk`
// are passed to `write_message`, but this function does not call `flush`
// afterwards — presumably the caller flushes; confirm.
pub async fn authenticate(
    ctx: &RequestContext,
    stream: &mut PqStream,
    mechanism: F,
) -> super::Result>
where
    S: AsyncRead + AsyncWrite + Unpin,
    F: FnOnce(&str) -> super::Result,
    M: Mechanism,
{
    let (mut mechanism, mut input) = {
        // pause the timer while we communicate with the client
        let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);

        // Initial client message contains the chosen auth method's name.
        let msg = stream.read_password_message().await?;

        let sasl = super::FirstMessage::parse(msg)
            .ok_or(super::Error::BadClientMessage("bad sasl message"))?;

        (mechanism(sasl.method)?, sasl.message)
    };

    loop {
        match mechanism.exchange(input) {
            Ok(Step::Continue(moved_mechanism, reply)) => {
                // `exchange` consumed the mechanism; take the next state back.
                mechanism = moved_mechanism;

                // write reply
                let sasl_msg = BeAuthenticationSaslMessage::Continue(reply.as_bytes());
                stream.write_message(BeMessage::AuthenticationSasl(sasl_msg));
                drop(reply);
            }
            Ok(Step::Success(result, reply)) => {
                // write reply
                let sasl_msg = BeAuthenticationSaslMessage::Final(reply.as_bytes());
                stream.write_message(BeMessage::AuthenticationSasl(sasl_msg));
                stream.write_message(BeMessage::AuthenticationOk);

                // exit with success
                break Ok(Outcome::Success(result));
            }
            // exit with failure
            Ok(Step::Failure(reason)) => break Ok(Outcome::Failure(reason)),
            Err(error) => {
                tracing::info!(?error, "error during SASL exchange");
                return Err(error);
            }
        }

        // pause the timer while we communicate with the client
        let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);

        // get next input
        stream.flush().await?;
        let msg = stream.read_password_message().await?;
        // Non-UTF-8 input is surfaced as an `InvalidData` I/O error.
        input = std::str::from_utf8(msg)
            .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "bad encoding"))?;
    }
}

================================================ FILE: proxy/src/scram/cache.rs ================================================
use tokio::time::Instant;
use zeroize::Zeroize as _;

use super::pbkdf2;
use crate::cache::Cached;
use crate::cache::common::{Cache, count_cache_insert, count_cache_outcome, eviction_listener};
use crate::intern::{EndpointIdInt, RoleNameInt};
use crate::metrics::{CacheKind, Metrics};

/// Cache of partial PBKDF2 results, keyed by (endpoint, role).
pub(crate) struct Pbkdf2Cache(moka::sync::Cache<(EndpointIdInt, RoleNameInt), Pbkdf2CacheEntry>);
/// A cache entry bundled with a handle back to the cache it came from.
pub(crate) type CachedPbkdf2<'a> = Cached<&'a Pbkdf2Cache>;

impl Cache for Pbkdf2Cache {
    type Key = (EndpointIdInt, RoleNameInt);
    type Value = Pbkdf2CacheEntry;

    // Removes the entry for the given (endpoint, role) key.
    fn invalidate(&self, info: &(EndpointIdInt, RoleNameInt)) {
        self.0.invalidate(info);
    }
}

/// To speed up password hashing for more active customers, we store the tail results of the
/// PBKDF2 algorithm. If the output of PBKDF2 is U1 ^ U2 ^ ⋯ ^ Uc, then we store
/// suffix = U17 ^ U18 ^ ⋯ ^ Uc. We only need to calculate U1 ^ U2 ^ ⋯ ^ U15 ^ U16
/// to determine the final result.
///
/// The suffix alone isn't enough to crack the password. The stored_key is still required.
/// While both are cached in memory, given they're in different locations it makes it much
/// harder to exploit, even if any such memory exploit exists in proxy.
#[derive(Clone)]
pub struct Pbkdf2CacheEntry {
    /// corresponds to [`super::ServerSecret::cached_at`]
    pub(super) cached_from: Instant,
    pub(super) suffix: pbkdf2::Block,
}

impl Drop for Pbkdf2CacheEntry {
    // Wipe the cached hash material when the entry is dropped.
    fn drop(&mut self) {
        self.suffix.zeroize();
    }
}

impl Pbkdf2Cache {
    /// Builds the cache with a capacity of `SIZE` and a time-to-live of
    /// `TTL`, publishes the capacity metric, and registers the eviction
    /// listener.
    pub fn new() -> Self {
        const SIZE: u64 = 100;
        const TTL: std::time::Duration = std::time::Duration::from_secs(60);

        let builder = moka::sync::Cache::builder()
            .name("pbkdf2")
            .max_capacity(SIZE)
            // We use time_to_live so we don't refresh the lifetime for an invalid password attempt.
            .time_to_live(TTL);

        Metrics::get()
            .cache
            .capacity
            .set(CacheKind::Pbkdf2, SIZE as i64);

        let builder =
            builder.eviction_listener(|_k, _v, cause| eviction_listener(CacheKind::Pbkdf2, cause));

        Self(builder.build())
    }

    /// Stores `value` under (endpoint, role) and counts the insert.
    pub fn insert(&self, endpoint: EndpointIdInt, role: RoleNameInt, value: Pbkdf2CacheEntry) {
        count_cache_insert(CacheKind::Pbkdf2);
        self.0.insert((endpoint, role), value);
    }

    // Plain lookup; the result is passed through `count_cache_outcome`.
    fn get(&self, endpoint: EndpointIdInt, role: RoleNameInt) -> Option {
        count_cache_outcome(CacheKind::Pbkdf2, self.0.get(&(endpoint, role)))
    }

    /// Looks up an entry and wraps it in `Cached`, together with a
    /// reference to this cache and the key it was found under.
    pub fn get_entry(
        &self,
        endpoint: EndpointIdInt,
        role: RoleNameInt,
    ) -> Option> {
        self.get(endpoint, role).map(|value| Cached {
            token: Some((self, (endpoint, role))),
            value,
        })
    }
}

================================================ FILE: proxy/src/scram/countmin.rs ================================================
use std::hash::Hash;

/// estimator of hash jobs per second.
///
pub(crate) struct CountMinSketch {
    // one for each depth
    hashers: Vec,
    width: usize,
    depth: usize,
    // buckets, width*depth
    buckets: Vec,
}

impl CountMinSketch {
    /// Given parameters (ε, δ),
    ///   set width = ceil(e/ε)
    ///   set depth = ceil(ln(1/δ))
    ///
    /// guarantees:
    /// actual <= estimate
    /// estimate <= actual + ε * N with probability 1 - δ
    /// where N is the sum of all counts added to the sketch
    // NOTE(review): `delta == 1.0` gives depth 0, in which case
    // `inc_and_return` never enters its loop and returns `u32::MAX`.
    pub(crate) fn with_params(epsilon: f64, delta: f64) -> Self {
        CountMinSketch::new(
            (std::f64::consts::E / epsilon).ceil() as usize,
            (1.0_f64 / delta).ln().ceil() as usize,
        )
    }

    // Allocates `width * depth` zeroed buckets and one hasher per row.
    // Test builds use fixed seeds so results are reproducible; other
    // builds use `RandomState::new()`.
    fn new(width: usize, depth: usize) -> Self {
        Self {
            #[cfg(test)]
            hashers: (0..depth)
                .map(|i| {
                    // digits of pi for good randomness
                    ahash::RandomState::with_seeds(
                        314159265358979323,
                        84626433832795028,
                        84197169399375105,
                        82097494459230781 + i as u64,
                    )
                })
                .collect(),
            #[cfg(not(test))]
            hashers: (0..depth).map(|_| ahash::RandomState::new()).collect(),
            width,
            depth,
            buckets: vec![0; width * depth],
        }
    }

    /// Adds `x` to the bucket selected for `t` in every row (saturating at
    /// `u32::MAX`) and returns the smallest of the updated buckets, which
    /// is the estimate for `t`. Passing `x == 0` reads the estimate without
    /// changing it.
    // NOTE(review): a sketch with `width == 0` would panic on the `%` below.
    pub(crate) fn inc_and_return(&mut self, t: &T, x: u32) -> u32 {
        let mut min = u32::MAX;
        for row in 0..self.depth {
            let col = (self.hashers[row].hash_one(t) as usize) % self.width;

            // Slice out this row's `width` buckets.
            let row = &mut self.buckets[row * self.width..][..self.width];
            row[col] = row[col].saturating_add(x);
            min = std::cmp::min(min, row[col]);
        }
        min
    }

    /// Sets every bucket back to zero, keeping dimensions and hashers.
    pub(crate) fn reset(&mut self) {
        self.buckets.clear();
        self.buckets.resize(self.width * self.depth, 0);
    }
}

#[cfg(test)]
mod tests {
    use rand::rngs::StdRng;
    use rand::seq::SliceRandom;
    use rand::{Rng, SeedableRng};

    use super::CountMinSketch;

    // Inserts `n` random ids with random counts and returns how many of the
    // resulting estimates are within `p` of the true count.
    fn eval_precision(n: usize, p: f64, q: f64) -> usize {
        // fixed value of phi for consistent test
        let mut rng = StdRng::seed_from_u64(16180339887498948482);

        #[allow(non_snake_case)]
        let mut N = 0;

        let mut ids = vec![];

        for _ in 0..n {
            // number to insert at once
            let n = rng.random_range(1..4096);
            // number of insert operations
            let m = rng.random_range(1..100);

            let id = uuid::Builder::from_random_bytes(rng.random()).into_uuid();
            ids.push((id, n, m));

            // N = sum(actual)
            N += n * m;
        }

        // q% of counts will be within p of the actual value
        let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);

        // insert a bunch of entries in a random order
        let mut ids2 = ids.clone();
        while !ids2.is_empty() {
            ids2.shuffle(&mut rng);
            // One insert per remaining id per pass; drop ids that are done.
            ids2.retain_mut(|id| {
                sketch.inc_and_return(&id.0, id.1);
                id.2 -= 1;
                id.2 > 0
            });
        }

        let mut within_p = 0;
        for (id, n, m) in ids {
            let actual = n * m;
            let estimate = sketch.inc_and_return(&id, 0);

            // This estimate has the guarantee that actual <= estimate
            assert!(actual <= estimate);

            // This estimate has the guarantee that estimate <= actual + εN with probability 1 - δ.
            // ε = p / N, δ = 1 - q;
            // therefore, estimate <= actual + p with probability q.
            if estimate as f64 <= actual as f64 + p {
                within_p += 1;
            }
        }
        within_p
    }

    #[test]
    fn precision() {
        assert_eq!(eval_precision(100, 100.0, 0.99), 100);
        assert_eq!(eval_precision(1000, 100.0, 0.99), 1000);
        assert_eq!(eval_precision(100, 4096.0, 0.99), 100);
        assert_eq!(eval_precision(1000, 4096.0, 0.99), 1000);

        // seems to be more precise than the literature indicates?
        // probably numbers are too small to truly represent the probabilities.
        assert_eq!(eval_precision(100, 4096.0, 0.90), 100);
        assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000);
        assert_eq!(eval_precision(100, 4096.0, 0.1), 100);
        assert_eq!(eval_precision(1000, 4096.0, 0.1), 978);
    }

    // returns memory usage in bytes, and the time complexity per insert.
    fn eval_cost(p: f64, q: f64) -> (usize, usize) {
        #[allow(non_snake_case)]
        // N = sum(actual)
        // Let's assume 1021 samples, all of 4096
        let N = 1021 * 4096;
        let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);

        let memory = size_of::() * sketch.buckets.len();
        let time = sketch.depth;
        (memory, time)
    }

    #[test]
    fn memory_usage() {
        assert_eq!(eval_cost(100.0, 0.99), (2273580, 5));
        assert_eq!(eval_cost(4096.0, 0.99), (55520, 5));
        assert_eq!(eval_cost(4096.0, 0.90), (33312, 3));
        assert_eq!(eval_cost(4096.0, 0.1), (11104, 1));
    }
}

================================================ FILE: proxy/src/scram/exchange.rs ================================================
//! Implementation of the SCRAM authentication algorithm.
use std::convert::Infallible;

use base64::Engine as _;
use base64::prelude::BASE64_STANDARD;
use tracing::{debug, trace};

use super::messages::{
    ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
};
use super::pbkdf2::Pbkdf2;
use super::secret::ServerSecret;
use super::signature::SignatureBuilder;
use super::threadpool::ThreadPool;
use super::{ScramKey, pbkdf2};
use crate::intern::{EndpointIdInt, RoleNameInt};
use crate::sasl::{self, ChannelBinding, Error as SaslError};
use crate::scram::cache::Pbkdf2CacheEntry;

/// The only channel binding mode we currently support.
#[derive(Debug)]
struct TlsServerEndPoint;

impl std::fmt::Display for TlsServerEndPoint {
    // Renders the binding type name exactly as `from_str` below accepts it.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "tls-server-end-point")
    }
}

impl std::str::FromStr for TlsServerEndPoint {
    type Err = sasl::Error;

    // Accepts exactly "tls-server-end-point"; any other name is reported
    // as `ChannelBindingBadMethod` carrying the offending string.
    fn from_str(s: &str) -> Result {
        match s {
            "tls-server-end-point" => Ok(TlsServerEndPoint),
            _ => Err(sasl::Error::ChannelBindingBadMethod(s.into())),
        }
    }
}

/// State kept between sending the server-first-message and receiving the
/// client-final-message.
struct SaslSentInner {
    cbind_flag: ChannelBinding,
    client_first_message_bare: String,
    server_first_message: OwnedServerFirstMessage,
}

/// Initial state: holds the function used to produce the server nonce.
struct SaslInitial {
    nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN],
}

enum ExchangeState {
    /// Waiting for [`ClientFirstMessage`].
    Initial(SaslInitial),
    /// Waiting for [`ClientFinalMessage`].
    SaltSent(SaslSentInner),
}

/// Server's side of SCRAM auth algorithm.
pub(crate) struct Exchange<'a> {
    state: ExchangeState,
    secret: &'a ServerSecret,
    tls_server_end_point: crate::tls::TlsServerEndPoint,
}

impl<'a> Exchange<'a> {
    /// Creates an exchange in its initial state (waiting for the
    /// client-first-message).
    pub(crate) fn new(
        secret: &'a ServerSecret,
        nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN],
        tls_server_end_point: crate::tls::TlsServerEndPoint,
    ) -> Self {
        Self {
            state: ExchangeState::Initial(SaslInitial { nonce }),
            secret,
            tls_server_end_point,
        }
    }
}

// Runs PBKDF2 over `password`/`salt` for `iterations` rounds as a job on
// `pool`. Despite the name, the result is the raw PBKDF2 block; the client
// key is derived from it later in `validate_pbkdf2`.
async fn derive_client_key(
    pool: &ThreadPool,
    endpoint: EndpointIdInt,
    password: &[u8],
    salt: &[u8],
    iterations: u32,
) -> pbkdf2::Block {
    pool.spawn_job(endpoint, Pbkdf2::start(password, salt, iterations))
        .await
}

/// For cleartext flow, we need to derive the client key to
/// 1. authenticate the client.
/// 2. authenticate with compute.
///
/// Secrets with more than `CACHED_ROUNDS` iterations go through the cached
/// path; all others are hashed in full every time. Returns `Err` only when
/// the stored salt is not valid base64.
pub(crate) async fn exchange(
    pool: &ThreadPool,
    endpoint: EndpointIdInt,
    role: RoleNameInt,
    secret: &ServerSecret,
    password: &[u8],
) -> sasl::Result> {
    if secret.iterations > CACHED_ROUNDS {
        exchange_with_cache(pool, endpoint, role, secret, password).await
    } else {
        let salt = BASE64_STANDARD.decode(&*secret.salt_base64)?;
        let hash = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await;
        Ok(validate_pbkdf2(secret, &hash))
    }
}

/// Compute the client key using a cache. We cache the suffix of the pbkdf2 result only,
/// which is not enough by itself to perform an offline brute force.
async fn exchange_with_cache(
    pool: &ThreadPool,
    endpoint: EndpointIdInt,
    role: RoleNameInt,
    secret: &ServerSecret,
    password: &[u8],
) -> sasl::Result> {
    let salt = BASE64_STANDARD.decode(&*secret.salt_base64)?;

    debug_assert!(
        secret.iterations > CACHED_ROUNDS,
        "we should not cache password data if there isn't enough rounds needed"
    );

    // compute the prefix of the pbkdf2 output.
    let prefix = derive_client_key(pool, endpoint, password, &salt, CACHED_ROUNDS).await;

    if let Some(entry) = pool.cache.get_entry(endpoint, role) {
        // hot path: let's check the threadpool cache
        if secret.cached_at == entry.cached_from {
            // cache is valid. compute the full hash by adding the prefix to the suffix.
            let mut hash = prefix;
            pbkdf2::xor_assign(&mut hash, &entry.suffix);
            let outcome = validate_pbkdf2(secret, &hash);

            if matches!(outcome, sasl::Outcome::Success(_)) {
                trace!("password validated from cache");
            }

            // A valid cache entry decides the outcome either way: a
            // mismatch is returned as a failure without a full re-hash.
            return Ok(outcome);
        }

        // cached key is no longer valid.
        debug!("invalidating cached password");
        entry.invalidate();
    }

    // slow path: full password hash.
    let hash = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await;
    let outcome = validate_pbkdf2(secret, &hash);

    // Only successful validations are cached.
    let client_key = match outcome {
        sasl::Outcome::Success(client_key) => client_key,
        sasl::Outcome::Failure(_) => return Ok(outcome),
    };

    trace!("storing cached password");

    // time to cache, compute the suffix by subtracting the prefix from the hash.
    let mut suffix = hash;
    pbkdf2::xor_assign(&mut suffix, &prefix);

    pool.cache.insert(
        endpoint,
        role,
        Pbkdf2CacheEntry {
            cached_from: secret.cached_at,
            suffix,
        },
    );

    Ok(sasl::Outcome::Success(client_key))
}

// Derives the client key from a finished PBKDF2 block and checks it
// against `secret`; the key is returned on success.
fn validate_pbkdf2(secret: &ServerSecret, hash: &pbkdf2::Block) -> sasl::Outcome {
    let client_key = super::ScramKey::client_key(&(*hash).into());
    if secret.is_password_invalid(&client_key).into() {
        sasl::Outcome::Failure("password doesn't match")
    } else {
        sasl::Outcome::Success(client_key)
    }
}

// Number of leading PBKDF2 rounds that are always recomputed; only the
// contribution of the rounds after these is cached.
const CACHED_ROUNDS: u32 = 16;

impl SaslInitial {
    // Handles the client-first-message: validates it, enforces the
    // channel-binding downgrade check, and builds the server-first-message
    // together with the state needed for the final step.
    fn transition(
        &self,
        secret: &ServerSecret,
        tls_server_end_point: &crate::tls::TlsServerEndPoint,
        input: &str,
    ) -> sasl::Result> {
        let client_first_message = ClientFirstMessage::parse(input)
            .ok_or(SaslError::BadClientMessage("invalid client-first-message"))?;

        // If the flag is set to "y" and the server supports channel
        // binding, the server MUST fail authentication
        if client_first_message.cbind_flag == ChannelBinding::NotSupportedServer
            && tls_server_end_point.supported()
        {
            return Err(SaslError::ChannelBindingFailed("SCRAM-PLUS not used"));
        }

        let server_first_message = client_first_message.build_server_first_message(
            &(self.nonce)(),
            &secret.salt_base64,
            secret.iterations,
        );
        let msg = server_first_message.as_str().to_owned();

        let next = SaslSentInner {
            // Fails here if the client named an unsupported binding type.
            cbind_flag: client_first_message.cbind_flag.and_then(str::parse)?,
            client_first_message_bare: client_first_message.bare.to_owned(),
            server_first_message,
        };

        Ok(sasl::Step::Continue(next, msg))
    }
}

impl SaslSentInner {
    // Handles the client-final-message: checks channel binding data and
    // the combined nonce, recovers the client key from the proof, and
    // verifies it against the stored secret.
    fn transition(
        &self,
        secret: &ServerSecret,
        tls_server_end_point: &crate::tls::TlsServerEndPoint,
        input: &str,
    ) -> sasl::Result> {
        let Self {
            cbind_flag,
            client_first_message_bare,
            server_first_message,
        } = self;

        let client_final_message = ClientFinalMessage::parse(input)
            .ok_or(SaslError::BadClientMessage("invalid client-final-message"))?;

        // Recompute what the client should have sent as `c=`.
        let channel_binding = cbind_flag.encode(|_| match tls_server_end_point {
            crate::tls::TlsServerEndPoint::Sha256(x) => Ok(x),
            crate::tls::TlsServerEndPoint::Undefined => Err(SaslError::MissingBinding),
        })?;

        // This might've been caused by a MITM attack
        if client_final_message.channel_binding != channel_binding {
            return Err(SaslError::ChannelBindingFailed(
                "insecure connection: secure channel data mismatch",
            ));
        }

        if client_final_message.nonce != server_first_message.nonce() {
            return Err(SaslError::BadClientMessage("combined nonce doesn't match"));
        }

        let signature_builder = SignatureBuilder {
            client_first_message_bare,
            server_first_message: server_first_message.as_str(),
            client_final_message_without_proof: client_final_message.without_proof,
        };

        let client_key = signature_builder
            .build(&secret.stored_key)
            .derive_client_key(&client_final_message.proof);

        // Auth fails either if keys don't match or it's pre-determined to fail.
        if secret.is_password_invalid(&client_key).into() {
            return Ok(sasl::Step::Failure("password doesn't match"));
        }

        let msg =
            client_final_message.build_server_final_message(signature_builder, &secret.server_key);

        Ok(sasl::Step::Success(client_key, msg))
    }
}

impl sasl::Mechanism for Exchange<'_> {
    type Output = super::ScramKey;

    // Two-step state machine: the first call consumes the
    // client-first-message and moves to `SaltSent`; the second consumes the
    // client-final-message and finishes with success or failure.
    fn exchange(mut self, input: &str) -> sasl::Result> {
        use ExchangeState;
        use sasl::Step;
        match &self.state {
            ExchangeState::Initial(init) => {
                match init.transition(self.secret, &self.tls_server_end_point, input)? {
                    Step::Continue(sent, msg) => {
                        self.state = ExchangeState::SaltSent(sent);
                        Ok(Step::Continue(self, msg))
                    }
                    Step::Failure(msg) => Ok(Step::Failure(msg)),
                }
            }
            ExchangeState::SaltSent(sent) => {
                match sent.transition(self.secret, &self.tls_server_end_point, input)? {
                    Step::Success(keys, msg) => Ok(Step::Success(keys, msg)),
                    Step::Failure(msg) => Ok(Step::Failure(msg)),
                }
            }
        }
    }
}

================================================ FILE: proxy/src/scram/key.rs ================================================
//! Tools for client/server/stored key management.

use hmac::Mac as _;
use sha2::Digest as _;
use subtle::ConstantTimeEq;
use zeroize::Zeroize as _;

use crate::metrics::Metrics;
use crate::scram::pbkdf2::Prf;

/// Faithfully taken from PostgreSQL.
pub(crate) const SCRAM_KEY_LEN: usize = 32;

/// One of the keys derived from the user's password.
/// We use the same structure for all keys, i.e.
/// `ClientKey`, `StoredKey`, and `ServerKey`.
#[derive(Clone, Default, Eq, Debug)] #[repr(transparent)] pub(crate) struct ScramKey { bytes: [u8; SCRAM_KEY_LEN], } impl Drop for ScramKey { fn drop(&mut self) { self.bytes.zeroize(); } } impl PartialEq for ScramKey { fn eq(&self, other: &Self) -> bool { self.ct_eq(other).into() } } impl ConstantTimeEq for ScramKey { fn ct_eq(&self, other: &Self) -> subtle::Choice { self.bytes.ct_eq(&other.bytes) } } impl ScramKey { pub(crate) fn sha256(&self) -> Self { Metrics::get().proxy.sha_rounds.inc_by(1); Self { bytes: sha2::Sha256::digest(self.as_bytes()).into(), } } pub(crate) fn as_bytes(&self) -> [u8; SCRAM_KEY_LEN] { self.bytes } pub(crate) fn client_key(b: &[u8; 32]) -> Self { // Prf::new_from_slice will run 2 sha256 rounds. // Update + Finalize run 2 sha256 rounds. Metrics::get().proxy.sha_rounds.inc_by(4); let mut prf = Prf::new_from_slice(b).expect("HMAC is able to accept all key sizes"); prf.update(b"Client Key"); let client_key: [u8; 32] = prf.finalize().into_bytes().into(); client_key.into() } } impl From<[u8; SCRAM_KEY_LEN]> for ScramKey { #[inline(always)] fn from(bytes: [u8; SCRAM_KEY_LEN]) -> Self { Self { bytes } } } impl AsRef<[u8]> for ScramKey { #[inline(always)] fn as_ref(&self) -> &[u8] { &self.bytes } } ================================================ FILE: proxy/src/scram/messages.rs ================================================ //! Definitions for SCRAM messages. use std::fmt; use std::ops::Range; use base64::Engine as _; use base64::prelude::BASE64_STANDARD; use super::base64_decode_array; use super::key::{SCRAM_KEY_LEN, ScramKey}; use super::signature::SignatureBuilder; use crate::sasl::ChannelBinding; /// Faithfully taken from PostgreSQL. pub(crate) const SCRAM_RAW_NONCE_LEN: usize = 18; /// Although we ignore all extensions, we still have to validate the message. 
fn validate_sasl_extensions<'a>(parts: impl Iterator) -> Option<()> { for mut chars in parts.map(|s| s.chars()) { let attr = chars.next()?; if !attr.is_ascii_alphabetic() { return None; } let eq = chars.next()?; if eq != '=' { return None; } } Some(()) } #[derive(Debug)] pub(crate) struct ClientFirstMessage<'a> { /// `client-first-message-bare`. pub(crate) bare: &'a str, /// Channel binding mode. pub(crate) cbind_flag: ChannelBinding<&'a str>, /// Client nonce. pub(crate) nonce: &'a str, } impl<'a> ClientFirstMessage<'a> { // NB: FromStr doesn't work with lifetimes pub(crate) fn parse(input: &'a str) -> Option { let mut parts = input.split(','); let cbind_flag = ChannelBinding::parse(parts.next()?)?; // PG doesn't support authorization identity, // so we don't bother defining GS2 header type let authzid = parts.next()?; if !authzid.is_empty() { return None; } // Unfortunately, `parts.as_str()` is unstable let pos = authzid.as_ptr() as usize - input.as_ptr() as usize + 1; let (_, bare) = input.split_at(pos); // In theory, these might be preceded by "reserved-mext" (i.e. "m=") let username = parts.next()?.strip_prefix("n=")?; // https://github.com/postgres/postgres/blob/f83908798f78c4cafda217ca875602c88ea2ae28/src/backend/libpq/auth-scram.c#L13-L14 if !username.is_empty() { tracing::warn!(username, "scram username provided, but is not expected"); // TODO(conrad): // return None; } let nonce = parts.next()?.strip_prefix("r=")?; // Validate but ignore auth extensions validate_sasl_extensions(parts)?; Some(Self { bare, cbind_flag, nonce, }) } /// Build a response to [`ClientFirstMessage`]. 
pub(crate) fn build_server_first_message( &self, nonce: &[u8; SCRAM_RAW_NONCE_LEN], salt_base64: &str, iterations: u32, ) -> OwnedServerFirstMessage { let mut message = String::with_capacity(128); message.push_str("r="); // write combined nonce let combined_nonce_start = message.len(); message.push_str(self.nonce); BASE64_STANDARD.encode_string(nonce, &mut message); let combined_nonce = combined_nonce_start..message.len(); // write salt and iterations message.push_str(",s="); message.push_str(salt_base64); message.push_str(",i="); message.push_str(itoa::Buffer::new().format(iterations)); // This design guarantees that it's impossible to create a // server-first-message without receiving a client-first-message OwnedServerFirstMessage { message, nonce: combined_nonce, } } } #[derive(Debug)] pub(crate) struct ClientFinalMessage<'a> { /// `client-final-message-without-proof`. pub(crate) without_proof: &'a str, /// Channel binding data (base64). pub(crate) channel_binding: &'a str, /// Combined client & server nonce. pub(crate) nonce: &'a str, /// Client auth proof. pub(crate) proof: [u8; SCRAM_KEY_LEN], } impl<'a> ClientFinalMessage<'a> { // NB: FromStr doesn't work with lifetimes pub(crate) fn parse(input: &'a str) -> Option { let (without_proof, proof) = input.rsplit_once(',')?; let mut parts = without_proof.split(','); let channel_binding = parts.next()?.strip_prefix("c=")?; let nonce = parts.next()?.strip_prefix("r=")?; // Validate but ignore auth extensions validate_sasl_extensions(parts)?; let proof = base64_decode_array(proof.strip_prefix("p=")?)?; Some(Self { without_proof, channel_binding, nonce, proof, }) } /// Build a response to [`ClientFinalMessage`]. 
pub(crate) fn build_server_final_message( &self, signature_builder: SignatureBuilder<'_>, server_key: &ScramKey, ) -> String { let mut buf = String::from("v="); BASE64_STANDARD.encode_string(signature_builder.build(server_key), &mut buf); buf } } /// We need to keep a convenient representation of this /// message for the next authentication step. pub(crate) struct OwnedServerFirstMessage { /// Owned `server-first-message`. message: String, /// Slice into `message`. nonce: Range, } impl OwnedServerFirstMessage { /// Extract combined nonce from the message. #[inline(always)] pub(crate) fn nonce(&self) -> &str { &self.message[self.nonce.clone()] } /// Get reference to a text representation of the message. #[inline(always)] pub(crate) fn as_str(&self) -> &str { &self.message } } impl fmt::Debug for OwnedServerFirstMessage { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("ServerFirstMessage") .field("message", &self.as_str()) .field("nonce", &self.nonce()) .finish() } } #[cfg(test)] mod tests { use super::*; #[test] fn parse_client_first_message() { use ChannelBinding::*; // (Almost) real strings captured during debug sessions let cases = [ (NotSupportedClient, "n,,n=,r=t8JwklwKecDLwSsA72rHmVju"), (NotSupportedServer, "y,,n=,r=t8JwklwKecDLwSsA72rHmVju"), ( Required("tls-server-end-point"), "p=tls-server-end-point,,n=,r=t8JwklwKecDLwSsA72rHmVju", ), ]; for (cb, input) in cases { let msg = ClientFirstMessage::parse(input).unwrap(); assert_eq!(msg.bare, "n=,r=t8JwklwKecDLwSsA72rHmVju"); assert_eq!(msg.nonce, "t8JwklwKecDLwSsA72rHmVju"); assert_eq!(msg.cbind_flag, cb); } } #[test] fn parse_client_first_message_with_invalid_gs2_authz() { assert!(ClientFirstMessage::parse("n,authzid,n=,r=nonce").is_none()); } #[test] fn parse_client_first_message_with_extra_params() { let msg = ClientFirstMessage::parse("n,,n=,r=nonce,a=foo,b=bar,c=baz").unwrap(); assert_eq!(msg.bare, "n=,r=nonce,a=foo,b=bar,c=baz"); assert_eq!(msg.nonce, "nonce"); 
assert_eq!(msg.cbind_flag, ChannelBinding::NotSupportedClient); } #[test] fn parse_client_first_message_with_extra_params_invalid() { // must be of the form `=<...>` assert!(ClientFirstMessage::parse("n,,n=,r=nonce,abc=foo").is_none()); assert!(ClientFirstMessage::parse("n,,n=,r=nonce,1=foo").is_none()); assert!(ClientFirstMessage::parse("n,,n=,r=nonce,a").is_none()); } #[test] fn parse_client_final_message() { let input = [ "c=eSws", "r=iiYEfS3rOgn8S3rtpSdrOsHtPLWvIkdgmHxA0hf3JNOAG4dU", "p=SRpfsIVS4Gk11w1LqQ4QvCUBZYQmqXNSDEcHqbQ3CHI=", ] .join(","); let msg = ClientFinalMessage::parse(&input).unwrap(); assert_eq!( msg.without_proof, "c=eSws,r=iiYEfS3rOgn8S3rtpSdrOsHtPLWvIkdgmHxA0hf3JNOAG4dU" ); assert_eq!( msg.nonce, "iiYEfS3rOgn8S3rtpSdrOsHtPLWvIkdgmHxA0hf3JNOAG4dU" ); assert_eq!( BASE64_STANDARD.encode(msg.proof), "SRpfsIVS4Gk11w1LqQ4QvCUBZYQmqXNSDEcHqbQ3CHI=" ); } } ================================================ FILE: proxy/src/scram/mod.rs ================================================ //! Salted Challenge Response Authentication Mechanism. //! //! RFC: . //! //! Reference implementation: //! * //! * mod cache; mod countmin; mod exchange; mod key; mod messages; mod pbkdf2; mod secret; mod signature; pub mod threadpool; use base64::Engine as _; use base64::prelude::BASE64_STANDARD; pub(crate) use exchange::{Exchange, exchange}; pub(crate) use key::ScramKey; pub(crate) use secret::ServerSecret; const SCRAM_SHA_256: &str = "SCRAM-SHA-256"; const SCRAM_SHA_256_PLUS: &str = "SCRAM-SHA-256-PLUS"; /// A list of supported SCRAM methods. 
pub(crate) const METHODS: &[&str] = &[SCRAM_SHA_256_PLUS, SCRAM_SHA_256]; pub(crate) const METHODS_WITHOUT_PLUS: &[&str] = &[SCRAM_SHA_256]; /// Decode base64 into array without any heap allocations fn base64_decode_array(input: impl AsRef<[u8]>) -> Option<[u8; N]> { let mut bytes = [0u8; N]; let size = BASE64_STANDARD.decode_slice(input, &mut bytes).ok()?; if size != N { return None; } Some(bytes) } #[cfg(test)] mod tests { use super::threadpool::ThreadPool; use super::{Exchange, ServerSecret}; use crate::intern::{EndpointIdInt, RoleNameInt}; use crate::sasl::{Mechanism, Step}; use crate::types::{EndpointId, RoleName}; #[test] fn snapshot() { let iterations = 4096; let salt = "QSXCR+Q6sek8bf92"; let stored_key = "FO+9jBb3MUukt6jJnzjPZOWc5ow/Pu6JtPyju0aqaE8="; let server_key = "qxJ1SbmSAi5EcS0J5Ck/cKAm/+Ixa+Kwp63f4OHDgzo="; let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}",); let secret = ServerSecret::parse(&secret).unwrap(); const NONCE: [u8; 18] = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ]; let mut exchange = Exchange::new(&secret, || NONCE, crate::tls::TlsServerEndPoint::Undefined); let client_first = "n,,n=user,r=rOprNGfwEbeRWgbNEkqO"; let client_final = "c=biws,r=rOprNGfwEbeRWgbNEkqOAQIDBAUGBwgJCgsMDQ4PEBES,p=rw1r5Kph5ThxmaUBC2GAQ6MfXbPnNkFiTIvdb/Rear0="; let server_first = "r=rOprNGfwEbeRWgbNEkqOAQIDBAUGBwgJCgsMDQ4PEBES,s=QSXCR+Q6sek8bf92,i=4096"; let server_final = "v=qtUDIofVnIhM7tKn93EQUUt5vgMOldcDVu1HC+OH0o0="; exchange = match exchange.exchange(client_first).unwrap() { Step::Continue(exchange, message) => { assert_eq!(message, server_first); exchange } Step::Success(_, _) => panic!("expected continue, got success"), Step::Failure(f) => panic!("{f}"), }; let key = match exchange.exchange(client_final).unwrap() { Step::Success(key, message) => { assert_eq!(message, server_final); key } Step::Continue(_, _) => panic!("expected success, got continue"), Step::Failure(f) => panic!("{f}"), }; assert_eq!( 
key.as_bytes(), [ 74, 103, 1, 132, 12, 31, 200, 48, 28, 54, 82, 232, 207, 12, 138, 189, 40, 32, 134, 27, 125, 170, 232, 35, 171, 167, 166, 41, 70, 228, 182, 112, ] ); } async fn check( pool: &ThreadPool, scram_secret: &ServerSecret, password: &[u8], ) -> Result<(), &'static str> { let ep = EndpointId::from("foo"); let ep = EndpointIdInt::from(ep); let role = RoleName::from("user"); let role = RoleNameInt::from(&role); let outcome = super::exchange(pool, ep, role, scram_secret, password) .await .unwrap(); match outcome { crate::sasl::Outcome::Success(_) => Ok(()), crate::sasl::Outcome::Failure(r) => Err(r), } } async fn run_round_trip_test(server_password: &str, client_password: &str) { let pool = ThreadPool::new(1); let scram_secret = ServerSecret::build(server_password).await.unwrap(); check(&pool, &scram_secret, client_password.as_bytes()) .await .unwrap(); } #[tokio::test] async fn round_trip() { run_round_trip_test("pencil", "pencil").await; } #[tokio::test] #[should_panic(expected = "password doesn't match")] async fn failure() { run_round_trip_test("pencil", "eraser").await; } #[tokio::test] #[tracing_test::traced_test] async fn password_cache() { let pool = ThreadPool::new(1); let scram_secret = ServerSecret::build("password").await.unwrap(); // wrong passwords are not added to cache check(&pool, &scram_secret, b"wrong").await.unwrap_err(); assert!(!logs_contain("storing cached password")); // correct passwords get cached check(&pool, &scram_secret, b"password").await.unwrap(); assert!(logs_contain("storing cached password")); // wrong passwords do not match the cache check(&pool, &scram_secret, b"wrong").await.unwrap_err(); assert!(!logs_contain("password validated from cache")); // correct passwords match the cache check(&pool, &scram_secret, b"password").await.unwrap(); assert!(logs_contain("password validated from cache")); } } ================================================ FILE: proxy/src/scram/pbkdf2.rs 
================================================ //! For postgres password authentication, we need to perform a PBKDF2 using //! PRF=HMAC-SHA2-256, producing only 1 block (32 bytes) of output key. use hmac::Mac as _; use hmac::digest::consts::U32; use hmac::digest::generic_array::GenericArray; use zeroize::Zeroize as _; use crate::metrics::Metrics; /// The Psuedo-random function used during PBKDF2 and the SCRAM-SHA-256 handshake. pub type Prf = hmac::Hmac; pub(crate) type Block = GenericArray; pub(crate) struct Pbkdf2 { hmac: Prf, /// U{r-1} for whatever iteration r we are currently on. prev: Block, /// the output of `fold(xor, U{1}..U{r})` for whatever iteration r we are currently on. hi: Block, /// number of iterations left iterations: u32, } impl Drop for Pbkdf2 { fn drop(&mut self) { self.prev.zeroize(); self.hi.zeroize(); } } // inspired from impl Pbkdf2 { pub(crate) fn start(pw: &[u8], salt: &[u8], iterations: u32) -> Self { // key the HMAC and derive the first block in-place let mut hmac = Prf::new_from_slice(pw).expect("HMAC is able to accept all key sizes"); // U1 = PRF(Password, Salt + INT_32_BE(i)) // i = 1 since we only need 1 block of output. hmac.update(salt); hmac.update(&1u32.to_be_bytes()); let init_block = hmac.finalize_reset().into_bytes(); // Prf::new_from_slice will run 2 sha256 rounds. // Our update + finalize run 2 sha256 rounds for each pbkdf2 round. Metrics::get().proxy.sha_rounds.inc_by(4); Self { hmac, // one iteration spent above iterations: iterations - 1, hi: init_block, prev: init_block, } } pub(crate) fn cost(&self) -> u32 { (self.iterations).clamp(0, 4096) } /// For "fairness", we implement PBKDF2 with cooperative yielding, which is why we use this `turn` /// function that only executes a fixed number of iterations before continuing. /// /// Task must be rescheuled if this returns [`std::task::Poll::Pending`]. 
pub(crate) fn turn(&mut self) -> std::task::Poll { let Self { hmac, prev, hi, iterations, } = self; // only do up to 4096 iterations per turn for fairness let n = (*iterations).clamp(0, 4096); for _ in 0..n { let next = single_round(hmac, prev); xor_assign(hi, &next); *prev = next; } // Our update + finalize run 2 sha256 rounds for each pbkdf2 round. Metrics::get().proxy.sha_rounds.inc_by(2 * n as u64); *iterations -= n; if *iterations == 0 { std::task::Poll::Ready(*hi) } else { std::task::Poll::Pending } } } #[inline(always)] pub fn xor_assign(x: &mut Block, y: &Block) { for (x, &y) in std::iter::zip(x, y) { *x ^= y; } } #[inline(always)] fn single_round(prf: &mut Prf, ui: &Block) -> Block { // Ui = PRF(Password, Ui-1) prf.update(ui); prf.finalize_reset().into_bytes() } #[cfg(test)] mod tests { use pbkdf2::pbkdf2_hmac_array; use sha2::Sha256; use super::Pbkdf2; #[test] fn works() { let salt = b"sodium chloride"; let pass = b"Ne0n_!5_50_C007"; let mut job = Pbkdf2::start(pass, salt, 60000); let hash: [u8; 32] = loop { let std::task::Poll::Ready(hash) = job.turn() else { continue; }; break hash.into(); }; let expected = pbkdf2_hmac_array::(pass, salt, 60000); assert_eq!(hash, expected); } } ================================================ FILE: proxy/src/scram/secret.rs ================================================ //! Tools for SCRAM server secret management. use base64::Engine as _; use base64::prelude::BASE64_STANDARD; use subtle::{Choice, ConstantTimeEq}; use tokio::time::Instant; use super::base64_decode_array; use super::key::ScramKey; /// Server secret is produced from user's password, /// and is used throughout the authentication process. #[derive(Clone, Eq, PartialEq, Debug)] pub(crate) struct ServerSecret { /// When this secret was cached. pub(crate) cached_at: Instant, /// Number of iterations for `PBKDF2` function. pub(crate) iterations: u32, /// Salt used to hash user's password. pub(crate) salt_base64: Box, /// Hashed `ClientKey`. 
pub(crate) stored_key: ScramKey, /// Used by client to verify server's signature. pub(crate) server_key: ScramKey, /// Should auth fail no matter what? /// This is exactly the case for mocked secrets. pub(crate) doomed: bool, } impl ServerSecret { pub(crate) fn parse(input: &str) -> Option { // SCRAM-SHA-256$:$: let s = input.strip_prefix("SCRAM-SHA-256$")?; let (params, keys) = s.split_once('$')?; let ((iterations, salt), (stored_key, server_key)) = params.split_once(':').zip(keys.split_once(':'))?; let secret = ServerSecret { cached_at: Instant::now(), iterations: iterations.parse().ok()?, salt_base64: salt.into(), stored_key: base64_decode_array(stored_key)?.into(), server_key: base64_decode_array(server_key)?.into(), doomed: false, }; Some(secret) } pub(crate) fn is_password_invalid(&self, client_key: &ScramKey) -> Choice { // constant time to not leak partial key match client_key.sha256().ct_ne(&self.stored_key) | Choice::from(self.doomed as u8) } /// To avoid revealing information to an attacker, we use a /// mocked server secret even if the user doesn't exist. /// See `auth-scram.c : mock_scram_secret` for details. pub(crate) fn mock(nonce: [u8; 32]) -> Self { Self { cached_at: Instant::now(), // this doesn't reveal much information as we're going to use // iteration count 1 for our generated passwords going forward. // PG16 users can set iteration count=1 already today. iterations: 1, salt_base64: BASE64_STANDARD.encode(nonce).into_boxed_str(), stored_key: ScramKey::default(), server_key: ScramKey::default(), doomed: true, } } /// Build a new server secret from the prerequisites. /// XXX: We only use this function in tests. 
#[cfg(test)] pub(crate) async fn build(password: &str) -> Option { Self::parse(&postgres_protocol::password::scram_sha_256(password.as_bytes()).await) } } #[cfg(test)] mod tests { use super::*; #[test] fn parse_scram_secret() { let iterations = 4096; let salt = "+/tQQax7twvwTj64mjBsxQ=="; let stored_key = "D5h6KTMBlUvDJk2Y8ELfC1Sjtc6k9YHjRyuRZyBNJns="; let server_key = "Pi3QHbcluX//NDfVkKlFl88GGzlJ5LkyPwcdlN/QBvI="; let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}"); let parsed = ServerSecret::parse(&secret).unwrap(); assert_eq!(parsed.iterations, iterations); assert_eq!(&*parsed.salt_base64, salt); assert_eq!(BASE64_STANDARD.encode(parsed.stored_key), stored_key); assert_eq!(BASE64_STANDARD.encode(parsed.server_key), server_key); } } ================================================ FILE: proxy/src/scram/signature.rs ================================================ //! Tools for client/server signature management. use hmac::Mac as _; use super::key::{SCRAM_KEY_LEN, ScramKey}; use crate::metrics::Metrics; use crate::scram::pbkdf2::Prf; /// A collection of message parts needed to derive the client's signature. #[derive(Debug)] pub(crate) struct SignatureBuilder<'a> { pub(crate) client_first_message_bare: &'a str, pub(crate) server_first_message: &'a str, pub(crate) client_final_message_without_proof: &'a str, } impl SignatureBuilder<'_> { pub(crate) fn build(&self, key: &ScramKey) -> Signature { // don't know exactly. 
this is a rough approx Metrics::get().proxy.sha_rounds.inc_by(8); let mut mac = Prf::new_from_slice(key.as_ref()).expect("HMAC accepts all key sizes"); mac.update(self.client_first_message_bare.as_bytes()); mac.update(b","); mac.update(self.server_first_message.as_bytes()); mac.update(b","); mac.update(self.client_final_message_without_proof.as_bytes()); Signature { bytes: mac.finalize().into_bytes().into(), } } } /// A computed value which, when xored with `ClientProof`, /// produces `ClientKey` that we need for authentication. #[derive(Debug)] #[repr(transparent)] pub(crate) struct Signature { bytes: [u8; SCRAM_KEY_LEN], } impl Signature { /// Derive `ClientKey` from client's signature and proof. pub(crate) fn derive_client_key(&self, proof: &[u8; SCRAM_KEY_LEN]) -> ScramKey { // This is how the proof is calculated: // // 1. sha256(ClientKey) -> StoredKey // 2. hmac_sha256(StoredKey, [messages...]) -> ClientSignature // 3. ClientKey ^ ClientSignature -> ClientProof // // Step 3 implies that we can restore ClientKey from the proof // by xoring the latter with the ClientSignature. Afterwards we // can check that the presumed ClientKey meets our expectations. let mut signature = self.bytes; for (i, x) in proof.iter().enumerate() { signature[i] ^= x; } signature.into() } } impl From<[u8; SCRAM_KEY_LEN]> for Signature { fn from(bytes: [u8; SCRAM_KEY_LEN]) -> Self { Self { bytes } } } impl AsRef<[u8]> for Signature { fn as_ref(&self) -> &[u8] { &self.bytes } } ================================================ FILE: proxy/src/scram/threadpool.rs ================================================ //! Custom threadpool implementation for password hashing. //! //! Requirements: //! 1. Fairness per endpoint. //! 2. Yield support for high iteration counts. 
use std::cell::RefCell; use std::future::Future; use std::pin::Pin; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; use std::task::{Context, Poll}; use futures::FutureExt; use rand::rngs::SmallRng; use rand::{Rng, SeedableRng}; use super::cache::Pbkdf2Cache; use super::pbkdf2; use super::pbkdf2::Pbkdf2; use crate::intern::EndpointIdInt; use crate::metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}; use crate::scram::countmin::CountMinSketch; pub struct ThreadPool { runtime: Option, pub metrics: Arc, // we hash a lot of passwords. // we keep a cache of partial hashes for faster validation. pub(super) cache: Pbkdf2Cache, } /// How often to reset the sketch values const SKETCH_RESET_INTERVAL: u64 = 1021; thread_local! { static STATE: RefCell> = const { RefCell::new(None) }; } impl ThreadPool { pub fn new(mut n_workers: u8) -> Arc { // rayon would be nice here, but yielding in rayon does not work well afaict. if n_workers == 0 { n_workers = 1; } Arc::new_cyclic(|pool| { let pool = pool.clone(); let worker_id = AtomicUsize::new(0); let runtime = tokio::runtime::Builder::new_multi_thread() .worker_threads(n_workers as usize) .on_thread_start(move || { STATE.with_borrow_mut(|state| { *state = Some(ThreadRt { pool: pool.clone(), id: ThreadPoolWorkerId(worker_id.fetch_add(1, Ordering::Relaxed)), rng: SmallRng::from_os_rng(), // used to determine whether we should temporarily skip tasks for fairness. 
// 99% of estimates will overcount by no more than 4096 samples countmin: CountMinSketch::with_params( 1.0 / (SKETCH_RESET_INTERVAL as f64), 0.01, ), tick: 0, }); }); }) .build() .expect("password threadpool runtime should be configured correctly"); Self { runtime: Some(runtime), metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), cache: Pbkdf2Cache::new(), } }) } pub(crate) fn spawn_job(&self, endpoint: EndpointIdInt, pbkdf2: Pbkdf2) -> JobHandle { JobHandle( self.runtime .as_ref() .expect("runtime is always set") .spawn(JobSpec { pbkdf2, endpoint }), ) } } impl Drop for ThreadPool { fn drop(&mut self) { self.runtime .take() .expect("runtime is always set") .shutdown_background(); } } struct ThreadRt { pool: Weak, id: ThreadPoolWorkerId, rng: SmallRng, countmin: CountMinSketch, tick: u64, } impl ThreadRt { fn should_run(&mut self, job: &JobSpec) -> bool { let rate = self .countmin .inc_and_return(&job.endpoint, job.pbkdf2.cost()); const P: f64 = 2000.0; // probability decreases as rate increases. // lower probability, higher chance of being skipped // // estimates (rate in terms of 4096 rounds): // rate = 0 => probability = 100% // rate = 10 => probability = 71.3% // rate = 50 => probability = 62.1% // rate = 500 => probability = 52.3% // rate = 1021 => probability = 49.8% // // My expectation is that the pool queue will only begin backing up at ~1000rps // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above // are in requests per second. 
let probability = P.ln() / (P + rate as f64).ln(); self.rng.random_bool(probability) } } struct JobSpec { pbkdf2: Pbkdf2, endpoint: EndpointIdInt, } impl Future for JobSpec { type Output = pbkdf2::Block; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { STATE.with_borrow_mut(|state| { let state = state.as_mut().expect("should be set on thread startup"); state.tick = state.tick.wrapping_add(1); if state.tick.is_multiple_of(SKETCH_RESET_INTERVAL) { state.countmin.reset(); } if state.should_run(&self) { if let Some(pool) = state.pool.upgrade() { pool.metrics.worker_task_turns_total.inc(state.id); } match self.pbkdf2.turn() { Poll::Ready(result) => Poll::Ready(result), // more to do, we shall requeue Poll::Pending => { cx.waker().wake_by_ref(); Poll::Pending } } } else { if let Some(pool) = state.pool.upgrade() { pool.metrics.worker_task_skips_total.inc(state.id); } cx.waker().wake_by_ref(); Poll::Pending } }) } } pub(crate) struct JobHandle(tokio::task::JoinHandle); impl Future for JobHandle { type Output = pbkdf2::Block; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { match self.0.poll_unpin(cx) { Poll::Ready(Ok(ok)) => Poll::Ready(ok), Poll::Ready(Err(err)) => std::panic::resume_unwind(err.into_panic()), Poll::Pending => Poll::Pending, } } } impl Drop for JobHandle { fn drop(&mut self) { self.0.abort(); } } #[cfg(test)] mod tests { use super::*; use crate::types::EndpointId; #[tokio::test] async fn hash_is_correct() { let pool = ThreadPool::new(1); let ep = EndpointId::from("foo"); let ep = EndpointIdInt::from(ep); let salt = [0x55; 32]; let actual = pool .spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096)) .await; let expected = &[ 10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242, 178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140, ]; assert_eq!(actual.as_slice(), expected); } } ================================================ FILE: proxy/src/serverless/backend.rs 
================================================ use std::sync::Arc; use std::time::Duration; use ed25519_dalek::SigningKey; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; use jose_jwk::jose_b64; use postgres_client::error::SqlState; use postgres_client::maybe_tls_stream::MaybeTlsStream; use rand_core::OsRng; use tracing::field::display; use tracing::{debug, info}; use super::AsyncRW; use super::conn_pool::poll_client; use super::conn_pool_lib::{Client, ConnInfo, EndpointConnPool, GlobalConnPool}; use super::http_conn_pool::{self, HttpConnPool, LocalProxyClient, poll_http2_client}; use super::local_conn_pool::{self, EXT_NAME, EXT_SCHEMA, EXT_VERSION, LocalConnPool}; use crate::auth::backend::local::StaticAuthRules; use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; use crate::auth::{self, AuthError}; use crate::compute; use crate::compute_ctl::{ ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest, }; use crate::config::ProxyConfig; use crate::context::RequestContext; use crate::control_plane::client::ApiLockError; use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::intern::{EndpointIdInt, RoleNameInt}; use crate::pqproto::StartupMessageParams; use crate::proxy::{connect_auth, connect_compute}; use crate::rate_limiter::EndpointRateLimiter; use crate::types::{EndpointId, LOCAL_PROXY_SUFFIX}; pub(crate) struct PoolingBackend { pub(crate) http_conn_pool: Arc>>, pub(crate) local_pool: Arc>, pub(crate) pool: Arc>>, pub(crate) config: &'static ProxyConfig, pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>, pub(crate) endpoint_rate_limiter: Arc, } impl PoolingBackend { pub(crate) async fn authenticate_with_password( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, password: &[u8], ) -> Result { ctx.set_auth_method(crate::context::AuthMethod::Cleartext); let user_info = user_info.clone(); let backend = 
self.auth_backend.as_ref().map(|()| user_info.clone()); let access_control = backend.get_endpoint_access_control(ctx).await?; access_control.check( ctx, self.config.authentication_config.ip_allowlist_check_enabled, self.config.authentication_config.is_vpc_acccess_proxy, )?; access_control.connection_attempt_rate_limit( ctx, &user_info.endpoint, &self.endpoint_rate_limiter, )?; let role_access = backend.get_role_secret(ctx).await?; let Some(secret) = role_access.secret else { // If we don't have an authentication secret, for the http flow we can just return an error. info!("authentication info not found"); return Err(AuthError::password_failed(&*user_info.user)); }; let ep = EndpointIdInt::from(&user_info.endpoint); let role = RoleNameInt::from(&user_info.user); let auth_outcome = crate::auth::validate_password_and_exchange( &self.config.authentication_config.scram_thread_pool, ep, role, password, secret, ) .await?; let res = match auth_outcome { crate::sasl::Outcome::Success(key) => { info!("user successfully authenticated"); Ok(key) } crate::sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); Err(AuthError::password_failed(&*user_info.user)) } }; res.map(|key| ComputeCredentials { info: user_info, keys: key, }) } pub(crate) async fn authenticate_with_jwt( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, jwt: String, ) -> Result { ctx.set_auth_method(crate::context::AuthMethod::Jwt); match &self.auth_backend { crate::auth::Backend::ControlPlane(console, ()) => { let keys = self .config .authentication_config .jwks_cache .check_jwt( ctx, user_info.endpoint.clone(), &user_info.user, &**console, &jwt, ) .await?; Ok(ComputeCredentials { info: user_info.clone(), keys, }) } crate::auth::Backend::Local(_) => { let keys = self .config .authentication_config .jwks_cache .check_jwt( ctx, user_info.endpoint.clone(), &user_info.user, &StaticAuthRules, &jwt, ) .await?; Ok(ComputeCredentials { info: user_info.clone(), keys, }) } } } 
// Wake up the destination if needed. Code here is a bit involved because // we reuse the code from the usual proxy and we need to prepare few structures // that this code expects. #[tracing::instrument(skip_all, fields( pid = tracing::field::Empty, compute_id = tracing::field::Empty, conn_id = tracing::field::Empty, ))] pub(crate) async fn connect_to_compute( &self, ctx: &RequestContext, conn_info: ConnInfo, keys: ComputeCredentials, force_new: bool, ) -> Result, HttpConnError> { let maybe_client = if force_new { debug!("pool: pool is disabled"); None } else { debug!("pool: looking for an existing connection"); self.pool.get(ctx, &conn_info)? }; if let Some(client) = maybe_client { return Ok(client); } let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "pool: opening a new connection '{conn_info}'"); let backend = self.auth_backend.as_ref().map(|()| keys.info); let mut params = StartupMessageParams::default(); params.insert("database", &conn_info.dbname); params.insert("user", &conn_info.user_info.user); let mut auth_info = compute::AuthInfo::with_auth_keys(keys.keys); auth_info.set_startup_params(¶ms, true); let node = connect_auth::connect_to_compute_and_auth( ctx, self.config, &backend, auth_info, connect_compute::TlsNegotiation::Postgres, ) .await?; let (client, connection) = postgres_client::connect::managed( node.stream, Some(node.socket_addr.ip()), postgres_client::config::Host::Tcp(node.hostname.to_string()), node.socket_addr.port(), node.ssl_mode, Some(self.config.connect_to_compute.timeout), ) .await?; Ok(poll_client( self.pool.clone(), ctx, conn_info, client, connection, conn_id, node.aux, )) } // Wake up the destination if needed #[tracing::instrument(skip_all, fields( compute_id = tracing::field::Empty, conn_id = tracing::field::Empty, ))] pub(crate) async fn connect_to_local_proxy( &self, ctx: &RequestContext, conn_info: ConnInfo, ) -> Result, HttpConnError> { debug!("pool: looking for an 
existing connection"); if let Ok(Some(client)) = self.http_conn_pool.get(ctx, &conn_info) { return Ok(client); } let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); debug!(%conn_id, "pool: opening a new connection '{conn_info}'"); let backend = self.auth_backend.as_ref().map(|()| ComputeUserInfo { user: conn_info.user_info.user.clone(), endpoint: EndpointId::from(format!( "{}{LOCAL_PROXY_SUFFIX}", conn_info.user_info.endpoint.normalize() )), options: conn_info.user_info.options.clone(), }); let node = connect_compute::connect_to_compute( ctx, self.config, &backend, connect_compute::TlsNegotiation::Direct, ) .await?; let stream = match node.stream.into_framed().into_inner() { MaybeTlsStream::Raw(s) => Box::pin(s) as AsyncRW, MaybeTlsStream::Tls(s) => Box::pin(s) as AsyncRW, }; let (client, connection) = hyper::client::conn::http2::Builder::new(TokioExecutor::new()) .timer(TokioTimer::new()) .keep_alive_interval(Duration::from_secs(20)) .keep_alive_while_idle(true) .keep_alive_timeout(Duration::from_secs(5)) .handshake(TokioIo::new(stream)) .await .map_err(LocalProxyConnError::H2)?; Ok(poll_http2_client( self.http_conn_pool.clone(), ctx, &conn_info, client, connection, conn_id, node.aux.clone(), )) } /// Connect to postgres over localhost. /// /// We expect postgres to be started here, so we won't do any retries. /// /// # Panics /// /// Panics if called with a non-local_proxy backend. #[tracing::instrument(skip_all, fields( pid = tracing::field::Empty, conn_id = tracing::field::Empty, ))] pub(crate) async fn connect_to_local_postgres( &self, ctx: &RequestContext, conn_info: ConnInfo, disable_pg_session_jwt: bool, ) -> Result, HttpConnError> { if let Some(client) = self.local_pool.get(ctx, &conn_info)? 
{ return Ok(client); } let local_backend = match &self.auth_backend { auth::Backend::ControlPlane(_, ()) => { unreachable!("only local_proxy can connect to local postgres") } auth::Backend::Local(local) => local, }; if !self.local_pool.initialized(&conn_info) { // only install and grant usage one at a time. let _permit = local_backend .initialize .acquire() .await .expect("semaphore should never be closed"); // check again for race if !self.local_pool.initialized(&conn_info) && !disable_pg_session_jwt { local_backend .compute_ctl .install_extension(&ExtensionInstallRequest { extension: EXT_NAME, database: conn_info.dbname.clone(), version: EXT_VERSION, }) .await?; local_backend .compute_ctl .grant_role(&SetRoleGrantsRequest { schema: EXT_SCHEMA, privileges: vec![Privilege::Usage], database: conn_info.dbname.clone(), role: conn_info.user_info.user.clone(), }) .await?; self.local_pool.set_initialized(&conn_info); } } let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "local_pool: opening a new connection '{conn_info}'"); let (key, jwk) = create_random_jwk(); let mut config = local_backend .node_info .conn_info .to_postgres_client_config(); config .user(&conn_info.user_info.user) .dbname(&conn_info.dbname); if !disable_pg_session_jwt { config.set_param( "options", &format!( "-c pg_session_jwt.jwk={}", serde_json::to_string(&jwk).expect("serializing jwk to json should not fail") ), ); } let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(&postgres_client::NoTls).await?; drop(pause); let pid = client.get_process_id(); tracing::Span::current().record("pid", pid); let mut handle = local_conn_pool::poll_client( self.local_pool.clone(), ctx, conn_info, client, connection, key, conn_id, local_backend.node_info.aux.clone(), ); { let (client, mut discard) = handle.inner(); debug!("setting up backend session state"); // initiates the auth session if 
!disable_pg_session_jwt && let Err(e) = client.batch_execute("select auth.init();").await { discard.discard(); return Err(e.into()); } info!("backend session state initialized"); } Ok(handle) } } fn create_random_jwk() -> (SigningKey, jose_jwk::Key) { let key = SigningKey::generate(&mut OsRng); let jwk = jose_jwk::Key::Okp(jose_jwk::Okp { crv: jose_jwk::OkpCurves::Ed25519, x: jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()), d: None, }); (key, jwk) } #[derive(Debug, thiserror::Error)] pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent state")] ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError), #[error("could not connect to compute")] ConnectError(#[from] compute::ConnectionError), #[error("could not connect to postgres in compute")] PostgresConnectionError(#[from] postgres_client::Error), #[error("could not connect to local-proxy in compute")] LocalProxyConnectionError(#[from] LocalProxyConnError), #[error("could not parse JWT payload")] JwtPayloadError(serde_json::Error), #[error("could not install extension: {0}")] ComputeCtl(#[from] ComputeCtlError), #[error("could not get auth info")] GetAuthInfo(#[from] GetAuthInfoError), #[error("user not authenticated")] AuthError(#[from] AuthError), #[error("wake_compute returned error")] WakeCompute(#[from] WakeComputeError), #[error("error acquiring resource permit: {0}")] TooManyConnectionAttempts(#[from] ApiLockError), } impl From for HttpConnError { fn from(value: connect_auth::AuthError) -> Self { match value { connect_auth::AuthError::Auth(compute::PostgresError::Postgres(error)) => { Self::PostgresConnectionError(error) } connect_auth::AuthError::Connect(error) => Self::ConnectError(error), } } } #[derive(Debug, thiserror::Error)] pub(crate) enum LocalProxyConnError { #[error("could not establish h2 connection")] H2(#[from] hyper::Error), } impl ReportableError for HttpConnError { fn get_error_kind(&self) -> ErrorKind { match self { 
HttpConnError::ConnectError(e) => e.get_error_kind(), HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, HttpConnError::PostgresConnectionError(p) => match p.as_db_error() { // user provided a wrong database name Some(err) if err.code() == &SqlState::INVALID_CATALOG_NAME => ErrorKind::User, // postgres rejected the connection Some(_) => ErrorKind::Postgres, // couldn't even reach postgres None => ErrorKind::Compute, }, HttpConnError::LocalProxyConnectionError(_) => ErrorKind::Compute, HttpConnError::ComputeCtl(_) => ErrorKind::Service, HttpConnError::JwtPayloadError(_) => ErrorKind::User, HttpConnError::GetAuthInfo(a) => a.get_error_kind(), HttpConnError::AuthError(a) => a.get_error_kind(), HttpConnError::WakeCompute(w) => w.get_error_kind(), HttpConnError::TooManyConnectionAttempts(w) => w.get_error_kind(), } } } impl UserFacingError for HttpConnError { fn to_string_client(&self) -> String { match self { HttpConnError::ConnectError(p) => p.to_string_client(), HttpConnError::ConnectionClosedAbruptly(_) => self.to_string(), HttpConnError::PostgresConnectionError(p) => p.to_string(), HttpConnError::LocalProxyConnectionError(p) => p.to_string(), HttpConnError::ComputeCtl(_) => "could not set up the JWT authorization database extension".to_string(), HttpConnError::JwtPayloadError(p) => p.to_string(), HttpConnError::GetAuthInfo(c) => c.to_string_client(), HttpConnError::AuthError(c) => c.to_string_client(), HttpConnError::WakeCompute(c) => c.to_string_client(), HttpConnError::TooManyConnectionAttempts(_) => { "Failed to acquire permit to connect to the database. 
Too many database connection attempts are currently ongoing.".to_owned() } } } } impl ReportableError for LocalProxyConnError { fn get_error_kind(&self) -> ErrorKind { match self { LocalProxyConnError::H2(_) => ErrorKind::Compute, } } } impl UserFacingError for LocalProxyConnError { fn to_string_client(&self) -> String { "Could not establish HTTP connection to the database".to_string() } } ================================================ FILE: proxy/src/serverless/cancel_set.rs ================================================ //! A set for cancelling random http connections use std::hash::{BuildHasher, BuildHasherDefault}; use std::num::NonZeroUsize; use std::time::Duration; use indexmap::IndexMap; use parking_lot::Mutex; use rand::distr::uniform::{UniformSampler, UniformUsize}; use rustc_hash::FxHasher; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use uuid::Uuid; type Hasher = BuildHasherDefault; pub struct CancelSet { shards: Box<[Mutex]>, // keyed by random uuid, fxhasher is fine hasher: Hasher, } pub(crate) struct CancelShard { tokens: IndexMap, } impl CancelSet { pub fn new(shards: usize) -> Self { CancelSet { shards: (0..shards) .map(|_| { Mutex::new(CancelShard { tokens: IndexMap::with_hasher(Hasher::default()), }) }) .collect(), hasher: Hasher::default(), } } pub(crate) fn take(&self) -> Option { let dist = UniformUsize::new_inclusive(0, usize::MAX).expect("valid bounds"); for _ in 0..4 { if let Some(token) = self.take_raw(dist.sample(&mut rand::rng())) { return Some(token); } tracing::trace!("failed to get cancel token"); } None } fn take_raw(&self, rng: usize) -> Option { NonZeroUsize::new(self.shards.len()) .and_then(|len| self.shards[rng % len].lock().take(rng / len)) } pub(crate) fn insert(&self, id: uuid::Uuid, token: CancellationToken) -> CancelGuard<'_> { let shard = NonZeroUsize::new(self.shards.len()).map(|len| { let hash = self.hasher.hash_one(id) as usize; let shard = &self.shards[hash % len]; shard.lock().insert(id, 
token); shard }); CancelGuard { shard, id } } } impl CancelShard { fn take(&mut self, rng: usize) -> Option { NonZeroUsize::new(self.tokens.len()).and_then(|len| { // 10 second grace period so we don't cancel new connections if self.tokens.get_index(rng % len)?.1.0.elapsed() < Duration::from_secs(10) { return None; } let (_key, (_insert, token)) = self.tokens.swap_remove_index(rng % len)?; Some(token) }) } fn remove(&mut self, id: uuid::Uuid) { self.tokens.swap_remove(&id); } fn insert(&mut self, id: uuid::Uuid, token: CancellationToken) { self.tokens.insert(id, (Instant::now(), token)); } } pub(crate) struct CancelGuard<'a> { shard: Option<&'a Mutex>, id: Uuid, } impl Drop for CancelGuard<'_> { fn drop(&mut self) { if let Some(shard) = self.shard { shard.lock().remove(self.id); } } } ================================================ FILE: proxy/src/serverless/conn_pool.rs ================================================ use std::fmt; use std::pin::pin; use std::sync::{Arc, Weak}; use std::task::{Poll, ready}; use futures::future::poll_fn; use futures::{Future, FutureExt}; use postgres_client::tls::MakeTlsConnect; use smallvec::SmallVec; use tokio::net::TcpStream; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span}; #[cfg(test)] use { super::conn_pool_lib::GlobalConnPoolOptions, crate::auth::backend::ComputeUserInfo, std::{sync::atomic, time::Duration}, }; use super::conn_pool_lib::{ Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, EndpointConnPool, GlobalConnPool, }; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::messages::MetricsAuxInfo; use crate::metrics::Metrics; type TlsStream = >::Stream; #[derive(Debug, Clone)] pub(crate) struct ConnInfoWithAuth { pub(crate) conn_info: ConnInfo, pub(crate) auth: AuthData, } #[derive(Debug, Clone)] pub(crate) enum AuthData { Password(SmallVec<[u8; 16]>), Jwt(String), } impl fmt::Display for ConnInfo { // 
use custom display to avoid logging password fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "{}@{}/{}?{}", self.user_info.user, self.user_info.endpoint, self.dbname, self.user_info.options.get_cache_key("") ) } } pub(crate) fn poll_client( global_pool: Arc>>, ctx: &RequestContext, conn_info: ConnInfo, client: C, mut connection: postgres_client::Connection, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol()); let mut session_id = ctx.session_id(); let (tx, mut rx) = tokio::sync::watch::channel(session_id); let span = info_span!(parent: None, "connection", %conn_id); let cold_start_info = ctx.cold_start_info(); span.in_scope(|| { info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); }); let pool = match conn_info.endpoint_cache_key() { Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)), None => Weak::new(), }; let pool_clone = pool.clone(); let db_user = conn_info.db_and_user(); let idle = global_pool.get_idle_timeout(); let cancel = CancellationToken::new(); let cancelled = cancel.clone().cancelled_owned(); tokio::spawn(async move { let _conn_gauge = conn_gauge; let mut idle_timeout = pin!(tokio::time::sleep(idle)); let mut cancelled = pin!(cancelled); poll_fn(move |cx| { let _instrument = span.enter(); if cancelled.as_mut().poll(cx).is_ready() { info!("connection dropped"); return Poll::Ready(()); } match rx.has_changed() { Ok(true) => { session_id = *rx.borrow_and_update(); info!(%session_id, "changed session"); idle_timeout.as_mut().reset(Instant::now() + idle); } Err(_) => { info!("connection dropped"); return Poll::Ready(()); } _ => {} } // 5 minute idle connection timeout if idle_timeout.as_mut().poll(cx).is_ready() { idle_timeout.as_mut().reset(Instant::now() + idle); info!("connection idle"); if let Some(pool) = pool.clone().upgrade() { // remove client from pool - should close the 
connection if it's idle. // does nothing if the client is currently checked-out and in-use if pool.write().remove_client(db_user.clone(), conn_id) { info!("idle connection removed"); } } } match ready!(connection.poll_unpin(cx)) { Err(e) => error!(%session_id, "connection error: {}", e), Ok(()) => info!("connection closed"), } // remove from connection pool if let Some(pool) = pool.clone().upgrade() && pool.write().remove_client(db_user.clone(), conn_id) { info!("closed connection removed"); } Poll::Ready(()) }) .await; }); let inner = ClientInnerCommon { inner: client, aux, conn_id, data: ClientDataEnum::Remote(ClientDataRemote { session: tx, cancel, }), }; Client::new(inner, conn_info, pool_clone) } #[derive(Clone)] pub(crate) struct ClientDataRemote { session: tokio::sync::watch::Sender, cancel: CancellationToken, } impl ClientDataRemote { pub fn session(&mut self) -> &mut tokio::sync::watch::Sender { &mut self.session } pub fn cancel(&mut self) { self.cancel.cancel(); } } #[cfg(test)] mod tests { use std::sync::atomic::AtomicBool; use super::*; use crate::proxy::NeonOptions; use crate::serverless::cancel_set::CancelSet; use crate::types::{BranchId, EndpointId, ProjectId}; struct MockClient(Arc); impl MockClient { fn new(is_closed: bool) -> Self { MockClient(Arc::new(is_closed.into())) } } impl ClientInnerExt for MockClient { fn is_closed(&self) -> bool { self.0.load(atomic::Ordering::Relaxed) } fn get_process_id(&self) -> i32 { 0 } fn reset(&mut self) -> Result<(), postgres_client::Error> { Ok(()) } } fn create_inner() -> ClientInnerCommon { create_inner_with(MockClient::new(false)) } fn create_inner_with(client: MockClient) -> ClientInnerCommon { ClientInnerCommon { inner: client, aux: MetricsAuxInfo { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), branch_id: (&BranchId::from("branch")).into(), compute_id: "compute".into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, conn_id: 
uuid::Uuid::new_v4(), data: ClientDataEnum::Remote(ClientDataRemote { session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), cancel: CancellationToken::new(), }), } } #[tokio::test] async fn test_pool() { let _ = env_logger::try_init(); let config = Box::leak(Box::new(crate::config::HttpConfig { accept_websockets: false, pool_options: GlobalConnPoolOptions { max_conns_per_endpoint: 2, gc_epoch: Duration::from_secs(1), pool_shards: 2, idle_timeout: Duration::from_secs(1), opt_in: false, max_total_conns: 3, }, cancel_set: CancelSet::new(0), client_conn_threshold: u64::MAX, max_request_size_bytes: usize::MAX, max_response_size_bytes: usize::MAX, })); let pool = GlobalConnPool::new(config); let conn_info = ConnInfo { user_info: ComputeUserInfo { user: "user".into(), endpoint: "endpoint".into(), options: NeonOptions::default(), }, dbname: "dbname".into(), }; let ep_pool = Arc::downgrade( &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), ); { let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); assert_eq!(0, pool.get_global_connections_count()); client.inner().1.discard(); // Discard should not add the connection from the pool. assert_eq!(0, pool.get_global_connections_count()); } { let client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); drop(client); assert_eq!(1, pool.get_global_connections_count()); } { let closed_client = Client::new( create_inner_with(MockClient::new(true)), conn_info.clone(), ep_pool.clone(), ); drop(closed_client); assert_eq!(1, pool.get_global_connections_count()); } let is_closed: Arc = Arc::new(false.into()); { let client = Client::new( create_inner_with(MockClient(is_closed.clone())), conn_info.clone(), ep_pool.clone(), ); drop(client); // The client should be added to the pool. assert_eq!(2, pool.get_global_connections_count()); } { let client = Client::new(create_inner(), conn_info, ep_pool); drop(client); // The client shouldn't be added to the pool. 
Because the ep-pool is full. assert_eq!(2, pool.get_global_connections_count()); } let conn_info = ConnInfo { user_info: ComputeUserInfo { user: "user".into(), endpoint: "endpoint-2".into(), options: NeonOptions::default(), }, dbname: "dbname".into(), }; let ep_pool = Arc::downgrade( &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), ); { let client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); drop(client); assert_eq!(3, pool.get_global_connections_count()); } { let client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); drop(client); // The client shouldn't be added to the pool. Because the global pool is full. assert_eq!(3, pool.get_global_connections_count()); } is_closed.store(true, atomic::Ordering::Relaxed); // Do gc for all shards. pool.gc(0); pool.gc(1); // Closed client should be removed from the pool. assert_eq!(2, pool.get_global_connections_count()); } } ================================================ FILE: proxy/src/serverless/conn_pool_lib.rs ================================================ use std::collections::HashMap; use std::marker::PhantomData; use std::ops::Deref; use std::sync::atomic::{self, AtomicUsize}; use std::sync::{Arc, Weak}; use std::time::Duration; use clashmap::ClashMap; use parking_lot::RwLock; use rand::Rng; use smol_str::ToSmolStr; use tracing::{Span, debug, info, warn}; use super::backend::HttpConnError; use super::conn_pool::ClientDataRemote; use super::http_conn_pool::ClientDataHttp; use super::local_conn_pool::ClientDataLocal; use crate::auth::backend::ComputeUserInfo; use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::protocol2::ConnectionInfoExtra; use crate::types::{DbName, EndpointCacheKey, RoleName}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; #[derive(Debug, Clone)] pub(crate) struct ConnInfo { pub(crate) user_info: 
ComputeUserInfo, pub(crate) dbname: DbName, } impl ConnInfo { // hm, change to hasher to avoid cloning? pub(crate) fn db_and_user(&self) -> (DbName, RoleName) { (self.dbname.clone(), self.user_info.user.clone()) } pub(crate) fn endpoint_cache_key(&self) -> Option { // We don't want to cache http connections for ephemeral endpoints. if self.user_info.options.is_ephemeral() { None } else { Some(self.user_info.endpoint_cache_key()) } } } #[derive(Clone)] #[allow(clippy::large_enum_variant, reason = "TODO")] pub(crate) enum ClientDataEnum { Remote(ClientDataRemote), Local(ClientDataLocal), Http(ClientDataHttp), } #[derive(Clone)] pub(crate) struct ClientInnerCommon { pub(crate) inner: C, pub(crate) aux: MetricsAuxInfo, pub(crate) conn_id: uuid::Uuid, pub(crate) data: ClientDataEnum, // custom client data like session, key, jti } impl Drop for ClientInnerCommon { fn drop(&mut self) { match &mut self.data { ClientDataEnum::Remote(remote_data) => { remote_data.cancel(); } ClientDataEnum::Local(local_data) => { local_data.cancel(); } ClientDataEnum::Http(_http_data) => (), } } } impl ClientInnerCommon { pub(crate) fn get_conn_id(&self) -> uuid::Uuid { self.conn_id } pub(crate) fn get_data(&mut self) -> &mut ClientDataEnum { &mut self.data } } pub(crate) struct ConnPoolEntry { pub(crate) conn: ClientInnerCommon, pub(crate) _last_access: std::time::Instant, } // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool // Number of open connections is limited by the `max_conns_per_endpoint`. 
pub(crate) struct EndpointConnPool { pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, /// max # connections per endpoint max_conns: usize, _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, global_pool_size_max_conns: usize, pool_name: String, } impl EndpointConnPool { pub(crate) fn new( hmap: HashMap<(DbName, RoleName), DbUserConnPool>, tconns: usize, max_conns_per_endpoint: usize, global_connections_count: Arc, max_total_conns: usize, pname: String, ) -> Self { Self { pools: hmap, total_conns: tconns, max_conns: max_conns_per_endpoint, _guard: Metrics::get().proxy.http_endpoint_pools.guard(), global_connections_count, global_pool_size_max_conns: max_total_conns, pool_name: pname, } } pub(crate) fn get_conn_entry( &mut self, db_user: (DbName, RoleName), ) -> Option> { let Self { pools, total_conns, global_connections_count, .. } = self; pools.get_mut(&db_user).and_then(|pool_entries| { let (entry, removed) = pool_entries.get_conn_entry(total_conns); global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); entry }) } pub(crate) fn remove_client( &mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid, ) -> bool { let Self { pools, total_conns, global_connections_count, .. 
} = self; if let Some(pool) = pools.get_mut(&db_user) { let old_len = pool.get_conns().len(); pool.get_conns() .retain(|conn| conn.conn.get_conn_id() != conn_id); let new_len = pool.get_conns().len(); let removed = old_len - new_len; if removed > 0 { global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); Metrics::get() .proxy .http_pool_opened_connections .get_metric() .dec_by(removed as i64); } *total_conns -= removed; removed > 0 } else { false } } pub(crate) fn get_name(&self) -> &str { &self.pool_name } pub(crate) fn get_pool(&self, db_user: (DbName, RoleName)) -> Option<&DbUserConnPool> { self.pools.get(&db_user) } pub(crate) fn get_pool_mut( &mut self, db_user: (DbName, RoleName), ) -> Option<&mut DbUserConnPool> { self.pools.get_mut(&db_user) } pub(crate) fn put(pool: &RwLock, conn_info: &ConnInfo, mut client: ClientInnerCommon) { let conn_id = client.get_conn_id(); let (max_conn, conn_count, pool_name) = { let pool = pool.read(); ( pool.global_pool_size_max_conns, pool.global_connections_count .load(atomic::Ordering::Relaxed), pool.get_name().to_string(), ) }; if client.inner.is_closed() { info!(%conn_id, "{pool_name}: throwing away connection '{conn_info}' because connection is closed"); return; } if let Err(error) = client.inner.reset() { warn!(?error, %conn_id, "{pool_name}: throwing away connection '{conn_info}' because connection could not be reset"); return; } if conn_count >= max_conn { info!(%conn_id, "{pool_name}: throwing away connection '{conn_info}' because pool is full"); return; } // return connection to the pool let mut returned = false; let mut per_db_size = 0; let total_conns = { let mut pool = pool.write(); if pool.total_conns < pool.max_conns { let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default(); pool_entries.get_conns().push(ConnPoolEntry { conn: client, _last_access: std::time::Instant::now(), }); returned = true; per_db_size = pool_entries.get_conns().len(); pool.total_conns += 1; 
pool.global_connections_count .fetch_add(1, atomic::Ordering::Relaxed); Metrics::get() .proxy .http_pool_opened_connections .get_metric() .inc(); } pool.total_conns }; // do logging outside of the mutex if returned { debug!(%conn_id, "{pool_name}: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); } else { info!(%conn_id, "{pool_name}: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); } } } impl Drop for EndpointConnPool { fn drop(&mut self) { if self.total_conns > 0 { self.global_connections_count .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); Metrics::get() .proxy .http_pool_opened_connections .get_metric() .dec_by(self.total_conns as i64); } } } pub(crate) struct DbUserConnPool { pub(crate) conns: Vec>, pub(crate) initialized: Option, // a bit ugly, exists only for local pools } impl Default for DbUserConnPool { fn default() -> Self { Self { conns: Vec::new(), initialized: None, } } } pub(crate) trait DbUserConn: Default { fn set_initialized(&mut self); fn is_initialized(&self) -> bool; fn clear_closed_clients(&mut self, conns: &mut usize) -> usize; fn get_conn_entry(&mut self, conns: &mut usize) -> (Option>, usize); fn get_conns(&mut self) -> &mut Vec>; } impl DbUserConn for DbUserConnPool { fn set_initialized(&mut self) { self.initialized = Some(true); } fn is_initialized(&self) -> bool { self.initialized.unwrap_or(false) } fn clear_closed_clients(&mut self, conns: &mut usize) -> usize { let old_len = self.conns.len(); self.conns.retain(|conn| !conn.conn.inner.is_closed()); let new_len = self.conns.len(); let removed = old_len - new_len; *conns -= removed; removed } fn get_conn_entry(&mut self, conns: &mut usize) -> (Option>, usize) { let mut removed = self.clear_closed_clients(conns); let conn = self.conns.pop(); if conn.is_some() { *conns -= 1; removed += 1; } Metrics::get() .proxy .http_pool_opened_connections .get_metric() .dec_by(removed as 
i64); (conn, removed) } fn get_conns(&mut self) -> &mut Vec> { &mut self.conns } } pub(crate) trait EndpointConnPoolExt { fn clear_closed(&mut self) -> usize; fn total_conns(&self) -> usize; } impl EndpointConnPoolExt for EndpointConnPool { fn clear_closed(&mut self) -> usize { let mut clients_removed: usize = 0; for db_pool in self.pools.values_mut() { clients_removed += db_pool.clear_closed_clients(&mut self.total_conns); } clients_removed } fn total_conns(&self) -> usize { self.total_conns } } pub(crate) struct GlobalConnPool where C: ClientInnerExt, P: EndpointConnPoolExt, { // endpoint -> per-endpoint connection pool // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. pub(crate) global_pool: ClashMap>>, /// Number of endpoint-connection pools /// /// [`ClashMap::len`] iterates over all inner pools and acquires a read lock on each. /// That seems like far too much effort, so we're using a relaxed increment counter instead. /// It's only used for diagnostics. pub(crate) global_pool_size: AtomicUsize, /// Total number of connections in the pool pub(crate) global_connections_count: Arc, pub(crate) config: &'static crate::config::HttpConfig, _marker: PhantomData, } #[derive(Debug, Clone, Copy)] pub struct GlobalConnPoolOptions { // Maximum number of connections per one endpoint. // Can mix different (dbname, username) connections. // When running out of free slots for a particular endpoint, // falls back to opening a new connection for each request. pub max_conns_per_endpoint: usize, pub gc_epoch: Duration, pub pool_shards: usize, pub idle_timeout: Duration, pub opt_in: bool, // Total number of connections in the pool. 
pub max_total_conns: usize, } impl GlobalConnPool where C: ClientInnerExt, P: EndpointConnPoolExt, { pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { let shards = config.pool_options.pool_shards; Arc::new(Self { global_pool: ClashMap::with_shard_amount(shards), global_pool_size: AtomicUsize::new(0), config, global_connections_count: Arc::new(AtomicUsize::new(0)), _marker: PhantomData, }) } #[cfg(test)] pub(crate) fn get_global_connections_count(&self) -> usize { self.global_connections_count .load(atomic::Ordering::Relaxed) } pub(crate) fn get_idle_timeout(&self) -> Duration { self.config.pool_options.idle_timeout } pub(crate) fn shutdown(&self) { // drops all strong references to endpoint-pools self.global_pool.clear(); } pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { let epoch = self.config.pool_options.gc_epoch; let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); loop { interval.tick().await; let shard = rng.random_range(0..self.global_pool.shards().len()); self.gc(shard); } } pub(crate) fn gc(&self, shard: usize) { debug!(shard, "pool: performing epoch reclamation"); // acquire a random shard lock let mut shard = self.global_pool.shards()[shard].write(); let timer = Metrics::get() .proxy .http_pool_reclaimation_lag_seconds .start_timer(); let current_len = shard.len(); let mut clients_removed = 0; shard.retain(|(endpoint, x)| { // if the current endpoint pool is unique (no other strong or weak references) // then it is currently not in use by any connections. if let Some(pool) = Arc::get_mut(x) { let endpoints = pool.get_mut(); clients_removed = endpoints.clear_closed(); if endpoints.total_conns() == 0 { info!("pool: discarding pool for endpoint {endpoint}"); return false; } } true }); let new_len = shard.len(); drop(shard); timer.observe(); // Do logging outside of the lock. 
if clients_removed > 0 { let size = self .global_connections_count .fetch_sub(clients_removed, atomic::Ordering::Relaxed) - clients_removed; Metrics::get() .proxy .http_pool_opened_connections .get_metric() .dec_by(clients_removed as i64); info!( "pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}" ); } let removed = current_len - new_len; if removed > 0 { let global_pool_size = self .global_pool_size .fetch_sub(removed, atomic::Ordering::Relaxed) - removed; info!("pool: performed global pool gc. size now {global_pool_size}"); } } } impl GlobalConnPool> { pub(crate) fn get( self: &Arc, ctx: &RequestContext, conn_info: &ConnInfo, ) -> Result>, HttpConnError> { let mut client: Option> = None; let Some(endpoint) = conn_info.endpoint_cache_key() else { return Ok(None); }; let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); if let Some(entry) = endpoint_pool .write() .get_conn_entry(conn_info.db_and_user()) { client = Some(entry.conn); } let endpoint_pool = Arc::downgrade(&endpoint_pool); // ok return cached connection if found and establish a new one otherwise if let Some(mut client) = client { if client.inner.is_closed() { info!("pool: cached connection '{conn_info}' is closed, opening a new one"); return Ok(None); } tracing::Span::current() .record("conn_id", tracing::field::display(client.get_conn_id())); tracing::Span::current().record( "pid", tracing::field::display(client.inner.get_process_id()), ); debug!( cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), "pool: reusing connection '{conn_info}'" ); match client.get_data() { ClientDataEnum::Local(data) => { data.session().send(ctx.session_id())?; } ClientDataEnum::Remote(data) => { data.session().send(ctx.session_id())?; } ClientDataEnum::Http(_) => (), } ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); ctx.success(); return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); } Ok(None) } pub(crate) fn 
get_or_create_endpoint_pool( self: &Arc, endpoint: &EndpointCacheKey, ) -> Arc>> { // fast path if let Some(pool) = self.global_pool.get(endpoint) { return pool.clone(); } // slow path let new_pool = Arc::new(RwLock::new(EndpointConnPool { pools: HashMap::new(), total_conns: 0, max_conns: self.config.pool_options.max_conns_per_endpoint, _guard: Metrics::get().proxy.http_endpoint_pools.guard(), global_connections_count: self.global_connections_count.clone(), global_pool_size_max_conns: self.config.pool_options.max_total_conns, pool_name: String::from("remote"), })); // find or create a pool for this endpoint let mut created = false; let pool = self .global_pool .entry(endpoint.clone()) .or_insert_with(|| { created = true; new_pool }) .clone(); // log new global pool size if created { let global_pool_size = self .global_pool_size .fetch_add(1, atomic::Ordering::Relaxed) + 1; info!( "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}" ); } pool } } pub(crate) struct Client { span: Span, inner: Option>, conn_info: ConnInfo, pool: Weak>>, } pub(crate) struct Discard<'a, C: ClientInnerExt> { conn_info: &'a ConnInfo, pool: &'a mut Weak>>, } impl Client { pub(crate) fn new( inner: ClientInnerCommon, conn_info: ConnInfo, pool: Weak>>, ) -> Self { Self { inner: Some(inner), span: Span::current(), conn_info, pool, } } pub(crate) fn client_inner(&mut self) -> (&mut ClientInnerCommon, Discard<'_, C>) { let Self { inner, pool, conn_info, span: _, } = self; let inner_m = inner.as_mut().expect("client inner should not be removed"); (inner_m, Discard { conn_info, pool }) } pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) { let Self { inner, pool, conn_info, span: _, } = self; let inner = inner.as_mut().expect("client inner should not be removed"); (&mut inner.inner, Discard { conn_info, pool }) } pub(crate) fn metrics(&self, ctx: &RequestContext) -> Arc { let aux = &self .inner .as_ref() .expect("client inner should not be removed") .aux; let 
private_link_id = match ctx.extra() { None => None, Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()), }; USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, private_link_id, }) } } impl Drop for Client { fn drop(&mut self) { let conn_info = self.conn_info.clone(); let client = self .inner .take() .expect("client inner should not be removed"); if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { let _current_span = self.span.enter(); // return connection to the pool EndpointConnPool::put(&conn_pool, &conn_info, client); } } } impl Deref for Client { type Target = C; fn deref(&self) -> &Self::Target { &self .inner .as_ref() .expect("client inner should not be removed") .inner } } pub(crate) trait ClientInnerExt: Sync + Send + 'static { fn is_closed(&self) -> bool; fn get_process_id(&self) -> i32; fn reset(&mut self) -> Result<(), postgres_client::Error>; } impl ClientInnerExt for postgres_client::Client { fn is_closed(&self) -> bool { self.is_closed() } fn get_process_id(&self) -> i32 { self.get_process_id() } fn reset(&mut self) -> Result<(), postgres_client::Error> { self.reset_session_background() } } impl Discard<'_, C> { pub(crate) fn discard(&mut self) { let conn_info = &self.conn_info; if std::mem::take(self.pool).strong_count() > 0 { info!( "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state" ); } } } ================================================ FILE: proxy/src/serverless/error.rs ================================================ use http::StatusCode; use http::header::HeaderName; use crate::auth::ComputeUserInfoParseError; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::http::ReadBodyError; pub trait HttpCodeError { fn get_http_status_code(&self) -> StatusCode; } #[derive(Debug, thiserror::Error)] pub(crate) enum ConnInfoError { 
#[error("invalid header: {0}")] InvalidHeader(&'static HeaderName), #[error("invalid connection string: {0}")] UrlParseError(#[from] url::ParseError), #[error("incorrect scheme")] IncorrectScheme, #[error("missing database name")] MissingDbName, #[error("invalid database name")] InvalidDbName, #[error("missing username")] MissingUsername, #[error("invalid username: {0}")] InvalidUsername(#[from] std::string::FromUtf8Error), #[error("missing authentication credentials: {0}")] MissingCredentials(Credentials), #[error("missing hostname")] MissingHostname, #[error("invalid hostname: {0}")] InvalidEndpoint(#[from] ComputeUserInfoParseError), } #[derive(Debug, thiserror::Error)] pub(crate) enum Credentials { #[error("required password")] Password, #[error("required authorization bearer token in JWT format")] BearerJwt, } impl ReportableError for ConnInfoError { fn get_error_kind(&self) -> ErrorKind { ErrorKind::User } } impl UserFacingError for ConnInfoError { fn to_string_client(&self) -> String { self.to_string() } } #[derive(Debug, thiserror::Error)] pub(crate) enum ReadPayloadError { #[error("could not read the HTTP request body: {0}")] Read(#[from] hyper::Error), #[error("request is too large (max is {limit} bytes)")] BodyTooLarge { limit: usize }, #[error("could not parse the HTTP request body: {0}")] Parse(#[from] serde_json::Error), } impl From> for ReadPayloadError { fn from(value: ReadBodyError) -> Self { match value { ReadBodyError::BodyTooLarge { limit } => Self::BodyTooLarge { limit }, ReadBodyError::Read(e) => Self::Read(e), } } } impl ReportableError for ReadPayloadError { fn get_error_kind(&self) -> ErrorKind { match self { ReadPayloadError::Read(_) => ErrorKind::ClientDisconnect, ReadPayloadError::BodyTooLarge { .. 
} => ErrorKind::User, ReadPayloadError::Parse(_) => ErrorKind::User, } } } impl HttpCodeError for ReadPayloadError { fn get_http_status_code(&self) -> StatusCode { match self { ReadPayloadError::Read(_) => StatusCode::BAD_REQUEST, ReadPayloadError::BodyTooLarge { .. } => StatusCode::PAYLOAD_TOO_LARGE, ReadPayloadError::Parse(_) => StatusCode::BAD_REQUEST, } } } ================================================ FILE: proxy/src/serverless/http_conn_pool.rs ================================================ use std::collections::VecDeque; use std::sync::atomic::{self, AtomicUsize}; use std::sync::{Arc, Weak}; use bytes::Bytes; use http_body_util::combinators::BoxBody; use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; use smol_str::ToSmolStr; use tracing::{Instrument, debug, error, info, info_span}; use super::AsyncRW; use super::backend::HttpConnError; use super::conn_pool_lib::{ ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, ConnPoolEntry, EndpointConnPoolExt, GlobalConnPool, }; use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::protocol2::ConnectionInfoExtra; use crate::types::EndpointCacheKey; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; pub(crate) type LocalProxyClient = http2::SendRequest>; pub(crate) type LocalProxyConnection = http2::Connection, BoxBody, TokioExecutor>; #[derive(Clone)] pub(crate) struct ClientDataHttp(); // Per-endpoint connection pool // Number of open connections is limited by the `max_conns_per_endpoint`. pub(crate) struct HttpConnPool { // TODO(conrad): // either we should open more connections depending on stream count // (not exposed by hyper, need our own counter) // or we can change this to an Option rather than a VecDeque. // // Opening more connections to the same db because we run out of streams // seems somewhat redundant though. 
// // Probably we should run a semaphore and just the single conn. TBD. conns: VecDeque>, _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, } impl HttpConnPool { fn get_conn_entry(&mut self) -> Option> { let Self { conns, .. } = self; loop { let conn = conns.pop_front()?; if !conn.conn.inner.is_closed() { let new_conn = ConnPoolEntry { conn: conn.conn.clone(), _last_access: std::time::Instant::now(), }; conns.push_back(new_conn); return Some(conn); } } } fn remove_conn(&mut self, conn_id: uuid::Uuid) -> bool { let Self { conns, global_connections_count, .. } = self; let old_len = conns.len(); conns.retain(|entry| entry.conn.conn_id != conn_id); let new_len = conns.len(); let removed = old_len - new_len; if removed > 0 { global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); Metrics::get() .proxy .http_pool_opened_connections .get_metric() .dec_by(removed as i64); } removed > 0 } } impl EndpointConnPoolExt for HttpConnPool { fn clear_closed(&mut self) -> usize { let Self { conns, .. 
} = self; let old_len = conns.len(); conns.retain(|entry| !entry.conn.inner.is_closed()); let new_len = conns.len(); old_len - new_len } fn total_conns(&self) -> usize { self.conns.len() } } impl Drop for HttpConnPool { fn drop(&mut self) { if !self.conns.is_empty() { self.global_connections_count .fetch_sub(self.conns.len(), atomic::Ordering::Relaxed); Metrics::get() .proxy .http_pool_opened_connections .get_metric() .dec_by(self.conns.len() as i64); } } } impl GlobalConnPool> { #[expect(unused_results)] pub(crate) fn get( self: &Arc, ctx: &RequestContext, conn_info: &ConnInfo, ) -> Result>, HttpConnError> { let result: Result>, HttpConnError>; let Some(endpoint) = conn_info.endpoint_cache_key() else { result = Ok(None); return result; }; let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); let Some(client) = endpoint_pool.write().get_conn_entry() else { result = Ok(None); return result; }; tracing::Span::current().record("conn_id", tracing::field::display(client.conn.conn_id)); debug!( cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), "pool: reusing connection '{conn_info}'" ); ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); ctx.success(); Ok(Some(Client::new(client.conn.clone()))) } fn get_or_create_endpoint_pool( self: &Arc, endpoint: &EndpointCacheKey, ) -> Arc>> { // fast path if let Some(pool) = self.global_pool.get(endpoint) { return pool.clone(); } // slow path let new_pool = Arc::new(RwLock::new(HttpConnPool { conns: VecDeque::new(), _guard: Metrics::get().proxy.http_endpoint_pools.guard(), global_connections_count: self.global_connections_count.clone(), })); // find or create a pool for this endpoint let mut created = false; let pool = self .global_pool .entry(endpoint.clone()) .or_insert_with(|| { created = true; new_pool }) .clone(); // log new global pool size if created { let global_pool_size = self .global_pool_size .fetch_add(1, atomic::Ordering::Relaxed) + 1; info!( "pool: created new pool for '{endpoint}', global pool size 
now {global_pool_size}" ); } pool } } pub(crate) fn poll_http2_client( global_pool: Arc>>, ctx: &RequestContext, conn_info: &ConnInfo, client: LocalProxyClient, connection: LocalProxyConnection, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol()); let session_id = ctx.session_id(); let span = info_span!(parent: None, "connection", %conn_id); let cold_start_info = ctx.cold_start_info(); span.in_scope(|| { info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); }); let pool = match conn_info.endpoint_cache_key() { Some(endpoint) => { let pool = global_pool.get_or_create_endpoint_pool(&endpoint); let client = ClientInnerCommon { inner: client.clone(), aux: aux.clone(), conn_id, data: ClientDataEnum::Http(ClientDataHttp()), }; pool.write().conns.push_back(ConnPoolEntry { conn: client, _last_access: std::time::Instant::now(), }); Metrics::get() .proxy .http_pool_opened_connections .get_metric() .inc(); Arc::downgrade(&pool) } None => Weak::new(), }; tokio::spawn( async move { let _conn_gauge = conn_gauge; let res = connection.await; match res { Ok(()) => info!("connection closed"), Err(e) => error!(%session_id, "connection error: {e:?}"), } // remove from connection pool if let Some(pool) = pool.clone().upgrade() && pool.write().remove_conn(conn_id) { info!("closed connection removed"); } } .instrument(span), ); let client = ClientInnerCommon { inner: client, aux, conn_id, data: ClientDataEnum::Http(ClientDataHttp()), }; Client::new(client) } pub(crate) struct Client { pub(crate) inner: ClientInnerCommon, } impl Client { pub(self) fn new(inner: ClientInnerCommon) -> Self { Self { inner } } pub(crate) fn metrics(&self, ctx: &RequestContext) -> Arc { let aux = &self.inner.aux; let private_link_id = match ctx.extra() { None => None, Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), Some(ConnectionInfoExtra::Azure { link_id }) => 
/// Pool-facing hooks for the HTTP/2 client used to talk to the local proxy.
impl ClientInnerExt for LocalProxyClient {
    /// Reports whether the underlying connection is closed.
    // NOTE(review): presumably this resolves to the inherent `is_closed` of the
    // underlying hyper `SendRequest` rather than recursing into this trait
    // method — confirm.
    fn is_closed(&self) -> bool {
        self.is_closed()
    }

    /// There is no backend process id for an HTTP connection; always returns `-1`.
    fn get_process_id(&self) -> i32 {
        // ideally throw something meaningful
        -1
    }

    /// No-op: always succeeds.
    fn reset(&mut self) -> Result<(), postgres_client::Error> {
        // We use HTTP/2.0 to talk to local proxy. HTTP is stateless,
        // so there's nothing to reset.
        Ok(())
    }
}
static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level"); pub(super) static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only"); pub(super) static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrable"); pub(crate) fn uuid_to_header_value(id: Uuid) -> HeaderValue { let mut uuid = [0; uuid::fmt::Hyphenated::LENGTH]; HeaderValue::from_str(id.as_hyphenated().encode_lower(&mut uuid[..])) .expect("uuid hyphenated format should be all valid header characters") } /// Like [`ApiError::into_response`] pub(crate) fn api_error_into_response(this: ApiError) -> Response> { match this { ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( format!("{err:#?}"), // use debug printing so that we give the cause StatusCode::BAD_REQUEST, ), ApiError::Forbidden(_) => { HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::FORBIDDEN) } ApiError::Unauthorized(_) => { HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::UNAUTHORIZED) } ApiError::NotFound(_) => { HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::NOT_FOUND) } ApiError::Conflict(_) => { HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::CONFLICT) } ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status( this.to_string(), StatusCode::PRECONDITION_FAILED, ), ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status( "Shutting down".to_string(), StatusCode::SERVICE_UNAVAILABLE, ), ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::SERVICE_UNAVAILABLE, ), ApiError::TooManyRequests(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::TOO_MANY_REQUESTS, ), ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::REQUEST_TIMEOUT, ), ApiError::Cancelled => 
HttpErrorBody::response_from_msg_and_status( this.to_string(), StatusCode::INTERNAL_SERVER_ERROR, ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, ), } } /// Same as [`http_utils::error::HttpErrorBody`] #[derive(Serialize)] struct HttpErrorBody { pub(crate) msg: String, } impl HttpErrorBody { /// Same as [`http_utils::error::HttpErrorBody::response_from_msg_and_status`] fn response_from_msg_and_status( msg: String, status: StatusCode, ) -> Response> { HttpErrorBody { msg }.to_response(status) } /// Same as [`http_utils::error::HttpErrorBody::to_response`] fn to_response(&self, status: StatusCode) -> Response> { Response::builder() .status(status) .header(http::header::CONTENT_TYPE, "application/json") // we do not have nested maps with non string keys so serialization shouldn't fail .body( Full::new(Bytes::from( serde_json::to_string(self) .expect("serialising HttpErrorBody should never fail"), )) .map_err(|x| match x {}) .boxed(), ) .expect("content-type header should be valid") } } /// Same as [`http_utils::json::json_response`] pub(crate) fn json_response( status: StatusCode, data: T, ) -> Result>, ApiError> { let json = serde_json::to_string(&data) .context("Failed to serialize JSON response") .map_err(ApiError::InternalServerError)?; let response = Response::builder() .status(status) .header(http::header::CONTENT_TYPE, "application/json") .body(Full::new(Bytes::from(json)).map_err(|x| match x {}).boxed()) .map_err(|e| ApiError::InternalServerError(e.into()))?; Ok(response) } pub(crate) fn get_conn_info( config: &'static AuthenticationConfig, ctx: &RequestContext, connection_string: Option<&str>, headers: &HeaderMap, ) -> Result { let connection_url = match connection_string { Some(connection_string) => Url::parse(connection_string)?, None => { let connection_string = headers .get(&CONN_STRING) .ok_or(ConnInfoError::InvalidHeader(&CONN_STRING))? 
.to_str() .map_err(|_| ConnInfoError::InvalidHeader(&CONN_STRING))?; Url::parse(connection_string)? } }; let protocol = connection_url.scheme(); if protocol != "postgres" && protocol != "postgresql" { return Err(ConnInfoError::IncorrectScheme); } let mut url_path = connection_url .path_segments() .ok_or(ConnInfoError::MissingDbName)?; let dbname: DbName = urlencoding::decode(url_path.next().ok_or(ConnInfoError::InvalidDbName)?)?.into(); ctx.set_dbname(dbname.clone()); let username = RoleName::from(urlencoding::decode(connection_url.username())?); if username.is_empty() { return Err(ConnInfoError::MissingUsername); } ctx.set_user(username.clone()); // TODO: make sure this is right in the context of rest broker let auth = if let Some(auth) = headers.get(&AUTHORIZATION) { if !config.accept_jwts { return Err(ConnInfoError::MissingCredentials(Credentials::Password)); } let auth = auth .to_str() .map_err(|_| ConnInfoError::InvalidHeader(&AUTHORIZATION))?; AuthData::Jwt( auth.strip_prefix("Bearer ") .ok_or(ConnInfoError::MissingCredentials(Credentials::BearerJwt))? 
.into(), ) } else if let Some(pass) = connection_url.password() { // wrong credentials provided if config.accept_jwts { return Err(ConnInfoError::MissingCredentials(Credentials::BearerJwt)); } AuthData::Password(match urlencoding::decode_binary(pass.as_bytes()) { std::borrow::Cow::Borrowed(b) => b.into(), std::borrow::Cow::Owned(b) => b.into(), }) } else if config.accept_jwts { return Err(ConnInfoError::MissingCredentials(Credentials::BearerJwt)); } else { return Err(ConnInfoError::MissingCredentials(Credentials::Password)); }; let endpoint: EndpointId = match connection_url.host() { Some(url::Host::Domain(hostname)) => hostname .split_once('.') .map_or(hostname, |(prefix, _)| prefix) .into(), Some(url::Host::Ipv4(_) | url::Host::Ipv6(_)) | None => { return Err(ConnInfoError::MissingHostname); } }; ctx.set_endpoint_id(endpoint.clone()); let pairs = connection_url.query_pairs(); let mut options = Option::None; let mut params = StartupMessageParams::default(); params.insert("user", &username); params.insert("database", &dbname); for (key, value) in pairs { params.insert(&key, &value); if key == "options" { options = Some(NeonOptions::parse_options_raw(&value)); } } // check the URL that was used, for metrics { let host_endpoint = headers // get the host header .get("host") // extract the domain .and_then(|h| { let (host, _port) = h.to_str().ok()?.split_once(':')?; Some(host) }) // get the endpoint prefix .map(|h| h.split_once('.').map_or(h, |(prefix, _)| prefix)); let kind = if host_endpoint == Some(&*endpoint) { SniKind::Sni } else { SniKind::NoSni }; let protocol = ctx.protocol(); Metrics::get() .proxy .accepted_connections_by_sni .inc(SniGroup { protocol, kind }); } ctx.set_user_agent( headers .get(hyper::header::USER_AGENT) .and_then(|h| h.to_str().ok()) .map(Into::into), ); let user_info = ComputeUserInfo { endpoint, user: username, options: options.unwrap_or_default(), }; let conn_info = ConnInfo { user_info, dbname }; Ok(ConnInfoWithAuth { conn_info, auth }) } 
================================================ FILE: proxy/src/serverless/json.rs ================================================ use json::{ListSer, ObjectSer, ValueSer}; use postgres_client::Row; use postgres_client::types::{Kind, Type}; use serde_json::Value; // // Convert json non-string types to strings, so that they can be passed to Postgres // as parameters. // pub(crate) fn json_to_pg_text(json: Vec) -> Vec> { json.iter().map(json_value_to_pg_text).collect() } fn json_value_to_pg_text(value: &Value) -> Option { match value { // special care for nulls Value::Null => None, // convert to text with escaping v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()), // avoid escaping here, as we pass this as a parameter Value::String(s) => Some(s.clone()), // special care for arrays Value::Array(_) => json_array_to_pg_array(value), } } // // Serialize a JSON array to a Postgres array. Contrary to the strings in the params // in the array we need to escape the strings. Postgres is okay with arrays of form // '{1,"2",3}'::int[], so we don't check that array holds values of the same type, leaving // it for Postgres to check. 
// // Example of the same escaping in node-postgres: packages/pg/lib/utils.js // fn json_array_to_pg_array(value: &Value) -> Option { match value { // special care for nulls Value::Null => None, // convert to text with escaping // here string needs to be escaped, as it is part of the array v @ (Value::Bool(_) | Value::Number(_) | Value::String(_)) => Some(v.to_string()), v @ Value::Object(_) => json_array_to_pg_array(&Value::String(v.to_string())), // recurse into array Value::Array(arr) => { let vals = arr .iter() .map(json_array_to_pg_array) .map(|v| v.unwrap_or_else(|| "NULL".to_string())) .collect::>() .join(","); Some(format!("{{{vals}}}")) } } } #[derive(Debug, thiserror::Error)] pub(crate) enum JsonConversionError { #[error("internal error compute returned invalid data: {0}")] AsTextError(postgres_client::Error), #[error("parse int error: {0}")] ParseIntError(#[from] std::num::ParseIntError), #[error("parse float error: {0}")] ParseFloatError(#[from] std::num::ParseFloatError), #[error("parse json error: {0}")] ParseJsonError(#[from] serde_json::Error), #[error("unbalanced array")] UnbalancedArray, #[error("unbalanced quoted string")] UnbalancedString, } enum OutputMode<'a> { Array(ListSer<'a>), Object(ObjectSer<'a>), } impl OutputMode<'_> { fn key(&mut self, key: &str) -> ValueSer<'_> { match self { OutputMode::Array(values) => values.entry(), OutputMode::Object(map) => map.key(key), } } fn finish(self) { match self { OutputMode::Array(values) => values.finish(), OutputMode::Object(map) => map.finish(), } } } // // Convert postgres row with text-encoded values to JSON object // pub(crate) fn pg_text_row_to_json( output: ValueSer, row: &Row, raw_output: bool, array_mode: bool, ) -> Result<(), JsonConversionError> { let mut entries = if array_mode { OutputMode::Array(output.list()) } else { OutputMode::Object(output.object()) }; for (i, column) in row.columns().iter().enumerate() { let pg_value = row.as_text(i).map_err(JsonConversionError::AsTextError)?; let 
//
// Convert postgres text-encoded value to JSON value
//
/// Writes the Postgres text value `val` of type `pg_type` into `output`.
///
/// * array types are handed to `pg_array_parse` and emitted as a JSON list;
/// * `BOOL` is `true` exactly when the text is `"t"`;
/// * `INT2` / `INT4` and `FLOAT4` / `FLOAT8` are parsed into numbers, except
///   that non-finite floats are emitted as their original text;
/// * `JSON` / `JSONB` text is written through unchanged as raw JSON;
/// * every other type is emitted as a JSON string.
///
/// Returns an error if a number fails to parse or array parsing fails.
fn pg_text_to_json(output: ValueSer, val: &str, pg_type: &Type) -> Result<(), JsonConversionError> {
    if let Kind::Array(elem_type) = pg_type.kind() {
        // todo: we should fetch this from postgres.
        let delimiter = ',';

        json::value_as_list!(|output| pg_array_parse(output, val, elem_type, delimiter)?);
        return Ok(());
    }

    match *pg_type {
        Type::BOOL => output.value(val == "t"),
        Type::INT2 | Type::INT4 => {
            let val = val.parse::()?;
            output.value(val);
        }
        Type::FLOAT4 | Type::FLOAT8 => {
            let fval = val.parse::()?;
            if fval.is_finite() {
                output.value(fval);
            } else {
                // Pass NaN, Inf, -Inf as strings.
                // JS JSON.stringify() converts them to null, but we
                // want to preserve them, so we pass them as strings.
                output.value(val);
            }
        }
        // we assume that the string value is valid json.
        Type::JSON | Type::JSONB => output.write_raw_json(val.as_bytes()),
        _ => output.value(val),
    }

    Ok(())
}
Among the standard data types provided in the PostgreSQL /// distribution, all use a comma, except for type box, which uses a semicolon (;). /// /// In a multidimensional array, each dimension (row, plane, cube, etc.) /// gets its own level of curly braces, and delimiters must be written between adjacent /// curly-braced entities of the same level. fn pg_array_parse( elements: &mut ListSer, mut pg_array: &str, elem: &Type, delim: char, ) -> Result<(), JsonConversionError> { // skip bounds decoration, eg: // `[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}` // technically these are significant, but we have no way to represent them in json. if let Some('[') = pg_array.chars().next() { let Some((_bounds, array)) = pg_array.split_once('=') else { return Err(JsonConversionError::UnbalancedArray); }; pg_array = array; } // whitespace might preceed a `{`. let pg_array = pg_array.trim_start(); let rest = pg_array_parse_inner(elements, pg_array, elem, delim)?; if !rest.is_empty() { return Err(JsonConversionError::UnbalancedArray); } Ok(()) } /// reads a single array from the `pg_array` string and pushes each values to `elements`. /// returns the rest of the `pg_array` string that was not read. fn pg_array_parse_inner<'a>( elements: &mut ListSer, mut pg_array: &'a str, elem: &Type, delim: char, ) -> Result<&'a str, JsonConversionError> { // array should have a `{` prefix. pg_array = pg_array .strip_prefix('{') .ok_or(JsonConversionError::UnbalancedArray)?; let mut q = String::new(); loop { let value = elements.entry(); pg_array = pg_array_parse_item(value, &mut q, pg_array, elem, delim)?; // check for separator. if let Some(next) = pg_array.strip_prefix(delim) { // next item. pg_array = next; } else { break; } } let Some(next) = pg_array.strip_prefix('}') else { // missing `}` terminator. return Err(JsonConversionError::UnbalancedArray); }; // whitespace might follow a `}`. Ok(next.trim_start()) } /// reads a single item from the `pg_array` string. 
/// Reads a single item from the `pg_array` string and writes it to `output`.
/// Returns the rest of the `pg_array` string that was not read.
///
/// `quoted` is a scratch allocation that has no defined output.
///
/// Returns `UnbalancedArray` if an unquoted item is not followed by a
/// delimiter or `}`; errors from nested parsing are propagated.
fn pg_array_parse_item<'a>(
    output: ValueSer,
    quoted: &mut String,
    mut pg_array: &'a str,
    elem: &Type,
    delim: char,
) -> Result<&'a str, JsonConversionError> {
    // We are trying to parse an array item.
    // This could be a new array, if this is a multi-dimensional array.
    // This could be a quoted string representing `elem`.
    // This could be an unquoted string representing `elem`.

    // whitespace might precede an item.
    pg_array = pg_array.trim_start();

    if pg_array.starts_with('{') {
        // nested array.
        pg_array =
            json::value_as_list!(|output| pg_array_parse_inner(output, pg_array, elem, delim))?;
        return Ok(pg_array);
    }

    if let Some(mut pg_array) = pg_array.strip_prefix('"') {
        // the parsed string is un-escaped and written into quoted.
        pg_array = pg_array_parse_quoted(quoted, pg_array)?;

        // we have un-escaped the string, parse it as pgtext.
        // (a quoted item is never treated as NULL: only the unquoted branch
        // below compares against "NULL")
        pg_text_to_json(output, quoted, elem)?;

        return Ok(pg_array);
    }

    // we need to parse an item. read until we find a delimiter or `}`.
    let index = pg_array
        .find([delim, '}'])
        .ok_or(JsonConversionError::UnbalancedArray)?;

    // the delimiter / `}` itself stays at the front of the returned remainder
    let item;
    (item, pg_array) = pg_array.split_at(index);

    // item might have trailing whitespace that we need to ignore.
    let item = item.trim_end();

    // we might have an item string:
    // check for null
    if item == "NULL" {
        output.value(json::Null);
    } else {
        pg_text_to_json(output, item, elem)?;
    }

    Ok(pg_array)
}
fn pg_array_parse_quoted<'a>( quoted: &mut String, mut pg_array: &'a str, ) -> Result<&'a str, JsonConversionError> { // The array output routine will put double quotes around element values if they are empty strings, // contain curly braces, delimiter characters, double quotes, backslashes, or white space, // or match the word `NULL`. Double quotes and backslashes embedded in element values will be backslash-escaped. // For numeric data types it is safe to assume that double quotes will never appear, // but for textual data types one should be prepared to cope with either the presence or absence of quotes. quoted.clear(); // We write to quoted in chunks terminated by an escape character. // Eg if we have the input `foo\"bar"`, then we write `foo`, then `"`, then finally `bar`. loop { // we need to parse an chunk. read until we find a '\\' or `"`. let i = pg_array .find(['\\', '"']) .ok_or(JsonConversionError::UnbalancedString)?; let chunk: &str; (chunk, pg_array) = pg_array .split_at_checked(i) .expect("i is guaranteed to be in-bounds of pg_array"); // push the chunk. quoted.push_str(chunk); // consume the chunk_end character. let chunk_end: char; (chunk_end, pg_array) = split_first_char(pg_array).expect("pg_array should start with either '\\\\' or '\"'"); // finished. if chunk_end == '"' { // whitespace might follow the '"'. pg_array = pg_array.trim_start(); break Ok(pg_array); } // consume the escaped character. 
let escaped: char; (escaped, pg_array) = split_first_char(pg_array).ok_or(JsonConversionError::UnbalancedString)?; quoted.push(escaped); } } fn split_first_char(s: &str) -> Option<(char, &str)> { let mut chars = s.chars(); let c = chars.next()?; Some((c, chars.as_str())) } #[cfg(test)] mod tests { use serde_json::json; use super::*; #[test] fn test_atomic_types_to_pg_params() { let json = vec![Value::Bool(true), Value::Bool(false)]; let pg_params = json_to_pg_text(json); assert_eq!( pg_params, vec![Some("true".to_owned()), Some("false".to_owned())] ); let json = vec![Value::Number(serde_json::Number::from(42))]; let pg_params = json_to_pg_text(json); assert_eq!(pg_params, vec![Some("42".to_owned())]); let json = vec![Value::String("foo\"".to_string())]; let pg_params = json_to_pg_text(json); assert_eq!(pg_params, vec![Some("foo\"".to_owned())]); let json = vec![Value::Null]; let pg_params = json_to_pg_text(json); assert_eq!(pg_params, vec![None]); } #[test] fn test_json_array_to_pg_array() { // atoms and escaping let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]"; let json: Value = serde_json::from_str(json).unwrap(); let pg_params = json_to_pg_text(vec![json]); assert_eq!( pg_params, vec![Some( "{true,false,NULL,\"NULL\",42,\"foo\",\"bar\\\"-\\\\\"}".to_owned() )] ); // nested arrays let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]"; let json: Value = serde_json::from_str(json).unwrap(); let pg_params = json_to_pg_text(vec![json]); assert_eq!( pg_params, vec![Some( "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned() )] ); // array of objects let json = r#"[{"foo": 1},{"bar": 2}]"#; let json: Value = serde_json::from_str(json).unwrap(); let pg_params = json_to_pg_text(vec![json]); assert_eq!( pg_params, vec![Some(r#"{"{\"foo\":1}","{\"bar\":2}"}"#.to_owned())] ); } fn pg_text_to_json(val: &str, pg_type: &Type) -> Value { let output = json::value_to_string!(|v| super::pg_text_to_json(v, val, pg_type).unwrap()); 
/// Parsing of `TEXT` arrays: escapes, nesting and NULL handling.
#[test]
fn test_pg_array_parse_text() {
    // Shorthand: parse `pg_arr` as an array whose element type is TEXT.
    fn pt(pg_arr: &str) -> Value {
        pg_array_parse(pg_arr, &Type::TEXT)
    }
    // Backslash escapes inside a quoted element (`\"`, `\\`, `\,`) yield the
    // escaped character itself; unquoted and quoted elements can be mixed.
    assert_eq!(
        pt(r#"{"aa\"\\\,a",cha,"bbbb"}"#),
        json!(["aa\"\\,a", "cha", "bbbb"])
    );
    // Two-dimensional array becomes a list of lists.
    assert_eq!(
        pt(r#"{{"foo","bar"},{"bee","bop"}}"#),
        json!([["foo", "bar"], ["bee", "bop"]])
    );
    // Deep nesting; an unquoted NULL becomes JSON null.
    assert_eq!(
        pt(r#"{{{{"foo",NULL,"bop",bup}}}}"#),
        json!([[[["foo", null, "bop", "bup"]]]])
    );
    // With a TEXT element type, digits stay strings whether quoted or not.
    assert_eq!(
        pt(r#"{{"1",2,3},{4,NULL,6},{NULL,NULL,NULL}}"#),
        json!([["1", "2", "3"], ["4", null, "6"], [null, null, null]])
    );
}
/// An array literal may be prefixed with explicit dimension bounds
/// (`[lower:upper]...=`); the expected result is the same nested list as
/// for the undecorated literal.
#[test]
fn test_pg_array_with_decoration() {
    // Shorthand: parse `pg_arr` as an array whose element type is INT2.
    fn p(pg_arr: &str) -> Value {
        pg_array_parse(pg_arr, &Type::INT2)
    }
    // The bounds prefix does not show up in the JSON output.
    assert_eq!(
        p(r#"[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}"#),
        json!([[[1, 2, 3], [4, 5, 6]]])
    );
}
// Per-connection state attached to a client created by `poll_client`.
#[derive(Clone)]
pub(crate) struct ClientDataLocal {
    // Sending half of the watch channel created in `poll_client`.
    // `LocalConnPool::get` sends the request's session id through it when a
    // pooled connection is reused.
    session: tokio::sync::watch::Sender,
    // Cancelling this token makes the connection task spawned by
    // `poll_client` finish.
    cancel: CancellationToken,
    // Key handed to `resign_jwt` by `set_jwt_session`.
    key: SigningKey,
    // Counter incremented by `set_jwt_session` before each token is signed;
    // `poll_client` initialises it to 0.
    jti: u64,
}

impl ClientDataLocal {
    // Mutable access to the session watch sender.
    pub fn session(&mut self) -> &mut tokio::sync::watch::Sender {
        &mut self.session
    }

    // Cancels the token stored in `cancel`.
    pub fn cancel(&mut self) {
        self.cancel.cancel();
    }
}
/// Spawns the background task that drives `connection` and returns the
/// pooled `Client` wrapper for `client`.
///
/// The spawned task finishes when the cancellation token stored in the
/// client's `ClientDataLocal` is cancelled, when the session watch channel
/// reports an error, or when the connection future completes. On an idle
/// timeout, and again when the connection completes, it asks the pool to
/// remove the client identified by `conn_id`.
#[allow(clippy::too_many_arguments)]
pub(crate) fn poll_client(
    global_pool: Arc>,
    ctx: &RequestContext,
    conn_info: ConnInfo,
    client: C,
    mut connection: postgres_client::Connection,
    key: SigningKey,
    conn_id: uuid::Uuid,
    aux: MetricsAuxInfo,
) -> Client {
    // Gauge guard; moved into the task so it lives as long as the task does.
    let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol());
    let mut session_id = ctx.session_id();
    // `tx` is stored in the client data; `rx` is observed by the task below.
    let (tx, mut rx) = tokio::sync::watch::channel(session_id);

    // Root span (no parent) so the task's logs are tied to the connection.
    let span = info_span!(parent: None, "connection", %conn_id);
    let cold_start_info = ctx.cold_start_info();
    span.in_scope(|| {
        info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection");
    });
    // Weak reference: the task must not keep the pool alive.
    let pool = Arc::downgrade(&global_pool);
    let db_user = conn_info.db_and_user();
    let idle = global_pool.get_idle_timeout();

    let cancel = CancellationToken::new();
    let cancelled = cancel.clone().cancelled_owned();

    tokio::spawn(async move {
        let _conn_gauge = conn_gauge;
        let mut idle_timeout = pin!(tokio::time::sleep(idle));
        let mut cancelled = pin!(cancelled);

        // Each poll checks, in order: cancellation, session change, idle
        // timer, and finally the connection itself.
        poll_fn(move |cx| {
            let _instrument = span.enter();

            if cancelled.as_mut().poll(cx).is_ready() {
                info!("connection dropped");
                return Poll::Ready(());
            }

            match rx.has_changed() {
                // A new session id was sent: record it and restart the idle timer.
                Ok(true) => {
                    session_id = *rx.borrow_and_update();
                    info!(%session_id, "changed session");
                    idle_timeout.as_mut().reset(Instant::now() + idle);
                }
                // The watch channel reported an error; treat the connection as dropped.
                Err(_) => {
                    info!("connection dropped");
                    return Poll::Ready(());
                }
                _ => {}
            }

            // idle connection timeout; the duration is `idle`, taken from
            // `global_pool.get_idle_timeout()` above
            if idle_timeout.as_mut().poll(cx).is_ready() {
                // Re-arm the timer so that it fires again later.
                idle_timeout.as_mut().reset(Instant::now() + idle);
                info!("connection idle");
                if let Some(pool) = pool.clone().upgrade() {
                    // remove client from pool - should close the connection if it's idle.
                    // does nothing if the client is currently checked-out and in-use
                    if pool
                        .global_pool
                        .write()
                        .remove_client(db_user.clone(), conn_id)
                    {
                        info!("idle connection removed");
                    }
                }
            }

            // Drive the connection; `ready!` returns `Pending` from this
            // closure while the connection future is still pending.
            match ready!(connection.poll_unpin(cx)) {
                Err(e) => error!(%session_id, "connection error: {}", e),
                Ok(()) => info!("connection closed"),
            }

            // remove from connection pool
            if let Some(pool) = pool.clone().upgrade()
                && pool
                    .global_pool
                    .write()
                    .remove_client(db_user.clone(), conn_id)
            {
                info!("closed connection removed");
            }

            Poll::Ready(())
        })
        .await;
    });

    let inner = ClientInnerCommon {
        inner: client,
        aux,
        conn_id,
        data: ClientDataEnum::Local(ClientDataLocal {
            session: tx,
            cancel,
            key,
            // `set_jwt_session` increments this before signing each token.
            jti: 0,
        }),
    };

    Client::new(inner, conn_info, Arc::downgrade(&global_pool.global_pool))
}
let query = format!("select auth.jwt_session_init('{token}')"); self.inner .batch_execute(&query) .await .map_err(SqlOverHttpError::InternalPostgres)?; let pid = self.inner.get_process_id(); info!(pid, jti = local_data.jti, "user session state init"); Ok(()) } else { panic!("unexpected client data type"); } } } /// implements relatively efficient in-place json object key upserting /// /// only supports top-level keys fn upsert_json_object( payload: &[u8], key: &str, value: &RawValue, ) -> Result { let mut payload = serde_json::from_slice::>(payload)?; payload.insert(key, value); serde_json::to_string(&payload) } fn resign_jwt(sk: &SigningKey, payload: &[u8], jti: u64) -> Result { let mut buffer = itoa::Buffer::new(); // encode the jti integer to a json rawvalue let jti = serde_json::from_str::<&RawValue>(buffer.format(jti)) .expect("itoa formatted integer should be guaranteed valid json"); // update the jti in-place let payload = upsert_json_object(payload, "jti", jti).map_err(HttpConnError::JwtPayloadError)?; // sign the jwt let token = sign_jwt(sk, payload.as_bytes()); Ok(token) } fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { let header_len = 20; let payload_len = Base64UrlUnpadded::encoded_len(payload); let signature_len = Base64UrlUnpadded::encoded_len(&[0; 64]); let total_len = header_len + payload_len + signature_len + 2; let mut jwt = String::with_capacity(total_len); let cap = jwt.capacity(); // we only need an empty header with the alg specified. 
// base64url(r#"{"alg":"EdDSA"}"#) == "eyJhbGciOiJFZERTQSJ9" jwt.push_str("eyJhbGciOiJFZERTQSJ9."); // encode the jwt payload in-place BASE64_URL_SAFE_NO_PAD.encode_string(payload, &mut jwt); // create the signature from the encoded header || payload let sig: Signature = sk.sign(jwt.as_bytes()); jwt.push('.'); // encode the jwt signature in-place BASE64_URL_SAFE_NO_PAD.encode_string(sig.to_bytes(), &mut jwt); debug_assert_eq!( jwt.len(), total_len, "the jwt len should match our expected len" ); debug_assert_eq!(jwt.capacity(), cap, "the jwt capacity should not change"); jwt } #[cfg(test)] mod tests { use ed25519_dalek::SigningKey; use typed_json::json; use super::resign_jwt; #[test] fn jwt_token_snapshot() { let key = SigningKey::from_bytes(&[1; 32]); let data = json!({"foo":"bar","jti":"foo\nbar","nested":{"jti":"tricky nesting"}}).to_string(); let jwt = resign_jwt(&key, data.as_bytes(), 2).unwrap(); // To validate the JWT, copy the JWT string and paste it into https://jwt.io/. // In the public-key box, paste the following jwk public key // `{"kty":"OKP","crv":"Ed25519","x":"iojj3XQJ8ZX9UtstPLpdcspnCb8dlBIb83SIAbQPb1w"}` // Note - jwt.io doesn't support EdDSA :( // https://github.com/jsonwebtoken/jsonwebtoken.github.io/issues/509 // let jwk = jose_jwk::Key::Okp(jose_jwk::Okp { // crv: jose_jwk::OkpCurves::Ed25519, // x: jose_jwk::jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()), // d: None, // }); // println!("{}", serde_json::to_string(&jwk).unwrap()); assert_eq!( jwt, "eyJhbGciOiJFZERTQSJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.Cvyc2By33KI0f0obystwdy8PN111L3Sc9_Mr2CU3XshtSqSdxuRxNEZGbb_RvyJf2IzheC_s7aBZ-jLeQ9N0Bg" ); } } ================================================ FILE: proxy/src/serverless/mod.rs ================================================ //! Routers for our serverless APIs //! //! Handles both SQL over HTTP and SQL over Websockets. 
/// Accept loop for the serverless (SQL-over-HTTP / websocket) listener.
///
/// Sets up the connection pools and the shared `PoolingBackend`, then accepts
/// TCP connections from `ws_listener` until `cancellation_token` fires. Each
/// accepted connection is handled on its own tracked task
/// (`connection_startup` followed by `connection_handler`). After the loop
/// ends, the function waits for all tracked connection tasks to finish.
///
/// Returns an error if accepting a TCP stream fails.
pub async fn task_main(
    config: &'static ProxyConfig,
    auth_backend: &'static crate::auth::Backend<'static, ()>,
    ws_listener: TcpListener,
    cancellation_token: CancellationToken,
    cancellation_handler: Arc,
    endpoint_rate_limiter: Arc,
) -> anyhow::Result<()> {
    // Logged when this function returns, on any path.
    scopeguard::defer! {
        info!("websocket server has shut down");
    }

    let local_pool = local_conn_pool::LocalConnPool::new(&config.http_config);
    let conn_pool = conn_pool_lib::GlobalConnPool::new(&config.http_config);
    {
        // Background gc worker for `conn_pool`.
        let conn_pool = Arc::clone(&conn_pool);
        tokio::spawn(async move {
            conn_pool.gc_worker(StdRng::from_os_rng()).await;
        });
    }

    // shutdown the connection pool
    // (`shutdown` is run via `spawn_blocking` once the token is cancelled)
    tokio::spawn({
        let cancellation_token = cancellation_token.clone();
        let conn_pool = conn_pool.clone();
        async move {
            cancellation_token.cancelled().await;
            tokio::task::spawn_blocking(move || conn_pool.shutdown())
                .await
                .propagate_task_panic();
        }
    });

    let http_conn_pool = conn_pool_lib::GlobalConnPool::new(&config.http_config);
    {
        // Background gc worker for `http_conn_pool`.
        let http_conn_pool = Arc::clone(&http_conn_pool);
        tokio::spawn(async move {
            http_conn_pool.gc_worker(StdRng::from_os_rng()).await;
        });
    }

    // shutdown the connection pool
    tokio::spawn({
        let cancellation_token = cancellation_token.clone();
        let http_conn_pool = http_conn_pool.clone();
        async move {
            cancellation_token.cancelled().await;
            tokio::task::spawn_blocking(move || http_conn_pool.shutdown())
                .await
                .propagate_task_panic();
        }
    });

    // Shared by every connection handler.
    let backend = Arc::new(PoolingBackend {
        http_conn_pool: Arc::clone(&http_conn_pool),
        local_pool,
        pool: Arc::clone(&conn_pool),
        config,
        auth_backend,
        endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter),
    });
    let tls_acceptor: Arc = Arc::new(&config.tls_config);

    let connections = tokio_util::task::task_tracker::TaskTracker::new();
    connections.close(); // allows `connections.wait to complete`

    let cancellations = tokio_util::task::task_tracker::TaskTracker::new();
    // The loop ends when `run_until_cancelled` yields `None`.
    while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await {
        // An accept error is propagated with `?` and ends the accept loop.
        let (conn, peer_addr) = res.context("could not accept TCP stream")?;
        if let Err(e) = conn.set_nodelay(true) {
            // This connection is skipped; the loop keeps accepting.
            tracing::error!("could not set nodelay: {e}");
            continue;
        }
        let conn_id = uuid::Uuid::new_v4();
        let http_conn_span = tracing::info_span!("http_conn", ?conn_id);

        // Above the configured threshold, take one token out of the cancel
        // set and cancel it to shed an existing connection.
        let n_connections = Metrics::get()
            .proxy
            .client_connections
            .sample(crate::metrics::Protocol::Http);
        tracing::trace!(?n_connections, threshold = ?config.http_config.client_conn_threshold, "check");
        if n_connections > config.http_config.client_conn_threshold {
            tracing::trace!("attempting to cancel a random connection");
            if let Some(token) = config.http_config.cancel_set.take() {
                tracing::debug!("cancelling a random connection");
                token.cancel();
            }
        }

        // Child token: cancelled together with the server-wide token, and
        // individually through the cancel set.
        let conn_token = cancellation_token.child_token();
        let tls_acceptor = tls_acceptor.clone();
        let backend = backend.clone();
        let connections2 = connections.clone();
        let cancellation_handler = cancellation_handler.clone();
        let endpoint_rate_limiter = endpoint_rate_limiter.clone();
        let cancellations = cancellations.clone();
        connections.spawn(
            async move {
                // Register this connection in the cancel set; the guard is
                // held for the rest of the task.
                let conn_token2 = conn_token.clone();
                let _cancel_guard = config.http_config.cancel_set.insert(conn_id, conn_token2);

                let session_id = uuid::Uuid::new_v4();

                let _gauge = Metrics::get()
                    .proxy
                    .client_connections
                    .guard(crate::metrics::Protocol::Http);

                let startup_result = Box::pin(connection_startup(
                    config,
                    tls_acceptor,
                    session_id,
                    conn,
                    peer_addr,
                ))
                .await;
                // `None` means startup did not produce a usable connection.
                let Some((conn, conn_info)) = startup_result else {
                    return;
                };

                Box::pin(connection_handler(
                    config,
                    backend,
                    connections2,
                    cancellations,
                    cancellation_handler,
                    endpoint_rate_limiter,
                    conn_token,
                    conn,
                    conn_info,
                    session_id,
                ))
                .await;
            }
            .instrument(http_conn_span),
        );
    }

    // Wait for every tracked connection task before returning.
    connections.wait().await;

    Ok(())
}
/// Handles the TCP startup lifecycle.
/// 1. Parses PROXY protocol V2
/// 2. Handles TLS handshake
///
/// Returns the (possibly TLS-wrapped) stream together with the connection
/// info. Returns `None` when the connection should simply be dropped: the
/// PROXY header could not be read, the header was a `Local` one (treated as
/// a healthcheck), or the TLS handshake failed or timed out.
async fn connection_startup(
    config: &ProxyConfig,
    tls_acceptor: Arc,
    session_id: uuid::Uuid,
    conn: TcpStream,
    peer_addr: SocketAddr,
) -> Option<(AsyncRW, ConnectionInfo)> {
    // handle PROXY protocol
    let (conn, conn_info) = match config.proxy_protocol_v2 {
        ProxyProtocolV2::Required => {
            match read_proxy_protocol(conn).await {
                Err(e) => {
                    warn!("per-client task finished with an error: {e:#}");
                    return None;
                }
                // our load balancers will not send any more data. let's just exit immediately
                Ok((_conn, ConnectHeader::Local)) => {
                    tracing::debug!("healthcheck received");
                    return None;
                }
                // The header carries the connection info to use.
                Ok((conn, ConnectHeader::Proxy(info))) => (conn, info),
            }
        }
        // ignore the header - it cannot be confused for a postgres or http connection so will
        // error later.
        // The connection info is built from the TCP peer address instead.
        ProxyProtocolV2::Rejected => (
            conn,
            ConnectionInfo {
                addr: peer_addr,
                extra: None,
            },
        ),
    };

    // Handshake failures from private IPv4 peers are not counted in the
    // failure metric below. IPv6 peers are never treated as private here.
    let has_private_peer_addr = match conn_info.addr.ip() {
        IpAddr::V4(ip) => ip.is_private(),
        IpAddr::V6(_) => false,
    };
    info!(?session_id, %conn_info, "accepted new TCP connection");

    // try upgrade to TLS, but with a timeout.
    let conn = match timeout(config.handshake_timeout, tls_acceptor.accept(conn)).await {
        Ok(Ok(conn)) => {
            info!(?session_id, %conn_info, "accepted new TLS connection");
            conn
        }
        // The handshake failed
        Ok(Err(e)) => {
            if !has_private_peer_addr {
                Metrics::get().proxy.tls_handshake_failures.inc();
            }
            warn!(?session_id, %conn_info, "failed to accept TLS connection: {e:?}");
            return None;
        }
        // The handshake timed out
        Err(e) => {
            if !has_private_peer_addr {
                Metrics::get().proxy.tls_handshake_failures.inc();
            }
            warn!(?session_id, %conn_info, "failed to accept TLS connection: {e:?}");
            return None;
        }
    };

    Some((conn, conn_info))
}
/// Serves one HTTP connection (with upgrade support) until it closes or
/// `cancellation_token` is cancelled, in which case a graceful shutdown of
/// the connection is started and awaited.
///
/// Every request is run on its own task spawned on `connections`. The
/// session id used for the request is appended to the response headers
/// under `NEON_REQUEST_ID`.
#[allow(clippy::too_many_arguments)]
async fn connection_handler(
    config: &'static ProxyConfig,
    backend: Arc,
    connections: TaskTracker,
    cancellations: TaskTracker,
    cancellation_handler: Arc,
    endpoint_rate_limiter: Arc,
    cancellation_token: CancellationToken,
    conn: AsyncRW,
    conn_info: ConnectionInfo,
    session_id: uuid::Uuid,
) {
    // Wrapped so that only the first request can take the connection's id.
    let session_id = AtomicTake::new(session_id);

    // Cancel all current inflight HTTP requests if the HTTP connection is closed.
    let http_cancellation_token = CancellationToken::new();
    let _cancel_connection = http_cancellation_token.clone().drop_guard();

    let conn_info2 = conn_info.clone();
    let server = Builder::new(TokioExecutor::new());
    let conn = server.serve_connection_with_upgrades(
        hyper_util::rt::TokioIo::new(conn),
        hyper::service::service_fn(move |req: hyper::Request| {
            // First HTTP request shares the same session ID
            // (later requests on this connection get a fresh v4 UUID)
            let mut session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4);

            if matches!(backend.auth_backend, crate::auth::Backend::Local(_)) {
                // take session_id from request, if given.
                // A header value that does not parse as a UUID is ignored.
                if let Some(id) = req
                    .headers()
                    .get(&NEON_REQUEST_ID)
                    .and_then(|id| uuid::Uuid::try_parse_ascii(id.as_bytes()).ok())
                {
                    session_id = id;
                }
            }

            // Cancel the current inflight HTTP request if the request stream is closed.
            // This is slightly different to `_cancel_connection` in that
            // h2 can cancel individual requests with a `RST_STREAM`.
            let http_request_token = http_cancellation_token.child_token();
            let cancel_request = http_request_token.clone().drop_guard();

            // `request_handler` is not cancel safe. It expects to be cancelled only at specific times.
            // By spawning the future, we ensure it never gets cancelled until it decides to.
            let cancellations = cancellations.clone();
            let handler = connections.spawn(
                request_handler(
                    req,
                    config,
                    backend.clone(),
                    connections.clone(),
                    cancellation_handler.clone(),
                    session_id,
                    conn_info2.clone(),
                    http_request_token,
                    endpoint_rate_limiter.clone(),
                    cancellations,
                )
                .in_current_span()
                // An `ApiError` is turned into a response instead of an error.
                .map_ok_or_else(api_error_into_response, |r| r),
            );
            async move {
                let mut res = handler.await;
                // The handler finished on its own: do not cancel the request token.
                cancel_request.disarm();

                // add the session ID to the response
                if let Ok(resp) = &mut res {
                    resp.headers_mut()
                        .append(&NEON_REQUEST_ID, uuid_to_header_value(session_id));
                }

                res
            }
        }),
    );

    // On cancellation, trigger the HTTP connection handler to shut down.
    let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await {
        Either::Left((_cancelled, mut conn)) => {
            tracing::debug!(%conn_info, "cancelling connection");
            conn.as_mut().graceful_shutdown();
            // Keep driving the connection until the shutdown completes.
            conn.await
        }
        Either::Right((res, _)) => res,
    };

    match res {
        Ok(()) => tracing::info!(%conn_info, "HTTP connection closed"),
        Err(e) => tracing::warn!(%conn_info, "HTTP connection error {e}"),
    }
}
if config.http_config.accept_websockets && framed_websockets::upgrade::is_upgrade_request(&request) { let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Ws); ctx.set_user_agent( request .headers() .get(hyper::header::USER_AGENT) .and_then(|h| h.to_str().ok()) .map(Into::into), ); let span = ctx.span(); info!(parent: &span, "performing websocket upgrade"); let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request) .map_err(|e| ApiError::BadRequest(e.into()))?; let cancellations = cancellations.clone(); ws_connections.spawn( async move { if let Err(e) = websocket::serve_websocket( config, backend.auth_backend, ctx, websocket, cancellation_handler, endpoint_rate_limiter, host, cancellations, ) .await { warn!("error in websocket connection: {e:#}"); } } .instrument(span), ); // Return the response so the spawned future can continue. Ok(response.map(|b| b.map_err(|x| match x {}).boxed())) } else if request.uri().path() == "/sql" && *request.method() == Method::POST { let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Http); let span = ctx.span(); let testodrome_id = request .headers() .get("X-Neon-Query-ID") .and_then(|value| value.to_str().ok()) .map(|s| s.to_string()); if let Some(query_id) = testodrome_id { info!(parent: &ctx.span(), "testodrome query ID: {query_id}"); ctx.set_testodrome_id(query_id.into()); } sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) .await } else if request.uri().path() == "/sql" && *request.method() == Method::OPTIONS { Response::builder() .header("Allow", "OPTIONS, POST") .header("Access-Control-Allow-Origin", "*") .header( "Access-Control-Allow-Headers", "Authorization, Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In, Neon-Batch-Read-Only, Neon-Batch-Isolation-Level", ) .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: 
https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code .body(Empty::new().map_err(|x| match x {}).boxed()) .map_err(|e| ApiError::InternalServerError(e.into())) } else { #[cfg(feature = "rest_broker")] { if config.rest_config.is_rest_broker // we are testing for the path to be /database_name/rest/... && request .uri() .path() .split('/') .nth(2) .is_some_and(|part| part.starts_with("rest")) { let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Http); let span = ctx.span(); let testodrome_id = request .headers() .get("X-Neon-Query-ID") .and_then(|value| value.to_str().ok()) .map(|s| s.to_string()); if let Some(query_id) = testodrome_id { info!(parent: &span, "testodrome query ID: {query_id}"); ctx.set_testodrome_id(query_id.into()); } rest::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) .await } else { json_response(StatusCode::BAD_REQUEST, "query is not supported") } } #[cfg(not(feature = "rest_broker"))] { json_response(StatusCode::BAD_REQUEST, "query is not supported") } } } ================================================ FILE: proxy/src/serverless/rest.rs ================================================ use std::borrow::Cow; use std::collections::HashMap; use std::convert::Infallible; use std::sync::Arc; use bytes::Bytes; use http::Method; use http::header::{ ACCESS_CONTROL_ALLOW_HEADERS, ACCESS_CONTROL_ALLOW_METHODS, ACCESS_CONTROL_ALLOW_ORIGIN, ACCESS_CONTROL_EXPOSE_HEADERS, ACCESS_CONTROL_MAX_AGE, ACCESS_CONTROL_REQUEST_HEADERS, ALLOW, AUTHORIZATION, CONTENT_TYPE, HOST, ORIGIN, }; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Empty, Full}; use http_utils::error::ApiError; use hyper::body::Incoming; use hyper::http::response::Builder; use hyper::http::{HeaderMap, HeaderName, HeaderValue}; use hyper::{Request, Response, StatusCode}; use indexmap::IndexMap; use moka::sync::Cache; use ouroboros::self_referencing; use serde::de::DeserializeOwned; use 
serde::{Deserialize, Deserializer}; use serde_json::Value as JsonValue; use serde_json::value::RawValue; use subzero_core::api::ContentType::{ApplicationJSON, Other, SingularJSON, TextCSV}; use subzero_core::api::QueryNode::{Delete, FunctionCall, Insert, Update}; use subzero_core::api::Resolution::{IgnoreDuplicates, MergeDuplicates}; use subzero_core::api::{ApiResponse, ListVal, Payload, Preferences, Representation, SingleVal}; use subzero_core::config::{db_allowed_select_functions, db_schemas, role_claim_key}; use subzero_core::dynamic_statement::{JoinIterator, param, sql}; use subzero_core::error::Error::{ self as SubzeroCoreError, ContentTypeError, GucHeadersError, GucStatusError, InternalError, JsonDeserialize, JwtTokenInvalid, NotFound, }; use subzero_core::error::pg_error_to_status_code; use subzero_core::formatter::Param::{LV, PL, SV, Str, StrOwned}; use subzero_core::formatter::postgresql::{fmt_main_query, generate}; use subzero_core::formatter::{Param, Snippet, SqlParam}; use subzero_core::parser::postgrest::parse; use subzero_core::permissions::{check_safe_functions, replace_select_star}; use subzero_core::schema::{ DbSchema, POSTGRESQL_INTROSPECTION_SQL, get_postgresql_configuration_query, }; use subzero_core::{content_range_header, content_range_status}; use tokio_util::sync::CancellationToken; use tracing::{error, info}; use typed_json::json; use url::form_urlencoded; use super::backend::{HttpConnError, LocalProxyConnError, PoolingBackend}; use super::conn_pool::AuthData; use super::conn_pool_lib::ConnInfo; use super::error::{ConnInfoError, Credentials, HttpCodeError, ReadPayloadError}; use super::http_conn_pool::{self, LocalProxyClient}; use super::http_util::{ ALLOW_POOL, CONN_STRING, NEON_REQUEST_ID, RAW_TEXT_OUTPUT, TXN_ISOLATION_LEVEL, TXN_READ_ONLY, get_conn_info, json_response, uuid_to_header_value, }; use super::json::JsonConversionError; use crate::auth::backend::ComputeCredentialKeys; use crate::cache::common::{count_cache_insert, 
/// Splits a comma-separated configuration value into its individual entries.
///
/// Every entry is trimmed of surrounding whitespace. Entries that are empty after
/// trimming (produced by stray, doubled or trailing commas, e.g. `"a,,b,"`) are
/// dropped, so callers never receive `""` as a schema name, an extra search-path
/// element or an allowed CORS origin.
///
/// An input that contains no non-blank entry yields an empty vector.
fn split_comma_separated(s: &str) -> Vec<String> {
    s.split(',')
        .map(str::trim)
        // a blank entry carries no information; keeping it would only leak "" downstream
        .filter(|entry| !entry.is_empty())
        .map(str::to_string)
        .collect()
}
Option::::deserialize(deserializer)?; if let Some(s) = &opt { let trimmed = s.trim(); if trimmed.is_empty() { return Ok(None); } return Ok(Some(split_comma_separated(trimmed))); } Ok(None) } // The ApiConfig is the configuration for the API per endpoint // The configuration is read from the database and cached in the DbSchemaCache #[derive(Deserialize, Debug)] pub struct ApiConfig { #[serde( default = "db_schemas", deserialize_with = "deserialize_comma_separated" )] pub db_schemas: Vec, pub db_anon_role: Option, pub db_max_rows: Option, #[serde(default = "db_allowed_select_functions")] pub db_allowed_select_functions: Vec, // #[serde(deserialize_with = "to_tuple", default)] // pub db_pre_request: Option<(String, String)>, #[allow(dead_code)] #[serde(default = "role_claim_key")] pub role_claim_key: String, #[serde(default, deserialize_with = "deserialize_comma_separated_option")] pub db_extra_search_path: Option>, #[serde(default, deserialize_with = "deserialize_comma_separated_option")] pub server_cors_allowed_origins: Option>, } // The DbSchemaCache is a cache of the ApiConfig and DbSchemaOwned for each endpoint pub(crate) struct DbSchemaCache(Cache>); impl DbSchemaCache { pub fn new(config: crate::config::CacheOptions) -> Self { let builder = Cache::builder().name("schema"); let builder = config.moka(builder); let metrics = &Metrics::get().cache; if let Some(size) = config.size { metrics.capacity.set(CacheKind::Schema, size as i64); } let builder = builder.eviction_listener(|_k, _v, cause| eviction_listener(CacheKind::Schema, cause)); Self(builder.build()) } pub async fn maintain(&self) -> Result { let mut ticker = tokio::time::interval(std::time::Duration::from_secs(60)); loop { ticker.tick().await; self.0.run_pending_tasks(); } } pub fn get_cached( &self, endpoint_id: &EndpointCacheKey, ) -> Option> { count_cache_outcome(CacheKind::Schema, self.0.get(endpoint_id)) } pub async fn get_remote( &self, endpoint_id: &EndpointCacheKey, auth_header: &HeaderValue, 
connection_string: &str, client: &mut http_conn_pool::Client, ctx: &RequestContext, config: &'static ProxyConfig, ) -> Result, RestError> { info!("db_schema cache miss for endpoint: {:?}", endpoint_id); let remote_value = self .internal_get_remote(auth_header, connection_string, client, ctx, config) .await; let (api_config, schema_owned) = match remote_value { Ok((api_config, schema_owned)) => (api_config, schema_owned), Err(e @ RestError::SchemaTooLarge) => { // for the case where the schema is too large, we cache an empty dummy value // all the other requests will fail without triggering the introspection query let schema_owned = serde_json::from_str::(EMPTY_JSON_SCHEMA) .map_err(|e| JsonDeserialize { source: e })?; let api_config = ApiConfig { db_schemas: vec![], db_anon_role: None, db_max_rows: None, db_allowed_select_functions: vec![], role_claim_key: String::new(), db_extra_search_path: None, server_cors_allowed_origins: None, }; let value = Arc::new((api_config, schema_owned)); count_cache_insert(CacheKind::Schema); self.0.insert(endpoint_id.clone(), value); return Err(e); } Err(e) => { return Err(e); } }; let value = Arc::new((api_config, schema_owned)); count_cache_insert(CacheKind::Schema); self.0.insert(endpoint_id.clone(), value.clone()); Ok(value) } async fn internal_get_remote( &self, auth_header: &HeaderValue, connection_string: &str, client: &mut http_conn_pool::Client, ctx: &RequestContext, config: &'static ProxyConfig, ) -> Result<(ApiConfig, DbSchemaOwned), RestError> { #[derive(Deserialize)] struct SingleRow { rows: [Row; 1], } #[derive(Deserialize)] struct ConfigRow { #[serde(deserialize_with = "deserialize_json_string")] config: ApiConfig, } #[derive(Deserialize)] struct SchemaRow { json_schema: DbSchemaOwned, } let headers = vec![ (&NEON_REQUEST_ID, uuid_to_header_value(ctx.session_id())), ( &CONN_STRING, HeaderValue::from_str(connection_string).expect( "connection string came from a header, so it must be a valid headervalue", ), ), 
(&AUTHORIZATION, auth_header.clone()), (&RAW_TEXT_OUTPUT, HEADER_VALUE_TRUE), ]; let query = get_postgresql_configuration_query(Some("pgrst.pre_config")); let SingleRow { rows: [ConfigRow { config: api_config }], } = make_local_proxy_request( client, headers.iter().cloned(), QueryData { query: Cow::Owned(query), params: vec![], }, config.rest_config.max_schema_size, ) .await .map_err(|e| match e { RestError::ReadPayload(ReadPayloadError::BodyTooLarge { .. }) => { RestError::SchemaTooLarge } e => e, })?; // now that we have the api_config let's run the second INTROSPECTION_SQL query let SingleRow { rows: [SchemaRow { json_schema }], } = make_local_proxy_request( client, headers, QueryData { query: INTROSPECTION_SQL.into(), params: vec![ serde_json::to_value(&api_config.db_schemas) .expect("Vec is always valid to encode as JSON"), JsonValue::Bool(false), // include_roles_with_login JsonValue::Bool(false), // use_internal_permissions ], }, config.rest_config.max_schema_size, ) .await .map_err(|e| match e { RestError::ReadPayload(ReadPayloadError::BodyTooLarge { .. 
}) => { RestError::SchemaTooLarge } e => e, })?; Ok((api_config, json_schema)) } } // A type to represent a postgresql errors // we use our own type (instead of postgres_client::Error) because we get the error from the json response #[derive(Debug, thiserror::Error, Deserialize)] pub(crate) struct PostgresError { pub code: String, pub message: String, pub detail: Option, pub hint: Option, } impl HttpCodeError for PostgresError { fn get_http_status_code(&self) -> StatusCode { let status = pg_error_to_status_code(&self.code, true); StatusCode::from_u16(status).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR) } } impl ReportableError for PostgresError { fn get_error_kind(&self) -> ErrorKind { ErrorKind::User } } impl UserFacingError for PostgresError { fn to_string_client(&self) -> String { if self.code.starts_with("PT") { "Postgres error".to_string() } else { self.message.clone() } } } impl std::fmt::Display for PostgresError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.message) } } // A type to represent errors that can occur in the rest broker #[derive(Debug, thiserror::Error)] pub(crate) enum RestError { #[error(transparent)] ReadPayload(#[from] ReadPayloadError), #[error(transparent)] ConnectCompute(#[from] HttpConnError), #[error(transparent)] ConnInfo(#[from] ConnInfoError), #[error(transparent)] Postgres(#[from] PostgresError), #[error(transparent)] JsonConversion(#[from] JsonConversionError), #[error(transparent)] SubzeroCore(#[from] SubzeroCoreError), #[error("schema is too large")] SchemaTooLarge, } impl ReportableError for RestError { fn get_error_kind(&self) -> ErrorKind { match self { RestError::ReadPayload(e) => e.get_error_kind(), RestError::ConnectCompute(e) => e.get_error_kind(), RestError::ConnInfo(e) => e.get_error_kind(), RestError::Postgres(_) => ErrorKind::Postgres, RestError::JsonConversion(_) => ErrorKind::Postgres, RestError::SubzeroCore(_) => ErrorKind::User, RestError::SchemaTooLarge => 
ErrorKind::User, } } } impl UserFacingError for RestError { fn to_string_client(&self) -> String { match self { RestError::ReadPayload(p) => p.to_string(), RestError::ConnectCompute(c) => c.to_string_client(), RestError::ConnInfo(c) => c.to_string_client(), RestError::SchemaTooLarge => self.to_string(), RestError::Postgres(p) => p.to_string_client(), RestError::JsonConversion(_) => "could not parse postgres response".to_string(), RestError::SubzeroCore(s) => { // TODO: this is a hack to get the message from the json body let json = s.json_body(); let default_message = "Unknown error".to_string(); json.get("message") .map_or(default_message.clone(), |m| match m { JsonValue::String(s) => s.clone(), _ => default_message, }) } } } } impl HttpCodeError for RestError { fn get_http_status_code(&self) -> StatusCode { match self { RestError::ReadPayload(e) => e.get_http_status_code(), RestError::ConnectCompute(h) => match h.get_error_kind() { ErrorKind::User => StatusCode::BAD_REQUEST, _ => StatusCode::INTERNAL_SERVER_ERROR, }, RestError::ConnInfo(_) => StatusCode::BAD_REQUEST, RestError::Postgres(e) => e.get_http_status_code(), RestError::JsonConversion(_) => StatusCode::INTERNAL_SERVER_ERROR, RestError::SchemaTooLarge => StatusCode::INTERNAL_SERVER_ERROR, RestError::SubzeroCore(e) => { let status = e.status_code(); StatusCode::from_u16(status).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR) } } } } // Helper functions for the rest broker fn fmt_env_query<'a>(env: &'a HashMap<&'a str, &'a str>) -> Snippet<'a> { "select " + if env.is_empty() { sql("null") } else { env.iter() .map(|(k, v)| { "set_config(" + param(k as &SqlParam) + ", " + param(v as &SqlParam) + ", true)" }) .join(",") } } // TODO: see about removing the need for cloning the values (inner things are &Cow already) fn to_sql_param(p: &Param) -> JsonValue { match p { SV(SingleVal(v, ..)) => JsonValue::String(v.to_string()), Str(v) => JsonValue::String((*v).to_string()), StrOwned(v) => 
JsonValue::String((*v).clone()), PL(Payload(v, ..)) => JsonValue::String(v.clone().into_owned()), LV(ListVal(v, ..)) => { if v.is_empty() { JsonValue::String(r"{}".to_string()) } else { JsonValue::String(format!( "{{\"{}\"}}", v.iter() .map(|e| e.replace('\\', "\\\\").replace('\"', "\\\"")) .collect::>() .join("\",\"") )) } } } } #[derive(serde::Serialize)] struct QueryData<'a> { query: Cow<'a, str>, params: Vec, } #[derive(serde::Serialize)] struct BatchQueryData<'a> { queries: Vec>, } async fn make_local_proxy_request( client: &mut http_conn_pool::Client, headers: impl IntoIterator, body: QueryData<'_>, max_len: usize, ) -> Result { let body_string = serde_json::to_string(&body) .map_err(|e| RestError::JsonConversion(JsonConversionError::ParseJsonError(e)))?; let response = make_raw_local_proxy_request(client, headers, body_string).await?; let response_status = response.status(); if response_status != StatusCode::OK { return Err(RestError::SubzeroCore(InternalError { message: "Failed to get endpoint schema".to_string(), })); } // Capture the response body let response_body = crate::http::read_body_with_limit(response.into_body(), max_len) .await .map_err(ReadPayloadError::from)?; // Parse the JSON response let response_json: S = serde_json::from_slice(&response_body) .map_err(|e| RestError::SubzeroCore(JsonDeserialize { source: e }))?; Ok(response_json) } async fn make_raw_local_proxy_request( client: &mut http_conn_pool::Client, headers: impl IntoIterator, body: String, ) -> Result, RestError> { let local_proxy_uri = ::http::Uri::from_static("http://proxy.local/sql"); let mut req = Request::builder().method(Method::POST).uri(local_proxy_uri); let req_headers = req.headers_mut().expect("failed to get headers"); // Add all provided headers to the request for (header_name, header_value) in headers { req_headers.insert(header_name, header_value.clone()); } let body_boxed = Full::new(Bytes::from(body)) .map_err(|never| match never {}) // Convert Infallible to 
hyper::Error .boxed(); let req = req.body(body_boxed).map_err(|_| { RestError::SubzeroCore(InternalError { message: "Failed to build request".to_string(), }) })?; // Send the request to the local proxy client .inner .inner .send_request(req) .await .map_err(LocalProxyConnError::from) .map_err(HttpConnError::from) .map_err(RestError::from) } pub(crate) async fn handle( config: &'static ProxyConfig, ctx: RequestContext, request: Request, backend: Arc, cancel: CancellationToken, ) -> Result>, ApiError> { let result = handle_inner(cancel, config, &ctx, request, backend).await; let response = match result { Ok(r) => { ctx.set_success(); // Handling the error response from local proxy here if r.status().is_server_error() { let status = r.status(); let body_bytes = r .collect() .await .map_err(|e| { ApiError::InternalServerError(anyhow::Error::msg(format!( "could not collect http body: {e}" ))) })? .to_bytes(); if let Ok(mut json_map) = serde_json::from_slice::>(&body_bytes) { let message = json_map.get("message"); if let Some(message) = message { let msg: String = match serde_json::from_str(message.get()) { Ok(msg) => msg, Err(_) => { "Unable to parse the response message from server".to_string() } }; error!("Error response from local_proxy: {status} {msg}"); json_map.retain(|key, _| !key.starts_with("neon:")); // remove all the neon-related keys let resp_json = serde_json::to_string(&json_map) .unwrap_or("failed to serialize the response message".to_string()); return json_response(status, resp_json); } } error!("Unable to parse the response message from local_proxy"); return json_response( status, json!({ "message": "Unable to parse the response message from server".to_string() }), ); } r } Err(e @ RestError::SubzeroCore(_)) => { let error_kind = e.get_error_kind(); ctx.set_error_kind(error_kind); tracing::info!( kind=error_kind.to_metric_label(), error=%e, msg="subzero core error", "forwarding error to user" ); let RestError::SubzeroCore(subzero_err) = e else { 
panic!("expected subzero core error") }; let json_body = subzero_err.json_body(); let status_code = StatusCode::from_u16(subzero_err.status_code()) .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR); json_response(status_code, json_body)? } Err(e) => { let error_kind = e.get_error_kind(); ctx.set_error_kind(error_kind); let message = e.to_string_client(); let status_code = e.get_http_status_code(); tracing::info!( kind=error_kind.to_metric_label(), error=%e, msg=message, "forwarding error to user" ); let (code, detail, hint) = match e { RestError::Postgres(e) => ( if e.code.starts_with("PT") { None } else { Some(e.code) }, e.detail, e.hint, ), _ => (None, None, None), }; json_response( status_code, json!({ "message": message, "code": code, "detail": detail, "hint": hint, }), )? } }; Ok(response) } async fn handle_inner( _cancel: CancellationToken, config: &'static ProxyConfig, ctx: &RequestContext, request: Request, backend: Arc, ) -> Result>, RestError> { let _requeset_gauge = Metrics::get() .proxy .connection_requests .guard(ctx.protocol()); info!( protocol = %ctx.protocol(), "handling interactive connection from client" ); // Read host from Host, then URI host as fallback // TODO: will this be a problem if behind a load balancer? // TODO: can we use the x-forwarded-host header? let host = request .headers() .get(HOST) .and_then(|v| v.to_str().ok()) .unwrap_or_else(|| request.uri().host().unwrap_or("")); // a valid path is /database/rest/v1/... so splitting should be ["", "database", "rest", "v1", ...] 
let database_name = request .uri() .path() .split('/') .nth(1) .ok_or(RestError::SubzeroCore(NotFound { target: request.uri().path().to_string(), }))?; // we always use the authenticator role to connect to the database let authenticator_role = "authenticator"; // Strip the hostname prefix from the host to get the database hostname let database_host = host.replace(&config.rest_config.hostname_prefix, ""); let connection_string = format!("postgresql://{authenticator_role}@{database_host}/{database_name}"); let conn_info = get_conn_info( &config.authentication_config, ctx, Some(&connection_string), request.headers(), )?; info!( user = conn_info.conn_info.user_info.user.as_str(), "credentials" ); match conn_info.auth { AuthData::Jwt(jwt) => { let api_prefix = format!("/{database_name}/rest/v1/"); handle_rest_inner( config, ctx, &api_prefix, request, &connection_string, conn_info.conn_info, jwt, backend, ) .await } AuthData::Password(_) => Err(RestError::ConnInfo(ConnInfoError::MissingCredentials( Credentials::BearerJwt, ))), } } fn apply_common_cors_headers( response: &mut Builder, request_headers: &HeaderMap, allowed_origins: Option<&Vec>, ) { let request_origin = request_headers .get(ORIGIN) .map(|v| v.to_str().unwrap_or("")); let response_allow_origin = match (request_origin, allowed_origins) { (Some(or), Some(allowed_origins)) => { if allowed_origins.iter().any(|o| o == or) { Some(HeaderValue::from_str(or).unwrap_or(HEADER_VALUE_ALLOW_ALL_ORIGINS)) } else { None } } (Some(_), None) => Some(HEADER_VALUE_ALLOW_ALL_ORIGINS), _ => None, }; if let Some(h) = response.headers_mut() { h.insert( ACCESS_CONTROL_EXPOSE_HEADERS, ACCESS_CONTROL_EXPOSE_HEADERS_VALUE, ); if let Some(origin) = response_allow_origin { h.insert(ACCESS_CONTROL_ALLOW_ORIGIN, origin); } } } #[allow(clippy::too_many_arguments)] async fn handle_rest_inner( config: &'static ProxyConfig, ctx: &RequestContext, api_prefix: &str, request: Request, connection_string: &str, conn_info: ConnInfo, jwt: String, 
backend: Arc, ) -> Result>, RestError> { let db_schema_cache = config .rest_config .db_schema_cache .as_ref() .ok_or(RestError::SubzeroCore(InternalError { message: "DB schema cache is not configured".to_string(), }))?; let endpoint_cache_key = conn_info .endpoint_cache_key() .ok_or(RestError::SubzeroCore(InternalError { message: "Failed to get endpoint cache key".to_string(), }))?; let (parts, originial_body) = request.into_parts(); // try and get the cached entry for this endpoint // it contains the api config and the introspected db schema let cached_entry = db_schema_cache.get_cached(&endpoint_cache_key); let allowed_origins = cached_entry .as_ref() .and_then(|arc| arc.0.server_cors_allowed_origins.as_ref()); let mut response = Response::builder(); apply_common_cors_headers(&mut response, &parts.headers, allowed_origins); // handle the OPTIONS request if parts.method == Method::OPTIONS { let allowed_headers = parts .headers .get(ACCESS_CONTROL_REQUEST_HEADERS) .and_then(|a| a.to_str().ok()) .filter(|v| !v.is_empty()) .map_or_else( || "Authorization".to_string(), |v| format!("{v}, Authorization"), ); return response .status(StatusCode::OK) .header( ACCESS_CONTROL_ALLOW_METHODS, ACCESS_CONTROL_ALLOW_METHODS_VALUE, ) .header(ACCESS_CONTROL_MAX_AGE, ACCESS_CONTROL_MAX_AGE_VALUE) .header( ACCESS_CONTROL_ALLOW_HEADERS, HeaderValue::from_str(&allowed_headers) .unwrap_or(ACCESS_CONTROL_ALLOW_HEADERS_VALUE), ) .header(ALLOW, ACCESS_CONTROL_ALLOW_METHODS_VALUE) .body(Empty::new().map_err(|x| match x {}).boxed()) .map_err(|e| { RestError::SubzeroCore(InternalError { message: e.to_string(), }) }); } // validate the jwt token let jwt_parsed = backend .authenticate_with_jwt(ctx, &conn_info.user_info, jwt) .await .map_err(HttpConnError::from)?; let auth_header = parts .headers .get(AUTHORIZATION) .ok_or(RestError::SubzeroCore(InternalError { message: "Authorization header is required".to_string(), }))?; let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?; 
let entry = match cached_entry { Some(e) => e, None => { // if not cached, get the remote entry (will run the introspection query) db_schema_cache .get_remote( &endpoint_cache_key, auth_header, connection_string, &mut client, ctx, config, ) .await? } }; let (api_config, db_schema_owned) = entry.as_ref(); let db_schema = db_schema_owned.borrow_schema(); let db_schemas = &api_config.db_schemas; // list of schemas available for the api let db_extra_search_path = &api_config.db_extra_search_path; // TODO: use this when we get a replacement for jsonpath_lib // let role_claim_key = &api_config.role_claim_key; // let role_claim_path = format!("${role_claim_key}"); let db_anon_role = &api_config.db_anon_role; let max_rows = api_config.db_max_rows.as_deref(); let db_allowed_select_functions = api_config .db_allowed_select_functions .iter() .map(|s| s.as_str()) .collect::>(); // extract the jwt claims (we'll need them later to set the role and env) let jwt_claims = match jwt_parsed.keys { ComputeCredentialKeys::JwtPayload(payload_bytes) => { // `payload_bytes` contains the raw JWT payload as Vec // You can deserialize it back to JSON or parse specific claims let payload: serde_json::Value = serde_json::from_slice(&payload_bytes) .map_err(|e| RestError::SubzeroCore(JsonDeserialize { source: e }))?; Some(payload) } ComputeCredentialKeys::AuthKeys(_) => None, }; // read the role from the jwt claims (and set it to the "anon" role if not present) let (role, authenticated) = match &jwt_claims { Some(claims) => match claims.get("role") { Some(JsonValue::String(r)) => (Some(r), true), _ => (db_anon_role.as_ref(), true), }, None => (db_anon_role.as_ref(), false), }; // do not allow unauthenticated requests when there is no anonymous role setup if let (None, false) = (role, authenticated) { return Err(RestError::SubzeroCore(JwtTokenInvalid { message: "unauthenticated requests not allowed".to_string(), })); } // start deconstructing the request because subzero core mostly works with 
&str let method = parts.method; let method_str = method.as_str(); let path = parts.uri.path_and_query().map_or("/", |pq| pq.as_str()); // this is actually the table name (or rpc/function_name) // TODO: rename this to something more descriptive let root = match parts.uri.path().strip_prefix(api_prefix) { Some(p) => Ok(p), None => Err(RestError::SubzeroCore(NotFound { target: parts.uri.path().to_string(), })), }?; // pick the current schema from the headers (or the first one from config) let schema_name = &DbSchema::pick_current_schema(db_schemas, method_str, &parts.headers)?; // add the content-profile header to the response let mut response_headers = vec![]; if db_schemas.len() > 1 { response_headers.push(("Content-Profile".to_string(), schema_name.clone())); } // parse the query string into a Vec<(&str, &str)> let query = match parts.uri.query() { Some(q) => form_urlencoded::parse(q.as_bytes()).collect(), None => vec![], }; let get: Vec<(&str, &str)> = query.iter().map(|(k, v)| (&**k, &**v)).collect(); // convert the headers map to a HashMap<&str, &str> let headers: HashMap<&str, &str> = parts .headers .iter() .map(|(k, v)| (k.as_str(), v.to_str().unwrap_or("__BAD_HEADER__"))) .collect(); let cookies = HashMap::new(); // TODO: add cookies // Read the request body (skip for GET requests) let body_as_string: Option = if method == Method::GET { None } else { let body_bytes = read_body_with_limit(originial_body, config.http_config.max_request_size_bytes) .await .map_err(ReadPayloadError::from)?; if body_bytes.is_empty() { None } else { Some(String::from_utf8_lossy(&body_bytes).into_owned()) } }; // parse the request into an ApiRequest struct let mut api_request = parse( schema_name, root, db_schema, method_str, path, get, body_as_string.as_deref(), headers, cookies, max_rows, ) .map_err(RestError::SubzeroCore)?; let role_str = match role { Some(r) => r, None => "", }; replace_select_star(db_schema, schema_name, role_str, &mut api_request.query)?; // TODO: this is not 
relevant when acting as PostgREST but will be useful // in the context of DBX where they need internal permissions // if !disable_internal_permissions { // check_privileges(db_schema, schema_name, role_str, &api_request)?; // } check_safe_functions(&api_request, &db_allowed_select_functions)?; // TODO: this is not relevant when acting as PostgREST but will be useful // in the context of DBX where they need internal permissions // if !disable_internal_permissions { // insert_policy_conditions(db_schema, schema_name, role_str, &mut api_request.query)?; // } let env_role = Some(role_str); // construct the env (passed in to the sql context as GUCs) let empty_json = "{}".to_string(); let headers_env = serde_json::to_string(&api_request.headers).unwrap_or(empty_json.clone()); let cookies_env = serde_json::to_string(&api_request.cookies).unwrap_or(empty_json.clone()); let get_env = serde_json::to_string(&api_request.get).unwrap_or(empty_json.clone()); let jwt_claims_env = jwt_claims .as_ref() .map(|v| serde_json::to_string(v).unwrap_or(empty_json.clone())) .unwrap_or(if let Some(r) = env_role { let claims: HashMap<&str, &str> = HashMap::from([("role", r)]); serde_json::to_string(&claims).unwrap_or(empty_json.clone()) } else { empty_json.clone() }); let mut search_path = vec![api_request.schema_name]; if let Some(extra) = &db_extra_search_path { search_path.extend(extra.iter().map(|s| s.as_str())); } let search_path_str = search_path .into_iter() .filter(|s| !s.is_empty()) .collect::>() .join(","); let mut env: HashMap<&str, &str> = HashMap::from([ ("request.method", api_request.method), ("request.path", api_request.path), ("request.headers", &headers_env), ("request.cookies", &cookies_env), ("request.get", &get_env), ("request.jwt.claims", &jwt_claims_env), ("search_path", &search_path_str), ]); if let Some(r) = env_role { env.insert("role", r); } // generate the sql statements let (env_statement, env_parameters, _) = generate(fmt_env_query(&env)); let (main_statement, 
main_parameters, _) = generate(fmt_main_query( db_schema, api_request.schema_name, &api_request, &env, )?); let mut headers = vec![ (&NEON_REQUEST_ID, uuid_to_header_value(ctx.session_id())), ( &CONN_STRING, HeaderValue::from_str(connection_string).expect("invalid connection string"), ), (&AUTHORIZATION, auth_header.clone()), ( &TXN_ISOLATION_LEVEL, HeaderValue::from_static("ReadCommitted"), ), (&ALLOW_POOL, HEADER_VALUE_TRUE), ]; if api_request.read_only { headers.push((&TXN_READ_ONLY, HEADER_VALUE_TRUE)); } // convert the parameters from subzero core representation to the local proxy repr. let req_body = serde_json::to_string(&BatchQueryData { queries: vec![ QueryData { query: env_statement.into(), params: env_parameters .iter() .map(|p| to_sql_param(&p.to_param())) .collect(), }, QueryData { query: main_statement.into(), params: main_parameters .iter() .map(|p| to_sql_param(&p.to_param())) .collect(), }, ], }) .map_err(|e| RestError::JsonConversion(JsonConversionError::ParseJsonError(e)))?; // todo: map body to count egress let _metrics = client.metrics(ctx); // FIXME: is everything in the context set correctly? // send the request to the local proxy let proxy_response = make_raw_local_proxy_request(&mut client, headers, req_body).await?; let (response_parts, body) = proxy_response.into_parts(); let max_response = config.http_config.max_response_size_bytes; let bytes = read_body_with_limit(body, max_response) .await .map_err(ReadPayloadError::from)?; // if the response status is greater than 399, then it is an error // FIXME: check if there are other error codes or shapes of the response if response_parts.status.as_u16() > 399 { // turn this postgres error from the json into PostgresError let postgres_error = serde_json::from_slice(&bytes) .map_err(|e| RestError::SubzeroCore(JsonDeserialize { source: e }))?; return Err(RestError::Postgres(postgres_error)); } #[derive(Deserialize)] struct QueryResults { /// we run two queries, so we want only two results. 
results: (EnvRows, MainRows), } /// `env_statement` returns nothing of interest to us #[derive(Deserialize)] struct EnvRows {} #[derive(Deserialize)] struct MainRows { /// `main_statement` only returns a single row. rows: [MainRow; 1], } #[derive(Deserialize)] struct MainRow { body: String, page_total: Option, total_result_set: Option, response_headers: Option, response_status: Option, } let results: QueryResults = serde_json::from_slice(&bytes) .map_err(|e| RestError::SubzeroCore(JsonDeserialize { source: e }))?; let QueryResults { results: (_, MainRows { rows: [row] }), } = results; // build the intermediate response object let api_response = ApiResponse { page_total: row.page_total.map_or(0, |v| v.parse::().unwrap_or(0)), total_result_set: row.total_result_set.map(|v| v.parse::().unwrap_or(0)), top_level_offset: 0, // FIXME: check why this is 0 response_headers: row.response_headers, response_status: row.response_status, body: row.body, }; // TODO: rollback the transaction if the page_total is not 1 and the accept_content_type is SingularJSON // we can not do this in the context of proxy for now // if api_request.accept_content_type == SingularJSON && api_response.page_total != 1 { // // rollback the transaction here // return Err(RestError::SubzeroCore(SingularityError { // count: api_response.page_total, // content_type: "application/vnd.pgrst.object+json".to_string(), // })); // } // TODO: rollback the transaction if the page_total is not 1 and the method is PUT // we can not do this in the context of proxy for now // if api_request.method == Method::PUT && api_response.page_total != 1 { // // Makes sure the querystring pk matches the payload pk // // e.g. PUT /items?id=eq.1 { "id" : 1, .. } is accepted, // // PUT /items?id=eq.14 { "id" : 2, .. } is rejected. 
// // If this condition is not satisfied then nothing is inserted, // // rollback the transaction here // return Err(RestError::SubzeroCore(PutMatchingPkError)); // } // create and return the response to the client // this section mostly deals with setting the right headers according to PostgREST specs let page_total = api_response.page_total; let total_result_set = api_response.total_result_set; let top_level_offset = api_response.top_level_offset; let response_content_type = match (&api_request.accept_content_type, &api_request.query.node) { (SingularJSON, _) | ( _, FunctionCall { returns_single: true, is_scalar: false, .. }, ) => SingularJSON, (TextCSV, _) => TextCSV, _ => ApplicationJSON, }; // check if the SQL env set some response headers (happens when we called a rpc function) if let Some(response_headers_str) = api_response.response_headers { let Ok(headers_json) = serde_json::from_str::>>(response_headers_str.as_str()) else { return Err(RestError::SubzeroCore(GucHeadersError)); }; response_headers.extend(headers_json.into_iter().flatten()); } // calculate and set the content range header let lower = top_level_offset as i64; let upper = top_level_offset as i64 + page_total as i64 - 1; let total = total_result_set.map(|t| t as i64); let content_range = match (&method, &api_request.query.node) { (&Method::POST, Insert { .. }) => content_range_header(1, 0, total), (&Method::DELETE, Delete { .. }) => content_range_header(1, upper, total), _ => content_range_header(lower, upper, total), }; response_headers.push(("Content-Range".to_string(), content_range)); // calculate the status code #[rustfmt::skip] let mut status = match (&method, &api_request.query.node, page_total, &api_request.preferences) { (&Method::POST, Insert { .. }, ..) => 201, (&Method::DELETE, Delete { .. }, _, Some(Preferences {representation: Some(Representation::Full),..}),) => 200, (&Method::DELETE, Delete { .. }, ..) => 204, (&Method::PATCH, Update { columns, .. 
}, 0, _) if !columns.is_empty() => 404, (&Method::PATCH, Update { .. }, _,Some(Preferences {representation: Some(Representation::Full),..}),) => 200, (&Method::PATCH, Update { .. }, ..) => 204, (&Method::PUT, Insert { .. },_,Some(Preferences {representation: Some(Representation::Full),..}),) => 200, (&Method::PUT, Insert { .. }, ..) => 204, _ => content_range_status(lower, upper, total), }; // add the preference-applied header if let Some(Preferences { resolution: Some(r), .. }) = api_request.preferences { response_headers.push(( "Preference-Applied".to_string(), match r { MergeDuplicates => "resolution=merge-duplicates".to_string(), IgnoreDuplicates => "resolution=ignore-duplicates".to_string(), }, )); } // check if the SQL env set some response status (happens when we called a rpc function) if let Some(response_status_str) = api_response.response_status { status = response_status_str .parse::() .map_err(|_| RestError::SubzeroCore(GucStatusError))?; } // set the content type header // TODO: move this to a subzero function // as_header_value(&self) -> Option<&str> let http_content_type = match response_content_type { SingularJSON => Ok("application/vnd.pgrst.object+json"), TextCSV => Ok("text/csv"), ApplicationJSON => Ok("application/json"), Other(t) => Err(RestError::SubzeroCore(ContentTypeError { message: format!("None of these Content-Types are available: {t}"), })), }?; // build the response body let response_body = Full::new(Bytes::from(api_response.body)) .map_err(|never| match never {}) .boxed(); // build the response response = response .status(StatusCode::from_u16(status).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR)) .header(CONTENT_TYPE, http_content_type); // Add all headers from response_headers vector for (header_name, header_value) in response_headers { response = response.header(header_name, header_value); } // add the body and return the response response.body(response_body).map_err(|_| { RestError::SubzeroCore(InternalError { message: "Failed to 
build response".to_string(), }) }) } ================================================ FILE: proxy/src/serverless/sql_over_http.rs ================================================ use std::pin::pin; use std::sync::Arc; use bytes::Bytes; use futures::future::{Either, select, try_join}; use futures::{StreamExt, TryFutureExt}; use http::Method; use http::header::AUTHORIZATION; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Full}; use http_utils::error::ApiError; use hyper::body::Incoming; use hyper::http::{HeaderName, HeaderValue}; use hyper::{Request, Response, StatusCode, header}; use indexmap::IndexMap; use postgres_client::error::{DbError, ErrorPosition, SqlState}; use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use serde_json::Value; use serde_json::value::RawValue; use tokio::time::{self, Instant}; use tokio_util::sync::CancellationToken; use tracing::{Level, debug, error, info}; use typed_json::json; use super::backend::{LocalProxyConnError, PoolingBackend}; use super::conn_pool::AuthData; use super::conn_pool_lib::{self, ConnInfo}; use super::error::{ConnInfoError, HttpCodeError, ReadPayloadError}; use super::http_util::{ ALLOW_POOL, ARRAY_MODE, CONN_STRING, NEON_REQUEST_ID, RAW_TEXT_OUTPUT, TXN_DEFERRABLE, TXN_ISOLATION_LEVEL, TXN_READ_ONLY, get_conn_info, json_response, uuid_to_header_value, }; use super::json::{JsonConversionError, json_to_pg_text, pg_text_row_to_json}; use crate::auth::backend::ComputeCredentialKeys; use crate::config::{HttpConfig, ProxyConfig}; use crate::context::RequestContext; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::http::read_body_with_limit; use crate::metrics::{HttpDirection, Metrics}; use crate::serverless::backend::HttpConnError; use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; use crate::util::run_until_cancelled; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] struct QueryData { query: 
String, #[serde(deserialize_with = "bytes_to_pg_text")] #[serde(default)] params: Vec>, #[serde(default)] array_mode: Option, } #[derive(serde::Deserialize)] struct BatchQueryData { queries: Vec, } #[derive(serde::Deserialize)] #[serde(untagged)] enum Payload { Single(QueryData), Batch(BatchQueryData), } pub(super) const HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true"); fn bytes_to_pg_text<'de, D>(deserializer: D) -> Result>, D::Error> where D: serde::de::Deserializer<'de>, { // TODO: consider avoiding the allocation here. let json: Vec = serde::de::Deserialize::deserialize(deserializer)?; Ok(json_to_pg_text(json)) } pub(crate) async fn handle( config: &'static ProxyConfig, ctx: RequestContext, request: Request, backend: Arc, cancel: CancellationToken, ) -> Result>, ApiError> { let result = handle_inner(cancel, config, &ctx, request, backend).await; let mut response = match result { Ok(r) => { ctx.set_success(); // Handling the error response from local proxy here if config.authentication_config.is_auth_broker && r.status().is_server_error() { let status = r.status(); let body_bytes = r .collect() .await .map_err(|e| { ApiError::InternalServerError(anyhow::Error::msg(format!( "could not collect http body: {e}" ))) })? 
.to_bytes(); if let Ok(mut json_map) = serde_json::from_slice::>(&body_bytes) { let message = json_map.get("message"); if let Some(message) = message { let msg: String = match serde_json::from_str(message.get()) { Ok(msg) => msg, Err(_) => { "Unable to parse the response message from server".to_string() } }; error!("Error response from local_proxy: {status} {msg}"); json_map.retain(|key, _| !key.starts_with("neon:")); // remove all the neon-related keys let resp_json = serde_json::to_string(&json_map) .unwrap_or("failed to serialize the response message".to_string()); return json_response(status, resp_json); } } error!("Unable to parse the response message from local_proxy"); return json_response( status, json!({ "message": "Unable to parse the response message from server".to_string() }), ); } r } Err(e @ SqlOverHttpError::Cancelled(_)) => { let error_kind = e.get_error_kind(); ctx.set_error_kind(error_kind); let message = "Query cancelled, connection was terminated"; tracing::info!( kind=error_kind.to_metric_label(), error=%e, msg=message, "forwarding error to user" ); json_response( StatusCode::BAD_REQUEST, json!({ "message": message, "code": SqlState::PROTOCOL_VIOLATION.code() }), )? 
} Err(e) => { let error_kind = e.get_error_kind(); ctx.set_error_kind(error_kind); let mut message = e.to_string_client(); let db_error = match &e { SqlOverHttpError::ConnectCompute(HttpConnError::PostgresConnectionError(e)) | SqlOverHttpError::Postgres(e) => e.as_db_error(), _ => None, }; fn get<'a, T: Default>(db: Option<&'a DbError>, x: impl FnOnce(&'a DbError) -> T) -> T { db.map(x).unwrap_or_default() } if let Some(db_error) = db_error { db_error.message().clone_into(&mut message); } let position = db_error.and_then(|db| db.position()); let (position, internal_position, internal_query) = match position { Some(ErrorPosition::Original(position)) => (Some(position.to_string()), None, None), Some(ErrorPosition::Internal { position, query }) => { (None, Some(position.to_string()), Some(query.clone())) } None => (None, None, None), }; let code = get(db_error, |db| db.code().code()); let severity = get(db_error, |db| db.severity()); let detail = get(db_error, |db| db.detail()); let hint = get(db_error, |db| db.hint()); let where_ = get(db_error, |db| db.where_()); let table = get(db_error, |db| db.table()); let column = get(db_error, |db| db.column()); let schema = get(db_error, |db| db.schema()); let datatype = get(db_error, |db| db.datatype()); let constraint = get(db_error, |db| db.constraint()); let file = get(db_error, |db| db.file()); let line = get(db_error, |db| db.line().map(|l| l.to_string())); let routine = get(db_error, |db| db.routine()); if db_error.is_some() && error_kind == ErrorKind::User { // this error contains too much info, and it's not an error we care about. 
if tracing::enabled!(Level::DEBUG) { debug!( kind=error_kind.to_metric_label(), error=%e, msg=message, "forwarding error to user" ); } else { info!( kind = error_kind.to_metric_label(), error = "bad query", "forwarding error to user" ); } } else { info!( kind=error_kind.to_metric_label(), error=%e, msg=message, "forwarding error to user" ); } json_response( e.get_http_status_code(), json!({ "message": message, "code": code, "detail": detail, "hint": hint, "position": position, "internalPosition": internal_position, "internalQuery": internal_query, "severity": severity, "where": where_, "table": table, "column": column, "schema": schema, "dataType": datatype, "constraint": constraint, "file": file, "line": line, "routine": routine, }), )? } }; response .headers_mut() .insert("Access-Control-Allow-Origin", HeaderValue::from_static("*")); Ok(response) } #[derive(Debug, thiserror::Error)] pub(crate) enum SqlOverHttpError { #[error("{0}")] ReadPayload(#[from] ReadPayloadError), #[error("{0}")] ConnectCompute(#[from] HttpConnError), #[error("{0}")] ConnInfo(#[from] ConnInfoError), #[error("response is too large (max is {0} bytes)")] ResponseTooLarge(usize), #[error("invalid isolation level")] InvalidIsolationLevel, /// for queries our customers choose to run #[error("{0}")] Postgres(#[source] postgres_client::Error), /// for queries we choose to run #[error("{0}")] InternalPostgres(#[source] postgres_client::Error), #[error("{0}")] JsonConversion(#[from] JsonConversionError), #[error("{0}")] Cancelled(SqlOverHttpCancel), } impl ReportableError for SqlOverHttpError { fn get_error_kind(&self) -> ErrorKind { match self { SqlOverHttpError::ReadPayload(e) => e.get_error_kind(), SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(), SqlOverHttpError::ConnInfo(e) => e.get_error_kind(), SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User, SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User, // customer initiated SQL errors. 
SqlOverHttpError::Postgres(p) => { if p.as_db_error().is_some() { ErrorKind::User } else { ErrorKind::Compute } } // proxy initiated SQL errors. SqlOverHttpError::InternalPostgres(p) => { if p.as_db_error().is_some() { ErrorKind::Service } else { ErrorKind::Compute } } // postgres returned a bad row format that we couldn't parse. SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres, SqlOverHttpError::Cancelled(c) => c.get_error_kind(), } } } impl UserFacingError for SqlOverHttpError { fn to_string_client(&self) -> String { match self { SqlOverHttpError::ReadPayload(p) => p.to_string(), SqlOverHttpError::ConnectCompute(c) => c.to_string_client(), SqlOverHttpError::ConnInfo(c) => c.to_string_client(), SqlOverHttpError::ResponseTooLarge(_) => self.to_string(), SqlOverHttpError::InvalidIsolationLevel => self.to_string(), SqlOverHttpError::Postgres(p) => p.to_string(), SqlOverHttpError::InternalPostgres(p) => p.to_string(), SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(), SqlOverHttpError::Cancelled(_) => self.to_string(), } } } impl HttpCodeError for SqlOverHttpError { fn get_http_status_code(&self) -> StatusCode { match self { SqlOverHttpError::ReadPayload(e) => e.get_http_status_code(), SqlOverHttpError::ConnectCompute(h) => match h.get_error_kind() { ErrorKind::User => StatusCode::BAD_REQUEST, _ => StatusCode::INTERNAL_SERVER_ERROR, }, SqlOverHttpError::ConnInfo(_) => StatusCode::BAD_REQUEST, SqlOverHttpError::ResponseTooLarge(_) => StatusCode::INSUFFICIENT_STORAGE, SqlOverHttpError::InvalidIsolationLevel => StatusCode::BAD_REQUEST, SqlOverHttpError::Postgres(_) => StatusCode::BAD_REQUEST, SqlOverHttpError::InternalPostgres(_) => StatusCode::INTERNAL_SERVER_ERROR, SqlOverHttpError::JsonConversion(_) => StatusCode::INTERNAL_SERVER_ERROR, SqlOverHttpError::Cancelled(_) => StatusCode::INTERNAL_SERVER_ERROR, } } } #[derive(Debug, thiserror::Error)] pub(crate) enum SqlOverHttpCancel { #[error("query was cancelled")] Postgres, 
#[error("query was cancelled while stuck trying to connect to the database")] Connect, } impl ReportableError for SqlOverHttpCancel { fn get_error_kind(&self) -> ErrorKind { match self { SqlOverHttpCancel::Postgres => ErrorKind::ClientDisconnect, SqlOverHttpCancel::Connect => ErrorKind::ClientDisconnect, } } } #[derive(Clone, Copy, Debug)] struct HttpHeaders { raw_output: bool, default_array_mode: bool, txn_isolation_level: Option, txn_read_only: bool, txn_deferrable: bool, } impl HttpHeaders { fn try_parse(headers: &hyper::http::HeaderMap) -> Result { // Determine the output options. Default behaviour is 'false'. Anything that is not // strictly 'true' assumed to be false. let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); // isolation level, read only and deferrable let txn_isolation_level = match headers.get(&TXN_ISOLATION_LEVEL) { Some(x) => Some( map_header_to_isolation_level(x).ok_or(SqlOverHttpError::InvalidIsolationLevel)?, ), None => None, }; let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE); let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE); Ok(Self { raw_output, default_array_mode, txn_isolation_level, txn_read_only, txn_deferrable, }) } } fn map_header_to_isolation_level(level: &HeaderValue) -> Option { match level.as_bytes() { b"Serializable" => Some(IsolationLevel::Serializable), b"ReadUncommitted" => Some(IsolationLevel::ReadUncommitted), b"ReadCommitted" => Some(IsolationLevel::ReadCommitted), b"RepeatableRead" => Some(IsolationLevel::RepeatableRead), _ => None, } } fn map_isolation_level_to_headers(level: IsolationLevel) -> Option { match level { IsolationLevel::ReadUncommitted => Some(HeaderValue::from_static("ReadUncommitted")), IsolationLevel::ReadCommitted => Some(HeaderValue::from_static("ReadCommitted")), IsolationLevel::RepeatableRead => Some(HeaderValue::from_static("RepeatableRead")), 
IsolationLevel::Serializable => Some(HeaderValue::from_static("Serializable")), _ => None, } } async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, ctx: &RequestContext, request: Request, backend: Arc, ) -> Result>, SqlOverHttpError> { let _requeset_gauge = Metrics::get() .proxy .connection_requests .guard(ctx.protocol()); info!( protocol = %ctx.protocol(), "handling interactive connection from client" ); let conn_info = get_conn_info(&config.authentication_config, ctx, None, request.headers())?; info!( user = conn_info.conn_info.user_info.user.as_str(), "credentials" ); match conn_info.auth { AuthData::Jwt(jwt) if config.authentication_config.is_auth_broker => { handle_auth_broker_inner(ctx, request, conn_info.conn_info, jwt, backend).await } auth => { handle_db_inner( cancel, config, ctx, request, conn_info.conn_info, auth, backend, ) .await } } } async fn handle_db_inner( cancel: CancellationToken, config: &'static ProxyConfig, ctx: &RequestContext, request: Request, conn_info: ConnInfo, auth: AuthData, backend: Arc, ) -> Result>, SqlOverHttpError> { // // Determine the destination and connection params // let headers = request.headers(); // Allow connection pooling only if explicitly requested // or if we have decided that http pool is no longer opt-in let allow_pool = !config.http_config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); let parsed_headers = HttpHeaders::try_parse(headers)?; let mut request_len = 0; let fetch_and_process_request = Box::pin( async { let body = read_body_with_limit( request.into_body(), config.http_config.max_request_size_bytes, ) .await?; request_len = body.len(); Metrics::get() .proxy .http_conn_content_length_bytes .observe(HttpDirection::Request, body.len() as f64); debug!(length = body.len(), "request payload read"); let payload: Payload = serde_json::from_slice(&body)?; Ok::(payload) // Adjust error type accordingly } .map_err(SqlOverHttpError::from), ); let 
authenticate_and_connect = Box::pin( async { let keys = match auth { AuthData::Password(pw) => backend .authenticate_with_password(ctx, &conn_info.user_info, &pw) .await .map_err(HttpConnError::AuthError)?, AuthData::Jwt(jwt) => backend .authenticate_with_jwt(ctx, &conn_info.user_info, jwt) .await .map_err(HttpConnError::AuthError)?, }; let client = match keys.keys { ComputeCredentialKeys::JwtPayload(payload) if backend.auth_backend.is_local_proxy() => { #[cfg(feature = "testing")] let disable_pg_session_jwt = config.disable_pg_session_jwt; #[cfg(not(feature = "testing"))] let disable_pg_session_jwt = false; let mut client = backend .connect_to_local_postgres(ctx, conn_info, disable_pg_session_jwt) .await?; if !disable_pg_session_jwt { let (cli_inner, _dsc) = client.client_inner(); cli_inner.set_jwt_session(&payload).await?; } Client::Local(client) } _ => { let client = backend .connect_to_compute(ctx, conn_info, keys, !allow_pool) .await?; Client::Remote(client) } }; // not strictly necessary to mark success here, // but it's just insurance for if we forget it somewhere else ctx.success(); Ok::<_, SqlOverHttpError>(client) } .map_err(SqlOverHttpError::from), ); let (payload, mut client) = match run_until_cancelled( // Run both operations in parallel try_join( pin!(fetch_and_process_request), pin!(authenticate_and_connect), ), &cancel, ) .await { Some(result) => result?, None => return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)), }; let mut response = Response::builder() .status(StatusCode::OK) .header(header::CONTENT_TYPE, "application/json"); // Now execute the query and return the result. let json_output = match payload { Payload::Single(stmt) => { stmt.process(&config.http_config, cancel, &mut client, parsed_headers) .await? 
} Payload::Batch(statements) => { if parsed_headers.txn_read_only { response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE); } if parsed_headers.txn_deferrable { response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE); } if let Some(txn_isolation_level) = parsed_headers .txn_isolation_level .and_then(map_isolation_level_to_headers) { response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); } statements .process(&config.http_config, cancel, &mut client, parsed_headers) .await? } }; let metrics = client.metrics(ctx); let len = json_output.len(); let response = response .body( Full::new(Bytes::from(json_output)) .map_err(|x| match x {}) .boxed(), ) // only fails if invalid status code or invalid header/values are given. // these are not user configurable so it cannot fail dynamically .expect("building response payload should not fail"); // count the egress bytes - we miss the TLS and header overhead but oh well... // moving this later in the stack is going to be a lot of effort and ehhhh metrics.record_egress(len as u64); metrics.record_ingress(request_len as u64); Metrics::get() .proxy .http_conn_content_length_bytes .observe(HttpDirection::Response, len as f64); Ok(response) } static HEADERS_TO_FORWARD: &[&HeaderName] = &[ &AUTHORIZATION, &CONN_STRING, &RAW_TEXT_OUTPUT, &ARRAY_MODE, &TXN_ISOLATION_LEVEL, &TXN_READ_ONLY, &TXN_DEFERRABLE, ]; async fn handle_auth_broker_inner( ctx: &RequestContext, request: Request, conn_info: ConnInfo, jwt: String, backend: Arc, ) -> Result>, SqlOverHttpError> { backend .authenticate_with_jwt(ctx, &conn_info.user_info, jwt) .await .map_err(HttpConnError::from)?; let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?; let local_proxy_uri = ::http::Uri::from_static("http://proxy.local/sql"); let (mut parts, body) = request.into_parts(); let mut req = Request::builder().method(Method::POST).uri(local_proxy_uri); // todo(conradludgate): maybe auth-broker should parse these 
and re-serialize // these instead just to ensure they remain normalised. for &h in HEADERS_TO_FORWARD { if let Some(hv) = parts.headers.remove(h) { req = req.header(h, hv); } } req = req.header(&NEON_REQUEST_ID, uuid_to_header_value(ctx.session_id())); let req = req .body(body.map_err(|e| e).boxed()) //TODO: is there a potential for a regression here? .expect("all headers and params received via hyper should be valid for request"); // todo: map body to count egress let _metrics = client.metrics(ctx); Ok(client .inner .inner .send_request(req) .await .map_err(LocalProxyConnError::from) .map_err(HttpConnError::from)? .map(|b| b.boxed())) } impl QueryData { async fn process( self, config: &'static HttpConfig, cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, ) -> Result { let (inner, mut discard) = client.inner(); let cancel_token = inner.cancel_token(); let mut json_buf = vec![]; let batch_result = match select( pin!(query_to_json( config, &mut *inner, self, json::ValueSer::new(&mut json_buf), parsed_headers )), pin!(cancel.cancelled()), ) .await { Either::Left((res, __not_yet_cancelled)) => res, Either::Right((_cancelled, query)) => { tracing::info!("cancelling query"); if let Err(err) = cancel_token.cancel_query(NoTls).await { tracing::warn!(?err, "could not cancel query"); } // wait for the query cancellation match time::timeout(time::Duration::from_millis(100), query).await { // query successed before it was cancelled. Ok(Ok(status)) => Ok(status), // query failed or was cancelled. 
Ok(Err(error)) => { let db_error = match &error { SqlOverHttpError::ConnectCompute( HttpConnError::PostgresConnectionError(e), ) | SqlOverHttpError::Postgres(e) => e.as_db_error(), _ => None, }; // if errored for some other reason, it might not be safe to return if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) { discard.discard(); } return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); } Err(_timeout) => { discard.discard(); return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); } } } }; match batch_result { // The query successfully completed. Ok(_) => { let json_output = String::from_utf8(json_buf).expect("json should be valid utf8"); Ok(json_output) } // The query failed with an error Err(e) => { discard.discard(); Err(e) } } } } impl BatchQueryData { async fn process( self, config: &'static HttpConfig, cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, ) -> Result { info!("starting transaction"); let (inner, mut discard) = client.inner(); let cancel_token = inner.cancel_token(); let mut builder = inner.build_transaction(); if let Some(isolation_level) = parsed_headers.txn_isolation_level { builder = builder.isolation_level(isolation_level); } if parsed_headers.txn_read_only { builder = builder.read_only(true); } if parsed_headers.txn_deferrable { builder = builder.deferrable(true); } let mut transaction = builder .start() .await .inspect_err(|_| { // if we cannot start a transaction, we should return immediately // and not return to the pool. 
connection is clearly broken discard.discard(); }) .map_err(SqlOverHttpError::Postgres)?; let json_output = match query_batch_to_json( config, cancel.child_token(), &mut transaction, self, parsed_headers, ) .await { Ok(json_output) => { info!("commit"); transaction .commit() .await .inspect_err(|_| { // if we cannot commit - for now don't return connection to pool // TODO: get a query status from the error discard.discard(); }) .map_err(SqlOverHttpError::Postgres)?; json_output } Err(SqlOverHttpError::Cancelled(_)) => { if let Err(err) = cancel_token.cancel_query(NoTls).await { tracing::warn!(?err, "could not cancel query"); } // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe. discard.discard(); return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); } Err(err) => { return Err(err); } }; Ok(json_output) } } async fn query_batch( config: &'static HttpConfig, cancel: CancellationToken, transaction: &mut Transaction<'_>, queries: BatchQueryData, parsed_headers: HttpHeaders, results: &mut json::ListSer<'_>, ) -> Result<(), SqlOverHttpError> { for stmt in queries.queries { let query = pin!(query_to_json( config, transaction, stmt, results.entry(), parsed_headers, )); let cancelled = pin!(cancel.cancelled()); let res = select(query, cancelled).await; match res { // TODO: maybe we should check that the transaction bit is set here Either::Left((Ok(_), _cancelled)) => {} Either::Left((Err(e), _cancelled)) => { return Err(e); } Either::Right((_cancelled, _)) => { return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); } } } Ok(()) } async fn query_batch_to_json( config: &'static HttpConfig, cancel: CancellationToken, tx: &mut Transaction<'_>, queries: BatchQueryData, headers: HttpHeaders, ) -> Result { let json_output = json::value_to_string!(|obj| json::value_as_object!(|obj| { let results = obj.key("results"); json::value_as_list!(|results| { query_batch(config, cancel, tx, queries, headers, 
results).await?;
        });
    }));

    Ok(json_output)
}

/// Run one statement on `client` and serialize its result into `output` as a
/// JSON object with the keys `fields`, `rows`, `command`, `rowCount` and
/// `rowAsArray`.
///
/// * `config` - supplies `max_response_size_bytes`, the cap checked while
///   rows are being serialized.
/// * `data` - the SQL text, its parameters, and an optional per-query
///   `array_mode` that overrides `parsed_headers.default_array_mode`.
/// * `parsed_headers` - request-level `raw_output` / `default_array_mode`.
///
/// Returns the `status` field read off the row stream after it was drained.
///
/// # Errors
/// * `SqlOverHttpError::Postgres` if issuing the query or reading a row fails.
/// * `SqlOverHttpError::ResponseTooLarge` if the serializer buffer has outgrown
///   `config.max_response_size_bytes`.
/// * whatever `pg_text_row_to_json` returns for a row it cannot convert.
async fn query_to_json(
    config: &'static HttpConfig,
    client: &mut T,
    data: QueryData,
    output: json::ValueSer<'_>,
    parsed_headers: HttpHeaders,
) -> Result {
    let query_start = Instant::now();

    let mut output = json::ObjectSer::new(output);
    let mut row_stream = client
        .query_raw_txt(&data.query, data.params)
        .await
        .map_err(SqlOverHttpError::Postgres)?;
    // Taken once `query_raw_txt` has returned; logged below as `acknowledgement`.
    let query_acknowledged = Instant::now();

    // "fields": one descriptor object per result column.
    let mut json_fields = output.key("fields").list();
    for c in row_stream.statement.columns() {
        let json_field = json_fields.entry();
        json::value_as_object!(|json_field| {
            json_field.entry("name", c.name());
            json_field.entry("dataTypeID", c.type_().oid());
            json_field.entry("tableID", c.table_oid());
            json_field.entry("columnID", c.column_id());
            json_field.entry("dataTypeSize", c.type_size());
            json_field.entry("dataTypeModifier", c.type_modifier());
            json_field.entry("format", "text");
        });
    }
    json_fields.finish();

    // A per-query `array_mode` wins over the request-level default.
    let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode);
    let raw_output = parsed_headers.raw_output;

    // Manually drain the stream into a vector to leave row_stream hanging
    // around to get a command tag. Also check that the response is not too
    // big.
    // NOTE(review): no vector is built here any more; each row is serialized
    // straight into the `rows` JSON list.
    let mut rows = 0;
    let mut json_rows = output.key("rows").list();
    while let Some(row) = row_stream.next().await {
        let row = row.map_err(SqlOverHttpError::Postgres)?;

        // we don't have a streaming response support yet so this is to prevent OOM
        // from a malicious query (eg a cross join)
        // The check runs before the current row is appended, so the row that
        // pushes the buffer over the limit is only rejected on the next iteration
        // (and the last row of a result is never checked).
        if json_rows.as_buffer().len() > config.max_response_size_bytes {
            return Err(SqlOverHttpError::ResponseTooLarge(
                config.max_response_size_bytes,
            ));
        }

        pg_text_row_to_json(json_rows.entry(), &row, raw_output, array_mode)?;
        rows += 1;

        // assumption: parsing pg text and converting to json takes CPU time.
        // let's assume it is slightly expensive, so we should consume some cooperative budget.
        // Especially considering that `RowStream::next` might be pulling from a batch
        // of rows and never hit the tokio mpsc for a long time (although unlikely).
        tokio::task::consume_budget().await;
    }
    json_rows.finish();

    let query_resp_end = Instant::now();

    // Read only after the stream is exhausted (see the loop above).
    let ready = row_stream.status;

    // grab the command tag and number of rows affected
    let command_tag = row_stream.command_tag.unwrap_or_default();
    let mut command_tag_split = command_tag.split(' ');
    let command_tag_name = command_tag_split.next().unwrap_or_default();
    let command_tag_count = if command_tag_name == "INSERT" {
        // INSERT returns OID first and then number of rows
        command_tag_split.nth(1)
    } else {
        // other commands return number of rows (if any)
        command_tag_split.next()
    }
    // A missing or non-numeric count becomes `None` rather than an error.
    .and_then(|s| s.parse::().ok());

    info!(
        rows,
        ?ready,
        command_tag,
        acknowledgement = ?(query_acknowledged - query_start),
        response = ?(query_resp_end - query_start),
        "finished executing query"
    );

    output.entry("command", command_tag_name);
    output.entry("rowCount", command_tag_count);
    output.entry("rowAsArray", array_mode);

    output.finish();
    Ok(ready)
}

/// A pooled connection; the two variants wrap the same pool client type and
/// only record which kind of pool it came from.
enum Client {
    Remote(conn_pool_lib::Client),
    Local(conn_pool_lib::Client),
}

/// Handle used to mark the matching [`Client`] as not reusable.
enum Discard<'a> {
    Remote(conn_pool_lib::Discard<'a, postgres_client::Client>),
    Local(conn_pool_lib::Discard<'a, postgres_client::Client>),
}

impl Client {
    /// Forward to the wrapped client's `metrics`.
    fn metrics(&self, ctx: &RequestContext) -> Arc {
        match self {
            Client::Remote(client) => client.metrics(ctx),
            Client::Local(local_client) => local_client.metrics(ctx),
        }
    }

    /// Borrow the underlying postgres client together with a [`Discard`]
    /// handle tagged with the same variant.
    fn inner(&mut self) -> (&mut postgres_client::Client, Discard<'_>) {
        match self {
            Client::Remote(client) => {
                let (c, d) = client.inner();
                (c, Discard::Remote(d))
            }
            Client::Local(local_client) => {
                let (c, d) = local_client.inner();
                (c, Discard::Local(d))
            }
        }
    }
}

impl Discard<'_> {
    /// Forward to the wrapped handle's `discard`.
    fn discard(&mut self) {
        match self {
            Discard::Remote(discard) => discard.discard(),
            Discard::Local(discard) => discard.discard(),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_payload() {
        let payload =
"{\"query\":\"SELECT * FROM users WHERE name = ?\",\"params\":[\"test\"],\"arrayMode\":true}"; let deserialized_payload: Payload = serde_json::from_str(payload).unwrap(); match deserialized_payload { Payload::Single(QueryData { query, params, array_mode, }) => { assert_eq!(query, "SELECT * FROM users WHERE name = ?"); assert_eq!(params, vec![Some(String::from("test"))]); assert!(array_mode.unwrap()); } Payload::Batch(_) => { panic!("deserialization failed: case with single query, one param, and array mode") } } let payload = "{\"queries\":[{\"query\":\"SELECT * FROM users0 WHERE name = ?\",\"params\":[\"test0\"], \"arrayMode\":false},{\"query\":\"SELECT * FROM users1 WHERE name = ?\",\"params\":[\"test1\"],\"arrayMode\":true}]}"; let deserialized_payload: Payload = serde_json::from_str(payload).unwrap(); match deserialized_payload { Payload::Batch(BatchQueryData { queries }) => { assert_eq!(queries.len(), 2); for (i, query) in queries.into_iter().enumerate() { assert_eq!( query.query, format!("SELECT * FROM users{i} WHERE name = ?") ); assert_eq!(query.params, vec![Some(format!("test{i}"))]); assert_eq!(query.array_mode.unwrap(), i > 0); } } Payload::Single(_) => panic!("deserialization failed: case with multiple queries"), } let payload = "{\"query\":\"SELECT 1\"}"; let deserialized_payload: Payload = serde_json::from_str(payload).unwrap(); match deserialized_payload { Payload::Single(QueryData { query, params, array_mode, }) => { assert_eq!(query, "SELECT 1"); assert_eq!(params, vec![]); assert!(array_mode.is_none()); } Payload::Batch(_) => panic!("deserialization failed: case with only one query"), } } } ================================================ FILE: proxy/src/serverless/websocket.rs ================================================ use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll, ready}; use anyhow::Context as _; use bytes::{Buf, BufMut, Bytes, BytesMut}; use framed_websockets::{Frame, OpCode, WebSocketServer}; use futures::{Sink, 
Stream};
use hyper::upgrade::OnUpgrade;
use hyper_util::rt::TokioIo;
use pin_project_lite::pin_project;
use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
use tracing::warn;

use crate::cancellation::CancellationHandler;
use crate::config::ProxyConfig;
use crate::context::RequestContext;
use crate::error::ReportableError;
use crate::metrics::Metrics;
use crate::pglb::{ClientMode, handle_connection};
use crate::proxy::ErrorSource;
use crate::rate_limiter::EndpointRateLimiter;

pin_project! {
    /// This is a wrapper around a [`WebSocketServer`] that
    /// implements [`AsyncRead`] and [`AsyncWrite`].
    ///
    /// Bytes written are sent as binary frames; bytes read are the payloads
    /// of incoming binary/continuation frames.
    pub(crate) struct WebSocketRw {
        #[pin]
        stream: WebSocketServer,
        // Unconsumed remainder of the most recently received payload.
        recv: Bytes,
        // Staging buffer for outgoing data; emptied by `split()` on every write.
        send: BytesMut,
    }
}

impl WebSocketRw {
    /// Wrap `stream` with empty receive and send buffers.
    pub(crate) fn new(stream: WebSocketServer) -> Self {
        Self {
            stream,
            recv: Bytes::new(),
            send: BytesMut::new(),
        }
    }
}

impl AsyncWrite for WebSocketRw {
    /// Queue `buf` as a single binary frame once the sink reports ready.
    ///
    /// Reports the whole of `buf` as written; the frame is only handed to the
    /// sink here, pushing it out is left to `poll_flush`.
    fn poll_write(
        self: Pin<&mut Self>,
        cx: &mut Context<'_>,
        buf: &[u8],
    ) -> Poll> {
        let this = self.project();
        let mut stream = this.stream;

        // Nothing is buffered until the sink can accept a frame.
        ready!(stream.as_mut().poll_ready(cx).map_err(io::Error::other))?;

        this.send.put(buf);
        // `split()` takes everything accumulated in `send`, leaving it empty.
        match stream.as_mut().start_send(Frame::binary(this.send.split())) {
            Ok(()) => Poll::Ready(Ok(buf.len())),
            Err(e) => Poll::Ready(Err(io::Error::other(e))),
        }
    }

    /// Flush the underlying sink, mapping its error into `io::Error`.
    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> {
        let stream = self.project().stream;
        stream.poll_flush(cx).map_err(io::Error::other)
    }

    /// Close the underlying sink, mapping its error into `io::Error`.
    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> {
        let stream = self.project().stream;
        stream.poll_close(cx).map_err(io::Error::other)
    }
}

impl AsyncRead for WebSocketRw {
    /// Copy as much of the buffered payload as fits into `buf`.
    ///
    /// Implemented on top of `poll_fill_buf` / `consume`; an empty chunk (EOF)
    /// results in zero bytes being copied.
    fn poll_read(
        mut self: Pin<&mut Self>,
        cx: &mut Context<'_>,
        buf: &mut ReadBuf<'_>,
    ) -> Poll> {
        let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
        let len = std::cmp::min(bytes.len(), buf.remaining());
        buf.put_slice(&bytes[..len]);
        self.consume(len);
        Poll::Ready(Ok(()))
    }
}

impl AsyncBufRead for WebSocketRw {
    /// Return the unread part of the current payload, pulling frames from the
    /// stream until one with a non-empty payload arrives.
    ///
    /// * Ping / Pong frames are skipped here.
    /// * A Text frame is an error.
    /// * A Close frame or the end of the stream yields an empty slice (EOF).
    fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> {
        // Please refer to poll_fill_buf's documentation.
        const EOF: Poll> = Poll::Ready(Ok(&[]));

        let mut this = self.project();
        loop {
            // Serve leftover bytes first; a new frame is only polled once
            // `recv` is fully consumed.
            if !this.recv.chunk().is_empty() {
                let chunk = (*this.recv).chunk();
                return Poll::Ready(Ok(chunk));
            }

            let res = ready!(this.stream.as_mut().poll_next(cx));
            match res.transpose().map_err(io::Error::other)? {
                Some(message) => match message.opcode {
                    OpCode::Ping => {}
                    OpCode::Pong => {}
                    OpCode::Text => {
                        // We expect to see only binary messages.
                        let error = "unexpected text message in the websocket";
                        warn!(length = message.payload.len(), error);
                        return Poll::Ready(Err(io::Error::other(error)));
                    }
                    OpCode::Binary | OpCode::Continuation => {
                        // Guaranteed by the emptiness check at the top of the loop.
                        debug_assert!(this.recv.is_empty());
                        // An empty payload simply sends us around the loop again.
                        *this.recv = message.payload.freeze();
                    }
                    OpCode::Close => return EOF,
                },
                None => return EOF,
            }
        }
    }

    /// Drop `amount` bytes from the front of the buffered payload.
    fn consume(self: Pin<&mut Self>, amount: usize) {
        self.project().recv.advance(amount);
    }
}

/// Complete the HTTP upgrade and run the regular connection handler over the
/// resulting websocket.
///
/// Flow, as written below:
/// 1. await the upgrade and wrap the IO in a `WebSocketServer`;
/// 2. take a `client_connections` gauge guard for `Protocol::Ws`;
/// 3. run `handle_connection` in `ClientMode::Websockets` over a [`WebSocketRw`];
/// 4. on `Ok(Some(p))` mark success, log the connect and run `p.proxy_pass()`,
///    tagging its error with "client" or "compute"; on `Ok(None)` just mark
///    success; on `Err` record the error kind on `ctx` and return the error.
#[allow(clippy::too_many_arguments)]
pub(crate) async fn serve_websocket(
    config: &'static ProxyConfig,
    auth_backend: &'static crate::auth::Backend<'static, ()>,
    ctx: RequestContext,
    websocket: OnUpgrade,
    cancellation_handler: Arc,
    endpoint_rate_limiter: Arc,
    hostname: Option,
    cancellations: tokio_util::task::task_tracker::TaskTracker,
) -> anyhow::Result<()> {
    let websocket = websocket.await?;
    let websocket = WebSocketServer::after_handshake(TokioIo::new(websocket));

    let conn_gauge = Metrics::get()
        .proxy
        .client_connections
        .guard(crate::metrics::Protocol::Ws);

    // The handler future is boxed rather than awaited inline.
    let res = Box::pin(handle_connection(
        config,
        auth_backend,
        &ctx,
        cancellation_handler,
        WebSocketRw::new(websocket),
        ClientMode::Websockets { hostname },
        endpoint_rate_limiter,
        conn_gauge,
        cancellations,
    ))
    .await;

    match res {
        Err(e) => {
            ctx.set_error_kind(e.get_error_kind());
            Err(e.into())
        }
        Ok(None) => {
            ctx.set_success();
            Ok(())
        }
        Ok(Some(p)) => {
            ctx.set_success();
            ctx.log_connect();
            match p.proxy_pass().await {
                Ok(()) => Ok(()),
                Err(ErrorSource::Client(err)) => Err(err).context("client"),
                Err(ErrorSource::Compute(err)) => Err(err).context("compute"),
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use std::pin::pin;

    use framed_websockets::WebSocketServer;
    use futures::{SinkExt, StreamExt};
    use tokio::io::{AsyncReadExt, AsyncWriteExt, duplex};
    use tokio::task::JoinSet;
    use tokio_tungstenite::WebSocketStream;
    use tokio_tungstenite::tungstenite::Message;
    use tokio_tungstenite::tungstenite::protocol::Role;

    use super::WebSocketRw;

    // Round trip over an in-memory duplex pipe: a tungstenite client on one
    // end, `WebSocketRw` on the other.
    #[tokio::test]
    async fn websocket_stream_wrapper_happy_path() {
        let (stream1, stream2) = duplex(1024);

        let mut js = JoinSet::new();

        js.spawn(async move {
            let mut client = WebSocketStream::from_raw_socket(stream1, Role::Client, None).await;

            client
                .send(Message::Binary(b"hello world".to_vec()))
                .await
                .unwrap();

            let message = client.next().await.unwrap().unwrap();
            assert_eq!(message, Message::Binary(b"websockets are cool".to_vec()));

            client.close(None).await.unwrap();
        });

        js.spawn(async move {
            let mut rw = pin!(WebSocketRw::new(WebSocketServer::after_handshake(stream2)));

            let mut buf = vec![0; 1024];
            let n = rw.read(&mut buf).await.unwrap();
            assert_eq!(&buf[..n], b"hello world");

            rw.write_all(b"websockets are cool").await.unwrap();
            rw.flush().await.unwrap();

            // After the client's close frame the reader must see EOF.
            let n = rw.read_to_end(&mut buf).await.unwrap();
            assert_eq!(n, 0);
        });

        js.join_next().await.unwrap().unwrap();
        js.join_next().await.unwrap().unwrap();
    }
}

================================================
FILE: proxy/src/signals.rs
================================================
use std::convert::Infallible;

use anyhow::bail;
use tokio_util::sync::CancellationToken;
use tracing::{info, warn};

use crate::metrics::{Metrics, ServiceInfo};

/// Handle unix signals appropriately.
///
/// Registers listeners for SIGHUP, SIGINT and SIGTERM, then loops forever:
///
/// * SIGHUP calls `refresh_config`;
/// * SIGINT sets the service info label to "terminating" and returns an
///   error right away;
/// * SIGTERM sets the same label and cancels `token`, but keeps looping so
///   that later signals are still handled.
///
/// The function therefore only ever returns with an error: either a listener
/// could not be registered, or SIGINT was received.
pub async fn handle(
    token: CancellationToken,
    mut refresh_config: F,
) -> anyhow::Result
where
    F: FnMut(),
{
    use tokio::signal::unix::{SignalKind, signal};

    let mut hangup = signal(SignalKind::hangup())?;
    let mut interrupt = signal(SignalKind::interrupt())?;
    let mut terminate = signal(SignalKind::terminate())?;

    loop {
        tokio::select! {
            // Hangup is commonly used for config reload.
            _ = hangup.recv() => {
                info!("received SIGHUP");
                refresh_config();
            }
            // Shut down the whole application.
            _ = interrupt.recv() => {
                warn!("received SIGINT, exiting immediately");
                Metrics::get().service.info.set_label(ServiceInfo::terminating());
                bail!("interrupted");
            }
            // Graceful path: only signal cancellation, do not return.
            _ = terminate.recv() => {
                warn!("received SIGTERM, shutting down once all existing connections have closed");
                Metrics::get().service.info.set_label(ServiceInfo::terminating());
                token.cancel();
            }
        }
    }
}

================================================
FILE: proxy/src/stream.rs
================================================
use std::pin::Pin;
use std::sync::Arc;
use std::{io, task};

use rustls::ServerConfig;
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, ReadBuf};
use tokio_rustls::server::TlsStream;

use crate::error::{ErrorKind, ReportableError, UserFacingError};
use crate::metrics::Metrics;
use crate::pqproto::{
    BeMessage, FE_PASSWORD_MESSAGE, FeStartupPacket, SQLSTATE_INTERNAL_ERROR, WriteBuf,
    read_message, read_startup,
};
use crate::tls::TlsServerEndPoint;

/// Stream wrapper which implements libpq's protocol.
///
/// NOTE: This object deliberately doesn't implement [`AsyncRead`]
/// or [`AsyncWrite`] to prevent subtle errors (e.g. trying
/// to pass random malformed bytes through the connection).
pub struct PqStream {
    // The wrapped transport.
    stream: S,
    // Scratch buffer lent to `read_message` when reading a client message.
    read: Vec,
    // Outgoing messages accumulate here until they are written to `stream`.
    write: WriteBuf,
}

impl PqStream {
    /// Borrow the wrapped transport.
    pub fn get_ref(&self) -> &S {
        &self.stream
    }

    /// Construct a new libpq protocol wrapper over a stream without the first startup message.
#[cfg(test)]
    pub fn new_skip_handshake(stream: S) -> Self {
        Self {
            stream,
            read: Vec::new(),
            write: WriteBuf::new(),
        }
    }
}

impl PqStream {
    /// Construct a new libpq protocol wrapper and read the first startup message.
    ///
    /// The startup packet is read from the bare stream before the wrapper
    /// (with empty read/write buffers) is built around it.
    ///
    /// This is not cancel safe.
    pub async fn parse_startup(mut stream: S) -> io::Result<(Self, FeStartupPacket)> {
        let startup = read_startup(&mut stream).await?;
        Ok((
            Self {
                stream,
                read: Vec::new(),
                write: WriteBuf::new(),
            },
            startup,
        ))
    }

    /// Tell the client that encryption is not supported.
    ///
    /// Sends the one-byte `N` reply, flushes, and then reads the startup
    /// packet the client sends next.
    ///
    /// This is not cancel safe
    pub async fn reject_encryption(&mut self) -> io::Result {
        // N for No.
        self.write.encryption(b'N');
        self.flush().await?;
        read_startup(&mut self.stream).await
    }
}

impl PqStream {
    /// Read a raw postgres packet, which will respect the max length requested.
    ///
    /// Returns the message body; fails with `io::Error::other` if the message
    /// tag differs from `tag`.
    ///
    /// This is not cancel safe.
    async fn read_raw_expect(&mut self, tag: u8, max: u32) -> io::Result<&mut [u8]> {
        let (actual_tag, msg) = read_message(&mut self.stream, &mut self.read, max).await?;
        if actual_tag != tag {
            return Err(io::Error::other(format!(
                "incorrect message tag, expected {:?}, got {:?}",
                tag as char, actual_tag as char,
            )));
        }
        Ok(msg)
    }

    /// Read a postgres password message, which will respect the max length requested.
    /// This is not cancel safe.
    pub async fn read_password_message(&mut self) -> io::Result<&mut [u8]> {
        // passwords are usually pretty short
        // and SASL SCRAM messages are no longer than 256 bytes in my testing
        // (a few hashes and random bytes, encoded into base64).
        const MAX_PASSWORD_LENGTH: u32 = 512;
        self.read_raw_expect(FE_PASSWORD_MESSAGE, MAX_PASSWORD_LENGTH)
            .await
    }
}

/// An error that has already been reported to the client (see
/// `PqStream::throw_error`), kept together with the [`ErrorKind`] it had.
#[derive(Debug)]
pub struct ReportedError {
    source: anyhow::Error,
    // Captured before the original error is converted into `anyhow::Error`.
    error_kind: ErrorKind,
}

impl ReportedError {
    /// Wrap `e`, remembering its error kind.
    pub fn new(e: impl UserFacingError + Into) -> Self {
        let error_kind = e.get_error_kind();
        Self {
            source: e.into(),
            error_kind,
        }
    }
}

impl std::fmt::Display for ReportedError {
    /// Displays exactly like the wrapped error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        self.source.fmt(f)
    }
}

impl std::error::Error for ReportedError {
    /// Forwards to the wrapped error's own `source()`: this type displays as
    /// the wrapped error, so it does not add a level to the chain.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        self.source.source()
    }
}

impl ReportableError for ReportedError {
    fn get_error_kind(&self) -> ErrorKind {
        self.error_kind
    }
}

impl PqStream {
    /// Tell the client that we are willing to accept SSL.
    ///
    /// Sends the one-byte `S` reply, flushes, and returns the raw stream so
    /// the caller can run the TLS handshake on it.
    ///
    /// This is not cancel safe
    pub async fn accept_tls(mut self) -> io::Result {
        // S for SSL.
        self.write.encryption(b'S');
        self.flush().await?;
        Ok(self.stream)
    }

    /// Assert that we are using direct TLS.
    ///
    /// Returns the raw stream without writing anything to the client.
    pub fn accept_direct_tls(self) -> S {
        self.stream
    }

    /// Write a raw message to the internal buffer.
    pub fn write_raw(&mut self, size_hint: usize, tag: u8, f: impl FnOnce(&mut Vec)) {
        self.write.write_raw(size_hint, tag, f);
    }

    /// Write the message into an internal buffer
    pub fn write_message(&mut self, message: BeMessage<'_>) {
        message.write_message(&mut self.write);
    }

    /// Write the buffer to the socket until we have some more space again.
    ///
    /// Keeps writing while more than 2048 bytes are pending; smaller amounts
    /// are left buffered. Does not flush the stream.
    pub async fn write_if_full(&mut self) -> io::Result<()> {
        while self.write.occupied_len() > 2048 {
            self.stream.write_buf(&mut self.write).await?;
        }

        Ok(())
    }

    /// Flush the output buffer into the underlying stream.
    ///
    /// Writes everything pending, resets the buffer, then flushes the stream.
    ///
    /// This is cancel safe.
    pub async fn flush(&mut self) -> io::Result<()> {
        self.stream.write_all_buf(&mut self.write).await?;
        self.write.reset();
        self.stream.flush().await?;
        Ok(())
    }

    /// Flush the output buffer into the underlying stream, then return that
    /// stream.
    ///
    /// NOTE(review): this previously carried the "cancel safe" line copied
    /// from `flush`. Because `self` is taken by value, dropping this future
    /// drops the stream and the write buffer with it - confirm what guarantee
    /// callers actually need.
    pub async fn flush_and_into_inner(mut self) -> io::Result {
        self.flush().await?;
        Ok(self.stream)
    }

    /// Write the error message to the client, then re-throw it.
    ///
    /// Trait [`UserFacingError`] acts as an allowlist for error types.
    /// If `ctx` is provided and has testodrome_id set, the client message is
    /// replaced by a JSON object with the keys `tag` (derived from the error
    /// kind), `msg` (the original client message) and `cold_start_info`.
    ///
    /// The error is always returned as a [`ReportedError`], even when writing
    /// to the client fails (that failure is only logged at debug level).
    pub(crate) async fn throw_error(
        &mut self,
        error: E,
        ctx: Option<&crate::context::RequestContext>,
    ) -> ReportedError
    where
        E: UserFacingError + Into,
    {
        let error_kind = error.get_error_kind();
        let msg = error.to_string_client();

        // Rate-limit and user errors are not logged here.
        if error_kind != ErrorKind::RateLimit && error_kind != ErrorKind::User {
            tracing::info!(
                kind = error_kind.to_metric_label(),
                msg,
                "forwarding error to user"
            );
        }

        // Declared out here so the JSON string outlives the `if` that fills it.
        let probe_msg;
        let mut msg = &*msg;
        if let Some(ctx) = ctx
            && ctx.get_testodrome_id().is_some()
        {
            let tag = match error_kind {
                ErrorKind::User => "client",
                ErrorKind::ClientDisconnect => "client",
                ErrorKind::RateLimit => "proxy",
                ErrorKind::ServiceRateLimit => "proxy",
                ErrorKind::Quota => "proxy",
                ErrorKind::Service => "proxy",
                ErrorKind::ControlPlane => "controlplane",
                ErrorKind::Postgres => "other",
                ErrorKind::Compute => "compute",
            };
            probe_msg = typed_json::json!({
                "tag": tag,
                "msg": msg,
                "cold_start_info": ctx.cold_start_info(),
            })
            .to_string();
            msg = &probe_msg;
        }

        // TODO: either preserve the error code from postgres, or assign error codes to proxy errors.
        self.write.write_error(msg, SQLSTATE_INTERNAL_ERROR);

        // Best effort: a failed write must not mask the error being reported.
        self.flush()
            .await
            .unwrap_or_else(|e| tracing::debug!("write_message failed: {e}"));

        ReportedError::new(error)
    }
}

/// Wrapper for upgrading raw streams into secure streams.
pub enum Stream {
    /// We always begin with a raw stream,
    /// which may then be upgraded into a secure stream.
    Raw { raw: S },
    Tls {
        /// We box [`TlsStream`] since it can be quite large.
        tls: Box>,
        /// Channel binding parameter
        tls_server_end_point: TlsServerEndPoint,
    },
}

impl Unpin for Stream {}

impl Stream {
    /// Construct a new instance from a raw stream.
pub fn from_raw(raw: S) -> Self { Self::Raw { raw } } /// Return SNI hostname when it's available. pub fn sni_hostname(&self) -> Option<&str> { match self { Stream::Raw { .. } => None, Stream::Tls { tls, .. } => tls.get_ref().1.server_name(), } } pub(crate) fn tls_server_end_point(&self) -> TlsServerEndPoint { match self { Stream::Raw { .. } => TlsServerEndPoint::Undefined, Stream::Tls { tls_server_end_point, .. } => *tls_server_end_point, } } } #[derive(Debug, Error)] #[error("Can't upgrade TLS stream")] pub enum StreamUpgradeError { #[error("Bad state reached: can't upgrade TLS stream")] AlreadyTls, #[error("Can't upgrade stream: IO error: {0}")] Io(#[from] io::Error), } impl Stream { /// If possible, upgrade raw stream into a secure TLS-based stream. pub async fn upgrade( self, cfg: Arc, record_handshake_error: bool, ) -> Result, StreamUpgradeError> { match self { Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg) .accept(raw) .await .inspect_err(|_| { if record_handshake_error { Metrics::get().proxy.tls_handshake_failures.inc(); } })?), Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), } } } impl AsyncRead for Stream { fn poll_read( mut self: Pin<&mut Self>, context: &mut task::Context<'_>, buf: &mut ReadBuf<'_>, ) -> task::Poll> { match &mut *self { Self::Raw { raw } => Pin::new(raw).poll_read(context, buf), Self::Tls { tls, .. } => Pin::new(tls).poll_read(context, buf), } } } impl AsyncWrite for Stream { fn poll_write( mut self: Pin<&mut Self>, context: &mut task::Context<'_>, buf: &[u8], ) -> task::Poll> { match &mut *self { Self::Raw { raw } => Pin::new(raw).poll_write(context, buf), Self::Tls { tls, .. } => Pin::new(tls).poll_write(context, buf), } } fn poll_flush( mut self: Pin<&mut Self>, context: &mut task::Context<'_>, ) -> task::Poll> { match &mut *self { Self::Raw { raw } => Pin::new(raw).poll_flush(context), Self::Tls { tls, .. 
} => Pin::new(tls).poll_flush(context), } } fn poll_shutdown( mut self: Pin<&mut Self>, context: &mut task::Context<'_>, ) -> task::Poll> { match &mut *self { Self::Raw { raw } => Pin::new(raw).poll_shutdown(context), Self::Tls { tls, .. } => Pin::new(tls).poll_shutdown(context), } } } ================================================ FILE: proxy/src/tls/client_config.rs ================================================ use std::env; use std::io::Cursor; use std::path::PathBuf; use std::sync::Arc; use anyhow::{Context, bail}; use rustls::crypto::ring; /// We use an internal certificate authority when establishing a TLS connection with compute. fn load_internal_certs(store: &mut rustls::RootCertStore) -> anyhow::Result<()> { let Some(ca_file) = env::var_os("NEON_INTERNAL_CA_FILE") else { return Ok(()); }; let ca_file = PathBuf::from(ca_file); let ca = std::fs::read(&ca_file) .with_context(|| format!("could not read CA from {}", ca_file.display()))?; for cert in rustls_pemfile::certs(&mut Cursor::new(&*ca)) { store .add(cert.context("could not parse internal CA certificate")?) .context("could not parse internal CA certificate")?; } Ok(()) } /// For console redirect proxy, we need to establish a connection to compute via pg-sni-router. /// pg-sni-router needs TLS and uses a Let's Encrypt signed certificate, so we /// load certificates from our native store. fn load_native_certs(store: &mut rustls::RootCertStore) -> anyhow::Result<()> { let der_certs = rustls_native_certs::load_native_certs(); if !der_certs.errors.is_empty() { bail!("could not parse certificates: {:?}", der_certs.errors); } store.add_parsable_certificates(der_certs.certs); Ok(()) } fn load_compute_certs() -> anyhow::Result> { let mut store = rustls::RootCertStore::empty(); load_native_certs(&mut store)?; load_internal_certs(&mut store)?; Ok(Arc::new(store)) } /// Loads the root certificates and constructs a client config suitable for connecting to the neon compute. /// This function is blocking. 
pub fn compute_client_config_with_root_certs() -> anyhow::Result { Ok( rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() .expect("ring should support the default protocol versions") .with_root_certificates(load_compute_certs()?) .with_no_client_auth(), ) } #[cfg(test)] pub fn compute_client_config_with_certs( certs: impl IntoIterator>, ) -> rustls::ClientConfig { let mut store = rustls::RootCertStore::empty(); store.add_parsable_certificates(certs); rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() .expect("ring should support the default protocol versions") .with_root_certificates(store) .with_no_client_auth() } ================================================ FILE: proxy/src/tls/mod.rs ================================================ pub mod client_config; pub mod postgres_rustls; pub mod server_config; use anyhow::Context; use base64::Engine as _; use base64::prelude::BASE64_STANDARD; use rustls::pki_types::CertificateDer; use sha2::{Digest, Sha256}; use tracing::{error, info}; use x509_cert::der::{Reader, SliceReader, oid}; /// pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql"; /// Channel binding parameter /// /// /// Description: The hash of the TLS server's certificate as it /// appears, octet for octet, in the server's Certificate message. Note /// that the Certificate message contains a certificate_list, in which /// the first element is the server's certificate. 
/// /// The hash function is to be selected as follows: /// /// * if the certificate's signatureAlgorithm uses a single hash /// function, and that hash function is either MD5 or SHA-1, then use SHA-256; /// /// * if the certificate's signatureAlgorithm uses a single hash /// function and that hash function neither MD5 nor SHA-1, then use /// the hash function associated with the certificate's /// signatureAlgorithm; /// /// * if the certificate's signatureAlgorithm uses no hash functions or /// uses multiple hash functions, then this channel binding type's /// channel bindings are undefined at this time (updates to is channel /// binding type may occur to address this issue if it ever arises). #[derive(Debug, Clone, Copy)] pub enum TlsServerEndPoint { Sha256([u8; 32]), Undefined, } impl TlsServerEndPoint { pub fn new(cert: &CertificateDer<'_>) -> anyhow::Result { const SHA256_OIDS: &[oid::ObjectIdentifier] = &[ // I'm explicitly not adding MD5 or SHA1 here... They're bad. oid::db::rfc5912::ECDSA_WITH_SHA_256, oid::db::rfc5912::SHA_256_WITH_RSA_ENCRYPTION, ]; let certificate = SliceReader::new(cert) .context("Failed to parse cerficiate")? 
.decode::() .context("Failed to parse cerficiate")?; let subject = certificate.tbs_certificate.subject; info!(%subject, "parsing TLS certificate"); let oid = certificate.signature_algorithm.oid; if SHA256_OIDS.contains(&oid) { let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into(); info!(%subject, tls_server_end_point = %BASE64_STANDARD.encode(tls_server_end_point), "determined channel binding"); Ok(Self::Sha256(tls_server_end_point)) } else { error!(%subject, "unknown channel binding"); Ok(Self::Undefined) } } pub fn supported(&self) -> bool { !matches!(self, TlsServerEndPoint::Undefined) } } ================================================ FILE: proxy/src/tls/postgres_rustls.rs ================================================ use std::convert::TryFrom; use std::sync::Arc; use postgres_client::tls::MakeTlsConnect; use rustls::pki_types::{InvalidDnsNameError, ServerName}; use tokio::io::{AsyncRead, AsyncWrite}; use crate::config::ComputeConfig; mod private { use std::future::Future; use std::io; use std::pin::Pin; use std::task::{Context, Poll}; use postgres_client::tls::{ChannelBinding, TlsConnect}; use rustls::pki_types::ServerName; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::TlsConnector; use tokio_rustls::client::TlsStream; use crate::tls::TlsServerEndPoint; pub struct TlsConnectFuture { inner: tokio_rustls::Connect, } impl Future for TlsConnectFuture where S: AsyncRead + AsyncWrite + Unpin, { type Output = io::Result>; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { Pin::new(&mut self.inner) .poll(cx) .map_ok(|s| RustlsStream(Box::new(s))) } } pub struct RustlsConnect(pub RustlsConnectData); pub struct RustlsConnectData { pub hostname: ServerName<'static>, pub connector: TlsConnector, } impl TlsConnect for RustlsConnect where S: AsyncRead + AsyncWrite + Unpin + Send + 'static, { type Stream = RustlsStream; type Error = io::Error; type Future = TlsConnectFuture; fn connect(self, stream: 
S) -> Self::Future { TlsConnectFuture { inner: self.0.connector.connect(self.0.hostname, stream), } } } pub struct RustlsStream(Box>); impl postgres_client::tls::TlsStream for RustlsStream where S: AsyncRead + AsyncWrite + Unpin, { fn channel_binding(&self) -> ChannelBinding { let (_, session) = self.0.get_ref(); match session.peer_certificates() { Some([cert, ..]) => TlsServerEndPoint::new(cert) .ok() .and_then(|cb| match cb { TlsServerEndPoint::Sha256(hash) => Some(hash), TlsServerEndPoint::Undefined => None, }) .map_or_else(ChannelBinding::none, |hash| { ChannelBinding::tls_server_end_point(hash.to_vec()) }), _ => ChannelBinding::none(), } } } impl AsyncRead for RustlsStream where S: AsyncRead + AsyncWrite + Unpin, { fn poll_read( mut self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &mut ReadBuf<'_>, ) -> Poll> { Pin::new(&mut self.0).poll_read(cx, buf) } } impl AsyncWrite for RustlsStream where S: AsyncRead + AsyncWrite + Unpin, { fn poll_write( mut self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &[u8], ) -> Poll> { Pin::new(&mut self.0).poll_write(cx, buf) } fn poll_flush( mut self: Pin<&mut Self>, cx: &mut Context<'_>, ) -> Poll> { Pin::new(&mut self.0).poll_flush(cx) } fn poll_shutdown( mut self: Pin<&mut Self>, cx: &mut Context<'_>, ) -> Poll> { Pin::new(&mut self.0).poll_shutdown(cx) } } } impl MakeTlsConnect for ComputeConfig where S: AsyncRead + AsyncWrite + Unpin + Send + 'static, { type Stream = private::RustlsStream; type TlsConnect = private::RustlsConnect; type Error = InvalidDnsNameError; fn make_tls_connect(&self, hostname: &str) -> Result { make_tls_connect(&self.tls, hostname) } } pub fn make_tls_connect( tls: &Arc, hostname: &str, ) -> Result { ServerName::try_from(hostname).map(|dns_name| { private::RustlsConnect(private::RustlsConnectData { hostname: dns_name.to_owned(), connector: tls.clone().into(), }) }) } ================================================ FILE: proxy/src/tls/server_config.rs ================================================ 
use std::collections::{HashMap, HashSet}; use std::path::Path; use std::sync::Arc; use anyhow::{Context, bail}; use itertools::Itertools; use rustls::crypto::ring::{self, sign}; use rustls::pki_types::{CertificateDer, PrivateKeyDer}; use rustls::sign::CertifiedKey; use x509_cert::der::{Reader, SliceReader}; use super::{PG_ALPN_PROTOCOL, TlsServerEndPoint}; pub struct TlsConfig { // unfortunate split since we cannot change the ALPN on demand. // pub http_config: Arc, pub pg_config: Arc, pub common_names: HashSet, pub cert_resolver: Arc, } /// Configure TLS for the main endpoint. pub fn configure_tls( key_path: &Path, cert_path: &Path, certs_dir: Option<&Path>, allow_tls_keylogfile: bool, ) -> anyhow::Result { // add default certificate let mut cert_resolver = CertResolver::parse_new(key_path, cert_path)?; // add extra certificates if let Some(certs_dir) = certs_dir { for entry in std::fs::read_dir(certs_dir)? { let entry = entry?; let path = entry.path(); if path.is_dir() { // file names aligned with default cert-manager names let key_path = path.join("tls.key"); let cert_path = path.join("tls.crt"); if key_path.exists() && cert_path.exists() { cert_resolver.add_cert_path(&key_path, &cert_path)?; } } } } let common_names = cert_resolver.get_common_names(); let cert_resolver = Arc::new(cert_resolver); // allow TLS 1.2 to be compatible with older client libraries let mut config = rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) .context("ring should support TLS1.2 and TLS1.3")? .with_no_client_auth() .with_cert_resolver(cert_resolver.clone()); config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; if allow_tls_keylogfile { // KeyLogFile will check for the SSLKEYLOGFILE environment variable. 
config.key_log = Arc::new(rustls::KeyLogFile::new()); } let mut http_config = config.clone(); let mut pg_config = config; http_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; pg_config.alpn_protocols = vec![b"postgresql".to_vec()]; Ok(TlsConfig { http_config: Arc::new(http_config), pg_config: Arc::new(pg_config), common_names, cert_resolver, }) } #[derive(Debug)] pub struct CertResolver { certs: HashMap, TlsServerEndPoint)>, default: (Arc, TlsServerEndPoint), } impl CertResolver { fn parse_new(key_path: &Path, cert_path: &Path) -> anyhow::Result { let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; Self::new(priv_key, cert_chain) } pub fn new( priv_key: PrivateKeyDer<'static>, cert_chain: Vec>, ) -> anyhow::Result { let (common_name, cert, tls_server_end_point) = process_key_cert(priv_key, cert_chain)?; let mut certs = HashMap::new(); let default = (cert.clone(), tls_server_end_point); certs.insert(common_name, (cert, tls_server_end_point)); Ok(Self { certs, default }) } fn add_cert_path(&mut self, key_path: &Path, cert_path: &Path) -> anyhow::Result<()> { let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; self.add_cert(priv_key, cert_chain) } fn add_cert( &mut self, priv_key: PrivateKeyDer<'static>, cert_chain: Vec>, ) -> anyhow::Result<()> { let (common_name, cert, tls_server_end_point) = process_key_cert(priv_key, cert_chain)?; self.certs.insert(common_name, (cert, tls_server_end_point)); Ok(()) } pub fn get_common_names(&self) -> HashSet { self.certs.keys().cloned().collect() } } fn parse_key_cert( key_path: &Path, cert_path: &Path, ) -> anyhow::Result<(PrivateKeyDer<'static>, Vec>)> { let priv_key = { let key_bytes = std::fs::read(key_path) .with_context(|| format!("Failed to read TLS keys at '{}'", key_path.display()))?; rustls_pemfile::private_key(&mut &key_bytes[..]) .with_context(|| format!("Failed to parse TLS keys at '{}'", key_path.display()))? 
.with_context(|| format!("Failed to parse TLS keys at '{}'", key_path.display()))?
    };

    let cert_chain_bytes = std::fs::read(cert_path).context(format!(
        "Failed to read TLS cert file at '{}.'",
        cert_path.display()
    ))?;

    let cert_chain = {
        rustls_pemfile::certs(&mut &cert_chain_bytes[..])
            .try_collect()
            .with_context(|| {
                format!(
                    "Failed to read TLS certificate chain from bytes from file at '{}'.",
                    cert_path.display()
                )
            })?
    };

    Ok((priv_key, cert_chain))
}

/// Turn a private key and certificate chain into what the resolver stores.
///
/// Returns `(common_name, certified_key, tls_server_end_point)`, where
/// * `common_name` is the subject of the first certificate with its
///   `CN=*.`, `CN=apiauth.` or `CN=` prefix removed, and
/// * `tls_server_end_point` is the channel-binding value computed from that
///   same first certificate.
///
/// # Errors
/// Fails if the key type is unsupported, if `cert_chain` is empty, if the
/// first certificate cannot be parsed, or if its subject does not start with
/// `CN=`.
fn process_key_cert(
    priv_key: PrivateKeyDer<'static>,
    cert_chain: Vec>,
) -> anyhow::Result<(String, Arc, TlsServerEndPoint)> {
    let key = sign::any_supported_type(&priv_key).context("invalid private key")?;

    // A PEM file without any certificate block yields an empty chain; report
    // that as an error instead of panicking on an out-of-bounds index.
    let first_cert = cert_chain
        .first()
        .context("TLS certificate chain is empty")?;
    let tls_server_end_point = TlsServerEndPoint::new(first_cert)?;

    let certificate = SliceReader::new(first_cert)
        .context("Failed to parse cerficiate")?
        .decode::()
        .context("Failed to parse cerficiate")?;

    let common_name = certificate.tbs_certificate.subject.to_string();

    // We need to get the canonical name for this certificate so we can match them against any domain names
    // seen within the proxy codebase.
    //
    // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI.
    // We need to remove the wildcard prefix for the purposes of certificate selection.
    //
    // auth-broker does not use SNI and instead uses the Neon-Connection-String header.
    // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String.
    //
    // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string
    // validation, so we can continue with any common name.
    let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") {
        s.to_string()
    } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") {
        s.to_string()
    } else if let Some(s) = common_name.strip_prefix("CN=") {
        s.to_string()
    } else {
        bail!("Failed to parse common name from certificate")
    };

    let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key));

    Ok((common_name, cert, tls_server_end_point))
}

impl rustls::server::ResolvesServerCert for CertResolver {
    /// rustls hook: pick a certificate from the SNI name in the ClientHello.
    ///
    /// Always returns `Some`, because the inherent `resolve` falls back to the
    /// default certificate.
    fn resolve(
        &self,
        client_hello: rustls::server::ClientHello<'_>,
    ) -> Option> {
        Some(self.resolve(client_hello.server_name()).0)
    }
}

impl CertResolver {
    /// Look up the certificate (and its channel-binding value) for
    /// `server_name`.
    ///
    /// The name is tried as-is and then with leading labels removed one at a
    /// time; if nothing matches, or no name was given, the default
    /// certificate is returned.
    pub fn resolve(
        &self,
        server_name: Option<&str>,
    ) -> (Arc, TlsServerEndPoint) {
        // loop here and cut off more and more subdomains until we find
        // a match to get a proper wildcard support. OTOH, we now do not
        // use nested domains, so keep this simple for now.
        //
        // With the current coding foo.com will match *.foo.com and that
        // repeats behavior of the old code.
        if let Some(mut sni_name) = server_name {
            loop {
                if let Some(cert) = self.certs.get(sni_name) {
                    return cert.clone();
                }
                if let Some((_, rest)) = sni_name.split_once('.') {
                    sni_name = rest;
                } else {
                    // The customer has some custom DNS mapping - just return
                    // a default certificate.
                    //
                    // This will error if the customer uses anything stronger
                    // than sslmode=require. That's a choice they can make.
                    return self.default.clone();
                }
            }
        } else {
            // No SNI, use the default certificate, otherwise we can't get to
            // options parameter which can be used to set endpoint name too.
            // That means that non-SNI flow will not work for CNAME domains in
            // verify-full mode.
// // If that will be a problem we can: // // a) Instead of multi-cert approach use single cert with extra // domains listed in Subject Alternative Name (SAN). // b) Deploy separate proxy instances for extra domains. self.default.clone() } } } ================================================ FILE: proxy/src/types.rs ================================================ use crate::intern::{EndpointIdInt, EndpointIdTag, InternId}; macro_rules! smol_str_wrapper { ($name:ident) => { #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] pub struct $name(smol_str::SmolStr); impl $name { #[allow(unused)] pub(crate) fn as_str(&self) -> &str { self.0.as_str() } } impl std::fmt::Display for $name { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.0.fmt(f) } } impl std::cmp::PartialEq for $name where smol_str::SmolStr: std::cmp::PartialEq, { fn eq(&self, other: &T) -> bool { self.0.eq(other) } } impl From for $name where smol_str::SmolStr: From, { fn from(x: T) -> Self { Self(x.into()) } } impl AsRef for $name { fn as_ref(&self) -> &str { self.0.as_ref() } } impl std::ops::Deref for $name { type Target = str; fn deref(&self) -> &str { &*self.0 } } impl<'de> serde::de::Deserialize<'de> for $name { fn deserialize>(d: D) -> Result { >::deserialize(d).map(Self) } } impl serde::Serialize for $name { fn serialize(&self, s: S) -> Result { self.0.serialize(s) } } }; } const POOLER_SUFFIX: &str = "-pooler"; pub(crate) const LOCAL_PROXY_SUFFIX: &str = "-local-proxy"; impl EndpointId { #[must_use] fn normalize_str(&self) -> &str { if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { stripped } else if let Some(stripped) = self.as_ref().strip_suffix(LOCAL_PROXY_SUFFIX) { stripped } else { self } } #[must_use] pub fn normalize(&self) -> Self { self.normalize_str().into() } #[must_use] pub fn normalize_intern(&self) -> EndpointIdInt { EndpointIdTag::get_interner().get_or_intern(self.normalize_str()) } } // 90% of role name strings are 20 
characters or less. smol_str_wrapper!(RoleName); // 50% of endpoint strings are 23 characters or less. smol_str_wrapper!(EndpointId); // 50% of branch strings are 23 characters or less. smol_str_wrapper!(BranchId); // 90% of project strings are 23 characters or less. smol_str_wrapper!(ProjectId); // 90% of account strings are 23 characters or less. smol_str_wrapper!(AccountId); // will usually equal endpoint ID smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); // postgres hostname, will likely be a port:ip addr smol_str_wrapper!(Host); ================================================ FILE: proxy/src/url.rs ================================================ use anyhow::bail; /// A [url](url::Url) type with additional guarantees. #[repr(transparent)] #[derive(Debug, Clone, PartialEq, Eq)] pub struct ApiUrl(url::Url); impl ApiUrl { /// Consume the wrapper and return inner [url](url::Url). pub(crate) fn into_inner(self) -> url::Url { self.0 } /// See [`url::Url::path_segments_mut`]. pub(crate) fn path_segments_mut(&mut self) -> url::PathSegmentsMut<'_> { // We've already verified that it works during construction. self.0.path_segments_mut().expect("bad API url") } } /// This instance imposes additional requirements on the url. impl std::str::FromStr for ApiUrl { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { let mut url: url::Url = s.parse()?; // Make sure that we can build upon this URL. if url.path_segments_mut().is_err() { bail!("bad API url provided"); } Ok(Self(url)) } } /// This instance is safe because it doesn't allow us to modify the object. 
impl std::ops::Deref for ApiUrl { type Target = url::Url; fn deref(&self) -> &Self::Target { &self.0 } } impl std::ops::DerefMut for ApiUrl { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } } impl std::fmt::Display for ApiUrl { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.0.fmt(f) } } #[cfg(test)] mod tests { use super::*; #[test] fn bad_url() { let url = "test:foobar"; url.parse::().expect("unexpected parsing failure"); url.parse::().expect_err("should not parse"); } #[test] fn good_url() { let url = "test://foobar"; let mut a = url.parse::().expect("unexpected parsing failure"); let mut b = url.parse::().expect("unexpected parsing failure"); a.path_segments_mut().unwrap().push("method"); b.path_segments_mut().push("method"); assert_eq!(a, b.into_inner()); } } ================================================ FILE: proxy/src/usage_metrics.rs ================================================ //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. 
use std::borrow::Cow; use std::convert::Infallible; use std::sync::Arc; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::time::Duration; use anyhow::{Context, bail}; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; use clashmap::ClashMap; use clashmap::mapref::entry::Entry; use consumption_metrics::{CHUNK_SIZE, Event, EventChunk, EventType, idempotency_key}; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; use smol_str::SmolStr; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::{error, info, instrument, trace, warn}; use utils::backoff; use uuid::{NoContext, Timestamp}; use crate::config::MetricCollectionConfig; use crate::context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}; use crate::http; use crate::intern::{BranchIdInt, EndpointIdInt}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; const HTTP_REPORTING_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); const HTTP_REPORTING_RETRY_DURATION: Duration = Duration::from_secs(60); /// Key that uniquely identifies the object, this metric describes. /// Currently, endpoint_id is enough, but this may change later, /// so keep it in a named struct. /// /// Both the proxy and the ingestion endpoint will live in the same region (or cell) /// so while the project-id is unique across regions the whole pipeline will work correctly /// because we enrich the event with project_id in the control-plane endpoint. 
#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
pub(crate) struct Ids {
    pub(crate) endpoint_id: EndpointIdInt,
    pub(crate) branch_id: BranchIdInt,
    // A missing id is written as "" and an empty string is read back as `None`
    // (see `none_as_empty_string`).
    #[serde(with = "none_as_empty_string")]
    pub(crate) private_link_id: Option,
}

/// Extra payload attached to each reported event: the counter's [`Ids`], flattened into the
/// same serialized object, plus the traffic direction the value refers to.
#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
struct Extra {
    #[serde(flatten)]
    ids: Ids,
    direction: TrafficDirection,
}

/// Serde `with` adapter that represents `None` as the empty string.
// NOTE(review): generic arguments in the signatures below appear to have been lost when this
// file was extracted; the code tokens are left exactly as found.
mod none_as_empty_string {
    use serde::Deserialize;
    use smol_str::SmolStr;

    /// Serialize the contained string, or `""` when the option is `None`.
    #[allow(clippy::ref_option)]
    pub fn serialize(t: &Option, s: S) -> Result {
        s.serialize_str(t.as_deref().unwrap_or(""))
    }

    /// Deserialize a string; an empty string becomes `None`, anything else `Some`.
    pub fn deserialize<'de, D: serde::Deserializer<'de>>(
        d: D,
    ) -> Result, D::Error> {
        let s = SmolStr::deserialize(d)?;
        if s.is_empty() { Ok(None) } else { Ok(Some(s)) }
    }
}

/// Direction of the reported traffic; serialized in lowercase (`"ingress"` / `"egress"`).
#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
#[serde(rename_all = "lowercase")]
pub(crate) enum TrafficDirection {
    Ingress,
    Egress,
}

pub(crate) trait MetricCounterRecorder {
    /// Record that some bytes were sent from the proxy to the client
    fn record_egress(&self, bytes: u64);
    /// Record that some bytes were sent from the client to the proxy
    fn record_ingress(&self, bytes: u64);
    /// Record that some connections were opened
    fn record_connection(&self, count: usize);
}

/// Read-side counterpart of [`MetricCounterRecorder`].
trait MetricCounterReporter {
    /// Read the current values. Requires exclusive access; the `MetricCounter`
    /// implementation in this file reads the counters without resetting them.
    fn get_metrics(&mut self) -> MetricsData;
    /// Take the current values. The `MetricCounter` implementation in this file swaps
    /// every counter back to zero while doing so.
    fn move_metrics(&self) -> MetricsData;
}

/// Atomic counters for one [`Ids`] key.
#[derive(Debug)]
pub(crate) struct MetricCounter {
    // Incremented by `record_egress`.
    transmitted: AtomicU64,
    // Incremented by `record_ingress`.
    received: AtomicU64,
    // Incremented by `record_connection`.
    opened_connections: AtomicUsize,
}

impl MetricCounterRecorder for MetricCounter {
    /// Record that some bytes were sent from the proxy to the client
    fn record_egress(&self, bytes: u64) {
        self.transmitted.fetch_add(bytes, Ordering::Relaxed);
    }

    /// Record that some bytes were sent from the client to the proxy
    fn record_ingress(&self, bytes: u64) {
        self.received.fetch_add(bytes, Ordering::Relaxed);
    }

    /// Record that some connections were opened
    fn record_connection(&self, count: usize) {
        self.opened_connections.fetch_add(count,
Ordering::Relaxed); } } impl MetricCounterReporter for MetricCounter { fn get_metrics(&mut self) -> MetricsData { MetricsData { received: *self.received.get_mut(), transmitted: *self.transmitted.get_mut(), connections: *self.opened_connections.get_mut(), } } fn move_metrics(&self) -> MetricsData { MetricsData { received: self.received.swap(0, Ordering::Relaxed), transmitted: self.transmitted.swap(0, Ordering::Relaxed), connections: self.opened_connections.swap(0, Ordering::Relaxed), } } } struct MetricsData { transmitted: u64, received: u64, connections: usize, } struct BytesSent { transmitted: u64, received: u64, } trait Clearable { /// extract the value that should be reported fn should_report(self: &Arc) -> Option; /// Determine whether the counter should be cleared from the global map. fn should_clear(self: &mut Arc) -> bool; } impl Clearable for C { fn should_report(self: &Arc) -> Option { // heuristic to see if the branch is still open // if a clone happens while we are observing, the heuristic will be incorrect. // // Worst case is that we won't report an event for this endpoint. // However, for the strong count to be 1 it must have occured that at one instant // all the endpoints were closed, so missing a report because the endpoints are closed is valid. 
let is_open = Arc::strong_count(self) > 1; // update cached metrics eagerly, even if they can't get sent // (to avoid sending the same metrics twice) // see the relevant discussion on why to do so even if the status is not success: // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956 let MetricsData { transmitted, received, connections, } = self.move_metrics(); // Our only requirement is that we report in every interval if there was an open connection // if there were no opened connections since, then we don't need to report if transmitted == 0 && received == 0 && !is_open && connections == 0 { None } else { Some(BytesSent { transmitted, received, }) } } fn should_clear(self: &mut Arc) -> bool { // we can't clear this entry if it's acquired elsewhere let Some(counter) = Arc::get_mut(self) else { return false; }; let MetricsData { transmitted, received, connections, } = counter.get_metrics(); // clear if there's no data to report transmitted == 0 && received == 0 && connections == 0 } } // endpoint and branch IDs are not user generated so we don't run the risk of hash-dos type FastHasher = std::hash::BuildHasherDefault; #[derive(Default)] pub(crate) struct Metrics { endpoints: ClashMap, FastHasher>, } impl Metrics { /// Register a new byte metrics counter for this endpoint pub(crate) fn register(&self, ids: Ids) -> Arc { let entry = if let Some(entry) = self.endpoints.get(&ids) { entry.clone() } else { self.endpoints .entry(ids) .or_insert_with(|| { Arc::new(MetricCounter { received: AtomicU64::new(0), transmitted: AtomicU64::new(0), opened_connections: AtomicUsize::new(0), }) }) .clone() }; entry.record_connection(1); entry } } pub(crate) static USAGE_METRICS: Lazy = Lazy::new(Metrics::default); pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result { info!("metrics collector config: {config:?}"); scopeguard::defer! 
{ info!("metrics collector has shut down"); } let http_client = http::new_client_with_timeout( HTTP_REPORTING_REQUEST_TIMEOUT, HTTP_REPORTING_RETRY_DURATION, ); let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); // Even if the remote storage is not configured, we still want to clear the metrics. let storage = if let Some(config) = config .backup_metric_collection_config .remote_storage_config .as_ref() { Some( GenericRemoteStorage::from_config(config) .await .context("remote storage init")?, ) } else { None }; let mut prev = Utc::now(); let mut ticker = tokio::time::interval(config.interval); loop { ticker.tick().await; let now = Utc::now(); collect_metrics_iteration( &USAGE_METRICS.endpoints, &http_client, &config.endpoint, storage.as_ref(), config.backup_metric_collection_config.chunk_size, &hostname, prev, now, ) .await; prev = now; } } fn collect_and_clear_metrics( endpoints: &ClashMap, FastHasher>, ) -> Vec<(Ids, BytesSent)> { let mut metrics_to_clear = Vec::new(); let metrics_to_send: Vec<(Ids, BytesSent)> = endpoints .iter() .filter_map(|counter| { let key = counter.key().clone(); let Some(value) = counter.should_report() else { metrics_to_clear.push(key); return None; }; Some((key, value)) }) .collect(); for metric in metrics_to_clear { match endpoints.entry(metric) { Entry::Occupied(mut counter) => { if counter.get_mut().should_clear() { counter.remove_entry(); } } Entry::Vacant(_) => {} } } metrics_to_send } fn create_event_chunks<'a>( metrics_to_send: &'a [(Ids, BytesSent)], hostname: &'a str, prev: DateTime, now: DateTime, chunk_size: usize, ) -> impl Iterator>> + 'a { metrics_to_send .chunks(chunk_size) .map(move |chunk| EventChunk { events: chunk .iter() .flat_map(|(ids, bytes)| { [ Event { kind: EventType::Incremental { start_time: prev, stop_time: now, }, metric: PROXY_IO_BYTES_PER_CLIENT, idempotency_key: idempotency_key(hostname), value: bytes.transmitted, extra: Extra { ids: ids.clone(), direction: 
TrafficDirection::Egress, }, }, Event { kind: EventType::Incremental { start_time: prev, stop_time: now, }, metric: PROXY_IO_BYTES_PER_CLIENT, idempotency_key: idempotency_key(hostname), value: bytes.received, extra: Extra { ids: ids.clone(), direction: TrafficDirection::Ingress, }, }, ] }) .collect(), }) } #[expect(clippy::too_many_arguments)] #[instrument(skip_all)] async fn collect_metrics_iteration( endpoints: &ClashMap, FastHasher>, client: &http::ClientWithMiddleware, metric_collection_endpoint: &reqwest::Url, storage: Option<&GenericRemoteStorage>, outer_chunk_size: usize, hostname: &str, prev: DateTime, now: DateTime, ) { info!( "starting collect_metrics_iteration. metric_collection_endpoint: {}", metric_collection_endpoint ); let metrics_to_send = collect_and_clear_metrics(endpoints); if metrics_to_send.is_empty() { trace!("no new metrics to send"); } let cancel = CancellationToken::new(); let path_prefix = create_remote_path_prefix(now); // Send metrics. for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, outer_chunk_size) { tokio::join!( upload_main_events_chunked(client, metric_collection_endpoint, &chunk, CHUNK_SIZE), async { if let Err(e) = upload_backup_events(storage, &chunk, &path_prefix, &cancel).await { error!("failed to upload consumption events to remote storage: {e:?}"); } } ); } } fn create_remote_path_prefix(now: DateTime) -> String { format!( "year={year:04}/month={month:02}/day={day:02}/hour={hour:02}/{hour:02}:{minute:02}:{second:02}Z", year = now.year(), month = now.month(), day = now.day(), hour = now.hour(), minute = now.minute(), second = now.second(), ) } async fn upload_main_events_chunked( client: &http::ClientWithMiddleware, metric_collection_endpoint: &reqwest::Url, chunk: &EventChunk<'_, Event>, subchunk_size: usize, ) { // Split into smaller chunks to avoid exceeding the max request size for subchunk in chunk.events.chunks(subchunk_size).map(|c| EventChunk { events: Cow::Borrowed(c), }) { let res = client 
.post(metric_collection_endpoint.clone()) .json(&subchunk) .send() .await; let res = match res { Ok(x) => x, Err(err) => { // TODO: retry? error!("failed to send metrics: {:?}", err); continue; } }; if !res.status().is_success() { error!("metrics endpoint refused the sent metrics: {:?}", res); for metric in subchunk.events.iter().filter(|e| e.value > (1u64 << 40)) { // Report if the metric value is suspiciously large warn!("potentially abnormal metric value: {:?}", metric); } } } } async fn upload_backup_events( storage: Option<&GenericRemoteStorage>, chunk: &EventChunk<'_, Event>, path_prefix: &str, cancel: &CancellationToken, ) -> anyhow::Result<()> { let Some(storage) = storage else { warn!("no remote storage configured"); return Ok(()); }; let real_now = Utc::now(); let id = uuid::Uuid::new_v7(Timestamp::from_unix( NoContext, real_now.second().into(), real_now.nanosecond(), )); let path = format!("{path_prefix}_{id}.ndjson.gz"); let remote_path = match RemotePath::from_string(&path) { Ok(remote_path) => remote_path, Err(e) => { bail!("failed to create remote path from str {path}: {:?}", e); } }; // TODO: This is async compression from Vec to Vec. Rewrite as byte stream. // Use sync compression in blocking threadpool. 
let mut encoder = GzipEncoder::new(Vec::new()); for event in chunk.events.iter() { let data = serde_json::to_vec(event).context("serialize metrics")?; encoder.write_all(&data).await.context("compress metrics")?; encoder.write_all(b"\n").await.context("compress metrics")?; } encoder.shutdown().await.context("compress metrics")?; let compressed_data: Bytes = encoder.get_ref().clone().into(); backoff::retry( || async { let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone()))); storage .upload(stream, compressed_data.len(), &remote_path, None, cancel) .await }, TimeoutOrCancel::caused_by_cancel, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_UPLOAD_MAX_RETRIES, "usage_metrics_upload", cancel, ) .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) .with_context(|| format!("usage_metrics_upload: path={remote_path}"))?; Ok(()) } #[cfg(test)] mod tests { use std::fs; use std::io::{BufRead, BufReader}; use std::sync::{Arc, Mutex}; use anyhow::Error; use camino_tempfile::tempdir; use chrono::Utc; use consumption_metrics::{Event, EventChunk}; use http_body_util::BodyExt; use hyper::body::Incoming; use hyper::server::conn::http1; use hyper::service::service_fn; use hyper::{Request, Response}; use hyper_util::rt::TokioIo; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; use tokio::net::TcpListener; use url::Url; use super::*; use crate::http; use crate::types::{BranchId, EndpointId}; #[tokio::test] async fn metrics() { type Report = EventChunk<'static, Event>; let reports: Arc>> = Arc::default(); let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); let addr = listener.local_addr().unwrap(); tokio::spawn({ let reports = reports.clone(); async move { loop { if let Ok((stream, _addr)) = listener.accept().await { let reports = reports.clone(); http1::Builder::new() .serve_connection( TokioIo::new(stream), service_fn(move |req: Request| { let reports = reports.clone(); async move { let bytes = 
req.into_body().collect().await?.to_bytes(); let events = serde_json::from_slice(&bytes)?; reports.lock().unwrap().push(events); Ok::<_, Error>(Response::new(String::new())) } }), ) .await .unwrap(); } } } }); let metrics = Metrics::default(); let client = http::new_client(); let endpoint = Url::parse(&format!("http://{addr}")).unwrap(); let now = Utc::now(); let storage_test_dir = tempdir().unwrap(); let local_fs_path = storage_test_dir.path().join("usage_metrics"); fs::create_dir_all(&local_fs_path).unwrap(); let storage = GenericRemoteStorage::from_config(&RemoteStorageConfig { storage: RemoteStorageKind::LocalFs { local_path: local_fs_path.clone(), }, timeout: Duration::from_secs(10), small_timeout: Duration::from_secs(1), }) .await .unwrap(); let mut pushed_chunks: Vec = Vec::new(); let mut stored_chunks: Vec = Vec::new(); // no counters have been registered collect_metrics_iteration( &metrics.endpoints, &client, &endpoint, Some(&storage), 1000, "foo", now, now, ) .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert!(r.is_empty()); // register a new counter let counter = metrics.register(Ids { endpoint_id: (&EndpointId::from("e1")).into(), branch_id: (&BranchId::from("b1")).into(), private_link_id: None, }); // the counter should be observed despite 0 egress collect_metrics_iteration( &metrics.endpoints, &client, &endpoint, Some(&storage), 1000, "foo", now, now, ) .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 2); assert_eq!(r[0].events[0].value, 0); assert_eq!(r[0].events[0].extra.direction, TrafficDirection::Egress); assert_eq!(r[0].events[1].value, 0); assert_eq!(r[0].events[1].extra.direction, TrafficDirection::Ingress); pushed_chunks.extend(r); // record egress counter.record_egress(1); // record ingress counter.record_ingress(2); // egress should be observered collect_metrics_iteration( &metrics.endpoints, &client, &endpoint, Some(&storage), 1000, "foo", now, now, ) 
.await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 2); assert_eq!(r[0].events[0].value, 1); assert_eq!(r[0].events[0].extra.direction, TrafficDirection::Egress); assert_eq!(r[0].events[1].value, 2); assert_eq!(r[0].events[1].extra.direction, TrafficDirection::Ingress); pushed_chunks.extend(r); // release counter drop(counter); // we do not observe the counter collect_metrics_iteration( &metrics.endpoints, &client, &endpoint, Some(&storage), 1000, "foo", now, now, ) .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert!(r.is_empty()); // counter is unregistered assert!(metrics.endpoints.is_empty()); let path_prefix = create_remote_path_prefix(now); for entry in walkdir::WalkDir::new(&local_fs_path) .into_iter() .filter_map(|e| e.ok()) { let path = local_fs_path.join(&path_prefix).to_string(); if entry.path().to_str().unwrap().starts_with(&path) { let file = fs::File::open(entry.into_path()).unwrap(); let decoder = flate2::bufread::GzDecoder::new(BufReader::new(file)); let reader = BufReader::new(decoder); let mut events: Vec> = Vec::new(); for line in reader.lines() { let line = line.unwrap(); let event: Event = serde_json::from_str(&line).unwrap(); events.push(event); } let report = Report { events: Cow::Owned(events), }; stored_chunks.push(report); } } storage_test_dir.close().ok(); // sort by first event's idempotency key because the order of files is nondeterministic pushed_chunks.sort_by_cached_key(|c| c.events[0].idempotency_key.clone()); stored_chunks.sort_by_cached_key(|c| c.events[0].idempotency_key.clone()); assert_eq!(pushed_chunks, stored_chunks); } } ================================================ FILE: proxy/src/util.rs ================================================ use std::pin::pin; use futures::future::{Either, select}; use tokio_util::sync::CancellationToken; pub async fn run_until_cancelled( f: F, cancellation_token: &CancellationToken, ) -> Option { run_until(f, 
cancellation_token.cancelled()).await.ok() } /// Runs the future `f` unless interrupted by future `condition`. pub async fn run_until( f: F1, condition: F2, ) -> Result { match select(pin!(f), pin!(condition)).await { Either::Left((f1, _)) => Ok(f1), Either::Right((f2, _)) => Err(f2), } } pub fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result where T: for<'de2> serde::Deserialize<'de2>, D: serde::Deserializer<'de>, { use serde::Deserialize; let s = String::deserialize(deserializer)?; serde_json::from_str(&s).map_err(::custom) } ================================================ FILE: proxy/src/waiters.rs ================================================ use std::pin::Pin; use std::task; use hashbrown::HashMap; use parking_lot::Mutex; use pin_project_lite::pin_project; use thiserror::Error; use tokio::sync::oneshot; #[derive(Debug, Error)] pub(crate) enum RegisterError { #[error("Waiter `{0}` already registered")] Occupied(String), } #[derive(Debug, Error)] pub(crate) enum NotifyError { #[error("Notify failed: waiter `{0}` not registered")] NotFound(String), #[error("Notify failed: channel hangup")] Hangup, } #[derive(Debug, Error)] pub(crate) enum WaitError { #[error("Wait failed: channel hangup")] Hangup, } pub(crate) struct Waiters(pub(self) Mutex>>); impl Default for Waiters { fn default() -> Self { Waiters(Mutex::default()) } } impl Waiters { pub(crate) fn register(&self, key: String) -> Result, RegisterError> { let (tx, rx) = oneshot::channel(); self.0 .lock() .try_insert(key.clone(), tx) .map_err(|e| RegisterError::Occupied(e.entry.key().clone()))?; Ok(Waiter { receiver: rx, guard: DropKey { registry: self, key, }, }) } pub(crate) fn notify(&self, key: &str, value: T) -> Result<(), NotifyError> where T: Send + Sync, { let tx = self .0 .lock() .remove(key) .ok_or_else(|| NotifyError::NotFound(key.to_string()))?; tx.send(value).map_err(|_| NotifyError::Hangup) } } struct DropKey<'a, T> { key: String, registry: &'a Waiters, } impl Drop for DropKey<'_, 
T> { fn drop(&mut self) { self.registry.0.lock().remove(&self.key); } } pin_project! { pub(crate) struct Waiter<'a, T> { #[pin] receiver: oneshot::Receiver, guard: DropKey<'a, T>, } } impl std::future::Future for Waiter<'_, T> { type Output = Result; fn poll(self: Pin<&mut Self>, cx: &mut task::Context<'_>) -> task::Poll { self.project() .receiver .poll(cx) .map_err(|_| WaitError::Hangup) } } #[cfg(test)] mod tests { use std::sync::Arc; use super::*; #[tokio::test] async fn test_waiter() -> anyhow::Result<()> { let waiters = Arc::new(Waiters::default()); let key = "Key"; let waiter = waiters.register(key.to_owned())?; let waiters = Arc::clone(&waiters); let notifier = tokio::spawn(async move { waiters.notify(key, ())?; Ok(()) }); waiter.await?; notifier.await? } } ================================================ FILE: pyproject.toml ================================================ [tool.poetry] description = "" authors = [] package-mode = false [tool.poetry.dependencies] python = "^3.11" pytest = "^7.4.4" psycopg2-binary = "^2.9.10" typing-extensions = "^4.12.2" PyJWT = {version = "^2.1.0", extras = ["crypto"]} requests = "^2.32.4" pytest-xdist = "^3.3.1" asyncpg = "^0.30.0" aiopg = "^1.4.0" Jinja2 = "^3.1.6" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.20241019" boto3 = "^1.34.11" boto3-stubs = {extras = ["s3", "kms"], version = "^1.26.16"} moto = {extras = ["server"], version = "^5.0.6"} backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" pytest-timeout = "^2.3.1" Werkzeug = "^3.0.6" pytest-order = "^1.1.0" allure-pytest = "^2.13.5" pytest-asyncio = "^0.21.0" toml = "^0.10.2" psutil = "^5.9.4" types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" aiohttp = "3.12.14" pytest-rerunfailures = "^15.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" zstandard = "^0.23.0" httpx = {extras = ["http2"], version = "^0.26.0"} pytest-repeat = "^0.9.3" websockets = "^12.0" clickhouse-connect = 
"^0.7.16" kafka-python = "^2.0.2" jwcrypto = "^1.5.6" h2 = "^4.2.0" types-jwcrypto = "^1.5.0.20240925" pyyaml = "^6.0.2" types-pyyaml = "^6.0.12.20240917" testcontainers = "^4.9.0" # Install a release candidate of `jsonnet`, as it supports Python 3.13 jsonnet = "^0.21.0-rc2" requests-unixsocket = "^0.4.1" [tool.poetry.group.dev.dependencies] mypy = "==1.13.0" ruff = "^0.11.2" [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" [tool.mypy] exclude = [ "^vendor/", "^target/", "test_runner/performance/pgvector/loaddata.py", ] check_untyped_defs = true # Help mypy find imports when running against list of individual files. # Without this line it would behave differently when executed on the entire project. mypy_path = "$MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner:$MYPY_CONFIG_FILE_DIR/test_runner/stubs" disallow_incomplete_defs = false disallow_untyped_calls = false disallow_untyped_decorators = false disallow_untyped_defs = false strict = true [[tool.mypy.overrides]] module = [ "_jsonnet.*", "asyncpg.*", "pg8000.*", "allure.*", "allure_commons.*", "allure_pytest.*", "kafka.*", "testcontainers.*", ] ignore_missing_imports = true [tool.ruff] target-version = "py311" extend-exclude = [ "vendor/", "target/", "test_runner/stubs/", # Autogenerated by mypy's stubgen ] line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter [tool.ruff.lint] ignore = [ "E501", # Line too long, we don't want to be too strict about it ] select = [ "E", # pycodestyle "F", # Pyflakes "I", # isort "W", # pycodestyle "B", # bugbear "UP", # pyupgrade "TC", # flake8-type-checking ] ================================================ FILE: pytest.ini ================================================ [pytest] filterwarnings = error::pytest.PytestUnhandledThreadExceptionWarning error::UserWarning ignore:record_property is incompatible with junit_family:pytest.PytestWarning addopts = -m 'not remote_cluster' 
--ignore=test_runner/performance markers = remote_cluster testpaths = test_runner minversion = 6.0 log_format = %(asctime)s.%(msecs)03d %(levelname)s [%(filename)s:%(lineno)d] %(message)s log_date_format = %Y-%m-%d %H:%M:%S log_cli = true timeout = 300 ================================================ FILE: run_clippy.sh ================================================ #!/usr/bin/env bash set -euo pipefail # If you save this in your path under the name "cargo-zclippy" (or whatever # name you like), then you can run it as "cargo zclippy" from the shell prompt. # # If your text editor has rust-analyzer integration, you can also use this new # command as a replacement for "cargo check" or "cargo clippy" and see clippy # warnings and errors right in the editor. # In vscode, this setting is Rust-analyzer>Check On Save:Command # NB: the CI runs the full feature powerset, so, it catches slightly more errors # at the expense of longer runtime. This script is used by developers, so, don't # do that here. thisscript="${BASH_SOURCE[0]}" thisscript_dir="$(dirname "$thisscript")" CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" exec cargo clippy --all-features $CLIPPY_COMMON_ARGS ================================================ FILE: rust-toolchain.toml ================================================ [toolchain] channel = "1.88.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html # but we also need `llvm-tools` for coverage data merges on CI components = ["llvm-tools", "rustfmt", "clippy"] ================================================ FILE: safekeeper/Cargo.toml ================================================ [package] name = "safekeeper" version = "0.1.0" edition = "2024" license.workspace = true [features] default = [] # Enables test-only APIs, incuding failpoints. 
In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions testing = ["fail/failpoints"] benchmarking = [] [dependencies] async-stream.workspace = true anyhow.workspace = true byteorder.workspace = true bytes.workspace = true camino.workspace = true camino-tempfile.workspace = true chrono.workspace = true clap = { workspace = true, features = ["derive"] } crc32c.workspace = true fail.workspace = true hex.workspace = true humantime.workspace = true http.workspace = true hyper0.workspace = true itertools.workspace = true jsonwebtoken.workspace = true futures.workspace = true once_cell.workspace = true parking_lot.workspace = true pageserver_api.workspace = true postgres-protocol.workspace = true pprof.workspace = true rand.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["json"] } rustls.workspace = true scopeguard.workspace = true serde.workspace = true serde_json.workspace = true smallvec.workspace = true strum.workspace = true strum_macros.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true tokio = { workspace = true, features = ["fs"] } tokio-io-timeout.workspace = true tokio-postgres.workspace = true tokio-rustls.workspace = true tokio-tar.workspace = true tokio-util = { workspace = true } tracing.workspace = true url.workspace = true metrics.workspace = true pem.workspace = true postgres_backend.workspace = true postgres_ffi.workspace = true postgres_ffi_types.workspace = true postgres_versioninfo.workspace = true pq_proto.workspace = true remote_storage.workspace = true safekeeper_api.workspace = true safekeeper_client.workspace = true sha2.workspace = true sd-notify.workspace = true storage_broker.workspace = true tokio-stream.workspace = true http-utils.workspace = true utils.workspace = true wal_decoder.workspace = true env_logger.workspace = true nix.workspace = true workspace_hack.workspace = true [dev-dependencies] criterion.workspace = true 
itertools.workspace = true walproposer.workspace = true rand.workspace = true desim.workspace = true tracing.workspace = true tracing-subscriber = { workspace = true, features = ["json"] } [[bench]] name = "receive_wal" harness = false required-features = ["benchmarking"] ================================================ FILE: safekeeper/benches/README.md ================================================ ## Safekeeper Benchmarks To run benchmarks: ```sh # All benchmarks. cargo bench --package safekeeper # Specific file. cargo bench --package safekeeper --bench receive_wal # Specific benchmark. cargo bench --package safekeeper --bench receive_wal process_msg/fsync=false # List available benchmarks. cargo bench --package safekeeper --benches -- --list # Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. # Output in target/criterion/*/profile/flamegraph.svg. cargo bench --package safekeeper --bench receive_wal process_msg/fsync=false --profile-time 10 ``` Additional charts and statistics are available in `target/criterion/report/index.html`. Benchmarks are automatically compared against the previous run. To compare against other runs, see `--baseline` and `--save-baseline`. ================================================ FILE: safekeeper/benches/receive_wal.rs ================================================ //! WAL ingestion benchmarks. 
use std::io::Write as _; use bytes::BytesMut; use camino_tempfile::tempfile; use criterion::{BatchSize, Bencher, Criterion, criterion_group, criterion_main}; use itertools::Itertools as _; use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; use pprof::criterion::{Output, PProfProfiler}; use safekeeper::receive_wal::{self, WalAcceptor}; use safekeeper::safekeeper::{ AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, }; use safekeeper::test_utils::Env; use safekeeper_api::membership::SafekeeperGeneration as Generation; use tokio::io::AsyncWriteExt as _; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; const KB: usize = 1024; const MB: usize = 1024 * KB; const GB: usize = 1024 * MB; /// Use jemalloc and enable profiling, to mirror bin/safekeeper.rs. #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[allow(non_upper_case_globals)] #[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; // Register benchmarks with Criterion. criterion_group!( name = benches; config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); targets = bench_process_msg, bench_wal_acceptor, bench_wal_acceptor_throughput, bench_file_write, bench_bytes_reserve, ); criterion_main!(benches); /// Benchmarks SafeKeeper::process_msg() as time per message and throughput. Each message is an /// AppendRequest with a single WAL record containing an XlLogicalMessage of varying size. When /// measuring throughput, only the logical message payload is considered, excluding /// segment/page/record headers. 
fn bench_process_msg(c: &mut Criterion) { let mut g = c.benchmark_group("process_msg"); for fsync in [false, true] { for commit in [false, true] { for size in [8, KB, 8 * KB, 128 * KB, MB] { // Kind of weird to change the group throughput per benchmark, but it's the only way // to vary it per benchmark. It works. g.throughput(criterion::Throughput::Bytes(size as u64)); g.bench_function(format!("fsync={fsync}/commit={commit}/size={size}"), |b| { run_bench(b, size, fsync, commit).unwrap() }); } } } // The actual benchmark. If commit is true, advance the commit LSN on every message. fn run_bench(b: &mut Bencher, size: usize, fsync: bool, commit: bool) -> anyhow::Result<()> { let runtime = tokio::runtime::Builder::new_current_thread() // single is fine, sync IO only .enable_all() .build()?; // Construct the payload. The prefix counts towards the payload (including NUL terminator). let prefix = c"p"; let prefixlen = prefix.to_bytes_with_nul().len(); assert!(size >= prefixlen); let message = vec![0; size - prefixlen]; let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), Lsn(0)); // Set up the Safekeeper. let env = Env::new(fsync)?; let mut safekeeper = runtime.block_on(env.make_safekeeper( NodeId(1), TenantTimelineId::generate(), Lsn(0), ))?; b.iter_batched_ref( // Pre-construct WAL records and requests. Criterion will batch them. || { let (lsn, record) = walgen.next().expect("endless WAL"); ProposerAcceptorMessage::AppendRequest(AppendRequest { h: AppendRequestHeader { generation: Generation::new(0), term: 1, begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record truncate_lsn: Lsn(0), }, wal_data: record, }) }, // Benchmark message processing (time per message). 
|msg| { runtime .block_on(safekeeper.process_msg(msg)) .expect("message failed") }, BatchSize::SmallInput, // automatically determine a batch size ); Ok(()) } } /// Benchmarks WalAcceptor message processing time by sending it a batch of WAL records and waiting /// for it to confirm that the last LSN has been flushed to storage. We pipeline a bunch of messages /// instead of measuring each individual message to amortize costs (e.g. fsync), which is more /// realistic. Records are XlLogicalMessage with a tiny payload (~64 bytes per record including /// headers). Records are pre-constructed to avoid skewing the benchmark. /// /// TODO: add benchmarks with in-memory storage, see comment on `Env::make_safekeeper()`: fn bench_wal_acceptor(c: &mut Criterion) { let mut g = c.benchmark_group("wal_acceptor"); for fsync in [false, true] { for n in [1, 100, 10000] { g.bench_function(format!("fsync={fsync}/n={n}"), |b| { run_bench(b, n, fsync).unwrap() }); } } /// The actual benchmark. n is the number of WAL records to send in a pipelined batch. fn run_bench(b: &mut Bencher, n: usize, fsync: bool) -> anyhow::Result<()> { let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded let env = Env::new(fsync)?; let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(c"prefix", b"message"), Lsn(0)); // Create buffered channels that can fit all requests, to avoid blocking on channels. let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(n); let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(n); // Spawn the WalAcceptor task. runtime.block_on(async { // TODO: WalAcceptor doesn't actually need a full timeline, only // Safekeeper::process_msg(). Consider decoupling them to simplify the setup. let tli = env .make_timeline(NodeId(1), TenantTimelineId::generate(), Lsn(0)) .await? .wal_residence_guard() .await?; WalAcceptor::spawn(tli, msg_rx, reply_tx, Some(0)); anyhow::Ok(()) })?; b.iter_batched( // Pre-construct a batch of WAL records and requests. 
|| { walgen .take(n) .map(|(lsn, record)| AppendRequest { h: AppendRequestHeader { generation: Generation::new(0), term: 1, begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: Lsn(0), truncate_lsn: Lsn(0), }, wal_data: record, }) .collect_vec() }, // Benchmark batch ingestion (time per batch). |reqs| { runtime.block_on(async { let final_lsn = reqs.last().unwrap().h.end_lsn; // Stuff all the messages into the buffered channel to pipeline them. for req in reqs { let msg = ProposerAcceptorMessage::AppendRequest(req); msg_tx.send(msg).await.expect("send failed"); } // Wait for the last message to get flushed. while let Some(reply) = reply_rx.recv().await { if let AcceptorProposerMessage::AppendResponse(resp) = reply { if resp.flush_lsn >= final_lsn { return; } } } panic!("disconnected") }) }, BatchSize::PerIteration, // only run one request batch at a time ); Ok(()) } } /// Benchmarks WalAcceptor throughput by sending 1 GB of data with varying message sizes and waiting /// for the last LSN to be flushed to storage. Only the actual message payload counts towards /// throughput, headers are excluded and considered overhead. Records are XlLogicalMessage. /// /// To avoid running out of memory, messages are constructed during the benchmark. fn bench_wal_acceptor_throughput(c: &mut Criterion) { const VOLUME: usize = GB; // NB: excludes message/page/segment headers and padding let mut g = c.benchmark_group("wal_acceptor_throughput"); g.sample_size(10); g.throughput(criterion::Throughput::Bytes(VOLUME as u64)); for fsync in [false, true] { for commit in [false, true] { for size in [KB, 8 * KB, 128 * KB, MB] { assert_eq!(VOLUME % size, 0, "volume must be divisible by size"); let count = VOLUME / size; g.bench_function(format!("fsync={fsync}/commit={commit}/size={size}"), |b| { run_bench(b, count, size, fsync, commit).unwrap() }); } } } /// The actual benchmark. size is the payload size per message, count is the number of messages. 
/// If commit is true, advance the commit LSN on each message. fn run_bench( b: &mut Bencher, count: usize, size: usize, fsync: bool, commit: bool, ) -> anyhow::Result<()> { let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded // Construct the payload. The prefix counts towards the payload (including NUL terminator). let prefix = c"p"; let prefixlen = prefix.to_bytes_with_nul().len(); assert!(size >= prefixlen); let message = vec![0; size - prefixlen]; let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), Lsn(0)); // Construct and spawn the WalAcceptor task. let env = Env::new(fsync)?; let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE); let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE); runtime.block_on(async { let tli = env .make_timeline(NodeId(1), TenantTimelineId::generate(), Lsn(0)) .await? .wal_residence_guard() .await?; WalAcceptor::spawn(tli, msg_rx, reply_tx, Some(0)); anyhow::Ok(()) })?; // Ingest the WAL. b.iter(|| { runtime.block_on(async { let reqgen = walgen.take(count).map(|(lsn, record)| AppendRequest { h: AppendRequestHeader { generation: Generation::new(0), term: 1, begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record truncate_lsn: Lsn(0), }, wal_data: record, }); // Send requests. for req in reqgen { _ = reply_rx.try_recv(); // discard any replies, to avoid blocking let msg = ProposerAcceptorMessage::AppendRequest(req); msg_tx.send(msg).await.expect("send failed"); } // Wait for last message to get flushed. while let Some(reply) = reply_rx.recv().await { if let AcceptorProposerMessage::AppendResponse(resp) = reply { if resp.flush_lsn >= walgen.lsn { return; } } } panic!("disconnected") }) }); Ok(()) } } /// Benchmarks OS write throughput by appending blocks of a given size to a file. 
This is intended /// to compare Tokio and stdlib writes, and give a baseline for optimal WAL throughput. fn bench_file_write(c: &mut Criterion) { let mut g = c.benchmark_group("file_write"); for kind in ["stdlib", "tokio"] { for fsync in [false, true] { for size in [8, KB, 8 * KB, 128 * KB, MB] { // Kind of weird to change the group throughput per benchmark, but it's the only way to // vary it per benchmark. It works. g.throughput(criterion::Throughput::Bytes(size as u64)); g.bench_function( format!("{kind}/fsync={fsync}/size={size}"), |b| match kind { "stdlib" => run_bench_stdlib(b, size, fsync).unwrap(), "tokio" => run_bench_tokio(b, size, fsync).unwrap(), name => panic!("unknown kind {name}"), }, ); } } } fn run_bench_stdlib(b: &mut Bencher, size: usize, fsync: bool) -> anyhow::Result<()> { let mut file = tempfile()?; let buf = vec![0u8; size]; b.iter(|| { file.write_all(&buf).unwrap(); file.flush().unwrap(); if fsync { file.sync_data().unwrap(); } }); Ok(()) } fn run_bench_tokio(b: &mut Bencher, size: usize, fsync: bool) -> anyhow::Result<()> { let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded let mut file = tokio::fs::File::from_std(tempfile()?); let buf = vec![0u8; size]; b.iter(|| { runtime.block_on(async { file.write_all(&buf).await.unwrap(); file.flush().await.unwrap(); if fsync { file.sync_data().await.unwrap(); } }) }); Ok(()) } } /// Benchmarks the cost of memory allocations when receiving WAL messages. This emulates the logic /// in FeMessage::parse, which extends the read buffer. It is primarily intended to test jemalloc. 
fn bench_bytes_reserve(c: &mut Criterion) { let mut g = c.benchmark_group("bytes_reserve"); for size in [1, 64, KB, 8 * KB, 128 * KB] { g.throughput(criterion::Throughput::Bytes(size as u64)); g.bench_function(format!("size={size}"), |b| run_bench(b, size).unwrap()); } fn run_bench(b: &mut Bencher, size: usize) -> anyhow::Result<()> { let mut bytes = BytesMut::new(); let data = vec![0; size]; b.iter(|| { bytes.reserve(size); bytes.extend_from_slice(&data); bytes.split_to(size).freeze(); }); Ok(()) } } ================================================ FILE: safekeeper/client/Cargo.toml ================================================ [package] name = "safekeeper_client" version = "0.1.0" edition.workspace = true license.workspace = true [dependencies] http-utils.workspace = true safekeeper_api.workspace = true thiserror.workspace = true reqwest = { workspace = true, features = [ "stream" ] } serde.workspace = true utils.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } ================================================ FILE: safekeeper/client/src/lib.rs ================================================ pub mod mgmt_api; ================================================ FILE: safekeeper/client/src/mgmt_api.rs ================================================ //! Safekeeper http client. //! //! Partially copied from pageserver client; some parts might be better to be //! united. use std::error::Error as _; use http_utils::error::HttpErrorBody; use reqwest::{IntoUrl, Method, Response, StatusCode}; use safekeeper_api::models::{ self, PullTimelineRequest, PullTimelineResponse, SafekeeperStatus, SafekeeperUtilization, TimelineCreateRequest, }; use utils::id::{NodeId, TenantId, TimelineId}; use utils::logging::SecretString; #[derive(Debug, Clone)] pub struct Client { mgmt_api_endpoint: String, authorization_header: Option, client: reqwest::Client, } #[derive(thiserror::Error, Debug)] pub enum Error { /// Failed to receive body (reqwest error). 
#[error("receive body: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] ReceiveBody(reqwest::Error), /// Status is not ok, but failed to parse body as `HttpErrorBody`. #[error("receive error body: {0}")] ReceiveErrorBody(String), /// Status is not ok; parsed error in body as `HttpErrorBody`. #[error("safekeeper API: {1}")] ApiError(StatusCode, String), #[error("Cancelled")] Cancelled, #[error("request timed out: {0}")] Timeout(String), } pub type Result = std::result::Result; pub trait ResponseErrorMessageExt: Sized { fn error_from_body(self) -> impl std::future::Future> + Send; } /// If status is not ok, try to extract error message from the body. impl ResponseErrorMessageExt for reqwest::Response { async fn error_from_body(self) -> Result { let status = self.status(); if status.is_success() { return Ok(self); } let url = self.url().to_owned(); Err(match self.json::().await { Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), Err(_) => { Error::ReceiveErrorBody(format!("http error ({}) at {}.", status.as_u16(), url)) } }) } } impl Client { pub fn new( client: reqwest::Client, mgmt_api_endpoint: String, jwt: Option, ) -> Self { Self { mgmt_api_endpoint, authorization_header: jwt .map(|jwt| SecretString::from(format!("Bearer {}", jwt.get_contents()))), client, } } pub async fn create_timeline(&self, req: &TimelineCreateRequest) -> Result { let uri = format!("{}/v1/tenant/timeline", self.mgmt_api_endpoint); let resp = self.post(&uri, req).await?; Ok(resp) } pub async fn pull_timeline(&self, req: &PullTimelineRequest) -> Result { let uri = format!("{}/v1/pull_timeline", self.mgmt_api_endpoint); let resp = self.post(&uri, req).await?; resp.json().await.map_err(Error::ReceiveBody) } pub async fn exclude_timeline( &self, tenant_id: TenantId, timeline_id: TimelineId, req: &models::TimelineMembershipSwitchRequest, ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline/{}/exclude", self.mgmt_api_endpoint, tenant_id, timeline_id ); let resp = 
self.put(&uri, req).await?; resp.json().await.map_err(Error::ReceiveBody) } pub async fn delete_timeline( &self, tenant_id: TenantId, timeline_id: TimelineId, ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline/{}", self.mgmt_api_endpoint, tenant_id, timeline_id ); let resp = self .request_maybe_body(Method::DELETE, &uri, None::<()>) .await?; resp.json().await.map_err(Error::ReceiveBody) } pub async fn switch_timeline_membership( &self, tenant_id: TenantId, timeline_id: TimelineId, req: &models::TimelineMembershipSwitchRequest, ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline/{}/membership", self.mgmt_api_endpoint, tenant_id, timeline_id ); let resp = self.put(&uri, req).await?; resp.json().await.map_err(Error::ReceiveBody) } pub async fn delete_tenant(&self, tenant_id: TenantId) -> Result { let uri = format!("{}/v1/tenant/{}", self.mgmt_api_endpoint, tenant_id); let resp = self .request_maybe_body(Method::DELETE, &uri, None::<()>) .await?; resp.json().await.map_err(Error::ReceiveBody) } pub async fn bump_timeline_term( &self, tenant_id: TenantId, timeline_id: TimelineId, req: &models::TimelineTermBumpRequest, ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline/{}/term_bump", self.mgmt_api_endpoint, tenant_id, timeline_id ); let resp = self.post(&uri, req).await?; resp.json().await.map_err(Error::ReceiveBody) } pub async fn timeline_status( &self, tenant_id: TenantId, timeline_id: TimelineId, ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline/{}", self.mgmt_api_endpoint, tenant_id, timeline_id ); self.get(&uri).await } pub async fn snapshot( &self, tenant_id: TenantId, timeline_id: TimelineId, stream_to: NodeId, ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline/{}/snapshot/{}", self.mgmt_api_endpoint, tenant_id, timeline_id, stream_to.0 ); self.get(&uri).await } pub async fn status(&self) -> Result { let uri = format!("{}/v1/status", self.mgmt_api_endpoint); let resp = self.get(&uri).await?; 
resp.json().await.map_err(Error::ReceiveBody) } pub async fn utilization(&self) -> Result { let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint); let resp = self.get(&uri).await?; resp.json().await.map_err(Error::ReceiveBody) } async fn post( &self, uri: U, body: B, ) -> Result { self.request(Method::POST, uri, body).await } async fn put( &self, uri: U, body: B, ) -> Result { self.request(Method::PUT, uri, body).await } async fn get(&self, uri: U) -> Result { self.request(Method::GET, uri, ()).await } /// Send the request and check that the status code is good. async fn request( &self, method: Method, uri: U, body: B, ) -> Result { self.request_maybe_body(method, uri, Some(body)).await } /// Send the request and check that the status code is good, with an optional body. async fn request_maybe_body( &self, method: Method, uri: U, body: Option, ) -> Result { let res = self.request_noerror(method, uri, body).await?; let response = res.error_from_body().await?; Ok(response) } /// Just send the request. async fn request_noerror( &self, method: Method, uri: U, body: Option, ) -> Result { let mut req = self.client.request(method, uri); if let Some(value) = &self.authorization_header { req = req.header(reqwest::header::AUTHORIZATION, value.get_contents()) } if let Some(body) = body { req = req.json(&body); } req.send().await.map_err(Error::ReceiveBody) } } ================================================ FILE: safekeeper/spec/.gitignore ================================================ *TTrace* *.toolbox/ states/ ================================================ FILE: safekeeper/spec/MCProposerAcceptorReconfig.tla ================================================ ---- MODULE MCProposerAcceptorReconfig ---- EXTENDS TLC, ProposerAcceptorReconfig \* Augments the spec with model checking constraints. \* It slightly duplicates MCProposerAcceptorStatic, but we can't EXTENDS it \* because it EXTENDS ProposerAcceptorStatic in turn. The duplication isn't big \* anyway. 
\* For model checking.
CONSTANTS
  max_entries, \* model constraint: max log entries acceptor/proposer can hold
  max_term, \* model constraint: max allowed term
  max_generation \* model constraint: max config generation

ASSUME max_entries \in Nat /\ max_term \in Nat /\ max_generation \in Nat

\* Model space constraint.
\* Bounds the term and WAL length of every proposer, and the generation of the
\* configuration held in the conf store.
StateConstraint ==
    /\ \A p \in proposers:
        /\ prop_state[p].term <= max_term
        /\ Len(prop_state[p].wal) <= max_entries
    /\ conf_store.generation <= max_generation

\* Sets of proposers and acceptors are symmetric because we don't take any
\* actions depending on some concrete proposer/acceptor (like IF p = p1 THEN
\* ...)
ProposerAcceptorSymmetry == Permutations(proposers) \union Permutations(acceptors)

\* enforce order of the vars in the error trace with ALIAS
\* Note that ALIAS is supported only since version 1.8.0 which is pre-release
\* as of writing this.
Alias == [
  prop_state |-> prop_state,
  prop_conf |-> prop_conf,
  acc_state |-> acc_state,
  acc_conf |-> acc_conf,
  committed |-> committed,
  conf_store |-> conf_store
]

====

================================================
FILE: safekeeper/spec/MCProposerAcceptorStatic.tla
================================================
---- MODULE MCProposerAcceptorStatic ----
EXTENDS TLC, ProposerAcceptorStatic

\* Augments the spec with model checking constraints.
\* Note that MCProposerAcceptorReconfig duplicates it and might need to
\* be updated as well.

\* For model checking.
CONSTANTS
  max_entries, \* model constraint: max log entries acceptor/proposer can hold
  max_term \* model constraint: max allowed term

ASSUME max_entries \in Nat /\ max_term \in Nat

\* Model space constraint.
\* Bounds the term and WAL length of every proposer.
StateConstraint == \A p \in proposers:
                    /\ prop_state[p].term <= max_term
                    /\ Len(prop_state[p].wal) <= max_entries

\* Sets of proposers and acceptors are symmetric because we don't take any
\* actions depending on some concrete proposer/acceptor (like IF p = p1 THEN
\* ...)
ProposerAcceptorSymmetry == Permutations(proposers) \union Permutations(acceptors)

\* enforce order of the vars in the error trace with ALIAS
\* Note that ALIAS is supported only since version 1.8.0 which is pre-release
\* as of writing this.
Alias == [
  prop_state |-> prop_state,
  acc_state |-> acc_state,
  committed |-> committed
]

====

================================================
FILE: safekeeper/spec/ProposerAcceptorReconfig.tla
================================================
---- MODULE ProposerAcceptorReconfig ----

(*
    Spec for https://github.com/neondatabase/neon/blob/538e2312a617c65d489d391892c70b2e4d7407b5/docs/rfcs/035-safekeeper-dynamic-membership-change.md

    Simplifications:
    - The ones inherited from ProposerAcceptorStatic.
    - We don't model transient state of the configuration change driver process
      (storage controller in the implementation). Its actions StartChange and
      FinishChange are taken based on the persistent state of safekeepers and
      conf store. The justification for that is the following: once new
      configuration n is created (e.g with StartChange or FinishChange), any old
      configuration change driver working on older conf < n will never be able
      to commit it to the conf store because it is protected by CAS. The
      propagation of these older confs is still possible though, and spec allows
      to do it through acceptors.
      Plus the model is already pretty huge.
    - Previous point also means that the FinishChange action is based only on
      the current state of safekeepers, not from the past. That's ok because
      while individual acceptor may go down, quorum one never does. So the
      FinishChange condition which collects max of the quorum may get only more
      strict over time.

    The invariants expectedly break if any of FinishChange required conditions
    are removed.
*)

EXTENDS Integers, Sequences, FiniteSets, TLC

VARIABLES
    \* state which is the same in the static spec
    prop_state,
    acc_state,
    committed,
    elected_history,
    \* reconfiguration only state
    prop_conf, \* prop_conf[p] is current configuration of proposer p
    acc_conf, \* acc_conf[a] is current configuration of acceptor a
    conf_store \* configuration in the configuration store.

CONSTANT
    acceptors,
    proposers

CONSTANT NULL

\* Import ProposerAcceptorStatic under PAS.
\*
\* Note that all vars and consts are named the same and thus substituted
\* implicitly.
PAS == INSTANCE ProposerAcceptorStatic

\********************************************************************************
\* Helpers
\********************************************************************************

\********************************************************************************
\* Type assertion
\********************************************************************************

\* Is c a valid config?
IsConfig(c) ==
    /\ DOMAIN c = {"generation", "members", "newMembers"}
    \* Unique id of the configuration.
    /\ c.generation \in Nat
    /\ c.members \in SUBSET acceptors
    \* newMembers is NULL when it is not a joint conf.
    /\ \/ c.newMembers = NULL
       \/ c.newMembers \in SUBSET acceptors

\* The static spec's type invariant plus: every proposer conf, every acceptor
\* conf and the conf store hold a valid config.
TypeOk ==
    /\ PAS!TypeOk
    /\ \A p \in proposers: IsConfig(prop_conf[p])
    /\ \A a \in acceptors: IsConfig(acc_conf[a])
    /\ IsConfig(conf_store)

\********************************************************************************
\* Initial
\********************************************************************************

\* Everyone (proposers, acceptors, conf store) starts with the same non-joint
\* configuration of generation 1.
Init ==
    /\ PAS!Init
    /\ \E init_members \in SUBSET acceptors:
        LET init_conf == [generation |-> 1, members |-> init_members, newMembers |-> NULL] IN
            \* refer to RestartProposer why it is not NULL
            /\ prop_conf = [p \in proposers |-> init_conf]
            /\ acc_conf = [a \in acceptors |-> init_conf]
            /\ conf_store = init_conf
            \* We could start with anything, but to reduce state space start with
            \* the most reasonable total acceptors - 1 conf size, which e.g.
            \* makes basic {a1} -> {a2} change in {a1, a2} acceptors and {a1, a2,
            \* a3} -> {a2, a3, a4} in {a1, a2, a3, a4} acceptors models even in
            \* the smallest models with single change.
            /\ Cardinality(init_members) = Cardinality(acceptors) - 1

\********************************************************************************
\* Actions
\********************************************************************************

\* Proposer p loses all state, restarting. In the static spec we bump restarted
\* proposer term to max of some quorum + 1 which is a minimal term which can win
\* election. With reconfigurations it's harder to calculate such a term, so keep
\* it simple and take random acceptor one + 1.
\*
\* Also make proposer to adopt configuration of another random acceptor. In the
\* impl proposer starts with NULL configuration until handshake with first
\* acceptor. Removing this NULL special case makes the spec a bit simpler.
RestartProposer(p) ==
    /\ \E a \in acceptors: PAS!RestartProposerWithTerm(p, acc_state[a].term + 1)
    /\ \E a \in acceptors: prop_conf' = [prop_conf EXCEPT ![p] = acc_conf[a]]
    \* prop_conf is assigned above and the static variables are handled by the
    \* PAS action, so only the remaining reconfiguration state is framed here.
    /\ UNCHANGED <<acc_conf, conf_store>>

\* Acceptor a immediately votes for proposer p.
Vote(p, a) ==
    \* Configuration must be the same.
    /\ prop_conf[p].generation = acc_conf[a].generation
    \* And a is expected to be a member of it. This is likely redundant as long as
    \* becoming leader checks membership (though vote also contributes to max
    \* calculation).
    /\ \/ a \in prop_conf[p].members
       \/ (prop_conf[p].newMembers /= NULL) /\ (a \in prop_conf[p].newMembers)
    /\ PAS!Vote(p, a)
    /\ UNCHANGED <<prop_conf, acc_conf, conf_store>>

\* Proposer p gets elected.
BecomeLeader(p) ==
    /\ prop_state[p].state = "campaign"
    \* Votes must form quorum in both sets (if the newMembers exists).
    /\ PAS!FormsQuorum(DOMAIN prop_state[p].votes, prop_conf[p].members)
    /\ \/ prop_conf[p].newMembers = NULL
       \* TLA+ disjunction evaluation doesn't short-circuit for a good reason:
       \* https://groups.google.com/g/tlaplus/c/U6tOJ4dsjVM/m/UdOznPCVBwAJ
       \* so repeat the null check.
       \/ (prop_conf[p].newMembers /= NULL) /\ (PAS!FormsQuorum(DOMAIN prop_state[p].votes, prop_conf[p].newMembers))
    \* DoBecomeLeader will copy WAL of the highest voter to proposer's WAL, so
    \* ensure its conf is still the same. In the impl WAL fetching also has to
    \* check the configuration.
    /\ prop_conf[p].generation = acc_conf[PAS!MaxVoteAcc(p)].generation
    /\ \A a \in DOMAIN prop_state[p].votes: prop_conf[p].generation = acc_conf[a].generation
    /\ PAS!DoBecomeLeader(p)
    /\ UNCHANGED <<prop_conf, acc_conf, conf_store>>

\* Static UpdateTerm; configurations are untouched.
UpdateTerm(p, a) ==
    /\ PAS!UpdateTerm(p, a)
    /\ UNCHANGED <<prop_conf, acc_conf, conf_store>>

TruncateWal(p, a) ==
    /\ prop_state[p].state = "leader"
    \* Configuration must be the same.
    /\ prop_conf[p].generation = acc_conf[a].generation
    /\ PAS!TruncateWal(p, a)
    /\ UNCHANGED <<prop_conf, acc_conf, conf_store>>

\* Static NewEntry; configurations are untouched.
NewEntry(p) ==
    /\ PAS!NewEntry(p)
    /\ UNCHANGED <<prop_conf, acc_conf, conf_store>>

AppendEntry(p, a) ==
    /\ prop_state[p].state = "leader"
    \* Configuration must be the same.
    /\ prop_conf[p].generation = acc_conf[a].generation
    \* And a is member of it. Ignoring this likely wouldn't hurt, but not useful
    \* either.
    /\ \/ a \in prop_conf[p].members
       \/ (prop_conf[p].newMembers /= NULL) /\ (a \in prop_conf[p].newMembers)
    /\ PAS!AppendEntry(p, a)
    /\ UNCHANGED <<prop_conf, acc_conf, conf_store>>

\* see PAS!CommitEntries for comments.
CommitEntries(p) ==
    /\ prop_state[p].state = "leader"
    /\ \E q1 \in PAS!AllMinQuorums(prop_conf[p].members):
        LET q1_commit_lsn == PAS!QuorumCommitLsn(p, q1) IN
            \* Configuration must be the same.
            /\ \A a \in q1: prop_conf[p].generation = acc_conf[a].generation
            /\ q1_commit_lsn /= NULL
            \* We must collect acks from both quorums, if newMembers is present.
            /\ IF prop_conf[p].newMembers = NULL THEN
                  PAS!DoCommitEntries(p, q1_commit_lsn)
               ELSE
                  \E q2 \in PAS!AllMinQuorums(prop_conf[p].newMembers):
                      LET q2_commit_lsn == PAS!QuorumCommitLsn(p, q2) IN
                          \* Configuration must be the same on the new-set
                          \* quorum as well (q1 is already checked above).
                          /\ \A a \in q2: prop_conf[p].generation = acc_conf[a].generation
                          /\ q2_commit_lsn /= NULL
                          /\ PAS!DoCommitEntries(p, PAS!Min(q1_commit_lsn, q2_commit_lsn))
    /\ UNCHANGED <<prop_conf, acc_conf, conf_store>>

\* Proposer p adopts higher conf c from conf store or from some acceptor.
ProposerSwitchConf(p) ==
    /\ \E c \in ({conf_store} \union {acc_conf[a]: a \in acceptors}):
        \* p's conf is lower than c.
        /\ (c.generation > prop_conf[p].generation)
        \* We allow to bump conf without restart only when wp is already elected.
        \* If it isn't, the votes it has already collected are from the previous
        \* configuration and can't be used.
        \*
        \* So if proposer is in 'campaign' in the impl we would restart preserving
        \* conf and increasing term. In the spec this transition is already covered
        \* by a more generic RestartProposer, so we don't specify it here.
        /\ prop_state[p].state = "leader"
        /\ prop_conf' = [prop_conf EXCEPT ![p] = c]
    \* No PAS action is involved, so the static variables are framed here too.
    /\ UNCHANGED <<prop_state, acc_state, committed, elected_history, acc_conf, conf_store>>

\* Do CAS on the conf store, starting change into the new_members conf.
StartChange(new_members) ==
    \* Possible only if we don't already have the change in progress.
    /\ conf_store.newMembers = NULL
    \* Not necessary, but reduces space a bit.
    /\ new_members /= conf_store.members
    /\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.members, newMembers |-> new_members]
    /\ UNCHANGED <<prop_state, acc_state, committed, elected_history, prop_conf, acc_conf>>

\* Acceptor's last_log_term.
AccLastLogTerm(acc) ==
    PAS!LastLogTerm(PAS!AcceptorTermHistory(acc))

\* Do CAS on the conf store, transferring joint conf into the newMembers only.
FinishChange ==
    \* have joint conf
    /\ conf_store.newMembers /= NULL
    \* The conditions for finishing the change are:
    /\ \E qo \in PAS!AllMinQuorums(conf_store.members):
        \* 1) Old majority must be aware of the joint conf.
        \*    Note: generally the driver can't know current acceptor
        \*    generation, it can only know that it once had been the
        \*    expected one, but it might have advanced since then.
        \*    But as explained at the top of the file if acceptor gen
        \*    advanced, FinishChange will never be able to complete
        \*    due to CAS anyway. We use strict equality here because
        \*    that's what makes sense conceptually (old driver should
        \*    abandon its attempt if it observes that conf has advanced).
        /\ \A a \in qo: conf_store.generation = acc_conf[a].generation
        \* 2) New member set must have log synced, i.e. some majority of it
        \*    needs to have (last_log_term, flush_lsn) at least as high as the
        \*    max of some old majority.
        \* 3) Term must be synced, i.e. some majority of the new set must
        \*    have term >= than max term of some old majority.
        \*    This ensures that two leaders are never elected with the same
        \*    term even after config change (which would be bad unless we treat
        \*    generation as a part of term which we don't).
        \* 4) A majority of the new set must be aware of the joint conf.
        \*    This allows to safely destroy acceptor state if it is not a
        \*    member of its current conf (which is useful for cleanup after
        \*    migration as well as for aborts).
        /\ LET sync_pos == PAS!MaxTermLsn({[term |-> AccLastLogTerm(a), lsn |-> PAS!FlushLsn(a)]: a \in qo})
               sync_term == PAS!Maximum({acc_state[a].term: a \in qo})
           IN
               \E qn \in PAS!AllMinQuorums(conf_store.newMembers):
                   \A a \in qn:
                       /\ PAS!TermLsnGE([term |-> AccLastLogTerm(a), lsn |-> PAS!FlushLsn(a)], sync_pos)
                       /\ acc_state[a].term >= sync_term
                       \* The same note as above about strict equality applies here.
                       /\ conf_store.generation = acc_conf[a].generation
    /\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.newMembers, newMembers |-> NULL]
    /\ UNCHANGED <<prop_state, acc_state, committed, elected_history, prop_conf, acc_conf>>

\* Do CAS on the conf store, aborting the change in progress.
AbortChange ==
    \* have joint conf
    /\ conf_store.newMembers /= NULL
    /\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.members, newMembers |-> NULL]
    /\ UNCHANGED <<prop_state, acc_state, committed, elected_history, prop_conf, acc_conf>>

\* Acceptor a switches to higher configuration from the conf store
\* or from some proposer.
AccSwitchConf(a) ==
    /\ \E c \in ({conf_store} \union {prop_conf[p]: p \in proposers}):
        /\ acc_conf[a].generation < c.generation
        /\ acc_conf' = [acc_conf EXCEPT ![a] = c]
    /\ UNCHANGED <<prop_state, acc_state, committed, elected_history, prop_conf, conf_store>>

\* Nuke all acceptor state if it is not a member of its current conf. Models
\* cleanup after migration/abort.
AccReset(a) ==
    /\ \/ (acc_conf[a].newMembers = NULL) /\ (a \notin acc_conf[a].members)
       \/ (acc_conf[a].newMembers /= NULL) /\ (a \notin (acc_conf[a].members \union acc_conf[a].newMembers))
    /\ acc_state' = [acc_state EXCEPT ![a] = PAS!InitAcc]
    \* Set nextSendLsn for `a` to NULL everywhere. nextSendLsn serves as a mark
    \* that elected proposer performed TruncateWal on the acceptor, which isn't
    \* true anymore after state reset. In the impl local deletion is expected to
    \* terminate all existing connections.
    /\ prop_state' = [p \in proposers |-> [prop_state[p] EXCEPT !.nextSendLsn[a] = NULL]]
    /\ UNCHANGED <<committed, elected_history, prop_conf, acc_conf, conf_store>>

\*******************************************************************************
\* Final spec
\*******************************************************************************

Next ==
  \/ \E p \in proposers: RestartProposer(p)
  \/ \E p \in proposers: \E a \in acceptors: Vote(p, a)
  \/ \E p \in proposers: BecomeLeader(p)
  \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a)
  \/ \E p \in proposers: \E a \in acceptors: TruncateWal(p, a)
  \/ \E p \in proposers: NewEntry(p)
  \/ \E p \in proposers: \E a \in acceptors: AppendEntry(p, a)
  \/ \E p \in proposers: CommitEntries(p)
  \/ \E new_members \in SUBSET acceptors: StartChange(new_members)
  \/ FinishChange
  \/ AbortChange
  \/ \E p \in proposers: ProposerSwitchConf(p)
  \/ \E a \in acceptors: AccSwitchConf(a)
  \/ \E a \in acceptors: AccReset(a)

\* Stuttering is allowed over the full variable tuple of the module.
Spec == Init /\ [][Next]_<<prop_state, acc_state, committed, elected_history, prop_conf, acc_conf, conf_store>>

\********************************************************************************
\* Invariants
\********************************************************************************

AllConfs ==
    {conf_store} \union {prop_conf[p]: p \in proposers} \union {acc_conf[a]: a \in acceptors}

\* Fairly trivial (given the conf store) invariant that different configurations
\* with the same generation are never issued.
ConfigSafety ==
    \A c1, c2 \in AllConfs:
        (c1.generation = c2.generation) => (c1 = c2)

ElectionSafety == PAS!ElectionSafety

ElectionSafetyFull == PAS!ElectionSafetyFull

LogIsMonotonic == PAS!LogIsMonotonic

LogSafety == PAS!LogSafety

\********************************************************************************
\* Invariants which don't need to hold, but useful for playing/debugging.
\********************************************************************************

\* Check that we ever switch into non joint conf.
MaxAccConf == ~ \E a \in acceptors: /\ acc_conf[a].generation = 3 /\ acc_conf[a].newMembers /= NULL CommittedNotTruncated == PAS!CommittedNotTruncated MaxTerm == PAS!MaxTerm MaxStoreConf == conf_store.generation <= 1 MaxAccWalLen == PAS!MaxAccWalLen MaxCommitLsn == PAS!MaxCommitLsn ==== ================================================ FILE: safekeeper/spec/ProposerAcceptorStatic.tla ================================================ ---- MODULE ProposerAcceptorStatic ---- (* The protocol is very similar to Raft. The key differences are: - Leaders (proposers) are separated from storage nodes (acceptors), which has been already an established way to think about Paxos. - We don't want to stamp each log record with term, so instead carry around term histories which are sequences of pairs. As a bonus (and subtlety) this allows the proposer to commit entries from previous terms without writing new records -- if acceptor's log is caught up, update of term history on it updates last_log_term as well. *) \* Model simplifications: \* - Instant message delivery. Notably, ProposerElected message (TruncateWal action) is not \* delayed, so we don't attempt to truncate WAL when the same wp already appended something \* on the acceptor since common point had been calculated (this should be rejected). \* - old WAL is immediately copied to proposer on its election, without on-demand fetch later. \* Some ideas how to break it to play around to get a feeling: \* - replace Quorum with BadQuorum. \* - remove 'don't commit entries from previous terms separately' rule in \* CommitEntries and observe figure 8 from the raft paper. \* With p2a3t4l4 32 steps error was found in 1h on 80 cores. 
EXTENDS Integers, Sequences, FiniteSets, TLC

VARIABLES
  prop_state, \* prop_state[p] is state of proposer p
  acc_state, \* acc_state[a] is state of acceptor a
  committed, \* bag (set) of ever committed <<term, lsn>> entries
  elected_history \* counter for elected terms, see TypeOk for details

CONSTANT
  acceptors,
  proposers

\* Marker used in place of a missing value (e.g. a not yet established
\* nextSendLsn).
CONSTANT NULL

\********************************************************************************
\* Helpers
\********************************************************************************

Maximum(S) ==
  (*************************************************************************)
  (* If S is a set of numbers, then this defines Maximum(S) to be the      *)
  (* maximum of those numbers, or -1 if S is empty.                        *)
  (*************************************************************************)
  IF S = {} THEN -1
            ELSE CHOOSE n \in S : \A m \in S : n \geq m

\* minimum of numbers in the set, error if set is empty
Minimum(S) ==
  CHOOSE min \in S : \A n \in S : min <= n

\* Min of two numbers
Min(a, b) == IF a < b THEN a ELSE b

\* Sort of 0 for functions: a function with the empty domain (42 is never
\* reachable, it is only there to make the expression well formed).
EmptyF == [x \in {} |-> 42]
\* TRUE iff function f has the empty domain.
IsEmptyF(f) == DOMAIN f = {}

\* Set of values (image) of the function f. Apparently no such builtin.
Range(f) == {f[x] : x \in DOMAIN f}

\* If key k is in function f, map it using l, otherwise insert v. Returns the
\* updated function.
Upsert(f, k, v, l(_)) ==
  LET new_val == IF k \in DOMAIN f THEN l(f[k]) ELSE v IN
    (k :> new_val) @@ f

\*****************

\* Does set of acceptors `acc_set` form the quorum in the member set `members`?
\* Acceptors not from `members` are excluded (matters only for reconfig).
FormsQuorum(acc_set, members) ==
  Cardinality(acc_set \intersect members) >= (Cardinality(members) \div 2 + 1)

\* Like FormsQuorum, but for minimal quorum.
FormsMinQuorum(acc_set, members) ==
  Cardinality(acc_set \intersect members) = (Cardinality(members) \div 2 + 1)

\* All sets of acceptors forming minimal quorums in the member set `members`.
\* AllQuorums enumerates subsets of `members` itself, while AllMinQuorums (and
\* the Bad variants below) enumerate subsets of all `acceptors`; FormsQuorum and
\* friends only count the acceptors which belong to `members`.
AllQuorums(members) == {subset \in SUBSET members: FormsQuorum(subset, members)}
AllMinQuorums(members) == {subset \in SUBSET acceptors: FormsMinQuorum(subset, members)}

\* For substituting Quorum and seeing what happens.
FormsBadQuorum(acc_set, members) ==
  Cardinality(acc_set \intersect members) >= (Cardinality(members) \div 2)
FormsMinBadQuorum(acc_set, members) ==
  Cardinality(acc_set \intersect members) = (Cardinality(members) \div 2)
AllBadQuorums(members) == {subset \in SUBSET acceptors: FormsBadQuorum(subset, members)}
AllMinBadQuorums(members) == {subset \in SUBSET acceptors: FormsMinBadQuorum(subset, members)}

\* flushLsn (end of WAL, i.e. index of next entry) of acceptor a.
FlushLsn(a) == Len(acc_state[a].wal) + 1

\* Typedefs. Note that TLA+ Nat includes zero.
Terms == Nat
Lsns == Nat

\********************************************************************************
\* Type assertion
\********************************************************************************

\* Defining sets of all possible tuples and using them in TypeOk in usual
\* all-tuples constructor is not practical because such definitions force
\* TLC to enumerate them, while they are horribly enormous
\* (TLC screams "Attempted to construct a set with too many elements").
\* So instead check types manually.

\* Term history is a sequence of <term, lsn> pairs (records with `term` and
\* `lsn` fields), lsn being the position where the term starts.
IsTermHistory(th) ==
  \A th_entry \in Range(th): th_entry.term \in Terms /\ th_entry.lsn \in Lsns

\* WAL is a sequence where index is LSN and value is the term of the entry.
IsWal(w) ==
  \A i \in DOMAIN w:
    /\ i \in Lsns
    /\ w[i] \in Terms

TypeOk ==
  /\ \A p \in proposers:
    \* '_' in field names hinders pretty printing
    \* https://github.com/tlaplus/tlaplus/issues/1051
    \* so use camel case.
    /\ DOMAIN prop_state[p] = {"state", "term", "votes", "termHistory", "wal", "nextSendLsn"}
    \* In campaign proposer sends RequestVote and waits for acks;
    \* in leader he is elected.
    /\ prop_state[p].state \in {"campaign", "leader"}
    \* term for which it will campaign, or won term in leader state
    /\ prop_state[p].term \in Terms
    \* votes received
    /\ \A voter \in DOMAIN prop_state[p].votes: voter \in acceptors
    /\ \A vote \in Range(prop_state[p].votes):
         /\ IsTermHistory(vote.termHistory)
         /\ vote.flushLsn \in Lsns
    \* Proposer's term history. Empty while proposer is in "campaign".
    /\ IsTermHistory(prop_state[p].termHistory)
    \* In the model we identify WAL entries only by <term, lsn> pairs
    \* without additional unique id, which is enough for its purposes.
    \* It means that with term history fully modeled wal becomes
    \* redundant as it can be computed from term history + WAL length.
    \* However, we still keep it here and at acceptors as explicit sequence
    \* where index is LSN and value is the term to avoid artificial mapping to
    \* figure out real entries. It shouldn't bloat model much because this
    \* doesn't increase number of distinct states.
    /\ IsWal(prop_state[p].wal)
    \* Map of acceptor -> next lsn to send. It is set when truncate_wal is
    \* done so sending entries is allowed only after that. In the impl TCP
    \* ensures this ordering. We use NULL instead of missing value to use
    \* EXCEPT in AccReset.
    /\ \A a \in DOMAIN prop_state[p].nextSendLsn:
         /\ a \in acceptors
         /\ prop_state[p].nextSendLsn[a] \in Lsns \union {NULL}
  /\ \A a \in acceptors:
       /\ DOMAIN acc_state[a] = {"term", "termHistory", "wal"}
       /\ acc_state[a].term \in Terms
       /\ IsTermHistory(acc_state[a].termHistory)
       /\ IsWal(acc_state[a].wal)
  /\ \A c \in committed:
       /\ c.term \in Terms
       /\ c.lsn \in Lsns
  \* elected_history is a retrospective map of term -> number of times it was
  \* elected, for use in ElectionSafetyFull invariant. For static spec it is
  \* fairly convincing that it holds, but with membership change it is less
  \* trivial. And as we identify log entries only with <term, lsn>, importance
  \* of it is quite high as violation of log safety might go undetected if
  \* election safety is violated. Note though that this is not always the
  \* case, i.e. you can imagine (and TLC should find) schedule where log
  \* safety violation is still detected because two leaders with the same term
  \* commit histories which are different in previous terms, so it is not that
  \* crucial. Plus if spec allows ElectionSafetyFull violation, likely
  \* ElectionSafety will also be violated in some schedules. But neither it
  \* should bloat the model too much.
  /\ \A term \in DOMAIN elected_history:
       /\ term \in Terms
       /\ elected_history[term] \in Nat

\********************************************************************************
\* Initial
\********************************************************************************

\* Pristine acceptor state; used for every acceptor in Init.
InitAcc ==
  [
    \* There will be no leader in zero term, 1 is the first
    \* real.
    term |-> 0,
    \* Again, leader in term 0 doesn't exist, but we initialize
    \* term histories with it to always have common point in
    \* them. Lsn is 1 because TLA+ sequences are indexed from 1
    \* (we don't want to truncate WAL out of range).
    termHistory |-> << [term |-> 0, lsn |-> 1] >>,
    wal |-> << >>
  ]

Init ==
  /\ prop_state = [p \in proposers |-> [
       state |-> "campaign",
       term |-> 1,
       votes |-> EmptyF,
       termHistory |-> << >>,
       wal |-> << >>,
       nextSendLsn |-> [a \in acceptors |-> NULL]
     ]]
  /\ acc_state = [a \in acceptors |-> InitAcc]
  /\ committed = {}
  /\ elected_history = EmptyF

\********************************************************************************
\* Actions
\********************************************************************************

\* Reset proposer p to a fresh "campaign" state for term new_term, dropping
\* its votes, term history, WAL and per-acceptor send positions.
RestartProposerWithTerm(p, new_term) ==
  /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign",
                                      ![p].term = new_term,
                                      ![p].votes = EmptyF,
                                      ![p].termHistory = << >>,
                                      ![p].wal = << >>,
                                      ![p].nextSendLsn = [a \in acceptors |-> NULL]]
  \* Only prop_state changes; every other variable must be listed here,
  \* otherwise the action leaves it unconstrained.
  /\ UNCHANGED <<acc_state, committed, elected_history>>

\* Proposer p loses all state, restarting.
\* For simplicity (and to reduce state space), we assume it immediately gets
\* current state from quorum q of acceptors determining the term he will request
\* to vote for.
RestartProposer(p) ==
  \E q \in AllQuorums(acceptors):
    LET new_term == Maximum({acc_state[a].term : a \in q}) + 1 IN
      RestartProposerWithTerm(p, new_term)

\* Term history of acceptor a's WAL: the one saved truncated to contain only <=
\* local FlushLsn entries. Note that FlushLsn is the end LSN of the last entry
\* (and begin LSN of the next). The mental model for non strict comparison is
\* that once proposer is elected it immediately writes log record with zero
\* length. This allows leader to commit existing log without writing any new
\* entries. For example, assume acceptor has WAL
\*   1.1, 1.2
\* written by prop with term 1; its current <last_log_term, flush_lsn>
\* is <1, 3>. Now prop with term 2 and max vote from this acc is elected.
\* Once TruncateWAL is done, <last_log_term, flush_lsn> becomes <2, 3>
\* without any new records explicitly written.
AcceptorTermHistory(a) ==
  SelectSeq(acc_state[a].termHistory, LAMBDA th_entry: th_entry.lsn <= FlushLsn(a))

\* Acceptor a immediately votes for proposer p.
Vote(p, a) ==
  /\ prop_state[p].state = "campaign"
  /\ acc_state[a].term < prop_state[p].term \* main voting condition
  /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term]
  /\ LET
       \* The vote carries acceptor's (truncated) term history and flush LSN.
       vote == [termHistory |-> AcceptorTermHistory(a), flushLsn |-> FlushLsn(a)]
     IN
       prop_state' = [prop_state EXCEPT ![p].votes = (a :> vote) @@ prop_state[p].votes]
  \* acc_state and prop_state are updated above; the rest stays as is.
  /\ UNCHANGED <<committed, elected_history>>

\* Get lastLogTerm from term history th.
LastLogTerm(th) == th[Len(th)].term

\* Compares <term, lsn> pairs: returns true if tl1 >= tl2.
TermLsnGE(tl1, tl2) ==
  /\ tl1.term >= tl2.term
  /\ (tl1.term = tl2.term => tl1.lsn >= tl2.lsn)

\* Choose max <term, lsn> pair in the non empty set of them.
MaxTermLsn(term_lsn_set) ==
  CHOOSE max_tl \in term_lsn_set: \A tl \in term_lsn_set: TermLsnGE(max_tl, tl)

\* Find acceptor with the highest vote in proposer p's votes.
MaxVoteAcc(p) ==
  CHOOSE a \in DOMAIN prop_state[p].votes:
    LET
      a_vote == prop_state[p].votes[a]
      \* <last log term, flush lsn> of the vote given by a ...
      a_vote_term_lsn == [term |-> LastLogTerm(a_vote.termHistory), lsn |-> a_vote.flushLsn]
      \* ... and the same pairs for all the votes received by p.
      vote_term_lsns == {[term |-> LastLogTerm(v.termHistory), lsn |-> v.flushLsn]: v \in Range(prop_state[p].votes)}
    IN
      a_vote_term_lsn = MaxTermLsn(vote_term_lsns)

\* Workhorse for BecomeLeader.
\* Assumes the check prop_state[p] votes is quorum has been done *outside*.
DoBecomeLeader(p) ==
  LET
    \* Find acceptor with the highest vote.
    max_vote_acc == MaxVoteAcc(p)
    max_vote == prop_state[p].votes[max_vote_acc]
    \* Proposer's history is the max vote history plus its own term, which
    \* starts at the flush LSN of the max vote.
    prop_th == Append(max_vote.termHistory, [term |-> prop_state[p].term, lsn |-> max_vote.flushLsn])
  IN
    \* We copy all log preceding proposer's term from the max vote node so
    \* make sure it is still on one term with us. This is a model
    \* simplification which can be removed, in impl we fetch WAL on demand
    \* from safekeeper which has it later. Note though that in case of on
    \* demand fetch we must check on donor not only term match, but that
    \* truncate_wal had already been done (if it is not max_vote_acc).
    /\ acc_state[max_vote_acc].term = prop_state[p].term
    /\ prop_state' = [prop_state EXCEPT ![p].state = "leader",
                                        ![p].termHistory = prop_th,
                                        ![p].wal = acc_state[max_vote_acc].wal
                     ]
    \* Count the election of this term: 1 if it is the first one, +1 otherwise.
    /\ elected_history' = Upsert(elected_history, prop_state[p].term, 1, LAMBDA c: c + 1)
    \* prop_state and elected_history are updated above; the rest stays as is.
    /\ UNCHANGED <<acc_state, committed>>

\* Proposer p gets elected.
BecomeLeader(p) ==
  /\ prop_state[p].state = "campaign"
  /\ FormsQuorum(DOMAIN prop_state[p].votes, acceptors)
  /\ DoBecomeLeader(p)

\* Acceptor a learns about elected proposer p's term. In impl it matches to
\* VoteRequest/VoteResponse exchange when leader is already elected and is not
\* interested in the vote result.
UpdateTerm(p, a) ==
  /\ prop_state[p].state = "leader"
  /\ acc_state[a].term < prop_state[p].term
  /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term]
  /\ UNCHANGED <<prop_state, committed, elected_history>>

\* Find highest common point (LSN of the first divergent record) in the logs of
\* proposer p and acceptor a. Returns <term, lsn> of the highest common point.
FindHighestCommonPoint(prop_th, acc_th, acc_flush_lsn) ==
  LET
    \* First find index of the highest common term.
    \* It must exist because we initialize th with <0, 1>.
    last_common_idx == Maximum({i \in 1..Min(Len(prop_th), Len(acc_th)): prop_th[i].term = acc_th[i].term})
    last_common_term == prop_th[last_common_idx].term
    \* Now find where it ends at both prop and acc and take min. End of term
    \* is the start of the next unless it is the last one; there it is
    \* flush_lsn in case of acceptor. In case of proposer it is the current
    \* writing position, but it can't be less than flush_lsn, so we
    \* take flush_lsn.
    acc_common_term_end == IF last_common_idx = Len(acc_th) THEN acc_flush_lsn ELSE acc_th[last_common_idx + 1].lsn
    prop_common_term_end == IF last_common_idx = Len(prop_th) THEN acc_flush_lsn ELSE prop_th[last_common_idx + 1].lsn
  IN
    [term |-> last_common_term, lsn |-> Min(acc_common_term_end, prop_common_term_end)]

\* Elected proposer p immediately truncates WAL (and sets term history) of
\* acceptor a before starting streaming. Establishes nextSendLsn for a.
\*
\* In impl this happens at each reconnection, here we also allow to do it
\* multiple times.
TruncateWal(p, a) ==
  /\ prop_state[p].state = "leader"
  /\ acc_state[a].term = prop_state[p].term
  /\ LET
       hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a))
       next_send_lsn == (a :> hcp.lsn) @@ prop_state[p].nextSendLsn
     IN
       \* Acceptor persists full history immediately; reads adjust it to the
       \* really existing wal with AcceptorTermHistory.
       /\ acc_state' = [acc_state EXCEPT ![a].termHistory = prop_state[p].termHistory,
                                         \* note: SubSeq is inclusive, hence -1.
                                         ![a].wal = SubSeq(acc_state[a].wal, 1, hcp.lsn - 1)
                       ]
       /\ prop_state' = [prop_state EXCEPT ![p].nextSendLsn = next_send_lsn]
  /\ UNCHANGED <<committed, elected_history>>

\* Append new log entry to elected proposer
NewEntry(p) ==
  /\ prop_state[p].state = "leader"
  /\ LET
       \* entry consists only of term, index serves as LSN.
       new_entry == prop_state[p].term
     IN
       /\ prop_state' = [prop_state EXCEPT ![p].wal = Append(prop_state[p].wal, new_entry)]
       /\ UNCHANGED <<acc_state, committed, elected_history>>

\* Immediately append next entry from elected proposer to acceptor a.
AppendEntry(p, a) ==
  /\ prop_state[p].state = "leader"
  /\ acc_state[a].term = prop_state[p].term
  /\ prop_state[p].nextSendLsn[a] /= NULL \* did TruncateWal
  /\ prop_state[p].nextSendLsn[a] <= Len(prop_state[p].wal) \* have smth to send
  /\ LET
       send_lsn == prop_state[p].nextSendLsn[a]
       entry == prop_state[p].wal[send_lsn]
       \* Since message delivery is instant we don't check that send_lsn follows
       \* the last acc record, it must always be true.
     IN
       /\ prop_state' = [prop_state EXCEPT ![p].nextSendLsn[a] = send_lsn + 1]
       /\ acc_state' = [acc_state EXCEPT ![a].wal = Append(acc_state[a].wal, entry)]
       /\ UNCHANGED <<committed, elected_history>>

\* LSN where elected proposer p starts writing its records.
PropStartLsn(p) ==
  IF prop_state[p].state = "leader" THEN prop_state[p].termHistory[Len(prop_state[p].termHistory)].lsn ELSE NULL

\* LSN which can be committed by proposer p using min quorum q (check that q
\* forms quorum must have been done outside). NULL if there is none.
QuorumCommitLsn(p, q) ==
  IF
    /\ prop_state[p].state = "leader"
    /\ \A a \in q:
         \* Without explicit responses to appends this ensures that append
         \* up to FlushLsn has been accepted.
         /\ acc_state[a].term = prop_state[p].term
         \* nextSendLsn existence means TruncateWal has happened, it ensures
         \* acceptor's WAL (and FlushLsn) are from proper proposer's history.
         \* Alternatively we could compare LastLogTerm here, but that's closer to
         \* what we do in the impl (we check flushLsn in AppendResponse, but
         \* AppendRequest is processed only if HandleElected handling was good).
         /\ prop_state[p].nextSendLsn[a] /= NULL
  THEN
    \* Now find the LSN present on all the quorum.
    LET quorum_lsn == Minimum({FlushLsn(a): a \in q}) IN
      \* This is the basic Raft rule of not committing entries from previous
      \* terms except along with current term entry (commit them only when
      \* quorum recovers, i.e. last_log_term on it reaches leader's term).
      IF quorum_lsn >= PropStartLsn(p) THEN
        quorum_lsn
      ELSE
        NULL
  ELSE
    NULL

\* Commit all entries on proposer p with record lsn < commit_lsn.
DoCommitEntries(p, commit_lsn) ==
  /\ committed' = committed \cup {[term |-> prop_state[p].wal[lsn], lsn |-> lsn]: lsn \in 1..(commit_lsn - 1)}
  /\ UNCHANGED <<prop_state, acc_state, elected_history>>

\* Proposer p commits all entries it can using some quorum. Note that unlike
\* will62794/logless-reconfig this allows to commit entries from previous terms
\* (when conditions for that are met).
CommitEntries(p) ==
  /\ prop_state[p].state = "leader"
  \* Using min quorums here is better because 1) QuorumCommitLsn for
  \* simplicity checks min across all accs in q. 2) it probably makes
  \* evaluation faster.
  /\ \E q \in AllMinQuorums(acceptors):
       LET commit_lsn == QuorumCommitLsn(p, q) IN
         /\ commit_lsn /= NULL
         /\ DoCommitEntries(p, commit_lsn)

\*******************************************************************************
\* Final spec
\*******************************************************************************

Next ==
  \/ \E p \in proposers: RestartProposer(p)
  \/ \E p \in proposers: \E a \in acceptors: Vote(p, a)
  \/ \E p \in proposers: BecomeLeader(p)
  \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a)
  \/ \E p \in proposers: \E a \in acceptors: TruncateWal(p, a)
  \/ \E p \in proposers: NewEntry(p)
  \/ \E p \in proposers: \E a \in acceptors: AppendEntry(p, a)
  \/ \E p \in proposers: CommitEntries(p)

\* The stuttering subscript must name all the variables of the module.
Spec == Init /\ [][Next]_<<prop_state, acc_state, committed, elected_history>>

\********************************************************************************
\* Invariants
\********************************************************************************

\* Lighter version of ElectionSafetyFull which doesn't require elected_history.
ElectionSafety ==
  \A p1, p2 \in proposers:
    (/\ prop_state[p1].state = "leader"
     /\ prop_state[p2].state = "leader"
     /\ prop_state[p1].term = prop_state[p2].term) => (p1 = p2)

\* Single term must never be elected more than once.
ElectionSafetyFull ==
  \A term \in DOMAIN elected_history:
    elected_history[term] <= 1

\* Log is expected to be monotonic by <term, lsn> comparison, i.e. terms of
\* the entries never decrease as LSN grows. This is not true
\* in variants of multi Paxos, but in Raft (and here) it is.
LogIsMonotonic ==
  \A a \in acceptors:
    \A i, j \in DOMAIN acc_state[a].wal:
      (i > j) => (acc_state[a].wal[i] >= acc_state[a].wal[j])

\* Main invariant: If two entries are committed at the same LSN, they must be
\* the same entry.
LogSafety ==
  \A c1, c2 \in committed: (c1.lsn = c2.lsn) => (c1 = c2)

\********************************************************************************
\* Invariants which don't need to hold, but useful for playing/debugging.
\********************************************************************************

\* Limits term of elected proposers
MaxTerm == \A p \in proposers: (prop_state[p].state = "leader" => prop_state[p].term < 2)

\* Limits length of acceptors' WAL.
MaxAccWalLen == \A a \in acceptors: Len(acc_state[a].wal) < 2

\* Limits max number of committed entries. That way we can check that we're
\* actually committing something.
MaxCommitLsn == Cardinality(committed) < 2

\* How many records with different terms can be removed in single WAL
\* truncation.
MaxTruncatedTerms ==
  \A p \in proposers: \A a \in acceptors:
    (/\ prop_state[p].state = "leader"
     /\ prop_state[p].term = acc_state[a].term) =>
       LET
         hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a))
         truncated_lsns == {lsn \in DOMAIN acc_state[a].wal: lsn >= hcp.lsn}
         truncated_records_terms == {acc_state[a].wal[lsn]: lsn \in truncated_lsns}
       IN
         Cardinality(truncated_records_terms) < 2

\* Check that TruncateWal never deletes committed record.
\* It might seem that this should be an invariant, but it is not.
\* With 5 nodes, it is legit to truncate record which had been
\* globally committed: e.g. nodes abc can commit record of term 1 in
\* term 3, and after that leader of term 2 can delete such record
\* on d. On 10 cores TLC can find such a trace in ~7 hours.
CommittedNotTruncated ==
  \A p \in proposers: \A a \in acceptors:
    (/\ prop_state[p].state = "leader"
     /\ prop_state[p].term = acc_state[a].term) =>
       LET
         hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a))
         truncated_lsns == {lsn \in DOMAIN acc_state[a].wal: lsn >= hcp.lsn}
         truncated_records == {[term |-> acc_state[a].wal[lsn], lsn |-> lsn]: lsn \in truncated_lsns}
       IN
         \A r \in truncated_records: r \notin committed

====


================================================
FILE: safekeeper/spec/modelcheck.sh
================================================
#!/bin/bash

# Usage: ./modelcheck.sh <config_file> <spec_file>, e.g.
# ./modelcheck.sh models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg MCProposerAcceptorStatic.tla
# ./modelcheck.sh models/MCProposerAcceptorReconfig_p2_a3_t3_l3_c3.cfg MCProposerAcceptorReconfig.tla

# Both arguments are required: without them TLC would be started with an empty
# config/spec and the log file name would be meaningless.
if [ "$#" -lt 2 ]; then
    echo "usage: $0 <config.cfg> <spec.tla>" >&2
    exit 1
fi

CONFIG=$1
SPEC=$2

MEM=7G
TOOLSPATH="/opt/TLA+Toolbox/tla2tools.jar"

mkdir -p "tlc-results"
CONFIG_FILE=$(basename -- "$CONFIG")
# `date -u` is the portable spelling: `--utc` is a GNU extension which the
# BSD/macOS date rejects, while this script is meant to run on Mac too.
outfilename="$SPEC-${CONFIG_FILE}-$(date -u +%Y-%m-%d--%H-%M-%S)".log
outfile="tlc-results/$outfilename"
echo "saving results to $outfile"
touch "$outfile"

# Save some info about the run.
GIT_REV=$(git rev-parse --short HEAD)
INFO=$(uname -a)

# First for Linux, second for Mac. Each probe exists on one platform only, so
# silence the "command not found" noise of the other pair; the corresponding
# values are just left empty in the log.
CPUNAMELinux=$(lscpu 2>/dev/null | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1')
CPUCORESLinux=$(nproc 2>/dev/null)
CPUNAMEMac=$(sysctl -n machdep.cpu.brand_string 2>/dev/null)
CPUCORESMac=$(sysctl -n machdep.cpu.thread_count 2>/dev/null)

# Log header: run info followed by the verbatim model config.
{
    echo "git revision: $GIT_REV"
    echo "Platform: $INFO"
    echo "CPU Info Linux: $CPUNAMELinux"
    echo "CPU Cores Linux: $CPUCORESLinux"
    echo "CPU Info Mac: $CPUNAMEMac"
    echo "CPU Cores Mac: $CPUCORESMac"
    echo "Spec: $SPEC"
    echo "Config: $CONFIG"
    echo "----"
    cat "$CONFIG"
    echo ""
    echo "----"
    echo ""
} >> "$outfile"

# see
# https://lamport.azurewebsites.net/tla/current-tools.pdf
# for TLC options.
# OffHeapDiskFPSet is the optimal fingerprint set implementation
# https://docs.tlapl.us/codebase:architecture#fingerprint_sets_fpsets
#
# Add -simulate to run in infinite simulation mode.
# -coverage 1 is useful for profiling (check how many times actions are taken).
java -Xmx$MEM -XX:MaxDirectMemorySize=$MEM -XX:+UseParallelGC -Dtlc2.tool.fp.FPSet.impl=tlc2.tool.fp.OffHeapDiskFPSet \ -cp "${TOOLSPATH}" tlc2.TLC $SPEC -config $CONFIG -workers auto -gzip | tee -a $outfile ================================================ FILE: safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg ================================================ CONSTANTS NULL = NULL proposers = {p1, p2} acceptors = {a1, a2} max_term = 2 max_entries = 2 max_generation = 3 SPECIFICATION Spec CONSTRAINT StateConstraint INVARIANT TypeOk ConfigSafety ElectionSafetyFull LogIsMonotonic LogSafety \* As its comment explains generally it is not expected to hold, but \* in such small model it is true. CommittedNotTruncated SYMMETRY ProposerAcceptorSymmetry CHECK_DEADLOCK FALSE ALIAS Alias ================================================ FILE: safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg ================================================ CONSTANTS NULL = NULL proposers = {p1, p2} acceptors = {a1, a2} max_term = 2 max_entries = 2 max_generation = 5 SPECIFICATION Spec CONSTRAINT StateConstraint INVARIANT TypeOk ConfigSafety ElectionSafetyFull LogIsMonotonic LogSafety CommittedNotTruncated SYMMETRY ProposerAcceptorSymmetry CHECK_DEADLOCK FALSE ALIAS Alias ================================================ FILE: safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a3_t2_l2_c3.cfg ================================================ CONSTANTS NULL = NULL proposers = {p1, p2} acceptors = {a1, a2, a3} max_term = 2 max_entries = 2 max_generation = 3 SPECIFICATION Spec CONSTRAINT StateConstraint INVARIANT TypeOk ConfigSafety ElectionSafetyFull LogIsMonotonic LogSafety CommittedNotTruncated SYMMETRY ProposerAcceptorSymmetry CHECK_DEADLOCK FALSE ALIAS Alias ================================================ FILE: safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a4_t2_l2_c3.cfg ================================================ CONSTANTS NULL = NULL 
proposers = {p1, p2} acceptors = {a1, a2, a3, a4} max_term = 2 max_entries = 2 max_generation = 3 SPECIFICATION Spec CONSTRAINT StateConstraint INVARIANT TypeOk ElectionSafetyFull LogIsMonotonic LogSafety CommittedNotTruncated SYMMETRY ProposerAcceptorSymmetry CHECK_DEADLOCK FALSE ALIAS Alias ================================================ FILE: safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg ================================================ \* A very small model just to play. CONSTANTS NULL = NULL proposers = {p1, p2} acceptors = {a1, a2, a3} max_term = 2 max_entries = 2 SPECIFICATION Spec CONSTRAINT StateConstraint INVARIANT TypeOk ElectionSafetyFull LogIsMonotonic LogSafety CommittedNotTruncated SYMMETRY ProposerAcceptorSymmetry CHECK_DEADLOCK FALSE ALIAS Alias ================================================ FILE: safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg ================================================ \* A model next to the smallest one. CONSTANTS NULL = NULL proposers = {p1, p2} acceptors = {a1, a2, a3} max_term = 3 max_entries = 2 SPECIFICATION Spec CONSTRAINT StateConstraint INVARIANT TypeOk ElectionSafetyFull LogIsMonotonic LogSafety CommittedNotTruncated SYMMETRY ProposerAcceptorSymmetry CHECK_DEADLOCK FALSE ALIAS Alias ================================================ FILE: safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg ================================================ CONSTANTS NULL = NULL proposers = {p1, p2} acceptors = {a1, a2, a3} max_term = 3 max_entries = 3 SPECIFICATION Spec CONSTRAINT StateConstraint INVARIANT TypeOk ElectionSafety LogIsMonotonic LogSafety CommittedNotTruncated SYMMETRY ProposerAcceptorSymmetry CHECK_DEADLOCK FALSE ALIAS Alias ================================================ FILE: safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg ================================================ CONSTANTS NULL = NULL proposers = {p1, p2} acceptors = {a1, a2, a3} max_term = 4 
max_entries = 4 SPECIFICATION Spec CONSTRAINT StateConstraint INVARIANT TypeOk ElectionSafety LogIsMonotonic LogSafety CommittedNotTruncated SYMMETRY ProposerAcceptorSymmetry CHECK_DEADLOCK FALSE ALIAS Alias ================================================ FILE: safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg ================================================ CONSTANTS NULL = NULL proposers = {p1, p2} acceptors = {a1, a2, a3, a4, a5} max_term = 2 max_entries = 2 SPECIFICATION Spec CONSTRAINT StateConstraint INVARIANT TypeOk ElectionSafety LogIsMonotonic LogSafety SYMMETRY ProposerAcceptorSymmetry CHECK_DEADLOCK FALSE ALIAS Alias ================================================ FILE: safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg ================================================ CONSTANTS NULL = NULL proposers = {p1, p2} acceptors = {a1, a2, a3, a4, a5} max_term = 3 max_entries = 3 SPECIFICATION Spec CONSTRAINT StateConstraint INVARIANT TypeOk ElectionSafety LogIsMonotonic LogSafety SYMMETRY ProposerAcceptorSymmetry CHECK_DEADLOCK FALSE ALIAS Alias ================================================ FILE: safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg ================================================ [File too large to display: 275 B] ================================================ FILE: safekeeper/spec/readme.md ================================================ [File too large to display: 622 B] ================================================ FILE: safekeeper/spec/remove_interm_progress.awk ================================================ # Print all lines, but thin out lines starting with Progress: # leave only first and last 5 ones in the beginning, and only 1 of 1440 # of others (once a day). # Also remove checkpointing logs. 
# Buffer the whole input: the number of Progress lines must be known before
# deciding which of them belong to the last 5.
{ lines[NR] = $0 }
$0 ~ /^Progress/ { ++pcount }
END {
    progress_idx = 0
    for (i = 1; i <= NR; i++) {
        if (lines[i] ~ /^Progress/) {
            # Keep the first 5, the last 5 and every 1440th Progress line.
            if (progress_idx < 5 || progress_idx >= pcount - 5 || progress_idx % 1440 == 0) {
                print lines[i]
            }
            progress_idx++
        }
        # Checkpointing lines are dropped altogether.
        else if (lines[i] ~ /^Checkpointing/) {}
        else {
            print lines[i]
        }
    }
}


================================================
FILE: safekeeper/spec/remove_interm_progress.sh
================================================
#!/bin/bash

# Thin out the TLC log given as the first argument, writing <log>.thin next to
# it. The argument is quoted so that paths with spaces work.
awk -f remove_interm_progress.awk "$1" > "$1.thin"


================================================
FILE: safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg-2024-12-11--04-24-12.log
================================================
git revision: 9e386917a
Platform: Linux neon-dev-arm64-1 6.8.0-49-generic #49-Ubuntu SMP PREEMPT_DYNAMIC Sun Nov 3 21:21:58 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux
CPU Info Linux: Neoverse-N1
CPU Cores Linux: 80
CPU Info Mac:
CPU Cores Mac:
Spec: MCProposerAcceptorReconfig.tla
Config: models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg
----
CONSTANTS
NULL = NULL
proposers = {p1, p2}
acceptors = {a1, a2}
max_term = 2
max_entries = 2
max_generation = 3
SPECIFICATION Spec
CONSTRAINT StateConstraint
INVARIANT
TypeOk
ElectionSafetyFull
LogIsMonotonic
LogSafety
\* CommittedNotTruncated
SYMMETRY ProposerAcceptorSymmetry
CHECK_DEADLOCK FALSE
ALIAS Alias

----

TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71)
Running breadth-first search Model-Checking with fp 99 and seed -9189733667206762985 with 35 workers on 80 cores with 27307MB heap and 30720MB offheap memory [pid: 391272] (Linux 6.8.0-49-generic aarch64, Ubuntu 21.0.5 x86_64, OffHeapDiskFPSet, DiskStateQueue).
Parsing file /home/arseny/neon2/safekeeper/spec/MCProposerAcceptorReconfig.tla Parsing file /tmp/tlc-3211535543066978921/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorReconfig.tla Parsing file /tmp/tlc-3211535543066978921/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) Parsing file /tmp/tlc-3211535543066978921/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) Parsing file /tmp/tlc-3211535543066978921/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) Parsing file /tmp/tlc-3211535543066978921/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) Parsing file /tmp/tlc-3211535543066978921/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorStatic.tla Parsing file /tmp/tlc-3211535543066978921/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) Semantic processing of module Naturals Semantic processing of module Sequences Semantic processing of module FiniteSets Semantic processing of module TLC Semantic processing of module Integers Semantic processing of module ProposerAcceptorStatic Semantic processing of module ProposerAcceptorReconfig Semantic processing of module TLCExt Semantic processing of module _TLCTrace Semantic processing of module MCProposerAcceptorReconfig Starting... (2024-12-11 04:24:13) Computing initial states... Finished computing initial states: 2 states generated, with 1 of them distinct at 2024-12-11 04:24:15. Progress(16) at 2024-12-11 04:24:18: 1,427,589 states generated (1,427,589 s/min), 142,472 distinct states found (142,472 ds/min), 47,162 states left on queue. Model checking completed. No error has been found. 
Estimates of the probability that TLC did not check all reachable states because two distinct states had the same fingerprint: calculated (optimistic): val = 1.0E-6 based on the actual fingerprints: val = 4.2E-8 17746857 states generated, 1121659 distinct states found, 0 states left on queue. The depth of the complete state graph search is 37. The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 9 and the 95th percentile is 3). Finished in 33s at (2024-12-11 04:24:46) ================================================ FILE: safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg-2024-12-11--04-26-11.log ================================================ git revision: 9e386917a Platform: Linux neon-dev-arm64-1 6.8.0-49-generic #49-Ubuntu SMP PREEMPT_DYNAMIC Sun Nov 3 21:21:58 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux CPU Info Linux: Neoverse-N1 CPU Cores Linux: 80 CPU Info Mac: CPU Cores Mac: Spec: MCProposerAcceptorReconfig.tla Config: models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg ---- CONSTANTS NULL = NULL proposers = {p1, p2} acceptors = {a1, a2} max_term = 2 max_entries = 2 max_generation = 5 SPECIFICATION Spec CONSTRAINT StateConstraint INVARIANT TypeOk ElectionSafetyFull LogIsMonotonic LogSafety \* CommittedNotTruncated SYMMETRY ProposerAcceptorSymmetry CHECK_DEADLOCK FALSE ALIAS Alias ---- TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) Running breadth-first search Model-Checking with fp 114 and seed -8099467489737745861 with 35 workers on 80 cores with 27307MB heap and 30720MB offheap memory [pid: 392020] (Linux 6.8.0-49-generic aarch64, Ubuntu 21.0.5 x86_64, OffHeapDiskFPSet, DiskStateQueue). 
Parsing file /home/arseny/neon2/safekeeper/spec/MCProposerAcceptorReconfig.tla Parsing file /tmp/tlc-11757875725969857497/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorReconfig.tla Parsing file /tmp/tlc-11757875725969857497/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) Parsing file /tmp/tlc-11757875725969857497/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) Parsing file /tmp/tlc-11757875725969857497/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) Parsing file /tmp/tlc-11757875725969857497/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) Parsing file /tmp/tlc-11757875725969857497/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorStatic.tla Parsing file /tmp/tlc-11757875725969857497/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) Semantic processing of module Naturals Semantic processing of module Sequences Semantic processing of module FiniteSets Semantic processing of module TLC Semantic processing of module Integers Semantic processing of module ProposerAcceptorStatic Semantic processing of module ProposerAcceptorReconfig Semantic processing of module TLCExt Semantic processing of module _TLCTrace Semantic processing of module MCProposerAcceptorReconfig Starting... (2024-12-11 04:26:12) Computing initial states... Finished computing initial states: 2 states generated, with 1 of them distinct at 2024-12-11 04:26:14. Progress(14) at 2024-12-11 04:26:17: 1,519,385 states generated (1,519,385 s/min), 231,263 distinct states found (231,263 ds/min), 121,410 states left on queue. 
Progress(20) at 2024-12-11 04:27:17: 42,757,204 states generated (41,237,819 s/min), 4,198,386 distinct states found (3,967,123 ds/min), 1,308,109 states left on queue. Progress(22) at 2024-12-11 04:28:17: 83,613,929 states generated (40,856,725 s/min), 7,499,873 distinct states found (3,301,487 ds/min), 1,929,464 states left on queue. Progress(23) at 2024-12-11 04:29:17: 124,086,758 states generated (40,472,829 s/min), 10,569,712 distinct states found (3,069,839 ds/min), 2,386,988 states left on queue. Progress(24) at 2024-12-11 04:30:17: 163,412,538 states generated (39,325,780 s/min), 13,314,303 distinct states found (2,744,591 ds/min), 2,610,637 states left on queue. Progress(25) at 2024-12-11 04:31:17: 202,643,708 states generated (39,231,170 s/min), 15,960,583 distinct states found (2,646,280 ds/min), 2,759,681 states left on queue. Progress(26) at 2024-12-11 04:32:17: 240,681,633 states generated (38,037,925 s/min), 18,443,440 distinct states found (2,482,857 ds/min), 2,852,177 states left on queue. Progress(27) at 2024-12-11 04:33:17: 278,559,134 states generated (37,877,501 s/min), 20,878,067 distinct states found (2,434,627 ds/min), 2,904,400 states left on queue. Progress(28) at 2024-12-11 04:34:17: 316,699,911 states generated (38,140,777 s/min), 23,212,229 distinct states found (2,334,162 ds/min), 2,864,969 states left on queue. 
================================================ FILE: safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log ================================================ [File too large to display: 3.1 KB] ================================================ FILE: safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log ================================================ git revision: bcbff084a Platform: Linux nonlibrem 6.10.11-amd64 #1 SMP PREEMPT_DYNAMIC Debian 6.10.11-1 (2024-09-22) x86_64 GNU/Linux CPU Info Linux: 13th Gen Intel(R) Core(TM) i7-1355U CPU Cores Linux: 10 CPU Info Mac: CPU Cores Mac: Spec: MCProposerAcceptorStatic.tla Config: models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg ---- \* A model next to the smallest one. CONSTANTS NULL = NULL proposers = {p1, p2} acceptors = {a1, a2, a3} max_term = 3 max_entries = 2 SPECIFICATION Spec CONSTRAINT StateConstraint INVARIANT TypeOk ElectionSafety LogIsMonotonic LogSafety CommittedNotTruncated SYMMETRY ProposerAcceptorSymmetry CHECK_DEADLOCK FALSE ALIAS Alias ---- TLC2 Version 2.20 of Day Month 20?? (rev: cc65eef) Running breadth-first search Model-Checking with fp 41 and seed -3061068726727581619 with 10 workers on 10 cores with 6372MB heap and 7168MB offheap memory [pid: 1250346] (Linux 6.10.11-amd64 amd64, Debian 21.0.5 x86_64, OffHeapDiskFPSet, DiskStateQueue). 
Parsing file /home/ars/neon/neon/safekeeper/spec/MCProposerAcceptorStatic.tla Parsing file /tmp/tlc-3023124431504466774/TLC.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) Parsing file /home/ars/neon/neon/safekeeper/spec/ProposerAcceptorStatic.tla Parsing file /tmp/tlc-3023124431504466774/_TLCTrace.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) Parsing file /tmp/tlc-3023124431504466774/Integers.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) Parsing file /tmp/tlc-3023124431504466774/Sequences.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) Parsing file /tmp/tlc-3023124431504466774/FiniteSets.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) Parsing file /tmp/tlc-3023124431504466774/Naturals.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) Parsing file /tmp/tlc-3023124431504466774/TLCExt.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) Semantic processing of module Naturals Semantic processing of module Sequences Semantic processing of module FiniteSets Semantic processing of module TLC Semantic processing of module Integers Semantic processing of module ProposerAcceptorStatic Semantic processing of module TLCExt Semantic processing of module _TLCTrace Semantic processing of module MCProposerAcceptorStatic Starting... (2024-11-15 12:09:59) Computing initial states... Finished computing initial states: 1 distinct state generated at 2024-11-15 12:10:00. Progress(19) at 2024-11-15 12:10:03: 464,696 states generated (464,696 s/min), 57,859 distinct states found (57,859 ds/min), 21,435 states left on queue. Progress(26) at 2024-11-15 12:11:03: 8,813,399 states generated (8,348,703 s/min), 877,254 distinct states found (819,395 ds/min), 214,794 states left on queue. 
Progress(27) at 2024-11-15 12:12:03: 16,121,858 states generated (7,308,459 s/min), 1,464,707 distinct states found (587,453 ds/min), 274,230 states left on queue. Progress(29) at 2024-11-15 12:13:03: 23,073,903 states generated (6,952,045 s/min), 1,948,802 distinct states found (484,095 ds/min), 263,697 states left on queue. Progress(31) at 2024-11-15 12:14:03: 29,740,681 states generated (6,666,778 s/min), 2,331,052 distinct states found (382,250 ds/min), 185,484 states left on queue. Progress(34) at 2024-11-15 12:15:03: 36,085,876 states generated (6,345,195 s/min), 2,602,370 distinct states found (271,318 ds/min), 31,659 states left on queue. Model checking completed. No error has been found. Estimates of the probability that TLC did not check all reachable states because two distinct states had the same fingerprint: calculated (optimistic): val = 4.9E-6 based on the actual fingerprints: val = 6.9E-7 36896322 states generated, 2623542 distinct states found, 0 states left on queue. The depth of the complete state graph search is 39. The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 7 and the 95th percentile is 3). Finished in 05min 14s at (2024-11-15 12:15:13) ================================================ FILE: safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log ================================================ [File too large to display: 4.9 KB] ================================================ FILE: safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log ================================================ # Shows LogSafety violation when "don't commit separately entries from previous terms" check is disabled. 
git revision: 4f1ee6331 Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux CPU Info Linux: Neoverse-N1 CPU Cores Linux: 80 CPU Info Mac: CPU Cores Mac: Spec: MCProposerAcceptorStatic.tla Config: models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg ---- CONSTANTS NULL = NULL proposers = {p1, p2} acceptors = {a1, a2, a3} max_term = 4 max_entries = 4 SPECIFICATION Spec CONSTRAINT StateConstraint INVARIANT TypeOk ElectionSafety LogIsMonotonic LogSafety SYMMETRY ProposerAcceptorSymmetry CHECK_DEADLOCK FALSE ALIAS Alias ---- TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) Running breadth-first search Model-Checking with fp 12 and seed -5379034126224420237 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 52295] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla Parsing file /tmp/tlc-4533438058229992850/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla Parsing file /tmp/tlc-4533438058229992850/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) Parsing file /tmp/tlc-4533438058229992850/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) Parsing file /tmp/tlc-4533438058229992850/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) Parsing file /tmp/tlc-4533438058229992850/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) Parsing file /tmp/tlc-4533438058229992850/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) Parsing file /tmp/tlc-4533438058229992850/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) 
Semantic processing of module Naturals Semantic processing of module Sequences Semantic processing of module FiniteSets Semantic processing of module TLC Semantic processing of module Integers Semantic processing of module ProposerAcceptorStatic Semantic processing of module TLCExt Semantic processing of module _TLCTrace Semantic processing of module MCProposerAcceptorStatic Starting... (2024-11-06 14:20:26) Computing initial states... Finished computing initial states: 1 distinct state generated at 2024-11-06 14:20:29. Progress(20) at 2024-11-06 14:20:32: 1,011,898 states generated (1,011,898 s/min), 140,947 distinct states found (140,947 ds/min), 60,535 states left on queue. Progress(26) at 2024-11-06 14:21:32: 30,146,518 states generated (29,134,620 s/min), 3,742,736 distinct states found (3,601,789 ds/min), 1,438,779 states left on queue. Progress(27) at 2024-11-06 14:22:32: 59,362,708 states generated (29,216,190 s/min), 7,210,233 distinct states found (3,467,497 ds/min), 2,708,295 states left on queue. Progress(28) at 2024-11-06 14:23:32: 88,589,291 states generated (29,226,583 s/min), 10,552,781 distinct states found (3,342,548 ds/min), 3,874,296 states left on queue. Progress(29) at 2024-11-06 14:24:32: 117,894,209 states generated (29,304,918 s/min), 13,932,498 distinct states found (3,379,717 ds/min), 5,069,960 states left on queue. Progress(29) at 2024-11-06 14:25:32: 147,338,882 states generated (29,444,673 s/min), 17,180,069 distinct states found (3,247,571 ds/min), 6,146,371 states left on queue. Progress(29) at 2024-11-06 14:26:32: 176,498,135 states generated (29,159,253 s/min), 20,547,926 distinct states found (3,367,857 ds/min), 7,338,835 states left on queue. Progress(30) at 2024-11-06 14:27:32: 205,957,044 states generated (29,458,909 s/min), 23,661,090 distinct states found (3,113,164 ds/min), 8,293,570 states left on queue. 
Progress(30) at 2024-11-06 14:28:32: 235,390,133 states generated (29,433,089 s/min), 26,892,306 distinct states found (3,231,216 ds/min), 9,369,229 states left on queue. Progress(30) at 2024-11-06 14:29:32: 264,571,938 states generated (29,181,805 s/min), 30,176,971 distinct states found (3,284,665 ds/min), 10,493,429 states left on queue. Progress(31) at 2024-11-06 14:30:32: 293,928,191 states generated (29,356,253 s/min), 33,296,160 distinct states found (3,119,189 ds/min), 11,463,686 states left on queue. Progress(31) at 2024-11-06 14:31:32: 323,436,668 states generated (29,508,477 s/min), 36,347,973 distinct states found (3,051,813 ds/min), 12,365,578 states left on queue. Progress(31) at 2024-11-06 14:32:32: 352,943,790 states generated (29,507,122 s/min), 39,465,244 distinct states found (3,117,271 ds/min), 13,349,544 states left on queue. Progress(31) at 2024-11-06 14:33:32: 382,292,863 states generated (29,349,073 s/min), 42,654,621 distinct states found (3,189,377 ds/min), 14,384,363 states left on queue. Progress(31) at 2024-11-06 14:34:32: 411,385,854 states generated (29,092,991 s/min), 45,941,145 distinct states found (3,286,524 ds/min), 15,509,450 states left on queue. Progress(31) at 2024-11-06 14:35:32: 440,738,756 states generated (29,352,902 s/min), 48,984,566 distinct states found (3,043,421 ds/min), 16,419,882 states left on queue. Progress(32) at 2024-11-06 14:36:32: 470,251,558 states generated (29,512,802 s/min), 51,925,693 distinct states found (2,941,127 ds/min), 17,211,457 states left on queue. Progress(32) at 2024-11-06 14:37:32: 499,714,013 states generated (29,462,455 s/min), 54,955,581 distinct states found (3,029,888 ds/min), 18,114,624 states left on queue. Progress(32) at 2024-11-06 14:38:32: 529,254,608 states generated (29,540,595 s/min), 57,938,914 distinct states found (2,983,333 ds/min), 18,996,128 states left on queue. 
Progress(32) at 2024-11-06 14:39:32: 558,774,398 states generated (29,519,790 s/min), 61,072,943 distinct states found (3,134,029 ds/min), 19,975,689 states left on queue. Progress(32) at 2024-11-06 14:40:32: 588,134,665 states generated (29,360,267 s/min), 64,148,888 distinct states found (3,075,945 ds/min), 20,922,407 states left on queue. Progress(32) at 2024-11-06 14:41:32: 617,464,374 states generated (29,329,709 s/min), 67,306,855 distinct states found (3,157,967 ds/min), 21,928,799 states left on queue. Progress(32) at 2024-11-06 14:42:32: 646,525,281 states generated (29,060,907 s/min), 70,425,194 distinct states found (3,118,339 ds/min), 22,895,971 states left on queue. Progress(32) at 2024-11-06 14:43:32: 676,054,893 states generated (29,529,612 s/min), 73,351,905 distinct states found (2,926,711 ds/min), 23,703,779 states left on queue. Progress(33) at 2024-11-06 14:44:32: 705,581,782 states generated (29,526,889 s/min), 76,200,615 distinct states found (2,848,710 ds/min), 24,414,094 states left on queue. Progress(33) at 2024-11-06 14:45:32: 735,069,836 states generated (29,488,054 s/min), 79,168,244 distinct states found (2,967,629 ds/min), 25,255,224 states left on queue. Progress(33) at 2024-11-06 14:46:32: 764,659,188 states generated (29,589,352 s/min), 82,024,430 distinct states found (2,856,186 ds/min), 26,011,047 states left on queue. Progress(33) at 2024-11-06 14:47:32: 794,276,423 states generated (29,617,235 s/min), 84,974,312 distinct states found (2,949,882 ds/min), 26,868,750 states left on queue. Progress(33) at 2024-11-06 14:48:32: 823,875,831 states generated (29,599,408 s/min), 88,004,386 distinct states found (3,030,074 ds/min), 27,771,984 states left on queue. Progress(33) at 2024-11-06 14:49:32: 853,138,894 states generated (29,263,063 s/min), 91,006,890 distinct states found (3,002,504 ds/min), 28,636,661 states left on queue. 
Checkpointing of run states/24-11-06-14-20-25.868 Checkpointing completed at (2024-11-06 14:50:32) Progress(33) at 2024-11-06 14:50:32: 882,514,167 states generated (29,375,273 s/min), 94,011,000 distinct states found (3,004,110 ds/min), 29,534,516 states left on queue. Progress(33) at 2024-11-06 14:51:32: 911,838,377 states generated (29,324,210 s/min), 97,108,937 distinct states found (3,097,937 ds/min), 30,498,587 states left on queue. Progress(33) at 2024-11-06 14:52:32: 940,646,920 states generated (28,808,543 s/min), 100,248,865 distinct states found (3,139,928 ds/min), 31,472,191 states left on queue. Progress(33) at 2024-11-06 14:53:32: 970,074,175 states generated (29,427,255 s/min), 103,170,815 distinct states found (2,921,950 ds/min), 32,265,691 states left on queue. Progress(33) at 2024-11-06 14:54:32: 999,627,974 states generated (29,553,799 s/min), 106,004,823 distinct states found (2,834,008 ds/min), 33,009,618 states left on queue. Progress(34) at 2024-11-06 14:55:32: 1,029,148,983 states generated (29,521,009 s/min), 108,740,783 distinct states found (2,735,960 ds/min), 33,616,222 states left on queue. Progress(34) at 2024-11-06 14:56:32: 1,058,582,001 states generated (29,433,018 s/min), 111,612,965 distinct states found (2,872,182 ds/min), 34,375,212 states left on queue. Progress(34) at 2024-11-06 14:57:32: 1,088,123,602 states generated (29,541,601 s/min), 114,464,196 distinct states found (2,851,231 ds/min), 35,116,195 states left on queue. Progress(34) at 2024-11-06 14:58:32: 1,117,684,936 states generated (29,561,334 s/min), 117,252,198 distinct states found (2,788,002 ds/min), 35,817,205 states left on queue. Progress(34) at 2024-11-06 14:59:32: 1,147,356,249 states generated (29,671,313 s/min), 120,014,476 distinct states found (2,762,278 ds/min), 36,517,255 states left on queue. 
Progress(34) at 2024-11-06 15:00:32: 1,176,921,098 states generated (29,564,849 s/min), 122,859,312 distinct states found (2,844,836 ds/min), 37,291,096 states left on queue. Progress(34) at 2024-11-06 15:01:32: 1,206,454,440 states generated (29,533,342 s/min), 125,830,942 distinct states found (2,971,630 ds/min), 38,147,762 states left on queue. Progress(34) at 2024-11-06 15:02:32: 1,235,721,673 states generated (29,267,233 s/min), 128,869,493 distinct states found (3,038,551 ds/min), 39,035,481 states left on queue. Progress(34) at 2024-11-06 15:03:32: 1,265,097,779 states generated (29,376,106 s/min), 131,669,552 distinct states found (2,800,059 ds/min), 39,746,864 states left on queue. Progress(34) at 2024-11-06 15:04:32: 1,294,408,098 states generated (29,310,319 s/min), 134,604,630 distinct states found (2,935,078 ds/min), 40,584,235 states left on queue. Progress(34) at 2024-11-06 15:05:32: 1,323,792,755 states generated (29,384,657 s/min), 137,579,390 distinct states found (2,974,760 ds/min), 41,446,478 states left on queue. Progress(34) at 2024-11-06 15:06:32: 1,353,085,163 states generated (29,292,408 s/min), 140,575,723 distinct states found (2,996,333 ds/min), 42,309,510 states left on queue. Progress(34) at 2024-11-06 15:07:32: 1,381,809,417 states generated (28,724,254 s/min), 143,655,566 distinct states found (3,079,843 ds/min), 43,220,682 states left on queue. Progress(34) at 2024-11-06 15:08:32: 1,411,255,848 states generated (29,446,431 s/min), 146,482,192 distinct states found (2,826,626 ds/min), 43,944,938 states left on queue. Progress(34) at 2024-11-06 15:09:32: 1,440,646,323 states generated (29,390,475 s/min), 149,419,989 distinct states found (2,937,797 ds/min), 44,763,293 states left on queue. Progress(34) at 2024-11-06 15:10:32: 1,470,298,568 states generated (29,652,245 s/min), 152,041,419 distinct states found (2,621,430 ds/min), 45,311,911 states left on queue. 
Progress(35) at 2024-11-06 15:11:32: 1,499,747,712 states generated (29,449,144 s/min), 154,696,867 distinct states found (2,655,448 ds/min), 45,842,895 states left on queue. Progress(35) at 2024-11-06 15:12:32: 1,529,256,993 states generated (29,509,281 s/min), 157,493,365 distinct states found (2,796,498 ds/min), 46,535,472 states left on queue. Progress(35) at 2024-11-06 15:13:32: 1,558,829,306 states generated (29,572,313 s/min), 160,256,575 distinct states found (2,763,210 ds/min), 47,212,471 states left on queue. Progress(35) at 2024-11-06 15:14:32: 1,588,345,878 states generated (29,516,572 s/min), 163,002,602 distinct states found (2,746,027 ds/min), 47,862,117 states left on queue. Progress(35) at 2024-11-06 15:15:32: 1,617,885,675 states generated (29,539,797 s/min), 165,699,121 distinct states found (2,696,519 ds/min), 48,472,896 states left on queue. Progress(35) at 2024-11-06 15:16:32: 1,647,559,965 states generated (29,674,290 s/min), 168,343,286 distinct states found (2,644,165 ds/min), 49,065,377 states left on queue. Progress(35) at 2024-11-06 15:17:32: 1,677,033,250 states generated (29,473,285 s/min), 171,134,409 distinct states found (2,791,123 ds/min), 49,823,330 states left on queue. Progress(35) at 2024-11-06 15:18:32: 1,706,730,266 states generated (29,697,016 s/min), 173,860,974 distinct states found (2,726,565 ds/min), 50,493,221 states left on queue. Error: Invariant LogSafety is violated. 
Error: The behavior up to this point is: State 1: /\ prop_state = ( p1 :> [ term |-> 1, wal |-> <<>>, state |-> "campaign", votes |-> <<>>, termHistory |-> <<>>, nextSendLsn |-> <<>> ] @@ p2 :> [ term |-> 1, wal |-> <<>>, state |-> "campaign", votes |-> <<>>, termHistory |-> <<>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ a2 :> [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ a3 :> [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 2: /\ prop_state = ( p1 :> [ term |-> 1, wal |-> <<>>, state |-> "campaign", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] @@ p2 :> [ term |-> 1, wal |-> <<>>, state |-> "campaign", votes |-> <<>>, termHistory |-> <<>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ a2 :> [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ a3 :> [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 3: /\ prop_state = ( p1 :> [ term |-> 1, wal |-> <<>>, state |-> "campaign", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] @@ p2 :> [ term |-> 2, wal |-> <<>>, state |-> "campaign", votes |-> <<>>, termHistory |-> <<>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ a2 :> [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ a3 :> [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 4: /\ prop_state = ( p1 :> [ term |-> 1, wal |-> <<>>, state |-> "campaign", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a2 :> [ termHistory |-> 
<<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] @@ p2 :> [ term |-> 2, wal |-> <<>>, state |-> "campaign", votes |-> <<>>, termHistory |-> <<>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ a2 :> [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ a3 :> [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 5: /\ prop_state = ( p1 :> [ term |-> 1, wal |-> <<>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, nextSendLsn |-> <<>> ] @@ p2 :> [ term |-> 2, wal |-> <<>>, state |-> "campaign", votes |-> <<>>, termHistory |-> <<>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ a2 :> [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ a3 :> [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 6: /\ prop_state = ( p1 :> [ term |-> 1, wal |-> <<>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, nextSendLsn |-> <<>> ] @@ p2 :> [ term |-> 2, wal |-> <<>>, state |-> "campaign", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ a2 :> [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ a3 :> [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 
1]>>] ) /\ committed = {} State 7: /\ prop_state = ( p1 :> [ term |-> 1, wal |-> <<>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, nextSendLsn |-> (a2 :> 1) ] @@ p2 :> [ term |-> 2, wal |-> <<>>, state |-> "campaign", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ a2 :> [ term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ a3 :> [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 8: /\ prop_state = ( p1 :> [ term |-> 1, wal |-> <<>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, nextSendLsn |-> (a2 :> 1) ] @@ p2 :> [ term |-> 2, wal |-> <<>>, state |-> "campaign", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ a2 :> [ term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ a3 :> [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 9: /\ prop_state = ( p1 :> [ term |-> 1, wal |-> <<>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] 
), termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, nextSendLsn |-> (a2 :> 1) ] @@ p2 :> [ term |-> 2, wal |-> <<>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ a2 :> [ term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ a3 :> [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 10: /\ prop_state = ( p1 :> [ term |-> 1, wal |-> <<>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, nextSendLsn |-> (a2 :> 1) ] @@ p2 :> [ term |-> 2, wal |-> <<>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, nextSendLsn |-> (a1 :> 1) ] ) /\ acc_state = ( a1 :> [ term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ a3 :> [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 11: /\ prop_state = ( p1 :> [ term |-> 1, wal |-> <<>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn 
|-> 1]>>, nextSendLsn |-> (a2 :> 1) ] @@ p2 :> [ term |-> 3, wal |-> <<>>, state |-> "campaign", votes |-> <<>>, termHistory |-> <<>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [ term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ a3 :> [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 12: /\ prop_state = ( p1 :> [ term |-> 1, wal |-> <<>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, nextSendLsn |-> (a2 :> 1) ] @@ p2 :> [ term |-> 3, wal |-> <<>>, state |-> "campaign", votes |-> ( a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [ term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ a3 :> [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 13: /\ prop_state = ( p1 :> [ term |-> 1, wal |-> <<1>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, nextSendLsn |-> (a2 :> 1) ] @@ p2 :> [ term |-> 3, wal |-> <<>>, state |-> "campaign", votes |-> ( a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [ term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 
1]>> ] @@ a2 :> [ term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ a3 :> [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 14: /\ prop_state = ( p1 :> [ term |-> 1, wal |-> <<1, 1>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, nextSendLsn |-> (a2 :> 1) ] @@ p2 :> [ term |-> 3, wal |-> <<>>, state |-> "campaign", votes |-> ( a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [ term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ a3 :> [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 15: /\ prop_state = ( p1 :> [ term |-> 1, wal |-> <<1, 1>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, nextSendLsn |-> (a2 :> 2) ] @@ p2 :> [ term |-> 3, wal |-> <<>>, state |-> "campaign", votes |-> ( a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [ term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 1, wal |-> <<1>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ a3 :> [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 16: /\ prop_state = ( p1 :> [ term |-> 1, wal 
|-> <<1, 1>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, nextSendLsn |-> (a2 :> 3) ] @@ p2 :> [ term |-> 3, wal |-> <<>>, state |-> "campaign", votes |-> ( a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [ term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 1, wal |-> <<1, 1>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ a3 :> [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 17: /\ prop_state = ( p1 :> [ term |-> 4, wal |-> <<>>, state |-> "campaign", votes |-> <<>>, termHistory |-> <<>>, nextSendLsn |-> <<>> ] @@ p2 :> [ term |-> 3, wal |-> <<>>, state |-> "campaign", votes |-> ( a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [ term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 1, wal |-> <<1, 1>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ a3 :> [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 18: /\ prop_state = ( p1 :> [ term |-> 4, wal |-> <<>>, state |-> "campaign", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] @@ p2 :> [ term |-> 3, wal |-> <<>>, state |-> "campaign", votes |-> ( a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [ term |-> 4, wal |-> <<>>, termHistory |-> <<[term |-> 0, 
lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 1, wal |-> <<1, 1>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ a3 :> [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 19: /\ prop_state = ( p1 :> [ term |-> 4, wal |-> <<>>, state |-> "campaign", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] @@ p2 :> [ term |-> 3, wal |-> <<>>, state |-> "campaign", votes |-> ( a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 3 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [ term |-> 4, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 3, wal |-> <<1, 1>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ a3 :> [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 20: /\ prop_state = ( p1 :> [ term |-> 4, wal |-> <<>>, state |-> "campaign", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] @@ p2 :> [ term |-> 3, wal |-> <<1, 1>>, state |-> "leader", votes |-> ( a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 3 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >>, nextSendLsn |-> <<>> ] ) /\ acc_state = ( a1 :> [ term |-> 4, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 3, wal |-> <<1, 1>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ a3 :> [term |-> 3, wal |-> <<>>, 
termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 21: /\ prop_state = ( p1 :> [ term |-> 4, wal |-> <<>>, state |-> "campaign", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] @@ p2 :> [ term |-> 3, wal |-> <<1, 1>>, state |-> "leader", votes |-> ( a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 3 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >>, nextSendLsn |-> (a2 :> 3) ] ) /\ acc_state = ( a1 :> [ term |-> 4, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 3, wal |-> <<1, 1>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >> ] @@ a3 :> [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) /\ committed = {} State 22: /\ prop_state = ( p1 :> [ term |-> 4, wal |-> <<>>, state |-> "campaign", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] @@ p2 :> [ term |-> 3, wal |-> <<1, 1>>, state |-> "leader", votes |-> ( a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 3 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >>, nextSendLsn |-> (a2 :> 3 @@ a3 :> 1) ] ) /\ acc_state = ( a1 :> [ term |-> 4, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 3, wal |-> <<1, 1>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >> ] @@ a3 :> [ term |-> 3, wal |-> <<>>, termHistory |-> << [term |-> 0, lsn |-> 1], 
[term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >> ] ) /\ committed = {} State 23: /\ prop_state = ( p1 :> [ term |-> 4, wal |-> <<>>, state |-> "campaign", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] @@ p2 :> [ term |-> 3, wal |-> <<1, 1>>, state |-> "leader", votes |-> ( a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 3 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >>, nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) /\ acc_state = ( a1 :> [ term |-> 4, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 3, wal |-> <<1, 1>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >> ] @@ a3 :> [ term |-> 3, wal |-> <<1>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >> ] ) /\ committed = {} State 24: /\ prop_state = ( p1 :> [ term |-> 4, wal |-> <<>>, state |-> "campaign", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] @@ p2 :> [ term |-> 3, wal |-> <<1, 1>>, state |-> "leader", votes |-> ( a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 3 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >>, nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) /\ acc_state = ( a1 :> [ term |-> 4, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 3, wal |-> <<1, 1>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >> ] @@ a3 
:> [ term |-> 3, wal |-> <<1>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >> ] ) /\ committed = {[term |-> 1, lsn |-> 1]} State 25: /\ prop_state = ( p1 :> [ term |-> 4, wal |-> <<>>, state |-> "campaign", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 2 ] ), termHistory |-> <<>>, nextSendLsn |-> <<>> ] @@ p2 :> [ term |-> 3, wal |-> <<1, 1>>, state |-> "leader", votes |-> ( a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 3 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >>, nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) /\ acc_state = ( a1 :> [ term |-> 4, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 3, wal |-> <<1, 1>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >> ] @@ a3 :> [ term |-> 4, wal |-> <<1>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >> ] ) /\ committed = {[term |-> 1, lsn |-> 1]} State 26: /\ prop_state = ( p1 :> [ term |-> 4, wal |-> <<>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 2 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1], [term |-> 4, lsn |-> 1] >>, nextSendLsn |-> <<>> ] @@ p2 :> [ term |-> 3, wal |-> <<1, 1>>, state |-> "leader", votes |-> ( a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 3 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory 
|-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >>, nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) /\ acc_state = ( a1 :> [ term |-> 4, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 3, wal |-> <<1, 1>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >> ] @@ a3 :> [ term |-> 4, wal |-> <<1>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >> ] ) /\ committed = {[term |-> 1, lsn |-> 1]} State 27: /\ prop_state = ( p1 :> [ term |-> 4, wal |-> <<>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 2 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1], [term |-> 4, lsn |-> 1] >>, nextSendLsn |-> (a3 :> 1) ] @@ p2 :> [ term |-> 3, wal |-> <<1, 1>>, state |-> "leader", votes |-> ( a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 3 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >>, nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) /\ acc_state = ( a1 :> [ term |-> 4, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 3, wal |-> <<1, 1>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >> ] @@ a3 :> [ term |-> 4, wal |-> <<>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1], [term |-> 4, lsn |-> 1] >> ] ) /\ committed = {[term |-> 1, lsn |-> 1]} State 28: /\ prop_state = ( p1 :> [ term |-> 4, wal |-> <<4>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a3 
:> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 2 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1], [term |-> 4, lsn |-> 1] >>, nextSendLsn |-> (a3 :> 1) ] @@ p2 :> [ term |-> 3, wal |-> <<1, 1>>, state |-> "leader", votes |-> ( a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 3 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >>, nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) /\ acc_state = ( a1 :> [ term |-> 4, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 3, wal |-> <<1, 1>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >> ] @@ a3 :> [ term |-> 4, wal |-> <<>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1], [term |-> 4, lsn |-> 1] >> ] ) /\ committed = {[term |-> 1, lsn |-> 1]} State 29: /\ prop_state = ( p1 :> [ term |-> 4, wal |-> <<4>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 2 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1], [term |-> 4, lsn |-> 1] >>, nextSendLsn |-> (a3 :> 2) ] @@ p2 :> [ term |-> 3, wal |-> <<1, 1>>, state |-> "leader", votes |-> ( a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 3 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >>, nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) /\ acc_state = ( a1 :> [ term |-> 4, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ a2 :> [ term |-> 3, wal |-> <<1, 
1>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >> ] @@ a3 :> [ term |-> 4, wal |-> <<4>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1], [term |-> 4, lsn |-> 1] >> ] ) /\ committed = {[term |-> 1, lsn |-> 1]} State 30: /\ prop_state = ( p1 :> [ term |-> 4, wal |-> <<4>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 2 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1], [term |-> 4, lsn |-> 1] >>, nextSendLsn |-> (a1 :> 1 @@ a3 :> 2) ] @@ p2 :> [ term |-> 3, wal |-> <<1, 1>>, state |-> "leader", votes |-> ( a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 3 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >>, nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) /\ acc_state = ( a1 :> [ term |-> 4, wal |-> <<>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1], [term |-> 4, lsn |-> 1] >> ] @@ a2 :> [ term |-> 3, wal |-> <<1, 1>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >> ] @@ a3 :> [ term |-> 4, wal |-> <<4>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1], [term |-> 4, lsn |-> 1] >> ] ) /\ committed = {[term |-> 1, lsn |-> 1]} State 31: /\ prop_state = ( p1 :> [ term |-> 4, wal |-> <<4>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 2 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1], [term |-> 4, lsn |-> 1] >>, nextSendLsn |-> (a1 :> 2 @@ a3 :> 2) ] @@ p2 :> [ 
term |-> 3, wal |-> <<1, 1>>, state |-> "leader", votes |-> ( a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 3 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >>, nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) /\ acc_state = ( a1 :> [ term |-> 4, wal |-> <<4>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1], [term |-> 4, lsn |-> 1] >> ] @@ a2 :> [ term |-> 3, wal |-> <<1, 1>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >> ] @@ a3 :> [ term |-> 4, wal |-> <<4>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1], [term |-> 4, lsn |-> 1] >> ] ) /\ committed = {[term |-> 1, lsn |-> 1]} State 32: /\ prop_state = ( p1 :> [ term |-> 4, wal |-> <<4>>, state |-> "leader", votes |-> ( a1 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, flushLsn |-> 1 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 2 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1], [term |-> 4, lsn |-> 1] >>, nextSendLsn |-> (a1 :> 2 @@ a3 :> 2) ] @@ p2 :> [ term |-> 3, wal |-> <<1, 1>>, state |-> "leader", votes |-> ( a2 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, flushLsn |-> 3 ] @@ a3 :> [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, flushLsn |-> 1 ] ), termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >>, nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) /\ acc_state = ( a1 :> [ term |-> 4, wal |-> <<4>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1], [term |-> 4, lsn |-> 1] >> ] @@ a2 :> [ term |-> 3, wal |-> <<1, 1>>, termHistory |-> << [term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1], [term |-> 3, lsn |-> 3] >> ] @@ a3 :> [ term |-> 4, wal |-> <<4>>, termHistory |-> << [term 
|-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1], [term |-> 4, lsn |-> 1] >> ] ) /\ committed = {[term |-> 1, lsn |-> 1], [term |-> 4, lsn |-> 1]} 1712918117 states generated, 174460942 distinct states found, 50658619 states left on queue. The depth of the complete state graph search is 35. Finished in 58min 19s at (2024-11-06 15:18:45) Trace exploration spec path: ./MCProposerAcceptorStatic_TTrace_1730902825.tla ================================================ FILE: safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log ================================================ [File too large to display: 220.8 KB] ================================================ FILE: safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log ================================================ [File too large to display: 7.7 KB] ================================================ FILE: safekeeper/src/auth.rs ================================================ [File too large to display: 1.2 KB] ================================================ FILE: safekeeper/src/bin/safekeeper.rs ================================================ // // Main entry point for the safekeeper executable // use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant}; use anyhow::{Context, Result, bail}; use camino::{Utf8Path, Utf8PathBuf}; use clap::{ArgAction, Parser}; use futures::future::BoxFuture; use futures::stream::FuturesUnordered; use futures::{FutureExt, StreamExt}; use http_utils::tls_certs::ReloadingCertificateResolver; use metrics::set_build_info_metric; use remote_storage::RemoteStorageConfig; use safekeeper::defaults::{ DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_GLOBAL_DISK_CHECK_INTERVAL, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, 
DEFAULT_MAX_GLOBAL_DISK_USAGE_RATIO, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_MAX_REELECT_OFFLOADER_LAG_BYTES, DEFAULT_MAX_TIMELINE_DISK_USAGE_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE, DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE, }; use safekeeper::hadron; use safekeeper::wal_backup::WalBackup; use safekeeper::{ BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf, WAL_SERVICE_RUNTIME, broker, control_file, http, wal_service, }; use sd_notify::NotifyState; use storage_broker::{DEFAULT_ENDPOINT, Uri}; use tokio::runtime::Handle; use tokio::signal::unix::{SignalKind, signal}; use tokio::task::JoinError; use tracing::*; use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; use utils::id::NodeId; use utils::logging::{self, LogFormat, SecretString}; use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR}; use utils::sentry_init::init_sentry; use utils::{pid_file, project_build_tag, project_git_version, tcp_listener}; use safekeeper::hadron::{ GLOBAL_DISK_LIMIT_EXCEEDED, get_filesystem_capacity, get_filesystem_usage, }; use safekeeper::metrics::GLOBAL_DISK_UTIL_CHECK_SECONDS; use std::sync::atomic::Ordering; #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; /// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). /// This adds roughly 3% overhead for allocations on average, which is acceptable considering /// performance-sensitive code will avoid allocations as far as possible anyway. 
#[allow(non_upper_case_globals)] #[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); const FEATURES: &[&str] = &[ #[cfg(feature = "testing")] "testing", ]; fn version() -> String { format!( "{GIT_VERSION} failpoints: {}, features: {:?}", fail::has_failpoints(), FEATURES, ) } const ABOUT: &str = r#" A fleet of safekeepers is responsible for reliably storing WAL received from compute, passing it through consensus (mitigating potential computes brain split), and serving the hardened part further downstream to pageserver(s). "#; #[derive(Parser)] #[command(name = "Neon safekeeper", version = GIT_VERSION, about = ABOUT, long_about = None)] struct Args { /// Path to the safekeeper data directory. #[arg(short = 'D', long, default_value = "./")] datadir: Utf8PathBuf, /// Safekeeper node id. #[arg(long)] id: Option, /// Initialize safekeeper with given id and exit. #[arg(long)] init: bool, /// Listen endpoint for receiving/sending WAL in the form host:port. #[arg(short, long, default_value = DEFAULT_PG_LISTEN_ADDR)] listen_pg: String, /// Listen endpoint for receiving/sending WAL in the form host:port allowing /// only tenant scoped auth tokens. Pointless if auth is disabled. #[arg(long, default_value = None, verbatim_doc_comment)] listen_pg_tenant_only: Option, /// Listen http endpoint for management and metrics in the form host:port. #[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)] listen_http: String, /// Listen https endpoint for management and metrics in the form host:port. #[arg(long, default_value = None)] listen_https: Option, /// Advertised endpoint for receiving/sending WAL in the form host:port. If not /// specified, listen_pg is used to advertise instead. 
#[arg(long, default_value = None)] advertise_pg: Option, /// Availability zone of the safekeeper. #[arg(long)] availability_zone: Option, /// Do not wait for changes to be written safely to disk. Unsafe. #[arg(short, long)] no_sync: bool, /// Dump control file at path specified by this argument and exit. #[arg(long)] dump_control_file: Option, /// Broker endpoint for storage nodes coordination in the form /// http[s]://host:port. In case of https scheme, a TLS connection is /// established; plaintext otherwise. #[arg(long, default_value = DEFAULT_ENDPOINT, verbatim_doc_comment)] broker_endpoint: Uri, /// Broker keepalive interval. #[arg(long, value_parser= humantime::parse_duration, default_value = storage_broker::DEFAULT_KEEPALIVE_INTERVAL)] broker_keepalive_interval: Duration, /// Peer safekeeper is considered dead after not receiving heartbeats from /// it during this period passed as a human readable duration. #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT, verbatim_doc_comment)] heartbeat_timeout: Duration, /// Enable/disable peer recovery. #[arg(long, default_value = "false", action=ArgAction::Set)] peer_recovery: bool, /// Remote storage configuration for WAL backup (offloading to s3) as TOML /// inline table, e.g. /// {max_concurrent_syncs = 17, max_sync_errors = 13, bucket_name = "", bucket_region = "", concurrency_limit = 119} /// Safekeeper offloads WAL to /// [prefix_in_bucket/]//, mirroring /// structure on the file system.
#[arg(long, value_parser = parse_remote_storage, verbatim_doc_comment)] remote_storage: Option, /// Safekeeper won't be elected for WAL offloading if it is lagging for more than this value in bytes #[arg(long, default_value_t = DEFAULT_MAX_OFFLOADER_LAG_BYTES)] max_offloader_lag: u64, /* BEGIN_HADRON */ /// Safekeeper will re-elect a new offloader if the current backup lagging for more than this value in bytes #[arg(long, default_value_t = DEFAULT_MAX_REELECT_OFFLOADER_LAG_BYTES)] max_reelect_offloader_lag_bytes: u64, /// Safekeeper will stop accepting new WALs if the timeline disk usage exceeds this value in bytes. /// Setting this value to 0 disables the limit. #[arg(long, default_value_t = DEFAULT_MAX_TIMELINE_DISK_USAGE_BYTES)] max_timeline_disk_usage_bytes: u64, /* END_HADRON */ /// Number of max parallel WAL segments to be offloaded to remote storage. #[arg(long, default_value = "5")] wal_backup_parallel_jobs: usize, /// Disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring /// WAL backup horizon. #[arg(long)] disable_wal_backup: bool, /// If given, enables auth on incoming connections to WAL service endpoint /// (--listen-pg). Value specifies path to a .pem public key used for /// validations of JWT tokens. Empty string is allowed and means disabling /// auth. #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)] pg_auth_public_key_path: Option, /// If given, enables auth on incoming connections to tenant only WAL /// service endpoint (--listen-pg-tenant-only). Value specifies path to a /// .pem public key used for validations of JWT tokens. Empty string is /// allowed and means disabling auth. #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)] pg_tenant_only_auth_public_key_path: Option, /// If given, enables auth on incoming connections to http management /// service endpoint (--listen-http). Value specifies path to a .pem public /// key used for validations of JWT tokens. 
Empty string is allowed and /// means disabling auth. #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)] http_auth_public_key_path: Option, /// Format for logging, either 'plain' or 'json'. #[arg(long, default_value = "plain")] log_format: String, /// Run everything in single threaded current thread runtime, might be /// useful for debugging. #[arg(long)] current_thread_runtime: bool, /// Keep horizon for walsenders, i.e. don't remove WAL segments that are /// still needed for existing replication connection. #[arg(long)] walsenders_keep_horizon: bool, /// Controls how long backup will wait until uploading the partial segment. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)] partial_backup_timeout: Duration, /// Disable task to push messages to broker every second. Supposed to /// be used in tests. #[arg(long)] disable_periodic_broker_push: bool, /// Enable automatic switching to offloaded state. #[arg(long)] enable_offload: bool, /// Delete local WAL files after offloading. When disabled, they will be left on disk. #[arg(long)] delete_offloaded_wal: bool, /// Pending updates to control file will be automatically saved after this interval. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_CONTROL_FILE_SAVE_INTERVAL)] control_file_save_interval: Duration, /// Number of allowed concurrent uploads of partial segments to remote storage. #[arg(long, default_value = DEFAULT_PARTIAL_BACKUP_CONCURRENCY)] partial_backup_concurrency: usize, /// How long a timeline must be resident before it is eligible for eviction. /// Usually, timeline eviction has to wait for `partial_backup_timeout` before being eligible for eviction, /// but if a timeline is un-evicted and then _not_ written to, it would immediately flap to evicting again, /// if it weren't for `eviction_min_resident` preventing that. /// /// Also defines interval for eviction retries. 
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)] eviction_min_resident: Duration, /// Enable fanning out WAL to different shards from the same reader #[arg(long)] wal_reader_fanout: bool, /// Only fan out the WAL reader if the absoulte delta between the new requested position /// and the current position of the reader is smaller than this value. #[arg(long)] max_delta_for_fanout: Option, /// Path to a file with certificate's private key for https API. #[arg(long, default_value = DEFAULT_SSL_KEY_FILE)] ssl_key_file: Utf8PathBuf, /// Path to a file with a X509 certificate for https API. #[arg(long, default_value = DEFAULT_SSL_CERT_FILE)] ssl_cert_file: Utf8PathBuf, /// Period to reload certificate and private key from files. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_SSL_CERT_RELOAD_PERIOD)] ssl_cert_reload_period: Duration, /// Trusted root CA certificates to use in https APIs. #[arg(long)] ssl_ca_file: Option, /// Flag to use https for requests to peer's safekeeper API. #[arg(long)] use_https_safekeeper_api: bool, /// Path to the JWT auth token used to authenticate with other safekeepers. #[arg(long)] auth_token_path: Option, /// Enable TLS in WAL service API. /// Does not force TLS: the client negotiates TLS usage during the handshake. /// Uses key and certificate from ssl_key_file/ssl_cert_file. #[arg(long)] enable_tls_wal_service_api: bool, /// Controls whether to collect all metrics on each scrape or to return potentially stale /// results. 
#[arg(long, default_value_t = true)]
    force_metric_collection_on_scrape: bool,
    /// Run in development mode (disables security checks)
    #[arg(long, help = "Run in development mode (disables security checks)")]
    dev: bool,
    /* BEGIN_HADRON */
    #[arg(long)]
    enable_pull_timeline_on_startup: bool,
    /// How often to scan entire data-dir for total disk usage
    #[arg(long, value_parser=humantime::parse_duration, default_value = DEFAULT_GLOBAL_DISK_CHECK_INTERVAL)]
    global_disk_check_interval: Duration,
    /// The portion of the filesystem capacity that can be used by all timelines.
    /// A circuit breaker will trip and reject all WAL writes if the total usage
    /// exceeds this ratio.
    /// Set to 0 to disable the global disk usage limit.
    #[arg(long, default_value_t = DEFAULT_MAX_GLOBAL_DISK_USAGE_RATIO)]
    max_global_disk_usage_ratio: f64,
    /* END_HADRON */
}

// Like PathBufValueParser, but allows empty string.
//
// The empty string is mapped to `None` later in `main`; here it is simply
// accepted and turned into an empty path.
fn opt_pathbuf_parser(s: &str) -> Result<Utf8PathBuf, String> {
    Ok(Utf8PathBuf::from_str(s).unwrap())
}

/// Process entry point.
///
/// Parses the command line, handles the one-shot modes (`dump_control_file`,
/// `init`), initializes logging, claims the pid file in the data directory,
/// loads the auth keys / tokens / CA certificates referenced by the arguments,
/// assembles the `SafeKeeperConf` and hands control to `start_safekeeper`.
#[tokio::main(flavor = "current_thread")]
async fn main() -> anyhow::Result<()> {
    // We want to allow multiple occurrences of the same arg (taking the last) so
    // that neon_local could generate command with defaults + overrides without
    // getting 'argument cannot be used multiple times' error. This seems to be
    // impossible with pure Derive API, so convert struct to Command, modify it,
    // parse arguments, and then fill the struct back.
    let cmd = <Args as clap::CommandFactory>::command()
        .args_override_self(true)
        .version(version());
    let mut matches = cmd.get_matches();
    let mut args = <Args as clap::FromArgMatches>::from_arg_matches_mut(&mut matches)?;

    // I failed to modify opt_pathbuf_parser to return an Option in
    // reasonable time, so turn empty string into option post factum.
    for key_path in [
        &mut args.pg_auth_public_key_path,
        &mut args.pg_tenant_only_auth_public_key_path,
        &mut args.http_auth_public_key_path,
    ] {
        if key_path
            .as_ref()
            .is_some_and(|pb| pb.as_os_str().is_empty())
        {
            *key_path = None;
        }
    }

    // One-shot mode: print the control file as JSON and exit. Note that this
    // happens before logging is initialized.
    if let Some(addr) = args.dump_control_file {
        let state = control_file::FileStorage::load_control_file(addr)?;
        let json = serde_json::to_string(&state)?;
        print!("{json}");
        return Ok(());
    }

    // important to keep the order of:
    // 1. init logging
    // 2. tracing panic hook
    // 3. sentry
    logging::init(
        LogFormat::from_config(&args.log_format)?,
        logging::TracingErrorLayerEnablement::Disabled,
        logging::Output::Stdout,
    )?;
    logging::replace_panic_hook_with_tracing_panic_hook().forget();
    info!("version: {GIT_VERSION}");
    info!("buld_tag: {BUILD_TAG}");

    let args_workdir = &args.datadir;
    let workdir = args_workdir.canonicalize_utf8().with_context(|| {
        format!("Failed to get the absolute path for input workdir {args_workdir:?}")
    })?;

    // Change into the data directory.
    std::env::set_current_dir(&workdir)?;

    // Prevent running multiple safekeepers on the same directory
    let lock_file_path = workdir.join(PID_FILE_NAME);
    let lock_file =
        pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
    info!("claimed pid file at {lock_file_path:?}");
    // ensure that the lock file is held even if the main thread of the process panics
    // we need to release the lock file only when the current process is gone
    std::mem::forget(lock_file);

    // Set or read our ID.
    let id = set_id(&workdir, args.id.map(NodeId))?;
    // One-shot mode: the id file has been written (or validated) above, nothing
    // else to do.
    if args.init {
        return Ok(());
    }

    let pg_auth = match args.pg_auth_public_key_path.as_ref() {
        None => {
            info!("pg auth is disabled");
            None
        }
        Some(path) => {
            info!("loading pg auth JWT key from {path}");
            Some(Arc::new(
                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
            ))
        }
    };
    let pg_tenant_only_auth = match args.pg_tenant_only_auth_public_key_path.as_ref() {
        None => {
            info!("pg tenant only auth is disabled");
            None
        }
        Some(path) => {
            info!("loading pg tenant only auth JWT key from {path}");
            Some(Arc::new(
                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
            ))
        }
    };
    let http_auth = match args.http_auth_public_key_path.as_ref() {
        None => {
            info!("http auth is disabled");
            None
        }
        Some(path) => {
            info!("loading http auth JWT key(s) from {path}");
            let jwt_auth = JwtAuth::from_key_path(path).context("failed to load the auth key")?;
            Some(Arc::new(SwappableJwtAuth::new(jwt_auth)))
        }
    };

    // Load JWT auth token to connect to other safekeepers for pull_timeline.
    let sk_auth_token = if let Some(auth_token_path) = args.auth_token_path.as_ref() {
        info!("loading JWT token for authentication with safekeepers from {auth_token_path}");
        let auth_token = tokio::fs::read_to_string(auth_token_path).await?;
        // The token file may end with a newline; only the trimmed content is the token.
        Some(SecretString::from(auth_token.trim().to_owned()))
    } else {
        info!("no JWT token for authentication with safekeepers detected");
        None
    };

    // Keep only the CERTIFICATE entries of the CA bundle; other PEM blocks in
    // the file are ignored.
    let ssl_ca_certs = match args.ssl_ca_file.as_ref() {
        Some(ssl_ca_file) => {
            tracing::info!("Using ssl root CA file: {ssl_ca_file:?}");
            let buf = tokio::fs::read(ssl_ca_file).await?;
            pem::parse_many(&buf)?
                .into_iter()
                .filter(|pem| pem.tag() == "CERTIFICATE")
                .collect()
        }
        None => Vec::new(),
    };

    let conf = Arc::new(SafeKeeperConf {
        workdir,
        my_id: id,
        listen_pg_addr: args.listen_pg,
        listen_pg_addr_tenant_only: args.listen_pg_tenant_only,
        listen_http_addr: args.listen_http,
        listen_https_addr: args.listen_https,
        advertise_pg_addr: args.advertise_pg,
        availability_zone: args.availability_zone,
        no_sync: args.no_sync,
        broker_endpoint: args.broker_endpoint,
        broker_keepalive_interval: args.broker_keepalive_interval,
        heartbeat_timeout: args.heartbeat_timeout,
        peer_recovery_enabled: args.peer_recovery,
        remote_storage: args.remote_storage,
        max_offloader_lag_bytes: args.max_offloader_lag,
        /* BEGIN_HADRON */
        max_reelect_offloader_lag_bytes: args.max_reelect_offloader_lag_bytes,
        max_timeline_disk_usage_bytes: args.max_timeline_disk_usage_bytes,
        /* END_HADRON */
        wal_backup_enabled: !args.disable_wal_backup,
        backup_parallel_jobs: args.wal_backup_parallel_jobs,
        pg_auth,
        pg_tenant_only_auth,
        http_auth,
        sk_auth_token,
        current_thread_runtime: args.current_thread_runtime,
        walsenders_keep_horizon: args.walsenders_keep_horizon,
        partial_backup_timeout: args.partial_backup_timeout,
        disable_periodic_broker_push: args.disable_periodic_broker_push,
        enable_offload: args.enable_offload,
        delete_offloaded_wal: args.delete_offloaded_wal,
        control_file_save_interval: args.control_file_save_interval,
        partial_backup_concurrency: args.partial_backup_concurrency,
        eviction_min_resident: args.eviction_min_resident,
        wal_reader_fanout: args.wal_reader_fanout,
        max_delta_for_fanout: args.max_delta_for_fanout,
        ssl_key_file: args.ssl_key_file,
        ssl_cert_file: args.ssl_cert_file,
        ssl_cert_reload_period: args.ssl_cert_reload_period,
        ssl_ca_certs,
        use_https_safekeeper_api: args.use_https_safekeeper_api,
        enable_tls_wal_service_api: args.enable_tls_wal_service_api,
        force_metric_collection_on_scrape: args.force_metric_collection_on_scrape,
        /* BEGIN_HADRON */
        advertise_pg_addr_tenant_only: None,
        enable_pull_timeline_on_startup: args.enable_pull_timeline_on_startup,
        hcc_base_url: None,
        global_disk_check_interval: args.global_disk_check_interval,
        max_global_disk_usage_ratio: args.max_global_disk_usage_ratio,
        /* END_HADRON */
    });

    // initialize sentry if SENTRY_DSN is provided
    let _sentry_guard = init_sentry(
        Some(GIT_VERSION.into()),
        &[("node_id", &conf.my_id.to_string())],
    );
    start_safekeeper(conf).await
}

/// Result of joining any of main tasks: upper error means task failed to
/// complete, e.g. panicked, inner is error produced by task itself.
type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;

/// Bring up all long-running safekeeper tasks and wait for termination.
///
/// Binds the listeners, loads timelines, spawns the WAL service(s), the HTTP(S)
/// API, the broker client and the auxiliary background tasks, then blocks until
/// either one of those tasks finishes (the process exits with status 1) or a
/// SIGQUIT / SIGINT / SIGTERM arrives (the process exits with status 0). On
/// those paths the function does not return; it returns `Err` only if setup fails.
async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
    // fsync the datadir to make sure we have a consistent state on disk.
    if !conf.no_sync {
        let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
        let started = Instant::now();
        utils::crashsafe::syncfs(dfd)?;
        let elapsed = started.elapsed();
        info!(
            elapsed_ms = elapsed.as_millis(),
            "syncfs data directory done"
        );
    }

    info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
    let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
        error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
        e
    })?;

    let pg_listener_tenant_only =
        if let Some(listen_pg_addr_tenant_only) = &conf.listen_pg_addr_tenant_only {
            info!(
                "starting safekeeper tenant scoped WAL service on {}",
                listen_pg_addr_tenant_only
            );
            let listener = tcp_listener::bind(listen_pg_addr_tenant_only.clone()).map_err(|e| {
                error!(
                    "failed to bind to address {}: {}",
                    listen_pg_addr_tenant_only, e
                );
                e
            })?;
            Some(listener)
        } else {
            None
        };

    info!(
        "starting safekeeper HTTP service on {}",
        conf.listen_http_addr
    );
    let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
        error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
        e
    })?;

    let https_listener = match conf.listen_https_addr.as_ref() {
        Some(listen_https_addr) => {
            info!("starting safekeeper HTTPS service on {}", listen_https_addr);
            Some(tcp_listener::bind(listen_https_addr).map_err(|e| {
                error!("failed to bind to address {}: {}", listen_https_addr, e);
                e
            })?)
        }
        None => None,
    };

    let wal_backup = Arc::new(WalBackup::new(&conf).await?);
    let global_timelines = Arc::new(GlobalTimelines::new(conf.clone(), wal_backup.clone()));

    // Register metrics collector for active timelines. It's important to do this
    // after daemonizing, otherwise process collector will be upset.
    let timeline_collector = safekeeper::metrics::TimelineCollector::new(global_timelines.clone());
    metrics::register_internal(Box::new(timeline_collector))?;

    // Keep handles to main tasks to die if any of them disappears.
    let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
        FuturesUnordered::new();

    // In single-threaded mode every task below is spawned onto the runtime we
    // are currently running in; otherwise each task goes to its dedicated
    // runtime (see the `unwrap_or_else` fallbacks at the spawn sites).
    // NOTE(review): the comment that used to sit here talked about starting the
    // WAL backup launcher before loading timelines; nothing of the sort is
    // started in this function, so it appeared to be stale.
    let current_thread_rt = conf
        .current_thread_runtime
        .then(|| Handle::try_current().expect("no runtime in main"));

    // Load all timelines from disk to memory.
    global_timelines.init().await?;

    /* BEGIN_HADRON */
    // Only attempted on a node that has no local timelines at all.
    if conf.enable_pull_timeline_on_startup && global_timelines.timelines_count() == 0 {
        match hadron::hcc_pull_timelines(&conf, global_timelines.clone()).await {
            Ok(_) => {
                info!("Successfully pulled all timelines from peer safekeepers");
            }
            Err(e) => {
                error!("Failed to pull timelines from peer safekeepers: {:?}", e);
                return Err(e);
            }
        }
    }
    /* END_HADRON */

    // Run everything in current thread rt, if asked.
    if conf.current_thread_runtime {
        info!("running in current thread runtime");
    }

    // A TLS config is needed if either the HTTPS API or TLS on the WAL service
    // is enabled; both share the same key/certificate.
    let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_wal_service_api {
        let ssl_key_file = conf.ssl_key_file.clone();
        let ssl_cert_file = conf.ssl_cert_file.clone();
        let ssl_cert_reload_period = conf.ssl_cert_reload_period;

        // Create resolver in BACKGROUND_RUNTIME, so the background certificate reloading
        // task is run in this runtime.
        let cert_resolver = current_thread_rt
            .as_ref()
            .unwrap_or_else(|| BACKGROUND_RUNTIME.handle())
            .spawn(async move {
                ReloadingCertificateResolver::new(
                    "main",
                    &ssl_key_file,
                    &ssl_cert_file,
                    ssl_cert_reload_period,
                )
                .await
            })
            // First `?` is the join error, second is the resolver's own error.
            .await??;

        let config = rustls::ServerConfig::builder()
            .with_no_client_auth()
            .with_cert_resolver(cert_resolver);

        Some(Arc::new(config))
    } else {
        None
    };

    let wal_service_handle = current_thread_rt
        .as_ref()
        .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
        .spawn(wal_service::task_main(
            conf.clone(),
            pg_listener,
            Scope::SafekeeperData,
            conf.enable_tls_wal_service_api
                .then(|| tls_server_config.clone())
                .flatten(),
            global_timelines.clone(),
        ))
        // wrap with task name for error reporting
        .map(|res| ("WAL service main".to_owned(), res));
    tasks_handles.push(Box::pin(wal_service_handle));

    let global_timelines_ = global_timelines.clone();
    let timeline_housekeeping_handle = current_thread_rt
        .as_ref()
        .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
        .spawn(async move {
            // The same constant is used both as the period of the housekeeping
            // pass and as the TTL passed to it.
            const TOMBSTONE_TTL: Duration = Duration::from_secs(3600 * 24);
            loop {
                tokio::time::sleep(TOMBSTONE_TTL).await;
                global_timelines_.housekeeping(&TOMBSTONE_TTL);
            }
        })
        .map(|res| ("Timeline map housekeeping".to_owned(), res));
    tasks_handles.push(Box::pin(timeline_housekeeping_handle));

    /* BEGIN_HADRON */
    // Spawn global disk usage watcher task, if a global disk usage limit is specified.
    // A ratio of 0 disables the limit, in which case the filesystem is not
    // queried at all and a failing capacity lookup cannot abort startup.
    if conf.max_global_disk_usage_ratio > 0.0 {
        let interval = conf.global_disk_check_interval;
        let data_dir = conf.workdir.clone();
        // Use the safekeeper data directory to compute filesystem capacity. This only runs once on startup, so
        // there is little point to continue if we can't have the proper protections in place.
        let fs_capacity_bytes = get_filesystem_capacity(data_dir.as_std_path())
            .expect("Failed to get filesystem capacity for data directory");
        let limit: u64 = (conf.max_global_disk_usage_ratio * fs_capacity_bytes as f64) as u64;
        if limit > 0 {
            let disk_usage_watch_handle = BACKGROUND_RUNTIME
                .handle()
                .spawn(async move {
                    // Use Tokio interval to preserve fixed cadence between filesystem utilization checks
                    let mut ticker = tokio::time::interval(interval);
                    ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);

                    loop {
                        ticker.tick().await;
                        let data_dir_clone = data_dir.clone();
                        let check_start = Instant::now();

                        // The scan is blocking filesystem work, keep it off the
                        // async worker threads.
                        let check = tokio::task::spawn_blocking(move || {
                            get_filesystem_usage(data_dir_clone.as_std_path())
                        })
                        .await;
                        let usage = match check {
                            Ok(usage) => usage,
                            Err(e) => {
                                // Previously swallowed silently; the fallback
                                // value is unchanged.
                                warn!("global disk usage check did not complete: {:?}", e);
                                0
                            }
                        };

                        let elapsed = check_start.elapsed().as_secs_f64();
                        GLOBAL_DISK_UTIL_CHECK_SECONDS.observe(elapsed);
                        if usage > limit {
                            warn!(
                                "Global disk usage exceeded limit. Usage: {} bytes, limit: {} bytes",
                                usage, limit
                            );
                        }
                        GLOBAL_DISK_LIMIT_EXCEEDED.store(usage > limit, Ordering::Relaxed);
                    }
                })
                .map(|res| ("Global disk usage watcher".to_string(), res));
            tasks_handles.push(Box::pin(disk_usage_watch_handle));
        }
    }
    /* END_HADRON */

    if let Some(pg_listener_tenant_only) = pg_listener_tenant_only {
        let wal_service_handle = current_thread_rt
            .as_ref()
            .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
            .spawn(wal_service::task_main(
                conf.clone(),
                pg_listener_tenant_only,
                Scope::Tenant,
                conf.enable_tls_wal_service_api
                    .then(|| tls_server_config.clone())
                    .flatten(),
                global_timelines.clone(),
            ))
            // wrap with task name for error reporting
            .map(|res| ("WAL service tenant only main".to_owned(), res));
        tasks_handles.push(Box::pin(wal_service_handle));
    }

    let http_handle = current_thread_rt
        .as_ref()
        .unwrap_or_else(|| HTTP_RUNTIME.handle())
        .spawn(http::task_main_http(
            conf.clone(),
            http_listener,
            global_timelines.clone(),
        ))
        .map(|res| ("HTTP service main".to_owned(), res));
    tasks_handles.push(Box::pin(http_handle));

    // This consumes `tls_server_config`, so it has to stay after the WAL
    // service spawns above, which clone it.
    if let Some(https_listener) = https_listener {
        let https_handle = current_thread_rt
            .as_ref()
            .unwrap_or_else(|| HTTP_RUNTIME.handle())
            .spawn(http::task_main_https(
                conf.clone(),
                https_listener,
                tls_server_config.expect("tls_server_config is set earlier if https is enabled"),
                global_timelines.clone(),
            ))
            .map(|res| ("HTTPS service main".to_owned(), res));
        tasks_handles.push(Box::pin(https_handle));
    }

    let broker_task_handle = current_thread_rt
        .as_ref()
        .unwrap_or_else(|| BROKER_RUNTIME.handle())
        .spawn(
            broker::task_main(conf.clone(), global_timelines.clone())
                .instrument(info_span!("broker")),
        )
        .map(|res| ("broker main".to_owned(), res));
    tasks_handles.push(Box::pin(broker_task_handle));

    /* BEGIN_HADRON */
    // NOTE(review): this periodic refresh only runs when
    // `force_metric_collection_on_scrape` is set, i.e. when (per the flag's
    // documentation) metrics are already collected on every scrape. The
    // condition looks inverted -- confirm against the HTTP metrics handler
    // before changing it.
    if conf.force_metric_collection_on_scrape {
        let metrics_handle = current_thread_rt
            .as_ref()
            .unwrap_or_else(|| BACKGROUND_RUNTIME.handle())
            .spawn(async move {
                let mut interval: tokio::time::Interval =
                    tokio::time::interval(METRICS_COLLECTION_INTERVAL);
                loop {
                    interval.tick().await;
                    // Fire and forget: the blocking task's handle is dropped on purpose.
                    tokio::task::spawn_blocking(|| {
                        METRICS_COLLECTOR.run_once(true);
                    });
                }
            })
            // Was labelled "broker main" (copy-paste), which made the exit log
            // below blame the broker for a metrics task failure.
            .map(|res| ("metrics collection main".to_owned(), res));
        tasks_handles.push(Box::pin(metrics_handle));
    }
    /* END_HADRON */

    set_build_info_metric(GIT_VERSION, BUILD_TAG);

    // TODO: update tokio-stream, convert to real async Stream with
    // SignalStream, map it to obtain missing signal name, combine streams into
    // single stream we can easily sit on.
    let mut sigquit_stream = signal(SignalKind::quit())?;
    let mut sigint_stream = signal(SignalKind::interrupt())?;
    let mut sigterm_stream = signal(SignalKind::terminate())?;

    // Notify systemd that we are ready. This is important as currently loading
    // timelines takes significant time (~30s in busy regions).
    if let Err(e) = sd_notify::notify(true, &[NotifyState::Ready]) {
        warn!("systemd notify failed: {:?}", e);
    }

    tokio::select! {
        // None of the main tasks is expected to finish; if one does, bail out.
        Some((task_name, res)) = tasks_handles.next()=> {
            error!("{} task failed: {:?}, exiting", task_name, res);
            std::process::exit(1);
        }
        // On any shutdown signal, log receival and exit. Additionally, handling
        // SIGQUIT prevents coredump.
        _ = sigquit_stream.recv() => info!("received SIGQUIT, terminating"),
        _ = sigint_stream.recv() => info!("received SIGINT, terminating"),
        _ = sigterm_stream.recv() => info!("received SIGTERM, terminating")

    };
    std::process::exit(0);
}

/// Determine safekeeper id.
///
/// If the id file already exists in `workdir`, its content is the id and
/// `given_id`, when provided, must match it. If the file does not exist,
/// `given_id` is required and is persisted (written and fsynced) to the id
/// file. Any other error reading the file is returned as is.
fn set_id(workdir: &Utf8Path, given_id: Option<NodeId>) -> Result<NodeId> {
    let id_file_path = workdir.join(ID_FILE_NAME);

    // If file with ID exists, read it in; otherwise set one passed.
    let my_id = match fs::read(&id_file_path) {
        Ok(id_serialized) => {
            let stored_id = NodeId(
                std::str::from_utf8(&id_serialized)
                    .context("failed to parse safekeeper id")?
                    // Tolerate surrounding whitespace, e.g. a trailing newline
                    // left by a manual edit of the file.
                    .trim()
                    .parse()
                    .context("failed to parse safekeeper id")?,
            );
            if let Some(given_id) = given_id {
                if given_id != stored_id {
                    bail!(
                        "safekeeper already initialized with id {}, can't set {}",
                        stored_id,
                        given_id
                    );
                }
            }
            info!("safekeeper ID {}", stored_id);
            stored_id
        }
        Err(error) if error.kind() == ErrorKind::NotFound => {
            let Some(new_id) = given_id else {
                bail!("safekeeper id is not specified");
            };
            let mut f = File::create(&id_file_path)
                .with_context(|| format!("Failed to create id file at {id_file_path:?}"))?;
            f.write_all(new_id.to_string().as_bytes())?;
            f.sync_all()?;
            info!("initialized safekeeper id {}", new_id);
            new_id
        }
        Err(error) => {
            return Err(error.into());
        }
    };
    Ok(my_id)
}

/// clap value parser for `--remote-storage`: the argument is parsed as TOML and
/// converted with `RemoteStorageConfig::from_toml`.
fn parse_remote_storage(storage_conf: &str) -> anyhow::Result<RemoteStorageConfig> {
    RemoteStorageConfig::from_toml(&storage_conf.parse()?)
}

// Sanity-check the clap definition of `Args` (clap's `debug_assert`).
#[test]
fn verify_cli() {
    use clap::CommandFactory;
    Args::command().debug_assert()
}


================================================
FILE: safekeeper/src/broker.rs
================================================
//! Communication with the broker, providing safekeeper peers and pageserver coordination.
use std::sync::Arc; use std::sync::atomic::AtomicU64; use std::time::{Duration, Instant, UNIX_EPOCH}; use anyhow::{Context, Error, Result, anyhow, bail}; use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; use storage_broker::proto::{ FilterTenantTimelineId, MessageType, SafekeeperDiscoveryResponse, SubscribeByFilterRequest, SubscribeSafekeeperInfoRequest, TypeSubscription, TypedMessage, }; use storage_broker::{Request, parse_proto_ttid}; use tokio::task::JoinHandle; use tokio::time::sleep; use tracing::*; use crate::metrics::{ BROKER_ITERATION_TIMELINES, BROKER_PULLED_UPDATES, BROKER_PUSH_ALL_UPDATES_SECONDS, BROKER_PUSHED_UPDATES, }; use crate::{GlobalTimelines, SafeKeeperConf}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; fn make_tls_config(conf: &SafeKeeperConf) -> storage_broker::ClientTlsConfig { storage_broker::ClientTlsConfig::new().ca_certificates( conf.ssl_ca_certs .iter() .map(pem::encode) .map(storage_broker::Certificate::from_pem), ) } /// Push once in a while data about all active timelines to the broker. async fn push_loop( conf: Arc, global_timelines: Arc, ) -> anyhow::Result<()> { if conf.disable_periodic_broker_push { info!("broker push_loop is disabled, doing nothing..."); futures::future::pending::<()>().await; // sleep forever return Ok(()); } let active_timelines_set = global_timelines.get_global_broker_active_set(); let mut client = storage_broker::connect( conf.broker_endpoint.clone(), conf.broker_keepalive_interval, make_tls_config(&conf), )?; let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); let outbound = async_stream::stream! { loop { // Note: we lock runtime here and in timeline methods as GlobalTimelines // is under plain mutex. That's ok, all this code is not performance // sensitive and there is no risk of deadlock as we don't await while // lock is held. 
let now = Instant::now(); let all_tlis = active_timelines_set.get_all(); let mut n_pushed_tlis = 0; for tli in &all_tlis { let sk_info = tli.get_safekeeper_info(&conf).await; yield sk_info; BROKER_PUSHED_UPDATES.inc(); n_pushed_tlis += 1; } let elapsed = now.elapsed(); BROKER_PUSH_ALL_UPDATES_SECONDS.observe(elapsed.as_secs_f64()); BROKER_ITERATION_TIMELINES.observe(n_pushed_tlis as f64); if elapsed > push_interval / 2 { info!("broker push is too long, pushed {} timeline updates to broker in {:?}", n_pushed_tlis, elapsed); } sleep(push_interval).await; } }; client .publish_safekeeper_info(Request::new(outbound)) .await?; Ok(()) } /// Subscribe and fetch all the interesting data from the broker. #[instrument(name = "broker_pull", skip_all)] async fn pull_loop( conf: Arc, global_timelines: Arc, stats: Arc, ) -> Result<()> { let mut client = storage_broker::connect( conf.broker_endpoint.clone(), conf.broker_keepalive_interval, make_tls_config(&conf), )?; // TODO: subscribe only to local timelines instead of all let request = SubscribeSafekeeperInfoRequest { subscription_key: Some(ProtoSubscriptionKey::All(())), }; let mut stream = client .subscribe_safekeeper_info(request) .await .context("subscribe_safekeper_info request failed")? .into_inner(); let ok_counter = BROKER_PULLED_UPDATES.with_label_values(&["ok"]); let not_found = BROKER_PULLED_UPDATES.with_label_values(&["not_found"]); let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]); while let Some(msg) = stream.message().await? { stats.update_pulled(); let proto_ttid = msg .tenant_timeline_id .as_ref() .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?; let ttid = parse_proto_ttid(proto_ttid)?; if let Ok(tli) = global_timelines.get(ttid) { // Note that we also receive *our own* info. That's // important, as it is used as an indication of live // connection to the broker. 
// note: there are blocking operations below, but it's considered fine for now let res = tli.record_safekeeper_info(msg).await; if res.is_ok() { ok_counter.inc(); } else { err_counter.inc(); } res?; } else { not_found.inc(); } } bail!("end of stream"); } /// Process incoming discover requests. This is done in a separate task to avoid /// interfering with the normal pull/push loops. async fn discover_loop( conf: Arc, global_timelines: Arc, stats: Arc, ) -> Result<()> { let mut client = storage_broker::connect( conf.broker_endpoint.clone(), conf.broker_keepalive_interval, make_tls_config(&conf), )?; let request = SubscribeByFilterRequest { types: vec![TypeSubscription { r#type: MessageType::SafekeeperDiscoveryRequest as i32, }], tenant_timeline_id: Some(FilterTenantTimelineId { enabled: false, tenant_timeline_id: None, }), }; let mut stream = client .subscribe_by_filter(request) .await .context("subscribe_by_filter request failed")? .into_inner(); let discover_counter = BROKER_PULLED_UPDATES.with_label_values(&["discover"]); while let Some(typed_msg) = stream.message().await? 
{ stats.update_pulled(); match typed_msg.r#type() { MessageType::SafekeeperDiscoveryRequest => { let msg = typed_msg .safekeeper_discovery_request .expect("proto type mismatch from broker message"); let proto_ttid = msg .tenant_timeline_id .as_ref() .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?; let ttid = parse_proto_ttid(proto_ttid)?; if let Ok(tli) = global_timelines.get(ttid) { // we received a discovery request for a timeline we know about discover_counter.inc(); // create and reply with discovery response let sk_info = tli.get_safekeeper_info(&conf).await; let response = SafekeeperDiscoveryResponse { safekeeper_id: sk_info.safekeeper_id, tenant_timeline_id: sk_info.tenant_timeline_id, commit_lsn: sk_info.commit_lsn, safekeeper_connstr: sk_info.safekeeper_connstr, availability_zone: sk_info.availability_zone, standby_horizon: 0, }; // note this is a blocking call client .publish_one(TypedMessage { r#type: MessageType::SafekeeperDiscoveryResponse as i32, safekeeper_timeline_info: None, safekeeper_discovery_request: None, safekeeper_discovery_response: Some(response), }) .await?; } } _ => { warn!( "unexpected message type i32 {}, {:?}", typed_msg.r#type, typed_msg.r#type() ); } } } bail!("end of stream"); } pub async fn task_main( conf: Arc, global_timelines: Arc, ) -> anyhow::Result<()> { info!("started, broker endpoint {:?}", conf.broker_endpoint); let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC)); let mut push_handle: Option>> = None; let mut pull_handle: Option>> = None; let mut discover_handle: Option>> = None; let stats = Arc::new(BrokerStats::new()); let stats_task = task_stats(stats.clone()); tokio::pin!(stats_task); // Selecting on JoinHandles requires some squats; is there a better way to // reap tasks individually? // Handling failures in task itself won't catch panic and in Tokio, task's // panic doesn't kill the whole executor, so it is better to do reaping // here. loop { tokio::select! 
{ res = async { push_handle.as_mut().unwrap().await }, if push_handle.is_some() => { // was it panic or normal error? let err = match res { Ok(res_internal) => res_internal.unwrap_err(), Err(err_outer) => err_outer.into(), }; warn!("push task failed: {:?}", err); push_handle = None; }, res = async { pull_handle.as_mut().unwrap().await }, if pull_handle.is_some() => { // was it panic or normal error? match res { Ok(res_internal) => if let Err(err_inner) = res_internal { warn!("pull task failed: {:?}", err_inner); } Err(err_outer) => { warn!("pull task panicked: {:?}", err_outer) } }; pull_handle = None; }, res = async { discover_handle.as_mut().unwrap().await }, if discover_handle.is_some() => { // was it panic or normal error? match res { Ok(res_internal) => if let Err(err_inner) = res_internal { warn!("discover task failed: {:?}", err_inner); } Err(err_outer) => { warn!("discover task panicked: {:?}", err_outer) } }; discover_handle = None; }, _ = ticker.tick() => { if push_handle.is_none() { push_handle = Some(tokio::spawn(push_loop(conf.clone(), global_timelines.clone()))); } if pull_handle.is_none() { pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), global_timelines.clone(), stats.clone()))); } if discover_handle.is_none() { discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), global_timelines.clone(), stats.clone()))); } }, _ = &mut stats_task => {} } } } struct BrokerStats { /// Timestamp of the last received message from the broker. last_pulled_ts: AtomicU64, } impl BrokerStats { fn new() -> Self { BrokerStats { last_pulled_ts: AtomicU64::new(0), } } fn now_millis() -> u64 { std::time::SystemTime::now() .duration_since(UNIX_EPOCH) .expect("time is before epoch") .as_millis() as u64 } /// Update last_pulled timestamp to current time. fn update_pulled(&self) { self.last_pulled_ts .store(Self::now_millis(), std::sync::atomic::Ordering::Relaxed); } } /// Periodically write to logs if there are issues with receiving data from the broker. 
async fn task_stats(stats: Arc) { let warn_duration = Duration::from_secs(10); let mut ticker = tokio::time::interval(warn_duration); loop { tokio::select! { _ = ticker.tick() => { let last_pulled = stats.last_pulled_ts.load(std::sync::atomic::Ordering::SeqCst); if last_pulled == 0 { // no broker updates yet continue; } let now = BrokerStats::now_millis(); if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 { let ts = chrono::DateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp"); info!("no broker updates for some time, last update: {:?}", ts); } } } } } ================================================ FILE: safekeeper/src/control_file.rs ================================================ [File too large to display: 10.2 KB] ================================================ FILE: safekeeper/src/control_file_upgrade.rs ================================================ [File too large to display: 33.2 KB] ================================================ FILE: safekeeper/src/copy_timeline.rs ================================================ [File too large to display: 7.9 KB] ================================================ FILE: safekeeper/src/debug_dump.rs ================================================ [File too large to display: 11.3 KB] ================================================ FILE: safekeeper/src/hadron.rs ================================================ [File too large to display: 15.9 KB] ================================================ FILE: safekeeper/src/handler.rs ================================================ [File too large to display: 20.9 KB] ================================================ FILE: safekeeper/src/http/mod.rs ================================================ [File too large to display: 1.4 KB] ================================================ FILE: safekeeper/src/http/openapi_spec.yaml ================================================ [File too large to display: 9.3 KB] 
================================================ FILE: safekeeper/src/http/routes.rs ================================================ [File too large to display: 30.0 KB] ================================================ FILE: safekeeper/src/lib.rs ================================================ [File too large to display: 9.5 KB] ================================================ FILE: safekeeper/src/metrics.rs ================================================ [File too large to display: 36.0 KB] ================================================ FILE: safekeeper/src/patch_control_file.rs ================================================ [File too large to display: 2.5 KB] ================================================ FILE: safekeeper/src/pull_timeline.rs ================================================ [File too large to display: 30.7 KB] ================================================ FILE: safekeeper/src/rate_limit.rs ================================================ [File too large to display: 1.7 KB] ================================================ FILE: safekeeper/src/receive_wal.rs ================================================ [File too large to display: 25.1 KB] ================================================ FILE: safekeeper/src/recovery.rs ================================================ [File too large to display: 19.9 KB] ================================================ FILE: safekeeper/src/remove_wal.rs ================================================ [File too large to display: 1.4 KB] ================================================ FILE: safekeeper/src/safekeeper.rs ================================================ [File too large to display: 65.6 KB] ================================================ FILE: safekeeper/src/send_interpreted_wal.rs ================================================ [File too large to display: 45.9 KB] ================================================ FILE: safekeeper/src/send_wal.rs 
================================================ [File too large to display: 41.0 KB] ================================================ FILE: safekeeper/src/state.rs ================================================ [File too large to display: 10.8 KB] ================================================ FILE: safekeeper/src/test_utils.rs ================================================ [File too large to display: 6.5 KB] ================================================ FILE: safekeeper/src/timeline.rs ================================================ [File too large to display: 47.7 KB] ================================================ FILE: safekeeper/src/timeline_eviction.rs ================================================ [File too large to display: 14.1 KB] ================================================ FILE: safekeeper/src/timeline_guard.rs ================================================ [File too large to display: 2.6 KB] ================================================ FILE: safekeeper/src/timeline_manager.rs ================================================ [File too large to display: 31.6 KB] ================================================ FILE: safekeeper/src/timelines_global_map.rs ================================================ [File too large to display: 29.8 KB] ================================================ FILE: safekeeper/src/timelines_set.rs ================================================ [File too large to display: 2.6 KB] ================================================ FILE: safekeeper/src/wal_backup.rs ================================================ [File too large to display: 27.5 KB] ================================================ FILE: safekeeper/src/wal_backup_partial.rs ================================================ [File too large to display: 20.2 KB] ================================================ FILE: safekeeper/src/wal_reader_stream.rs ================================================ [File too large to display: 8.7 KB] 
================================================ FILE: safekeeper/src/wal_service.rs ================================================ [File too large to display: 4.6 KB] ================================================ FILE: safekeeper/src/wal_storage.rs ================================================ [File too large to display: 33.3 KB] ================================================ FILE: safekeeper/tests/misc_test.rs ================================================ [File too large to display: 4.6 KB] ================================================ FILE: safekeeper/tests/random_test.rs ================================================ [File too large to display: 1.7 KB] ================================================ FILE: safekeeper/tests/simple_test.rs ================================================ [File too large to display: 1.3 KB] ================================================ FILE: safekeeper/tests/walproposer_sim/block_storage.rs ================================================ [File too large to display: 2.0 KB] ================================================ FILE: safekeeper/tests/walproposer_sim/log.rs ================================================ [File too large to display: 2.2 KB] ================================================ FILE: safekeeper/tests/walproposer_sim/mod.rs ================================================ [File too large to display: 177 B] ================================================ FILE: safekeeper/tests/walproposer_sim/safekeeper.rs ================================================ [File too large to display: 14.4 KB] ================================================ FILE: safekeeper/tests/walproposer_sim/safekeeper_disk.rs ================================================ [File too large to display: 8.4 KB] ================================================ FILE: safekeeper/tests/walproposer_sim/simulation.rs ================================================ [File too large to display: 13.0 KB] 
================================================ FILE: safekeeper/tests/walproposer_sim/simulation_logs.rs ================================================ [File too large to display: 5.8 KB] ================================================ FILE: safekeeper/tests/walproposer_sim/walproposer_api.rs ================================================ [File too large to display: 21.8 KB] ================================================ FILE: safekeeper/tests/walproposer_sim/walproposer_disk.rs ================================================ [File too large to display: 2.1 KB] ================================================ FILE: scripts/benchmark_durations.py ================================================ [File too large to display: 9.2 KB] ================================================ FILE: scripts/check_allowed_errors.sh ================================================ [File too large to display: 566 B] ================================================ FILE: scripts/comment-test-report.js ================================================ [File too large to display: 14.2 KB] ================================================ FILE: scripts/coverage ================================================ [File too large to display: 20.2 KB] ================================================ FILE: scripts/download_basebackup.py ================================================ [File too large to display: 1.6 KB] ================================================ FILE: scripts/force_layer_download.py ================================================ [File too large to display: 10.6 KB] ================================================ FILE: scripts/generate_and_push_perf_report.sh ================================================ [File too large to display: 451 B] ================================================ FILE: scripts/ingest_perf_test_result.py ================================================ [File too large to display: 4.6 KB] ================================================ 
FILE: scripts/ingest_regress_test_result-new-format.py ================================================ [File too large to display: 6.6 KB] ================================================ FILE: scripts/ninstall.sh ================================================ [File too large to display: 1.1 KB] ================================================ FILE: scripts/perf_report_template.html ================================================ [File too large to display: 1.3 KB] ================================================ FILE: scripts/proxy_bench_results_ingest.py ================================================ [File too large to display: 5.7 KB] ================================================ FILE: scripts/ps_ec2_setup_instance_store ================================================ [File too large to display: 2.0 KB] ================================================ FILE: scripts/pysync ================================================ [File too large to display: 615 B] ================================================ FILE: scripts/pytest ================================================ [File too large to display: 287 B] ================================================ FILE: scripts/reformat ================================================ [File too large to display: 268 B] ================================================ FILE: scripts/sk_cleanup_tenants/readme.md ================================================ [File too large to display: 2.0 KB] ================================================ FILE: scripts/sk_cleanup_tenants/remote.yaml ================================================ [File too large to display: 2.0 KB] ================================================ FILE: scripts/sk_cleanup_tenants/script.py ================================================ [File too large to display: 3.9 KB] ================================================ FILE: scripts/sk_collect_dumps/.gitignore ================================================ [File too large to display: 32 B] 
================================================ FILE: scripts/sk_collect_dumps/ansible.cfg ================================================ [File too large to display: 205 B] ================================================ FILE: scripts/sk_collect_dumps/pyproject.toml ================================================ [File too large to display: 337 B] ================================================ FILE: scripts/sk_collect_dumps/readme.md ================================================ [File too large to display: 1.9 KB] ================================================ FILE: scripts/sk_collect_dumps/remote.yaml ================================================ [File too large to display: 1.3 KB] ================================================ FILE: scripts/sk_collect_dumps/ssh.cfg ================================================ [File too large to display: 545 B] ================================================ FILE: scripts/sk_collect_dumps/upload.sh ================================================ [File too large to display: 2.1 KB] ================================================ FILE: storage_broker/Cargo.toml ================================================ [File too large to display: 971 B] ================================================ FILE: storage_broker/benches/rps.rs ================================================ [File too large to display: 6.0 KB] ================================================ FILE: storage_broker/build.rs ================================================ [File too large to display: 513 B] ================================================ FILE: storage_broker/proto/.gitignore ================================================ [File too large to display: 44 B] ================================================ FILE: storage_broker/proto/broker.proto ================================================ [File too large to display: 3.5 KB] ================================================ FILE: storage_broker/src/bin/storage_broker.rs 
================================================ [File too large to display: 32.7 KB] ================================================ FILE: storage_broker/src/lib.rs ================================================ [File too large to display: 3.8 KB] ================================================ FILE: storage_broker/src/metrics.rs ================================================ [File too large to display: 1.9 KB] ================================================ FILE: storage_controller/Cargo.toml ================================================ [File too large to display: 2.0 KB] ================================================ FILE: storage_controller/client/Cargo.toml ================================================ [File too large to display: 280 B] ================================================ FILE: storage_controller/client/src/control_api.rs ================================================ [File too large to display: 1.5 KB] ================================================ FILE: storage_controller/client/src/lib.rs ================================================ [File too large to display: 21 B] ================================================ FILE: storage_controller/migrations/.keep ================================================ ================================================ FILE: storage_controller/migrations/00000000000000_diesel_initial_setup/down.sql ================================================ [File too large to display: 328 B] ================================================ FILE: storage_controller/migrations/00000000000000_diesel_initial_setup/up.sql ================================================ [File too large to display: 1.1 KB] ================================================ FILE: storage_controller/migrations/2024-01-07-211257_create_tenant_shards/down.sql ================================================ [File too large to display: 26 B] ================================================ FILE: 
storage_controller/migrations/2024-01-07-211257_create_tenant_shards/up.sql ================================================ [File too large to display: 433 B] ================================================ FILE: storage_controller/migrations/2024-01-07-212945_create_nodes/down.sql ================================================ [File too large to display: 18 B] ================================================ FILE: storage_controller/migrations/2024-01-07-212945_create_nodes/up.sql ================================================ [File too large to display: 245 B] ================================================ FILE: storage_controller/migrations/2024-02-29-094122_generations_null/down.sql ================================================ [File too large to display: 125 B] ================================================ FILE: storage_controller/migrations/2024-02-29-094122_generations_null/up.sql ================================================ [File too large to display: 128 B] ================================================ FILE: storage_controller/migrations/2024-03-18-184429_rename_policy/down.sql ================================================ [File too large to display: 195 B] ================================================ FILE: storage_controller/migrations/2024-03-18-184429_rename_policy/up.sql ================================================ [File too large to display: 195 B] ================================================ FILE: storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql ================================================ [File too large to display: 96 B] ================================================ FILE: storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql ================================================ [File too large to display: 86 B] ================================================ FILE: storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql 
================================================ [File too large to display: 27 B] ================================================ FILE: storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql ================================================ [File too large to display: 546 B] ================================================ FILE: storage_controller/migrations/2024-07-26-140924_create_leader/down.sql ================================================ [File too large to display: 24 B] ================================================ FILE: storage_controller/migrations/2024-07-26-140924_create_leader/up.sql ================================================ [File too large to display: 128 B] ================================================ FILE: storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql ================================================ [File too large to display: 70 B] ================================================ FILE: storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql ================================================ [File too large to display: 569 B] ================================================ FILE: storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql ================================================ [File too large to display: 81 B] ================================================ FILE: storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql ================================================ [File too large to display: 88 B] ================================================ FILE: storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql ================================================ [File too large to display: 45 B] ================================================ FILE: storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql ================================================ [File too large to display: 52 B] 
================================================ FILE: storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql ================================================ [File too large to display: 60 B] ================================================ FILE: storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql ================================================ [File too large to display: 59 B] ================================================ FILE: storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql ================================================ [File too large to display: 48 B] ================================================ FILE: storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql ================================================ [File too large to display: 55 B] ================================================ FILE: storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/down.sql ================================================ [File too large to display: 48 B] ================================================ FILE: storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/up.sql ================================================ [File too large to display: 83 B] ================================================ FILE: storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql ================================================ [File too large to display: 277 B] ================================================ FILE: storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql ================================================ [File too large to display: 37 B] ================================================ FILE: storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql ================================================ [File too large to display: 168 B] 
================================================ FILE: storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql ================================================ [File too large to display: 165 B] ================================================ FILE: storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql ================================================ [File too large to display: 42 B] ================================================ FILE: storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql ================================================ [File too large to display: 49 B] ================================================ FILE: storage_controller/migrations/2025-02-14-160526_safekeeper_timelines/down.sql ================================================ [File too large to display: 66 B] ================================================ FILE: storage_controller/migrations/2025-02-14-160526_safekeeper_timelines/up.sql ================================================ [File too large to display: 553 B] ================================================ FILE: storage_controller/migrations/2025-02-28-141741_safekeeper_use_https/down.sql ================================================ [File too large to display: 41 B] ================================================ FILE: storage_controller/migrations/2025-02-28-141741_safekeeper_use_https/up.sql ================================================ [File too large to display: 48 B] ================================================ FILE: storage_controller/migrations/2025-03-18-103700_timeline_imports/down.sql ================================================ [File too large to display: 29 B] ================================================ FILE: storage_controller/migrations/2025-03-18-103700_timeline_imports/up.sql ================================================ [File too large to display: 168 B] ================================================ FILE: 
storage_controller/migrations/2025-06-01-201442_add_lifecycle_to_nodes/down.sql ================================================ [File too large to display: 41 B] ================================================ FILE: storage_controller/migrations/2025-06-01-201442_add_lifecycle_to_nodes/up.sql ================================================ [File too large to display: 74 B] ================================================ FILE: storage_controller/migrations/2025-06-17-082247_pageserver_grpc_addr/down.sql ================================================ [File too large to display: 59 B] ================================================ FILE: storage_controller/migrations/2025-06-17-082247_pageserver_grpc_addr/up.sql ================================================ [File too large to display: 88 B] ================================================ FILE: storage_controller/migrations/2025-07-02-170751_safekeeper_default_no_pause/down.sql ================================================ [File too large to display: 75 B] ================================================ FILE: storage_controller/migrations/2025-07-02-170751_safekeeper_default_no_pause/up.sql ================================================ [File too large to display: 80 B] ================================================ FILE: storage_controller/migrations/2025-07-08-114340_sk_set_notified_generation/down.sql ================================================ [File too large to display: 55 B] ================================================ FILE: storage_controller/migrations/2025-07-08-114340_sk_set_notified_generation/up.sql ================================================ [File too large to display: 81 B] ================================================ FILE: storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/down.sql ================================================ [File too large to display: 71 B] ================================================ FILE: 
storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/up.sql ================================================ [File too large to display: 621 B] ================================================ FILE: storage_controller/src/auth.rs ================================================ [File too large to display: 682 B] ================================================ FILE: storage_controller/src/background_node_operations.rs ================================================ [File too large to display: 1.8 KB] ================================================ FILE: storage_controller/src/compute_hook.rs ================================================ [File too large to display: 45.2 KB] ================================================ FILE: storage_controller/src/hadron_utils.rs ================================================ [File too large to display: 1.2 KB] ================================================ FILE: storage_controller/src/heartbeater.rs ================================================ [File too large to display: 15.2 KB] ================================================ FILE: storage_controller/src/http.rs ================================================ [File too large to display: 88.9 KB] ================================================ FILE: storage_controller/src/id_lock_map.rs ================================================ [File too large to display: 6.8 KB] ================================================ FILE: storage_controller/src/leadership.rs ================================================ [File too large to display: 5.7 KB] ================================================ FILE: storage_controller/src/lib.rs ================================================ [File too large to display: 1.2 KB] ================================================ FILE: storage_controller/src/main.rs ================================================ [File too large to display: 24.1 KB] ================================================ FILE: 
storage_controller/src/metrics.rs ================================================ [File too large to display: 16.0 KB] ================================================ FILE: storage_controller/src/node.rs ================================================ [File too large to display: 16.9 KB] ================================================ FILE: storage_controller/src/operation_utils.rs ================================================ [File too large to display: 6.9 KB] ================================================ FILE: storage_controller/src/pageserver_client.rs ================================================ [File too large to display: 13.5 KB] ================================================ FILE: storage_controller/src/peer_client.rs ================================================ [File too large to display: 3.6 KB] ================================================ FILE: storage_controller/src/persistence/split_state.rs ================================================ [File too large to display: 1.3 KB] ================================================ FILE: storage_controller/src/persistence.rs ================================================ [File too large to display: 106.0 KB] ================================================ FILE: storage_controller/src/reconciler.rs ================================================ [File too large to display: 51.4 KB] ================================================ FILE: storage_controller/src/safekeeper.rs ================================================ [File too large to display: 6.6 KB] ================================================ FILE: storage_controller/src/safekeeper_client.rs ================================================ [File too large to display: 4.9 KB] ================================================ FILE: storage_controller/src/scheduler.rs ================================================ [File too large to display: 56.3 KB] ================================================ FILE: 
storage_controller/src/schema.rs ================================================ [File too large to display: 3.3 KB] ================================================ FILE: storage_controller/src/service/chaos_injector.rs ================================================ [File too large to display: 10.9 KB] ================================================ FILE: storage_controller/src/service/feature_flag.rs ================================================ [File too large to display: 3.8 KB] ================================================ FILE: storage_controller/src/service/safekeeper_reconciler.rs ================================================ [File too large to display: 22.3 KB] ================================================ FILE: storage_controller/src/service/safekeeper_service.rs ================================================ [File too large to display: 63.5 KB] ================================================ FILE: storage_controller/src/service/tenant_shard_iterator.rs ================================================ [File too large to display: 8.9 KB] ================================================ FILE: storage_controller/src/service.rs ================================================ [File too large to display: 433.5 KB] ================================================ FILE: storage_controller/src/tenant_shard.rs ================================================ [File too large to display: 130.2 KB] ================================================ FILE: storage_controller/src/timeline_import.rs ================================================ [File too large to display: 8.6 KB] ================================================ FILE: storage_scrubber/Cargo.toml ================================================ [File too large to display: 1.2 KB] ================================================ FILE: storage_scrubber/README.md ================================================ [File too large to display: 6.0 KB] 
================================================ FILE: storage_scrubber/src/checks.rs ================================================ [File too large to display: 26.6 KB] ================================================ FILE: storage_scrubber/src/cloud_admin_api.rs ================================================ [File too large to display: 17.9 KB] ================================================ FILE: storage_scrubber/src/find_large_objects.rs ================================================ [File too large to display: 3.7 KB] ================================================ FILE: storage_scrubber/src/garbage.rs ================================================ [File too large to display: 23.2 KB] ================================================ FILE: storage_scrubber/src/lib.rs ================================================ [File too large to display: 18.7 KB] ================================================ FILE: storage_scrubber/src/main.rs ================================================ [File too large to display: 15.1 KB] ================================================ FILE: storage_scrubber/src/metadata_stream.rs ================================================ [File too large to display: 6.1 KB] ================================================ FILE: storage_scrubber/src/pageserver_physical_gc.rs ================================================ [File too large to display: 31.3 KB] ================================================ FILE: storage_scrubber/src/scan_pageserver_metadata.rs ================================================ [File too large to display: 13.5 KB] ================================================ FILE: storage_scrubber/src/scan_safekeeper_metadata.rs ================================================ [File too large to display: 10.0 KB] ================================================ FILE: storage_scrubber/src/tenant_snapshot.rs ================================================ [File too large to display: 11.1 KB] 
================================================ FILE: test_runner/README.md ================================================ [File too large to display: 12.2 KB] ================================================ FILE: test_runner/bin/neon_local_create_deep_l0_stack.py ================================================ [File too large to display: 2.3 KB] ================================================ FILE: test_runner/cloud_regress/README.md ================================================ [File too large to display: 1.2 KB] ================================================ FILE: test_runner/cloud_regress/test_cloud_regress.py ================================================ [File too large to display: 1.5 KB] ================================================ FILE: test_runner/conftest.py ================================================ [File too large to display: 518 B] ================================================ FILE: test_runner/fixtures/__init__.py ================================================ [File too large to display: 35 B] ================================================ FILE: test_runner/fixtures/auth_tokens.py ================================================ [File too large to display: 1.5 KB] ================================================ FILE: test_runner/fixtures/benchmark_fixture.py ================================================ [File too large to display: 19.0 KB] ================================================ FILE: test_runner/fixtures/common_types.py ================================================ [File too large to display: 7.4 KB] ================================================ FILE: test_runner/fixtures/compare_fixtures.py ================================================ [File too large to display: 11.9 KB] ================================================ FILE: test_runner/fixtures/compute_migrations.py ================================================ [File too large to display: 876 B] 
================================================ FILE: test_runner/fixtures/compute_reconfigure.py ================================================ [File too large to display: 3.0 KB] ================================================ FILE: test_runner/fixtures/endpoint/__init__.py ================================================ [File too large to display: 35 B] ================================================ FILE: test_runner/fixtures/endpoint/http.py ================================================ [File too large to display: 6.7 KB] ================================================ FILE: test_runner/fixtures/fast_import.py ================================================ [File too large to display: 7.4 KB] ================================================ FILE: test_runner/fixtures/h2server.py ================================================ [File too large to display: 7.0 KB] ================================================ FILE: test_runner/fixtures/httpserver.py ================================================ [File too large to display: 1.4 KB] ================================================ FILE: test_runner/fixtures/log_helper.py ================================================ [File too large to display: 1.1 KB] ================================================ FILE: test_runner/fixtures/metrics.py ================================================ [File too large to display: 7.9 KB] ================================================ FILE: test_runner/fixtures/neon_api.py ================================================ [File too large to display: 15.7 KB] ================================================ FILE: test_runner/fixtures/neon_cli.py ================================================ [File too large to display: 24.6 KB] ================================================ FILE: test_runner/fixtures/neon_fixtures.py ================================================ [File too large to display: 230.9 KB] ================================================ FILE: 
test_runner/fixtures/overlayfs.py ================================================ [File too large to display: 569 B] ================================================ FILE: test_runner/fixtures/pageserver/__init__.py ================================================ [File too large to display: 35 B] ================================================ FILE: test_runner/fixtures/pageserver/allowed_errors.py ================================================ [File too large to display: 9.9 KB] ================================================ FILE: test_runner/fixtures/pageserver/common_types.py ================================================ [File too large to display: 3.9 KB] ================================================ FILE: test_runner/fixtures/pageserver/http.py ================================================ [File too large to display: 42.3 KB] ================================================ FILE: test_runner/fixtures/pageserver/makelayers/__init__.py ================================================ ================================================ FILE: test_runner/fixtures/pageserver/makelayers/l0stack.py ================================================ [File too large to display: 5.6 KB] ================================================ FILE: test_runner/fixtures/pageserver/many_tenants.py ================================================ [File too large to display: 2.8 KB] ================================================ FILE: test_runner/fixtures/pageserver/remote_storage.py ================================================ [File too large to display: 4.4 KB] ================================================ FILE: test_runner/fixtures/pageserver/utils.py ================================================ [File too large to display: 14.8 KB] ================================================ FILE: test_runner/fixtures/parametrize.py ================================================ [File too large to display: 4.2 KB] ================================================ 
FILE: test_runner/fixtures/paths.py ================================================ [File too large to display: 10.4 KB] ================================================ FILE: test_runner/fixtures/pg_config.py ================================================ [File too large to display: 7.6 KB] ================================================ FILE: test_runner/fixtures/pg_stats.py ================================================ [File too large to display: 1.5 KB] ================================================ FILE: test_runner/fixtures/pg_version.py ================================================ [File too large to display: 1.7 KB] ================================================ FILE: test_runner/fixtures/port_distributor.py ================================================ [File too large to display: 3.0 KB] ================================================ FILE: test_runner/fixtures/remote_storage.py ================================================ [File too large to display: 17.5 KB] ================================================ FILE: test_runner/fixtures/reruns.py ================================================ [File too large to display: 1.1 KB] ================================================ FILE: test_runner/fixtures/safekeeper/__init__.py ================================================ [File too large to display: 35 B] ================================================ FILE: test_runner/fixtures/safekeeper/http.py ================================================ [File too large to display: 13.3 KB] ================================================ FILE: test_runner/fixtures/safekeeper/utils.py ================================================ [File too large to display: 786 B] ================================================ FILE: test_runner/fixtures/safekeeper_utils.py ================================================ [File too large to display: 3.1 KB] ================================================ FILE: test_runner/fixtures/slow.py 
================================================ [File too large to display: 1.1 KB] ================================================ FILE: test_runner/fixtures/storage_controller_proxy.py ================================================ [File too large to display: 2.3 KB] ================================================ FILE: test_runner/fixtures/utils.py ================================================ [File too large to display: 26.4 KB] ================================================ FILE: test_runner/fixtures/workload.py ================================================ [File too large to display: 8.1 KB] ================================================ FILE: test_runner/logical_repl/README.md ================================================ [File too large to display: 944 B] ================================================ FILE: test_runner/logical_repl/clickhouse/docker-compose.yml ================================================ [File too large to display: 296 B] ================================================ FILE: test_runner/logical_repl/debezium/docker-compose.yml ================================================ [File too large to display: 1.1 KB] ================================================ FILE: test_runner/logical_repl/test_clickhouse.py ================================================ [File too large to display: 2.8 KB] ================================================ FILE: test_runner/logical_repl/test_debezium.py ================================================ [File too large to display: 6.3 KB] ================================================ FILE: test_runner/performance/README.md ================================================ [File too large to display: 3.5 KB] ================================================ FILE: test_runner/performance/__init__.py ================================================ [File too large to display: 35 B] ================================================ FILE: 
test_runner/performance/benchbase_tpc_c_helpers/generate_diagrams.py ================================================ [File too large to display: 4.5 KB] ================================================ FILE: test_runner/performance/benchbase_tpc_c_helpers/generate_workload_size.py ================================================ [File too large to display: 11.9 KB] ================================================ FILE: test_runner/performance/benchbase_tpc_c_helpers/upload_results_to_perf_test_results.py ================================================ [File too large to display: 22.4 KB] ================================================ FILE: test_runner/performance/large_synthetic_oltp/IUD_one_transaction.sql ================================================ [File too large to display: 5.2 KB] ================================================ FILE: test_runner/performance/large_synthetic_oltp/grow_action_blocks.sql ================================================ [File too large to display: 710 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/grow_action_kwargs.sql ================================================ [File too large to display: 667 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/grow_device_fingerprint_event.sql ================================================ [File too large to display: 2.6 KB] ================================================ FILE: test_runner/performance/large_synthetic_oltp/grow_edges.sql ================================================ [File too large to display: 719 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/grow_hotel_rate_mapping.sql ================================================ [File too large to display: 688 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/grow_ocr_pipeline_results_version.sql 
================================================ [File too large to display: 1.6 KB] ================================================ FILE: test_runner/performance/large_synthetic_oltp/grow_priceline_raw_response.sql ================================================ [File too large to display: 1010 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/grow_relabled_transactions.sql ================================================ [File too large to display: 905 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/grow_state_values.sql ================================================ [File too large to display: 590 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/grow_values.sql ================================================ [File too large to display: 1.1 KB] ================================================ FILE: test_runner/performance/large_synthetic_oltp/grow_vertices.sql ================================================ [File too large to display: 977 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/insert_webhooks.sql ================================================ [File too large to display: 1.6 KB] ================================================ FILE: test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql ================================================ [File too large to display: 487 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/select_prefetch_webhook.sql ================================================ [File too large to display: 895 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql ================================================ [File too large to display: 310 B] 
================================================ FILE: test_runner/performance/large_synthetic_oltp/update_accounting_coding_body_tracking_category_selection.sql ================================================ [File too large to display: 365 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_action_blocks.sql ================================================ [File too large to display: 274 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_action_kwargs.sql ================================================ [File too large to display: 248 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_denormalized_approval_workflow.sql ================================================ [File too large to display: 312 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_device_fingerprint_event.sql ================================================ [File too large to display: 304 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_edges.sql ================================================ [File too large to display: 226 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_heron_transaction_enriched_log.sql ================================================ [File too large to display: 295 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_heron_transaction_enrichment_requests.sql ================================================ [File too large to display: 319 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_hotel_rate_mapping.sql ================================================ [File too large to display: 283 B] 
================================================ FILE: test_runner/performance/large_synthetic_oltp/update_incoming_webhooks.sql ================================================ [File too large to display: 280 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_manual_transaction.sql ================================================ [File too large to display: 263 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_ml_receipt_matching_log.sql ================================================ [File too large to display: 299 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_ocr_pipeine_results_version.sql ================================================ [File too large to display: 294 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_orc_pipeline_step_results.sql ================================================ [File too large to display: 283 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_orc_pipeline_step_results_version.sql ================================================ [File too large to display: 308 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_priceline_raw_response.sql ================================================ [File too large to display: 291 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_quickbooks_transactions.sql ================================================ [File too large to display: 288 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_raw_finicity_transaction.sql ================================================ [File too large to display: 369 B] 
================================================ FILE: test_runner/performance/large_synthetic_oltp/update_relabeled_transactions.sql ================================================ [File too large to display: 267 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_state_values.sql ================================================ [File too large to display: 251 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_stripe_authorization_event_log.sql ================================================ [File too large to display: 298 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_transaction.sql ================================================ [File too large to display: 253 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_values.sql ================================================ [File too large to display: 264 B] ================================================ FILE: test_runner/performance/large_synthetic_oltp/update_vertices.sql ================================================ [File too large to display: 254 B] ================================================ FILE: test_runner/performance/many_relations/create_many_relations.sql ================================================ [File too large to display: 8.0 KB] ================================================ FILE: test_runner/performance/out_dir_to_csv.py ================================================ [File too large to display: 1.5 KB] ================================================ FILE: test_runner/performance/pageserver/README.md ================================================ [File too large to display: 842 B] ================================================ FILE: test_runner/performance/pageserver/__init__.py ================================================ [File too 
large to display: 35 B] ================================================ FILE: test_runner/performance/pageserver/interactive/__init__.py ================================================ [File too large to display: 339 B] ================================================ FILE: test_runner/performance/pageserver/interactive/test_many_small_tenants.py ================================================ [File too large to display: 2.5 KB] ================================================ FILE: test_runner/performance/pageserver/pagebench/__init__.py ================================================ [File too large to display: 387 B] ================================================ FILE: test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py ================================================ [File too large to display: 5.7 KB] ================================================ FILE: test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py ================================================ [File too large to display: 5.7 KB] ================================================ FILE: test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py ================================================ [File too large to display: 10.3 KB] ================================================ FILE: test_runner/performance/pageserver/test_page_service_batching.py ================================================ [File too large to display: 16.0 KB] ================================================ FILE: test_runner/performance/pageserver/util.py ================================================ [File too large to display: 1.9 KB] ================================================ FILE: test_runner/performance/pgvector/HNSW_build.sql ================================================ [File too large to display: 1.7 KB] ================================================ FILE: test_runner/performance/pgvector/IVFFLAT_build.sql 
================================================ [File too large to display: 1.8 KB] ================================================ FILE: test_runner/performance/pgvector/README.md ================================================ [File too large to display: 1.3 KB] ================================================ FILE: test_runner/performance/pgvector/halfvec_build.sql ================================================ [File too large to display: 459 B] ================================================ FILE: test_runner/performance/pgvector/loaddata.py ================================================ [File too large to display: 2.2 KB] ================================================ FILE: test_runner/performance/pgvector/pgbench_custom_script_pgvector_halfvec_queries.sql ================================================ [File too large to display: 424 B] ================================================ FILE: test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql ================================================ [File too large to display: 208 B] ================================================ FILE: test_runner/performance/test_branch_creation.py ================================================ [File too large to display: 9.0 KB] ================================================ FILE: test_runner/performance/test_branching.py ================================================ [File too large to display: 3.9 KB] ================================================ FILE: test_runner/performance/test_bulk_insert.py ================================================ [File too large to display: 3.2 KB] ================================================ FILE: test_runner/performance/test_bulk_tenant_create.py ================================================ [File too large to display: 1.4 KB] ================================================ FILE: test_runner/performance/test_bulk_update.py ================================================ [File too large to 
display: 2.5 KB] ================================================ FILE: test_runner/performance/test_compaction.py ================================================ [File too large to display: 6.7 KB] ================================================ FILE: test_runner/performance/test_compare_pg_stats.py ================================================ [File too large to display: 4.2 KB] ================================================ FILE: test_runner/performance/test_compute_ctl_api.py ================================================ [File too large to display: 2.0 KB] ================================================ FILE: test_runner/performance/test_compute_startup.py ================================================ [File too large to display: 8.5 KB] ================================================ FILE: test_runner/performance/test_copy.py ================================================ [File too large to display: 2.9 KB] ================================================ FILE: test_runner/performance/test_cumulative_statistics_persistence.py ================================================ [File too large to display: 9.2 KB] ================================================ FILE: test_runner/performance/test_dup_key.py ================================================ [File too large to display: 1.9 KB] ================================================ FILE: test_runner/performance/test_gc_feedback.py ================================================ [File too large to display: 8.0 KB] ================================================ FILE: test_runner/performance/test_gist_build.py ================================================ [File too large to display: 1.2 KB] ================================================ FILE: test_runner/performance/test_hot_page.py ================================================ [File too large to display: 2.1 KB] ================================================ FILE: test_runner/performance/test_hot_table.py 
================================================ [File too large to display: 2.1 KB] ================================================ FILE: test_runner/performance/test_ingest_insert_bulk.py ================================================ [File too large to display: 6.9 KB] ================================================ FILE: test_runner/performance/test_ingest_logical_message.py ================================================ [File too large to display: 4.5 KB] ================================================ FILE: test_runner/performance/test_latency.py ================================================ [File too large to display: 1.3 KB] ================================================ FILE: test_runner/performance/test_layer_map.py ================================================ [File too large to display: 1.9 KB] ================================================ FILE: test_runner/performance/test_lfc_prewarm.py ================================================ [File too large to display: 6.5 KB] ================================================ FILE: test_runner/performance/test_logical_replication.py ================================================ [File too large to display: 15.4 KB] ================================================ FILE: test_runner/performance/test_parallel_copy.py ================================================ [File too large to display: 1.9 KB] ================================================ FILE: test_runner/performance/test_parallel_copy_to.py ================================================ [File too large to display: 2.5 KB] ================================================ FILE: test_runner/performance/test_perf_ingest_using_pgcopydb.py ================================================ [File too large to display: 9.5 KB] ================================================ FILE: test_runner/performance/test_perf_many_relations.py ================================================ [File too large to display: 5.9 KB] 
================================================ FILE: test_runner/performance/test_perf_olap.py ================================================ [File too large to display: 19.1 KB] ================================================ FILE: test_runner/performance/test_perf_oltp_large_tenant.py ================================================ [File too large to display: 7.0 KB] ================================================ FILE: test_runner/performance/test_perf_pgbench.py ================================================ [File too large to display: 8.2 KB] ================================================ FILE: test_runner/performance/test_perf_pgvector_queries.py ================================================ [File too large to display: 1.3 KB] ================================================ FILE: test_runner/performance/test_physical_replication.py ================================================ [File too large to display: 12.0 KB] ================================================ FILE: test_runner/performance/test_random_writes.py ================================================ [File too large to display: 3.9 KB] ================================================ FILE: test_runner/performance/test_seqscans.py ================================================ [File too large to display: 2.6 KB] ================================================ FILE: test_runner/performance/test_sharded_ingest.py ================================================ [File too large to display: 4.4 KB] ================================================ FILE: test_runner/performance/test_sharding_autosplit.py ================================================ [File too large to display: 12.1 KB] ================================================ FILE: test_runner/performance/test_storage_controller_scale.py ================================================ [File too large to display: 24.5 KB] ================================================ FILE: test_runner/performance/test_wal_backpressure.py 
================================================ [File too large to display: 9.9 KB] ================================================ FILE: test_runner/performance/test_write_amplification.py ================================================ [File too large to display: 3.2 KB] ================================================ FILE: test_runner/performance/tpc-h/create-indexes.sql ================================================ [File too large to display: 1.9 KB] ================================================ FILE: test_runner/performance/tpc-h/create-schema.sql ================================================ [File too large to display: 3.7 KB] ================================================ FILE: test_runner/performance/tpc-h/queries/1.sql ================================================ [File too large to display: 624 B] ================================================ FILE: test_runner/performance/tpc-h/queries/10.sql ================================================ [File too large to display: 623 B] ================================================ FILE: test_runner/performance/tpc-h/queries/11.sql ================================================ [File too large to display: 612 B] ================================================ FILE: test_runner/performance/tpc-h/queries/12.sql ================================================ [File too large to display: 698 B] ================================================ FILE: test_runner/performance/tpc-h/queries/13.sql ================================================ [File too large to display: 445 B] ================================================ FILE: test_runner/performance/tpc-h/queries/14.sql ================================================ [File too large to display: 429 B] ================================================ FILE: test_runner/performance/tpc-h/queries/15.sql ================================================ [File too large to display: 604 B] ================================================ FILE: 
test_runner/performance/tpc-h/queries/16.sql ================================================ [File too large to display: 579 B] ================================================ FILE: test_runner/performance/tpc-h/queries/17.sql ================================================ [File too large to display: 392 B] ================================================ FILE: test_runner/performance/tpc-h/queries/18.sql ================================================ [File too large to display: 542 B] ================================================ FILE: test_runner/performance/tpc-h/queries/19.sql ================================================ [File too large to display: 1.0 KB] ================================================ FILE: test_runner/performance/tpc-h/queries/2.sql ================================================ [File too large to display: 785 B] ================================================ FILE: test_runner/performance/tpc-h/queries/20.sql ================================================ [File too large to display: 684 B] ================================================ FILE: test_runner/performance/tpc-h/queries/21.sql ================================================ [File too large to display: 767 B] ================================================ FILE: test_runner/performance/tpc-h/queries/22.sql ================================================ [File too large to display: 744 B] ================================================ FILE: test_runner/performance/tpc-h/queries/3.sql ================================================ [File too large to display: 512 B] ================================================ FILE: test_runner/performance/tpc-h/queries/4.sql ================================================ [File too large to display: 453 B] ================================================ FILE: test_runner/performance/tpc-h/queries/5.sql ================================================ [File too large to display: 583 B] 
================================================ FILE: test_runner/performance/tpc-h/queries/6.sql ================================================ [File too large to display: 358 B] ================================================ FILE: test_runner/performance/tpc-h/queries/7.sql ================================================ [File too large to display: 889 B] ================================================ FILE: test_runner/performance/tpc-h/queries/8.sql ================================================ [File too large to display: 874 B] ================================================ FILE: test_runner/performance/tpc-h/queries/9.sql ================================================ [File too large to display: 700 B] ================================================ FILE: test_runner/pg_clients/README.md ================================================ [File too large to display: 255 B] ================================================ FILE: test_runner/pg_clients/csharp/npgsql/.dockerignore ================================================ [File too large to display: 10 B] ================================================ FILE: test_runner/pg_clients/csharp/npgsql/.gitignore ================================================ [File too large to display: 10 B] ================================================ FILE: test_runner/pg_clients/csharp/npgsql/Dockerfile ================================================ [File too large to display: 429 B] ================================================ FILE: test_runner/pg_clients/csharp/npgsql/Program.cs ================================================ [File too large to display: 678 B] ================================================ FILE: test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj ================================================ [File too large to display: 327 B] ================================================ FILE: test_runner/pg_clients/java/jdbc/Dockerfile ================================================ 
[File too large to display: 349 B] ================================================ FILE: test_runner/pg_clients/java/jdbc/Example.java ================================================ [File too large to display: 970 B] ================================================ FILE: test_runner/pg_clients/python/asyncpg/Dockerfile ================================================ [File too large to display: 246 B] ================================================ FILE: test_runner/pg_clients/python/asyncpg/asyncpg_example.py ================================================ [File too large to display: 644 B] ================================================ FILE: test_runner/pg_clients/python/asyncpg/requirements.txt ================================================ [File too large to display: 16 B] ================================================ FILE: test_runner/pg_clients/python/pg8000/Dockerfile ================================================ [File too large to display: 245 B] ================================================ FILE: test_runner/pg_clients/python/pg8000/pg8000_example.py ================================================ [File too large to display: 521 B] ================================================ FILE: test_runner/pg_clients/python/pg8000/requirements.txt ================================================ [File too large to display: 29 B] ================================================ FILE: test_runner/pg_clients/rust/tokio-postgres/.dockerignore ================================================ [File too large to display: 8 B] ================================================ FILE: test_runner/pg_clients/rust/tokio-postgres/.gitignore ================================================ [File too large to display: 8 B] ================================================ FILE: test_runner/pg_clients/rust/tokio-postgres/Cargo.toml ================================================ [File too large to display: 398 B] ================================================ 
FILE: test_runner/pg_clients/rust/tokio-postgres/Dockerfile ================================================ [File too large to display: 203 B] ================================================ FILE: test_runner/pg_clients/rust/tokio-postgres/src/main.rs ================================================ [File too large to display: 1.3 KB] ================================================ FILE: test_runner/pg_clients/swift/PostgresClientKitExample/.dockerignore ================================================ [File too large to display: 8 B] ================================================ FILE: test_runner/pg_clients/swift/PostgresClientKitExample/.gitignore ================================================ [File too large to display: 8 B] ================================================ FILE: test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile ================================================ [File too large to display: 460 B] ================================================ FILE: test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved ================================================ [File too large to display: 946 B] ================================================ FILE: test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift ================================================ [File too large to display: 425 B] ================================================ FILE: test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift ================================================ [File too large to display: 1.2 KB] ================================================ FILE: test_runner/pg_clients/swift/PostgresNIOExample/.dockerignore ================================================ [File too large to display: 8 B] ================================================ FILE: test_runner/pg_clients/swift/PostgresNIOExample/.gitignore ================================================ [File too large to display: 8 B] 
================================================ FILE: test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile ================================================ [File too large to display: 396 B] ================================================ FILE: test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved ================================================ [File too large to display: 3.4 KB] ================================================ FILE: test_runner/pg_clients/swift/PostgresNIOExample/Package.swift ================================================ [File too large to display: 427 B] ================================================ FILE: test_runner/pg_clients/swift/PostgresNIOExample/Sources/PostgresNIOExample/main.swift ================================================ [File too large to display: 1.2 KB] ================================================ FILE: test_runner/pg_clients/test_pg_clients.py ================================================ [File too large to display: 1.9 KB] ================================================ FILE: test_runner/pg_clients/typescript/postgresql-client/.dockerignore ================================================ [File too large to display: 14 B] ================================================ FILE: test_runner/pg_clients/typescript/postgresql-client/.gitignore ================================================ [File too large to display: 14 B] ================================================ FILE: test_runner/pg_clients/typescript/postgresql-client/Dockerfile ================================================ [File too large to display: 184 B] ================================================ FILE: test_runner/pg_clients/typescript/postgresql-client/index.js ================================================ [File too large to display: 568 B] ================================================ FILE: test_runner/pg_clients/typescript/postgresql-client/package.json ================================================ [File too 
large to display: 82 B] ================================================ FILE: test_runner/pg_clients/typescript/serverless-driver/.dockerignore ================================================ [File too large to display: 14 B] ================================================ FILE: test_runner/pg_clients/typescript/serverless-driver/.gitignore ================================================ [File too large to display: 14 B] ================================================ FILE: test_runner/pg_clients/typescript/serverless-driver/Dockerfile ================================================ [File too large to display: 184 B] ================================================ FILE: test_runner/pg_clients/typescript/serverless-driver/index.js ================================================ [File too large to display: 576 B] ================================================ FILE: test_runner/pg_clients/typescript/serverless-driver/package.json ================================================ [File too large to display: 108 B] ================================================ FILE: test_runner/random_ops/README.md ================================================ [File too large to display: 3.6 KB] ================================================ FILE: test_runner/random_ops/test_random_ops.py ================================================ [File too large to display: 20.3 KB] ================================================ FILE: test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension--1.0.sql ================================================ [File too large to display: 918 B] ================================================ FILE: test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension.control ================================================ [File too large to display: 280 B] ================================================ FILE: 
test_runner/regress/data/test_remote_extensions/test_extension_sql_only/sql/test_extension_sql_only--1.0--1.1.sql ================================================ [File too large to display: 290 B] ================================================ FILE: test_runner/regress/data/test_remote_extensions/test_extension_sql_only/sql/test_extension_sql_only--1.0.sql ================================================ [File too large to display: 290 B] ================================================ FILE: test_runner/regress/data/test_remote_extensions/test_extension_sql_only/test_extension_sql_only.control ================================================ [File too large to display: 36 B] ================================================ FILE: test_runner/regress/data/test_remote_extensions/test_extension_with_lib/sql/test_extension_with_lib--1.0--1.1.sql ================================================ [File too large to display: 237 B] ================================================ FILE: test_runner/regress/data/test_remote_extensions/test_extension_with_lib/sql/test_extension_with_lib--1.0.sql ================================================ [File too large to display: 254 B] ================================================ FILE: test_runner/regress/data/test_remote_extensions/test_extension_with_lib/src/test_extension_with_lib.c ================================================ [File too large to display: 482 B] ================================================ FILE: test_runner/regress/data/test_remote_extensions/test_extension_with_lib/test_extension_with_lib.control ================================================ [File too large to display: 88 B] ================================================ FILE: test_runner/regress/data/test_signed_char.out ================================================ [File too large to display: 16.0 KB] ================================================ FILE: test_runner/regress/test_ancestor_branch.py 
================================================ [File too large to display: 3.8 KB] ================================================ FILE: test_runner/regress/test_attach_tenant_config.py ================================================ [File too large to display: 9.8 KB] ================================================ FILE: test_runner/regress/test_auth.py ================================================ [File too large to display: 9.5 KB] ================================================ FILE: test_runner/regress/test_auth_broker.py ================================================ [File too large to display: 1.0 KB] ================================================ FILE: test_runner/regress/test_backpressure.py ================================================ [File too large to display: 7.0 KB] ================================================ FILE: test_runner/regress/test_bad_connection.py ================================================ [File too large to display: 10.4 KB] ================================================ FILE: test_runner/regress/test_basebackup.py ================================================ [File too large to display: 3.6 KB] ================================================ FILE: test_runner/regress/test_basebackup_error.py ================================================ [File too large to display: 621 B] ================================================ FILE: test_runner/regress/test_branch_and_gc.py ================================================ [File too large to display: 8.3 KB] ================================================ FILE: test_runner/regress/test_branch_behind.py ================================================ [File too large to display: 6.6 KB] ================================================ FILE: test_runner/regress/test_branching.py ================================================ [File too large to display: 16.3 KB] ================================================ FILE: test_runner/regress/test_broken_timeline.py 
================================================ [File too large to display: 8.8 KB] ================================================ FILE: test_runner/regress/test_build_info_metric.py ================================================ [File too large to display: 965 B] ================================================ FILE: test_runner/regress/test_change_pageserver.py ================================================ [File too large to display: 6.8 KB] ================================================ FILE: test_runner/regress/test_clog_truncate.py ================================================ [File too large to display: 2.9 KB] ================================================ FILE: test_runner/regress/test_close_fds.py ================================================ [File too large to display: 1.8 KB] ================================================ FILE: test_runner/regress/test_combocid.py ================================================ [File too large to display: 4.7 KB] ================================================ FILE: test_runner/regress/test_communicator_metrics_exporter.py ================================================ [File too large to display: 1.9 KB] ================================================ FILE: test_runner/regress/test_compaction.py ================================================ [File too large to display: 47.4 KB] ================================================ FILE: test_runner/regress/test_compatibility.py ================================================ [File too large to display: 26.2 KB] ================================================ FILE: test_runner/regress/test_compute_catalog.py ================================================ [File too large to display: 21.8 KB] ================================================ FILE: test_runner/regress/test_compute_http.py ================================================ [File too large to display: 2.5 KB] ================================================ FILE: 
test_runner/regress/test_compute_locales.py ================================================ [File too large to display: 1.8 KB] ================================================ FILE: test_runner/regress/test_compute_metrics.py ================================================ [File too large to display: 18.3 KB] ================================================ FILE: test_runner/regress/test_compute_migrations.py ================================================ [File too large to display: 4.0 KB] ================================================ FILE: test_runner/regress/test_compute_monitor.py ================================================ [File too large to display: 2.6 KB] ================================================ FILE: test_runner/regress/test_compute_reconfigure.py ================================================ [File too large to display: 4.4 KB] ================================================ FILE: test_runner/regress/test_compute_termination.py ================================================ [File too large to display: 12.8 KB] ================================================ FILE: test_runner/regress/test_config.py ================================================ [File too large to display: 2.3 KB] ================================================ FILE: test_runner/regress/test_crafted_wal_end.py ================================================ [File too large to display: 2.5 KB] ================================================ FILE: test_runner/regress/test_createdropdb.py ================================================ [File too large to display: 3.8 KB] ================================================ FILE: test_runner/regress/test_createuser.py ================================================ [File too large to display: 965 B] ================================================ FILE: test_runner/regress/test_ddl_forwarding.py ================================================ [File too large to display: 13.5 KB] 
================================================ FILE: test_runner/regress/test_disk_usage_eviction.py ================================================ [File too large to display: 37.8 KB] ================================================ FILE: test_runner/regress/test_download_extensions.py ================================================ [File too large to display: 9.1 KB] ================================================ FILE: test_runner/regress/test_endpoint_crash.py ================================================ [File too large to display: 1.7 KB] ================================================ FILE: test_runner/regress/test_endpoint_storage.py ================================================ [File too large to display: 2.0 KB] ================================================ FILE: test_runner/regress/test_event_trigger_extension.py ================================================ [File too large to display: 3.9 KB] ================================================ FILE: test_runner/regress/test_explain_with_lfc_stats.py ================================================ [File too large to display: 3.2 KB] ================================================ FILE: test_runner/regress/test_extensions.py ================================================ [File too large to display: 1.7 KB] ================================================ FILE: test_runner/regress/test_feature_flag.py ================================================ [File too large to display: 2.2 KB] ================================================ FILE: test_runner/regress/test_fsm_truncate.py ================================================ [File too large to display: 477 B] ================================================ FILE: test_runner/regress/test_fullbackup.py ================================================ [File too large to display: 2.4 KB] ================================================ FILE: test_runner/regress/test_gc_aggressive.py ================================================ [File 
too large to display: 5.7 KB] ================================================ FILE: test_runner/regress/test_gin_redo.py ================================================ [File too large to display: 887 B] ================================================ FILE: test_runner/regress/test_gist.py ================================================ [File too large to display: 814 B] ================================================ FILE: test_runner/regress/test_hadron_ps_connectivity_metrics.py ================================================ [File too large to display: 5.8 KB] ================================================ FILE: test_runner/regress/test_hcc_handling_ps_data_loss.py ================================================ [File too large to display: 2.0 KB] ================================================ FILE: test_runner/regress/test_hot_standby.py ================================================ [File too large to display: 15.2 KB] ================================================ FILE: test_runner/regress/test_import.py ================================================ [File too large to display: 10.3 KB] ================================================ FILE: test_runner/regress/test_import_pgdata.py ================================================ [File too large to display: 44.4 KB] ================================================ FILE: test_runner/regress/test_ingestion_layer_size.py ================================================ [File too large to display: 5.7 KB] ================================================ FILE: test_runner/regress/test_large_schema.py ================================================ [File too large to display: 3.3 KB] ================================================ FILE: test_runner/regress/test_layer_bloating.py ================================================ [File too large to display: 2.8 KB] ================================================ FILE: test_runner/regress/test_layer_eviction.py 
================================================ [File too large to display: 13.6 KB] ================================================ FILE: test_runner/regress/test_layer_writers_fail.py ================================================ [File too large to display: 3.7 KB] ================================================ FILE: test_runner/regress/test_layers_from_future.py ================================================ [File too large to display: 11.9 KB] ================================================ FILE: test_runner/regress/test_lfc_prefetch.py ================================================ [File too large to display: 4.1 KB] ================================================ FILE: test_runner/regress/test_lfc_prewarm.py ================================================ [File too large to display: 12.0 KB] ================================================ FILE: test_runner/regress/test_lfc_resize.py ================================================ [File too large to display: 4.4 KB] ================================================ FILE: test_runner/regress/test_lfc_working_set_approximation.py ================================================ [File too large to display: 5.5 KB] ================================================ FILE: test_runner/regress/test_local_file_cache.py ================================================ [File too large to display: 5.1 KB] ================================================ FILE: test_runner/regress/test_logging.py ================================================ [File too large to display: 1.7 KB] ================================================ FILE: test_runner/regress/test_logical_replication.py ================================================ [File too large to display: 23.6 KB] ================================================ FILE: test_runner/regress/test_lsn_mapping.py ================================================ [File too large to display: 11.9 KB] ================================================ FILE: 
test_runner/regress/test_multixact.py ================================================ [File too large to display: 3.1 KB] ================================================ FILE: test_runner/regress/test_nbtree_pagesplit_cycleid.py ================================================ [File too large to display: 5.2 KB] ================================================ FILE: test_runner/regress/test_neon_cli.py ================================================ [File too large to display: 8.2 KB] ================================================ FILE: test_runner/regress/test_neon_extension.py ================================================ [File too large to display: 4.3 KB] ================================================ FILE: test_runner/regress/test_neon_local_cli.py ================================================ [File too large to display: 3.6 KB] ================================================ FILE: test_runner/regress/test_neon_superuser.py ================================================ [File too large to display: 7.4 KB] ================================================ FILE: test_runner/regress/test_next_xid.py ================================================ [File too large to display: 17.1 KB] ================================================ FILE: test_runner/regress/test_normal_work.py ================================================ [File too large to display: 2.2 KB] ================================================ FILE: test_runner/regress/test_oid_overflow.py ================================================ [File too large to display: 1.4 KB] ================================================ FILE: test_runner/regress/test_old_request_lsn.py ================================================ [File too large to display: 3.1 KB] ================================================ FILE: test_runner/regress/test_ondemand_download.py ================================================ [File too large to display: 33.1 KB] 
================================================ FILE: test_runner/regress/test_ondemand_slru_download.py ================================================ [File too large to display: 5.4 KB] ================================================ FILE: test_runner/regress/test_ondemand_wal_download.py ================================================ [File too large to display: 941 B] ================================================ FILE: test_runner/regress/test_page_service_batching_regressions.py ================================================ [File too large to display: 2.2 KB] ================================================ FILE: test_runner/regress/test_pageserver_api.py ================================================ [File too large to display: 6.7 KB] ================================================ FILE: test_runner/regress/test_pageserver_catchup.py ================================================ [File too large to display: 2.4 KB] ================================================ FILE: test_runner/regress/test_pageserver_config.py ================================================ [File too large to display: 2.2 KB] ================================================ FILE: test_runner/regress/test_pageserver_crash_consistency.py ================================================ [File too large to display: 4.0 KB] ================================================ FILE: test_runner/regress/test_pageserver_generations.py ================================================ [File too large to display: 27.4 KB] ================================================ FILE: test_runner/regress/test_pageserver_getpage_throttle.py ================================================ [File too large to display: 5.1 KB] ================================================ FILE: test_runner/regress/test_pageserver_layer_rolling.py ================================================ [File too large to display: 12.6 KB] ================================================ FILE: 
test_runner/regress/test_pageserver_metric_collection.py ================================================ [File too large to display: 16.9 KB] ================================================ FILE: test_runner/regress/test_pageserver_reconnect.py ================================================ [File too large to display: 2.5 KB] ================================================ FILE: test_runner/regress/test_pageserver_restart.py ================================================ [File too large to display: 10.6 KB] ================================================ FILE: test_runner/regress/test_pageserver_restarts_under_workload.py ================================================ [File too large to display: 1.3 KB] ================================================ FILE: test_runner/regress/test_pageserver_secondary.py ================================================ [File too large to display: 48.8 KB] ================================================ FILE: test_runner/regress/test_pg_query_cancellation.py ================================================ [File too large to display: 8.7 KB] ================================================ FILE: test_runner/regress/test_pg_regress.py ================================================ [File too large to display: 19.3 KB] ================================================ FILE: test_runner/regress/test_pg_waldump.py ================================================ [File too large to display: 2.2 KB] ================================================ FILE: test_runner/regress/test_pgstat.py ================================================ [File too large to display: 2.5 KB] ================================================ FILE: test_runner/regress/test_physical_and_logical_replicaiton.py ================================================ [File too large to display: 3.1 KB] ================================================ FILE: test_runner/regress/test_physical_replication.py ================================================ [File 
too large to display: 10.7 KB] ================================================ FILE: test_runner/regress/test_pitr_gc.py ================================================ [File too large to display: 2.7 KB] ================================================ FILE: test_runner/regress/test_postgres_version.py ================================================ [File too large to display: 1.7 KB] ================================================ FILE: test_runner/regress/test_prefetch_buffer_resize.py ================================================ [File too large to display: 1.2 KB] ================================================ FILE: test_runner/regress/test_proxy.py ================================================ [File too large to display: 24.3 KB] ================================================ FILE: test_runner/regress/test_proxy_allowed_ips.py ================================================ [File too large to display: 3.0 KB] ================================================ FILE: test_runner/regress/test_proxy_metric_collection.py ================================================ [File too large to display: 4.1 KB] ================================================ FILE: test_runner/regress/test_proxy_websockets.py ================================================ [File too large to display: 10.1 KB] ================================================ FILE: test_runner/regress/test_read_validation.py ================================================ [File too large to display: 7.4 KB] ================================================ FILE: test_runner/regress/test_readonly_node.py ================================================ [File too large to display: 12.8 KB] ================================================ FILE: test_runner/regress/test_recovery.py ================================================ [File too large to display: 2.5 KB] ================================================ FILE: test_runner/regress/test_relations.py 
================================================ [File too large to display: 4.0 KB] ================================================ FILE: test_runner/regress/test_remote_storage.py ================================================ [File too large to display: 31.8 KB] ================================================ FILE: test_runner/regress/test_replica_promotes.py ================================================ [File too large to display: 12.0 KB] ================================================ FILE: test_runner/regress/test_replica_start.py ================================================ [File too large to display: 32.9 KB] ================================================ FILE: test_runner/regress/test_rest_broker.py ================================================ [File too large to display: 5.2 KB] ================================================ FILE: test_runner/regress/test_role_grants.py ================================================ [File too large to display: 1.8 KB] ================================================ FILE: test_runner/regress/test_s3_restore.py ================================================ [File too large to display: 5.2 KB] ================================================ FILE: test_runner/regress/test_safekeeper_deletion.py ================================================ [File too large to display: 13.7 KB] ================================================ FILE: test_runner/regress/test_safekeeper_migration.py ================================================ [File too large to display: 21.8 KB] ================================================ FILE: test_runner/regress/test_setup.py ================================================ [File too large to display: 606 B] ================================================ FILE: test_runner/regress/test_sharding.py ================================================ [File too large to display: 84.1 KB] ================================================ FILE: 
test_runner/regress/test_signed_char.py ================================================ [File too large to display: 2.8 KB] ================================================ FILE: test_runner/regress/test_sni_router.py ================================================ [File too large to display: 4.8 KB] ================================================ FILE: test_runner/regress/test_ssl.py ================================================ [File too large to display: 8.8 KB] ================================================ FILE: test_runner/regress/test_storage_controller.py ================================================ [File too large to display: 185.5 KB] ================================================ FILE: test_runner/regress/test_storage_scrubber.py ================================================ [File too large to display: 25.6 KB] ================================================ FILE: test_runner/regress/test_subscriber_branching.py ================================================ [File too large to display: 15.7 KB] ================================================ FILE: test_runner/regress/test_subscriber_restart.py ================================================ [File too large to display: 2.3 KB] ================================================ FILE: test_runner/regress/test_subxacts.py ================================================ [File too large to display: 1.0 KB] ================================================ FILE: test_runner/regress/test_tenant_conf.py ================================================ [File too large to display: 17.3 KB] ================================================ FILE: test_runner/regress/test_tenant_delete.py ================================================ [File too large to display: 17.1 KB] ================================================ FILE: test_runner/regress/test_tenant_detach.py ================================================ [File too large to display: 19.9 KB] 
================================================ FILE: test_runner/regress/test_tenant_relocation.py ================================================ [File too large to display: 22.0 KB] ================================================ FILE: test_runner/regress/test_tenant_size.py ================================================ [File too large to display: 32.0 KB] ================================================ FILE: test_runner/regress/test_tenant_tasks.py ================================================ [File too large to display: 3.0 KB] ================================================ FILE: test_runner/regress/test_tenants.py ================================================ [File too large to display: 23.5 KB] ================================================ FILE: test_runner/regress/test_tenants_with_remote_storage.py ================================================ [File too large to display: 11.4 KB] ================================================ FILE: test_runner/regress/test_threshold_based_eviction.py ================================================ [File too large to display: 7.9 KB] ================================================ FILE: test_runner/regress/test_timeline_archive.py ================================================ [File too large to display: 47.5 KB] ================================================ FILE: test_runner/regress/test_timeline_delete.py ================================================ [File too large to display: 31.1 KB] ================================================ FILE: test_runner/regress/test_timeline_detach_ancestor.py ================================================ [File too large to display: 75.0 KB] ================================================ FILE: test_runner/regress/test_timeline_gc_blocking.py ================================================ [File too large to display: 4.0 KB] ================================================ FILE: test_runner/regress/test_timeline_size.py 
================================================ [File too large to display: 43.2 KB] ================================================ FILE: test_runner/regress/test_truncate.py ================================================ [File too large to display: 1.7 KB] ================================================ FILE: test_runner/regress/test_twophase.py ================================================ [File too large to display: 5.3 KB] ================================================ FILE: test_runner/regress/test_unlogged.py ================================================ [File too large to display: 2.3 KB] ================================================ FILE: test_runner/regress/test_unstable_extensions.py ================================================ [File too large to display: 1.5 KB] ================================================ FILE: test_runner/regress/test_vm_bits.py ================================================ [File too large to display: 14.7 KB] ================================================ FILE: test_runner/regress/test_vm_truncate.py ================================================ [File too large to display: 1.4 KB] ================================================ FILE: test_runner/regress/test_wal_acceptor.py ================================================ [File too large to display: 114.9 KB] ================================================ FILE: test_runner/regress/test_wal_acceptor_async.py ================================================ [File too large to display: 33.2 KB] ================================================ FILE: test_runner/regress/test_wal_receiver.py ================================================ [File too large to display: 4.8 KB] ================================================ FILE: test_runner/regress/test_wal_restore.py ================================================ [File too large to display: 10.5 KB] ================================================ FILE: 
test_runner/regress/test_walredo_not_left_behind_on_detach.py ================================================ [File too large to display: 4.1 KB] ================================================ FILE: test_runner/sql_regress/.gitignore ================================================ [File too large to display: 209 B] ================================================ FILE: test_runner/sql_regress/README.md ================================================ [File too large to display: 457 B] ================================================ FILE: test_runner/sql_regress/expected/.gitignore ================================================ [File too large to display: 155 B] ================================================ FILE: test_runner/sql_regress/expected/neon-cid.out ================================================ [File too large to display: 1.0 KB] ================================================ FILE: test_runner/sql_regress/expected/neon-clog.out ================================================ [File too large to display: 306 B] ================================================ FILE: test_runner/sql_regress/expected/neon-event-triggers.out ================================================ [File too large to display: 3.4 KB] ================================================ FILE: test_runner/sql_regress/expected/neon-rel-truncate.out ================================================ [File too large to display: 1.6 KB] ================================================ FILE: test_runner/sql_regress/expected/neon-spgist.out ================================================ [File too large to display: 711 B] ================================================ FILE: test_runner/sql_regress/expected/neon-subxacts.out ================================================ [File too large to display: 389 B] ================================================ FILE: test_runner/sql_regress/expected/neon-test-utils.out ================================================ [File too large to 
display: 630 B] ================================================ FILE: test_runner/sql_regress/expected/neon-vacuum-full.out ================================================ [File too large to display: 9.1 KB] ================================================ FILE: test_runner/sql_regress/parallel_schedule ================================================ [File too large to display: 375 B] ================================================ FILE: test_runner/sql_regress/sql/.gitignore ================================================ [File too large to display: 136 B] ================================================ FILE: test_runner/sql_regress/sql/neon-cid.sql ================================================ [File too large to display: 978 B] ================================================ FILE: test_runner/sql_regress/sql/neon-clog.sql ================================================ [File too large to display: 307 B] ================================================ FILE: test_runner/sql_regress/sql/neon-event-triggers.sql ================================================ [File too large to display: 2.4 KB] ================================================ FILE: test_runner/sql_regress/sql/neon-rel-truncate.sql ================================================ [File too large to display: 1.5 KB] ================================================ FILE: test_runner/sql_regress/sql/neon-spgist.sql ================================================ [File too large to display: 712 B] ================================================ FILE: test_runner/sql_regress/sql/neon-subxacts.sql ================================================ [File too large to display: 389 B] ================================================ FILE: test_runner/sql_regress/sql/neon-test-utils.sql ================================================ [File too large to display: 419 B] ================================================ FILE: test_runner/sql_regress/sql/neon-vacuum-full.sql 
================================================ [File too large to display: 796 B] ================================================ FILE: test_runner/stubs/h2/README.md ================================================ [File too large to display: 62 B] ================================================ FILE: test_runner/stubs/h2/__init__.pyi ================================================ [File too large to display: 17 B] ================================================ FILE: test_runner/stubs/h2/config.pyi ================================================ [File too large to display: 1.5 KB] ================================================ FILE: test_runner/stubs/h2/connection.pyi ================================================ [File too large to display: 5.3 KB] ================================================ FILE: test_runner/stubs/h2/errors.pyi ================================================ [File too large to display: 403 B] ================================================ FILE: test_runner/stubs/h2/events.pyi ================================================ [File too large to display: 2.9 KB] ================================================ FILE: test_runner/stubs/h2/exceptions.pyi ================================================ [File too large to display: 1.4 KB] ================================================ FILE: test_runner/stubs/h2/frame_buffer.pyi ================================================ [File too large to display: 454 B] ================================================ FILE: test_runner/stubs/h2/settings.pyi ================================================ [File too large to display: 2.3 KB] ================================================ FILE: test_runner/stubs/h2/stream.pyi ================================================ [File too large to display: 7.0 KB] ================================================ FILE: test_runner/stubs/h2/utilities.pyi ================================================ [File too large to display: 1.7 KB] 
================================================ FILE: test_runner/stubs/h2/windows.pyi ================================================ [File too large to display: 447 B] ================================================ FILE: test_runner/test_broken.py ================================================ [File too large to display: 877 B] ================================================ FILE: test_runner/websocket_tunnel.py ================================================ [File too large to display: 4.7 KB] ================================================ FILE: vendor/revisions.json ================================================ [File too large to display: 305 B] ================================================ FILE: workspace_hack/.gitattributes ================================================ [File too large to display: 216 B] ================================================ FILE: workspace_hack/Cargo.toml ================================================ [File too large to display: 9.9 KB] ================================================ FILE: workspace_hack/build.rs ================================================ [File too large to display: 85 B] ================================================ FILE: workspace_hack/src/lib.rs ================================================ [File too large to display: 26 B]